diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c1a527271238255f5ccf2c92f7ddddf3fe3195ea
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/__pycache__/_backends.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/__pycache__/_backends.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7c91962db28d7629693cb756c988d933415ea4a4
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/__pycache__/_backends.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/__pycache__/_torch_specific.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/__pycache__/_torch_specific.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f84ad86189f67f07d93781077550d27745deb3eb
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/__pycache__/_torch_specific.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/__pycache__/array_api.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/__pycache__/array_api.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2f39b3b3886c020e484a77f39e0469097f176883
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/__pycache__/array_api.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/__pycache__/einops.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/__pycache__/einops.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..242d98a65223ee266a9a30310e76a27d1aede757
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/__pycache__/einops.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/__pycache__/packing.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/__pycache__/packing.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c96504877cc125d7742398dda18b960e5ea18af7
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/__pycache__/packing.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/__pycache__/parsing.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/__pycache__/parsing.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..51775290342b69a0562bca127070b63d01c3c010
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/__pycache__/parsing.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/experimental/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/experimental/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/experimental/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/experimental/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..53c06817a62bd4ecdd737e739114c4fc0d8c2c00
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/experimental/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/experimental/__pycache__/indexing.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/experimental/__pycache__/indexing.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c8bddb80a75de9ab779a547629bf9394964bd137
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/experimental/__pycache__/indexing.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/experimental/indexing.py b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/experimental/indexing.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4635f811a4afcfe059f19e5bf99cf97e9b128e2
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/experimental/indexing.py
@@ -0,0 +1,5 @@
+"""
+This file contained some thoughts on indexing.
+
+These ideas were developed further in eindex (separate package).
+"""
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b8b9fc7097cc1308bf63b1ed3fb8f3052b75d03
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__init__.py
@@ -0,0 +1,106 @@
+__author__ = "Alex Rogozhnikov"
+
+from typing import Any, Dict
+
+
+from ..einops import TransformRecipe, _apply_recipe, _prepare_recipes_for_all_dims, get_backend
+from .. import EinopsError
+
+
+class RearrangeMixin:
+    """
+    Rearrange layer behaves identically to einops.rearrange operation.
+
+    :param pattern: str, rearrangement pattern
+    :param axes_lengths: any additional specification of dimensions
+
+    See einops.rearrange for examples.
+    """
+
+    def __init__(self, pattern: str, **axes_lengths: Any) -> None:
+        super().__init__()
+        self.pattern = pattern
+        self.axes_lengths = axes_lengths
+        # recipes are prepared eagerly, so an invalid pattern is reported at construction time
+        self._multirecipe = self.multirecipe()
+        self._axes_lengths = tuple(self.axes_lengths.items())
+
+    def __repr__(self) -> str:
+        lengths = "".join(f", {axis}={length}" for axis, length in self.axes_lengths.items())
+        return f"{self.__class__.__name__}({self.pattern!r}{lengths})"
+
+    def multirecipe(self) -> Dict[int, TransformRecipe]:
+        """Prepare recipes keyed by input rank; failures are re-raised with this layer's repr as context."""
+        try:
+            return _prepare_recipes_for_all_dims(
+                self.pattern, operation="rearrange", axes_names=tuple(self.axes_lengths)
+            )
+        except EinopsError as e:
+            raise EinopsError(" Error while preparing {!r}\n {}".format(self, e)) from e
+
+    def _apply_recipe(self, x):
+        """Apply the recipe prepared for the rank of `x`, using the backend detected from `x`."""
+        return _apply_recipe(
+            backend=get_backend(x),
+            recipe=self._multirecipe[len(x.shape)],
+            tensor=x,
+            reduction_type="rearrange",
+            axes_lengths=self._axes_lengths,
+        )
+
+    def __getstate__(self):
+        # only the constructor arguments are stored; __setstate__ re-runs __init__ to rebuild the rest
+        return {"pattern": self.pattern, "axes_lengths": self.axes_lengths}
+
+    def __setstate__(self, state):
+        self.__init__(pattern=state["pattern"], **state["axes_lengths"])
+
+
+class ReduceMixin:
+    """
+    Reduce layer behaves identically to einops.reduce operation.
+
+    :param pattern: str, reduction pattern
+    :param reduction: one of available reductions ('min', 'max', 'sum', 'mean', 'prod'), case-sensitive
+    :param axes_lengths: any additional specification of dimensions
+
+    See einops.reduce for examples.
+    """
+
+    def __init__(self, pattern: str, reduction: str, **axes_lengths: Any) -> None:
+        super().__init__()
+        self.pattern = pattern
+        self.reduction = reduction
+        self.axes_lengths = axes_lengths
+        self._multirecipe = self.multirecipe()  # prepared eagerly: an invalid pattern is reported here
+        self._axes_lengths = tuple(self.axes_lengths.items())
+
+    def __repr__(self) -> str:
+        lengths = "".join(f", {axis}={length}" for axis, length in self.axes_lengths.items())
+        return f"{self.__class__.__name__}({self.pattern!r}, {self.reduction!r}{lengths})"
+
+    def multirecipe(self) -> Dict[int, TransformRecipe]:
+        """Prepare recipes keyed by input rank; failures are re-raised with this layer's repr as context."""
+        try:
+            return _prepare_recipes_for_all_dims(
+                self.pattern, operation=self.reduction, axes_names=tuple(self.axes_lengths)
+            )
+        except EinopsError as e:
+            raise EinopsError(" Error while preparing {!r}\n {}".format(self, e)) from e
+
+    def _apply_recipe(self, x):
+        """Apply the recipe prepared for the rank of `x`, using the backend detected from `x`."""
+        return _apply_recipe(
+            backend=get_backend(x),
+            recipe=self._multirecipe[len(x.shape)],
+            tensor=x,
+            reduction_type=self.reduction,
+            axes_lengths=self._axes_lengths,
+        )
+
+    def __getstate__(self):
+        # only the constructor arguments are stored; __setstate__ re-runs __init__ to rebuild the rest
+        return {"pattern": self.pattern, "reduction": self.reduction, "axes_lengths": self.axes_lengths}
+
+    def __setstate__(self, state):
+        self.__init__(pattern=state["pattern"], reduction=state["reduction"], **state["axes_lengths"])
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..13370f9bec6e534b90c272af5e6d799fcf2d40c3
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/_einmix.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/_einmix.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..80d1ed88ffa28dd92bf6e96b653ebfd17963bbdc
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/_einmix.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/flax.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/flax.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c0d454bc42b60d3316f7cfa76d66a4a39fe214c
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/flax.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/keras.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/keras.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2977d8550a7479d837434dd77a801c6fbfa18b35
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/keras.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/oneflow.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/oneflow.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..10cd49f21d1f568a2c36dd489e8b4472f08085c0
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/oneflow.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/paddle.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/paddle.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae5c3b11bb9b111871e15759414834041cf0157b
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/paddle.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/tensorflow.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/tensorflow.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49ffe8b742ccb0e139c82f92e35c71a15e0c966c
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/tensorflow.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/torch.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/torch.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c48830f009fcb09db26733b8d4176c92629f7d89
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/__pycache__/torch.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/_einmix.py b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/_einmix.py
new file mode 100644
index 0000000000000000000000000000000000000000..555441cbd8342859aefeaea3a8d303285edf8d5d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/_einmix.py
@@ -0,0 +1,229 @@
+from typing import Any, List, Optional, Dict
+
+from einops import EinopsError
+from einops.parsing import ParsedExpression, _ellipsis
+import warnings
+import string
+from ..einops import _product
+
+
+def _report_axes(axes: set, report_message: str):
+    if axes:  # a non-empty set means offending axis names were found
+        raise EinopsError(report_message.format(axes))
+
+
+class _EinmixMixin:
+    def __init__(self, pattern: str, weight_shape: str, bias_shape: Optional[str] = None, **axes_lengths: Any):
+        """
+        EinMix - Einstein summation with automated tensor management and axis packing/unpacking.
+
+        EinMix is a combination of einops and MLP, see tutorial:
+        https://github.com/arogozhnikov/einops/blob/main/docs/3-einmix-layer.ipynb
+
+        Imagine taking einsum with two arguments: one is the input, the other is a tensor with weights
+        >>> einsum('time batch channel_in, channel_in channel_out -> time batch channel_out', input, weight)
+
+        This layer manages weights for you, syntax highlights a special role of weight matrix
+        >>> EinMix('time batch channel_in -> time batch channel_out', weight_shape='channel_in channel_out')
+        But otherwise it is the same einsum under the hood. Plus einops-rearrange.
+
+        Simple linear layer with a bias term (you have one like that in your framework)
+        >>> EinMix('t b cin -> t b cout', weight_shape='cin cout', bias_shape='cout', cin=10, cout=20)
+        There is no restriction to mix the last axis. Let's mix along height
+        >>> EinMix('h w c-> hout w c', weight_shape='h hout', bias_shape='hout', h=32, hout=32)
+        Example of channel-wise multiplication (like one used in normalizations)
+        >>> EinMix('t b c -> t b c', weight_shape='c', c=128)
+        Multi-head linear layer (each head is own linear layer):
+        >>> EinMix('t b (head cin) -> t b (head cout)', weight_shape='head cin cout', ...)
+
+        ... and yes, you need to specify all dimensions of weight shape/bias shape in parameters.
+
+        Use cases:
+        - when channel dimension is not last, use EinMix, not transposition
+        - patch/segment embeddings
+        - when need only within-group connections to reduce number of weights and computations
+        - next-gen MLPs (follow tutorial link above to learn more!)
+        - in general, any time you want to combine linear layer and einops.rearrange
+
+        Uniform He initialization is applied to weight tensor.
+        This accounts for the number of elements mixed and produced.
+
+        Parameters
+        :param pattern: transformation pattern, left side - dimensions of input, right side - dimensions of output
+        :param weight_shape: axes of weight. A tensor of this shape is created, stored, and optimized in a layer
+               Every axis named here needs its length in axes_lengths; ellipsis and parentheses are rejected.
+        :param bias_shape: axes of bias added to output. Weights of this shape are created and stored. If `None` (the default), no bias is added.
+        :param axes_lengths: lengths of named axes; must cover every axis of weight_shape and bias_shape
+        """
+        super().__init__()
+        self.pattern = pattern
+        self.weight_shape = weight_shape
+        self.bias_shape = bias_shape
+        self.axes_lengths = axes_lengths
+        self.initialize_einmix(
+            pattern=pattern, weight_shape=weight_shape, bias_shape=bias_shape, axes_lengths=axes_lengths
+        )
+
+    def initialize_einmix(self, pattern: str, weight_shape: str, bias_shape: Optional[str], axes_lengths: dict):
+        left_pattern, right_pattern = pattern.split("->")  # ValueError unless there is exactly one "->"
+        left = ParsedExpression(left_pattern)
+        right = ParsedExpression(right_pattern)
+        weight = ParsedExpression(weight_shape)
+        _report_axes(
+            set.difference(right.identifiers, {*left.identifiers, *weight.identifiers}),
+            "Unrecognized identifiers on the right side of EinMix {}",
+        )
+        if weight.has_ellipsis:
+            raise EinopsError("Ellipsis is not supported in weight, as its shape should be fully specified")
+        if left.has_ellipsis or right.has_ellipsis:
+            if not (left.has_ellipsis and right.has_ellipsis):
+                raise EinopsError(f"Ellipsis in EinMix should be on both sides, {pattern}")
+            if left.has_ellipsis_parenthesized:
+                raise EinopsError(f"Ellipsis on left side can't be in parenthesis, got {pattern}")
+        if any(x.has_non_unitary_anonymous_axes for x in [left, right, weight]):
+            raise EinopsError("Anonymous axes (numbers) are not allowed in EinMix")
+        if "(" in weight_shape or ")" in weight_shape:
+            raise EinopsError(f"Parenthesis is not allowed in weight shape: {weight_shape}")
+
+        pre_reshape_pattern = None
+        pre_reshape_lengths = None
+        post_reshape_pattern = None
+        if any(len(group) != 1 for group in left.composition):
+            names: List[str] = []
+            for group in left.composition:
+                names += group
+            names = [name if name != _ellipsis else "..." for name in names]
+            composition = " ".join(names)
+            pre_reshape_pattern = f"{left_pattern}-> {composition}"
+            pre_reshape_lengths = {name: length for name, length in axes_lengths.items() if name in names}
+
+        if any(len(group) != 1 for group in right.composition) or right.has_ellipsis_parenthesized:
+            names = []
+            for group in right.composition:
+                names += group
+            names = [name if name != _ellipsis else "..." for name in names]
+            composition = " ".join(names)
+            post_reshape_pattern = f"{composition} ->{right_pattern}"
+
+        self._create_rearrange_layers(pre_reshape_pattern, pre_reshape_lengths, post_reshape_pattern, {})
+
+        for axis in weight.identifiers:
+            if axis not in axes_lengths:
+                raise EinopsError("Dimension {} of weight should be specified".format(axis))
+        _report_axes(
+            set.difference(set(axes_lengths), {*left.identifiers, *weight.identifiers}),
+            "Axes {} are not used in pattern",
+        )
+        _report_axes(
+            set.difference(weight.identifiers, {*left.identifiers, *right.identifiers}), "Weight axes {} are redundant"
+        )
+        if len(weight.identifiers) == 0:
+            warnings.warn("EinMix: weight has no dimensions (means multiplication by a number)")
+
+        _weight_shape = [axes_lengths[axis] for (axis,) in weight.composition]
+        # single output element is a combination of fan_in input elements
+        _fan_in = _product([axes_lengths[axis] for (axis,) in weight.composition if axis not in right.identifiers])
+        if bias_shape is not None:
+            # maybe I should put ellipsis in the beginning for simplicity?
+            if not isinstance(bias_shape, str):
+                raise EinopsError("bias shape should be string specifying which axes bias depends on")
+            bias = ParsedExpression(bias_shape)
+            _report_axes(
+                set.difference(bias.identifiers, right.identifiers),
+                "Bias axes {} not present in output",
+            )
+            _report_axes(
+                set.difference(bias.identifiers, set(axes_lengths)),
+                "Sizes not provided for bias axes {}",
+            )
+
+            _bias_shape = []
+            used_non_trivial_size = False
+            for axes in right.composition:
+                if axes == _ellipsis:
+                    if used_non_trivial_size:
+                        raise EinopsError("all bias dimensions should go after ellipsis in the output")
+                else:
+                    # list of axis names: an ellipsis may also appear inside it and is checked the same way
+                    for axis in axes:
+                        if axis == _ellipsis:
+                            if used_non_trivial_size:
+                                raise EinopsError("all bias dimensions should go after ellipsis in the output")
+                        elif axis in bias.identifiers:
+                            _bias_shape.append(axes_lengths[axis])
+                            used_non_trivial_size = True
+                        else:
+                            _bias_shape.append(1)
+        else:
+            _bias_shape = None
+
+        weight_bound = (3 / _fan_in) ** 0.5  # U(-b, b) with this b has variance 1 / fan_in
+        bias_bound = (1 / _fan_in) ** 0.5
+        self._create_parameters(_weight_shape, weight_bound, _bias_shape, bias_bound)
+
+        # rewrite einsum expression with single-letter latin identifiers so that any framework understands it
+        # NOTE(review): zip() below stops after 26 letters, so identifiers beyond that get no mapping
+        mapped_identifiers = {*left.identifiers, *right.identifiers, *weight.identifiers}
+        if _ellipsis in mapped_identifiers:
+            mapped_identifiers.remove(_ellipsis)
+        mapped_identifiers = list(sorted(mapped_identifiers))
+        mapping2letters = {k: letter for letter, k in zip(string.ascii_lowercase, mapped_identifiers)}
+        mapping2letters[_ellipsis] = "..."  # preserve ellipsis
+
+        def write_flat_remapped(axes: ParsedExpression):
+            result = []
+            for composed_axis in axes.composition:
+                if isinstance(composed_axis, list):
+                    result.extend([mapping2letters[axis] for axis in composed_axis])
+                else:
+                    assert composed_axis == _ellipsis
+                    result.append("...")
+            return "".join(result)
+
+        self.einsum_pattern: str = "{},{}->{}".format(
+            write_flat_remapped(left),
+            write_flat_remapped(weight),
+            write_flat_remapped(right),
+        )
+
+    def _create_rearrange_layers(
+        self,
+        pre_reshape_pattern: Optional[str],
+        pre_reshape_lengths: Optional[Dict],
+        post_reshape_pattern: Optional[str],
+        post_reshape_lengths: Optional[Dict],
+    ):
+        raise NotImplementedError("Should be defined in framework implementations")
+
+    def _create_parameters(self, weight_shape, weight_bound, bias_shape, bias_bound):
+        """Create weight and, if bias_shape is not None, bias; the bounds are for their initialization."""
+        raise NotImplementedError("Should be defined in framework implementations")
+
+    def __repr__(self):
+        params = repr(self.pattern)
+        params += f", '{self.weight_shape}'"
+        if self.bias_shape is not None:
+            params += f", '{self.bias_shape}'"
+        for axis, length in self.axes_lengths.items():
+            params += ", {}={}".format(axis, length)
+        return "{}({})".format(self.__class__.__name__, params)
+
+
+class _EinmixDebugger(_EinmixMixin):
+    """Used only to test the mixin: records what would be created instead of building framework objects."""
+
+    def _create_rearrange_layers(
+        self,
+        pre_reshape_pattern: Optional[str],
+        pre_reshape_lengths: Optional[Dict],
+        post_reshape_pattern: Optional[str],
+        post_reshape_lengths: Optional[Dict],
+    ):
+        """Keep the requested reshape patterns and lengths as attributes for inspection."""
+        self.pre_reshape_pattern, self.pre_reshape_lengths = pre_reshape_pattern, pre_reshape_lengths
+        self.post_reshape_pattern, self.post_reshape_lengths = post_reshape_pattern, post_reshape_lengths
+
+    def _create_parameters(self, weight_shape, weight_bound, bias_shape, bias_bound):
+        """Keep the requested weight and bias shapes; the initialization bounds are not stored."""
+        self.saved_weight_shape = weight_shape
+        self.saved_bias_shape = bias_shape
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/flax.py b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/flax.py
new file mode 100644
index 0000000000000000000000000000000000000000..1496dd76d94aa5641f9a52a996a93640747ddfe6
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/flax.py
@@ -0,0 +1,82 @@
+from dataclasses import field
+from typing import Optional, Dict, cast
+
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+
+from . import RearrangeMixin, ReduceMixin
+from ._einmix import _EinmixMixin
+
+__author__ = "Alex Rogozhnikov"
+
+
+class Reduce(nn.Module):
+    pattern: str  # einops pattern, forwarded unchanged to ReduceMixin
+    reduction: str  # reduction name, forwarded unchanged to ReduceMixin
+    sizes: dict = field(default_factory=dict)  # axis lengths, expanded as keyword arguments
+
+    def setup(self):
+        self.reducer = ReduceMixin(self.pattern, self.reduction, **self.sizes)
+
+    def __call__(self, input):
+        return self.reducer._apply_recipe(input)  # the mixin instance holds the prepared recipes
+
+
+class Rearrange(nn.Module):
+    pattern: str  # einops pattern, forwarded unchanged to RearrangeMixin
+    sizes: dict = field(default_factory=dict)  # axis lengths, expanded as keyword arguments
+
+    def setup(self):
+        self.rearranger = RearrangeMixin(self.pattern, **self.sizes)
+
+    def __call__(self, input):
+        return self.rearranger._apply_recipe(input)  # the mixin instance holds the prepared recipes
+
+
+class EinMix(nn.Module, _EinmixMixin):
+    pattern: str
+    weight_shape: str
+    bias_shape: Optional[str] = None
+    sizes: dict = field(default_factory=dict)
+
+    def setup(self):
+        # initialize_einmix calls back into _create_rearrange_layers and _create_parameters below
+        self.initialize_einmix(self.pattern, self.weight_shape, self.bias_shape, self.sizes)
+
+    def _create_parameters(self, weight_shape, weight_bound, bias_shape, bias_bound):
+        # jax.nn.initializers.uniform(scale) samples from [0, scale); draw from [-bound, bound) instead
+        def symmetric_uniform(bound):
+            return lambda key, shape: jax.random.uniform(key, shape, minval=-bound, maxval=bound)
+
+        self.weight = self.param("weight", symmetric_uniform(weight_bound), weight_shape)
+        if bias_shape is not None:
+            self.bias = self.param("bias", symmetric_uniform(bias_bound), bias_shape)
+        else:
+            self.bias = None
+
+    def _create_rearrange_layers(
+        self,
+        pre_reshape_pattern: Optional[str],
+        pre_reshape_lengths: Optional[Dict],
+        post_reshape_pattern: Optional[str],
+        post_reshape_lengths: Optional[Dict],
+    ):
+        self.pre_rearrange = None
+        if pre_reshape_pattern is not None:
+            self.pre_rearrange = Rearrange(pre_reshape_pattern, sizes=cast(dict, pre_reshape_lengths))
+
+        self.post_rearrange = None
+        if post_reshape_pattern is not None:
+            self.post_rearrange = Rearrange(post_reshape_pattern, sizes=cast(dict, post_reshape_lengths))
+
+    def __call__(self, input):
+        """Optional input rearrange, einsum with the weight, optional bias add, optional output rearrange."""
+        if self.pre_rearrange is not None:
+            input = self.pre_rearrange(input)
+        result = jnp.einsum(self.einsum_pattern, input, self.weight)
+        if self.bias is not None:
+            result += self.bias
+        if self.post_rearrange is not None:
+            result = self.post_rearrange(result)
+        return result
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/keras.py b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/keras.py
new file mode 100644
index 0000000000000000000000000000000000000000..9aec5635312fe6d7a0b1d9a54aa25d6e67329550
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/keras.py
@@ -0,0 +1,9 @@
+__author__ = "Alex Rogozhnikov"
+
+from ..layers.tensorflow import Rearrange, Reduce, EinMix
+
+# maps each layer class name to the class itself, for the three layers imported above
+keras_custom_objects = {
+    layer_cls.__name__: layer_cls
+    for layer_cls in (Rearrange, Reduce, EinMix)
+}
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/oneflow.py b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/oneflow.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3486626814d275441284f1aca3274157b302ff8
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/oneflow.py
@@ -0,0 +1,54 @@
+from typing import Optional, Dict, cast
+
+import oneflow as flow
+
+from . import RearrangeMixin, ReduceMixin
+from ._einmix import _EinmixMixin
+
+__author__ = "Tianhe Ren & Depeng Liang"
+
+
+class Rearrange(RearrangeMixin, flow.nn.Module):
+    def forward(self, input):
+        return self._apply_recipe(input)  # the rearrangement itself is implemented by RearrangeMixin
+
+
+class Reduce(ReduceMixin, flow.nn.Module):
+    def forward(self, input):
+        return self._apply_recipe(input)  # the reduction itself is implemented by ReduceMixin
+
+
+class EinMix(_EinmixMixin, flow.nn.Module):
+    def _create_parameters(self, weight_shape, weight_bound, bias_shape, bias_bound):
+        """Create `weight` (and `bias`, unless bias_shape is None) from zeros passed through uniform_(-b, b)."""
+        weight_init = flow.zeros(weight_shape).uniform_(-weight_bound, weight_bound)
+        self.weight = flow.nn.Parameter(weight_init, requires_grad=True)
+        if bias_shape is None:
+            self.bias = None
+        else:
+            bias_init = flow.zeros(bias_shape).uniform_(-bias_bound, bias_bound)
+            self.bias = flow.nn.Parameter(bias_init, requires_grad=True)
+
+    def _create_rearrange_layers(
+        self,
+        pre_reshape_pattern: Optional[str],
+        pre_reshape_lengths: Optional[Dict],
+        post_reshape_pattern: Optional[str],
+        post_reshape_lengths: Optional[Dict],
+    ):
+        """Build the optional Rearrange layers used before and after the einsum; None means no reshape."""
+        self.pre_rearrange = None
+        if pre_reshape_pattern is not None:
+            self.pre_rearrange = Rearrange(pre_reshape_pattern, **cast(dict, pre_reshape_lengths))
+        self.post_rearrange = None
+        if post_reshape_pattern is not None:
+            self.post_rearrange = Rearrange(post_reshape_pattern, **cast(dict, post_reshape_lengths))
+
+    def forward(self, input):
+        """Optional input rearrange, einsum with the weight, optional bias add, optional output rearrange."""
+        if self.pre_rearrange is not None:
+            input = self.pre_rearrange(input)
+        mixed = flow.einsum(self.einsum_pattern, input, self.weight)
+        if self.bias is not None:
+            mixed += self.bias
+        return mixed if self.post_rearrange is None else self.post_rearrange(mixed)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/paddle.py b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/paddle.py
new file mode 100644
index 0000000000000000000000000000000000000000..87374cace2f8813c062ccc7bfd2cba11ea2d9e5b
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/paddle.py
@@ -0,0 +1,58 @@
+from typing import Optional, Dict, cast
+
+import paddle
+
+from . import RearrangeMixin, ReduceMixin
+from ._einmix import _EinmixMixin
+
+__author__ = "PaddlePaddle"
+
+
+class Rearrange(RearrangeMixin, paddle.nn.Layer):
+    def forward(self, input):
+        return self._apply_recipe(input)  # the rearrangement itself is implemented by RearrangeMixin
+
+
+class Reduce(ReduceMixin, paddle.nn.Layer):
+    def forward(self, input):
+        return self._apply_recipe(input)  # the reduction itself is implemented by ReduceMixin
+
+
+class EinMix(_EinmixMixin, paddle.nn.Layer):
+    def _create_parameters(self, weight_shape, weight_bound, bias_shape, bias_bound):
+        """Create `weight` (and `bias`, unless bias_shape is None) with Uniform(-b, b) default initializers."""
+        weight_init = paddle.nn.initializer.Uniform(-weight_bound, weight_bound)
+        self.weight = self.create_parameter(weight_shape, default_initializer=weight_init)
+        if bias_shape is None:
+            self.bias = None
+        else:
+            bias_init = paddle.nn.initializer.Uniform(-bias_bound, bias_bound)
+            self.bias = self.create_parameter(bias_shape, default_initializer=bias_init)
+
+    def _create_rearrange_layers(
+        self,
+        pre_reshape_pattern: Optional[str],
+        pre_reshape_lengths: Optional[Dict],
+        post_reshape_pattern: Optional[str],
+        post_reshape_lengths: Optional[Dict],
+    ):
+        """Build the optional Rearrange layers used before and after the einsum; None means no reshape."""
+        self.pre_rearrange = None
+        if pre_reshape_pattern is not None:
+            self.pre_rearrange = Rearrange(pre_reshape_pattern, **cast(dict, pre_reshape_lengths))
+
+        self.post_rearrange = None
+        if post_reshape_pattern is not None:
+            self.post_rearrange = Rearrange(post_reshape_pattern, **cast(dict, post_reshape_lengths))
+
+    def forward(self, input):
+        """Optional input rearrange, einsum with the weight, optional bias add, optional output rearrange."""
+        if self.pre_rearrange is not None:
+            input = self.pre_rearrange(input)
+
+        mixed = paddle.einsum(self.einsum_pattern, input, self.weight)
+        if self.bias is not None:
+            mixed += self.bias
+        if self.post_rearrange is not None:
+            mixed = self.post_rearrange(mixed)
+        return mixed
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/tensorflow.py b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/tensorflow.py
new file mode 100644
index 0000000000000000000000000000000000000000..22acf34f31d757a631955c58cf4eb4d0b26193d6
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/tensorflow.py
@@ -0,0 +1,103 @@
+"""
+Comment about tensorflow layers:
+unfortunately instructions on creation of TF layers change constantly,
+and changed way too many times at this point to remember what-compatible-where.
+
+Layers in einops==0.7.0 (and several prior versions)
+ are compatible with TF 2.13
+
+Layers in einops==0.8.0 were re-implemented
+ according to official instructions for TF 2.16
+
+"""
+
+from typing import Optional, Dict, cast
+
+import tensorflow as tf
+from tensorflow.keras.layers import Layer
+
+
+from . import RearrangeMixin, ReduceMixin
+from ._einmix import _EinmixMixin
+
+
+__author__ = "Alex Rogozhnikov"
+
+
+class Rearrange(RearrangeMixin, Layer):
+    def build(self, input_shape):
+        pass  # layer does not have any parameters to be initialized
+
+    def call(self, inputs):
+        return self._apply_recipe(inputs)  # inherited helper (presumably from RearrangeMixin) does the work
+
+    def get_config(self):
+        return {"pattern": self.pattern, **self.axes_lengths}  # pattern plus one entry per axis length
+
+
+class Reduce(ReduceMixin, Layer):
+    def build(self, input_shape):
+        pass  # layer does not have any parameters to be initialized
+
+    def call(self, inputs):
+        return self._apply_recipe(inputs)  # inherited helper (presumably from ReduceMixin) does the work
+
+    def get_config(self):
+        return {"pattern": self.pattern, "reduction": self.reduction, **self.axes_lengths}  # plus axis lengths
+
+
+class EinMix(_EinmixMixin, Layer):
+    def _create_parameters(self, weight_shape, weight_bound, bias_shape, bias_bound):
+        # this method is called in __init__,
+        #  but we postpone actual creation to build(), as TF instruction suggests
+        self._params = [weight_shape, weight_bound, bias_shape, bias_bound]  # unpacked again in build()
+
+    def _create_rearrange_layers(
+        self,
+        pre_reshape_pattern: Optional[str],
+        pre_reshape_lengths: Optional[Dict],
+        post_reshape_pattern: Optional[str],
+        post_reshape_lengths: Optional[Dict],
+    ):
+        self.pre_rearrange = None  # stays None when no pre-reshape pattern is given
+        if pre_reshape_pattern is not None:
+            self.pre_rearrange = Rearrange(pre_reshape_pattern, **cast(dict, pre_reshape_lengths))
+
+        self.post_rearrange = None  # stays None when no post-reshape pattern is given
+        if post_reshape_pattern is not None:
+            self.post_rearrange = Rearrange(post_reshape_pattern, **cast(dict, post_reshape_lengths))
+
+    def build(self, input_shape):
+        [weight_shape, weight_bound, bias_shape, bias_bound] = self._params  # stored by _create_parameters
+        self.weight = self.add_weight(
+            shape=weight_shape,
+            initializer=tf.random_uniform_initializer(-weight_bound, weight_bound),
+            trainable=True,
+        )
+
+        if bias_shape is not None:
+            self.bias = self.add_weight(
+                shape=bias_shape,
+                initializer=tf.random_uniform_initializer(-bias_bound, bias_bound),
+                trainable=True,
+            )
+        else:
+            self.bias = None  # no bias requested; call() then skips the addition
+
+    def call(self, inputs):
+        if self.pre_rearrange is not None:
+            inputs = self.pre_rearrange(inputs)
+        result = tf.einsum(self.einsum_pattern, inputs, self.weight)  # combine inputs and weight
+        if self.bias is not None:
+            result = result + self.bias
+        if self.post_rearrange is not None:
+            result = self.post_rearrange(result)
+        return result
+
+    def get_config(self):
+        return {
+            "pattern": self.pattern,
+            "weight_shape": self.weight_shape,
+            "bias_shape": self.bias_shape,
+            **self.axes_lengths,  # one entry per axis length
+        }
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/torch.py b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/torch.py
new file mode 100644
index 0000000000000000000000000000000000000000..abf83383d0e84a3057edfd7aa6ba8517d80e618c
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/layers/torch.py
@@ -0,0 +1,67 @@
+from typing import Optional, Dict, cast
+
+import torch
+
+from . import RearrangeMixin, ReduceMixin
+from ._einmix import _EinmixMixin
+from .._torch_specific import apply_for_scriptable_torch
+
+__author__ = "Alex Rogozhnikov"
+
+
+class Rearrange(RearrangeMixin, torch.nn.Module):
+    def forward(self, input):
+        recipe = self._multirecipe[input.ndim]  # recipe is looked up by the number of input dimensions
+        return apply_for_scriptable_torch(recipe, input, reduction_type="rearrange", axes_dims=self._axes_lengths)
+
+    def _apply_recipe(self, x):
+        # overriding parent method to prevent its scripting
+        pass
+
+
+class Reduce(ReduceMixin, torch.nn.Module):
+    def forward(self, input):
+        recipe = self._multirecipe[input.ndim]  # recipe is looked up by the number of input dimensions
+        return apply_for_scriptable_torch(recipe, input, reduction_type=self.reduction, axes_dims=self._axes_lengths)
+
+    def _apply_recipe(self, x):
+        # overriding parent method to prevent its scripting
+        pass
+
+
+class EinMix(_EinmixMixin, torch.nn.Module):
+    def _create_parameters(self, weight_shape, weight_bound, bias_shape, bias_bound):
+        self.weight = torch.nn.Parameter(  # uniform init between -weight_bound and weight_bound
+            torch.zeros(weight_shape).uniform_(-weight_bound, weight_bound), requires_grad=True
+        )
+        if bias_shape is not None:
+            self.bias = torch.nn.Parameter(  # uniform init between -bias_bound and bias_bound
+                torch.zeros(bias_shape).uniform_(-bias_bound, bias_bound), requires_grad=True
+            )
+        else:
+            self.bias = None  # no bias requested; forward() then skips the addition
+
+    def _create_rearrange_layers(
+        self,
+        pre_reshape_pattern: Optional[str],
+        pre_reshape_lengths: Optional[Dict],
+        post_reshape_pattern: Optional[str],
+        post_reshape_lengths: Optional[Dict],
+    ):
+        self.pre_rearrange = None  # stays None when no pre-reshape pattern is given
+        if pre_reshape_pattern is not None:
+            self.pre_rearrange = Rearrange(pre_reshape_pattern, **cast(dict, pre_reshape_lengths))
+
+        self.post_rearrange = None  # stays None when no post-reshape pattern is given
+        if post_reshape_pattern is not None:
+            self.post_rearrange = Rearrange(post_reshape_pattern, **cast(dict, post_reshape_lengths))
+
+    def forward(self, input):
+        if self.pre_rearrange is not None:
+            input = self.pre_rearrange(input)
+        result = torch.einsum(self.einsum_pattern, input, self.weight)  # combine input and weight
+        if self.bias is not None:
+            result += self.bias  # bias is added in place
+        if self.post_rearrange is not None:
+            result = self.post_rearrange(result)
+        return result
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8875fe8ff58a68ce08d07fb59d49a208287d8dc6
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__init__.py
@@ -0,0 +1,109 @@
+"""
+Common utils for testing.
+These functions allow testing only some frameworks, not all.
+"""
+
+import logging
+import os
+from functools import lru_cache
+from typing import List, Tuple
+
+from einops import _backends
+import warnings
+
+__author__ = "Alex Rogozhnikov"
+
+
+# minimize logging noise while running tests
+logging.getLogger("tensorflow").disabled = True
+logging.getLogger("matplotlib").disabled = True
+
+FLOAT_REDUCTIONS = ("min", "max", "sum", "mean", "prod")  # does not include any/all
+
+
+def find_names_of_all_frameworks() -> List[str]:  # names of all (transitive) AbstractBackend subclasses
+    pending = _backends.AbstractBackend.__subclasses__()
+    discovered = []
+    while len(pending) > 0:
+        current = pending.pop()
+        discovered.append(current)
+        pending.extend(current.__subclasses__())
+    return [backend_type.framework_name for backend_type in discovered]
+
+
+ENVVAR_NAME = "EINOPS_TEST_BACKENDS"  # environment variable with comma-separated names of frameworks to test
+
+
+def unparse_backends(backend_names: List[str]) -> Tuple[str, str]:  # -> (env var name, comma-joined names)
+    recognized = find_names_of_all_frameworks()
+    unknown = [name for name in backend_names if name not in recognized]
+    if unknown:
+        raise RuntimeError(f"Unknown framework: {unknown[0]}")
+    return ENVVAR_NAME, ",".join(backend_names)
+
+
+@lru_cache(maxsize=1)
+def parse_backends_to_test() -> List[str]:  # framework names read from the env var; result is cached
+    raw_value = os.environ.get(ENVVAR_NAME)
+    if raw_value is None:
+        raise RuntimeError(f"Testing frameworks were not specified, env var {ENVVAR_NAME} not set")
+    requested = raw_value.split(",")
+    recognized = find_names_of_all_frameworks()
+    unknown = [name for name in requested if name not in recognized]
+    if unknown:
+        raise RuntimeError(f"Unknown framework: {unknown[0]}")
+    return requested
+
+
+def is_backend_tested(backend: str) -> bool:
+    """Tell whether `backend` is selected for testing (used to skip tests); unknown names raise RuntimeError"""
+    if backend in find_names_of_all_frameworks():
+        return backend in parse_backends_to_test()
+    raise RuntimeError(f"Unknown framework {backend}")
+
+
+def collect_test_backends(symbolic=False, layers=False) -> List[_backends.AbstractBackend]:
+    """
+    Instantiate the backends of one category (imperative/symbolic, operations/layers)
+    that were requested for this test run.
+
+    :param symbolic: symbolic or imperative frameworks?
+    :param layers: layers or operations?
+    :return: list of backends satisfying set conditions;
+        backends whose construction raises ImportError are left out with a warning
+    """
+    # candidate backend classes for the requested category
+    if symbolic and layers:
+        candidate_types = [_backends.TFKerasBackend]
+    elif symbolic:
+        candidate_types = [_backends.PyTensorBackend]
+    elif layers:
+        candidate_types = [
+            _backends.TorchBackend,
+            _backends.OneFlowBackend,
+            _backends.PaddleBackend,
+        ]
+    else:
+        candidate_types = [
+            _backends.NumpyBackend,
+            _backends.JaxBackend,
+            _backends.TorchBackend,
+            _backends.TensorflowBackend,
+            _backends.OneFlowBackend,
+            _backends.PaddleBackend,
+            _backends.CupyBackend,
+        ]
+
+    # only frameworks named by parse_backends_to_test() get instantiated
+    requested_names = parse_backends_to_test()
+    instances = []
+    for candidate_type in candidate_types:
+        if candidate_type.framework_name in requested_names:
+            try:
+                instances.append(candidate_type())
+            except ImportError:
+                # problem with backend installation fails a specific test function,
+                # but will be skipped in all other test cases
+                warnings.warn("backend could not be initialized for tests: {}".format(candidate_type))
+
+    return instances
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c43be28f488213ad4a2eeddd4bd8a052a753bb7
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/run_tests.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/run_tests.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d3f20150a54b8bbe318cf9ff985454632aacad3a
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/run_tests.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/test_einsum.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/test_einsum.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d72be6c5024d4c996f529a97c4ada642ab8b3758
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/test_einsum.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/test_examples.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/test_examples.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..06a58a10e5a861687ca96f4590ff39b7be9c4668
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/test_examples.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/test_layers.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/test_layers.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d73d869cb1ecb3c24a97f49497564d08d3a670c3
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/test_layers.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/test_ops.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/test_ops.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2e5ee24326b5932157d81ba9abe7eac6f6499b78
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/test_ops.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/test_other.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/test_other.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..95455d0919549a74960d1f6db48de1433775644f
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/test_other.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/test_packing.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/test_packing.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8c0634ff706902b891294ec5f7743f51dc215e4
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/test_packing.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/test_parsing.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/test_parsing.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9842964be4d7a550282285d7248eea19d030a978
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/__pycache__/test_parsing.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/run_tests.py b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/run_tests.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf7a4eb6e04c72c06c0a6ab613fc2be70b06309f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/run_tests.py
@@ -0,0 +1,85 @@
+"""
+Runs tests that are appropriate for framework.
+"""
+
+import os
+import sys
+from subprocess import Popen
+from pathlib import Path
+
+__author__ = "Alex Rogozhnikov"
+
+
+def run(cmd, **env):
+    # child output is not captured, so it keeps printing while testing; returns the exit code
+    argv = cmd.split(" ") if isinstance(cmd, str) else cmd
+    print("running:", argv)
+    with Popen(argv, cwd=str(Path(__file__).parent), env={**os.environ, **env}) as process:
+        process.communicate()
+    return process.returncode
+
+
+def main():
+    """Run pytest for the frameworks named in sys.argv; `--pip-install` installs them first.
+
+    Raises RuntimeError for unrecognized frameworks, a failed pip install or failing tests.
+    """
+    _executable, *args = sys.argv
+    frameworks = [x for x in args if x != "--pip-install"]
+    pip_install_is_set = "--pip-install" in args
+    framework_name2installation = {
+        "numpy": ["numpy"],
+        "torch": ["torch --index-url https://download.pytorch.org/whl/cpu"],
+        "jax": ["jax[cpu]", "flax"],
+        "tensorflow": ["tensorflow"],
+        "cupy": ["cupy"],
+        # switch to stable paddlepaddle, because of https://github.com/PaddlePaddle/Paddle/issues/63927
+        # "paddle": ["paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html"],
+        "paddle": ["paddlepaddle"],
+        "oneflow": ["oneflow==0.9.0"],
+        "pytensor": ["pytensor"],
+    }
+
+    usage = f"""
+    Usage:   python -m einops.tests.run_tests <frameworks> [--pip-install]
+    Example: python -m einops.tests.run_tests numpy pytorch --pip-install
+
+    Available frameworks: {list(framework_name2installation)}
+    When --pip-install is set, auto-installs requirements with pip.
+     (make sure which pip points to right pip)
+    """
+    if not frameworks:
+        print(usage)
+        return
+
+    synonyms = {"tf": "tensorflow", "pytorch": "torch", "paddlepaddle": "paddle"}
+    frameworks = [synonyms.get(f, f) for f in frameworks]
+    wrong_frameworks = [f for f in frameworks if f not in framework_name2installation]
+    if wrong_frameworks:
+        print(usage)
+        raise RuntimeError(f"Unrecognized frameworks: {wrong_frameworks}")
+
+    if pip_install_is_set:
+        # explicit checks instead of `assert`: asserts vanish under `python -O`,
+        #  which would skip the installation commands altogether
+        print("Install testing infra")
+        if run("pip install pytest --progress-bar off -q") != 0:
+            raise RuntimeError("pip install failed for: pytest")
+
+        for framework in frameworks:
+            print(f"Installing {framework}")
+            pip_instructions = " ".join(framework_name2installation[framework])
+            if run(f"pip install {pip_instructions} --progress-bar off -q") != 0:
+                raise RuntimeError(f"pip install failed for: {pip_instructions}")
+
+    # the testing script learns which frameworks to use from the envvar EINOPS_TEST_BACKENDS
+    from einops.tests import unparse_backends
+
+    envvar_name, envvar_value = unparse_backends(backend_names=frameworks)
+    return_code = run("python -m pytest .", **{envvar_name: envvar_value})
+    if return_code != 0:
+        raise RuntimeError(f"pytest failed with exit code {return_code}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/test_einsum.py b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/test_einsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..873af2b43cd27e8059da5d3989056c2601f1abc1
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/test_einsum.py
@@ -0,0 +1,352 @@
+from typing import Any, Callable
+from einops.tests import collect_test_backends
+from einops.einops import _compactify_pattern_for_einsum, einsum, EinopsError
+import numpy as np
+import pytest
+import string
+
+
+class Arguments:  # stores call arguments so they can be replayed on any callable
+    def __init__(self, *args: Any, **kargs: Any):
+        self.args, self.kwargs = args, kargs
+
+    def __call__(self, function: Callable):
+        positional, named = self.args, self.kwargs
+        return function(*positional, **named)
+
+
+test_layer_cases = [  # each entry: (Arguments used to build an EinMix layer, input shape, expected output shape)
+    (
+        Arguments("b c_in h w -> w c_out h b", "c_in c_out", bias_shape=None, c_out=13, c_in=12),
+        (2, 12, 3, 4),
+        (4, 13, 3, 2),
+    ),
+    (
+        Arguments("b c_in h w -> w c_out h b", "c_in c_out", bias_shape="c_out", c_out=13, c_in=12),
+        (2, 12, 3, 4),
+        (4, 13, 3, 2),
+    ),
+    (
+        Arguments("b c_in h w -> w c_in h b", "", bias_shape=None, c_in=12),
+        (2, 12, 3, 4),
+        (4, 12, 3, 2),
+    ),
+    (
+        Arguments("b c_in h w -> b c_out", "c_in h w c_out", bias_shape=None, c_in=12, h=3, w=4, c_out=5),
+        (2, 12, 3, 4),
+        (2, 5),
+    ),
+    (
+        Arguments("b t head c_in -> b t head c_out", "head c_in c_out", bias_shape=None, head=4, c_in=5, c_out=6),
+        (2, 3, 4, 5),
+        (2, 3, 4, 6),
+    ),
+]
+
+
+# Each of the form:
+# (einops_pattern, true_einsum_pattern, in_shapes, out_shape); in_shapes holds one shape tuple per input tensor
+test_functional_cases = [
+    (
+        # Basic:
+        "b c h w, b w -> b h",
+        "abcd,ad->ac",
+        ((2, 3, 4, 5), (2, 5)),
+        (2, 4),
+    ),
+    (
+        # Three tensors:
+        "b c h w, b w, b c -> b h",
+        "abcd,ad,ab->ac",
+        ((2, 3, 40, 5), (2, 5), (2, 3)),
+        (2, 40),
+    ),
+    (
+        # Ellipsis, and full names:
+        "... one two three, three four five -> ... two five",
+        "...abc,cde->...be",
+        ((32, 5, 2, 3, 4), (4, 5, 6)),
+        (32, 5, 3, 6),
+    ),
+    (
+        # Ellipsis at the end:
+        "one two three ..., three four five -> two five ...",
+        "abc...,cde->be...",
+        ((2, 3, 4, 32, 5), (4, 5, 6)),
+        (3, 6, 32, 5),
+    ),
+    (
+        # Ellipsis on multiple tensors:
+        "... one two three, ... three four five -> ... two five",
+        "...abc,...cde->...be",
+        ((32, 5, 2, 3, 4), (32, 5, 4, 5, 6)),
+        (32, 5, 3, 6),
+    ),
+    (
+        # One tensor, and underscores:
+        "first_tensor second_tensor -> first_tensor",
+        "ab->a",
+        ((5, 4),),
+        (5,),
+    ),
+    (
+        # Trace (repeated index)
+        "i i -> ",
+        "aa->",
+        ((5, 5),),
+        (),
+    ),
+    (
+        # Too many spaces in string:
+        " one  two  ,  three four->two  four  ",
+        "ab,cd->bd",
+        ((2, 3), (4, 5)),
+        (3, 5),
+    ),
+    # The following tests were inspired by numpy's einsum tests
+    # https://github.com/numpy/numpy/blob/v1.23.0/numpy/core/tests/test_einsum.py
+    (
+        # Trace with other indices
+        "i middle i -> middle",
+        "aba->b",
+        ((5, 10, 5),),
+        (10,),
+    ),
+    (
+        # Ellipsis in the middle:
+        "i ... i -> ...",
+        "a...a->...",
+        ((5, 3, 2, 1, 4, 5),),
+        (3, 2, 1, 4),
+    ),
+    (
+        # Product of first and last axes:
+        "i ... i -> i ...",
+        "a...a->a...",
+        ((5, 3, 2, 1, 4, 5),),
+        (5, 3, 2, 1, 4),
+    ),
+    (
+        # Triple diagonal
+        "one one one -> one",
+        "aaa->a",
+        ((5, 5, 5),),
+        (5,),
+    ),
+    (
+        # Axis swap:
+        "i j k -> j i k",
+        "abc->bac",
+        ((1, 2, 3),),
+        (2, 1, 3),
+    ),
+    (
+        # Identity:
+        "... -> ...",
+        "...->...",
+        ((5, 4, 3, 2, 1),),
+        (5, 4, 3, 2, 1),
+    ),
+    (
+        # Elementwise product of three tensors
+        "..., ..., ... -> ...",
+        "...,...,...->...",
+        ((3, 2), (3, 2), (3, 2)),
+        (3, 2),
+    ),
+    (
+        # Basic summation:
+        "index ->",
+        "a->",
+        ((10,),),
+        (),
+    ),
+]
+
+
+def test_layer():  # EinMix layers must map every sample input to the expected output shape
+    for backend in collect_test_backends(layers=True, symbolic=False):
+        if backend.framework_name not in ["tensorflow", "torch", "oneflow", "paddle"]:
+            continue
+        einmix_type = backend.layers().EinMix
+        for make_layer, in_shape, out_shape in test_layer_cases:
+            layer = make_layer(einmix_type)
+            print("Running", layer.einsum_pattern, "for", backend.framework_name)
+            sample = np.random.uniform(size=in_shape).astype("float32")
+            framework_result = layer(backend.from_numpy(sample))
+            result = backend.to_numpy(framework_result)
+            assert result.shape == out_shape
+
+
+valid_backends_functional = [  # framework names for which the functional einsum tests are run
+    "tensorflow",
+    "torch",
+    "jax",
+    "numpy",
+    "oneflow",
+    "cupy",
+    "tensorflow.keras",
+    "paddle",
+    "pytensor",
+]
+
+
+def test_functional():
+    # einops.einsum and backend.einsum must both reproduce numpy's einsum on imperative backends
+    tested_backends = [b for b in collect_test_backends() if b.framework_name in valid_backends_functional]
+    for backend in tested_backends:
+        for einops_pattern, reference_pattern, in_shapes, out_shape in test_functional_cases:
+            print(f"Running '{einops_pattern}' for {backend.framework_name}")
+
+            # the compact pattern derived from the einops pattern has to match the reference one
+            compact_pattern = _compactify_pattern_for_einsum(einops_pattern)
+            assert compact_pattern == reference_pattern
+
+            # deterministic inputs: the generator is re-seeded for every case
+            generator = np.random.RandomState(0)
+            numpy_inputs = [generator.uniform(size=shape).astype("float32") for shape in in_shapes]
+            native_inputs = [backend.from_numpy(array) for array in numpy_inputs]
+
+            # first invoke the backend's einsum directly with the compact pattern,
+            # then go through `einops.einsum` with the original pattern
+            computations = (
+                lambda: backend.einsum(compact_pattern, *native_inputs),
+                lambda: einsum(*native_inputs, einops_pattern),
+            )
+            for compute in computations:
+                native_output = compute()
+
+                # shape check
+                if tuple(native_output.shape) != out_shape:
+                    raise ValueError(f"Expected output shape {out_shape} but got {native_output.shape}")
+
+                # value check against numpy's einsum
+                expected = np.einsum(reference_pattern, *numpy_inputs)
+                actual = backend.to_numpy(native_output)
+                np.testing.assert_array_almost_equal(actual, expected, decimal=5)
+
+
+def test_functional_symbolic():
+    # symbolic backends: build the einsum output from symbols, evaluate it and compare with numpy
+    candidates = collect_test_backends(symbolic=True, layers=False)
+    tested_backends = [b for b in candidates if b.framework_name in valid_backends_functional]
+    for backend in tested_backends:
+        for einops_pattern, reference_pattern, in_shapes, out_shape in test_functional_cases:
+            print(f"Running '{einops_pattern}' for symbolic {backend.framework_name}")
+            # the compact pattern derived from the einops pattern has to match the reference one
+            compact_pattern = _compactify_pattern_for_einsum(einops_pattern)
+            assert compact_pattern == reference_pattern
+
+            # symbols stand for the inputs, arrays carry the data fed in at evaluation time
+            generator = np.random.RandomState(0)
+            symbols = [backend.create_symbol(in_shape) for in_shape in in_shapes]
+            arrays = [generator.uniform(size=in_shape).astype("float32") for in_shape in in_shapes]
+
+            expected = np.einsum(reference_pattern, *arrays)
+
+            # first the backend's einsum with the compact pattern, then `einops.einsum`
+            graph_builders = (
+                lambda: backend.einsum(compact_pattern, *symbols),
+                lambda: einsum(*symbols, einops_pattern),
+            )
+            for build_graph in graph_builders:
+                output_symbol = build_graph()
+
+                evaluated = backend.eval_symbol(output_symbol, list(zip(symbols, arrays)))
+                if evaluated.shape != out_shape:
+                    raise ValueError(f"Expected output shape {out_shape} but got {evaluated.shape}")
+                np.testing.assert_array_almost_equal(evaluated, expected, decimal=5)
+
+
+def test_functional_errors():  # each case pins the exception type and the start of its message
+    # Specific backend does not matter, as errors are raised
+    # during the pattern creation.
+
+    rstate = np.random.RandomState(0)
+
+    def create_tensor(*shape):
+        return rstate.uniform(size=shape).astype("float32")
+
+    # raise NotImplementedError("Singleton () axes are not yet supported in einsum.")
+    with pytest.raises(NotImplementedError, match="^Singleton"):
+        einsum(
+            create_tensor(5, 1),
+            "i () -> i",
+        )
+
+    # raise NotImplementedError("Shape rearrangement is not yet supported in einsum.")
+    with pytest.raises(NotImplementedError, match="^Shape rearrangement"):
+        einsum(
+            create_tensor(5, 1),
+            "a b -> (a b)",
+        )
+
+    with pytest.raises(NotImplementedError, match="^Shape rearrangement"):
+        einsum(
+            create_tensor(10, 1),
+            "(a b) -> a b",
+        )
+
+    # raise RuntimeError("Encountered empty axis name in einsum.")
+    # raise RuntimeError("Axis name in einsum must be a string.")
+    # ^ Not tested, these are just a failsafe in case an unexpected error occurs.
+
+    # raise NotImplementedError("Anonymous axes are not yet supported in einsum.")
+    with pytest.raises(NotImplementedError, match="^Anonymous axes"):
+        einsum(
+            create_tensor(5, 1),
+            "i 2 -> i",
+        )
+
+    # ParsedExpression error:
+    with pytest.raises(EinopsError, match="^Invalid axis identifier"):
+        einsum(
+            create_tensor(5, 1),
+            "i 2j -> i",
+        )
+
+    # raise ValueError("Einsum pattern must contain '->'.")
+    with pytest.raises(ValueError, match="^Einsum pattern"):
+        einsum(
+            create_tensor(5, 3, 2),
+            "i j k",
+        )
+
+    # raise RuntimeError("Too many axes in einsum.")
+    with pytest.raises(RuntimeError, match="^Too many axes"):
+        einsum(
+            create_tensor(1),
+            " ".join(string.ascii_letters) + " extra ->",  # one more axis name than there are ASCII letters
+        )
+
+    # raise RuntimeError("Unknown axis on right side of einsum.")
+    with pytest.raises(RuntimeError, match="^Unknown axis"):
+        einsum(
+            create_tensor(5, 1),
+            "i j -> k",
+        )
+
+    # raise ValueError(
+    # "The last argument passed to `einops.einsum` must be a string,"
+    # " representing the einsum pattern."
+    # )
+    with pytest.raises(ValueError, match="^The last argument"):
+        einsum(
+            "i j k -> i",
+            create_tensor(5, 4, 3),
+        )
+
+    # raise ValueError(
+    #     "`einops.einsum` takes at minimum two arguments: the tensors,"
+    #     " followed by the pattern."
+    # )
+    with pytest.raises(ValueError, match="^`einops.einsum` takes"):
+        einsum(
+            "i j k -> i",
+        )
+    with pytest.raises(ValueError, match="^`einops.einsum` takes"):
+        einsum(
+            create_tensor(5, 1),
+        )
+
+    # TODO: Include check for giving normal einsum pattern rather than einops.
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/test_examples.py b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/test_examples.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d28d5b8b5d3320429047dc252a04b0ec7c95c33
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/test_examples.py
@@ -0,0 +1,297 @@
+import numpy
+import pytest
+
+from einops import rearrange, parse_shape, reduce
+from einops.tests import is_backend_tested
+from einops.tests.test_ops import imp_op_backends
+
+
+def test_rearrange_examples():  # each example must give equal results on numpy and on every imperative backend
+    def test1(x):
+        # transpose
+        y = rearrange(x, "b c h w -> b h w c")
+        assert tuple(y.shape) == (10, 30, 40, 20)
+        return y
+
+    def test2(x):
+        # view / reshape
+        y = rearrange(x, "b c h w -> b (c h w)")
+        assert tuple(y.shape) == (10, 20 * 30 * 40)
+        return y
+
+    def test3(x):
+        # depth-to-space
+        y = rearrange(x, "b (c h1 w1) h w -> b c (h h1) (w w1)", h1=2, w1=2)
+        assert tuple(y.shape) == (10, 5, 30 * 2, 40 * 2)
+        return y
+
+    def test4(x):
+        # space-to-depth
+        y = rearrange(x, "b c (h h1) (w w1) -> b (h1 w1 c) h w", h1=2, w1=2)
+        assert tuple(y.shape) == (10, 20 * 4, 30 // 2, 40 // 2)
+        return y
+
+    def test5(x):
+        # simple transposition
+        y = rearrange(x, "b1 sound b2 letter -> b1 b2 sound letter")
+        assert tuple(y.shape) == (10, 30, 20, 40)
+        return y
+
+    def test6(x):
+        # parsing parameters
+        t = rearrange(x, "b c h w -> (b h w) c")
+        t = t[:, ::2]  # replacement for dot-product, just changes size of second axis
+        assert tuple(t.shape) == (10 * 30 * 40, 10)
+
+        y = rearrange(t, "(b h w) c2 -> b c2 h w", **parse_shape(x, "b _ h w"))
+        assert tuple(y.shape) == (10, 10, 30, 40)
+        return y
+
+    def test7(x):
+        # split of embedding into groups
+        y1, y2 = rearrange(x, "b (c g) h w -> g b c h w", g=2)
+        assert tuple(y1.shape) == (10, 10, 30, 40)
+        assert tuple(y2.shape) == (10, 10, 30, 40)
+        return y1 + y2  # only one tensor is expected in output
+
+    def test8(x):
+        # max-pooling
+        y = reduce(x, "b c (h h1) (w w1) -> b c h w", reduction="max", h1=2, w1=2)
+        assert tuple(y.shape) == (10, 20, 30 // 2, 40 // 2)
+        return y
+
+    def test9(x):
+        # squeeze - unsqueeze
+        y = reduce(x, "b c h w -> b c () ()", reduction="max")
+        assert tuple(y.shape) == (10, 20, 1, 1)
+        y = rearrange(y, "b c () () -> c b")
+        assert tuple(y.shape) == (20, 10)
+        return y
+
+    def test10(x):
+        # stack
+        tensors = list(x + 0)  # 0 is needed https://github.com/tensorflow/tensorflow/issues/23185
+        tensors = rearrange(tensors, "b c h w -> b h w c")
+        assert tuple(tensors.shape) == (10, 30, 40, 20)
+        return tensors
+
+    def test11(x):
+        # concatenate
+        tensors = list(x + 0)  # 0 is needed https://github.com/tensorflow/tensorflow/issues/23185
+        tensors = rearrange(tensors, "b c h w -> h (b w) c")
+        assert tuple(tensors.shape) == (30, 10 * 40, 20)
+        return tensors
+
+    def shufflenet(x, convolve, c1, c2):
+        # shufflenet reordering example
+        x = convolve(x)
+        x = rearrange(x, "b (c1 c2) h w-> b (c2 c1) h w", c1=c1, c2=c2)
+        x = convolve(x)
+        return x
+
+    def convolve_strided_1d(x, stride, usual_convolution):
+        x = rearrange(x, "b c t1 t2 -> b c (t1 t2)")  # reduce dimensionality
+        x = rearrange(x, "b c (t stride) -> (stride b) c t", stride=stride)
+        x = usual_convolution(x)
+        x = rearrange(x, "(stride b) c t -> b c (t stride)", stride=stride)
+        return x
+
+    def convolve_strided_2d(x, h_stride, w_stride, usual_convolution):
+        x = rearrange(x, "b c (h hs) (w ws) -> (hs ws b) c h w", hs=h_stride, ws=w_stride)
+        x = usual_convolution(x)
+        x = rearrange(x, "(hs ws b) c h w -> b c (h hs) (w ws)", hs=h_stride, ws=w_stride)
+        return x
+
+    def unet_like_1d(x, usual_convolution):
+        # u-net like steps for increasing / reducing dimensionality
+        x = rearrange(x, "b c t1 t2 -> b c (t1 t2)")  # reduce dimensionality
+        y = rearrange(x, "b c (t dt) -> b (dt c) t", dt=2)
+        y = usual_convolution(y)
+        x = x + rearrange(y, "b (dt c) t -> b c (t dt)", dt=2)
+        return x
+
+    # mock for convolution (works for all backends)
+    def convolve_mock(x):
+        return x
+
+    tests = [
+        test1,
+        test2,
+        test3,
+        test4,
+        test5,
+        test6,
+        test7,
+        test8,
+        test9,
+        test10,
+        test11,
+        lambda x: shufflenet(x, convolve=convolve_mock, c1=4, c2=5),
+        lambda x: convolve_strided_1d(x, stride=2, usual_convolution=convolve_mock),
+        lambda x: convolve_strided_2d(x, h_stride=2, w_stride=2, usual_convolution=convolve_mock),
+        lambda x: unet_like_1d(x, usual_convolution=convolve_mock),
+    ]
+
+    for backend in imp_op_backends:
+        print("testing source_examples for ", backend.framework_name)
+        for test in tests:
+            x = numpy.arange(10 * 20 * 30 * 40).reshape([10, 20, 30, 40])
+            result1 = test(x)  # reference result computed directly on the numpy array
+            result2 = backend.to_numpy(test(backend.from_numpy(x)))  # same example run through the backend
+            assert numpy.array_equal(result1, result2)
+
+            # now with strides: slice a larger array with non-unit steps down to the same (10, 20, 30, 40) shape
+            x = numpy.arange(10 * 2 * 20 * 3 * 30 * 1 * 40).reshape([10 * 2, 20 * 3, 30 * 1, 40 * 1])
+            # known torch bug - torch doesn't support negative steps (oneflow gets the same positive step below)
+            last_step = -1 if (backend.framework_name != "torch" and backend.framework_name != "oneflow") else 1
+            indexing_expression = numpy.index_exp[::2, ::3, ::1, ::last_step]
+            result1 = test(x[indexing_expression])
+            result2 = backend.to_numpy(test(backend.from_numpy(x)[indexing_expression]))
+            assert numpy.array_equal(result1, result2)
+
+
+def tensor_train_example_numpy():  # not collected by pytest (no test_ prefix); compares three equivalent contractions
+    # kept here just for a collection, only tested for numpy
+    # https://arxiv.org/pdf/1509.06569.pdf, (5)
+    x = numpy.ones([3, 4, 5, 6])
+    rank = 4
+    if tuple(int(p) for p in numpy.__version__.split(".")[:2]) < (1, 15):  # numeric compare; as strings "1.9.0" > "1.15.0"
+        # numpy.einsum fails here, skip test
+        return
+    # creating appropriate Gs
+    Gs = [numpy.ones([d, d, rank, rank]) for d in x.shape]
+    Gs[0] = Gs[0][:, :, :1, :]
+    Gs[-1] = Gs[-1][:, :, :, :1]
+
+    # einsum way
+    y = x.reshape((1,) + x.shape)
+    for G in Gs:
+        # taking partial results left-to-right
+        # y = numpy.einsum('i j alpha beta, alpha i ...  -> beta ... j', G, y)
+        y = numpy.einsum("i j a b, a i ...  -> b ... j", G, y)
+    y1 = y.reshape(-1)
+
+    # alternative way
+    y = x.reshape(-1)
+    for G in Gs:
+        i, j, alpha, beta = G.shape
+        y = rearrange(y, "(i rest alpha) -> rest (alpha i)", alpha=alpha, i=i)
+        y = y @ rearrange(G, "i j alpha beta -> (alpha i) (j beta)")
+        y = rearrange(y, "rest (beta j) -> (beta rest j)", beta=beta, j=j)
+    y2 = y
+    assert numpy.allclose(y1, y2)
+
+    # yet another way
+    y = x
+    for G in Gs:
+        i, j, alpha, beta = G.shape
+        y = rearrange(y, "i ... (j alpha) -> ... j (alpha i)", alpha=alpha, i=i)
+        y = y @ rearrange(G, "i j alpha beta -> (alpha i) (j beta)")
+    y3 = y.reshape(-1)
+    assert numpy.allclose(y1, y3)
+
+
+def test_pytorch_yolo_fragment():  # checks that an einops rewrite of a YOLO decoding fragment matches the original code
+    if not is_backend_tested("torch"):
+        pytest.skip()
+
+    import torch
+
+    def old_way(input, num_classes, num_anchors, anchors, stride_h, stride_w):
+        # https://github.com/BobLiu20/YOLOv3_PyTorch/blob/c6b483743598b5f64d520d81e7e5f47ba936d4c9/nets/yolo_loss.py#L28-L44
+        bs = input.size(0)
+        in_h = input.size(2)
+        in_w = input.size(3)
+        scaled_anchors = [(a_w / stride_w, a_h / stride_h) for a_w, a_h in anchors]
+
+        prediction = input.view(bs, num_anchors, 5 + num_classes, in_h, in_w).permute(0, 1, 3, 4, 2).contiguous()
+        # Get outputs
+        x = torch.sigmoid(prediction[..., 0])  # Center x
+        y = torch.sigmoid(prediction[..., 1])  # Center y
+        w = prediction[..., 2]  # Width
+        h = prediction[..., 3]  # Height
+        conf = torch.sigmoid(prediction[..., 4])  # Conf
+        pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred.
+
+        # https://github.com/BobLiu20/YOLOv3_PyTorch/blob/c6b483743598b5f64d520d81e7e5f47ba936d4c9/nets/yolo_loss.py#L70-L92
+        FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
+        LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
+        # Calculate offsets for each grid
+        grid_x = (
+            torch.linspace(0, in_w - 1, in_w)
+            .repeat(in_w, 1)
+            .repeat(bs * num_anchors, 1, 1)
+            .view(x.shape)
+            .type(FloatTensor)
+        )
+        grid_y = (
+            torch.linspace(0, in_h - 1, in_h)
+            .repeat(in_h, 1)
+            .t()
+            .repeat(bs * num_anchors, 1, 1)
+            .view(y.shape)
+            .type(FloatTensor)
+        )
+        # Calculate anchor w, h
+        anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0]))
+        anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1]))
+        anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(w.shape)
+        anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(h.shape)
+        # Add offset and scale with anchors
+        pred_boxes = FloatTensor(prediction[..., :4].shape)
+        pred_boxes[..., 0] = x.data + grid_x
+        pred_boxes[..., 1] = y.data + grid_y
+        pred_boxes[..., 2] = torch.exp(w.data) * anchor_w
+        pred_boxes[..., 3] = torch.exp(h.data) * anchor_h
+        # Results
+        _scale = torch.Tensor([stride_w, stride_h] * 2).type(FloatTensor)
+        output = torch.cat(
+            (pred_boxes.view(bs, -1, 4) * _scale, conf.view(bs, -1, 1), pred_cls.view(bs, -1, num_classes)), -1
+        )
+        return output
+
+    def new_way(input, num_classes, num_anchors, anchors, stride_h, stride_w):  # einops-based rewrite of old_way
+        raw_predictions = rearrange(input, " b (anchor prediction) h w -> prediction b anchor h w", anchor=num_anchors)
+
+        anchors = torch.FloatTensor(anchors).to(input.device)
+        anchor_sizes = rearrange(anchors, "anchor dim -> dim () anchor () ()")
+
+        _, _, _, in_h, in_w = raw_predictions.shape
+        grid_h = rearrange(torch.arange(in_h).float(), "h -> () () h ()").to(input.device)
+        grid_w = rearrange(torch.arange(in_w).float(), "w -> () () () w").to(input.device)
+
+        predicted_bboxes = torch.zeros_like(raw_predictions)
+        # NOTE(review): old_way adds grid_x and scales by stride_w at index 0 (grid_y/stride_h at 1) — confirm axis order
+        predicted_bboxes[0] = (raw_predictions[0].sigmoid() + grid_h) * stride_h  # center y
+        predicted_bboxes[1] = (raw_predictions[1].sigmoid() + grid_w) * stride_w  # center x
+        predicted_bboxes[2:4] = (raw_predictions[2:4].exp()) * anchor_sizes  # bbox width and height
+        predicted_bboxes[4] = raw_predictions[4].sigmoid()  # confidence
+        predicted_bboxes[5:] = raw_predictions[5:].sigmoid()  # class predictions
+        # only to match results of original code, not needed
+        return rearrange(predicted_bboxes, "prediction b anchor h w -> b anchor h w prediction")
+
+    stride_h = 4
+    stride_w = 4
+    batch_size = 5
+    num_classes = 12
+    anchors = [[50, 100], [100, 50], [75, 75]]
+    num_anchors = len(anchors)
+
+    input = torch.randn([batch_size, num_anchors * (5 + num_classes), 1, 1])  # 1x1 spatial grid: every grid offset is 0
+    result1 = old_way(
+        input=input,
+        num_anchors=num_anchors,
+        num_classes=num_classes,
+        stride_h=stride_h,
+        stride_w=stride_w,
+        anchors=anchors,
+    )
+    result2 = new_way(
+        input=input,
+        num_anchors=num_anchors,
+        num_classes=num_classes,
+        stride_h=stride_h,
+        stride_w=stride_w,
+        anchors=anchors,
+    )
+    result1 = result1.reshape(result2.shape)  # old_way returns a flattened (bs, -1, features) tensor
+    assert torch.allclose(result1, result2)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/test_layers.py b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/test_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..a70a5d67af54e1c32a30d95f9a23f7b924e15a8a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/test_layers.py
@@ -0,0 +1,469 @@
+import pickle
+from collections import namedtuple
+
+import numpy
+import pytest
+
+from einops import rearrange, reduce, EinopsError
+from einops.tests import collect_test_backends, is_backend_tested, FLOAT_REDUCTIONS as REDUCTIONS
+
+__author__ = "Alex Rogozhnikov"
+
+testcase = namedtuple("testcase", ["pattern", "axes_lengths", "input_shape", "wrong_shapes"])  # one layer test case
+
+rearrangement_patterns = [  # wrong_shapes: input shapes for which calling the layer is expected to raise
+    testcase(
+        "b c h w -> b (c h w)",
+        dict(c=20),
+        (10, 20, 30, 40),
+        [(), (10,), (10, 10, 10), (10, 21, 30, 40), [1, 20, 1, 1, 1]],
+    ),
+    testcase(
+        "b c (h1 h2) (w1 w2) -> b (c h2 w2) h1 w1",
+        dict(h2=2, w2=2),
+        (10, 20, 30, 40),
+        [(), (1, 1, 1, 1), (1, 10, 3), ()],
+    ),
+    testcase(
+        "b ... c -> c b ...",
+        dict(b=10),
+        (10, 20, 30),
+        [(), (10,), (5, 10)],
+    ),
+]
+
+
+def test_rearrange_imperative():  # Rearrange layer on imperative backends: bad shapes, pickling, gradients
+    for backend in collect_test_backends(symbolic=False, layers=True):
+        print("Test layer for ", backend.framework_name)
+
+        for pattern, axes_lengths, input_shape, wrong_shapes in rearrangement_patterns:
+            x = numpy.arange(numpy.prod(input_shape), dtype="float32").reshape(input_shape)
+            result_numpy = rearrange(x, pattern, **axes_lengths)
+            layer = backend.layers().Rearrange(pattern, **axes_lengths)
+            for shape in wrong_shapes:
+                try:
+                    layer(backend.from_numpy(numpy.zeros(shape, dtype="float32")))
+                except Exception:  # not BaseException: Ctrl-C / SystemExit must not pass as an expected failure
+                    pass
+                else:
+                    raise AssertionError("Failure expected")
+
+            # simple pickling / unpickling
+            layer2 = pickle.loads(pickle.dumps(layer))
+            result1 = backend.to_numpy(layer(backend.from_numpy(x)))
+            result2 = backend.to_numpy(layer2(backend.from_numpy(x)))
+            assert numpy.allclose(result_numpy, result1)
+            assert numpy.allclose(result1, result2)
+
+            just_sum = backend.layers().Reduce("...->", reduction="sum")  # reduces all axes to a scalar
+
+            variable = backend.from_numpy(x)
+            result = just_sum(layer(variable))
+
+            result.backward()
+            assert numpy.allclose(backend.to_numpy(variable.grad), 1)  # every element contributes once to the sum
+
+
+def test_rearrange_symbolic():  # Rearrange layer on symbolic backends, with known and with all-None input shapes
+    for backend in collect_test_backends(symbolic=True, layers=True):
+        print("Test layer for ", backend.framework_name)
+
+        for pattern, axes_lengths, input_shape, wrong_shapes in rearrangement_patterns:
+            x = numpy.arange(numpy.prod(input_shape), dtype="float32").reshape(input_shape)
+            result_numpy = rearrange(x, pattern, **axes_lengths)
+            layer = backend.layers().Rearrange(pattern, **axes_lengths)
+            input_shape_of_nones = [None] * len(input_shape)
+            shapes = [input_shape, input_shape_of_nones]
+
+            for shape in shapes:
+                symbol = backend.create_symbol(shape)
+                eval_inputs = [(symbol, x)]
+
+                result_symbol1 = layer(symbol)
+                result1 = backend.eval_symbol(result_symbol1, eval_inputs)
+                assert numpy.allclose(result_numpy, result1)
+
+                layer2 = pickle.loads(pickle.dumps(layer))  # the unpickled layer must behave like the original
+                result_symbol2 = layer2(symbol)
+                result2 = backend.eval_symbol(result_symbol2, eval_inputs)
+                assert numpy.allclose(result1, result2)
+
+                # stack a full-sum Reduce on the layer output and compare with numpy.sum (no gradient is computed here)
+                just_sum = backend.layers().Reduce("...->", reduction="sum")
+
+                result_sum1 = backend.eval_symbol(just_sum(result_symbol1), eval_inputs)
+                result_sum2 = numpy.sum(x)
+
+                assert numpy.allclose(result_sum1, result_sum2)
+
+
+reduction_patterns = rearrangement_patterns + [  # rearrangement cases plus patterns that drop axes
+    testcase("b c h w -> b ()", dict(b=10), (10, 20, 30, 40), [(10,), (10, 20, 30)]),
+    testcase("b c (h1 h2) (w1 w2) -> b c h1 w1", dict(h1=15, h2=2, w2=2), (10, 20, 30, 40), [(10, 20, 31, 40)]),
+    testcase("b ... c -> b", dict(b=10), (10, 20, 30, 40), [(10,), (11, 10)]),
+]
+
+
+def test_reduce_imperative():  # Reduce layer on imperative backends: bad shapes, pickling, gradient structure
+    for backend in collect_test_backends(symbolic=False, layers=True):
+        print("Test layer for ", backend.framework_name)
+        for reduction in REDUCTIONS:
+            for pattern, axes_lengths, input_shape, wrong_shapes in reduction_patterns:
+                print(backend, reduction, pattern, axes_lengths, input_shape, wrong_shapes)
+                x = numpy.arange(1, 1 + numpy.prod(input_shape), dtype="float32").reshape(input_shape)
+                x /= x.mean()  # rescale so that the mean of the input is 1
+                result_numpy = reduce(x, pattern, reduction, **axes_lengths)
+                layer = backend.layers().Reduce(pattern, reduction, **axes_lengths)
+                for shape in wrong_shapes:
+                    try:
+                        layer(backend.from_numpy(numpy.zeros(shape, dtype="float32")))
+                    except Exception:  # not BaseException: Ctrl-C / SystemExit must not pass as an expected failure
+                        pass
+                    else:
+                        raise AssertionError("Failure expected")
+
+                # simple pickling / unpickling
+                layer2 = pickle.loads(pickle.dumps(layer))
+                result1 = backend.to_numpy(layer(backend.from_numpy(x)))
+                result2 = backend.to_numpy(layer2(backend.from_numpy(x)))
+                assert numpy.allclose(result_numpy, result1)
+                assert numpy.allclose(result1, result2)
+
+                just_sum = backend.layers().Reduce("...->", reduction="sum")
+
+                variable = backend.from_numpy(x)
+                result = just_sum(layer(variable))
+
+                result.backward()
+                grad = backend.to_numpy(variable.grad)
+                if reduction == "sum":
+                    assert numpy.allclose(grad, 1)
+                if reduction == "mean":
+                    assert numpy.allclose(grad, grad.min())  # all gradient entries are equal
+                if reduction in ["max", "min"]:
+                    assert numpy.all(numpy.isin(grad, [0, 1]))  # isin replaces the deprecated numpy.in1d
+                    assert numpy.sum(grad) > 0.5
+
+
+def test_reduce_symbolic():  # Reduce layer on symbolic backends, with known and with all-None input shapes
+    for backend in collect_test_backends(symbolic=True, layers=True):
+        print("Test layer for ", backend.framework_name)
+        for reduction in REDUCTIONS:
+            for pattern, axes_lengths, input_shape, wrong_shapes in reduction_patterns:
+                x = numpy.arange(1, 1 + numpy.prod(input_shape), dtype="float32").reshape(input_shape)
+                x /= x.mean()  # rescale so that the mean of the input is 1
+                result_numpy = reduce(x, pattern, reduction, **axes_lengths)
+                layer = backend.layers().Reduce(pattern, reduction, **axes_lengths)
+                input_shape_of_nones = [None] * len(input_shape)
+                shapes = [input_shape, input_shape_of_nones]
+
+                for shape in shapes:
+                    symbol = backend.create_symbol(shape)
+                    eval_inputs = [(symbol, x)]
+
+                    result_symbol1 = layer(symbol)
+                    result1 = backend.eval_symbol(result_symbol1, eval_inputs)
+                    assert numpy.allclose(result_numpy, result1)
+
+                    layer2 = pickle.loads(pickle.dumps(layer))  # the unpickled layer must behave like the original
+                    result_symbol2 = layer2(symbol)
+                    result2 = backend.eval_symbol(result_symbol2, eval_inputs)
+                    assert numpy.allclose(result1, result2)
+
+
+def create_torch_model(use_reduce=False, add_scripted_layer=False):  # conv net mixing torch.nn and einops layers
+    if not is_backend_tested("torch"):
+        pytest.skip()
+    else:
+        from torch.nn import Sequential, Conv2d, MaxPool2d, Linear, ReLU
+        from einops.layers.torch import Rearrange, Reduce, EinMix
+        import torch.jit
+
+        return Sequential(
+            Conv2d(3, 6, kernel_size=(5, 5)),
+            Reduce("b c (h h2) (w w2) -> b c h w", "max", h2=2, w2=2) if use_reduce else MaxPool2d(kernel_size=2),
+            Conv2d(6, 16, kernel_size=(5, 5)),
+            Reduce("b c (h h2) (w w2) -> b c h w", "max", h2=2, w2=2),
+            torch.jit.script(Rearrange("b c h w -> b (c h w)"))  # optionally use an already-scripted layer
+            if add_scripted_layer
+            else Rearrange("b c h w -> b (c h w)"),
+            Linear(16 * 5 * 5, 120),
+            ReLU(),
+            Linear(120, 84),
+            ReLU(),
+            EinMix("b c1 -> (b c2)", weight_shape="c1 c2", bias_shape="c2", c1=84, c2=84),
+            EinMix("(b c2) -> b c3", weight_shape="c2 c3", bias_shape="c3", c2=84, c3=84),
+            Linear(84, 10),
+        )
+
+
+def test_torch_layer():  # Reduce-based and MaxPool2d-based models must agree once they share weights; also traced
+    if not is_backend_tested("torch"):
+        pytest.skip()
+    else:
+        # checked that torch present
+        import torch
+        import torch.jit
+
+        model1 = create_torch_model(use_reduce=True)
+        model2 = create_torch_model(use_reduce=False)
+        input = torch.randn([10, 3, 32, 32])
+        # random models have different predictions
+        assert not torch.allclose(model1(input), model2(input))
+        model2.load_state_dict(pickle.loads(pickle.dumps(model1.state_dict())))  # copy weights via a pickle round-trip
+        assert torch.allclose(model1(input), model2(input))
+
+        # tracing (freezing)
+        model3 = torch.jit.trace(model2, example_inputs=input)
+        torch.testing.assert_close(model1(input), model3(input), atol=1e-3, rtol=1e-3)
+        torch.testing.assert_close(model1(input + 1), model3(input + 1), atol=1e-3, rtol=1e-3)
+
+        model4 = torch.jit.trace(model2, example_inputs=input)  # NOTE(review): same trace call as model3 — confirm intent
+        torch.testing.assert_close(model1(input), model4(input), atol=1e-3, rtol=1e-3)
+        torch.testing.assert_close(model1(input + 1), model4(input + 1), atol=1e-3, rtol=1e-3)
+
+
+def test_torch_layers_scripting():  # a torch.jit.script'ed model must reproduce the outputs of the original model
+    if not is_backend_tested("torch"):
+        pytest.skip()
+    else:
+        import torch
+
+        for script_layer in [False, True]:  # without and with a pre-scripted Rearrange inside the model
+            model1 = create_torch_model(use_reduce=True, add_scripted_layer=script_layer)
+            model2 = torch.jit.script(model1)
+            input = torch.randn([10, 3, 32, 32])
+
+            torch.testing.assert_close(model1(input), model2(input), atol=1e-3, rtol=1e-3)
+
+
+def test_keras_layer():  # Keras layers: full-model save/load, JSON architecture and weights must reproduce predictions
+    if not is_backend_tested("tensorflow"):
+        pytest.skip()
+    else:
+        import tensorflow as tf
+
+        if tuple(int(p) for p in tf.__version__.split(".")[:2]) < (2, 16):  # numeric compare; as strings "2.9" > "2.16."
+            # current implementation of layers follows new TF interface
+            pytest.skip()
+        from tensorflow.keras.models import Sequential
+        from tensorflow.keras.layers import Conv2D as Conv2d, Dense as Linear, ReLU
+        from einops.layers.keras import Rearrange, Reduce, EinMix, keras_custom_objects
+
+        def create_keras_model():
+            return Sequential(
+                [
+                    Conv2d(6, kernel_size=5, input_shape=[32, 32, 3]),
+                    Reduce("b c (h h2) (w w2) -> b c h w", "max", h2=2, w2=2),
+                    Conv2d(16, kernel_size=5),
+                    Reduce("b c (h h2) (w w2) -> b c h w", "max", h2=2, w2=2),
+                    Rearrange("b c h w -> b (c h w)"),
+                    Linear(120),
+                    ReLU(),
+                    Linear(84),
+                    ReLU(),
+                    EinMix("b c1 -> (b c2)", weight_shape="c1 c2", bias_shape="c2", c1=84, c2=84),
+                    EinMix("(b c2) -> b c3", weight_shape="c2 c3", bias_shape="c3", c2=84, c3=84),
+                    Linear(10),
+                ]
+            )
+
+        model1 = create_keras_model()
+        model2 = create_keras_model()
+
+        input = numpy.random.normal(size=[10, 32, 32, 3]).astype("float32")
+        # two randomly init models should provide different outputs
+        assert not numpy.allclose(model1.predict_on_batch(input), model2.predict_on_batch(input))
+
+        # get some temp filename
+        tmp_model_filename = "/tmp/einops_tf_model.h5"
+        # save arch + weights
+        print("temp_path_keras1", tmp_model_filename)
+        tf.keras.models.save_model(model1, tmp_model_filename)
+        model3 = tf.keras.models.load_model(tmp_model_filename, custom_objects=keras_custom_objects)
+
+        numpy.testing.assert_allclose(model1.predict_on_batch(input), model3.predict_on_batch(input))
+
+        weight_filename = "/tmp/einops_tf_model.weights.h5"
+        # save arch as json
+        model4 = tf.keras.models.model_from_json(model1.to_json(), custom_objects=keras_custom_objects)
+        model1.save_weights(weight_filename)
+        model4.load_weights(weight_filename)
+        model2.load_weights(weight_filename)
+        # check that differently-initialized model receives same weights
+        numpy.testing.assert_allclose(model1.predict_on_batch(input), model2.predict_on_batch(input))
+        # ultimate test
+        # save-load architecture, and then load weights - should return same result
+        numpy.testing.assert_allclose(model1.predict_on_batch(input), model4.predict_on_batch(input))
+
+
+def test_flax_layers():
+    """
+    One-off simple tests for Flax layers.
+    Unfortunately, Flax layers have a different interface from other layers.
+    """
+    if not is_backend_tested("jax"):
+        pytest.skip()
+    else:
+        import jax
+        import jax.numpy as jnp
+
+        import flax
+        from flax import linen as nn
+        from einops.layers.flax import EinMix, Reduce, Rearrange
+
+        class NN(nn.Module):
+            @nn.compact
+            def __call__(self, x):
+                x = EinMix(
+                    "b (h h2) (w w2) c -> b h w c_out", "h2 w2 c c_out", "c_out", sizes=dict(h2=2, w2=3, c=4, c_out=5)
+                )(x)
+                x = Rearrange("b h w c -> b (w h c)", sizes=dict(c=5))(x)
+                x = Reduce("b hwc -> b", "mean", dict(hwc=2 * 3 * 5))(x)
+                return x
+
+        model = NN()
+        fixed_input = jnp.ones([10, 2 * 2, 3 * 3, 4])
+        params = model.init(jax.random.PRNGKey(0), fixed_input)
+
+        def eval_at_point(params):
+            return jnp.linalg.norm(model.apply(params, fixed_input))
+
+        vandg = jax.value_and_grad(eval_at_point)
+        value0 = eval_at_point(params)
+        value1, grad1 = vandg(params)
+        assert jnp.allclose(value0, value1)
+
+        params2 = jax.tree_util.tree_map(lambda x1, x2: x1 - x2 * 0.001, params, grad1)  # jax.tree_map is deprecated
+
+        value2 = eval_at_point(params2)  # value after one small step against the gradient
+        assert value0 >= value2, (value0, value2)
+
+        # check serialization
+        fbytes = flax.serialization.to_bytes(params)
+        _loaded = flax.serialization.from_bytes(params, fbytes)  # result unused: only checks the round-trip runs
+
+
+def test_einmix_decomposition():
+    """
+    Testing that einmix correctly decomposes into smaller transformations.
+    """
+    from einops.layers._einmix import _EinmixDebugger
+
+    mixin1 = _EinmixDebugger(  # axis permutation only: no pre/post reshape, no bias
+        "a b c d e -> e d c b a",
+        weight_shape="d a b",
+        d=2, a=3, b=5,
+    )  # fmt: off
+    assert mixin1.pre_reshape_pattern is None
+    assert mixin1.post_reshape_pattern is None
+    assert mixin1.einsum_pattern == "abcde,dab->edcba"
+    assert mixin1.saved_weight_shape == [2, 3, 5]
+    assert mixin1.saved_bias_shape is None
+
+    mixin2 = _EinmixDebugger(  # same pattern with a bias; expected bias shape follows output order e d c b a
+        "a b c d e -> e d c b a",
+        weight_shape="d a b",
+        bias_shape="a b c d e",
+        a=1, b=2, c=3, d=4, e=5,
+    )  # fmt: off
+    assert mixin2.pre_reshape_pattern is None
+    assert mixin2.post_reshape_pattern is None
+    assert mixin2.einsum_pattern == "abcde,dab->edcba"
+    assert mixin2.saved_weight_shape == [4, 1, 2]
+    assert mixin2.saved_bias_shape == [5, 4, 3, 2, 1]
+
+    mixin3 = _EinmixDebugger(  # ellipsis only, with empty weight and bias shapes
+        "... -> ...",
+        weight_shape="",
+        bias_shape="",
+    )  # fmt: off
+    assert mixin3.pre_reshape_pattern is None
+    assert mixin3.post_reshape_pattern is None
+    assert mixin3.einsum_pattern == "...,->..."
+    assert mixin3.saved_weight_shape == []
+    assert mixin3.saved_bias_shape == []
+
+    mixin4 = _EinmixDebugger(  # un-parenthesized ellipsis: still no reshapes expected
+        "b a ...  -> b c ...",
+        weight_shape="b a c",
+        a=1, b=2, c=3,
+    )  # fmt: off
+    assert mixin4.pre_reshape_pattern is None
+    assert mixin4.post_reshape_pattern is None
+    assert mixin4.einsum_pattern == "ba...,bac->bc..."
+    assert mixin4.saved_weight_shape == [2, 1, 3]
+    assert mixin4.saved_bias_shape is None
+
+    mixin5 = _EinmixDebugger(  # composed input axes and "(...)" output: pre- and post-reshape expected
+        "(b a) ... -> b c (...)",
+        weight_shape="b a c",
+        a=1, b=2, c=3,
+    )  # fmt: off
+    assert mixin5.pre_reshape_pattern == "(b a) ... -> b a ..."
+    assert mixin5.pre_reshape_lengths == dict(a=1, b=2)
+    assert mixin5.post_reshape_pattern == "b c ... -> b c (...)"
+    assert mixin5.einsum_pattern == "ba...,bac->bc..."
+    assert mixin5.saved_weight_shape == [2, 1, 3]
+    assert mixin5.saved_bias_shape is None
+
+    mixin6 = _EinmixDebugger(  # composed axes on both sides, with a bias
+        "b ... (a c) -> b ... (a d)",
+        weight_shape="c d",
+        bias_shape="a d",
+        a=1, c=3, d=4,
+    )  # fmt: off
+    assert mixin6.pre_reshape_pattern == "b ... (a c) -> b ... a c"
+    assert mixin6.pre_reshape_lengths == dict(a=1, c=3)
+    assert mixin6.post_reshape_pattern == "b ... a d -> b ... (a d)"
+    assert mixin6.einsum_pattern == "b...ac,cd->b...ad"
+    assert mixin6.saved_weight_shape == [3, 4]
+    assert mixin6.saved_bias_shape == [1, 1, 4]  # (b) a d, ellipsis does not participate
+
+    mixin7 = _EinmixDebugger(  # ellipsis inside a composed output group, with a bias
+        "a ... (b c) -> a (... d b)",
+        weight_shape="c d b",
+        bias_shape="d b",
+        b=2, c=3, d=4,
+    )  # fmt: off
+    assert mixin7.pre_reshape_pattern == "a ... (b c) -> a ... b c"
+    assert mixin7.pre_reshape_lengths == dict(b=2, c=3)
+    assert mixin7.post_reshape_pattern == "a ... d b -> a (... d b)"
+    assert mixin7.einsum_pattern == "a...bc,cdb->a...db"
+    assert mixin7.saved_weight_shape == [3, 4, 2]
+    assert mixin7.saved_bias_shape == [1, 4, 2]  # (a) d b, ellipsis does not participate
+
+
+def test_einmix_restrictions():
+    """
+    Testing that unsupported EinMix configurations raise EinopsError.
+    """
+    from einops.layers._einmix import _EinmixDebugger
+
+    with pytest.raises(EinopsError):
+        _EinmixDebugger(
+            "a b c d e -> e d c b a",
+            weight_shape="d a b",
+            d=2, a=3, # missing b
+        )  # fmt: off
+
+    with pytest.raises(EinopsError):
+        _EinmixDebugger(
+            "a b c d e -> e d c b a",
+            weight_shape="w a b",
+            d=2, a=3, b=1 # missing d  (NOTE(review): d is passed; weight axis "w" is the one without a length)
+        )  # fmt: off
+
+    with pytest.raises(EinopsError):
+        _EinmixDebugger(
+            "(...) a -> ... a",
+            weight_shape="a", a=1, # ellipsis on the left
+        )  # fmt: off
+
+    with pytest.raises(EinopsError):
+        _EinmixDebugger(
+            "(...) a -> a ...",
+            weight_shape="a", a=1, # ellipsis on the right side after bias axis
+            bias_shape='a',
+        )  # fmt: off
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/test_ops.py b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/test_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..3258bd2e71af7f1cb37a9b24d935df6095e61e43
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/test_ops.py
@@ -0,0 +1,651 @@
+import itertools
+
+import numpy
+import numpy as np
+import pytest
+
+from einops import EinopsError
+from einops.einops import rearrange, reduce, repeat, _enumerate_directions
+from einops.tests import collect_test_backends, is_backend_tested, FLOAT_REDUCTIONS as REDUCTIONS
+
+# backends collected with symbolic=False / symbolic=True; layers are not needed for these tests
+imp_op_backends = collect_test_backends(symbolic=False, layers=False)
+sym_op_backends = collect_test_backends(symbolic=True, layers=False)
+
+# patterns for which rearrange must return its input unchanged (tests apply them to a 5-dimensional array)
+identity_patterns = [
+    "...->...",
+    "a b c d e-> a b c d e",
+    "a b c d e ...-> ... a b c d e",
+    "a b c d e ...-> a ... b c d e",
+    "... a b c d e -> ... a b c d e",
+    "a ... e-> a ... e",
+    "a ... -> a ... ",
+    "a ... c d e -> a (...) c d e",
+]
+
+# pairs (explicit axes, same operation written with ellipsis); both rearranges must give equal results
+equivalent_rearrange_patterns = [
+    ("a b c d e -> (a b) c d e", "a b ... -> (a b) ... "),
+    ("a b c d e -> a b (c d) e", "... c d e -> ... (c d) e"),
+    ("a b c d e -> a b c d e", "... -> ... "),
+    ("a b c d e -> (a b c d e)", "... ->  (...)"),
+    ("a b c d e -> b (c d e) a", "a b ... -> b (...) a"),
+    ("a b c d e -> b (a c d) e", "a b ... e -> b (a ...) e"),
+]
+
+# pairs (explicit axes, same operation written with ellipsis); both reductions must give equal results
+equivalent_reduction_patterns = [
+    ("a b c d e -> ", " ... ->  "),
+    ("a b c d e -> (e a)", "a ... e -> (e a)"),
+    ("a b c d e -> d (a e)", " a b c d e ... -> d (a e) "),
+    ("a b c d e -> (a b)", " ... c d e  -> (...) "),
+]
+
+
+def test_collapsed_ellipsis_errors_out():
+    x = numpy.zeros([1, 1, 1, 1, 1])
+    rearrange(x, "a b c d ... ->  a b c ... d")
+    with pytest.raises(EinopsError):
+        rearrange(x, "a b c d (...) ->  a b c ... d")
+
+    rearrange(x, "... ->  (...)")
+    with pytest.raises(EinopsError):
+        rearrange(x, "(...) -> (...)")
+
+
+def test_ellipsis_ops_numpy():
+    x = numpy.arange(2 * 3 * 4 * 5 * 6).reshape([2, 3, 4, 5, 6])
+    for pattern in identity_patterns:
+        assert numpy.array_equal(x, rearrange(x, pattern)), pattern
+
+    for pattern1, pattern2 in equivalent_rearrange_patterns:
+        assert numpy.array_equal(rearrange(x, pattern1), rearrange(x, pattern2))
+
+    for reduction in ["min", "max", "sum"]:
+        for pattern1, pattern2 in equivalent_reduction_patterns:
+            assert numpy.array_equal(reduce(x, pattern1, reduction=reduction), reduce(x, pattern2, reduction=reduction))
+
+    # now just check coincidence with numpy
+    all_rearrange_patterns = [*identity_patterns]
+    for pattern_pairs in equivalent_rearrange_patterns:
+        all_rearrange_patterns.extend(pattern_pairs)
+
+
+def check_op_against_numpy(backend, numpy_input, pattern, axes_lengths, reduction="rearrange", is_symbolic=False):
+    """
+    Helper to test result of operation (rearrange or transpose) against numpy
+    if reduction == 'rearrange', rearrange op is tested, otherwise reduce
+    """
+
+    def operation(x):
+        if reduction == "rearrange":
+            return rearrange(x, pattern, **axes_lengths)
+        else:
+            return reduce(x, pattern, reduction, **axes_lengths)
+
+    numpy_result = operation(numpy_input)
+    check_equal = numpy.array_equal
+    p_none_dimension = 0.5
+    if is_symbolic:
+        symbol_shape = [d if numpy.random.random() >= p_none_dimension else None for d in numpy_input.shape]
+        symbol = backend.create_symbol(shape=symbol_shape)
+        result_symbol = operation(symbol)
+        backend_result = backend.eval_symbol(result_symbol, [(symbol, numpy_input)])
+    else:
+        backend_result = operation(backend.from_numpy(numpy_input))
+        backend_result = backend.to_numpy(backend_result)
+
+    check_equal(numpy_result, backend_result)
+
+
+def test_ellipsis_ops_imperative():
+    """Checking various patterns against numpy"""
+    x = numpy.arange(2 * 3 * 4 * 5 * 6).reshape([2, 3, 4, 5, 6])
+    for is_symbolic in [True, False]:
+        for backend in collect_test_backends(symbolic=is_symbolic, layers=False):
+            for pattern in identity_patterns + list(itertools.chain(*equivalent_rearrange_patterns)):
+                check_op_against_numpy(
+                    backend, x, pattern, axes_lengths={}, reduction="rearrange", is_symbolic=is_symbolic
+                )
+
+            for reduction in ["min", "max", "sum"]:
+                for pattern in itertools.chain(*equivalent_reduction_patterns):
+                    check_op_against_numpy(
+                        backend, x, pattern, axes_lengths={}, reduction=reduction, is_symbolic=is_symbolic
+                    )
+
+
+def test_rearrange_array_api():
+    import numpy as xp
+    from einops import array_api as AA
+
+    if xp.__version__ < "2.0.0":
+        pytest.skip()
+
+    x = numpy.arange(2 * 3 * 4 * 5 * 6).reshape([2, 3, 4, 5, 6])
+    for pattern in identity_patterns + list(itertools.chain(*equivalent_rearrange_patterns)):
+        expected = rearrange(x, pattern)
+        result = AA.rearrange(xp.from_dlpack(x), pattern)
+        assert numpy.array_equal(AA.asnumpy(result + 0), expected)
+
+
+def test_reduce_array_api():
+    import numpy as xp
+    from einops import array_api as AA
+
+    if xp.__version__ < "2.0.0":
+        pytest.skip()
+
+    x = numpy.arange(2 * 3 * 4 * 5 * 6).reshape([2, 3, 4, 5, 6])
+    for pattern in itertools.chain(*equivalent_reduction_patterns):
+        for reduction in ["min", "max", "sum"]:
+            expected = reduce(x, pattern, reduction=reduction)
+            result = AA.reduce(xp.from_dlpack(x), pattern, reduction=reduction)
+            assert numpy.array_equal(AA.asnumpy(np.asarray(result + 0)), expected)
+
+
+def test_rearrange_consistency_numpy():
+    shape = [1, 2, 3, 5, 7, 11]
+    x = numpy.arange(numpy.prod(shape)).reshape(shape)
+    for pattern in [
+        "a b c d e f -> a b c d e f",
+        "b a c d e f -> a b d e f c",
+        "a b c d e f -> f e d c b a",
+        "a b c d e f -> (f e) d (c b a)",
+        "a b c d e f -> (f e d c b a)",
+    ]:
+        result = rearrange(x, pattern)
+        assert len(numpy.setdiff1d(x, result)) == 0
+        assert result.dtype == x.dtype
+
+    result = rearrange(x, "a b c d e f -> a (b) (c d e) f")
+    assert numpy.array_equal(x.flatten(), result.flatten())
+
+    result = rearrange(x, "a aa aa1 a1a1 aaaa a11 -> a aa aa1 a1a1 aaaa a11")
+    assert numpy.array_equal(x, result)
+
+    result1 = rearrange(x, "a b c d e f -> f e d c b a")
+    result2 = rearrange(x, "f e d c b a -> a b c d e f")
+    assert numpy.array_equal(result1, result2)
+
+    result = rearrange(rearrange(x, "a b c d e f -> (f d) c (e b) a"), "(f d) c (e b) a -> a b c d e f", b=2, d=5)
+    assert numpy.array_equal(x, result)
+
+    sizes = dict(zip("abcdef", shape))
+    temp = rearrange(x, "a b c d e f -> (f d) c (e b) a", **sizes)
+    result = rearrange(temp, "(f d) c (e b) a -> a b c d e f", **sizes)
+    assert numpy.array_equal(x, result)
+
+    x2 = numpy.arange(2 * 3 * 4).reshape([2, 3, 4])
+    result = rearrange(x2, "a b c -> b c a")
+    assert x2[1, 2, 3] == result[2, 3, 1]
+    assert x2[0, 1, 2] == result[1, 2, 0]
+
+
+def test_rearrange_permutations_numpy():
+    # tests random permutation of axes against two independent numpy ways
+    for n_axes in range(1, 10):
+        input = numpy.arange(2**n_axes).reshape([2] * n_axes)
+        permutation = numpy.random.permutation(n_axes)
+        left_expression = " ".join("i" + str(axis) for axis in range(n_axes))
+        right_expression = " ".join("i" + str(axis) for axis in permutation)
+        expression = left_expression + " -> " + right_expression
+        result = rearrange(input, expression)
+
+        for pick in numpy.random.randint(0, 2, [10, n_axes]):
+            assert input[tuple(pick)] == result[tuple(pick[permutation])]
+
+    for n_axes in range(1, 10):
+        input = numpy.arange(2**n_axes).reshape([2] * n_axes)
+        permutation = numpy.random.permutation(n_axes)
+        left_expression = " ".join("i" + str(axis) for axis in range(n_axes)[::-1])
+        right_expression = " ".join("i" + str(axis) for axis in permutation[::-1])
+        expression = left_expression + " -> " + right_expression
+        result = rearrange(input, expression)
+        assert result.shape == input.shape
+        expected_result = numpy.zeros_like(input)
+        for original_axis, result_axis in enumerate(permutation):
+            expected_result |= ((input >> original_axis) & 1) << result_axis
+
+        assert numpy.array_equal(result, expected_result)
+
+
+def test_reduction_imperatives():
+    for backend in imp_op_backends:
+        print("Reduction tests for ", backend.framework_name)
+        for reduction in REDUCTIONS:
+            # slight redundancy for simpler order - numpy version is evaluated multiple times
+            input = numpy.arange(2 * 3 * 4 * 5 * 6, dtype="int64").reshape([2, 3, 4, 5, 6])
+            if reduction in ["mean", "prod"]:
+                input = input / input.astype("float64").mean()
+            test_cases = [
+                ["a b c d e -> ", {}, getattr(input, reduction)()],
+                ["a ... -> ", {}, getattr(input, reduction)()],
+                ["(a1 a2) ... (e1 e2) -> ", dict(a1=1, e2=2), getattr(input, reduction)()],
+                [
+                    "a b c d e -> (e c) a",
+                    {},
+                    getattr(input, reduction)(axis=(1, 3)).transpose(2, 1, 0).reshape([-1, 2]),
+                ],
+                [
+                    "a ... c d e -> (e c) a",
+                    {},
+                    getattr(input, reduction)(axis=(1, 3)).transpose(2, 1, 0).reshape([-1, 2]),
+                ],
+                [
+                    "a b c d e ... -> (e c) a",
+                    {},
+                    getattr(input, reduction)(axis=(1, 3)).transpose(2, 1, 0).reshape([-1, 2]),
+                ],
+                ["a b c d e -> (e c a)", {}, getattr(input, reduction)(axis=(1, 3)).transpose(2, 1, 0).reshape([-1])],
+                ["(a a2) ... -> (a2 a) ...", dict(a2=1), input],
+            ]
+            for pattern, axes_lengths, expected_result in test_cases:
+                result = reduce(backend.from_numpy(input.copy()), pattern, reduction=reduction, **axes_lengths)
+                result = backend.to_numpy(result)
+                assert numpy.allclose(result, expected_result), f"Failed at {pattern}"
+
+
+def test_reduction_symbolic():
+    for backend in sym_op_backends:
+        print("Reduction tests for ", backend.framework_name)
+        for reduction in REDUCTIONS:
+            input = numpy.arange(2 * 3 * 4 * 5 * 6, dtype="int64").reshape([2, 3, 4, 5, 6])
+            input = input / input.astype("float64").mean()
+            # slight redundancy for simpler order - numpy version is evaluated multiple times
+            test_cases = [
+                ["a b c d e -> ", {}, getattr(input, reduction)()],
+                ["a ... -> ", {}, getattr(input, reduction)()],
+                ["(a a2) ... (e e2) -> ", dict(a2=1, e2=1), getattr(input, reduction)()],
+                [
+                    "a b c d e -> (e c) a",
+                    {},
+                    getattr(input, reduction)(axis=(1, 3)).transpose(2, 1, 0).reshape([-1, 2]),
+                ],
+                [
+                    "a ... c d e -> (e c) a",
+                    {},
+                    getattr(input, reduction)(axis=(1, 3)).transpose(2, 1, 0).reshape([-1, 2]),
+                ],
+                [
+                    "a b c d e ... -> (e c) a",
+                    {},
+                    getattr(input, reduction)(axis=(1, 3)).transpose(2, 1, 0).reshape([-1, 2]),
+                ],
+                ["a b c d e -> (e c a)", {}, getattr(input, reduction)(axis=(1, 3)).transpose(2, 1, 0).reshape([-1])],
+                ["(a a2) ... -> (a2 a) ...", dict(a2=1), input],
+            ]
+            for pattern, axes_lengths, expected_numpy_result in test_cases:
+                shapes = [input.shape, [None for _ in input.shape]]
+                for shape in shapes:
+                    sym = backend.create_symbol(shape)
+                    result_sym = reduce(sym, pattern, reduction=reduction, **axes_lengths)
+                    result = backend.eval_symbol(result_sym, [(sym, input)])
+                    assert numpy.allclose(result, expected_numpy_result)
+
+                if True:
+                    shape = []
+                    _axes_lengths = {**axes_lengths}
+                    for axis, length in zip("abcde", input.shape):
+                        # filling as much as possible with Nones
+                        if axis in pattern:
+                            shape.append(None)
+                            _axes_lengths[axis] = length
+                        else:
+                            shape.append(length)
+                    sym = backend.create_symbol(shape)
+                    result_sym = reduce(sym, pattern, reduction=reduction, **_axes_lengths)
+                    result = backend.eval_symbol(result_sym, [(sym, input)])
+                    assert numpy.allclose(result, expected_numpy_result)
+
+
+def test_reduction_stress_imperatives():
+    for backend in imp_op_backends:
+        print("Stress-testing reduction for ", backend.framework_name)
+        for reduction in REDUCTIONS + ("rearrange",):
+            dtype = "int64"
+            coincide = numpy.array_equal
+            if reduction in ["mean", "prod"]:
+                dtype = "float64"
+                coincide = numpy.allclose
+            max_dim = 11
+            if "oneflow" in backend.framework_name:
+                max_dim = 7
+            if "paddle" in backend.framework_name:
+                max_dim = 9
+            for n_axes in range(max_dim):
+                shape = numpy.random.randint(2, 4, size=n_axes)
+                permutation = numpy.random.permutation(n_axes)
+                skipped = 0 if reduction == "rearrange" else numpy.random.randint(n_axes + 1)
+                left = " ".join("x" + str(i) for i in range(n_axes))
+                right = " ".join("x" + str(i) for i in permutation[skipped:])
+                pattern = left + "->" + right
+                x = numpy.arange(1, 1 + numpy.prod(shape), dtype=dtype).reshape(shape)
+                if reduction == "prod":
+                    x /= x.mean()  # to avoid overflows
+                result1 = reduce(x, pattern, reduction=reduction)
+                result2 = x.transpose(permutation)
+                if skipped > 0:
+                    result2 = getattr(result2, reduction)(axis=tuple(range(skipped)))
+                assert coincide(result1, result2)
+                check_op_against_numpy(backend, x, pattern, reduction=reduction, axes_lengths={}, is_symbolic=False)
+
+
+def test_reduction_with_callable_imperatives():
+    x_numpy = numpy.arange(2 * 3 * 4 * 5 * 6).reshape([2, 3, 4, 5, 6]).astype("float32")
+    x_numpy /= x_numpy.max()
+
+    def logsumexp_torch(x, tuple_of_axes):
+        return x.logsumexp(tuple_of_axes)
+
+    def logsumexp_tf(x, tuple_of_axes):
+        import tensorflow as tf
+
+        return tf.reduce_logsumexp(x, tuple_of_axes)
+
+    def logsumexp_keras(x, tuple_of_axes):
+        import tensorflow.keras.backend as k
+
+        return k.logsumexp(x, tuple_of_axes)
+
+    def logsumexp_numpy(x, tuple_of_axes):
+        # very naive logsumexp to compare to
+        minused = x.max(tuple_of_axes)
+        y = x - x.max(tuple_of_axes, keepdims=True)
+        y = numpy.exp(y)
+        y = numpy.sum(y, axis=tuple_of_axes)
+        return numpy.log(y) + minused
+
+    from einops._backends import TorchBackend, TensorflowBackend, TFKerasBackend, NumpyBackend
+
+    backend2callback = {
+        TorchBackend.framework_name: logsumexp_torch,
+        TensorflowBackend.framework_name: logsumexp_tf,
+        TFKerasBackend.framework_name: logsumexp_keras,
+        NumpyBackend.framework_name: logsumexp_numpy,
+    }
+
+    for backend in imp_op_backends:
+        if backend.framework_name not in backend2callback:
+            continue
+
+        backend_callback = backend2callback[backend.framework_name]
+
+        x_backend = backend.from_numpy(x_numpy)
+        for pattern1, pattern2 in equivalent_reduction_patterns:
+            print("Test reduction with callable for ", backend.framework_name, pattern1, pattern2)
+            output_numpy = reduce(x_numpy, pattern1, reduction=logsumexp_numpy)
+            output_backend = reduce(x_backend, pattern1, reduction=backend_callback)
+            assert numpy.allclose(
+                output_numpy,
+                backend.to_numpy(output_backend),
+            )
+
+
+def test_enumerating_directions():
+    for backend in imp_op_backends:
+        print("testing directions for", backend.framework_name)
+        for shape in [[], [1], [1, 1, 1], [2, 3, 5, 7]]:
+            x = numpy.arange(numpy.prod(shape)).reshape(shape)
+            axes1 = _enumerate_directions(x)
+            axes2 = _enumerate_directions(backend.from_numpy(x))
+            assert len(axes1) == len(axes2) == len(shape)
+            for ax1, ax2 in zip(axes1, axes2):
+                ax2 = backend.to_numpy(ax2)
+                assert ax1.shape == ax2.shape
+                assert numpy.allclose(ax1, ax2)
+
+
+def test_concatenations_and_stacking():
+    for backend in imp_op_backends:
+        print("testing shapes for ", backend.framework_name)
+        for n_arrays in [1, 2, 5]:
+            shapes = [[], [1], [1, 1], [2, 3, 5, 7], [1] * 6]
+            for shape in shapes:
+                arrays1 = [numpy.arange(i, i + numpy.prod(shape)).reshape(shape) for i in range(n_arrays)]
+                arrays2 = [backend.from_numpy(array) for array in arrays1]
+                result0 = numpy.asarray(arrays1)
+                result1 = rearrange(arrays1, "...->...")
+                result2 = rearrange(arrays2, "...->...")
+                assert numpy.array_equal(result0, result1)
+                assert numpy.array_equal(result1, backend.to_numpy(result2))
+
+                result1 = rearrange(arrays1, "b ... -> ... b")
+                result2 = rearrange(arrays2, "b ... -> ... b")
+                assert numpy.array_equal(result1, backend.to_numpy(result2))
+
+
+def test_gradients_imperatives():
+    # lazy - just checking reductions
+    for reduction in REDUCTIONS:
+        if reduction in ("any", "all"):
+            continue  # non-differentiable ops
+        x = numpy.arange(1, 1 + 2 * 3 * 4).reshape([2, 3, 4]).astype("float32")
+        results = {}
+        for backend in imp_op_backends:
+            y0 = backend.from_numpy(x)
+            if not hasattr(y0, "grad"):
+                continue
+
+            y1 = reduce(y0, "a b c -> c a", reduction=reduction)
+            y2 = reduce(y1, "c a -> a c", reduction=reduction)
+            y3 = reduce(y2, "a (c1 c2) -> a", reduction=reduction, c1=2)
+            y4 = reduce(y3, "... -> ", reduction=reduction)
+
+            y4.backward()
+            grad = backend.to_numpy(y0.grad)
+            results[backend.framework_name] = grad
+
+        print("comparing gradients for", results.keys())
+        for name1, grad1 in results.items():
+            for name2, grad2 in results.items():
+                assert numpy.allclose(grad1, grad2), [name1, name2, "provided different gradients"]
+
+
+def test_tiling_imperatives():
+    for backend in imp_op_backends:
+        print("Tiling tests for ", backend.framework_name)
+        input = numpy.arange(2 * 3 * 5, dtype="int64").reshape([2, 1, 3, 1, 5])
+        test_cases = [
+            (1, 1, 1, 1, 1),
+            (1, 2, 1, 3, 1),
+            (3, 1, 1, 4, 1),
+        ]
+        for repeats in test_cases:
+            expected = numpy.tile(input, repeats)
+            converted = backend.from_numpy(input)
+            repeated = backend.tile(converted, repeats)
+            result = backend.to_numpy(repeated)
+            assert numpy.array_equal(result, expected)
+
+
+def test_tiling_symbolic():
+    for backend in sym_op_backends:
+        print("Tiling tests for ", backend.framework_name)
+        input = numpy.arange(2 * 3 * 5, dtype="int64").reshape([2, 1, 3, 1, 5])
+        test_cases = [
+            (1, 1, 1, 1, 1),
+            (1, 2, 1, 3, 1),
+            (3, 1, 1, 4, 1),
+        ]
+        for repeats in test_cases:
+            expected = numpy.tile(input, repeats)
+            sym = backend.create_symbol(input.shape)
+            result = backend.eval_symbol(backend.tile(sym, repeats), [[sym, input]])
+            assert numpy.array_equal(result, expected)
+
+            sym = backend.create_symbol([None] * len(input.shape))
+            result = backend.eval_symbol(backend.tile(sym, repeats), [[sym, input]])
+            assert numpy.array_equal(result, expected)
+
+
+# (pattern, axis lengths) pairs for repeat(); used by the repeat tests below
+repeat_test_cases = [
+    # all assume that input has shape [2, 3, 5]
+    ("a b c -> c a b", dict()),
+    ("a b c -> (c copy a b)", dict(copy=2, a=2, b=3, c=5)),
+    ("a b c -> (a copy) b c ", dict(copy=1)),
+    ("a b c -> (c a) (copy1 b copy2)", dict(a=2, copy1=1, copy2=2)),
+    ("a ...  -> a ... copy", dict(copy=4)),
+    ("... c -> ... (copy1 c copy2)", dict(copy1=1, copy2=2)),
+    ("...  -> ... ", dict()),
+    (" ...  -> copy1 ... copy2 ", dict(copy1=2, copy2=3)),
+    ("a b c  -> copy1 a copy2 b c () ", dict(copy1=2, copy2=1)),
+]
+
+
+def check_reversion(x, repeat_pattern, **sizes):
+    """Checks repeat pattern by running reduction"""
+    left, right = repeat_pattern.split("->")
+    reduce_pattern = right + "->" + left
+    repeated = repeat(x, repeat_pattern, **sizes)
+    reduced_min = reduce(repeated, reduce_pattern, reduction="min", **sizes)
+    reduced_max = reduce(repeated, reduce_pattern, reduction="max", **sizes)
+    assert numpy.array_equal(x, reduced_min)
+    assert numpy.array_equal(x, reduced_max)
+
+
+def test_repeat_numpy():
+    # check repeat vs reduce. Repeat works ok if reverse reduction with min and max work well
+    x = numpy.arange(2 * 3 * 5).reshape([2, 3, 5])
+    x1 = repeat(x, "a b c -> copy a b c ", copy=1)
+    assert numpy.array_equal(x[None], x1)
+    for pattern, axis_dimensions in repeat_test_cases:
+        check_reversion(x, pattern, **axis_dimensions)
+
+
+def test_repeat_imperatives():
+    x = numpy.arange(2 * 3 * 5).reshape([2, 3, 5])
+    for backend in imp_op_backends:
+        print("Repeat tests for ", backend.framework_name)
+
+        for pattern, axis_dimensions in repeat_test_cases:
+            expected = repeat(x, pattern, **axis_dimensions)
+            converted = backend.from_numpy(x)
+            repeated = repeat(converted, pattern, **axis_dimensions)
+            result = backend.to_numpy(repeated)
+            assert numpy.array_equal(result, expected)
+
+
+def test_repeat_symbolic():
+    x = numpy.arange(2 * 3 * 5).reshape([2, 3, 5])
+
+    for backend in sym_op_backends:
+        print("Repeat tests for ", backend.framework_name)
+
+        for pattern, axis_dimensions in repeat_test_cases:
+            expected = repeat(x, pattern, **axis_dimensions)
+
+            sym = backend.create_symbol(x.shape)
+            result = backend.eval_symbol(repeat(sym, pattern, **axis_dimensions), [[sym, x]])
+            assert numpy.array_equal(result, expected)
+
+
+def test_repeat_array_api():
+    import numpy as xp
+    from einops import array_api as AA
+
+    if xp.__version__ < "2.0.0":
+        pytest.skip()
+
+    x = numpy.arange(2 * 3 * 5).reshape([2, 3, 5])
+
+    for pattern, axis_dimensions in repeat_test_cases:
+        expected = repeat(x, pattern, **axis_dimensions)
+
+        result = AA.repeat(xp.from_dlpack(x), pattern, **axis_dimensions)
+        assert numpy.array_equal(AA.asnumpy(result + 0), expected)
+
+
+# (pattern, axis lengths) pairs for repeat() in which some axes are given as numbers or () instead of names
+test_cases_repeat_anonymous = [
+    # all assume that input has shape [1, 2, 4, 6]
+    ("a b c d -> c a d b", dict()),
+    ("a b c d -> (c 2 d a b)", dict(a=1, c=4, d=6)),
+    ("1 b c d -> (d copy 1) 3 b c ", dict(copy=3)),
+    ("1 ...  -> 3 ... ", dict()),
+    ("() ... d -> 1 (copy1 d copy2) ... ", dict(copy1=2, copy2=3)),
+    ("1 b c d -> (1 1) (1 b) 2 c 3 d (1 1)", dict()),
+]
+
+
+def test_anonymous_axes():
+    x = numpy.arange(1 * 2 * 4 * 6).reshape([1, 2, 4, 6])
+    for pattern, axis_dimensions in test_cases_repeat_anonymous:
+        check_reversion(x, pattern, **axis_dimensions)
+
+
+def test_list_inputs():
+    x = numpy.arange(2 * 3 * 4 * 5 * 6).reshape([2, 3, 4, 5, 6])
+
+    assert numpy.array_equal(
+        rearrange(list(x), "... -> (...)"),
+        rearrange(x, "... -> (...)"),
+    )
+    assert numpy.array_equal(
+        reduce(list(x), "a ... e -> (...)", "min"),
+        reduce(x, "a ... e -> (...)", "min"),
+    )
+    assert numpy.array_equal(
+        repeat(list(x), "...  -> b (...)", b=3),
+        repeat(x, "...  -> b (...)", b=3),
+    )
+
+
+def test_torch_compile_with_dynamic_shape():
+    """
+    Check that a function using rearrange and reduce gives the same result when compiled with
+    torch.compile(dynamic=True, fullgraph=True, backend="aot_eager") as when called directly.
+    Skipped when the torch backend is not tested.
+    """
+    if not is_backend_tested("torch"):
+        pytest.skip()
+    import torch
+
+    # somewhat reasonable debug messages
+    # NOTE: this changes a torch-wide config value and is not restored after the test
+    torch._dynamo.config.verbose = True
+
+    def func1(x):
+        # test contains ellipsis
+        a, b, c, *other = x.shape  # only c is used below
+        x = rearrange(x, "(a a2) b c ... -> b (c a2) (a ...)", a2=2)
+        # test contains passing expression as axis length
+        x = reduce(x, "b ca2 A -> b A", "sum", ca2=c * 2)
+        return x
+
+    # seems can't test static and dynamic in the same test run.
+    # func1_compiled_static = torch.compile(func1, dynamic=False, fullgraph=True, backend='aot_eager')
+    func1_compiled_dynamic = torch.compile(func1, dynamic=True, fullgraph=True, backend="aot_eager")
+
+    x = torch.randn(size=[4, 5, 6, 3])
+    assert torch.equal(func1_compiled_dynamic(x), func1(x))
+    # check with input of different dimensionality, and with all shape elements changed
+    x = torch.randn(size=[6, 3, 4, 2, 3])
+    assert torch.equal(func1_compiled_dynamic(x), func1(x))
+
+
+def bit_count(x):
+    return sum((x >> i) & 1 for i in range(20))
+
+
+def test_reduction_imperatives_booleans():
+    """Checks that any/all reduction works in all frameworks"""
+    x_np = numpy.asarray([(bit_count(x) % 2) == 0 for x in range(2**6)]).reshape([2] * 6)
+    for backend in imp_op_backends:
+        print("Reduction any/all tests for ", backend.framework_name)
+
+        for axis in range(6):
+            expected_result_any = numpy.any(x_np, axis=axis, keepdims=True)
+            expected_result_all = numpy.all(x_np, axis=axis, keepdims=True)
+            assert not numpy.array_equal(expected_result_any, expected_result_all)
+
+            axes = list("abcdef")
+            axes_in = list(axes)
+            axes_out = list(axes)
+            axes_out[axis] = "1"
+            pattern = (" ".join(axes_in)) + " -> " + (" ".join(axes_out))
+
+            res_any = reduce(backend.from_numpy(x_np), pattern, reduction="any")
+            res_all = reduce(backend.from_numpy(x_np), pattern, reduction="all")
+
+            assert numpy.array_equal(expected_result_any, backend.to_numpy(res_any))
+            assert numpy.array_equal(expected_result_all, backend.to_numpy(res_all))
+
+        # expected result: any/all
+        expected_result_any = numpy.any(x_np, axis=(0, 1), keepdims=True)
+        expected_result_all = numpy.all(x_np, axis=(0, 1), keepdims=True)
+        pattern = "a b ... -> 1 1 ..."
+        res_any = reduce(backend.from_numpy(x_np), pattern, reduction="any")
+        res_all = reduce(backend.from_numpy(x_np), pattern, reduction="all")
+        assert numpy.array_equal(expected_result_any, backend.to_numpy(res_any))
+        assert numpy.array_equal(expected_result_all, backend.to_numpy(res_all))
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/test_other.py b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/test_other.py
new file mode 100644
index 0000000000000000000000000000000000000000..48c4b9b36b0d112fa2cc855384d6e5e98d47babc
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/test_other.py
@@ -0,0 +1,291 @@
+from doctest import testmod
+
+import numpy
+import pytest
+
+import einops
+import einops.layers
+import einops.parsing
+from einops._backends import AbstractBackend
+from einops.einops import rearrange, parse_shape, _optimize_transformation
+from einops.tests import collect_test_backends, is_backend_tested
+
+__author__ = "Alex Rogozhnikov"
+
+
+def test_doctests_examples():
+    """Run the doctests embedded in einops.layers and einops.einops, exposing numpy as ``np``."""
+    for module in (einops.layers, einops.einops):
+        testmod(module, raise_on_error=True, extraglobs={"np": numpy})
+
+
+def test_backends_installed():
+    """
+    Fail when a backend that was requested for testing cannot be instantiated.
+
+    The other tests only exercise the backends that could be set up.
+    """
+    from . import parse_backends_to_test
+
+    requested = parse_backends_to_test()
+    failures = []
+    for candidate in AbstractBackend.__subclasses__():
+        if candidate.framework_name in requested:
+            try:
+                # instantiate; any exception is recorded instead of aborting the loop
+                candidate()
+            except Exception as exc:
+                failures.append((candidate.framework_name, exc))
+    assert len(failures) == 0, failures
+
+
+def test_optimize_transformations_numpy():
+    """Check _optimize_transformation keeps the reshape/sum/transpose result and is idempotent."""
+    print("Testing optimizations")
+    shapes = [[2] * rank for rank in range(14)] + [[3] * rank for rank in range(6)]
+    shapes.extend([[2, 3, 5, 7], [2, 3, 5, 7, 11, 17]])
+
+    def apply(flat, shape_, reduced_, order_):
+        # reshape -> sum over the reduced axes -> permute the remaining axes -> flatten
+        return flat.reshape(shape_).sum(axis=reduced_).transpose(order_).reshape([-1])
+
+    for shape in shapes:
+        rank = len(shape)
+        for _ in range(5):
+            flat = numpy.random.randint(0, 2**12, size=shape).reshape([-1])
+            init_shape = shape[:]
+            n_reduced = numpy.random.randint(0, rank + 1)
+            reduced = tuple(numpy.random.permutation(rank)[:n_reduced])
+            order = numpy.random.permutation(rank - n_reduced)
+            target = numpy.random.randint(0, 1024, size=333)  # arbitrary values; must come back unchanged
+
+            optimized = _optimize_transformation(init_shape, reduced, order, target)
+            init_shape2, reduced2, order2, target2 = optimized
+
+            assert numpy.array_equal(target, target2)
+            assert numpy.array_equal(apply(flat, init_shape, reduced, order), apply(flat, init_shape2, reduced2, order2))
+
+            # a second pass must leave the already-optimized arguments untouched
+            reoptimized = _optimize_transformation(*optimized)
+            for before, after in zip(optimized, reoptimized):
+                assert numpy.array_equal(before, after)
+
+
+_IMPERATIVE_BACKENDS = collect_test_backends(symbolic=False, layers=False)  # iterated by the parse_shape tests below
+
+x_np = numpy.zeros([10, 20, 30, 40])  # shared 4-d input; the tests assert on its axis lengths 10/20/30/40
+
+
+def test_parse_shape_imperative():
+    """Named axes parse to the same lengths for the numpy array and for each backend tensor."""
+    for backend in _IMPERATIVE_BACKENDS:
+        print("Shape parsing for ", backend.framework_name)
+        parsed = parse_shape(x_np, "a b c d"), parse_shape(backend.from_numpy(x_np), "a b c d")
+        assert parsed[0] == parsed[1] == dict(a=10, b=20, c=30, d=40)
+        assert parsed[0] != dict(a=1, b=20, c=30, d=40) != parsed[1]
+
+
+def test_underscore():
+    """A pattern made only of ``_`` placeholders names no axes, so the parsed dict is empty."""
+    for backend in _IMPERATIVE_BACKENDS:
+        parsed = parse_shape(x_np, "_ _ _ _"), parse_shape(backend.from_numpy(x_np), "_ _ _ _")
+        assert parsed[0] == parsed[1] == {}
+
+
+def test_underscore_one():
+    """Only the single named axis is reported; the ``_`` positions are left out."""
+    pattern, expected = "_ _ _ hello", {"hello": 40}
+    for backend in _IMPERATIVE_BACKENDS:
+        assert parse_shape(x_np, pattern) == parse_shape(backend.from_numpy(x_np), pattern) == expected
+
+
+def test_underscore_several():
+    """Names mixing letters and digits are reported next to left-out ``_`` positions."""
+    pattern = "_ _ a1 a1a111a"
+    for backend in _IMPERATIVE_BACKENDS:
+        assert parse_shape(x_np, pattern) == parse_shape(backend.from_numpy(x_np), pattern) == dict(a1=30, a1a111a=40)
+
+
+def test_repeating():
+    """A pattern that repeats an axis name is rejected for numpy input and for every backend."""
+    duplicated = "a a b b"
+    with pytest.raises(einops.EinopsError):
+        parse_shape(x_np, duplicated)
+    for backend in _IMPERATIVE_BACKENDS:
+        with pytest.raises(einops.EinopsError):
+            parse_shape(backend.from_numpy(x_np), duplicated)
+
+
+def test_ellipsis():
+    """``...`` may stand for any number of axes; only the explicitly named axes are reported."""
+    table = [
+        ([10, 20], "...", dict()),
+        ([10], "... a", dict(a=10)),
+        ([10, 20], "... a", dict(a=20)),
+        ([10, 20, 30], "... a", dict(a=30)),
+        ([10, 20, 30, 40], "... a", dict(a=40)),
+        ([10], "a ...", dict(a=10)),
+        ([10, 20], "a ...", dict(a=10)),
+        ([10, 20, 30], "a ...", dict(a=10)),
+        ([10, 20, 30, 40], "a ...", dict(a=10)),
+        ([10, 20, 30, 40], " a ... b", dict(a=10, b=40)),
+        ([10, 40], " a ... b", dict(a=10, b=40)),
+    ]
+    for backend in _IMPERATIVE_BACKENDS:
+        for shape, pattern, expected in table:
+            ones = numpy.ones(shape)
+            assert parse_shape(ones, pattern) == parse_shape(backend.from_numpy(ones), pattern) == expected
+
+
+def test_parse_with_anonymous_axes():
+    """Numeric and ``()`` entries match axes without naming them, so only ``a`` is reported."""
+    table = [
+        ([1, 2, 3, 4], "1 2 3 a", dict(a=4)),
+        ([10, 1, 2], "a 1 2", dict(a=10)),
+        ([10, 1, 2], "a () 2", dict(a=10)),
+    ]
+    for backend in _IMPERATIVE_BACKENDS:
+        for shape, pattern, expected in table:
+            ones = numpy.ones(shape)
+            assert parse_shape(ones, pattern) == parse_shape(backend.from_numpy(ones), pattern) == expected
+
+
+def test_failures():
+    """Every shape/pattern pair below is inconsistent, so parse_shape must raise RuntimeError."""
+    bad_cases = [
+        ([1, 2, 3, 4], "a b c"),
+        ([1, 2, 3, 4], "2 a b c"),
+        ([1, 2, 3, 4], "a b c ()"),
+        ([1, 2, 3, 4], "a b c d e"),
+        ([1, 2, 3, 4], "a b c d e ..."),
+    ]
+    for backend in _IMPERATIVE_BACKENDS:
+        for shape, pattern in bad_cases:
+            tensor = backend.from_numpy(numpy.ones(shape))  # built outside raises: only parse_shape may fail
+            with pytest.raises(RuntimeError):
+                parse_shape(tensor, pattern)
+
+
+_SYMBOLIC_BACKENDS = [
+    *collect_test_backends(symbolic=True, layers=False),
+    *collect_test_backends(symbolic=True, layers=True),
+]
+
+# tensorflow.keras needs a special way to compile:
+# shape vars can be used only inside layers but not as outputs, so that backend is filtered out here
+_SYMBOLIC_BACKENDS = [backend for backend in _SYMBOLIC_BACKENDS if backend.framework_name != "tensorflow.keras"]
+
+
+@pytest.mark.parametrize("backend", _SYMBOLIC_BACKENDS)
+def test_parse_shape_symbolic(backend):
+    """parse_shape output (ints or symbols) can be evaluated and passed on to rearrange."""
+    for shape in [[10, 20, 30, 40], [10, 20, None, None], [None, None, None, None]]:
+        print(
+            f"special shape parsing {backend.framework_name=} {shape=}",
+        )
+        input_symbol = backend.create_symbol(shape)
+
+        def evaluate(symbol):
+            # every symbol is evaluated with a concrete zeros array of shape 10x20x30x40
+            return backend.eval_symbol(symbol, [(input_symbol, numpy.zeros([10, 20, 30, 40]))])
+
+        evaluated = {}
+        for name, length in parse_shape(input_symbol, "a b c d").items():
+            # lengths that already are ints are kept; anything else is evaluated as a symbol
+            evaluated[name] = length if isinstance(length, int) else evaluate(length)
+        print(evaluated)
+
+        known_axes = parse_shape(input_symbol, "a b c1 _")
+        result = evaluate(
+            rearrange(input_symbol, "a b (c1 c2) (d1 d2) -> (a b d1) c1 (c2 d2)", **known_axes, d2=2)
+        )
+        print(result.shape)
+        # c1 takes the whole third axis (30), so c2 == 1; d2=2 splits the last axis (40)
+        # into d1 == 20
+        assert result.shape == (10 * 20 * 20, 30, 1 * 2)
+        assert numpy.allclose(result, 0)
+
+
+@pytest.mark.parametrize("backend", _SYMBOLIC_BACKENDS)
+def test_parse_shape_symbolic_ellipsis(backend):
+    """Ellipsis patterns on fully dynamic symbols evaluate to the lengths of the fed array."""
+    for static_shape, shape, pattern, expected in [
+        ([10, 20], [None, None], "...", dict()),
+        ([10], [None], "... a", dict(a=10)),
+        ([10, 20], [None, None], "... a", dict(a=20)),
+        ([10, 20, 30], [None, None, None], "... a", dict(a=30)),
+        ([10, 20, 30, 40], [None, None, None, None], "... a", dict(a=40)),
+        ([10], [None], "a ...", dict(a=10)),
+        ([10, 20], [None, None], "a ...", dict(a=10)),
+        ([10, 20, 30], [None, None, None], "a ...", dict(a=10)),
+        ([10, 20, 30, 40], [None, None, None, None], "a ...", dict(a=10)),
+        ([10, 20, 30, 40], [None, None, None, None], " a ... b", dict(a=10, b=40)),
+        ([10, 40], [None, None], " a ... b ", dict(a=10, b=40)),
+    ]:
+        input_symbol = backend.create_symbol(shape)
+        feed = [(input_symbol, numpy.zeros(static_shape))]
+        # ints come back as they are; anything else is a symbol evaluated on the fed array
+        out_shape = {
+            name: size if isinstance(size, int) else backend.eval_symbol(size, feed)
+            for name, size in parse_shape(input_symbol, pattern).items()
+        }
+        assert out_shape == expected
+
+
+def test_is_float_type():
+    """is_float_type must be True for the float dtypes and False for the int dtypes tried here."""
+    backends = collect_test_backends(symbolic=False, layers=False)
+    backends += collect_test_backends(symbolic=False, layers=True)
+    for backend in backends:
+        for dtype in ["int32", "int64", "float32", "float64"]:
+            expect_float = "float" in dtype
+            tensor = backend.from_numpy(numpy.zeros([3, 4, 5], dtype=dtype))
+            assert backend.is_float_type(tensor) == expect_float, (dtype, backend, tensor.dtype)
+
+
+def test_torch_compile():
+    """
+    Ensure that allow_ops_in_compiled_graph lets torch compile the module as a single graph.
+    Additionally ensure that the compilation cache works properly afterwards
+     (by changing shapes and patterns between calls).
+    Also check that pack/unpack can still be handled
+     despite their variable number of inputs/outputs.
+    """
+    if not is_backend_tested("torch"):
+        pytest.skip()
+    import torch
+    from torch import nn
+    from einops import repeat, reduce, pack, unpack, einsum
+    from einops._torch_specific import allow_ops_in_compiled_graph
+
+    allow_ops_in_compiled_graph()
+
+    class TorchModuleWithOperations(nn.Module):
+        def __init__(self) -> None:
+            super().__init__()
+
+        def forward(self, x_abc, suffix=""):
+            a, b, c = x_abc.shape
+
+            def suf(pattern):
+                parts = pattern.split()
+                return " ".join([p if p[-1] not in "acd" else p + suffix for p in parts])
+
+            # the patterns look a bit strange because every token ending in a, c or d
+            # gets `suffix` appended by the suf function on each run
+            x_abcd = repeat(x_abc, suf("a b c -> a b c 4"))
+            x_abc = reduce(x_abcd, suf("a b c d -> a b c"), "min")
+            x_abdc, ps = pack([x_abc] * (2 + len(suffix)), suf("a b * c"))  # input count varies with the suffix length
+            x_array = unpack(rearrange(x_abdc, suf("a b d c -> (a b ) 1 c d")), ps, "ab one1 c *")
+            x1 = x_array[0] + len(x_array)
+            x1 = rearrange(x1, suf("(a b ) 1 c -> a b c"), b=b)
+            addition = einsum(x_abc, x_abcd, suf("a b c , a b c d -> d"))[0]
+            return x1 + addition
+
+    original = TorchModuleWithOperations()
+    compiled = torch.compile(original, fullgraph=True, backend="aot_eager")
+    for size in [10, 20, 40]:
+        x = torch.rand([size, size + 1, size + 2])
+        for suffix in ["", "suf1", "other_suffix"]:
+            result1 = compiled(x, suffix)
+            result2 = original(x, suffix)  # eager reference for the compiled result
+            assert torch.allclose(result1, result2)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/test_packing.py b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/test_packing.py
new file mode 100644
index 0000000000000000000000000000000000000000..62a449c881294f963b3f234ca8860ed742a76c14
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/test_packing.py
@@ -0,0 +1,309 @@
+import dataclasses
+import typing
+
+import numpy as np
+import pytest
+
+from einops import EinopsError, asnumpy, pack, unpack
+from einops.tests import collect_test_backends
+
+
+def pack_unpack(xs, pattern):
+    """Pack ``xs`` with ``pattern``, unpack the packed result, and check the round trip."""
+    x, ps = pack(xs, pattern)
+    unpacked = unpack(x, ps, pattern)  # unpack the packed tensor, not the list of inputs
+    assert len(unpacked) == len(xs)
+    assert all(np.allclose(asnumpy(a), asnumpy(b)) for a, b in zip(unpacked, xs))
+
+
+def unpack_and_pack(x, ps, pattern: str):
+    """Unpack ``x`` by ``ps``, confirm that re-packing restores ``x``, and return the parts."""
+    parts = unpack(x, ps, pattern)
+    repacked, _ = pack(parts, pattern=pattern)
+    assert np.allclose(asnumpy(repacked), asnumpy(x))
+    return parts
+
+
+def unpack_and_pack_against_numpy(x, ps, pattern: str):
+    """Run unpack+pack on ``x`` and on its numpy copy; both must fail alike or give equal results."""
+    backend_outcome = CaptureException()
+    numpy_outcome = CaptureException()
+
+    with backend_outcome:
+        unpacked = unpack(x, ps, pattern)
+        packed, _ = pack(unpacked, pattern=pattern)
+
+    with numpy_outcome:
+        x_np = asnumpy(x)
+        unpacked_np = unpack(x_np, ps, pattern)
+        packed_np, _ = pack(unpacked_np, pattern=pattern)
+
+    assert type(numpy_outcome.exception) is type(backend_outcome.exception)
+    if numpy_outcome.exception is not None:
+        return  # both failed with the same exception type: no results to compare
+
+    # neither failed: each re-packed tensor restores x and the unpacked parts match pairwise
+    assert np.allclose(asnumpy(packed), asnumpy(x))
+    assert np.allclose(asnumpy(packed_np), asnumpy(x))
+    assert len(unpacked) == len(unpacked_np)
+    for part, part_np in zip(unpacked, unpacked_np):
+        assert np.allclose(asnumpy(part), part_np)
+
+
+class CaptureException:  # context manager: swallows whatever its body raises and keeps it in .exception
+    def __enter__(self):
+        self.exception = None  # reset on entry
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.exception = exc_val  # stays None when the body finished without raising
+        return True  # a true return value suppresses the exception
+
+
+def test_numpy_trivial(H=13, W=17):
+    """
+    Check pack on numpy inputs against explicit np.stack / np.concatenate calls.
+
+    Also checks that unsuitable patterns raise EinopsError while well-formed axis names are accepted.
+    """
+
+    def rand(*shape):
+        return np.random.random(shape)
+
+    def check(expected, actual):
+        # identical dtype, shape and element values
+        assert expected.dtype == actual.dtype
+        assert expected.shape == actual.shape
+        assert np.all(expected == actual)
+
+    r, g, b = rand(3, H, W)
+    embeddings = rand(H, W, 32)
+    channels = [r, g, b]
+
+    # when both existing axes are named, "*" adds a new axis: same as np.stack at that position
+    stacked = [
+        (2, "h w *"),
+        (1, "h * w"),
+        (0, "* h w"),
+    ]
+    for axis, pattern in stacked:
+        check(np.stack(channels, axis=axis), pack(channels, pattern)[0])
+
+    # with one named axis, "*" takes over the other existing axis: same as np.concatenate
+    for axis, pattern in [(1, "h *"), (0, "* w")]:
+        check(np.concatenate(channels, axis=axis), pack(channels, pattern)[0])
+
+    # mixed ranks: the numpy reference gives the 2-d channels a trailing unit axis first
+    i = np.index_exp[:, :, None]
+    check(
+        np.concatenate([r[i], g[i], b[i], embeddings], axis=2),
+        pack([r, g, b, embeddings], "h w *")[0],
+    )
+
+    mixed = [r, g, b, embeddings]
+    rejected = [
+        "h w nonexisting_axis *",
+        "h _w *",  # no leading underscore
+        "h_ w *",  # no trailing underscore
+        "1h_ w *",
+        "1 w *",
+        "h h *",
+    ]
+    for pattern in rejected:
+        with pytest.raises(EinopsError):
+            pack(mixed, pattern)
+
+    # longer well-formed axis names are accepted
+    pack(channels, "some_name_for_H some_name_for_w1 *")
+    # capital and non-capital are different
+    pack(mixed, "h H *")
+
+
+@dataclasses.dataclass
+class UnpackTestCase:
+    shape: typing.Tuple[int, ...]  # shape of the array that gets unpacked
+    pattern: str  # space-separated pattern containing a "*" token
+
+    def dim(self):
+        return self.pattern.split().index("*")  # position of "*" among the pattern's tokens
+
+    def selfcheck(self):
+        assert self.shape[self.dim()] == 5  # the axis at the "*" position must have length 5
+
+
+cases = [
+    # NB: in every case the unpacked ("*") axis has length 5;
+    # the tests below actively rely on that when choosing packed shapes
+    UnpackTestCase((5,), "*"),
+    UnpackTestCase((5, 7), "* seven"),
+    UnpackTestCase((7, 5), "seven *"),
+    UnpackTestCase((5, 3, 4), "* three four"),
+    UnpackTestCase((4, 5, 3), "four * three"),
+    UnpackTestCase((3, 4, 5), "three four *"),
+]
+
+
+def test_pack_unpack_with_numpy():
+    """Unpack/pack round trips on numpy arrays: valid packed shapes succeed, invalid ones raise."""
+    case: UnpackTestCase
+
+    def rejects(operation, *args):
+        with pytest.raises(BaseException):
+            operation(*args)
+
+    for case in cases:
+        pattern = case.pattern
+        x = np.random.random(case.shape)
+
+        # all correct, no minus 1
+        unpack_and_pack(x, [[2], [1], [2]], pattern)
+        # no -1, asking for wrong shapes
+        rejects(unpack_and_pack, x, [[2], [1], [2]], pattern + " non_existent_axis")
+        rejects(unpack_and_pack, x, [[2], [1], [1]], pattern)
+        rejects(unpack_and_pack, x, [[4], [1], [1]], pattern)
+        # all correct, with -1
+        for ps in (
+            [[2], [1], [-1]],
+            [[2], [-1], [2]],
+            [[-1], [1], [2]],
+        ):
+            unpack_and_pack(x, ps, pattern)
+        # here the part described by -1 is left with no elements along the unpacked axis
+        _, _, last = unpack_and_pack(x, [[2], [3], [-1]], pattern)
+        assert last.shape[case.dim()] == 0
+
+        # asking for more elements than available
+        rejects(unpack, x, [[2], [4], [-1]], pattern)
+        # NB: [[2], [-1], [4]] does not raise, because indexing x[2:1] just returns zero elements
+        rejects(unpack, x, [[-1], [1], [5]], pattern)
+
+        # all correct, -1 nested: every part has one axis more than x
+        for ps in (
+            [[1, 2], [1, 1], [-1, 1]],
+            [[1, 2], [1, -1], [1, 1]],
+            [[2, -1], [1, 2], [1, 1]],
+        ):
+            rs = unpack_and_pack(x, ps, pattern)
+            assert all(len(r.shape) == len(x.shape) + 1 for r in rs)
+
+        # asking for more elements, -1 nested
+        rejects(unpack, x, [[-1, 2], [1], [5]], pattern)
+        rejects(unpack, x, [[2, 2], [2], [5, -1]], pattern)
+
+        # asking for non-divisible number of elements
+        for ps in (
+            [[2, 1], [1], [3, -1]],
+            [[2, 1], [3, -1], [1]],
+            [[3, -1], [2, 1], [1]],
+        ):
+            rejects(unpack, x, ps, pattern)
+
+        # -1 takes zero
+        for ps in ([[0], [5], [-1]], [[0], [-1], [5]], [[-1], [5], [0]]):
+            unpack_and_pack(x, ps, pattern)
+
+        # -1 takes zero, -1
+        unpack_and_pack(x, [[2, -1], [1, 5]], pattern)
+
+
+def test_pack_unpack_against_numpy():
+    """Repeat the unpack/pack checks on every imperative backend, comparing each outcome with numpy."""
+    compare = unpack_and_pack_against_numpy
+
+    def rejects(x, ps, pattern):
+        # unpack itself has to raise for this packed-shape spec
+        with pytest.raises(BaseException):
+            unpack(x, ps, pattern)
+
+    for backend in collect_test_backends(symbolic=False, layers=False):
+        print(f"test packing against numpy for {backend.framework_name}")
+        # presumably a switch for backends without zero-length support; always on here
+        check_zero_len = True
+
+        for case in cases:
+            pattern = case.pattern
+            x = backend.from_numpy(np.random.random(case.shape))
+
+            # all correct, no minus 1
+            compare(x, [[2], [1], [2]], pattern)
+            # no -1, asking for wrong shapes
+            rejects(x, [[2], [1], [1]], pattern)
+            rejects(x, [[4], [1], [1]], pattern)
+            # all correct, with -1
+            for ps in (
+                [[2], [1], [-1]],
+                [[2], [-1], [2]],
+                [[-1], [1], [2]],
+            ):
+                compare(x, ps, pattern)
+
+            # asking for more elements than available
+            rejects(x, [[2], [4], [-1]], pattern)
+            # NB: [[2], [-1], [4]] does not raise, because indexing x[2:1] just returns zero elements
+            rejects(x, [[-1], [1], [5]], pattern)
+
+            # all correct, -1 nested
+            for ps in (
+                [[1, 2], [1, 1], [-1, 1]],
+                [[1, 2], [1, -1], [1, 1]],
+                [[2, -1], [1, 2], [1, 1]],
+            ):
+                compare(x, ps, pattern)
+
+            # asking for more elements, -1 nested
+            rejects(x, [[-1, 2], [1], [5]], pattern)
+            rejects(x, [[2, 2], [2], [5, -1]], pattern)
+
+            # asking for non-divisible number of elements
+            rejects(x, [[2, 1], [1], [3, -1]], pattern)
+            rejects(x, [[2, 1], [3, -1], [1]], pattern)
+            rejects(x, [[3, -1], [2, 1], [1]], pattern)
+
+            if check_zero_len:
+                # -1 takes zero
+                compare(x, [[2], [3], [-1]], pattern)
+                compare(x, [[0], [5], [-1]], pattern)
+                compare(x, [[0], [-1], [5]], pattern)
+                compare(x, [[-1], [5], [0]], pattern)
+
+                # -1 takes zero, -1
+                compare(x, [[2, -1], [1, 5]], pattern)
+
+
+def test_pack_unpack_array_api():
+    """Check einops.array_api pack/unpack against the default implementation (numpy >= 2 only)."""
+    from einops import array_api as AA
+    import numpy as xp
+
+    # compare the major version as a number: as strings, "10.0.0" would sort below "2.0.0"
+    if int(xp.__version__.split(".")[0]) < 2:
+        pytest.skip("requires numpy >= 2.0.0")
+
+    for case in cases:
+        pattern = case.pattern
+        x_np = np.random.random(case.shape)
+        x_xp = xp.from_dlpack(x_np)
+
+        for ps in [
+            [[2], [1], [2]],
+            [[1], [1], [-1]],
+            [[1], [1], [-1, 3]],
+            [[2, 1], [1, 1, 1], [-1]],
+        ]:
+            x_np_split = unpack(x_np, ps, pattern)
+            x_xp_split = AA.unpack(x_xp, ps, pattern)
+            for a, b in zip(x_np_split, x_xp_split):
+                assert np.allclose(a, AA.asnumpy(b + 0))
+            x_agg_np, ps1 = pack(x_np_split, pattern)
+            x_agg_xp, ps2 = AA.pack(x_xp_split, pattern)
+            assert ps1 == ps2
+            assert np.allclose(x_agg_np, AA.asnumpy(x_agg_xp))
+        # NOTE(review): only the default unpack is exercised below, AA.unpack is not; confirm intended
+        for ps in [
+            [[2, 3]],
+            [[1], [5]],
+            [[1], [5], [-1]],
+            [[1], [2, 3]],
+            [[1], [5], [-1, 2]],
+        ]:
+            with pytest.raises(BaseException):
+                unpack(x_np, ps, pattern)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/test_parsing.py b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/test_parsing.py
new file mode 100644
index 0000000000000000000000000000000000000000..04d0440a98ada5c0d93d00caf9f7a7dcc7631b9c
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/einops/tests/test_parsing.py
@@ -0,0 +1,126 @@
+import pytest
+
+from einops import EinopsError
+from einops.parsing import ParsedExpression, AnonymousAxis, _ellipsis
+
+__author__ = "Alex Rogozhnikov"
+
+
+class AnonymousAxisPlaceholder:  # test stand-in that compares equal to an AnonymousAxis of the same value
+    def __init__(self, value: int):
+        self.value = value
+        assert isinstance(self.value, int)  # only integer values are accepted
+
+    def __eq__(self, other):
+        return isinstance(other, AnonymousAxis) and self.value == other.value  # False for any other type
+
+
+def test_anonymous_axes():
+    """Two anonymous axes of equal value differ from each other, yet both match a placeholder."""
+    first, second = AnonymousAxis("2"), AnonymousAxis("2")
+    two, three = AnonymousAxisPlaceholder(2), AnonymousAxisPlaceholder(3)
+    assert first != second
+    assert first == two and second == two and first != three and second != three
+    assert [first, 2, second] == [two, 2, two]
+
+
+def test_elementary_axis_name():
+    """check_axis_name must accept every name in ``accepted`` and refuse every name in ``rejected``."""
+    accepted = [
+        "a",
+        "b",
+        "h",
+        "dx",
+        "h1",
+        "zz",
+        "i9123",
+        "somelongname",
+        "Alex",
+        "camelCase",
+        "u_n_d_e_r_score",
+        "unreasonablyLongAxisName",
+    ]
+    rejected = ["", "2b", "12", "_startWithUnderscore", "endWithUnderscore_", "_", "...", _ellipsis]
+    assert all(ParsedExpression.check_axis_name(name) for name in accepted)
+    assert not any(ParsedExpression.check_axis_name(name) for name in rejected)
+
+
+def test_invalid_expressions():
+    """Each group pairs one well-formed expression with variants that must raise EinopsError."""
+    groups = [
+        # double ellipsis should raise an error
+        ("... a b c d", [
+            "... a b c d ...",
+            "... a b c (d ...)",
+            "(... a) b c (d ...)",
+        ]),
+        # double/missing/enclosed parenthesis
+        ("(a) b c (d ...)", [
+            "(a)) b c (d ...)",
+            "(a b c (d ...)",
+            "(a) (()) b c (d ...)",
+            "(a) ((b c) (d ...))",
+        ]),
+        # invalid identifiers
+        ("camelCase under_scored cApiTaLs ß ...", [
+            "1a",
+            "_pre",
+            "...pre",
+            "pre...",
+        ]),
+    ]
+
+    for well_formed, malformed in groups:
+        # the well-formed expression of each group has to parse without raising
+        ParsedExpression(well_formed)
+        for expression in malformed:
+            with pytest.raises(EinopsError):
+                ParsedExpression(expression)
+
+
+def test_parse_expression():
+    """Check identifiers, composition and flag attributes of ParsedExpression on sample patterns."""
+    named = ParsedExpression("a1  b1   c1    d1")
+    assert named.identifiers == {"a1", "b1", "c1", "d1"}
+    assert named.composition == [["a1"], ["b1"], ["c1"], ["d1"]]
+    assert not named.has_non_unitary_anonymous_axes
+    assert not named.has_ellipsis
+
+    # empty groups and unit axes contribute no identifiers
+    for expression in ["() () () ()", "1 1 1 ()"]:
+        unit_only = ParsedExpression(expression)
+        assert unit_only.identifiers == set()
+        assert unit_only.composition == [[], [], [], []]
+        assert not unit_only.has_non_unitary_anonymous_axes
+        assert not unit_only.has_ellipsis
+
+    aap = AnonymousAxisPlaceholder
+
+    sized = ParsedExpression("5 (3 4)")
+    assert len(sized.identifiers) == 3 and {axis.value for axis in sized.identifiers} == {3, 4, 5}
+    assert sized.composition == [[aap(5)], [aap(3), aap(4)]]
+    assert sized.has_non_unitary_anonymous_axes
+    assert not sized.has_ellipsis
+
+    mixed = ParsedExpression("5 1 (1 4) 1")
+    assert len(mixed.identifiers) == 2 and {axis.value for axis in mixed.identifiers} == {4, 5}
+    assert mixed.composition == [[aap(5)], [], [aap(4)], []]
+
+    named_axes = {"name1", _ellipsis, "a1", "name2"}
+    # ellipsis outside parentheses
+    plain = ParsedExpression("name1 ... a1 12 (name2 14)")
+    assert len(plain.identifiers) == 6
+    assert len(plain.identifiers.difference(named_axes)) == 2
+    assert plain.composition == [["name1"], _ellipsis, ["a1"], [aap(12)], ["name2", aap(14)]]
+    assert plain.has_non_unitary_anonymous_axes
+    assert plain.has_ellipsis
+    assert not plain.has_ellipsis_parenthesized
+
+    # ellipsis inside parentheses
+    grouped = ParsedExpression("(name1 ... a1 12) name2 14")
+    assert len(grouped.identifiers) == 6
+    assert len(grouped.identifiers.difference(named_axes)) == 2
+    assert grouped.composition == [["name1", _ellipsis, "a1", aap(12)], ["name2"], [aap(14)]]
+    assert grouped.has_non_unitary_anonymous_axes
+    assert grouped.has_ellipsis
+    assert grouped.has_ellipsis_parenthesized
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..60f5cccd4e0f05a41e6585e2d1725d4759874b1e
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/_version.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/_version.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8a60dee051d1a5488c50b2ef28d284d590521821
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/_version.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/archive.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/archive.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7443574231f4e5d58e247f5a1d8a48f6dec4d30d
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/archive.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/asyn.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/asyn.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a9bb159704566df8880d1a13d865390cc2e13f6b
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/asyn.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/caching.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/caching.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8d8ab4806a5ffe0c2aa7d62e71cb2fc058061f83
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/caching.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/callbacks.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/callbacks.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8ba7546a822350267579320a3c56c0717349cbf3
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/callbacks.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/compression.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/compression.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f195dcfe31437756d7941e386e363683382ce753
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/compression.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/config.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/config.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dc13223cb3156cce2cf3b340f0cd355e29d291d1
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/config.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/conftest.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/conftest.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cdd74d5098c90b3714b52205cab0bae835f047da
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/conftest.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/core.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/core.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ea513a8c66914392cac534d0982faa92d98c3591
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/core.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/dircache.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/dircache.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..64e4ee0b3beba0895f6af692a34c5e0cca1ec6c7
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/dircache.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/exceptions.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/exceptions.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..02bb50e2a774864089de3568d1e57baf1a9caea0
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/exceptions.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/fuse.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/fuse.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ef01a62a5ef47e2f1ca8a631f83646cda8ae79bc
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/fuse.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/generic.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/generic.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..42c41cdd3fe79100eaeac7884be7a496e57b672b
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/generic.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/gui.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/gui.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4f5f74685d39d72a422822e18a6e4e2b09ccf252
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/gui.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/json.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/json.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aef02c1fb665620fc37beffe15d45cfb9edc0174
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/json.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/mapping.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/mapping.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..462ec2f10b8d5c716effffb759b2b0c818ec56b4
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/mapping.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/parquet.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/parquet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5b7eb330ed6ed9ae27d486eaf9ff997eda015467
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/parquet.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/registry.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/registry.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..82594851d6951fd7c8d0b2e13258f3da8d234722
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/registry.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/spec.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/spec.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..094249a643b7b6d8534af3c7a0044e817ed0645a
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/spec.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/transaction.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/transaction.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b078fa67a27ac5a21d71b4ca06386e5364d0c583
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/transaction.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c1ae22a9e5db17be0df34c7e38eb9da2d1f6ae81
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/__pycache__/utils.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..702ba8edd7e7b6bb246a14d15dace1266e2eb5eb
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/arrow.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/arrow.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b6120967b3f66c9ce1d0e2de4da110531bafe2f0
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/arrow.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/asyn_wrapper.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/asyn_wrapper.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e65f62d66751e20f682979c3020a0737fbdc3d04
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/asyn_wrapper.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/cache_mapper.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/cache_mapper.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8bdfdf86d082681e11218f545fbeb755e6fcf51e
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/cache_mapper.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/cache_metadata.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/cache_metadata.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5e27f1d2975f692713efe2acc90ffc1f5aa94518
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/cache_metadata.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/cached.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/cached.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..81886591e388efbdffb6f6146bb600385b7c35bf
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/cached.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/chained.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/chained.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd0e6af2b62d687a03cc5636ea52e3dc96f546d5
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/chained.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/dask.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/dask.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..957289deab2e142aba90b511a8112b73121c8f27
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/dask.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/data.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/data.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d826813cdffa4564a30579441fadda05e904c5a6
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/data.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/dbfs.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/dbfs.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc894450b398371403a947a164b339297d832ebb
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/dbfs.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/dirfs.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/dirfs.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8242394960b664ac040db48186552bb5f90f062d
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/dirfs.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/ftp.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/ftp.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..00013ef98d52313454255f9b6c7b6f428e16920d
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/ftp.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/gist.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/gist.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6135dd6a0ca3d893bc1f75884a8237f1f6749cae
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/gist.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/git.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/git.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..47937cc7075f23c9213932fe47cb40bd0ced6aa6
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/git.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/github.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/github.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f7d276bd203375c5bc88664726e966869fdc91d5
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/github.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/http.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/http.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f19a018c14182dd1e16176ed4ca90556afd722a
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/http.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/http_sync.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/http_sync.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e28b3a694f36ee28e86232dfe325e42911da886b
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/http_sync.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/jupyter.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/jupyter.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..70f007906b88c57563d15dd8da8de9b5e92f17cf
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/jupyter.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/libarchive.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/libarchive.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..57c8ca4b5b8a0537df9514eebe62fa685b00b8b3
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/libarchive.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/local.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/local.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8e36cca59a6f5182be4915254e3e77daf205a3d6
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/local.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/memory.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/memory.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bb65c148c88acbaf89123652446312aedd044a1d
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/memory.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/reference.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/reference.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..65bf2577a8cce58a388bc5942c106834f62ad0f9
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/reference.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/sftp.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/sftp.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..719cccdd6bd58e5225b1c6c7c8d7949852d9f06f
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/sftp.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/smb.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/smb.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eab3abebed79e3aa834fa18665dfd55141669cf4
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/smb.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/tar.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/tar.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8bd947137b89bbe41ac29bccdfc1be2fe87ef0e1
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/tar.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/webhdfs.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/webhdfs.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..46196542b53a25aecb185c1fbb9794a81f3ed03c
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/webhdfs.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/zip.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/zip.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..044b081c117351c9f96291c7144fa877df79d694
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/__pycache__/zip.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/arrow.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/arrow.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cd6dc6b1e6ed23ea4a631a1e32d4768bbf045c6
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/arrow.py
@@ -0,0 +1,310 @@
+import errno
+import io
+import os
+import secrets
+import shutil
+from contextlib import suppress
+from functools import cached_property, wraps
+from urllib.parse import parse_qs
+
+from fsspec.spec import AbstractFileSystem
+from fsspec.utils import (
+    get_package_version_without_import,
+    infer_storage_options,
+    mirror_from,
+    tokenize,
+)
+
+
+def wrap_exceptions(func):
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        try:
+            return func(*args, **kwargs)
+        except OSError as exception:
+            if not exception.args:
+                raise
+
+            message, *args = exception.args
+            if isinstance(message, str) and "does not exist" in message:
+                raise FileNotFoundError(errno.ENOENT, message) from exception
+            else:
+                raise
+
+    return wrapper
+
+
+# Version of the installed pyarrow package. Stays None until an ArrowFSWrapper
+# is constructed (its __init__ fills it in); ArrowFSWrapper._open splits it on
+# "." to read the major version.
+PYARROW_VERSION = None
+
+
+class ArrowFSWrapper(AbstractFileSystem):
+    """FSSpec-compatible wrapper of pyarrow.fs.FileSystem.
+
+    Parameters
+    ----------
+    fs : pyarrow.fs.FileSystem
+
+    """
+
+    root_marker = "/"
+
+    def __init__(self, fs, **kwargs):
+        global PYARROW_VERSION
+        PYARROW_VERSION = get_package_version_without_import("pyarrow")
+        self.fs = fs
+        super().__init__(**kwargs)
+
+    @property
+    def protocol(self):
+        return self.fs.type_name
+
+    @cached_property
+    def fsid(self):
+        return "hdfs_" + tokenize(self.fs.host, self.fs.port)
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        ops = infer_storage_options(path)
+        path = ops["path"]
+        if path.startswith("//"):
+            # special case for "hdfs://path" (without the triple slash)
+            path = path[1:]
+        return path
+
+    def ls(self, path, detail=False, **kwargs):
+        path = self._strip_protocol(path)
+        from pyarrow.fs import FileSelector
+
+        try:
+            entries = [
+                self._make_entry(entry)
+                for entry in self.fs.get_file_info(FileSelector(path))
+            ]
+        except (FileNotFoundError, NotADirectoryError):
+            entries = [self.info(path, **kwargs)]
+        if detail:
+            return entries
+        else:
+            return [entry["name"] for entry in entries]
+
+    def info(self, path, **kwargs):
+        path = self._strip_protocol(path)
+        [info] = self.fs.get_file_info([path])
+        return self._make_entry(info)
+
+    def exists(self, path):
+        path = self._strip_protocol(path)
+        try:
+            self.info(path)
+        except FileNotFoundError:
+            return False
+        else:
+            return True
+
+    def _make_entry(self, info):
+        from pyarrow.fs import FileType
+
+        if info.type is FileType.Directory:
+            kind = "directory"
+        elif info.type is FileType.File:
+            kind = "file"
+        elif info.type is FileType.NotFound:
+            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), info.path)
+        else:
+            kind = "other"
+
+        return {
+            "name": info.path,
+            "size": info.size,
+            "type": kind,
+            "mtime": info.mtime,
+        }
+
+    @wrap_exceptions
+    def cp_file(self, path1, path2, **kwargs):
+        path1 = self._strip_protocol(path1).rstrip("/")
+        path2 = self._strip_protocol(path2).rstrip("/")
+
+        with self._open(path1, "rb") as lstream:
+            tmp_fname = f"{path2}.tmp.{secrets.token_hex(6)}"
+            try:
+                with self.open(tmp_fname, "wb") as rstream:
+                    shutil.copyfileobj(lstream, rstream)
+                self.fs.move(tmp_fname, path2)
+            except BaseException:
+                with suppress(FileNotFoundError):
+                    self.fs.delete_file(tmp_fname)
+                raise
+
+    @wrap_exceptions
+    def mv(self, path1, path2, **kwargs):
+        path1 = self._strip_protocol(path1).rstrip("/")
+        path2 = self._strip_protocol(path2).rstrip("/")
+        self.fs.move(path1, path2)
+
+    @wrap_exceptions
+    def rm_file(self, path):
+        path = self._strip_protocol(path)
+        self.fs.delete_file(path)
+
+    @wrap_exceptions
+    def rm(self, path, recursive=False, maxdepth=None):
+        path = self._strip_protocol(path).rstrip("/")
+        if self.isdir(path):
+            if recursive:
+                self.fs.delete_dir(path)
+            else:
+                raise ValueError("Can't delete directories without recursive=False")
+        else:
+            self.fs.delete_file(path)
+
+    @wrap_exceptions
+    def _open(self, path, mode="rb", block_size=None, seekable=True, **kwargs):
+        if mode == "rb":
+            if seekable:
+                method = self.fs.open_input_file
+            else:
+                method = self.fs.open_input_stream
+        elif mode == "wb":
+            method = self.fs.open_output_stream
+        elif mode == "ab":
+            method = self.fs.open_append_stream
+        else:
+            raise ValueError(f"unsupported mode for Arrow filesystem: {mode!r}")
+
+        _kwargs = {}
+        if mode != "rb" or not seekable:
+            if int(PYARROW_VERSION.split(".")[0]) >= 4:
+                # disable compression auto-detection
+                _kwargs["compression"] = None
+        stream = method(path, **_kwargs)
+
+        return ArrowFile(self, stream, path, mode, block_size, **kwargs)
+
+    @wrap_exceptions
+    def mkdir(self, path, create_parents=True, **kwargs):
+        path = self._strip_protocol(path)
+        if create_parents:
+            self.makedirs(path, exist_ok=True)
+        else:
+            self.fs.create_dir(path, recursive=False)
+
+    @wrap_exceptions
+    def makedirs(self, path, exist_ok=False):
+        path = self._strip_protocol(path)
+        self.fs.create_dir(path, recursive=True)
+
+    @wrap_exceptions
+    def rmdir(self, path):
+        path = self._strip_protocol(path)
+        self.fs.delete_dir(path)
+
+    @wrap_exceptions
+    def modified(self, path):
+        path = self._strip_protocol(path)
+        return self.fs.get_file_info(path).mtime
+
+    def cat_file(self, path, start=None, end=None, **kwargs):
+        kwargs.setdefault("seekable", start not in [None, 0])
+        return super().cat_file(path, start=None, end=None, **kwargs)
+
+    def get_file(self, rpath, lpath, **kwargs):
+        kwargs.setdefault("seekable", False)
+        super().get_file(rpath, lpath, **kwargs)
+
+
+@mirror_from(
+    "stream",
+    [
+        "read",
+        "seek",
+        "tell",
+        "write",
+        "readable",
+        "writable",
+        "close",
+        "seekable",
+    ],
+)
+class ArrowFile(io.IOBase):
+    def __init__(self, fs, stream, path, mode, block_size=None, **kwargs):
+        self.path = path
+        self.mode = mode
+
+        self.fs = fs
+        self.stream = stream
+
+        self.blocksize = self.block_size = block_size
+        self.kwargs = kwargs
+
+    def __enter__(self):
+        return self
+
+    @property
+    def size(self):
+        return self.stream.size()
+
+    def __exit__(self, *args):
+        return self.close()
+
+
+class HadoopFileSystem(ArrowFSWrapper):
+    """A wrapper on top of the pyarrow.fs.HadoopFileSystem
+    to connect it's interface with fsspec"""
+
+    protocol = "hdfs"
+
+    def __init__(
+        self,
+        host="default",
+        port=0,
+        user=None,
+        kerb_ticket=None,
+        replication=3,
+        extra_conf=None,
+        **kwargs,
+    ):
+        """
+
+        Parameters
+        ----------
+        host: str
+            Hostname, IP or "default" to try to read from Hadoop config
+        port: int
+            Port to connect on, or default from Hadoop config if 0
+        user: str or None
+            If given, connect as this username
+        kerb_ticket: str or None
+            If given, use this ticket for authentication
+        replication: int
+            set replication factor of file for write operations. default value is 3.
+        extra_conf: None or dict
+            Passed on to HadoopFileSystem
+        """
+        from pyarrow.fs import HadoopFileSystem
+
+        fs = HadoopFileSystem(
+            host=host,
+            port=port,
+            user=user,
+            kerb_ticket=kerb_ticket,
+            replication=replication,
+            extra_conf=extra_conf,
+        )
+        super().__init__(fs=fs, **kwargs)
+
+    @staticmethod
+    def _get_kwargs_from_urls(path):
+        ops = infer_storage_options(path)
+        out = {}
+        if ops.get("host", None):
+            out["host"] = ops["host"]
+        if ops.get("username", None):
+            out["user"] = ops["username"]
+        if ops.get("port", None):
+            out["port"] = ops["port"]
+        if ops.get("url_query", None):
+            queries = parse_qs(ops["url_query"])
+            if queries.get("replication", None):
+                out["replication"] = int(queries["replication"][0])
+        return out
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/asyn_wrapper.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/asyn_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..91db5eb48d00e36b46d9deb49504a7d2ad76d690
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/asyn_wrapper.py
@@ -0,0 +1,124 @@
+import asyncio
+import functools
+import inspect
+
+import fsspec
+from fsspec.asyn import AsyncFileSystem, running_async
+
+from .chained import ChainedFileSystem
+
+
+def async_wrapper(func, obj=None, semaphore=None):
+    """
+    Wraps a synchronous function to make it awaitable.
+
+    Parameters
+    ----------
+    func : callable
+        The synchronous function to wrap.
+    obj : object, optional
+        The instance to bind the function to, if applicable.
+    semaphore : asyncio.Semaphore, optional
+        A semaphore to limit concurrent calls.
+
+    Returns
+    -------
+    coroutine
+        An awaitable version of the function.
+    """
+
+    @functools.wraps(func)
+    async def wrapper(*args, **kwargs):
+        if semaphore:
+            async with semaphore:
+                return await asyncio.to_thread(func, *args, **kwargs)
+        return await asyncio.to_thread(func, *args, **kwargs)
+
+    return wrapper
+
+
+class AsyncFileSystemWrapper(AsyncFileSystem, ChainedFileSystem):
+    """
+    A wrapper class to convert a synchronous filesystem into an asynchronous one.
+
+    This class takes an existing synchronous filesystem implementation and wraps all
+    its methods to provide an asynchronous interface.
+
+    Parameters
+    ----------
+    sync_fs : AbstractFileSystem
+        The synchronous filesystem instance to wrap.
+    """
+
+    protocol = "asyncwrapper", "async_wrapper"
+    cachable = False
+
+    def __init__(
+        self,
+        fs=None,
+        asynchronous=None,
+        target_protocol=None,
+        target_options=None,
+        semaphore=None,
+        max_concurrent_tasks=None,
+        **kwargs,
+    ):
+        if asynchronous is None:
+            asynchronous = running_async()
+        super().__init__(asynchronous=asynchronous, **kwargs)
+        if fs is not None:
+            self.sync_fs = fs
+        else:
+            self.sync_fs = fsspec.filesystem(target_protocol, **target_options)
+        self.protocol = self.sync_fs.protocol
+        self.semaphore = semaphore
+        self._wrap_all_sync_methods()
+
+    @property
+    def fsid(self):
+        return f"async_{self.sync_fs.fsid}"
+
+    def _wrap_all_sync_methods(self):
+        """
+        Wrap all synchronous methods of the underlying filesystem with asynchronous versions.
+        """
+        excluded_methods = {"open"}
+        for method_name in dir(self.sync_fs):
+            if method_name.startswith("_") or method_name in excluded_methods:
+                continue
+
+            attr = inspect.getattr_static(self.sync_fs, method_name)
+            if isinstance(attr, property):
+                continue
+
+            method = getattr(self.sync_fs, method_name)
+            if callable(method) and not inspect.iscoroutinefunction(method):
+                async_method = async_wrapper(method, obj=self, semaphore=self.semaphore)
+                setattr(self, f"_{method_name}", async_method)
+
+    @classmethod
+    def wrap_class(cls, sync_fs_class):
+        """
+        Create a new class that can be used to instantiate an AsyncFileSystemWrapper
+        with lazy instantiation of the underlying synchronous filesystem.
+
+        Parameters
+        ----------
+        sync_fs_class : type
+            The class of the synchronous filesystem to wrap.
+
+        Returns
+        -------
+        type
+            A new class that wraps the provided synchronous filesystem class.
+        """
+
+        class GeneratedAsyncFileSystemWrapper(cls):
+            def __init__(self, *args, **kwargs):
+                sync_fs = sync_fs_class(*args, **kwargs)
+                super().__init__(sync_fs)
+
+        GeneratedAsyncFileSystemWrapper.__name__ = (
+            f"Async{sync_fs_class.__name__}Wrapper"
+        )
+        return GeneratedAsyncFileSystemWrapper
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/cache_mapper.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/cache_mapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e7c7d88afdddf12f77b26bb635bd8bf1e2bd7f1
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/cache_mapper.py
@@ -0,0 +1,75 @@
+from __future__ import annotations
+
+import abc
+import hashlib
+
+from fsspec.implementations.local import make_path_posix
+
+
+class AbstractCacheMapper(abc.ABC):
+    """Abstract super-class for mappers from remote URLs to local cached
+    basenames.
+    """
+
+    @abc.abstractmethod
+    def __call__(self, path: str) -> str: ...
+
+    def __eq__(self, other: object) -> bool:
+        # Identity only depends on class. When derived classes have attributes
+        # they will need to be included.
+        return isinstance(other, type(self))
+
+    def __hash__(self) -> int:
+        # Identity only depends on class. When derived classes have attributes
+        # they will need to be included.
+        return hash(type(self))
+
+
+class BasenameCacheMapper(AbstractCacheMapper):
+    """Cache mapper that uses the basename of the remote URL and a fixed number
+    of directory levels above this.
+
+    The default is zero directory levels, meaning different paths with the same
+    basename will have the same cached basename.
+    """
+
+    def __init__(self, directory_levels: int = 0):
+        if directory_levels < 0:
+            raise ValueError(
+                "BasenameCacheMapper requires zero or positive directory_levels"
+            )
+        self.directory_levels = directory_levels
+
+        # Separator for directories when encoded as strings.
+        self._separator = "_@_"
+
+    def __call__(self, path: str) -> str:
+        path = make_path_posix(path)
+        prefix, *bits = path.rsplit("/", self.directory_levels + 1)
+        if bits:
+            return self._separator.join(bits)
+        else:
+            return prefix  # No separator found, simple filename
+
+    def __eq__(self, other: object) -> bool:
+        return super().__eq__(other) and self.directory_levels == other.directory_levels
+
+    def __hash__(self) -> int:
+        return super().__hash__() ^ hash(self.directory_levels)
+
+
+class HashCacheMapper(AbstractCacheMapper):
+    """Cache mapper that uses a hash of the remote URL."""
+
+    def __call__(self, path: str) -> str:
+        return hashlib.sha256(path.encode()).hexdigest()
+
+
+def create_cache_mapper(same_names: bool) -> AbstractCacheMapper:
+    """Factory method to create cache mapper for backward compatibility with
+    ``CachingFileSystem`` constructor using ``same_names`` kwarg.
+    """
+    if same_names:
+        return BasenameCacheMapper()
+    else:
+        return HashCacheMapper()
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/cache_metadata.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/cache_metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d1f7eb7f846186606921ff6a1539442a0899506
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/cache_metadata.py
@@ -0,0 +1,231 @@
+from __future__ import annotations
+
+import os
+import pickle
+import time
+from typing import TYPE_CHECKING
+
+from fsspec.utils import atomic_write
+
+try:
+    import ujson as json
+except ImportError:
+    if not TYPE_CHECKING:
+        import json
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+    from typing import Any, Literal, TypeAlias
+
+    from .cached import CachingFileSystem
+
+    Detail: TypeAlias = dict[str, Any]
+
+
+class CacheMetadata:
+    """Cache metadata.
+
+    All reading and writing of cache metadata is performed by this class,
+    accessing the cached files and blocks is not.
+
+    Metadata is stored in a single file per storage directory in JSON format.
+    For backward compatibility, also reads metadata stored in pickle format
+    which is converted to JSON when next saved.
+    """
+
+    def __init__(self, storage: list[str]):
+        """
+
+        Parameters
+        ----------
+        storage: list[str]
+            Directories containing cached files, must be at least one. Metadata
+            is stored in the last of these directories by convention.
+        """
+        if not storage:
+            raise ValueError("CacheMetadata expects at least one storage location")
+
+        self._storage = storage
+        self.cached_files: list[Detail] = [{}]
+
+        # Private attribute to force saving of metadata in pickle format rather than
+        # JSON for use in tests to confirm can read both pickle and JSON formats.
+        self._force_save_pickle = False
+
+    def _load(self, fn: str) -> Detail:
+        """Low-level function to load metadata from specific file"""
+        try:
+            with open(fn, "r") as f:
+                loaded = json.load(f)
+        except ValueError:
+            with open(fn, "rb") as f:
+                loaded = pickle.load(f)
+        for c in loaded.values():
+            if isinstance(c.get("blocks"), list):
+                c["blocks"] = set(c["blocks"])
+        return loaded
+
+    def _save(self, metadata_to_save: Detail, fn: str) -> None:
+        """Low-level function to save metadata to specific file"""
+        if self._force_save_pickle:
+            with atomic_write(fn) as f:
+                pickle.dump(metadata_to_save, f)
+        else:
+            with atomic_write(fn, mode="w") as f:
+                json.dump(metadata_to_save, f)
+
+    def _scan_locations(
+        self, writable_only: bool = False
+    ) -> Iterator[tuple[str, str, bool]]:
+        """Yield locations (filenames) where metadata is stored, and whether
+        writable or not.
+
+        Parameters
+        ----------
+        writable_only: bool
+            Set to True to only yield writable locations.
+
+        Returns
+        -------
+        Yields (str, str, bool)
+        """
+        n = len(self._storage)
+        for i, storage in enumerate(self._storage):
+            writable = i == n - 1
+            if writable_only and not writable:
+                continue
+            yield os.path.join(storage, "cache"), storage, writable
+
+    def check_file(
+        self, path: str, cfs: CachingFileSystem | None
+    ) -> Literal[False] | tuple[Detail, str]:
+        """If path is in cache return its details, otherwise return ``False``.
+
+        If the optional CachingFileSystem is specified then it is used to
+        perform extra checks to reject possible matches, such as if they are
+        too old.
+        """
+        for (fn, base, _), cache in zip(self._scan_locations(), self.cached_files):
+            if path not in cache:
+                continue
+            detail = cache[path].copy()
+
+            if cfs is not None:
+                if cfs.check_files and detail["uid"] != cfs.fs.ukey(path):
+                    # Wrong file as determined by hash of file properties
+                    continue
+                if cfs.expiry and time.time() - detail["time"] > cfs.expiry:
+                    # Cached file has expired
+                    continue
+
+            fn = os.path.join(base, detail["fn"])
+            if os.path.exists(fn):
+                return detail, fn
+        return False
+
+    def clear_expired(self, expiry_time: int) -> tuple[list[str], bool]:
+        """Remove expired metadata from the cache.
+
+        Returns names of files corresponding to expired metadata and a boolean
+        flag indicating whether the writable cache is empty. Caller is
+        responsible for deleting the expired files.
+        """
+        expired_files = []
+        for path, detail in self.cached_files[-1].copy().items():
+            if time.time() - detail["time"] > expiry_time:
+                fn = detail.get("fn", "")
+                if not fn:
+                    raise RuntimeError(
+                        f"Cache metadata does not contain 'fn' for {path}"
+                    )
+                fn = os.path.join(self._storage[-1], fn)
+                expired_files.append(fn)
+                self.cached_files[-1].pop(path)
+
+        if self.cached_files[-1]:
+            cache_path = os.path.join(self._storage[-1], "cache")
+            self._save(self.cached_files[-1], cache_path)
+
+        writable_cache_empty = not self.cached_files[-1]
+        return expired_files, writable_cache_empty
+
+    def load(self) -> None:
+        """Load all metadata from disk and store in ``self.cached_files``"""
+        cached_files = []
+        for fn, _, _ in self._scan_locations():
+            if os.path.exists(fn):
+                # TODO: consolidate blocks here
+                cached_files.append(self._load(fn))
+            else:
+                cached_files.append({})
+        self.cached_files = cached_files or [{}]
+
+    def on_close_cached_file(self, f: Any, path: str) -> None:
+        """Perform side-effect actions on closing a cached file.
+
+        The actual closing of the file is the responsibility of the caller.
+        """
+        # File must be writable, so in self.cached_files[-1]
+        c = self.cached_files[-1][path]
+        if c["blocks"] is not True and len(c["blocks"]) * f.blocksize >= f.size:
+            c["blocks"] = True
+
+    def pop_file(self, path: str) -> str | None:
+        """Remove metadata of cached file.
+
+        If path is in the cache, return the filename of the cached file,
+        otherwise return ``None``.  Caller is responsible for deleting the
+        cached file.
+        """
+        details = self.check_file(path, None)
+        if not details:
+            return None
+        _, fn = details
+        if fn.startswith(self._storage[-1]):
+            self.cached_files[-1].pop(path)
+            self.save()
+        else:
+            raise PermissionError(
+                "Can only delete cached file in last, writable cache location"
+            )
+        return fn
+
+    def save(self) -> None:
+        """Save metadata to disk"""
+        for (fn, _, writable), cache in zip(self._scan_locations(), self.cached_files):
+            if not writable:
+                continue
+
+            if os.path.exists(fn):
+                cached_files = self._load(fn)
+                for k, c in cached_files.items():
+                    if k in cache:
+                        if c["blocks"] is True or cache[k]["blocks"] is True:
+                            c["blocks"] = True
+                        else:
+                            # self.cached_files[*][*]["blocks"] must continue to
+                            # point to the same set object so that updates
+                            # performed by MMapCache are propagated back to
+                            # self.cached_files.
+                            blocks = cache[k]["blocks"]
+                            blocks.update(c["blocks"])
+                            c["blocks"] = blocks
+                        c["time"] = max(c["time"], cache[k]["time"])
+                        c["uid"] = cache[k]["uid"]
+
+                # Files can be added to cache after it was written once
+                for k, c in cache.items():
+                    if k not in cached_files:
+                        cached_files[k] = c
+            else:
+                cached_files = cache
+            cache = {k: v.copy() for k, v in cached_files.items()}
+            for c in cache.values():
+                if isinstance(c["blocks"], set):
+                    c["blocks"] = list(c["blocks"])
+            self._save(cache, fn)
+            self.cached_files[-1] = cached_files
+
+    def update_file(self, path: str, detail: Detail) -> None:
+        """Update metadata for specific file in memory, do not save"""
+        self.cached_files[-1][path] = detail
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/cached.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/cached.py
new file mode 100644
index 0000000000000000000000000000000000000000..a55888bdc8f738c937e61ec7df040b2313b20073
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/cached.py
@@ -0,0 +1,1003 @@
+from __future__ import annotations
+
+import inspect
+import logging
+import os
+import tempfile
+import time
+import weakref
+from collections.abc import Callable
+from shutil import rmtree
+from typing import TYPE_CHECKING, Any, ClassVar
+
+from fsspec import filesystem
+from fsspec.callbacks import DEFAULT_CALLBACK
+from fsspec.compression import compr
+from fsspec.core import BaseCache, MMapCache
+from fsspec.exceptions import BlocksizeMismatchError
+from fsspec.implementations.cache_mapper import create_cache_mapper
+from fsspec.implementations.cache_metadata import CacheMetadata
+from fsspec.implementations.chained import ChainedFileSystem
+from fsspec.implementations.local import LocalFileSystem
+from fsspec.spec import AbstractBufferedFile
+from fsspec.transaction import Transaction
+from fsspec.utils import infer_compression
+
+if TYPE_CHECKING:
+    from fsspec.implementations.cache_mapper import AbstractCacheMapper
+
+logger = logging.getLogger("fsspec.cached")
+
+
+class WriteCachedTransaction(Transaction):
+    def complete(self, commit=True):
+        rpaths = [f.path for f in self.files]
+        lpaths = [f.fn for f in self.files]
+        if commit:
+            self.fs.put(lpaths, rpaths)
+        self.files.clear()
+        self.fs._intrans = False
+        self.fs._transaction = None
+        self.fs = None  # break cycle
+
+
+class CachingFileSystem(ChainedFileSystem):
+    """Locally caching filesystem, layer over any other FS
+
+    This class implements chunk-wise local storage of remote files, for quick
+    access after the initial download. The files are stored in a given
+    directory with hashes of URLs for the filenames. If no directory is given,
+    a temporary one is used, which should be cleaned up by the OS after the
+    process ends. The files themselves are sparse (as implemented in
+    :class:`~fsspec.caching.MMapCache`), so only the data which is accessed
+    takes up space.
+
+    Restrictions:
+
+    - the block-size must be the same for each access of a given file, unless
+      all blocks of the file have already been read
+    - caching can only be applied to file-systems which produce files
+      derived from fsspec.spec.AbstractBufferedFile ; LocalFileSystem is also
+      allowed, for testing
+    """
+
+    protocol: ClassVar[str | tuple[str, ...]] = ("blockcache", "cached")
+    _strip_tokenize_options = ("fo",)
+
+    def __init__(
+        self,
+        target_protocol=None,
+        cache_storage="TMP",
+        cache_check=10,
+        check_files=False,
+        expiry_time=604800,
+        target_options=None,
+        fs=None,
+        same_names: bool | None = None,
+        compression=None,
+        cache_mapper: AbstractCacheMapper | None = None,
+        **kwargs,
+    ):
+        """
+
+        Parameters
+        ----------
+        target_protocol: str (optional)
+            Target filesystem protocol. Provide either this or ``fs``.
+        cache_storage: str or list(str)
+            Location to store files. If "TMP", this is a temporary directory,
+            and will be cleaned up by the OS when this process ends (or later).
+            If a list, each location will be tried in the order given, but
+            only the last will be considered writable.
+        cache_check: int
+            Number of seconds between reload of cache metadata
+        check_files: bool
+            Whether to explicitly see if the UID of the remote file matches
+            the stored one before using. Warning: some file systems such as
+            HTTP cannot reliably give a unique hash of the contents of some
+            path, so be sure to set this option to False.
+        expiry_time: int
+            The time in seconds after which a local copy is considered useless.
+            Set to falsy to prevent expiry. The default is equivalent to one
+            week.
+        target_options: dict or None
+            Passed to the instantiation of the FS, if fs is None.
+        fs: filesystem instance
+            The target filesystem to run against. Provide this or ``target_protocol``.
+        same_names: bool (optional)
+            By default, target URLs are hashed using a ``HashCacheMapper`` so
+            that files from different backends with the same basename do not
+            conflict. If this argument is ``True``, a ``BasenameCacheMapper``
+            is used instead. Other cache mapper options are available by using
+            the ``cache_mapper`` keyword argument. Only one of this and
+            ``cache_mapper`` should be specified.
+        compression: str (optional)
+            To decompress on download. Can be 'infer' (guess from the URL name),
+            one of the entries in ``fsspec.compression.compr``, or None for no
+            decompression.
+        cache_mapper: AbstractCacheMapper (optional)
+            The object used to map from original filenames to cached filenames.
+            Only one of this and ``same_names`` should be specified.
+        """
+        super().__init__(**kwargs)
+        if fs is None and target_protocol is None:
+            raise ValueError(
+                "Please provide filesystem instance(fs) or target_protocol"
+            )
+        if not (fs is None) ^ (target_protocol is None):
+            raise ValueError(
+                "Both filesystems (fs) and target_protocol may not be both given."
+            )
+        if cache_storage == "TMP":
+            tempdir = tempfile.mkdtemp()
+            storage = [tempdir]
+            weakref.finalize(self, self._remove_tempdir, tempdir)
+        else:
+            if isinstance(cache_storage, str):
+                storage = [cache_storage]
+            else:
+                storage = cache_storage
+        os.makedirs(storage[-1], exist_ok=True)
+        self.storage = storage
+        self.kwargs = target_options or {}
+        self.cache_check = cache_check
+        self.check_files = check_files
+        self.expiry = expiry_time
+        self.compression = compression
+
+        # Size of cache in bytes. If None then the size is unknown and will be
+        # recalculated the next time cache_size() is called. On writes to the
+        # cache this is reset to None.
+        self._cache_size = None
+
+        if same_names is not None and cache_mapper is not None:
+            raise ValueError(
+                "Cannot specify both same_names and cache_mapper in "
+                "CachingFileSystem.__init__"
+            )
+        if cache_mapper is not None:
+            self._mapper = cache_mapper
+        else:
+            self._mapper = create_cache_mapper(
+                same_names if same_names is not None else False
+            )
+
+        self.target_protocol = (
+            target_protocol
+            if isinstance(target_protocol, str)
+            else (fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0])
+        )
+        self._metadata = CacheMetadata(self.storage)
+        self.load_cache()
+        self.fs = fs if fs is not None else filesystem(target_protocol, **self.kwargs)
+
+        def _strip_protocol(path):
+            # acts as a method, since each instance has a different target
+            return self.fs._strip_protocol(type(self)._strip_protocol(path))
+
+        self._strip_protocol: Callable = _strip_protocol
+
+    @staticmethod
+    def _remove_tempdir(tempdir):
+        try:
+            rmtree(tempdir)
+        except Exception:
+            pass
+
+    def _mkcache(self):
+        os.makedirs(self.storage[-1], exist_ok=True)
+
+    def cache_size(self):
+        """Return size of cache in bytes.
+
+        If more than one cache directory is in use, only the size of the last
+        one (the writable cache directory) is returned.
+        """
+        if self._cache_size is None:
+            cache_dir = self.storage[-1]
+            self._cache_size = filesystem("file").du(cache_dir, withdirs=True)
+        return self._cache_size
+
+    def load_cache(self):
+        """Read set of stored blocks from file"""
+        self._metadata.load()
+        self._mkcache()
+        self.last_cache = time.time()
+
+    def save_cache(self):
+        """Save set of stored blocks to file"""
+        self._mkcache()
+        self._metadata.save()
+        self.last_cache = time.time()
+        self._cache_size = None
+
+    def _check_cache(self):
+        """Reload caches if time elapsed or any disappeared"""
+        self._mkcache()
+        if not self.cache_check:
+            # explicitly told not to bother checking
+            return
+        timecond = time.time() - self.last_cache > self.cache_check
+        existcond = all(os.path.exists(storage) for storage in self.storage)
+        if timecond or not existcond:
+            self.load_cache()
+
+    def _check_file(self, path):
+        """Is path in cache and still valid"""
+        path = self._strip_protocol(path)
+        self._check_cache()
+        return self._metadata.check_file(path, self)
+
+    def clear_cache(self):
+        """Remove all files and metadata from the cache
+
+        In the case of multiple cache locations, this clears only the last one,
+        which is assumed to be the read/write one.
+        """
+        rmtree(self.storage[-1])
+        self.load_cache()
+        self._cache_size = None
+
+    def clear_expired_cache(self, expiry_time=None):
+        """Remove all expired files and metadata from the cache
+
+        In the case of multiple cache locations, this clears only the last one,
+        which is assumed to be the read/write one.
+
+        Parameters
+        ----------
+        expiry_time: int
+            The time in seconds after which a local copy is considered useless.
+            If not defined the default is equivalent to the attribute from the
+            file caching instantiation.
+        """
+
+        if not expiry_time:
+            expiry_time = self.expiry
+
+        self._check_cache()
+
+        expired_files, writable_cache_empty = self._metadata.clear_expired(expiry_time)
+        for fn in expired_files:
+            if os.path.exists(fn):
+                os.remove(fn)
+
+        if writable_cache_empty:
+            rmtree(self.storage[-1])
+            self.load_cache()
+
+        self._cache_size = None
+
+    def pop_from_cache(self, path):
+        """Remove cached version of given file
+
+        Deletes local copy of the given (remote) path. If it is found in a cache
+        location which is not the last, it is assumed to be read-only, and
+        raises PermissionError
+        """
+        path = self._strip_protocol(path)
+        fn = self._metadata.pop_file(path)
+        if fn is not None:
+            os.remove(fn)
+        self._cache_size = None
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=True,
+        cache_options=None,
+        **kwargs,
+    ):
+        """Wrap the target _open
+
+        If the whole file exists in the cache, just open it locally and
+        return that.
+
+        Otherwise, open the file on the target FS, and make it have a mmap
+        cache pointing to the location which we determine, in our cache.
+        The ``blocks`` instance is shared, so as the mmap cache instance
+        updates, so does the entry in our ``cached_files`` attribute.
+        We monkey-patch this file, so that when it closes, we call
+        ``close_and_update`` to save the state of the blocks.
+        """
+        path = self._strip_protocol(path)
+
+        path = self.fs._strip_protocol(path)
+        if "r" not in mode:
+            return self.fs._open(
+                path,
+                mode=mode,
+                block_size=block_size,
+                autocommit=autocommit,
+                cache_options=cache_options,
+                **kwargs,
+            )
+        detail = self._check_file(path)
+        if detail:
+            # file is in cache
+            detail, fn = detail
+            hash, blocks = detail["fn"], detail["blocks"]
+            if blocks is True:
+                # stored file is complete
+                logger.debug("Opening local copy of %s", path)
+                return open(fn, mode)
+            # TODO: action where partial file exists in read-only cache
+            logger.debug("Opening partially cached copy of %s", path)
+        else:
+            hash = self._mapper(path)
+            fn = os.path.join(self.storage[-1], hash)
+            blocks = set()
+            detail = {
+                "original": path,
+                "fn": hash,
+                "blocks": blocks,
+                "time": time.time(),
+                "uid": self.fs.ukey(path),
+            }
+            self._metadata.update_file(path, detail)
+            logger.debug("Creating local sparse file for %s", path)
+
+        # explicitly submitting the size to the open call will avoid extra
+        # operations when opening. This is particularly relevant
+        # for any file that is read over a network, e.g. S3.
+        size = detail.get("size")
+
+        # call target filesystems open
+        self._mkcache()
+        f = self.fs._open(
+            path,
+            mode=mode,
+            block_size=block_size,
+            autocommit=autocommit,
+            cache_options=cache_options,
+            cache_type="none",
+            size=size,
+            **kwargs,
+        )
+
+        # set size if not already set
+        if size is None:
+            detail["size"] = f.size
+            self._metadata.update_file(path, detail)
+
+        if self.compression:
+            comp = (
+                infer_compression(path)
+                if self.compression == "infer"
+                else self.compression
+            )
+            f = compr[comp](f, mode="rb")
+        if "blocksize" in detail:
+            if detail["blocksize"] != f.blocksize:
+                raise BlocksizeMismatchError(
+                    f"Cached file must be reopened with same block"
+                    f" size as original (old: {detail['blocksize']},"
+                    f" new {f.blocksize})"
+                )
+        else:
+            detail["blocksize"] = f.blocksize
+
+        def _fetch_ranges(ranges):
+            return self.fs.cat_ranges(
+                [path] * len(ranges),
+                [r[0] for r in ranges],
+                [r[1] for r in ranges],
+                **kwargs,
+            )
+
+        multi_fetcher = None if self.compression else _fetch_ranges
+        f.cache = MMapCache(
+            f.blocksize, f._fetch_range, f.size, fn, blocks, multi_fetcher=multi_fetcher
+        )
+        close = f.close
+        f.close = lambda: self.close_and_update(f, close)
+        self.save_cache()
+        return f
+
+    def _parent(self, path):
+        return self.fs._parent(path)
+
+    def hash_name(self, path: str, *args: Any) -> str:
+        # Kept for backward compatibility with downstream libraries.
+        # Ignores extra arguments, previously same_name boolean.
+        return self._mapper(path)
+
+    def close_and_update(self, f, close):
+        """Called when a file is closing, so store the set of blocks"""
+        if f.closed:
+            return
+        path = self._strip_protocol(f.path)
+        self._metadata.on_close_cached_file(f, path)
+        try:
+            logger.debug("going to save")
+            self.save_cache()
+            logger.debug("saved")
+        except OSError:
+            logger.debug("Cache saving failed while closing file")
+        except NameError:
+            logger.debug("Cache save failed due to interpreter shutdown")
+        close()
+        f.closed = True
+
+    def ls(self, path, detail=True):
+        return self.fs.ls(path, detail)
+
+    def __getattribute__(self, item):
+        if item in {
+            "load_cache",
+            "_open",
+            "save_cache",
+            "close_and_update",
+            "__init__",
+            "__getattribute__",
+            "__reduce__",
+            "_make_local_details",
+            "open",
+            "cat",
+            "cat_file",
+            "_cat_file",
+            "cat_ranges",
+            "_cat_ranges",
+            "get",
+            "read_block",
+            "tail",
+            "head",
+            "info",
+            "ls",
+            "exists",
+            "isfile",
+            "isdir",
+            "_check_file",
+            "_check_cache",
+            "_mkcache",
+            "clear_cache",
+            "clear_expired_cache",
+            "pop_from_cache",
+            "local_file",
+            "_paths_from_path",
+            "get_mapper",
+            "open_many",
+            "commit_many",
+            "hash_name",
+            "__hash__",
+            "__eq__",
+            "to_json",
+            "to_dict",
+            "cache_size",
+            "pipe_file",
+            "pipe",
+            "start_transaction",
+            "end_transaction",
+        }:
+            # all the methods defined in this class. Note `open` here, since
+            # it calls `_open`, but is actually in superclass
+            return lambda *args, **kw: getattr(type(self), item).__get__(self)(
+                *args, **kw
+            )
+        if item in ["__reduce_ex__"]:
+            raise AttributeError
+        if item in ["transaction"]:
+            # property
+            return type(self).transaction.__get__(self)
+        if item in {"_cache", "transaction_type", "protocol"}:
+            # class attributes
+            return getattr(type(self), item)
+        if item == "__class__":
+            return type(self)
+        d = object.__getattribute__(self, "__dict__")
+        fs = d.get("fs", None)  # fs is not immediately defined
+        if item in d:
+            return d[item]
+        elif fs is not None:
+            if item in fs.__dict__:
+                # attribute of instance
+                return fs.__dict__[item]
+            # attribute belonging to the target filesystem
+            cls = type(fs)
+            m = getattr(cls, item)
+            if (inspect.isfunction(m) or inspect.isdatadescriptor(m)) and (
+                not hasattr(m, "__self__") or m.__self__ is None
+            ):
+                # instance method
+                return m.__get__(fs, cls)
+            return m  # class method or attribute
+        else:
+            # attributes of the superclass, while target is being set up
+            return super().__getattribute__(item)
+
+    def __eq__(self, other):
+        """Test for equality."""
+        if self is other:
+            return True
+        if not isinstance(other, type(self)):
+            return False
+        return (
+            self.storage == other.storage
+            and self.kwargs == other.kwargs
+            and self.cache_check == other.cache_check
+            and self.check_files == other.check_files
+            and self.expiry == other.expiry
+            and self.compression == other.compression
+            and self._mapper == other._mapper
+            and self.target_protocol == other.target_protocol
+        )
+
+    def __hash__(self):
+        """Calculate hash."""
+        return (
+            hash(tuple(self.storage))
+            ^ hash(str(self.kwargs))
+            ^ hash(self.cache_check)
+            ^ hash(self.check_files)
+            ^ hash(self.expiry)
+            ^ hash(self.compression)
+            ^ hash(self._mapper)
+            ^ hash(self.target_protocol)
+        )
+
+
+class WholeFileCacheFileSystem(CachingFileSystem):
+    """Caches whole remote files on first access
+
+    This class is intended as a layer over any other file system, and
+    will make a local copy of each file accessed, so that all subsequent
+    reads are local. This is similar to ``CachingFileSystem``, but without
+    the block-wise functionality and so can work even when sparse files
+    are not allowed. See its docstring for definition of the init
+    arguments.
+
+    The class still needs access to the remote store for listing files,
+    and may refresh cached files.
+    """
+
+    protocol = "filecache"
+    local_file = True
+
+    def open_many(self, open_files, **kwargs):
+        """Open a batch of files, fetching uncached ones in a single ``get``.
+
+        Parameters
+        ----------
+        open_files:
+            Iterable of objects with a ``path`` attribute; the container
+            itself must provide ``mode``.
+        **kwargs:
+            Passed to ``LocalTempFile`` in non-read modes; not used when
+            reading.
+
+        Returns
+        -------
+        list
+            ``LocalTempFile`` objects when ``mode`` contains no "r",
+            otherwise local file objects opened on the cached copies.
+
+        Raises
+        ------
+        NotImplementedError
+            When reading and ``self.compression`` is set.
+        """
+        paths = [of.path for of in open_files]
+        if "r" not in open_files.mode:
+            # write modes: one temp file per path in the last storage dir
+            return [
+                LocalTempFile(
+                    self.fs,
+                    path,
+                    mode=open_files.mode,
+                    fn=os.path.join(self.storage[-1], self._mapper(path)),
+                    **kwargs,
+                )
+                for path in paths
+            ]
+
+        self._mkcache()
+        if self.compression:
+            raise NotImplementedError
+        details = [self._check_file(sp) for sp in paths]
+        downpath = [p for p, d in zip(paths, details) if not d]
+        downfn0 = [
+            os.path.join(self.storage[-1], self._mapper(p)) for p in paths
+        ]  # keep these path names for opening later
+        downfn = [fn for fn, d in zip(downfn0, details) if not d]
+        if downpath:
+            # skip if all files are already cached and up to date
+            self.fs.get(downpath, downfn)
+
+            # update metadata - only happens when downloads are successful
+            newdetail = [
+                {
+                    "original": path,
+                    "fn": self._mapper(path),
+                    "blocks": True,
+                    "time": time.time(),
+                    "uid": self.fs.ukey(path),
+                }
+                for path in downpath
+            ]
+            for path, detail in zip(downpath, newdetail):
+                self._metadata.update_file(path, detail)
+            self.save_cache()
+
+        def firstpart(fn):
+            # helper to adapt both whole-file and simple-cache
+            return fn[1] if isinstance(fn, tuple) else fn
+
+        # a truthy check result points at an existing cached copy; otherwise
+        # use the filename that was just downloaded to
+        return [
+            open(firstpart(fn0) if fn0 else fn1, mode=open_files.mode)
+            for fn0, fn1 in zip(details, downfn0)
+        ]
+
+    def commit_many(self, open_files):
+        """Upload all temp files with one ``put``, then close and delete them.
+
+        Parameters
+        ----------
+        open_files:
+            Iterable of objects exposing ``fn`` (local name), ``path``
+            (remote name), ``name`` and ``close()``.
+        """
+        self.fs.put([f.fn for f in open_files], [f.path for f in open_files])
+        for f in open_files:
+            f.close()
+        for f in open_files:
+            # in case autocommit is off, and so close did not already delete
+            try:
+                os.remove(f.name)
+            except FileNotFoundError:
+                pass
+        # force the cache size to be recomputed on next request
+        self._cache_size = None
+
+    def _make_local_details(self, path):
+        """Record metadata for ``path`` and return its local cache filename.
+
+        The metadata entry is written before any download happens; the
+        caller is responsible for actually fetching the file to the returned
+        location.
+        """
+        sha = self._mapper(path)
+        fn = os.path.join(self.storage[-1], sha)
+        detail = {
+            "original": path,
+            "fn": sha,
+            "blocks": True,
+            "time": time.time(),
+            "uid": self.fs.ukey(path),
+        }
+        self._metadata.update_file(path, detail)
+        logger.debug("Copying %s to local cache", path)
+        return fn
+
+    def cat(
+        self,
+        path,
+        recursive=False,
+        on_error="raise",
+        callback=DEFAULT_CALLBACK,
+        **kwargs,
+    ):
+        """Return the contents of one or more files, reading via the cache.
+
+        Parameters
+        ----------
+        path:
+            Path or paths, expanded with ``self.expand_path``.
+        recursive: bool
+            Passed to ``expand_path``.
+        on_error: str
+            "raise" re-raises a failure while preparing a path; "return"
+            stores the exception as that path's value; any other value
+            drops the path from the result.
+        callback:
+            Receives the number of paths and one update per file read.
+        **kwargs:
+            Only ``maxdepth`` is read, and passed to ``expand_path``.
+
+        Returns
+        -------
+        bytes or dict
+            Bytes when ``path`` is a string that expanded to exactly one
+            path and ``recursive`` is False; otherwise a dict keyed by path.
+        """
+        paths = self.expand_path(
+            path, recursive=recursive, maxdepth=kwargs.get("maxdepth")
+        )
+        getpaths = []
+        storepaths = []
+        fns = []
+        out = {}
+        # iterate over a copy, since failing paths are removed from ``paths``
+        for p in paths.copy():
+            try:
+                detail = self._check_file(p)
+                if not detail:
+                    fn = self._make_local_details(p)
+                    getpaths.append(p)
+                    storepaths.append(fn)
+                else:
+                    # whole-file check gives (detail, fn); simple-cache only fn
+                    detail, fn = detail if isinstance(detail, tuple) else (None, detail)
+                fns.append(fn)
+            except Exception as e:
+                if on_error == "raise":
+                    raise
+                if on_error == "return":
+                    out[p] = e
+                paths.remove(p)
+
+        if getpaths:
+            self.fs.get(getpaths, storepaths)
+            self.save_cache()
+
+        callback.set_size(len(paths))
+        for p, fn in zip(paths, fns):
+            with open(fn, "rb") as f:
+                out[p] = f.read()
+            callback.relative_update(1)
+        if isinstance(path, str) and len(paths) == 1 and recursive is False:
+            out = out[paths[0]]
+        return out
+
+    def _open(self, path, mode="rb", **kwargs):
+        """Open ``path``, downloading the whole file first if not cached.
+
+        In non-read modes a ``LocalTempFile`` in the last storage directory
+        is returned. In read modes the cached local copy is opened; when
+        there is none, the file is fetched (decompressing on the way if
+        ``self.compression`` is set) and this method calls itself again.
+
+        Raises
+        ------
+        ValueError
+            If the metadata marks the file as only partially (block-wise)
+            cached.
+        """
+        path = self._strip_protocol(path)
+        if "r" not in mode:
+            sha = self._mapper(path)
+            fn = os.path.join(self.storage[-1], sha)
+            user_specified_kwargs = {
+                k: v
+                for k, v in kwargs.items()
+                # those kwargs were added by open(), we don't want them
+                if k not in ["autocommit", "block_size", "cache_options"]
+            }
+            return LocalTempFile(self, path, mode=mode, fn=fn, **user_specified_kwargs)
+        detail = self._check_file(path)
+        if detail:
+            detail, fn = detail
+            blocks = detail["blocks"]
+            if blocks is not True:
+                raise ValueError(
+                    f"Attempt to open partially cached file {path}"
+                    f" as a wholly cached file"
+                )
+            logger.debug("Opening local copy of %s", path)
+
+            # In order to support downstream filesystems to be able to
+            # infer the compression from the original filename, like
+            # the `TarFileSystem`, let's extend the `io.BufferedReader`
+            # fileobject protocol by adding a dedicated attribute
+            # `original`.
+            f = open(fn, mode)
+            f.original = detail.get("original")
+            return f
+
+        fn = self._make_local_details(path)
+        kwargs["mode"] = mode
+
+        # call target filesystems open
+        self._mkcache()
+        if self.compression:
+            with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
+                if isinstance(f, AbstractBufferedFile):
+                    # want no type of caching if just downloading whole thing
+                    f.cache = BaseCache(0, f.cache.fetcher, f.size)
+                comp = (
+                    infer_compression(path)
+                    if self.compression == "infer"
+                    else self.compression
+                )
+                f = compr[comp](f, mode="rb")
+                # copy in blocks until read() returns nothing
+                data = True
+                while data:
+                    block = getattr(f, "blocksize", 5 * 2**20)
+                    data = f.read(block)
+                    f2.write(data)
+        else:
+            self.fs.get_file(path, fn)
+        self.save_cache()
+        # the file is cached now, so this second call opens the local copy
+        return self._open(path, mode)
+
+
+class SimpleCacheFileSystem(WholeFileCacheFileSystem):
+    """Caches whole remote files on first access
+
+    This class is intended as a layer over any other file system, and
+    will make a local copy of each file accessed, so that all subsequent
+    reads are local. This implementation only copies whole files, and
+    does not keep any metadata about the download time or file details.
+    It is therefore safer to use in multi-threaded/concurrent situations.
+
+    This is the only one of the caching filesystems that supports write: you
+    will be given a real local open file, and upon close and commit, it will
+    be uploaded to the target filesystem; the writability of the target URL
+    is not checked until that time.
+
+    """
+
+    protocol = "simplecache"
+    local_file = True
+    transaction_type = WriteCachedTransaction
+
+    def __init__(self, **kwargs):
+        """Create the filesystem with all cache-validation options disabled.
+
+        ``cache_check``, ``expiry_time`` and ``check_files`` are forced to
+        False whatever the caller passed; every storage directory is created
+        if it does not exist yet.
+        """
+        kw = kwargs.copy()
+        for key in ["cache_check", "expiry_time", "check_files"]:
+            kw[key] = False
+        super().__init__(**kw)
+        for storage in self.storage:
+            if not os.path.exists(storage):
+                os.makedirs(storage, exist_ok=True)
+
+    def _check_file(self, path):
+        """Return the local filename caching ``path``, or None if not cached.
+
+        The storage directories are searched in order and the first existing
+        file wins.
+        """
+        self._check_cache()
+        sha = self._mapper(path)
+        for storage in self.storage:
+            fn = os.path.join(storage, sha)
+            if os.path.exists(fn):
+                return fn
+        return None
+
+    def _plan_fetch(self, paths):
+        """Map ``paths`` to local cache files and list what must be fetched.
+
+        Returns ``(lpaths, rpaths, targets)``. ``lpaths`` has one local
+        filename per entry of ``paths`` (same order, duplicates kept).
+        ``rpaths`` and ``targets`` are the distinct uncached remote paths and
+        the local filenames, in the last storage directory, that they should
+        be downloaded to.
+        """
+        lpaths, rpaths, targets = [], [], []
+        pending = {}
+        for p in paths:
+            fn = self._check_file(p)
+            if not fn:
+                if p not in pending:
+                    pending[p] = os.path.join(self.storage[-1], self._mapper(p))
+                    rpaths.append(p)
+                    targets.append(pending[p])
+                # a repeated uncached path reuses the planned target
+                fn = pending[p]
+            lpaths.append(fn)
+        return lpaths, rpaths, targets
+
+    def save_cache(self):
+        """No-op: this class keeps no metadata."""
+        pass
+
+    def load_cache(self):
+        """No-op: this class keeps no metadata."""
+        pass
+
+    def pipe_file(self, path, value=None, **kwargs):
+        """Write ``value`` to ``path``.
+
+        Inside a transaction the data goes through ``self.open`` so that it
+        is staged locally; otherwise the superclass implementation is used.
+        """
+        if self._intrans:
+            with self.open(path, "wb") as f:
+                f.write(value)
+        else:
+            super().pipe_file(path, value)
+
+    def ls(self, path, detail=True, **kwargs):
+        """List ``path`` on the target, plus files staged in a transaction.
+
+        A ``FileNotFoundError`` from the target is only re-raised when the
+        current transaction contributes no entries either.
+
+        Returns the list of detail dicts when ``detail`` is true, else the
+        sorted names.
+        """
+        path = self._strip_protocol(path)
+        details = []
+        try:
+            details = self.fs.ls(
+                path, detail=True, **kwargs
+            ).copy()  # don't edit original!
+        except FileNotFoundError as e:
+            ex = e
+        else:
+            ex = None
+        if self._intrans:
+            path1 = path.rstrip("/") + "/"
+            for f in self.transaction.files:
+                if f.path == path:
+                    details.append(
+                        {"name": path, "size": f.size or f.tell(), "type": "file"}
+                    )
+                elif f.path.startswith(path1):
+                    if f.path.count("/") == path1.count("/"):
+                        # staged file directly inside the listed directory
+                        details.append(
+                            {"name": f.path, "size": f.size or f.tell(), "type": "file"}
+                        )
+                    else:
+                        # staged file deeper down: report its first sub-directory
+                        dname = "/".join(f.path.split("/")[: path1.count("/") + 1])
+                        details.append({"name": dname, "size": 0, "type": "directory"})
+        if ex is not None and not details:
+            raise ex
+        if detail:
+            return details
+        return sorted(_["name"] for _ in details)
+
+    def info(self, path, **kwargs):
+        """Describe ``path``, preferring files staged in the transaction.
+
+        A staged file is reported with its current size; a path that is a
+        prefix of a staged file is reported as a directory; anything else is
+        delegated to the target filesystem.
+        """
+        path = self._strip_protocol(path)
+        if self._intrans:
+            f = [_ for _ in self.transaction.files if _.path == path]
+            if f:
+                size = os.path.getsize(f[0].fn) if f[0].closed else f[0].tell()
+                return {"name": path, "size": size, "type": "file"}
+            f = any(_.path.startswith(path + "/") for _ in self.transaction.files)
+            if f:
+                return {"name": path, "size": 0, "type": "directory"}
+        return self.fs.info(path, **kwargs)
+
+    def pipe(self, path, value=None, **kwargs):
+        """Write one value to ``path`` (str) or many from a ``{path: value}`` dict.
+
+        Raises
+        ------
+        ValueError
+            If ``path`` is neither a str nor a dict.
+        """
+        if isinstance(path, str):
+            self.pipe_file(self._strip_protocol(path), value, **kwargs)
+        elif isinstance(path, dict):
+            for k, v in path.items():
+                self.pipe_file(self._strip_protocol(k), v, **kwargs)
+        else:
+            raise ValueError("path must be str or dict")
+
+    async def _cat_file(self, path, start=None, end=None, **kwargs):
+        """Return bytes ``start:end`` of ``path``, downloading it if needed."""
+        logger.debug("async cat_file %s", path)
+        path = self._strip_protocol(path)
+        sha = self._mapper(path)
+        fn = self._check_file(path)
+
+        if not fn:
+            fn = os.path.join(self.storage[-1], sha)
+            await self.fs._get_file(path, fn, **kwargs)
+
+        with open(fn, "rb") as f:  # noqa ASYNC230
+            if start:
+                f.seek(start)
+            size = -1 if end is None else end - f.tell()
+            return f.read(size)
+
+    async def _cat_ranges(
+        self, paths, starts, ends, max_gap=None, on_error="return", **kwargs
+    ):
+        """Async version of ``cat_ranges``: fetch missing files, read locally.
+
+        Every entry of ``paths`` is mapped to a local filename, including
+        repeated uncached paths, which are downloaded only once.
+        """
+        logger.debug("async cat ranges %s", paths)
+        lpaths, rpaths, download = self._plan_fetch(paths)
+        if download:
+            await self.fs._get(rpaths, download, on_error=on_error)
+
+        return LocalFileSystem().cat_ranges(
+            lpaths, starts, ends, max_gap=max_gap, on_error=on_error, **kwargs
+        )
+
+    def cat_ranges(
+        self, paths, starts, ends, max_gap=None, on_error="return", **kwargs
+    ):
+        """Read byte ranges from the local copies, downloading misses first.
+
+        Uncached files are detected by a falsy ``_check_file`` result (this
+        class returns None for a miss) and fetched to the last storage
+        directory before the ranges are read with ``LocalFileSystem``.
+        """
+        logger.debug("cat ranges %s", paths)
+        lpaths, rpaths, download = self._plan_fetch(paths)
+        if rpaths:
+            self.fs.get(rpaths, download)
+        return LocalFileSystem().cat_ranges(
+            lpaths, starts, ends, max_gap=max_gap, on_error=on_error, **kwargs
+        )
+
+    def _open(self, path, mode="rb", **kwargs):
+        """Open ``path``: a staged temp file for writing, the cache for reading.
+
+        In non-read modes a ``LocalTempFile`` is returned whose autocommit is
+        off while inside a transaction. In read modes an existing cached copy
+        is opened directly; otherwise the file is fetched (decompressing if
+        ``self.compression`` is set) and this method calls itself again.
+        """
+        path = self._strip_protocol(path)
+        sha = self._mapper(path)
+
+        if "r" not in mode:
+            fn = os.path.join(self.storage[-1], sha)
+            user_specified_kwargs = {
+                k: v
+                for k, v in kwargs.items()
+                if k not in ["autocommit", "block_size", "cache_options"]
+            }  # those were added by open()
+            return LocalTempFile(
+                self,
+                path,
+                mode=mode,
+                autocommit=not self._intrans,
+                fn=fn,
+                **user_specified_kwargs,
+            )
+        fn = self._check_file(path)
+        if fn:
+            return open(fn, mode)
+
+        fn = os.path.join(self.storage[-1], sha)
+        logger.debug("Copying %s to local cache", path)
+        kwargs["mode"] = mode
+
+        self._mkcache()
+        # a new file is about to be added, so the known size is stale
+        self._cache_size = None
+        if self.compression:
+            with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
+                if isinstance(f, AbstractBufferedFile):
+                    # want no type of caching if just downloading whole thing
+                    f.cache = BaseCache(0, f.cache.fetcher, f.size)
+                comp = (
+                    infer_compression(path)
+                    if self.compression == "infer"
+                    else self.compression
+                )
+                f = compr[comp](f, mode="rb")
+                # copy in blocks until read() returns nothing
+                data = True
+                while data:
+                    block = getattr(f, "blocksize", 5 * 2**20)
+                    data = f.read(block)
+                    f2.write(data)
+        else:
+            self.fs.get_file(path, fn)
+        return self._open(path, mode)
+
+
+class LocalTempFile:
+    """A temporary local file, which will be uploaded on commit
+
+    Attributes that are not found on this object are looked up on the
+    underlying local file handle ``fh``.
+    """
+
+    def __init__(self, fs, path, fn, mode="wb", autocommit=True, seek=0, **kwargs):
+        """Open ``fn`` locally as the staging file for remote ``path``.
+
+        Parameters
+        ----------
+        fs:
+            Object whose ``put`` is called on commit.
+        path:
+            Remote destination path.
+        fn:
+            Local filename to open.
+        mode:
+            Mode passed to the builtin ``open``.
+        autocommit:
+            If true, ``close()`` also calls ``commit()``.
+        seek:
+            Non-zero offset to seek to right after opening.
+        **kwargs:
+            Stored and forwarded to ``fs.put`` on commit.
+        """
+        self.fn = fn
+        self.fh = open(fn, mode)
+        self.mode = mode
+        if seek:
+            self.fh.seek(seek)
+        self.path = path
+        self.size = None
+        self.fs = fs
+        self.closed = False
+        self.autocommit = autocommit
+        self.kwargs = kwargs
+
+    def __reduce__(self):
+        """Support pickling by reopening the file at the current position."""
+        # always open in r+b to allow continuing writing at a location
+        return (
+            LocalTempFile,
+            (self.fs, self.path, self.fn, "r+b", self.autocommit, self.tell()),
+        )
+
+    def __enter__(self):
+        """Return the raw local file handle for use in the ``with`` body."""
+        return self.fh
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+
+    def close(self):
+        """Close the local file once; commit afterwards if autocommit is on."""
+        if self.closed:
+            return
+        self.fh.close()
+        self.closed = True
+        if self.autocommit:
+            self.commit()
+
+    def discard(self):
+        """Close and delete the local file without uploading it."""
+        self.fh.close()
+        # mark as closed so that a later close() cannot trigger an upload
+        # of the file that is removed here
+        self.closed = True
+        os.remove(self.fn)
+
+    def commit(self):
+        """Upload the local file to ``path`` through ``fs.put``."""
+        # calling put() with list arguments avoids path expansion and additional operations
+        # like isdir()
+        self.fs.put([self.fn], [self.path], **self.kwargs)
+        # we do not delete the local copy, it's still in the cache.
+
+    @property
+    def name(self):
+        """Local filename of the staging file."""
+        return self.fn
+
+    def __repr__(self) -> str:
+        return f"LocalTempFile: {self.path}"
+
+    def __getattr__(self, item):
+        # Only reached when normal lookup fails. ``fh`` itself is missing on
+        # an instance whose __init__ did not run, and delegating to it would
+        # recurse without end.
+        if item == "fh":
+            raise AttributeError(item)
+        return getattr(self.fh, item)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/chained.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/chained.py
new file mode 100644
index 0000000000000000000000000000000000000000..bfce64334e8db0272eefa96b4428b23524b059f0
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/chained.py
@@ -0,0 +1,23 @@
+from typing import ClassVar
+
+from fsspec import AbstractFileSystem
+
+__all__ = ("ChainedFileSystem",)
+
+
+class ChainedFileSystem(AbstractFileSystem):
+    """Base class marking a filesystem that wraps another filesystem.
+
+    Filesystems built to sit on top of a second one (a caching layer, for
+    instance) derive from this class. It adds almost no behaviour of its
+    own; its purpose is to flag that the subclass is meant to be chained.
+
+    At present the marker is consulted only by `url_to_fs`, which hands the
+    path argument (`fo`) from the underlying filesystem to the chained one.
+
+    More functionality may be attached to this class later.
+    """
+
+    protocol: ClassVar[str] = "chained"
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/dask.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/dask.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e1276463db6866665e6a0fe114efc247971b57e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/dask.py
@@ -0,0 +1,152 @@
+import dask
+from distributed.client import Client, _get_global_client
+from distributed.worker import Worker
+
+from fsspec import filesystem
+from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
+from fsspec.utils import infer_storage_options
+
+
+def _get_client(client):
+    """Turn ``client`` into a ``Client`` instance.
+
+    A ``Client`` is returned unchanged, ``None`` selects the global client,
+    and any other value (e.g. a connection string) is passed to ``Client``.
+    """
+    if isinstance(client, Client):
+        return client
+    if client is None:
+        return _get_global_client()
+    return Client(client)
+
+
+def _in_worker():
+    """Report whether ``Worker._instances`` is non-empty."""
+    return True if Worker._instances else False
+
+
+class DaskWorkerFileSystem(AbstractFileSystem):
+    """Expose the files a worker can reach as a remote file-system
+
+    On a worker, calls go straight to the real filesystem. On the client,
+    each call is sent to a worker, which supplies the information or data.
+
+    **Warning** this implementation is experimental, and read-only for now.
+    """
+
+    def __init__(
+        self, target_protocol=None, target_options=None, fs=None, client=None, **kwargs
+    ):
+        """Store the target description and work out where we are running.
+
+        Exactly one of ``fs`` and ``target_protocol`` must be given,
+        otherwise ``ValueError`` is raised.
+        """
+        super().__init__(**kwargs)
+        # both given, or both missing, is an error
+        if (fs is None) == (target_protocol is None):
+            raise ValueError(
+                "Please provide one of filesystem instance (fs) or"
+                " target_protocol, not both"
+            )
+        self.fs = fs
+        self.client = client
+        self.worker = None
+        self.target_options = target_options
+        self.target_protocol = target_protocol
+        self._determine_worker()
+
+    @staticmethod
+    def _get_kwargs_from_urls(path):
+        """Build the ``client`` keyword from a URL carrying host and port."""
+        so = infer_storage_options(path)
+        if "host" not in so or "port" not in so:
+            return {}
+        return {"client": f"{so['host']}:{so['port']}"}
+
+    def _determine_worker(self):
+        """Set ``self.worker`` and prepare the local or the remote route."""
+        if not _in_worker():
+            # client side: calls are routed through a delayed copy of self
+            self.worker = False
+            self.client = _get_client(self.client)
+            self.rfs = dask.delayed(self)
+            return
+        self.worker = True
+        if self.fs is None:
+            options = self.target_options or {}
+            self.fs = filesystem(self.target_protocol, **options)
+
+    def mkdir(self, *args, **kwargs):
+        """Run ``mkdir`` on the target, locally or through the worker."""
+        if not self.worker:
+            self.rfs.mkdir(*args, **kwargs).compute()
+            return
+        self.fs.mkdir(*args, **kwargs)
+
+    def rm(self, *args, **kwargs):
+        """Run ``rm`` on the target, locally or through the worker."""
+        if not self.worker:
+            self.rfs.rm(*args, **kwargs).compute()
+            return
+        self.fs.rm(*args, **kwargs)
+
+    def copy(self, *args, **kwargs):
+        """Run ``copy`` on the target, locally or through the worker."""
+        if not self.worker:
+            self.rfs.copy(*args, **kwargs).compute()
+            return
+        self.fs.copy(*args, **kwargs)
+
+    def mv(self, *args, **kwargs):
+        """Run ``mv`` on the target, locally or through the worker."""
+        if not self.worker:
+            self.rfs.mv(*args, **kwargs).compute()
+            return
+        self.fs.mv(*args, **kwargs)
+
+    def ls(self, *args, **kwargs):
+        """Return the ``ls`` result of the target, locally or via the worker."""
+        if not self.worker:
+            return self.rfs.ls(*args, **kwargs).compute()
+        return self.fs.ls(*args, **kwargs)
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=True,
+        cache_options=None,
+        **kwargs,
+    ):
+        """Open via the target on a worker, or return a ``DaskFile`` otherwise."""
+        if not self.worker:
+            return DaskFile(
+                fs=self,
+                path=path,
+                mode=mode,
+                block_size=block_size,
+                autocommit=autocommit,
+                cache_options=cache_options,
+                **kwargs,
+            )
+        return self.fs._open(
+            path,
+            mode=mode,
+            block_size=block_size,
+            autocommit=autocommit,
+            cache_options=cache_options,
+            **kwargs,
+        )
+
+    def fetch_range(self, path, mode, start, end):
+        """Return bytes ``start:end`` of ``path``, read here or on the worker."""
+        if not self.worker:
+            return self.rfs.fetch_range(path, mode, start, end).compute()
+        with self._open(path, mode) as f:
+            f.seek(start)
+            length = end - start
+            return f.read(length)
+
+
+class DaskFile(AbstractBufferedFile):
+    def __init__(self, mode="rb", **kwargs):
+        """Accept only ``mode="rb"``; other keywords go to the base class."""
+        if mode == "rb":
+            super().__init__(**kwargs)
+        else:
+            raise ValueError('Remote dask files can only be opened in "rb" mode')
+
+    def _upload_chunk(self, final=False):
+        """Do nothing."""
+
+    def _initiate_upload(self):
+        """Create remote file/upload (does nothing here)."""
+
+    def _fetch_range(self, start, end):
+        """Ask the filesystem for bytes ``start:end`` of this file."""
+        fetched = self.fs.fetch_range(self.path, self.mode, start, end)
+        return fetched
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/data.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/data.py
new file mode 100644
index 0000000000000000000000000000000000000000..f11542b48c98fd53fc367ade7425a00b38487619
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/data.py
@@ -0,0 +1,57 @@
+import base64
+import io
+from urllib.parse import unquote
+
+from fsspec import AbstractFileSystem
+
+
+class DataFileSystem(AbstractFileSystem):
+    """Read-only decoder for data-URLs
+
+    Example
+    -------
+    >>> with fsspec.open("data:,Hello%2C%20World%21") as f:
+    ...     print(f.read())
+    b"Hello, World!"
+
+    See https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs
+    """
+
+    protocol = "data"
+
+    def __init__(self, **kwargs):
+        """This filesystem takes no parameters of its own"""
+        super().__init__(**kwargs)
+
+    def cat_file(self, path, start=None, end=None, **kwargs):
+        """Decode the payload after the first comma and slice ``start:end``.
+
+        The payload is base64-decoded when the part before the comma ends
+        with "base64", otherwise it is percent-decoded and encoded to bytes.
+        """
+        header, payload = path.split(",", 1)
+        if header.endswith("base64"):
+            decoded = base64.b64decode(payload)
+        else:
+            decoded = unquote(payload).encode()
+        return decoded[start:end]
+
+    def info(self, path, **kwargs):
+        """Return name, decoded size, type and mimetype for the URL."""
+        header, name = path.split(",", 1)
+        size = len(self.cat_file(path))
+        # mimetype sits between the first ":" and the first ";" after it
+        after_scheme = header.split(":", 1)[1]
+        mime = after_scheme.split(";", 1)[0]
+        return {"name": name, "size": size, "type": "file", "mimetype": mime}
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=True,
+        cache_options=None,
+        **kwargs,
+    ):
+        """Return the decoded data as an in-memory stream (read modes only)."""
+        if "r" in mode:
+            return io.BytesIO(self.cat_file(path))
+        raise ValueError("Read only filesystem")
+
+    @staticmethod
+    def encode(data: bytes, mime: str | None = None):
+        """Build a data-URL from the given bytes
+
+        The payload is always base64 encoded, even if it is ascii/url-safe.
+        """
+        payload = base64.b64encode(data).decode()
+        return f"data:{mime or ''};base64,{payload}"
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/dbfs.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/dbfs.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a7fc93d7389c894ecb5fc6267ce20abe4087068
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/dbfs.py
@@ -0,0 +1,496 @@
+from __future__ import annotations
+
+import base64
+import urllib
+
+import requests
+from requests.adapters import HTTPAdapter, Retry
+from typing_extensions import override
+
+from fsspec import AbstractFileSystem
+from fsspec.spec import AbstractBufferedFile
+
+
+class DatabricksException(Exception):
+    """
+    Exception type used for errors raised in this module.
+    """
+
+    def __init__(self, error_code, message, details=None):
+        """Build the exception from an error code, a message and details"""
+        # only the message becomes part of ``args``
+        super().__init__(message)
+
+        self.details = details
+        self.message = message
+        self.error_code = error_code
+
+
+class DatabricksFileSystem(AbstractFileSystem):
+    """
+    Get access to the Databricks filesystem implementation over HTTP.
+    Can be used inside and outside of a databricks cluster.
+    """
+
+    def __init__(self, instance, token, **kwargs):
+        """
+        Set up a DatabricksFileSystem and its HTTP session.
+
+        Parameters
+        ----------
+        instance: str
+            The instance URL of the databricks cluster.
+            For example for an Azure databricks cluster, this
+            has the form adb-<some-number>.<two digits>.azuredatabricks.net.
+        token: str
+            Your personal token. Find out more
+            here: https://docs.databricks.com/dev-tools/api/latest/authentication.html
+        """
+        self.instance = instance
+        self.token = token
+
+        retry_policy = Retry(
+            total=10,
+            backoff_factor=0.05,
+            status_forcelist=[408, 429, 500, 502, 503, 504],
+        )
+        session = requests.Session()
+        # the retry policy is attached to https URLs only
+        session.mount("https://", HTTPAdapter(max_retries=retry_policy))
+        session.headers.update({"Authorization": f"Bearer {self.token}"})
+        self.retries = retry_policy
+        self.session = session
+
+        super().__init__(**kwargs)
+
+    @override
+    def _ls_from_cache(self, path) -> list[dict[str, str | int]] | None:
+        """Look ``path`` up in the cached listing of its parent
+
+        Any cached listing of ``path`` itself is dropped first. Returns a
+        one-entry list for a non-directory found in the parent's listing, an
+        empty list for a directory found there, and None when the parent is
+        not cached. Raises FileNotFoundError when the parent is cached but
+        does not contain ``path``.
+        """
+        target = path.rstrip("/")
+        self.dircache.pop(target, None)
+
+        parent = self._parent(path)
+        if parent not in self.dircache:
+            return None
+        for entry in self.dircache[parent]:
+            if entry["name"] != target:
+                continue
+            return [] if entry["type"] == "directory" else [entry]
+        raise FileNotFoundError(path)
+
+    def ls(self, path, detail=True, **kwargs):
+        """
+        Return the listing of the given path.
+
+        Parameters
+        ----------
+        path: str
+            Absolute path
+        detail: bool
+            If true, return one dict per entry with name, type and
+            size; otherwise return only the names.
+        """
+        try:
+            listing = self._ls_from_cache(path)
+        except FileNotFoundError:
+            # The parent of `path` is cached but does not list `path`, so
+            # `path` is probably newer than the cached parent listing. Drop
+            # the parent's cache entry and fall through to the API.
+            self.dircache.pop(self._parent(path), None)
+            listing = None
+
+        if not listing:
+            try:
+                response = self._send_to_api(
+                    method="get", endpoint="list", json={"path": path}
+                )
+            except DatabricksException as e:
+                if e.error_code != "RESOURCE_DOES_NOT_EXIST":
+                    raise
+                raise FileNotFoundError(e.message) from e
+            listing = []
+            for item in response.get("files", []):
+                listing.append(
+                    {
+                        "name": item["path"],
+                        "type": "directory" if item["is_dir"] else "file",
+                        "size": item["file_size"],
+                    }
+                )
+            self.dircache[path] = listing
+
+        if not detail:
+            return [entry["name"] for entry in listing]
+        return listing
+
+    def makedirs(self, path, exist_ok=True):
+        """
+        Create a given absolute path and all of its parents.
+
+        Parameters
+        ----------
+        path: str
+            Absolute path to create
+        exist_ok: bool
+            If false, checks if the folder
+            exists before creating it (and raises an
+            Exception if this is the case)
+
+        Raises
+        ------
+        FileExistsError
+            If ``exist_ok`` is false and the path is already present, or if
+            the API reports RESOURCE_ALREADY_EXISTS on creation.
+        DatabricksException
+            For any other API error, including a failing existence check.
+        """
+        if not exist_ok:
+            try:
+                # If the following succeeds, the path is already present
+                self._send_to_api(
+                    method="get", endpoint="get-status", json={"path": path}
+                )
+            except DatabricksException as e:
+                # only "does not exist" means it is safe to go on; any other
+                # failure leaves the existence question unanswered
+                if e.error_code != "RESOURCE_DOES_NOT_EXIST":
+                    raise
+            else:
+                raise FileExistsError(f"Path {path} already exists")
+
+        try:
+            self._send_to_api(method="post", endpoint="mkdirs", json={"path": path})
+        except DatabricksException as e:
+            if e.error_code == "RESOURCE_ALREADY_EXISTS":
+                raise FileExistsError(e.message) from e
+
+            raise
+        self.invalidate_cache(self._parent(path))
+
+    def mkdir(self, path, create_parents=True, **kwargs):
+        """
+        Create the given absolute path together with all of its parents.
+
+        Parameters
+        ----------
+        path: str
+            Absolute path to create
+        create_parents: bool
+            Whether to create all parents or not.
+            "False" is not implemented so far.
+        """
+        if create_parents:
+            self.mkdirs(path, **kwargs)
+            return
+        raise NotImplementedError
+
+    def rm(self, path, recursive=False, **kwargs):
+        """
+        Remove the file or folder at the given absolute path.
+
+        Parameters
+        ----------
+        path: str
+            Absolute path what to remove
+        recursive: bool
+            Recursively delete all files in a folder.
+
+        Raises
+        ------
+        OSError
+            If the API reports IO_ERROR.
+        DatabricksException
+            For any other API error except PARTIAL_DELETE, which is retried.
+        """
+        while True:
+            try:
+                self._send_to_api(
+                    method="post",
+                    endpoint="delete",
+                    json={"path": path, "recursive": recursive},
+                )
+            except DatabricksException as e:
+                # This is not really an exception, it just means
+                # not everything was deleted so far: ask again
+                if e.error_code == "PARTIAL_DELETE":
+                    continue
+                if e.error_code == "IO_ERROR":
+                    # Using the same exception as the os module would use here
+                    raise OSError(e.message) from e
+                raise
+            # the delete request went through without error
+            break
+        self.invalidate_cache(self._parent(path))
+
+    def mv(
+        self, source_path, destination_path, recursive=False, maxdepth=None, **kwargs
+    ):
+        """
+        Move a source path to a destination path.
+
+        A note from the original [databricks API manual]
+        (https://docs.databricks.com/dev-tools/api/latest/dbfs.html#move).
+
+        When moving a large number of files the API call will time out after
+        approximately 60s, potentially resulting in partially moved data.
+        Therefore, for operations that move more than 10k files, we strongly
+        discourage using the DBFS REST API.
+
+        Parameters
+        ----------
+        source_path: str
+            From where to move (absolute path)
+        destination_path: str
+            To where to move (absolute path)
+        recursive: bool
+            Not implemented so far.
+        maxdepth:
+            Not implemented so far.
+        """
+        if recursive or maxdepth:
+            raise NotImplementedError
+
+        payload = {"source_path": source_path, "destination_path": destination_path}
+        try:
+            self._send_to_api(method="post", endpoint="move", json=payload)
+        except DatabricksException as e:
+            # translate the two known API codes into builtin exceptions
+            translated = {
+                "RESOURCE_DOES_NOT_EXIST": FileNotFoundError,
+                "RESOURCE_ALREADY_EXISTS": FileExistsError,
+            }.get(e.error_code)
+            if translated is None:
+                raise
+            raise translated(e.message) from e
+        for moved in (source_path, destination_path):
+            self.invalidate_cache(self._parent(moved))
+
+    def _open(self, path, mode="rb", block_size="default", **kwargs):
+        """
+        Overwrite the base class method to make sure to create a DBFile.
+        All arguments are copied from the base method.
+
+        Only the default blocksize is allowed.
+        """
+        return DatabricksFile(self, path, mode=mode, block_size=block_size, **kwargs)
+
+    def _send_to_api(self, method, endpoint, json):
+        """
+        Send the given json to the DBFS API
+        using a get or post request (specified by the argument `method`).
+
+        Parameters
+        ----------
+        method: str
+            Which http method to use for communication; "get" or "post".
+        endpoint: str
+            Where to send the request to (last part of the API URL)
+        json: dict
+            Dictionary of information to send
+        """
+        if method == "post":
+            session_call = self.session.post
+        elif method == "get":
+            session_call = self.session.get
+        else:
+            raise ValueError(f"Do not understand method {method}")
+
+        url = urllib.parse.urljoin(f"https://{self.instance}/api/2.0/dbfs/", endpoint)
+
+        r = session_call(url, json=json)
+
+        # The DBFS API will return a json, also in case of an exception.
+        # We want to preserve this information as good as possible.
+        try:
+            r.raise_for_status()
+        except requests.HTTPError as e:
+            # try to extract json error message
+            # if that fails, fall back to the original exception
+            try:
+                exception_json = e.response.json()
+            except Exception:
+                raise e from None
+
+            raise DatabricksException(**exception_json) from e
+
+        return r.json()
+
+    def _create_handle(self, path, overwrite=True):
+        """
+        Internal function to create a handle, which can be used to
+        write blocks of a file to DBFS.
+        A handle has a unique identifier which needs to be passed
+        whenever written during this transaction.
+        The handle is active for 10 minutes - after that a new
+        write transaction needs to be created.
+        Make sure to close the handle after you are finished.
+
+        Parameters
+        ----------
+        path: str
+            Absolute path for this file.
+        overwrite: bool
+            If a file already exist at this location, either overwrite
+            it or raise an exception.
+        """
+        try:
+            r = self._send_to_api(
+                method="post",
+                endpoint="create",
+                json={"path": path, "overwrite": overwrite},
+            )
+            return r["handle"]
+        except DatabricksException as e:
+            if e.error_code == "RESOURCE_ALREADY_EXISTS":
+                raise FileExistsError(e.message) from e
+
+            raise
+
+    def _close_handle(self, handle):
+        """
+        Close a handle, which was opened by :func:`_create_handle`.
+
+        Parameters
+        ----------
+        handle: str
+            Which handle to close.
+        """
+        try:
+            self._send_to_api(method="post", endpoint="close", json={"handle": handle})
+        except DatabricksException as e:
+            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
+                raise FileNotFoundError(e.message) from e
+
+            raise
+
+    def _add_data(self, handle, data):
+        """
+        Upload data to an already opened file handle
+        (opened by :func:`_create_handle`).
+        The maximal allowed data size is 1MB after
+        conversion to base64.
+        Remember to close the handle when you are finished.
+
+        Parameters
+        ----------
+        handle: str
+            Which handle to upload data to.
+        data: bytes
+            Block of data to add to the handle.
+        """
+        data = base64.b64encode(data).decode()
+        try:
+            self._send_to_api(
+                method="post",
+                endpoint="add-block",
+                json={"handle": handle, "data": data},
+            )
+        except DatabricksException as e:
+            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
+                raise FileNotFoundError(e.message) from e
+            elif e.error_code == "MAX_BLOCK_SIZE_EXCEEDED":
+                raise ValueError(e.message) from e
+
+            raise
+
+    def _get_data(self, path, start, end):
+        """
+        Download data in bytes from a given absolute path in a block
+        from [start, start+length].
+        The maximum number of allowed bytes to read is 1MB.
+
+        Parameters
+        ----------
+        path: str
+            Absolute path to download data from
+        start: int
+            Start position of the block
+        end: int
+            End position of the block
+        """
+        try:
+            r = self._send_to_api(
+                method="get",
+                endpoint="read",
+                json={"path": path, "offset": start, "length": end - start},
+            )
+            return base64.b64decode(r["data"])
+        except DatabricksException as e:
+            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
+                raise FileNotFoundError(e.message) from e
+            elif e.error_code in ["INVALID_PARAMETER_VALUE", "MAX_READ_SIZE_EXCEEDED"]:
+                raise ValueError(e.message) from e
+
+            raise
+
+    def invalidate_cache(self, path=None):
+        if path is None:
+            self.dircache.clear()
+        else:
+            self.dircache.pop(path, None)
+        super().invalidate_cache(path)
+
+
+class DatabricksFile(AbstractBufferedFile):
+    """
+    Helper class for files referenced in the DatabricksFileSystem.
+    """
+
+    DEFAULT_BLOCK_SIZE = 1 * 2**20  # only allowed block size
+
+    def __init__(
+        self,
+        fs,
+        path,
+        mode="rb",
+        block_size="default",
+        autocommit=True,
+        cache_type="readahead",
+        cache_options=None,
+        **kwargs,
+    ):
+        """
+        Create a new instance of the DatabricksFile.
+
+        The blocksize needs to be the default one.
+        """
+        if block_size is None or block_size == "default":
+            block_size = self.DEFAULT_BLOCK_SIZE
+
+        assert block_size == self.DEFAULT_BLOCK_SIZE, (
+            f"Only the default block size is allowed, not {block_size}"
+        )
+
+        super().__init__(
+            fs,
+            path,
+            mode=mode,
+            block_size=block_size,
+            autocommit=autocommit,
+            cache_type=cache_type,
+            cache_options=cache_options or {},
+            **kwargs,
+        )
+
+    def _initiate_upload(self):
+        """Internal function to start a file upload"""
+        self.handle = self.fs._create_handle(self.path)
+
+    def _upload_chunk(self, final=False):
+        """Internal function to add a chunk of data to a started upload"""
+        self.buffer.seek(0)
+        data = self.buffer.getvalue()
+
+        data_chunks = [
+            data[start:end] for start, end in self._to_sized_blocks(len(data))
+        ]
+
+        for data_chunk in data_chunks:
+            self.fs._add_data(handle=self.handle, data=data_chunk)
+
+        if final:
+            self.fs._close_handle(handle=self.handle)
+            return True
+
+    def _fetch_range(self, start, end):
+        """Internal function to download a block of data"""
+        return_buffer = b""
+        length = end - start
+        for chunk_start, chunk_end in self._to_sized_blocks(length, start):
+            return_buffer += self.fs._get_data(
+                path=self.path, start=chunk_start, end=chunk_end
+            )
+
+        return return_buffer
+
+    def _to_sized_blocks(self, length, start=0):
+        """Helper function to split a range from 0 to total_length into blocksizes"""
+        end = start + length
+        for data_chunk in range(start, end, self.blocksize):
+            data_start = data_chunk
+            data_end = min(end, data_chunk + self.blocksize)
+            yield data_start, data_end
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/dirfs.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/dirfs.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f3dd3cf4c2f421292ba5d9fab8b733a60550496
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/dirfs.py
@@ -0,0 +1,389 @@
+from .. import filesystem
+from ..asyn import AsyncFileSystem
+from .chained import ChainedFileSystem
+
+
+class DirFileSystem(AsyncFileSystem, ChainedFileSystem):
+    """Directory prefix filesystem
+
+    The DirFileSystem is a filesystem-wrapper. It assumes every path it is dealing with
+    is relative to the `path`. After performing the necessary paths operation it
+    delegates everything to the wrapped filesystem.
+    """
+
+    protocol = "dir"
+
+    def __init__(
+        self,
+        path=None,
+        fs=None,
+        fo=None,
+        target_protocol=None,
+        target_options=None,
+        **storage_options,
+    ):
+        """
+        Parameters
+        ----------
+        path: str
+            Path to the directory.
+        fs: AbstractFileSystem
+            An instantiated filesystem to wrap.
+        target_protocol, target_options:
+            if fs is none, construct it from these
+        fo: str
+            Alternate for path; do not provide both
+        """
+        super().__init__(**storage_options)
+        if fs is None:
+            fs = filesystem(protocol=target_protocol, **(target_options or {}))
+        path = path or fo
+
+        if self.asynchronous and not fs.async_impl:
+            raise ValueError("can't use asynchronous with non-async fs")
+
+        if fs.async_impl and self.asynchronous != fs.asynchronous:
+            raise ValueError("both dirfs and fs should be in the same sync/async mode")
+
+        self.path = fs._strip_protocol(path)
+        self.fs = fs
+
+    def _join(self, path):
+        if isinstance(path, str):
+            if not self.path:
+                return path
+            if not path:
+                return self.path
+            return self.fs.sep.join((self.path, self._strip_protocol(path)))
+        if isinstance(path, dict):
+            return {self._join(_path): value for _path, value in path.items()}
+        return [self._join(_path) for _path in path]
+
+    def _relpath(self, path):
+        if isinstance(path, str):
+            if not self.path:
+                return path
+            # We need to account for S3FileSystem returning paths that do not
+            # start with a '/'
+            if path == self.path or (
+                self.path.startswith(self.fs.sep) and path == self.path[1:]
+            ):
+                return ""
+            prefix = self.path + self.fs.sep
+            if self.path.startswith(self.fs.sep) and not path.startswith(self.fs.sep):
+                prefix = prefix[1:]
+            assert path.startswith(prefix)
+            return path[len(prefix) :]
+        return [self._relpath(_path) for _path in path]
+
+    # Wrappers below
+
+    @property
+    def sep(self):
+        return self.fs.sep
+
+    async def set_session(self, *args, **kwargs):
+        return await self.fs.set_session(*args, **kwargs)
+
+    async def _rm_file(self, path, **kwargs):
+        return await self.fs._rm_file(self._join(path), **kwargs)
+
+    def rm_file(self, path, **kwargs):
+        return self.fs.rm_file(self._join(path), **kwargs)
+
+    async def _rm(self, path, *args, **kwargs):
+        return await self.fs._rm(self._join(path), *args, **kwargs)
+
+    def rm(self, path, *args, **kwargs):
+        return self.fs.rm(self._join(path), *args, **kwargs)
+
+    async def _cp_file(self, path1, path2, **kwargs):
+        return await self.fs._cp_file(self._join(path1), self._join(path2), **kwargs)
+
+    def cp_file(self, path1, path2, **kwargs):
+        return self.fs.cp_file(self._join(path1), self._join(path2), **kwargs)
+
+    async def _copy(
+        self,
+        path1,
+        path2,
+        *args,
+        **kwargs,
+    ):
+        return await self.fs._copy(
+            self._join(path1),
+            self._join(path2),
+            *args,
+            **kwargs,
+        )
+
+    def copy(self, path1, path2, *args, **kwargs):
+        return self.fs.copy(
+            self._join(path1),
+            self._join(path2),
+            *args,
+            **kwargs,
+        )
+
+    async def _pipe(self, path, *args, **kwargs):
+        return await self.fs._pipe(self._join(path), *args, **kwargs)
+
+    def pipe(self, path, *args, **kwargs):
+        return self.fs.pipe(self._join(path), *args, **kwargs)
+
+    async def _pipe_file(self, path, *args, **kwargs):
+        return await self.fs._pipe_file(self._join(path), *args, **kwargs)
+
+    def pipe_file(self, path, *args, **kwargs):
+        return self.fs.pipe_file(self._join(path), *args, **kwargs)
+
+    async def _cat_file(self, path, *args, **kwargs):
+        return await self.fs._cat_file(self._join(path), *args, **kwargs)
+
+    def cat_file(self, path, *args, **kwargs):
+        return self.fs.cat_file(self._join(path), *args, **kwargs)
+
+    async def _cat(self, path, *args, **kwargs):
+        ret = await self.fs._cat(
+            self._join(path),
+            *args,
+            **kwargs,
+        )
+
+        if isinstance(ret, dict):
+            return {self._relpath(key): value for key, value in ret.items()}
+
+        return ret
+
+    def cat(self, path, *args, **kwargs):
+        ret = self.fs.cat(
+            self._join(path),
+            *args,
+            **kwargs,
+        )
+
+        if isinstance(ret, dict):
+            return {self._relpath(key): value for key, value in ret.items()}
+
+        return ret
+
+    async def _put_file(self, lpath, rpath, **kwargs):
+        return await self.fs._put_file(lpath, self._join(rpath), **kwargs)
+
+    def put_file(self, lpath, rpath, **kwargs):
+        return self.fs.put_file(lpath, self._join(rpath), **kwargs)
+
+    async def _put(
+        self,
+        lpath,
+        rpath,
+        *args,
+        **kwargs,
+    ):
+        return await self.fs._put(
+            lpath,
+            self._join(rpath),
+            *args,
+            **kwargs,
+        )
+
+    def put(self, lpath, rpath, *args, **kwargs):
+        return self.fs.put(
+            lpath,
+            self._join(rpath),
+            *args,
+            **kwargs,
+        )
+
+    async def _get_file(self, rpath, lpath, **kwargs):
+        return await self.fs._get_file(self._join(rpath), lpath, **kwargs)
+
+    def get_file(self, rpath, lpath, **kwargs):
+        return self.fs.get_file(self._join(rpath), lpath, **kwargs)
+
+    async def _get(self, rpath, *args, **kwargs):
+        return await self.fs._get(self._join(rpath), *args, **kwargs)
+
+    def get(self, rpath, *args, **kwargs):
+        return self.fs.get(self._join(rpath), *args, **kwargs)
+
+    async def _isfile(self, path):
+        return await self.fs._isfile(self._join(path))
+
+    def isfile(self, path):
+        return self.fs.isfile(self._join(path))
+
+    async def _isdir(self, path):
+        return await self.fs._isdir(self._join(path))
+
+    def isdir(self, path):
+        return self.fs.isdir(self._join(path))
+
+    async def _size(self, path):
+        return await self.fs._size(self._join(path))
+
+    def size(self, path):
+        return self.fs.size(self._join(path))
+
+    async def _exists(self, path):
+        return await self.fs._exists(self._join(path))
+
+    def exists(self, path):
+        return self.fs.exists(self._join(path))
+
+    async def _info(self, path, **kwargs):
+        info = await self.fs._info(self._join(path), **kwargs)
+        info = info.copy()
+        info["name"] = self._relpath(info["name"])
+        return info
+
+    def info(self, path, **kwargs):
+        info = self.fs.info(self._join(path), **kwargs)
+        info = info.copy()
+        info["name"] = self._relpath(info["name"])
+        return info
+
+    async def _ls(self, path, detail=True, **kwargs):
+        ret = (await self.fs._ls(self._join(path), detail=detail, **kwargs)).copy()
+        if detail:
+            out = []
+            for entry in ret:
+                entry = entry.copy()
+                entry["name"] = self._relpath(entry["name"])
+                out.append(entry)
+            return out
+
+        return self._relpath(ret)
+
+    def ls(self, path, detail=True, **kwargs):
+        ret = self.fs.ls(self._join(path), detail=detail, **kwargs).copy()
+        if detail:
+            out = []
+            for entry in ret:
+                entry = entry.copy()
+                entry["name"] = self._relpath(entry["name"])
+                out.append(entry)
+            return out
+
+        return self._relpath(ret)
+
+    async def _walk(self, path, *args, **kwargs):
+        async for root, dirs, files in self.fs._walk(self._join(path), *args, **kwargs):
+            yield self._relpath(root), dirs, files
+
+    def walk(self, path, *args, **kwargs):
+        for root, dirs, files in self.fs.walk(self._join(path), *args, **kwargs):
+            yield self._relpath(root), dirs, files
+
+    async def _glob(self, path, **kwargs):
+        detail = kwargs.get("detail", False)
+        ret = await self.fs._glob(self._join(path), **kwargs)
+        if detail:
+            return {self._relpath(path): info for path, info in ret.items()}
+        return self._relpath(ret)
+
+    def glob(self, path, **kwargs):
+        detail = kwargs.get("detail", False)
+        ret = self.fs.glob(self._join(path), **kwargs)
+        if detail:
+            return {self._relpath(path): info for path, info in ret.items()}
+        return self._relpath(ret)
+
+    async def _du(self, path, *args, **kwargs):
+        total = kwargs.get("total", True)
+        ret = await self.fs._du(self._join(path), *args, **kwargs)
+        if total:
+            return ret
+
+        return {self._relpath(path): size for path, size in ret.items()}
+
+    def du(self, path, *args, **kwargs):
+        total = kwargs.get("total", True)
+        ret = self.fs.du(self._join(path), *args, **kwargs)
+        if total:
+            return ret
+
+        return {self._relpath(path): size for path, size in ret.items()}
+
+    async def _find(self, path, *args, **kwargs):
+        detail = kwargs.get("detail", False)
+        ret = await self.fs._find(self._join(path), *args, **kwargs)
+        if detail:
+            return {self._relpath(path): info for path, info in ret.items()}
+        return self._relpath(ret)
+
+    def find(self, path, *args, **kwargs):
+        detail = kwargs.get("detail", False)
+        ret = self.fs.find(self._join(path), *args, **kwargs)
+        if detail:
+            return {self._relpath(path): info for path, info in ret.items()}
+        return self._relpath(ret)
+
+    async def _expand_path(self, path, *args, **kwargs):
+        return self._relpath(
+            await self.fs._expand_path(self._join(path), *args, **kwargs)
+        )
+
+    def expand_path(self, path, *args, **kwargs):
+        return self._relpath(self.fs.expand_path(self._join(path), *args, **kwargs))
+
+    async def _mkdir(self, path, *args, **kwargs):
+        return await self.fs._mkdir(self._join(path), *args, **kwargs)
+
+    def mkdir(self, path, *args, **kwargs):
+        return self.fs.mkdir(self._join(path), *args, **kwargs)
+
+    async def _makedirs(self, path, *args, **kwargs):
+        return await self.fs._makedirs(self._join(path), *args, **kwargs)
+
+    def makedirs(self, path, *args, **kwargs):
+        return self.fs.makedirs(self._join(path), *args, **kwargs)
+
+    def rmdir(self, path):
+        return self.fs.rmdir(self._join(path))
+
+    def mv(self, path1, path2, **kwargs):
+        return self.fs.mv(
+            self._join(path1),
+            self._join(path2),
+            **kwargs,
+        )
+
+    def touch(self, path, **kwargs):
+        return self.fs.touch(self._join(path), **kwargs)
+
+    def created(self, path):
+        return self.fs.created(self._join(path))
+
+    def modified(self, path):
+        return self.fs.modified(self._join(path))
+
+    def sign(self, path, *args, **kwargs):
+        return self.fs.sign(self._join(path), *args, **kwargs)
+
+    def __repr__(self):
+        return f"{self.__class__.__qualname__}(path='{self.path}', fs={self.fs})"
+
+    def open(
+        self,
+        path,
+        *args,
+        **kwargs,
+    ):
+        return self.fs.open(
+            self._join(path),
+            *args,
+            **kwargs,
+        )
+
+    async def open_async(
+        self,
+        path,
+        *args,
+        **kwargs,
+    ):
+        return await self.fs.open_async(
+            self._join(path),
+            *args,
+            **kwargs,
+        )
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/ftp.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/ftp.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3db22b04a00ddf12582253ec19b2938c794c1da
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/ftp.py
@@ -0,0 +1,387 @@
+import os
+import uuid
+from ftplib import FTP, FTP_TLS, Error, error_perm
+from typing import Any
+
+from ..spec import AbstractBufferedFile, AbstractFileSystem
+from ..utils import infer_storage_options, isfilelike
+
+
+class FTPFileSystem(AbstractFileSystem):
+    """A filesystem over classic FTP"""
+
+    root_marker = "/"
+    cachable = False
+    protocol = "ftp"
+
+    def __init__(
+        self,
+        host,
+        port=21,
+        username=None,
+        password=None,
+        acct=None,
+        block_size=None,
+        tempdir=None,
+        timeout=30,
+        encoding="utf-8",
+        tls=False,
+        **kwargs,
+    ):
+        """
+        You can use _get_kwargs_from_urls to get some kwargs from
+        a reasonable FTP url.
+
+        Authentication will be anonymous if username/password are not
+        given.
+
+        Parameters
+        ----------
+        host: str
+            The remote server name/ip to connect to
+        port: int
+            Port to connect with
+        username: str or None
+            If authenticating, the user's identifier
+        password: str of None
+            User's password on the server, if using
+        acct: str or None
+            Some servers also need an "account" string for auth
+        block_size: int or None
+            If given, the read-ahead or write buffer size.
+        tempdir: str
+            Directory on remote to put temporary files when in a transaction
+        timeout: int
+            Timeout of the ftp connection in seconds
+        encoding: str
+            Encoding to use for directories and filenames in FTP connection
+        tls: bool
+            Use FTP-TLS, by default False
+        """
+        super().__init__(**kwargs)
+        self.host = host
+        self.port = port
+        self.tempdir = tempdir or "/tmp"
+        self.cred = username or "", password or "", acct or ""
+        self.timeout = timeout
+        self.encoding = encoding
+        if block_size is not None:
+            self.blocksize = block_size
+        else:
+            self.blocksize = 2**16
+        self.tls = tls
+        self._connect()
+        if self.tls:
+            self.ftp.prot_p()
+
+    def _connect(self):
+        if self.tls:
+            ftp_cls = FTP_TLS
+        else:
+            ftp_cls = FTP
+        self.ftp = ftp_cls(timeout=self.timeout, encoding=self.encoding)
+        self.ftp.connect(self.host, self.port)
+        self.ftp.login(*self.cred)
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/")
+
+    @staticmethod
+    def _get_kwargs_from_urls(urlpath):
+        out = infer_storage_options(urlpath)
+        out.pop("path", None)
+        out.pop("protocol", None)
+        return out
+
+    def ls(self, path, detail=True, **kwargs):
+        path = self._strip_protocol(path)
+        out = []
+        if path not in self.dircache:
+            try:
+                try:
+                    out = [
+                        (fn, details)
+                        for (fn, details) in self.ftp.mlsd(path)
+                        if fn not in [".", ".."]
+                        and details["type"] not in ["pdir", "cdir"]
+                    ]
+                except error_perm:
+                    out = _mlsd2(self.ftp, path)  # Not platform independent
+                for fn, details in out:
+                    details["name"] = "/".join(
+                        ["" if path == "/" else path, fn.lstrip("/")]
+                    )
+                    if details["type"] == "file":
+                        details["size"] = int(details["size"])
+                    else:
+                        details["size"] = 0
+                    if details["type"] == "dir":
+                        details["type"] = "directory"
+                self.dircache[path] = out
+            except Error:
+                try:
+                    info = self.info(path)
+                    if info["type"] == "file":
+                        out = [(path, info)]
+                except (Error, IndexError) as exc:
+                    raise FileNotFoundError(path) from exc
+        files = self.dircache.get(path, out)
+        if not detail:
+            return sorted([fn for fn, details in files])
+        return [details for fn, details in files]
+
+    def info(self, path, **kwargs):
+        # implement with direct method
+        path = self._strip_protocol(path)
+        if path == "/":
+            # special case, since this dir has no real entry
+            return {"name": "/", "size": 0, "type": "directory"}
+        files = self.ls(self._parent(path).lstrip("/"), True)
+        try:
+            out = next(f for f in files if f["name"] == path)
+        except StopIteration as exc:
+            raise FileNotFoundError(path) from exc
+        return out
+
+    def get_file(self, rpath, lpath, **kwargs):
+        if self.isdir(rpath):
+            if not os.path.exists(lpath):
+                os.mkdir(lpath)
+            return
+        if isfilelike(lpath):
+            outfile = lpath
+        else:
+            outfile = open(lpath, "wb")
+
+        def cb(x):
+            outfile.write(x)
+
+        self.ftp.retrbinary(
+            f"RETR {rpath}",
+            blocksize=self.blocksize,
+            callback=cb,
+        )
+        if not isfilelike(lpath):
+            outfile.close()
+
+    def cat_file(self, path, start=None, end=None, **kwargs):
+        if end is not None:
+            return super().cat_file(path, start, end, **kwargs)
+        out = []
+
+        def cb(x):
+            out.append(x)
+
+        try:
+            self.ftp.retrbinary(
+                f"RETR {path}",
+                blocksize=self.blocksize,
+                rest=start,
+                callback=cb,
+            )
+        except (Error, error_perm) as orig_exc:
+            raise FileNotFoundError(path) from orig_exc
+        return b"".join(out)
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        cache_options=None,
+        autocommit=True,
+        **kwargs,
+    ):
+        path = self._strip_protocol(path)
+        block_size = block_size or self.blocksize
+        return FTPFile(
+            self,
+            path,
+            mode=mode,
+            block_size=block_size,
+            tempdir=self.tempdir,
+            autocommit=autocommit,
+            cache_options=cache_options,
+        )
+
+    def _rm(self, path):
+        path = self._strip_protocol(path)
+        self.ftp.delete(path)
+        self.invalidate_cache(self._parent(path))
+
+    def rm(self, path, recursive=False, maxdepth=None):
+        paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
+        for p in reversed(paths):
+            if self.isfile(p):
+                self.rm_file(p)
+            else:
+                self.rmdir(p)
+
+    def mkdir(self, path: str, create_parents: bool = True, **kwargs: Any) -> None:
+        path = self._strip_protocol(path)
+        parent = self._parent(path)
+        if parent != self.root_marker and not self.exists(parent) and create_parents:
+            self.mkdir(parent, create_parents=create_parents)
+
+        self.ftp.mkd(path)
+        self.invalidate_cache(self._parent(path))
+
+    def makedirs(self, path: str, exist_ok: bool = False) -> None:
+        path = self._strip_protocol(path)
+        if self.exists(path):
+            # NB: "/" does not "exist" as it has no directory entry
+            if not exist_ok:
+                raise FileExistsError(f"{path} exists without `exist_ok`")
+            # exists_ok=True -> no-op
+        else:
+            self.mkdir(path, create_parents=True)
+
+    def rmdir(self, path):
+        path = self._strip_protocol(path)
+        self.ftp.rmd(path)
+        self.invalidate_cache(self._parent(path))
+
+    def mv(self, path1, path2, **kwargs):
+        path1 = self._strip_protocol(path1)
+        path2 = self._strip_protocol(path2)
+        self.ftp.rename(path1, path2)
+        self.invalidate_cache(self._parent(path1))
+        self.invalidate_cache(self._parent(path2))
+
+    def __del__(self):
+        self.ftp.close()
+
+    def invalidate_cache(self, path=None):
+        if path is None:
+            self.dircache.clear()
+        else:
+            self.dircache.pop(path, None)
+        super().invalidate_cache(path)
+
+
+class TransferDone(Exception):
+    """Internal exception to break out of transfer"""
+
+    pass
+
+
+class FTPFile(AbstractBufferedFile):
+    """Interact with a remote FTP file with read/write buffering"""
+
+    def __init__(
+        self,
+        fs,
+        path,
+        mode="rb",
+        block_size="default",
+        autocommit=True,
+        cache_type="readahead",
+        cache_options=None,
+        **kwargs,
+    ):
+        super().__init__(
+            fs,
+            path,
+            mode=mode,
+            block_size=block_size,
+            autocommit=autocommit,
+            cache_type=cache_type,
+            cache_options=cache_options,
+            **kwargs,
+        )
+        if not autocommit:
+            self.target = self.path
+            self.path = "/".join([kwargs["tempdir"], str(uuid.uuid4())])
+
+    def commit(self):
+        self.fs.mv(self.path, self.target)
+
+    def discard(self):
+        self.fs.rm(self.path)
+
+    def _fetch_range(self, start, end):
+        """Get bytes between given byte limits
+
+        Implemented by raising an exception in the fetch callback when the
+        number of bytes received reaches the requested amount.
+
+        Will fail if the server does not respect the REST command on
+        retrieve requests.
+        """
+        out = []
+        total = [0]
+
+        def callback(x):
+            total[0] += len(x)
+            if total[0] > end - start:
+                out.append(x[: (end - start) - total[0]])
+                if end < self.size:
+                    raise TransferDone
+            else:
+                out.append(x)
+
+            if total[0] == end - start and end < self.size:
+                raise TransferDone
+
+        try:
+            self.fs.ftp.retrbinary(
+                f"RETR {self.path}",
+                blocksize=self.blocksize,
+                rest=start,
+                callback=callback,
+            )
+        except TransferDone:
+            try:
+                # stop transfer, we got enough bytes for this block
+                self.fs.ftp.abort()
+                self.fs.ftp.getmultiline()
+            except Error:
+                self.fs._connect()
+
+        return b"".join(out)
+
+    def _upload_chunk(self, final=False):
+        self.buffer.seek(0)
+        self.fs.ftp.storbinary(
+            f"STOR {self.path}", self.buffer, blocksize=self.blocksize, rest=self.offset
+        )
+        return True
+
+
+def _mlsd2(ftp, path="."):
+    """
+    Fall back to using `dir` instead of `mlsd` if not supported.
+
+    This parses a Linux style `ls -l` response to `dir`, but the response may
+    be platform dependent.
+
+    Parameters
+    ----------
+    ftp: ftplib.FTP
+    path: str
+        Expects to be given path, but defaults to ".".
+    """
+    lines = []
+    minfo = []
+    ftp.dir(path, lines.append)
+    for line in lines:
+        split_line = line.split()
+        if len(split_line) < 9:
+            continue
+        this = (
+            split_line[-1],
+            {
+                "modify": " ".join(split_line[5:8]),
+                "unix.owner": split_line[2],
+                "unix.group": split_line[3],
+                "unix.mode": split_line[0],
+                "size": split_line[4],
+            },
+        )
+        if this[1]["unix.mode"][0] == "d":
+            this[1]["type"] = "dir"
+        else:
+            this[1]["type"] = "file"
+        minfo.append(this)
+    return minfo
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/gist.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/gist.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad9ac0b6a1cdbcfba6188e2cdeab2350bb9aad0a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/gist.py
@@ -0,0 +1,241 @@
+import requests
+
+from ..spec import AbstractFileSystem
+from ..utils import infer_storage_options
+from .memory import MemoryFile
+
+
+class GistFileSystem(AbstractFileSystem):
+    """
+    Interface to files in a single GitHub Gist.
+
+    Provides read-only access to a gist's files. Gists do not contain
+    subdirectories, so file listing is straightforward.
+
+    Parameters
+    ----------
+    gist_id: str
+        The ID of the gist you want to access (the long hex value from the URL).
+    filenames: list[str] (optional)
+        If provided, only make a file system representing these files, and do not fetch
+        the list of all files for this gist.
+    sha: str (optional)
+        If provided, fetch a particular revision of the gist. If omitted,
+        the latest revision is used.
+    username: str (optional)
+        GitHub username for authentication.
+    token: str (optional)
+        GitHub personal access token (required if username is given), or.
+    timeout: (float, float) or float, optional
+        Connect and read timeouts for requests (default 60s each).
+    kwargs: dict
+        Stored on `self.request_kw` and passed to `requests.get` when fetching Gist
+        metadata or reading ("opening") a file.
+    """
+
+    protocol = "gist"
+    gist_url = "https://api.github.com/gists/{gist_id}"
+    gist_rev_url = "https://api.github.com/gists/{gist_id}/{sha}"
+
+    def __init__(
+        self,
+        gist_id,
+        filenames=None,
+        sha=None,
+        username=None,
+        token=None,
+        timeout=None,
+        **kwargs,
+    ):
+        super().__init__()
+        self.gist_id = gist_id
+        self.filenames = filenames
+        self.sha = sha  # revision of the gist (optional)
+        if username is not None and token is None:
+            raise ValueError("User auth requires a token")
+        self.username = username
+        self.token = token
+        self.request_kw = kwargs
+        # Default timeouts to 60s connect/read if none provided
+        self.timeout = timeout if timeout is not None else (60, 60)
+
+        # We use a single-level "directory" cache, because a gist is essentially flat
+        self.dircache[""] = self._fetch_file_list()
+
+    @property
+    def kw(self):
+        """Auth parameters passed to 'requests' if we have username/token."""
+        kw = {
+            "headers": {
+                "Accept": "application/vnd.github+json",
+                "X-GitHub-Api-Version": "2022-11-28",
+            }
+        }
+        kw.update(self.request_kw)
+        if self.username and self.token:
+            kw["auth"] = (self.username, self.token)
+        elif self.token:
+            kw["headers"]["Authorization"] = f"Bearer {self.token}"
+        return kw
+
+    def _fetch_gist_metadata(self):
+        """
+        Fetch the JSON metadata for this gist (possibly for a specific revision).
+        """
+        if self.sha:
+            url = self.gist_rev_url.format(gist_id=self.gist_id, sha=self.sha)
+        else:
+            url = self.gist_url.format(gist_id=self.gist_id)
+
+        r = requests.get(url, timeout=self.timeout, **self.kw)
+        if r.status_code == 404:
+            raise FileNotFoundError(
+                f"Gist not found: {self.gist_id}@{self.sha or 'latest'}"
+            )
+        r.raise_for_status()
+        return r.json()
+
+    def _fetch_file_list(self):
+        """
+        Returns a list of dicts describing each file in the gist. These get stored
+        in self.dircache[""].
+        """
+        meta = self._fetch_gist_metadata()
+        if self.filenames:
+            available_files = meta.get("files", {})
+            files = {}
+            for fn in self.filenames:
+                if fn not in available_files:
+                    raise FileNotFoundError(fn)
+                files[fn] = available_files[fn]
+        else:
+            files = meta.get("files", {})
+
+        out = []
+        for fname, finfo in files.items():
+            if finfo is None:
+                # Occasionally GitHub returns a file entry with null if it was deleted
+                continue
+            # Build a directory entry
+            out.append(
+                {
+                    "name": fname,  # file's name
+                    "type": "file",  # gists have no subdirectories
+                    "size": finfo.get("size", 0),  # file size in bytes
+                    "raw_url": finfo.get("raw_url"),
+                }
+            )
+        return out
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        """
+        Remove 'gist://' from the path, if present.
+        """
+        # The default infer_storage_options can handle gist://username:token@id/file
+        # or gist://id/file, but let's ensure we handle a normal usage too.
+        # We'll just strip the protocol prefix if it exists.
+        path = infer_storage_options(path).get("path", path)
+        return path.lstrip("/")
+
+    @staticmethod
+    def _get_kwargs_from_urls(path):
+        """
+        Parse 'gist://' style URLs into GistFileSystem constructor kwargs.
+        For example:
+          gist://:TOKEN@<gist_id>/file.txt
+          gist://username:TOKEN@<gist_id>/file.txt
+        """
+        so = infer_storage_options(path)
+        out = {}
+        if "username" in so and so["username"]:
+            out["username"] = so["username"]
+        if "password" in so and so["password"]:
+            out["token"] = so["password"]
+        if "host" in so and so["host"]:
+            # We interpret 'host' as the gist ID
+            out["gist_id"] = so["host"]
+
+        # Extract SHA and filename from path
+        if "path" in so and so["path"]:
+            path_parts = so["path"].rsplit("/", 2)[-2:]
+            if len(path_parts) == 2:
+                if path_parts[0]:  # SHA present
+                    out["sha"] = path_parts[0]
+                if path_parts[1]:  # filename also present
+                    out["filenames"] = [path_parts[1]]
+
+        return out
+
+    def ls(self, path="", detail=False, **kwargs):
+        """
+        List files in the gist. Gists are single-level, so any 'path' is basically
+        the filename, or empty for all files.
+
+        Parameters
+        ----------
+        path : str, optional
+            The filename to list. If empty, returns all files in the gist.
+        detail : bool, default False
+            If True, return a list of dicts; if False, return a list of filenames.
+        """
+        path = self._strip_protocol(path or "")
+        # If path is empty, return all
+        if path == "":
+            results = self.dircache[""]
+        else:
+            # We want just the single file with this name
+            all_files = self.dircache[""]
+            results = [f for f in all_files if f["name"] == path]
+            if not results:
+                raise FileNotFoundError(path)
+        if detail:
+            return results
+        else:
+            return sorted(f["name"] for f in results)
+
+    def _open(self, path, mode="rb", block_size=None, **kwargs):
+        """
+        Read a single file from the gist.
+        """
+        if mode != "rb":
+            raise NotImplementedError("GitHub Gist FS is read-only (no write).")
+
+        path = self._strip_protocol(path)
+        # Find the file entry in our dircache
+        matches = [f for f in self.dircache[""] if f["name"] == path]
+        if not matches:
+            raise FileNotFoundError(path)
+        finfo = matches[0]
+
+        raw_url = finfo.get("raw_url")
+        if not raw_url:
+            raise FileNotFoundError(f"No raw_url for file: {path}")
+
+        r = requests.get(raw_url, timeout=self.timeout, **self.kw)
+        if r.status_code == 404:
+            raise FileNotFoundError(path)
+        r.raise_for_status()
+        return MemoryFile(path, None, r.content)
+
+    def cat(self, path, recursive=False, on_error="raise", **kwargs):
+        """
+        Return {path: contents} for the given file or files. If 'recursive' is True,
+        and path is empty, returns all files in the gist.
+        """
+        paths = self.expand_path(path, recursive=recursive)
+        out = {}
+        for p in paths:
+            try:
+                with self.open(p, "rb") as f:
+                    out[p] = f.read()
+            except FileNotFoundError as e:
+                if on_error == "raise":
+                    raise e
+                elif on_error == "omit":
+                    pass  # skip
+                else:
+                    out[p] = e
+        if len(paths) == 1 and paths[0] == path:
+            return out[path]
+        return out
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/git.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/git.py
new file mode 100644
index 0000000000000000000000000000000000000000..808d293a1c991ea87d19a2129f3e56d9b813daaa
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/git.py
@@ -0,0 +1,114 @@
+import os
+
+import pygit2
+
+from fsspec.spec import AbstractFileSystem
+
+from .memory import MemoryFile
+
+
+class GitFileSystem(AbstractFileSystem):
+    """Browse the files of a local git repo at any hash/tag/branch
+
+    (experimental backend)
+    """
+
+    root_marker = ""
+    cachable = True
+
+    def __init__(self, path=None, fo=None, ref=None, **kwargs):
+        """
+
+        Parameters
+        ----------
+        path: str (optional)
+            Local location of the repo (uses current directory if not given).
+            May be deprecated in favour of ``fo``. When used with a higher
+            level function such as fsspec.open(), may be of the form
+            "git://[path-to-repo[:]][ref@]path/to/file" (but the actual
+            file path should not contain "@" or ":").
+        fo: str (optional)
+            Same as ``path``, but passed as part of a chained URL. This one
+            takes precedence if both are given.
+        ref: str (optional)
+            Reference to work with, could be a hash, tag or branch name. Defaults
+            to current working tree. Note that ``ls`` and ``open`` also take hash,
+            so this becomes the default for those operations
+        kwargs
+        """
+        super().__init__(**kwargs)
+        self.repo = pygit2.Repository(fo or path or os.getcwd())
+        self.ref = ref or "master"
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        path = super()._strip_protocol(path).lstrip("/")
+        if ":" in path:
+            path = path.split(":", 1)[1]
+        if "@" in path:
+            path = path.split("@", 1)[1]
+        return path.lstrip("/")
+
+    def _path_to_object(self, path, ref):
+        comm, ref = self.repo.resolve_refish(ref or self.ref)
+        parts = path.split("/")
+        tree = comm.tree
+        for part in parts:
+            if part and isinstance(tree, pygit2.Tree):
+                if part not in tree:
+                    raise FileNotFoundError(path)
+                tree = tree[part]
+        return tree
+
+    @staticmethod
+    def _get_kwargs_from_urls(path):
+        path = path.removeprefix("git://")
+        out = {}
+        if ":" in path:
+            out["path"], path = path.split(":", 1)
+        if "@" in path:
+            out["ref"], path = path.split("@", 1)
+        return out
+
+    @staticmethod
+    def _object_to_info(obj, path=None):
+        # obj.name and obj.filemode are None for the root tree!
+        is_dir = isinstance(obj, pygit2.Tree)
+        return {
+            "type": "directory" if is_dir else "file",
+            "name": (
+                "/".join([path, obj.name or ""]).lstrip("/") if path else obj.name
+            ),
+            "hex": str(obj.id),
+            "mode": "100644" if obj.filemode is None else f"{obj.filemode:o}",
+            "size": 0 if is_dir else obj.size,
+        }
+
+    def ls(self, path, detail=True, ref=None, **kwargs):
+        tree = self._path_to_object(self._strip_protocol(path), ref)
+        return [
+            GitFileSystem._object_to_info(obj, path)
+            if detail
+            else GitFileSystem._object_to_info(obj, path)["name"]
+            for obj in (tree if isinstance(tree, pygit2.Tree) else [tree])
+        ]
+
+    def info(self, path, ref=None, **kwargs):
+        tree = self._path_to_object(self._strip_protocol(path), ref)
+        return GitFileSystem._object_to_info(tree, path)
+
+    def ukey(self, path, ref=None):
+        return self.info(path, ref=ref)["hex"]
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=True,
+        cache_options=None,
+        ref=None,
+        **kwargs,
+    ):
+        obj = self._path_to_object(path, ref or self.ref)
+        return MemoryFile(data=obj.data)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/github.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/github.py
new file mode 100644
index 0000000000000000000000000000000000000000..3630f6db54413e2c396f6cc1b6b10cd379200043
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/github.py
@@ -0,0 +1,333 @@
+import base64
+import re
+
+import requests
+
+from ..spec import AbstractFileSystem
+from ..utils import infer_storage_options
+from .memory import MemoryFile
+
+
+class GithubFileSystem(AbstractFileSystem):
+    """Interface to files in github
+
+    An instance of this class provides the files residing within a remote github
+    repository. You may specify a point in the repos history, by SHA, branch
+    or tag (default is current master).
+
+    For files less than 1 MB in size, file content is returned directly in a
+    MemoryFile. For larger files, or for files tracked by git-lfs, file content
+    is returned as an HTTPFile wrapping the ``download_url`` provided by the
+    GitHub API.
+
+    When using fsspec.open, allows URIs of the form:
+
+    - "github://path/file", in which case you must specify org, repo and
+      may specify sha in the extra args
+    - 'github://org:repo@/precip/catalog.yml', where the org and repo are
+      part of the URI
+    - 'github://org:repo@sha/precip/catalog.yml', where the sha is also included
+
+    ``sha`` can be the full or abbreviated hex of the commit you want to fetch
+    from, or a branch or tag name (so long as it doesn't contain special characters
+    like "/", "?", which would have to be HTTP-encoded).
+
+    For authorised access, you must provide username and token, which can be made
+    at https://github.com/settings/tokens
+    """
+
+    url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
+    content_url = "https://api.github.com/repos/{org}/{repo}/contents/{path}?ref={sha}"
+    protocol = "github"
+    timeout = (60, 60)  # connect, read timeouts
+
+    def __init__(
+        self, org, repo, sha=None, username=None, token=None, timeout=None, **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.org = org
+        self.repo = repo
+        if (username is None) ^ (token is None):
+            raise ValueError("Auth required both username and token")
+        self.username = username
+        self.token = token
+        if timeout is not None:
+            self.timeout = timeout
+        if sha is None:
+            # look up default branch (not necessarily "master")
+            u = "https://api.github.com/repos/{org}/{repo}"
+            r = requests.get(
+                u.format(org=org, repo=repo), timeout=self.timeout, **self.kw
+            )
+            r.raise_for_status()
+            sha = r.json()["default_branch"]
+
+        self.root = sha
+        self.ls("")
+        try:
+            from .http import HTTPFileSystem
+
+            self.http_fs = HTTPFileSystem(**kwargs)
+        except ImportError:
+            self.http_fs = None
+
+    @property
+    def kw(self):
+        if self.username:
+            return {"auth": (self.username, self.token)}
+        return {}
+
+    @classmethod
+    def repos(cls, org_or_user, is_org=True):
+        """List repo names for given org or user
+
+        This may become the top level of the FS
+
+        Parameters
+        ----------
+        org_or_user: str
+            Name of the github org or user to query
+        is_org: bool (default True)
+            Whether the name is an organisation (True) or user (False)
+
+        Returns
+        -------
+        List of string
+        """
+        r = requests.get(
+            f"https://api.github.com/{['users', 'orgs'][is_org]}/{org_or_user}/repos",
+            timeout=cls.timeout,
+        )
+        r.raise_for_status()
+        return [repo["name"] for repo in r.json()]
+
+    @property
+    def tags(self):
+        """Names of tags in the repo"""
+        r = requests.get(
+            f"https://api.github.com/repos/{self.org}/{self.repo}/tags",
+            timeout=self.timeout,
+            **self.kw,
+        )
+        r.raise_for_status()
+        return [t["name"] for t in r.json()]
+
+    @property
+    def branches(self):
+        """Names of branches in the repo"""
+        r = requests.get(
+            f"https://api.github.com/repos/{self.org}/{self.repo}/branches",
+            timeout=self.timeout,
+            **self.kw,
+        )
+        r.raise_for_status()
+        return [t["name"] for t in r.json()]
+
+    @property
+    def refs(self):
+        """Named references, tags and branches"""
+        return {"tags": self.tags, "branches": self.branches}
+
+    def ls(self, path, detail=False, sha=None, _sha=None, **kwargs):
+        """List files at given path
+
+        Parameters
+        ----------
+        path: str
+            Location to list, relative to repo root
+        detail: bool
+            If True, returns list of dicts, one per file; if False, returns
+            list of full filenames only
+        sha: str (optional)
+            List at the given point in the repo history, branch or tag name or commit
+            SHA
+        _sha: str (optional)
+            List this specific tree object (used internally to descend into trees)
+        """
+        path = self._strip_protocol(path)
+        if path == "":
+            _sha = sha or self.root
+        if _sha is None:
+            parts = path.rstrip("/").split("/")
+            so_far = ""
+            _sha = sha or self.root
+            for part in parts:
+                out = self.ls(so_far, True, sha=sha, _sha=_sha)
+                so_far += "/" + part if so_far else part
+                out = [o for o in out if o["name"] == so_far]
+                if not out:
+                    raise FileNotFoundError(path)
+                out = out[0]
+                if out["type"] == "file":
+                    if detail:
+                        return [out]
+                    else:
+                        return path
+                _sha = out["sha"]
+        if path not in self.dircache or sha not in [self.root, None]:
+            r = requests.get(
+                self.url.format(org=self.org, repo=self.repo, sha=_sha),
+                timeout=self.timeout,
+                **self.kw,
+            )
+            if r.status_code == 404:
+                raise FileNotFoundError(path)
+            r.raise_for_status()
+            types = {"blob": "file", "tree": "directory"}
+            out = [
+                {
+                    "name": path + "/" + f["path"] if path else f["path"],
+                    "mode": f["mode"],
+                    "type": types[f["type"]],
+                    "size": f.get("size", 0),
+                    "sha": f["sha"],
+                }
+                for f in r.json()["tree"]
+                if f["type"] in types
+            ]
+            if sha in [self.root, None]:
+                self.dircache[path] = out
+        else:
+            out = self.dircache[path]
+        if detail:
+            return out
+        else:
+            return sorted([f["name"] for f in out])
+
+    def invalidate_cache(self, path=None):
+        self.dircache.clear()
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        opts = infer_storage_options(path)
+        if "username" not in opts:
+            return super()._strip_protocol(path)
+        return opts["path"].lstrip("/")
+
+    @staticmethod
+    def _get_kwargs_from_urls(path):
+        opts = infer_storage_options(path)
+        if "username" not in opts:
+            return {}
+        out = {"org": opts["username"], "repo": opts["password"]}
+        if opts["host"]:
+            out["sha"] = opts["host"]
+        return out
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        cache_options=None,
+        sha=None,
+        **kwargs,
+    ):
+        if mode != "rb":
+            raise NotImplementedError
+
+        # construct a url to hit the GitHub API's repo contents API
+        url = self.content_url.format(
+            org=self.org, repo=self.repo, path=path, sha=sha or self.root
+        )
+
+        # make a request to this API, and parse the response as JSON
+        r = requests.get(url, timeout=self.timeout, **self.kw)
+        if r.status_code == 404:
+            raise FileNotFoundError(path)
+        r.raise_for_status()
+        content_json = r.json()
+
+        # if the response's content key is not empty, try to parse it as base64
+        if content_json["content"]:
+            content = base64.b64decode(content_json["content"])
+
+            # as long as the content does not start with the string
+            # "version https://git-lfs.github.com/"
+            # then it is probably not a git-lfs pointer and we can just return
+            # the content directly
+            if not content.startswith(b"version https://git-lfs.github.com/"):
+                return MemoryFile(None, None, content)
+
+        # we land here if the content was not present in the first response
+        # (regular file over 1MB or git-lfs tracked file)
+        # in this case, we get let the HTTPFileSystem handle the download
+        if self.http_fs is None:
+            raise ImportError(
+                "Please install fsspec[http] to access github files >1 MB "
+                "or git-lfs tracked files."
+            )
+        return self.http_fs.open(
+            content_json["download_url"],
+            mode=mode,
+            block_size=block_size,
+            cache_options=cache_options,
+            **kwargs,
+        )
+
+    def rm(self, path, recursive=False, maxdepth=None, message=None):
+        path = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
+        for p in reversed(path):
+            self.rm_file(p, message=message)
+
+    def rm_file(self, path, message=None, **kwargs):
+        """
+        Remove a file from a specified branch using a given commit message.
+
+        Since Github DELETE operation requires a branch name, and we can't reliably
+        determine whether the provided SHA refers to a branch, tag, or commit, we
+        assume it's a branch. If it's not, the user will encounter an error when
+        attempting to retrieve the file SHA or delete the file.
+
+        Parameters
+        ----------
+        path: str
+            The file's location relative to the repository root.
+        message: str, optional
+            The commit message for the deletion.
+        """
+
+        if not self.username:
+            raise ValueError("Authentication required")
+
+        path = self._strip_protocol(path)
+
+        # Attempt to get SHA from cache or Github API
+        sha = self._get_sha_from_cache(path)
+        if not sha:
+            url = self.content_url.format(
+                org=self.org, repo=self.repo, path=path.lstrip("/"), sha=self.root
+            )
+            r = requests.get(url, timeout=self.timeout, **self.kw)
+            if r.status_code == 404:
+                raise FileNotFoundError(path)
+            r.raise_for_status()
+            sha = r.json()["sha"]
+
+        # Delete the file
+        delete_url = self.content_url.format(
+            org=self.org, repo=self.repo, path=path, sha=self.root
+        )
+        branch = self.root
+        data = {
+            "message": message or f"Delete {path}",
+            "sha": sha,
+            **({"branch": branch} if branch else {}),
+        }
+
+        r = requests.delete(delete_url, json=data, timeout=self.timeout, **self.kw)
+        error_message = r.json().get("message", "")
+        if re.search(r"Branch .+ not found", error_message):
+            error = "Remove only works when the filesystem is initialised from a branch or default (None)"
+            raise ValueError(error)
+        r.raise_for_status()
+
+        self.invalidate_cache(path)
+
+    def _get_sha_from_cache(self, path):
+        for entries in self.dircache.values():
+            for entry in entries:
+                entry_path = entry.get("name")
+                if entry_path and entry_path == path and "sha" in entry:
+                    return entry["sha"]
+        return None
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/http.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/http.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfb1bc36074ff4c85463133387601ae16ae1280e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/http.py
@@ -0,0 +1,897 @@
+import asyncio
+import io
+import logging
+import re
+import weakref
+from copy import copy
+from urllib.parse import urlparse
+
+import aiohttp
+import yarl
+
+from fsspec.asyn import AbstractAsyncStreamedFile, AsyncFileSystem, sync, sync_wrapper
+from fsspec.callbacks import DEFAULT_CALLBACK
+from fsspec.exceptions import FSTimeoutError
+from fsspec.spec import AbstractBufferedFile
+from fsspec.utils import (
+    DEFAULT_BLOCK_SIZE,
+    glob_translate,
+    isfilelike,
+    nullcontext,
+    tokenize,
+)
+
+from ..caching import AllBytes
+
+# https://stackoverflow.com/a/15926317/3821154
+ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
+ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
+logger = logging.getLogger("fsspec.http")
+
+
+async def get_client(**kwargs):
+    return aiohttp.ClientSession(**kwargs)
+
+
+class HTTPFileSystem(AsyncFileSystem):
+    """
+    Simple File-System for fetching data via HTTP(S)
+
+    ``ls()`` is implemented by loading the parent page and doing a regex
+    match on the result. If simple_link=True, anything of the form
+    "http(s)://server.com/stuff?thing=other"; otherwise only links within
+    HTML href tags will be used.
+    """
+
+    protocol = ("http", "https")
+    sep = "/"
+
+    def __init__(
+        self,
+        simple_links=True,
+        block_size=None,
+        same_scheme=True,
+        size_policy=None,
+        cache_type="bytes",
+        cache_options=None,
+        asynchronous=False,
+        loop=None,
+        client_kwargs=None,
+        get_client=get_client,
+        encoded=False,
+        **storage_options,
+    ):
+        """
+        NB: if this is called async, you must await set_client
+
+        Parameters
+        ----------
+        block_size: int
+            Blocks to read bytes; if 0, will default to raw requests file-like
+            objects instead of HTTPFile instances
+        simple_links: bool
+            If True, will consider both HTML <a> tags and anything that looks
+            like a URL; if False, will consider only the former.
+        same_scheme: True
+            When doing ls/glob, if this is True, only consider paths that have
+            http/https matching the input URLs.
+        size_policy: this argument is deprecated
+        client_kwargs: dict
+            Passed to aiohttp.ClientSession, see
+            https://docs.aiohttp.org/en/stable/client_reference.html
+            For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
+        get_client: Callable[..., aiohttp.ClientSession]
+            A callable, which takes keyword arguments and constructs
+            an aiohttp.ClientSession. Its state will be managed by
+            the HTTPFileSystem class.
+        storage_options: key-value
+            Any other parameters passed on to requests
+        cache_type, cache_options: defaults used in open()
+        """
+        super().__init__(self, asynchronous=asynchronous, loop=loop, **storage_options)
+        self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
+        self.simple_links = simple_links
+        self.same_schema = same_scheme
+        self.cache_type = cache_type
+        self.cache_options = cache_options
+        self.client_kwargs = client_kwargs or {}
+        self.get_client = get_client
+        self.encoded = encoded
+        self.kwargs = storage_options
+        self._session = None
+
+        # Clean caching-related parameters from `storage_options`
+        # before propagating them as `request_options` through `self.kwargs`.
+        # TODO: Maybe rename `self.kwargs` to `self.request_options` to make
+        #       it clearer.
+        request_options = copy(storage_options)
+        self.use_listings_cache = request_options.pop("use_listings_cache", False)
+        request_options.pop("listings_expiry_time", None)
+        request_options.pop("max_paths", None)
+        request_options.pop("skip_instance_cache", None)
+        self.kwargs = request_options
+
+    @property
+    def fsid(self):
+        return "http"
+
+    def encode_url(self, url):
+        return yarl.URL(url, encoded=self.encoded)
+
+    @staticmethod
+    def close_session(loop, session):
+        if loop is not None and loop.is_running():
+            try:
+                sync(loop, session.close, timeout=0.1)
+                return
+            except (TimeoutError, FSTimeoutError, NotImplementedError):
+                pass
+        connector = getattr(session, "_connector", None)
+        if connector is not None:
+            # close after loop is dead
+            connector._close()
+
+    async def set_session(self):
+        if self._session is None:
+            self._session = await self.get_client(loop=self.loop, **self.client_kwargs)
+            if not self.asynchronous:
+                weakref.finalize(self, self.close_session, self.loop, self._session)
+        return self._session
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        """For HTTP, we always want to keep the full URL"""
+        return path
+
+    @classmethod
+    def _parent(cls, path):
+        # override, since _strip_protocol is different for URLs
+        par = super()._parent(path)
+        if len(par) > 7:  # "http://..."
+            return par
+        return ""
+
+    async def _ls_real(self, url, detail=True, **kwargs):
+        """List the links found in the document served at ``url``.
+
+        The URL is fetched with GET; if the response is HTML (or has no
+        Content-Type) its text is scanned for links, and only links located
+        underneath ``url`` are kept.
+
+        Parameters
+        ----------
+        url: str
+            Location to fetch and scan
+        detail: bool
+            If True, return a list of dicts with "name", "size" (always None)
+            and "type"; otherwise return a sorted list of URLs.
+        kwargs: key-value
+            Merged into ``kw`` below, which is not used afterwards.
+
+        Raises
+        ------
+        FileNotFoundError
+            If the server answers 404.
+        """
+        # ignoring URL-encoded arguments
+        # NOTE(review): ``kw`` is assembled but never used; the request below
+        # is made with ``self.kwargs`` only, so per-call kwargs have no effect.
+        kw = self.kwargs.copy()
+        kw.update(kwargs)
+        logger.debug(url)
+        session = await self.set_session()
+        async with session.get(self.encode_url(url), **self.kwargs) as r:
+            self._raise_not_found_for_status(r, url)
+
+            # keep only the media type, dropping parameters such as charset
+            if "Content-Type" in r.headers:
+                mimetype = r.headers["Content-Type"].partition(";")[0]
+            else:
+                mimetype = None
+
+            if mimetype in ("text/html", None):
+                try:
+                    text = await r.text(errors="ignore")
+                    # presumably ``ex`` matches <a href=...> tags and ``ex2``
+                    # bare URLs; both are module-level patterns - TODO confirm
+                    if self.simple_links:
+                        links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
+                    else:
+                        links = [u[2] for u in ex.findall(text)]
+                except UnicodeDecodeError:
+                    links = []  # binary, not HTML
+            else:
+                links = []
+
+        out = set()
+        parts = urlparse(url)
+        for l in links:
+            if isinstance(l, tuple):
+                l = l[1]
+            if l.startswith("/") and len(l) > 1:
+                # absolute URL on this server
+                l = f"{parts.scheme}://{parts.netloc}{l}"
+            if l.startswith("http"):
+                # full URLs are kept only when they live below ``url``
+                if self.same_schema and l.startswith(url.rstrip("/") + "/"):
+                    out.add(l)
+                elif l.replace("https", "http").startswith(
+                    url.replace("https", "http").rstrip("/") + "/"
+                ):
+                    # allowed to cross http <-> https
+                    out.add(l)
+            else:
+                # relative link: resolve it against ``url``
+                if l not in ["..", "../"]:
+                    # Ignore FTP-like "parent"
+                    out.add("/".join([url.rstrip("/"), l.lstrip("/")]))
+        # nothing found for a URL with a trailing slash: retry without it
+        if not out and url.endswith("/"):
+            out = await self._ls_real(url.rstrip("/"), detail=False)
+        if detail:
+            return [
+                {
+                    "name": u,
+                    "size": None,
+                    "type": "directory" if u.endswith("/") else "file",
+                }
+                for u in out
+            ]
+        else:
+            return sorted(out)
+
+    async def _ls(self, url, detail=True, **kwargs):
+        if self.use_listings_cache and url in self.dircache:
+            out = self.dircache[url]
+        else:
+            out = await self._ls_real(url, detail=detail, **kwargs)
+            self.dircache[url] = out
+        return out
+
+    ls = sync_wrapper(_ls)
+
+    def _raise_not_found_for_status(self, response, url):
+        """
+        Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
+        """
+        if response.status == 404:
+            raise FileNotFoundError(url)
+        response.raise_for_status()
+
+    async def _cat_file(self, url, start=None, end=None, **kwargs):
+        kw = self.kwargs.copy()
+        kw.update(kwargs)
+        logger.debug(url)
+
+        if start is not None or end is not None:
+            if start == end:
+                return b""
+            headers = kw.pop("headers", {}).copy()
+
+            headers["Range"] = await self._process_limits(url, start, end)
+            kw["headers"] = headers
+        session = await self.set_session()
+        async with session.get(self.encode_url(url), **kw) as r:
+            out = await r.read()
+            self._raise_not_found_for_status(r, url)
+        return out
+
+    async def _get_file(
+        self, rpath, lpath, chunk_size=5 * 2**20, callback=DEFAULT_CALLBACK, **kwargs
+    ):
+        kw = self.kwargs.copy()
+        kw.update(kwargs)
+        logger.debug(rpath)
+        session = await self.set_session()
+        async with session.get(self.encode_url(rpath), **kw) as r:
+            try:
+                size = int(r.headers["content-length"])
+            except (ValueError, KeyError):
+                size = None
+
+            callback.set_size(size)
+            self._raise_not_found_for_status(r, rpath)
+            if isfilelike(lpath):
+                outfile = lpath
+            else:
+                outfile = open(lpath, "wb")  # noqa: ASYNC230
+
+            try:
+                chunk = True
+                while chunk:
+                    chunk = await r.content.read(chunk_size)
+                    outfile.write(chunk)
+                    callback.relative_update(len(chunk))
+            finally:
+                if not isfilelike(lpath):
+                    outfile.close()
+
+    async def _put_file(
+        self,
+        lpath,
+        rpath,
+        chunk_size=5 * 2**20,
+        callback=DEFAULT_CALLBACK,
+        method="post",
+        mode="overwrite",
+        **kwargs,
+    ):
+        if mode != "overwrite":
+            raise NotImplementedError("Exclusive write")
+
+        async def gen_chunks():
+            # Support passing arbitrary file-like objects
+            # and use them instead of streams.
+            if isinstance(lpath, io.IOBase):
+                context = nullcontext(lpath)
+                use_seek = False  # might not support seeking
+            else:
+                context = open(lpath, "rb")  # noqa: ASYNC230
+                use_seek = True
+
+            with context as f:
+                if use_seek:
+                    callback.set_size(f.seek(0, 2))
+                    f.seek(0)
+                else:
+                    callback.set_size(getattr(f, "size", None))
+
+                chunk = f.read(chunk_size)
+                while chunk:
+                    yield chunk
+                    callback.relative_update(len(chunk))
+                    chunk = f.read(chunk_size)
+
+        kw = self.kwargs.copy()
+        kw.update(kwargs)
+        session = await self.set_session()
+
+        method = method.lower()
+        if method not in ("post", "put"):
+            raise ValueError(
+                f"method has to be either 'post' or 'put', not: {method!r}"
+            )
+
+        meth = getattr(session, method)
+        async with meth(self.encode_url(rpath), data=gen_chunks(), **kw) as resp:
+            self._raise_not_found_for_status(resp, rpath)
+
+    async def _exists(self, path, strict=False, **kwargs):
+        kw = self.kwargs.copy()
+        kw.update(kwargs)
+        try:
+            logger.debug(path)
+            session = await self.set_session()
+            r = await session.get(self.encode_url(path), **kw)
+            async with r:
+                if strict:
+                    self._raise_not_found_for_status(r, path)
+                return r.status < 400
+        except FileNotFoundError:
+            return False
+        except aiohttp.ClientError:
+            if strict:
+                raise
+            return False
+
+    async def _isfile(self, path, **kwargs):
+        """Return the result of ``_exists`` for ``path``; kwargs are forwarded."""
+        return await self._exists(path, **kwargs)
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=None,  # XXX: This differs from the base class.
+        cache_type=None,
+        cache_options=None,
+        size=None,
+        **kwargs,
+    ):
+        """Make a file-like object
+
+        Parameters
+        ----------
+        path: str
+            Full URL with protocol
+        mode: string
+            must be "rb"
+        block_size: int or None
+            Bytes to download in one request; use instance value if None. If
+            zero, will return a streaming Requests file-like instance.
+        kwargs: key-value
+            Any other parameters, passed to requests calls
+        """
+        if mode != "rb":
+            raise NotImplementedError
+        block_size = block_size if block_size is not None else self.block_size
+        kw = self.kwargs.copy()
+        kw["asynchronous"] = self.asynchronous
+        kw.update(kwargs)
+        info = {}
+        size = size or info.update(self.info(path, **kwargs)) or info["size"]
+        session = sync(self.loop, self.set_session)
+        if block_size and size and info.get("partial", True):
+            return HTTPFile(
+                self,
+                path,
+                session=session,
+                block_size=block_size,
+                mode=mode,
+                size=size,
+                cache_type=cache_type or self.cache_type,
+                cache_options=cache_options or self.cache_options,
+                loop=self.loop,
+                **kw,
+            )
+        else:
+            return HTTPStreamFile(
+                self,
+                path,
+                mode=mode,
+                loop=self.loop,
+                session=session,
+                **kw,
+            )
+
+    async def open_async(self, path, mode="rb", size=None, **kwargs):
+        session = await self.set_session()
+        if size is None:
+            try:
+                size = (await self._info(path, **kwargs))["size"]
+            except FileNotFoundError:
+                pass
+        return AsyncStreamFile(
+            self,
+            path,
+            loop=self.loop,
+            session=session,
+            size=size,
+            **kwargs,
+        )
+
+    def ukey(self, url):
+        """Unique identifier; assume HTTP files are static, unchanging
+
+        The key is ``tokenize`` applied to the URL, this instance's request
+        kwargs and its protocol; the server is not contacted.
+        """
+        return tokenize(url, self.kwargs, self.protocol)
+
+    async def _info(self, url, **kwargs):
+        """Get info of URL
+
+        Tries to access location via HEAD, and then GET methods, but does
+        not fetch the data.
+
+        It is possible that the server does not supply any size information, in
+        which case size will be given as None (and certain operations on the
+        corresponding file will not work).
+        """
+        info = {}
+        session = await self.set_session()
+
+        for policy in ["head", "get"]:
+            try:
+                info.update(
+                    await _file_info(
+                        self.encode_url(url),
+                        size_policy=policy,
+                        session=session,
+                        **self.kwargs,
+                        **kwargs,
+                    )
+                )
+                if info.get("size") is not None:
+                    break
+            except Exception as exc:
+                if policy == "get":
+                    # If get failed, then raise a FileNotFoundError
+                    raise FileNotFoundError(url) from exc
+                logger.debug("", exc_info=exc)
+
+        return {"name": url, "size": None, **info, "type": "file"}
+
+    async def _glob(self, path, maxdepth=None, **kwargs):
+        """
+        Find files by glob-matching.
+
+        This implementation is identical to the one in AbstractFileSystem,
+        but "?" is not considered as a character for globbing, because it is
+        so common in URLs, often identifying the "query" part.
+
+        Parameters
+        ----------
+        path: str
+            Pattern to match; "*" and "[" are the magic characters here
+        maxdepth: int or None
+            Depth limit, must be at least 1 when given
+        kwargs: key-value
+            ``detail`` (default False) is popped here; the rest is passed to
+            ``_exists``, ``_info`` and ``_find``
+
+        Returns
+        -------
+        A dict mapping path to info if ``detail`` is true, else a list of paths.
+
+        Raises
+        ------
+        ValueError
+            If ``maxdepth`` is given and smaller than 1.
+        """
+        if maxdepth is not None and maxdepth < 1:
+            raise ValueError("maxdepth must be at least 1")
+        import re
+
+        ends_with_slash = path.endswith("/")  # _strip_protocol strips trailing slash
+        path = self._strip_protocol(path)
+        append_slash_to_dirname = ends_with_slash or path.endswith(("/**", "/*"))
+        # position of the first magic character of each kind (or end of string)
+        idx_star = path.find("*") if path.find("*") >= 0 else len(path)
+        idx_brace = path.find("[") if path.find("[") >= 0 else len(path)
+
+        min_idx = min(idx_star, idx_brace)
+
+        detail = kwargs.pop("detail", False)
+
+        if not has_magic(path):
+            # plain URL: the answer is just whether it exists
+            if await self._exists(path, **kwargs):
+                if not detail:
+                    return [path]
+                else:
+                    return {path: await self._info(path, **kwargs)}
+            else:
+                if not detail:
+                    return []  # glob of non-existent returns empty
+                else:
+                    return {}
+        elif "/" in path[:min_idx]:
+            # search root is everything up to the last "/" before the magic
+            min_idx = path[:min_idx].rindex("/")
+            root = path[: min_idx + 1]
+            depth = path[min_idx + 1 :].count("/") + 1
+        else:
+            root = ""
+            depth = path[min_idx + 1 :].count("/") + 1
+
+        if "**" in path:
+            if maxdepth is not None:
+                idx_double_stars = path.find("**")
+                depth_double_stars = path[idx_double_stars:].count("/") + 1
+                depth = depth - depth_double_stars + maxdepth
+            else:
+                # no limit on how deep "**" may descend
+                depth = None
+
+        allpaths = await self._find(
+            root, maxdepth=depth, withdirs=True, detail=True, **kwargs
+        )
+
+        pattern = glob_translate(path + ("/" if ends_with_slash else ""))
+        pattern = re.compile(pattern)
+
+        # keep matching paths; a directory's trailing slash is dropped unless
+        # the pattern itself asked for it
+        out = {
+            (
+                p.rstrip("/")
+                if not append_slash_to_dirname
+                and info["type"] == "directory"
+                and p.endswith("/")
+                else p
+            ): info
+            for p, info in sorted(allpaths.items())
+            if pattern.match(p.rstrip("/"))
+        }
+
+        if detail:
+            return out
+        else:
+            return list(out)
+
+    async def _isdir(self, path):
+        # override, since all URLs are (also) files
+        try:
+            return bool(await self._ls(path))
+        except (FileNotFoundError, ValueError):
+            return False
+
+    async def _pipe_file(self, path, value, mode="overwrite", **kwargs):
+        """
+        Write bytes to a remote file over HTTP.
+
+        Parameters
+        ----------
+        path : str
+            Target URL where the data should be written
+        value : bytes
+            Data to be written
+        mode : str
+            How to write to the file - 'overwrite' or 'append'
+        **kwargs : dict
+            Additional parameters to pass to the HTTP request
+        """
+        url = self._strip_protocol(path)
+        headers = kwargs.pop("headers", {})
+        headers["Content-Length"] = str(len(value))
+
+        session = await self.set_session()
+
+        async with session.put(url, data=value, headers=headers, **kwargs) as r:
+            r.raise_for_status()
+
+
+class HTTPFile(AbstractBufferedFile):
+    """
+    A file-like object pointing to a remote HTTP(S) resource
+
+    Supports only reading, with read-ahead of a predetermined block-size.
+
+    In the case that the server does not supply the filesize, only reading of
+    the complete file in one go is supported.
+
+    Parameters
+    ----------
+    url: str
+        Full URL of the remote resource, including the protocol
+    session: aiohttp.ClientSession or None
+        All calls will be made within this session, to avoid restarting
+        connections where the server allows this
+    block_size: int or None
+        The amount of read-ahead to do, in bytes. Default is 5MB, or the value
+        configured for the FileSystem creating this file
+    size: None or int
+        If given, this is the size of the file in bytes, and we don't attempt
+        to call the server to find the value.
+    kwargs: all other key-values are passed to requests calls.
+    """
+
+    def __init__(
+        self,
+        fs,
+        url,
+        session=None,
+        block_size=None,
+        mode="rb",
+        cache_type="bytes",
+        cache_options=None,
+        size=None,
+        loop=None,
+        asynchronous=False,
+        **kwargs,
+    ):
+        """Store connection details and initialise the buffered-file base.
+
+        Raises NotImplementedError for any mode other than "rb".
+        """
+        if mode != "rb":
+            raise NotImplementedError("File mode not supported")
+        self.asynchronous = asynchronous
+        self.loop = loop
+        self.url = url
+        self.session = session
+        # set before the base initialiser runs
+        # (presumably it reads the size from here - TODO confirm)
+        self.details = {"name": url, "size": size, "type": "file"}
+        super().__init__(
+            fs=fs,
+            path=url,
+            mode=mode,
+            block_size=block_size,
+            cache_type=cache_type,
+            cache_options=cache_options,
+            **kwargs,
+        )
+
+    def read(self, length=-1):
+        """Read bytes from file
+
+        Parameters
+        ----------
+        length: int
+            Read up to this many bytes. If negative, read all content to end of
+            file. If the server has not supplied the filesize, attempting to
+            read only part of the data will raise a ValueError.
+        """
+        if (
+            (length < 0 and self.loc == 0)  # explicit read all
+            # but not when the size is known and fits into a block anyways
+            and not (self.size is not None and self.size <= self.blocksize)
+        ):
+            self._fetch_all()
+        if self.size is None:
+            # size unknown: a whole-file read is the only thing we can serve
+            if length < 0:
+                self._fetch_all()
+        else:
+            # never ask the base class for more than what is left in the file
+            length = min(self.size - self.loc, length)
+        return super().read(length)
+
+    async def async_fetch_all(self):
+        """Read whole file in one shot, without caching
+
+        This is only called when position is still at zero,
+        and read() is called without a byte-count.
+
+        The downloaded bytes replace ``self.cache`` with an ``AllBytes``
+        instance and ``self.size`` is set to their length. Nothing is done if
+        the cache already is an ``AllBytes``.
+        """
+        logger.debug(f"Fetch all for {self}")
+        if not isinstance(self.cache, AllBytes):
+            r = await self.session.get(self.fs.encode_url(self.url), **self.kwargs)
+            async with r:
+                r.raise_for_status()
+                out = await r.read()
+                self.cache = AllBytes(
+                    size=len(out), fetcher=None, blocksize=None, data=out
+                )
+                self.size = len(out)
+
+    # presumably the blocking counterpart of async_fetch_all - sync_wrapper is
+    # defined outside this class, TODO confirm
+    _fetch_all = sync_wrapper(async_fetch_all)
+
+    def _parse_content_range(self, headers):
+        """Parse the Content-Range header
+
+        Returns a ``(start, end, total)`` tuple. All three are None when the
+        header is missing or not in ``bytes`` form; ``start``/``end`` are None
+        for a ``*`` range and ``total`` is None for a ``*`` total.
+        """
+        s = headers.get("Content-Range", "")
+        m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s)
+        if not m:
+            return None, None, None
+
+        if m[1] == "*":
+            start = end = None
+        else:
+            start, end = [int(x) for x in m[1].split("-")]
+        total = None if m[2] == "*" else int(m[2])
+        return start, end, total
+
+    async def async_fetch_range(self, start, end):
+        """Download a block of data
+
+        The expectation is that the server returns only the requested bytes,
+        with HTTP code 206. If this is not the case, we first check the headers,
+        and then stream the output - if the data size is bigger than we
+        requested, an exception is raised.
+
+        ``end`` is exclusive. A 416 response yields ``b""``.
+        """
+        logger.debug(f"Fetch range for {self}: {start}-{end}")
+        kwargs = self.kwargs.copy()
+        headers = kwargs.pop("headers", {}).copy()
+        # the last byte position sent in the Range header is end - 1
+        headers["Range"] = f"bytes={start}-{end - 1}"
+        logger.debug(f"{self.url} : {headers['Range']}")
+        r = await self.session.get(
+            self.fs.encode_url(self.url), headers=headers, **kwargs
+        )
+        async with r:
+            if r.status == 416:
+                # range request outside file
+                return b""
+            r.raise_for_status()
+
+            # If the server has handled the range request, it should reply
+            # with status 206 (partial content). But we'll guess that a suitable
+            # Content-Range header or a Content-Length no more than the
+            # requested range also mean we have got the desired range.
+            response_is_range = (
+                r.status == 206
+                or self._parse_content_range(r.headers)[0] == start
+                or int(r.headers.get("Content-Length", end + 1)) <= end - start
+            )
+
+            if response_is_range:
+                # partial content, as expected
+                out = await r.read()
+            elif start > 0:
+                raise ValueError(
+                    "The HTTP server doesn't appear to support range requests. "
+                    "Only reading this file from the beginning is supported. "
+                    "Open with block_size=0 for a streaming file interface."
+                )
+            else:
+                # Response is not a range, but we want the start of the file,
+                # so we can read the required amount anyway.
+                cl = 0
+                out = []
+                while True:
+                    chunk = await r.content.read(2**20)
+                    # data size unknown, let's read until we have enough
+                    if chunk:
+                        out.append(chunk)
+                        cl += len(chunk)
+                        if cl > end - start:
+                            break
+                    else:
+                        break
+                # trim whatever was read beyond the requested length
+                out = b"".join(out)[: end - start]
+            return out
+
+    # presumably the blocking counterpart of async_fetch_range - TODO confirm
+    _fetch_range = sync_wrapper(async_fetch_range)
+
+
+magic_check = re.compile("([*[])")
+
+
+def has_magic(s):
+    match = magic_check.search(s)
+    return match is not None
+
+
+class HTTPStreamFile(AbstractBufferedFile):
+    """Forward-only, read-only view of an HTTP response body.
+
+    The GET request is issued when the object is constructed; data is then
+    read sequentially from the open response, and seeking to any new
+    position raises ValueError.
+    """
+
+    def __init__(self, fs, url, mode="rb", loop=None, session=None, **kwargs):
+        """Open the response for ``url``.
+
+        ``asynchronous`` is popped from kwargs; the remaining kwargs go both
+        to the base initialiser and to the GET request. Raises ValueError for
+        any mode other than "rb", FileNotFoundError on a 404 response.
+        """
+        self.asynchronous = kwargs.pop("asynchronous", False)
+        self.url = url
+        self.loop = loop
+        self.session = session
+        if mode != "rb":
+            raise ValueError
+        self.details = {"name": url, "size": None}
+        super().__init__(fs=fs, path=url, mode=mode, cache_type="none", **kwargs)
+
+        async def cor():
+            # enter the response context by hand: it must outlive this call
+            # and is closed again in close()
+            r = await self.session.get(self.fs.encode_url(url), **kwargs).__aenter__()
+            self.fs._raise_not_found_for_status(r, url)
+            return r
+
+        self.r = sync(self.loop, cor)
+        # from here on the filesystem's loop is used, whatever was passed in
+        self.loop = fs.loop
+
+    def seek(self, loc, whence=0):
+        """Accept only seeks that do not move the position.
+
+        Raises ValueError for anything else.
+        """
+        if loc == 0 and whence == 1:
+            return
+        if loc == self.loc and whence == 0:
+            return
+        raise ValueError("Cannot seek streaming HTTP file")
+
+    async def _read(self, num=-1):
+        """Read up to ``num`` bytes from the response and advance ``loc``."""
+        out = await self.r.content.read(num)
+        self.loc += len(out)
+        return out
+
+    read = sync_wrapper(_read)
+
+    async def _close(self):
+        """Close the underlying response."""
+        self.r.close()
+
+    def close(self):
+        """Schedule the response close on the loop, then close the base file.
+
+        The scheduled close is not waited for.
+        """
+        asyncio.run_coroutine_threadsafe(self._close(), self.loop)
+        super().close()
+
+
+class AsyncStreamFile(AbstractAsyncStreamedFile):
+    def __init__(
+        self, fs, url, mode="rb", loop=None, session=None, size=None, **kwargs
+    ):
+        self.url = url
+        self.session = session
+        self.r = None
+        if mode != "rb":
+            raise ValueError
+        self.details = {"name": url, "size": None}
+        self.kwargs = kwargs
+        super().__init__(fs=fs, path=url, mode=mode, cache_type="none")
+        self.size = size
+
+    async def read(self, num=-1):
+        if self.r is None:
+            r = await self.session.get(
+                self.fs.encode_url(self.url), **self.kwargs
+            ).__aenter__()
+            self.fs._raise_not_found_for_status(r, self.url)
+            self.r = r
+        out = await self.r.content.read(num)
+        self.loc += len(out)
+        return out
+
+    async def close(self):
+        if self.r is not None:
+            self.r.close()
+            self.r = None
+        await super().close()
+
+
+async def get_range(session, url, start, end, file=None, **kwargs):
+    # explicit get a range when we know it must be safe
+    kwargs = kwargs.copy()
+    headers = kwargs.pop("headers", {}).copy()
+    headers["Range"] = f"bytes={start}-{end - 1}"
+    r = await session.get(url, headers=headers, **kwargs)
+    r.raise_for_status()
+    async with r:
+        out = await r.read()
+    if file:
+        with open(file, "r+b") as f:  # noqa: ASYNC230
+            f.seek(start)
+            f.write(out)
+    else:
+        return out
+
+
+async def _file_info(url, session, size_policy="head", **kwargs):
+    """Call HEAD on the server to get details about the file (size/checksum etc.)
+
+    Default operation is to explicitly allow redirects and use encoding
+    'identity' (no compression) to get the true size of the target.
+    """
+    logger.debug("Retrieve file size for %s", url)
+    kwargs = kwargs.copy()
+    ar = kwargs.pop("allow_redirects", True)
+    head = kwargs.get("headers", {}).copy()
+    head["Accept-Encoding"] = "identity"
+    kwargs["headers"] = head
+
+    info = {}
+    if size_policy == "head":
+        r = await session.head(url, allow_redirects=ar, **kwargs)
+    elif size_policy == "get":
+        r = await session.get(url, allow_redirects=ar, **kwargs)
+    else:
+        raise TypeError(f'size_policy must be "head" or "get", got {size_policy}')
+    async with r:
+        r.raise_for_status()
+
+        if "Content-Length" in r.headers:
+            # Some servers may choose to ignore Accept-Encoding and return
+            # compressed content, in which case the returned size is unreliable.
+            if "Content-Encoding" not in r.headers or r.headers["Content-Encoding"] in [
+                "identity",
+                "",
+            ]:
+                info["size"] = int(r.headers["Content-Length"])
+        elif "Content-Range" in r.headers:
+            info["size"] = int(r.headers["Content-Range"].split("/")[1])
+
+        if "Content-Type" in r.headers:
+            info["mimetype"] = r.headers["Content-Type"].partition(";")[0]
+
+        if r.headers.get("Accept-Ranges") == "none":
+            # Some servers may explicitly discourage partial content requests, but
+            # the lack of "Accept-Ranges" does not always indicate they would fail
+            info["partial"] = False
+
+        info["url"] = str(r.url)
+
+        for checksum_field in ["ETag", "Content-MD5", "Digest", "Last-Modified"]:
+            if r.headers.get(checksum_field):
+                info[checksum_field] = r.headers[checksum_field]
+
+    return info
+
+
+async def _file_size(url, session=None, *args, **kwargs):
+    if session is None:
+        session = await get_client()
+    info = await _file_info(url, session=session, *args, **kwargs)
+    return info.get("size")
+
+
+file_size = sync_wrapper(_file_size)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/http_sync.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/http_sync.py
new file mode 100644
index 0000000000000000000000000000000000000000..a67ea3ea5fee9e6b51f7f3f66773e8cf65735e52
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/http_sync.py
@@ -0,0 +1,937 @@
+"""This file is largely copied from http.py"""
+
+import io
+import logging
+import re
+import urllib.error
+import urllib.parse
+from copy import copy
+from json import dumps, loads
+from urllib.parse import urlparse
+
+try:
+    import yarl
+except (ImportError, ModuleNotFoundError, OSError):
+    yarl = False
+
+from fsspec.callbacks import _DEFAULT_CALLBACK
+from fsspec.registry import register_implementation
+from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
+from fsspec.utils import DEFAULT_BLOCK_SIZE, isfilelike, nullcontext, tokenize
+
+from ..caching import AllBytes
+
+# https://stackoverflow.com/a/15926317/3821154
+# Matches an HTML anchor tag and captures its href target in the group "url".
+ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
+# Matches anything in free text that looks like a bare http:// or https:// URL.
+ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
+# Logger registered under the name "fsspec.http".
+logger = logging.getLogger("fsspec.http")
+
+
+# HTTP error type raised by ResponseProxy.raise_for_status.
+class JsHttpException(urllib.error.HTTPError): ...
+
+
+class StreamIO(io.BytesIO):
+    # Placeholder subclass of BytesIO: it adds no behaviour of its own.
+    # fake class, so you can set attributes on it
+    # will eventually actually stream
+    ...
+
+
+class ResponseProxy:
+    """Looks like a requests response"""
+
+    def __init__(self, req, stream=False):
+        self.request = req
+        self.stream = stream
+        self._data = None
+        self._headers = None
+
+    @property
+    def raw(self):
+        if self._data is None:
+            b = self.request.response.to_bytes()
+            if self.stream:
+                self._data = StreamIO(b)
+            else:
+                self._data = b
+        return self._data
+
+    def close(self):
+        if hasattr(self, "_data"):
+            del self._data
+
+    @property
+    def headers(self):
+        if self._headers is None:
+            self._headers = dict(
+                [
+                    _.split(": ")
+                    for _ in self.request.getAllResponseHeaders().strip().split("\r\n")
+                ]
+            )
+        return self._headers
+
+    @property
+    def status_code(self):
+        return int(self.request.status)
+
+    def raise_for_status(self):
+        if not self.ok:
+            raise JsHttpException(
+                self.url, self.status_code, self.reason, self.headers, None
+            )
+
+    def iter_content(self, chunksize, *_, **__):
+        while True:
+            out = self.raw.read(chunksize)
+            if out:
+                yield out
+            else:
+                break
+
+    @property
+    def reason(self):
+        return self.request.statusText
+
+    @property
+    def ok(self):
+        return self.status_code < 400
+
+    @property
+    def url(self):
+        return self.request.response.responseURL
+
+    @property
+    def text(self):
+        # TODO: encoding from headers
+        return self.content.decode()
+
+    @property
+    def content(self):
+        self.stream = False
+        return self.raw
+
+    def json(self):
+        return loads(self.text)
+
+
+class RequestsSessionShim:
+    def __init__(self):
+        self.headers = {}
+
+    def request(
+        self,
+        method,
+        url,
+        params=None,
+        data=None,
+        headers=None,
+        cookies=None,
+        files=None,
+        auth=None,
+        timeout=None,
+        allow_redirects=None,
+        proxies=None,
+        hooks=None,
+        stream=None,
+        verify=None,
+        cert=None,
+        json=None,
+    ):
+        from js import Blob, XMLHttpRequest
+
+        logger.debug("JS request: %s %s", method, url)
+
+        if cert or verify or proxies or files or cookies or hooks:
+            raise NotImplementedError
+        if data and json:
+            raise ValueError("Use json= or data=, not both")
+        req = XMLHttpRequest.new()
+        extra = auth if auth else ()
+        if params:
+            url = f"{url}?{urllib.parse.urlencode(params)}"
+        req.open(method, url, False, *extra)
+        if timeout:
+            req.timeout = timeout
+        if headers:
+            for k, v in headers.items():
+                req.setRequestHeader(k, v)
+
+        req.setRequestHeader("Accept", "application/octet-stream")
+        req.responseType = "arraybuffer"
+        if json:
+            blob = Blob.new([dumps(data)], {type: "application/json"})
+            req.send(blob)
+        elif data:
+            if isinstance(data, io.IOBase):
+                data = data.read()
+            blob = Blob.new([data], {type: "application/octet-stream"})
+            req.send(blob)
+        else:
+            req.send(None)
+        return ResponseProxy(req, stream=stream)
+
+    def get(self, url, **kwargs):
+        return self.request("GET", url, **kwargs)
+
+    def head(self, url, **kwargs):
+        return self.request("HEAD", url, **kwargs)
+
+    def post(self, url, **kwargs):
+        return self.request("POST}", url, **kwargs)
+
+    def put(self, url, **kwargs):
+        return self.request("PUT", url, **kwargs)
+
+    def patch(self, url, **kwargs):
+        return self.request("PATCH", url, **kwargs)
+
+    def delete(self, url, **kwargs):
+        return self.request("DELETE", url, **kwargs)
+
+
+class HTTPFileSystem(AbstractFileSystem):
+    """
+    Simple File-System for fetching data via HTTP(S)
+
+    This is the BLOCKING version of the normal HTTPFileSystem. It uses
+    requests in normal python and the JS runtime in pyodide.
+
+    ***This implementation is extremely experimental, do not use unless
+    you are testing pyodide/pyscript integration***
+    """
+
+    protocol = ("http", "https", "sync-http", "sync-https")
+    sep = "/"
+
+    def __init__(
+        self,
+        simple_links=True,
+        block_size=None,
+        same_scheme=True,
+        cache_type="readahead",
+        cache_options=None,
+        client_kwargs=None,
+        encoded=False,
+        **storage_options,
+    ):
+        """
+
+        Parameters
+        ----------
+        block_size: int
+            Blocks to read bytes; if 0, will default to raw requests file-like
+            objects instead of HTTPFile instances. ``None`` selects
+            ``DEFAULT_BLOCK_SIZE``.
+        simple_links: bool
+            If True, will consider both HTML <a> tags and anything that looks
+            like a URL; if False, will consider only the former.
+        same_scheme: bool
+            When doing ls/glob, if this is True, only consider paths that have
+            http/https matching the input URLs.
+        size_policy: this argument is deprecated
+        client_kwargs: dict
+            Passed as keyword arguments to ``requests.Session`` when the JS
+            runtime is not importable; not used by the JS session shim.
+        encoded: bool
+            Forwarded to ``yarl.URL(..., encoded=...)`` by ``encode_url`` when
+            yarl is available.
+        storage_options: key-value
+            Any other parameters passed on to requests
+        cache_type, cache_options: defaults used in open
+        """
+        # NOTE(review): ``self`` is passed as an extra positional argument
+        # here; kept as-is because the base-class handling of positional
+        # arguments is not visible from this file.
+        super().__init__(self, **storage_options)
+        self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
+        self.simple_links = simple_links
+        self.same_schema = same_scheme
+        self.cache_type = cache_type
+        self.cache_options = cache_options
+        self.client_kwargs = client_kwargs or {}
+        self.encoded = encoded
+        self.kwargs = storage_options
+
+        try:
+            # Only importable inside a JS-hosted runtime (pyodide/pyscript).
+            import js  # noqa: F401
+
+            logger.debug("Starting JS session")
+            self.session = RequestsSessionShim()
+            self.js = True
+        except Exception as e:
+            import requests
+
+            logger.debug("Starting cpython session because of: %s", e)
+            # NOTE(review): requests.Session() is documented as taking no
+            # constructor arguments, so a non-empty client_kwargs looks like
+            # it would raise TypeError here - confirm the intended handling.
+            self.session = requests.Session(**(client_kwargs or {}))
+            self.js = False
+
+        # Options consumed by the filesystem layer must not be forwarded to
+        # the HTTP calls, so strip them from the per-request defaults.
+        request_options = copy(storage_options)
+        self.use_listings_cache = request_options.pop("use_listings_cache", False)
+        request_options.pop("listings_expiry_time", None)
+        request_options.pop("max_paths", None)
+        request_options.pop("skip_instance_cache", None)
+        self.kwargs = request_options
+
+    @property
+    def fsid(self):
+        return "sync-http"
+
+    def encode_url(self, url):
+        """Wrap ``url`` in ``yarl.URL`` when yarl is available, else return it."""
+        if yarl:
+            return yarl.URL(url, encoded=self.encoded)
+        return url
+
+    @classmethod
+    def _strip_protocol(cls, path: str) -> str:
+        """For HTTP, we always want to keep the full URL"""
+        path = path.replace("sync-http://", "http://").replace(
+            "sync-https://", "https://"
+        )
+        return path
+
+    @classmethod
+    def _parent(cls, path):
+        # override, since _strip_protocol is different for URLs
+        par = super()._parent(path)
+        if len(par) > 7:  # "http://..."
+            return par
+        return ""
+
+    def _ls_real(self, url, detail=True, **kwargs):
+        """Fetch ``url`` and list the links found in the response text.
+
+        Parameters
+        ----------
+        url: str
+            Page to fetch and scan for links.
+        detail: bool
+            If True return a list of info dicts, otherwise a sorted list of
+            URL strings.
+        kwargs: key-value
+            Per-call request options, layered over the instance defaults.
+
+        Raises
+        ------
+        FileNotFoundError
+            If the server answers 404.
+        """
+        # ignoring URL-encoded arguments
+        kw = self.kwargs.copy()
+        kw.update(kwargs)
+        logger.debug(url)
+        # Use the merged options; previously only the instance defaults were
+        # sent and the per-call kwargs were silently dropped.
+        r = self.session.get(self.encode_url(url), **kw)
+        self._raise_not_found_for_status(r, url)
+        text = r.text
+        if self.simple_links:
+            links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
+        else:
+            links = [u[2] for u in ex.findall(text)]
+        out = set()
+        parts = urlparse(url)
+        for link in links:
+            if isinstance(link, tuple):
+                link = link[1]
+            if link.startswith("/") and len(link) > 1:
+                # absolute URL on this server
+                link = parts.scheme + "://" + parts.netloc + link
+            if link.startswith("http"):
+                # NOTE(review): the second branch also runs when same_schema
+                # is True, so cross-scheme links are still accepted - confirm
+                # whether that is intended.
+                if self.same_schema and link.startswith(url.rstrip("/") + "/"):
+                    out.add(link)
+                elif link.replace("https", "http").startswith(
+                    url.replace("https", "http").rstrip("/") + "/"
+                ):
+                    # allowed to cross http <-> https
+                    out.add(link)
+            else:
+                if link not in ["..", "../"]:
+                    # Ignore FTP-like "parent"
+                    out.add("/".join([url.rstrip("/"), link.lstrip("/")]))
+        if not out and url.endswith("/"):
+            # Nothing found: retry once without the trailing slash, keeping
+            # the caller's request options.
+            out = self._ls_real(url.rstrip("/"), detail=False, **kwargs)
+        if detail:
+            return [
+                {
+                    "name": u,
+                    "size": None,
+                    "type": "directory" if u.endswith("/") else "file",
+                }
+                for u in out
+            ]
+        else:
+            return sorted(out)
+
+    def ls(self, url, detail=True, **kwargs):
+        """List links under ``url``, reusing ``self.dircache`` when enabled."""
+        if self.use_listings_cache and url in self.dircache:
+            out = self.dircache[url]
+        else:
+            out = self._ls_real(url, detail=detail, **kwargs)
+            self.dircache[url] = out
+        return out
+
+    def _raise_not_found_for_status(self, response, url):
+        """
+        Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
+        """
+        if response.status_code == 404:
+            raise FileNotFoundError(url)
+        response.raise_for_status()
+
+    def cat_file(self, url, start=None, end=None, **kwargs):
+        """Return the bytes of ``url``, optionally limited to ``[start, end)``.
+
+        When either limit is given a ``Range`` header is added to a copy of
+        the request headers; ``start == end`` returns ``b""`` without any
+        request being made.
+        """
+        kw = self.kwargs.copy()
+        kw.update(kwargs)
+        logger.debug(url)
+
+        if start is not None or end is not None:
+            if start == end:
+                return b""
+            headers = kw.pop("headers", {}).copy()
+
+            headers["Range"] = self._process_limits(url, start, end)
+            kw["headers"] = headers
+        r = self.session.get(self.encode_url(url), **kw)
+        self._raise_not_found_for_status(r, url)
+        return r.content
+
+    def get_file(
+        self, rpath, lpath, chunk_size=5 * 2**20, callback=_DEFAULT_CALLBACK, **kwargs
+    ):
+        """Download ``rpath`` into ``lpath``.
+
+        Parameters
+        ----------
+        rpath: str
+            URL to fetch.
+        lpath: str or file-like
+            Destination path, or an already-open binary file object. A
+            file object supplied by the caller is written to but not closed.
+        chunk_size: int
+            Chunk size passed to ``iter_content``.
+        callback: fsspec callback
+            Receives the size (if the server reports one) and per-chunk
+            progress updates.
+
+        Raises
+        ------
+        FileNotFoundError
+            If the server answers 404.
+        """
+        kw = self.kwargs.copy()
+        kw.update(kwargs)
+        logger.debug(rpath)
+        r = self.session.get(self.encode_url(rpath), **kw)
+        try:
+            size = int(
+                r.headers.get("content-length", None)
+                or r.headers.get("Content-Length", None)
+            )
+        except (ValueError, KeyError, TypeError):
+            size = None
+
+        callback.set_size(size)
+        self._raise_not_found_for_status(r, rpath)
+        if isfilelike(lpath):
+            # Caller-owned file object: leave it open when we are done.
+            context = nullcontext(lpath)
+        else:
+            # We opened it, so make sure it is closed even if a write fails;
+            # previously this handle was never closed.
+            context = open(lpath, "wb")
+        with context as outfile:
+            for chunk in r.iter_content(chunk_size, decode_unicode=False):
+                outfile.write(chunk)
+                callback.relative_update(len(chunk))
+
+    def put_file(
+        self,
+        lpath,
+        rpath,
+        chunk_size=5 * 2**20,
+        callback=_DEFAULT_CALLBACK,
+        method="post",
+        **kwargs,
+    ):
+        """Upload ``lpath`` (a path or file-like object) to ``rpath``.
+
+        ``method`` must be "post" or "put" (case-insensitive); anything else
+        raises ValueError. The body is sent as a generator of chunks.
+        """
+
+        def gen_chunks():
+            # Support passing arbitrary file-like objects
+            # and use them instead of streams.
+            if isinstance(lpath, io.IOBase):
+                context = nullcontext(lpath)
+                use_seek = False  # might not support seeking
+            else:
+                context = open(lpath, "rb")
+                use_seek = True
+
+            with context as f:
+                if use_seek:
+                    # seek(0, 2) returns the end offset, i.e. the file size
+                    callback.set_size(f.seek(0, 2))
+                    f.seek(0)
+                else:
+                    callback.set_size(getattr(f, "size", None))
+
+                chunk = f.read(chunk_size)
+                while chunk:
+                    yield chunk
+                    callback.relative_update(len(chunk))
+                    chunk = f.read(chunk_size)
+
+        kw = self.kwargs.copy()
+        kw.update(kwargs)
+
+        method = method.lower()
+        if method not in ("post", "put"):
+            raise ValueError(
+                f"method has to be either 'post' or 'put', not: {method!r}"
+            )
+
+        meth = getattr(self.session, method)
+        resp = meth(rpath, data=gen_chunks(), **kw)
+        self._raise_not_found_for_status(resp, rpath)
+
+    def _process_limits(self, url, start, end):
+        """Helper for "Range"-based _cat_file"""
+        size = None
+        suff = False
+        if start is not None and start < 0:
+            # if start is negative and end None, end is the "suffix length"
+            if end is None:
+                end = -start
+                start = ""
+                suff = True
+            else:
+                size = size or self.info(url)["size"]
+                start = size + start
+        elif start is None:
+            start = 0
+        if not suff:
+            if end is not None and end < 0:
+                if start is not None:
+                    size = size or self.info(url)["size"]
+                    end = size + end
+            elif end is None:
+                end = ""
+            if isinstance(end, int):
+                end -= 1  # bytes range is inclusive
+        return f"bytes={start}-{end}"
+
+    def exists(self, path, strict=False, **kwargs):
+        """Return True if a GET of ``path`` yields a status code below 400.
+
+        With ``strict=True`` errors other than a missing file are re-raised
+        instead of being reported as "does not exist".
+        """
+        kw = self.kwargs.copy()
+        kw.update(kwargs)
+        try:
+            logger.debug(path)
+            r = self.session.get(self.encode_url(path), **kw)
+            if strict:
+                self._raise_not_found_for_status(r, path)
+            return r.status_code < 400
+        except FileNotFoundError:
+            return False
+        except Exception:
+            if strict:
+                raise
+            return False
+
+    def isfile(self, path, **kwargs):
+        return self.exists(path, **kwargs)
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=None,  # XXX: This differs from the base class.
+        cache_type=None,
+        cache_options=None,
+        size=None,
+        **kwargs,
+    ):
+        """Make a file-like object
+
+        Parameters
+        ----------
+        path: str
+            Full URL with protocol
+        mode: string
+            must be "rb"
+        block_size: int or None
+            Bytes to download in one request; use instance value if None. If
+            zero, will return a streaming Requests file-like instance.
+        kwargs: key-value
+            Any other parameters, passed to requests calls
+        """
+        if mode != "rb":
+            raise NotImplementedError
+        block_size = block_size if block_size is not None else self.block_size
+        kw = self.kwargs.copy()
+        kw.update(kwargs)
+        size = size or self.info(path, **kwargs)["size"]
+        # Random access needs both a block size and a known file size;
+        # otherwise fall back to a forward-only streaming file.
+        if block_size and size:
+            return HTTPFile(
+                self,
+                path,
+                session=self.session,
+                block_size=block_size,
+                mode=mode,
+                size=size,
+                cache_type=cache_type or self.cache_type,
+                cache_options=cache_options or self.cache_options,
+                **kw,
+            )
+        else:
+            return HTTPStreamFile(
+                self,
+                path,
+                mode=mode,
+                session=self.session,
+                **kw,
+            )
+
+    def ukey(self, url):
+        """Unique identifier; assume HTTP files are static, unchanging"""
+        return tokenize(url, self.kwargs, self.protocol)
+
+    def info(self, url, **kwargs):
+        """Get info of URL
+
+        Tries to access location via HEAD, and then GET methods, but does
+        not fetch the data.
+
+        It is possible that the server does not supply any size information, in
+        which case size will be given as None (and certain operations on the
+        corresponding file will not work).
+        """
+        info = {}
+        for policy in ["head", "get"]:
+            try:
+                info.update(
+                    _file_info(
+                        self.encode_url(url),
+                        size_policy=policy,
+                        session=self.session,
+                        **self.kwargs,
+                        **kwargs,
+                    )
+                )
+                if info.get("size") is not None:
+                    break
+            except Exception as exc:
+                if policy == "get":
+                    # If get failed, then raise a FileNotFoundError
+                    raise FileNotFoundError(url) from exc
+                logger.debug(str(exc))
+
+        return {"name": url, "size": None, **info, "type": "file"}
+
+    def glob(self, path, maxdepth=None, **kwargs):
+        """
+        Find files by glob-matching.
+
+        This implementation is identical to the one in AbstractFileSystem,
+        but "?" is not considered as a character for globbing, because it is
+        so common in URLs, often identifying the "query" part.
+        """
+        import re
+
+        ends = path.endswith("/")
+        path = self._strip_protocol(path)
+        indstar = path.find("*") if path.find("*") >= 0 else len(path)
+        indbrace = path.find("[") if path.find("[") >= 0 else len(path)
+
+        ind = min(indstar, indbrace)
+
+        detail = kwargs.pop("detail", False)
+
+        if not has_magic(path):
+            root = path
+            depth = 1
+            if ends:
+                path += "/*"
+            elif self.exists(path):
+                if not detail:
+                    return [path]
+                else:
+                    return {path: self.info(path)}
+            else:
+                if not detail:
+                    return []  # glob of non-existent returns empty
+                else:
+                    return {}
+        elif "/" in path[:ind]:
+            ind2 = path[:ind].rindex("/")
+            root = path[: ind2 + 1]
+            depth = None if "**" in path else path[ind2 + 1 :].count("/") + 1
+        else:
+            root = ""
+            depth = None if "**" in path else path[ind + 1 :].count("/") + 1
+
+        allpaths = self.find(
+            root, maxdepth=maxdepth or depth, withdirs=True, detail=True, **kwargs
+        )
+        # Escape characters special to python regex, leaving our supported
+        # special characters in place.
+        # See https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html
+        # for shell globbing details.
+        pattern = (
+            "^"
+            + (
+                path.replace("\\", r"\\")
+                .replace(".", r"\.")
+                .replace("+", r"\+")
+                .replace("//", "/")
+                .replace("(", r"\(")
+                .replace(")", r"\)")
+                .replace("|", r"\|")
+                .replace("^", r"\^")
+                .replace("$", r"\$")
+                .replace("{", r"\{")
+                .replace("}", r"\}")
+                .rstrip("/")
+            )
+            + "$"
+        )
+        pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern)
+        pattern = re.sub("[*]", "[^/]*", pattern)
+        pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*"))
+        out = {
+            p: allpaths[p]
+            for p in sorted(allpaths)
+            if pattern.match(p.replace("//", "/").rstrip("/"))
+        }
+        if detail:
+            return out
+        else:
+            return list(out)
+
+    def isdir(self, path):
+        # override, since all URLs are (also) files
+        try:
+            return bool(self.ls(path))
+        except (FileNotFoundError, ValueError):
+            return False
+
+
+class HTTPFile(AbstractBufferedFile):
+    """
+    A file-like object pointing to a remote HTTP(S) resource
+
+    Supports only reading, with read-ahead of a predetermined block-size.
+
+    In the case that the server does not supply the filesize, only reading of
+    the complete file in one go is supported.
+
+    Parameters
+    ----------
+    url: str
+        Full URL of the remote resource, including the protocol
+    session: requests.Session or None
+        All calls will be made within this session, to avoid restarting
+        connections where the server allows this
+    block_size: int or None
+        The amount of read-ahead to do, in bytes. Default is 5MB, or the value
+        configured for the FileSystem creating this file
+    size: None or int
+        If given, this is the size of the file in bytes, and we don't attempt
+        to call the server to find the value.
+    kwargs: all other key-values are passed to requests calls.
+    """
+
+    def __init__(
+        self,
+        fs,
+        url,
+        session=None,
+        block_size=None,
+        mode="rb",
+        cache_type="bytes",
+        cache_options=None,
+        size=None,
+        **kwargs,
+    ):
+        if mode != "rb":
+            raise NotImplementedError("File mode not supported")
+        self.url = url
+        self.session = session
+        # Set before calling the base constructor, which receives no size
+        # argument of its own.
+        self.details = {"name": url, "size": size, "type": "file"}
+        super().__init__(
+            fs=fs,
+            path=url,
+            mode=mode,
+            block_size=block_size,
+            cache_type=cache_type,
+            cache_options=cache_options,
+            **kwargs,
+        )
+
+    def read(self, length=-1):
+        """Read bytes from file
+
+        Parameters
+        ----------
+        length: int
+            Read up to this many bytes. If negative, read all content to end of
+            file. If the server has not supplied the filesize, attempting to
+            read only part of the data will raise a ValueError.
+        """
+        if (
+            (length < 0 and self.loc == 0)  # explicit read all
+            # but not when the size is known and fits into a block anyways
+            and not (self.size is not None and self.size <= self.blocksize)
+        ):
+            self._fetch_all()
+        if self.size is None:
+            # Unknown size: a read-to-end can only be served by fetching all.
+            if length < 0:
+                self._fetch_all()
+        else:
+            # Clamp the request to the bytes remaining after self.loc.
+            length = min(self.size - self.loc, length)
+        return super().read(length)
+
+    def _fetch_all(self):
+        """Read whole file in one shot, without caching
+
+        Called from read() for a read-to-end request. Replaces ``self.cache``
+        with an ``AllBytes`` holding the full content and sets ``self.size``
+        to its length; does nothing if the cache is already ``AllBytes``.
+        """
+        logger.debug(f"Fetch all for {self}")
+        if not isinstance(self.cache, AllBytes):
+            r = self.session.get(self.fs.encode_url(self.url), **self.kwargs)
+            r.raise_for_status()
+            out = r.content
+            self.cache = AllBytes(size=len(out), fetcher=None, blocksize=None, data=out)
+            self.size = len(out)
+
+    def _parse_content_range(self, headers):
+        """Parse the Content-Range header
+
+        Returns ``(start, end, total)``; each element is None when the header
+        is missing, does not match, or carries ``*`` in that position.
+        """
+        s = headers.get("Content-Range", "")
+        m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s)
+        if not m:
+            return None, None, None
+
+        if m[1] == "*":
+            start = end = None
+        else:
+            start, end = [int(x) for x in m[1].split("-")]
+        total = None if m[2] == "*" else int(m[2])
+        return start, end, total
+
+    def _fetch_range(self, start, end):
+        """Download a block of data
+
+        The expectation is that the server returns only the requested bytes,
+        with HTTP code 206. If this is not the case, we first check the
+        headers. If the response does not look like a range and ``start`` is
+        greater than zero, a ValueError is raised; if ``start`` is zero the
+        body is streamed and truncated to the requested length. A 416
+        response yields ``b""``.
+        """
+        logger.debug(f"Fetch range for {self}: {start}-{end}")
+        kwargs = self.kwargs.copy()
+        headers = kwargs.pop("headers", {}).copy()
+        # HTTP byte ranges are inclusive, hence ``end - 1``.
+        headers["Range"] = f"bytes={start}-{end - 1}"
+        logger.debug("%s : %s", self.url, headers["Range"])
+        r = self.session.get(self.fs.encode_url(self.url), headers=headers, **kwargs)
+        if r.status_code == 416:
+            # range request outside file
+            return b""
+        r.raise_for_status()
+
+        # If the server has handled the range request, it should reply
+        # with status 206 (partial content). But we'll guess that a suitable
+        # Content-Range header or a Content-Length no more than the
+        # requested range also mean we have got the desired range.
+        # The ``end + 1`` default makes the length test fail when the server
+        # sends no Content-Length at all.
+        cl = r.headers.get("Content-Length", r.headers.get("content-length", end + 1))
+        response_is_range = (
+            r.status_code == 206
+            or self._parse_content_range(r.headers)[0] == start
+            or int(cl) <= end - start
+        )
+
+        if response_is_range:
+            # partial content, as expected
+            out = r.content
+        elif start > 0:
+            raise ValueError(
+                "The HTTP server doesn't appear to support range requests. "
+                "Only reading this file from the beginning is supported. "
+                "Open with block_size=0 for a streaming file interface."
+            )
+        else:
+            # Response is not a range, but we want the start of the file,
+            # so we can read the required amount anyway.
+            cl = 0
+            out = []
+            for chunk in r.iter_content(2**20, False):
+                out.append(chunk)
+                cl += len(chunk)
+            # The whole body is consumed, then cut down to the requested span.
+            out = b"".join(out)[: end - start]
+        return out
+
+
+# Matches the glob characters recognised for URLs: "*" and "[".
+magic_check = re.compile("([*[])")
+
+
+def has_magic(s):
+    """Return True if ``s`` contains a "*" or "[" glob character."""
+    return magic_check.search(s) is not None
+
+
+class HTTPStreamFile(AbstractBufferedFile):
+    """Forward-only, non-seekable reader over a streaming HTTP GET response."""
+
+    def __init__(self, fs, url, mode="rb", session=None, **kwargs):
+        """Open ``url`` with a streaming GET and prepare a chunk iterator.
+
+        Raises ValueError for any mode other than "rb", and whatever
+        ``fs._raise_not_found_for_status`` raises for the response.
+        """
+        self.url = url
+        self.session = session
+        if mode != "rb":
+            raise ValueError
+        self.details = {"name": url, "size": None}
+        super().__init__(fs=fs, path=url, mode=mode, cache_type="readahead", **kwargs)
+
+        response = self.session.get(self.fs.encode_url(url), stream=True, **kwargs)
+        self.fs._raise_not_found_for_status(response, url)
+        self.it = response.iter_content(1024, False)
+        # Bytes already pulled off the stream but not yet handed to a caller.
+        self.leftover = b""
+        self.r = response
+
+    def seek(self, *args, **kwargs):
+        """Always raise: a streamed response cannot be repositioned."""
+        raise ValueError("Cannot seek streaming HTTP file")
+
+    def read(self, num=-1):
+        """Return up to ``num`` bytes, or everything remaining if negative."""
+        pieces = [self.leftover]
+        have = len(self.leftover)
+        read_all = num < 0
+        while read_all or have < num:
+            piece = next(self.it, None)
+            # Stop on exhaustion as well as on an empty chunk.
+            if not piece:
+                break
+            pieces.append(piece)
+            have += len(piece)
+        data = b"".join(pieces)
+        if read_all:
+            self.leftover = b""
+        else:
+            # Keep any surplus for the next call.
+            data, self.leftover = data[:num], data[num:]
+        self.loc += len(data)
+        return data
+
+    def close(self):
+        """Close the underlying response and mark the file closed."""
+        self.r.close()
+        self.closed = True
+
+
+def get_range(session, url, start, end, **kwargs):
+    """Fetch bytes ``[start, end)`` of ``url`` via a ``Range`` request.
+
+    The caller's ``headers`` dict (if any) is copied, not modified. Raises
+    whatever ``raise_for_status`` raises for an error response.
+    """
+    # explicit get a range when we know it must be safe
+    options = dict(kwargs)
+    range_headers = dict(options.pop("headers", {}))
+    # HTTP byte ranges are inclusive, hence ``end - 1``.
+    range_headers["Range"] = f"bytes={start}-{end - 1}"
+    response = session.get(url, headers=range_headers, **options)
+    response.raise_for_status()
+    return response.content
+
+
+def _file_info(url, session, size_policy="head", **kwargs):
+    """Query the server for details about the file (size/checksum etc.)
+
+    Parameters
+    ----------
+    url: str or URL object
+        Location to query.
+    session: session-like object
+        Must provide ``head`` and ``get`` methods.
+    size_policy: "head" or "get"
+        Which request method to use.
+    kwargs: key-value
+        Passed to the request. ``allow_redirects`` defaults to True and
+        ``headers`` is copied before use.
+
+    Returns
+    -------
+    dict with ``"size"`` when the response reports one, plus any of the
+    ``ETag``, ``Content-MD5`` and ``Digest`` headers that are present.
+
+    Raises
+    ------
+    TypeError
+        If ``size_policy`` is neither "head" nor "get".
+    """
+    logger.debug("Retrieve file size for %s", url)
+    kwargs = kwargs.copy()
+    ar = kwargs.pop("allow_redirects", True)
+    head = kwargs.get("headers", {}).copy()
+    # TODO: not allowed in JS
+    # head["Accept-Encoding"] = "identity"
+    kwargs["headers"] = head
+
+    info = {}
+    if size_policy == "head":
+        r = session.head(url, allow_redirects=ar, **kwargs)
+    elif size_policy == "get":
+        r = session.get(url, allow_redirects=ar, **kwargs)
+    else:
+        raise TypeError(f'size_policy must be "head" or "get", got {size_policy}')
+    r.raise_for_status()
+
+    # TODO:
+    #  recognise lack of 'Accept-Ranges',
+    #                 or 'Accept-Ranges': 'none' (not 'bytes')
+    #  to mean streaming only, no random access => return None
+    # The first header present wins, in this order. Both spellings are tried
+    # because the header mapping may be case-sensitive.
+    for name in ("Content-Length", "Content-Range", "content-length", "content-range"):
+        if name not in r.headers:
+            continue
+        raw = r.headers[name]
+        is_range = name.lower() == "content-range"
+        if is_range:
+            raw = raw.split("/")[1]
+        # A Content-Range total of "*" means the complete length is unknown;
+        # report no size instead of failing on int("*").
+        if not (is_range and raw.strip() == "*"):
+            info["size"] = int(raw)
+        break
+
+    for checksum_field in ["ETag", "Content-MD5", "Digest"]:
+        if r.headers.get(checksum_field):
+            info[checksum_field] = r.headers[checksum_field]
+
+    return info
+
+
+# importing this is enough to register it
+def register():
+    """Register ``HTTPFileSystem`` for the plain and ``sync-`` HTTP(S) protocols.
+
+    Each registration uses ``clobber=True``.
+    """
+    for proto in ("http", "https", "sync-http", "sync-https"):
+        register_implementation(proto, HTTPFileSystem, clobber=True)
+
+
+register()
+
+
+def unregister():
+    """Point "http" and "https" back at ``fsspec.implementations.http``.
+
+    The "sync-http" and "sync-https" registrations are left untouched.
+    """
+    # Imported here so the name refers to the class from the other module
+    # only inside this function.
+    from fsspec.implementations.http import HTTPFileSystem
+
+    for proto in ("http", "https"):
+        register_implementation(proto, HTTPFileSystem, clobber=True)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/jupyter.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/jupyter.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5571ed56582170051f3b7cd903093eed4c65244
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/jupyter.py
@@ -0,0 +1,129 @@
+import base64
+import io
+import re
+
+import requests
+
+import fsspec
+
+
+class JupyterFileSystem(fsspec.AbstractFileSystem):
+    """View of the files as seen by a Jupyter server (notebook or lab)"""
+
+    protocol = ("jupyter", "jlab")
+
+    def __init__(self, url, tok=None, **kwargs):
+        """
+
+        Parameters
+        ----------
+        url : str
+            Base URL of the server, like "http://127.0.0.1:8888". May include
+            token in the string, which is given by the process when starting up
+        tok : str
+            If the token is obtained separately, can be given here
+        kwargs
+            Passed to the base filesystem constructor.
+
+        Raises
+        ------
+        ValueError
+            If ``url`` has a query string, ``tok`` is not given, and no
+            ``token=...`` value can be found in the URL.
+        """
+        if "?" in url:
+            if tok is None:
+                try:
+                    tok = re.findall("token=([a-z0-9]+)", url)[0]
+                except IndexError as e:
+                    raise ValueError("Could not determine token") from e
+            # Drop the query string; the token travels in a header instead.
+            url = url.split("?", 1)[0]
+        self.url = url.rstrip("/") + "/api/contents"
+        self.session = requests.Session()
+        if tok:
+            self.session.headers["Authorization"] = f"token {tok}"
+
+        super().__init__(**kwargs)
+
+    def ls(self, path, detail=True, **kwargs):
+        """List ``path`` on the server.
+
+        Returns a list of info dicts when ``detail`` is True, otherwise a
+        list of names. Raises FileNotFoundError on a 404 response.
+        """
+        path = self._strip_protocol(path)
+        r = self.session.get(f"{self.url}/{path}")
+        if r.status_code == 404:
+            raise FileNotFoundError(path)
+        r.raise_for_status()
+        out = r.json()
+
+        if out["type"] == "directory":
+            out = out["content"]
+        else:
+            out = [out]
+        for o in out:
+            o["name"] = o.pop("path")
+            # Tolerate entries that carry no "content" key at all.
+            o.pop("content", None)
+            if o["type"] == "notebook":
+                o["type"] = "file"
+        if detail:
+            return out
+        return [o["name"] for o in out]
+
+    def cat_file(self, path, start=None, end=None, **kwargs):
+        """Return the bytes of ``path``, sliced to ``[start:end]``.
+
+        The whole file is fetched and the slice is applied locally. Raises
+        FileNotFoundError on a 404 response.
+        """
+        path = self._strip_protocol(path)
+        r = self.session.get(f"{self.url}/{path}")
+        if r.status_code == 404:
+            raise FileNotFoundError(path)
+        r.raise_for_status()
+        out = r.json()
+        # NOTE(review): every format other than "text" is treated as base64;
+        # confirm how other formats the server may return should be handled.
+        if out["format"] == "text":
+            # data should be binary
+            b = out["content"].encode()
+        else:
+            b = base64.b64decode(out["content"])
+        return b[start:end]
+
+    def pipe_file(self, path, value, **_):
+        """Write the bytes ``value`` to ``path`` as a base64-encoded file.
+
+        The server response is not inspected.
+        """
+        path = self._strip_protocol(path)
+        json = {
+            "name": path.rsplit("/", 1)[-1],
+            "path": path,
+            "size": len(value),
+            "content": base64.b64encode(value).decode(),
+            "format": "base64",
+            "type": "file",
+        }
+        self.session.put(f"{self.url}/{path}", json=json)
+
+    def mkdir(self, path, create_parents=True, **kwargs):
+        """Create directory ``path``, first creating parents if requested.
+
+        The server response is not inspected.
+        """
+        path = self._strip_protocol(path)
+        if create_parents and "/" in path:
+            self.mkdir(path.rsplit("/", 1)[0], True)
+        json = {
+            "name": path.rsplit("/", 1)[-1],
+            "path": path,
+            "size": None,
+            "content": None,
+            "type": "directory",
+        }
+        self.session.put(f"{self.url}/{path}", json=json)
+
+    def mv(self, path1, path2, recursive=False, maxdepth=None, **kwargs):
+        """Rename ``path1`` to ``path2`` on the server.
+
+        ``recursive`` and ``maxdepth`` are accepted but not used. Nothing is
+        sent when both paths are the same. The server response is not
+        inspected.
+        """
+        # Normalise both paths like every other method here does, so that
+        # protocol-prefixed input does not end up in the request URL or body.
+        path1 = self._strip_protocol(path1)
+        path2 = self._strip_protocol(path2)
+        if path1 == path2:
+            return
+        self.session.patch(f"{self.url}/{path1}", json={"path": path2})
+
+    def _rm(self, path):
+        """Delete ``path`` on the server; the response is not inspected."""
+        path = self._strip_protocol(path)
+        self.session.delete(f"{self.url}/{path}")
+
+    def _open(self, path, mode="rb", **kwargs):
+        """Open ``path``.
+
+        "rb" returns an in-memory ``io.BytesIO`` holding the whole file; any
+        other mode returns a ``SimpleFileWriter`` opened with mode "wb".
+        """
+        path = self._strip_protocol(path)
+        if mode == "rb":
+            data = self.cat_file(path)
+            return io.BytesIO(data)
+        else:
+            return SimpleFileWriter(self, path, mode="wb")
+
+
+class SimpleFileWriter(fsspec.spec.AbstractBufferedFile):
+    """Write-only file that sends its whole buffer in one ``pipe_file`` call."""
+
+    def _upload_chunk(self, final=False):
+        """Never uploads a chunk until file is done
+
+        Not suitable for large files
+        """
+        if final is False:
+            # Not the last chunk yet: keep buffering and send nothing.
+            return False
+        # Rewind so the read below returns everything buffered so far.
+        self.buffer.seek(0)
+        self.fs.pipe_file(self.path, self.buffer.read())
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/libarchive.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/libarchive.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f8e750002df72865d611b48022e6634f9572614
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/libarchive.py
@@ -0,0 +1,213 @@
+from contextlib import contextmanager
+from ctypes import (
+    CFUNCTYPE,
+    POINTER,
+    c_int,
+    c_longlong,
+    c_void_p,
+    cast,
+    create_string_buffer,
+)
+
+import libarchive
+import libarchive.ffi as ffi
+
+from fsspec import open_files
+from fsspec.archive import AbstractArchiveFileSystem
+from fsspec.implementations.memory import MemoryFile
+from fsspec.utils import DEFAULT_BLOCK_SIZE
+
+# Libarchive requires seekable files or memory only for certain archive
+# types. However, since we read the directory first to cache the contents
+# and also allow random access to any file, the file-like object needs
+# to be seekable no matter what.
+
+# Seek call-backs (not provided in the libarchive python wrapper)
+# ctypes prototype for the Python seek function registered in custom_reader.
+SEEK_CALLBACK = CFUNCTYPE(c_longlong, c_int, c_void_p, c_longlong, c_int)
+# Binding built with the wrapper's own ``ffi.ffi`` helper; it takes an archive
+# pointer and a SEEK_CALLBACK, and its c_int result goes through ffi.check_int.
+read_set_seek_callback = ffi.ffi(
+    "read_set_seek_callback", [ffi.c_archive_p, SEEK_CALLBACK], c_int, ffi.check_int
+)
+# True when the wrapper exposes NO_OPEN_CB; custom_reader uses this flag to
+# choose which open/close callback objects to pass to ffi.read_open.
+new_api = hasattr(ffi, "NO_OPEN_CB")
+
+
+@contextmanager
+def custom_reader(file, format_name="all", filter_name="all", block_size=ffi.page_size):
+    """Read an archive from a seekable file-like object.
+
+    The `file` object must support the standard `readinto` and 'seek' methods
+    (``tell`` is also called, from the seek callback).
+
+    Parameters
+    ----------
+    file: file-like
+        Source of the archive bytes.
+    format_name, filter_name: str
+        Passed through to ``libarchive.read.new_archive_read``.
+    block_size: int
+        Size in bytes of the single buffer that every read callback fills.
+
+    Yields
+    ------
+    A ``libarchive.read.ArchiveRead`` wrapping the opened archive pointer.
+    """
+    # One buffer is allocated up front and reused for every read callback.
+    buf = create_string_buffer(block_size)
+    buf_p = cast(buf, c_void_p)
+
+    def read_func(archive_p, context, ptrptr):
+        # readinto the buffer, returns number of bytes read
+        length = file.readinto(buf)
+        # write the address of the buffer into the pointer
+        ptrptr = cast(ptrptr, POINTER(c_void_p))
+        ptrptr[0] = buf_p
+        # tell libarchive how much data was written into the buffer
+        return length
+
+    def seek_func(archive_p, context, offset, whence):
+        file.seek(offset, whence)
+        # tell libarchive the current position
+        return file.tell()
+
+    # Wrap the Python functions as ctypes callbacks; the local names keep the
+    # wrapper objects referenced for the duration of the ``with`` block below.
+    read_cb = ffi.READ_CALLBACK(read_func)
+    seek_cb = SEEK_CALLBACK(seek_func)
+
+    # The open/close callback objects differ between wrapper versions.
+    if new_api:
+        open_cb = ffi.NO_OPEN_CB
+        close_cb = ffi.NO_CLOSE_CB
+    else:
+        open_cb = libarchive.read.OPEN_CALLBACK(ffi.VOID_CB)
+        close_cb = libarchive.read.CLOSE_CALLBACK(ffi.VOID_CB)
+
+    with libarchive.read.new_archive_read(format_name, filter_name) as archive_p:
+        # the seek callback is registered before the archive is opened
+        read_set_seek_callback(archive_p, seek_cb)
+        ffi.read_open(archive_p, None, open_cb, read_cb, close_cb)
+        yield libarchive.read.ArchiveRead(archive_p)
+
+
+class LibArchiveFileSystem(AbstractArchiveFileSystem):
+    """Compressed archives as a file-system (read-only)
+
+    Supports the following formats:
+    tar, pax , cpio, ISO9660, zip, mtree, shar, ar, raw, xar, lha/lzh, rar
+    Microsoft CAB, 7-Zip, WARC
+
+    See the libarchive documentation for further restrictions.
+    https://www.libarchive.org/
+
+    Keeps file object open while instance lives. It only works in seekable
+    file-like objects. In case the filesystem does not support this kind of
+    file object, it is recommended to cache locally.
+
+    This class is pickleable, but not necessarily thread-safe (depends on the
+    platform). See libarchive documentation for details.
+    """
+
+    root_marker = ""
+    protocol = "libarchive"
+    cachable = False
+
+    def __init__(
+        self,
+        fo="",
+        mode="r",
+        target_protocol=None,
+        target_options=None,
+        block_size=DEFAULT_BLOCK_SIZE,
+        **kwargs,
+    ):
+        """
+        Parameters
+        ----------
+        fo: str or file-like
+            Contains ZIP, and must exist. If a str, will fetch file using
+            :meth:`~fsspec.open_files`, which must return one file exactly.
+        mode: str
+            Currently, only 'r' accepted
+        target_protocol: str (optional)
+            If ``fo`` is a string, this value can be used to override the
+            FS protocol inferred from a URL
+        target_options: dict (optional)
+            Kwargs passed when instantiating the target FS, if ``fo`` is
+            a string.
+        """
+        super().__init__(self, **kwargs)
+        if mode != "r":
+            raise ValueError("Only read from archive files accepted")
+        if isinstance(fo, str):
+            files = open_files(fo, protocol=target_protocol, **(target_options or {}))
+            if len(files) != 1:
+                raise ValueError(
+                    f'Path "{fo}" did not resolve to exactly one file: "{files}"'
+                )
+            fo = files[0]
+        self.of = fo
+        self.fo = fo.__enter__()  # the whole instance is a context
+        self.block_size = block_size
+        self.dir_cache = None
+
+    @contextmanager
+    def _open_archive(self):
+        self.fo.seek(0)
+        with custom_reader(self.fo, block_size=self.block_size) as arc:
+            yield arc
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        # file paths are always relative to the archive root
+        return super()._strip_protocol(path).lstrip("/")
+
+    def _get_dirs(self):
+        fields = {
+            "name": "pathname",
+            "size": "size",
+            "created": "ctime",
+            "mode": "mode",
+            "uid": "uid",
+            "gid": "gid",
+            "mtime": "mtime",
+        }
+
+        if self.dir_cache is not None:
+            return
+
+        self.dir_cache = {}
+        list_names = []
+        with self._open_archive() as arc:
+            for entry in arc:
+                if not entry.isdir and not entry.isfile:
+                    # Skip symbolic links, fifo entries, etc.
+                    continue
+                self.dir_cache.update(
+                    {
+                        dirname: {"name": dirname, "size": 0, "type": "directory"}
+                        for dirname in self._all_dirnames(set(entry.name))
+                    }
+                )
+                f = {key: getattr(entry, fields[key]) for key in fields}
+                f["type"] = "directory" if entry.isdir else "file"
+                list_names.append(entry.name)
+
+                self.dir_cache[f["name"]] = f
+        # libarchive does not seem to return an entry for the directories (at least
+        # not in all formats), so get the directories names from the files names
+        self.dir_cache.update(
+            {
+                dirname: {"name": dirname, "size": 0, "type": "directory"}
+                for dirname in self._all_dirnames(list_names)
+            }
+        )
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=True,
+        cache_options=None,
+        **kwargs,
+    ):
+        path = self._strip_protocol(path)
+        if mode != "rb":
+            raise NotImplementedError
+
+        data = b""
+        with self._open_archive() as arc:
+            for entry in arc:
+                if entry.pathname != path:
+                    continue
+
+                if entry.size == 0:
+                    # empty file, so there are no blocks
+                    break
+
+                for block in entry.get_blocks(entry.size):
+                    data = block
+                    break
+                else:
+                    raise ValueError
+        return MemoryFile(fs=self, path=path, data=data)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/local.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/local.py
new file mode 100644
index 0000000000000000000000000000000000000000..64dc8bb21956e9c5024b47d46d9e9550931891d8
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/local.py
@@ -0,0 +1,514 @@
+import datetime
+import io
+import logging
+import os
+import os.path as osp
+import shutil
+import stat
+import tempfile
+from functools import lru_cache
+
+from fsspec import AbstractFileSystem
+from fsspec.compression import compr
+from fsspec.core import get_compression
+from fsspec.utils import isfilelike, stringify_path
+
+logger = logging.getLogger("fsspec.local")
+
+
+class LocalFileSystem(AbstractFileSystem):
+    """Interface to files on local storage
+
+    Parameters
+    ----------
+    auto_mkdir: bool
+        Whether, when opening a file, the directory containing it should
+        be created (if it doesn't already exist). This is assumed by pyarrow
+        code.
+    """
+
+    root_marker = "/"
+    protocol = "file", "local"
+    local_file = True
+
+    def __init__(self, auto_mkdir=False, **kwargs):
+        super().__init__(**kwargs)
+        self.auto_mkdir = auto_mkdir
+
+    @property
+    def fsid(self):
+        return "local"
+
+    def mkdir(self, path, create_parents=True, **kwargs):
+        path = self._strip_protocol(path)
+        if self.exists(path):
+            raise FileExistsError(path)
+        if create_parents:
+            self.makedirs(path, exist_ok=True)
+        else:
+            os.mkdir(path, **kwargs)
+
+    def makedirs(self, path, exist_ok=False):
+        path = self._strip_protocol(path)
+        os.makedirs(path, exist_ok=exist_ok)
+
+    def rmdir(self, path):
+        path = self._strip_protocol(path)
+        os.rmdir(path)
+
+    def ls(self, path, detail=False, **kwargs):
+        path = self._strip_protocol(path)
+        path_info = self.info(path)
+        infos = []
+        if path_info["type"] == "directory":
+            with os.scandir(path) as it:
+                for f in it:
+                    try:
+                        # Only get the info if requested since it is a bit expensive (the stat call inside)
+                        # The strip_protocol is also used in info() and calls make_path_posix to always return posix paths
+                        info = self.info(f) if detail else self._strip_protocol(f.path)
+                        infos.append(info)
+                    except FileNotFoundError:
+                        pass
+        else:
+            infos = [path_info] if detail else [path_info["name"]]
+
+        return infos
+
+    def info(self, path, **kwargs):
+        if isinstance(path, os.DirEntry):
+            # scandir DirEntry
+            out = path.stat(follow_symlinks=False)
+            link = path.is_symlink()
+            if path.is_dir(follow_symlinks=False):
+                t = "directory"
+            elif path.is_file(follow_symlinks=False):
+                t = "file"
+            else:
+                t = "other"
+
+            size = out.st_size
+            if link:
+                try:
+                    out2 = path.stat(follow_symlinks=True)
+                    size = out2.st_size
+                except OSError:
+                    size = 0
+            path = self._strip_protocol(path.path)
+        else:
+            # str or path-like
+            path = self._strip_protocol(path)
+            out = os.stat(path, follow_symlinks=False)
+            link = stat.S_ISLNK(out.st_mode)
+            if link:
+                out = os.stat(path, follow_symlinks=True)
+            size = out.st_size
+            if stat.S_ISDIR(out.st_mode):
+                t = "directory"
+            elif stat.S_ISREG(out.st_mode):
+                t = "file"
+            else:
+                t = "other"
+
+        # Check for the 'st_birthtime' attribute, which is not always present; fallback to st_ctime
+        created_time = getattr(out, "st_birthtime", out.st_ctime)
+
+        result = {
+            "name": path,
+            "size": size,
+            "type": t,
+            "created": created_time,
+            "islink": link,
+        }
+        for field in ["mode", "uid", "gid", "mtime", "ino", "nlink"]:
+            result[field] = getattr(out, f"st_{field}")
+        if link:
+            result["destination"] = os.readlink(path)
+        return result
+
+    def lexists(self, path, **kwargs):
+        return osp.lexists(path)
+
+    def cp_file(self, path1, path2, **kwargs):
+        path1 = self._strip_protocol(path1)
+        path2 = self._strip_protocol(path2)
+        if self.auto_mkdir:
+            self.makedirs(self._parent(path2), exist_ok=True)
+        if self.isfile(path1):
+            shutil.copyfile(path1, path2)
+        elif self.isdir(path1):
+            self.mkdirs(path2, exist_ok=True)
+        else:
+            raise FileNotFoundError(path1)
+
+    def isfile(self, path):
+        path = self._strip_protocol(path)
+        return os.path.isfile(path)
+
+    def isdir(self, path):
+        path = self._strip_protocol(path)
+        return os.path.isdir(path)
+
+    def get_file(self, path1, path2, callback=None, **kwargs):
+        if isfilelike(path2):
+            with open(path1, "rb") as f:
+                shutil.copyfileobj(f, path2)
+        else:
+            return self.cp_file(path1, path2, **kwargs)
+
+    def put_file(self, path1, path2, callback=None, **kwargs):
+        return self.cp_file(path1, path2, **kwargs)
+
+    def mv(self, path1, path2, recursive: bool = True, **kwargs):
+        """Move files/directories
+        For the specific case of local, all ops on directories are recursive and
+        the recursive= kwarg is ignored.
+        """
+        path1 = self._strip_protocol(path1)
+        path2 = self._strip_protocol(path2)
+        shutil.move(path1, path2)
+
+    def link(self, src, dst, **kwargs):
+        src = self._strip_protocol(src)
+        dst = self._strip_protocol(dst)
+        os.link(src, dst, **kwargs)
+
+    def symlink(self, src, dst, **kwargs):
+        src = self._strip_protocol(src)
+        dst = self._strip_protocol(dst)
+        os.symlink(src, dst, **kwargs)
+
+    def islink(self, path) -> bool:
+        return os.path.islink(self._strip_protocol(path))
+
+    def rm_file(self, path):
+        os.remove(self._strip_protocol(path))
+
+    def rm(self, path, recursive=False, maxdepth=None):
+        if not isinstance(path, list):
+            path = [path]
+
+        for p in path:
+            p = self._strip_protocol(p)
+            if self.isdir(p):
+                if not recursive:
+                    raise ValueError("Cannot delete directory, set recursive=True")
+                if osp.abspath(p) == os.getcwd():
+                    raise ValueError("Cannot delete current working directory")
+                shutil.rmtree(p)
+            else:
+                os.remove(p)
+
+    def unstrip_protocol(self, name):
+        name = self._strip_protocol(name)  # normalise for local/win/...
+        return f"file://{name}"
+
+    def _open(self, path, mode="rb", block_size=None, **kwargs):
+        path = self._strip_protocol(path)
+        if self.auto_mkdir and "w" in mode:
+            self.makedirs(self._parent(path), exist_ok=True)
+        return LocalFileOpener(path, mode, fs=self, **kwargs)
+
+    def touch(self, path, truncate=True, **kwargs):
+        path = self._strip_protocol(path)
+        if self.auto_mkdir:
+            self.makedirs(self._parent(path), exist_ok=True)
+        if self.exists(path):
+            os.utime(path, None)
+        else:
+            open(path, "a").close()
+        if truncate:
+            os.truncate(path, 0)
+
+    def created(self, path):
+        info = self.info(path=path)
+        return datetime.datetime.fromtimestamp(
+            info["created"], tz=datetime.timezone.utc
+        )
+
+    def modified(self, path):
+        info = self.info(path=path)
+        return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
+
+    @classmethod
+    def _parent(cls, path):
+        path = cls._strip_protocol(path)
+        if os.sep == "/":
+            # posix native
+            return path.rsplit("/", 1)[0] or "/"
+        else:
+            # NT
+            path_ = path.rsplit("/", 1)[0]
+            if len(path_) <= 3:
+                if path_[1:2] == ":":
+                    # nt root (something like c:/)
+                    return path_[0] + ":/"
+            # More cases may be required here
+            return path_
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        path = stringify_path(path)
+        if path.startswith("file://"):
+            path = path[7:]
+        elif path.startswith("file:"):
+            path = path[5:]
+        elif path.startswith("local://"):
+            path = path[8:]
+        elif path.startswith("local:"):
+            path = path[6:]
+
+        path = make_path_posix(path)
+        if os.sep != "/":
+            # This code-path is a stripped down version of
+            # > drive, path = ntpath.splitdrive(path)
+            if path[1:2] == ":":
+                # Absolute drive-letter path, e.g. X:\Windows
+                # Relative path with drive, e.g. X:Windows
+                drive, path = path[:2], path[2:]
+            elif path[:2] == "//":
+                # UNC drives, e.g. \\server\share or \\?\UNC\server\share
+                # Device drives, e.g. \\.\device or \\?\device
+                if (index1 := path.find("/", 2)) == -1 or (
+                    index2 := path.find("/", index1 + 1)
+                ) == -1:
+                    drive, path = path, ""
+                else:
+                    drive, path = path[:index2], path[index2:]
+            else:
+                # Relative path, e.g. Windows
+                drive = ""
+
+            path = path.rstrip("/") or cls.root_marker
+            return drive + path
+
+        else:
+            return path.rstrip("/") or cls.root_marker
+
+    def _isfilestore(self):
+        # Inheriting from DaskFileSystem makes this False (S3, etc. were)
+        # the original motivation. But we are a posix-like file system.
+        # See https://github.com/dask/dask/issues/5526
+        return True
+
+    def chmod(self, path, mode):
+        path = stringify_path(path)
+        return os.chmod(path, mode)
+
+
+def make_path_posix(path):
+    """Make path generic and absolute for current OS"""
+    if not isinstance(path, str):
+        if isinstance(path, (list, set, tuple)):
+            return type(path)(make_path_posix(p) for p in path)
+        else:
+            path = stringify_path(path)
+            if not isinstance(path, str):
+                raise TypeError(f"could not convert {path!r} to string")
+    if os.sep == "/":
+        # Native posix
+        if path.startswith("/"):
+            # most common fast case for posix
+            return path
+        elif path.startswith("~"):
+            return osp.expanduser(path)
+        elif path.startswith("./"):
+            path = path[2:]
+        elif path == ".":
+            path = ""
+        return f"{os.getcwd()}/{path}"
+    else:
+        # NT handling
+        if path[0:1] == "/" and path[2:3] == ":":
+            # path is like "/c:/local/path"
+            path = path[1:]
+        if path[1:2] == ":":
+            # windows full path like "C:\\local\\path"
+            if len(path) <= 3:
+                # nt root (something like c:/)
+                return path[0] + ":/"
+            path = path.replace("\\", "/")
+            return path
+        elif path[0:1] == "~":
+            return make_path_posix(osp.expanduser(path))
+        elif path.startswith(("\\\\", "//")):
+            # windows UNC/DFS-style paths
+            return "//" + path[2:].replace("\\", "/")
+        elif path.startswith(("\\", "/")):
+            # windows relative path with root
+            path = path.replace("\\", "/")
+            return f"{osp.splitdrive(os.getcwd())[0]}{path}"
+        else:
+            path = path.replace("\\", "/")
+            if path.startswith("./"):
+                path = path[2:]
+            elif path == ".":
+                path = ""
+            return f"{make_path_posix(os.getcwd())}/{path}"
+
+
+def trailing_sep(path):
+    """Return True if the path ends with a path separator.
+
+    A forward slash is always considered a path separator, even on Operating
+    Systems that normally use a backslash.
+    """
+    # TODO: if all incoming paths were posix-compliant then separator would
+    # always be a forward slash, simplifying this function.
+    # See https://github.com/fsspec/filesystem_spec/pull/1250
+    return path.endswith(os.sep) or (os.altsep is not None and path.endswith(os.altsep))
+
+
+@lru_cache(maxsize=1)
+def get_umask(mask: int = 0o666) -> int:
+    """Get the current umask.
+
+    Follows https://stackoverflow.com/a/44130549 to get the umask.
+    Temporarily sets the umask to the given value, and then resets it to the
+    original value.
+    """
+    value = os.umask(mask)
+    os.umask(value)
+    return value
+
+
+class LocalFileOpener(io.IOBase):
+    def __init__(
+        self, path, mode, autocommit=True, fs=None, compression=None, **kwargs
+    ):
+        logger.debug("open file: %s", path)
+        self.path = path
+        self.mode = mode
+        self.fs = fs
+        self.f = None
+        self.autocommit = autocommit
+        self.compression = get_compression(path, compression)
+        self.blocksize = io.DEFAULT_BUFFER_SIZE
+        self._open()
+
+    def _open(self):
+        if self.f is None or self.f.closed:
+            if self.autocommit or "w" not in self.mode:
+                self.f = open(self.path, mode=self.mode)
+                if self.compression:
+                    compress = compr[self.compression]
+                    self.f = compress(self.f, mode=self.mode)
+            else:
+                # TODO: check if path is writable?
+                i, name = tempfile.mkstemp()
+                os.close(i)  # we want normal open and normal buffered file
+                self.temp = name
+                self.f = open(name, mode=self.mode)
+            if "w" not in self.mode:
+                self.size = self.f.seek(0, 2)
+                self.f.seek(0)
+                self.f.size = self.size
+
+    def _fetch_range(self, start, end):
+        # probably only used by cached FS
+        if "r" not in self.mode:
+            raise ValueError
+        self._open()
+        self.f.seek(start)
+        return self.f.read(end - start)
+
+    def __setstate__(self, state):
+        self.f = None
+        loc = state.pop("loc", None)
+        self.__dict__.update(state)
+        if "r" in state["mode"]:
+            self.f = None
+            self._open()
+            self.f.seek(loc)
+
+    def __getstate__(self):
+        d = self.__dict__.copy()
+        d.pop("f")
+        if "r" in self.mode:
+            d["loc"] = self.f.tell()
+        else:
+            if not self.f.closed:
+                raise ValueError("Cannot serialise open write-mode local file")
+        return d
+
+    def commit(self):
+        if self.autocommit:
+            raise RuntimeError("Can only commit if not already set to autocommit")
+        try:
+            shutil.move(self.temp, self.path)
+        except PermissionError as e:
+            # shutil.move raises PermissionError if os.rename
+            # and the default copy2 fallback with shutil.copystats fail.
+            # The file should be there nonetheless, but without copied permissions.
+            # If it doesn't exist, there was no permission to create the file.
+            if not os.path.exists(self.path):
+                raise e
+        else:
+            # If PermissionError is not raised, permissions can be set.
+            try:
+                mask = 0o666
+                os.chmod(self.path, mask & ~get_umask(mask))
+            except RuntimeError:
+                pass
+
+    def discard(self):
+        if self.autocommit:
+            raise RuntimeError("Cannot discard if set to autocommit")
+        os.remove(self.temp)
+
+    def readable(self) -> bool:
+        return True
+
+    def writable(self) -> bool:
+        return "r" not in self.mode
+
+    def read(self, *args, **kwargs):
+        return self.f.read(*args, **kwargs)
+
+    def write(self, *args, **kwargs):
+        return self.f.write(*args, **kwargs)
+
+    def tell(self, *args, **kwargs):
+        return self.f.tell(*args, **kwargs)
+
+    def seek(self, *args, **kwargs):
+        return self.f.seek(*args, **kwargs)
+
+    def seekable(self, *args, **kwargs):
+        return self.f.seekable(*args, **kwargs)
+
+    def readline(self, *args, **kwargs):
+        return self.f.readline(*args, **kwargs)
+
+    def readlines(self, *args, **kwargs):
+        return self.f.readlines(*args, **kwargs)
+
+    def close(self):
+        return self.f.close()
+
+    def truncate(self, size=None) -> int:
+        return self.f.truncate(size)
+
+    @property
+    def closed(self):
+        return self.f.closed
+
+    def fileno(self):
+        return self.raw.fileno()
+
+    def flush(self) -> None:
+        self.f.flush()
+
+    def __iter__(self):
+        return self.f.__iter__()
+
+    def __getattr__(self, item):
+        return getattr(self.f, item)
+
+    def __enter__(self):
+        self._incontext = True
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self._incontext = False
+        self.f.__exit__(exc_type, exc_value, traceback)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/memory.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/memory.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6b67bbc84e7aa625eee5609c20f6a893ddd349e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/memory.py
@@ -0,0 +1,311 @@
+from __future__ import annotations
+
+import logging
+from datetime import datetime, timezone
+from errno import ENOTEMPTY
+from io import BytesIO
+from pathlib import PurePath, PureWindowsPath
+from typing import Any, ClassVar
+
+from fsspec import AbstractFileSystem
+from fsspec.implementations.local import LocalFileSystem
+from fsspec.utils import stringify_path
+
+logger = logging.getLogger("fsspec.memoryfs")
+
+
+class MemoryFileSystem(AbstractFileSystem):
+    """A filesystem based on a dict of BytesIO objects
+
+    This is a global filesystem so instances of this class all point to the same
+    in memory filesystem.
+    """
+
+    store: ClassVar[dict[str, Any]] = {}  # global, do not overwrite!
+    pseudo_dirs = [""]  # global, do not overwrite!
+    protocol = "memory"
+    root_marker = "/"
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        if isinstance(path, PurePath):
+            if isinstance(path, PureWindowsPath):
+                return LocalFileSystem._strip_protocol(path)
+            else:
+                path = stringify_path(path)
+
+        path = path.removeprefix("memory://")
+        if "::" in path or "://" in path:
+            return path.rstrip("/")
+        path = path.lstrip("/").rstrip("/")
+        return "/" + path if path else ""
+
+    def ls(self, path, detail=True, **kwargs):
+        path = self._strip_protocol(path)
+        if path in self.store:
+            # there is a key with this exact name
+            if not detail:
+                return [path]
+            return [
+                {
+                    "name": path,
+                    "size": self.store[path].size,
+                    "type": "file",
+                    "created": self.store[path].created.timestamp(),
+                }
+            ]
+        paths = set()
+        starter = path + "/"
+        out = []
+        for p2 in tuple(self.store):
+            if p2.startswith(starter):
+                if "/" not in p2[len(starter) :]:
+                    # exact child
+                    out.append(
+                        {
+                            "name": p2,
+                            "size": self.store[p2].size,
+                            "type": "file",
+                            "created": self.store[p2].created.timestamp(),
+                        }
+                    )
+                elif len(p2) > len(starter):
+                    # implied child directory
+                    ppath = starter + p2[len(starter) :].split("/", 1)[0]
+                    if ppath not in paths:
+                        out = out or []
+                        out.append(
+                            {
+                                "name": ppath,
+                                "size": 0,
+                                "type": "directory",
+                            }
+                        )
+                        paths.add(ppath)
+        for p2 in self.pseudo_dirs:
+            if p2.startswith(starter):
+                if "/" not in p2[len(starter) :]:
+                    # exact child pdir
+                    if p2 not in paths:
+                        out.append({"name": p2, "size": 0, "type": "directory"})
+                        paths.add(p2)
+                else:
+                    # directory implied by deeper pdir
+                    ppath = starter + p2[len(starter) :].split("/", 1)[0]
+                    if ppath not in paths:
+                        out.append({"name": ppath, "size": 0, "type": "directory"})
+                        paths.add(ppath)
+        if not out:
+            if path in self.pseudo_dirs:
+                # empty dir
+                return []
+            raise FileNotFoundError(path)
+        if detail:
+            return out
+        return sorted([f["name"] for f in out])
+
+    def mkdir(self, path, create_parents=True, **kwargs):
+        path = self._strip_protocol(path)
+        if path in self.store or path in self.pseudo_dirs:
+            raise FileExistsError(path)
+        if self._parent(path).strip("/") and self.isfile(self._parent(path)):
+            raise NotADirectoryError(self._parent(path))
+        if create_parents and self._parent(path).strip("/"):
+            try:
+                self.mkdir(self._parent(path), create_parents, **kwargs)
+            except FileExistsError:
+                pass
+        if path and path not in self.pseudo_dirs:
+            self.pseudo_dirs.append(path)
+
+    def makedirs(self, path, exist_ok=False):
+        try:
+            self.mkdir(path, create_parents=True)
+        except FileExistsError:
+            if not exist_ok:
+                raise
+
+    def pipe_file(self, path, value, mode="overwrite", **kwargs):
+        """Set the bytes of given file
+
+        Avoids copies of the data if possible
+        """
+        mode = "xb" if mode == "create" else "wb"
+        self.open(path, mode=mode, data=value)
+
+    def rmdir(self, path):
+        path = self._strip_protocol(path)
+        if path == "":
+            # silently avoid deleting FS root
+            return
+        if path in self.pseudo_dirs:
+            if not self.ls(path):
+                self.pseudo_dirs.remove(path)
+            else:
+                raise OSError(ENOTEMPTY, "Directory not empty", path)
+        else:
+            raise FileNotFoundError(path)
+
+    def info(self, path, **kwargs):
+        logger.debug("info: %s", path)
+        path = self._strip_protocol(path)
+        if path in self.pseudo_dirs or any(
+            p.startswith(path + "/") for p in list(self.store) + self.pseudo_dirs
+        ):
+            return {
+                "name": path,
+                "size": 0,
+                "type": "directory",
+            }
+        elif path in self.store:
+            filelike = self.store[path]
+            return {
+                "name": path,
+                "size": filelike.size,
+                "type": "file",
+                "created": getattr(filelike, "created", None),
+            }
+        else:
+            raise FileNotFoundError(path)
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=True,
+        cache_options=None,
+        **kwargs,
+    ):
+        """Open ``path`` in the in-memory store and return its file object.
+
+        Parameters
+        ----------
+        path : str
+            Target path; the protocol prefix is stripped first.
+        mode : str
+            One of "rb", "ab", "r+b", "a+b" (the stored object is returned,
+            positioned at the start, or at the end for append modes) or
+            "wb", "w+b", "xb", "x+b" (a new ``MemoryFile`` is created).
+        block_size, autocommit, cache_options
+            Accepted for interface compatibility; not used in this method.
+        **kwargs
+            ``data``, if given, becomes the initial content of a new file.
+
+        Raises
+        ------
+        FileExistsError
+            Exclusive-create mode on an existing path, or an ancestor of
+            ``path`` is a file.
+        IsADirectoryError
+            ``path`` is a registered pseudo-directory.
+        FileNotFoundError
+            Read/append mode on a path that is not in the store.
+        ValueError
+            Unsupported ``mode``.
+        """
+        path = self._strip_protocol(path)
+        if "x" in mode and self.exists(path):
+            # exclusive creation must never clobber an existing entry; this
+            # single check covers the write branch below as well
+            raise FileExistsError(path)
+        if path in self.pseudo_dirs:
+            raise IsADirectoryError(path)
+        # a file cannot live underneath another file: walk up the ancestors
+        parent = path
+        while len(parent) > 1:
+            parent = self._parent(parent)
+            if self.isfile(parent):
+                raise FileExistsError(parent)
+        if mode in {"rb", "ab", "r+b", "a+b"}:
+            if path not in self.store:
+                raise FileNotFoundError(path)
+            f = self.store[path]
+            if "a" in mode:
+                # position at the end of file
+                f.seek(0, 2)
+            else:
+                # position at the beginning of file
+                f.seek(0)
+            return f
+        if mode in {"wb", "w+b", "xb", "x+b"}:
+            m = MemoryFile(self, path, kwargs.get("data"))
+            if not self._intrans:
+                # outside a transaction the new file becomes visible at once
+                m.commit()
+            return m
+        name = self.__class__.__name__
+        raise ValueError(f"unsupported file mode for {name}: {mode!r}")
+
+    def cp_file(self, path1, path2, **kwargs):
+        """Copy a single file, or register the target of a directory copy.
+
+        For a file source, a new ``MemoryFile`` holding the source's bytes is
+        stored under ``path2``. For a directory source, ``path2`` is added to
+        the pseudo-directories if not already present. Anything else raises
+        ``FileNotFoundError``.
+        """
+        src = self._strip_protocol(path1)
+        dst = self._strip_protocol(path2)
+        if self.isfile(src):
+            payload = self.store[src].getvalue()  # implicit copy
+            self.store[dst] = MemoryFile(self, dst, payload)
+            return
+        if not self.isdir(src):
+            raise FileNotFoundError(src)
+        if dst not in self.pseudo_dirs:
+            self.pseudo_dirs.append(dst)
+
+    def cat_file(self, path, start=None, end=None, **kwargs):
+        """Return the bytes of ``path``, optionally sliced to ``[start:end]``.
+
+        Raises ``FileNotFoundError`` when the path is not in the store.
+        """
+        logger.debug("cat: %s", path)
+        path = self._strip_protocol(path)
+        try:
+            stored = self.store[path]
+        except KeyError as e:
+            raise FileNotFoundError(path) from e
+        return bytes(stored.getbuffer()[start:end])
+
+    def _rm(self, path):
+        """Delete the stored file at ``path``; missing paths raise
+        ``FileNotFoundError``."""
+        key = self._strip_protocol(path)
+        try:
+            del self.store[key]
+        except KeyError as missing:
+            raise FileNotFoundError(key) from missing
+
+    def modified(self, path):
+        """Return the ``modified`` attribute of the stored file at ``path``.
+
+        Raises ``FileNotFoundError`` when the path is not in the store.
+        """
+        key = self._strip_protocol(path)
+        try:
+            entry = self.store[key]
+        except KeyError as missing:
+            raise FileNotFoundError(key) from missing
+        return entry.modified
+
+    def created(self, path):
+        """Return the ``created`` attribute of the stored file at ``path``.
+
+        Raises ``FileNotFoundError`` when the path is not in the store.
+        """
+        key = self._strip_protocol(path)
+        try:
+            entry = self.store[key]
+        except KeyError as missing:
+            raise FileNotFoundError(key) from missing
+        return entry.created
+
+    def isfile(self, path):
+        """Return True when the protocol-stripped ``path`` is a key of the store."""
+        return self._strip_protocol(path) in self.store
+
+    def rm(self, path, recursive=False, maxdepth=None):
+        """Remove files and pseudo-directories matching ``path``.
+
+        ``path`` may be one string or an iterable of strings. The expanded
+        paths are processed in reverse order: files go through ``rm_file``,
+        existing non-file paths through ``rmdir``.
+        """
+        if isinstance(path, str):
+            targets = self._strip_protocol(path)
+        else:
+            targets = [self._strip_protocol(p) for p in path]
+        expanded = self.expand_path(targets, recursive=recursive, maxdepth=maxdepth)
+        for entry in reversed(expanded):
+            if self.isfile(entry):
+                self.rm_file(entry)
+            elif self.exists(entry):
+                self.rmdir(entry)
+            # Otherwise the expanded path does not exist: it was a directory
+            # that is not in self.pseudo_dirs, which is possible if files were
+            # created directly without making the directories first. Nothing
+            # to remove in that case.
+
+
+class MemoryFile(BytesIO):
+    """A BytesIO which can't close and works as a context manager
+
+    Can initialise with data. Each path should only be active once at any moment.
+
+    ``fs`` and ``path`` are only read by ``commit``, which stores this object
+    in ``fs.store`` under ``path``.
+    """
+
+    def __init__(self, fs=None, path=None, data=None):
+        """Record owner/path and timestamps; load ``data`` if it is truthy."""
+        logger.debug("open file %s", path)
+        self.fs, self.path = fs, path
+        self.created = datetime.now(tz=timezone.utc)
+        self.modified = datetime.now(tz=timezone.utc)
+        if not data:
+            return
+        super().__init__(data)
+        self.seek(0)
+
+    @property
+    def size(self):
+        """Number of bytes currently held in the buffer."""
+        return self.getbuffer().nbytes
+
+    def __enter__(self):
+        """Return the file itself for use in a ``with`` block."""
+        return self
+
+    def close(self):
+        """Do nothing, so the stored content outlives a ``with`` block."""
+
+    def discard(self):
+        """Do nothing; present as the counterpart of ``commit``."""
+
+    def commit(self):
+        """Store this file in ``fs.store`` under its path and refresh ``modified``."""
+        store = self.fs.store
+        store[self.path] = self
+        self.modified = datetime.now(tz=timezone.utc)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/reference.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/reference.py
new file mode 100644
index 0000000000000000000000000000000000000000..54e81224bd2c6aa182319c189aae26090f92572b
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/reference.py
@@ -0,0 +1,1311 @@
+import base64
+import collections
+import io
+import itertools
+import logging
+import math
+import os
+from functools import lru_cache
+from itertools import chain
+from typing import TYPE_CHECKING, Literal
+
+import fsspec.core
+from fsspec.spec import AbstractBufferedFile
+
+try:
+    import ujson as json
+except ImportError:
+    if not TYPE_CHECKING:
+        import json
+
+from fsspec.asyn import AsyncFileSystem
+from fsspec.callbacks import DEFAULT_CALLBACK
+from fsspec.core import filesystem, open, split_protocol
+from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
+from fsspec.utils import (
+    isfilelike,
+    merge_offset_ranges,
+    other_paths,
+)
+
+logger = logging.getLogger("fsspec.reference")
+
+
+class ReferenceNotReachable(RuntimeError):
+    """Error that carries a reference key and the target it failed to fetch."""
+
+    def __init__(self, reference, target, *args):
+        """Keep ``reference`` and ``target``; extra args go to RuntimeError."""
+        super().__init__(*args)
+        self.reference, self.target = reference, target
+
+    def __str__(self):
+        """Render the fixed message naming the reference and its target."""
+        return 'Reference "{}" failed to fetch target {}'.format(
+            self.reference, self.target
+        )
+
+
+def _first(d):
+    """Return the first value of mapping ``d`` (StopIteration if it is empty)."""
+    values = iter(d.values())
+    return next(values)
+
+
+def _prot_in_references(path, references):
+    """Return the protocol of the URL that ``path`` refers to, if any.
+
+    Parameters
+    ----------
+    path : str
+        Key to look up in ``references``.
+    references : mapping
+        Reference mapping; only ``.get`` is used.
+
+    Returns
+    -------
+    The first element of ``split_protocol(url)`` when the reference is a
+    non-empty list/tuple whose first element is a non-empty string; that
+    first element itself when it is an empty string; ``None`` for every
+    other kind of reference (missing, inline data, empty sequence).
+    """
+    ref = references.get(path)
+    # An empty list/tuple has no URL slot: treat it like "no protocol"
+    # instead of failing with IndexError on ref[0].
+    if not isinstance(ref, (list, tuple)) or not ref:
+        return None
+    target = ref[0]
+    if not isinstance(target, str):
+        return None
+    return split_protocol(target)[0] if target else target
+
+
+def _protocol_groups(paths, references):
+    """Group ``paths`` by the protocol found for each in ``references``.
+
+    ``paths`` may be a single string or an iterable of strings. The result
+    maps each protocol (as returned by ``_prot_in_references``) to the list
+    of paths that resolved to it, in input order.
+    """
+    candidates = [paths] if isinstance(paths, str) else paths
+    groups = {}
+    for candidate in candidates:
+        key = _prot_in_references(candidate, references)
+        groups.setdefault(key, []).append(candidate)
+    return groups
+
+
+class RefsValuesView(collections.abc.ValuesView):
+    """Values view over a lazy reference mapping.
+
+    Iteration yields, in order: each metadata entry JSON-encoded as bytes,
+    every value held in the mapping's ``_items``, and then the references of
+    each field returned by ``listdir()``.
+    """
+
+    def __iter__(self):
+        mapper = self._mapping
+        for meta in mapper.zmetadata.values():
+            yield json.dumps(meta).encode()
+        yield from mapper._items.values()
+        for field in mapper.listdir():
+            if len(mapper._get_chunk_sizes(field)) == 0:
+                # no chunk grid: the field has the single key "<field>/0"
+                yield mapper[field + "/0"]
+            else:
+                yield from mapper._generate_all_records(field)
+
+
+class RefsItemsView(collections.abc.ItemsView):
+    """Items view that pairs the mapping's ``keys()`` with its ``values()``."""
+
+    def __iter__(self):
+        mapping = self._mapping
+        return zip(mapping.keys(), mapping.values())
+
+
+def ravel_multi_index(idx, sizes):
+    """Flatten a multi-dimensional index into one integer offset.
+
+    The last axis varies fastest: the final element of ``idx`` is weighted
+    by 1 and each earlier one by the product of all later ``sizes``.
+    """
+    flat = 0
+    stride = 1
+    trailing_first = zip(idx[::-1], sizes[::-1])
+    for position, extent in trailing_first:
+        flat += position * stride
+        stride *= extent
+    return flat
+
+
+class LazyReferenceMapper(collections.abc.MutableMapping):
+    """This interface can be used to read/write references from Parquet stores.
+    It is not intended for other types of references.
+    It can be used with Kerchunk's MultiZarrToZarr method to combine
+    references into a parquet store.
+    Examples of this use-case can be found here:
+    https://fsspec.github.io/kerchunk/advanced.html?highlight=parquet#parquet-storage"""
+
+    # numpy and pandas are imported lazily inside these properties so that
+    # they do not become hard dependencies of fsspec
+    @property
+    def np(self):
+        """The numpy module, imported on first use."""
+        import numpy
+
+        return numpy
+
+    @property
+    def pd(self):
+        """The pandas module, imported on first use."""
+        import pandas
+
+        return pandas
+
+    def __init__(
+        self,
+        root,
+        fs=None,
+        out_root=None,
+        cache_size=128,
+        categorical_threshold=10,
+        engine: Literal["fastparquet", "pyarrow"] = "fastparquet",
+    ):
+        """Configure a lazy view over a parquet reference store.
+
+        This instance will be writable, storing changes in memory until full partitions
+        are accumulated or .flush() is called.
+
+        Nothing is read from the store here; ``.zmetadata`` is loaded by
+        ``setup()`` on first access to ``_items``, ``record_size`` or
+        ``zmetadata``.
+
+        To create an empty lazy store, use .create()
+
+        Parameters
+        ----------
+        root : str
+            Root of parquet store
+        fs : fsspec.AbstractFileSystem
+            fsspec filesystem object, default is local filesystem.
+        out_root : str, optional
+            Base location used by ``write()``/``flush()`` when no ``base_url``
+            is passed to them. Defaults to ``root``.
+        cache_size : int, default=128
+            Maximum size of LRU cache, where cache_size*record_size denotes
+            the total number of references that can be loaded in memory at once.
+        categorical_threshold : int
+            Encode urls as pandas.Categorical to reduce memory footprint if the
+            ratio of the number of non-null urls to the number of unique urls
+            in a written partition is strictly greater than this number.
+            (default 10)
+        engine: Literal["fastparquet","pyarrow"]
+            Engine choice for reading parquet files. (default is "fastparquet")
+
+        Raises
+        ------
+        ImportError
+            If ``engine`` is "pyarrow" and pyarrow cannot be found.
+        """
+
+        self.root = root
+        self.chunk_sizes = {}
+        self.cat_thresh = categorical_threshold
+        self.engine = engine
+        self.cache_size = cache_size
+        # template for the parquet file holding one record of one field
+        self.url = self.root + "/{field}/refs.{record}.parq"
+        # TODO: derive fs from `root`
+        self.fs = fsspec.filesystem("file") if fs is None else fs
+        self.out_root = self.fs.unstrip_protocol(out_root or self.root)
+
+        from importlib.util import find_spec
+
+        if self.engine == "pyarrow" and find_spec("pyarrow") is None:
+            raise ImportError("engine choice `pyarrow` is not installed.")
+
+    def __getattr__(self, item):
+        """Load the store lazily on first access to a setup-provided attribute.
+
+        Only ``_items``, ``record_size`` and ``zmetadata`` trigger ``setup()``;
+        any other missing attribute raises ``AttributeError``.
+        """
+        if item not in ("_items", "record_size", "zmetadata"):
+            raise AttributeError(item)
+        self.setup()
+        # read from __dict__ directly to avoid possible recursion if setup
+        # fails somehow to define the attribute
+        return self.__dict__[item]
+
+    def setup(self):
+        """Read ``.zmetadata`` from the store and build the partition loader.
+
+        Sets ``_items`` (seeded with the raw ``.zmetadata`` bytes),
+        ``record_size`` and ``zmetadata`` from the decoded JSON, and
+        ``open_refs``, an LRU-cached function that loads one parquet
+        partition.
+        """
+        self._items = {}
+        self._items[".zmetadata"] = self.fs.cat_file(
+            "/".join([self.root, ".zmetadata"])
+        )
+        met = json.loads(self._items[".zmetadata"])
+        self.record_size = met["record_size"]
+        self.zmetadata = met["metadata"]
+
+        # Define function to open and decompress refs
+        @lru_cache(maxsize=self.cache_size)
+        def open_refs(field, record):
+            """cached parquet file loader"""
+            path = self.url.format(field=field, record=record)
+            # cat_file is outside the try: its errors propagate to the caller
+            data = io.BytesIO(self.fs.cat_file(path))
+            try:
+                df = self.pd.read_parquet(data, engine=self.engine)
+                refs = {c: df[c].to_numpy() for c in df.columns}
+            except OSError:
+                # an OSError while parsing yields None instead of raising
+                refs = None
+            return refs
+
+        self.open_refs = open_refs
+
+    @staticmethod
+    def create(root, storage_options=None, fs=None, record_size=10000, **kwargs):
+        """Make empty parquet reference set
+
+        First deletes the contents of the given directory, if it exists.
+
+        Parameters
+        ----------
+        root: str
+            Directory to contain the output; will be created
+        storage_options: dict | None
+            For making the filesystem to use for writing if fs is None
+        fs: FileSystem | None
+            Filesystem for writing
+        record_size: int
+            Number of references per parquet file
+        kwargs: passed to __init__
+
+        Returns
+        -------
+        LazyReferenceMapper instance
+        """
+        met = {"metadata": {}, "record_size": record_size}
+        if fs is None:
+            # root is replaced by the path part returned by url_to_fs
+            fs, root = fsspec.core.url_to_fs(root, **(storage_options or {}))
+        if fs.exists(root):
+            fs.rm(root, recursive=True)
+        fs.makedirs(root, exist_ok=True)
+        # an empty store consists solely of its .zmetadata file
+        fs.pipe("/".join([root, ".zmetadata"]), json.dumps(met).encode())
+        return LazyReferenceMapper(root, fs, **kwargs)
+
+    @lru_cache
+    def listdir(self):
+        """List the directories that hold the metadata keys.
+
+        For every key in ``zmetadata`` that does not start with ".z", the part
+        before the last "/" is collected; the result is a set.
+
+        The result is memoised by ``lru_cache`` with ``self`` as the key, so
+        later changes to ``zmetadata`` are not reflected by subsequent calls.
+        """
+        dirs = (p.rsplit("/", 1)[0] for p in self.zmetadata if not p.startswith(".z"))
+        return set(dirs)
+
+    def ls(self, path="", detail=True):
+        """Shortcut file listings
+
+        Parameters
+        ----------
+        path : str
+            Location to list; trailing "/" is ignored, "" means the top level.
+        detail : bool
+            If exactly ``False``, return names only; otherwise return dicts
+            with "name", "type" and "size".
+
+        Returns
+        -------
+        list
+            When ``path`` has sub-directories directly below it: those
+            directories plus the files directly below ``path``, sorted by
+            name. Otherwise ``path`` is treated as a field: its metadata/other
+            keys followed by its chunk keys (with ``detail``, only chunks
+            whose record has a truthy first element).
+        """
+        path = path.rstrip("/")
+        pathdash = path + "/" if path else ""
+        skip = len(pathdash)
+
+        def direct_child(name):
+            # Keys of ``_items`` may be (field, record) tuples for pending
+            # partitions; those are not paths. The remainder after the prefix
+            # is sliced off rather than ``lstrip``-ed: ``lstrip`` removes a
+            # *set of characters* and would hide nested separators
+            # (e.g. "a/a/b" listed directly under "a").
+            return (
+                isinstance(name, str)
+                and name.startswith(pathdash)
+                and "/" not in name[skip:]
+            )
+
+        def file_entry(name):
+            # metadata entries are sized by their JSON text, others by content
+            payload = (
+                json.dumps(self.zmetadata[name])
+                if name in self.zmetadata
+                else self._items[name]
+            )
+            return {"name": name, "type": "file", "size": len(payload)}
+
+        dirs = [d for d in self.listdir() if direct_child(d)]
+        if dirs:
+            others = {
+                f
+                for f in chain([".zmetadata"], self.zmetadata, self._items)
+                if direct_child(f)
+            }
+            if detail is False:
+                others.update(dirs)
+                return sorted(others)
+            dirinfo = [{"name": name, "type": "directory", "size": 0} for name in dirs]
+            fileinfo = [file_entry(name) for name in others]
+            return sorted(dirinfo + fileinfo, key=lambda s: s["name"])
+        field = path
+        prefix = f"{path}/"
+        others = {
+            name
+            for name in chain(self.zmetadata, self._items)
+            if isinstance(name, str) and name.startswith(prefix)
+        }
+        fileinfo = [file_entry(name) for name in others]
+        keys = self._keys_in_field(field)
+
+        if detail is False:
+            return list(others) + list(keys)
+        recs = self._generate_all_records(field)
+        recinfo = [
+            {"name": name, "type": "file", "size": rec[-1]}
+            for name, rec in zip(keys, recs)
+            if rec[0]  # filters out path==None, deleted/missing
+        ]
+        return fileinfo + recinfo
+
+    def _load_one_key(self, key):
+        """Get the reference for one key
+
+        Lookup order: in-memory ``_items``, then ``zmetadata`` (JSON-encoded),
+        then pending (unflushed) partition edits, then the parquet partition
+        via ``open_refs``.
+
+        Returns bytes, one-element list or three-element list.
+
+        Raises
+        ------
+        KeyError
+            The key is unknown, is a missing metadata/top-level key, was
+            explicitly deleted, or its partition could not be loaded.
+        """
+        if key in self._items:
+            return self._items[key]
+        elif key in self.zmetadata:
+            return json.dumps(self.zmetadata[key]).encode()
+        elif "/" not in key or self._is_meta(key):
+            raise KeyError(key)
+        field, _ = key.rsplit("/", 1)
+        record, ri, chunk_size = self._key_to_record(key)
+        maybe = self._items.get((field, record), {}).get(ri, False)
+        if maybe is None:
+            # explicitly deleted
+            raise KeyError(key)
+        elif maybe:
+            return maybe
+        elif chunk_size == 0:
+            return b""
+
+        # Chunk keys can be loaded from row group and cached in LRU cache
+        try:
+            refs = self.open_refs(field, record)
+        except (ValueError, TypeError, FileNotFoundError) as exc:
+            raise KeyError(key) from exc
+        if refs is None:
+            # open_refs returns None when the partition could not be parsed;
+            # report that as a missing key rather than failing on `c in None`
+            raise KeyError(key)
+        columns = ["path", "offset", "size", "raw"]
+        selection = [refs[c][ri] if c in refs else None for c in columns]
+        raw = selection[-1]
+        if raw is not None:
+            return raw
+        if selection[0] is None:
+            raise KeyError("This reference does not exist or has been deleted")
+        if selection[1:3] == [0, 0]:
+            # URL only
+            return selection[:1]
+        # URL, offset, size
+        return selection[:3]
+
+    @lru_cache(4096)
+    def _key_to_record(self, key):
+        """Details needed to construct a reference for one key
+
+        ``key`` has the form "<field>/<i>.<j>...". Returns a tuple
+        ``(record, ri, n)``: the partition number holding the chunk, the row
+        inside that partition, and the number of entries in the field's chunk
+        grid (``len(chunk_sizes)``). ``(0, 0, 0)`` is returned when the chunk
+        grid is empty.
+
+        Results are memoised per ``(self, key)`` by ``lru_cache``.
+        """
+        field, chunk = key.rsplit("/", 1)
+        chunk_sizes = self._get_chunk_sizes(field)
+        if len(chunk_sizes) == 0:
+            return 0, 0, 0
+        chunk_idx = [int(c) for c in chunk.split(".")]
+        # flatten the multi-dimensional chunk index to one running number
+        chunk_number = ravel_multi_index(chunk_idx, chunk_sizes)
+        record = chunk_number // self.record_size
+        ri = chunk_number % self.record_size
+        return record, ri, len(chunk_sizes)
+
+    def _get_chunk_sizes(self, field):
+        """The number of chunks along each axis for a given field
+
+        Computed from the field's ``.zarray`` metadata as
+        ``ceil(shape / chunks)`` per axis (``[1]`` when there are no axes) and
+        remembered in ``self.chunk_sizes``.
+        """
+        if field in self.chunk_sizes:
+            return self.chunk_sizes[field]
+        zarray = self.zmetadata[f"{field}/.zarray"]
+        counts = [
+            math.ceil(extent / chunk)
+            for extent, chunk in zip(zarray["shape"], zarray["chunks"])
+        ]
+        sizes = counts if counts else [1]
+        self.chunk_sizes[field] = sizes
+        return sizes
+
+    def _generate_record(self, field, record):
+        """The references for a given parquet file of a given field
+
+        The shape of the result depends on how many columns the partition
+        has: three columns give a generator of 3-element lists, a single
+        column returns the "raw" column itself, anything else gives a
+        generator yielding the fourth value of a row when it is truthy and
+        the row's first three values as a list otherwise.
+
+        NOTE(review): ``open_refs`` may return None (parse failure), which is
+        not handled here — confirm callers never hit that case.
+        """
+        refs = self.open_refs(field, record)
+        # iterate the columns row by row
+        it = iter(zip(*refs.values()))
+        if len(refs) == 3:
+            # All urls
+            return (list(t) for t in it)
+        elif len(refs) == 1:
+            # All raws
+            return refs["raw"]
+        else:
+            # Mix of urls and raws
+            return (list(t[:3]) if not t[3] else t[3] for t in it)
+
+    def _generate_all_records(self, field):
+        """Load all the references within a field by iterating over the parquet files
+
+        The number of files is the total chunk count of the field divided by
+        ``record_size``, rounded up.
+        """
+        total_chunks = math.prod(self._get_chunk_sizes(field))
+        n_records = math.ceil(total_chunks / self.record_size)
+        for record in range(n_records):
+            yield from self._generate_record(field, record)
+
+    def values(self):
+        """Return a ``RefsValuesView`` over this mapping."""
+        return RefsValuesView(self)
+
+    def items(self):
+        """Return a ``RefsItemsView`` over this mapping."""
+        return RefsItemsView(self)
+
+    def __hash__(self):
+        """Hash by object identity.
+
+        NOTE(review): presumably needed so that the ``lru_cache``-decorated
+        methods of this class can use ``self`` as a cache key — confirm.
+        """
+        return id(self)
+
+    def __getitem__(self, key):
+        """Return the reference for ``key``; see ``_load_one_key``."""
+        return self._load_one_key(key)
+
+    def __setitem__(self, key, value):
+        """Store a chunk reference or a metadata/top-level entry.
+
+        Chunk keys (containing "/" and not metadata) are buffered in the
+        pending partition ``_items[(field, record)]``; once that partition
+        holds ``record_size`` entries it is written out. Any other key is
+        kept in ``_items`` as text and its decoded JSON is merged into
+        ``zmetadata[key]``.
+        """
+        if "/" in key and not self._is_meta(key):
+            field, chunk = key.rsplit("/", 1)
+            record, i, _ = self._key_to_record(key)
+            subdict = self._items.setdefault((field, record), {})
+            subdict[i] = value
+            if len(subdict) == self.record_size:
+                # partition is complete: persist it right away
+                self.write(field, record)
+        else:
+            # metadata or top-level
+            if hasattr(value, "to_bytes"):
+                val = value.to_bytes().decode()
+            elif isinstance(value, bytes):
+                val = value.decode()
+            else:
+                val = value
+            self._items[key] = val
+            new_value = json.loads(val)
+            # merge into any existing metadata for this key, new values win
+            self.zmetadata[key] = {**self.zmetadata.get(key, {}), **new_value}
+
+    @staticmethod
+    def _is_meta(key):
+        """True when ``key`` or any later path component starts with ".z"."""
+        return "/.z" in key or key.startswith(".z")
+
+    def __delitem__(self, key):
+        """Delete ``key`` from the mapping.
+
+        Keys held in ``_items`` or ``zmetadata`` are removed directly.
+        Otherwise a chunk key is marked as deleted (``None``) in its pending
+        partition, which is written once it holds ``record_size`` entries;
+        any other key is recorded in ``_items`` with the value ``None``.
+        """
+        if key in self._items:
+            del self._items[key]
+        elif key in self.zmetadata:
+            del self.zmetadata[key]
+        else:
+            if "/" in key and not self._is_meta(key):
+                field, _ = key.rsplit("/", 1)
+                record, i, _ = self._key_to_record(key)
+                subdict = self._items.setdefault((field, record), {})
+                # None is the "explicitly deleted" marker read by _load_one_key
+                subdict[i] = None
+                if len(subdict) == self.record_size:
+                    self.write(field, record)
+            else:
+                # metadata or top-level
+                self._items[key] = None
+
+    def write(self, field, record, base_url=None, storage_options=None):
+        """Write the pending partition ``(field, record)`` to a parquet file.
+
+        Parameters
+        ----------
+        field : str
+            Field whose partition is written.
+        record : int
+            Partition number within the field.
+        base_url : str, optional
+            Output location; ``self.out_root`` is used when not given.
+        storage_options : dict, optional
+            Passed to ``DataFrame.to_parquet``; falls back to the
+            ``storage_options`` attribute of ``self.fs`` if it has one.
+
+        Raises
+        ------
+        NotImplementedError
+            If ``self.engine`` is neither "pyarrow" nor "fastparquet".
+
+        Requires kerchunk, numpy and pandas. On success the partition is
+        removed from ``_items``.
+        """
+        # extra requirements if writing
+        import kerchunk.df
+        import numpy as np
+        import pandas as pd
+
+        partition = self._items[(field, record)]
+        original = False
+        if len(partition) < self.record_size:
+            # incomplete partition: start from what is already stored, if any
+            try:
+                original = self.open_refs(field, record)
+            except OSError:
+                pass
+
+        if original:
+            # NOTE(review): assumes the stored partition has all four columns
+            paths = original["path"]
+            offsets = original["offset"]
+            sizes = original["size"]
+            raws = original["raw"]
+        else:
+            paths = np.full(self.record_size, np.nan, dtype="O")
+            offsets = np.zeros(self.record_size, dtype="int64")
+            sizes = np.zeros(self.record_size, dtype="int64")
+            raws = np.full(self.record_size, np.nan, dtype="O")
+        for j, data in partition.items():
+            if isinstance(data, list):
+                # URL reference: [url] or [url, offset, size]
+                if (
+                    str(paths.dtype) == "category"
+                    and data[0] not in paths.dtype.categories
+                ):
+                    paths = paths.add_categories(data[0])
+                paths[j] = data[0]
+                if len(data) > 1:
+                    offsets[j] = data[1]
+                    sizes[j] = data[2]
+            elif data is None:
+                # delete
+                paths[j] = None
+                offsets[j] = 0
+                sizes[j] = 0
+                raws[j] = None
+            else:
+                # this is the only call into kerchunk, could remove
+                raws[j] = kerchunk.df._proc_raw(data)
+        # TODO: only save needed columns
+        df = pd.DataFrame(
+            {
+                "path": paths,
+                "offset": offsets,
+                "size": sizes,
+                "raw": raws,
+            },
+            copy=False,
+        )
+        # many refs per distinct URL: store the path column as categorical
+        if df.path.count() / (df.path.nunique() or 1) > self.cat_thresh:
+            df["path"] = df["path"].astype("category")
+        object_encoding = {"raw": "bytes", "path": "utf8"}
+        has_nulls = ["path", "raw"]
+
+        fn = f"{base_url or self.out_root}/{field}/refs.{record}.parq"
+        self.fs.mkdirs(f"{base_url or self.out_root}/{field}", exist_ok=True)
+
+        if self.engine == "pyarrow":
+            df_backend_kwargs = {"write_statistics": False}
+        elif self.engine == "fastparquet":
+            df_backend_kwargs = {
+                "stats": False,
+                "object_encoding": object_encoding,
+                "has_nulls": has_nulls,
+            }
+        else:
+            raise NotImplementedError(f"{self.engine} not supported")
+        df.to_parquet(
+            fn,
+            engine=self.engine,
+            storage_options=storage_options
+            or getattr(self.fs, "storage_options", None),
+            compression="zstd",
+            index=False,
+            **df_backend_kwargs,
+        )
+
+        # the partition is persisted: drop the in-memory copy
+        partition.clear()
+        self._items.pop((field, record))
+
+    def flush(self, base_url=None, storage_options=None):
+        """Output any modified or deleted keys
+
+        Writes every pending partition, folds metadata entries held in
+        ``_items`` into ``zmetadata``, rewrites ``.zmetadata`` and clears the
+        partition cache.
+
+        Parameters
+        ----------
+        base_url: str
+            Location of the output
+        storage_options: dict, optional
+            Forwarded to ``write()`` for each pending partition.
+        """
+
+        # write what we have so far and clear sub chunks
+        for thing in list(self._items):
+            if isinstance(thing, tuple):
+                field, record = thing
+                self.write(
+                    field,
+                    record,
+                    base_url=base_url,
+                    storage_options=storage_options,
+                )
+
+        # gather .zmetadata from self._items and write that too
+        for k in list(self._items):
+            if k != ".zmetadata" and ".z" in k:
+                self.zmetadata[k] = json.loads(self._items.pop(k))
+        met = {"metadata": self.zmetadata, "record_size": self.record_size}
+        # everything left in _items is dropped; only the fresh .zmetadata stays
+        self._items.clear()
+        self._items[".zmetadata"] = json.dumps(met).encode()
+        self.fs.pipe(
+            "/".join([base_url or self.out_root, ".zmetadata"]),
+            self._items[".zmetadata"],
+        )
+
+        # TODO: only clear those that we wrote to?
+        self.open_refs.cache_clear()
+
+    def __len__(self):
+        """Count the expected number of keys.
+
+        Caveat: This counts expected references, not actual - but is fast
+        """
+        total = 0
+        for field in self.listdir():
+            if field.startswith("."):
+                total += 1
+            else:
+                total += math.prod(self._get_chunk_sizes(field))
+        # all metadata keys
+        total += len(self.zmetadata)
+        # any other files not in reference partitions
+        total += sum(1 for name in self._items if not isinstance(name, tuple))
+        return total
+
+    def __iter__(self):
+        """Yield metadata/top-level keys, then the chunk keys that exist.
+
+        Caveat: returns only existing keys, so the number of these does not
+        match len(self)
+        """
+        names = set(self.zmetadata)
+        names.update(self._items)
+        # tuple keys are pending partitions, not user-visible keys
+        yield from (name for name in names if isinstance(name, str))
+        for field in self.listdir():
+            yield from (k for k in self._keys_in_field(field) if k in self)
+
+    def __contains__(self, item):
+        """True when ``_load_one_key(item)`` succeeds, False on ``KeyError``."""
+        try:
+            self._load_one_key(item)
+        except KeyError:
+            return False
+        return True
+
+    def _keys_in_field(self, field):
+        """List key names in given field
+
+        Produces strings like "field/x.y", one for every position in the
+        field's chunk grid, or the single key "field/0" when the grid is empty.
+        """
+        sizes = self._get_chunk_sizes(field)
+        if len(sizes) == 0:
+            yield field + "/0"
+            return
+        for index in itertools.product(*map(range, sizes)):
+            yield field + "/" + ".".join(map(str, index))
+
+
+class ReferenceFileSystem(AsyncFileSystem):
+    """View byte ranges of some other file as a file system
+    Initial version: single file system target, which must support
+    async, and must allow start and end args in _cat_file. Later versions
+    may allow multiple arbitrary URLs for the targets.
+    This FileSystem is read-only. It is designed to be used with async
+    targets (for now). We do not get original file details from the target FS.
+    Configuration is by passing a dict of references at init, or a URL to
+    a JSON file containing the same; this dict
+    can also contain concrete data for some set of paths.
+    Reference dict format:
+    {path0: bytes_data, path1: (target_url, offset, size)}
+    https://github.com/fsspec/kerchunk/blob/main/README.md
+    """
+
+    protocol = "reference"
+    cachable = False
+
+    def __init__(
+        self,
+        fo,
+        target=None,
+        ref_storage_args=None,
+        target_protocol=None,
+        target_options=None,
+        remote_protocol=None,
+        remote_options=None,
+        fs=None,
+        template_overrides=None,
+        simple_templates=True,
+        max_gap=64_000,
+        max_block=256_000_000,
+        cache_size=128,
+        **kwargs,
+    ):
+        """
+        Parameters
+        ----------
+        fo : dict or str
+            The set of references to use for this instance, with a structure as above.
+            If str referencing a JSON file, will use fsspec.open, in conjunction
+            with target_options and target_protocol to open and parse JSON at this
+            location. If a directory, then assume references are a set of parquet
+            files to be loaded lazily.
+        target : str
+            For any references having target_url as None, this is the default file
+            target to use
+        ref_storage_args : dict
+            If references is a str, use these kwargs for loading the JSON file.
+            Deprecated: use target_options instead.
+        target_protocol : str
+            Used for loading the reference file, if it is a path. If None, protocol
+            will be derived from the given path
+        target_options : dict
+            Extra FS options for loading the reference file ``fo``, if given as a path
+        remote_protocol : str
+            The protocol of the filesystem on which the references will be evaluated
+            (unless fs is provided). If not given, will be derived from the first
+            URL that has a protocol in the templates or in the references, in that
+            order.
+        remote_options : dict
+            kwargs to go with remote_protocol
+        fs : AbstractFileSystem | dict(str, (AbstractFileSystem | dict))
+            Directly provide a file system(s):
+                - a single filesystem instance
+                - a dict of protocol:filesystem, where each value is either a filesystem
+                  instance, or a dict of kwargs that can be used to create in
+                  instance for the given protocol
+
+            If this is given, remote_options and remote_protocol are ignored.
+        template_overrides : dict
+            Swap out any templates in the references file with these - useful for
+            testing.
+        simple_templates: bool
+            Whether templates can be processed with simple replace (True) or if
+            jinja  is needed (False, much slower). All reference sets produced by
+            ``kerchunk`` are simple in this sense, but the spec allows for complex.
+        max_gap, max_block: int
+            For merging multiple concurrent requests to the same remote file.
+            Neighboring byte ranges will only be merged when their
+            inter-range gap is <= ``max_gap``. Default is 64KB. Set to 0
+            to only merge when it requires no extra bytes. Pass a negative
+            number to disable merging, appropriate for local target files.
+            Neighboring byte ranges will only be merged when the size of
+            the aggregated range is <= ``max_block``. Default is 256MB.
+        cache_size : int
+            Maximum size of LRU cache, where cache_size*record_size denotes
+            the total number of references that can be loaded in memory at once.
+            Only used for lazily loaded references.
+        kwargs : passed to parent class
+        """
+        super().__init__(**kwargs)
+        self.target = target
+        self.template_overrides = template_overrides
+        self.simple_templates = simple_templates
+        self.templates = {}
+        self.fss = {}
+        self._dircache = {}
+        self.max_gap = max_gap
+        self.max_block = max_block
+        if isinstance(fo, str):
+            dic = dict(
+                **(ref_storage_args or target_options or {}), protocol=target_protocol
+            )
+            ref_fs, fo2 = fsspec.core.url_to_fs(fo, **dic)
+            if ".json" not in fo2 and (
+                fo.endswith(("parq", "parquet", "/")) or ref_fs.isdir(fo2)
+            ):
+                # Lazy parquet refs
+                logger.info("Open lazy reference dict from URL %s", fo)
+                self.references = LazyReferenceMapper(
+                    fo2,
+                    fs=ref_fs,
+                    cache_size=cache_size,
+                )
+            else:
+                # text JSON
+                with fsspec.open(fo, "rb", **dic) as f:
+                    logger.info("Read reference from URL %s", fo)
+                    text = json.load(f)
+                self._process_references(text, template_overrides)
+        else:
+            # dictionaries
+            self._process_references(fo, template_overrides)
+        if isinstance(fs, dict):
+            self.fss = {
+                k: (
+                    fsspec.filesystem(k.split(":", 1)[0], **opts)
+                    if isinstance(opts, dict)
+                    else opts
+                )
+                for k, opts in fs.items()
+            }
+            if None not in self.fss:
+                self.fss[None] = filesystem("file")
+            return
+        if fs is not None:
+            # single remote FS
+            remote_protocol = (
+                fs.protocol[0] if isinstance(fs.protocol, tuple) else fs.protocol
+            )
+            self.fss[remote_protocol] = fs
+
+        if remote_protocol is None:
+            # get single protocol from any templates
+            for ref in self.templates.values():
+                if callable(ref):
+                    ref = ref()
+                protocol, _ = fsspec.core.split_protocol(ref)
+                if protocol and protocol not in self.fss:
+                    fs = filesystem(protocol, **(remote_options or {}))
+                    self.fss[protocol] = fs
+        if remote_protocol is None:
+            # get single protocol from references
+            # TODO: warning here, since this can be very expensive?
+            for ref in self.references.values():
+                if callable(ref):
+                    ref = ref()
+                if isinstance(ref, list) and ref[0]:
+                    protocol, _ = fsspec.core.split_protocol(ref[0])
+                    if protocol not in self.fss:
+                        fs = filesystem(protocol, **(remote_options or {}))
+                        self.fss[protocol] = fs
+                        # only use first remote URL
+                        break
+
+        if remote_protocol and remote_protocol not in self.fss:
+            fs = filesystem(remote_protocol, **(remote_options or {}))
+            self.fss[remote_protocol] = fs
+
+        self.fss[None] = fs or filesystem("file")  # default one
+        # Wrap any non-async filesystems to ensure async methods are available below
+        for k, f in self.fss.items():
+            if not f.async_impl:
+                self.fss[k] = AsyncFileSystemWrapper(f, asynchronous=self.asynchronous)
+            elif self.asynchronous ^ f.asynchronous:
+                raise ValueError(
+                    "Reference-FS's target filesystem must have same value "
+                    "of asynchronous"
+                )
+
+    def _cat_common(self, path, start=None, end=None):
+        path = self._strip_protocol(path)
+        logger.debug(f"cat: {path}")
+        try:
+            part = self.references[path]
+        except KeyError as exc:
+            raise FileNotFoundError(path) from exc
+        if isinstance(part, str):
+            part = part.encode()
+        if hasattr(part, "to_bytes"):
+            part = part.to_bytes()
+        if isinstance(part, bytes):
+            logger.debug(f"Reference: {path}, type bytes")
+            if part.startswith(b"base64:"):
+                part = base64.b64decode(part[7:])
+            return part, None, None
+
+        if len(part) == 1:
+            logger.debug(f"Reference: {path}, whole file => {part}")
+            url = part[0]
+            start1, end1 = start, end
+        else:
+            url, start0, size = part
+            logger.debug(f"Reference: {path} => {url}, offset {start0}, size {size}")
+            end0 = start0 + size
+
+            if start is not None:
+                if start >= 0:
+                    start1 = start0 + start
+                else:
+                    start1 = end0 + start
+            else:
+                start1 = start0
+            if end is not None:
+                if end >= 0:
+                    end1 = start0 + end
+                else:
+                    end1 = end0 + end
+            else:
+                end1 = end0
+        if url is None:
+            url = self.target
+        return url, start1, end1
+
+    async def _cat_file(self, path, start=None, end=None, **kwargs):
+        part_or_url, start0, end0 = self._cat_common(path, start=start, end=end)
+        if isinstance(part_or_url, bytes):
+            return part_or_url[start:end]
+        protocol, _ = split_protocol(part_or_url)
+        try:
+            return await self.fss[protocol]._cat_file(
+                part_or_url, start=start0, end=end0
+            )
+        except Exception as e:
+            raise ReferenceNotReachable(path, part_or_url) from e
+
+    def cat_file(self, path, start=None, end=None, **kwargs):
+        part_or_url, start0, end0 = self._cat_common(path, start=start, end=end)
+        if isinstance(part_or_url, bytes):
+            return part_or_url[start:end]
+        protocol, _ = split_protocol(part_or_url)
+        try:
+            return self.fss[protocol].cat_file(part_or_url, start=start0, end=end0)
+        except Exception as e:
+            raise ReferenceNotReachable(path, part_or_url) from e
+
+    def pipe_file(self, path, value, **_):
+        """Temporarily add binary data or reference as a file"""
+        self.references[path] = value
+
+    async def _get_file(self, rpath, lpath, **kwargs):
+        if self.isdir(rpath):
+            return os.makedirs(lpath, exist_ok=True)
+        data = await self._cat_file(rpath)
+        with open(lpath, "wb") as f:
+            f.write(data)
+
+    def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, **kwargs):
+        if self.isdir(rpath):
+            return os.makedirs(lpath, exist_ok=True)
+        data = self.cat_file(rpath, **kwargs)
+        callback.set_size(len(data))
+        if isfilelike(lpath):
+            lpath.write(data)
+        else:
+            with open(lpath, "wb") as f:
+                f.write(data)
+        callback.absolute_update(len(data))
+
+    def get(self, rpath, lpath, recursive=False, **kwargs):
+        if recursive:
+            # trigger directory build
+            self.ls("")
+        rpath = self.expand_path(rpath, recursive=recursive)
+        fs = fsspec.filesystem("file", auto_mkdir=True)
+        targets = other_paths(rpath, lpath)
+        if recursive:
+            data = self.cat([r for r in rpath if not self.isdir(r)])
+        else:
+            data = self.cat(rpath)
+        for remote, local in zip(rpath, targets):
+            if remote in data:
+                fs.pipe_file(local, data[remote])
+
+    def cat(self, path, recursive=False, on_error="raise", **kwargs):
+        """Fetch the contents of one or many references, merging nearby ranges.
+
+        Parameters
+        ----------
+        path : str or list of str
+            Reference key(s) to read. ``recursive=True`` is not supported, and
+            neither are glob characters (``*``) when a list is given.
+        recursive : bool
+            Must be falsy; a truthy value raises ``NotImplementedError``.
+        on_error : str
+            "raise" propagates the first failure; "omit" drops failed paths
+            from the result; any other value stores the exception object as
+            the value for that path.
+
+        Returns
+        -------
+        The single value when ``path`` is a string without ``*`` and exactly
+        one result was produced; otherwise a dict mapping path to bytes (or
+        to an exception, depending on ``on_error``).
+        """
+        if isinstance(path, str) and recursive:
+            raise NotImplementedError
+        if isinstance(path, list) and (recursive or any("*" in p for p in path)):
+            raise NotImplementedError
+        # TODO: if references is lazy, pre-fetch all paths in batch before access
+        # NOTE(review): _protocol_groups is defined elsewhere; presumably it
+        # maps each target protocol to the list of paths using it - confirm.
+        proto_dict = _protocol_groups(path, self.references)
+        out = {}
+        for proto, paths in proto_dict.items():
+            fs = self.fss[proto]
+            urls, starts, ends, valid_paths = [], [], [], []
+            for p in paths:
+                # find references or label not-found. Early exit if any not
+                # found and on_error is "raise"
+                try:
+                    u, s, e = self._cat_common(p)
+                    if not isinstance(u, (bytes, str)):
+                        # nan/None from parquet
+                        continue
+                except FileNotFoundError as err:
+                    if on_error == "raise":
+                        raise
+                    if on_error != "omit":
+                        out[p] = err
+                else:
+                    # only reached when the lookup succeeded and was not skipped
+                    urls.append(u)
+                    starts.append(s)
+                    ends.append(e)
+                    valid_paths.append(p)
+
+            # process references into form for merging
+            urls2 = []
+            starts2 = []
+            ends2 = []
+            paths2 = []
+            whole_files = set()
+            for u, s, e, p in zip(urls, starts, ends, valid_paths):
+                if isinstance(u, bytes):
+                    # data
+                    out[p] = u
+                elif s is None:
+                    # whole file - limits are None, None, but no further
+                    # entries take for this file
+                    whole_files.add(u)
+                    urls2.append(u)
+                    starts2.append(s)
+                    ends2.append(e)
+                    paths2.append(p)
+            for u, s, e, p in zip(urls, starts, ends, valid_paths):
+                # second run to account for files that are to be loaded whole
+                if s is not None and u not in whole_files:
+                    urls2.append(u)
+                    starts2.append(s)
+                    ends2.append(e)
+                    paths2.append(p)
+
+            # merge and fetch consolidated ranges
+            new_paths, new_starts, new_ends = merge_offset_ranges(
+                list(urls2),
+                list(starts2),
+                list(ends2),
+                sort=True,
+                max_gap=self.max_gap,
+                max_block=self.max_block,
+            )
+            bytes_out = fs.cat_ranges(new_paths, new_starts, new_ends)
+
+            # unbundle from merged bytes - simple approach
+            for u, s, e, p in zip(urls, starts, ends, valid_paths):
+                if p in out:
+                    continue  # was bytes, already handled
+                for np, ns, ne, b in zip(new_paths, new_starts, new_ends, bytes_out):
+                    if np == u and (ns is None or ne is None):
+                        # the target was fetched whole: slice with the
+                        # reference's own limits
+                        if isinstance(b, Exception):
+                            out[p] = b
+                        else:
+                            out[p] = b[s:e]
+                    elif np == u and s >= ns and e <= ne:
+                        # the reference lies inside this merged block
+                        if isinstance(b, Exception):
+                            out[p] = b
+                        else:
+                            # (e - ne) is <= 0; a zero must become None so the
+                            # slice runs to the end of the block
+                            out[p] = b[s - ns : (e - ne) or None]
+
+        for k, v in out.copy().items():
+            # these were valid references, but fetch failed, so transform exc
+            if isinstance(v, Exception) and k in self.references:
+                ex = out[k]
+                new_ex = ReferenceNotReachable(k, self.references[k])
+                new_ex.__cause__ = ex
+                if on_error == "raise":
+                    raise new_ex
+                elif on_error != "omit":
+                    out[k] = new_ex
+
+        if len(out) == 1 and isinstance(path, str) and "*" not in path:
+            return _first(out)
+        return out
+
+    def _process_references(self, references, template_overrides=None):
+        vers = references.get("version", None)
+        if vers is None:
+            self._process_references0(references)
+        elif vers == 1:
+            self._process_references1(references, template_overrides=template_overrides)
+        else:
+            raise ValueError(f"Unknown reference spec version: {vers}")
+        # TODO: we make dircache by iterating over all entries, but for Spec >= 1,
+        #  can replace with programmatic. Is it even needed for mapper interface?
+
+    def _process_references0(self, references):
+        """Make reference dict for Spec Version 0"""
+        if isinstance(references, dict):
+            # do not do this for lazy/parquet backend, which will not make dicts,
+            # but must remain writable in the original object
+            references = {
+                key: json.dumps(val) if isinstance(val, dict) else val
+                for key, val in references.items()
+            }
+        self.references = references
+
+    def _process_references1(self, references, template_overrides=None):
+        if not self.simple_templates or self.templates:
+            import jinja2
+        self.references = {}
+        self._process_templates(references.get("templates", {}))
+
+        @lru_cache(1000)
+        def _render_jinja(u):
+            return jinja2.Template(u).render(**self.templates)
+
+        for k, v in references.get("refs", {}).items():
+            if isinstance(v, str):
+                if v.startswith("base64:"):
+                    self.references[k] = base64.b64decode(v[7:])
+                self.references[k] = v
+            elif isinstance(v, dict):
+                self.references[k] = json.dumps(v)
+            elif self.templates:
+                u = v[0]
+                if "{{" in u:
+                    if self.simple_templates:
+                        u = (
+                            u.replace("{{", "{")
+                            .replace("}}", "}")
+                            .format(**self.templates)
+                        )
+                    else:
+                        u = _render_jinja(u)
+                self.references[k] = [u] if len(v) == 1 else [u, v[1], v[2]]
+            else:
+                self.references[k] = v
+        self.references.update(self._process_gen(references.get("gen", [])))
+
+    def _process_templates(self, tmp):
+        self.templates = {}
+        if self.template_overrides is not None:
+            tmp.update(self.template_overrides)
+        for k, v in tmp.items():
+            if "{{" in v:
+                import jinja2
+
+                self.templates[k] = lambda temp=v, **kwargs: jinja2.Template(
+                    temp
+                ).render(**kwargs)
+            else:
+                self.templates[k] = v
+
+    def _process_gen(self, gens):
+        out = {}
+        for gen in gens:
+            dimension = {
+                k: (
+                    v
+                    if isinstance(v, list)
+                    else range(v.get("start", 0), v["stop"], v.get("step", 1))
+                )
+                for k, v in gen["dimensions"].items()
+            }
+            products = (
+                dict(zip(dimension.keys(), values))
+                for values in itertools.product(*dimension.values())
+            )
+            for pr in products:
+                import jinja2
+
+                key = jinja2.Template(gen["key"]).render(**pr, **self.templates)
+                url = jinja2.Template(gen["url"]).render(**pr, **self.templates)
+                if ("offset" in gen) and ("length" in gen):
+                    offset = int(
+                        jinja2.Template(gen["offset"]).render(**pr, **self.templates)
+                    )
+                    length = int(
+                        jinja2.Template(gen["length"]).render(**pr, **self.templates)
+                    )
+                    out[key] = [url, offset, length]
+                elif ("offset" in gen) ^ ("length" in gen):
+                    raise ValueError(
+                        "Both 'offset' and 'length' are required for a "
+                        "reference generator entry if either is provided."
+                    )
+                else:
+                    out[key] = [url]
+        return out
+
+    def _dircache_from_items(self):
+        self.dircache = {"": []}
+        it = self.references.items()
+        for path, part in it:
+            if isinstance(part, (bytes, str)) or hasattr(part, "to_bytes"):
+                size = len(part)
+            elif len(part) == 1:
+                size = None
+            else:
+                _, _, size = part
+            par = path.rsplit("/", 1)[0] if "/" in path else ""
+            par0 = par
+            subdirs = [par0]
+            while par0 and par0 not in self.dircache:
+                # collect parent directories
+                par0 = self._parent(par0)
+                subdirs.append(par0)
+
+            subdirs.reverse()
+            for parent, child in zip(subdirs, subdirs[1:]):
+                # register newly discovered directories
+                assert child not in self.dircache
+                assert parent in self.dircache
+                self.dircache[parent].append(
+                    {"name": child, "type": "directory", "size": 0}
+                )
+                self.dircache[child] = []
+
+            self.dircache[par].append({"name": path, "type": "file", "size": size})
+
+    def _open(self, path, mode="rb", block_size=None, cache_options=None, **kwargs):
+        part_or_url, start0, end0 = self._cat_common(path)
+        # This logic is kept outside `ReferenceFile` to avoid unnecessary redirection.
+        # That does mean `_cat_common` gets called twice if it eventually reaches `ReferenceFile`.
+        if isinstance(part_or_url, bytes):
+            return io.BytesIO(part_or_url[start0:end0])
+
+        protocol, _ = split_protocol(part_or_url)
+        if start0 is None and end0 is None:
+            return self.fss[protocol]._open(
+                part_or_url,
+                mode,
+                block_size=block_size,
+                cache_options=cache_options,
+                **kwargs,
+            )
+
+        return ReferenceFile(
+            self,
+            path,
+            mode,
+            block_size=block_size,
+            cache_options=cache_options,
+            **kwargs,
+        )
+
+    def ls(self, path, detail=True, **kwargs):
+        logger.debug("list %s", path)
+        path = self._strip_protocol(path)
+        if isinstance(self.references, LazyReferenceMapper):
+            try:
+                return self.references.ls(path, detail)
+            except KeyError:
+                pass
+            raise FileNotFoundError(f"'{path}' is not a known key")
+        if not self.dircache:
+            self._dircache_from_items()
+        out = self._ls_from_cache(path)
+        if out is None:
+            raise FileNotFoundError(path)
+        if detail:
+            return out
+        return [o["name"] for o in out]
+
+    def exists(self, path, **kwargs):  # overwrite auto-sync version
+        return self.isdir(path) or self.isfile(path)
+
+    def isdir(self, path):  # overwrite auto-sync version
+        if self.dircache:
+            return path in self.dircache
+        elif isinstance(self.references, LazyReferenceMapper):
+            return path in self.references.listdir()
+        else:
+            # this may be faster than building dircache for single calls, but
+            # by looping will be slow for many calls; could cache it?
+            return any(_.startswith(f"{path}/") for _ in self.references)
+
+    def isfile(self, path):  # overwrite auto-sync version
+        return path in self.references
+
+    async def _ls(self, path, detail=True, **kwargs):  # calls fast sync code
+        return self.ls(path, detail, **kwargs)
+
+    def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
+        if withdirs:
+            return super().find(
+                path, maxdepth=maxdepth, withdirs=withdirs, detail=detail, **kwargs
+            )
+        if path:
+            path = self._strip_protocol(path)
+            r = sorted(k for k in self.references if k.startswith(path))
+        else:
+            r = sorted(self.references)
+        if detail:
+            if not self.dircache:
+                self._dircache_from_items()
+            return {k: self._ls_from_cache(k)[0] for k in r}
+        else:
+            return r
+
+    def info(self, path, **kwargs):
+        out = self.references.get(path)
+        if out is not None:
+            if isinstance(out, (str, bytes)):
+                # decode base64 here
+                return {"name": path, "type": "file", "size": len(out)}
+            elif len(out) > 1:
+                return {"name": path, "type": "file", "size": out[2]}
+            else:
+                out0 = [{"name": path, "type": "file", "size": None}]
+        else:
+            out = self.ls(path, True)
+            out0 = [o for o in out if o["name"] == path]
+            if not out0:
+                return {"name": path, "type": "directory", "size": 0}
+        if out0[0]["size"] is None:
+            # if this is a whole remote file, update size using remote FS
+            prot, _ = split_protocol(self.references[path][0])
+            out0[0]["size"] = self.fss[prot].size(self.references[path][0])
+        return out0[0]
+
+    async def _info(self, path, **kwargs):  # calls fast sync code
+        return self.info(path)
+
+    async def _rm_file(self, path, **kwargs):
+        self.references.pop(
+            path, None
+        )  # ignores FileNotFound, just as well for directories
+        self.dircache.clear()  # this is a bit heavy handed
+
+    async def _pipe_file(self, path, data, mode="overwrite", **kwargs):
+        if mode == "create" and self.exists(path):
+            raise FileExistsError
+        # can be str or bytes
+        self.references[path] = data
+        self.dircache.clear()  # this is a bit heavy handed
+
+    async def _put_file(self, lpath, rpath, mode="overwrite", **kwargs):
+        # puts binary
+        if mode == "create" and self.exists(rpath):
+            raise FileExistsError
+        with open(lpath, "rb") as f:
+            self.references[rpath] = f.read()
+        self.dircache.clear()  # this is a bit heavy handed
+
+    def save_json(self, url, **storage_options):
+        """Write modified references into new location"""
+        out = {}
+        for k, v in self.references.items():
+            if isinstance(v, bytes):
+                try:
+                    out[k] = v.decode("ascii")
+                except UnicodeDecodeError:
+                    out[k] = (b"base64:" + base64.b64encode(v)).decode()
+            else:
+                out[k] = v
+        with fsspec.open(url, "wb", **storage_options) as f:
+            f.write(json.dumps({"version": 1, "refs": out}).encode())
+
+
+class ReferenceFile(AbstractBufferedFile):
+    def __init__(
+        self,
+        fs,
+        path,
+        mode="rb",
+        block_size="default",
+        autocommit=True,
+        cache_type="readahead",
+        cache_options=None,
+        size=None,
+        **kwargs,
+    ):
+        super().__init__(
+            fs,
+            path,
+            mode=mode,
+            block_size=block_size,
+            autocommit=autocommit,
+            size=size,
+            cache_type=cache_type,
+            cache_options=cache_options,
+            **kwargs,
+        )
+        part_or_url, self.start, self.end = self.fs._cat_common(self.path)
+        protocol, _ = split_protocol(part_or_url)
+        self.src_fs = self.fs.fss[protocol]
+        self.src_path = part_or_url
+        self._f = None
+
+    @property
+    def f(self):
+        if self._f is None or self._f.closed:
+            self._f = self.src_fs._open(
+                self.src_path,
+                mode=self.mode,
+                block_size=self.blocksize,
+                autocommit=self.autocommit,
+                cache_type="none",
+                **self.kwargs,
+            )
+        return self._f
+
+    def close(self):
+        if self._f is not None:
+            self._f.close()
+        return super().close()
+
+    def _fetch_range(self, start, end):
+        start = start + self.start
+        end = min(end + self.start, self.end)
+        self.f.seek(start)
+        return self.f.read(end - start)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/sftp.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/sftp.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a6db5b569c6f5c4d154372e7fb3c02741388fad
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/sftp.py
@@ -0,0 +1,187 @@
+import datetime
+import logging
+import os
+import types
+import uuid
+from stat import S_ISDIR, S_ISLNK
+
+import paramiko
+
+from .. import AbstractFileSystem
+from ..utils import infer_storage_options
+
+logger = logging.getLogger("fsspec.sftp")
+
+
+class SFTPFileSystem(AbstractFileSystem):
+    """Files over SFTP/SSH
+
+    Peer-to-peer filesystem over SSH using paramiko.
+
+    Note: if using this with the ``open`` or ``open_files``, with full URLs,
+    there is no way to tell if a path is relative, so all paths are assumed
+    to be absolute.
+    """
+
+    protocol = "sftp", "ssh"
+
+    def __init__(self, host, **ssh_kwargs):
+        """
+
+        Parameters
+        ----------
+        host: str
+            Hostname or IP as a string
+        temppath: str
+            Location on the server to put files, when within a transaction
+        ssh_kwargs: dict
+            Parameters passed on to connection. See details in
+            https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect
+            May include port, username, password...
+        """
+        if self._cached:
+            return
+        super().__init__(**ssh_kwargs)
+        self.temppath = ssh_kwargs.pop("temppath", "/tmp")  # remote temp directory
+        self.host = host
+        self.ssh_kwargs = ssh_kwargs
+        self._connect()
+
+    def _connect(self):
+        logger.debug("Connecting to SFTP server %s", self.host)
+        self.client = paramiko.SSHClient()
+        self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+        self.client.connect(self.host, **self.ssh_kwargs)
+        self.ftp = self.client.open_sftp()
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        return infer_storage_options(path)["path"]
+
+    @staticmethod
+    def _get_kwargs_from_urls(urlpath):
+        out = infer_storage_options(urlpath)
+        out.pop("path", None)
+        out.pop("protocol", None)
+        return out
+
+    def mkdir(self, path, create_parents=True, mode=511):
+        path = self._strip_protocol(path)
+        logger.debug("Creating folder %s", path)
+        if self.exists(path):
+            raise FileExistsError(f"File exists: {path}")
+
+        if create_parents:
+            self.makedirs(path)
+        else:
+            self.ftp.mkdir(path, mode)
+
+    def makedirs(self, path, exist_ok=False, mode=511):
+        if self.exists(path) and not exist_ok:
+            raise FileExistsError(f"File exists: {path}")
+
+        parts = path.split("/")
+        new_path = "/" if path[:1] == "/" else ""
+
+        for part in parts:
+            if part:
+                new_path = f"{new_path}/{part}" if new_path else part
+                if not self.exists(new_path):
+                    self.ftp.mkdir(new_path, mode)
+
+    def rmdir(self, path):
+        path = self._strip_protocol(path)
+        logger.debug("Removing folder %s", path)
+        self.ftp.rmdir(path)
+
+    def info(self, path):
+        path = self._strip_protocol(path)
+        stat = self._decode_stat(self.ftp.stat(path))
+        stat["name"] = path
+        return stat
+
+    @staticmethod
+    def _decode_stat(stat, parent_path=None):
+        if S_ISDIR(stat.st_mode):
+            t = "directory"
+        elif S_ISLNK(stat.st_mode):
+            t = "link"
+        else:
+            t = "file"
+        out = {
+            "name": "",
+            "size": stat.st_size,
+            "type": t,
+            "uid": stat.st_uid,
+            "gid": stat.st_gid,
+            "time": datetime.datetime.fromtimestamp(
+                stat.st_atime, tz=datetime.timezone.utc
+            ),
+            "mtime": datetime.datetime.fromtimestamp(
+                stat.st_mtime, tz=datetime.timezone.utc
+            ),
+        }
+        if parent_path:
+            out["name"] = "/".join([parent_path.rstrip("/"), stat.filename])
+        return out
+
+    def ls(self, path, detail=False):
+        path = self._strip_protocol(path)
+        logger.debug("Listing folder %s", path)
+        stats = [self._decode_stat(stat, path) for stat in self.ftp.listdir_iter(path)]
+        if detail:
+            return stats
+        else:
+            paths = [stat["name"] for stat in stats]
+            return sorted(paths)
+
+    def put(self, lpath, rpath, callback=None, **kwargs):
+        rpath = self._strip_protocol(rpath)
+        logger.debug("Put file %s into %s", lpath, rpath)
+        self.ftp.put(lpath, rpath)
+
+    def get_file(self, rpath, lpath, **kwargs):
+        if self.isdir(rpath):
+            os.makedirs(lpath, exist_ok=True)
+        else:
+            self.ftp.get(self._strip_protocol(rpath), lpath)
+
+    def _open(self, path, mode="rb", block_size=None, **kwargs):
+        """
+        block_size: int or None
+            If 0, no buffering, if 1, line buffering, if >1, buffer that many
+            bytes, if None use default from paramiko.
+        """
+        logger.debug("Opening file %s", path)
+        if kwargs.get("autocommit", True) is False:
+            # writes to temporary file, move on commit
+            path2 = "/".join([self.temppath, str(uuid.uuid4())])
+            f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1)
+            f.temppath = path2
+            f.targetpath = path
+            f.fs = self
+            f.commit = types.MethodType(commit_a_file, f)
+            f.discard = types.MethodType(discard_a_file, f)
+        else:
+            f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1)
+        return f
+
+    def _rm(self, path):
+        if self.isdir(path):
+            self.ftp.rmdir(path)
+        else:
+            self.ftp.remove(path)
+
+    def mv(self, old, new):
+        new = self._strip_protocol(new)
+        old = self._strip_protocol(old)
+        logger.debug("Renaming %s into %s", old, new)
+        self.ftp.posix_rename(old, new)
+
+
+def commit_a_file(self):
+    self.fs.mv(self.temppath, self.targetpath)
+
+
+def discard_a_file(self):
+    self.fs._rm(self.temppath)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/smb.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/smb.py
new file mode 100644
index 0000000000000000000000000000000000000000..db6b3f5c3702de90cf121ccca49f3ca2b580df9f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/smb.py
@@ -0,0 +1,416 @@
+"""
+This module contains SMBFileSystem class responsible for handling access to
+Windows Samba network shares by using package smbprotocol
+"""
+
+import datetime
+import re
+import uuid
+from stat import S_ISDIR, S_ISLNK
+
+import smbclient
+import smbprotocol.exceptions
+
+from .. import AbstractFileSystem
+from ..utils import infer_storage_options
+
+# ! pylint: disable=bad-continuation
+
+
+class SMBFileSystem(AbstractFileSystem):
+    """Allow reading and writing to Windows and Samba network shares.
+
+    When using `fsspec.open()` for getting a file-like object the URI
+    should be specified as this format:
+    ``smb://workgroup;user:password@server:port/share/folder/file.csv``.
+
+    Example::
+
+        >>> import fsspec
+        >>> with fsspec.open(
+        ...     'smb://myuser:mypassword@myserver.com/' 'share/folder/file.csv'
+        ... ) as smbfile:
+        ...     df = pd.read_csv(smbfile, sep='|', header=None)
+
+    Note that you need to pass in a valid hostname or IP address for the host
+    component of the URL. Do not use the Windows/NetBIOS machine name for the
+    host component.
+
+    The first component of the path in the URL points to the name of the shared
+    folder. Subsequent path components will point to the directory/folder/file.
+
+    The URL components ``workgroup`` , ``user``, ``password`` and ``port`` may be
+    optional.
+
+    .. note::
+
+        For working this source require `smbprotocol`_ to be installed, e.g.::
+
+            $ pip install smbprotocol
+            # or
+            # pip install smbprotocol[kerberos]
+
+    .. _smbprotocol: https://github.com/jborean93/smbprotocol#requirements
+
+    Note: if using this with the ``open`` or ``open_files``, with full URLs,
+    there is no way to tell if a path is relative, so all paths are assumed
+    to be absolute.
+    """
+
+    protocol = "smb"
+
+    # pylint: disable=too-many-arguments
+    def __init__(
+        self,
+        host,
+        port=None,
+        username=None,
+        password=None,
+        timeout=60,
+        encrypt=None,
+        share_access=None,
+        register_session_retries=4,
+        register_session_retry_wait=1,
+        register_session_retry_factor=10,
+        auto_mkdir=False,
+        **kwargs,
+    ):
+        """
+        Store the connection settings and register a session with the server.
+
+        ``_get_kwargs_from_urls`` can derive some of these arguments from a
+        reasonable SMB url. Without username/password, authentication will be
+        anonymous or integrated.
+
+        Parameters
+        ----------
+        host: str
+            The remote server name/ip to connect to
+        port: int or None
+            Port to connect with. Usually 445, sometimes 139.
+        username: str or None
+            Username to connect with. Required if Kerberos auth is not being used.
+        password: str or None
+            User's password on the server, if using username
+        timeout: int
+            Connection timeout in seconds
+        encrypt: bool
+            Whether to force encryption or not, once this has been set to True
+            the session cannot be changed back to False.
+        share_access: str or None
+            Default access applied to file open operations performed with this
+            file system object; it controls whether other processes can
+            concurrently open a handle to the same file.
+
+            - None (the default): exclusively locks the file until closed.
+            - 'r': Allow other handles to be opened with read access.
+            - 'w': Allow other handles to be opened with write access.
+            - 'd': Allow other handles to be opened with delete access.
+        register_session_retries: int
+            Number of retries to register a session with the server.
+            Authentication errors are not retried, as they indicate invalid
+            credentials rather than network issues. With a negative value no
+            register attempt is performed at all.
+        register_session_retry_wait: int
+            Time in seconds to wait between each retry. Must be non-negative.
+        register_session_retry_factor: int
+            Base factor for the exponentially growing wait time between
+            retries. With factor=1 every wait equals
+            `register_session_retry_wait`. The last wait always equals
+            `register_session_retry_wait`, and for retries>1 the first wait
+            equals `register_session_retry_wait / factor`. Must be equal to or
+            greater than 1. Optimal factor is 10.
+        auto_mkdir: bool
+            Whether, when opening a file, the directory containing it should
+            be created (if it doesn't already exist). This is assumed by pyarrow
+            and zarr-python code.
+
+        Raises
+        ------
+        ValueError
+            If ``register_session_retry_wait`` is negative or
+            ``register_session_retry_factor`` is below 1.
+        """
+        super().__init__(**kwargs)
+        self.host, self.port = host, port
+        self.username, self.password = username, password
+        self.timeout, self.encrypt = timeout, encrypt
+        self.temppath = kwargs.pop("temppath", "")
+        self.share_access = share_access
+        self.register_session_retries = register_session_retries
+
+        if register_session_retry_wait < 0:
+            message = "register_session_retry_wait must be a non-negative integer"
+            raise ValueError(message)
+        self.register_session_retry_wait = register_session_retry_wait
+
+        if register_session_retry_factor < 1:
+            message = (
+                "register_session_retry_factor must be a positive integer "
+                "equal to or greater than 1"
+            )
+            raise ValueError(message)
+        self.register_session_retry_factor = register_session_retry_factor
+
+        self.auto_mkdir = auto_mkdir
+        self._connect()
+
+    @property
+    def _port(self):
+        """Port used for SMB calls: ``self.port``, or 445 when it is None."""
+        if self.port is not None:
+            return self.port
+        return 445
+
+    def _connect(self):
+        """Register an SMB session with the server, retrying on failure.
+
+        Up to ``register_session_retries + 1`` attempts are made, sleeping
+        between consecutive attempts. Nothing is done when
+        ``register_session_retries`` is negative.
+
+        Raises
+        ------
+        smbprotocol.exceptions.SMBAuthenticationError
+        smbprotocol.exceptions.LogonFailure
+            Re-raised immediately; authentication failures are never retried.
+        ValueError
+            Re-raised immediately unless its message contains ``[Errno -N]``.
+        Exception
+            The last retried error, once all attempts are exhausted.
+        """
+        import time
+
+        if self.register_session_retries <= -1:
+            return
+
+        retried_errors = []
+
+        wait_time = self.register_session_retry_wait
+        n_waits = (
+            self.register_session_retries - 1
+        )  # -1 = No wait time after the last retry
+        factor = self.register_session_retry_factor
+
+        def _wait_before_retry(n):
+            # With a single retry n_waits is 0: the only wait is also the last
+            # one, so it equals wait_time. Guarding here avoids dividing by
+            # zero, which would otherwise hide the real connection error.
+            exponent = n / n_waits - 1 if n_waits > 0 else 0
+            return factor**exponent * wait_time
+
+        # Generate wait times for each retry attempt.
+        # Wait times are calculated using exponential function. For factor=1 all wait times
+        # will be equal to `wait`. For any number of retries the last wait time will be
+        # equal to `wait` and for retries>=2 the first wait time will be equal to `wait / factor`.
+        wait_times = (_wait_before_retry(n) for n in range(n_waits + 1))
+
+        for attempt in range(self.register_session_retries + 1):
+            try:
+                smbclient.register_session(
+                    self.host,
+                    username=self.username,
+                    password=self.password,
+                    port=self._port,
+                    encrypt=self.encrypt,
+                    connection_timeout=self.timeout,
+                )
+                return
+            except (
+                smbprotocol.exceptions.SMBAuthenticationError,
+                smbprotocol.exceptions.LogonFailure,
+            ):
+                # These exceptions should not be repeated, as they clearly indicate
+                # that the credentials are invalid and not a network issue.
+                raise
+            except ValueError as exc:
+                if re.findall(r"\[Errno -\d+]", str(exc)):
+                    # This exception is raised by the smbprotocol.transport:Tcp.connect
+                    # and originates from socket.gaierror (OSError). These exceptions might
+                    # be raised due to network instability. We will retry to connect.
+                    retried_errors.append(exc)
+                else:
+                    # All other ValueError exceptions should be raised, as they are not
+                    # related to network issues.
+                    raise
+            except Exception as exc:
+                # Save the exception and retry to connect. This except might be dropped
+                # in the future, once all exceptions suited for retry are identified.
+                retried_errors.append(exc)
+
+            if attempt < self.register_session_retries:
+                time.sleep(next(wait_times))
+
+        # Raise last exception to inform user about the connection issues.
+        # Note: Should we use ExceptionGroup to raise all exceptions?
+        raise retried_errors[-1]
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        """Return only the ``path`` component that ``infer_storage_options`` extracts."""
+        options = infer_storage_options(path)
+        return options["path"]
+
+    @staticmethod
+    def _get_kwargs_from_urls(path):
+        """Return the inferred storage options of a URL minus ``path`` and ``protocol``."""
+        # smb://workgroup;user:password@host:port/share/folder/file.csv
+        options = infer_storage_options(path)
+        for unwanted in ("path", "protocol"):
+            options.pop(unwanted, None)
+        return options
+
+    def mkdir(self, path, create_parents=True, **kwargs):
+        """Create directory ``path`` on the share.
+
+        With ``create_parents`` the call goes to ``smbclient.makedirs`` (with
+        ``exist_ok=False``); otherwise to ``smbclient.mkdir``. Extra ``kwargs``
+        are forwarded to the smbclient call.
+        """
+        wpath = _as_unc_path(self.host, path)
+        if not create_parents:
+            smbclient.mkdir(wpath, port=self._port, **kwargs)
+            return
+        smbclient.makedirs(wpath, exist_ok=False, port=self._port, **kwargs)
+
+    def makedirs(self, path, exist_ok=False):
+        """Create ``path`` and any missing parents; a bare share path is a no-op."""
+        if not _share_has_path(path):
+            return
+        smbclient.makedirs(
+            _as_unc_path(self.host, path), exist_ok=exist_ok, port=self._port
+        )
+
+    def rmdir(self, path):
+        """Remove directory ``path``; a bare share path is a no-op."""
+        if not _share_has_path(path):
+            return
+        smbclient.rmdir(_as_unc_path(self.host, path), port=self._port)
+
+    def info(self, path, **kwargs):
+        """Return a details dict for ``path`` built from ``smbclient.stat``.
+
+        The entry type is "directory", "link" or "file"; directory names get a
+        trailing "/". Extra ``kwargs`` are forwarded to ``smbclient.stat``.
+        """
+        stats = smbclient.stat(
+            _as_unc_path(self.host, path), port=self._port, **kwargs
+        )
+        if S_ISDIR(stats.st_mode):
+            kind, name = "directory", path + "/"
+        else:
+            kind = "link" if S_ISLNK(stats.st_mode) else "file"
+            name = path
+        return {
+            "name": name,
+            "size": stats.st_size,
+            "type": kind,
+            "uid": stats.st_uid,
+            "gid": stats.st_gid,
+            "time": stats.st_atime,
+            "mtime": stats.st_mtime,
+        }
+
+    def created(self, path):
+        """Return ``st_ctime`` of ``path`` as a UTC-aware datetime.datetime"""
+        stats = smbclient.stat(_as_unc_path(self.host, path), port=self._port)
+        utc = datetime.timezone.utc
+        return datetime.datetime.fromtimestamp(stats.st_ctime, tz=utc)
+
+    def modified(self, path):
+        """Return ``st_mtime`` of ``path`` as a UTC-aware datetime.datetime"""
+        stats = smbclient.stat(_as_unc_path(self.host, path), port=self._port)
+        utc = datetime.timezone.utc
+        return datetime.datetime.fromtimestamp(stats.st_mtime, tz=utc)
+
+    def ls(self, path, detail=True, **kwargs):
+        """List the entries of directory ``path``.
+
+        Returns full child paths, or one ``info`` dict per child when
+        ``detail`` is true. Extra ``kwargs`` go to ``smbclient.listdir``.
+        """
+        entries = smbclient.listdir(
+            _as_unc_path(self.host, path), port=self._port, **kwargs
+        )
+        base = path.rstrip("/")
+        children = [base + "/" + entry for entry in entries]
+        if not detail:
+            return children
+        return [self.info(child) for child in children]
+
+    # pylint: disable=too-many-arguments
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=-1,
+        autocommit=True,
+        cache_options=None,
+        **kwargs,
+    ):
+        """Open ``path`` on the share and return a file-like object.
+
+        Parameters
+        ----------
+        path: str
+            Path of the form ``/share/folder/file``.
+        mode: str
+            Mode passed to ``smbclient.open_file``.
+        block_size: int or None
+            If 0, no buffering, 1, line buffering, >1, buffer that many bytes.
+            ``None`` or a negative value is passed on as -1.
+        autocommit: bool
+            If False and ``mode`` contains "w", data is written to a temporary
+            file and an ``SMBFileOpener`` is returned, whose ``commit`` /
+            ``discard`` methods finish or abort the write.
+        cache_options: dict or None
+            Accepted but not used by this method.
+
+        Notes
+        -----
+        By specifying 'share_access' in 'kwargs' it is possible to override the
+        default shared access setting applied in the constructor of this object.
+        """
+        if self.auto_mkdir and "w" in mode:
+            self.makedirs(self._parent(path), exist_ok=True)
+        bls = block_size if block_size is not None and block_size >= 0 else -1
+        wpath = _as_unc_path(self.host, path)
+        share_access = kwargs.pop("share_access", self.share_access)
+        if "w" in mode and autocommit is False:
+            temp = _as_temp_path(self.host, path, self.temppath)
+            # share_access was popped from kwargs above, so it has to be
+            # passed on explicitly; otherwise it is silently dropped for
+            # transactional writes.
+            return SMBFileOpener(
+                wpath,
+                temp,
+                mode,
+                port=self._port,
+                block_size=bls,
+                share_access=share_access,
+                **kwargs,
+            )
+        return smbclient.open_file(
+            wpath,
+            mode,
+            buffering=bls,
+            share_access=share_access,
+            port=self._port,
+            **kwargs,
+        )
+
+    def copy(self, path1, path2, **kwargs):
+        """Copy ``path1`` to ``path2`` within the same filesystem.
+
+        With ``auto_mkdir`` set, the parent directory of ``path2`` is created
+        first. Extra ``kwargs`` are forwarded to ``smbclient.copyfile``.
+        """
+        source = _as_unc_path(self.host, path1)
+        destination = _as_unc_path(self.host, path2)
+        if self.auto_mkdir:
+            parent = self._parent(path2)
+            self.makedirs(parent, exist_ok=True)
+        smbclient.copyfile(source, destination, port=self._port, **kwargs)
+
+    def _rm(self, path):
+        """Delete ``path`` (directory or file); a bare share path is a no-op."""
+        if not _share_has_path(path):
+            return
+        wpath = _as_unc_path(self.host, path)
+        stats = smbclient.stat(wpath, port=self._port)
+        delete = smbclient.rmdir if S_ISDIR(stats.st_mode) else smbclient.remove
+        delete(wpath, port=self._port)
+
+    def mv(self, path1, path2, recursive=None, maxdepth=None, **kwargs):
+        """Rename ``path1`` to ``path2`` with ``smbclient.rename``.
+
+        ``recursive`` and ``maxdepth`` are accepted but not used here; extra
+        ``kwargs`` are forwarded to ``smbclient.rename``.
+        """
+        source = _as_unc_path(self.host, path1)
+        destination = _as_unc_path(self.host, path2)
+        smbclient.rename(source, destination, port=self._port, **kwargs)
+
+
+def _as_unc_path(host, path):
+    """Build ``\\\\host\\share\\...`` from ``host`` and a "/"-separated path."""
+    backslashed = path.replace("/", "\\")
+    return "\\\\" + f"{host}{backslashed}"
+
+
+def _as_temp_path(host, path, temppath):
+    """Return a UNC path for a uniquely named temp file on the share of ``path``.
+
+    The share is the component after the first "/" of ``path``; the file is
+    placed under ``temppath`` on that share and named with a random UUID.
+    """
+    share = path.split("/")[1]
+    unique_name = uuid.uuid4()
+    return _as_unc_path(host, f"/{share}{temppath}/{unique_name}")
+
+
+def _share_has_path(path):
+    """Tell whether ``path`` has more "/" separators than a bare share.
+
+    A trailing "/" raises the required separator count from two to three.
+    """
+    minimum = 2 if path.endswith("/") else 1
+    return path.count("/") > minimum
+
+
+class SMBFileOpener:
+    """File-like wrapper that writes to a remote temporary file.
+
+    Data goes to ``temp``; ``commit`` replaces ``path`` with it and ``discard``
+    removes it. Attributes not defined here are looked up on the underlying
+    smbclient file object.
+    """
+
+    def __init__(self, path, temp, mode, port=445, block_size=-1, **kwargs):
+        # Set before _open() runs, which checks it to decide whether to open.
+        self.smbfile = None
+        self._incontext = False
+        self.path, self.temp = path, temp
+        self.mode, self.port = mode, port
+        self.block_size = block_size
+        self.kwargs = kwargs
+        self._open()
+
+    def _open(self):
+        """Open the temporary file unless an open handle already exists."""
+        if self.smbfile is not None and not self.smbfile.closed:
+            return
+        self.smbfile = smbclient.open_file(
+            self.temp,
+            self.mode,
+            port=self.port,
+            buffering=self.block_size,
+            **self.kwargs,
+        )
+
+    def commit(self):
+        """Replace the definitive path with the temp file."""
+        # TODO: use transaction support in SMB protocol
+        smbclient.replace(self.temp, self.path, port=self.port)
+
+    def discard(self):
+        """Delete the temp file."""
+        smbclient.remove(self.temp, port=self.port)
+
+    def __fspath__(self):
+        return self.path
+
+    def __iter__(self):
+        wrapped = self.smbfile
+        return wrapped.__iter__()
+
+    def __getattr__(self, item):
+        # Only reached when normal attribute lookup on this object fails.
+        wrapped = self.smbfile
+        return getattr(wrapped, item)
+
+    def __enter__(self):
+        self._incontext = True
+        wrapped = self.smbfile
+        return wrapped.__enter__()
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self._incontext = False
+        wrapped = self.smbfile
+        wrapped.__exit__(exc_type, exc_value, traceback)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/tar.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/tar.py
new file mode 100644
index 0000000000000000000000000000000000000000..412e5ba4d2cdea7db090dc96412e697909a38d78
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/tar.py
@@ -0,0 +1,124 @@
+import logging
+import tarfile
+
+import fsspec
+from fsspec.archive import AbstractArchiveFileSystem
+from fsspec.compression import compr
+from fsspec.utils import infer_compression
+
+typemap = {b"0": "file", b"5": "directory"}
+
+logger = logging.getLogger("tar")
+
+
+class TarFileSystem(AbstractArchiveFileSystem):
+    """Compressed Tar archives as a file-system (read-only)
+
+    Supports the following formats:
+    tar.gz, tar.bz2, tar.xz
+    """
+
+    root_marker = ""
+    protocol = "tar"
+    cachable = False
+
+    def __init__(
+        self,
+        fo="",
+        index_store=None,
+        target_options=None,
+        target_protocol=None,
+        compression=None,
+        **kwargs,
+    ):
+        """Open a tar archive, optionally compressed, and index its members.
+
+        Parameters
+        ----------
+        fo: str or file-like
+            A string is opened with ``fsspec.open`` using ``target_protocol``
+            and ``target_options``; anything else is used as given.
+        index_store:
+            Stored on the instance; not otherwise used by this method.
+        target_options: dict or None
+            Keyword arguments for ``fsspec.open`` when ``fo`` is a string.
+        target_protocol: str or None
+            Protocol for ``fsspec.open`` when ``fo`` is a string.
+        compression: str or None
+            Key into ``fsspec.compression.compr``. When None, it is inferred
+            from the file name if one can be found on ``fo``.
+        """
+        super().__init__(**kwargs)
+        if not target_options:
+            target_options = {}
+
+        if isinstance(fo, str):
+            self.of = fsspec.open(fo, protocol=target_protocol, **target_options)
+            fo = self.of.open()  # keep the reference
+
+        # Try to infer compression.
+        if compression is None:
+            name = None
+
+            # Look for a file name on `fo`, in order of preference:
+            #   "original" - "protocol extension" where original filenames are
+            #                propagated to archive-like filesystems
+            #   "path"     - e.g. fsspec.LocalFileOpener
+            #   "name"     - e.g. io.BufferedReader
+            # and finally fall back to `fo.info()["name"]`.
+            try:
+                for attribute in ("original", "path", "name"):
+                    if hasattr(fo, attribute):
+                        name = getattr(fo, attribute)
+                        break
+                else:
+                    if hasattr(fo, "info"):
+                        name = fo.info()["name"]
+            except Exception as ex:
+                logger.warning(
+                    f"Unable to determine file name, not inferring compression: {ex}"
+                )
+
+            if name is not None:
+                compression = infer_compression(name)
+                logger.info(f"Inferred compression {compression} from file name {name}")
+
+        if compression is not None:
+            # TODO: tarfile already implements compression with modes like "'r:gz'",
+            #  but then would seek to offset in the file work?
+            decompress = compr[compression]
+            fo = decompress(fo)
+
+        self._fo_ref = self.fo = fo  # the whole instance is a context
+        self.tar = tarfile.TarFile(fileobj=self.fo)
+        self.dir_cache = None
+
+        self.index_store = index_store
+        self.index = None
+        self._index()
+
+    def _index(self):
+        """Build ``self.index``: member name -> (info dict, data offset).
+
+        Names are keyed without trailing "/"; member types are translated
+        through ``typemap``, defaulting to "file".
+        """
+        # TODO: load and set saved index, if exists
+        index = {}
+        for member in self.tar:
+            details = member.get_info()
+            details["type"] = typemap.get(details["type"], "file")
+            key = details["name"].rstrip("/")
+            index[key] = (details, member.offset_data)
+
+        self.index = index
+        # TODO: save index to self.index_store here, if set
+
+    def _get_dirs(self):
+        """Fill ``self.dir_cache`` once with directory and member entries."""
+        if self.dir_cache is not None:
+            return
+
+        # This enables ls to get directories as children as well as files
+        self.dir_cache = cache = {
+            dirname: {"name": dirname, "size": 0, "type": "directory"}
+            for dirname in self._all_dirnames(self.tar.getnames())
+        }
+        # Real archive members override the synthetic directory entries.
+        for member in self.tar.getmembers():
+            details = member.get_info()
+            stripped = details["name"].rstrip("/")
+            details["name"] = stripped
+            details["type"] = typemap.get(details["type"], "file")
+            cache[stripped] = details
+
+    def _open(self, path, mode="rb", **kwargs):
+        """Return a readable file object for the regular file ``path``.
+
+        Raises ``ValueError`` for any mode other than "rb" and for members
+        that are not regular files. ``path`` is looked up in ``self.index``.
+        """
+        if mode != "rb":
+            raise ValueError("Read-only filesystem implementation")
+        details, _ = self.index[path]
+        if details["type"] != "file":
+            raise ValueError("Can only handle regular files")
+        return self.tar.extractfile(path)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/webhdfs.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/webhdfs.py
new file mode 100644
index 0000000000000000000000000000000000000000..12cb23d1d48451c84349a3721ee1707a7edeed25
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/webhdfs.py
@@ -0,0 +1,485 @@
+# https://hadoop.apache.org/docs/r1.0.4/webhdfs.html
+
+import logging
+import os
+import secrets
+import shutil
+import tempfile
+import uuid
+from contextlib import suppress
+from urllib.parse import quote
+
+import requests
+
+from ..spec import AbstractBufferedFile, AbstractFileSystem
+from ..utils import infer_storage_options, tokenize
+
+logger = logging.getLogger("webhdfs")
+
+
+class WebHDFS(AbstractFileSystem):
+    """
+    Interface to HDFS over HTTP using the WebHDFS API. Supports also HttpFS gateways.
+
+    Four auth mechanisms are supported:
+
+    insecure: no auth is done, and the user is assumed to be whoever they
+        say they are (parameter ``user``), or a predefined value such as
+        "dr.who" if not given
+    spnego: when kerberos authentication is enabled, auth is negotiated by
+        requests_kerberos https://github.com/requests/requests-kerberos .
+        This establishes a session based on existing kinit login and/or
+        specified principal/password; parameters are passed with ``kerb_kwargs``
+    token: uses an existing Hadoop delegation token from another secured
+        service. Indeed, this client can also generate such tokens when
+        not insecure. Note that tokens expire, but can be renewed (by a
+        previously specified user) and may allow for proxying.
+    basic-auth: used when both parameter ``user`` and parameter ``password``
+        are provided.
+
+    """
+
+    tempdir = str(tempfile.gettempdir())
+    protocol = "webhdfs", "webHDFS"
+
+    def __init__(
+        self,
+        host,
+        port=50070,
+        kerberos=False,
+        token=None,
+        user=None,
+        password=None,
+        proxy_to=None,
+        kerb_kwargs=None,
+        data_proxy=None,
+        use_https=False,
+        session_cert=None,
+        session_verify=True,
+        **kwargs,
+    ):
+        """
+        Parameters
+        ----------
+        host: str
+            Name-node address
+        port: int
+            Port for webHDFS
+        kerberos: bool
+            Whether to authenticate with kerberos for this connection
+        token: str or None
+            If given, use this token on every call to authenticate. A user
+            and user-proxy may be encoded in the token and should not be also
+            given
+        user: str or None
+            If given, assert the user name to connect with
+        password: str or None
+            If given, assert the password to use for basic auth. If password
+            is provided, user must be provided also
+        proxy_to: str or None
+            If given, the user has the authority to proxy, and this value is
+            the user in whose name actions are taken
+        kerb_kwargs: dict
+            Any extra arguments for HTTPKerberosAuth, see
+            `<https://github.com/requests/requests-kerberos/blob/master/requests_kerberos/kerberos_.py>`_
+        data_proxy: dict, callable or None
+            If given, map data-node addresses. This can be necessary if the
+            HDFS cluster is behind a proxy, running on Docker or otherwise has
+            a mismatch between the host-names given by the name-node and the
+            address by which to refer to them from the client. If a dict,
+            maps host names ``host->data_proxy[host]``; if a callable, full
+            URLs are passed, and function must conform to
+            ``url->data_proxy(url)``.
+        use_https: bool
+            Whether to connect to the Name-node using HTTPS instead of HTTP
+        session_cert: str or Tuple[str, str] or None
+            Path to a certificate file, or tuple of (cert, key) files to use
+            for the requests.Session
+        session_verify: str, bool or None
+            Path to a certificate file to use for verifying the requests.Session.
+        kwargs
+
+        Raises
+        ------
+        ValueError
+            If ``token`` is combined with ``user`` or ``proxy_to``, if
+            ``password`` is given without ``user``, or if ``kerberos`` is
+            combined with ``user``.
+        """
+        if self._cached:
+            return
+        super().__init__(**kwargs)
+        self.url = f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1"
+        self.kerb = kerberos
+        self.kerb_kwargs = kerb_kwargs or {}
+        # Query parameters added to every request made by _call.
+        self.pars = {}
+        self.proxy = data_proxy or {}
+        if token is not None:
+            if user is not None or proxy_to is not None:
+                raise ValueError(
+                    "If passing a delegation token, must not set "
+                    "user or proxy_to, as these are encoded in the"
+                    " token"
+                )
+            self.pars["delegation"] = token
+        self.user = user
+        self.password = password
+
+        if password is not None:
+            if user is None:
+                # The trailing space after "be" matters: the two literals are
+                # concatenated, and without it the message read "beset".
+                raise ValueError(
+                    "If passing a password, the user must also be "
+                    "set in order to set up the basic-auth"
+                )
+        else:
+            # Without a password the user name is sent as a query parameter;
+            # with one, _connect sets up basic auth instead.
+            if user is not None:
+                self.pars["user.name"] = user
+
+        if proxy_to is not None:
+            self.pars["doas"] = proxy_to
+        if kerberos and user is not None:
+            raise ValueError(
+                "If using Kerberos auth, do not specify the "
+                "user, this is handled by kinit."
+            )
+
+        self.session_cert = session_cert
+        self.session_verify = session_verify
+
+        self._connect()
+
+        self._fsid = f"webhdfs_{tokenize(host, port)}"
+
+    @property
+    def fsid(self):
+        """Identifier stored in ``self._fsid`` by ``__init__``."""
+        identifier = self._fsid
+        return identifier
+
+    def _connect(self):
+        """Create ``self.session`` and configure its certificates and auth.
+
+        Kerberos auth is installed when ``self.kerb`` is set; basic auth is
+        installed afterwards when both user and password are present.
+        """
+        session = self.session = requests.Session()
+
+        if self.session_cert:
+            session.cert = self.session_cert
+
+        session.verify = self.session_verify
+
+        if self.kerb:
+            from requests_kerberos import HTTPKerberosAuth
+
+            session.auth = HTTPKerberosAuth(**self.kerb_kwargs)
+
+        if not (self.user is None or self.password is None):
+            from requests.auth import HTTPBasicAuth
+
+            session.auth = HTTPBasicAuth(self.user, self.password)
+
+    def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs):
+        """Send one WebHDFS operation and return the response.
+
+        ``kwargs`` become query parameters, overridden by ``self.pars`` and
+        the upper-cased ``op``. For status 400/401/403/404/500 responses that
+        carry a JSON ``RemoteException``, a matching Python exception is
+        raised (``ValueError``, ``PermissionError``, ``FileNotFoundError``,
+        else ``RuntimeError``); any other error status is raised by
+        ``raise_for_status``.
+        """
+        stripped = "" if path is None else self._strip_protocol(path)
+        url = self._apply_proxy(self.url + quote(stripped, safe="/="))
+        params = {**kwargs, **self.pars, "op": op.upper()}
+        logger.debug("sending %s with %s", url, method)
+        out = self.session.request(
+            method=method.upper(),
+            url=url,
+            params=params,
+            data=data,
+            allow_redirects=redirect,
+        )
+        if out.status_code in (400, 401, 403, 404, 500):
+            try:
+                remote = out.json()["RemoteException"]
+                msg = remote["message"]
+                exp = remote["exception"]
+            except (ValueError, KeyError):
+                # Body is not the expected JSON; defer to raise_for_status.
+                pass
+            else:
+                if exp in ("IllegalArgumentException", "UnsupportedOperationException"):
+                    raise ValueError(msg)
+                if exp in ("SecurityException", "AccessControlException"):
+                    raise PermissionError(msg)
+                if exp == "FileNotFoundException":
+                    raise FileNotFoundError(msg)
+                raise RuntimeError(msg)
+        out.raise_for_status()
+        return out
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=True,
+        replication=None,
+        permissions=None,
+        **kwargs,
+    ):
+        """
+        Create a ``WebHDFile`` for ``path``.
+
+        Parameters
+        ----------
+        path: str
+            File location
+        mode: str
+            'rb', 'wb', etc.
+        block_size: int
+            Client buffer size for read-ahead or write buffer; a falsy value
+            selects ``self.blocksize``
+        autocommit: bool
+            If False, writes to temporary file that only gets put in final
+            location upon commit
+        replication: int
+            Number of copies of file on the cluster, write mode only
+        permissions: str or int
+            posix permissions, write mode only
+        kwargs
+            Accepted but not forwarded to the file object.
+
+        Returns
+        -------
+        WebHDFile instance
+        """
+        if not block_size:
+            block_size = self.blocksize
+        options = {
+            "mode": mode,
+            "block_size": block_size,
+            "tempdir": self.tempdir,
+            "autocommit": autocommit,
+            "replication": replication,
+            "permissions": permissions,
+        }
+        return WebHDFile(self, path, **options)
+
+    @staticmethod
+    def _process_info(info):
+        """Lower-case ``info["type"]`` and copy "length" to "size", in place."""
+        lowered = info["type"].lower()
+        info["type"] = lowered
+        info["size"] = info["length"]
+        return info
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        """Return only the ``path`` component that ``infer_storage_options`` extracts."""
+        options = infer_storage_options(path)
+        return options["path"]
+
+    @staticmethod
+    def _get_kwargs_from_urls(urlpath):
+        """Return the inferred storage options of a URL as constructor kwargs.
+
+        ``path`` and ``protocol`` are dropped and ``username`` is renamed to
+        ``user``.
+        """
+        options = infer_storage_options(urlpath)
+        for unwanted in ("path", "protocol"):
+            options.pop(unwanted, None)
+        if "username" in options:
+            options["user"] = options.pop("username")
+        return options
+
+    def info(self, path):
+        """Return the processed ``FileStatus`` of ``path``, with "name" set to ``path``."""
+        response = self._call("GETFILESTATUS", path=path)
+        status = response.json()["FileStatus"]
+        status["name"] = path
+        return self._process_info(status)
+
+    def ls(self, path, detail=False, **kwargs):
+        """List ``path``, sorted by name.
+
+        Returns the processed status dicts when ``detail`` is true, otherwise
+        just the names.
+        """
+        response = self._call("LISTSTATUS", path=path)
+        infos = response.json()["FileStatuses"]["FileStatus"]
+        prefix = path.rstrip("/") + "/"
+        for info in infos:
+            self._process_info(info)
+            info["name"] = prefix + info["pathSuffix"]
+        if not detail:
+            return sorted(info["name"] for info in infos)
+        return sorted(infos, key=lambda i: i["name"])
+
+    def content_summary(self, path):
+        """Total numbers of files, directories and bytes under path"""
+        response = self._call("GETCONTENTSUMMARY", path=path)
+        payload = response.json()
+        return payload["ContentSummary"]
+
+    def ukey(self, path):
+        """Checksum info of file, giving method and result"""
+        out = self._call("GETFILECHECKSUM", path=path, redirect=False)
+        if "Location" not in out.headers:
+            out.raise_for_status()
+            return out.json()["FileChecksum"]
+        # Follow the redirect by hand so the proxy mapping is applied to it.
+        location = self._apply_proxy(out.headers["Location"])
+        followed = self.session.get(location)
+        followed.raise_for_status()
+        return followed.json()["FileChecksum"]
+
+    def home_directory(self):
+        """Get user's home directory"""
+        response = self._call("GETHOMEDIRECTORY")
+        payload = response.json()
+        return payload["Path"]
+
+    def get_delegation_token(self, renewer=None):
+        """Retrieve token which can give the same authority to other users
+
+        Parameters
+        ----------
+        renewer: str or None
+            User who may use this token; if None, will be current user
+
+        Raises
+        ------
+        ValueError
+            If the server returns no token.
+        """
+        params = {"renewer": renewer} if renewer else {}
+        response = self._call("GETDELEGATIONTOKEN", **params)
+        token = response.json()["Token"]
+        if token is None:
+            raise ValueError("No token available for this user/security context")
+        return token["urlString"]
+
+    def renew_delegation_token(self, token):
+        """Make token live longer. Returns new expiry time"""
+        response = self._call("RENEWDELEGATIONTOKEN", method="put", token=token)
+        payload = response.json()
+        return payload["long"]
+
+    def cancel_delegation_token(self, token):
+        """Stop ``token`` from being useful (CANCELDELEGATIONTOKEN); returns None"""
+        self._call("CANCELDELEGATIONTOKEN", method="put", token=token)
+
+    def chmod(self, path, mod):
+        """Set the permission at path
+
+        Parameters
+        ----------
+        path: str
+            location to set (file or directory)
+        mod: str or int
+            posix representation of permission, given as oct string, e.g, '777'
+            or 0o777; the value is forwarded to the call unchanged
+        """
+        self._call("SETPERMISSION", method="put", path=path, permission=mod)
+
+    def chown(self, path, owner=None, group=None):
+        """Change owning user and/or group"""
+        kwargs = {}
+        if owner is not None:
+            kwargs["owner"] = owner
+        if group is not None:
+            kwargs["group"] = group
+        self._call("SETOWNER", method="put", path=path, **kwargs)
+
+    def set_replication(self, path, replication):
+        """
+        Set the replication factor of a file with a SETREPLICATION call
+
+        Parameters
+        ----------
+        path: str
+            File location (not for directories)
+        replication: int
+            Number of copies of file on the cluster. Should be smaller than
+            number of data nodes; normally 3 on most systems.
+        """
+        self._call("SETREPLICATION", path=path, method="put", replication=replication)
+
+    def mkdir(self, path, **kwargs):  # extra kwargs are accepted but not used
+        self._call("MKDIRS", method="put", path=path)  # issue MKDIRS for path
+
+    def makedirs(self, path, exist_ok=False):
+        if exist_ok is False and self.exists(path):  # strict: literal False only
+            raise FileExistsError(path)
+        self.mkdir(path)  # creation itself is delegated to mkdir
+
+    def mv(self, path1, path2, **kwargs):  # extra kwargs are accepted but not used
+        self._call("RENAME", method="put", path=path1, destination=path2)
+
+    def rm(self, path, recursive=False, **kwargs):
+        self._call(
+            "DELETE",
+            method="delete",
+            path=path,
+            recursive="true" if recursive else "false",
+        )
+
+    def rm_file(self, path, **kwargs):  # extra kwargs are accepted but not used
+        self.rm(path)  # rm is called with its default recursive=False
+
+    def cp_file(self, lpath, rpath, **kwargs):
+        with self.open(lpath) as lstream:
+            tmp_fname = "/".join([self._parent(rpath), f".tmp.{secrets.token_hex(16)}"])
+            # Perform an atomic copy (stream to a temporary file and
+            # move it to the actual destination).
+            try:
+                with self.open(tmp_fname, "wb") as rstream:
+                    shutil.copyfileobj(lstream, rstream)
+                self.mv(tmp_fname, rpath)
+            except BaseException:
+                with suppress(FileNotFoundError):
+                    self.rm(tmp_fname)
+                raise
+
+    def _apply_proxy(self, location):
+        if self.proxy and callable(self.proxy):
+            location = self.proxy(location)
+        elif self.proxy:
+            # as a dict
+            for k, v in self.proxy.items():
+                location = location.replace(k, v, 1)
+        return location
+
+
+class WebHDFile(AbstractBufferedFile):
+    """A file living in HDFS over webHDFS"""
+
+    def __init__(self, fs, path, **kwargs):
+        super().__init__(fs, path, **kwargs)
+        kwargs = kwargs.copy()
+        if kwargs.get("permissions", None) is None:
+            kwargs.pop("permissions", None)
+        if kwargs.get("replication", None) is None:
+            kwargs.pop("replication", None)
+        self.permissions = kwargs.pop("permissions", 511)
+        tempdir = kwargs.pop("tempdir")
+        if kwargs.pop("autocommit", False) is False:
+            self.target = self.path
+            self.path = os.path.join(tempdir, str(uuid.uuid4()))
+
+    def _upload_chunk(self, final=False):
+        """Write one part of a multi-block file upload
+
+        Parameters
+        ==========
+        final: bool
+            This is the last block, so should complete file, if
+            self.autocommit is True.
+        """
+        out = self.fs.session.post(
+            self.location,
+            data=self.buffer.getvalue(),
+            headers={"content-type": "application/octet-stream"},
+        )
+        out.raise_for_status()
+        return True
+
+    def _initiate_upload(self):
+        """Create remote file/upload"""
+        kwargs = self.kwargs.copy()
+        if "a" in self.mode:
+            op, method = "APPEND", "POST"
+        else:
+            op, method = "CREATE", "PUT"
+            kwargs["overwrite"] = "true"
+        out = self.fs._call(op, method, self.path, redirect=False, **kwargs)
+        location = self.fs._apply_proxy(out.headers["Location"])
+        if "w" in self.mode:
+            # create empty file to append to
+            out2 = self.fs.session.put(
+                location, headers={"content-type": "application/octet-stream"}
+            )
+            out2.raise_for_status()
+            # after creating empty file, change location to append to
+            out2 = self.fs._call("APPEND", "POST", self.path, redirect=False, **kwargs)
+            self.location = self.fs._apply_proxy(out2.headers["Location"])
+
+    def _fetch_range(self, start, end):
+        start = max(start, 0)
+        end = min(self.size, end)
+        if start >= end or start >= self.size:
+            return b""
+        out = self.fs._call(
+            "OPEN", path=self.path, offset=start, length=end - start, redirect=False
+        )
+        out.raise_for_status()
+        if "Location" in out.headers:
+            location = out.headers["Location"]
+            out2 = self.fs.session.get(self.fs._apply_proxy(location))
+            return out2.content
+        else:
+            return out.content
+
+    def commit(self):
+        self.fs.mv(self.path, self.target)
+
+    def discard(self):
+        self.fs.rm(self.path)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/zip.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/zip.py
new file mode 100644
index 0000000000000000000000000000000000000000..6db3ae27806106a19a366886ab4b183f85c1cb1a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/implementations/zip.py
@@ -0,0 +1,177 @@
+import os
+import zipfile
+
+import fsspec
+from fsspec.archive import AbstractArchiveFileSystem
+
+
+class ZipFileSystem(AbstractArchiveFileSystem):
+    """Read/Write contents of ZIP archive as a file-system
+
+    Keeps file object open while instance lives.
+
+    This class is pickleable, but not necessarily thread-safe
+    """
+
+    root_marker = ""
+    protocol = "zip"
+    cachable = False
+
+    def __init__(
+        self,
+        fo="",
+        mode="r",
+        target_protocol=None,
+        target_options=None,
+        compression=zipfile.ZIP_STORED,
+        allowZip64=True,
+        compresslevel=None,
+        **kwargs,
+    ):
+        """
+        Parameters
+        ----------
+        fo: str or file-like
+            Contains ZIP, and must exist. If a str, will fetch file using
+            :meth:`~fsspec.open_files`, which must return one file exactly.
+        mode: str
+            Accept: "r", "w", "a"
+        target_protocol: str (optional)
+            If ``fo`` is a string, this value can be used to override the
+            FS protocol inferred from a URL
+        target_options: dict (optional)
+            Kwargs passed when instantiating the target FS, if ``fo`` is
+            a string.
+        compression, allowZip64, compresslevel: passed to ZipFile
+            Only relevant when creating a ZIP
+        """
+        super().__init__(self, **kwargs)
+        if mode not in set("rwa"):
+            raise ValueError(f"mode '{mode}' no understood")
+        self.mode = mode
+        if isinstance(fo, (str, os.PathLike)):
+            if mode == "a":
+                m = "r+b"
+            else:
+                m = mode + "b"
+            fo = fsspec.open(
+                fo, mode=m, protocol=target_protocol, **(target_options or {})
+            )
+        self.force_zip_64 = allowZip64
+        self.of = fo
+        self.fo = fo.__enter__()  # the whole instance is a context
+        self.zip = zipfile.ZipFile(
+            self.fo,
+            mode=mode,
+            compression=compression,
+            allowZip64=allowZip64,
+            compresslevel=compresslevel,
+        )
+        self.dir_cache = None
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        # zip file paths are always relative to the archive root
+        return super()._strip_protocol(path).lstrip("/")
+
+    def __del__(self):
+        if hasattr(self, "zip"):
+            self.close()
+            del self.zip
+
+    def close(self):
+        """Commits any write changes to the file. Done on ``del`` too."""
+        self.zip.close()
+
+    def _get_dirs(self):
+        if self.dir_cache is None or self.mode in set("wa"):
+            # when writing, dir_cache is always in the ZipFile's attributes,
+            # not read from the file.
+            files = self.zip.infolist()
+            self.dir_cache = {
+                dirname.rstrip("/"): {
+                    "name": dirname.rstrip("/"),
+                    "size": 0,
+                    "type": "directory",
+                }
+                for dirname in self._all_dirnames(self.zip.namelist())
+            }
+            for z in files:
+                f = {s: getattr(z, s, None) for s in zipfile.ZipInfo.__slots__}
+                f.update(
+                    {
+                        "name": z.filename.rstrip("/"),
+                        "size": z.file_size,
+                        "type": ("directory" if z.is_dir() else "file"),
+                    }
+                )
+                self.dir_cache[f["name"]] = f
+
+    def pipe_file(self, path, value, **kwargs):
+        # override upstream, because we know the exact file size in this case
+        self.zip.writestr(path, value, **kwargs)
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=True,
+        cache_options=None,
+        **kwargs,
+    ):
+        path = self._strip_protocol(path)
+        if "r" in mode and self.mode in set("wa"):
+            if self.exists(path):
+                raise OSError("ZipFS can only be open for reading or writing, not both")
+            raise FileNotFoundError(path)
+        if "r" in self.mode and "w" in mode:
+            raise OSError("ZipFS can only be open for reading or writing, not both")
+        out = self.zip.open(path, mode.strip("b"), force_zip64=self.force_zip_64)
+        if "r" in mode:
+            info = self.info(path)
+            out.size = info["size"]
+            out.name = info["name"]
+        return out
+
+    def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
+        if maxdepth is not None and maxdepth < 1:
+            raise ValueError("maxdepth must be at least 1")
+
+        # Remove the leading slash, as the zip file paths are always
+        # given without a leading slash
+        path = path.lstrip("/")
+        path_parts = list(filter(lambda s: bool(s), path.split("/")))
+
+        def _matching_starts(file_path):
+            file_parts = filter(lambda s: bool(s), file_path.split("/"))
+            return all(a == b for a, b in zip(path_parts, file_parts))
+
+        self._get_dirs()
+
+        result = {}
+        # To match posix find, if an exact file name is given, we should
+        # return only that file
+        if path in self.dir_cache and self.dir_cache[path]["type"] == "file":
+            result[path] = self.dir_cache[path]
+            return result if detail else [path]
+
+        for file_path, file_info in self.dir_cache.items():
+            if not (path == "" or _matching_starts(file_path)):
+                continue
+
+            if file_info["type"] == "directory":
+                if withdirs:
+                    if file_path not in result:
+                        result[file_path.strip("/")] = file_info
+                continue
+
+            if file_path not in result:
+                result[file_path] = file_info if detail else None
+
+        if maxdepth:
+            path_depth = path.count("/")
+            result = {
+                k: v for k, v in result.items() if k.count("/") - path_depth < maxdepth
+            }
+        return result if detail else sorted(result)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ed2ad802ecaf021106c25c03112f29e75c7b2f8
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__init__.py
@@ -0,0 +1,289 @@
+import os
+from hashlib import md5
+
+import pytest
+
+from fsspec.implementations.local import LocalFileSystem
+from fsspec.tests.abstract.copy import AbstractCopyTests  # noqa: F401
+from fsspec.tests.abstract.get import AbstractGetTests  # noqa: F401
+from fsspec.tests.abstract.open import AbstractOpenTests  # noqa: F401
+from fsspec.tests.abstract.pipe import AbstractPipeTests  # noqa: F401
+from fsspec.tests.abstract.put import AbstractPutTests  # noqa: F401
+
+
+class BaseAbstractFixtures:
+    """
+    Abstract base class containing fixtures that are used by but never need to
+    be overridden in derived filesystem-specific classes to run the abstract
+    tests on such filesystems.
+    """
+
+    @pytest.fixture
+    def fs_bulk_operations_scenario_0(self, fs, fs_join, fs_path):
+        """
+        Scenario on remote filesystem that is used for many cp/get/put tests.
+
+        Cleans up at the end of each test it which it is used.
+        """
+        source = self._bulk_operations_scenario_0(fs, fs_join, fs_path)
+        yield source
+        fs.rm(source, recursive=True)
+
+    @pytest.fixture
+    def fs_glob_edge_cases_files(self, fs, fs_join, fs_path):
+        """
+        Scenario on remote filesystem that is used for glob edge cases cp/get/put tests.
+
+        Cleans up at the end of each test it which it is used.
+        """
+        source = self._glob_edge_cases_files(fs, fs_join, fs_path)
+        yield source
+        fs.rm(source, recursive=True)
+
+    @pytest.fixture
+    def fs_dir_and_file_with_same_name_prefix(self, fs, fs_join, fs_path):
+        """
+        Scenario on remote filesystem that is used to check cp/get/put on directory
+        and file with the same name prefixes.
+
+        Cleans up at the end of each test it which it is used.
+        """
+        source = self._dir_and_file_with_same_name_prefix(fs, fs_join, fs_path)
+        yield source
+        fs.rm(source, recursive=True)
+
+    @pytest.fixture
+    def fs_10_files_with_hashed_names(self, fs, fs_join, fs_path):
+        """
+        Scenario on remote filesystem that is used to check cp/get/put files order
+        when source and destination are lists.
+
+        Cleans up at the end of each test it which it is used.
+        """
+        source = self._10_files_with_hashed_names(fs, fs_join, fs_path)
+        yield source
+        fs.rm(source, recursive=True)
+
+    @pytest.fixture
+    def fs_target(self, fs, fs_join, fs_path):
+        """
+        Return name of remote directory that does not yet exist to copy into.
+
+        Cleans up at the end of each test it which it is used.
+        """
+        target = fs_join(fs_path, "target")
+        yield target
+        if fs.exists(target):
+            fs.rm(target, recursive=True)
+
+    @pytest.fixture
+    def local_bulk_operations_scenario_0(self, local_fs, local_join, local_path):
+        """
+        Scenario on local filesystem that is used for many cp/get/put tests.
+
+        Cleans up at the end of each test it which it is used.
+        """
+        source = self._bulk_operations_scenario_0(local_fs, local_join, local_path)
+        yield source
+        local_fs.rm(source, recursive=True)
+
+    @pytest.fixture
+    def local_glob_edge_cases_files(self, local_fs, local_join, local_path):
+        """
+        Scenario on local filesystem that is used for glob edge cases cp/get/put tests.
+
+        Cleans up at the end of each test it which it is used.
+        """
+        source = self._glob_edge_cases_files(local_fs, local_join, local_path)
+        yield source
+        local_fs.rm(source, recursive=True)
+
+    @pytest.fixture
+    def local_dir_and_file_with_same_name_prefix(
+        self, local_fs, local_join, local_path
+    ):
+        """
+        Scenario on local filesystem that is used to check cp/get/put on directory
+        and file with the same name prefixes.
+
+        Cleans up at the end of each test it which it is used.
+        """
+        source = self._dir_and_file_with_same_name_prefix(
+            local_fs, local_join, local_path
+        )
+        yield source
+        local_fs.rm(source, recursive=True)
+
+    @pytest.fixture
+    def local_10_files_with_hashed_names(self, local_fs, local_join, local_path):
+        """
+        Scenario on local filesystem that is used to check cp/get/put files order
+        when source and destination are lists.
+
+        Cleans up at the end of each test it which it is used.
+        """
+        source = self._10_files_with_hashed_names(local_fs, local_join, local_path)
+        yield source
+        local_fs.rm(source, recursive=True)
+
+    @pytest.fixture
+    def local_target(self, local_fs, local_join, local_path):
+        """
+        Return name of local directory that does not yet exist to copy into.
+
+        Cleans up at the end of each test it which it is used.
+        """
+        target = local_join(local_path, "target")
+        yield target
+        if local_fs.exists(target):
+            local_fs.rm(target, recursive=True)
+
+    def _glob_edge_cases_files(self, some_fs, some_join, some_path):
+        """
+        Scenario that is used for glob edge cases cp/get/put tests.
+        Creates the following directory and file structure:
+
+        📁 source
+        ├── 📄 file1
+        ├── 📄 file2
+        ├── 📁 subdir0
+        │   ├── 📄 subfile1
+        │   ├── 📄 subfile2
+        │   └── 📁 nesteddir
+        │       └── 📄 nestedfile
+        └── 📁 subdir1
+            ├── 📄 subfile1
+            ├── 📄 subfile2
+            └── 📁 nesteddir
+                └── 📄 nestedfile
+        """
+        source = some_join(some_path, "source")
+        some_fs.touch(some_join(source, "file1"))
+        some_fs.touch(some_join(source, "file2"))
+
+        for subdir_idx in range(2):
+            subdir = some_join(source, f"subdir{subdir_idx}")
+            nesteddir = some_join(subdir, "nesteddir")
+            some_fs.makedirs(nesteddir)
+            some_fs.touch(some_join(subdir, "subfile1"))
+            some_fs.touch(some_join(subdir, "subfile2"))
+            some_fs.touch(some_join(nesteddir, "nestedfile"))
+
+        return source
+
+    def _bulk_operations_scenario_0(self, some_fs, some_join, some_path):
+        """
+        Scenario that is used for many cp/get/put tests. Creates the following
+        directory and file structure:
+
+        📁 source
+        ├── 📄 file1
+        ├── 📄 file2
+        └── 📁 subdir
+            ├── 📄 subfile1
+            ├── 📄 subfile2
+            └── 📁 nesteddir
+                └── 📄 nestedfile
+        """
+        source = some_join(some_path, "source")
+        subdir = some_join(source, "subdir")
+        nesteddir = some_join(subdir, "nesteddir")
+        some_fs.makedirs(nesteddir)
+        some_fs.touch(some_join(source, "file1"))
+        some_fs.touch(some_join(source, "file2"))
+        some_fs.touch(some_join(subdir, "subfile1"))
+        some_fs.touch(some_join(subdir, "subfile2"))
+        some_fs.touch(some_join(nesteddir, "nestedfile"))
+        return source
+
+    def _dir_and_file_with_same_name_prefix(self, some_fs, some_join, some_path):
+        """
+        Scenario that is used to check cp/get/put on directory and file with
+        the same name prefixes. Creates the following directory and file structure:
+
+        📁 source
+        ├── 📄 subdir.txt
+        └── 📁 subdir
+            └── 📄 subfile.txt
+        """
+        source = some_join(some_path, "source")
+        subdir = some_join(source, "subdir")
+        file = some_join(source, "subdir.txt")
+        subfile = some_join(subdir, "subfile.txt")
+        some_fs.makedirs(subdir)
+        some_fs.touch(file)
+        some_fs.touch(subfile)
+        return source
+
+    def _10_files_with_hashed_names(self, some_fs, some_join, some_path):
+        """
+        Scenario that is used to check cp/get/put files order when source and
+        destination are lists. Creates the following directory and file structure:
+
+        📁 source
+        └── 📄 {hashed([0-9])}.txt
+        """
+        source = some_join(some_path, "source")
+        for i in range(10):
+            hashed_i = md5(str(i).encode("utf-8")).hexdigest()
+            path = some_join(source, f"{hashed_i}.txt")
+            some_fs.pipe(path=path, value=f"{i}".encode())
+        return source
+
+
+class AbstractFixtures(BaseAbstractFixtures):
+    """
+    Abstract base class containing fixtures that may be overridden in derived
+    filesystem-specific classes to run the abstract tests on such filesystems.
+
+    For any particular filesystem some of these fixtures must be overridden,
+    such as ``fs`` and ``fs_path``, and others may be overridden if the
+    default functions here are not appropriate, such as ``fs_join``.
+    """
+
+    @pytest.fixture
+    def fs(self):  # no default implementation; see the class docstring
+        raise NotImplementedError("This function must be overridden in derived classes")
+
+    @pytest.fixture
+    def fs_join(self):
+        """
+        Return a function that joins its arguments together into a path.
+
+        Most fsspec implementations join paths in a platform-dependent way,
+        but some will override this to always use a forward slash.
+        """
+        return os.path.join
+
+    @pytest.fixture
+    def fs_path(self):  # no default implementation; see the class docstring
+        raise NotImplementedError("This function must be overridden in derived classes")
+
+    @pytest.fixture(scope="class")
+    def local_fs(self):
+        # Maybe need an option for auto_mkdir=False?  This is only relevant
+        # for certain implementations.
+        return LocalFileSystem(auto_mkdir=True)
+
+    @pytest.fixture
+    def local_join(self):
+        """
+        Return a function that joins its arguments together into a path, on
+        the local filesystem.
+        """
+        return os.path.join
+
+    @pytest.fixture
+    def local_path(self, tmpdir):  # hands back the ``tmpdir`` fixture value as is
+        return tmpdir
+
+    @pytest.fixture
+    def supports_empty_directories(self):
+        """
+        Return whether this implementation supports empty directories.
+        """
+        return True
+
+    @pytest.fixture
+    def fs_sanitize_path(self):  # default sanitizer is the identity function
+        return lambda x: x
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..88c932c3e11309499e273467d17376419b415efc
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/common.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/common.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5f167be08e63788a44d6de2609428e8680faa31a
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/common.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/copy.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/copy.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4a52071e3928a87b7aab01b34ca9f3f7a4921091
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/copy.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/get.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/get.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d2447578b81491d365e6dbdf2af51a35bd64c108
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/get.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/mv.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/mv.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..30bc4aec9e332b1390a8ceb734959422a0d10075
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/mv.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/open.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/open.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..87365638d493b2ffe4fec363af27c85c049dab5a
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/open.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/pipe.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/pipe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1a4a90090f06dfb72a621fab54d5280c9927f7f7
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/pipe.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/put.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/put.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..05081db543c4380a80ea44f6af4803e1bc09b567
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/__pycache__/put.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/common.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..22e7c4140404ab2a8928689721419cf05c2760b9
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/common.py
@@ -0,0 +1,175 @@
+GLOB_EDGE_CASES_TESTS = {  # presumably parametrize kwargs; confirm at call sites
+    "argnames": ("path", "recursive", "maxdepth", "expected"),  # tuple field order
+    "argvalues": [  # one (path, recursive, maxdepth, expected) tuple per case
+        ("fil?1", False, None, ["file1"]),
+        ("fil?1", True, None, ["file1"]),
+        ("file[1-2]", False, None, ["file1", "file2"]),
+        ("file[1-2]", True, None, ["file1", "file2"]),
+        ("*", False, None, ["file1", "file2"]),
+        (
+            "*",
+            True,
+            None,
+            [
+                "file1",
+                "file2",
+                "subdir0/subfile1",
+                "subdir0/subfile2",
+                "subdir0/nesteddir/nestedfile",
+                "subdir1/subfile1",
+                "subdir1/subfile2",
+                "subdir1/nesteddir/nestedfile",
+            ],
+        ),
+        ("*", True, 1, ["file1", "file2"]),
+        (
+            "*",
+            True,
+            2,
+            [
+                "file1",
+                "file2",
+                "subdir0/subfile1",
+                "subdir0/subfile2",
+                "subdir1/subfile1",
+                "subdir1/subfile2",
+            ],
+        ),
+        ("*1", False, None, ["file1"]),
+        (
+            "*1",
+            True,
+            None,
+            [
+                "file1",
+                "subdir1/subfile1",
+                "subdir1/subfile2",
+                "subdir1/nesteddir/nestedfile",
+            ],
+        ),
+        ("*1", True, 2, ["file1", "subdir1/subfile1", "subdir1/subfile2"]),
+        (
+            "**",
+            False,
+            None,
+            [
+                "file1",
+                "file2",
+                "subdir0/subfile1",
+                "subdir0/subfile2",
+                "subdir0/nesteddir/nestedfile",
+                "subdir1/subfile1",
+                "subdir1/subfile2",
+                "subdir1/nesteddir/nestedfile",
+            ],
+        ),
+        (
+            "**",
+            True,
+            None,
+            [
+                "file1",
+                "file2",
+                "subdir0/subfile1",
+                "subdir0/subfile2",
+                "subdir0/nesteddir/nestedfile",
+                "subdir1/subfile1",
+                "subdir1/subfile2",
+                "subdir1/nesteddir/nestedfile",
+            ],
+        ),
+        ("**", True, 1, ["file1", "file2"]),
+        (
+            "**",
+            True,
+            2,
+            [
+                "file1",
+                "file2",
+                "subdir0/subfile1",
+                "subdir0/subfile2",
+                "subdir0/nesteddir/nestedfile",
+                "subdir1/subfile1",
+                "subdir1/subfile2",
+                "subdir1/nesteddir/nestedfile",
+            ],
+        ),
+        (
+            "**",
+            False,
+            2,
+            [
+                "file1",
+                "file2",
+                "subdir0/subfile1",
+                "subdir0/subfile2",
+                "subdir1/subfile1",
+                "subdir1/subfile2",
+            ],
+        ),
+        ("**/*1", False, None, ["file1", "subdir0/subfile1", "subdir1/subfile1"]),
+        (
+            "**/*1",
+            True,
+            None,
+            [
+                "file1",
+                "subdir0/subfile1",
+                "subdir1/subfile1",
+                "subdir1/subfile2",
+                "subdir1/nesteddir/nestedfile",
+            ],
+        ),
+        ("**/*1", True, 1, ["file1"]),
+        (
+            "**/*1",
+            True,
+            2,
+            ["file1", "subdir0/subfile1", "subdir1/subfile1", "subdir1/subfile2"],
+        ),
+        ("**/*1", False, 2, ["file1", "subdir0/subfile1", "subdir1/subfile1"]),
+        ("**/subdir0", False, None, []),
+        ("**/subdir0", True, None, ["subfile1", "subfile2", "nesteddir/nestedfile"]),
+        ("**/subdir0/nested*", False, 2, []),
+        ("**/subdir0/nested*", True, 2, ["nestedfile"]),
+        ("subdir[1-2]", False, None, []),
+        ("subdir[1-2]", True, None, ["subfile1", "subfile2", "nesteddir/nestedfile"]),
+        ("subdir[1-2]", True, 2, ["subfile1", "subfile2"]),
+        ("subdir[0-1]", False, None, []),
+        (
+            "subdir[0-1]",
+            True,
+            None,
+            [
+                "subdir0/subfile1",
+                "subdir0/subfile2",
+                "subdir0/nesteddir/nestedfile",
+                "subdir1/subfile1",
+                "subdir1/subfile2",
+                "subdir1/nesteddir/nestedfile",
+            ],
+        ),
+        (
+            "subdir[0-1]/*fil[e]*",
+            False,
+            None,
+            [
+                "subdir0/subfile1",
+                "subdir0/subfile2",
+                "subdir1/subfile1",
+                "subdir1/subfile2",
+            ],
+        ),
+        (
+            "subdir[0-1]/*fil[e]*",
+            True,
+            None,
+            [
+                "subdir0/subfile1",
+                "subdir0/subfile2",
+                "subdir1/subfile1",
+                "subdir1/subfile2",
+            ],
+        ),
+    ],
+}
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/copy.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/copy.py
new file mode 100644
index 0000000000000000000000000000000000000000..e39e57e5f7d52bfda8ab5e2398b04cc2303630a0
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/copy.py
@@ -0,0 +1,557 @@
+from hashlib import md5
+from itertools import product
+
+import pytest
+
+from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS
+
+
+class AbstractCopyTests:
+    def test_copy_file_to_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        fs_target,
+        supports_empty_directories,
+    ):
+        """Copy scenario 1a: single files copied into an existing directory."""
+        src_root, dest_dir = fs_bulk_operations_scenario_0, fs_target
+
+        fs.mkdir(dest_dir)
+        if not supports_empty_directories:
+            # A placeholder file forces the target directory to exist
+            fs.touch(fs_join(dest_dir, "dummy"))
+        assert fs.isdir(dest_dir)
+
+        copied_file2 = fs_join(dest_dir, "file2")
+        copied_subfile1 = fs_join(dest_dir, "subfile1")
+
+        # A file sitting directly in the source directory
+        fs.cp(fs_join(src_root, "file2"), dest_dir)
+        assert fs.isfile(copied_file2)
+
+        # A file sitting in a sub directory of the source
+        fs.cp(fs_join(src_root, "subdir", "subfile1"), dest_dir)
+        assert fs.isfile(copied_subfile1)
+
+        # Clean up so the trailing-slash variant starts from the same state
+        fs.rm([copied_file2, copied_subfile1])
+        for removed in (copied_file2, copied_subfile1):
+            assert not fs.exists(removed)
+
+        # Same two copies, this time with a trailing slash on the target
+        dest_with_slash = dest_dir + "/"
+        fs.cp(fs_join(src_root, "file2"), dest_with_slash)
+        assert fs.isdir(dest_dir)
+        assert fs.isfile(copied_file2)
+
+        fs.cp(fs_join(src_root, "subdir", "subfile1"), dest_with_slash)
+        assert fs.isfile(copied_subfile1)
+
+    def test_copy_file_to_new_directory(
+        self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target
+    ):
+        """Copy scenario 1b: single file copied into a new directory."""
+        src_root, dest_root = fs_bulk_operations_scenario_0, fs_target
+        fs.mkdir(dest_root)
+
+        # Note the trailing slash: the file is expected to land inside newdir
+        src_file = fs_join(src_root, "subdir", "subfile1")
+        new_dir_with_slash = fs_join(dest_root, "newdir/")
+        fs.cp(src_file, new_dir_with_slash)
+
+        assert fs.isdir(dest_root)
+        assert fs.isdir(fs_join(dest_root, "newdir"))
+        assert fs.isfile(fs_join(dest_root, "newdir", "subfile1"))
+
+    def test_copy_file_to_file_in_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        fs_target,
+        supports_empty_directories,
+    ):
+        """Copy scenario 1c: file copied to a new file name in an existing dir."""
+        src_root, dest_dir = fs_bulk_operations_scenario_0, fs_target
+
+        fs.mkdir(dest_dir)
+        if not supports_empty_directories:
+            # A placeholder file forces the target directory to exist
+            fs.touch(fs_join(dest_dir, "dummy"))
+        assert fs.isdir(dest_dir)
+
+        src_file = fs_join(src_root, "subdir", "subfile1")
+        fs.cp(src_file, fs_join(dest_dir, "newfile"))
+        assert fs.isfile(fs_join(dest_dir, "newfile"))
+
+    def test_copy_file_to_file_in_new_directory(
+        self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target
+    ):
+        """Copy scenario 1d: file copied to a new file name in a new directory."""
+        src_root, dest_root = fs_bulk_operations_scenario_0, fs_target
+        fs.mkdir(dest_root)
+
+        # newdir is expected to be created as part of the copy
+        src_file = fs_join(src_root, "subdir", "subfile1")
+        dest_file = fs_join(dest_root, "newdir", "newfile")
+        fs.cp(src_file, dest_file)
+
+        assert fs.isdir(fs_join(dest_root, "newdir"))
+        assert fs.isfile(fs_join(dest_root, "newdir", "newfile"))
+
+    def test_copy_directory_to_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        fs_target,
+        supports_empty_directories,
+    ):
+        # Copy scenario 1e: directory copied into an existing directory
+        source = fs_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+        if not supports_empty_directories:
+            # Dummy file forces the target to exist; ls() must then report only it
+            dummy = fs_join(target, "dummy")
+            fs.touch(dummy)
+        assert fs.isdir(target)
+
+        for source_slash, target_slash in zip([False, True], [False, True]):
+            s = fs_join(source, "subdir")  # two passes: no slashes, then both
+            if source_slash:
+                s += "/"
+            t = target + "/" if target_slash else target
+
+            # Without recursive=True a directory copy must leave the target unchanged
+            fs.cp(s, t)
+            assert fs.ls(target, detail=False) == (
+                [] if supports_empty_directories else [dummy]
+            )
+
+            # Recursive: a source slash copies the contents, otherwise subdir itself
+            fs.cp(s, t, recursive=True)
+            if source_slash:
+                assert fs.isfile(fs_join(target, "subfile1"))
+                assert fs.isfile(fs_join(target, "subfile2"))
+                assert fs.isdir(fs_join(target, "nesteddir"))
+                assert fs.isfile(fs_join(target, "nesteddir", "nestedfile"))
+                assert not fs.exists(fs_join(target, "subdir"))
+
+                fs.rm(
+                    [
+                        fs_join(target, "subfile1"),
+                        fs_join(target, "subfile2"),
+                        fs_join(target, "nesteddir"),
+                    ],
+                    recursive=True,
+                )
+            else:
+                assert fs.isdir(fs_join(target, "subdir"))
+                assert fs.isfile(fs_join(target, "subdir", "subfile1"))
+                assert fs.isfile(fs_join(target, "subdir", "subfile2"))
+                assert fs.isdir(fs_join(target, "subdir", "nesteddir"))
+                assert fs.isfile(fs_join(target, "subdir", "nesteddir", "nestedfile"))
+
+                fs.rm(fs_join(target, "subdir"), recursive=True)
+            assert fs.ls(target, detail=False) == (
+                [] if supports_empty_directories else [dummy]
+            )
+
+            # maxdepth=1 copies only the direct children, so nesteddir is left out
+            fs.cp(s, t, recursive=True, maxdepth=1)
+            if source_slash:
+                assert fs.isfile(fs_join(target, "subfile1"))
+                assert fs.isfile(fs_join(target, "subfile2"))
+                assert not fs.exists(fs_join(target, "nesteddir"))
+                assert not fs.exists(fs_join(target, "subdir"))
+
+                fs.rm(
+                    [
+                        fs_join(target, "subfile1"),
+                        fs_join(target, "subfile2"),
+                    ],
+                    recursive=True,
+                )
+            else:
+                assert fs.isdir(fs_join(target, "subdir"))
+                assert fs.isfile(fs_join(target, "subdir", "subfile1"))
+                assert fs.isfile(fs_join(target, "subdir", "subfile2"))
+                assert not fs.exists(fs_join(target, "subdir", "nesteddir"))
+
+                fs.rm(fs_join(target, "subdir"), recursive=True)
+            assert fs.ls(target, detail=False) == (
+                [] if supports_empty_directories else [dummy]
+            )
+
+    def test_copy_directory_to_new_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        fs_target,
+        supports_empty_directories,
+    ):
+        # Copy scenario 1f: directory copied to a new directory (target/newdir)
+        source = fs_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+
+        for source_slash, target_slash in zip([False, True], [False, True]):
+            s = fs_join(source, "subdir")  # two passes: no slashes, then both
+            if source_slash:
+                s += "/"
+            t = fs_join(target, "newdir")
+            if target_slash:
+                t += "/"
+
+            # Without recursive=True nothing is copied: target stays empty or absent
+            fs.cp(s, t)
+            if supports_empty_directories:
+                assert fs.ls(target) == []
+            else:
+                with pytest.raises(FileNotFoundError):
+                    fs.ls(target)
+
+            # Recursive: contents land in newdir regardless of the trailing slashes
+            fs.cp(s, t, recursive=True)
+            assert fs.isdir(fs_join(target, "newdir"))
+            assert fs.isfile(fs_join(target, "newdir", "subfile1"))
+            assert fs.isfile(fs_join(target, "newdir", "subfile2"))
+            assert fs.isdir(fs_join(target, "newdir", "nesteddir"))
+            assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile"))
+            assert not fs.exists(fs_join(target, "subdir"))
+
+            fs.rm(fs_join(target, "newdir"), recursive=True)
+            assert not fs.exists(fs_join(target, "newdir"))
+
+            # maxdepth=1 copies only the direct children, so nesteddir is left out
+            fs.cp(s, t, recursive=True, maxdepth=1)
+            assert fs.isdir(fs_join(target, "newdir"))
+            assert fs.isfile(fs_join(target, "newdir", "subfile1"))
+            assert fs.isfile(fs_join(target, "newdir", "subfile2"))
+            assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
+            assert not fs.exists(fs_join(target, "subdir"))
+
+            fs.rm(fs_join(target, "newdir"), recursive=True)
+            assert not fs.exists(fs_join(target, "newdir"))
+
+    def test_copy_glob_to_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        fs_target,
+        supports_empty_directories,
+    ):
+        # Copy scenario 1g: glob copied into an existing directory
+        source = fs_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+        if not supports_empty_directories:
+            # Dummy file forces the target to exist; ls() must then report only it
+            dummy = fs_join(target, "dummy")
+            fs.touch(dummy)
+        assert fs.isdir(target)
+
+        for target_slash in [False, True]:
+            t = target + "/" if target_slash else target
+
+            # Without recursive only the files matched by the glob are copied
+            fs.cp(fs_join(source, "subdir", "*"), t)
+            assert fs.isfile(fs_join(target, "subfile1"))
+            assert fs.isfile(fs_join(target, "subfile2"))
+            assert not fs.isdir(fs_join(target, "nesteddir"))
+            assert not fs.exists(fs_join(target, "nesteddir", "nestedfile"))
+            assert not fs.exists(fs_join(target, "subdir"))
+
+            fs.rm(
+                [
+                    fs_join(target, "subfile1"),
+                    fs_join(target, "subfile2"),
+                ],
+                recursive=True,
+            )
+            assert fs.ls(target, detail=False) == (
+                [] if supports_empty_directories else [dummy]
+            )
+
+            # With recursion, spelled "*" + recursive=True or "**" + recursive=False
+            for glob, recursive in zip(["*", "**"], [True, False]):
+                fs.cp(fs_join(source, "subdir", glob), t, recursive=recursive)
+                assert fs.isfile(fs_join(target, "subfile1"))
+                assert fs.isfile(fs_join(target, "subfile2"))
+                assert fs.isdir(fs_join(target, "nesteddir"))
+                assert fs.isfile(fs_join(target, "nesteddir", "nestedfile"))
+                assert not fs.exists(fs_join(target, "subdir"))
+
+                fs.rm(
+                    [
+                        fs_join(target, "subfile1"),
+                        fs_join(target, "subfile2"),
+                        fs_join(target, "nesteddir"),
+                    ],
+                    recursive=True,
+                )
+                assert fs.ls(target, detail=False) == (
+                    [] if supports_empty_directories else [dummy]
+                )
+
+                # maxdepth=1 limits the copy to the top level; nesteddir is left out
+                fs.cp(
+                    fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1
+                )
+                assert fs.isfile(fs_join(target, "subfile1"))
+                assert fs.isfile(fs_join(target, "subfile2"))
+                assert not fs.exists(fs_join(target, "nesteddir"))
+                assert not fs.exists(fs_join(target, "subdir"))
+
+                fs.rm(
+                    [
+                        fs_join(target, "subfile1"),
+                        fs_join(target, "subfile2"),
+                    ],
+                    recursive=True,
+                )
+                assert fs.ls(target, detail=False) == (
+                    [] if supports_empty_directories else [dummy]
+                )
+
+    def test_copy_glob_to_new_directory(
+        self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target
+    ):
+        # Copy scenario 1h: glob copied to a new directory (target/newdir)
+        source = fs_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+
+        for target_slash in [False, True]:
+            t = fs_join(target, "newdir")
+            if target_slash:
+                t += "/"
+
+            # Without recursive only the files matched by the glob are copied
+            fs.cp(fs_join(source, "subdir", "*"), t)
+            assert fs.isdir(fs_join(target, "newdir"))
+            assert fs.isfile(fs_join(target, "newdir", "subfile1"))
+            assert fs.isfile(fs_join(target, "newdir", "subfile2"))
+            assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
+            assert not fs.exists(fs_join(target, "newdir", "nesteddir", "nestedfile"))
+            assert not fs.exists(fs_join(target, "subdir"))
+            assert not fs.exists(fs_join(target, "newdir", "subdir"))
+
+            fs.rm(fs_join(target, "newdir"), recursive=True)
+            assert not fs.exists(fs_join(target, "newdir"))
+
+            # With recursion, spelled "*" + recursive=True or "**" + recursive=False
+            for glob, recursive in zip(["*", "**"], [True, False]):
+                fs.cp(fs_join(source, "subdir", glob), t, recursive=recursive)
+                assert fs.isdir(fs_join(target, "newdir"))
+                assert fs.isfile(fs_join(target, "newdir", "subfile1"))
+                assert fs.isfile(fs_join(target, "newdir", "subfile2"))
+                assert fs.isdir(fs_join(target, "newdir", "nesteddir"))
+                assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile"))
+                assert not fs.exists(fs_join(target, "subdir"))
+                assert not fs.exists(fs_join(target, "newdir", "subdir"))
+
+                fs.rm(fs_join(target, "newdir"), recursive=True)
+                assert not fs.exists(fs_join(target, "newdir"))
+
+                # maxdepth=1 limits the copy to the top level; nesteddir is left out
+                fs.cp(
+                    fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1
+                )
+                assert fs.isdir(fs_join(target, "newdir"))
+                assert fs.isfile(fs_join(target, "newdir", "subfile1"))
+                assert fs.isfile(fs_join(target, "newdir", "subfile2"))
+                assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
+                assert not fs.exists(fs_join(target, "subdir"))
+                assert not fs.exists(fs_join(target, "newdir", "subdir"))
+
+                fs.rm(fs_join(target, "newdir"), recursive=True)
+                assert not fs.exists(fs_join(target, "newdir"))
+
+    @pytest.mark.parametrize(
+        GLOB_EDGE_CASES_TESTS["argnames"],
+        GLOB_EDGE_CASES_TESTS["argvalues"],
+    )
+    def test_copy_glob_edge_cases(
+        self,
+        path,
+        recursive,
+        maxdepth,
+        expected,
+        fs,
+        fs_join,
+        fs_glob_edge_cases_files,
+        fs_target,
+        fs_sanitize_path,
+    ):
+        """Copy scenario 1g: glob edge cases.
+
+        Each pattern targets a new or existing directory, with or without slash.
+        """
+        src_root, dest_root = fs_glob_edge_cases_files, fs_target
+
+        for new_dir, target_slash in product([True, False], [True, False]):
+            fs.mkdir(dest_root)
+
+            # Path segments under which every expected file should appear
+            prefix = (dest_root, "newdir") if new_dir else (dest_root,)
+            dest = fs_join(dest_root, "newdir") if new_dir else dest_root
+            if target_slash:
+                dest = dest + "/"
+
+            fs.copy(
+                fs_join(src_root, path), dest, recursive=recursive, maxdepth=maxdepth
+            )
+
+            found = fs.find(dest_root)
+            wanted = [fs_sanitize_path(fs_join(*prefix, rel)) for rel in expected]
+            assert sorted(found) == sorted(wanted)
+
+            # Best-effort cleanup; the target may already be gone
+            try:
+                fs.rm(dest_root, recursive=True)
+            except FileNotFoundError:
+                pass
+
+    def test_copy_list_of_files_to_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        fs_target,
+        supports_empty_directories,
+    ):
+        """Copy scenario 2a: a list of files copied into an existing directory.
+
+        The copy is done twice: target without, then with, a trailing slash.
+        """
+        src_root, dest_dir = fs_bulk_operations_scenario_0, fs_target
+
+        fs.mkdir(dest_dir)
+        # What ls() should report once the copied files are removed again
+        leftovers = []
+        if not supports_empty_directories:
+            # A placeholder file forces the target directory to exist
+            placeholder = fs_join(dest_dir, "dummy")
+            fs.touch(placeholder)
+            leftovers.append(placeholder)
+        assert fs.isdir(dest_dir)
+
+        source_files = [
+            fs_join(src_root, "file1"),
+            fs_join(src_root, "file2"),
+            fs_join(src_root, "subdir", "subfile1"),
+        ]
+        copied_names = ("file1", "file2", "subfile1")
+
+        for target_slash in (False, True):
+            dest = dest_dir + "/" if target_slash else dest_dir
+
+            fs.cp(source_files, dest)
+            for name in copied_names:
+                assert fs.isfile(fs_join(dest_dir, name))
+
+            # Remove the copies so the next pass starts from the same state
+            fs.rm(
+                [fs_join(dest_dir, name) for name in copied_names],
+                recursive=True,
+            )
+            assert fs.ls(dest_dir, detail=False) == leftovers
+
+    def test_copy_list_of_files_to_new_directory(
+        self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target
+    ):
+        """Copy scenario 2b: a list of files copied into a new directory."""
+        src_root, dest_root = fs_bulk_operations_scenario_0, fs_target
+        fs.mkdir(dest_root)
+
+        source_files = [
+            fs_join(src_root, "file1"),
+            fs_join(src_root, "file2"),
+            fs_join(src_root, "subdir", "subfile1"),
+        ]
+
+        # Note the trailing slash on the new directory
+        new_dir_with_slash = fs_join(dest_root, "newdir") + "/"
+        fs.cp(source_files, new_dir_with_slash)
+
+        assert fs.isdir(fs_join(dest_root, "newdir"))
+        for name in ("file1", "file2", "subfile1"):
+            assert fs.isfile(fs_join(dest_root, "newdir", name))
+
+    def test_copy_two_files_new_directory(
+        self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target
+    ):
+        """Copy two files into a target directory that does not exist yet."""
+        # This is a duplicate of test_copy_list_of_files_to_new_directory and
+        # can eventually be removed.
+        src_root, dest_dir = fs_bulk_operations_scenario_0, fs_target
+        assert not fs.exists(dest_dir)
+
+        names = ("file1", "file2")
+        fs.cp([fs_join(src_root, name) for name in names], dest_dir)
+        assert fs.isdir(dest_dir)
+        for name in names:
+            assert fs.isfile(fs_join(dest_dir, name))
+
+    def test_copy_directory_without_files_with_same_name_prefix(
+        self,
+        fs,
+        fs_join,
+        fs_target,
+        fs_dir_and_file_with_same_name_prefix,
+        supports_empty_directories,
+    ):
+        """Copy a directory whose name is a prefix of a sibling file name."""
+        src_root, dest_dir = fs_dir_and_file_with_same_name_prefix, fs_target
+
+        # No glob: the contents of subdir are copied, the sibling subdir.txt is not
+        fs.cp(fs_join(src_root, "subdir"), dest_dir, recursive=True)
+
+        assert fs.isfile(fs_join(dest_dir, "subfile.txt"))
+        assert not fs.isfile(fs_join(dest_dir, "subdir.txt"))
+
+        # Undo the copy before trying the glob variant
+        fs.rm([fs_join(dest_dir, "subfile.txt")])
+        if not supports_empty_directories:
+            assert not fs.exists(dest_dir)
+        else:
+            assert fs.ls(dest_dir) == []
+
+        # Glob: "subdir*" matches both the directory and the file sharing its prefix
+        fs.cp(fs_join(src_root, "subdir*"), dest_dir, recursive=True)
+
+        assert fs.isdir(fs_join(dest_dir, "subdir"))
+        assert fs.isfile(fs_join(dest_dir, "subdir", "subfile.txt"))
+        assert fs.isfile(fs_join(dest_dir, "subdir.txt"))
+
+    def test_copy_with_source_and_destination_as_list(
+        self, fs, fs_target, fs_join, fs_10_files_with_hashed_names
+    ):
+        """Copy a list of sources onto a list of destinations, checking order."""
+        src_root, dest_root = fs_10_files_with_hashed_names, fs_target
+
+        # File names are the md5 hex digests of "0" .. "9"
+        file_names = [
+            md5(str(i).encode("utf-8")).hexdigest() + ".txt" for i in range(10)
+        ]
+        # Build the source and destination lists in the same order
+        source_files, destination_files = [], []
+        for name in file_names:
+            source_files.append(fs_join(src_root, name))
+            destination_files.append(fs_join(dest_root, name))
+
+        # Copy and assert order was kept
+        fs.copy(path1=source_files, path2=destination_files)
+
+        for index, destination in enumerate(destination_files):
+            assert fs.cat(destination).decode("utf-8") == str(index)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/get.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/get.py
new file mode 100644
index 0000000000000000000000000000000000000000..851ab81ee581e74cac41c64c83ef0af75826d6b0
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/get.py
@@ -0,0 +1,587 @@
+from hashlib import md5
+from itertools import product
+
+import pytest
+
+from fsspec.implementations.local import make_path_posix
+from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS
+
+
+class AbstractGetTests:
+    def test_get_file_to_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        local_fs,
+        local_join,
+        local_target,
+    ):
+        """Copy scenario 1a: get() single files into an existing local directory."""
+        src_root, dest_dir = fs_bulk_operations_scenario_0, local_target
+
+        local_fs.mkdir(dest_dir)
+        assert local_fs.isdir(dest_dir)
+
+        fetched_file2 = local_join(dest_dir, "file2")
+        fetched_subfile1 = local_join(dest_dir, "subfile1")
+
+        # A file sitting directly in the source directory
+        fs.get(fs_join(src_root, "file2"), dest_dir)
+        assert local_fs.isfile(fetched_file2)
+
+        # A file sitting in a sub directory of the source
+        fs.get(fs_join(src_root, "subdir", "subfile1"), dest_dir)
+        assert local_fs.isfile(fetched_subfile1)
+
+        # Clean up so the trailing-slash variant starts from the same state
+        local_fs.rm([fetched_file2, fetched_subfile1])
+        for removed in (fetched_file2, fetched_subfile1):
+            assert not local_fs.exists(removed)
+
+        # Same two downloads, this time with a trailing slash on the target
+        dest_with_slash = dest_dir + "/"
+        fs.get(fs_join(src_root, "file2"), dest_with_slash)
+        assert local_fs.isdir(dest_dir)
+        assert local_fs.isfile(fetched_file2)
+
+        fs.get(fs_join(src_root, "subdir", "subfile1"), dest_with_slash)
+        assert local_fs.isfile(fetched_subfile1)
+
+    def test_get_file_to_new_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        local_fs,
+        local_join,
+        local_target,
+    ):
+        """Copy scenario 1b: get() a single file into a new local directory."""
+        src_root, dest_root = fs_bulk_operations_scenario_0, local_target
+
+        local_fs.mkdir(dest_root)
+
+        # Note the trailing slash: the file is expected to land inside newdir
+        src_file = fs_join(src_root, "subdir", "subfile1")
+        new_dir_with_slash = local_join(dest_root, "newdir/")
+        fs.get(src_file, new_dir_with_slash)
+
+        assert local_fs.isdir(dest_root)
+        assert local_fs.isdir(local_join(dest_root, "newdir"))
+        assert local_fs.isfile(local_join(dest_root, "newdir", "subfile1"))
+
+    def test_get_file_to_file_in_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        local_fs,
+        local_join,
+        local_target,
+    ):
+        """Copy scenario 1c: get() a file to a new name in an existing local dir."""
+        src_root, dest_dir = fs_bulk_operations_scenario_0, local_target
+        local_fs.mkdir(dest_dir)
+
+        # The download is stored under a different file name than the source
+        src_file = fs_join(src_root, "subdir", "subfile1")
+        fs.get(src_file, local_join(dest_dir, "newfile"))
+        assert local_fs.isfile(local_join(dest_dir, "newfile"))
+
+    def test_get_file_to_file_in_new_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        local_fs,
+        local_join,
+        local_target,
+    ):
+        """Copy scenario 1d: get() a file to a new name in a new local directory."""
+        src_root, dest_root = fs_bulk_operations_scenario_0, local_target
+
+        local_fs.mkdir(dest_root)
+
+        # newdir is expected to be created as part of the download
+        src_file = fs_join(src_root, "subdir", "subfile1")
+        dest_file = local_join(dest_root, "newdir", "newfile")
+        fs.get(src_file, dest_file)
+
+        assert local_fs.isdir(local_join(dest_root, "newdir"))
+        assert local_fs.isfile(local_join(dest_root, "newdir", "newfile"))
+
+    def test_get_directory_to_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        local_fs,
+        local_join,
+        local_target,
+    ):
+        # Copy scenario 1e: get() a directory into an existing local directory
+        source = fs_bulk_operations_scenario_0
+
+        target = local_target
+        local_fs.mkdir(target)
+        assert local_fs.isdir(target)
+
+        for source_slash, target_slash in zip([False, True], [False, True]):
+            s = fs_join(source, "subdir")  # two passes: no slashes, then both
+            if source_slash:
+                s += "/"
+            t = target + "/" if target_slash else target
+
+            # Without recursive=True a directory get must leave the target empty
+            fs.get(s, t)
+            assert local_fs.ls(target) == []
+
+            # Recursive: a source slash fetches the contents, otherwise subdir itself
+            fs.get(s, t, recursive=True)
+            if source_slash:
+                assert local_fs.isfile(local_join(target, "subfile1"))
+                assert local_fs.isfile(local_join(target, "subfile2"))
+                assert local_fs.isdir(local_join(target, "nesteddir"))
+                assert local_fs.isfile(local_join(target, "nesteddir", "nestedfile"))
+                assert not local_fs.exists(local_join(target, "subdir"))
+
+                local_fs.rm(
+                    [
+                        local_join(target, "subfile1"),
+                        local_join(target, "subfile2"),
+                        local_join(target, "nesteddir"),
+                    ],
+                    recursive=True,
+                )
+            else:
+                assert local_fs.isdir(local_join(target, "subdir"))
+                assert local_fs.isfile(local_join(target, "subdir", "subfile1"))
+                assert local_fs.isfile(local_join(target, "subdir", "subfile2"))
+                assert local_fs.isdir(local_join(target, "subdir", "nesteddir"))
+                assert local_fs.isfile(
+                    local_join(target, "subdir", "nesteddir", "nestedfile")
+                )
+
+                local_fs.rm(local_join(target, "subdir"), recursive=True)
+            assert local_fs.ls(target) == []
+
+            # maxdepth=1 fetches only the direct children, so nesteddir is left out
+            fs.get(s, t, recursive=True, maxdepth=1)
+            if source_slash:
+                assert local_fs.isfile(local_join(target, "subfile1"))
+                assert local_fs.isfile(local_join(target, "subfile2"))
+                assert not local_fs.exists(local_join(target, "nesteddir"))
+                assert not local_fs.exists(local_join(target, "subdir"))
+
+                local_fs.rm(
+                    [
+                        local_join(target, "subfile1"),
+                        local_join(target, "subfile2"),
+                    ],
+                    recursive=True,
+                )
+            else:
+                assert local_fs.isdir(local_join(target, "subdir"))
+                assert local_fs.isfile(local_join(target, "subdir", "subfile1"))
+                assert local_fs.isfile(local_join(target, "subdir", "subfile2"))
+                assert not local_fs.exists(local_join(target, "subdir", "nesteddir"))
+
+                local_fs.rm(local_join(target, "subdir"), recursive=True)
+            assert local_fs.ls(target) == []
+
+    def test_get_directory_to_new_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        local_fs,
+        local_join,
+        local_target,
+    ):
+        # Copy scenario 1f: get() a directory to a new local directory (newdir)
+        source = fs_bulk_operations_scenario_0
+
+        target = local_target
+        local_fs.mkdir(target)
+
+        for source_slash, target_slash in zip([False, True], [False, True]):
+            s = fs_join(source, "subdir")  # two passes: no slashes, then both
+            if source_slash:
+                s += "/"
+            t = local_join(target, "newdir")
+            if target_slash:
+                t += "/"
+
+            # Without recursive=True nothing is fetched and the target stays empty
+            fs.get(s, t)
+            assert local_fs.ls(target) == []
+
+            # Recursive: contents land in newdir regardless of the trailing slashes
+            fs.get(s, t, recursive=True)
+            assert local_fs.isdir(local_join(target, "newdir"))
+            assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
+            assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
+            assert local_fs.isdir(local_join(target, "newdir", "nesteddir"))
+            assert local_fs.isfile(
+                local_join(target, "newdir", "nesteddir", "nestedfile")
+            )
+            assert not local_fs.exists(local_join(target, "subdir"))
+
+            local_fs.rm(local_join(target, "newdir"), recursive=True)
+            assert local_fs.ls(target) == []
+
+            # maxdepth=1 fetches only the direct children, so nesteddir is left out
+            fs.get(s, t, recursive=True, maxdepth=1)
+            assert local_fs.isdir(local_join(target, "newdir"))
+            assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
+            assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
+            assert not local_fs.exists(local_join(target, "newdir", "nesteddir"))
+            assert not local_fs.exists(local_join(target, "subdir"))
+
+            local_fs.rm(local_join(target, "newdir"), recursive=True)
+            assert not local_fs.exists(local_join(target, "newdir"))
+
+    def test_get_glob_to_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        local_fs,
+        local_join,
+        local_target,
+    ):
+        # Copy scenario 1g: get() a glob into an existing local directory
+        source = fs_bulk_operations_scenario_0
+
+        target = local_target
+        local_fs.mkdir(target)
+
+        for target_slash in [False, True]:
+            t = target + "/" if target_slash else target
+
+            # Without recursive only the files matched by the glob are fetched
+            fs.get(fs_join(source, "subdir", "*"), t)
+            assert local_fs.isfile(local_join(target, "subfile1"))
+            assert local_fs.isfile(local_join(target, "subfile2"))
+            assert not local_fs.isdir(local_join(target, "nesteddir"))
+            assert not local_fs.exists(local_join(target, "nesteddir", "nestedfile"))
+            assert not local_fs.exists(local_join(target, "subdir"))
+
+            local_fs.rm(
+                [
+                    local_join(target, "subfile1"),
+                    local_join(target, "subfile2"),
+                ],
+                recursive=True,
+            )
+            assert local_fs.ls(target) == []
+
+            # With recursion, spelled "*" + recursive=True or "**" + recursive=False
+            for glob, recursive in zip(["*", "**"], [True, False]):
+                fs.get(fs_join(source, "subdir", glob), t, recursive=recursive)
+                assert local_fs.isfile(local_join(target, "subfile1"))
+                assert local_fs.isfile(local_join(target, "subfile2"))
+                assert local_fs.isdir(local_join(target, "nesteddir"))
+                assert local_fs.isfile(local_join(target, "nesteddir", "nestedfile"))
+                assert not local_fs.exists(local_join(target, "subdir"))
+
+                local_fs.rm(
+                    [
+                        local_join(target, "subfile1"),
+                        local_join(target, "subfile2"),
+                        local_join(target, "nesteddir"),
+                    ],
+                    recursive=True,
+                )
+                assert local_fs.ls(target) == []
+
+                # maxdepth=1 limits the get to the top level; nesteddir is left out
+                fs.get(
+                    fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1
+                )
+                assert local_fs.isfile(local_join(target, "subfile1"))
+                assert local_fs.isfile(local_join(target, "subfile2"))
+                assert not local_fs.exists(local_join(target, "nesteddir"))
+                assert not local_fs.exists(local_join(target, "subdir"))
+
+                local_fs.rm(
+                    [
+                        local_join(target, "subfile1"),
+                        local_join(target, "subfile2"),
+                    ],
+                    recursive=True,
+                )
+                assert local_fs.ls(target) == []
+
+    def test_get_glob_to_new_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        local_fs,
+        local_join,
+        local_target,
+    ):
+        # Copy scenario 1h
+        source = fs_bulk_operations_scenario_0
+
+        target = local_target
+        local_fs.mkdir(target)
+
+        for target_slash in [False, True]:
+            t = fs_join(target, "newdir")
+            if target_slash:
+                t += "/"
+
+            # Without recursive
+            fs.get(fs_join(source, "subdir", "*"), t)
+            assert local_fs.isdir(local_join(target, "newdir"))
+            assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
+            assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
+            assert not local_fs.exists(local_join(target, "newdir", "nesteddir"))
+            assert not local_fs.exists(
+                local_join(target, "newdir", "nesteddir", "nestedfile")
+            )
+            assert not local_fs.exists(local_join(target, "subdir"))
+            assert not local_fs.exists(local_join(target, "newdir", "subdir"))
+
+            local_fs.rm(local_join(target, "newdir"), recursive=True)
+            assert local_fs.ls(target) == []
+
+            # With recursive
+            for glob, recursive in zip(["*", "**"], [True, False]):
+                fs.get(fs_join(source, "subdir", glob), t, recursive=recursive)
+                assert local_fs.isdir(local_join(target, "newdir"))
+                assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
+                assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
+                assert local_fs.isdir(local_join(target, "newdir", "nesteddir"))
+                assert local_fs.isfile(
+                    local_join(target, "newdir", "nesteddir", "nestedfile")
+                )
+                assert not local_fs.exists(local_join(target, "subdir"))
+                assert not local_fs.exists(local_join(target, "newdir", "subdir"))
+
+                local_fs.rm(local_join(target, "newdir"), recursive=True)
+                assert not local_fs.exists(local_join(target, "newdir"))
+
+                # Limit recursive by maxdepth
+                fs.get(
+                    fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1
+                )
+                assert local_fs.isdir(local_join(target, "newdir"))
+                assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
+                assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
+                assert not local_fs.exists(local_join(target, "newdir", "nesteddir"))
+                assert not local_fs.exists(local_join(target, "subdir"))
+                assert not local_fs.exists(local_join(target, "newdir", "subdir"))
+
+                local_fs.rm(local_fs.ls(target, detail=False), recursive=True)
+                assert not local_fs.exists(local_join(target, "newdir"))
+
+    @pytest.mark.parametrize(
+        GLOB_EDGE_CASES_TESTS["argnames"],
+        GLOB_EDGE_CASES_TESTS["argvalues"],
+    )
+    def test_get_glob_edge_cases(
+        self,
+        path,
+        recursive,
+        maxdepth,
+        expected,
+        fs,
+        fs_join,
+        fs_glob_edge_cases_files,
+        local_fs,
+        local_join,
+        local_target,
+    ):
+        # Copy scenario 1g
+        source = fs_glob_edge_cases_files
+
+        target = local_target
+
+        for new_dir, target_slash in product([True, False], [True, False]):
+            local_fs.mkdir(target)
+
+            t = local_join(target, "newdir") if new_dir else target
+            t = t + "/" if target_slash else t
+
+            fs.get(fs_join(source, path), t, recursive=recursive, maxdepth=maxdepth)
+
+            output = local_fs.find(target)
+            if new_dir:
+                prefixed_expected = [
+                    make_path_posix(local_join(target, "newdir", p)) for p in expected
+                ]
+            else:
+                prefixed_expected = [
+                    make_path_posix(local_join(target, p)) for p in expected
+                ]
+            assert sorted(output) == sorted(prefixed_expected)
+
+            try:
+                local_fs.rm(target, recursive=True)
+            except FileNotFoundError:
+                pass
+
+    def test_get_list_of_files_to_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        local_fs,
+        local_join,
+        local_target,
+    ):
+        # Copy scenario 2a
+        source = fs_bulk_operations_scenario_0
+
+        target = local_target
+        local_fs.mkdir(target)
+
+        source_files = [
+            fs_join(source, "file1"),
+            fs_join(source, "file2"),
+            fs_join(source, "subdir", "subfile1"),
+        ]
+
+        for target_slash in [False, True]:
+            t = target + "/" if target_slash else target
+
+            fs.get(source_files, t)
+            assert local_fs.isfile(local_join(target, "file1"))
+            assert local_fs.isfile(local_join(target, "file2"))
+            assert local_fs.isfile(local_join(target, "subfile1"))
+
+            local_fs.rm(
+                [
+                    local_join(target, "file1"),
+                    local_join(target, "file2"),
+                    local_join(target, "subfile1"),
+                ],
+                recursive=True,
+            )
+            assert local_fs.ls(target) == []
+
+    def test_get_list_of_files_to_new_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        local_fs,
+        local_join,
+        local_target,
+    ):
+        # Copy scenario 2b
+        source = fs_bulk_operations_scenario_0
+
+        target = local_target
+        local_fs.mkdir(target)
+
+        source_files = [
+            fs_join(source, "file1"),
+            fs_join(source, "file2"),
+            fs_join(source, "subdir", "subfile1"),
+        ]
+
+        fs.get(source_files, local_join(target, "newdir") + "/")  # Note trailing slash
+        assert local_fs.isdir(local_join(target, "newdir"))
+        assert local_fs.isfile(local_join(target, "newdir", "file1"))
+        assert local_fs.isfile(local_join(target, "newdir", "file2"))
+        assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
+
+    def test_get_directory_recursive(
+        self, fs, fs_join, fs_path, local_fs, local_join, local_target
+    ):
+        # https://github.com/fsspec/filesystem_spec/issues/1062
+        # Recursive cp/get/put of source directory into non-existent target directory.
+        src = fs_join(fs_path, "src")
+        src_file = fs_join(src, "file")
+        fs.mkdir(src)
+        fs.touch(src_file)
+
+        target = local_target
+
+        # get without slash
+        assert not local_fs.exists(target)
+        for loop in range(2):
+            fs.get(src, target, recursive=True)
+            assert local_fs.isdir(target)
+
+            if loop == 0:
+                assert local_fs.isfile(local_join(target, "file"))
+                assert not local_fs.exists(local_join(target, "src"))
+            else:
+                assert local_fs.isfile(local_join(target, "file"))
+                assert local_fs.isdir(local_join(target, "src"))
+                assert local_fs.isfile(local_join(target, "src", "file"))
+
+        local_fs.rm(target, recursive=True)
+
+        # get with slash
+        assert not local_fs.exists(target)
+        for loop in range(2):
+            fs.get(src + "/", target, recursive=True)
+            assert local_fs.isdir(target)
+            assert local_fs.isfile(local_join(target, "file"))
+            assert not local_fs.exists(local_join(target, "src"))
+
+    def test_get_directory_without_files_with_same_name_prefix(
+        self,
+        fs,
+        fs_join,
+        local_fs,
+        local_join,
+        local_target,
+        fs_dir_and_file_with_same_name_prefix,
+    ):
+        # Create the test dirs
+        source = fs_dir_and_file_with_same_name_prefix
+        target = local_target
+
+        # Test without glob
+        fs.get(fs_join(source, "subdir"), target, recursive=True)
+
+        assert local_fs.isfile(local_join(target, "subfile.txt"))
+        assert not local_fs.isfile(local_join(target, "subdir.txt"))
+
+        local_fs.rm([local_join(target, "subfile.txt")])
+        assert local_fs.ls(target) == []
+
+        # Test with glob
+        fs.get(fs_join(source, "subdir*"), target, recursive=True)
+
+        assert local_fs.isdir(local_join(target, "subdir"))
+        assert local_fs.isfile(local_join(target, "subdir", "subfile.txt"))
+        assert local_fs.isfile(local_join(target, "subdir.txt"))
+
+    def test_get_with_source_and_destination_as_list(
+        self,
+        fs,
+        fs_join,
+        local_fs,
+        local_join,
+        local_target,
+        fs_10_files_with_hashed_names,
+    ):
+        # Create the test dir
+        source = fs_10_files_with_hashed_names
+        target = local_target
+
+        # Create list of files for source and destination
+        source_files = []
+        destination_files = []
+        for i in range(10):
+            hashed_i = md5(str(i).encode("utf-8")).hexdigest()
+            source_files.append(fs_join(source, f"{hashed_i}.txt"))
+            destination_files.append(
+                make_path_posix(local_join(target, f"{hashed_i}.txt"))
+            )
+
+        # Copy and assert order was kept
+        fs.get(rpath=source_files, lpath=destination_files)
+
+        for i in range(10):
+            file_content = local_fs.cat(destination_files[i]).decode("utf-8")
+            assert file_content == str(i)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/mv.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/mv.py
new file mode 100644
index 0000000000000000000000000000000000000000..39f6caa3de815e024fa84de2acecc986c823ed29
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/mv.py
@@ -0,0 +1,57 @@
+import os
+
+import pytest
+
+import fsspec
+
+
+def test_move_raises_error_with_tmpdir(tmpdir):
+    # Create a file in the temporary directory
+    source = tmpdir.join("source_file.txt")
+    source.write("content")
+
+    # Define a destination that simulates a protected or invalid path
+    destination = tmpdir.join("non_existent_directory/destination_file.txt")
+
+    # Instantiate the filesystem (assuming the local file system interface)
+    fs = fsspec.filesystem("file")
+
+    # Use the actual file paths as string
+    with pytest.raises(FileNotFoundError):
+        fs.mv(str(source), str(destination))
+
+
+@pytest.mark.parametrize("recursive", (True, False))
+def test_move_raises_error_with_tmpdir_permission(recursive, tmpdir):
+    # Create a file in the temporary directory
+    source = tmpdir.join("source_file.txt")
+    source.write("content")
+
+    # Create a protected directory (non-writable)
+    protected_dir = tmpdir.mkdir("protected_directory")
+    protected_path = str(protected_dir)
+
+    # Set the directory to read-only
+    if os.name == "nt":
+        os.system(f'icacls "{protected_path}" /deny Everyone:(W)')
+    else:
+        os.chmod(protected_path, 0o555)  # Sets the directory to read-only
+
+    # Define a destination inside the protected directory
+    destination = protected_dir.join("destination_file.txt")
+
+    # Instantiate the filesystem (assuming the local file system interface)
+    fs = fsspec.filesystem("file")
+
+    # Try to move the file to the read-only directory, expecting a permission error
+    with pytest.raises(PermissionError):
+        fs.mv(str(source), str(destination), recursive=recursive)
+
+    # Assert the file was not created in the destination
+    assert not os.path.exists(destination)
+
+    # Cleanup: Restore permissions so the directory can be cleaned up
+    if os.name == "nt":
+        os.system(f'icacls "{protected_path}" /remove:d Everyone')
+    else:
+        os.chmod(protected_path, 0o755)  # Restore write permission for cleanup
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/open.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/open.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb75ea852276fb8d834345883813b8e27a0ae24c
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/open.py
@@ -0,0 +1,11 @@
+import pytest
+
+
+class AbstractOpenTests:
+    def test_open_exclusive(self, fs, fs_target):
+        with fs.open(fs_target, "wb") as f:
+            f.write(b"data")
+        with fs.open(fs_target, "rb") as f:
+            assert f.read() == b"data"
+        with pytest.raises(FileExistsError):
+            fs.open(fs_target, "xb")
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/pipe.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/pipe.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ecca96e9d23ff268a253c48269d5cca451ea270
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/pipe.py
@@ -0,0 +1,11 @@
+import pytest
+
+
+class AbstractPipeTests:
+    def test_pipe_exclusive(self, fs, fs_target):
+        fs.pipe_file(fs_target, b"data")
+        assert fs.cat_file(fs_target) == b"data"
+        with pytest.raises(FileExistsError):
+            fs.pipe_file(fs_target, b"data", mode="create")
+        fs.pipe_file(fs_target, b"new data", mode="overwrite")
+        assert fs.cat_file(fs_target) == b"new data"
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/put.py b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/put.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fc349977f0384d9fc86126498be5c6ad99a21d3
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/fsspec/tests/abstract/put.py
@@ -0,0 +1,591 @@
+from hashlib import md5
+from itertools import product
+
+import pytest
+
+from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS
+
+
+class AbstractPutTests:
+    def test_put_file_to_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_target,
+        local_join,
+        local_bulk_operations_scenario_0,
+        supports_empty_directories,
+    ):
+        # Copy scenario 1a
+        source = local_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+        if not supports_empty_directories:
+            # Force target directory to exist by adding a dummy file
+            fs.touch(fs_join(target, "dummy"))
+        assert fs.isdir(target)
+
+        target_file2 = fs_join(target, "file2")
+        target_subfile1 = fs_join(target, "subfile1")
+
+        # Copy from source directory
+        fs.put(local_join(source, "file2"), target)
+        assert fs.isfile(target_file2)
+
+        # Copy from sub directory
+        fs.put(local_join(source, "subdir", "subfile1"), target)
+        assert fs.isfile(target_subfile1)
+
+        # Remove copied files
+        fs.rm([target_file2, target_subfile1])
+        assert not fs.exists(target_file2)
+        assert not fs.exists(target_subfile1)
+
+        # Repeat with trailing slash on target
+        fs.put(local_join(source, "file2"), target + "/")
+        assert fs.isdir(target)
+        assert fs.isfile(target_file2)
+
+        fs.put(local_join(source, "subdir", "subfile1"), target + "/")
+        assert fs.isfile(target_subfile1)
+
+    def test_put_file_to_new_directory(
+        self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
+    ):
+        # Copy scenario 1b
+        source = local_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+
+        fs.put(
+            local_join(source, "subdir", "subfile1"), fs_join(target, "newdir/")
+        )  # Note trailing slash
+        assert fs.isdir(target)
+        assert fs.isdir(fs_join(target, "newdir"))
+        assert fs.isfile(fs_join(target, "newdir", "subfile1"))
+
+    def test_put_file_to_file_in_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_target,
+        local_join,
+        supports_empty_directories,
+        local_bulk_operations_scenario_0,
+    ):
+        # Copy scenario 1c
+        source = local_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+        if not supports_empty_directories:
+            # Force target directory to exist by adding a dummy file
+            fs.touch(fs_join(target, "dummy"))
+        assert fs.isdir(target)
+
+        fs.put(local_join(source, "subdir", "subfile1"), fs_join(target, "newfile"))
+        assert fs.isfile(fs_join(target, "newfile"))
+
+    def test_put_file_to_file_in_new_directory(
+        self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
+    ):
+        # Copy scenario 1d
+        source = local_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+
+        fs.put(
+            local_join(source, "subdir", "subfile1"),
+            fs_join(target, "newdir", "newfile"),
+        )
+        assert fs.isdir(fs_join(target, "newdir"))
+        assert fs.isfile(fs_join(target, "newdir", "newfile"))
+
+    def test_put_directory_to_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_target,
+        local_bulk_operations_scenario_0,
+        supports_empty_directories,
+    ):
+        """Put the local ``subdir`` directory into an existing target directory.
+
+        Two combinations are exercised: source and target both without a
+        trailing slash, and both with one.  For each, the test checks that a
+        non-recursive put leaves the target unchanged, that a recursive put
+        copies the tree, and that ``maxdepth=1`` stops before ``nesteddir``.
+        With a trailing slash on the source the contents are expected directly
+        in the target; without it they are expected under ``target/subdir``.
+        """
+        # Copy scenario 1e
+        source = local_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+        if not supports_empty_directories:
+            # Force target directory to exist by adding a dummy file
+            dummy = fs_join(target, "dummy")
+            fs.touch(dummy)
+        assert fs.isdir(target)
+
+        # zip (not product): only (False, False) and (True, True) are tested.
+        for source_slash, target_slash in zip([False, True], [False, True]):
+            # NOTE(review): source is a local path but is joined with fs_join
+            # here rather than local_join — confirm this is intended.
+            s = fs_join(source, "subdir")
+            if source_slash:
+                s += "/"
+            t = target + "/" if target_slash else target
+
+            # Without recursive does nothing
+            fs.put(s, t)
+            # `dummy` is only bound (and only referenced) when empty
+            # directories are not supported.
+            assert fs.ls(target, detail=False) == (
+                [] if supports_empty_directories else [dummy]
+            )
+
+            # With recursive
+            fs.put(s, t, recursive=True)
+            if source_slash:
+                # Trailing slash on source: contents land directly in target.
+                assert fs.isfile(fs_join(target, "subfile1"))
+                assert fs.isfile(fs_join(target, "subfile2"))
+                assert fs.isdir(fs_join(target, "nesteddir"))
+                assert fs.isfile(fs_join(target, "nesteddir", "nestedfile"))
+                assert not fs.exists(fs_join(target, "subdir"))
+
+                fs.rm(
+                    [
+                        fs_join(target, "subfile1"),
+                        fs_join(target, "subfile2"),
+                        fs_join(target, "nesteddir"),
+                    ],
+                    recursive=True,
+                )
+            else:
+                # No trailing slash: the directory itself is created in target.
+                assert fs.isdir(fs_join(target, "subdir"))
+                assert fs.isfile(fs_join(target, "subdir", "subfile1"))
+                assert fs.isfile(fs_join(target, "subdir", "subfile2"))
+                assert fs.isdir(fs_join(target, "subdir", "nesteddir"))
+                assert fs.isfile(fs_join(target, "subdir", "nesteddir", "nestedfile"))
+
+                fs.rm(fs_join(target, "subdir"), recursive=True)
+            # Target must be back to its initial listing before the next step.
+            assert fs.ls(target, detail=False) == (
+                [] if supports_empty_directories else [dummy]
+            )
+
+            # Limit recursive by maxdepth
+            fs.put(s, t, recursive=True, maxdepth=1)
+            if source_slash:
+                assert fs.isfile(fs_join(target, "subfile1"))
+                assert fs.isfile(fs_join(target, "subfile2"))
+                assert not fs.exists(fs_join(target, "nesteddir"))
+                assert not fs.exists(fs_join(target, "subdir"))
+
+                fs.rm(
+                    [
+                        fs_join(target, "subfile1"),
+                        fs_join(target, "subfile2"),
+                    ],
+                    recursive=True,
+                )
+            else:
+                assert fs.isdir(fs_join(target, "subdir"))
+                assert fs.isfile(fs_join(target, "subdir", "subfile1"))
+                assert fs.isfile(fs_join(target, "subdir", "subfile2"))
+                assert not fs.exists(fs_join(target, "subdir", "nesteddir"))
+
+                fs.rm(fs_join(target, "subdir"), recursive=True)
+            assert fs.ls(target, detail=False) == (
+                [] if supports_empty_directories else [dummy]
+            )
+
+    def test_put_directory_to_new_directory(
+        self,
+        fs,
+        fs_join,
+        fs_target,
+        local_bulk_operations_scenario_0,
+        supports_empty_directories,
+    ):
+        """Put the local ``subdir`` directory into a target directory that does not exist.
+
+        Two combinations are exercised: source and ``target/newdir`` both
+        without a trailing slash, and both with one.  In either case the
+        contents of ``subdir`` are expected directly under ``newdir``; the
+        ``maxdepth=1`` variant must not copy ``nesteddir``.
+        """
+        # Copy scenario 1f
+        source = local_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+
+        # zip (not product): only (False, False) and (True, True) are tested.
+        for source_slash, target_slash in zip([False, True], [False, True]):
+            # NOTE(review): source is a local path but is joined with fs_join
+            # here rather than local_join — confirm this is intended.
+            s = fs_join(source, "subdir")
+            if source_slash:
+                s += "/"
+            t = fs_join(target, "newdir")
+            if target_slash:
+                t += "/"
+
+            # Without recursive does nothing
+            fs.put(s, t)
+            if supports_empty_directories:
+                assert fs.ls(target) == []
+            else:
+                # No dummy file was added here, so listing the target is
+                # expected to fail on such filesystems.
+                with pytest.raises(FileNotFoundError):
+                    fs.ls(target)
+
+            # With recursive
+            fs.put(s, t, recursive=True)
+            assert fs.isdir(fs_join(target, "newdir"))
+            assert fs.isfile(fs_join(target, "newdir", "subfile1"))
+            assert fs.isfile(fs_join(target, "newdir", "subfile2"))
+            assert fs.isdir(fs_join(target, "newdir", "nesteddir"))
+            assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile"))
+            assert not fs.exists(fs_join(target, "subdir"))
+
+            # Remove newdir so the next put again targets a missing directory.
+            fs.rm(fs_join(target, "newdir"), recursive=True)
+            assert not fs.exists(fs_join(target, "newdir"))
+
+            # Limit recursive by maxdepth
+            fs.put(s, t, recursive=True, maxdepth=1)
+            assert fs.isdir(fs_join(target, "newdir"))
+            assert fs.isfile(fs_join(target, "newdir", "subfile1"))
+            assert fs.isfile(fs_join(target, "newdir", "subfile2"))
+            assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
+            assert not fs.exists(fs_join(target, "subdir"))
+
+            fs.rm(fs_join(target, "newdir"), recursive=True)
+            assert not fs.exists(fs_join(target, "newdir"))
+
+    def test_put_glob_to_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_target,
+        local_join,
+        supports_empty_directories,
+        local_bulk_operations_scenario_0,
+    ):
+        """Put a glob of the local ``subdir`` into an existing target directory.
+
+        The target is tried without and with a trailing slash.  For each form
+        the test checks a non-recursive ``*`` glob (files only), then ``*``
+        with ``recursive=True`` and ``**`` with ``recursive=False`` (both
+        expected to include ``nesteddir``), each followed by a ``maxdepth=1``
+        variant that must exclude ``nesteddir``.
+        """
+        # Copy scenario 1g
+        source = local_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+        if not supports_empty_directories:
+            # Force target directory to exist by adding a dummy file
+            dummy = fs_join(target, "dummy")
+            fs.touch(dummy)
+        assert fs.isdir(target)
+
+        for target_slash in [False, True]:
+            t = target + "/" if target_slash else target
+
+            # Without recursive
+            fs.put(local_join(source, "subdir", "*"), t)
+            assert fs.isfile(fs_join(target, "subfile1"))
+            assert fs.isfile(fs_join(target, "subfile2"))
+            assert not fs.isdir(fs_join(target, "nesteddir"))
+            assert not fs.exists(fs_join(target, "nesteddir", "nestedfile"))
+            assert not fs.exists(fs_join(target, "subdir"))
+
+            fs.rm(
+                [
+                    fs_join(target, "subfile1"),
+                    fs_join(target, "subfile2"),
+                ],
+                recursive=True,
+            )
+            # `dummy` is only bound (and only referenced) when empty
+            # directories are not supported.
+            assert fs.ls(target, detail=False) == (
+                [] if supports_empty_directories else [dummy]
+            )
+
+            # With recursive
+            # Pairs are ("*", recursive=True) and ("**", recursive=False).
+            for glob, recursive in zip(["*", "**"], [True, False]):
+                fs.put(local_join(source, "subdir", glob), t, recursive=recursive)
+                assert fs.isfile(fs_join(target, "subfile1"))
+                assert fs.isfile(fs_join(target, "subfile2"))
+                assert fs.isdir(fs_join(target, "nesteddir"))
+                assert fs.isfile(fs_join(target, "nesteddir", "nestedfile"))
+                assert not fs.exists(fs_join(target, "subdir"))
+
+                fs.rm(
+                    [
+                        fs_join(target, "subfile1"),
+                        fs_join(target, "subfile2"),
+                        fs_join(target, "nesteddir"),
+                    ],
+                    recursive=True,
+                )
+                assert fs.ls(target, detail=False) == (
+                    [] if supports_empty_directories else [dummy]
+                )
+
+                # Limit recursive by maxdepth
+                fs.put(
+                    local_join(source, "subdir", glob),
+                    t,
+                    recursive=recursive,
+                    maxdepth=1,
+                )
+                assert fs.isfile(fs_join(target, "subfile1"))
+                assert fs.isfile(fs_join(target, "subfile2"))
+                assert not fs.exists(fs_join(target, "nesteddir"))
+                assert not fs.exists(fs_join(target, "subdir"))
+
+                fs.rm(
+                    [
+                        fs_join(target, "subfile1"),
+                        fs_join(target, "subfile2"),
+                    ],
+                    recursive=True,
+                )
+                # Target must be back to its initial listing before looping.
+                assert fs.ls(target, detail=False) == (
+                    [] if supports_empty_directories else [dummy]
+                )
+
+    def test_put_glob_to_new_directory(
+        self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
+    ):
+        """Put a glob of the local ``subdir`` into a target directory that does not exist.
+
+        The destination ``target/newdir`` is tried without and with a trailing
+        slash.  Matched entries are expected directly under ``newdir`` (never
+        under a ``subdir`` level); the recursive variants include
+        ``nesteddir`` and the ``maxdepth=1`` variants exclude it.
+        """
+        # Copy scenario 1h
+        source = local_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+
+        for target_slash in [False, True]:
+            t = fs_join(target, "newdir")
+            if target_slash:
+                t += "/"
+
+            # Without recursive
+            fs.put(local_join(source, "subdir", "*"), t)
+            assert fs.isdir(fs_join(target, "newdir"))
+            assert fs.isfile(fs_join(target, "newdir", "subfile1"))
+            assert fs.isfile(fs_join(target, "newdir", "subfile2"))
+            assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
+            assert not fs.exists(fs_join(target, "newdir", "nesteddir", "nestedfile"))
+            assert not fs.exists(fs_join(target, "subdir"))
+            assert not fs.exists(fs_join(target, "newdir", "subdir"))
+
+            # Remove newdir so the next put again targets a missing directory.
+            fs.rm(fs_join(target, "newdir"), recursive=True)
+            assert not fs.exists(fs_join(target, "newdir"))
+
+            # With recursive
+            # Pairs are ("*", recursive=True) and ("**", recursive=False).
+            for glob, recursive in zip(["*", "**"], [True, False]):
+                fs.put(local_join(source, "subdir", glob), t, recursive=recursive)
+                assert fs.isdir(fs_join(target, "newdir"))
+                assert fs.isfile(fs_join(target, "newdir", "subfile1"))
+                assert fs.isfile(fs_join(target, "newdir", "subfile2"))
+                assert fs.isdir(fs_join(target, "newdir", "nesteddir"))
+                assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile"))
+                assert not fs.exists(fs_join(target, "subdir"))
+                assert not fs.exists(fs_join(target, "newdir", "subdir"))
+
+                fs.rm(fs_join(target, "newdir"), recursive=True)
+                assert not fs.exists(fs_join(target, "newdir"))
+
+                # Limit recursive by maxdepth
+                fs.put(
+                    local_join(source, "subdir", glob),
+                    t,
+                    recursive=recursive,
+                    maxdepth=1,
+                )
+                assert fs.isdir(fs_join(target, "newdir"))
+                assert fs.isfile(fs_join(target, "newdir", "subfile1"))
+                assert fs.isfile(fs_join(target, "newdir", "subfile2"))
+                assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
+                assert not fs.exists(fs_join(target, "subdir"))
+                assert not fs.exists(fs_join(target, "newdir", "subdir"))
+
+                fs.rm(fs_join(target, "newdir"), recursive=True)
+                assert not fs.exists(fs_join(target, "newdir"))
+
+    @pytest.mark.parametrize(
+        GLOB_EDGE_CASES_TESTS["argnames"],
+        GLOB_EDGE_CASES_TESTS["argvalues"],
+    )
+    def test_put_glob_edge_cases(
+        self,
+        path,
+        recursive,
+        maxdepth,
+        expected,
+        fs,
+        fs_join,
+        fs_target,
+        local_glob_edge_cases_files,
+        local_join,
+        fs_sanitize_path,
+    ):
+        # Copy scenario 1g
+        source = local_glob_edge_cases_files
+
+        target = fs_target
+
+        for new_dir, target_slash in product([True, False], [True, False]):
+            fs.mkdir(target)
+
+            t = fs_join(target, "newdir") if new_dir else target
+            t = t + "/" if target_slash else t
+
+            fs.put(local_join(source, path), t, recursive=recursive, maxdepth=maxdepth)
+
+            output = fs.find(target)
+            if new_dir:
+                prefixed_expected = [
+                    fs_sanitize_path(fs_join(target, "newdir", p)) for p in expected
+                ]
+            else:
+                prefixed_expected = [
+                    fs_sanitize_path(fs_join(target, p)) for p in expected
+                ]
+            assert sorted(output) == sorted(prefixed_expected)
+
+            try:
+                fs.rm(target, recursive=True)
+            except FileNotFoundError:
+                pass
+
+    def test_put_list_of_files_to_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_target,
+        local_join,
+        local_bulk_operations_scenario_0,
+        supports_empty_directories,
+    ):
+        # Copy scenario 2a
+        source = local_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+        if not supports_empty_directories:
+            # Force target directory to exist by adding a dummy file
+            dummy = fs_join(target, "dummy")
+            fs.touch(dummy)
+        assert fs.isdir(target)
+
+        source_files = [
+            local_join(source, "file1"),
+            local_join(source, "file2"),
+            local_join(source, "subdir", "subfile1"),
+        ]
+
+        for target_slash in [False, True]:
+            t = target + "/" if target_slash else target
+
+            fs.put(source_files, t)
+            assert fs.isfile(fs_join(target, "file1"))
+            assert fs.isfile(fs_join(target, "file2"))
+            assert fs.isfile(fs_join(target, "subfile1"))
+
+            fs.rm(
+                [
+                    fs_join(target, "file1"),
+                    fs_join(target, "file2"),
+                    fs_join(target, "subfile1"),
+                ],
+                recursive=True,
+            )
+            assert fs.ls(target, detail=False) == (
+                [] if supports_empty_directories else [dummy]
+            )
+
+    def test_put_list_of_files_to_new_directory(
+        self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
+    ):
+        # Copy scenario 2b
+        source = local_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+
+        source_files = [
+            local_join(source, "file1"),
+            local_join(source, "file2"),
+            local_join(source, "subdir", "subfile1"),
+        ]
+
+        fs.put(source_files, fs_join(target, "newdir") + "/")  # Note trailing slash
+        assert fs.isdir(fs_join(target, "newdir"))
+        assert fs.isfile(fs_join(target, "newdir", "file1"))
+        assert fs.isfile(fs_join(target, "newdir", "file2"))
+        assert fs.isfile(fs_join(target, "newdir", "subfile1"))
+
+    def test_put_directory_recursive(
+        self, fs, fs_join, fs_target, local_fs, local_join, local_path
+    ):
+        # https://github.com/fsspec/filesystem_spec/issues/1062
+        # Recursive cp/get/put of source directory into non-existent target directory.
+        src = local_join(local_path, "src")
+        src_file = local_join(src, "file")
+        local_fs.mkdir(src)
+        local_fs.touch(src_file)
+
+        target = fs_target
+
+        # put without slash
+        assert not fs.exists(target)
+        for loop in range(2):
+            fs.put(src, target, recursive=True)
+            assert fs.isdir(target)
+
+            if loop == 0:
+                assert fs.isfile(fs_join(target, "file"))
+                assert not fs.exists(fs_join(target, "src"))
+            else:
+                assert fs.isfile(fs_join(target, "file"))
+                assert fs.isdir(fs_join(target, "src"))
+                assert fs.isfile(fs_join(target, "src", "file"))
+
+        fs.rm(target, recursive=True)
+
+        # put with slash
+        assert not fs.exists(target)
+        for loop in range(2):
+            fs.put(src + "/", target, recursive=True)
+            assert fs.isdir(target)
+            assert fs.isfile(fs_join(target, "file"))
+            assert not fs.exists(fs_join(target, "src"))
+
+    def test_put_directory_without_files_with_same_name_prefix(
+        self,
+        fs,
+        fs_join,
+        fs_target,
+        local_join,
+        local_dir_and_file_with_same_name_prefix,
+        supports_empty_directories,
+    ):
+        # Create the test dirs
+        source = local_dir_and_file_with_same_name_prefix
+        target = fs_target
+
+        # Test without glob
+        fs.put(local_join(source, "subdir"), fs_target, recursive=True)
+
+        assert fs.isfile(fs_join(fs_target, "subfile.txt"))
+        assert not fs.isfile(fs_join(fs_target, "subdir.txt"))
+
+        fs.rm([fs_join(target, "subfile.txt")])
+        if supports_empty_directories:
+            assert fs.ls(target) == []
+        else:
+            assert not fs.exists(target)
+
+        # Test with glob
+        fs.put(local_join(source, "subdir*"), fs_target, recursive=True)
+
+        assert fs.isdir(fs_join(fs_target, "subdir"))
+        assert fs.isfile(fs_join(fs_target, "subdir", "subfile.txt"))
+        assert fs.isfile(fs_join(fs_target, "subdir.txt"))
+
+    def test_copy_with_source_and_destination_as_list(
+        self, fs, fs_target, fs_join, local_join, local_10_files_with_hashed_names
+    ):
+        # Create the test dir
+        source = local_10_files_with_hashed_names
+        target = fs_target
+
+        # Create list of files for source and destination
+        source_files = []
+        destination_files = []
+        for i in range(10):
+            hashed_i = md5(str(i).encode("utf-8")).hexdigest()
+            source_files.append(local_join(source, f"{hashed_i}.txt"))
+            destination_files.append(fs_join(target, f"{hashed_i}.txt"))
+
+        # Copy and assert order was kept
+        fs.put(lpath=source_files, rpath=destination_files)
+
+        for i in range(10):
+            file_content = fs.cat(destination_files[i]).decode("utf-8")
+            assert file_content == str(i)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/huggingface_hub-0.36.2.dist-info/licenses/LICENSE b/URSA/.venv_ursa/lib/python3.12/site-packages/huggingface_hub-0.36.2.dist-info/licenses/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/huggingface_hub-0.36.2.dist-info/licenses/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/nvidia_nccl_cu12-2.27.5.dist-info/licenses/License.txt b/URSA/.venv_ursa/lib/python3.12/site-packages/nvidia_nccl_cu12-2.27.5.dist-info/licenses/License.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bcd1867a02a6a8c1e592b92e2e50f34e531f2d87
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/nvidia_nccl_cu12-2.27.5.dist-info/licenses/License.txt
@@ -0,0 +1,39 @@
+
+ Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+  * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
+    Laboratory, the U.S. Department of Energy, nor the names of their
+    contributors may be used to endorse or promote products derived
+    from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ The U.S. Department of Energy funded the development of this software
+ under subcontract 7078610 with Lawrence Berkeley National Laboratory.
+
+
+This code also includes files from the NVIDIA Tools Extension SDK project.
+
+See:
+
+   https://github.com/NVIDIA/NVTX
+
+for more information and license details.
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/nvidia_nvjitlink-13.0.88.dist-info/licenses/License.txt b/URSA/.venv_ursa/lib/python3.12/site-packages/nvidia_nvjitlink-13.0.88.dist-info/licenses/License.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b491c70e0aef319022ded661e111ddbd45b8a17f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/nvidia_nvjitlink-13.0.88.dist-info/licenses/License.txt
@@ -0,0 +1,1568 @@
+End User License Agreement
+--------------------------
+
+
+Preface
+-------
+
+The Software License Agreement in Chapter 1 and the Supplement
+in Chapter 2 contain license terms and conditions that govern
+the use of NVIDIA software. By accepting this agreement, you
+agree to comply with all the terms and conditions applicable
+to the product(s) included herein.
+
+
+NVIDIA Driver
+
+
+Description
+
+This package contains the operating system driver and
+fundamental system software components for NVIDIA GPUs.
+
+
+NVIDIA CUDA Toolkit
+
+
+Description
+
+The NVIDIA CUDA Toolkit provides command-line and graphical
+tools for building, debugging and optimizing the performance
+of applications accelerated by NVIDIA GPUs, runtime and math
+libraries, and documentation including programming guides,
+user manuals, and API references.
+
+
+Default Install Location of CUDA Toolkit
+
+Windows platform:
+
+%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v#.#
+
+Linux platform:
+
+/usr/local/cuda-#.#
+
+Mac platform:
+
+/Developer/NVIDIA/CUDA-#.#
+
+
+NVIDIA CUDA Samples
+
+
+Description
+
+This package includes over 100+ CUDA examples that demonstrate
+various CUDA programming principles, and efficient CUDA
+implementation of algorithms in specific application domains.
+
+
+Default Install Location of CUDA Samples
+
+Windows platform:
+
+%ProgramData%\NVIDIA Corporation\CUDA Samples\v#.#
+
+Linux platform:
+
+/usr/local/cuda-#.#/samples
+
+and
+
+$HOME/NVIDIA_CUDA-#.#_Samples
+
+Mac platform:
+
+/Developer/NVIDIA/CUDA-#.#/samples
+
+
+NVIDIA Nsight Visual Studio Edition (Windows only)
+
+
+Description
+
+NVIDIA Nsight Development Platform, Visual Studio Edition is a
+development environment integrated into Microsoft Visual
+Studio that provides tools for debugging, profiling, analyzing
+and optimizing your GPU computing and graphics applications.
+
+
+Default Install Location of Nsight Visual Studio Edition
+
+Windows platform:
+
+%ProgramFiles(x86)%\NVIDIA Corporation\Nsight Visual Studio Edition #.#
+
+
+1. License Agreement for NVIDIA Software Development Kits
+---------------------------------------------------------
+
+
+Release Date: July 26, 2018
+---------------------------
+
+
+Important Notice—Read before downloading, installing,
+copying or using the licensed software:
+-------------------------------------------------------
+
+This license agreement, including exhibits attached
+("Agreement") is a legal agreement between you and NVIDIA
+Corporation ("NVIDIA") and governs your use of a NVIDIA
+software development kit (“SDK”).
+
+Each SDK has its own set of software and materials, but here
+is a description of the types of items that may be included in
+a SDK: source code, header files, APIs, data sets and assets
+(examples include images, textures, models, scenes, videos,
+native API input/output files), binary software, sample code,
+libraries, utility programs, programming code and
+documentation.
+
+This Agreement can be accepted only by an adult of legal age
+of majority in the country in which the SDK is used.
+
+If you are entering into this Agreement on behalf of a company
+or other legal entity, you represent that you have the legal
+authority to bind the entity to this Agreement, in which case
+“you” will mean the entity you represent.
+
+If you don’t have the required age or authority to accept
+this Agreement, or if you don’t accept all the terms and
+conditions of this Agreement, do not download, install or use
+the SDK.
+
+You agree to use the SDK only for purposes that are permitted
+by (a) this Agreement, and (b) any applicable law, regulation
+or generally accepted practices or guidelines in the relevant
+jurisdictions.
+
+
+1.1. License
+
+
+1.1.1. License Grant
+
+Subject to the terms of this Agreement, NVIDIA hereby grants
+you a non-exclusive, non-transferable license, without the
+right to sublicense (except as expressly provided in this
+Agreement) to:
+
+  1. Install and use the SDK,
+
+  2. Modify and create derivative works of sample source code
+    delivered in the SDK, and
+
+  3. Distribute those portions of the SDK that are identified
+    in this Agreement as distributable, as incorporated in
+    object code format into a software application that meets
+    the distribution requirements indicated in this Agreement.
+
+
+1.1.2. Distribution Requirements
+
+These are the distribution requirements for you to exercise
+the distribution grant:
+
+  1. Your application must have material additional
+    functionality, beyond the included portions of the SDK.
+
+  2. The distributable portions of the SDK shall only be
+    accessed by your application.
+
+  3. The following notice shall be included in modifications
+    and derivative works of sample source code distributed:
+    “This software contains source code provided by NVIDIA
+    Corporation.”
+
+  4. Unless a developer tool is identified in this Agreement
+    as distributable, it is delivered for your internal use
+    only.
+
+  5. The terms under which you distribute your application
+    must be consistent with the terms of this Agreement,
+    including (without limitation) terms relating to the
+    license grant and license restrictions and protection of
+    NVIDIA’s intellectual property rights. Additionally, you
+    agree that you will protect the privacy, security and
+    legal rights of your application users.
+
+  6. You agree to notify NVIDIA in writing of any known or
+    suspected distribution or use of the SDK not in compliance
+    with the requirements of this Agreement, and to enforce
+    the terms of your agreements with respect to distributed
+    SDK.
+
+
+1.1.3. Authorized Users
+
+You may allow employees and contractors of your entity or of
+your subsidiary(ies) to access and use the SDK from your
+secure network to perform work on your behalf.
+
+If you are an academic institution you may allow users
+enrolled or employed by the academic institution to access and
+use the SDK from your secure network.
+
+You are responsible for the compliance with the terms of this
+Agreement by your authorized users. If you become aware that
+your authorized users didn’t follow the terms of this
+Agreement, you agree to take reasonable steps to resolve the
+non-compliance and prevent new occurrences.
+
+
+1.1.4. Pre-Release SDK
+
+The SDK versions identified as alpha, beta, preview or
+otherwise as pre-release, may not be fully functional, may
+contain errors or design flaws, and may have reduced or
+different security, privacy, accessibility, availability, and
+reliability standards relative to commercial versions of
+NVIDIA software and materials. Use of a pre-release SDK may
+result in unexpected results, loss of data, project delays or
+other unpredictable damage or loss.
+
+You may use a pre-release SDK at your own risk, understanding
+that pre-release SDKs are not intended for use in production
+or business-critical systems.
+
+NVIDIA may choose not to make available a commercial version
+of any pre-release SDK. NVIDIA may also choose to abandon
+development and terminate the availability of a pre-release
+SDK at any time without liability.
+
+
+1.1.5. Updates
+
+NVIDIA may, at its option, make available patches, workarounds
+or other updates to this SDK. Unless the updates are provided
+with their separate governing terms, they are deemed part of
+the SDK licensed to you as provided in this Agreement. You
+agree that the form and content of the SDK that NVIDIA
+provides may change without prior notice to you. While NVIDIA
+generally maintains compatibility between versions, NVIDIA may
+in some cases make changes that introduce incompatibilities in
+future versions of the SDK.
+
+
+1.1.6. Third Party Licenses
+
+The SDK may come bundled with, or otherwise include or be
+distributed with, third party software licensed by a NVIDIA
+supplier and/or open source software provided under an open
+source license. Use of third party software is subject to the
+third-party license terms, or in the absence of third party
+terms, the terms of this Agreement. Copyright to third party
+software is held by the copyright holders indicated in the
+third-party software or license.
+
+
+1.1.7. Reservation of Rights
+
+NVIDIA reserves all rights, title, and interest in and to the
+SDK, not expressly granted to you under this Agreement.
+
+
+1.2. Limitations
+
+The following license limitations apply to your use of the
+SDK:
+
+  1. You may not reverse engineer, decompile or disassemble,
+    or remove copyright or other proprietary notices from any
+    portion of the SDK or copies of the SDK.
+
+  2. Except as expressly provided in this Agreement, you may
+    not copy, sell, rent, sublicense, transfer, distribute,
+    modify, or create derivative works of any portion of the
+    SDK. For clarity, you may not distribute or sublicense the
+    SDK as a stand-alone product.
+
+  3. Unless you have an agreement with NVIDIA for this
+    purpose, you may not indicate that an application created
+    with the SDK is sponsored or endorsed by NVIDIA.
+
+  4. You may not bypass, disable, or circumvent any
+    encryption, security, digital rights management or
+    authentication mechanism in the SDK.
+
+  5. You may not use the SDK in any manner that would cause it
+    to become subject to an open source software license. As
+    examples, licenses that require as a condition of use,
+    modification, and/or distribution that the SDK be:
+
+      a. Disclosed or distributed in source code form;
+
+      b. Licensed for the purpose of making derivative works;
+        or
+
+      c. Redistributable at no charge.
+
+  6. Unless you have an agreement with NVIDIA for this
+    purpose, you may not use the SDK with any system or
+    application where the use or failure of the system or
+    application can reasonably be expected to threaten or
+    result in personal injury, death, or catastrophic loss.
+    Examples include use in avionics, navigation, military,
+    medical, life support or other life critical applications.
+    NVIDIA does not design, test or manufacture the SDK for
+    these critical uses and NVIDIA shall not be liable to you
+    or any third party, in whole or in part, for any claims or
+    damages arising from such uses.
+
+  7. You agree to defend, indemnify and hold harmless NVIDIA
+    and its affiliates, and their respective employees,
+    contractors, agents, officers and directors, from and
+    against any and all claims, damages, obligations, losses,
+    liabilities, costs or debt, fines, restitutions and
+    expenses (including but not limited to attorney’s fees
+    and costs incident to establishing the right of
+    indemnification) arising out of or related to your use of
+    the SDK outside of the scope of this Agreement, or not in
+    compliance with its terms.
+
+
+1.3. Ownership
+
+  1.  NVIDIA or its licensors hold all rights, title and
+    interest in and to the SDK and its modifications and
+    derivative works, including their respective intellectual
+    property rights, subject to your rights described in this
+    section. This SDK may include software and materials from
+    NVIDIA’s licensors, and these licensors are intended
+    third party beneficiaries that may enforce this Agreement
+    with respect to their intellectual property rights.
+
+  2.  You hold all rights, title and interest in and to your
+    applications and your derivative works of the sample
+    source code delivered in the SDK, including their
+    respective intellectual property rights, subject to
+    NVIDIA’s rights described in this section.
+
+  3. You may, but don’t have to, provide to NVIDIA
+    suggestions, feature requests or other feedback regarding
+    the SDK, including possible enhancements or modifications
+    to the SDK. For any feedback that you voluntarily provide,
+    you hereby grant NVIDIA and its affiliates a perpetual,
+    non-exclusive, worldwide, irrevocable license to use,
+    reproduce, modify, license, sublicense (through multiple
+    tiers of sublicensees), and distribute (through multiple
+    tiers of distributors) it without the payment of any
+    royalties or fees to you. NVIDIA will use feedback at its
+    choice. NVIDIA is constantly looking for ways to improve
+    its products, so you may send feedback to NVIDIA through
+    the developer portal at https://developer.nvidia.com.
+
+
+1.4. No Warranties
+
+THE SDK IS PROVIDED BY NVIDIA “AS IS” AND “WITH ALL
+FAULTS.” TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND
+ITS AFFILIATES EXPRESSLY DISCLAIM ALL WARRANTIES OF ANY KIND
+OR NATURE, WHETHER EXPRESS, IMPLIED OR STATUTORY, INCLUDING,
+BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE, TITLE, NON-INFRINGEMENT, OR THE
+ABSENCE OF ANY DEFECTS THEREIN, WHETHER LATENT OR PATENT. NO
+WARRANTY IS MADE ON THE BASIS OF TRADE USAGE, COURSE OF
+DEALING OR COURSE OF TRADE.
+
+
+1.5. Limitation of Liability
+
+TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS
+AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL,
+PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, LOSS
+OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF
+PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION
+WITH THIS AGREEMENT OR THE USE OR PERFORMANCE OF THE SDK,
+WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH
+OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE),
+PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF
+LIABILITY. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES’
+TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS
+AGREEMENT EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE
+NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS
+LIMIT.
+
+These exclusions and limitations of liability shall apply
+regardless if NVIDIA or its affiliates have been advised of
+the possibility of such damages, and regardless of whether a
+remedy fails its essential purpose. These exclusions and
+limitations of liability form an essential basis of the
+bargain between the parties, and, absent any of these
+exclusions or limitations of liability, the provisions of this
+Agreement, including, without limitation, the economic terms,
+would be substantially different.
+
+
+1.6. Termination
+
+  1. This Agreement will continue to apply until terminated by
+    either you or NVIDIA as described below.
+
+  2. If you want to terminate this Agreement, you may do so by
+    ceasing to use the SDK.
+
+  3. NVIDIA may, at any time, terminate this Agreement if:
+
+      a. (i) you fail to comply with any term of this
+        Agreement and the non-compliance is not fixed within
+        thirty (30) days following notice from NVIDIA (or
+        immediately if you violate NVIDIA’s intellectual
+        property rights);
+
+      b. (ii) you commence or participate in any legal
+        proceeding against NVIDIA with respect to the SDK; or
+
+      c. (iii) NVIDIA decides to no longer provide the SDK in
+        a country or, in NVIDIA’s sole discretion, the
+        continued use of it is no longer commercially viable.
+
+  4. Upon any termination of this Agreement, you agree to
+    promptly discontinue use of the SDK and destroy all copies
+    in your possession or control. Your prior distributions in
+    accordance with this Agreement are not affected by the
+    termination of this Agreement. Upon written request, you
+    will certify in writing that you have complied with your
+    commitments under this section. Upon any termination of
+    this Agreement all provisions survive except for the
+    license grant provisions.
+
+
+1.7. General
+
+If you wish to assign this Agreement or your rights and
+obligations, including by merger, consolidation, dissolution
+or operation of law, contact NVIDIA to ask for permission. Any
+attempted assignment not approved by NVIDIA in writing shall
+be void and of no effect. NVIDIA may assign, delegate or
+transfer this Agreement and its rights and obligations, and if
+to a non-affiliate you will be notified.
+
+You agree to cooperate with NVIDIA and provide reasonably
+requested information to verify your compliance with this
+Agreement.
+
+This Agreement will be governed in all respects by the laws of
+the United States and of the State of Delaware as those laws
+are applied to contracts entered into and performed entirely
+within Delaware by Delaware residents, without regard to the
+conflicts of laws principles. The United Nations Convention on
+Contracts for the International Sale of Goods is specifically
+disclaimed. You agree to all terms of this Agreement in the
+English language.
+
+The state or federal courts residing in Santa Clara County,
+California shall have exclusive jurisdiction over any dispute
+or claim arising out of this Agreement. Notwithstanding this,
+you agree that NVIDIA shall still be allowed to apply for
+injunctive remedies or an equivalent type of urgent legal
+relief in any jurisdiction.
+
+If any court of competent jurisdiction determines that any
+provision of this Agreement is illegal, invalid or
+unenforceable, such provision will be construed as limited to
+the extent necessary to be consistent with and fully
+enforceable under the law and the remaining provisions will
+remain in full force and effect. Unless otherwise specified,
+remedies are cumulative.
+
+Each party acknowledges and agrees that the other is an
+independent contractor in the performance of this Agreement.
+
+The SDK has been developed entirely at private expense and is
+“commercial items” consisting of “commercial computer
+software” and “commercial computer software
+documentation” provided with RESTRICTED RIGHTS. Use,
+duplication or disclosure by the U.S. Government or a U.S.
+Government subcontractor is subject to the restrictions in
+this Agreement pursuant to DFARS 227.7202-3(a) or as set forth
+in subparagraphs (c)(1) and (2) of the Commercial Computer
+Software - Restricted Rights clause at FAR 52.227-19, as
+applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas
+Expressway, Santa Clara, CA 95051.
+
+The SDK is subject to United States export laws and
+regulations. You agree that you will not ship, transfer or
+export the SDK into any country, or use the SDK in any manner,
+prohibited by the United States Bureau of Industry and
+Security or economic sanctions regulations administered by the
+U.S. Department of Treasury’s Office of Foreign Assets
+Control (OFAC), or any applicable export laws, restrictions or
+regulations. These laws include restrictions on destinations,
+end users and end use. By accepting this Agreement, you
+confirm that you are not a resident or citizen of any country
+currently embargoed by the U.S. and that you are not otherwise
+prohibited from receiving the SDK.
+
+Any notice delivered by NVIDIA to you under this Agreement
+will be delivered via mail, email or fax. You agree that any
+notices that NVIDIA sends you electronically will satisfy any
+legal communication requirements. Please direct your legal
+notices or other correspondence to NVIDIA Corporation, 2788
+San Tomas Expressway, Santa Clara, California 95051, United
+States of America, Attention: Legal Department.
+
+This Agreement and any exhibits incorporated into this
+Agreement constitute the entire agreement of the parties with
+respect to the subject matter of this Agreement and supersede
+all prior negotiations or documentation exchanged between the
+parties relating to this SDK license. Any additional and/or
+conflicting terms on documents issued by you are null, void,
+and invalid. Any amendment or waiver under this Agreement
+shall be in writing and signed by representatives of both
+parties.
+
+
+2. CUDA Toolkit Supplement to Software License Agreement for
+NVIDIA Software Development Kits
+------------------------------------------------------------
+
+
+Release date: August 16, 2018
+-----------------------------
+
+The terms in this supplement govern your use of the NVIDIA
+CUDA Toolkit SDK under the terms of your license agreement
+(“Agreement”) as modified by this supplement. Capitalized
+terms used but not defined below have the meaning assigned to
+them in the Agreement.
+
+This supplement is an exhibit to the Agreement and is
+incorporated as an integral part of the Agreement. In the
+event of conflict between the terms in this supplement and the
+terms in the Agreement, the terms in this supplement govern.
+
+
+2.1. License Scope
+
+The SDK is licensed for you to develop applications only for
+use in systems with NVIDIA GPUs.
+
+
+2.2. Distribution
+
+The portions of the SDK that are distributable under the
+Agreement are listed in Attachment A.
+
+
+2.3. Operating Systems
+
+Those portions of the SDK designed exclusively for use on the
+Linux or FreeBSD operating systems, or other operating systems
+derived from the source code to these operating systems, may
+be copied and redistributed for use in accordance with this
+Agreement, provided that the object code files are not
+modified in any way (except for unzipping of compressed
+files).
+
+
+2.4. Audio and Video Encoders and Decoders
+
+You acknowledge and agree that it is your sole responsibility
+to obtain any additional third-party licenses required to
+make, have made, use, have used, sell, import, and offer for
+sale your products or services that include or incorporate any
+third-party software and content relating to audio and/or
+video encoders and decoders from, including but not limited
+to, Microsoft, Thomson, Fraunhofer IIS, Sisvel S.p.A.,
+MPEG-LA, and Coding Technologies. NVIDIA does not grant to you
+under this Agreement any necessary patent or other rights with
+respect to any audio and/or video encoders and decoders.
+
+
+2.5. Licensing
+
+If the distribution terms in this Agreement are not suitable
+for your organization, or for any questions regarding this
+Agreement, please contact NVIDIA at
+nvidia-compute-license-questions@nvidia.com.
+
+
+2.6. Attachment A
+
+The following portions of the SDK are distributable under the
+Agreement:
+
+Component
+
+CUDA Runtime
+
+Windows
+
+cudart.dll, cudart_static.lib, cudadevrt.lib
+
+Mac OSX
+
+libcudart.dylib, libcudart_static.a, libcudadevrt.a
+
+Linux
+
+libcudart.so, libcudart_static.a, libcudadevrt.a
+
+Android
+
+libcudart.so, libcudart_static.a, libcudadevrt.a
+
+Component
+
+CUDA FFT Library
+
+Windows
+
+cufft.dll, cufftw.dll, cufft.lib, cufftw.lib
+
+Mac OSX
+
+libcufft.dylib, libcufft_static.a, libcufftw.dylib,
+libcufftw_static.a
+
+Linux
+
+libcufft.so, libcufft_static.a, libcufftw.so,
+libcufftw_static.a
+
+Android
+
+libcufft.so, libcufft_static.a, libcufftw.so,
+libcufftw_static.a
+
+Component
+
+CUDA BLAS Library
+
+Windows
+
+cublas.dll, cublasLt.dll
+
+Mac OSX
+
+libcublas.dylib, libcublasLt.dylib, libcublas_static.a,
+libcublasLt_static.a
+
+Linux
+
+libcublas.so, libcublasLt.so, libcublas_static.a,
+libcublasLt_static.a
+
+Android
+
+libcublas.so, libcublasLt.so, libcublas_static.a,
+libcublasLt_static.a
+
+Component
+
+NVIDIA "Drop-in" BLAS Library
+
+Windows
+
+nvblas.dll
+
+Mac OSX
+
+libnvblas.dylib
+
+Linux
+
+libnvblas.so
+
+Component
+
+CUDA Sparse Matrix Library
+
+Windows
+
+cusparse.dll, cusparse.lib
+
+Mac OSX
+
+libcusparse.dylib, libcusparse_static.a
+
+Linux
+
+libcusparse.so, libcusparse_static.a
+
+Android
+
+libcusparse.so, libcusparse_static.a
+
+Component
+
+CUDA Linear Solver Library
+
+Windows
+
+cusolver.dll, cusolver.lib
+
+Mac OSX
+
+libcusolver.dylib, libcusolver_static.a
+
+Linux
+
+libcusolver.so, libcusolver_static.a
+
+Android
+
+libcusolver.so, libcusolver_static.a
+
+Component
+
+CUDA Random Number Generation Library
+
+Windows
+
+curand.dll, curand.lib
+
+Mac OSX
+
+libcurand.dylib, libcurand_static.a
+
+Linux
+
+libcurand.so, libcurand_static.a
+
+Android
+
+libcurand.so, libcurand_static.a
+
+Component
+
+CUDA Accelerated Graph Library
+
+Component
+
+NVIDIA Performance Primitives Library
+
+Windows
+
+nppc.dll, nppc.lib, nppial.dll, nppial.lib, nppicc.dll,
+nppicc.lib, nppicom.dll, nppicom.lib, nppidei.dll,
+nppidei.lib, nppif.dll, nppif.lib, nppig.dll, nppig.lib,
+nppim.dll, nppim.lib, nppist.dll, nppist.lib, nppisu.dll,
+nppisu.lib, nppitc.dll, nppitc.lib, npps.dll, npps.lib
+
+Mac OSX
+
+libnppc.dylib, libnppc_static.a, libnppial.dylib,
+libnppial_static.a, libnppicc.dylib, libnppicc_static.a,
+libnppicom.dylib, libnppicom_static.a, libnppidei.dylib,
+libnppidei_static.a, libnppif.dylib, libnppif_static.a,
+libnppig.dylib, libnppig_static.a, libnppim.dylib,
+libnppisu_static.a, libnppitc.dylib, libnppitc_static.a,
+libnpps.dylib, libnpps_static.a
+
+Linux
+
+libnppc.so, libnppc_static.a, libnppial.so,
+libnppial_static.a, libnppicc.so, libnppicc_static.a,
+libnppicom.so, libnppicom_static.a, libnppidei.so,
+libnppidei_static.a, libnppif.so, libnppif_static.a,
+libnppig.so, libnppig_static.a, libnppim.so,
+libnppim_static.a, libnppist.so, libnppist_static.a,
+libnppisu.so, libnppisu_static.a, libnppitc.so,
+libnppitc_static.a, libnpps.so, libnpps_static.a
+
+Android
+
+libnppc.so, libnppc_static.a, libnppial.so,
+libnppial_static.a, libnppicc.so, libnppicc_static.a,
+libnppicom.so, libnppicom_static.a, libnppidei.so,
+libnppidei_static.a, libnppif.so, libnppif_static.a,
+libnppig.so, libnppig_static.a, libnppim.so,
+libnppim_static.a, libnppist.so, libnppist_static.a,
+libnppisu.so, libnppisu_static.a, libnppitc.so,
+libnppitc_static.a, libnpps.so, libnpps_static.a
+
+Component
+
+NVIDIA JPEG Library
+
+Linux
+
+libnvjpeg.so, libnvjpeg_static.a
+
+Component
+
+Internal common library required for statically linking to
+cuBLAS, cuSPARSE, cuFFT, cuRAND, nvJPEG and NPP
+
+Mac OSX
+
+libculibos.a
+
+Linux
+
+libculibos.a
+
+Component
+
+NVIDIA Runtime Compilation Library and Header
+
+All
+
+nvrtc.h
+
+Windows
+
+nvrtc.dll, nvrtc-builtins.dll
+
+Mac OSX
+
+libnvrtc.dylib, libnvrtc-builtins.dylib
+
+Linux
+
+libnvrtc.so, libnvrtc-builtins.so
+
+Component
+
+NVIDIA Optimizing Compiler Library
+
+Windows
+
+nvvm.dll
+
+Mac OSX
+
+libnvvm.dylib
+
+Linux
+
+libnvvm.so
+
+Component
+
+NVIDIA Common Device Math Functions Library
+
+Windows
+
+libdevice.10.bc
+
+Mac OSX
+
+libdevice.10.bc
+
+Linux
+
+libdevice.10.bc
+
+Component
+
+CUDA Occupancy Calculation Header Library
+
+All
+
+cuda_occupancy.h
+
+Component
+
+CUDA Half Precision Headers
+
+All
+
+cuda_fp16.h, cuda_fp16.hpp
+
+Component
+
+CUDA Profiling Tools Interface (CUPTI) Library
+
+Windows
+
+cupti.dll
+
+Mac OSX
+
+libcupti.dylib
+
+Linux
+
+libcupti.so
+
+Component
+
+NVIDIA Tools Extension Library
+
+Windows
+
+nvToolsExt.dll, nvToolsExt.lib
+
+Mac OSX
+
+libnvToolsExt.dylib
+
+Linux
+
+libnvToolsExt.so
+
+Component
+
+NVIDIA CUDA Driver Libraries
+
+Linux
+
+libcuda.so, libnvidia-fatbinaryloader.so,
+libnvidia-ptxjitcompiler.so
+
+The NVIDIA CUDA Driver Libraries are only distributable in
+applications that meet these criteria:
+
+  1. The application was developed starting from a NVIDIA CUDA
+    container obtained from Docker Hub or the NVIDIA GPU
+    Cloud, and
+
+  2. The resulting application is packaged as a Docker
+    container and distributed to users on Docker Hub or the
+    NVIDIA GPU Cloud only.
+
+
+2.7. Attachment B
+
+
+Additional Licensing Obligations
+
+The following third party components included in the SOFTWARE
+are licensed to Licensee pursuant to the following terms and
+conditions:
+
+  1. Licensee's use of the GDB third party component is
+    subject to the terms and conditions of GNU GPL v3:
+
+    This product includes copyrighted third-party software licensed
+    under the terms of the GNU General Public License v3 ("GPL v3").
+    All third-party software packages are copyright by their respective
+    authors. GPL v3 terms and conditions are hereby incorporated into
+    the Agreement by this reference:     http://www.gnu.org/licenses/gpl.txt
+
+    Consistent with these licensing requirements, the software
+    listed below is provided under the terms of the specified
+    open source software licenses. To obtain source code for
+    software provided under licenses that require
+    redistribution of source code, including the GNU General
+    Public License (GPL) and GNU Lesser General Public License
+    (LGPL), contact oss-requests@nvidia.com. This offer is
+    valid for a period of three (3) years from the date of the
+    distribution of this product by NVIDIA CORPORATION.
+
+    Component          License
+    CUDA-GDB           GPL v3
+
+  2. Licensee represents and warrants that any and all third
+    party licensing and/or royalty payment obligations in
+    connection with Licensee's use of the H.264 video codecs
+    are solely the responsibility of Licensee.
+
+  3. Licensee's use of the Thrust library is subject to the
+    terms and conditions of the Apache License Version 2.0.
+    All third-party software packages are copyright by their
+    respective authors. Apache License Version 2.0 terms and
+    conditions are hereby incorporated into the Agreement by
+    this reference.
+    http://www.apache.org/licenses/LICENSE-2.0.html
+
+    In addition, Licensee acknowledges the following notice:
+    Thrust includes source code from the Boost Iterator,
+    Tuple, System, and Random Number libraries.
+
+    Boost Software License - Version 1.0 - August 17th, 2003
+    . . . .
+
+    Permission is hereby granted, free of charge, to any person or
+    organization obtaining a copy of the software and accompanying
+    documentation covered by this license (the "Software") to use,
+    reproduce, display, distribute, execute, and transmit the Software,
+    and to prepare derivative works of the Software, and to permit
+    third-parties to whom the Software is furnished to do so, all
+    subject to the following:
+
+    The copyright notices in the Software and this entire statement,
+    including the above license grant, this restriction and the following
+    disclaimer, must be included in all copies of the Software, in whole
+    or in part, and all derivative works of the Software, unless such
+    copies or derivative works are solely in the form of machine-executable
+    object code generated by a source language processor.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND
+    NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
+    ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR
+    OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING
+    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+    OTHER DEALINGS IN THE SOFTWARE.
+
+  4. Licensee's use of the LLVM third party component is
+    subject to the following terms and conditions:
+
+    ======================================================
+    LLVM Release License
+    ======================================================
+    University of Illinois/NCSA
+    Open Source License
+
+    Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign.
+    All rights reserved.
+
+    Developed by:
+
+        LLVM Team
+
+        University of Illinois at Urbana-Champaign
+
+        http://llvm.org
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to
+    deal with the Software without restriction, including without limitation the
+    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+    sell copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    *  Redistributions of source code must retain the above copyright notice,
+       this list of conditions and the following disclaimers.
+
+    *  Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimers in the
+       documentation and/or other materials provided with the distribution.
+
+    *  Neither the names of the LLVM Team, University of Illinois at Urbana-
+       Champaign, nor the names of its contributors may be used to endorse or
+       promote products derived from this Software without specific prior
+       written permission.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+    THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+    OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+    ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+    DEALINGS WITH THE SOFTWARE.
+
+  5. Licensee's use (e.g. nvprof) of the PCRE third party
+    component is subject to the following terms and
+    conditions:
+
+    ------------
+    PCRE LICENCE
+    ------------
+    PCRE is a library of functions to support regular expressions whose syntax
+    and semantics are as close as possible to those of the Perl 5 language.
+    Release 8 of PCRE is distributed under the terms of the "BSD" licence, as
+    specified below. The documentation for PCRE, supplied in the "doc"
+    directory, is distributed under the same terms as the software itself. The
+    basic library functions are written in C and are freestanding. Also
+    included in the distribution is a set of C++ wrapper functions, and a just-
+    in-time compiler that can be used to optimize pattern matching. These are
+    both optional features that can be omitted when the library is built.
+
+    THE BASIC LIBRARY FUNCTIONS
+    ---------------------------
+    Written by:       Philip Hazel
+    Email local part: ph10
+    Email domain:     cam.ac.uk
+    University of Cambridge Computing Service,
+    Cambridge, England.
+    Copyright (c) 1997-2012 University of Cambridge
+    All rights reserved.
+
+    PCRE JUST-IN-TIME COMPILATION SUPPORT
+    -------------------------------------
+    Written by:       Zoltan Herczeg
+    Email local part: hzmester
+    Email domain:     freemail.hu
+    Copyright(c) 2010-2012 Zoltan Herczeg
+    All rights reserved.
+
+    STACK-LESS JUST-IN-TIME COMPILER
+    --------------------------------
+    Written by:       Zoltan Herczeg
+    Email local part: hzmester
+    Email domain:     freemail.hu
+    Copyright(c) 2009-2012 Zoltan Herczeg
+    All rights reserved.
+
+    THE C++ WRAPPER FUNCTIONS
+    -------------------------
+    Contributed by:   Google Inc.
+    Copyright (c) 2007-2012, Google Inc.
+    All rights reserved.
+
+    THE "BSD" LICENCE
+    -----------------
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are met:
+
+      * Redistributions of source code must retain the above copyright notice,
+        this list of conditions and the following disclaimer.
+
+      * Redistributions in binary form must reproduce the above copyright
+        notice, this list of conditions and the following disclaimer in the
+        documentation and/or other materials provided with the distribution.
+
+      * Neither the name of the University of Cambridge nor the name of Google
+        Inc. nor the names of their contributors may be used to endorse or
+        promote products derived from this software without specific prior
+        written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+  6. Some of the cuBLAS library routines were written by or
+    derived from code written by Vasily Volkov and are subject
+    to the Modified Berkeley Software Distribution License as
+    follows:
+
+    Copyright (c) 2007-2009, Regents of the University of California
+
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+        * Redistributions of source code must retain the above copyright
+          notice, this list of conditions and the following disclaimer.
+        * Redistributions in binary form must reproduce the above
+          copyright notice, this list of conditions and the following
+          disclaimer in the documentation and/or other materials provided
+          with the distribution.
+        * Neither the name of the University of California, Berkeley nor
+          the names of its contributors may be used to endorse or promote
+          products derived from this software without specific prior
+          written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR
+    IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+    DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+    INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+    HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+    STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+    IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+  7. Some of the cuBLAS library routines were written by or
+    derived from code written by Davide Barbieri and are
+    subject to the Modified Berkeley Software Distribution
+    License as follows:
+
+    Copyright (c) 2008-2009 Davide Barbieri @ University of Rome Tor Vergata.
+
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+        * Redistributions of source code must retain the above copyright
+          notice, this list of conditions and the following disclaimer.
+        * Redistributions in binary form must reproduce the above
+          copyright notice, this list of conditions and the following
+          disclaimer in the documentation and/or other materials provided
+          with the distribution.
+        * The name of the author may not be used to endorse or promote
+          products derived from this software without specific prior
+          written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR
+    IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+    DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+    INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+    HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+    STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+    IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+  8. Some of the cuBLAS library routines were derived from
+    code developed by the University of Tennessee and are
+    subject to the Modified Berkeley Software Distribution
+    License as follows:
+
+    Copyright (c) 2010 The University of Tennessee.
+
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+        * Redistributions of source code must retain the above copyright
+          notice, this list of conditions and the following disclaimer.
+        * Redistributions in binary form must reproduce the above
+          copyright notice, this list of conditions and the following
+          disclaimer listed in this license in the documentation and/or
+          other materials provided with the distribution.
+        * Neither the name of the copyright holders nor the names of its
+          contributors may be used to endorse or promote products derived
+          from this software without specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  9. Some of the cuBLAS library routines were written by or
+    derived from code written by Jonathan Hogg and are subject
+    to the Modified Berkeley Software Distribution License as
+    follows:
+
+    Copyright (c) 2012, The Science and Technology Facilities Council (STFC).
+
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+        * Redistributions of source code must retain the above copyright
+          notice, this list of conditions and the following disclaimer.
+        * Redistributions in binary form must reproduce the above
+          copyright notice, this list of conditions and the following
+          disclaimer in the documentation and/or other materials provided
+          with the distribution.
+        * Neither the name of the STFC nor the names of its contributors
+          may be used to endorse or promote products derived from this
+          software without specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE STFC BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+    OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+    IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  10. Some of the cuBLAS library routines were written by or
+    derived from code written by Ahmad M. Abdelfattah, David
+    Keyes, and Hatem Ltaief, and are subject to the Apache
+    License, Version 2.0, as follows:
+
+     -- (C) Copyright 2013 King Abdullah University of Science and Technology
+      Authors:
+      Ahmad Abdelfattah (ahmad.ahmad@kaust.edu.sa)
+      David Keyes (david.keyes@kaust.edu.sa)
+      Hatem Ltaief (hatem.ltaief@kaust.edu.sa)
+
+      Redistribution  and  use  in  source and binary forms, with or without
+      modification,  are  permitted  provided  that the following conditions
+      are met:
+
+      * Redistributions  of  source  code  must  retain  the above copyright
+        notice,  this  list  of  conditions  and  the  following  disclaimer.
+      * Redistributions  in  binary  form must reproduce the above copyright
+        notice,  this list of conditions and the following disclaimer in the
+        documentation  and/or other materials provided with the distribution.
+      * Neither  the  name of the King Abdullah University of Science and
+        Technology nor the names of its contributors may be used to endorse
+        or promote products derived from this software without specific prior
+        written permission.
+
+      THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+      ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+      LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+      A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+      HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+      SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
+      LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+      DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+      THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+      (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+      OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  11. Some of the cuSPARSE library routines were written by or
+    derived from code written by Li-Wen Chang and are subject
+    to the NCSA Open Source License as follows:
+
+    Copyright (c) 2012, University of Illinois.
+
+    All rights reserved.
+
+    Developed by: IMPACT Group, University of Illinois, http://impact.crhc.illinois.edu
+
+    Permission is hereby granted, free of charge, to any person obtaining
+    a copy of this software and associated documentation files (the
+    "Software"), to deal with the Software without restriction, including
+    without limitation the rights to use, copy, modify, merge, publish,
+    distribute, sublicense, and/or sell copies of the Software, and to
+    permit persons to whom the Software is furnished to do so, subject to
+    the following conditions:
+        * Redistributions of source code must retain the above copyright
+          notice, this list of conditions and the following disclaimer.
+        * Redistributions in binary form must reproduce the above
+          copyright notice, this list of conditions and the following
+          disclaimers in the documentation and/or other materials provided
+          with the distribution.
+        * Neither the names of IMPACT Group, University of Illinois, nor
+          the names of its contributors may be used to endorse or promote
+          products derived from this Software without specific prior
+          written permission.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+    NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+    IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+    IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+    SOFTWARE.
+
+  12. Some of the cuRAND library routines were written by or
+    derived from code written by Mutsuo Saito and Makoto
+    Matsumoto and are subject to the following license:
+
+    Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
+    University. All rights reserved.
+
+    Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
+    University and University of Tokyo.  All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+        * Redistributions of source code must retain the above copyright
+          notice, this list of conditions and the following disclaimer.
+        * Redistributions in binary form must reproduce the above
+          copyright notice, this list of conditions and the following
+          disclaimer in the documentation and/or other materials provided
+          with the distribution.
+        * Neither the name of the Hiroshima University nor the names of
+          its contributors may be used to endorse or promote products
+          derived from this software without specific prior written
+          permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  13. Some of the cuRAND library routines were derived from
+    code developed by D. E. Shaw Research and are subject to
+    the following license:
+
+    Copyright 2010-2011, D. E. Shaw Research.
+
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+        * Redistributions of source code must retain the above copyright
+          notice, this list of conditions, and the following disclaimer.
+        * Redistributions in binary form must reproduce the above
+          copyright notice, this list of conditions, and the following
+          disclaimer in the documentation and/or other materials provided
+          with the distribution.
+        * Neither the name of D. E. Shaw Research nor the names of its
+          contributors may be used to endorse or promote products derived
+          from this software without specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  14. Some of the Math library routines were written by or
+    derived from code developed by Norbert Juffa and are
+    subject to the following license:
+
+    Copyright (c) 2015-2017, Norbert Juffa
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions
+    are met:
+
+    1. Redistributions of source code must retain the above copyright
+       notice, this list of conditions and the following disclaimer.
+
+    2. Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  15. Licensee's use of the lz4 third party component is
+    subject to the following terms and conditions:
+
+    Copyright (C) 2011-2013, Yann Collet.
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+
+        * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+        * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  16. The NPP library uses code from the Boost Math Toolkit,
+    and is subject to the following license:
+
+    Boost Software License - Version 1.0 - August 17th, 2003
+    . . . .
+
+    Permission is hereby granted, free of charge, to any person or
+    organization obtaining a copy of the software and accompanying
+    documentation covered by this license (the "Software") to use,
+    reproduce, display, distribute, execute, and transmit the Software,
+    and to prepare derivative works of the Software, and to permit
+    third-parties to whom the Software is furnished to do so, all
+    subject to the following:
+
+    The copyright notices in the Software and this entire statement,
+    including the above license grant, this restriction and the following
+    disclaimer, must be included in all copies of the Software, in whole
+    or in part, and all derivative works of the Software, unless such
+    copies or derivative works are solely in the form of machine-executable
+    object code generated by a source language processor.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND
+    NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
+    ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR
+    OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING
+    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+    OTHER DEALINGS IN THE SOFTWARE.
+
+  17. Portions of the Nsight Eclipse Edition are subject to the
+    following license:
+
+    The Eclipse Foundation makes available all content in this plug-in
+    ("Content"). Unless otherwise indicated below, the Content is provided
+    to you under the terms and conditions of the Eclipse Public License
+    Version 1.0 ("EPL"). A copy of the EPL is available at
+    http://www.eclipse.org/legal/epl-v10.html. For purposes of the EPL, "Program"
+    will mean the Content.
+
+    If you did not receive this Content directly from the Eclipse
+    Foundation, the Content is being redistributed by another party
+    ("Redistributor") and different terms and conditions may apply to your
+    use of any object code in the Content. Check the Redistributor's
+    license that was provided with the Content. If no such license exists,
+    contact the Redistributor. Unless otherwise indicated below, the terms
+    and conditions of the EPL still apply to any source code in the
+    Content and such source code may be obtained at http://www.eclipse.org.
+
+  18. Some of the cuBLAS library routines use code from
+    OpenAI, which is subject to the following license:
+
+    License URL
+    https://github.com/openai/openai-gemm/blob/master/LICENSE
+
+    License Text
+    The MIT License
+
+    Copyright (c) 2016 OpenAI (http://openai.com), 2016 Google Inc.
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to deal
+    in the Software without restriction, including without limitation the rights
+    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+    copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be included in
+    all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+    THE SOFTWARE.
+
+  19. Licensee's use of the Visual Studio Setup Configuration
+    Samples is subject to the following license:
+
+    The MIT License (MIT)
+    Copyright (C) Microsoft Corporation. All rights reserved.
+
+    Permission is hereby granted, free of charge, to any person
+    obtaining a copy of this software and associated documentation
+    files (the "Software"), to deal in the Software without restriction,
+    including without limitation the rights to use, copy, modify, merge,
+    publish, distribute, sublicense, and/or sell copies of the Software,
+    and to permit persons to whom the Software is furnished to do so,
+    subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be included
+    in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+    OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+  20. Licensee's use of linmath.h header for CPU functions for
+    GL vector/matrix operations from lunarG is subject to the
+    Apache License Version 2.0.
+
+  21. The DX12-CUDA sample uses the d3dx12.h header, which is
+    subject to the MIT license.
+
+-----------------
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/nvidia_nvshmem_cu13-3.4.5.dist-info/licenses/License.txt b/URSA/.venv_ursa/lib/python3.12/site-packages/nvidia_nvshmem_cu13-3.4.5.dist-info/licenses/License.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b491c70e0aef319022ded661e111ddbd45b8a17f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/nvidia_nvshmem_cu13-3.4.5.dist-info/licenses/License.txt
@@ -0,0 +1,1568 @@
+End User License Agreement
+--------------------------
+
+
+Preface
+-------
+
+The Software License Agreement in Chapter 1 and the Supplement
+in Chapter 2 contain license terms and conditions that govern
+the use of NVIDIA software. By accepting this agreement, you
+agree to comply with all the terms and conditions applicable
+to the product(s) included herein.
+
+
+NVIDIA Driver
+
+
+Description
+
+This package contains the operating system driver and
+fundamental system software components for NVIDIA GPUs.
+
+
+NVIDIA CUDA Toolkit
+
+
+Description
+
+The NVIDIA CUDA Toolkit provides command-line and graphical
+tools for building, debugging and optimizing the performance
+of applications accelerated by NVIDIA GPUs, runtime and math
+libraries, and documentation including programming guides,
+user manuals, and API references.
+
+
+Default Install Location of CUDA Toolkit
+
+Windows platform:
+
+%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v#.#
+
+Linux platform:
+
+/usr/local/cuda-#.#
+
+Mac platform:
+
+/Developer/NVIDIA/CUDA-#.#
+
+
+NVIDIA CUDA Samples
+
+
+Description
+
+This package includes more than 100 CUDA examples that demonstrate
+various CUDA programming principles, and efficient CUDA
+implementation of algorithms in specific application domains.
+
+
+Default Install Location of CUDA Samples
+
+Windows platform:
+
+%ProgramData%\NVIDIA Corporation\CUDA Samples\v#.#
+
+Linux platform:
+
+/usr/local/cuda-#.#/samples
+
+and
+
+$HOME/NVIDIA_CUDA-#.#_Samples
+
+Mac platform:
+
+/Developer/NVIDIA/CUDA-#.#/samples
+
+
+NVIDIA Nsight Visual Studio Edition (Windows only)
+
+
+Description
+
+NVIDIA Nsight Development Platform, Visual Studio Edition is a
+development environment integrated into Microsoft Visual
+Studio that provides tools for debugging, profiling, analyzing
+and optimizing your GPU computing and graphics applications.
+
+
+Default Install Location of Nsight Visual Studio Edition
+
+Windows platform:
+
+%ProgramFiles(x86)%\NVIDIA Corporation\Nsight Visual Studio Edition #.#
+
+
+1. License Agreement for NVIDIA Software Development Kits
+---------------------------------------------------------
+
+
+Release Date: July 26, 2018
+---------------------------
+
+
+Important Notice—Read before downloading, installing,
+copying or using the licensed software:
+-------------------------------------------------------
+
+This license agreement, including exhibits attached
+(“Agreement”) is a legal agreement between you and NVIDIA
+Corporation (“NVIDIA”) and governs your use of a NVIDIA
+software development kit (“SDK”).
+
+Each SDK has its own set of software and materials, but here
+is a description of the types of items that may be included in
+a SDK: source code, header files, APIs, data sets and assets
+(examples include images, textures, models, scenes, videos,
+native API input/output files), binary software, sample code,
+libraries, utility programs, programming code and
+documentation.
+
+This Agreement can be accepted only by an adult of legal age
+of majority in the country in which the SDK is used.
+
+If you are entering into this Agreement on behalf of a company
+or other legal entity, you represent that you have the legal
+authority to bind the entity to this Agreement, in which case
+“you” will mean the entity you represent.
+
+If you don’t have the required age or authority to accept
+this Agreement, or if you don’t accept all the terms and
+conditions of this Agreement, do not download, install or use
+the SDK.
+
+You agree to use the SDK only for purposes that are permitted
+by (a) this Agreement, and (b) any applicable law, regulation
+or generally accepted practices or guidelines in the relevant
+jurisdictions.
+
+
+1.1. License
+
+
+1.1.1. License Grant
+
+Subject to the terms of this Agreement, NVIDIA hereby grants
+you a non-exclusive, non-transferable license, without the
+right to sublicense (except as expressly provided in this
+Agreement) to:
+
+  1. Install and use the SDK,
+
+  2. Modify and create derivative works of sample source code
+    delivered in the SDK, and
+
+  3. Distribute those portions of the SDK that are identified
+    in this Agreement as distributable, as incorporated in
+    object code format into a software application that meets
+    the distribution requirements indicated in this Agreement.
+
+
+1.1.2. Distribution Requirements
+
+These are the distribution requirements for you to exercise
+the distribution grant:
+
+  1. Your application must have material additional
+    functionality, beyond the included portions of the SDK.
+
+  2. The distributable portions of the SDK shall only be
+    accessed by your application.
+
+  3. The following notice shall be included in modifications
+    and derivative works of sample source code distributed:
+    “This software contains source code provided by NVIDIA
+    Corporation.”
+
+  4. Unless a developer tool is identified in this Agreement
+    as distributable, it is delivered for your internal use
+    only.
+
+  5. The terms under which you distribute your application
+    must be consistent with the terms of this Agreement,
+    including (without limitation) terms relating to the
+    license grant and license restrictions and protection of
+    NVIDIA’s intellectual property rights. Additionally, you
+    agree that you will protect the privacy, security and
+    legal rights of your application users.
+
+  6. You agree to notify NVIDIA in writing of any known or
+    suspected distribution or use of the SDK not in compliance
+    with the requirements of this Agreement, and to enforce
+    the terms of your agreements with respect to distributed
+    SDK.
+
+
+1.1.3. Authorized Users
+
+You may allow employees and contractors of your entity or of
+your subsidiary(ies) to access and use the SDK from your
+secure network to perform work on your behalf.
+
+If you are an academic institution you may allow users
+enrolled or employed by the academic institution to access and
+use the SDK from your secure network.
+
+You are responsible for the compliance with the terms of this
+Agreement by your authorized users. If you become aware that
+your authorized users didn’t follow the terms of this
+Agreement, you agree to take reasonable steps to resolve the
+non-compliance and prevent new occurrences.
+
+
+1.1.4. Pre-Release SDK
+
+The SDK versions identified as alpha, beta, preview or
+otherwise as pre-release, may not be fully functional, may
+contain errors or design flaws, and may have reduced or
+different security, privacy, accessibility, availability, and
+reliability standards relative to commercial versions of
+NVIDIA software and materials. Use of a pre-release SDK may
+result in unexpected results, loss of data, project delays or
+other unpredictable damage or loss.
+
+You may use a pre-release SDK at your own risk, understanding
+that pre-release SDKs are not intended for use in production
+or business-critical systems.
+
+NVIDIA may choose not to make available a commercial version
+of any pre-release SDK. NVIDIA may also choose to abandon
+development and terminate the availability of a pre-release
+SDK at any time without liability.
+
+
+1.1.5. Updates
+
+NVIDIA may, at its option, make available patches, workarounds
+or other updates to this SDK. Unless the updates are provided
+with their separate governing terms, they are deemed part of
+the SDK licensed to you as provided in this Agreement. You
+agree that the form and content of the SDK that NVIDIA
+provides may change without prior notice to you. While NVIDIA
+generally maintains compatibility between versions, NVIDIA may
+in some cases make changes that introduce incompatibilities in
+future versions of the SDK.
+
+
+1.1.6. Third Party Licenses
+
+The SDK may come bundled with, or otherwise include or be
+distributed with, third party software licensed by a NVIDIA
+supplier and/or open source software provided under an open
+source license. Use of third party software is subject to the
+third-party license terms, or in the absence of third party
+terms, the terms of this Agreement. Copyright to third party
+software is held by the copyright holders indicated in the
+third-party software or license.
+
+
+1.1.7. Reservation of Rights
+
+NVIDIA reserves all rights, title, and interest in and to the
+SDK, not expressly granted to you under this Agreement.
+
+
+1.2. Limitations
+
+The following license limitations apply to your use of the
+SDK:
+
+  1. You may not reverse engineer, decompile or disassemble,
+    or remove copyright or other proprietary notices from any
+    portion of the SDK or copies of the SDK.
+
+  2. Except as expressly provided in this Agreement, you may
+    not copy, sell, rent, sublicense, transfer, distribute,
+    modify, or create derivative works of any portion of the
+    SDK. For clarity, you may not distribute or sublicense the
+    SDK as a stand-alone product.
+
+  3. Unless you have an agreement with NVIDIA for this
+    purpose, you may not indicate that an application created
+    with the SDK is sponsored or endorsed by NVIDIA.
+
+  4. You may not bypass, disable, or circumvent any
+    encryption, security, digital rights management or
+    authentication mechanism in the SDK.
+
+  5. You may not use the SDK in any manner that would cause it
+    to become subject to an open source software license. As
+    examples, licenses that require as a condition of use,
+    modification, and/or distribution that the SDK be:
+
+      a. Disclosed or distributed in source code form;
+
+      b. Licensed for the purpose of making derivative works;
+        or
+
+      c. Redistributable at no charge.
+
+  6. Unless you have an agreement with NVIDIA for this
+    purpose, you may not use the SDK with any system or
+    application where the use or failure of the system or
+    application can reasonably be expected to threaten or
+    result in personal injury, death, or catastrophic loss.
+    Examples include use in avionics, navigation, military,
+    medical, life support or other life critical applications.
+    NVIDIA does not design, test or manufacture the SDK for
+    these critical uses and NVIDIA shall not be liable to you
+    or any third party, in whole or in part, for any claims or
+    damages arising from such uses.
+
+  7. You agree to defend, indemnify and hold harmless NVIDIA
+    and its affiliates, and their respective employees,
+    contractors, agents, officers and directors, from and
+    against any and all claims, damages, obligations, losses,
+    liabilities, costs or debt, fines, restitutions and
+    expenses (including but not limited to attorney’s fees
+    and costs incident to establishing the right of
+    indemnification) arising out of or related to your use of
+    the SDK outside of the scope of this Agreement, or not in
+    compliance with its terms.
+
+
+1.3. Ownership
+
+  1.  NVIDIA or its licensors hold all rights, title and
+    interest in and to the SDK and its modifications and
+    derivative works, including their respective intellectual
+    property rights, subject to your rights described in this
+    section. This SDK may include software and materials from
+    NVIDIA’s licensors, and these licensors are intended
+    third party beneficiaries that may enforce this Agreement
+    with respect to their intellectual property rights.
+
+  2.  You hold all rights, title and interest in and to your
+    applications and your derivative works of the sample
+    source code delivered in the SDK, including their
+    respective intellectual property rights, subject to
+    NVIDIA’s rights described in this section.
+
+  3. You may, but don’t have to, provide to NVIDIA
+    suggestions, feature requests or other feedback regarding
+    the SDK, including possible enhancements or modifications
+    to the SDK. For any feedback that you voluntarily provide,
+    you hereby grant NVIDIA and its affiliates a perpetual,
+    non-exclusive, worldwide, irrevocable license to use,
+    reproduce, modify, license, sublicense (through multiple
+    tiers of sublicensees), and distribute (through multiple
+    tiers of distributors) it without the payment of any
+    royalties or fees to you. NVIDIA will use feedback at its
+    choice. NVIDIA is constantly looking for ways to improve
+    its products, so you may send feedback to NVIDIA through
+    the developer portal at https://developer.nvidia.com.
+
+
+1.4. No Warranties
+
+THE SDK IS PROVIDED BY NVIDIA “AS IS” AND “WITH ALL
+FAULTS.” TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND
+ITS AFFILIATES EXPRESSLY DISCLAIM ALL WARRANTIES OF ANY KIND
+OR NATURE, WHETHER EXPRESS, IMPLIED OR STATUTORY, INCLUDING,
+BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE, TITLE, NON-INFRINGEMENT, OR THE
+ABSENCE OF ANY DEFECTS THEREIN, WHETHER LATENT OR PATENT. NO
+WARRANTY IS MADE ON THE BASIS OF TRADE USAGE, COURSE OF
+DEALING OR COURSE OF TRADE.
+
+
+1.5. Limitation of Liability
+
+TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS
+AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL,
+PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, LOSS
+OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF
+PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION
+WITH THIS AGREEMENT OR THE USE OR PERFORMANCE OF THE SDK,
+WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH
+OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE),
+PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF
+LIABILITY. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES’
+TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS
+AGREEMENT EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE
+NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS
+LIMIT.
+
+These exclusions and limitations of liability shall apply
+regardless if NVIDIA or its affiliates have been advised of
+the possibility of such damages, and regardless of whether a
+remedy fails its essential purpose. These exclusions and
+limitations of liability form an essential basis of the
+bargain between the parties, and, absent any of these
+exclusions or limitations of liability, the provisions of this
+Agreement, including, without limitation, the economic terms,
+would be substantially different.
+
+
+1.6. Termination
+
+  1. This Agreement will continue to apply until terminated by
+    either you or NVIDIA as described below.
+
+  2. If you want to terminate this Agreement, you may do so by
+    ceasing use of the SDK.
+
+  3. NVIDIA may, at any time, terminate this Agreement if:
+
+      a. (i) you fail to comply with any term of this
+        Agreement and the non-compliance is not fixed within
+        thirty (30) days following notice from NVIDIA (or
+        immediately if you violate NVIDIA’s intellectual
+        property rights);
+
+      b. (ii) you commence or participate in any legal
+        proceeding against NVIDIA with respect to the SDK; or
+
+      c. (iii) NVIDIA decides to no longer provide the SDK in
+        a country or, in NVIDIA’s sole discretion, the
+        continued use of it is no longer commercially viable.
+
+  4. Upon any termination of this Agreement, you agree to
+    promptly discontinue use of the SDK and destroy all copies
+    in your possession or control. Your prior distributions in
+    accordance with this Agreement are not affected by the
+    termination of this Agreement. Upon written request, you
+    will certify in writing that you have complied with your
+    commitments under this section. Upon any termination of
+    this Agreement all provisions survive except for the
+    license grant provisions.
+
+
+1.7. General
+
+If you wish to assign this Agreement or your rights and
+obligations, including by merger, consolidation, dissolution
+or operation of law, contact NVIDIA to ask for permission. Any
+attempted assignment not approved by NVIDIA in writing shall
+be void and of no effect. NVIDIA may assign, delegate or
+transfer this Agreement and its rights and obligations, and if
+to a non-affiliate you will be notified.
+
+You agree to cooperate with NVIDIA and provide reasonably
+requested information to verify your compliance with this
+Agreement.
+
+This Agreement will be governed in all respects by the laws of
+the United States and of the State of Delaware as those laws
+are applied to contracts entered into and performed entirely
+within Delaware by Delaware residents, without regard to the
+conflicts of laws principles. The United Nations Convention on
+Contracts for the International Sale of Goods is specifically
+disclaimed. You agree to all terms of this Agreement in the
+English language.
+
+The state or federal courts residing in Santa Clara County,
+California shall have exclusive jurisdiction over any dispute
+or claim arising out of this Agreement. Notwithstanding this,
+you agree that NVIDIA shall still be allowed to apply for
+injunctive remedies or an equivalent type of urgent legal
+relief in any jurisdiction.
+
+If any court of competent jurisdiction determines that any
+provision of this Agreement is illegal, invalid or
+unenforceable, such provision will be construed as limited to
+the extent necessary to be consistent with and fully
+enforceable under the law and the remaining provisions will
+remain in full force and effect. Unless otherwise specified,
+remedies are cumulative.
+
+Each party acknowledges and agrees that the other is an
+independent contractor in the performance of this Agreement.
+
+The SDK has been developed entirely at private expense and is
+“commercial items” consisting of “commercial computer
+software” and “commercial computer software
+documentation” provided with RESTRICTED RIGHTS. Use,
+duplication or disclosure by the U.S. Government or a U.S.
+Government subcontractor is subject to the restrictions in
+this Agreement pursuant to DFARS 227.7202-3(a) or as set forth
+in subparagraphs (c)(1) and (2) of the Commercial Computer
+Software - Restricted Rights clause at FAR 52.227-19, as
+applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas
+Expressway, Santa Clara, CA 95051.
+
+The SDK is subject to United States export laws and
+regulations. You agree that you will not ship, transfer or
+export the SDK into any country, or use the SDK in any manner,
+prohibited by the United States Bureau of Industry and
+Security or economic sanctions regulations administered by the
+U.S. Department of Treasury’s Office of Foreign Assets
+Control (OFAC), or any applicable export laws, restrictions or
+regulations. These laws include restrictions on destinations,
+end users and end use. By accepting this Agreement, you
+confirm that you are not a resident or citizen of any country
+currently embargoed by the U.S. and that you are not otherwise
+prohibited from receiving the SDK.
+
+Any notice delivered by NVIDIA to you under this Agreement
+will be delivered via mail, email or fax. You agree that any
+notices that NVIDIA sends you electronically will satisfy any
+legal communication requirements. Please direct your legal
+notices or other correspondence to NVIDIA Corporation, 2788
+San Tomas Expressway, Santa Clara, California 95051, United
+States of America, Attention: Legal Department.
+
+This Agreement and any exhibits incorporated into this
+Agreement constitute the entire agreement of the parties with
+respect to the subject matter of this Agreement and supersede
+all prior negotiations or documentation exchanged between the
+parties relating to this SDK license. Any additional and/or
+conflicting terms on documents issued by you are null, void,
+and invalid. Any amendment or waiver under this Agreement
+shall be in writing and signed by representatives of both
+parties.
+
+
+2. CUDA Toolkit Supplement to Software License Agreement for
+NVIDIA Software Development Kits
+------------------------------------------------------------
+
+
+Release date: August 16, 2018
+-----------------------------
+
+The terms in this supplement govern your use of the NVIDIA
+CUDA Toolkit SDK under the terms of your license agreement
+(“Agreement”) as modified by this supplement. Capitalized
+terms used but not defined below have the meaning assigned to
+them in the Agreement.
+
+This supplement is an exhibit to the Agreement and is
+incorporated as an integral part of the Agreement. In the
+event of conflict between the terms in this supplement and the
+terms in the Agreement, the terms in this supplement govern.
+
+
+2.1. License Scope
+
+The SDK is licensed for you to develop applications only for
+use in systems with NVIDIA GPUs.
+
+
+2.2. Distribution
+
+The portions of the SDK that are distributable under the
+Agreement are listed in Attachment A.
+
+
+2.3. Operating Systems
+
+Those portions of the SDK designed exclusively for use on the
+Linux or FreeBSD operating systems, or other operating systems
+derived from the source code to these operating systems, may
+be copied and redistributed for use in accordance with this
+Agreement, provided that the object code files are not
+modified in any way (except for unzipping of compressed
+files).
+
+
+2.4. Audio and Video Encoders and Decoders
+
+You acknowledge and agree that it is your sole responsibility
+to obtain any additional third-party licenses required to
+make, have made, use, have used, sell, import, and offer for
+sale your products or services that include or incorporate any
+third-party software and content relating to audio and/or
+video encoders and decoders from, including but not limited
+to, Microsoft, Thomson, Fraunhofer IIS, Sisvel S.p.A.,
+MPEG-LA, and Coding Technologies. NVIDIA does not grant to you
+under this Agreement any necessary patent or other rights with
+respect to any audio and/or video encoders and decoders.
+
+
+2.5. Licensing
+
+If the distribution terms in this Agreement are not suitable
+for your organization, or for any questions regarding this
+Agreement, please contact NVIDIA at
+nvidia-compute-license-questions@nvidia.com.
+
+
+2.6. Attachment A
+
+The following portions of the SDK are distributable under the
+Agreement:
+
+Component
+
+CUDA Runtime
+
+Windows
+
+cudart.dll, cudart_static.lib, cudadevrt.lib
+
+Mac OSX
+
+libcudart.dylib, libcudart_static.a, libcudadevrt.a
+
+Linux
+
+libcudart.so, libcudart_static.a, libcudadevrt.a
+
+Android
+
+libcudart.so, libcudart_static.a, libcudadevrt.a
+
+Component
+
+CUDA FFT Library
+
+Windows
+
+cufft.dll, cufftw.dll, cufft.lib, cufftw.lib
+
+Mac OSX
+
+libcufft.dylib, libcufft_static.a, libcufftw.dylib,
+libcufftw_static.a
+
+Linux
+
+libcufft.so, libcufft_static.a, libcufftw.so,
+libcufftw_static.a
+
+Android
+
+libcufft.so, libcufft_static.a, libcufftw.so,
+libcufftw_static.a
+
+Component
+
+CUDA BLAS Library
+
+Windows
+
+cublas.dll, cublasLt.dll
+
+Mac OSX
+
+libcublas.dylib, libcublasLt.dylib, libcublas_static.a,
+libcublasLt_static.a
+
+Linux
+
+libcublas.so, libcublasLt.so, libcublas_static.a,
+libcublasLt_static.a
+
+Android
+
+libcublas.so, libcublasLt.so, libcublas_static.a,
+libcublasLt_static.a
+
+Component
+
+NVIDIA "Drop-in" BLAS Library
+
+Windows
+
+nvblas.dll
+
+Mac OSX
+
+libnvblas.dylib
+
+Linux
+
+libnvblas.so
+
+Component
+
+CUDA Sparse Matrix Library
+
+Windows
+
+cusparse.dll, cusparse.lib
+
+Mac OSX
+
+libcusparse.dylib, libcusparse_static.a
+
+Linux
+
+libcusparse.so, libcusparse_static.a
+
+Android
+
+libcusparse.so, libcusparse_static.a
+
+Component
+
+CUDA Linear Solver Library
+
+Windows
+
+cusolver.dll, cusolver.lib
+
+Mac OSX
+
+libcusolver.dylib, libcusolver_static.a
+
+Linux
+
+libcusolver.so, libcusolver_static.a
+
+Android
+
+libcusolver.so, libcusolver_static.a
+
+Component
+
+CUDA Random Number Generation Library
+
+Windows
+
+curand.dll, curand.lib
+
+Mac OSX
+
+libcurand.dylib, libcurand_static.a
+
+Linux
+
+libcurand.so, libcurand_static.a
+
+Android
+
+libcurand.so, libcurand_static.a
+
+Component
+
+CUDA Accelerated Graph Library
+
+Component
+
+NVIDIA Performance Primitives Library
+
+Windows
+
+nppc.dll, nppc.lib, nppial.dll, nppial.lib, nppicc.dll,
+nppicc.lib, nppicom.dll, nppicom.lib, nppidei.dll,
+nppidei.lib, nppif.dll, nppif.lib, nppig.dll, nppig.lib,
+nppim.dll, nppim.lib, nppist.dll, nppist.lib, nppisu.dll,
+nppisu.lib, nppitc.dll, nppitc.lib, npps.dll, npps.lib
+
+Mac OSX
+
+libnppc.dylib, libnppc_static.a, libnppial.dylib,
+libnppial_static.a, libnppicc.dylib, libnppicc_static.a,
+libnppicom.dylib, libnppicom_static.a, libnppidei.dylib,
+libnppidei_static.a, libnppif.dylib, libnppif_static.a,
+libnppig.dylib, libnppig_static.a, libnppim.dylib,
+libnppisu_static.a, libnppitc.dylib, libnppitc_static.a,
+libnpps.dylib, libnpps_static.a
+
+Linux
+
+libnppc.so, libnppc_static.a, libnppial.so,
+libnppial_static.a, libnppicc.so, libnppicc_static.a,
+libnppicom.so, libnppicom_static.a, libnppidei.so,
+libnppidei_static.a, libnppif.so, libnppif_static.a,
+libnppig.so, libnppig_static.a, libnppim.so,
+libnppim_static.a, libnppist.so, libnppist_static.a,
+libnppisu.so, libnppisu_static.a, libnppitc.so,
+libnppitc_static.a, libnpps.so, libnpps_static.a
+
+Android
+
+libnppc.so, libnppc_static.a, libnppial.so,
+libnppial_static.a, libnppicc.so, libnppicc_static.a,
+libnppicom.so, libnppicom_static.a, libnppidei.so,
+libnppidei_static.a, libnppif.so, libnppif_static.a,
+libnppig.so, libnppig_static.a, libnppim.so,
+libnppim_static.a, libnppist.so, libnppist_static.a,
+libnppisu.so, libnppisu_static.a, libnppitc.so,
+libnppitc_static.a, libnpps.so, libnpps_static.a
+
+Component
+
+NVIDIA JPEG Library
+
+Linux
+
+libnvjpeg.so, libnvjpeg_static.a
+
+Component
+
+Internal common library required for statically linking to
+cuBLAS, cuSPARSE, cuFFT, cuRAND, nvJPEG and NPP
+
+Mac OSX
+
+libculibos.a
+
+Linux
+
+libculibos.a
+
+Component
+
+NVIDIA Runtime Compilation Library and Header
+
+All
+
+nvrtc.h
+
+Windows
+
+nvrtc.dll, nvrtc-builtins.dll
+
+Mac OSX
+
+libnvrtc.dylib, libnvrtc-builtins.dylib
+
+Linux
+
+libnvrtc.so, libnvrtc-builtins.so
+
+Component
+
+NVIDIA Optimizing Compiler Library
+
+Windows
+
+nvvm.dll
+
+Mac OSX
+
+libnvvm.dylib
+
+Linux
+
+libnvvm.so
+
+Component
+
+NVIDIA Common Device Math Functions Library
+
+Windows
+
+libdevice.10.bc
+
+Mac OSX
+
+libdevice.10.bc
+
+Linux
+
+libdevice.10.bc
+
+Component
+
+CUDA Occupancy Calculation Header Library
+
+All
+
+cuda_occupancy.h
+
+Component
+
+CUDA Half Precision Headers
+
+All
+
+cuda_fp16.h, cuda_fp16.hpp
+
+Component
+
+CUDA Profiling Tools Interface (CUPTI) Library
+
+Windows
+
+cupti.dll
+
+Mac OSX
+
+libcupti.dylib
+
+Linux
+
+libcupti.so
+
+Component
+
+NVIDIA Tools Extension Library
+
+Windows
+
+nvToolsExt.dll, nvToolsExt.lib
+
+Mac OSX
+
+libnvToolsExt.dylib
+
+Linux
+
+libnvToolsExt.so
+
+Component
+
+NVIDIA CUDA Driver Libraries
+
+Linux
+
+libcuda.so, libnvidia-fatbinaryloader.so,
+libnvidia-ptxjitcompiler.so
+
+The NVIDIA CUDA Driver Libraries are only distributable in
+applications that meet these criteria:
+
+  1. The application was developed starting from a NVIDIA CUDA
+    container obtained from Docker Hub or the NVIDIA GPU
+    Cloud, and
+
+  2. The resulting application is packaged as a Docker
+    container and distributed to users on Docker Hub or the
+    NVIDIA GPU Cloud only.
+
+
+2.7. Attachment B
+
+
+Additional Licensing Obligations
+
+The following third party components included in the SOFTWARE
+are licensed to Licensee pursuant to the following terms and
+conditions:
+
+  1. Licensee's use of the GDB third party component is
+    subject to the terms and conditions of GNU GPL v3:
+
+    This product includes copyrighted third-party software licensed
+    under the terms of the GNU General Public License v3 ("GPL v3").
+    All third-party software packages are copyright by their respective
+    authors. GPL v3 terms and conditions are hereby incorporated into
+    the Agreement by this reference:     http://www.gnu.org/licenses/gpl.txt
+
+    Consistent with these licensing requirements, the software
+    listed below is provided under the terms of the specified
+    open source software licenses. To obtain source code for
+    software provided under licenses that require
+    redistribution of source code, including the GNU General
+    Public License (GPL) and GNU Lesser General Public License
+    (LGPL), contact oss-requests@nvidia.com. This offer is
+    valid for a period of three (3) years from the date of the
+    distribution of this product by NVIDIA CORPORATION.
+
+    Component          License
+    CUDA-GDB           GPL v3
+
+  2. Licensee represents and warrants that any and all third
+    party licensing and/or royalty payment obligations in
+    connection with Licensee's use of the H.264 video codecs
+    are solely the responsibility of Licensee.
+
+  3. Licensee's use of the Thrust library is subject to the
+    terms and conditions of the Apache License Version 2.0.
+    All third-party software packages are copyright by their
+    respective authors. Apache License Version 2.0 terms and
+    conditions are hereby incorporated into the Agreement by
+    this reference.
+    http://www.apache.org/licenses/LICENSE-2.0.html
+
+    In addition, Licensee acknowledges the following notice:
+    Thrust includes source code from the Boost Iterator,
+    Tuple, System, and Random Number libraries.
+
+    Boost Software License - Version 1.0 - August 17th, 2003
+    . . . .
+
+    Permission is hereby granted, free of charge, to any person or
+    organization obtaining a copy of the software and accompanying
+    documentation covered by this license (the "Software") to use,
+    reproduce, display, distribute, execute, and transmit the Software,
+    and to prepare derivative works of the Software, and to permit
+    third-parties to whom the Software is furnished to do so, all
+    subject to the following:
+
+    The copyright notices in the Software and this entire statement,
+    including the above license grant, this restriction and the following
+    disclaimer, must be included in all copies of the Software, in whole
+    or in part, and all derivative works of the Software, unless such
+    copies or derivative works are solely in the form of machine-executable
+    object code generated by a source language processor.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND
+    NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
+    ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR
+    OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING
+    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+    OTHER DEALINGS IN THE SOFTWARE.
+
+  4. Licensee's use of the LLVM third party component is
+    subject to the following terms and conditions:
+
+    ======================================================
+    LLVM Release License
+    ======================================================
+    University of Illinois/NCSA
+    Open Source License
+
+    Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign.
+    All rights reserved.
+
+    Developed by:
+
+        LLVM Team
+
+        University of Illinois at Urbana-Champaign
+
+        http://llvm.org
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to
+    deal with the Software without restriction, including without limitation the
+    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+    sell copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    *  Redistributions of source code must retain the above copyright notice,
+       this list of conditions and the following disclaimers.
+
+    *  Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimers in the
+       documentation and/or other materials provided with the distribution.
+
+    *  Neither the names of the LLVM Team, University of Illinois at Urbana-
+       Champaign, nor the names of its contributors may be used to endorse or
+       promote products derived from this Software without specific prior
+       written permission.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+    THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+    OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+    ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+    DEALINGS WITH THE SOFTWARE.
+
+  5. Licensee's use (e.g. nvprof) of the PCRE third party
+    component is subject to the following terms and
+    conditions:
+
+    ------------
+    PCRE LICENCE
+    ------------
+    PCRE is a library of functions to support regular expressions whose syntax
+    and semantics are as close as possible to those of the Perl 5 language.
+    Release 8 of PCRE is distributed under the terms of the "BSD" licence, as
+    specified below. The documentation for PCRE, supplied in the "doc"
+    directory, is distributed under the same terms as the software itself. The
+    basic library functions are written in C and are freestanding. Also
+    included in the distribution is a set of C++ wrapper functions, and a just-
+    in-time compiler that can be used to optimize pattern matching. These are
+    both optional features that can be omitted when the library is built.
+
+    THE BASIC LIBRARY FUNCTIONS
+    ---------------------------
+    Written by:       Philip Hazel
+    Email local part: ph10
+    Email domain:     cam.ac.uk
+    University of Cambridge Computing Service,
+    Cambridge, England.
+    Copyright (c) 1997-2012 University of Cambridge
+    All rights reserved.
+
+    PCRE JUST-IN-TIME COMPILATION SUPPORT
+    -------------------------------------
+    Written by:       Zoltan Herczeg
+    Email local part: hzmester
+    Email domain:     freemail.hu
+    Copyright(c) 2010-2012 Zoltan Herczeg
+    All rights reserved.
+
+    STACK-LESS JUST-IN-TIME COMPILER
+    --------------------------------
+    Written by:       Zoltan Herczeg
+    Email local part: hzmester
+    Email domain:     freemail.hu
+    Copyright(c) 2009-2012 Zoltan Herczeg
+    All rights reserved.
+
+    THE C++ WRAPPER FUNCTIONS
+    -------------------------
+    Contributed by:   Google Inc.
+    Copyright (c) 2007-2012, Google Inc.
+    All rights reserved.
+
+    THE "BSD" LICENCE
+    -----------------
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are met:
+
+      * Redistributions of source code must retain the above copyright notice,
+        this list of conditions and the following disclaimer.
+
+      * Redistributions in binary form must reproduce the above copyright
+        notice, this list of conditions and the following disclaimer in the
+        documentation and/or other materials provided with the distribution.
+
+      * Neither the name of the University of Cambridge nor the name of Google
+        Inc. nor the names of their contributors may be used to endorse or
+        promote products derived from this software without specific prior
+        written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+  6. Some of the cuBLAS library routines were written by or
+    derived from code written by Vasily Volkov and are subject
+    to the Modified Berkeley Software Distribution License as
+    follows:
+
+    Copyright (c) 2007-2009, Regents of the University of California
+
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+        * Redistributions of source code must retain the above copyright
+          notice, this list of conditions and the following disclaimer.
+        * Redistributions in binary form must reproduce the above
+          copyright notice, this list of conditions and the following
+          disclaimer in the documentation and/or other materials provided
+          with the distribution.
+        * Neither the name of the University of California, Berkeley nor
+          the names of its contributors may be used to endorse or promote
+          products derived from this software without specific prior
+          written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR
+    IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+    DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+    INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+    HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+    STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+    IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+  7. Some of the cuBLAS library routines were written by or
+    derived from code written by Davide Barbieri and are
+    subject to the Modified Berkeley Software Distribution
+    License as follows:
+
+    Copyright (c) 2008-2009 Davide Barbieri @ University of Rome Tor Vergata.
+
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+        * Redistributions of source code must retain the above copyright
+          notice, this list of conditions and the following disclaimer.
+        * Redistributions in binary form must reproduce the above
+          copyright notice, this list of conditions and the following
+          disclaimer in the documentation and/or other materials provided
+          with the distribution.
+        * The name of the author may not be used to endorse or promote
+          products derived from this software without specific prior
+          written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR
+    IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+    DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+    INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+    HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+    STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+    IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+  8. Some of the cuBLAS library routines were derived from
+    code developed by the University of Tennessee and are
+    subject to the Modified Berkeley Software Distribution
+    License as follows:
+
+    Copyright (c) 2010 The University of Tennessee.
+
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+        * Redistributions of source code must retain the above copyright
+          notice, this list of conditions and the following disclaimer.
+        * Redistributions in binary form must reproduce the above
+          copyright notice, this list of conditions and the following
+          disclaimer listed in this license in the documentation and/or
+          other materials provided with the distribution.
+        * Neither the name of the copyright holders nor the names of its
+          contributors may be used to endorse or promote products derived
+          from this software without specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  9. Some of the cuBLAS library routines were written by or
+    derived from code written by Jonathan Hogg and are subject
+    to the Modified Berkeley Software Distribution License as
+    follows:
+
+    Copyright (c) 2012, The Science and Technology Facilities Council (STFC).
+
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+        * Redistributions of source code must retain the above copyright
+          notice, this list of conditions and the following disclaimer.
+        * Redistributions in binary form must reproduce the above
+          copyright notice, this list of conditions and the following
+          disclaimer in the documentation and/or other materials provided
+          with the distribution.
+        * Neither the name of the STFC nor the names of its contributors
+          may be used to endorse or promote products derived from this
+          software without specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE STFC BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+    OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+    IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  10. Some of the cuBLAS library routines were written by or
+    derived from code written by Ahmad M. Abdelfattah, David
+    Keyes, and Hatem Ltaief, and are subject to the Apache
+    License, Version 2.0, as follows:
+
+     -- (C) Copyright 2013 King Abdullah University of Science and Technology
+      Authors:
+      Ahmad Abdelfattah (ahmad.ahmad@kaust.edu.sa)
+      David Keyes (david.keyes@kaust.edu.sa)
+      Hatem Ltaief (hatem.ltaief@kaust.edu.sa)
+
+      Redistribution  and  use  in  source and binary forms, with or without
+      modification,  are  permitted  provided  that the following conditions
+      are met:
+
+      * Redistributions  of  source  code  must  retain  the above copyright
+        notice,  this  list  of  conditions  and  the  following  disclaimer.
+      * Redistributions  in  binary  form must reproduce the above copyright
+        notice,  this list of conditions and the following disclaimer in the
+        documentation  and/or other materials provided with the distribution.
+      * Neither  the  name of the King Abdullah University of Science and
+        Technology nor the names of its contributors may be used to endorse
+        or promote products derived from this software without specific prior
+        written permission.
+
+      THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+      ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+      LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+      A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+      HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+      SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
+      LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+      DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+      THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+      (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+      OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  11. Some of the cuSPARSE library routines were written by or
+    derived from code written by Li-Wen Chang and are subject
+    to the NCSA Open Source License as follows:
+
+    Copyright (c) 2012, University of Illinois.
+
+    All rights reserved.
+
+    Developed by: IMPACT Group, University of Illinois, http://impact.crhc.illinois.edu
+
+    Permission is hereby granted, free of charge, to any person obtaining
+    a copy of this software and associated documentation files (the
+    "Software"), to deal with the Software without restriction, including
+    without limitation the rights to use, copy, modify, merge, publish,
+    distribute, sublicense, and/or sell copies of the Software, and to
+    permit persons to whom the Software is furnished to do so, subject to
+    the following conditions:
+        * Redistributions of source code must retain the above copyright
+          notice, this list of conditions and the following disclaimer.
+        * Redistributions in binary form must reproduce the above
+          copyright notice, this list of conditions and the following
+          disclaimers in the documentation and/or other materials provided
+          with the distribution.
+        * Neither the names of IMPACT Group, University of Illinois, nor
+          the names of its contributors may be used to endorse or promote
+          products derived from this Software without specific prior
+          written permission.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+    NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+    IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+    IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+    SOFTWARE.
+
+  12. Some of the cuRAND library routines were written by or
+    derived from code written by Mutsuo Saito and Makoto
+    Matsumoto and are subject to the following license:
+
+    Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
+    University. All rights reserved.
+
+    Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
+    University and University of Tokyo.  All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+        * Redistributions of source code must retain the above copyright
+          notice, this list of conditions and the following disclaimer.
+        * Redistributions in binary form must reproduce the above
+          copyright notice, this list of conditions and the following
+          disclaimer in the documentation and/or other materials provided
+          with the distribution.
+        * Neither the name of the Hiroshima University nor the names of
+          its contributors may be used to endorse or promote products
+          derived from this software without specific prior written
+          permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  13. Some of the cuRAND library routines were derived from
+    code developed by D. E. Shaw Research and are subject to
+    the following license:
+
+    Copyright 2010-2011, D. E. Shaw Research.
+
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+        * Redistributions of source code must retain the above copyright
+          notice, this list of conditions, and the following disclaimer.
+        * Redistributions in binary form must reproduce the above
+          copyright notice, this list of conditions, and the following
+          disclaimer in the documentation and/or other materials provided
+          with the distribution.
+        * Neither the name of D. E. Shaw Research nor the names of its
+          contributors may be used to endorse or promote products derived
+          from this software without specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  14. Some of the Math library routines were written by or
+    derived from code developed by Norbert Juffa and are
+    subject to the following license:
+
+    Copyright (c) 2015-2017, Norbert Juffa
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions
+    are met:
+
+    1. Redistributions of source code must retain the above copyright
+       notice, this list of conditions and the following disclaimer.
+
+    2. Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  15. Licensee's use of the lz4 third party component is
+    subject to the following terms and conditions:
+
+    Copyright (C) 2011-2013, Yann Collet.
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+
+        * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+        * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  16. The NPP library uses code from the Boost Math Toolkit,
+    and is subject to the following license:
+
+    Boost Software License - Version 1.0 - August 17th, 2003
+    . . . .
+
+    Permission is hereby granted, free of charge, to any person or
+    organization obtaining a copy of the software and accompanying
+    documentation covered by this license (the "Software") to use,
+    reproduce, display, distribute, execute, and transmit the Software,
+    and to prepare derivative works of the Software, and to permit
+    third-parties to whom the Software is furnished to do so, all
+    subject to the following:
+
+    The copyright notices in the Software and this entire statement,
+    including the above license grant, this restriction and the following
+    disclaimer, must be included in all copies of the Software, in whole
+    or in part, and all derivative works of the Software, unless such
+    copies or derivative works are solely in the form of machine-executable
+    object code generated by a source language processor.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND
+    NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
+    ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR
+    OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING
+    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+    OTHER DEALINGS IN THE SOFTWARE.
+
+  17. Portions of the Nsight Eclipse Edition is subject to the
+    following license:
+
+    The Eclipse Foundation makes available all content in this plug-in
+    ("Content"). Unless otherwise indicated below, the Content is provided
+    to you under the terms and conditions of the Eclipse Public License
+    Version 1.0 ("EPL"). A copy of the EPL is available at http://
+    www.eclipse.org/legal/epl-v10.html. For purposes of the EPL, "Program"
+    will mean the Content.
+
+    If you did not receive this Content directly from the Eclipse
+    Foundation, the Content is being redistributed by another party
+    ("Redistributor") and different terms and conditions may apply to your
+    use of any object code in the Content. Check the Redistributor's
+    license that was provided with the Content. If no such license exists,
+    contact the Redistributor. Unless otherwise indicated below, the terms
+    and conditions of the EPL still apply to any source code in the
+    Content and such source code may be obtained at http://www.eclipse.org.
+
+  18. Some of the cuBLAS library routines uses code from
+    OpenAI, which is subject to the following license:
+
+    License URL
+    https://github.com/openai/openai-gemm/blob/master/LICENSE
+
+    License Text
+    The MIT License
+
+    Copyright (c) 2016 OpenAI (http://openai.com), 2016 Google Inc.
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to deal
+    in the Software without restriction, including without limitation the rights
+    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+    copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be included in
+    all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+    THE SOFTWARE.
+
+  19. Licensee's use of the Visual Studio Setup Configuration
+    Samples is subject to the following license:
+
+    The MIT License (MIT)
+    Copyright (C) Microsoft Corporation. All rights reserved.
+
+    Permission is hereby granted, free of charge, to any person
+    obtaining a copy of this software and associated documentation
+    files (the "Software"), to deal in the Software without restriction,
+    including without limitation the rights to use, copy, modify, merge,
+    publish, distribute, sublicense, and/or sell copies of the Software,
+    and to permit persons to whom the Software is furnished to do so,
+    subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be included
+    in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+    OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+  20. Licensee's use of linmath.h header for CPU functions for
+    GL vector/matrix operations from lunarG is subject to the
+    Apache License Version 2.0.
+
+  21. The DX12-CUDA sample uses the d3dx12.h header, which is
+    subject to the MIT license .
+
+-----------------
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/datasets/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdd4ffebec4c57f6d399a0f76df2b66056f0b225
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/datasets/__init__.py
@@ -0,0 +1,90 @@
+"""
+================================
+Datasets (:mod:`scipy.datasets`)
+================================
+
+.. currentmodule:: scipy.datasets
+
+Dataset Methods
+===============
+
+.. autosummary::
+   :toctree: generated/
+
+   ascent
+   face
+   electrocardiogram
+
+Utility Methods
+===============
+
+.. autosummary::
+   :toctree: generated/
+
+   download_all    -- Download all the dataset files to specified path.
+   clear_cache     -- Clear cached dataset directory.
+
+
+Usage of Datasets
+=================
+
+SciPy dataset methods can be simply called as follows: ``'<dataset-name>()'``.
+This downloads the dataset files over the network once, and saves the cache,
+before returning a `numpy.ndarray` object representing the dataset.
+
+Note that the return data structure and data type might be different for
+different dataset methods. For a more detailed example on usage, please look
+into the particular dataset method documentation above.
+
+
+How dataset retrieval and storage works
+=======================================
+
+SciPy dataset files are stored within individual GitHub repositories under the
+SciPy GitHub organization, following a naming convention as
+``'dataset-<name>'``, for example `scipy.datasets.face` files live at
+https://github.com/scipy/dataset-face.  The `scipy.datasets` submodule utilizes
+and depends on `Pooch <https://www.fatiando.org/pooch/latest/>`_, a Python
+package built to simplify fetching data files. Pooch uses these repos to
+retrieve the respective dataset files when calling the dataset function.
+
+A registry of all the datasets, essentially a mapping of filenames to their
+SHA256 hashes and repo URLs, is maintained; Pooch uses it to handle and verify
+the downloads on function call. After downloading the dataset once, the files
+are saved in the system cache directory under ``'scipy-data'``.
+
+Dataset cache locations may vary on different platforms.
+
+For macOS::
+
+    '~/Library/Caches/scipy-data'
+
+For Linux and other Unix-like platforms::
+
+    '~/.cache/scipy-data'  # or the value of the XDG_CACHE_HOME env var, if defined
+
+For Windows::
+
+    'C:\\Users\\<user>\\AppData\\Local\\<AppAuthor>\\scipy-data\\Cache'
+
+
+In environments where network connectivity is constrained for security
+reasons, or on systems without a continuous internet connection, one may
+manually populate the dataset cache by placing the contents of the dataset
+repo in the above-mentioned cache directory; this avoids dataset fetching
+errors when there is no internet connectivity.
+
+"""
+
+
+from ._fetchers import face, ascent, electrocardiogram
+from ._download_all import download_all
+from ._utils import clear_cache
+
+__all__ = ['ascent', 'electrocardiogram', 'face',
+           'download_all', 'clear_cache']
+
+
+from scipy._lib._testutils import PytestTester
+test = PytestTester(__name__)
+del PytestTester
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/datasets/_download_all.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/datasets/_download_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..d410e968f4d4e25d788b93913c8901d1cd74b521
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/datasets/_download_all.py
@@ -0,0 +1,74 @@
+"""
+Platform independent script to download all the
+`scipy.datasets` module data files.
+This doesn't require a full scipy build.
+
+Run: python _download_all.py <download_dir>
+"""
+
+from scipy._lib._array_api import xp_capabilities
+
+import argparse
+try:
+    import pooch
+except ImportError:
+    pooch = None
+
+
+if __package__ is None or __package__ == '':
+    # Running as python script, use absolute import
+    import _registry  # type: ignore
+else:
+    # Running as python module, use relative import
+    from . import _registry
+
+
+@xp_capabilities(out_of_scope=True)
+def download_all(path=None):
+    """
+    Fetch every data file registered for the `scipy.datasets` module
+    and store it in a single directory.
+
+    Parameters
+    ----------
+    path : str, optional
+        Directory into which the dataset files are downloaded.
+        When None, the ``'scipy-data'`` cache directory reported by
+        ``pooch.os_cache`` is used.
+
+    Examples
+    --------
+    Download the datasets to the default cache location:
+
+    >>> from scipy import datasets
+    >>> datasets.download_all()
+
+    Download the datasets to the current directory:
+
+    >>> datasets.download_all(".")
+
+    """
+    if pooch is None:
+        raise ImportError("Missing optional dependency 'pooch' required "
+                          "for scipy.datasets module. Please use pip or "
+                          "conda to install 'pooch'.")
+    target_dir = pooch.os_cache('scipy-data') if path is None else path
+    # Custom User-Agent: see https://github.com/scipy/scipy/issues/21879
+    agent = pooch.HTTPDownloader(headers={"User-Agent": "SciPy"})
+    for fname, sha256 in _registry.registry.items():
+        pooch.retrieve(url=_registry.registry_urls[fname],
+                       known_hash=sha256, fname=fname,
+                       path=target_dir, downloader=agent)
+
+
+def main():
+    """CLI entry point: download all data files into an optional `path`."""
+    parser = argparse.ArgumentParser(description='Download SciPy data files.')
+    # None lets download_all() pick the cache dir or report missing pooch.
+    parser.add_argument("path", nargs='?', type=str, default=None,
+                        help="Directory path to download all the data files.")
+    download_all(parser.parse_args().path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/datasets/_fetchers.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/datasets/_fetchers.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6389a0ca5da32ce86b018ac7f92b1e49300645b
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/datasets/_fetchers.py
@@ -0,0 +1,229 @@
+import sys
+
+from numpy import array, frombuffer, load
+from ._registry import registry, registry_urls
+
+from scipy._lib._array_api import xp_capabilities
+
+try:  # pooch is optional; without it `pooch` and `data_fetcher` stay None
+    import pooch
+except ImportError:
+    pooch = None
+    data_fetcher = None
+else:
+    data_fetcher = pooch.create(
+        # Use the default cache folder for the operating system;
+        # `pooch.os_cache` selects an appropriate per-platform directory.
+        # NOTE(review): pooch may rely on platformdirs, not appdirs - confirm.
+        path=pooch.os_cache("scipy-data"),
+
+        # The remote data is on GitHub.
+        # base_url is a required param, even though we override this
+        # using individual urls in the registry.
+        base_url="https://github.com/scipy/",
+        registry=registry,
+        urls=registry_urls
+    )
+
+
+def fetch_data(dataset_name, data_fetcher=data_fetcher):
+    """Return the local path of `dataset_name`, downloading it if needed."""
+    if data_fetcher is None:
+        raise ImportError("Missing optional dependency 'pooch' required "
+                          "for scipy.datasets module. Please use pip or "
+                          "conda to install 'pooch'.")
+    # Identify as SciPy: https://github.com/scipy/scipy/issues/21879
+    user_agent = f"SciPy {sys.modules['scipy'].__version__}"
+    http = pooch.HTTPDownloader(headers={"User-Agent": user_agent})
+    # `fetch` hands back the full path to the downloaded data file.
+    return data_fetcher.fetch(dataset_name, downloader=http)
+
+
+@xp_capabilities(out_of_scope=True)
+def ascent():
+    """
+    Return a 512 x 512, 8-bit grayscale image that is convenient
+    for demos.
+
+    The image is derived from
+    https://pixnio.com/people/accent-to-the-top
+
+    Parameters
+    ----------
+    None
+
+    Returns
+    -------
+    ascent : ndarray
+       image array suitable for testing and demonstration
+
+    Examples
+    --------
+    >>> import scipy.datasets
+    >>> ascent = scipy.datasets.ascent()
+    >>> ascent.shape
+    (512, 512)
+    >>> ascent.max()
+    np.uint8(255)
+
+    >>> import matplotlib.pyplot as plt
+    >>> plt.gray()
+    >>> plt.imshow(ascent)
+    >>> plt.show()
+
+    """
+    import pickle
+
+    # fetch_data returns the path of the data file; the download only
+    # happens the first time this runs, afterwards Pooch finds the file
+    # in the local cache and doesn't repeat the download.
+    local_path = fetch_data("ascent.dat")
+    # The file is a pickle: load it, then wrap the result in an ndarray.
+    with open(local_path, 'rb') as stream:
+        image = pickle.load(stream)
+    return array(image)
+
+
+@xp_capabilities(out_of_scope=True)
+def electrocardiogram():
+    """
+    Return an electrocardiogram as an example of a 1-D signal.
+
+    The signal is a 5 minute long electrocardiogram (ECG), i.e. a medical
+    recording of the heart's electrical activity, sampled at 360 Hz.
+
+    Returns
+    -------
+    ecg : ndarray
+        The electrocardiogram in millivolt (mV) sampled at 360 Hz.
+
+    Notes
+    -----
+    The provided signal is an excerpt (19:35 to 24:35) from the `record 208`_
+    (lead MLII) provided by the MIT-BIH Arrhythmia Database [1]_ on
+    PhysioNet [2]_. The excerpt includes noise-induced artifacts, typical
+    heartbeats as well as pathological changes.
+
+    .. _record 208: https://physionet.org/physiobank/database/html/mitdbdir/records.htm#208
+
+    .. versionadded:: 1.1.0
+
+    References
+    ----------
+    .. [1] Moody GB, Mark RG. The impact of the MIT-BIH Arrhythmia Database.
+           IEEE Eng in Med and Biol 20(3):45-50 (May-June 2001).
+           (PMID: 11446209); :doi:`10.13026/C2F305`
+    .. [2] Goldberger AL, Amaral LAN, Glass L, Hausdorff JM, Ivanov PCh,
+           Mark RG, Mietus JE, Moody GB, Peng C-K, Stanley HE. PhysioBank,
+           PhysioToolkit, and PhysioNet: Components of a New Research Resource
+           for Complex Physiologic Signals. Circulation 101(23):e215-e220;
+           :doi:`10.1161/01.CIR.101.23.e215`
+
+    Examples
+    --------
+    >>> from scipy.datasets import electrocardiogram
+    >>> ecg = electrocardiogram()
+    >>> ecg
+    array([-0.245, -0.215, -0.185, ..., -0.405, -0.395, -0.385], shape=(108000,))
+    >>> ecg.shape, ecg.mean(), ecg.std()
+    ((108000,), -0.16510875, 0.5992473991177294)
+
+    As stated, the signal features several areas with a different morphology.
+    E.g., the first few seconds show the electrical activity of a heart in
+    normal sinus rhythm as seen below.
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> fs = 360
+    >>> time = np.arange(ecg.size) / fs
+    >>> plt.plot(time, ecg)
+    >>> plt.xlabel("time in s")
+    >>> plt.ylabel("ECG in mV")
+    >>> plt.xlim(9, 10.2)
+    >>> plt.ylim(-1, 1.5)
+    >>> plt.show()
+
+    After second 16, however, the first premature ventricular contractions,
+    also called extrasystoles, appear. These have a different morphology
+    compared to typical heartbeats. The difference can easily be observed
+    in the following plot.
+
+    >>> plt.plot(time, ecg)
+    >>> plt.xlabel("time in s")
+    >>> plt.ylabel("ECG in mV")
+    >>> plt.xlim(46.5, 50)
+    >>> plt.ylim(-2, 1.5)
+    >>> plt.show()
+
+    At several points large artifacts disturb the recording, e.g.:
+
+    >>> plt.plot(time, ecg)
+    >>> plt.xlabel("time in s")
+    >>> plt.ylabel("ECG in mV")
+    >>> plt.xlim(207, 215)
+    >>> plt.ylim(-2, 3.5)
+    >>> plt.show()
+
+    Finally, examining the power spectrum reveals that most of the biosignal is
+    made up of lower frequencies. At 60 Hz the noise induced by the mains
+    electricity can be clearly observed.
+
+    >>> from scipy.signal import welch
+    >>> f, Pxx = welch(ecg, fs=fs, nperseg=2048, scaling="spectrum")
+    >>> plt.semilogy(f, Pxx)
+    >>> plt.xlabel("Frequency in Hz")
+    >>> plt.ylabel("Power spectrum of the ECG in mV**2")
+    >>> plt.xlim(f[[0, -1]])
+    >>> plt.show()
+    """
+    adc_zero, adc_gain = 1024, 200.0
+    with load(fetch_data("ecg.dat")) as archive:
+        samples = archive["ecg"].astype(int)  # np.uint16 -> int
+    # Convert raw output of ADC to mV: (samples - adc_zero) / adc_gain
+    millivolts = (samples - adc_zero) / adc_gain
+    return millivolts
+
+
+@xp_capabilities(out_of_scope=True)
+def face(gray=False):
+    """
+    Return a 1024 x 768 color image of a raccoon face.
+
+    The image is derived from
+    https://pixnio.com/fauna-animals/raccoons/raccoon-procyon-lotor
+
+    Parameters
+    ----------
+    gray : bool, optional
+        If True, return an 8-bit grayscale image instead of the color image.
+
+    Returns
+    -------
+    face : ndarray
+        image of a raccoon face
+
+    Examples
+    --------
+    >>> import scipy.datasets
+    >>> face = scipy.datasets.face()
+    >>> face.shape
+    (768, 1024, 3)
+    >>> face.max()
+    np.uint8(255)
+
+    >>> import matplotlib.pyplot as plt
+    >>> plt.gray()
+    >>> plt.imshow(face)
+    >>> plt.show()
+
+    """
+    import bz2
+    with open(fetch_data("face.dat"), 'rb') as stream:
+        pixels = bz2.decompress(stream.read())
+    rgb = frombuffer(pixels, dtype='uint8').reshape((768, 1024, 3))
+    if gray is not True:
+        return rgb
+    # Weighted sum of the three color channels, cast back to uint8.
+    red, green, blue = rgb[:, :, 0], rgb[:, :, 1], rgb[:, :, 2]
+    luma = 0.21 * red + 0.71 * green + 0.07 * blue
+    return luma.astype('uint8')
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/datasets/_registry.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/datasets/_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..969384ad9843159e766100bfa9755aed8102dd09
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/datasets/_registry.py
@@ -0,0 +1,26 @@
+##########################################################################
+# This file serves as the dataset registry for SciPy Datasets SubModule.
+##########################################################################
+
+
+# Maps each data file name to its SHA256 hash; generate a hash with
+# openssl sha256 <filename>
+registry = {
+    "ascent.dat": "03ce124c1afc880f87b55f6b061110e2e1e939679184f5614e38dacc6c1957e2",
+    "ecg.dat": "f20ad3365fb9b7f845d0e5c48b6fe67081377ee466c3a220b7f69f35c8958baf",
+    "face.dat": "9d8b0b4d081313e2b485748c770472e5a95ed1738146883d84c7030493e82886"
+}
+# Maps each data file name to the URL it is downloaded from.
+registry_urls = {
+    "ascent.dat": "https://raw.githubusercontent.com/scipy/dataset-ascent/main/ascent.dat",
+    "ecg.dat": "https://raw.githubusercontent.com/scipy/dataset-ecg/main/ecg.dat",
+    "face.dat": "https://raw.githubusercontent.com/scipy/dataset-face/main/face.dat"
+}
+
+# Maps each dataset method name to the data files associated with it:
+# <method_name> : ["filename1", "filename2", ...]
+method_files_map = {
+    "ascent": ["ascent.dat"],
+    "electrocardiogram": ["ecg.dat"],
+    "face": ["face.dat"]
+}
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/datasets/_utils.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/datasets/_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a78206da9fae576c30cf456f6348708e7e869b1c
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/datasets/_utils.py
@@ -0,0 +1,84 @@
+import os
+import shutil
+from ._registry import method_files_map
+
+from scipy._lib._array_api import xp_capabilities
+
+try:
+    import platformdirs
+except ImportError:
+    platformdirs = None  # type: ignore[assignment]
+
+
+def _clear_cache(datasets, cache_dir=None, method_map=None):
+    # Remove cached dataset files, either all of them or per dataset method.
+    if method_map is None:
+        method_map = method_files_map  # SciPy Datasets method map
+    if cache_dir is None:
+        # Fall back to the default cache_dir path
+        if platformdirs is None:
+            # platformdirs is pooch dependency
+            raise ImportError("Missing optional dependency 'pooch' required "
+                              "for scipy.datasets module. Please use pip or "
+                              "conda to install 'pooch'.")
+        cache_dir = platformdirs.user_cache_dir("scipy-data")
+
+    if not os.path.exists(cache_dir):
+        print(f"Cache Directory {cache_dir} doesn't exist. Nothing to clear.")
+        return
+
+    if datasets is None:
+        # No selection given: wipe the whole cache directory.
+        print(f"Cleaning the cache directory {cache_dir}!")
+        shutil.rmtree(cache_dir)
+        return
+
+    if not isinstance(datasets, (list, tuple)):
+        # a single dataset method is treated as a one-element list
+        datasets = [datasets]
+    for method in datasets:
+        assert callable(method)
+        method_name = method.__name__  # Name of the dataset method
+        if method_name not in method_map:
+            raise ValueError(f"Dataset method {method_name} doesn't "
+                             "exist. Please check if the passed dataset "
+                             "is a subset of the following dataset "
+                             f"methods: {list(method_map.keys())}")
+
+        paths = [os.path.join(cache_dir, n) for n in method_map[method_name]]
+        for file_path in paths:
+            if not os.path.exists(file_path):
+                print(f"Path {file_path} doesn't exist. "
+                      "Nothing to clear.")
+                continue
+            print("Cleaning the file "
+                  f"{os.path.split(file_path)[1]} "
+                  f"for dataset {method_name}")
+            os.remove(file_path)
+
+
+@xp_capabilities(out_of_scope=True)
+def clear_cache(datasets=None):
+    """
+    Remove files from the scipy datasets cache directory.
+
+    When a scipy.datasets method, or a list/tuple of such methods,
+    is given, only the data files associated with the passed
+    dataset method callable(s) are removed.
+
+    By default, all the cached data files are removed.
+
+    Parameters
+    ----------
+    datasets : callable or list/tuple of callable or None
+
+    Examples
+    --------
+    >>> from scipy import datasets
+    >>> ascent_array = datasets.ascent()
+    >>> ascent_array.shape
+    (512, 512)
+    >>> datasets.clear_cache([datasets.ascent])
+    Cleaning the file ascent.dat for dataset ascent
+    """
+    _clear_cache(datasets, cache_dir=None, method_map=None)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/differentiate/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/differentiate/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3a7ccc4b33f27dbae7958641a89106cf9580326
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/differentiate/__init__.py
@@ -0,0 +1,27 @@
+"""
+==============================================================
+Finite Difference Differentiation (:mod:`scipy.differentiate`)
+==============================================================
+
+.. currentmodule:: scipy.differentiate
+
+SciPy ``differentiate`` provides functions for performing finite difference
+numerical differentiation of black-box functions.
+
+.. autosummary::
+   :toctree: generated/
+
+   derivative
+   jacobian
+   hessian
+
+"""
+
+
+from ._differentiate import *
+
+__all__ = ['derivative', 'jacobian', 'hessian']
+
+from scipy._lib._testutils import PytestTester
+test = PytestTester(__name__)
+del PytestTester
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/differentiate/_differentiate.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/differentiate/_differentiate.py
new file mode 100644
index 0000000000000000000000000000000000000000..70ea2b1696d73cfd9b8b5dd0a2c5a5cb9aba5f5e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/differentiate/_differentiate.py
@@ -0,0 +1,1140 @@
+# mypy: disable-error-code="attr-defined"
+import warnings
+import numpy as np
+import scipy._lib._elementwise_iterative_method as eim
+from scipy._lib._util import _RichResult
+from scipy._lib._array_api import array_namespace, xp_copy, xp_promote, xp_capabilities
+import scipy._lib.array_api_extra as xpx
+
+_EERRORINCREASE = -1  # used in derivative
+
+def _derivative_iv(f, x, args, tolerances, maxiter, order, initial_step,
+                   step_factor, step_direction, preserve_shape, callback):
+    # Input validation for `derivative`
+    xp = array_namespace(x)
+
+    if not callable(f):
+        raise ValueError('`f` must be callable.')
+
+    if not np.iterable(args):
+        args = (args,)
+
+    tolerances = {} if tolerances is None else tolerances
+    atol = tolerances.get('atol', None)
+    rtol = tolerances.get('rtol', None)
+
+    # tolerances are floats, not arrays; OK to use NumPy
+    message = 'Tolerances and step parameters must be non-negative scalars.'
+    tols = np.asarray([atol if atol is not None else 1,
+                       rtol if rtol is not None else 1,
+                       step_factor])
+    if (not np.issubdtype(tols.dtype, np.number) or np.any(tols < 0)
+            or np.any(np.isnan(tols)) or tols.shape != (3,)):
+        raise ValueError(message)
+    step_factor = float(tols[2])
+
+    maxiter_int = int(maxiter)
+    if maxiter != maxiter_int or maxiter <= 0:
+        raise ValueError('`maxiter` must be a positive integer.')
+
+    order_int = int(order)
+    if order_int != order or order <= 0:
+        raise ValueError('`order` must be a positive integer.')
+
+    step_direction = xp.asarray(step_direction)
+    initial_step = xp.asarray(initial_step)
+    temp = xp.broadcast_arrays(x, step_direction, initial_step)
+    x, step_direction, initial_step = temp
+
+    message = '`preserve_shape` must be True or False.'
+    if preserve_shape not in {True, False}:
+        raise ValueError(message)
+
+    if callback is not None and not callable(callback):
+        raise ValueError('`callback` must be callable.')
+
+    return (f, x, args, atol, rtol, maxiter_int, order_int, initial_step,
+            step_factor, step_direction, preserve_shape, callback)
+
+
+
+_array_api_strict_skip_reason = 'Array API does not support fancy indexing assignment.'
+_dask_reason = 'boolean indexing assignment'
+
+
+@xp_capabilities(skip_backends=[('array_api_strict', _array_api_strict_skip_reason),
+                                ('dask.array', _dask_reason)], jax_jit=False)
+def derivative(f, x, *, args=(), tolerances=None, maxiter=10,
+               order=8, initial_step=0.5, step_factor=2.0,
+               step_direction=0, preserve_shape=False, callback=None):
+    """Evaluate the derivative of an elementwise, real scalar function numerically.
+
+    For each element of the output of `f`, `derivative` approximates the first
+    derivative of `f` at the corresponding element of `x` using finite difference
+    differentiation.
+
+    This function works elementwise when `x`, `step_direction`, and `args` contain
+    (broadcastable) arrays.
+
+    Parameters
+    ----------
+    f : callable
+        The function whose derivative is desired. The signature must be::
+
+            f(xi: ndarray, *argsi) -> ndarray
+
+        where each element of ``xi`` is a finite real number and ``argsi`` is a tuple,
+        which may contain an arbitrary number of arrays that are broadcastable with
+        ``xi``. `f` must be an elementwise function: each scalar element ``f(xi)[j]``
+        must equal ``f(xi[j])`` for valid indices ``j``. It must not mutate the array
+        ``xi`` or the arrays in ``argsi``.
+    x : float array_like
+        Abscissae at which to evaluate the derivative. Must be broadcastable with
+        `args` and `step_direction`.
+    args : tuple of array_like, optional
+        Additional positional array arguments to be passed to `f`. Arrays
+        must be broadcastable with one another and with `x`.
+        If the callable for which the derivative is desired requires arguments that
+        are not broadcastable with `x`, wrap that callable with `f` such that `f`
+        accepts only `x` and broadcastable ``*args``.
+    tolerances : dictionary of floats, optional
+        Absolute and relative tolerances. Valid keys of the dictionary are:
+
+        - ``atol`` - absolute tolerance on the derivative
+        - ``rtol`` - relative tolerance on the derivative
+
+        Iteration will stop when ``res.error < atol + rtol * abs(res.df)``. The default
+        `atol` is the smallest normal number of the appropriate dtype, and
+        the default `rtol` is the square root of the precision of the
+        appropriate dtype.
+    order : int, default: 8
+        The (positive integer) order of the finite difference formula to be
+        used. Odd integers will be rounded up to the next even integer.
+    initial_step : float array_like, default: 0.5
+        The (absolute) initial step size for the finite difference derivative
+        approximation.
+    step_factor : float, default: 2.0
+        The factor by which the step size is *reduced* in each iteration; i.e.
+        the step size in iteration 1 is ``initial_step/step_factor``. If
+        ``step_factor < 1``, subsequent steps will be greater than the initial
+        step; this may be useful if steps smaller than some threshold are
+        undesirable (e.g. due to subtractive cancellation error).
+    maxiter : int, default: 10
+        The maximum number of iterations of the algorithm to perform. See
+        Notes.
+    step_direction : integer array_like
+        An array representing the direction of the finite difference steps (for
+        use when `x` lies near to the boundary of the domain of the function.)
+        Must be broadcastable with `x` and all `args`.
+        Where 0 (default), central differences are used; where negative (e.g.
+        -1), steps are non-positive; and where positive (e.g. 1), all steps are
+        non-negative.
+    preserve_shape : bool, default: False
+        In the following, "arguments of `f`" refers to the array ``xi`` and
+        any arrays within ``argsi``. Let ``shape`` be the broadcasted shape
+        of `x` and all elements of `args` (which is conceptually
+        distinct from ``xi`` and ``argsi`` passed into `f`).
+
+        - When ``preserve_shape=False`` (default), `f` must accept arguments
+          of *any* broadcastable shapes.
+
+        - When ``preserve_shape=True``, `f` must accept arguments of shape
+          ``shape`` *or* ``shape + (n,)``, where ``n`` is the number of
+          abscissae at which the function is being evaluated.
+
+        In either case, for each scalar element ``xi[j]`` within ``xi``, the array
+        returned by `f` must include the scalar ``f(xi[j])`` at the same index.
+        Consequently, the shape of the output is always the shape of the input
+        ``xi``.
+
+        See Examples.
+    callback : callable, optional
+        An optional user-supplied function to be called before the first
+        iteration and after each iteration.
+        Called as ``callback(res)``, where ``res`` is a ``_RichResult``
+        similar to that returned by `derivative` (but containing the current
+        iterate's values of all variables). If `callback` raises a
+        ``StopIteration``, the algorithm will terminate immediately and
+        `derivative` will return a result. `callback` must not mutate
+        `res` or its attributes.
+
+    Returns
+    -------
+    res : _RichResult
+        An object similar to an instance of `scipy.optimize.OptimizeResult` with the
+        following attributes. The descriptions are written as though the values will
+        be scalars; however, if `f` returns an array, the outputs will be
+        arrays of the same shape.
+
+        success : bool array
+            ``True`` where the algorithm terminated successfully (status ``0``);
+            ``False`` otherwise.
+        status : int array
+            An integer representing the exit status of the algorithm.
+
+            - ``0`` : The algorithm converged to the specified tolerances.
+            - ``-1`` : The error estimate increased, so iteration was terminated.
+            - ``-2`` : The maximum number of iterations was reached.
+            - ``-3`` : A non-finite value was encountered.
+            - ``-4`` : Iteration was terminated by `callback`.
+            - ``1`` : The algorithm is proceeding normally (in `callback` only).
+
+        df : float array
+            The derivative of `f` at `x`, if the algorithm terminated
+            successfully.
+        error : float array
+            An estimate of the error: the magnitude of the difference between
+            the current estimate of the derivative and the estimate in the
+            previous iteration.
+        nit : int array
+            The number of iterations of the algorithm that were performed.
+        nfev : int array
+            The number of points at which `f` was evaluated.
+        x : float array
+            The value at which the derivative of `f` was evaluated
+            (after broadcasting with `args` and `step_direction`).
+
+    See Also
+    --------
+    jacobian, hessian
+
+    Notes
+    -----
+    The implementation was inspired by jacobi [1]_, numdifftools [2]_, and
+    DERIVEST [3]_, but the implementation follows the theory of Taylor series
+    more straightforwardly (and arguably naively so).
+    In the first iteration, the derivative is estimated using a finite
+    difference formula of order `order` with maximum step size `initial_step`.
+    Each subsequent iteration, the maximum step size is reduced by
+    `step_factor`, and the derivative is estimated again until a termination
+    condition is reached. The error estimate is the magnitude of the difference
+    between the current derivative approximation and that of the previous
+    iteration.
+
+    The stencils of the finite difference formulae are designed such that
+    abscissae are "nested": after `f` is evaluated at ``order + 1``
+    points in the first iteration, `f` is evaluated at only two new points
+    in each subsequent iteration; ``order - 1`` previously evaluated function
+    values required by the finite difference formula are reused, and two
+    function values (evaluations at the points furthest from `x`) are unused.
+
+    Step sizes are absolute. When the step size is small relative to the
+    magnitude of `x`, precision is lost; for example, if `x` is ``1e20``, the
+    default initial step size of ``0.5`` cannot be resolved. Accordingly,
+    consider using larger initial step sizes for large magnitudes of `x`.
+
+    The default tolerances are challenging to satisfy at points where the
+    true derivative is exactly zero. If the derivative may be exactly zero,
+    consider specifying an absolute tolerance (e.g. ``atol=1e-12``) to
+    improve convergence.
+
+    References
+    ----------
+    .. [1] Hans Dembinski (@HDembinski). jacobi.
+           https://github.com/HDembinski/jacobi
+    .. [2] Per A. Brodtkorb and John D'Errico. numdifftools.
+           https://numdifftools.readthedocs.io/en/latest/
+    .. [3] John D'Errico. DERIVEST: Adaptive Robust Numerical Differentiation.
+           https://www.mathworks.com/matlabcentral/fileexchange/13490-adaptive-robust-numerical-differentiation
+    .. [4] Numerical Differentiation. Wikipedia.
+           https://en.wikipedia.org/wiki/Numerical_differentiation
+
+    Examples
+    --------
+    Evaluate the derivative of ``np.exp`` at several points ``x``.
+
+    >>> import numpy as np
+    >>> from scipy.differentiate import derivative
+    >>> f = np.exp
+    >>> df = np.exp  # true derivative
+    >>> x = np.linspace(1, 2, 5)
+    >>> res = derivative(f, x)
+    >>> res.df  # approximation of the derivative
+    array([2.71828183, 3.49034296, 4.48168907, 5.75460268, 7.3890561 ])
+    >>> res.error  # estimate of the error
+    array([7.13740178e-12, 9.16600129e-12, 1.17594823e-11, 1.51061386e-11,
+           1.94262384e-11])
+    >>> abs(res.df - df(x))  # true error
+    array([2.53130850e-14, 3.55271368e-14, 5.77315973e-14, 5.59552404e-14,
+           6.92779167e-14])
+
+    Show the convergence of the approximation as the step size is reduced.
+    Each iteration, the step size is reduced by `step_factor`, so for
+    sufficiently small initial step, each iteration reduces the error by a
+    factor of ``1/step_factor**order`` until finite precision arithmetic
+    inhibits further improvement.
+
+    >>> import matplotlib.pyplot as plt
+    >>> iter = list(range(1, 12))  # maximum iterations
+    >>> hfac = 2  # step size reduction per iteration
+    >>> hdir = [-1, 0, 1]  # compare left-, central-, and right- steps
+    >>> order = 4  # order of differentiation formula
+    >>> x = 1
+    >>> ref = df(x)
+    >>> errors = []  # true error
+    >>> for i in iter:
+    ...     res = derivative(f, x, maxiter=i, step_factor=hfac,
+    ...                      step_direction=hdir, order=order,
+    ...                      # prevent early termination
+    ...                      tolerances=dict(atol=0, rtol=0))
+    ...     errors.append(abs(res.df - ref))
+    >>> errors = np.array(errors)
+    >>> plt.semilogy(iter, errors[:, 0], label='left differences')
+    >>> plt.semilogy(iter, errors[:, 1], label='central differences')
+    >>> plt.semilogy(iter, errors[:, 2], label='right differences')
+    >>> plt.xlabel('iteration')
+    >>> plt.ylabel('error')
+    >>> plt.legend()
+    >>> plt.show()
+    >>> (errors[1, 1] / errors[0, 1], 1 / hfac**order)
+    (0.06215223140159822, 0.0625)
+
+    The implementation is vectorized over `x`, `step_direction`, and `args`.
+    The function is evaluated once before the first iteration to perform input
+    validation and standardization, and once per iteration thereafter.
+
+    >>> def f(x, p):
+    ...     f.nit += 1
+    ...     return x**p
+    >>> f.nit = 0
+    >>> def df(x, p):
+    ...     return p*x**(p-1)
+    >>> x = np.arange(1, 5)
+    >>> p = np.arange(1, 6).reshape((-1, 1))
+    >>> hdir = np.arange(-1, 2).reshape((-1, 1, 1))
+    >>> res = derivative(f, x, args=(p,), step_direction=hdir, maxiter=1)
+    >>> np.allclose(res.df, df(x, p))
+    True
+    >>> res.df.shape
+    (3, 5, 4)
+    >>> f.nit
+    2
+
+    By default, `preserve_shape` is False, and therefore the callable
+    `f` may be called with arrays of any broadcastable shapes.
+    For example:
+
+    >>> shapes = []
+    >>> def f(x, c):
+    ...    shape = np.broadcast_shapes(x.shape, c.shape)
+    ...    shapes.append(shape)
+    ...    return np.sin(c*x)
+    >>>
+    >>> c = [1, 5, 10, 20]
+    >>> res = derivative(f, 0, args=(c,))
+    >>> shapes
+    [(4,), (4, 8), (4, 2), (3, 2), (2, 2), (1, 2)]
+
+    To understand where these shapes are coming from - and to better
+    understand how `derivative` computes accurate results - note that
+    higher values of ``c`` correspond with higher frequency sinusoids.
+    The higher frequency sinusoids make the function's derivative change
+    faster, so more function evaluations are required to achieve the target
+    accuracy:
+
+    >>> res.nfev
+    array([11, 13, 15, 17], dtype=int32)
+
+    The initial ``shape``, ``(4,)``, corresponds with evaluating the
+    function at a single abscissa and all four frequencies; this is used
+    for input validation and to determine the size and dtype of the arrays
+    that store results. The next shape corresponds with evaluating the
+    function at an initial grid of abscissae and all four frequencies.
+    Successive calls to the function evaluate the function at two more
+    abscissae, increasing the effective order of the approximation by two.
+    However, in later function evaluations, the function is evaluated at
+    fewer frequencies because the corresponding derivative has already
+    converged to the required tolerance. This saves function evaluations to
+    improve performance, but it requires the function to accept arguments of
+    any shape.
+
+    "Vector-valued" functions are unlikely to satisfy this requirement.
+    For example, consider
+
+    >>> def f(x):
+    ...    return [x, np.sin(3*x), x+np.sin(10*x), np.sin(20*x)*(x-1)**2]
+
+    This function is not compatible with `derivative` as written; for instance,
+    the shape of the output will not be the same as the shape of ``x``. Such a
+    function *could* be converted to a compatible form with the introduction of
+    additional parameters, but this would be inconvenient. In such cases,
+    a simpler solution would be to use `preserve_shape`.
+
+    >>> shapes = []
+    >>> def f(x):
+    ...     shapes.append(x.shape)
+    ...     x0, x1, x2, x3 = x
+    ...     return [x0, np.sin(3*x1), x2+np.sin(10*x2), np.sin(20*x3)*(x3-1)**2]
+    >>>
+    >>> x = np.zeros(4)
+    >>> res = derivative(f, x, preserve_shape=True)
+    >>> shapes
+    [(4,), (4, 8), (4, 2), (4, 2), (4, 2), (4, 2)]
+
+    Here, the shape of ``x`` is ``(4,)``. With ``preserve_shape=True``, the
+    function may be called with argument ``x`` of shape ``(4,)`` or ``(4, n)``,
+    and this is what we observe.
+
+    """
+    # TODO (followup):
+    #  - investigate behavior at saddle points
+    #  - multivariate functions?
+    #  - relative steps?
+    #  - show example of `np.vectorize`
+
+    res = _derivative_iv(f, x, args, tolerances, maxiter, order, initial_step,
+                            step_factor, step_direction, preserve_shape, callback)
+    (func, x, args, atol, rtol, maxiter, order,
+     h0, fac, hdir, preserve_shape, callback) = res
+
+    # Initialization
+    # Since f(x) (no step) is not needed for central differences, it may be
+    # possible to eliminate this function evaluation. However, it's useful for
+    # input validation and standardization, and everything else is designed to
+    # reduce function calls, so let's keep it simple.
+    temp = eim._initialize(func, (x,), args, preserve_shape=preserve_shape)
+    func, xs, fs, args, shape, dtype, xp = temp
+
+    finfo = xp.finfo(dtype)
+    atol = finfo.smallest_normal if atol is None else atol
+    rtol = finfo.eps**0.5 if rtol is None else rtol  # keep same as `hessian`
+
+    x, f = xs[0], fs[0]
+    df = xp.full_like(f, xp.nan)
+
+    # Ideally we'd broadcast the shape of `hdir` in `eim._initialize`, but
+    # it's simpler to do it here than to generalize `eim._initialize` further.
+    # `hdir` and `x` are already broadcasted in `_derivative_iv`, so we know
+    # that `hdir` can be broadcasted to the final shape. Same with `h0`.
+    hdir = xp.broadcast_to(hdir, shape)
+    hdir = xp.reshape(hdir, (-1,))
+    hdir = xp.astype(xp.sign(hdir), dtype)
+    h0 = xp.broadcast_to(h0, shape)
+    h0 = xp.reshape(h0, (-1,))
+    h0 = xp.astype(h0, dtype)
+    h0 = xpx.at(h0)[h0 <= 0].set(xp.nan)
+
+    status = xp.full_like(x, eim._EINPROGRESS, dtype=xp.int32)  # in progress
+    nit, nfev = 0, 1  # one function evaluations performed above
+    # Boolean indices of left, central, right, and (all) one-sided steps
+    il = hdir < 0
+    ic = hdir == 0
+    ir = hdir > 0
+    io = il | ir
+
+    # Most of these attributes are reasonably obvious, but:
+    # - `fs` holds all the function values of all active `x`. The zeroth
+    #   axis corresponds with active points `x`, the first axis corresponds
+    #   with the different steps (in the order described in
+    #   `_derivative_weights`).
+    # - `terms` (which could probably use a better name) is half the `order`,
+    #   which is always even.
+    work = _RichResult(x=x, df=df, fs=f[:, xp.newaxis], error=xp.nan, h=h0,
+                       df_last=xp.nan, error_last=xp.nan, fac=fac,
+                       atol=atol, rtol=rtol, nit=nit, nfev=nfev,
+                       status=status, dtype=dtype, terms=(order+1)//2,
+                       hdir=hdir, il=il, ic=ic, ir=ir, io=io,
+                       # Store the weights in an object so they can't get compressed
+                       # Using RichResult to allow dot notation, but a dict would work
+                       diff_state=_RichResult(central=[], right=[], fac=None))
+
+    # This is the correspondence between terms in the `work` object and the
+    # final result. In this case, the mapping is trivial. Note that `success`
+    # is prepended automatically.
+    res_work_pairs = [('status', 'status'), ('df', 'df'), ('error', 'error'),
+                      ('nit', 'nit'), ('nfev', 'nfev'), ('x', 'x')]
+
+    def pre_func_eval(work):
+        """Determine the abscissae at which the function needs to be evaluated.
+
+        See `_derivative_weights` for a description of the stencil (pattern
+        of the abscissae).
+
+        In the first iteration, there is only one stored function value in
+        `work.fs`, `f(x)`, so we need to evaluate at `order` new points. In
+        subsequent iterations, we evaluate at two new points. Note that
+        `work.x` is always flattened into a 1D array after broadcasting with
+        all `args`, so we add a new axis at the end and evaluate all points
+        in one call to the function.
+
+        For improvement:
+        - Consider measuring the step size actually taken, since ``(x + h) - x``
+          is not identically equal to `h` with floating point arithmetic.
+        - Adjust the step size automatically if `x` is too big to resolve the
+          step.
+        - We could probably save some work if there are no central difference
+          steps or no one-sided steps.
+        """
+        n = work.terms  # half the order
+        h = work.h[:, xp.newaxis]  # step size
+        c = work.fac  # step reduction factor
+        d = c**0.5  # square root of step reduction factor (one-sided stencil)
+        # Note - no need to be careful about dtypes until we allocate `x_eval`
+
+        if work.nit == 0:
+            hc = h / c**xp.arange(n, dtype=work.dtype)
+            hc = xp.concat((-xp.flip(hc, axis=-1), hc), axis=-1)
+        else:
+            hc = xp.concat((-h, h), axis=-1) / c**(n-1)
+
+        if work.nit == 0:
+            hr = h / d**xp.arange(2*n, dtype=work.dtype)
+        else:
+            hr = xp.concat((h, h/d), axis=-1) / c**(n-1)
+
+        n_new = 2*n if work.nit == 0 else 2  # number of new abscissae
+        x_eval = xp.zeros((work.hdir.shape[0], n_new), dtype=work.dtype)
+        il, ic, ir = work.il, work.ic, work.ir
+        x_eval = xpx.at(x_eval)[ir].set(work.x[ir][:, xp.newaxis] + hr[ir])
+        x_eval = xpx.at(x_eval)[ic].set(work.x[ic][:, xp.newaxis] + hc[ic])
+        x_eval = xpx.at(x_eval)[il].set(work.x[il][:, xp.newaxis] - hr[il])
+        return x_eval
+
+    def post_func_eval(x, f, work):
+        """ Estimate the derivative and error from the function evaluations
+
+        As in `pre_func_eval`: in the first iteration, there is only one stored
+        function value in `work.fs`, `f(x)`, so we need to add the `order` new
+        points. In subsequent iterations, we add two new points. The tricky
+        part is getting the order to match that of the weights, which is
+        described in `_derivative_weights`.
+
+        For improvement:
+        - Change the order of the weights (and steps in `pre_func_eval`) to
+          simplify `work_fc` concatenation and eliminate `fc` concatenation.
+        - It would be simple to do one-step Richardson extrapolation with `df`
+          and `df_last` to increase the order of the estimate and/or improve
+          the error estimate.
+        - Process the function evaluations in a more numerically favorable
+          way. For instance, combining the pairs of central difference evals
+          into a second-order approximation and using Richardson extrapolation
+          to produce a higher order approximation seemed to retain accuracy up
+          to very high order.
+        - Alternatively, we could use `polyfit` like Jacobi. An advantage of
+          fitting a polynomial to more points than necessary is improved noise
+          tolerance.
+        """
+        n = work.terms
+        n_new = n if work.nit == 0 else 1
+        il, ic, io = work.il, work.ic, work.io
+
+        # Central difference
+        # `work_fc` is *all* the points at which the function has been evaluated
+        # `fc` is the points we're using *this iteration* to produce the estimate
+        work_fc = (f[ic][:, :n_new], work.fs[ic], f[ic][:, -n_new:])
+        work_fc = xp.concat(work_fc, axis=-1)
+        if work.nit == 0:
+            fc = work_fc
+        else:
+            fc = (work_fc[:, :n], work_fc[:, n:n+1], work_fc[:, -n:])
+            fc = xp.concat(fc, axis=-1)
+
+        # One-sided difference
+        work_fo = xp.concat((work.fs[io], f[io]), axis=-1)
+        if work.nit == 0:
+            fo = work_fo
+        else:
+            fo = xp.concat((work_fo[:, 0:1], work_fo[:, -2*n:]), axis=-1)
+
+        work.fs = xp.zeros((ic.shape[0], work.fs.shape[-1] + 2*n_new), dtype=work.dtype)
+        work.fs = xpx.at(work.fs)[ic].set(work_fc)
+        work.fs = xpx.at(work.fs)[io].set(work_fo)
+
+        wc, wo = _derivative_weights(work, n, xp)
+        work.df_last = xp.asarray(work.df, copy=True)
+        work.df = xpx.at(work.df)[ic].set(fc @ wc / work.h[ic])
+        work.df = xpx.at(work.df)[io].set(fo @ wo / work.h[io])
+        work.df = xpx.at(work.df)[il].multiply(-1)
+
+        work.h /= work.fac
+        work.error_last = work.error
+        # Simple error estimate - the difference in derivative estimates between
+        # this iteration and the last. This is typically conservative because if
+        # convergence has begun, the true error is much closer to the difference
+        # between the current estimate and the *next* error estimate. However,
+        # we could use Richardson extrapolation to produce an error estimate that
+        # is one order higher, and take the difference between that and
+        # `work.df` (which would just be a constant factor that depends on `fac`).
+        work.error = xp.abs(work.df - work.df_last)
+
+    def check_termination(work):
+        """Terminate due to convergence, non-finite values, or error increase"""
+        stop = xp.astype(xp.zeros_like(work.df), xp.bool)
+
+        i = work.error < work.atol + work.rtol*abs(work.df)
+        work.status = xpx.at(work.status)[i].set(eim._ECONVERGED)
+        stop = xpx.at(stop)[i].set(True)
+
+        if work.nit > 0:
+            i = ~((xp.isfinite(work.x) & xp.isfinite(work.df)) | stop)
+            work.df = xpx.at(work.df)[i].set(xp.nan)
+            work.status = xpx.at(work.status)[i].set(eim._EVALUEERR)
+            stop = xpx.at(stop)[i].set(True)
+
+        # With infinite precision, there is a step size below which
+        # all smaller step sizes will reduce the error. But in floating point
+        # arithmetic, catastrophic cancellation will begin to cause the error
+        # to increase again. This heuristic tries to avoid step sizes that are
+        # too small. There may be more theoretically sound approaches for
+        # detecting a step size that minimizes the total error, but this
+        # heuristic seems simple and effective.
+        i = (work.error > work.error_last*10) & ~stop
+        work.status = xpx.at(work.status)[i].set(_EERRORINCREASE)
+        stop = xpx.at(stop)[i].set(True)
+
+        return stop
+
+    def post_termination_check(work):
+        return
+
+    def customize_result(res, shape):
+        return shape
+
+    return eim._loop(work, callback, shape, maxiter, func, args, dtype,
+                     pre_func_eval, post_func_eval, check_termination,
+                     post_termination_check, customize_result, res_work_pairs,
+                     xp, preserve_shape)
+
+
+def _derivative_weights(work, n, xp):
+    # This produces the weights of the finite difference formula for a given
+    # stencil. In experiments, use of a second-order central difference formula
+    # with Richardson extrapolation was more accurate numerically, but it was
+    # more complicated, and it would have become even more complicated when
+    # adding support for one-sided differences. However, now that all the
+    # function evaluation values are stored, they can be processed in whatever
+    # way is desired to produce the derivative estimate. We leave alternative
+    # approaches to future work. To be more self-contained, here is the theory
+    # for deriving the weights below.
+    #
+    # Recall that the Taylor expansion of a univariate, scalar-valued function
+    # about a point `x` may be expressed as:
+    #      f(x + h)  =     f(x) + f'(x)*h + f''(x)/2!*h**2  + O(h**3)
+    # Suppose we evaluate f(x), f(x+h), and f(x-h).  We have:
+    #      f(x)      =     f(x)
+    #      f(x + h)  =     f(x) + f'(x)*h + f''(x)/2!*h**2  + O(h**3)
+    #      f(x - h)  =     f(x) - f'(x)*h + f''(x)/2!*h**2  + O(h**3)
+    # We can solve for weights `wi` such that:
+    #   w1*f(x)      = w1*(f(x))
+    # + w2*f(x + h)  = w2*(f(x) + f'(x)*h + f''(x)/2!*h**2) + O(h**3)
+    # + w3*f(x - h)  = w3*(f(x) - f'(x)*h + f''(x)/2!*h**2) + O(h**3)
+    #                =     0    + f'(x)*h + 0               + O(h**3)
+    # Then
+    #     f'(x) ~ (w1*f(x) + w2*f(x+h) + w3*f(x-h))/h
+    # is a finite difference derivative approximation with error O(h**2),
+    # and so it is said to be a "second-order" approximation. Under certain
+    # conditions (e.g. well-behaved function, `h` sufficiently small), the
+    # error in the approximation will decrease with h**2; that is, if `h` is
+    # reduced by a factor of 2, the error is reduced by a factor of 4.
+    #
+    # By default, we use eighth-order formulae. Our central-difference formula
+    # uses abscissae:
+    #   x-h/c**3, x-h/c**2, x-h/c, x-h, x, x+h, x+h/c, x+h/c**2, x+h/c**3
+    # where `c` is the step factor. (Typically, the step factor is greater than
+    # one, so the outermost points - as written above - are actually closest to
+    # `x`.) This "stencil" is chosen so that each iteration, the step can be
+    # reduced by the factor `c`, and most of the function evaluations can be
+    # reused with the new step size. For example, in the next iteration, we
+    # will have:
+    #   x-h/c**4, x-h/c**3, x-h/c**2, x-h/c, x, x+h/c, x+h/c**2, x+h/c**3, x+h/c**4
+    # We do not reuse `x-h` and `x+h` for the new derivative estimate.
+    # While this would increase the order of the formula and thus the
+    # theoretical convergence rate, it is also less stable numerically.
+    # (As noted above, there are other ways of processing the values that are
+    # more stable. Thus, even now we store `f(x-h)` and `f(x+h)` in `work.fs`
+    # to simplify future development of this sort of improvement.)
+    #
+    # The (right) one-sided formula is produced similarly using abscissae
+    #   x, x+h, x+h/d, x+h/d**2, ..., x+h/d**6, x+h/d**7
+    # where `d` is the square root of `c`. (The left one-sided formula simply
+    # uses -h.) When the step size is reduced by factor `c = d**2`, we have
+    # abscissae:
+    #   x, x+h/d**2, x+h/d**3, ..., x+h/d**8, x+h/d**9
+    # `d` is chosen as the square root of `c` so that the rate of the step-size
+    # reduction is the same per iteration as in the central difference case.
+    # Note that because the central difference formulas are inherently of even
+    # order, for simplicity, we use only even-order formulas for one-sided
+    # differences, too.
+
+    # It's possible for the user to specify `fac` in, say, double precision but
+    # `x` and `args` in single precision. `fac` gets converted to single
+    # precision, but we should always use double precision for the intermediate
+    # calculations here to avoid additional error in the weights.
+    fac = float(work.fac)
+
+    # Note that if the user switches back to floating point precision with
+    # `x` and `args`, then `fac` will not necessarily equal the (lower
+    # precision) cached `_derivative_weights.fac`, and the weights will
+    # need to be recalculated. This could be fixed, but it's late, and of
+    # low consequence.
+
+    diff_state = work.diff_state
+    if fac != diff_state.fac:
+        diff_state.central = []
+        diff_state.right = []
+        diff_state.fac = fac
+
+    if len(diff_state.central) != 2*n + 1:
+        # Central difference weights. Consider refactoring this; it could
+        # probably be more compact.
+        # Note: Using NumPy here is OK; we convert to xp-type at the end
+        i = np.arange(-n, n + 1)
+        p = np.abs(i) - 1.  # center point has power `p` -1, but sign `s` is 0
+        s = np.sign(i)
+
+        h = s / fac ** p
+        A = np.vander(h, increasing=True).T
+        b = np.zeros(2*n + 1)
+        b[1] = 1
+        weights = np.linalg.solve(A, b)
+
+        # Enforce identities to improve accuracy
+        weights[n] = 0
+        for i in range(n):
+            weights[-i-1] = -weights[i]
+
+        # Cache the weights. We only need to calculate them once unless
+        # the step factor changes.
+        diff_state.central = weights
+
+        # One-sided difference weights. The left one-sided weights (with
+        # negative steps) are simply the negative of the right one-sided
+        # weights, so no need to compute them separately.
+        i = np.arange(2*n + 1)
+        p = i - 1.
+        s = np.sign(i)
+
+        h = s / np.sqrt(fac) ** p
+        A = np.vander(h, increasing=True).T
+        b = np.zeros(2 * n + 1)
+        b[1] = 1
+        weights = np.linalg.solve(A, b)
+
+        diff_state.right = weights
+
+    return (xp.asarray(diff_state.central, dtype=work.dtype),
+            xp.asarray(diff_state.right, dtype=work.dtype))
+
+
+@xp_capabilities(skip_backends=[('array_api_strict', _array_api_strict_skip_reason),
+                                ('dask.array', _dask_reason)], jax_jit=False)
+def jacobian(f, x, *, tolerances=None, maxiter=10, order=8, initial_step=0.5,
+             step_factor=2.0, step_direction=0):
+    r"""Evaluate the Jacobian of a function numerically.
+
+    Parameters
+    ----------
+    f : callable
+        The function whose Jacobian is desired. The signature must be::
+
+            f(xi: ndarray) -> ndarray
+
+        where each element of ``xi`` is a finite real. If the function to be
+        differentiated accepts additional arguments, wrap it (e.g. using
+        `functools.partial` or ``lambda``) and pass the wrapped callable
+        into `jacobian`. `f` must not mutate the array ``xi``. See Notes
+        regarding vectorization and the dimensionality of the input and output.
+    x : float array_like
+        Points at which to evaluate the Jacobian. Must have at least one dimension.
+        See Notes regarding the dimensionality and vectorization.
+    tolerances : dictionary of floats, optional
+        Absolute and relative tolerances. Valid keys of the dictionary are:
+
+        - ``atol`` - absolute tolerance on the derivative
+        - ``rtol`` - relative tolerance on the derivative
+
+        Iteration will stop when ``res.error < atol + rtol * abs(res.df)``. The default
+        `atol` is the smallest normal number of the appropriate dtype, and
+        the default `rtol` is the square root of the precision of the
+        appropriate dtype.
+    maxiter : int, default: 10
+        The maximum number of iterations of the algorithm to perform. See
+        Notes.
+    order : int, default: 8
+        The (positive integer) order of the finite difference formula to be
+        used. Odd integers will be rounded up to the next even integer.
+    initial_step : float array_like, default: 0.5
+        The (absolute) initial step size for the finite difference derivative
+        approximation. Must be broadcastable with `x` and `step_direction`.
+    step_factor : float, default: 2.0
+        The factor by which the step size is *reduced* in each iteration; i.e.
+        the step size in iteration 1 is ``initial_step/step_factor``. If
+        ``step_factor < 1``, subsequent steps will be greater than the initial
+        step; this may be useful if steps smaller than some threshold are
+        undesirable (e.g. due to subtractive cancellation error).
+    step_direction : integer array_like, default: 0
+        An array representing the direction of the finite difference steps (e.g.
+        for use when `x` lies near to the boundary of the domain of the function.)
+        Must be broadcastable with `x` and `initial_step`.
+        Where 0 (default), central differences are used; where negative (e.g.
+        -1), steps are non-positive; and where positive (e.g. 1), all steps are
+        non-negative.
+
+    Returns
+    -------
+    res : _RichResult
+        An object similar to an instance of `scipy.optimize.OptimizeResult` with the
+        following attributes. The descriptions are written as though the values will
+        be scalars; however, if `f` returns an array, the outputs will be
+        arrays of the same shape.
+
+        success : bool array
+            ``True`` where the algorithm terminated successfully (status ``0``);
+            ``False`` otherwise.
+        status : int array
+            An integer representing the exit status of the algorithm.
+
+            - ``0`` : The algorithm converged to the specified tolerances.
+            - ``-1`` : The error estimate increased, so iteration was terminated.
+            - ``-2`` : The maximum number of iterations was reached.
+            - ``-3`` : A non-finite value was encountered.
+
+        df : float array
+            The Jacobian of `f` at `x`, if the algorithm terminated
+            successfully.
+        error : float array
+            An estimate of the error: the magnitude of the difference between
+            the current estimate of the Jacobian and the estimate in the
+            previous iteration.
+        nit : int array
+            The number of iterations of the algorithm that were performed.
+        nfev : int array
+            The number of points at which `f` was evaluated.
+
+        Each element of an attribute is associated with the corresponding
+        element of `df`. For instance, element ``i`` of `nfev` is the
+        number of points at which `f` was evaluated for the sake of
+        computing element ``i`` of `df`.
+
+    See Also
+    --------
+    derivative, hessian
+
+    Notes
+    -----
+    Suppose we wish to evaluate the Jacobian of a function
+    :math:`f: \mathbf{R}^m \rightarrow \mathbf{R}^n`. Assign to variables
+    ``m`` and ``n`` the positive integer values of :math:`m` and :math:`n`,
+    respectively, and let ``...`` represent an arbitrary tuple of integers.
+    If we wish to evaluate the Jacobian at a single point, then:
+
+    - argument `x` must be an array of shape ``(m,)``
+    - argument `f` must be vectorized to accept an array of shape ``(m, ...)``.
+      The first axis represents the :math:`m` inputs of :math:`f`; the remainder
+      are for evaluating the function at multiple points in a single call.
+    - argument `f` must return an array of shape ``(n, ...)``. The first
+      axis represents the :math:`n` outputs of :math:`f`; the remainder
+      are for the result of evaluating the function at multiple points.
+    - attribute ``df`` of the result object will be an array of shape ``(n, m)``,
+      the Jacobian.
+
+    This function is also vectorized in the sense that the Jacobian can be
+    evaluated at ``k`` points in a single call. In this case, `x` would be an
+    array of shape ``(m, k)``, `f` would accept an array of shape
+    ``(m, k, ...)`` and return an array of shape ``(n, k, ...)``, and the ``df``
+    attribute of the result would have shape ``(n, m, k)``.
+
+    Suppose the desired callable ``f_not_vectorized`` is not vectorized; it can
+    only accept an array of shape ``(m,)``. A simple solution to satisfy the required
+    interface is to wrap ``f_not_vectorized`` as follows::
+
+        def f(x):
+            return np.apply_along_axis(f_not_vectorized, axis=0, arr=x)
+
+    Alternatively, suppose the desired callable ``f_vec_q`` is vectorized, but
+    only for 2-D arrays of shape ``(m, q)``. To satisfy the required interface,
+    consider::
+
+        def f(x):
+            m, batch = x.shape[0], x.shape[1:]  # x.shape is (m, ...)
+            x = np.reshape(x, (m, -1))  # `-1` is short for q = prod(batch)
+            res = f_vec_q(x)  # pass shape (m, q) to function
+            n = res.shape[0]
+            return np.reshape(res, (n,) + batch)  # return shape (n, ...)
+
+    Then pass the wrapped callable ``f`` as the first argument of `jacobian`.
+
+    References
+    ----------
+    .. [1] Jacobian matrix and determinant, *Wikipedia*,
+           https://en.wikipedia.org/wiki/Jacobian_matrix_and_determinant
+
+    Examples
+    --------
+    The Rosenbrock function maps from :math:`\mathbf{R}^m \rightarrow \mathbf{R}`;
+    the SciPy implementation `scipy.optimize.rosen` is vectorized to accept an
+    array of shape ``(m, p)`` and return an array of shape ``p``. Suppose we wish
+    to evaluate the Jacobian (AKA the gradient because the function returns a scalar)
+    at ``[0.5, 0.5, 0.5]``.
+
+    >>> import numpy as np
+    >>> from scipy.differentiate import jacobian
+    >>> from scipy.optimize import rosen, rosen_der
+    >>> m = 3
+    >>> x = np.full(m, 0.5)
+    >>> res = jacobian(rosen, x)
+    >>> ref = rosen_der(x)  # reference value of the gradient
+    >>> res.df, ref
+    (array([-51.,  -1.,  50.]), array([-51.,  -1.,  50.]))
+
+    As an example of a function with multiple outputs, consider Example 4
+    from [1]_.
+
+    >>> def f(x):
+    ...     x1, x2, x3 = x
+    ...     return [x1, 5*x3, 4*x2**2 - 2*x3, x3*np.sin(x1)]
+
+    The true Jacobian is given by:
+
+    >>> def df(x):
+    ...         x1, x2, x3 = x
+    ...         one = np.ones_like(x1)
+    ...         return [[one, 0*one, 0*one],
+    ...                 [0*one, 0*one, 5*one],
+    ...                 [0*one, 8*x2, -2*one],
+    ...                 [x3*np.cos(x1), 0*one, np.sin(x1)]]
+
+    Evaluate the Jacobian at an arbitrary point.
+
+    >>> rng = np.random.default_rng(389252938452)
+    >>> x = rng.random(size=3)
+    >>> res = jacobian(f, x)
+    >>> ref = df(x)
+    >>> res.df.shape == (4, 3)
+    True
+    >>> np.allclose(res.df, ref)
+    True
+
+    Evaluate the Jacobian at 10 arbitrary points in a single call.
+
+    >>> x = rng.random(size=(3, 10))
+    >>> res = jacobian(f, x)
+    >>> ref = df(x)
+    >>> res.df.shape == (4, 3, 10)
+    True
+    >>> np.allclose(res.df, ref)
+    True
+
+    """
+    xp = array_namespace(x)
+    x0 = xp_promote(x, force_floating=True, xp=xp)
+
+    if x0.ndim < 1:
+        message = "Argument `x` must be at least 1-D."
+        raise ValueError(message)
+
+    m = x0.shape[0]  # number of inputs of `f` (length of the first axis of `x`)
+    i = xp.arange(m)  # indexes the "diagonal" entries overwritten in `wrapped`
+
+    def wrapped(x):  # calls `f` with only one input perturbed per index of axis 1
+        p = () if x.ndim == x0.ndim else (x.shape[-1],)  # number of abscissae
+
+        new_shape = (m, m) + x0.shape[1:] + p  # extra axis 1 has length `m`
+        xph = xp.expand_dims(x0, axis=1)  # insert the new axis at position 1
+        if x.ndim != x0.ndim:
+            xph = xp.expand_dims(xph, axis=-1)  # trailing axis for abscissae
+        xph = xp_copy(xp.broadcast_to(xph, new_shape), xp=xp)
+        xph = xpx.at(xph)[i, i].set(x)  # xph[k, k, ...] = x[k, ...]
+        return f(xph)
+
+    res = derivative(wrapped, x, tolerances=tolerances,
+                     maxiter=maxiter, order=order, initial_step=initial_step,
+                     step_factor=step_factor, preserve_shape=True,
+                     step_direction=step_direction)
+
+    del res.x  # the user knows `x`, and the way it gets broadcasted is meaningless here
+    return res
+
+
+@xp_capabilities(skip_backends=[('array_api_strict', _array_api_strict_skip_reason),
+                                ('dask.array', _dask_reason)], jax_jit=False)
+def hessian(f, x, *, tolerances=None, maxiter=10,
+            order=8, initial_step=0.5, step_factor=2.0):
+    r"""Evaluate the Hessian of a function numerically.
+
+    Parameters
+    ----------
+    f : callable
+        The function whose Hessian is desired. The signature must be::
+
+            f(xi: ndarray) -> ndarray
+
+        where each element of ``xi`` is a finite real. If the function to be
+        differentiated accepts additional arguments, wrap it (e.g. using
+        `functools.partial` or ``lambda``) and pass the wrapped callable
+        into `hessian`. `f` must not mutate the array ``xi``. See Notes
+        regarding vectorization and the dimensionality of the input and output.
+    x : float array_like
+        Points at which to evaluate the Hessian. Must have at least one dimension.
+        See Notes regarding the dimensionality and vectorization.
+    tolerances : dictionary of floats, optional
+        Absolute and relative tolerances. Valid keys of the dictionary are:
+
+        - ``atol`` - absolute tolerance on the derivative
+        - ``rtol`` - relative tolerance on the derivative
+
+        Iteration will stop when ``res.error < atol + rtol * abs(res.df)``. The default
+        `atol` is the smallest normal number of the appropriate dtype, and
+        the default `rtol` is the square root of the precision of the
+        appropriate dtype.
+    maxiter : int, default: 10
+        The maximum number of iterations of the algorithm to perform. See
+        Notes.
+    order : int, default: 8
+        The (positive integer) order of the finite difference formula to be
+        used. Odd integers will be rounded up to the next even integer.
+    initial_step : float, default: 0.5
+        The (absolute) initial step size for the finite difference derivative
+        approximation.
+    step_factor : float, default: 2.0
+        The factor by which the step size is *reduced* in each iteration; i.e.
+        the step size in iteration 1 is ``initial_step/step_factor``. If
+        ``step_factor < 1``, subsequent steps will be greater than the initial
+        step; this may be useful if steps smaller than some threshold are
+        undesirable (e.g. due to subtractive cancellation error).
+
+    Returns
+    -------
+    res : _RichResult
+        An object similar to an instance of `scipy.optimize.OptimizeResult` with the
+        following attributes. The descriptions are written as though the values will
+        be scalars; however, if `f` returns an array, the outputs will be
+        arrays of the same shape.
+
+        success : bool array
+            ``True`` where the algorithm terminated successfully (status ``0``);
+            ``False`` otherwise.
+        status : int array
+            An integer representing the exit status of the algorithm.
+
+            - ``0`` : The algorithm converged to the specified tolerances.
+            - ``-1`` : The error estimate increased, so iteration was terminated.
+            - ``-2`` : The maximum number of iterations was reached.
+            - ``-3`` : A non-finite value was encountered.
+
+        ddf : float array
+            The Hessian of `f` at `x`, if the algorithm terminated
+            successfully.
+        error : float array
+            An estimate of the error: the magnitude of the difference between
+            the current estimate of the Hessian and the estimate in the
+            previous iteration.
+        nfev : int array
+            The number of points at which `f` was evaluated.
+
+        Each element of an attribute is associated with the corresponding
+        element of `ddf`. For instance, element ``[i, j]`` of `nfev` is the
+        number of points at which `f` was evaluated for the sake of
+        computing element ``[i, j]`` of `ddf`.
+
+    See Also
+    --------
+    derivative, jacobian
+
+    Notes
+    -----
+    Suppose we wish to evaluate the Hessian of a function
+    :math:`f: \mathbf{R}^m \rightarrow \mathbf{R}`, and we assign to variable
+    ``m`` the positive integer value of :math:`m`. If we wish to evaluate
+    the Hessian at a single point, then:
+
+    - argument `x` must be an array of shape ``(m,)``
+    - argument `f` must be vectorized to accept an array of shape
+      ``(m, ...)``. The first axis represents the :math:`m` inputs of
+      :math:`f`; the remaining axes indicated by ellipses are for evaluating
+      the function at several abscissae in a single call.
+    - argument `f` must return an array of shape ``(...)``.
+    - attribute ``ddf`` of the result object will be an array of shape ``(m, m)``,
+      the Hessian.
+
+    This function is also vectorized in the sense that the Hessian can be
+    evaluated at ``k`` points in a single call. In this case, `x` would be an
+    array of shape ``(m, k)``, `f` would accept an array of shape
+    ``(m, ...)`` and return an array of shape ``(...)``, and the ``ddf``
+    attribute of the result would have shape ``(m, m, k)``. Note that the
+    axis associated with the ``k`` points is included within the axes
+    denoted by ``(...)``.
+
+    Currently, `hessian` is implemented by nesting calls to `jacobian`.
+    All options passed to `hessian` are used for both the inner and outer
+    calls with one exception: the `rtol` used in the inner `jacobian` call
+    is tightened by a factor of 100 with the expectation that the inner
+    error can be ignored. A consequence is that `rtol` should not be set
+    less than 100 times the precision of the dtype of `x`; a warning is
+    emitted otherwise.
+
+    References
+    ----------
+    .. [1] Hessian matrix, *Wikipedia*,
+           https://en.wikipedia.org/wiki/Hessian_matrix
+
+    Examples
+    --------
+    The Rosenbrock function maps from :math:`\mathbf{R}^m \rightarrow \mathbf{R}`;
+    the SciPy implementation `scipy.optimize.rosen` is vectorized to accept an
+    array of shape ``(m, ...)`` and return an array of shape ``...``. Suppose we
+    wish to evaluate the Hessian at ``[0.5, 0.5, 0.5]``.
+
+    >>> import numpy as np
+    >>> from scipy.differentiate import hessian
+    >>> from scipy.optimize import rosen, rosen_hess
+    >>> m = 3
+    >>> x = np.full(m, 0.5)
+    >>> res = hessian(rosen, x)
+    >>> ref = rosen_hess(x)  # reference value of the Hessian
+    >>> np.allclose(res.ddf, ref)
+    True
+
+    `hessian` is vectorized to evaluate the Hessian at multiple points
+    in a single call.
+
+    >>> rng = np.random.default_rng(4589245925010)
+    >>> x = rng.random((m, 10))
+    >>> res = hessian(rosen, x)
+    >>> ref = [rosen_hess(xi) for xi in x.T]
+    >>> ref = np.moveaxis(ref, 0, -1)
+    >>> np.allclose(res.ddf, ref)
+    True
+
+    """
+    # todo:
+    # - add ability to vectorize over additional parameters (*args?)
+    # - error estimate stack with inner jacobian (or use legit 2D stencil)
+
+    kwargs = dict(maxiter=maxiter, order=order, initial_step=initial_step,
+                  step_factor=step_factor)
+    tolerances = {} if tolerances is None else tolerances
+    atol = tolerances.get('atol', None)
+    rtol = tolerances.get('rtol', None)
+
+    xp = array_namespace(x)
+    x0 = xp_promote(x, force_floating=True, xp=xp)
+
+    finfo = xp.finfo(x0.dtype)
+    rtol = finfo.eps**0.5 if rtol is None else rtol  # keep same as `derivative`
+
+    # tighten the inner tolerance to make the inner error negligible
+    rtol_min = finfo.eps * 100
+    message = (f"The specified `{rtol=}`, but error estimates are likely to be "
+               f"unreliable when `rtol < {rtol_min}`.")
+    if 0 < rtol < rtol_min:  # rtol <= 0 is an error
+        warnings.warn(message, RuntimeWarning, stacklevel=2)
+        rtol = rtol_min
+
+    def df(x):  # inner derivative of `f` via `jacobian`; logs its `nfev`
+        tolerances = dict(rtol=rtol/100, atol=atol)  # inner `rtol` is 100x tighter
+        temp = jacobian(f, x, tolerances=tolerances, **kwargs)
+        nfev.append(temp.nfev if len(nfev) == 0 else temp.nfev.sum(axis=-1))
+        return temp.df
+
+    nfev = []  # track inner function evaluations; `df` appends once per call
+    res = jacobian(df, x, tolerances=tolerances, **kwargs)  # jacobian of jacobian
+
+    nfev = xp.cumulative_sum(xp.stack(nfev), axis=0)  # running total over `df` calls
+    res_nit = xp.astype(res.nit[xp.newaxis, ...], xp.int64)  # appease torch
+    res.nfev = xp.take_along_axis(nfev, res_nit, axis=0)[0]  # total at index `nit`
+    res.ddf = res.df
+    del res.df  # this is renamed to ddf
+    del res.nit  # this is only the outer-jacobian nit
+
+    return res
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8be2d80bea342e3b6b3df7d13e17f969ef1f224b
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/__init__.py
@@ -0,0 +1,228 @@
+"""
+========================================
+Interpolation (:mod:`scipy.interpolate`)
+========================================
+
+.. currentmodule:: scipy.interpolate
+
+Sub-package for functions and objects used in interpolation.
+
+See the :ref:`user guide <tutorial-interpolate>` for recommendations on choosing a
+routine, and other usage details.
+
+
+Univariate interpolation
+========================
+
+.. autosummary::
+   :toctree: generated/
+
+   make_interp_spline
+   CubicSpline
+   PchipInterpolator
+   Akima1DInterpolator
+   FloaterHormannInterpolator
+   BarycentricInterpolator
+   KroghInterpolator
+   CubicHermiteSpline
+
+**Low-level data structures for univariate interpolation:**
+
+.. autosummary::
+   :toctree: generated/
+
+   PPoly
+   BPoly
+   BSpline
+
+
+Multivariate interpolation
+==========================
+
+**Unstructured data**
+
+.. autosummary::
+   :toctree: generated/
+
+   LinearNDInterpolator
+   NearestNDInterpolator
+   CloughTocher2DInterpolator
+   RBFInterpolator
+
+**For data on a grid:**
+
+.. autosummary::
+   :toctree: generated/
+
+   RegularGridInterpolator
+
+.. seealso::
+
+    `scipy.ndimage.map_coordinates`,
+    :ref:`An example wrapper for map_coordinates <tutorial-interpolate_cartesian-grids>`
+
+
+**Low-level data structures for tensor product polynomials and splines:**
+
+
+.. autosummary::
+   :toctree: generated/
+
+   NdPPoly
+   NdBSpline
+
+
+1-D spline smoothing and approximation
+======================================
+
+.. autosummary::
+   :toctree: generated/
+
+   make_lsq_spline
+   make_smoothing_spline
+   make_splrep
+   make_splprep
+   generate_knots
+
+Rational Approximation
+======================
+
+.. autosummary::
+   :toctree: generated/
+
+   AAA
+
+
+Interfaces to FITPACK routines for 1D and 2D spline fitting
+===========================================================
+
+This section lists wrappers for `FITPACK <http://www.netlib.org/dierckx/>`__
+functionality for 1D and 2D smoothing splines. In most cases, users are better off
+using higher-level routines listed in previous sections.
+
+
+1D FITPACK splines
+------------------
+
+This package provides two sets of functionally equivalent wrappers: object-oriented and
+functional.
+
+**Functional FITPACK interface:**
+
+
+.. autosummary::
+   :toctree: generated/
+
+   splrep
+   splprep
+   splev
+   splint
+   sproot
+   spalde
+   splder
+   splantider
+   insert
+
+**Object-oriented FITPACK interface:**
+
+.. autosummary::
+   :toctree: generated/
+
+   UnivariateSpline
+   InterpolatedUnivariateSpline
+   LSQUnivariateSpline
+
+
+2D FITPACK splines
+------------------
+
+**For data on a grid:**
+
+.. autosummary::
+   :toctree: generated/
+
+   RectBivariateSpline
+   RectSphereBivariateSpline
+
+**For unstructured data (OOP interface):**
+
+.. autosummary::
+   :toctree: generated/
+
+   BivariateSpline
+   SmoothBivariateSpline
+   SmoothSphereBivariateSpline
+   LSQBivariateSpline
+   LSQSphereBivariateSpline
+
+**For unstructured data (functional interface):**
+
+.. autosummary::
+   :toctree: generated/
+
+   bisplrep
+   bisplev
+
+
+Additional tools
+================
+
+.. autosummary::
+   :toctree: generated/
+
+   lagrange
+   approximate_taylor_polynomial
+   pade
+
+   interpn
+   griddata
+   barycentric_interpolate
+   krogh_interpolate
+   pchip_interpolate
+   Rbf
+   interp1d
+   interp2d
+
+.. seealso::
+
+   `scipy.ndimage.map_coordinates`,
+   `scipy.ndimage.spline_filter`,
+
+"""  # noqa: E501
+from ._interpolate import *
+from ._fitpack_py import *
+
+from ._fitpack2 import *
+
+from ._rbf import Rbf
+
+from ._rbfinterp import *
+
+from ._polyint import *
+
+from ._cubic import *
+
+from ._ndgriddata import *
+
+from ._bsplines import *
+from ._fitpack_repro import generate_knots, make_splrep, make_splprep
+
+from ._pade import *
+
+from ._rgi import *
+
+from ._ndbspline import NdBSpline
+
+from ._bary_rational import *
+
+# Deprecated namespaces, to be removed in v2.0.0
+from . import fitpack, fitpack2, interpolate, ndgriddata, polyint, rbf, interpnd
+
+__all__ = [s for s in dir() if not s.startswith('_')]  # public names bound so far
+
+from scipy._lib._testutils import PytestTester
+test = PytestTester(__name__)  # bound after `__all__`, so not listed in it
+del PytestTester  # drop the helper class name from the package namespace
+
+# Backward compatibility: `pchip` is an alias (also bound after `__all__`)
+pchip = PchipInterpolator
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_bary_rational.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_bary_rational.py
new file mode 100644
index 0000000000000000000000000000000000000000..8129f98d83e8b07f006e6e23b0f99a2b35522e94
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_bary_rational.py
@@ -0,0 +1,748 @@
+# Copyright (c) 2017, The Chancellor, Masters and Scholars of the University
+# of Oxford, and the Chebfun Developers. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of the University of Oxford nor the names of its
+#       contributors may be used to endorse or promote products derived from
+#       this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import warnings
+import operator
+from types import GenericAlias
+
+import numpy as np
+import scipy
+
+
+__all__ = ["AAA", "FloaterHormannInterpolator"]
+
+
+class _BarycentricRational:
+    """Base class for barycentric representation of a rational function.
+
+    Subclasses implement `_compute_weights`, which returns the support points,
+    the support values and the barycentric weights. This class cleans the
+    input data, evaluates the rational function and computes its poles,
+    residues and roots.
+    """
+
+    # generic type compatibility with scipy-stubs
+    __class_getitem__ = classmethod(GenericAlias)
+
+    def __init__(self, x, y, axis=0, **kwargs):
+        """Validate and clean the data, then compute the barycentric weights.
+
+        Parameters
+        ----------
+        x : array_like
+            1-D array of finite sample points.
+        y : array_like
+            Data values; ``y.shape[axis]`` must equal ``x.size``.
+        axis : int, default: 0
+            Axis of `y` that corresponds to `x`.
+        **kwargs
+            Forwarded unchanged to `_input_validation` and `_compute_weights`.
+        """
+        self._axis = axis
+
+        # input validation
+        z = np.asarray(x)
+        f = np.asarray(y)
+
+        self._input_validation(z, f, **kwargs)
+
+        f = np.moveaxis(f, self._axis, 0)
+
+        # Remove infinite or NaN function values and repeated entries.
+        # `np.isfinite` is already False for NaN, so one test covers both.
+        to_keep = np.all(np.isfinite(f).reshape(f.shape[0], -1), axis=-1)
+        f = f[to_keep, ...]
+        z = z[to_keep]
+        z, uni = np.unique(z, return_index=True)
+        f = f[uni, ...]
+
+        # Shape of a single data value (everything but the leading axis)
+        self._shape = f.shape[1:]
+        self._support_points, self._support_values, self.weights = (
+            self._compute_weights(z, f, **kwargs)
+        )
+
+        # only compute once
+        self._poles = None
+        self._residues = None
+        self._roots = None
+
+    def _input_validation(self, x, y, **kwargs):
+        """Check the shapes of `x` and `y` and that `x` is finite.
+
+        Raises
+        ------
+        ValueError
+            If `x` is not 1-D, `y` is 0-D, the sizes do not match along
+            ``self._axis`` or `x` contains non-finite entries.
+        """
+        if x.ndim != 1:
+            raise ValueError("`x` must be 1-D.")
+
+        if y.ndim < 1:
+            raise ValueError("`y` must be at least 1-D.")
+
+        if x.size != y.shape[self._axis]:
+            msg = (f"`x` must be of size {y.shape[self._axis]} but got size "
+                   f"{x.size}.")
+            raise ValueError(msg)
+
+        if not np.all(np.isfinite(x)):
+            raise ValueError("`x` must be finite.")
+
+    def _compute_weights(self, z, f, **kwargs):
+        """Return ``(support_points, support_values, weights)``.
+
+        Must be overridden by subclasses; `z` and `f` are the cleaned sample
+        points and values, with the interpolation axis of `f` moved to the
+        front.
+        """
+        raise NotImplementedError
+
+    def __call__(self, z):
+        """Evaluate the rational approximation at given values.
+
+        Parameters
+        ----------
+        z : array_like
+            Input values.
+        """
+        # evaluate rational function in barycentric form.
+        z = np.asarray(z)
+        zv = np.ravel(z)
+
+        # Flatten all trailing data dimensions into one column axis
+        support_values = self._support_values.reshape(
+            (self._support_values.shape[0], -1)
+        )
+        weights = self.weights[..., np.newaxis]
+
+        # Cauchy matrix
+        # Ignore errors due to inf/inf at support points, these will be fixed later
+        with np.errstate(invalid="ignore", divide="ignore"):
+            CC = 1 / np.subtract.outer(zv, self._support_points)
+            # Vector of values
+            r = CC @ (weights * support_values) / (CC @ weights)
+
+        # Deal with input inf: `r(inf) = lim r(z) = sum(w*f) / sum(w)`.
+        # The numerator is summed over the support points only (axis 0) so
+        # that every trailing component of the data keeps its own limit.
+        is_inf = np.isinf(zv)
+        if np.any(is_inf):
+            r[is_inf] = (np.sum(weights * support_values, axis=0)
+                         / np.sum(weights))
+
+        # Deal with NaN
+        ii = np.nonzero(np.isnan(r))[0]
+        for jj in ii:
+            if np.isnan(zv[jj]) or not np.any(zv[jj] == self._support_points):
+                # r(NaN) = NaN is fine.
+                # The second case may happen if `r(zv[ii]) = 0/0` at some point.
+                pass
+            else:
+                # Clean up values `NaN = inf/inf` at support points.
+                # Find the corresponding node and set entry to correct value:
+                r[jj] = support_values[zv[jj] == self._support_points].squeeze()
+
+        res = np.reshape(r, z.shape + self._shape)
+        return np.moveaxis(res, 0, self._axis) if z.ndim > 0 else res
+
+    def poles(self):
+        """Compute the poles of the rational approximation.
+
+        Returns
+        -------
+        poles : array
+            Poles of the approximation, repeated according to their multiplicity
+            but not in any specific order.
+        """
+        if self._poles is None:
+            # Compute poles via generalized eigenvalue problem
+            m = self.weights.size
+            B = np.eye(m + 1, dtype=self.weights.dtype)
+            B[0, 0] = 0
+
+            E = np.zeros_like(B, dtype=np.result_type(self.weights,
+                                                      self._support_points))
+            E[0, 1:] = self.weights
+            E[1:, 0] = 1
+            np.fill_diagonal(E[1:, 1:], self._support_points)
+
+            pol = scipy.linalg.eigvals(E, B)
+            # Discard the non-finite generalized eigenvalues
+            self._poles = pol[np.isfinite(pol)]
+        return self._poles
+
+    def residues(self):
+        """Compute the residues of the poles of the approximation.
+
+        Returns
+        -------
+        residues : array
+            Residues associated with the `poles` of the approximation
+
+        Raises
+        ------
+        NotImplementedError
+            If the data values are multi-dimensional.
+        """
+        if self._support_values.ndim > 1:
+            raise NotImplementedError("Residues not implemented for multi-dimensional"
+                                      " data.")
+        if self._residues is None:
+            # Compute residues via formula for res of quotient of analytic functions
+            with np.errstate(divide="ignore", invalid="ignore"):
+                # Cauchy matrix at the poles, shared by numerator and
+                # derivative of the denominator
+                C = 1 / np.subtract.outer(self.poles(), self._support_points)
+                N = C @ (self._support_values * self.weights)
+                Ddiff = -(C**2) @ self.weights
+                self._residues = N / Ddiff
+        return self._residues
+
+    def roots(self):
+        """Compute the roots of the rational approximation.
+
+        Returns
+        -------
+        zeros : array
+            Zeros of the approximation, repeated according to their multiplicity
+            but not in any specific order.
+
+        Raises
+        ------
+        NotImplementedError
+            If the data values are multi-dimensional.
+        """
+        if self._support_values.ndim > 1:
+            raise NotImplementedError("Roots not implemented for multi-dimensional"
+                                      " data.")
+        if self._roots is None:
+            # Compute zeros via generalized eigenvalue problem
+            m = self.weights.size
+            B = np.eye(m + 1, dtype=self.weights.dtype)
+            B[0, 0] = 0
+            E = np.zeros_like(B, dtype=np.result_type(self.weights,
+                                                      self._support_values,
+                                                      self._support_points))
+            E[0, 1:] = self.weights * self._support_values
+            E[1:, 0] = 1
+            np.fill_diagonal(E[1:, 1:], self._support_points)
+
+            zer = scipy.linalg.eigvals(E, B)
+            # Discard the non-finite generalized eigenvalues
+            self._roots = zer[np.isfinite(zer)]
+        return self._roots
+
+
+class AAA(_BarycentricRational):
+    r"""
+    AAA real or complex rational approximation.
+
+    As described in [1]_, the AAA algorithm is a greedy algorithm for approximation by
+    rational functions on a real or complex set of points. The rational approximation is
+    represented in a barycentric form from which the roots (zeros), poles, and residues
+    can be computed.
+
+    Parameters
+    ----------
+    x : 1D array_like, shape (n,)
+        1-D array containing values of the independent variable. Values may be real or
+        complex but must be finite.
+    y : 1D array_like, shape (n,)
+        Function values ``f(x)``. Infinite and NaN values of `y` and
+        corresponding values of `x` will be discarded.
+    rtol : float, optional
+        Relative tolerance, defaults to ``eps**0.75``. If a small subset of the entries
+        in `y` are much larger than the rest the default tolerance may be too
+        loose. If the tolerance is too tight then the approximation may contain
+        Froissart doublets or the algorithm may fail to converge entirely.
+    max_terms : int, optional
+        Maximum number of terms in the barycentric representation, defaults to ``100``.
+        Must be greater than or equal to one.
+    clean_up : bool, optional
+        Automatic removal of Froissart doublets, defaults to ``True``. See notes for
+        more details.
+    clean_up_tol : float, optional
+        Poles with residues less than this number times the geometric mean
+        of `y` times the minimum distance to `x` are deemed spurious by the
+        cleanup procedure, defaults to 1e-13. See notes for more details.
+
+    Attributes
+    ----------
+    support_points : array
+        Support points of the approximation. These are a subset of the provided `x` at
+        which the approximation strictly interpolates `y`.
+        See notes for more details.
+    support_values : array
+        Value of the approximation at the `support_points`.
+    weights : array
+        Weights of the barycentric approximation.
+    errors : array
+        Error :math:`|f(z) - r(z)|_\infty` over `x` in the successive iterations
+        of AAA.
+
+    Warns
+    -----
+    RuntimeWarning
+        If `rtol` is not achieved in `max_terms` iterations.
+
+    See Also
+    --------
+    FloaterHormannInterpolator : Floater-Hormann barycentric rational interpolation.
+    pade : Padé approximation.
+
+    Notes
+    -----
+    At iteration :math:`m` (at which point there are :math:`m` terms in both the
+    numerator and denominator of the approximation), the
+    rational approximation in the AAA algorithm takes the barycentric form
+
+    .. math::
+
+        r(z) = n(z)/d(z) =
+        \frac{\sum_{j=1}^m\ w_j f_j / (z - z_j)}{\sum_{j=1}^m w_j / (z - z_j)},
+
+    where :math:`z_1,\dots,z_m` are real or complex support points selected from
+    `x`, :math:`f_1,\dots,f_m` are the corresponding real or complex data values
+    from `y`, and :math:`w_1,\dots,w_m` are real or complex weights.
+
+    Each iteration of the algorithm has two parts: the greedy selection of the next
+    support point and the computation of the weights. The first part of each iteration
+    is to select the next support point to be added :math:`z_{m+1}` from the remaining
+    unselected `x`, such that the nonlinear residual
+    :math:`|f(z_{m+1}) - n(z_{m+1})/d(z_{m+1})|` is maximised. The algorithm terminates
+    when this maximum is less than ``rtol * np.linalg.norm(f, ord=np.inf)``. This means
+    the interpolation property is only satisfied up to a tolerance, except at the
+    support points where the approximation exactly interpolates the supplied data.
+
+    In the second part of each iteration, the weights :math:`w_j` are selected to solve
+    the least-squares problem
+
+    .. math::
+
+        \text{minimise}_{w_j}|fd - n| \quad \text{subject to} \quad
+        \sum_{j=1}^{m+1} w_j = 1,
+
+    over the unselected elements of `x`.
+
+    One of the challenges with working with rational approximations is the presence of
+    Froissart doublets, which are either poles with vanishingly small residues or
+    pole-zero pairs that are close enough together to nearly cancel, see [2]_. The
+    greedy nature of the AAA algorithm means Froissart doublets are rare. However, if
+    `rtol` is set too tight then the approximation will stagnate and many Froissart
+    doublets will appear. Froissart doublets can usually be removed by removing support
+    points and then resolving the least squares problem. The support point :math:`z_j`,
+    which is the closest support point to the pole :math:`a` with residue
+    :math:`\alpha`, is removed if the following is satisfied
+
+    .. math::
+
+        |\alpha| / |z_j - a| < \verb|clean_up_tol| \cdot \tilde{f},
+
+    where :math:`\tilde{f}` is the geometric mean of the absolute values of `y`.
+
+
+    References
+    ----------
+    .. [1] Y. Nakatsukasa, O. Sete, and L. N. Trefethen, "The AAA algorithm for
+            rational approximation", SIAM J. Sci. Comp. 40 (2018), A1494-A1522.
+            :doi:`10.1137/16M1106122`
+    .. [2] J. Gilewicz and M. Pindor, Pade approximants and noise: rational functions,
+           J. Comp. Appl. Math. 105 (1999), pp. 285-297.
+           :doi:`10.1016/S0377-0427(02)00674-X`
+
+    Examples
+    --------
+
+    Here we reproduce a number of the numerical examples from [1]_ as a demonstration
+    of the functionality offered by this method.
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.interpolate import AAA
+    >>> import warnings
+
+    For the first example we approximate the gamma function on ``[-3.5, 4.5]`` by
+    extrapolating from 100 samples in ``[-1.5, 1.5]``.
+
+    >>> from scipy.special import gamma
+    >>> sample_points = np.linspace(-1.5, 1.5, num=100)
+    >>> r = AAA(sample_points, gamma(sample_points))
+    >>> z = np.linspace(-3.5, 4.5, num=1000)
+    >>> fig, ax = plt.subplots()
+    >>> ax.plot(z, gamma(z), label="Gamma")
+    >>> ax.plot(sample_points, gamma(sample_points), label="Sample points")
+    >>> ax.plot(z, r(z).real, '--', label="AAA approximation")
+    >>> ax.set(xlabel="z", ylabel="r(z)", ylim=[-8, 8], xlim=[-3.5, 4.5])
+    >>> ax.legend()
+    >>> plt.show()
+
+    We can also view the poles of the rational approximation and their residues:
+
+    >>> order = np.argsort(r.poles())
+    >>> r.poles()[order]
+    array([-3.81591039e+00+0.j        , -3.00269049e+00+0.j        ,
+           -1.99999988e+00+0.j        , -1.00000000e+00+0.j        ,
+            5.85842812e-17+0.j        ,  4.77485458e+00-3.06919376j,
+            4.77485458e+00+3.06919376j,  5.29095868e+00-0.97373072j,
+            5.29095868e+00+0.97373072j])
+    >>> r.residues()[order]
+    array([ 0.03658074 +0.j        , -0.16915426 -0.j        ,
+            0.49999915 +0.j        , -1.         +0.j        ,
+            1.         +0.j        , -0.81132013 -2.30193429j,
+           -0.81132013 +2.30193429j,  0.87326839+10.70148546j,
+            0.87326839-10.70148546j])
+
+    For the second example, we call `AAA` with a spiral of 1000 points that wind 7.5
+    times around the origin in the complex plane.
+
+    >>> z = np.exp(np.linspace(-0.5, 0.5 + 15j*np.pi, 1000))
+    >>> r = AAA(z, np.tan(np.pi*z/2), rtol=1e-13)
+
+    We see that AAA takes 12 steps to converge with the following errors:
+
+    >>> r.errors.size
+    12
+    >>> r.errors
+    array([2.49261500e+01, 4.28045609e+01, 1.71346935e+01, 8.65055336e-02,
+           1.27106444e-02, 9.90889874e-04, 5.86910543e-05, 1.28735561e-06,
+           3.57007424e-08, 6.37007837e-10, 1.67103357e-11, 1.17112299e-13])
+
+    We can also plot the computed poles:
+
+    >>> fig, ax = plt.subplots()
+    >>> ax.plot(z.real, z.imag, '.', markersize=2, label="Sample points")
+    >>> ax.plot(r.poles().real, r.poles().imag, '.', markersize=5,
+    ...         label="Computed poles")
+    >>> ax.set(xlim=[-3.5, 3.5], ylim=[-3.5, 3.5], aspect="equal")
+    >>> ax.legend()
+    >>> plt.show()
+
+    We now demonstrate the removal of Froissart doublets using the `clean_up` method
+    using an example from [1]_. Here we approximate the function
+    :math:`f(z)=\log(2 + z^4)/(1 - 16z^4)` by sampling it at 1000 roots of unity. The
+    algorithm is run with ``rtol=0`` and ``clean_up=False`` to deliberately cause
+    Froissart doublets to appear.
+
+    >>> z = np.exp(1j*2*np.pi*np.linspace(0,1, num=1000))
+    >>> def f(z):
+    ...     return np.log(2 + z**4)/(1 - 16*z**4)
+    >>> with warnings.catch_warnings():  # filter convergence warning due to rtol=0
+    ...     warnings.simplefilter('ignore', RuntimeWarning)
+    ...     r = AAA(z, f(z), rtol=0, max_terms=50, clean_up=False)
+    >>> mask = np.abs(r.residues()) < 1e-13
+    >>> fig, axs = plt.subplots(ncols=2)
+    >>> axs[0].plot(r.poles().real[~mask], r.poles().imag[~mask], '.')
+    >>> axs[0].plot(r.poles().real[mask], r.poles().imag[mask], 'r.')
+
+    Now we call the `clean_up` method to remove Froissart doublets.
+
+    >>> with warnings.catch_warnings():
+    ...     warnings.simplefilter('ignore', RuntimeWarning)
+    ...     r.clean_up()
+    4  # may vary
+    >>> mask = np.abs(r.residues()) < 1e-13
+    >>> axs[1].plot(r.poles().real[~mask], r.poles().imag[~mask], '.')
+    >>> axs[1].plot(r.poles().real[mask], r.poles().imag[mask], 'r.')
+    >>> plt.show()
+
+    The left image shows the poles of the approximation computed with
+    ``clean_up=False``, with poles with residue less than ``10^-13`` in absolute value
+    shown in red. The right image then shows the poles after the `clean_up` method has
+    been called.
+    """
+    def __init__(self, x, y, *, rtol=None, max_terms=100, clean_up=True,
+                 clean_up_tol=1e-13):
+        super().__init__(x, y, rtol=rtol, max_terms=max_terms)
+
+        if clean_up:
+            self.clean_up(clean_up_tol)
+
+    def _input_validation(self, x, y, rtol=None, max_terms=100, clean_up=True,
+                          clean_up_tol=1e-13):
+        """Validate `max_terms` and require 1-D `y` on top of the base checks.
+
+        Raises
+        ------
+        ValueError
+            If `max_terms` is less than one, `y` is not 1-D, or the base-class
+            validation fails.
+        """
+        max_terms = operator.index(max_terms)
+        if max_terms < 1:
+            raise ValueError("`max_terms` must be an integer value greater than or "
+                             "equal to one.")
+
+        if y.ndim != 1:
+            raise ValueError("`y` must be 1-D.")
+
+        super()._input_validation(x, y)
+
+    @property
+    def support_points(self):
+        return self._support_points
+
+    @property
+    def support_values(self):
+        return self._support_values
+
+    def _compute_weights(self, z, f, rtol, max_terms):
+        """Run the AAA iteration on the cleaned data `z`, `f`.
+
+        Returns the support points, support values and weights with zero-weight
+        entries removed. As a side effect stores the per-iteration errors in
+        ``self.errors`` and the data in ``self._points`` / ``self._values``.
+
+        Warns
+        -----
+        RuntimeWarning
+            If all `max_terms` iterations are used without reaching `rtol`.
+        """
+        # Initialization for AAA iteration
+        M = np.size(z)
+        mask = np.ones(M, dtype=np.bool_)
+        dtype = np.result_type(z, f, 1.0)
+        rtol = np.finfo(dtype).eps**0.75 if rtol is None else rtol
+        atol = rtol * np.linalg.norm(f, ord=np.inf)
+        zj = np.empty(max_terms, dtype=dtype)
+        fj = np.empty(max_terms, dtype=dtype)
+        # Cauchy matrix
+        C = np.empty((M, max_terms), dtype=dtype)
+        # Loewner matrix
+        A = np.empty((M, max_terms), dtype=dtype)
+        errors = np.empty(max_terms, dtype=A.real.dtype)
+        R = np.repeat(np.mean(f), M)
+        ill_conditioned = False
+        ill_conditioned_tol = 1/(3*np.finfo(dtype).eps)
+
+        # AAA iteration
+        for m in range(max_terms):
+            # Introduce next support point
+            # Select next support point
+            jj = np.argmax(np.abs(f[mask] - R[mask]))
+            # Update support points
+            zj[m] = z[mask][jj]
+            # Update data values
+            fj[m] = f[mask][jj]
+            # Next column of Cauchy matrix
+            # Ignore errors as we manually interpolate at support points
+            with np.errstate(divide="ignore", invalid="ignore"):
+                C[:, m] = 1 / (z - z[mask][jj])
+            # Update mask
+            mask[np.nonzero(mask)[0][jj]] = False
+            # Update Loewner matrix
+            # Ignore errors as inf values will be masked out in SVD call
+            with np.errstate(invalid="ignore"):
+                A[:, m] = (f - fj[m]) * C[:, m]
+
+            # Compute weights
+            rows = mask.sum()
+            if rows >= m + 1:
+                # The usual tall-skinny case
+                if not ill_conditioned:
+                    _, s, V = scipy.linalg.svd(
+                        A[mask, : m + 1], full_matrices=False, check_finite=False,
+                    )
+                    with np.errstate(invalid="ignore", divide="ignore"):
+                        if s[0]/s[-1] > ill_conditioned_tol:
+                            ill_conditioned = True
+                if ill_conditioned:
+                    # Rescale the columns before the SVD; undone on `wj` below
+                    col_norm = np.linalg.norm(A[mask, : m + 1], axis=0)
+                    _, s, V = scipy.linalg.svd(
+                        A[mask, : m + 1]/col_norm, full_matrices=False,
+                        check_finite=False,
+                    )
+                # Treat case of multiple min singular values
+                mm = s == np.min(s)
+                # Aim for non-sparse weight vector
+                wj = (V.conj()[mm, :].sum(axis=0) / np.sqrt(mm.sum())).astype(dtype)
+                if ill_conditioned:
+                    wj /= col_norm
+            else:
+                # Fewer rows than columns
+                V = scipy.linalg.null_space(A[mask, : m + 1], check_finite=False)
+                nm = V.shape[-1]
+                # Aim for non-sparse wt vector
+                wj = V.sum(axis=-1) / np.sqrt(nm)
+
+            # Compute rational approximant
+            # Omit columns with `wj == 0`
+            i0 = wj != 0
+            # Ignore errors as we manually interpolate at support points
+            with np.errstate(invalid="ignore"):
+                # Numerator
+                N = C[:, : m + 1][:, i0] @ (wj[i0] * fj[: m + 1][i0])
+                # Denominator
+                D = C[:, : m + 1][:, i0] @ wj[i0]
+            # Interpolate at support points with `wj !=0`
+            D_inf = np.isinf(D) | np.isnan(D)
+            D[D_inf] = 1
+            N[D_inf] = f[D_inf]
+            R = N / D
+
+            # Check if converged
+            max_error = np.linalg.norm(f - R, ord=np.inf)
+            errors[m] = max_error
+            if max_error <= atol:
+                break
+        else:
+            # Only reached when the loop ran out of iterations without the
+            # convergence `break`; converging on the very last allowed
+            # iteration therefore does not warn.
+            warnings.warn(f"AAA failed to converge within {max_terms} iterations.",
+                          RuntimeWarning, stacklevel=2)
+
+        # Trim off unused array allocation
+        zj = zj[: m + 1]
+        fj = fj[: m + 1]
+
+        # Remove support points with zero weight
+        i_non_zero = wj != 0
+        self.errors = errors[: m + 1]
+        self._points = z
+        self._values = f
+        return zj[i_non_zero], fj[i_non_zero], wj[i_non_zero]
+
+    def clean_up(self, cleanup_tol=1e-13):
+        """Automatic removal of Froissart doublets.
+
+        Parameters
+        ----------
+        cleanup_tol : float, optional
+            Poles with residues less than this number times the geometric mean
+            of `y` times the minimum distance to `x` are deemed spurious by
+            the cleanup procedure, defaults to 1e-13.
+
+        Returns
+        -------
+        int
+            Number of Froissart doublets detected
+
+        Warns
+        -----
+        RuntimeWarning
+            If at least one Froissart doublet is detected.
+        """
+        # Find negligible residues
+        geom_mean_abs_f = scipy.stats.gmean(np.abs(self._values))
+
+        # Distance from each pole to the nearest sample point
+        Z_distances = np.min(
+            np.abs(np.subtract.outer(self.poles(), self._points)), axis=1
+        )
+
+        with np.errstate(divide="ignore", invalid="ignore"):
+            ii = np.nonzero(
+                np.abs(self.residues()) / Z_distances < cleanup_tol * geom_mean_abs_f
+            )
+
+        ni = ii[0].size
+        if ni == 0:
+            return ni
+
+        warnings.warn(f"{ni} Froissart doublets detected.", RuntimeWarning,
+                        stacklevel=2)
+
+        # For each spurious pole find and remove closest support point
+        closest_spt_point = np.argmin(
+            np.abs(np.subtract.outer(self._support_points, self.poles()[ii])), axis=0
+        )
+        self._support_points = np.delete(self._support_points, closest_spt_point)
+        self._support_values = np.delete(self._support_values, closest_spt_point)
+
+        # Remove support points z from sample set
+        mask = np.logical_and.reduce(
+            np.not_equal.outer(self._points, self._support_points), axis=1
+        )
+        f = self._values[mask]
+        z = self._points[mask]
+
+        # recompute weights, we resolve the least squares problem for the remaining
+        # support points
+
+        m = self._support_points.size
+
+        # Cauchy matrix
+        C = 1 / np.subtract.outer(z, self._support_points)
+        # Loewner matrix
+        A = f[:, np.newaxis] * C - C * self._support_values
+
+        # Solve least-squares problem to obtain weights
+        _, _, V = scipy.linalg.svd(A, check_finite=False)
+        self.weights = np.conj(V[m - 1,:])
+
+        # reset roots, poles, residues as cached values will be wrong with new weights
+        self._poles = None
+        self._residues = None
+        self._roots = None
+
+        return ni
+
+
+class FloaterHormannInterpolator(_BarycentricRational):
+    r"""Floater-Hormann barycentric rational interpolator (C∞ smooth on real axis).
+
+    As described in [1]_, the method of Floater and Hormann computes weights for a
+    barycentric rational interpolant with no poles on the real axis.
+
+    Parameters
+    ----------
+    points : 1D array_like, shape (n,)
+        1-D array containing values of the independent variable. Values may be real or
+        complex but must be finite.
+    values : array_like, shape (n, ...)
+        Array containing values of the dependent variable. Infinite and NaN values
+        of `values` and corresponding values of `points` will be discarded.
+    d : int, default: 3
+        Integer satisfying ``0 <= d < n``. Floater-Hormann interpolation blends
+        ``n - d`` polynomials of degree `d` together; for ``d = n - 1``, this is
+        equivalent to polynomial interpolation.
+    axis : int, default: 0
+        Axis of `values` corresponding to `points`.
+
+    Attributes
+    ----------
+    weights : array
+        Weights of the barycentric approximation.
+
+    Raises
+    ------
+    ValueError
+        If `d` does not satisfy ``0 <= d < n``, both for the data as supplied and
+        for the data left after non-finite values and repeated points have been
+        discarded.
+
+    See Also
+    --------
+    AAA : Barycentric rational approximation of real and complex functions.
+    pade : Padé approximation.
+
+    Notes
+    -----
+    The Floater-Hormann interpolant is a rational function that interpolates the data
+    with approximation order :math:`O(h^{d+1})`. The rational function blends ``n - d``
+    polynomials of degree `d` together to produce a rational interpolant that contains
+    no poles on the real axis, unlike `AAA`. The interpolant is given
+    by
+
+    .. math::
+
+        r(x) = \frac{\sum_{i=0}^{n-d} \lambda_i(x) p_i(x)}
+        {\sum_{i=0}^{n-d} \lambda_i(x)},
+
+    where :math:`p_i(x)` is an interpolating polynomial of at most degree `d` through
+    the points :math:`(x_i,y_i),\dots,(x_{i+d},y_{i+d})`, and :math:`\lambda_i(x)` are
+    blending functions defined by
+
+    .. math::
+
+        \lambda_i(x) = \frac{(-1)^i}{(x - x_i)\cdots(x - x_{i+d})}.
+
+    When ``d = n - 1`` this reduces to polynomial interpolation.
+
+    Due to its stability, the following barycentric representation of the above equation
+    is used for computation
+
+    .. math::
+
+        r(x) = \frac{\sum_{k=1}^m\ w_k f_k / (x - x_k)}{\sum_{k=1}^m w_k / (x - x_k)},
+
+    where the weights :math:`w_k` are computed as
+
+    .. math::
+
+        w_k &= (-1)^{k - d} \sum_{i \in J_k} \prod_{j = i, j \neq k}^{i + d}
+        1/|x_k - x_j|, \\
+        J_k &= \{ i \in I: k - d \leq i \leq k\},\\
+        I &= \{0, 1, \dots, n - d\}.
+
+    References
+    ----------
+    .. [1] M.S. Floater and K. Hormann, "Barycentric rational interpolation with no
+           poles and high rates of approximation", Numer. Math. 107, 315 (2007).
+           :doi:`10.1007/s00211-007-0093-y`
+
+    Examples
+    --------
+
+    Here we compare the method against polynomial interpolation for an example where
+    the polynomial interpolation fails due to Runge's phenomenon.
+
+    >>> import numpy as np
+    >>> from scipy.interpolate import (FloaterHormannInterpolator,
+    ...                                BarycentricInterpolator)
+    >>> def f(x):
+    ...     return 1/(1 + x**2)
+    >>> x = np.linspace(-5, 5, num=15)
+    >>> r = FloaterHormannInterpolator(x, f(x))
+    >>> p = BarycentricInterpolator(x, f(x))
+    >>> xx = np.linspace(-5, 5, num=1000)
+    >>> import matplotlib.pyplot as plt
+    >>> fig, ax = plt.subplots()
+    >>> ax.plot(xx, f(xx), label="f(x)")
+    >>> ax.plot(xx, r(xx), "--", label="Floater-Hormann")
+    >>> ax.plot(xx, p(xx), "--", label="Polynomial")
+    >>> ax.legend()
+    >>> plt.show()
+    """
+    def __init__(self, points, values, *, d=3, axis=0):
+        super().__init__(points, values, d=d, axis=axis)
+
+    def _input_validation(self, x, y, d):
+        """Run the base-class checks, then check `d` against the size of `x`.
+
+        The base checks run first so that `x` is known to be 1-D before its
+        length is used.
+        """
+        super()._input_validation(x, y)
+
+        d = operator.index(d)
+        if not (0 <= d < len(x)):
+            raise ValueError("`d` must satisfy 0 <= d < n")
+
+    def _compute_weights(self, z, f, d):
+        """Compute the Floater-Hormann weights for the cleaned data `z`, `f`.
+
+        Returns `z` and `f` unchanged together with the weight vector.
+        """
+        n = z.size
+        # `z` has already had non-finite data and repeated points removed, so
+        # `n` can be smaller than the size `d` was validated against. With
+        # `d >= n` the loops below would leave every weight at zero.
+        if d >= n:
+            raise ValueError(
+                "`d` must satisfy 0 <= d < n after discarding non-finite and "
+                f"repeated data, but got d = {d} and n = {n}."
+            )
+
+        # Floater and Hormann 2007 Eqn. (18) 3 equations later
+        w = np.zeros_like(z, dtype=np.result_type(z, 1.0))
+        for k in range(n):
+            for i in range(max(k-d, 0), min(k+1, n-d)):
+                w[k] += 1/np.prod(np.abs(np.delete(z[k] - z[i : i + d + 1], k - i)))
+        w *= (-1.)**(np.arange(n) - d)
+
+        return z, f, w
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_bsplines.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_bsplines.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa317a1deefff6f8b5c4b22e28fb5f979d0fb97a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_bsplines.py
@@ -0,0 +1,2614 @@
+import operator
+from math import prod
+from types import GenericAlias
+
+import numpy as np
+from scipy._lib._util import normalize_axis_index
+from scipy.linalg import (get_lapack_funcs, LinAlgError,
+                          cholesky_banded, cho_solve_banded,
+                          solve, solve_banded)
+from scipy.optimize import minimize_scalar
+from . import _dierckx
+from . import _fitpack_impl
+from scipy.sparse import csr_array
+from scipy.special import poch
+from itertools import combinations
+
+from scipy._lib._array_api import array_namespace, concat_1d, xp_capabilities
+
+__all__ = ["BSpline", "make_interp_spline", "make_lsq_spline",
+           "make_smoothing_spline"]
+
+
+def _get_dtype(dtype):
+    """Map `dtype` to the double-precision type used internally.
+
+    Complex dtypes give ``np.complex128``; everything else ``np.float64``.
+    """
+    is_complex = np.issubdtype(dtype, np.complexfloating)
+    return np.complex128 if is_complex else np.float64
+
+
+def _as_float_array(x, check_finite=False):
+    """Return `x` as a C contiguous array of double-precision numbers.
+
+    The target dtype is chosen by `_get_dtype`, so half- and single-precision
+    inputs are upcast. With ``check_finite=True`` a ``ValueError`` is raised
+    if the converted array holds any inf or nan.
+    """
+    arr = np.ascontiguousarray(x)
+    arr = arr.astype(_get_dtype(arr.dtype), copy=False)
+    if check_finite:
+        if not np.isfinite(arr).all():
+            raise ValueError("Array must not contain infs or nans.")
+    return arr
+
+
+def _dual_poly(j, k, t, y):
+    """
+    Evaluate at `y` the dual polynomial associated with the B-spline
+    B_{j,k,t}:
+    $p_{j,k}(y) = (y - t_{j+1})(y - t_{j+2})...(y - t_{j+k})$
+
+    For ``k == 0`` the product is empty and the result is 1.
+    """
+    if k == 0:
+        return 1
+    factors = [y - t[j + i] for i in range(1, k + 1)]
+    return np.prod(factors)
+
+
+def _diff_dual_poly(j, k, y, d, t):
+    """
+    d-th derivative of the dual polynomial $p_{j,k}(y)$
+
+    The dual polynomial is a product of `k` linear factors
+    ``(y - t[j+1]) ... (y - t[j+k])``. Its d-th derivative is ``d!`` times
+    the sum, over every way of dropping `d` of the factors, of the product
+    of the remaining ``k - d`` factors. For ``d > k`` the derivative is
+    identically zero.
+    """
+    if d == 0:
+        return _dual_poly(j, k, t, y)
+    if d > k:
+        # a degree-k polynomial differentiated more than k times vanishes
+        return 0
+    if d == k:
+        return poch(1, k)
+    res = 0
+    for dropped in combinations(range(j + 1, j + k + 1), d):
+        res += np.prod([(y - t[j + p]) for p in range(1, k + 1)
+                        if (j + p) not in dropped])
+    # poch(1, d) == d!: each unordered choice of `d` factors is reached by
+    # d! different orders of differentiation.
+    return poch(1, d) * res
+
+@xp_capabilities(
+    cpu_only=True, jax_jit=False,
+    skip_backends=[
+        ("dask.array",
+         "https://github.com/data-apis/array-api-extra/issues/488")
+    ]
+)
+class BSpline:
+    r"""Univariate spline in the B-spline basis.
+
+    .. math::
+
+        S(x) = \sum_{j=0}^{n-1} c_j  B_{j, k; t}(x)
+
+    where :math:`B_{j, k; t}` are B-spline basis functions of degree `k`
+    and knots `t`.
+
+    Parameters
+    ----------
+    t : ndarray, shape (n+k+1,)
+        knots
+    c : ndarray, shape (>=n, ...)
+        spline coefficients
+    k : int
+        B-spline degree
+    extrapolate : bool or 'periodic', optional
+        whether to extrapolate beyond the base interval, ``t[k] .. t[n]``,
+        or to return nans.
+        If True, extrapolates the first and last polynomial pieces of b-spline
+        functions active on the base interval.
+        If 'periodic', periodic extrapolation is used.
+        Default is True.
+    axis : int, optional
+        Interpolation axis. Default is zero.
+
+    Attributes
+    ----------
+    t : ndarray
+        knot vector
+    c : ndarray
+        spline coefficients
+    k : int
+        spline degree
+    extrapolate : bool
+        If True, extrapolates the first and last polynomial pieces of b-spline
+        functions active on the base interval.
+    axis : int
+        Interpolation axis.
+    tck : tuple
+        A read-only equivalent of ``(self.t, self.c, self.k)``
+
+    Methods
+    -------
+    __call__
+    basis_element
+    derivative
+    antiderivative
+    integrate
+    insert_knot
+    construct_fast
+    design_matrix
+    from_power_basis
+
+    Notes
+    -----
+    B-spline basis elements are defined via
+
+    .. math::
+
+        B_{i, 0}(x) = 1, \textrm{if $t_i \le x < t_{i+1}$, otherwise $0$,}
+
+        B_{i, k}(x) = \frac{x - t_i}{t_{i+k} - t_i} B_{i, k-1}(x)
+                 + \frac{t_{i+k+1} - x}{t_{i+k+1} - t_{i+1}} B_{i+1, k-1}(x)
+
+    **Implementation details**
+
+    - At least ``k+1`` coefficients are required for a spline of degree `k`,
+      so that ``n >= k+1``. Additional coefficients, ``c[j]`` with
+      ``j >= n``, are ignored.
+
+    - B-spline basis elements of degree `k` form a partition of unity on the
+      *base interval*, ``t[k] <= x <= t[n]``.
+
+
+    Examples
+    --------
+    Translating the recursive definition of B-splines into Python code, we have:
+
+    >>> def B(x, k, i, t):
+    ...    if k == 0:
+    ...       return 1.0 if t[i] <= x < t[i+1] else 0.0
+    ...    if t[i+k] == t[i]:
+    ...       c1 = 0.0
+    ...    else:
+    ...       c1 = (x - t[i])/(t[i+k] - t[i]) * B(x, k-1, i, t)
+    ...    if t[i+k+1] == t[i+1]:
+    ...       c2 = 0.0
+    ...    else:
+    ...       c2 = (t[i+k+1] - x)/(t[i+k+1] - t[i+1]) * B(x, k-1, i+1, t)
+    ...    return c1 + c2
+
+    >>> def bspline(x, t, c, k):
+    ...    n = len(t) - k - 1
+    ...    assert (n >= k+1) and (len(c) >= n)
+    ...    return sum(c[i] * B(x, k, i, t) for i in range(n))
+
+    Note that this is an inefficient (if straightforward) way to
+    evaluate B-splines --- this spline class does it in an equivalent,
+    but much more efficient way.
+
+    Here we construct a quadratic spline function on the base interval
+    ``2 <= x <= 4`` and compare with the naive way of evaluating the spline:
+
+    >>> from scipy.interpolate import BSpline
+    >>> k = 2
+    >>> t = [0, 1, 2, 3, 4, 5, 6]
+    >>> c = [-1, 2, 0, -1]
+    >>> spl = BSpline(t, c, k)
+    >>> spl(2.5)
+    array(1.375)
+    >>> bspline(2.5, t, c, k)
+    1.375
+
+    Note that outside of the base interval results differ. This is because
+    `BSpline` extrapolates the first and last polynomial pieces of B-spline
+    functions active on the base interval.
+
+    >>> import matplotlib.pyplot as plt
+    >>> import numpy as np
+    >>> fig, ax = plt.subplots()
+    >>> xx = np.linspace(1.5, 4.5, 50)
+    >>> ax.plot(xx, [bspline(x, t, c ,k) for x in xx], 'r-', lw=3, label='naive')
+    >>> ax.plot(xx, spl(xx), 'b-', lw=4, alpha=0.7, label='BSpline')
+    >>> ax.grid(True)
+    >>> ax.legend(loc='best')
+    >>> plt.show()
+
+
+    References
+    ----------
+    .. [1] Tom Lyche and Knut Morken, Spline methods,
+        http://www.uio.no/studier/emner/matnat/ifi/INF-MAT5340/v05/undervisningsmateriale/
+    .. [2] Carl de Boor, A practical guide to splines, Springer, 2001.
+
+    """
+
+    # generic type compatibility with scipy-stubs
+    __class_getitem__ = classmethod(GenericAlias)
+
+
+    def __init__(self, t, c, k, extrapolate=True, axis=0):
+        """Validate the inputs and store the knots, coefficients and degree.
+
+        See the class docstring for the meaning of the parameters.
+
+        Raises
+        ------
+        TypeError
+            If `k` cannot be interpreted as an integer (``operator.index``).
+        ValueError
+            If the degree is negative, the knots are not a finite,
+            non-decreasing 1-D sequence with at least two distinct values
+            in ``t[k:n+1]``, there are fewer than ``2*k + 2`` knots, or the
+            coefficient array is inconsistent with the knots and the degree.
+        """
+        super().__init__()
+
+        self._asarray = array_namespace(c, t).asarray
+
+        # From here on `k` is the validated plain integer, so that the
+        # comparisons, slices and error messages below work for any
+        # integer-like input accepted by `operator.index`.
+        k = self.k = operator.index(k)
+        self._c = np.asarray(c)
+        self._t = np.ascontiguousarray(t, dtype=np.float64)
+
+        if extrapolate == 'periodic':
+            self.extrapolate = extrapolate
+        else:
+            self.extrapolate = bool(extrapolate)
+
+        n = self._t.shape[0] - k - 1
+
+        axis = normalize_axis_index(axis, self._c.ndim)
+
+        # Note that the normalized axis is stored in the object.
+        self.axis = axis
+        if axis != 0:
+            # roll the interpolation axis to be the first one in self.c
+            # More specifically, the target shape for self.c is (n, ...),
+            # and axis !=0 means that we have c.shape (..., n, ...)
+            #                                               ^
+            #                                              axis
+            self._c = np.moveaxis(self._c, axis, 0)
+
+        if k < 0:
+            raise ValueError("Spline order cannot be negative.")
+        if self._t.ndim != 1:
+            raise ValueError("Knot vector must be one-dimensional.")
+        if n < k + 1:
+            raise ValueError(f"Need at least {2*k + 2} knots for degree {k}")
+        if (np.diff(self._t) < 0).any():
+            raise ValueError("Knots must be in a non-decreasing order.")
+        if len(np.unique(self._t[k:n+1])) < 2:
+            raise ValueError("Need at least two internal knots.")
+        if not np.isfinite(self._t).all():
+            raise ValueError("Knots should not have nans or infs.")
+        if self._c.ndim < 1:
+            raise ValueError("Coefficients must be at least 1-dimensional.")
+        if self._c.shape[0] < n:
+            raise ValueError("Knots, coefficients and degree are inconsistent.")
+
+        # coefficients are stored as C contiguous float64 or complex128
+        dt = _get_dtype(self._c.dtype)
+
+        self._c = np.ascontiguousarray(self._c, dtype=dt)
+
+    @classmethod
+    def construct_fast(cls, t, c, k, extrapolate=True, axis=0):
+        """Build a spline object directly, skipping all input validation.
+
+        Takes the same parameters as the regular constructor. The arrays
+        `t` and `c` must already be of the correct shape and dtype; apart
+        from a call to ``np.asarray`` they are stored as given.
+        """
+        spl = object.__new__(cls)
+        spl._t = np.asarray(t)
+        spl._c = np.asarray(c)
+        spl.k = k
+        spl.extrapolate = extrapolate
+        spl.axis = axis
+        spl._asarray = array_namespace(t, c).asarray
+        return spl
+
+    @property
+    def tck(self):
+        """Equivalent to ``(self.t, self.c, self.k)`` (read-only).
+
+        The tuple is rebuilt from the ``t`` and ``c`` properties on each
+        access.
+        """
+        return self.t, self.c, self.k
+
+    # Under the hood, self._c and self._t are always saved as numpy array
+    # because they are used in a C extension expecting numpy arrays.
+    @property
+    def t(self):
+        """Knot vector, converted with the ``asarray`` of the input namespace."""
+        return self._asarray(self._t)
+
+    @t.setter
+    def t(self, t):
+        # Stored as a numpy array; no validation is performed here.
+        self._t = np.asarray(t)
+
+    @property
+    def c(self):
+        """Coefficients, converted with the ``asarray`` of the input namespace."""
+        return self._asarray(self._c)
+
+    @c.setter
+    def c(self, c):
+        # Stored as a numpy array; no validation is performed here.
+        self._c = np.asarray(c)
+
+    @classmethod
+    def basis_element(cls, t, extrapolate=True):
+        """Return a B-spline basis element ``B(x | t[0], ..., t[k+1])``.
+
+        Parameters
+        ----------
+        t : ndarray, shape (k+2,)
+            internal knots
+        extrapolate : bool or 'periodic', optional
+            whether to extrapolate beyond the base interval, ``t[0] .. t[k+1]``,
+            or to return nans.
+            If 'periodic', periodic extrapolation is used.
+            Default is True.
+
+        Returns
+        -------
+        basis_element : callable
+            A callable representing a B-spline basis element for the knot
+            vector `t`.
+
+        Notes
+        -----
+        The degree of the B-spline, `k`, is inferred from the length of `t` as
+        ``len(t)-2``. The knot vector is constructed by prepending ``k``
+        copies of ``t[0] - 1`` and appending ``k`` copies of ``t[-1] + 1``
+        to the internal knots `t`.
+
+        Examples
+        --------
+        Construct a cubic B-spline:
+
+        >>> import numpy as np
+        >>> from scipy.interpolate import BSpline
+        >>> b = BSpline.basis_element([0, 1, 2, 3, 4])
+        >>> k = b.k
+        >>> b.t[k:-k]
+        array([ 0.,  1.,  2.,  3.,  4.])
+        >>> k
+        3
+
+        Construct a quadratic B-spline on ``[0, 1, 1, 2]``, and compare
+        to its explicit form:
+
+        >>> t = [0, 1, 1, 2]
+        >>> b = BSpline.basis_element(t)
+        >>> def f(x):
+        ...     return np.where(x < 1, x*x, (2. - x)**2)
+
+        >>> import matplotlib.pyplot as plt
+        >>> fig, ax = plt.subplots()
+        >>> x = np.linspace(0, 2, 51)
+        >>> ax.plot(x, b(x), 'g', lw=3)
+        >>> ax.plot(x, f(x), 'r', lw=8, alpha=0.4)
+        >>> ax.grid(True)
+        >>> plt.show()
+
+        """
+        # remember the namespace of the input to convert back at the end
+        xp = array_namespace(t)
+        t = np.asarray(t)
+        k = t.shape[0] - 2
+        t = _as_float_array(t)  # TODO: use concat_1d instead of np.r_
+        # pad with k extra knots on each side of the internal knots
+        t = np.r_[(t[0]-1,) * k, t, (t[-1]+1,) * k]
+        # a single unit coefficient, at index k, selects the basis element
+        c = np.zeros_like(t)
+        c[k] = 1.
+
+        t, c = xp.asarray(t), xp.asarray(c)
+        return cls.construct_fast(t, c, k, extrapolate)
+
+    @classmethod
+    def design_matrix(cls, x, t, k, extrapolate=False):
+        """
+        Returns a design matrix as a CSR format sparse array.
+
+        Parameters
+        ----------
+        x : array_like, shape (n,)
+            Points to evaluate the spline at.
+        t : array_like, shape (nt,)
+            Sorted 1D array of knots.
+        k : int
+            B-spline degree.
+        extrapolate : bool or 'periodic', optional
+            Whether to extrapolate based on the first and last intervals
+            or raise an error. If 'periodic', periodic extrapolation is used.
+            Default is False.
+
+            .. versionadded:: 1.10.0
+
+        Returns
+        -------
+        design_matrix : `csr_array` object
+            Sparse matrix in CSR format where each row contains all the basis
+            elements of the input row (first row = basis elements of x[0],
+            ..., last row = basis elements x[-1]).
+
+        Raises
+        ------
+        ValueError
+            If `x` or `t` contain infs or nans, if `k` is negative, if `t`
+            is not a sorted 1-D array, if `t` has fewer than ``2 * k + 2``
+            elements, or if `extrapolate` is false and some `x` lies outside
+            of ``[t[k], t[nt - k - 1]]``.
+
+        Examples
+        --------
+        Construct a design matrix for a B-spline
+
+        >>> from scipy.interpolate import make_interp_spline, BSpline
+        >>> import numpy as np
+        >>> x = np.linspace(0, np.pi * 2, 4)
+        >>> y = np.sin(x)
+        >>> k = 3
+        >>> bspl = make_interp_spline(x, y, k=k)
+        >>> design_matrix = bspl.design_matrix(x, bspl.t, k)
+        >>> design_matrix.toarray()
+        [[1.        , 0.        , 0.        , 0.        ],
+        [0.2962963 , 0.44444444, 0.22222222, 0.03703704],
+        [0.03703704, 0.22222222, 0.44444444, 0.2962963 ],
+        [0.        , 0.        , 0.        , 1.        ]]
+
+        Construct a design matrix for some vector of knots
+
+        >>> k = 2
+        >>> t = [-1, 0, 1, 2, 3, 4, 5, 6]
+        >>> x = [1, 2, 3, 4]
+        >>> design_matrix = BSpline.design_matrix(x, t, k).toarray()
+        >>> design_matrix
+        [[0.5, 0.5, 0. , 0. , 0. ],
+        [0. , 0.5, 0.5, 0. , 0. ],
+        [0. , 0. , 0.5, 0.5, 0. ],
+        [0. , 0. , 0. , 0.5, 0.5]]
+
+        This result is equivalent to the one created in the sparse format
+
+        >>> c = np.eye(len(t) - k - 1)
+        >>> design_matrix_gh = BSpline(t, c, k)(x)
+        >>> np.allclose(design_matrix, design_matrix_gh, atol=1e-14)
+        True
+
+        Notes
+        -----
+        .. versionadded:: 1.8.0
+
+        In each row of the design matrix all the basis elements are evaluated
+        at the certain point (first row - x[0], ..., last row - x[-1]).
+
+        `nt` is a length of the vector of knots: as far as there are
+        `nt - k - 1` basis elements, `nt` should be not less than `2 * k + 2`
+        to have at least `k + 1` basis element.
+
+        Out of bounds `x` raises a ValueError.
+        """
+        # Both conversions also reject infs and nans (check_finite=True).
+        x = _as_float_array(x, True)
+        t = _as_float_array(t, True)
+
+        if extrapolate != 'periodic':
+            extrapolate = bool(extrapolate)
+
+        if k < 0:
+            raise ValueError("Spline order cannot be negative.")
+        if t.ndim != 1 or np.any(t[1:] < t[:-1]):
+            raise ValueError(f"Expect t to be a 1-D sorted array_like, but "
+                             f"got t={t}.")
+        # There are `nt - k - 1` basis elements in a BSpline built on the
+        # vector of knots with length `nt`, so to have at least `k + 1` basis
+        # elements we need to have at least `2 * k + 2` elements in the vector
+        # of knots.
+        if len(t) < 2 * k + 2:
+            raise ValueError(f"Length t is not enough for k={k}.")
+
+        if extrapolate == 'periodic':
+            # With periodic extrapolation we map x to the segment
+            # [t[k], t[n]].
+            n = t.size - k - 1
+            x = t[k] + (x - t[k]) % (t[n] - t[k])
+            # after the mapping all points are inside the base interval
+            extrapolate = False
+        elif not extrapolate and (
+            (min(x) < t[k]) or (max(x) > t[t.shape[0] - k - 1])
+        ):
+            # Checks from `find_interval` function
+            raise ValueError(f'Out of bounds w/ x = {x}.')
+
+        # Compute number of non-zeros of final CSR array in order to determine
+        # the dtype of indices and indptr of the CSR array.
+        # NB: from here on `n` is the number of evaluation points.
+        n = x.shape[0]
+        nnz = n * (k + 1)
+        if nnz < np.iinfo(np.int32).max:
+            int_dtype = np.int32
+        else:
+            int_dtype = np.int64
+
+        # Get the non-zero elements of the design matrix and per-row `offsets`:
+        # In row `i`, k+1 nonzero elements are consecutive, and start from `offset[i]`
+        data, offsets, _ = _dierckx.data_matrix(x, t, k, np.ones_like(x), extrapolate)
+        data = data.ravel()
+
+        if offsets.dtype != int_dtype:
+            offsets = offsets.astype(int_dtype)
+
+        # Convert from per-row offsets to the CSR indices/indptr format
+        # Row `i` gets the column indices offsets[i], ..., offsets[i] + k.
+        indices = np.repeat(offsets, k+1).reshape(-1, k+1)
+        indices = indices + np.arange(k+1, dtype=int_dtype)
+        indices = indices.ravel()
+
+        # every row holds exactly k+1 stored entries
+        indptr = np.arange(0, (n + 1) * (k + 1), k + 1, dtype=int_dtype)
+
+        return csr_array(
+            (data, indices, indptr),
+            shape=(x.shape[0], t.shape[0] - k - 1)
+        )
+
+    def __call__(self, x, nu=0, extrapolate=None):
+        """
+        Evaluate a spline function.
+
+        Parameters
+        ----------
+        x : array_like
+            points to evaluate the spline at.
+        nu : int, optional
+            derivative to evaluate (default is 0).
+        extrapolate : bool or 'periodic', optional
+            whether to extrapolate based on the first and last intervals
+            or return nans. If 'periodic', periodic extrapolation is used.
+            Default is `self.extrapolate`.
+
+        Returns
+        -------
+        y : array_like
+            Shape is determined by replacing the interpolation axis
+            in the coefficient array with the shape of `x`.
+
+        """
+        if extrapolate is None:
+            extrapolate = self.extrapolate
+        x = np.asarray(x)
+        # keep the original shape of `x`: evaluation is done on a flattened
+        # float64 copy and the result is reshaped back below
+        x_shape, x_ndim = x.shape, x.ndim
+        x = np.ascontiguousarray(x.ravel(), dtype=np.float64)
+
+        # With periodic extrapolation we map x to the segment
+        # [self.t[k], self.t[n]].
+        if extrapolate == 'periodic':
+            n = self._t.size - self.k - 1
+            x = self._t[self.k] + (x - self._t[self.k]) % (self._t[n] - self._t[self.k])
+            extrapolate = False
+
+        self._ensure_c_contiguous()
+
+        # if self.c is complex: the C code in _dierckxmodule.cc expects
+        # floats, so make a view---this expands the last axis, and
+        # the view is C contiguous if the original is.
+        # if c.dtype is complex of shape (n,), c.view(float).shape == (2*n,)
+        # if c.dtype is complex of shape (n, m), c.view(float).shape == (n, 2*m)
+
+        is_complex = self._c.dtype.kind == 'c'
+        if is_complex:
+            cc = self._c.view(float)
+            if self._c.ndim == 1:
+                cc = cc.reshape(self._c.shape[0], 2)
+        else:
+            cc = self._c
+
+        # flatten the trailing dims
+        cc = cc.reshape(cc.shape[0], -1)
+
+        # heavy lifting: actually perform the evaluations
+        out = _dierckx.evaluate_spline(self._t, cc, self.k, x, nu, extrapolate)
+
+        if is_complex:
+            # undo the float view taken above
+            out = out.view(complex)
+
+        # leading dims follow `x`, trailing dims follow the coefficients
+        out = out.reshape(x_shape + self._c.shape[1:])
+        if self.axis != 0:
+            # transpose to move the calculated values to the interpolation axis
+            l = list(range(out.ndim))
+            l = l[x_ndim:x_ndim+self.axis] + l[:x_ndim] + l[x_ndim+self.axis:]
+            out = out.transpose(l)
+        return self._asarray(out)
+
+    def _ensure_c_contiguous(self):
+        """
+        Replace the stored knots and coefficients by C contiguous copies
+        when they are not C contiguous already.
+
+        Both arrays may have been reassigned by the user, and the compiled
+        code expects them to be C contiguous.
+
+        """
+        for name in ("_t", "_c"):
+            arr = getattr(self, name)
+            if not arr.flags.c_contiguous:
+                setattr(self, name, arr.copy())
+
+    def derivative(self, nu=1):
+        """Build the spline that is the `nu`-th derivative of this one.
+
+        Parameters
+        ----------
+        nu : int, optional
+            Order of the derivative.
+            Default is 1.
+
+        Returns
+        -------
+        b : `BSpline` object
+            A new spline object representing the derivative.
+
+        See Also
+        --------
+        splder, splantider
+
+        """
+        coeffs = self._asarray(self.c, copy=True)
+        knots = self.t
+        xp = array_namespace(knots, coeffs)
+
+        # pad the coefficients with zeros up to the number of knots
+        n_missing = knots.shape[0] - coeffs.shape[0]
+        if n_missing > 0:
+            padding = xp.zeros((n_missing,) + coeffs.shape[1:])
+            coeffs = concat_1d(xp, coeffs, padding)
+        tck = _fitpack_impl.splder((knots, coeffs, self.k), nu)
+        return self.construct_fast(*tck, extrapolate=self.extrapolate,
+                                   axis=self.axis)
+
+    def antiderivative(self, nu=1):
+        """Build the spline that is the `nu`-th antiderivative of this one.
+
+        Parameters
+        ----------
+        nu : int, optional
+            Order of the antiderivative. Default is 1.
+
+        Returns
+        -------
+        b : `BSpline` object
+            A new spline object representing the antiderivative.
+
+        Notes
+        -----
+        When ``self.extrapolate='periodic'``, the returned instance has
+        ``extrapolate=False``: the antiderivative is no longer periodic and
+        evaluating it correctly outside of the initially given x interval
+        is difficult.
+
+        See Also
+        --------
+        splder, splantider
+
+        """
+        coeffs = self._asarray(self.c, copy=True)
+        knots = self.t
+        xp = array_namespace(knots, coeffs)
+
+        # pad the coefficients with zeros up to the number of knots
+        n_missing = knots.shape[0] - coeffs.shape[0]
+        if n_missing > 0:
+            padding = xp.zeros((n_missing,) + coeffs.shape[1:])
+            coeffs = concat_1d(xp, coeffs, padding)
+        tck = _fitpack_impl.splantider((knots, coeffs, self.k), nu)
+
+        # periodicity does not carry over to the antiderivative
+        periodic = self.extrapolate == 'periodic'
+        extrapolate = False if periodic else self.extrapolate
+
+        return self.construct_fast(*tck, extrapolate=extrapolate,
+                                   axis=self.axis)
+
+    def integrate(self, a, b, extrapolate=None):
+        """Compute a definite integral of the spline.
+
+        Parameters
+        ----------
+        a : float
+            Lower limit of integration.
+        b : float
+            Upper limit of integration.
+        extrapolate : bool or 'periodic', optional
+            whether to extrapolate beyond the base interval,
+            ``t[k] .. t[-k-1]``, or take the spline to be zero outside of the
+            base interval. If 'periodic', periodic extrapolation is used.
+            If None (default), use `self.extrapolate`.
+
+        Returns
+        -------
+        I : array_like
+            Definite integral of the spline over the interval ``[a, b]``.
+
+        Examples
+        --------
+        Construct the linear spline ``x if x < 1 else 2 - x`` on the base
+        interval :math:`[0, 2]`, and integrate it
+
+        >>> from scipy.interpolate import BSpline
+        >>> b = BSpline.basis_element([0, 1, 2])
+        >>> b.integrate(0, 1)
+        array(0.5)
+
+        If the integration limits are outside of the base interval, the result
+        is controlled by the `extrapolate` parameter
+
+        >>> b.integrate(-1, 1)
+        array(0.0)
+        >>> b.integrate(-1, 1, extrapolate=False)
+        array(0.5)
+
+        >>> import matplotlib.pyplot as plt
+        >>> fig, ax = plt.subplots()
+        >>> ax.grid(True)
+        >>> ax.axvline(0, c='r', lw=5, alpha=0.5)  # base interval
+        >>> ax.axvline(2, c='r', lw=5, alpha=0.5)
+        >>> xx = [-1, 1, 2]
+        >>> ax.plot(xx, b(xx))
+        >>> plt.show()
+
+        """
+        if extrapolate is None:
+            extrapolate = self.extrapolate
+
+        # Prepare self.t and self.c.
+        self._ensure_c_contiguous()
+
+        # Swap integration bounds if needed.
+        # The sign of the result is restored at the end.
+        sign = 1
+        if b < a:
+            a, b = b, a
+            sign = -1
+        n = self._t.size - self.k - 1
+
+        if extrapolate != "periodic" and not extrapolate:
+            # Shrink the integration interval, if needed.
+            a = max(a, self._t[self.k])
+            b = min(b, self._t[n])
+
+            if self._c.ndim == 1:
+                # Fast path: use FITPACK's routine
+                # (cf _fitpack_impl.splint).
+                integral = _fitpack_impl.splint(a, b, (self._t, self._c, self.k))
+                return self._asarray(integral * sign)
+
+        # Compute the antiderivative.
+        # The coefficients are first padded with zeros to the number of knots.
+        c = self._c
+        ct = len(self._t) - len(c)
+        if ct > 0:
+            c = np.r_[c, np.zeros((ct,) + c.shape[1:])]
+        ta, ca, ka = _fitpack_impl.splantider((self._t, c, self.k), 1)
+
+        if extrapolate == 'periodic':
+            # Split the integral into the part over period (can be several
+            # of them) and the remaining part.
+
+            ts, te = self._t[self.k], self._t[n]
+            period = te - ts
+            interval = b - a
+            n_periods, left = divmod(interval, period)
+
+            if n_periods > 0:
+                # Evaluate the difference of antiderivatives.
+                x = np.asarray([ts, te], dtype=np.float64)
+                out = _dierckx.evaluate_spline(ta, ca.reshape(ca.shape[0], -1),
+                                      ka, x, 0, False)
+                integral = out[1] - out[0]
+                integral *= n_periods
+            else:
+                integral = np.zeros((1, prod(self._c.shape[1:])),
+                                    dtype=self._c.dtype)
+
+            # Map a to [ts, te], b is always a + left.
+            a = ts + (a - ts) % period
+            b = a + left
+
+            # If b <= te then we need to integrate over [a, b], otherwise
+            # over [a, te] and then from ts over the part that remains.
+            if b <= te:
+                x = np.asarray([a, b], dtype=np.float64)
+                out = _dierckx.evaluate_spline(ta, ca.reshape(ca.shape[0], -1),
+                                      ka, x, 0, False)
+                integral += out[1] - out[0]
+            else:
+                x = np.asarray([a, te], dtype=np.float64)
+                out = _dierckx.evaluate_spline(ta, ca.reshape(ca.shape[0], -1),
+                                      ka, x, 0, False)
+                integral += out[1] - out[0]
+
+                # the wrapped-around remainder, of length b - te
+                x = np.asarray([ts, ts + b - te], dtype=np.float64)
+                out = _dierckx.evaluate_spline(ta, ca.reshape(ca.shape[0], -1),
+                                      ka, x, 0, False)
+                integral += out[1] - out[0]
+        else:
+            # Evaluate the difference of antiderivatives.
+            x = np.asarray([a, b], dtype=np.float64)
+            out = _dierckx.evaluate_spline(ta, ca.reshape(ca.shape[0], -1),
+                                  ka, x, 0, extrapolate)
+            integral = out[1] - out[0]
+
+        integral *= sign
+        # restore the trailing dimensions of the coefficient array
+        return self._asarray(integral.reshape(ca.shape[1:]))
+
+    @classmethod
+    def from_power_basis(cls, pp, bc_type='not-a-knot'):
+        r"""
+        Construct a polynomial in the B-spline basis
+        from a piecewise polynomial in the power basis.
+
+        For now, accepts ``CubicSpline`` instances only.
+
+        Parameters
+        ----------
+        pp : CubicSpline
+            A piecewise polynomial in the power basis, as created
+            by ``CubicSpline``
+        bc_type : string, optional
+            Boundary condition type as in ``CubicSpline``: one of the
+            ``not-a-knot``, ``natural``, ``clamped``, or ``periodic``.
+            Necessary for constructing an instance of the ``BSpline`` class.
+            Default is ``not-a-knot``.
+
+        Returns
+        -------
+        b : `BSpline` object
+            A new instance representing the initial polynomial
+            in the B-spline basis.
+
+        Raises
+        ------
+        NotImplementedError
+            If `pp` is not a ``CubicSpline`` instance.
+        TypeError
+            If `bc_type` is not one of the supported boundary conditions.
+
+        Notes
+        -----
+        .. versionadded:: 1.8.0
+
+        Accepts only ``CubicSpline`` instances for now.
+
+        The algorithm follows from differentiating Marsden's identity [1]_:
+        each of the coefficients of the spline interpolation function in the
+        B-spline basis is computed as follows:
+
+        .. math::
+
+            c_j = \sum_{m=0}^{k} \frac{(k-m)!}{k!}
+                       c_{m,i} (-1)^{k-m} D^m p_{j,k}(x_i)
+
+        :math:`c_{m, i}` - a coefficient of CubicSpline,
+        :math:`D^m p_{j, k}(x_i)` - the m-th derivative of the dual polynomial
+        at :math:`x_i`.
+
+        ``k`` always equals 3 for now.
+
+        First ``n - 2`` coefficients are computed in :math:`x_i = x_j`, e.g.
+
+        .. math::
+
+            c_1 = \sum_{m=0}^{k} \frac{(k-m)!}{k!}
+                       c_{m,1} (-1)^{k-m} D^m p_{1,3}(x_1)
+
+        Last ``nod + 2`` coefficients are computed in ``x[-2]``,
+        ``nod`` - number of derivatives at the ends.
+
+        For example, consider :math:`x = [0, 1, 2, 3, 4]`,
+        :math:`y = [1, 1, 1, 1, 1]` and bc_type = ``natural``
+
+        The coefficients of CubicSpline in the power basis:
+
+        :math:`[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0],
+        [0, 0, 0, 0, 0], [1, 1, 1, 1, 1]]`
+
+        The knot vector: :math:`t = [0, 0, 0, 0, 1, 2, 3, 4, 4, 4, 4]`
+
+        In this case
+
+        .. math::
+
+            c_j = \frac{0!}{k!} c_{3, i} k! = c_{3, i} = 1,~j = 0, ..., 6
+
+        References
+        ----------
+        .. [1] Tom Lyche and Knut Morken, Spline Methods, 2005, Section 3.1.2
+
+        """
+        # imported here rather than at module level
+        from ._cubic import CubicSpline
+        if not isinstance(pp, CubicSpline):
+            raise NotImplementedError(f"Only CubicSpline objects are accepted "
+                                      f"for now. Got {type(pp)} instead.")
+        x = pp.x
+        coef = pp.c
+        k = pp.c.shape[0] - 1
+        n = x.shape[0]
+
+        # the knot vector depends on the boundary condition
+        if bc_type == 'not-a-knot':
+            t = _not_a_knot(x, k)
+        elif bc_type == 'natural' or bc_type == 'clamped':
+            t = _augknt(x, k)
+        elif bc_type == 'periodic':
+            t = _periodic_knots(x, k)
+        else:
+            raise TypeError(f'Unknown boundary condition: {bc_type}')
+
+        nod = t.shape[0] - (n + k + 1)  # number of derivatives at the ends
+        c = np.zeros(n + nod, dtype=pp.c.dtype)
+        # poch(k + 1, -m) is the factor (k-m)!/k! of the formula above
+        for m in range(k + 1):
+            # the first n - 2 coefficients use the polynomial piece at x[i]
+            for i in range(n - 2):
+                c[i] += poch(k + 1, -m) * coef[m, i]\
+                        * np.power(-1, k - m)\
+                        * _diff_dual_poly(i, k, x[i], m, t)
+            # the remaining ones all use the piece at x[n - 2]
+            for j in range(n - 2, n + nod):
+                c[j] += poch(k + 1, -m) * coef[m, n - 2]\
+                        * np.power(-1, k - m)\
+                        * _diff_dual_poly(j, k, x[n - 2], m, t)
+        return cls.construct_fast(t, c, k, pp.extrapolate, pp.axis)
+
+    def insert_knot(self, x, m=1):
+        """Return a new spline with the knot `x` inserted `m` times.
+
+        Given the knots and coefficients of a B-spline representation, create a
+        new B-spline with a knot inserted `m` times at point `x`.
+
+        Parameters
+        ----------
+        x : float
+            The position of the new knot
+        m : int, optional
+            The number of times to insert the given knot (its multiplicity).
+            Default is 1.
+
+        Returns
+        -------
+        spl : `BSpline` object
+            A new `BSpline` object with the new knot inserted.
+
+        Raises
+        ------
+        ValueError
+            If `x` lies outside of the base interval ``t[k] .. t[-k-1]``,
+            or if `m` is not positive.
+
+        Notes
+        -----
+        Based on algorithms from [1]_ and [2]_.
+
+        In case of a periodic spline (``self.extrapolate == "periodic"``)
+        there must be either at least k interior knots t(j) satisfying
+        ``t(k+1)<t(j)<=x`` or at least k interior knots t(j) satisfying
+        ``x<=t(j)<t(n-k)``.
+
+        This routine is functionally equivalent to `scipy.interpolate.insert`.
+
+        .. versionadded:: 1.13
+
+        References
+        ----------
+        .. [1] W. Boehm, "Inserting new knots into b-spline curves.",
+            Computer Aided Design, 12, p.199-201, 1980.
+            :doi:`10.1016/0010-4485(80)90154-2`.
+        .. [2] P. Dierckx, "Curve and surface fitting with splines, Monographs on
+            Numerical Analysis", Oxford University Press, 1993.
+
+        See Also
+        --------
+        scipy.interpolate.insert
+
+        Examples
+        --------
+        You can insert knots into a B-spline:
+
+        >>> import numpy as np
+        >>> from scipy.interpolate import BSpline, make_interp_spline
+        >>> x = np.linspace(0, 10, 5)
+        >>> y = np.sin(x)
+        >>> spl = make_interp_spline(x, y, k=3)
+        >>> spl.t
+        array([ 0.,  0.,  0.,  0.,  5., 10., 10., 10., 10.])
+
+        Insert a single knot
+
+        >>> spl_1 = spl.insert_knot(3)
+        >>> spl_1.t
+        array([ 0.,  0.,  0.,  0.,  3.,  5., 10., 10., 10., 10.])
+
+        Insert a multiple knot
+
+        >>> spl_2 = spl.insert_knot(8, m=3)
+        >>> spl_2.t
+        array([ 0.,  0.,  0.,  0.,  5.,  8.,  8.,  8., 10., 10., 10., 10.])
+
+        """
+        x = float(x)
+
+        # the new knot must lie within the base interval
+        lower, upper = self._t[self.k], self._t[-self.k-1]
+        if x < lower or x > upper:
+            raise ValueError(f"Cannot insert a knot at {x}.")
+        if m <= 0:
+            raise ValueError(f"`m` must be positive, got {m = }.")
+
+        is_periodic = self.extrapolate == "periodic"
+        # work on copies so that this spline is left untouched
+        knots, coeffs = self._t.copy(), self._c.copy()
+        for _ in range(m):
+            knots, coeffs = _insert(x, knots, coeffs, self.k, is_periodic)
+
+        return self.construct_fast(
+            self._asarray(knots), self._asarray(coeffs),
+            self.k, self.extrapolate, self.axis
+        )
+
+
+def _insert(xval, t, c, k, periodic=False):
+    """Insert a single knot at `xval`; return new ``(tt, cc)`` arrays (`t`, `c` are only read)."""
+    #
+    # This is a port of the FORTRAN `insert` routine by P. Dierckx,
+    # https://github.com/scipy/scipy/blob/maintenance/1.11.x/scipy/interpolate/fitpack/insert.f
+    # which carries the following comment:
+    #
+    # subroutine insert inserts a new knot x into a spline function s(x)
+    # of degree k and calculates the b-spline representation of s(x) with
+    # respect to the new set of knots. in addition, if iopt.ne.0, s(x)
+    # will be considered as a periodic spline with period per=t(n-k)-t(k+1)
+    # satisfying the boundary constraints
+    #      t(i+n-2*k-1) = t(i)+per  ,i=1,2,...,2*k+1
+    #      c(i+n-2*k-1) = c(i)      ,i=1,2,...,k
+    # in that case, the knots and b-spline coefficients returned will also
+    # satisfy these boundary constraints, i.e.
+    #      tt(i+nn-2*k-1) = tt(i)+per  ,i=1,2,...,2*k+1
+    #      cc(i+nn-2*k-1) = cc(i)      ,i=1,2,...,k
+    interval = _dierckx.find_interval(t, k, float(xval), k, False)  # presumably i with t[i] <= xval < t[i+1] -- confirm in _dierckx
+    if interval < 0:
+        # extrapolated values are guarded for in BSpline.insert_knot
+        raise ValueError(f"Cannot insert the knot at {xval}.")
+
+    # super edge case: a knot with multiplicity > k+1
+    # see https://github.com/scipy/scipy/commit/037204c3e91
+    if t[interval] == t[interval + k + 1]:
+        interval -= 1
+
+    if periodic:
+        if (interval + 1 <= 2*k) and (interval + 1 >= t.shape[0] - 2*k):
+            # in case of a periodic spline (iopt.ne.0) there must be
+            # either at least k interior knots t(j) satisfying t(k+1)<t(j)<=x
+            # or at least k interior knots t(j) satisfying x<=t(j)<t(n-k)
+            raise ValueError("Not enough internal knots.")
+
+    # knots: splice `xval` in right after t[interval]
+    tt = np.r_[t[:interval+1], xval, t[interval+1:]]
+
+    newshape = (c.shape[0] + 1,) + c.shape[1:]  # one extra coefficient along axis 0
+    cc = np.zeros(newshape, dtype=c.dtype)
+
+    # coefficients: entries from `interval` onwards move up by one slot
+    cc[interval+1:, ...] = c[interval:, ...]
+
+    for i in range(interval, interval-k, -1):  # the k entries interval, ..., interval-k+1
+        fac = (xval - tt[i]) / (tt[i+k+1] - tt[i])
+        cc[i, ...] = fac*c[i, ...] + (1. - fac)*c[i-1, ...]  # blend of two neighbouring old coefficients
+
+    cc[:interval - k+1, ...] = c[:interval - k+1, ...]  # leading entries are copied unchanged
+
+    if periodic:
+        # incorporate the boundary conditions for a periodic spline.
+        n = tt.shape[0]  # number of knots after the insertion
+        nk = n - k - 1
+        n2k = n - 2*k - 1
+        T = tt[nk] - tt[k]   # period
+
+        if interval >= nk - k:
+            # adjust the left-hand boundary knots & coefs
+            tt[:k] = tt[nk - k:nk] - T
+            cc[:k, ...] = cc[n2k:n2k + k, ...]
+
+        if interval <= 2*k-1:
+            # adjust the right-hand boundary knots & coefs
+            tt[n-k:] = tt[k+1:k+1+k] + T
+            cc[n2k:n2k + k, ...] = cc[:k, ...]
+
+    return tt, cc
+
+
+#################################
+#  Interpolating spline helpers #
+#################################
+
+def _not_a_knot(x, k):
+    """Build the not-a-knot knot vector for the data sites `x` and degree `k`.
+    cf de Boor, XIII(12).
+
+    Odd k uses the data sites directly. For even k the choice is somewhat ad
+    hoc: midpoints of neighbouring sites, then trimmed a la not-a-knot.
+    This seems to match what Dierckx does, too:
+    https://github.com/scipy/scipy/blob/maintenance/1.11.x/scipy/interpolate/fitpack/fpcurf.f#L63-L80
+    """
+    x = np.asarray(x)
+    # (k + 1) // 2 equals k // 2 for even k, so one formula serves both parities
+    trim = (k + 1) // 2
+    if k % 2 == 1:
+        sites = x.copy()
+    else:
+        sites = (x[1:] + x[:-1]) / 2
+    # drop `trim` candidates at each end, then clamp with k+1 boundary knots
+    interior = sites[trim:-trim]
+    lo, hi = (x[0],)*(k+1), (x[-1],)*(k+1)
+    return np.r_[lo, interior, hi]
+
+
+def _augknt(x, k):
+    """Return `x` padded with `k` extra copies of `x[0]` in front and of `x[-1]` at the end."""
+    return np.r_[(x[0],)*k, x, (x[-1],)*k]
+
+
+def _convert_string_aliases(deriv, target_shape):
+    """Expand "clamped"/"natural" into ``[(order, zeros)]``; pass non-strings through."""
+    if not isinstance(deriv, str):
+        return deriv
+    # derivative order that each recognised alias pins to zero
+    alias_orders = {"clamped": 1, "natural": 2}
+    if deriv not in alias_orders:
+        raise ValueError(f"Unknown boundary condition : {deriv}")
+    return [(alias_orders[deriv], np.zeros(target_shape))]
+
+
+def _process_deriv_spec(deriv):
+    """Split ``[(order, value), ...]`` into (orders, values) arrays; None/empty -> empty."""
+    try:
+        pairs = [] if deriv is None else list(deriv)
+        # zip(*[]) yields nothing to unpack, so an empty spec is special-cased
+        ords, vals = zip(*pairs) if pairs else ([], [])
+    except (TypeError, ValueError) as e:
+        msg = ("Derivatives, `bc_type`, should be specified as a pair of "
+               "iterables of pairs of (order, value).")
+        raise ValueError(msg) from e
+    return np.atleast_1d(ords, vals)
+
+
+def _woodbury_algorithm(A, ur, ll, b, k):
+    '''
+    Solve a cyclic banded linear system with upper right
+    and lower left blocks of size ``k // 2`` (``(k-1) / 2`` for odd ``k``)
+    using the Woodbury formula.
+
+    Parameters
+    ----------
+    A : 2-D array, shape(k, n)
+        Matrix of diagonals of original matrix (see
+        ``solve_banded`` documentation).
+    ur : 2-D array, shape(bs, bs)
+        Upper right block matrix.
+    ll : 2-D array, shape(bs, bs)
+        Lower left block matrix.
+    b : 1-D array, shape(n,)
+        Vector of constant terms of the system of linear equations.
+    k : int
+        B-spline degree.
+
+    Returns
+    -------
+    c : 1-D array, shape(n,)
+        Solution of the original system of linear equations.
+
+    Notes
+    -----
+    This algorithm works only for systems with banded matrix A plus
+    a correction term U @ V.T, where the matrix U @ V.T gives upper right
+    and lower left block of A.
+    The system is solved with the following steps:
+        1.  New systems of linear equations are constructed:
+            A @ z_i = u_i,
+            u_i - column vector of U,
+            i = 1, ..., k - 1
+        2.  Matrix Z is formed from vectors z_i:
+            Z = [ z_1 | z_2 | ... | z_{k - 1} ]
+        3.  Matrix H = (1 + V.T @ Z)^{-1}
+        4.  The system A' @ y = b is solved
+        5.  x = y - Z @ (H @ V.T @ y)
+    Also, ``n`` should be greater than ``k``, otherwise corner block
+    elements will intersect with diagonals.
+
+    Examples
+    --------
+    Consider the case of n = 8, k = 5 (size of blocks - 2 x 2).
+    The matrix of a system:       U:          V:
+      x  x  x  *  *  a  b         a b 0 0     0 0 1 0
+      x  x  x  x  *  *  c         0 c 0 0     0 0 0 1
+      x  x  x  x  x  *  *         0 0 0 0     0 0 0 0
+      *  x  x  x  x  x  *         0 0 0 0     0 0 0 0
+      *  *  x  x  x  x  x         0 0 0 0     0 0 0 0
+      d  *  *  x  x  x  x         0 0 d 0     1 0 0 0
+      e  f  *  *  x  x  x         0 0 e f     0 1 0 0
+
+    References
+    ----------
+    .. [1] William H. Press, Saul A. Teukolsky, William T. Vetterling
+           and Brian P. Flannery, Numerical Recipes, 2007, Section 2.7.3
+
+    '''
+    k_mod = k - k % 2  # k rounded down to an even number; equals 2 * bs
+    bs = int((k - 1) / 2) + (k + 1) % 2  # corner block size; equals k // 2 for k >= 1
+
+    n = A.shape[1] + 1  # NOTE(review): one more than the system size A.shape[1]; the docstring's `n` presumably means the system size -- confirm
+    U = np.zeros((n - 1, k_mod))
+    VT = np.zeros((k_mod, n - 1))  # V transpose
+
+    # upper right block
+    U[:bs, :bs] = ur
+    VT[np.arange(bs), np.arange(bs) - bs] = 1  # rows 0..bs-1 select the last bs unknowns
+
+    # lower left block
+    U[-bs:, -bs:] = ll
+    VT[np.arange(bs) - bs, np.arange(bs)] = 1  # last bs rows select the first bs unknowns
+
+    Z = solve_banded((bs, bs), A, U)  # steps 1-2: all columns z_i solved at once
+
+    H = solve(np.identity(k_mod) + VT @ Z, np.identity(k_mod))  # step 3: explicit inverse of the small k_mod x k_mod matrix
+
+    y = solve_banded((bs, bs), A, b)  # step 4: banded system without the corner blocks
+    c = y - Z @ (H @ (VT @ y))  # step 5: Woodbury correction
+
+    return c
+
+
+def _periodic_knots(x, k):
+    '''
+    Return ``len(x) + 2*k`` knots: the sites ``x`` extended by ``k`` knots on each side, reusing the spacings of ``x`` cyclically.
+    '''
+    xc = np.copy(x)  # work on a copy; the caller's `x` is not modified
+    n = len(xc)
+    if k % 2 == 0:
+        dx = np.diff(xc)
+        xc[1: -1] -= dx[:-1] / 2  # even k: move each interior site back to the midpoint with its left neighbour
+    dx = np.diff(xc)  # spacings of the (possibly shifted) sites
+    t = np.zeros(n + 2 * k)
+    t[k: -k] = xc
+    for i in range(0, k):
+        # filling first `k` elements in descending order
+        t[k - i - 1] = t[k - i] - dx[-(i % (n - 1)) - 1]  # spacings taken from the right end, wrapping modulo n - 1
+        # filling last `k` elements in ascending order
+        t[-k + i] = t[-k + i - 1] + dx[i % (n - 1)]  # spacings taken from the left end, wrapping modulo n - 1
+    return t
+
+
+def _make_interp_per_full_matr(x, y, t, k):
+    '''
+    Returns a solution of a system for B-spline interpolation with periodic
+    boundary conditions. First ``k - 1`` rows of matrix are conditions of
+    periodicity (continuity of ``k - 1`` derivatives at the boundary points).
+    Last ``n`` rows are interpolation conditions.
+    RHS is ``k - 1`` zeros and ``n`` ordinates in this case.
+
+    Parameters
+    ----------
+    x : 1-D array, shape (n,)
+        Values of x - coordinate of a given set of points.
+    y : 1-D array, shape (n,)
+        Values of y - coordinate of a given set of points.
+    t : 1-D array, shape(n+2*k,)
+        Vector of knots.
+    k : int
+        B-spline degree.
+
+    Returns
+    -------
+    c : 1-D array, shape (n+k-1,)
+        B-spline coefficients, obtained from a dense ``solve``.
+
+    Notes
+    -----
+    ``t`` is supposed to be taken on circle.
+
+    '''
+
+    x, y, t = map(np.asarray, (x, y, t))
+
+    n = x.size
+    # LHS: the colocation matrix + derivatives at edges
+    matr = np.zeros((n + k - 1, n + k - 1))  # dense square system: k - 1 periodicity rows + n interpolation rows
+
+    # derivatives at x[0] and x[-1]:
+    for i in range(k - 1):
+        bb = _dierckx.evaluate_all_bspl(t, k, x[0], k, i + 1)  # presumably the k+1 non-zero B-spline derivatives of order i+1 -- confirm in _dierckx
+        matr[i, : k + 1] += bb
+        bb = _dierckx.evaluate_all_bspl(t, k, x[-1], n + k - 1, i + 1)[:-1]  # last value dropped so that k entries remain
+        matr[i, -k:] -= bb  # row i holds (derivative at x[0]) - (derivative at x[-1]); the RHS entry is 0
+
+    # colocation matrix
+    for i in range(n):
+        xval = x[i]
+        # find interval
+        if xval == t[k]:
+            left = k  # searchsorted would give k - 1 for the first base knot, hence the special case
+        else:
+            left = np.searchsorted(t, xval) - 1
+
+        # fill a row
+        bb = _dierckx.evaluate_all_bspl(t, k, xval, left)
+        matr[i + k - 1, left-k:left+1] = bb  # interpolation rows follow the k - 1 periodicity rows
+
+    # RHS
+    b = np.r_[[0] * (k - 1), y]
+
+    c = solve(matr, b)
+    return c
+
+
+def _handle_lhs_derivatives(t, k, xval, ab, kl, ku, deriv_ords, offset=0):
+    """ Fill in the entries of the colocation matrix corresponding to known
+    derivatives at `xval`.
+
+    The colocation matrix is in the banded storage, as prepared by _coloc.
+    No error checking.
+
+    Parameters
+    ----------
+    t : ndarray, shape (nt + k + 1,)
+        knots
+    k : integer
+        B-spline degree (``k + 1`` basis values are written per row)
+    xval : float
+        The point at which the derivatives are evaluated.
+    ab : ndarray, shape(2*kl + ku + 1, nt), Fortran order
+        B-spline colocation matrix.
+        This argument is modified *in-place*.
+    kl : integer
+        Number of lower diagonals of ab.
+    ku : integer
+        Number of upper diagonals of ab.
+    deriv_ords : 1D ndarray
+        Orders of derivatives known at xval
+    offset : integer, optional
+        Skip this many rows of the matrix ab.
+
+    """
+    # find where `xval` is in the knot vector, `t`
+    left = _dierckx.find_interval(t, k, float(xval), k, False)
+
+    # compute and fill in the derivatives @ xval
+    for row in range(deriv_ords.shape[0]):
+        nu = deriv_ords[row]
+        wrk = _dierckx.evaluate_all_bspl(t, k, xval, left, nu)  # presumably the nu-th derivatives of the k+1 non-zero B-splines -- confirm in _dierckx
+
+        # if A were a full matrix, it would be just
+        # ``A[row + offset, left-k:left+1] = wrk``.
+        for a in range(k+1):
+            clmn = left - k + a
+            ab[kl + ku + offset + row - clmn, clmn] = wrk[a]  # full-matrix entry (row + offset, clmn) in banded storage
+
+
+def _make_periodic_spline(x, y, t, k, axis, *, xp):  # axis: forwarded to BSpline; xp: namespace used to convert the returned t, c
+    '''
+    Compute the (coefficients of) interpolating B-spline with periodic
+    boundary conditions.
+
+    Parameters
+    ----------
+    x : array_like, shape (n,)
+        Abscissas.
+    y : array_like, shape (n, ...)
+        Ordinates.
+    k : int
+        B-spline degree.
+    t : array_like, shape (n + 2 * k,).
+        Knots taken on a circle, ``k`` on the left and ``k`` on the right
+        of the vector ``x``.
+
+    Returns
+    -------
+    b : `BSpline` object
+        A `BSpline` object of the degree ``k`` and with knots ``t``.
+
+    Notes
+    -----
+    The original system is formed by ``n + k - 1`` equations where the first
+    ``k - 1`` of them stand for the ``k - 1`` derivatives continuity on the
+    edges while the other equations correspond to an interpolating case
+    (matching all the input points). Due to a special form of knot vector, it
+    can be proved that in the original system the first and last ``k``
+    coefficients of a spline function are the same, respectively. It follows
+    from the fact that all ``k - 1`` derivatives are equal term by term at ends
+    and that the matrix of the original system of linear equations is
+    non-degenerate. So, we can reduce the number of equations to ``n - 1``
+    (first ``k - 1`` equations could be reduced). Another trick of this
+    implementation is cyclic shift of values of B-splines due to equality of
+    ``k`` unknown coefficients. With this we can receive matrix of the system
+    with upper right and lower left blocks, and ``k`` diagonals.  It allows
+    to use Woodbury formula to optimize the computations.
+
+    '''
+    n = y.shape[0]
+
+    extradim = prod(y.shape[1:])  # number of independent right-hand sides (trailing dims flattened)
+    y_new = y.reshape(n, extradim)
+    c = np.zeros((n + k - 1, extradim))
+
+    # n <= k case is solved with full matrix
+    if n <= k:
+        for i in range(extradim):
+            c[:, i] = _make_interp_per_full_matr(x, y_new[:, i], t, k)
+        c = np.ascontiguousarray(c.reshape((n + k - 1,) + y.shape[1:]))
+        t, c = xp.asarray(t), xp.asarray(c)
+        return BSpline.construct_fast(t, c, k, extrapolate='periodic', axis=axis)
+
+    nt = len(t) - k - 1
+
+    # size of block elements
+    kul = int(k / 2)
+
+    # kl = ku = k
+    ab = np.zeros((3 * k + 1, nt), dtype=np.float64, order='F')
+
+    # upper right and lower left blocks
+    ur = np.zeros((kul, kul))
+    ll = np.zeros_like(ur)
+
+    # `offset` is made to shift all the non-zero elements to the end of the
+    # matrix
+    # NB: 1. drop the last element of `x` because `x[-1] = x[0] + T` & `y[0] == y[-1]`
+    #     2. pass ab.T to _coloc to make it C-ordered; below it'll be fed to banded
+    #        LAPACK, which needs F-ordered arrays
+    _dierckx._coloc(x[:-1], t, k, ab.T, k)
+
+    # remove zeros before the matrix
+    ab = ab[-k - (k + 1) % 2:, :]  # keeps the last k rows for odd k, the last k + 1 for even k
+
+    # The least elements in rows (except repetitions) are diagonals
+    # of block matrices. Upper right matrix is an upper triangular
+    # matrix while lower left is a lower triangular one.
+    for i in range(kul):
+        ur += np.diag(ab[-i - 1, i: kul], k=i)
+        ll += np.diag(ab[i, -kul - (k % 2): n - 1 + 2 * kul - i], k=-i)
+
+    # remove elements that occur in the last point
+    # (first and last points are equivalent)
+    A = ab[:, kul: -k + kul]
+
+    for i in range(extradim):
+        cc = _woodbury_algorithm(A, ur, ll, y_new[:, i][:-1], k)  # last ordinate dropped, as was the last abscissa
+        c[:, i] = np.concatenate((cc[-kul:], cc, cc[:kul + k % 2]))  # wrap the solution around to fill all n + k - 1 coefficients
+    c = np.ascontiguousarray(c.reshape((n + k - 1,) + y.shape[1:]))
+    t, c = xp.asarray(t), xp.asarray(c)
+    return BSpline.construct_fast(t, c, k, extrapolate='periodic', axis=axis)
+
+
+@xp_capabilities(cpu_only=True, jax_jit=False, allow_dask_compute=True)
+def make_interp_spline(x, y, k=3, t=None, bc_type=None, axis=0,
+                       check_finite=True):
+    """Create an interpolating B-spline with specified degree and boundary conditions.
+
+    Parameters
+    ----------
+    x : array_like, shape (n,)
+        Abscissas.
+    y : array_like, shape (n, ...)
+        Ordinates.
+    k : int, optional
+        B-spline degree. Default is cubic, ``k = 3``.
+    t : array_like, shape (nt + k + 1,), optional.
+        Knots.
+        The number of knots needs to agree with the number of data points and
+        the number of derivatives at the edges. Specifically, ``nt - n`` must
+        equal ``len(deriv_l) + len(deriv_r)``.
+    bc_type : 2-tuple, str or None
+        Boundary conditions.
+        Default is None, which means choosing the boundary conditions
+        automatically. Otherwise, it must be a length-two tuple where the first
+        element (``deriv_l``) sets the boundary conditions at ``x[0]`` and
+        the second element (``deriv_r``) sets the boundary conditions at
+        ``x[-1]``. Each of these must be an iterable of pairs
+        ``(order, value)`` which gives the values of derivatives of specified
+        orders at the given edge of the interpolation interval.
+        Alternatively, the following string aliases are recognized:
+
+        * ``"clamped"``: The first derivatives at the ends are zero. This is
+          equivalent to ``bc_type=([(1, 0.0)], [(1, 0.0)])``.
+        * ``"natural"``: The second derivatives at ends are zero. This is
+          equivalent to ``bc_type=([(2, 0.0)], [(2, 0.0)])``.
+        * ``"not-a-knot"`` (default): The first and second segments are the
+          same polynomial. This is equivalent to having ``bc_type=None``.
+        * ``"periodic"``: The values and the first ``k-1`` derivatives at the
+          ends are equivalent.
+
+    axis : int, optional
+        Interpolation axis. Default is 0.
+    check_finite : bool, optional
+        Whether to check that the input arrays contain only finite numbers.
+        Disabling may give a performance gain, but may result in problems
+        (crashes, non-termination) if the inputs do contain infinities or NaNs.
+        Default is True.
+
+    Returns
+    -------
+    b : `BSpline` object
+        A `BSpline` object of the degree ``k`` and with knots ``t``.
+
+    See Also
+    --------
+    BSpline : base class representing the B-spline objects
+    CubicSpline : a cubic spline in the polynomial basis
+    make_lsq_spline : a similar factory function for spline fitting
+    UnivariateSpline : a wrapper over FITPACK spline fitting routines
+    splrep : a wrapper over FITPACK spline fitting routines
+
+    Examples
+    --------
+    Use cubic interpolation on Chebyshev nodes:
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> def cheb_nodes(N):
+    ...     jj = 2.*np.arange(N) + 1
+    ...     x = np.cos(np.pi * jj / 2 / N)[::-1]
+    ...     return x
+
+    >>> x = cheb_nodes(20)
+    >>> y = np.sqrt(1 - x**2)
+
+    >>> from scipy.interpolate import BSpline, make_interp_spline
+    >>> b = make_interp_spline(x, y)
+    >>> np.allclose(b(x), y)
+    True
+
+    Note that the default is a cubic spline with a not-a-knot boundary condition
+
+    >>> b.k
+    3
+
+    Here we use a 'natural' spline, with zero 2nd derivatives at edges:
+
+    >>> l, r = [(2, 0.0)], [(2, 0.0)]
+    >>> b_n = make_interp_spline(x, y, bc_type=(l, r))  # or, bc_type="natural"
+    >>> np.allclose(b_n(x), y)
+    True
+    >>> x0, x1 = x[0], x[-1]
+    >>> np.allclose([b_n(x0, 2), b_n(x1, 2)], [0, 0])
+    True
+
+    Interpolation of parametric curves is also supported. As an example, we
+    compute a discretization of a snail curve in polar coordinates
+
+    >>> phi = np.linspace(0, 2.*np.pi, 40)
+    >>> r = 0.3 + np.cos(phi)
+    >>> x, y = r*np.cos(phi), r*np.sin(phi)  # convert to Cartesian coordinates
+
+    Build an interpolating curve, parameterizing it by the angle
+
+    >>> spl = make_interp_spline(phi, np.c_[x, y])
+
+    Evaluate the interpolant on a finer grid (note that we transpose the result
+    to unpack it into a pair of x- and y-arrays)
+
+    >>> phi_new = np.linspace(0, 2.*np.pi, 100)
+    >>> x_new, y_new = spl(phi_new).T
+
+    Plot the result
+
+    >>> plt.plot(x, y, 'o')
+    >>> plt.plot(x_new, y_new, '-')
+    >>> plt.show()
+
+    Build a B-spline curve with 2 dimensional y
+
+    >>> x = np.linspace(0, 2*np.pi, 10)
+    >>> y = np.array([np.sin(x), np.cos(x)])
+
+    Periodic condition is satisfied because y coordinates of points on the ends
+    are equivalent
+
+    >>> ax = plt.axes(projection='3d')
+    >>> xx = np.linspace(0, 2*np.pi, 100)
+    >>> bspl = make_interp_spline(x, y, k=5, bc_type='periodic', axis=1)
+    >>> ax.plot3D(xx, *bspl(xx))
+    >>> ax.scatter3D(x, *y, color='red')
+    >>> plt.show()
+
+    """
+    # convert string aliases for the boundary conditions
+    if bc_type is None or bc_type == 'not-a-knot' or bc_type == 'periodic':
+        deriv_l, deriv_r = None, None
+    elif isinstance(bc_type, str):
+        deriv_l, deriv_r = bc_type, bc_type
+    else:
+        try:
+            deriv_l, deriv_r = bc_type
+        except TypeError as e:
+            raise ValueError(f"Unknown boundary condition: {bc_type}") from e
+
+    xp = array_namespace(x, y, t)  # namespace of the inputs; results are converted back to it on exit
+    x = _as_float_array(x, check_finite)
+    y = _as_float_array(y, check_finite)
+
+    axis = normalize_axis_index(axis, y.ndim)
+    y = np.moveaxis(y, axis, 0)    # now internally interp axis is zero
+
+    # sanity check the input
+    if bc_type == 'periodic' and not np.allclose(y[0], y[-1], atol=1e-15):
+        raise ValueError("First and last points does not match while "
+                         "periodic case expected")
+    if x.size != y.shape[0]:
+        raise ValueError(f'Shapes of x {x.shape} and y {y.shape} are incompatible')
+    if np.any(x[1:] == x[:-1]):
+        raise ValueError("Expect x to not have duplicates")
+    if x.ndim != 1 or np.any(x[1:] < x[:-1]):
+        raise ValueError("Expect x to be a 1D strictly increasing sequence.")
+
+    # special-case k=0 right away
+    if k == 0:
+        if any(_ is not None for _ in (t, deriv_l, deriv_r)):
+            raise ValueError("Too much info for k=0: t and bc_type can only "
+                             "be None.")
+        t = np.r_[x, x[-1]]  # n + 1 knots, so there is exactly one coefficient per data point
+        c = np.asarray(y)
+        c = np.ascontiguousarray(c, dtype=_get_dtype(c.dtype))
+        t, c = xp.asarray(t), xp.asarray(c)
+        return BSpline.construct_fast(t, c, k, axis=axis)
+
+    # special-case k=1 (e.g., Lyche and Morken, Eq.(2.16))
+    if k == 1 and t is None:
+        if not (deriv_l is None and deriv_r is None):
+            raise ValueError("Too much info for k=1: bc_type can only be None.")
+        t = np.r_[x[0], x, x[-1]]  # the coefficients are the data values themselves
+        c = np.asarray(y)
+        c = np.ascontiguousarray(c, dtype=_get_dtype(c.dtype))
+        t, c = xp.asarray(t), xp.asarray(c)
+        return BSpline.construct_fast(t, c, k, axis=axis)
+
+    k = operator.index(k)  # rejects non-integer degrees
+
+    if bc_type == 'periodic' and t is not None:
+        raise NotImplementedError("For periodic case t is constructed "
+                                  "automatically and can not be passed "
+                                  "manually")
+
+    # come up with a sensible knot vector, if needed
+    if t is None:
+        if deriv_l is None and deriv_r is None:
+            if bc_type == 'periodic':
+                t = _periodic_knots(x, k)
+            else:
+                t = _not_a_knot(x, k)
+        else:
+            t = _augknt(x, k)
+
+    t = _as_float_array(t, check_finite)
+
+    if k < 0:
+        raise ValueError("Expect non-negative k.")
+    if t.ndim != 1 or np.any(t[1:] < t[:-1]):
+        raise ValueError("Expect t to be a 1-D sorted array_like.")
+    if t.size < x.size + k + 1:
+        raise ValueError(f"Got {t.size} knots, need at least {x.size + k + 1}.")
+    if (x[0] < t[k]) or (x[-1] > t[-k]):
+        raise ValueError(f'Out of bounds w/ x = {x}.')
+
+    if bc_type == 'periodic':
+        return _make_periodic_spline(x, y, t, k, axis, xp=xp)
+
+    # Here : deriv_l, r = [(nu, value), ...]
+    deriv_l = _convert_string_aliases(deriv_l, y.shape[1:])
+    deriv_l_ords, deriv_l_vals = _process_deriv_spec(deriv_l)
+    nleft = deriv_l_ords.shape[0]
+
+    deriv_r = _convert_string_aliases(deriv_r, y.shape[1:])
+    deriv_r_ords, deriv_r_vals = _process_deriv_spec(deriv_r)
+    nright = deriv_r_ords.shape[0]
+
+    if not all(0 <= i <= k for i in deriv_l_ords):
+        raise ValueError(f"Bad boundary conditions at {x[0]}.")
+
+    if not all(0 <= i <= k for i in deriv_r_ords):
+        raise ValueError(f"Bad boundary conditions at {x[-1]}.")
+
+    # have `n` conditions for `nt` coefficients; need nt-n derivatives
+    n = x.size
+    nt = t.size - k - 1
+
+    if nt - n != nleft + nright:
+        raise ValueError("The number of derivatives at boundaries does not "
+                         f"match: expected {nt-n}, got {nleft}+{nright}")
+
+    # bail out if the `y` array is zero-sized
+    if y.size == 0:
+        c = xp.asarray(np.zeros((nt,) + y.shape[1:], dtype=float))  # convert like every other exit does
+        return BSpline.construct_fast(xp.asarray(t), c, k, axis=axis)
+
+    # set up the LHS: the colocation matrix + derivatives at boundaries
+    # NB: ab is in F order for banded LAPACK; _coloc needs C-ordered arrays,
+    #     thus pass ab.T into _coloc
+    kl = ku = k
+    ab = np.zeros((2*kl + ku + 1, nt), dtype=np.float64, order='F')
+    _dierckx._coloc(x, t, k, ab.T, nleft)  # last argument is the row offset: the first nleft rows are left for derivatives
+    if nleft > 0:
+        _handle_lhs_derivatives(t, k, x[0], ab, kl, ku, deriv_l_ords)
+    if nright > 0:
+        _handle_lhs_derivatives(t, k, x[-1], ab, kl, ku, deriv_r_ords,
+                                offset=nt-nright)
+
+    # set up the RHS: values to interpolate (+ derivative values, if any)
+    extradim = prod(y.shape[1:])
+    rhs = np.empty((nt, extradim), dtype=y.dtype)
+    if nleft > 0:
+        rhs[:nleft] = deriv_l_vals.reshape(-1, extradim)
+    rhs[nleft:nt - nright] = y.reshape(-1, extradim)
+    if nright > 0:
+        rhs[nt - nright:] = deriv_r_vals.reshape(-1, extradim)
+
+    # solve Ab @ x = rhs; this is the relevant part of linalg.solve_banded
+    if check_finite:
+        ab, rhs = map(np.asarray_chkfinite, (ab, rhs))
+    gbsv, = get_lapack_funcs(('gbsv',), (ab, rhs))
+    lu, piv, c, info = gbsv(kl, ku, ab, rhs,
+                            overwrite_ab=True, overwrite_b=True)
+
+    if info > 0:
+        raise LinAlgError("Colocation matrix is singular.")
+    elif info < 0:
+        raise ValueError(f'illegal value in {-info}-th argument of internal gbsv')
+    c = np.ascontiguousarray(c.reshape((nt,) + y.shape[1:]))
+    t, c = xp.asarray(t), xp.asarray(c)
+    return BSpline.construct_fast(t, c, k, axis=axis)
+
+
+@xp_capabilities(cpu_only=True, jax_jit=False, allow_dask_compute=True)
+def make_lsq_spline(x, y, t, k=3, w=None, axis=0, check_finite=True, *, method="qr"):
+    r"""Create a smoothing B-spline satisfying the Least SQuares (LSQ) criterion.
+
+    The result is a linear combination
+
+    .. math::
+
+            S(x) = \sum_j c_j B_j(x; t)
+
+    of the B-spline basis elements, :math:`B_j(x; t)`, which minimizes
+
+    .. math::
+
+        \sum_{j} \left( w_j \times (S(x_j) - y_j) \right)^2
+
+    Parameters
+    ----------
+    x : array_like, shape (m,)
+        Abscissas.
+    y : array_like, shape (m, ...)
+        Ordinates.
+    t : array_like, shape (n + k + 1,).
+        Knots.
+        Knots and data points must satisfy Schoenberg-Whitney conditions.
+    k : int, optional
+        B-spline degree. Default is cubic, ``k = 3``.
+    w : array_like, shape (m,), optional
+        Weights for spline fitting. Must be positive. If ``None``,
+        then weights are all equal.
+        Default is ``None``.
+    axis : int, optional
+        Interpolation axis. Default is zero.
+    check_finite : bool, optional
+        Whether to check that the input arrays contain only finite numbers.
+        Disabling may give a performance gain, but may result in problems
+        (crashes, non-termination) if the inputs do contain infinities or NaNs.
+        Default is True.
+    method : str, optional
+        Method for solving the linear LSQ problem. Allowed values are "norm-eq"
+        (Explicitly construct and solve the normal system of equations), and
+        "qr" (Use the QR factorization of the design matrix).
+        Default is "qr".
+
+    Returns
+    -------
+    b : `BSpline` object
+        A `BSpline` object of the degree ``k`` with knots ``t``.
+
+    See Also
+    --------
+    BSpline : base class representing the B-spline objects
+    make_interp_spline : a similar factory function for interpolating splines
+    LSQUnivariateSpline : a FITPACK-based spline fitting routine
+    splrep : a FITPACK-based fitting routine
+
+    Notes
+    -----
+    The number of data points must be larger than the spline degree ``k``.
+
+    Knots ``t`` must satisfy the Schoenberg-Whitney conditions,
+    i.e., there must be a subset of data points ``x[j]`` such that
+    ``t[j] < x[j] < t[j+k+1]``, for ``j=0, 1,...,n-k-2``.
+
+    Examples
+    --------
+    Generate some noisy data:
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> rng = np.random.default_rng()
+    >>> x = np.linspace(-3, 3, 50)
+    >>> y = np.exp(-x**2) + 0.1 * rng.standard_normal(50)
+
+    Now fit a smoothing cubic spline with a pre-defined internal knots.
+    Here we make the knot vector (k+1)-regular by adding boundary knots:
+
+    >>> from scipy.interpolate import make_lsq_spline, BSpline
+    >>> t = [-1, 0, 1]
+    >>> k = 3
+    >>> t = np.r_[(x[0],)*(k+1),
+    ...           t,
+    ...           (x[-1],)*(k+1)]
+    >>> spl = make_lsq_spline(x, y, t, k)
+
+    For comparison, we also construct an interpolating spline for the same
+    set of data:
+
+    >>> from scipy.interpolate import make_interp_spline
+    >>> spl_i = make_interp_spline(x, y)
+
+    Plot both:
+
+    >>> xs = np.linspace(-3, 3, 100)
+    >>> plt.plot(x, y, 'ro', ms=5)
+    >>> plt.plot(xs, spl(xs), 'g-', lw=3, label='LSQ spline')
+    >>> plt.plot(xs, spl_i(xs), 'b-', lw=3, alpha=0.7, label='interp spline')
+    >>> plt.legend(loc='best')
+    >>> plt.show()
+
+    **NaN handling**: If the input arrays contain ``nan`` values, the result is
+    not useful since the underlying spline fitting routines cannot deal with
+    ``nan``. A workaround is to use zero weights for not-a-number data points:
+
+    >>> y[8] = np.nan
+    >>> w = np.isnan(y)
+    >>> y[w] = 0.
+    >>> tck = make_lsq_spline(x, y, t, w=~w)
+
+    Notice the need to replace a ``nan`` by a numerical value (precise value
+    does not matter as long as the corresponding weight is zero.)
+
+    """
+    xp = array_namespace(x, y, t, w)
+
+    x = _as_float_array(x, check_finite)
+    y = _as_float_array(y, check_finite)
+    t = _as_float_array(t, check_finite)
+    if w is not None:
+        w = _as_float_array(w, check_finite)
+    else:
+        w = np.ones_like(x)
+    k = operator.index(k)
+
+    axis = normalize_axis_index(axis, y.ndim)
+
+    y = np.moveaxis(y, axis, 0)    # now internally interp axis is zero
+    if not y.flags.c_contiguous:
+        # C routines in _dierckx currently require C contiguity
+        y = y.copy(order='C')
+
+    if x.ndim != 1:
+        raise ValueError("Expect x to be a 1-D sequence.")
+    if x.shape[0] < k+1:
+        raise ValueError("Need more x points.")
+    if k < 0:
+        raise ValueError("Expect non-negative k.")
+    if t.ndim != 1 or np.any(t[1:] - t[:-1] < 0):
+        raise ValueError("Expect t to be a 1D strictly increasing sequence.")
+    if x.size != y.shape[0]:
+        raise ValueError(f'Shapes of x {x.shape} and y {y.shape} are incompatible')
+    if k > 0 and np.any((x < t[k]) | (x > t[-k])):
+        raise ValueError(f'Out of bounds w/ x = {x}.')
+    if x.size != w.size:
+        raise ValueError(f'Shapes of x {x.shape} and w {w.shape} are incompatible')
+    if method == "norm-eq" and np.any(x[1:] - x[:-1] <= 0):
+        raise ValueError("Expect x to be a 1D strictly increasing sequence.")
+    if method == "qr" and any(x[1:] - x[:-1] < 0):
+        raise ValueError("Expect x to be a 1D non-decreasing sequence.")
+
+    # number of coefficients
+    n = t.size - k - 1
+
+    # complex y: view as float, preserve the length
+    was_complex =  y.dtype.kind == 'c'
+    yy = y.view(float)
+    if was_complex and y.ndim == 1:
+        yy = yy.reshape(y.shape[0], 2)
+
+    # multiple r.h.s
+    extradim = prod(yy.shape[1:])
+    yy = yy.reshape(-1, extradim)
+
+    # complex y: view as float, preserve the length
+    was_complex =  y.dtype.kind == 'c'
+    yy = y.view(float)
+    if was_complex and y.ndim == 1:
+        yy = yy.reshape(y.shape[0], 2)
+
+    # multiple r.h.s
+    extradim = prod(yy.shape[1:])
+    yy = yy.reshape(-1, extradim)
+
+    if method == "norm-eq":
+        # construct A.T @ A and rhs with A the colocation matrix, and
+        # rhs = A.T @ y for solving the LSQ problem  ``A.T @ A @ c = A.T @ y``
+        lower = True
+        ab = np.zeros((k+1, n), dtype=np.float64, order='F')
+        rhs = np.zeros((n, extradim), dtype=np.float64)
+        _dierckx._norm_eq_lsq(x, t, k,
+                              yy,
+                              w,
+                              ab.T, rhs)
+
+        # undo complex -> float and flattening the trailing dims
+        if was_complex:
+            rhs = rhs.view(complex)
+
+        rhs = rhs.reshape((n,) + y.shape[1:])
+
+        # have observation matrix & rhs, can solve the LSQ problem
+        cho_decomp = cholesky_banded(ab, overwrite_ab=True, lower=lower,
+                                     check_finite=check_finite)
+        m = rhs.shape[0]
+        c = cho_solve_banded((cho_decomp, lower), rhs.reshape(m, -1), overwrite_b=True,
+                             check_finite=check_finite).reshape(rhs.shape)
+    elif method == "qr":
+        _, _, c, _, _ = _lsq_solve_qr(x, yy, t, k, w)
+
+        if was_complex:
+            c = c.view(complex)
+
+    else:
+        raise ValueError(f"Unknown {method =}.")
+
+
+    # restore the shape of `c` for both single and multiple r.h.s.
+    c = c.reshape((n,) + y.shape[1:])
+    c = np.ascontiguousarray(c)
+    t, c = xp.asarray(t), xp.asarray(c)
+    return BSpline.construct_fast(t, c, k, axis=axis)
+
+
+######################
+# LSQ spline helpers #
+######################
+
+def _lsq_solve_qr_for_root_rati_periodic(x, y, t, k, w):
+    """Solve for the periodic LSQ spline coeffs given x, y and knots.
+
+    Runs the same three `_dierckx` steps as the periodic branch of
+    `_lsq_solve_qr` (build the data matrix, QR-reduce it, back-substitute),
+    but requests the extended output of ``qr_reduce_periodic`` and returns the
+    intermediate matrices together with the coefficients.
+
+    NOTE(review): judging by the name, the extra outputs feed the rational
+    root-finding step of the smoothing iteration -- confirm at the call site.
+
+    `y` is always 2D: for 1D data, the shape is ``(m, 1)``.
+    `w` is always 1D: one weight value per `x` value.
+
+    Returns
+    -------
+    R, A1, A2, Z, y_w, c, p, residuals : tuple
+        ``R`` is the first output of ``_dierckx.data_matrix_periodic``;
+        ``A1``, ``A2``, ``Z`` and ``p`` are outputs of
+        ``_dierckx.qr_reduce_periodic``; ``y_w`` is ``y`` scaled row-wise by
+        ``w``; ``c`` and ``residuals`` are outputs of ``_dierckx.fpbacp``.
+
+    """
+    # weight every row of the right-hand side(s) by the matching weight
+    y_w = y * w[:, None]
+    # Ref: https://github.com/scipy/scipy/blob/maintenance/1.16.x/scipy/interpolate/fitpack/fpperi.f#L221-L238
+    R, H1, H2, offset, nc = _dierckx.data_matrix_periodic(x, t, k, w, False)
+    # Ref: https://github.com/scipy/scipy/blob/maintenance/1.16.x/scipy/interpolate/fitpack/fpperi.f#L239-L314
+    # The trailing ``True`` flag selects the five-output form of the routine
+    # (``_lsq_solve_qr`` passes ``False`` and unpacks four values instead).
+    A1, A2, Z, p, _ = _dierckx.qr_reduce_periodic(
+        R, H1, H2, offset, nc, y_w, k,
+        len(t), True
+    )         # modifies arguments in-place
+    # Ref: https://github.com/scipy/scipy/blob/main/scipy/interpolate/fitpack/fpbacp.f
+    # back-substitution; note that the unweighted `y` is passed here
+    c, residuals, _ = _dierckx.fpbacp(A1, A2, Z, k, k, x, y, t, w)
+    return R, A1, A2, Z, y_w, c, p, residuals
+
+
+def _lsq_solve_qr(x, y, t, k, w, periodic=False):
+    """Compute least-squares spline coefficients via a QR reduction.
+
+    The right-hand side is weighted row by row, the (periodic or
+    non-periodic) data matrix is assembled and QR-reduced by the `_dierckx`
+    routines, and the coefficients follow from back-substitution.
+
+    `y` is always 2D: for 1D data, the shape is ``(m, 1)``.
+    `w` is always 1D: one weight value per `x` value.
+
+    Returns
+    -------
+    tuple
+        ``(A, y_w, c, fp, residuals)`` where ``A`` is the first output of the
+        data-matrix routine (``R`` in the periodic case), ``y_w`` is the
+        weighted right-hand side and ``c`` holds the spline coefficients.
+
+    """
+    weighted_rhs = y * w[:, None]
+
+    if periodic:
+        # Ref: https://github.com/scipy/scipy/blob/maintenance/1.16.x/scipy/interpolate/fitpack/fpperi.f#L221-L238
+        R, H1, H2, offset, nc = _dierckx.data_matrix_periodic(x, t, k, w, False)
+        # Ref: https://github.com/scipy/scipy/blob/maintenance/1.16.x/scipy/interpolate/fitpack/fpperi.f#L239-L314
+        # NB: the reduction works in-place on the arrays it is given
+        A1, A2, Z, fp = _dierckx.qr_reduce_periodic(
+            R, H1, H2, offset, nc, weighted_rhs, k, len(t), False
+        )
+        # Ref: https://github.com/scipy/scipy/blob/main/scipy/interpolate/fitpack/fpbacp.f
+        c, residuals, _ = _dierckx.fpbacp(A1, A2, Z, k, k, x, y, t, w)
+        return R, weighted_rhs, c, fp, residuals
+
+    # non-periodic case
+    A, offset, nc = _dierckx.data_matrix(x, t, k, w)
+    # NB: the reduction overwrites both `A` and the weighted right-hand side
+    _dierckx.qr_reduce(A, offset, nc, weighted_rhs)
+    c, residuals, fp = _dierckx.fpback(A, nc, x, y, t, k, w, weighted_rhs)
+    return A, weighted_rhs, c, fp, residuals
+
+
+
+
+#############################
+#  Smoothing spline helpers #
+#############################
+
+def _compute_optimal_gcv_parameter(X, wE, y, w):
+    """
+    Returns an optimal regularization parameter from the GCV criteria [1].
+
+    Parameters
+    ----------
+    X : array, shape (5, n)
+        5 bands of the design matrix ``X`` stored in LAPACK banded storage.
+    wE : array, shape (5, n)
+        5 bands of the penalty matrix :math:`W^{-1} E` stored in LAPACK banded
+        storage.
+    y : array, shape (n,) or (n, m)
+        Ordinates. For 2D input each column is an independent right-hand
+        side and gets its own regularization parameter.
+    w : array, shape (n,)
+        Vector of weights.
+
+    Returns
+    -------
+    lam : float or ndarray, shape (m,)
+        An optimal from the GCV criteria point of view regularization
+        parameter: a scalar for 1D ``y``, one value per column for 2D ``y``.
+
+    Raises
+    ------
+    ValueError
+        If the scalar minimizer reports failure, or if the Cholesky
+        decomposition needed for the GCV denominator cannot be performed.
+    RuntimeError
+        If ``y`` is neither 1D nor 2D.
+
+    Notes
+    -----
+    No checks are performed.
+
+    References
+    ----------
+    .. [1] G. Wahba, "Estimating the smoothing parameter" in Spline models
+        for observational data, Philadelphia, Pennsylvania: Society for
+        Industrial and Applied Mathematics, 1990, pp. 45-65.
+        :doi:`10.1137/1.9781611970128`
+
+    """
+
+    def compute_banded_symmetric_XT_W_Y(X, w, Y):
+        """
+        Assuming that the product :math:`X^T W Y` is symmetric and both ``X``
+        and ``Y`` are 5-banded, compute the unique bands of the product.
+
+        Parameters
+        ----------
+        X : array, shape (5, n)
+            5 bands of the matrix ``X`` stored in LAPACK banded storage.
+        w : array, shape (n,)
+            Array of weights
+        Y : array, shape (5, n)
+            5 bands of the matrix ``Y`` stored in LAPACK banded storage.
+
+        Returns
+        -------
+        res : array, shape (4, n)
+            The result of the product :math:`X^T W Y` stored in the banded
+            way.
+
+        Notes
+        -----
+        As far as the matrices ``X`` and ``Y`` are 5-banded, their product
+        :math:`X^T W Y` is 7-banded. It is also symmetric, so we can store only
+        unique diagonals.
+
+        """
+        # compute W Y
+        W_Y = np.copy(Y)
+
+        # scale every band by the weight of the matrix row it belongs to
+        W_Y[2] *= w
+        for i in range(2):
+            W_Y[i, 2 - i:] *= w[:-2 + i]
+            W_Y[3 + i, :-1 - i] *= w[1 + i:]
+
+        n = X.shape[1]
+        res = np.zeros((4, n))
+        for i in range(n):
+            for j in range(min(n-i, 4)):
+                res[-j-1, i + j] = sum(X[j:, i] * W_Y[:5-j, i + j])
+        return res
+
+    def compute_b_inv(A):
+        """
+        Inverse 3 central bands of matrix :math:`A=U^T D^{-1} U` assuming that
+        ``U`` is a unit upper triangular banded matrix using an algorithm
+        proposed in [1].
+
+        Parameters
+        ----------
+        A : array, shape (4, n)
+            Matrix to inverse, stored in LAPACK banded storage.
+
+        Returns
+        -------
+        B : array, shape (4, n)
+            3 unique bands of the symmetric matrix that is an inverse to ``A``.
+            The first row is filled with zeros.
+
+        Notes
+        -----
+        The algorithm is based on the cholesky decomposition and, therefore,
+        in case matrix ``A`` is close to not positive defined, the function
+        raises LinalgError.
+
+        Both matrices ``A`` and ``B`` are stored in LAPACK banded storage.
+
+        References
+        ----------
+        .. [1] M. F. Hutchinson and F. R. de Hoog, "Smoothing noisy data with
+            spline functions," Numerische Mathematik, vol. 47, no. 1,
+            pp. 99-106, 1985.
+            :doi:`10.1007/BF01389878`
+
+        """
+
+        def find_b_inv_elem(i, j, U, D, B):
+            # NB: ``n`` is picked up from the enclosing scope; it is assigned
+            # below, before this helper is called for the first time.
+            rng = min(3, n - i - 1)
+            rng_sum = 0.
+            if j == 0:
+                # use 2-nd formula from [1]
+                for k in range(1, rng + 1):
+                    rng_sum -= U[-k - 1, i + k] * B[-k - 1, i + k]
+                rng_sum += D[i]
+                B[-1, i] = rng_sum
+            else:
+                # use 1-st formula from [1]
+                for k in range(1, rng + 1):
+                    diag = abs(k - j)
+                    ind = i + min(k, j)
+                    rng_sum -= U[-k - 1, i + k] * B[-diag - 1, ind + diag]
+                B[-j - 1, i + j] = rng_sum
+
+        U = cholesky_banded(A)
+        # rescale the Cholesky factor to a unit upper triangular ``U`` and
+        # keep the squared inverse of its diagonal in ``D``
+        for i in range(2, 5):
+            U[-i, i-1:] /= U[-1, :-i+1]
+        D = 1. / (U[-1])**2
+        U[-1] /= U[-1]
+
+        n = U.shape[1]
+
+        B = np.zeros(shape=(4, n))
+        # fill from the bottom-right corner upwards: each element only
+        # depends on elements with larger indices
+        for i in range(n - 1, -1, -1):
+            for j in range(min(3, n - i - 1), -1, -1):
+                find_b_inv_elem(i, j, U, D, B)
+        # the first row contains garbage and should be removed
+        B[0] = [0.] * n
+        return B
+
+    def _gcv(lam, X, XtWX, wE, XtE, y):
+        r"""
+        Computes the generalized cross-validation criteria [1].
+
+        Parameters
+        ----------
+        lam : float, (:math:`\lambda \geq 0`)
+            Regularization parameter.
+        X : array, shape (5, n)
+            Matrix is stored in LAPACK banded storage.
+        XtWX : array, shape (4, n)
+            Product :math:`X^T W X` stored in LAPACK banded storage.
+        wE : array, shape (5, n)
+            Matrix :math:`W^{-1} E` stored in LAPACK banded storage.
+        XtE : array, shape (4, n)
+            Product :math:`X^T E` stored in LAPACK banded storage.
+        y : array, shape (n,)
+            Ordinates (a single right-hand side).
+
+        Returns
+        -------
+        res : float
+            Value of the GCV criteria with the regularization parameter
+            :math:`\lambda`.
+
+        Raises
+        ------
+        ValueError
+            If the Cholesky decomposition inside ``compute_b_inv`` fails.
+
+        Notes
+        -----
+        Criteria is computed from the formula (1.3.2) [3]:
+
+        .. math::
+
+            GCV(\lambda) = \dfrac{1}{n} \sum\limits_{k = 1}^{n} \dfrac{ \left(
+            y_k - f_{\lambda}(x_k) \right)^2}{\left( 1 - \Tr{A}/n\right)^2}.
+
+        The criteria is discussed in section 1.3 [3].
+
+        The numerator is computed using (2.2.4) [3] and the denominator is
+        computed using an algorithm from [2] (see in the ``compute_b_inv``
+        function).
+
+        References
+        ----------
+        .. [1] G. Wahba, "Estimating the smoothing parameter" in Spline models
+            for observational data, Philadelphia, Pennsylvania: Society for
+            Industrial and Applied Mathematics, 1990, pp. 45-65.
+            :doi:`10.1137/1.9781611970128`
+        .. [2] M. F. Hutchinson and F. R. de Hoog, "Smoothing noisy data with
+            spline functions," Numerische Mathematik, vol. 47, no. 1,
+            pp. 99-106, 1985.
+            :doi:`10.1007/BF01389878`
+        .. [3] E. Zemlyanoy, "Generalized cross-validation smoothing splines",
+            BSc thesis, 2022. Might be available (in Russian)
+            `here <https://www.hse.ru/ba/am/students/diplomas/620910604>`_
+
+        """
+        # Compute the numerator from (2.2.4) [3]
+        n = X.shape[1]
+        c = solve_banded((2, 2), X + lam * wE, y)
+        res = np.zeros(n)
+        # compute ``W^{-1} E c`` with respect to banded-storage of ``E``
+        tmp = wE * c
+        for i in range(n):
+            for j in range(max(0, i - n + 3), min(5, i + 3)):
+                res[i] += tmp[j, i + 2 - j]
+        numer = np.linalg.norm(lam * res)**2 / n
+
+        # compute the denominator
+        lhs = XtWX + lam * XtE
+        try:
+            b_banded = compute_b_inv(lhs)
+            # compute the trace of the product b_banded @ XtX
+            tr = b_banded * XtWX
+            # off-diagonal bands are stored once but occur twice in the
+            # symmetric matrices
+            tr[:-1] *= 2
+            # find the denominator
+            denom = (1 - sum(sum(tr)) / n)**2
+        except LinAlgError as exc:
+            # cholesky decomposition cannot be performed
+            raise ValueError('Seems like the problem is ill-posed') from exc
+
+        res = numer / denom
+
+        return res
+
+    n = X.shape[1]
+
+    XtWX = compute_banded_symmetric_XT_W_Y(X, w, X)
+    XtE = compute_banded_symmetric_XT_W_Y(X, w, wE)
+
+    def _minimize_gcv(rhs):
+        # Minimize the GCV score for a single right-hand side. Both the 1D
+        # and the 2D code paths go through here, so the failure message is
+        # always built from the optimizer result itself (never from the
+        # array that collects the per-column results).
+        est = minimize_scalar(
+            _gcv, bounds=(0, n), method='Bounded', args=(X, XtWX, wE, XtE, rhs)
+        )
+        if est.success:
+            return est.x
+        raise ValueError(f"Unable to find minimum of the GCV "
+                         f"function: {est.message}")
+
+    if y.ndim == 1:
+        return _minimize_gcv(y)
+    elif y.ndim == 2:
+        gcv_est = np.empty(y.shape[1])
+        for i in range(y.shape[1]):
+            gcv_est[i] = _minimize_gcv(y[:, i])
+        return gcv_est
+    else:
+        # trailing dims must have been flattened already.
+        raise RuntimeError("Internal error. Please report it to scipy developers.")
+
+
+def _coeff_of_divided_diff(x):
+    """
+    Compute the weights of the divided difference built on the nodes ``x``.
+
+    The ``i``-th weight is the reciprocal of the product of the differences
+    ``x[i] - x[k]`` taken over all ``k != i``.
+
+    Parameters
+    ----------
+    x : array, shape (n,)
+        Nodes of the divided difference.
+
+    Returns
+    -------
+    res : array_like, shape (n,)
+        Coefficients of the divided difference.
+
+    Notes
+    -----
+    The nodes are expected to be pairwise distinct; repeated nodes lead to a
+    division by zero.
+
+    No checks are performed.
+
+    """
+    size = x.shape[0]
+    coeffs = np.zeros(size)
+    for i, node in enumerate(x):
+        denom = 1.
+        for k, other in enumerate(x):
+            if k == i:
+                continue
+            denom *= (node - other)
+        coeffs[i] = 1. / denom
+    return coeffs
+
+
+@xp_capabilities(cpu_only=True, jax_jit=False, allow_dask_compute=True)
+def make_smoothing_spline(x, y, w=None, lam=None, *, axis=0):
+    r"""
+    Create a smoothing B-spline satisfying the Generalized Cross Validation (GCV) criterion.
+
+    Compute the (coefficients of) smoothing cubic spline function using
+    ``lam`` to control the tradeoff between the amount of smoothness of the
+    curve and its proximity to the data. In case ``lam`` is None, using the
+    GCV criteria [1] to find it.
+
+    A smoothing spline is found as a solution to the regularized weighted
+    linear regression problem:
+
+    .. math::
+
+        \sum\limits_{i=1}^n w_i\lvert y_i - f(x_i) \rvert^2 +
+        \lambda\int\limits_{x_1}^{x_n} (f^{(2)}(u))^2 d u
+
+    where :math:`f` is a spline function, :math:`w` is a vector of weights and
+    :math:`\lambda` is a regularization parameter.
+
+    If ``lam`` is None, we use the GCV criteria to find an optimal
+    regularization parameter, otherwise we solve the regularized weighted
+    linear regression problem with given parameter. The parameter controls
+    the tradeoff in the following way: the larger the parameter becomes, the
+    smoother the function gets.
+
+    Parameters
+    ----------
+    x : array_like, shape (n,)
+        Abscissas. `n` must be at least 5. Must be strictly increasing.
+    y : array_like, shape (n, ...)
+        Ordinates. `n` must be at least 5.
+    w : array_like, shape (n,), optional
+        Vector of strictly positive weights, one per element of `x`.
+        Default is ``np.ones_like(x)``.
+    lam : float, (:math:`\lambda \geq 0`), optional
+        Regularization parameter. If ``lam`` is None, then it is found from
+        the GCV criteria. Default is None.
+    axis : int, optional
+        The data axis. Default is zero.
+        The assumption is that ``y.shape[axis] == n``, and all other axes of ``y``
+        are batching axes.
+
+    Returns
+    -------
+    func : `BSpline` object
+        An object representing a spline in the B-spline basis
+        as a solution of the problem of smoothing splines using
+        the GCV criteria [1] in case ``lam`` is None, otherwise using the
+        given parameter ``lam``.
+
+    Raises
+    ------
+    ValueError
+        If `x` is not 1D or its length differs from ``y.shape[axis]``;
+        if `x` is not strictly increasing; if there are fewer than 5 data
+        points; if `w` does not have the same shape as `x` or contains
+        non-positive entries; if `lam` is negative; or if the problem turns
+        out to be ill-posed while searching for the GCV-optimal `lam`.
+
+    Notes
+    -----
+    This algorithm is a clean room reimplementation of the algorithm
+    introduced by Woltring in FORTRAN [2]. The original version cannot be used
+    in SciPy source code because of the license issues. The details of the
+    reimplementation are discussed here (available only in Russian) [4].
+
+    If the vector of weights ``w`` is None, we assume that all the points are
+    equal in terms of weights, and vector of weights is vector of ones.
+
+    Note that in weighted residual sum of squares, weights are not squared:
+    :math:`\sum\limits_{i=1}^n w_i\lvert y_i - f(x_i) \rvert^2` while in
+    ``splrep`` the sum is built from the squared weights.
+
+    In cases when the initial problem is ill-posed (for example, the product
+    :math:`X^T W X` where :math:`X` is a design matrix is not a positive
+    defined matrix) a ValueError is raised.
+
+    References
+    ----------
+    .. [1] G. Wahba, "Estimating the smoothing parameter" in Spline models for
+        observational data, Philadelphia, Pennsylvania: Society for Industrial
+        and Applied Mathematics, 1990, pp. 45-65.
+        :doi:`10.1137/1.9781611970128`
+    .. [2] H. J. Woltring, A Fortran package for generalized, cross-validatory
+        spline smoothing and differentiation, Advances in Engineering
+        Software, vol. 8, no. 2, pp. 104-113, 1986.
+        :doi:`10.1016/0141-1195(86)90098-7`
+    .. [3] T. Hastie, J. Friedman, and R. Tibshirani, "Smoothing Splines" in
+        The elements of Statistical Learning: Data Mining, Inference, and
+        prediction, New York: Springer, 2017, pp. 241-249.
+        :doi:`10.1007/978-0-387-84858-7`
+    .. [4] E. Zemlyanoy, "Generalized cross-validation smoothing splines",
+        BSc thesis, 2022.
+        `<https://www.hse.ru/ba/am/students/diplomas/620910604>`_ (in
+        Russian)
+
+    Examples
+    --------
+    Generate some noisy data
+
+    >>> import numpy as np
+    >>> np.random.seed(1234)
+    >>> n = 200
+    >>> def func(x):
+    ...    return x**3 + x**2 * np.sin(4 * x)
+    >>> x = np.sort(np.random.random_sample(n) * 4 - 2)
+    >>> y = func(x) + np.random.normal(scale=1.5, size=n)
+
+    Make a smoothing spline function
+
+    >>> from scipy.interpolate import make_smoothing_spline
+    >>> spl = make_smoothing_spline(x, y)
+
+    Plot both
+
+    >>> import matplotlib.pyplot as plt
+    >>> grid = np.linspace(x[0], x[-1], 400)
+    >>> plt.plot(x, y, '.')
+    >>> plt.plot(grid, spl(grid), label='Spline')
+    >>> plt.plot(grid, func(grid), label='Original function')
+    >>> plt.legend(loc='best')
+    >>> plt.show()
+
+    """  # noqa:E501
+    xp = array_namespace(x, y)
+
+    x = np.ascontiguousarray(x, dtype=float)
+    y = np.ascontiguousarray(y, dtype=float)
+
+    # Check the dimensionality first: the element-wise checks below are only
+    # meaningful (and only produce the intended message) for a 1D ``x``.
+    if x.ndim != 1 or x.shape[0] != y.shape[axis]:
+        raise ValueError(f'``x`` should be 1D and {x.shape = } == {y.shape = }')
+
+    if np.any(x[1:] - x[:-1] <= 0):
+        raise ValueError('``x`` should be an ascending array')
+
+    # Reject short input before ``x[0]`` / ``x[-1]`` are indexed below, so
+    # that empty data gives this message rather than an IndexError.
+    n = x.shape[0]
+    if n <= 4:
+        raise ValueError('``x`` and ``y`` length must be at least 5')
+
+    if w is None:
+        w = np.ones(len(x))
+    else:
+        w = np.ascontiguousarray(w)
+        # The penalty matrix below slices ``w`` from both ends; a weight
+        # vector of a different length would be silently misaligned.
+        if w.shape != x.shape:
+            raise ValueError(
+                f'Shapes of x {x.shape} and w {w.shape} are incompatible'
+            )
+        if np.any(w <= 0):
+            raise ValueError('Invalid vector of weights')
+
+    # cubic knot vector: the data sites with 3 extra knots at each boundary
+    t = np.r_[[x[0]] * 3, x, [x[-1]] * 3]
+
+    # Internals assume that the data axis is the zero-th axis
+    axis = normalize_axis_index(axis, y.ndim)
+    y = np.moveaxis(y, axis, 0)
+
+    # flatten the trailing axes of y to simplify further manipulations
+    y_shape1 = y.shape[1:]
+    if y_shape1 != ():
+        y = y.reshape((n, -1))
+
+    # It is known that the solution to the stated minimization problem exists
+    # and is a natural cubic spline with vector of knots equal to the unique
+    # elements of ``x`` [3], so we will solve the problem in the basis of
+    # natural splines.
+
+    # create design matrix in the B-spline basis
+    X_bspl = BSpline.design_matrix(x, t, 3)
+    # move from B-spline basis to the basis of natural splines using equations
+    # (2.1.7) [4]
+    # central elements
+    X = np.zeros((5, n))
+    for i in range(1, 4):
+        X[i, 2: -2] = X_bspl[i: i - 4, 3: -3][np.diag_indices(n - 4)]
+
+    # first elements
+    X[1, 1] = X_bspl[0, 0]
+    X[2, :2] = ((x[2] + x[1] - 2 * x[0]) * X_bspl[0, 0],
+                X_bspl[1, 1] + X_bspl[1, 2])
+    X[3, :2] = ((x[2] - x[0]) * X_bspl[1, 1], X_bspl[2, 2])
+
+    # last elements
+    X[1, -2:] = (X_bspl[-3, -3], (x[-1] - x[-3]) * X_bspl[-2, -2])
+    X[2, -2:] = (X_bspl[-2, -3] + X_bspl[-2, -2],
+                 (2 * x[-1] - x[-2] - x[-3]) * X_bspl[-1, -1])
+    X[3, -2] = X_bspl[-1, -1]
+
+    # create penalty matrix and divide it by vector of weights: W^{-1} E
+    wE = np.zeros((5, n))
+    wE[2:, 0] = _coeff_of_divided_diff(x[:3]) / w[:3]
+    wE[1:, 1] = _coeff_of_divided_diff(x[:4]) / w[:4]
+    for j in range(2, n - 2):
+        wE[:, j] = (x[j+2] - x[j-2]) * _coeff_of_divided_diff(x[j-2:j+3])\
+                   / w[j-2: j+3]
+
+    wE[:-1, -2] = -_coeff_of_divided_diff(x[-4:]) / w[-4:]
+    wE[:-2, -1] = _coeff_of_divided_diff(x[-3:]) / w[-3:]
+    wE *= 6
+
+    if lam is None:
+        lam = _compute_optimal_gcv_parameter(X, wE, y, w)
+    elif lam < 0.:
+        raise ValueError('Regularization parameter should be non-negative')
+
+    # solve the initial problem in the basis of natural splines
+    if np.ndim(lam) == 0:
+        c = solve_banded((2, 2), X + lam * wE, y)
+    elif np.ndim(lam) == 1:
+        # XXX: solve_banded does not support batched `ab` matrices; loop manually
+        c = np.empty((n, lam.shape[0]))
+        for i in range(lam.shape[0]):
+            c[:, i] = solve_banded((2, 2), X + lam[i] * wE, y[:, i])
+    else:
+        # this should not happen, ever
+        raise RuntimeError("Internal error, please report it to SciPy developers.")
+    # restore the trailing (batch) axes that were flattened above
+    c = c.reshape((c.shape[0], *y_shape1))
+
+    # hack: these are c[0], c[1] etc, shape-compatible with np.r_ below
+    c0, c1 = c[0:1, ...], c[1:2, ...]     # c[0], c[1]
+    cm0, cm1 = c[-1:-2:-1, ...], c[-2:-3:-1, ...]    # c[-1], c[-2]
+
+    # move back to B-spline basis using equations (2.2.10) [4]
+    c_ = np.r_[c0 * (t[5] + t[4] - 2 * t[3]) + c1,
+               c0 * (t[5] - t[3]) + c1,
+               c[1: -1, ...],
+               cm0 * (t[-4] - t[-6]) + cm1,
+               cm0 * (2 * t[-4] - t[-5] - t[-6]) + cm1]
+
+    t, c_ = xp.asarray(t), xp.asarray(c_)
+    return BSpline.construct_fast(t, c_, 3, axis=axis)
+
+
+########################
+#  FITPACK look-alikes #
+########################
+
+def fpcheck(x, t, k, periodic=False):
+    """Check consistency of data vector `x` and knot vector `t`.
+
+    The function returns ``None`` when every condition listed in the comment
+    block below holds, and raises otherwise.
+
+    Parameters
+    ----------
+    x : array_like, shape (m,)
+        1D sorted array of data points.
+    t : array_like, shape (n,)
+        1D non-decreasing knot vector.
+    k : int
+        Degree of the spline.
+    periodic : bool, optional
+        Whether the spline is periodic. Default is False.
+
+    Raises
+    ------
+    ValueError
+        If the configuration of `x`, `t`, and `k` violates any required condition.
+    """
+    # This routine is a unified clone of the FITPACK Fortran routines `fpchec.f`
+    # and `fpchep.f`:
+    # - https://github.com/scipy/scipy/blob/main/scipy/interpolate/fitpack/fpchec.f
+    # - https://github.com/scipy/scipy/blob/main/scipy/interpolate/fitpack/fpchep.f
+    #
+    # These routines verify the number and position of the knots t(j), j=1,...,n,
+    # of a spline of degree k, relative to the number and distribution of data points
+    # x(i), i=1,...,m. If all of the following conditions are fulfilled,
+    # validation passes.
+    #
+    # For non-periodic splines:
+    #   1) k+1 <= n-k-1 <= m
+    #   2) t(1) <= t(2) <= ... <= t(k+1)
+    #      t(n-k) <= t(n-k+1) <= ... <= t(n)
+    #   3) t(k+1) < t(k+2) < ... < t(n-k)
+    #   4) t(k+1) <= x(i) <= t(n-k)
+    #   5) Schoenberg-Whitney conditions hold: there exists a subset y(j) such that
+    #        t(j) < y(j) < t(j+k+1), for j = 1, ..., n-k-1
+    #
+    # For periodic splines:
+    #   1) k+1 <= n-k-1 <= m + k - 1
+    #   2) Same boundary knot monotonicity as above
+    #   3) Same strict interior knot increase as above
+    #   4) t(k+1) <= x(i) <= t(n-k)
+    #   5) Schoenberg-Whitney conditions must hold for *some periodic shift*
+    #        of the data sequence; i.e. wrapped data points x(i) must satisfy
+    #        t(j) < y(j) < t(j+k+1), j = k+1, ..., n-k-1
+    #
+    # NB: the comments above use the 1-based Fortran numbering, whereas the
+    # code below indexes `x` and `t` from zero.
+    x = np.asarray(x)
+    t = np.asarray(t)
+
+    if x.ndim != 1 or t.ndim != 1:
+        raise ValueError(f"Expect `x` and `t` be 1D sequences. Got {x = } and {t = }")
+
+    m = x.shape[0]
+    n = t.shape[0]
+    nk1 = n - k - 1
+
+    # check condition no 1
+    if periodic:
+        # c      1) k+1 <= nk1 <= m+k-1
+        if not (k + 1 <= nk1 <= m + k - 1):
+            raise ValueError(f"Need k+1 <= n-k-1 <= m+k-1. Got {m = }, {n = }, {k = }")
+    else:
+        # c      1) k+1 <= n-k-1 <= m
+        if not (k + 1 <= nk1 <= m):
+            raise ValueError(f"Need k+1 <= n-k-1 <= m. Got {m = }, {n = } and {k = }.")
+
+    # check condition no 2
+    # c      2) t(1) <= t(2) <= ... <= t(k+1)
+    # c         t(n-k) <= t(n-k+1) <= ... <= t(n)
+    if (t[:k+1] > t[1:k+2]).any():
+        raise ValueError(f"First k knots must be ordered; got {t = }.")
+
+    if (t[nk1:] < t[nk1-1:-1]).any():
+        raise ValueError(f"Last k knots must be ordered; got {t = }.")
+
+    # c  check condition no 3
+    # c      3) t(k+1) < t(k+2) < ... < t(n-k)
+    if (t[k+1:n-k] <= t[k:n-k-1]).any():
+        raise ValueError(f"Internal knots must be distinct. Got {t = }.")
+
+    # c  check condition no 4
+    # c      4) t(k+1) <= x(i) <= t(n-k)
+    # NB: FITPACK's fpchec only checks x[0] & x[-1], so we follow.
+    if (x[0] < t[k]) or (x[-1] > t[n-k-1]):
+        raise ValueError(f"Out of bounds: {x = } and {t = }.")
+
+    # c  check condition no 5
+    # c      5) the conditions specified by Schoenberg and Whitney must hold
+    # c         for at least one subset of data points y(j) such that
+    # c             t(j) < y(j) < t(j+k+1)
+    # c
+    # c         For non-periodic splines:
+    # c             j = 1, 2, ..., n-k-1 (i.e., j in [1, n-k-1])
+    # c             The data points must lie strictly inside some B-spline supports.
+    # c
+    # c         For periodic splines:
+    # c             j = k+1, ..., n-k-1
+    # c             The condition must hold for a wrapped subset of the data points,
+    # c             i.e., there exists a cyclic shift of the data such that
+    # c                 t(j) < x(i) < t(j+k+1)
+    # c             holds for all j in that range. The test must account for the
+    # c             periodic domain length: per = t(n-k) - t(k+1), and wrap around x(i)
+    # c             as x(i) + per if needed.
+    mesg = f"Schoenberg-Whitney condition is violated with {t = } and {x =}."
+
+    if periodic:
+        # length of the period
+        per = t[n - k - 1] - t[k]
+        m1 = m - 1
+        # try the cyclic shifts one by one; the first shift for which every
+        # knot span gets a data point makes the check pass
+        for shift in range(1, m):
+            for j in range(k, nk1):
+                tj = t[j]
+                tl = t[j + k + 1]
+                found = False
+                for i in range(shift, shift + m1 + 1):
+                    # indices past the end wrap around to the start of `x`,
+                    # with the wrapped point moved forward by one period
+                    idx = i if i < m else i - m
+                    xi = x[idx] + (0 if i < m else per)
+                    if tj < xi < tl:
+                        found = True
+                        break
+                if not found:
+                    # no data point strictly inside (tj, tl): this shift fails
+                    break
+            else:
+                # the loop over `j` was not broken out of, i.e. every span
+                # contains a point for this shift
+                return
+        # no shift satisfied the condition
+        raise ValueError(mesg)
+    else:
+        if (x[0] >= t[k+1]) or (x[-1] <= t[n-k-2]):
+            raise ValueError(mesg)
+
+        m = x.shape[0]
+        l = k+1
+        nk3 = n - k - 3
+        if nk3 < 2:
+            return
+        for j in range(1, nk3+1):
+            tj = t[j]
+            l += 1
+            tl = t[l]
+            # position of the first data point strictly to the right of tj
+            # (np.argmax returns the index of the first True)
+            i = np.argmax(x > tj)
+            if i >= m-1:
+                raise ValueError(mesg)
+            if x[i] >= tl:
+                raise ValueError(mesg)
+        return
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_fitpack2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_fitpack2.py
new file mode 100644
index 0000000000000000000000000000000000000000..df0fd2e1ecaa389c5e0ab9b07a5972bc00655d41
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_fitpack2.py
@@ -0,0 +1,2408 @@
+"""
+fitpack --- curve and surface fitting with splines
+
+fitpack is based on a collection of Fortran routines DIERCKX
+by P. Dierckx (see http://www.netlib.org/dierckx/) transformed
+to double routines by Pearu Peterson.
+"""
+# Created by Pearu Peterson, June,August 2003
+__all__ = [
+    'UnivariateSpline',
+    'InterpolatedUnivariateSpline',
+    'LSQUnivariateSpline',
+    'BivariateSpline',
+    'LSQBivariateSpline',
+    'SmoothBivariateSpline',
+    'LSQSphereBivariateSpline',
+    'SmoothSphereBivariateSpline',
+    'RectBivariateSpline',
+    'RectSphereBivariateSpline']
+
+
+import warnings
+from threading import Lock
+
+from numpy import zeros, concatenate, ravel, diff, array
+import numpy as np
+
+from . import _fitpack_impl
+from . import _dfitpack as dfitpack
+from scipy._lib._array_api import xp_capabilities
+
+
+dfitpack_int = dfitpack.types.intvar.dtype
+FITPACK_LOCK = Lock()
+
+
+# ############### Univariate spline ####################
+
+_curfit_messages = {1: """
+The required storage space exceeds the available storage space, as
+specified by the parameter nest: nest too small. If nest is already
+large (say nest > m/2), it may also indicate that s is too small.
+The approximation returned is the weighted least-squares spline
+according to the knots t[0],t[1],...,t[n-1]. (n=nest) the parameter fp
+gives the corresponding weighted sum of squared residuals (fp>s).
+""",
+                    2: """
+A theoretically impossible result was found during the iteration
+process for finding a smoothing spline with fp = s: s too small.
+There is an approximation returned but the corresponding weighted sum
+of squared residuals does not satisfy the condition abs(fp-s)/s < tol.""",
+                    3: """
+The maximal number of iterations maxit (set to 20 by the program)
+allowed for finding a smoothing spline with fp=s has been reached: s
+too small.
+There is an approximation returned but the corresponding weighted sum
+of squared residuals does not satisfy the condition abs(fp-s)/s < tol.""",
+                    10: """
+Error on entry, no approximation returned. The following conditions
+must hold:
+xb<=x[0]<x[1]<...<x[m-1]<=xe, w[i]>0, i=0..m-1
+if iopt=-1:
+  xb<t[k+1]<t[k+2]<...<t[n-k-2]<xe"""
+                    }
+
+
+# Accepted `ext` spellings (int code or string alias) mapped to the int code
+_extrap_modes = {0: 0, 'extrapolate': 0,
+                 1: 1, 'zeros': 1,
+                 2: 2, 'raise': 2,
+                 3: 3, 'const': 3}
+
+
+@xp_capabilities(out_of_scope=True)
+class UnivariateSpline:
+    """
+    1-D smoothing spline fit to a given set of data points.
+
+    .. legacy:: class
+
+        Specifically, we recommend using `make_splrep` instead.
+
+    Fits a spline y = spl(x) of degree `k` to the provided `x`, `y` data.  `s`
+    specifies the number of knots by specifying a smoothing condition.
+
+    Parameters
+    ----------
+    x : (N,) array_like
+        1-D array of independent input data. Must be increasing;
+        must be strictly increasing if `s` is 0.
+    y : (N,) array_like
+        1-D array of dependent input data, of the same length as `x`.
+    w : (N,) array_like, optional
+        Weights for spline fitting.  Must be positive.  If `w` is None,
+        weights are all 1. Default is None.
+    bbox : (2,) array_like, optional
+        2-sequence specifying the boundary of the approximation interval. If
+        `bbox` is None, ``bbox=[x[0], x[-1]]``. Default is None.
+    k : int, optional
+        Degree of the smoothing spline.  Must be 1 <= `k` <= 5.
+        ``k = 3`` is a cubic spline. Default is 3.
+    s : float or None, optional
+        Positive smoothing factor used to choose the number of knots.  Number
+        of knots will be increased until the smoothing condition is satisfied::
+
+            sum((w[i] * (y[i]-spl(x[i])))**2, axis=0) <= s
+
+        However, because of numerical issues, the actual condition is::
+
+            abs(sum((w[i] * (y[i]-spl(x[i])))**2, axis=0) - s) < 0.001 * s
+
+        If `s` is None, `s` will be set as `len(w)` for a smoothing spline
+        that uses all data points.
+        If 0, spline will interpolate through all data points. This is
+        equivalent to `InterpolatedUnivariateSpline`.
+        Default is None.
+        The user can use the `s` to control the tradeoff between closeness
+        and smoothness of fit. Larger `s` means more smoothing while smaller
+        values of `s` indicate less smoothing.
+        Recommended values of `s` depend on the weights, `w`. If the weights
+        represent the inverse of the standard-deviation of `y`, then a good
+        `s` value should be found in the range (m-sqrt(2*m),m+sqrt(2*m))
+        where m is the number of datapoints in `x`, `y`, and `w`. This means
+        ``s = len(w)`` should be a good value if ``1/w[i]`` is an
+        estimate of the standard deviation of ``y[i]``.
+    ext : int or str, optional
+        Controls the extrapolation mode for elements
+        not in the interval defined by the knot sequence.
+
+        * if ext=0 or 'extrapolate', return the extrapolated value.
+        * if ext=1 or 'zeros', return 0
+        * if ext=2 or 'raise', raise a ValueError
+        * if ext=3 or 'const', return the boundary value.
+
+        Default is 0.
+
+    check_finite : bool, optional
+        Whether to check that the input arrays contain only finite numbers.
+        Disabling may give a performance gain, but may result in problems
+        (crashes, non-termination or non-sensical results) if the inputs
+        do contain infinities or NaNs.
+        Default is False.
+
+    See Also
+    --------
+    BivariateSpline :
+        a base class for bivariate splines.
+    SmoothBivariateSpline :
+        a smoothing bivariate spline through the given points
+    LSQBivariateSpline :
+        a bivariate spline using weighted least-squares fitting
+    RectSphereBivariateSpline :
+        a bivariate spline over a rectangular mesh on a sphere
+    SmoothSphereBivariateSpline :
+        a smoothing bivariate spline in spherical coordinates
+    LSQSphereBivariateSpline :
+        a bivariate spline in spherical coordinates using weighted
+        least-squares fitting
+    RectBivariateSpline :
+        a bivariate spline over a rectangular mesh
+    InterpolatedUnivariateSpline :
+        an interpolating univariate spline for a given set of data points.
+    bisplrep :
+        a function to find a bivariate B-spline representation of a surface
+    bisplev :
+        a function to evaluate a bivariate B-spline and its derivatives
+    splrep :
+        a function to find the B-spline representation of a 1-D curve
+    splev :
+        a function to evaluate a B-spline or its derivatives
+    sproot :
+        a function to find the roots of a cubic B-spline
+    splint :
+        a function to evaluate the definite integral of a B-spline between two
+        given points
+    spalde :
+        a function to evaluate all derivatives of a B-spline
+
+    Notes
+    -----
+    The number of data points must be larger than the spline degree `k`.
+
+    **NaN handling**: If the input arrays contain ``nan`` values, the result
+    is not useful, since the underlying spline fitting routines cannot deal
+    with ``nan``. A workaround is to use zero weights for not-a-number
+    data points:
+
+    >>> import numpy as np
+    >>> from scipy.interpolate import UnivariateSpline
+    >>> x, y = np.array([1, 2, 3, 4]), np.array([1, np.nan, 3, 4])
+    >>> w = np.isnan(y)
+    >>> y[w] = 0.
+    >>> spl = UnivariateSpline(x, y, w=~w)
+
+    Notice the need to replace a ``nan`` by a numerical value (precise value
+    does not matter as long as the corresponding weight is zero.)
+
+    References
+    ----------
+    Based on algorithms described in [1]_, [2]_, [3]_, and [4]_:
+
+    .. [1] P. Dierckx, "An algorithm for smoothing, differentiation and
+       integration of experimental data using spline functions",
+       J.Comp.Appl.Maths 1 (1975) 165-184.
+    .. [2] P. Dierckx, "A fast algorithm for smoothing data on a rectangular
+       grid while using spline functions", SIAM J.Numer.Anal. 19 (1982)
+       1286-1304.
+    .. [3] P. Dierckx, "An improved algorithm for curve fitting with spline
+       functions", report tw54, Dept. Computer Science,K.U. Leuven, 1981.
+    .. [4] P. Dierckx, "Curve and surface fitting with splines", Monographs on
+       Numerical Analysis, Oxford University Press, 1993.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.interpolate import UnivariateSpline
+    >>> rng = np.random.default_rng()
+    >>> x = np.linspace(-3, 3, 50)
+    >>> y = np.exp(-x**2) + 0.1 * rng.standard_normal(50)
+    >>> plt.plot(x, y, 'ro', ms=5)
+
+    Use the default value for the smoothing parameter:
+
+    >>> spl = UnivariateSpline(x, y)
+    >>> xs = np.linspace(-3, 3, 1000)
+    >>> plt.plot(xs, spl(xs), 'g', lw=3)
+
+    Manually change the amount of smoothing:
+
+    >>> spl.set_smoothing_factor(0.5)
+    >>> plt.plot(xs, spl(xs), 'b', lw=3)
+    >>> plt.show()
+
+    """
+
+    def __init__(self, x, y, w=None, bbox=[None]*2, k=3, s=None,
+                 ext=0, check_finite=False):
+        # Inputs are converted to arrays and checked; `ext` becomes an int code.
+        x, y, w, bbox, self.ext = self.validate_input(x, y, w, bbox, k, s, ext,
+                                                      check_finite)
+        # FITPACK is called under the module-level lock; its result tuple is
+        # _data == x,y,w,xb,xe,k,s,n,t,c,fp,fpint,nrdata,ier
+        with FITPACK_LOCK:
+            data = dfitpack.fpcurf0(x, y, k, w=w, xb=bbox[0],
+                                    xe=bbox[1], s=s)
+        if data[-1] == 1:
+            # nest too small, setting to maximum bound
+            data = self._reset_nest(data)
+        self._data = data
+        self._reset_class()
+
+    @staticmethod
+    def validate_input(x, y, w, bbox, k, s, ext, check_finite):
+        """Convert inputs to arrays, validate them, and map `ext` to its int code.
+
+        Returns ``(x, y, w, bbox, ext)``; raises ValueError on invalid input.
+        """
+        x, y, bbox = np.asarray(x), np.asarray(y), np.asarray(bbox)
+        w = None if w is None else np.asarray(w)
+        if check_finite:
+            weights_ok = True if w is None else np.isfinite(w).all()
+            if not (np.isfinite(x).all() and np.isfinite(y).all()
+                    and weights_ok):
+                raise ValueError("x and y array must not contain "
+                                 "NaNs or infs.")
+        smoothing = s is None or s > 0
+        if smoothing and not np.all(diff(x) >= 0.0):
+            raise ValueError("x must be increasing if s > 0")
+        if not smoothing and not np.all(diff(x) > 0.0):
+            raise ValueError("x must be strictly increasing if s = 0")
+        if x.size != y.size:
+            raise ValueError("x and y should have a same length")
+        if w is not None and w.size != x.size:
+            raise ValueError("x, y, and w should have a same length")
+        if bbox.shape != (2,):
+            raise ValueError("bbox shape should be (2,)")
+        if not (1 <= k <= 5):
+            raise ValueError("k should be 1 <= k <= 5")
+        if s is not None and not s >= 0.0:
+            raise ValueError("s should be s >= 0.0")
+        try:
+            ext_code = _extrap_modes[ext]
+        except KeyError as err:
+            raise ValueError(f"Unknown extrapolation mode {ext}.") from err
+        return x, y, w, bbox, ext_code
+
+    @classmethod
+    def _from_tck(cls, tck, ext=0):
+        """Build a spline directly from ``(t, c, k)``, bypassing `__init__`."""
+        spline = cls.__new__(cls)
+        knots, coeffs, degree = tck
+        spline._eval_args = tck
+        # _data == x,y,w,xb,xe,k,s,n,t,c,fp,fpint,nrdata,ier (only k,n,t,c known)
+        spline._data = ((None,) * 5 + (degree, None, len(knots), knots, coeffs)
+                        + (None,) * 4)
+        spline.ext = ext
+        return spline
+
+    def _reset_class(self):
+        """Refresh `_eval_args` from `_data` and react to the FITPACK `ier` flag.
+
+        ier == 0  : fit met the smoothing condition, abs(fp-s)/s <= tol with
+                    tol a relative tolerance set to 0.001 by the program.
+        ier == -1 : the spline returned is an interpolating spline.
+        ier == -2 : the spline returned is the weighted least-squares
+                    polynomial of degree k; fp is then the upper bound fp0
+                    for the smoothing factor s.
+        otherwise : a warning is issued (for ier == 1 the class is also
+                    switched to LSQUnivariateSpline first).
+        """
+        fit = self._data
+        degree, n_knots, status = fit[5], fit[7], fit[-1]
+        self._eval_args = fit[8][:n_knots], fit[9][:n_knots], degree
+        if status == 0:
+            return
+        if status == -1:
+            self._set_class(InterpolatedUnivariateSpline)
+        elif status in (-2, 1):
+            self._set_class(LSQUnivariateSpline)
+        if status not in (-1, -2):
+            warnings.warn(_curfit_messages.get(status, f'ier={status}'), stacklevel=3)
+
+    def _set_class(self, cls):
+        """Record `cls` in `_spline_class`; adopt it only for built-in classes."""
+        self._spline_class = cls
+        builtin_classes = (UnivariateSpline, InterpolatedUnivariateSpline,
+                           LSQUnivariateSpline)
+        # An unknown (user) subclass keeps its own class. cf. #731
+        if self.__class__ in builtin_classes:
+            self.__class__ = cls
+
+    def _reset_nest(self, data, nest=None):
+        """Re-run fpcurf1 with work arrays resized to `nest` (default m+k+1).
+
+        `data` uses the `_data` layout (n in slot 7, fp in slot 10); raises
+        ValueError if `nest` is smaller than the current number of knots n.
+        """
+        n, fp = data[7], data[10]
+        if nest is None:
+            nest = len(data[0]) + data[5] + 1  # m+k+1, the maximum bound for nest
+        elif not n <= nest:
+            raise ValueError("`nest` can only be increased")
+        t, c, fpint, nrdata = (np.resize(data[j], nest) for j in (8, 9, 11, 12))
+        args = data[:8] + (t, c, fp, fpint, nrdata, data[13])
+        with FITPACK_LOCK:
+            return dfitpack.fpcurf1(*args)
+
+    def set_smoothing_factor(self, s):
+        """ Continue spline computation with the given smoothing
+        factor s and with the knots found at the last call.
+
+        This routine modifies the spline in place. For an LSQ spline with
+        fixed knots (stored ``s == -1``) it only warns and changes nothing.
+        """
+        data = self._data
+        if data[6] == -1:
+            # Adjacent literals are concatenated: keep the space after "for".
+            warnings.warn('smoothing factor unchanged for '
+                          'LSQ spline with fixed knots', stacklevel=2)
+            return
+        args = data[:6] + (s,) + data[7:]  # swap only the `s` slot of _data
+        with FITPACK_LOCK:
+            data = dfitpack.fpcurf1(*args)
+        if data[-1] == 1:
+            # nest too small, setting to maximum bound
+            data = self._reset_nest(data)
+        self._data = data
+        self._reset_class()
+
+    def __call__(self, x, nu=0, ext=None):
+        """
+        Evaluate spline (or its nu-th derivative) at positions x.
+
+        Parameters
+        ----------
+        x : array_like
+            A 1-D array of points at which to return the value of the smoothed
+            spline or its derivatives. Note: `x` can be unordered but the
+            evaluation is more efficient if `x` is (partially) ordered.
+        nu : int, optional
+            The order of derivative of the spline to compute. Default is 0.
+        ext : int or str, optional
+            Controls the value returned for elements of `x` not in the
+            interval defined by the knot sequence.
+
+            * if ext=0 or 'extrapolate', return the extrapolated value.
+            * if ext=1 or 'zeros', return 0
+            * if ext=2 or 'raise', raise a ValueError
+            * if ext=3 or 'const', return the boundary value.
+
+            If None (default), the mode stored at initialization
+            (``self.ext``) is used.
+
+        """
+        x = np.asarray(x)
+        # empty input yields an empty 1-D array (the input shape is not kept)
+        if x.size == 0:
+            return array([])
+        if ext is None:
+            ext = self.ext
+        else:
+            try:
+                ext = _extrap_modes[ext]
+            except KeyError as e:
+                raise ValueError(f"Unknown extrapolation mode {ext}.") from e
+        with FITPACK_LOCK:
+            return _fitpack_impl.splev(x, self._eval_args, der=nu, ext=ext)
+
+    def get_knots(self):
+        """ Return the knot vector with its repeated boundary knots removed.
+
+        The slice ``t[k:n-k]`` drops the first and last ``k`` stored knots.
+        """
+        fit = self._data
+        degree, n_knots, knots = fit[5], fit[7], fit[8]
+        return knots[degree:n_knots - degree]
+
+    def get_coeffs(self):
+        """Return spline coefficients (the first ``n-k-1`` stored values)."""
+        fit = self._data
+        n_coeffs = fit[7] - fit[5] - 1
+        return fit[9][:n_coeffs]
+
+    def get_residual(self):
+        """Return the weighted sum of squared residuals ``fp`` of the fit.
+
+        This is equivalent to::
+
+            sum((w[i] * (y[i]-spl(x[i])))**2, axis=0)
+        """
+        fp = self._data[10]
+        return fp
+
+    def integral(self, a, b):
+        """ Return definite integral of the spline between two given points.
+
+        Parameters
+        ----------
+        a : float
+            Lower limit of integration.
+        b : float
+            Upper limit of integration.
+
+        Returns
+        -------
+        integral : float
+            The value of the definite integral of the spline between limits.
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> from scipy.interpolate import UnivariateSpline
+        >>> x = np.linspace(0, 3, 11)
+        >>> y = x**2
+        >>> spl = UnivariateSpline(x, y)
+        >>> spl.integral(0, 3)
+        9.0
+
+        which agrees with :math:`\\int x^2 dx = x^3 / 3` between the limits
+        of 0 and 3.
+
+        A caveat is that this routine assumes the spline to be zero outside of
+        the data limits:
+
+        >>> spl.integral(-1, 4)
+        9.0
+        >>> spl.integral(-1, 0)
+        0.0
+
+        """
+        with FITPACK_LOCK:
+            return _fitpack_impl.splint(a, b, self._eval_args)
+
+    def derivatives(self, x):
+        """ Return all derivatives of the spline at the point x.
+
+        Parameters
+        ----------
+        x : float
+            The point to evaluate the derivatives at.
+
+        Returns
+        -------
+        der : ndarray, shape(k+1,)
+            Derivatives of the orders 0 to k.
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> from scipy.interpolate import UnivariateSpline
+        >>> x = np.linspace(0, 3, 11)
+        >>> y = x**2
+        >>> spl = UnivariateSpline(x, y)
+        >>> spl.derivatives(1.5)
+        array([2.25, 3.0, 2.0, 0])
+
+        """
+        with FITPACK_LOCK:
+            return _fitpack_impl.spalde(x, self._eval_args)
+
+    def roots(self):
+        """ Return the zeros of the spline.
+
+        Notes
+        -----
+        Restriction: only cubic splines are supported by FITPACK. For non-cubic
+        splines, use `PPoly.roots` (see below for an example).
+
+        Examples
+        --------
+
+        For some data, this method may miss a root. This happens when one of
+        the spline knots (which FITPACK places automatically) happens to
+        coincide with the true root. A workaround is to convert to `PPoly`,
+        which uses a different root-finding algorithm.
+
+        For example,
+
+        >>> x = [1.96, 1.97, 1.98, 1.99, 2.00, 2.01, 2.02, 2.03, 2.04, 2.05]
+        >>> y = [-6.365470e-03, -4.790580e-03, -3.204320e-03, -1.607270e-03,
+        ...      4.440892e-16,  1.616930e-03,  3.243000e-03,  4.877670e-03,
+        ...      6.520430e-03,  8.170770e-03]
+        >>> from scipy.interpolate import UnivariateSpline
+        >>> spl = UnivariateSpline(x, y, s=0)
+        >>> spl.roots()
+        array([], dtype=float64)
+
+        Converting to a PPoly object does find the roots at `x=2`:
+
+        >>> from scipy.interpolate import splrep, PPoly
+        >>> tck = splrep(x, y, s=0)
+        >>> ppoly = PPoly.from_spline(tck)
+        >>> ppoly.roots(extrapolate=False)
+        array([2.])
+
+        See Also
+        --------
+        sproot
+        PPoly.roots
+
+        """
+        k = self._data[5]
+        if k == 3:
+            t = self._eval_args[0]
+            mest = 3 * (len(t) - 7)  # at most 3 roots per cubic piece
+            with FITPACK_LOCK:
+                return _fitpack_impl.sproot(self._eval_args, mest=mest)
+        raise NotImplementedError('finding roots unsupported for '
+                                  'non-cubic splines')
+
+    def derivative(self, n=1):
+        """
+        Construct a new spline representing the derivative of this spline.
+
+        Parameters
+        ----------
+        n : int, optional
+            Order of derivative to evaluate. Default: 1
+
+        Returns
+        -------
+        spline : UnivariateSpline
+            Spline of order k2=k-n representing the derivative of this
+            spline.
+
+        See Also
+        --------
+        splder, antiderivative
+
+        Notes
+        -----
+
+        .. versionadded:: 0.13.0
+
+        Examples
+        --------
+        This can be used for finding maxima of a curve:
+
+        >>> import numpy as np
+        >>> from scipy.interpolate import UnivariateSpline
+        >>> x = np.linspace(0, 10, 70)
+        >>> y = np.sin(x)
+        >>> spl = UnivariateSpline(x, y, k=4, s=0)
+
+        Now, differentiate the spline and find the zeros of the
+        derivative. (NB: `sproot` only works for order 3 splines, so we
+        fit an order 4 spline):
+
+        >>> spl.derivative().roots() / np.pi
+        array([ 0.50000001,  1.5       ,  2.49999998])
+
+        This agrees well with roots :math:`\\pi/2 + n\\pi` of
+        :math:`\\cos(x) = \\sin'(x)`.
+
+        """
+        with FITPACK_LOCK:
+            tck = _fitpack_impl.splder(self._eval_args, n)
+        # if self.ext is 'const', derivative.ext will be 'zeros'
+        ext = 1 if self.ext == 3 else self.ext
+        return UnivariateSpline._from_tck(tck, ext=ext)
+
+    def antiderivative(self, n=1):
+        """
+        Construct a new spline representing the antiderivative of this spline.
+
+        Parameters
+        ----------
+        n : int, optional
+            Order of antiderivative to evaluate. Default: 1
+
+        Returns
+        -------
+        spline : UnivariateSpline
+            Spline of order k2=k+n representing the antiderivative of this
+            spline.
+
+        Notes
+        -----
+
+        .. versionadded:: 0.13.0
+
+        See Also
+        --------
+        splantider, derivative
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> from scipy.interpolate import UnivariateSpline
+        >>> x = np.linspace(0, np.pi/2, 70)
+        >>> y = 1 / np.sqrt(1 - 0.8*np.sin(x)**2)
+        >>> spl = UnivariateSpline(x, y, s=0)
+
+        The derivative is the inverse operation of the antiderivative,
+        although some floating point error accumulates:
+
+        >>> spl(1.7), spl.antiderivative().derivative()(1.7)
+        (array(2.1565429877197317), array(2.1565429877201865))
+
+        Antiderivative can be used to evaluate definite integrals:
+
+        >>> ispl = spl.antiderivative()
+        >>> ispl(np.pi/2) - ispl(0)
+        2.2572053588768486
+
+        This is indeed an approximation to the complete elliptic integral
+        :math:`K(m) = \\int_0^{\\pi/2} [1 - m\\sin^2 x]^{-1/2} dx`:
+
+        >>> from scipy.special import ellipk
+        >>> ellipk(0.8)
+        2.2572053268208538
+
+        """
+        with FITPACK_LOCK:
+            tck = _fitpack_impl.splantider(self._eval_args, n)
+        return UnivariateSpline._from_tck(tck, self.ext)
+
+
+@xp_capabilities(out_of_scope=True)
+class InterpolatedUnivariateSpline(UnivariateSpline):
+    """
+    1-D interpolating spline for a given set of data points.
+
+    .. legacy:: class
+
+        Specifically, we recommend using `make_interp_spline` instead.
+
+    Fits a spline y = spl(x) of degree `k` to the provided `x`, `y` data.
+    Spline function passes through all provided points. Equivalent to
+    `UnivariateSpline` with  `s` = 0.
+
+    Parameters
+    ----------
+    x : (N,) array_like
+        Input dimension of data points -- must be strictly increasing
+    y : (N,) array_like
+        input dimension of data points
+    w : (N,) array_like, optional
+        Weights for spline fitting.  Must be positive.  If None (default),
+        weights are all 1.
+    bbox : (2,) array_like, optional
+        2-sequence specifying the boundary of the approximation interval. If
+        None (default), ``bbox=[x[0], x[-1]]``.
+    k : int, optional
+        Degree of the smoothing spline.  Must be ``1 <= k <= 5``. Default is
+        ``k = 3``, a cubic spline.
+    ext : int or str, optional
+        Controls the extrapolation mode for elements
+        not in the interval defined by the knot sequence.
+
+        * if ext=0 or 'extrapolate', return the extrapolated value.
+        * if ext=1 or 'zeros', return 0
+        * if ext=2 or 'raise', raise a ValueError
+        * if ext=3 or 'const', return the boundary value.
+
+        The default value is 0.
+
+    check_finite : bool, optional
+        Whether to check that the input arrays contain only finite numbers.
+        Disabling may give a performance gain, but may result in problems
+        (crashes, non-termination or non-sensical results) if the inputs
+        do contain infinities or NaNs.
+        Default is False.
+
+    See Also
+    --------
+    UnivariateSpline :
+        a smooth univariate spline to fit a given set of data points.
+    LSQUnivariateSpline :
+        a spline for which knots are user-selected
+    SmoothBivariateSpline :
+        a smoothing bivariate spline through the given points
+    LSQBivariateSpline :
+        a bivariate spline using weighted least-squares fitting
+    splrep :
+        a function to find the B-spline representation of a 1-D curve
+    splev :
+        a function to evaluate a B-spline or its derivatives
+    sproot :
+        a function to find the roots of a cubic B-spline
+    splint :
+        a function to evaluate the definite integral of a B-spline between two
+        given points
+    spalde :
+        a function to evaluate all derivatives of a B-spline
+
+    Notes
+    -----
+    The number of data points must be larger than the spline degree `k`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.interpolate import InterpolatedUnivariateSpline
+    >>> rng = np.random.default_rng()
+    >>> x = np.linspace(-3, 3, 50)
+    >>> y = np.exp(-x**2) + 0.1 * rng.standard_normal(50)
+    >>> spl = InterpolatedUnivariateSpline(x, y)
+    >>> plt.plot(x, y, 'ro', ms=5)
+    >>> xs = np.linspace(-3, 3, 1000)
+    >>> plt.plot(xs, spl(xs), 'g', lw=3, alpha=0.7)
+    >>> plt.show()
+
+    Notice that the ``spl(x)`` interpolates `y`:
+
+    >>> spl.get_residual()
+    0.0
+
+    """
+
+    def __init__(self, x, y, w=None, bbox=[None]*2, k=3,
+                 ext=0, check_finite=False):
+        # s=None: validate_input allows ties in x, so strictness is checked below
+        x, y, w, bbox, self.ext = self.validate_input(x, y, w, bbox, k, None,
+                                            ext, check_finite)
+        if not np.all(diff(x) > 0.0):
+            raise ValueError('x must be strictly increasing')
+        # The fit is always requested with s=0; the result tuple is
+        # _data == x,y,w,xb,xe,k,s,n,t,c,fp,fpint,nrdata,ier
+        with FITPACK_LOCK:
+            self._data = dfitpack.fpcurf0(x, y, k, w=w, xb=bbox[0],
+                                          xe=bbox[1], s=0)
+        self._reset_class()
+
+
+_fpchec_error_string = """The input parameters have been rejected by fpchec. \
+This means that at least one of the following conditions is violated:
+
+1) k+1 <= n-k-1 <= m
+2) t(1) <= t(2) <= ... <= t(k+1)
+   t(n-k) <= t(n-k+1) <= ... <= t(n)
+3) t(k+1) < t(k+2) < ... < t(n-k)
+4) t(k+1) <= x(i) <= t(n-k)
+5) The conditions specified by Schoenberg and Whitney must hold
+   for at least one subset of data points, i.e., there must be a
+   subset of data points y(j) such that
+       t(j) < y(j) < t(j+k+1), j=1,2,...,n-k-1
+"""
+
+
+@xp_capabilities(out_of_scope=True)
+class LSQUnivariateSpline(UnivariateSpline):
+    """
+    1-D spline with explicit internal knots.
+
+    .. legacy:: class
+
+        Specifically, we recommend using `make_lsq_spline` instead.
+
+
+    Fits a spline y = spl(x) of degree `k` to the provided `x`, `y` data.  `t`
+    specifies the internal knots of the spline
+
+    Parameters
+    ----------
+    x : (N,) array_like
+        Input dimension of data points -- must be increasing
+    y : (N,) array_like
+        Input dimension of data points
+    t : (M,) array_like
+        interior knots of the spline.  Must be in ascending order and::
+
+            bbox[0] < t[0] < ... < t[-1] < bbox[-1]
+
+    w : (N,) array_like, optional
+        weights for spline fitting. Must be positive. If None (default),
+        weights are all 1.
+    bbox : (2,) array_like, optional
+        2-sequence specifying the boundary of the approximation interval. If
+        None (default), ``bbox = [x[0], x[-1]]``.
+    k : int, optional
+        Degree of the smoothing spline.  Must be 1 <= `k` <= 5.
+        Default is `k` = 3, a cubic spline.
+    ext : int or str, optional
+        Controls the extrapolation mode for elements
+        not in the interval defined by the knot sequence.
+
+        * if ext=0 or 'extrapolate', return the extrapolated value.
+        * if ext=1 or 'zeros', return 0
+        * if ext=2 or 'raise', raise a ValueError
+        * if ext=3 or 'const', return the boundary value.
+
+        The default value is 0.
+
+    check_finite : bool, optional
+        Whether to check that the input arrays contain only finite numbers.
+        Disabling may give a performance gain, but may result in problems
+        (crashes, non-termination or non-sensical results) if the inputs
+        do contain infinities or NaNs.
+        Default is False.
+
+    Raises
+    ------
+    ValueError
+        If the interior knots do not satisfy the Schoenberg-Whitney conditions
+
+    See Also
+    --------
+    UnivariateSpline :
+        a smooth univariate spline to fit a given set of data points.
+    InterpolatedUnivariateSpline :
+        an interpolating univariate spline for a given set of data points.
+    splrep :
+        a function to find the B-spline representation of a 1-D curve
+    splev :
+        a function to evaluate a B-spline or its derivatives
+    sproot :
+        a function to find the roots of a cubic B-spline
+    splint :
+        a function to evaluate the definite integral of a B-spline between two
+        given points
+    spalde :
+        a function to evaluate all derivatives of a B-spline
+
+    Notes
+    -----
+    The number of data points must be larger than the spline degree `k`.
+
+    Knots `t` must satisfy the Schoenberg-Whitney conditions,
+    i.e., there must be a subset of data points ``x[j]`` such that
+    ``t[j] < x[j] < t[j+k+1]``, for ``j=0, 1,...,n-k-2``.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.interpolate import LSQUnivariateSpline, UnivariateSpline
+    >>> import matplotlib.pyplot as plt
+    >>> rng = np.random.default_rng()
+    >>> x = np.linspace(-3, 3, 50)
+    >>> y = np.exp(-x**2) + 0.1 * rng.standard_normal(50)
+
+    Fit a smoothing spline with a pre-defined internal knots:
+
+    >>> t = [-1, 0, 1]
+    >>> spl = LSQUnivariateSpline(x, y, t)
+
+    >>> xs = np.linspace(-3, 3, 1000)
+    >>> plt.plot(x, y, 'ro', ms=5)
+    >>> plt.plot(xs, spl(xs), 'g-', lw=3)
+    >>> plt.show()
+
+    Check the knot vector:
+
+    >>> spl.get_knots()
+    array([-3., -1., 0., 1., 3.])
+
+    Constructing lsq spline using the knots from another spline:
+
+    >>> x = np.arange(10)
+    >>> s = UnivariateSpline(x, x, s=0)
+    >>> s.get_knots()
+    array([ 0.,  2.,  3.,  4.,  5.,  6.,  7.,  9.])
+    >>> knt = s.get_knots()
+    >>> s1 = LSQUnivariateSpline(x, x, knt[1:-1])    # Chop 1st and last knot
+    >>> s1.get_knots()
+    array([ 0.,  2.,  3.,  4.,  5.,  6.,  7.,  9.])
+
+    """
+
+    def __init__(self, x, y, t, w=None, bbox=[None]*2, k=3,
+                 ext=0, check_finite=False):
+        # Validate the inputs, build the full knot vector, then fit.
+        x, y, w, bbox, self.ext = self.validate_input(x, y, w, bbox, k, None,
+                                                      ext, check_finite)
+        if not np.all(diff(x) >= 0.0):
+            raise ValueError('x must be increasing')
+
+        # Interval ends default to the first and last data abscissae.
+        xb, xe = bbox[0], bbox[1]
+        xb = x[0] if xb is None else xb
+        xe = x[-1] if xe is None else xe
+        # Full knot vector: k+1 copies of each end around the interior knots.
+        t = concatenate(([xb]*(k+1), t, [xe]*(k+1)))
+        n = len(t)
+        knot_steps = t[k+1:n-k] - t[k:n-k-1]
+        if not np.all(knot_steps > 0, axis=0):
+            raise ValueError('Interior knots t must satisfy '
+                             'Schoenberg-Whitney conditions')
+        with FITPACK_LOCK:
+            if dfitpack.fpchec(x, t, k) != 0:
+                raise ValueError(_fpchec_error_string)
+            data = dfitpack.fpcurfm1(x, y, k, t, w=w, xb=xb, xe=xe)
+        # _data == x,y,w,xb,xe,k,s,n,t,c,fp,fpint,nrdata,ier
+        self._data = data[:-3] + (None, None, data[-1])
+        self._reset_class()
+
+
+# ############### Bivariate spline ####################
+
+class _BivariateSplineBase:
+    """ Base class for Bivariate spline s(x,y) interpolation on the rectangle
+    [xb,xe] x [yb, ye] calculated from a given set of data points
+    (x,y,z).
+
+    See Also
+    --------
+    bisplrep :
+        a function to find a bivariate B-spline representation of a surface
+    bisplev :
+        a function to evaluate a bivariate B-spline and its derivatives
+    BivariateSpline :
+        a base class for bivariate splines.
+    SphereBivariateSpline :
+        a bivariate spline on a spherical grid
+    """
+
+    @classmethod
+    def _from_tck(cls, tck):
+        """Build an instance from ``(tx, ty, c, kx, ky)``, bypassing ``__init__``."""
+        spline = cls.__new__(cls)
+        if len(tck) != 5:
+            raise ValueError("tck should be a 5 element tuple of tx,"
+                             " ty, c, kx, ky")
+        knots_and_coeffs, orders = tck[:3], tck[3:]
+        spline.tck, spline.degrees = knots_and_coeffs, orders
+        return spline
+
+    def get_residual(self):
+        """Return the weighted sum of squared residuals of the fit:
+        ``sum((w[i] * (z[i] - s(x[i], y[i])))**2, axis=0)``."""
+        residual = self.fp
+        return residual
+
+    def get_knots(self):
+        """Return the knot positions as a pair ``(tx, ty)`` for the x- and
+        y-variable, respectively. For each knot vector ``t`` of degree
+        ``k``, the interior knots are ``t[k+1:-k-1]``; the additional
+        boundary knots are ``t[:k+1] = b`` and ``t[-k-1:] = e``."""
+        knots = self.tck[:2]
+        return knots
+
+    def get_coeffs(self):
+        """Return the spline coefficients, i.e. the third item of ``tck``."""
+        return self.tck[2]
+
+    def __call__(self, x, y, dx=0, dy=0, grid=True):
+        """
+        Evaluate the spline or its derivatives at given positions.
+
+        Parameters
+        ----------
+        x, y : array_like
+            Input coordinates.
+
+            If `grid` is False, evaluate the spline at points ``(x[i],
+            y[i]), i=0, ..., len(x)-1``.  Standard Numpy broadcasting
+            is obeyed.
+
+            If `grid` is True: evaluate spline at the grid points
+            defined by the coordinate arrays x, y. The arrays must be
+            sorted in non-decreasing order, else ``ValueError`` is raised.
+
+            The ordering of axes is consistent with
+            ``np.meshgrid(..., indexing="ij")`` and inconsistent with the
+            default ordering ``np.meshgrid(..., indexing="xy")``.
+        dx : int, optional
+            Order of x-derivative
+
+            .. versionadded:: 0.14.0
+        dy : int, optional
+            Order of y-derivative
+
+            .. versionadded:: 0.14.0
+        grid : bool, optional
+            Whether to evaluate the results on a grid spanned by the
+            input arrays, or at points specified by the input arrays.
+
+            .. versionadded:: 0.14.0
+
+        Examples
+        --------
+        Suppose that we want to bilinearly interpolate an exponentially decaying
+        function in 2 dimensions.
+
+        >>> import numpy as np
+        >>> from scipy.interpolate import RectBivariateSpline
+
+        We sample the function on a coarse grid. Note that the default indexing="xy"
+        of meshgrid would result in an unexpected (transposed) result after
+        interpolation.
+
+        >>> xarr = np.linspace(-3, 3, 100)
+        >>> yarr = np.linspace(-3, 3, 100)
+        >>> xgrid, ygrid = np.meshgrid(xarr, yarr, indexing="ij")
+
+        The function to interpolate decays faster along one axis than the other.
+
+        >>> zdata = np.exp(-np.sqrt((xgrid / 2) ** 2 + ygrid**2))
+
+        Next we sample on a finer grid using interpolation (kx=ky=1 for bilinear).
+
+        >>> rbs = RectBivariateSpline(xarr, yarr, zdata, kx=1, ky=1)
+        >>> xarr_fine = np.linspace(-3, 3, 200)
+        >>> yarr_fine = np.linspace(-3, 3, 200)
+        >>> xgrid_fine, ygrid_fine = np.meshgrid(xarr_fine, yarr_fine, indexing="ij")
+        >>> zdata_interp = rbs(xgrid_fine, ygrid_fine, grid=False)
+
+        And check that the result agrees with the input by plotting both.
+
+        >>> import matplotlib.pyplot as plt
+        >>> fig = plt.figure()
+        >>> ax1 = fig.add_subplot(1, 2, 1, aspect="equal")
+        >>> ax2 = fig.add_subplot(1, 2, 2, aspect="equal")
+        >>> ax1.imshow(zdata)
+        >>> ax2.imshow(zdata_interp)
+        >>> plt.show()
+        """
+        x = np.asarray(x)
+        y = np.asarray(y)
+
+        tx, ty, c = self.tck[:3]
+        kx, ky = self.degrees
+        if grid:
+            if x.size == 0 or y.size == 0:
+                return np.zeros((x.size, y.size), dtype=c.dtype)
+            # Coordinates must be sorted; repeated values are accepted.
+            if (x.size >= 2) and (not np.all(np.diff(x) >= 0.0)):
+                raise ValueError("x must be non-decreasing when `grid` is True")
+            if (y.size >= 2) and (not np.all(np.diff(y) >= 0.0)):
+                raise ValueError("y must be non-decreasing when `grid` is True")
+
+            if dx or dy:
+                with FITPACK_LOCK:
+                    z, ier = dfitpack.parder(tx, ty, c, kx, ky, dx, dy, x, y)
+                if ier != 0:
+                    raise ValueError(f"Error code returned by parder: {ier}")
+            else:
+                with FITPACK_LOCK:
+                    z, ier = dfitpack.bispev(tx, ty, c, kx, ky, x, y)
+                if ier != 0:
+                    raise ValueError(f"Error code returned by bispev: {ier}")
+        else:
+            # standard Numpy broadcasting
+            if x.shape != y.shape:
+                x, y = np.broadcast_arrays(x, y)
+
+            shape = x.shape
+            x = x.ravel()
+            y = y.ravel()
+
+            if x.size == 0 or y.size == 0:
+                return np.zeros(shape, dtype=c.dtype)
+
+            if dx or dy:
+                with FITPACK_LOCK:
+                    z, ier = dfitpack.pardeu(tx, ty, c, kx, ky, dx, dy, x, y)
+                if ier != 0:
+                    raise ValueError(f"Error code returned by pardeu: {ier}")
+            else:
+                with FITPACK_LOCK:
+                    z, ier = dfitpack.bispeu(tx, ty, c, kx, ky, x, y)
+                if ier != 0:
+                    raise ValueError(f"Error code returned by bispeu: {ier}")
+
+            z = z.reshape(shape)
+        return z
+
+    def partial_derivative(self, dx, dy):
+        """Return a new spline that represents a partial derivative of this
+        one.
+
+        Parameters
+        ----------
+        dx, dy : int
+            Derivative orders along x and y. Each must be a non-negative
+            integer that is smaller than the degree of this spline in the
+            same direction (``kx`` for `dx`, ``ky`` for `dy`).
+
+        Returns
+        -------
+        spline :
+            A new spline of degrees (``kx - dx``, ``ky - dy``) for the
+            derivative; ``self`` is returned when ``dx == dy == 0``.
+
+        Notes
+        -----
+
+        .. versionadded:: 1.9.0
+
+        """
+        if dx == 0 and dy == 0:
+            return self
+
+        kx, ky = self.degrees
+        if not (dx >= 0 and dy >= 0):
+            raise ValueError("order of derivative must be positive or"
+                             " zero")
+        if not (dx < kx and dy < ky):
+            raise ValueError("order of derivative must be less than"
+                             " degree of spline")
+
+        tx, ty, c = self.tck[:3]
+        with FITPACK_LOCK:
+            newc, ier = dfitpack.pardtc(tx, ty, c, kx, ky, dx, dy)
+        if ier != 0:
+            # This should not happen under normal conditions.
+            raise ValueError(f"Unexpected error code returned by pardtc: {ier}")
+
+        # Differentiating dx times trims dx knots off each end of tx (same for ty).
+        nx, ny = len(tx), len(ty)
+        trimmed_tx = tx[dx:nx - dx]
+        trimmed_ty = ty[dy:ny - dy]
+        n_coeffs = (nx - dx - kx - 1) * (ny - dy - ky - 1)
+        derived = (trimmed_tx, trimmed_ty, newc[:n_coeffs], kx - dx, ky - dy)
+        return _DerivedBivariateSpline._from_tck(derived)
+
+
+_surfit_messages = {1: """
+The required storage space exceeds the available storage space: nxest
+or nyest too small, or s too small.
+The weighted least-squares spline corresponds to the current set of
+knots.""",
+                    2: """
+A theoretically impossible result was found during the iteration
+process for finding a smoothing spline with fp = s: s too small or
+badly chosen eps.
+Weighted sum of squared residuals does not satisfy abs(fp-s)/s < tol.""",
+                    3: """
+the maximal number of iterations maxit (set to 20 by the program)
+allowed for finding a smoothing spline with fp=s has been reached:
+s too small.
+Weighted sum of squared residuals does not satisfy abs(fp-s)/s < tol.
+Try increasing maxit by passing it as a keyword argument.""",
+                    4: """
+No more knots can be added because the number of b-spline coefficients
+(nx-kx-1)*(ny-ky-1) already exceeds the number of data points m:
+either s or m too small.
+The weighted least-squares spline corresponds to the current set of
+knots.""",
+                    5: """
+No more knots can be added because the additional knot would (quasi)
+coincide with an old one: s too small or too large a weight to an
+inaccurate data point.
+The weighted least-squares spline corresponds to the current set of
+knots.""",
+                    10: """
+Error on entry, no approximation returned. The following conditions
+must hold:
+xb<=x[i]<=xe, yb<=y[i]<=ye, w[i]>0, i=0..m-1
+If iopt==-1, then
+  xb<tx[kx+1]<tx[kx+2]<...<tx[nx-kx-2]<xe
+  yb<ty[ky+1]<ty[ky+2]<...<ty[ny-ky-2]<ye""",
+                    -3: """
+The coefficients of the spline returned have been computed as the
+minimal norm least-squares solution of a (numerically) rank deficient
+system (deficiency=%i). If deficiency is large, the results may be
+inaccurate. Deficiency may strongly depend on the value of eps."""
+                    }  # keyed by ``ier``; the -3 entry is a %-format template
+
+
+@xp_capabilities(out_of_scope=True)
+class BivariateSpline(_BivariateSplineBase):
+    """
+    Base class for bivariate splines.
+
+    This describes a spline ``s(x, y)`` of degrees ``kx`` and ``ky`` on
+    the rectangle ``[xb, xe] * [yb, ye]`` calculated from a given set
+    of data points ``(x, y, z)``.
+
+    This class is meant to be subclassed, not instantiated directly.
+    To construct these splines, call either `SmoothBivariateSpline` or
+    `LSQBivariateSpline` or `RectBivariateSpline`.
+
+    See Also
+    --------
+    UnivariateSpline :
+        a smooth univariate spline to fit a given set of data points.
+    SmoothBivariateSpline :
+        a smoothing bivariate spline through the given points
+    LSQBivariateSpline :
+        a bivariate spline using weighted least-squares fitting
+    RectSphereBivariateSpline :
+        a bivariate spline over a rectangular mesh on a sphere
+    SmoothSphereBivariateSpline :
+        a smoothing bivariate spline in spherical coordinates
+    LSQSphereBivariateSpline :
+        a bivariate spline in spherical coordinates using weighted
+        least-squares fitting
+    RectBivariateSpline :
+        a bivariate spline over a rectangular mesh.
+    bisplrep :
+        a function to find a bivariate B-spline representation of a surface
+    bisplev :
+        a function to evaluate a bivariate B-spline and its derivatives
+    """
+
+    def ev(self, xi, yi, dx=0, dy=0):
+        """
+        Evaluate the spline, or one of its derivatives, point by point.
+
+        Returns the interpolated value at ``(xi[i], yi[i]),
+        i=0,...,len(xi)-1``.
+
+        Parameters
+        ----------
+        xi, yi : array_like
+            Input coordinates. Standard Numpy broadcasting is obeyed.
+            The ordering of axes is consistent with
+            ``np.meshgrid(..., indexing="ij")`` and inconsistent with the
+            default ordering ``np.meshgrid(..., indexing="xy")``.
+        dx : int, optional
+            Order of the derivative with respect to x. Default is 0.
+
+            .. versionadded:: 0.14.0
+        dy : int, optional
+            Order of the derivative with respect to y. Default is 0.
+
+            .. versionadded:: 0.14.0
+
+        Examples
+        --------
+        Suppose that we want to bilinearly interpolate an exponentially decaying
+        function in 2 dimensions.
+
+        >>> import numpy as np
+        >>> from scipy.interpolate import RectBivariateSpline
+        >>> def f(x, y):
+        ...     return np.exp(-np.sqrt((x / 2) ** 2 + y**2))
+
+        We sample the function on a coarse grid and set up the interpolator. Note that
+        the default ``indexing="xy"`` of meshgrid would result in an unexpected
+        (transposed) result after interpolation.
+
+        >>> xarr = np.linspace(-3, 3, 21)
+        >>> yarr = np.linspace(-3, 3, 21)
+        >>> xgrid, ygrid = np.meshgrid(xarr, yarr, indexing="ij")
+        >>> zdata = f(xgrid, ygrid)
+        >>> rbs = RectBivariateSpline(xarr, yarr, zdata, kx=1, ky=1)
+
+        Next we sample the function along a diagonal slice through the coordinate space
+        on a finer grid using interpolation.
+
+        >>> xinterp = np.linspace(-3, 3, 201)
+        >>> yinterp = np.linspace(3, -3, 201)
+        >>> zinterp = rbs.ev(xinterp, yinterp)
+
+        And check that the interpolation passes through the function evaluations as a
+        function of the distance from the origin along the slice.
+
+        >>> import matplotlib.pyplot as plt
+        >>> fig = plt.figure()
+        >>> ax1 = fig.add_subplot(1, 1, 1)
+        >>> ax1.plot(np.sqrt(xarr**2 + yarr**2), np.diag(zdata), "or")
+        >>> ax1.plot(np.sqrt(xinterp**2 + yinterp**2), zinterp, "-b")
+        >>> plt.show()
+        """
+        return self(xi, yi, dx=dx, dy=dy, grid=False)
+
+    def integral(self, xa, xb, ya, yb):
+        """
+        Integrate the spline over the rectangle ``[xa, xb] x [ya, yb]``.
+
+        Parameters
+        ----------
+        xa, xb : float
+            The end-points of the integration interval along x.
+        ya, yb : float
+            The end-points of the integration interval along y.
+
+        Returns
+        -------
+        integ : float
+            The value of the resulting integral.
+        """
+        tx, ty, c = self.tck[:3]
+        kx, ky = self.degrees
+        with FITPACK_LOCK:
+            integ = dfitpack.dblint(tx, ty, c, kx, ky, xa, xb, ya, yb)
+        return integ
+
+    @staticmethod
+    def _validate_input(x, y, z, w, kx, ky, eps):
+        # Shared argument checks; returns x, y, z (and w, if given) as arrays.
+        x, y, z = (np.asarray(arr) for arr in (x, y, z))
+        if not x.size == y.size == z.size:
+            raise ValueError('x, y, and z should have a same length')
+        if w is not None:
+            w = np.asarray(w)
+            if x.size != w.size:
+                raise ValueError('x, y, z, and w should have a same length')
+            if not np.all(w >= 0.0):
+                raise ValueError('w should be positive')
+        if eps is not None and not (0.0 < eps < 1.0):
+            raise ValueError('eps should be between (0, 1)')
+        if not x.size >= (kx + 1) * (ky + 1):
+            raise ValueError('The length of x, y and z should be at least'
+                             ' (kx+1) * (ky+1)')
+        return x, y, z, w
+
+
+class _DerivedBivariateSpline(_BivariateSplineBase):
+    """Bivariate spline built from the knots and coefficients of another
+    spline.
+
+    Notes
+    -----
+    Instances are not meant to be created directly from the data to be
+    interpolated or smoothed. For that reason the inherited ``fp``
+    attribute and ``get_residual`` method are overridden so that
+    ``AttributeError`` is raised whenever they are accessed.
+
+    The other inherited attributes can be used as usual.
+    """
+    _invalid_why = ("is unavailable, because _DerivedBivariateSpline"
+                    " instance is not constructed from data that are to be"
+                    " interpolated or smoothed, but derived from the"
+                    " underlying knots and coefficients of another spline"
+                    " object")
+
+    @property
+    def fp(self):
+        raise AttributeError('attribute "fp" ' + self._invalid_why)
+
+    def get_residual(self):
+        raise AttributeError('method "get_residual" ' + self._invalid_why)
+
+
+@xp_capabilities(out_of_scope=True)
+class SmoothBivariateSpline(BivariateSpline):
+    """
+    Smooth bivariate spline approximation.
+
+    Parameters
+    ----------
+    x, y, z : array_like
+        1-D sequences of data points (order is not important).
+    w : array_like, optional
+        Positive 1-D sequence of weights, of same length as `x`, `y` and `z`.
+    bbox : array_like, optional
+        Sequence of length 4 specifying the boundary of the rectangular
+        approximation domain.  By default,
+        ``bbox=[min(x), max(x), min(y), max(y)]``.
+    kx, ky : ints, optional
+        Degrees of the bivariate spline. Default is 3.
+    s : float, optional
+        Positive smoothing factor defined for estimation condition:
+        ``sum((w[i]*(z[i]-s(x[i], y[i])))**2, axis=0) <= s``
+        Default ``s=len(w)`` which should be a good value if ``1/w[i]`` is an
+        estimate of the standard deviation of ``z[i]``.
+    eps : float, optional
+        A threshold for determining the effective rank of an over-determined
+        linear system of equations. `eps` should have a value within the open
+        interval ``(0, 1)``, the default is 1e-16.
+
+    See Also
+    --------
+    BivariateSpline :
+        a base class for bivariate splines.
+    UnivariateSpline :
+        a smooth univariate spline to fit a given set of data points.
+    LSQBivariateSpline :
+        a bivariate spline using weighted least-squares fitting
+    RectSphereBivariateSpline :
+        a bivariate spline over a rectangular mesh on a sphere
+    SmoothSphereBivariateSpline :
+        a smoothing bivariate spline in spherical coordinates
+    LSQSphereBivariateSpline :
+        a bivariate spline in spherical coordinates using weighted
+        least-squares fitting
+    RectBivariateSpline :
+        a bivariate spline over a rectangular mesh
+    bisplrep :
+        a function to find a bivariate B-spline representation of a surface
+    bisplev :
+        a function to evaluate a bivariate B-spline and its derivatives
+
+    Notes
+    -----
+    The length of `x`, `y` and `z` should be at least ``(kx+1) * (ky+1)``.
+
+    If the input data is such that input dimensions have incommensurate
+    units and differ by many orders of magnitude, the interpolant may have
+    numerical artifacts. Consider rescaling the data before interpolating.
+
+    This routine constructs spline knot vectors automatically via the FITPACK
+    algorithm. The spline knots may be placed away from the data points. For
+    some data sets, this routine may fail to construct an interpolating spline,
+    even if one is requested via ``s=0`` parameter. In such situations, it is
+    recommended to use `bisplrep` / `bisplev` directly instead of this routine
+    and, if needed, increase the values of ``nxest`` and ``nyest`` parameters
+    of `bisplrep`.
+
+    For linear interpolation, `LinearNDInterpolator` is preferred.
+    Consult the :ref:`interp-transition-guide` for discussion.
+
+    """
+
+    def __init__(self, x, y, z, w=None, bbox=[None] * 4, kx=3, ky=3, s=None,
+                 eps=1e-16):
+        # Validate the arguments, run the dfitpack fit, then store the result.
+        x, y, z, w = self._validate_input(x, y, z, w, kx, ky, eps)
+        bbox = ravel(bbox)
+        if not bbox.shape == (4,):
+            raise ValueError('bbox shape should be (4,)')
+        if s is not None and not s >= 0.0:
+            raise ValueError("s should be s >= 0.0")
+
+        xb, xe, yb, ye = bbox
+        with FITPACK_LOCK:
+            nx, tx, ny, ty, c, fp, wrk1, ier = dfitpack.surfit_smth(
+                x, y, z, w, xb, xe, yb, ye, kx, ky, s=s, eps=eps, lwrk2=1)
+            if ier > 10:          # lwrk2 was too small, re-run
+                nx, tx, ny, ty, c, fp, wrk1, ier = dfitpack.surfit_smth(
+                    x, y, z, w, xb, xe, yb, ye, kx, ky, s=s, eps=eps,
+                    lwrk2=ier)
+        if ier < -2:  # rank-deficient fit: fill in the -3 message template
+            deficiency = (nx-kx-1)*(ny-ky-1)+ier
+            warnings.warn(_surfit_messages.get(-3) % (deficiency), stacklevel=2)
+        elif ier > 0:  # 0, -1 and -2 are normal returns and stay silent
+            warnings.warn(_surfit_messages.get(ier, f'ier={ier}'), stacklevel=2)
+
+        self.fp = fp
+        self.tck = tx[:nx], ty[:ny], c[:(nx-kx-1)*(ny-ky-1)]
+        self.degrees = kx, ky
+
+
+@xp_capabilities(out_of_scope=True)
+class LSQBivariateSpline(BivariateSpline):
+    """
+    Weighted least-squares bivariate spline approximation.
+
+    Parameters
+    ----------
+    x, y, z : array_like
+        1-D sequences of data points (order is not important).
+    tx, ty : array_like
+        Strictly ordered 1-D sequences of knots coordinates.
+    w : array_like, optional
+        Positive 1-D array of weights, of the same length as `x`, `y` and `z`.
+    bbox : (4,) array_like, optional
+        Sequence of length 4 specifying the boundary of the rectangular
+        approximation domain.  By default,
+        ``bbox=[min(x,tx),max(x,tx), min(y,ty),max(y,ty)]``.
+    kx, ky : ints, optional
+        Degrees of the bivariate spline. Default is 3.
+    eps : float, optional
+        A threshold for determining the effective rank of an over-determined
+        linear system of equations. `eps` should have a value within the open
+        interval ``(0, 1)``, the default is 1e-16.
+
+    See Also
+    --------
+    BivariateSpline :
+        a base class for bivariate splines.
+    UnivariateSpline :
+        a smooth univariate spline to fit a given set of data points.
+    SmoothBivariateSpline :
+        a smoothing bivariate spline through the given points
+    RectSphereBivariateSpline :
+        a bivariate spline over a rectangular mesh on a sphere
+    SmoothSphereBivariateSpline :
+        a smoothing bivariate spline in spherical coordinates
+    LSQSphereBivariateSpline :
+        a bivariate spline in spherical coordinates using weighted
+        least-squares fitting
+    RectBivariateSpline :
+        a bivariate spline over a rectangular mesh.
+    bisplrep :
+        a function to find a bivariate B-spline representation of a surface
+    bisplev :
+        a function to evaluate a bivariate B-spline and its derivatives
+
+    Notes
+    -----
+    The length of `x`, `y` and `z` should be at least ``(kx+1) * (ky+1)``.
+
+    If the input data is such that input dimensions have incommensurate
+    units and differ by many orders of magnitude, the interpolant may have
+    numerical artifacts. Consider rescaling the data before interpolating.
+
+    """
+
+    def __init__(self, x, y, z, tx, ty, w=None, bbox=[None]*4, kx=3, ky=3,
+                 eps=None):
+        # Check the arguments before anything is handed to the Fortran code.
+        x, y, z, w = self._validate_input(x, y, z, w, kx, ky, eps)
+        bbox = ravel(bbox)
+        if bbox.shape != (4,):
+            raise ValueError('bbox shape should be (4,)')
+
+        # Total knot counts: the interior knots plus k+1 slots at either end.
+        nx = 2*kx+2+len(tx)
+        ny = 2*ky+2+len(ty)
+        # The Fortran subroutine "surfit" (called as dfitpack.surfit_lsq)
+        # requires both knot arrays to be "real array(s) of dimension nmax",
+        # where "nmax" is the greater of nx and ny.  The arrays are therefore
+        # allocated at that size, the interior knots are written into their
+        # final positions, and the result is sliced back to nx / ny below.
+        nmax = max(nx, ny)
+        tx1, ty1 = zeros((nmax,), float), zeros((nmax,), float)
+        tx1[kx+1:nx-kx-1] = tx
+        ty1[ky+1:ny-ky-1] = ty
+
+        xb, xe, yb, ye = bbox
+        with FITPACK_LOCK:
+            tx1, ty1, c, fp, ier = dfitpack.surfit_lsq(
+                x, y, z, nx, tx1, ny, ty1, w, xb, xe, yb, ye, kx, ky, eps, lwrk2=1)
+            if ier > 10:
+                # ier > 10 is fed back as lwrk2 for a second attempt
+                tx1, ty1, c, fp, ier = dfitpack.surfit_lsq(
+                    x, y, z, nx, tx1, ny, ty1, w, xb, xe, yb, ye, kx, ky, eps,
+                    lwrk2=ier)
+
+        if ier not in (0, -1, -2):  # anything else is reported as a warning
+            if ier < -2:
+                deficiency = (nx-kx-1)*(ny-ky-1)+ier
+                message = _surfit_messages.get(-3) % (deficiency)
+            else:
+                message = _surfit_messages.get(ier, f'ier={ier}')
+            warnings.warn(message, stacklevel=2)
+
+        # Drop the padding that was added to reach nmax.
+        self.fp = fp
+        self.tck = tx1[:nx], ty1[:ny], c
+        self.degrees = kx, ky
+
+
+@xp_capabilities(out_of_scope=True)
+class RectBivariateSpline(BivariateSpline):
+    """
+    Bivariate spline approximation over a rectangular mesh.
+
+    Can be used for both smoothing and interpolating data.
+
+    Parameters
+    ----------
+    x,y : array_like
+        1-D arrays of coordinates in strictly ascending order.
+        Evaluated points outside the data range will be extrapolated.
+    z : array_like
+        2-D array of data with shape (x.size,y.size).
+    bbox : array_like, optional
+        Sequence of length 4 specifying the boundary of the rectangular
+        approximation domain, which means the start and end spline knots of
+        each dimension are set by these values. By default,
+        ``bbox=[min(x), max(x), min(y), max(y)]``.
+    kx, ky : ints, optional
+        Degrees of the bivariate spline. Default is 3.
+    s : float, optional
+        Positive smoothing factor defined for estimation condition:
+        ``sum((z[i]-f(x[i], y[i]))**2, axis=0) <= s`` where f is a spline
+        function. Default is ``s=0``, which is for interpolation.
+    maxit : int, optional
+        The maximal number of iterations maxit allowed for finding a
+        smoothing spline with fp=s. Default is ``maxit=20``.
+
+    See Also
+    --------
+    BivariateSpline :
+        a base class for bivariate splines.
+    UnivariateSpline :
+        a smooth univariate spline to fit a given set of data points.
+    SmoothBivariateSpline :
+        a smoothing bivariate spline through the given points
+    LSQBivariateSpline :
+        a bivariate spline using weighted least-squares fitting
+    RectSphereBivariateSpline :
+        a bivariate spline over a rectangular mesh on a sphere
+    SmoothSphereBivariateSpline :
+        a smoothing bivariate spline in spherical coordinates
+    LSQSphereBivariateSpline :
+        a bivariate spline in spherical coordinates using weighted
+        least-squares fitting
+    bisplrep :
+        a function to find a bivariate B-spline representation of a surface
+    bisplev :
+        a function to evaluate a bivariate B-spline and its derivatives
+
+    Notes
+    -----
+
+    If the input data is such that input dimensions have incommensurate
+    units and differ by many orders of magnitude, the interpolant may have
+    numerical artifacts. Consider rescaling the data before interpolating.
+
+    """
+
+    def __init__(self, x, y, z, bbox=[None] * 4, kx=3, ky=3, s=0, maxit=20):
+        # Flatten x, y, bbox; z stays unflattened until its shape is checked.
+        x, y, bbox = ravel(x), ravel(y), ravel(bbox)
+        z = np.asarray(z)
+        if not np.all(diff(x) > 0.0):
+            raise ValueError('x must be strictly increasing')
+        if not np.all(diff(y) > 0.0):
+            raise ValueError('y must be strictly increasing')
+        if x.size != z.shape[0]:
+            raise ValueError('x dimension of z must have same number of '
+                             'elements as x')
+        if y.size != z.shape[1]:
+            raise ValueError('y dimension of z must have same number of '
+                             'elements as y')
+        if bbox.shape != (4,):
+            raise ValueError('bbox shape should be (4,)')
+        if s is not None and not s >= 0.0:
+            raise ValueError("s should be s >= 0.0")
+
+        xb, xe, yb, ye = bbox
+        with FITPACK_LOCK:
+            nx, tx, ny, ty, c, fp, ier = dfitpack.regrid_smth(
+                x, y, ravel(z), xb, xe, yb, ye, kx, ky, s, maxit)
+
+        if ier not in (0, -1, -2):  # any other return code is an error
+            raise ValueError(_surfit_messages.get(ier, f'ier={ier}'))
+
+        self.fp = fp
+        ncoef = (nx - kx - 1) * (ny - ky - 1)
+        self.tck = tx[:nx], ty[:ny], c[:ncoef]
+        self.degrees = kx, ky
+
+
+_spherefit_messages = _surfit_messages.copy()  # entries 10 and -3 are replaced below
+_spherefit_messages[10] = """
+ERROR. On entry, the input data are controlled on validity. The following
+       restrictions must be satisfied:
+            -1<=iopt<=1,  m>=2, ntest>=8 ,npest >=8, 0<eps<1,
+            0<=teta(i)<=pi, 0<=phi(i)<=2*pi, w(i)>0, i=1,...,m
+            lwrk1 >= 185+52*v+10*u+14*u*v+8*(u-1)*v**2+8*m
+            kwrk >= m+(ntest-7)*(npest-7)
+            if iopt=-1: 8<=nt<=ntest , 9<=np<=npest
+                        0<tt(5)<tt(6)<...<tt(nt-4)<pi
+                        0<tp(5)<tp(6)<...<tp(np-4)<2*pi
+            if iopt>=0: s>=0
+            if one of these conditions is found to be violated,control
+            is immediately repassed to the calling program. in that
+            case there is no approximation returned."""
+_spherefit_messages[-3] = """
+WARNING. The coefficients of the spline returned have been computed as the
+         minimal norm least-squares solution of a (numerically) rank
+         deficient system (deficiency=%i, rank=%i). Especially if the rank
+         deficiency, which is computed by 6+(nt-8)*(np-7)+ier, is large,
+         the results may be inaccurate. They could also seriously depend on
+         the value of eps."""
+
+
+@xp_capabilities(out_of_scope=True)
+class SphereBivariateSpline(_BivariateSplineBase):
+    """
+    Bivariate spline s(x,y) of degrees 3 on a sphere, calculated from a
+    given set of data points (theta,phi,r).
+
+    .. versionadded:: 0.11.0
+
+    See Also
+    --------
+    bisplrep :
+        a function to find a bivariate B-spline representation of a surface
+    bisplev :
+        a function to evaluate a bivariate B-spline and its derivatives
+    UnivariateSpline :
+        a smooth univariate spline to fit a given set of data points.
+    SmoothBivariateSpline :
+        a smoothing bivariate spline through the given points
+    LSQUnivariateSpline :
+        a univariate spline using weighted least-squares fitting
+    """
+
+    def __call__(self, theta, phi, dtheta=0, dphi=0, grid=True):
+        """
+        Evaluate the spline or its derivatives at given positions.
+
+        Parameters
+        ----------
+        theta, phi : array_like
+            Input coordinates; `theta` must lie within ``[0, pi]``.
+
+            If `grid` is False, evaluate the spline at points
+            ``(theta[i], phi[i]), i=0, ..., len(theta)-1``.  Standard
+            Numpy broadcasting is obeyed.
+
+            If `grid` is True: evaluate spline at the grid points
+            defined by the coordinate arrays theta, phi. The arrays
+            must be sorted to increasing order.
+            The ordering of axes is consistent with
+            ``np.meshgrid(..., indexing="ij")`` and inconsistent with the
+            default ordering ``np.meshgrid(..., indexing="xy")``.
+        dtheta : int, optional
+            Order of theta-derivative
+
+            .. versionadded:: 0.14.0
+        dphi : int, optional
+            Order of phi-derivative
+
+            .. versionadded:: 0.14.0
+        grid : bool, optional
+            Whether to evaluate the results on a grid spanned by the
+            input arrays, or at points specified by the input arrays.
+
+            .. versionadded:: 0.14.0
+
+        Examples
+        --------
+
+        Suppose that we want to use splines to interpolate a bivariate function on a
+        sphere. The value of the function is known on a grid of longitudes and
+        colatitudes.
+
+        >>> import numpy as np
+        >>> from scipy.interpolate import RectSphereBivariateSpline
+        >>> def f(theta, phi):
+        ...     return np.sin(theta) * np.cos(phi)
+
+        We evaluate the function on the grid. Note that the default indexing="xy"
+        of meshgrid would result in an unexpected (transposed) result after
+        interpolation.
+
+        >>> thetaarr = np.linspace(0, np.pi, 22)[1:-1]
+        >>> phiarr = np.linspace(0, 2 * np.pi, 21)[:-1]
+        >>> thetagrid, phigrid = np.meshgrid(thetaarr, phiarr, indexing="ij")
+        >>> zdata = f(thetagrid, phigrid)
+
+        We next set up the interpolator and use it to evaluate the function
+        on a finer grid.
+
+        >>> rsbs = RectSphereBivariateSpline(thetaarr, phiarr, zdata)
+        >>> thetaarr_fine = np.linspace(0, np.pi, 200)
+        >>> phiarr_fine = np.linspace(0, 2 * np.pi, 200)
+        >>> zdata_fine = rsbs(thetaarr_fine, phiarr_fine)
+
+        Finally we plot the coarsely-sampled input data alongside the
+        finely-sampled interpolated data to check that they agree.
+
+        >>> import matplotlib.pyplot as plt
+        >>> fig = plt.figure()
+        >>> ax1 = fig.add_subplot(1, 2, 1)
+        >>> ax2 = fig.add_subplot(1, 2, 2)
+        >>> ax1.imshow(zdata)
+        >>> ax2.imshow(zdata_fine)
+        >>> plt.show()
+        """
+        theta, phi = np.asarray(theta), np.asarray(phi)
+        # Only a non-empty request has extrema to compare with [0, pi].
+        if theta.size > 0:
+            if theta.min() < 0. or theta.max() > np.pi:
+                raise ValueError("requested theta out of bounds.")
+
+        return _BivariateSplineBase.__call__(
+            self, theta, phi, dx=dtheta, dy=dphi, grid=grid)
+
+    def ev(self, theta, phi, dtheta=0, dphi=0):
+        """
+        Evaluate the spline at points.
+
+        Returns the interpolated value at ``(theta[i], phi[i]),
+        i=0,...,len(theta)-1``.
+
+        Parameters
+        ----------
+        theta, phi : array_like
+            Input coordinates. Standard Numpy broadcasting is obeyed.
+            The ordering of axes is consistent with
+            ``np.meshgrid(..., indexing="ij")`` and inconsistent with the
+            default ordering ``np.meshgrid(..., indexing="xy")``.
+        dtheta : int, optional
+            Order of theta-derivative
+
+            .. versionadded:: 0.14.0
+        dphi : int, optional
+            Order of phi-derivative
+
+            .. versionadded:: 0.14.0
+
+        Examples
+        --------
+        Suppose that we want to use splines to interpolate a bivariate function on a
+        sphere. The value of the function is known on a grid of longitudes and
+        colatitudes.
+
+        >>> import numpy as np
+        >>> from scipy.interpolate import RectSphereBivariateSpline
+        >>> def f(theta, phi):
+        ...     return np.sin(theta) * np.cos(phi)
+
+        We evaluate the function on the grid. Note that the default indexing="xy"
+        of meshgrid would result in an unexpected (transposed) result after
+        interpolation.
+
+        >>> thetaarr = np.linspace(0, np.pi, 22)[1:-1]
+        >>> phiarr = np.linspace(0, 2 * np.pi, 21)[:-1]
+        >>> thetagrid, phigrid = np.meshgrid(thetaarr, phiarr, indexing="ij")
+        >>> zdata = f(thetagrid, phigrid)
+
+        We next set up the interpolator and use it to evaluate the function
+        at points not on the original grid.
+
+        >>> rsbs = RectSphereBivariateSpline(thetaarr, phiarr, zdata)
+        >>> thetainterp = np.linspace(thetaarr[0], thetaarr[-1], 200)
+        >>> phiinterp = np.linspace(phiarr[0], phiarr[-1], 200)
+        >>> zinterp = rsbs.ev(thetainterp, phiinterp)
+
+        Finally we plot the original data for a diagonal slice through the
+        initial grid, and the spline approximation along the same slice.
+
+        >>> import matplotlib.pyplot as plt
+        >>> fig = plt.figure()
+        >>> ax1 = fig.add_subplot(1, 1, 1)
+        >>> ax1.plot(np.sin(thetaarr) * np.sin(phiarr), np.diag(zdata), "or")
+        >>> ax1.plot(np.sin(thetainterp) * np.sin(phiinterp), zinterp, "-b")
+        >>> plt.show()
+        """
+        return self.__call__(theta, phi, dtheta=dtheta, dphi=dphi, grid=False)
+
+
+@xp_capabilities(out_of_scope=True)
+class SmoothSphereBivariateSpline(SphereBivariateSpline):
+    """
+    Smooth bivariate spline approximation in spherical coordinates.
+
+    .. versionadded:: 0.11.0
+
+    Parameters
+    ----------
+    theta, phi, r : array_like
+        1-D sequences of data points (order is not important). Coordinates
+        must be given in radians. Theta must lie within the interval
+        ``[0, pi]``, and phi must lie within the interval ``[0, 2pi]``.
+    w : array_like, optional
+        Positive 1-D sequence of weights.
+    s : float, optional
+        Positive smoothing factor defined for estimation condition:
+        ``sum((w(i)*(r(i) - s(theta(i), phi(i))))**2, axis=0) <= s``
+        The default is ``s=0``; ``s=len(w)`` should be a good value if
+        ``1/w[i]`` is an estimate of the standard deviation of ``r[i]``.
+    eps : float, optional
+        A threshold for determining the effective rank of an over-determined
+        linear system of equations. `eps` should have a value within the open
+        interval ``(0, 1)``, the default is 1e-16.
+
+    See Also
+    --------
+    BivariateSpline :
+        a base class for bivariate splines.
+    UnivariateSpline :
+        a smooth univariate spline to fit a given set of data points.
+    SmoothBivariateSpline :
+        a smoothing bivariate spline through the given points
+    LSQBivariateSpline :
+        a bivariate spline using weighted least-squares fitting
+    RectSphereBivariateSpline :
+        a bivariate spline over a rectangular mesh on a sphere
+    LSQSphereBivariateSpline :
+        a bivariate spline in spherical coordinates using weighted
+        least-squares fitting
+    RectBivariateSpline :
+        a bivariate spline over a rectangular mesh.
+    bisplrep :
+        a function to find a bivariate B-spline representation of a surface
+    bisplev :
+        a function to evaluate a bivariate B-spline and its derivatives
+
+    Notes
+    -----
+    For more information, see the FITPACK_ site about this function.
+
+    .. _FITPACK: http://www.netlib.org/dierckx/sphere.f
+
+    Examples
+    --------
+    Suppose we have global data on a coarse grid (the input data does not
+    have to be on a grid):
+
+    >>> import numpy as np
+    >>> theta = np.linspace(0., np.pi, 7)
+    >>> phi = np.linspace(0., 2*np.pi, 9)
+    >>> data = np.empty((theta.shape[0], phi.shape[0]))
+    >>> data[:,0], data[0,:], data[-1,:] = 0., 0., 0.
+    >>> data[1:-1,1], data[1:-1,-1] = 1., 1.
+    >>> data[1,1:-1], data[-2,1:-1] = 1., 1.
+    >>> data[2:-2,2], data[2:-2,-2] = 2., 2.
+    >>> data[2,2:-2], data[-3,2:-2] = 2., 2.
+    >>> data[3,3:-2] = 3.
+    >>> data = np.roll(data, 4, 1)
+
+    We need to set up the interpolator object
+
+    >>> lats, lons = np.meshgrid(theta, phi)
+    >>> from scipy.interpolate import SmoothSphereBivariateSpline
+    >>> lut = SmoothSphereBivariateSpline(lats.ravel(), lons.ravel(),
+    ...                                   data.T.ravel(), s=3.5)
+
+    As a first test, we'll see what the algorithm returns when run on the
+    input coordinates
+
+    >>> data_orig = lut(theta, phi)
+
+    Finally we interpolate the data to a finer grid
+
+    >>> fine_lats = np.linspace(0., np.pi, 70)
+    >>> fine_lons = np.linspace(0., 2 * np.pi, 90)
+
+    >>> data_smth = lut(fine_lats, fine_lons)
+
+    >>> import matplotlib.pyplot as plt
+    >>> fig = plt.figure()
+    >>> ax1 = fig.add_subplot(131)
+    >>> ax1.imshow(data, interpolation='nearest')
+    >>> ax2 = fig.add_subplot(132)
+    >>> ax2.imshow(data_orig, interpolation='nearest')
+    >>> ax3 = fig.add_subplot(133)
+    >>> ax3.imshow(data_smth, interpolation='nearest')
+    >>> plt.show()
+
+    """
+
+    def __init__(self, theta, phi, r, w=None, s=0., eps=1E-16):
+        # Validate the scattered data, fit it with ``dfitpack.spherfit_smth``
+        # and keep the trimmed knots and coefficients of the cubic spline.
+        theta = np.asarray(theta)
+        phi = np.asarray(phi)
+        r = np.asarray(r)
+        if not (0.0 <= theta).all() or not (theta <= np.pi).all():
+            raise ValueError('theta should be between [0, pi]')
+        if not (0.0 <= phi).all() or not (phi <= 2.0 * np.pi).all():
+            raise ValueError('phi should be between [0, 2pi]')
+        if w is not None:
+            w = np.asarray(w)
+            if not (w >= 0.0).all():
+                raise ValueError('w should be positive')
+        if not s >= 0.0:
+            raise ValueError('s should be positive')
+        if not 0.0 < eps < 1.0:
+            raise ValueError('eps should be between (0, 1)')
+
+        with FITPACK_LOCK:
+            fit = dfitpack.spherfit_smth(theta, phi, r, w=w, s=s, eps=eps)
+        n_theta, knots_theta, n_phi, knots_phi, coef, fp, ier = fit
+        if ier not in (0, -1, -2):
+            raise ValueError(_spherefit_messages.get(ier, f'ier={ier}'))
+
+        self.fp = fp
+        n_coef = (n_theta - 4) * (n_phi - 4)
+        self.tck = knots_theta[:n_theta], knots_phi[:n_phi], coef[:n_coef]
+        self.degrees = (3, 3)
+
+    def __call__(self, theta, phi, dtheta=0, dphi=0, grid=True):
+        # Reject longitudes outside [0, 2*pi] here; the parent call then
+        # checks theta and performs the evaluation.
+        theta, phi = np.asarray(theta), np.asarray(phi)
+        if phi.size > 0:
+            if phi.min() < 0. or phi.max() > 2. * np.pi:
+                raise ValueError("requested phi out of bounds.")
+
+        return SphereBivariateSpline.__call__(
+            self, theta, phi, dtheta=dtheta, dphi=dphi, grid=grid)
+
+
+@xp_capabilities(out_of_scope=True)
+class LSQSphereBivariateSpline(SphereBivariateSpline):
+    """
+    Weighted least-squares bivariate spline approximation in spherical
+    coordinates.
+
+    Determines a smoothing bicubic spline according to a given
+    set of knots in the `theta` and `phi` directions.
+
+    .. versionadded:: 0.11.0
+
+    Parameters
+    ----------
+    theta, phi, r : array_like
+        1-D sequences of data points (order is not important). Coordinates
+        must be given in radians. Theta must lie within the interval
+        ``[0, pi]``, and phi must lie within the interval ``[0, 2pi]``.
+    tt, tp : array_like
+        Strictly ordered 1-D sequences of knots coordinates.
+        Coordinates must satisfy ``0 < tt[i] < pi``, ``0 < tp[i] < 2*pi``.
+    w : array_like, optional
+        Positive 1-D sequence of weights, of the same length as `theta`, `phi`
+        and `r`.
+    eps : float, optional
+        A threshold for determining the effective rank of an over-determined
+        linear system of equations. `eps` should have a value within the
+        open interval ``(0, 1)``, the default is 1e-16.
+
+    See Also
+    --------
+    BivariateSpline :
+        a base class for bivariate splines.
+    UnivariateSpline :
+        a smooth univariate spline to fit a given set of data points.
+    SmoothBivariateSpline :
+        a smoothing bivariate spline through the given points
+    LSQBivariateSpline :
+        a bivariate spline using weighted least-squares fitting
+    RectSphereBivariateSpline :
+        a bivariate spline over a rectangular mesh on a sphere
+    SmoothSphereBivariateSpline :
+        a smoothing bivariate spline in spherical coordinates
+    RectBivariateSpline :
+        a bivariate spline over a rectangular mesh.
+    bisplrep :
+        a function to find a bivariate B-spline representation of a surface
+    bisplev :
+        a function to evaluate a bivariate B-spline and its derivatives
+
+    Notes
+    -----
+    For more information, see the FITPACK_ site about this function.
+
+    .. _FITPACK: http://www.netlib.org/dierckx/sphere.f
+
+    Examples
+    --------
+    Suppose we have global data on a coarse grid (the input data does not
+    have to be on a grid):
+
+    >>> from scipy.interpolate import LSQSphereBivariateSpline
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+
+    >>> theta = np.linspace(0, np.pi, num=7)
+    >>> phi = np.linspace(0, 2*np.pi, num=9)
+    >>> data = np.empty((theta.shape[0], phi.shape[0]))
+    >>> data[:,0], data[0,:], data[-1,:] = 0., 0., 0.
+    >>> data[1:-1,1], data[1:-1,-1] = 1., 1.
+    >>> data[1,1:-1], data[-2,1:-1] = 1., 1.
+    >>> data[2:-2,2], data[2:-2,-2] = 2., 2.
+    >>> data[2,2:-2], data[-3,2:-2] = 2., 2.
+    >>> data[3,3:-2] = 3.
+    >>> data = np.roll(data, 4, 1)
+
+    We need to set up the interpolator object. Here, we must also specify the
+    coordinates of the knots to use.
+
+    >>> lats, lons = np.meshgrid(theta, phi)
+    >>> knotst, knotsp = theta.copy(), phi.copy()
+    >>> knotst[0] += .0001
+    >>> knotst[-1] -= .0001
+    >>> knotsp[0] += .0001
+    >>> knotsp[-1] -= .0001
+    >>> lut = LSQSphereBivariateSpline(lats.ravel(), lons.ravel(),
+    ...                                data.T.ravel(), knotst, knotsp)
+
+    As a first test, we'll see what the algorithm returns when run on the
+    input coordinates
+
+    >>> data_orig = lut(theta, phi)
+
+    Finally we interpolate the data to a finer grid
+
+    >>> fine_lats = np.linspace(0., np.pi, 70)
+    >>> fine_lons = np.linspace(0., 2*np.pi, 90)
+    >>> data_lsq = lut(fine_lats, fine_lons)
+
+    >>> fig = plt.figure()
+    >>> ax1 = fig.add_subplot(131)
+    >>> ax1.imshow(data, interpolation='nearest')
+    >>> ax2 = fig.add_subplot(132)
+    >>> ax2.imshow(data_orig, interpolation='nearest')
+    >>> ax3 = fig.add_subplot(133)
+    >>> ax3.imshow(data_lsq, interpolation='nearest')
+    >>> plt.show()
+
+    """
+
+    def __init__(self, theta, phi, r, tt, tp, w=None, eps=1E-16):
+        # Validate data and interior knots, pad the knot vectors with the
+        # boundary knots, then fit with ``dfitpack.spherfit_lsq``.
+        theta = np.asarray(theta)
+        phi = np.asarray(phi)
+        r = np.asarray(r)
+        tt = np.asarray(tt)
+        tp = np.asarray(tp)
+        if not (0.0 <= theta).all() or not (theta <= np.pi).all():
+            raise ValueError('theta should be between [0, pi]')
+        if not (0.0 <= phi).all() or not (phi <= 2*np.pi).all():
+            raise ValueError('phi should be between [0, 2pi]')
+        if not (0.0 < tt).all() or not (tt < np.pi).all():
+            raise ValueError('tt should be between (0, pi)')
+        if not (0.0 < tp).all() or not (tp < 2*np.pi).all():
+            raise ValueError('tp should be between (0, 2pi)')
+        if w is not None:
+            w = np.asarray(w)
+            if not (w >= 0.0).all():
+                raise ValueError('w should be positive')
+        if not 0.0 < eps < 1.0:
+            raise ValueError('eps should be between (0, 1)')
+        # Four boundary knots per side: zeros in front, the interval end behind.
+        knots_t = zeros((8 + len(tt),), float)
+        knots_p = zeros((8 + len(tp),), float)
+        knots_t[4:-4], knots_p[4:-4] = tt, tp
+        knots_t[-4:], knots_p[-4:] = np.pi, 2. * np.pi
+        with FITPACK_LOCK:
+            fit = dfitpack.spherfit_lsq(theta, phi, r, knots_t, knots_p,
+                                        w=w, eps=eps)
+        knots_t, knots_p, coef, fp, ier = fit
+        if ier > 0:
+            raise ValueError(_spherefit_messages.get(ier, f'ier={ier}'))
+        self.fp, self.tck, self.degrees = fp, (knots_t, knots_p, coef), (3, 3)
+
+    def __call__(self, theta, phi, dtheta=0, dphi=0, grid=True):
+        # Reject longitudes outside [0, 2*pi] here; the parent call then
+        # checks theta and performs the evaluation.
+        theta, phi = np.asarray(theta), np.asarray(phi)
+        if phi.size > 0:
+            if phi.min() < 0. or phi.max() > 2. * np.pi:
+                raise ValueError("requested phi out of bounds.")
+
+        return SphereBivariateSpline.__call__(
+            self, theta, phi, dtheta=dtheta, dphi=dphi, grid=grid)
+
+
+_spfit_messages = _surfit_messages.copy()
+_spfit_messages[10] = """
+ERROR: on entry, the input data are controlled on validity
+       the following restrictions must be satisfied.
+          -1<=iopt(1)<=1, 0<=iopt(2)<=1, 0<=iopt(3)<=1,
+          -1<=ider(1)<=1, 0<=ider(2)<=1, ider(2)=0 if iopt(2)=0.
+          -1<=ider(3)<=1, 0<=ider(4)<=1, ider(4)=0 if iopt(3)=0.
+          mu >= mumin (see above), mv >= 4, nuest >=8, nvest >= 8,
+          kwrk>=5+mu+mv+nuest+nvest,
+          lwrk >= 12+nuest*(mv+nvest+3)+nvest*24+4*mu+8*mv+max(nuest,mv+nvest)
+          0< u(i-1)<u(i)< pi,i=2,..,mu,
+          -pi<=v(1)< pi, v(1)<v(i-1)<v(i)<v(1)+2*pi, i=3,...,mv
+          if iopt(1)=-1: 8<=nu<=min(nuest,mu+6+iopt(2)+iopt(3))
+                         0<tu(5)<tu(6)<...<tu(nu-4)< pi
+                         8<=nv<=min(nvest,mv+7)
+                         v(1)<tv(5)<tv(6)<...<tv(nv-4)<v(1)+2*pi
+                         the schoenberg-whitney conditions, i.e. there must be
+                         subset of grid coordinates uu(p) and vv(q) such that
+                            tu(p) < uu(p) < tu(p+4) ,p=1,...,nu-4
+                            (iopt(2)=1 and iopt(3)=1 also count for a uu-value
+                            tv(q) < vv(q) < tv(q+4) ,q=1,...,nv-4
+                            (vv(q) is either a value v(j) or v(j)+2*pi)
+          if iopt(1)>=0: s>=0
+          if s=0: nuest>=mu+6+iopt(2)+iopt(3), nvest>=mv+7
+       if one of these conditions is found to be violated,control is
+       immediately repassed to the calling program. in that case there is no
+       approximation returned."""
+
+
+@xp_capabilities(out_of_scope=True)
+class RectSphereBivariateSpline(SphereBivariateSpline):
+    """
+    Bivariate spline approximation over a rectangular mesh on a sphere.
+
+    Can be used for smoothing data.
+
+    .. versionadded:: 0.11.0
+
+    Parameters
+    ----------
+    u : array_like
+        1-D array of colatitude coordinates in strictly ascending order.
+        Coordinates must be given in radians and lie within the open interval
+        ``(0, pi)``.
+    v : array_like
+        1-D array of longitude coordinates in strictly ascending order.
+        Coordinates must be given in radians. First element (``v[0]``) must lie
+        within the interval ``[-pi, pi)``. Last element (``v[-1]``) must satisfy
+        ``v[-1] <= v[0] + 2*pi``.
+    r : array_like
+        2-D array of data with shape ``(u.size, v.size)``.
+    s : float, optional
+        Positive smoothing factor defined for estimation condition
+        (``s=0`` is for interpolation).
+    pole_continuity : bool or (bool, bool), optional
+        Order of continuity at the poles ``u=0`` (``pole_continuity[0]``) and
+        ``u=pi`` (``pole_continuity[1]``).  The order of continuity at the pole
+        will be 1 or 0 when this is True or False, respectively.
+        Defaults to False.
+    pole_values : float or (float, float), optional
+        Data values at the poles ``u=0`` and ``u=pi``.  Either the whole
+        parameter or each individual element can be None.  Defaults to None.
+    pole_exact : bool or (bool, bool), optional
+        Data value exactness at the poles ``u=0`` and ``u=pi``.  If True, the
+        value is considered to be the right function value, and it will be
+        fitted exactly. If False, the value will be considered to be a data
+        value just like the other data values.  Defaults to False.
+    pole_flat : bool or (bool, bool), optional
+        For the poles at ``u=0`` and ``u=pi``, specify whether or not the
+        approximation has vanishing derivatives.  Defaults to False.
+
+    See Also
+    --------
+    BivariateSpline :
+        a base class for bivariate splines.
+    UnivariateSpline :
+        a smooth univariate spline to fit a given set of data points.
+    SmoothBivariateSpline :
+        a smoothing bivariate spline through the given points
+    LSQBivariateSpline :
+        a bivariate spline using weighted least-squares fitting
+    SmoothSphereBivariateSpline :
+        a smoothing bivariate spline in spherical coordinates
+    LSQSphereBivariateSpline :
+        a bivariate spline in spherical coordinates using weighted
+        least-squares fitting
+    RectBivariateSpline :
+        a bivariate spline over a rectangular mesh.
+    bisplrep :
+        a function to find a bivariate B-spline representation of a surface
+    bisplev :
+        a function to evaluate a bivariate B-spline and its derivatives
+
+    Notes
+    -----
+    Currently, only the smoothing spline approximation (``iopt[0] = 0`` and
+    ``iopt[0] = 1`` in the FITPACK routine) is supported.  The exact
+    least-squares spline approximation is not implemented yet.
+
+    When actually performing the interpolation, the requested `v` values must
+    lie within the same length 2pi interval that the original `v` values were
+    chosen from.
+
+    For more information, see the FITPACK_ site about this function.
+
+    .. _FITPACK: http://www.netlib.org/dierckx/spgrid.f
+
+    Examples
+    --------
+    Suppose we have global data on a coarse grid
+
+    >>> import numpy as np
+    >>> lats = np.linspace(10, 170, 9) * np.pi / 180.
+    >>> lons = np.linspace(0, 350, 18) * np.pi / 180.
+    >>> data = np.dot(np.atleast_2d(90. - np.linspace(-80., 80., 18)).T,
+    ...               np.atleast_2d(180. - np.abs(np.linspace(0., 350., 9)))).T
+
+    We want to interpolate it to a global one-degree grid
+
+    >>> new_lats = np.linspace(1, 180, 180) * np.pi / 180
+    >>> new_lons = np.linspace(1, 360, 360) * np.pi / 180
+    >>> new_lats, new_lons = np.meshgrid(new_lats, new_lons)
+
+    We need to set up the interpolator object
+
+    >>> from scipy.interpolate import RectSphereBivariateSpline
+    >>> lut = RectSphereBivariateSpline(lats, lons, data)
+
+    Finally we interpolate the data.  The `RectSphereBivariateSpline` object
+    only takes 1-D arrays as input, therefore we need to do some reshaping.
+
+    >>> data_interp = lut.ev(new_lats.ravel(),
+    ...                      new_lons.ravel()).reshape((360, 180)).T
+
+    Looking at the original and the interpolated data, one can see that the
+    interpolant reproduces the original data very well:
+
+    >>> import matplotlib.pyplot as plt
+    >>> fig = plt.figure()
+    >>> ax1 = fig.add_subplot(211)
+    >>> ax1.imshow(data, interpolation='nearest')
+    >>> ax2 = fig.add_subplot(212)
+    >>> ax2.imshow(data_interp, interpolation='nearest')
+    >>> plt.show()
+
+    Choosing the optimal value of ``s`` can be a delicate task. Recommended
+    values for ``s`` depend on the accuracy of the data values.  If the user
+    has an idea of the statistical errors on the data, she can also find a
+    proper estimate for ``s``. By assuming that, if she specifies the
+    right ``s``, the interpolator will use a spline ``f(u,v)`` which exactly
+    reproduces the function underlying the data, she can evaluate
+    ``sum((r(i,j)-s(u(i),v(j)))**2)`` to find a good estimate for this ``s``.
+    For example, if she knows that the statistical errors on her
+    ``r(i,j)``-values are not greater than 0.1, she may expect that a good
+    ``s`` should have a value not larger than ``u.size * v.size * (0.1)**2``.
+
+    If nothing is known about the statistical error in ``r(i,j)``, ``s`` must
+    be determined by trial and error.  The best is then to start with a very
+    large value of ``s`` (to determine the least-squares polynomial and the
+    corresponding upper bound ``fp0`` for ``s``) and then to progressively
+    decrease the value of ``s`` (say by a factor 10 in the beginning, i.e.
+    ``s = fp0 / 10, fp0 / 100, ...``  and more carefully as the approximation
+    shows more detail) to obtain closer fits.
+
+    The interpolation results for different values of ``s`` give some insight
+    into this process:
+
+    >>> fig2 = plt.figure()
+    >>> s = [3e9, 2e9, 1e9, 1e8]
+    >>> for idx, sval in enumerate(s, 1):
+    ...     lut = RectSphereBivariateSpline(lats, lons, data, s=sval)
+    ...     data_interp = lut.ev(new_lats.ravel(),
+    ...                          new_lons.ravel()).reshape((360, 180)).T
+    ...     ax = fig2.add_subplot(2, 2, idx)
+    ...     ax.imshow(data_interp, interpolation='nearest')
+    ...     ax.set_title(f"s = {sval:g}")
+    >>> plt.show()
+
+    """
+
+    def __init__(self, u, v, r, s=0., pole_continuity=False, pole_values=None,
+                 pole_exact=False, pole_flat=False):
+        # Parameters are described in the class docstring.  The pole options
+        # become the integer flag arrays `iopt`/`ider` passed to ``dfitpack``.
+        iopt = np.array([0, 0, 0], dtype=dfitpack_int)
+        ider = np.array([-1, 0, -1, 0], dtype=dfitpack_int)
+        # A scalar option applies to both poles.  ``np.isscalar`` also accepts
+        # ints and NumPy scalars, which a plain ``float`` check would reject.
+        if pole_values is None:
+            pole_values = (None, None)
+        elif np.isscalar(pole_values):
+            pole_values = (pole_values, pole_values)
+        if isinstance(pole_continuity, bool | np.bool_):
+            pole_continuity = (pole_continuity, pole_continuity)
+        if isinstance(pole_exact, bool | np.bool_):
+            pole_exact = (pole_exact, pole_exact)
+        if isinstance(pole_flat, bool | np.bool_):
+            pole_flat = (pole_flat, pole_flat)
+
+        r0, r1 = pole_values
+        iopt[1:] = pole_continuity
+        # ider[0] / ider[2] stay at -1 when no value is given for that pole.
+        ider[0] = -1 if r0 is None else pole_exact[0]
+        ider[2] = -1 if r1 is None else pole_exact[1]
+        ider[1], ider[3] = pole_flat
+
+        u, v = np.ravel(u), np.ravel(v)
+        r = np.asarray(r)
+
+        if not (0.0 < u[0] and u[-1] < np.pi):
+            raise ValueError('u should be between (0, pi)')
+        if not -np.pi <= v[0] < np.pi:
+            raise ValueError('v[0] should be between [-pi, pi)')
+        if not v[-1] <= v[0] + 2*np.pi:
+            raise ValueError('v[-1] should be v[0] + 2pi or less ')
+
+        if not np.all(np.diff(u) > 0.0):
+            raise ValueError('u must be strictly increasing')
+        if not np.all(np.diff(v) > 0.0):
+            raise ValueError('v must be strictly increasing')
+
+        if not u.size == r.shape[0]:
+            raise ValueError('u dimension of r must have same number of '
+                             'elements as u')
+        if not v.size == r.shape[1]:
+            raise ValueError('v dimension of r must have same number of '
+                             'elements as v')
+
+        # `pole_flat` is only allowed where `pole_continuity` is set.  Compare
+        # by truthiness: ``is True`` / ``is False`` would silently skip NumPy
+        # booleans and 0/1 flags.
+        if not pole_continuity[1] and pole_flat[1]:
+            raise ValueError('if pole_continuity is False, so must be '
+                             'pole_flat')
+        if not pole_continuity[0] and pole_flat[0]:
+            raise ValueError('if pole_continuity is False, so must be '
+                             'pole_flat')
+
+        if not s >= 0.0:
+            raise ValueError('s should be positive')
+
+        r = np.ravel(r)
+        with FITPACK_LOCK:
+            nu, tu, nv, tv, c, fp, ier = dfitpack.regrid_smth_spher(iopt, ider,
+                                                                    u.copy(),
+                                                                    v.copy(),
+                                                                    r.copy(),
+                                                                    r0, r1, s)
+
+        if ier not in [0, -1, -2]:
+            msg = _spfit_messages.get(ier, f'ier={ier}')
+            raise ValueError(msg)
+
+        self.fp = fp
+        self.tck = tu[:nu], tv[:nv], c[:(nu - 4) * (nv-4)]
+        self.degrees = (3, 3)
+        self.v0 = v[0]
+
+    def __call__(self, theta, phi, dtheta=0, dphi=0, grid=True):
+        # Unlike the scattered-data sphere splines, no range check is applied
+        # to phi here; theta is still checked by the parent implementation.
+        theta, phi = np.asarray(theta), np.asarray(phi)
+
+        return SphereBivariateSpline.__call__(
+            self, theta, phi, dtheta=dtheta, dphi=dphi, grid=grid)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_fitpack_impl.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_fitpack_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..38e54135e201bd54084e485abbf35c8fb3371da7
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_fitpack_impl.py
@@ -0,0 +1,824 @@
+"""
+fitpack (dierckx in netlib) --- A Python-C wrapper to FITPACK (by P. Dierckx).
+        FITPACK is a collection of FORTRAN programs for curve and surface
+        fitting with splines and tensor product splines.
+
+See
+ https://web.archive.org/web/20010524124604/http://www.cs.kuleuven.ac.be:80/cwis/research/nalag/research/topics/fitpack.html
+or
+ http://www.netlib.org/dierckx/
+
+Copyright 2002 Pearu Peterson all rights reserved,
+Pearu Peterson <pearu@cens.ioc.ee>
+Permission to use, modify, and distribute this software is given under the
+terms of the SciPy (BSD style) license. See LICENSE.txt that came with
+this distribution for specifics.
+
+NO WARRANTY IS EXPRESSED OR IMPLIED.  USE AT YOUR OWN RISK.
+
+TODO: Make interfaces to the following fitpack functions:
+    For univariate splines: cocosp, concon, fourco, insert
+    For bivariate splines: profil, regrid, parsur, surev
+"""
+
+__all__ = ['splrep', 'splprep', 'splev', 'splint', 'sproot', 'spalde',
+           'bisplrep', 'bisplev', 'insert', 'splder', 'splantider']
+
+import warnings
+import numpy as np
+from . import _fitpack
+from numpy import (atleast_1d, array, ones, zeros, sqrt, ravel, transpose,
+                   empty, iinfo, asarray)
+
+# Try to replace _fitpack interface with
+#  f2py-generated version
+from . import _dfitpack as dfitpack
+
+from scipy._lib._array_api import array_namespace, concat_1d, xp_capabilities
+
+
+dfitpack_int = dfitpack.types.intvar.dtype
+
+
+def _int_overflow(x, exception, msg=None):
+    """Return `x` converted to the ``dfitpack_int`` scalar type.
+
+    If `x` is larger than the largest value representable by
+    ``dfitpack_int``, raise `exception` instead. The exception carries `msg`,
+    or a generated message when `msg` is None.
+    """
+    largest = iinfo(dfitpack_int).max
+    if not x > largest:
+        return dfitpack_int.type(x)
+    if msg is None:
+        msg = f'{x!r} cannot fit into an {dfitpack_int!r}'
+    raise exception(msg)
+
+
+# Messages for the `ier` status flag, looked up by `splprep` and `splrep` in
+# this module. Each value is a ``[message, exception class]`` pair; the class
+# is None for the non-positive flags. The 'unknown' entry is the fallback
+# those functions use when `ier` is not a key of this dict.
+_iermess = {
+    0: ["The spline has a residual sum of squares fp such that "
+        "abs(fp-s)/s<=0.001", None],
+    -1: ["The spline is an interpolating spline (fp=0)", None],
+    -2: ["The spline is weighted least-squares polynomial of degree k.\n"
+         "fp gives the upper bound fp0 for the smoothing factor s", None],
+    1: ["The required storage space exceeds the available storage space.\n"
+        "Probable causes: data (x,y) size is too small or smoothing parameter"
+        "\ns is too small (fp>s).", ValueError],
+    2: ["A theoretically impossible result when finding a smoothing spline\n"
+        "with fp = s. Probable cause: s too small. (abs(fp-s)/s>0.001)",
+        ValueError],
+    3: ["The maximal number of iterations (20) allowed for finding smoothing\n"
+        "spline with fp=s has been reached. Probable cause: s too small.\n"
+        "(abs(fp-s)/s>0.001)", ValueError],
+    10: ["Error on input data", ValueError],
+    'unknown': ["An error occurred", TypeError]
+}
+
+# Messages for the `ier` status flag, looked up by `bisplrep` in this module.
+# Same layout as `_iermess`: each value is a ``[message, exception class]``
+# pair, with None as the class for the non-positive flags and 'unknown' as
+# the fallback entry.
+_iermess2 = {
+    0: ["The spline has a residual sum of squares fp such that "
+        "abs(fp-s)/s<=0.001", None],
+    -1: ["The spline is an interpolating spline (fp=0)", None],
+    -2: ["The spline is weighted least-squares polynomial of degree kx and ky."
+         "\nfp gives the upper bound fp0 for the smoothing factor s", None],
+    -3: ["Warning. The coefficients of the spline have been computed as the\n"
+         "minimal norm least-squares solution of a rank deficient system.",
+         None],
+    1: ["The required storage space exceeds the available storage space.\n"
+        "Probable causes: nxest or nyest too small or s is too small. (fp>s)",
+        ValueError],
+    2: ["A theoretically impossible result when finding a smoothing spline\n"
+        "with fp = s. Probable causes: s too small or badly chosen eps.\n"
+        "(abs(fp-s)/s>0.001)", ValueError],
+    3: ["The maximal number of iterations (20) allowed for finding smoothing\n"
+        "spline with fp=s has been reached. Probable cause: s too small.\n"
+        "(abs(fp-s)/s>0.001)", ValueError],
+    4: ["No more knots can be added because the number of B-spline\n"
+        "coefficients already exceeds the number of data points m.\n"
+        "Probable causes: either s or m too small. (fp>s)", ValueError],
+    5: ["No more knots can be added because the additional knot would\n"
+        "coincide with an old one. Probable cause: s too small or too large\n"
+        "a weight to an inaccurate data point. (fp>s)", ValueError],
+    10: ["Error on input data", ValueError],
+    11: ["rwrk2 too small, i.e., there is not enough workspace for computing\n"
+         "the minimal least-squares solution of a rank deficient system of\n"
+         "linear equations.", ValueError],
+    'unknown': ["An error occurred", TypeError]
+}
+
+# Module-level cache for `splprep`: knots ('t'), work arrays ('wrk', 'iwrk'),
+# parameter values ('u') and the parameter interval end points ('ub', 'ue').
+# These are the entries `splprep` passes to and stores back from
+# `_fitpack._parcur`.
+_parcur_cache = {'t': array([], float), 'wrk': array([], float),
+                 'iwrk': array([], dfitpack_int), 'u': array([], float),
+                 'ub': 0, 'ue': 1}
+
+
+def splprep(x, w=None, u=None, ub=None, ue=None, k=3, task=0, s=None, t=None,
+            full_output=0, nest=None, per=0, quiet=1):
+    # see the docstring of `_fitpack_py/splprep`
+    #
+    # Implementation summary:
+    #   * `x` is coerced with `atleast_1d` and must be 2-D: `idim` rows
+    #     (0 < idim < 11) of `m` samples each.
+    #   * Returns `([t, list(c), k], u)`; with `full_output` the 4-tuple
+    #     `(tcku, fp, ier, message)` is returned instead.
+    #   * Input validation failures raise TypeError. Unless `full_output` is
+    #     set, ier in {1, 2, 3} only issues a RuntimeWarning and any other
+    #     positive `ier` raises the exception class listed in `_iermess`.
+    global _parcur_cache
+    # Work on a private dict and publish it to the module-level cache only
+    # after the fit. Assigning `_parcur_cache = {...}` directly here (without
+    # the `global` statement) made the name function-local, so every call
+    # with task=1 failed with UnboundLocalError and no state was ever kept
+    # between calls.
+    if task <= 0:
+        cache = {'t': array([], float), 'wrk': array([], float),
+                 'iwrk': array([], dfitpack_int), 'u': array([], float),
+                 'ub': 0, 'ue': 1}
+    else:
+        # Continue from the state stored by the previous call.
+        cache = dict(_parcur_cache)
+    x = atleast_1d(x)
+    idim, m = x.shape
+    if per:
+        # Periodic curve: force the last sample of every coordinate to equal
+        # the first one (this writes into `x`).
+        for i in range(idim):
+            if x[i][0] != x[i][-1]:
+                if not quiet:
+                    warnings.warn(
+                        RuntimeWarning(f'Setting x[{i}][{m}]=x[{i}][0]'),
+                        stacklevel=2
+                    )
+                x[i][-1] = x[i][0]
+    if not 0 < idim < 11:
+        raise TypeError('0 < idim < 11 must hold')
+    if w is None:
+        w = ones(m, float)
+    else:
+        w = atleast_1d(w)
+    # ipar tells the fitting routine whether the caller supplied `u`.
+    ipar = (u is not None)
+    if ipar:
+        cache['u'] = u
+        if ub is None:
+            cache['ub'] = u[0]
+        else:
+            cache['ub'] = ub
+        if ue is None:
+            cache['ue'] = u[-1]
+        else:
+            cache['ue'] = ue
+    else:
+        cache['u'] = zeros(m, float)
+    if not (1 <= k <= 5):
+        raise TypeError(f'1 <= k= {k} <=5 must hold')
+    if not (-1 <= task <= 1):
+        raise TypeError('task must be -1, 0 or 1')
+    if (not len(w) == m) or (ipar == 1 and (not len(u) == m)):
+        raise TypeError('Mismatch of input dimensions')
+    if s is None:
+        s = m - sqrt(2*m)
+    if t is None and task == -1:
+        raise TypeError('Knots must be given for task=-1')
+    if t is not None:
+        cache['t'] = atleast_1d(t)
+    n = len(cache['t'])
+    if task == -1 and n < 2*k + 2:
+        raise TypeError('There must be at least 2*k+2 knots for task=-1')
+    if m <= k:
+        raise TypeError('m > k must hold')
+    if nest is None:
+        nest = m + 2*k
+
+    if (task >= 0 and s == 0) or (nest < 0):
+        if per:
+            nest = m + 2*k
+        else:
+            nest = m + k + 1
+    nest = max(nest, 2*k + 3)
+    u = cache['u']
+    ub = cache['ub']
+    ue = cache['ue']
+    t = cache['t']
+    wrk = cache['wrk']
+    iwrk = cache['iwrk']
+    t, c, o = _fitpack._parcur(ravel(transpose(x)), w, u, ub, ue, k,
+                               task, ipar, s, t, nest, wrk, iwrk, per)
+    cache['u'] = o['u']
+    cache['ub'] = o['ub']
+    cache['ue'] = o['ue']
+    cache['t'] = t
+    cache['wrk'] = o['wrk']
+    cache['iwrk'] = o['iwrk']
+    # Publish the state so that a later task=1 call can pick it up.
+    _parcur_cache = cache
+    ier = o['ier']
+    fp = o['fp']
+    n = len(t)
+    u = o['u']
+    # One row of coefficients per coordinate of the curve.
+    c = c.reshape((idim, n - k - 1))
+    tcku = [t, list(c), k], u
+    if ier <= 0 and not quiet:
+        warnings.warn(
+            RuntimeWarning(
+                _iermess[ier][0] + f"\tk={k} n={len(t)} m={m} fp={fp} s={s}"
+            ),
+            stacklevel=2
+        )
+    if ier > 0 and not full_output:
+        if ier in [1, 2, 3]:
+            warnings.warn(RuntimeWarning(_iermess[ier][0]), stacklevel=2)
+        else:
+            try:
+                raise _iermess[ier][1](_iermess[ier][0])
+            except KeyError as e:
+                raise _iermess['unknown'][1](_iermess['unknown'][0]) from e
+    if full_output:
+        try:
+            return tcku, fp, ier, _iermess[ier][0]
+        except KeyError:
+            return tcku, fp, ier, _iermess['unknown'][0]
+    else:
+        return tcku
+
+
+# Module-level cache of knots ('t') and work arrays ('wrk', 'iwrk'); these
+# are the keys `splrep` reads before calling the FITPACK curve-fitting routine.
+_curfit_cache = {'t': array([], float), 'wrk': array([], float),
+                 'iwrk': array([], dfitpack_int)}
+
+
+def splrep(x, y, w=None, xb=None, xe=None, k=3, task=0, s=None, t=None,
+           full_output=0, per=0, quiet=1):
+    # see the docstring of `_fitpack_py/splrep`
+    #
+    # Returns `tck = (t[:n], c[:n], k)`; with `full_output` the 4-tuple
+    # `(tck, fp, ier, message)` is returned instead. Input validation
+    # failures raise TypeError.
+    #
+    # NOTE(review): the assignment below binds `_curfit_cache` as a *local*
+    # name (there is no `global` statement), so the module-level dict of the
+    # same name is never read or updated here. For task=1 the local is
+    # unbound, and the lookup in the `try` block further down raises
+    # UnboundLocalError rather than the KeyError that block expects.
+    if task <= 0:
+        _curfit_cache = {}
+    x, y = map(atleast_1d, [x, y])
+    m = len(x)
+    # Default smoothing: 0.0 (interpolation) without weights, otherwise
+    # m - sqrt(2*m).
+    if w is None:
+        w = ones(m, float)
+        if s is None:
+            s = 0.0
+    else:
+        w = atleast_1d(w)
+        if s is None:
+            s = m - sqrt(2*m)
+    if not len(w) == m:
+        raise TypeError(f'len(w)={len(w)} is not equal to m={m}')
+    if (m != len(y)) or (m != len(w)):
+        raise TypeError('Lengths of the first three arguments (x,y,w) must '
+                        'be equal')
+    if not (1 <= k <= 5):
+        raise TypeError(
+            f'Given degree of the spline (k={k}) is not supported. (1<=k<=5)'
+        )
+    if m <= k:
+        raise TypeError('m > k must hold')
+    if xb is None:
+        xb = x[0]
+    if xe is None:
+        xe = x[-1]
+    if not (-1 <= task <= 1):
+        raise TypeError('task must be -1, 0 or 1')
+    # Supplying knots always selects task=-1.
+    if t is not None:
+        task = -1
+    if task == -1:
+        if t is None:
+            raise TypeError('Knots must be given for task=-1')
+        numknots = len(t)
+        # Leave k+1 slots free on each side of the user-supplied knots.
+        _curfit_cache['t'] = empty((numknots + 2*k + 2,), float)
+        _curfit_cache['t'][k+1:-k-1] = t
+        nest = len(_curfit_cache['t'])
+    elif task == 0:
+        if per:
+            nest = max(m + 2*k, 2*k + 3)
+        else:
+            nest = max(m + k + 1, 2*k + 3)
+        t = empty((nest,), float)
+        _curfit_cache['t'] = t
+    if task <= 0:
+        # Allocate the work arrays; the periodic routine gets a larger one.
+        if per:
+            _curfit_cache['wrk'] = empty((m*(k + 1) + nest*(8 + 5*k),), float)
+        else:
+            _curfit_cache['wrk'] = empty((m*(k + 1) + nest*(7 + 3*k),), float)
+        _curfit_cache['iwrk'] = empty((nest,), dfitpack_int)
+    try:
+        t = _curfit_cache['t']
+        wrk = _curfit_cache['wrk']
+        iwrk = _curfit_cache['iwrk']
+    except KeyError as e:
+        raise TypeError("must call with task=1 only after"
+                        " call with task=0,-1") from e
+    # `percur` is used for periodic fits, `curfit` otherwise.
+    if not per:
+        n, c, fp, ier = dfitpack.curfit(task, x, y, w, t, wrk, iwrk,
+                                        xb, xe, k, s)
+    else:
+        n, c, fp, ier = dfitpack.percur(task, x, y, w, t, wrk, iwrk, k, s)
+    tck = (t[:n], c[:n], k)
+    if ier <= 0 and not quiet:
+        _mess = (_iermess[ier][0] + f"\tk={k} n={len(t)} m={m} fp={fp} s={s}")
+        warnings.warn(RuntimeWarning(_mess), stacklevel=2)
+    # Unless full_output is requested, ier in {1, 2, 3} only warns; any other
+    # positive ier raises the exception class listed in `_iermess`.
+    if ier > 0 and not full_output:
+        if ier in [1, 2, 3]:
+            warnings.warn(RuntimeWarning(_iermess[ier][0]), stacklevel=2)
+        else:
+            try:
+                raise _iermess[ier][1](_iermess[ier][0])
+            except KeyError as e:
+                raise _iermess['unknown'][1](_iermess['unknown'][0]) from e
+    if full_output:
+        try:
+            return tck, fp, ier, _iermess[ier][0]
+        except KeyError:
+            return tck, fp, ier, _iermess['unknown'][0]
+    else:
+        return tck
+
+
+def splev(x, tck, der=0, ext=0):
+    # see the docstring of `_fitpack_py/splev`
+    t, c, k = tck
+    # Coefficients that can be indexed twice are treated as a sequence of
+    # coefficient arrays (parametric curve); each one is evaluated on its own.
+    try:
+        c[0][0]
+    except Exception:
+        is_parametric = False
+    else:
+        is_parametric = True
+    if is_parametric:
+        return [splev(x, [t, coeffs, k], der, ext) for coeffs in c]
+
+    if not (0 <= der <= k):
+        raise ValueError(f"0<=der={der}<=k={k} must hold")
+    if ext not in (0, 1, 2, 3):
+        raise ValueError(f"ext = {ext} not in (0, 1, 2, 3) ")
+
+    # Evaluate on a flattened copy of the points and restore the input shape
+    # on the way out.
+    points = asarray(x)
+    out_shape = points.shape
+    points = atleast_1d(points).ravel()
+    if der == 0:
+        values, ier = dfitpack.splev(t, c, k, points, ext)
+    else:
+        values, ier = dfitpack.splder(t, c, k, points, der, ext)
+
+    if ier == 10:
+        raise ValueError("Invalid input data")
+    if ier == 1:
+        raise ValueError("Found x value not in the domain")
+    if ier:
+        raise TypeError("An error occurred")
+
+    return values.reshape(out_shape)
+
+
+def splint(a, b, tck, full_output=0):
+    # see the docstring of `_fitpack_py/splint`
+    t, c, k = tck
+    try:
+        c[0][0]
+    except Exception:
+        is_parametric = False
+    else:
+        is_parametric = True
+    if is_parametric:
+        # One integral per coefficient array; `full_output` is not passed on
+        # to the per-component calls.
+        return [splint(a, b, [t, coeffs, k]) for coeffs in c]
+    integral, work = dfitpack.splint(t, c, k, a, b)
+    return (integral, work) if full_output else integral
+
+
+def sproot(tck, mest=10):
+    # see the docstring of `_fitpack_py/sproot`
+    t, c, k = tck
+    if k != 3:
+        raise ValueError("sproot works only for cubic (k=3) splines")
+    try:
+        c[0][0]
+    except Exception:
+        is_parametric = False
+    else:
+        is_parametric = True
+    if is_parametric:
+        # Find the roots of every component spline separately.
+        return [sproot([t, coeffs, k], mest) for coeffs in c]
+
+    if len(t) < 8:
+        raise TypeError(f"The number of knots {len(t)}>=8")
+    roots, nroots, ier = dfitpack.sproot(t, c, mest)
+    if ier == 10:
+        raise TypeError("Invalid input data. "
+                        "t1<=..<=t4<t5<..<tn-3<=..<=tn must hold.")
+    if ier == 1:
+        # More zeros than `mest`: warn, but still return what was found.
+        warnings.warn(RuntimeWarning("The number of zeros exceeds mest"),
+                      stacklevel=2)
+    elif ier != 0:
+        raise TypeError("Unknown error")
+    return roots[:nroots]
+
+
+def spalde(x, tck):
+    # see the docstring of `_fitpack_py/spalde`
+    t, c, k = tck
+    try:
+        c[0][0]
+    except Exception:
+        is_parametric = False
+    else:
+        is_parametric = True
+    if is_parametric:
+        return [spalde(x, [t, coeffs, k]) for coeffs in c]
+
+    points = atleast_1d(x)
+    if len(points) > 1:
+        # Several evaluation points: handle them one at a time.
+        return [spalde(point, tck) for point in points]
+    derivs, ier = dfitpack.spalde(t, c, k+1, points[0])
+    if ier == 10:
+        raise TypeError("Invalid input data. t(k)<=x<=t(n-k+1) must hold.")
+    if ier != 0:
+        raise TypeError("Unknown error")
+    return derivs
+
+# def _curfit(x,y,w=None,xb=None,xe=None,k=3,task=0,s=None,t=None,
+#           full_output=0,nest=None,per=0,quiet=1):
+
+
+# Module-level cache of knots ('tx', 'ty') and work arrays for surface
+# fitting; `bisplrep` reads 'tx', 'ty' and 'wrk' from it before calling
+# `_fitpack._surfit`.
+_surfit_cache = {'tx': array([], float), 'ty': array([], float),
+                 'wrk': array([], float), 'iwrk': array([], dfitpack_int)}
+
+
+@xp_capabilities(out_of_scope=True)
+def bisplrep(x, y, z, w=None, xb=None, xe=None, yb=None, ye=None,
+             kx=3, ky=3, task=0, s=None, eps=1e-16, tx=None, ty=None,
+             full_output=0, nxest=None, nyest=None, quiet=1):
+    """
+    Find a bivariate B-spline representation of a surface.
+
+    Given a set of data points (x[i], y[i], z[i]) representing a surface
+    z=f(x,y), compute a B-spline representation of the surface. Based on
+    the routine SURFIT from FITPACK.
+
+    Parameters
+    ----------
+    x, y, z : ndarray
+        Rank-1 arrays of data points.
+    w : ndarray, optional
+        Rank-1 array of weights. By default ``w=np.ones(len(x))``.
+    xb, xe : float, optional
+        End points of approximation interval in `x`.
+        By default ``xb = x.min(), xe=x.max()``.
+    yb, ye : float, optional
+        End points of approximation interval in `y`.
+        By default ``yb=y.min(), ye = y.max()``.
+    kx, ky : int, optional
+        The degrees of the spline (1 <= kx, ky <= 5).
+        Third order (kx=ky=3) is recommended.
+    task : int, optional
+        If task=0, find knots in x and y and coefficients for a given
+        smoothing factor, s.
+        If task=1, find knots and coefficients for another value of the
+        smoothing factor, s.  bisplrep must have been previously called
+        with task=0 or task=1.
+        If task=-1, find coefficients for a given set of knots tx, ty.
+    s : float, optional
+        A non-negative smoothing factor. If weights correspond
+        to the inverse of the standard-deviation of the errors in z,
+        then a good s-value should be found in the range
+        ``(m-sqrt(2*m),m+sqrt(2*m))`` where m=len(x).
+    eps : float, optional
+        A threshold for determining the effective rank of an
+        over-determined linear system of equations (0 < eps < 1).
+        `eps` is not likely to need changing.
+    tx, ty : ndarray, optional
+        Rank-1 arrays of the knots of the spline for task=-1
+    full_output : int, optional
+        Non-zero to return optional outputs.
+    nxest, nyest : int, optional
+        Over-estimates of the total number of knots. If None then
+        ``nxest = max(kx+sqrt(m/2),2*kx+3)``,
+        ``nyest = max(ky+sqrt(m/2),2*ky+3)``.
+    quiet : int, optional
+        Non-zero to suppress printing of messages.
+
+    Returns
+    -------
+    tck : array_like
+        A list [tx, ty, c, kx, ky] containing the knots (tx, ty) and
+        coefficients (c) of the bivariate B-spline representation of the
+        surface along with the degree of the spline.
+    fp : ndarray
+        The weighted sum of squared residuals of the spline approximation.
+        Only returned if `full_output` is non-zero.
+    ier : int
+        An integer flag about bisplrep success. Success is indicated if
+        ier<=0. If ier in [1,2,3,4,5] an error occurred but was not raised
+        (a RuntimeWarning is issued instead, unless `full_output` is
+        non-zero). Otherwise an error is raised.
+        Only returned if `full_output` is non-zero.
+    msg : str
+        A message corresponding to the integer flag, ier.
+        Only returned if `full_output` is non-zero.
+
+    Raises
+    ------
+    TypeError
+        If the input lengths, `task`, the spline degrees or the knots
+        supplied for task=-1 are invalid.
+    OverflowError
+        If the required workspace size does not fit the FITPACK integer
+        type.
+
+    See Also
+    --------
+    splprep, splrep, splint, sproot, splev
+    UnivariateSpline, BivariateSpline
+
+    Notes
+    -----
+    See `bisplev` to evaluate the value of the B-spline given its tck
+    representation.
+
+    If the input data is such that input dimensions have incommensurate
+    units and differ by many orders of magnitude, the interpolant may have
+    numerical artifacts. Consider rescaling the data before interpolation.
+
+    References
+    ----------
+    .. [1] Dierckx P.:An algorithm for surface fitting with spline functions
+       Ima J. Numer. Anal. 1 (1981) 267-283.
+    .. [2] Dierckx P.:An algorithm for surface fitting with spline functions
+       report tw50, Dept. Computer Science,K.U.Leuven, 1980.
+    .. [3] Dierckx P.:Curve and surface fitting with splines, Monographs on
+       Numerical Analysis, Oxford University Press, 1993.
+
+    Examples
+    --------
+    Examples are given :ref:`in the tutorial <tutorial-interpolate_2d_spline>`.
+
+    """
+    x, y, z = map(ravel, [x, y, z])  # ensure 1-d arrays.
+    m = len(x)
+    if not (m == len(y) == len(z)):
+        raise TypeError('len(x)==len(y)==len(z) must hold.')
+    if w is None:
+        w = ones(m, float)
+    else:
+        w = atleast_1d(w)
+    if not len(w) == m:
+        raise TypeError(f'len(w)={len(w)} is not equal to m={m}')
+    if xb is None:
+        xb = x.min()
+    if xe is None:
+        xe = x.max()
+    if yb is None:
+        yb = y.min()
+    if ye is None:
+        ye = y.max()
+    if not (-1 <= task <= 1):
+        raise TypeError('task must be -1, 0 or 1')
+    if s is None:
+        s = m - sqrt(2*m)
+    # User-supplied knots replace the cached ones. For task=-1 both knot
+    # vectors are mandatory, so the cache always holds them past this point.
+    if tx is None and task == -1:
+        raise TypeError('Knots_x must be given for task=-1')
+    if tx is not None:
+        _surfit_cache['tx'] = atleast_1d(tx)
+    nx = len(_surfit_cache['tx'])
+    if ty is None and task == -1:
+        raise TypeError('Knots_y must be given for task=-1')
+    if ty is not None:
+        _surfit_cache['ty'] = atleast_1d(ty)
+    ny = len(_surfit_cache['ty'])
+    if task == -1 and nx < 2*kx+2:
+        raise TypeError('There must be at least 2*kx+2 knots_x for task=-1')
+    if task == -1 and ny < 2*ky+2:
+        raise TypeError('There must be at least 2*ky+2 knots_y for task=-1')
+    if not ((1 <= kx <= 5) and (1 <= ky <= 5)):
+        raise TypeError(
+            f'Given degree of the spline (kx,ky={kx},{ky}) is not supported. (1<=k<=5)'
+        )
+    if m < (kx + 1)*(ky + 1):
+        raise TypeError('m >= (kx+1)(ky+1) must hold')
+    if nxest is None:
+        nxest = int(kx + sqrt(m/2))
+    if nyest is None:
+        nyest = int(ky + sqrt(m/2))
+    nxest, nyest = max(nxest, 2*kx + 3), max(nyest, 2*ky + 3)
+    if task >= 0 and s == 0:
+        nxest = int(kx + sqrt(3*m))
+        nyest = int(ky + sqrt(3*m))
+    tx, ty = _surfit_cache['tx'], _surfit_cache['ty']
+    wrk = _surfit_cache['wrk']
+    # Workspace sizes lwrk1/lwrk2 handed to `_fitpack._surfit`.
+    u = nxest - kx - 1
+    v = nyest - ky - 1
+    km = max(kx, ky) + 1
+    ne = max(nxest, nyest)
+    bx, by = kx*v + ky + 1, ky*u + kx + 1
+    b1, b2 = bx, bx + v - ky
+    if bx > by:
+        b1, b2 = by, by + u - kx
+    msg = "Too many data points to interpolate"
+    lwrk1 = _int_overflow(u*v*(2 + b1 + b2) +
+                          2*(u + v + km*(m + ne) + ne - kx - ky) + b2 + 1,
+                          OverflowError,
+                          msg=msg)
+    lwrk2 = _int_overflow(u*v*(b2 + 1) + b2, OverflowError, msg=msg)
+    tx, ty, c, o = _fitpack._surfit(x, y, z, w, xb, xe, yb, ye, kx, ky,
+                                    task, s, eps, tx, ty, nxest, nyest,
+                                    wrk, lwrk1, lwrk2)
+    # Store the result in the cache this function reads from, so that a
+    # following task=1 call starts from these knots and work array. (The
+    # values used to be written to `_curfit_cache`, which this function
+    # never reads.)
+    _surfit_cache['tx'] = tx
+    _surfit_cache['ty'] = ty
+    _surfit_cache['wrk'] = o['wrk']
+    ier, fp = o['ier'], o['fp']
+    tck = [tx, ty, c, kx, ky]
+
+    # Clamp ier to the range of keys covered by `_iermess2`.
+    ierm = min(11, max(-3, ier))
+    if ierm <= 0 and not quiet:
+        _mess = (
+            _iermess2[ierm][0] +
+            f"\tkx,ky={kx},{ky} nx,ny={len(tx)},{len(ty)} m={m} fp={fp} s={s}"
+        )
+        warnings.warn(RuntimeWarning(_mess), stacklevel=2)
+    if ierm > 0 and not full_output:
+        if ier in [1, 2, 3, 4, 5]:
+            _mess = (
+                f"\n\tkx,ky={kx},{ky} nx,ny={len(tx)},{len(ty)} m={m} fp={fp} s={s}"
+            )
+            warnings.warn(RuntimeWarning(_iermess2[ierm][0] + _mess), stacklevel=2)
+        else:
+            try:
+                raise _iermess2[ierm][1](_iermess2[ierm][0])
+            except KeyError as e:
+                raise _iermess2['unknown'][1](_iermess2['unknown'][0]) from e
+    if full_output:
+        try:
+            return tck, fp, ier, _iermess2[ierm][0]
+        except KeyError:
+            return tck, fp, ier, _iermess2['unknown'][0]
+    else:
+        return tck
+
+
+@xp_capabilities(out_of_scope=True)
+def bisplev(x, y, tck, dx=0, dy=0):
+    """
+    Evaluate a bivariate B-spline and its derivatives.
+
+    The spline (or one of its partial derivatives) is evaluated on the grid
+    formed by the cross-product of the rank-1 arrays `x` and `y`, giving a
+    rank-2 array.  When `x` holds a single value the result is a rank-1
+    array, and when `y` holds a single value as well it is a scalar.
+    Based on BISPEV and PARDER from FITPACK.
+
+    Parameters
+    ----------
+    x, y : ndarray
+        Rank-1 arrays specifying the domain over which to evaluate the
+        spline or its derivative.
+    tck : tuple
+        A sequence of length 5 returned by `bisplrep` containing the knot
+        locations, the coefficients, and the degree of the spline:
+        [tx, ty, c, kx, ky].
+    dx, dy : int, optional
+        The orders of the partial derivatives in `x` and `y` respectively.
+        ``0 <= dx < kx`` and ``0 <= dy < ky`` must hold.
+
+    Returns
+    -------
+    vals : ndarray
+        The B-spline or its derivative evaluated over the set formed by
+        the cross-product of `x` and `y`.
+
+    Raises
+    ------
+    ValueError
+        If `dx` or `dy` is out of range, if `x` or `y` is not rank-1, or if
+        the FITPACK routine reports invalid input data.
+    MemoryError
+        If the number of evaluation points does not fit the FITPACK
+        integer type.
+
+    See Also
+    --------
+    splprep, splrep, splint, sproot, splev
+    UnivariateSpline, BivariateSpline
+
+    Notes
+    -----
+    See `bisplrep` to generate the `tck` representation.
+
+    References
+    ----------
+    .. [1] Dierckx P. : An algorithm for surface fitting
+       with spline functions
+       Ima J. Numer. Anal. 1 (1981) 267-283.
+    .. [2] Dierckx P. : An algorithm for surface fitting
+       with spline functions
+       report tw50, Dept. Computer Science,K.U.Leuven, 1980.
+    .. [3] Dierckx P. : Curve and surface fitting with splines,
+       Monographs on Numerical Analysis, Oxford University Press, 1993.
+
+    Examples
+    --------
+    Examples are given :ref:`in the tutorial <tutorial-interpolate_2d_spline>`.
+
+    """
+    tx, ty, c, kx, ky = tck
+    if not (0 <= dx < kx):
+        raise ValueError(f"0 <= dx = {dx} < kx = {kx} must hold")
+    if not (0 <= dy < ky):
+        raise ValueError(f"0 <= dy = {dy} < ky = {ky} must hold")
+    x, y = map(atleast_1d, [x, y])
+    if (len(x.shape) != 1) or (len(y.shape) != 1):
+        raise ValueError("First two entries should be rank-1 arrays.")
+
+    too_many = "Too many data points to interpolate."
+
+    _int_overflow(x.size * y.size, MemoryError, msg=too_many)
+
+    wants_derivative = dx != 0 or dy != 0
+    if wants_derivative:
+        _int_overflow((tx.size - kx - 1)*(ty.size - ky - 1),
+                      MemoryError, msg=too_many)
+        z, ier = dfitpack.parder(tx, ty, c, kx, ky, dx, dy, x, y)
+    else:
+        z, ier = dfitpack.bispev(tx, ty, c, kx, ky, x, y)
+
+    if ier == 10:
+        raise ValueError("Invalid input data")
+    if ier:
+        raise TypeError("An error occurred")
+    grid = z.reshape((len(x), len(y)))
+    # Drop the leading axes that have length one.
+    if len(grid) > 1:
+        return grid
+    first_row = grid[0]
+    if len(first_row) > 1:
+        return first_row
+    return first_row[0]
+
+
+def dblint(xa, xb, ya, yb, tck):
+    """Integrate a bivariate spline over the rectangle [xa,xb] x [ya,yb].
+
+    Parameters
+    ----------
+    xa, xb : float
+        Lower and upper limits of the integration in x.
+    ya, yb : float
+        Lower and upper limits of the integration in y.
+    tck : list [tx, ty, c, kx, ky]
+        A sequence of length 5 returned by bisplrep: the knot locations
+        tx, ty, the coefficients c, and the degrees kx, ky of the spline.
+
+    Returns
+    -------
+    integ : float
+        The value of the resulting integral.
+    """
+    knots_x, knots_y, coeffs, deg_x, deg_y = tck
+    integral = dfitpack.dblint(knots_x, knots_y, coeffs, deg_x, deg_y,
+                               xa, xb, ya, yb)
+    return integral
+
+
+def insert(x, tck, m=1, per=0):
+    # see the docstring of `_fitpack_py/insert`
+    #
+    # Returns `(tt, cc, k)`: the new knots, the new coefficients (a list with
+    # one entry per component for a parametric spline) and the unchanged
+    # degree. Raises ValueError when the FITPACK wrapper reports invalid
+    # input data (ier == 10) and TypeError for any other non-zero `ier`.
+    t, c, k = tck
+    # Coefficients that can be indexed twice are treated as a sequence of
+    # coefficient arrays (parametric spline).
+    try:
+        c[0][0]
+        parametric = True
+    except Exception:
+        parametric = False
+    if parametric:
+        cc = []
+        for c_vals in c:
+            # Forward `per` as well: it used to be dropped here, so periodic
+            # parametric splines were processed as non-periodic ones.
+            tt, cc_val, kk = insert(x, [t, c_vals, k], m, per)
+            cc.append(cc_val)
+        return (tt, cc, kk)
+    else:
+        tt, cc, ier = _fitpack._insert(per, t, c, k, x, m)
+        if ier == 10:
+            raise ValueError("Invalid input data")
+        if ier:
+            raise TypeError("An error occurred")
+        return (tt, cc, k)
+
+
+def splder(tck, n=1, xp=None):
+    # see the docstring of `_fitpack_py/splder`
+    #
+    # Returns the `(t, c, k)` triple of the n-th derivative spline, whose
+    # degree is `k - n`. A negative `n` is delegated to `splantider`.
+    # Raises ValueError if `n > k` or if a zero knot spacing makes the
+    # differentiation formula divide by zero.
+    if n < 0:
+        # Forward `xp` so an explicitly supplied namespace is not dropped.
+        return splantider(tck, -n, xp=xp)
+
+    t, c, k = tck
+
+    if xp is None:
+        xp = array_namespace(t, c)
+
+    if n > k:
+        raise ValueError(f"Order of derivative (n = {n!r}) must be <= "
+                         f"order of spline (k = {tck[2]!r})")
+
+    # Extra axes for the trailing dims of the `c` array:
+    sh = (slice(None),) + ((None,)*len(c.shape[1:]))
+
+    # Turn floating-point warnings into exceptions so that a division by a
+    # zero knot spacing is reported as an error below.
+    with np.errstate(invalid='raise', divide='raise'):
+        try:
+            for _ in range(n):
+                # See e.g. Schumaker, Spline Functions: Basic Theory, Chapter 5
+
+                # Compute the denominator in the differentiation formula.
+                # (and append trailing dims, if necessary)
+                dt = t[k+1:-1] - t[1:-k-1]
+                dt = dt[sh]
+                # Compute the new coefficients
+                c = (c[1:-1-k, ...] - c[:-2-k, ...]) * k / dt
+                # Pad coefficient array to same size as knots (FITPACK
+                # convention)
+                c = concat_1d(xp, c, xp.zeros((k,) + c.shape[1:]))
+                # Adjust knots
+                t = t[1:-1]
+                k -= 1
+        except FloatingPointError as e:
+            raise ValueError("The spline has internal repeated knots "
+                              f"and is not differentiable {n} times") from e
+
+    return t, c, k
+
+
+def splantider(tck, n=1, *, xp=None):
+    # see the docstring of `_fitpack_py/splantider`
+    #
+    # Returns the `(t, c, k)` triple of the n-th antiderivative spline, whose
+    # degree is `k + n`. A negative `n` is delegated to `splder`.
+    if n < 0:
+        # Forward `xp` so an explicitly supplied namespace is not dropped.
+        return splder(tck, -n, xp=xp)
+
+    t, c, k = tck
+
+    if xp is None:
+        xp = array_namespace(t, c)
+
+    # Extra axes for the trailing dims of the `c` array:
+    sh = (slice(None),) + (None,)*len(c.shape[1:])
+
+    for _ in range(n):
+        # This is the inverse set of operations to splder.
+
+        # Compute the multiplier in the antiderivative formula.
+        dt = t[k+1:] - t[:-k-1]
+        dt = dt[sh]
+        # Compute the new coefficients
+        c = xp.cumulative_sum(c[:-k-1, ...] * dt, axis=0) / (k + 1)
+        # Prepend one zero coefficient and repeat the last coefficient k+2
+        # times at the end.
+        c = concat_1d(
+            xp,
+            xp.zeros((1,) + c.shape[1:]),
+            c,
+            xp.stack([c[-1, ...]] * (k+2)),
+        )
+        # New knots
+        t = concat_1d(xp, t[0], t, t[-1])
+        k += 1
+
+    return t, c, k
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_fitpack_py.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_fitpack_py.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb0b29abc035deb3d76774939d050933aad4743f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_fitpack_py.py
@@ -0,0 +1,908 @@
+__all__ = ['splrep', 'splprep', 'splev', 'splint', 'sproot', 'spalde',
+           'bisplrep', 'bisplev', 'insert', 'splder', 'splantider']
+
+
+import numpy as np
+
+# These are in the API for fitpack even if not used in fitpack.py itself.
+from ._fitpack_impl import bisplrep, bisplev, dblint  # noqa: F401
+from . import _fitpack_impl as _impl
+from ._bsplines import BSpline
+from scipy._lib._array_api import xp_capabilities
+
+
+@xp_capabilities(out_of_scope=True)
+def splprep(x, w=None, u=None, ub=None, ue=None, k=3, task=0, s=None, t=None,
+            full_output=0, nest=None, per=0, quiet=1):
+    """
+    Find the B-spline representation of an N-D curve.
+
+    .. legacy:: function
+
+        Specifically, we recommend using `make_splprep` in new code.
+
+    Given a list of N rank-1 arrays, `x`, which represent a curve in
+    N-dimensional space parametrized by `u`, find a smooth approximating
+    spline curve g(`u`). Uses the FORTRAN routine parcur from FITPACK.
+
+    Parameters
+    ----------
+    x : array_like
+        A list of sample vector arrays representing the curve.
+    w : array_like, optional
+        Strictly positive rank-1 array of weights the same length as `x[0]`.
+        The weights are used in computing the weighted least-squares spline
+        fit. If the errors in the `x` values have standard-deviation given by
+        the vector d, then `w` should be 1/d. Default is ``ones(len(x[0]))``.
+    u : array_like, optional
+        An array of parameter values. If not given, these values are
+        calculated automatically, with ``M = len(x[0])``, as
+
+            v[0] = 0
+
+            v[i] = v[i-1] + distance(`x[i]`, `x[i-1]`)
+
+            u[i] = v[i] / v[M-1]
+
+    ub, ue : float, optional
+        The end-points of the parameters interval.  Defaults to
+        u[0] and u[-1].
+    k : int, optional
+        Degree of the spline. Cubic splines are recommended.
+        Even values of `k` should be avoided especially with a small s-value.
+        ``1 <= k <= 5``, default is 3.
+    task : int, optional
+        If task==0 (default), find t and c for a given smoothing factor, s.
+        If task==1, find t and c for another value of the smoothing factor, s.
+        There must have been a previous call with task=0 or task=1
+        for the same set of data.
+        If task=-1 find the weighted least square spline for a given set of
+        knots, t.
+    s : float, optional
+        A smoothing condition.  The amount of smoothness is determined by
+        satisfying the conditions: ``sum((w * (y - g))**2,axis=0) <= s``,
+        where g(x) is the smoothed interpolation of (x,y).  The user can
+        use `s` to control the trade-off between closeness and smoothness
+        of fit.  Larger `s` means more smoothing while smaller values of `s`
+        indicate less smoothing. Recommended values of `s` depend on the
+        weights, w.  If the weights represent the inverse of the
+        standard-deviation of y, then a good `s` value should be found in
+        the range ``(m-sqrt(2*m),m+sqrt(2*m))``, where m is the number of
+        data points in x, y, and w.
+    t : array, optional
+        The knots needed for ``task=-1``.
+        There must be at least ``2*k+2`` knots.
+    full_output : int, optional
+        If non-zero, then return optional outputs.
+    nest : int, optional
+        An over-estimate of the total number of knots of the spline to
+        help in determining the storage space.  By default nest=m/2.
+        Always large enough is nest=m+k+1.
+    per : int, optional
+        If non-zero, data points are considered periodic with period
+        ``x[m-1] - x[0]`` and a smooth periodic spline approximation is
+        returned.  Values of ``y[m-1]`` and ``w[m-1]`` are not used.
+    quiet : int, optional
+        Non-zero to suppress messages.
+
+    Returns
+    -------
+    tck : tuple
+        A tuple, ``(t,c,k)`` containing the vector of knots, the B-spline
+        coefficients, and the degree of the spline.
+    u : array
+        An array of the values of the parameter.
+    fp : float, optional
+        The weighted sum of squared residuals of the spline approximation.
+    ier : int, optional
+        An integer flag about `splprep` success.  Success is indicated
+        if ier<=0. If ier in [1,2,3] an error occurred but was not raised.
+        Otherwise an error is raised.
+    msg : str, optional
+        A message corresponding to the integer flag, ier.
+
+    See Also
+    --------
+    splrep, splev, sproot, spalde, splint,
+    bisplrep, bisplev
+    UnivariateSpline, BivariateSpline
+    BSpline
+    make_interp_spline
+
+    Notes
+    -----
+    See `splev` for evaluation of the spline and its derivatives.
+    The number of dimensions N must be smaller than 11.
+
+    The number of coefficients in the `c` array is ``k+1`` less than the number
+    of knots, ``len(t)``. This is in contrast with `splrep`, which zero-pads
+    the array of coefficients to have the same length as the array of knots.
+    These additional coefficients are ignored by evaluation routines, `splev`
+    and `BSpline`.
+
+    References
+    ----------
+    .. [1] P. Dierckx, "Algorithms for smoothing data with periodic and
+        parametric splines", Computer Graphics and Image Processing,
+        20 (1982) 171-184.
+    .. [2] P. Dierckx, "Algorithms for smoothing data with periodic and
+        parametric splines", report tw55, Dept. Computer Science,
+        K.U.Leuven, 1981.
+    .. [3] P. Dierckx, "Curve and surface fitting with splines", Monographs on
+        Numerical Analysis, Oxford University Press, 1993.
+
+    Examples
+    --------
+    Generate a discretization of a limacon curve in polar coordinates:
+
+    >>> import numpy as np
+    >>> phi = np.linspace(0, 2.*np.pi, 40)
+    >>> r = 0.5 + np.cos(phi)         # polar coords
+    >>> x, y = r * np.cos(phi), r * np.sin(phi)    # convert to cartesian
+
+    And interpolate:
+
+    >>> from scipy.interpolate import splprep, splev
+    >>> tck, u = splprep([x, y], s=0)
+    >>> new_points = splev(u, tck)
+
+    Notice that (i) we force interpolation by using ``s=0``,
+    (ii) the parameterization, ``u``, is generated automatically.
+    Now plot the result:
+
+    >>> import matplotlib.pyplot as plt
+    >>> fig, ax = plt.subplots()
+    >>> ax.plot(x, y, 'ro')
+    >>> ax.plot(new_points[0], new_points[1], 'r-')
+    >>> plt.show()
+
+    """
+    # Thin wrapper: all arguments are forwarded positionally to `_impl.splprep`.
+    res = _impl.splprep(x, w, u, ub, ue, k, task, s, t, full_output, nest, per,
+                        quiet)
+    return res
+
+
+@xp_capabilities(out_of_scope=True)
+def splrep(x, y, w=None, xb=None, xe=None, k=3, task=0, s=None, t=None,
+           full_output=0, per=0, quiet=1):
+    """
+    Find the B-spline representation of a 1-D curve.
+
+    .. legacy:: function
+
+        Specifically, we recommend using `make_splrep` in new code.
+
+
+    Given the set of data points ``(x[i], y[i])`` determine a smooth spline
+    approximation of degree k on the interval ``xb <= x <= xe``.
+
+    Parameters
+    ----------
+    x, y : array_like
+        The data points defining a curve ``y = f(x)``.
+    w : array_like, optional
+        Strictly positive rank-1 array of weights the same length as `x` and `y`.
+        The weights are used in computing the weighted least-squares spline
+        fit. If the errors in the `y` values have standard-deviation given by the
+        vector ``d``, then `w` should be ``1/d``. Default is ``ones(len(x))``.
+    xb, xe : float, optional
+        The interval to fit.  If None, these default to ``x[0]`` and ``x[-1]``
+        respectively.
+    k : int, optional
+        The degree of the spline fit. It is recommended to use cubic splines.
+        Even values of `k` should be avoided especially with small `s` values.
+        ``1 <= k <= 5``.
+    task : {1, 0, -1}, optional
+        If ``task==0``, find ``t`` and ``c`` for a given smoothing factor, `s`.
+
+        If ``task==1`` find ``t`` and ``c`` for another value of the smoothing factor,
+        `s`. There must have been a previous call with ``task=0`` or ``task=1`` for
+        the same set of data (``t`` will be stored and used internally).
+
+        If ``task=-1`` find the weighted least square spline for a given set of
+        knots, ``t``. These should be interior knots as knots on the ends will be
+        added automatically.
+    s : float, optional
+        A smoothing condition. The amount of smoothness is determined by
+        satisfying the conditions: ``sum((w * (y - g))**2,axis=0) <= s`` where ``g(x)``
+        is the smoothed interpolation of ``(x,y)``. The user can use `s` to control
+        the tradeoff between closeness and smoothness of fit. Larger `s` means
+        more smoothing while smaller values of `s` indicate less smoothing.
+        Recommended values of `s` depend on the weights, `w`. If the weights
+        represent the inverse of the standard-deviation of `y`, then a good `s`
+        value should be found in the range ``(m-sqrt(2*m),m+sqrt(2*m))`` where ``m`` is
+        the number of datapoints in `x`, `y`, and `w`. Default: ``s=m-sqrt(2*m)`` if
+        weights are supplied. ``s = 0.0`` (interpolating) if no weights are
+        supplied.
+    t : array_like, optional
+        The knots needed for ``task=-1``. If given then task is automatically set
+        to ``-1``.
+    full_output : bool, optional
+        If non-zero, then return optional outputs.
+    per : bool, optional
+        If non-zero, data points are considered periodic with period
+        ``x[m-1] - x[0]`` and a smooth periodic spline approximation is returned.
+        Values of ``y[m-1]`` and ``w[m-1]`` are not used.
+        The default is zero, corresponding to boundary condition 'not-a-knot'.
+    quiet : bool, optional
+        Non-zero to suppress messages.
+
+    Returns
+    -------
+    tck : tuple
+        A tuple ``(t,c,k)`` containing the vector of knots, the B-spline
+        coefficients, and the degree of the spline.
+    fp : array, optional
+        The weighted sum of squared residuals of the spline approximation.
+    ier : int, optional
+        An integer flag about splrep success. Success is indicated if ``ier<=0``.
+        If ``ier in [1,2,3]``, an error occurred but was not raised. Otherwise an
+        error is raised.
+    msg : str, optional
+        A message corresponding to the integer flag, `ier`.
+
+    See Also
+    --------
+    UnivariateSpline, BivariateSpline
+    splprep, splev, sproot, spalde, splint
+    bisplrep, bisplev
+    BSpline
+    make_interp_spline
+
+    Notes
+    -----
+    See `splev` for evaluation of the spline and its derivatives. Uses the
+    FORTRAN routine ``curfit`` from FITPACK.
+
+    The user is responsible for assuring that the values of `x` are unique.
+    Otherwise, `splrep` will not return sensible results.
+
+    If provided, knots `t` must satisfy the Schoenberg-Whitney conditions,
+    i.e., there must be a subset of data points ``x[j]`` such that
+    ``t[j] < x[j] < t[j+k+1]``, for ``j=0, 1,...,n-k-2``.
+
+    This routine zero-pads the coefficients array ``c`` to have the same length
+    as the array of knots ``t`` (the trailing ``k + 1`` coefficients are ignored
+    by the evaluation routines, `splev` and `BSpline`.) This is in contrast with
+    `splprep`, which does not zero-pad the coefficients.
+
+    The default boundary condition is 'not-a-knot', i.e. the first and second
+    segment at a curve end are the same polynomial. More boundary conditions are
+    available in `CubicSpline`.
+
+    References
+    ----------
+    Based on algorithms described in [1]_, [2]_, [3]_, and [4]_:
+
+    .. [1] P. Dierckx, "An algorithm for smoothing, differentiation and
+       integration of experimental data using spline functions",
+       J.Comp.Appl.Maths 1 (1975) 165-184.
+    .. [2] P. Dierckx, "A fast algorithm for smoothing data on a rectangular
+       grid while using spline functions", SIAM J.Numer.Anal. 19 (1982)
+       1286-1304.
+    .. [3] P. Dierckx, "An improved algorithm for curve fitting with spline
+       functions", report tw54, Dept. Computer Science, K.U. Leuven, 1981.
+    .. [4] P. Dierckx, "Curve and surface fitting with splines", Monographs on
+       Numerical Analysis, Oxford University Press, 1993.
+
+    Examples
+    --------
+    You can interpolate 1-D points with a B-spline curve.
+    Further examples are given
+    :ref:`in the tutorial <tutorial-interpolate_splXXX>`.
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.interpolate import splev, splrep
+    >>> x = np.linspace(0, 10, 10)
+    >>> y = np.sin(x)
+    >>> spl = splrep(x, y)
+    >>> x2 = np.linspace(0, 10, 200)
+    >>> y2 = splev(x2, spl)
+    >>> plt.plot(x, y, 'o', x2, y2)
+    >>> plt.show()
+
+    """
+    res = _impl.splrep(x, y, w, xb, xe, k, task, s, t, full_output, per, quiet)
+    return res  # whatever `_impl.splrep` produced is handed back unchanged
+
+
+@xp_capabilities(out_of_scope=True)
+def splev(x, tck, der=0, ext=0):
+    """
+    Evaluate a B-spline or its derivatives.
+
+    .. legacy:: function
+
+        Specifically, we recommend constructing a `BSpline` object and using
+        its ``__call__`` method.
+
+    Given the knots and coefficients of a B-spline representation, evaluate
+    the value of the smoothing polynomial and its derivatives. This is a
+    wrapper around the FORTRAN routines splev and splder of FITPACK.
+
+    Parameters
+    ----------
+    x : array_like
+        An array of points at which to return the value of the smoothed
+        spline or its derivatives. If `tck` was returned from `splprep`,
+        then the parameter values, u should be given.
+    tck : BSpline instance or tuple
+        If a tuple, then it should be a sequence of length 3 returned by
+        `splrep` or `splprep` containing the knots, coefficients, and degree
+        of the spline. (Also see Notes.)
+    der : int, optional
+        The order of derivative of the spline to compute (must be less than
+        or equal to k, the degree of the spline).
+    ext : int, optional
+        Controls the value returned for elements of ``x`` not in the
+        interval defined by the knot sequence.
+
+        * if ext=0, return the extrapolated value.
+        * if ext=1, return 0
+        * if ext=2, raise a ValueError
+        * if ext=3, return the boundary value.
+
+        The default value is 0; `BSpline` input supports only ``ext=0``.
+
+    Returns
+    -------
+    y : ndarray or list of ndarrays
+        An array of values representing the spline function evaluated at
+        the points in `x`.  If `tck` was returned from `splprep`, then this
+        is a list of arrays representing the curve in an N-D space.
+
+    See Also
+    --------
+    splprep, splrep, sproot, spalde, splint
+    bisplrep, bisplev
+    BSpline
+
+    Notes
+    -----
+    Manipulating the tck-tuples directly is not recommended. In new code,
+    prefer using `BSpline` objects.
+
+    References
+    ----------
+    .. [1] C. de Boor, "On calculating with b-splines", J. Approximation
+        Theory, 6, p.50-62, 1972.
+    .. [2] M. G. Cox, "The numerical evaluation of b-splines", J. Inst. Maths
+        Applics, 10, p.134-149, 1972.
+    .. [3] P. Dierckx, "Curve and surface fitting with splines", Monographs
+        on Numerical Analysis, Oxford University Press, 1993.
+
+    Examples
+    --------
+    Examples are given :ref:`in the tutorial <tutorial-interpolate_splXXX>`.
+
+    A comparison between `splev`, `splder` and `spalde` to compute the derivatives of a
+    B-spline can be found in the `spalde` examples section.
+
+    """
+    if isinstance(tck, BSpline):
+        if tck.c.ndim > 1:
+            mesg = ("Calling splev() with BSpline objects with c.ndim > 1 is "
+                    "not allowed. Use BSpline.__call__(x) instead.")
+            raise ValueError(mesg)
+
+        # remap the out-of-bounds behavior: only ext=0 has an entry in the table
+        try:
+            extrapolate = {0: True, }[ext]
+        except KeyError as e:
+            raise ValueError(f"Extrapolation mode {ext} is not supported "
+                             "by BSpline.") from e
+
+        return tck(x, der, extrapolate=extrapolate)
+    else:
+        return _impl.splev(x, tck, der, ext)  # tuple input: `ext` passed through
+
+
+@xp_capabilities(out_of_scope=True)
+def splint(a, b, tck, full_output=0):
+    """
+    Evaluate the definite integral of a B-spline between two given points.
+
+    .. legacy:: function
+
+        Specifically, we recommend constructing a `BSpline` object and using its
+        ``integrate`` method.
+
+    Parameters
+    ----------
+    a, b : float
+        The end-points of the integration interval.
+    tck : tuple or a BSpline instance
+        If a tuple, then it should be a sequence of length 3, containing the
+        vector of knots, the B-spline coefficients, and the degree of the
+        spline (see `splev`).
+    full_output : int, optional
+        Non-zero to return optional output. Ignored, with a warning, for `BSpline`.
+
+    Returns
+    -------
+    integral : float
+        The resulting integral.
+    wrk : ndarray
+        An array containing the integrals of the normalized B-splines
+        defined on the set of knots.
+        (Only returned if `full_output` is non-zero and `tck` is not a `BSpline`)
+
+    See Also
+    --------
+    splprep, splrep, sproot, spalde, splev
+    bisplrep, bisplev
+    BSpline
+
+    Notes
+    -----
+    `splint` silently assumes that the spline function is zero outside the data
+    interval (`a`, `b`).
+
+    Manipulating the tck-tuples directly is not recommended. In new code,
+    prefer using the `BSpline` objects.
+
+    References
+    ----------
+    .. [1] P.W. Gaffney, "The calculation of indefinite integrals of b-splines",
+        J. Inst. Maths Applics, 17, p.37-41, 1976.
+    .. [2] P. Dierckx, "Curve and surface fitting with splines", Monographs
+        on Numerical Analysis, Oxford University Press, 1993.
+
+    Examples
+    --------
+    Examples are given :ref:`in the tutorial <tutorial-interpolate_splXXX>`.
+
+    """
+    if isinstance(tck, BSpline):
+        if tck.c.ndim > 1:
+            mesg = ("Calling splint() with BSpline objects with c.ndim > 1 is "
+                    "not allowed. Use BSpline.integrate() instead.")
+            raise ValueError(mesg)
+        if full_output != 0:
+            import warnings  # local import: not among this module's imports
+            mesg = (f"full_output = {full_output} is not supported. Proceeding as if "
+                    "full_output = 0")
+            warnings.warn(mesg, RuntimeWarning, stacklevel=2)  # was never emitted
+        return tck.integrate(a, b, extrapolate=False)
+    else:
+        return _impl.splint(a, b, tck, full_output)
+
+
+@xp_capabilities(out_of_scope=True)
+def sproot(tck, mest=10):
+    """
+    Find the roots of a cubic B-spline.
+
+    .. legacy:: function
+
+        Specifically, we recommend constructing a `BSpline` object and using the
+        following pattern: ``PPoly.from_spline(spl).roots()``.
+
+    Given the knots (>=8) and coefficients of a cubic B-spline return the
+    roots of the spline.
+
+    Parameters
+    ----------
+    tck : tuple or a BSpline object
+        If a tuple, then it should be a sequence of length 3, containing the
+        vector of knots, the B-spline coefficients, and the degree of the
+        spline.
+        The number of knots must be >= 8, and the degree must be 3.
+        The knots must be a monotonically increasing sequence.
+    mest : int, optional
+        An estimate of the number of zeros (Default is 10).
+
+    Returns
+    -------
+    zeros : ndarray
+        An array giving the roots of the spline.
+
+    See Also
+    --------
+    splprep, splrep, splint, spalde, splev
+    bisplrep, bisplev
+    BSpline
+
+    Notes
+    -----
+    Manipulating the tck-tuples directly is not recommended. In new code,
+    prefer using the `BSpline` objects.
+
+    References
+    ----------
+    .. [1] C. de Boor, "On calculating with b-splines", J. Approximation
+        Theory, 6, p.50-62, 1972.
+    .. [2] M. G. Cox, "The numerical evaluation of b-splines", J. Inst. Maths
+        Applics, 10, p.134-149, 1972.
+    .. [3] P. Dierckx, "Curve and surface fitting with splines", Monographs
+        on Numerical Analysis, Oxford University Press, 1993.
+
+    Examples
+    --------
+
+    For some data, this method may miss a root. This happens when one of
+    the spline knots (which FITPACK places automatically) happens to
+    coincide with the true root. A workaround is to convert to `PPoly`,
+    which uses a different root-finding algorithm.
+
+    For example,
+
+    >>> x = [1.96, 1.97, 1.98, 1.99, 2.00, 2.01, 2.02, 2.03, 2.04, 2.05]
+    >>> y = [-6.365470e-03, -4.790580e-03, -3.204320e-03, -1.607270e-03,
+    ...      4.440892e-16,  1.616930e-03,  3.243000e-03,  4.877670e-03,
+    ...      6.520430e-03,  8.170770e-03]
+    >>> from scipy.interpolate import splrep, sproot, PPoly
+    >>> tck = splrep(x, y, s=0)
+    >>> sproot(tck)
+    array([], dtype=float64)
+
+    Converting to a PPoly object does find the roots at ``x=2``:
+
+    >>> ppoly = PPoly.from_spline(tck)
+    >>> ppoly.roots(extrapolate=False)
+    array([2.])
+
+
+    Further examples are given :ref:`in the tutorial
+    <tutorial-interpolate_splXXX>`.
+
+    """
+    if isinstance(tck, BSpline):
+        if tck.c.ndim > 1:
+            mesg = ("Calling sproot() with BSpline objects with c.ndim > 1 is "
+                    "not allowed.")
+            raise ValueError(mesg)
+
+        t, c, k = tck.tck
+
+        # _impl.sproot expects the interpolation axis to be last, so roll it.
+        # NB: c.ndim > 1 was rejected above, so this transpose is a no-op here.
+        sh = tuple(range(c.ndim))
+        c = c.transpose(sh[1:] + (0,))
+        return _impl.sproot((t, c, k), mest)
+    else:
+        return _impl.sproot(tck, mest)
+
+
+@xp_capabilities(out_of_scope=True)
+def spalde(x, tck):
+    """
+    Evaluate a B-spline and all its derivatives at one point (or set of points) up
+    to order k (the degree of the spline); order 0 is the spline itself.
+
+    .. legacy:: function
+
+        Specifically, we recommend constructing a `BSpline` object and evaluating
+        its derivatives in a loop or a list comprehension.
+
+    Parameters
+    ----------
+    x : array_like
+        A point or a set of points at which to evaluate the derivatives.
+        Note that ``t(k) <= x <= t(n-k+1)`` must hold for each `x`.
+    tck : tuple
+        A tuple (t,c,k) containing the vector of knots, the B-spline
+        coefficients, and the degree of the spline whose derivatives to
+        compute. `BSpline` instances are rejected with a ``TypeError``.
+
+    Returns
+    -------
+    results : {ndarray, list of ndarrays}
+        An array (or a list of arrays) containing all derivatives
+        up to order k inclusive for each point `x`; the first element is the
+        spline itself.
+
+    See Also
+    --------
+    splprep, splrep, splint, sproot, splev, bisplrep, bisplev,
+    UnivariateSpline, BivariateSpline
+
+    References
+    ----------
+    .. [1] de Boor C : On calculating with b-splines, J. Approximation Theory
+       6 (1972) 50-62.
+    .. [2] Cox M.G. : The numerical evaluation of b-splines, J. Inst. Maths
+       applics 10 (1972) 134-149.
+    .. [3] Dierckx P. : Curve and surface fitting with splines, Monographs on
+       Numerical Analysis, Oxford University Press, 1993.
+
+    Examples
+    --------
+    To calculate the derivatives of a B-spline there are several approaches.
+    In this example, we will demonstrate that `spalde` is equivalent to
+    calling `splev` and `splder`.
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.interpolate import BSpline, spalde, splder, splev
+
+    >>> # Store characteristic parameters of a B-spline
+    >>> tck = ((-2, -2, -2, -2, -1, 0, 1, 2, 2, 2, 2),  # knots
+    ...        (0, 0, 0, 6, 0, 0, 0),  # coefficients
+    ...        3)  # degree (cubic)
+    >>> # Instantiate a B-spline object
+    >>> # `BSpline` objects are preferred, except for spalde()
+    >>> bspl = BSpline(tck[0], tck[1], tck[2])
+    >>> # Generate extra points to get a smooth curve
+    >>> x = np.linspace(min(tck[0]), max(tck[0]), 100)
+
+    Evaluate the curve and all derivatives
+
+    >>> # The order of derivative must be less than or equal to k, the spline degree
+    >>> # Method 1: spalde()
+    >>> f1_y_bsplin = [spalde(i, tck)[0] for i in x ]  # The B-spline itself
+    >>> f1_y_deriv1 = [spalde(i, tck)[1] for i in x ]  # 1st derivative
+    >>> f1_y_deriv2 = [spalde(i, tck)[2] for i in x ]  # 2nd derivative
+    >>> f1_y_deriv3 = [spalde(i, tck)[3] for i in x ]  # 3rd derivative
+    >>> # You can reach the same result by using `splev` and `splder`
+    >>> f2_y_deriv3 = splev(x, bspl, der=3)
+    >>> f3_y_deriv3 = splder(bspl, n=3)(x)
+
+    >>> # Generate a figure with three axes for graphic comparison
+    >>> fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 5))
+    >>> suptitle = fig.suptitle('Evaluate a B-spline and all derivatives')
+    >>> # Plot B-spline and all derivatives using the three methods
+    >>> orders = range(4)
+    >>> linetypes = ['-', '--', '-.', ':']
+    >>> labels = ['B-Spline', '1st deriv.', '2nd deriv.', '3rd deriv.']
+    >>> functions = ['splev()', 'splder()', 'spalde()']
+    >>> for order, linetype, label in zip(orders, linetypes, labels):
+    ...     ax1.plot(x, splev(x, bspl, der=order), linetype, label=label)
+    ...     ax2.plot(x, splder(bspl, n=order)(x), linetype, label=label)
+    ...     ax3.plot(x, [spalde(i, tck)[order] for i in x], linetype, label=label)
+    >>> for ax, function in zip((ax1, ax2, ax3), functions):
+    ...     ax.set_title(function)
+    ...     ax.legend()
+    >>> plt.tight_layout()
+    >>> plt.show()
+
+    """
+    if isinstance(tck, BSpline):  # unlike its siblings, no BSpline code path
+        raise TypeError("spalde does not accept BSpline instances.")
+    else:
+        return _impl.spalde(x, tck)
+
+
+@xp_capabilities(out_of_scope=True)
+def insert(x, tck, m=1, per=0):
+    """
+    Insert knots into a B-spline.
+
+    .. legacy:: function
+
+        Specifically, we recommend constructing a `BSpline` object and using
+        its ``insert_knot`` method.
+
+    Given the knots and coefficients of a B-spline representation, create a
+    new B-spline with a knot inserted `m` times at point `x`.
+    This is a wrapper around the FORTRAN routine insert of FITPACK.
+
+    Parameters
+    ----------
+    x (u) : float
+        A knot value at which to insert a new knot.  If `tck` was returned
+        from ``splprep``, then the parameter values, u should be given.
+    tck : a `BSpline` instance or a tuple
+        If tuple, then it is expected to be a tuple (t,c,k) containing
+        the vector of knots, the B-spline coefficients, and the degree of
+        the spline.
+    m : int, optional
+        The number of times to insert the given knot (its multiplicity).
+        Default is 1.
+    per : int, optional
+        If non-zero, the input spline is considered periodic.
+
+    Returns
+    -------
+    BSpline instance or a tuple
+        A new B-spline with knots t, coefficients c, and degree k.
+        ``t(k+1) <= x <= t(n-k)``, where k is the degree of the spline.
+        In case of a periodic spline (``per != 0``) there must be
+        either at least k interior knots t(j) satisfying ``t(k+1)<t(j)<=x``
+        or at least k interior knots t(j) satisfying ``x<=t(j)<t(n-k)``.
+        A tuple is returned iff the input argument `tck` is a tuple, otherwise
+        a BSpline object is constructed and returned.
+
+    Notes
+    -----
+    Based on algorithms from [1]_ and [2]_.
+
+    Manipulating the tck-tuples directly is not recommended. In new code,
+    prefer using the `BSpline` objects, in particular `BSpline.insert_knot`
+    method.
+
+    See Also
+    --------
+    BSpline.insert_knot
+
+    References
+    ----------
+    .. [1] W. Boehm, "Inserting new knots into b-spline curves.",
+        Computer Aided Design, 12, p.199-201, 1980.
+    .. [2] P. Dierckx, "Curve and surface fitting with splines", Monographs on
+        Numerical Analysis, Oxford University Press, 1993.
+
+    Examples
+    --------
+    You can insert knots into a B-spline.
+
+    >>> from scipy.interpolate import splrep, insert
+    >>> import numpy as np
+    >>> x = np.linspace(0, 10, 5)
+    >>> y = np.sin(x)
+    >>> tck = splrep(x, y)
+    >>> tck[0]
+    array([ 0.,  0.,  0.,  0.,  5., 10., 10., 10., 10.])
+
+    A knot is inserted:
+
+    >>> tck_inserted = insert(3, tck)
+    >>> tck_inserted[0]
+    array([ 0.,  0.,  0.,  0.,  3.,  5., 10., 10., 10., 10.])
+
+    Some knots are inserted:
+
+    >>> tck_inserted2 = insert(8, tck, m=3)
+    >>> tck_inserted2[0]
+    array([ 0.,  0.,  0.,  0.,  5.,  8.,  8.,  8., 10., 10., 10., 10.])
+
+    """
+    if isinstance(tck, BSpline):
+        # BSpline input: unpack, run the tuple-based routine, re-wrap the result
+        t, c, k = tck.tck
+
+        # FITPACK expects the interpolation axis to be last, so roll it over
+        # NB: if c array is 1D, transposes are no-ops
+        sh = tuple(range(c.ndim))
+        c = c.transpose(sh[1:] + (0,))
+        t_, c_, k_ = _impl.insert(x, (t, c, k), m, per)
+
+        # and roll the last axis back to the front
+        c_ = np.asarray(c_)
+        c_ = c_.transpose((sh[-1],) + sh[:-1])
+        return BSpline(t_, c_, k_)
+    else:
+        return _impl.insert(x, tck, m, per)
+
+
+@xp_capabilities(out_of_scope=True)
+def splder(tck, n=1):
+    """
+    Compute the spline representation of the derivative of a given spline.
+
+    .. legacy:: function
+
+        Specifically, we recommend constructing a `BSpline` object and using its
+        ``derivative`` method.
+
+    Parameters
+    ----------
+    tck : BSpline instance or tuple
+        BSpline instance or a tuple (t,c,k) containing the vector of knots,
+        the B-spline coefficients, and the degree of the spline whose
+        derivative to compute.
+    n : int, optional
+        Order of derivative to evaluate. Default: 1
+
+    Returns
+    -------
+    `BSpline` instance or tuple
+        Spline of order k2=k-n representing the derivative
+        of the input spline.
+        A tuple is returned if the input argument `tck` is a tuple, otherwise
+        a BSpline object is constructed and returned.
+
+    See Also
+    --------
+    splantider, splev, spalde
+    BSpline
+
+    Notes
+    -----
+
+    .. versionadded:: 0.13.0
+
+    Examples
+    --------
+    This can be used for finding maxima of a curve:
+
+    >>> from scipy.interpolate import splrep, splder, sproot
+    >>> import numpy as np
+    >>> x = np.linspace(0, 10, 70)
+    >>> y = np.sin(x)
+    >>> spl = splrep(x, y, k=4)
+
+    Now, differentiate the spline and find the zeros of the
+    derivative. (NB: `sproot` only works for order 3 splines, so we
+    fit an order 4 spline):
+
+    >>> dspl = splder(spl)
+    >>> sproot(dspl) / np.pi
+    array([ 0.50000001,  1.5       ,  2.49999998])
+
+    This agrees well with roots :math:`\\pi/2 + n\\pi` of
+    :math:`\\cos(x) = \\sin'(x)`.
+
+    A comparison between `splev`, `splder` and `spalde` to compute the derivatives of a
+    B-spline can be found in the `spalde` examples section.
+
+    """
+    if isinstance(tck, BSpline):
+        return tck.derivative(n)  # delegate to the BSpline method
+    else:
+        return _impl.splder(tck, n)  # tuple input goes to the implementation module
+
+
+@xp_capabilities(out_of_scope=True)
+def splantider(tck, n=1):
+    """
+    Compute the spline for the antiderivative (integral) of a given spline.
+
+    .. legacy:: function
+
+        Specifically, we recommend constructing a `BSpline` object and using its
+        ``antiderivative`` method.
+
+    Parameters
+    ----------
+    tck : BSpline instance or a tuple of (t, c, k)
+        Spline whose antiderivative to compute.
+    n : int, optional
+        Order of antiderivative to evaluate. Default: 1
+
+    Returns
+    -------
+    BSpline instance or a tuple of (t2, c2, k2)
+        Spline of order k2=k+n representing the antiderivative of the input
+        spline.
+        A tuple is returned iff the input argument `tck` is a tuple, otherwise
+        a BSpline object is constructed and returned.
+
+    See Also
+    --------
+    splder, splev, spalde
+    BSpline
+
+    Notes
+    -----
+    The `splder` function is the inverse operation of this function.
+    Namely, ``splder(splantider(tck))`` is identical to `tck`, modulo
+    rounding error.
+
+    .. versionadded:: 0.13.0
+
+    Examples
+    --------
+    >>> from scipy.interpolate import splrep, splder, splantider, splev
+    >>> import numpy as np
+    >>> x = np.linspace(0, np.pi/2, 70)
+    >>> y = 1 / np.sqrt(1 - 0.8*np.sin(x)**2)
+    >>> spl = splrep(x, y)
+
+    The derivative is the inverse operation of the antiderivative,
+    although some floating point error accumulates:
+
+    >>> splev(1.7, spl), splev(1.7, splder(splantider(spl)))
+    (array(2.1565429877197317), array(2.1565429877201865))
+
+    Antiderivative can be used to evaluate definite integrals:
+
+    >>> ispl = splantider(spl)
+    >>> splev(np.pi/2, ispl) - splev(0, ispl)
+    2.2572053588768486
+
+    This is indeed an approximation to the complete elliptic integral
+    :math:`K(m) = \\int_0^{\\pi/2} [1 - m\\sin^2 x]^{-1/2} dx`:
+
+    >>> from scipy.special import ellipk
+    >>> ellipk(0.8)
+    2.2572053268208538
+
+    """
+    if isinstance(tck, BSpline):
+        return tck.antiderivative(n)  # delegate to the BSpline method
+    else:
+        return _impl.splantider(tck, n)  # tuple input goes to the implementation module
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_ndbspline.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_ndbspline.py
new file mode 100644
index 0000000000000000000000000000000000000000..408bdede1ba2f628752b69bd6409df5c60577202
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_ndbspline.py
@@ -0,0 +1,518 @@
+import itertools
+import functools
+import operator
+import numpy as np
+
+from math import prod
+from types import GenericAlias
+
+from . import _dierckx  # type: ignore[attr-defined]
+
+import scipy.sparse.linalg as ssl
+from scipy.sparse import csr_array
+from scipy._lib._array_api import array_namespace, xp_capabilities
+
+from ._bsplines import _not_a_knot, BSpline
+
+__all__ = ["NdBSpline"]
+
+
+def _get_dtype(dtype):
+    """Return np.complex128 for any complex `dtype`, np.float64 otherwise."""
+    if np.issubdtype(dtype, np.complexfloating):
+        return np.complex128
+    else:
+        return np.float64
+
+
+@xp_capabilities(
+    cpu_only=True, jax_jit=False,
+    skip_backends=[
+        ("dask.array",
+         "https://github.com/data-apis/array-api-extra/issues/488")
+    ]
+)
+class NdBSpline:
+    """Tensor product spline object.
+
+    The value at point ``xp = (x1, x2, ..., xN)`` is evaluated as a linear
+    combination of products of one-dimensional b-splines in each of the ``N``
+    dimensions::
+
+       c[i1, i2, ..., iN] * B(x1; i1, t1) * B(x2; i2, t2) * ... * B(xN; iN, tN)
+
+
+    Here ``B(x; i, t)`` is the ``i``-th b-spline defined by the knot vector
+    ``t`` evaluated at ``x``.
+
+    Parameters
+    ----------
+    t : tuple of 1D ndarrays
+        knot vectors in directions 1, 2, ... N,
+        ``len(t[i]) == n[i] + k + 1``
+    c : ndarray, shape (n1, n2, ..., nN, ...)
+        b-spline coefficients
+    k : int or length-d tuple of integers
+        spline degrees.
+        A single integer is interpreted as having this degree for
+        all dimensions.
+    extrapolate : bool, optional
+        Whether to extrapolate out-of-bounds inputs, or return `nan`.
+        Default is to extrapolate.
+
+    Attributes
+    ----------
+    t : tuple of ndarrays
+        Knot vectors.
+    c : ndarray
+        Coefficients of the tensor-product spline.
+    k : tuple of integers
+        Degrees for each dimension.
+    extrapolate : bool, optional
+        Whether to extrapolate or return nans for out-of-bounds inputs.
+        Defaults to true.
+
+    Methods
+    -------
+    __call__
+    derivative
+    design_matrix
+
+    See Also
+    --------
+    BSpline : a one-dimensional B-spline object
+    NdPPoly : an N-dimensional piecewise tensor product polynomial
+
+    """
+
+    # generic type compatibility with scipy-stubs
+    __class_getitem__ = classmethod(GenericAlias)
+
+    def __init__(self, t, c, k, *, extrapolate=None):
+        self._k, self._indices_k1d, (self._t, self._len_t) = _preprocess_inputs(k, t)
+
+        self._asarray = array_namespace(c, *t).asarray  # reused by t, c, __call__
+
+        if extrapolate is None:
+            extrapolate = True  # the documented default
+        self.extrapolate = bool(extrapolate)
+
+        self._c = np.asarray(c)
+
+        ndim = self._t.shape[0]   # == len(self.t)
+        if self._c.ndim < ndim:
+            raise ValueError(f"Coefficients must be at least {ndim}-dimensional.")
+
+        for d in range(ndim):
+            td = self.t[d]
+            kd = self.k[d]
+            n = td.shape[0] - kd - 1  # coefficient count implied by knots and degree
+
+            if self._c.shape[d] != n:  # an exact match is required
+                raise ValueError(f"Knots, coefficients and degree in dimension"
+                                 f" {d} are inconsistent:"
+                                 f" got {self._c.shape[d]} coefficients for"
+                                 f" {len(td)} knots, need at least {n} for"
+                                 f" k={k}.")
+
+        dt = _get_dtype(self._c.dtype)  # np.float64 or np.complex128
+        self._c = np.ascontiguousarray(self._c, dtype=dt)
+
+    @property
+    def k(self):
+        return tuple(self._k)  # per-dimension degrees, repacked as a tuple
+
+    @property
+    def t(self):
+        # repack the padded 2D knot array into a tuple of 1D knot vectors
+        return tuple(
+            self._asarray(self._t[d, :self._len_t[d]]) for d in range(self._t.shape[0])
+        )
+
+    @property
+    def c(self):
+        return self._asarray(self._c)  # coefficients, via the stored `asarray`
+
+    def __call__(self, xi, *, nu=None, extrapolate=None):
+        """Evaluate the tensor product b-spline at ``xi``.
+
+        Parameters
+        ----------
+        xi : array_like, shape(..., ndim)
+            The coordinates to evaluate the interpolator at.
+            This can be a list or tuple of ndim-dimensional points
+            or an array with the shape (num_points, ndim).
+        nu : sequence of length ``ndim``, optional
+            Orders of derivatives to evaluate. Each must be non-negative.
+            Defaults to the zeroth derivative.
+        extrapolate : bool, optional
+            Whether to extrapolate based on first and last intervals in each
+            dimension, or return `nan`. Default is ``self.extrapolate``.
+
+        Returns
+        -------
+        values : ndarray, shape ``xi.shape[:-1] + self.c.shape[ndim:]``
+            Interpolated values at ``xi``
+        """
+        ndim = self._t.shape[0]  # == len(self.t)
+
+        if extrapolate is None:
+            extrapolate = self.extrapolate
+        extrapolate = bool(extrapolate)
+
+        if nu is None:
+            nu = np.zeros((ndim,), dtype=np.int64)
+        else:
+            nu = np.asarray(nu, dtype=np.int64)
+            if nu.ndim != 1 or nu.shape[0] != ndim:
+                raise ValueError(
+                    f"invalid number of derivative orders {nu = } for "
+                    f"ndim = {len(self.t)}.")
+            if any(nu < 0):
+                raise ValueError(f"derivatives must be non-negative, got {nu = }")
+
+        # prepare xi : shape (..., ndim) -> (npoints, ndim)
+        xi = np.asarray(xi, dtype=float)
+        xi_shape = xi.shape
+        xi = xi.reshape(-1, xi_shape[-1])
+        xi = np.ascontiguousarray(xi)
+
+        if xi_shape[-1] != ndim:
+            raise ValueError(f"Shapes: xi.shape={xi_shape} and ndim={ndim}")
+
+        # complex -> double
+        was_complex = self._c.dtype.kind == 'c'
+        cc = self._c
+        if was_complex and self._c.ndim == ndim:
+            # make sure that core dimensions are intact, and complex->float
+            # size doubling only adds a trailing dimension
+            cc = self._c[..., None]
+        cc = cc.view(float)
+
+        # prepare the coefficients: flatten the trailing dimensions
+        c1 = cc.reshape(cc.shape[:ndim] + (-1,))
+        c1r = c1.ravel()
+
+        # replacement for np.ravel_multi_index for indexing of `c1`:
+        _strides_c1 = np.asarray([s // c1.dtype.itemsize
+                                  for s in c1.strides], dtype=np.int64)
+
+        num_c_tr = c1.shape[-1]  # # of trailing coefficients
+        out = _dierckx.evaluate_ndbspline(xi,
+                                 self._t,
+                                 self._len_t,
+                                 self._k,
+                                 nu,
+                                 extrapolate,
+                                 c1r,
+                                 num_c_tr,
+                                 _strides_c1,
+                                 self._indices_k1d,
+        )
+        out = out.view(self._c.dtype)
+        out = out.reshape(xi_shape[:-1] + self._c.shape[ndim:])
+        return self._asarray(out)
+
+    @classmethod
+    def design_matrix(cls, xvals, t, k, extrapolate=True):
+        """Construct the design matrix as a CSR format sparse array.
+
+        Parameters
+        ----------
+        xvals :  ndarray, shape(npts, ndim)
+            Data points. ``xvals[j, :]`` gives the ``j``-th data point as an
+            ``ndim``-dimensional array.
+        t : tuple of 1D ndarrays, length-ndim
+            Knot vectors in directions 1, 2, ... ndim.
+        k : int or length-ndim tuple of integers
+            B-spline degree.
+        extrapolate : bool, optional
+            Whether to extrapolate out-of-bounds values or raise a `ValueError`
+
+        Returns
+        -------
+        design_matrix : a CSR array
+            Each row of the design matrix corresponds to a value in `xvals` and
+            contains values of b-spline basis elements which are non-zero
+            at this value.
+
+        """
+        xvals = np.asarray(xvals, dtype=float)
+        ndim = xvals.shape[-1]
+        if len(t) != ndim:
+            raise ValueError(
+                f"Data and knots are inconsistent: len(t) = {len(t)} for "
+                f" {ndim = }."
+            )
+
+        # tabulate the flat indices for iterating over the (k+1)**ndim subarray
+        k, _indices_k1d, (_t, len_t) = _preprocess_inputs(k, t)
+
+        # Precompute the shape and strides of the 'coefficients array'.
+        # This would have been the NdBSpline coefficients; in the present context
+        # this is a helper to compute the indices into the collocation matrix.
+        c_shape = tuple(len_t[d] - k[d] - 1 for d in range(ndim))
+
+        # The strides of the coeffs array: the computation is equivalent to
+        # >>> cstrides = [s // 8 for s in np.empty(c_shape).strides]
+        cs = c_shape[1:] + (1,)
+        cstrides = np.cumprod(cs[::-1], dtype=np.int64)[::-1].copy()
+
+        # heavy lifting happens here. NOTE(review): `extrapolate` is never used
+        data, indices, indptr = _dierckx._coloc_nd(xvals,
+                _t, len_t, k, _indices_k1d, cstrides)
+
+        return csr_array((data, indices, indptr))
+
+    def _bspline_derivative_along_axis(self, c, t, k, axis, nu=1):
+        # Move the selected axis to the front so each column is one 1D spline
+        c = np.moveaxis(c, axis, 0)
+        n = c.shape[0]
+        trailing_shape = c.shape[1:]
+        c_flat = c.reshape(n, -1)  # one column per 1D coefficient vector
+
+        new_c_list = []
+        new_t = None
+
+        for i in range(c_flat.shape[1]):
+            if k >= nu:
+                b = BSpline.construct_fast(t, c_flat[:, i], k)
+                db = b.derivative(nu)
+                # truncate coefficients to match new knot/degree size
+                db.c = db.c[:len(db.t) - db.k - 1]
+            else:
+                db = BSpline.construct_fast(t, np.zeros(len(t) - 1), 0)  # zero spline
+
+            if new_t is None:
+                new_t = db.t  # only the first column's knots are kept
+
+            new_c_list.append(db.c)
+
+        new_c = np.stack(new_c_list, axis=1).reshape(
+            (len(new_c_list[0]),) + trailing_shape)
+        new_c = np.moveaxis(new_c, 0, axis)  # restore the original axis position
+
+        return new_c, new_t
+
+    def derivative(self, nu):
+        """
+        Construct a new NdBSpline representing the partial derivative.
+
+        Parameters
+        ----------
+        nu : array_like of shape (ndim,)
+            Orders of the partial derivatives to compute along each dimension.
+
+        Returns
+        -------
+        NdBSpline
+            A new NdBSpline representing the partial derivative of the original spline.
+
+        """
+        nu_arr = np.asarray(nu, dtype=np.int64)
+        ndim = len(self.t)
+
+        if nu_arr.ndim != 1 or nu_arr.shape[0] != ndim:
+            raise ValueError(
+                f"invalid number of derivative orders {nu = } for "
+                f"ndim = {len(self.t)}.")
+
+        if any(nu_arr < 0):
+            raise ValueError(f"derivative orders must be non-negative, got {nu = }")
+
+        # extract t and c as numpy arrays
+        t_new = [self._t[d, :self._len_t[d]] for d in range(self._t.shape[0])]
+        k_new = list(self.k)
+        c_new = self._c.copy()
+
+        for axis, n in enumerate(nu_arr):
+            if n == 0:  # order zero leaves this axis untouched
+                continue
+
+            c_new, t_new[axis] = self._bspline_derivative_along_axis(
+                c_new, t_new[axis], k_new[axis], axis, nu=n
+            )
+            k_new[axis] = max(k_new[axis] - n, 0)  # degree is clamped at zero
+
+        return NdBSpline(tuple(self._asarray(t) for t in t_new),
+                         self._asarray(c_new),
+                         tuple(k_new),
+                         extrapolate=self.extrapolate
+        )
+
+def _preprocess_inputs(k, t_tpl):
+    """Helper: validate and preprocess NdBSpline inputs.
+
+       Parameters
+       ----------
+       k : int or tuple
+          Spline degrees (a single int is used for every dimension).
+       t_tpl : tuple of array-likes
+          Knot vectors, one per dimension.
+    """
+    # 1. Make sure t_tpl is a tuple
+    if not isinstance(t_tpl, tuple):
+        raise ValueError(f"Expect `t` to be a tuple of array-likes. "
+                         f"Got {t_tpl} instead."
+        )
+
+    # 2. Make ``k`` a tuple of integers
+    ndim = len(t_tpl)
+    try:
+        len(k)
+    except TypeError:
+        # make k a tuple
+        k = (k,)*ndim
+
+    k = np.asarray([operator.index(ki) for ki in k], dtype=np.int64)
+
+    if len(k) != ndim:
+        raise ValueError(f"len(t) = {len(t_tpl)} != {len(k) = }.")
+
+    # 3. Validate inputs
+    ndim = len(t_tpl)
+    for d in range(ndim):
+        td = np.asarray(t_tpl[d])
+        kd = k[d]
+        n = td.shape[0] - kd - 1
+        if kd < 0:
+            raise ValueError(f"Spline degree in dimension {d} cannot be"
+                             f" negative.")
+        if td.ndim != 1:
+            raise ValueError(f"Knot vector in dimension {d} must be"
+                             f" one-dimensional.")
+        if n < kd + 1:
+            raise ValueError(f"Need at least {2*kd + 2} knots for degree"
+                             f" {kd} in dimension {d}.")
+        if (np.diff(td) < 0).any():
+            raise ValueError(f"Knots in dimension {d} must be in a"
+                             f" non-decreasing order.")
+        if len(np.unique(td[kd:n + 1])) < 2:
+            raise ValueError(f"Need at least two internal knots in"
+                             f" dimension {d}.")
+        if not np.isfinite(td).all():
+            raise ValueError(f"Knots in dimension {d} should not have"
+                             f" nans or infs.")
+
+    # 4. tabulate the flat indices for iterating over the (k+1)**ndim subarray
+    # of non-zero b-spline elements
+    shape = tuple(kd + 1 for kd in k)
+    indices = np.unravel_index(np.arange(prod(shape)), shape)
+    _indices_k1d = np.asarray(indices, dtype=np.int64).T.copy()
+
+    # 5. pack the knots into a single array, padding short rows with nan:
+    #    ([1, 2, 3, 4], [5, 6], (7, 8, 9)) -->
+    #    array([[1, 2, 3, 4],
+    #           [5, 6, nan, nan],
+    #           [7, 8, 9, nan]])
+    t_tpl = [np.asarray(t) for t in t_tpl]
+    ndim = len(t_tpl)
+    len_t = [len(ti) for ti in t_tpl]
+    _t = np.empty((ndim, max(len_t)), dtype=float)
+    _t.fill(np.nan)
+    for d in range(ndim):
+        _t[d, :len(t_tpl[d])] = t_tpl[d]
+    len_t = np.asarray(len_t, dtype=np.int64)
+
+    return k, _indices_k1d, (_t, len_t)
+
+
+def _iter_solve(a, b, solver=ssl.gcrotmk, **solver_args):
+    # work around iterative solvers not accepting multiple r.h.s. (solve per column)
+
+    # also work around a.dtype == float64 and b.dtype == complex128
+    # cf https://github.com/scipy/scipy/issues/19644
+    if np.issubdtype(b.dtype, np.complexfloating):
+        real = _iter_solve(a, b.real, solver, **solver_args)
+        imag = _iter_solve(a, b.imag, solver, **solver_args)
+        return real + 1j*imag
+
+    if b.ndim == 2 and b.shape[1] !=1:
+        res = np.empty_like(b)
+        for j in range(b.shape[1]):
+            res[:, j], info = solver(a, b[:, j], **solver_args)
+            if info != 0:  # any non-zero `info` is reported as an error
+                raise ValueError(f"{solver = } returns {info =} for column {j}.")
+        return res
+    else:
+        res, info = solver(a, b, **solver_args)
+        if info != 0:  # any non-zero `info` is reported as an error
+            raise ValueError(f"{solver = } returns {info = }.")
+        return res
+
+
+def make_ndbspl(points, values, k=3, *, solver=ssl.gcrotmk, **solver_args):
+    """Construct an interpolating NdBSpline.
+
+    Parameters
+    ----------
+    points : tuple of ndarrays of float, with shapes (m1,), ... (mN,)
+        The points defining the regular grid in N dimensions. The points in
+        each dimension (i.e. every element of the `points` tuple) must be
+        strictly ascending or descending.
+    values : ndarray of float, shape (m1, ..., mN, ...)
+        The data on the regular grid in n dimensions.
+    k : int or length-N tuple of ints, optional
+        The spline degree. Must be odd. Default is cubic, k=3
+    solver : a `scipy.sparse.linalg` solver (iterative or direct), optional.
+        An iterative solver from `scipy.sparse.linalg` or a direct one,
+        `scipy.sparse.linalg.spsolve`.
+        Used to solve the sparse linear system
+        ``design_matrix @ coefficients = rhs`` for the coefficients.
+        Default is `scipy.sparse.linalg.gcrotmk`
+    solver_args : dict, optional
+        Additional arguments for the solver. The call signature is
+        ``solver(csr_array, rhs_vector, **solver_args)``
+
+    Returns
+    -------
+    spl : NdBSpline object
+
+    Notes
+    -----
+    Boundary conditions are not-a-knot in all dimensions.
+    """
+    ndim = len(points)
+    xi_shape = tuple(len(x) for x in points)
+
+    try:
+        len(k)
+    except TypeError:
+        # make k a tuple
+        k = (k,)*ndim
+
+    for d, point in enumerate(points):
+        numpts = len(np.atleast_1d(point))
+        if numpts <= k[d]:
+            raise ValueError(f"There are {numpts} points in dimension {d},"
+                             f" but order {k[d]} requires at least "
+                             f" {k[d]+1} points per dimension.")
+
+    t = tuple(_not_a_knot(np.asarray(points[d], dtype=float), k[d])
+              for d in range(ndim))
+    xvals = np.asarray([xv for xv in itertools.product(*points)], dtype=float)
+
+    # construct the collocation matrix
+    matr = NdBSpline.design_matrix(xvals, t, k)
+
+    # Remove zeros from the sparse matrix
+    # If k=1, then solve() doesn't take long enough for this to help
+    if k[0] >= 3:  # only the first dimension's degree is inspected
+        matr.eliminate_zeros()
+
+    # Solve for the coefficients given `values`.
+    # Trailing dimensions: first ndim dimensions are data, the rest are batch
+    # dimensions, so stack `values` into a 2D array for `spsolve` to understand.
+    v_shape = values.shape
+    vals_shape = (prod(v_shape[:ndim]), prod(v_shape[ndim:]))
+    vals = values.reshape(vals_shape)
+
+    if solver != ssl.spsolve:
+        solver = functools.partial(_iter_solve, solver=solver)
+        if "atol" not in solver_args:
+            # avoid a DeprecationWarning, grumble grumble
+            solver_args["atol"] = 1e-6
+
+    coef = solver(matr, vals, **solver_args)
+    coef = coef.reshape(xi_shape + v_shape[ndim:])
+    return NdBSpline(t, coef, k)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_ndgriddata.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_ndgriddata.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0a36945e5cf9d2dc8dcb6fb9bd256a066a0816a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_ndgriddata.py
@@ -0,0 +1,330 @@
+"""
+Convenience interface to N-D interpolation
+
+.. versionadded:: 0.9
+
+"""
+import numpy as np
+from ._interpnd import (LinearNDInterpolator, NDInterpolatorBase,
+     CloughTocher2DInterpolator, _ndim_coords_from_arrays)
+from scipy.spatial import cKDTree
+
+__all__ = ['griddata', 'NearestNDInterpolator', 'LinearNDInterpolator',
+           'CloughTocher2DInterpolator']
+
+#------------------------------------------------------------------------------
+# Nearest-neighbor interpolation
+#------------------------------------------------------------------------------
+
+
+class NearestNDInterpolator(NDInterpolatorBase):
+    """Nearest-neighbor interpolator in N > 1 dimensions.
+
+    Methods
+    -------
+    __call__
+
+    Parameters
+    ----------
+    x : (npoints, ndims) 2-D ndarray of floats
+        Data point coordinates.
+    y : (npoints, ...) N-D ndarray of float or complex
+        Data values. The length of `y` along the first axis must be equal to
+        the length of `x`.
+    rescale : boolean, optional
+        Rescale points to unit cube before performing interpolation.
+        This is useful if some of the input dimensions have
+        incommensurable units and differ by many orders of magnitude.
+
+        .. versionadded:: 0.14.0
+    tree_options : dict, optional
+        Options passed to the underlying ``cKDTree``.
+
+        .. versionadded:: 0.17.0
+
+    See Also
+    --------
+    griddata :
+        Interpolate unstructured N-D data.
+    LinearNDInterpolator :
+        Piecewise linear interpolator in N dimensions.
+    CloughTocher2DInterpolator :
+        Piecewise cubic, C1 smooth, curvature-minimizing interpolator in 2D.
+    interpn : Interpolation on a regular grid or rectilinear grid.
+    RegularGridInterpolator : Interpolator on a regular or rectilinear grid
+                              in arbitrary dimensions (`interpn` wraps this
+                              class).
+
+    Notes
+    -----
+    Uses ``scipy.spatial.cKDTree``
+
+    .. note:: For data on a regular grid use `interpn` instead.
+
+    Examples
+    --------
+    We can interpolate values on a 2D plane:
+
+    >>> from scipy.interpolate import NearestNDInterpolator
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> rng = np.random.default_rng()
+    >>> x = rng.random(10) - 0.5
+    >>> y = rng.random(10) - 0.5
+    >>> z = np.hypot(x, y)
+    >>> X = np.linspace(min(x), max(x))
+    >>> Y = np.linspace(min(y), max(y))
+    >>> X, Y = np.meshgrid(X, Y)  # 2D grid for interpolation
+    >>> interp = NearestNDInterpolator(list(zip(x, y)), z)
+    >>> Z = interp(X, Y)
+    >>> plt.pcolormesh(X, Y, Z, shading='auto')
+    >>> plt.plot(x, y, "ok", label="input point")
+    >>> plt.legend()
+    >>> plt.colorbar()
+    >>> plt.axis("equal")
+    >>> plt.show()
+
+    """
+
+    def __init__(self, x, y, rescale=False, tree_options=None):
+        NDInterpolatorBase.__init__(self, x, y, rescale=rescale,
+                                    need_contiguous=False,
+                                    need_values=False)
+        if tree_options is None:
+            tree_options = dict()  # no extra cKDTree options by default
+        self.tree = cKDTree(self.points, **tree_options)
+        self.values = np.asarray(y)  # set here; the base got need_values=False
+
+    def __call__(self, *args, **query_options):
+        """
+        Evaluate interpolator at given points.
+
+        Parameters
+        ----------
+        x1, x2, ... xn : array-like of float
+            Points where to interpolate data at.
+            x1, x2, ... xn can be array-like of float with broadcastable shape,
+            or x1 can be array-like of float with shape ``(..., ndim)``
+        **query_options
+            This allows ``eps``, ``p``, ``distance_upper_bound``, and ``workers``
+            being passed to the cKDTree's query function to be explicitly set.
+            See `scipy.spatial.cKDTree.query` for an overview of the different options.
+
+            .. versionadded:: 1.12.0
+
+        """
+        # For the sake of enabling subclassing, NDInterpolatorBase._set_xi performs
+        # some operations which are not required by NearestNDInterpolator.__call__,
+        # hence here we operate on xi directly, without calling a parent class function.
+        xi = _ndim_coords_from_arrays(args, ndim=self.points.shape[1])
+        xi = self._check_call_shape(xi)
+        xi = self._scale_x(xi)
+
+        # We need to handle two important cases:
+        # (1) the case where xi has trailing dimensions (..., ndim), and
+        # (2) the case where y has trailing dimensions
+        # We will first flatten xi to deal with case (1),
+        # do the computation in flattened array while retaining y's dimensionality,
+        # and then reshape the interpolated values back to match xi's shape.
+
+        # Flatten xi for the query
+        xi_flat = xi.reshape(-1, xi.shape[-1])
+        original_shape = xi.shape
+        flattened_shape = xi_flat.shape
+
+        # if distance_upper_bound is set to not be infinite,
+        # then we need to consider the case where cKDTree
+        # does not find any points within distance_upper_bound to return.
+        # It marks those points as having infinite distance, which is what will be used
+        # below to mask the array and return only the points that were deemed
+        # to have a close enough neighbor to return something useful.
+        dist, i = self.tree.query(xi_flat, **query_options)
+        valid_mask = np.isfinite(dist)
+
+        # create a holder interp_values array and fill with nans.
+        if self.values.ndim > 1:
+            interp_shape = flattened_shape[:-1] + self.values.shape[1:]
+        else:
+            interp_shape = flattened_shape[:-1]
+
+        if np.issubdtype(self.values.dtype, np.complexfloating):
+            interp_values = np.full(interp_shape, np.nan, dtype=self.values.dtype)
+        else:
+            interp_values = np.full(interp_shape, np.nan)
+
+        interp_values[valid_mask] = self.values[i[valid_mask], ...]
+
+        if self.values.ndim > 1:
+            new_shape = original_shape[:-1] + self.values.shape[1:]
+        else:
+            new_shape = original_shape[:-1]
+        interp_values = interp_values.reshape(new_shape)
+
+        return interp_values
+
+
+#------------------------------------------------------------------------------
+# Convenience interface function
+#------------------------------------------------------------------------------
+
+
+def griddata(points, values, xi, method='linear', fill_value=np.nan,
+             rescale=False):
+    """
+    Convenience function for interpolating unstructured data in multiple dimensions.
+
+    Parameters
+    ----------
+    points : 2-D ndarray of floats with shape (n, D), or length D tuple of 1-D ndarrays with shape (n,).
+        Data point coordinates.
+    values : ndarray of float or complex, shape (n,)
+        Data values.
+    xi : 2-D ndarray of floats with shape (m, D), or length D tuple of ndarrays broadcastable to the same shape.
+        Points at which to interpolate data.
+    method : {'linear', 'nearest', 'cubic'}, optional
+        Method of interpolation. One of
+
+        ``nearest``
+          return the value at the data point closest to
+          the point of interpolation. See `NearestNDInterpolator` for
+          more details.
+
+        ``linear``
+          tessellate the input point set to N-D
+          simplices, and interpolate linearly on each simplex. See
+          `LinearNDInterpolator` for more details.
+
+        ``cubic`` (1-D)
+          return the value determined from a cubic
+          spline.
+
+        ``cubic`` (2-D)
+          return the value determined from a
+          piecewise cubic, continuously differentiable (C1), and
+          approximately curvature-minimizing polynomial surface. See
+          `CloughTocher2DInterpolator` for more details.
+    fill_value : float, optional
+        Value used to fill in for requested points outside of the
+        convex hull of the input points. If not provided, then the
+        default is ``nan``. This option has no effect for the
+        'nearest' method.
+    rescale : bool, optional
+        Rescale points to unit cube before performing interpolation.
+        This is useful if some of the input dimensions have
+        incommensurable units and differ by many orders of magnitude.
+
+        .. versionadded:: 0.14.0
+
+    Returns
+    -------
+    ndarray
+        Array of interpolated values.
+
+    See Also
+    --------
+    LinearNDInterpolator :
+        Piecewise linear interpolator in N dimensions.
+    NearestNDInterpolator :
+        Nearest-neighbor interpolator in N dimensions.
+    CloughTocher2DInterpolator :
+        Piecewise cubic, C1 smooth, curvature-minimizing interpolator in 2D.
+    interpn : Interpolation on a regular grid or rectilinear grid.
+    RegularGridInterpolator : Interpolator on a regular or rectilinear grid
+                              in arbitrary dimensions (`interpn` wraps this
+                              class).
+
+    Notes
+    -----
+
+    .. versionadded:: 0.9
+
+    .. note:: For data on a regular grid use `interpn` instead.
+
+    Examples
+    --------
+
+    Suppose we want to interpolate the 2-D function
+
+    >>> import numpy as np
+    >>> def func(x, y):
+    ...     return x*(1-x)*np.cos(4*np.pi*x) * np.sin(4*np.pi*y**2)**2
+
+    on a grid in [0, 1]x[0, 1]
+
+    >>> grid_x, grid_y = np.mgrid[0:1:100j, 0:1:200j]
+
+    but we only know its values at 1000 data points:
+
+    >>> rng = np.random.default_rng()
+    >>> points = rng.random((1000, 2))
+    >>> values = func(points[:,0], points[:,1])
+
+    This can be done with `griddata` -- below we try out all of the
+    interpolation methods:
+
+    >>> from scipy.interpolate import griddata
+    >>> grid_z0 = griddata(points, values, (grid_x, grid_y), method='nearest')
+    >>> grid_z1 = griddata(points, values, (grid_x, grid_y), method='linear')
+    >>> grid_z2 = griddata(points, values, (grid_x, grid_y), method='cubic')
+
+    One can see that the exact result is reproduced by all of the
+    methods to some degree, but for this smooth function the piecewise
+    cubic interpolant gives the best results:
+
+    >>> import matplotlib.pyplot as plt
+    >>> plt.subplot(221)
+    >>> plt.imshow(func(grid_x, grid_y).T, extent=(0,1,0,1), origin='lower')
+    >>> plt.plot(points[:,0], points[:,1], 'k.', ms=1)
+    >>> plt.title('Original')
+    >>> plt.subplot(222)
+    >>> plt.imshow(grid_z0.T, extent=(0,1,0,1), origin='lower')
+    >>> plt.title('Nearest')
+    >>> plt.subplot(223)
+    >>> plt.imshow(grid_z1.T, extent=(0,1,0,1), origin='lower')
+    >>> plt.title('Linear')
+    >>> plt.subplot(224)
+    >>> plt.imshow(grid_z2.T, extent=(0,1,0,1), origin='lower')
+    >>> plt.title('Cubic')
+    >>> plt.gcf().set_size_inches(6, 6)
+    >>> plt.show()
+
+    """ # numpy/numpydoc#87  # noqa: E501
+
+    points = _ndim_coords_from_arrays(points)
+
+    if points.ndim < 2:
+        ndim = points.ndim
+    else:
+        ndim = points.shape[-1]
+
+    if ndim == 1 and method in ('nearest', 'linear', 'cubic'):
+        from ._interpolate import interp1d
+        points = points.ravel()  # NOTE(review): `rescale` is unused in this branch
+        if isinstance(xi, tuple):
+            if len(xi) != 1:
+                raise ValueError("invalid number of dimensions in xi")
+            xi, = xi
+        # Sort points/values together, necessary as input for interp1d
+        idx = np.argsort(points)
+        points = points[idx]
+        values = np.asarray(values)[idx]  # array-likes (e.g. lists) are accepted
+        if method == 'nearest':
+            fill_value = 'extrapolate'
+        ip = interp1d(points, values, kind=method, axis=0, bounds_error=False,
+                      fill_value=fill_value)
+        return ip(xi)
+    elif method == 'nearest':
+        ip = NearestNDInterpolator(points, values, rescale=rescale)
+        return ip(xi)
+    elif method == 'linear':
+        ip = LinearNDInterpolator(points, values, fill_value=fill_value,
+                                  rescale=rescale)
+        return ip(xi)
+    elif method == 'cubic' and ndim == 2:
+        ip = CloughTocher2DInterpolator(points, values, fill_value=fill_value,
+                                        rescale=rescale)
+        return ip(xi)
+    else:
+        raise ValueError(
+            f"Unknown interpolation method {method!r} for {ndim} dimensional data"
+        )
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_pade.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_pade.py
new file mode 100644
index 0000000000000000000000000000000000000000..387ef11dde5d3ace8a15324058c10fa31899c92c
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_pade.py
@@ -0,0 +1,67 @@
+from numpy import zeros, asarray, eye, poly1d, hstack, r_
+from scipy import linalg
+
+__all__ = ["pade"]
+
+def pade(an, m, n=None):
+    """
+    Return Pade approximation to a polynomial as the ratio of two polynomials.
+
+    Parameters
+    ----------
+    an : (N,) array_like
+        Taylor series coefficients.
+    m : int
+        The order of the returned approximating polynomial `q`.
+    n : int, optional
+        The order of the returned approximating polynomial `p`. By default,
+        the order is ``len(an)-1-m``.
+
+    Returns
+    -------
+    p, q : Polynomial class
+        The Pade approximation of the polynomial defined by `an` is
+        ``p(x)/q(x)``.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.interpolate import pade
+    >>> e_exp = [1.0, 1.0, 1.0/2.0, 1.0/6.0, 1.0/24.0, 1.0/120.0]
+    >>> p, q = pade(e_exp, 2)
+
+    >>> e_exp.reverse()
+    >>> e_poly = np.poly1d(e_exp)
+
+    Compare ``e_poly(x)`` and the Pade approximation ``p(x)/q(x)``
+
+    >>> e_poly(1)
+    2.7166666666666668
+
+    >>> p(1)/q(1)
+    2.7179487179487181
+
+    """
+    an = asarray(an)
+    if n is None:
+        n = len(an) - 1 - m
+        if n < 0:
+            raise ValueError("Order of q <m> must be smaller than len(an)-1.")
+    if n < 0:
+        raise ValueError("Order of p <n> must be greater than 0.")
+    N = m + n
+    if N > len(an)-1:
+        raise ValueError("Order of q+p <m+n> must be smaller than len(an).")
+    an = an[:N+1]
+    Akj = eye(N+1, n+1, dtype=an.dtype)
+    Bkj = zeros((N+1, m), dtype=an.dtype)
+    for row in range(1, m+1):
+        Bkj[row,:row] = -(an[:row])[::-1]
+    for row in range(m+1, N+1):
+        Bkj[row,:] = -(an[row-m:row])[::-1]
+    C = hstack((Akj, Bkj))
+    pq = linalg.solve(C, an)
+    p = pq[:n+1]
+    q = r_[1.0, pq[n+1:]]
+    return poly1d(p[::-1]), poly1d(q[::-1])
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_polyint.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_polyint.py
new file mode 100644
index 0000000000000000000000000000000000000000..e02065f552827a602656077d4383474016933eda
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_polyint.py
@@ -0,0 +1,1029 @@
+import warnings
+from types import GenericAlias
+
+import numpy as np
+from scipy.special import factorial
+from scipy._lib._util import (_asarray_validated, float_factorial, check_random_state,
+                              _transition_to_rng)
+
+
+__all__ = ["KroghInterpolator", "krogh_interpolate",
+           "BarycentricInterpolator", "barycentric_interpolate",
+           "approximate_taylor_polynomial"]
+
+
+def _isscalar(x):
+    """Check whether x is a scalar type, or 0-dim"""
+    return np.isscalar(x) or hasattr(x, 'shape') and x.shape == ()
+
+
+class _Interpolator1D:
+    """
+    Common features in univariate interpolation
+
+    Deal with input data type and interpolation axis rolling. The
+    actual interpolator can assume the y-data is of shape (n, r) where
+    `n` is the number of x-points, and `r` the number of variables,
+    and use self.dtype as the y-data type.
+
+    Attributes
+    ----------
+    _y_axis
+        Axis along which the interpolation goes in the original array
+    _y_extra_shape
+        Additional trailing shape of the input arrays, excluding
+        the interpolation axis.
+    dtype
+        Dtype of the y-data arrays. Can be set via _set_dtype, which
+        forces it to be float or complex.
+
+    Methods
+    -------
+    __call__
+    _prepare_x
+    _finish_y
+    _reshape_yi
+    _set_yi
+    _set_dtype
+    _evaluate
+
+    """
+
+    __slots__ = ('_y_axis', '_y_extra_shape', 'dtype')
+
+    # generic type compatibility with scipy-stubs
+    __class_getitem__ = classmethod(GenericAlias)
+
+    def __init__(self, xi=None, yi=None, axis=None):
+        self._y_axis = axis
+        self._y_extra_shape = None
+        self.dtype = None
+        if yi is not None:
+            self._set_yi(yi, xi=xi, axis=axis)
+
+    def __call__(self, x):
+        """
+        Evaluate the interpolant
+
+        Parameters
+        ----------
+        x : array_like
+            Point or points at which to evaluate the interpolant.
+
+        Returns
+        -------
+        y : array_like
+            Interpolated values. Shape is determined by replacing
+            the interpolation axis in the original array with the shape of `x`.
+
+        Notes
+        -----
+        Input values `x` must be convertible to `float` values like `int`
+        or `float`.
+
+        """
+        x, x_shape = self._prepare_x(x)
+        y = self._evaluate(x)
+        return self._finish_y(y, x_shape)
+
+    def _evaluate(self, x):
+        """
+        Actually evaluate the value of the interpolator.
+        """
+        raise NotImplementedError()
+
+    def _prepare_x(self, x):
+        """Reshape input x array to 1-D"""
+        x = _asarray_validated(x, check_finite=False, as_inexact=True)
+        x_shape = x.shape
+        return x.ravel(), x_shape
+
+    def _finish_y(self, y, x_shape):
+        """Reshape interpolated y back to an N-D array similar to initial y"""
+        y = y.reshape(x_shape + self._y_extra_shape)
+        if self._y_axis != 0 and x_shape != ():
+            nx = len(x_shape)
+            ny = len(self._y_extra_shape)
+            s = (list(range(nx, nx + self._y_axis))
+                 + list(range(nx)) + list(range(nx+self._y_axis, nx+ny)))
+            y = y.transpose(s)
+        return y
+
+    def _reshape_yi(self, yi, check=False):
+        yi = np.moveaxis(np.asarray(yi), self._y_axis, 0)
+        if check and yi.shape[1:] != self._y_extra_shape:
+            ok_shape = (f"{self._y_extra_shape[-self._y_axis:]!r} + (N,) + "
+                        f"{self._y_extra_shape[:-self._y_axis]!r}")
+            raise ValueError(f"Data must be of shape {ok_shape}")
+        return yi.reshape((yi.shape[0], -1))
+
+    def _set_yi(self, yi, xi=None, axis=None):
+        if axis is None:
+            axis = self._y_axis
+        if axis is None:
+            raise ValueError("no interpolation axis specified")
+
+        yi = np.asarray(yi)
+
+        shape = yi.shape
+        if shape == ():
+            shape = (1,)
+        if xi is not None and shape[axis] != len(xi):
+            raise ValueError("x and y arrays must be equal in length along "
+                             "interpolation axis.")
+
+        self._y_axis = (axis % yi.ndim)
+        self._y_extra_shape = yi.shape[:self._y_axis] + yi.shape[self._y_axis+1:]
+        self.dtype = None
+        self._set_dtype(yi.dtype)
+
+    def _set_dtype(self, dtype, union=False):
+        if np.issubdtype(dtype, np.complexfloating) \
+               or np.issubdtype(self.dtype, np.complexfloating):
+            self.dtype = np.complex128
+        else:
+            if not union or self.dtype != np.complex128:
+                self.dtype = np.float64
+
+
+class _Interpolator1DWithDerivatives(_Interpolator1D):
+    def derivatives(self, x, der=None):
+        """
+        Evaluate several derivatives of the polynomial at the point `x`
+
+        Produce an array of derivatives evaluated at the point `x`.
+
+        Parameters
+        ----------
+        x : array_like
+            Point or points at which to evaluate the derivatives
+        der : int or list or None, optional
+            How many derivatives to evaluate, or None for all potentially
+            nonzero derivatives (that is, a number equal to the number
+            of points), or a list of derivatives to evaluate. This number
+            includes the function value as the '0th' derivative.
+
+        Returns
+        -------
+        d : ndarray
+            Array with derivatives; ``d[j]`` contains the jth derivative.
+            Shape of ``d[j]`` is determined by replacing the interpolation
+            axis in the original array with the shape of `x`.
+
+        Examples
+        --------
+        >>> from scipy.interpolate import KroghInterpolator
+        >>> KroghInterpolator([0,0,0],[1,2,3]).derivatives(0)
+        array([1.0,2.0,3.0])
+        >>> KroghInterpolator([0,0,0],[1,2,3]).derivatives([0,0])
+        array([[1.0,1.0],
+               [2.0,2.0],
+               [3.0,3.0]])
+
+        """
+        x, x_shape = self._prepare_x(x)
+        y = self._evaluate_derivatives(x, der)
+
+        y = y.reshape((y.shape[0],) + x_shape + self._y_extra_shape)
+        if self._y_axis != 0 and x_shape != ():
+            nx = len(x_shape)
+            ny = len(self._y_extra_shape)
+            s = ([0] + list(range(nx+1, nx + self._y_axis+1))
+                 + list(range(1, nx+1)) +
+                 list(range(nx+1+self._y_axis, nx+ny+1)))
+            y = y.transpose(s)
+        return y
+
+    def derivative(self, x, der=1):
+        """
+        Evaluate a single derivative of the polynomial at the point `x`.
+
+        Parameters
+        ----------
+        x : array_like
+            Point or points at which to evaluate the derivatives
+
+        der : integer, optional
+            Which derivative to evaluate (default: first derivative).
+            This number includes the function value as 0th derivative.
+
+        Returns
+        -------
+        d : ndarray
+            Derivative interpolated at the x-points. Shape of `d` is
+            determined by replacing the interpolation axis in the
+            original array with the shape of `x`.
+
+        Notes
+        -----
+        This may be computed by evaluating all derivatives up to the desired
+        one (using self.derivatives()) and then discarding the rest.
+
+        """
+        x, x_shape = self._prepare_x(x)
+        y = self._evaluate_derivatives(x, der+1)
+        return self._finish_y(y[der], x_shape)
+
+    def _evaluate_derivatives(self, x, der=None):
+        """
+        Actually evaluate the derivatives.
+
+        Parameters
+        ----------
+        x : array_like
+            1D array of points at which to evaluate the derivatives
+        der : integer, optional
+            The number of derivatives to evaluate, from 'order 0' (der=1)
+            to order der-1.  If omitted, return all possibly-non-zero
+            derivatives, ie 0 to order n-1.
+
+        Returns
+        -------
+        d : ndarray
+            Array of shape ``(der, x.size, self.yi.shape[1])`` containing
+            the derivatives from 0 to der-1
+        """
+        raise NotImplementedError()
+
+
+class KroghInterpolator(_Interpolator1DWithDerivatives):
+    """Krogh interpolator (C∞ smooth).
+
+    The polynomial passes through all the pairs ``(xi, yi)``. One may
+    additionally specify a number of derivatives at each point `xi`;
+    this is done by repeating the value `xi` and specifying the
+    derivatives as successive `yi` values.
+
+    Allows evaluation of the polynomial and all its derivatives.
+    For reasons of numerical stability, this function does not compute
+    the coefficients of the polynomial, although they can be obtained
+    by evaluating all the derivatives.
+
+    Parameters
+    ----------
+    xi : array_like, shape (npoints, )
+        Known x-coordinates. Must be sorted in increasing order.
+    yi : array_like, shape (..., npoints, ...)
+        Known y-coordinates. When an xi occurs two or more times in
+        a row, the corresponding yi's represent derivative values. The length of `yi`
+        along the interpolation axis must be equal to the length of `xi`. Use the
+        `axis` parameter to select the correct axis.
+    axis : int, optional
+        Axis in the `yi` array corresponding to the x-coordinate values. Defaults to
+        ``axis=0``.
+
+    Notes
+    -----
+    Be aware that the algorithms implemented here are not necessarily
+    the most numerically stable known. Moreover, even in a world of
+    exact computation, unless the x coordinates are chosen very
+    carefully - Chebyshev zeros (e.g., cos(i*pi/n)) are a good choice -
+    polynomial interpolation itself is a very ill-conditioned process
+    due to the Runge phenomenon. In general, even with well-chosen
+    x values, degrees higher than about thirty cause problems with
+    numerical instability in this code.
+
+    Based on [1]_.
+
+    References
+    ----------
+    .. [1] Krogh, "Efficient Algorithms for Polynomial Interpolation
+        and Numerical Differentiation", 1970.
+
+    Examples
+    --------
+    To produce a polynomial that is zero at 0 and 1 and has
+    derivative 2 at 0, call
+
+    >>> from scipy.interpolate import KroghInterpolator
+    >>> KroghInterpolator([0,0,1],[0,2,0])
+
+    This constructs the quadratic :math:`2x^2-2x`. The derivative condition
+    is indicated by the repeated zero in the `xi` array; the corresponding
+    yi values are 0, the function value, and 2, the derivative value.
+
+    For another example, given `xi`, `yi`, and a derivative `ypi` for each
+    point, appropriate arrays can be constructed as:
+
+    >>> import numpy as np
+    >>> rng = np.random.default_rng()
+    >>> xi = np.linspace(0, 1, 5)
+    >>> yi, ypi = rng.random((2, 5))
+    >>> xi_k, yi_k = np.repeat(xi, 2), np.ravel(np.dstack((yi,ypi)))
+    >>> KroghInterpolator(xi_k, yi_k)
+
+    To produce a vector-valued polynomial, supply a higher-dimensional
+    array for `yi`:
+
+    >>> KroghInterpolator([0,1],[[2,3],[4,5]])
+
+    This constructs a linear polynomial giving (2,3) at 0 and (4,5) at 1.
+
+    """
+
+    def __init__(self, xi, yi, axis=0):
+        super().__init__(xi, yi, axis)
+
+        self.xi = np.asarray(xi)
+        self.yi = self._reshape_yi(yi)
+        self.n, self.r = self.yi.shape
+
+        if (deg := self.xi.size) > 30:
+            warnings.warn(f"{deg} degrees provided, degrees higher than about"
+                          " thirty cause problems with numerical instability "
+                          "with 'KroghInterpolator'", stacklevel=2)
+
+        c = np.zeros((self.n+1, self.r), dtype=self.dtype)
+        c[0] = self.yi[0]
+        Vk = np.zeros((self.n, self.r), dtype=self.dtype)
+        for k in range(1, self.n):
+            s = 0
+            while s <= k and xi[k-s] == xi[k]:
+                s += 1
+            s -= 1
+            Vk[0] = self.yi[k]/float_factorial(s)
+            for i in range(k-s):
+                if xi[i] == xi[k]:
+                    raise ValueError("Elements of `xi` can't be equal.")
+                if s == 0:
+                    Vk[i+1] = (c[i]-Vk[i])/(xi[i]-xi[k])
+                else:
+                    Vk[i+1] = (Vk[i+1]-Vk[i])/(xi[i]-xi[k])
+            c[k] = Vk[k-s]
+        self.c = c
+
+    def _evaluate(self, x):
+        pi = 1
+        p = np.zeros((len(x), self.r), dtype=self.dtype)
+        p += self.c[0,np.newaxis,:]
+        for k in range(1, self.n):
+            w = x - self.xi[k-1]
+            pi = w*pi
+            p += pi[:,np.newaxis] * self.c[k]
+        return p
+
+    def _evaluate_derivatives(self, x, der=None):
+        n = self.n
+        r = self.r
+
+        if der is None:
+            der = self.n
+
+        pi = np.zeros((n, len(x)))
+        w = np.zeros((n, len(x)))
+        pi[0] = 1
+        p = np.zeros((len(x), self.r), dtype=self.dtype)
+        p += self.c[0, np.newaxis, :]
+
+        for k in range(1, n):
+            w[k-1] = x - self.xi[k-1]
+            pi[k] = w[k-1] * pi[k-1]
+            p += pi[k, :, np.newaxis] * self.c[k]
+
+        cn = np.zeros((max(der, n+1), len(x), r), dtype=self.dtype)
+        cn[:n+1, :, :] += self.c[:n+1, np.newaxis, :]
+        cn[0] = p
+        for k in range(1, n):
+            for i in range(1, n-k+1):
+                pi[i] = w[k+i-1]*pi[i-1] + pi[i]
+                cn[k] = cn[k] + pi[i, :, np.newaxis]*cn[k+i]
+            cn[k] *= float_factorial(k)
+
+        cn[n, :, :] = 0
+        return cn[:der]
+
+
+def krogh_interpolate(xi, yi, x, der=0, axis=0):
+    """Convenience function for Krogh interpolation.
+
+    See `KroghInterpolator` for more details.
+
+    Parameters
+    ----------
+    xi : array_like
+        Interpolation points (known x-coordinates).
+    yi : array_like
+        Known y-coordinates, of shape ``(xi.size, R)``. Interpreted as
+        vectors of length R, or scalars if R=1.
+    x : array_like
+        Point or points at which to evaluate the derivatives.
+    der : int or list or None, optional
+        How many derivatives to evaluate, or None for all potentially
+        nonzero derivatives (that is, a number equal to the number
+        of points), or a list of derivatives to evaluate. This number
+        includes the function value as the '0th' derivative.
+    axis : int, optional
+        Axis in the `yi` array corresponding to the x-coordinate values.
+
+    Returns
+    -------
+    d : ndarray
+        If the interpolator's values are R-D then the
+        returned array will be the number of derivatives by N by R.
+        If `x` is a scalar, the middle dimension will be dropped; if
+        the `yi` are scalars then the last dimension will be dropped.
+
+    See Also
+    --------
+    KroghInterpolator : Krogh interpolator
+
+    Notes
+    -----
+    Construction of the interpolating polynomial is a relatively expensive
+    process. If you want to evaluate it repeatedly consider using the class
+    KroghInterpolator (which is what this function uses).
+
+    Examples
+    --------
+    We can interpolate 2D observed data using Krogh interpolation:
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.interpolate import krogh_interpolate
+    >>> x_observed = np.linspace(0.0, 10.0, 11)
+    >>> y_observed = np.sin(x_observed)
+    >>> x = np.linspace(min(x_observed), max(x_observed), num=100)
+    >>> y = krogh_interpolate(x_observed, y_observed, x)
+    >>> plt.plot(x_observed, y_observed, "o", label="observation")
+    >>> plt.plot(x, y, label="krogh interpolation")
+    >>> plt.legend()
+    >>> plt.show()
+    """
+
+    P = KroghInterpolator(xi, yi, axis=axis)
+    if der == 0:
+        return P(x)
+    elif _isscalar(der):
+        return P.derivative(x, der=der)
+    else:
+        return P.derivatives(x, der=np.amax(der)+1)[der]
+
+
+def approximate_taylor_polynomial(f,x,degree,scale,order=None):
+    """
+    Estimate the Taylor polynomial of f at x by polynomial fitting.
+
+    Parameters
+    ----------
+    f : callable
+        The function whose Taylor polynomial is sought. Should accept
+        a vector of `x` values.
+    x : scalar
+        The point at which the polynomial is to be evaluated.
+    degree : int
+        The degree of the Taylor polynomial
+    scale : scalar
+        The width of the interval to use to evaluate the Taylor polynomial.
+        Function values spread over a range this wide are used to fit the
+        polynomial. Must be chosen carefully.
+    order : int or None, optional
+        The order of the polynomial to be used in the fitting; `f` will be
+        evaluated ``order+1`` times. If None, use `degree`.
+
+    Returns
+    -------
+    p : poly1d instance
+        The Taylor polynomial (translated to the origin, so that
+        for example p(0)=f(x)).
+
+    Notes
+    -----
+    The appropriate choice of "scale" is a trade-off; too large and the
+    function differs from its Taylor polynomial too much to get a good
+    answer, too small and round-off errors overwhelm the higher-order terms.
+    The algorithm used becomes numerically unstable around order 30 even
+    under ideal circumstances.
+
+    Choosing order somewhat larger than degree may improve the higher-order
+    terms.
+
+    Examples
+    --------
+    We can calculate Taylor approximation polynomials of sin function with
+    various degrees:
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.interpolate import approximate_taylor_polynomial
+    >>> x = np.linspace(-10.0, 10.0, num=100)
+    >>> plt.plot(x, np.sin(x), label="sin curve")
+    >>> for degree in np.arange(1, 15, step=2):
+    ...     sin_taylor = approximate_taylor_polynomial(np.sin, 0, degree, 1,
+    ...                                                order=degree + 2)
+    ...     plt.plot(x, sin_taylor(x), label=f"degree={degree}")
+    >>> plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left',
+    ...            borderaxespad=0.0, shadow=True)
+    >>> plt.tight_layout()
+    >>> plt.axis([-10, 10, -10, 10])
+    >>> plt.show()
+
+    """
+    if order is None:
+        order = degree
+
+    n = order+1
+    # Choose n points that cluster near the endpoints of the interval in
+    # a way that avoids the Runge phenomenon. Include the endpoint only for
+    # odd n (parity test ``n % 2``; ``n % 1`` is always 0) so that one node
+    # always has angle pi/2, i.e. falls at x (up to rounding).
+    xs = scale*np.cos(np.linspace(0,np.pi,n,endpoint=bool(n % 2))) + x
+
+    P = KroghInterpolator(xs, f(xs))
+    d = P.derivatives(x,der=degree+1)
+
+    return np.poly1d((d/factorial(np.arange(degree+1)))[::-1])
+
+
+class BarycentricInterpolator(_Interpolator1DWithDerivatives):
+    r"""Barycentric (Lagrange with improved stability) interpolator (C∞ smooth).
+
+    Constructs a polynomial that passes through a given set of points.
+    Allows evaluation of the polynomial and all its derivatives,
+    efficient changing of the y-values to be interpolated,
+    and updating by adding more x- and y-values. For numerical stability, a barycentric
+    representation is used rather than computing the coefficients of the polynomial
+    directly.
+
+
+    Parameters
+    ----------
+    xi : array_like, shape (npoints, )
+        1-D array of x-coordinates of the points the polynomial
+        should pass through
+    yi : array_like, shape (..., npoints, ...), optional
+        N-D array of y-coordinates of the points the polynomial should pass through.
+        If None, the y values will be supplied later via the `set_y` method.
+        The length of `yi` along the interpolation axis must be equal to the length
+        of `xi`. Use the ``axis`` parameter to select correct axis.
+    axis : int, optional
+        Axis in the yi array corresponding to the x-coordinate values. Defaults
+        to ``axis=0``.
+    wi : array_like, optional
+        The barycentric weights for the chosen interpolation points `xi`.
+        If absent or None, the weights will be computed from `xi` (default).
+        This allows for the reuse of the weights `wi` if several interpolants
+        are being calculated using the same nodes `xi`, without re-computation. This
+        also allows for computing the weights explicitly for some choices of
+        `xi` (see notes).
+    rng : {None, int, `numpy.random.Generator`}, optional
+        If `rng` is passed by keyword, types other than `numpy.random.Generator` are
+        passed to `numpy.random.default_rng` to instantiate a ``Generator``.
+        If `rng` is already a ``Generator`` instance, then the provided instance is
+        used. Specify `rng` for repeatable interpolation.
+
+        If this argument `random_state` is passed by keyword,
+        legacy behavior for the argument `random_state` applies:
+
+        - If `random_state` is None (or `numpy.random`), the `numpy.random.RandomState`
+          singleton is used.
+        - If `random_state` is an int, a new ``RandomState`` instance is used,
+          seeded with `random_state`.
+        - If `random_state` is already a ``Generator`` or ``RandomState`` instance then
+          that instance is used.
+
+        .. versionchanged:: 1.15.0
+            As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
+            transition from use of `numpy.random.RandomState` to
+            `numpy.random.Generator` this keyword was changed from `random_state` to `rng`.
+            For an interim period, both keywords will continue to work (only specify
+            one of them). After the interim period using the `random_state` keyword will emit
+            warnings. The behavior of the `random_state` and `rng` keywords is outlined above.
+
+    Notes
+    -----
+    This method is a variant of Lagrange polynomial interpolation [1]_ based on [2]_.
+    Instead of using Lagrange's or Newton's formula, the polynomial is represented by
+    the barycentric formula
+
+    .. math::
+
+        p(x) =
+        \frac{\sum_{i=1}^m\ w_i y_i / (x - x_i)}{\sum_{i=1}^m w_i / (x - x_i)},
+
+    where :math:`w_i` are the barycentric weights computed with the general formula
+
+    .. math::
+
+        w_i = \left( \prod_{k \neq i} x_i - x_k \right)^{-1}.
+
+    This is the same barycentric form used by `AAA` and `FloaterHormannInterpolator`.
+    However, in contrast, the weights :math:`w_i` are defined such that
+    :math:`p(x)` is a polynomial rather than a rational function.
+
+    The barycentric representation avoids many of the problems associated with
+    polynomial interpolation caused by floating-point arithmetic. However, it does not
+    avoid issues that are intrinsic to polynomial interpolation. Namely, if the
+    x-coordinates are equally spaced, then the weights can be computed explicitly using
+    the formula from [2]_
+
+    .. math::
+
+        w_i = (-1)^i {n \choose i},
+
+    where :math:`n` is the number of x-coordinates. As noted in [2]_, this means that
+    for large :math:`n` the weights vary by exponentially large factors, leading to the
+    Runge phenomenon.
+
+    To avoid this ill-conditioning, the x-coordinates should be clustered at the
+    endpoints of the interval. An excellent choice of points on the interval
+    :math:`[a,b]` are Chebyshev points of the second kind
+
+    .. math::
+
+        x_i = \frac{a + b}{2} + \frac{a - b}{2}\cos(i\pi/n).
+
+    in which case the weights can be computed explicitly as
+
+    .. math::
+
+        w_i = \begin{cases}
+                  (-1)^i/2 & i = 0,n \\
+                  (-1)^i   & \text{otherwise}
+              \end{cases}.
+
+    See [2]_ for more information. Note that for large :math:`n`, computing the weights
+    explicitly (see examples) will be faster than the generic formula.
+
+    References
+    ----------
+    .. [1] https://en.wikipedia.org/wiki/Lagrange_polynomial
+    .. [2] Jean-Paul Berrut and Lloyd N. Trefethen, "Barycentric Lagrange
+           Interpolation", SIAM Review 2004 46:3, 501-517
+           :doi:`10.1137/S0036144502417715`
+
+    Examples
+    --------
+    To produce a quintic barycentric interpolant approximating the function
+    :math:`\sin x`, and its first four derivatives, using six randomly-spaced
+    nodes in :math:`(0, \pi/2)`:
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.interpolate import BarycentricInterpolator
+    >>> rng = np.random.default_rng()
+    >>> xi = rng.random(6) * np.pi/2
+    >>> f, f_d1, f_d2, f_d3, f_d4 = np.sin, np.cos, lambda x: -np.sin(x), lambda x: -np.cos(x), np.sin
+    >>> P = BarycentricInterpolator(xi, f(xi), random_state=rng)
+    >>> fig, axs = plt.subplots(5, 1, sharex=True, layout='constrained', figsize=(7,10))
+    >>> x = np.linspace(0, np.pi, 100)
+    >>> axs[0].plot(x, P(x), 'r:', x, f(x), 'k--', xi, f(xi), 'xk')
+    >>> axs[1].plot(x, P.derivative(x), 'r:', x, f_d1(x), 'k--', xi, f_d1(xi), 'xk')
+    >>> axs[2].plot(x, P.derivative(x, 2), 'r:', x, f_d2(x), 'k--', xi, f_d2(xi), 'xk')
+    >>> axs[3].plot(x, P.derivative(x, 3), 'r:', x, f_d3(x), 'k--', xi, f_d3(xi), 'xk')
+    >>> axs[4].plot(x, P.derivative(x, 4), 'r:', x, f_d4(x), 'k--', xi, f_d4(xi), 'xk')
+    >>> axs[0].set_xlim(0, np.pi)
+    >>> axs[4].set_xlabel(r"$x$")
+    >>> axs[4].set_xticks([i * np.pi / 4 for i in range(5)],
+    ...                   ["0", r"$\frac{\pi}{4}$", r"$\frac{\pi}{2}$", r"$\frac{3\pi}{4}$", r"$\pi$"])
+    >>> for ax, label in zip(axs, ("$f(x)$", "$f'(x)$", "$f''(x)$", "$f^{(3)}(x)$", "$f^{(4)}(x)$")):
+    ...     ax.set_ylabel(label)
+    >>> labels = ['Interpolation nodes', 'True function $f$', 'Barycentric interpolation']
+    >>> axs[0].legend(axs[0].get_lines()[::-1], labels, bbox_to_anchor=(0., 1.02, 1., .102),
+    ...               loc='lower left', ncols=3, mode="expand", borderaxespad=0., frameon=False)
+    >>> plt.show()
+
+    Next, we show how using Chebyshev points of the second kind avoids the
+    Runge phenomenon. In this example, we also compute the weights explicitly.
+
+    >>> n = 20
+    >>> def f(x): return np.abs(x) + 0.5*x - x**2
+    >>> i = np.arange(n)
+    >>> x_cheb = np.cos(i*np.pi/(n - 1))  # Chebyshev points on [-1, 1]
+    >>> w_i_cheb = (-1.)**i  # Explicit formula for weights of Chebyshev points
+    >>> w_i_cheb[[0, -1]] /= 2
+    >>> p_cheb = BarycentricInterpolator(x_cheb, f(x_cheb), wi=w_i_cheb)
+    >>> x_equi = np.linspace(-1, 1, n)
+    >>> p_equi = BarycentricInterpolator(x_equi, f(x_equi))
+    >>> xx = np.linspace(-1, 1, 1000)
+    >>> fig, ax = plt.subplots()
+    >>> ax.plot(xx, f(xx), label="Original Function")
+    >>> ax.plot(xx, p_cheb(xx), "--", label="Chebyshev Points")
+    >>> ax.plot(xx, p_equi(xx), "--", label="Equally Spaced Points")
+    >>> ax.set(xlabel="$x$", ylabel="$f(x)$", xlim=[-1, 1])
+    >>> ax.legend()
+    >>> plt.show()
+
+    """ # numpy/numpydoc#87  # noqa: E501
+
+    @_transition_to_rng("random_state", replace_doc=False)
+    def __init__(self, xi, yi=None, axis=0, *, wi=None, rng=None):
+        super().__init__(xi, yi, axis)
+
+        rng = check_random_state(rng)
+
+        self.xi = np.asarray(xi, dtype=np.float64)
+        self.set_yi(yi)
+        self.n = len(self.xi)
+
+        # cache derivative object to avoid re-computing the weights with every call.
+        self._diff_cij = None
+
+        if wi is not None:
+            self.wi = wi
+        else:
+            # See page 510 of Berrut and Trefethen 2004 for an explanation of the
+            # capacity scaling and the suggestion of using a random permutation of
+            # the input factors.
+            # At the moment, the permutation is not performed for xi that are
+            # appended later through the add_xi interface. It's not clear to me how
+            # to implement that and it seems that most situations that require
+            # these numerical stability improvements will be able to provide all
+            # the points to the constructor.
+            self._inv_capacity = 4.0 / (np.max(self.xi) - np.min(self.xi))
+            permute = rng.permutation(self.n, )
+            inv_permute = np.zeros(self.n, dtype=np.int32)
+            inv_permute[permute] = np.arange(self.n)
+            self.wi = np.zeros(self.n)
+
+            for i in range(self.n):
+                dist = self._inv_capacity * (self.xi[i] - self.xi[permute])
+                dist[inv_permute[i]] = 1.0
+                prod = np.prod(dist)
+                if prod == 0.0:
+                    raise ValueError("Interpolation points xi must be"
+                                     " distinct.")
+                self.wi[i] = 1.0 / prod
+
+    def set_yi(self, yi, axis=None):
+        """
+        Update the y values to be interpolated
+
+        The barycentric interpolation algorithm requires the calculation
+        of weights, but these depend only on the `xi`. The `yi` can be changed
+        at any time.
+
+        Parameters
+        ----------
+        yi : array_like
+            The y-coordinates of the points the polynomial will pass through.
+            If None, the y values must be supplied later.
+        axis : int, optional
+            Axis in the `yi` array corresponding to the x-coordinate values.
+
+        """
+        if yi is None:
+            self.yi = None
+            return
+        self._set_yi(yi, xi=self.xi, axis=axis)
+        self.yi = self._reshape_yi(yi)
+        self.n, self.r = self.yi.shape
+        self._diff_baryint = None
+
+    def add_xi(self, xi, yi=None):
+        """
+        Add more x values to the set to be interpolated
+
+        The barycentric interpolation algorithm allows easy updating by
+        adding more points for the polynomial to pass through.
+
+        Parameters
+        ----------
+        xi : array_like
+            The x coordinates of the points that the polynomial should pass
+            through.
+        yi : array_like, optional
+            The y coordinates of the points the polynomial should pass through.
+            Should have shape ``(xi.size, R)``; if R > 1 then the polynomial is
+            vector-valued.
+            If `yi` is not given, the y values will be supplied later. `yi`
+            should be given if and only if the interpolator has y values
+            specified.
+
+        Notes
+        -----
+        The new points added by `add_xi` are not randomly permuted
+        so there is potential for numerical instability,
+        especially for a large number of points. If this
+        happens, please reconstruct interpolation from scratch instead.
+        """
+        if yi is not None:
+            if self.yi is None:
+                raise ValueError("No previous yi value to update!")
+            yi = self._reshape_yi(yi, check=True)
+            self.yi = np.vstack((self.yi,yi))
+        else:
+            if self.yi is not None:
+                raise ValueError("No update to yi provided!")
+        old_n = self.n
+        self.xi = np.concatenate((self.xi,xi))
+        self.n = len(self.xi)
+        self.wi **= -1
+        old_wi = self.wi
+        self.wi = np.zeros(self.n)
+        self.wi[:old_n] = old_wi
+        for j in range(old_n, self.n):
+            self.wi[:j] *= self._inv_capacity * (self.xi[j]-self.xi[:j])
+            self.wi[j] = np.multiply.reduce(
+                self._inv_capacity * (self.xi[:j]-self.xi[j])
+            )
+        self.wi **= -1
+        self._diff_cij = None
+        self._diff_baryint = None
+
+    def __call__(self, x):
+        """Evaluate the interpolating polynomial at the points x
+
+        Parameters
+        ----------
+        x : array_like
+            Point or points at which to evaluate the interpolant.
+
+        Returns
+        -------
+        y : array_like
+            Interpolated values. Shape is determined by replacing
+            the interpolation axis in the original array with the shape of `x`.
+
+        Notes
+        -----
+        The evaluation routine `_evaluate` forms the difference between every
+        point of `x` and every interpolation node, that is, an intermediate
+        array of shape ``(len(x), N)``, where N is the number of nodes.
+        """
+        return _Interpolator1D.__call__(self, x)
+
+    def _evaluate(self, x):
+        if x.size == 0:
+            p = np.zeros((0, self.r), dtype=self.dtype)
+        else:
+            c = x[..., np.newaxis] - self.xi
+            z = c == 0
+            c[z] = 1
+            c = self.wi / c
+            with np.errstate(divide='ignore'):
+                p = np.dot(c, self.yi) / np.sum(c, axis=-1)[..., np.newaxis]
+            # Now fix where x==some xi
+            r = np.nonzero(z)
+            if len(r) == 1:  # evaluation at a scalar
+                if len(r[0]) > 0:  # equals one of the points
+                    p = self.yi[r[0][0]]
+            else:
+                p[r[:-1]] = self.yi[r[-1]]
+        return p
+
+    def derivative(self, x, der=1):
+        """
+        Evaluate a single derivative of the polynomial at the point x.
+
+        Parameters
+        ----------
+        x : array_like
+            Point or points at which to evaluate the derivatives
+        der : integer, optional
+            Which derivative to evaluate (default: first derivative).
+            This number includes the function value as 0th derivative.
+
+        Returns
+        -------
+        d : ndarray
+            Derivative interpolated at the x-points. Shape of `d` is
+            determined by replacing the interpolation axis in the
+            original array with the shape of `x`.
+        """
+        x, x_shape = self._prepare_x(x)
+        y = self._evaluate_derivatives(x, der+1, all_lower=False)
+        return self._finish_y(y, x_shape)
+
+    def _evaluate_derivatives(self, x, der=None, all_lower=True):
+        # NB: der here is not the order of the highest derivative;
+        # instead, it is the size of the derivatives matrix that
+        # would be returned with all_lower=True, including the
+        # '0th' derivative (the undifferentiated function).
+        # E.g. to evaluate the 5th derivative alone, call
+        # _evaluate_derivatives(x, der=6, all_lower=False).
+        # With all_lower=False, der is compared before its None default is set.
+        if (not all_lower) and (x.size == 0 or self.r == 0):
+            return np.zeros((0, self.r), dtype=self.dtype)
+        # der == 1 requests the undifferentiated function itself
+        if (not all_lower) and der == 1:
+            return self._evaluate(x)
+        # n points give a degree < n, so derivatives of order >= n are zero
+        if (not all_lower) and (der > self.n):
+            return np.zeros((len(x), self.r), dtype=self.dtype)
+
+        if der is None:
+            der = self.n
+
+        if all_lower and (x.size == 0 or self.r == 0):
+            return np.zeros((der, len(x), self.r), dtype=self.dtype)
+
+        if self._diff_cij is None:
+            # c[i,j] = xi[i] - xi[j]
+            c = self.xi[:, np.newaxis] - self.xi
+
+            # avoid division by 0 (diagonal entries are so far zero by construction)
+            np.fill_diagonal(c, 1)
+
+            # c[i,j] = (w[j] / w[i]) / (xi[i] - xi[j]) (equation 9.4)
+            c = self.wi/ (c * self.wi[..., np.newaxis])
+
+            # clear the placeholder diagonal so the row sums skip it
+            np.fill_diagonal(c, 0)
+
+            # calculate diagonal so that each row sums to 0
+            # c[i,i] = -sum_{j != i} c[i,j] (equation 9.5)
+            d = -c.sum(axis=1)
+            # NOTE(review): c[i,j] is presumably l_j'(x_i), not l_j(x_i) - confirm
+            np.fill_diagonal(c, d)
+
+            self._diff_cij = c
+
+        if self._diff_baryint is None:
+            # initialise and cache derivative interpolator and cijs;
+            # reuse weights wi (which depend only on interpolation points xi),
+            # to avoid unnecessary re-computation
+            self._diff_baryint = BarycentricInterpolator(xi=self.xi,
+                                                         yi=self._diff_cij @ self.yi,
+                                                         wi=self.wi)
+            self._diff_baryint._diff_cij = self._diff_cij
+
+        if all_lower:
+            # assemble matrix of derivatives from order 0 to order der-1,
+            # in the format required by _Interpolator1DWithDerivatives.
+            cn = np.zeros((der, len(x), self.r), dtype=self.dtype)
+            for d in range(der):
+                cn[d, :, :] = self._evaluate_derivatives(x, d+1, all_lower=False)
+            return cn
+
+        # recursively evaluate only the derivative requested
+        return self._diff_baryint._evaluate_derivatives(x, der-1, all_lower=False)
+
+
+def barycentric_interpolate(xi, yi, x, axis=0, *, der=0, rng=None):
+    """Convenience function for barycentric interpolation.
+
+    Constructs a polynomial that passes through a given set of points,
+    then evaluates the polynomial. For reasons of numerical stability,
+    this function does not compute the coefficients of the polynomial.
+
+    This function uses a "barycentric interpolation" method that treats
+    the problem as a special case of rational function interpolation.
+    This algorithm is quite stable, numerically, but even in a world of
+    exact computation, unless the `x` coordinates are chosen very
+    carefully - Chebyshev zeros (e.g., cos(i*pi/n)) are a good choice -
+    polynomial interpolation itself is a very ill-conditioned process
+    due to the Runge phenomenon.
+
+    Parameters
+    ----------
+    xi : array_like
+        1-D array of x coordinates of the points the polynomial should
+        pass through
+    yi : array_like
+        The y coordinates of the points the polynomial should pass through.
+    x : scalar or array_like
+        Point or points at which to evaluate the interpolant.
+    axis : int, optional
+        Axis in the `yi` array corresponding to the x-coordinate values.
+    der : int or list or None, optional
+        How many derivatives to evaluate, or None for all potentially
+        nonzero derivatives (that is, a number equal to the number
+        of points), or a list of derivatives to evaluate. This number
+        includes the function value as the '0th' derivative.
+    rng : `numpy.random.Generator`, optional
+        Pseudorandom number generator state. When `rng` is None, a new
+        `numpy.random.Generator` is created using entropy from the
+        operating system. Types other than `numpy.random.Generator` are
+        passed to `numpy.random.default_rng` to instantiate a ``Generator``.
+
+    Returns
+    -------
+    y : scalar or array_like
+        Interpolated values. Shape is determined by replacing
+        the interpolation axis in the original array with the shape of `x`.
+
+    See Also
+    --------
+    BarycentricInterpolator : Barycentric interpolator
+
+    Notes
+    -----
+    Construction of the interpolation weights is a relatively slow process.
+    If you want to call this many times with the same xi (but possibly
+    varying yi or x) you should use the class `BarycentricInterpolator`.
+    This is what this function uses internally.
+
+    Examples
+    --------
+    We can interpolate 2D observed data using barycentric interpolation:
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.interpolate import barycentric_interpolate
+    >>> x_observed = np.linspace(0.0, 10.0, 11)
+    >>> y_observed = np.sin(x_observed)
+    >>> x = np.linspace(min(x_observed), max(x_observed), num=100)
+    >>> y = barycentric_interpolate(x_observed, y_observed, x)
+    >>> plt.plot(x_observed, y_observed, "o", label="observation")
+    >>> plt.plot(x, y, label="barycentric interpolation")
+    >>> plt.legend()
+    >>> plt.show()
+
+    """
+    P = BarycentricInterpolator(xi, yi, axis=axis, rng=rng)
+    if der == 0:
+        return P(x)
+    elif _isscalar(der):
+        return P.derivative(x, der=der)
+    else:
+        return P.derivatives(x, der=np.amax(der)+1)[der]
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_rbf.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_rbf.py
new file mode 100644
index 0000000000000000000000000000000000000000..abc071b64d7faec5714396a8053cf5341d13e4e2
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_rbf.py
@@ -0,0 +1,292 @@
+"""rbf - Radial basis functions for interpolation/smoothing scattered N-D data.
+
+Written by John Travers <jtravs@gmail.com>, February 2007
+Based closely on Matlab code by Alex Chirokov
+Additional, large, improvements by Robert Hetland
+Some additional alterations by Travis Oliphant
+Interpolation with multi-dimensional target domain by Josua Sassen
+
+Permission to use, modify, and distribute this software is given under the
+terms of the SciPy (BSD style) license. See LICENSE.txt that came with
+this distribution for specifics.
+
+NO WARRANTY IS EXPRESSED OR IMPLIED. USE AT YOUR OWN RISK.
+
+Copyright (c) 2006-2007, Robert Hetland <hetland@tamu.edu>
+Copyright (c) 2007, John Travers <jtravs@gmail.com>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+       notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+       copyright notice, this list of conditions and the following
+       disclaimer in the documentation and/or other materials provided
+       with the distribution.
+
+    * Neither the name of Robert Hetland nor the names of any
+       contributors may be used to endorse or promote products derived
+       from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+import numpy as np
+
+from scipy import linalg
+from scipy.special import xlogy
+from scipy.spatial.distance import cdist, pdist, squareform
+from scipy._lib._array_api import xp_capabilities
+
+__all__ = ['Rbf']
+
+
+@xp_capabilities(out_of_scope=True)
+class Rbf:
+    """
+    Rbf(*args, **kwargs)
+
+    Class for radial basis function interpolation of functions from
+    N-D scattered data to an M-D domain (legacy).
+
+    .. legacy:: class
+
+        `Rbf` is legacy code, for new usage please use `RBFInterpolator`
+        instead.
+
+    Parameters
+    ----------
+    *args : arrays
+        x, y, z, ..., d, where x, y, z, ... are the coordinates of the nodes
+        and d is the array of values at the nodes
+    function : str or callable, optional
+        The radial basis function, based on the radius, r, given by the norm
+        (default is Euclidean distance); the default is 'multiquadric'::
+
+            'multiquadric': sqrt((r/self.epsilon)**2 + 1)
+            'inverse': 1.0/sqrt((r/self.epsilon)**2 + 1)
+            'gaussian': exp(-(r/self.epsilon)**2)
+            'linear': r
+            'cubic': r**3
+            'quintic': r**5
+            'thin_plate': r**2 * log(r)
+
+        If callable, then it must take 2 arguments (self, r). The epsilon
+        parameter will be available as self.epsilon. Other keyword
+        arguments passed in will be available as well.
+
+    epsilon : float, optional
+        Adjustable constant for gaussian or multiquadrics functions
+        - defaults to approximate average distance between nodes (which is
+        a good start).
+    smooth : float, optional
+        Values greater than zero increase the smoothness of the
+        approximation. 0 is for interpolation (default), the function will
+        always go through the nodal points in this case.
+    norm : str, callable, optional
+        A function that returns the 'distance' between two points, with
+        inputs as arrays of positions (x, y, z, ...), and an output as an
+        array of distance. E.g., the default: 'euclidean', such that the result
+        is a matrix of the distances from each point in ``x1`` to each point in
+        ``x2``. For more options, see documentation of
+        `scipy.spatial.distances.cdist`.
+    mode : str, optional
+        Mode of the interpolation, can be '1-D' (default) or 'N-D'. When it is
+        '1-D' the data `d` will be considered as 1-D and flattened
+        internally. When it is 'N-D' the data `d` is assumed to be an array of
+        shape (n_samples, m), where m is the dimension of the target domain.
+
+
+    Attributes
+    ----------
+    N : int
+        The number of data points (as determined by the input arrays).
+    di : ndarray
+        The 1-D array of data values at each of the data coordinates `xi`.
+    xi : ndarray
+        The 2-D array of data coordinates.
+    function : str or callable
+        The radial basis function. See description under Parameters.
+    epsilon : float
+        Parameter used by gaussian or multiquadrics functions. See Parameters.
+    smooth : float
+        Smoothing parameter. See description under Parameters.
+    norm : str or callable
+        The distance function. See description under Parameters.
+    mode : str
+        Mode of the interpolation. See description under Parameters.
+    nodes : ndarray
+        A 1-D array of node values for the interpolation.
+    A : internal property, do not use
+
+    See Also
+    --------
+    RBFInterpolator
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.interpolate import Rbf
+    >>> rng = np.random.default_rng()
+    >>> x, y, z, d = rng.random((4, 50))
+    >>> rbfi = Rbf(x, y, z, d)  # radial basis function interpolator instance
+    >>> xi = yi = zi = np.linspace(0, 1, 20)
+    >>> di = rbfi(xi, yi, zi)   # interpolated values
+    >>> di.shape
+    (20,)
+
+    """
+    # Available radial basis functions that can be selected as strings;
+    # they all start with _h_ (self._init_function relies on that)
+    def _h_multiquadric(self, r):
+        return np.sqrt((1.0/self.epsilon*r)**2 + 1)
+
+    def _h_inverse_multiquadric(self, r):
+        return 1.0/np.sqrt((1.0/self.epsilon*r)**2 + 1)
+
+    def _h_gaussian(self, r):
+        return np.exp(-(1.0/self.epsilon*r)**2)
+
+    def _h_linear(self, r):
+        return r
+
+    def _h_cubic(self, r):
+        return r**3
+
+    def _h_quintic(self, r):
+        return r**5
+
+    def _h_thin_plate(self, r):
+        return xlogy(r**2, r)
+
+    # Setup self._function and do smoke test on initial r
+    def _init_function(self, r):
+        if isinstance(self.function, str):
+            self.function = self.function.lower()
+            _mapped = {'inverse': 'inverse_multiquadric',
+                       'inverse multiquadric': 'inverse_multiquadric',
+                       'thin-plate': 'thin_plate'}
+            if self.function in _mapped:
+                self.function = _mapped[self.function]
+
+            func_name = "_h_" + self.function
+            if hasattr(self, func_name):
+                self._function = getattr(self, func_name)
+            else:
+                functionlist = [x[3:] for x in dir(self)
+                                if x.startswith('_h_')]
+                raise ValueError("function must be a callable or one of " +
+                                 ", ".join(functionlist))
+            self._function = getattr(self, "_h_"+self.function)
+        elif callable(self.function):
+            allow_one = False
+            if hasattr(self.function, 'func_code') or \
+               hasattr(self.function, '__code__'):
+                val = self.function
+                allow_one = True
+            elif hasattr(self.function, "__call__"):
+                val = self.function.__call__.__func__
+            else:
+                raise ValueError("Cannot determine number of arguments to "
+                                 "function")
+
+            argcount = val.__code__.co_argcount
+            if allow_one and argcount == 1:
+                self._function = self.function
+            elif argcount == 2:
+                self._function = self.function.__get__(self, Rbf)
+            else:
+                raise ValueError("Function argument must take 1 or 2 "
+                                 "arguments.")
+
+        a0 = self._function(r)
+        if a0.shape != r.shape:
+            raise ValueError("Callable must take array and return array of "
+                             "the same shape")
+        return a0
+
+    def __init__(self, *args, **kwargs):
+        # `args` can be a variable number of arrays; we flatten them and store
+        # them as a single 2-D array `xi` of shape (n_args-1, array_size),
+        # plus a 1-D array `di` for the values.
+        # All arrays must have the same number of elements
+        self.xi = np.asarray([np.asarray(a, dtype=np.float64).flatten()
+                              for a in args[:-1]])
+        self.N = self.xi.shape[-1]
+
+        self.mode = kwargs.pop('mode', '1-D')
+
+        if self.mode == '1-D':
+            self.di = np.asarray(args[-1]).flatten()
+            self._target_dim = 1
+        elif self.mode == 'N-D':
+            self.di = np.asarray(args[-1])
+            self._target_dim = self.di.shape[-1]
+        else:
+            raise ValueError("Mode has to be 1-D or N-D.")
+
+        if not all([x.size == self.di.shape[0] for x in self.xi]):
+            raise ValueError("All arrays must be equal length.")
+
+        self.norm = kwargs.pop('norm', 'euclidean')
+        self.epsilon = kwargs.pop('epsilon', None)
+        if self.epsilon is None:
+            # default epsilon is the "the average distance between nodes" based
+            # on a bounding hypercube
+            ximax = np.amax(self.xi, axis=1)
+            ximin = np.amin(self.xi, axis=1)
+            edges = ximax - ximin
+            edges = edges[np.nonzero(edges)]
+            self.epsilon = np.power(np.prod(edges)/self.N, 1.0/edges.size)
+
+        self.smooth = kwargs.pop('smooth', 0.0)
+        self.function = kwargs.pop('function', 'multiquadric')
+
+        # attach anything left in kwargs to self for use by any user-callable
+        # function or to save on the object returned.
+        for item, value in kwargs.items():
+            setattr(self, item, value)
+
+        # Compute weights
+        if self._target_dim > 1:  # If we have more than one target dimension,
+            # we first factorize the matrix
+            self.nodes = np.zeros((self.N, self._target_dim), dtype=self.di.dtype)
+            lu, piv = linalg.lu_factor(self.A)
+            for i in range(self._target_dim):
+                self.nodes[:, i] = linalg.lu_solve((lu, piv), self.di[:, i])
+        else:
+            self.nodes = linalg.solve(self.A, self.di)
+
+    @property
+    def A(self):
+        # this only exists for backwards compatibility: self.A was available
+        # and, at least technically, public.
+        r = squareform(pdist(self.xi.T, self.norm))  # Pairwise norm
+        return self._init_function(r) - np.eye(self.N)*self.smooth
+
+    def _call_norm(self, x1, x2):
+        return cdist(x1.T, x2.T, self.norm)
+
+    def __call__(self, *args):
+        args = [np.asarray(x) for x in args]
+        if not all([x.shape == y.shape for x in args for y in args]):
+            raise ValueError("Array lengths must be equal")
+        if self._target_dim > 1:
+            shp = args[0].shape + (self._target_dim,)
+        else:
+            shp = args[0].shape
+        xa = np.asarray([a.flatten() for a in args], dtype=np.float64)
+        r = self._call_norm(xa, self.xi)
+        return np.dot(self._function(r), self.nodes).reshape(shp)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_rbfinterp.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_rbfinterp.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6df4f4dea3b674787d48fd736dd9dc72dc4fc76
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_rbfinterp.py
@@ -0,0 +1,540 @@
+"""Module for RBF interpolation."""
+import warnings
+from types import GenericAlias
+
+import numpy as np
+from scipy.spatial import KDTree
+
+from . import _rbfinterp_np
+from . import _rbfinterp_xp
+
+from scipy._lib._array_api import (
+    _asarray, array_namespace, xp_size, is_numpy, xp_capabilities
+)
+import scipy._lib.array_api_extra as xpx
+
+
+__all__ = ["RBFInterpolator"]
+
+
+# Names of the RBF kernels accepted by `RBFInterpolator`.
+_AVAILABLE = {
+    "linear",
+    "thin_plate_spline",
+    "cubic",
+    "quintic",
+    "multiquadric",
+    "inverse_multiquadric",
+    "inverse_quadratic",
+    "gaussian"
+    }
+
+
+# Kernels for which `epsilon` may be omitted (it then defaults to 1).
+_SCALE_INVARIANT = {"linear", "thin_plate_spline", "cubic", "quintic"}
+
+
+# For RBFs that are conditionally positive definite of order m, the interpolant
+# should include polynomial terms with degree >= m - 1. Define the minimum
+# degrees here (values from Chapter 8 of Fasshauer's "Meshfree Approximation
+# Methods with MATLAB"). RBFs missing from this dictionary are positive
+# definite, need no polynomial terms, and are looked up with a default of -1.
+_NAME_TO_MIN_DEGREE = {
+    "multiquadric": 0,
+    "linear": 0,
+    "thin_plate_spline": 1,
+    "cubic": 1,
+    "quintic": 2
+    }
+
+
+def _get_backend(xp):
+    if is_numpy(xp):
+        return _rbfinterp_np
+    return _rbfinterp_xp
+
+
+extra_note="""Only the default ``neighbors=None`` is Array API compatible.
+    If a non-default value of ``neighbors`` is given, the behavior is NumPy -only.
+
+"""
+
+@xp_capabilities(
+    skip_backends=[
+        ("dask.array", "linalg.lu is broken; array_api_extra#488"),
+        ("array_api_strict", "array-api#977, diag, view")
+    ],
+    extra_note=extra_note
+)
+class RBFInterpolator:
+    """Radial basis function interpolator in N ≥ 1 dimensions.
+
+    Parameters
+    ----------
+    y : (npoints, ndims) array_like
+        2-D array of data point coordinates.
+    d : (npoints, ...) array_like
+        N-D array of data values at `y`. The length of `d` along the first
+        axis must be equal to the length of `y`. Unlike some interpolators, the
+        interpolation axis cannot be changed.
+    neighbors : int, optional
+        If specified, the value of the interpolant at each evaluation point
+        will be computed using only this many nearest data points. All the data
+        points are used by default.
+    smoothing : float or (npoints, ) array_like, optional
+        Smoothing parameter. The interpolant perfectly fits the data when this
+        is set to 0. For large values, the interpolant approaches a least
+        squares fit of a polynomial with the specified degree. Default is 0.
+    kernel : str, optional
+        Type of RBF. This should be one of
+
+            - 'linear'               : ``-r``
+            - 'thin_plate_spline'    : ``r**2 * log(r)``
+            - 'cubic'                : ``r**3``
+            - 'quintic'              : ``-r**5``
+            - 'multiquadric'         : ``-sqrt(1 + r**2)``
+            - 'inverse_multiquadric' : ``1/sqrt(1 + r**2)``
+            - 'inverse_quadratic'    : ``1/(1 + r**2)``
+            - 'gaussian'             : ``exp(-r**2)``
+
+        Default is 'thin_plate_spline'.
+    epsilon : float, optional
+        Shape parameter that scales the input to the RBF. If `kernel` is
+        'linear', 'thin_plate_spline', 'cubic', or 'quintic', this defaults to
+        1 and can be ignored because it has the same effect as scaling the
+        smoothing parameter. Otherwise, this must be specified.
+    degree : int, optional
+        Degree of the added polynomial. For some RBFs the interpolant may not
+        be well-posed if the polynomial degree is too small. Those RBFs and
+        their corresponding minimum degrees are
+
+            - 'multiquadric'      : 0
+            - 'linear'            : 0
+            - 'thin_plate_spline' : 1
+            - 'cubic'             : 1
+            - 'quintic'           : 2
+
+        The default value is the minimum degree for `kernel` or 0 if there is
+        no minimum degree. Set this to -1 for no added polynomial.
+
+    Notes
+    -----
+    An RBF is a scalar valued function in N-dimensional space whose value at
+    :math:`x` can be expressed in terms of :math:`r=||x - c||`, where :math:`c`
+    is the center of the RBF.
+
+    An RBF interpolant for the vector of data values :math:`d`, which are from
+    locations :math:`y`, is a linear combination of RBFs centered at :math:`y`
+    plus a polynomial with a specified degree. The RBF interpolant is written
+    as
+
+    .. math::
+        f(x) = K(x, y) a + P(x) b,
+
+    where :math:`K(x, y)` is a matrix of RBFs with centers at :math:`y`
+    evaluated at the points :math:`x`, and :math:`P(x)` is a matrix of
+    monomials, which span polynomials with the specified degree, evaluated at
+    :math:`x`. The coefficients :math:`a` and :math:`b` are the solution to the
+    linear equations
+
+    .. math::
+        (K(x, y) + \\lambda I) a + P(y) b = d
+
+    and
+
+    .. math::
+        P(y)^T a = 0,
+
+    where :math:`\\lambda` is a non-negative smoothing parameter that controls
+    how well we want to fit the data. The data are fit exactly when the
+    smoothing parameter is 0.
+
+    The above system is uniquely solvable if the following requirements are
+    met:
+
+        - :math:`P(y)` must have full column rank. :math:`P(y)` always has full
+          column rank when `degree` is -1 or 0. When `degree` is 1,
+          :math:`P(y)` has full column rank if the data point locations are not
+          all collinear (N=2), coplanar (N=3), etc.
+        - If `kernel` is 'multiquadric', 'linear', 'thin_plate_spline',
+          'cubic', or 'quintic', then `degree` must not be lower than the
+          minimum value listed above.
+        - If `smoothing` is 0, then each data point location must be distinct.
+
+    When using an RBF that is not scale invariant ('multiquadric',
+    'inverse_multiquadric', 'inverse_quadratic', or 'gaussian'), an appropriate
+    shape parameter must be chosen (e.g., through cross validation). Smaller
+    values for the shape parameter correspond to wider RBFs. The problem can
+    become ill-conditioned or singular when the shape parameter is too small.
+
+    The memory required to solve for the RBF interpolation coefficients
+    increases quadratically with the number of data points, which can become
+    impractical when interpolating more than about a thousand data points.
+    To overcome memory limitations for large interpolation problems, the
+    `neighbors` argument can be specified to compute an RBF interpolant for
+    each evaluation point using only the nearest data points.
+
+    .. versionadded:: 1.7.0
+
+    See Also
+    --------
+    NearestNDInterpolator
+    LinearNDInterpolator
+    CloughTocher2DInterpolator
+
+    References
+    ----------
+    .. [1] Fasshauer, G., 2007. Meshfree Approximation Methods with Matlab.
+        World Scientific Publishing Co.
+
+    .. [2] http://amadeus.math.iit.edu/~fass/603_ch3.pdf
+
+    .. [3] Wahba, G., 1990. Spline Models for Observational Data. SIAM.
+
+    .. [4] http://pages.stat.wisc.edu/~wahba/stat860public/lect/lect8/lect8.pdf
+
+    Examples
+    --------
+    Demonstrate interpolating scattered data to a grid in 2-D.
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.interpolate import RBFInterpolator
+    >>> from scipy.stats.qmc import Halton
+
+    >>> rng = np.random.default_rng()
+    >>> xobs = 2*Halton(2, seed=rng).random(100) - 1
+    >>> yobs = np.sum(xobs, axis=1)*np.exp(-6*np.sum(xobs**2, axis=1))
+
+    >>> x1 = np.linspace(-1, 1, 50)
+    >>> xgrid = np.asarray(np.meshgrid(x1, x1, indexing='ij'))
+    >>> xflat = xgrid.reshape(2, -1).T     # make it a 2-D array
+    >>> yflat = RBFInterpolator(xobs, yobs)(xflat)
+    >>> ygrid = yflat.reshape(50, 50)
+
+    >>> fig, ax = plt.subplots()
+    >>> ax.pcolormesh(*xgrid, ygrid, vmin=-0.25, vmax=0.25, shading='gouraud')
+    >>> p = ax.scatter(*xobs.T, c=yobs, s=50, ec='k', vmin=-0.25, vmax=0.25)
+    >>> fig.colorbar(p)
+    >>> plt.show()
+
+    """
+
+    # generic type compatibility with scipy-stubs
+    __class_getitem__ = classmethod(GenericAlias)
+
+    def __init__(self, y, d,
+                 neighbors=None,
+                 smoothing=0.0,
+                 kernel="thin_plate_spline",
+                 epsilon=None,
+                 degree=None):
+        # Validate inputs; solve the system now unless `neighbors` defers it.
+        xp = array_namespace(y, d, smoothing)
+        _backend = _get_backend(xp)
+
+        if neighbors is not None and not is_numpy(xp):
+            raise NotImplementedError(
+                "neighbors not None is numpy-only because it relies on KDTree"
+            )
+
+        y = _asarray(y, dtype=xp.float64, order="C", xp=xp)
+        if y.ndim != 2:
+            raise ValueError("`y` must be a 2-dimensional array.")
+
+        ny, ndim = y.shape
+
+        d = xp.asarray(d)
+        if xp.isdtype(d.dtype, 'complex floating'):
+            d_dtype = xp.complex128
+        else:
+            d_dtype = xp.float64
+        d = _asarray(d, dtype=d_dtype, order="C", xp=xp)
+
+        if d.shape[0] != ny:
+            raise ValueError(
+                f"Expected the first axis of `d` to have length {ny}.")
+
+        d_shape = d.shape[1:]
+        d = xp.reshape(d, (ny, -1))
+        # If `d` is complex, convert it to a float array with twice as many
+        # columns. Otherwise, the LHS matrix would need to be converted to
+        # complex and take up 2x more memory than necessary.
+        d = d.view(float)     # NB not Array API compliant (and jax copies)
+
+        # Plain sequences have no `.shape`; `getattr` sends them to `_asarray`.
+        if (isinstance(smoothing, int | float)
+                or getattr(smoothing, "shape", None) == ()):
+            smoothing = xp.full(ny, smoothing, dtype=xp.float64)
+        else:
+            smoothing = _asarray(smoothing, dtype=float, order="C", xp=xp)
+            if smoothing.shape != (ny,):
+                raise ValueError(
+                    "Expected `smoothing` to be a scalar or have shape "
+                    f"({ny},).")
+
+        kernel = kernel.lower()
+        if kernel not in _AVAILABLE:
+            raise ValueError(f"`kernel` must be one of {_AVAILABLE}.")
+
+        if epsilon is None:
+            if kernel in _SCALE_INVARIANT:
+                epsilon = 1.0
+            else:
+                raise ValueError(
+                    "`epsilon` must be specified if `kernel` is not one of "
+                    f"{_SCALE_INVARIANT}."
+                    )
+        else:
+            epsilon = float(epsilon)
+
+        min_degree = _NAME_TO_MIN_DEGREE.get(kernel, -1)
+        if degree is None:
+            degree = max(min_degree, 0)
+        else:
+            degree = int(degree)
+            if degree < -1:
+                raise ValueError("`degree` must be at least -1.")
+            elif -1 < degree < min_degree:
+                warnings.warn(
+                    f"`degree` should not be below {min_degree} except -1 "
+                    f"when `kernel` is '{kernel}'. "
+                    f"The interpolant may not be uniquely "
+                    f"solvable, and the smoothing parameter may have an "
+                    f"unintuitive effect.",
+                    UserWarning, stacklevel=2
+                )
+
+        if neighbors is None:
+            nobs = ny
+        else:
+            # Make sure the number of nearest neighbors used for interpolation
+            # does not exceed the number of observations.
+            neighbors = int(min(neighbors, ny))
+            nobs = neighbors
+
+        powers = _backend._monomial_powers(ndim, degree, xp)
+        # The polynomial matrix must have full column rank in order for the
+        # interpolant to be well-posed, which is not possible if there are
+        # fewer observations than monomials.
+        if powers.shape[0] > nobs:
+            raise ValueError(
+                f"At least {powers.shape[0]} data points are required when "
+                f"`degree` is {degree} and the number of dimensions is {ndim}."
+                )
+
+        if neighbors is None:
+            shift, scale, coeffs = _backend._build_and_solve_system(
+                y, d, smoothing, kernel, epsilon, powers,
+                xp
+            )
+
+            # Make these attributes private since they do not always exist.
+            self._shift = shift
+            self._scale = scale
+            self._coeffs = coeffs
+
+        else:
+            self._tree = KDTree(y)
+
+        self.y = y
+        self.d = d
+        self.d_shape = d_shape
+        self.d_dtype = d_dtype
+        self.neighbors = neighbors
+        self.smoothing = smoothing
+        self.kernel = kernel
+        self.epsilon = epsilon
+        self.powers = powers
+        self._xp = xp
+
+    def __setstate__(self, state):
+        """Restore the attributes from the pair built by `__getstate__`."""
+        public, private = state
+        (self.y, self.d, self.d_shape, self.d_dtype, self.neighbors,
+         self.smoothing, self.kernel, self.epsilon, self.powers) = public
+        if self.neighbors is not None:
+            (self._tree,) = private
+        else:
+            self._shift, self._scale, self._coeffs = private
+        # `_xp` is not part of the state; derive it from the restored arrays.
+        self._xp = array_namespace(self.y, self.d, self.smoothing)
+
+    def __getstate__(self):
+        """Return ``(public, private)`` attribute tuples for pickling."""
+        public = (
+            self.y, self.d, self.d_shape, self.d_dtype, self.neighbors,
+            self.smoothing, self.kernel, self.epsilon, self.powers,
+        )
+
+        # The private part is the KDTree or the solved system, by mode.
+        if self.neighbors is not None:
+            return public, (self._tree,)
+        return public, (self._shift, self._scale, self._coeffs)
+
+    def _chunk_evaluator(
+            self,
+            x,
+            y,
+            shift,
+            scale,
+            coeffs,
+            memory_budget=1000000
+    ):
+        """
+        Evaluate the interpolant at `x`, chunking `x` to bound memory use.
+
+        The evaluation points are processed in batches whenever a single
+        pass would need roughly more than `memory_budget` floats;
+        otherwise they are evaluated in one call.
+
+        Parameters
+        ----------
+        x : (Q, N) float ndarray
+            Points at which to evaluate the interpolant.
+        y : (P, N) float ndarray
+            Points at which the function values are known.
+        shift : (N,) float ndarray
+            Domain shift used to create the polynomial matrix.
+        scale : (N,) float ndarray
+            Domain scaling used to create the polynomial matrix.
+        coeffs : (P + R, S) float ndarray
+            Coefficients in front of the basis functions.
+        memory_budget : int
+            Amount of memory (in units of sizeof(float)) to devote to the
+            array of evaluation coefficients. Inputs that would need more
+            are evaluated in chunks.
+
+        Returns
+        -------
+        (Q, S) float ndarray
+            Interpolated values.
+        """
+        xp = self._xp
+        interpolate = _get_backend(xp).compute_interpolation
+
+        def evaluate(points):
+            # Everything but the evaluation points is fixed for this call.
+            return interpolate(
+                points,
+                y,
+                self.kernel,
+                self.epsilon,
+                self.powers,
+                shift,
+                scale,
+                coeffs,
+                xp
+            )
+
+        nx, _ = x.shape
+        # Number of data points that contribute to each evaluation.
+        nnei = y.shape[0] if self.neighbors is None else self.neighbors
+        # Each row of evaluation coefficients holds one entry per neighbor
+        # and per monomial, so about this many rows fit in the budget.
+        chunksize = memory_budget // (self.powers.shape[0] + nnei) + 1
+
+        if chunksize > nx:
+            # The whole input fits in a single pass.
+            return evaluate(x)
+
+        # Otherwise fill the output one slab of rows at a time.
+        out = xp.empty((nx, self.d.shape[1]), dtype=xp.float64)
+        for start in range(0, nx, chunksize):
+            rows = slice(start, start + chunksize)
+            chunk = evaluate(x[rows, :])
+            out = xpx.at(out, (rows, slice(None,))).set(chunk)
+        return out
+
+    def __call__(self, x):
+        """Evaluate the interpolant at `x`.
+
+        Parameters
+        ----------
+        x : (npts, ndim) array_like
+            Evaluation point coordinates.
+
+        Returns
+        -------
+        ndarray, shape (npts, ...)
+            Values of the interpolant at `x`; the trailing axes are those
+            of the data values `d` (``d.shape[1:]``).
+        """
+        x = _asarray(x, dtype=self._xp.float64, order="C", xp=self._xp)
+        if x.ndim != 2:
+            raise ValueError("`x` must be a 2-dimensional array.")
+
+        nx, ndim = x.shape
+        if ndim != self.y.shape[1]:
+            raise ValueError("Expected the second axis of `x` to have length "
+                             f"{self.y.shape[1]}.")
+
+        # Our memory budget for storing RBF coefficients is
+        # based on how many floats in memory we already occupy
+        # If this number is below 1e6 we just use 1e6
+        # This memory budget is used to decide how we chunk
+        # the inputs
+        memory_budget = max(xp_size(x) + xp_size(self.y) + xp_size(self.d), 1_000_000)
+
+        if self.neighbors is None:
+            out = self._chunk_evaluator(
+                x,
+                self.y,
+                self._shift,
+                self._scale,
+                self._coeffs,
+                memory_budget=memory_budget)
+        else:
+            # XXX: this relies on KDTree, hence is numpy-only until KDTree is converted
+            _build_and_solve_system = _get_backend(np)._build_and_solve_system
+
+            # Get the indices of the k nearest observation points to each
+            # evaluation point.
+            _, yindices = self._tree.query(x, self.neighbors)
+            if self.neighbors == 1:
+                # `KDTree` squeezes the output when neighbors=1.
+                yindices = yindices[:, None]
+
+            # Multiple evaluation points may have the same neighborhood of
+            # observation points. Make the neighborhoods unique so that we only
+            # compute the interpolation coefficients once for each
+            # neighborhood.
+            yindices = np.sort(yindices, axis=1)
+            yindices, inv = np.unique(yindices, return_inverse=True, axis=0)
+            inv = np.reshape(inv, (-1,))  # flatten, we need 1-D indices
+            # `inv` tells us which neighborhood will be used by each evaluation
+            # point. Now we find which evaluation points will be using each
+            # neighborhood.
+            xindices = [[] for _ in range(len(yindices))]
+            for i, j in enumerate(inv):
+                xindices[j].append(i)
+
+            out = np.empty((nx, self.d.shape[1]), dtype=float)
+            for xidx, yidx in zip(xindices, yindices):
+                # `yidx` are the indices of the observations in this
+                # neighborhood. `xidx` are the indices of the evaluation points
+                # that are using this neighborhood.
+                xnbr = x[xidx]
+                ynbr = self.y[yidx]
+                dnbr = self.d[yidx]
+                snbr = self.smoothing[yidx]
+                shift, scale, coeffs = _build_and_solve_system(
+                    ynbr,
+                    dnbr,
+                    snbr,
+                    self.kernel,
+                    self.epsilon,
+                    self.powers,
+                    np
+                )
+                out[xidx] = self._chunk_evaluator(
+                    xnbr,
+                    ynbr,
+                    shift,
+                    scale,
+                    coeffs,
+                    memory_budget=memory_budget)
+
+        # Reinterpret the float columns as `d_dtype`, then restore `d_shape`.
+        out = out.view(self.d_dtype)    # NB not Array API compliant (and jax copies)
+        out = self._xp.reshape(out, (nx, ) + self.d_shape)
+        return out
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_rbfinterp_common.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_rbfinterp_common.py
new file mode 100644
index 0000000000000000000000000000000000000000..528fe7a97187d1f46557833bfa218bab245b7251
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_rbfinterp_common.py
@@ -0,0 +1,32 @@
+# Impl routines common for all backends
+from itertools import combinations_with_replacement
+from math import comb
+
+def _monomial_powers_impl(ndim, degree):
+    """Return the exponents of every monomial of total degree <= `degree`.
+
+    Parameters
+    ----------
+    ndim : int
+        Number of variables in the polynomial.
+    degree : int
+        Maximum total degree of the monomials.
+
+    Returns
+    -------
+    list of list of int
+        One row per monomial, ordered by increasing total degree; entry
+        ``[i][j]`` is the power of variable ``j`` in monomial ``i``.
+    """
+    rows = [[0]*ndim for _ in range(comb(degree + ndim, ndim))]
+    monomials = (
+        mono
+        for deg in range(degree + 1)
+        for mono in combinations_with_replacement(range(ndim), deg)
+    )
+    # A monomial is a tuple of variable indices whose multiplicity is the
+    # power, e.g. (0, 1, 1) stands for x*y**2.
+    for idx, mono in enumerate(monomials):
+        for var in mono:
+            rows[idx][var] += 1
+    return rows
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_rbfinterp_np.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_rbfinterp_np.py
new file mode 100644
index 0000000000000000000000000000000000000000..b94f03718e6a2d96c8ac6d423a9c4a627f473379
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_rbfinterp_np.py
@@ -0,0 +1,92 @@
+import numpy as np
+from numpy.linalg import LinAlgError
+from scipy.linalg.lapack import dgesv  # type: ignore[attr-defined]
+from ._rbfinterp_common import _monomial_powers_impl
+
+from ._rbfinterp_pythran import (
+    _build_system as _pythran_build_system,
+    _build_evaluation_coefficients as _pythran_build_evaluation_coefficients,
+    _polynomial_matrix as _pythran_polynomial_matrix
+)
+
+
+# trampolines for pythran-compiled functions to drop the `xp` argument
+def _build_evaluation_coefficients(
+    x, y, kernel, epsilon, powers, shift, scale, xp
+):
+    """Forward to the pythran routine; `xp` is accepted but not used."""
+    args = (x, y, kernel, epsilon, powers, shift, scale)
+    return _pythran_build_evaluation_coefficients(*args)
+
+def polynomial_matrix(x, powers, xp):  # `xp` is dropped: pythran routine
+    return _pythran_polynomial_matrix(x, powers)
+
+
+def _monomial_powers(ndim, degree, xp):
+    """Return the monomial exponents as an int64 array with `ndim` columns."""
+    powers = np.asarray(_monomial_powers_impl(ndim, degree), dtype=np.int64)
+    if not len(powers):  # no monomials: give the empty array its columns
+        return powers.reshape(0, ndim)
+    return powers
+
+
+def _build_system(y, d, smoothing, kernel, epsilon, powers, xp):  # drops `xp`
+    return _pythran_build_system(y, d, smoothing, kernel, epsilon, powers)
+
+
+def _build_and_solve_system(y, d, smoothing, kernel, epsilon, powers, xp):
+    """Build and solve the RBF interpolation system of equations.
+
+    Parameters
+    ----------
+    y : (P, N) float ndarray
+        Data point coordinates.
+    d : (P, S) float ndarray
+        Data values at `y`.
+    smoothing : (P,) float ndarray
+        Smoothing parameter for each data point.
+    kernel : str
+        Name of the RBF.
+    epsilon : float
+        Shape parameter.
+    powers : (R, N) int ndarray
+        The exponents for each monomial in the polynomial.
+
+    Returns
+    -------
+    shift : (N,) float ndarray
+        Domain shift used to create the polynomial matrix.
+    scale : (N,) float ndarray
+        Domain scaling used to create the polynomial matrix.
+    coeffs : (P + R, S) float ndarray
+        Coefficients for each RBF and monomial.
+
+    """
+    lhs, rhs, shift, scale = _build_system(
+        y, d, smoothing, kernel, epsilon, powers, xp
+        )
+    _, _, coeffs, info = dgesv(lhs, rhs, overwrite_a=True, overwrite_b=True)
+    if info < 0:  # the solver rejected one of its arguments
+        raise ValueError(f"The {-info}-th argument had an illegal value.")
+    elif info > 0:  # the solve failed; report a singular system
+        msg = "Singular matrix."
+        nmonos = powers.shape[0]
+        if nmonos > 0:
+            pmat = polynomial_matrix((y - shift)/scale, powers, xp)
+            rank = np.linalg.matrix_rank(pmat)
+            if rank < nmonos:
+                msg = (
+                    "Singular matrix. The matrix of monomials evaluated at "
+                    "the data point coordinates does not have full column "
+                    f"rank ({rank}/{nmonos})."
+                    )
+
+        raise LinAlgError(msg)
+
+    return shift, scale, coeffs
+
+def compute_interpolation(x, y, kernel, epsilon, powers, shift, scale, coeffs, xp):
+    """Evaluate the interpolant at `x` as basis values times `coeffs`."""
+    basis_args = (x, y, kernel, epsilon, powers, shift, scale, xp)
+    basis = _build_evaluation_coefficients(*basis_args)
+    return basis @ coeffs
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_rbfinterp_xp.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_rbfinterp_xp.py
new file mode 100644
index 0000000000000000000000000000000000000000..57cf3da5d967c5b2eafaddacca5a039580ac55d1
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/_rbfinterp_xp.py
@@ -0,0 +1,266 @@
+"""
+'Generic' Array API backend for RBF interpolation.
+
+The general logic is this: `_rbfinterp.py` implements the user API and calls
+into either `_rbfinterp_np` (the "numpy backend") or `_rbfinterp_xp` (the
+"generic backend").
+
+The numpy backend offloads performance-critical computations to the
+pythran-compiled `_rbfinterp_pythran` extension. This way, the call chain is
+
+    _rbfinterp.py <-- _rbfinterp_np.py <-- _rbfinterp_pythran.py
+
+The "generic" backend here is a drop-in replacement of the API of
+`_rbfinterp_np.py` for use in `_rbfinterp.py` with non-numpy arrays.
+
+The implementation closely follows `_rbfinterp_np + _rbfinterp_pythran`, with
+the following differences:
+
+  - We use vectorized code rather than explicit loops in `_build_system` and
+    `_build_evaluation_coefficients`; this is more torch/jax friendly;
+  - RBF kernels are also "vectorized" rather than scalar: they receive an
+    array of norms, not a single norm;
+  - RBF kernels accept an extra xp= argument.
+
+In general, we would prefer less code duplication. The main blocker ATM is
+that pythran cannot compile functions with an xp= argument where xp is numpy.
+"""
+from numpy.linalg import LinAlgError
+from ._rbfinterp_common import _monomial_powers_impl
+
+
+def _monomial_powers(ndim, degree, xp):
+    """Return the monomial exponents as an `xp` array with `ndim` columns."""
+    powers = xp.asarray(_monomial_powers_impl(ndim, degree))
+    if powers.shape[0] != 0:
+        return powers
+    return xp.reshape(powers, (0, ndim))  # no monomials: add the column axis
+
+
+def _build_and_solve_system(y, d, smoothing, kernel, epsilon, powers, xp):
+    """Build and solve the RBF interpolation system of equations.
+
+    Parameters
+    ----------
+    y : (P, N) float ndarray
+        Data point coordinates.
+    d : (P, S) float ndarray
+        Data values at `y`.
+    smoothing : (P,) float ndarray
+        Smoothing parameter for each data point.
+    kernel : str
+        Name of the RBF.
+    epsilon : float
+        Shape parameter.
+    powers : (R, N) int ndarray
+        The exponents for each monomial in the polynomial.
+    xp : array namespace
+        Namespace of the inputs; its `linalg.solve` performs the solve.
+
+    Returns
+    -------
+    shift : (N,) float ndarray
+        Domain shift used to create the polynomial matrix.
+    scale : (N,) float ndarray
+        Domain scaling used to create the polynomial matrix.
+    coeffs : (P + R, S) float ndarray
+        Coefficients for each RBF and monomial.
+    """
+    lhs, rhs, shift, scale = _build_system(
+        y, d, smoothing, kernel, epsilon, powers, xp)
+    try:
+        coeffs = xp.linalg.solve(lhs, rhs)
+    except Exception as err:
+        # Best-effort attempt to emit a helpful message.
+        # `_rbfinterp_np` backend gives better diagnostics; it is hard to
+        # match it in a backend-agnostic way: e.g. jax emits no error at all,
+        # and instead returns an array of nans for a singular `lhs`.
+        msg = "Singular matrix."
+        nmonos = powers.shape[0]
+        if nmonos > 0:
+            pmat = polynomial_matrix((y - shift)/scale, powers, xp=xp)
+            rank = xp.linalg.matrix_rank(pmat)
+            if rank < nmonos:
+                msg = (
+                    "Singular matrix. The matrix of monomials evaluated at "
+                    "the data point coordinates does not have full column "
+                    f"rank ({rank}/{nmonos})."
+                    )
+        raise LinAlgError(msg) from err
+
+    return shift, scale, coeffs
+
+
+def linear(r, xp):  # phi(r) = -r; `xp` is unused
+    return -r
+
+
+def thin_plate_spline(r, xp):  # phi(r) = r**2 * log(r), and 0 where r == 0
+    # NB: changed w.r.t. pythran, vectorized
+    return xp.where(r == 0, 0, r**2 * xp.log(r))
+
+
+def cubic(r, xp):  # phi(r) = r**3; `xp` is unused
+    return r**3
+
+
+def quintic(r, xp):  # phi(r) = -r**5; `xp` is unused
+    return -r**5
+
+
+def multiquadric(r, xp):  # phi(r) = -sqrt(r**2 + 1)
+    return -xp.sqrt(r**2 + 1)
+
+
+def inverse_multiquadric(r, xp):  # phi(r) = 1/sqrt(r**2 + 1)
+    return 1.0 / xp.sqrt(r**2 + 1.0)
+
+
+def inverse_quadratic(r, xp):  # phi(r) = 1/(r**2 + 1); `xp` is unused
+    return 1.0 / (r**2 + 1.0)
+
+
+def gaussian(r, xp):  # phi(r) = exp(-r**2)
+    return xp.exp(-r**2)
+
+
+NAME_TO_FUNC = {  # kernel name -> kernel function called as `func(r, xp)`
+   "linear": linear,
+   "thin_plate_spline": thin_plate_spline,
+   "cubic": cubic,
+   "quintic": quintic,
+   "multiquadric": multiquadric,
+   "inverse_multiquadric": inverse_multiquadric,
+   "inverse_quadratic": inverse_quadratic,
+   "gaussian": gaussian
+   }
+
+
+def kernel_matrix(x, kernel_func, xp):
+    """Apply `kernel_func` to all pairwise distances between rows of `x`."""
+    pairwise_diff = x[None, :, :] - x[:, None, :]
+    dists = xp.linalg.vector_norm(pairwise_diff, axis=-1)
+    return kernel_func(dists, xp)
+
+
+def polynomial_matrix(x, powers, xp):
+    """Evaluate each monomial (one row of `powers`) at each row of `x`."""
+    return xp.prod(x[:, None, :] ** powers, axis=-1)
+
+
+def _build_system(y, d, smoothing, kernel, epsilon, powers, xp):
+    """Build the system used to solve for the RBF interpolant coefficients.
+
+    Parameters
+    ----------
+    y : (P, N) float ndarray
+        Data point coordinates.
+    d : (P, S) float ndarray
+        Data values at `y`.
+    smoothing : (P,) float ndarray
+        Smoothing parameter for each data point.
+    kernel : str
+        Name of the RBF.
+    epsilon : float
+        Shape parameter.
+    powers : (R, N) int ndarray
+        The exponents for each monomial in the polynomial.
+
+    Returns
+    -------
+    lhs : (P + R, P + R) float ndarray
+        Left-hand side matrix.
+    rhs : (P + R, S) float ndarray
+        Right-hand side matrix.
+    shift : (N,) float ndarray
+        Domain shift used to create the polynomial matrix.
+    scale : (N,) float ndarray
+        Domain scaling used to create the polynomial matrix.
+
+    """
+    s = d.shape[1]  # number of right-hand-side columns
+    r = powers.shape[0]  # number of monomials
+    kernel_func = NAME_TO_FUNC[kernel]
+
+    # Shift and scale the polynomial domain to be between -1 and 1
+    mins = xp.min(y, axis=0)
+    maxs = xp.max(y, axis=0)
+    shift = (maxs + mins)/2
+    scale = (maxs - mins)/2
+    # The scale may be zero if there is a single point or all the points have
+    # the same value for some dimension. Avoid division by zero by replacing
+    # zeros with ones.
+    scale = xp.where(scale == 0.0, 1.0, scale)
+    # Kernels use the epsilon-scaled points, monomials the normalized ones.
+    yeps = y*epsilon
+    yhat = (y - shift)/scale
+
+    out_kernels  = kernel_matrix(yeps, kernel_func, xp)
+    out_poly = polynomial_matrix(yhat, powers, xp)
+    # Block matrix [[K, P], [P.T, 0]] with `smoothing` added on the diagonal.
+    lhs = xp.concat(
+        [
+         xp.concat((out_kernels, out_poly), axis=1),
+         xp.concat((out_poly.T, xp.zeros((r, r))), axis=1)
+        ]
+    , axis=0) + xp.diag(xp.concat([smoothing, xp.zeros(r)]))
+    # Right-hand side: the data values padded with `r` rows of zeros.
+    rhs = xp.concat([d, xp.zeros((r, s))], axis=0)
+
+    return lhs, rhs, shift, scale
+
+
+def _build_evaluation_coefficients(
+    x, y, kernel, epsilon, powers, shift, scale, xp
+):
+    """Build the matrix that maps RBF coefficients to interpolated values.
+
+    Parameters
+    ----------
+    x : (Q, N) float ndarray
+        Evaluation point coordinates.
+    y : (P, N) float ndarray
+        Data point coordinates.
+    kernel : str
+        Name of the RBF.
+    epsilon : float
+        Shape parameter.
+    powers : (R, N) int ndarray
+        The exponents for each monomial in the polynomial.
+    shift : (N,) float ndarray
+        Shifts the polynomial domain for numerical stability.
+    scale : (N,) float ndarray
+        Scales the polynomial domain for numerical stability.
+    xp : array namespace
+        Namespace providing `linalg.vector_norm`, `prod` and `concat`.
+
+    Returns
+    -------
+    (Q, P + R) float ndarray
+        Kernel values in the first P columns, monomials in the last R.
+
+    """
+    rbf = NAME_TO_FUNC[kernel]
+
+    centers = y*epsilon
+    points = x*epsilon
+    xhat = (x - shift)/scale
+
+    # Vectorized (unlike the pythran version): pairwise distances between
+    # the scaled evaluation points and the scaled data points, shape (Q, P).
+    dists = xp.linalg.vector_norm(
+        points[:, None, :] - centers[None, :, :], axis=-1
+    )
+    kernel_block = rbf(dists, xp)
+
+    # Monomials of the shifted and scaled evaluation points, shape (Q, R).
+    poly_block = xp.prod(xhat[:, None, :] ** powers, axis=-1)
+
+    return xp.concat([kernel_block, poly_block], axis=-1)
+
+
+def compute_interpolation(x, y, kernel, epsilon, powers, shift, scale, coeffs, xp):
+    """Evaluate the interpolant at `x` as basis values times `coeffs`."""
+    basis_args = (x, y, kernel, epsilon, powers, shift, scale, xp)
+    basis = _build_evaluation_coefficients(*basis_args)
+    return basis @ coeffs
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/dfitpack.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/dfitpack.py
new file mode 100644
index 0000000000000000000000000000000000000000..71b4407257b499f79c4a0bd13a1d12bf55d9e44d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/dfitpack.py
@@ -0,0 +1,24 @@
+# This file is not meant for public use and will be removed in SciPy v2.0.0.
+# Use the `scipy.interpolate` namespace for importing the functions
+# included below.
+
+from scipy._lib.deprecation import _sub_module_deprecation
+
+
+__all__ = [  # noqa: F822
+    'spalde',
+    'splder',
+    'splev',
+    'splint',
+    'sproot',
+]  # names served through `__dir__` and `__getattr__` below
+
+
+def __dir__():  # `dir()` on this module lists only the names in `__all__`
+    return __all__
+
+
+def __getattr__(name):  # attribute hook: delegate to the deprecation helper
+    return _sub_module_deprecation(sub_package="interpolate", module="dfitpack",
+                                   private_modules=["_dfitpack"], all=__all__,
+                                   attribute=name)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/fitpack.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/fitpack.py
new file mode 100644
index 0000000000000000000000000000000000000000..6490c93fe02b4c665b032d09e2ad3c269e1f7970
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/fitpack.py
@@ -0,0 +1,31 @@
+# This file is not meant for public use and will be removed in SciPy v2.0.0.
+# Use the `scipy.interpolate` namespace for importing the functions
+# included below.
+
+from scipy._lib.deprecation import _sub_module_deprecation
+
+
+__all__ = [  # noqa: F822
+    'BSpline',
+    'bisplev',
+    'bisplrep',
+    'insert',
+    'spalde',
+    'splantider',
+    'splder',
+    'splev',
+    'splint',
+    'splprep',
+    'splrep',
+    'sproot',
+]  # names served through `__dir__` and `__getattr__` below
+
+
+def __dir__():  # `dir()` on this module lists only the names in `__all__`
+    return __all__
+
+
+def __getattr__(name):  # attribute hook: delegate to the deprecation helper
+    return _sub_module_deprecation(sub_package="interpolate", module="fitpack",
+                                   private_modules=["_fitpack_py"], all=__all__,
+                                   attribute=name)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/fitpack2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/fitpack2.py
new file mode 100644
index 0000000000000000000000000000000000000000..f993961f94d913d632aa3d2cc7b1348659a6a613
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/fitpack2.py
@@ -0,0 +1,29 @@
+# This file is not meant for public use and will be removed in SciPy v2.0.0.
+# Use the `scipy.interpolate` namespace for importing the functions
+# included below.
+
+from scipy._lib.deprecation import _sub_module_deprecation
+
+
+__all__ = [  # noqa: F822
+    'BivariateSpline',
+    'InterpolatedUnivariateSpline',
+    'LSQBivariateSpline',
+    'LSQSphereBivariateSpline',
+    'LSQUnivariateSpline',
+    'RectBivariateSpline',
+    'RectSphereBivariateSpline',
+    'SmoothBivariateSpline',
+    'SmoothSphereBivariateSpline',
+    'UnivariateSpline',
+]  # names served through `__dir__` and `__getattr__` below
+
+
+def __dir__():  # `dir()` on this module lists only the names in `__all__`
+    return __all__
+
+
+def __getattr__(name):  # attribute hook: delegate to the deprecation helper
+    return _sub_module_deprecation(sub_package="interpolate", module="fitpack2",
+                                   private_modules=["_fitpack2"], all=__all__,
+                                   attribute=name)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/interpnd.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/interpnd.py
new file mode 100644
index 0000000000000000000000000000000000000000..36bfa6b9f5669803b016a0471e327bc87e546733
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/interpnd.py
@@ -0,0 +1,21 @@
+# This file is not meant for public use and will be removed in SciPy v2.0.0.
+# Use the `scipy.interpolate` namespace for importing the functions
+# included below.
+
+from scipy._lib.deprecation import _sub_module_deprecation
+
+
+__all__ = [  # noqa: F822
+    'CloughTocher2DInterpolator',
+    'LinearNDInterpolator',
+]  # names served through `__dir__` and `__getattr__` below
+
+
+def __dir__():  # `dir()` on this module lists only the names in `__all__`
+    return __all__
+
+
+def __getattr__(name):  # attribute hook: delegate to the deprecation helper
+    return _sub_module_deprecation(sub_package="interpolate", module="interpnd",
+                                   private_modules=["_interpnd"], all=__all__,
+                                   attribute=name, dep_version="1.17.0")
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/interpolate.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/interpolate.py
new file mode 100644
index 0000000000000000000000000000000000000000..341d13954c81130cceb8afe070db023a82550e7a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/interpolate.py
@@ -0,0 +1,30 @@
+# This file is not meant for public use and will be removed in SciPy v2.0.0.
+# Use the `scipy.interpolate` namespace for importing the functions
+# included below.
+
+from scipy._lib.deprecation import _sub_module_deprecation
+
+
+__all__ = [  # noqa: F822
+    'BPoly',
+    'BSpline',
+    'NdPPoly',
+    'PPoly',
+    'RectBivariateSpline',
+    'RegularGridInterpolator',
+    'interp1d',
+    'interp2d',
+    'interpn',
+    'lagrange',
+    'make_interp_spline',
+]  # names served through `__dir__` and `__getattr__` below
+
+
+def __dir__():  # `dir()` on this module lists only the names in `__all__`
+    return __all__
+
+
+def __getattr__(name):  # attribute hook: delegate to the deprecation helper
+    return _sub_module_deprecation(sub_package="interpolate", module="interpolate",
+                                   private_modules=["_interpolate", "fitpack2", "_rgi"],
+                                   all=__all__, attribute=name)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/polyint.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/polyint.py
new file mode 100644
index 0000000000000000000000000000000000000000..e81306304abffb313ab5abe09116a162642a9d67
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/interpolate/polyint.py
@@ -0,0 +1,24 @@
+# This file is not meant for public use and will be removed in SciPy v2.0.0.
+# Use the `scipy.interpolate` namespace for importing the functions
+# included below.
+
+from scipy._lib.deprecation import _sub_module_deprecation
+
+
+__all__ = [  # noqa: F822
+    'BarycentricInterpolator',
+    'KroghInterpolator',
+    'approximate_taylor_polynomial',
+    'barycentric_interpolate',
+    'krogh_interpolate',
+]  # names served through `__dir__` and `__getattr__` below
+
+
+def __dir__():  # `dir()` on this module lists only the names in `__all__`
+    return __all__
+
+
+def __getattr__(name):  # attribute hook: delegate to the deprecation helper
+    return _sub_module_deprecation(sub_package="interpolate", module="polyint",
+                                   private_modules=["_polyint"], all=__all__,
+                                   attribute=name)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..29b7c89781a221779b7ccd7cab34e1b2bd98b0d5
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/__init__.py
@@ -0,0 +1,842 @@
+"""
+========================================
+Special functions (:mod:`scipy.special`)
+========================================
+
+.. currentmodule:: scipy.special
+
+.. toctree::
+   :hidden:
+
+   special.cython_special
+
+Almost all of the functions below accept NumPy arrays as input
+arguments as well as single numbers. This means they follow
+broadcasting and automatic array-looping rules. Technically,
+they are `NumPy universal functions
+<https://numpy.org/doc/stable/user/basics.ufuncs.html#ufuncs-basics>`_.
+Functions which do not accept NumPy arrays are marked by a warning
+in the section description.
+
+.. seealso::
+
+   `scipy.special.cython_special` -- Typed Cython versions of special functions
+
+
+Error handling
+==============
+
+Errors are handled by returning NaNs or other appropriate values.
+Some of the special function routines can emit warnings or raise
+exceptions when an error occurs. By default this is disabled, except
+for memory allocation errors, which result in an exception being raised.
+To query and control the current error handling state the following
+functions are provided.
+
+.. autosummary::
+   :toctree: generated/
+
+   geterr                 -- Get the current way of handling special-function errors.
+   seterr                 -- Set how special-function errors are handled.
+   errstate               -- Context manager for special-function error handling.
+   SpecialFunctionWarning -- Warning that can be emitted by special functions.
+   SpecialFunctionError   -- Exception that can be raised by special functions.
+
+Available functions
+===================
+
+Airy functions
+--------------
+
+.. autosummary::
+   :toctree: generated/
+
+   airy     -- Airy functions and their derivatives.
+   airye    -- Exponentially scaled Airy functions and their derivatives.
+   ai_zeros -- Compute `nt` zeros and values of the Airy function Ai and its derivative.
+   bi_zeros -- Compute `nt` zeros and values of the Airy function Bi and its derivative.
+   itairy   -- Integrals of Airy functions
+
+
+Elliptic functions and integrals
+--------------------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   ellipj    -- Jacobian elliptic functions.
+   ellipk    -- Complete elliptic integral of the first kind.
+   ellipkm1  -- Complete elliptic integral of the first kind around `m` = 1.
+   ellipkinc -- Incomplete elliptic integral of the first kind.
+   ellipe    -- Complete elliptic integral of the second kind.
+   ellipeinc -- Incomplete elliptic integral of the second kind.
+   elliprc   -- Degenerate symmetric integral RC.
+   elliprd   -- Symmetric elliptic integral of the second kind.
+   elliprf   -- Completely-symmetric elliptic integral of the first kind.
+   elliprg   -- Completely-symmetric elliptic integral of the second kind.
+   elliprj   -- Symmetric elliptic integral of the third kind.
+
+Bessel functions
+----------------
+
+.. autosummary::
+   :toctree: generated/
+
+   jv                -- Bessel function of the first kind of real order and \
+                        complex argument.
+   jve               -- Exponentially scaled Bessel function of order `v`.
+   yn                -- Bessel function of the second kind of integer order and \
+                        real argument.
+   yv                -- Bessel function of the second kind of real order and \
+                        complex argument.
+   yve               -- Exponentially scaled Bessel function of the second kind \
+                        of real order.
+   iv                -- Modified Bessel function of the first kind of real order.
+   ive               -- Exponentially scaled modified Bessel function of the \
+                        first kind.
+   kn                -- Modified Bessel function of the second kind of integer \
+                        order `n`
+   kv                -- Modified Bessel function of the second kind of real order \
+                        `v`
+   kve               -- Exponentially scaled modified Bessel function of the \
+                        second kind.
+   hankel1           -- Hankel function of the first kind.
+   hankel1e          -- Exponentially scaled Hankel function of the first kind.
+   hankel2           -- Hankel function of the second kind.
+   hankel2e          -- Exponentially scaled Hankel function of the second kind.
+   wright_bessel     -- Wright's generalized Bessel function.
+   log_wright_bessel -- Logarithm of Wright's generalized Bessel function.
+
+The following function does not accept NumPy arrays (it is not a
+universal function):
+
+.. autosummary::
+   :toctree: generated/
+
+   lmbda -- Jahnke-Emden Lambda function, Lambdav(x).
+
+Zeros of Bessel functions
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The following functions do not accept NumPy arrays (they are not
+universal functions):
+
+.. autosummary::
+   :toctree: generated/
+
+   jnjnp_zeros -- Compute zeros of integer-order Bessel functions Jn and Jn'.
+   jnyn_zeros  -- Compute nt zeros of Bessel functions Jn(x), Jn'(x), Yn(x), and Yn'(x).
+   jn_zeros    -- Compute zeros of integer-order Bessel function Jn(x).
+   jnp_zeros   -- Compute zeros of integer-order Bessel function derivative Jn'(x).
+   yn_zeros    -- Compute zeros of integer-order Bessel function Yn(x).
+   ynp_zeros   -- Compute zeros of integer-order Bessel function derivative Yn'(x).
+   y0_zeros    -- Compute nt zeros of Bessel function Y0(z), and derivative at each zero.
+   y1_zeros    -- Compute nt zeros of Bessel function Y1(z), and derivative at each zero.
+   y1p_zeros   -- Compute nt zeros of Bessel derivative Y1'(z), and value at each zero.
+
+Faster versions of common Bessel functions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autosummary::
+   :toctree: generated/
+
+   j0  -- Bessel function of the first kind of order 0.
+   j1  -- Bessel function of the first kind of order 1.
+   y0  -- Bessel function of the second kind of order 0.
+   y1  -- Bessel function of the second kind of order 1.
+   i0  -- Modified Bessel function of order 0.
+   i0e -- Exponentially scaled modified Bessel function of order 0.
+   i1  -- Modified Bessel function of order 1.
+   i1e -- Exponentially scaled modified Bessel function of order 1.
+   k0  -- Modified Bessel function of the second kind of order 0, :math:`K_0`.
+   k0e -- Exponentially scaled modified Bessel function K of order 0
+   k1  -- Modified Bessel function of the second kind of order 1, :math:`K_1(x)`.
+   k1e -- Exponentially scaled modified Bessel function K of order 1.
+
+Integrals of Bessel functions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autosummary::
+   :toctree: generated/
+
+   itj0y0     -- Integrals of Bessel functions of order 0.
+   it2j0y0    -- Integrals related to Bessel functions of order 0.
+   iti0k0     -- Integrals of modified Bessel functions of order 0.
+   it2i0k0    -- Integrals related to modified Bessel functions of order 0.
+   besselpoly -- Weighted integral of a Bessel function.
+
+Derivatives of Bessel functions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autosummary::
+   :toctree: generated/
+
+   jvp  -- Compute nth derivative of Bessel function Jv(z) with respect to `z`.
+   yvp  -- Compute nth derivative of Bessel function Yv(z) with respect to `z`.
+   ivp  -- Compute nth derivative of modified Bessel function Iv(z) with respect to `z`.
+   kvp  -- Compute nth derivative of real-order modified Bessel function Kv(z) with respect to `z`.
+   h1vp -- Compute nth derivative of Hankel function H1v(z) with respect to `z`.
+   h2vp -- Compute nth derivative of Hankel function H2v(z) with respect to `z`.
+
+Spherical Bessel functions
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autosummary::
+   :toctree: generated/
+
+   spherical_jn -- Spherical Bessel function of the first kind or its derivative.
+   spherical_yn -- Spherical Bessel function of the second kind or its derivative.
+   spherical_in -- Modified spherical Bessel function of the first kind or its derivative.
+   spherical_kn -- Modified spherical Bessel function of the second kind or its derivative.
+
+Riccati-Bessel functions
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+The following functions do not accept NumPy arrays (they are not
+universal functions):
+
+.. autosummary::
+   :toctree: generated/
+
+   riccati_jn -- Compute Riccati-Bessel function of the first kind and its derivative.
+   riccati_yn -- Compute Riccati-Bessel function of the second kind and its derivative.
+
+Struve functions
+----------------
+
+.. autosummary::
+   :toctree: generated/
+
+   struve       -- Struve function.
+   modstruve    -- Modified Struve function.
+   itstruve0    -- Integral of the Struve function of order 0.
+   it2struve0   -- Integral related to the Struve function of order 0.
+   itmodstruve0 -- Integral of the modified Struve function of order 0.
+
+
+Raw statistical functions
+-------------------------
+
+.. seealso:: :mod:`scipy.stats`: Friendly versions of these functions.
+
+Binomial distribution
+^^^^^^^^^^^^^^^^^^^^^
+
+.. autosummary::
+   :toctree: generated/
+
+   bdtr         -- Binomial distribution cumulative distribution function.
+   bdtrc        -- Binomial distribution survival function.
+   bdtri        -- Inverse function to `bdtr` with respect to `p`.
+   bdtrik       -- Inverse function to `bdtr` with respect to `k`.
+   bdtrin       -- Inverse function to `bdtr` with respect to `n`.
+
+Beta distribution
+^^^^^^^^^^^^^^^^^
+
+.. autosummary::
+   :toctree: generated/
+
+   btdtria      -- Inverse of `betainc` with respect to `a`.
+   btdtrib      -- Inverse of `betainc` with respect to `b`.
+
+F distribution
+^^^^^^^^^^^^^^
+
+.. autosummary::
+   :toctree: generated/
+
+   fdtr         -- F cumulative distribution function.
+   fdtrc        -- F survival function.
+   fdtri        -- The `p`-th quantile of the F-distribution.
+   fdtridfd     -- Inverse to `fdtr` vs dfd.
+
+Gamma distribution
+^^^^^^^^^^^^^^^^^^
+
+.. autosummary::
+   :toctree: generated/
+
+   gdtr         -- Gamma distribution cumulative distribution function.
+   gdtrc        -- Gamma distribution survival function.
+   gdtria       -- Inverse of `gdtr` vs a.
+   gdtrib       -- Inverse of `gdtr` vs b.
+   gdtrix       -- Inverse of `gdtr` vs x.
+
+Negative binomial distribution
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autosummary::
+   :toctree: generated/
+
+   nbdtr        -- Negative binomial cumulative distribution function.
+   nbdtrc       -- Negative binomial survival function.
+   nbdtri       -- Inverse of `nbdtr` vs `p`.
+   nbdtrik      -- Inverse of `nbdtr` vs `k`.
+   nbdtrin      -- Inverse of `nbdtr` vs `n`.
+
+Noncentral F distribution
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autosummary::
+   :toctree: generated/
+
+   ncfdtr       -- Cumulative distribution function of the non-central F distribution.
+   ncfdtridfd   -- Calculate degrees of freedom (denominator) for the noncentral F-distribution.
+   ncfdtridfn   -- Calculate degrees of freedom (numerator) for the noncentral F-distribution.
+   ncfdtri      -- Inverse cumulative distribution function of the non-central F distribution.
+   ncfdtrinc    -- Calculate non-centrality parameter for non-central F distribution.
+
+Noncentral t distribution
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autosummary::
+   :toctree: generated/
+
+   nctdtr       -- Cumulative distribution function of the non-central `t` distribution.
+   nctdtridf    -- Calculate degrees of freedom for non-central t distribution.
+   nctdtrit     -- Inverse cumulative distribution function of the non-central t distribution.
+   nctdtrinc    -- Calculate non-centrality parameter for non-central t distribution.
+
+Normal distribution
+^^^^^^^^^^^^^^^^^^^
+
+.. autosummary::
+   :toctree: generated/
+
+   nrdtrimn     -- Calculate mean of normal distribution given other params.
+   nrdtrisd     -- Calculate standard deviation of normal distribution given other params.
+   ndtr         -- Normal cumulative distribution function.
+   log_ndtr     -- Logarithm of normal cumulative distribution function.
+   ndtri        -- Inverse of `ndtr` vs x.
+   ndtri_exp    -- Inverse of `log_ndtr` vs x.
+
+Poisson distribution
+^^^^^^^^^^^^^^^^^^^^
+
+.. autosummary::
+   :toctree: generated/
+
+   pdtr         -- Poisson cumulative distribution function.
+   pdtrc        -- Poisson survival function.
+   pdtri        -- Inverse to `pdtr` vs m.
+   pdtrik       -- Inverse to `pdtr` vs k.
+
+Student t distribution
+^^^^^^^^^^^^^^^^^^^^^^
+
+.. autosummary::
+   :toctree: generated/
+
+   stdtr        -- Student t distribution cumulative distribution function.
+   stdtridf     -- Inverse of `stdtr` vs df.
+   stdtrit      -- Inverse of `stdtr` vs `t`.
+
+Chi square distribution
+^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autosummary::
+   :toctree: generated/
+
+   chdtr        -- Chi square cumulative distribution function.
+   chdtrc       -- Chi square survival function.
+   chdtri       -- Inverse to `chdtrc`.
+   chdtriv      -- Inverse to `chdtr` vs `v`.
+
+Non-central chi square distribution
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autosummary::
+   :toctree: generated/
+
+   chndtr       -- Non-central chi square cumulative distribution function.
+   chndtridf    -- Inverse to `chndtr` vs `df`.
+   chndtrinc    -- Inverse to `chndtr` vs `nc`.
+   chndtrix     -- Inverse to `chndtr` vs `x`.
+
+Kolmogorov distribution
+^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autosummary::
+   :toctree: generated/
+
+   smirnov      -- Kolmogorov-Smirnov complementary cumulative distribution function.
+   smirnovi     -- Inverse to `smirnov`.
+   kolmogorov   -- Complementary cumulative distribution function of Kolmogorov distribution.
+   kolmogi      -- Inverse function to `kolmogorov`.
+
+Box-Cox transformation
+^^^^^^^^^^^^^^^^^^^^^^
+
+.. autosummary::
+   :toctree: generated/
+
+   boxcox       -- Compute the Box-Cox transformation.
+   boxcox1p     -- Compute the Box-Cox transformation of 1 + `x`.
+   inv_boxcox   -- Compute the inverse of the Box-Cox transformation.
+   inv_boxcox1p -- Compute the inverse of the Box-Cox transformation of 1 + `x`.
+
+
+Sigmoidal functions
+^^^^^^^^^^^^^^^^^^^
+
+.. autosummary::
+   :toctree: generated/
+
+   logit        -- Logit ufunc for ndarrays.
+   expit        -- Logistic sigmoid function.
+   log_expit    -- Logarithm of the logistic sigmoid function.
+
+Miscellaneous
+^^^^^^^^^^^^^
+
+.. autosummary::
+   :toctree: generated/
+
+   tklmbda      -- Tukey-Lambda cumulative distribution function.
+   owens_t      -- Owen's T Function.
+
+
+Information Theory functions
+----------------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   entr         -- Elementwise function for computing entropy.
+   rel_entr     -- Elementwise function for computing relative entropy.
+   kl_div       -- Elementwise function for computing Kullback-Leibler divergence.
+   huber        -- Huber loss function.
+   pseudo_huber -- Pseudo-Huber loss function.
+
+
+Gamma and related functions
+---------------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   gamma        -- Gamma function.
+   gammaln      -- Logarithm of the absolute value of the Gamma function for real inputs.
+   loggamma     -- Principal branch of the logarithm of the Gamma function.
+   gammasgn     -- Sign of the gamma function.
+   gammainc     -- Regularized lower incomplete gamma function.
+   gammaincinv  -- Inverse to `gammainc`.
+   gammaincc    -- Regularized upper incomplete gamma function.
+   gammainccinv -- Inverse to `gammaincc`.
+   beta         -- Beta function.
+   betaln       -- Natural logarithm of absolute value of beta function.
+   betainc      -- Incomplete beta integral.
+   betaincc     -- Complemented incomplete beta integral.
+   betaincinv   -- Inverse function to beta integral.
+   betainccinv  -- Inverse of the complemented incomplete beta integral.
+   psi          -- The digamma function.
+   rgamma       -- Reciprocal of the gamma function.
+   polygamma    -- Polygamma function n.
+   multigammaln -- Returns the log of multivariate gamma, also sometimes called the generalized gamma.
+   digamma      -- The digamma function; see `psi`.
+   poch         -- Rising factorial (z)_m.
+
+Error function and Fresnel integrals
+------------------------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   erf           -- Returns the error function of complex argument.
+   erfc          -- Complementary error function, ``1 - erf(x)``.
+   erfcx         -- Scaled complementary error function, ``exp(x**2) * erfc(x)``.
+   erfi          -- Imaginary error function, ``-i erf(i z)``.
+   erfinv        -- Inverse function for erf.
+   erfcinv       -- Inverse function for erfc.
+   wofz          -- Faddeeva function.
+   dawsn         -- Dawson's integral.
+   fresnel       -- Fresnel sin and cos integrals.
+   fresnel_zeros -- Compute nt complex zeros of sine and cosine Fresnel integrals S(z) and C(z).
+   modfresnelp   -- Modified Fresnel positive integrals.
+   modfresnelm   -- Modified Fresnel negative integrals.
+   voigt_profile -- Voigt profile.
+
+The following functions do not accept NumPy arrays (they are not
+universal functions):
+
+.. autosummary::
+   :toctree: generated/
+
+   erf_zeros      -- Compute nt complex zeros of error function erf(z).
+   fresnelc_zeros -- Compute nt complex zeros of cosine Fresnel integral C(z).
+   fresnels_zeros -- Compute nt complex zeros of sine Fresnel integral S(z).
+
+Legendre functions
+------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   legendre_p                 -- Legendre polynomials of the first kind.
+   legendre_p_all             -- All Legendre polynomials of the first kind up to a specified order.
+   assoc_legendre_p           -- Associated Legendre polynomials of the first kind.
+   assoc_legendre_p_all       -- All associated Legendre polynomials of the first kind up to a specified order and degree.
+   sph_legendre_p             -- Spherical Legendre polynomials of the first kind.
+   sph_legendre_p_all         -- All spherical Legendre polynomials of the first kind up to a specified order and degree.
+   sph_harm_y                 -- Spherical harmonics.
+   sph_harm_y_all             -- All spherical harmonics up to a specified order and degree.
+
+The following functions are in the process of being deprecated in favor of the above,
+which provide a more flexible and consistent interface.
+
+.. autosummary::
+   :toctree: generated/
+
+   lpmv                       -- Associated Legendre function of integer order and real degree.
+   lqn                        -- Legendre function of the second kind.
+   lqmn                       -- Sequence of associated Legendre functions of the second kind.
+
+Ellipsoidal harmonics
+---------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   ellip_harm   -- Ellipsoidal harmonic functions E^p_n(l).
+   ellip_harm_2 -- Ellipsoidal harmonic functions F^p_n(l).
+   ellip_normal -- Ellipsoidal harmonic normalization constants gamma^p_n.
+
+Orthogonal polynomials
+----------------------
+
+The following functions evaluate values of orthogonal polynomials:
+
+.. autosummary::
+   :toctree: generated/
+
+   assoc_laguerre   -- Compute the generalized (associated) Laguerre polynomial of degree n and order k.
+   eval_legendre    -- Evaluate Legendre polynomial at a point.
+   eval_chebyt      -- Evaluate Chebyshev polynomial of the first kind at a point.
+   eval_chebyu      -- Evaluate Chebyshev polynomial of the second kind at a point.
+   eval_chebyc      -- Evaluate Chebyshev polynomial of the first kind on [-2, 2] at a point.
+   eval_chebys      -- Evaluate Chebyshev polynomial of the second kind on [-2, 2] at a point.
+   eval_jacobi      -- Evaluate Jacobi polynomial at a point.
+   eval_laguerre    -- Evaluate Laguerre polynomial at a point.
+   eval_genlaguerre -- Evaluate generalized Laguerre polynomial at a point.
+   eval_hermite     -- Evaluate physicist's Hermite polynomial at a point.
+   eval_hermitenorm -- Evaluate probabilist's (normalized) Hermite polynomial at a point.
+   eval_gegenbauer  -- Evaluate Gegenbauer polynomial at a point.
+   eval_sh_legendre -- Evaluate shifted Legendre polynomial at a point.
+   eval_sh_chebyt   -- Evaluate shifted Chebyshev polynomial of the first kind at a point.
+   eval_sh_chebyu   -- Evaluate shifted Chebyshev polynomial of the second kind at a point.
+   eval_sh_jacobi   -- Evaluate shifted Jacobi polynomial at a point.
+
+The following functions compute roots and quadrature weights for
+orthogonal polynomials:
+
+.. autosummary::
+   :toctree: generated/
+
+   roots_legendre    -- Gauss-Legendre quadrature.
+   roots_chebyt      -- Gauss-Chebyshev (first kind) quadrature.
+   roots_chebyu      -- Gauss-Chebyshev (second kind) quadrature.
+   roots_chebyc      -- Gauss-Chebyshev (first kind) quadrature.
+   roots_chebys      -- Gauss-Chebyshev (second kind) quadrature.
+   roots_jacobi      -- Gauss-Jacobi quadrature.
+   roots_laguerre    -- Gauss-Laguerre quadrature.
+   roots_genlaguerre -- Gauss-generalized Laguerre quadrature.
+   roots_hermite     -- Gauss-Hermite (physicist's) quadrature.
+   roots_hermitenorm -- Gauss-Hermite (statistician's) quadrature.
+   roots_gegenbauer  -- Gauss-Gegenbauer quadrature.
+   roots_sh_legendre -- Gauss-Legendre (shifted) quadrature.
+   roots_sh_chebyt   -- Gauss-Chebyshev (first kind, shifted) quadrature.
+   roots_sh_chebyu   -- Gauss-Chebyshev (second kind, shifted) quadrature.
+   roots_sh_jacobi   -- Gauss-Jacobi (shifted) quadrature.
+
+The functions below, in turn, return the polynomial coefficients in
+``orthopoly1d`` objects, which function similarly to `numpy.poly1d`.
+The ``orthopoly1d`` class also has an attribute ``weights``, which returns
+the roots, weights, and total weights for the appropriate form of Gaussian
+quadrature. These are returned in an ``n x 3`` array with roots in the first
+column, weights in the second column, and total weights in the final column.
+Note that ``orthopoly1d`` objects are converted to `~numpy.poly1d` when doing
+arithmetic, and lose information of the original orthogonal polynomial.
+
+.. autosummary::
+   :toctree: generated/
+
+   legendre    -- Legendre polynomial.
+   chebyt      -- Chebyshev polynomial of the first kind.
+   chebyu      -- Chebyshev polynomial of the second kind.
+   chebyc      -- Chebyshev polynomial of the first kind on :math:`[-2, 2]`.
+   chebys      -- Chebyshev polynomial of the second kind on :math:`[-2, 2]`.
+   jacobi      -- Jacobi polynomial.
+   laguerre    -- Laguerre polynomial.
+   genlaguerre -- Generalized (associated) Laguerre polynomial.
+   hermite     -- Physicist's Hermite polynomial.
+   hermitenorm -- Normalized (probabilist's) Hermite polynomial.
+   gegenbauer  -- Gegenbauer (ultraspherical) polynomial.
+   sh_legendre -- Shifted Legendre polynomial.
+   sh_chebyt   -- Shifted Chebyshev polynomial of the first kind.
+   sh_chebyu   -- Shifted Chebyshev polynomial of the second kind.
+   sh_jacobi   -- Shifted Jacobi polynomial.
+
+.. warning::
+
+   Computing values of high-order polynomials (around ``order > 20``) using
+   polynomial coefficients is numerically unstable. To evaluate polynomial
+   values, the ``eval_*`` functions should be used instead.
+
+
+Hypergeometric functions
+------------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   hyp2f1 -- Gauss hypergeometric function 2F1(a, b; c; z).
+   hyp1f1 -- Confluent hypergeometric function 1F1(a, b; x).
+   hyperu -- Confluent hypergeometric function U(a, b, x) of the second kind.
+   hyp0f1 -- Confluent hypergeometric limit function 0F1.
+
+
+Parabolic cylinder functions
+----------------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   pbdv -- Parabolic cylinder function D.
+   pbvv -- Parabolic cylinder function V.
+   pbwa -- Parabolic cylinder function W.
+
+The following functions do not accept NumPy arrays (they are not
+universal functions):
+
+.. autosummary::
+   :toctree: generated/
+
+   pbdv_seq -- Parabolic cylinder functions Dv(x) and derivatives.
+   pbvv_seq -- Parabolic cylinder functions Vv(x) and derivatives.
+   pbdn_seq -- Parabolic cylinder functions Dn(z) and derivatives.
+
+Mathieu and related functions
+-----------------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   mathieu_a -- Characteristic value of even Mathieu functions.
+   mathieu_b -- Characteristic value of odd Mathieu functions.
+
+The following functions do not accept NumPy arrays (they are not
+universal functions):
+
+.. autosummary::
+   :toctree: generated/
+
+   mathieu_even_coef -- Fourier coefficients for even Mathieu and modified Mathieu functions.
+   mathieu_odd_coef  -- Fourier coefficients for odd Mathieu and modified Mathieu functions.
+
+The following return both function and first derivative:
+
+.. autosummary::
+   :toctree: generated/
+
+   mathieu_cem     -- Even Mathieu function and its derivative.
+   mathieu_sem     -- Odd Mathieu function and its derivative.
+   mathieu_modcem1 -- Even modified Mathieu function of the first kind and its derivative.
+   mathieu_modcem2 -- Even modified Mathieu function of the second kind and its derivative.
+   mathieu_modsem1 -- Odd modified Mathieu function of the first kind and its derivative.
+   mathieu_modsem2 -- Odd modified Mathieu function of the second kind and its derivative.
+
+Spheroidal wave functions
+-------------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   pro_ang1   -- Prolate spheroidal angular function of the first kind and its derivative.
+   pro_rad1   -- Prolate spheroidal radial function of the first kind and its derivative.
+   pro_rad2   -- Prolate spheroidal radial function of the second kind and its derivative.
+   obl_ang1   -- Oblate spheroidal angular function of the first kind and its derivative.
+   obl_rad1   -- Oblate spheroidal radial function of the first kind and its derivative.
+   obl_rad2   -- Oblate spheroidal radial function of the second kind and its derivative.
+   pro_cv     -- Characteristic value of prolate spheroidal function.
+   obl_cv     -- Characteristic value of oblate spheroidal function.
+   pro_cv_seq -- Characteristic values for prolate spheroidal wave functions.
+   obl_cv_seq -- Characteristic values for oblate spheroidal wave functions.
+
+The following functions require pre-computed characteristic value:
+
+.. autosummary::
+   :toctree: generated/
+
+   pro_ang1_cv -- Prolate spheroidal angular function pro_ang1 for precomputed characteristic value.
+   pro_rad1_cv -- Prolate spheroidal radial function pro_rad1 for precomputed characteristic value.
+   pro_rad2_cv -- Prolate spheroidal radial function pro_rad2 for precomputed characteristic value.
+   obl_ang1_cv -- Oblate spheroidal angular function obl_ang1 for precomputed characteristic value.
+   obl_rad1_cv -- Oblate spheroidal radial function obl_rad1 for precomputed characteristic value.
+   obl_rad2_cv -- Oblate spheroidal radial function obl_rad2 for precomputed characteristic value.
+
+Kelvin functions
+----------------
+
+.. autosummary::
+   :toctree: generated/
+
+   kelvin       -- Kelvin functions as complex numbers.
+   kelvin_zeros -- Compute nt zeros of all Kelvin functions.
+   ber          -- Kelvin function ber.
+   bei          -- Kelvin function bei
+   berp         -- Derivative of the Kelvin function `ber`.
+   beip         -- Derivative of the Kelvin function `bei`.
+   ker          -- Kelvin function ker.
+   kei          -- Kelvin function kei.
+   kerp         -- Derivative of the Kelvin function ker.
+   keip         -- Derivative of the Kelvin function kei.
+
+The following functions do not accept NumPy arrays (they are not
+universal functions):
+
+.. autosummary::
+   :toctree: generated/
+
+   ber_zeros  -- Compute nt zeros of the Kelvin function ber(x).
+   bei_zeros  -- Compute nt zeros of the Kelvin function bei(x).
+   berp_zeros -- Compute nt zeros of the Kelvin function ber'(x).
+   beip_zeros -- Compute nt zeros of the Kelvin function bei'(x).
+   ker_zeros  -- Compute nt zeros of the Kelvin function ker(x).
+   kei_zeros  -- Compute nt zeros of the Kelvin function kei(x).
+   kerp_zeros -- Compute nt zeros of the Kelvin function ker'(x).
+   keip_zeros -- Compute nt zeros of the Kelvin function kei'(x).
+
+Combinatorics
+-------------
+
+.. autosummary::
+   :toctree: generated/
+
+   comb -- The number of combinations of N things taken k at a time.
+   perm -- Permutations of N things taken k at a time, i.e., k-permutations of N.
+   stirling2 -- Stirling numbers of the second kind.
+
+Lambert W and related functions
+-------------------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   lambertw    -- Lambert W function.
+   wrightomega -- Wright Omega function.
+
+Other special functions
+-----------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   agm         -- Arithmetic, Geometric Mean.
+   bernoulli   -- Bernoulli numbers B0..Bn (inclusive).
+   binom       -- Binomial coefficient
+   diric       -- Periodic sinc function, also called the Dirichlet kernel.
+   euler       -- Euler numbers E0..En (inclusive).
+   expn        -- Exponential integral E_n.
+   exp1        -- Exponential integral E_1 of complex argument z.
+   expi        -- Exponential integral Ei.
+   factorial   -- The factorial of a number or array of numbers.
+   factorial2  -- Double factorial.
+   factorialk  -- Multifactorial of n of order k, n(!!...!).
+   shichi      -- Hyperbolic sine and cosine integrals.
+   sici        -- Sine and cosine integrals.
+   softmax     -- Softmax function.
+   log_softmax -- Logarithm of softmax function.
+   spence      -- Spence's function, also known as the dilogarithm.
+   zeta        -- Riemann zeta function.
+   zetac       -- Riemann zeta function minus 1.
+   softplus    -- Softplus function.
+
+Convenience functions
+---------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   cbrt      -- Cube root of `x`.
+   exp10     -- 10**x.
+   exp2      -- 2**x.
+   radian    -- Convert from degrees to radians.
+   cosdg     -- Cosine of the angle `x` given in degrees.
+   sindg     -- Sine of angle given in degrees.
+   tandg     -- Tangent of angle x given in degrees.
+   cotdg     -- Cotangent of the angle `x` given in degrees.
+   log1p     -- Calculates log(1+x) for use when `x` is near zero.
+   expm1     -- ``exp(x) - 1`` for use when `x` is near zero.
+   cosm1     -- ``cos(x) - 1`` for use when `x` is near zero.
+   powm1     -- ``x**y - 1`` for use when `y` is near zero or `x` is near 1.
+   round     -- Round to nearest integer.
+   xlogy     -- Compute ``x*log(y)`` so that the result is 0 if ``x = 0``.
+   xlog1py   -- Compute ``x*log1p(y)`` so that the result is 0 if ``x = 0``.
+   logsumexp -- Compute the log of the sum of exponentials of input elements.
+   exprel    -- Relative error exponential, (exp(x)-1)/x, for use when `x` is near zero.
+   sinc      -- Return the sinc function.
+
+"""  # noqa: E501
+
+
+from ._sf_error import SpecialFunctionWarning, SpecialFunctionError
+
+from . import _ufuncs
+from ._ufuncs import *
+
+from . import _basic
+from ._basic import *
+
+# Replace some function definitions from _ufuncs and _basic
+# to add Array API support
+from ._support_alternative_backends import *
+
+from ._logsumexp import logsumexp, softmax, log_softmax
+
+from . import _multiufuncs
+from ._multiufuncs import *
+
+from . import _orthogonal
+from ._orthogonal import *
+
+from ._ellip_harm import (
+    ellip_harm,
+    ellip_harm_2,
+    ellip_normal
+)
+from ._lambertw import lambertw
+from ._spherical_bessel import (
+    spherical_jn,
+    spherical_yn,
+    spherical_in,
+    spherical_kn
+)
+
+# Deprecated namespaces, to be removed in v2.0.0
+from . import add_newdocs, basic, orthogonal, specfun, sf_error, spfun_stats
+
+# We replace some function definitions from _ufuncs with those from
+# _support_alternative_backends above, but those are all listed in _ufuncs.__all__,
+# so there is no need to consider _support_alternative_backends.__all__ here.
+__all__ = _ufuncs.__all__ + _basic.__all__ + _orthogonal.__all__ + _multiufuncs.__all__
+__all__ += [
+    'SpecialFunctionWarning',
+    'SpecialFunctionError',
+    'logsumexp',
+    'softmax',
+    'log_softmax',
+    'multigammaln',
+    'ellip_harm',
+    'ellip_harm_2',
+    'ellip_normal',
+    'lambertw',
+    'spherical_jn',
+    'spherical_yn',
+    'spherical_in',
+    'spherical_kn',
+]
+
+from scipy._lib._testutils import PytestTester
+test = PytestTester(__name__)  # PytestTester instance built from this package's name
+del PytestTester  # drop the helper class so it is not left in this namespace
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_add_newdocs.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_add_newdocs.py
new file mode 100644
index 0000000000000000000000000000000000000000..1efe2d3390effdda6d019c2a1699be18c16bda6a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_add_newdocs.py
@@ -0,0 +1,8850 @@
+# Docstrings for generated ufuncs
+#
+# The syntax is designed to look like the function add_newdoc is being
+# called from numpy.lib, but in this file add_newdoc puts the
+# docstrings in a dictionary. This dictionary is used in
+# _generate_pyx.py to generate the docstrings for the ufuncs in
+# scipy.special at the C level when the ufuncs are created at compile
+# time.
+
+docdict: dict[str, str] = {}
+
+
+def get(name):
+    return docdict.get(name)
+
+
+def add_newdoc(name, doc):
+    docdict[name] = doc
+
+
+add_newdoc("_sf_error_test_function",
+    """
+    Private function; do not use.
+    """)
+
+
+add_newdoc("_cosine_cdf",
+    """
+    _cosine_cdf(x)
+
+    Cumulative distribution function (CDF) of the cosine distribution::
+
+                 {             0,              x < -pi
+        cdf(x) = { (pi + x + sin(x))/(2*pi),   -pi <= x <= pi
+                 {             1,              x > pi
+
+    Parameters
+    ----------
+    x : array_like
+        `x` must contain real numbers.
+
+    Returns
+    -------
+    scalar or ndarray
+        The cosine distribution CDF evaluated at `x`.
+
+    """)
+
+add_newdoc("_cosine_invcdf",
+    """
+    _cosine_invcdf(p)
+
+    Inverse of the cumulative distribution function (CDF) of the cosine
+    distribution.
+
+    The CDF of the cosine distribution is::
+
+        cdf(x) = (pi + x + sin(x))/(2*pi)
+
+    This function computes the inverse of cdf(x).
+
+    Parameters
+    ----------
+    p : array_like
+        `p` must contain real numbers in the interval ``0 <= p <= 1``.
+        `nan` is returned for values of `p` outside the interval [0, 1].
+
+    Returns
+    -------
+    scalar or ndarray
+        The inverse of the cosine distribution CDF evaluated at `p`.
+
+    """)
+
+add_newdoc("_ellip_harm",
+    """
+    Internal function, use `ellip_harm` instead.
+    """)
+
+add_newdoc("_ellip_norm",
+    """
+    Internal function, use `ellip_normal` instead.
+    """)
+
+add_newdoc("wrightomega",
+    r"""
+    wrightomega(z, out=None)
+
+    Wright Omega function.
+
+    Defined as the solution to
+
+    .. math::
+
+        \omega + \log(\omega) = z
+
+    where :math:`\log` is the principal branch of the complex logarithm.
+
+    Parameters
+    ----------
+    z : array_like
+        Points at which to evaluate the Wright Omega function
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    omega : scalar or ndarray
+        Values of the Wright Omega function
+
+    See Also
+    --------
+    lambertw : The Lambert W function
+
+    Notes
+    -----
+    .. versionadded:: 0.19.0
+
+    The function can also be defined as
+
+    .. math::
+
+        \omega(z) = W_{K(z)}(e^z)
+
+    where :math:`K(z) = \lceil (\Im(z) - \pi)/(2\pi) \rceil` is the
+    unwinding number and :math:`W` is the Lambert W function.
+
+    The implementation here is taken from [1]_.
+
+    References
+    ----------
+    .. [1] Lawrence, Corless, and Jeffrey, "Algorithm 917: Complex
+           Double-Precision Evaluation of the Wright :math:`\omega`
+           Function." ACM Transactions on Mathematical Software,
+           2012. :doi:`10.1145/2168773.2168779`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.special import wrightomega, lambertw
+
+    >>> wrightomega([-2, -1, 0, 1, 2])
+    array([0.12002824, 0.27846454, 0.56714329, 1.        , 1.5571456 ])
+
+    Complex input:
+
+    >>> wrightomega(3 + 5j)
+    (1.5804428632097158+3.8213626783287937j)
+
+    Verify that ``wrightomega(z)`` satisfies ``w + log(w) = z``:
+
+    >>> w = -5 + 4j
+    >>> wrightomega(w + np.log(w))
+    (-5+4j)
+
+    Verify the connection to ``lambertw``:
+
+    >>> z = 0.5 + 3j
+    >>> wrightomega(z)
+    (0.0966015889280649+1.4937828458191993j)
+    >>> lambertw(np.exp(z))
+    (0.09660158892806493+1.4937828458191993j)
+
+    >>> z = 0.5 + 4j
+    >>> wrightomega(z)
+    (-0.3362123489037213+2.282986001579032j)
+    >>> lambertw(np.exp(z), k=1)
+    (-0.33621234890372115+2.282986001579032j)
+    """)
+
+
+add_newdoc("agm",
+    """
+    agm(a, b, out=None)
+
+    Compute the arithmetic-geometric mean of `a` and `b`.
+
+    Start with a_0 = a and b_0 = b and iteratively compute::
+
+        a_{n+1} = (a_n + b_n)/2
+        b_{n+1} = sqrt(a_n*b_n)
+
+    a_n and b_n converge to the same limit as n increases; their common
+    limit is agm(a, b).
+
+    Parameters
+    ----------
+    a, b : array_like
+        Real values only. If the values are both negative, the result
+        is negative. If one value is negative and the other is positive,
+        `nan` is returned.
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    scalar or ndarray
+        The arithmetic-geometric mean of `a` and `b`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.special import agm
+    >>> a, b = 24.0, 6.0
+    >>> agm(a, b)
+    13.458171481725614
+
+    Compare that result to the iteration:
+
+    >>> while a != b:
+    ...     a, b = (a + b)/2, np.sqrt(a*b)
+    ...     print("a = %19.16f  b=%19.16f" % (a, b))
+    ...
+    a = 15.0000000000000000  b=12.0000000000000000
+    a = 13.5000000000000000  b=13.4164078649987388
+    a = 13.4582039324993694  b=13.4581390309909850
+    a = 13.4581714817451772  b=13.4581714817060547
+    a = 13.4581714817256159  b=13.4581714817256159
+
+    When array-like arguments are given, broadcasting applies:
+
+    >>> a = np.array([[1.5], [3], [6]])  # a has shape (3, 1).
+    >>> b = np.array([6, 12, 24, 48])    # b has shape (4,).
+    >>> agm(a, b)
+    array([[  3.36454287,   5.42363427,   9.05798751,  15.53650756],
+           [  4.37037309,   6.72908574,  10.84726853,  18.11597502],
+           [  6.        ,   8.74074619,  13.45817148,  21.69453707]])
+    """)
+
+add_newdoc("bdtr",
+    r"""
+    bdtr(k, n, p, out=None)
+
+    Binomial distribution cumulative distribution function.
+
+    Sum of the terms 0 through `floor(k)` of the Binomial probability density.
+
+    .. math::
+        \mathrm{bdtr}(k, n, p) =
+        \sum_{j=0}^{\lfloor k \rfloor} {{n}\choose{j}} p^j (1-p)^{n-j}
+
+    Parameters
+    ----------
+    k : array_like
+        Number of successes (double), rounded down to the nearest integer.
+    n : array_like
+        Number of events (int).
+    p : array_like
+        Probability of success in a single event (float).
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    y : scalar or ndarray
+        Probability of `floor(k)` or fewer successes in `n` independent events with
+        success probabilities of `p`.
+
+    Notes
+    -----
+    The terms are not summed directly; instead the regularized incomplete beta
+    function is employed, according to the formula,
+
+    .. math::
+        \mathrm{bdtr}(k, n, p) =
+        I_{1 - p}(n - \lfloor k \rfloor, \lfloor k \rfloor + 1).
+
+    Wrapper for the Cephes [1]_ routine `bdtr`.
+
+    References
+    ----------
+    .. [1] Cephes Mathematical Functions Library,
+           http://www.netlib.org/cephes/
+
+    """)
+
+add_newdoc("bdtrc",
+    r"""
+    bdtrc(k, n, p, out=None)
+
+    Binomial distribution survival function.
+
+    Sum of the terms `floor(k) + 1` through `n` of the binomial probability
+    density,
+
+    .. math::
+        \mathrm{bdtrc}(k, n, p) =
+        \sum_{j=\lfloor k \rfloor +1}^n {{n}\choose{j}} p^j (1-p)^{n-j}
+
+    Parameters
+    ----------
+    k : array_like
+        Number of successes (double), rounded down to nearest integer.
+    n : array_like
+        Number of events (int)
+    p : array_like
+        Probability of success in a single event.
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    y : scalar or ndarray
+        Probability of `floor(k) + 1` or more successes in `n` independent
+        events with success probabilities of `p`.
+
+    See Also
+    --------
+    bdtr
+    betainc
+
+    Notes
+    -----
+    The terms are not summed directly; instead the regularized incomplete beta
+    function is employed, according to the formula,
+
+    .. math::
+        \mathrm{bdtrc}(k, n, p) = I_{p}(\lfloor k \rfloor + 1, n - \lfloor k \rfloor).
+
+    Wrapper for the Cephes [1]_ routine `bdtrc`.
+
+    References
+    ----------
+    .. [1] Cephes Mathematical Functions Library,
+           http://www.netlib.org/cephes/
+
+    """)
+
+add_newdoc("bdtri",
+    r"""
+    bdtri(k, n, y, out=None)
+
+    Inverse function to `bdtr` with respect to `p`.
+
+    Finds the event probability `p` such that the sum of the terms 0 through
+    `k` of the binomial probability density is equal to the given cumulative
+    probability `y`.
+
+    Parameters
+    ----------
+    k : array_like
+        Number of successes (float), rounded down to the nearest integer.
+    n : array_like
+        Number of events (float)
+    y : array_like
+        Cumulative probability (probability of `k` or fewer successes in `n`
+        events).
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    p : scalar or ndarray
+        The event probability such that `bdtr(floor(k), n, p) = y`.
+
+    See Also
+    --------
+    bdtr
+    betaincinv
+
+    Notes
+    -----
+    The computation is carried out using the inverse beta integral function
+    and the relation,::
+
+        1 - p = betaincinv(n - k, k + 1, y).
+
+    Wrapper for the Cephes [1]_ routine `bdtri`.
+
+    References
+    ----------
+    .. [1] Cephes Mathematical Functions Library,
+           http://www.netlib.org/cephes/
+    """)
+
+add_newdoc("bdtrik",
+    """
+    bdtrik(y, n, p, out=None)
+
+    Inverse function to `bdtr` with respect to `k`.
+
+    Finds the number of successes `k` such that the sum of the terms 0 through
+    `k` of the Binomial probability density for `n` events with probability
+    `p` is equal to the given cumulative probability `y`.
+
+    Parameters
+    ----------
+    y : array_like
+        Cumulative probability (probability of `k` or fewer successes in `n`
+        events).
+    n : array_like
+        Number of events (float).
+    p : array_like
+        Success probability (float).
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    k : scalar or ndarray
+        The number of successes `k` such that `bdtr(k, n, p) = y`.
+
+    See Also
+    --------
+    bdtr
+
+    Notes
+    -----
+    Formula 26.5.24 of [1]_ (or equivalently [2]_) is used to reduce the binomial
+    distribution to the cumulative incomplete beta distribution.
+
+    Computation of `k` involves a search for a value that produces the desired
+    value of `y`. The search relies on the monotonicity of `y` with `k`.
+
+    Wrapper for the CDFLIB [3]_ Fortran routine `cdfbin`.
+
+    References
+    ----------
+    .. [1] Milton Abramowitz and Irene A. Stegun, eds.
+           Handbook of Mathematical Functions with Formulas,
+           Graphs, and Mathematical Tables. New York: Dover, 1972.
+    .. [2] NIST Digital Library of Mathematical Functions
+           https://dlmf.nist.gov/8.17.5#E5
+    .. [3] Barry Brown, James Lovato, and Kathy Russell,
+           CDFLIB: Library of Fortran Routines for Cumulative Distribution
+           Functions, Inverses, and Other Parameters.
+
+    """)
+
+add_newdoc("bdtrin",
+    """
+    bdtrin(k, y, p, out=None)
+
+    Inverse function to `bdtr` with respect to `n`.
+
+    Finds the number of events `n` such that the sum of the terms 0 through
+    `k` of the Binomial probability density for events with probability `p` is
+    equal to the given cumulative probability `y`.
+
+    Parameters
+    ----------
+    k : array_like
+        Number of successes (float).
+    y : array_like
+        Cumulative probability (probability of `k` or fewer successes in `n`
+        events).
+    p : array_like
+        Success probability (float).
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    n : scalar or ndarray
+        The number of events `n` such that `bdtr(k, n, p) = y`.
+
+    See Also
+    --------
+    bdtr
+
+    Notes
+    -----
+    Formula 26.5.24 of [1]_ (or equivalently [2]_) is used to reduce the binomial
+    distribution to the cumulative incomplete beta distribution.
+
+    Computation of `n` involves a search for a value that produces the desired
+    value of `y`. The search relies on the monotonicity of `y` with `n`.
+
+    Wrapper for the CDFLIB [3]_ Fortran routine `cdfbin`.
+
+    References
+    ----------
+    .. [1] Milton Abramowitz and Irene A. Stegun, eds.
+           Handbook of Mathematical Functions with Formulas,
+           Graphs, and Mathematical Tables. New York: Dover, 1972.
+    .. [2] NIST Digital Library of Mathematical Functions
+           https://dlmf.nist.gov/8.17.5#E5
+    .. [3] Barry Brown, James Lovato, and Kathy Russell,
+           CDFLIB: Library of Fortran Routines for Cumulative Distribution
+           Functions, Inverses, and Other Parameters.
+    """)
+
+add_newdoc("btdtria",
+    r"""
+    btdtria(p, b, x, out=None)
+
+    Inverse of `betainc` with respect to `a`.
+
+    This is the inverse of the beta cumulative distribution function, `betainc`,
+    considered as a function of `a`, returning the value of `a` for which
+    `betainc(a, b, x) = p`, or
+
+    .. math::
+        p = \int_0^x \frac{\Gamma(a + b)}{\Gamma(a)\Gamma(b)} t^{a-1} (1-t)^{b-1}\,dt
+
+    Parameters
+    ----------
+    p : array_like
+        Cumulative probability, in [0, 1].
+    b : array_like
+        Shape parameter (`b` > 0).
+    x : array_like
+        The quantile, in [0, 1].
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    a : scalar or ndarray
+        The value of the shape parameter `a` such that `betainc(a, b, x) = p`.
+
+    See Also
+    --------
+    betainc : Regularized incomplete beta function
+    betaincinv : Inverse of the regularized incomplete beta function
+    btdtrib : Inverse of the beta cumulative distribution function, with respect to `b`.
+
+    Notes
+    -----
+    This function wraps the ``ibeta_inva`` routine from the
+    Boost Math C++ library [1]_.
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    Examples
+    --------
+    >>> import scipy.special as sc
+
+    This function is the inverse of `betainc` for fixed
+    values of :math:`b` and :math:`x`.
+
+    >>> a, b, x = 1.2, 3.1, 0.2
+    >>> y = sc.betainc(a, b, x)
+    >>> sc.btdtria(y, b, x)
+    1.2
+
+    """)
+
+add_newdoc("btdtrib",
+    r"""
+    btdtrib(a, p, x, out=None)
+
+    Inverse of `betainc` with respect to `b`.
+
+    This is the inverse of the beta cumulative distribution function, `betainc`,
+    considered as a function of `b`, returning the value of `b` for which
+    `betainc(a, b, x) = p`, or
+
+    .. math::
+        p = \int_0^x \frac{\Gamma(a + b)}{\Gamma(a)\Gamma(b)} t^{a-1} (1-t)^{b-1}\,dt
+
+    Parameters
+    ----------
+    a : array_like
+        Shape parameter (`a` > 0).
+    p : array_like
+        Cumulative probability, in [0, 1].
+    x : array_like
+        The quantile, in [0, 1].
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    b : scalar or ndarray
+        The value of the shape parameter `b` such that `betainc(a, b, x) = p`.
+
+    See Also
+    --------
+    betainc : Regularized incomplete beta function
+    betaincinv : Inverse of the regularized incomplete beta function with
+                 respect to `x`.
+    btdtria : Inverse of the beta cumulative distribution function, with respect to `a`.
+
+    Notes
+    -----
+    Wrapper for the `ibeta_invb` routine from the Boost Math C++ library [1]_.
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    Examples
+    --------
+    >>> import scipy.special as sc
+    >>> a, b, x = 1.2, 3.1, 0.2
+    >>> y = sc.betainc(a, b, x)
+
+    `btdtrib` is the inverse of `betainc` for fixed values of :math:`a` and
+    :math:`x`:
+
+    >>> sc.btdtrib(a, y, x)
+    3.1
+
+    """)
+
+add_newdoc(
+    "betainc",
+    r"""
+    betainc(a, b, x, out=None)
+
+    Regularized incomplete beta function.
+
+    Computes the regularized incomplete beta function, defined as [1]_:
+
+    .. math::
+
+        I_x(a, b) = \frac{\Gamma(a+b)}{\Gamma(a)\Gamma(b)} \int_0^x
+        t^{a-1}(1-t)^{b-1}dt,
+
+    for :math:`0 \leq x \leq 1`.
+
+    This function is the cumulative distribution function for the beta
+    distribution; its range is [0, 1].
+
+    Parameters
+    ----------
+    a, b : array_like
+           Positive, real-valued parameters
+    x : array_like
+        Real-valued such that :math:`0 \leq x \leq 1`,
+        the upper limit of integration
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    scalar or ndarray
+        Value of the regularized incomplete beta function
+
+    See Also
+    --------
+    beta : beta function
+    betaincinv : inverse of the regularized incomplete beta function
+    betaincc : complement of the regularized incomplete beta function
+    scipy.stats.beta : beta distribution
+
+    Notes
+    -----
+    The term *regularized* in the name of this function refers to the
+    scaling of the function by the gamma function terms shown in the
+    formula.  When not qualified as *regularized*, the name *incomplete
+    beta function* often refers to just the integral expression,
+    without the gamma terms.  One can use the function `beta` from
+    `scipy.special` to get this "nonregularized" incomplete beta
+    function by multiplying the result of ``betainc(a, b, x)`` by
+    ``beta(a, b)``.
+
+    ``betainc(a, b, x)`` is treated as a two parameter family of functions
+    of a single variable `x`, rather than as a function of three variables.
+    This impacts only the limiting cases ``a = 0``, ``b = 0``, ``a = inf``,
+    ``b = inf``.
+
+    In general
+
+    .. math::
+
+        \lim_{(a, b) \rightarrow (a_0, b_0)} \mathrm{betainc}(a, b, x)
+
+    is treated as a pointwise limit in ``x``. Thus for example,
+    ``betainc(0, b, 0)`` equals ``0`` for ``b > 0``, although it would be
+    indeterminate when considering the simultaneous limit ``(a, x) -> (0+, 0+)``.
+
+    This function wraps the ``ibeta`` routine from the
+    Boost Math C++ library [2]_.
+
+    References
+    ----------
+    .. [1] NIST Digital Library of Mathematical Functions
+           https://dlmf.nist.gov/8.17
+    .. [2] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    Examples
+    --------
+
+    Let :math:`B(a, b)` be the `beta` function.
+
+    >>> import scipy.special as sc
+
+    The coefficient in terms of `gamma` is equal to
+    :math:`1/B(a, b)`. Also, when :math:`x=1`
+    the integral is equal to :math:`B(a, b)`.
+    Therefore, :math:`I_{x=1}(a, b) = 1` for any :math:`a, b`.
+
+    >>> sc.betainc(0.2, 3.5, 1.0)
+    1.0
+
+    It satisfies
+    :math:`I_x(a, b) = x^a F(a, 1-b, a+1, x)/ (aB(a, b))`,
+    where :math:`F` is the hypergeometric function `hyp2f1`:
+
+    >>> a, b, x = 1.4, 3.1, 0.5
+    >>> x**a * sc.hyp2f1(a, 1 - b, a + 1, x)/(a * sc.beta(a, b))
+    0.8148904036225295
+    >>> sc.betainc(a, b, x)
+    0.8148904036225296
+
+    This function satisfies the relationship
+    :math:`I_x(a, b) = 1 - I_{1-x}(b, a)`:
+
+    >>> sc.betainc(2.2, 3.1, 0.4)
+    0.49339638807619446
+    >>> 1 - sc.betainc(3.1, 2.2, 1 - 0.4)
+    0.49339638807619446
+
+    """)
+
+
+add_newdoc(
+    "betaincc",
+    r"""
+    betaincc(a, b, x, out=None)
+
+    Complement of the regularized incomplete beta function.
+
+    Computes the complement of the regularized incomplete beta function,
+    defined as [1]_:
+
+    .. math::
+
+        \bar{I}_x(a, b) = 1 - I_x(a, b)
+                        = 1 - \frac{\Gamma(a+b)}{\Gamma(a)\Gamma(b)} \int_0^x
+                                  t^{a-1}(1-t)^{b-1}dt,
+
+    for :math:`0 \leq x \leq 1`.
+
+    Parameters
+    ----------
+    a, b : array_like
+           Positive, real-valued parameters
+    x : array_like
+        Real-valued such that :math:`0 \leq x \leq 1`,
+        the upper limit of integration
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    scalar or ndarray
+        Value of the complement of the regularized incomplete beta function
+
+    See Also
+    --------
+    betainc : regularized incomplete beta function
+    betaincinv : inverse of the regularized incomplete beta function
+    betainccinv :
+        inverse of the complement of the regularized incomplete beta function
+    beta : beta function
+    scipy.stats.beta : beta distribution
+
+    Notes
+    -----
+    .. versionadded:: 1.11.0
+
+    Like `betainc`, ``betaincc(a, b, x)`` is treated as a two parameter
+    family of functions of a single variable `x`, rather than as a function of
+    three variables. See the `betainc` docstring for more info on how this
+    impacts limiting cases.
+
+    This function wraps the ``ibetac`` routine from the
+    Boost Math C++ library [2]_.
+
+    References
+    ----------
+    .. [1] NIST Digital Library of Mathematical Functions
+           https://dlmf.nist.gov/8.17
+    .. [2] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    Examples
+    --------
+    >>> from scipy.special import betaincc, betainc
+
+    The naive calculation ``1 - betainc(a, b, x)`` loses precision when
+    the values of ``betainc(a, b, x)`` are close to 1:
+
+    >>> 1 - betainc(0.5, 8, [0.9, 0.99, 0.999])
+    array([2.0574632e-09, 0.0000000e+00, 0.0000000e+00])
+
+    By using ``betaincc``, we get the correct values:
+
+    >>> betaincc(0.5, 8, [0.9, 0.99, 0.999])
+    array([2.05746321e-09, 1.97259354e-17, 1.96467954e-25])
+
+    """)
+
+add_newdoc(
+    "betaincinv",
+    r"""
+    betaincinv(a, b, y, out=None)
+
+    Inverse of the regularized incomplete beta function.
+
+    Computes :math:`x` such that:
+
+    .. math::
+
+        y = I_x(a, b) = \frac{\Gamma(a+b)}{\Gamma(a)\Gamma(b)}
+        \int_0^x t^{a-1}(1-t)^{b-1}dt,
+
+    where :math:`I_x` is the normalized incomplete beta function `betainc`
+    and :math:`\Gamma` is the `gamma` function [1]_.
+
+    Parameters
+    ----------
+    a, b : array_like
+        Positive, real-valued parameters
+    y : array_like
+        Real-valued input
+    out : ndarray, optional
+        Optional output array for function values
+
+    Returns
+    -------
+    scalar or ndarray
+        Value of the inverse of the regularized incomplete beta function
+
+    See Also
+    --------
+    betainc : regularized incomplete beta function
+    gamma : gamma function
+
+    Notes
+    -----
+    This function wraps the ``ibeta_inv`` routine from the
+    Boost Math C++ library [2]_.
+
+    References
+    ----------
+    .. [1] NIST Digital Library of Mathematical Functions
+           https://dlmf.nist.gov/8.17
+    .. [2] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    Examples
+    --------
+    >>> import scipy.special as sc
+
+    This function is the inverse of `betainc` for fixed
+    values of :math:`a` and :math:`b`.
+
+    >>> a, b = 1.2, 3.1
+    >>> y = sc.betainc(a, b, 0.2)
+    >>> sc.betaincinv(a, b, y)
+    0.2
+    >>>
+    >>> a, b = 7.5, 0.4
+    >>> x = sc.betaincinv(a, b, 0.5)
+    >>> sc.betainc(a, b, x)
+    0.5
+
+    """)
+
+
+add_newdoc(
+    "betainccinv",
+    r"""
+    betainccinv(a, b, y, out=None)
+
+    Inverse of the complemented regularized incomplete beta function.
+
+    Computes :math:`x` such that:
+
+    .. math::
+
+        y = 1 - I_x(a, b) = 1 - \frac{\Gamma(a+b)}{\Gamma(a)\Gamma(b)}
+        \int_0^x t^{a-1}(1-t)^{b-1}dt,
+
+    where :math:`I_x` is the normalized incomplete beta function `betainc`
+    and :math:`\Gamma` is the `gamma` function [1]_.
+
+    Parameters
+    ----------
+    a, b : array_like
+        Positive, real-valued parameters
+    y : array_like
+        Real-valued input
+    out : ndarray, optional
+        Optional output array for function values
+
+    Returns
+    -------
+    scalar or ndarray
+        Value of the inverse of the complemented regularized incomplete beta function
+
+    See Also
+    --------
+    betainc : regularized incomplete beta function
+    betaincc : complement of the regularized incomplete beta function
+
+    Notes
+    -----
+    .. versionadded:: 1.11.0
+
+    This function wraps the ``ibetac_inv`` routine from the
+    Boost Math C++ library [2]_.
+
+    References
+    ----------
+    .. [1] NIST Digital Library of Mathematical Functions
+           https://dlmf.nist.gov/8.17
+    .. [2] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    Examples
+    --------
+    >>> from scipy.special import betainccinv, betaincc
+
+    This function is the inverse of `betaincc` for fixed
+    values of :math:`a` and :math:`b`.
+
+    >>> a, b = 1.2, 3.1
+    >>> y = betaincc(a, b, 0.2)
+    >>> betainccinv(a, b, y)
+    0.2
+
+    >>> a, b = 7, 2.5
+    >>> x = betainccinv(a, b, 0.875)
+    >>> betaincc(a, b, x)
+    0.875
+
+    """)
+
+add_newdoc("boxcox",
+    """
+    boxcox(x, lmbda, out=None)
+
+    Compute the Box-Cox transformation.
+
+    The Box-Cox transformation is::
+
+        y = (x**lmbda - 1) / lmbda  if lmbda != 0
+            log(x)                  if lmbda == 0
+
+    Returns `nan` if ``x < 0``.
+    Returns `-inf` if ``x == 0`` and ``lmbda < 0``.
+
+    Parameters
+    ----------
+    x : array_like
+        Data to be transformed.
+    lmbda : array_like
+        Power parameter of the Box-Cox transform.
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    y : scalar or ndarray
+        Transformed data.
+
+    Notes
+    -----
+
+    .. versionadded:: 0.14.0
+
+    Examples
+    --------
+    >>> from scipy.special import boxcox
+    >>> boxcox([1, 4, 10], 2.5)
+    array([   0.        ,   12.4       ,  126.09110641])
+    >>> boxcox(2, [0, 1, 2])
+    array([ 0.69314718,  1.        ,  1.5       ])
+    """)
+
+add_newdoc("boxcox1p",
+    """
+    boxcox1p(x, lmbda, out=None)
+
+    Compute the Box-Cox transformation of 1 + `x`.
+
+    The Box-Cox transformation computed by `boxcox1p` is::
+
+        y = ((1+x)**lmbda - 1) / lmbda  if lmbda != 0
+            log(1+x)                    if lmbda == 0
+
+    Returns `nan` if ``x < -1``.
+    Returns `-inf` if ``x == -1`` and ``lmbda < 0``.
+
+    Parameters
+    ----------
+    x : array_like
+        Data to be transformed.
+    lmbda : array_like
+        Power parameter of the Box-Cox transform.
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    y : scalar or ndarray
+        Transformed data.
+
+    Notes
+    -----
+
+    .. versionadded:: 0.14.0
+
+    Examples
+    --------
+    >>> from scipy.special import boxcox1p
+    >>> boxcox1p(1e-4, [0, 0.5, 1])
+    array([  9.99950003e-05,   9.99975001e-05,   1.00000000e-04])
+    >>> boxcox1p([0.01, 0.1], 0.25)
+    array([ 0.00996272,  0.09645476])
+    """)
+
+add_newdoc("inv_boxcox",
+    """
+    inv_boxcox(y, lmbda, out=None)
+
+    Compute the inverse of the Box-Cox transformation.
+
+    Find ``x`` such that::
+
+        y = (x**lmbda - 1) / lmbda  if lmbda != 0
+            log(x)                  if lmbda == 0
+
+    Parameters
+    ----------
+    y : array_like
+        Data to be transformed.
+    lmbda : array_like
+        Power parameter of the Box-Cox transform.
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    x : scalar or ndarray
+        Transformed data.
+
+    Notes
+    -----
+
+    .. versionadded:: 0.16.0
+
+    Examples
+    --------
+    >>> from scipy.special import boxcox, inv_boxcox
+    >>> y = boxcox([1, 4, 10], 2.5)
+    >>> inv_boxcox(y, 2.5)
+    array([1., 4., 10.])
+    """)
+
+add_newdoc("inv_boxcox1p",
+    """
+    inv_boxcox1p(y, lmbda, out=None)
+
+    Compute the inverse of the Box-Cox transformation of 1 + `x`.
+
+    Find ``x`` such that::
+
+        y = ((1+x)**lmbda - 1) / lmbda  if lmbda != 0
+            log(1+x)                    if lmbda == 0
+
+    Parameters
+    ----------
+    y : array_like
+        Data to be transformed.
+    lmbda : array_like
+        Power parameter of the Box-Cox transform.
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    x : scalar or ndarray
+        Transformed data.
+
+    Notes
+    -----
+
+    .. versionadded:: 0.16.0
+
+    Examples
+    --------
+    >>> from scipy.special import boxcox1p, inv_boxcox1p
+    >>> y = boxcox1p([1, 4, 10], 2.5)
+    >>> inv_boxcox1p(y, 2.5)
+    array([1., 4., 10.])
+    """)
+
+add_newdoc("chdtr",
+    r"""
+    chdtr(v, x, out=None)
+
+    Chi square cumulative distribution function.
+
+    Returns the area under the left tail (from 0 to `x`) of the Chi
+    square probability density function with `v` degrees of freedom:
+
+    .. math::
+
+        \frac{1}{2^{v/2} \Gamma(v/2)} \int_0^x t^{v/2 - 1} e^{-t/2} dt
+
+    Here :math:`\Gamma` is the Gamma function; see `gamma`. This
+    integral can be expressed in terms of the regularized lower
+    incomplete gamma function `gammainc` as
+    ``gammainc(v / 2, x / 2)``. [1]_
+
+    Parameters
+    ----------
+    v : array_like
+        Degrees of freedom.
+    x : array_like
+        Upper bound of the integral.
+    out : ndarray, optional
+        Optional output array for the function results.
+
+    Returns
+    -------
+    scalar or ndarray
+        Values of the cumulative distribution function.
+
+    See Also
+    --------
+    chdtrc, chdtri, chdtriv, gammainc
+
+    References
+    ----------
+    .. [1] Chi-Square distribution,
+        https://www.itl.nist.gov/div898/handbook/eda/section3/eda3666.htm
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import scipy.special as sc
+
+    It can be expressed in terms of the regularized lower incomplete
+    gamma function.
+
+    >>> v = 1
+    >>> x = np.arange(4)
+    >>> sc.chdtr(v, x)
+    array([0.        , 0.68268949, 0.84270079, 0.91673548])
+    >>> sc.gammainc(v / 2, x / 2)
+    array([0.        , 0.68268949, 0.84270079, 0.91673548])
+
+    """)
+
+add_newdoc("chdtrc",
+    r"""
+    chdtrc(v, x, out=None)
+
+    Chi square survival function.
+
+    Returns the area under the right hand tail (from `x` to infinity)
+    of the Chi square probability density function with `v` degrees of
+    freedom:
+
+    .. math::
+
+        \frac{1}{2^{v/2} \Gamma(v/2)} \int_x^\infty t^{v/2 - 1} e^{-t/2} dt
+
+    Here :math:`\Gamma` is the Gamma function; see `gamma`. This
+    integral can be expressed in terms of the regularized upper
+    incomplete gamma function `gammaincc` as
+    ``gammaincc(v / 2, x / 2)``. [1]_
+
+    Parameters
+    ----------
+    v : array_like
+        Degrees of freedom.
+    x : array_like
+        Lower bound of the integral.
+    out : ndarray, optional
+        Optional output array for the function results.
+
+    Returns
+    -------
+    scalar or ndarray
+        Values of the survival function.
+
+    See Also
+    --------
+    chdtr, chdtri, chdtriv, gammaincc
+
+    References
+    ----------
+    .. [1] Chi-Square distribution,
+        https://www.itl.nist.gov/div898/handbook/eda/section3/eda3666.htm
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import scipy.special as sc
+
+    It can be expressed in terms of the regularized upper incomplete
+    gamma function.
+
+    >>> v = 1
+    >>> x = np.arange(4)
+    >>> sc.chdtrc(v, x)
+    array([1.        , 0.31731051, 0.15729921, 0.08326452])
+    >>> sc.gammaincc(v / 2, x / 2)
+    array([1.        , 0.31731051, 0.15729921, 0.08326452])
+
+    """)
+
+add_newdoc("chdtri",
+    """
+    chdtri(v, p, out=None)
+
+    Inverse to `chdtrc` with respect to `x`.
+
+    Returns `x` such that ``chdtrc(v, x) == p``.
+
+    Parameters
+    ----------
+    v : array_like
+        Degrees of freedom.
+    p : array_like
+        Probability.
+    out : ndarray, optional
+        Optional output array for the function results.
+
+    Returns
+    -------
+    x : scalar or ndarray
+        Value so that the probability a Chi square random variable
+        with `v` degrees of freedom is greater than `x` equals `p`.
+
+    See Also
+    --------
+    chdtrc, chdtr, chdtriv
+
+    References
+    ----------
+    .. [1] Chi-Square distribution,
+        https://www.itl.nist.gov/div898/handbook/eda/section3/eda3666.htm
+
+    Examples
+    --------
+    >>> import scipy.special as sc
+
+    It inverts `chdtrc`.
+
+    >>> v, p = 1, 0.3
+    >>> sc.chdtrc(v, sc.chdtri(v, p))
+    0.3
+    >>> x = 1
+    >>> sc.chdtri(v, sc.chdtrc(v, x))
+    1.0
+
+    """)
+
+add_newdoc("chdtriv",
+    """
+    chdtriv(p, x, out=None)
+
+    Inverse to `chdtr` with respect to `v`.
+
+    Returns `v` such that ``chdtr(v, x) == p``.
+
+    Parameters
+    ----------
+    p : array_like
+        Probability that the Chi square random variable is less than
+        or equal to `x`.
+    x : array_like
+        Nonnegative input.
+    out : ndarray, optional
+        Optional output array for the function results.
+
+    Returns
+    -------
+    scalar or ndarray
+        Degrees of freedom.
+
+    See Also
+    --------
+    chdtr, chdtrc, chdtri
+
+    Notes
+    -----
+    This function wraps routines from the Boost Math C++ library [1]_.
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+    .. [2] Chi-Square distribution,
+        https://www.itl.nist.gov/div898/handbook/eda/section3/eda3666.htm
+
+    Examples
+    --------
+    >>> import scipy.special as sc
+
+    It inverts `chdtr`.
+
+    >>> p, x = 0.5, 1
+    >>> sc.chdtr(sc.chdtriv(p, x), x)
+    0.5000000000000003
+    >>> v = 1
+    >>> sc.chdtriv(sc.chdtr(v, x), x)
+    1.0
+
+    """)
+
+add_newdoc("chndtr",
+    r"""
+    chndtr(x, df, nc, out=None)
+
+    Non-central chi square cumulative distribution function
+
+    The cumulative distribution function is given by:
+
+    .. math::
+
+        P(\chi^{\prime 2} \vert \nu, \lambda) =\sum_{j=0}^{\infty}
+        e^{-\lambda /2}
+        \frac{(\lambda /2)^j}{j!} P(\chi^{\prime 2} \vert \nu + 2j),
+
+    where :math:`\nu > 0` is the degrees of freedom (``df``) and
+    :math:`\lambda \geq 0` is the non-centrality parameter (``nc``).
+
+    Parameters
+    ----------
+    x : array_like
+        Upper bound of the integral; must satisfy ``x >= 0``
+    df : array_like
+        Degrees of freedom; must satisfy ``df > 0``
+    nc : array_like
+        Non-centrality parameter; must satisfy ``nc >= 0``
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    p : scalar or ndarray
+        Value of the non-central chi square cumulative distribution function.
+
+    See Also
+    --------
+    chndtrix: Noncentral Chi Squared distribution quantile
+    chndtridf: Inverse of `chndtr` with respect to `df`
+    chndtrinc: Inverse of `chndtr` with respect to `nc`
+    scipy.stats.ncx2: Non-central chi-squared distribution
+
+    Notes
+    -----
+    The noncentral chi squared distribution is also available in
+    `scipy.stats.ncx2`. ``scipy.stats.ncx2.cdf`` is equivalent to `chndtr`.
+
+    This function wraps routines from the Boost Math C++ library [1]_.
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import scipy.special as sc
+
+    Compute the noncentral chi squared distribution CDF at one point.
+
+    >>> x = 4.0
+    >>> df = 1.0
+    >>> nc = 5.0
+    >>> sc.chndtr(x, df, nc)
+    0.40667858759710945
+
+    Plot the noncentral chi squared distribution CDF for different parameters.
+
+    >>> import matplotlib.pyplot as plt
+    >>> x = np.linspace(0, 40, 1000)
+    >>> plt.plot(x, sc.chndtr(x, 1, 5), label=r"$df=1,\ nc=5$")
+    >>> plt.plot(x, sc.chndtr(x, 5, 10), label=r"$df=5,\ nc=10$")
+    >>> plt.legend()
+    >>> plt.show()
+
+    """)
+
+add_newdoc("chndtrix",
+    """
+    chndtrix(p, df, nc, out=None)
+
+    Inverse to `chndtr` vs `x`
+
+    Calculated using a search to find a value for `x` that produces the
+    desired value of `p`.
+
+    Parameters
+    ----------
+    p : array_like
+        Probability; must satisfy ``0 <= p < 1``
+    df : array_like
+        Degrees of freedom; must satisfy ``df > 0``
+    nc : array_like
+        Non-centrality parameter; must satisfy ``nc >= 0``
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    x : scalar or ndarray
+        Value so that the probability a non-central Chi square random variable
+        with `df` degrees of freedom and non-centrality, `nc`, is less than or
+        equal to `x` equals `p`.
+
+    See Also
+    --------
+    chndtr : Noncentral chi-squared distribution CDF
+    chndtridf : inverse of `chndtr` with respect to `df`
+    chndtrinc : inverse of `chndtr` with respect to `nc`
+    scipy.stats.ncx2 : Non-central chi-squared distribution
+
+    Notes
+    -----
+    The noncentral chi squared distribution is also available in
+    `scipy.stats.ncx2`. ``scipy.stats.ncx2.ppf`` is equivalent to `chndtrix`.
+
+    This function wraps routines from the Boost Math C++ library [1]_.
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    Examples
+    --------
+    >>> from scipy.special import chndtrix, chndtr
+
+    Compute the noncentral chi squared distribution CDF at one point.
+    >>> x, df, nc = 3, 5, 10
+    >>> p = chndtr(x, df, nc)
+
+    `chndtrix` is the inverse of `chndtr` with respect to `x`:
+
+    >>> chndtrix(p, df, nc)
+    3.0
+
+    """)
+
+add_newdoc("chndtridf",
+    """
+    chndtridf(x, p, nc, out=None)
+
+    Inverse to `chndtr` vs `df`
+
+    Calculated using a search to find a value for `df` that produces the
+    desired value of `p`.
+
+    Parameters
+    ----------
+    x : array_like
+        Upper bound of the integral; must satisfy ``x >= 0``
+    p : array_like
+        Probability; must satisfy ``0 <= p < 1``
+    nc : array_like
+        Non-centrality parameter; must satisfy ``nc >= 0``
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    df : scalar or ndarray
+        Degrees of freedom
+
+    See Also
+    --------
+    chndtr : Noncentral chi-squared distribution CDF
+    chndtrix : inverse of `chndtr` with respect to `x`
+    chndtrinc : inverse of `chndtr` with respect to `nc`
+    scipy.stats.ncx2 : Non-central chi-squared distribution
+
+    Notes
+    -----
+    The noncentral chi squared distribution is also available in
+    `scipy.stats.ncx2`.
+
+    This function wraps routines from the Boost Math C++ library [1]_.
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    Examples
+    --------
+    >>> from scipy.special import chndtridf, chndtr
+
+    Compute the noncentral chi squared distribution CDF at one point.
+
+    >>> x, df, nc = 3, 5, 10
+    >>> p = chndtr(x, df, nc)
+
+    `chndtridf` is the inverse of `chndtr` with respect to `df`:
+
+    >>> chndtridf(x, p, nc)
+    5.0
+
+    """)
+
+add_newdoc("chndtrinc",
+    """
+    chndtrinc(x, df, p, out=None)
+
+    Inverse to `chndtr` vs `nc`
+
+    Calculated using a search to find a value for `nc` that produces the
+    desired value of `p`.
+
+    Parameters
+    ----------
+    x : array_like
+        Upper bound of the integral; must satisfy ``x >= 0``
+    df : array_like
+        Degrees of freedom; must satisfy ``df > 0``
+    p : array_like
+        Probability; must satisfy ``0 <= p < 1``
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    nc : scalar or ndarray
+        Non-centrality
+
+    See Also
+    --------
+    chndtr : Noncentral chi-squared distribution CDF
+    chndtrix : inverse of `chndtr` with respect to `x`
+    chndtridf : inverse of `chndtr` with respect to `df`
+    scipy.stats.ncx2 : Non-central chi-squared distribution
+
+    Notes
+    -----
+    The noncentral chi squared distribution is also available in
+    `scipy.stats.ncx2`.
+
+    This function wraps routines from the Boost Math C++ library [1]_.
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    Examples
+    --------
+    >>> from scipy.special import chndtrinc, chndtr
+
+    Compute the noncentral chi squared distribution CDF at one point.
+
+    >>> x, df, nc = 3, 5, 10
+    >>> p = chndtr(x, df, nc)
+
+    `chndtrinc` is the inverse of `chndtr` with respect to `nc`:
+
+    >>> chndtrinc(x, df, p)
+    10.0
+
+    """)
+
+add_newdoc(
+    "elliprc",
+    r"""
+    elliprc(x, y, out=None)
+
+    Degenerate symmetric elliptic integral.
+
+    The function RC is defined as [1]_
+
+    .. math::
+
+        R_{\mathrm{C}}(x, y) =
+           \frac{1}{2} \int_0^{+\infty} (t + x)^{-1/2} (t + y)^{-1} dt
+           = R_{\mathrm{F}}(x, y, y)
+
+    Parameters
+    ----------
+    x, y : array_like
+        Real or complex input parameters. `x` can be any number in the
+        complex plane cut along the negative real axis. `y` must be non-zero.
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    R : scalar or ndarray
+        Value of the integral. If `y` is real and negative, the Cauchy
+        principal value is returned. If both of `x` and `y` are real, the
+        return value is real. Otherwise, the return value is complex.
+
+    See Also
+    --------
+    elliprf : Completely-symmetric elliptic integral of the first kind.
+    elliprd : Symmetric elliptic integral of the second kind.
+    elliprg : Completely-symmetric elliptic integral of the second kind.
+    elliprj : Symmetric elliptic integral of the third kind.
+
+    Notes
+    -----
+    RC is a degenerate case of the symmetric integral RF: ``elliprc(x, y) ==
+    elliprf(x, y, y)``. It is an elementary function rather than an elliptic
+    integral.
+
+    The code implements Carlson's algorithm based on the duplication theorems
+    and series expansion up to the 7th order. [2]_
+
+    .. versionadded:: 1.8.0
+
+    References
+    ----------
+    .. [1] B. C. Carlson, ed., Chapter 19 in "Digital Library of Mathematical
+           Functions," NIST, US Dept. of Commerce.
+           https://dlmf.nist.gov/19.16.E6
+    .. [2] B. C. Carlson, "Numerical computation of real or complex elliptic
+           integrals," Numer. Algorithm, vol. 10, no. 1, pp. 13-26, 1995.
+           https://arxiv.org/abs/math/9409227
+           https://doi.org/10.1007/BF02198293
+
+    Examples
+    --------
+    Basic homogeneity property:
+
+    >>> import numpy as np
+    >>> from scipy.special import elliprc
+
+    >>> x = 1.2 + 3.4j
+    >>> y = 5.
+    >>> scale = 0.3 + 0.4j
+    >>> elliprc(scale*x, scale*y)
+    (0.5484493976710874-0.4169557678995833j)
+
+    >>> elliprc(x, y)/np.sqrt(scale)
+    (0.5484493976710874-0.41695576789958333j)
+
+    When the two arguments coincide, the integral is particularly
+    simple:
+
+    >>> x = 1.2 + 3.4j
+    >>> elliprc(x, x)
+    (0.4299173120614631-0.3041729818745595j)
+
+    >>> 1/np.sqrt(x)
+    (0.4299173120614631-0.30417298187455954j)
+
+    Another simple case: the first argument vanishes:
+
+    >>> y = 1.2 + 3.4j
+    >>> elliprc(0, y)
+    (0.6753125346116815-0.47779380263880866j)
+
+    >>> np.pi/2/np.sqrt(y)
+    (0.6753125346116815-0.4777938026388088j)
+
+    When `x` and `y` are both positive, we can express
+    :math:`R_C(x,y)` in terms of more elementary functions.  For the
+    case :math:`0 \le x < y`,
+
+    >>> x = 3.2
+    >>> y = 6.
+    >>> elliprc(x, y)
+    0.44942991498453444
+
+    >>> np.arctan(np.sqrt((y-x)/x))/np.sqrt(y-x)
+    0.44942991498453433
+
+    And for the case :math:`0 \le y < x`,
+
+    >>> x = 6.
+    >>> y = 3.2
+    >>> elliprc(x,y)
+    0.4989837501576147
+
+    >>> np.log((np.sqrt(x)+np.sqrt(x-y))/np.sqrt(y))/np.sqrt(x-y)
+    0.49898375015761476
+
+    """)
+
+add_newdoc(
+    "elliprd",
+    r"""
+    elliprd(x, y, z, out=None)
+
+    Symmetric elliptic integral of the second kind.
+
+    The function RD is defined as [1]_
+
+    .. math::
+
+        R_{\mathrm{D}}(x, y, z) =
+           \frac{3}{2} \int_0^{+\infty} [(t + x) (t + y)]^{-1/2} (t + z)^{-3/2}
+           dt
+
+    Parameters
+    ----------
+    x, y, z : array_like
+        Real or complex input parameters. `x` or `y` can be any number in the
+        complex plane cut along the negative real axis, but at most one of them
+        can be zero, while `z` must be non-zero.
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    R : scalar or ndarray
+        Value of the integral. If all of `x`, `y`, and `z` are real, the
+        return value is real. Otherwise, the return value is complex.
+
+    See Also
+    --------
+    elliprc : Degenerate symmetric elliptic integral.
+    elliprf : Completely-symmetric elliptic integral of the first kind.
+    elliprg : Completely-symmetric elliptic integral of the second kind.
+    elliprj : Symmetric elliptic integral of the third kind.
+
+    Notes
+    -----
+    RD is a degenerate case of the elliptic integral RJ: ``elliprd(x, y, z) ==
+    elliprj(x, y, z, z)``.
+
+    The code implements Carlson's algorithm based on the duplication theorems
+    and series expansion up to the 7th order. [2]_
+
+    .. versionadded:: 1.8.0
+
+    References
+    ----------
+    .. [1] B. C. Carlson, ed., Chapter 19 in "Digital Library of Mathematical
+           Functions," NIST, US Dept. of Commerce.
+           https://dlmf.nist.gov/19.16.E5
+    .. [2] B. C. Carlson, "Numerical computation of real or complex elliptic
+           integrals," Numer. Algorithm, vol. 10, no. 1, pp. 13-26, 1995.
+           https://arxiv.org/abs/math/9409227
+           https://doi.org/10.1007/BF02198293
+
+    Examples
+    --------
+    Basic homogeneity property:
+
+    >>> import numpy as np
+    >>> from scipy.special import elliprd
+
+    >>> x = 1.2 + 3.4j
+    >>> y = 5.
+    >>> z = 6.
+    >>> scale = 0.3 + 0.4j
+    >>> elliprd(scale*x, scale*y, scale*z)
+    (-0.03703043835680379-0.24500934665683802j)
+
+    >>> elliprd(x, y, z)*np.power(scale, -1.5)
+    (-0.0370304383568038-0.24500934665683805j)
+
+    All three arguments coincide:
+
+    >>> x = 1.2 + 3.4j
+    >>> elliprd(x, x, x)
+    (-0.03986825876151896-0.14051741840449586j)
+
+    >>> np.power(x, -1.5)
+    (-0.03986825876151894-0.14051741840449583j)
+
+    The so-called "second lemniscate constant":
+
+    >>> elliprd(0, 2, 1)/3
+    0.5990701173677961
+
+    >>> from scipy.special import gamma
+    >>> gamma(0.75)**2/np.sqrt(2*np.pi)
+    0.5990701173677959
+
+    """)
+
+add_newdoc(
+    "elliprf",
+    r"""
+    elliprf(x, y, z, out=None)
+
+    Completely-symmetric elliptic integral of the first kind.
+
+    The function RF is defined as [1]_
+
+    .. math::
+
+        R_{\mathrm{F}}(x, y, z) =
+           \frac{1}{2} \int_0^{+\infty} [(t + x) (t + y) (t + z)]^{-1/2} dt
+
+    Parameters
+    ----------
+    x, y, z : array_like
+        Real or complex input parameters. `x`, `y`, or `z` can be any number in
+        the complex plane cut along the negative real axis, but at most one of
+        them can be zero.
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    R : scalar or ndarray
+        Value of the integral. If all of `x`, `y`, and `z` are real, the return
+        value is real. Otherwise, the return value is complex.
+
+    See Also
+    --------
+    elliprc : Degenerate symmetric integral.
+    elliprd : Symmetric elliptic integral of the second kind.
+    elliprg : Completely-symmetric elliptic integral of the second kind.
+    elliprj : Symmetric elliptic integral of the third kind.
+
+    Notes
+    -----
+    The code implements Carlson's algorithm based on the duplication theorems
+    and series expansion up to the 7th order (cf.:
+    https://dlmf.nist.gov/19.36.i) and the AGM algorithm for the complete
+    integral. [2]_
+
+    .. versionadded:: 1.8.0
+
+    References
+    ----------
+    .. [1] B. C. Carlson, ed., Chapter 19 in "Digital Library of Mathematical
+           Functions," NIST, US Dept. of Commerce.
+           https://dlmf.nist.gov/19.16.E1
+    .. [2] B. C. Carlson, "Numerical computation of real or complex elliptic
+           integrals," Numer. Algorithm, vol. 10, no. 1, pp. 13-26, 1995.
+           https://arxiv.org/abs/math/9409227
+           https://doi.org/10.1007/BF02198293
+
+    Examples
+    --------
+    Basic homogeneity property:
+
+    >>> import numpy as np
+    >>> from scipy.special import elliprf
+
+    >>> x = 1.2 + 3.4j
+    >>> y = 5.
+    >>> z = 6.
+    >>> scale = 0.3 + 0.4j
+    >>> elliprf(scale*x, scale*y, scale*z)
+    (0.5328051227278146-0.4008623567957094j)
+
+    >>> elliprf(x, y, z)/np.sqrt(scale)
+    (0.5328051227278147-0.4008623567957095j)
+
+    All three arguments coincide:
+
+    >>> x = 1.2 + 3.4j
+    >>> elliprf(x, x, x)
+    (0.42991731206146316-0.30417298187455954j)
+
+    >>> 1/np.sqrt(x)
+    (0.4299173120614631-0.30417298187455954j)
+
+    The so-called "first lemniscate constant":
+
+    >>> elliprf(0, 1, 2)
+    1.3110287771460598
+
+    >>> from scipy.special import gamma
+    >>> gamma(0.25)**2/(4*np.sqrt(2*np.pi))
+    1.3110287771460598
+
+    """)
+
+add_newdoc(
+    "elliprg",
+    r"""
+    elliprg(x, y, z, out=None)
+
+    Completely-symmetric elliptic integral of the second kind.
+
+    The function RG is defined as [1]_
+
+    .. math::
+
+        R_{\mathrm{G}}(x, y, z) =
+           \frac{1}{4} \int_0^{+\infty} [(t + x) (t + y) (t + z)]^{-1/2}
+           \left(\frac{x}{t + x} + \frac{y}{t + y} + \frac{z}{t + z}\right) t
+           dt
+
+    Parameters
+    ----------
+    x, y, z : array_like
+        Real or complex input parameters. `x`, `y`, or `z` can be any number in
+        the complex plane cut along the negative real axis.
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    R : scalar or ndarray
+        Value of the integral. If all of `x`, `y`, and `z` are real, the return
+        value is real. Otherwise, the return value is complex.
+
+    See Also
+    --------
+    elliprc : Degenerate symmetric integral.
+    elliprd : Symmetric elliptic integral of the second kind.
+    elliprf : Completely-symmetric elliptic integral of the first kind.
+    elliprj : Symmetric elliptic integral of the third kind.
+
+    Notes
+    -----
+    The implementation uses the relation [1]_
+
+    .. math::
+
+        2 R_{\mathrm{G}}(x, y, z) =
+           z R_{\mathrm{F}}(x, y, z) -
+           \frac{1}{3} (x - z) (y - z) R_{\mathrm{D}}(x, y, z) +
+           \sqrt{\frac{x y}{z}}
+
+    and the symmetry of `x`, `y`, `z` when at least one non-zero parameter can
+    be chosen as the pivot. When one of the arguments is close to zero, the AGM
+    method is applied instead. Other special cases are computed following Ref.
+    [2]_
+
+    .. versionadded:: 1.8.0
+
+    References
+    ----------
+    .. [1] B. C. Carlson, "Numerical computation of real or complex elliptic
+           integrals," Numer. Algorithm, vol. 10, no. 1, pp. 13-26, 1995.
+           https://arxiv.org/abs/math/9409227
+           https://doi.org/10.1007/BF02198293
+    .. [2] B. C. Carlson, ed., Chapter 19 in "Digital Library of Mathematical
+           Functions," NIST, US Dept. of Commerce.
+           https://dlmf.nist.gov/19.16.E1
+           https://dlmf.nist.gov/19.20.ii
+
+    Examples
+    --------
+    Basic homogeneity property:
+
+    >>> import numpy as np
+    >>> from scipy.special import elliprg
+
+    >>> x = 1.2 + 3.4j
+    >>> y = 5.
+    >>> z = 6.
+    >>> scale = 0.3 + 0.4j
+    >>> elliprg(scale*x, scale*y, scale*z)
+    (1.195936862005246+0.8470988320464167j)
+
+    >>> elliprg(x, y, z)*np.sqrt(scale)
+    (1.195936862005246+0.8470988320464165j)
+
+    Simplifications:
+
+    >>> elliprg(0, y, y)
+    1.756203682760182
+
+    >>> 0.25*np.pi*np.sqrt(y)
+    1.7562036827601817
+
+    >>> elliprg(0, 0, z)
+    1.224744871391589
+
+    >>> 0.5*np.sqrt(z)
+    1.224744871391589
+
+    The surface area of a triaxial ellipsoid with semiaxes ``a``, ``b``, and
+    ``c`` is given by
+
+    .. math::
+
+        S = 4 \pi a b c R_{\mathrm{G}}(1 / a^2, 1 / b^2, 1 / c^2).
+
+    >>> def ellipsoid_area(a, b, c):
+    ...     r = 4.0 * np.pi * a * b * c
+    ...     return r * elliprg(1.0 / (a * a), 1.0 / (b * b), 1.0 / (c * c))
+    >>> print(ellipsoid_area(1, 3, 5))
+    108.62688289491807
+    """)
+
+add_newdoc(
+    "elliprj",
+    r"""
+    elliprj(x, y, z, p, out=None)
+
+    Symmetric elliptic integral of the third kind.
+
+    The function RJ is defined as [1]_
+
+    .. math::
+
+        R_{\mathrm{J}}(x, y, z, p) =
+           \frac{3}{2} \int_0^{+\infty} [(t + x) (t + y) (t + z)]^{-1/2}
+           (t + p)^{-1} dt
+
+    .. warning::
+        This function should be considered experimental when the inputs are
+        unbalanced.  Check correctness with another independent implementation.
+
+    Parameters
+    ----------
+    x, y, z, p : array_like
+        Real or complex input parameters. `x`, `y`, or `z` are numbers in
+        the complex plane cut along the negative real axis (subject to further
+        constraints, see Notes), and at most one of them can be zero. `p` must
+        be non-zero.
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    R : scalar or ndarray
+        Value of the integral. If all of `x`, `y`, `z`, and `p` are real, the
+        return value is real. Otherwise, the return value is complex.
+
+        If `p` is real and negative, while `x`, `y`, and `z` are real,
+        non-negative, and at most one of them is zero, the Cauchy principal
+        value is returned. [1]_ [2]_
+
+    See Also
+    --------
+    elliprc : Degenerate symmetric integral.
+    elliprd : Symmetric elliptic integral of the second kind.
+    elliprf : Completely-symmetric elliptic integral of the first kind.
+    elliprg : Completely-symmetric elliptic integral of the second kind.
+
+    Notes
+    -----
+    The code implements Carlson's algorithm based on the duplication theorems
+    and series expansion up to the 7th order. [3]_ The algorithm is slightly
+    different from its earlier incarnation as it appears in [1]_, in that the
+    call to `elliprc` (or ``atan``/``atanh``, see [4]_) is no longer needed in
+    the inner loop. Asymptotic approximations are used where arguments differ
+    widely in the order of magnitude. [5]_
+
+    The input values are subject to certain sufficient but not necessary
+    constraints when input arguments are complex. Notably, ``x``, ``y``, and
+    ``z`` must have non-negative real parts, unless two of them are
+    non-negative and complex-conjugates to each other while the other is a real
+    non-negative number. [1]_ If the inputs do not satisfy the sufficient
+    condition described in Ref. [1]_ they are rejected outright with the output
+    set to NaN.
+
+    In the case where one of ``x``, ``y``, and ``z`` is equal to ``p``, the
+    function ``elliprd`` should be preferred because of its less restrictive
+    domain.
+
+    .. versionadded:: 1.8.0
+
+    References
+    ----------
+    .. [1] B. C. Carlson, "Numerical computation of real or complex elliptic
+           integrals," Numer. Algorithm, vol. 10, no. 1, pp. 13-26, 1995.
+           https://arxiv.org/abs/math/9409227
+           https://doi.org/10.1007/BF02198293
+    .. [2] B. C. Carlson, ed., Chapter 19 in "Digital Library of Mathematical
+           Functions," NIST, US Dept. of Commerce.
+           https://dlmf.nist.gov/19.20.iii
+    .. [3] B. C. Carlson, J. FitzSimmons, "Reduction Theorems for Elliptic
+           Integrands with the Square Root of Two Quadratic Factors," J.
+           Comput. Appl. Math., vol. 118, nos. 1-2, pp. 71-85, 2000.
+           https://doi.org/10.1016/S0377-0427(00)00282-X
+    .. [4] F. Johansson, "Numerical Evaluation of Elliptic Functions, Elliptic
+           Integrals and Modular Forms," in J. Blumlein, C. Schneider, P.
+           Paule, eds., "Elliptic Integrals, Elliptic Functions and Modular
+           Forms in Quantum Field Theory," pp. 269-293, 2019 (Cham,
+           Switzerland: Springer Nature Switzerland)
+           https://arxiv.org/abs/1806.06725
+           https://doi.org/10.1007/978-3-030-04480-0
+    .. [5] B. C. Carlson, J. L. Gustafson, "Asymptotic Approximations for
+           Symmetric Elliptic Integrals," SIAM J. Math. Anal., vol. 25, no. 2,
+           pp. 288-303, 1994.
+           https://arxiv.org/abs/math/9310223
+           https://doi.org/10.1137/S0036141092228477
+
+    Examples
+    --------
+    Basic homogeneity property:
+
+    >>> import numpy as np
+    >>> from scipy.special import elliprj
+
+    >>> x = 1.2 + 3.4j
+    >>> y = 5.
+    >>> z = 6.
+    >>> p = 7.
+    >>> scale = 0.3 - 0.4j
+    >>> elliprj(scale*x, scale*y, scale*z, scale*p)
+    (0.10834905565679157+0.19694950747103812j)
+
+    >>> elliprj(x, y, z, p)*np.power(scale, -1.5)
+    (0.10834905565679556+0.19694950747103854j)
+
+    Reduction to simpler elliptic integral:
+
+    >>> elliprj(x, y, z, z)
+    (0.08288462362195129-0.028376809745123258j)
+
+    >>> from scipy.special import elliprd
+    >>> elliprd(x, y, z)
+    (0.08288462362195136-0.028376809745123296j)
+
+    All arguments coincide:
+
+    >>> elliprj(x, x, x, x)
+    (-0.03986825876151896-0.14051741840449586j)
+
+    >>> np.power(x, -1.5)
+    (-0.03986825876151894-0.14051741840449583j)
+
+    """)
+
+add_newdoc("entr",
+    r"""
+    entr(x, out=None)
+
+    Elementwise function for computing entropy.
+
+    .. math:: \text{entr}(x) = \begin{cases} - x \log(x) & x > 0  \\ 0 & x = 0
+              \\ -\infty & \text{otherwise} \end{cases}
+
+    Parameters
+    ----------
+    x : ndarray
+        Input array.
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    res : scalar or ndarray
+        The value of the elementwise entropy function at the given points `x`.
+
+    See Also
+    --------
+    kl_div, rel_entr, scipy.stats.entropy
+
+    Notes
+    -----
+    .. versionadded:: 0.15.0
+
+    This function is concave.
+
+    The origin of this function is in convex programming; see [1]_.
+    Given a probability distribution :math:`p_1, \ldots, p_n`,
+    the definition of entropy in the context of *information theory* is
+
+    .. math::
+
+        \sum_{i = 1}^n \mathrm{entr}(p_i).
+
+    To compute the latter quantity, use `scipy.stats.entropy`.
+
+    References
+    ----------
+    .. [1] Boyd, Stephen and Lieven Vandenberghe. *Convex optimization*.
+           Cambridge University Press, 2004.
+           :doi:`10.1017/CBO9780511804441`
+
+    """)
+
+add_newdoc(
+    "erfinv",
+    """
+    erfinv(y, out=None)
+
+    Inverse of the error function.
+
+    Computes the inverse of the error function.
+
+    In the complex domain, there is no unique complex number w satisfying
+    erf(w)=z. This indicates a true inverse function would be multivalued.
+    When the domain restricts to the real, -1 < x < 1, there is a unique real
+    number satisfying erf(erfinv(x)) = x.
+
+    Parameters
+    ----------
+    y : ndarray
+        Argument at which to evaluate. Domain: [-1, 1]
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    erfinv : scalar or ndarray
+        The inverse of erf of y, element-wise
+
+    See Also
+    --------
+    erf : Error function of a complex argument
+    erfc : Complementary error function, ``1 - erf(x)``
+    erfcinv : Inverse of the complementary error function
+
+    Notes
+    -----
+    This function wraps the ``erf_inv`` routine from the
+    Boost Math C++ library [1]_.
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.special import erfinv, erf
+
+    >>> erfinv(0.5)
+    0.4769362762044699
+
+    >>> y = np.linspace(-1.0, 1.0, num=9)
+    >>> x = erfinv(y)
+    >>> x
+    array([       -inf, -0.81341985, -0.47693628, -0.22531206,  0.        ,
+            0.22531206,  0.47693628,  0.81341985,         inf])
+
+    Verify that ``erf(erfinv(y))`` is ``y``.
+
+    >>> erf(x)
+    array([-1.  , -0.75, -0.5 , -0.25,  0.  ,  0.25,  0.5 ,  0.75,  1.  ])
+
+    Plot the function:
+
+    >>> y = np.linspace(-1, 1, 200)
+    >>> fig, ax = plt.subplots()
+    >>> ax.plot(y, erfinv(y))
+    >>> ax.grid(True)
+    >>> ax.set_xlabel('y')
+    >>> ax.set_title('erfinv(y)')
+    >>> plt.show()
+
+    """)
+
+add_newdoc(
+    "erfcinv",
+    """
+    erfcinv(y, out=None)
+
+    Inverse of the complementary error function.
+
+    Computes the inverse of the complementary error function.
+
+    In the complex domain, there is no unique complex number w satisfying
+    erfc(w)=z. This indicates a true inverse function would be multivalued.
+    When the domain restricts to the real, 0 < x < 2, there is a unique real
+    number satisfying erfc(erfcinv(x)) = x.
+
+    It is related to the inverse of the error function by erfcinv(1-x) = erfinv(x).
+
+    Parameters
+    ----------
+    y : ndarray
+        Argument at which to evaluate. Domain: [0, 2]
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    erfcinv : scalar or ndarray
+        The inverse of erfc of y, element-wise
+
+    See Also
+    --------
+    erf : Error function of a complex argument
+    erfc : Complementary error function, ``1 - erf(x)``
+    erfinv : Inverse of the error function
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.special import erfcinv
+
+    >>> erfcinv(0.5)
+    0.4769362762044699
+
+    >>> y = np.linspace(0.0, 2.0, num=11)
+    >>> erfcinv(y)
+    array([        inf,  0.9061938 ,  0.59511608,  0.37080716,  0.17914345,
+           -0.        , -0.17914345, -0.37080716, -0.59511608, -0.9061938 ,
+                  -inf])
+
+    Plot the function:
+
+    >>> y = np.linspace(0, 2, 200)
+    >>> fig, ax = plt.subplots()
+    >>> ax.plot(y, erfcinv(y))
+    >>> ax.grid(True)
+    >>> ax.set_xlabel('y')
+    >>> ax.set_title('erfcinv(y)')
+    >>> plt.show()
+
+    """)
+
+add_newdoc("eval_jacobi",
+    r"""
+    eval_jacobi(n, alpha, beta, x, out=None)
+
+    Evaluate Jacobi polynomial at a point.
+
+    The Jacobi polynomials can be defined via the Gauss hypergeometric
+    function :math:`{}_2F_1` as
+
+    .. math::
+
+        P_n^{(\alpha, \beta)}(x) = \frac{(\alpha + 1)_n}{\Gamma(n + 1)}
+          {}_2F_1(-n, 1 + \alpha + \beta + n; \alpha + 1; (1 - x)/2)
+
+    where :math:`(\cdot)_n` is the Pochhammer symbol; see `poch`. When
+    :math:`n` is an integer the result is a polynomial of degree
+    :math:`n`. See 22.5.42 in [AS]_ or [DLMF]_ for details.
+
+    Parameters
+    ----------
+    n : array_like
+        Degree of the polynomial. If not an integer the result is
+        determined via the relation to the Gauss hypergeometric
+        function.
+    alpha : array_like
+        Parameter
+    beta : array_like
+        Parameter
+    x : array_like
+        Points at which to evaluate the polynomial
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    P : scalar or ndarray
+        Values of the Jacobi polynomial
+
+    See Also
+    --------
+    roots_jacobi : roots and quadrature weights of Jacobi polynomials
+    jacobi : Jacobi polynomial object
+    hyp2f1 : Gauss hypergeometric function
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+    .. [DLMF] NIST Digital Library of Mathematical Functions,
+        https://dlmf.nist.gov/18.5.E7
+
+    """)
+
+add_newdoc("eval_sh_jacobi",
+    r"""
+    eval_sh_jacobi(n, p, q, x, out=None)
+
+    Evaluate shifted Jacobi polynomial at a point.
+
+    Defined by
+
+    .. math::
+
+        G_n^{(p, q)}(x)
+          = \binom{2n + p - 1}{n}^{-1} P_n^{(p - q, q - 1)}(2x - 1),
+
+    where :math:`P_n^{(\cdot, \cdot)}` is the n-th Jacobi polynomial.
+    See 22.5.2 in [AS]_ (or equivalently [DLMF]_) for details.
+
+    Parameters
+    ----------
+    n : int
+        Degree of the polynomial. If not an integer, the result is
+        determined via the relation to `binom` and `eval_jacobi`.
+    p : float
+        Parameter
+    q : float
+        Parameter
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    G : scalar or ndarray
+        Values of the shifted Jacobi polynomial.
+
+    See Also
+    --------
+    roots_sh_jacobi : roots and quadrature weights of shifted Jacobi
+                      polynomials
+    sh_jacobi : shifted Jacobi polynomial object
+    eval_jacobi : evaluate Jacobi polynomials
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+    .. [DLMF] NIST Digital Library of Mathematical Functions,
+        https://dlmf.nist.gov/18.1.E2
+
+    """)
+
+add_newdoc("eval_gegenbauer",
+    r"""
+    eval_gegenbauer(n, alpha, x, out=None)
+
+    Evaluate Gegenbauer polynomial at a point.
+
+    The Gegenbauer polynomials can be defined via the Gauss
+    hypergeometric function :math:`{}_2F_1` as
+
+    .. math::
+
+        C_n^{(\alpha)}(x) = \frac{(2\alpha)_n}{\Gamma(n + 1)}
+          {}_2F_1(-n, 2\alpha + n; \alpha + 1/2; (1 - x)/2).
+
+    When :math:`n` is an integer the result is a polynomial of degree
+    :math:`n`. See 22.5.46 in [AS]_ (or equivalently [DLMF]_) for details.
+
+    Parameters
+    ----------
+    n : array_like
+        Degree of the polynomial. If not an integer, the result is
+        determined via the relation to the Gauss hypergeometric
+        function.
+    alpha : array_like
+        Parameter
+    x : array_like
+        Points at which to evaluate the Gegenbauer polynomial
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    C : scalar or ndarray
+        Values of the Gegenbauer polynomial
+
+    See Also
+    --------
+    roots_gegenbauer : roots and quadrature weights of Gegenbauer
+                       polynomials
+    gegenbauer : Gegenbauer polynomial object
+    hyp2f1 : Gauss hypergeometric function
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+    .. [DLMF] NIST Digital Library of Mathematical Functions,
+        https://dlmf.nist.gov/18.5.E9
+
+    """)
+
+add_newdoc("eval_chebyt",
+    r"""
+    eval_chebyt(n, x, out=None)
+
+    Evaluate Chebyshev polynomial of the first kind at a point.
+
+    The Chebyshev polynomials of the first kind can be defined via the
+    Gauss hypergeometric function :math:`{}_2F_1` as
+
+    .. math::
+
+        T_n(x) = {}_2F_1(n, -n; 1/2; (1 - x)/2).
+
+    When :math:`n` is an integer the result is a polynomial of degree
+    :math:`n`. See 22.5.47 in [AS]_ (or equivalently [DLMF]_) for details.
+
+    Parameters
+    ----------
+    n : array_like
+        Degree of the polynomial. If not an integer, the result is
+        determined via the relation to the Gauss hypergeometric
+        function.
+    x : array_like
+        Points at which to evaluate the Chebyshev polynomial
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    T : scalar or ndarray
+        Values of the Chebyshev polynomial
+
+    See Also
+    --------
+    roots_chebyt : roots and quadrature weights of Chebyshev
+                   polynomials of the first kind
+    chebyt : Chebyshev polynomial object
+    eval_chebyu : evaluate Chebyshev polynomials of the second kind
+    hyp2f1 : Gauss hypergeometric function
+    numpy.polynomial.chebyshev.Chebyshev : Chebyshev series
+
+    Notes
+    -----
+    This routine is numerically stable for `x` in ``[-1, 1]`` at least
+    up to order ``10000``.
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+    .. [DLMF] NIST Digital Library of Mathematical Functions,
+        https://dlmf.nist.gov/18.5.E11_2
+
+    """)
+
+add_newdoc("eval_chebyu",
+    r"""
+    eval_chebyu(n, x, out=None)
+
+    Evaluate Chebyshev polynomial of the second kind at a point.
+
+    The Chebyshev polynomials of the second kind can be defined via
+    the Gauss hypergeometric function :math:`{}_2F_1` as
+
+    .. math::
+
+        U_n(x) = (n + 1) {}_2F_1(-n, n + 2; 3/2; (1 - x)/2).
+
+    When :math:`n` is an integer the result is a polynomial of degree
+    :math:`n`. See 22.5.48 in [AS]_ (or equivalently [DLMF]_) for details.
+
+    Parameters
+    ----------
+    n : array_like
+        Degree of the polynomial. If not an integer, the result is
+        determined via the relation to the Gauss hypergeometric
+        function.
+    x : array_like
+        Points at which to evaluate the Chebyshev polynomial
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    U : scalar or ndarray
+        Values of the Chebyshev polynomial
+
+    See Also
+    --------
+    roots_chebyu : roots and quadrature weights of Chebyshev
+                   polynomials of the second kind
+    chebyu : Chebyshev polynomial object
+    eval_chebyt : evaluate Chebyshev polynomials of the first kind
+    hyp2f1 : Gauss hypergeometric function
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+    .. [DLMF] NIST Digital Library of Mathematical Functions,
+        https://dlmf.nist.gov/18.5.E11_4
+
+    """)
+
+add_newdoc("eval_chebys",
+    r"""
+    eval_chebys(n, x, out=None)
+
+    Evaluate Chebyshev polynomial of the second kind on [-2, 2] at a
+    point.
+
+    These polynomials are defined as
+
+    .. math::
+
+        S_n(x) = U_n(x/2)
+
+    where :math:`U_n` is a Chebyshev polynomial of the second kind.
+    See 22.5.13 in [AS]_ (or equivalently [DLMF]_) for details.
+
+    Parameters
+    ----------
+    n : array_like
+        Degree of the polynomial. If not an integer, the result is
+        determined via the relation to `eval_chebyu`.
+    x : array_like
+        Points at which to evaluate the Chebyshev polynomial
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    S : scalar or ndarray
+        Values of the Chebyshev polynomial
+
+    See Also
+    --------
+    roots_chebys : roots and quadrature weights of Chebyshev
+                   polynomials of the second kind on [-2, 2]
+    chebys : Chebyshev polynomial object
+    eval_chebyu : evaluate Chebyshev polynomials of the second kind
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+    .. [DLMF] NIST Digital Library of Mathematical Functions,
+        https://dlmf.nist.gov/18.1.E3
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import scipy.special as sc
+
+    They are a scaled version of the Chebyshev polynomials of the
+    second kind.
+
+    >>> x = np.linspace(-2, 2, 6)
+    >>> sc.eval_chebys(3, x)
+    array([-4.   ,  0.672,  0.736, -0.736, -0.672,  4.   ])
+    >>> sc.eval_chebyu(3, x / 2)
+    array([-4.   ,  0.672,  0.736, -0.736, -0.672,  4.   ])
+
+    """)
+
+add_newdoc("eval_chebyc",
+    r"""
+    eval_chebyc(n, x, out=None)
+
+    Evaluate Chebyshev polynomial of the first kind on [-2, 2] at a
+    point.
+
+    These polynomials are defined as
+
+    .. math::
+
+        C_n(x) = 2 T_n(x/2)
+
+    where :math:`T_n` is a Chebyshev polynomial of the first kind. See
+    22.5.11 in [AS]_ (or equivalently [DLMF]_) for details.
+
+    Parameters
+    ----------
+    n : array_like
+        Degree of the polynomial. If not an integer, the result is
+        determined via the relation to `eval_chebyt`.
+    x : array_like
+        Points at which to evaluate the Chebyshev polynomial
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    C : scalar or ndarray
+        Values of the Chebyshev polynomial
+
+    See Also
+    --------
+    roots_chebyc : roots and quadrature weights of Chebyshev
+                   polynomials of the first kind on [-2, 2]
+    chebyc : Chebyshev polynomial object
+    numpy.polynomial.chebyshev.Chebyshev : Chebyshev series
+    eval_chebyt : evaluate Chebyshev polynomials of the first kind
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+    .. [DLMF] NIST Digital Library of Mathematical Functions,
+        https://dlmf.nist.gov/18.1.E3
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import scipy.special as sc
+
+    They are a scaled version of the Chebyshev polynomials of the
+    first kind.
+
+    >>> x = np.linspace(-2, 2, 6)
+    >>> sc.eval_chebyc(3, x)
+    array([-2.   ,  1.872,  1.136, -1.136, -1.872,  2.   ])
+    >>> 2 * sc.eval_chebyt(3, x / 2)
+    array([-2.   ,  1.872,  1.136, -1.136, -1.872,  2.   ])
+
+    """)
+
+add_newdoc("eval_sh_chebyt",
+    r"""
+    eval_sh_chebyt(n, x, out=None)
+
+    Evaluate shifted Chebyshev polynomial of the first kind at a
+    point.
+
+    These polynomials are defined as
+
+    .. math::
+
+        T_n^*(x) = T_n(2x - 1)
+
+    where :math:`T_n` is a Chebyshev polynomial of the first kind. See
+    22.5.14 in [AS]_ (or equivalently [DLMF]_) for details.
+
+    Parameters
+    ----------
+    n : array_like
+        Degree of the polynomial. If not an integer, the result is
+        determined via the relation to `eval_chebyt`.
+    x : array_like
+        Points at which to evaluate the shifted Chebyshev polynomial
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    T : scalar or ndarray
+        Values of the shifted Chebyshev polynomial
+
+    See Also
+    --------
+    roots_sh_chebyt : roots and quadrature weights of shifted
+                      Chebyshev polynomials of the first kind
+    sh_chebyt : shifted Chebyshev polynomial object
+    eval_chebyt : evaluate Chebyshev polynomials of the first kind
+    numpy.polynomial.chebyshev.Chebyshev : Chebyshev series
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+    .. [DLMF] NIST Digital Library of Mathematical Functions,
+        https://dlmf.nist.gov/18.7.E7
+
+    """)
+
+add_newdoc("eval_sh_chebyu",
+    r"""
+    eval_sh_chebyu(n, x, out=None)
+
+    Evaluate shifted Chebyshev polynomial of the second kind at a
+    point.
+
+    These polynomials are defined as
+
+    .. math::
+
+        U_n^*(x) = U_n(2x - 1)
+
+    where :math:`U_n` is a Chebyshev polynomial of the second kind. See
+    22.5.15 in [AS]_ (or equivalently [DLMF]_) for details.
+
+    Parameters
+    ----------
+    n : array_like
+        Degree of the polynomial. If not an integer, the result is
+        determined via the relation to `eval_chebyu`.
+    x : array_like
+        Points at which to evaluate the shifted Chebyshev polynomial
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    U : scalar or ndarray
+        Values of the shifted Chebyshev polynomial
+
+    See Also
+    --------
+    roots_sh_chebyu : roots and quadrature weights of shifted
+                      Chebyshev polynomials of the second kind
+    sh_chebyu : shifted Chebyshev polynomial object
+    eval_chebyu : evaluate Chebyshev polynomials of the second kind
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+    .. [DLMF] NIST Digital Library of Mathematical Functions,
+        https://dlmf.nist.gov/18.7.E8
+
+    """)
+
+add_newdoc("eval_legendre",
+    r"""
+    eval_legendre(n, x, out=None)
+
+    Evaluate Legendre polynomial at a point.
+
+    The Legendre polynomials can be defined via the Gauss
+    hypergeometric function :math:`{}_2F_1` as
+
+    .. math::
+
+        P_n(x) = {}_2F_1(-n, n + 1; 1; (1 - x)/2).
+
+    When :math:`n` is an integer the result is a polynomial of degree
+    :math:`n`. See 22.5.49 in [AS]_ (or equivalently [DLMF]_) for details.
+
+    Parameters
+    ----------
+    n : array_like
+        Degree of the polynomial. If not an integer, the result is
+        determined via the relation to the Gauss hypergeometric
+        function.
+    x : array_like
+        Points at which to evaluate the Legendre polynomial
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    P : scalar or ndarray
+        Values of the Legendre polynomial
+
+    See Also
+    --------
+    roots_legendre : roots and quadrature weights of Legendre
+                     polynomials
+    legendre : Legendre polynomial object
+    hyp2f1 : Gauss hypergeometric function
+    numpy.polynomial.legendre.Legendre : Legendre series
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+    .. [DLMF] NIST Digital Library of Mathematical Functions,
+        https://dlmf.nist.gov/15.9.E7
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.special import eval_legendre
+
+    Evaluate the zero-order Legendre polynomial at x = 0
+
+    >>> eval_legendre(0, 0)
+    1.0
+
+    Evaluate the first-order Legendre polynomial between -1 and 1
+
+    >>> X = np.linspace(-1, 1, 5)  # Domain of Legendre polynomials
+    >>> eval_legendre(1, X)
+    array([-1. , -0.5,  0. ,  0.5,  1. ])
+
+    Evaluate Legendre polynomials of order 0 through 4 at x = 0
+
+    >>> N = range(0, 5)
+    >>> eval_legendre(N, 0)
+    array([ 1.   ,  0.   , -0.5  ,  0.   ,  0.375])
+
+    Plot Legendre polynomials of order 0 through 4
+
+    >>> X = np.linspace(-1, 1)
+
+    >>> import matplotlib.pyplot as plt
+    >>> for n in range(0, 5):
+    ...     y = eval_legendre(n, X)
+    ...     plt.plot(X, y, label=r'$P_{}(x)$'.format(n))
+
+    >>> plt.title("Legendre Polynomials")
+    >>> plt.xlabel("x")
+    >>> plt.ylabel(r'$P_n(x)$')
+    >>> plt.legend(loc='lower right')
+    >>> plt.show()
+
+    """)
+
+add_newdoc("eval_sh_legendre",
+    r"""
+    eval_sh_legendre(n, x, out=None)
+
+    Evaluate shifted Legendre polynomial at a point.
+
+    These polynomials are defined as
+
+    .. math::
+
+        P_n^*(x) = P_n(2x - 1)
+
+    where :math:`P_n` is a Legendre polynomial. See 22.2.11 in [AS]_
+    or [DLMF]_ for details.
+
+    Parameters
+    ----------
+    n : array_like
+        Degree of the polynomial. If not an integer, the value is
+        determined via the relation to `eval_legendre`.
+    x : array_like
+        Points at which to evaluate the shifted Legendre polynomial
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    P : scalar or ndarray
+        Values of the shifted Legendre polynomial
+
+    See Also
+    --------
+    roots_sh_legendre : roots and quadrature weights of shifted
+                        Legendre polynomials
+    sh_legendre : shifted Legendre polynomial object
+    eval_legendre : evaluate Legendre polynomials
+    numpy.polynomial.legendre.Legendre : Legendre series
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+    .. [DLMF] NIST Digital Library of Mathematical Functions,
+        https://dlmf.nist.gov/18.7.E10
+
+    """)
+
+add_newdoc("eval_genlaguerre",
+    r"""
+    eval_genlaguerre(n, alpha, x, out=None)
+
+    Evaluate generalized Laguerre polynomial at a point.
+
+    The generalized Laguerre polynomials can be defined via the
+    confluent hypergeometric function :math:`{}_1F_1` as
+
+    .. math::
+
+        L_n^{(\alpha)}(x) = \binom{n + \alpha}{n}
+          {}_1F_1(-n, \alpha + 1, x).
+
+    When :math:`n` is an integer the result is a polynomial of degree
+    :math:`n`. See 22.5.54 in [AS]_ or [DLMF]_ for details. The Laguerre
+    polynomials are the special case where :math:`\alpha = 0`.
+
+    Parameters
+    ----------
+    n : array_like
+        Degree of the polynomial. If not an integer, the result is
+        determined via the relation to the confluent hypergeometric
+        function.
+    alpha : array_like
+        Parameter; must have ``alpha > -1``
+    x : array_like
+        Points at which to evaluate the generalized Laguerre
+        polynomial
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    L : scalar or ndarray
+        Values of the generalized Laguerre polynomial
+
+    See Also
+    --------
+    roots_genlaguerre : roots and quadrature weights of generalized
+                        Laguerre polynomials
+    genlaguerre : generalized Laguerre polynomial object
+    hyp1f1 : confluent hypergeometric function
+    eval_laguerre : evaluate Laguerre polynomials
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+    .. [DLMF] NIST Digital Library of Mathematical Functions,
+        https://dlmf.nist.gov/18.5.E12
+
+    """)
+
+add_newdoc("eval_laguerre",
+    r"""
+    eval_laguerre(n, x, out=None)
+
+    Evaluate Laguerre polynomial at a point.
+
+    The Laguerre polynomials can be defined via the confluent
+    hypergeometric function :math:`{}_1F_1` as
+
+    .. math::
+
+        L_n(x) = {}_1F_1(-n, 1, x).
+
+    See 22.5.16 and 22.5.54 in [AS]_ (or equivalently [DLMF1]_ and [DLMF2]_)
+    for details. When :math:`n` is an integer the result is a polynomial
+    of degree :math:`n`.
+
+    Parameters
+    ----------
+    n : array_like
+        Degree of the polynomial. If not an integer the result is
+        determined via the relation to the confluent hypergeometric
+        function.
+    x : array_like
+        Points at which to evaluate the Laguerre polynomial
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    L : scalar or ndarray
+        Values of the Laguerre polynomial
+
+    See Also
+    --------
+    roots_laguerre : roots and quadrature weights of Laguerre
+                     polynomials
+    laguerre : Laguerre polynomial object
+    numpy.polynomial.laguerre.Laguerre : Laguerre series
+    eval_genlaguerre : evaluate generalized Laguerre polynomials
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+    .. [DLMF1] NIST Digital Library of Mathematical Functions,
+        https://dlmf.nist.gov/18.1#I1.ix7.p1
+    .. [DLMF2] NIST Digital Library of Mathematical Functions,
+        https://dlmf.nist.gov/18.5.E12
+
+     """)
+
+add_newdoc("eval_hermite",
+    r"""
+    eval_hermite(n, x, out=None)
+
+    Evaluate physicist's Hermite polynomial at a point.
+
+    Defined by
+
+    .. math::
+
+        H_n(x) = (-1)^n e^{x^2} \frac{d^n}{dx^n} e^{-x^2};
+
+    :math:`H_n` is a polynomial of degree :math:`n`. See 22.11.7 in
+    [AS]_ or [DLMF]_ for details.
+
+    Parameters
+    ----------
+    n : array_like
+        Degree of the polynomial
+    x : array_like
+        Points at which to evaluate the Hermite polynomial
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    H : scalar or ndarray
+        Values of the Hermite polynomial
+
+    See Also
+    --------
+    roots_hermite : roots and quadrature weights of physicist's
+                    Hermite polynomials
+    hermite : physicist's Hermite polynomial object
+    numpy.polynomial.hermite.Hermite : Physicist's Hermite series
+    eval_hermitenorm : evaluate Probabilist's Hermite polynomials
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+    .. [DLMF] NIST Digital Library of Mathematical Functions,
+        https://dlmf.nist.gov/18.5.T1
+
+    """)
+
+add_newdoc("eval_hermitenorm",
+    r"""
+    eval_hermitenorm(n, x, out=None)
+
+    Evaluate probabilist's (normalized) Hermite polynomial at a
+    point.
+
+    Defined by
+
+    .. math::
+
+        He_n(x) = (-1)^n e^{x^2/2} \frac{d^n}{dx^n} e^{-x^2/2};
+
+    :math:`He_n` is a polynomial of degree :math:`n`. See 22.11.8 in
+    [AS]_ or [DLMF]_ for details.
+
+    Parameters
+    ----------
+    n : array_like
+        Degree of the polynomial
+    x : array_like
+        Points at which to evaluate the Hermite polynomial
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    He : scalar or ndarray
+        Values of the Hermite polynomial
+
+    See Also
+    --------
+    roots_hermitenorm : roots and quadrature weights of probabilist's
+                        Hermite polynomials
+    hermitenorm : probabilist's Hermite polynomial object
+    numpy.polynomial.hermite_e.HermiteE : Probabilist's Hermite series
+    eval_hermite : evaluate physicist's Hermite polynomials
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+    .. [DLMF] NIST Digital Library of Mathematical Functions,
+        https://dlmf.nist.gov/18.5.T1
+
+    """)
+
+add_newdoc("expn",
+    r"""
+    expn(n, x, out=None)
+
+    Generalized exponential integral En.
+
+    For integer :math:`n \geq 0` and real :math:`x \geq 0` the
+    generalized exponential integral is defined as [DLMF]_
+
+    .. math::
+
+        E_n(x) = x^{n - 1} \int_x^\infty \frac{e^{-t}}{t^n} dt.
+
+    Parameters
+    ----------
+    n : array_like
+        Non-negative integers
+    x : array_like
+        Real argument
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    scalar or ndarray
+        Values of the generalized exponential integral
+
+    See Also
+    --------
+    exp1 : special case of :math:`E_n` for :math:`n = 1`
+    expi : related to :math:`E_n` when :math:`n = 1`
+
+    References
+    ----------
+    .. [DLMF] Digital Library of Mathematical Functions, 8.19.2
+              https://dlmf.nist.gov/8.19#E2
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import scipy.special as sc
+
+    Its domain is nonnegative n and x.
+
+    >>> sc.expn(-1, 1.0), sc.expn(1, -1.0)
+    (nan, nan)
+
+    It has a pole at ``x = 0`` for ``n = 0, 1``; for larger ``n`` it
+    is equal to ``1 / (n - 1)``.
+
+    >>> sc.expn([0, 1, 2, 3, 4], 0)
+    array([       inf,        inf, 1.        , 0.5       , 0.33333333])
+
+    For n equal to 0 it reduces to ``exp(-x) / x``.
+
+    >>> x = np.array([1, 2, 3, 4])
+    >>> sc.expn(0, x)
+    array([0.36787944, 0.06766764, 0.01659569, 0.00457891])
+    >>> np.exp(-x) / x
+    array([0.36787944, 0.06766764, 0.01659569, 0.00457891])
+
+    For n equal to 1 it reduces to `exp1`.
+
+    >>> sc.expn(1, x)
+    array([0.21938393, 0.04890051, 0.01304838, 0.00377935])
+    >>> sc.exp1(x)
+    array([0.21938393, 0.04890051, 0.01304838, 0.00377935])
+
+    """)
+
+add_newdoc("fdtr",
+    r"""
+    fdtr(dfn, dfd, x, out=None)
+
+    F cumulative distribution function.
+
+    Returns the value of the cumulative distribution function of the
+    F-distribution, also known as Snedecor's F-distribution or the
+    Fisher-Snedecor distribution.
+
+    The F-distribution with parameters :math:`d_n` and :math:`d_d` is the
+    distribution of the random variable,
+
+    .. math::
+        X = \frac{U_n/d_n}{U_d/d_d},
+
+    where :math:`U_n` and :math:`U_d` are random variables distributed
+    :math:`\chi^2`, with :math:`d_n` and :math:`d_d` degrees of freedom,
+    respectively.
+
+    Parameters
+    ----------
+    dfn : array_like
+        First parameter (positive float).
+    dfd : array_like
+        Second parameter (positive float).
+    x : array_like
+        Argument (nonnegative float).
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    y : scalar or ndarray
+        The CDF of the F-distribution with parameters `dfn` and `dfd` at `x`.
+
+    See Also
+    --------
+    fdtrc : F distribution survival function
+    fdtri : F distribution inverse cumulative distribution function
+    scipy.stats.f : F distribution
+
+    Notes
+    -----
+    The regularized incomplete beta function is used, according to the
+    formula,
+
+    .. math::
+        F(d_n, d_d; x) = I_{xd_n/(d_d + xd_n)}(d_n/2, d_d/2).
+
+    Wrapper for a routine from the Boost Math C++ library [1]_. The
+    F distribution is also available as `scipy.stats.f`. Calling
+    `fdtr` directly can improve performance compared to the ``cdf``
+    method of `scipy.stats.f` (see last example below).
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+
+    Examples
+    --------
+    Calculate the function for ``dfn=1`` and ``dfd=2`` at ``x=1``.
+
+    >>> import numpy as np
+    >>> from scipy.special import fdtr
+    >>> fdtr(1, 2, 1)
+    0.5773502691896258
+
+    Calculate the function at several points by providing a NumPy array for
+    `x`.
+
+    >>> x = np.array([0.5, 2., 3.])
+    >>> fdtr(1, 2, x)
+    array([0.4472136 , 0.70710678, 0.77459667])
+
+    Plot the function for several parameter sets.
+
+    >>> import matplotlib.pyplot as plt
+    >>> dfn_parameters = [1, 5, 10, 50]
+    >>> dfd_parameters = [1, 1, 2, 3]
+    >>> linestyles = ['solid', 'dashed', 'dotted', 'dashdot']
+    >>> parameters_list = list(zip(dfn_parameters, dfd_parameters,
+    ...                            linestyles))
+    >>> x = np.linspace(0, 30, 1000)
+    >>> fig, ax = plt.subplots()
+    >>> for parameter_set in parameters_list:
+    ...     dfn, dfd, style = parameter_set
+    ...     fdtr_vals = fdtr(dfn, dfd, x)
+    ...     ax.plot(x, fdtr_vals, label=rf"$d_n={dfn},\, d_d={dfd}$",
+    ...             ls=style)
+    >>> ax.legend()
+    >>> ax.set_xlabel("$x$")
+    >>> ax.set_title("F distribution cumulative distribution function")
+    >>> plt.show()
+
+    The F distribution is also available as `scipy.stats.f`. Using `fdtr`
+    directly can be much faster than calling the ``cdf`` method of
+    `scipy.stats.f`, especially for small arrays or individual values.
+    To get the same results one must use the following parametrization:
+    ``stats.f(dfn, dfd).cdf(x)=fdtr(dfn, dfd, x)``.
+
+    >>> from scipy.stats import f
+    >>> dfn, dfd = 1, 2
+    >>> x = 1
+    >>> fdtr_res = fdtr(dfn, dfd, x)  # this will often be faster than below
+    >>> f_dist_res = f(dfn, dfd).cdf(x)
+    >>> fdtr_res == f_dist_res  # test that results are equal
+    True
+    """)
+
+add_newdoc("fdtrc",
+    r"""
+    fdtrc(dfn, dfd, x, out=None)
+
+    F survival function.
+
+    Returns the complemented F-distribution function (the integral of the
+    density from `x` to infinity).
+
+    Parameters
+    ----------
+    dfn : array_like
+        First parameter (positive float).
+    dfd : array_like
+        Second parameter (positive float).
+    x : array_like
+        Argument (nonnegative float).
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    y : scalar or ndarray
+        The complemented F-distribution function with parameters `dfn` and
+        `dfd` at `x`.
+
+    See Also
+    --------
+    fdtr : F distribution cumulative distribution function
+    fdtri : F distribution inverse cumulative distribution function
+    scipy.stats.f : F distribution
+
+    Notes
+    -----
+    The regularized incomplete beta function is used, according to the
+    formula,
+
+    .. math::
+        F(d_n, d_d; x) = I_{d_d/(d_d + xd_n)}(d_d/2, d_n/2).
+
+    Wrapper for a routine from the Boost Math C++ library [1]_. The
+    F distribution is also available as `scipy.stats.f`. Calling
+    `fdtrc` directly can improve performance compared to the ``sf``
+    method of `scipy.stats.f` (see last example below).
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    Examples
+    --------
+    Calculate the function for ``dfn=1`` and ``dfd=2`` at ``x=1``.
+
+    >>> import numpy as np
+    >>> from scipy.special import fdtrc
+    >>> fdtrc(1, 2, 1)
+    0.42264973081037427
+
+    Calculate the function at several points by providing a NumPy array for
+    `x`.
+
+    >>> x = np.array([0.5, 2., 3.])
+    >>> fdtrc(1, 2, x)
+    array([0.5527864 , 0.29289322, 0.22540333])
+
+    Plot the function for several parameter sets.
+
+    >>> import matplotlib.pyplot as plt
+    >>> dfn_parameters = [1, 5, 10, 50]
+    >>> dfd_parameters = [1, 1, 2, 3]
+    >>> linestyles = ['solid', 'dashed', 'dotted', 'dashdot']
+    >>> parameters_list = list(zip(dfn_parameters, dfd_parameters,
+    ...                            linestyles))
+    >>> x = np.linspace(0, 30, 1000)
+    >>> fig, ax = plt.subplots()
+    >>> for parameter_set in parameters_list:
+    ...     dfn, dfd, style = parameter_set
+    ...     fdtrc_vals = fdtrc(dfn, dfd, x)
+    ...     ax.plot(x, fdtrc_vals, label=rf"$d_n={dfn},\, d_d={dfd}$",
+    ...             ls=style)
+    >>> ax.legend()
+    >>> ax.set_xlabel("$x$")
+    >>> ax.set_title("F distribution survival function")
+    >>> plt.show()
+
+    The F distribution is also available as `scipy.stats.f`. Using `fdtrc`
+    directly can be much faster than calling the ``sf`` method of
+    `scipy.stats.f`, especially for small arrays or individual values.
+    To get the same results one must use the following parametrization:
+    ``stats.f(dfn, dfd).sf(x)=fdtrc(dfn, dfd, x)``.
+
+    >>> from scipy.stats import f
+    >>> dfn, dfd = 1, 2
+    >>> x = 1
+    >>> fdtrc_res = fdtrc(dfn, dfd, x)  # this will often be faster than below
+    >>> f_dist_res = f(dfn, dfd).sf(x)
+    >>> f_dist_res == fdtrc_res  # test that results are equal
+    True
+    """)
+
+add_newdoc("fdtri",
+    r"""
+    fdtri(dfn, dfd, p, out=None)
+
+    The `p`-th quantile of the F-distribution.
+
+    This function is the inverse of the F-distribution CDF, `fdtr`, returning
+    the `x` such that `fdtr(dfn, dfd, x) = p`.
+
+    Parameters
+    ----------
+    dfn : array_like
+        First parameter (positive float).
+    dfd : array_like
+        Second parameter (positive float).
+    p : array_like
+        Cumulative probability, in [0, 1].
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    x : scalar or ndarray
+        The quantile corresponding to `p`.
+
+    See Also
+    --------
+    fdtr : F distribution cumulative distribution function
+    fdtrc : F distribution survival function
+    scipy.stats.f : F distribution
+
+    Notes
+    -----
+    Wrapper for a routine from the Boost Math C++ library [1]_. The
+    F distribution is also available as `scipy.stats.f`. Calling
+    `fdtri` directly can improve performance compared to the ``ppf``
+    method of `scipy.stats.f` (see last example below).
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    Examples
+    --------
+    `fdtri` represents the inverse of the F distribution CDF which is
+    available as `fdtr`. Here, we calculate the CDF for ``df1=1``, ``df2=2``
+    at ``x=3``. `fdtri` then returns ``3`` given the same values for `df1`,
+    `df2` and the computed CDF value.
+
+    >>> import numpy as np
+    >>> from scipy.special import fdtri, fdtr
+    >>> df1, df2 = 1, 2
+    >>> x = 3
+    >>> cdf_value =  fdtr(df1, df2, x)
+    >>> fdtri(df1, df2, cdf_value)
+    3.000000000000006
+
+    Calculate the function at several points by providing a NumPy array for
+    `x`.
+
+    >>> x = np.array([0.1, 0.4, 0.7])
+    >>> fdtri(1, 2, x)
+    array([0.02020202, 0.38095238, 1.92156863])
+
+    Plot the function for several parameter sets.
+
+    >>> import matplotlib.pyplot as plt
+    >>> dfn_parameters = [50, 10, 1, 50]
+    >>> dfd_parameters = [0.5, 1, 1, 5]
+    >>> linestyles = ['solid', 'dashed', 'dotted', 'dashdot']
+    >>> parameters_list = list(zip(dfn_parameters, dfd_parameters,
+    ...                            linestyles))
+    >>> x = np.linspace(0, 1, 1000)
+    >>> fig, ax = plt.subplots()
+    >>> for parameter_set in parameters_list:
+    ...     dfn, dfd, style = parameter_set
+    ...     fdtri_vals = fdtri(dfn, dfd, x)
+    ...     ax.plot(x, fdtri_vals, label=rf"$d_n={dfn},\, d_d={dfd}$",
+    ...             ls=style)
+    >>> ax.legend()
+    >>> ax.set_xlabel("$x$")
+    >>> title = "F distribution inverse cumulative distribution function"
+    >>> ax.set_title(title)
+    >>> ax.set_ylim(0, 30)
+    >>> plt.show()
+
+    The F distribution is also available as `scipy.stats.f`. Using `fdtri`
+    directly can be much faster than calling the ``ppf`` method of
+    `scipy.stats.f`, especially for small arrays or individual values.
+    To get the same results one must use the following parametrization:
+    ``stats.f(dfn, dfd).ppf(x)=fdtri(dfn, dfd, x)``.
+
+    >>> from scipy.stats import f
+    >>> dfn, dfd = 1, 2
+    >>> x = 0.7
+    >>> fdtri_res = fdtri(dfn, dfd, x)  # this will often be faster than below
+    >>> f_dist_res = f(dfn, dfd).ppf(x)
+    >>> f_dist_res == fdtri_res  # test that results are equal
+    True
+    """)
+
+add_newdoc("fdtridfd",
+    """
+    fdtridfd(dfn, p, x, out=None)
+
+    Inverse to `fdtr` vs dfd
+
+    Finds the F density argument dfd such that ``fdtr(dfn, dfd, x) == p``.
+
+    Parameters
+    ----------
+    dfn : array_like
+        First parameter (positive float).
+    p : array_like
+        Cumulative probability, in [0, 1].
+    x : array_like
+        Argument (nonnegative float).
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    dfd : scalar or ndarray
+        `dfd` such that ``fdtr(dfn, dfd, x) == p``.
+
+    See Also
+    --------
+    fdtr : F distribution cumulative distribution function
+    fdtrc : F distribution survival function
+    fdtri : F distribution quantile function
+    scipy.stats.f : F distribution
+
+    Examples
+    --------
+    Compute the F distribution cumulative distribution function for one
+    parameter set.
+
+    >>> from scipy.special import fdtridfd, fdtr
+    >>> dfn, dfd, x = 10, 5, 2
+    >>> cdf_value = fdtr(dfn, dfd, x)
+    >>> cdf_value
+    0.7700248806501017
+
+    Verify that `fdtridfd` recovers the original value for `dfd`:
+
+    >>> fdtridfd(dfn, cdf_value, x)
+    5.0
+    """)
+
+'''
+commented out as fdtridfn seems to have bugs and is not in functions.json
+see: https://github.com/scipy/scipy/pull/15622#discussion_r811440983
+
+add_newdoc(
+    "fdtridfn",
+    """
+    fdtridfn(p, dfd, x, out=None)
+
+    Inverse to `fdtr` vs dfn
+
+    finds the F density argument dfn such that ``fdtr(dfn, dfd, x) == p``.
+
+
+    Parameters
+    ----------
+    p : array_like
+        Cumulative probability, in [0, 1].
+    dfd : array_like
+        Second parameter (positive float).
+    x : array_like
+        Argument (nonnegative float).
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    dfn : scalar or ndarray
+        `dfn` such that ``fdtr(dfn, dfd, x) == p``.
+
+    See Also
+    --------
+    fdtr, fdtrc, fdtri, fdtridfd
+
+
+    """)
+'''
+
+add_newdoc("gdtr",
+    r"""
+    gdtr(a, b, x, out=None)
+
+    Gamma distribution cumulative distribution function.
+
+    Returns the integral from zero to `x` of the gamma probability density
+    function,
+
+    .. math::
+
+        F = \int_0^x \frac{a^b}{\Gamma(b)} t^{b-1} e^{-at}\,dt,
+
+    where :math:`\Gamma` is the gamma function.
+
+    Parameters
+    ----------
+    a : array_like
+        The rate parameter of the gamma distribution, sometimes denoted
+        :math:`\beta` (float).  It is also the reciprocal of the scale
+        parameter :math:`\theta`.
+    b : array_like
+        The shape parameter of the gamma distribution, sometimes denoted
+        :math:`\alpha` (float).
+    x : array_like
+        The quantile (upper limit of integration; float).
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    F : scalar or ndarray
+        The CDF of the gamma distribution with parameters `a` and `b`
+        evaluated at `x`.
+
+    See Also
+    --------
+    gdtrc : 1 - CDF of the gamma distribution.
+    scipy.stats.gamma: Gamma distribution
+
+    Notes
+    -----
+    The evaluation is carried out using the relation to the incomplete gamma
+    integral (regularized gamma function).
+
+    Wrapper for the Cephes [1]_ routine `gdtr`. Calling `gdtr` directly can
+    improve performance compared to the ``cdf`` method of `scipy.stats.gamma`
+    (see last example below).
+
+    References
+    ----------
+    .. [1] Cephes Mathematical Functions Library,
+           http://www.netlib.org/cephes/
+
+    Examples
+    --------
+    Compute the function for ``a=1``, ``b=2`` at ``x=5``.
+
+    >>> import numpy as np
+    >>> from scipy.special import gdtr
+    >>> import matplotlib.pyplot as plt
+    >>> gdtr(1., 2., 5.)
+    0.9595723180054873
+
+    Compute the function for ``a=1`` and ``b=1`` at several points by
+    providing a NumPy array for `x`.
+
+    >>> xvalues = np.array([1., 2., 3., 4])
+    >>> gdtr(1., 1., xvalues)
+    array([0.63212056, 0.86466472, 0.95021293, 0.98168436])
+
+    `gdtr` can evaluate different parameter sets by providing arrays with
+    broadcasting compatible shapes for `a`, `b` and `x`. Here we compute the
+    function for three different `a` at four positions `x` and ``b=3``,
+    resulting in a 3x4 array.
+
+    >>> a = np.array([[0.5], [1.5], [2.5]])
+    >>> x = np.array([1., 2., 3., 4])
+    >>> a.shape, x.shape
+    ((3, 1), (4,))
+
+    >>> gdtr(a, 3., x)
+    array([[0.01438768, 0.0803014 , 0.19115317, 0.32332358],
+           [0.19115317, 0.57680992, 0.82642193, 0.9380312 ],
+           [0.45618688, 0.87534798, 0.97974328, 0.9972306 ]])
+
+    Plot the function for four different parameter sets.
+
+    >>> a_parameters = [0.3, 1, 2, 6]
+    >>> b_parameters = [2, 10, 15, 20]
+    >>> linestyles = ['solid', 'dashed', 'dotted', 'dashdot']
+    >>> parameters_list = list(zip(a_parameters, b_parameters, linestyles))
+    >>> x = np.linspace(0, 30, 1000)
+    >>> fig, ax = plt.subplots()
+    >>> for parameter_set in parameters_list:
+    ...     a, b, style = parameter_set
+    ...     gdtr_vals = gdtr(a, b, x)
+    ...     ax.plot(x, gdtr_vals, label=fr"$a= {a},\, b={b}$", ls=style)
+    >>> ax.legend()
+    >>> ax.set_xlabel("$x$")
+    >>> ax.set_title("Gamma distribution cumulative distribution function")
+    >>> plt.show()
+
+    The gamma distribution is also available as `scipy.stats.gamma`. Using
+    `gdtr` directly can be much faster than calling the ``cdf`` method of
+    `scipy.stats.gamma`, especially for small arrays or individual values.
+    To get the same results one must use the following parametrization:
+    ``stats.gamma(b, scale=1/a).cdf(x)=gdtr(a, b, x)``.
+
+    >>> from scipy.stats import gamma
+    >>> a = 2.
+    >>> b = 3
+    >>> x = 1.
+    >>> gdtr_result = gdtr(a, b, x)  # this will often be faster than below
+    >>> gamma_dist_result = gamma(b, scale=1/a).cdf(x)
+    >>> gdtr_result == gamma_dist_result  # test that results are equal
+    True
+    """)
+
+add_newdoc("gdtrc",
+    r"""
+    gdtrc(a, b, x, out=None)
+
+    Gamma distribution survival function.
+
+    Integral from `x` to infinity of the gamma probability density function,
+
+    .. math::
+
+        F = \int_x^\infty \frac{a^b}{\Gamma(b)} t^{b-1} e^{-at}\,dt,
+
+    where :math:`\Gamma` is the gamma function.
+
+    Parameters
+    ----------
+    a : array_like
+        The rate parameter of the gamma distribution, sometimes denoted
+        :math:`\beta` (float). It is also the reciprocal of the scale
+        parameter :math:`\theta`.
+    b : array_like
+        The shape parameter of the gamma distribution, sometimes denoted
+        :math:`\alpha` (float).
+    x : array_like
+        The quantile (lower limit of integration; float).
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    F : scalar or ndarray
+        The survival function of the gamma distribution with parameters `a`
+        and `b` evaluated at `x`.
+
+    See Also
+    --------
+    gdtr: Gamma distribution cumulative distribution function
+    scipy.stats.gamma: Gamma distribution
+    gdtrix
+
+    Notes
+    -----
+    The evaluation is carried out using the relation to the incomplete gamma
+    integral (regularized gamma function).
+
+    Wrapper for the Cephes [1]_ routine `gdtrc`. Calling `gdtrc` directly can
+    improve performance compared to the ``sf`` method of `scipy.stats.gamma`
+    (see last example below).
+
+    References
+    ----------
+    .. [1] Cephes Mathematical Functions Library,
+           http://www.netlib.org/cephes/
+
+    Examples
+    --------
+    Compute the function for ``a=1`` and ``b=2`` at ``x=5``.
+
+    >>> import numpy as np
+    >>> from scipy.special import gdtrc
+    >>> import matplotlib.pyplot as plt
+    >>> gdtrc(1., 2., 5.)
+    0.04042768199451279
+
+    Compute the function for ``a=1``, ``b=1`` at several points by providing
+    a NumPy array for `x`.
+
+    >>> xvalues = np.array([1., 2., 3., 4])
+    >>> gdtrc(1., 1., xvalues)
+    array([0.36787944, 0.13533528, 0.04978707, 0.01831564])
+
+    `gdtrc` can evaluate different parameter sets by providing arrays with
+    broadcasting compatible shapes for `a`, `b` and `x`. Here we compute the
+    function for three different `a` at four positions `x` and ``b=3``,
+    resulting in a 3x4 array.
+
+    >>> a = np.array([[0.5], [1.5], [2.5]])
+    >>> x = np.array([1., 2., 3., 4])
+    >>> a.shape, x.shape
+    ((3, 1), (4,))
+
+    >>> gdtrc(a, 3., x)
+    array([[0.98561232, 0.9196986 , 0.80884683, 0.67667642],
+           [0.80884683, 0.42319008, 0.17357807, 0.0619688 ],
+           [0.54381312, 0.12465202, 0.02025672, 0.0027694 ]])
+
+    Plot the function for four different parameter sets.
+
+    >>> a_parameters = [0.3, 1, 2, 6]
+    >>> b_parameters = [2, 10, 15, 20]
+    >>> linestyles = ['solid', 'dashed', 'dotted', 'dashdot']
+    >>> parameters_list = list(zip(a_parameters, b_parameters, linestyles))
+    >>> x = np.linspace(0, 30, 1000)
+    >>> fig, ax = plt.subplots()
+    >>> for parameter_set in parameters_list:
+    ...     a, b, style = parameter_set
+    ...     gdtrc_vals = gdtrc(a, b, x)
+    ...     ax.plot(x, gdtrc_vals, label=fr"$a= {a},\, b={b}$", ls=style)
+    >>> ax.legend()
+    >>> ax.set_xlabel("$x$")
+    >>> ax.set_title("Gamma distribution survival function")
+    >>> plt.show()
+
+    The gamma distribution is also available as `scipy.stats.gamma`.
+    Using `gdtrc` directly can be much faster than calling the ``sf`` method
+    of `scipy.stats.gamma`, especially for small arrays or individual
+    values. To get the same results one must use the following parametrization:
+    ``stats.gamma(b, scale=1/a).sf(x)=gdtrc(a, b, x)``.
+
+    >>> from scipy.stats import gamma
+    >>> a = 2
+    >>> b = 3
+    >>> x = 1.
+    >>> gdtrc_result = gdtrc(a, b, x)  # this will often be faster than below
+    >>> gamma_dist_result = gamma(b, scale=1/a).sf(x)
+    >>> gdtrc_result == gamma_dist_result  # test that results are equal
+    True
+    """)
+
+add_newdoc("gdtria",
+    """
+    gdtria(p, b, x, out=None)
+
+    Inverse of `gdtr` vs a.
+
+    Returns the inverse with respect to the parameter `a` of ``p =
+    gdtr(a, b, x)``, the cumulative distribution function of the gamma
+    distribution.
+
+    Parameters
+    ----------
+    p : array_like
+        Probability values.
+    b : array_like
+        `b` parameter values of `gdtr(a, b, x)`. `b` is the "shape" parameter
+        of the gamma distribution.
+    x : array_like
+        Nonnegative real values, from the domain of the gamma distribution.
+    out : ndarray, optional
+        If a fourth argument is given, it must be a numpy.ndarray whose size
+        matches the broadcast result of `p`, `b` and `x`.  `out` is then the
+        array returned by the function.
+
+    Returns
+    -------
+    a : scalar or ndarray
+        Values of the `a` parameter such that ``p = gdtr(a, b, x)``.  ``1/a``
+        is the "scale" parameter of the gamma distribution.
+
+    See Also
+    --------
+    gdtr : CDF of the gamma distribution.
+    gdtrib : Inverse with respect to `b` of `gdtr(a, b, x)`.
+    gdtrix : Inverse with respect to `x` of `gdtr(a, b, x)`.
+
+    Notes
+    -----
+    Wrapper for the CDFLIB [1]_ Fortran routine `cdfgam`.
+
+    The cumulative distribution function `p` is computed using a routine by
+    DiDinato and Morris [2]_. Computation of `a` involves a search for a value
+    that produces the desired value of `p`. The search relies on the
+    monotonicity of `p` with `a`.
+
+    References
+    ----------
+    .. [1] Barry Brown, James Lovato, and Kathy Russell,
+           CDFLIB: Library of Fortran Routines for Cumulative Distribution
+           Functions, Inverses, and Other Parameters.
+    .. [2] DiDinato, A. R. and Morris, A. H.,
+           Computation of the incomplete gamma function ratios and their
+           inverse.  ACM Trans. Math. Softw. 12 (1986), 377-393.
+
+    Examples
+    --------
+    First evaluate `gdtr`.
+
+    >>> from scipy.special import gdtr, gdtria
+    >>> p = gdtr(1.2, 3.4, 5.6)
+    >>> print(p)
+    0.94378087442
+
+    Verify the inverse.
+
+    >>> gdtria(p, 3.4, 5.6)
+    1.2
+    """)
+
+add_newdoc("gdtrib",
+    """
+    gdtrib(a, p, x, out=None)
+
+    Inverse of `gdtr` vs b.
+
+    Returns the inverse with respect to the parameter `b` of ``p =
+    gdtr(a, b, x)``, the cumulative distribution function of the gamma
+    distribution.
+
+    Parameters
+    ----------
+    a : array_like
+        `a` parameter values of ``gdtr(a, b, x)``. ``1/a`` is the "scale"
+        parameter of the gamma distribution.
+    p : array_like
+        Probability values.
+    x : array_like
+        Nonnegative real values, from the domain of the gamma distribution.
+    out : ndarray, optional
+        If a fourth argument is given, it must be a numpy.ndarray whose size
+        matches the broadcast result of `a`, `p` and `x`.  `out` is then the
+        array returned by the function.
+
+    Returns
+    -------
+    b : scalar or ndarray
+        Values of the `b` parameter such that ``p = gdtr(a, b, x)``.  `b` is
+        the "shape" parameter of the gamma distribution.
+
+    See Also
+    --------
+    gdtr : CDF of the gamma distribution.
+    gdtria : Inverse with respect to `a` of `gdtr(a, b, x)`.
+    gdtrix : Inverse with respect to `x` of `gdtr(a, b, x)`.
+
+    Notes
+    -----
+
+    The cumulative distribution function `p` is computed using the Cephes [1]_
+    routines `igam` and `igamc`. Computation of `b` involves a search for a value
+    that produces the desired value of `p` using Chandrupatla's bracketing
+    root finding algorithm [2]_.
+
+    Note that there are some edge cases where `gdtrib` is extended by taking
+    limits where they are uniquely defined. In particular
+    ``x == 0`` with ``p > 0`` and ``p == 0`` with ``x > 0``.
+    For these edge cases, a numerical result will be returned for
+    ``gdtrib(a, p, x)`` even though ``gdtr(a, gdtrib(a, p, x), x)`` is
+    undefined.
+
+    References
+    ----------
+    .. [1] Cephes Mathematical Functions Library,
+           http://www.netlib.org/cephes/
+    .. [2] Chandrupatla, Tirupathi R.
+           "A new hybrid quadratic/bisection algorithm for finding the zero of a
+           nonlinear function without using derivatives".
+           Advances in Engineering Software, 28(3), 145-149.
+           https://doi.org/10.1016/s0965-9978(96)00051-8
+
+    Examples
+    --------
+    First evaluate `gdtr`.
+
+    >>> from scipy.special import gdtr, gdtrib
+    >>> p = gdtr(1.2, 3.4, 5.6)
+    >>> print(p)
+    0.94378087442
+
+    Verify the inverse.
+
+    >>> gdtrib(1.2, p, 5.6)
+    3.3999999999999995
+    """)
+
+add_newdoc("gdtrix",
+    """
+    gdtrix(a, b, p, out=None)
+
+    Inverse of `gdtr` vs x.
+
+    Returns the inverse with respect to the parameter `x` of ``p =
+    gdtr(a, b, x)``, the cumulative distribution function of the gamma
+    distribution. This is also known as the pth quantile of the
+    distribution.
+
+    Parameters
+    ----------
+    a : array_like
+        `a` parameter values of ``gdtr(a, b, x)``. ``1/a`` is the "scale"
+        parameter of the gamma distribution.
+    b : array_like
+        `b` parameter values of ``gdtr(a, b, x)``. `b` is the "shape" parameter
+        of the gamma distribution.
+    p : array_like
+        Probability values.
+    out : ndarray, optional
+        If a fourth argument is given, it must be a numpy.ndarray whose size
+        matches the broadcast result of `a`, `b` and `p`. `out` is then the
+        array returned by the function.
+
+    Returns
+    -------
+    x : scalar or ndarray
+        Values of the `x` parameter such that ``p = gdtr(a, b, x)``.
+
+    See Also
+    --------
+    gdtr : CDF of the gamma distribution.
+    gdtria : Inverse with respect to `a` of ``gdtr(a, b, x)``.
+    gdtrib : Inverse with respect to `b` of ``gdtr(a, b, x)``.
+
+    Notes
+    -----
+    Wrapper for the CDFLIB [1]_ Fortran routine `cdfgam`.
+
+    The cumulative distribution function `p` is computed using a routine by
+    DiDinato and Morris [2]_. Computation of `x` involves a search for a value
+    that produces the desired value of `p`. The search relies on the
+    monotonicity of `p` with `x`.
+
+    References
+    ----------
+    .. [1] Barry Brown, James Lovato, and Kathy Russell,
+           CDFLIB: Library of Fortran Routines for Cumulative Distribution
+           Functions, Inverses, and Other Parameters.
+    .. [2] DiDinato, A. R. and Morris, A. H.,
+           Computation of the incomplete gamma function ratios and their
+           inverse.  ACM Trans. Math. Softw. 12 (1986), 377-393.
+
+    Examples
+    --------
+    First evaluate `gdtr`.
+
+    >>> from scipy.special import gdtr, gdtrix
+    >>> p = gdtr(1.2, 3.4, 5.6)
+    >>> print(p)
+    0.94378087442
+
+    Verify the inverse.
+
+    >>> gdtrix(1.2, 3.4, p)
+    5.5999999999999996
+    """)
+
+
+
+add_newdoc("huber",
+    r"""
+    huber(delta, r, out=None)
+
+    Huber loss function.
+
+    .. math:: \text{huber}(\delta, r) = \begin{cases} \infty & \delta < 0  \\
+              \frac{1}{2}r^2 & 0 \le \delta, | r | \le \delta \\
+              \delta ( |r| - \frac{1}{2}\delta ) & \text{otherwise} \end{cases}
+
+    Parameters
+    ----------
+    delta : ndarray
+        Input array, indicating the quadratic vs. linear loss changepoint.
+    r : ndarray
+        Input array, possibly representing residuals.
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    scalar or ndarray
+        The computed Huber loss function values.
+
+    See Also
+    --------
+    pseudo_huber : smooth approximation of this function
+
+    Notes
+    -----
+    `huber` is useful as a loss function in robust statistics or machine
+    learning to reduce the influence of outliers as compared to the common
+    squared error loss: residuals with a magnitude higher than `delta` are
+    not squared [1]_.
+
+    Typically, `r` represents residuals, the difference
+    between a model prediction and data. Then, for :math:`|r|\leq\delta`,
+    `huber` resembles the squared error and for :math:`|r|>\delta` the
+    absolute error. This way, the Huber loss often achieves
+    a fast convergence in model fitting for small residuals like the squared
+    error loss function and still reduces the influence of outliers
+    (:math:`|r|>\delta`) like the absolute error loss. As :math:`\delta` is
+    the cutoff between squared and absolute error regimes, it has
+    to be tuned carefully for each problem. `huber` is also
+    convex, making it suitable for gradient based optimization.
+
+    .. versionadded:: 0.15.0
+
+    References
+    ----------
+    .. [1] Peter Huber. "Robust Estimation of a Location Parameter",
+           1964. Annals of Mathematical Statistics. 35 (1): 73 - 101.
+
+    Examples
+    --------
+    Import all necessary modules.
+
+    >>> import numpy as np
+    >>> from scipy.special import huber
+    >>> import matplotlib.pyplot as plt
+
+    Compute the function for ``delta=1`` at ``r=2``
+
+    >>> huber(1., 2.)
+    1.5
+
+    Compute the function for different `delta` by providing a NumPy array or
+    list for `delta`.
+
+    >>> huber([1., 3., 5.], 4.)
+    array([3.5, 7.5, 8. ])
+
+    Compute the function at different points by providing a NumPy array or
+    list for `r`.
+
+    >>> huber(2., np.array([1., 1.5, 3.]))
+    array([0.5  , 1.125, 4.   ])
+
+    The function can be calculated for different `delta` and `r` by
+    providing arrays for both with compatible shapes for broadcasting.
+
+    >>> r = np.array([1., 2.5, 8., 10.])
+    >>> deltas = np.array([[1.], [5.], [9.]])
+    >>> print(r.shape, deltas.shape)
+    (4,) (3, 1)
+
+    >>> huber(deltas, r)
+    array([[ 0.5  ,  2.   ,  7.5  ,  9.5  ],
+           [ 0.5  ,  3.125, 27.5  , 37.5  ],
+           [ 0.5  ,  3.125, 32.   , 49.5  ]])
+
+    Plot the function for different `delta`.
+
+    >>> x = np.linspace(-4, 4, 500)
+    >>> deltas = [1, 2, 3]
+    >>> linestyles = ["dashed", "dotted", "dashdot"]
+    >>> fig, ax = plt.subplots()
+    >>> combined_plot_parameters = list(zip(deltas, linestyles))
+    >>> for delta, style in combined_plot_parameters:
+    ...     ax.plot(x, huber(delta, x), label=fr"$\delta={delta}$", ls=style)
+    >>> ax.legend(loc="upper center")
+    >>> ax.set_xlabel("$x$")
+    >>> ax.set_title(r"Huber loss function $h_{\delta}(x)$")
+    >>> ax.set_xlim(-4, 4)
+    >>> ax.set_ylim(0, 8)
+    >>> plt.show()
+    """)
+
+add_newdoc("hyp0f1",
+    r"""
+    hyp0f1(v, z, out=None)
+
+    Confluent hypergeometric limit function 0F1.
+
+    Parameters
+    ----------
+    v : array_like
+        Real-valued parameter
+    z : array_like
+        Real- or complex-valued argument
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    scalar or ndarray
+        The confluent hypergeometric limit function
+
+    Notes
+    -----
+    This function is defined as:
+
+    .. math:: _0F_1(v, z) = \sum_{k=0}^{\infty}\frac{z^k}{(v)_k k!}.
+
+    It's also the limit as :math:`q \to \infty` of :math:`_1F_1(q; v; z/q)`,
+    and satisfies the differential equation :math:`zf''(z) + vf'(z) =
+    f(z)`. See [1]_ for more information.
+
+    References
+    ----------
+    .. [1] Wolfram MathWorld, "Confluent Hypergeometric Limit Function",
+           http://mathworld.wolfram.com/ConfluentHypergeometricLimitFunction.html
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import scipy.special as sc
+
+    It is one when `z` is zero.
+
+    >>> sc.hyp0f1(1, 0)
+    1.0
+
+    It is the limit of the confluent hypergeometric function as `q`
+    goes to infinity.
+
+    >>> q = np.array([1, 10, 100, 1000])
+    >>> v = 1
+    >>> z = 1
+    >>> sc.hyp1f1(q, v, z / q)
+    array([2.71828183, 2.31481985, 2.28303778, 2.27992985])
+    >>> sc.hyp0f1(v, z)
+    2.2795853023360673
+
+    It is related to Bessel functions.
+
+    >>> n = 1
+    >>> x = np.linspace(0, 1, 5)
+    >>> sc.jv(n, x)
+    array([0.        , 0.12402598, 0.24226846, 0.3492436 , 0.44005059])
+    >>> (0.5 * x)**n / sc.factorial(n) * sc.hyp0f1(n + 1, -0.25 * x**2)
+    array([0.        , 0.12402598, 0.24226846, 0.3492436 , 0.44005059])
+
+    """)
+
+add_newdoc("hyp1f1",
+    r"""
+    hyp1f1(a, b, x, out=None)
+
+    Confluent hypergeometric function 1F1.
+
+    The confluent hypergeometric function is defined by the series
+
+    .. math::
+
+       {}_1F_1(a; b; x) = \sum_{k = 0}^\infty \frac{(a)_k}{(b)_k k!} x^k.
+
+    See [DLMF]_ for more details. Here :math:`(\cdot)_k` is the
+    Pochhammer symbol; see `poch`.
+
+    Parameters
+    ----------
+    a, b : array_like
+        Real parameters
+    x : array_like
+        Real or complex argument
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    scalar or ndarray
+        Values of the confluent hypergeometric function
+
+    See Also
+    --------
+    hyperu : another confluent hypergeometric function
+    hyp0f1 : confluent hypergeometric limit function
+    hyp2f1 : Gaussian hypergeometric function
+
+    Notes
+    -----
+    For real values, this function uses the ``hyp1f1`` routine from the C++ Boost
+    library [2]_, for complex values a C translation of the specfun
+    Fortran library [3]_.
+
+    References
+    ----------
+    .. [DLMF] NIST Digital Library of Mathematical Functions
+              https://dlmf.nist.gov/13.2#E2
+    .. [2] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+    .. [3] Zhang, Jin, "Computation of Special Functions", John Wiley
+           and Sons, Inc, 1996.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import scipy.special as sc
+
+    It is one when `x` is zero:
+
+    >>> sc.hyp1f1(0.5, 0.5, 0)
+    1.0
+
+    It is singular when `b` is a nonpositive integer.
+
+    >>> sc.hyp1f1(0.5, -1, 0)
+    inf
+
+    It is a polynomial when `a` is a nonpositive integer.
+
+    >>> a, b, x = -1, 0.5, np.array([1.0, 2.0, 3.0, 4.0])
+    >>> sc.hyp1f1(a, b, x)
+    array([-1., -3., -5., -7.])
+    >>> 1 + (a / b) * x
+    array([-1., -3., -5., -7.])
+
+    It reduces to the exponential function when ``a = b``.
+
+    >>> sc.hyp1f1(2, 2, [1, 2, 3, 4])
+    array([ 2.71828183,  7.3890561 , 20.08553692, 54.59815003])
+    >>> np.exp([1, 2, 3, 4])
+    array([ 2.71828183,  7.3890561 , 20.08553692, 54.59815003])
+
+    """)
+
+add_newdoc("hyperu",
+    r"""
+    hyperu(a, b, x, out=None)
+
+    Confluent hypergeometric function U
+
+    It is defined as the solution to the equation
+
+    .. math::
+
+       x \frac{d^2w}{dx^2} + (b - x) \frac{dw}{dx} - aw = 0
+
+    which satisfies the property
+
+    .. math::
+
+       U(a, b, x) \sim x^{-a}
+
+    as :math:`x \to \infty`. See [DLMF]_ for more details.
+
+    Parameters
+    ----------
+    a, b : array_like
+        Real-valued parameters
+    x : array_like
+        Real-valued argument
+    out : ndarray, optional
+        Optional output array for the function values
+
+    Returns
+    -------
+    scalar or ndarray
+        Values of `U`
+
+    References
+    ----------
+    .. [DLMF] NIST Digital Library of Mathematical Functions
+              https://dlmf.nist.gov/13.2#E6
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import scipy.special as sc
+
+    It has a branch cut along the negative `x` axis.
+
+    >>> x = np.linspace(-0.1, -10, 5)
+    >>> sc.hyperu(1, 1, x)
+    array([nan, nan, nan, nan, nan])
+
+    It approaches zero as `x` goes to infinity.
+
+    >>> x = np.array([1, 10, 100])
+    >>> sc.hyperu(1, 1, x)
+    array([0.59634736, 0.09156333, 0.00990194])
+
+    It satisfies Kummer's transformation.
+
+    >>> a, b, x = 2, 1, 1
+    >>> sc.hyperu(a, b, x)
+    0.1926947246463881
+    >>> x**(1 - b) * sc.hyperu(a - b + 1, 2 - b, x)
+    0.1926947246463881
+
+    """)
+
+add_newdoc("_igam_fac",
+    """
+    Internal function, do not use.
+    """)
+
+add_newdoc("kl_div",
+    r"""
+    kl_div(x, y, out=None)
+
+    Elementwise function for computing Kullback-Leibler divergence.
+
+    .. math::
+
+        \mathrm{kl\_div}(x, y) =
+          \begin{cases}
+            x \log(x / y) - x + y & x > 0, y > 0 \\
+            y & x = 0, y \ge 0 \\
+            \infty & \text{otherwise}
+          \end{cases}
+
+    Parameters
+    ----------
+    x, y : array_like
+        Real arguments
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    scalar or ndarray
+        Values of the Kullback-Leibler divergence.
+
+    See Also
+    --------
+    entr, rel_entr, scipy.stats.entropy
+
+    Notes
+    -----
+    .. versionadded:: 0.15.0
+
+    This function is non-negative and is jointly convex in `x` and `y`.
+
+    The origin of this function is in convex programming; see [1]_ for
+    details. This is why the function contains the extra :math:`-x
+    + y` terms over what might be expected from the Kullback-Leibler
+    divergence. For a version of the function without the extra terms,
+    see `rel_entr`.
+
+    References
+    ----------
+    .. [1] Boyd, Stephen and Lieven Vandenberghe. *Convex optimization*.
+           Cambridge University Press, 2004.
+           :doi:`https://doi.org/10.1017/CBO9780511804441`
+
+    """)
+
+add_newdoc("kn",
+    r"""
+    kn(n, x, out=None)
+
+    Modified Bessel function of the second kind of integer order `n`
+
+    Returns the modified Bessel function of the second kind for integer order
+    `n` at real `x`.
+
+    These are also sometimes called functions of the third kind, Basset
+    functions, or Macdonald functions.
+
+    Parameters
+    ----------
+    n : array_like of int
+        Order of Bessel functions (floats will truncate with a warning)
+    x : array_like of float
+        Argument at which to evaluate the Bessel functions
+    out : ndarray, optional
+        Optional output array for the function results.
+
+    Returns
+    -------
+    scalar or ndarray
+        Value of the Modified Bessel function of the second kind,
+        :math:`K_n(x)`.
+
+    See Also
+    --------
+    kv : Same function, but accepts real order and complex argument
+    kvp : Derivative of this function
+
+    Notes
+    -----
+    Wrapper for AMOS [1]_ routine `zbesk`.  For a discussion of the
+    algorithm used, see [2]_ and the references therein.
+
+    References
+    ----------
+    .. [1] Donald E. Amos, "AMOS, A Portable Package for Bessel Functions
+           of a Complex Argument and Nonnegative Order",
+           http://netlib.org/amos/
+    .. [2] Donald E. Amos, "Algorithm 644: A portable package for Bessel
+           functions of a complex argument and nonnegative order", ACM
+           TOMS Vol. 12 Issue 3, Sept. 1986, p. 265
+
+    Examples
+    --------
+    Plot the function of several orders for real input:
+
+    >>> import numpy as np
+    >>> from scipy.special import kn
+    >>> import matplotlib.pyplot as plt
+    >>> x = np.linspace(0, 5, 1000)
+    >>> for N in range(6):
+    ...     plt.plot(x, kn(N, x), label='$K_{}(x)$'.format(N))
+    >>> plt.ylim(0, 10)
+    >>> plt.legend()
+    >>> plt.title(r'Modified Bessel function of the second kind $K_n(x)$')
+    >>> plt.show()
+
+    Calculate for a single value at multiple orders:
+
+    >>> kn([4, 5, 6], 1)
+    array([   44.23241585,   360.9605896 ,  3653.83831186])
+    """)
+
+add_newdoc("kolmogi",
+    """
+    kolmogi(p, out=None)
+
+    Inverse Survival Function of Kolmogorov distribution
+
+    It is the inverse function to `kolmogorov`.
+    Returns y such that ``kolmogorov(y) == p``.
+
+    Parameters
+    ----------
+    p : float array_like
+        Probability
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    scalar or ndarray
+        The value(s) of kolmogi(p)
+
+    See Also
+    --------
+    kolmogorov : The Survival Function for the distribution
+    scipy.stats.kstwobign : Provides the functionality as a continuous distribution
+    smirnov, smirnovi : Functions for the one-sided distribution
+
+    Notes
+    -----
+    `kolmogorov` is used by `stats.kstest` in the application of the
+    Kolmogorov-Smirnov Goodness of Fit test. For historical reasons this
+    function is exposed in `scipy.special`, but the recommended way to achieve
+    the most accurate CDF/SF/PDF/PPF/ISF computations is to use the
+    `stats.kstwobign` distribution.
+
+    Examples
+    --------
+    >>> from scipy.special import kolmogi
+    >>> kolmogi([0, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0])
+    array([        inf,  1.22384787,  1.01918472,  0.82757356,  0.67644769,
+            0.57117327,  0.        ])
+
+    """)
+
+add_newdoc("kolmogorov",
+    r"""
+    kolmogorov(y, out=None)
+
+    Complementary cumulative distribution function (Survival Function) of the
+    Kolmogorov distribution.
+
+    Returns the complementary cumulative distribution function of
+    Kolmogorov's limiting distribution (``D_n*\sqrt(n)`` as n goes to infinity)
+    of a two-sided test for equality between an empirical and a theoretical
+    distribution. It is equal to the (limit as n->infinity of the)
+    probability that ``sqrt(n) * max absolute deviation > y``.
+
+    Parameters
+    ----------
+    y : float array_like
+      Absolute deviation between the Empirical CDF (ECDF) and the target CDF,
+      multiplied by sqrt(n).
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    scalar or ndarray
+        The value(s) of kolmogorov(y)
+
+    See Also
+    --------
+    kolmogi : The Inverse Survival Function for the distribution
+    scipy.stats.kstwobign : Provides the functionality as a continuous distribution
+    smirnov, smirnovi : Functions for the one-sided distribution
+
+    Notes
+    -----
+    `kolmogorov` is used by `stats.kstest` in the application of the
+    Kolmogorov-Smirnov Goodness of Fit test. For historical reasons this
+    function is exposed in `scipy.special`, but the recommended way to achieve
+    the most accurate CDF/SF/PDF/PPF/ISF computations is to use the
+    `stats.kstwobign` distribution.
+
+    Examples
+    --------
+    Show the probability of a gap at least as big as 0, 0.5 and 1.0.
+
+    >>> import numpy as np
+    >>> from scipy.special import kolmogorov
+    >>> from scipy.stats import kstwobign
+    >>> kolmogorov([0, 0.5, 1.0])
+    array([ 1.        ,  0.96394524,  0.26999967])
+
+    Compare a sample of size 1000 drawn from a Laplace(0, 1) distribution against
+    the target distribution, a Normal(0, 1) distribution.
+
+    >>> from scipy.stats import norm, laplace
+    >>> rng = np.random.default_rng()
+    >>> n = 1000
+    >>> lap01 = laplace(0, 1)
+    >>> x = np.sort(lap01.rvs(n, random_state=rng))
+    >>> np.mean(x), np.std(x)
+    (-0.05841730131499543, 1.3968109101997568)
+
+    Construct the Empirical CDF and the K-S statistic Dn.
+
+    >>> target = norm(0,1)  # Normal mean 0, stddev 1
+    >>> cdfs = target.cdf(x)
+    >>> ecdfs = np.arange(n+1, dtype=float)/n
+    >>> gaps = np.column_stack([cdfs - ecdfs[:n], ecdfs[1:] - cdfs])
+    >>> Dn = np.max(gaps)
+    >>> Kn = np.sqrt(n) * Dn
+    >>> print('Dn=%f, sqrt(n)*Dn=%f' % (Dn, Kn))
+    Dn=0.043363, sqrt(n)*Dn=1.371265
+    >>> print(chr(10).join(['For a sample of size n drawn from a N(0, 1) distribution:',
+    ...   ' the approximate Kolmogorov probability that sqrt(n)*Dn>=%f is %f' %
+    ...    (Kn, kolmogorov(Kn)),
+    ...   ' the approximate Kolmogorov probability that sqrt(n)*Dn<=%f is %f' %
+    ...    (Kn, kstwobign.cdf(Kn))]))
+    For a sample of size n drawn from a N(0, 1) distribution:
+     the approximate Kolmogorov probability that sqrt(n)*Dn>=1.371265 is 0.046533
+     the approximate Kolmogorov probability that sqrt(n)*Dn<=1.371265 is 0.953467
+
+    Plot the Empirical CDF against the target N(0, 1) CDF.
+
+    >>> import matplotlib.pyplot as plt
+    >>> plt.step(np.concatenate([[-3], x]), ecdfs, where='post', label='Empirical CDF')
+    >>> x3 = np.linspace(-3, 3, 100)
+    >>> plt.plot(x3, target.cdf(x3), label='CDF for N(0, 1)')
+    >>> plt.ylim([0, 1]); plt.grid(True); plt.legend();
+    >>> # Add vertical lines marking Dn+ and Dn-
+    >>> iminus, iplus = np.argmax(gaps, axis=0)
+    >>> plt.vlines([x[iminus]], ecdfs[iminus], cdfs[iminus],
+    ...            color='r', linestyle='dashed', lw=4)
+    >>> plt.vlines([x[iplus]], cdfs[iplus], ecdfs[iplus+1],
+    ...            color='r', linestyle='dashed', lw=4)
+    >>> plt.show()
+    """)
+
+add_newdoc("_kolmogc",
+    r"""
+    Internal function, do not use.
+    """)
+
+add_newdoc("_kolmogci",
+    r"""
+    Internal function, do not use.
+    """)
+
+add_newdoc("_kolmogp",
+    r"""
+    Internal function, do not use.
+    """)
+
+add_newdoc("_lanczos_sum_expg_scaled",
+    """
+    Internal function, do not use.
+    """)
+
+add_newdoc(
+    "_landau_pdf",
+    """
+    _landau_pdf(x, loc, scale)
+
+    Probability density function of the Landau distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued argument
+    loc : array_like
+        Real-valued distribution location
+    scale : array_like
+        Positive, real-valued distribution scale
+
+    Returns
+    -------
+    scalar or ndarray
+    """)
+
+add_newdoc(
+    "_landau_cdf",
+    """
+    _landau_cdf(x, loc, scale)
+
+    Cumulative distribution function of the Landau distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued argument
+    loc : array_like
+        Real-valued distribution location
+    scale : array_like
+        Positive, real-valued distribution scale
+
+    Returns
+    -------
+    scalar or ndarray
+    """)
+
+add_newdoc(
+    "_landau_sf",
+    """
+    _landau_sf(x, loc, scale)
+
+    Survival function of the Landau distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued argument
+    loc : array_like
+        Real-valued distribution location
+    scale : array_like
+        Positive, real-valued distribution scale
+
+    Returns
+    -------
+    scalar or ndarray
+    """)
+
+add_newdoc(
+    "_landau_ppf",
+    """
+    _landau_ppf(p, loc, scale)
+
+    Percent point function of the Landau distribution.
+
+    Parameters
+    ----------
+    p : array_like
+        Real-valued argument between 0 and 1
+    loc : array_like
+        Real-valued distribution location
+    scale : array_like
+        Positive, real-valued distribution scale
+
+    Returns
+    -------
+    scalar or ndarray
+    """)
+
+add_newdoc(
+    "_landau_isf",
+    """
+    _landau_isf(p, loc, scale)
+
+    Inverse survival function of the Landau distribution.
+
+    Parameters
+    ----------
+    p : array_like
+        Real-valued argument between 0 and 1
+    loc : array_like
+        Real-valued distribution location
+    scale : array_like
+        Positive, real-valued distribution scale
+
+    Returns
+    -------
+    scalar or ndarray
+    """)
+
+add_newdoc("_lgam1p",
+    """
+    Internal function, do not use.
+    """)
+
+add_newdoc("lpmv",
+    r"""
+    lpmv(m, v, x, out=None)
+
+    Associated Legendre function of integer order and real degree.
+
+    Defined as
+
+    .. math::
+
+        P_v^m = (-1)^m (1 - x^2)^{m/2} \frac{d^m}{dx^m} P_v(x)
+
+    where
+
+    .. math::
+
+        P_v = \sum_{k = 0}^\infty \frac{(-v)_k (v + 1)_k}{(k!)^2}
+                \left(\frac{1 - x}{2}\right)^k
+
+    is the Legendre function of the first kind. Here :math:`(\cdot)_k`
+    is the Pochhammer symbol; see `poch`.
+
+    Parameters
+    ----------
+    m : array_like
+        Order (int or float). If passed a float not equal to an
+        integer the function returns NaN.
+    v : array_like
+        Degree (float).
+    x : array_like
+        Argument (float). Must have ``|x| <= 1``.
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    pmv : scalar or ndarray
+        Value of the associated Legendre function.
+
+    Notes
+    -----
+    Note that this implementation includes the Condon-Shortley phase.
+
+    References
+    ----------
+    .. [1] Zhang, Jin, "Computation of Special Functions", John Wiley
+           and Sons, Inc, 1996.
+
+    """)
+
+add_newdoc("nbdtr",
+    r"""
+    nbdtr(k, n, p, out=None)
+
+    Negative binomial cumulative distribution function.
+
+    Returns the sum of the terms 0 through `k` of the negative binomial
+    distribution probability mass function,
+
+    .. math::
+
+        F = \sum_{j=0}^k {{n + j - 1}\choose{j}} p^n (1 - p)^j.
+
+    In a sequence of Bernoulli trials with individual success probabilities
+    `p`, this is the probability that `k` or fewer failures precede the nth
+    success.
+
+    Parameters
+    ----------
+    k : array_like
+        The maximum number of allowed failures (nonnegative int).
+    n : array_like
+        The target number of successes (positive int).
+    p : array_like
+        Probability of success in a single event (float).
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    F : scalar or ndarray
+        The probability of `k` or fewer failures before `n` successes in a
+        sequence of events with individual success probability `p`.
+
+    See Also
+    --------
+    nbdtrc : Negative binomial survival function
+    nbdtrik : Negative binomial quantile function
+    scipy.stats.nbinom : Negative binomial distribution
+
+    Notes
+    -----
+    If floating point values are passed for `k` or `n`, they will be truncated
+    to integers.
+
+    The terms are not summed directly; instead the regularized incomplete beta
+    function is employed, according to the formula,
+
+    .. math::
+        \mathrm{nbdtr}(k, n, p) = I_{p}(n, k + 1).
+
+    Wrapper for the Cephes [1]_ routine `nbdtr`.
+
+    The negative binomial distribution is also available as
+    `scipy.stats.nbinom`. Using `nbdtr` directly can improve performance
+    compared to the ``cdf`` method of `scipy.stats.nbinom` (see last example).
+
+    References
+    ----------
+    .. [1] Cephes Mathematical Functions Library,
+           http://www.netlib.org/cephes/
+
+    Examples
+    --------
+    Compute the function for ``k=10`` and ``n=5`` at ``p=0.5``.
+
+    >>> import numpy as np
+    >>> from scipy.special import nbdtr
+    >>> nbdtr(10, 5, 0.5)
+    0.940765380859375
+
+    Compute the function for ``n=10`` and ``p=0.5`` at several points by
+    providing a NumPy array or list for `k`.
+
+    >>> nbdtr([5, 10, 15], 10, 0.5)
+    array([0.15087891, 0.58809853, 0.88523853])
+
+    Plot the function for four different parameter sets.
+
+    >>> import matplotlib.pyplot as plt
+    >>> k = np.arange(130)
+    >>> n_parameters = [20, 20, 20, 80]
+    >>> p_parameters = [0.2, 0.5, 0.8, 0.5]
+    >>> linestyles = ['solid', 'dashed', 'dotted', 'dashdot']
+    >>> parameters_list = list(zip(p_parameters, n_parameters,
+    ...                            linestyles))
+    >>> fig, ax = plt.subplots(figsize=(8, 8))
+    >>> for parameter_set in parameters_list:
+    ...     p, n, style = parameter_set
+    ...     nbdtr_vals = nbdtr(k, n, p)
+    ...     ax.plot(k, nbdtr_vals, label=rf"$n={n},\, p={p}$",
+    ...             ls=style)
+    >>> ax.legend()
+    >>> ax.set_xlabel("$k$")
+    >>> ax.set_title("Negative binomial cumulative distribution function")
+    >>> plt.show()
+
+    The negative binomial distribution is also available as
+    `scipy.stats.nbinom`. Using `nbdtr` directly can be much faster than
+    calling the ``cdf`` method of `scipy.stats.nbinom`, especially for small
+    arrays or individual values. To get the same results one must use the
+    following parametrization: ``nbinom(n, p).cdf(k)=nbdtr(k, n, p)``.
+
+    >>> from scipy.stats import nbinom
+    >>> k, n, p = 5, 3, 0.5
+    >>> nbdtr_res = nbdtr(k, n, p)  # this will often be faster than below
+    >>> stats_res = nbinom(n, p).cdf(k)
+    >>> stats_res, nbdtr_res  # test that results are equal
+    (0.85546875, 0.85546875)
+
+    `nbdtr` can evaluate different parameter sets by providing arrays with
+    shapes compatible for broadcasting for `k`, `n` and `p`. Here we compute
+    the function for three different `k` at four locations `p`, resulting in
+    a 3x4 array.
+
+    >>> k = np.array([[5], [10], [15]])
+    >>> p = np.array([0.3, 0.5, 0.7, 0.9])
+    >>> k.shape, p.shape
+    ((3, 1), (4,))
+
+    >>> nbdtr(k, 5, p)
+    array([[0.15026833, 0.62304687, 0.95265101, 0.9998531 ],
+           [0.48450894, 0.94076538, 0.99932777, 0.99999999],
+           [0.76249222, 0.99409103, 0.99999445, 1.        ]])
+    """)
+
+add_newdoc("nbdtrc",
+    r"""
+    nbdtrc(k, n, p, out=None)
+
+    Negative binomial survival function.
+
+    Returns the sum of the terms `k + 1` to infinity of the negative binomial
+    distribution probability mass function,
+
+    .. math::
+
+        F = \sum_{j=k + 1}^\infty {{n + j - 1}\choose{j}} p^n (1 - p)^j.
+
+    In a sequence of Bernoulli trials with individual success probabilities
+    `p`, this is the probability that more than `k` failures precede the nth
+    success.
+
+    Parameters
+    ----------
+    k : array_like
+        The maximum number of allowed failures (nonnegative int).
+    n : array_like
+        The target number of successes (positive int).
+    p : array_like
+        Probability of success in a single event (float).
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    F : scalar or ndarray
+        The probability of `k + 1` or more failures before `n` successes in a
+        sequence of events with individual success probability `p`.
+
+    See Also
+    --------
+    nbdtr : Negative binomial cumulative distribution function
+    nbdtrik : Negative binomial percentile function
+    scipy.stats.nbinom : Negative binomial distribution
+
+    Notes
+    -----
+    If floating point values are passed for `k` or `n`, they will be truncated
+    to integers.
+
+    The terms are not summed directly; instead the regularized incomplete beta
+    function is employed, according to the formula,
+
+    .. math::
+        \mathrm{nbdtrc}(k, n, p) = I_{1 - p}(k + 1, n).
+
+    Wrapper for the Cephes [1]_ routine `nbdtrc`.
+
+    The negative binomial distribution is also available as
+    `scipy.stats.nbinom`. Using `nbdtrc` directly can improve performance
+    compared to the ``sf`` method of `scipy.stats.nbinom` (see last example).
+
+    References
+    ----------
+    .. [1] Cephes Mathematical Functions Library,
+           http://www.netlib.org/cephes/
+
+    Examples
+    --------
+    Compute the function for ``k=10`` and ``n=5`` at ``p=0.5``.
+
+    >>> import numpy as np
+    >>> from scipy.special import nbdtrc
+    >>> nbdtrc(10, 5, 0.5)
+    0.059234619140624986
+
+    Compute the function for ``n=10`` and ``p=0.5`` at several points by
+    providing a NumPy array or list for `k`.
+
+    >>> nbdtrc([5, 10, 15], 10, 0.5)
+    array([0.84912109, 0.41190147, 0.11476147])
+
+    Plot the function for four different parameter sets.
+
+    >>> import matplotlib.pyplot as plt
+    >>> k = np.arange(130)
+    >>> n_parameters = [20, 20, 20, 80]
+    >>> p_parameters = [0.2, 0.5, 0.8, 0.5]
+    >>> linestyles = ['solid', 'dashed', 'dotted', 'dashdot']
+    >>> parameters_list = list(zip(p_parameters, n_parameters,
+    ...                            linestyles))
+    >>> fig, ax = plt.subplots(figsize=(8, 8))
+    >>> for parameter_set in parameters_list:
+    ...     p, n, style = parameter_set
+    ...     nbdtrc_vals = nbdtrc(k, n, p)
+    ...     ax.plot(k, nbdtrc_vals, label=rf"$n={n},\, p={p}$",
+    ...             ls=style)
+    >>> ax.legend()
+    >>> ax.set_xlabel("$k$")
+    >>> ax.set_title("Negative binomial distribution survival function")
+    >>> plt.show()
+
+    The negative binomial distribution is also available as
+    `scipy.stats.nbinom`. Using `nbdtrc` directly can be much faster than
+    calling the ``sf`` method of `scipy.stats.nbinom`, especially for small
+    arrays or individual values. To get the same results one must use the
+    following parametrization: ``nbinom(n, p).sf(k)=nbdtrc(k, n, p)``.
+
+    >>> from scipy.stats import nbinom
+    >>> k, n, p = 3, 5, 0.5
+    >>> nbdtr_res = nbdtrc(k, n, p)  # this will often be faster than below
+    >>> stats_res = nbinom(n, p).sf(k)
+    >>> stats_res, nbdtr_res  # test that results are equal
+    (0.6367187499999999, 0.6367187499999999)
+
+    `nbdtrc` can evaluate different parameter sets by providing arrays with
+    shapes compatible for broadcasting for `k`, `n` and `p`. Here we compute
+    the function for three different `k` at four locations `p`, resulting in
+    a 3x4 array.
+
+    >>> k = np.array([[5], [10], [15]])
+    >>> p = np.array([0.3, 0.5, 0.7, 0.9])
+    >>> k.shape, p.shape
+    ((3, 1), (4,))
+
+    >>> nbdtrc(k, 5, p)
+    array([[8.49731667e-01, 3.76953125e-01, 4.73489874e-02, 1.46902600e-04],
+           [5.15491059e-01, 5.92346191e-02, 6.72234070e-04, 9.29610100e-09],
+           [2.37507779e-01, 5.90896606e-03, 5.55025308e-06, 3.26346760e-13]])
+    """)
+
+add_newdoc(
+    "nbdtri",
+    r"""
+    nbdtri(k, n, y, out=None)
+
+    Returns the inverse with respect to the parameter `p` of
+    ``y = nbdtr(k, n, p)``, the negative binomial cumulative distribution
+    function.
+
+    Parameters
+    ----------
+    k : array_like
+        The maximum number of allowed failures (nonnegative int).
+    n : array_like
+        The target number of successes (positive int).
+    y : array_like
+        The probability of `k` or fewer failures before `n` successes (float).
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    p : scalar or ndarray
+        Probability of success in a single event (float) such that
+        `nbdtr(k, n, p) = y`.
+
+    See Also
+    --------
+    nbdtr : Cumulative distribution function of the negative binomial.
+    nbdtrc : Negative binomial survival function.
+    scipy.stats.nbinom : negative binomial distribution.
+    nbdtrik : Inverse with respect to `k` of `nbdtr(k, n, p)`.
+    nbdtrin : Inverse with respect to `n` of `nbdtr(k, n, p)`.
+    scipy.stats.nbinom : Negative binomial distribution
+
+    Notes
+    -----
+    Wrapper for the Cephes [1]_ routine `nbdtri`.
+
+    The negative binomial distribution is also available as
+    `scipy.stats.nbinom`. Using `nbdtri` directly can improve performance
+    compared to the ``ppf`` method of `scipy.stats.nbinom`.
+
+    References
+    ----------
+    .. [1] Cephes Mathematical Functions Library,
+           http://www.netlib.org/cephes/
+
+    Examples
+    --------
+    `nbdtri` is the inverse of `nbdtr` with respect to `p`.
+    Up to floating point errors the following holds:
+    ``nbdtri(k, n, nbdtr(k, n, p))=p``.
+
+    >>> import numpy as np
+    >>> from scipy.special import nbdtri, nbdtr
+    >>> k, n, y = 5, 10, 0.2
+    >>> cdf_val = nbdtr(k, n, y)
+    >>> nbdtri(k, n, cdf_val)
+    0.20000000000000004
+
+    Compute the function for ``k=3`` and ``n=5`` at several points by
+    providing a NumPy array or list for `y`.
+
+    >>> y = np.array([0.1, 0.4, 0.8])
+    >>> nbdtri(3, 5, y)
+    array([0.34462319, 0.51653095, 0.69677416])
+
+    Plot the function for four different parameter sets.
+
+    >>> import matplotlib.pyplot as plt
+    >>> n_parameters = [5, 20, 30, 30]
+    >>> k_parameters = [20, 20, 60, 80]
+    >>> linestyles = ['solid', 'dashed', 'dotted', 'dashdot']
+    >>> parameters_list = list(zip(n_parameters, k_parameters, linestyles))
+    >>> cdf_vals = np.linspace(0, 1, 1000)
+    >>> fig, ax = plt.subplots(figsize=(8, 8))
+    >>> for parameter_set in parameters_list:
+    ...     n, k, style = parameter_set
+    ...     nbdtri_vals = nbdtri(k, n, cdf_vals)
+    ...     ax.plot(cdf_vals, nbdtri_vals, label=rf"$k={k},\ n={n}$",
+    ...             ls=style)
+    >>> ax.legend()
+    >>> ax.set_ylabel("$p$")
+    >>> ax.set_xlabel("$CDF$")
+    >>> title = "nbdtri: inverse of negative binomial CDF with respect to $p$"
+    >>> ax.set_title(title)
+    >>> plt.show()
+
+    `nbdtri` can evaluate different parameter sets by providing arrays with
+    shapes compatible for broadcasting for `k`, `n` and `y`. Here we compute
+    the function for three different `k` at four locations `y`, resulting in
+    a 3x4 array.
+
+    >>> k = np.array([[5], [10], [15]])
+    >>> y = np.array([0.3, 0.5, 0.7, 0.9])
+    >>> k.shape, y.shape
+    ((3, 1), (4,))
+
+    >>> nbdtri(k, 5, y)
+    array([[0.37258157, 0.45169416, 0.53249956, 0.64578407],
+           [0.24588501, 0.30451981, 0.36778453, 0.46397088],
+           [0.18362101, 0.22966758, 0.28054743, 0.36066188]])
+    """)
+
+add_newdoc("nbdtrik",
+    r"""
+    nbdtrik(y, n, p, out=None)
+
+    Negative binomial percentile function.
+
+    Returns the inverse with respect to the parameter `k` of
+    ``y = nbdtr(k, n, p)``, the negative binomial cumulative distribution
+    function.
+
+    Parameters
+    ----------
+    y : array_like
+        The probability of `k` or fewer failures before `n` successes (float).
+    n : array_like
+        The target number of successes (positive int).
+    p : array_like
+        Probability of success in a single event (float).
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    k : scalar or ndarray
+        The maximum number of allowed failures such that `nbdtr(k, n, p) = y`.
+
+    See Also
+    --------
+    nbdtr : Cumulative distribution function of the negative binomial.
+    nbdtrc : Survival function of the negative binomial.
+    nbdtri : Inverse with respect to `p` of `nbdtr(k, n, p)`.
+    nbdtrin : Inverse with respect to `n` of `nbdtr(k, n, p)`.
+    scipy.stats.nbinom : Negative binomial distribution
+
+    Notes
+    -----
+    Wrapper for the CDFLIB [1]_ Fortran routine `cdfnbn`.
+
+    Formula 26.5.26 of [2]_ or [3]_,
+
+    .. math::
+        \sum_{j=k + 1}^\infty {{n + j - 1}
+        \choose{j}} p^n (1 - p)^j = I_{1 - p}(k + 1, n),
+
+    is used to reduce calculation of the cumulative distribution function to
+    that of a regularized incomplete beta :math:`I`.
+
+    Computation of `k` involves a search for a value that produces the desired
+    value of `y`.  The search relies on the monotonicity of `y` with `k`.
+
+    References
+    ----------
+    .. [1] Barry Brown, James Lovato, and Kathy Russell,
+           CDFLIB: Library of Fortran Routines for Cumulative Distribution
+           Functions, Inverses, and Other Parameters.
+    .. [2] Milton Abramowitz and Irene A. Stegun, eds.
+           Handbook of Mathematical Functions with Formulas,
+           Graphs, and Mathematical Tables. New York: Dover, 1972.
+    .. [3] NIST Digital Library of Mathematical Functions
+           https://dlmf.nist.gov/8.17.E24
+
+    Examples
+    --------
+    Compute the negative binomial cumulative distribution function for an
+    exemplary parameter set.
+
+    >>> import numpy as np
+    >>> from scipy.special import nbdtr, nbdtrik
+    >>> k, n, p = 5, 2, 0.5
+    >>> cdf_value = nbdtr(k, n, p)
+    >>> cdf_value
+    0.9375
+
+    Verify that `nbdtrik` recovers the original value for `k`.
+
+    >>> nbdtrik(cdf_value, n, p)
+    5.0
+
+    Plot the function for different parameter sets.
+
+    >>> import matplotlib.pyplot as plt
+    >>> p_parameters = [0.2, 0.5, 0.7, 0.5]
+    >>> n_parameters = [30, 30, 30, 80]
+    >>> linestyles = ['solid', 'dashed', 'dotted', 'dashdot']
+    >>> parameters_list = list(zip(p_parameters, n_parameters, linestyles))
+    >>> cdf_vals = np.linspace(0, 1, 1000)
+    >>> fig, ax = plt.subplots(figsize=(8, 8))
+    >>> for parameter_set in parameters_list:
+    ...     p, n, style = parameter_set
+    ...     nbdtrik_vals = nbdtrik(cdf_vals, n, p)
+    ...     ax.plot(cdf_vals, nbdtrik_vals, label=rf"$n={n},\ p={p}$",
+    ...             ls=style)
+    >>> ax.legend()
+    >>> ax.set_ylabel("$k$")
+    >>> ax.set_xlabel("$CDF$")
+    >>> ax.set_title("Negative binomial percentile function")
+    >>> plt.show()
+
+    The negative binomial distribution is also available as
+    `scipy.stats.nbinom`. The percentile function method ``ppf``
+    returns the result of `nbdtrik` rounded up to integers:
+
+    >>> from scipy.stats import nbinom
+    >>> q, n, p = 0.6, 5, 0.5
+    >>> nbinom.ppf(q, n, p), nbdtrik(q, n, p)
+    (5.0, 4.800428460273882)
+
+    """)
+
+add_newdoc("nbdtrin",
+    r"""
+    nbdtrin(k, y, p, out=None)
+
+    Inverse of `nbdtr` vs `n`.
+
+    Returns the inverse with respect to the parameter `n` of
+    ``y = nbdtr(k, n, p)``, the negative binomial cumulative distribution
+    function.
+
+    Parameters
+    ----------
+    k : array_like
+        The maximum number of allowed failures (nonnegative int).
+    y : array_like
+        The probability of `k` or fewer failures before `n` successes (float).
+    p : array_like
+        Probability of success in a single event (float).
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    n : scalar or ndarray
+        The number of successes `n` such that `nbdtr(k, n, p) = y`.
+
+    See Also
+    --------
+    nbdtr : Cumulative distribution function of the negative binomial.
+    nbdtri : Inverse with respect to `p` of `nbdtr(k, n, p)`.
+    nbdtrik : Inverse with respect to `k` of `nbdtr(k, n, p)`.
+
+    Notes
+    -----
+    Wrapper for the CDFLIB [1]_ Fortran routine `cdfnbn`.
+
+    Formula 26.5.26 of [2]_ or [3]_,
+
+    .. math::
+        \sum_{j=k + 1}^\infty {{n + j - 1}
+        \choose{j}} p^n (1 - p)^j = I_{1 - p}(k + 1, n),
+
+    is used to reduce calculation of the cumulative distribution function to
+    that of a regularized incomplete beta :math:`I`.
+
+    Computation of `n` involves a search for a value that produces the desired
+    value of `y`.  The search relies on the monotonicity of `y` with `n`.
+
+    References
+    ----------
+    .. [1] Barry Brown, James Lovato, and Kathy Russell,
+           CDFLIB: Library of Fortran Routines for Cumulative Distribution
+           Functions, Inverses, and Other Parameters.
+    .. [2] Milton Abramowitz and Irene A. Stegun, eds.
+           Handbook of Mathematical Functions with Formulas,
+           Graphs, and Mathematical Tables. New York: Dover, 1972.
+    .. [3] NIST Digital Library of Mathematical Functions
+           https://dlmf.nist.gov/8.17.E24
+
+    Examples
+    --------
+    Compute the negative binomial cumulative distribution function for an
+    exemplary parameter set.
+
+    >>> from scipy.special import nbdtr, nbdtrin
+    >>> k, n, p = 5, 2, 0.5
+    >>> cdf_value = nbdtr(k, n, p)
+    >>> cdf_value
+    0.9375
+
+    Verify that `nbdtrin` recovers the original value for `n` up to floating
+    point accuracy.
+
+    >>> nbdtrin(k, cdf_value, p)
+    1.999999999998137
+    """)
+
+add_newdoc("ncfdtr",
+    r"""
+    ncfdtr(dfn, dfd, nc, f, out=None)
+
+    Cumulative distribution function of the non-central F distribution.
+
+    The non-central F describes the distribution of,
+
+    .. math::
+        Z = \frac{X/d_n}{Y/d_d}
+
+    where :math:`X` and :math:`Y` are independently distributed, with
+    :math:`X` distributed non-central :math:`\chi^2` with noncentrality
+    parameter `nc` and :math:`d_n` degrees of freedom, and :math:`Y`
+    distributed :math:`\chi^2` with :math:`d_d` degrees of freedom.
+
+    Parameters
+    ----------
+    dfn : array_like
+        Degrees of freedom of the numerator sum of squares.  Range (0, inf).
+    dfd : array_like
+        Degrees of freedom of the denominator sum of squares.  Range (0, inf).
+    nc : array_like
+        Noncentrality parameter.  Range [0, inf).
+    f : array_like
+        Quantiles, i.e. the upper limit of integration.
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    cdf : scalar or ndarray
+        The calculated CDF.  If all inputs are scalar, the return will be a
+        float.  Otherwise it will be an array.
+
+    See Also
+    --------
+    ncfdtri : Quantile function; inverse of `ncfdtr` with respect to `f`.
+    ncfdtridfd : Inverse of `ncfdtr` with respect to `dfd`.
+    ncfdtridfn : Inverse of `ncfdtr` with respect to `dfn`.
+    ncfdtrinc : Inverse of `ncfdtr` with respect to `nc`.
+    scipy.stats.ncf : Non-central F distribution.
+
+    Notes
+    -----
+    This function calculates the CDF of the non-central F distribution using
+    the Boost Math C++ library [1]_.
+
+    The cumulative distribution function is computed using Formula 26.6.20 of
+    [2]_:
+
+    .. math::
+        F(d_n, d_d, n_c, f) = \sum_{j=0}^\infty e^{-n_c/2}
+        \frac{(n_c/2)^j}{j!} I_{x}(\frac{d_n}{2} + j, \frac{d_d}{2}),
+
+    where :math:`I` is the regularized incomplete beta function, and
+    :math:`x = f d_n/(f d_n + d_d)`.
+
+    Note that argument order of `ncfdtr` is different from that of the
+    similar ``cdf`` method of `scipy.stats.ncf`: `f` is the last
+    parameter of `ncfdtr` but the first parameter of ``scipy.stats.ncf.cdf``.
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+    .. [2] Milton Abramowitz and Irene A. Stegun, eds.
+           Handbook of Mathematical Functions with Formulas,
+           Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import special
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+
+    Plot the CDF of the non-central F distribution, for nc=0.  Compare with the
+    F-distribution from scipy.stats:
+
+    >>> x = np.linspace(-1, 8, num=500)
+    >>> dfn = 3
+    >>> dfd = 2
+    >>> ncf_stats = stats.f.cdf(x, dfn, dfd)
+    >>> ncf_special = special.ncfdtr(dfn, dfd, 0, x)
+
+    >>> fig = plt.figure()
+    >>> ax = fig.add_subplot(111)
+    >>> ax.plot(x, ncf_stats, 'b-', lw=3)
+    >>> ax.plot(x, ncf_special, 'r-')
+    >>> plt.show()
+
+    """)
+
+add_newdoc("ncfdtri",
+    """
+    ncfdtri(dfn, dfd, nc, p, out=None)
+
+    Inverse with respect to `f` of the CDF of the non-central F distribution.
+
+    See `ncfdtr` for more details.
+
+    Parameters
+    ----------
+    dfn : array_like
+        Degrees of freedom of the numerator sum of squares.  Range (0, inf).
+    dfd : array_like
+        Degrees of freedom of the denominator sum of squares.  Range (0, inf).
+    nc : array_like
+        Noncentrality parameter.  Range [0, inf).
+    p : array_like
+        Value of the cumulative distribution function.  Must be in the
+        range [0, 1].
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    f : scalar or ndarray
+        Quantiles, i.e., the upper limit of integration.
+
+    See Also
+    --------
+    ncfdtr : CDF of the non-central F distribution.
+    ncfdtridfd : Inverse of `ncfdtr` with respect to `dfd`.
+    ncfdtridfn : Inverse of `ncfdtr` with respect to `dfn`.
+    ncfdtrinc : Inverse of `ncfdtr` with respect to `nc`.
+    scipy.stats.ncf : Non-central F distribution.
+
+    Notes
+    -----
+    This function calculates the quantile of the non-central F distribution
+    using the Boost Math C++ library [1]_.
+
+    Note that argument order of `ncfdtri` is different from that of the
+    similar ``ppf`` method of `scipy.stats.ncf`. `p` is the last parameter
+    of `ncfdtri` but the first parameter of ``scipy.stats.ncf.ppf``.
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    Examples
+    --------
+    >>> from scipy.special import ncfdtr, ncfdtri
+
+    Compute the CDF for several values of `f`:
+
+    >>> f = [0.5, 1, 1.5]
+    >>> p = ncfdtr(2, 3, 1.5, f)
+    >>> p
+    array([ 0.20782291,  0.36107392,  0.47345752])
+
+    Compute the inverse.  We recover the values of `f`, as expected:
+
+    >>> ncfdtri(2, 3, 1.5, p)
+    array([ 0.5,  1. ,  1.5])
+
+    """)
+
+add_newdoc("ncfdtridfd",
+    """
+    ncfdtridfd(dfn, p, nc, f, out=None)
+
+    Calculate degrees of freedom (denominator) for the noncentral F-distribution.
+
+    This is the inverse with respect to `dfd` of `ncfdtr`.
+    See `ncfdtr` for more details.
+
+    Parameters
+    ----------
+    dfn : array_like
+        Degrees of freedom of the numerator sum of squares.  Range (0, inf).
+    p : array_like
+        Value of the cumulative distribution function.  Must be in the
+        range [0, 1].
+    nc : array_like
+        Noncentrality parameter.  Should be in range (0, 1e4).
+    f : array_like
+        Quantiles, i.e., the upper limit of integration.
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    dfd : scalar or ndarray
+        Degrees of freedom of the denominator sum of squares.
+
+    See Also
+    --------
+    ncfdtr : CDF of the non-central F distribution.
+    ncfdtri : Quantile function; inverse of `ncfdtr` with respect to `f`.
+    ncfdtridfn : Inverse of `ncfdtr` with respect to `dfn`.
+    ncfdtrinc : Inverse of `ncfdtr` with respect to `nc`.
+
+    Notes
+    -----
+    The value of the cumulative noncentral F distribution is not necessarily
+    monotone in either degrees of freedom. There thus may be two values that
+    provide a given CDF value. This routine assumes monotonicity and will
+    find an arbitrary one of the two values.
+
+    Examples
+    --------
+    >>> from scipy.special import ncfdtr, ncfdtridfd
+
+    Compute the CDF for several values of `dfd`:
+
+    >>> dfd = [1, 2, 3]
+    >>> p = ncfdtr(2, dfd, 0.25, 15)
+    >>> p
+    array([ 0.8097138 ,  0.93020416,  0.96787852])
+
+    Compute the inverse.  We recover the values of `dfd`, as expected:
+
+    >>> ncfdtridfd(2, p, 0.25, 15)
+    array([ 1.,  2.,  3.])
+
+    """)
+
+add_newdoc("ncfdtridfn",
+    """
+    ncfdtridfn(p, dfd, nc, f, out=None)
+
+    Calculate degrees of freedom (numerator) for the noncentral F-distribution.
+
+    This is the inverse with respect to `dfn` of `ncfdtr`.
+    See `ncfdtr` for more details.
+
+    Parameters
+    ----------
+    p : array_like
+        Value of the cumulative distribution function. Must be in the
+        range [0, 1].
+    dfd : array_like
+        Degrees of freedom of the denominator sum of squares. Range (0, inf).
+    nc : array_like
+        Noncentrality parameter.  Should be in range (0, 1e4).
+    f : array_like
+        Quantiles, i.e., the upper limit of integration.
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    dfn : scalar or ndarray
+        Degrees of freedom of the numerator sum of squares.
+
+    See Also
+    --------
+    ncfdtr : CDF of the non-central F distribution.
+    ncfdtri : Quantile function; inverse of `ncfdtr` with respect to `f`.
+    ncfdtridfd : Inverse of `ncfdtr` with respect to `dfd`.
+    ncfdtrinc : Inverse of `ncfdtr` with respect to `nc`.
+
+    Notes
+    -----
+    The value of the cumulative noncentral F distribution is not necessarily
+    monotone in either degrees of freedom. There thus may be two values that
+    provide a given CDF value. This routine assumes monotonicity and will
+    find an arbitrary one of the two values.
+
+    Examples
+    --------
+    >>> from scipy.special import ncfdtr, ncfdtridfn
+
+    Compute the CDF for several values of `dfn`:
+
+    >>> dfn = [1, 2, 3]
+    >>> p = ncfdtr(dfn, 2, 0.25, 15)
+    >>> p
+    array([ 0.92562363,  0.93020416,  0.93188394])
+
+    Compute the inverse. We recover the values of `dfn`, as expected:
+
+    >>> ncfdtridfn(p, 2, 0.25, 15)
+    array([ 1.,  2.,  3.])
+
+    """)
+
+add_newdoc("ncfdtrinc",
+    """
+    ncfdtrinc(dfn, dfd, p, f, out=None)
+
+    Calculate non-centrality parameter for non-central F distribution.
+
+    This is the inverse with respect to `nc` of `ncfdtr`.
+    See `ncfdtr` for more details.
+
+    Parameters
+    ----------
+    dfn : array_like
+        Degrees of freedom of the numerator sum of squares. Range (0, inf).
+    dfd : array_like
+        Degrees of freedom of the denominator sum of squares. Range (0, inf).
+    p : array_like
+        Value of the cumulative distribution function. Must be in the
+        range [0, 1].
+    f : array_like
+        Quantiles, i.e., the upper limit of integration.
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    nc : scalar or ndarray
+        Noncentrality parameter.
+
+    See Also
+    --------
+    ncfdtr : CDF of the non-central F distribution.
+    ncfdtri : Quantile function; inverse of `ncfdtr` with respect to `f`.
+    ncfdtridfd : Inverse of `ncfdtr` with respect to `dfd`.
+    ncfdtridfn : Inverse of `ncfdtr` with respect to `dfn`.
+
+    Examples
+    --------
+    >>> from scipy.special import ncfdtr, ncfdtrinc
+
+    Compute the CDF for several values of `nc`:
+
+    >>> nc = [0.5, 1.5, 2.0]
+    >>> p = ncfdtr(2, 3, nc, 15)
+    >>> p
+    array([ 0.96309246,  0.94327955,  0.93304098])
+
+    Compute the inverse. We recover the values of `nc`, as expected:
+
+    >>> ncfdtrinc(2, 3, p, 15)
+    array([ 0.5,  1.5,  2. ])
+
+    """)
+
+add_newdoc("nctdtr",
+    """
+    nctdtr(df, nc, t, out=None)
+
+    Cumulative distribution function of the non-central `t` distribution.
+
+    Parameters
+    ----------
+    df : array_like
+        Degrees of freedom of the distribution. Should be in range (0, inf).
+    nc : array_like
+        Noncentrality parameter.
+    t : array_like
+        Quantiles, i.e., the upper limit of integration.
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    cdf : scalar or ndarray
+        The calculated CDF. If all inputs are scalar, the return will be a
+        float. Otherwise, it will be an array.
+
+    See Also
+    --------
+    nctdtrit : Inverse CDF (iCDF) of the non-central t distribution.
+    nctdtridf : Calculate degrees of freedom, given CDF and iCDF values.
+    nctdtrinc : Calculate non-centrality parameter, given CDF and iCDF values.
+
+    Notes
+    -----
+    This function calculates the CDF of the non-central t distribution using
+    the Boost Math C++ library [1]_.
+
+    Note that the argument order of `nctdtr` is different from that of the
+    similar ``cdf`` method of `scipy.stats.nct`: `t` is the last
+    parameter of `nctdtr` but the first parameter of ``scipy.stats.nct.cdf``.
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import special
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+
+    Plot the CDF of the non-central t distribution, for nc=0. Compare with the
+    t-distribution from scipy.stats:
+
+    >>> x = np.linspace(-5, 5, num=500)
+    >>> df = 3
+    >>> nct_stats = stats.t.cdf(x, df)
+    >>> nct_special = special.nctdtr(df, 0, x)
+
+    >>> fig = plt.figure()
+    >>> ax = fig.add_subplot(111)
+    >>> ax.plot(x, nct_stats, 'b-', lw=3)
+    >>> ax.plot(x, nct_special, 'r-')
+    >>> plt.show()
+
+    """)
+
+add_newdoc("nctdtridf",
+    """
+    nctdtridf(p, nc, t, out=None)
+
+    Calculate degrees of freedom for non-central t distribution.
+
+    See `nctdtr` for more details.
+
+    Parameters
+    ----------
+    p : array_like
+        CDF values, in range (0, 1].
+    nc : array_like
+        Noncentrality parameter. Should be in range (-1e6, 1e6).
+    t : array_like
+        Quantiles, i.e., the upper limit of integration.
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    df : scalar or ndarray
+        The degrees of freedom. If all inputs are scalar, the return will be a
+        float. Otherwise, it will be an array.
+
+    See Also
+    --------
+    nctdtr :  CDF of the non-central `t` distribution.
+    nctdtrit : Inverse CDF (iCDF) of the non-central t distribution.
+    nctdtrinc : Calculate non-centrality parameter, given CDF and iCDF values.
+
+    Examples
+    --------
+    >>> from scipy.special import nctdtr, nctdtridf
+
+    Compute the CDF for several values of `df`:
+
+    >>> df = [1, 2, 3]
+    >>> p = nctdtr(df, 0.25, 1)
+    >>> p
+    array([0.67491974, 0.716464  , 0.73349456])
+
+    Compute the inverse. We recover the values of `df`, as expected:
+
+    >>> nctdtridf(p, 0.25, 1)
+    array([1., 2., 3.])
+
+    """)
+
+add_newdoc("nctdtrinc",
+    """
+    nctdtrinc(df, p, t, out=None)
+
+    Calculate non-centrality parameter for non-central t distribution.
+
+    See `nctdtr` for more details.
+
+    Parameters
+    ----------
+    df : array_like
+        Degrees of freedom of the distribution. Should be in range (0, inf).
+    p : array_like
+        CDF values, in range (0, 1].
+    t : array_like
+        Quantiles, i.e., the upper limit of integration.
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    nc : scalar or ndarray
+        Noncentrality parameter
+
+    See Also
+    --------
+    nctdtr :  CDF of the non-central `t` distribution.
+    nctdtrit : Inverse CDF (iCDF) of the non-central t distribution.
+    nctdtridf : Calculate degrees of freedom, given CDF and iCDF values.
+
+    Examples
+    --------
+    >>> from scipy.special import nctdtr, nctdtrinc
+
+    Compute the CDF for several values of `nc`:
+
+    >>> nc = [0.5, 1.5, 2.5]
+    >>> p = nctdtr(3, nc, 1.5)
+    >>> p
+    array([0.77569497, 0.45524533, 0.1668691 ])
+
+    Compute the inverse. We recover the values of `nc`, as expected:
+
+    >>> nctdtrinc(3, p, 1.5)
+    array([0.5, 1.5, 2.5])
+
+    """)
+
+add_newdoc("nctdtrit",
+    """
+    nctdtrit(df, nc, p, out=None)
+
+    Inverse cumulative distribution function of the non-central t distribution.
+
+    See `nctdtr` for more details.
+
+    Parameters
+    ----------
+    df : array_like
+        Degrees of freedom of the distribution. Should be in range (0, inf).
+    nc : array_like
+        Noncentrality parameter.
+    p : array_like
+        CDF values, in range (0, 1].
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    t : scalar or ndarray
+        Quantiles
+
+    See Also
+    --------
+    nctdtr :  CDF of the non-central `t` distribution.
+    nctdtridf : Calculate degrees of freedom, given CDF and iCDF values.
+    nctdtrinc : Calculate non-centrality parameter, given CDF and iCDF values.
+
+    Notes
+    -----
+    This function calculates the quantile of the non-central t distribution using
+    the Boost Math C++ library [1]_.
+
+    Note that the argument order of `nctdtrit` is different from that of the
+    similar ``ppf`` method of `scipy.stats.nct`: `p` is the last
+    parameter of `nctdtrit` but the first parameter of ``scipy.stats.nct.ppf``.
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    Examples
+    --------
+    >>> from scipy.special import nctdtr, nctdtrit
+
+    Compute the CDF for several values of `t`:
+
+    >>> t = [0.5, 1, 1.5]
+    >>> p = nctdtr(3, 1, t)
+    >>> p
+    array([0.29811049, 0.46922687, 0.6257559 ])
+
+    Compute the inverse. We recover the values of `t`, as expected:
+
+    >>> nctdtrit(3, 1, p)
+    array([0.5, 1. , 1.5])
+
+    """)
+
+add_newdoc("nrdtrimn",
+    """
+    nrdtrimn(p, std, x, out=None)
+
+    Calculate mean of normal distribution given other params.
+
+    Parameters
+    ----------
+    p : array_like
+        CDF values, in range (0, 1].
+    std : array_like
+        Standard deviation.
+    x : array_like
+        Quantiles, i.e. the upper limit of integration.
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    mn : scalar or ndarray
+        The mean of the normal distribution.
+
+    See Also
+    --------
+    scipy.stats.norm : Normal distribution
+    ndtr : Standard normal cumulative probability distribution
+    ndtri : Inverse of standard normal CDF with respect to quantile
+    nrdtrisd : Inverse of normal distribution CDF with respect to
+               standard deviation
+
+    Examples
+    --------
+    `nrdtrimn` can be used to recover the mean of a normal distribution
+    if we know the CDF value `p` for a given quantile `x` and the
+    standard deviation `std`. First, we calculate
+    the normal distribution CDF for an exemplary parameter set.
+
+    >>> from scipy.stats import norm
+    >>> mean = 3.
+    >>> std = 2.
+    >>> x = 6.
+    >>> p = norm.cdf(x, loc=mean, scale=std)
+    >>> p
+    0.9331927987311419
+
+    Verify that `nrdtrimn` returns the original value for `mean`.
+
+    >>> from scipy.special import nrdtrimn
+    >>> nrdtrimn(p, std, x)
+    3.0000000000000004
+
+    """)
+
+add_newdoc("nrdtrisd",
+    """
+    nrdtrisd(mn, p, x, out=None)
+
+    Calculate standard deviation of normal distribution given other params.
+
+    Parameters
+    ----------
+    mn : array_like
+        The mean of the normal distribution.
+    p : array_like
+        CDF values, in range (0, 1].
+    x : array_like
+        Quantiles, i.e. the upper limit of integration.
+
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    std : scalar or ndarray
+        Standard deviation.
+
+    See Also
+    --------
+    scipy.stats.norm : Normal distribution
+    ndtr : Standard normal cumulative probability distribution
+    ndtri : Inverse of standard normal CDF with respect to quantile
+    nrdtrimn : Inverse of normal distribution CDF with respect to
+               mean
+
+    Examples
+    --------
+    `nrdtrisd` can be used to recover the standard deviation of a normal
+    distribution if we know the CDF value `p` for a given quantile `x` and
+    the mean `mn`. First, we calculate the normal distribution CDF for an
+    exemplary parameter set.
+
+    >>> from scipy.stats import norm
+    >>> mean = 3.
+    >>> std = 2.
+    >>> x = 6.
+    >>> p = norm.cdf(x, loc=mean, scale=std)
+    >>> p
+    0.9331927987311419
+
+    Verify that `nrdtrisd` returns the original value for `std`.
+
+    >>> from scipy.special import nrdtrisd
+    >>> nrdtrisd(mean, p, x)
+    2.0000000000000004
+
+    """)
+
+add_newdoc("ndtri",
+    """
+    ndtri(y, out=None)
+
+    Inverse of `ndtr` with respect to `x`.
+
+    Returns the argument `x` for which the area under the standard normal
+    probability density function (integrated from minus infinity to `x`)
+    is equal to `y`.
+
+    Parameters
+    ----------
+    y : array_like
+        Probability
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    x : scalar or ndarray
+        Value of x such that ``ndtr(x) == y``.
+
+    See Also
+    --------
+    ndtr : Standard normal cumulative probability distribution
+    ndtri_exp : Inverse of log_ndtr
+
+    Examples
+    --------
+    `ndtri` is the percentile function of the standard normal distribution.
+    This means it returns the inverse of the cumulative distribution function
+    `ndtr`. First, let us compute a cumulative distribution value.
+
+    >>> import numpy as np
+    >>> from scipy.special import ndtri, ndtr
+    >>> cdf_val = ndtr(2)
+    >>> cdf_val
+    0.9772498680518208
+
+    Verify that `ndtri` yields the original value for `x` up to floating point
+    errors.
+
+    >>> ndtri(cdf_val)
+    2.0000000000000004
+
+    Plot the function. For that purpose, we provide a NumPy array as argument.
+
+    >>> import matplotlib.pyplot as plt
+    >>> x = np.linspace(0.01, 1, 200)
+    >>> fig, ax = plt.subplots()
+    >>> ax.plot(x, ndtri(x))
+    >>> ax.set_title("Standard normal percentile function")
+    >>> plt.show()
+    """)
+
+add_newdoc("pdtr",
+    r"""
+    pdtr(k, m, out=None)
+
+    Poisson cumulative distribution function.
+
+    Defined as the probability that a Poisson-distributed random
+    variable with event rate :math:`m` is less than or equal to
+    :math:`k`. More concretely, this works out to be [1]_
+
+    .. math::
+
+       \exp(-m) \sum_{j = 0}^{\lfloor{k}\rfloor} \frac{m^j}{j!}.
+
+    Parameters
+    ----------
+    k : array_like
+        Number of occurrences (nonnegative, real)
+    m : array_like
+        Shape parameter (nonnegative, real)
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    scalar or ndarray
+        Values of the Poisson cumulative distribution function
+
+    See Also
+    --------
+    pdtrc : Poisson survival function
+    pdtrik : inverse of `pdtr` with respect to `k`
+    pdtri : inverse of `pdtr` with respect to `m`
+
+    References
+    ----------
+    .. [1] https://en.wikipedia.org/wiki/Poisson_distribution
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import scipy.special as sc
+
+    It is a cumulative distribution function, so it converges to 1
+    monotonically as `k` goes to infinity.
+
+    >>> sc.pdtr([1, 10, 100, np.inf], 1)
+    array([0.73575888, 0.99999999, 1.        , 1.        ])
+
+    It is discontinuous at integers and constant between integers.
+
+    >>> sc.pdtr([1, 1.5, 1.9, 2], 1)
+    array([0.73575888, 0.73575888, 0.73575888, 0.9196986 ])
+
+    """)
+
+add_newdoc("pdtrc",
+    """
+    pdtrc(k, m, out=None)
+
+    Poisson survival function
+
+    Returns the sum of the terms from k+1 to infinity of the Poisson
+    distribution: sum(exp(-m) * m**j / j!, j=k+1..inf) = gammainc(
+    k+1, m). Arguments must both be non-negative doubles.
+
+    Parameters
+    ----------
+    k : array_like
+        Number of occurrences (nonnegative, real)
+    m : array_like
+        Shape parameter (nonnegative, real)
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    scalar or ndarray
+        Values of the Poisson survival function
+
+    See Also
+    --------
+    pdtr : Poisson cumulative distribution function
+    pdtrik : inverse of `pdtr` with respect to `k`
+    pdtri : inverse of `pdtr` with respect to `m`
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import scipy.special as sc
+
+    It is a survival function, so it decreases to 0
+    monotonically as `k` goes to infinity.
+
+    >>> k = np.array([1, 10, 100, np.inf])
+    >>> sc.pdtrc(k, 1)
+    array([2.64241118e-001, 1.00477664e-008, 3.94147589e-161, 0.00000000e+000])
+
+    It can be expressed in terms of the lower incomplete gamma
+    function `gammainc`.
+
+    >>> sc.gammainc(k + 1, 1)
+    array([2.64241118e-001, 1.00477664e-008, 3.94147589e-161, 0.00000000e+000])
+
+    """)
+
+add_newdoc("pdtri",
+    """
+    pdtri(k, y, out=None)
+
+    Inverse to `pdtr` vs m
+
+    Returns the Poisson variable `m` such that the sum from 0 to `k` of
+    the Poisson density is equal to the given probability `y`:
+    calculated by ``gammainccinv(k + 1, y)``. `k` must be a nonnegative
+    integer and `y` between 0 and 1.
+
+    Parameters
+    ----------
+    k : array_like
+        Number of occurrences (nonnegative, real)
+    y : array_like
+        Probability
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    scalar or ndarray
+        Values of the shape parameter `m` such that ``pdtr(k, m) = y``
+
+    See Also
+    --------
+    pdtr : Poisson cumulative distribution function
+    pdtrc : Poisson survival function
+    pdtrik : inverse of `pdtr` with respect to `k`
+
+    Examples
+    --------
+    >>> import scipy.special as sc
+
+    Compute the CDF for several values of `m`:
+
+    >>> m = [0.5, 1, 1.5]
+    >>> p = sc.pdtr(1, m)
+    >>> p
+    array([0.90979599, 0.73575888, 0.5578254 ])
+
+    Compute the inverse. We recover the values of `m`, as expected:
+
+    >>> sc.pdtri(1, p)
+    array([0.5, 1. , 1.5])
+
+    """)
+
+add_newdoc("pdtrik",
+    """
+    pdtrik(p, m, out=None)
+
+    Inverse to `pdtr` vs `k`.
+
+    Parameters
+    ----------
+    p : array_like
+        Probability
+    m : array_like
+        Shape parameter (nonnegative, real)
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    scalar or ndarray
+        The number of occurrences `k` such that ``pdtr(k, m) = p``
+
+    Notes
+    -----
+    This function relies on the ``gamma_q_inva`` function from the Boost
+    Math C++ library [1]_.
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    See Also
+    --------
+    pdtr : Poisson cumulative distribution function
+    pdtrc : Poisson survival function
+    pdtri : inverse of `pdtr` with respect to `m`
+
+    Examples
+    --------
+    >>> import scipy.special as sc
+
+    Compute the CDF for several values of `k`:
+
+    >>> k = [1, 2, 3]
+    >>> p = sc.pdtr(k, 2)
+    >>> p
+    array([0.40600585, 0.67667642, 0.85712346])
+
+    Compute the inverse. We recover the values of `k`, as expected:
+
+    >>> sc.pdtrik(p, 2)
+    array([1., 2., 3.])
+
+    """)
+
+add_newdoc("poch",
+    r"""
+    poch(z, m, out=None)
+
+    Pochhammer symbol.
+
+    The Pochhammer symbol (rising factorial) is defined as
+
+    .. math::
+
+        (z)_m = \frac{\Gamma(z + m)}{\Gamma(z)}
+
+    For positive integer `m` it reads
+
+    .. math::
+
+        (z)_m = z (z + 1) ... (z + m - 1)
+
+    See [DLMF]_ for more details.
+
+    Parameters
+    ----------
+    z, m : array_like
+        Real-valued arguments.
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    scalar or ndarray
+        The value of the function.
+
+    References
+    ----------
+    .. [DLMF] Nist, Digital Library of Mathematical Functions
+        https://dlmf.nist.gov/5.2#iii
+
+    Examples
+    --------
+    >>> import scipy.special as sc
+
+    It is 1 when m is 0.
+
+    >>> sc.poch([1, 2, 3, 4], 0)
+    array([1., 1., 1., 1.])
+
+    For z equal to 1 it reduces to the factorial function.
+
+    >>> sc.poch(1, 5)
+    120.0
+    >>> 1 * 2 * 3 * 4 * 5
+    120
+
+    It can be expressed in terms of the gamma function.
+
+    >>> z, m = 3.7, 2.1
+    >>> sc.poch(z, m)
+    20.529581933776953
+    >>> sc.gamma(z + m) / sc.gamma(z)
+    20.52958193377696
+
+    """)
+
+add_newdoc("powm1", """
+    powm1(x, y, out=None)
+
+    Computes ``x**y - 1``.
+
+    This function is useful when `y` is near 0, or when `x` is near 1.
+
+    The function is implemented for real types only (unlike ``numpy.power``,
+    which accepts complex inputs).
+
+    Parameters
+    ----------
+    x : array_like
+        The base. Must be a real type (i.e. integer or float, not complex).
+    y : array_like
+        The exponent. Must be a real type (i.e. integer or float, not complex).
+
+    Returns
+    -------
+    array_like
+        Result of the calculation
+
+    Notes
+    -----
+    .. versionadded:: 1.10.0
+
+    The underlying code is implemented for single precision and double
+    precision floats only.  Unlike `numpy.power`, integer inputs to
+    `powm1` are converted to floating point, and complex inputs are
+    not accepted.
+
+    Note the following edge cases:
+
+    * ``powm1(x, 0)`` returns 0 for any ``x``, including 0, ``inf``
+      and ``nan``.
+    * ``powm1(1, y)`` returns 0 for any ``y``, including ``nan``
+      and ``inf``.
+
+    This function wraps the ``powm1`` routine from the
+    Boost Math C++ library [1]_.
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.special import powm1
+
+    >>> x = np.array([1.2, 10.0, 0.9999999975])
+    >>> y = np.array([1e-9, 1e-11, 0.1875])
+    >>> powm1(x, y)
+    array([ 1.82321557e-10,  2.30258509e-11, -4.68749998e-10])
+
+    It can be verified that the relative errors in those results
+    are less than 2.5e-16.
+
+    Compare that to the result of ``x**y - 1``, where the
+    relative errors are all larger than 8e-8:
+
+    >>> x**y - 1
+    array([ 1.82321491e-10,  2.30258035e-11, -4.68750039e-10])
+
+    """)
+
+
+add_newdoc("pseudo_huber",
+    r"""
+    pseudo_huber(delta, r, out=None)
+
+    Pseudo-Huber loss function.
+
+    .. math:: \mathrm{pseudo\_huber}(\delta, r) =
+              \delta^2 \left( \sqrt{ 1 + \left( \frac{r}{\delta} \right)^2 } - 1 \right)
+
+    Parameters
+    ----------
+    delta : array_like
+        Input array, indicating the soft quadratic vs. linear loss changepoint.
+    r : array_like
+        Input array, possibly representing residuals.
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    res : scalar or ndarray
+        The computed Pseudo-Huber loss function values.
+
+    See Also
+    --------
+    huber: Similar function which this function approximates
+
+    Notes
+    -----
+    Like `huber`, `pseudo_huber` often serves as a robust loss function
+    in statistics or machine learning to reduce the influence of outliers.
+    Unlike `huber`, `pseudo_huber` is smooth.
+
+    Typically, `r` represents residuals, the difference
+    between a model prediction and data. Then, for :math:`|r|\leq\delta`,
+    `pseudo_huber` resembles the squared error and for :math:`|r|>\delta` the
+    absolute error. This way, the Pseudo-Huber loss often achieves
+    a fast convergence in model fitting for small residuals like the squared
+    error loss function and still reduces the influence of outliers
+    (:math:`|r|>\delta`) like the absolute error loss. As :math:`\delta` is
+    the cutoff between squared and absolute error regimes, it has
+    to be tuned carefully for each problem. `pseudo_huber` is also
+    convex, making it suitable for gradient based optimization. [1]_ [2]_
+
+    .. versionadded:: 0.15.0
+
+    References
+    ----------
+    .. [1] Hartley, Zisserman, "Multiple View Geometry in Computer Vision".
+           2003. Cambridge University Press. p. 619
+    .. [2] Charbonnier et al. "Deterministic edge-preserving regularization
+           in computed imaging". 1997. IEEE Trans. Image Processing.
+           6 (2): 298 - 311.
+
+    Examples
+    --------
+    Import all necessary modules.
+
+    >>> import numpy as np
+    >>> from scipy.special import pseudo_huber, huber
+    >>> import matplotlib.pyplot as plt
+
+    Calculate the function for ``delta=1`` at ``r=2``.
+
+    >>> pseudo_huber(1., 2.)
+    1.2360679774997898
+
+    Calculate the function at ``r=3`` for different `delta` by providing
+    a list or NumPy array for `delta`.
+
+    >>> pseudo_huber([1., 2., 4.], 3.)
+    array([2.16227766, 3.21110255, 4.        ])
+
+    Calculate the function for ``delta=2`` at several points by providing
+    a list or NumPy array for `r`.
+
+    >>> pseudo_huber(2., np.array([1., 1.5, 3., 4.]))
+    array([0.47213595, 1.        , 3.21110255, 4.94427191])
+
+    The function can be calculated for different `delta` and `r` by
+    providing arrays for both with compatible shapes for broadcasting.
+
+    >>> r = np.array([1., 2.5, 8., 10.])
+    >>> deltas = np.array([[1.], [5.], [9.]])
+    >>> print(r.shape, deltas.shape)
+    (4,) (3, 1)
+
+    >>> pseudo_huber(deltas, r)
+    array([[ 0.41421356,  1.6925824 ,  7.06225775,  9.04987562],
+           [ 0.49509757,  2.95084972, 22.16990566, 30.90169944],
+           [ 0.49846624,  3.06693762, 27.37435121, 40.08261642]])
+
+    Plot the function for different `delta`.
+
+    >>> x = np.linspace(-4, 4, 500)
+    >>> deltas = [1, 2, 3]
+    >>> linestyles = ["dashed", "dotted", "dashdot"]
+    >>> fig, ax = plt.subplots()
+    >>> combined_plot_parameters = list(zip(deltas, linestyles))
+    >>> for delta, style in combined_plot_parameters:
+    ...     ax.plot(x, pseudo_huber(delta, x), label=rf"$\delta={delta}$",
+    ...             ls=style)
+    >>> ax.legend(loc="upper center")
+    >>> ax.set_xlabel("$x$")
+    >>> ax.set_title(r"Pseudo-Huber loss function $h_{\delta}(x)$")
+    >>> ax.set_xlim(-4, 4)
+    >>> ax.set_ylim(0, 8)
+    >>> plt.show()
+
+    Finally, illustrate the difference between `huber` and `pseudo_huber` by
+    plotting them and their gradients with respect to `r`. The plot shows
+    that the gradient of `pseudo_huber` is smooth everywhere, while the
+    gradient of `huber` is continuous but has kinks (is not differentiable)
+    at the points :math:`\pm\delta`.
+
+    >>> def huber_grad(delta, x):
+    ...     grad = np.copy(x)
+    ...     linear_area = np.argwhere(np.abs(x) > delta)
+    ...     grad[linear_area]=delta*np.sign(x[linear_area])
+    ...     return grad
+    >>> def pseudo_huber_grad(delta, x):
+    ...     return x* (1+(x/delta)**2)**(-0.5)
+    >>> x=np.linspace(-3, 3, 500)
+    >>> delta = 1.
+    >>> fig, ax = plt.subplots(figsize=(7, 7))
+    >>> ax.plot(x, huber(delta, x), label="Huber", ls="dashed")
+    >>> ax.plot(x, huber_grad(delta, x), label="Huber Gradient", ls="dashdot")
+    >>> ax.plot(x, pseudo_huber(delta, x), label="Pseudo-Huber", ls="dotted")
+    >>> ax.plot(x, pseudo_huber_grad(delta, x), label="Pseudo-Huber Gradient",
+    ...         ls="solid")
+    >>> ax.legend(loc="upper center")
+    >>> plt.show()
+    """)
+
+add_newdoc("rel_entr",
+    r"""
+    rel_entr(x, y, out=None)
+
+    Elementwise function for computing relative entropy.
+
+    .. math::
+
+        \mathrm{rel\_entr}(x, y) =
+            \begin{cases}
+                x \log(x / y) & x > 0, y > 0 \\
+                0 & x = 0, y \ge 0 \\
+                \infty & \text{otherwise}
+            \end{cases}
+
+    Parameters
+    ----------
+    x, y : array_like
+        Input arrays
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    scalar or ndarray
+        Relative entropy of the inputs
+
+    See Also
+    --------
+    entr, kl_div, scipy.stats.entropy
+
+    Notes
+    -----
+    .. versionadded:: 0.15.0
+
+    This function is jointly convex in x and y.
+
+    The origin of this function is in convex programming; see
+    [1]_. Given two discrete probability distributions :math:`p_1,
+    \ldots, p_n` and :math:`q_1, \ldots, q_n`, the definition of relative
+    entropy in the context of *information theory* is
+
+    .. math::
+
+        \sum_{i = 1}^n \mathrm{rel\_entr}(p_i, q_i).
+
+    To compute the latter quantity, use `scipy.stats.entropy`.
+
+    See [2]_ for details.
+
+    References
+    ----------
+    .. [1] Boyd, Stephen and Lieven Vandenberghe. *Convex optimization*.
+           Cambridge University Press, 2004.
+           :doi:`10.1017/CBO9780511804441`
+    .. [2] Kullback-Leibler divergence,
+           https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
+
+    """)
+
+add_newdoc("round",
+    """
+    round(x, out=None)
+
+    Round to the nearest integer.
+
+    Returns the nearest integer to `x`.  If `x` ends in 0.5 exactly,
+    the nearest even integer is chosen.
+
+    Parameters
+    ----------
+    x : array_like
+        Real valued input.
+    out : ndarray, optional
+        Optional output array for the function results.
+
+    Returns
+    -------
+    scalar or ndarray
+        The nearest integers to the elements of `x`. The result is of
+        floating type, not integer type.
+
+    Examples
+    --------
+    >>> import scipy.special as sc
+
+    It rounds to even.
+
+    >>> sc.round([0.5, 1.5])
+    array([0., 2.])
+
+    """)
+
+add_newdoc("shichi",
+    r"""
+    shichi(x, out=None)
+
+    Hyperbolic sine and cosine integrals.
+
+    The hyperbolic sine integral is
+
+    .. math::
+
+      \int_0^x \frac{\sinh{t}}{t}dt
+
+    and the hyperbolic cosine integral is
+
+    .. math::
+
+      \gamma + \log(x) + \int_0^x \frac{\cosh{t} - 1}{t} dt
+
+    where :math:`\gamma` is Euler's constant and :math:`\log` is the
+    principal branch of the logarithm [1]_ (see also [2]_).
+
+    Parameters
+    ----------
+    x : array_like
+        Real or complex points at which to compute the hyperbolic sine
+        and cosine integrals.
+    out : tuple of ndarray, optional
+        Optional output arrays for the function results
+
+    Returns
+    -------
+    si : scalar or ndarray
+        Hyperbolic sine integral at ``x``
+    ci : scalar or ndarray
+        Hyperbolic cosine integral at ``x``
+
+    See Also
+    --------
+    sici : Sine and cosine integrals.
+    exp1 : Exponential integral E1.
+    expi : Exponential integral Ei.
+
+    Notes
+    -----
+    For real arguments with ``x < 0``, ``chi`` is the real part of the
+    hyperbolic cosine integral. For such points ``chi(x)`` and
+    ``chi(x + 0j)`` differ by the additive term ``1j*pi``.
+
+    For real arguments the function is computed by calling Cephes'
+    [3]_ *shichi* routine. For complex arguments the algorithm is based
+    on Mpmath's [4]_ *shi* and *chi* routines.
+
+    References
+    ----------
+    .. [1] Milton Abramowitz and Irene A. Stegun, eds.
+           Handbook of Mathematical Functions with Formulas,
+           Graphs, and Mathematical Tables. New York: Dover, 1972.
+           (See Section 5.2.)
+    .. [2] NIST Digital Library of Mathematical Functions
+           https://dlmf.nist.gov/6.2.E15 and https://dlmf.nist.gov/6.2.E16
+    .. [3] Cephes Mathematical Functions Library,
+           http://www.netlib.org/cephes/
+    .. [4] Fredrik Johansson and others.
+           "mpmath: a Python library for arbitrary-precision floating-point
+           arithmetic" (Version 0.19) http://mpmath.org/
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.special import shichi, sici
+
+    `shichi` accepts real or complex input:
+
+    >>> shichi(0.5)
+    (0.5069967498196671, -0.05277684495649357)
+    >>> shichi(0.5 + 2.5j)
+    ((0.11772029666668238+1.831091777729851j),
+     (0.29912435887648825+1.7395351121166562j))
+
+    The hyperbolic sine and cosine integrals Shi(z) and Chi(z) are
+    related to the sine and cosine integrals Si(z) and Ci(z) by
+
+    * Shi(z) = -i*Si(i*z)
+    * Chi(z) = Ci(-i*z) + i*pi/2
+
+    >>> z = 0.25 + 5j
+    >>> shi, chi = shichi(z)
+    >>> shi, -1j*sici(1j*z)[0]            # Should be the same.
+    ((-0.04834719325101729+1.5469354086921228j),
+     (-0.04834719325101729+1.5469354086921228j))
+    >>> chi, sici(-1j*z)[1] + 1j*np.pi/2  # Should be the same.
+    ((-0.19568708973868087+1.556276312103824j),
+     (-0.19568708973868087+1.556276312103824j))
+
+    Plot the functions evaluated on the real axis:
+
+    >>> xp = np.geomspace(1e-8, 4.0, 250)
+    >>> x = np.concatenate((-xp[::-1], xp))
+    >>> shi, chi = shichi(x)
+
+    >>> fig, ax = plt.subplots()
+    >>> ax.plot(x, shi, label='Shi(x)')
+    >>> ax.plot(x, chi, '--', label='Chi(x)')
+    >>> ax.set_xlabel('x')
+    >>> ax.set_title('Hyperbolic Sine and Cosine Integrals')
+    >>> ax.legend(shadow=True, framealpha=1, loc='lower right')
+    >>> ax.grid(True)
+    >>> plt.show()
+
+    """)
+
+add_newdoc("sici",
+    r"""
+    sici(x, out=None)
+
+    Sine and cosine integrals.
+
+    The sine integral is
+
+    .. math::
+
+      \int_0^x \frac{\sin{t}}{t}dt
+
+    and the cosine integral is
+
+    .. math::
+
+      \gamma + \log(x) + \int_0^x \frac{\cos{t} - 1}{t}dt
+
+    where :math:`\gamma` is Euler's constant and :math:`\log` is the
+    principal branch of the logarithm [1]_ (see also [2]_).
+
+    Parameters
+    ----------
+    x : array_like
+        Real or complex points at which to compute the sine and cosine
+        integrals.
+    out : tuple of ndarray, optional
+        Optional output arrays for the function results
+
+    Returns
+    -------
+    si : scalar or ndarray
+        Sine integral at ``x``
+    ci : scalar or ndarray
+        Cosine integral at ``x``
+
+    See Also
+    --------
+    shichi : Hyperbolic sine and cosine integrals.
+    exp1 : Exponential integral E1.
+    expi : Exponential integral Ei.
+
+    Notes
+    -----
+    For real arguments with ``x < 0``, ``ci`` is the real part of the
+    cosine integral. For such points ``ci(x)`` and ``ci(x + 0j)``
+    differ by the additive term ``1j*pi``.
+
+    For real arguments the function is computed by calling Cephes'
+    [3]_ *sici* routine. For complex arguments the algorithm is based
+    on Mpmath's [4]_ *si* and *ci* routines.
+
+    References
+    ----------
+    .. [1] Milton Abramowitz and Irene A. Stegun, eds.
+           Handbook of Mathematical Functions with Formulas,
+           Graphs, and Mathematical Tables. New York: Dover, 1972.
+           (See Section 5.2.)
+    .. [2] NIST Digital Library of Mathematical Functions
+           https://dlmf.nist.gov/6.2.E9, https://dlmf.nist.gov/6.2.E12,
+           and https://dlmf.nist.gov/6.2.E13
+    .. [3] Cephes Mathematical Functions Library,
+           http://www.netlib.org/cephes/
+    .. [4] Fredrik Johansson and others.
+           "mpmath: a Python library for arbitrary-precision floating-point
+           arithmetic" (Version 0.19) http://mpmath.org/
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.special import sici, exp1
+
+    `sici` accepts real or complex input:
+
+    >>> sici(2.5)
+    (1.7785201734438267, 0.2858711963653835)
+    >>> sici(2.5 + 3j)
+    ((4.505735874563953+0.06863305018999577j),
+    (0.0793644206906966-2.935510262937543j))
+
+    For z in the right half plane, the sine and cosine integrals are
+    related to the exponential integral E1 (implemented in SciPy as
+    `scipy.special.exp1`) by
+
+    * Si(z) = (E1(i*z) - E1(-i*z))/2i + pi/2
+    * Ci(z) = -(E1(i*z) + E1(-i*z))/2
+
+    See [1]_ (equations 5.2.21 and 5.2.23).
+
+    We can verify these relations:
+
+    >>> z = 2 - 3j
+    >>> sici(z)
+    ((4.54751388956229-1.3991965806460565j),
+    (1.408292501520851+2.9836177420296055j))
+
+    >>> (exp1(1j*z) - exp1(-1j*z))/2j + np.pi/2  # Same as sine integral
+    (4.54751388956229-1.3991965806460565j)
+
+    >>> -(exp1(1j*z) + exp1(-1j*z))/2            # Same as cosine integral
+    (1.408292501520851+2.9836177420296055j)
+
+    Plot the functions evaluated on the real axis; the dotted horizontal
+    lines are at pi/2 and -pi/2:
+
+    >>> x = np.linspace(-16, 16, 150)
+    >>> si, ci = sici(x)
+
+    >>> fig, ax = plt.subplots()
+    >>> ax.plot(x, si, label='Si(x)')
+    >>> ax.plot(x, ci, '--', label='Ci(x)')
+    >>> ax.legend(shadow=True, framealpha=1, loc='upper left')
+    >>> ax.set_xlabel('x')
+    >>> ax.set_title('Sine and Cosine Integrals')
+    >>> ax.axhline(np.pi/2, linestyle=':', alpha=0.5, color='k')
+    >>> ax.axhline(-np.pi/2, linestyle=':', alpha=0.5, color='k')
+    >>> ax.grid(True)
+    >>> plt.show()
+
+    """)
+
+add_newdoc("smirnov",
+    r"""
+    smirnov(n, d, out=None)
+
+    Kolmogorov-Smirnov complementary cumulative distribution function
+
+    Returns the exact Kolmogorov-Smirnov complementary cumulative
+    distribution function (also known as the survival function) of Dn+ (or
+    Dn-) for a one-sided test of equality between an empirical and a
+    theoretical distribution. It is equal to the probability that the
+    maximum difference between a theoretical distribution and an empirical
+    one based on `n` samples is greater than `d`.
+
+    Parameters
+    ----------
+    n : int
+      Number of samples
+    d : float array_like
+      Deviation between the Empirical CDF (ECDF) and the target CDF.
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    scalar or ndarray
+        The value(s) of smirnov(n, d), Prob(Dn+ >= d) (Also Prob(Dn- >= d))
+
+    See Also
+    --------
+    smirnovi : The Inverse Survival Function for the distribution
+    scipy.stats.ksone : Provides the functionality as a continuous distribution
+    kolmogorov, kolmogi : Functions for the two-sided distribution
+
+    Notes
+    -----
+    `smirnov` is used by `stats.kstest` in the application of the
+    Kolmogorov-Smirnov Goodness of Fit test. For historical reasons this
+    function is exposed in `scipy.special`, but the recommended way to achieve
+    the most accurate CDF/SF/PDF/PPF/ISF computations is to use the
+    `stats.ksone` distribution.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.special import smirnov
+    >>> from scipy.stats import norm
+
+    Show the probability of a gap at least as big as 0, 0.5 and 1.0 for a
+    sample of size 5.
+
+    >>> smirnov(5, [0, 0.5, 1.0])
+    array([ 1.   ,  0.056,  0.   ])
+
+    Compare a sample of size 5 against N(0, 1), the standard normal
+    distribution with mean 0 and standard deviation 1.
+
+    `x` is the sample.
+
+    >>> x = np.array([-1.392, -0.135, 0.114, 0.190, 1.82])
+
+    >>> target = norm(0, 1)
+    >>> cdfs = target.cdf(x)
+    >>> cdfs
+    array([0.0819612 , 0.44630594, 0.5453811 , 0.57534543, 0.9656205 ])
+
+    Construct the empirical CDF and the K-S statistics (Dn+, Dn-, Dn).
+
+    >>> n = len(x)
+    >>> ecdfs = np.arange(n+1, dtype=float)/n
+    >>> cols = np.column_stack([x, ecdfs[1:], cdfs, cdfs - ecdfs[:n],
+    ...                        ecdfs[1:] - cdfs])
+    >>> with np.printoptions(precision=3):
+    ...    print(cols)
+    [[-1.392  0.2    0.082  0.082  0.118]
+     [-0.135  0.4    0.446  0.246 -0.046]
+     [ 0.114  0.6    0.545  0.145  0.055]
+     [ 0.19   0.8    0.575 -0.025  0.225]
+     [ 1.82   1.     0.966  0.166  0.034]]
+    >>> gaps = cols[:, -2:]
+    >>> Dnpm = np.max(gaps, axis=0)
+    >>> print(f'Dn-={Dnpm[0]:f}, Dn+={Dnpm[1]:f}')
+    Dn-=0.246306, Dn+=0.224655
+    >>> probs = smirnov(n, Dnpm)
+    >>> print(f'For a sample of size {n} drawn from N(0, 1):',
+    ...       f' Smirnov n={n}: Prob(Dn- >= {Dnpm[0]:f}) = {probs[0]:.4f}',
+    ...       f' Smirnov n={n}: Prob(Dn+ >= {Dnpm[1]:f}) = {probs[1]:.4f}',
+    ...       sep='\n')
+    For a sample of size 5 drawn from N(0, 1):
+     Smirnov n=5: Prob(Dn- >= 0.246306) = 0.4711
+     Smirnov n=5: Prob(Dn+ >= 0.224655) = 0.5245
+
+    Plot the empirical CDF and the standard normal CDF.
+
+    >>> import matplotlib.pyplot as plt
+    >>> plt.step(np.concatenate(([-2.5], x, [2.5])),
+    ...          np.concatenate((ecdfs, [1])),
+    ...          where='post', label='Empirical CDF')
+    >>> xx = np.linspace(-2.5, 2.5, 100)
+    >>> plt.plot(xx, target.cdf(xx), '--', label='CDF for N(0, 1)')
+
+    Add vertical lines marking Dn+ and Dn-.
+
+    >>> iminus, iplus = np.argmax(gaps, axis=0)
+    >>> plt.vlines([x[iminus]], ecdfs[iminus], cdfs[iminus], color='r',
+    ...            alpha=0.5, lw=4)
+    >>> plt.vlines([x[iplus]], cdfs[iplus], ecdfs[iplus+1], color='m',
+    ...            alpha=0.5, lw=4)
+
+    >>> plt.grid(True)
+    >>> plt.legend(framealpha=1, shadow=True)
+    >>> plt.show()
+    """)
+
+add_newdoc("smirnovi",
+    """
+    smirnovi(n, p, out=None)
+
+    Inverse to `smirnov`
+
+    Returns `d` such that ``smirnov(n, d) == p``, the critical value
+    corresponding to `p`.
+
+    Parameters
+    ----------
+    n : int
+      Number of samples
+    p : float array_like
+        Probability
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    scalar or ndarray
+        The value(s) of smirnovi(n, p), the critical values.
+
+    See Also
+    --------
+    smirnov : The Survival Function (SF) for the distribution
+    scipy.stats.ksone : Provides the functionality as a continuous distribution
+    kolmogorov, kolmogi : Functions for the two-sided distribution
+    scipy.stats.kstwobign : Two-sided Kolmogorov-Smirnov distribution, large n
+
+    Notes
+    -----
+    `smirnov` is used by `stats.kstest` in the application of the
+    Kolmogorov-Smirnov Goodness of Fit test. For historical reasons this
+    function is exposed in `scipy.special`, but the recommended way to achieve
+    the most accurate CDF/SF/PDF/PPF/ISF computations is to use the
+    `stats.ksone` distribution.
+
+    Examples
+    --------
+    >>> from scipy.special import smirnovi, smirnov
+
+    >>> n = 24
+    >>> deviations = [0.1, 0.2, 0.3]
+
+    Use `smirnov` to compute the complementary CDF of the Smirnov
+    distribution for the given number of samples and deviations.
+
+    >>> p = smirnov(n, deviations)
+    >>> p
+    array([0.58105083, 0.12826832, 0.01032231])
+
+    The inverse function ``smirnovi(n, p)`` returns ``deviations``.
+
+    >>> smirnovi(n, p)
+    array([0.1, 0.2, 0.3])
+
+    """)
+
+add_newdoc("_smirnovc",
+    """
+    _smirnovc(n, d)
+     Internal function, do not use.
+    """)
+
+add_newdoc("_smirnovci",
+    """
+     Internal function, do not use.
+    """)
+
+add_newdoc("_smirnovp",
+    """
+    _smirnovp(n, p)
+     Internal function, do not use.
+    """)
+
+add_newdoc("spence",
+    r"""
+    spence(z, out=None)
+
+    Spence's function, also known as the dilogarithm.
+
+    It is defined to be
+
+    .. math::
+      \int_1^z \frac{\log(t)}{1 - t}dt
+
+    for complex :math:`z`, where the contour of integration is taken
+    to avoid the branch cut of the logarithm. Spence's function is
+    analytic everywhere except the negative real axis where it has a
+    branch cut.
+
+    Parameters
+    ----------
+    z : array_like
+        Points at which to evaluate Spence's function
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    s : scalar or ndarray
+        Computed values of Spence's function
+
+    Notes
+    -----
+    There is a different convention which defines Spence's function by
+    the integral
+
+    .. math::
+      -\int_0^z \frac{\log(1 - t)}{t}dt;
+
+    this is our ``spence(1 - z)``.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.special import spence
+    >>> import matplotlib.pyplot as plt
+
+    The function is defined for complex inputs:
+
+    >>> spence([1-1j, 1.5+2j, 3j, -10-5j])
+    array([-0.20561676+0.91596559j, -0.86766909-1.39560134j,
+           -0.59422064-2.49129918j, -1.14044398+6.80075924j])
+
+    For complex inputs on the branch cut, which is the negative real axis,
+    the function returns the limit for ``z`` with positive imaginary part.
+    For example, in the following, note the sign change of the imaginary
+    part of the output for ``z = -2`` and ``z = -2 - 1e-8j``:
+
+    >>> spence([-2 + 1e-8j, -2, -2 - 1e-8j])
+    array([2.32018041-3.45139229j, 2.32018042-3.4513923j ,
+           2.32018041+3.45139229j])
+
+    The function returns ``nan`` for real inputs on the branch cut:
+
+    >>> spence(-1.5)
+    nan
+
+    Verify some particular values: ``spence(0) = pi**2/6``,
+    ``spence(1) = 0`` and ``spence(2) = -pi**2/12``.
+
+    >>> spence([0, 1, 2])
+    array([ 1.64493407,  0.        , -0.82246703])
+    >>> np.pi**2/6, -np.pi**2/12
+    (1.6449340668482264, -0.8224670334241132)
+
+    Verify the identity::
+
+        spence(z) + spence(1 - z) = pi**2/6 - log(z)*log(1 - z)
+
+    >>> z = 3 + 4j
+    >>> spence(z) + spence(1 - z)
+    (-2.6523186143876067+1.8853470951513935j)
+    >>> np.pi**2/6 - np.log(z)*np.log(1 - z)
+    (-2.652318614387606+1.885347095151394j)
+
+    Plot the function for positive real input.
+
+    >>> fig, ax = plt.subplots()
+    >>> x = np.linspace(0, 6, 400)
+    >>> ax.plot(x, spence(x))
+    >>> ax.grid()
+    >>> ax.set_xlabel('x')
+    >>> ax.set_title('spence(x)')
+    >>> plt.show()
+    """)
+
+add_newdoc(
+    "stdtr",
+    r"""
+    stdtr(df, t, out=None)
+
+    Student t distribution cumulative distribution function
+
+    Returns the integral:
+
+    .. math::
+        \frac{\Gamma((df+1)/2)}{\sqrt{\pi df} \Gamma(df/2)}
+        \int_{-\infty}^t (1+x^2/df)^{-(df+1)/2}\, dx
+
+    Parameters
+    ----------
+    df : array_like
+        Degrees of freedom
+    t : array_like
+        Upper bound of the integral
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    scalar or ndarray
+        Value of the Student t CDF at t
+
+    See Also
+    --------
+    stdtridf : inverse of stdtr with respect to `df`
+    stdtrit : inverse of stdtr with respect to `t`
+    scipy.stats.t : student t distribution
+
+    Notes
+    -----
+    The student t distribution is also available as `scipy.stats.t`.
+    Calling `stdtr` directly can improve performance compared to the
+    ``cdf`` method of `scipy.stats.t` (see last example below).
+
+    The function is computed using the Boost Math library [1]_, which
+    relies on the incomplete beta function.
+
+    References
+    ----------
+    .. [1] Boost C++ Libraries, http://www.boost.org/
+
+    Examples
+    --------
+    Calculate the function for ``df=3`` at ``t=1``.
+
+    >>> import numpy as np
+    >>> from scipy.special import stdtr
+    >>> import matplotlib.pyplot as plt
+    >>> stdtr(3, 1)
+    0.8044988905221148
+
+    Plot the function for three different degrees of freedom.
+
+    >>> x = np.linspace(-10, 10, 1000)
+    >>> fig, ax = plt.subplots()
+    >>> parameters = [(1, "solid"), (3, "dashed"), (10, "dotted")]
+    >>> for (df, linestyle) in parameters:
+    ...     ax.plot(x, stdtr(df, x), ls=linestyle, label=f"$df={df}$")
+    >>> ax.legend()
+    >>> ax.set_title("Student t distribution cumulative distribution function")
+    >>> plt.show()
+
+    The function can be computed for several degrees of freedom at the same
+    time by providing a NumPy array or list for `df`:
+
+    >>> stdtr([1, 2, 3], 1)
+    array([0.75      , 0.78867513, 0.80449889])
+
+    It is possible to calculate the function at several points for several
+    different degrees of freedom simultaneously by providing arrays for `df`
+    and `t` with shapes compatible for broadcasting. Compute `stdtr` at
+    4 points for 3 different degrees of freedom, resulting in an array of
+    shape 3x4.
+
+    >>> dfs = np.array([[1], [2], [3]])
+    >>> t = np.array([2, 4, 6, 8])
+    >>> dfs.shape, t.shape
+    ((3, 1), (4,))
+
+    >>> stdtr(dfs, t)
+    array([[0.85241638, 0.92202087, 0.94743154, 0.96041658],
+           [0.90824829, 0.97140452, 0.98666426, 0.99236596],
+           [0.93033702, 0.98599577, 0.99536364, 0.99796171]])
+
+    The t distribution is also available as `scipy.stats.t`. Calling `stdtr`
+    directly can be much faster than calling the ``cdf`` method of
+    `scipy.stats.t`. To get the same results, one must use the following
+    parametrization: ``scipy.stats.t(df).cdf(x) = stdtr(df, x)``.
+
+    >>> from scipy.stats import t
+    >>> df, x = 3, 1
+    >>> stdtr_result = stdtr(df, x)  # this can be faster than below
+    >>> stats_result = t(df).cdf(x)
+    >>> stats_result == stdtr_result  # test that results are equal
+    True
+    """)
+
+add_newdoc("stdtridf",
+    """
+    stdtridf(p, t, out=None)
+
+    Inverse of `stdtr` vs df
+
+    Returns the argument df such that stdtr(df, t) is equal to `p`.
+
+    Parameters
+    ----------
+    p : array_like
+        Probability
+    t : array_like
+        Upper bound of the integral
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    df : scalar or ndarray
+        Value of `df` such that ``stdtr(df, t) == p``
+
+    See Also
+    --------
+    stdtr : Student t CDF
+    stdtrit : inverse of stdtr with respect to `t`
+    scipy.stats.t : Student t distribution
+
+    Examples
+    --------
+    Compute the student t cumulative distribution function for one
+    parameter set.
+
+    >>> from scipy.special import stdtr, stdtridf
+    >>> df, x = 5, 2
+    >>> cdf_value = stdtr(df, x)
+    >>> cdf_value
+    0.9490302605850709
+
+    Verify that `stdtridf` recovers the original value for `df` given
+    the CDF value and `x`.
+
+    >>> stdtridf(cdf_value, x)
+    5.0
+    """)
+
+add_newdoc("stdtrit",
+    """
+    stdtrit(df, p, out=None)
+
+    The `p`-th quantile of the student t distribution.
+
+    This function is the inverse of the student t distribution cumulative
+    distribution function (CDF), returning `t` such that ``stdtr(df, t) = p``.
+
+    Returns the argument `t` such that stdtr(df, t) is equal to `p`.
+
+    Parameters
+    ----------
+    df : array_like
+        Degrees of freedom
+    p : array_like
+        Probability
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    t : scalar or ndarray
+        Value of `t` such that ``stdtr(df, t) == p``
+
+    See Also
+    --------
+    stdtr : Student t CDF
+    stdtridf : inverse of stdtr with respect to `df`
+    scipy.stats.t : Student t distribution
+
+    Notes
+    -----
+    The student t distribution is also available as `scipy.stats.t`. Calling
+    `stdtrit` directly can improve performance compared to the ``ppf``
+    method of `scipy.stats.t` (see last example below).
+
+    The function is computed using the Boost Math library [1]_, which
+    relies on the incomplete beta function.
+
+    References
+    ----------
+    .. [1] Boost C++ Libraries, http://www.boost.org/
+
+    Examples
+    --------
+    `stdtrit` represents the inverse of the student t distribution CDF which
+    is available as `stdtr`. Here, we calculate the CDF for ``df=3`` at
+    ``x=1``. `stdtrit` then returns ``1`` up to floating point errors
+    given the same value for `df` and the computed CDF value.
+
+    >>> import numpy as np
+    >>> from scipy.special import stdtr, stdtrit
+    >>> import matplotlib.pyplot as plt
+    >>> df = 3
+    >>> x = 1
+    >>> cdf_value = stdtr(df, x)
+    >>> stdtrit(df, cdf_value)
+    0.9999999994418539
+
+    Plot the function for three different degrees of freedom.
+
+    >>> x = np.linspace(0, 1, 1000)
+    >>> parameters = [(1, "solid"), (2, "dashed"), (5, "dotted")]
+    >>> fig, ax = plt.subplots()
+    >>> for (df, linestyle) in parameters:
+    ...     ax.plot(x, stdtrit(df, x), ls=linestyle, label=f"$df={df}$")
+    >>> ax.legend()
+    >>> ax.set_ylim(-10, 10)
+    >>> ax.set_title("Student t distribution quantile function")
+    >>> plt.show()
+
+    The function can be computed for several degrees of freedom at the same
+    time by providing a NumPy array or list for `df`:
+
+    >>> stdtrit([1, 2, 3], 0.7)
+    array([0.72654253, 0.6172134 , 0.58438973])
+
+    It is possible to calculate the function at several points for several
+    different degrees of freedom simultaneously by providing arrays for `df`
+    and `p` with shapes compatible for broadcasting. Compute `stdtrit` at
+    4 points for 3 different degrees of freedom, resulting in an array of
+    shape 3x4.
+
+    >>> dfs = np.array([[1], [2], [3]])
+    >>> p = np.array([0.2, 0.4, 0.7, 0.8])
+    >>> dfs.shape, p.shape
+    ((3, 1), (4,))
+
+    >>> stdtrit(dfs, p)
+    array([[-1.37638192, -0.3249197 ,  0.72654253,  1.37638192],
+           [-1.06066017, -0.28867513,  0.6172134 ,  1.06066017],
+           [-0.97847231, -0.27667066,  0.58438973,  0.97847231]])
+
+    The t distribution is also available as `scipy.stats.t`. Calling `stdtrit`
+    directly can be much faster than calling the ``ppf`` method of
+    `scipy.stats.t`. To get the same results, one must use the following
+    parametrization: ``scipy.stats.t(df).ppf(x) = stdtrit(df, x)``.
+
+    >>> from scipy.stats import t
+    >>> df, x = 3, 0.5
+    >>> stdtrit_result = stdtrit(df, x)  # this can be faster than below
+    >>> stats_result = t(df).ppf(x)
+    >>> stats_result == stdtrit_result  # test that results are equal
+    True
+    """)
+
+add_newdoc(
+    "tklmbda",
+    r"""
+    tklmbda(x, lmbda, out=None)
+
+    Cumulative distribution function of the Tukey lambda distribution.
+
+    Parameters
+    ----------
+    x, lmbda : array_like
+        Parameters
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    cdf : scalar or ndarray
+        Value of the Tukey lambda CDF
+
+    See Also
+    --------
+    scipy.stats.tukeylambda : Tukey lambda distribution
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.special import tklmbda, expit
+
+    Compute the cumulative distribution function (CDF) of the Tukey lambda
+    distribution at several ``x`` values for `lmbda` = -1.5.
+
+    >>> x = np.linspace(-2, 2, 9)
+    >>> x
+    array([-2. , -1.5, -1. , -0.5,  0. ,  0.5,  1. ,  1.5,  2. ])
+    >>> tklmbda(x, -1.5)
+    array([0.34688734, 0.3786554 , 0.41528805, 0.45629737, 0.5       ,
+           0.54370263, 0.58471195, 0.6213446 , 0.65311266])
+
+    When `lmbda` is 0, the function is the logistic sigmoid function,
+    which is implemented in `scipy.special` as `expit`.
+
+    >>> tklmbda(x, 0)
+    array([0.11920292, 0.18242552, 0.26894142, 0.37754067, 0.5       ,
+           0.62245933, 0.73105858, 0.81757448, 0.88079708])
+    >>> expit(x)
+    array([0.11920292, 0.18242552, 0.26894142, 0.37754067, 0.5       ,
+           0.62245933, 0.73105858, 0.81757448, 0.88079708])
+
+    When `lmbda` is 1, the Tukey lambda distribution is uniform on the
+    interval [-1, 1], so the CDF increases linearly.
+
+    >>> t = np.linspace(-1, 1, 9)
+    >>> tklmbda(t, 1)
+    array([0.   , 0.125, 0.25 , 0.375, 0.5  , 0.625, 0.75 , 0.875, 1.   ])
+
+    In the following, we generate plots for several values of `lmbda`.
+
+    The first figure shows graphs for `lmbda` <= 0.
+
+    >>> styles = ['-', '-.', '--', ':']
+    >>> fig, ax = plt.subplots()
+    >>> x = np.linspace(-12, 12, 500)
+    >>> for k, lmbda in enumerate([-1.0, -0.5, 0.0]):
+    ...     y = tklmbda(x, lmbda)
+    ...     ax.plot(x, y, styles[k], label=rf'$\lambda$ = {lmbda:-4.1f}')
+
+    >>> ax.set_title(r'tklmbda(x, $\lambda$)')
+    >>> ax.set_xlabel('x')
+    >>> ax.legend(framealpha=1, shadow=True)
+    >>> ax.grid(True)
+
+    The second figure shows graphs for `lmbda` > 0.  The dots in the
+    graphs show the bounds of the support of the distribution.
+
+    >>> fig, ax = plt.subplots()
+    >>> x = np.linspace(-4.2, 4.2, 500)
+    >>> lmbdas = [0.25, 0.5, 1.0, 1.5]
+    >>> for k, lmbda in enumerate(lmbdas):
+    ...     y = tklmbda(x, lmbda)
+    ...     ax.plot(x, y, styles[k], label=fr'$\lambda$ = {lmbda}')
+
+    >>> ax.set_prop_cycle(None)
+    >>> for lmbda in lmbdas:
+    ...     ax.plot([-1/lmbda, 1/lmbda], [0, 1], '.', ms=8)
+
+    >>> ax.set_title(r'tklmbda(x, $\lambda$)')
+    >>> ax.set_xlabel('x')
+    >>> ax.legend(framealpha=1, shadow=True)
+    >>> ax.grid(True)
+
+    >>> plt.tight_layout()
+    >>> plt.show()
+
+    The CDF of the Tukey lambda distribution is also implemented as the
+    ``cdf`` method of `scipy.stats.tukeylambda`.  In the following,
+    ``tukeylambda.cdf(x, -0.5)`` and ``tklmbda(x, -0.5)`` compute the
+    same values:
+
+    >>> from scipy.stats import tukeylambda
+    >>> x = np.linspace(-2, 2, 9)
+
+    >>> tukeylambda.cdf(x, -0.5)
+    array([0.21995157, 0.27093858, 0.33541677, 0.41328161, 0.5       ,
+           0.58671839, 0.66458323, 0.72906142, 0.78004843])
+
+    >>> tklmbda(x, -0.5)
+    array([0.21995157, 0.27093858, 0.33541677, 0.41328161, 0.5       ,
+           0.58671839, 0.66458323, 0.72906142, 0.78004843])
+
+    The implementation in ``tukeylambda`` also provides location and scale
+    parameters, and other methods such as ``pdf()`` (the probability
+    density function) and ``ppf()`` (the inverse of the CDF), so for
+    working with the Tukey lambda distribution, ``tukeylambda`` is more
+    generally useful.  The primary advantage of ``tklmbda`` is that it is
+    significantly faster than ``tukeylambda.cdf``.
+    """)
+
+add_newdoc("yn",
+    r"""
+    yn(n, x, out=None)
+
+    Bessel function of the second kind of integer order and real argument.
+
+    Parameters
+    ----------
+    n : array_like
+        Order (integer).
+    x : array_like
+        Argument (float).
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    Y : scalar or ndarray
+        Value of the Bessel function, :math:`Y_n(x)`.
+
+    See Also
+    --------
+    yv : For real order and real or complex argument.
+    y0 : faster implementation of this function for order 0
+    y1 : faster implementation of this function for order 1
+
+    Notes
+    -----
+    Wrapper for the Cephes [1]_ routine `yn`.
+
+    The function is evaluated by forward recurrence on `n`, starting with
+    values computed by the Cephes routines `y0` and `y1`. If ``n = 0`` or 1,
+    the routine for `y0` or `y1` is called directly.
+
+    References
+    ----------
+    .. [1] Cephes Mathematical Functions Library,
+           http://www.netlib.org/cephes/
+
+    Examples
+    --------
+    Evaluate the function of order 0 at one point.
+
+    >>> from scipy.special import yn
+    >>> yn(0, 1.)
+    0.08825696421567697
+
+    Evaluate the function at one point for different orders.
+
+    >>> yn(0, 1.), yn(1, 1.), yn(2, 1.)
+    (0.08825696421567697, -0.7812128213002888, -1.6506826068162546)
+
+    The evaluation for different orders can be carried out in one call by
+    providing a list or NumPy array as argument for the `n` parameter:
+
+    >>> yn([0, 1, 2], 1.)
+    array([ 0.08825696, -0.78121282, -1.65068261])
+
+    Evaluate the function at several points for order 0 by providing an
+    array for `x`.
+
+    >>> import numpy as np
+    >>> points = np.array([0.5, 3., 8.])
+    >>> yn(0, points)
+    array([-0.44451873,  0.37685001,  0.22352149])
+
+    If `x` is an array, the order parameter `n` must be broadcastable to
+    the correct shape if different orders shall be computed in one call.
+    To calculate the orders 0 and 1 for a 1D array:
+
+    >>> orders = np.array([[0], [1]])
+    >>> orders.shape
+    (2, 1)
+
+    >>> yn(orders, points)
+    array([[-0.44451873,  0.37685001,  0.22352149],
+           [-1.47147239,  0.32467442, -0.15806046]])
+
+    Plot the functions of order 0 to 3 from 0 to 10.
+
+    >>> import matplotlib.pyplot as plt
+    >>> fig, ax = plt.subplots()
+    >>> x = np.linspace(0., 10., 1000)
+    >>> for i in range(4):
+    ...     ax.plot(x, yn(i, x), label=f'$Y_{i!r}$')
+    >>> ax.set_ylim(-3, 1)
+    >>> ax.legend()
+    >>> plt.show()
+    """)
+
+
+add_newdoc("_struve_asymp_large_z",
+    """
+    _struve_asymp_large_z(v, z, is_h)
+
+    Internal function for testing `struve` & `modstruve`
+
+    Evaluates using asymptotic expansion
+
+    Returns
+    -------
+    v, err
+    """)
+
+add_newdoc("_struve_power_series",
+    """
+    _struve_power_series(v, z, is_h)
+
+    Internal function for testing `struve` & `modstruve`
+
+    Evaluates using power series
+
+    Returns
+    -------
+    v, err
+    """)
+
+add_newdoc("_struve_bessel_series",
+    """
+    _struve_bessel_series(v, z, is_h)
+
+    Internal function for testing `struve` & `modstruve`
+
+    Evaluates using Bessel function series
+
+    Returns
+    -------
+    v, err
+    """)
+
+add_newdoc("_spherical_jn",
+    """
+    Internal function, use `spherical_jn` instead.
+    """)
+
+add_newdoc("_spherical_jn_d",
+    """
+    Internal function, use `spherical_jn` instead.
+    """)
+
+add_newdoc("_spherical_yn",
+    """
+    Internal function, use `spherical_yn` instead.
+    """)
+
+add_newdoc("_spherical_yn_d",
+    """
+    Internal function, use `spherical_yn` instead.
+    """)
+
+add_newdoc("_spherical_in",
+    """
+    Internal function, use `spherical_in` instead.
+    """)
+
+add_newdoc("_spherical_in_d",
+    """
+    Internal function, use `spherical_in` instead.
+    """)
+
+add_newdoc("_spherical_kn",
+    """
+    Internal function, use `spherical_kn` instead.
+    """)
+
+add_newdoc("_spherical_kn_d",
+    """
+    Internal function, use `spherical_kn` instead.
+    """)
+
+add_newdoc("owens_t",
+    """
+    owens_t(h, a, out=None)
+
+    Owen's T Function.
+
+    The function T(h, a) gives the probability of the event
+    (X > h and 0 < Y < a * X) where X and Y are independent
+    standard normal random variables.
+
+    Parameters
+    ----------
+    h : array_like
+        Input value.
+    a : array_like
+        Input value.
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    t : scalar or ndarray
+        Probability of the event (X > h and 0 < Y < a * X),
+        where X and Y are independent standard normal random variables.
+
+    References
+    ----------
+    .. [1] M. Patefield and D. Tandy, "Fast and accurate calculation of Owen's
+           T Function", Journal of Statistical Software vol. 5, pp. 1-25, 2000.
+
+    Examples
+    --------
+    >>> from scipy import special
+    >>> a = 3.5
+    >>> h = 0.78
+    >>> special.owens_t(h, a)
+    0.10877216734852274
+    """)
+
+add_newdoc("_factorial",
+    """
+    Internal function, do not use.
+    """)
+
+add_newdoc("ndtri_exp",
+    r"""
+    ndtri_exp(y, out=None)
+
+    Inverse of `log_ndtr` vs x. Allows for greater precision than
+    `ndtri` composed with `numpy.exp` for very small values of y and for
+    y close to 0.
+
+    Parameters
+    ----------
+    y : array_like of float
+        Function argument
+    out : ndarray, optional
+        Optional output array for the function results
+
+    Returns
+    -------
+    scalar or ndarray
+        Inverse of the log CDF of the standard normal distribution, evaluated
+        at y.
+
+    See Also
+    --------
+    log_ndtr : log of the standard normal cumulative distribution function
+    ndtr : standard normal cumulative distribution function
+    ndtri : standard normal percentile function
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import scipy.special as sc
+
+    `ndtri_exp` agrees with the naive implementation when the latter does
+    not suffer from underflow.
+
+    >>> sc.ndtri_exp(-1)
+    -0.33747496376420244
+    >>> sc.ndtri(np.exp(-1))
+    -0.33747496376420244
+
+    For extreme values of y, the naive approach fails
+
+    >>> sc.ndtri(np.exp(-800))
+    -inf
+    >>> sc.ndtri(np.exp(-1e-20))
+    inf
+
+    whereas `ndtri_exp` is still able to compute the result to high precision.
+
+    >>> sc.ndtri_exp(-800)
+    -39.88469483825668
+    >>> sc.ndtri_exp(-1e-20)
+    9.262340089798409
+    """)
+
+
+add_newdoc("_stirling2_inexact",
+    r"""
+    Internal function, do not use.
+    """)
+
+add_newdoc(
+    "_beta_pdf",
+    r"""
+    _beta_pdf(x, a, b)
+
+    Probability density function of beta distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued such that :math:`0 \leq x \leq 1`,
+        the point at which the density is evaluated
+    a, b : array_like
+           Positive, real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_beta_ppf",
+    r"""
+    _beta_ppf(x, a, b)
+
+    Percent point function of beta distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued such that :math:`0 \leq x \leq 1`,
+        the probability whose quantile is computed
+    a, b : array_like
+           Positive, real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_invgauss_ppf",
+    """
+    _invgauss_ppf(x, mu)
+
+    Percent point function of inverse gaussian distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Positive real-valued
+    mu : array_like
+        Positive, real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_invgauss_isf",
+    """
+    _invgauss_isf(x, mu, s)
+
+    Inverse survival function of inverse gaussian distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Positive real-valued
+    mu : array_like
+        Positive, real-valued parameters
+    s : array_like
+        Positive, real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_cauchy_ppf",
+    """
+    _cauchy_ppf(p, loc, scale)
+
+    Percent point function (i.e. quantile) of the Cauchy distribution.
+
+    Parameters
+    ----------
+    p : array_like
+        Probabilities
+    loc : array_like
+        Location parameter of the distribution.
+    scale : array_like
+        Scale parameter of the distribution.
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_cauchy_isf",
+    """
+    _cauchy_isf(p, loc, scale)
+
+    Inverse survival function of the Cauchy distribution.
+
+    Parameters
+    ----------
+    p : array_like
+        Probabilities
+    loc : array_like
+        Location parameter of the distribution.
+    scale : array_like
+        Scale parameter of the distribution.
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_ncx2_pdf",
+    """
+    _ncx2_pdf(x, k, l)
+
+    Probability density function of Non-central chi-squared distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Positive real-valued
+    k, l : array_like
+        Positive, real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_ncx2_cdf",
+    """
+    _ncx2_cdf(x, k, l)
+
+    Cumulative distribution function of Non-central chi-squared distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Positive real-valued
+    k, l : array_like
+        Positive, real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_ncx2_ppf",
+    """
+    _ncx2_ppf(x, k, l)
+
+    Percent point function of Non-central chi-squared distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Positive real-valued
+    k, l : array_like
+        Positive, real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_ncx2_sf",
+    """
+    _ncx2_sf(x, k, l)
+
+    Survival function of Non-central chi-squared distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Positive real-valued
+    k, l : array_like
+        Positive, real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_ncx2_isf",
+    """
+    _ncx2_isf(x, k, l)
+
+    Inverse survival function of Non-central chi-squared distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Positive real-valued
+    k, l : array_like
+        Positive, real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_ncf_pdf",
+    """
+    _ncf_pdf(x, v1, v2, l)
+
+    Probability density function of noncentral F-distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Positive real-valued
+    v1, v2, l : array_like
+        Positive, real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_ncf_cdf",
+    """
+    _ncf_cdf(x, v1, v2, l)
+
+    Cumulative distribution function of noncentral F-distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Positive real-valued
+    v1, v2, l : array_like
+        Positive, real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_ncf_ppf",
+    """
+    _ncf_ppf(x, v1, v2, l)
+
+    Percent point function of noncentral F-distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Positive real-valued
+    v1, v2, l : array_like
+        Positive, real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_ncf_sf",
+    """
+    _ncf_sf(x, v1, v2, l)
+
+    Survival function of noncentral F-distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Positive real-valued
+    v1, v2, l : array_like
+        Positive, real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_ncf_isf",
+    """
+    _ncf_isf(x, v1, v2, l)
+
+    Inverse survival function of noncentral F-distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Positive real-valued
+    v1, v2, l : array_like
+        Positive, real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_ncf_mean",
+    """
+    _ncf_mean(v1, v2, l)
+
+    Mean of noncentral F-distribution.
+
+    Parameters
+    ----------
+    v1, v2, l : array_like
+        Positive, real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_ncf_variance",
+    """
+    _ncf_variance(v1, v2, l)
+
+    Variance of noncentral F-distribution.
+
+    Parameters
+    ----------
+    v1, v2, l : array_like
+        Positive, real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_ncf_skewness",
+    """
+    _ncf_skewness(v1, v2, l)
+
+    Skewness of noncentral F-distribution.
+
+    Parameters
+    ----------
+    v1, v2, l : array_like
+        Positive, real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_ncf_kurtosis_excess",
+    """
+    _ncf_kurtosis_excess(v1, v2, l)
+
+    Kurtosis excess of noncentral F-distribution.
+
+    Parameters
+    ----------
+    v1, v2, l : array_like
+        Positive, real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_nct_cdf",
+    """
+    _nct_cdf(x, v, l)
+
+    Cumulative distribution function of noncentral t-distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued
+    v : array_like
+        Positive, real-valued parameters
+    l : array_like
+        Real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_nct_pdf",
+    """
+    _nct_pdf(x, v, l)
+
+    Probability density function of noncentral t-distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued
+    v : array_like
+        Positive, real-valued parameters
+    l : array_like
+        Real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+
+add_newdoc(
+    "_nct_ppf",
+    """
+    _nct_ppf(x, v, l)
+
+    Percent point function of noncentral t-distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued
+    v : array_like
+        Positive, real-valued parameters
+    l : array_like
+        Real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_nct_sf",
+    """
+    _nct_sf(x, v, l)
+
+    Survival function of noncentral t-distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued
+    v : array_like
+        Positive, real-valued parameters
+    l : array_like
+        Real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_nct_isf",
+    """
+    _nct_isf(x, v, l)
+
+    Inverse survival function of noncentral t-distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued
+    v : array_like
+        Positive, real-valued parameters
+    l : array_like
+        Real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_nct_mean",
+    """
+    _nct_mean(v, l)
+
+    Mean of noncentral t-distribution.
+
+    Parameters
+    ----------
+    v : array_like
+        Positive, real-valued parameters
+    l : array_like
+        Real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_nct_variance",
+    """
+    _nct_variance(v, l)
+
+    Variance of noncentral t-distribution.
+
+    Parameters
+    ----------
+    v : array_like
+        Positive, real-valued parameters
+    l : array_like
+        Real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_nct_skewness",
+    """
+    _nct_skewness(v, l)
+
+    Skewness of noncentral t-distribution.
+
+    Parameters
+    ----------
+    v : array_like
+        Positive, real-valued parameters
+    l : array_like
+        Real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_nct_kurtosis_excess",
+    """
+    _nct_kurtosis_excess(v, l)
+
+    Kurtosis excess of noncentral t-distribution.
+
+    Parameters
+    ----------
+    v : array_like
+        Positive, real-valued parameters
+    l : array_like
+        Real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_skewnorm_cdf",
+    """
+    _skewnorm_cdf(x, l, sc, sh)
+
+    Cumulative distribution function of skewnorm distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued
+    l : array_like
+        Real-valued parameters
+    sc : array_like
+        Positive, Real-valued parameters
+    sh : array_like
+        Real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_skewnorm_ppf",
+    """
+    _skewnorm_ppf(x, l, sc, sh)
+
+    Percent point function of skewnorm distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued
+    l : array_like
+        Real-valued parameters
+    sc : array_like
+        Positive, Real-valued parameters
+    sh : array_like
+        Real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_skewnorm_isf",
+    """
+    _skewnorm_isf(x, l, sc, sh)
+
+    Inverse survival function of skewnorm distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued
+    l : array_like
+        Real-valued parameters
+    sc : array_like
+        Positive, Real-valued parameters
+    sh : array_like
+        Real-valued parameters
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_binom_pmf",
+    """
+    _binom_pmf(x, n, p)
+
+    Probability mass function of binomial distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued
+    n : array_like
+        Positive, integer-valued parameter
+    p : array_like
+        Positive, real-valued parameter
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_binom_cdf",
+    """
+    _binom_cdf(x, n, p)
+
+    Cumulative distribution function of binomial distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued
+    n : array_like
+        Positive, integer-valued parameter
+    p : array_like
+        Positive, real-valued parameter
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_binom_ppf",
+    """
+    _binom_ppf(x, n, p)
+
+    Percent point function of binomial distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued
+    n : array_like
+        Positive, integer-valued parameter
+    p : array_like
+        Positive, real-valued parameter
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_binom_sf",
+    """
+    _binom_sf(x, n, p)
+
+    Survival function of binomial distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued
+    n : array_like
+        Positive, integer-valued parameter
+    p : array_like
+        Positive, real-valued parameter
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_binom_isf",
+    """
+    _binom_isf(x, n, p)
+
+    Inverse survival function of binomial distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued
+    n : array_like
+        Positive, integer-valued parameter
+    p : array_like
+        Positive, real-valued parameter
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_nbinom_pmf",
+    """
+    _nbinom_pmf(x, r, p)
+
+    Probability mass function of negative binomial distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued
+    r : array_like
+        Positive, integer-valued parameter
+    p : array_like
+        Positive, real-valued parameter
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_nbinom_cdf",
+    """
+    _nbinom_cdf(x, r, p)
+
+    Cumulative distribution function of negative binomial distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued
+    r : array_like
+        Positive, integer-valued parameter
+    p : array_like
+        Positive, real-valued parameter
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_nbinom_ppf",
+    """
+    _nbinom_ppf(x, r, p)
+
+    Percent point function of negative binomial distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued
+    r : array_like
+        Positive, integer-valued parameter
+    p : array_like
+        Positive, real-valued parameter
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_nbinom_sf",
+    """
+    _nbinom_sf(x, r, p)
+
+    Survival function of negative binomial distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued
+    r : array_like
+        Positive, integer-valued parameter
+    p : array_like
+        Positive, real-valued parameter
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_nbinom_isf",
+    """
+    _nbinom_isf(x, r, p)
+
+    Inverse survival function of negative binomial distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued
+    r : array_like
+        Positive, integer-valued parameter
+    p : array_like
+        Positive, real-valued parameter
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_nbinom_mean",
+    """
+    _nbinom_mean(r, p)
+
+    Mean of negative binomial distribution.
+
+    Parameters
+    ----------
+    r : array_like
+        Positive, integer-valued parameter
+    p : array_like
+        Positive, real-valued parameter
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_nbinom_variance",
+    """
+    _nbinom_variance(r, p)
+
+    Variance of negative binomial distribution.
+
+    Parameters
+    ----------
+    r : array_like
+        Positive, integer-valued parameter
+    p : array_like
+        Positive, real-valued parameter
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_nbinom_skewness",
+    """
+    _nbinom_skewness(r, p)
+
+    Skewness of negative binomial distribution.
+
+    Parameters
+    ----------
+    r : array_like
+        Positive, integer-valued parameter
+    p : array_like
+        Positive, real-valued parameter
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_nbinom_kurtosis_excess",
+    """
+    _nbinom_kurtosis_excess(r, p)
+
+    Kurtosis excess of negative binomial distribution.
+
+    Parameters
+    ----------
+    r : array_like
+        Positive, integer-valued parameter
+    p : array_like
+        Positive, real-valued parameter
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_hypergeom_pmf",
+    """
+    _hypergeom_pmf(x, r, N, M)
+
+    Probability mass function of hypergeometric distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued
+    r, N, M : array_like
+        Positive, integer-valued parameter
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_hypergeom_cdf",
+    """
+    _hypergeom_cdf(x, r, N, M)
+
+    Cumulative distribution function of hypergeometric distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued
+    r, N, M : array_like
+        Positive, integer-valued parameter
+
+    Returns
+    -------
+    scalar or ndarray
+    """)
+
+add_newdoc(
+    "_hypergeom_sf",
+    """
+    _hypergeom_sf(x, r, N, M)
+
+    Survival function of hypergeometric distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Real-valued
+    r, N, M : array_like
+        Positive, integer-valued parameter
+
+    Returns
+    -------
+    scalar or ndarray
+    """)
+
+add_newdoc(
+    "_hypergeom_mean",
+    """
+    _hypergeom_mean(r, N, M)
+
+    Mean of hypergeometric distribution.
+
+    Parameters
+    ----------
+    r, N, M : array_like
+        Positive, integer-valued parameter
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_hypergeom_variance",
+    """
+    _hypergeom_variance(r, N, M)
+
+    Variance of hypergeometric distribution.
+
+    Parameters
+    ----------
+    r, N, M : array_like
+        Positive, integer-valued parameter
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
+
+add_newdoc(
+    "_hypergeom_skewness",
+    """
+    _hypergeom_skewness(r, N, M)
+
+    Skewness of hypergeometric distribution.
+
+    Parameters
+    ----------
+    r, N, M : array_like
+        Positive, integer-valued parameter
+
+    Returns
+    -------
+    scalar or ndarray
+
+    """)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_basic.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_basic.py
new file mode 100644
index 0000000000000000000000000000000000000000..1698d82b0001b315ca0b1ab49b3c8243896de34f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_basic.py
@@ -0,0 +1,3382 @@
+#
+# Author:  Travis Oliphant, 2002
+#
+
+import numpy as np
+import math
+import warnings
+from collections import defaultdict
+from heapq import heapify, heappop
+from numpy import (pi, asarray, floor, isscalar, sqrt, where,
+                   sin, place, issubdtype, extract, inexact, nan, zeros, sinc)
+
+from . import _ufuncs
+from ._ufuncs import (mathieu_a, mathieu_b, iv, jv, gamma, rgamma,
+                      psi, hankel1, hankel2, yv, kv, poch, binom,
+                      _stirling2_inexact)
+
+from ._gufuncs import _lqn, _lqmn, _rctj, _rcty
+from ._input_validation import _nonneg_int_or_fail
+from . import _specfun
+from ._comb import _comb_int
+
+
+__all__ = [
+    'ai_zeros',
+    'assoc_laguerre',
+    'bei_zeros',
+    'beip_zeros',
+    'ber_zeros',
+    'bernoulli',
+    'berp_zeros',
+    'bi_zeros',
+    'comb',
+    'digamma',
+    'diric',
+    'erf_zeros',
+    'euler',
+    'factorial',
+    'factorial2',
+    'factorialk',
+    'fresnel_zeros',
+    'fresnelc_zeros',
+    'fresnels_zeros',
+    'h1vp',
+    'h2vp',
+    'ivp',
+    'jn_zeros',
+    'jnjnp_zeros',
+    'jnp_zeros',
+    'jnyn_zeros',
+    'jvp',
+    'kei_zeros',
+    'keip_zeros',
+    'kelvin_zeros',
+    'ker_zeros',
+    'kerp_zeros',
+    'kvp',
+    'lmbda',
+    'lqmn',
+    'lqn',
+    'mathieu_even_coef',
+    'mathieu_odd_coef',
+    'obl_cv_seq',
+    'pbdn_seq',
+    'pbdv_seq',
+    'pbvv_seq',
+    'perm',
+    'polygamma',
+    'pro_cv_seq',
+    'riccati_jn',
+    'riccati_yn',
+    'sinc',
+    'softplus',
+    'stirling2',
+    'y0_zeros',
+    'y1_zeros',
+    'y1p_zeros',
+    'yn_zeros',
+    'ynp_zeros',
+    'yvp',
+    'zeta'
+]
+
+
+# mapping k to last n such that factorialk(n, k) < np.iinfo(np.int64).max
+_FACTORIALK_LIMITS_64BITS = {1: 20, 2: 33, 3: 44, 4: 54, 5: 65,
+                             6: 74, 7: 84, 8: 93, 9: 101}
+# mapping k to last n such that factorialk(n, k) < np.iinfo(np.int32).max
+_FACTORIALK_LIMITS_32BITS = {1: 12, 2: 19, 3: 25, 4: 31, 5: 37,
+                             6: 43, 7: 47, 8: 51, 9: 56}
+
+
+def diric(x, n):
+    """Evaluate the Dirichlet kernel, also known as the periodic sinc function.
+
+    For a positive integer `n`, the kernel is given by::
+
+        diric(x, n) = sin(x * n/2) / (n * sin(x / 2))
+
+    Entries for which `n` is not a positive integer evaluate to nan.
+
+    Parameters
+    ----------
+    x : array_like
+        Points at which the kernel is evaluated.
+    n : int
+        Integer defining the periodicity.
+
+    Returns
+    -------
+    diric : ndarray
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import special
+    >>> import matplotlib.pyplot as plt
+
+    >>> x = np.linspace(-8*np.pi, 8*np.pi, num=201)
+    >>> plt.figure(figsize=(8, 8));
+    >>> for idx, n in enumerate([2, 3, 4, 9]):
+    ...     plt.subplot(2, 2, idx+1)
+    ...     plt.plot(x, special.diric(x, n))
+    ...     plt.title('diric, n={}'.format(n))
+    >>> plt.show()
+
+    The next example shows that, up to sign and scaling, `diric`
+    reproduces the magnitudes of the Fourier coefficients of a
+    rectangular pulse.
+
+    Hide printed values that are effectively 0:
+
+    >>> np.set_printoptions(suppress=True)
+
+    Build a signal `x` of length `m` whose first `k` samples are one:
+
+    >>> m = 8
+    >>> k = 3
+    >>> x = np.zeros(m)
+    >>> x[:k] = 1
+
+    Take the FFT of `x` and look at the magnitudes of its
+    coefficients:
+
+    >>> np.abs(np.fft.fft(x))
+    array([ 3.        ,  2.41421356,  1.        ,  0.41421356,  1.        ,
+            0.41421356,  1.        ,  2.41421356])
+
+    The same values (up to sign) are obtained from `diric`. The factor
+    `k` compensates for the different scaling conventions of
+    `numpy.fft.fft` and `diric`:
+
+    >>> theta = np.linspace(0, 2*np.pi, m, endpoint=False)
+    >>> k * special.diric(theta, k)
+    array([ 3.        ,  2.41421356,  1.        , -0.41421356, -1.        ,
+           -0.41421356,  1.        ,  2.41421356])
+    """
+    # Broadcast x and n against each other and promote to a common dtype.
+    x, n = asarray(x), asarray(n)
+    n = asarray(n + (x - x))
+    x = asarray(x + (n - n))
+    out_dtype = x.dtype if issubdtype(x.dtype, inexact) else float
+    result = zeros(x.shape, out_dtype)
+
+    # Empirical cutoff on |sin(x/2)| for 32, 64 or 128 bit floats; below it
+    # the result is fixed at +1 or -1.
+    eps = np.finfo(out_dtype).eps
+    if eps < 1e-18:
+        tiny = 1e-11
+    elif eps < 1e-15:
+        tiny = 1e-7
+    else:
+        tiny = 1e-3
+
+    # Orders that are not positive integers produce nan.
+    invalid = (n <= 0) | (n != floor(n))
+    place(result, invalid, nan)
+
+    half_x = x / 2
+    sin_half = sin(half_x)
+    # Denominator below the cutoff: substitute the signed unit value.
+    near_pole = ~invalid & (abs(sin_half) < tiny)
+    periods = extract(near_pole, half_x) / pi
+    order = extract(near_pole, n)
+    place(result, near_pole, pow(-1, np.round(periods) * (order - 1)))
+
+    # Everywhere else evaluate the defining quotient directly.
+    regular = ~invalid & ~near_pole
+    order = extract(regular, n)
+    numer = sin(order * extract(regular, half_x))
+    place(result, regular, numer / (order * extract(regular, sin_half)))
+    return result
+
+
+def jnjnp_zeros(nt):
+    """Compute zeros of integer-order Bessel functions Jn and Jn'.
+
+    The results are ordered by the magnitude of the zeros.
+
+    Parameters
+    ----------
+    nt : int
+        Number of zeros to compute; must be an integer <= 1200.
+
+    Returns
+    -------
+    zo[l-1] : ndarray
+        Value of the lth zero of Jn(x) and Jn'(x). Of length `nt`.
+    n[l-1] : ndarray
+        Order of the Jn(x) or Jn'(x) belonging to the lth zero. Of length `nt`.
+    m[l-1] : ndarray
+        Serial number of the zeros of Jn(x) or Jn'(x) belonging
+        to the lth zero. Of length `nt`.
+    t[l-1] : ndarray
+        0 if the lth zero in zo is a zero of Jn(x), 1 if it is a zero of
+        Jn'(x). Of length `nt`.
+
+    See Also
+    --------
+    jn_zeros, jnp_zeros : to get separated arrays of zeros.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996, chapter 5.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    """
+    if not (isscalar(nt) and floor(nt) == nt and nt <= 1200):
+        raise ValueError("Number must be integer <= 1200.")
+    count = int(nt)
+    order, serial, kind, zeros_all = _specfun.jdzo(count)
+    return zeros_all[1:count + 1], order[:count], serial[:count], kind[:count]
+
+
+def jnyn_zeros(n, nt):
+    """Compute nt zeros of Bessel functions Jn(x), Jn'(x), Yn(x), and Yn'(x).
+
+    Returns 4 arrays of length `nt`, corresponding to the first `nt`
+    zeros of Jn(x), Jn'(x), Yn(x), and Yn'(x), respectively. The zeros
+    are returned in ascending order.
+
+    Parameters
+    ----------
+    n : int
+        Order of the Bessel functions; only ``abs(n)`` is used
+    nt : int
+        Number (<=1200) of zeros to compute; must be a positive integer
+
+    Returns
+    -------
+    Jn : ndarray
+        First `nt` zeros of Jn
+    Jnp : ndarray
+        First `nt` zeros of Jn'
+    Yn : ndarray
+        First `nt` zeros of Yn
+    Ynp : ndarray
+        First `nt` zeros of Yn'
+
+    See Also
+    --------
+    jn_zeros, jnp_zeros, yn_zeros, ynp_zeros
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996, chapter 5.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    Examples
+    --------
+    Compute the first three roots of :math:`J_1`, :math:`J_1'`,
+    :math:`Y_1` and :math:`Y_1'`.
+
+    >>> from scipy.special import jnyn_zeros
+    >>> jn_roots, jnp_roots, yn_roots, ynp_roots = jnyn_zeros(1, 3)
+    >>> jn_roots, yn_roots
+    (array([ 3.83170597,  7.01558667, 10.17346814]),
+     array([2.19714133, 5.42968104, 8.59600587]))
+
+    Plot :math:`J_1`, :math:`J_1'`, :math:`Y_1`, :math:`Y_1'` and their roots.
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.special import jnyn_zeros, jvp, jn, yvp, yn
+    >>> jn_roots, jnp_roots, yn_roots, ynp_roots = jnyn_zeros(1, 3)
+    >>> fig, ax = plt.subplots()
+    >>> xmax = 11
+    >>> x = np.linspace(0, xmax)
+    >>> x[0] += 1e-15
+    >>> ax.plot(x, jn(1, x), label=r"$J_1$", c='r')
+    >>> ax.plot(x, jvp(1, x, 1), label=r"$J_1'$", c='b')
+    >>> ax.plot(x, yn(1, x), label=r"$Y_1$", c='y')
+    >>> ax.plot(x, yvp(1, x, 1), label=r"$Y_1'$", c='c')
+    >>> zeros = np.zeros((3, ))
+    >>> ax.scatter(jn_roots, zeros, s=30, c='r', zorder=5,
+    ...            label=r"$J_1$ roots")
+    >>> ax.scatter(jnp_roots, zeros, s=30, c='b', zorder=5,
+    ...            label=r"$J_1'$ roots")
+    >>> ax.scatter(yn_roots, zeros, s=30, c='y', zorder=5,
+    ...            label=r"$Y_1$ roots")
+    >>> ax.scatter(ynp_roots, zeros, s=30, c='c', zorder=5,
+    ...            label=r"$Y_1'$ roots")
+    >>> ax.hlines(0, 0, xmax, color='k')
+    >>> ax.set_ylim(-0.6, 0.6)
+    >>> ax.set_xlim(0, xmax)
+    >>> ax.legend(ncol=2, bbox_to_anchor=(1., 0.75))
+    >>> plt.tight_layout()
+    >>> plt.show()
+    """
+    if not (isscalar(nt) and isscalar(n)):
+        raise ValueError("Arguments must be scalars.")
+    if (floor(n) != n) or (floor(nt) != nt):
+        raise ValueError("Arguments must be integers.")
+    if (nt <= 0):
+        raise ValueError("nt > 0")
+    return _specfun.jyzo(abs(n), nt)
+
+
+def jn_zeros(n, nt):
+    r"""Compute zeros of integer-order Bessel functions Jn.
+
+    Compute `nt` zeros of the Bessel functions :math:`J_n(x)` on the
+    interval :math:`(0, \infty)`. The zeros are returned in ascending
+    order. Note that this interval excludes the zero at :math:`x = 0`
+    that exists for :math:`n > 0`.
+
+    Parameters
+    ----------
+    n : int
+        Order of Bessel function; only ``abs(n)`` is used
+    nt : int
+        Number of zeros to return; must be a positive integer
+
+    Returns
+    -------
+    ndarray
+        First `nt` zeros of the Bessel function.
+
+    See Also
+    --------
+    jv: Real-order Bessel functions of the first kind
+    jnp_zeros: Zeros of :math:`Jn'`
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996, chapter 5.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    Examples
+    --------
+    Compute the first four positive roots of :math:`J_3`.
+
+    >>> from scipy.special import jn_zeros
+    >>> jn_zeros(3, 4)
+    array([ 6.3801619 ,  9.76102313, 13.01520072, 16.22346616])
+
+    Plot :math:`J_3` and its first four positive roots. Note
+    that the root located at 0 is not returned by `jn_zeros`.
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.special import jn, jn_zeros
+    >>> j3_roots = jn_zeros(3, 4)
+    >>> xmax = 18
+    >>> xmin = -1
+    >>> x = np.linspace(xmin, xmax, 500)
+    >>> fig, ax = plt.subplots()
+    >>> ax.plot(x, jn(3, x), label=r'$J_3$')
+    >>> ax.scatter(j3_roots, np.zeros((4, )), s=30, c='r',
+    ...            label=r"$J_3$_Zeros", zorder=5)
+    >>> ax.scatter(0, 0, s=30, c='k',
+    ...            label=r"Root at 0", zorder=5)
+    >>> ax.hlines(0, 0, xmax, color='k')
+    >>> ax.set_xlim(xmin, xmax)
+    >>> plt.legend()
+    >>> plt.show()
+    """
+    return jnyn_zeros(n, nt)[0]
+
+
+def jnp_zeros(n, nt):
+    r"""Compute zeros of integer-order Bessel function derivatives Jn'.
+
+    Compute `nt` zeros of the functions :math:`J_n'(x)` on the
+    interval :math:`(0, \infty)`. The zeros are returned in ascending
+    order. Note that this interval excludes the zero at :math:`x = 0`
+    that exists for :math:`n > 1`.
+
+    Parameters
+    ----------
+    n : int
+        Order of Bessel function; only ``abs(n)`` is used
+    nt : int
+        Number of zeros to return; must be a positive integer
+
+    Returns
+    -------
+    ndarray
+        First `nt` zeros of the Bessel function derivative.
+
+    See Also
+    --------
+    jvp: Derivatives of Bessel functions of the first kind
+    jv: Real-order Bessel functions of the first kind
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996, chapter 5.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    Examples
+    --------
+    Compute the first four roots of :math:`J_2'`.
+
+    >>> from scipy.special import jnp_zeros
+    >>> jnp_zeros(2, 4)
+    array([ 3.05423693,  6.70613319,  9.96946782, 13.17037086])
+
+    As `jnp_zeros` yields the roots of :math:`J_n'`, it can be used to
+    compute the locations of the peaks of :math:`J_n`. Plot
+    :math:`J_2`, :math:`J_2'` and the locations of the roots of :math:`J_2'`.
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.special import jn, jnp_zeros, jvp
+    >>> j2_roots = jnp_zeros(2, 4)
+    >>> xmax = 15
+    >>> x = np.linspace(0, xmax, 500)
+    >>> fig, ax = plt.subplots()
+    >>> ax.plot(x, jn(2, x), label=r'$J_2$')
+    >>> ax.plot(x, jvp(2, x, 1), label=r"$J_2'$")
+    >>> ax.hlines(0, 0, xmax, color='k')
+    >>> ax.scatter(j2_roots, np.zeros((4, )), s=30, c='r',
+    ...            label=r"Roots of $J_2'$", zorder=5)
+    >>> ax.set_ylim(-0.4, 0.8)
+    >>> ax.set_xlim(0, xmax)
+    >>> plt.legend()
+    >>> plt.show()
+    """
+    return jnyn_zeros(n, nt)[1]
+
+
+def yn_zeros(n, nt):
+    r"""Compute zeros of integer-order Bessel function Yn(x).
+
+    Compute `nt` zeros of the functions :math:`Y_n(x)` on the interval
+    :math:`(0, \infty)`. The zeros are returned in ascending order.
+
+    Parameters
+    ----------
+    n : int
+        Order of Bessel function; only ``abs(n)`` is used
+    nt : int
+        Number of zeros to return; must be a positive integer
+
+    Returns
+    -------
+    ndarray
+        First `nt` zeros of the Bessel function.
+
+    See Also
+    --------
+    yn: Bessel function of the second kind for integer order
+    yv: Bessel function of the second kind for real order
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996, chapter 5.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    Examples
+    --------
+    Compute the first four roots of :math:`Y_2`.
+
+    >>> from scipy.special import yn_zeros
+    >>> yn_zeros(2, 4)
+    array([ 3.38424177,  6.79380751, 10.02347798, 13.20998671])
+
+    Plot :math:`Y_2` and its first four roots.
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.special import yn, yn_zeros
+    >>> xmin = 2
+    >>> xmax = 15
+    >>> x = np.linspace(xmin, xmax, 500)
+    >>> fig, ax = plt.subplots()
+    >>> ax.hlines(0, xmin, xmax, color='k')
+    >>> ax.plot(x, yn(2, x), label=r'$Y_2$')
+    >>> ax.scatter(yn_zeros(2, 4), np.zeros((4, )), s=30, c='r',
+    ...            label='Roots', zorder=5)
+    >>> ax.set_ylim(-0.4, 0.4)
+    >>> ax.set_xlim(xmin, xmax)
+    >>> plt.legend()
+    >>> plt.show()
+    """
+    return jnyn_zeros(n, nt)[2]
+
+
+def ynp_zeros(n, nt):
+    r"""Compute zeros of integer-order Bessel function derivatives Yn'(x).
+
+    Compute `nt` zeros of the functions :math:`Y_n'(x)` on the
+    interval :math:`(0, \infty)`. The zeros are returned in ascending
+    order.
+
+    Parameters
+    ----------
+    n : int
+        Order of Bessel function; only ``abs(n)`` is used
+    nt : int
+        Number of zeros to return; must be a positive integer
+
+    Returns
+    -------
+    ndarray
+        First `nt` zeros of the Bessel derivative function.
+
+    See Also
+    --------
+    yvp : Derivatives of Bessel functions of the second kind
+    yn_zeros : Zeros of integer-order Bessel function Yn(x)
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996, chapter 5.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    Examples
+    --------
+    Compute the first four roots of the first derivative of the
+    Bessel function of second kind for order 0 :math:`Y_0'`.
+
+    >>> from scipy.special import ynp_zeros
+    >>> ynp_zeros(0, 4)
+    array([ 2.19714133,  5.42968104,  8.59600587, 11.74915483])
+
+    Plot :math:`Y_0`, :math:`Y_0'` and confirm visually that the roots of
+    :math:`Y_0'` are located at local extrema of :math:`Y_0`.
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.special import yn, ynp_zeros, yvp
+    >>> zeros = ynp_zeros(0, 4)
+    >>> xmax = 13
+    >>> x = np.linspace(0, xmax, 500)
+    >>> fig, ax = plt.subplots()
+    >>> ax.plot(x, yn(0, x), label=r'$Y_0$')
+    >>> ax.plot(x, yvp(0, x, 1), label=r"$Y_0'$")
+    >>> ax.scatter(zeros, np.zeros((4, )), s=30, c='r',
+    ...            label=r"Roots of $Y_0'$", zorder=5)
+    >>> for root in zeros:
+    ...     y0_extremum = yn(0, root)
+    ...     lower = min(0, y0_extremum)
+    ...     upper = max(0, y0_extremum)
+    ...     ax.vlines(root, lower, upper, color='r')
+    >>> ax.hlines(0, 0, xmax, color='k')
+    >>> ax.set_ylim(-0.6, 0.6)
+    >>> ax.set_xlim(0, xmax)
+    >>> plt.legend()
+    >>> plt.show()
+    """
+    return jnyn_zeros(n, nt)[3]
+
+
+def y0_zeros(nt, complex=False):
+    """Compute nt zeros of Bessel function Y0(z), and derivative at each zero.
+
+    The derivatives are given by Y0'(z0) = -Y1(z0) at each zero z0.
+
+    Parameters
+    ----------
+    nt : int
+        Number of zeros to return; must be a positive integer
+    complex : bool, default False
+        Set to False to return only the real zeros; set to True to return only
+        the complex zeros with negative real part and positive imaginary part.
+        Note that the complex conjugates of the latter are also zeros of the
+        function, but are not returned by this routine.
+
+    Returns
+    -------
+    z0n : ndarray
+        Location of nth zero of Y0(z)
+    y0pz0n : ndarray
+        Value of derivative Y0'(z0) for nth zero
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996, chapter 5.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    Examples
+    --------
+    Compute the first 4 real roots and the derivatives at the roots of
+    :math:`Y_0`:
+
+    >>> import numpy as np
+    >>> from scipy.special import y0_zeros
+    >>> zeros, grads = y0_zeros(4)
+    >>> with np.printoptions(precision=5):
+    ...     print(f"Roots: {zeros}")
+    ...     print(f"Gradients: {grads}")
+    Roots: [ 0.89358+0.j  3.95768+0.j  7.08605+0.j 10.22235+0.j]
+    Gradients: [-0.87942+0.j  0.40254+0.j -0.3001 +0.j  0.2497 +0.j]
+
+    Plot the real part of :math:`Y_0` and the first four computed roots.
+
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.special import y0
+    >>> xmin = 0
+    >>> xmax = 11
+    >>> x = np.linspace(xmin, xmax, 500)
+    >>> fig, ax = plt.subplots()
+    >>> ax.hlines(0, xmin, xmax, color='k')
+    >>> ax.plot(x, y0(x), label=r'$Y_0$')
+    >>> zeros, grads = y0_zeros(4)
+    >>> ax.scatter(zeros.real, np.zeros((4, )), s=30, c='r',
+    ...            label=r'$Y_0$_zeros', zorder=5)
+    >>> ax.set_ylim(-0.5, 0.6)
+    >>> ax.set_xlim(xmin, xmax)
+    >>> plt.legend(ncol=2)
+    >>> plt.show()
+
+    Compute the first 4 complex roots and the derivatives at the roots of
+    :math:`Y_0` by setting ``complex=True``:
+
+    >>> y0_zeros(4, True)
+    (array([ -2.40301663+0.53988231j,  -5.5198767 +0.54718001j,
+             -8.6536724 +0.54841207j, -11.79151203+0.54881912j]),
+     array([ 0.10074769-0.88196771j, -0.02924642+0.5871695j ,
+             0.01490806-0.46945875j, -0.00937368+0.40230454j]))
+    """
+    if not isscalar(nt) or (floor(nt) != nt) or (nt <= 0):
+        raise ValueError("Arguments must be scalar positive integer.")
+    kf = 0
+    kc = not complex
+    return _specfun.cyzo(nt, kf, kc)
+
+
+def y1_zeros(nt, complex=False):
+    """Compute nt zeros of Bessel function Y1(z), and derivative at each zero.
+
+    The derivatives are given by Y1'(z1) = Y0(z1) at each zero z1.
+
+    Parameters
+    ----------
+    nt : int
+        Number of zeros to return; must be a positive integer
+    complex : bool, default False
+        Set to False to return only the real zeros; set to True to return only
+        the complex zeros with negative real part and positive imaginary part.
+        Note that the complex conjugates of the latter are also zeros of the
+        function, but are not returned by this routine.
+
+    Returns
+    -------
+    z1n : ndarray
+        Location of nth zero of Y1(z)
+    y1pz1n : ndarray
+        Value of derivative Y1'(z1) for nth zero
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996, chapter 5.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    Examples
+    --------
+    Compute the first 4 real roots and the derivatives at the roots of
+    :math:`Y_1`:
+
+    >>> import numpy as np
+    >>> from scipy.special import y1_zeros
+    >>> zeros, grads = y1_zeros(4)
+    >>> with np.printoptions(precision=5):
+    ...     print(f"Roots: {zeros}")
+    ...     print(f"Gradients: {grads}")
+    Roots: [ 2.19714+0.j  5.42968+0.j  8.59601+0.j 11.74915+0.j]
+    Gradients: [ 0.52079+0.j -0.34032+0.j  0.27146+0.j -0.23246+0.j]
+
+    Extract the real parts:
+
+    >>> realzeros = zeros.real
+    >>> realzeros
+    array([ 2.19714133,  5.42968104,  8.59600587, 11.74915483])
+
+    Plot :math:`Y_1` and the first four computed roots.
+
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.special import y1
+    >>> xmin = 0
+    >>> xmax = 13
+    >>> x = np.linspace(xmin, xmax, 500)
+    >>> zeros, grads = y1_zeros(4)
+    >>> fig, ax = plt.subplots()
+    >>> ax.hlines(0, xmin, xmax, color='k')
+    >>> ax.plot(x, y1(x), label=r'$Y_1$')
+    >>> ax.scatter(zeros.real, np.zeros((4, )), s=30, c='r',
+    ...            label=r'$Y_1$_zeros', zorder=5)
+    >>> ax.set_ylim(-0.5, 0.5)
+    >>> ax.set_xlim(xmin, xmax)
+    >>> plt.legend()
+    >>> plt.show()
+
+    Compute the first 4 complex roots and the derivatives at the roots of
+    :math:`Y_1` by setting ``complex=True``:
+
+    >>> y1_zeros(4, True)
+    (array([ -0.50274327+0.78624371j,  -3.83353519+0.56235654j,
+             -7.01590368+0.55339305j, -10.17357383+0.55127339j]),
+     array([-0.45952768+1.31710194j,  0.04830191-0.69251288j,
+            -0.02012695+0.51864253j,  0.011614  -0.43203296j]))
+    """
+    if not isscalar(nt) or (floor(nt) != nt) or (nt <= 0):
+        raise ValueError("Arguments must be scalar positive integer.")
+    kf = 1
+    kc = not complex
+    return _specfun.cyzo(nt, kf, kc)
+
+
+def y1p_zeros(nt, complex=False):
+    """Compute nt zeros of Bessel derivative Y1'(z), and value at each zero.
+
+    The values are given by Y1(z1) at each z1 where Y1'(z1)=0.
+
+    Parameters
+    ----------
+    nt : int
+        Number of zeros to return; must be a positive integer
+    complex : bool, default False
+        Set to False to return only the real zeros; set to True to return only
+        the complex zeros with negative real part and positive imaginary part.
+        Note that the complex conjugates of the latter are also zeros of the
+        function, but are not returned by this routine.
+
+    Returns
+    -------
+    z1pn : ndarray
+        Location of nth zero of Y1'(z)
+    y1z1pn : ndarray
+        Value of Y1(z1) at the nth zero of Y1'
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996, chapter 5.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    Examples
+    --------
+    Compute the first four roots of :math:`Y_1'` and the values of
+    :math:`Y_1` at these roots.
+
+    >>> import numpy as np
+    >>> from scipy.special import y1p_zeros
+    >>> y1grad_roots, y1_values = y1p_zeros(4)
+    >>> with np.printoptions(precision=5):
+    ...     print(f"Y1' Roots: {y1grad_roots.real}")
+    ...     print(f"Y1 values: {y1_values.real}")
+    Y1' Roots: [ 3.68302  6.9415  10.1234  13.28576]
+    Y1 values: [ 0.41673 -0.30317  0.25091 -0.21897]
+
+    `y1p_zeros` can be used to calculate the extremal points of :math:`Y_1`
+    directly. Here we plot :math:`Y_1` and the first four extrema.
+
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.special import y1, yvp
+    >>> y1_roots, y1_values_at_roots = y1p_zeros(4)
+    >>> real_roots = y1_roots.real
+    >>> xmax = 15
+    >>> x = np.linspace(0, xmax, 500)
+    >>> x[0] += 1e-15
+    >>> fig, ax = plt.subplots()
+    >>> ax.plot(x, y1(x), label=r'$Y_1$')
+    >>> ax.plot(x, yvp(1, x, 1), label=r"$Y_1'$")
+    >>> ax.scatter(real_roots, np.zeros((4, )), s=30, c='r',
+    ...            label=r"Roots of $Y_1'$", zorder=5)
+    >>> ax.scatter(real_roots, y1_values_at_roots.real, s=30, c='k',
+    ...            label=r"Extrema of $Y_1$", zorder=5)
+    >>> ax.hlines(0, 0, xmax, color='k')
+    >>> ax.set_ylim(-0.5, 0.5)
+    >>> ax.set_xlim(0, xmax)
+    >>> ax.legend(ncol=2, bbox_to_anchor=(1., 0.75))
+    >>> plt.tight_layout()
+    >>> plt.show()
+    """
+    if not isscalar(nt) or (floor(nt) != nt) or (nt <= 0):
+        raise ValueError("Arguments must be scalar positive integer.")
+    kf = 2
+    kc = not complex
+    return _specfun.cyzo(nt, kf, kc)
+
+
+def _bessel_diff_formula(v, z, n, L, phase):
+    # Sum_{i=0..n} phase**i * C(n,i) * L(v-n+2i, z) / 2**n; from AMS55.
+    # L(v, z) = J(v, z), Y(v, z), H1(v, z), H2(v, z), phase = -1
+    # L(v, z) = I(v, z) or exp(v*pi*i)K(v, z), phase = 1
+    # For K, you can pull out the exp((v-k)*pi*i) into the caller
+    v = asarray(v)
+    p = 1.0
+    s = L(v-n, z)
+    for i in range(1, n+1):
+        p = phase * (p * (n-i+1)) / i   # = phase**i * choose(n, i)
+        s += p*L(v-n + i*2, z)
+    return s / (2.**n)
+
+
+def jvp(v, z, n=1):
+    """Compute derivatives of Bessel functions of the first kind.
+
+    Compute the nth derivative of the Bessel function `Jv` with
+    respect to `z`.
+
+    Parameters
+    ----------
+    v : array_like or float
+        Order of Bessel function
+    z : complex
+        Argument at which to evaluate the derivative; can be real or
+        complex.
+    n : int, default 1
+        Order of derivative. For 0 returns the Bessel function `jv` itself.
+
+    Returns
+    -------
+    scalar or ndarray
+        Values of the derivative of the Bessel function.
+
+    Notes
+    -----
+    The derivative is computed using the relation DLMF 10.6.7 [2]_.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996, chapter 5.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    .. [2] NIST Digital Library of Mathematical Functions.
+           https://dlmf.nist.gov/10.6.E7
+
+    Examples
+    --------
+
+    Compute the Bessel function of the first kind of order 0 and
+    its first two derivatives at 1.
+
+    >>> from scipy.special import jvp
+    >>> jvp(0, 1, 0), jvp(0, 1, 1), jvp(0, 1, 2)
+    (0.7651976865579666, -0.44005058574493355, -0.3251471008130331)
+
+    Compute the first derivative of the Bessel function of the first
+    kind for several orders at 1 by providing an array for `v`.
+
+    >>> jvp([0, 1, 2], 1, 1)
+    array([-0.44005059,  0.3251471 ,  0.21024362])
+
+    Compute the first derivative of the Bessel function of the first
+    kind of order 0 at several points by providing an array for `z`.
+
+    >>> import numpy as np
+    >>> points = np.array([0., 1.5, 3.])
+    >>> jvp(0, points, 1)
+    array([-0.        , -0.55793651, -0.33905896])
+
+    Plot the Bessel function of the first kind of order 1 and its
+    first three derivatives.
+
+    >>> import matplotlib.pyplot as plt
+    >>> x = np.linspace(-10, 10, 1000)
+    >>> fig, ax = plt.subplots()
+    >>> ax.plot(x, jvp(1, x, 0), label=r"$J_1$")
+    >>> ax.plot(x, jvp(1, x, 1), label=r"$J_1'$")
+    >>> ax.plot(x, jvp(1, x, 2), label=r"$J_1''$")
+    >>> ax.plot(x, jvp(1, x, 3), label=r"$J_1'''$")
+    >>> plt.legend()
+    >>> plt.show()
+    """
+    n = _nonneg_int_or_fail(n, 'n')
+    if n == 0:
+        return jv(v, z)
+    else:
+        return _bessel_diff_formula(v, z, n, jv, -1)
+
+
+def yvp(v, z, n=1):
+    """Compute derivatives of Bessel functions of the second kind.
+
+    Compute the nth derivative of the Bessel function `Yv` with
+    respect to `z`.
+
+    Parameters
+    ----------
+    v : array_like of float
+        Order of Bessel function
+    z : complex
+        Argument at which to evaluate the derivative
+    n : int, default 1
+        Order of derivative. For 0 returns the Bessel function `yv` itself.
+
+    Returns
+    -------
+    scalar or ndarray
+        nth derivative of the Bessel function.
+
+    See Also
+    --------
+    yv : Bessel functions of the second kind
+
+    Notes
+    -----
+    The derivative is computed using the relation DLMF 10.6.7 [2]_.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996, chapter 5.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    .. [2] NIST Digital Library of Mathematical Functions.
+           https://dlmf.nist.gov/10.6.E7
+
+    Examples
+    --------
+    Compute the Bessel function of the second kind of order 0 and
+    its first two derivatives at 1.
+
+    >>> from scipy.special import yvp
+    >>> yvp(0, 1, 0), yvp(0, 1, 1), yvp(0, 1, 2)
+    (0.088256964215677, 0.7812128213002889, -0.8694697855159659)
+
+    Compute the first derivative of the Bessel function of the second
+    kind for several orders at 1 by providing an array for `v`.
+
+    >>> yvp([0, 1, 2], 1, 1)
+    array([0.78121282, 0.86946979, 2.52015239])
+
+    Compute the first derivative of the Bessel function of the
+    second kind of order 0 at several points by providing an array for `z`.
+
+    >>> import numpy as np
+    >>> points = np.array([0.5, 1.5, 3.])
+    >>> yvp(0, points, 1)
+    array([ 1.47147239,  0.41230863, -0.32467442])
+
+    Plot the Bessel function of the second kind of order 1 and its
+    first three derivatives.
+
+    >>> import matplotlib.pyplot as plt
+    >>> x = np.linspace(0, 5, 1000)
+    >>> x[0] += 1e-15
+    >>> fig, ax = plt.subplots()
+    >>> ax.plot(x, yvp(1, x, 0), label=r"$Y_1$")
+    >>> ax.plot(x, yvp(1, x, 1), label=r"$Y_1'$")
+    >>> ax.plot(x, yvp(1, x, 2), label=r"$Y_1''$")
+    >>> ax.plot(x, yvp(1, x, 3), label=r"$Y_1'''$")
+    >>> ax.set_ylim(-10, 10)
+    >>> plt.legend()
+    >>> plt.show()
+    """
+    n = _nonneg_int_or_fail(n, 'n')
+    if n == 0:
+        return yv(v, z)
+    else:
+        return _bessel_diff_formula(v, z, n, yv, -1)
+
+
+def kvp(v, z, n=1):
+    """Compute derivatives of real-order modified Bessel function Kv(z).
+
+    Kv(z) is the modified Bessel function of the second kind.
+    Derivative is calculated with respect to `z`.
+
+    Parameters
+    ----------
+    v : array_like of float
+        Order of Bessel function
+    z : array_like of complex
+        Argument at which to evaluate the derivative
+    n : int, default 1
+        Order of derivative. For 0 returns the Bessel function `kv` itself.
+
+    Returns
+    -------
+    out : ndarray
+        The results
+
+    See Also
+    --------
+    kv
+
+    Notes
+    -----
+    The derivative is computed using the relation DLMF 10.29.5 [2]_.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996, chapter 6.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    .. [2] NIST Digital Library of Mathematical Functions.
+           https://dlmf.nist.gov/10.29.E5
+
+    Examples
+    --------
+    Compute the modified Bessel function of the second kind of order 0 and
+    its first two derivatives at 1.
+
+    >>> from scipy.special import kvp
+    >>> kvp(0, 1, 0), kvp(0, 1, 1), kvp(0, 1, 2)
+    (0.42102443824070834, -0.6019072301972346, 1.0229316684379428)
+
+    Compute the first derivative of the modified Bessel function of the second
+    kind for several orders at 1 by providing an array for `v`.
+
+    >>> kvp([0, 1, 2], 1, 1)
+    array([-0.60190723, -1.02293167, -3.85158503])
+
+    Compute the first derivative of the modified Bessel function of the
+    second kind of order 0 at several points by providing an array for `z`.
+
+    >>> import numpy as np
+    >>> points = np.array([0.5, 1.5, 3.])
+    >>> kvp(0, points, 1)
+    array([-1.65644112, -0.2773878 , -0.04015643])
+
+    Plot the modified Bessel function of the second kind and its
+    first three derivatives.
+
+    >>> import matplotlib.pyplot as plt
+    >>> x = np.linspace(0, 5, 1000)
+    >>> fig, ax = plt.subplots()
+    >>> ax.plot(x, kvp(1, x, 0), label=r"$K_1$")
+    >>> ax.plot(x, kvp(1, x, 1), label=r"$K_1'$")
+    >>> ax.plot(x, kvp(1, x, 2), label=r"$K_1''$")
+    >>> ax.plot(x, kvp(1, x, 3), label=r"$K_1'''$")
+    >>> ax.set_ylim(-2.5, 2.5)
+    >>> plt.legend()
+    >>> plt.show()
+    """
+    n = _nonneg_int_or_fail(n, 'n')
+    if n == 0:
+        return kv(v, z)
+    else:
+        return (-1)**n * _bessel_diff_formula(v, z, n, kv, 1)
+
+
+def ivp(v, z, n=1):
+    """Compute derivatives of modified Bessel functions of the first kind.
+
+    Compute the nth derivative of the modified Bessel function `Iv`
+    with respect to `z`.
+
+    Parameters
+    ----------
+    v : array_like or float
+        Order of Bessel function
+    z : array_like
+        Argument at which to evaluate the derivative; can be real or
+        complex.
+    n : int, default 1
+        Order of derivative. For 0, returns the Bessel function `iv` itself.
+
+    Returns
+    -------
+    scalar or ndarray
+        nth derivative of the modified Bessel function.
+
+    See Also
+    --------
+    iv
+
+    Notes
+    -----
+    The derivative is computed using the relation DLMF 10.29.5 [2]_.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996, chapter 6.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    .. [2] NIST Digital Library of Mathematical Functions.
+           https://dlmf.nist.gov/10.29.E5
+
+    Examples
+    --------
+    Compute the modified Bessel function of the first kind of order 0 and
+    its first two derivatives at 1.
+
+    >>> from scipy.special import ivp
+    >>> ivp(0, 1, 0), ivp(0, 1, 1), ivp(0, 1, 2)
+    (1.2660658777520084, 0.565159103992485, 0.7009067737595233)
+
+    Compute the first derivative of the modified Bessel function of the first
+    kind for several orders at 1 by providing an array for `v`.
+
+    >>> ivp([0, 1, 2], 1, 1)
+    array([0.5651591 , 0.70090677, 0.29366376])
+
+    Compute the first derivative of the modified Bessel function of the
+    first kind of order 0 at several points by providing an array for `z`.
+
+    >>> import numpy as np
+    >>> points = np.array([0., 1.5, 3.])
+    >>> ivp(0, points, 1)
+    array([0.        , 0.98166643, 3.95337022])
+
+    Plot the modified Bessel function of the first kind of order 1 and its
+    first three derivatives.
+
+    >>> import matplotlib.pyplot as plt
+    >>> x = np.linspace(-5, 5, 1000)
+    >>> fig, ax = plt.subplots()
+    >>> ax.plot(x, ivp(1, x, 0), label=r"$I_1$")
+    >>> ax.plot(x, ivp(1, x, 1), label=r"$I_1'$")
+    >>> ax.plot(x, ivp(1, x, 2), label=r"$I_1''$")
+    >>> ax.plot(x, ivp(1, x, 3), label=r"$I_1'''$")
+    >>> plt.legend()
+    >>> plt.show()
+    """
+    n = _nonneg_int_or_fail(n, 'n')
+    if n == 0:
+        return iv(v, z)
+    else:
+        return _bessel_diff_formula(v, z, n, iv, 1)
+
+
+def h1vp(v, z, n=1):
+    """Compute derivatives of Hankel function H1v(z) with respect to `z`.
+
+    Parameters
+    ----------
+    v : array_like
+        Order of Hankel function
+    z : array_like
+        Argument at which to evaluate the derivative. Can be real or
+        complex.
+    n : int, default 1
+        Order of derivative. For 0 returns the Hankel function `hankel1` itself.
+
+    Returns
+    -------
+    scalar or ndarray
+        Values of the derivative of the Hankel function.
+
+    See Also
+    --------
+    hankel1
+
+    Notes
+    -----
+    The derivative is computed using the relation DLMF 10.6.7 [2]_.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996, chapter 5.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    .. [2] NIST Digital Library of Mathematical Functions.
+           https://dlmf.nist.gov/10.6.E7
+
+    Examples
+    --------
+    Compute the Hankel function of the first kind of order 0 and
+    its first two derivatives at 1.
+
+    >>> from scipy.special import h1vp
+    >>> h1vp(0, 1, 0), h1vp(0, 1, 1), h1vp(0, 1, 2)
+    ((0.7651976865579664+0.088256964215677j),
+     (-0.44005058574493355+0.7812128213002889j),
+     (-0.3251471008130329-0.8694697855159659j))
+
+    Compute the first derivative of the Hankel function of the first kind
+    for several orders at 1 by providing an array for `v`.
+
+    >>> h1vp([0, 1, 2], 1, 1)
+    array([-0.44005059+0.78121282j,  0.3251471 +0.86946979j,
+           0.21024362+2.52015239j])
+
+    Compute the first derivative of the Hankel function of the first kind
+    of order 0 at several points by providing an array for `z`.
+
+    >>> import numpy as np
+    >>> points = np.array([0.5, 1.5, 3.])
+    >>> h1vp(0, points, 1)
+    array([-0.24226846+1.47147239j, -0.55793651+0.41230863j,
+           -0.33905896-0.32467442j])
+    """
+    n = _nonneg_int_or_fail(n, 'n')
+    if n == 0:
+        return hankel1(v, z)
+    else:
+        return _bessel_diff_formula(v, z, n, hankel1, -1)
+
+
+def h2vp(v, z, n=1):
+    """Evaluate derivatives of the Hankel function H2v(z) with respect to `z`.
+
+    Parameters
+    ----------
+    v : array_like
+        Order of the Hankel function of the second kind.
+    z : array_like
+        Point(s) at which the derivative is evaluated. May be real or
+        complex.
+    n : int, default 1
+        Order of the derivative. ``n = 0`` gives back `hankel2` itself.
+
+    Returns
+    -------
+    scalar or ndarray
+        The `n`-th derivative of the Hankel function.
+
+    See Also
+    --------
+    hankel2
+
+    Notes
+    -----
+    The derivative is obtained from the relation DLMF 10.6.7 [2]_.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996, chapter 5.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    .. [2] NIST Digital Library of Mathematical Functions.
+           https://dlmf.nist.gov/10.6.E7
+
+    Examples
+    --------
+    Compute the Hankel function of the second kind of order 0 and
+    its first two derivatives at 1.
+
+    >>> from scipy.special import h2vp
+    >>> h2vp(0, 1, 0), h2vp(0, 1, 1), h2vp(0, 1, 2)
+    ((0.7651976865579664-0.088256964215677j),
+     (-0.44005058574493355-0.7812128213002889j),
+     (-0.3251471008130329+0.8694697855159659j))
+
+    Compute the first derivative of the Hankel function of the second kind
+    for several orders at 1 by providing an array for `v`.
+
+    >>> h2vp([0, 1, 2], 1, 1)
+    array([-0.44005059-0.78121282j,  0.3251471 -0.86946979j,
+           0.21024362-2.52015239j])
+
+    Compute the first derivative of the Hankel function of the second kind
+    of order 0 at several points by providing an array for `z`.
+
+    >>> import numpy as np
+    >>> points = np.array([0.5, 1.5, 3.])
+    >>> h2vp(0, points, 1)
+    array([-0.24226846-1.47147239j, -0.55793651-0.41230863j,
+           -0.33905896+0.32467442j])
+    """
+    order = _nonneg_int_or_fail(n, 'n')
+    # Order zero is the function itself; no differentiation formula needed.
+    if order == 0:
+        return hankel2(v, z)
+    return _bessel_diff_formula(v, z, order, hankel2, -1)
+
+
+def riccati_jn(n, x):
+    r"""Riccati-Bessel function of the first kind and its first derivative.
+
+    With :math:`j_n` the spherical Bessel function of the first kind of
+    order :math:`n`, the Riccati-Bessel function of the first kind is
+    :math:`x j_n(x)`.
+
+    Values and first derivatives are returned for every order from 0 up to
+    and including `n`.
+
+    Parameters
+    ----------
+    n : int
+        Maximum order of function to compute
+    x : float
+        Argument at which to evaluate
+
+    Returns
+    -------
+    jn : ndarray
+        Values of the Riccati-Bessel function for orders ``0, ..., n``
+    jnp : ndarray
+        First derivatives for orders ``0, ..., n``
+
+    Raises
+    ------
+    ValueError
+        If `n` or `x` is not a scalar.
+
+    Notes
+    -----
+    The computation is carried out via backward recurrence, using the
+    relation DLMF 10.51.1 [2]_.
+
+    Wrapper for a Fortran routine created by Shanjie Zhang and Jianming
+    Jin [1]_.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+    .. [2] NIST Digital Library of Mathematical Functions.
+           https://dlmf.nist.gov/10.51.E1
+
+    """
+    if not isscalar(n) or not isscalar(x):
+        raise ValueError("arguments must be scalars.")
+    n = _nonneg_int_or_fail(n, 'n', strict=False)
+
+    # The output buffers always hold at least two entries, even when only
+    # order 0 is requested; the surplus is trimmed on return.
+    size = max(n, 1) + 1
+    jn = np.empty((size,), dtype=np.float64)
+    jnp = np.empty_like(jn)
+    _rctj(x, out=(jn, jnp))
+
+    return jn[:n + 1], jnp[:n + 1]
+
+
+def riccati_yn(n, x):
+    """Riccati-Bessel function of the second kind and its first derivative.
+
+    With :math:`y_n` the spherical Bessel function of the second kind of
+    order :math:`n`, the Riccati-Bessel function of the second kind is
+    defined here as :math:`+x y_n(x)`. *Note that a common alternative
+    convention includes a minus sign in the definition.*
+
+    Values and first derivatives are returned for every order from 0 up to
+    and including `n`.
+
+    Parameters
+    ----------
+    n : int
+        Maximum order of function to compute
+    x : float
+        Argument at which to evaluate
+
+    Returns
+    -------
+    yn : ndarray
+        Values of the Riccati-Bessel function for orders ``0, ..., n``
+    ynp : ndarray
+        First derivatives for orders ``0, ..., n``
+
+    Raises
+    ------
+    ValueError
+        If `n` or `x` is not a scalar.
+
+    Notes
+    -----
+    The computation is carried out via ascending recurrence, using the
+    relation DLMF 10.51.1 [2]_.
+
+    Wrapper for a Fortran routine created by Shanjie Zhang and Jianming
+    Jin [1]_.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+    .. [2] NIST Digital Library of Mathematical Functions.
+           https://dlmf.nist.gov/10.51.E1
+
+    """
+    if not isscalar(n) or not isscalar(x):
+        raise ValueError("arguments must be scalars.")
+    n = _nonneg_int_or_fail(n, 'n', strict=False)
+
+    # The output buffers always hold at least two entries, even when only
+    # order 0 is requested; the surplus is trimmed on return.
+    size = max(n, 1) + 1
+    yn = np.empty((size,), dtype=np.float64)
+    ynp = np.empty_like(yn)
+    _rcty(x, out=(yn, ynp))
+
+    return yn[:n + 1], ynp[:n + 1]
+
+
+def erf_zeros(nt):
+    """Compute the first nt zeros of erf in the first quadrant.
+
+    The zeros are ordered by absolute value. Zeros in the other quadrants
+    can be obtained by using the symmetries erf(-z) = -erf(z) and
+    erf(conj(z)) = conj(erf(z)).
+
+    Parameters
+    ----------
+    nt : int
+        The number of zeros to compute. Must be a positive scalar integer.
+
+    Returns
+    -------
+    zeros : ndarray (complex)
+        Complex values at which erf(z) is zero.
+
+    Raises
+    ------
+    ValueError
+        If `nt` is not a positive scalar integer.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    Examples
+    --------
+    >>> from scipy import special
+    >>> special.erf_zeros(1)
+    array([1.45061616+1.880943j])
+
+    Check that erf is (close to) zero for the value returned by erf_zeros
+
+    >>> special.erf(special.erf_zeros(1))
+    array([4.95159469e-14-1.16407394e-16j])
+
+    """
+    # Test for scalar-ness first: with the comparisons first, array-like
+    # input fails inside them and never reaches the intended message.
+    if not isscalar(nt) or (floor(nt) != nt) or (nt <= 0):
+        raise ValueError("Argument must be positive scalar integer.")
+    return _specfun.cerzo(nt)
+
+
+def fresnelc_zeros(nt):
+    """Compute nt complex zeros of cosine Fresnel integral C(z).
+
+    Parameters
+    ----------
+    nt : int
+        Number of zeros to compute. Must be a positive scalar integer.
+
+    Returns
+    -------
+    fresnelc_zeros: ndarray
+        Zeros of the cosine Fresnel integral
+
+    Raises
+    ------
+    ValueError
+        If `nt` is not a positive scalar integer.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    """
+    # Test for scalar-ness first: with the comparisons first, array-like
+    # input fails inside them and never reaches the intended message.
+    if not isscalar(nt) or (floor(nt) != nt) or (nt <= 0):
+        raise ValueError("Argument must be positive scalar integer.")
+    return _specfun.fcszo(1, nt)
+
+
+def fresnels_zeros(nt):
+    """Compute nt complex zeros of sine Fresnel integral S(z).
+
+    Parameters
+    ----------
+    nt : int
+        Number of zeros to compute. Must be a positive scalar integer.
+
+    Returns
+    -------
+    fresnels_zeros: ndarray
+        Zeros of the sine Fresnel integral
+
+    Raises
+    ------
+    ValueError
+        If `nt` is not a positive scalar integer.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    """
+    # Test for scalar-ness first: with the comparisons first, array-like
+    # input fails inside them and never reaches the intended message.
+    if not isscalar(nt) or (floor(nt) != nt) or (nt <= 0):
+        raise ValueError("Argument must be positive scalar integer.")
+    return _specfun.fcszo(2, nt)
+
+
+def fresnel_zeros(nt):
+    """Compute nt complex zeros of sine and cosine Fresnel integrals S(z) and C(z).
+
+    Parameters
+    ----------
+    nt : int
+        Number of zeros to compute. Must be a positive scalar integer.
+
+    Returns
+    -------
+    zeros_sine: ndarray
+        Zeros of the sine Fresnel integral
+    zeros_cosine : ndarray
+        Zeros of the cosine Fresnel integral
+
+    Raises
+    ------
+    ValueError
+        If `nt` is not a positive scalar integer.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    """
+    # Test for scalar-ness first: with the comparisons first, array-like
+    # input fails inside them and never reaches the intended message.
+    if not isscalar(nt) or (floor(nt) != nt) or (nt <= 0):
+        raise ValueError("Argument must be positive scalar integer.")
+    # Sine zeros first, cosine zeros second, matching the documented order.
+    return _specfun.fcszo(2, nt), _specfun.fcszo(1, nt)
+
+
+def assoc_laguerre(x, n, k=0.0):
+    """Compute the generalized (associated) Laguerre polynomial of degree n and order k.
+
+    The polynomial :math:`L^{(k)}_n(x)` is orthogonal over ``[0, inf)``,
+    with weighting function ``exp(-x) * x**k`` with ``k > -1``.
+
+    Parameters
+    ----------
+    x : float or ndarray
+        Points where to evaluate the Laguerre polynomial
+    n : int
+        Degree of the Laguerre polynomial
+    k : float, optional
+        Order of the Laguerre polynomial. Defaults to ``0.0``.
+
+    Returns
+    -------
+    assoc_laguerre: float or ndarray
+        Associated Laguerre polynomial values
+
+    Notes
+    -----
+    `assoc_laguerre` is a simple wrapper around `eval_genlaguerre`, with
+    reversed argument order ``(x, n, k=0.0) --> (n, k, x)``.
+
+    """
+    # Only the argument order changes; all the work is delegated.
+    return _ufuncs.eval_genlaguerre(n, k, x)
+
+
+# `digamma` is a second name bound to the same object as `psi`.
+digamma = psi
+
+
+def polygamma(n, x):
+    r"""Polygamma functions.
+
+    Defined as :math:`\psi^{(n)}(x)` where :math:`\psi` is the
+    `digamma` function. See [dlmf]_ for details.
+
+    Parameters
+    ----------
+    n : array_like
+        The order of the derivative of the digamma function; must be
+        integral
+    x : array_like
+        Real valued input
+
+    Returns
+    -------
+    ndarray
+        Function results
+
+    See Also
+    --------
+    digamma
+
+    Notes
+    -----
+    Where ``n == 0`` the result is taken from `psi`; elsewhere it is
+    ``(-1)**(n+1) * gamma(n+1) * zeta(n+1, x)``.
+
+    References
+    ----------
+    .. [dlmf] NIST, Digital Library of Mathematical Functions,
+        https://dlmf.nist.gov/5.15
+
+    Examples
+    --------
+    >>> from scipy import special
+    >>> x = [2, 3, 25.5]
+    >>> special.polygamma(1, x)
+    array([0.64493407, 0.39493407, 0.03999467])
+    >>> special.polygamma(0, x) == special.psi(x)
+    array([ True,  True,  True])
+
+    """
+    order = asarray(n)
+    arg = asarray(x)
+    sign = (-1.0)**(order + 1)
+    higher = sign * gamma(order + 1.0) * zeta(order + 1, arg)
+    # Order zero is the digamma function itself, so those entries are
+    # taken from `psi` instead of the zeta-based expression.
+    return where(order == 0, psi(arg), higher)
+
+
+def mathieu_even_coef(m, q):
+    r"""Fourier coefficients for even Mathieu and modified Mathieu functions.
+
+    The Fourier series of the even solutions of the Mathieu differential
+    equation are of the form
+
+    .. math:: \mathrm{ce}_{2n}(z, q) = \sum_{k=0}^{\infty} A_{(2n)}^{(2k)} \cos 2kz
+
+    .. math:: \mathrm{ce}_{2n+1}(z, q) =
+              \sum_{k=0}^{\infty} A_{(2n+1)}^{(2k+1)} \cos (2k+1)z
+
+    This function returns the coefficients :math:`A_{(2n)}^{(2k)}` for even
+    input m=2n, and the coefficients :math:`A_{(2n+1)}^{(2k+1)}` for odd input
+    m=2n+1.
+
+    Parameters
+    ----------
+    m : int
+        Order of Mathieu functions.  Must be non-negative.
+    q : float (>=0)
+        Parameter of Mathieu functions.  Must be non-negative.
+
+    Returns
+    -------
+    Ak : ndarray
+        Even or odd Fourier coefficients, corresponding to even or odd m.
+
+    Raises
+    ------
+    ValueError
+        If `m` or `q` is not a scalar, if `q` is negative, or if `m` is
+        not a non-negative integer.
+
+    Warns
+    -----
+    RuntimeWarning
+        If the predicted number of coefficients exceeds 251.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+    .. [2] NIST Digital Library of Mathematical Functions
+           https://dlmf.nist.gov/28.4#i
+
+    """
+    if not isscalar(m) or not isscalar(q):
+        raise ValueError("m and q must be scalars.")
+    if q < 0:
+        raise ValueError("q >=0")
+    if (m != floor(m)) or (m < 0):
+        raise ValueError("m must be an integer >=0.")
+
+    # Predicted number of significant coefficients, used to trim the result.
+    root_q = sqrt(q)
+    if q <= 1:
+        qm = 7.5 + 56.1*root_q - 134.7*q + 90.7*root_q*q
+    else:
+        qm = 17.0 + 3.1*root_q - .126*q + .0037*root_q*q
+    km = int(qm + 0.5*m)
+    if km > 251:
+        warnings.warn("Too many predicted coefficients.", RuntimeWarning, stacklevel=2)
+
+    m = int(floor(m))
+    # Code passed to `fcoef`: 2 for odd m, 1 for even m.
+    kd = 2 if m % 2 else 1
+
+    fc = _specfun.fcoef(kd, m, q, mathieu_a(m, q))
+    return fc[:km]
+
+
+def mathieu_odd_coef(m, q):
+    r"""Fourier coefficients for odd Mathieu and modified Mathieu functions.
+
+    The Fourier series of the odd solutions of the Mathieu differential
+    equation are of the form
+
+    .. math:: \mathrm{se}_{2n+1}(z, q) =
+              \sum_{k=0}^{\infty} B_{(2n+1)}^{(2k+1)} \sin (2k+1)z
+
+    .. math:: \mathrm{se}_{2n+2}(z, q) =
+              \sum_{k=0}^{\infty} B_{(2n+2)}^{(2k+2)} \sin (2k+2)z
+
+    This function returns the coefficients :math:`B_{(2n+2)}^{(2k+2)}` for even
+    input m=2n+2, and the coefficients :math:`B_{(2n+1)}^{(2k+1)}` for odd
+    input m=2n+1.
+
+    Parameters
+    ----------
+    m : int
+        Order of Mathieu functions.  Must be positive.
+    q : float (>=0)
+        Parameter of Mathieu functions.  Must be non-negative.
+
+    Returns
+    -------
+    Bk : ndarray
+        Even or odd Fourier coefficients, corresponding to even or odd m.
+
+    Raises
+    ------
+    ValueError
+        If `m` or `q` is not a scalar, if `q` is negative, or if `m` is
+        not a positive integer.
+
+    Warns
+    -----
+    RuntimeWarning
+        If the predicted number of coefficients exceeds 251.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    """
+    if not isscalar(m) or not isscalar(q):
+        raise ValueError("m and q must be scalars.")
+    if q < 0:
+        raise ValueError("q >=0")
+    if (m != floor(m)) or (m <= 0):
+        raise ValueError("m must be an integer > 0")
+
+    # Predicted number of significant coefficients, used to trim the result.
+    root_q = sqrt(q)
+    if q <= 1:
+        qm = 7.5 + 56.1*root_q - 134.7*q + 90.7*root_q*q
+    else:
+        qm = 17.0 + 3.1*root_q - .126*q + .0037*root_q*q
+    km = int(qm + 0.5*m)
+    if km > 251:
+        warnings.warn("Too many predicted coefficients.", RuntimeWarning, stacklevel=2)
+
+    m = int(floor(m))
+    # Code passed to `fcoef`: 3 for odd m, 4 for even m.
+    kd = 3 if m % 2 else 4
+
+    fc = _specfun.fcoef(kd, m, q, mathieu_b(m, q))
+    return fc[:km]
+
+
+def lqmn(m, n, z):
+    """Sequence of associated Legendre functions of the second kind.
+
+    Computes the associated Legendre function of the second kind of order m and
+    degree n, ``Qmn(z)`` = :math:`Q_n^m(z)`, and its derivative, ``Qmn'(z)``.
+    Returns two arrays of size ``(m+1, n+1)`` containing ``Qmn(z)`` and
+    ``Qmn'(z)`` for all orders from ``0..m`` and degrees from ``0..n``.
+
+    Parameters
+    ----------
+    m : int
+       where ``m >= 0``; the order of the Legendre function.
+    n : int
+       where ``n >= 0``; the degree of the Legendre function.  Often
+       called ``l`` (lower case L) in descriptions of the associated
+       Legendre function
+    z : array_like, complex
+        Input value.
+
+    Returns
+    -------
+    Qmn_z : (m+1, n+1) array
+       Values for all orders 0..m and degrees 0..n
+    Qmn_d_z : (m+1, n+1) array
+       Derivatives for all orders 0..m and degrees 0..n
+
+    Raises
+    ------
+    ValueError
+        If `m` or `n` is not a scalar or is negative.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    """
+    if not isscalar(m) or (m < 0):
+        raise ValueError("m must be a non-negative integer.")
+    if not isscalar(n) or (n < 0):
+        raise ValueError("n must be a non-negative integer.")
+
+    m, n = int(m), int(n)  # Convert to int to maintain backwards compatibility.
+    # The work arrays get at least two rows and two columns, even for
+    # m == 0 or n == 0; the surplus is trimmed on return.
+    rows = max(1, m) + 1
+    cols = max(1, n) + 1
+
+    z = np.asarray(z)
+    if not np.issubdtype(z.dtype, np.inexact):
+        z = z.astype(np.float64)
+
+    out_dtype = np.complex128 if np.iscomplexobj(z) else np.float64
+    q = np.empty((rows, cols) + z.shape, dtype=out_dtype)
+    qd = np.empty_like(q)
+
+    if z.ndim == 0:
+        targets = (q, qd)
+    else:
+        # new axes must be last for the ufunc
+        targets = (np.moveaxis(q, (0, 1), (-2, -1)),
+                   np.moveaxis(qd, (0, 1), (-2, -1)))
+    _lqmn(z, out=targets)
+
+    return q[:m + 1, :n + 1], qd[:m + 1, :n + 1]
+
+
+def bernoulli(n):
+    """Bernoulli numbers B0..Bn (inclusive).
+
+    Parameters
+    ----------
+    n : int
+        Highest index of the Bernoulli numbers to generate. Must be
+        non-negative.
+
+    Returns
+    -------
+    ndarray
+        The Bernoulli numbers ``[B(0), B(1), ..., B(n)]``.
+
+    Raises
+    ------
+    ValueError
+        If `n` is not a scalar or is negative.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+    .. [2] "Bernoulli number", Wikipedia, https://en.wikipedia.org/wiki/Bernoulli_number
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.special import bernoulli, zeta
+    >>> bernoulli(4)
+    array([ 1.        , -0.5       ,  0.16666667,  0.        , -0.03333333])
+
+    The Wikipedia article ([2]_) points out the relationship between the
+    Bernoulli numbers and the zeta function, ``B_n^+ = -n * zeta(1 - n)``
+    for ``n > 0``:
+
+    >>> n = np.arange(1, 5)
+    >>> -n * zeta(1 - n)
+    array([ 0.5       ,  0.16666667, -0.        , -0.03333333])
+
+    Note that, in the notation used in the Wikipedia article,
+    `bernoulli` computes ``B_n^-`` (i.e. it uses the convention that
+    ``B_1`` is -1/2).  The relation given above is for ``B_n^+``, so the
+    sign of 0.5 does not match the output of ``bernoulli(4)``.
+
+    """
+    if not isscalar(n) or (n < 0):
+        raise ValueError("n must be a non-negative integer.")
+    n = int(n)
+    # Always request at least index 2 from the routine, then trim the
+    # result to the first n + 1 entries.
+    computed = _specfun.bernob(max(n, 2))
+    return computed[:n + 1]
+
+
+def euler(n):
+    """Euler numbers E(0), E(1), ..., E(n).
+
+    The Euler numbers [1]_ are also known as the secant numbers.
+
+    Because ``euler(n)`` returns floating point values, it does not give
+    exact values for large `n`.  The first inexact value is E(22).
+
+    Parameters
+    ----------
+    n : int
+        The highest index of the Euler number to be returned. Must be
+        non-negative.
+
+    Returns
+    -------
+    ndarray
+        The Euler numbers [E(0), E(1), ..., E(n)].
+        The odd Euler numbers, which are all zero, are included.
+
+    Raises
+    ------
+    ValueError
+        If `n` is not a scalar or is negative.
+
+    References
+    ----------
+    .. [1] Sequence A122045, The On-Line Encyclopedia of Integer Sequences,
+           https://oeis.org/A122045
+    .. [2] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.special import euler
+    >>> euler(6)
+    array([  1.,   0.,  -1.,   0.,   5.,   0., -61.])
+
+    >>> euler(13).astype(np.int64)
+    array([      1,       0,      -1,       0,       5,       0,     -61,
+                 0,    1385,       0,  -50521,       0, 2702765,       0])
+
+    >>> euler(22)[-1]  # Exact value of E(22) is -69348874393137901.
+    -69348874393137976.0
+
+    """
+    if not isscalar(n) or (n < 0):
+        raise ValueError("n must be a non-negative integer.")
+    n = int(n)
+    # Always request at least index 2 from the routine, then trim the
+    # result to the first n + 1 entries.
+    computed = _specfun.eulerb(max(n, 2))
+    return computed[:n + 1]
+
+
+def lqn(n, z):
+    """Legendre function of the second kind.
+
+    Compute sequence of Legendre functions of the second kind, Qn(z) and
+    derivatives for all degrees from 0 to n (inclusive).
+
+    Parameters
+    ----------
+    n : int
+        Highest degree to compute.
+    z : array_like
+        Point(s) at which to evaluate. Input that is neither floating
+        point nor complex is converted to float.
+
+    Returns
+    -------
+    qn : ndarray
+        Values for degrees ``0..n`` along the first axis, followed by the
+        axes of `z`. Complex if `z` is complex, otherwise float64.
+    qd : ndarray
+        Derivatives, laid out like `qn`.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    """
+    n = _nonneg_int_or_fail(n, 'n', strict=False)
+    # The work arrays hold at least two degrees, even for n == 0; the
+    # surplus is trimmed on return.
+    n1 = max(n, 1)
+
+    z = np.asarray(z)
+    if not np.issubdtype(z.dtype, np.inexact):
+        z = z.astype(float)
+
+    out_dtype = np.complex128 if np.iscomplexobj(z) else np.float64
+    qn = np.empty((n1 + 1,) + z.shape, dtype=out_dtype)
+    qd = np.empty_like(qn)
+
+    if z.ndim == 0:
+        targets = (qn, qd)
+    else:
+        # new axes must be last for the ufunc
+        targets = (np.moveaxis(qn, 0, -1), np.moveaxis(qd, 0, -1))
+    _lqn(z, out=targets)
+
+    return qn[:n + 1], qd[:n + 1]
+
+
+def ai_zeros(nt):
+    """
+    Compute `nt` zeros and values of the Airy function Ai and its derivative.
+
+    Computes the first `nt` zeros, `a`, of the Airy function Ai(x);
+    first `nt` zeros, `ap`, of the derivative of the Airy function Ai'(x);
+    the corresponding values Ai(a');
+    and the corresponding values Ai'(a).
+
+    Parameters
+    ----------
+    nt : int
+        Number of zeros to compute. Must be a positive integer scalar.
+
+    Returns
+    -------
+    a : ndarray
+        First `nt` zeros of Ai(x)
+    ap : ndarray
+        First `nt` zeros of Ai'(x)
+    ai : ndarray
+        Values of Ai(x) evaluated at first `nt` zeros of Ai'(x)
+    aip : ndarray
+        Values of Ai'(x) evaluated at first `nt` zeros of Ai(x)
+
+    Raises
+    ------
+    ValueError
+        If `nt` is not a positive integer scalar.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    Examples
+    --------
+    >>> from scipy import special
+    >>> a, ap, ai, aip = special.ai_zeros(3)
+    >>> a
+    array([-2.33810741, -4.08794944, -5.52055983])
+    >>> ap
+    array([-1.01879297, -3.24819758, -4.82009921])
+    >>> ai
+    array([ 0.53565666, -0.41901548,  0.38040647])
+    >>> aip
+    array([ 0.70121082, -0.80311137,  0.86520403])
+
+    """
+    if not (isscalar(nt) and floor(nt) == nt and nt > 0):
+        raise ValueError("nt must be a positive integer scalar.")
+    # `bi_zeros` calls the same routine with 2 as the second argument.
+    return _specfun.airyzo(nt, 1)
+
+
+def bi_zeros(nt):
+    """
+    Compute `nt` zeros and values of the Airy function Bi and its derivative.
+
+    Computes the first `nt` zeros, b, of the Airy function Bi(x);
+    first `nt` zeros, b', of the derivative of the Airy function Bi'(x);
+    the corresponding values Bi(b');
+    and the corresponding values Bi'(b).
+
+    Parameters
+    ----------
+    nt : int
+        Number of zeros to compute. Must be a positive integer scalar.
+
+    Returns
+    -------
+    b : ndarray
+        First `nt` zeros of Bi(x)
+    bp : ndarray
+        First `nt` zeros of Bi'(x)
+    bi : ndarray
+        Values of Bi(x) evaluated at first `nt` zeros of Bi'(x)
+    bip : ndarray
+        Values of Bi'(x) evaluated at first `nt` zeros of Bi(x)
+
+    Raises
+    ------
+    ValueError
+        If `nt` is not a positive integer scalar.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    Examples
+    --------
+    >>> from scipy import special
+    >>> b, bp, bi, bip = special.bi_zeros(3)
+    >>> b
+    array([-1.17371322, -3.2710933 , -4.83073784])
+    >>> bp
+    array([-2.29443968, -4.07315509, -5.51239573])
+    >>> bi
+    array([-0.45494438,  0.39652284, -0.36796916])
+    >>> bip
+    array([ 0.60195789, -0.76031014,  0.83699101])
+
+    """
+    if not (isscalar(nt) and floor(nt) == nt and nt > 0):
+        raise ValueError("nt must be a positive integer scalar.")
+    # `ai_zeros` calls the same routine with 1 as the second argument.
+    return _specfun.airyzo(nt, 2)
+
+
+def lmbda(v, x):
+    r"""Jahnke-Emden Lambda function, Lambdav(x).
+
+    This function is defined as [2]_,
+
+    .. math:: \Lambda_v(x) = \Gamma(v+1) \frac{J_v(x)}{(x/2)^v},
+
+    where :math:`\Gamma` is the gamma function and :math:`J_v` is the
+    Bessel function of the first kind.
+
+    Parameters
+    ----------
+    v : float
+        Order of the Lambda function. Must be non-negative.
+    x : float
+        Value at which to evaluate the function and derivatives
+
+    Returns
+    -------
+    vl : ndarray
+        Values of Lambda_vi(x), for vi=v-int(v), vi=1+v-int(v), ..., vi=v.
+    dl : ndarray
+        Derivatives Lambda_vi'(x), for vi=v-int(v), vi=1+v-int(v), ..., vi=v.
+
+    Raises
+    ------
+    ValueError
+        If `v` or `x` is not a scalar, or if `v` is negative.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+    .. [2] Jahnke, E. and Emde, F. "Tables of Functions with Formulae and
+           Curves" (4th ed.), Dover, 1945
+    """
+    if not (isscalar(v) and isscalar(x)):
+        raise ValueError("arguments must be scalars.")
+    if (v < 0):
+        # v == 0 passes this check, so the message says ">= 0".
+        raise ValueError("argument must be >= 0.")
+    n = int(v)
+    v0 = v - n
+    # The routines are always asked for at least order 1 + v0; the result
+    # is trimmed to n + 1 entries on return.
+    n1 = max(n, 1)
+    v1 = n1 + v0
+    # Non-integer orders go through `lamv`, integer orders through `lamn`.
+    if (v != floor(v)):
+        _, vl, dl = _specfun.lamv(v1, x)
+    else:
+        _, vl, dl = _specfun.lamn(v1, x)
+    return vl[:(n+1)], dl[:(n+1)]
+
+
+def pbdv_seq(v, x):
+    """Parabolic cylinder functions Dv(x) and derivatives.
+
+    Parameters
+    ----------
+    v : float
+        Order of the parabolic cylinder function
+    x : float
+        Value at which to evaluate the function and derivatives
+
+    Returns
+    -------
+    dv : ndarray
+        Values of D_vi(x), for vi=v-int(v), vi=1+v-int(v), ..., vi=v.
+    dp : ndarray
+        Derivatives D_vi'(x), for vi=v-int(v), vi=1+v-int(v), ..., vi=v.
+
+    Raises
+    ------
+    ValueError
+        If `v` or `x` is not a scalar.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996, chapter 13.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    """
+    if not isscalar(v) or not isscalar(x):
+        raise ValueError("arguments must be scalars.")
+    whole = int(v)
+    frac = v - whole
+    # The routine is always asked for an order of at least 1 + frac.
+    n1 = max(whole, 1)
+    dv, dp, _pdf, _pdd = _specfun.pbdv(n1 + frac, x)
+    return dv[:n1 + 1], dp[:n1 + 1]
+
+
+def pbvv_seq(v, x):
+    """Parabolic cylinder functions Vv(x) and derivatives.
+
+    Parameters
+    ----------
+    v : float
+        Order of the parabolic cylinder function
+    x : float
+        Value at which to evaluate the function and derivatives
+
+    Returns
+    -------
+    dv : ndarray
+        Values of V_vi(x), for vi=v-int(v), vi=1+v-int(v), ..., vi=v.
+    dp : ndarray
+        Derivatives V_vi'(x), for vi=v-int(v), vi=1+v-int(v), ..., vi=v.
+
+    Raises
+    ------
+    ValueError
+        If `v` or `x` is not a scalar.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996, chapter 13.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    """
+    if not isscalar(v) or not isscalar(x):
+        raise ValueError("arguments must be scalars.")
+    whole = int(v)
+    frac = v - whole
+    # The routine is always asked for an order of at least 1 + frac.
+    n1 = max(whole, 1)
+    dv, dp, _pdf, _pdd = _specfun.pbvv(n1 + frac, x)
+    return dv[:n1 + 1], dp[:n1 + 1]
+
+
+def pbdn_seq(n, z):
+    """Parabolic cylinder functions Dn(z) and derivatives.
+
+    Parameters
+    ----------
+    n : int
+        Order of the parabolic cylinder function. Integer-valued floats
+        such as ``3.0`` are accepted and converted to `int`.
+    z : complex
+        Value at which to evaluate the function and derivatives
+
+    Returns
+    -------
+    dv : ndarray
+        Values of D_i(z), for i=0, ..., i=n.
+    dp : ndarray
+        Derivatives D_i'(z), for i=0, ..., i=n.
+
+    Raises
+    ------
+    ValueError
+        If `n` or `z` is not a scalar, or if `n` is not integer-valued.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996, chapter 13.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    """
+    if not (isscalar(n) and isscalar(z)):
+        raise ValueError("arguments must be scalars.")
+    if (floor(n) != n):
+        raise ValueError("n must be an integer.")
+    # The check above also admits integer-valued floats; convert so the
+    # value is usable as a slice bound below (a float bound is a TypeError).
+    n = int(n)
+    if (abs(n) <= 1):
+        n1 = 1
+    else:
+        n1 = n
+    cpb, cpd = _specfun.cpbdn(n1, z)
+    return cpb[:n1+1], cpd[:n1+1]
+
+
+def ber_zeros(nt):
+    """Compute nt zeros of the Kelvin function ber.
+
+    Parameters
+    ----------
+    nt : int
+        Number of zeros to compute. Must be positive.
+
+    Returns
+    -------
+    ndarray
+        First `nt` zeros of the Kelvin function.
+
+    Raises
+    ------
+    ValueError
+        If `nt` is not a positive integer scalar.
+
+    See Also
+    --------
+    ber
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    """
+    is_positive_int = isscalar(nt) and floor(nt) == nt and nt > 0
+    if not is_positive_int:
+        raise ValueError("nt must be positive integer scalar.")
+    # Second argument 1 presumably selects ber in `klvnzo` -- confirm there.
+    return _specfun.klvnzo(nt, 1)
+
+
+def bei_zeros(nt):
+    """Compute nt zeros of the Kelvin function bei.
+
+    Parameters
+    ----------
+    nt : int
+        Number of zeros to compute. Must be positive.
+
+    Returns
+    -------
+    ndarray
+        First `nt` zeros of the Kelvin function.
+
+    Raises
+    ------
+    ValueError
+        If `nt` is not a positive integer scalar.
+
+    See Also
+    --------
+    bei
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    """
+    is_positive_int = isscalar(nt) and floor(nt) == nt and nt > 0
+    if not is_positive_int:
+        raise ValueError("nt must be positive integer scalar.")
+    # Second argument 2 presumably selects bei in `klvnzo` -- confirm there.
+    return _specfun.klvnzo(nt, 2)
+
+
+def ker_zeros(nt):
+    """Compute nt zeros of the Kelvin function ker.
+
+    Parameters
+    ----------
+    nt : int
+        Number of zeros to compute. Must be positive.
+
+    Returns
+    -------
+    ndarray
+        First `nt` zeros of the Kelvin function.
+
+    Raises
+    ------
+    ValueError
+        If `nt` is not a positive integer scalar.
+
+    See Also
+    --------
+    ker
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    """
+    is_positive_int = isscalar(nt) and floor(nt) == nt and nt > 0
+    if not is_positive_int:
+        raise ValueError("nt must be positive integer scalar.")
+    # Second argument 3 presumably selects ker in `klvnzo` -- confirm there.
+    return _specfun.klvnzo(nt, 3)
+
+
+def kei_zeros(nt):
+    """Compute nt zeros of the Kelvin function kei.
+
+    Parameters
+    ----------
+    nt : int
+        Number of zeros to compute. Must be positive.
+
+    Returns
+    -------
+    ndarray
+        First `nt` zeros of the Kelvin function.
+
+    Raises
+    ------
+    ValueError
+        If `nt` is not a positive integer scalar.
+
+    See Also
+    --------
+    kei
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    """
+    is_positive_int = isscalar(nt) and floor(nt) == nt and nt > 0
+    if not is_positive_int:
+        raise ValueError("nt must be positive integer scalar.")
+    # Second argument 4 presumably selects kei in `klvnzo` -- confirm there.
+    return _specfun.klvnzo(nt, 4)
+
+
+def berp_zeros(nt):
+    """Compute nt zeros of the derivative of the Kelvin function ber.
+
+    Parameters
+    ----------
+    nt : int
+        Number of zeros to compute. Must be positive.
+
+    Returns
+    -------
+    ndarray
+        First `nt` zeros of the derivative of the Kelvin function ber.
+
+    See Also
+    --------
+    ber, berp
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+
+    Examples
+    --------
+    Compute the first 5 zeros of the derivative of the Kelvin function ber.
+
+    >>> from scipy.special import berp_zeros
+    >>> berp_zeros(5)
+    array([ 6.03871081, 10.51364251, 14.96844542, 19.41757493, 23.86430432])
+
+    """
+    if not isscalar(nt) or (floor(nt) != nt) or (nt <= 0):
+        raise ValueError("nt must be positive integer scalar.")
+    return _specfun.klvnzo(nt, 5)
+
+
+def beip_zeros(nt):
+    """Compute nt zeros of the derivative of the Kelvin function bei.
+
+    Parameters
+    ----------
+    nt : int
+        Number of zeros to compute. Must be positive.
+
+    Returns
+    -------
+    ndarray
+        First `nt` zeros of the derivative of the Kelvin function bei.
+
+    See Also
+    --------
+    bei, beip
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    """
+    if not isscalar(nt) or (floor(nt) != nt) or (nt <= 0):
+        raise ValueError("nt must be positive integer scalar.")
+    return _specfun.klvnzo(nt, 6)
+
+
+def kerp_zeros(nt):
+    """Compute nt zeros of the derivative of the Kelvin function ker.
+
+    Parameters
+    ----------
+    nt : int
+        Number of zeros to compute. Must be positive.
+
+    Returns
+    -------
+    ndarray
+        First `nt` zeros of the derivative of the Kelvin function ker.
+
+    See Also
+    --------
+    ker, kerp
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    """
+    if not isscalar(nt) or (floor(nt) != nt) or (nt <= 0):
+        raise ValueError("nt must be positive integer scalar.")
+    return _specfun.klvnzo(nt, 7)
+
+
+def keip_zeros(nt):
+    """Compute nt zeros of the derivative of the Kelvin function kei.
+
+    Parameters
+    ----------
+    nt : int
+        Number of zeros to compute. Must be positive.
+
+    Returns
+    -------
+    ndarray
+        First `nt` zeros of the derivative of the Kelvin function kei.
+
+    See Also
+    --------
+    kei, keip
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    """
+    if not isscalar(nt) or (floor(nt) != nt) or (nt <= 0):
+        raise ValueError("nt must be positive integer scalar.")
+    return _specfun.klvnzo(nt, 8)
+
+
+def kelvin_zeros(nt):
+    """Compute nt zeros of all Kelvin functions.
+
+    Returned in a length-8 tuple of arrays of length nt.  The tuple contains
+    the arrays of zeros of (ber, bei, ker, kei, ber', bei', ker', kei').
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    """
+    if not isscalar(nt) or (floor(nt) != nt) or (nt <= 0):
+        raise ValueError("nt must be positive integer scalar.")
+    return (_specfun.klvnzo(nt, 1),
+            _specfun.klvnzo(nt, 2),
+            _specfun.klvnzo(nt, 3),
+            _specfun.klvnzo(nt, 4),
+            _specfun.klvnzo(nt, 5),
+            _specfun.klvnzo(nt, 6),
+            _specfun.klvnzo(nt, 7),
+            _specfun.klvnzo(nt, 8))
+
+
+def pro_cv_seq(m, n, c):
+    """Characteristic values for prolate spheroidal wave functions.
+
+    Compute a sequence of characteristic values for the prolate
+    spheroidal wave functions for mode m and n'=m..n and spheroidal
+    parameter c.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    """
+    if not (isscalar(m) and isscalar(n) and isscalar(c)):
+        raise ValueError("Arguments must be scalars.")
+    if (n != floor(n)) or (m != floor(m)):
+        raise ValueError("Modes must be integers.")
+    if (n-m > 199):
+        raise ValueError("Difference between n and m is too large.")
+    maxL = n-m+1
+    return _specfun.segv(m, n, c, 1)[1][:maxL]
+
+
+def obl_cv_seq(m, n, c):
+    """Characteristic values for oblate spheroidal wave functions.
+
+    Compute a sequence of characteristic values for the oblate
+    spheroidal wave functions for mode m and n'=m..n and spheroidal
+    parameter c.
+
+    References
+    ----------
+    .. [1] Zhang, Shanjie and Jin, Jianming. "Computation of Special
+           Functions", John Wiley and Sons, 1996.
+           https://people.sc.fsu.edu/~jburkardt/f77_src/special_functions/special_functions.html
+
+    """
+    if not (isscalar(m) and isscalar(n) and isscalar(c)):
+        raise ValueError("Arguments must be scalars.")
+    if (n != floor(n)) or (m != floor(m)):
+        raise ValueError("Modes must be integers.")
+    if (n-m > 199):
+        raise ValueError("Difference between n and m is too large.")
+    maxL = n-m+1
+    return _specfun.segv(m, n, c, -1)[1][:maxL]
+
+
+def comb(N, k, *, exact=False, repetition=False):
+    """The number of combinations of N things taken k at a time.
+
+    This is often expressed as "N choose k".
+
+    Parameters
+    ----------
+    N : int, ndarray
+        Number of things.
+    k : int, ndarray
+        Number of elements taken.
+    exact : bool, optional
+        For integers, if `exact` is False, then floating point precision is
+        used, otherwise the result is computed exactly.
+    repetition : bool, optional
+        If `repetition` is True, then the number of combinations with
+        repetition is computed.
+
+    Returns
+    -------
+    val : int, float, ndarray
+        The total number of combinations.
+
+    See Also
+    --------
+    binom : Binomial coefficient considered as a function of two real
+            variables.
+
+    Notes
+    -----
+    - Array arguments accepted only for exact=False case.
+    - If N < 0, or k < 0, then 0 is returned.
+    - If k > N and repetition=False, then 0 is returned.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.special import comb
+    >>> k = np.array([3, 4])
+    >>> n = np.array([10, 10])
+    >>> comb(n, k, exact=False)
+    array([ 120.,  210.])
+    >>> comb(10, 3, exact=True)
+    120
+    >>> comb(10, 3, exact=True, repetition=True)
+    220
+
+    """
+    if repetition:
+        # Special case: C(n, 0) with repetition = 1 for n >= 0
+        # Without this check, comb(0, 0, repetition=True) would compute
+        # comb(-1, 0) which incorrectly returns 0
+        if exact:
+            if k == 0 and int(N) == N and N >= 0:
+                return 1
+        else:
+            k, N = asarray(k), asarray(N)
+            cond = (k == 0) & (N >= 0)
+            vals = binom(N + k - 1, k)
+            if isinstance(vals, np.ndarray):
+                vals[cond] = 1.0
+            elif cond:
+                vals = np.float64(1.0)
+            return vals
+        return comb(N + k - 1, k, exact=exact)
+    if exact:
+        if int(N) == N and int(k) == k:
+            # _comb_int casts inputs to integers, which is safe & intended here
+            return _comb_int(N, k)
+        else:
+            raise ValueError("Non-integer `N` and `k` with `exact=True` is not "
+                             "supported.")
+    else:
+        k, N = asarray(k), asarray(N)
+        cond = (k <= N) & (N >= 0) & (k >= 0)
+        vals = binom(N, k)
+        if isinstance(vals, np.ndarray):
+            vals[~cond] = 0
+        elif not cond:
+            vals = np.float64(0)
+        return vals
+
+
+def perm(N, k, exact=False):
+    """Permutations of N things taken k at a time, i.e., k-permutations of N.
+
+    It's also known as "partial permutations".
+
+    Parameters
+    ----------
+    N : int, ndarray
+        Number of things.
+    k : int, ndarray
+        Number of elements taken.
+    exact : bool, optional
+        If ``True``, calculate the answer exactly using long integer arithmetic (`N`
+        and `k` must be scalar integers). If ``False``, a floating point approximation
+        is calculated (more rapidly) using `poch`. Default is ``False``.
+
+    Returns
+    -------
+    val : int, ndarray
+        The number of k-permutations of N.
+
+    Notes
+    -----
+    - Array arguments accepted only for exact=False case.
+    - If k > N, N < 0, or k < 0, then a 0 is returned.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.special import perm
+    >>> k = np.array([3, 4])
+    >>> n = np.array([10, 10])
+    >>> perm(n, k)
+    array([  720.,  5040.])
+    >>> perm(10, 3, exact=True)
+    720
+
+    """
+    if exact:
+        N = np.squeeze(N)[()]  # for backward compatibility (accepted size 1 arrays)
+        k = np.squeeze(k)[()]
+        if not (isscalar(N) and isscalar(k)):
+            raise ValueError("`N` and `k` must be scalar integers with `exact=True`.")
+
+        floor_N, floor_k = int(N), int(k)
+        non_integral = not (floor_N == N and floor_k == k)
+        if non_integral:
+            raise ValueError("Non-integer `N` and `k` with `exact=True` is not "
+                             "supported.")
+
+        if (k > N) or (N < 0) or (k < 0):
+            return 0
+
+        val = 1
+        for i in range(floor_N - floor_k + 1, floor_N + 1):
+            val *= i
+        return val
+    else:
+        k, N = asarray(k), asarray(N)
+        cond = (k <= N) & (N >= 0) & (k >= 0)
+        vals = poch(N - k + 1, k)
+        if isinstance(vals, np.ndarray):
+            vals[~cond] = 0
+        elif not cond:
+            vals = np.float64(0)
+        return vals
+
+
+# https://stackoverflow.com/a/16327037
+def _range_prod(lo, hi, k=1):
+    """
+    Product of a range of numbers spaced k apart (from hi).
+
+    For k=1, this returns the product of
+    lo * (lo+1) * (lo+2) * ... * (hi-2) * (hi-1) * hi
+    = hi! / (lo-1)!
+
+    For k>1, it corresponds to taking only every k'th number when
+    counting down from hi - e.g. 18!!!! = _range_prod(1, 18, 4).
+
+    Breaks into smaller products first for speed:
+    _range_prod(2, 9) = ((2*3)*(4*5))*((6*7)*(8*9))
+    """
+    if lo == 1 and k == 1:
+        return math.factorial(hi)
+
+    if lo + k < hi:
+        mid = (hi + lo) // 2
+        if k > 1:
+            # make sure mid is a multiple of k away from hi
+            mid = mid - ((mid - hi) % k)
+        return _range_prod(lo, mid, k) * _range_prod(mid + k, hi, k)
+    elif lo + k == hi:
+        return lo * hi
+    else:
+        return hi
+
+
+def _factorialx_array_exact(n, k=1):
+    """
+    Exact computation of factorial for an array.
+
+    The factorials are computed in incremental fashion, by taking
+    the sorted unique values of n and multiplying the intervening
+    numbers between the different unique values.
+
+    In other words, the factorial for the largest input is only
+    computed once, with each other result computed in the process.
+
+    k > 1 corresponds to the multifactorial.
+    """
+    un = np.unique(n)
+
+    # Convert to object array if np.int64 can't handle size
+    if k in _FACTORIALK_LIMITS_64BITS.keys():
+        if un[-1] > _FACTORIALK_LIMITS_64BITS[k]:
+            # e.g. k=1: 21! > np.iinfo(np.int64).max
+            dt = object
+        elif un[-1] > _FACTORIALK_LIMITS_32BITS[k]:
+            # e.g. k=3: 26!!! > np.iinfo(np.int32).max
+            dt = np.int64
+        else:
+            dt = np.dtype("long")
+    else:
+        # for k >= 10, we always use object
+        dt = object
+
+    out = np.empty_like(n, dtype=dt)
+
+    # Handle invalid/trivial values
+    un = un[un > 1]
+    out[n < 2] = 1
+    out[n < 0] = 0
+
+    # Calculate products of each range of numbers
+    # we can only multiply incrementally if the values are k apart;
+    # therefore we partition `un` into "lanes", i.e. its residues modulo k
+    for lane in range(0, k):
+        ul = un[(un % k) == lane] if k > 1 else un
+        if ul.size:
+            # after np.unique, un resp. ul are sorted, ul[0] is the smallest;
+            # cast to python ints to avoid overflow with np.int-types
+            val = _range_prod(1, int(ul[0]), k=k)
+            out[n == ul[0]] = val
+            for i in range(len(ul) - 1):
+                # by the filtering above, we have ensured that prev & current
+                # are a multiple of k apart
+                prev = ul[i]
+                current = ul[i + 1]
+                # we already multiplied all factors until prev; continue
+                # building the full factorial from the following (`prev + 1`);
+                # use int() for the same reason as above
+                val *= _range_prod(int(prev + 1), int(current), k=k)
+                out[n == current] = val
+
+    return out
+
+
+def _factorialx_array_approx(n, k, extend):
+    """
+    Calculate approximation to multifactorial for array n and integer k.
+
+    Ensure that values aren't calculated unnecessarily.
+    """
+    if extend == "complex":
+        return _factorialx_approx_core(n, k=k, extend=extend)
+
+    # at this point we are guaranteed that extend='zero' and that k>0 is an integer
+    result = zeros(n.shape)
+    # keep nans as nans
+    place(result, np.isnan(n), np.nan)
+    # only compute where n >= 0 (excludes nans), everything else is 0
+    cond = (n >= 0)
+    n_to_compute = extract(cond, n)
+    place(result, cond, _factorialx_approx_core(n_to_compute, k=k, extend=extend))
+    return result
+
+
+def _gamma1p(vals):
+    """
+    returns gamma(n+1), though with NaN at -1 instead of inf, c.f. #21827
+    """
+    res = gamma(vals + 1)
+    # replace infinities at -1 (from gamma function at 0) with nan
+    # gamma only returns inf for real inputs; can ignore complex case
+    if isinstance(res, np.ndarray):
+        if not _is_subdtype(vals.dtype, "c"):
+            res[vals == -1] = np.nan
+    elif np.isinf(res) and vals == -1:
+        res = np.float64("nan")
+    return res
+
+
+def _factorialx_approx_core(n, k, extend):
+    """
+    Core approximation to multifactorial for array n and integer k.
+    """
+    if k == 1:
+        # shortcut for k=1; same for both extensions, because we assume the
+        # handling of extend == 'zero' happens in _factorialx_array_approx
+        result = _gamma1p(n)
+        if isinstance(n, np.ndarray):
+            # gamma does not maintain 0-dim arrays; fix it
+            result = np.array(result)
+        return result
+
+    if extend == "complex":
+        # see https://numpy.org/doc/stable/reference/generated/numpy.power.html
+        p_dtype = complex if (_is_subdtype(type(k), "c") or k < 0) else None
+        with warnings.catch_warnings():
+            # do not warn about 0 * inf, nan / nan etc.; the results are correct
+            warnings.simplefilter("ignore", RuntimeWarning)
+            # don't use `(n-1)/k` in np.power; underflows if 0 is of a uintX type
+            result = np.power(k, n / k, dtype=p_dtype) * _gamma1p(n / k)
+            result *= rgamma(1 / k + 1) / np.power(k, 1 / k, dtype=p_dtype)
+        if isinstance(n, np.ndarray):
+            # ensure we keep array-ness for 0-dim inputs; already n/k above loses it
+            result = np.array(result)
+        return result
+
+    # at this point we are guaranteed that extend='zero' and that k>0 is an integer
+    n_mod_k = n % k
+    # scalar case separately, unified handling would be inefficient for arrays;
+    # don't use isscalar due to numpy/numpy#23574; 0-dim arrays treated below
+    if not isinstance(n, np.ndarray):
+        with warnings.catch_warnings():
+            # large n cause overflow warnings, but infinity is fine
+            warnings.simplefilter("ignore", RuntimeWarning)
+            return (
+                np.power(k, (n - n_mod_k) / k)
+                * gamma(n / k + 1) / gamma(n_mod_k / k + 1)
+                * max(n_mod_k, 1)
+            )
+
+    # factor that's independent of the residue class (see factorialk docstring)
+    with warnings.catch_warnings():
+        # large n cause overflow warnings, but infinity is fine
+        warnings.simplefilter("ignore", RuntimeWarning)
+        result = np.power(k, n / k) * gamma(n / k + 1)
+    # factor dependent on residue r (for `r=0` it's 1, so we skip `r=0`
+    # below and thus also avoid evaluating `max(r, 1)`)
+    def corr(k, r): return np.power(k, -r / k) / gamma(r / k + 1) * r
+    for r in np.unique(n_mod_k):
+        if r == 0:
+            continue
+        # cast to int because uint types break on `-r`
+        result[n_mod_k == r] *= corr(k, int(r))
+    return result
+
+
+def _is_subdtype(dtype, dtypes):
+    """
+    Shorthand for calculating whether dtype is subtype of some dtypes.
+
+    Also allows specifying a list instead of just a single dtype.
+
+    Additionally, the most important supertypes from
+        https://numpy.org/doc/stable/reference/arrays.scalars.html
+    can optionally be specified using abbreviations as follows:
+        "i": np.integer
+        "f": np.floating
+        "c": np.complexfloating
+        "n": np.number (contains the other three)
+    """
+    dtypes = dtypes if isinstance(dtypes, list) else [dtypes]
+    # map single character abbreviations, if they are in dtypes
+    mapping = {
+        "i": np.integer,
+        "f": np.floating,
+        "c": np.complexfloating,
+        "n": np.number
+    }
+    dtypes = [mapping.get(x, x) for x in dtypes]
+    return any(np.issubdtype(dtype, dt) for dt in dtypes)
+
+
+def _factorialx_wrapper(fname, n, k, exact, extend):
+    """
+    Shared implementation for factorial, factorial2 & factorialk.
+    """
+    if extend not in ("zero", "complex"):
+        raise ValueError(
+            f"argument `extend` must be either 'zero' or 'complex', received: {extend}"
+        )
+    if exact and extend == "complex":
+        raise ValueError("Incompatible options: `exact=True` and `extend='complex'`")
+
+    msg_unsup = (
+        "Unsupported data type for {vname} in {fname}: {dtype}\n"
+    )
+    if fname == "factorial":
+        msg_unsup += (
+            "Permitted data types are integers and floating point numbers, "
+            "as well as complex numbers if `extend='complex' is passed."
+        )
+    else:
+        msg_unsup += (
+            "Permitted data types are integers, as well as floating point "
+            "numbers and complex numbers if `extend='complex' is passed."
+        )
+    msg_exact_not_possible = (
+        "`exact=True` only supports integers, cannot use data type {dtype}"
+    )
+    msg_needs_complex = (
+        "In order to use non-integer arguments, you must opt into this by passing "
+        "`extend='complex'`. Note that this changes the result for all negative "
+        "arguments (which by default return 0)."
+    )
+
+    if fname == "factorial2":
+        msg_needs_complex += (" Additionally, it will rescale the values of the double"
+                              " factorial at even integers by a factor of sqrt(2/pi).")
+    elif fname == "factorialk":
+        msg_needs_complex += (" Additionally, it will perturb the values of the"
+                              " multifactorial at most positive integers `n`.")
+        # check type of k
+        if not _is_subdtype(type(k), ["i", "f", "c"]):
+            raise ValueError(msg_unsup.format(vname="`k`", fname=fname, dtype=type(k)))
+        elif _is_subdtype(type(k), ["f", "c"]) and extend != "complex":
+            raise ValueError(msg_needs_complex)
+        # check value of k
+        if extend == "zero" and k < 1:
+            msg = f"For `extend='zero'`, k must be a positive integer, received: {k}"
+            raise ValueError(msg)
+        elif k == 0:
+            raise ValueError("Parameter k cannot be zero!")
+
+    # factorial allows floats also for extend="zero"
+    types_requiring_complex = "c" if fname == "factorial" else ["f", "c"]
+
+    # don't use isscalar due to numpy/numpy#23574; 0-dim arrays treated below
+    if np.ndim(n) == 0 and not isinstance(n, np.ndarray):
+        # scalar cases
+        if not _is_subdtype(type(n), ["i", "f", "c", type(None)]):
+            raise ValueError(msg_unsup.format(vname="`n`", fname=fname, dtype=type(n)))
+        elif _is_subdtype(type(n), types_requiring_complex) and extend != "complex":
+            raise ValueError(msg_needs_complex)
+        elif n is None or np.isnan(n):
+            complexify = (extend == "complex") and _is_subdtype(type(n), "c")
+            return np.complex128("nan+nanj") if complexify else np.float64("nan")
+        elif extend == "zero" and n < 0:
+            return 0 if exact else np.float64(0)
+        elif n in {0, 1}:
+            return 1 if exact else np.float64(1)
+        elif exact and _is_subdtype(type(n), "i"):
+            # calculate with integers; cast away other int types (like unsigned)
+            return _range_prod(1, int(n), k=k)
+        elif exact:
+            # only relevant for factorial
+            raise ValueError(msg_exact_not_possible.format(dtype=type(n)))
+        # approximation
+        return _factorialx_approx_core(n, k=k, extend=extend)
+
+    # arrays & array-likes
+    n = asarray(n)
+
+    if not _is_subdtype(n.dtype, ["i", "f", "c"]):
+        raise ValueError(msg_unsup.format(vname="`n`", fname=fname, dtype=n.dtype))
+    elif _is_subdtype(n.dtype, types_requiring_complex) and extend != "complex":
+        raise ValueError(msg_needs_complex)
+    elif exact and _is_subdtype(n.dtype, ["f"]):
+        # only relevant for factorial
+        raise ValueError(msg_exact_not_possible.format(dtype=n.dtype))
+
+    if n.size == 0:
+        # return empty arrays unchanged
+        return n
+    elif exact:
+        # calculate with integers
+        return _factorialx_array_exact(n, k=k)
+    # approximation
+    return _factorialx_array_approx(n, k=k, extend=extend)
+
+
+def factorial(n, exact=False, extend="zero"):
+    """
+    The factorial of a number or array of numbers.
+
+    The factorial of non-negative integer `n` is the product of all
+    positive integers less than or equal to `n`::
+
+        n! = n * (n - 1) * (n - 2) * ... * 1
+
+    Parameters
+    ----------
+    n : int or float or complex (or array_like thereof)
+        Input values for ``n!``. Complex values require ``extend='complex'``.
+        By default, the return value for ``n < 0`` is 0.
+    exact : bool, optional
+        If ``exact`` is set to True, calculate the answer exactly using
+        integer arithmetic, otherwise approximate using the gamma function
+        (faster, but yields floats instead of integers).
+        Default is False.
+    extend : string, optional
+        One of ``'zero'`` or ``'complex'``; this determines how values ``n<0``
+        are handled - by default they are 0, but it is possible to opt into the
+        complex extension of the factorial (see below).
+
+    Returns
+    -------
+    nf : int or float or complex or ndarray
+        Factorial of ``n``, as integer, float or complex (depending on ``exact``
+        and ``extend``). Array inputs are returned as arrays.
+
+    Notes
+    -----
+    For arrays with ``exact=True``, the factorial is computed only once, for
+    the largest input, with each other result computed in the process.
+    The output dtype is increased to ``int64`` or ``object`` if necessary.
+
+    With ``exact=False`` the factorial is approximated using the gamma
+    function (which is also the definition of the complex extension):
+
+    .. math:: n! = \\Gamma(n+1)
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.special import factorial
+    >>> arr = np.array([3, 4, 5])
+    >>> factorial(arr, exact=False)
+    array([   6.,   24.,  120.])
+    >>> factorial(arr, exact=True)
+    array([  6,  24, 120])
+    >>> factorial(5, exact=True)
+    120
+
+    """
+    return _factorialx_wrapper("factorial", n, k=1, exact=exact, extend=extend)
+
+
+def factorial2(n, exact=False, extend="zero"):
+    """Double factorial.
+
+    This is the factorial with every second value skipped.  E.g., ``7!! = 7 * 5
+    * 3 * 1``.  It can be approximated numerically as::
+
+      n!! = 2 ** (n / 2) * gamma(n / 2 + 1) * sqrt(2 / pi)  n odd
+          = 2 ** (n / 2) * gamma(n / 2 + 1)                 n even
+          = 2 ** (n / 2) * (n / 2)!                         n even
+
+    The formula for odd ``n`` is the basis for the complex extension.
+
+    Parameters
+    ----------
+    n : int or float or complex (or array_like thereof)
+        Input values for ``n!!``. Non-integer values require ``extend='complex'``.
+        By default, the return value for ``n < 0`` is 0.
+    exact : bool, optional
+        If ``exact`` is set to True, calculate the answer exactly using
+        integer arithmetic, otherwise use above approximation (faster,
+        but yields floats instead of integers).
+        Default is False.
+    extend : string, optional
+        One of ``'zero'`` or ``'complex'``; this determines how values ``n<0``
+        are handled - by default they are 0, but it is possible to opt into the
+        complex extension of the double factorial. This also enables passing
+        complex values to ``n``.
+
+        .. warning::
+
+           Using the ``'complex'`` extension also changes the values of the
+           double factorial for even integers, reducing them by a factor of
+           ``sqrt(2/pi) ~= 0.79``, see [1].
+
+    Returns
+    -------
+    nf : int or float or complex or ndarray
+        Double factorial of ``n``, as integer, float or complex (depending on
+        ``exact`` and ``extend``). Array inputs are returned as arrays.
+
+    Examples
+    --------
+    >>> from scipy.special import factorial2
+    >>> factorial2(7, exact=False)
+    np.float64(105.00000000000001)
+    >>> factorial2(7, exact=True)
+    105
+
+    References
+    ----------
+    .. [1] Complex extension to double factorial
+            https://en.wikipedia.org/wiki/Double_factorial#Complex_arguments
+    """
+    return _factorialx_wrapper("factorial2", n, k=2, exact=exact, extend=extend)
+
+
+def factorialk(n, k, exact=False, extend="zero"):
+    """Multifactorial of n of order k, n(!!...!).
+
+    This is the multifactorial of n skipping k values.  For example,
+
+      factorialk(17, 4) = 17!!!! = 17 * 13 * 9 * 5 * 1
+
+    In particular, for any integer ``n``, we have
+
+      factorialk(n, 1) = factorial(n)
+
+      factorialk(n, 2) = factorial2(n)
+
+    Parameters
+    ----------
+    n : int or float or complex (or array_like thereof)
+        Input values for multifactorial. Non-integer values require
+        ``extend='complex'``. By default, the return value for ``n < 0`` is 0.
+    k : int or float or complex (or array_like thereof)
+        Order of multifactorial. Non-integer values require ``extend='complex'``.
+    exact : bool, optional
+        If ``exact`` is set to True, calculate the answer exactly using
+        integer arithmetic, otherwise use an approximation (faster,
+        but yields floats instead of integers)
+        Default is False.
+    extend : string, optional
+        One of ``'zero'`` or ``'complex'``; this determines how values ``n<0`` are
+        handled - by default they are 0, but it is possible to opt into the complex
+        extension of the multifactorial. This enables passing complex values,
+        not only to ``n`` but also to ``k``.
+
+        .. warning::
+
+           Using the ``'complex'`` extension also changes the values of the
+           multifactorial at integers ``n != 1 (mod k)`` by a factor depending
+           on both ``k`` and ``n % k``, see below or [1].
+
+    Returns
+    -------
+    nf : int or float or complex or ndarray
+        Multifactorial (order ``k``) of ``n``, as integer, float or complex (depending
+        on ``exact`` and ``extend``). Array inputs are returned as arrays.
+
+    Examples
+    --------
+    >>> from scipy.special import factorialk
+    >>> factorialk(5, k=1, exact=True)
+    120
+    >>> factorialk(5, k=3, exact=True)
+    10
+    >>> factorialk([5, 7, 9], k=3, exact=True)
+    array([ 10,  28, 162])
+    >>> factorialk([5, 7, 9], k=3, exact=False)
+    array([ 10.,  28., 162.])
+
+    Notes
+    -----
+    While less straight-forward than for the double-factorial, it's possible to
+    calculate a general approximation formula of n!(k) by studying ``n`` for a given
+    remainder ``r < k`` (thus ``n = m * k + r``, resp. ``r = n % k``), which can be
+    put together into something valid for all integer values ``n >= 0`` & ``k > 0``::
+
+      n!(k) = k ** ((n - r)/k) * gamma(n/k + 1) / gamma(r/k + 1) * max(r, 1)
+
+    This is the basis of the approximation when ``exact=False``.
+
+    In principle, any fixed choice of ``r`` (ignoring its relation ``r = n%k``
+    to ``n``) would provide a suitable analytic continuation from integer ``n``
+    to complex ``z`` (not only satisfying the functional equation but also
+    being logarithmically convex, c.f. Bohr-Mollerup theorem) -- in fact, the
+    choice of ``r`` above only changes the function by a constant factor. The
+    final constraint that determines the canonical continuation is ``f(1) = 1``,
+    which forces ``r = 1`` (see also [1]).::
+
+      z!(k) = k ** ((z - 1)/k) * gamma(z/k + 1) / gamma(1/k + 1)
+
+    References
+    ----------
+    .. [1] Complex extension to multifactorial
+            https://en.wikipedia.org/wiki/Double_factorial#Alternative_extension_of_the_multifactorial
+    """
+    return _factorialx_wrapper("factorialk", n, k=k, exact=exact, extend=extend)
+
+
+def stirling2(N, K, *, exact=False):
+    r"""Generate Stirling number(s) of the second kind.
+
+    Stirling numbers of the second kind count the number of ways to
+    partition a set with N elements into K non-empty subsets.
+
+    The values this function returns are calculated using a dynamic
+    program which avoids redundant computation across the subproblems
+    in the solution. For array-like input, this implementation also
+    avoids redundant computation across the different Stirling number
+    calculations.
+
+    The numbers are sometimes denoted
+
+    .. math::
+
+        {N \brace{K}}
+
+    see [1]_ for details. This is often expressed-verbally-as
+    "N subset K".
+
+    Parameters
+    ----------
+    N : int, ndarray
+        Number of things.
+    K : int, ndarray
+        Number of non-empty subsets taken.
+    exact : bool, optional
+        If ``True``, compute the values exactly with integer dynamic
+        programming (DP). If ``False`` (default), use DP with floating point
+        numbers for smaller entries and a second order approximation due to
+        Temme for larger entries of `N` and `K`, trading accuracy for speed.
+        See [2]_ for a description. The Temme approximation is used for
+        values ``n>50``. The DP has max relative error ``4.5*10^-16`` for
+        ``n<=50``; the Temme approximation has max relative error ``5*10^-5``
+        for ``51 <= n < 70`` and ``9*10^-6`` for ``70 <= n < 101``. Note that
+        these max relative errors will decrease further as `n` increases.
+
+    Returns
+    -------
+    val : int, float, ndarray
+        The number of partitions.
+
+    See Also
+    --------
+    comb : The number of combinations of N things taken k at a time.
+
+    Notes
+    -----
+    - If N < 0, or K < 0, then 0 is returned.
+    - If K > N, then 0 is returned.
+
+    When ``exact=True``, the output is an `int` or an ndarray of `object`.
+    The input must contain either numpy or python integers otherwise a
+    TypeError is raised.
+
+    References
+    ----------
+    .. [1] R. L. Graham, D. E. Knuth and O. Patashnik, "Concrete
+        Mathematics: A Foundation for Computer Science," Addison-Wesley
+        Publishing Company, Boston, 1989. Chapter 6, page 258.
+
+    .. [2] Temme, Nico M. "Asymptotic estimates of Stirling numbers."
+        Studies in Applied Mathematics 89.3 (1993): 233-243.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.special import stirling2
+    >>> k = np.array([3, -1, 3])
+    >>> n = np.array([10, 10, 9])
+    >>> stirling2(n, k)
+    array([9330.0, 0.0, 3025.0])
+
+    """
+    output_is_scalar = np.isscalar(N) and np.isscalar(K)
+    # make a min-heap of unique (n,k) pairs
+    N, K = asarray(N), asarray(K)
+    if not np.issubdtype(N.dtype, np.integer):
+        raise TypeError("Argument `N` must contain only integers")
+    if not np.issubdtype(K.dtype, np.integer):
+        raise TypeError("Argument `K` must contain only integers")
+    if not exact:
+        # NOTE: here we allow np.uint via casting to double types prior to
+        # passing to private ufunc dispatcher. All dispatched functions
+        # take double type for (n,k) arguments and return double.
+        return _stirling2_inexact(N.astype(float), K.astype(float))
+    nk_pairs = list(
+        set([(n.take(0), k.take(0))
+             for n, k in np.nditer([N, K], ['refs_ok'])])
+    )
+    heapify(nk_pairs)
+    # base mapping for small values
+    snsk_vals = defaultdict(int)
+    for pair in [(0, 0), (1, 1), (2, 1), (2, 2)]:
+        snsk_vals[pair] = 1
+    # for each pair in the min-heap, calculate the value, store for later
+    n_old, n_row = 2, [0, 1, 1]
+    while nk_pairs:
+        n, k = heappop(nk_pairs)
+        if n < 2 or k > n or k <= 0:
+            continue
+        elif k == n or k == 1:
+            snsk_vals[(n, k)] = 1
+            continue
+        elif n != n_old:
+            num_iters = n - n_old
+            while num_iters > 0:
+                n_row.append(1)
+                # traverse from back to remove second row
+                for j in range(len(n_row)-2, 1, -1):
+                    n_row[j] = n_row[j]*j + n_row[j-1]
+                num_iters -= 1
+            snsk_vals[(n, k)] = n_row[k]
+        else:
+            snsk_vals[(n, k)] = n_row[k]
+        n_old, n_row = n, n_row
+    out_types = [object, object, object] if exact else [float, float, float]
+    # for each pair in the map, fetch the value, and populate the array
+    it = np.nditer(
+        [N, K, None],
+        ['buffered', 'refs_ok'],
+        [['readonly'], ['readonly'], ['writeonly', 'allocate']],
+        op_dtypes=out_types,
+    )
+    with it:
+        while not it.finished:
+            it[2] = snsk_vals[(int(it[0]), int(it[1]))]
+            it.iternext()
+        output = it.operands[2]
+        # If N and K were both scalars, convert output to scalar.
+        if output_is_scalar:
+            output = output.take(0)
+    return output
+
+
+def zeta(x, q=None, out=None):
+    r"""
+    Riemann or Hurwitz zeta function.
+
+    Parameters
+    ----------
+    x : array_like of float or complex.
+        Input data
+    q : array_like of float, optional
+        Input data, must be real.  Defaults to Riemann zeta. When `q` is
+        ``None``, complex inputs `x` are supported. If `q` is not ``None``,
+        then currently only real inputs `x` with ``x >= 1`` are supported,
+        even when ``q = 1.0`` (corresponding to the Riemann zeta function).
+
+    out : ndarray, optional
+        Output array for the computed values.
+
+    Returns
+    -------
+    out : array_like
+        Values of zeta(x).
+
+    See Also
+    --------
+    zetac
+
+    Notes
+    -----
+    The two-argument version is the Hurwitz zeta function
+
+    .. math::
+
+        \zeta(x, q) = \sum_{k=0}^{\infty} \frac{1}{(k + q)^x};
+
+    see [dlmf]_ for details. The Riemann zeta function corresponds to
+    the case when ``q = 1``.
+
+    For complex inputs with ``q = None``, points with
+    ``abs(z.imag) > 1e9`` and ``0 <= abs(z.real) < 2.5`` are currently not
+    supported due to slow convergence causing excessive runtime.
+
+    References
+    ----------
+    .. [dlmf] NIST, Digital Library of Mathematical Functions,
+        https://dlmf.nist.gov/25.11#i
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.special import zeta, polygamma, factorial
+
+    Some specific values:
+
+    >>> zeta(2), np.pi**2/6
+    (1.6449340668482266, 1.6449340668482264)
+
+    >>> zeta(4), np.pi**4/90
+    (1.0823232337111381, 1.082323233711138)
+
+    First nontrivial zero:
+
+    >>> zeta(0.5 + 14.134725141734695j)
+    0 + 0j
+
+    Relation to the `polygamma` function:
+
+    >>> m = 3
+    >>> x = 1.25
+    >>> polygamma(m, x)
+    array(2.782144009188397)
+    >>> (-1)**(m+1) * factorial(m) * zeta(m+1, x)
+    2.7821440091883969
+
+    """
+    if q is None:
+        return _ufuncs._riemann_zeta(x, out)
+    else:
+        return _ufuncs._zeta(x, q, out)
+
+
+def softplus(x, **kwargs):
+    r"""
+    Compute the softplus function element-wise.
+
+    The softplus function is defined as: ``softplus(x) = log(1 + exp(x))``.
+    It is a smooth approximation of the rectifier function (ReLU).
+
+    Parameters
+    ----------
+    x : array_like
+        Input value.
+    **kwargs
+        For other keyword-only arguments, see the
+        `ufunc docs <https://numpy.org/doc/stable/reference/ufuncs.html>`_.
+
+    Returns
+    -------
+    softplus : ndarray
+        Logarithm of ``exp(0) + exp(x)``.
+
+    Examples
+    --------
+    >>> from scipy import special
+
+    >>> special.softplus(0)
+    0.6931471805599453
+
+    >>> special.softplus([-1, 0, 1])
+    array([0.31326169, 0.69314718, 1.31326169])
+    """
+    return np.logaddexp(0, x, **kwargs)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_comb.cpython-312-x86_64-linux-gnu.so b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_comb.cpython-312-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..d85b9e3309ac5f0829185bca03d99adf6bf2fc34
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_comb.cpython-312-x86_64-linux-gnu.so differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_ellip_harm.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_ellip_harm.py
new file mode 100644
index 0000000000000000000000000000000000000000..b07f755f87c675822af66b9f77d4821b554fdfd0
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_ellip_harm.py
@@ -0,0 +1,214 @@
+import numpy as np
+
+from ._ufuncs import _ellip_harm
+from ._ellip_harm_2 import _ellipsoid, _ellipsoid_norm
+
+
+def ellip_harm(h2, k2, n, p, s, signm=1, signn=1):
+    r"""
+    Ellipsoidal harmonic functions E^p_n(l)
+
+    These are also known as Lamé functions of the first kind, and are
+    solutions to the Lamé equation:
+
+    .. math:: (s^2 - h^2)(s^2 - k^2)E''(s)
+              + s(2s^2 - h^2 - k^2)E'(s) + (a - q s^2)E(s) = 0
+
+    where :math:`q = (n+1)n` and :math:`a` is the eigenvalue (not
+    returned) corresponding to the solutions.
+
+    Parameters
+    ----------
+    h2 : float
+        ``h**2``
+    k2 : float
+        ``k**2``; should be larger than ``h**2``
+    n : int
+        Degree
+    s : float
+        Coordinate
+    p : int
+        Order, can range between [1,2n+1]
+    signm : {1, -1}, optional
+        Sign of prefactor of functions. Can be +/-1. See Notes.
+    signn : {1, -1}, optional
+        Sign of prefactor of functions. Can be +/-1. See Notes.
+
+    Returns
+    -------
+    E : float
+        the harmonic :math:`E^p_n(s)`
+
+    See Also
+    --------
+    ellip_harm_2, ellip_normal
+
+    Notes
+    -----
+    The geometric interpretation of the ellipsoidal functions is
+    explained in [2]_, [3]_, [4]_. The `signm` and `signn` arguments control the
+    sign of prefactors for functions according to their type::
+
+        K : +1
+        L : signm
+        M : signn
+        N : signm*signn
+
+    .. versionadded:: 0.15.0
+
+    References
+    ----------
+    .. [1] Digital Library of Mathematical Functions 29.12
+       https://dlmf.nist.gov/29.12
+    .. [2] Bardhan and Knepley, "Computational science and
+       re-discovery: open-source implementations of
+       ellipsoidal harmonics for problems in potential theory",
+       Comput. Sci. Disc. 5, 014006 (2012)
+       :doi:`10.1088/1749-4699/5/1/014006`.
+    .. [3] David J. and Dechambre P, "Computation of Ellipsoidal
+       Gravity Field Harmonics for small solar system bodies"
+       pp. 30-36, 2000
+    .. [4] George Dassios, "Ellipsoidal Harmonics: Theory and Applications"
+       pp. 418, 2012
+
+    Examples
+    --------
+    >>> from scipy.special import ellip_harm
+    >>> w = ellip_harm(5,8,1,1,2.5)
+    >>> w
+    2.5
+
+    Check that the functions indeed are solutions to the Lamé equation:
+
+    >>> import numpy as np
+    >>> from scipy.interpolate import UnivariateSpline
+    >>> def eigenvalue(f, df, ddf):
+    ...     r = (((s**2 - h**2) * (s**2 - k**2) * ddf
+    ...           + s * (2*s**2 - h**2 - k**2) * df
+    ...           - n * (n + 1)*s**2*f) / f)
+    ...     return -r.mean(), r.std()
+    >>> s = np.linspace(0.1, 10, 200)
+    >>> k, h, n, p = 8.0, 2.2, 3, 2
+    >>> E = ellip_harm(h**2, k**2, n, p, s)
+    >>> E_spl = UnivariateSpline(s, E)
+    >>> a, a_err = eigenvalue(E_spl(s), E_spl(s,1), E_spl(s,2))
+    >>> a, a_err
+    (583.44366156701483, 6.4580890640310646e-11)
+
+    """  # noqa: E501
+    return _ellip_harm(h2, k2, n, p, s, signm, signn)
+
+
+_ellip_harm_2_vec = np.vectorize(_ellipsoid, otypes='d')
+
+
+def ellip_harm_2(h2, k2, n, p, s):
+    r"""
+    Ellipsoidal harmonic functions F^p_n(l)
+
+    These are also known as Lamé functions of the second kind, and are
+    solutions to the Lamé equation:
+
+    .. math:: (s^2 - h^2)(s^2 - k^2)F''(s)
+              + s(2s^2 - h^2 - k^2)F'(s) + (a - q s^2)F(s) = 0
+
+    where :math:`q = (n+1)n` and :math:`a` is the eigenvalue (not
+    returned) corresponding to the solutions.
+
+    Parameters
+    ----------
+    h2 : float
+        ``h**2``
+    k2 : float
+        ``k**2``; should be larger than ``h**2``
+    n : int
+        Degree.
+    p : int
+        Order, can range between [1,2n+1].
+    s : float
+        Coordinate
+
+    Returns
+    -------
+    F : float
+        The harmonic :math:`F^p_n(s)`
+
+    See Also
+    --------
+    ellip_harm, ellip_normal
+
+    Notes
+    -----
+    Lamé functions of the second kind are related to the functions of the first kind:
+
+    .. math::
+
+       F^p_n(s)=(2n + 1)E^p_n(s)\int_{0}^{1/s}
+       \frac{du}{(E^p_n(1/u))^2\sqrt{(1-u^2k^2)(1-u^2h^2)}}
+
+    .. versionadded:: 0.15.0
+
+    Examples
+    --------
+    >>> from scipy.special import ellip_harm_2
+    >>> w = ellip_harm_2(5,8,2,1,10)
+    >>> w
+    0.00108056853382
+
+    """
+    with np.errstate(all='ignore'):
+        return _ellip_harm_2_vec(h2, k2, n, p, s)
+
+
+def _ellip_normal_vec(h2, k2, n, p):
+    return _ellipsoid_norm(h2, k2, n, p)
+
+
+_ellip_normal_vec = np.vectorize(_ellip_normal_vec, otypes='d')
+
+
+def ellip_normal(h2, k2, n, p):
+    r"""
+    Ellipsoidal harmonic normalization constants gamma^p_n
+
+    The normalization constant is defined as
+
+    .. math::
+
+       \gamma^p_n=8\int_{0}^{h}dx\int_{h}^{k}dy
+       \frac{(y^2-x^2)(E^p_n(y)E^p_n(x))^2}{\sqrt{(k^2-y^2)(y^2-h^2)(h^2-x^2)(k^2-x^2)}}
+
+    Parameters
+    ----------
+    h2 : float
+        ``h**2``
+    k2 : float
+        ``k**2``; should be larger than ``h**2``
+    n : int
+        Degree.
+    p : int
+        Order, can range between [1,2n+1].
+
+    Returns
+    -------
+    gamma : float
+        The normalization constant :math:`\gamma^p_n`
+
+    See Also
+    --------
+    ellip_harm, ellip_harm_2
+
+    Notes
+    -----
+    .. versionadded:: 0.15.0
+
+    Examples
+    --------
+    >>> from scipy.special import ellip_normal
+    >>> w = ellip_normal(5,8,3,7)
+    >>> w
+    1723.38796997
+
+    """
+    with np.errstate(all='ignore'):
+        return _ellip_normal_vec(h2, k2, n, p)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_input_validation.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_input_validation.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5b7fe36df87617bf91655623ee0076c37a4d08a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_input_validation.py
@@ -0,0 +1,17 @@
+import math
+import operator
+
+def _nonneg_int_or_fail(n, var_name, strict=True):
+    try:
+        if strict:
+            # Raises an exception if float
+            n = operator.index(n)
+        elif n == math.floor(n):
+            n = int(n)
+        else:
+            raise ValueError()
+        if n < 0:
+            raise ValueError()
+    except (ValueError, TypeError) as err:
+        raise err.__class__(f"{var_name} must be a non-negative integer") from err
+    return n
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_lambertw.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_lambertw.py
new file mode 100644
index 0000000000000000000000000000000000000000..f758c7c21fdddc0ec1b84727d90c6de7f34a094e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_lambertw.py
@@ -0,0 +1,149 @@
+from ._ufuncs import _lambertw
+
+import numpy as np
+
+
+def lambertw(z, k=0, tol=1e-8):
+    r"""
+    lambertw(z, k=0, tol=1e-8)
+
+    Lambert W function.
+
+    The Lambert W function `W(z)` is defined as the inverse function
+    of ``w * exp(w)``. In other words, the value of ``W(z)`` is
+    such that ``z = W(z) * exp(W(z))`` for any complex number
+    ``z``.
+
+    The Lambert W function is a multivalued function with infinitely
+    many branches. Each branch gives a separate solution of the
+    equation ``z = w exp(w)``. Here, the branches are indexed by the
+    integer `k`.
+
+    Parameters
+    ----------
+    z : array_like
+        Input argument.
+    k : int, optional
+        Branch index.
+    tol : float, optional
+        Evaluation tolerance.
+
+    Returns
+    -------
+    w : array
+        `w` will have the same shape as `z`.
+
+    See Also
+    --------
+    wrightomega : the Wright Omega function
+
+    Notes
+    -----
+    All branches are supported by `lambertw`:
+
+    * ``lambertw(z)`` gives the principal solution (branch 0)
+    * ``lambertw(z, k)`` gives the solution on branch `k`
+
+    The Lambert W function has two partially real branches: the
+    principal branch (`k = 0`) is real for real ``z > -1/e``, and the
+    ``k = -1`` branch is real for ``-1/e < z < 0``. All branches except
+    ``k = 0`` have a logarithmic singularity at ``z = 0``.
+
+    **Possible issues**
+
+    The evaluation can become inaccurate very close to the branch point
+    at ``-1/e``. In some corner cases, `lambertw` might currently
+    fail to converge, or can end up on the wrong branch.
+
+    **Algorithm**
+
+    Halley's iteration is used to invert ``w * exp(w)``, using a first-order
+    asymptotic approximation (`O(log(w))` or `O(w)`) as the initial estimate.
+
+    The definition, implementation and choice of branches is based on [2]_.
+
+    References
+    ----------
+    .. [1] https://en.wikipedia.org/wiki/Lambert_W_function
+    .. [2] Corless et al, "On the Lambert W function", Adv. Comp. Math. 5
+       (1996) 329-359.
+       https://cs.uwaterloo.ca/research/tr/1993/03/W.pdf
+
+    Examples
+    --------
+    The Lambert W function is the inverse of ``w exp(w)``:
+
+    >>> import numpy as np
+    >>> from scipy.special import lambertw
+    >>> w = lambertw(1)
+    >>> w
+    (0.56714329040978384+0j)
+    >>> w * np.exp(w)
+    (1.0+0j)
+
+    Any branch gives a valid inverse:
+
+    >>> w = lambertw(1, k=3)
+    >>> w
+    (-2.8535817554090377+17.113535539412148j)
+    >>> w*np.exp(w)
+    (1.0000000000000002+1.609823385706477e-15j)
+
+    **Applications to equation-solving**
+
+    The Lambert W function may be used to solve various kinds of
+    equations.  We give two examples here.
+
+    First, the function can be used to solve implicit equations of the
+    form
+
+        :math:`x = a + b e^{c x}`
+
+    for :math:`x`.  We assume :math:`c` is not zero.  After a little
+    algebra, the equation may be written
+
+        :math:`z e^z = -b c e^{a c}`
+
+    where :math:`z = c (a - x)`.  :math:`z` may then be expressed using
+    the Lambert W function
+
+        :math:`z = W(-b c e^{a c})`
+
+    giving
+
+        :math:`x = a - W(-b c e^{a c})/c`
+
+    For example,
+
+    >>> a = 3
+    >>> b = 2
+    >>> c = -0.5
+
+    The solution to :math:`x = a + b e^{c x}` is:
+
+    >>> x = a - lambertw(-b*c*np.exp(a*c))/c
+    >>> x
+    (3.3707498368978794+0j)
+
+    Verify that it solves the equation:
+
+    >>> a + b*np.exp(c*x)
+    (3.37074983689788+0j)
+
+    The Lambert W function may also be used to find the value of the infinite
+    power tower :math:`z^{z^{z^{\ldots}}}`:
+
+    >>> def tower(z, n):
+    ...     if n == 0:
+    ...         return z
+    ...     return z ** tower(z, n-1)
+    ...
+    >>> tower(0.5, 100)
+    0.641185744504986
+    >>> -lambertw(-np.log(0.5)) / np.log(0.5)
+    (0.64118574450498589+0j)
+    """
+    # TODO: special expert should inspect this
+    # interception; better place to do it?
+    k = np.asarray(k, dtype=np.dtype("long"))
+    return _lambertw(z, k, tol)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_mptestutils.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_mptestutils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e519093dface79e21f16d7063541ad107f5ca96
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_mptestutils.py
@@ -0,0 +1,453 @@
+import os
+import sys
+import time
+from itertools import zip_longest
+
+import numpy as np
+from numpy.testing import assert_
+import pytest
+
+from scipy.special._testutils import assert_func_equal
+
+try:
+    import mpmath
+except ImportError:
+    pass
+
+
+# ------------------------------------------------------------------------------
+# Machinery for systematic tests with mpmath
+# ------------------------------------------------------------------------------
+
+class Arg:
+    """Generate a set of numbers on the real axis, concentrating on
+    'interesting' regions and covering all orders of magnitude.
+
+    """
+
+    def __init__(self, a=-np.inf, b=np.inf, inclusive_a=True, inclusive_b=True):
+        if a > b:
+            raise ValueError("a should be less than or equal to b")
+        if a == -np.inf:
+            a = -0.5*np.finfo(float).max
+        if b == np.inf:
+            b = 0.5*np.finfo(float).max
+        self.a, self.b = a, b
+
+        self.inclusive_a, self.inclusive_b = inclusive_a, inclusive_b
+
+    def _positive_values(self, a, b, n):
+        if a < 0:
+            raise ValueError("a should be positive")
+
+        # Try to put half of the points into a linspace between a and
+        # 10, and the other half into a logspace.
+        if n % 2 == 0:
+            nlogpts = n//2
+            nlinpts = nlogpts
+        else:
+            nlogpts = n//2
+            nlinpts = nlogpts + 1
+
+        if a >= 10:
+            # Outside of linspace range; just return a logspace.
+            pts = np.logspace(np.log10(a), np.log10(b), n)
+        elif a > 0 and b < 10:
+            # Outside of logspace range; just return a linspace
+            pts = np.linspace(a, b, n)
+        elif a > 0:
+            # Linspace between a and 10 and a logspace between 10 and
+            # b.
+            linpts = np.linspace(a, 10, nlinpts, endpoint=False)
+            logpts = np.logspace(1, np.log10(b), nlogpts)
+            pts = np.hstack((linpts, logpts))
+        elif a == 0 and b <= 10:
+            # Linspace between 0 and b and a logspace between 0 and
+            # the smallest positive point of the linspace
+            linpts = np.linspace(0, b, nlinpts)
+            if linpts.size > 1:
+                right = np.log10(linpts[1])
+            else:
+                right = -30
+            logpts = np.logspace(-30, right, nlogpts, endpoint=False)
+            pts = np.hstack((logpts, linpts))
+        else:
+            # Linspace between 0 and 10, logspace between 0 and the
+            # smallest positive point of the linspace, and a logspace
+            # between 10 and b.
+            if nlogpts % 2 == 0:
+                nlogpts1 = nlogpts//2
+                nlogpts2 = nlogpts1
+            else:
+                nlogpts1 = nlogpts//2
+                nlogpts2 = nlogpts1 + 1
+            linpts = np.linspace(0, 10, nlinpts, endpoint=False)
+            if linpts.size > 1:
+                right = np.log10(linpts[1])
+            else:
+                right = -30
+            logpts1 = np.logspace(-30, right, nlogpts1, endpoint=False)
+            logpts2 = np.logspace(1, np.log10(b), nlogpts2)
+            pts = np.hstack((logpts1, linpts, logpts2))
+
+        return np.sort(pts)
+
+    def values(self, n):
+        """Return an array containing n numbers."""
+        a, b = self.a, self.b
+        if a == b:
+            return np.zeros(n)
+
+        if not self.inclusive_a:
+            n += 1
+        if not self.inclusive_b:
+            n += 1
+
+        if n % 2 == 0:
+            n1 = n//2
+            n2 = n1
+        else:
+            n1 = n//2
+            n2 = n1 + 1
+
+        if a >= 0:
+            pospts = self._positive_values(a, b, n)
+            negpts = []
+        elif b <= 0:
+            pospts = []
+            negpts = -self._positive_values(-b, -a, n)
+        else:
+            pospts = self._positive_values(0, b, n1)
+            negpts = -self._positive_values(0, -a, n2 + 1)
+            # Don't want to get zero twice
+            negpts = negpts[1:]
+        pts = np.hstack((negpts[::-1], pospts))
+
+        if not self.inclusive_a:
+            pts = pts[1:]
+        if not self.inclusive_b:
+            pts = pts[:-1]
+        return pts
+
+
+class FixedArg:
+    def __init__(self, values):
+        self._values = np.asarray(values)
+
+    def values(self, n):
+        return self._values
+
+
+class ComplexArg:
+    def __init__(self, a=complex(-np.inf, -np.inf), b=complex(np.inf, np.inf)):
+        self.real = Arg(a.real, b.real)
+        self.imag = Arg(a.imag, b.imag)
+
+    def values(self, n):
+        m = int(np.floor(np.sqrt(n)))
+        x = self.real.values(m)
+        y = self.imag.values(m + 1)
+        return (x[:,None] + 1j*y[None,:]).ravel()
+
+
+class IntArg:
+    def __init__(self, a=-1000, b=1000):
+        self.a = a
+        self.b = b
+
+    def values(self, n):
+        v1 = Arg(self.a, self.b).values(max(1 + n//2, n-5)).astype(int)
+        v2 = np.arange(-5, 5)
+        v = np.unique(np.r_[v1, v2])
+        v = v[(v >= self.a) & (v < self.b)]
+        return v
+
+
+def get_args(argspec, n):
+    if isinstance(argspec, np.ndarray):
+        args = argspec.copy()
+    else:
+        nargs = len(argspec)
+        ms = np.asarray(
+            [1.5 if isinstance(spec, ComplexArg) else 1.0 for spec in argspec]
+        )
+        ms = (n**(ms/sum(ms))).astype(int) + 1
+
+        args = [spec.values(m) for spec, m in zip(argspec, ms)]
+        args = np.array(np.broadcast_arrays(*np.ix_(*args))).reshape(nargs, -1).T
+
+    return args
+
+
+class MpmathData:
+    def __init__(self, scipy_func, mpmath_func, arg_spec, name=None,
+                 dps=None, prec=None, n=None, rtol=1e-7, atol=1e-300,
+                 ignore_inf_sign=False, distinguish_nan_and_inf=True,
+                 nan_ok=True, param_filter=None):
+
+        # mpmath tests are really slow (see gh-6989).  Use a small number of
+        # points by default, increase back to 5000 (old default) if XSLOW is
+        # set
+        if n is None:
+            try:
+                is_xslow = int(os.environ.get('SCIPY_XSLOW', '0'))
+            except ValueError:
+                is_xslow = False
+
+            n = 5000 if is_xslow else 500
+
+        self.scipy_func = scipy_func
+        self.mpmath_func = mpmath_func
+        self.arg_spec = arg_spec
+        self.dps = dps
+        self.prec = prec
+        self.n = n
+        self.rtol = rtol
+        self.atol = atol
+        self.ignore_inf_sign = ignore_inf_sign
+        self.nan_ok = nan_ok
+        if isinstance(self.arg_spec, np.ndarray):
+            self.is_complex = np.issubdtype(self.arg_spec.dtype, np.complexfloating)
+        else:
+            self.is_complex = any(
+                [isinstance(arg, ComplexArg) for arg in self.arg_spec]
+            )
+        self.ignore_inf_sign = ignore_inf_sign
+        self.distinguish_nan_and_inf = distinguish_nan_and_inf
+        if not name or name == '<lambda>':
+            name = getattr(scipy_func, '__name__', None)
+        if not name or name == '<lambda>':
+            name = getattr(mpmath_func, '__name__', None)
+        self.name = name
+        self.param_filter = param_filter
+
+    def check(self):
+        np.random.seed(1234)
+
+        # Generate values for the arguments
+        argarr = get_args(self.arg_spec, self.n)
+
+        # Check
+        old_dps, old_prec = mpmath.mp.dps, mpmath.mp.prec
+        try:
+            if self.dps is not None:
+                dps_list = [self.dps]
+            else:
+                dps_list = [20]
+            if self.prec is not None:
+                mpmath.mp.prec = self.prec
+
+            # Proper casting of mpmath input and output types. Using
+            # native mpmath types as inputs gives improved precision
+            # in some cases.
+            if np.issubdtype(argarr.dtype, np.complexfloating):
+                pytype = mpc2complex
+
+                def mptype(x):
+                    return mpmath.mpc(complex(x))
+            else:
+                def mptype(x):
+                    return mpmath.mpf(float(x))
+
+                def pytype(x):
+                    if abs(x.imag) > 1e-16*(1 + abs(x.real)):
+                        return np.nan
+                    else:
+                        return mpf2float(x.real)
+
+            # Try out different dps until one (or none) works
+            for j, dps in enumerate(dps_list):
+                mpmath.mp.dps = dps
+
+                try:
+                    assert_func_equal(
+                        self.scipy_func,
+                        lambda *a: pytype(self.mpmath_func(*map(mptype, a))),
+                        argarr,
+                        vectorized=False,
+                        rtol=self.rtol,
+                        atol=self.atol,
+                        ignore_inf_sign=self.ignore_inf_sign,
+                        distinguish_nan_and_inf=self.distinguish_nan_and_inf,
+                        nan_ok=self.nan_ok,
+                        param_filter=self.param_filter
+                    )
+                    break
+                except AssertionError:
+                    if j >= len(dps_list)-1:
+                        # reraise the Exception
+                        tp, value, tb = sys.exc_info()
+                        if value.__traceback__ is not tb:
+                            raise value.with_traceback(tb)
+                        raise value
+        finally:
+            mpmath.mp.dps, mpmath.mp.prec = old_dps, old_prec
+
+    def __repr__(self):
+        if self.is_complex:
+            return f"<MpmathData: {self.name} (complex)>"
+        else:
+            return f"<MpmathData: {self.name}>"
+
+
+def assert_mpmath_equal(*a, **kw):
+    d = MpmathData(*a, **kw)
+    d.check()
+
+
+def nonfunctional_tooslow(func):
+    return pytest.mark.skip(
+        reason="    Test not yet functional (too slow), needs more work."
+    )(func)
+
+
+# ------------------------------------------------------------------------------
+# Tools for dealing with mpmath quirks
+# ------------------------------------------------------------------------------
+
+def mpf2float(x):
+    """
+    Convert an mpf to the nearest floating point number. Just using
+    float directly doesn't work because of results like this:
+
+    with mp.workdps(50):
+        float(mpf("0.99999999999999999")) = 0.9999999999999999
+
+    """
+    return float(mpmath.nstr(x, 17, min_fixed=0, max_fixed=0))
+
+
+def mpc2complex(x):
+    return complex(mpf2float(x.real), mpf2float(x.imag))
+
+
+def trace_args(func):
+    def tofloat(x):
+        if isinstance(x, mpmath.mpc):
+            return complex(x)
+        else:
+            return float(x)
+
+    def wrap(*a, **kw):
+        sys.stderr.write(f"{tuple(map(tofloat, a))!r}: ")
+        sys.stderr.flush()
+        try:
+            r = func(*a, **kw)
+            sys.stderr.write(f"-> {r!r}")
+        finally:
+            sys.stderr.write("\n")
+            sys.stderr.flush()
+        return r
+    return wrap
+
+
+try:
+    import signal
+    POSIX = ('setitimer' in dir(signal))
+except ImportError:
+    POSIX = False
+
+
+class TimeoutError(Exception):
+    pass
+
+
+def time_limited(timeout=0.5, return_val=np.nan, use_sigalrm=True):
+    """
+    Decorator for setting a timeout for pure-Python functions.
+
+    If the function does not return within `timeout` seconds, the
+    value `return_val` is returned instead.
+
+    On POSIX this uses SIGALRM by default. On non-POSIX, settrace is
+    used. Do not use this with threads: the SIGALRM implementation
+    probably does not work well. The settrace implementation only
+    traces the current thread.
+
+    The settrace implementation slows down execution speed. Slowdown
+    by a factor around 10 is probably typical.
+    """
+    if POSIX and use_sigalrm:
+        def sigalrm_handler(signum, frame):
+            raise TimeoutError()
+
+        def deco(func):
+            def wrap(*a, **kw):
+                old_handler = signal.signal(signal.SIGALRM, sigalrm_handler)
+                signal.setitimer(signal.ITIMER_REAL, timeout)
+                try:
+                    return func(*a, **kw)
+                except TimeoutError:
+                    return return_val
+                finally:
+                    signal.setitimer(signal.ITIMER_REAL, 0)
+                    signal.signal(signal.SIGALRM, old_handler)
+            return wrap
+    else:
+        def deco(func):
+            def wrap(*a, **kw):
+                start_time = time.time()
+
+                def trace(frame, event, arg):
+                    if time.time() - start_time > timeout:
+                        raise TimeoutError()
+                    return trace
+                sys.settrace(trace)
+                try:
+                    return func(*a, **kw)
+                except TimeoutError:
+                    sys.settrace(None)
+                    return return_val
+                finally:
+                    sys.settrace(None)
+            return wrap
+    return deco
+
+
+def exception_to_nan(func):
+    """Decorate function to return nan if it raises an exception"""
+    def wrap(*a, **kw):
+        try:
+            return func(*a, **kw)
+        except Exception:
+            return np.nan
+    return wrap
+
+
+def inf_to_nan(func):
+    """Decorate function to return nan if it returns inf"""
+    def wrap(*a, **kw):
+        v = func(*a, **kw)
+        if not np.isfinite(v):
+            return np.nan
+        return v
+    return wrap
+
+
+def mp_assert_allclose(res, std, atol=0, rtol=1e-17):
+    """
+    Compare lists of mpmath.mpf's or mpmath.mpc's directly so that it
+    can be done to higher precision than double.
+    """
+    failures = []
+    for k, (resval, stdval) in enumerate(zip_longest(res, std)):
+        if resval is None or stdval is None:
+            raise ValueError('Lengths of inputs res and std are not equal.')
+        if mpmath.fabs(resval - stdval) > atol + rtol*mpmath.fabs(stdval):
+            failures.append((k, resval, stdval))
+
+    nfail = len(failures)
+    if nfail > 0:
+        ndigits = int(abs(np.log10(rtol)))
+        msg = [""]
+        msg.append(f"Bad results ({nfail} out of {k + 1}) for the following points:")
+        for k, resval, stdval in failures:
+            resrep = mpmath.nstr(resval, ndigits, min_fixed=0, max_fixed=0)
+            stdrep = mpmath.nstr(stdval, ndigits, min_fixed=0, max_fixed=0)
+            if stdval == 0:
+                rdiff = "inf"
+            else:
+                rdiff = mpmath.fabs((resval - stdval)/stdval)
+                rdiff = mpmath.nstr(rdiff, 3)
+            msg.append(f"{k}: {resrep} != {stdrep} (rdiff {rdiff})")
+        assert_(False, "\n".join(msg))
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_orthogonal.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_orthogonal.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2b93529d37b5965241c3d208004cc0c15cfd623
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_orthogonal.py
@@ -0,0 +1,2606 @@
+"""
+A collection of functions to find the weights and abscissas for
+Gaussian Quadrature.
+
+These calculations are done by finding the eigenvalues of a
+tridiagonal matrix whose entries are dependent on the coefficients
+in the recursion formula for the orthogonal polynomials with the
+corresponding weighting function over the interval.
+
+Many recursion relations for orthogonal polynomials are given:
+
+.. math::
+
+    a1n f_{n+1} (x) = (a2n + a3n x ) f_n (x) - a4n f_{n-1} (x)
+
+The recursion relation of interest is
+
+.. math::
+
+    P_{n+1} (x) = (x - A_n) P_n (x) - B_n P_{n-1} (x)
+
+where :math:`P` has a different normalization than :math:`f`.
+
+The coefficients can be found as:
+
+.. math::
+
+    A_n = -a2n / a3n
+    \\qquad
+    B_n = ( a4n / a3n \\sqrt{h_{n-1} / h_n})^2
+
+where
+
+.. math::
+
+    h_n = \\int_a^b w(x) f_n(x)^2 \\, dx
+
+assume:
+
+.. math::
+
+    P_0 (x) = 1
+    \\qquad
+    P_{-1} (x) == 0
+
+For the mathematical background, see [golub.welsch-1969-mathcomp]_ and
+[abramowitz.stegun-1965]_.
+
+References
+----------
+.. [golub.welsch-1969-mathcomp]
+   Golub, Gene H, and John H Welsch. 1969. Calculation of Gauss
+   Quadrature Rules. *Mathematics of Computation* 23, 221-230+s1--s10.
+
+.. [abramowitz.stegun-1965]
+   Abramowitz, Milton, and Irene A Stegun. (1965) *Handbook of
+   Mathematical Functions: with Formulas, Graphs, and Mathematical
+   Tables*. Gaithersburg, MD: National Bureau of Standards.
+   http://www.math.sfu.ca/~cbm/aands/
+
+.. [townsend.trogdon.olver-2014]
+   Townsend, A. and Trogdon, T. and Olver, S. (2014)
+   *Fast computation of Gauss quadrature nodes and
+   weights on the whole real line*. :arXiv:`1410.5286`.
+
+.. [townsend.trogdon.olver-2015]
+   Townsend, A. and Trogdon, T. and Olver, S. (2015)
+   *Fast computation of Gauss quadrature nodes and
+   weights on the whole real line*.
+   IMA Journal of Numerical Analysis
+   :doi:`10.1093/imanum/drv002`.
+"""
+#
+# Author:  Travis Oliphant 2000
+# Updated Sep. 2003 (fixed bugs --- tested to be accurate)
+
+# SciPy imports.
+import numpy as np
+from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around,
+                   hstack, arccos, arange)
+from scipy.special import airy
+
+# Local imports.
+# There is no .pyi file for _specfun
+from . import _specfun  # type: ignore
+from . import _ufuncs
+_gam = _ufuncs.gamma
+
+# Names of the orthogonal-polynomial constructors defined in this module.
+_polyfuns = ['legendre', 'chebyt', 'chebyu', 'chebyc', 'chebys',
+             'jacobi', 'laguerre', 'genlaguerre', 'hermite',
+             'hermitenorm', 'gegenbauer', 'sh_legendre', 'sh_chebyt',
+             'sh_chebyu', 'sh_jacobi']
+
+# Correspondence between new and old names of root functions
+# (keys are the current ``roots_*`` names, values the older short names).
+_rootfuns_map = {'roots_legendre': 'p_roots',
+                 'roots_chebyt': 't_roots',
+                 'roots_chebyu': 'u_roots',
+                 'roots_chebyc': 'c_roots',
+                 'roots_chebys': 's_roots',
+                 'roots_jacobi': 'j_roots',
+                 'roots_laguerre': 'l_roots',
+                 'roots_genlaguerre': 'la_roots',
+                 'roots_hermite': 'h_roots',
+                 'roots_hermitenorm': 'he_roots',
+                 'roots_gegenbauer': 'cg_roots',
+                 'roots_sh_legendre': 'ps_roots',
+                 'roots_sh_chebyt': 'ts_roots',
+                 'roots_sh_chebyu': 'us_roots',
+                 'roots_sh_jacobi': 'js_roots'}
+
+# Public names: the polynomial constructors plus the new-style root
+# function names (only the keys of `_rootfuns_map` are listed here).
+__all__ = _polyfuns + list(_rootfuns_map.keys())
+
+
+class orthopoly1d(np.poly1d):
+
+    def __init__(self, roots, weights=None, hn=1.0, kn=1.0, wfunc=None,
+                 limits=None, monic=False, eval_func=None):
+        equiv_weights = [weights[k] / wfunc(roots[k]) for
+                         k in range(len(roots))]
+        mu = sqrt(hn)
+        if monic:
+            evf = eval_func
+            if evf:
+                knn = kn
+                def eval_func(x):
+                    return evf(x) / knn
+            mu = mu / abs(kn)
+            kn = 1.0
+
+        # compute coefficients from roots, then scale
+        poly = np.poly1d(roots, r=True)
+        np.poly1d.__init__(self, poly.coeffs * float(kn))
+
+        self.weights = np.array(list(zip(roots, weights, equiv_weights)))
+        self.weight_func = wfunc
+        self.limits = limits
+        self.normcoef = mu
+
+        # Note: eval_func will be discarded on arithmetic
+        self._eval_func = eval_func
+
+    def __call__(self, v):
+        if self._eval_func and not isinstance(v, np.poly1d):
+            return self._eval_func(v)
+        else:
+            return np.poly1d.__call__(self, v)
+
+    def _scale(self, p):
+        if p == 1.0:
+            return
+        self._coeffs *= p
+
+        evf = self._eval_func
+        if evf:
+            self._eval_func = lambda x: evf(x) * p
+        self.normcoef *= p
+
+
+def _gen_roots_and_weights(n, mu0, an_func, bn_func, f, df, symmetrize, mu):
+    """[x,w] = gen_roots_and_weights(n,mu0,an_func,bn_func,f,df,symmetrize,mu)
+
+    Returns the roots (x) of an nth order orthogonal polynomial,
+    and weights (w) to use in appropriate Gaussian quadrature with that
+    orthogonal polynomial.
+
+    The polynomials have the recurrence relation
+          P_n+1(x) = (x - A_n) P_n(x) - B_n P_n-1(x)
+
+    n                   number of roots/weights to compute
+    mu0 ( = h_0 )       is the integral of the weight over the orthogonal
+                        interval; the weights are rescaled to sum to it
+    an_func(k)          should return A_k for an array of indices k
+    bn_func(k)          should return sqrt(B_k) for an array of indices k
+    f(n, x)             evaluates the degree-n polynomial at x
+    df(n, x)            evaluates the derivative of the degree-n
+                        polynomial at x
+    symmetrize          if true, average the weights with their reverse and
+                        antisymmetrize the roots about zero
+    mu                  if true, return (x, w, mu0) instead of (x, w)
+    """
+    # lazy import to prevent linalg dependency for whole module (gh-23420)
+    from scipy import linalg
+    k = np.arange(n, dtype='d')
+    # Banded storage of the symmetric tridiagonal matrix: row 0 holds the
+    # off-diagonal entries, row 1 the main diagonal.
+    c = np.zeros((2, n))
+    c[0,1:] = bn_func(k[1:])
+    c[1,:] = an_func(k)
+    x = linalg.eigvals_banded(c, overwrite_a_band=True)
+
+    # improve roots by one application of Newton's method
+    y = f(n, x)
+    dy = df(n, x)
+    x -= y/dy
+
+    # fm and dy may contain very large/small values, so we
+    # log-normalize them to maintain precision in the product fm*dy
+    # (dy is still the derivative evaluated before the Newton update)
+    fm = f(n-1, x)
+    log_fm = np.log(np.abs(fm))
+    log_dy = np.log(np.abs(dy))
+    fm /= np.exp((log_fm.max() + log_fm.min()) / 2.)
+    dy /= np.exp((log_dy.max() + log_dy.min()) / 2.)
+    w = 1.0 / (fm * dy)
+
+    if symmetrize:
+        w = (w + w[::-1]) / 2
+        x = (x - x[::-1]) / 2
+
+    # The normalisation above leaves the weights known only up to a common
+    # factor; fix it so that they sum to mu0.
+    w *= mu0 / w.sum()
+
+    if mu:
+        return x, w, mu0
+    else:
+        return x, w
+
+# Jacobi Polynomials 1               P^(alpha,beta)_n(x)
+
+
+def roots_jacobi(n, alpha, beta, mu=False):
+    r"""Gauss-Jacobi quadrature.
+
+    Compute the sample points and weights for Gauss-Jacobi
+    quadrature. The sample points are the roots of the nth degree
+    Jacobi polynomial, :math:`P^{\alpha, \beta}_n(x)`. These sample
+    points and weights correctly integrate polynomials of degree
+    :math:`2n - 1` or less over the interval :math:`[-1, 1]` with
+    weight function :math:`w(x) = (1 - x)^{\alpha} (1 +
+    x)^{\beta}`. See 22.2.1 in [AS]_ for details.
+
+    Parameters
+    ----------
+    n : int
+        Quadrature order.
+    alpha : float
+        alpha must be > -1
+    beta : float
+        beta must be > -1
+    mu : bool, optional
+        If True, return the sum of the weights in addition to sample points and weights.
+
+    Returns
+    -------
+    x : ndarray
+        Sample points.
+    w : ndarray
+        Weights.
+    mu : float, optional
+        Sum of the weights, only returned if `mu=True`.
+
+    See Also
+    --------
+    scipy.integrate.fixed_quad
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    Examples
+    --------
+    >>> from scipy.special import roots_jacobi
+    >>> x, w = roots_jacobi(3, 0.5, 0.5)
+    >>> x
+    array([-0.70710678,  0.        ,  0.70710678])
+    >>> w
+    array([0.39269908, 0.78539816, 0.39269908])
+
+    >>> x, w, mu = roots_jacobi(3, 0.5, 0.5, mu=True)
+    >>> mu
+    1.5707963267948966  # Sum of weights, equals pi/2 for alpha = beta = 0.5
+    """
+
+    m = int(n)
+    if n < 1 or n != m:
+        raise ValueError("n must be a positive integer.")
+    if alpha <= -1 or beta <= -1:
+        raise ValueError("alpha and beta must be greater than -1.")
+
+    if alpha == 0.0 and beta == 0.0:
+        return roots_legendre(m, mu)
+    if alpha == beta:
+        return roots_gegenbauer(m, alpha+0.5, mu)
+
+    if (alpha + beta) <= 1000:
+        mu0 = 2.0**(alpha+beta+1) * _ufuncs.beta(alpha+1, beta+1)
+    else:
+        # Avoid overflows in pow and beta for very large parameters
+        mu0 = np.exp((alpha + beta + 1) * np.log(2.0)
+                     + _ufuncs.betaln(alpha+1, beta+1))
+    a = alpha
+    b = beta
+    if a + b == 0.0:
+        def an_func(k):
+            return np.where(k == 0, (b - a) / (2 + a + b), 0.0)
+    else:
+        def an_func(k):
+            return np.where(
+                k == 0,
+                (b - a) / (2 + a + b),
+                (b * b - a * a) / ((2.0 * k + a + b) * (2.0 * k + a + b + 2))
+            )
+
+    def bn_func(k):
+        return (
+            2.0 / (2.0 * k + a + b)
+            * np.sqrt((k + a) * (k + b) / (2 * k + a + b + 1))
+            * np.where(k == 1, 1.0, np.sqrt(k * (k + a + b) / (2.0 * k + a + b - 1)))
+        )
+
+    def f(n, x):
+        return _ufuncs.eval_jacobi(n, a, b, x)
+    def df(n, x):
+        return 0.5 * (n + a + b + 1) * _ufuncs.eval_jacobi(n - 1, a + 1, b + 1, x)
+    return _gen_roots_and_weights(m, mu0, an_func, bn_func, f, df, False, mu)
+
+
+def jacobi(n, alpha, beta, monic=False):
+    r"""Jacobi polynomial.
+
+    Defined to be the solution of
+
+    .. math::
+        (1 - x^2)\frac{d^2}{dx^2}P_n^{(\alpha, \beta)}
+          + (\beta - \alpha - (\alpha + \beta + 2)x)
+            \frac{d}{dx}P_n^{(\alpha, \beta)}
+          + n(n + \alpha + \beta + 1)P_n^{(\alpha, \beta)} = 0
+
+    for :math:`\alpha, \beta > -1`; :math:`P_n^{(\alpha, \beta)}` is a
+    polynomial of degree :math:`n`.
+
+    Parameters
+    ----------
+    n : int
+        Degree of the polynomial.
+    alpha : float
+        Parameter, must be greater than -1.
+    beta : float
+        Parameter, must be greater than -1.
+    monic : bool, optional
+        If `True`, scale the leading coefficient to be 1. Default is
+        `False`.
+
+    Returns
+    -------
+    P : orthopoly1d
+        Jacobi polynomial.
+
+    Notes
+    -----
+    For fixed :math:`\alpha, \beta`, the polynomials
+    :math:`P_n^{(\alpha, \beta)}` are orthogonal over :math:`[-1, 1]`
+    with weight function :math:`(1 - x)^\alpha(1 + x)^\beta`.
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    Examples
+    --------
+    The Jacobi polynomials satisfy the recurrence relation:
+
+    .. math::
+        P_n^{(\alpha, \beta-1)}(x) - P_n^{(\alpha-1, \beta)}(x)
+          = P_{n-1}^{(\alpha, \beta)}(x)
+
+    This can be verified, for example, for :math:`\alpha = \beta = 2`
+    and :math:`n = 1` over the interval :math:`[-1, 1]`:
+
+    >>> import numpy as np
+    >>> from scipy.special import jacobi
+    >>> x = np.arange(-1.0, 1.0, 0.01)
+    >>> np.allclose(jacobi(0, 2, 2)(x),
+    ...             jacobi(1, 2, 1)(x) - jacobi(1, 1, 2)(x))
+    True
+
+    Plot of the Jacobi polynomial :math:`P_5^{(\alpha, -0.5)}` for
+    different values of :math:`\alpha`:
+
+    >>> import matplotlib.pyplot as plt
+    >>> x = np.arange(-1.0, 1.0, 0.01)
+    >>> fig, ax = plt.subplots()
+    >>> ax.set_ylim(-2.0, 2.0)
+    >>> ax.set_title(r'Jacobi polynomials $P_5^{(\alpha, -0.5)}$')
+    >>> for alpha in np.arange(0, 4, 1):
+    ...     ax.plot(x, jacobi(5, alpha, -0.5)(x), label=rf'$\alpha={alpha}$')
+    >>> plt.legend(loc='best')
+    >>> plt.show()
+
+    """
+    if n < 0:
+        raise ValueError("n must be nonnegative.")
+
+    def wfunc(x):
+        return (1 - x) ** alpha * (1 + x) ** beta
+    if n == 0:
+        return orthopoly1d([], [], 1.0, 1.0, wfunc, (-1, 1), monic,
+                           eval_func=np.ones_like)
+    x, w, mu = roots_jacobi(n, alpha, beta, mu=True)
+    ab1 = alpha + beta + 1.0
+    hn = 2**ab1 / (2 * n + ab1) * _gam(n + alpha + 1)
+    hn *= _gam(n + beta + 1.0) / _gam(n + 1) / _gam(n + ab1)
+    kn = _gam(2 * n + ab1) / 2.0**n / _gam(n + 1) / _gam(n + ab1)
+    # here kn = coefficient on x^n term
+    p = orthopoly1d(x, w, hn, kn, wfunc, (-1, 1), monic,
+                    lambda x: _ufuncs.eval_jacobi(n, alpha, beta, x))
+    return p
+
+# Jacobi Polynomials shifted         G_n(p,q,x)
+
+
+def roots_sh_jacobi(n, p1, q1, mu=False):
+    """Gauss-Jacobi (shifted) quadrature.
+
+    Compute the sample points and weights for Gauss-Jacobi (shifted)
+    quadrature. The sample points are the roots of the nth degree
+    shifted Jacobi polynomial, :math:`G^{p,q}_n(x)`. These sample
+    points and weights correctly integrate polynomials of degree
+    :math:`2n - 1` or less over the interval :math:`[0, 1]` with
+    weight function :math:`w(x) = (1 - x)^{p-q} x^{q-1}`. See 22.2.2
+    in [AS]_ for details.
+
+    Parameters
+    ----------
+    n : int
+        quadrature order
+    p1 : float
+        (p1 - q1) must be > -1
+    q1 : float
+        q1 must be > 0
+    mu : bool, optional
+        If True, return the sum of the weights, optional.
+
+    Returns
+    -------
+    x : ndarray
+        Sample points
+    w : ndarray
+        Weights
+    mu : float
+        Sum of the weights
+
+    See Also
+    --------
+    scipy.integrate.fixed_quad
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    """
+    if (p1-q1) <= -1 or q1 <= 0:
+        message = "(p - q) must be greater than -1, and q must be greater than 0."
+        raise ValueError(message)
+    x, w, m = roots_jacobi(n, p1-q1, q1-1, True)
+    x = (x + 1) / 2
+    scale = 2.0**p1
+    w /= scale
+    m /= scale
+    if mu:
+        return x, w, m
+    else:
+        return x, w
+
+
+def sh_jacobi(n, p, q, monic=False):
+    r"""Shifted Jacobi polynomial.
+
+    Defined by
+
+    .. math::
+
+        G_n^{(p, q)}(x)
+          = \binom{2n + p - 1}{n}^{-1}P_n^{(p - q, q - 1)}(2x - 1),
+
+    where :math:`P_n^{(\cdot, \cdot)}` is the nth Jacobi polynomial.
+
+    Parameters
+    ----------
+    n : int
+        Degree of the polynomial.
+    p : float
+        Parameter, must have :math:`p > q - 1`.
+    q : float
+        Parameter, must be greater than 0.
+    monic : bool, optional
+        If `True`, scale the leading coefficient to be 1. Default is
+        `False`.
+
+    Returns
+    -------
+    G : orthopoly1d
+        Shifted Jacobi polynomial.
+
+    Notes
+    -----
+    For fixed :math:`p, q`, the polynomials :math:`G_n^{(p, q)}` are
+    orthogonal over :math:`[0, 1]` with weight function :math:`(1 -
+    x)^{p - q}x^{q - 1}`.
+
+    """
+    if n < 0:
+        raise ValueError("n must be nonnegative.")
+
+    def wfunc(x):
+        return (1.0 - x) ** (p - q) * x ** (q - 1.0)
+    if n == 0:
+        return orthopoly1d([], [], 1.0, 1.0, wfunc, (-1, 1), monic,
+                           eval_func=np.ones_like)
+    n1 = n
+    x, w = roots_sh_jacobi(n1, p, q)
+    hn = _gam(n + 1) * _gam(n + q) * _gam(n + p) * _gam(n + p - q + 1)
+    hn /= (2 * n + p) * (_gam(2 * n + p)**2)
+    # kn = 1.0 in standard form so monic is redundant. Kept for compatibility.
+    kn = 1.0
+    pp = orthopoly1d(x, w, hn, kn, wfunc=wfunc, limits=(0, 1), monic=monic,
+                     eval_func=lambda x: _ufuncs.eval_sh_jacobi(n, p, q, x))
+    return pp
+
+# Generalized Laguerre               L^(alpha)_n(x)
+
+
+def roots_genlaguerre(n, alpha, mu=False):
+    r"""Gauss-generalized Laguerre quadrature.
+
+    Compute the sample points and weights for Gauss-generalized
+    Laguerre quadrature. The sample points are the roots of the nth
+    degree generalized Laguerre polynomial, :math:`L^{\alpha}_n(x)`.
+    These sample points and weights correctly integrate polynomials of
+    degree :math:`2n - 1` or less over the interval :math:`[0,
+    \infty]` with weight function :math:`w(x) = x^{\alpha}
+    e^{-x}`. See 22.3.9 in [AS]_ for details.
+
+    Parameters
+    ----------
+    n : int
+        quadrature order
+    alpha : float
+        alpha must be > -1
+    mu : bool, optional
+        If True, return the sum of the weights, optional.
+
+    Returns
+    -------
+    x : ndarray
+        Sample points
+    w : ndarray
+        Weights
+    mu : float
+        Sum of the weights
+
+    See Also
+    --------
+    scipy.integrate.fixed_quad
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    """
+    m = int(n)
+    if n < 1 or n != m:
+        raise ValueError("n must be a positive integer.")
+    if alpha < -1:
+        raise ValueError("alpha must be greater than -1.")
+
+    mu0 = _ufuncs.gamma(alpha + 1)
+
+    if m == 1:
+        x = np.array([alpha+1.0], 'd')
+        w = np.array([mu0], 'd')
+        if mu:
+            return x, w, mu0
+        else:
+            return x, w
+
+    def an_func(k):
+        return 2 * k + alpha + 1
+    def bn_func(k):
+        return -np.sqrt(k * (k + alpha))
+    def f(n, x):
+        return _ufuncs.eval_genlaguerre(n, alpha, x)
+    def df(n, x):
+        return (n * _ufuncs.eval_genlaguerre(n, alpha, x)
+                - (n + alpha) * _ufuncs.eval_genlaguerre(n - 1, alpha, x)) / x
+    return _gen_roots_and_weights(m, mu0, an_func, bn_func, f, df, False, mu)
+
+
+def genlaguerre(n, alpha, monic=False):
+    r"""Generalized (associated) Laguerre polynomial.
+
+    Defined to be the solution of
+
+    .. math::
+        x\frac{d^2}{dx^2}L_n^{(\alpha)}
+          + (\alpha + 1 - x)\frac{d}{dx}L_n^{(\alpha)}
+          + nL_n^{(\alpha)} = 0,
+
+    where :math:`\alpha > -1`; :math:`L_n^{(\alpha)}` is a polynomial
+    of degree :math:`n`.
+
+    Parameters
+    ----------
+    n : int
+        Degree of the polynomial.
+    alpha : float
+        Parameter, must be greater than -1.
+    monic : bool, optional
+        If `True`, scale the leading coefficient to be 1. Default is
+        `False`.
+
+    Returns
+    -------
+    L : orthopoly1d
+        Generalized Laguerre polynomial.
+
+    See Also
+    --------
+    laguerre : Laguerre polynomial.
+    hyp1f1 : confluent hypergeometric function
+
+    Notes
+    -----
+    For fixed :math:`\alpha`, the polynomials :math:`L_n^{(\alpha)}`
+    are orthogonal over :math:`[0, \infty)` with weight function
+    :math:`e^{-x}x^\alpha`.
+
+    The Laguerre polynomials are the special case where :math:`\alpha
+    = 0`.
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    Examples
+    --------
+    The generalized Laguerre polynomials are closely related to the confluent
+    hypergeometric function :math:`{}_1F_1`:
+
+        .. math::
+            L_n^{(\alpha)} = \binom{n + \alpha}{n} {}_1F_1(-n, \alpha +1, x)
+
+    This can be verified, for example,  for :math:`n = \alpha = 3` over the
+    interval :math:`[-1, 1]`:
+
+    >>> import numpy as np
+    >>> from scipy.special import binom
+    >>> from scipy.special import genlaguerre
+    >>> from scipy.special import hyp1f1
+    >>> x = np.arange(-1.0, 1.0, 0.01)
+    >>> np.allclose(genlaguerre(3, 3)(x), binom(6, 3) * hyp1f1(-3, 4, x))
+    True
+
+    This is the plot of the generalized Laguerre polynomials
+    :math:`L_3^{(\alpha)}` for some values of :math:`\alpha`:
+
+    >>> import matplotlib.pyplot as plt
+    >>> x = np.arange(-4.0, 12.0, 0.01)
+    >>> fig, ax = plt.subplots()
+    >>> ax.set_ylim(-5.0, 10.0)
+    >>> ax.set_title(r'Generalized Laguerre polynomials $L_3^{\alpha}$')
+    >>> for alpha in np.arange(0, 5):
+    ...     ax.plot(x, genlaguerre(3, alpha)(x), label=rf'$L_3^{(alpha)}$')
+    >>> plt.legend(loc='best')
+    >>> plt.show()
+
+    """
+    if alpha <= -1:
+        raise ValueError("alpha must be > -1")
+    if n < 0:
+        raise ValueError("n must be nonnegative.")
+
+    if n == 0:
+        n1 = n + 1
+    else:
+        n1 = n
+    x, w = roots_genlaguerre(n1, alpha)
+    def wfunc(x):
+        return exp(-x) * x ** alpha
+    if n == 0:
+        x, w = [], []
+    hn = _gam(n + alpha + 1) / _gam(n + 1)
+    kn = (-1)**n / _gam(n + 1)
+    p = orthopoly1d(x, w, hn, kn, wfunc, (0, inf), monic,
+                    lambda x: _ufuncs.eval_genlaguerre(n, alpha, x))
+    return p
+
+# Laguerre                      L_n(x)
+
+
+def roots_laguerre(n, mu=False):
+    r"""Gauss-Laguerre quadrature.
+
+    Compute the sample points and weights for Gauss-Laguerre
+    quadrature. The sample points are the roots of the nth degree
+    Laguerre polynomial, :math:`L_n(x)`. These sample points and
+    weights correctly integrate polynomials of degree :math:`2n - 1`
+    or less over the interval :math:`[0, \infty)` with weight function
+    :math:`w(x) = e^{-x}`. See 22.2.13 in [AS]_ for details.
+
+    This is `roots_genlaguerre` evaluated with ``alpha = 0``.
+
+    Parameters
+    ----------
+    n : int
+        quadrature order
+    mu : bool, optional
+        If True, return the sum of the weights in addition to the sample
+        points and weights.
+
+    Returns
+    -------
+    x : ndarray
+        Sample points
+    w : ndarray
+        Weights
+    mu : float
+        Sum of the weights, only returned if `mu` is True.
+
+    See Also
+    --------
+    scipy.integrate.fixed_quad
+    numpy.polynomial.laguerre.laggauss
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    """
+    return roots_genlaguerre(n, 0.0, mu=mu)
+
+
+def laguerre(n, monic=False):
+    r"""Laguerre polynomial.
+
+    Defined to be the solution of
+
+    .. math::
+        x\frac{d^2}{dx^2}L_n + (1 - x)\frac{d}{dx}L_n + nL_n = 0;
+
+    :math:`L_n` is a polynomial of degree :math:`n`.
+
+    Parameters
+    ----------
+    n : int
+        Degree of the polynomial.
+    monic : bool, optional
+        If `True`, scale the leading coefficient to be 1. Default is
+        `False`.
+
+    Returns
+    -------
+    L : orthopoly1d
+        Laguerre Polynomial.
+
+    See Also
+    --------
+    genlaguerre : Generalized (associated) Laguerre polynomial.
+
+    Notes
+    -----
+    The polynomials :math:`L_n` are orthogonal over :math:`[0,
+    \infty)` with weight function :math:`e^{-x}`.
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    Examples
+    --------
+    The Laguerre polynomials :math:`L_n` are the special case
+    :math:`\alpha = 0` of the generalized Laguerre polynomials
+    :math:`L_n^{(\alpha)}`.
+    Let's verify it on the interval :math:`[-1, 1]`:
+
+    >>> import numpy as np
+    >>> from scipy.special import genlaguerre
+    >>> from scipy.special import laguerre
+    >>> x = np.arange(-1.0, 1.0, 0.01)
+    >>> np.allclose(genlaguerre(3, 0)(x), laguerre(3)(x))
+    True
+
+    The polynomials :math:`L_n` also satisfy the recurrence relation:
+
+    .. math::
+        (n + 1)L_{n+1}(x) = (2n +1 -x)L_n(x) - nL_{n-1}(x)
+
+    This can be easily checked on :math:`[0, 1]` for :math:`n = 3`:
+
+    >>> x = np.arange(0.0, 1.0, 0.01)
+    >>> np.allclose(4 * laguerre(4)(x),
+    ...             (7 - x) * laguerre(3)(x) - 3 * laguerre(2)(x))
+    True
+
+    This is the plot of the first few Laguerre polynomials :math:`L_n`:
+
+    >>> import matplotlib.pyplot as plt
+    >>> x = np.arange(-1.0, 5.0, 0.01)
+    >>> fig, ax = plt.subplots()
+    >>> ax.set_ylim(-5.0, 5.0)
+    >>> ax.set_title(r'Laguerre polynomials $L_n$')
+    >>> for n in np.arange(0, 5):
+    ...     ax.plot(x, laguerre(n)(x), label=rf'$L_{n}$')
+    >>> plt.legend(loc='best')
+    >>> plt.show()
+
+    """
+    if n < 0:
+        raise ValueError("n must be nonnegative.")
+
+    if n == 0:
+        n1 = n + 1
+    else:
+        n1 = n
+    x, w = roots_laguerre(n1)
+    if n == 0:
+        x, w = [], []
+    hn = 1.0
+    kn = (-1)**n / _gam(n + 1)
+    p = orthopoly1d(x, w, hn, kn, lambda x: exp(-x), (0, inf), monic,
+                    lambda x: _ufuncs.eval_laguerre(n, x))
+    return p
+
+# Hermite  1                         H_n(x)
+
+
+def roots_hermite(n, mu=False):
+    r"""Gauss-Hermite (physicist's) quadrature.
+
+    Compute the sample points and weights for Gauss-Hermite
+    quadrature. The sample points are the roots of the nth degree
+    Hermite polynomial, :math:`H_n(x)`. These sample points and
+    weights correctly integrate polynomials of degree :math:`2n - 1`
+    or less over the interval :math:`[-\infty, \infty]` with weight
+    function :math:`w(x) = e^{-x^2}`. See 22.2.14 in [AS]_ for
+    details.
+
+    Parameters
+    ----------
+    n : int
+        quadrature order
+    mu : bool, optional
+        If True, return the sum of the weights, optional.
+
+    Returns
+    -------
+    x : ndarray
+        Sample points
+    w : ndarray
+        Weights
+    mu : float
+        Sum of the weights
+
+    See Also
+    --------
+    scipy.integrate.fixed_quad
+    numpy.polynomial.hermite.hermgauss
+    roots_hermitenorm
+
+    Notes
+    -----
+    For small n up to 150 a modified version of the Golub-Welsch
+    algorithm is used. Nodes are computed from the eigenvalue
+    problem and improved by one step of a Newton iteration.
+    The weights are computed from the well-known analytical formula.
+
+    For n larger than 150 an optimal asymptotic algorithm is applied
+    which computes nodes and weights in a numerically stable manner.
+    The algorithm has linear runtime making computation for very
+    large n (several thousand or more) feasible.
+
+    References
+    ----------
+    .. [townsend.trogdon.olver-2014]
+        Townsend, A. and Trogdon, T. and Olver, S. (2014)
+        *Fast computation of Gauss quadrature nodes and
+        weights on the whole real line*. :arXiv:`1410.5286`.
+    .. [townsend.trogdon.olver-2015]
+        Townsend, A. and Trogdon, T. and Olver, S. (2015)
+        *Fast computation of Gauss quadrature nodes and
+        weights on the whole real line*.
+        IMA Journal of Numerical Analysis
+        :doi:`10.1093/imanum/drv002`.
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    """
+    m = int(n)
+    if n < 1 or n != m:
+        raise ValueError("n must be a positive integer.")
+
+    mu0 = np.sqrt(np.pi)
+    if n <= 150:
+        def an_func(k):
+            return 0.0 * k
+        def bn_func(k):
+            return np.sqrt(k / 2.0)
+        f = _ufuncs.eval_hermite
+        def df(n, x):
+            return 2.0 * n * _ufuncs.eval_hermite(n - 1, x)
+        return _gen_roots_and_weights(m, mu0, an_func, bn_func, f, df, True, mu)
+    else:
+        nodes, weights = _roots_hermite_asy(m)
+        if mu:
+            return nodes, weights, mu0
+        else:
+            return nodes, weights
+
+
+def _compute_tauk(n, k, maxit=5):
+    """Helper function for Tricomi initial guesses
+
+    For details, see formula 3.1 in lemma 3.1 in the
+    original paper.
+
+    Parameters
+    ----------
+    n : int
+        Quadrature order
+    k : ndarray of type int
+        Index of roots :math:`\tau_k` to compute
+    maxit : int
+        Number of Newton maxit performed, the default
+        value of 5 is sufficient.
+
+    Returns
+    -------
+    tauk : ndarray
+        Roots of equation 3.1
+
+    See Also
+    --------
+    initial_nodes_a
+    roots_hermite_asy
+    """
+    a = n % 2 - 0.5
+    c = (4.0*floor(n/2.0) - 4.0*k + 3.0)*pi / (4.0*floor(n/2.0) + 2.0*a + 2.0)
+    def f(x):
+        return x - sin(x) - c
+    def df(x):
+        return 1.0 - cos(x)
+    xi = 0.5*pi
+    for i in range(maxit):
+        xi = xi - f(xi)/df(xi)
+    return xi
+
+
+def _initial_nodes_a(n, k):
+    r"""Tricomi initial guesses
+
+    Computes an initial approximation to the square of the `k`-th
+    (positive) root :math:`x_k` of the Hermite polynomial :math:`H_n`
+    of order :math:`n`. The formula is the one from lemma 3.1 in the
+    original paper. The guesses are accurate except in the region
+    near :math:`\sqrt{2n + 1}`.
+
+    Parameters
+    ----------
+    n : int
+        Quadrature order
+    k : ndarray of type int
+        Index of roots to compute
+
+    Returns
+    -------
+    xksq : ndarray
+        Square of the approximate roots
+
+    See Also
+    --------
+    initial_nodes
+    roots_hermite_asy
+    """
+    tauk = _compute_tauk(n, k)
+    sigk = cos(0.5*tauk)**2
+    a = n % 2 - 0.5
+    nu = 4.0*floor(n/2.0) + 2.0*a + 2.0
+    # Initial approximation of Hermite roots (square)
+    xksq = nu*sigk - 1.0/(3.0*nu) * (5.0/(4.0*(1.0-sigk)**2) - 1.0/(1.0-sigk) - 0.25)
+    return xksq
+
+
+def _initial_nodes_b(n, k):
+    r"""Gatteschi initial guesses
+
+    Computes an initial approximation to the square of the kth
+    (positive) root :math:`x_k` of the Hermite polynomial :math:`H_n`
+    of order :math:`n`. The formula is the one from lemma 3.2 in the
+    original paper. The guesses are accurate in the region just
+    below :math:`\sqrt{2n + 1}`.
+
+    Parameters
+    ----------
+    n : int
+        Quadrature order
+    k : ndarray of type int
+        Index of roots to compute
+
+    Returns
+    -------
+    xksq : ndarray
+        Square of the approximate root
+
+    See Also
+    --------
+    initial_nodes
+    roots_hermite_asy
+    """
+    a = n % 2 - 0.5
+    nu = 4.0*floor(n/2.0) + 2.0*a + 2.0
+    # Airy roots by approximation
+    ak = _specfun.airyzo(k.max(), 1)[0][::-1]
+    # Initial approximation of Hermite roots (square)
+    xksq = (nu
+            + 2.0**(2.0/3.0) * ak * nu**(1.0/3.0)
+            + 1.0/5.0 * 2.0**(4.0/3.0) * ak**2 * nu**(-1.0/3.0)
+            + (9.0/140.0 - 12.0/175.0 * ak**3) * nu**(-1.0)
+            + (16.0/1575.0 * ak + 92.0/7875.0 * ak**4) * 2.0**(2.0/3.0) * nu**(-5.0/3.0)
+            - (15152.0/3031875.0 * ak**5 + 1088.0/121275.0 * ak**2)
+              * 2.0**(1.0/3.0) * nu**(-7.0/3.0))
+    return xksq
+
+
+def _initial_nodes(n):
+    """Initial guesses for the Hermite roots
+
+    Computes an initial approximation to the non-negative
+    roots :math:`x_k` of the Hermite polynomial :math:`H_n`
+    of order :math:`n`. The Tricomi and Gatteschi initial
+    guesses are used in the region where they are accurate.
+
+    Parameters
+    ----------
+    n : int
+        Quadrature order
+
+    Returns
+    -------
+    xk : ndarray
+        Approximate roots
+
+    See Also
+    --------
+    roots_hermite_asy
+    """
+    # Turnover point
+    # linear polynomial fit to error of 10, 25, 40, ..., 1000 point rules
+    fit = 0.49082003*n - 4.37859653
+    turnover = around(fit).astype(int)
+    # Compute all approximations
+    ia = arange(1, int(floor(n*0.5)+1))
+    ib = ia[::-1]
+    xasq = _initial_nodes_a(n, ia[:turnover+1])
+    xbsq = _initial_nodes_b(n, ib[turnover+1:])
+    # Combine
+    iv = sqrt(hstack([xasq, xbsq]))
+    # Central node is always zero
+    if n % 2 == 1:
+        iv = hstack([0.0, iv])
+    return iv
+
+
+def _pbcf(n, theta):
+    r"""Asymptotic series expansion of parabolic cylinder function
+
+    The implementation is based on sections 3.2 and 3.3 from the
+    original paper. Compared to the published version this code
+    adds one more term to the asymptotic series. The detailed
+    formulas can be found at [parabolic-asymptotics]_. The evaluation
+    is done in a transformed variable :math:`\theta := \arccos(t)`
+    where :math:`t := x / \mu` and :math:`\mu := \sqrt{2n + 1}`.
+
+    Parameters
+    ----------
+    n : int
+        Quadrature order
+    theta : ndarray
+        Transformed position variable
+
+    Returns
+    -------
+    U : ndarray
+        Value of the parabolic cylinder function :math:`U(a, \theta)`.
+    Ud : ndarray
+        Value of the derivative :math:`U^{\prime}(a, \theta)` of
+        the parabolic cylinder function.
+
+    See Also
+    --------
+    roots_hermite_asy
+
+    Notes
+    -----
+    Inside this function the local variable ``mu`` is assigned
+    ``2n + 1``, i.e. the *square* of the :math:`\mu` defined above.
+    The fractional powers of ``mu`` in the body are presumably written
+    with that convention in mind (e.g. ``mu**(1.0/6.0)`` standing for
+    :math:`\mu^{1/3}`); verify against the DLMF formulas before
+    changing any exponent.
+
+    References
+    ----------
+    .. [parabolic-asymptotics]
+       https://dlmf.nist.gov/12.10#vii
+    """
+    st = sin(theta)
+    ct = cos(theta)
+    # https://dlmf.nist.gov/12.10#vii
+    # NOTE: this is 2n + 1 (the square of the docstring's mu), not sqrt(2n + 1)
+    mu = 2.0*n + 1.0
+    # https://dlmf.nist.gov/12.10#E23
+    eta = 0.5*theta - 0.5*st*ct
+    # https://dlmf.nist.gov/12.10#E39
+    zeta = -(3.0*eta/2.0) ** (2.0/3.0)
+    # https://dlmf.nist.gov/12.10#E40
+    phi = (-zeta / st**2) ** (0.25)
+    # Coefficients
+    # https://dlmf.nist.gov/12.10#E43
+    a0 = 1.0
+    a1 = 0.10416666666666666667
+    a2 = 0.08355034722222222222
+    a3 = 0.12822657455632716049
+    a4 = 0.29184902646414046425
+    a5 = 0.88162726744375765242
+    b0 = 1.0
+    b1 = -0.14583333333333333333
+    b2 = -0.09874131944444444444
+    b3 = -0.14331205391589506173
+    b4 = -0.31722720267841354810
+    b5 = -0.94242914795712024914
+    # Polynomials
+    # https://dlmf.nist.gov/12.10#E9
+    # https://dlmf.nist.gov/12.10#E10
+    # Row k of ctp holds ct**k for k = 0..15, so each power is computed once
+    ctp = ct ** arange(16).reshape((-1,1))
+    u0 = 1.0
+    u1 = (1.0*ctp[3,:] - 6.0*ct) / 24.0
+    u2 = (-9.0*ctp[4,:] + 249.0*ctp[2,:] + 145.0) / 1152.0
+    u3 = (-4042.0*ctp[9,:] + 18189.0*ctp[7,:] - 28287.0*ctp[5,:]
+          - 151995.0*ctp[3,:] - 259290.0*ct) / 414720.0
+    u4 = (72756.0*ctp[10,:] - 321339.0*ctp[8,:] - 154982.0*ctp[6,:]
+          + 50938215.0*ctp[4,:] + 122602962.0*ctp[2,:] + 12773113.0) / 39813120.0
+    u5 = (82393456.0*ctp[15,:] - 617950920.0*ctp[13,:] + 1994971575.0*ctp[11,:]
+          - 3630137104.0*ctp[9,:] + 4433574213.0*ctp[7,:] - 37370295816.0*ctp[5,:]
+          - 119582875013.0*ctp[3,:] - 34009066266.0*ct) / 6688604160.0
+    v0 = 1.0
+    v1 = (1.0*ctp[3,:] + 6.0*ct) / 24.0
+    v2 = (15.0*ctp[4,:] - 327.0*ctp[2,:] - 143.0) / 1152.0
+    v3 = (-4042.0*ctp[9,:] + 18189.0*ctp[7,:] - 36387.0*ctp[5,:] 
+          + 238425.0*ctp[3,:] + 259290.0*ct) / 414720.0
+    v4 = (-121260.0*ctp[10,:] + 551733.0*ctp[8,:] - 151958.0*ctp[6,:]
+          - 57484425.0*ctp[4,:] - 132752238.0*ctp[2,:] - 12118727) / 39813120.0
+    v5 = (82393456.0*ctp[15,:] - 617950920.0*ctp[13,:] + 2025529095.0*ctp[11,:]
+          - 3750839308.0*ctp[9,:] + 3832454253.0*ctp[7,:] + 35213253348.0*ctp[5,:]
+          + 130919230435.0*ctp[3,:] + 34009066266*ct) / 6688604160.0
+    # Airy Evaluation (Bi and Bip unused)
+    Ai, Aip, Bi, Bip = airy(mu**(4.0/6.0) * zeta)
+    # Prefactor for U
+    P = 2.0*sqrt(pi) * mu**(1.0/6.0) * phi
+    # Terms for U
+    # https://dlmf.nist.gov/12.10#E42
+    # Rows of phip hold phi**6, phi**12, phi**18, phi**24, phi**30
+    phip = phi ** arange(6, 31, 6).reshape((-1,1))
+    A0 = b0*u0
+    A1 = (b2*u0 + phip[0,:]*b1*u1 + phip[1,:]*b0*u2) / zeta**3
+    A2 = (b4*u0 + phip[0,:]*b3*u1 + phip[1,:]*b2*u2 + phip[2,:]*b1*u3
+          + phip[3,:]*b0*u4) / zeta**6
+    B0 = -(a1*u0 + phip[0,:]*a0*u1) / zeta**2
+    B1 = -(a3*u0 + phip[0,:]*a2*u1 + phip[1,:]*a1*u2 + phip[2,:]*a0*u3) / zeta**5
+    B2 = -(a5*u0 + phip[0,:]*a4*u1 + phip[1,:]*a3*u2 + phip[2,:]*a2*u3
+           + phip[3,:]*a1*u4 + phip[4,:]*a0*u5) / zeta**8
+    # U
+    # https://dlmf.nist.gov/12.10#E35
+    U = P * (Ai * (A0 + A1/mu**2.0 + A2/mu**4.0) +
+             Aip * (B0 + B1/mu**2.0 + B2/mu**4.0) / mu**(8.0/6.0))
+    # Prefactor for derivative of U
+    Pd = sqrt(2.0*pi) * mu**(2.0/6.0) / phi
+    # Terms for derivative of U
+    # https://dlmf.nist.gov/12.10#E46
+    C0 = -(b1*v0 + phip[0,:]*b0*v1) / zeta
+    C1 = -(b3*v0 + phip[0,:]*b2*v1 + phip[1,:]*b1*v2 + phip[2,:]*b0*v3) / zeta**4
+    C2 = -(b5*v0 + phip[0,:]*b4*v1 + phip[1,:]*b3*v2 + phip[2,:]*b2*v3
+           + phip[3,:]*b1*v4 + phip[4,:]*b0*v5) / zeta**7
+    D0 = a0*v0
+    D1 = (a2*v0 + phip[0,:]*a1*v1 + phip[1,:]*a0*v2) / zeta**3
+    D2 = (a4*v0 + phip[0,:]*a3*v1 + phip[1,:]*a2*v2 + phip[2,:]*a1*v3
+          + phip[3,:]*a0*v4) / zeta**6
+    # Derivative of U
+    # https://dlmf.nist.gov/12.10#E36
+    Ud = Pd * (Ai * (C0 + C1/mu**2.0 + C2/mu**4.0) / mu**(4.0/6.0) +
+               Aip * (D0 + D1/mu**2.0 + D2/mu**4.0))
+    return U, Ud
+
+
+def _newton(n, x_initial, maxit=5):
+    """Newton iteration for polishing the asymptotic approximation
+    to the zeros of the Hermite polynomials.
+
+    Parameters
+    ----------
+    n : int
+        Quadrature order
+    x_initial : ndarray
+        Initial guesses for the roots
+    maxit : int
+        Maximal number of Newton iterations.
+        The default 5 is sufficient, usually
+        only one or two steps are needed.
+
+    Returns
+    -------
+    nodes : ndarray
+        Quadrature nodes
+    weights : ndarray
+        Quadrature weights
+
+    See Also
+    --------
+    roots_hermite_asy
+    """
+    # Variable transformation
+    mu = sqrt(2.0*n + 1.0)
+    t = x_initial / mu
+    theta = arccos(t)
+    # Newton iteration
+    for i in range(maxit):
+        u, ud = _pbcf(n, theta)
+        dtheta = u / (sqrt(2.0) * mu * sin(theta) * ud)
+        theta = theta + dtheta
+        if max(abs(dtheta)) < 1e-14:
+            break
+    # Undo variable transformation
+    x = mu * cos(theta)
+    # Central node is always zero
+    if n % 2 == 1:
+        x[0] = 0.0
+    # Compute weights
+    w = exp(-x**2) / (2.0*ud**2)
+    return x, w
+
+
+def _roots_hermite_asy(n):
+    r"""Gauss-Hermite (physicist's) quadrature for large n.
+
+    Computes the sample points and weights for Gauss-Hermite quadrature.
+    The sample points are the roots of the nth degree Hermite polynomial,
+    :math:`H_n(x)`. These sample points and weights correctly integrate
+    polynomials of degree :math:`2n - 1` or less over the interval
+    :math:`[-\infty, \infty]` with weight function :math:`f(x) = e^{-x^2}`.
+
+    This method relies on asymptotic expansions which work best for n > 150.
+    The algorithm has linear runtime making computation for very large n
+    feasible.
+
+    Parameters
+    ----------
+    n : int
+        quadrature order
+
+    Returns
+    -------
+    nodes : ndarray
+        Quadrature nodes
+    weights : ndarray
+        Quadrature weights
+
+    See Also
+    --------
+    roots_hermite
+
+    References
+    ----------
+    .. [townsend.trogdon.olver-2014]
+       Townsend, A. and Trogdon, T. and Olver, S. (2014)
+       *Fast computation of Gauss quadrature nodes and
+       weights on the whole real line*. :arXiv:`1410.5286`.
+
+    .. [townsend.trogdon.olver-2015]
+       Townsend, A. and Trogdon, T. and Olver, S. (2015)
+       *Fast computation of Gauss quadrature nodes and
+       weights on the whole real line*.
+       IMA Journal of Numerical Analysis
+       :doi:`10.1093/imanum/drv002`.
+    """
+    iv = _initial_nodes(n)
+    nodes, weights = _newton(n, iv)
+    # Combine with negative parts
+    if n % 2 == 0:
+        nodes = hstack([-nodes[::-1], nodes])
+        weights = hstack([weights[::-1], weights])
+    else:
+        nodes = hstack([-nodes[-1:0:-1], nodes])
+        weights = hstack([weights[-1:0:-1], weights])
+    # Scale weights
+    weights *= sqrt(pi) / sum(weights)
+    return nodes, weights
+
+
+def hermite(n, monic=False):
+    r"""Physicist's Hermite polynomial.
+
+    Defined by
+
+    .. math::
+
+        H_n(x) = (-1)^ne^{x^2}\frac{d^n}{dx^n}e^{-x^2};
+
+    :math:`H_n` is a polynomial of degree :math:`n`.
+
+    Parameters
+    ----------
+    n : int
+        Degree of the polynomial.
+    monic : bool, optional
+        If `True`, scale the leading coefficient to be 1. Default is
+        `False`.
+
+    Returns
+    -------
+    H : orthopoly1d
+        Hermite polynomial.
+
+    Notes
+    -----
+    The polynomials :math:`H_n` are orthogonal over :math:`(-\infty,
+    \infty)` with weight function :math:`e^{-x^2}`.
+
+    Examples
+    --------
+    >>> from scipy import special
+    >>> import matplotlib.pyplot as plt
+    >>> import numpy as np
+
+    >>> p_monic = special.hermite(3, monic=True)
+    >>> p_monic
+    poly1d([ 1. ,  0. , -1.5,  0. ])
+    >>> p_monic(1)
+    -0.49999999999999983
+    >>> x = np.linspace(-3, 3, 400)
+    >>> y = p_monic(x)
+    >>> plt.plot(x, y)
+    >>> plt.title("Monic Hermite polynomial of degree 3")
+    >>> plt.xlabel("x")
+    >>> plt.ylabel("H_3(x)")
+    >>> plt.show()
+
+    """
+    if n < 0:
+        raise ValueError("n must be nonnegative.")
+
+    if n == 0:
+        n1 = n + 1
+    else:
+        n1 = n
+    x, w = roots_hermite(n1)
+    def wfunc(x):
+        return exp(-x * x)
+    if n == 0:
+        x, w = [], []
+    hn = 2**n * _gam(n + 1) * sqrt(pi)
+    kn = 2**n
+    p = orthopoly1d(x, w, hn, kn, wfunc, (-inf, inf), monic,
+                    lambda x: _ufuncs.eval_hermite(n, x))
+    return p
+
+# Hermite  2                         He_n(x)
+
+
+def roots_hermitenorm(n, mu=False):
+    r"""Gauss-Hermite (statistician's) quadrature.
+
+    Compute the sample points and weights for Gauss-Hermite
+    quadrature. The sample points are the roots of the nth degree
+    Hermite polynomial, :math:`He_n(x)`. These sample points and
+    weights correctly integrate polynomials of degree :math:`2n - 1`
+    or less over the interval :math:`[-\infty, \infty]` with weight
+    function :math:`w(x) = e^{-x^2/2}`. See 22.2.15 in [AS]_ for more
+    details.
+
+    Parameters
+    ----------
+    n : int
+        quadrature order
+    mu : bool, optional
+        If True, return the sum of the weights, optional.
+
+    Returns
+    -------
+    x : ndarray
+        Sample points
+    w : ndarray
+        Weights
+    mu : float
+        Sum of the weights
+
+    See Also
+    --------
+    scipy.integrate.fixed_quad
+    numpy.polynomial.hermite_e.hermegauss
+
+    Notes
+    -----
+    For small n up to 150 a modified version of the Golub-Welsch
+    algorithm is used. Nodes are computed from the eigenvalue
+    problem and improved by one step of a Newton iteration.
+    The weights are computed from the well-known analytical formula.
+
+    For n larger than 150 an optimal asymptotic algorithm is used
+    which computes nodes and weights in a numerical stable manner.
+    The algorithm has linear runtime making computation for very
+    large n (several thousand or more) feasible.
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    """
+    m = int(n)
+    if n < 1 or n != m:
+        raise ValueError("n must be a positive integer.")
+
+    mu0 = np.sqrt(2.0*np.pi)
+    if n <= 150:
+        def an_func(k):
+            return 0.0 * k
+        def bn_func(k):
+            return np.sqrt(k)
+        f = _ufuncs.eval_hermitenorm
+        def df(n, x):
+            return n * _ufuncs.eval_hermitenorm(n - 1, x)
+        return _gen_roots_and_weights(m, mu0, an_func, bn_func, f, df, True, mu)
+    else:
+        nodes, weights = _roots_hermite_asy(m)
+        # Transform
+        nodes *= sqrt(2)
+        weights *= sqrt(2)
+        if mu:
+            return nodes, weights, mu0
+        else:
+            return nodes, weights
+
+
+def hermitenorm(n, monic=False):
+    r"""Normalized (probabilist's) Hermite polynomial.
+
+    Defined by
+
+    .. math::
+
+        He_n(x) = (-1)^ne^{x^2/2}\frac{d^n}{dx^n}e^{-x^2/2};
+
+    :math:`He_n` is a polynomial of degree :math:`n`.
+
+    Parameters
+    ----------
+    n : int
+        Degree of the polynomial.
+    monic : bool, optional
+        If `True`, scale the leading coefficient to be 1. Default is
+        `False`.
+
+    Returns
+    -------
+    He : orthopoly1d
+        Hermite polynomial.
+
+    Notes
+    -----
+
+    The polynomials :math:`He_n` are orthogonal over :math:`(-\infty,
+    \infty)` with weight function :math:`e^{-x^2/2}`.
+
+    """
+    if n < 0:
+        raise ValueError("n must be nonnegative.")
+
+    if n == 0:
+        n1 = n + 1
+    else:
+        n1 = n
+    x, w = roots_hermitenorm(n1)
+    def wfunc(x):
+        return exp(-x * x / 2.0)
+    if n == 0:
+        x, w = [], []
+    hn = sqrt(2 * pi) * _gam(n + 1)
+    kn = 1.0
+    p = orthopoly1d(x, w, hn, kn, wfunc=wfunc, limits=(-inf, inf), monic=monic,
+                    eval_func=lambda x: _ufuncs.eval_hermitenorm(n, x))
+    return p
+
+# The remainder of the polynomials can be derived from the ones above.
+
+# Ultraspherical (Gegenbauer)        C^(alpha)_n(x)
+
+
+def roots_gegenbauer(n, alpha, mu=False):
+    r"""Gauss-Gegenbauer quadrature.
+
+    Compute the sample points and weights for Gauss-Gegenbauer
+    quadrature. The sample points are the roots of the nth degree
+    Gegenbauer polynomial, :math:`C^{\alpha}_n(x)`. These sample
+    points and weights correctly integrate polynomials of degree
+    :math:`2n - 1` or less over the interval :math:`[-1, 1]` with
+    weight function :math:`w(x) = (1 - x^2)^{\alpha - 1/2}`. See
+    22.2.3 in [AS]_ for more details.
+
+    Parameters
+    ----------
+    n : int
+        quadrature order
+    alpha : float
+        alpha must be > -0.5
+    mu : bool, optional
+        If True, return the sum of the weights, optional.
+
+    Returns
+    -------
+    x : ndarray
+        Sample points
+    w : ndarray
+        Weights
+    mu : float
+        Sum of the weights
+
+    See Also
+    --------
+    scipy.integrate.fixed_quad
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    """
+    m = int(n)
+    if n < 1 or n != m:
+        raise ValueError("n must be a positive integer.")
+    if alpha < -0.5:
+        raise ValueError("alpha must be greater than -0.5.")
+    elif alpha == 0.0:
+        # C(n,0,x) == 0 uniformly, however, as alpha->0, C(n,alpha,x)->T(n,x)
+        # strictly, we should just error out here, since the roots are not
+        # really defined, but we used to return something useful, so let's
+        # keep doing so.
+        return roots_chebyt(n, mu)
+
+    if alpha <= 170:
+        mu0 = (np.sqrt(np.pi) * _ufuncs.gamma(alpha + 0.5)) \
+              / _ufuncs.gamma(alpha + 1)
+    else:
+        # For large alpha we use a Taylor series expansion around inf,
+        # expressed as a 6th order polynomial of a^-1 and using Horner's
+        # method to minimize computation and maximize precision
+        inv_alpha = 1. / alpha
+        coeffs = np.array([0.000207186, -0.00152206, -0.000640869,
+                           0.00488281, 0.0078125, -0.125, 1.])
+        mu0 = coeffs[0]
+        for term in range(1, len(coeffs)):
+            mu0 = mu0 * inv_alpha + coeffs[term]
+        mu0 = mu0 * np.sqrt(np.pi / alpha)
+    def an_func(k):
+        return 0.0 * k
+    def bn_func(k):
+        return np.sqrt(k * (k + 2 * alpha - 1) / (4 * (k + alpha) * (k + alpha - 1)))
+    def f(n, x):
+        return _ufuncs.eval_gegenbauer(n, alpha, x)
+    def df(n, x):
+        return (
+            -n * x * _ufuncs.eval_gegenbauer(n, alpha, x)
+            + (n + 2 * alpha - 1) * _ufuncs.eval_gegenbauer(n - 1, alpha, x)
+        ) / (1 - x ** 2)
+    return _gen_roots_and_weights(m, mu0, an_func, bn_func, f, df, True, mu)
+
+
+def gegenbauer(n, alpha, monic=False):
+    r"""Gegenbauer (ultraspherical) polynomial.
+
+    Defined to be the solution of
+
+    .. math::
+        (1 - x^2)\frac{d^2}{dx^2}C_n^{(\alpha)}
+          - (2\alpha + 1)x\frac{d}{dx}C_n^{(\alpha)}
+          + n(n + 2\alpha)C_n^{(\alpha)} = 0
+
+    for :math:`\alpha > -1/2`; :math:`C_n^{(\alpha)}` is a polynomial
+    of degree :math:`n`.
+
+    Parameters
+    ----------
+    n : int
+        Degree of the polynomial.
+    alpha : float
+        Parameter, must be greater than -0.5.
+    monic : bool, optional
+        If `True`, scale the leading coefficient to be 1. Default is
+        `False`.
+
+    Returns
+    -------
+    C : orthopoly1d
+        Gegenbauer polynomial.
+
+    Notes
+    -----
+    The polynomials :math:`C_n^{(\alpha)}` are orthogonal over
+    :math:`[-1,1]` with weight function :math:`(1 - x^2)^{(\alpha -
+    1/2)}`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import special
+    >>> import matplotlib.pyplot as plt
+
+    We can initialize a variable ``p`` as a Gegenbauer polynomial using the
+    `gegenbauer` function and evaluate at a point ``x = 1``.
+
+    >>> p = special.gegenbauer(3, 0.5, monic=False)
+    >>> p
+    poly1d([ 2.5,  0. , -1.5,  0. ])
+    >>> p(1)
+    1.0
+
+    To evaluate ``p`` at various points ``x`` in the interval ``(-3, 3)``,
+    simply pass an array ``x`` to ``p`` as follows:
+
+    >>> x = np.linspace(-3, 3, 400)
+    >>> y = p(x)
+
+    We can then visualize ``x, y`` using `matplotlib.pyplot`.
+
+    >>> fig, ax = plt.subplots()
+    >>> ax.plot(x, y)
+    >>> ax.set_title("Gegenbauer (ultraspherical) polynomial of degree 3")
+    >>> ax.set_xlabel("x")
+    >>> ax.set_ylabel("G_3(x)")
+    >>> plt.show()
+
+    """
+    if not np.isfinite(alpha) or alpha <= -0.5 :
+        raise ValueError("`alpha` must be a finite number greater than -1/2")
+    base = jacobi(n, alpha - 0.5, alpha - 0.5, monic=monic)
+    if monic or n == 0:
+        return base
+    #  Abrahmowitz and Stegan 22.5.20
+    factor = (_gam(2*alpha + n) * _gam(alpha + 0.5) /
+              _gam(2*alpha) / _gam(alpha + 0.5 + n))
+    base._scale(factor)
+    base.__dict__['_eval_func'] = lambda x: _ufuncs.eval_gegenbauer(float(n),
+                                                                    alpha, x)
+    return base
+
+# Chebyshev of the first kind: T_n(x) =
+#     n! sqrt(pi) / _gam(n+1./2)* P^(-1/2,-1/2)_n(x)
+# Computed anew.
+
+
+def roots_chebyt(n, mu=False):
+    r"""Gauss-Chebyshev (first kind) quadrature.
+
+    Computes the sample points and weights for Gauss-Chebyshev
+    quadrature. The sample points are the roots of the nth degree
+    Chebyshev polynomial of the first kind, :math:`T_n(x)`. These
+    sample points and weights correctly integrate polynomials of
+    degree :math:`2n - 1` or less over the interval :math:`[-1, 1]`
+    with weight function :math:`w(x) = 1/\sqrt{1 - x^2}`. See 22.2.4
+    in [AS]_ for more details.
+
+    Parameters
+    ----------
+    n : int
+        quadrature order
+    mu : bool, optional
+        If True, return the sum of the weights, optional.
+
+    Returns
+    -------
+    x : ndarray
+        Sample points
+    w : ndarray
+        Weights
+    mu : float
+        Sum of the weights
+
+    See Also
+    --------
+    scipy.integrate.fixed_quad
+    numpy.polynomial.chebyshev.chebgauss
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    """
+    m = int(n)
+    if n < 1 or n != m:
+        raise ValueError('n must be a positive integer.')
+    x = _ufuncs._sinpi(np.arange(-m + 1, m, 2) / (2*m))
+    w = np.full_like(x, pi/m)
+    if mu:
+        return x, w, pi
+    else:
+        return x, w
+
+
+def chebyt(n, monic=False):
+    r"""Chebyshev polynomial of the first kind.
+
+    Defined to be the solution of
+
+    .. math::
+        (1 - x^2)\frac{d^2}{dx^2}T_n - x\frac{d}{dx}T_n + n^2T_n = 0;
+
+    :math:`T_n` is a polynomial of degree :math:`n`.
+
+    Parameters
+    ----------
+    n : int
+        Degree of the polynomial.
+    monic : bool, optional
+        If `True`, scale the leading coefficient to be 1. Default is
+        `False`.
+
+    Returns
+    -------
+    T : orthopoly1d
+        Chebyshev polynomial of the first kind.
+
+    See Also
+    --------
+    chebyu : Chebyshev polynomial of the second kind.
+
+    Notes
+    -----
+    The polynomials :math:`T_n` are orthogonal over :math:`[-1, 1]`
+    with weight function :math:`(1 - x^2)^{-1/2}`.
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    Examples
+    --------
+    Chebyshev polynomials of the first kind of order :math:`n` can
+    be obtained as the determinant of specific :math:`n \times n`
+    matrices. As an example we can check how the points obtained from
+    the determinant of the following :math:`3 \times 3` matrix
+    lay exactly on :math:`T_3`:
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.linalg import det
+    >>> from scipy.special import chebyt
+    >>> x = np.arange(-1.0, 1.0, 0.01)
+    >>> fig, ax = plt.subplots()
+    >>> ax.set_ylim(-2.0, 2.0)
+    >>> ax.set_title(r'Chebyshev polynomial $T_3$')
+    >>> ax.plot(x, chebyt(3)(x), label=rf'$T_3$')
+    >>> for p in np.arange(-1.0, 1.0, 0.1):
+    ...     ax.plot(p,
+    ...             det(np.array([[p, 1, 0], [1, 2*p, 1], [0, 1, 2*p]])),
+    ...             'rx')
+    >>> plt.legend(loc='best')
+    >>> plt.show()
+
+    They are also related to the Jacobi Polynomials
+    :math:`P_n^{(-0.5, -0.5)}` through the relation:
+
+    .. math::
+        P_n^{(-0.5, -0.5)}(x) = \frac{1}{4^n} \binom{2n}{n} T_n(x)
+
+    Let's verify it for :math:`n = 3`:
+
+    >>> from scipy.special import binom
+    >>> from scipy.special import jacobi
+    >>> x = np.arange(-1.0, 1.0, 0.01)
+    >>> np.allclose(jacobi(3, -0.5, -0.5)(x),
+    ...             1/64 * binom(6, 3) * chebyt(3)(x))
+    True
+
+    We can plot the Chebyshev polynomials :math:`T_n` for some values
+    of :math:`n`:
+
+    >>> x = np.arange(-1.5, 1.5, 0.01)
+    >>> fig, ax = plt.subplots()
+    >>> ax.set_ylim(-4.0, 4.0)
+    >>> ax.set_title(r'Chebyshev polynomials $T_n$')
+    >>> for n in np.arange(2,5):
+    ...     ax.plot(x, chebyt(n)(x), label=rf'$T_n={n}$')
+    >>> plt.legend(loc='best')
+    >>> plt.show()
+
+    """
+    if n < 0:
+        raise ValueError("n must be nonnegative.")
+
+    def wfunc(x):
+        return 1.0 / sqrt(1 - x * x)
+    if n == 0:
+        return orthopoly1d([], [], pi, 1.0, wfunc, (-1, 1), monic,
+                           lambda x: _ufuncs.eval_chebyt(n, x))
+    n1 = n
+    x, w, mu = roots_chebyt(n1, mu=True)
+    hn = pi / 2
+    kn = 2**(n - 1)
+    p = orthopoly1d(x, w, hn, kn, wfunc, (-1, 1), monic,
+                    lambda x: _ufuncs.eval_chebyt(n, x))
+    return p
+
+# Chebyshev of the second kind
+#    U_n(x) = (n+1)! sqrt(pi) / (2*_gam(n+3./2)) * P^(1/2,1/2)_n(x)
+
+
+def roots_chebyu(n, mu=False):
+    r"""Gauss-Chebyshev (second kind) quadrature.
+
+    Computes the sample points and weights for Gauss-Chebyshev
+    quadrature. The sample points are the roots of the nth degree
+    Chebyshev polynomial of the second kind, :math:`U_n(x)`. These
+    sample points and weights correctly integrate polynomials of
+    degree :math:`2n - 1` or less over the interval :math:`[-1, 1]`
+    with weight function :math:`w(x) = \sqrt{1 - x^2}`. See 22.2.5 in
+    [AS]_ for details.
+
+    Parameters
+    ----------
+    n : int
+        quadrature order
+    mu : bool, optional
+        If True, return the sum of the weights, optional.
+
+    Returns
+    -------
+    x : ndarray
+        Sample points
+    w : ndarray
+        Weights
+    mu : float
+        Sum of the weights
+
+    See Also
+    --------
+    scipy.integrate.fixed_quad
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    """
+    m = int(n)
+    if n < 1 or n != m:
+        raise ValueError('n must be a positive integer.')
+    t = np.arange(m, 0, -1) * pi / (m + 1)
+    x = np.cos(t)
+    w = pi * np.sin(t)**2 / (m + 1)
+    if mu:
+        return x, w, pi / 2
+    else:
+        return x, w
+
+
+def chebyu(n, monic=False):
+    r"""Chebyshev polynomial of the second kind.
+
+    Defined to be the solution of
+
+    .. math::
+        (1 - x^2)\frac{d^2}{dx^2}U_n - 3x\frac{d}{dx}U_n
+          + n(n + 2)U_n = 0;
+
+    :math:`U_n` is a polynomial of degree :math:`n`.
+
+    Parameters
+    ----------
+    n : int
+        Degree of the polynomial.
+    monic : bool, optional
+        If `True`, scale the leading coefficient to be 1. Default is
+        `False`.
+
+    Returns
+    -------
+    U : orthopoly1d
+        Chebyshev polynomial of the second kind.
+
+    See Also
+    --------
+    chebyt : Chebyshev polynomial of the first kind.
+
+    Notes
+    -----
+    The polynomials :math:`U_n` are orthogonal over :math:`[-1, 1]`
+    with weight function :math:`(1 - x^2)^{1/2}`.
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    Examples
+    --------
+    Chebyshev polynomials of the second kind of order :math:`n` can
+    be obtained as the determinant of specific :math:`n \times n`
+    matrices. As an example we can check how the points obtained from
+    the determinant of the following :math:`3 \times 3` matrix
+    lay exactly on :math:`U_3`:
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.linalg import det
+    >>> from scipy.special import chebyu
+    >>> x = np.arange(-1.0, 1.0, 0.01)
+    >>> fig, ax = plt.subplots()
+    >>> ax.set_ylim(-2.0, 2.0)
+    >>> ax.set_title(r'Chebyshev polynomial $U_3$')
+    >>> ax.plot(x, chebyu(3)(x), label=rf'$U_3$')
+    >>> for p in np.arange(-1.0, 1.0, 0.1):
+    ...     ax.plot(p,
+    ...             det(np.array([[2*p, 1, 0], [1, 2*p, 1], [0, 1, 2*p]])),
+    ...             'rx')
+    >>> plt.legend(loc='best')
+    >>> plt.show()
+
+    They satisfy the recurrence relation:
+
+    .. math::
+        U_{2n-1}(x) = 2 T_n(x)U_{n-1}(x)
+
+    where the :math:`T_n` are the Chebyshev polynomial of the first kind.
+    Let's verify it for :math:`n = 2`:
+
+    >>> from scipy.special import chebyt
+    >>> x = np.arange(-1.0, 1.0, 0.01)
+    >>> np.allclose(chebyu(3)(x), 2 * chebyt(2)(x) * chebyu(1)(x))
+    True
+
+    We can plot the Chebyshev polynomials :math:`U_n` for some values
+    of :math:`n`:
+
+    >>> x = np.arange(-1.0, 1.0, 0.01)
+    >>> fig, ax = plt.subplots()
+    >>> ax.set_ylim(-1.5, 1.5)
+    >>> ax.set_title(r'Chebyshev polynomials $U_n$')
+    >>> for n in np.arange(1,5):
+    ...     ax.plot(x, chebyu(n)(x), label=rf'$U_n={n}$')
+    >>> plt.legend(loc='best')
+    >>> plt.show()
+
+    """
+    base = jacobi(n, 0.5, 0.5, monic=monic)
+    if monic:
+        return base
+    factor = sqrt(pi) / 2.0 * _gam(n + 2) / _gam(n + 1.5)
+    base._scale(factor)
+    return base
+
+# Chebyshev of the first kind        C_n(x)
+
+
+def roots_chebyc(n, mu=False):
+    r"""Gauss-Chebyshev (first kind) quadrature.
+
+    Compute the sample points and weights for Gauss-Chebyshev
+    quadrature. The sample points are the roots of the nth degree
+    Chebyshev polynomial of the first kind, :math:`C_n(x)`. These
+    sample points and weights correctly integrate polynomials of
+    degree :math:`2n - 1` or less over the interval :math:`[-2, 2]`
+    with weight function :math:`w(x) = 1 / \sqrt{1 - (x/2)^2}`. See
+    22.2.6 in [AS]_ for more details.
+
+    Parameters
+    ----------
+    n : int
+        quadrature order
+    mu : bool, optional
+        If True, return the sum of the weights, optional.
+
+    Returns
+    -------
+    x : ndarray
+        Sample points
+    w : ndarray
+        Weights
+    mu : float
+        Sum of the weights
+
+    See Also
+    --------
+    scipy.integrate.fixed_quad
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    """
+    x, w, m = roots_chebyt(n, True)
+    x *= 2
+    w *= 2
+    m *= 2
+    if mu:
+        return x, w, m
+    else:
+        return x, w
+
+
+def chebyc(n, monic=False):
+    r"""Chebyshev polynomial of the first kind on :math:`[-2, 2]`.
+
+    Defined as :math:`C_n(x) = 2T_n(x/2)`, where :math:`T_n` is the
+    nth Chebychev polynomial of the first kind.
+
+    Parameters
+    ----------
+    n : int
+        Degree of the polynomial.
+    monic : bool, optional
+        If `True`, scale the leading coefficient to be 1. Default is
+        `False`.
+
+    Returns
+    -------
+    C : orthopoly1d
+        Chebyshev polynomial of the first kind on :math:`[-2, 2]`.
+
+    See Also
+    --------
+    chebyt : Chebyshev polynomial of the first kind.
+
+    Notes
+    -----
+    The polynomials :math:`C_n(x)` are orthogonal over :math:`[-2, 2]`
+    with weight function :math:`1/\sqrt{1 - (x/2)^2}`.
+
+    References
+    ----------
+    .. [1] Abramowitz and Stegun, "Handbook of Mathematical Functions"
+           Section 22. National Bureau of Standards, 1972.
+
+    """
+    if n < 0:
+        raise ValueError("n must be nonnegative.")
+
+    if n == 0:
+        n1 = n + 1
+    else:
+        n1 = n
+    x, w = roots_chebyc(n1)
+    if n == 0:
+        x, w = [], []
+    hn = 4 * pi * ((n == 0) + 1)
+    kn = 1.0
+    p = orthopoly1d(x, w, hn, kn,
+                    wfunc=lambda x: 1.0 / sqrt(1 - x * x / 4.0),
+                    limits=(-2, 2), monic=monic)
+    if not monic:
+        p._scale(2.0 / p(2))
+        p.__dict__['_eval_func'] = lambda x: _ufuncs.eval_chebyc(n, x)
+    return p
+
+# Chebyshev of the second kind       S_n(x)
+
+
+def roots_chebys(n, mu=False):
+    r"""Gauss-Chebyshev (second kind) quadrature.
+
+    Compute the sample points and weights for Gauss-Chebyshev
+    quadrature. The sample points are the roots of the nth degree
+    Chebyshev polynomial of the second kind, :math:`S_n(x)`. These
+    sample points and weights correctly integrate polynomials of
+    degree :math:`2n - 1` or less over the interval :math:`[-2, 2]`
+    with weight function :math:`w(x) = \sqrt{1 - (x/2)^2}`. See 22.2.7
+    in [AS]_ for more details.
+
+    Parameters
+    ----------
+    n : int
+        quadrature order
+    mu : bool, optional
+        If True, return the sum of the weights, optional.
+
+    Returns
+    -------
+    x : ndarray
+        Sample points
+    w : ndarray
+        Weights
+    mu : float
+        Sum of the weights
+
+    See Also
+    --------
+    scipy.integrate.fixed_quad
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    """
+    x, w, m = roots_chebyu(n, True)
+    x *= 2
+    w *= 2
+    m *= 2
+    if mu:
+        return x, w, m
+    else:
+        return x, w
+
+
+def chebys(n, monic=False):
+    r"""Chebyshev polynomial of the second kind on :math:`[-2, 2]`.
+
+    Defined as :math:`S_n(x) = U_n(x/2)` where :math:`U_n` is the
+    nth Chebychev polynomial of the second kind.
+
+    Parameters
+    ----------
+    n : int
+        Degree of the polynomial.
+    monic : bool, optional
+        If `True`, scale the leading coefficient to be 1. Default is
+        `False`.
+
+    Returns
+    -------
+    S : orthopoly1d
+        Chebyshev polynomial of the second kind on :math:`[-2, 2]`.
+
+    See Also
+    --------
+    chebyu : Chebyshev polynomial of the second kind
+
+    Notes
+    -----
+    The polynomials :math:`S_n(x)` are orthogonal over :math:`[-2, 2]`
+    with weight function :math:`\sqrt{1 - (x/2)^2}`.
+
+    References
+    ----------
+    .. [1] Abramowitz and Stegun, "Handbook of Mathematical Functions"
+           Section 22. National Bureau of Standards, 1972.
+
+    """
+    if n < 0:
+        raise ValueError("n must be nonnegative.")
+
+    if n == 0:
+        n1 = n + 1
+    else:
+        n1 = n
+    x, w = roots_chebys(n1)
+    if n == 0:
+        x, w = [], []
+    hn = pi
+    kn = 1.0
+    p = orthopoly1d(x, w, hn, kn,
+                    wfunc=lambda x: sqrt(1 - x * x / 4.0),
+                    limits=(-2, 2), monic=monic)
+    if not monic:
+        factor = (n + 1.0) / p(2)
+        p._scale(factor)
+        p.__dict__['_eval_func'] = lambda x: _ufuncs.eval_chebys(n, x)
+    return p
+
+# Shifted Chebyshev of the first kind     T^*_n(x)
+
+
+def roots_sh_chebyt(n, mu=False):
+    r"""Gauss-Chebyshev (first kind, shifted) quadrature.
+
+    Compute the sample points and weights for Gauss-Chebyshev
+    quadrature. The sample points are the roots of the nth degree
+    shifted Chebyshev polynomial of the first kind, :math:`T^*_n(x)`.
+    These sample points and weights correctly integrate polynomials of
+    degree :math:`2n - 1` or less over the interval :math:`[0, 1]`
+    with weight function :math:`w(x) = 1/\sqrt{x - x^2}`. See 22.2.8
+    in [AS]_ for more details.
+
+    Parameters
+    ----------
+    n : int
+        quadrature order
+    mu : bool, optional
+        If True, also return the sum of the weights. Default is False.
+
+    Returns
+    -------
+    x : ndarray
+        Sample points
+    w : ndarray
+        Weights
+    mu : float
+        Sum of the weights
+
+    See Also
+    --------
+    scipy.integrate.fixed_quad
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    """
+    xw = roots_chebyt(n, mu)
+    return ((xw[0] + 1) / 2,) + xw[1:]
+
+
+def sh_chebyt(n, monic=False):
+    r"""Shifted Chebyshev polynomial of the first kind.
+
+    Defined as :math:`T^*_n(x) = T_n(2x - 1)` for :math:`T_n` the nth
+    Chebyshev polynomial of the first kind.
+
+    Parameters
+    ----------
+    n : int
+        Degree of the polynomial.
+    monic : bool, optional
+        If `True`, scale the leading coefficient to be 1. Default is
+        `False`.
+
+    Returns
+    -------
+    T : orthopoly1d
+        Shifted Chebyshev polynomial of the first kind.
+
+    Notes
+    -----
+    The polynomials :math:`T^*_n` are orthogonal over :math:`[0, 1]`
+    with weight function :math:`(x - x^2)^{-1/2}`.
+
+    """
+    base = sh_jacobi(n, 0.0, 0.5, monic=monic)
+    if monic:
+        return base
+    if n > 0:
+        factor = 4**n / 2.0
+    else:
+        factor = 1.0
+    base._scale(factor)
+    return base
+
+
+# Shifted Chebyshev of the second kind    U^*_n(x)
+def roots_sh_chebyu(n, mu=False):
+    r"""Gauss-Chebyshev (second kind, shifted) quadrature.
+
+    Computes the sample points and weights for Gauss-Chebyshev
+    quadrature. The sample points are the roots of the nth degree
+    shifted Chebyshev polynomial of the second kind, :math:`U^*_n(x)`.
+    These sample points and weights correctly integrate polynomials of
+    degree :math:`2n - 1` or less over the interval :math:`[0, 1]`
+    with weight function :math:`w(x) = \sqrt{x - x^2}`. See 22.2.9 in
+    [AS]_ for more details.
+
+    Parameters
+    ----------
+    n : int
+        quadrature order
+    mu : bool, optional
+        If True, also return the sum of the weights. Default is False.
+
+    Returns
+    -------
+    x : ndarray
+        Sample points
+    w : ndarray
+        Weights
+    mu : float
+        Sum of the weights
+
+    See Also
+    --------
+    scipy.integrate.fixed_quad
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    """
+    x, w, m = roots_chebyu(n, True)
+    x = (x + 1) / 2
+    m_us = _ufuncs.beta(1.5, 1.5)
+    w *= m_us / m
+    if mu:
+        return x, w, m_us
+    else:
+        return x, w
+
+
+def sh_chebyu(n, monic=False):
+    r"""Shifted Chebyshev polynomial of the second kind.
+
+    Defined as :math:`U^*_n(x) = U_n(2x - 1)` for :math:`U_n` the nth
+    Chebyshev polynomial of the second kind.
+
+    Parameters
+    ----------
+    n : int
+        Degree of the polynomial.
+    monic : bool, optional
+        If `True`, scale the leading coefficient to be 1. Default is
+        `False`.
+
+    Returns
+    -------
+    U : orthopoly1d
+        Shifted Chebyshev polynomial of the second kind.
+
+    Notes
+    -----
+    The polynomials :math:`U^*_n` are orthogonal over :math:`[0, 1]`
+    with weight function :math:`(x - x^2)^{1/2}`.
+
+    """
+    base = sh_jacobi(n, 2.0, 1.5, monic=monic)
+    if monic:
+        return base
+    factor = 4**n
+    base._scale(factor)
+    return base
+
+# Legendre
+
+
+def roots_legendre(n, mu=False):
+    r"""Gauss-Legendre quadrature.
+
+    Compute the sample points and weights for Gauss-Legendre
+    quadrature [GL]_. The sample points are the roots of the nth degree
+    Legendre polynomial :math:`P_n(x)`. These sample points and
+    weights correctly integrate polynomials of degree :math:`2n - 1`
+    or less over the interval :math:`[-1, 1]` with weight function
+    :math:`w(x) = 1`. See 22.2.10 in [AS]_ for more details.
+
+    Parameters
+    ----------
+    n : int
+        quadrature order
+    mu : bool, optional
+        If True, also return the sum of the weights. Default is False.
+
+    Returns
+    -------
+    x : ndarray
+        Sample points
+    w : ndarray
+        Weights
+    mu : float
+        Sum of the weights
+
+    See Also
+    --------
+    scipy.integrate.fixed_quad
+    numpy.polynomial.legendre.leggauss
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+    .. [GL] Gauss-Legendre quadrature, Wikipedia,
+        https://en.wikipedia.org/wiki/Gauss%E2%80%93Legendre_quadrature
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.special import roots_legendre, eval_legendre
+    >>> roots, weights = roots_legendre(9)
+
+    ``roots`` holds the roots, and ``weights`` holds the weights for
+    Gauss-Legendre quadrature.
+
+    >>> roots
+    array([-0.96816024, -0.83603111, -0.61337143, -0.32425342,  0.        ,
+            0.32425342,  0.61337143,  0.83603111,  0.96816024])
+    >>> weights
+    array([0.08127439, 0.18064816, 0.2606107 , 0.31234708, 0.33023936,
+           0.31234708, 0.2606107 , 0.18064816, 0.08127439])
+
+    Verify that we have the roots by evaluating the degree 9 Legendre
+    polynomial at ``roots``.  All the values are approximately zero:
+
+    >>> eval_legendre(9, roots)
+    array([-8.88178420e-16, -2.22044605e-16,  1.11022302e-16,  1.11022302e-16,
+            0.00000000e+00, -5.55111512e-17, -1.94289029e-16,  1.38777878e-16,
+           -8.32667268e-17])
+
+    Here we'll show how the above values can be used to estimate the
+    integral from 1 to 2 of f(t) = t + 1/t with Gauss-Legendre
+    quadrature [GL]_.  First define the function and the integration
+    limits.
+
+    >>> def f(t):
+    ...    return t + 1/t
+    ...
+    >>> a = 1
+    >>> b = 2
+
+    We'll use ``integral(f(t), t=a, t=b)`` to denote the definite integral
+    of f from t=a to t=b.  The sample points in ``roots`` are from the
+    interval [-1, 1], so we'll rewrite the integral with the simple change
+    of variable::
+
+        x = 2/(b - a) * t - (a + b)/(b - a)
+
+    with inverse::
+
+        t = (b - a)/2 * x + (a + b)/2
+
+    Then::
+
+        integral(f(t), t=a, t=b) =
+            (b - a)/2 * integral(f((b-a)/2*x + (a+b)/2), x=-1, x=1)
+
+    We can approximate the latter integral with the values returned
+    by `roots_legendre`.
+
+    Map the roots computed above from [-1, 1] to [a, b].
+
+    >>> t = (b - a)/2 * roots + (a + b)/2
+
+    Approximate the integral as the weighted sum of the function values.
+
+    >>> (b - a)/2 * f(t).dot(weights)
+    2.1931471805599276
+
+    Compare that to the exact result, which is 3/2 + log(2):
+
+    >>> 1.5 + np.log(2)
+    2.1931471805599454
+
+    """
+    m = int(n)
+    if n < 1 or n != m:
+        raise ValueError("n must be a positive integer.")
+
+    mu0 = 2.0
+    def an_func(k):
+        return 0.0 * k
+    def bn_func(k):
+        return k * np.sqrt(1.0 / (4 * k * k - 1))
+    f = _ufuncs.eval_legendre
+    def df(n, x):
+        return (-n * x * _ufuncs.eval_legendre(n, x)
+                + n * _ufuncs.eval_legendre(n - 1, x)) / (1 - x ** 2)
+    return _gen_roots_and_weights(m, mu0, an_func, bn_func, f, df, True, mu)
+
+
+def legendre(n, monic=False):
+    r"""Legendre polynomial.
+
+    Defined to be the solution of
+
+    .. math::
+        \frac{d}{dx}\left[(1 - x^2)\frac{d}{dx}P_n(x)\right]
+          + n(n + 1)P_n(x) = 0;
+
+    :math:`P_n(x)` is a polynomial of degree :math:`n`.
+
+    Parameters
+    ----------
+    n : int
+        Degree of the polynomial.
+    monic : bool, optional
+        If `True`, scale the leading coefficient to be 1. Default is
+        `False`.
+
+    Returns
+    -------
+    P : orthopoly1d
+        Legendre polynomial.
+
+    Notes
+    -----
+    The polynomials :math:`P_n` are orthogonal over :math:`[-1, 1]`
+    with weight function 1.
+
+    Examples
+    --------
+    Generate the 3rd-order Legendre polynomial 1/2*(5x^3 + 0x^2 - 3x + 0):
+
+    >>> from scipy.special import legendre
+    >>> legendre(3)
+    poly1d([ 2.5,  0. , -1.5,  0. ])
+
+    """
+    if n < 0:
+        raise ValueError("n must be nonnegative.")
+
+    if n == 0:
+        n1 = n + 1
+    else:
+        n1 = n
+    x, w = roots_legendre(n1)
+    if n == 0:
+        x, w = [], []
+    hn = 2.0 / (2 * n + 1)
+    kn = _gam(2 * n + 1) / _gam(n + 1)**2 / 2.0**n
+    p = orthopoly1d(x, w, hn, kn, wfunc=lambda x: 1.0, limits=(-1, 1),
+                    monic=monic,
+                    eval_func=lambda x: _ufuncs.eval_legendre(n, x))
+    return p
+
+# Shifted Legendre              P^*_n(x)
+
+
+def roots_sh_legendre(n, mu=False):
+    r"""Gauss-Legendre (shifted) quadrature.
+
+    Compute the sample points and weights for Gauss-Legendre
+    quadrature. The sample points are the roots of the nth degree
+    shifted Legendre polynomial :math:`P^*_n(x)`. These sample points
+    and weights correctly integrate polynomials of degree :math:`2n -
+    1` or less over the interval :math:`[0, 1]` with weight function
+    :math:`w(x) = 1.0`. See 22.2.11 in [AS]_ for details.
+
+    Parameters
+    ----------
+    n : int
+        quadrature order
+    mu : bool, optional
+        If True, also return the sum of the weights. Default is False.
+
+    Returns
+    -------
+    x : ndarray
+        Sample points
+    w : ndarray
+        Weights
+    mu : float
+        Sum of the weights
+
+    See Also
+    --------
+    scipy.integrate.fixed_quad
+
+    References
+    ----------
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    """
+    x, w = roots_legendre(n)
+    x = (x + 1) / 2
+    w /= 2
+    if mu:
+        return x, w, 1.0
+    else:
+        return x, w
+
+
+def sh_legendre(n, monic=False):
+    r"""Shifted Legendre polynomial.
+
+    Defined as :math:`P^*_n(x) = P_n(2x - 1)` for :math:`P_n` the nth
+    Legendre polynomial.
+
+    Parameters
+    ----------
+    n : int
+        Degree of the polynomial.
+    monic : bool, optional
+        If `True`, scale the leading coefficient to be 1. Default is
+        `False`.
+
+    Returns
+    -------
+    P : orthopoly1d
+        Shifted Legendre polynomial.
+
+    Notes
+    -----
+    The polynomials :math:`P^*_n` are orthogonal over :math:`[0, 1]`
+    with weight function 1.
+
+    """
+    if n < 0:
+        raise ValueError("n must be nonnegative.")
+
+    def wfunc(x):
+        return 0.0 * x + 1.0
+    if n == 0:
+        return orthopoly1d([], [], 1.0, 1.0, wfunc, (0, 1), monic,
+                           lambda x: _ufuncs.eval_sh_legendre(n, x))
+    x, w = roots_sh_legendre(n)
+    hn = 1.0 / (2 * n + 1.0)
+    kn = _gam(2 * n + 1) / _gam(n + 1)**2
+    p = orthopoly1d(x, w, hn, kn, wfunc, limits=(0, 1), monic=monic,
+                    eval_func=lambda x: _ufuncs.eval_sh_legendre(n, x))
+    return p
+
+
+# Make the old root function names an alias for the new ones
+_modattrs = globals()
+for newfun, oldfun in _rootfuns_map.items():
+    _modattrs[oldfun] = _modattrs[newfun]
+    __all__.append(oldfun)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_orthogonal.pyi b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_orthogonal.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..298f13ca35f1f497e2c4cec9d4f1e5cf0ed028c1
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_orthogonal.pyi
@@ -0,0 +1,330 @@
+from typing import (
+    Any,
+    Literal,
+    Optional,
+    overload,
+)
+from collections.abc import Callable
+
+import numpy as np
+
+_IntegerType = int | np.integer
+_FloatingType = float | np.floating
+_PointsAndWeights = tuple[np.ndarray, np.ndarray]
+_PointsAndWeightsAndMu = tuple[np.ndarray, np.ndarray, float]
+
+_ArrayLike0D = bool | int | float | complex | str | bytes | np.generic
+
+__all__ = [
+    'legendre',
+    'chebyt',
+    'chebyu',
+    'chebyc',
+    'chebys',
+    'jacobi',
+    'laguerre',
+    'genlaguerre',
+    'hermite',
+    'hermitenorm',
+    'gegenbauer',
+    'sh_legendre',
+    'sh_chebyt',
+    'sh_chebyu',
+    'sh_jacobi',
+    'roots_legendre',
+    'roots_chebyt',
+    'roots_chebyu',
+    'roots_chebyc',
+    'roots_chebys',
+    'roots_jacobi',
+    'roots_laguerre',
+    'roots_genlaguerre',
+    'roots_hermite',
+    'roots_hermitenorm',
+    'roots_gegenbauer',
+    'roots_sh_legendre',
+    'roots_sh_chebyt',
+    'roots_sh_chebyu',
+    'roots_sh_jacobi',
+]
+
+@overload
+def roots_jacobi(
+        n: _IntegerType,
+        alpha: _FloatingType,
+        beta: _FloatingType,
+) -> _PointsAndWeights: ...
+@overload
+def roots_jacobi(
+        n: _IntegerType,
+        alpha: _FloatingType,
+        beta: _FloatingType,
+        mu: Literal[False],
+) -> _PointsAndWeights: ...
+@overload
+def roots_jacobi(
+        n: _IntegerType,
+        alpha: _FloatingType,
+        beta: _FloatingType,
+        mu: Literal[True],
+) -> _PointsAndWeightsAndMu: ...
+
+@overload
+def roots_sh_jacobi(
+        n: _IntegerType,
+        p1: _FloatingType,
+        q1: _FloatingType,
+) -> _PointsAndWeights: ...
+@overload
+def roots_sh_jacobi(
+        n: _IntegerType,
+        p1: _FloatingType,
+        q1: _FloatingType,
+        mu: Literal[False],
+) -> _PointsAndWeights: ...
+@overload
+def roots_sh_jacobi(
+        n: _IntegerType,
+        p1: _FloatingType,
+        q1: _FloatingType,
+        mu: Literal[True],
+) -> _PointsAndWeightsAndMu: ...
+
+@overload
+def roots_genlaguerre(
+        n: _IntegerType,
+        alpha: _FloatingType,
+) -> _PointsAndWeights: ...
+@overload
+def roots_genlaguerre(
+        n: _IntegerType,
+        alpha: _FloatingType,
+        mu: Literal[False],
+) -> _PointsAndWeights: ...
+@overload
+def roots_genlaguerre(
+        n: _IntegerType,
+        alpha: _FloatingType,
+        mu: Literal[True],
+) -> _PointsAndWeightsAndMu: ...
+
+@overload
+def roots_laguerre(n: _IntegerType) -> _PointsAndWeights: ...
+@overload
+def roots_laguerre(
+        n: _IntegerType,
+        mu: Literal[False],
+) -> _PointsAndWeights: ...
+@overload
+def roots_laguerre(
+        n: _IntegerType,
+        mu: Literal[True],
+) -> _PointsAndWeightsAndMu: ...
+
+@overload
+def roots_hermite(n: _IntegerType) -> _PointsAndWeights: ...
+@overload
+def roots_hermite(
+        n: _IntegerType,
+        mu: Literal[False],
+) -> _PointsAndWeights: ...
+@overload
+def roots_hermite(
+        n: _IntegerType,
+        mu: Literal[True],
+) -> _PointsAndWeightsAndMu: ...
+
+@overload
+def roots_hermitenorm(n: _IntegerType) -> _PointsAndWeights: ...
+@overload
+def roots_hermitenorm(
+        n: _IntegerType,
+        mu: Literal[False],
+) -> _PointsAndWeights: ...
+@overload
+def roots_hermitenorm(
+        n: _IntegerType,
+        mu: Literal[True],
+) -> _PointsAndWeightsAndMu: ...
+
+@overload
+def roots_gegenbauer(
+        n: _IntegerType,
+        alpha: _FloatingType,
+) -> _PointsAndWeights: ...
+@overload
+def roots_gegenbauer(
+        n: _IntegerType,
+        alpha: _FloatingType,
+        mu: Literal[False],
+) -> _PointsAndWeights: ...
+@overload
+def roots_gegenbauer(
+        n: _IntegerType,
+        alpha: _FloatingType,
+        mu: Literal[True],
+) -> _PointsAndWeightsAndMu: ...
+
+@overload
+def roots_chebyt(n: _IntegerType) -> _PointsAndWeights: ...
+@overload
+def roots_chebyt(
+        n: _IntegerType,
+        mu: Literal[False],
+) -> _PointsAndWeights: ...
+@overload
+def roots_chebyt(
+        n: _IntegerType,
+        mu: Literal[True],
+) -> _PointsAndWeightsAndMu: ...
+
+@overload
+def roots_chebyu(n: _IntegerType) -> _PointsAndWeights: ...
+@overload
+def roots_chebyu(
+        n: _IntegerType,
+        mu: Literal[False],
+) -> _PointsAndWeights: ...
+@overload
+def roots_chebyu(
+        n: _IntegerType,
+        mu: Literal[True],
+) -> _PointsAndWeightsAndMu: ...
+
+@overload
+def roots_chebyc(n: _IntegerType) -> _PointsAndWeights: ...
+@overload
+def roots_chebyc(
+        n: _IntegerType,
+        mu: Literal[False],
+) -> _PointsAndWeights: ...
+@overload
+def roots_chebyc(
+        n: _IntegerType,
+        mu: Literal[True],
+) -> _PointsAndWeightsAndMu: ...
+
+@overload
+def roots_chebys(n: _IntegerType) -> _PointsAndWeights: ...
+@overload
+def roots_chebys(
+        n: _IntegerType,
+        mu: Literal[False],
+) -> _PointsAndWeights: ...
+@overload
+def roots_chebys(
+        n: _IntegerType,
+        mu: Literal[True],
+) -> _PointsAndWeightsAndMu: ...
+
+@overload
+def roots_sh_chebyt(n: _IntegerType) -> _PointsAndWeights: ...
+@overload
+def roots_sh_chebyt(
+        n: _IntegerType,
+        mu: Literal[False],
+) -> _PointsAndWeights: ...
+@overload
+def roots_sh_chebyt(
+        n: _IntegerType,
+        mu: Literal[True],
+) -> _PointsAndWeightsAndMu: ...
+
+@overload
+def roots_sh_chebyu(n: _IntegerType) -> _PointsAndWeights: ...
+@overload
+def roots_sh_chebyu(
+        n: _IntegerType,
+        mu: Literal[False],
+) -> _PointsAndWeights: ...
+@overload
+def roots_sh_chebyu(
+        n: _IntegerType,
+        mu: Literal[True],
+) -> _PointsAndWeightsAndMu: ...
+
+@overload
+def roots_legendre(n: _IntegerType) -> _PointsAndWeights: ...
+@overload
+def roots_legendre(
+        n: _IntegerType,
+        mu: Literal[False],
+) -> _PointsAndWeights: ...
+@overload
+def roots_legendre(
+        n: _IntegerType,
+        mu: Literal[True],
+) -> _PointsAndWeightsAndMu: ...
+
+@overload
+def roots_sh_legendre(n: _IntegerType) -> _PointsAndWeights: ...
+@overload
+def roots_sh_legendre(
+        n: _IntegerType,
+        mu: Literal[False],
+) -> _PointsAndWeights: ...
+@overload
+def roots_sh_legendre(
+        n: _IntegerType,
+        mu: Literal[True],
+) -> _PointsAndWeightsAndMu: ...
+
+class orthopoly1d(np.poly1d):
+    def __init__(
+            self,
+            roots: np.typing.ArrayLike,
+            weights: np.typing.ArrayLike | None,
+            hn: float = ...,
+            kn: float = ...,
+            wfunc: Optional[Callable[[float], float]] = ...,  # noqa: UP045
+            limits: tuple[float, float] | None = ...,  # (lower, upper) bounds
+            monic: bool = ...,
+            eval_func: np.ufunc = ...,
+    ) -> None: ...
+    @property
+    def limits(self) -> tuple[float, float]: ...
+    def weight_func(self, x: float) -> float: ...
+    @overload
+    def __call__(self, x: _ArrayLike0D) -> Any: ...
+    @overload
+    def __call__(self, x: np.poly1d) -> np.poly1d: ...  # type: ignore[overload-overlap]
+    @overload
+    def __call__(self, x: np.typing.ArrayLike) -> np.ndarray: ...
+
+def legendre(n: _IntegerType, monic: bool = ...) -> orthopoly1d: ...
+def chebyt(n: _IntegerType, monic: bool = ...) -> orthopoly1d: ...
+def chebyu(n: _IntegerType, monic: bool = ...) -> orthopoly1d: ...
+def chebyc(n: _IntegerType, monic: bool = ...) -> orthopoly1d: ...
+def chebys(n: _IntegerType, monic: bool = ...) -> orthopoly1d: ...
+def jacobi(
+        n: _IntegerType,
+        alpha: _FloatingType,
+        beta: _FloatingType,
+        monic: bool = ...,
+) -> orthopoly1d: ...
+def laguerre(n: _IntegerType, monic: bool = ...) -> orthopoly1d: ...
+def genlaguerre(
+        n: _IntegerType,
+        alpha: _FloatingType,
+        monic: bool = ...,
+) -> orthopoly1d: ...
+def hermite(n: _IntegerType, monic: bool = ...) -> orthopoly1d: ...
+def hermitenorm(n: _IntegerType, monic: bool = ...) -> orthopoly1d: ...
+def gegenbauer(
+        n: _IntegerType,
+        alpha: _FloatingType,
+        monic: bool = ...,
+) -> orthopoly1d: ...
+def sh_legendre(n: _IntegerType, monic: bool = ...) -> orthopoly1d: ...
+def sh_chebyt(n: _IntegerType, monic: bool = ...) -> orthopoly1d: ...
+def sh_chebyu(n: _IntegerType, monic: bool = ...) -> orthopoly1d: ...
+def sh_jacobi(
+        n: _IntegerType,
+        p: _FloatingType,
+        q: _FloatingType,
+        monic: bool = ...,
+) -> orthopoly1d: ...
+
+# These functions are not public, but still need stubs because they
+# get checked in the tests.
+def _roots_hermite_asy(n: _IntegerType) -> _PointsAndWeights: ...
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_spfun_stats.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_spfun_stats.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e122b7c54a5880ee8846ef4818ac3e0a6f2ac3b
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_spfun_stats.py
@@ -0,0 +1,108 @@
+# Last Change: Sat Mar 21 02:00 PM 2009 J
+
+# Copyright (c) 2001, 2002 Enthought, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#   a. Redistributions of source code must retain the above copyright notice,
+#      this list of conditions and the following disclaimer.
+#   b. Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in the
+#      documentation and/or other materials provided with the distribution.
+#   c. Neither the name of the Enthought nor the names of its contributors
+#      may be used to endorse or promote products derived from this software
+#      without specific prior written permission.
+#
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+# DAMAGE.
+
+"""Some more special functions which may be useful for multivariate statistical
+analysis."""
+
+import numpy as np
+from scipy.special import gammaln as loggam
+
+
+__all__ = ['multigammaln']
+
+
+def multigammaln(a, d):
+    r"""Returns the log of multivariate gamma, also sometimes called the
+    generalized gamma.
+
+    Parameters
+    ----------
+    a : ndarray
+        The multivariate gamma is computed for each item of `a`.
+    d : int
+        The dimension of the space of integration.
+
+    Returns
+    -------
+    res : ndarray
+        The values of the log multivariate gamma at the given points `a`.
+
+    Notes
+    -----
+    The formal definition of the multivariate gamma of dimension d for a real
+    `a` is
+
+    .. math::
+
+        \Gamma_d(a) = \int_{A>0} e^{-tr(A)} |A|^{a - (d+1)/2} dA
+
+    with the condition :math:`a > (d-1)/2`, and :math:`A > 0` being the set of
+    all the positive definite matrices of dimension `d`.  Note that `a` is a
+    scalar: the integrand only is multivariate, the argument is not (the
+    function is defined over a subset of the real set).
+
+    This can be proven to be equal to the much friendlier equation
+
+    .. math::
+
+        \Gamma_d(a) = \pi^{d(d-1)/4} \prod_{i=1}^{d} \Gamma(a - (i-1)/2).
+
+    References
+    ----------
+    R. J. Muirhead, Aspects of multivariate statistical theory (Wiley Series in
+    probability and mathematical statistics).
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.special import multigammaln, gammaln
+    >>> a = 23.5
+    >>> d = 10
+    >>> multigammaln(a, d)
+    454.1488605074416
+
+    Verify that the result agrees with the logarithm of the equation
+    shown above:
+
+    >>> d*(d-1)/4*np.log(np.pi) + gammaln(a - 0.5*np.arange(0, d)).sum()
+    454.1488605074416
+    """
+    a = np.asarray(a)
+    # Support for 0d arrays is needed for array_api_strict and dask.
+    d = np.asarray(d)[()]
+    if not np.isscalar(d) or (np.floor(d) != d):
+        raise ValueError("d should be a positive integer (dimension)")
+    if np.any(a <= 0.5 * (d - 1)):
+        raise ValueError(f"condition a ({a}) > 0.5 * (d-1) ({0.5 * (d-1)}) not met")
+
+    res = (d * (d-1) * 0.25) * np.log(np.pi)
+    res += np.sum(loggam([(a - (j - 1.)/2) for j in range(1, d+1)]), axis=0)
+    return res
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_spherical_bessel.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_spherical_bessel.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e5ccfd42fc6b6c855038af81e6ca54779e30653
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_spherical_bessel.py
@@ -0,0 +1,397 @@
+from functools import wraps
+import scipy._lib.array_api_extra as xpx
+import numpy as np
+from ._ufuncs import (_spherical_jn, _spherical_yn, _spherical_in,
+                      _spherical_kn, _spherical_jn_d, _spherical_yn_d,
+                      _spherical_in_d, _spherical_kn_d)
+
+
+def use_reflection(sign_n_even=None, reflection_fun=None):
+    # - If reflection_fun is not specified, reflects negative `z` and multiplies
+    #   output by appropriate sign (indicated by `sign_n_even`).
+    # - If reflection_fun is specified, calls `reflection_fun` instead of `fun`.
+    # See DLMF 10.47(v) https://dlmf.nist.gov/10.47
+    def decorator(fun):
+        def standard_reflection(n, z, derivative):
+            # sign_n_even indicates the sign when the order `n` is even
+            sign = np.where(n % 2 == 0, sign_n_even, -sign_n_even)
+            # By the chain rule, differentiation at `-z` adds a minus sign
+            sign = -sign if derivative else sign
+            # Evaluate at positive z (minus negative z) and adjust the sign
+            return fun(n, -z, derivative) * sign
+
+        @wraps(fun)
+        def wrapper(n, z, derivative=False):
+            z = np.asarray(z)
+
+            if np.issubdtype(z.dtype, np.complexfloating):
+                return fun(n, z, derivative)  # complex dtype just works
+
+            f2 = standard_reflection if reflection_fun is None else reflection_fun
+            return xpx.apply_where(z.real >= 0, (n, z),
+                                   lambda n, z: fun(n, z, derivative),
+                                   lambda n, z: f2(n, z, derivative))[()]
+        return wrapper
+    return decorator
+
+
+@use_reflection(+1)  # See DLMF 10.47(v) https://dlmf.nist.gov/10.47
+def spherical_jn(n, z, derivative=False):
+    r"""Spherical Bessel function of the first kind or its derivative.
+
+    Defined as [1]_,
+
+    .. math:: j_n(z) = \sqrt{\frac{\pi}{2z}} J_{n + 1/2}(z),
+
+    where :math:`J_n` is the Bessel function of the first kind.
+
+    Parameters
+    ----------
+    n : int, array_like
+        Order of the Bessel function (n >= 0).
+    z : complex or float, array_like
+        Argument of the Bessel function.
+    derivative : bool, optional
+        If True, the value of the derivative (rather than the function
+        itself) is returned.
+
+    Returns
+    -------
+    jn : ndarray
+
+    Notes
+    -----
+    For real arguments greater than the order, the function is computed
+    using the ascending recurrence [2]_. For small real or complex
+    arguments, the definitional relation to the cylindrical Bessel function
+    of the first kind is used.
+
+    The derivative is computed using the relations [3]_,
+
+    .. math::
+        j_n'(z) = j_{n-1}(z) - \frac{n + 1}{z} j_n(z).
+
+        j_0'(z) = -j_1(z)
+
+
+    .. versionadded:: 0.18.0
+
+    References
+    ----------
+    .. [1] https://dlmf.nist.gov/10.47.E3
+    .. [2] https://dlmf.nist.gov/10.51.E1
+    .. [3] https://dlmf.nist.gov/10.51.E2
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    Examples
+    --------
+    The spherical Bessel functions of the first kind :math:`j_n` accept
+    both real and complex second argument. They can return a complex type:
+
+    >>> from scipy.special import spherical_jn
+    >>> spherical_jn(0, 3+5j)
+    (-9.878987731663194-8.021894345786002j)
+    >>> type(spherical_jn(0, 3+5j))
+    <class 'numpy.complex128'>
+
+    We can verify the relation for the derivative from the Notes
+    for :math:`n=3` in the interval :math:`[1, 2]`:
+
+    >>> import numpy as np
+    >>> x = np.arange(1.0, 2.0, 0.01)
+    >>> np.allclose(spherical_jn(3, x, True),
+    ...             spherical_jn(2, x) - 4/x * spherical_jn(3, x))
+    True
+
+    The first few :math:`j_n` with real argument:
+
+    >>> import matplotlib.pyplot as plt
+    >>> x = np.arange(0.0, 10.0, 0.01)
+    >>> fig, ax = plt.subplots()
+    >>> ax.set_ylim(-0.5, 1.5)
+    >>> ax.set_title(r'Spherical Bessel functions $j_n$')
+    >>> for n in np.arange(0, 4):
+    ...     ax.plot(x, spherical_jn(n, x), label=rf'$j_{n}$')
+    >>> plt.legend(loc='best')
+    >>> plt.show()
+
+    """
+    n = np.asarray(n, dtype=np.dtype("long"))
+    if derivative:
+        return _spherical_jn_d(n, z)
+    else:
+        return _spherical_jn(n, z)
+
+
+@use_reflection(-1)  # See DLMF 10.47(v) https://dlmf.nist.gov/10.47
+def spherical_yn(n, z, derivative=False):
+    r"""Spherical Bessel function of the second kind or its derivative.
+
+    Defined as [1]_,
+
+    .. math:: y_n(z) = \sqrt{\frac{\pi}{2z}} Y_{n + 1/2}(z),
+
+    where :math:`Y_n` is the Bessel function of the second kind.
+
+    Parameters
+    ----------
+    n : int, array_like
+        Order of the Bessel function (n >= 0).
+    z : complex or float, array_like
+        Argument of the Bessel function.
+    derivative : bool, optional
+        If True, the value of the derivative (rather than the function
+        itself) is returned.
+
+    Returns
+    -------
+    yn : ndarray
+
+    Notes
+    -----
+    For real arguments, the function is computed using the ascending
+    recurrence [2]_.  For complex arguments, the definitional relation to
+    the cylindrical Bessel function of the second kind is used.
+
+    The derivative is computed using the relations [3]_,
+
+    .. math::
+        y_n' = y_{n-1} - \frac{n + 1}{z} y_n.
+
+        y_0' = -y_1
+
+
+    .. versionadded:: 0.18.0
+
+    References
+    ----------
+    .. [1] https://dlmf.nist.gov/10.47.E4
+    .. [2] https://dlmf.nist.gov/10.51.E1
+    .. [3] https://dlmf.nist.gov/10.51.E2
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    Examples
+    --------
+    The spherical Bessel functions of the second kind :math:`y_n` accept
+    both real and complex second argument. They can return a complex type:
+
+    >>> from scipy.special import spherical_yn
+    >>> spherical_yn(0, 3+5j)
+    (8.022343088587197-9.880052589376795j)
+    >>> type(spherical_yn(0, 3+5j))
+    <class 'numpy.complex128'>
+
+    We can verify the relation for the derivative from the Notes
+    for :math:`n=3` in the interval :math:`[1, 2]`:
+
+    >>> import numpy as np
+    >>> x = np.arange(1.0, 2.0, 0.01)
+    >>> np.allclose(spherical_yn(3, x, True),
+    ...             spherical_yn(2, x) - 4/x * spherical_yn(3, x))
+    True
+
+    The first few :math:`y_n` with real argument:
+
+    >>> import matplotlib.pyplot as plt
+    >>> x = np.arange(0.0, 10.0, 0.01)
+    >>> fig, ax = plt.subplots()
+    >>> ax.set_ylim(-2.0, 1.0)
+    >>> ax.set_title(r'Spherical Bessel functions $y_n$')
+    >>> for n in np.arange(0, 4):
+    ...     ax.plot(x, spherical_yn(n, x), label=rf'$y_{n}$')
+    >>> plt.legend(loc='best')
+    >>> plt.show()
+
+    """
+    n = np.asarray(n, dtype=np.dtype("long"))
+    if derivative:
+        return _spherical_yn_d(n, z)
+    else:
+        return _spherical_yn(n, z)
+
+
+@use_reflection(+1)  # See DLMF 10.47(v) https://dlmf.nist.gov/10.47
+def spherical_in(n, z, derivative=False):
+    r"""Modified spherical Bessel function of the first kind or its derivative.
+
+    Defined as [1]_,
+
+    .. math:: i_n(z) = \sqrt{\frac{\pi}{2z}} I_{n + 1/2}(z),
+
+    where :math:`I_n` is the modified Bessel function of the first kind.
+
+    Parameters
+    ----------
+    n : int, array_like
+        Order of the Bessel function (n >= 0).
+    z : complex or float, array_like
+        Argument of the Bessel function.
+    derivative : bool, optional
+        If True, the value of the derivative (rather than the function
+        itself) is returned.
+
+    Returns
+    -------
+    in : ndarray
+
+    Notes
+    -----
+    The function is computed using its definitional relation to the
+    modified cylindrical Bessel function of the first kind.
+
+    The derivative is computed using the relations [2]_,
+
+    .. math::
+        i_n' = i_{n-1} - \frac{n + 1}{z} i_n.
+
+        i_1' = i_0
+
+
+    .. versionadded:: 0.18.0
+
+    References
+    ----------
+    .. [1] https://dlmf.nist.gov/10.47.E7
+    .. [2] https://dlmf.nist.gov/10.51.E5
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    Examples
+    --------
+    The modified spherical Bessel functions of the first kind :math:`i_n`
+    accept both real and complex second argument.
+    They can return a complex type:
+
+    >>> from scipy.special import spherical_in
+    >>> spherical_in(0, 3+5j)
+    (-1.1689867793369182-1.2697305267234222j)
+    >>> type(spherical_in(0, 3+5j))
+    <class 'numpy.complex128'>
+
+    We can verify the relation for the derivative from the Notes
+    for :math:`n=3` in the interval :math:`[1, 2]`:
+
+    >>> import numpy as np
+    >>> x = np.arange(1.0, 2.0, 0.01)
+    >>> np.allclose(spherical_in(3, x, True),
+    ...             spherical_in(2, x) - 4/x * spherical_in(3, x))
+    True
+
+    The first few :math:`i_n` with real argument:
+
+    >>> import matplotlib.pyplot as plt
+    >>> x = np.arange(0.0, 6.0, 0.01)
+    >>> fig, ax = plt.subplots()
+    >>> ax.set_ylim(-0.5, 5.0)
+    >>> ax.set_title(r'Modified spherical Bessel functions $i_n$')
+    >>> for n in np.arange(0, 4):
+    ...     ax.plot(x, spherical_in(n, x), label=rf'$i_{n}$')
+    >>> plt.legend(loc='best')
+    >>> plt.show()
+
+    """
+    n = np.asarray(n, dtype=np.dtype("long"))
+    if derivative:
+        return _spherical_in_d(n, z)
+    else:
+        return _spherical_in(n, z)
+
+
+def spherical_kn_reflection(n, z, derivative=False):
+    # More complex than the other cases, and this will likely be re-implemented
+    # in C++ anyway. Would require multiple function evaluations. Probably about
+    # as fast to just resort to complex math, and much simpler.
+    return spherical_kn(n, z + 0j, derivative=derivative).real
+
+
+@use_reflection(reflection_fun=spherical_kn_reflection)
+def spherical_kn(n, z, derivative=False):
+    r"""Modified spherical Bessel function of the second kind or its derivative.
+
+    Defined as [1]_,
+
+    .. math:: k_n(z) = \sqrt{\frac{\pi}{2z}} K_{n + 1/2}(z),
+
+    where :math:`K_n` is the modified Bessel function of the second kind.
+
+    Parameters
+    ----------
+    n : int, array_like
+        Order of the Bessel function (n >= 0).
+    z : complex or float, array_like
+        Argument of the Bessel function.
+    derivative : bool, optional
+        If True, the value of the derivative (rather than the function
+        itself) is returned.
+
+    Returns
+    -------
+    kn : ndarray
+
+    Notes
+    -----
+    The function is computed using its definitional relation to the
+    modified cylindrical Bessel function of the second kind.
+
+    The derivative is computed using the relations [2]_,
+
+    .. math::
+        k_n' = -k_{n-1} - \frac{n + 1}{z} k_n.
+
+        k_0' = -k_1
+
+
+    .. versionadded:: 0.18.0
+
+    References
+    ----------
+    .. [1] https://dlmf.nist.gov/10.47.E9
+    .. [2] https://dlmf.nist.gov/10.51.E5
+    .. [AS] Milton Abramowitz and Irene A. Stegun, eds.
+        Handbook of Mathematical Functions with Formulas,
+        Graphs, and Mathematical Tables. New York: Dover, 1972.
+
+    Examples
+    --------
+    The modified spherical Bessel functions of the second kind :math:`k_n`
+    accept both real and complex second argument.
+    They can return a complex type:
+
+    >>> from scipy.special import spherical_kn
+    >>> spherical_kn(0, 3+5j)
+    (0.012985785614001561+0.003354691603137546j)
+    >>> type(spherical_kn(0, 3+5j))
+    <class 'numpy.complex128'>
+
+    We can verify the relation for the derivative from the Notes
+    for :math:`n=3` in the interval :math:`[1, 2]`:
+
+    >>> import numpy as np
+    >>> x = np.arange(1.0, 2.0, 0.01)
+    >>> np.allclose(spherical_kn(3, x, True),
+    ...             - 4/x * spherical_kn(3, x) - spherical_kn(2, x))
+    True
+
+    The first few :math:`k_n` with real argument:
+
+    >>> import matplotlib.pyplot as plt
+    >>> x = np.arange(0.0, 4.0, 0.01)
+    >>> fig, ax = plt.subplots()
+    >>> ax.set_ylim(0.0, 5.0)
+    >>> ax.set_title(r'Modified spherical Bessel functions $k_n$')
+    >>> for n in np.arange(0, 4):
+    ...     ax.plot(x, spherical_kn(n, x), label=rf'$k_{n}$')
+    >>> plt.legend(loc='best')
+    >>> plt.show()
+
+    """
+    n = np.asarray(n, dtype=np.dtype("long"))
+    if derivative:
+        return _spherical_kn_d(n, z)
+    else:
+        return _spherical_kn(n, z)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_support_alternative_backends.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_support_alternative_backends.py
new file mode 100644
index 0000000000000000000000000000000000000000..b220abea4990a70549991dc411ad3ea11fd2cfdb
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_support_alternative_backends.py
@@ -0,0 +1,876 @@
+import functools
+import operator
+from collections.abc import Callable
+from dataclasses import dataclass
+from types import ModuleType
+
+import numpy as np
+from scipy._lib._array_api import (
+    array_namespace, scipy_namespace_for, is_numpy, is_dask, is_marray,
+    xp_promote, xp_capabilities, SCIPY_ARRAY_API, get_native_namespace_name
+)
+import scipy._lib.array_api_extra as xpx
+from . import _basic
+from . import _spfun_stats
+from . import _ufuncs
+
+
+@dataclass
+class _FuncInfo:
+    # Metadata describing how one `scipy.special` function is dispatched to
+    # non-NumPy array backends, plus flags consumed by the tests.
+
+    # NumPy-only function. IT MUST BE ELEMENTWISE.
+    func: Callable
+    # Number of arguments, not counting out=. For testing purposes only,
+    # because inspect.signature() just returns *args for ufuncs.
+    n_args: int
+    # @xp_capabilities decorator, for the purpose of
+    # documentation and unit testing. Omit to indicate
+    # full support for all backends.
+    xp_capabilities: Callable[[Callable], Callable] | None = None
+    # Generic implementation to fall back on if there is no native dispatch
+    # available. This is a function that accepts (main namespace, scipy namespace)
+    # and returns the final callable, or None if not available.
+    generic_impl: Callable[
+        [ModuleType, ModuleType | None], Callable | None
+    ] | None = None
+    # Handle case where a backend uses an alternative name for a function.
+    # Should map backend names to alternative function names.
+    alt_names_map: dict[str, str] | None = None
+    # Some functions only take integer arrays for some arguments.
+    int_only: tuple[bool] | None = None
+    # For testing purposes, whether tests should only use positive values for
+    # some arguments. If bool and equal to True, restrict to positive values
+    # for all arguments. To restrict only some arguments, pass a tuple of bool
+    # of the same length as the number of arguments; the ith entry controls
+    # positive_only for the ith argument. To make backend specific choices,
+    # pass in a dict mapping backend names to bool or tuple[bool].
+    positive_only: bool | tuple[bool] | dict[str, tuple[bool]] = False
+    # Some special functions are not ufuncs and ufunc-specific tests
+    # should not be applied to these.
+    is_ufunc: bool = True
+    # Some non-ufunc special functions take only Python ints for some arguments.
+    # If so, python_int_only should be a tuple of the same length as the number
+    # of arguments, with value True if the corresponding argument needs to be a
+    # Python int. Can also take a dict mapping backends to such tuples if an
+    # argument being Python int only is backend specific.
+    python_int_only: dict[str, tuple[bool]] | tuple[bool] | None = None
+    # Some functions which seem to be scalar also accept 0d arrays.
+    scalar_or_0d_only: dict[str, tuple[bool]] | tuple[bool] | None = None
+    # Some functions may not work well with very large integer valued arguments.
+    test_large_ints: bool = True
+    # Some non-ufunc special functions don't decay 0d arrays to scalar.
+    produces_0d: bool = False
+    # Whether the function uses native PyTorch (True) or falls back to
+    # NumPy/SciPy (False). In PyTorch the default dtype affects promotion when
+    # mixing integer and floating dtypes, so a NumPy/SciPy fallback can return
+    # float64 where native PyTorch would return e.g. float32; tests must account
+    # for this. Not part of xp_capabilities for now, although a warning may be
+    # added there in the future when not using native PyTorch on CPU.
+    torch_native: bool = True
+
+    @property
+    def name(self):
+        return self.func.__name__
+
+    # These are needed by @lru_cache below
+    def __hash__(self):
+        return hash(self.func)
+
+    def __eq__(self, other):
+        return isinstance(other, _FuncInfo) and self.func == other.func
+
+    @property
+    def wrapper(self):
+        # Public callable for this function, decorated with its xp_capabilities.
+        if self.name in globals():
+            # Already initialised. We are likely in a unit test.
+            # Return function potentially overridden by xpx.testing.lazy_xp_function.
+            import scipy.special
+            return getattr(scipy.special, self.name)
+
+        if SCIPY_ARRAY_API:
+            @functools.wraps(self.func)
+            def wrapped(*args, **kwargs):
+                xp = array_namespace(*args)
+                return self._wrapper_for(xp)(*args, **kwargs)
+
+            # Allow pickling the function. Normally this is done by @wraps,
+            # but in this case it doesn't work because self.func is a ufunc.
+            wrapped.__module__ = "scipy.special"
+            wrapped.__qualname__ = self.name
+            func = wrapped
+        else:
+            func = self.func
+
+        capabilities = self.xp_capabilities or xp_capabilities()
+        # In order to retain a naked ufunc when SCIPY_ARRAY_API is
+        # disabled, xp_capabilities must apply its changes in place.
+        cap_func = capabilities(func)
+        assert cap_func is func
+        return func
+
+    @functools.lru_cache(1000)
+    def _wrapper_for(self, xp):
+        # Resolve the implementation to call for namespace `xp` (lru-cached).
+        if is_numpy(xp):
+            return self.func
+
+        # If a native implementation is available, use that
+        spx = scipy_namespace_for(xp)
+        f = _get_native_func(xp, spx, self.name, alt_names_map=self.alt_names_map)
+        if f is not None:
+            return f
+
+        # If generic Array API implementation is available, use that
+        if self.generic_impl is not None:
+            f = self.generic_impl(xp, spx)
+            if f is not None:
+                return f
+
+        if is_marray(xp):
+            # Unwrap the array, apply the function on the wrapped namespace,
+            # and then re-wrap it.
+            # IMPORTANT: this only works because all functions in this module
+            # are elementwise. Otherwise, we would not be able to define a
+            # general rule for mask propagation.
+
+            _f = globals()[self.name]  # Allow nested wrapping
+            def f(*args, _f=_f, xp=xp, **kwargs):
+                data_args = [getattr(arg, 'data', arg) for arg in args]
+                out = _f(*data_args, **kwargs)
+                mask = functools.reduce(operator.or_,
+                                        (getattr(arg, 'mask', False) for arg in args))
+                return xp.asarray(out, mask=mask)
+
+            return f
+
+        if is_dask(xp):
+            # Apply the function to each block of the Dask array.
+            # IMPORTANT: map_blocks works only because all functions in this module
+            # are elementwise. It would be a grave mistake to apply this to gufuncs
+            # or any other function with reductions, as they would change their
+            # output depending on chunking!
+
+            _f = globals()[self.name]  # Allow nested wrapping
+            def f(*args, _f=_f, xp=xp, **kwargs):
+                # Hide dtype kwarg from map_blocks
+                return xp.map_blocks(functools.partial(_f, **kwargs), *args)
+
+            return f
+
+        # As a final resort, use the NumPy/SciPy implementation
+        _f = self.func
+        def f(*args, _f=_f, xp=xp, **kwargs):
+            # TODO use xpx.lazy_apply to add jax.jit support
+            # (but dtype propagation can be non-trivial)
+            args = [np.asarray(arg) for arg in args]
+            out = _f(*args, **kwargs)
+            return xp.asarray(out)
+
+        return f
+
+
+def _get_native_func(xp, spx, f_name, *, alt_names_map=None):
+    if alt_names_map is None:
+        alt_names_map = {}
+    f_name = alt_names_map.get(get_native_namespace_name(xp), f_name)
+    f = getattr(spx.special, f_name, None) if spx else None
+    if f is None and hasattr(xp, 'special'):
+        # Currently dead branch, in anticipation of 'special' Array API extension
+        # https://github.com/data-apis/array-api/issues/725
+        f = getattr(xp.special, f_name, None)
+    return f
+
+
+def _rel_entr(xp, spx):
+    def __rel_entr(x, y, *, xp=xp):
+        # https://github.com/data-apis/array-api-extra/issues/160
+        mxp = array_namespace(x._meta, y._meta) if is_dask(xp) else xp
+        x, y = xp_promote(x, y, broadcast=True, force_floating=True, xp=xp)
+        xy_pos = (x > 0) & (y > 0)
+        xy_inf = xp.isinf(x) & xp.isinf(y)
+        res = xpx.apply_where(
+            xy_pos & ~xy_inf,
+            (x, y),
+            # Note: for very large x, this can overflow.
+            lambda x, y: x * (mxp.log(x) - mxp.log(y)),
+            fill_value=xp.inf
+        )
+        res = xpx.at(res)[(x == 0) & (y >= 0)].set(0)
+        res = xpx.at(res)[xp.isnan(x) | xp.isnan(y) | (xy_pos & xy_inf)].set(xp.nan)
+        return res
+
+    return __rel_entr
+
+
+def _xlogy(xp, spx):
+    def __xlogy(x, y, *, xp=xp):
+        x, y = xp_promote(x, y, force_floating=True, xp=xp)
+        with np.errstate(divide='ignore', invalid='ignore'):
+            temp = x * xp.log(y)
+        return xp.where(x == 0., 0., temp)
+    return __xlogy
+
+
+
+def _chdtr(xp, spx):
+    # The difference between this and just using `gammainc`
+    # defined by `get_array_special_func` is that if `gammainc`
+    # isn't found, we don't want to use the SciPy version; we'll
+    # return None here and use the SciPy version of `chdtr`.
+    gammainc = _get_native_func(xp, spx, 'gammainc')
+    if gammainc is None:
+        return None
+
+    def __chdtr(v, x):
+        res = gammainc(v / 2, x / 2)  # this is almost all we need
+        # The rest can be removed when google/jax#20507 is resolved
+        mask = (v == 0) & (x > 0)  # JAX returns NaN
+        res = xp.where(mask, 1., res)
+        mask = xp.isinf(v) & xp.isinf(x)  # JAX returns 1.0
+        return xp.where(mask, xp.nan, res)
+    return __chdtr
+
+
+def _chdtrc(xp, spx):
+    # The difference between this and just using `gammaincc`
+    # defined by `get_array_special_func` is that if `gammaincc`
+    # isn't found, we don't want to use the SciPy version; we'll
+    # return None here and use the SciPy version of `chdtrc`.
+    gammaincc = _get_native_func(xp, spx, 'gammaincc')
+    if gammaincc is None:
+        return None
+
+    def __chdtrc(v, x):
+        res = xp.where(x >= 0, gammaincc(v/2, x/2), 1)
+        i_nan = ((x == 0) & (v == 0)) | xp.isnan(x) | xp.isnan(v) | (v <= 0)
+        res = xp.where(i_nan, xp.nan, res)
+        return res
+    return __chdtrc
+
+
+def _betaincc(xp, spx):
+    betainc = _get_native_func(xp, spx, 'betainc')
+    if betainc is None:
+        return None
+
+    def __betaincc(a, b, x):
+        # not perfect; might want to just rely on SciPy
+        return betainc(b, a, 1-x)
+    return __betaincc
+
+
+def _stdtr(xp, spx):
+    betainc = _get_native_func(xp, spx, 'betainc')
+    if betainc is None:
+        return None
+
+    def __stdtr(df, t):
+        x = df / (t ** 2 + df)
+        tail = betainc(df / 2, 0.5, x) / 2
+        return xp.where(t < 0, tail, 1 - tail)
+
+    return __stdtr
+
+
+def _stdtrit(xp, spx):
+    # Need either native stdtr or native betainc
+    stdtr = _get_native_func(xp, spx, 'stdtr') or _stdtr(xp, spx)
+    # If betainc is not defined, the root-finding would be done with `xp`
+    # despite `stdtr` being evaluated with SciPy/NumPy `stdtr`. Save the
+    # conversions: in this case, just evaluate `stdtrit` with SciPy/NumPy.
+    if stdtr is None:
+        return None
+
+    from scipy.optimize.elementwise import bracket_root, find_root
+
+    def __stdtrit(df, p):
+        def fun(t, df, p):  return stdtr(df, t) - p
+        res_bracket = bracket_root(fun, xp.zeros_like(p), args=(df, p))
+        res_root = find_root(fun, res_bracket.bracket, args=(df, p))
+        return res_root.x
+
+    return __stdtrit
+
+
+# Inventory of automatically dispatched functions
+# IMPORTANT: these must all be **elementwise** functions!
+
+# Shared capabilities for entries relying on `betainc`, which PyTorch doesn't
+# implement: on torch CPU we can fall back to NumPy, but on GPU it won't work.
+_needs_betainc = xp_capabilities(cpu_only=True, exceptions=["jax.numpy", "cupy"])
+
+_special_funcs = (
+    _FuncInfo(
+        _ufuncs.bdtr, 3,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        int_only=(False, True, False), torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.bdtrc, 3,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        int_only=(False, True, False), torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.bdtri, 3,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        int_only=(False, True, False), torch_native=False,
+    ),
+    _FuncInfo(_ufuncs.betainc, 3, _needs_betainc, torch_native=False),
+    _FuncInfo(_ufuncs.betaincc, 3, _needs_betainc, generic_impl=_betaincc,
+              torch_native=False),
+    _FuncInfo(
+        _ufuncs.betaincinv, 3,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        test_large_ints=False, positive_only=True, torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.betaln, 2,
+        xp_capabilities(cpu_only=True, exceptions=["cupy", "jax.numpy"]),
+        # For betaln, nan mismatches can occur at negative integer a or b of
+        # sufficiently large magnitude.
+        positive_only={"jax.numpy": True}, torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.binom, 2,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.boxcox, 2,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.boxcox1p, 2,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.cbrt, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(_ufuncs.chdtr, 2, generic_impl=_chdtr),
+    _FuncInfo(_ufuncs.chdtrc, 2, generic_impl=_chdtrc,
+              # scipy/scipy#20972
+              positive_only={"cupy": True, "jax.numpy": True, "torch": True}),
+    _FuncInfo(
+        _ufuncs.chdtri, 2,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.cosdg, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        test_large_ints=False, torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.cosm1, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.cotdg, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.ellipk, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.ellipkm1, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(_ufuncs.entr, 1),
+    _FuncInfo(_ufuncs.erf, 1),
+    _FuncInfo(_ufuncs.erfc, 1),
+    _FuncInfo(
+        _ufuncs.erfcx, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy", "torch"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(_ufuncs.erfinv, 1),
+    _FuncInfo(
+        _ufuncs.exp1, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.exp10, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.exp2, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.exprel, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.expi, 1,
+        xp_capabilities(cpu_only=True, exceptions=["cupy", "jax.numpy"]),
+        torch_native=False,
+    ),
+    _FuncInfo(_ufuncs.expit, 1),
+    _FuncInfo(
+        _ufuncs.expn, 2,
+        xp_capabilities(cpu_only=True, exceptions=["cupy", "jax.numpy"]),
+        # Inconsistent behavior for negative n. expn is not defined here without
+        # taking analytic continuation.
+        positive_only=True,
+        int_only=(True, False), test_large_ints=False,
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.fdtr, 3,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.fdtrc, 3,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.fdtri, 3,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.gamma, 1,
+        xp_capabilities(cpu_only=True, exceptions=["cupy", "jax.numpy"]),
+        torch_native=False,
+    ),
+    _FuncInfo(_ufuncs.gammainc, 2),
+    _FuncInfo(
+        _ufuncs.gammaincc, 2,
+        # google/jax#20699
+        positive_only={"jax.numpy": True},
+    ),
+    _FuncInfo(
+        _ufuncs.gammainccinv, 2,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.gammaincinv, 2,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(_ufuncs.gammaln, 1),
+    _FuncInfo(
+        _ufuncs.gammasgn, 1,
+        xp_capabilities(cpu_only=True, exceptions=["cupy", "jax.numpy"]),
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.gdtr, 3,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.gdtrc, 3,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.huber, 2,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.hyp1f1, 3,
+        xp_capabilities(cpu_only=True, exceptions=["jax.numpy"]),
+        positive_only={"jax.numpy": True}, test_large_ints=False,
+        torch_native=False,
+    ),
+    # Uncomment when jax>=0.6.1 is available in Conda for CI.
+    # (or add version requirements to xp_capabilities).
+    # _FuncInfo(
+    #     _ufuncs.hyp2f1, 4,
+    #     xp_capabilities(cpu_only=True, exceptions=["jax.numpy"]),
+    #     positive_only={"jax.numpy": True}, test_large_ints=False,
+    #     torch_native=False,
+    # ),
+    _FuncInfo(
+        _ufuncs.inv_boxcox, 2,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.inv_boxcox1p, 2,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(_ufuncs.i0, 1),
+    _FuncInfo(_ufuncs.i0e, 1),
+    _FuncInfo(_ufuncs.i1, 1),
+    _FuncInfo(_ufuncs.i1e, 1),
+    _FuncInfo(
+        _ufuncs.j0, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy", "torch"],
+            jax_jit=False,
+        ),
+        alt_names_map={"torch": "bessel_j0"}, test_large_ints=False,
+    ),
+    _FuncInfo(
+        _ufuncs.j1, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy", "torch"],
+            jax_jit=False,
+        ),
+        alt_names_map={"torch": "bessel_j1"}, test_large_ints=False,
+    ),
+    _FuncInfo(
+        _ufuncs.k0, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy", "torch"],
+            jax_jit=False,
+        ),
+        alt_names_map={"torch": "modified_bessel_k0"},
+    ),
+    _FuncInfo(
+        _ufuncs.k0e, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy", "torch"],
+            jax_jit=False,
+        ),
+        alt_names_map={"torch": "scaled_modified_bessel_k0"},
+        test_large_ints=False,
+    ),
+    _FuncInfo(
+        _ufuncs.k1, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy", "torch"],
+            jax_jit=False,
+        ),
+        alt_names_map={"torch": "modified_bessel_k1"},
+    ),
+    _FuncInfo(
+        _ufuncs.k1e, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy", "torch"],
+            jax_jit=False,
+        ),
+        alt_names_map={"torch": "scaled_modified_bessel_k1"},
+        test_large_ints=False),
+    _FuncInfo(
+        _ufuncs.kl_div, 2,
+        xp_capabilities(cpu_only=True, exceptions=["cupy", "jax.numpy"]),
+        torch_native=False,
+    ),
+    _FuncInfo(_ufuncs.log_ndtr, 1),
+    _FuncInfo(
+        _ufuncs.loggamma, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(_ufuncs.logit, 1),
+    _FuncInfo(
+        _ufuncs.lpmv, 3,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+        test_large_ints=False,
+    ),
+    _FuncInfo(
+        _spfun_stats.multigammaln, 2,
+        is_ufunc=False,
+        python_int_only={
+            "cupy": [False, True],
+            "jax.numpy": [False, True],
+            "torch": [False, True],
+        },
+        scalar_or_0d_only={
+            "array_api_strict": [False, True],
+            "numpy": [False, True],
+            "dask.array": [False, True],
+            "marray": [False, True],
+        },
+        int_only=(False, True), test_large_ints=False,
+        positive_only=True, torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.nbdtr, 3,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        int_only=(True, True, False), positive_only=True,
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.nbdtrc, 3,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        int_only=(True, True, False), positive_only=True,
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.nbdtri, 3,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        int_only=(True, True, False), positive_only=True,
+        torch_native=False,
+    ),
+    _FuncInfo(_ufuncs.ndtr, 1),
+    _FuncInfo(_ufuncs.ndtri, 1),
+    _FuncInfo(
+        _ufuncs.pdtr, 2,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        positive_only=True, torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.pdtrc, 2,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        positive_only=True, torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.pdtri, 2,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        int_only=(True, False), positive_only=True,
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.poch, 2,
+        xp_capabilities(cpu_only=True, exceptions=["cupy", "jax.numpy"]),
+        test_large_ints=False, torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.pseudo_huber, 2,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _basic.polygamma, 2, int_only=(True, False), is_ufunc=False,
+              scalar_or_0d_only={"torch": (True, False)}, produces_0d=True,
+              positive_only={"torch": (True, False), "jax.numpy": True},
+              test_large_ints=False,
+    ),
+    _FuncInfo(_ufuncs.psi, 1, alt_names_map={"jax.numpy": "digamma"}),
+    _FuncInfo(
+        _ufuncs.radian, 3,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(_ufuncs.rel_entr, 2, generic_impl=_rel_entr),
+    _FuncInfo(
+        _ufuncs.rgamma, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+    _FuncInfo(
+        _basic.sinc, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy", "torch"],
+            jax_jit=False,
+        ),
+        is_ufunc=False,
+    ),
+    _FuncInfo(
+        _ufuncs.sindg, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        test_large_ints=False, torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.spence, 1,
+        xp_capabilities(cpu_only=True, exceptions=["jax.numpy"]),
+        torch_native=False,
+    ),
+    _FuncInfo(_ufuncs.stdtr,  2, _needs_betainc, generic_impl=_stdtr,
+              torch_native=False),
+    _FuncInfo(
+        _ufuncs.stdtrit, 2,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],  # needs betainc
+            skip_backends=[("jax.numpy", "no scipy.optimize support")],
+        ),
+        generic_impl=_stdtrit, torch_native=False,
+    ),
+    _FuncInfo(
+        _ufuncs.tandg, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        test_large_ints=False, torch_native=False,
+    ),
+    _FuncInfo(_ufuncs.xlog1py, 2),
+    _FuncInfo(_ufuncs.xlogy, 2, generic_impl=_xlogy),
+    _FuncInfo(
+        _ufuncs.y0, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy", "torch"],
+            jax_jit=False,
+        ),
+        alt_names_map={"torch": "bessel_y0"}, test_large_ints=False,
+    ),
+    _FuncInfo(
+        _ufuncs.y1, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy", "torch"],
+            jax_jit=False,
+        ),
+        alt_names_map={"torch": "bessel_y1"}, test_large_ints=False,
+    ),
+    _FuncInfo(
+        _ufuncs.yn, 2,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        positive_only={"cupy": (True, False)}, int_only=(True, False),
+        test_large_ints=False, torch_native=False,
+    ),
+    _FuncInfo(
+        _basic.zeta, 2, is_ufunc=False,
+        positive_only={"jax.numpy": True, "torch": (True, False)},
+        test_large_ints=False,
+    ),
+    _FuncInfo(
+        _ufuncs.zetac, 1,
+        xp_capabilities(
+            cpu_only=True, exceptions=["cupy"],
+            jax_jit=False,
+        ),
+        torch_native=False,
+    ),
+)
+
+# Override ufuncs: rebind each function's name in this module to its wrapper.
+# When SCIPY_ARRAY_API is disabled, this exclusively updates the docstrings in place
+# and populates the xp_capabilities table, while retaining the original ufuncs.
+globals().update({nfo.func.__name__: nfo.wrapper for nfo in _special_funcs})
+# digamma is an alias for psi. Define here so it also has alternative backend
+# support. Add noqa because the linter gets confused by the sneaky way psi
+# is inserted into globals above.
+digamma = psi  # noqa: F821
+__all__ = [nfo.func.__name__ for nfo in _special_funcs] + ["digamma"]
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_testutils.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_testutils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fd1951bb415d72c55efdb0f884d56dc617d818b
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_testutils.py
@@ -0,0 +1,321 @@
+import os
+import functools
+import operator
+from scipy._lib import _pep440
+
+import numpy as np
+from numpy.testing import assert_
+import pytest
+
+import scipy.special as sc
+
+__all__ = ['with_special_errors', 'assert_func_equal', 'FuncData']
+
+
+#------------------------------------------------------------------------------
+# Check if a module is present to be used in tests
+#------------------------------------------------------------------------------
+
+class MissingModule:  # stand-in for a module that is not installed
+    def __init__(self, name):
+        self.name = name  # module name; used in check_version's skip reason
+
+
+def check_version(module, min_ver):  # returns a pytest skip/skipif mark
+    if isinstance(module, MissingModule):  # placeholder => skip outright
+        return pytest.mark.skip(reason=f"{module.name} is not installed")
+    return pytest.mark.skipif(
+        _pep440.parse(module.__version__) < _pep440.Version(min_ver),
+        reason=f"{module.__name__} version >= {min_ver} required"
+    )
+
+
+#------------------------------------------------------------------------------
+# Enable convergence and loss of precision warnings -- turn off one by one
+#------------------------------------------------------------------------------
+
+def with_special_errors(func):
+    """
+    Decorator: run `func` under ``sc.errstate(all='raise')`` so that special
+    function errors (underflow, overflow, loss of precision, etc.) raise.
+    """
+    @functools.wraps(func)
+    def wrapper(*a, **kw):
+        with sc.errstate(all='raise'):
+            res = func(*a, **kw)
+        return res
+    return wrapper
+
+
+#------------------------------------------------------------------------------
+# Comparing function values at many data points at once, with helpful
+# error reports
+#------------------------------------------------------------------------------
+
+def assert_func_equal(func, results, points, rtol=None, atol=None,
+                      param_filter=None, knownfailure=None,
+                      vectorized=True, dtype=None, nan_ok=False,
+                      ignore_inf_sign=False, distinguish_nan_and_inf=True):
+    if hasattr(points, '__next__'):
+        # it's a generator/iterator: materialize it before np.asarray
+        points = list(points)
+
+    points = np.asarray(points)
+    if points.ndim == 1:
+        points = points[:,None]  # 1-D input: one parameter per point
+    nparams = points.shape[1]
+
+    if hasattr(results, '__name__'):
+        # function: expected values are computed by calling `results`
+        data = points
+        result_columns = None
+        result_func = results
+    else:
+        # dataset: expected values are appended as extra columns
+        data = np.c_[points, results]
+        result_columns = list(range(nparams, data.shape[1]))
+        result_func = None
+
+    fdata = FuncData(func, data, list(range(nparams)),
+                     result_columns=result_columns, result_func=result_func,
+                     rtol=rtol, atol=atol, param_filter=param_filter,
+                     knownfailure=knownfailure, nan_ok=nan_ok, vectorized=vectorized,
+                     ignore_inf_sign=ignore_inf_sign,
+                     distinguish_nan_and_inf=distinguish_nan_and_inf)
+    fdata.check()  # NOTE(review): `dtype` is accepted above but not forwarded
+
+
+class FuncData:
+    """
+    Data set for checking a special function.
+
+    Parameters
+    ----------
+    func : function
+        Function to test
+    data : numpy array
+        columnar data to use for testing
+    param_columns : int or tuple of ints
+        Columns indices in which the parameters to `func` lie.
+        Can be imaginary integers to indicate that the parameter
+        should be cast to complex.
+    result_columns : int or tuple of ints, optional
+        Column indices for expected results from `func`.
+    result_func : callable, optional
+        Function to call to obtain results.
+    rtol : float, optional
+        Required relative tolerance. Default is 5*eps.
+    atol : float, optional
+        Required absolute tolerance. Default is 5*tiny.
+    param_filter : function, or tuple of functions/Nones, optional
+        Filter functions to exclude some parameter ranges.
+        If omitted, no filtering is done.
+    knownfailure : str, optional
+        Known failure error message to raise when the test is run.
+        If omitted, no exception is raised.
+    nan_ok : bool, optional
+        If nan is always an accepted result.
+    vectorized : bool, optional
+        Whether all functions passed in are vectorized.
+    ignore_inf_sign : bool, optional
+        Whether to ignore signs of infinities.
+        (Doesn't matter for complex-valued functions.)
+    distinguish_nan_and_inf : bool, optional
+        If False, a nan paired with an inf is not reported as a mismatch.
+        Setting it to False also sets ignore_inf_sign to True.
+
+    """
+
+    def __init__(self, func, data, param_columns, result_columns=None,
+                 result_func=None, rtol=None, atol=None, param_filter=None,
+                 knownfailure=None, dataname=None, nan_ok=False, vectorized=True,
+                 ignore_inf_sign=False, distinguish_nan_and_inf=True):
+        self.func = func
+        self.data = data
+        self.dataname = dataname
+        if not hasattr(param_columns, '__len__'):
+            param_columns = (param_columns,)
+        self.param_columns = tuple(param_columns)
+        if result_columns is not None:
+            if not hasattr(result_columns, '__len__'):
+                result_columns = (result_columns,)
+            self.result_columns = tuple(result_columns)
+            if result_func is not None:
+                message = "Only result_func or result_columns should be provided"
+                raise ValueError(message)
+        elif result_func is not None:
+            self.result_columns = None
+        else:
+            raise ValueError("Either result_func or result_columns should be provided")
+        self.result_func = result_func
+        self.rtol = rtol
+        self.atol = atol
+        if not hasattr(param_filter, '__len__'):
+            param_filter = (param_filter,)
+        self.param_filter = param_filter
+        self.knownfailure = knownfailure
+        self.nan_ok = nan_ok
+        self.vectorized = vectorized
+        self.ignore_inf_sign = ignore_inf_sign
+        self.distinguish_nan_and_inf = distinguish_nan_and_inf
+        if not self.distinguish_nan_and_inf:
+            self.ignore_inf_sign = True  # forced on in this mode
+
+    def get_tolerances(self, dtype):
+        if not np.issubdtype(dtype, np.inexact):
+            dtype = np.dtype(float)
+        info = np.finfo(dtype)
+        rtol, atol = self.rtol, self.atol
+        if rtol is None:
+            rtol = 5*info.eps
+        if atol is None:
+            atol = 5*info.tiny
+        return rtol, atol
+
+    def check(self, data=None, dtype=None, dtypes=None):
+        """Check the special function against the data."""
+        __tracebackhide__ = operator.methodcaller(
+            'errisinstance', AssertionError
+        )
+
+        if self.knownfailure:
+            pytest.xfail(reason=self.knownfailure)
+
+        if data is None:
+            data = self.data
+
+        if dtype is None:
+            dtype = data.dtype
+        else:
+            data = data.astype(dtype)
+
+        rtol, atol = self.get_tolerances(dtype)
+
+        # Apply given filter functions
+        if self.param_filter:
+            param_mask = np.ones((data.shape[0],), np.bool_)
+            for j, filter in zip(self.param_columns, self.param_filter):
+                if filter:
+                    param_mask &= list(filter(data[:,j]))
+            data = data[param_mask]
+
+        # Pick parameters from the correct columns
+        params = []
+        for idx, j in enumerate(self.param_columns):
+            if np.iscomplexobj(j):
+                j = int(j.imag)
+                params.append(data[:,j].astype(complex))
+            elif dtypes and idx < len(dtypes):
+                params.append(data[:, j].astype(dtypes[idx]))
+            else:
+                params.append(data[:,j])
+
+        # Helper for evaluating results
+        def eval_func_at_params(func, skip_mask=None):
+            if self.vectorized:
+                got = func(*params)
+            else:
+                got = []
+                for j in range(len(params[0])):
+                    if skip_mask is not None and skip_mask[j]:
+                        got.append(np.nan)
+                        continue
+                    got.append(func(*tuple([params[i][j] for i in range(len(params))])))
+                got = np.asarray(got)
+            if not isinstance(got, tuple):
+                got = (got,)
+            return got
+
+        # Evaluate function to be tested
+        got = eval_func_at_params(self.func)
+
+        # Grab the correct results
+        if self.result_columns is not None:
+            # Correct results passed in with the data
+            wanted = tuple([data[:,icol] for icol in self.result_columns])
+        else:
+            # Function producing correct results passed in
+            skip_mask = None
+            if self.nan_ok and len(got) == 1:
+                # Don't spend time evaluating what doesn't need to be evaluated
+                skip_mask = np.isnan(got[0])
+            wanted = eval_func_at_params(self.result_func, skip_mask=skip_mask)
+
+        # Check the validity of each output returned
+        assert_(len(got) == len(wanted))
+
+        for output_num, (x, y) in enumerate(zip(got, wanted)):
+            if np.issubdtype(x.dtype, np.complexfloating) or self.ignore_inf_sign:
+                pinf_x = np.isinf(x)  # inf sign is not compared in this branch
+                pinf_y = np.isinf(y)
+                minf_x = np.isinf(x)
+                minf_y = np.isinf(y)
+            else:
+                pinf_x = np.isposinf(x)
+                pinf_y = np.isposinf(y)
+                minf_x = np.isneginf(x)
+                minf_y = np.isneginf(y)
+            nan_x = np.isnan(x)
+            nan_y = np.isnan(y)
+
+            with np.errstate(all='ignore'):
+                abs_y = np.absolute(y)
+                abs_y[~np.isfinite(abs_y)] = 0
+                diff = np.absolute(x - y)
+                diff[~np.isfinite(diff)] = 0
+
+                rdiff = diff / np.absolute(y)
+                rdiff[~np.isfinite(rdiff)] = 0
+
+            tol_mask = (diff <= atol + rtol*abs_y)
+            pinf_mask = (pinf_x == pinf_y)
+            minf_mask = (minf_x == minf_y)
+
+            nan_mask = (nan_x == nan_y)
+
+            bad_j = ~(tol_mask & pinf_mask & minf_mask & nan_mask)
+
+            point_count = bad_j.size
+            if self.nan_ok:
+                bad_j &= ~nan_x
+                bad_j &= ~nan_y
+                point_count -= (nan_x | nan_y).sum()
+
+            if not self.distinguish_nan_and_inf and not self.nan_ok:
+                # If nan's are okay we've already covered all these cases
+                inf_x = np.isinf(x)
+                inf_y = np.isinf(y)
+                both_nonfinite = (inf_x & nan_y) | (nan_x & inf_y)
+                bad_j &= ~both_nonfinite
+                point_count -= both_nonfinite.sum()
+
+            if np.any(bad_j):
+                # Some bad results: inform what, where, and how bad
+                msg = [""]
+                msg.append(f"Max |adiff|: {diff[bad_j].max():g}")
+                msg.append(f"Max |rdiff|: {rdiff[bad_j].max():g}")
+                msg.append(f"Bad results ({np.sum(bad_j)} out of "
+                           f"{point_count}) for the following points "
+                           f"(in output {output_num}):")
+                for j in np.nonzero(bad_j)[0]:
+                    j = int(j)
+                    def fmt(x):
+                        return f'{np.array2string(x[j], precision=18):30s}'
+                    a = "  ".join(map(fmt, params))
+                    b = "  ".join(map(fmt, got))
+                    c = "  ".join(map(fmt, wanted))
+                    d = fmt(rdiff)
+                    msg.append(f"{a} => {b} != {c}  (rdiff {d})")
+                assert_(False, "\n".join(msg))
+
+    def __repr__(self):
+        """Pretty-printing"""
+        if np.any(list(map(np.iscomplexobj, self.param_columns))):
+            is_complex = " (complex)"
+        else:
+            is_complex = ""
+        if self.dataname:
+            return (f"<Data for {self.func.__name__}{is_complex}: "
+                    f"{os.path.basename(self.dataname)}>")
+        else:
+            return f"<Data for {self.func.__name__}{is_complex}>"
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_ufuncs.pyi b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_ufuncs.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..eb90cc8d13a67d2a81631e7d57803d8396dbdb4a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_ufuncs.pyi
@@ -0,0 +1,522 @@
+from typing import Any
+
+import numpy as np
+
+__all__ = [
+    'geterr',
+    'seterr',
+    'errstate',
+    'agm',
+    'airy',
+    'airye',
+    'bdtr',
+    'bdtrc',
+    'bdtri',
+    'bdtrik',
+    'bdtrin',
+    'bei',
+    'beip',
+    'ber',
+    'berp',
+    'besselpoly',
+    'beta',
+    'betainc',
+    'betaincc',
+    'betainccinv',
+    'betaincinv',
+    'betaln',
+    'binom',
+    'boxcox',
+    'boxcox1p',
+    'btdtria',
+    'btdtrib',
+    'cbrt',
+    'chdtr',
+    'chdtrc',
+    'chdtri',
+    'chdtriv',
+    'chndtr',
+    'chndtridf',
+    'chndtrinc',
+    'chndtrix',
+    'cosdg',
+    'cosm1',
+    'cotdg',
+    'dawsn',
+    'ellipe',
+    'ellipeinc',
+    'ellipj',
+    'ellipk',
+    'ellipkinc',
+    'ellipkm1',
+    'elliprc',
+    'elliprd',
+    'elliprf',
+    'elliprg',
+    'elliprj',
+    'entr',
+    'erf',
+    'erfc',
+    'erfcinv',
+    'erfcx',
+    'erfi',
+    'erfinv',
+    'eval_chebyc',
+    'eval_chebys',
+    'eval_chebyt',
+    'eval_chebyu',
+    'eval_gegenbauer',
+    'eval_genlaguerre',
+    'eval_hermite',
+    'eval_hermitenorm',
+    'eval_jacobi',
+    'eval_laguerre',
+    'eval_legendre',
+    'eval_sh_chebyt',
+    'eval_sh_chebyu',
+    'eval_sh_jacobi',
+    'eval_sh_legendre',
+    'exp1',
+    'exp10',
+    'exp2',
+    'expi',
+    'expit',
+    'expm1',
+    'expn',
+    'exprel',
+    'fdtr',
+    'fdtrc',
+    'fdtri',
+    'fdtridfd',
+    'fresnel',
+    'gamma',
+    'gammainc',
+    'gammaincc',
+    'gammainccinv',
+    'gammaincinv',
+    'gammaln',
+    'gammasgn',
+    'gdtr',
+    'gdtrc',
+    'gdtria',
+    'gdtrib',
+    'gdtrix',
+    'hankel1',
+    'hankel1e',
+    'hankel2',
+    'hankel2e',
+    'huber',
+    'hyp0f1',
+    'hyp1f1',
+    'hyp2f1',
+    'hyperu',
+    'i0',
+    'i0e',
+    'i1',
+    'i1e',
+    'inv_boxcox',
+    'inv_boxcox1p',
+    'it2i0k0',
+    'it2j0y0',
+    'it2struve0',
+    'itairy',
+    'iti0k0',
+    'itj0y0',
+    'itmodstruve0',
+    'itstruve0',
+    'iv',
+    'ive',
+    'j0',
+    'j1',
+    'jn',
+    'jv',
+    'jve',
+    'k0',
+    'k0e',
+    'k1',
+    'k1e',
+    'kei',
+    'keip',
+    'kelvin',
+    'ker',
+    'kerp',
+    'kl_div',
+    'kn',
+    'kolmogi',
+    'kolmogorov',
+    'kv',
+    'kve',
+    'log1p',
+    'log_expit',
+    'log_ndtr',
+    'log_wright_bessel',
+    'loggamma',
+    'logit',
+    'lpmv',
+    'mathieu_a',
+    'mathieu_b',
+    'mathieu_cem',
+    'mathieu_modcem1',
+    'mathieu_modcem2',
+    'mathieu_modsem1',
+    'mathieu_modsem2',
+    'mathieu_sem',
+    'modfresnelm',
+    'modfresnelp',
+    'modstruve',
+    'nbdtr',
+    'nbdtrc',
+    'nbdtri',
+    'nbdtrik',
+    'nbdtrin',
+    'ncfdtr',
+    'ncfdtri',
+    'ncfdtridfd',
+    'ncfdtridfn',
+    'ncfdtrinc',
+    'nctdtr',
+    'nctdtridf',
+    'nctdtrinc',
+    'nctdtrit',
+    'ndtr',
+    'ndtri',
+    'ndtri_exp',
+    'nrdtrimn',
+    'nrdtrisd',
+    'obl_ang1',
+    'obl_ang1_cv',
+    'obl_cv',
+    'obl_rad1',
+    'obl_rad1_cv',
+    'obl_rad2',
+    'obl_rad2_cv',
+    'owens_t',
+    'pbdv',
+    'pbvv',
+    'pbwa',
+    'pdtr',
+    'pdtrc',
+    'pdtri',
+    'pdtrik',
+    'poch',
+    'powm1',
+    'pro_ang1',
+    'pro_ang1_cv',
+    'pro_cv',
+    'pro_rad1',
+    'pro_rad1_cv',
+    'pro_rad2',
+    'pro_rad2_cv',
+    'pseudo_huber',
+    'psi',
+    'radian',
+    'rel_entr',
+    'rgamma',
+    'round',
+    'shichi',
+    'sici',
+    'sindg',
+    'smirnov',
+    'smirnovi',
+    'spence',
+    'stdtr',
+    'stdtridf',
+    'stdtrit',
+    'struve',
+    'tandg',
+    'tklmbda',
+    'voigt_profile',
+    'wofz',
+    'wright_bessel',
+    'wrightomega',
+    'xlog1py',
+    'xlogy',
+    'y0',
+    'y1',
+    'yn',
+    'yv',
+    'yve',
+    'zetac'
+]
+
+def geterr() -> dict[str, str]: ...  # type stub: signature only
+def seterr(**kwargs: str) -> dict[str, str]: ...  # type stub: signature only
+
+class errstate:  # context-manager stub: declares __enter__ / __exit__
+    def __init__(self, **kargs: str) -> None: ...
+    def __enter__(self) -> None: ...
+    def __exit__(
+        self,
+        exc_type: Any,  # Unused
+        exc_value: Any,  # Unused
+        traceback: Any,  # Unused
+    ) -> None: ...
+
+_cosine_cdf: np.ufunc
+_cosine_invcdf: np.ufunc
+_cospi: np.ufunc
+_ellip_harm: np.ufunc
+_factorial: np.ufunc
+_gen_harmonic: np.ufunc
+_igam_fac: np.ufunc
+_kolmogc: np.ufunc
+_kolmogci: np.ufunc
+_kolmogp: np.ufunc
+_lambertw: np.ufunc
+_lanczos_sum_expg_scaled: np.ufunc
+_lgam1p: np.ufunc
+_log1mexp: np.ufunc
+_log1pmx: np.ufunc
+_normalized_gen_harmonic: np.ufunc
+_riemann_zeta: np.ufunc
+_scaled_exp1: np.ufunc
+_sf_error_test_function: np.ufunc
+_sinpi: np.ufunc
+_smirnovc: np.ufunc
+_smirnovci: np.ufunc
+_smirnovp: np.ufunc
+_spherical_in: np.ufunc
+_spherical_in_d: np.ufunc
+_spherical_jn: np.ufunc
+_spherical_jn_d: np.ufunc
+_spherical_kn: np.ufunc
+_spherical_kn_d: np.ufunc
+_spherical_yn: np.ufunc
+_spherical_yn_d: np.ufunc
+_stirling2_inexact: np.ufunc
+_struve_asymp_large_z: np.ufunc
+_struve_bessel_series: np.ufunc
+_struve_power_series: np.ufunc
+_zeta: np.ufunc
+agm: np.ufunc
+airy: np.ufunc
+airye: np.ufunc
+bdtr: np.ufunc
+bdtrc: np.ufunc
+bdtri: np.ufunc
+bdtrik: np.ufunc
+bdtrin: np.ufunc
+bei: np.ufunc
+beip: np.ufunc
+ber: np.ufunc
+berp: np.ufunc
+besselpoly: np.ufunc
+beta: np.ufunc
+betainc: np.ufunc
+betaincc: np.ufunc
+betainccinv: np.ufunc
+betaincinv: np.ufunc
+betaln: np.ufunc
+binom: np.ufunc
+boxcox1p: np.ufunc
+boxcox: np.ufunc
+btdtria: np.ufunc
+btdtrib: np.ufunc
+cbrt: np.ufunc
+chdtr: np.ufunc
+chdtrc: np.ufunc
+chdtri: np.ufunc
+chdtriv: np.ufunc
+chndtr: np.ufunc
+chndtridf: np.ufunc
+chndtrinc: np.ufunc
+chndtrix: np.ufunc
+cosdg: np.ufunc
+cosm1: np.ufunc
+cotdg: np.ufunc
+dawsn: np.ufunc
+ellipe: np.ufunc
+ellipeinc: np.ufunc
+ellipj: np.ufunc
+ellipk: np.ufunc
+ellipkinc: np.ufunc
+ellipkm1: np.ufunc
+elliprc: np.ufunc
+elliprd: np.ufunc
+elliprf: np.ufunc
+elliprg: np.ufunc
+elliprj: np.ufunc
+entr: np.ufunc
+erf: np.ufunc
+erfc: np.ufunc
+erfcinv: np.ufunc
+erfcx: np.ufunc
+erfi: np.ufunc
+erfinv: np.ufunc
+eval_chebyc: np.ufunc
+eval_chebys: np.ufunc
+eval_chebyt: np.ufunc
+eval_chebyu: np.ufunc
+eval_gegenbauer: np.ufunc
+eval_genlaguerre: np.ufunc
+eval_hermite: np.ufunc
+eval_hermitenorm: np.ufunc
+eval_jacobi: np.ufunc
+eval_laguerre: np.ufunc
+eval_legendre: np.ufunc
+eval_sh_chebyt: np.ufunc
+eval_sh_chebyu: np.ufunc
+eval_sh_jacobi: np.ufunc
+eval_sh_legendre: np.ufunc
+exp10: np.ufunc
+exp1: np.ufunc
+exp2: np.ufunc
+expi: np.ufunc
+expit: np.ufunc
+expm1: np.ufunc
+expn: np.ufunc
+exprel: np.ufunc
+fdtr: np.ufunc
+fdtrc: np.ufunc
+fdtri: np.ufunc
+fdtridfd: np.ufunc
+fresnel: np.ufunc
+gamma: np.ufunc
+gammainc: np.ufunc
+gammaincc: np.ufunc
+gammainccinv: np.ufunc
+gammaincinv: np.ufunc
+gammaln: np.ufunc
+gammasgn: np.ufunc
+gdtr: np.ufunc
+gdtrc: np.ufunc
+gdtria: np.ufunc
+gdtrib: np.ufunc
+gdtrix: np.ufunc
+hankel1: np.ufunc
+hankel1e: np.ufunc
+hankel2: np.ufunc
+hankel2e: np.ufunc
+huber: np.ufunc
+hyp0f1: np.ufunc
+hyp1f1: np.ufunc
+hyp2f1: np.ufunc
+hyperu: np.ufunc
+i0: np.ufunc
+i0e: np.ufunc
+i1: np.ufunc
+i1e: np.ufunc
+inv_boxcox1p: np.ufunc
+inv_boxcox: np.ufunc
+it2i0k0: np.ufunc
+it2j0y0: np.ufunc
+it2struve0: np.ufunc
+itairy: np.ufunc
+iti0k0: np.ufunc
+itj0y0: np.ufunc
+itmodstruve0: np.ufunc
+itstruve0: np.ufunc
+iv: np.ufunc
+ive: np.ufunc
+j0: np.ufunc
+j1: np.ufunc
+jn: np.ufunc
+jv: np.ufunc
+jve: np.ufunc
+k0: np.ufunc
+k0e: np.ufunc
+k1: np.ufunc
+k1e: np.ufunc
+kei: np.ufunc
+keip: np.ufunc
+kelvin: np.ufunc
+ker: np.ufunc
+kerp: np.ufunc
+kl_div: np.ufunc
+kn: np.ufunc
+kolmogi: np.ufunc
+kolmogorov: np.ufunc
+kv: np.ufunc
+kve: np.ufunc
+log1p: np.ufunc
+log_expit: np.ufunc
+log_ndtr: np.ufunc
+log_wright_bessel: np.ufunc
+loggamma: np.ufunc
+logit: np.ufunc
+lpmv: np.ufunc
+mathieu_a: np.ufunc
+mathieu_b: np.ufunc
+mathieu_cem: np.ufunc
+mathieu_modcem1: np.ufunc
+mathieu_modcem2: np.ufunc
+mathieu_modsem1: np.ufunc
+mathieu_modsem2: np.ufunc
+mathieu_sem: np.ufunc
+modfresnelm: np.ufunc
+modfresnelp: np.ufunc
+modstruve: np.ufunc
+nbdtr: np.ufunc
+nbdtrc: np.ufunc
+nbdtri: np.ufunc
+nbdtrik: np.ufunc
+nbdtrin: np.ufunc
+ncfdtr: np.ufunc
+ncfdtri: np.ufunc
+ncfdtridfd: np.ufunc
+ncfdtridfn: np.ufunc
+ncfdtrinc: np.ufunc
+nctdtr: np.ufunc
+nctdtridf: np.ufunc
+nctdtrinc: np.ufunc
+nctdtrit: np.ufunc
+ndtr: np.ufunc
+ndtri: np.ufunc
+ndtri_exp: np.ufunc
+nrdtrimn: np.ufunc
+nrdtrisd: np.ufunc
+obl_ang1: np.ufunc
+obl_ang1_cv: np.ufunc
+obl_cv: np.ufunc
+obl_rad1: np.ufunc
+obl_rad1_cv: np.ufunc
+obl_rad2: np.ufunc
+obl_rad2_cv: np.ufunc
+owens_t: np.ufunc
+pbdv: np.ufunc
+pbvv: np.ufunc
+pbwa: np.ufunc
+pdtr: np.ufunc
+pdtrc: np.ufunc
+pdtri: np.ufunc
+pdtrik: np.ufunc
+poch: np.ufunc
+powm1: np.ufunc
+pro_ang1: np.ufunc
+pro_ang1_cv: np.ufunc
+pro_cv: np.ufunc
+pro_rad1: np.ufunc
+pro_rad1_cv: np.ufunc
+pro_rad2: np.ufunc
+pro_rad2_cv: np.ufunc
+pseudo_huber: np.ufunc
+psi: np.ufunc
+radian: np.ufunc
+rel_entr: np.ufunc
+rgamma: np.ufunc
+round: np.ufunc
+shichi: np.ufunc
+sici: np.ufunc
+sindg: np.ufunc
+smirnov: np.ufunc
+smirnovi: np.ufunc
+spence: np.ufunc
+stdtr: np.ufunc
+stdtridf: np.ufunc
+stdtrit: np.ufunc
+struve: np.ufunc
+tandg: np.ufunc
+tklmbda: np.ufunc
+voigt_profile: np.ufunc
+wofz: np.ufunc
+wright_bessel: np.ufunc
+wrightomega: np.ufunc
+xlog1py: np.ufunc
+xlogy: np.ufunc
+y0: np.ufunc
+y1: np.ufunc
+yn: np.ufunc
+yv: np.ufunc
+yve: np.ufunc
+zetac: np.ufunc
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_ufuncs.pyx b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_ufuncs.pyx
new file mode 100644
index 0000000000000000000000000000000000000000..77cdf192bde3d5e750307603c1c9d7c717f9b139
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_ufuncs.pyx
@@ -0,0 +1,13240 @@
+# This file is automatically generated by _generate_pyx.py.
+# Do not edit manually!
+
+from libc.math cimport NAN
+
+include "_ufuncs_extra_code_common.pxi"
+include "_ufuncs_extra_code.pxi"
+__all__ = ['agm', 'bdtr', 'bdtrc', 'bdtri', 'bdtrik', 'bdtrin', 'betainc', 'betaincc', 'betainccinv', 'betaincinv', 'boxcox', 'boxcox1p', 'btdtria', 'btdtrib', 'chdtr', 'chdtrc', 'chdtri', 'chdtriv', 'chndtr', 'chndtridf', 'chndtrinc', 'chndtrix', 'elliprc', 'elliprd', 'elliprf', 'elliprg', 'elliprj', 'entr', 'erfcinv', 'erfinv', 'eval_chebyc', 'eval_chebys', 'eval_chebyt', 'eval_chebyu', 'eval_gegenbauer', 'eval_genlaguerre', 'eval_hermite', 'eval_hermitenorm', 'eval_jacobi', 'eval_laguerre', 'eval_legendre', 'eval_sh_chebyt', 'eval_sh_chebyu', 'eval_sh_jacobi', 'eval_sh_legendre', 'expn', 'fdtr', 'fdtrc', 'fdtri', 'fdtridfd', 'gdtr', 'gdtrc', 'gdtria', 'gdtrib', 'gdtrix', 'huber', 'hyp0f1', 'hyp1f1', 'hyperu', 'inv_boxcox', 'inv_boxcox1p', 'kl_div', 'kn', 'kolmogi', 'kolmogorov', 'lpmv', 'nbdtr', 'nbdtrc', 'nbdtri', 'nbdtrik', 'nbdtrin', 'ncfdtr', 'ncfdtri', 'ncfdtridfd', 'ncfdtridfn', 'ncfdtrinc', 'nctdtr', 'nctdtridf', 'nctdtrinc', 'nctdtrit', 'ndtri', 'ndtri_exp', 'nrdtrimn', 'nrdtrisd', 'owens_t', 'pdtr', 'pdtrc', 'pdtri', 'pdtrik', 'poch', 'powm1', 'pseudo_huber', 'rel_entr', 'round', 'shichi', 'sici', 'smirnov', 'smirnovi', 'spence', 'stdtr', 'stdtridf', 'stdtrit', 'tklmbda', 'wrightomega', 'yn', 'geterr', 'seterr', 'errstate', 'jn', 'airy', 'airye', 'bei', 'beip', 'ber', 'berp', 'binom', 'exp1', 'expi', 'expit', 'exprel', 'gamma', 'gammaln', 'hankel1', 'hankel1e', 'hankel2', 'hankel2e', 'hyp2f1', 'it2i0k0', 'it2j0y0', 'it2struve0', 'itairy', 'iti0k0', 'itj0y0', 'itmodstruve0', 'itstruve0', 'iv', 'ive', 'jv', 'jve', 'kei', 'keip', 'kelvin', 'ker', 'kerp', 'kv', 'kve', 'log_expit', 'log_wright_bessel', 'loggamma', 'logit', 'mathieu_a', 'mathieu_b', 'mathieu_cem', 'mathieu_modcem1', 'mathieu_modcem2', 'mathieu_modsem1', 'mathieu_modsem2', 'mathieu_sem', 'modfresnelm', 'modfresnelp', 'obl_ang1', 'obl_ang1_cv', 'obl_cv', 'obl_rad1', 'obl_rad1_cv', 'obl_rad2', 'obl_rad2_cv', 'pbdv', 'pbvv', 'pbwa', 'pro_ang1', 'pro_ang1_cv', 'pro_cv', 'pro_rad1', 'pro_rad1_cv', 
'pro_rad2', 'pro_rad2_cv', 'psi', 'rgamma', 'wright_bessel', 'yv', 'yve', 'zetac', 'sindg', 'cosdg', 'tandg', 'cotdg', 'i0', 'i0e', 'i1', 'i1e', 'k0', 'k0e', 'k1', 'k1e', 'y0', 'y1', 'j0', 'j1', 'struve', 'modstruve', 'beta', 'betaln', 'besselpoly', 'gammaln', 'gammasgn', 'cbrt', 'radian', 'cosm1', 'gammainc', 'gammaincinv', 'gammaincc', 'gammainccinv', 'fresnel', 'ellipe', 'ellipeinc', 'ellipk', 'ellipkinc', 'ellipkm1', 'ellipj', 'erf', 'erfc', 'erfcx', 'erfi', 'voigt_profile', 'wofz', 'dawsn', 'ndtr', 'log_ndtr', 'exp2', 'exp10', 'expm1', 'log1p', 'xlogy', 'xlog1py']  # NOTE(review): 'gammaln' is listed twice (after 'gamma' and after 'besselpoly'); any cleanup belongs in the generator, not in this generated file.
+cdef void loop_D_DDDD__As_DDDD_D(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: per element, call a kernel of four double complex arguments; inputs and output are stored as double complex.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *ip2 = args[2]
+    cdef char *ip3 = args[3]
+    cdef char *op0 = args[4]  # the output cursor follows the four input cursors in args
+    cdef double complex ov0
+    for i in range(n):
+        ov0 = (<double complex(*)(double complex, double complex, double complex, double complex) noexcept nogil>func)(<double complex>(<double complex*>ip0)[0], <double complex>(<double complex*>ip1)[0], <double complex>(<double complex*>ip2)[0], <double complex>(<double complex*>ip3)[0])
+        (<double complex *>op0)[0] = <double complex>ov0
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        ip2 += steps[2]
+        ip3 += steps[3]
+        op0 += steps[4]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_D_DDDD__As_FFFF_F(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: same four-argument double complex kernel, but inputs and output are stored as float complex.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *ip2 = args[2]
+    cdef char *ip3 = args[3]
+    cdef char *op0 = args[4]  # the output cursor follows the four input cursors in args
+    cdef double complex ov0
+    for i in range(n):
+        ov0 = (<double complex(*)(double complex, double complex, double complex, double complex) noexcept nogil>func)(<double complex>(<float complex*>ip0)[0], <double complex>(<float complex*>ip1)[0], <double complex>(<float complex*>ip2)[0], <double complex>(<float complex*>ip3)[0])  # inputs are read as float complex and widened to double complex for the call
+        (<float complex *>op0)[0] = <float complex>ov0  # result is narrowed back to float complex on store
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        ip2 += steps[2]
+        ip3 += steps[3]
+        op0 += steps[4]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_D_DDD__As_DDD_D(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: per element, call a kernel of three double complex arguments; inputs and output are stored as double complex.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *ip2 = args[2]
+    cdef char *op0 = args[3]  # the output cursor follows the three input cursors in args
+    cdef double complex ov0
+    for i in range(n):
+        ov0 = (<double complex(*)(double complex, double complex, double complex) noexcept nogil>func)(<double complex>(<double complex*>ip0)[0], <double complex>(<double complex*>ip1)[0], <double complex>(<double complex*>ip2)[0])
+        (<double complex *>op0)[0] = <double complex>ov0
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        ip2 += steps[2]
+        op0 += steps[3]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_D_DDD__As_FFF_F(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: same three-argument double complex kernel, but inputs and output are stored as float complex.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *ip2 = args[2]
+    cdef char *op0 = args[3]  # the output cursor follows the three input cursors in args
+    cdef double complex ov0
+    for i in range(n):
+        ov0 = (<double complex(*)(double complex, double complex, double complex) noexcept nogil>func)(<double complex>(<float complex*>ip0)[0], <double complex>(<float complex*>ip1)[0], <double complex>(<float complex*>ip2)[0])  # inputs are read as float complex and widened to double complex for the call
+        (<float complex *>op0)[0] = <float complex>ov0  # result is narrowed back to float complex on store
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        ip2 += steps[2]
+        op0 += steps[3]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_D_DD__As_DD_D(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: per element, call a kernel of two double complex arguments; inputs and output are stored as double complex.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *op0 = args[2]  # the output cursor follows the two input cursors in args
+    cdef double complex ov0
+    for i in range(n):
+        ov0 = (<double complex(*)(double complex, double complex) noexcept nogil>func)(<double complex>(<double complex*>ip0)[0], <double complex>(<double complex*>ip1)[0])
+        (<double complex *>op0)[0] = <double complex>ov0
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        op0 += steps[2]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_D_DD__As_FF_F(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: same two-argument double complex kernel, but inputs and output are stored as float complex.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *op0 = args[2]  # the output cursor follows the two input cursors in args
+    cdef double complex ov0
+    for i in range(n):
+        ov0 = (<double complex(*)(double complex, double complex) noexcept nogil>func)(<double complex>(<float complex*>ip0)[0], <double complex>(<float complex*>ip1)[0])  # inputs are read as float complex and widened to double complex for the call
+        (<float complex *>op0)[0] = <float complex>ov0  # result is narrowed back to float complex on store
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        op0 += steps[2]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_D_D__As_D_D(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: per element, call a kernel of one double complex argument; input and output are stored as double complex.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *op0 = args[1]  # the output cursor follows the single input cursor in args
+    cdef double complex ov0
+    for i in range(n):
+        ov0 = (<double complex(*)(double complex) noexcept nogil>func)(<double complex>(<double complex*>ip0)[0])
+        (<double complex *>op0)[0] = <double complex>ov0
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        op0 += steps[1]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_D_D__As_F_F(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: same one-argument double complex kernel, but input and output are stored as float complex.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *op0 = args[1]  # the output cursor follows the single input cursor in args
+    cdef double complex ov0
+    for i in range(n):
+        ov0 = (<double complex(*)(double complex) noexcept nogil>func)(<double complex>(<float complex*>ip0)[0])  # input is read as float complex and widened to double complex for the call
+        (<float complex *>op0)[0] = <float complex>ov0  # result is narrowed back to float complex on store
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        op0 += steps[1]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_D_dD__As_dD_D(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: per element, call a (double, double complex) -> double complex kernel; operands are stored at those same widths.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]  # real-valued input
+    cdef char *ip1 = args[1]  # complex-valued input
+    cdef char *op0 = args[2]
+    cdef double complex ov0
+    for i in range(n):
+        ov0 = (<double complex(*)(double, double complex) noexcept nogil>func)(<double>(<double*>ip0)[0], <double complex>(<double complex*>ip1)[0])
+        (<double complex *>op0)[0] = <double complex>ov0
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        op0 += steps[2]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_D_dD__As_fF_F(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: same (double, double complex) kernel, but operands are stored as float / float complex.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]  # real-valued input
+    cdef char *ip1 = args[1]  # complex-valued input
+    cdef char *op0 = args[2]
+    cdef double complex ov0
+    for i in range(n):
+        ov0 = (<double complex(*)(double, double complex) noexcept nogil>func)(<double>(<float*>ip0)[0], <double complex>(<float complex*>ip1)[0])  # single-precision inputs are widened to double precision for the call
+        (<float complex *>op0)[0] = <float complex>ov0  # result is narrowed back to float complex on store
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        op0 += steps[2]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_D_ddD__As_ddD_D(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: per element, call a (double, double, double complex) -> double complex kernel; operands are stored at those same widths.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *ip2 = args[2]  # the only complex-valued input
+    cdef char *op0 = args[3]
+    cdef double complex ov0
+    for i in range(n):
+        ov0 = (<double complex(*)(double, double, double complex) noexcept nogil>func)(<double>(<double*>ip0)[0], <double>(<double*>ip1)[0], <double complex>(<double complex*>ip2)[0])
+        (<double complex *>op0)[0] = <double complex>ov0
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        ip2 += steps[2]
+        op0 += steps[3]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_D_ddD__As_ffF_F(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: same (double, double, double complex) kernel, but operands are stored as float / float / float complex.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *ip2 = args[2]  # the only complex-valued input
+    cdef char *op0 = args[3]
+    cdef double complex ov0
+    for i in range(n):
+        ov0 = (<double complex(*)(double, double, double complex) noexcept nogil>func)(<double>(<float*>ip0)[0], <double>(<float*>ip1)[0], <double complex>(<float complex*>ip2)[0])  # single-precision inputs are widened to double precision for the call
+        (<float complex *>op0)[0] = <float complex>ov0  # result is narrowed back to float complex on store
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        ip2 += steps[2]
+        op0 += steps[3]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_D_dddD__As_dddD_D(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: per element, call a (double, double, double, double complex) -> double complex kernel; operands are stored at those same widths.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *ip2 = args[2]
+    cdef char *ip3 = args[3]  # the only complex-valued input
+    cdef char *op0 = args[4]
+    cdef double complex ov0
+    for i in range(n):
+        ov0 = (<double complex(*)(double, double, double, double complex) noexcept nogil>func)(<double>(<double*>ip0)[0], <double>(<double*>ip1)[0], <double>(<double*>ip2)[0], <double complex>(<double complex*>ip3)[0])
+        (<double complex *>op0)[0] = <double complex>ov0
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        ip2 += steps[2]
+        ip3 += steps[3]
+        op0 += steps[4]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_D_dddD__As_fffF_F(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: same (double, double, double, double complex) kernel, but operands are stored as float / float complex.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *ip2 = args[2]
+    cdef char *ip3 = args[3]  # the only complex-valued input
+    cdef char *op0 = args[4]
+    cdef double complex ov0
+    for i in range(n):
+        ov0 = (<double complex(*)(double, double, double, double complex) noexcept nogil>func)(<double>(<float*>ip0)[0], <double>(<float*>ip1)[0], <double>(<float*>ip2)[0], <double complex>(<float complex*>ip3)[0])  # single-precision inputs are widened to double precision for the call
+        (<float complex *>op0)[0] = <float complex>ov0  # result is narrowed back to float complex on store
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        ip2 += steps[2]
+        ip3 += steps[3]
+        op0 += steps[4]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_d_d__As_d_d(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: per element, call a double -> double kernel; input and output are stored as double.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *op0 = args[1]  # the output cursor follows the single input cursor in args
+    cdef double ov0
+    for i in range(n):
+        ov0 = (<double(*)(double) noexcept nogil>func)(<double>(<double*>ip0)[0])
+        (<double *>op0)[0] = <double>ov0
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        op0 += steps[1]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_d_d__As_f_f(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: same double -> double kernel, but input and output are stored as float.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *op0 = args[1]  # the output cursor follows the single input cursor in args
+    cdef double ov0
+    for i in range(n):
+        ov0 = (<double(*)(double) noexcept nogil>func)(<double>(<float*>ip0)[0])  # input is read as float and widened to double for the call
+        (<float *>op0)[0] = <float>ov0  # result is narrowed back to float on store
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        op0 += steps[1]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_d_dd__As_dd_d(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: per element, call a (double, double) -> double kernel; inputs and output are stored as double.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *op0 = args[2]  # the output cursor follows the two input cursors in args
+    cdef double ov0
+    for i in range(n):
+        ov0 = (<double(*)(double, double) noexcept nogil>func)(<double>(<double*>ip0)[0], <double>(<double*>ip1)[0])
+        (<double *>op0)[0] = <double>ov0
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        op0 += steps[2]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_d_dd__As_ff_f(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: same (double, double) -> double kernel, but inputs and output are stored as float.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *op0 = args[2]  # the output cursor follows the two input cursors in args
+    cdef double ov0
+    for i in range(n):
+        ov0 = (<double(*)(double, double) noexcept nogil>func)(<double>(<float*>ip0)[0], <double>(<float*>ip1)[0])  # inputs are read as float and widened to double for the call
+        (<float *>op0)[0] = <float>ov0  # result is narrowed back to float on store
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        op0 += steps[2]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_d_ddd__As_ddd_d(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: per element, call a (double, double, double) -> double kernel; inputs and output are stored as double.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *ip2 = args[2]
+    cdef char *op0 = args[3]  # the output cursor follows the three input cursors in args
+    cdef double ov0
+    for i in range(n):
+        ov0 = (<double(*)(double, double, double) noexcept nogil>func)(<double>(<double*>ip0)[0], <double>(<double*>ip1)[0], <double>(<double*>ip2)[0])
+        (<double *>op0)[0] = <double>ov0
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        ip2 += steps[2]
+        op0 += steps[3]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_d_ddd__As_fff_f(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: same (double, double, double) -> double kernel, but inputs and output are stored as float.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *ip2 = args[2]
+    cdef char *op0 = args[3]  # the output cursor follows the three input cursors in args
+    cdef double ov0
+    for i in range(n):
+        ov0 = (<double(*)(double, double, double) noexcept nogil>func)(<double>(<float*>ip0)[0], <double>(<float*>ip1)[0], <double>(<float*>ip2)[0])  # inputs are read as float and widened to double for the call
+        (<float *>op0)[0] = <float>ov0  # result is narrowed back to float on store
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        ip2 += steps[2]
+        op0 += steps[3]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_d_dddd__As_dddd_d(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: per element, call a four-double -> double kernel; inputs and output are stored as double.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *ip2 = args[2]
+    cdef char *ip3 = args[3]
+    cdef char *op0 = args[4]  # the output cursor follows the four input cursors in args
+    cdef double ov0
+    for i in range(n):
+        ov0 = (<double(*)(double, double, double, double) noexcept nogil>func)(<double>(<double*>ip0)[0], <double>(<double*>ip1)[0], <double>(<double*>ip2)[0], <double>(<double*>ip3)[0])
+        (<double *>op0)[0] = <double>ov0
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        ip2 += steps[2]
+        ip3 += steps[3]
+        op0 += steps[4]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_d_dddd__As_ffff_f(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: same four-double -> double kernel, but inputs and output are stored as float.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *ip2 = args[2]
+    cdef char *ip3 = args[3]
+    cdef char *op0 = args[4]  # the output cursor follows the four input cursors in args
+    cdef double ov0
+    for i in range(n):
+        ov0 = (<double(*)(double, double, double, double) noexcept nogil>func)(<double>(<float*>ip0)[0], <double>(<float*>ip1)[0], <double>(<float*>ip2)[0], <double>(<float*>ip3)[0])  # inputs are read as float and widened to double for the call
+        (<float *>op0)[0] = <float>ov0  # result is narrowed back to float on store
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        ip2 += steps[2]
+        ip3 += steps[3]
+        op0 += steps[4]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_d_ddddddd__As_ddddddd_d(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: per element, call a seven-double -> double kernel; inputs and output are stored as double.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *ip2 = args[2]
+    cdef char *ip3 = args[3]
+    cdef char *ip4 = args[4]
+    cdef char *ip5 = args[5]
+    cdef char *ip6 = args[6]
+    cdef char *op0 = args[7]  # the output cursor follows the seven input cursors in args
+    cdef double ov0
+    for i in range(n):
+        ov0 = (<double(*)(double, double, double, double, double, double, double) noexcept nogil>func)(<double>(<double*>ip0)[0], <double>(<double*>ip1)[0], <double>(<double*>ip2)[0], <double>(<double*>ip3)[0], <double>(<double*>ip4)[0], <double>(<double*>ip5)[0], <double>(<double*>ip6)[0])
+        (<double *>op0)[0] = <double>ov0
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        ip2 += steps[2]
+        ip3 += steps[3]
+        ip4 += steps[4]
+        ip5 += steps[5]
+        ip6 += steps[6]
+        op0 += steps[7]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_d_ddddddd__As_fffffff_f(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: same seven-double -> double kernel, but inputs and output are stored as float.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *ip2 = args[2]
+    cdef char *ip3 = args[3]
+    cdef char *ip4 = args[4]
+    cdef char *ip5 = args[5]
+    cdef char *ip6 = args[6]
+    cdef char *op0 = args[7]  # the output cursor follows the seven input cursors in args
+    cdef double ov0
+    for i in range(n):
+        ov0 = (<double(*)(double, double, double, double, double, double, double) noexcept nogil>func)(<double>(<float*>ip0)[0], <double>(<float*>ip1)[0], <double>(<float*>ip2)[0], <double>(<float*>ip3)[0], <double>(<float*>ip4)[0], <double>(<float*>ip5)[0], <double>(<float*>ip6)[0])  # inputs are read as float and widened to double for the call
+        (<float *>op0)[0] = <float>ov0  # result is narrowed back to float on store
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        ip2 += steps[2]
+        ip3 += steps[3]
+        ip4 += steps[4]
+        ip5 += steps[5]
+        ip6 += steps[6]
+        op0 += steps[7]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_d_ddiiddd__As_ddllddd_d(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: kernel takes (double, double, int, int, double, double, double); the two int arguments are stored as long and range-checked before narrowing.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name used for sf_error.error and sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *ip2 = args[2]  # first integer input, read as long
+    cdef char *ip3 = args[3]  # second integer input, read as long
+    cdef char *ip4 = args[4]
+    cdef char *ip5 = args[5]
+    cdef char *ip6 = args[6]
+    cdef char *op0 = args[7]
+    cdef double ov0
+    for i in range(n):
+        if <int>(<long*>ip2)[0] == (<long*>ip2)[0] and <int>(<long*>ip3)[0] == (<long*>ip3)[0]:  # call the kernel only when both long values survive a round-trip through int
+            ov0 = (<double(*)(double, double, int, int, double, double, double) noexcept nogil>func)(<double>(<double*>ip0)[0], <double>(<double*>ip1)[0], <int>(<long*>ip2)[0], <int>(<long*>ip3)[0], <double>(<double*>ip4)[0], <double>(<double*>ip5)[0], <double>(<double*>ip6)[0])
+        else:
+            sf_error.error(func_name, sf_error.DOMAIN, "invalid input argument")  # integer input does not fit in int: report a DOMAIN error instead of calling the kernel
+            ov0 = <double>NAN  # the rejected element yields NaN
+        (<double *>op0)[0] = <double>ov0
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        ip2 += steps[2]
+        ip3 += steps[3]
+        ip4 += steps[4]
+        ip5 += steps[5]
+        ip6 += steps[6]
+        op0 += steps[7]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_d_ddp_d_As_ddp_dd(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: kernel takes (double, double, Py_ssize_t) plus an output pointer; three inputs produce two double outputs.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *ip2 = args[2]  # integer input, read as Py_ssize_t
+    cdef char *op0 = args[3]  # first output: the kernel's return value
+    cdef char *op1 = args[4]  # second output: the value the kernel writes through its pointer argument
+    cdef double ov0
+    cdef double ov1
+    for i in range(n):
+        ov0 = (<double(*)(double, double, Py_ssize_t, double *) noexcept nogil>func)(<double>(<double*>ip0)[0], <double>(<double*>ip1)[0], <Py_ssize_t>(<Py_ssize_t*>ip2)[0], &ov1)  # the return value becomes ov0; ov1 is filled in by the kernel via &ov1
+        (<double *>op0)[0] = <double>ov0
+        (<double *>op1)[0] = <double>ov1
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        ip2 += steps[2]
+        op0 += steps[3]
+        op1 += steps[4]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_d_dpd__As_dpd_d(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: per element, call a (double, Py_ssize_t, double) -> double kernel; operands are stored at those same types.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]  # integer input, read as Py_ssize_t
+    cdef char *ip2 = args[2]
+    cdef char *op0 = args[3]
+    cdef double ov0
+    for i in range(n):
+        ov0 = (<double(*)(double, Py_ssize_t, double) noexcept nogil>func)(<double>(<double*>ip0)[0], <Py_ssize_t>(<Py_ssize_t*>ip1)[0], <double>(<double*>ip2)[0])  # the integer is passed at full Py_ssize_t width, so no range check is needed
+        (<double *>op0)[0] = <double>ov0
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        ip2 += steps[2]
+        op0 += steps[3]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_d_pd__As_pd_d(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: per element, call a (Py_ssize_t, double) -> double kernel; operands are stored at those same types.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]  # integer input, read as Py_ssize_t
+    cdef char *ip1 = args[1]
+    cdef char *op0 = args[2]
+    cdef double ov0
+    for i in range(n):
+        ov0 = (<double(*)(Py_ssize_t, double) noexcept nogil>func)(<Py_ssize_t>(<Py_ssize_t*>ip0)[0], <double>(<double*>ip1)[0])  # the integer is passed at full Py_ssize_t width, so no range check is needed
+        (<double *>op0)[0] = <double>ov0
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        op0 += steps[2]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_d_pdd__As_pdd_d(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: per element, call a (Py_ssize_t, double, double) -> double kernel; operands are stored at those same types.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]  # integer input, read as Py_ssize_t
+    cdef char *ip1 = args[1]
+    cdef char *ip2 = args[2]
+    cdef char *op0 = args[3]
+    cdef double ov0
+    for i in range(n):
+        ov0 = (<double(*)(Py_ssize_t, double, double) noexcept nogil>func)(<Py_ssize_t>(<Py_ssize_t*>ip0)[0], <double>(<double*>ip1)[0], <double>(<double*>ip2)[0])  # the integer is passed at full Py_ssize_t width, so no range check is needed
+        (<double *>op0)[0] = <double>ov0
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        ip2 += steps[2]
+        op0 += steps[3]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_d_pddd__As_pddd_d(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: per element, call a (Py_ssize_t, double, double, double) -> double kernel; operands are stored at those same types.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]  # integer input, read as Py_ssize_t
+    cdef char *ip1 = args[1]
+    cdef char *ip2 = args[2]
+    cdef char *ip3 = args[3]
+    cdef char *op0 = args[4]
+    cdef double ov0
+    for i in range(n):
+        ov0 = (<double(*)(Py_ssize_t, double, double, double) noexcept nogil>func)(<Py_ssize_t>(<Py_ssize_t*>ip0)[0], <double>(<double*>ip1)[0], <double>(<double*>ip2)[0], <double>(<double*>ip3)[0])  # the integer is passed at full Py_ssize_t width, so no range check is needed
+        (<double *>op0)[0] = <double>ov0
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        ip2 += steps[2]
+        ip3 += steps[3]
+        op0 += steps[4]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_d_ppd__As_ppd_d(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: per element, call a (Py_ssize_t, Py_ssize_t, double) -> double kernel; operands are stored at those same types.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]  # first integer input, read as Py_ssize_t
+    cdef char *ip1 = args[1]  # second integer input, read as Py_ssize_t
+    cdef char *ip2 = args[2]
+    cdef char *op0 = args[3]
+    cdef double ov0
+    for i in range(n):
+        ov0 = (<double(*)(Py_ssize_t, Py_ssize_t, double) noexcept nogil>func)(<Py_ssize_t>(<Py_ssize_t*>ip0)[0], <Py_ssize_t>(<Py_ssize_t*>ip1)[0], <double>(<double*>ip2)[0])  # the integers are passed at full Py_ssize_t width, so no range check is needed
+        (<double *>op0)[0] = <double>ov0
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        ip2 += steps[2]
+        op0 += steps[3]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_f_f__As_f_f(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: per element, call a float -> float kernel; no widening to double takes place.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *op0 = args[1]  # the output cursor follows the single input cursor in args
+    cdef float ov0
+    for i in range(n):
+        ov0 = (<float(*)(float) noexcept nogil>func)(<float>(<float*>ip0)[0])  # the kernel itself is single precision
+        (<float *>op0)[0] = <float>ov0
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        op0 += steps[1]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_f_ff__As_ff_f(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: per element, call a (float, float) -> float kernel; no widening to double takes place.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *op0 = args[2]  # the output cursor follows the two input cursors in args
+    cdef float ov0
+    for i in range(n):
+        ov0 = (<float(*)(float, float) noexcept nogil>func)(<float>(<float*>ip0)[0], <float>(<float*>ip1)[0])  # the kernel itself is single precision
+        (<float *>op0)[0] = <float>ov0
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        op0 += steps[2]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_f_fff__As_fff_f(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: per element, call a (float, float, float) -> float kernel; no widening to double takes place.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *ip2 = args[2]
+    cdef char *op0 = args[3]  # the output cursor follows the three input cursors in args
+    cdef float ov0
+    for i in range(n):
+        ov0 = (<float(*)(float, float, float) noexcept nogil>func)(<float>(<float*>ip0)[0], <float>(<float*>ip1)[0], <float>(<float*>ip2)[0])  # the kernel itself is single precision
+        (<float *>op0)[0] = <float>ov0
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        ip2 += steps[2]
+        op0 += steps[3]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_f_ffff__As_ffff_f(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: per element, call a four-float -> float kernel; no widening to double takes place.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *ip1 = args[1]
+    cdef char *ip2 = args[2]
+    cdef char *ip3 = args[3]
+    cdef char *op0 = args[4]  # the output cursor follows the four input cursors in args
+    cdef float ov0
+    for i in range(n):
+        ov0 = (<float(*)(float, float, float, float) noexcept nogil>func)(<float>(<float*>ip0)[0], <float>(<float*>ip1)[0], <float>(<float*>ip2)[0], <float>(<float*>ip3)[0])  # the kernel itself is single precision
+        (<float *>op0)[0] = <float>ov0
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        ip1 += steps[1]
+        ip2 += steps[2]
+        ip3 += steps[3]
+        op0 += steps[4]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_i_D_DD_As_D_DD(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: one double complex input, two double complex outputs returned through pointer arguments of an int-returning kernel.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *op0 = args[1]
+    cdef char *op1 = args[2]
+    cdef double complex ov0
+    cdef double complex ov1
+    for i in range(n):
+        (<int(*)(double complex, double complex *, double complex *) noexcept nogil>func)(<double complex>(<double complex*>ip0)[0], &ov0, &ov1)  # the kernel's int return value is discarded; both results come back through &ov0 and &ov1
+        (<double complex *>op0)[0] = <double complex>ov0
+        (<double complex *>op1)[0] = <double complex>ov1
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        op0 += steps[1]
+        op1 += steps[2]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_i_D_DD_As_F_FF(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: same int-returning double complex kernel with two pointer outputs, but operands are stored as float complex.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *op0 = args[1]
+    cdef char *op1 = args[2]
+    cdef double complex ov0
+    cdef double complex ov1
+    for i in range(n):
+        (<int(*)(double complex, double complex *, double complex *) noexcept nogil>func)(<double complex>(<float complex*>ip0)[0], &ov0, &ov1)  # the kernel's int return value is discarded; the input is widened from float complex
+        (<float complex *>op0)[0] = <float complex>ov0  # both results are narrowed back to float complex on store
+        (<float complex *>op1)[0] = <float complex>ov1
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        op0 += steps[1]
+        op1 += steps[2]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_i_d_dd_As_d_dd(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:  # Inner loop: one double input, two double outputs returned through pointer arguments of an int-returning kernel.
+    cdef np.npy_intp i, n = dims[0]  # n = number of elements handled by this call
+    cdef void *func = (<void**>data)[0]  # data[0]: type-erased kernel pointer, cast to its real signature at the call site
+    cdef char *func_name = <char*>(<void**>data)[1]  # data[1]: name handed to sf_error.check_fpe below
+    cdef char *ip0 = args[0]
+    cdef char *op0 = args[1]
+    cdef char *op1 = args[2]
+    cdef double ov0
+    cdef double ov1
+    for i in range(n):
+        (<int(*)(double, double *, double *) noexcept nogil>func)(<double>(<double*>ip0)[0], &ov0, &ov1)  # the kernel's int return value is discarded; both results come back through &ov0 and &ov1
+        (<double *>op0)[0] = <double>ov0
+        (<double *>op1)[0] = <double>ov1
+        ip0 += steps[0]  # steps[k] is added to a char* cursor, so it is a stride in bytes
+        op0 += steps[1]
+        op1 += steps[2]
+    sf_error.check_fpe(func_name)  # called once after the loop, not per element; presumably reports floating-point exceptions (helper defined outside this chunk)
+
+cdef void loop_i_d_dd_As_f_ff(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:
+    # Inner ufunc loop: one ``float`` input, two ``float`` outputs, evaluated
+    # through a ``double`` kernel.
+    #
+    # ``data`` is read as a two-slot ``void*`` array: slot 0 is the kernel
+    # (cast below to ``int (*)(double, double *, double *)``), slot 1 is the
+    # C-string name handed to ``sf_error.check_fpe`` after the loop.
+    # ``args`` holds the input pointer followed by the two output pointers,
+    # ``steps`` the matching byte strides, and ``dims[0]`` the element count.
+    cdef np.npy_intp idx
+    cdef np.npy_intp count = dims[0]
+    cdef void **payload = <void**>data
+    cdef void *kernel = payload[0]
+    cdef char *ufunc_name = <char*>payload[1]
+    cdef char *src = args[0]
+    cdef char *dst0 = args[1]
+    cdef char *dst1 = args[2]
+    cdef double res0
+    cdef double res1
+    for idx in range(count):
+        # Widen the single-precision input before the call; the kernel's int
+        # return value is discarded.
+        (<int(*)(double, double *, double *) noexcept nogil>kernel)(<double>(<float*>src)[0], &res0, &res1)
+        # Narrow the double-precision results back to the array element type.
+        (<float *>dst0)[0] = <float>res0
+        (<float *>dst1)[0] = <float>res1
+        src += steps[0]
+        dst0 += steps[1]
+        dst1 += steps[2]
+    sf_error.check_fpe(ufunc_name)
+
+cdef void loop_i_i__As_l_l(char **args, np.npy_intp *dims, np.npy_intp *steps, void *data) noexcept nogil:
+    # Inner ufunc loop: one C ``long`` input, one C ``long`` output, evaluated
+    # through an ``int (*)(int)`` kernel.
+    #
+    # ``data`` is read as a two-slot ``void*`` array: slot 0 is the kernel,
+    # slot 1 is the C-string name used for ``sf_error`` reporting.
+    # ``args`` holds the input pointer then the output pointer, ``steps`` the
+    # matching byte strides, and ``dims[0]`` the element count.
+    cdef np.npy_intp idx
+    cdef np.npy_intp count = dims[0]
+    cdef void **payload = <void**>data
+    cdef void *kernel = payload[0]
+    cdef char *ufunc_name = <char*>payload[1]
+    cdef char *src = args[0]
+    cdef char *dst = args[1]
+    cdef long raw
+    cdef int res
+    for idx in range(count):
+        raw = (<long*>src)[0]
+        if <int>raw != raw:
+            # The value does not round-trip through ``int``: report a DOMAIN
+            # error and store the sentinel instead of calling the kernel.
+            sf_error.error(ufunc_name, sf_error.DOMAIN, "invalid input argument")
+            res = <int>0xbad0bad0
+        else:
+            res = (<int(*)(int) noexcept nogil>kernel)(<int>raw)
+        (<long *>dst)[0] = <long>res
+        src += steps[0]
+        dst += steps[1]
+    sf_error.check_fpe(ufunc_name)
+
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cosine_cdf "cosine_cdf"(double) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cosine_invcdf "cosine_invcdf"(double) noexcept nogil
+from ._ellip_harm cimport ellip_harmonic as _func_ellip_harmonic
+ctypedef double _proto_ellip_harmonic_t(double, double, int, int, double, double, double) noexcept nogil
+cdef _proto_ellip_harmonic_t *_proto_ellip_harmonic_t_var = &_func_ellip_harmonic
+from ._legacy cimport ellip_harmonic_unsafe as _func_ellip_harmonic_unsafe
+ctypedef double _proto_ellip_harmonic_unsafe_t(double, double, double, double, double, double, double) noexcept nogil
+cdef _proto_ellip_harmonic_unsafe_t *_proto_ellip_harmonic_unsafe_t_var = &_func_ellip_harmonic_unsafe
+from ._factorial cimport _factorial as _func__factorial
+ctypedef double _proto__factorial_t(double) noexcept nogil
+cdef _proto__factorial_t *_proto__factorial_t_var = &_func__factorial
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes_igam_fac "cephes_igam_fac"(double, double) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_xsf_kolmogc "xsf_kolmogc"(double) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_xsf_kolmogci "xsf_kolmogci"(double) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_xsf_kolmogp "xsf_kolmogp"(double) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes_lanczos_sum_expg_scaled "cephes_lanczos_sum_expg_scaled"(double) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes_lgam1p "cephes_lgam1p"(double) noexcept nogil
+from .sf_error cimport _sf_error_test_function as _func__sf_error_test_function
+ctypedef int _proto__sf_error_test_function_t(int) noexcept nogil
+cdef _proto__sf_error_test_function_t *_proto__sf_error_test_function_t_var = &_func__sf_error_test_function
+from ._legacy cimport smirnovc_unsafe as _func_smirnovc_unsafe
+ctypedef double _proto_smirnovc_unsafe_t(double, double) noexcept nogil
+cdef _proto_smirnovc_unsafe_t *_proto_smirnovc_unsafe_t_var = &_func_smirnovc_unsafe
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes_smirnovc_wrap "cephes_smirnovc_wrap"(Py_ssize_t, double) noexcept nogil
+from ._legacy cimport smirnovci_unsafe as _func_smirnovci_unsafe
+ctypedef double _proto_smirnovci_unsafe_t(double, double) noexcept nogil
+cdef _proto_smirnovci_unsafe_t *_proto_smirnovci_unsafe_t_var = &_func_smirnovci_unsafe
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes_smirnovci_wrap "cephes_smirnovci_wrap"(Py_ssize_t, double) noexcept nogil
+from ._legacy cimport smirnovp_unsafe as _func_smirnovp_unsafe
+ctypedef double _proto_smirnovp_unsafe_t(double, double) noexcept nogil
+cdef _proto_smirnovp_unsafe_t *_proto_smirnovp_unsafe_t_var = &_func_smirnovp_unsafe
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes_smirnovp_wrap "cephes_smirnovp_wrap"(Py_ssize_t, double) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes__struve_asymp_large_z "cephes__struve_asymp_large_z"(double, double, Py_ssize_t, double *) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes__struve_bessel_series "cephes__struve_bessel_series"(double, double, Py_ssize_t, double *) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes__struve_power_series "cephes__struve_power_series"(double, double, Py_ssize_t, double *) noexcept nogil
+from ._agm cimport agm as _func_agm
+ctypedef double _proto_agm_t(double, double) noexcept nogil
+cdef _proto_agm_t *_proto_agm_t_var = &_func_agm
+from ._legacy cimport bdtr_unsafe as _func_bdtr_unsafe
+ctypedef double _proto_bdtr_unsafe_t(double, double, double) noexcept nogil
+cdef _proto_bdtr_unsafe_t *_proto_bdtr_unsafe_t_var = &_func_bdtr_unsafe
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes_bdtr_wrap "cephes_bdtr_wrap"(double, Py_ssize_t, double) noexcept nogil
+from ._legacy cimport bdtrc_unsafe as _func_bdtrc_unsafe
+ctypedef double _proto_bdtrc_unsafe_t(double, double, double) noexcept nogil
+cdef _proto_bdtrc_unsafe_t *_proto_bdtrc_unsafe_t_var = &_func_bdtrc_unsafe
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes_bdtrc_wrap "cephes_bdtrc_wrap"(double, Py_ssize_t, double) noexcept nogil
+from ._legacy cimport bdtri_unsafe as _func_bdtri_unsafe
+ctypedef double _proto_bdtri_unsafe_t(double, double, double) noexcept nogil
+cdef _proto_bdtri_unsafe_t *_proto_bdtri_unsafe_t_var = &_func_bdtri_unsafe
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes_bdtri_wrap "cephes_bdtri_wrap"(double, Py_ssize_t, double) noexcept nogil
+from ._cdflib_wrappers cimport bdtrik as _func_bdtrik
+ctypedef double _proto_bdtrik_t(double, double, double) noexcept nogil
+cdef _proto_bdtrik_t *_proto_bdtrik_t_var = &_func_bdtrik
+from ._cdflib_wrappers cimport bdtrin as _func_bdtrin
+ctypedef double _proto_bdtrin_t(double, double, double) noexcept nogil
+cdef _proto_bdtrin_t *_proto_bdtrin_t_var = &_func_bdtrin
+from ._boxcox cimport boxcox as _func_boxcox
+ctypedef double _proto_boxcox_t(double, double) noexcept nogil
+cdef _proto_boxcox_t *_proto_boxcox_t_var = &_func_boxcox
+from ._boxcox cimport boxcox1p as _func_boxcox1p
+ctypedef double _proto_boxcox1p_t(double, double) noexcept nogil
+cdef _proto_boxcox1p_t *_proto_boxcox1p_t_var = &_func_boxcox1p
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_xsf_chdtr "xsf_chdtr"(double, double) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_xsf_chdtrc "xsf_chdtrc"(double, double) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_xsf_chdtri "xsf_chdtri"(double, double) noexcept nogil
+from ._convex_analysis cimport entr as _func_entr
+ctypedef double _proto_entr_t(double) noexcept nogil
+cdef _proto_entr_t *_proto_entr_t_var = &_func_entr
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes_erfcinv "cephes_erfcinv"(double) noexcept nogil
+from .orthogonal_eval cimport eval_chebyc as _func_eval_chebyc
+ctypedef double complex _proto_eval_chebyc_double_complex__t(double, double complex) noexcept nogil
+cdef _proto_eval_chebyc_double_complex__t *_proto_eval_chebyc_double_complex__t_var = &_func_eval_chebyc[double_complex]
+from .orthogonal_eval cimport eval_chebyc as _func_eval_chebyc
+ctypedef double _proto_eval_chebyc_double__t(double, double) noexcept nogil
+cdef _proto_eval_chebyc_double__t *_proto_eval_chebyc_double__t_var = &_func_eval_chebyc[double]
+from .orthogonal_eval cimport eval_chebyc_l as _func_eval_chebyc_l
+ctypedef double _proto_eval_chebyc_l_t(Py_ssize_t, double) noexcept nogil
+cdef _proto_eval_chebyc_l_t *_proto_eval_chebyc_l_t_var = &_func_eval_chebyc_l
+from .orthogonal_eval cimport eval_chebys as _func_eval_chebys
+ctypedef double complex _proto_eval_chebys_double_complex__t(double, double complex) noexcept nogil
+cdef _proto_eval_chebys_double_complex__t *_proto_eval_chebys_double_complex__t_var = &_func_eval_chebys[double_complex]
+from .orthogonal_eval cimport eval_chebys as _func_eval_chebys
+ctypedef double _proto_eval_chebys_double__t(double, double) noexcept nogil
+cdef _proto_eval_chebys_double__t *_proto_eval_chebys_double__t_var = &_func_eval_chebys[double]
+from .orthogonal_eval cimport eval_chebys_l as _func_eval_chebys_l
+ctypedef double _proto_eval_chebys_l_t(Py_ssize_t, double) noexcept nogil
+cdef _proto_eval_chebys_l_t *_proto_eval_chebys_l_t_var = &_func_eval_chebys_l
+from .orthogonal_eval cimport eval_chebyt as _func_eval_chebyt
+ctypedef double complex _proto_eval_chebyt_double_complex__t(double, double complex) noexcept nogil
+cdef _proto_eval_chebyt_double_complex__t *_proto_eval_chebyt_double_complex__t_var = &_func_eval_chebyt[double_complex]
+from .orthogonal_eval cimport eval_chebyt as _func_eval_chebyt
+ctypedef double _proto_eval_chebyt_double__t(double, double) noexcept nogil
+cdef _proto_eval_chebyt_double__t *_proto_eval_chebyt_double__t_var = &_func_eval_chebyt[double]
+from .orthogonal_eval cimport eval_chebyt_l as _func_eval_chebyt_l
+ctypedef double _proto_eval_chebyt_l_t(Py_ssize_t, double) noexcept nogil
+cdef _proto_eval_chebyt_l_t *_proto_eval_chebyt_l_t_var = &_func_eval_chebyt_l
+from .orthogonal_eval cimport eval_chebyu as _func_eval_chebyu
+ctypedef double complex _proto_eval_chebyu_double_complex__t(double, double complex) noexcept nogil
+cdef _proto_eval_chebyu_double_complex__t *_proto_eval_chebyu_double_complex__t_var = &_func_eval_chebyu[double_complex]
+from .orthogonal_eval cimport eval_chebyu as _func_eval_chebyu
+ctypedef double _proto_eval_chebyu_double__t(double, double) noexcept nogil
+cdef _proto_eval_chebyu_double__t *_proto_eval_chebyu_double__t_var = &_func_eval_chebyu[double]
+from .orthogonal_eval cimport eval_chebyu_l as _func_eval_chebyu_l
+ctypedef double _proto_eval_chebyu_l_t(Py_ssize_t, double) noexcept nogil
+cdef _proto_eval_chebyu_l_t *_proto_eval_chebyu_l_t_var = &_func_eval_chebyu_l
+from .orthogonal_eval cimport eval_gegenbauer as _func_eval_gegenbauer
+ctypedef double complex _proto_eval_gegenbauer_double_complex__t(double, double, double complex) noexcept nogil
+cdef _proto_eval_gegenbauer_double_complex__t *_proto_eval_gegenbauer_double_complex__t_var = &_func_eval_gegenbauer[double_complex]
+from .orthogonal_eval cimport eval_gegenbauer as _func_eval_gegenbauer
+ctypedef double _proto_eval_gegenbauer_double__t(double, double, double) noexcept nogil
+cdef _proto_eval_gegenbauer_double__t *_proto_eval_gegenbauer_double__t_var = &_func_eval_gegenbauer[double]
+from .orthogonal_eval cimport eval_gegenbauer_l as _func_eval_gegenbauer_l
+ctypedef double _proto_eval_gegenbauer_l_t(Py_ssize_t, double, double) noexcept nogil
+cdef _proto_eval_gegenbauer_l_t *_proto_eval_gegenbauer_l_t_var = &_func_eval_gegenbauer_l
+from .orthogonal_eval cimport eval_genlaguerre as _func_eval_genlaguerre
+ctypedef double complex _proto_eval_genlaguerre_double_complex__t(double, double, double complex) noexcept nogil
+cdef _proto_eval_genlaguerre_double_complex__t *_proto_eval_genlaguerre_double_complex__t_var = &_func_eval_genlaguerre[double_complex]
+from .orthogonal_eval cimport eval_genlaguerre as _func_eval_genlaguerre
+ctypedef double _proto_eval_genlaguerre_double__t(double, double, double) noexcept nogil
+cdef _proto_eval_genlaguerre_double__t *_proto_eval_genlaguerre_double__t_var = &_func_eval_genlaguerre[double]
+from .orthogonal_eval cimport eval_genlaguerre_l as _func_eval_genlaguerre_l
+ctypedef double _proto_eval_genlaguerre_l_t(Py_ssize_t, double, double) noexcept nogil
+cdef _proto_eval_genlaguerre_l_t *_proto_eval_genlaguerre_l_t_var = &_func_eval_genlaguerre_l
+from .orthogonal_eval cimport eval_hermite as _func_eval_hermite
+ctypedef double _proto_eval_hermite_t(Py_ssize_t, double) noexcept nogil
+cdef _proto_eval_hermite_t *_proto_eval_hermite_t_var = &_func_eval_hermite
+from .orthogonal_eval cimport eval_hermitenorm as _func_eval_hermitenorm
+ctypedef double _proto_eval_hermitenorm_t(Py_ssize_t, double) noexcept nogil
+cdef _proto_eval_hermitenorm_t *_proto_eval_hermitenorm_t_var = &_func_eval_hermitenorm
+from .orthogonal_eval cimport eval_jacobi as _func_eval_jacobi
+ctypedef double complex _proto_eval_jacobi_double_complex__t(double, double, double, double complex) noexcept nogil
+cdef _proto_eval_jacobi_double_complex__t *_proto_eval_jacobi_double_complex__t_var = &_func_eval_jacobi[double_complex]
+from .orthogonal_eval cimport eval_jacobi as _func_eval_jacobi
+ctypedef double _proto_eval_jacobi_double__t(double, double, double, double) noexcept nogil
+cdef _proto_eval_jacobi_double__t *_proto_eval_jacobi_double__t_var = &_func_eval_jacobi[double]
+from .orthogonal_eval cimport eval_jacobi_l as _func_eval_jacobi_l
+ctypedef double _proto_eval_jacobi_l_t(Py_ssize_t, double, double, double) noexcept nogil
+cdef _proto_eval_jacobi_l_t *_proto_eval_jacobi_l_t_var = &_func_eval_jacobi_l
+from .orthogonal_eval cimport eval_laguerre as _func_eval_laguerre
+ctypedef double complex _proto_eval_laguerre_double_complex__t(double, double complex) noexcept nogil
+cdef _proto_eval_laguerre_double_complex__t *_proto_eval_laguerre_double_complex__t_var = &_func_eval_laguerre[double_complex]
+from .orthogonal_eval cimport eval_laguerre as _func_eval_laguerre
+ctypedef double _proto_eval_laguerre_double__t(double, double) noexcept nogil
+cdef _proto_eval_laguerre_double__t *_proto_eval_laguerre_double__t_var = &_func_eval_laguerre[double]
+from .orthogonal_eval cimport eval_laguerre_l as _func_eval_laguerre_l
+ctypedef double _proto_eval_laguerre_l_t(Py_ssize_t, double) noexcept nogil
+cdef _proto_eval_laguerre_l_t *_proto_eval_laguerre_l_t_var = &_func_eval_laguerre_l
+from .orthogonal_eval cimport eval_legendre as _func_eval_legendre
+ctypedef double complex _proto_eval_legendre_double_complex__t(double, double complex) noexcept nogil
+cdef _proto_eval_legendre_double_complex__t *_proto_eval_legendre_double_complex__t_var = &_func_eval_legendre[double_complex]
+from .orthogonal_eval cimport eval_legendre as _func_eval_legendre
+ctypedef double _proto_eval_legendre_double__t(double, double) noexcept nogil
+cdef _proto_eval_legendre_double__t *_proto_eval_legendre_double__t_var = &_func_eval_legendre[double]
+from .orthogonal_eval cimport eval_legendre_l as _func_eval_legendre_l
+ctypedef double _proto_eval_legendre_l_t(Py_ssize_t, double) noexcept nogil
+cdef _proto_eval_legendre_l_t *_proto_eval_legendre_l_t_var = &_func_eval_legendre_l
+from .orthogonal_eval cimport eval_sh_chebyt as _func_eval_sh_chebyt
+ctypedef double complex _proto_eval_sh_chebyt_double_complex__t(double, double complex) noexcept nogil
+cdef _proto_eval_sh_chebyt_double_complex__t *_proto_eval_sh_chebyt_double_complex__t_var = &_func_eval_sh_chebyt[double_complex]
+from .orthogonal_eval cimport eval_sh_chebyt as _func_eval_sh_chebyt
+ctypedef double _proto_eval_sh_chebyt_double__t(double, double) noexcept nogil
+cdef _proto_eval_sh_chebyt_double__t *_proto_eval_sh_chebyt_double__t_var = &_func_eval_sh_chebyt[double]
+from .orthogonal_eval cimport eval_sh_chebyt_l as _func_eval_sh_chebyt_l
+ctypedef double _proto_eval_sh_chebyt_l_t(Py_ssize_t, double) noexcept nogil
+cdef _proto_eval_sh_chebyt_l_t *_proto_eval_sh_chebyt_l_t_var = &_func_eval_sh_chebyt_l
+from .orthogonal_eval cimport eval_sh_chebyu as _func_eval_sh_chebyu
+ctypedef double complex _proto_eval_sh_chebyu_double_complex__t(double, double complex) noexcept nogil
+cdef _proto_eval_sh_chebyu_double_complex__t *_proto_eval_sh_chebyu_double_complex__t_var = &_func_eval_sh_chebyu[double_complex]
+from .orthogonal_eval cimport eval_sh_chebyu as _func_eval_sh_chebyu
+ctypedef double _proto_eval_sh_chebyu_double__t(double, double) noexcept nogil
+cdef _proto_eval_sh_chebyu_double__t *_proto_eval_sh_chebyu_double__t_var = &_func_eval_sh_chebyu[double]
+from .orthogonal_eval cimport eval_sh_chebyu_l as _func_eval_sh_chebyu_l
+ctypedef double _proto_eval_sh_chebyu_l_t(Py_ssize_t, double) noexcept nogil
+cdef _proto_eval_sh_chebyu_l_t *_proto_eval_sh_chebyu_l_t_var = &_func_eval_sh_chebyu_l
+from .orthogonal_eval cimport eval_sh_jacobi as _func_eval_sh_jacobi
+ctypedef double complex _proto_eval_sh_jacobi_double_complex__t(double, double, double, double complex) noexcept nogil
+cdef _proto_eval_sh_jacobi_double_complex__t *_proto_eval_sh_jacobi_double_complex__t_var = &_func_eval_sh_jacobi[double_complex]
+from .orthogonal_eval cimport eval_sh_jacobi as _func_eval_sh_jacobi
+ctypedef double _proto_eval_sh_jacobi_double__t(double, double, double, double) noexcept nogil
+cdef _proto_eval_sh_jacobi_double__t *_proto_eval_sh_jacobi_double__t_var = &_func_eval_sh_jacobi[double]
+from .orthogonal_eval cimport eval_sh_jacobi_l as _func_eval_sh_jacobi_l
+ctypedef double _proto_eval_sh_jacobi_l_t(Py_ssize_t, double, double, double) noexcept nogil
+cdef _proto_eval_sh_jacobi_l_t *_proto_eval_sh_jacobi_l_t_var = &_func_eval_sh_jacobi_l
+from .orthogonal_eval cimport eval_sh_legendre as _func_eval_sh_legendre
+ctypedef double complex _proto_eval_sh_legendre_double_complex__t(double, double complex) noexcept nogil
+cdef _proto_eval_sh_legendre_double_complex__t *_proto_eval_sh_legendre_double_complex__t_var = &_func_eval_sh_legendre[double_complex]
+from .orthogonal_eval cimport eval_sh_legendre as _func_eval_sh_legendre
+ctypedef double _proto_eval_sh_legendre_double__t(double, double) noexcept nogil
+cdef _proto_eval_sh_legendre_double__t *_proto_eval_sh_legendre_double__t_var = &_func_eval_sh_legendre[double]
+from .orthogonal_eval cimport eval_sh_legendre_l as _func_eval_sh_legendre_l
+ctypedef double _proto_eval_sh_legendre_l_t(Py_ssize_t, double) noexcept nogil
+cdef _proto_eval_sh_legendre_l_t *_proto_eval_sh_legendre_l_t_var = &_func_eval_sh_legendre_l
+from ._legacy cimport expn_unsafe as _func_expn_unsafe
+ctypedef double _proto_expn_unsafe_t(double, double) noexcept nogil
+cdef _proto_expn_unsafe_t *_proto_expn_unsafe_t_var = &_func_expn_unsafe
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes_expn_wrap "cephes_expn_wrap"(Py_ssize_t, double) noexcept nogil
+from ._cdflib_wrappers cimport fdtridfd as _func_fdtridfd
+ctypedef double _proto_fdtridfd_t(double, double, double) noexcept nogil
+cdef _proto_fdtridfd_t *_proto_fdtridfd_t_var = &_func_fdtridfd
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_xsf_gdtr "xsf_gdtr"(double, double, double) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_xsf_gdtrc "xsf_gdtrc"(double, double, double) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_special_gdtria "special_gdtria"(double, double, double) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_xsf_gdtrib "xsf_gdtrib"(double, double, double) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_special_gdtrix "special_gdtrix"(double, double, double) noexcept nogil
+from ._convex_analysis cimport huber as _func_huber
+ctypedef double _proto_huber_t(double, double) noexcept nogil
+cdef _proto_huber_t *_proto_huber_t_var = &_func_huber
+from ._hyp0f1 cimport _hyp0f1_cmplx as _func__hyp0f1_cmplx
+ctypedef double complex _proto__hyp0f1_cmplx_t(double, double complex) noexcept nogil
+cdef _proto__hyp0f1_cmplx_t *_proto__hyp0f1_cmplx_t_var = &_func__hyp0f1_cmplx
+from ._hyp0f1 cimport _hyp0f1_real as _func__hyp0f1_real
+ctypedef double _proto__hyp0f1_real_t(double, double) noexcept nogil
+cdef _proto__hyp0f1_real_t *_proto__hyp0f1_real_t_var = &_func__hyp0f1_real
+cdef extern from r"_ufuncs_defs.h":
+    cdef double complex _func_chyp1f1_wrap "chyp1f1_wrap"(double, double, double complex) noexcept nogil
+from ._hypergeometric cimport hyperu as _func_hyperu
+ctypedef double _proto_hyperu_t(double, double, double) noexcept nogil
+cdef _proto_hyperu_t *_proto_hyperu_t_var = &_func_hyperu
+from ._boxcox cimport inv_boxcox as _func_inv_boxcox
+ctypedef double _proto_inv_boxcox_t(double, double) noexcept nogil
+cdef _proto_inv_boxcox_t *_proto_inv_boxcox_t_var = &_func_inv_boxcox
+from ._boxcox cimport inv_boxcox1p as _func_inv_boxcox1p
+ctypedef double _proto_inv_boxcox1p_t(double, double) noexcept nogil
+cdef _proto_inv_boxcox1p_t *_proto_inv_boxcox1p_t_var = &_func_inv_boxcox1p
+from ._convex_analysis cimport kl_div as _func_kl_div
+ctypedef double _proto_kl_div_t(double, double) noexcept nogil
+cdef _proto_kl_div_t *_proto_kl_div_t_var = &_func_kl_div
+from ._legacy cimport kn_unsafe as _func_kn_unsafe
+ctypedef double _proto_kn_unsafe_t(double, double) noexcept nogil
+cdef _proto_kn_unsafe_t *_proto_kn_unsafe_t_var = &_func_kn_unsafe
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_special_cyl_bessel_k_int "special_cyl_bessel_k_int"(Py_ssize_t, double) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_xsf_kolmogi "xsf_kolmogi"(double) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_xsf_kolmogorov "xsf_kolmogorov"(double) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_pmv_wrap "pmv_wrap"(double, double, double) noexcept nogil
+from ._legacy cimport nbdtr_unsafe as _func_nbdtr_unsafe
+ctypedef double _proto_nbdtr_unsafe_t(double, double, double) noexcept nogil
+cdef _proto_nbdtr_unsafe_t *_proto_nbdtr_unsafe_t_var = &_func_nbdtr_unsafe
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes_nbdtr_wrap "cephes_nbdtr_wrap"(Py_ssize_t, Py_ssize_t, double) noexcept nogil
+from ._legacy cimport nbdtrc_unsafe as _func_nbdtrc_unsafe
+ctypedef double _proto_nbdtrc_unsafe_t(double, double, double) noexcept nogil
+cdef _proto_nbdtrc_unsafe_t *_proto_nbdtrc_unsafe_t_var = &_func_nbdtrc_unsafe
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes_nbdtrc_wrap "cephes_nbdtrc_wrap"(Py_ssize_t, Py_ssize_t, double) noexcept nogil
+from ._legacy cimport nbdtri_unsafe as _func_nbdtri_unsafe
+ctypedef double _proto_nbdtri_unsafe_t(double, double, double) noexcept nogil
+cdef _proto_nbdtri_unsafe_t *_proto_nbdtri_unsafe_t_var = &_func_nbdtri_unsafe
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes_nbdtri_wrap "cephes_nbdtri_wrap"(Py_ssize_t, Py_ssize_t, double) noexcept nogil
+from ._cdflib_wrappers cimport nbdtrik as _func_nbdtrik
+ctypedef double _proto_nbdtrik_t(double, double, double) noexcept nogil
+cdef _proto_nbdtrik_t *_proto_nbdtrik_t_var = &_func_nbdtrik
+from ._cdflib_wrappers cimport nbdtrin as _func_nbdtrin
+ctypedef double _proto_nbdtrin_t(double, double, double) noexcept nogil
+cdef _proto_nbdtrin_t *_proto_nbdtrin_t_var = &_func_nbdtrin
+from ._cdflib_wrappers cimport ncfdtridfd as _func_ncfdtridfd
+ctypedef double _proto_ncfdtridfd_t(double, double, double, double) noexcept nogil
+cdef _proto_ncfdtridfd_t *_proto_ncfdtridfd_t_var = &_func_ncfdtridfd
+from ._cdflib_wrappers cimport ncfdtridfn as _func_ncfdtridfn
+ctypedef double _proto_ncfdtridfn_t(double, double, double, double) noexcept nogil
+cdef _proto_ncfdtridfn_t *_proto_ncfdtridfn_t_var = &_func_ncfdtridfn
+from ._cdflib_wrappers cimport ncfdtrinc as _func_ncfdtrinc
+ctypedef double _proto_ncfdtrinc_t(double, double, double, double) noexcept nogil
+cdef _proto_ncfdtrinc_t *_proto_ncfdtrinc_t_var = &_func_ncfdtrinc
+from ._cdflib_wrappers cimport nctdtridf as _func_nctdtridf
+ctypedef double _proto_nctdtridf_t(double, double, double) noexcept nogil
+cdef _proto_nctdtridf_t *_proto_nctdtridf_t_var = &_func_nctdtridf
+from ._cdflib_wrappers cimport nctdtrinc as _func_nctdtrinc
+ctypedef double _proto_nctdtrinc_t(double, double, double) noexcept nogil
+cdef _proto_nctdtrinc_t *_proto_nctdtrinc_t_var = &_func_nctdtrinc
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_xsf_ndtri "xsf_ndtri"(double) noexcept nogil
+from ._ndtri_exp cimport ndtri_exp as _func_ndtri_exp
+ctypedef double _proto_ndtri_exp_t(double) noexcept nogil
+cdef _proto_ndtri_exp_t *_proto_ndtri_exp_t_var = &_func_ndtri_exp
+from ._cdflib_wrappers cimport nrdtrimn as _func_nrdtrimn
+ctypedef double _proto_nrdtrimn_t(double, double, double) noexcept nogil
+cdef _proto_nrdtrimn_t *_proto_nrdtrimn_t_var = &_func_nrdtrimn
+from ._cdflib_wrappers cimport nrdtrisd as _func_nrdtrisd
+ctypedef double _proto_nrdtrisd_t(double, double, double) noexcept nogil
+cdef _proto_nrdtrisd_t *_proto_nrdtrisd_t_var = &_func_nrdtrisd
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_xsf_owens_t "xsf_owens_t"(double, double) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_xsf_pdtr "xsf_pdtr"(double, double) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_xsf_pdtrc "xsf_pdtrc"(double, double) noexcept nogil
+from ._legacy cimport pdtri_unsafe as _func_pdtri_unsafe
+ctypedef double _proto_pdtri_unsafe_t(double, double) noexcept nogil
+cdef _proto_pdtri_unsafe_t *_proto_pdtri_unsafe_t_var = &_func_pdtri_unsafe
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes_pdtri_wrap "cephes_pdtri_wrap"(Py_ssize_t, double) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes_poch "cephes_poch"(double, double) noexcept nogil
+from ._convex_analysis cimport pseudo_huber as _func_pseudo_huber
+ctypedef double _proto_pseudo_huber_t(double, double) noexcept nogil
+cdef _proto_pseudo_huber_t *_proto_pseudo_huber_t_var = &_func_pseudo_huber
+from ._convex_analysis cimport rel_entr as _func_rel_entr
+ctypedef double _proto_rel_entr_t(double, double) noexcept nogil
+cdef _proto_rel_entr_t *_proto_rel_entr_t_var = &_func_rel_entr
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes_round "cephes_round"(double) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef int _func_xsf_cshichi "xsf_cshichi"(double complex, double complex *, double complex *) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef int _func_xsf_shichi "xsf_shichi"(double, double *, double *) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef int _func_xsf_csici "xsf_csici"(double complex, double complex *, double complex *) noexcept nogil
+cdef extern from r"_ufuncs_defs.h":
+    cdef int _func_xsf_sici "xsf_sici"(double, double *, double *) noexcept nogil
+from ._legacy cimport smirnov_unsafe as _func_smirnov_unsafe
+ctypedef double _proto_smirnov_unsafe_t(double, double) noexcept nogil
+cdef _proto_smirnov_unsafe_t *_proto_smirnov_unsafe_t_var = &_func_smirnov_unsafe
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes_smirnov_wrap "cephes_smirnov_wrap"(Py_ssize_t, double) noexcept nogil
+from ._legacy cimport smirnovi_unsafe as _func_smirnovi_unsafe
+ctypedef double _proto_smirnovi_unsafe_t(double, double) noexcept nogil
+cdef _proto_smirnovi_unsafe_t *_proto_smirnovi_unsafe_t_var = &_func_smirnovi_unsafe
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes_smirnovi_wrap "cephes_smirnovi_wrap"(Py_ssize_t, double) noexcept nogil
+from ._spence cimport cspence as _func_cspence
+ctypedef double complex _proto_cspence_t(double complex) noexcept nogil
+cdef _proto_cspence_t *_proto_cspence_t_var = &_func_cspence
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes_spence "cephes_spence"(double) noexcept nogil
+from ._cdflib_wrappers cimport stdtridf as _func_stdtridf
+ctypedef double _proto_stdtridf_t(double, double) noexcept nogil
+cdef _proto_stdtridf_t *_proto_stdtridf_t_var = &_func_stdtridf
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_xsf_tukeylambdacdf "xsf_tukeylambdacdf"(double, double) noexcept nogil
+from ._legacy cimport yn_unsafe as _func_yn_unsafe
+ctypedef double _proto_yn_unsafe_t(double, double) noexcept nogil
+cdef _proto_yn_unsafe_t *_proto_yn_unsafe_t_var = &_func_yn_unsafe
+cdef extern from r"_ufuncs_defs.h":
+    cdef double _func_cephes_yn_wrap "cephes_yn_wrap"(Py_ssize_t, double) noexcept nogil
+# Registration tables for the ``_beta_pdf`` ufunc: two inner loops, each with
+# a four-entry type signature and a (kernel pointer, name) pair passed to the
+# loop through its ``data`` argument.
+cdef np.PyUFuncGenericFunction ufunc__beta_pdf_loops[2]
+cdef void *ufunc__beta_pdf_ptr[4]
+cdef void *ufunc__beta_pdf_data[2]
+cdef char ufunc__beta_pdf_types[8]
+cdef char *ufunc__beta_pdf_doc = (
+    "_beta_pdf(x, a, b)\n"
+    "\n"
+    "Probability density function of beta distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued such that :math:`0 \\leq x \\leq 1`,\n"
+    "    the upper limit of integration\n"
+    "a, b : array_like\n"
+    "       Positive, real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+# Loop 0: every operand typed NPY_FLOAT.
+ufunc__beta_pdf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f
+ufunc__beta_pdf_types[0] = <char>NPY_FLOAT
+ufunc__beta_pdf_types[1] = <char>NPY_FLOAT
+ufunc__beta_pdf_types[2] = <char>NPY_FLOAT
+ufunc__beta_pdf_types[3] = <char>NPY_FLOAT
+ufunc__beta_pdf_ptr[0] = <void*>scipy.special._ufuncs_cxx._export_beta_pdf_float
+ufunc__beta_pdf_ptr[1] = <void*>(<char*>"_beta_pdf")
+ufunc__beta_pdf_data[0] = &ufunc__beta_pdf_ptr[0]
+# Loop 1: every operand typed NPY_DOUBLE.
+ufunc__beta_pdf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc__beta_pdf_types[4] = <char>NPY_DOUBLE
+ufunc__beta_pdf_types[5] = <char>NPY_DOUBLE
+ufunc__beta_pdf_types[6] = <char>NPY_DOUBLE
+ufunc__beta_pdf_types[7] = <char>NPY_DOUBLE
+ufunc__beta_pdf_ptr[2] = <void*>scipy.special._ufuncs_cxx._export_beta_pdf_double
+ufunc__beta_pdf_ptr[3] = <void*>(<char*>"_beta_pdf")
+ufunc__beta_pdf_data[1] = &ufunc__beta_pdf_ptr[2]
+# Positional integers follow the NumPy C-API order: ntypes=2, nin=3, nout=1,
+# then the identity value; the trailing 0 is the final (unused) argument.
+_beta_pdf = np.PyUFunc_FromFuncAndData(
+    ufunc__beta_pdf_loops, ufunc__beta_pdf_data, ufunc__beta_pdf_types,
+    2, 3, 1, 0,
+    '_beta_pdf', ufunc__beta_pdf_doc, 0)
+
+# Registration tables for the ``_beta_ppf`` ufunc: two inner loops, each with
+# a four-entry type signature and a (kernel pointer, name) pair passed to the
+# loop through its ``data`` argument.
+cdef np.PyUFuncGenericFunction ufunc__beta_ppf_loops[2]
+cdef void *ufunc__beta_ppf_ptr[4]
+cdef void *ufunc__beta_ppf_data[2]
+cdef char ufunc__beta_ppf_types[8]
+cdef char *ufunc__beta_ppf_doc = (
+    "_beta_ppf(x, a, b)\n"
+    "\n"
+    "Percent point function of beta distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued such that :math:`0 \\leq x \\leq 1`,\n"
+    "    the upper limit of integration\n"
+    "a, b : array_like\n"
+    "       Positive, real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+# Loop 0: every operand typed NPY_FLOAT.
+ufunc__beta_ppf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f
+ufunc__beta_ppf_types[0] = <char>NPY_FLOAT
+ufunc__beta_ppf_types[1] = <char>NPY_FLOAT
+ufunc__beta_ppf_types[2] = <char>NPY_FLOAT
+ufunc__beta_ppf_types[3] = <char>NPY_FLOAT
+ufunc__beta_ppf_ptr[0] = <void*>scipy.special._ufuncs_cxx._export_beta_ppf_float
+ufunc__beta_ppf_ptr[1] = <void*>(<char*>"_beta_ppf")
+ufunc__beta_ppf_data[0] = &ufunc__beta_ppf_ptr[0]
+# Loop 1: every operand typed NPY_DOUBLE.
+ufunc__beta_ppf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc__beta_ppf_types[4] = <char>NPY_DOUBLE
+ufunc__beta_ppf_types[5] = <char>NPY_DOUBLE
+ufunc__beta_ppf_types[6] = <char>NPY_DOUBLE
+ufunc__beta_ppf_types[7] = <char>NPY_DOUBLE
+ufunc__beta_ppf_ptr[2] = <void*>scipy.special._ufuncs_cxx._export_beta_ppf_double
+ufunc__beta_ppf_ptr[3] = <void*>(<char*>"_beta_ppf")
+ufunc__beta_ppf_data[1] = &ufunc__beta_ppf_ptr[2]
+# Positional integers follow the NumPy C-API order: ntypes=2, nin=3, nout=1,
+# then the identity value; the trailing 0 is the final (unused) argument.
+_beta_ppf = np.PyUFunc_FromFuncAndData(
+    ufunc__beta_ppf_loops, ufunc__beta_ppf_data, ufunc__beta_ppf_types,
+    2, 3, 1, 0,
+    '_beta_ppf', ufunc__beta_ppf_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc__binom_cdf_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc__binom_cdf_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc__binom_cdf_data[2]  # per-loop data pointers; each points into ufunc__binom_cdf_ptr
+cdef char ufunc__binom_cdf_types[8]  # 2 signatures x 4 type codes each
+cdef char *ufunc__binom_cdf_doc = (  # docstring passed to the ufunc constructor below
+    "_binom_cdf(x, n, p)\n"
+    "\n"
+    "Cumulative distribution function of binomial distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued\n"
+    "n : array_like\n"
+    "    Positive, integer-valued parameter\n"
+    "p : array_like\n"
+    "    Positive, real-valued parameter\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__binom_cdf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # loop for the NPY_FLOAT signature
+ufunc__binom_cdf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop for the NPY_DOUBLE signature
+ufunc__binom_cdf_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..3]
+ufunc__binom_cdf_types[1] = <char>NPY_FLOAT
+ufunc__binom_cdf_types[2] = <char>NPY_FLOAT
+ufunc__binom_cdf_types[3] = <char>NPY_FLOAT
+ufunc__binom_cdf_types[4] = <char>NPY_DOUBLE  # signature 1 occupies types[4..7]
+ufunc__binom_cdf_types[5] = <char>NPY_DOUBLE
+ufunc__binom_cdf_types[6] = <char>NPY_DOUBLE
+ufunc__binom_cdf_types[7] = <char>NPY_DOUBLE
+ufunc__binom_cdf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_binom_cdf_float  # kernel for loop 0
+ufunc__binom_cdf_ptr[2*0+1] = <void*>(<char*>"_binom_cdf")  # name string paired with the kernel
+ufunc__binom_cdf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_binom_cdf_double  # kernel for loop 1
+ufunc__binom_cdf_ptr[2*1+1] = <void*>(<char*>"_binom_cdf")
+ufunc__binom_cdf_data[0] = &ufunc__binom_cdf_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc__binom_cdf_data[1] = &ufunc__binom_cdf_ptr[2*1]
+_binom_cdf = np.PyUFunc_FromFuncAndData(ufunc__binom_cdf_loops, ufunc__binom_cdf_data, ufunc__binom_cdf_types, 2, 3, 1, 0, '_binom_cdf', ufunc__binom_cdf_doc, 0)  # 2, 3, 1 = ntypes, nin, nout in the NumPy C-API argument order
+
+cdef np.PyUFuncGenericFunction ufunc__binom_isf_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc__binom_isf_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc__binom_isf_data[2]  # per-loop data pointers; each points into ufunc__binom_isf_ptr
+cdef char ufunc__binom_isf_types[8]  # 2 signatures x 4 type codes each
+cdef char *ufunc__binom_isf_doc = (  # docstring passed to the ufunc constructor below
+    "_binom_isf(x, n, p)\n"
+    "\n"
+    "Inverse survival function of binomial distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued\n"
+    "n : array_like\n"
+    "    Positive, integer-valued parameter\n"
+    "p : array_like\n"
+    "    Positive, real-valued parameter\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__binom_isf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # loop for the NPY_FLOAT signature
+ufunc__binom_isf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop for the NPY_DOUBLE signature
+ufunc__binom_isf_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..3]
+ufunc__binom_isf_types[1] = <char>NPY_FLOAT
+ufunc__binom_isf_types[2] = <char>NPY_FLOAT
+ufunc__binom_isf_types[3] = <char>NPY_FLOAT
+ufunc__binom_isf_types[4] = <char>NPY_DOUBLE  # signature 1 occupies types[4..7]
+ufunc__binom_isf_types[5] = <char>NPY_DOUBLE
+ufunc__binom_isf_types[6] = <char>NPY_DOUBLE
+ufunc__binom_isf_types[7] = <char>NPY_DOUBLE
+ufunc__binom_isf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_binom_isf_float  # kernel for loop 0
+ufunc__binom_isf_ptr[2*0+1] = <void*>(<char*>"_binom_isf")  # name string paired with the kernel
+ufunc__binom_isf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_binom_isf_double  # kernel for loop 1
+ufunc__binom_isf_ptr[2*1+1] = <void*>(<char*>"_binom_isf")
+ufunc__binom_isf_data[0] = &ufunc__binom_isf_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc__binom_isf_data[1] = &ufunc__binom_isf_ptr[2*1]
+_binom_isf = np.PyUFunc_FromFuncAndData(ufunc__binom_isf_loops, ufunc__binom_isf_data, ufunc__binom_isf_types, 2, 3, 1, 0, '_binom_isf', ufunc__binom_isf_doc, 0)  # 2, 3, 1 = ntypes, nin, nout in the NumPy C-API argument order
+
+cdef np.PyUFuncGenericFunction ufunc__binom_pmf_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc__binom_pmf_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc__binom_pmf_data[2]  # per-loop data pointers; each points into ufunc__binom_pmf_ptr
+cdef char ufunc__binom_pmf_types[8]  # 2 signatures x 4 type codes each
+cdef char *ufunc__binom_pmf_doc = (  # docstring passed to the ufunc constructor below
+    "_binom_pmf(x, n, p)\n"
+    "\n"
+    "Probability mass function of binomial distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued\n"
+    "n : array_like\n"
+    "    Positive, integer-valued parameter\n"
+    "p : array_like\n"
+    "    Positive, real-valued parameter\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__binom_pmf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # loop for the NPY_FLOAT signature
+ufunc__binom_pmf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop for the NPY_DOUBLE signature
+ufunc__binom_pmf_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..3]
+ufunc__binom_pmf_types[1] = <char>NPY_FLOAT
+ufunc__binom_pmf_types[2] = <char>NPY_FLOAT
+ufunc__binom_pmf_types[3] = <char>NPY_FLOAT
+ufunc__binom_pmf_types[4] = <char>NPY_DOUBLE  # signature 1 occupies types[4..7]
+ufunc__binom_pmf_types[5] = <char>NPY_DOUBLE
+ufunc__binom_pmf_types[6] = <char>NPY_DOUBLE
+ufunc__binom_pmf_types[7] = <char>NPY_DOUBLE
+ufunc__binom_pmf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_binom_pmf_float  # kernel for loop 0
+ufunc__binom_pmf_ptr[2*0+1] = <void*>(<char*>"_binom_pmf")  # name string paired with the kernel
+ufunc__binom_pmf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_binom_pmf_double  # kernel for loop 1
+ufunc__binom_pmf_ptr[2*1+1] = <void*>(<char*>"_binom_pmf")
+ufunc__binom_pmf_data[0] = &ufunc__binom_pmf_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc__binom_pmf_data[1] = &ufunc__binom_pmf_ptr[2*1]
+_binom_pmf = np.PyUFunc_FromFuncAndData(ufunc__binom_pmf_loops, ufunc__binom_pmf_data, ufunc__binom_pmf_types, 2, 3, 1, 0, '_binom_pmf', ufunc__binom_pmf_doc, 0)  # 2, 3, 1 = ntypes, nin, nout in the NumPy C-API argument order
+
+cdef np.PyUFuncGenericFunction ufunc__binom_ppf_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc__binom_ppf_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc__binom_ppf_data[2]  # per-loop data pointers; each points into ufunc__binom_ppf_ptr
+cdef char ufunc__binom_ppf_types[8]  # 2 signatures x 4 type codes each
+cdef char *ufunc__binom_ppf_doc = (  # docstring passed to the ufunc constructor below
+    "_binom_ppf(x, n, p)\n"
+    "\n"
+    "Percent point function of binomial distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued\n"
+    "n : array_like\n"
+    "    Positive, integer-valued parameter\n"
+    "p : array_like\n"
+    "    Positive, real-valued parameter\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__binom_ppf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # loop for the NPY_FLOAT signature
+ufunc__binom_ppf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop for the NPY_DOUBLE signature
+ufunc__binom_ppf_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..3]
+ufunc__binom_ppf_types[1] = <char>NPY_FLOAT
+ufunc__binom_ppf_types[2] = <char>NPY_FLOAT
+ufunc__binom_ppf_types[3] = <char>NPY_FLOAT
+ufunc__binom_ppf_types[4] = <char>NPY_DOUBLE  # signature 1 occupies types[4..7]
+ufunc__binom_ppf_types[5] = <char>NPY_DOUBLE
+ufunc__binom_ppf_types[6] = <char>NPY_DOUBLE
+ufunc__binom_ppf_types[7] = <char>NPY_DOUBLE
+ufunc__binom_ppf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_binom_ppf_float  # kernel for loop 0
+ufunc__binom_ppf_ptr[2*0+1] = <void*>(<char*>"_binom_ppf")  # name string paired with the kernel
+ufunc__binom_ppf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_binom_ppf_double  # kernel for loop 1
+ufunc__binom_ppf_ptr[2*1+1] = <void*>(<char*>"_binom_ppf")
+ufunc__binom_ppf_data[0] = &ufunc__binom_ppf_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc__binom_ppf_data[1] = &ufunc__binom_ppf_ptr[2*1]
+_binom_ppf = np.PyUFunc_FromFuncAndData(ufunc__binom_ppf_loops, ufunc__binom_ppf_data, ufunc__binom_ppf_types, 2, 3, 1, 0, '_binom_ppf', ufunc__binom_ppf_doc, 0)  # 2, 3, 1 = ntypes, nin, nout in the NumPy C-API argument order
+
+cdef np.PyUFuncGenericFunction ufunc__binom_sf_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc__binom_sf_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc__binom_sf_data[2]  # per-loop data pointers; each points into ufunc__binom_sf_ptr
+cdef char ufunc__binom_sf_types[8]  # 2 signatures x 4 type codes each
+cdef char *ufunc__binom_sf_doc = (  # docstring passed to the ufunc constructor below
+    "_binom_sf(x, n, p)\n"
+    "\n"
+    "Survival function of binomial distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued\n"
+    "n : array_like\n"
+    "    Positive, integer-valued parameter\n"
+    "p : array_like\n"
+    "    Positive, real-valued parameter\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__binom_sf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # loop for the NPY_FLOAT signature
+ufunc__binom_sf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop for the NPY_DOUBLE signature
+ufunc__binom_sf_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..3]
+ufunc__binom_sf_types[1] = <char>NPY_FLOAT
+ufunc__binom_sf_types[2] = <char>NPY_FLOAT
+ufunc__binom_sf_types[3] = <char>NPY_FLOAT
+ufunc__binom_sf_types[4] = <char>NPY_DOUBLE  # signature 1 occupies types[4..7]
+ufunc__binom_sf_types[5] = <char>NPY_DOUBLE
+ufunc__binom_sf_types[6] = <char>NPY_DOUBLE
+ufunc__binom_sf_types[7] = <char>NPY_DOUBLE
+ufunc__binom_sf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_binom_sf_float  # kernel for loop 0
+ufunc__binom_sf_ptr[2*0+1] = <void*>(<char*>"_binom_sf")  # name string paired with the kernel
+ufunc__binom_sf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_binom_sf_double  # kernel for loop 1
+ufunc__binom_sf_ptr[2*1+1] = <void*>(<char*>"_binom_sf")
+ufunc__binom_sf_data[0] = &ufunc__binom_sf_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc__binom_sf_data[1] = &ufunc__binom_sf_ptr[2*1]
+_binom_sf = np.PyUFunc_FromFuncAndData(ufunc__binom_sf_loops, ufunc__binom_sf_data, ufunc__binom_sf_types, 2, 3, 1, 0, '_binom_sf', ufunc__binom_sf_doc, 0)  # 2, 3, 1 = ntypes, nin, nout in the NumPy C-API argument order
+
+cdef np.PyUFuncGenericFunction ufunc__cauchy_isf_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc__cauchy_isf_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc__cauchy_isf_data[2]  # per-loop data pointers; each points into ufunc__cauchy_isf_ptr
+cdef char ufunc__cauchy_isf_types[8]  # 2 signatures x 4 type codes each
+cdef char *ufunc__cauchy_isf_doc = (  # docstring passed to the ufunc constructor below
+    "_cauchy_isf(p, loc, scale)\n"
+    "\n"
+    "Inverse survival function of the Cauchy distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "p : array_like\n"
+    "    Probabilities\n"
+    "loc : array_like\n"
+    "    Location parameter of the distribution.\n"
+    "scale : array_like\n"
+    "    Scale parameter of the distribution.\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__cauchy_isf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # loop for the NPY_FLOAT signature
+ufunc__cauchy_isf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop for the NPY_DOUBLE signature
+ufunc__cauchy_isf_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..3]
+ufunc__cauchy_isf_types[1] = <char>NPY_FLOAT
+ufunc__cauchy_isf_types[2] = <char>NPY_FLOAT
+ufunc__cauchy_isf_types[3] = <char>NPY_FLOAT
+ufunc__cauchy_isf_types[4] = <char>NPY_DOUBLE  # signature 1 occupies types[4..7]
+ufunc__cauchy_isf_types[5] = <char>NPY_DOUBLE
+ufunc__cauchy_isf_types[6] = <char>NPY_DOUBLE
+ufunc__cauchy_isf_types[7] = <char>NPY_DOUBLE
+ufunc__cauchy_isf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_cauchy_isf_float  # kernel for loop 0
+ufunc__cauchy_isf_ptr[2*0+1] = <void*>(<char*>"_cauchy_isf")  # name string paired with the kernel
+ufunc__cauchy_isf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_cauchy_isf_double  # kernel for loop 1
+ufunc__cauchy_isf_ptr[2*1+1] = <void*>(<char*>"_cauchy_isf")
+ufunc__cauchy_isf_data[0] = &ufunc__cauchy_isf_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc__cauchy_isf_data[1] = &ufunc__cauchy_isf_ptr[2*1]
+_cauchy_isf = np.PyUFunc_FromFuncAndData(ufunc__cauchy_isf_loops, ufunc__cauchy_isf_data, ufunc__cauchy_isf_types, 2, 3, 1, 0, '_cauchy_isf', ufunc__cauchy_isf_doc, 0)  # 2, 3, 1 = ntypes, nin, nout in the NumPy C-API argument order
+
+cdef np.PyUFuncGenericFunction ufunc__cauchy_ppf_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc__cauchy_ppf_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc__cauchy_ppf_data[2]  # per-loop data pointers; each points into ufunc__cauchy_ppf_ptr
+cdef char ufunc__cauchy_ppf_types[8]  # 2 signatures x 4 type codes each
+cdef char *ufunc__cauchy_ppf_doc = (  # docstring passed to the ufunc constructor below
+    "_cauchy_ppf(p, loc, scale)\n"
+    "\n"
+    "Percent point function (i.e. quantile) of the Cauchy distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "p : array_like\n"
+    "    Probabilities\n"
+    "loc : array_like\n"
+    "    Location parameter of the distribution.\n"
+    "scale : array_like\n"
+    "    Scale parameter of the distribution.\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__cauchy_ppf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # loop for the NPY_FLOAT signature
+ufunc__cauchy_ppf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop for the NPY_DOUBLE signature
+ufunc__cauchy_ppf_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..3]
+ufunc__cauchy_ppf_types[1] = <char>NPY_FLOAT
+ufunc__cauchy_ppf_types[2] = <char>NPY_FLOAT
+ufunc__cauchy_ppf_types[3] = <char>NPY_FLOAT
+ufunc__cauchy_ppf_types[4] = <char>NPY_DOUBLE  # signature 1 occupies types[4..7]
+ufunc__cauchy_ppf_types[5] = <char>NPY_DOUBLE
+ufunc__cauchy_ppf_types[6] = <char>NPY_DOUBLE
+ufunc__cauchy_ppf_types[7] = <char>NPY_DOUBLE
+ufunc__cauchy_ppf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_cauchy_ppf_float  # kernel for loop 0
+ufunc__cauchy_ppf_ptr[2*0+1] = <void*>(<char*>"_cauchy_ppf")  # name string paired with the kernel
+ufunc__cauchy_ppf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_cauchy_ppf_double  # kernel for loop 1
+ufunc__cauchy_ppf_ptr[2*1+1] = <void*>(<char*>"_cauchy_ppf")
+ufunc__cauchy_ppf_data[0] = &ufunc__cauchy_ppf_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc__cauchy_ppf_data[1] = &ufunc__cauchy_ppf_ptr[2*1]
+_cauchy_ppf = np.PyUFunc_FromFuncAndData(ufunc__cauchy_ppf_loops, ufunc__cauchy_ppf_data, ufunc__cauchy_ppf_types, 2, 3, 1, 0, '_cauchy_ppf', ufunc__cauchy_ppf_doc, 0)  # 2, 3, 1 = ntypes, nin, nout in the NumPy C-API argument order
+
+cdef np.PyUFuncGenericFunction ufunc__cosine_cdf_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc__cosine_cdf_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc__cosine_cdf_data[2]  # per-loop data pointers; each points into ufunc__cosine_cdf_ptr
+cdef char ufunc__cosine_cdf_types[4]  # 2 signatures x 2 type codes each
+cdef char *ufunc__cosine_cdf_doc = (  # docstring passed to the ufunc constructor below
+    "_cosine_cdf(x)\n"
+    "\n"
+    "Cumulative distribution function (CDF) of the cosine distribution::\n"
+    "\n"
+    "             {             0,              x < -pi\n"
+    "    cdf(x) = { (pi + x + sin(x))/(2*pi),   -pi <= x <= pi\n"
+    "             {             1,              x > pi\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    `x` must contain real numbers.\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    The cosine distribution CDF evaluated at `x`.")
+ufunc__cosine_cdf_loops[0] = <np.PyUFuncGenericFunction>loop_d_d__As_f_f  # loop for the NPY_FLOAT signature
+ufunc__cosine_cdf_loops[1] = <np.PyUFuncGenericFunction>loop_d_d__As_d_d  # loop for the NPY_DOUBLE signature
+ufunc__cosine_cdf_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..1]
+ufunc__cosine_cdf_types[1] = <char>NPY_FLOAT
+ufunc__cosine_cdf_types[2] = <char>NPY_DOUBLE  # signature 1 occupies types[2..3]
+ufunc__cosine_cdf_types[3] = <char>NPY_DOUBLE
+ufunc__cosine_cdf_ptr[2*0] = <void*>_func_cosine_cdf  # both signatures register the same kernel pointer
+ufunc__cosine_cdf_ptr[2*0+1] = <void*>(<char*>"_cosine_cdf")  # name string paired with the kernel
+ufunc__cosine_cdf_ptr[2*1] = <void*>_func_cosine_cdf
+ufunc__cosine_cdf_ptr[2*1+1] = <void*>(<char*>"_cosine_cdf")
+ufunc__cosine_cdf_data[0] = &ufunc__cosine_cdf_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc__cosine_cdf_data[1] = &ufunc__cosine_cdf_ptr[2*1]
+_cosine_cdf = np.PyUFunc_FromFuncAndData(ufunc__cosine_cdf_loops, ufunc__cosine_cdf_data, ufunc__cosine_cdf_types, 2, 1, 1, 0, '_cosine_cdf', ufunc__cosine_cdf_doc, 0)  # 2, 1, 1 = ntypes, nin, nout in the NumPy C-API argument order
+
+cdef np.PyUFuncGenericFunction ufunc__cosine_invcdf_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc__cosine_invcdf_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc__cosine_invcdf_data[2]  # per-loop data pointers; each points into ufunc__cosine_invcdf_ptr
+cdef char ufunc__cosine_invcdf_types[4]  # 2 signatures x 2 type codes each
+cdef char *ufunc__cosine_invcdf_doc = (  # docstring passed to the ufunc constructor below
+    "_cosine_invcdf(p)\n"
+    "\n"
+    "Inverse of the cumulative distribution function (CDF) of the cosine\n"
+    "distribution.\n"
+    "\n"
+    "The CDF of the cosine distribution is::\n"
+    "\n"
+    "    cdf(x) = (pi + x + sin(x))/(2*pi)\n"
+    "\n"
+    "This function computes the inverse of cdf(x).\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "p : array_like\n"
+    "    `p` must contain real numbers in the interval ``0 <= p <= 1``.\n"
+    "    `nan` is returned for values of `p` outside the interval [0, 1].\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    The inverse of the cosine distribution CDF evaluated at `p`.")
+ufunc__cosine_invcdf_loops[0] = <np.PyUFuncGenericFunction>loop_d_d__As_f_f  # loop for the NPY_FLOAT signature
+ufunc__cosine_invcdf_loops[1] = <np.PyUFuncGenericFunction>loop_d_d__As_d_d  # loop for the NPY_DOUBLE signature
+ufunc__cosine_invcdf_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..1]
+ufunc__cosine_invcdf_types[1] = <char>NPY_FLOAT
+ufunc__cosine_invcdf_types[2] = <char>NPY_DOUBLE  # signature 1 occupies types[2..3]
+ufunc__cosine_invcdf_types[3] = <char>NPY_DOUBLE
+ufunc__cosine_invcdf_ptr[2*0] = <void*>_func_cosine_invcdf  # both signatures register the same kernel pointer
+ufunc__cosine_invcdf_ptr[2*0+1] = <void*>(<char*>"_cosine_invcdf")  # name string paired with the kernel
+ufunc__cosine_invcdf_ptr[2*1] = <void*>_func_cosine_invcdf
+ufunc__cosine_invcdf_ptr[2*1+1] = <void*>(<char*>"_cosine_invcdf")
+ufunc__cosine_invcdf_data[0] = &ufunc__cosine_invcdf_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc__cosine_invcdf_data[1] = &ufunc__cosine_invcdf_ptr[2*1]
+_cosine_invcdf = np.PyUFunc_FromFuncAndData(ufunc__cosine_invcdf_loops, ufunc__cosine_invcdf_data, ufunc__cosine_invcdf_types, 2, 1, 1, 0, '_cosine_invcdf', ufunc__cosine_invcdf_doc, 0)  # 2, 1, 1 = ntypes, nin, nout in the NumPy C-API argument order
+
+cdef np.PyUFuncGenericFunction ufunc__ellip_harm_loops[3]  # one inner loop per registered type signature
+cdef void *ufunc__ellip_harm_ptr[6]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc__ellip_harm_data[3]  # per-loop data pointers; each points into ufunc__ellip_harm_ptr
+cdef char ufunc__ellip_harm_types[24]  # 3 signatures x 8 type codes each
+cdef char *ufunc__ellip_harm_doc = (  # docstring passed to the ufunc constructor below
+    "Internal function, use `ellip_harm` instead.")
+ufunc__ellip_harm_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddddddd__As_fffffff_f  # loop for the all-NPY_FLOAT signature
+ufunc__ellip_harm_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddiiddd__As_ddllddd_d  # loop for the signature with two NPY_LONG arguments
+ufunc__ellip_harm_loops[2] = <np.PyUFuncGenericFunction>loop_d_ddddddd__As_ddddddd_d  # loop for the all-NPY_DOUBLE signature
+ufunc__ellip_harm_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..7]
+ufunc__ellip_harm_types[1] = <char>NPY_FLOAT
+ufunc__ellip_harm_types[2] = <char>NPY_FLOAT
+ufunc__ellip_harm_types[3] = <char>NPY_FLOAT
+ufunc__ellip_harm_types[4] = <char>NPY_FLOAT
+ufunc__ellip_harm_types[5] = <char>NPY_FLOAT
+ufunc__ellip_harm_types[6] = <char>NPY_FLOAT
+ufunc__ellip_harm_types[7] = <char>NPY_FLOAT
+ufunc__ellip_harm_types[8] = <char>NPY_DOUBLE  # signature 1 occupies types[8..15]
+ufunc__ellip_harm_types[9] = <char>NPY_DOUBLE
+ufunc__ellip_harm_types[10] = <char>NPY_LONG  # third and fourth arguments are integer-typed in this signature
+ufunc__ellip_harm_types[11] = <char>NPY_LONG
+ufunc__ellip_harm_types[12] = <char>NPY_DOUBLE
+ufunc__ellip_harm_types[13] = <char>NPY_DOUBLE
+ufunc__ellip_harm_types[14] = <char>NPY_DOUBLE
+ufunc__ellip_harm_types[15] = <char>NPY_DOUBLE
+ufunc__ellip_harm_types[16] = <char>NPY_DOUBLE  # signature 2 occupies types[16..23]
+ufunc__ellip_harm_types[17] = <char>NPY_DOUBLE
+ufunc__ellip_harm_types[18] = <char>NPY_DOUBLE
+ufunc__ellip_harm_types[19] = <char>NPY_DOUBLE
+ufunc__ellip_harm_types[20] = <char>NPY_DOUBLE
+ufunc__ellip_harm_types[21] = <char>NPY_DOUBLE
+ufunc__ellip_harm_types[22] = <char>NPY_DOUBLE
+ufunc__ellip_harm_types[23] = <char>NPY_DOUBLE
+ufunc__ellip_harm_ptr[2*0] = <void*>_func_ellip_harmonic_unsafe  # signatures 0 and 2 register the *_unsafe kernel
+ufunc__ellip_harm_ptr[2*0+1] = <void*>(<char*>"_ellip_harm")  # name string paired with the kernel
+ufunc__ellip_harm_ptr[2*1] = <void*>_func_ellip_harmonic  # only the NPY_LONG signature registers this kernel
+ufunc__ellip_harm_ptr[2*1+1] = <void*>(<char*>"_ellip_harm")
+ufunc__ellip_harm_ptr[2*2] = <void*>_func_ellip_harmonic_unsafe
+ufunc__ellip_harm_ptr[2*2+1] = <void*>(<char*>"_ellip_harm")
+ufunc__ellip_harm_data[0] = &ufunc__ellip_harm_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc__ellip_harm_data[1] = &ufunc__ellip_harm_ptr[2*1]
+ufunc__ellip_harm_data[2] = &ufunc__ellip_harm_ptr[2*2]
+_ellip_harm = np.PyUFunc_FromFuncAndData(ufunc__ellip_harm_loops, ufunc__ellip_harm_data, ufunc__ellip_harm_types, 3, 7, 1, 0, '_ellip_harm', ufunc__ellip_harm_doc, 0)  # 3, 7, 1 = ntypes, nin, nout in the NumPy C-API argument order
+
+cdef np.PyUFuncGenericFunction ufunc__factorial_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc__factorial_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc__factorial_data[2]  # per-loop data pointers; each points into ufunc__factorial_ptr
+cdef char ufunc__factorial_types[4]  # 2 signatures x 2 type codes each
+cdef char *ufunc__factorial_doc = (  # docstring passed to the ufunc constructor below
+    "Internal function, do not use.")
+ufunc__factorial_loops[0] = <np.PyUFuncGenericFunction>loop_d_d__As_f_f  # loop for the NPY_FLOAT signature
+ufunc__factorial_loops[1] = <np.PyUFuncGenericFunction>loop_d_d__As_d_d  # loop for the NPY_DOUBLE signature
+ufunc__factorial_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..1]
+ufunc__factorial_types[1] = <char>NPY_FLOAT
+ufunc__factorial_types[2] = <char>NPY_DOUBLE  # signature 1 occupies types[2..3]
+ufunc__factorial_types[3] = <char>NPY_DOUBLE
+ufunc__factorial_ptr[2*0] = <void*>_func__factorial  # both signatures register the same kernel pointer
+ufunc__factorial_ptr[2*0+1] = <void*>(<char*>"_factorial")  # name string paired with the kernel
+ufunc__factorial_ptr[2*1] = <void*>_func__factorial
+ufunc__factorial_ptr[2*1+1] = <void*>(<char*>"_factorial")
+ufunc__factorial_data[0] = &ufunc__factorial_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc__factorial_data[1] = &ufunc__factorial_ptr[2*1]
+_factorial = np.PyUFunc_FromFuncAndData(ufunc__factorial_loops, ufunc__factorial_data, ufunc__factorial_types, 2, 1, 1, 0, '_factorial', ufunc__factorial_doc, 0)  # 2, 1, 1 = ntypes, nin, nout in the NumPy C-API argument order
+
+cdef np.PyUFuncGenericFunction ufunc__hypergeom_cdf_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc__hypergeom_cdf_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc__hypergeom_cdf_data[2]  # per-loop data pointers; each points into ufunc__hypergeom_cdf_ptr
+cdef char ufunc__hypergeom_cdf_types[10]  # 2 signatures x 5 type codes each
+cdef char *ufunc__hypergeom_cdf_doc = (  # docstring passed to the ufunc constructor below
+    "_hypergeom_cdf(x, r, N, M)\n"
+    "\n"
+    "Cumulative distribution function of hypergeometric distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued\n"
+    "r, N, M : array_like\n"
+    "    Positive, integer-valued parameter\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__hypergeom_cdf_loops[0] = <np.PyUFuncGenericFunction>loop_f_ffff__As_ffff_f  # loop for the NPY_FLOAT signature
+ufunc__hypergeom_cdf_loops[1] = <np.PyUFuncGenericFunction>loop_d_dddd__As_dddd_d  # loop for the NPY_DOUBLE signature
+ufunc__hypergeom_cdf_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..4]
+ufunc__hypergeom_cdf_types[1] = <char>NPY_FLOAT
+ufunc__hypergeom_cdf_types[2] = <char>NPY_FLOAT
+ufunc__hypergeom_cdf_types[3] = <char>NPY_FLOAT
+ufunc__hypergeom_cdf_types[4] = <char>NPY_FLOAT
+ufunc__hypergeom_cdf_types[5] = <char>NPY_DOUBLE  # signature 1 occupies types[5..9]
+ufunc__hypergeom_cdf_types[6] = <char>NPY_DOUBLE
+ufunc__hypergeom_cdf_types[7] = <char>NPY_DOUBLE
+ufunc__hypergeom_cdf_types[8] = <char>NPY_DOUBLE
+ufunc__hypergeom_cdf_types[9] = <char>NPY_DOUBLE
+ufunc__hypergeom_cdf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_hypergeom_cdf_float  # kernel for loop 0
+ufunc__hypergeom_cdf_ptr[2*0+1] = <void*>(<char*>"_hypergeom_cdf")  # name string paired with the kernel
+ufunc__hypergeom_cdf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_hypergeom_cdf_double  # kernel for loop 1
+ufunc__hypergeom_cdf_ptr[2*1+1] = <void*>(<char*>"_hypergeom_cdf")
+ufunc__hypergeom_cdf_data[0] = &ufunc__hypergeom_cdf_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc__hypergeom_cdf_data[1] = &ufunc__hypergeom_cdf_ptr[2*1]
+_hypergeom_cdf = np.PyUFunc_FromFuncAndData(ufunc__hypergeom_cdf_loops, ufunc__hypergeom_cdf_data, ufunc__hypergeom_cdf_types, 2, 4, 1, 0, '_hypergeom_cdf', ufunc__hypergeom_cdf_doc, 0)  # 2, 4, 1 = ntypes, nin, nout in the NumPy C-API argument order
+
+cdef np.PyUFuncGenericFunction ufunc__hypergeom_mean_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc__hypergeom_mean_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc__hypergeom_mean_data[2]  # per-loop data pointers; each points into ufunc__hypergeom_mean_ptr
+cdef char ufunc__hypergeom_mean_types[8]  # 2 signatures x 4 type codes each
+cdef char *ufunc__hypergeom_mean_doc = (  # docstring passed to the ufunc constructor below
+    "_hypergeom_mean(r, N, M)\n"
+    "\n"
+    "Mean of hypergeometric distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "r, N, M : array_like\n"
+    "    Positive, integer-valued parameter\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__hypergeom_mean_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # loop for the NPY_FLOAT signature
+ufunc__hypergeom_mean_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop for the NPY_DOUBLE signature
+ufunc__hypergeom_mean_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..3]
+ufunc__hypergeom_mean_types[1] = <char>NPY_FLOAT
+ufunc__hypergeom_mean_types[2] = <char>NPY_FLOAT
+ufunc__hypergeom_mean_types[3] = <char>NPY_FLOAT
+ufunc__hypergeom_mean_types[4] = <char>NPY_DOUBLE  # signature 1 occupies types[4..7]
+ufunc__hypergeom_mean_types[5] = <char>NPY_DOUBLE
+ufunc__hypergeom_mean_types[6] = <char>NPY_DOUBLE
+ufunc__hypergeom_mean_types[7] = <char>NPY_DOUBLE
+ufunc__hypergeom_mean_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_hypergeom_mean_float  # kernel for loop 0
+ufunc__hypergeom_mean_ptr[2*0+1] = <void*>(<char*>"_hypergeom_mean")  # name string paired with the kernel
+ufunc__hypergeom_mean_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_hypergeom_mean_double  # kernel for loop 1
+ufunc__hypergeom_mean_ptr[2*1+1] = <void*>(<char*>"_hypergeom_mean")
+ufunc__hypergeom_mean_data[0] = &ufunc__hypergeom_mean_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc__hypergeom_mean_data[1] = &ufunc__hypergeom_mean_ptr[2*1]
+_hypergeom_mean = np.PyUFunc_FromFuncAndData(ufunc__hypergeom_mean_loops, ufunc__hypergeom_mean_data, ufunc__hypergeom_mean_types, 2, 3, 1, 0, '_hypergeom_mean', ufunc__hypergeom_mean_doc, 0)  # 2, 3, 1 = ntypes, nin, nout in the NumPy C-API argument order
+
+cdef np.PyUFuncGenericFunction ufunc__hypergeom_pmf_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc__hypergeom_pmf_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc__hypergeom_pmf_data[2]  # per-loop data pointers; each points into ufunc__hypergeom_pmf_ptr
+cdef char ufunc__hypergeom_pmf_types[10]  # 2 signatures x 5 type codes each
+cdef char *ufunc__hypergeom_pmf_doc = (  # docstring passed to the ufunc constructor below
+    "_hypergeom_pmf(x, r, N, M)\n"
+    "\n"
+    "Probability mass function of hypergeometric distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued\n"
+    "r, N, M : array_like\n"
+    "    Positive, integer-valued parameter\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__hypergeom_pmf_loops[0] = <np.PyUFuncGenericFunction>loop_f_ffff__As_ffff_f  # loop for the NPY_FLOAT signature
+ufunc__hypergeom_pmf_loops[1] = <np.PyUFuncGenericFunction>loop_d_dddd__As_dddd_d  # loop for the NPY_DOUBLE signature
+ufunc__hypergeom_pmf_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..4]
+ufunc__hypergeom_pmf_types[1] = <char>NPY_FLOAT
+ufunc__hypergeom_pmf_types[2] = <char>NPY_FLOAT
+ufunc__hypergeom_pmf_types[3] = <char>NPY_FLOAT
+ufunc__hypergeom_pmf_types[4] = <char>NPY_FLOAT
+ufunc__hypergeom_pmf_types[5] = <char>NPY_DOUBLE  # signature 1 occupies types[5..9]
+ufunc__hypergeom_pmf_types[6] = <char>NPY_DOUBLE
+ufunc__hypergeom_pmf_types[7] = <char>NPY_DOUBLE
+ufunc__hypergeom_pmf_types[8] = <char>NPY_DOUBLE
+ufunc__hypergeom_pmf_types[9] = <char>NPY_DOUBLE
+ufunc__hypergeom_pmf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_hypergeom_pmf_float  # kernel for loop 0
+ufunc__hypergeom_pmf_ptr[2*0+1] = <void*>(<char*>"_hypergeom_pmf")  # name string paired with the kernel
+ufunc__hypergeom_pmf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_hypergeom_pmf_double  # kernel for loop 1
+ufunc__hypergeom_pmf_ptr[2*1+1] = <void*>(<char*>"_hypergeom_pmf")
+ufunc__hypergeom_pmf_data[0] = &ufunc__hypergeom_pmf_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc__hypergeom_pmf_data[1] = &ufunc__hypergeom_pmf_ptr[2*1]
+_hypergeom_pmf = np.PyUFunc_FromFuncAndData(ufunc__hypergeom_pmf_loops, ufunc__hypergeom_pmf_data, ufunc__hypergeom_pmf_types, 2, 4, 1, 0, '_hypergeom_pmf', ufunc__hypergeom_pmf_doc, 0)  # 2, 4, 1 = ntypes, nin, nout in the NumPy C-API argument order
+
+cdef np.PyUFuncGenericFunction ufunc__hypergeom_sf_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc__hypergeom_sf_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc__hypergeom_sf_data[2]  # per-loop data pointers; each points into ufunc__hypergeom_sf_ptr
+cdef char ufunc__hypergeom_sf_types[10]  # 2 signatures x 5 type codes each
+cdef char *ufunc__hypergeom_sf_doc = (  # docstring passed to the ufunc constructor below
+    "_hypergeom_sf(x, r, N, M)\n"
+    "\n"
+    "Survival function of hypergeometric distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued\n"
+    "r, N, M : array_like\n"
+    "    Positive, integer-valued parameter\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__hypergeom_sf_loops[0] = <np.PyUFuncGenericFunction>loop_f_ffff__As_ffff_f  # loop for the NPY_FLOAT signature
+ufunc__hypergeom_sf_loops[1] = <np.PyUFuncGenericFunction>loop_d_dddd__As_dddd_d  # loop for the NPY_DOUBLE signature
+ufunc__hypergeom_sf_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..4]
+ufunc__hypergeom_sf_types[1] = <char>NPY_FLOAT
+ufunc__hypergeom_sf_types[2] = <char>NPY_FLOAT
+ufunc__hypergeom_sf_types[3] = <char>NPY_FLOAT
+ufunc__hypergeom_sf_types[4] = <char>NPY_FLOAT
+ufunc__hypergeom_sf_types[5] = <char>NPY_DOUBLE  # signature 1 occupies types[5..9]
+ufunc__hypergeom_sf_types[6] = <char>NPY_DOUBLE
+ufunc__hypergeom_sf_types[7] = <char>NPY_DOUBLE
+ufunc__hypergeom_sf_types[8] = <char>NPY_DOUBLE
+ufunc__hypergeom_sf_types[9] = <char>NPY_DOUBLE
+ufunc__hypergeom_sf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_hypergeom_sf_float  # kernel for loop 0
+ufunc__hypergeom_sf_ptr[2*0+1] = <void*>(<char*>"_hypergeom_sf")  # name string paired with the kernel
+ufunc__hypergeom_sf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_hypergeom_sf_double  # kernel for loop 1
+ufunc__hypergeom_sf_ptr[2*1+1] = <void*>(<char*>"_hypergeom_sf")
+ufunc__hypergeom_sf_data[0] = &ufunc__hypergeom_sf_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc__hypergeom_sf_data[1] = &ufunc__hypergeom_sf_ptr[2*1]
+_hypergeom_sf = np.PyUFunc_FromFuncAndData(ufunc__hypergeom_sf_loops, ufunc__hypergeom_sf_data, ufunc__hypergeom_sf_types, 2, 4, 1, 0, '_hypergeom_sf', ufunc__hypergeom_sf_doc, 0)  # 2, 4, 1 = ntypes, nin, nout in the NumPy C-API argument order
+
+cdef np.PyUFuncGenericFunction ufunc__hypergeom_skewness_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc__hypergeom_skewness_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc__hypergeom_skewness_data[2]  # per-loop data pointers; each points into ufunc__hypergeom_skewness_ptr
+cdef char ufunc__hypergeom_skewness_types[8]  # 2 signatures x 4 type codes each
+cdef char *ufunc__hypergeom_skewness_doc = (  # docstring passed to the ufunc constructor below
+    "_hypergeom_skewness(r, N, M)\n"
+    "\n"
+    "Skewness of hypergeometric distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "r, N, M : array_like\n"
+    "    Positive, integer-valued parameter\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__hypergeom_skewness_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # loop for the NPY_FLOAT signature
+ufunc__hypergeom_skewness_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop for the NPY_DOUBLE signature
+ufunc__hypergeom_skewness_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..3]
+ufunc__hypergeom_skewness_types[1] = <char>NPY_FLOAT
+ufunc__hypergeom_skewness_types[2] = <char>NPY_FLOAT
+ufunc__hypergeom_skewness_types[3] = <char>NPY_FLOAT
+ufunc__hypergeom_skewness_types[4] = <char>NPY_DOUBLE  # signature 1 occupies types[4..7]
+ufunc__hypergeom_skewness_types[5] = <char>NPY_DOUBLE
+ufunc__hypergeom_skewness_types[6] = <char>NPY_DOUBLE
+ufunc__hypergeom_skewness_types[7] = <char>NPY_DOUBLE
+ufunc__hypergeom_skewness_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_hypergeom_skewness_float  # kernel for loop 0
+ufunc__hypergeom_skewness_ptr[2*0+1] = <void*>(<char*>"_hypergeom_skewness")  # name string paired with the kernel
+ufunc__hypergeom_skewness_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_hypergeom_skewness_double  # kernel for loop 1
+ufunc__hypergeom_skewness_ptr[2*1+1] = <void*>(<char*>"_hypergeom_skewness")
+ufunc__hypergeom_skewness_data[0] = &ufunc__hypergeom_skewness_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc__hypergeom_skewness_data[1] = &ufunc__hypergeom_skewness_ptr[2*1]
+_hypergeom_skewness = np.PyUFunc_FromFuncAndData(ufunc__hypergeom_skewness_loops, ufunc__hypergeom_skewness_data, ufunc__hypergeom_skewness_types, 2, 3, 1, 0, '_hypergeom_skewness', ufunc__hypergeom_skewness_doc, 0)  # 2, 3, 1 = ntypes, nin, nout in the NumPy C-API argument order
+
+cdef np.PyUFuncGenericFunction ufunc__hypergeom_variance_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc__hypergeom_variance_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc__hypergeom_variance_data[2]  # per-loop data pointers; each points into ufunc__hypergeom_variance_ptr
+cdef char ufunc__hypergeom_variance_types[8]  # 2 signatures x 4 type codes each
+cdef char *ufunc__hypergeom_variance_doc = (  # docstring passed to the ufunc constructor below
+    "_hypergeom_variance(r, N, M)\n"
+    "\n"
+    "Variance of hypergeometric distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "r, N, M : array_like\n"
+    "    Positive, integer-valued parameter\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__hypergeom_variance_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # loop for the NPY_FLOAT signature
+ufunc__hypergeom_variance_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop for the NPY_DOUBLE signature
+ufunc__hypergeom_variance_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..3]
+ufunc__hypergeom_variance_types[1] = <char>NPY_FLOAT
+ufunc__hypergeom_variance_types[2] = <char>NPY_FLOAT
+ufunc__hypergeom_variance_types[3] = <char>NPY_FLOAT
+ufunc__hypergeom_variance_types[4] = <char>NPY_DOUBLE  # signature 1 occupies types[4..7]
+ufunc__hypergeom_variance_types[5] = <char>NPY_DOUBLE
+ufunc__hypergeom_variance_types[6] = <char>NPY_DOUBLE
+ufunc__hypergeom_variance_types[7] = <char>NPY_DOUBLE
+ufunc__hypergeom_variance_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_hypergeom_variance_float  # kernel for loop 0
+ufunc__hypergeom_variance_ptr[2*0+1] = <void*>(<char*>"_hypergeom_variance")  # name string paired with the kernel
+ufunc__hypergeom_variance_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_hypergeom_variance_double  # kernel for loop 1
+ufunc__hypergeom_variance_ptr[2*1+1] = <void*>(<char*>"_hypergeom_variance")
+ufunc__hypergeom_variance_data[0] = &ufunc__hypergeom_variance_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc__hypergeom_variance_data[1] = &ufunc__hypergeom_variance_ptr[2*1]
+_hypergeom_variance = np.PyUFunc_FromFuncAndData(ufunc__hypergeom_variance_loops, ufunc__hypergeom_variance_data, ufunc__hypergeom_variance_types, 2, 3, 1, 0, '_hypergeom_variance', ufunc__hypergeom_variance_doc, 0)  # 2, 3, 1 = ntypes, nin, nout in the NumPy C-API argument order
+
+cdef np.PyUFuncGenericFunction ufunc__igam_fac_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc__igam_fac_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc__igam_fac_data[2]  # per-loop data pointers; each points into ufunc__igam_fac_ptr
+cdef char ufunc__igam_fac_types[6]  # 2 signatures x 3 type codes each
+cdef char *ufunc__igam_fac_doc = (  # docstring passed to the ufunc constructor below
+    "Internal function, do not use.")
+ufunc__igam_fac_loops[0] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f  # loop for the NPY_FLOAT signature
+ufunc__igam_fac_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d  # loop for the NPY_DOUBLE signature
+ufunc__igam_fac_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..2]
+ufunc__igam_fac_types[1] = <char>NPY_FLOAT
+ufunc__igam_fac_types[2] = <char>NPY_FLOAT
+ufunc__igam_fac_types[3] = <char>NPY_DOUBLE  # signature 1 occupies types[3..5]
+ufunc__igam_fac_types[4] = <char>NPY_DOUBLE
+ufunc__igam_fac_types[5] = <char>NPY_DOUBLE
+ufunc__igam_fac_ptr[2*0] = <void*>_func_cephes_igam_fac  # both signatures register the same kernel pointer
+ufunc__igam_fac_ptr[2*0+1] = <void*>(<char*>"_igam_fac")  # name string paired with the kernel
+ufunc__igam_fac_ptr[2*1] = <void*>_func_cephes_igam_fac
+ufunc__igam_fac_ptr[2*1+1] = <void*>(<char*>"_igam_fac")
+ufunc__igam_fac_data[0] = &ufunc__igam_fac_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc__igam_fac_data[1] = &ufunc__igam_fac_ptr[2*1]
+_igam_fac = np.PyUFunc_FromFuncAndData(ufunc__igam_fac_loops, ufunc__igam_fac_data, ufunc__igam_fac_types, 2, 2, 1, 0, '_igam_fac', ufunc__igam_fac_doc, 0)  # 2, 2, 1 = ntypes, nin, nout in the NumPy C-API argument order
+
+cdef np.PyUFuncGenericFunction ufunc__invgauss_isf_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc__invgauss_isf_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc__invgauss_isf_data[2]  # per-loop data pointers; each points into ufunc__invgauss_isf_ptr
+cdef char ufunc__invgauss_isf_types[8]  # 2 signatures x 4 type codes each
+cdef char *ufunc__invgauss_isf_doc = (  # docstring passed to the ufunc constructor below
+    "_invgauss_isf(x, mu, s)\n"
+    "\n"
+    "Inverse survival function of inverse gaussian distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Positive real-valued\n"
+    "mu : array_like\n"
+    "    Positive, real-valued parameters\n"
+    "s : array_like\n"
+    "    Positive, real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__invgauss_isf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # loop for the NPY_FLOAT signature
+ufunc__invgauss_isf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop for the NPY_DOUBLE signature
+ufunc__invgauss_isf_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..3]
+ufunc__invgauss_isf_types[1] = <char>NPY_FLOAT
+ufunc__invgauss_isf_types[2] = <char>NPY_FLOAT
+ufunc__invgauss_isf_types[3] = <char>NPY_FLOAT
+ufunc__invgauss_isf_types[4] = <char>NPY_DOUBLE  # signature 1 occupies types[4..7]
+ufunc__invgauss_isf_types[5] = <char>NPY_DOUBLE
+ufunc__invgauss_isf_types[6] = <char>NPY_DOUBLE
+ufunc__invgauss_isf_types[7] = <char>NPY_DOUBLE
+ufunc__invgauss_isf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_invgauss_isf_float  # kernel for loop 0
+ufunc__invgauss_isf_ptr[2*0+1] = <void*>(<char*>"_invgauss_isf")  # name string paired with the kernel
+ufunc__invgauss_isf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_invgauss_isf_double  # kernel for loop 1
+ufunc__invgauss_isf_ptr[2*1+1] = <void*>(<char*>"_invgauss_isf")
+ufunc__invgauss_isf_data[0] = &ufunc__invgauss_isf_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc__invgauss_isf_data[1] = &ufunc__invgauss_isf_ptr[2*1]
+_invgauss_isf = np.PyUFunc_FromFuncAndData(ufunc__invgauss_isf_loops, ufunc__invgauss_isf_data, ufunc__invgauss_isf_types, 2, 3, 1, 0, '_invgauss_isf', ufunc__invgauss_isf_doc, 0)  # 2, 3, 1 = ntypes, nin, nout in the NumPy C-API argument order
+
+cdef np.PyUFuncGenericFunction ufunc__invgauss_ppf_loops[2]  # one inner loop per type signature
+cdef void *ufunc__invgauss_ppf_ptr[4]  # 2 x (kernel pointer, name string)
+cdef void *ufunc__invgauss_ppf_data[2]  # per-loop pointer into the ptr table
+cdef char ufunc__invgauss_ppf_types[8]  # 2 signatures x (3 inputs + 1 output)
+cdef char *ufunc__invgauss_ppf_doc = (
+    "_invgauss_ppf(x, mu, s)\n"
+    "\n"
+    "Percent point function of inverse gaussian distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Positive real-valued\n"
+    "mu, s : array_like\n"
+    "    Positive, real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__invgauss_ppf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f
+ufunc__invgauss_ppf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc__invgauss_ppf_types[0] = <char>NPY_FLOAT  # signature 0: three float inputs, float output
+ufunc__invgauss_ppf_types[1] = <char>NPY_FLOAT
+ufunc__invgauss_ppf_types[2] = <char>NPY_FLOAT
+ufunc__invgauss_ppf_types[3] = <char>NPY_FLOAT
+ufunc__invgauss_ppf_types[4] = <char>NPY_DOUBLE  # signature 1: three double inputs, double output
+ufunc__invgauss_ppf_types[5] = <char>NPY_DOUBLE
+ufunc__invgauss_ppf_types[6] = <char>NPY_DOUBLE
+ufunc__invgauss_ppf_types[7] = <char>NPY_DOUBLE
+ufunc__invgauss_ppf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_invgauss_ppf_float
+ufunc__invgauss_ppf_ptr[2*0+1] = <void*>(<char*>"_invgauss_ppf")
+ufunc__invgauss_ppf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_invgauss_ppf_double
+ufunc__invgauss_ppf_ptr[2*1+1] = <void*>(<char*>"_invgauss_ppf")
+ufunc__invgauss_ppf_data[0] = &ufunc__invgauss_ppf_ptr[2*0]
+ufunc__invgauss_ppf_data[1] = &ufunc__invgauss_ppf_ptr[2*1]
+_invgauss_ppf = np.PyUFunc_FromFuncAndData(ufunc__invgauss_ppf_loops, ufunc__invgauss_ppf_data, ufunc__invgauss_ppf_types, 2, 3, 1, 0, '_invgauss_ppf', ufunc__invgauss_ppf_doc, 0)  # nin=3: docstring must list x, mu, s
+
+cdef char *ufunc__kolmogc_doc = (
+    "Internal function, do not use.")
+cdef np.PyUFuncGenericFunction ufunc__kolmogc_loops[2]  # one inner loop per type signature
+cdef char ufunc__kolmogc_types[4]  # 2 signatures x (1 input + 1 output)
+cdef void *ufunc__kolmogc_ptr[4]  # 2 x (kernel pointer, name string)
+cdef void *ufunc__kolmogc_data[2]  # per-loop pointer into the ptr table
+ufunc__kolmogc_loops[0] = <np.PyUFuncGenericFunction>loop_d_d__As_f_f  # signature 0: float
+ufunc__kolmogc_types[0] = <char>NPY_FLOAT
+ufunc__kolmogc_types[1] = <char>NPY_FLOAT
+ufunc__kolmogc_ptr[0] = <void*>_func_xsf_kolmogc
+ufunc__kolmogc_ptr[1] = <void*>(<char*>"_kolmogc")
+ufunc__kolmogc_data[0] = &ufunc__kolmogc_ptr[0]
+ufunc__kolmogc_loops[1] = <np.PyUFuncGenericFunction>loop_d_d__As_d_d  # signature 1: double
+ufunc__kolmogc_types[2] = <char>NPY_DOUBLE
+ufunc__kolmogc_types[3] = <char>NPY_DOUBLE
+ufunc__kolmogc_ptr[2] = <void*>_func_xsf_kolmogc
+ufunc__kolmogc_ptr[3] = <void*>(<char*>"_kolmogc")
+ufunc__kolmogc_data[1] = &ufunc__kolmogc_ptr[2]
+_kolmogc = np.PyUFunc_FromFuncAndData(ufunc__kolmogc_loops, ufunc__kolmogc_data, ufunc__kolmogc_types, 2, 1, 1, 0, '_kolmogc', ufunc__kolmogc_doc, 0)
+
+cdef char *ufunc__kolmogci_doc = (
+    "Internal function, do not use.")
+cdef np.PyUFuncGenericFunction ufunc__kolmogci_loops[2]  # one inner loop per type signature
+cdef char ufunc__kolmogci_types[4]  # 2 signatures x (1 input + 1 output)
+cdef void *ufunc__kolmogci_ptr[4]  # 2 x (kernel pointer, name string)
+cdef void *ufunc__kolmogci_data[2]  # per-loop pointer into the ptr table
+ufunc__kolmogci_loops[0] = <np.PyUFuncGenericFunction>loop_d_d__As_f_f  # signature 0: float
+ufunc__kolmogci_types[0] = <char>NPY_FLOAT
+ufunc__kolmogci_types[1] = <char>NPY_FLOAT
+ufunc__kolmogci_ptr[0] = <void*>_func_xsf_kolmogci
+ufunc__kolmogci_ptr[1] = <void*>(<char*>"_kolmogci")
+ufunc__kolmogci_data[0] = &ufunc__kolmogci_ptr[0]
+ufunc__kolmogci_loops[1] = <np.PyUFuncGenericFunction>loop_d_d__As_d_d  # signature 1: double
+ufunc__kolmogci_types[2] = <char>NPY_DOUBLE
+ufunc__kolmogci_types[3] = <char>NPY_DOUBLE
+ufunc__kolmogci_ptr[2] = <void*>_func_xsf_kolmogci
+ufunc__kolmogci_ptr[3] = <void*>(<char*>"_kolmogci")
+ufunc__kolmogci_data[1] = &ufunc__kolmogci_ptr[2]
+_kolmogci = np.PyUFunc_FromFuncAndData(ufunc__kolmogci_loops, ufunc__kolmogci_data, ufunc__kolmogci_types, 2, 1, 1, 0, '_kolmogci', ufunc__kolmogci_doc, 0)
+
+cdef char *ufunc__kolmogp_doc = (
+    "Internal function, do not use.")
+cdef np.PyUFuncGenericFunction ufunc__kolmogp_loops[2]  # one inner loop per type signature
+cdef char ufunc__kolmogp_types[4]  # 2 signatures x (1 input + 1 output)
+cdef void *ufunc__kolmogp_ptr[4]  # 2 x (kernel pointer, name string)
+cdef void *ufunc__kolmogp_data[2]  # per-loop pointer into the ptr table
+ufunc__kolmogp_loops[0] = <np.PyUFuncGenericFunction>loop_d_d__As_f_f  # signature 0: float
+ufunc__kolmogp_types[0] = <char>NPY_FLOAT
+ufunc__kolmogp_types[1] = <char>NPY_FLOAT
+ufunc__kolmogp_ptr[0] = <void*>_func_xsf_kolmogp
+ufunc__kolmogp_ptr[1] = <void*>(<char*>"_kolmogp")
+ufunc__kolmogp_data[0] = &ufunc__kolmogp_ptr[0]
+ufunc__kolmogp_loops[1] = <np.PyUFuncGenericFunction>loop_d_d__As_d_d  # signature 1: double
+ufunc__kolmogp_types[2] = <char>NPY_DOUBLE
+ufunc__kolmogp_types[3] = <char>NPY_DOUBLE
+ufunc__kolmogp_ptr[2] = <void*>_func_xsf_kolmogp
+ufunc__kolmogp_ptr[3] = <void*>(<char*>"_kolmogp")
+ufunc__kolmogp_data[1] = &ufunc__kolmogp_ptr[2]
+_kolmogp = np.PyUFunc_FromFuncAndData(ufunc__kolmogp_loops, ufunc__kolmogp_data, ufunc__kolmogp_types, 2, 1, 1, 0, '_kolmogp', ufunc__kolmogp_doc, 0)
+
+cdef char *ufunc__lanczos_sum_expg_scaled_doc = (
+    "Internal function, do not use.")
+cdef np.PyUFuncGenericFunction ufunc__lanczos_sum_expg_scaled_loops[2]  # one inner loop per type signature
+cdef char ufunc__lanczos_sum_expg_scaled_types[4]  # 2 signatures x (1 input + 1 output)
+cdef void *ufunc__lanczos_sum_expg_scaled_ptr[4]  # 2 x (kernel pointer, name string)
+cdef void *ufunc__lanczos_sum_expg_scaled_data[2]  # per-loop pointer into the ptr table
+ufunc__lanczos_sum_expg_scaled_loops[0] = <np.PyUFuncGenericFunction>loop_d_d__As_f_f  # signature 0: float
+ufunc__lanczos_sum_expg_scaled_types[0] = <char>NPY_FLOAT
+ufunc__lanczos_sum_expg_scaled_types[1] = <char>NPY_FLOAT
+ufunc__lanczos_sum_expg_scaled_ptr[0] = <void*>_func_cephes_lanczos_sum_expg_scaled
+ufunc__lanczos_sum_expg_scaled_ptr[1] = <void*>(<char*>"_lanczos_sum_expg_scaled")
+ufunc__lanczos_sum_expg_scaled_data[0] = &ufunc__lanczos_sum_expg_scaled_ptr[0]
+ufunc__lanczos_sum_expg_scaled_loops[1] = <np.PyUFuncGenericFunction>loop_d_d__As_d_d  # signature 1: double
+ufunc__lanczos_sum_expg_scaled_types[2] = <char>NPY_DOUBLE
+ufunc__lanczos_sum_expg_scaled_types[3] = <char>NPY_DOUBLE
+ufunc__lanczos_sum_expg_scaled_ptr[2] = <void*>_func_cephes_lanczos_sum_expg_scaled
+ufunc__lanczos_sum_expg_scaled_ptr[3] = <void*>(<char*>"_lanczos_sum_expg_scaled")
+ufunc__lanczos_sum_expg_scaled_data[1] = &ufunc__lanczos_sum_expg_scaled_ptr[2]
+_lanczos_sum_expg_scaled = np.PyUFunc_FromFuncAndData(ufunc__lanczos_sum_expg_scaled_loops, ufunc__lanczos_sum_expg_scaled_data, ufunc__lanczos_sum_expg_scaled_types, 2, 1, 1, 0, '_lanczos_sum_expg_scaled', ufunc__lanczos_sum_expg_scaled_doc, 0)
+
+cdef char *ufunc__landau_cdf_doc = (
+    "_landau_cdf(x, loc, scale)\n"
+    "\n"
+    "Cumulative distribution function of the Landau distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued argument\n"
+    "loc : array_like\n"
+    "    Real-valued distribution location\n"
+    "scale : array_like\n"
+    "    Positive, real-valued distribution scale\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+cdef np.PyUFuncGenericFunction ufunc__landau_cdf_loops[2]  # one inner loop per type signature
+cdef char ufunc__landau_cdf_types[8]  # 2 signatures x (3 inputs + 1 output)
+cdef void *ufunc__landau_cdf_ptr[4]  # 2 x (kernel pointer, name string)
+cdef void *ufunc__landau_cdf_data[2]  # per-loop pointer into the ptr table
+ufunc__landau_cdf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # signature 0: float
+ufunc__landau_cdf_types[0] = <char>NPY_FLOAT
+ufunc__landau_cdf_types[1] = <char>NPY_FLOAT
+ufunc__landau_cdf_types[2] = <char>NPY_FLOAT
+ufunc__landau_cdf_types[3] = <char>NPY_FLOAT
+ufunc__landau_cdf_ptr[0] = <void*>scipy.special._ufuncs_cxx._export_landau_cdf_float
+ufunc__landau_cdf_ptr[1] = <void*>(<char*>"_landau_cdf")
+ufunc__landau_cdf_data[0] = &ufunc__landau_cdf_ptr[0]
+ufunc__landau_cdf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # signature 1: double
+ufunc__landau_cdf_types[4] = <char>NPY_DOUBLE
+ufunc__landau_cdf_types[5] = <char>NPY_DOUBLE
+ufunc__landau_cdf_types[6] = <char>NPY_DOUBLE
+ufunc__landau_cdf_types[7] = <char>NPY_DOUBLE
+ufunc__landau_cdf_ptr[2] = <void*>scipy.special._ufuncs_cxx._export_landau_cdf_double
+ufunc__landau_cdf_ptr[3] = <void*>(<char*>"_landau_cdf")
+ufunc__landau_cdf_data[1] = &ufunc__landau_cdf_ptr[2]
+_landau_cdf = np.PyUFunc_FromFuncAndData(ufunc__landau_cdf_loops, ufunc__landau_cdf_data, ufunc__landau_cdf_types, 2, 3, 1, 0, '_landau_cdf', ufunc__landau_cdf_doc, 0)
+
+cdef char *ufunc__landau_isf_doc = (
+    "_landau_isf(p, loc, scale)\n"
+    "\n"
+    "Inverse survival function of the Landau distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "p : array_like\n"
+    "    Real-valued argument between 0 and 1\n"
+    "loc : array_like\n"
+    "    Real-valued distribution location\n"
+    "scale : array_like\n"
+    "    Positive, real-valued distribution scale\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+cdef np.PyUFuncGenericFunction ufunc__landau_isf_loops[2]  # one inner loop per type signature
+cdef char ufunc__landau_isf_types[8]  # 2 signatures x (3 inputs + 1 output)
+cdef void *ufunc__landau_isf_ptr[4]  # 2 x (kernel pointer, name string)
+cdef void *ufunc__landau_isf_data[2]  # per-loop pointer into the ptr table
+ufunc__landau_isf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # signature 0: float
+ufunc__landau_isf_types[0] = <char>NPY_FLOAT
+ufunc__landau_isf_types[1] = <char>NPY_FLOAT
+ufunc__landau_isf_types[2] = <char>NPY_FLOAT
+ufunc__landau_isf_types[3] = <char>NPY_FLOAT
+ufunc__landau_isf_ptr[0] = <void*>scipy.special._ufuncs_cxx._export_landau_isf_float
+ufunc__landau_isf_ptr[1] = <void*>(<char*>"_landau_isf")
+ufunc__landau_isf_data[0] = &ufunc__landau_isf_ptr[0]
+ufunc__landau_isf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # signature 1: double
+ufunc__landau_isf_types[4] = <char>NPY_DOUBLE
+ufunc__landau_isf_types[5] = <char>NPY_DOUBLE
+ufunc__landau_isf_types[6] = <char>NPY_DOUBLE
+ufunc__landau_isf_types[7] = <char>NPY_DOUBLE
+ufunc__landau_isf_ptr[2] = <void*>scipy.special._ufuncs_cxx._export_landau_isf_double
+ufunc__landau_isf_ptr[3] = <void*>(<char*>"_landau_isf")
+ufunc__landau_isf_data[1] = &ufunc__landau_isf_ptr[2]
+_landau_isf = np.PyUFunc_FromFuncAndData(ufunc__landau_isf_loops, ufunc__landau_isf_data, ufunc__landau_isf_types, 2, 3, 1, 0, '_landau_isf', ufunc__landau_isf_doc, 0)
+
+cdef char *ufunc__landau_pdf_doc = (
+    "_landau_pdf(x, loc, scale)\n"
+    "\n"
+    "Probability density function of the Landau distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued argument\n"
+    "loc : array_like\n"
+    "    Real-valued distribution location\n"
+    "scale : array_like\n"
+    "    Positive, real-valued distribution scale\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+cdef np.PyUFuncGenericFunction ufunc__landau_pdf_loops[2]  # one inner loop per type signature
+cdef char ufunc__landau_pdf_types[8]  # 2 signatures x (3 inputs + 1 output)
+cdef void *ufunc__landau_pdf_ptr[4]  # 2 x (kernel pointer, name string)
+cdef void *ufunc__landau_pdf_data[2]  # per-loop pointer into the ptr table
+ufunc__landau_pdf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # signature 0: float
+ufunc__landau_pdf_types[0] = <char>NPY_FLOAT
+ufunc__landau_pdf_types[1] = <char>NPY_FLOAT
+ufunc__landau_pdf_types[2] = <char>NPY_FLOAT
+ufunc__landau_pdf_types[3] = <char>NPY_FLOAT
+ufunc__landau_pdf_ptr[0] = <void*>scipy.special._ufuncs_cxx._export_landau_pdf_float
+ufunc__landau_pdf_ptr[1] = <void*>(<char*>"_landau_pdf")
+ufunc__landau_pdf_data[0] = &ufunc__landau_pdf_ptr[0]
+ufunc__landau_pdf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # signature 1: double
+ufunc__landau_pdf_types[4] = <char>NPY_DOUBLE
+ufunc__landau_pdf_types[5] = <char>NPY_DOUBLE
+ufunc__landau_pdf_types[6] = <char>NPY_DOUBLE
+ufunc__landau_pdf_types[7] = <char>NPY_DOUBLE
+ufunc__landau_pdf_ptr[2] = <void*>scipy.special._ufuncs_cxx._export_landau_pdf_double
+ufunc__landau_pdf_ptr[3] = <void*>(<char*>"_landau_pdf")
+ufunc__landau_pdf_data[1] = &ufunc__landau_pdf_ptr[2]
+_landau_pdf = np.PyUFunc_FromFuncAndData(ufunc__landau_pdf_loops, ufunc__landau_pdf_data, ufunc__landau_pdf_types, 2, 3, 1, 0, '_landau_pdf', ufunc__landau_pdf_doc, 0)
+
+cdef char *ufunc__landau_ppf_doc = (
+    "_landau_ppf(p, loc, scale)\n"
+    "\n"
+    "Percent point function of the Landau distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "p : array_like\n"
+    "    Real-valued argument between 0 and 1\n"
+    "loc : array_like\n"
+    "    Real-valued distribution location\n"
+    "scale : array_like\n"
+    "    Positive, real-valued distribution scale\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+cdef np.PyUFuncGenericFunction ufunc__landau_ppf_loops[2]  # one inner loop per type signature
+cdef char ufunc__landau_ppf_types[8]  # 2 signatures x (3 inputs + 1 output)
+cdef void *ufunc__landau_ppf_ptr[4]  # 2 x (kernel pointer, name string)
+cdef void *ufunc__landau_ppf_data[2]  # per-loop pointer into the ptr table
+ufunc__landau_ppf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # signature 0: float
+ufunc__landau_ppf_types[0] = <char>NPY_FLOAT
+ufunc__landau_ppf_types[1] = <char>NPY_FLOAT
+ufunc__landau_ppf_types[2] = <char>NPY_FLOAT
+ufunc__landau_ppf_types[3] = <char>NPY_FLOAT
+ufunc__landau_ppf_ptr[0] = <void*>scipy.special._ufuncs_cxx._export_landau_ppf_float
+ufunc__landau_ppf_ptr[1] = <void*>(<char*>"_landau_ppf")
+ufunc__landau_ppf_data[0] = &ufunc__landau_ppf_ptr[0]
+ufunc__landau_ppf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # signature 1: double
+ufunc__landau_ppf_types[4] = <char>NPY_DOUBLE
+ufunc__landau_ppf_types[5] = <char>NPY_DOUBLE
+ufunc__landau_ppf_types[6] = <char>NPY_DOUBLE
+ufunc__landau_ppf_types[7] = <char>NPY_DOUBLE
+ufunc__landau_ppf_ptr[2] = <void*>scipy.special._ufuncs_cxx._export_landau_ppf_double
+ufunc__landau_ppf_ptr[3] = <void*>(<char*>"_landau_ppf")
+ufunc__landau_ppf_data[1] = &ufunc__landau_ppf_ptr[2]
+_landau_ppf = np.PyUFunc_FromFuncAndData(ufunc__landau_ppf_loops, ufunc__landau_ppf_data, ufunc__landau_ppf_types, 2, 3, 1, 0, '_landau_ppf', ufunc__landau_ppf_doc, 0)
+
+cdef char *ufunc__landau_sf_doc = (
+    "_landau_sf(x, loc, scale)\n"
+    "\n"
+    "Survival function of the Landau distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued argument\n"
+    "loc : array_like\n"
+    "    Real-valued distribution location\n"
+    "scale : array_like\n"
+    "    Positive, real-valued distribution scale\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+cdef np.PyUFuncGenericFunction ufunc__landau_sf_loops[2]  # one inner loop per type signature
+cdef char ufunc__landau_sf_types[8]  # 2 signatures x (3 inputs + 1 output)
+cdef void *ufunc__landau_sf_ptr[4]  # 2 x (kernel pointer, name string)
+cdef void *ufunc__landau_sf_data[2]  # per-loop pointer into the ptr table
+ufunc__landau_sf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # signature 0: float
+ufunc__landau_sf_types[0] = <char>NPY_FLOAT
+ufunc__landau_sf_types[1] = <char>NPY_FLOAT
+ufunc__landau_sf_types[2] = <char>NPY_FLOAT
+ufunc__landau_sf_types[3] = <char>NPY_FLOAT
+ufunc__landau_sf_ptr[0] = <void*>scipy.special._ufuncs_cxx._export_landau_sf_float
+ufunc__landau_sf_ptr[1] = <void*>(<char*>"_landau_sf")
+ufunc__landau_sf_data[0] = &ufunc__landau_sf_ptr[0]
+ufunc__landau_sf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # signature 1: double
+ufunc__landau_sf_types[4] = <char>NPY_DOUBLE
+ufunc__landau_sf_types[5] = <char>NPY_DOUBLE
+ufunc__landau_sf_types[6] = <char>NPY_DOUBLE
+ufunc__landau_sf_types[7] = <char>NPY_DOUBLE
+ufunc__landau_sf_ptr[2] = <void*>scipy.special._ufuncs_cxx._export_landau_sf_double
+ufunc__landau_sf_ptr[3] = <void*>(<char*>"_landau_sf")
+ufunc__landau_sf_data[1] = &ufunc__landau_sf_ptr[2]
+_landau_sf = np.PyUFunc_FromFuncAndData(ufunc__landau_sf_loops, ufunc__landau_sf_data, ufunc__landau_sf_types, 2, 3, 1, 0, '_landau_sf', ufunc__landau_sf_doc, 0)
+
+cdef char *ufunc__lgam1p_doc = (
+    "Internal function, do not use.")
+cdef np.PyUFuncGenericFunction ufunc__lgam1p_loops[2]  # one inner loop per type signature
+cdef char ufunc__lgam1p_types[4]  # 2 signatures x (1 input + 1 output)
+cdef void *ufunc__lgam1p_ptr[4]  # 2 x (kernel pointer, name string)
+cdef void *ufunc__lgam1p_data[2]  # per-loop pointer into the ptr table
+ufunc__lgam1p_loops[0] = <np.PyUFuncGenericFunction>loop_d_d__As_f_f  # signature 0: float
+ufunc__lgam1p_types[0] = <char>NPY_FLOAT
+ufunc__lgam1p_types[1] = <char>NPY_FLOAT
+ufunc__lgam1p_ptr[0] = <void*>_func_cephes_lgam1p
+ufunc__lgam1p_ptr[1] = <void*>(<char*>"_lgam1p")
+ufunc__lgam1p_data[0] = &ufunc__lgam1p_ptr[0]
+ufunc__lgam1p_loops[1] = <np.PyUFuncGenericFunction>loop_d_d__As_d_d  # signature 1: double
+ufunc__lgam1p_types[2] = <char>NPY_DOUBLE
+ufunc__lgam1p_types[3] = <char>NPY_DOUBLE
+ufunc__lgam1p_ptr[2] = <void*>_func_cephes_lgam1p
+ufunc__lgam1p_ptr[3] = <void*>(<char*>"_lgam1p")
+ufunc__lgam1p_data[1] = &ufunc__lgam1p_ptr[2]
+_lgam1p = np.PyUFunc_FromFuncAndData(ufunc__lgam1p_loops, ufunc__lgam1p_data, ufunc__lgam1p_types, 2, 1, 1, 0, '_lgam1p', ufunc__lgam1p_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc__nbinom_cdf_loops[2]  # one inner loop per type signature
+cdef void *ufunc__nbinom_cdf_ptr[4]  # 2 x (kernel pointer, name string)
+cdef void *ufunc__nbinom_cdf_data[2]  # per-loop pointer into the ptr table
+cdef char ufunc__nbinom_cdf_types[8]  # 2 signatures x (3 inputs + 1 output)
+cdef char *ufunc__nbinom_cdf_doc = (
+    "_nbinom_cdf(x, r, p)\n"
+    "\n"
+    "Cumulative distribution function of negative binomial distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued\n"
+    "r : array_like\n"
+    "    Positive, integer-valued parameter\n"
+    "p : array_like\n"
+    "    Positive, real-valued parameter\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__nbinom_cdf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f
+ufunc__nbinom_cdf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc__nbinom_cdf_types[0] = <char>NPY_FLOAT  # signature 0: three float inputs, float output
+ufunc__nbinom_cdf_types[1] = <char>NPY_FLOAT
+ufunc__nbinom_cdf_types[2] = <char>NPY_FLOAT
+ufunc__nbinom_cdf_types[3] = <char>NPY_FLOAT
+ufunc__nbinom_cdf_types[4] = <char>NPY_DOUBLE  # signature 1: three double inputs, double output
+ufunc__nbinom_cdf_types[5] = <char>NPY_DOUBLE
+ufunc__nbinom_cdf_types[6] = <char>NPY_DOUBLE
+ufunc__nbinom_cdf_types[7] = <char>NPY_DOUBLE
+ufunc__nbinom_cdf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_nbinom_cdf_float
+ufunc__nbinom_cdf_ptr[2*0+1] = <void*>(<char*>"_nbinom_cdf")
+ufunc__nbinom_cdf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_nbinom_cdf_double
+ufunc__nbinom_cdf_ptr[2*1+1] = <void*>(<char*>"_nbinom_cdf")
+ufunc__nbinom_cdf_data[0] = &ufunc__nbinom_cdf_ptr[2*0]
+ufunc__nbinom_cdf_data[1] = &ufunc__nbinom_cdf_ptr[2*1]
+_nbinom_cdf = np.PyUFunc_FromFuncAndData(ufunc__nbinom_cdf_loops, ufunc__nbinom_cdf_data, ufunc__nbinom_cdf_types, 2, 3, 1, 0, '_nbinom_cdf', ufunc__nbinom_cdf_doc, 0)
+
+cdef char *ufunc__nbinom_isf_doc = (
+    "_nbinom_isf(x, r, p)\n"
+    "\n"
+    "Inverse survival function of negative binomial distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued\n"
+    "r : array_like\n"
+    "    Positive, integer-valued parameter\n"
+    "p : array_like\n"
+    "    Positive, real-valued parameter\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+cdef np.PyUFuncGenericFunction ufunc__nbinom_isf_loops[2]  # one inner loop per type signature
+cdef char ufunc__nbinom_isf_types[8]  # 2 signatures x (3 inputs + 1 output)
+cdef void *ufunc__nbinom_isf_ptr[4]  # 2 x (kernel pointer, name string)
+cdef void *ufunc__nbinom_isf_data[2]  # per-loop pointer into the ptr table
+ufunc__nbinom_isf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # signature 0: float
+ufunc__nbinom_isf_types[0] = <char>NPY_FLOAT
+ufunc__nbinom_isf_types[1] = <char>NPY_FLOAT
+ufunc__nbinom_isf_types[2] = <char>NPY_FLOAT
+ufunc__nbinom_isf_types[3] = <char>NPY_FLOAT
+ufunc__nbinom_isf_ptr[0] = <void*>scipy.special._ufuncs_cxx._export_nbinom_isf_float
+ufunc__nbinom_isf_ptr[1] = <void*>(<char*>"_nbinom_isf")
+ufunc__nbinom_isf_data[0] = &ufunc__nbinom_isf_ptr[0]
+ufunc__nbinom_isf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # signature 1: double
+ufunc__nbinom_isf_types[4] = <char>NPY_DOUBLE
+ufunc__nbinom_isf_types[5] = <char>NPY_DOUBLE
+ufunc__nbinom_isf_types[6] = <char>NPY_DOUBLE
+ufunc__nbinom_isf_types[7] = <char>NPY_DOUBLE
+ufunc__nbinom_isf_ptr[2] = <void*>scipy.special._ufuncs_cxx._export_nbinom_isf_double
+ufunc__nbinom_isf_ptr[3] = <void*>(<char*>"_nbinom_isf")
+ufunc__nbinom_isf_data[1] = &ufunc__nbinom_isf_ptr[2]
+_nbinom_isf = np.PyUFunc_FromFuncAndData(ufunc__nbinom_isf_loops, ufunc__nbinom_isf_data, ufunc__nbinom_isf_types, 2, 3, 1, 0, '_nbinom_isf', ufunc__nbinom_isf_doc, 0)
+
+cdef char *ufunc__nbinom_kurtosis_excess_doc = (
+    "_nbinom_kurtosis_excess(r, p)\n"
+    "\n"
+    "Kurtosis excess of negative binomial distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "r : array_like\n"
+    "    Positive, integer-valued parameter\n"
+    "p : array_like\n"
+    "    Positive, real-valued parameter\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+cdef np.PyUFuncGenericFunction ufunc__nbinom_kurtosis_excess_loops[2]  # one inner loop per type signature
+cdef char ufunc__nbinom_kurtosis_excess_types[6]  # 2 signatures x (2 inputs + 1 output)
+cdef void *ufunc__nbinom_kurtosis_excess_ptr[4]  # 2 x (kernel pointer, name string)
+cdef void *ufunc__nbinom_kurtosis_excess_data[2]  # per-loop pointer into the ptr table
+ufunc__nbinom_kurtosis_excess_loops[0] = <np.PyUFuncGenericFunction>loop_f_ff__As_ff_f  # signature 0: float
+ufunc__nbinom_kurtosis_excess_types[0] = <char>NPY_FLOAT
+ufunc__nbinom_kurtosis_excess_types[1] = <char>NPY_FLOAT
+ufunc__nbinom_kurtosis_excess_types[2] = <char>NPY_FLOAT
+ufunc__nbinom_kurtosis_excess_ptr[0] = <void*>scipy.special._ufuncs_cxx._export_nbinom_kurtosis_excess_float
+ufunc__nbinom_kurtosis_excess_ptr[1] = <void*>(<char*>"_nbinom_kurtosis_excess")
+ufunc__nbinom_kurtosis_excess_data[0] = &ufunc__nbinom_kurtosis_excess_ptr[0]
+ufunc__nbinom_kurtosis_excess_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d  # signature 1: double
+ufunc__nbinom_kurtosis_excess_types[3] = <char>NPY_DOUBLE
+ufunc__nbinom_kurtosis_excess_types[4] = <char>NPY_DOUBLE
+ufunc__nbinom_kurtosis_excess_types[5] = <char>NPY_DOUBLE
+ufunc__nbinom_kurtosis_excess_ptr[2] = <void*>scipy.special._ufuncs_cxx._export_nbinom_kurtosis_excess_double
+ufunc__nbinom_kurtosis_excess_ptr[3] = <void*>(<char*>"_nbinom_kurtosis_excess")
+ufunc__nbinom_kurtosis_excess_data[1] = &ufunc__nbinom_kurtosis_excess_ptr[2]
+_nbinom_kurtosis_excess = np.PyUFunc_FromFuncAndData(ufunc__nbinom_kurtosis_excess_loops, ufunc__nbinom_kurtosis_excess_data, ufunc__nbinom_kurtosis_excess_types, 2, 2, 1, 0, '_nbinom_kurtosis_excess', ufunc__nbinom_kurtosis_excess_doc, 0)
+
+cdef char *ufunc__nbinom_mean_doc = (
+    "_nbinom_mean(r, p)\n"
+    "\n"
+    "Mean of negative binomial distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "r : array_like\n"
+    "    Positive, integer-valued parameter\n"
+    "p : array_like\n"
+    "    Positive, real-valued parameter\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+cdef np.PyUFuncGenericFunction ufunc__nbinom_mean_loops[2]  # one inner loop per type signature
+cdef char ufunc__nbinom_mean_types[6]  # 2 signatures x (2 inputs + 1 output)
+cdef void *ufunc__nbinom_mean_ptr[4]  # 2 x (kernel pointer, name string)
+cdef void *ufunc__nbinom_mean_data[2]  # per-loop pointer into the ptr table
+ufunc__nbinom_mean_loops[0] = <np.PyUFuncGenericFunction>loop_f_ff__As_ff_f  # signature 0: float
+ufunc__nbinom_mean_types[0] = <char>NPY_FLOAT
+ufunc__nbinom_mean_types[1] = <char>NPY_FLOAT
+ufunc__nbinom_mean_types[2] = <char>NPY_FLOAT
+ufunc__nbinom_mean_ptr[0] = <void*>scipy.special._ufuncs_cxx._export_nbinom_mean_float
+ufunc__nbinom_mean_ptr[1] = <void*>(<char*>"_nbinom_mean")
+ufunc__nbinom_mean_data[0] = &ufunc__nbinom_mean_ptr[0]
+ufunc__nbinom_mean_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d  # signature 1: double
+ufunc__nbinom_mean_types[3] = <char>NPY_DOUBLE
+ufunc__nbinom_mean_types[4] = <char>NPY_DOUBLE
+ufunc__nbinom_mean_types[5] = <char>NPY_DOUBLE
+ufunc__nbinom_mean_ptr[2] = <void*>scipy.special._ufuncs_cxx._export_nbinom_mean_double
+ufunc__nbinom_mean_ptr[3] = <void*>(<char*>"_nbinom_mean")
+ufunc__nbinom_mean_data[1] = &ufunc__nbinom_mean_ptr[2]
+_nbinom_mean = np.PyUFunc_FromFuncAndData(ufunc__nbinom_mean_loops, ufunc__nbinom_mean_data, ufunc__nbinom_mean_types, 2, 2, 1, 0, '_nbinom_mean', ufunc__nbinom_mean_doc, 0)
+
+cdef char *ufunc__nbinom_pmf_doc = (
+    "_nbinom_pmf(x, r, p)\n"
+    "\n"
+    "Probability mass function of negative binomial distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued\n"
+    "r : array_like\n"
+    "    Positive, integer-valued parameter\n"
+    "p : array_like\n"
+    "    Positive, real-valued parameter\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+cdef np.PyUFuncGenericFunction ufunc__nbinom_pmf_loops[2]  # one inner loop per type signature
+cdef char ufunc__nbinom_pmf_types[8]  # 2 signatures x (3 inputs + 1 output)
+cdef void *ufunc__nbinom_pmf_ptr[4]  # 2 x (kernel pointer, name string)
+cdef void *ufunc__nbinom_pmf_data[2]  # per-loop pointer into the ptr table
+ufunc__nbinom_pmf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # signature 0: float
+ufunc__nbinom_pmf_types[0] = <char>NPY_FLOAT
+ufunc__nbinom_pmf_types[1] = <char>NPY_FLOAT
+ufunc__nbinom_pmf_types[2] = <char>NPY_FLOAT
+ufunc__nbinom_pmf_types[3] = <char>NPY_FLOAT
+ufunc__nbinom_pmf_ptr[0] = <void*>scipy.special._ufuncs_cxx._export_nbinom_pmf_float
+ufunc__nbinom_pmf_ptr[1] = <void*>(<char*>"_nbinom_pmf")
+ufunc__nbinom_pmf_data[0] = &ufunc__nbinom_pmf_ptr[0]
+ufunc__nbinom_pmf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # signature 1: double
+ufunc__nbinom_pmf_types[4] = <char>NPY_DOUBLE
+ufunc__nbinom_pmf_types[5] = <char>NPY_DOUBLE
+ufunc__nbinom_pmf_types[6] = <char>NPY_DOUBLE
+ufunc__nbinom_pmf_types[7] = <char>NPY_DOUBLE
+ufunc__nbinom_pmf_ptr[2] = <void*>scipy.special._ufuncs_cxx._export_nbinom_pmf_double
+ufunc__nbinom_pmf_ptr[3] = <void*>(<char*>"_nbinom_pmf")
+ufunc__nbinom_pmf_data[1] = &ufunc__nbinom_pmf_ptr[2]
+_nbinom_pmf = np.PyUFunc_FromFuncAndData(ufunc__nbinom_pmf_loops, ufunc__nbinom_pmf_data, ufunc__nbinom_pmf_types, 2, 3, 1, 0, '_nbinom_pmf', ufunc__nbinom_pmf_doc, 0)
+
+cdef char *ufunc__nbinom_ppf_doc = (
+    "_nbinom_ppf(x, r, p)\n"
+    "\n"
+    "Percent point function of negative binomial distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued\n"
+    "r : array_like\n"
+    "    Positive, integer-valued parameter\n"
+    "p : array_like\n"
+    "    Positive, real-valued parameter\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+cdef np.PyUFuncGenericFunction ufunc__nbinom_ppf_loops[2]  # one inner loop per type signature
+cdef char ufunc__nbinom_ppf_types[8]  # 2 signatures x (3 inputs + 1 output)
+cdef void *ufunc__nbinom_ppf_ptr[4]  # 2 x (kernel pointer, name string)
+cdef void *ufunc__nbinom_ppf_data[2]  # per-loop pointer into the ptr table
+ufunc__nbinom_ppf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # signature 0: float
+ufunc__nbinom_ppf_types[0] = <char>NPY_FLOAT
+ufunc__nbinom_ppf_types[1] = <char>NPY_FLOAT
+ufunc__nbinom_ppf_types[2] = <char>NPY_FLOAT
+ufunc__nbinom_ppf_types[3] = <char>NPY_FLOAT
+ufunc__nbinom_ppf_ptr[0] = <void*>scipy.special._ufuncs_cxx._export_nbinom_ppf_float
+ufunc__nbinom_ppf_ptr[1] = <void*>(<char*>"_nbinom_ppf")
+ufunc__nbinom_ppf_data[0] = &ufunc__nbinom_ppf_ptr[0]
+ufunc__nbinom_ppf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # signature 1: double
+ufunc__nbinom_ppf_types[4] = <char>NPY_DOUBLE
+ufunc__nbinom_ppf_types[5] = <char>NPY_DOUBLE
+ufunc__nbinom_ppf_types[6] = <char>NPY_DOUBLE
+ufunc__nbinom_ppf_types[7] = <char>NPY_DOUBLE
+ufunc__nbinom_ppf_ptr[2] = <void*>scipy.special._ufuncs_cxx._export_nbinom_ppf_double
+ufunc__nbinom_ppf_ptr[3] = <void*>(<char*>"_nbinom_ppf")
+ufunc__nbinom_ppf_data[1] = &ufunc__nbinom_ppf_ptr[2]
+_nbinom_ppf = np.PyUFunc_FromFuncAndData(ufunc__nbinom_ppf_loops, ufunc__nbinom_ppf_data, ufunc__nbinom_ppf_types, 2, 3, 1, 0, '_nbinom_ppf', ufunc__nbinom_ppf_doc, 0)
+
+cdef char *ufunc__nbinom_sf_doc = (
+    "_nbinom_sf(x, r, p)\n"
+    "\n"
+    "Survival function of negative binomial distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued\n"
+    "r : array_like\n"
+    "    Positive, integer-valued parameter\n"
+    "p : array_like\n"
+    "    Positive, real-valued parameter\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+cdef np.PyUFuncGenericFunction ufunc__nbinom_sf_loops[2]  # one inner loop per type signature
+cdef char ufunc__nbinom_sf_types[8]  # 2 signatures x (3 inputs + 1 output)
+cdef void *ufunc__nbinom_sf_ptr[4]  # 2 x (kernel pointer, name string)
+cdef void *ufunc__nbinom_sf_data[2]  # per-loop pointer into the ptr table
+ufunc__nbinom_sf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # signature 0: float
+ufunc__nbinom_sf_types[0] = <char>NPY_FLOAT
+ufunc__nbinom_sf_types[1] = <char>NPY_FLOAT
+ufunc__nbinom_sf_types[2] = <char>NPY_FLOAT
+ufunc__nbinom_sf_types[3] = <char>NPY_FLOAT
+ufunc__nbinom_sf_ptr[0] = <void*>scipy.special._ufuncs_cxx._export_nbinom_sf_float
+ufunc__nbinom_sf_ptr[1] = <void*>(<char*>"_nbinom_sf")
+ufunc__nbinom_sf_data[0] = &ufunc__nbinom_sf_ptr[0]
+ufunc__nbinom_sf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # signature 1: double
+ufunc__nbinom_sf_types[4] = <char>NPY_DOUBLE
+ufunc__nbinom_sf_types[5] = <char>NPY_DOUBLE
+ufunc__nbinom_sf_types[6] = <char>NPY_DOUBLE
+ufunc__nbinom_sf_types[7] = <char>NPY_DOUBLE
+ufunc__nbinom_sf_ptr[2] = <void*>scipy.special._ufuncs_cxx._export_nbinom_sf_double
+ufunc__nbinom_sf_ptr[3] = <void*>(<char*>"_nbinom_sf")
+ufunc__nbinom_sf_data[1] = &ufunc__nbinom_sf_ptr[2]
+_nbinom_sf = np.PyUFunc_FromFuncAndData(ufunc__nbinom_sf_loops, ufunc__nbinom_sf_data, ufunc__nbinom_sf_types, 2, 3, 1, 0, '_nbinom_sf', ufunc__nbinom_sf_doc, 0)
+
+cdef char *ufunc__nbinom_skewness_doc = (
+    "_nbinom_skewness(r, p)\n"
+    "\n"
+    "Skewness of negative binomial distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "r : array_like\n"
+    "    Positive, integer-valued parameter\n"
+    "p : array_like\n"
+    "    Positive, real-valued parameter\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+cdef np.PyUFuncGenericFunction ufunc__nbinom_skewness_loops[2]  # one inner loop per type signature
+cdef char ufunc__nbinom_skewness_types[6]  # 2 signatures x (2 inputs + 1 output)
+cdef void *ufunc__nbinom_skewness_ptr[4]  # 2 x (kernel pointer, name string)
+cdef void *ufunc__nbinom_skewness_data[2]  # per-loop pointer into the ptr table
+ufunc__nbinom_skewness_loops[0] = <np.PyUFuncGenericFunction>loop_f_ff__As_ff_f  # signature 0: float
+ufunc__nbinom_skewness_types[0] = <char>NPY_FLOAT
+ufunc__nbinom_skewness_types[1] = <char>NPY_FLOAT
+ufunc__nbinom_skewness_types[2] = <char>NPY_FLOAT
+ufunc__nbinom_skewness_ptr[0] = <void*>scipy.special._ufuncs_cxx._export_nbinom_skewness_float
+ufunc__nbinom_skewness_ptr[1] = <void*>(<char*>"_nbinom_skewness")
+ufunc__nbinom_skewness_data[0] = &ufunc__nbinom_skewness_ptr[0]
+ufunc__nbinom_skewness_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d  # signature 1: double
+ufunc__nbinom_skewness_types[3] = <char>NPY_DOUBLE
+ufunc__nbinom_skewness_types[4] = <char>NPY_DOUBLE
+ufunc__nbinom_skewness_types[5] = <char>NPY_DOUBLE
+ufunc__nbinom_skewness_ptr[2] = <void*>scipy.special._ufuncs_cxx._export_nbinom_skewness_double
+ufunc__nbinom_skewness_ptr[3] = <void*>(<char*>"_nbinom_skewness")
+ufunc__nbinom_skewness_data[1] = &ufunc__nbinom_skewness_ptr[2]
+_nbinom_skewness = np.PyUFunc_FromFuncAndData(ufunc__nbinom_skewness_loops, ufunc__nbinom_skewness_data, ufunc__nbinom_skewness_types, 2, 2, 1, 0, '_nbinom_skewness', ufunc__nbinom_skewness_doc, 0)
+
+cdef char *ufunc__nbinom_variance_doc = (
+    "_nbinom_variance(r, p)\n"
+    "\n"
+    "Variance of negative binomial distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "r : array_like\n"
+    "    Positive, integer-valued parameter\n"
+    "p : array_like\n"
+    "    Positive, real-valued parameter\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+cdef np.PyUFuncGenericFunction ufunc__nbinom_variance_loops[2]  # one inner loop per type signature
+cdef char ufunc__nbinom_variance_types[6]  # 2 signatures x (2 inputs + 1 output)
+cdef void *ufunc__nbinom_variance_ptr[4]  # 2 x (kernel pointer, name string)
+cdef void *ufunc__nbinom_variance_data[2]  # per-loop pointer into the ptr table
+ufunc__nbinom_variance_loops[0] = <np.PyUFuncGenericFunction>loop_f_ff__As_ff_f  # signature 0: float
+ufunc__nbinom_variance_types[0] = <char>NPY_FLOAT
+ufunc__nbinom_variance_types[1] = <char>NPY_FLOAT
+ufunc__nbinom_variance_types[2] = <char>NPY_FLOAT
+ufunc__nbinom_variance_ptr[0] = <void*>scipy.special._ufuncs_cxx._export_nbinom_variance_float
+ufunc__nbinom_variance_ptr[1] = <void*>(<char*>"_nbinom_variance")
+ufunc__nbinom_variance_data[0] = &ufunc__nbinom_variance_ptr[0]
+ufunc__nbinom_variance_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d  # signature 1: double
+ufunc__nbinom_variance_types[3] = <char>NPY_DOUBLE
+ufunc__nbinom_variance_types[4] = <char>NPY_DOUBLE
+ufunc__nbinom_variance_types[5] = <char>NPY_DOUBLE
+ufunc__nbinom_variance_ptr[2] = <void*>scipy.special._ufuncs_cxx._export_nbinom_variance_double
+ufunc__nbinom_variance_ptr[3] = <void*>(<char*>"_nbinom_variance")
+ufunc__nbinom_variance_data[1] = &ufunc__nbinom_variance_ptr[2]
+_nbinom_variance = np.PyUFunc_FromFuncAndData(ufunc__nbinom_variance_loops, ufunc__nbinom_variance_data, ufunc__nbinom_variance_types, 2, 2, 1, 0, '_nbinom_variance', ufunc__nbinom_variance_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc__ncf_isf_loops[2]
+cdef void *ufunc__ncf_isf_ptr[4]
+cdef void *ufunc__ncf_isf_data[2]
+cdef char ufunc__ncf_isf_types[10]
+cdef char *ufunc__ncf_isf_doc = (
+    "_ncf_isf(x, v1, v2, l)\n"
+    "\n"
+    "Inverse survival function of noncentral F-distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Positive real-valued\n"
+    "v1, v2, l : array_like\n"
+    "    Positive, real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__ncf_isf_loops[0] = <np.PyUFuncGenericFunction>loop_f_ffff__As_ffff_f
+ufunc__ncf_isf_loops[1] = <np.PyUFuncGenericFunction>loop_d_dddd__As_dddd_d
+ufunc__ncf_isf_types[0] = <char>NPY_FLOAT
+ufunc__ncf_isf_types[1] = <char>NPY_FLOAT
+ufunc__ncf_isf_types[2] = <char>NPY_FLOAT
+ufunc__ncf_isf_types[3] = <char>NPY_FLOAT
+ufunc__ncf_isf_types[4] = <char>NPY_FLOAT
+ufunc__ncf_isf_types[5] = <char>NPY_DOUBLE
+ufunc__ncf_isf_types[6] = <char>NPY_DOUBLE
+ufunc__ncf_isf_types[7] = <char>NPY_DOUBLE
+ufunc__ncf_isf_types[8] = <char>NPY_DOUBLE
+ufunc__ncf_isf_types[9] = <char>NPY_DOUBLE
+ufunc__ncf_isf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_ncf_isf_float
+ufunc__ncf_isf_ptr[2*0+1] = <void*>(<char*>"_ncf_isf")
+ufunc__ncf_isf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_ncf_isf_double
+ufunc__ncf_isf_ptr[2*1+1] = <void*>(<char*>"_ncf_isf")
+ufunc__ncf_isf_data[0] = &ufunc__ncf_isf_ptr[2*0]
+ufunc__ncf_isf_data[1] = &ufunc__ncf_isf_ptr[2*1]
+_ncf_isf = np.PyUFunc_FromFuncAndData(ufunc__ncf_isf_loops, ufunc__ncf_isf_data, ufunc__ncf_isf_types, 2, 4, 1, 0, '_ncf_isf', ufunc__ncf_isf_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc__ncf_kurtosis_excess_loops[2]
+cdef void *ufunc__ncf_kurtosis_excess_ptr[4]
+cdef void *ufunc__ncf_kurtosis_excess_data[2]
+cdef char ufunc__ncf_kurtosis_excess_types[8]
+cdef char *ufunc__ncf_kurtosis_excess_doc = (
+    "_ncf_kurtosis_excess(v1, v2, l)\n"
+    "\n"
+    "Kurtosis excess of noncentral F-distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "v1, v2, l : array_like\n"
+    "    Positive, real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__ncf_kurtosis_excess_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f
+ufunc__ncf_kurtosis_excess_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc__ncf_kurtosis_excess_types[0] = <char>NPY_FLOAT
+ufunc__ncf_kurtosis_excess_types[1] = <char>NPY_FLOAT
+ufunc__ncf_kurtosis_excess_types[2] = <char>NPY_FLOAT
+ufunc__ncf_kurtosis_excess_types[3] = <char>NPY_FLOAT
+ufunc__ncf_kurtosis_excess_types[4] = <char>NPY_DOUBLE
+ufunc__ncf_kurtosis_excess_types[5] = <char>NPY_DOUBLE
+ufunc__ncf_kurtosis_excess_types[6] = <char>NPY_DOUBLE
+ufunc__ncf_kurtosis_excess_types[7] = <char>NPY_DOUBLE
+ufunc__ncf_kurtosis_excess_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_ncf_kurtosis_excess_float
+ufunc__ncf_kurtosis_excess_ptr[2*0+1] = <void*>(<char*>"_ncf_kurtosis_excess")
+ufunc__ncf_kurtosis_excess_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_ncf_kurtosis_excess_double
+ufunc__ncf_kurtosis_excess_ptr[2*1+1] = <void*>(<char*>"_ncf_kurtosis_excess")
+ufunc__ncf_kurtosis_excess_data[0] = &ufunc__ncf_kurtosis_excess_ptr[2*0]
+ufunc__ncf_kurtosis_excess_data[1] = &ufunc__ncf_kurtosis_excess_ptr[2*1]
+_ncf_kurtosis_excess = np.PyUFunc_FromFuncAndData(ufunc__ncf_kurtosis_excess_loops, ufunc__ncf_kurtosis_excess_data, ufunc__ncf_kurtosis_excess_types, 2, 3, 1, 0, '_ncf_kurtosis_excess', ufunc__ncf_kurtosis_excess_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc__ncf_mean_loops[2]
+cdef void *ufunc__ncf_mean_ptr[4]
+cdef void *ufunc__ncf_mean_data[2]
+cdef char ufunc__ncf_mean_types[8]
+cdef char *ufunc__ncf_mean_doc = (
+    "_ncf_mean(v1, v2, l)\n"
+    "\n"
+    "Mean of noncentral F-distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "v1, v2, l : array_like\n"
+    "    Positive, real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__ncf_mean_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f
+ufunc__ncf_mean_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc__ncf_mean_types[0] = <char>NPY_FLOAT
+ufunc__ncf_mean_types[1] = <char>NPY_FLOAT
+ufunc__ncf_mean_types[2] = <char>NPY_FLOAT
+ufunc__ncf_mean_types[3] = <char>NPY_FLOAT
+ufunc__ncf_mean_types[4] = <char>NPY_DOUBLE
+ufunc__ncf_mean_types[5] = <char>NPY_DOUBLE
+ufunc__ncf_mean_types[6] = <char>NPY_DOUBLE
+ufunc__ncf_mean_types[7] = <char>NPY_DOUBLE
+ufunc__ncf_mean_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_ncf_mean_float
+ufunc__ncf_mean_ptr[2*0+1] = <void*>(<char*>"_ncf_mean")
+ufunc__ncf_mean_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_ncf_mean_double
+ufunc__ncf_mean_ptr[2*1+1] = <void*>(<char*>"_ncf_mean")
+ufunc__ncf_mean_data[0] = &ufunc__ncf_mean_ptr[2*0]
+ufunc__ncf_mean_data[1] = &ufunc__ncf_mean_ptr[2*1]
+_ncf_mean = np.PyUFunc_FromFuncAndData(ufunc__ncf_mean_loops, ufunc__ncf_mean_data, ufunc__ncf_mean_types, 2, 3, 1, 0, '_ncf_mean', ufunc__ncf_mean_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc__ncf_pdf_loops[2]
+cdef void *ufunc__ncf_pdf_ptr[4]
+cdef void *ufunc__ncf_pdf_data[2]
+cdef char ufunc__ncf_pdf_types[10]
+cdef char *ufunc__ncf_pdf_doc = (
+    "_ncf_pdf(x, v1, v2, l)\n"
+    "\n"
+    "Probability density function of noncentral F-distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Positive real-valued\n"
+    "v1, v2, l : array_like\n"
+    "    Positive, real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__ncf_pdf_loops[0] = <np.PyUFuncGenericFunction>loop_f_ffff__As_ffff_f
+ufunc__ncf_pdf_loops[1] = <np.PyUFuncGenericFunction>loop_d_dddd__As_dddd_d
+ufunc__ncf_pdf_types[0] = <char>NPY_FLOAT
+ufunc__ncf_pdf_types[1] = <char>NPY_FLOAT
+ufunc__ncf_pdf_types[2] = <char>NPY_FLOAT
+ufunc__ncf_pdf_types[3] = <char>NPY_FLOAT
+ufunc__ncf_pdf_types[4] = <char>NPY_FLOAT
+ufunc__ncf_pdf_types[5] = <char>NPY_DOUBLE
+ufunc__ncf_pdf_types[6] = <char>NPY_DOUBLE
+ufunc__ncf_pdf_types[7] = <char>NPY_DOUBLE
+ufunc__ncf_pdf_types[8] = <char>NPY_DOUBLE
+ufunc__ncf_pdf_types[9] = <char>NPY_DOUBLE
+ufunc__ncf_pdf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_ncf_pdf_float
+ufunc__ncf_pdf_ptr[2*0+1] = <void*>(<char*>"_ncf_pdf")
+ufunc__ncf_pdf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_ncf_pdf_double
+ufunc__ncf_pdf_ptr[2*1+1] = <void*>(<char*>"_ncf_pdf")
+ufunc__ncf_pdf_data[0] = &ufunc__ncf_pdf_ptr[2*0]
+ufunc__ncf_pdf_data[1] = &ufunc__ncf_pdf_ptr[2*1]
+_ncf_pdf = np.PyUFunc_FromFuncAndData(ufunc__ncf_pdf_loops, ufunc__ncf_pdf_data, ufunc__ncf_pdf_types, 2, 4, 1, 0, '_ncf_pdf', ufunc__ncf_pdf_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc__ncf_sf_loops[2]
+cdef void *ufunc__ncf_sf_ptr[4]
+cdef void *ufunc__ncf_sf_data[2]
+cdef char ufunc__ncf_sf_types[10]
+cdef char *ufunc__ncf_sf_doc = (
+    "_ncf_sf(x, v1, v2, l)\n"
+    "\n"
+    "Survival function of noncentral F-distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Positive real-valued\n"
+    "v1, v2, l : array_like\n"
+    "    Positive, real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__ncf_sf_loops[0] = <np.PyUFuncGenericFunction>loop_f_ffff__As_ffff_f
+ufunc__ncf_sf_loops[1] = <np.PyUFuncGenericFunction>loop_d_dddd__As_dddd_d
+ufunc__ncf_sf_types[0] = <char>NPY_FLOAT
+ufunc__ncf_sf_types[1] = <char>NPY_FLOAT
+ufunc__ncf_sf_types[2] = <char>NPY_FLOAT
+ufunc__ncf_sf_types[3] = <char>NPY_FLOAT
+ufunc__ncf_sf_types[4] = <char>NPY_FLOAT
+ufunc__ncf_sf_types[5] = <char>NPY_DOUBLE
+ufunc__ncf_sf_types[6] = <char>NPY_DOUBLE
+ufunc__ncf_sf_types[7] = <char>NPY_DOUBLE
+ufunc__ncf_sf_types[8] = <char>NPY_DOUBLE
+ufunc__ncf_sf_types[9] = <char>NPY_DOUBLE
+ufunc__ncf_sf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_ncf_sf_float
+ufunc__ncf_sf_ptr[2*0+1] = <void*>(<char*>"_ncf_sf")
+ufunc__ncf_sf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_ncf_sf_double
+ufunc__ncf_sf_ptr[2*1+1] = <void*>(<char*>"_ncf_sf")
+ufunc__ncf_sf_data[0] = &ufunc__ncf_sf_ptr[2*0]
+ufunc__ncf_sf_data[1] = &ufunc__ncf_sf_ptr[2*1]
+_ncf_sf = np.PyUFunc_FromFuncAndData(ufunc__ncf_sf_loops, ufunc__ncf_sf_data, ufunc__ncf_sf_types, 2, 4, 1, 0, '_ncf_sf', ufunc__ncf_sf_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc__ncf_skewness_loops[2]
+cdef void *ufunc__ncf_skewness_ptr[4]
+cdef void *ufunc__ncf_skewness_data[2]
+cdef char ufunc__ncf_skewness_types[8]
+cdef char *ufunc__ncf_skewness_doc = (
+    "_ncf_skewness(v1, v2, l)\n"
+    "\n"
+    "Skewness of noncentral F-distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "v1, v2, l : array_like\n"
+    "    Positive, real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__ncf_skewness_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f
+ufunc__ncf_skewness_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc__ncf_skewness_types[0] = <char>NPY_FLOAT
+ufunc__ncf_skewness_types[1] = <char>NPY_FLOAT
+ufunc__ncf_skewness_types[2] = <char>NPY_FLOAT
+ufunc__ncf_skewness_types[3] = <char>NPY_FLOAT
+ufunc__ncf_skewness_types[4] = <char>NPY_DOUBLE
+ufunc__ncf_skewness_types[5] = <char>NPY_DOUBLE
+ufunc__ncf_skewness_types[6] = <char>NPY_DOUBLE
+ufunc__ncf_skewness_types[7] = <char>NPY_DOUBLE
+ufunc__ncf_skewness_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_ncf_skewness_float
+ufunc__ncf_skewness_ptr[2*0+1] = <void*>(<char*>"_ncf_skewness")
+ufunc__ncf_skewness_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_ncf_skewness_double
+ufunc__ncf_skewness_ptr[2*1+1] = <void*>(<char*>"_ncf_skewness")
+ufunc__ncf_skewness_data[0] = &ufunc__ncf_skewness_ptr[2*0]
+ufunc__ncf_skewness_data[1] = &ufunc__ncf_skewness_ptr[2*1]
+_ncf_skewness = np.PyUFunc_FromFuncAndData(ufunc__ncf_skewness_loops, ufunc__ncf_skewness_data, ufunc__ncf_skewness_types, 2, 3, 1, 0, '_ncf_skewness', ufunc__ncf_skewness_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc__ncf_variance_loops[2]
+cdef void *ufunc__ncf_variance_ptr[4]
+cdef void *ufunc__ncf_variance_data[2]
+cdef char ufunc__ncf_variance_types[8]
+cdef char *ufunc__ncf_variance_doc = (
+    "_ncf_variance(v1, v2, l)\n"
+    "\n"
+    "Variance of noncentral F-distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "v1, v2, l : array_like\n"
+    "    Positive, real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__ncf_variance_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f
+ufunc__ncf_variance_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc__ncf_variance_types[0] = <char>NPY_FLOAT
+ufunc__ncf_variance_types[1] = <char>NPY_FLOAT
+ufunc__ncf_variance_types[2] = <char>NPY_FLOAT
+ufunc__ncf_variance_types[3] = <char>NPY_FLOAT
+ufunc__ncf_variance_types[4] = <char>NPY_DOUBLE
+ufunc__ncf_variance_types[5] = <char>NPY_DOUBLE
+ufunc__ncf_variance_types[6] = <char>NPY_DOUBLE
+ufunc__ncf_variance_types[7] = <char>NPY_DOUBLE
+ufunc__ncf_variance_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_ncf_variance_float
+ufunc__ncf_variance_ptr[2*0+1] = <void*>(<char*>"_ncf_variance")
+ufunc__ncf_variance_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_ncf_variance_double
+ufunc__ncf_variance_ptr[2*1+1] = <void*>(<char*>"_ncf_variance")
+ufunc__ncf_variance_data[0] = &ufunc__ncf_variance_ptr[2*0]
+ufunc__ncf_variance_data[1] = &ufunc__ncf_variance_ptr[2*1]
+_ncf_variance = np.PyUFunc_FromFuncAndData(ufunc__ncf_variance_loops, ufunc__ncf_variance_data, ufunc__ncf_variance_types, 2, 3, 1, 0, '_ncf_variance', ufunc__ncf_variance_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc__nct_isf_loops[2]
+cdef void *ufunc__nct_isf_ptr[4]
+cdef void *ufunc__nct_isf_data[2]
+cdef char ufunc__nct_isf_types[8]
+cdef char *ufunc__nct_isf_doc = (
+    "_nct_isf(x, v, l)\n"
+    "\n"
+    "Inverse survival function of noncentral t-distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued\n"
+    "v : array_like\n"
+    "    Positive, real-valued parameters\n"
+    "l : array_like\n"
+    "    Real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__nct_isf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f
+ufunc__nct_isf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc__nct_isf_types[0] = <char>NPY_FLOAT
+ufunc__nct_isf_types[1] = <char>NPY_FLOAT
+ufunc__nct_isf_types[2] = <char>NPY_FLOAT
+ufunc__nct_isf_types[3] = <char>NPY_FLOAT
+ufunc__nct_isf_types[4] = <char>NPY_DOUBLE
+ufunc__nct_isf_types[5] = <char>NPY_DOUBLE
+ufunc__nct_isf_types[6] = <char>NPY_DOUBLE
+ufunc__nct_isf_types[7] = <char>NPY_DOUBLE
+ufunc__nct_isf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_nct_isf_float
+ufunc__nct_isf_ptr[2*0+1] = <void*>(<char*>"_nct_isf")
+ufunc__nct_isf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_nct_isf_double
+ufunc__nct_isf_ptr[2*1+1] = <void*>(<char*>"_nct_isf")
+ufunc__nct_isf_data[0] = &ufunc__nct_isf_ptr[2*0]
+ufunc__nct_isf_data[1] = &ufunc__nct_isf_ptr[2*1]
+_nct_isf = np.PyUFunc_FromFuncAndData(ufunc__nct_isf_loops, ufunc__nct_isf_data, ufunc__nct_isf_types, 2, 3, 1, 0, '_nct_isf', ufunc__nct_isf_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc__nct_kurtosis_excess_loops[2]
+cdef void *ufunc__nct_kurtosis_excess_ptr[4]
+cdef void *ufunc__nct_kurtosis_excess_data[2]
+cdef char ufunc__nct_kurtosis_excess_types[6]
+cdef char *ufunc__nct_kurtosis_excess_doc = (
+    "_nct_kurtosis_excess(v, l)\n"
+    "\n"
+    "Kurtosis excess of noncentral t-distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "v : array_like\n"
+    "    Positive, real-valued parameters\n"
+    "l : array_like\n"
+    "    Real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__nct_kurtosis_excess_loops[0] = <np.PyUFuncGenericFunction>loop_f_ff__As_ff_f
+ufunc__nct_kurtosis_excess_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc__nct_kurtosis_excess_types[0] = <char>NPY_FLOAT
+ufunc__nct_kurtosis_excess_types[1] = <char>NPY_FLOAT
+ufunc__nct_kurtosis_excess_types[2] = <char>NPY_FLOAT
+ufunc__nct_kurtosis_excess_types[3] = <char>NPY_DOUBLE
+ufunc__nct_kurtosis_excess_types[4] = <char>NPY_DOUBLE
+ufunc__nct_kurtosis_excess_types[5] = <char>NPY_DOUBLE
+ufunc__nct_kurtosis_excess_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_nct_kurtosis_excess_float
+ufunc__nct_kurtosis_excess_ptr[2*0+1] = <void*>(<char*>"_nct_kurtosis_excess")
+ufunc__nct_kurtosis_excess_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_nct_kurtosis_excess_double
+ufunc__nct_kurtosis_excess_ptr[2*1+1] = <void*>(<char*>"_nct_kurtosis_excess")
+ufunc__nct_kurtosis_excess_data[0] = &ufunc__nct_kurtosis_excess_ptr[2*0]
+ufunc__nct_kurtosis_excess_data[1] = &ufunc__nct_kurtosis_excess_ptr[2*1]
+_nct_kurtosis_excess = np.PyUFunc_FromFuncAndData(ufunc__nct_kurtosis_excess_loops, ufunc__nct_kurtosis_excess_data, ufunc__nct_kurtosis_excess_types, 2, 2, 1, 0, '_nct_kurtosis_excess', ufunc__nct_kurtosis_excess_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc__nct_mean_loops[2]
+cdef void *ufunc__nct_mean_ptr[4]
+cdef void *ufunc__nct_mean_data[2]
+cdef char ufunc__nct_mean_types[6]
+cdef char *ufunc__nct_mean_doc = (
+    "_nct_mean(v, l)\n"
+    "\n"
+    "Mean of noncentral t-distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "v : array_like\n"
+    "    Positive, real-valued parameters\n"
+    "l : array_like\n"
+    "    Real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__nct_mean_loops[0] = <np.PyUFuncGenericFunction>loop_f_ff__As_ff_f
+ufunc__nct_mean_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc__nct_mean_types[0] = <char>NPY_FLOAT
+ufunc__nct_mean_types[1] = <char>NPY_FLOAT
+ufunc__nct_mean_types[2] = <char>NPY_FLOAT
+ufunc__nct_mean_types[3] = <char>NPY_DOUBLE
+ufunc__nct_mean_types[4] = <char>NPY_DOUBLE
+ufunc__nct_mean_types[5] = <char>NPY_DOUBLE
+ufunc__nct_mean_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_nct_mean_float
+ufunc__nct_mean_ptr[2*0+1] = <void*>(<char*>"_nct_mean")
+ufunc__nct_mean_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_nct_mean_double
+ufunc__nct_mean_ptr[2*1+1] = <void*>(<char*>"_nct_mean")
+ufunc__nct_mean_data[0] = &ufunc__nct_mean_ptr[2*0]
+ufunc__nct_mean_data[1] = &ufunc__nct_mean_ptr[2*1]
+_nct_mean = np.PyUFunc_FromFuncAndData(ufunc__nct_mean_loops, ufunc__nct_mean_data, ufunc__nct_mean_types, 2, 2, 1, 0, '_nct_mean', ufunc__nct_mean_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc__nct_pdf_loops[2]
+cdef void *ufunc__nct_pdf_ptr[4]
+cdef void *ufunc__nct_pdf_data[2]
+cdef char ufunc__nct_pdf_types[8]
+cdef char *ufunc__nct_pdf_doc = (
+    "_nct_pdf(x, v, l)\n"
+    "\n"
+    "Probability density function of noncentral t-distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued\n"
+    "v : array_like\n"
+    "    Positive, real-valued parameters\n"
+    "l : array_like\n"
+    "    Real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__nct_pdf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f
+ufunc__nct_pdf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc__nct_pdf_types[0] = <char>NPY_FLOAT
+ufunc__nct_pdf_types[1] = <char>NPY_FLOAT
+ufunc__nct_pdf_types[2] = <char>NPY_FLOAT
+ufunc__nct_pdf_types[3] = <char>NPY_FLOAT
+ufunc__nct_pdf_types[4] = <char>NPY_DOUBLE
+ufunc__nct_pdf_types[5] = <char>NPY_DOUBLE
+ufunc__nct_pdf_types[6] = <char>NPY_DOUBLE
+ufunc__nct_pdf_types[7] = <char>NPY_DOUBLE
+ufunc__nct_pdf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_nct_pdf_float
+ufunc__nct_pdf_ptr[2*0+1] = <void*>(<char*>"_nct_pdf")
+ufunc__nct_pdf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_nct_pdf_double
+ufunc__nct_pdf_ptr[2*1+1] = <void*>(<char*>"_nct_pdf")
+ufunc__nct_pdf_data[0] = &ufunc__nct_pdf_ptr[2*0]
+ufunc__nct_pdf_data[1] = &ufunc__nct_pdf_ptr[2*1]
+_nct_pdf = np.PyUFunc_FromFuncAndData(ufunc__nct_pdf_loops, ufunc__nct_pdf_data, ufunc__nct_pdf_types, 2, 3, 1, 0, '_nct_pdf', ufunc__nct_pdf_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc__nct_sf_loops[2]
+cdef void *ufunc__nct_sf_ptr[4]
+cdef void *ufunc__nct_sf_data[2]
+cdef char ufunc__nct_sf_types[8]
+cdef char *ufunc__nct_sf_doc = (
+    "_nct_sf(x, v, l)\n"
+    "\n"
+    "Survival function of noncentral t-distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued\n"
+    "v : array_like\n"
+    "    Positive, real-valued parameters\n"
+    "l : array_like\n"
+    "    Real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__nct_sf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f
+ufunc__nct_sf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc__nct_sf_types[0] = <char>NPY_FLOAT
+ufunc__nct_sf_types[1] = <char>NPY_FLOAT
+ufunc__nct_sf_types[2] = <char>NPY_FLOAT
+ufunc__nct_sf_types[3] = <char>NPY_FLOAT
+ufunc__nct_sf_types[4] = <char>NPY_DOUBLE
+ufunc__nct_sf_types[5] = <char>NPY_DOUBLE
+ufunc__nct_sf_types[6] = <char>NPY_DOUBLE
+ufunc__nct_sf_types[7] = <char>NPY_DOUBLE
+ufunc__nct_sf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_nct_sf_float
+ufunc__nct_sf_ptr[2*0+1] = <void*>(<char*>"_nct_sf")
+ufunc__nct_sf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_nct_sf_double
+ufunc__nct_sf_ptr[2*1+1] = <void*>(<char*>"_nct_sf")
+ufunc__nct_sf_data[0] = &ufunc__nct_sf_ptr[2*0]
+ufunc__nct_sf_data[1] = &ufunc__nct_sf_ptr[2*1]
+_nct_sf = np.PyUFunc_FromFuncAndData(ufunc__nct_sf_loops, ufunc__nct_sf_data, ufunc__nct_sf_types, 2, 3, 1, 0, '_nct_sf', ufunc__nct_sf_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc__nct_skewness_loops[2]
+cdef void *ufunc__nct_skewness_ptr[4]
+cdef void *ufunc__nct_skewness_data[2]
+cdef char ufunc__nct_skewness_types[6]
+cdef char *ufunc__nct_skewness_doc = (
+    "_nct_skewness(v, l)\n"
+    "\n"
+    "Skewness of noncentral t-distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "v : array_like\n"
+    "    Positive, real-valued parameters\n"
+    "l : array_like\n"
+    "    Real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__nct_skewness_loops[0] = <np.PyUFuncGenericFunction>loop_f_ff__As_ff_f
+ufunc__nct_skewness_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc__nct_skewness_types[0] = <char>NPY_FLOAT
+ufunc__nct_skewness_types[1] = <char>NPY_FLOAT
+ufunc__nct_skewness_types[2] = <char>NPY_FLOAT
+ufunc__nct_skewness_types[3] = <char>NPY_DOUBLE
+ufunc__nct_skewness_types[4] = <char>NPY_DOUBLE
+ufunc__nct_skewness_types[5] = <char>NPY_DOUBLE
+ufunc__nct_skewness_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_nct_skewness_float
+ufunc__nct_skewness_ptr[2*0+1] = <void*>(<char*>"_nct_skewness")
+ufunc__nct_skewness_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_nct_skewness_double
+ufunc__nct_skewness_ptr[2*1+1] = <void*>(<char*>"_nct_skewness")
+ufunc__nct_skewness_data[0] = &ufunc__nct_skewness_ptr[2*0]
+ufunc__nct_skewness_data[1] = &ufunc__nct_skewness_ptr[2*1]
+_nct_skewness = np.PyUFunc_FromFuncAndData(ufunc__nct_skewness_loops, ufunc__nct_skewness_data, ufunc__nct_skewness_types, 2, 2, 1, 0, '_nct_skewness', ufunc__nct_skewness_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc__nct_variance_loops[2]
+cdef void *ufunc__nct_variance_ptr[4]
+cdef void *ufunc__nct_variance_data[2]
+cdef char ufunc__nct_variance_types[6]
+cdef char *ufunc__nct_variance_doc = (
+    "_nct_variance(v, l)\n"
+    "\n"
+    "Variance of noncentral t-distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "v : array_like\n"
+    "    Positive, real-valued parameters\n"
+    "l : array_like\n"
+    "    Real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__nct_variance_loops[0] = <np.PyUFuncGenericFunction>loop_f_ff__As_ff_f
+ufunc__nct_variance_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc__nct_variance_types[0] = <char>NPY_FLOAT
+ufunc__nct_variance_types[1] = <char>NPY_FLOAT
+ufunc__nct_variance_types[2] = <char>NPY_FLOAT
+ufunc__nct_variance_types[3] = <char>NPY_DOUBLE
+ufunc__nct_variance_types[4] = <char>NPY_DOUBLE
+ufunc__nct_variance_types[5] = <char>NPY_DOUBLE
+ufunc__nct_variance_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_nct_variance_float
+ufunc__nct_variance_ptr[2*0+1] = <void*>(<char*>"_nct_variance")
+ufunc__nct_variance_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_nct_variance_double
+ufunc__nct_variance_ptr[2*1+1] = <void*>(<char*>"_nct_variance")
+ufunc__nct_variance_data[0] = &ufunc__nct_variance_ptr[2*0]
+ufunc__nct_variance_data[1] = &ufunc__nct_variance_ptr[2*1]
+_nct_variance = np.PyUFunc_FromFuncAndData(ufunc__nct_variance_loops, ufunc__nct_variance_data, ufunc__nct_variance_types, 2, 2, 1, 0, '_nct_variance', ufunc__nct_variance_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc__ncx2_isf_loops[2]
+cdef void *ufunc__ncx2_isf_ptr[4]
+cdef void *ufunc__ncx2_isf_data[2]
+cdef char ufunc__ncx2_isf_types[8]
+cdef char *ufunc__ncx2_isf_doc = (
+    "_ncx2_isf(x, k, l)\n"
+    "\n"
+    "Inverse survival function of Non-central chi-squared distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Positive real-valued\n"
+    "k, l : array_like\n"
+    "    Positive, real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__ncx2_isf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f
+ufunc__ncx2_isf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc__ncx2_isf_types[0] = <char>NPY_FLOAT
+ufunc__ncx2_isf_types[1] = <char>NPY_FLOAT
+ufunc__ncx2_isf_types[2] = <char>NPY_FLOAT
+ufunc__ncx2_isf_types[3] = <char>NPY_FLOAT
+ufunc__ncx2_isf_types[4] = <char>NPY_DOUBLE
+ufunc__ncx2_isf_types[5] = <char>NPY_DOUBLE
+ufunc__ncx2_isf_types[6] = <char>NPY_DOUBLE
+ufunc__ncx2_isf_types[7] = <char>NPY_DOUBLE
+ufunc__ncx2_isf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_ncx2_isf_float
+ufunc__ncx2_isf_ptr[2*0+1] = <void*>(<char*>"_ncx2_isf")
+ufunc__ncx2_isf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_ncx2_isf_double
+ufunc__ncx2_isf_ptr[2*1+1] = <void*>(<char*>"_ncx2_isf")
+ufunc__ncx2_isf_data[0] = &ufunc__ncx2_isf_ptr[2*0]
+ufunc__ncx2_isf_data[1] = &ufunc__ncx2_isf_ptr[2*1]
+_ncx2_isf = np.PyUFunc_FromFuncAndData(ufunc__ncx2_isf_loops, ufunc__ncx2_isf_data, ufunc__ncx2_isf_types, 2, 3, 1, 0, '_ncx2_isf', ufunc__ncx2_isf_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc__ncx2_pdf_loops[2]
+cdef void *ufunc__ncx2_pdf_ptr[4]
+cdef void *ufunc__ncx2_pdf_data[2]
+cdef char ufunc__ncx2_pdf_types[8]
+cdef char *ufunc__ncx2_pdf_doc = (
+    "_ncx2_pdf(x, k, l)\n"
+    "\n"
+    "Probability density function of Non-central chi-squared distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Positive real-valued\n"
+    "k, l : array_like\n"
+    "    Positive, real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__ncx2_pdf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f
+ufunc__ncx2_pdf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc__ncx2_pdf_types[0] = <char>NPY_FLOAT
+ufunc__ncx2_pdf_types[1] = <char>NPY_FLOAT
+ufunc__ncx2_pdf_types[2] = <char>NPY_FLOAT
+ufunc__ncx2_pdf_types[3] = <char>NPY_FLOAT
+ufunc__ncx2_pdf_types[4] = <char>NPY_DOUBLE
+ufunc__ncx2_pdf_types[5] = <char>NPY_DOUBLE
+ufunc__ncx2_pdf_types[6] = <char>NPY_DOUBLE
+ufunc__ncx2_pdf_types[7] = <char>NPY_DOUBLE
+ufunc__ncx2_pdf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_ncx2_pdf_float
+ufunc__ncx2_pdf_ptr[2*0+1] = <void*>(<char*>"_ncx2_pdf")
+ufunc__ncx2_pdf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_ncx2_pdf_double
+ufunc__ncx2_pdf_ptr[2*1+1] = <void*>(<char*>"_ncx2_pdf")
+ufunc__ncx2_pdf_data[0] = &ufunc__ncx2_pdf_ptr[2*0]
+ufunc__ncx2_pdf_data[1] = &ufunc__ncx2_pdf_ptr[2*1]
+_ncx2_pdf = np.PyUFunc_FromFuncAndData(ufunc__ncx2_pdf_loops, ufunc__ncx2_pdf_data, ufunc__ncx2_pdf_types, 2, 3, 1, 0, '_ncx2_pdf', ufunc__ncx2_pdf_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc__ncx2_sf_loops[2]
+cdef void *ufunc__ncx2_sf_ptr[4]
+cdef void *ufunc__ncx2_sf_data[2]
+cdef char ufunc__ncx2_sf_types[8]
+cdef char *ufunc__ncx2_sf_doc = (
+    "_ncx2_sf(x, k, l)\n"
+    "\n"
+    "Survival function of Non-central chi-squared distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Positive real-valued\n"
+    "k, l : array_like\n"
+    "    Positive, real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__ncx2_sf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f
+ufunc__ncx2_sf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc__ncx2_sf_types[0] = <char>NPY_FLOAT
+ufunc__ncx2_sf_types[1] = <char>NPY_FLOAT
+ufunc__ncx2_sf_types[2] = <char>NPY_FLOAT
+ufunc__ncx2_sf_types[3] = <char>NPY_FLOAT
+ufunc__ncx2_sf_types[4] = <char>NPY_DOUBLE
+ufunc__ncx2_sf_types[5] = <char>NPY_DOUBLE
+ufunc__ncx2_sf_types[6] = <char>NPY_DOUBLE
+ufunc__ncx2_sf_types[7] = <char>NPY_DOUBLE
+ufunc__ncx2_sf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_ncx2_sf_float
+ufunc__ncx2_sf_ptr[2*0+1] = <void*>(<char*>"_ncx2_sf")
+ufunc__ncx2_sf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_ncx2_sf_double
+ufunc__ncx2_sf_ptr[2*1+1] = <void*>(<char*>"_ncx2_sf")
+ufunc__ncx2_sf_data[0] = &ufunc__ncx2_sf_ptr[2*0]
+ufunc__ncx2_sf_data[1] = &ufunc__ncx2_sf_ptr[2*1]
+_ncx2_sf = np.PyUFunc_FromFuncAndData(ufunc__ncx2_sf_loops, ufunc__ncx2_sf_data, ufunc__ncx2_sf_types, 2, 3, 1, 0, '_ncx2_sf', ufunc__ncx2_sf_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc__sf_error_test_function_loops[1]
+cdef void *ufunc__sf_error_test_function_ptr[2]
+cdef void *ufunc__sf_error_test_function_data[1]
+cdef char ufunc__sf_error_test_function_types[2]
+cdef char *ufunc__sf_error_test_function_doc = (
+    "Private function; do not use.")
+ufunc__sf_error_test_function_loops[0] = <np.PyUFuncGenericFunction>loop_i_i__As_l_l
+ufunc__sf_error_test_function_types[0] = <char>NPY_LONG
+ufunc__sf_error_test_function_types[1] = <char>NPY_LONG
+ufunc__sf_error_test_function_ptr[2*0] = <void*>_func__sf_error_test_function
+ufunc__sf_error_test_function_ptr[2*0+1] = <void*>(<char*>"_sf_error_test_function")
+ufunc__sf_error_test_function_data[0] = &ufunc__sf_error_test_function_ptr[2*0]
+_sf_error_test_function = np.PyUFunc_FromFuncAndData(ufunc__sf_error_test_function_loops, ufunc__sf_error_test_function_data, ufunc__sf_error_test_function_types, 1, 1, 1, 0, '_sf_error_test_function', ufunc__sf_error_test_function_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc__skewnorm_cdf_loops[2]
+cdef void *ufunc__skewnorm_cdf_ptr[4]
+cdef void *ufunc__skewnorm_cdf_data[2]
+cdef char ufunc__skewnorm_cdf_types[10]
+cdef char *ufunc__skewnorm_cdf_doc = (
+    "_skewnorm_cdf(x, l, sc, sh)\n"
+    "\n"
+    "Cumulative density function of skewnorm distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued\n"
+    "l : array_like\n"
+    "    Real-valued parameters\n"
+    "sc : array_like\n"
+    "    Positive, Real-valued parameters\n"
+    "sh : array_like\n"
+    "    Real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__skewnorm_cdf_loops[0] = <np.PyUFuncGenericFunction>loop_f_ffff__As_ffff_f
+ufunc__skewnorm_cdf_loops[1] = <np.PyUFuncGenericFunction>loop_d_dddd__As_dddd_d
+ufunc__skewnorm_cdf_types[0] = <char>NPY_FLOAT
+ufunc__skewnorm_cdf_types[1] = <char>NPY_FLOAT
+ufunc__skewnorm_cdf_types[2] = <char>NPY_FLOAT
+ufunc__skewnorm_cdf_types[3] = <char>NPY_FLOAT
+ufunc__skewnorm_cdf_types[4] = <char>NPY_FLOAT
+ufunc__skewnorm_cdf_types[5] = <char>NPY_DOUBLE
+ufunc__skewnorm_cdf_types[6] = <char>NPY_DOUBLE
+ufunc__skewnorm_cdf_types[7] = <char>NPY_DOUBLE
+ufunc__skewnorm_cdf_types[8] = <char>NPY_DOUBLE
+ufunc__skewnorm_cdf_types[9] = <char>NPY_DOUBLE
+ufunc__skewnorm_cdf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_skewnorm_cdf_float
+ufunc__skewnorm_cdf_ptr[2*0+1] = <void*>(<char*>"_skewnorm_cdf")
+ufunc__skewnorm_cdf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_skewnorm_cdf_double
+ufunc__skewnorm_cdf_ptr[2*1+1] = <void*>(<char*>"_skewnorm_cdf")
+ufunc__skewnorm_cdf_data[0] = &ufunc__skewnorm_cdf_ptr[2*0]
+ufunc__skewnorm_cdf_data[1] = &ufunc__skewnorm_cdf_ptr[2*1]
+_skewnorm_cdf = np.PyUFunc_FromFuncAndData(ufunc__skewnorm_cdf_loops, ufunc__skewnorm_cdf_data, ufunc__skewnorm_cdf_types, 2, 4, 1, 0, '_skewnorm_cdf', ufunc__skewnorm_cdf_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc__skewnorm_isf_loops[2]  # inner loops of `_skewnorm_isf`: [0] NPY_FLOAT, [1] NPY_DOUBLE
+cdef void *ufunc__skewnorm_isf_ptr[4]  # per loop: (kernel function pointer, ufunc name string)
+cdef void *ufunc__skewnorm_isf_data[2]  # per loop: address of that loop's pair in ufunc__skewnorm_isf_ptr
+cdef char ufunc__skewnorm_isf_types[10]  # 2 loops x (4 inputs + 1 output) NumPy type codes
+cdef char *ufunc__skewnorm_isf_doc = (
+    "_skewnorm_isf(x, l, sc, sh)\n"
+    "\n"
+    "Inverse survival function of skewnorm distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued\n"
+    "l : array_like\n"
+    "    Real-valued parameters\n"
+    "sc : array_like\n"
+    "    Positive, Real-valued parameters\n"
+    "sh : array_like\n"
+    "    Real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__skewnorm_isf_loops[0] = <np.PyUFuncGenericFunction>loop_f_ffff__As_ffff_f
+ufunc__skewnorm_isf_loops[1] = <np.PyUFuncGenericFunction>loop_d_dddd__As_dddd_d
+ufunc__skewnorm_isf_types[0] = <char>NPY_FLOAT  # loop 0: entries 0-4
+ufunc__skewnorm_isf_types[1] = <char>NPY_FLOAT
+ufunc__skewnorm_isf_types[2] = <char>NPY_FLOAT
+ufunc__skewnorm_isf_types[3] = <char>NPY_FLOAT
+ufunc__skewnorm_isf_types[4] = <char>NPY_FLOAT
+ufunc__skewnorm_isf_types[5] = <char>NPY_DOUBLE  # loop 1: entries 5-9
+ufunc__skewnorm_isf_types[6] = <char>NPY_DOUBLE
+ufunc__skewnorm_isf_types[7] = <char>NPY_DOUBLE
+ufunc__skewnorm_isf_types[8] = <char>NPY_DOUBLE
+ufunc__skewnorm_isf_types[9] = <char>NPY_DOUBLE
+ufunc__skewnorm_isf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_skewnorm_isf_float  # separate float kernel
+ufunc__skewnorm_isf_ptr[2*0+1] = <void*>(<char*>"_skewnorm_isf")
+ufunc__skewnorm_isf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_skewnorm_isf_double  # separate double kernel
+ufunc__skewnorm_isf_ptr[2*1+1] = <void*>(<char*>"_skewnorm_isf")
+ufunc__skewnorm_isf_data[0] = &ufunc__skewnorm_isf_ptr[2*0]
+ufunc__skewnorm_isf_data[1] = &ufunc__skewnorm_isf_ptr[2*1]
+_skewnorm_isf = np.PyUFunc_FromFuncAndData(ufunc__skewnorm_isf_loops, ufunc__skewnorm_isf_data, ufunc__skewnorm_isf_types, 2, 4, 1, 0, '_skewnorm_isf', ufunc__skewnorm_isf_doc, 0)  # ntypes=2, nin=4, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc__skewnorm_ppf_loops[2]  # inner loops of `_skewnorm_ppf`: [0] NPY_FLOAT, [1] NPY_DOUBLE
+cdef void *ufunc__skewnorm_ppf_ptr[4]  # per loop: (kernel function pointer, ufunc name string)
+cdef void *ufunc__skewnorm_ppf_data[2]  # per loop: address of that loop's pair in ufunc__skewnorm_ppf_ptr
+cdef char ufunc__skewnorm_ppf_types[10]  # 2 loops x (4 inputs + 1 output) NumPy type codes
+cdef char *ufunc__skewnorm_ppf_doc = (
+    "_skewnorm_ppf(x, l, sc, sh)\n"
+    "\n"
+    "Percent point function of skewnorm distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real-valued\n"
+    "l : array_like\n"
+    "    Real-valued parameters\n"
+    "sc : array_like\n"
+    "    Positive, Real-valued parameters\n"
+    "sh : array_like\n"
+    "    Real-valued parameters\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray")
+ufunc__skewnorm_ppf_loops[0] = <np.PyUFuncGenericFunction>loop_f_ffff__As_ffff_f
+ufunc__skewnorm_ppf_loops[1] = <np.PyUFuncGenericFunction>loop_d_dddd__As_dddd_d
+ufunc__skewnorm_ppf_types[0] = <char>NPY_FLOAT  # loop 0: entries 0-4
+ufunc__skewnorm_ppf_types[1] = <char>NPY_FLOAT
+ufunc__skewnorm_ppf_types[2] = <char>NPY_FLOAT
+ufunc__skewnorm_ppf_types[3] = <char>NPY_FLOAT
+ufunc__skewnorm_ppf_types[4] = <char>NPY_FLOAT
+ufunc__skewnorm_ppf_types[5] = <char>NPY_DOUBLE  # loop 1: entries 5-9
+ufunc__skewnorm_ppf_types[6] = <char>NPY_DOUBLE
+ufunc__skewnorm_ppf_types[7] = <char>NPY_DOUBLE
+ufunc__skewnorm_ppf_types[8] = <char>NPY_DOUBLE
+ufunc__skewnorm_ppf_types[9] = <char>NPY_DOUBLE
+ufunc__skewnorm_ppf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_skewnorm_ppf_float  # separate float kernel
+ufunc__skewnorm_ppf_ptr[2*0+1] = <void*>(<char*>"_skewnorm_ppf")
+ufunc__skewnorm_ppf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_skewnorm_ppf_double  # separate double kernel
+ufunc__skewnorm_ppf_ptr[2*1+1] = <void*>(<char*>"_skewnorm_ppf")
+ufunc__skewnorm_ppf_data[0] = &ufunc__skewnorm_ppf_ptr[2*0]
+ufunc__skewnorm_ppf_data[1] = &ufunc__skewnorm_ppf_ptr[2*1]
+_skewnorm_ppf = np.PyUFunc_FromFuncAndData(ufunc__skewnorm_ppf_loops, ufunc__skewnorm_ppf_data, ufunc__skewnorm_ppf_types, 2, 4, 1, 0, '_skewnorm_ppf', ufunc__skewnorm_ppf_doc, 0)  # ntypes=2, nin=4, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc__smirnovc_loops[3]  # inner loops of `_smirnovc`, one per type signature below
+cdef void *ufunc__smirnovc_ptr[6]  # per loop: (kernel function pointer, ufunc name string)
+cdef void *ufunc__smirnovc_data[3]  # per loop: address of that loop's pair in ufunc__smirnovc_ptr
+cdef char ufunc__smirnovc_types[9]  # 3 loops x (2 inputs + 1 output) NumPy type codes
+cdef char *ufunc__smirnovc_doc = (
+    "_smirnovc(n, d)\n"
+    " Internal function, do not use.")
+ufunc__smirnovc_loops[0] = <np.PyUFuncGenericFunction>loop_d_pd__As_pd_d
+ufunc__smirnovc_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+ufunc__smirnovc_loops[2] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc__smirnovc_types[0] = <char>NPY_INTP  # loop 0: (intp, double) -> double
+ufunc__smirnovc_types[1] = <char>NPY_DOUBLE
+ufunc__smirnovc_types[2] = <char>NPY_DOUBLE
+ufunc__smirnovc_types[3] = <char>NPY_FLOAT  # loop 1: (float, float) -> float
+ufunc__smirnovc_types[4] = <char>NPY_FLOAT
+ufunc__smirnovc_types[5] = <char>NPY_FLOAT
+ufunc__smirnovc_types[6] = <char>NPY_DOUBLE  # loop 2: (double, double) -> double
+ufunc__smirnovc_types[7] = <char>NPY_DOUBLE
+ufunc__smirnovc_types[8] = <char>NPY_DOUBLE
+ufunc__smirnovc_ptr[2*0] = <void*>_func_cephes_smirnovc_wrap  # kernel for the NPY_INTP loop
+ufunc__smirnovc_ptr[2*0+1] = <void*>(<char*>"_smirnovc")
+ufunc__smirnovc_ptr[2*1] = <void*>_func_smirnovc_unsafe  # float and double loops share the `_unsafe` kernel
+ufunc__smirnovc_ptr[2*1+1] = <void*>(<char*>"_smirnovc")
+ufunc__smirnovc_ptr[2*2] = <void*>_func_smirnovc_unsafe
+ufunc__smirnovc_ptr[2*2+1] = <void*>(<char*>"_smirnovc")
+ufunc__smirnovc_data[0] = &ufunc__smirnovc_ptr[2*0]
+ufunc__smirnovc_data[1] = &ufunc__smirnovc_ptr[2*1]
+ufunc__smirnovc_data[2] = &ufunc__smirnovc_ptr[2*2]
+_smirnovc = np.PyUFunc_FromFuncAndData(ufunc__smirnovc_loops, ufunc__smirnovc_data, ufunc__smirnovc_types, 3, 2, 1, 0, '_smirnovc', ufunc__smirnovc_doc, 0)  # ntypes=3, nin=2, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc__smirnovci_loops[3]  # inner loops of `_smirnovci`, one per type signature below
+cdef void *ufunc__smirnovci_ptr[6]  # per loop: (kernel function pointer, ufunc name string)
+cdef void *ufunc__smirnovci_data[3]  # per loop: address of that loop's pair in ufunc__smirnovci_ptr
+cdef char ufunc__smirnovci_types[9]  # 3 loops x (2 inputs + 1 output) NumPy type codes
+cdef char *ufunc__smirnovci_doc = (
+    "Internal function, do not use.")
+ufunc__smirnovci_loops[0] = <np.PyUFuncGenericFunction>loop_d_pd__As_pd_d
+ufunc__smirnovci_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+ufunc__smirnovci_loops[2] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc__smirnovci_types[0] = <char>NPY_INTP  # loop 0: (intp, double) -> double
+ufunc__smirnovci_types[1] = <char>NPY_DOUBLE
+ufunc__smirnovci_types[2] = <char>NPY_DOUBLE
+ufunc__smirnovci_types[3] = <char>NPY_FLOAT  # loop 1: (float, float) -> float
+ufunc__smirnovci_types[4] = <char>NPY_FLOAT
+ufunc__smirnovci_types[5] = <char>NPY_FLOAT
+ufunc__smirnovci_types[6] = <char>NPY_DOUBLE  # loop 2: (double, double) -> double
+ufunc__smirnovci_types[7] = <char>NPY_DOUBLE
+ufunc__smirnovci_types[8] = <char>NPY_DOUBLE
+ufunc__smirnovci_ptr[2*0] = <void*>_func_cephes_smirnovci_wrap  # kernel for the NPY_INTP loop
+ufunc__smirnovci_ptr[2*0+1] = <void*>(<char*>"_smirnovci")
+ufunc__smirnovci_ptr[2*1] = <void*>_func_smirnovci_unsafe  # float and double loops share the `_unsafe` kernel
+ufunc__smirnovci_ptr[2*1+1] = <void*>(<char*>"_smirnovci")
+ufunc__smirnovci_ptr[2*2] = <void*>_func_smirnovci_unsafe
+ufunc__smirnovci_ptr[2*2+1] = <void*>(<char*>"_smirnovci")
+ufunc__smirnovci_data[0] = &ufunc__smirnovci_ptr[2*0]
+ufunc__smirnovci_data[1] = &ufunc__smirnovci_ptr[2*1]
+ufunc__smirnovci_data[2] = &ufunc__smirnovci_ptr[2*2]
+_smirnovci = np.PyUFunc_FromFuncAndData(ufunc__smirnovci_loops, ufunc__smirnovci_data, ufunc__smirnovci_types, 3, 2, 1, 0, '_smirnovci', ufunc__smirnovci_doc, 0)  # ntypes=3, nin=2, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc__smirnovp_loops[3]  # inner loops of `_smirnovp`, one per type signature below
+cdef void *ufunc__smirnovp_ptr[6]  # per loop: (kernel function pointer, ufunc name string)
+cdef void *ufunc__smirnovp_data[3]  # per loop: address of that loop's pair in ufunc__smirnovp_ptr
+cdef char ufunc__smirnovp_types[9]  # 3 loops x (2 inputs + 1 output) NumPy type codes
+cdef char *ufunc__smirnovp_doc = (
+    "_smirnovp(n, p)\n"
+    " Internal function, do not use.")
+ufunc__smirnovp_loops[0] = <np.PyUFuncGenericFunction>loop_d_pd__As_pd_d
+ufunc__smirnovp_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+ufunc__smirnovp_loops[2] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc__smirnovp_types[0] = <char>NPY_INTP  # loop 0: (intp, double) -> double
+ufunc__smirnovp_types[1] = <char>NPY_DOUBLE
+ufunc__smirnovp_types[2] = <char>NPY_DOUBLE
+ufunc__smirnovp_types[3] = <char>NPY_FLOAT  # loop 1: (float, float) -> float
+ufunc__smirnovp_types[4] = <char>NPY_FLOAT
+ufunc__smirnovp_types[5] = <char>NPY_FLOAT
+ufunc__smirnovp_types[6] = <char>NPY_DOUBLE  # loop 2: (double, double) -> double
+ufunc__smirnovp_types[7] = <char>NPY_DOUBLE
+ufunc__smirnovp_types[8] = <char>NPY_DOUBLE
+ufunc__smirnovp_ptr[2*0] = <void*>_func_cephes_smirnovp_wrap  # kernel for the NPY_INTP loop
+ufunc__smirnovp_ptr[2*0+1] = <void*>(<char*>"_smirnovp")
+ufunc__smirnovp_ptr[2*1] = <void*>_func_smirnovp_unsafe  # float and double loops share the `_unsafe` kernel
+ufunc__smirnovp_ptr[2*1+1] = <void*>(<char*>"_smirnovp")
+ufunc__smirnovp_ptr[2*2] = <void*>_func_smirnovp_unsafe
+ufunc__smirnovp_ptr[2*2+1] = <void*>(<char*>"_smirnovp")
+ufunc__smirnovp_data[0] = &ufunc__smirnovp_ptr[2*0]
+ufunc__smirnovp_data[1] = &ufunc__smirnovp_ptr[2*1]
+ufunc__smirnovp_data[2] = &ufunc__smirnovp_ptr[2*2]
+_smirnovp = np.PyUFunc_FromFuncAndData(ufunc__smirnovp_loops, ufunc__smirnovp_data, ufunc__smirnovp_types, 3, 2, 1, 0, '_smirnovp', ufunc__smirnovp_doc, 0)  # ntypes=3, nin=2, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc__stirling2_inexact_loops[2]  # inner loops of `_stirling2_inexact`: [0] NPY_FLOAT, [1] NPY_DOUBLE
+cdef void *ufunc__stirling2_inexact_ptr[4]  # per loop: (kernel function pointer, ufunc name string)
+cdef void *ufunc__stirling2_inexact_data[2]  # per loop: address of that loop's pair in ufunc__stirling2_inexact_ptr
+cdef char ufunc__stirling2_inexact_types[6]  # 2 loops x (2 inputs + 1 output) NumPy type codes
+cdef char *ufunc__stirling2_inexact_doc = (
+    "Internal function, do not use.")
+ufunc__stirling2_inexact_loops[0] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+ufunc__stirling2_inexact_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc__stirling2_inexact_types[0] = <char>NPY_FLOAT  # loop 0: entries 0-2
+ufunc__stirling2_inexact_types[1] = <char>NPY_FLOAT
+ufunc__stirling2_inexact_types[2] = <char>NPY_FLOAT
+ufunc__stirling2_inexact_types[3] = <char>NPY_DOUBLE  # loop 1: entries 3-5
+ufunc__stirling2_inexact_types[4] = <char>NPY_DOUBLE
+ufunc__stirling2_inexact_types[5] = <char>NPY_DOUBLE
+ufunc__stirling2_inexact_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export__stirling2_inexact  # both loops use the same exported kernel
+ufunc__stirling2_inexact_ptr[2*0+1] = <void*>(<char*>"_stirling2_inexact")
+ufunc__stirling2_inexact_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export__stirling2_inexact
+ufunc__stirling2_inexact_ptr[2*1+1] = <void*>(<char*>"_stirling2_inexact")
+ufunc__stirling2_inexact_data[0] = &ufunc__stirling2_inexact_ptr[2*0]
+ufunc__stirling2_inexact_data[1] = &ufunc__stirling2_inexact_ptr[2*1]
+_stirling2_inexact = np.PyUFunc_FromFuncAndData(ufunc__stirling2_inexact_loops, ufunc__stirling2_inexact_data, ufunc__stirling2_inexact_types, 2, 2, 1, 0, '_stirling2_inexact', ufunc__stirling2_inexact_doc, 0)  # ntypes=2, nin=2, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc__struve_asymp_large_z_loops[1]  # single inner loop of `_struve_asymp_large_z`
+cdef void *ufunc__struve_asymp_large_z_ptr[2]  # (kernel function pointer, ufunc name string)
+cdef void *ufunc__struve_asymp_large_z_data[1]  # address of the pair in ufunc__struve_asymp_large_z_ptr
+cdef char ufunc__struve_asymp_large_z_types[5]  # 1 loop x (3 inputs + 2 outputs) NumPy type codes
+cdef char *ufunc__struve_asymp_large_z_doc = (
+    "_struve_asymp_large_z(v, z, is_h)\n"
+    "\n"
+    "Internal function for testing `struve` & `modstruve`\n"
+    "\n"
+    "Evaluates using asymptotic expansion\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "v, err")
+ufunc__struve_asymp_large_z_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddp_d_As_ddp_dd
+ufunc__struve_asymp_large_z_types[0] = <char>NPY_DOUBLE  # inputs: entries 0-2 (double, double, intp)
+ufunc__struve_asymp_large_z_types[1] = <char>NPY_DOUBLE
+ufunc__struve_asymp_large_z_types[2] = <char>NPY_INTP
+ufunc__struve_asymp_large_z_types[3] = <char>NPY_DOUBLE  # outputs: entries 3-4 (double, double)
+ufunc__struve_asymp_large_z_types[4] = <char>NPY_DOUBLE
+ufunc__struve_asymp_large_z_ptr[2*0] = <void*>_func_cephes__struve_asymp_large_z
+ufunc__struve_asymp_large_z_ptr[2*0+1] = <void*>(<char*>"_struve_asymp_large_z")
+ufunc__struve_asymp_large_z_data[0] = &ufunc__struve_asymp_large_z_ptr[2*0]
+_struve_asymp_large_z = np.PyUFunc_FromFuncAndData(ufunc__struve_asymp_large_z_loops, ufunc__struve_asymp_large_z_data, ufunc__struve_asymp_large_z_types, 1, 3, 2, 0, '_struve_asymp_large_z', ufunc__struve_asymp_large_z_doc, 0)  # ntypes=1, nin=3, nout=2
+
+cdef np.PyUFuncGenericFunction ufunc__struve_bessel_series_loops[1]  # single inner loop of `_struve_bessel_series`
+cdef void *ufunc__struve_bessel_series_ptr[2]  # (kernel function pointer, ufunc name string)
+cdef void *ufunc__struve_bessel_series_data[1]  # address of the pair in ufunc__struve_bessel_series_ptr
+cdef char ufunc__struve_bessel_series_types[5]  # 1 loop x (3 inputs + 2 outputs) NumPy type codes
+cdef char *ufunc__struve_bessel_series_doc = (
+    "_struve_bessel_series(v, z, is_h)\n"
+    "\n"
+    "Internal function for testing `struve` & `modstruve`\n"
+    "\n"
+    "Evaluates using Bessel function series\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "v, err")
+ufunc__struve_bessel_series_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddp_d_As_ddp_dd
+ufunc__struve_bessel_series_types[0] = <char>NPY_DOUBLE  # inputs: entries 0-2 (double, double, intp)
+ufunc__struve_bessel_series_types[1] = <char>NPY_DOUBLE
+ufunc__struve_bessel_series_types[2] = <char>NPY_INTP
+ufunc__struve_bessel_series_types[3] = <char>NPY_DOUBLE  # outputs: entries 3-4 (double, double)
+ufunc__struve_bessel_series_types[4] = <char>NPY_DOUBLE
+ufunc__struve_bessel_series_ptr[2*0] = <void*>_func_cephes__struve_bessel_series
+ufunc__struve_bessel_series_ptr[2*0+1] = <void*>(<char*>"_struve_bessel_series")
+ufunc__struve_bessel_series_data[0] = &ufunc__struve_bessel_series_ptr[2*0]
+_struve_bessel_series = np.PyUFunc_FromFuncAndData(ufunc__struve_bessel_series_loops, ufunc__struve_bessel_series_data, ufunc__struve_bessel_series_types, 1, 3, 2, 0, '_struve_bessel_series', ufunc__struve_bessel_series_doc, 0)  # ntypes=1, nin=3, nout=2
+
+cdef np.PyUFuncGenericFunction ufunc__struve_power_series_loops[1]  # single inner loop of `_struve_power_series`
+cdef void *ufunc__struve_power_series_ptr[2]  # (kernel function pointer, ufunc name string)
+cdef void *ufunc__struve_power_series_data[1]  # address of the pair in ufunc__struve_power_series_ptr
+cdef char ufunc__struve_power_series_types[5]  # 1 loop x (3 inputs + 2 outputs) NumPy type codes
+cdef char *ufunc__struve_power_series_doc = (
+    "_struve_power_series(v, z, is_h)\n"
+    "\n"
+    "Internal function for testing `struve` & `modstruve`\n"
+    "\n"
+    "Evaluates using power series\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "v, err")
+ufunc__struve_power_series_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddp_d_As_ddp_dd
+ufunc__struve_power_series_types[0] = <char>NPY_DOUBLE  # inputs: entries 0-2 (double, double, intp)
+ufunc__struve_power_series_types[1] = <char>NPY_DOUBLE
+ufunc__struve_power_series_types[2] = <char>NPY_INTP
+ufunc__struve_power_series_types[3] = <char>NPY_DOUBLE  # outputs: entries 3-4 (double, double)
+ufunc__struve_power_series_types[4] = <char>NPY_DOUBLE
+ufunc__struve_power_series_ptr[2*0] = <void*>_func_cephes__struve_power_series
+ufunc__struve_power_series_ptr[2*0+1] = <void*>(<char*>"_struve_power_series")
+ufunc__struve_power_series_data[0] = &ufunc__struve_power_series_ptr[2*0]
+_struve_power_series = np.PyUFunc_FromFuncAndData(ufunc__struve_power_series_loops, ufunc__struve_power_series_data, ufunc__struve_power_series_types, 1, 3, 2, 0, '_struve_power_series', ufunc__struve_power_series_doc, 0)  # ntypes=1, nin=3, nout=2
+
+cdef np.PyUFuncGenericFunction ufunc_agm_loops[2]  # inner loops of `agm`: [0] NPY_FLOAT, [1] NPY_DOUBLE
+cdef void *ufunc_agm_ptr[4]  # per loop: (kernel function pointer, ufunc name string)
+cdef void *ufunc_agm_data[2]  # per loop: address of that loop's pair in ufunc_agm_ptr
+cdef char ufunc_agm_types[6]  # 2 loops x (2 inputs + 1 output) NumPy type codes
+cdef char *ufunc_agm_doc = (
+    "agm(a, b, out=None)\n"
+    "\n"
+    "Compute the arithmetic-geometric mean of `a` and `b`.\n"
+    "\n"
+    "Start with a_0 = a and b_0 = b and iteratively compute::\n"
+    "\n"
+    "    a_{n+1} = (a_n + b_n)/2\n"
+    "    b_{n+1} = sqrt(a_n*b_n)\n"
+    "\n"
+    "a_n and b_n converge to the same limit as n increases; their common\n"
+    "limit is agm(a, b).\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "a, b : array_like\n"
+    "    Real values only. If the values are both negative, the result\n"
+    "    is negative. If one value is negative and the other is positive,\n"
+    "    `nan` is returned.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    The arithmetic-geometric mean of `a` and `b`.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import agm\n"
+    ">>> a, b = 24.0, 6.0\n"
+    ">>> agm(a, b)\n"
+    "13.458171481725614\n"
+    "\n"
+    "Compare that result to the iteration:\n"
+    "\n"
+    ">>> while a != b:\n"
+    "...     a, b = (a + b)/2, np.sqrt(a*b)\n"
+    "...     print(\"a = %19.16f  b=%19.16f\" % (a, b))\n"
+    "...\n"
+    "a = 15.0000000000000000  b=12.0000000000000000\n"
+    "a = 13.5000000000000000  b=13.4164078649987388\n"
+    "a = 13.4582039324993694  b=13.4581390309909850\n"
+    "a = 13.4581714817451772  b=13.4581714817060547\n"
+    "a = 13.4581714817256159  b=13.4581714817256159\n"
+    "\n"
+    "When array-like arguments are given, broadcasting applies:\n"
+    "\n"
+    ">>> a = np.array([[1.5], [3], [6]])  # a has shape (3, 1).\n"
+    ">>> b = np.array([6, 12, 24, 48])    # b has shape (4,).\n"
+    ">>> agm(a, b)\n"
+    "array([[  3.36454287,   5.42363427,   9.05798751,  15.53650756],\n"
+    "       [  4.37037309,   6.72908574,  10.84726853,  18.11597502],\n"
+    "       [  6.        ,   8.74074619,  13.45817148,  21.69453707]])")
+ufunc_agm_loops[0] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f  # NOTE(review): name suggests the double kernel is applied to float arrays - confirm in the loop definition
+ufunc_agm_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc_agm_types[0] = <char>NPY_FLOAT  # loop 0: entries 0-2
+ufunc_agm_types[1] = <char>NPY_FLOAT
+ufunc_agm_types[2] = <char>NPY_FLOAT
+ufunc_agm_types[3] = <char>NPY_DOUBLE  # loop 1: entries 3-5
+ufunc_agm_types[4] = <char>NPY_DOUBLE
+ufunc_agm_types[5] = <char>NPY_DOUBLE
+ufunc_agm_ptr[2*0] = <void*>_func_agm  # both loops use the same kernel
+ufunc_agm_ptr[2*0+1] = <void*>(<char*>"agm")
+ufunc_agm_ptr[2*1] = <void*>_func_agm
+ufunc_agm_ptr[2*1+1] = <void*>(<char*>"agm")
+ufunc_agm_data[0] = &ufunc_agm_ptr[2*0]
+ufunc_agm_data[1] = &ufunc_agm_ptr[2*1]
+agm = np.PyUFunc_FromFuncAndData(ufunc_agm_loops, ufunc_agm_data, ufunc_agm_types, 2, 2, 1, 0, 'agm', ufunc_agm_doc, 0)  # ntypes=2, nin=2, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc_bdtr_loops[3]  # inner loops of `bdtr`, one per type signature below
+cdef void *ufunc_bdtr_ptr[6]  # per loop: (kernel function pointer, ufunc name string)
+cdef void *ufunc_bdtr_data[3]  # per loop: address of that loop's pair in ufunc_bdtr_ptr
+cdef char ufunc_bdtr_types[12]  # 3 loops x (3 inputs + 1 output) NumPy type codes
+cdef char *ufunc_bdtr_doc = (
+    "bdtr(k, n, p, out=None)\n"
+    "\n"
+    "Binomial distribution cumulative distribution function.\n"
+    "\n"
+    "Sum of the terms 0 through `floor(k)` of the Binomial probability density.\n"
+    "\n"
+    ".. math::\n"
+    "    \\mathrm{bdtr}(k, n, p) =\n"
+    "    \\sum_{j=0}^{\\lfloor k \\rfloor} {{n}\\choose{j}} p^j (1-p)^{n-j}\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "k : array_like\n"
+    "    Number of successes (double), rounded down to the nearest integer.\n"
+    "n : array_like\n"
+    "    Number of events (int).\n"
+    "p : array_like\n"
+    "    Probability of success in a single event (float).\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "y : scalar or ndarray\n"
+    "    Probability of `floor(k)` or fewer successes in `n` independent events with\n"
+    "    success probabilities of `p`.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "The terms are not summed directly; instead the regularized incomplete beta\n"
+    "function is employed, according to the formula,\n"
+    "\n"
+    ".. math::\n"
+    "    \\mathrm{bdtr}(k, n, p) =\n"
+    "    I_{1 - p}(n - \\lfloor k \\rfloor, \\lfloor k \\rfloor + 1).\n"
+    "\n"
+    "Wrapper for the Cephes [1]_ routine `bdtr`.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Cephes Mathematical Functions Library,\n"
+    "       http://www.netlib.org/cephes/")
+ufunc_bdtr_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f
+ufunc_bdtr_loops[1] = <np.PyUFuncGenericFunction>loop_d_dpd__As_dpd_d
+ufunc_bdtr_loops[2] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc_bdtr_types[0] = <char>NPY_FLOAT  # loop 0: (float, float, float) -> float
+ufunc_bdtr_types[1] = <char>NPY_FLOAT
+ufunc_bdtr_types[2] = <char>NPY_FLOAT
+ufunc_bdtr_types[3] = <char>NPY_FLOAT
+ufunc_bdtr_types[4] = <char>NPY_DOUBLE  # loop 1: (double, intp, double) -> double
+ufunc_bdtr_types[5] = <char>NPY_INTP
+ufunc_bdtr_types[6] = <char>NPY_DOUBLE
+ufunc_bdtr_types[7] = <char>NPY_DOUBLE
+ufunc_bdtr_types[8] = <char>NPY_DOUBLE  # loop 2: (double, double, double) -> double
+ufunc_bdtr_types[9] = <char>NPY_DOUBLE
+ufunc_bdtr_types[10] = <char>NPY_DOUBLE
+ufunc_bdtr_types[11] = <char>NPY_DOUBLE
+ufunc_bdtr_ptr[2*0] = <void*>_func_bdtr_unsafe  # all-float and all-double loops share the `_unsafe` kernel
+ufunc_bdtr_ptr[2*0+1] = <void*>(<char*>"bdtr")
+ufunc_bdtr_ptr[2*1] = <void*>_func_cephes_bdtr_wrap  # kernel for the loop whose `n` is NPY_INTP
+ufunc_bdtr_ptr[2*1+1] = <void*>(<char*>"bdtr")
+ufunc_bdtr_ptr[2*2] = <void*>_func_bdtr_unsafe
+ufunc_bdtr_ptr[2*2+1] = <void*>(<char*>"bdtr")
+ufunc_bdtr_data[0] = &ufunc_bdtr_ptr[2*0]
+ufunc_bdtr_data[1] = &ufunc_bdtr_ptr[2*1]
+ufunc_bdtr_data[2] = &ufunc_bdtr_ptr[2*2]
+bdtr = np.PyUFunc_FromFuncAndData(ufunc_bdtr_loops, ufunc_bdtr_data, ufunc_bdtr_types, 3, 3, 1, 0, 'bdtr', ufunc_bdtr_doc, 0)  # ntypes=3, nin=3, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc_bdtrc_loops[3]  # inner loops of `bdtrc`, one per type signature below
+cdef void *ufunc_bdtrc_ptr[6]  # per loop: (kernel function pointer, ufunc name string)
+cdef void *ufunc_bdtrc_data[3]  # per loop: address of that loop's pair in ufunc_bdtrc_ptr
+cdef char ufunc_bdtrc_types[12]  # 3 loops x (3 inputs + 1 output) NumPy type codes
+cdef char *ufunc_bdtrc_doc = (
+    "bdtrc(k, n, p, out=None)\n"
+    "\n"
+    "Binomial distribution survival function.\n"
+    "\n"
+    "Sum of the terms `floor(k) + 1` through `n` of the binomial probability\n"
+    "density,\n"
+    "\n"
+    ".. math::\n"
+    "    \\mathrm{bdtrc}(k, n, p) =\n"
+    "    \\sum_{j=\\lfloor k \\rfloor +1}^n {{n}\\choose{j}} p^j (1-p)^{n-j}\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "k : array_like\n"
+    "    Number of successes (double), rounded down to nearest integer.\n"
+    "n : array_like\n"
+    "    Number of events (int)\n"
+    "p : array_like\n"
+    "    Probability of success in a single event.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "y : scalar or ndarray\n"
+    "    Probability of `floor(k) + 1` or more successes in `n` independent\n"
+    "    events with success probabilities of `p`.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "bdtr\n"
+    "betainc\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "The terms are not summed directly; instead the regularized incomplete beta\n"
+    "function is employed, according to the formula,\n"
+    "\n"
+    ".. math::\n"
+    "    \\mathrm{bdtrc}(k, n, p) = I_{p}(\\lfloor k \\rfloor + 1, n - \\lfloor k \\rfloor).\n"
+    "\n"
+    "Wrapper for the Cephes [1]_ routine `bdtrc`.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Cephes Mathematical Functions Library,\n"
+    "       http://www.netlib.org/cephes/")
+ufunc_bdtrc_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f
+ufunc_bdtrc_loops[1] = <np.PyUFuncGenericFunction>loop_d_dpd__As_dpd_d
+ufunc_bdtrc_loops[2] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc_bdtrc_types[0] = <char>NPY_FLOAT  # loop 0: (float, float, float) -> float
+ufunc_bdtrc_types[1] = <char>NPY_FLOAT
+ufunc_bdtrc_types[2] = <char>NPY_FLOAT
+ufunc_bdtrc_types[3] = <char>NPY_FLOAT
+ufunc_bdtrc_types[4] = <char>NPY_DOUBLE  # loop 1: (double, intp, double) -> double
+ufunc_bdtrc_types[5] = <char>NPY_INTP
+ufunc_bdtrc_types[6] = <char>NPY_DOUBLE
+ufunc_bdtrc_types[7] = <char>NPY_DOUBLE
+ufunc_bdtrc_types[8] = <char>NPY_DOUBLE  # loop 2: (double, double, double) -> double
+ufunc_bdtrc_types[9] = <char>NPY_DOUBLE
+ufunc_bdtrc_types[10] = <char>NPY_DOUBLE
+ufunc_bdtrc_types[11] = <char>NPY_DOUBLE
+ufunc_bdtrc_ptr[2*0] = <void*>_func_bdtrc_unsafe  # all-float and all-double loops share the `_unsafe` kernel
+ufunc_bdtrc_ptr[2*0+1] = <void*>(<char*>"bdtrc")
+ufunc_bdtrc_ptr[2*1] = <void*>_func_cephes_bdtrc_wrap  # kernel for the loop whose `n` is NPY_INTP
+ufunc_bdtrc_ptr[2*1+1] = <void*>(<char*>"bdtrc")
+ufunc_bdtrc_ptr[2*2] = <void*>_func_bdtrc_unsafe
+ufunc_bdtrc_ptr[2*2+1] = <void*>(<char*>"bdtrc")
+ufunc_bdtrc_data[0] = &ufunc_bdtrc_ptr[2*0]
+ufunc_bdtrc_data[1] = &ufunc_bdtrc_ptr[2*1]
+ufunc_bdtrc_data[2] = &ufunc_bdtrc_ptr[2*2]
+bdtrc = np.PyUFunc_FromFuncAndData(ufunc_bdtrc_loops, ufunc_bdtrc_data, ufunc_bdtrc_types, 3, 3, 1, 0, 'bdtrc', ufunc_bdtrc_doc, 0)  # ntypes=3, nin=3, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc_bdtri_loops[3]  # inner loops of `bdtri`, one per type signature below
+cdef void *ufunc_bdtri_ptr[6]  # per loop: (kernel function pointer, ufunc name string)
+cdef void *ufunc_bdtri_data[3]  # per loop: address of that loop's pair in ufunc_bdtri_ptr
+cdef char ufunc_bdtri_types[12]  # 3 loops x (3 inputs + 1 output) NumPy type codes
+cdef char *ufunc_bdtri_doc = (
+    "bdtri(k, n, y, out=None)\n"
+    "\n"
+    "Inverse function to `bdtr` with respect to `p`.\n"
+    "\n"
+    "Finds the event probability `p` such that the sum of the terms 0 through\n"
+    "`k` of the binomial probability density is equal to the given cumulative\n"
+    "probability `y`.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "k : array_like\n"
+    "    Number of successes (float), rounded down to the nearest integer.\n"
+    "n : array_like\n"
+    "    Number of events (float)\n"
+    "y : array_like\n"
+    "    Cumulative probability (probability of `k` or fewer successes in `n`\n"
+    "    events).\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "p : scalar or ndarray\n"
+    "    The event probability such that `bdtr(\\lfloor k \\rfloor, n, p) = y`.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "bdtr\n"
+    "betaincinv\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "The computation is carried out using the inverse beta integral function\n"
+    "and the relation,::\n"
+    "\n"
+    "    1 - p = betaincinv(n - k, k + 1, y).\n"
+    "\n"
+    "Wrapper for the Cephes [1]_ routine `bdtri`.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Cephes Mathematical Functions Library,\n"
+    "       http://www.netlib.org/cephes/")
+ufunc_bdtri_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f
+ufunc_bdtri_loops[1] = <np.PyUFuncGenericFunction>loop_d_dpd__As_dpd_d
+ufunc_bdtri_loops[2] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc_bdtri_types[0] = <char>NPY_FLOAT  # loop 0: (float, float, float) -> float
+ufunc_bdtri_types[1] = <char>NPY_FLOAT
+ufunc_bdtri_types[2] = <char>NPY_FLOAT
+ufunc_bdtri_types[3] = <char>NPY_FLOAT
+ufunc_bdtri_types[4] = <char>NPY_DOUBLE  # loop 1: (double, intp, double) -> double
+ufunc_bdtri_types[5] = <char>NPY_INTP
+ufunc_bdtri_types[6] = <char>NPY_DOUBLE
+ufunc_bdtri_types[7] = <char>NPY_DOUBLE
+ufunc_bdtri_types[8] = <char>NPY_DOUBLE  # loop 2: (double, double, double) -> double
+ufunc_bdtri_types[9] = <char>NPY_DOUBLE
+ufunc_bdtri_types[10] = <char>NPY_DOUBLE
+ufunc_bdtri_types[11] = <char>NPY_DOUBLE
+ufunc_bdtri_ptr[2*0] = <void*>_func_bdtri_unsafe  # all-float and all-double loops share the `_unsafe` kernel
+ufunc_bdtri_ptr[2*0+1] = <void*>(<char*>"bdtri")
+ufunc_bdtri_ptr[2*1] = <void*>_func_cephes_bdtri_wrap  # kernel for the loop whose `n` is NPY_INTP
+ufunc_bdtri_ptr[2*1+1] = <void*>(<char*>"bdtri")
+ufunc_bdtri_ptr[2*2] = <void*>_func_bdtri_unsafe
+ufunc_bdtri_ptr[2*2+1] = <void*>(<char*>"bdtri")
+ufunc_bdtri_data[0] = &ufunc_bdtri_ptr[2*0]
+ufunc_bdtri_data[1] = &ufunc_bdtri_ptr[2*1]
+ufunc_bdtri_data[2] = &ufunc_bdtri_ptr[2*2]
+bdtri = np.PyUFunc_FromFuncAndData(ufunc_bdtri_loops, ufunc_bdtri_data, ufunc_bdtri_types, 3, 3, 1, 0, 'bdtri', ufunc_bdtri_doc, 0)  # ntypes=3, nin=3, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc_bdtrik_loops[2]  # inner loops of `bdtrik`: [0] NPY_FLOAT, [1] NPY_DOUBLE
+cdef void *ufunc_bdtrik_ptr[4]  # per loop: (kernel function pointer, ufunc name string)
+cdef void *ufunc_bdtrik_data[2]  # per loop: address of that loop's pair in ufunc_bdtrik_ptr
+cdef char ufunc_bdtrik_types[8]  # 2 loops x (3 inputs + 1 output) NumPy type codes
+cdef char *ufunc_bdtrik_doc = (
+    "bdtrik(y, n, p, out=None)\n"
+    "\n"
+    "Inverse function to `bdtr` with respect to `k`.\n"
+    "\n"
+    "Finds the number of successes `k` such that the sum of the terms 0 through\n"
+    "`k` of the Binomial probability density for `n` events with probability\n"
+    "`p` is equal to the given cumulative probability `y`.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "y : array_like\n"
+    "    Cumulative probability (probability of `k` or fewer successes in `n`\n"
+    "    events).\n"
+    "n : array_like\n"
+    "    Number of events (float).\n"
+    "p : array_like\n"
+    "    Success probability (float).\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "k : scalar or ndarray\n"
+    "    The number of successes `k` such that `bdtr(k, n, p) = y`.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "bdtr\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "Formula 26.5.24 of [1]_ (or equivalently [2]_) is used to reduce the binomial\n"
+    "distribution to the cumulative incomplete beta distribution.\n"
+    "\n"
+    "Computation of `k` involves a search for a value that produces the desired\n"
+    "value of `y`. The search relies on the monotonicity of `y` with `k`.\n"
+    "\n"
+    "Wrapper for the CDFLIB [3]_ Fortran routine `cdfbin`.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Milton Abramowitz and Irene A. Stegun, eds.\n"
+    "       Handbook of Mathematical Functions with Formulas,\n"
+    "       Graphs, and Mathematical Tables. New York: Dover, 1972.\n"
+    ".. [2] NIST Digital Library of Mathematical Functions\n"
+    "       https://dlmf.nist.gov/8.17.5#E5\n"
+    ".. [3] Barry Brown, James Lovato, and Kathy Russell,\n"
+    "       CDFLIB: Library of Fortran Routines for Cumulative Distribution\n"
+    "       Functions, Inverses, and Other Parameters.")
+ufunc_bdtrik_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f
+ufunc_bdtrik_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc_bdtrik_types[0] = <char>NPY_FLOAT  # loop 0: entries 0-3
+ufunc_bdtrik_types[1] = <char>NPY_FLOAT
+ufunc_bdtrik_types[2] = <char>NPY_FLOAT
+ufunc_bdtrik_types[3] = <char>NPY_FLOAT
+ufunc_bdtrik_types[4] = <char>NPY_DOUBLE  # loop 1: entries 4-7
+ufunc_bdtrik_types[5] = <char>NPY_DOUBLE
+ufunc_bdtrik_types[6] = <char>NPY_DOUBLE
+ufunc_bdtrik_types[7] = <char>NPY_DOUBLE
+ufunc_bdtrik_ptr[2*0] = <void*>_func_bdtrik  # both loops use the same kernel
+ufunc_bdtrik_ptr[2*0+1] = <void*>(<char*>"bdtrik")
+ufunc_bdtrik_ptr[2*1] = <void*>_func_bdtrik
+ufunc_bdtrik_ptr[2*1+1] = <void*>(<char*>"bdtrik")
+ufunc_bdtrik_data[0] = &ufunc_bdtrik_ptr[2*0]
+ufunc_bdtrik_data[1] = &ufunc_bdtrik_ptr[2*1]
+bdtrik = np.PyUFunc_FromFuncAndData(ufunc_bdtrik_loops, ufunc_bdtrik_data, ufunc_bdtrik_types, 2, 3, 1, 0, 'bdtrik', ufunc_bdtrik_doc, 0)  # ntypes=2, nin=3, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc_bdtrin_loops[2]  # inner loops of `bdtrin`: [0] NPY_FLOAT, [1] NPY_DOUBLE
+cdef void *ufunc_bdtrin_ptr[4]  # per loop: (kernel function pointer, ufunc name string)
+cdef void *ufunc_bdtrin_data[2]  # per loop: address of that loop's pair in ufunc_bdtrin_ptr
+cdef char ufunc_bdtrin_types[8]  # 2 loops x (3 inputs + 1 output) NumPy type codes
+cdef char *ufunc_bdtrin_doc = (
+    "bdtrin(k, y, p, out=None)\n"
+    "\n"
+    "Inverse function to `bdtr` with respect to `n`.\n"
+    "\n"
+    "Finds the number of events `n` such that the sum of the terms 0 through\n"
+    "`k` of the Binomial probability density for events with probability `p` is\n"
+    "equal to the given cumulative probability `y`.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "k : array_like\n"
+    "    Number of successes (float).\n"
+    "y : array_like\n"
+    "    Cumulative probability (probability of `k` or fewer successes in `n`\n"
+    "    events).\n"
+    "p : array_like\n"
+    "    Success probability (float).\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "n : scalar or ndarray\n"
+    "    The number of events `n` such that `bdtr(k, n, p) = y`.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "bdtr\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "Formula 26.5.24 of [1]_ (or equivalently [2]_) is used to reduce the binomial\n"
+    "distribution to the cumulative incomplete beta distribution.\n"
+    "\n"
+    "Computation of `n` involves a search for a value that produces the desired\n"
+    "value of `y`. The search relies on the monotonicity of `y` with `n`.\n"
+    "\n"
+    "Wrapper for the CDFLIB [3]_ Fortran routine `cdfbin`.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Milton Abramowitz and Irene A. Stegun, eds.\n"
+    "       Handbook of Mathematical Functions with Formulas,\n"
+    "       Graphs, and Mathematical Tables. New York: Dover, 1972.\n"
+    ".. [2] NIST Digital Library of Mathematical Functions\n"
+    "       https://dlmf.nist.gov/8.17.5#E5\n"
+    ".. [3] Barry Brown, James Lovato, and Kathy Russell,\n"
+    "       CDFLIB: Library of Fortran Routines for Cumulative Distribution\n"
+    "       Functions, Inverses, and Other Parameters.")
+ufunc_bdtrin_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f
+ufunc_bdtrin_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc_bdtrin_types[0] = <char>NPY_FLOAT  # loop 0: entries 0-3
+ufunc_bdtrin_types[1] = <char>NPY_FLOAT
+ufunc_bdtrin_types[2] = <char>NPY_FLOAT
+ufunc_bdtrin_types[3] = <char>NPY_FLOAT
+ufunc_bdtrin_types[4] = <char>NPY_DOUBLE  # loop 1: entries 4-7
+ufunc_bdtrin_types[5] = <char>NPY_DOUBLE
+ufunc_bdtrin_types[6] = <char>NPY_DOUBLE
+ufunc_bdtrin_types[7] = <char>NPY_DOUBLE
+ufunc_bdtrin_ptr[2*0] = <void*>_func_bdtrin  # both loops use the same kernel
+ufunc_bdtrin_ptr[2*0+1] = <void*>(<char*>"bdtrin")
+ufunc_bdtrin_ptr[2*1] = <void*>_func_bdtrin
+ufunc_bdtrin_ptr[2*1+1] = <void*>(<char*>"bdtrin")
+ufunc_bdtrin_data[0] = &ufunc_bdtrin_ptr[2*0]
+ufunc_bdtrin_data[1] = &ufunc_bdtrin_ptr[2*1]
+bdtrin = np.PyUFunc_FromFuncAndData(ufunc_bdtrin_loops, ufunc_bdtrin_data, ufunc_bdtrin_types, 2, 3, 1, 0, 'bdtrin', ufunc_bdtrin_doc, 0)  # ntypes=2, nin=3, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc_betainc_loops[2]  # one inner-loop function per registered type signature
+cdef void *ufunc_betainc_ptr[4]  # flat (kernel pointer, name string) pairs, one pair per loop
+cdef void *ufunc_betainc_data[2]  # per-loop data pointers, each addressing a pair in ufunc_betainc_ptr
+cdef char ufunc_betainc_types[8]  # NumPy type codes: 2 loops x (3 inputs + 1 output)
+cdef char *ufunc_betainc_doc = (  # docstring text handed to PyUFunc_FromFuncAndData below
+    "betainc(a, b, x, out=None)\n"
+    "\n"
+    "Regularized incomplete beta function.\n"
+    "\n"
+    "Computes the regularized incomplete beta function, defined as [1]_:\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    I_x(a, b) = \\frac{\\Gamma(a+b)}{\\Gamma(a)\\Gamma(b)} \\int_0^x\n"
+    "    t^{a-1}(1-t)^{b-1}dt,\n"
+    "\n"
+    "for :math:`0 \\leq x \\leq 1`.\n"
+    "\n"
+    "This function is the cumulative distribution function for the beta\n"
+    "distribution; its range is [0, 1].\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "a, b : array_like\n"
+    "       Positive, real-valued parameters\n"
+    "x : array_like\n"
+    "    Real-valued such that :math:`0 \\leq x \\leq 1`,\n"
+    "    the upper limit of integration\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    Value of the regularized incomplete beta function\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "beta : beta function\n"
+    "betaincinv : inverse of the regularized incomplete beta function\n"
+    "betaincc : complement of the regularized incomplete beta function\n"
+    "scipy.stats.beta : beta distribution\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "The term *regularized* in the name of this function refers to the\n"
+    "scaling of the function by the gamma function terms shown in the\n"
+    "formula.  When not qualified as *regularized*, the name *incomplete\n"
+    "beta function* often refers to just the integral expression,\n"
+    "without the gamma terms.  One can use the function `beta` from\n"
+    "`scipy.special` to get this \"nonregularized\" incomplete beta\n"
+    "function by multiplying the result of ``betainc(a, b, x)`` by\n"
+    "``beta(a, b)``.\n"
+    "\n"
+    "``betainc(a, b, x)`` is treated as a two parameter family of functions\n"
+    "of a single variable `x`, rather than as a function of three variables.\n"
+    "This impacts only the limiting cases ``a = 0``, ``b = 0``, ``a = inf``,\n"
+    "``b = inf``.\n"
+    "\n"
+    "In general\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    \\lim_{(a, b) \\rightarrow (a_0, b_0)} \\mathrm{betainc}(a, b, x)\n"
+    "\n"
+    "is treated as a pointwise limit in ``x``. Thus for example,\n"
+    "``betainc(0, b, 0)`` equals ``0`` for ``b > 0``, although it would be\n"
+    "indeterminate when considering the simultaneous limit ``(a, x) -> (0+, 0+)``.\n"
+    "\n"
+    "This function wraps the ``ibeta`` routine from the\n"
+    "Boost Math C++ library [2]_.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] NIST Digital Library of Mathematical Functions\n"
+    "       https://dlmf.nist.gov/8.17\n"
+    ".. [2] The Boost Developers. \"Boost C++ Libraries\". https://www.boost.org/.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "\n"
+    "Let :math:`B(a, b)` be the `beta` function.\n"
+    "\n"
+    ">>> import scipy.special as sc\n"
+    "\n"
+    "The coefficient in terms of `gamma` is equal to\n"
+    ":math:`1/B(a, b)`. Also, when :math:`x=1`\n"
+    "the integral is equal to :math:`B(a, b)`.\n"
+    "Therefore, :math:`I_{x=1}(a, b) = 1` for any :math:`a, b`.\n"
+    "\n"
+    ">>> sc.betainc(0.2, 3.5, 1.0)\n"
+    "1.0\n"
+    "\n"
+    "It satisfies\n"
+    ":math:`I_x(a, b) = x^a F(a, 1-b, a+1, x)/ (aB(a, b))`,\n"
+    "where :math:`F` is the hypergeometric function `hyp2f1`:\n"
+    "\n"
+    ">>> a, b, x = 1.4, 3.1, 0.5\n"
+    ">>> x**a * sc.hyp2f1(a, 1 - b, a + 1, x)/(a * sc.beta(a, b))\n"
+    "0.8148904036225295\n"
+    ">>> sc.betainc(a, b, x)\n"
+    "0.8148904036225296\n"
+    "\n"
+    "This function satisfies the relationship\n"
+    ":math:`I_x(a, b) = 1 - I_{1-x}(b, a)`:\n"
+    "\n"
+    ">>> sc.betainc(2.2, 3.1, 0.4)\n"
+    "0.49339638807619446\n"
+    ">>> 1 - sc.betainc(3.1, 2.2, 1 - 0.4)\n"
+    "0.49339638807619446")
+ufunc_betainc_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # loop 0: registered with NPY_FLOAT type codes
+ufunc_betainc_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop 1: registered with NPY_DOUBLE type codes
+ufunc_betainc_types[0] = <char>NPY_FLOAT  # loop 0 type codes occupy entries 0-3
+ufunc_betainc_types[1] = <char>NPY_FLOAT
+ufunc_betainc_types[2] = <char>NPY_FLOAT
+ufunc_betainc_types[3] = <char>NPY_FLOAT
+ufunc_betainc_types[4] = <char>NPY_DOUBLE  # loop 1 type codes occupy entries 4-7
+ufunc_betainc_types[5] = <char>NPY_DOUBLE
+ufunc_betainc_types[6] = <char>NPY_DOUBLE
+ufunc_betainc_types[7] = <char>NPY_DOUBLE
+ufunc_betainc_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_ibeta_float  # loop 0 kernel
+ufunc_betainc_ptr[2*0+1] = <void*>(<char*>"betainc")  # ufunc name stored alongside the kernel
+ufunc_betainc_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_ibeta_double  # loop 1 kernel
+ufunc_betainc_ptr[2*1+1] = <void*>(<char*>"betainc")
+ufunc_betainc_data[0] = &ufunc_betainc_ptr[2*0]  # loop 0 data -> its (kernel, name) pair
+ufunc_betainc_data[1] = &ufunc_betainc_ptr[2*1]  # loop 1 data -> its (kernel, name) pair
+betainc = np.PyUFunc_FromFuncAndData(ufunc_betainc_loops, ufunc_betainc_data, ufunc_betainc_types, 2, 3, 1, 0, 'betainc', ufunc_betainc_doc, 0)  # 2 loops, 3 inputs, 1 output
+
+cdef np.PyUFuncGenericFunction ufunc_betaincc_loops[2]  # one inner-loop function per registered type signature
+cdef void *ufunc_betaincc_ptr[4]  # flat (kernel pointer, name string) pairs, one pair per loop
+cdef void *ufunc_betaincc_data[2]  # per-loop data pointers, each addressing a pair in ufunc_betaincc_ptr
+cdef char ufunc_betaincc_types[8]  # NumPy type codes: 2 loops x (3 inputs + 1 output)
+cdef char *ufunc_betaincc_doc = (  # docstring text handed to PyUFunc_FromFuncAndData below
+    "betaincc(a, b, x, out=None)\n"
+    "\n"
+    "Complement of the regularized incomplete beta function.\n"
+    "\n"
+    "Computes the complement of the regularized incomplete beta function,\n"
+    "defined as [1]_:\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    \\bar{I}_x(a, b) = 1 - I_x(a, b)\n"
+    "                    = 1 - \\frac{\\Gamma(a+b)}{\\Gamma(a)\\Gamma(b)} \\int_0^x\n"
+    "                              t^{a-1}(1-t)^{b-1}dt,\n"
+    "\n"
+    "for :math:`0 \\leq x \\leq 1`.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "a, b : array_like\n"
+    "       Positive, real-valued parameters\n"
+    "x : array_like\n"
+    "    Real-valued such that :math:`0 \\leq x \\leq 1`,\n"
+    "    the upper limit of integration\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    Value of the complement of the regularized incomplete beta function\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "betainc : regularized incomplete beta function\n"
+    "betaincinv : inverse of the regularized incomplete beta function\n"
+    "betainccinv :\n"
+    "    inverse of the complement of the regularized incomplete beta function\n"
+    "beta : beta function\n"
+    "scipy.stats.beta : beta distribution\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    ".. versionadded:: 1.11.0\n"
+    "\n"
+    "Like `betainc`, ``betaincc(a, b, x)`` is treated as a two parameter\n"
+    "family of functions of a single variable `x`, rather than as a function of\n"
+    "three variables. See the `betainc` docstring for more info on how this\n"
+    "impacts limiting cases.\n"
+    "\n"
+    "This function wraps the ``ibetac`` routine from the\n"
+    "Boost Math C++ library [2]_.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] NIST Digital Library of Mathematical Functions\n"
+    "       https://dlmf.nist.gov/8.17\n"
+    ".. [2] The Boost Developers. \"Boost C++ Libraries\". https://www.boost.org/.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> from scipy.special import betaincc, betainc\n"
+    "\n"
+    "The naive calculation ``1 - betainc(a, b, x)`` loses precision when\n"
+    "the values of ``betainc(a, b, x)`` are close to 1:\n"
+    "\n"
+    ">>> 1 - betainc(0.5, 8, [0.9, 0.99, 0.999])\n"
+    "array([2.0574632e-09, 0.0000000e+00, 0.0000000e+00])\n"
+    "\n"
+    "By using ``betaincc``, we get the correct values:\n"
+    "\n"
+    ">>> betaincc(0.5, 8, [0.9, 0.99, 0.999])\n"
+    "array([2.05746321e-09, 1.97259354e-17, 1.96467954e-25])")
+ufunc_betaincc_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # loop 0: registered with NPY_FLOAT type codes
+ufunc_betaincc_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop 1: registered with NPY_DOUBLE type codes
+ufunc_betaincc_types[0] = <char>NPY_FLOAT  # loop 0 type codes occupy entries 0-3
+ufunc_betaincc_types[1] = <char>NPY_FLOAT
+ufunc_betaincc_types[2] = <char>NPY_FLOAT
+ufunc_betaincc_types[3] = <char>NPY_FLOAT
+ufunc_betaincc_types[4] = <char>NPY_DOUBLE  # loop 1 type codes occupy entries 4-7
+ufunc_betaincc_types[5] = <char>NPY_DOUBLE
+ufunc_betaincc_types[6] = <char>NPY_DOUBLE
+ufunc_betaincc_types[7] = <char>NPY_DOUBLE
+ufunc_betaincc_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_ibetac_float  # loop 0 kernel
+ufunc_betaincc_ptr[2*0+1] = <void*>(<char*>"betaincc")  # ufunc name stored alongside the kernel
+ufunc_betaincc_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_ibetac_double  # loop 1 kernel
+ufunc_betaincc_ptr[2*1+1] = <void*>(<char*>"betaincc")
+ufunc_betaincc_data[0] = &ufunc_betaincc_ptr[2*0]  # loop 0 data -> its (kernel, name) pair
+ufunc_betaincc_data[1] = &ufunc_betaincc_ptr[2*1]  # loop 1 data -> its (kernel, name) pair
+betaincc = np.PyUFunc_FromFuncAndData(ufunc_betaincc_loops, ufunc_betaincc_data, ufunc_betaincc_types, 2, 3, 1, 0, 'betaincc', ufunc_betaincc_doc, 0)  # 2 loops, 3 inputs, 1 output
+
+cdef np.PyUFuncGenericFunction ufunc_betainccinv_loops[2]  # one inner-loop function per registered type signature
+cdef void *ufunc_betainccinv_ptr[4]  # flat (kernel pointer, name string) pairs, one pair per loop
+cdef void *ufunc_betainccinv_data[2]  # per-loop data pointers, each addressing a pair in ufunc_betainccinv_ptr
+cdef char ufunc_betainccinv_types[8]  # NumPy type codes: 2 loops x (3 inputs + 1 output)
+cdef char *ufunc_betainccinv_doc = (  # docstring text handed to PyUFunc_FromFuncAndData below
+    "betainccinv(a, b, y, out=None)\n"
+    "\n"
+    "Inverse of the complemented regularized incomplete beta function.\n"
+    "\n"
+    "Computes :math:`x` such that:\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    y = 1 - I_x(a, b) = 1 - \\frac{\\Gamma(a+b)}{\\Gamma(a)\\Gamma(b)}\n"
+    "    \\int_0^x t^{a-1}(1-t)^{b-1}dt,\n"
+    "\n"
+    "where :math:`I_x` is the normalized incomplete beta function `betainc`\n"
+    "and :math:`\\Gamma` is the `gamma` function [1]_.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "a, b : array_like\n"
+    "    Positive, real-valued parameters\n"
+    "y : array_like\n"
+    "    Real-valued input\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    Value of the inverse of the complemented regularized incomplete beta function\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "betainc : regularized incomplete beta function\n"
+    "betaincc : complement of the regularized incomplete beta function\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    ".. versionadded:: 1.11.0\n"
+    "\n"
+    "This function wraps the ``ibetac_inv`` routine from the\n"
+    "Boost Math C++ library [2]_.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] NIST Digital Library of Mathematical Functions\n"
+    "       https://dlmf.nist.gov/8.17\n"
+    ".. [2] The Boost Developers. \"Boost C++ Libraries\". https://www.boost.org/.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> from scipy.special import betainccinv, betaincc\n"
+    "\n"
+    "This function is the inverse of `betaincc` for fixed\n"
+    "values of :math:`a` and :math:`b`.\n"
+    "\n"
+    ">>> a, b = 1.2, 3.1\n"
+    ">>> y = betaincc(a, b, 0.2)\n"
+    ">>> betainccinv(a, b, y)\n"
+    "0.2\n"
+    "\n"
+    ">>> a, b = 7, 2.5\n"
+    ">>> x = betainccinv(a, b, 0.875)\n"
+    ">>> betaincc(a, b, x)\n"
+    "0.875")
+ufunc_betainccinv_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # loop 0: registered with NPY_FLOAT type codes
+ufunc_betainccinv_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop 1: registered with NPY_DOUBLE type codes
+ufunc_betainccinv_types[0] = <char>NPY_FLOAT  # loop 0 type codes occupy entries 0-3
+ufunc_betainccinv_types[1] = <char>NPY_FLOAT
+ufunc_betainccinv_types[2] = <char>NPY_FLOAT
+ufunc_betainccinv_types[3] = <char>NPY_FLOAT
+ufunc_betainccinv_types[4] = <char>NPY_DOUBLE  # loop 1 type codes occupy entries 4-7
+ufunc_betainccinv_types[5] = <char>NPY_DOUBLE
+ufunc_betainccinv_types[6] = <char>NPY_DOUBLE
+ufunc_betainccinv_types[7] = <char>NPY_DOUBLE
+ufunc_betainccinv_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_ibetac_inv_float  # loop 0 kernel
+ufunc_betainccinv_ptr[2*0+1] = <void*>(<char*>"betainccinv")  # ufunc name stored alongside the kernel
+ufunc_betainccinv_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_ibetac_inv_double  # loop 1 kernel
+ufunc_betainccinv_ptr[2*1+1] = <void*>(<char*>"betainccinv")
+ufunc_betainccinv_data[0] = &ufunc_betainccinv_ptr[2*0]  # loop 0 data -> its (kernel, name) pair
+ufunc_betainccinv_data[1] = &ufunc_betainccinv_ptr[2*1]  # loop 1 data -> its (kernel, name) pair
+betainccinv = np.PyUFunc_FromFuncAndData(ufunc_betainccinv_loops, ufunc_betainccinv_data, ufunc_betainccinv_types, 2, 3, 1, 0, 'betainccinv', ufunc_betainccinv_doc, 0)  # 2 loops, 3 inputs, 1 output
+
+cdef np.PyUFuncGenericFunction ufunc_betaincinv_loops[2]  # one inner-loop function per registered type signature
+cdef void *ufunc_betaincinv_ptr[4]  # flat (kernel pointer, name string) pairs, one pair per loop
+cdef void *ufunc_betaincinv_data[2]  # per-loop data pointers, each addressing a pair in ufunc_betaincinv_ptr
+cdef char ufunc_betaincinv_types[8]  # NumPy type codes: 2 loops x (3 inputs + 1 output)
+cdef char *ufunc_betaincinv_doc = (  # docstring text handed to PyUFunc_FromFuncAndData below
+    "betaincinv(a, b, y, out=None)\n"
+    "\n"
+    "Inverse of the regularized incomplete beta function.\n"
+    "\n"
+    "Computes :math:`x` such that:\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    y = I_x(a, b) = \\frac{\\Gamma(a+b)}{\\Gamma(a)\\Gamma(b)}\n"
+    "    \\int_0^x t^{a-1}(1-t)^{b-1}dt,\n"
+    "\n"
+    "where :math:`I_x` is the normalized incomplete beta function `betainc`\n"
+    "and :math:`\\Gamma` is the `gamma` function [1]_.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "a, b : array_like\n"
+    "    Positive, real-valued parameters\n"
+    "y : array_like\n"
+    "    Real-valued input\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    Value of the inverse of the regularized incomplete beta function\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "betainc : regularized incomplete beta function\n"
+    "gamma : gamma function\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "This function wraps the ``ibeta_inv`` routine from the\n"
+    "Boost Math C++ library [2]_.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] NIST Digital Library of Mathematical Functions\n"
+    "       https://dlmf.nist.gov/8.17\n"
+    ".. [2] The Boost Developers. \"Boost C++ Libraries\". https://www.boost.org/.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import scipy.special as sc\n"
+    "\n"
+    "This function is the inverse of `betainc` for fixed\n"
+    "values of :math:`a` and :math:`b`.\n"
+    "\n"
+    ">>> a, b = 1.2, 3.1\n"
+    ">>> y = sc.betainc(a, b, 0.2)\n"
+    ">>> sc.betaincinv(a, b, y)\n"
+    "0.2\n"
+    ">>>\n"
+    ">>> a, b = 7.5, 0.4\n"
+    ">>> x = sc.betaincinv(a, b, 0.5)\n"
+    ">>> sc.betainc(a, b, x)\n"
+    "0.5")
+ufunc_betaincinv_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # loop 0: registered with NPY_FLOAT type codes
+ufunc_betaincinv_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop 1: registered with NPY_DOUBLE type codes
+ufunc_betaincinv_types[0] = <char>NPY_FLOAT  # loop 0 type codes occupy entries 0-3
+ufunc_betaincinv_types[1] = <char>NPY_FLOAT
+ufunc_betaincinv_types[2] = <char>NPY_FLOAT
+ufunc_betaincinv_types[3] = <char>NPY_FLOAT
+ufunc_betaincinv_types[4] = <char>NPY_DOUBLE  # loop 1 type codes occupy entries 4-7
+ufunc_betaincinv_types[5] = <char>NPY_DOUBLE
+ufunc_betaincinv_types[6] = <char>NPY_DOUBLE
+ufunc_betaincinv_types[7] = <char>NPY_DOUBLE
+ufunc_betaincinv_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_ibeta_inv_float  # loop 0 kernel
+ufunc_betaincinv_ptr[2*0+1] = <void*>(<char*>"betaincinv")  # ufunc name stored alongside the kernel
+ufunc_betaincinv_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_ibeta_inv_double  # loop 1 kernel
+ufunc_betaincinv_ptr[2*1+1] = <void*>(<char*>"betaincinv")
+ufunc_betaincinv_data[0] = &ufunc_betaincinv_ptr[2*0]  # loop 0 data -> its (kernel, name) pair
+ufunc_betaincinv_data[1] = &ufunc_betaincinv_ptr[2*1]  # loop 1 data -> its (kernel, name) pair
+betaincinv = np.PyUFunc_FromFuncAndData(ufunc_betaincinv_loops, ufunc_betaincinv_data, ufunc_betaincinv_types, 2, 3, 1, 0, 'betaincinv', ufunc_betaincinv_doc, 0)  # 2 loops, 3 inputs, 1 output
+
+cdef np.PyUFuncGenericFunction ufunc_boxcox_loops[2]  # one inner-loop function per registered type signature
+cdef void *ufunc_boxcox_ptr[4]  # flat (kernel pointer, name string) pairs, one pair per loop
+cdef void *ufunc_boxcox_data[2]  # per-loop data pointers, each addressing a pair in ufunc_boxcox_ptr
+cdef char ufunc_boxcox_types[6]  # NumPy type codes: 2 loops x (2 inputs + 1 output)
+cdef char *ufunc_boxcox_doc = (  # docstring text handed to PyUFunc_FromFuncAndData below
+    "boxcox(x, lmbda, out=None)\n"
+    "\n"
+    "Compute the Box-Cox transformation.\n"
+    "\n"
+    "The Box-Cox transformation is::\n"
+    "\n"
+    "    y = (x**lmbda - 1) / lmbda  if lmbda != 0\n"
+    "        log(x)                  if lmbda == 0\n"
+    "\n"
+    "Returns `nan` if ``x < 0``.\n"
+    "Returns `-inf` if ``x == 0`` and ``lmbda < 0``.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Data to be transformed.\n"
+    "lmbda : array_like\n"
+    "    Power parameter of the Box-Cox transform.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "y : scalar or ndarray\n"
+    "    Transformed data.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "\n"
+    ".. versionadded:: 0.14.0\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> from scipy.special import boxcox\n"
+    ">>> boxcox([1, 4, 10], 2.5)\n"
+    "array([   0.        ,   12.4       ,  126.09110641])\n"
+    ">>> boxcox(2, [0, 1, 2])\n"
+    "array([ 0.69314718,  1.        ,  1.5       ])")
+ufunc_boxcox_loops[0] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f  # loop 0: registered with NPY_FLOAT type codes
+ufunc_boxcox_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d  # loop 1: registered with NPY_DOUBLE type codes
+ufunc_boxcox_types[0] = <char>NPY_FLOAT  # loop 0 type codes occupy entries 0-2
+ufunc_boxcox_types[1] = <char>NPY_FLOAT
+ufunc_boxcox_types[2] = <char>NPY_FLOAT
+ufunc_boxcox_types[3] = <char>NPY_DOUBLE  # loop 1 type codes occupy entries 3-5
+ufunc_boxcox_types[4] = <char>NPY_DOUBLE
+ufunc_boxcox_types[5] = <char>NPY_DOUBLE
+ufunc_boxcox_ptr[2*0] = <void*>_func_boxcox  # loop 0 kernel (the same kernel serves both loops)
+ufunc_boxcox_ptr[2*0+1] = <void*>(<char*>"boxcox")  # ufunc name stored alongside the kernel
+ufunc_boxcox_ptr[2*1] = <void*>_func_boxcox  # loop 1 kernel
+ufunc_boxcox_ptr[2*1+1] = <void*>(<char*>"boxcox")
+ufunc_boxcox_data[0] = &ufunc_boxcox_ptr[2*0]  # loop 0 data -> its (kernel, name) pair
+ufunc_boxcox_data[1] = &ufunc_boxcox_ptr[2*1]  # loop 1 data -> its (kernel, name) pair
+boxcox = np.PyUFunc_FromFuncAndData(ufunc_boxcox_loops, ufunc_boxcox_data, ufunc_boxcox_types, 2, 2, 1, 0, 'boxcox', ufunc_boxcox_doc, 0)  # 2 loops, 2 inputs, 1 output
+
+cdef np.PyUFuncGenericFunction ufunc_boxcox1p_loops[2]  # one inner-loop function per registered type signature
+cdef void *ufunc_boxcox1p_ptr[4]  # flat (kernel pointer, name string) pairs, one pair per loop
+cdef void *ufunc_boxcox1p_data[2]  # per-loop data pointers, each addressing a pair in ufunc_boxcox1p_ptr
+cdef char ufunc_boxcox1p_types[6]  # NumPy type codes: 2 loops x (2 inputs + 1 output)
+cdef char *ufunc_boxcox1p_doc = (  # docstring text handed to PyUFunc_FromFuncAndData below
+    "boxcox1p(x, lmbda, out=None)\n"
+    "\n"
+    "Compute the Box-Cox transformation of 1 + `x`.\n"
+    "\n"
+    "The Box-Cox transformation computed by `boxcox1p` is::\n"
+    "\n"
+    "    y = ((1+x)**lmbda - 1) / lmbda  if lmbda != 0\n"
+    "        log(1+x)                    if lmbda == 0\n"
+    "\n"
+    "Returns `nan` if ``x < -1``.\n"
+    "Returns `-inf` if ``x == -1`` and ``lmbda < 0``.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Data to be transformed.\n"
+    "lmbda : array_like\n"
+    "    Power parameter of the Box-Cox transform.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "y : scalar or ndarray\n"
+    "    Transformed data.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "\n"
+    ".. versionadded:: 0.14.0\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> from scipy.special import boxcox1p\n"
+    ">>> boxcox1p(1e-4, [0, 0.5, 1])\n"
+    "array([  9.99950003e-05,   9.99975001e-05,   1.00000000e-04])\n"
+    ">>> boxcox1p([0.01, 0.1], 0.25)\n"
+    "array([ 0.00996272,  0.09645476])")
+ufunc_boxcox1p_loops[0] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f  # loop 0: registered with NPY_FLOAT type codes
+ufunc_boxcox1p_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d  # loop 1: registered with NPY_DOUBLE type codes
+ufunc_boxcox1p_types[0] = <char>NPY_FLOAT  # loop 0 type codes occupy entries 0-2
+ufunc_boxcox1p_types[1] = <char>NPY_FLOAT
+ufunc_boxcox1p_types[2] = <char>NPY_FLOAT
+ufunc_boxcox1p_types[3] = <char>NPY_DOUBLE  # loop 1 type codes occupy entries 3-5
+ufunc_boxcox1p_types[4] = <char>NPY_DOUBLE
+ufunc_boxcox1p_types[5] = <char>NPY_DOUBLE
+ufunc_boxcox1p_ptr[2*0] = <void*>_func_boxcox1p  # loop 0 kernel (the same kernel serves both loops)
+ufunc_boxcox1p_ptr[2*0+1] = <void*>(<char*>"boxcox1p")  # ufunc name stored alongside the kernel
+ufunc_boxcox1p_ptr[2*1] = <void*>_func_boxcox1p  # loop 1 kernel
+ufunc_boxcox1p_ptr[2*1+1] = <void*>(<char*>"boxcox1p")
+ufunc_boxcox1p_data[0] = &ufunc_boxcox1p_ptr[2*0]  # loop 0 data -> its (kernel, name) pair
+ufunc_boxcox1p_data[1] = &ufunc_boxcox1p_ptr[2*1]  # loop 1 data -> its (kernel, name) pair
+boxcox1p = np.PyUFunc_FromFuncAndData(ufunc_boxcox1p_loops, ufunc_boxcox1p_data, ufunc_boxcox1p_types, 2, 2, 1, 0, 'boxcox1p', ufunc_boxcox1p_doc, 0)  # 2 loops, 2 inputs, 1 output
+
+cdef np.PyUFuncGenericFunction ufunc_btdtria_loops[2]  # one inner-loop function per registered type signature
+cdef void *ufunc_btdtria_ptr[4]  # flat (kernel pointer, name string) pairs, one pair per loop
+cdef void *ufunc_btdtria_data[2]  # per-loop data pointers, each addressing a pair in ufunc_btdtria_ptr
+cdef char ufunc_btdtria_types[8]  # NumPy type codes: 2 loops x (3 inputs + 1 output)
+cdef char *ufunc_btdtria_doc = (  # docstring text handed to PyUFunc_FromFuncAndData below
+    "btdtria(p, b, x, out=None)\n"
+    "\n"
+    "Inverse of `betainc` with respect to `a`.\n"
+    "\n"
+    "This is the inverse of the beta cumulative distribution function, `betainc`,\n"
+    "considered as a function of `a`, returning the value of `a` for which\n"
+    "`betainc(a, b, x) = p`, or\n"
+    "\n"
+    ".. math::\n"
+    "    p = \\int_0^x \\frac{\\Gamma(a + b)}{\\Gamma(a)\\Gamma(b)} t^{a-1} (1-t)^{b-1}\\,dt\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "p : array_like\n"
+    "    Cumulative probability, in [0, 1].\n"
+    "b : array_like\n"
+    "    Shape parameter (`b` > 0).\n"
+    "x : array_like\n"
+    "    The quantile, in [0, 1].\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "a : scalar or ndarray\n"
+    "    The value of the shape parameter `a` such that `betainc(a, b, x) = p`.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "betainc : Regularized incomplete beta function\n"
+    "betaincinv : Inverse of the regularized incomplete beta function\n"
+    "btdtrib : Inverse of the beta cumulative distribution function, with respect to `b`.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "This function wraps the ``ibeta_inva`` routine from the\n"
+    "Boost Math C++ library [1]_.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] The Boost Developers. \"Boost C++ Libraries\". https://www.boost.org/.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import scipy.special as sc\n"
+    "\n"
+    "This function is the inverse of `betainc` for fixed\n"
+    "values of :math:`b` and :math:`x`.\n"
+    "\n"
+    ">>> a, b, x = 1.2, 3.1, 0.2\n"
+    ">>> y = sc.betainc(a, b, x)\n"
+    ">>> sc.btdtria(y, b, x)\n"
+    "1.2")
+ufunc_btdtria_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # loop 0: registered with NPY_FLOAT type codes
+ufunc_btdtria_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop 1: registered with NPY_DOUBLE type codes
+ufunc_btdtria_types[0] = <char>NPY_FLOAT  # loop 0 type codes occupy entries 0-3
+ufunc_btdtria_types[1] = <char>NPY_FLOAT
+ufunc_btdtria_types[2] = <char>NPY_FLOAT
+ufunc_btdtria_types[3] = <char>NPY_FLOAT
+ufunc_btdtria_types[4] = <char>NPY_DOUBLE  # loop 1 type codes occupy entries 4-7
+ufunc_btdtria_types[5] = <char>NPY_DOUBLE
+ufunc_btdtria_types[6] = <char>NPY_DOUBLE
+ufunc_btdtria_types[7] = <char>NPY_DOUBLE
+ufunc_btdtria_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_ibeta_inva_float  # loop 0 kernel
+ufunc_btdtria_ptr[2*0+1] = <void*>(<char*>"btdtria")  # ufunc name stored alongside the kernel
+ufunc_btdtria_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_ibeta_inva_double  # loop 1 kernel
+ufunc_btdtria_ptr[2*1+1] = <void*>(<char*>"btdtria")
+ufunc_btdtria_data[0] = &ufunc_btdtria_ptr[2*0]  # loop 0 data -> its (kernel, name) pair
+ufunc_btdtria_data[1] = &ufunc_btdtria_ptr[2*1]  # loop 1 data -> its (kernel, name) pair
+btdtria = np.PyUFunc_FromFuncAndData(ufunc_btdtria_loops, ufunc_btdtria_data, ufunc_btdtria_types, 2, 3, 1, 0, 'btdtria', ufunc_btdtria_doc, 0)  # 2 loops, 3 inputs, 1 output
+
+cdef np.PyUFuncGenericFunction ufunc_btdtrib_loops[2]  # one inner-loop function per registered type signature
+cdef void *ufunc_btdtrib_ptr[4]  # flat (kernel pointer, name string) pairs, one pair per loop
+cdef void *ufunc_btdtrib_data[2]  # per-loop data pointers, each addressing a pair in ufunc_btdtrib_ptr
+cdef char ufunc_btdtrib_types[8]  # NumPy type codes: 2 loops x (3 inputs + 1 output)
+cdef char *ufunc_btdtrib_doc = (  # docstring text handed to PyUFunc_FromFuncAndData below
+    "btdtrib(a, p, x, out=None)\n"
+    "\n"
+    "Inverse of `betainc` with respect to `b`.\n"
+    "\n"
+    "This is the inverse of the beta cumulative distribution function, `betainc`,\n"
+    "considered as a function of `b`, returning the value of `b` for which\n"
+    "`betainc(a, b, x) = p`, or\n"
+    "\n"
+    ".. math::\n"
+    "    p = \\int_0^x \\frac{\\Gamma(a + b)}{\\Gamma(a)\\Gamma(b)} t^{a-1} (1-t)^{b-1}\\,dt\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "a : array_like\n"
+    "    Shape parameter (`a` > 0).\n"
+    "p : array_like\n"
+    "    Cumulative probability, in [0, 1].\n"
+    "x : array_like\n"
+    "    The quantile, in [0, 1].\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "b : scalar or ndarray\n"
+    "    The value of the shape parameter `b` such that `betainc(a, b, x) = p`.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "betainc : Regularized incomplete beta function\n"
+    "betaincinv : Inverse of the regularized incomplete beta function with\n"
+    "             respect to `x`.\n"
+    "btdtria : Inverse of the beta cumulative distribution function, with respect to `a`.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "Wrapper for the `ibeta_invb` routine from the Boost Math C++ library [1]_.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] The Boost Developers. \"Boost C++ Libraries\". https://www.boost.org/.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import scipy.special as sc\n"
+    ">>> a, b, x = 1.2, 3.1, 0.2\n"
+    ">>> y = sc.betainc(a, b, x)\n"
+    "\n"
+    "`btdtrib` is the inverse of `betainc` for fixed values of :math:`a` and\n"
+    ":math:`x`:\n"
+    "\n"
+    ">>> sc.btdtrib(a, y, x)\n"
+    "3.1")
+ufunc_btdtrib_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # loop 0: registered with NPY_FLOAT type codes
+ufunc_btdtrib_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop 1: registered with NPY_DOUBLE type codes
+ufunc_btdtrib_types[0] = <char>NPY_FLOAT  # loop 0 type codes occupy entries 0-3
+ufunc_btdtrib_types[1] = <char>NPY_FLOAT
+ufunc_btdtrib_types[2] = <char>NPY_FLOAT
+ufunc_btdtrib_types[3] = <char>NPY_FLOAT
+ufunc_btdtrib_types[4] = <char>NPY_DOUBLE  # loop 1 type codes occupy entries 4-7
+ufunc_btdtrib_types[5] = <char>NPY_DOUBLE
+ufunc_btdtrib_types[6] = <char>NPY_DOUBLE
+ufunc_btdtrib_types[7] = <char>NPY_DOUBLE
+ufunc_btdtrib_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_ibeta_invb_float  # loop 0 kernel
+ufunc_btdtrib_ptr[2*0+1] = <void*>(<char*>"btdtrib")  # ufunc name stored alongside the kernel
+ufunc_btdtrib_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_ibeta_invb_double  # loop 1 kernel
+ufunc_btdtrib_ptr[2*1+1] = <void*>(<char*>"btdtrib")
+ufunc_btdtrib_data[0] = &ufunc_btdtrib_ptr[2*0]  # loop 0 data -> its (kernel, name) pair
+ufunc_btdtrib_data[1] = &ufunc_btdtrib_ptr[2*1]  # loop 1 data -> its (kernel, name) pair
+btdtrib = np.PyUFunc_FromFuncAndData(ufunc_btdtrib_loops, ufunc_btdtrib_data, ufunc_btdtrib_types, 2, 3, 1, 0, 'btdtrib', ufunc_btdtrib_doc, 0)  # 2 loops, 3 inputs, 1 output
+
+cdef np.PyUFuncGenericFunction ufunc_chdtr_loops[2]  # one inner-loop function per registered type signature
+cdef void *ufunc_chdtr_ptr[4]  # flat (kernel pointer, name string) pairs, one pair per loop
+cdef void *ufunc_chdtr_data[2]  # per-loop data pointers, each addressing a pair in ufunc_chdtr_ptr
+cdef char ufunc_chdtr_types[6]  # NumPy type codes: 2 loops x (2 inputs + 1 output)
+cdef char *ufunc_chdtr_doc = (  # docstring text handed to PyUFunc_FromFuncAndData below
+    "chdtr(v, x, out=None)\n"
+    "\n"
+    "Chi square cumulative distribution function.\n"
+    "\n"
+    "Returns the area under the left tail (from 0 to `x`) of the Chi\n"
+    "square probability density function with `v` degrees of freedom:\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    \\frac{1}{2^{v/2} \\Gamma(v/2)} \\int_0^x t^{v/2 - 1} e^{-t/2} dt\n"
+    "\n"
+    "Here :math:`\\Gamma` is the Gamma function; see `gamma`. This\n"
+    "integral can be expressed in terms of the regularized lower\n"
+    "incomplete gamma function `gammainc` as\n"
+    "``gammainc(v / 2, x / 2)``. [1]_\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "v : array_like\n"
+    "    Degrees of freedom.\n"
+    "x : array_like\n"
+    "    Upper bound of the integral.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results.\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    Values of the cumulative distribution function.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "chdtrc, chdtri, chdtriv, gammainc\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Chi-Square distribution,\n"
+    "    https://www.itl.nist.gov/div898/handbook/eda/section3/eda3666.htm\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> import scipy.special as sc\n"
+    "\n"
+    "It can be expressed in terms of the regularized lower incomplete\n"
+    "gamma function.\n"
+    "\n"
+    ">>> v = 1\n"
+    ">>> x = np.arange(4)\n"
+    ">>> sc.chdtr(v, x)\n"
+    "array([0.        , 0.68268949, 0.84270079, 0.91673548])\n"
+    ">>> sc.gammainc(v / 2, x / 2)\n"
+    "array([0.        , 0.68268949, 0.84270079, 0.91673548])")
+ufunc_chdtr_loops[0] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f  # loop 0: registered with NPY_FLOAT type codes
+ufunc_chdtr_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d  # loop 1: registered with NPY_DOUBLE type codes
+ufunc_chdtr_types[0] = <char>NPY_FLOAT  # loop 0 type codes occupy entries 0-2
+ufunc_chdtr_types[1] = <char>NPY_FLOAT
+ufunc_chdtr_types[2] = <char>NPY_FLOAT
+ufunc_chdtr_types[3] = <char>NPY_DOUBLE  # loop 1 type codes occupy entries 3-5
+ufunc_chdtr_types[4] = <char>NPY_DOUBLE
+ufunc_chdtr_types[5] = <char>NPY_DOUBLE
+ufunc_chdtr_ptr[2*0] = <void*>_func_xsf_chdtr  # loop 0 kernel (the same kernel serves both loops)
+ufunc_chdtr_ptr[2*0+1] = <void*>(<char*>"chdtr")  # ufunc name stored alongside the kernel
+ufunc_chdtr_ptr[2*1] = <void*>_func_xsf_chdtr  # loop 1 kernel
+ufunc_chdtr_ptr[2*1+1] = <void*>(<char*>"chdtr")
+ufunc_chdtr_data[0] = &ufunc_chdtr_ptr[2*0]  # loop 0 data -> its (kernel, name) pair
+ufunc_chdtr_data[1] = &ufunc_chdtr_ptr[2*1]  # loop 1 data -> its (kernel, name) pair
+chdtr = np.PyUFunc_FromFuncAndData(ufunc_chdtr_loops, ufunc_chdtr_data, ufunc_chdtr_types, 2, 2, 1, 0, 'chdtr', ufunc_chdtr_doc, 0)  # 2 loops, 2 inputs, 1 output
+
+cdef np.PyUFuncGenericFunction ufunc_chdtrc_loops[2]  # one inner-loop function per registered type signature
+cdef void *ufunc_chdtrc_ptr[4]  # flat (kernel pointer, name string) pairs, one pair per loop
+cdef void *ufunc_chdtrc_data[2]  # per-loop data pointers, each addressing a pair in ufunc_chdtrc_ptr
+cdef char ufunc_chdtrc_types[6]  # NumPy type codes: 2 loops x (2 inputs + 1 output)
+cdef char *ufunc_chdtrc_doc = (  # docstring text handed to PyUFunc_FromFuncAndData below
+    "chdtrc(v, x, out=None)\n"
+    "\n"
+    "Chi square survival function.\n"
+    "\n"
+    "Returns the area under the right hand tail (from `x` to infinity)\n"
+    "of the Chi square probability density function with `v` degrees of\n"
+    "freedom:\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    \\frac{1}{2^{v/2} \\Gamma(v/2)} \\int_x^\\infty t^{v/2 - 1} e^{-t/2} dt\n"
+    "\n"
+    "Here :math:`\\Gamma` is the Gamma function; see `gamma`. This\n"
+    "integral can be expressed in terms of the regularized upper\n"
+    "incomplete gamma function `gammaincc` as\n"
+    "``gammaincc(v / 2, x / 2)``. [1]_\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "v : array_like\n"
+    "    Degrees of freedom.\n"
+    "x : array_like\n"
+    "    Lower bound of the integral.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results.\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    Values of the survival function.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "chdtr, chdtri, chdtriv, gammaincc\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Chi-Square distribution,\n"
+    "    https://www.itl.nist.gov/div898/handbook/eda/section3/eda3666.htm\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> import scipy.special as sc\n"
+    "\n"
+    "It can be expressed in terms of the regularized upper incomplete\n"
+    "gamma function.\n"
+    "\n"
+    ">>> v = 1\n"
+    ">>> x = np.arange(4)\n"
+    ">>> sc.chdtrc(v, x)\n"
+    "array([1.        , 0.31731051, 0.15729921, 0.08326452])\n"
+    ">>> sc.gammaincc(v / 2, x / 2)\n"
+    "array([1.        , 0.31731051, 0.15729921, 0.08326452])")
+ufunc_chdtrc_loops[0] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f  # loop 0: registered with NPY_FLOAT type codes
+ufunc_chdtrc_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d  # loop 1: registered with NPY_DOUBLE type codes
+ufunc_chdtrc_types[0] = <char>NPY_FLOAT  # loop 0 type codes occupy entries 0-2
+ufunc_chdtrc_types[1] = <char>NPY_FLOAT
+ufunc_chdtrc_types[2] = <char>NPY_FLOAT
+ufunc_chdtrc_types[3] = <char>NPY_DOUBLE  # loop 1 type codes occupy entries 3-5
+ufunc_chdtrc_types[4] = <char>NPY_DOUBLE
+ufunc_chdtrc_types[5] = <char>NPY_DOUBLE
+ufunc_chdtrc_ptr[2*0] = <void*>_func_xsf_chdtrc  # loop 0 kernel (the same kernel serves both loops)
+ufunc_chdtrc_ptr[2*0+1] = <void*>(<char*>"chdtrc")  # ufunc name stored alongside the kernel
+ufunc_chdtrc_ptr[2*1] = <void*>_func_xsf_chdtrc  # loop 1 kernel
+ufunc_chdtrc_ptr[2*1+1] = <void*>(<char*>"chdtrc")
+ufunc_chdtrc_data[0] = &ufunc_chdtrc_ptr[2*0]  # loop 0 data -> its (kernel, name) pair
+ufunc_chdtrc_data[1] = &ufunc_chdtrc_ptr[2*1]  # loop 1 data -> its (kernel, name) pair
+chdtrc = np.PyUFunc_FromFuncAndData(ufunc_chdtrc_loops, ufunc_chdtrc_data, ufunc_chdtrc_types, 2, 2, 1, 0, 'chdtrc', ufunc_chdtrc_doc, 0)  # 2 loops, 2 inputs, 1 output
+
+cdef np.PyUFuncGenericFunction ufunc_chdtri_loops[2]  # one inner-loop function per registered type signature
+cdef void *ufunc_chdtri_ptr[4]  # flat (kernel pointer, name string) pairs, one pair per loop
+cdef void *ufunc_chdtri_data[2]  # per-loop data pointers, each addressing a pair in ufunc_chdtri_ptr
+cdef char ufunc_chdtri_types[6]  # NumPy type codes: 2 loops x (2 inputs + 1 output)
+cdef char *ufunc_chdtri_doc = (  # docstring text handed to PyUFunc_FromFuncAndData below
+    "chdtri(v, p, out=None)\n"
+    "\n"
+    "Inverse to `chdtrc` with respect to `x`.\n"
+    "\n"
+    "Returns `x` such that ``chdtrc(v, x) == p``.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "v : array_like\n"
+    "    Degrees of freedom.\n"
+    "p : array_like\n"
+    "    Probability.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results.\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "x : scalar or ndarray\n"
+    "    Value so that the probability a Chi square random variable\n"
+    "    with `v` degrees of freedom is greater than `x` equals `p`.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "chdtrc, chdtr, chdtriv\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Chi-Square distribution,\n"
+    "    https://www.itl.nist.gov/div898/handbook/eda/section3/eda3666.htm\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import scipy.special as sc\n"
+    "\n"
+    "It inverts `chdtrc`.\n"
+    "\n"
+    ">>> v, p = 1, 0.3\n"
+    ">>> sc.chdtrc(v, sc.chdtri(v, p))\n"
+    "0.3\n"
+    ">>> x = 1\n"
+    ">>> sc.chdtri(v, sc.chdtrc(v, x))\n"
+    "1.0")
+ufunc_chdtri_loops[0] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f  # loop 0: registered with NPY_FLOAT type codes
+ufunc_chdtri_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d  # loop 1: registered with NPY_DOUBLE type codes
+ufunc_chdtri_types[0] = <char>NPY_FLOAT  # loop 0 type codes occupy entries 0-2
+ufunc_chdtri_types[1] = <char>NPY_FLOAT
+ufunc_chdtri_types[2] = <char>NPY_FLOAT
+ufunc_chdtri_types[3] = <char>NPY_DOUBLE  # loop 1 type codes occupy entries 3-5
+ufunc_chdtri_types[4] = <char>NPY_DOUBLE
+ufunc_chdtri_types[5] = <char>NPY_DOUBLE
+ufunc_chdtri_ptr[2*0] = <void*>_func_xsf_chdtri  # loop 0 kernel (the same kernel serves both loops)
+ufunc_chdtri_ptr[2*0+1] = <void*>(<char*>"chdtri")  # ufunc name stored alongside the kernel
+ufunc_chdtri_ptr[2*1] = <void*>_func_xsf_chdtri  # loop 1 kernel
+ufunc_chdtri_ptr[2*1+1] = <void*>(<char*>"chdtri")
+ufunc_chdtri_data[0] = &ufunc_chdtri_ptr[2*0]  # loop 0 data -> its (kernel, name) pair
+ufunc_chdtri_data[1] = &ufunc_chdtri_ptr[2*1]  # loop 1 data -> its (kernel, name) pair
+chdtri = np.PyUFunc_FromFuncAndData(ufunc_chdtri_loops, ufunc_chdtri_data, ufunc_chdtri_types, 2, 2, 1, 0, 'chdtri', ufunc_chdtri_doc, 0)  # 2 loops, 2 inputs, 1 output
+
+cdef np.PyUFuncGenericFunction ufunc_chdtriv_loops[2]  # one inner-loop function per registered type signature
+cdef void *ufunc_chdtriv_ptr[4]  # two slots per loop: kernel pointer, then the ufunc name as a C string
+cdef void *ufunc_chdtriv_data[2]  # per-loop data pointer; each is the address of that loop's pair in ufunc_chdtriv_ptr
+cdef char ufunc_chdtriv_types[6]  # 2 loops x (2 inputs + 1 output) NumPy type codes
+cdef char *ufunc_chdtriv_doc = (  # docstring passed to PyUFunc_FromFuncAndData below
+    "chdtriv(p, x, out=None)\n"
+    "\n"
+    "Inverse to `chdtr` with respect to `v`.\n"
+    "\n"
+    "Returns `v` such that ``chdtr(v, x) == p``.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "p : array_like\n"
+    "    Probability that the Chi square random variable is less than\n"
+    "    or equal to `x`.\n"
+    "x : array_like\n"
+    "    Nonnegative input.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results.\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    Degrees of freedom.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "chdtr, chdtrc, chdtri\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "This function wraps routines from the Boost Math C++ library [1]_.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] The Boost Developers. \"Boost C++ Libraries\". https://www.boost.org/.\n"
+    ".. [2] Chi-Square distribution,\n"
+    "    https://www.itl.nist.gov/div898/handbook/eda/section3/eda3666.htm\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import scipy.special as sc\n"
+    "\n"
+    "It inverts `chdtr`.\n"
+    "\n"
+    ">>> p, x = 0.5, 1\n"
+    ">>> sc.chdtr(sc.chdtriv(p, x), x)\n"
+    "0.5000000000000003\n"
+    ">>> v = 1\n"
+    ">>> sc.chdtriv(sc.chdtr(v, x), x)\n"
+    "1.0")
+ufunc_chdtriv_loops[0] = <np.PyUFuncGenericFunction>loop_f_ff__As_ff_f  # NOTE(review): name suggests a native single-precision kernel call -- confirm in the loop's definition
+ufunc_chdtriv_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc_chdtriv_types[0] = <char>NPY_FLOAT  # loop 0 signature occupies indices 0-2 (all NPY_FLOAT)
+ufunc_chdtriv_types[1] = <char>NPY_FLOAT
+ufunc_chdtriv_types[2] = <char>NPY_FLOAT
+ufunc_chdtriv_types[3] = <char>NPY_DOUBLE  # loop 1 signature occupies indices 3-5 (all NPY_DOUBLE)
+ufunc_chdtriv_types[4] = <char>NPY_DOUBLE
+ufunc_chdtriv_types[5] = <char>NPY_DOUBLE
+ufunc_chdtriv_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_chdtriv_float  # loop 0 uses the dedicated *_float export
+ufunc_chdtriv_ptr[2*0+1] = <void*>(<char*>"chdtriv")  # name stored beside the kernel -- presumably for error reporting inside the loop; confirm
+ufunc_chdtriv_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_chdtriv_double  # loop 1 uses the *_double export
+ufunc_chdtriv_ptr[2*1+1] = <void*>(<char*>"chdtriv")
+ufunc_chdtriv_data[0] = &ufunc_chdtriv_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc_chdtriv_data[1] = &ufunc_chdtriv_ptr[2*1]
+chdtriv = np.PyUFunc_FromFuncAndData(ufunc_chdtriv_loops, ufunc_chdtriv_data, ufunc_chdtriv_types, 2, 2, 1, 0, 'chdtriv', ufunc_chdtriv_doc, 0)  # after the types array: 2 loops, 2 inputs, 1 output
+
+cdef np.PyUFuncGenericFunction ufunc_chndtr_loops[2]  # one inner-loop function per registered type signature
+cdef void *ufunc_chndtr_ptr[4]  # two slots per loop: kernel pointer, then the ufunc name as a C string
+cdef void *ufunc_chndtr_data[2]  # per-loop data pointer; each is the address of that loop's pair in ufunc_chndtr_ptr
+cdef char ufunc_chndtr_types[8]  # 2 loops x (3 inputs + 1 output) NumPy type codes
+cdef char *ufunc_chndtr_doc = (  # docstring passed to PyUFunc_FromFuncAndData below
+    "chndtr(x, df, nc, out=None)\n"
+    "\n"
+    "Non-central chi square cumulative distribution function\n"
+    "\n"
+    "The cumulative distribution function is given by:\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    P(\\chi^{\\prime 2} \\vert \\nu, \\lambda) =\\sum_{j=0}^{\\infty}\n"
+    "    e^{-\\lambda /2}\n"
+    "    \\frac{(\\lambda /2)^j}{j!} P(\\chi^{\\prime 2} \\vert \\nu + 2j),\n"
+    "\n"
+    "where :math:`\\nu > 0` is the degrees of freedom (``df``) and\n"
+    ":math:`\\lambda \\geq 0` is the non-centrality parameter (``nc``).\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Upper bound of the integral; must satisfy ``x >= 0``\n"
+    "df : array_like\n"
+    "    Degrees of freedom; must satisfy ``df > 0``\n"
+    "nc : array_like\n"
+    "    Non-centrality parameter; must satisfy ``nc >= 0``\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "x : scalar or ndarray\n"
+    "    Value of the non-central chi square cumulative distribution function.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "chndtrix: Noncentral Chi Squared distribution quantile\n"
+    "chndtridf: Inverse of `chndtr` with respect to `df`\n"
+    "chndtrinc: Inverse of `chndtr` with respect to `nc`\n"
+    "scipy.stats.ncx2: Non-central chi-squared distribution\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "The noncentral chi squared distribution is also available in\n"
+    "`scipy.stats.ncx2`. ``scipy.stats.ncx2.cdf`` is equivalent to `chndtr`.\n"
+    "\n"
+    "This function wraps routines from the Boost Math C++ library [1]_.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] The Boost Developers. \"Boost C++ Libraries\". https://www.boost.org/.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> import scipy.special as sc\n"
+    "\n"
+    "Compute the noncentral chi squared distribution CDF at one point.\n"
+    "\n"
+    ">>> x = 4.0\n"
+    ">>> df = 1.0\n"
+    ">>> nc = 5.0\n"
+    ">>> sc.chndtr(x, df, nc)\n"
+    "0.40667858759710945\n"
+    "\n"
+    "Plot the noncentral chi squared distribution CDF for different parameters.\n"
+    "\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> x = np.linspace(0, 40, 1000)\n"
+    ">>> plt.plot(x, sc.chndtr(x, 1, 5), label=r\"$df=1,\\ nc=5$\")\n"
+    ">>> plt.plot(x, sc.chndtr(x, 5, 10), label=r\"$df=5,\\ nc=10$\")\n"
+    ">>> plt.legend()\n"
+    ">>> plt.show()")
+ufunc_chndtr_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # NOTE(review): name suggests a native single-precision kernel call -- confirm in the loop's definition
+ufunc_chndtr_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc_chndtr_types[0] = <char>NPY_FLOAT  # loop 0 signature occupies indices 0-3 (all NPY_FLOAT)
+ufunc_chndtr_types[1] = <char>NPY_FLOAT
+ufunc_chndtr_types[2] = <char>NPY_FLOAT
+ufunc_chndtr_types[3] = <char>NPY_FLOAT
+ufunc_chndtr_types[4] = <char>NPY_DOUBLE  # loop 1 signature occupies indices 4-7 (all NPY_DOUBLE)
+ufunc_chndtr_types[5] = <char>NPY_DOUBLE
+ufunc_chndtr_types[6] = <char>NPY_DOUBLE
+ufunc_chndtr_types[7] = <char>NPY_DOUBLE
+ufunc_chndtr_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_ncx2_cdf_float  # loop 0 uses the dedicated *_float export
+ufunc_chndtr_ptr[2*0+1] = <void*>(<char*>"chndtr")  # name stored beside the kernel -- presumably for error reporting inside the loop; confirm
+ufunc_chndtr_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_ncx2_cdf_double  # loop 1 uses the *_double export
+ufunc_chndtr_ptr[2*1+1] = <void*>(<char*>"chndtr")
+ufunc_chndtr_data[0] = &ufunc_chndtr_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc_chndtr_data[1] = &ufunc_chndtr_ptr[2*1]
+chndtr = np.PyUFunc_FromFuncAndData(ufunc_chndtr_loops, ufunc_chndtr_data, ufunc_chndtr_types, 2, 3, 1, 0, 'chndtr', ufunc_chndtr_doc, 0)  # after the types array: 2 loops, 3 inputs, 1 output
+
+cdef np.PyUFuncGenericFunction ufunc_chndtridf_loops[2]  # one inner-loop function per registered type signature
+cdef void *ufunc_chndtridf_ptr[4]  # two slots per loop: kernel pointer, then the ufunc name as a C string
+cdef void *ufunc_chndtridf_data[2]  # per-loop data pointer; each is the address of that loop's pair in ufunc_chndtridf_ptr
+cdef char ufunc_chndtridf_types[8]  # 2 loops x (3 inputs + 1 output) NumPy type codes
+cdef char *ufunc_chndtridf_doc = (  # docstring passed to PyUFunc_FromFuncAndData below
+    "chndtridf(x, p, nc, out=None)\n"
+    "\n"
+    "Inverse to `chndtr` vs `df`\n"
+    "\n"
+    "Calculated using a search to find a value for `df` that produces the\n"
+    "desired value of `p`.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Upper bound of the integral; must satisfy ``x >= 0``\n"
+    "p : array_like\n"
+    "    Probability; must satisfy ``0 <= p < 1``\n"
+    "nc : array_like\n"
+    "    Non-centrality parameter; must satisfy ``nc >= 0``\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "df : scalar or ndarray\n"
+    "    Degrees of freedom\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "chndtr : Noncentral chi-squared distribution CDF\n"
+    "chndtrix : inverse of `chndtr` with respect to `x`\n"
+    "chndtrinc : inverse of `chndtr` with respect to `nc`\n"
+    "scipy.stats.ncx2 : Non-central chi-squared distribution\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "The noncentral chi squared distribution is also available in\n"
+    "`scipy.stats.ncx2`.\n"
+    "\n"
+    "This function wraps routines from the Boost Math C++ library [1]_.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] The Boost Developers. \"Boost C++ Libraries\". https://www.boost.org/.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> from scipy.special import chndtridf, chndtr\n"
+    "\n"
+    "Compute the noncentral chi squared distribution CDF at one point.\n"
+    "\n"
+    ">>> x, df, nc = 3, 5, 10\n"
+    ">>> p = chndtr(x, df, nc)\n"
+    "\n"
+    "`chndtridf` is the inverse of `chndtr` with respect to `df`:\n"
+    "\n"
+    ">>> chndtridf(x, p, nc)\n"
+    "5.0")
+ufunc_chndtridf_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # NOTE(review): name suggests a native single-precision kernel call -- confirm in the loop's definition
+ufunc_chndtridf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc_chndtridf_types[0] = <char>NPY_FLOAT  # loop 0 signature occupies indices 0-3 (all NPY_FLOAT)
+ufunc_chndtridf_types[1] = <char>NPY_FLOAT
+ufunc_chndtridf_types[2] = <char>NPY_FLOAT
+ufunc_chndtridf_types[3] = <char>NPY_FLOAT
+ufunc_chndtridf_types[4] = <char>NPY_DOUBLE  # loop 1 signature occupies indices 4-7 (all NPY_DOUBLE)
+ufunc_chndtridf_types[5] = <char>NPY_DOUBLE
+ufunc_chndtridf_types[6] = <char>NPY_DOUBLE
+ufunc_chndtridf_types[7] = <char>NPY_DOUBLE
+ufunc_chndtridf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_ncx2_find_degrees_of_freedom_float  # loop 0 uses the dedicated *_float export
+ufunc_chndtridf_ptr[2*0+1] = <void*>(<char*>"chndtridf")  # name stored beside the kernel -- presumably for error reporting inside the loop; confirm
+ufunc_chndtridf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_ncx2_find_degrees_of_freedom_double  # loop 1 uses the *_double export
+ufunc_chndtridf_ptr[2*1+1] = <void*>(<char*>"chndtridf")
+ufunc_chndtridf_data[0] = &ufunc_chndtridf_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc_chndtridf_data[1] = &ufunc_chndtridf_ptr[2*1]
+chndtridf = np.PyUFunc_FromFuncAndData(ufunc_chndtridf_loops, ufunc_chndtridf_data, ufunc_chndtridf_types, 2, 3, 1, 0, 'chndtridf', ufunc_chndtridf_doc, 0)  # after the types array: 2 loops, 3 inputs, 1 output
+
+cdef np.PyUFuncGenericFunction ufunc_chndtrinc_loops[2]  # one inner-loop function per registered type signature
+cdef void *ufunc_chndtrinc_ptr[4]  # two slots per loop: kernel pointer, then the ufunc name as a C string
+cdef void *ufunc_chndtrinc_data[2]  # per-loop data pointer; each is the address of that loop's pair in ufunc_chndtrinc_ptr
+cdef char ufunc_chndtrinc_types[8]  # 2 loops x (3 inputs + 1 output) NumPy type codes
+cdef char *ufunc_chndtrinc_doc = (  # docstring passed to PyUFunc_FromFuncAndData below
+    "chndtrinc(x, df, p, out=None)\n"
+    "\n"
+    "Inverse to `chndtr` vs `nc`\n"
+    "\n"
+    "Calculated using a search to find a value for `nc` that produces the\n"
+    "desired value of `p`.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Upper bound of the integral; must satisfy ``x >= 0``\n"
+    "df : array_like\n"
+    "    Degrees of freedom; must satisfy ``df > 0``\n"
+    "p : array_like\n"
+    "    Probability; must satisfy ``0 <= p < 1``\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "nc : scalar or ndarray\n"
+    "    Non-centrality\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "chndtr : Noncentral chi-squared distribution CDF\n"
+    "chndtridf : inverse of `chndtr` with respect to `df`\n"
+    "chndtrix : inverse of `chndtr` with respect to `x`\n"
+    "scipy.stats.ncx2 : Non-central chi-squared distribution\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "The noncentral chi squared distribution is also available in\n"
+    "`scipy.stats.ncx2`.\n"
+    "\n"
+    "This function wraps routines from the Boost Math C++ library [1]_.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] The Boost Developers. \"Boost C++ Libraries\". https://www.boost.org/.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> from scipy.special import chndtrinc, chndtr\n"
+    "\n"
+    "Compute the noncentral chi squared distribution CDF at one point.\n"
+    "\n"
+    ">>> x, df, nc = 3, 5, 10\n"
+    ">>> p = chndtr(x, df, nc)\n"
+    "\n"
+    "`chndtrinc` is the inverse of `chndtr` with respect to `nc`:\n"
+    "\n"
+    ">>> chndtrinc(x, df, p)\n"
+    "10.0")
+ufunc_chndtrinc_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # NOTE(review): name suggests a native single-precision kernel call -- confirm in the loop's definition
+ufunc_chndtrinc_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc_chndtrinc_types[0] = <char>NPY_FLOAT  # loop 0 signature occupies indices 0-3 (all NPY_FLOAT)
+ufunc_chndtrinc_types[1] = <char>NPY_FLOAT
+ufunc_chndtrinc_types[2] = <char>NPY_FLOAT
+ufunc_chndtrinc_types[3] = <char>NPY_FLOAT
+ufunc_chndtrinc_types[4] = <char>NPY_DOUBLE  # loop 1 signature occupies indices 4-7 (all NPY_DOUBLE)
+ufunc_chndtrinc_types[5] = <char>NPY_DOUBLE
+ufunc_chndtrinc_types[6] = <char>NPY_DOUBLE
+ufunc_chndtrinc_types[7] = <char>NPY_DOUBLE
+ufunc_chndtrinc_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_ncx2_find_noncentrality_float  # loop 0 uses the dedicated *_float export
+ufunc_chndtrinc_ptr[2*0+1] = <void*>(<char*>"chndtrinc")  # name stored beside the kernel -- presumably for error reporting inside the loop; confirm
+ufunc_chndtrinc_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_ncx2_find_noncentrality_double  # loop 1 uses the *_double export
+ufunc_chndtrinc_ptr[2*1+1] = <void*>(<char*>"chndtrinc")
+ufunc_chndtrinc_data[0] = &ufunc_chndtrinc_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc_chndtrinc_data[1] = &ufunc_chndtrinc_ptr[2*1]
+chndtrinc = np.PyUFunc_FromFuncAndData(ufunc_chndtrinc_loops, ufunc_chndtrinc_data, ufunc_chndtrinc_types, 2, 3, 1, 0, 'chndtrinc', ufunc_chndtrinc_doc, 0)  # after the types array: 2 loops, 3 inputs, 1 output
+
+cdef np.PyUFuncGenericFunction ufunc_chndtrix_loops[2]  # one inner-loop function per registered type signature
+cdef void *ufunc_chndtrix_ptr[4]  # two slots per loop: kernel pointer, then the ufunc name as a C string
+cdef void *ufunc_chndtrix_data[2]  # per-loop data pointer; each is the address of that loop's pair in ufunc_chndtrix_ptr
+cdef char ufunc_chndtrix_types[8]  # 2 loops x (3 inputs + 1 output) NumPy type codes
+cdef char *ufunc_chndtrix_doc = (  # docstring passed to PyUFunc_FromFuncAndData below
+    "chndtrix(p, df, nc, out=None)\n"
+    "\n"
+    "Inverse to `chndtr` vs `x`\n"
+    "\n"
+    "Calculated using a search to find a value for `x` that produces the\n"
+    "desired value of `p`.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "p : array_like\n"
+    "    Probability; must satisfy ``0 <= p < 1``\n"
+    "df : array_like\n"
+    "    Degrees of freedom; must satisfy ``df > 0``\n"
+    "nc : array_like\n"
+    "    Non-centrality parameter; must satisfy ``nc >= 0``\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "x : scalar or ndarray\n"
+    "    Value so that the probability a non-central Chi square random variable\n"
+    "    with `df` degrees of freedom and non-centrality, `nc`, is less than or\n"
+    "    equal to `x` equals `p`.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "chndtr : Noncentral chi-squared distribution CDF\n"
+    "chndtridf : inverse of `chndtr` with respect to `df`\n"
+    "chndtrinc : inverse of `chndtr` with respect to `nc`\n"
+    "scipy.stats.ncx2 : Non-central chi-squared distribution\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "The noncentral chi squared distribution is also available in\n"
+    "`scipy.stats.ncx2`. ``scipy.stats.ncx2.ppf`` is equivalent to `chndtrix`.\n"
+    "\n"
+    "This function wraps routines from the Boost Math C++ library [1]_.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] The Boost Developers. \"Boost C++ Libraries\". https://www.boost.org/.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> from scipy.special import chndtrix, chndtr\n"
+    "\n"
+    "Compute the noncentral chi squared distribution CDF at one point.\n\n"
+    ">>> x, df, nc = 3, 5, 10\n"
+    ">>> p = chndtr(x, df, nc)\n"
+    "\n"
+    "`chndtrix` is the inverse of `chndtr` with respect to `x`:\n"
+    "\n"
+    ">>> chndtrix(p, df, nc)\n"
+    "3.0")
+ufunc_chndtrix_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # NOTE(review): name suggests a native single-precision kernel call -- confirm in the loop's definition
+ufunc_chndtrix_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc_chndtrix_types[0] = <char>NPY_FLOAT  # loop 0 signature occupies indices 0-3 (all NPY_FLOAT)
+ufunc_chndtrix_types[1] = <char>NPY_FLOAT
+ufunc_chndtrix_types[2] = <char>NPY_FLOAT
+ufunc_chndtrix_types[3] = <char>NPY_FLOAT
+ufunc_chndtrix_types[4] = <char>NPY_DOUBLE  # loop 1 signature occupies indices 4-7 (all NPY_DOUBLE)
+ufunc_chndtrix_types[5] = <char>NPY_DOUBLE
+ufunc_chndtrix_types[6] = <char>NPY_DOUBLE
+ufunc_chndtrix_types[7] = <char>NPY_DOUBLE
+ufunc_chndtrix_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_ncx2_ppf_float  # loop 0 uses the dedicated *_float export
+ufunc_chndtrix_ptr[2*0+1] = <void*>(<char*>"chndtrix")  # name stored beside the kernel -- presumably for error reporting inside the loop; confirm
+ufunc_chndtrix_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_ncx2_ppf_double  # loop 1 uses the *_double export
+ufunc_chndtrix_ptr[2*1+1] = <void*>(<char*>"chndtrix")
+ufunc_chndtrix_data[0] = &ufunc_chndtrix_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc_chndtrix_data[1] = &ufunc_chndtrix_ptr[2*1]
+chndtrix = np.PyUFunc_FromFuncAndData(ufunc_chndtrix_loops, ufunc_chndtrix_data, ufunc_chndtrix_types, 2, 3, 1, 0, 'chndtrix', ufunc_chndtrix_doc, 0)  # after the types array: 2 loops, 3 inputs, 1 output
+
+cdef np.PyUFuncGenericFunction ufunc_elliprc_loops[4]  # one inner-loop function per registered type signature
+cdef void *ufunc_elliprc_ptr[8]  # two slots per loop: kernel pointer, then the ufunc name as a C string
+cdef void *ufunc_elliprc_data[4]  # per-loop data pointer; each is the address of that loop's pair in ufunc_elliprc_ptr
+cdef char ufunc_elliprc_types[12]  # 4 loops x (2 inputs + 1 output) NumPy type codes
+cdef char *ufunc_elliprc_doc = (  # docstring passed to PyUFunc_FromFuncAndData below
+    "elliprc(x, y, out=None)\n"
+    "\n"
+    "Degenerate symmetric elliptic integral.\n"
+    "\n"
+    "The function RC is defined as [1]_\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    R_{\\mathrm{C}}(x, y) =\n"
+    "       \\frac{1}{2} \\int_0^{+\\infty} (t + x)^{-1/2} (t + y)^{-1} dt\n"
+    "       = R_{\\mathrm{F}}(x, y, y)\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x, y : array_like\n"
+    "    Real or complex input parameters. `x` can be any number in the\n"
+    "    complex plane cut along the negative real axis. `y` must be non-zero.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "R : scalar or ndarray\n"
+    "    Value of the integral. If `y` is real and negative, the Cauchy\n"
+    "    principal value is returned. If both of `x` and `y` are real, the\n"
+    "    return value is real. Otherwise, the return value is complex.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "elliprf : Completely-symmetric elliptic integral of the first kind.\n"
+    "elliprd : Symmetric elliptic integral of the second kind.\n"
+    "elliprg : Completely-symmetric elliptic integral of the second kind.\n"
+    "elliprj : Symmetric elliptic integral of the third kind.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "RC is a degenerate case of the symmetric integral RF: ``elliprc(x, y) ==\n"
+    "elliprf(x, y, y)``. It is an elementary function rather than an elliptic\n"
+    "integral.\n"
+    "\n"
+    "The code implements Carlson's algorithm based on the duplication theorems\n"
+    "and series expansion up to the 7th order. [2]_\n"
+    "\n"
+    ".. versionadded:: 1.8.0\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] B. C. Carlson, ed., Chapter 19 in \"Digital Library of Mathematical\n"
+    "       Functions,\" NIST, US Dept. of Commerce.\n"
+    "       https://dlmf.nist.gov/19.16.E6\n"
+    ".. [2] B. C. Carlson, \"Numerical computation of real or complex elliptic\n"
+    "       integrals,\" Numer. Algorithm, vol. 10, no. 1, pp. 13-26, 1995.\n"
+    "       https://arxiv.org/abs/math/9409227\n"
+    "       https://doi.org/10.1007/BF02198293\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "Basic homogeneity property:\n"
+    "\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import elliprc\n"
+    "\n"
+    ">>> x = 1.2 + 3.4j\n"
+    ">>> y = 5.\n"
+    ">>> scale = 0.3 + 0.4j\n"
+    ">>> elliprc(scale*x, scale*y)\n"
+    "(0.5484493976710874-0.4169557678995833j)\n"
+    "\n"
+    ">>> elliprc(x, y)/np.sqrt(scale)\n"
+    "(0.5484493976710874-0.41695576789958333j)\n"
+    "\n"
+    "When the two arguments coincide, the integral is particularly\n"
+    "simple:\n"
+    "\n"
+    ">>> x = 1.2 + 3.4j\n"
+    ">>> elliprc(x, x)\n"
+    "(0.4299173120614631-0.3041729818745595j)\n"
+    "\n"
+    ">>> 1/np.sqrt(x)\n"
+    "(0.4299173120614631-0.30417298187455954j)\n"
+    "\n"
+    "Another simple case: the first argument vanishes:\n"
+    "\n"
+    ">>> y = 1.2 + 3.4j\n"
+    ">>> elliprc(0, y)\n"
+    "(0.6753125346116815-0.47779380263880866j)\n"
+    "\n"
+    ">>> np.pi/2/np.sqrt(y)\n"
+    "(0.6753125346116815-0.4777938026388088j)\n"
+    "\n"
+    "When `x` and `y` are both positive, we can express\n"
+    ":math:`R_C(x,y)` in terms of more elementary functions.  For the\n"
+    "case :math:`0 \\le x < y`,\n"
+    "\n"
+    ">>> x = 3.2\n"
+    ">>> y = 6.\n"
+    ">>> elliprc(x, y)\n"
+    "0.44942991498453444\n"
+    "\n"
+    ">>> np.arctan(np.sqrt((y-x)/x))/np.sqrt(y-x)\n"
+    "0.44942991498453433\n"
+    "\n"
+    "And for the case :math:`0 \\le y < x`,\n"
+    "\n"
+    ">>> x = 6.\n"
+    ">>> y = 3.2\n"
+    ">>> elliprc(x,y)\n"
+    "0.4989837501576147\n"
+    "\n"
+    ">>> np.log((np.sqrt(x)+np.sqrt(x-y))/np.sqrt(y))/np.sqrt(x-y)\n"
+    "0.49898375015761476")
+ufunc_elliprc_loops[0] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f  # NOTE(review): name suggests single-precision arrays evaluated through the double kernel -- confirm in the loop's definition
+ufunc_elliprc_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc_elliprc_loops[2] = <np.PyUFuncGenericFunction>loop_D_DD__As_FF_F  # NOTE(review): presumably the complex analogue of loop 0 -- confirm
+ufunc_elliprc_loops[3] = <np.PyUFuncGenericFunction>loop_D_DD__As_DD_D
+ufunc_elliprc_types[0] = <char>NPY_FLOAT  # loop 0 signature occupies indices 0-2 (all NPY_FLOAT)
+ufunc_elliprc_types[1] = <char>NPY_FLOAT
+ufunc_elliprc_types[2] = <char>NPY_FLOAT
+ufunc_elliprc_types[3] = <char>NPY_DOUBLE  # loop 1 signature occupies indices 3-5 (all NPY_DOUBLE)
+ufunc_elliprc_types[4] = <char>NPY_DOUBLE
+ufunc_elliprc_types[5] = <char>NPY_DOUBLE
+ufunc_elliprc_types[6] = <char>NPY_CFLOAT  # loop 2 signature occupies indices 6-8 (all NPY_CFLOAT)
+ufunc_elliprc_types[7] = <char>NPY_CFLOAT
+ufunc_elliprc_types[8] = <char>NPY_CFLOAT
+ufunc_elliprc_types[9] = <char>NPY_CDOUBLE  # loop 3 signature occupies indices 9-11 (all NPY_CDOUBLE)
+ufunc_elliprc_types[10] = <char>NPY_CDOUBLE
+ufunc_elliprc_types[11] = <char>NPY_CDOUBLE
+ufunc_elliprc_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_fellint_RC  # loops 0 and 1 store the same kernel pointer
+ufunc_elliprc_ptr[2*0+1] = <void*>(<char*>"elliprc")  # name stored beside the kernel -- presumably for error reporting inside the loop; confirm
+ufunc_elliprc_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_fellint_RC
+ufunc_elliprc_ptr[2*1+1] = <void*>(<char*>"elliprc")
+ufunc_elliprc_ptr[2*2] = <void*>scipy.special._ufuncs_cxx._export_cellint_RC  # loops 2 and 3 store the same kernel pointer
+ufunc_elliprc_ptr[2*2+1] = <void*>(<char*>"elliprc")
+ufunc_elliprc_ptr[2*3] = <void*>scipy.special._ufuncs_cxx._export_cellint_RC
+ufunc_elliprc_ptr[2*3+1] = <void*>(<char*>"elliprc")
+ufunc_elliprc_data[0] = &ufunc_elliprc_ptr[2*0]  # each loop receives the address of its own (kernel, name) pair
+ufunc_elliprc_data[1] = &ufunc_elliprc_ptr[2*1]
+ufunc_elliprc_data[2] = &ufunc_elliprc_ptr[2*2]
+ufunc_elliprc_data[3] = &ufunc_elliprc_ptr[2*3]
+elliprc = np.PyUFunc_FromFuncAndData(ufunc_elliprc_loops, ufunc_elliprc_data, ufunc_elliprc_types, 4, 2, 1, 0, 'elliprc', ufunc_elliprc_doc, 0)  # after the types array: 4 loops, 2 inputs, 1 output
+
+cdef np.PyUFuncGenericFunction ufunc_elliprd_loops[4]  # one inner-loop function per registered type signature
+cdef void *ufunc_elliprd_ptr[8]  # two slots per loop: kernel pointer, then the ufunc name as a C string
+cdef void *ufunc_elliprd_data[4]  # per-loop data pointer; each is the address of that loop's pair in ufunc_elliprd_ptr
+cdef char ufunc_elliprd_types[16]  # 4 loops x (3 inputs + 1 output) NumPy type codes
+cdef char *ufunc_elliprd_doc = (  # docstring passed to PyUFunc_FromFuncAndData below
+    "elliprd(x, y, z, out=None)\n"
+    "\n"
+    "Symmetric elliptic integral of the second kind.\n"
+    "\n"
+    "The function RD is defined as [1]_\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    R_{\\mathrm{D}}(x, y, z) =\n"
+    "       \\frac{3}{2} \\int_0^{+\\infty} [(t + x) (t + y)]^{-1/2} (t + z)^{-3/2}\n"
+    "       dt\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x, y, z : array_like\n"
+    "    Real or complex input parameters. `x` or `y` can be any number in the\n"
+    "    complex plane cut along the negative real axis, but at most one of them\n"
+    "    can be zero, while `z` must be non-zero.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "R : scalar or ndarray\n"
+    "    Value of the integral. If all of `x`, `y`, and `z` are real, the\n"
+    "    return value is real. Otherwise, the return value is complex.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "elliprc : Degenerate symmetric elliptic integral.\n"
+    "elliprf : Completely-symmetric elliptic integral of the first kind.\n"
+    "elliprg : Completely-symmetric elliptic integral of the second kind.\n"
+    "elliprj : Symmetric elliptic integral of the third kind.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "RD is a degenerate case of the elliptic integral RJ: ``elliprd(x, y, z) ==\n"
+    "elliprj(x, y, z, z)``.\n"
+    "\n"
+    "The code implements Carlson's algorithm based on the duplication theorems\n"
+    "and series expansion up to the 7th order. [2]_\n"
+    "\n"
+    ".. versionadded:: 1.8.0\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] B. C. Carlson, ed., Chapter 19 in \"Digital Library of Mathematical\n"
+    "       Functions,\" NIST, US Dept. of Commerce.\n"
+    "       https://dlmf.nist.gov/19.16.E5\n"
+    ".. [2] B. C. Carlson, \"Numerical computation of real or complex elliptic\n"
+    "       integrals,\" Numer. Algorithm, vol. 10, no. 1, pp. 13-26, 1995.\n"
+    "       https://arxiv.org/abs/math/9409227\n"
+    "       https://doi.org/10.1007/BF02198293\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "Basic homogeneity property:\n"
+    "\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import elliprd\n"
+    "\n"
+    ">>> x = 1.2 + 3.4j\n"
+    ">>> y = 5.\n"
+    ">>> z = 6.\n"
+    ">>> scale = 0.3 + 0.4j\n"
+    ">>> elliprd(scale*x, scale*y, scale*z)\n"
+    "(-0.03703043835680379-0.24500934665683802j)\n"
+    "\n"
+    ">>> elliprd(x, y, z)*np.power(scale, -1.5)\n"
+    "(-0.0370304383568038-0.24500934665683805j)\n"
+    "\n"
+    "All three arguments coincide:\n"
+    "\n"
+    ">>> x = 1.2 + 3.4j\n"
+    ">>> elliprd(x, x, x)\n"
+    "(-0.03986825876151896-0.14051741840449586j)\n"
+    "\n"
+    ">>> np.power(x, -1.5)\n"
+    "(-0.03986825876151894-0.14051741840449583j)\n"
+    "\n"
+    "The so-called \"second lemniscate constant\":\n"
+    "\n"
+    ">>> elliprd(0, 2, 1)/3\n"
+    "0.5990701173677961\n"
+    "\n"
+    ">>> from scipy.special import gamma\n"
+    ">>> gamma(0.75)**2/np.sqrt(2*np.pi)\n"
+    "0.5990701173677959")
+ufunc_elliprd_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f  # NOTE(review): name suggests single-precision arrays evaluated through the double kernel -- confirm in the loop's definition
+ufunc_elliprd_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc_elliprd_loops[2] = <np.PyUFuncGenericFunction>loop_D_DDD__As_FFF_F  # NOTE(review): presumably the complex analogue of loop 0 -- confirm
+ufunc_elliprd_loops[3] = <np.PyUFuncGenericFunction>loop_D_DDD__As_DDD_D
+ufunc_elliprd_types[0] = <char>NPY_FLOAT  # loop 0 signature occupies indices 0-3 (all NPY_FLOAT)
+ufunc_elliprd_types[1] = <char>NPY_FLOAT
+ufunc_elliprd_types[2] = <char>NPY_FLOAT
+ufunc_elliprd_types[3] = <char>NPY_FLOAT
+ufunc_elliprd_types[4] = <char>NPY_DOUBLE  # loop 1 signature occupies indices 4-7 (all NPY_DOUBLE)
+ufunc_elliprd_types[5] = <char>NPY_DOUBLE
+ufunc_elliprd_types[6] = <char>NPY_DOUBLE
+ufunc_elliprd_types[7] = <char>NPY_DOUBLE
+ufunc_elliprd_types[8] = <char>NPY_CFLOAT  # loop 2 signature occupies indices 8-11 (all NPY_CFLOAT)
+ufunc_elliprd_types[9] = <char>NPY_CFLOAT
+ufunc_elliprd_types[10] = <char>NPY_CFLOAT
+ufunc_elliprd_types[11] = <char>NPY_CFLOAT
+ufunc_elliprd_types[12] = <char>NPY_CDOUBLE  # loop 3 signature occupies indices 12-15 (all NPY_CDOUBLE)
+ufunc_elliprd_types[13] = <char>NPY_CDOUBLE
+ufunc_elliprd_types[14] = <char>NPY_CDOUBLE
+ufunc_elliprd_types[15] = <char>NPY_CDOUBLE
+ufunc_elliprd_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_fellint_RD  # loops 0 and 1 store the same kernel pointer
+ufunc_elliprd_ptr[2*0+1] = <void*>(<char*>"elliprd")  # name stored beside the kernel -- presumably for error reporting inside the loop; confirm
+ufunc_elliprd_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_fellint_RD
+ufunc_elliprd_ptr[2*1+1] = <void*>(<char*>"elliprd")
+ufunc_elliprd_ptr[2*2] = <void*>scipy.special._ufuncs_cxx._export_cellint_RD  # loops 2 and 3 store the same kernel pointer
+ufunc_elliprd_ptr[2*2+1] = <void*>(<char*>"elliprd")
+ufunc_elliprd_ptr[2*3] = <void*>scipy.special._ufuncs_cxx._export_cellint_RD
+ufunc_elliprd_ptr[2*3+1] = <void*>(<char*>"elliprd")
+ufunc_elliprd_data[0] = &ufunc_elliprd_ptr[2*0]  # each loop receives the address of its own (kernel, name) pair
+ufunc_elliprd_data[1] = &ufunc_elliprd_ptr[2*1]
+ufunc_elliprd_data[2] = &ufunc_elliprd_ptr[2*2]
+ufunc_elliprd_data[3] = &ufunc_elliprd_ptr[2*3]
+elliprd = np.PyUFunc_FromFuncAndData(ufunc_elliprd_loops, ufunc_elliprd_data, ufunc_elliprd_types, 4, 3, 1, 0, 'elliprd', ufunc_elliprd_doc, 0)  # after the types array: 4 loops, 3 inputs, 1 output
+
+cdef np.PyUFuncGenericFunction ufunc_elliprf_loops[4]  # one inner-loop function per registered type signature
+cdef void *ufunc_elliprf_ptr[8]  # two slots per loop: kernel pointer, then the ufunc name as a C string
+cdef void *ufunc_elliprf_data[4]  # per-loop data pointer; each is the address of that loop's pair in ufunc_elliprf_ptr
+cdef char ufunc_elliprf_types[16]  # 4 loops x (3 inputs + 1 output) NumPy type codes
+cdef char *ufunc_elliprf_doc = (  # docstring passed to PyUFunc_FromFuncAndData below
+    "elliprf(x, y, z, out=None)\n"
+    "\n"
+    "Completely-symmetric elliptic integral of the first kind.\n"
+    "\n"
+    "The function RF is defined as [1]_\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    R_{\\mathrm{F}}(x, y, z) =\n"
+    "       \\frac{1}{2} \\int_0^{+\\infty} [(t + x) (t + y) (t + z)]^{-1/2} dt\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x, y, z : array_like\n"
+    "    Real or complex input parameters. `x`, `y`, or `z` can be any number in\n"
+    "    the complex plane cut along the negative real axis, but at most one of\n"
+    "    them can be zero.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "R : scalar or ndarray\n"
+    "    Value of the integral. If all of `x`, `y`, and `z` are real, the return\n"
+    "    value is real. Otherwise, the return value is complex.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "elliprc : Degenerate symmetric integral.\n"
+    "elliprd : Symmetric elliptic integral of the second kind.\n"
+    "elliprg : Completely-symmetric elliptic integral of the second kind.\n"
+    "elliprj : Symmetric elliptic integral of the third kind.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "The code implements Carlson's algorithm based on the duplication theorems\n"
+    "and series expansion up to the 7th order (cf.:\n"
+    "https://dlmf.nist.gov/19.36.i) and the AGM algorithm for the complete\n"
+    "integral. [2]_\n"
+    "\n"
+    ".. versionadded:: 1.8.0\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] B. C. Carlson, ed., Chapter 19 in \"Digital Library of Mathematical\n"
+    "       Functions,\" NIST, US Dept. of Commerce.\n"
+    "       https://dlmf.nist.gov/19.16.E1\n"
+    ".. [2] B. C. Carlson, \"Numerical computation of real or complex elliptic\n"
+    "       integrals,\" Numer. Algorithm, vol. 10, no. 1, pp. 13-26, 1995.\n"
+    "       https://arxiv.org/abs/math/9409227\n"
+    "       https://doi.org/10.1007/BF02198293\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "Basic homogeneity property:\n"
+    "\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import elliprf\n"
+    "\n"
+    ">>> x = 1.2 + 3.4j\n"
+    ">>> y = 5.\n"
+    ">>> z = 6.\n"
+    ">>> scale = 0.3 + 0.4j\n"
+    ">>> elliprf(scale*x, scale*y, scale*z)\n"
+    "(0.5328051227278146-0.4008623567957094j)\n"
+    "\n"
+    ">>> elliprf(x, y, z)/np.sqrt(scale)\n"
+    "(0.5328051227278147-0.4008623567957095j)\n"
+    "\n"
+    "All three arguments coincide:\n"
+    "\n"
+    ">>> x = 1.2 + 3.4j\n"
+    ">>> elliprf(x, x, x)\n"
+    "(0.42991731206146316-0.30417298187455954j)\n"
+    "\n"
+    ">>> 1/np.sqrt(x)\n"
+    "(0.4299173120614631-0.30417298187455954j)\n"
+    "\n"
+    "The so-called \"first lemniscate constant\":\n"
+    "\n"
+    ">>> elliprf(0, 1, 2)\n"
+    "1.3110287771460598\n"
+    "\n"
+    ">>> from scipy.special import gamma\n"
+    ">>> gamma(0.25)**2/(4*np.sqrt(2*np.pi))\n"
+    "1.3110287771460598")
+ufunc_elliprf_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f  # NOTE(review): name suggests single-precision arrays evaluated through the double kernel -- confirm in the loop's definition
+ufunc_elliprf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc_elliprf_loops[2] = <np.PyUFuncGenericFunction>loop_D_DDD__As_FFF_F  # NOTE(review): presumably the complex analogue of loop 0 -- confirm
+ufunc_elliprf_loops[3] = <np.PyUFuncGenericFunction>loop_D_DDD__As_DDD_D
+ufunc_elliprf_types[0] = <char>NPY_FLOAT  # loop 0 signature occupies indices 0-3 (all NPY_FLOAT)
+ufunc_elliprf_types[1] = <char>NPY_FLOAT
+ufunc_elliprf_types[2] = <char>NPY_FLOAT
+ufunc_elliprf_types[3] = <char>NPY_FLOAT
+ufunc_elliprf_types[4] = <char>NPY_DOUBLE  # loop 1 signature occupies indices 4-7 (all NPY_DOUBLE)
+ufunc_elliprf_types[5] = <char>NPY_DOUBLE
+ufunc_elliprf_types[6] = <char>NPY_DOUBLE
+ufunc_elliprf_types[7] = <char>NPY_DOUBLE
+ufunc_elliprf_types[8] = <char>NPY_CFLOAT  # loop 2 signature occupies indices 8-11 (all NPY_CFLOAT)
+ufunc_elliprf_types[9] = <char>NPY_CFLOAT
+ufunc_elliprf_types[10] = <char>NPY_CFLOAT
+ufunc_elliprf_types[11] = <char>NPY_CFLOAT
+ufunc_elliprf_types[12] = <char>NPY_CDOUBLE  # loop 3 signature occupies indices 12-15 (all NPY_CDOUBLE)
+ufunc_elliprf_types[13] = <char>NPY_CDOUBLE
+ufunc_elliprf_types[14] = <char>NPY_CDOUBLE
+ufunc_elliprf_types[15] = <char>NPY_CDOUBLE
+ufunc_elliprf_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_fellint_RF  # loops 0 and 1 store the same kernel pointer
+ufunc_elliprf_ptr[2*0+1] = <void*>(<char*>"elliprf")  # name stored beside the kernel -- presumably for error reporting inside the loop; confirm
+ufunc_elliprf_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_fellint_RF
+ufunc_elliprf_ptr[2*1+1] = <void*>(<char*>"elliprf")
+ufunc_elliprf_ptr[2*2] = <void*>scipy.special._ufuncs_cxx._export_cellint_RF  # loops 2 and 3 store the same kernel pointer
+ufunc_elliprf_ptr[2*2+1] = <void*>(<char*>"elliprf")
+ufunc_elliprf_ptr[2*3] = <void*>scipy.special._ufuncs_cxx._export_cellint_RF
+ufunc_elliprf_ptr[2*3+1] = <void*>(<char*>"elliprf")
+ufunc_elliprf_data[0] = &ufunc_elliprf_ptr[2*0]  # each loop receives the address of its own (kernel, name) pair
+ufunc_elliprf_data[1] = &ufunc_elliprf_ptr[2*1]
+ufunc_elliprf_data[2] = &ufunc_elliprf_ptr[2*2]
+ufunc_elliprf_data[3] = &ufunc_elliprf_ptr[2*3]
+elliprf = np.PyUFunc_FromFuncAndData(ufunc_elliprf_loops, ufunc_elliprf_data, ufunc_elliprf_types, 4, 3, 1, 0, 'elliprf', ufunc_elliprf_doc, 0)  # after the types array: 4 loops, 3 inputs, 1 output
+
+cdef np.PyUFuncGenericFunction ufunc_elliprg_loops[4]  # one inner-loop function per registered type signature
+cdef void *ufunc_elliprg_ptr[8]  # 2 slots per loop: kernel pointer, then name string
+cdef void *ufunc_elliprg_data[4]  # per-loop pointer into ufunc_elliprg_ptr
+cdef char ufunc_elliprg_types[16]  # 4 loops x (3 inputs + 1 output) type codes
+cdef char *ufunc_elliprg_doc = (
+    "elliprg(x, y, z, out=None)\n"
+    "\n"
+    "Completely-symmetric elliptic integral of the second kind.\n"
+    "\n"
+    "The function RG is defined as [1]_\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    R_{\\mathrm{G}}(x, y, z) =\n"
+    "       \\frac{1}{4} \\int_0^{+\\infty} [(t + x) (t + y) (t + z)]^{-1/2}\n"
+    "       \\left(\\frac{x}{t + x} + \\frac{y}{t + y} + \\frac{z}{t + z}\\right) t\n"
+    "       dt\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x, y, z : array_like\n"
+    "    Real or complex input parameters. `x`, `y`, or `z` can be any number in\n"
+    "    the complex plane cut along the negative real axis.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "R : scalar or ndarray\n"
+    "    Value of the integral. If all of `x`, `y`, and `z` are real, the return\n"
+    "    value is real. Otherwise, the return value is complex.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "elliprc : Degenerate symmetric integral.\n"
+    "elliprd : Symmetric elliptic integral of the second kind.\n"
+    "elliprf : Completely-symmetric elliptic integral of the first kind.\n"
+    "elliprj : Symmetric elliptic integral of the third kind.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "The implementation uses the relation [1]_\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    2 R_{\\mathrm{G}}(x, y, z) =\n"
+    "       z R_{\\mathrm{F}}(x, y, z) -\n"
+    "       \\frac{1}{3} (x - z) (y - z) R_{\\mathrm{D}}(x, y, z) +\n"
+    "       \\sqrt{\\frac{x y}{z}}\n"
+    "\n"
+    "and the symmetry of `x`, `y`, `z` when at least one non-zero parameter can\n"
+    "be chosen as the pivot. When one of the arguments is close to zero, the AGM\n"
+    "method is applied instead. Other special cases are computed following Ref.\n"
+    "[2]_\n"
+    "\n"
+    ".. versionadded:: 1.8.0\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] B. C. Carlson, \"Numerical computation of real or complex elliptic\n"
+    "       integrals,\" Numer. Algorithms, vol. 10, no. 1, pp. 13-26, 1995.\n"
+    "       https://arxiv.org/abs/math/9409227\n"
+    "       https://doi.org/10.1007/BF02198293\n"
+    ".. [2] B. C. Carlson, ed., Chapter 19 in \"Digital Library of Mathematical\n"
+    "       Functions,\" NIST, US Dept. of Commerce.\n"
+    "       https://dlmf.nist.gov/19.16.E1\n"
+    "       https://dlmf.nist.gov/19.20.ii\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "Basic homogeneity property:\n"
+    "\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import elliprg\n"
+    "\n"
+    ">>> x = 1.2 + 3.4j\n"
+    ">>> y = 5.\n"
+    ">>> z = 6.\n"
+    ">>> scale = 0.3 + 0.4j\n"
+    ">>> elliprg(scale*x, scale*y, scale*z)\n"
+    "(1.195936862005246+0.8470988320464167j)\n"
+    "\n"
+    ">>> elliprg(x, y, z)*np.sqrt(scale)\n"
+    "(1.195936862005246+0.8470988320464165j)\n"
+    "\n"
+    "Simplifications:\n"
+    "\n"
+    ">>> elliprg(0, y, y)\n"
+    "1.756203682760182\n"
+    "\n"
+    ">>> 0.25*np.pi*np.sqrt(y)\n"
+    "1.7562036827601817\n"
+    "\n"
+    ">>> elliprg(0, 0, z)\n"
+    "1.224744871391589\n"
+    "\n"
+    ">>> 0.5*np.sqrt(z)\n"
+    "1.224744871391589\n"
+    "\n"
+    "The surface area of a triaxial ellipsoid with semiaxes ``a``, ``b``, and\n"
+    "``c`` is given by\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    S = 4 \\pi a b c R_{\\mathrm{G}}(1 / a^2, 1 / b^2, 1 / c^2).\n"
+    "\n"
+    ">>> def ellipsoid_area(a, b, c):\n"
+    "...     r = 4.0 * np.pi * a * b * c\n"
+    "...     return r * elliprg(1.0 / (a * a), 1.0 / (b * b), 1.0 / (c * c))\n"
+    ">>> print(ellipsoid_area(1, 3, 5))\n"
+    "108.62688289491807")
+ufunc_elliprg_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f  # loop 0: pairs with the NPY_FLOAT signature in types[0..3]
+ufunc_elliprg_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop 1: NPY_DOUBLE signature, types[4..7]
+ufunc_elliprg_loops[2] = <np.PyUFuncGenericFunction>loop_D_DDD__As_FFF_F  # loop 2: NPY_CFLOAT signature, types[8..11]
+ufunc_elliprg_loops[3] = <np.PyUFuncGenericFunction>loop_D_DDD__As_DDD_D  # loop 3: NPY_CDOUBLE signature, types[12..15]
+ufunc_elliprg_types[0] = <char>NPY_FLOAT  # types holds 4 codes per loop: nin=3 plus nout=1 (see the call below)
+ufunc_elliprg_types[1] = <char>NPY_FLOAT
+ufunc_elliprg_types[2] = <char>NPY_FLOAT
+ufunc_elliprg_types[3] = <char>NPY_FLOAT
+ufunc_elliprg_types[4] = <char>NPY_DOUBLE
+ufunc_elliprg_types[5] = <char>NPY_DOUBLE
+ufunc_elliprg_types[6] = <char>NPY_DOUBLE
+ufunc_elliprg_types[7] = <char>NPY_DOUBLE
+ufunc_elliprg_types[8] = <char>NPY_CFLOAT
+ufunc_elliprg_types[9] = <char>NPY_CFLOAT
+ufunc_elliprg_types[10] = <char>NPY_CFLOAT
+ufunc_elliprg_types[11] = <char>NPY_CFLOAT
+ufunc_elliprg_types[12] = <char>NPY_CDOUBLE
+ufunc_elliprg_types[13] = <char>NPY_CDOUBLE
+ufunc_elliprg_types[14] = <char>NPY_CDOUBLE
+ufunc_elliprg_types[15] = <char>NPY_CDOUBLE
+ufunc_elliprg_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_fellint_RG  # slot 2*i: kernel pointer for loop i
+ufunc_elliprg_ptr[2*0+1] = <void*>(<char*>"elliprg")  # slot 2*i+1: the ufunc name as a C string
+ufunc_elliprg_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_fellint_RG  # loops 0 and 1 share the same kernel pointer
+ufunc_elliprg_ptr[2*1+1] = <void*>(<char*>"elliprg")
+ufunc_elliprg_ptr[2*2] = <void*>scipy.special._ufuncs_cxx._export_cellint_RG  # loops 2 and 3 share the same kernel pointer
+ufunc_elliprg_ptr[2*2+1] = <void*>(<char*>"elliprg")
+ufunc_elliprg_ptr[2*3] = <void*>scipy.special._ufuncs_cxx._export_cellint_RG
+ufunc_elliprg_ptr[2*3+1] = <void*>(<char*>"elliprg")
+ufunc_elliprg_data[0] = &ufunc_elliprg_ptr[2*0]  # data[i] is the address of loop i's (kernel, name) pair
+ufunc_elliprg_data[1] = &ufunc_elliprg_ptr[2*1]
+ufunc_elliprg_data[2] = &ufunc_elliprg_ptr[2*2]
+ufunc_elliprg_data[3] = &ufunc_elliprg_ptr[2*3]
+elliprg = np.PyUFunc_FromFuncAndData(ufunc_elliprg_loops, ufunc_elliprg_data, ufunc_elliprg_types, 4, 3, 1, 0, 'elliprg', ufunc_elliprg_doc, 0)  # after types: ntypes=4, nin=3, nout=1, identity=0, then name, doc, unused
+
+cdef np.PyUFuncGenericFunction ufunc_elliprj_loops[4]  # one inner-loop function per registered type signature
+cdef void *ufunc_elliprj_ptr[8]  # 2 slots per loop: kernel pointer, then name string
+cdef void *ufunc_elliprj_data[4]  # per-loop pointer into ufunc_elliprj_ptr
+cdef char ufunc_elliprj_types[20]  # 4 loops x (4 inputs + 1 output) type codes
+cdef char *ufunc_elliprj_doc = (
+    "elliprj(x, y, z, p, out=None)\n"
+    "\n"
+    "Symmetric elliptic integral of the third kind.\n"
+    "\n"
+    "The function RJ is defined as [1]_\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    R_{\\mathrm{J}}(x, y, z, p) =\n"
+    "       \\frac{3}{2} \\int_0^{+\\infty} [(t + x) (t + y) (t + z)]^{-1/2}\n"
+    "       (t + p)^{-1} dt\n"
+    "\n"
+    ".. warning::\n"
+    "    This function should be considered experimental when the inputs are\n"
+    "    unbalanced.  Check correctness with another independent implementation.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x, y, z, p : array_like\n"
+    "    Real or complex input parameters. `x`, `y`, or `z` are numbers in\n"
+    "    the complex plane cut along the negative real axis (subject to further\n"
+    "    constraints, see Notes), and at most one of them can be zero. `p` must\n"
+    "    be non-zero.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "R : scalar or ndarray\n"
+    "    Value of the integral. If all of `x`, `y`, `z`, and `p` are real, the\n"
+    "    return value is real. Otherwise, the return value is complex.\n"
+    "\n"
+    "    If `p` is real and negative, while `x`, `y`, and `z` are real,\n"
+    "    non-negative, and at most one of them is zero, the Cauchy principal\n"
+    "    value is returned. [1]_ [2]_\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "elliprc : Degenerate symmetric integral.\n"
+    "elliprd : Symmetric elliptic integral of the second kind.\n"
+    "elliprf : Completely-symmetric elliptic integral of the first kind.\n"
+    "elliprg : Completely-symmetric elliptic integral of the second kind.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "The code implements Carlson's algorithm based on the duplication theorems\n"
+    "and series expansion up to the 7th order. [3]_ The algorithm is slightly\n"
+    "different from its earlier incarnation as it appears in [1]_, in that the\n"
+    "call to `elliprc` (or ``atan``/``atanh``, see [4]_) is no longer needed in\n"
+    "the inner loop. Asymptotic approximations are used where arguments differ\n"
+    "widely in the order of magnitude. [5]_\n"
+    "\n"
+    "The input values are subject to certain sufficient but not necessary\n"
+    "constraints when input arguments are complex. Notably, ``x``, ``y``, and\n"
+    "``z`` must have non-negative real parts, unless two of them are\n"
+    "non-negative and complex-conjugates to each other while the other is a real\n"
+    "non-negative number. [1]_ If the inputs do not satisfy the sufficient\n"
+    "condition described in Ref. [1]_ they are rejected outright with the output\n"
+    "set to NaN.\n"
+    "\n"
+    "In the case where one of ``x``, ``y``, and ``z`` is equal to ``p``, the\n"
+    "function ``elliprd`` should be preferred because of its less restrictive\n"
+    "domain.\n"
+    "\n"
+    ".. versionadded:: 1.8.0\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] B. C. Carlson, \"Numerical computation of real or complex elliptic\n"
+    "       integrals,\" Numer. Algorithms, vol. 10, no. 1, pp. 13-26, 1995.\n"
+    "       https://arxiv.org/abs/math/9409227\n"
+    "       https://doi.org/10.1007/BF02198293\n"
+    ".. [2] B. C. Carlson, ed., Chapter 19 in \"Digital Library of Mathematical\n"
+    "       Functions,\" NIST, US Dept. of Commerce.\n"
+    "       https://dlmf.nist.gov/19.20.iii\n"
+    ".. [3] B. C. Carlson, J. FitzSimmons, \"Reduction Theorems for Elliptic\n"
+    "       Integrands with the Square Root of Two Quadratic Factors,\" J.\n"
+    "       Comput. Appl. Math., vol. 118, nos. 1-2, pp. 71-85, 2000.\n"
+    "       https://doi.org/10.1016/S0377-0427(00)00282-X\n"
+    ".. [4] F. Johansson, \"Numerical Evaluation of Elliptic Functions, Elliptic\n"
+    "       Integrals and Modular Forms,\" in J. Blumlein, C. Schneider, P.\n"
+    "       Paule, eds., \"Elliptic Integrals, Elliptic Functions and Modular\n"
+    "       Forms in Quantum Field Theory,\" pp. 269-293, 2019 (Cham,\n"
+    "       Switzerland: Springer Nature Switzerland)\n"
+    "       https://arxiv.org/abs/1806.06725\n"
+    "       https://doi.org/10.1007/978-3-030-04480-0\n"
+    ".. [5] B. C. Carlson, J. L. Gustafson, \"Asymptotic Approximations for\n"
+    "       Symmetric Elliptic Integrals,\" SIAM J. Math. Anal., vol. 25, no. 2,\n"
+    "       pp. 288-303, 1994.\n"
+    "       https://arxiv.org/abs/math/9310223\n"
+    "       https://doi.org/10.1137/S0036141092228477\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "Basic homogeneity property:\n"
+    "\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import elliprj\n"
+    "\n"
+    ">>> x = 1.2 + 3.4j\n"
+    ">>> y = 5.\n"
+    ">>> z = 6.\n"
+    ">>> p = 7.\n"
+    ">>> scale = 0.3 - 0.4j\n"
+    ">>> elliprj(scale*x, scale*y, scale*z, scale*p)\n"
+    "(0.10834905565679157+0.19694950747103812j)\n"
+    "\n"
+    ">>> elliprj(x, y, z, p)*np.power(scale, -1.5)\n"
+    "(0.10834905565679556+0.19694950747103854j)\n"
+    "\n"
+    "Reduction to simpler elliptic integral:\n"
+    "\n"
+    ">>> elliprj(x, y, z, z)\n"
+    "(0.08288462362195129-0.028376809745123258j)\n"
+    "\n"
+    ">>> from scipy.special import elliprd\n"
+    ">>> elliprd(x, y, z)\n"
+    "(0.08288462362195136-0.028376809745123296j)\n"
+    "\n"
+    "All arguments coincide:\n"
+    "\n"
+    ">>> elliprj(x, x, x, x)\n"
+    "(-0.03986825876151896-0.14051741840449586j)\n"
+    "\n"
+    ">>> np.power(x, -1.5)\n"
+    "(-0.03986825876151894-0.14051741840449583j)")
+ufunc_elliprj_loops[0] = <np.PyUFuncGenericFunction>loop_d_dddd__As_ffff_f  # loop 0: pairs with the NPY_FLOAT signature in types[0..4]
+ufunc_elliprj_loops[1] = <np.PyUFuncGenericFunction>loop_d_dddd__As_dddd_d  # loop 1: NPY_DOUBLE signature, types[5..9]
+ufunc_elliprj_loops[2] = <np.PyUFuncGenericFunction>loop_D_DDDD__As_FFFF_F  # loop 2: NPY_CFLOAT signature, types[10..14]
+ufunc_elliprj_loops[3] = <np.PyUFuncGenericFunction>loop_D_DDDD__As_DDDD_D  # loop 3: NPY_CDOUBLE signature, types[15..19]
+ufunc_elliprj_types[0] = <char>NPY_FLOAT  # types holds 5 codes per loop: nin=4 plus nout=1 (see the call below)
+ufunc_elliprj_types[1] = <char>NPY_FLOAT
+ufunc_elliprj_types[2] = <char>NPY_FLOAT
+ufunc_elliprj_types[3] = <char>NPY_FLOAT
+ufunc_elliprj_types[4] = <char>NPY_FLOAT
+ufunc_elliprj_types[5] = <char>NPY_DOUBLE
+ufunc_elliprj_types[6] = <char>NPY_DOUBLE
+ufunc_elliprj_types[7] = <char>NPY_DOUBLE
+ufunc_elliprj_types[8] = <char>NPY_DOUBLE
+ufunc_elliprj_types[9] = <char>NPY_DOUBLE
+ufunc_elliprj_types[10] = <char>NPY_CFLOAT
+ufunc_elliprj_types[11] = <char>NPY_CFLOAT
+ufunc_elliprj_types[12] = <char>NPY_CFLOAT
+ufunc_elliprj_types[13] = <char>NPY_CFLOAT
+ufunc_elliprj_types[14] = <char>NPY_CFLOAT
+ufunc_elliprj_types[15] = <char>NPY_CDOUBLE
+ufunc_elliprj_types[16] = <char>NPY_CDOUBLE
+ufunc_elliprj_types[17] = <char>NPY_CDOUBLE
+ufunc_elliprj_types[18] = <char>NPY_CDOUBLE
+ufunc_elliprj_types[19] = <char>NPY_CDOUBLE
+ufunc_elliprj_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_fellint_RJ  # slot 2*i: kernel pointer for loop i
+ufunc_elliprj_ptr[2*0+1] = <void*>(<char*>"elliprj")  # slot 2*i+1: the ufunc name as a C string
+ufunc_elliprj_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_fellint_RJ  # loops 0 and 1 share the same kernel pointer
+ufunc_elliprj_ptr[2*1+1] = <void*>(<char*>"elliprj")
+ufunc_elliprj_ptr[2*2] = <void*>scipy.special._ufuncs_cxx._export_cellint_RJ  # loops 2 and 3 share the same kernel pointer
+ufunc_elliprj_ptr[2*2+1] = <void*>(<char*>"elliprj")
+ufunc_elliprj_ptr[2*3] = <void*>scipy.special._ufuncs_cxx._export_cellint_RJ
+ufunc_elliprj_ptr[2*3+1] = <void*>(<char*>"elliprj")
+ufunc_elliprj_data[0] = &ufunc_elliprj_ptr[2*0]  # data[i] is the address of loop i's (kernel, name) pair
+ufunc_elliprj_data[1] = &ufunc_elliprj_ptr[2*1]
+ufunc_elliprj_data[2] = &ufunc_elliprj_ptr[2*2]
+ufunc_elliprj_data[3] = &ufunc_elliprj_ptr[2*3]
+elliprj = np.PyUFunc_FromFuncAndData(ufunc_elliprj_loops, ufunc_elliprj_data, ufunc_elliprj_types, 4, 4, 1, 0, 'elliprj', ufunc_elliprj_doc, 0)  # after types: ntypes=4, nin=4, nout=1, identity=0, then name, doc, unused
+
+cdef np.PyUFuncGenericFunction ufunc_entr_loops[2]  # one inner-loop function per registered type signature
+cdef void *ufunc_entr_ptr[4]  # 2 slots per loop: kernel pointer, then name string
+cdef void *ufunc_entr_data[2]  # per-loop pointer into ufunc_entr_ptr
+cdef char ufunc_entr_types[4]  # 2 loops x (1 input + 1 output) type codes
+cdef char *ufunc_entr_doc = (
+    "entr(x, out=None)\n"
+    "\n"
+    "Elementwise function for computing entropy.\n"
+    "\n"
+    ".. math:: \\text{entr}(x) = \\begin{cases} - x \\log(x) & x > 0  \\\\ 0 & x = 0\n"
+    "          \\\\ -\\infty & \\text{otherwise} \\end{cases}\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : ndarray\n"
+    "    Input array.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "res : scalar or ndarray\n"
+    "    The value of the elementwise entropy function at the given points `x`.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "kl_div, rel_entr, scipy.stats.entropy\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    ".. versionadded:: 0.15.0\n"
+    "\n"
+    "This function is concave.\n"
+    "\n"
+    "The origin of this function is in convex programming; see [1]_.\n"
+    "Given a probability distribution :math:`p_1, \\ldots, p_n`,\n"
+    "the definition of entropy in the context of *information theory* is\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    \\sum_{i = 1}^n \\mathrm{entr}(p_i).\n"
+    "\n"
+    "To compute the latter quantity, use `scipy.stats.entropy`.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Boyd, Stephen and Lieven Vandenberghe. *Convex optimization*.\n"
+    "       Cambridge University Press, 2004.\n"
+    "       :doi:`10.1017/CBO9780511804441`")
+ufunc_entr_loops[0] = <np.PyUFuncGenericFunction>loop_d_d__As_f_f  # loop 0: pairs with the NPY_FLOAT signature in types[0..1]
+ufunc_entr_loops[1] = <np.PyUFuncGenericFunction>loop_d_d__As_d_d  # loop 1: NPY_DOUBLE signature, types[2..3]
+ufunc_entr_types[0] = <char>NPY_FLOAT  # types holds 2 codes per loop: nin=1 plus nout=1 (see the call below)
+ufunc_entr_types[1] = <char>NPY_FLOAT
+ufunc_entr_types[2] = <char>NPY_DOUBLE
+ufunc_entr_types[3] = <char>NPY_DOUBLE
+ufunc_entr_ptr[2*0] = <void*>_func_entr  # slot 2*i: kernel pointer for loop i; both loops use _func_entr
+ufunc_entr_ptr[2*0+1] = <void*>(<char*>"entr")  # slot 2*i+1: the ufunc name as a C string
+ufunc_entr_ptr[2*1] = <void*>_func_entr
+ufunc_entr_ptr[2*1+1] = <void*>(<char*>"entr")
+ufunc_entr_data[0] = &ufunc_entr_ptr[2*0]  # data[i] is the address of loop i's (kernel, name) pair
+ufunc_entr_data[1] = &ufunc_entr_ptr[2*1]
+entr = np.PyUFunc_FromFuncAndData(ufunc_entr_loops, ufunc_entr_data, ufunc_entr_types, 2, 1, 1, 0, 'entr', ufunc_entr_doc, 0)  # after types: ntypes=2, nin=1, nout=1, identity=0, then name, doc, unused
+
+cdef np.PyUFuncGenericFunction ufunc_erfcinv_loops[2]  # one inner-loop function per registered type signature
+cdef void *ufunc_erfcinv_ptr[4]  # 2 slots per loop: kernel pointer, then name string
+cdef void *ufunc_erfcinv_data[2]  # per-loop pointer into ufunc_erfcinv_ptr
+cdef char ufunc_erfcinv_types[4]  # 2 loops x (1 input + 1 output) type codes
+cdef char *ufunc_erfcinv_doc = (
+    "erfcinv(y, out=None)\n"
+    "\n"
+    "Inverse of the complementary error function.\n"
+    "\n"
+    "Computes the inverse of the complementary error function.\n"
+    "\n"
+    "In the complex domain, there is no unique complex number w satisfying\n"
+    "erfc(w)=z. This indicates a true inverse function would be multivalued.\n"
+    "When the domain restricts to the real, 0 < x < 2, there is a unique real\n"
+    "number satisfying erfc(erfcinv(x)) = x.\n"
+    "\n"
+    "It is related to the inverse of the error function by erfcinv(1-x) = erfinv(x).\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "y : ndarray\n"
+    "    Argument at which to evaluate. Domain: [0, 2]\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "erfcinv : scalar or ndarray\n"
+    "    The inverse of erfc of y, element-wise\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "erf : Error function of a complex argument\n"
+    "erfc : Complementary error function, ``1 - erf(x)``\n"
+    "erfinv : Inverse of the error function\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> from scipy.special import erfcinv\n"
+    "\n"
+    ">>> erfcinv(0.5)\n"
+    "0.4769362762044699\n"
+    "\n"
+    ">>> y = np.linspace(0.0, 2.0, num=11)\n"
+    ">>> erfcinv(y)\n"
+    "array([        inf,  0.9061938 ,  0.59511608,  0.37080716,  0.17914345,\n"
+    "       -0.        , -0.17914345, -0.37080716, -0.59511608, -0.9061938 ,\n"
+    "              -inf])\n"
+    "\n"
+    "Plot the function:\n"
+    "\n"
+    ">>> y = np.linspace(0, 2, 200)\n"
+    ">>> fig, ax = plt.subplots()\n"
+    ">>> ax.plot(y, erfcinv(y))\n"
+    ">>> ax.grid(True)\n"
+    ">>> ax.set_xlabel('y')\n"
+    ">>> ax.set_title('erfcinv(y)')\n"
+    ">>> plt.show()")
+ufunc_erfcinv_loops[0] = <np.PyUFuncGenericFunction>loop_d_d__As_f_f  # loop 0: pairs with the NPY_FLOAT signature in types[0..1]
+ufunc_erfcinv_loops[1] = <np.PyUFuncGenericFunction>loop_d_d__As_d_d  # loop 1: NPY_DOUBLE signature, types[2..3]
+ufunc_erfcinv_types[0] = <char>NPY_FLOAT  # types holds 2 codes per loop: nin=1 plus nout=1 (see the call below)
+ufunc_erfcinv_types[1] = <char>NPY_FLOAT
+ufunc_erfcinv_types[2] = <char>NPY_DOUBLE
+ufunc_erfcinv_types[3] = <char>NPY_DOUBLE
+ufunc_erfcinv_ptr[2*0] = <void*>_func_cephes_erfcinv  # slot 2*i: kernel pointer for loop i; both loops use _func_cephes_erfcinv
+ufunc_erfcinv_ptr[2*0+1] = <void*>(<char*>"erfcinv")  # slot 2*i+1: the ufunc name as a C string
+ufunc_erfcinv_ptr[2*1] = <void*>_func_cephes_erfcinv
+ufunc_erfcinv_ptr[2*1+1] = <void*>(<char*>"erfcinv")
+ufunc_erfcinv_data[0] = &ufunc_erfcinv_ptr[2*0]  # data[i] is the address of loop i's (kernel, name) pair
+ufunc_erfcinv_data[1] = &ufunc_erfcinv_ptr[2*1]
+erfcinv = np.PyUFunc_FromFuncAndData(ufunc_erfcinv_loops, ufunc_erfcinv_data, ufunc_erfcinv_types, 2, 1, 1, 0, 'erfcinv', ufunc_erfcinv_doc, 0)  # after types: ntypes=2, nin=1, nout=1, identity=0, then name, doc, unused
+
+cdef np.PyUFuncGenericFunction ufunc_erfinv_loops[2]  # one inner-loop function per registered type signature
+cdef void *ufunc_erfinv_ptr[4]  # 2 slots per loop: kernel pointer, then name string
+cdef void *ufunc_erfinv_data[2]  # per-loop pointer into ufunc_erfinv_ptr
+cdef char ufunc_erfinv_types[4]  # 2 loops x (1 input + 1 output) type codes
+cdef char *ufunc_erfinv_doc = (
+    "erfinv(y, out=None)\n"
+    "\n"
+    "Inverse of the error function.\n"
+    "\n"
+    "Computes the inverse of the error function.\n"
+    "\n"
+    "In the complex domain, there is no unique complex number w satisfying\n"
+    "erf(w)=z. This indicates a true inverse function would be multivalued.\n"
+    "When the domain restricts to the real, -1 < x < 1, there is a unique real\n"
+    "number satisfying erf(erfinv(x)) = x.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "y : ndarray\n"
+    "    Argument at which to evaluate. Domain: [-1, 1]\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "erfinv : scalar or ndarray\n"
+    "    The inverse of erf of y, element-wise\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "erf : Error function of a complex argument\n"
+    "erfc : Complementary error function, ``1 - erf(x)``\n"
+    "erfcinv : Inverse of the complementary error function\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "This function wraps the ``erf_inv`` routine from the\n"
+    "Boost Math C++ library [1]_.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] The Boost Developers. \"Boost C++ Libraries\". https://www.boost.org/.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> from scipy.special import erfinv, erf\n"
+    "\n"
+    ">>> erfinv(0.5)\n"
+    "0.4769362762044699\n"
+    "\n"
+    ">>> y = np.linspace(-1.0, 1.0, num=9)\n"
+    ">>> x = erfinv(y)\n"
+    ">>> x\n"
+    "array([       -inf, -0.81341985, -0.47693628, -0.22531206,  0.        ,\n"
+    "        0.22531206,  0.47693628,  0.81341985,         inf])\n"
+    "\n"
+    "Verify that ``erf(erfinv(y))`` is ``y``.\n"
+    "\n"
+    ">>> erf(x)\n"
+    "array([-1.  , -0.75, -0.5 , -0.25,  0.  ,  0.25,  0.5 ,  0.75,  1.  ])\n"
+    "\n"
+    "Plot the function:\n"
+    "\n"
+    ">>> y = np.linspace(-1, 1, 200)\n"
+    ">>> fig, ax = plt.subplots()\n"
+    ">>> ax.plot(y, erfinv(y))\n"
+    ">>> ax.grid(True)\n"
+    ">>> ax.set_xlabel('y')\n"
+    ">>> ax.set_title('erfinv(y)')\n"
+    ">>> plt.show()")
+ufunc_erfinv_loops[0] = <np.PyUFuncGenericFunction>loop_f_f__As_f_f  # loop 0: pairs with the NPY_FLOAT signature in types[0..1]
+ufunc_erfinv_loops[1] = <np.PyUFuncGenericFunction>loop_d_d__As_d_d  # loop 1: NPY_DOUBLE signature, types[2..3]
+ufunc_erfinv_types[0] = <char>NPY_FLOAT  # types holds 2 codes per loop: nin=1 plus nout=1 (see the call below)
+ufunc_erfinv_types[1] = <char>NPY_FLOAT
+ufunc_erfinv_types[2] = <char>NPY_DOUBLE
+ufunc_erfinv_types[3] = <char>NPY_DOUBLE
+ufunc_erfinv_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_erfinv_float  # slot 2*i: kernel pointer for loop i; loop 0 has its own kernel (..._float)
+ufunc_erfinv_ptr[2*0+1] = <void*>(<char*>"erfinv")  # slot 2*i+1: the ufunc name as a C string
+ufunc_erfinv_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_erfinv_double  # loop 1 uses a different kernel pointer (..._double)
+ufunc_erfinv_ptr[2*1+1] = <void*>(<char*>"erfinv")
+ufunc_erfinv_data[0] = &ufunc_erfinv_ptr[2*0]  # data[i] is the address of loop i's (kernel, name) pair
+ufunc_erfinv_data[1] = &ufunc_erfinv_ptr[2*1]
+erfinv = np.PyUFunc_FromFuncAndData(ufunc_erfinv_loops, ufunc_erfinv_data, ufunc_erfinv_types, 2, 1, 1, 0, 'erfinv', ufunc_erfinv_doc, 0)  # after types: ntypes=2, nin=1, nout=1, identity=0, then name, doc, unused
+
+cdef np.PyUFuncGenericFunction ufunc_eval_chebyc_loops[5]  # one inner-loop function per registered type signature
+cdef void *ufunc_eval_chebyc_ptr[10]  # 2 slots per loop: kernel pointer, then name string
+cdef void *ufunc_eval_chebyc_data[5]  # per-loop pointer into ufunc_eval_chebyc_ptr
+cdef char ufunc_eval_chebyc_types[15]  # 5 loops x (2 inputs + 1 output) type codes
+cdef char *ufunc_eval_chebyc_doc = (
+    "eval_chebyc(n, x, out=None)\n"
+    "\n"
+    "Evaluate Chebyshev polynomial of the first kind on [-2, 2] at a\n"
+    "point.\n"
+    "\n"
+    "These polynomials are defined as\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    C_n(x) = 2 T_n(x/2)\n"
+    "\n"
+    "where :math:`T_n` is a Chebyshev polynomial of the first kind. See\n"
+    "22.5.11 in [AS]_ (or equivalently [DLMF]_) for details.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "n : array_like\n"
+    "    Degree of the polynomial. If not an integer, the result is\n"
+    "    determined via the relation to `eval_chebyt`.\n"
+    "x : array_like\n"
+    "    Points at which to evaluate the Chebyshev polynomial\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "C : scalar or ndarray\n"
+    "    Values of the Chebyshev polynomial\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "roots_chebyc : roots and quadrature weights of Chebyshev\n"
+    "               polynomials of the first kind on [-2, 2]\n"
+    "chebyc : Chebyshev polynomial object\n"
+    "numpy.polynomial.chebyshev.Chebyshev : Chebyshev series\n"
+    "eval_chebyt : evaluate Chebyshev polynomials of the first kind\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [AS] Milton Abramowitz and Irene A. Stegun, eds.\n"
+    "    Handbook of Mathematical Functions with Formulas,\n"
+    "    Graphs, and Mathematical Tables. New York: Dover, 1972.\n"
+    ".. [DLMF] NIST Digital Library of Mathematical Functions,\n"
+    "    https://dlmf.nist.gov/18.1.E3\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> import scipy.special as sc\n"
+    "\n"
+    "They are a scaled version of the Chebyshev polynomials of the\n"
+    "first kind.\n"
+    "\n"
+    ">>> x = np.linspace(-2, 2, 6)\n"
+    ">>> sc.eval_chebyc(3, x)\n"
+    "array([-2.   ,  1.872,  1.136, -1.136, -1.872,  2.   ])\n"
+    ">>> 2 * sc.eval_chebyt(3, x / 2)\n"
+    "array([-2.   ,  1.872,  1.136, -1.136, -1.872,  2.   ])")
+ufunc_eval_chebyc_loops[0] = <np.PyUFuncGenericFunction>loop_d_pd__As_pd_d  # loop 0: signature types[0..2] = (NPY_INTP, NPY_DOUBLE, NPY_DOUBLE)
+ufunc_eval_chebyc_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f  # loop 1: types[3..5], all NPY_FLOAT
+ufunc_eval_chebyc_loops[2] = <np.PyUFuncGenericFunction>loop_D_dD__As_fF_F  # loop 2: types[6..8] = (NPY_FLOAT, NPY_CFLOAT, NPY_CFLOAT)
+ufunc_eval_chebyc_loops[3] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d  # loop 3: types[9..11], all NPY_DOUBLE
+ufunc_eval_chebyc_loops[4] = <np.PyUFuncGenericFunction>loop_D_dD__As_dD_D  # loop 4: types[12..14] = (NPY_DOUBLE, NPY_CDOUBLE, NPY_CDOUBLE)
+ufunc_eval_chebyc_types[0] = <char>NPY_INTP  # types holds 3 codes per loop: nin=2 plus nout=1 (see the call below)
+ufunc_eval_chebyc_types[1] = <char>NPY_DOUBLE
+ufunc_eval_chebyc_types[2] = <char>NPY_DOUBLE
+ufunc_eval_chebyc_types[3] = <char>NPY_FLOAT
+ufunc_eval_chebyc_types[4] = <char>NPY_FLOAT
+ufunc_eval_chebyc_types[5] = <char>NPY_FLOAT
+ufunc_eval_chebyc_types[6] = <char>NPY_FLOAT
+ufunc_eval_chebyc_types[7] = <char>NPY_CFLOAT
+ufunc_eval_chebyc_types[8] = <char>NPY_CFLOAT
+ufunc_eval_chebyc_types[9] = <char>NPY_DOUBLE
+ufunc_eval_chebyc_types[10] = <char>NPY_DOUBLE
+ufunc_eval_chebyc_types[11] = <char>NPY_DOUBLE
+ufunc_eval_chebyc_types[12] = <char>NPY_DOUBLE
+ufunc_eval_chebyc_types[13] = <char>NPY_CDOUBLE
+ufunc_eval_chebyc_types[14] = <char>NPY_CDOUBLE
+ufunc_eval_chebyc_ptr[2*0] = <void*>_func_eval_chebyc_l  # slot 2*i: kernel pointer for loop i; only loop 0 uses the _l kernel
+ufunc_eval_chebyc_ptr[2*0+1] = <void*>(<char*>"eval_chebyc")  # slot 2*i+1: the ufunc name as a C string
+ufunc_eval_chebyc_ptr[2*1] = <void*>_func_eval_chebyc[double]  # loops 1 and 3 share the [double] specialization
+ufunc_eval_chebyc_ptr[2*1+1] = <void*>(<char*>"eval_chebyc")
+ufunc_eval_chebyc_ptr[2*2] = <void*>_func_eval_chebyc[double_complex]  # loops 2 and 4 share the [double_complex] specialization
+ufunc_eval_chebyc_ptr[2*2+1] = <void*>(<char*>"eval_chebyc")
+ufunc_eval_chebyc_ptr[2*3] = <void*>_func_eval_chebyc[double]
+ufunc_eval_chebyc_ptr[2*3+1] = <void*>(<char*>"eval_chebyc")
+ufunc_eval_chebyc_ptr[2*4] = <void*>_func_eval_chebyc[double_complex]
+ufunc_eval_chebyc_ptr[2*4+1] = <void*>(<char*>"eval_chebyc")
+ufunc_eval_chebyc_data[0] = &ufunc_eval_chebyc_ptr[2*0]  # data[i] is the address of loop i's (kernel, name) pair
+ufunc_eval_chebyc_data[1] = &ufunc_eval_chebyc_ptr[2*1]
+ufunc_eval_chebyc_data[2] = &ufunc_eval_chebyc_ptr[2*2]
+ufunc_eval_chebyc_data[3] = &ufunc_eval_chebyc_ptr[2*3]
+ufunc_eval_chebyc_data[4] = &ufunc_eval_chebyc_ptr[2*4]
+eval_chebyc = np.PyUFunc_FromFuncAndData(ufunc_eval_chebyc_loops, ufunc_eval_chebyc_data, ufunc_eval_chebyc_types, 5, 2, 1, 0, 'eval_chebyc', ufunc_eval_chebyc_doc, 0)  # after types: ntypes=5, nin=2, nout=1, identity=0, then name, doc, unused
+
+cdef np.PyUFuncGenericFunction ufunc_eval_chebys_loops[5]  # one inner-loop function per registered type signature
+cdef void *ufunc_eval_chebys_ptr[10]  # 2 slots per loop: kernel pointer, then name string
+cdef void *ufunc_eval_chebys_data[5]  # per-loop pointer into ufunc_eval_chebys_ptr
+cdef char ufunc_eval_chebys_types[15]  # 5 loops x (2 inputs + 1 output) type codes
+cdef char *ufunc_eval_chebys_doc = (
+    "eval_chebys(n, x, out=None)\n"
+    "\n"
+    "Evaluate Chebyshev polynomial of the second kind on [-2, 2] at a\n"
+    "point.\n"
+    "\n"
+    "These polynomials are defined as\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    S_n(x) = U_n(x/2)\n"
+    "\n"
+    "where :math:`U_n` is a Chebyshev polynomial of the second kind.\n"
+    "See 22.5.13 in [AS]_ (or equivalently [DLMF]_) for details.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "n : array_like\n"
+    "    Degree of the polynomial. If not an integer, the result is\n"
+    "    determined via the relation to `eval_chebyu`.\n"
+    "x : array_like\n"
+    "    Points at which to evaluate the Chebyshev polynomial\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "S : scalar or ndarray\n"
+    "    Values of the Chebyshev polynomial\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "roots_chebys : roots and quadrature weights of Chebyshev\n"
+    "               polynomials of the second kind on [-2, 2]\n"
+    "chebys : Chebyshev polynomial object\n"
+    "eval_chebyu : evaluate Chebyshev polynomials of the second kind\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [AS] Milton Abramowitz and Irene A. Stegun, eds.\n"
+    "    Handbook of Mathematical Functions with Formulas,\n"
+    "    Graphs, and Mathematical Tables. New York: Dover, 1972.\n"
+    ".. [DLMF] NIST Digital Library of Mathematical Functions,\n"
+    "    https://dlmf.nist.gov/18.1.E3\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> import scipy.special as sc\n"
+    "\n"
+    "They are a scaled version of the Chebyshev polynomials of the\n"
+    "second kind.\n"
+    "\n"
+    ">>> x = np.linspace(-2, 2, 6)\n"
+    ">>> sc.eval_chebys(3, x)\n"
+    "array([-4.   ,  0.672,  0.736, -0.736, -0.672,  4.   ])\n"
+    ">>> sc.eval_chebyu(3, x / 2)\n"
+    "array([-4.   ,  0.672,  0.736, -0.736, -0.672,  4.   ])")
+ufunc_eval_chebys_loops[0] = <np.PyUFuncGenericFunction>loop_d_pd__As_pd_d  # loop 0: signature types[0..2] = (NPY_INTP, NPY_DOUBLE, NPY_DOUBLE)
+ufunc_eval_chebys_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f  # loop 1: types[3..5], all NPY_FLOAT
+ufunc_eval_chebys_loops[2] = <np.PyUFuncGenericFunction>loop_D_dD__As_fF_F  # loop 2: types[6..8] = (NPY_FLOAT, NPY_CFLOAT, NPY_CFLOAT)
+ufunc_eval_chebys_loops[3] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d  # loop 3: types[9..11], all NPY_DOUBLE
+ufunc_eval_chebys_loops[4] = <np.PyUFuncGenericFunction>loop_D_dD__As_dD_D  # loop 4: types[12..14] = (NPY_DOUBLE, NPY_CDOUBLE, NPY_CDOUBLE)
+ufunc_eval_chebys_types[0] = <char>NPY_INTP  # types holds 3 codes per loop: nin=2 plus nout=1 (see the call below)
+ufunc_eval_chebys_types[1] = <char>NPY_DOUBLE
+ufunc_eval_chebys_types[2] = <char>NPY_DOUBLE
+ufunc_eval_chebys_types[3] = <char>NPY_FLOAT
+ufunc_eval_chebys_types[4] = <char>NPY_FLOAT
+ufunc_eval_chebys_types[5] = <char>NPY_FLOAT
+ufunc_eval_chebys_types[6] = <char>NPY_FLOAT
+ufunc_eval_chebys_types[7] = <char>NPY_CFLOAT
+ufunc_eval_chebys_types[8] = <char>NPY_CFLOAT
+ufunc_eval_chebys_types[9] = <char>NPY_DOUBLE
+ufunc_eval_chebys_types[10] = <char>NPY_DOUBLE
+ufunc_eval_chebys_types[11] = <char>NPY_DOUBLE
+ufunc_eval_chebys_types[12] = <char>NPY_DOUBLE
+ufunc_eval_chebys_types[13] = <char>NPY_CDOUBLE
+ufunc_eval_chebys_types[14] = <char>NPY_CDOUBLE
+ufunc_eval_chebys_ptr[2*0] = <void*>_func_eval_chebys_l  # slot 2*i: kernel pointer for loop i; only loop 0 uses the _l kernel
+ufunc_eval_chebys_ptr[2*0+1] = <void*>(<char*>"eval_chebys")  # slot 2*i+1: the ufunc name as a C string
+ufunc_eval_chebys_ptr[2*1] = <void*>_func_eval_chebys[double]  # loops 1 and 3 share the [double] specialization
+ufunc_eval_chebys_ptr[2*1+1] = <void*>(<char*>"eval_chebys")
+ufunc_eval_chebys_ptr[2*2] = <void*>_func_eval_chebys[double_complex]  # loops 2 and 4 share the [double_complex] specialization
+ufunc_eval_chebys_ptr[2*2+1] = <void*>(<char*>"eval_chebys")
+ufunc_eval_chebys_ptr[2*3] = <void*>_func_eval_chebys[double]
+ufunc_eval_chebys_ptr[2*3+1] = <void*>(<char*>"eval_chebys")
+ufunc_eval_chebys_ptr[2*4] = <void*>_func_eval_chebys[double_complex]
+ufunc_eval_chebys_ptr[2*4+1] = <void*>(<char*>"eval_chebys")
+ufunc_eval_chebys_data[0] = &ufunc_eval_chebys_ptr[2*0]  # data[i] is the address of loop i's (kernel, name) pair
+ufunc_eval_chebys_data[1] = &ufunc_eval_chebys_ptr[2*1]
+ufunc_eval_chebys_data[2] = &ufunc_eval_chebys_ptr[2*2]
+ufunc_eval_chebys_data[3] = &ufunc_eval_chebys_ptr[2*3]
+ufunc_eval_chebys_data[4] = &ufunc_eval_chebys_ptr[2*4]
+eval_chebys = np.PyUFunc_FromFuncAndData(ufunc_eval_chebys_loops, ufunc_eval_chebys_data, ufunc_eval_chebys_types, 5, 2, 1, 0, 'eval_chebys', ufunc_eval_chebys_doc, 0)  # after types: ntypes=5, nin=2, nout=1, identity=0, then name, doc, unused
+
+cdef np.PyUFuncGenericFunction ufunc_eval_chebyt_loops[5]  # tables backing the eval_chebyt ufunc: one inner loop per type signature
+cdef void *ufunc_eval_chebyt_ptr[10]  # (kernel pointer, name string) pair per signature
+cdef void *ufunc_eval_chebyt_data[5]  # per-signature pointer into ufunc_eval_chebyt_ptr
+cdef char ufunc_eval_chebyt_types[15]  # 5 signatures x (2 inputs + 1 output) type codes
+cdef char *ufunc_eval_chebyt_doc = (
+    "eval_chebyt(n, x, out=None)\n"
+    "\n"
+    "Evaluate Chebyshev polynomial of the first kind at a point.\n"
+    "\n"
+    "The Chebyshev polynomials of the first kind can be defined via the\n"
+    "Gauss hypergeometric function :math:`{}_2F_1` as\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    T_n(x) = {}_2F_1(n, -n; 1/2; (1 - x)/2).\n"
+    "\n"
+    "When :math:`n` is an integer the result is a polynomial of degree\n"
+    ":math:`n`. See 22.5.47 in [AS]_ (or equivalently [DLMF]_) for details.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "n : array_like\n"
+    "    Degree of the polynomial. If not an integer, the result is\n"
+    "    determined via the relation to the Gauss hypergeometric\n"
+    "    function.\n"
+    "x : array_like\n"
+    "    Points at which to evaluate the Chebyshev polynomial\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "T : scalar or ndarray\n"
+    "    Values of the Chebyshev polynomial\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "roots_chebyt : roots and quadrature weights of Chebyshev\n"
+    "               polynomials of the first kind\n"
+    "chebyt : Chebyshev polynomial object\n"
+    "eval_chebyu : evaluate Chebyshev polynomials of the second kind\n"
+    "hyp2f1 : Gauss hypergeometric function\n"
+    "numpy.polynomial.chebyshev.Chebyshev : Chebyshev series\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "This routine is numerically stable for `x` in ``[-1, 1]`` at least\n"
+    "up to order ``10000``.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [AS] Milton Abramowitz and Irene A. Stegun, eds.\n"
+    "    Handbook of Mathematical Functions with Formulas,\n"
+    "    Graphs, and Mathematical Tables. New York: Dover, 1972.\n"
+    ".. [DLMF] NIST Digital Library of Mathematical Functions,\n"
+    "    https://dlmf.nist.gov/18.5.E11_2")
+ufunc_eval_chebyt_loops[0] = <np.PyUFuncGenericFunction>loop_d_pd__As_pd_d
+ufunc_eval_chebyt_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+ufunc_eval_chebyt_loops[2] = <np.PyUFuncGenericFunction>loop_D_dD__As_fF_F
+ufunc_eval_chebyt_loops[3] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc_eval_chebyt_loops[4] = <np.PyUFuncGenericFunction>loop_D_dD__As_dD_D
+ufunc_eval_chebyt_types[0] = <char>NPY_INTP  # signature 0: (intp, double) -> double
+ufunc_eval_chebyt_types[1] = <char>NPY_DOUBLE
+ufunc_eval_chebyt_types[2] = <char>NPY_DOUBLE
+ufunc_eval_chebyt_types[3] = <char>NPY_FLOAT  # signature 1: (float, float) -> float
+ufunc_eval_chebyt_types[4] = <char>NPY_FLOAT
+ufunc_eval_chebyt_types[5] = <char>NPY_FLOAT
+ufunc_eval_chebyt_types[6] = <char>NPY_FLOAT  # signature 2: (float, cfloat) -> cfloat
+ufunc_eval_chebyt_types[7] = <char>NPY_CFLOAT
+ufunc_eval_chebyt_types[8] = <char>NPY_CFLOAT
+ufunc_eval_chebyt_types[9] = <char>NPY_DOUBLE  # signature 3: (double, double) -> double
+ufunc_eval_chebyt_types[10] = <char>NPY_DOUBLE
+ufunc_eval_chebyt_types[11] = <char>NPY_DOUBLE
+ufunc_eval_chebyt_types[12] = <char>NPY_DOUBLE  # signature 4: (double, cdouble) -> cdouble
+ufunc_eval_chebyt_types[13] = <char>NPY_CDOUBLE
+ufunc_eval_chebyt_types[14] = <char>NPY_CDOUBLE
+ufunc_eval_chebyt_ptr[2*0] = <void*>_func_eval_chebyt_l  # even slots hold the kernel, odd slots the ufunc name
+ufunc_eval_chebyt_ptr[2*0+1] = <void*>(<char*>"eval_chebyt")
+ufunc_eval_chebyt_ptr[2*1] = <void*>_func_eval_chebyt[double]  # the float signature is backed by the double kernel
+ufunc_eval_chebyt_ptr[2*1+1] = <void*>(<char*>"eval_chebyt")
+ufunc_eval_chebyt_ptr[2*2] = <void*>_func_eval_chebyt[double_complex]  # the cfloat signature is backed by the double_complex kernel
+ufunc_eval_chebyt_ptr[2*2+1] = <void*>(<char*>"eval_chebyt")
+ufunc_eval_chebyt_ptr[2*3] = <void*>_func_eval_chebyt[double]
+ufunc_eval_chebyt_ptr[2*3+1] = <void*>(<char*>"eval_chebyt")
+ufunc_eval_chebyt_ptr[2*4] = <void*>_func_eval_chebyt[double_complex]
+ufunc_eval_chebyt_ptr[2*4+1] = <void*>(<char*>"eval_chebyt")
+ufunc_eval_chebyt_data[0] = &ufunc_eval_chebyt_ptr[2*0]  # data[i] points at signature i's (kernel, name) pair
+ufunc_eval_chebyt_data[1] = &ufunc_eval_chebyt_ptr[2*1]
+ufunc_eval_chebyt_data[2] = &ufunc_eval_chebyt_ptr[2*2]
+ufunc_eval_chebyt_data[3] = &ufunc_eval_chebyt_ptr[2*3]
+ufunc_eval_chebyt_data[4] = &ufunc_eval_chebyt_ptr[2*4]
+eval_chebyt = np.PyUFunc_FromFuncAndData(ufunc_eval_chebyt_loops, ufunc_eval_chebyt_data, ufunc_eval_chebyt_types, 5, 2, 1, 0, 'eval_chebyt', ufunc_eval_chebyt_doc, 0)  # ntypes=5, nin=2, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc_eval_chebyu_loops[5]  # tables backing the eval_chebyu ufunc: one inner loop per type signature
+cdef void *ufunc_eval_chebyu_ptr[10]  # (kernel pointer, name string) pair per signature
+cdef void *ufunc_eval_chebyu_data[5]  # per-signature pointer into ufunc_eval_chebyu_ptr
+cdef char ufunc_eval_chebyu_types[15]  # 5 signatures x (2 inputs + 1 output) type codes
+cdef char *ufunc_eval_chebyu_doc = (
+    "eval_chebyu(n, x, out=None)\n"
+    "\n"
+    "Evaluate Chebyshev polynomial of the second kind at a point.\n"
+    "\n"
+    "The Chebyshev polynomials of the second kind can be defined via\n"
+    "the Gauss hypergeometric function :math:`{}_2F_1` as\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    U_n(x) = (n + 1) {}_2F_1(-n, n + 2; 3/2; (1 - x)/2).\n"
+    "\n"
+    "When :math:`n` is an integer the result is a polynomial of degree\n"
+    ":math:`n`. See 22.5.48 in [AS]_ (or equivalently [DLMF]_) for details.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "n : array_like\n"
+    "    Degree of the polynomial. If not an integer, the result is\n"
+    "    determined via the relation to the Gauss hypergeometric\n"
+    "    function.\n"
+    "x : array_like\n"
+    "    Points at which to evaluate the Chebyshev polynomial\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "U : scalar or ndarray\n"
+    "    Values of the Chebyshev polynomial\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "roots_chebyu : roots and quadrature weights of Chebyshev\n"
+    "               polynomials of the second kind\n"
+    "chebyu : Chebyshev polynomial object\n"
+    "eval_chebyt : evaluate Chebyshev polynomials of the first kind\n"
+    "hyp2f1 : Gauss hypergeometric function\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [AS] Milton Abramowitz and Irene A. Stegun, eds.\n"
+    "    Handbook of Mathematical Functions with Formulas,\n"
+    "    Graphs, and Mathematical Tables. New York: Dover, 1972.\n"
+    ".. [DLMF] NIST Digital Library of Mathematical Functions,\n"
+    "    https://dlmf.nist.gov/18.5.E11_4")
+ufunc_eval_chebyu_loops[0] = <np.PyUFuncGenericFunction>loop_d_pd__As_pd_d
+ufunc_eval_chebyu_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+ufunc_eval_chebyu_loops[2] = <np.PyUFuncGenericFunction>loop_D_dD__As_fF_F
+ufunc_eval_chebyu_loops[3] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc_eval_chebyu_loops[4] = <np.PyUFuncGenericFunction>loop_D_dD__As_dD_D
+ufunc_eval_chebyu_types[0] = <char>NPY_INTP  # signature 0: (intp, double) -> double
+ufunc_eval_chebyu_types[1] = <char>NPY_DOUBLE
+ufunc_eval_chebyu_types[2] = <char>NPY_DOUBLE
+ufunc_eval_chebyu_types[3] = <char>NPY_FLOAT  # signature 1: (float, float) -> float
+ufunc_eval_chebyu_types[4] = <char>NPY_FLOAT
+ufunc_eval_chebyu_types[5] = <char>NPY_FLOAT
+ufunc_eval_chebyu_types[6] = <char>NPY_FLOAT  # signature 2: (float, cfloat) -> cfloat
+ufunc_eval_chebyu_types[7] = <char>NPY_CFLOAT
+ufunc_eval_chebyu_types[8] = <char>NPY_CFLOAT
+ufunc_eval_chebyu_types[9] = <char>NPY_DOUBLE  # signature 3: (double, double) -> double
+ufunc_eval_chebyu_types[10] = <char>NPY_DOUBLE
+ufunc_eval_chebyu_types[11] = <char>NPY_DOUBLE
+ufunc_eval_chebyu_types[12] = <char>NPY_DOUBLE  # signature 4: (double, cdouble) -> cdouble
+ufunc_eval_chebyu_types[13] = <char>NPY_CDOUBLE
+ufunc_eval_chebyu_types[14] = <char>NPY_CDOUBLE
+ufunc_eval_chebyu_ptr[2*0] = <void*>_func_eval_chebyu_l  # even slots hold the kernel, odd slots the ufunc name
+ufunc_eval_chebyu_ptr[2*0+1] = <void*>(<char*>"eval_chebyu")
+ufunc_eval_chebyu_ptr[2*1] = <void*>_func_eval_chebyu[double]  # the float signature is backed by the double kernel
+ufunc_eval_chebyu_ptr[2*1+1] = <void*>(<char*>"eval_chebyu")
+ufunc_eval_chebyu_ptr[2*2] = <void*>_func_eval_chebyu[double_complex]  # the cfloat signature is backed by the double_complex kernel
+ufunc_eval_chebyu_ptr[2*2+1] = <void*>(<char*>"eval_chebyu")
+ufunc_eval_chebyu_ptr[2*3] = <void*>_func_eval_chebyu[double]
+ufunc_eval_chebyu_ptr[2*3+1] = <void*>(<char*>"eval_chebyu")
+ufunc_eval_chebyu_ptr[2*4] = <void*>_func_eval_chebyu[double_complex]
+ufunc_eval_chebyu_ptr[2*4+1] = <void*>(<char*>"eval_chebyu")
+ufunc_eval_chebyu_data[0] = &ufunc_eval_chebyu_ptr[2*0]  # data[i] points at signature i's (kernel, name) pair
+ufunc_eval_chebyu_data[1] = &ufunc_eval_chebyu_ptr[2*1]
+ufunc_eval_chebyu_data[2] = &ufunc_eval_chebyu_ptr[2*2]
+ufunc_eval_chebyu_data[3] = &ufunc_eval_chebyu_ptr[2*3]
+ufunc_eval_chebyu_data[4] = &ufunc_eval_chebyu_ptr[2*4]
+eval_chebyu = np.PyUFunc_FromFuncAndData(ufunc_eval_chebyu_loops, ufunc_eval_chebyu_data, ufunc_eval_chebyu_types, 5, 2, 1, 0, 'eval_chebyu', ufunc_eval_chebyu_doc, 0)  # ntypes=5, nin=2, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc_eval_gegenbauer_loops[5]  # tables backing the eval_gegenbauer ufunc: one inner loop per type signature
+cdef void *ufunc_eval_gegenbauer_ptr[10]  # (kernel pointer, name string) pair per signature
+cdef void *ufunc_eval_gegenbauer_data[5]  # per-signature pointer into ufunc_eval_gegenbauer_ptr
+cdef char ufunc_eval_gegenbauer_types[20]  # 5 signatures x (3 inputs + 1 output) type codes
+cdef char *ufunc_eval_gegenbauer_doc = (
+    "eval_gegenbauer(n, alpha, x, out=None)\n"
+    "\n"
+    "Evaluate Gegenbauer polynomial at a point.\n"
+    "\n"
+    "The Gegenbauer polynomials can be defined via the Gauss\n"
+    "hypergeometric function :math:`{}_2F_1` as\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    C_n^{(\\alpha)}(x) = \\frac{(2\\alpha)_n}{\\Gamma(n + 1)}\n"
+    "      {}_2F_1(-n, 2\\alpha + n; \\alpha + 1/2; (1 - x)/2).\n"
+    "\n"
+    "When :math:`n` is an integer the result is a polynomial of degree\n"
+    ":math:`n`. See 22.5.46 in [AS]_ (or equivalently [DLMF]_) for details.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "n : array_like\n"
+    "    Degree of the polynomial. If not an integer, the result is\n"
+    "    determined via the relation to the Gauss hypergeometric\n"
+    "    function.\n"
+    "alpha : array_like\n"
+    "    Parameter\n"
+    "x : array_like\n"
+    "    Points at which to evaluate the Gegenbauer polynomial\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "C : scalar or ndarray\n"
+    "    Values of the Gegenbauer polynomial\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "roots_gegenbauer : roots and quadrature weights of Gegenbauer\n"
+    "                   polynomials\n"
+    "gegenbauer : Gegenbauer polynomial object\n"
+    "hyp2f1 : Gauss hypergeometric function\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [AS] Milton Abramowitz and Irene A. Stegun, eds.\n"
+    "    Handbook of Mathematical Functions with Formulas,\n"
+    "    Graphs, and Mathematical Tables. New York: Dover, 1972.\n"
+    ".. [DLMF] NIST Digital Library of Mathematical Functions,\n"
+    "    https://dlmf.nist.gov/18.5.E9")
+ufunc_eval_gegenbauer_loops[0] = <np.PyUFuncGenericFunction>loop_d_pdd__As_pdd_d
+ufunc_eval_gegenbauer_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f
+ufunc_eval_gegenbauer_loops[2] = <np.PyUFuncGenericFunction>loop_D_ddD__As_ffF_F
+ufunc_eval_gegenbauer_loops[3] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc_eval_gegenbauer_loops[4] = <np.PyUFuncGenericFunction>loop_D_ddD__As_ddD_D
+ufunc_eval_gegenbauer_types[0] = <char>NPY_INTP  # signature 0: (intp, double, double) -> double
+ufunc_eval_gegenbauer_types[1] = <char>NPY_DOUBLE
+ufunc_eval_gegenbauer_types[2] = <char>NPY_DOUBLE
+ufunc_eval_gegenbauer_types[3] = <char>NPY_DOUBLE
+ufunc_eval_gegenbauer_types[4] = <char>NPY_FLOAT  # signature 1: (float, float, float) -> float
+ufunc_eval_gegenbauer_types[5] = <char>NPY_FLOAT
+ufunc_eval_gegenbauer_types[6] = <char>NPY_FLOAT
+ufunc_eval_gegenbauer_types[7] = <char>NPY_FLOAT
+ufunc_eval_gegenbauer_types[8] = <char>NPY_FLOAT  # signature 2: (float, float, cfloat) -> cfloat
+ufunc_eval_gegenbauer_types[9] = <char>NPY_FLOAT
+ufunc_eval_gegenbauer_types[10] = <char>NPY_CFLOAT
+ufunc_eval_gegenbauer_types[11] = <char>NPY_CFLOAT
+ufunc_eval_gegenbauer_types[12] = <char>NPY_DOUBLE  # signature 3: (double, double, double) -> double
+ufunc_eval_gegenbauer_types[13] = <char>NPY_DOUBLE
+ufunc_eval_gegenbauer_types[14] = <char>NPY_DOUBLE
+ufunc_eval_gegenbauer_types[15] = <char>NPY_DOUBLE
+ufunc_eval_gegenbauer_types[16] = <char>NPY_DOUBLE  # signature 4: (double, double, cdouble) -> cdouble
+ufunc_eval_gegenbauer_types[17] = <char>NPY_DOUBLE
+ufunc_eval_gegenbauer_types[18] = <char>NPY_CDOUBLE
+ufunc_eval_gegenbauer_types[19] = <char>NPY_CDOUBLE
+ufunc_eval_gegenbauer_ptr[2*0] = <void*>_func_eval_gegenbauer_l  # even slots hold the kernel, odd slots the ufunc name
+ufunc_eval_gegenbauer_ptr[2*0+1] = <void*>(<char*>"eval_gegenbauer")
+ufunc_eval_gegenbauer_ptr[2*1] = <void*>_func_eval_gegenbauer[double]  # the float signature is backed by the double kernel
+ufunc_eval_gegenbauer_ptr[2*1+1] = <void*>(<char*>"eval_gegenbauer")
+ufunc_eval_gegenbauer_ptr[2*2] = <void*>_func_eval_gegenbauer[double_complex]  # the cfloat signature is backed by the double_complex kernel
+ufunc_eval_gegenbauer_ptr[2*2+1] = <void*>(<char*>"eval_gegenbauer")
+ufunc_eval_gegenbauer_ptr[2*3] = <void*>_func_eval_gegenbauer[double]
+ufunc_eval_gegenbauer_ptr[2*3+1] = <void*>(<char*>"eval_gegenbauer")
+ufunc_eval_gegenbauer_ptr[2*4] = <void*>_func_eval_gegenbauer[double_complex]
+ufunc_eval_gegenbauer_ptr[2*4+1] = <void*>(<char*>"eval_gegenbauer")
+ufunc_eval_gegenbauer_data[0] = &ufunc_eval_gegenbauer_ptr[2*0]  # data[i] points at signature i's (kernel, name) pair
+ufunc_eval_gegenbauer_data[1] = &ufunc_eval_gegenbauer_ptr[2*1]
+ufunc_eval_gegenbauer_data[2] = &ufunc_eval_gegenbauer_ptr[2*2]
+ufunc_eval_gegenbauer_data[3] = &ufunc_eval_gegenbauer_ptr[2*3]
+ufunc_eval_gegenbauer_data[4] = &ufunc_eval_gegenbauer_ptr[2*4]
+eval_gegenbauer = np.PyUFunc_FromFuncAndData(ufunc_eval_gegenbauer_loops, ufunc_eval_gegenbauer_data, ufunc_eval_gegenbauer_types, 5, 3, 1, 0, 'eval_gegenbauer', ufunc_eval_gegenbauer_doc, 0)  # ntypes=5, nin=3, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc_eval_genlaguerre_loops[5]  # tables backing the eval_genlaguerre ufunc: one inner loop per type signature
+cdef void *ufunc_eval_genlaguerre_ptr[10]  # (kernel pointer, name string) pair per signature
+cdef void *ufunc_eval_genlaguerre_data[5]  # per-signature pointer into ufunc_eval_genlaguerre_ptr
+cdef char ufunc_eval_genlaguerre_types[20]  # 5 signatures x (3 inputs + 1 output) type codes
+cdef char *ufunc_eval_genlaguerre_doc = (
+    "eval_genlaguerre(n, alpha, x, out=None)\n"
+    "\n"
+    "Evaluate generalized Laguerre polynomial at a point.\n"
+    "\n"
+    "The generalized Laguerre polynomials can be defined via the\n"
+    "confluent hypergeometric function :math:`{}_1F_1` as\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    L_n^{(\\alpha)}(x) = \\binom{n + \\alpha}{n}\n"
+    "      {}_1F_1(-n, \\alpha + 1, x).\n"
+    "\n"
+    "When :math:`n` is an integer the result is a polynomial of degree\n"
+    ":math:`n`. See 22.5.54 in [AS]_ or [DLMF]_ for details. The Laguerre\n"
+    "polynomials are the special case where :math:`\\alpha = 0`.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "n : array_like\n"
+    "    Degree of the polynomial. If not an integer, the result is\n"
+    "    determined via the relation to the confluent hypergeometric\n"
+    "    function.\n"
+    "alpha : array_like\n"
+    "    Parameter; must have ``alpha > -1``\n"
+    "x : array_like\n"
+    "    Points at which to evaluate the generalized Laguerre\n"
+    "    polynomial\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "L : scalar or ndarray\n"
+    "    Values of the generalized Laguerre polynomial\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "roots_genlaguerre : roots and quadrature weights of generalized\n"
+    "                    Laguerre polynomials\n"
+    "genlaguerre : generalized Laguerre polynomial object\n"
+    "hyp1f1 : confluent hypergeometric function\n"
+    "eval_laguerre : evaluate Laguerre polynomials\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [AS] Milton Abramowitz and Irene A. Stegun, eds.\n"
+    "    Handbook of Mathematical Functions with Formulas,\n"
+    "    Graphs, and Mathematical Tables. New York: Dover, 1972.\n"
+    ".. [DLMF] NIST Digital Library of Mathematical Functions,\n"
+    "    https://dlmf.nist.gov/18.5.E12")
+ufunc_eval_genlaguerre_loops[0] = <np.PyUFuncGenericFunction>loop_d_pdd__As_pdd_d
+ufunc_eval_genlaguerre_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f
+ufunc_eval_genlaguerre_loops[2] = <np.PyUFuncGenericFunction>loop_D_ddD__As_ffF_F
+ufunc_eval_genlaguerre_loops[3] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc_eval_genlaguerre_loops[4] = <np.PyUFuncGenericFunction>loop_D_ddD__As_ddD_D
+ufunc_eval_genlaguerre_types[0] = <char>NPY_INTP  # signature 0: (intp, double, double) -> double
+ufunc_eval_genlaguerre_types[1] = <char>NPY_DOUBLE
+ufunc_eval_genlaguerre_types[2] = <char>NPY_DOUBLE
+ufunc_eval_genlaguerre_types[3] = <char>NPY_DOUBLE
+ufunc_eval_genlaguerre_types[4] = <char>NPY_FLOAT  # signature 1: (float, float, float) -> float
+ufunc_eval_genlaguerre_types[5] = <char>NPY_FLOAT
+ufunc_eval_genlaguerre_types[6] = <char>NPY_FLOAT
+ufunc_eval_genlaguerre_types[7] = <char>NPY_FLOAT
+ufunc_eval_genlaguerre_types[8] = <char>NPY_FLOAT  # signature 2: (float, float, cfloat) -> cfloat
+ufunc_eval_genlaguerre_types[9] = <char>NPY_FLOAT
+ufunc_eval_genlaguerre_types[10] = <char>NPY_CFLOAT
+ufunc_eval_genlaguerre_types[11] = <char>NPY_CFLOAT
+ufunc_eval_genlaguerre_types[12] = <char>NPY_DOUBLE  # signature 3: (double, double, double) -> double
+ufunc_eval_genlaguerre_types[13] = <char>NPY_DOUBLE
+ufunc_eval_genlaguerre_types[14] = <char>NPY_DOUBLE
+ufunc_eval_genlaguerre_types[15] = <char>NPY_DOUBLE
+ufunc_eval_genlaguerre_types[16] = <char>NPY_DOUBLE  # signature 4: (double, double, cdouble) -> cdouble
+ufunc_eval_genlaguerre_types[17] = <char>NPY_DOUBLE
+ufunc_eval_genlaguerre_types[18] = <char>NPY_CDOUBLE
+ufunc_eval_genlaguerre_types[19] = <char>NPY_CDOUBLE
+ufunc_eval_genlaguerre_ptr[2*0] = <void*>_func_eval_genlaguerre_l  # even slots hold the kernel, odd slots the ufunc name
+ufunc_eval_genlaguerre_ptr[2*0+1] = <void*>(<char*>"eval_genlaguerre")
+ufunc_eval_genlaguerre_ptr[2*1] = <void*>_func_eval_genlaguerre[double]  # the float signature is backed by the double kernel
+ufunc_eval_genlaguerre_ptr[2*1+1] = <void*>(<char*>"eval_genlaguerre")
+ufunc_eval_genlaguerre_ptr[2*2] = <void*>_func_eval_genlaguerre[double_complex]  # the cfloat signature is backed by the double_complex kernel
+ufunc_eval_genlaguerre_ptr[2*2+1] = <void*>(<char*>"eval_genlaguerre")
+ufunc_eval_genlaguerre_ptr[2*3] = <void*>_func_eval_genlaguerre[double]
+ufunc_eval_genlaguerre_ptr[2*3+1] = <void*>(<char*>"eval_genlaguerre")
+ufunc_eval_genlaguerre_ptr[2*4] = <void*>_func_eval_genlaguerre[double_complex]
+ufunc_eval_genlaguerre_ptr[2*4+1] = <void*>(<char*>"eval_genlaguerre")
+ufunc_eval_genlaguerre_data[0] = &ufunc_eval_genlaguerre_ptr[2*0]  # data[i] points at signature i's (kernel, name) pair
+ufunc_eval_genlaguerre_data[1] = &ufunc_eval_genlaguerre_ptr[2*1]
+ufunc_eval_genlaguerre_data[2] = &ufunc_eval_genlaguerre_ptr[2*2]
+ufunc_eval_genlaguerre_data[3] = &ufunc_eval_genlaguerre_ptr[2*3]
+ufunc_eval_genlaguerre_data[4] = &ufunc_eval_genlaguerre_ptr[2*4]
+eval_genlaguerre = np.PyUFunc_FromFuncAndData(ufunc_eval_genlaguerre_loops, ufunc_eval_genlaguerre_data, ufunc_eval_genlaguerre_types, 5, 3, 1, 0, 'eval_genlaguerre', ufunc_eval_genlaguerre_doc, 0)  # ntypes=5, nin=3, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc_eval_hermite_loops[1]  # tables backing the eval_hermite ufunc: a single type signature
+cdef void *ufunc_eval_hermite_ptr[2]  # one (kernel pointer, name string) pair
+cdef void *ufunc_eval_hermite_data[1]  # pointer into ufunc_eval_hermite_ptr for the one signature
+cdef char ufunc_eval_hermite_types[3]  # 1 signature x (2 inputs + 1 output) type codes
+cdef char *ufunc_eval_hermite_doc = (
+    "eval_hermite(n, x, out=None)\n"
+    "\n"
+    "Evaluate physicist's Hermite polynomial at a point.\n"
+    "\n"
+    "Defined by\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    H_n(x) = (-1)^n e^{x^2} \\frac{d^n}{dx^n} e^{-x^2};\n"
+    "\n"
+    ":math:`H_n` is a polynomial of degree :math:`n`. See 22.11.7 in\n"
+    "[AS]_ or [DLMF]_ for details.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "n : array_like\n"
+    "    Degree of the polynomial\n"
+    "x : array_like\n"
+    "    Points at which to evaluate the Hermite polynomial\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "H : scalar or ndarray\n"
+    "    Values of the Hermite polynomial\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "roots_hermite : roots and quadrature weights of physicist's\n"
+    "                Hermite polynomials\n"
+    "hermite : physicist's Hermite polynomial object\n"
+    "numpy.polynomial.hermite.Hermite : Physicist's Hermite series\n"
+    "eval_hermitenorm : evaluate Probabilist's Hermite polynomials\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [AS] Milton Abramowitz and Irene A. Stegun, eds.\n"
+    "    Handbook of Mathematical Functions with Formulas,\n"
+    "    Graphs, and Mathematical Tables. New York: Dover, 1972.\n"
+    ".. [DLMF] NIST Digital Library of Mathematical Functions,\n"
+    "    https://dlmf.nist.gov/18.5.T1")
+ufunc_eval_hermite_loops[0] = <np.PyUFuncGenericFunction>loop_d_pd__As_pd_d
+ufunc_eval_hermite_types[0] = <char>NPY_INTP  # only signature: (intp, double) -> double
+ufunc_eval_hermite_types[1] = <char>NPY_DOUBLE
+ufunc_eval_hermite_types[2] = <char>NPY_DOUBLE
+ufunc_eval_hermite_ptr[2*0] = <void*>_func_eval_hermite  # even slot holds the kernel, odd slot the ufunc name
+ufunc_eval_hermite_ptr[2*0+1] = <void*>(<char*>"eval_hermite")
+ufunc_eval_hermite_data[0] = &ufunc_eval_hermite_ptr[2*0]  # points at the (kernel, name) pair
+eval_hermite = np.PyUFunc_FromFuncAndData(ufunc_eval_hermite_loops, ufunc_eval_hermite_data, ufunc_eval_hermite_types, 1, 2, 1, 0, 'eval_hermite', ufunc_eval_hermite_doc, 0)  # ntypes=1, nin=2, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc_eval_hermitenorm_loops[1]  # tables backing the eval_hermitenorm ufunc: a single type signature
+cdef void *ufunc_eval_hermitenorm_ptr[2]  # one (kernel pointer, name string) pair
+cdef void *ufunc_eval_hermitenorm_data[1]  # pointer into ufunc_eval_hermitenorm_ptr for the one signature
+cdef char ufunc_eval_hermitenorm_types[3]  # 1 signature x (2 inputs + 1 output) type codes
+cdef char *ufunc_eval_hermitenorm_doc = (
+    "eval_hermitenorm(n, x, out=None)\n"
+    "\n"
+    "Evaluate probabilist's (normalized) Hermite polynomial at a\n"
+    "point.\n"
+    "\n"
+    "Defined by\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    He_n(x) = (-1)^n e^{x^2/2} \\frac{d^n}{dx^n} e^{-x^2/2};\n"
+    "\n"
+    ":math:`He_n` is a polynomial of degree :math:`n`. See 22.11.8 in\n"
+    "[AS]_ or [DLMF]_ for details.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "n : array_like\n"
+    "    Degree of the polynomial\n"
+    "x : array_like\n"
+    "    Points at which to evaluate the Hermite polynomial\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "He : scalar or ndarray\n"
+    "    Values of the Hermite polynomial\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "roots_hermitenorm : roots and quadrature weights of probabilist's\n"
+    "                    Hermite polynomials\n"
+    "hermitenorm : probabilist's Hermite polynomial object\n"
+    "numpy.polynomial.hermite_e.HermiteE : Probabilist's Hermite series\n"
+    "eval_hermite : evaluate physicist's Hermite polynomials\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [AS] Milton Abramowitz and Irene A. Stegun, eds.\n"
+    "    Handbook of Mathematical Functions with Formulas,\n"
+    "    Graphs, and Mathematical Tables. New York: Dover, 1972.\n"
+    ".. [DLMF] NIST Digital Library of Mathematical Functions,\n"
+    "    https://dlmf.nist.gov/18.5.T1")
+ufunc_eval_hermitenorm_loops[0] = <np.PyUFuncGenericFunction>loop_d_pd__As_pd_d
+ufunc_eval_hermitenorm_types[0] = <char>NPY_INTP  # only signature: (intp, double) -> double
+ufunc_eval_hermitenorm_types[1] = <char>NPY_DOUBLE
+ufunc_eval_hermitenorm_types[2] = <char>NPY_DOUBLE
+ufunc_eval_hermitenorm_ptr[2*0] = <void*>_func_eval_hermitenorm  # even slot holds the kernel, odd slot the ufunc name
+ufunc_eval_hermitenorm_ptr[2*0+1] = <void*>(<char*>"eval_hermitenorm")
+ufunc_eval_hermitenorm_data[0] = &ufunc_eval_hermitenorm_ptr[2*0]  # points at the (kernel, name) pair
+eval_hermitenorm = np.PyUFunc_FromFuncAndData(ufunc_eval_hermitenorm_loops, ufunc_eval_hermitenorm_data, ufunc_eval_hermitenorm_types, 1, 2, 1, 0, 'eval_hermitenorm', ufunc_eval_hermitenorm_doc, 0)  # ntypes=1, nin=2, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc_eval_jacobi_loops[5]  # tables backing the eval_jacobi ufunc: one inner loop per type signature
+cdef void *ufunc_eval_jacobi_ptr[10]  # (kernel pointer, name string) pair per signature
+cdef void *ufunc_eval_jacobi_data[5]  # per-signature pointer into ufunc_eval_jacobi_ptr
+cdef char ufunc_eval_jacobi_types[25]  # 5 signatures x (4 inputs + 1 output) type codes
+cdef char *ufunc_eval_jacobi_doc = (
+    "eval_jacobi(n, alpha, beta, x, out=None)\n"
+    "\n"
+    "Evaluate Jacobi polynomial at a point.\n"
+    "\n"
+    "The Jacobi polynomials can be defined via the Gauss hypergeometric\n"
+    "function :math:`{}_2F_1` as\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    P_n^{(\\alpha, \\beta)}(x) = \\frac{(\\alpha + 1)_n}{\\Gamma(n + 1)}\n"
+    "      {}_2F_1(-n, 1 + \\alpha + \\beta + n; \\alpha + 1; (1 - x)/2)\n"
+    "\n"
+    "where :math:`(\\cdot)_n` is the Pochhammer symbol; see `poch`. When\n"
+    ":math:`n` is an integer the result is a polynomial of degree\n"
+    ":math:`n`. See 22.5.42 in [AS]_ or [DLMF]_ for details.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "n : array_like\n"
+    "    Degree of the polynomial. If not an integer the result is\n"
+    "    determined via the relation to the Gauss hypergeometric\n"
+    "    function.\n"
+    "alpha : array_like\n"
+    "    Parameter\n"
+    "beta : array_like\n"
+    "    Parameter\n"
+    "x : array_like\n"
+    "    Points at which to evaluate the polynomial\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "P : scalar or ndarray\n"
+    "    Values of the Jacobi polynomial\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "roots_jacobi : roots and quadrature weights of Jacobi polynomials\n"
+    "jacobi : Jacobi polynomial object\n"
+    "hyp2f1 : Gauss hypergeometric function\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [AS] Milton Abramowitz and Irene A. Stegun, eds.\n"
+    "    Handbook of Mathematical Functions with Formulas,\n"
+    "    Graphs, and Mathematical Tables. New York: Dover, 1972.\n"
+    ".. [DLMF] NIST Digital Library of Mathematical Functions,\n"
+    "    https://dlmf.nist.gov/18.5.E7")
+ufunc_eval_jacobi_loops[0] = <np.PyUFuncGenericFunction>loop_d_pddd__As_pddd_d
+ufunc_eval_jacobi_loops[1] = <np.PyUFuncGenericFunction>loop_d_dddd__As_ffff_f
+ufunc_eval_jacobi_loops[2] = <np.PyUFuncGenericFunction>loop_D_dddD__As_fffF_F
+ufunc_eval_jacobi_loops[3] = <np.PyUFuncGenericFunction>loop_d_dddd__As_dddd_d
+ufunc_eval_jacobi_loops[4] = <np.PyUFuncGenericFunction>loop_D_dddD__As_dddD_D
+ufunc_eval_jacobi_types[0] = <char>NPY_INTP  # signature 0: (intp, double, double, double) -> double
+ufunc_eval_jacobi_types[1] = <char>NPY_DOUBLE
+ufunc_eval_jacobi_types[2] = <char>NPY_DOUBLE
+ufunc_eval_jacobi_types[3] = <char>NPY_DOUBLE
+ufunc_eval_jacobi_types[4] = <char>NPY_DOUBLE
+ufunc_eval_jacobi_types[5] = <char>NPY_FLOAT  # signature 1: (float, float, float, float) -> float
+ufunc_eval_jacobi_types[6] = <char>NPY_FLOAT
+ufunc_eval_jacobi_types[7] = <char>NPY_FLOAT
+ufunc_eval_jacobi_types[8] = <char>NPY_FLOAT
+ufunc_eval_jacobi_types[9] = <char>NPY_FLOAT
+ufunc_eval_jacobi_types[10] = <char>NPY_FLOAT  # signature 2: (float, float, float, cfloat) -> cfloat
+ufunc_eval_jacobi_types[11] = <char>NPY_FLOAT
+ufunc_eval_jacobi_types[12] = <char>NPY_FLOAT
+ufunc_eval_jacobi_types[13] = <char>NPY_CFLOAT
+ufunc_eval_jacobi_types[14] = <char>NPY_CFLOAT
+ufunc_eval_jacobi_types[15] = <char>NPY_DOUBLE  # signature 3: (double, double, double, double) -> double
+ufunc_eval_jacobi_types[16] = <char>NPY_DOUBLE
+ufunc_eval_jacobi_types[17] = <char>NPY_DOUBLE
+ufunc_eval_jacobi_types[18] = <char>NPY_DOUBLE
+ufunc_eval_jacobi_types[19] = <char>NPY_DOUBLE
+ufunc_eval_jacobi_types[20] = <char>NPY_DOUBLE  # signature 4: (double, double, double, cdouble) -> cdouble
+ufunc_eval_jacobi_types[21] = <char>NPY_DOUBLE
+ufunc_eval_jacobi_types[22] = <char>NPY_DOUBLE
+ufunc_eval_jacobi_types[23] = <char>NPY_CDOUBLE
+ufunc_eval_jacobi_types[24] = <char>NPY_CDOUBLE
+ufunc_eval_jacobi_ptr[2*0] = <void*>_func_eval_jacobi_l  # even slots hold the kernel, odd slots the ufunc name
+ufunc_eval_jacobi_ptr[2*0+1] = <void*>(<char*>"eval_jacobi")
+ufunc_eval_jacobi_ptr[2*1] = <void*>_func_eval_jacobi[double]  # the float signature is backed by the double kernel
+ufunc_eval_jacobi_ptr[2*1+1] = <void*>(<char*>"eval_jacobi")
+ufunc_eval_jacobi_ptr[2*2] = <void*>_func_eval_jacobi[double_complex]  # the cfloat signature is backed by the double_complex kernel
+ufunc_eval_jacobi_ptr[2*2+1] = <void*>(<char*>"eval_jacobi")
+ufunc_eval_jacobi_ptr[2*3] = <void*>_func_eval_jacobi[double]
+ufunc_eval_jacobi_ptr[2*3+1] = <void*>(<char*>"eval_jacobi")
+ufunc_eval_jacobi_ptr[2*4] = <void*>_func_eval_jacobi[double_complex]
+ufunc_eval_jacobi_ptr[2*4+1] = <void*>(<char*>"eval_jacobi")
+ufunc_eval_jacobi_data[0] = &ufunc_eval_jacobi_ptr[2*0]  # data[i] points at signature i's (kernel, name) pair
+ufunc_eval_jacobi_data[1] = &ufunc_eval_jacobi_ptr[2*1]
+ufunc_eval_jacobi_data[2] = &ufunc_eval_jacobi_ptr[2*2]
+ufunc_eval_jacobi_data[3] = &ufunc_eval_jacobi_ptr[2*3]
+ufunc_eval_jacobi_data[4] = &ufunc_eval_jacobi_ptr[2*4]
+eval_jacobi = np.PyUFunc_FromFuncAndData(ufunc_eval_jacobi_loops, ufunc_eval_jacobi_data, ufunc_eval_jacobi_types, 5, 4, 1, 0, 'eval_jacobi', ufunc_eval_jacobi_doc, 0)  # ntypes=5, nin=4, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc_eval_laguerre_loops[5]  # tables backing the eval_laguerre ufunc: one inner loop per type signature
+cdef void *ufunc_eval_laguerre_ptr[10]  # (kernel pointer, name string) pair per signature
+cdef void *ufunc_eval_laguerre_data[5]  # per-signature pointer into ufunc_eval_laguerre_ptr
+cdef char ufunc_eval_laguerre_types[15]  # 5 signatures x (2 inputs + 1 output) type codes
+cdef char *ufunc_eval_laguerre_doc = (
+    "eval_laguerre(n, x, out=None)\n"
+    "\n"
+    "Evaluate Laguerre polynomial at a point.\n"
+    "\n"
+    "The Laguerre polynomials can be defined via the confluent\n"
+    "hypergeometric function :math:`{}_1F_1` as\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    L_n(x) = {}_1F_1(-n, 1, x).\n"
+    "\n"
+    "See 22.5.16 and 22.5.54 in [AS]_ (or equivalently [DLMF1]_ and [DLMF2]_)\n"
+    "for details. When :math:`n` is an integer the result is a polynomial\n"
+    "of degree :math:`n`.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "n : array_like\n"
+    "    Degree of the polynomial. If not an integer the result is\n"
+    "    determined via the relation to the confluent hypergeometric\n"
+    "    function.\n"
+    "x : array_like\n"
+    "    Points at which to evaluate the Laguerre polynomial\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "L : scalar or ndarray\n"
+    "    Values of the Laguerre polynomial\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "roots_laguerre : roots and quadrature weights of Laguerre\n"
+    "                 polynomials\n"
+    "laguerre : Laguerre polynomial object\n"
+    "numpy.polynomial.laguerre.Laguerre : Laguerre series\n"
+    "eval_genlaguerre : evaluate generalized Laguerre polynomials\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [AS] Milton Abramowitz and Irene A. Stegun, eds.\n"
+    "    Handbook of Mathematical Functions with Formulas,\n"
+    "    Graphs, and Mathematical Tables. New York: Dover, 1972.\n"
+    ".. [DLMF1] NIST Digital Library of Mathematical Functions,\n"
+    "    https://dlmf.nist.gov/18.1#I1.ix7.p1\n"
+    ".. [DLMF2] NIST Digital Library of Mathematical Functions,\n"
+    "    https://dlmf.nist.gov/18.5.E12")
+ufunc_eval_laguerre_loops[0] = <np.PyUFuncGenericFunction>loop_d_pd__As_pd_d
+ufunc_eval_laguerre_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+ufunc_eval_laguerre_loops[2] = <np.PyUFuncGenericFunction>loop_D_dD__As_fF_F
+ufunc_eval_laguerre_loops[3] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc_eval_laguerre_loops[4] = <np.PyUFuncGenericFunction>loop_D_dD__As_dD_D
+ufunc_eval_laguerre_types[0] = <char>NPY_INTP  # signature 0: (intp, double) -> double
+ufunc_eval_laguerre_types[1] = <char>NPY_DOUBLE
+ufunc_eval_laguerre_types[2] = <char>NPY_DOUBLE
+ufunc_eval_laguerre_types[3] = <char>NPY_FLOAT  # signature 1: (float, float) -> float
+ufunc_eval_laguerre_types[4] = <char>NPY_FLOAT
+ufunc_eval_laguerre_types[5] = <char>NPY_FLOAT
+ufunc_eval_laguerre_types[6] = <char>NPY_FLOAT  # signature 2: (float, cfloat) -> cfloat
+ufunc_eval_laguerre_types[7] = <char>NPY_CFLOAT
+ufunc_eval_laguerre_types[8] = <char>NPY_CFLOAT
+ufunc_eval_laguerre_types[9] = <char>NPY_DOUBLE  # signature 3: (double, double) -> double
+ufunc_eval_laguerre_types[10] = <char>NPY_DOUBLE
+ufunc_eval_laguerre_types[11] = <char>NPY_DOUBLE
+ufunc_eval_laguerre_types[12] = <char>NPY_DOUBLE  # signature 4: (double, cdouble) -> cdouble
+ufunc_eval_laguerre_types[13] = <char>NPY_CDOUBLE
+ufunc_eval_laguerre_types[14] = <char>NPY_CDOUBLE
+ufunc_eval_laguerre_ptr[2*0] = <void*>_func_eval_laguerre_l  # even slots hold the kernel, odd slots the ufunc name
+ufunc_eval_laguerre_ptr[2*0+1] = <void*>(<char*>"eval_laguerre")
+ufunc_eval_laguerre_ptr[2*1] = <void*>_func_eval_laguerre[double]  # the float signature is backed by the double kernel
+ufunc_eval_laguerre_ptr[2*1+1] = <void*>(<char*>"eval_laguerre")
+ufunc_eval_laguerre_ptr[2*2] = <void*>_func_eval_laguerre[double_complex]  # the cfloat signature is backed by the double_complex kernel
+ufunc_eval_laguerre_ptr[2*2+1] = <void*>(<char*>"eval_laguerre")
+ufunc_eval_laguerre_ptr[2*3] = <void*>_func_eval_laguerre[double]
+ufunc_eval_laguerre_ptr[2*3+1] = <void*>(<char*>"eval_laguerre")
+ufunc_eval_laguerre_ptr[2*4] = <void*>_func_eval_laguerre[double_complex]
+ufunc_eval_laguerre_ptr[2*4+1] = <void*>(<char*>"eval_laguerre")
+ufunc_eval_laguerre_data[0] = &ufunc_eval_laguerre_ptr[2*0]  # data[i] points at signature i's (kernel, name) pair
+ufunc_eval_laguerre_data[1] = &ufunc_eval_laguerre_ptr[2*1]
+ufunc_eval_laguerre_data[2] = &ufunc_eval_laguerre_ptr[2*2]
+ufunc_eval_laguerre_data[3] = &ufunc_eval_laguerre_ptr[2*3]
+ufunc_eval_laguerre_data[4] = &ufunc_eval_laguerre_ptr[2*4]
+eval_laguerre = np.PyUFunc_FromFuncAndData(ufunc_eval_laguerre_loops, ufunc_eval_laguerre_data, ufunc_eval_laguerre_types, 5, 2, 1, 0, 'eval_laguerre', ufunc_eval_laguerre_doc, 0)  # ntypes=5, nin=2, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc_eval_legendre_loops[5]  # eval_legendre ufunc tables: one inner-loop function per registered type signature
+cdef void *ufunc_eval_legendre_ptr[10]  # two slots per loop: [2*i] kernel pointer, [2*i+1] ufunc name string
+cdef void *ufunc_eval_legendre_data[5]  # per-loop data pointers; each one points into ufunc_eval_legendre_ptr
+cdef char ufunc_eval_legendre_types[15]  # 5 loops x 3 type codes (2 inputs followed by 1 output)
+cdef char *ufunc_eval_legendre_doc = (  # docstring handed to PyUFunc_FromFuncAndData below
+    "eval_legendre(n, x, out=None)\n"
+    "\n"
+    "Evaluate Legendre polynomial at a point.\n"
+    "\n"
+    "The Legendre polynomials can be defined via the Gauss\n"
+    "hypergeometric function :math:`{}_2F_1` as\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    P_n(x) = {}_2F_1(-n, n + 1; 1; (1 - x)/2).\n"
+    "\n"
+    "When :math:`n` is an integer the result is a polynomial of degree\n"
+    ":math:`n`. See 22.5.49 in [AS]_ (or equivalently [DLMF]_) for details.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "n : array_like\n"
+    "    Degree of the polynomial. If not an integer, the result is\n"
+    "    determined via the relation to the Gauss hypergeometric\n"
+    "    function.\n"
+    "x : array_like\n"
+    "    Points at which to evaluate the Legendre polynomial\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "P : scalar or ndarray\n"
+    "    Values of the Legendre polynomial\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "roots_legendre : roots and quadrature weights of Legendre\n"
+    "                 polynomials\n"
+    "legendre : Legendre polynomial object\n"
+    "hyp2f1 : Gauss hypergeometric function\n"
+    "numpy.polynomial.legendre.Legendre : Legendre series\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [AS] Milton Abramowitz and Irene A. Stegun, eds.\n"
+    "    Handbook of Mathematical Functions with Formulas,\n"
+    "    Graphs, and Mathematical Tables. New York: Dover, 1972.\n"
+    ".. [DLMF] NIST Digital Library of Mathematical Functions,\n"
+    "    https://dlmf.nist.gov/15.9.E7\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import eval_legendre\n"
+    "\n"
+    "Evaluate the zero-order Legendre polynomial at x = 0\n"
+    "\n"
+    ">>> eval_legendre(0, 0)\n"
+    "1.0\n"
+    "\n"
+    "Evaluate the first-order Legendre polynomial between -1 and 1\n"
+    "\n"
+    ">>> X = np.linspace(-1, 1, 5)  # Domain of Legendre polynomials\n"
+    ">>> eval_legendre(1, X)\n"
+    "array([-1. , -0.5,  0. ,  0.5,  1. ])\n"
+    "\n"
+    "Evaluate Legendre polynomials of order 0 through 4 at x = 0\n"
+    "\n"
+    ">>> N = range(0, 5)\n"
+    ">>> eval_legendre(N, 0)\n"
+    "array([ 1.   ,  0.   , -0.5  ,  0.   ,  0.375])\n"
+    "\n"
+    "Plot Legendre polynomials of order 0 through 4\n"
+    "\n"
+    ">>> X = np.linspace(-1, 1)\n"
+    "\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> for n in range(0, 5):\n"
+    "...     y = eval_legendre(n, X)\n"
+    "...     plt.plot(X, y, label=r'$P_{}(x)$'.format(n))\n"
+    "\n"
+    ">>> plt.title(\"Legendre Polynomials\")\n"
+    ">>> plt.xlabel(\"x\")\n"
+    ">>> plt.ylabel(r'$P_n(x)$')\n"
+    ">>> plt.legend(loc='lower right')\n"
+    ">>> plt.show()")
+ufunc_eval_legendre_loops[0] = <np.PyUFuncGenericFunction>loop_d_pd__As_pd_d  # loop i pairs with types[3*i .. 3*i+2] and data[i]
+ufunc_eval_legendre_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+ufunc_eval_legendre_loops[2] = <np.PyUFuncGenericFunction>loop_D_dD__As_fF_F
+ufunc_eval_legendre_loops[3] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc_eval_legendre_loops[4] = <np.PyUFuncGenericFunction>loop_D_dD__As_dD_D
+ufunc_eval_legendre_types[0] = <char>NPY_INTP  # loop 0: (intp, double) -> double
+ufunc_eval_legendre_types[1] = <char>NPY_DOUBLE
+ufunc_eval_legendre_types[2] = <char>NPY_DOUBLE
+ufunc_eval_legendre_types[3] = <char>NPY_FLOAT  # loop 1: (float, float) -> float
+ufunc_eval_legendre_types[4] = <char>NPY_FLOAT
+ufunc_eval_legendre_types[5] = <char>NPY_FLOAT
+ufunc_eval_legendre_types[6] = <char>NPY_FLOAT  # loop 2: (float, cfloat) -> cfloat
+ufunc_eval_legendre_types[7] = <char>NPY_CFLOAT
+ufunc_eval_legendre_types[8] = <char>NPY_CFLOAT
+ufunc_eval_legendre_types[9] = <char>NPY_DOUBLE  # loop 3: (double, double) -> double
+ufunc_eval_legendre_types[10] = <char>NPY_DOUBLE
+ufunc_eval_legendre_types[11] = <char>NPY_DOUBLE
+ufunc_eval_legendre_types[12] = <char>NPY_DOUBLE  # loop 4: (double, cdouble) -> cdouble
+ufunc_eval_legendre_types[13] = <char>NPY_CDOUBLE
+ufunc_eval_legendre_types[14] = <char>NPY_CDOUBLE
+ufunc_eval_legendre_ptr[2*0] = <void*>_func_eval_legendre_l  # kernel for loop 0, whose first input is typed intp
+ufunc_eval_legendre_ptr[2*0+1] = <void*>(<char*>"eval_legendre")
+ufunc_eval_legendre_ptr[2*1] = <void*>_func_eval_legendre[double]  # float-typed loop 1 uses the double kernel; NOTE(review): presumably the loop wrapper converts, confirm
+ufunc_eval_legendre_ptr[2*1+1] = <void*>(<char*>"eval_legendre")
+ufunc_eval_legendre_ptr[2*2] = <void*>_func_eval_legendre[double_complex]  # cfloat-typed loop 2 uses the double_complex kernel
+ufunc_eval_legendre_ptr[2*2+1] = <void*>(<char*>"eval_legendre")
+ufunc_eval_legendre_ptr[2*3] = <void*>_func_eval_legendre[double]
+ufunc_eval_legendre_ptr[2*3+1] = <void*>(<char*>"eval_legendre")
+ufunc_eval_legendre_ptr[2*4] = <void*>_func_eval_legendre[double_complex]
+ufunc_eval_legendre_ptr[2*4+1] = <void*>(<char*>"eval_legendre")
+ufunc_eval_legendre_data[0] = &ufunc_eval_legendre_ptr[2*0]  # data[i] is the address of loop i's (kernel, name) pair
+ufunc_eval_legendre_data[1] = &ufunc_eval_legendre_ptr[2*1]
+ufunc_eval_legendre_data[2] = &ufunc_eval_legendre_ptr[2*2]
+ufunc_eval_legendre_data[3] = &ufunc_eval_legendre_ptr[2*3]
+ufunc_eval_legendre_data[4] = &ufunc_eval_legendre_ptr[2*4]
+eval_legendre = np.PyUFunc_FromFuncAndData(ufunc_eval_legendre_loops, ufunc_eval_legendre_data, ufunc_eval_legendre_types, 5, 2, 1, 0, 'eval_legendre', ufunc_eval_legendre_doc, 0)  # 5 loops, 2 inputs, 1 output (argument order per the NumPy C-API)
+
+cdef np.PyUFuncGenericFunction ufunc_eval_sh_chebyt_loops[5]  # eval_sh_chebyt ufunc tables: one inner-loop function per registered type signature
+cdef void *ufunc_eval_sh_chebyt_ptr[10]  # two slots per loop: [2*i] kernel pointer, [2*i+1] ufunc name string
+cdef void *ufunc_eval_sh_chebyt_data[5]  # per-loop data pointers; each one points into ufunc_eval_sh_chebyt_ptr
+cdef char ufunc_eval_sh_chebyt_types[15]  # 5 loops x 3 type codes (2 inputs followed by 1 output)
+cdef char *ufunc_eval_sh_chebyt_doc = (  # docstring handed to PyUFunc_FromFuncAndData below
+    "eval_sh_chebyt(n, x, out=None)\n"
+    "\n"
+    "Evaluate shifted Chebyshev polynomial of the first kind at a\n"
+    "point.\n"
+    "\n"
+    "These polynomials are defined as\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    T_n^*(x) = T_n(2x - 1)\n"
+    "\n"
+    "where :math:`T_n` is a Chebyshev polynomial of the first kind. See\n"
+    "22.5.14 in [AS]_ (or equivalently [DLMF]_) for details.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "n : array_like\n"
+    "    Degree of the polynomial. If not an integer, the result is\n"
+    "    determined via the relation to `eval_chebyt`.\n"
+    "x : array_like\n"
+    "    Points at which to evaluate the shifted Chebyshev polynomial\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "T : scalar or ndarray\n"
+    "    Values of the shifted Chebyshev polynomial\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "roots_sh_chebyt : roots and quadrature weights of shifted\n"
+    "                  Chebyshev polynomials of the first kind\n"
+    "sh_chebyt : shifted Chebyshev polynomial object\n"
+    "eval_chebyt : evaluate Chebyshev polynomials of the first kind\n"
+    "numpy.polynomial.chebyshev.Chebyshev : Chebyshev series\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [AS] Milton Abramowitz and Irene A. Stegun, eds.\n"
+    "    Handbook of Mathematical Functions with Formulas,\n"
+    "    Graphs, and Mathematical Tables. New York: Dover, 1972.\n"
+    ".. [DLMF] NIST Digital Library of Mathematical Functions,\n"
+    "    https://dlmf.nist.gov/18.7.E7")
+ufunc_eval_sh_chebyt_loops[0] = <np.PyUFuncGenericFunction>loop_d_pd__As_pd_d  # loop i pairs with types[3*i .. 3*i+2] and data[i]
+ufunc_eval_sh_chebyt_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+ufunc_eval_sh_chebyt_loops[2] = <np.PyUFuncGenericFunction>loop_D_dD__As_fF_F
+ufunc_eval_sh_chebyt_loops[3] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc_eval_sh_chebyt_loops[4] = <np.PyUFuncGenericFunction>loop_D_dD__As_dD_D
+ufunc_eval_sh_chebyt_types[0] = <char>NPY_INTP  # loop 0: (intp, double) -> double
+ufunc_eval_sh_chebyt_types[1] = <char>NPY_DOUBLE
+ufunc_eval_sh_chebyt_types[2] = <char>NPY_DOUBLE
+ufunc_eval_sh_chebyt_types[3] = <char>NPY_FLOAT  # loop 1: (float, float) -> float
+ufunc_eval_sh_chebyt_types[4] = <char>NPY_FLOAT
+ufunc_eval_sh_chebyt_types[5] = <char>NPY_FLOAT
+ufunc_eval_sh_chebyt_types[6] = <char>NPY_FLOAT  # loop 2: (float, cfloat) -> cfloat
+ufunc_eval_sh_chebyt_types[7] = <char>NPY_CFLOAT
+ufunc_eval_sh_chebyt_types[8] = <char>NPY_CFLOAT
+ufunc_eval_sh_chebyt_types[9] = <char>NPY_DOUBLE  # loop 3: (double, double) -> double
+ufunc_eval_sh_chebyt_types[10] = <char>NPY_DOUBLE
+ufunc_eval_sh_chebyt_types[11] = <char>NPY_DOUBLE
+ufunc_eval_sh_chebyt_types[12] = <char>NPY_DOUBLE  # loop 4: (double, cdouble) -> cdouble
+ufunc_eval_sh_chebyt_types[13] = <char>NPY_CDOUBLE
+ufunc_eval_sh_chebyt_types[14] = <char>NPY_CDOUBLE
+ufunc_eval_sh_chebyt_ptr[2*0] = <void*>_func_eval_sh_chebyt_l  # kernel for loop 0, whose first input is typed intp
+ufunc_eval_sh_chebyt_ptr[2*0+1] = <void*>(<char*>"eval_sh_chebyt")
+ufunc_eval_sh_chebyt_ptr[2*1] = <void*>_func_eval_sh_chebyt[double]  # float-typed loop 1 uses the double kernel; NOTE(review): presumably the loop wrapper converts, confirm
+ufunc_eval_sh_chebyt_ptr[2*1+1] = <void*>(<char*>"eval_sh_chebyt")
+ufunc_eval_sh_chebyt_ptr[2*2] = <void*>_func_eval_sh_chebyt[double_complex]  # cfloat-typed loop 2 uses the double_complex kernel
+ufunc_eval_sh_chebyt_ptr[2*2+1] = <void*>(<char*>"eval_sh_chebyt")
+ufunc_eval_sh_chebyt_ptr[2*3] = <void*>_func_eval_sh_chebyt[double]
+ufunc_eval_sh_chebyt_ptr[2*3+1] = <void*>(<char*>"eval_sh_chebyt")
+ufunc_eval_sh_chebyt_ptr[2*4] = <void*>_func_eval_sh_chebyt[double_complex]
+ufunc_eval_sh_chebyt_ptr[2*4+1] = <void*>(<char*>"eval_sh_chebyt")
+ufunc_eval_sh_chebyt_data[0] = &ufunc_eval_sh_chebyt_ptr[2*0]  # data[i] is the address of loop i's (kernel, name) pair
+ufunc_eval_sh_chebyt_data[1] = &ufunc_eval_sh_chebyt_ptr[2*1]
+ufunc_eval_sh_chebyt_data[2] = &ufunc_eval_sh_chebyt_ptr[2*2]
+ufunc_eval_sh_chebyt_data[3] = &ufunc_eval_sh_chebyt_ptr[2*3]
+ufunc_eval_sh_chebyt_data[4] = &ufunc_eval_sh_chebyt_ptr[2*4]
+eval_sh_chebyt = np.PyUFunc_FromFuncAndData(ufunc_eval_sh_chebyt_loops, ufunc_eval_sh_chebyt_data, ufunc_eval_sh_chebyt_types, 5, 2, 1, 0, 'eval_sh_chebyt', ufunc_eval_sh_chebyt_doc, 0)  # 5 loops, 2 inputs, 1 output (argument order per the NumPy C-API)
+
+cdef np.PyUFuncGenericFunction ufunc_eval_sh_chebyu_loops[5]  # eval_sh_chebyu ufunc tables: one inner-loop function per registered type signature
+cdef void *ufunc_eval_sh_chebyu_ptr[10]  # two slots per loop: [2*i] kernel pointer, [2*i+1] ufunc name string
+cdef void *ufunc_eval_sh_chebyu_data[5]  # per-loop data pointers; each one points into ufunc_eval_sh_chebyu_ptr
+cdef char ufunc_eval_sh_chebyu_types[15]  # 5 loops x 3 type codes (2 inputs followed by 1 output)
+cdef char *ufunc_eval_sh_chebyu_doc = (  # docstring handed to PyUFunc_FromFuncAndData below
+    "eval_sh_chebyu(n, x, out=None)\n"
+    "\n"
+    "Evaluate shifted Chebyshev polynomial of the second kind at a\n"
+    "point.\n"
+    "\n"
+    "These polynomials are defined as\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    U_n^*(x) = U_n(2x - 1)\n"
+    "\n"
+    "where :math:`U_n` is a Chebyshev polynomial of the second kind. See\n"
+    "22.5.15 in [AS]_ (or equivalently [DLMF]_) for details.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "n : array_like\n"
+    "    Degree of the polynomial. If not an integer, the result is\n"
+    "    determined via the relation to `eval_chebyu`.\n"
+    "x : array_like\n"
+    "    Points at which to evaluate the shifted Chebyshev polynomial\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "U : scalar or ndarray\n"
+    "    Values of the shifted Chebyshev polynomial\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "roots_sh_chebyu : roots and quadrature weights of shifted\n"
+    "                  Chebyshev polynomials of the second kind\n"
+    "sh_chebyu : shifted Chebyshev polynomial object\n"
+    "eval_chebyu : evaluate Chebyshev polynomials of the second kind\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [AS] Milton Abramowitz and Irene A. Stegun, eds.\n"
+    "    Handbook of Mathematical Functions with Formulas,\n"
+    "    Graphs, and Mathematical Tables. New York: Dover, 1972.\n"
+    ".. [DLMF] NIST Digital Library of Mathematical Functions,\n"
+    "    https://dlmf.nist.gov/18.7.E8")
+ufunc_eval_sh_chebyu_loops[0] = <np.PyUFuncGenericFunction>loop_d_pd__As_pd_d  # loop i pairs with types[3*i .. 3*i+2] and data[i]
+ufunc_eval_sh_chebyu_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+ufunc_eval_sh_chebyu_loops[2] = <np.PyUFuncGenericFunction>loop_D_dD__As_fF_F
+ufunc_eval_sh_chebyu_loops[3] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc_eval_sh_chebyu_loops[4] = <np.PyUFuncGenericFunction>loop_D_dD__As_dD_D
+ufunc_eval_sh_chebyu_types[0] = <char>NPY_INTP  # loop 0: (intp, double) -> double
+ufunc_eval_sh_chebyu_types[1] = <char>NPY_DOUBLE
+ufunc_eval_sh_chebyu_types[2] = <char>NPY_DOUBLE
+ufunc_eval_sh_chebyu_types[3] = <char>NPY_FLOAT  # loop 1: (float, float) -> float
+ufunc_eval_sh_chebyu_types[4] = <char>NPY_FLOAT
+ufunc_eval_sh_chebyu_types[5] = <char>NPY_FLOAT
+ufunc_eval_sh_chebyu_types[6] = <char>NPY_FLOAT  # loop 2: (float, cfloat) -> cfloat
+ufunc_eval_sh_chebyu_types[7] = <char>NPY_CFLOAT
+ufunc_eval_sh_chebyu_types[8] = <char>NPY_CFLOAT
+ufunc_eval_sh_chebyu_types[9] = <char>NPY_DOUBLE  # loop 3: (double, double) -> double
+ufunc_eval_sh_chebyu_types[10] = <char>NPY_DOUBLE
+ufunc_eval_sh_chebyu_types[11] = <char>NPY_DOUBLE
+ufunc_eval_sh_chebyu_types[12] = <char>NPY_DOUBLE  # loop 4: (double, cdouble) -> cdouble
+ufunc_eval_sh_chebyu_types[13] = <char>NPY_CDOUBLE
+ufunc_eval_sh_chebyu_types[14] = <char>NPY_CDOUBLE
+ufunc_eval_sh_chebyu_ptr[2*0] = <void*>_func_eval_sh_chebyu_l  # kernel for loop 0, whose first input is typed intp
+ufunc_eval_sh_chebyu_ptr[2*0+1] = <void*>(<char*>"eval_sh_chebyu")
+ufunc_eval_sh_chebyu_ptr[2*1] = <void*>_func_eval_sh_chebyu[double]  # float-typed loop 1 uses the double kernel; NOTE(review): presumably the loop wrapper converts, confirm
+ufunc_eval_sh_chebyu_ptr[2*1+1] = <void*>(<char*>"eval_sh_chebyu")
+ufunc_eval_sh_chebyu_ptr[2*2] = <void*>_func_eval_sh_chebyu[double_complex]  # cfloat-typed loop 2 uses the double_complex kernel
+ufunc_eval_sh_chebyu_ptr[2*2+1] = <void*>(<char*>"eval_sh_chebyu")
+ufunc_eval_sh_chebyu_ptr[2*3] = <void*>_func_eval_sh_chebyu[double]
+ufunc_eval_sh_chebyu_ptr[2*3+1] = <void*>(<char*>"eval_sh_chebyu")
+ufunc_eval_sh_chebyu_ptr[2*4] = <void*>_func_eval_sh_chebyu[double_complex]
+ufunc_eval_sh_chebyu_ptr[2*4+1] = <void*>(<char*>"eval_sh_chebyu")
+ufunc_eval_sh_chebyu_data[0] = &ufunc_eval_sh_chebyu_ptr[2*0]  # data[i] is the address of loop i's (kernel, name) pair
+ufunc_eval_sh_chebyu_data[1] = &ufunc_eval_sh_chebyu_ptr[2*1]
+ufunc_eval_sh_chebyu_data[2] = &ufunc_eval_sh_chebyu_ptr[2*2]
+ufunc_eval_sh_chebyu_data[3] = &ufunc_eval_sh_chebyu_ptr[2*3]
+ufunc_eval_sh_chebyu_data[4] = &ufunc_eval_sh_chebyu_ptr[2*4]
+eval_sh_chebyu = np.PyUFunc_FromFuncAndData(ufunc_eval_sh_chebyu_loops, ufunc_eval_sh_chebyu_data, ufunc_eval_sh_chebyu_types, 5, 2, 1, 0, 'eval_sh_chebyu', ufunc_eval_sh_chebyu_doc, 0)  # 5 loops, 2 inputs, 1 output (argument order per the NumPy C-API)
+
+cdef np.PyUFuncGenericFunction ufunc_eval_sh_jacobi_loops[5]  # eval_sh_jacobi ufunc tables: one inner-loop function per registered type signature
+cdef void *ufunc_eval_sh_jacobi_ptr[10]  # two slots per loop: [2*i] kernel pointer, [2*i+1] ufunc name string
+cdef void *ufunc_eval_sh_jacobi_data[5]  # per-loop data pointers; each one points into ufunc_eval_sh_jacobi_ptr
+cdef char ufunc_eval_sh_jacobi_types[25]  # 5 loops x 5 type codes (4 inputs followed by 1 output)
+cdef char *ufunc_eval_sh_jacobi_doc = (  # docstring handed to PyUFunc_FromFuncAndData below
+    "eval_sh_jacobi(n, p, q, x, out=None)\n"
+    "\n"
+    "Evaluate shifted Jacobi polynomial at a point.\n"
+    "\n"
+    "Defined by\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    G_n^{(p, q)}(x)\n"
+    "      = \\binom{2n + p - 1}{n}^{-1} P_n^{(p - q, q - 1)}(2x - 1),\n"
+    "\n"
+    "where :math:`P_n^{(\\cdot, \\cdot)}` is the n-th Jacobi polynomial.\n"
+    "See 22.5.2 in [AS]_ (or equivalently [DLMF]_) for details.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "n : int\n"
+    "    Degree of the polynomial. If not an integer, the result is\n"
+    "    determined via the relation to `binom` and `eval_jacobi`.\n"
+    "p : float\n"
+    "    Parameter\n"
+    "q : float\n"
+    "    Parameter\n"
+    "x : array_like\n"
+    "    Points at which to evaluate the shifted Jacobi polynomial\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "G : scalar or ndarray\n"
+    "    Values of the shifted Jacobi polynomial.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "roots_sh_jacobi : roots and quadrature weights of shifted Jacobi\n"
+    "                  polynomials\n"
+    "sh_jacobi : shifted Jacobi polynomial object\n"
+    "eval_jacobi : evaluate Jacobi polynomials\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [AS] Milton Abramowitz and Irene A. Stegun, eds.\n"
+    "    Handbook of Mathematical Functions with Formulas,\n"
+    "    Graphs, and Mathematical Tables. New York: Dover, 1972.\n"
+    ".. [DLMF] NIST Digital Library of Mathematical Functions,\n"
+    "    https://dlmf.nist.gov/18.1.E2")
+ufunc_eval_sh_jacobi_loops[0] = <np.PyUFuncGenericFunction>loop_d_pddd__As_pddd_d  # loop i pairs with types[5*i .. 5*i+4] and data[i]
+ufunc_eval_sh_jacobi_loops[1] = <np.PyUFuncGenericFunction>loop_d_dddd__As_ffff_f
+ufunc_eval_sh_jacobi_loops[2] = <np.PyUFuncGenericFunction>loop_D_dddD__As_fffF_F
+ufunc_eval_sh_jacobi_loops[3] = <np.PyUFuncGenericFunction>loop_d_dddd__As_dddd_d
+ufunc_eval_sh_jacobi_loops[4] = <np.PyUFuncGenericFunction>loop_D_dddD__As_dddD_D
+ufunc_eval_sh_jacobi_types[0] = <char>NPY_INTP  # loop 0: (intp, double, double, double) -> double
+ufunc_eval_sh_jacobi_types[1] = <char>NPY_DOUBLE
+ufunc_eval_sh_jacobi_types[2] = <char>NPY_DOUBLE
+ufunc_eval_sh_jacobi_types[3] = <char>NPY_DOUBLE
+ufunc_eval_sh_jacobi_types[4] = <char>NPY_DOUBLE
+ufunc_eval_sh_jacobi_types[5] = <char>NPY_FLOAT  # loop 1: (float, float, float, float) -> float
+ufunc_eval_sh_jacobi_types[6] = <char>NPY_FLOAT
+ufunc_eval_sh_jacobi_types[7] = <char>NPY_FLOAT
+ufunc_eval_sh_jacobi_types[8] = <char>NPY_FLOAT
+ufunc_eval_sh_jacobi_types[9] = <char>NPY_FLOAT
+ufunc_eval_sh_jacobi_types[10] = <char>NPY_FLOAT  # loop 2: (float, float, float, cfloat) -> cfloat
+ufunc_eval_sh_jacobi_types[11] = <char>NPY_FLOAT
+ufunc_eval_sh_jacobi_types[12] = <char>NPY_FLOAT
+ufunc_eval_sh_jacobi_types[13] = <char>NPY_CFLOAT
+ufunc_eval_sh_jacobi_types[14] = <char>NPY_CFLOAT
+ufunc_eval_sh_jacobi_types[15] = <char>NPY_DOUBLE  # loop 3: (double, double, double, double) -> double
+ufunc_eval_sh_jacobi_types[16] = <char>NPY_DOUBLE
+ufunc_eval_sh_jacobi_types[17] = <char>NPY_DOUBLE
+ufunc_eval_sh_jacobi_types[18] = <char>NPY_DOUBLE
+ufunc_eval_sh_jacobi_types[19] = <char>NPY_DOUBLE
+ufunc_eval_sh_jacobi_types[20] = <char>NPY_DOUBLE  # loop 4: (double, double, double, cdouble) -> cdouble
+ufunc_eval_sh_jacobi_types[21] = <char>NPY_DOUBLE
+ufunc_eval_sh_jacobi_types[22] = <char>NPY_DOUBLE
+ufunc_eval_sh_jacobi_types[23] = <char>NPY_CDOUBLE
+ufunc_eval_sh_jacobi_types[24] = <char>NPY_CDOUBLE
+ufunc_eval_sh_jacobi_ptr[2*0] = <void*>_func_eval_sh_jacobi_l  # kernel for loop 0, whose first input is typed intp
+ufunc_eval_sh_jacobi_ptr[2*0+1] = <void*>(<char*>"eval_sh_jacobi")
+ufunc_eval_sh_jacobi_ptr[2*1] = <void*>_func_eval_sh_jacobi[double]  # float-typed loop 1 uses the double kernel; NOTE(review): presumably the loop wrapper converts, confirm
+ufunc_eval_sh_jacobi_ptr[2*1+1] = <void*>(<char*>"eval_sh_jacobi")
+ufunc_eval_sh_jacobi_ptr[2*2] = <void*>_func_eval_sh_jacobi[double_complex]  # cfloat-typed loop 2 uses the double_complex kernel
+ufunc_eval_sh_jacobi_ptr[2*2+1] = <void*>(<char*>"eval_sh_jacobi")
+ufunc_eval_sh_jacobi_ptr[2*3] = <void*>_func_eval_sh_jacobi[double]
+ufunc_eval_sh_jacobi_ptr[2*3+1] = <void*>(<char*>"eval_sh_jacobi")
+ufunc_eval_sh_jacobi_ptr[2*4] = <void*>_func_eval_sh_jacobi[double_complex]
+ufunc_eval_sh_jacobi_ptr[2*4+1] = <void*>(<char*>"eval_sh_jacobi")
+ufunc_eval_sh_jacobi_data[0] = &ufunc_eval_sh_jacobi_ptr[2*0]  # data[i] is the address of loop i's (kernel, name) pair
+ufunc_eval_sh_jacobi_data[1] = &ufunc_eval_sh_jacobi_ptr[2*1]
+ufunc_eval_sh_jacobi_data[2] = &ufunc_eval_sh_jacobi_ptr[2*2]
+ufunc_eval_sh_jacobi_data[3] = &ufunc_eval_sh_jacobi_ptr[2*3]
+ufunc_eval_sh_jacobi_data[4] = &ufunc_eval_sh_jacobi_ptr[2*4]
+eval_sh_jacobi = np.PyUFunc_FromFuncAndData(ufunc_eval_sh_jacobi_loops, ufunc_eval_sh_jacobi_data, ufunc_eval_sh_jacobi_types, 5, 4, 1, 0, 'eval_sh_jacobi', ufunc_eval_sh_jacobi_doc, 0)  # 5 loops, 4 inputs, 1 output (argument order per the NumPy C-API)
+
+cdef np.PyUFuncGenericFunction ufunc_eval_sh_legendre_loops[5]  # eval_sh_legendre ufunc tables: one inner-loop function per registered type signature
+cdef void *ufunc_eval_sh_legendre_ptr[10]  # two slots per loop: [2*i] kernel pointer, [2*i+1] ufunc name string
+cdef void *ufunc_eval_sh_legendre_data[5]  # per-loop data pointers; each one points into ufunc_eval_sh_legendre_ptr
+cdef char ufunc_eval_sh_legendre_types[15]  # 5 loops x 3 type codes (2 inputs followed by 1 output)
+cdef char *ufunc_eval_sh_legendre_doc = (  # docstring handed to PyUFunc_FromFuncAndData below
+    "eval_sh_legendre(n, x, out=None)\n"
+    "\n"
+    "Evaluate shifted Legendre polynomial at a point.\n"
+    "\n"
+    "These polynomials are defined as\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    P_n^*(x) = P_n(2x - 1)\n"
+    "\n"
+    "where :math:`P_n` is a Legendre polynomial. See 22.2.11 in [AS]_\n"
+    "or [DLMF]_ for details.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "n : array_like\n"
+    "    Degree of the polynomial. If not an integer, the value is\n"
+    "    determined via the relation to `eval_legendre`.\n"
+    "x : array_like\n"
+    "    Points at which to evaluate the shifted Legendre polynomial\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "P : scalar or ndarray\n"
+    "    Values of the shifted Legendre polynomial\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "roots_sh_legendre : roots and quadrature weights of shifted\n"
+    "                    Legendre polynomials\n"
+    "sh_legendre : shifted Legendre polynomial object\n"
+    "eval_legendre : evaluate Legendre polynomials\n"
+    "numpy.polynomial.legendre.Legendre : Legendre series\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [AS] Milton Abramowitz and Irene A. Stegun, eds.\n"
+    "    Handbook of Mathematical Functions with Formulas,\n"
+    "    Graphs, and Mathematical Tables. New York: Dover, 1972.\n"
+    ".. [DLMF] NIST Digital Library of Mathematical Functions,\n"
+    "    https://dlmf.nist.gov/18.7.E10")
+ufunc_eval_sh_legendre_loops[0] = <np.PyUFuncGenericFunction>loop_d_pd__As_pd_d  # loop i pairs with types[3*i .. 3*i+2] and data[i]
+ufunc_eval_sh_legendre_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+ufunc_eval_sh_legendre_loops[2] = <np.PyUFuncGenericFunction>loop_D_dD__As_fF_F
+ufunc_eval_sh_legendre_loops[3] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc_eval_sh_legendre_loops[4] = <np.PyUFuncGenericFunction>loop_D_dD__As_dD_D
+ufunc_eval_sh_legendre_types[0] = <char>NPY_INTP  # loop 0: (intp, double) -> double
+ufunc_eval_sh_legendre_types[1] = <char>NPY_DOUBLE
+ufunc_eval_sh_legendre_types[2] = <char>NPY_DOUBLE
+ufunc_eval_sh_legendre_types[3] = <char>NPY_FLOAT  # loop 1: (float, float) -> float
+ufunc_eval_sh_legendre_types[4] = <char>NPY_FLOAT
+ufunc_eval_sh_legendre_types[5] = <char>NPY_FLOAT
+ufunc_eval_sh_legendre_types[6] = <char>NPY_FLOAT  # loop 2: (float, cfloat) -> cfloat
+ufunc_eval_sh_legendre_types[7] = <char>NPY_CFLOAT
+ufunc_eval_sh_legendre_types[8] = <char>NPY_CFLOAT
+ufunc_eval_sh_legendre_types[9] = <char>NPY_DOUBLE  # loop 3: (double, double) -> double
+ufunc_eval_sh_legendre_types[10] = <char>NPY_DOUBLE
+ufunc_eval_sh_legendre_types[11] = <char>NPY_DOUBLE
+ufunc_eval_sh_legendre_types[12] = <char>NPY_DOUBLE  # loop 4: (double, cdouble) -> cdouble
+ufunc_eval_sh_legendre_types[13] = <char>NPY_CDOUBLE
+ufunc_eval_sh_legendre_types[14] = <char>NPY_CDOUBLE
+ufunc_eval_sh_legendre_ptr[2*0] = <void*>_func_eval_sh_legendre_l  # kernel for loop 0, whose first input is typed intp
+ufunc_eval_sh_legendre_ptr[2*0+1] = <void*>(<char*>"eval_sh_legendre")
+ufunc_eval_sh_legendre_ptr[2*1] = <void*>_func_eval_sh_legendre[double]  # float-typed loop 1 uses the double kernel; NOTE(review): presumably the loop wrapper converts, confirm
+ufunc_eval_sh_legendre_ptr[2*1+1] = <void*>(<char*>"eval_sh_legendre")
+ufunc_eval_sh_legendre_ptr[2*2] = <void*>_func_eval_sh_legendre[double_complex]  # cfloat-typed loop 2 uses the double_complex kernel
+ufunc_eval_sh_legendre_ptr[2*2+1] = <void*>(<char*>"eval_sh_legendre")
+ufunc_eval_sh_legendre_ptr[2*3] = <void*>_func_eval_sh_legendre[double]
+ufunc_eval_sh_legendre_ptr[2*3+1] = <void*>(<char*>"eval_sh_legendre")
+ufunc_eval_sh_legendre_ptr[2*4] = <void*>_func_eval_sh_legendre[double_complex]
+ufunc_eval_sh_legendre_ptr[2*4+1] = <void*>(<char*>"eval_sh_legendre")
+ufunc_eval_sh_legendre_data[0] = &ufunc_eval_sh_legendre_ptr[2*0]  # data[i] is the address of loop i's (kernel, name) pair
+ufunc_eval_sh_legendre_data[1] = &ufunc_eval_sh_legendre_ptr[2*1]
+ufunc_eval_sh_legendre_data[2] = &ufunc_eval_sh_legendre_ptr[2*2]
+ufunc_eval_sh_legendre_data[3] = &ufunc_eval_sh_legendre_ptr[2*3]
+ufunc_eval_sh_legendre_data[4] = &ufunc_eval_sh_legendre_ptr[2*4]
+eval_sh_legendre = np.PyUFunc_FromFuncAndData(ufunc_eval_sh_legendre_loops, ufunc_eval_sh_legendre_data, ufunc_eval_sh_legendre_types, 5, 2, 1, 0, 'eval_sh_legendre', ufunc_eval_sh_legendre_doc, 0)  # 5 loops, 2 inputs, 1 output (argument order per the NumPy C-API)
+
+cdef np.PyUFuncGenericFunction ufunc_expn_loops[3]  # expn ufunc tables: one inner-loop function per registered type signature
+cdef void *ufunc_expn_ptr[6]  # two slots per loop: [2*i] kernel pointer, [2*i+1] ufunc name string
+cdef void *ufunc_expn_data[3]  # per-loop data pointers; each one points into ufunc_expn_ptr
+cdef char ufunc_expn_types[9]  # 3 loops x 3 type codes (2 inputs followed by 1 output)
+cdef char *ufunc_expn_doc = (  # docstring handed to PyUFunc_FromFuncAndData below
+    "expn(n, x, out=None)\n"
+    "\n"
+    "Generalized exponential integral En.\n"
+    "\n"
+    "For integer :math:`n \\geq 0` and real :math:`x \\geq 0` the\n"
+    "generalized exponential integral is defined as [DLMF]_\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    E_n(x) = x^{n - 1} \\int_x^\\infty \\frac{e^{-t}}{t^n} dt.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "n : array_like\n"
+    "    Non-negative integers\n"
+    "x : array_like\n"
+    "    Real argument\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    Values of the generalized exponential integral\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "exp1 : special case of :math:`E_n` for :math:`n = 1`\n"
+    "expi : related to :math:`E_n` when :math:`n = 1`\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [DLMF] Digital Library of Mathematical Functions, 8.19.2\n"
+    "          https://dlmf.nist.gov/8.19#E2\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> import scipy.special as sc\n"
+    "\n"
+    "Its domain is nonnegative n and x.\n"
+    "\n"
+    ">>> sc.expn(-1, 1.0), sc.expn(1, -1.0)\n"
+    "(nan, nan)\n"
+    "\n"
+    "It has a pole at ``x = 0`` for ``n = 0, 1``; for larger ``n`` it\n"
+    "is equal to ``1 / (n - 1)``.\n"
+    "\n"
+    ">>> sc.expn([0, 1, 2, 3, 4], 0)\n"
+    "array([       inf,        inf, 1.        , 0.5       , 0.33333333])\n"
+    "\n"
+    "For n equal to 0 it reduces to ``exp(-x) / x``.\n"
+    "\n"
+    ">>> x = np.array([1, 2, 3, 4])\n"
+    ">>> sc.expn(0, x)\n"
+    "array([0.36787944, 0.06766764, 0.01659569, 0.00457891])\n"
+    ">>> np.exp(-x) / x\n"
+    "array([0.36787944, 0.06766764, 0.01659569, 0.00457891])\n"
+    "\n"
+    "For n equal to 1 it reduces to `exp1`.\n"
+    "\n"
+    ">>> sc.expn(1, x)\n"
+    "array([0.21938393, 0.04890051, 0.01304838, 0.00377935])\n"
+    ">>> sc.exp1(x)\n"
+    "array([0.21938393, 0.04890051, 0.01304838, 0.00377935])")
+ufunc_expn_loops[0] = <np.PyUFuncGenericFunction>loop_d_pd__As_pd_d  # loop i pairs with types[3*i .. 3*i+2] and data[i]
+ufunc_expn_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+ufunc_expn_loops[2] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc_expn_types[0] = <char>NPY_INTP  # loop 0: (intp, double) -> double
+ufunc_expn_types[1] = <char>NPY_DOUBLE
+ufunc_expn_types[2] = <char>NPY_DOUBLE
+ufunc_expn_types[3] = <char>NPY_FLOAT  # loop 1: (float, float) -> float
+ufunc_expn_types[4] = <char>NPY_FLOAT
+ufunc_expn_types[5] = <char>NPY_FLOAT
+ufunc_expn_types[6] = <char>NPY_DOUBLE  # loop 2: (double, double) -> double
+ufunc_expn_types[7] = <char>NPY_DOUBLE
+ufunc_expn_types[8] = <char>NPY_DOUBLE
+ufunc_expn_ptr[2*0] = <void*>_func_cephes_expn_wrap  # kernel for loop 0, whose first input is typed intp
+ufunc_expn_ptr[2*0+1] = <void*>(<char*>"expn")
+ufunc_expn_ptr[2*1] = <void*>_func_expn_unsafe  # loops with a floating-point n share this kernel; NOTE(review): presumably it converts n to an integer, confirm in its definition
+ufunc_expn_ptr[2*1+1] = <void*>(<char*>"expn")
+ufunc_expn_ptr[2*2] = <void*>_func_expn_unsafe
+ufunc_expn_ptr[2*2+1] = <void*>(<char*>"expn")
+ufunc_expn_data[0] = &ufunc_expn_ptr[2*0]  # data[i] is the address of loop i's (kernel, name) pair
+ufunc_expn_data[1] = &ufunc_expn_ptr[2*1]
+ufunc_expn_data[2] = &ufunc_expn_ptr[2*2]
+expn = np.PyUFunc_FromFuncAndData(ufunc_expn_loops, ufunc_expn_data, ufunc_expn_types, 3, 2, 1, 0, 'expn', ufunc_expn_doc, 0)  # 3 loops, 2 inputs, 1 output (argument order per the NumPy C-API)
+
+cdef np.PyUFuncGenericFunction ufunc_fdtr_loops[2]  # fdtr ufunc tables: one inner-loop function per registered type signature
+cdef void *ufunc_fdtr_ptr[4]  # two slots per loop: [2*i] kernel pointer, [2*i+1] ufunc name string
+cdef void *ufunc_fdtr_data[2]  # per-loop data pointers; each one points into ufunc_fdtr_ptr
+cdef char ufunc_fdtr_types[8]  # 2 loops x 4 type codes (3 inputs followed by 1 output)
+cdef char *ufunc_fdtr_doc = (  # docstring handed to PyUFunc_FromFuncAndData below
+    "fdtr(dfn, dfd, x, out=None)\n"
+    "\n"
+    "F cumulative distribution function.\n"
+    "\n"
+    "Returns the value of the cumulative distribution function of the\n"
+    "F-distribution, also known as Snedecor's F-distribution or the\n"
+    "Fisher-Snedecor distribution.\n"
+    "\n"
+    "The F-distribution with parameters :math:`d_n` and :math:`d_d` is the\n"
+    "distribution of the random variable,\n"
+    "\n"
+    ".. math::\n"
+    "    X = \\frac{U_n/d_n}{U_d/d_d},\n"
+    "\n"
+    "where :math:`U_n` and :math:`U_d` are random variables distributed\n"
+    ":math:`\\chi^2`, with :math:`d_n` and :math:`d_d` degrees of freedom,\n"
+    "respectively.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "dfn : array_like\n"
+    "    First parameter (positive float).\n"
+    "dfd : array_like\n"
+    "    Second parameter (positive float).\n"
+    "x : array_like\n"
+    "    Argument (nonnegative float).\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "y : scalar or ndarray\n"
+    "    The CDF of the F-distribution with parameters `dfn` and `dfd` at `x`.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "fdtrc : F distribution survival function\n"
+    "fdtri : F distribution inverse cumulative distribution\n"
+    "scipy.stats.f : F distribution\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "The regularized incomplete beta function is used, according to the\n"
+    "formula,\n"
+    "\n"
+    ".. math::\n"
+    "    F(d_n, d_d; x) = I_{xd_n/(d_d + xd_n)}(d_n/2, d_d/2).\n"
+    "\n"
+    "Wrapper for a routine from the Boost Math C++ library [1]_. The\n"
+    "F distribution is also available as `scipy.stats.f`. Calling\n"
+    "`fdtr` directly can improve performance compared to the ``cdf``\n"
+    "method of `scipy.stats.f` (see last example below).\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] The Boost Developers. \"Boost C++ Libraries\". https://www.boost.org/.\n"
+    "\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "Calculate the function for ``dfn=1`` and ``dfd=2`` at ``x=1``.\n"
+    "\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import fdtr\n"
+    ">>> fdtr(1, 2, 1)\n"
+    "0.5773502691896258\n"
+    "\n"
+    "Calculate the function at several points by providing a NumPy array for\n"
+    "`x`.\n"
+    "\n"
+    ">>> x = np.array([0.5, 2., 3.])\n"
+    ">>> fdtr(1, 2, x)\n"
+    "array([0.4472136 , 0.70710678, 0.77459667])\n"
+    "\n"
+    "Plot the function for several parameter sets.\n"
+    "\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> dfn_parameters = [1, 5, 10, 50]\n"
+    ">>> dfd_parameters = [1, 1, 2, 3]\n"
+    ">>> linestyles = ['solid', 'dashed', 'dotted', 'dashdot']\n"
+    ">>> parameters_list = list(zip(dfn_parameters, dfd_parameters,\n"
+    "...                            linestyles))\n"
+    ">>> x = np.linspace(0, 30, 1000)\n"
+    ">>> fig, ax = plt.subplots()\n"
+    ">>> for parameter_set in parameters_list:\n"
+    "...     dfn, dfd, style = parameter_set\n"
+    "...     fdtr_vals = fdtr(dfn, dfd, x)\n"
+    "...     ax.plot(x, fdtr_vals, label=rf\"$d_n={dfn},\\, d_d={dfd}$\",\n"
+    "...             ls=style)\n"
+    ">>> ax.legend()\n"
+    ">>> ax.set_xlabel(\"$x$\")\n"
+    ">>> ax.set_title(\"F distribution cumulative distribution function\")\n"
+    ">>> plt.show()\n"
+    "\n"
+    "The F distribution is also available as `scipy.stats.f`. Using `fdtr`\n"
+    "directly can be much faster than calling the ``cdf`` method of\n"
+    "`scipy.stats.f`, especially for small arrays or individual values.\n"
+    "To get the same results one must use the following parametrization:\n"
+    "``stats.f(dfn, dfd).cdf(x)=fdtr(dfn, dfd, x)``.\n"
+    "\n"
+    ">>> from scipy.stats import f\n"
+    ">>> dfn, dfd = 1, 2\n"
+    ">>> x = 1\n"
+    ">>> fdtr_res = fdtr(dfn, dfd, x)  # this will often be faster than below\n"
+    ">>> f_dist_res = f(dfn, dfd).cdf(x)\n"
+    ">>> fdtr_res == f_dist_res  # test that results are equal\n"
+    "True")
+ufunc_fdtr_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # loop i pairs with types[4*i .. 4*i+3] and data[i]
+ufunc_fdtr_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+ufunc_fdtr_types[0] = <char>NPY_FLOAT  # loop 0: (float, float, float) -> float
+ufunc_fdtr_types[1] = <char>NPY_FLOAT
+ufunc_fdtr_types[2] = <char>NPY_FLOAT
+ufunc_fdtr_types[3] = <char>NPY_FLOAT
+ufunc_fdtr_types[4] = <char>NPY_DOUBLE  # loop 1: (double, double, double) -> double
+ufunc_fdtr_types[5] = <char>NPY_DOUBLE
+ufunc_fdtr_types[6] = <char>NPY_DOUBLE
+ufunc_fdtr_types[7] = <char>NPY_DOUBLE
+ufunc_fdtr_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_f_cdf_float  # kernels come from the scipy.special._ufuncs_cxx module, one per precision
+ufunc_fdtr_ptr[2*0+1] = <void*>(<char*>"fdtr")
+ufunc_fdtr_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_f_cdf_double
+ufunc_fdtr_ptr[2*1+1] = <void*>(<char*>"fdtr")
+ufunc_fdtr_data[0] = &ufunc_fdtr_ptr[2*0]  # data[i] is the address of loop i's (kernel, name) pair
+ufunc_fdtr_data[1] = &ufunc_fdtr_ptr[2*1]
+fdtr = np.PyUFunc_FromFuncAndData(ufunc_fdtr_loops, ufunc_fdtr_data, ufunc_fdtr_types, 2, 3, 1, 0, 'fdtr', ufunc_fdtr_doc, 0)  # 2 loops, 3 inputs, 1 output (argument order per the NumPy C-API)
+
+cdef np.PyUFuncGenericFunction ufunc_fdtrc_loops[2]  # tables used to build the fdtrc ufunc below: inner-loop functions, one per type signature
+cdef void *ufunc_fdtrc_ptr[4]  # two (kernel pointer, name string) pairs, one per loop
+cdef void *ufunc_fdtrc_data[2]  # per-loop pointer to that loop's pair in ufunc_fdtrc_ptr
+cdef char ufunc_fdtrc_types[8]  # 2 signatures x (3 inputs + 1 output) NumPy type codes
+cdef char *ufunc_fdtrc_doc = (
+    "fdtrc(dfn, dfd, x, out=None)\n"
+    "\n"
+    "F survival function.\n"
+    "\n"
+    "Returns the complemented F-distribution function (the integral of the\n"
+    "density from `x` to infinity).\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "dfn : array_like\n"
+    "    First parameter (positive float).\n"
+    "dfd : array_like\n"
+    "    Second parameter (positive float).\n"
+    "x : array_like\n"
+    "    Argument (nonnegative float).\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "y : scalar or ndarray\n"
+    "    The complemented F-distribution function with parameters `dfn` and\n"
+    "    `dfd` at `x`.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "fdtr : F distribution cumulative distribution function\n"
+    "fdtri : F distribution inverse cumulative distribution function\n"
+    "scipy.stats.f : F distribution\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "The regularized incomplete beta function is used, according to the\n"
+    "formula,\n"
+    "\n"
+    ".. math::\n"
+    "    F(d_n, d_d; x) = I_{d_d/(d_d + xd_n)}(d_d/2, d_n/2).\n"
+    "\n"
+    "Wrapper for a routine from the Boost Math C++ library [1]_. The\n"
+    "F distribution is also available as `scipy.stats.f`. Calling\n"
+    "`fdtrc` directly can improve performance compared to the ``sf``\n"
+    "method of `scipy.stats.f` (see last example below).\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] The Boost Developers. \"Boost C++ Libraries\". https://www.boost.org/.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "Calculate the function for ``dfn=1`` and ``dfd=2`` at ``x=1``.\n"
+    "\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import fdtrc\n"
+    ">>> fdtrc(1, 2, 1)\n"
+    "0.42264973081037427\n"
+    "\n"
+    "Calculate the function at several points by providing a NumPy array for\n"
+    "`x`.\n"
+    "\n"
+    ">>> x = np.array([0.5, 2., 3.])\n"
+    ">>> fdtrc(1, 2, x)\n"
+    "array([0.5527864 , 0.29289322, 0.22540333])\n"
+    "\n"
+    "Plot the function for several parameter sets.\n"
+    "\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> dfn_parameters = [1, 5, 10, 50]\n"
+    ">>> dfd_parameters = [1, 1, 2, 3]\n"
+    ">>> linestyles = ['solid', 'dashed', 'dotted', 'dashdot']\n"
+    ">>> parameters_list = list(zip(dfn_parameters, dfd_parameters,\n"
+    "...                            linestyles))\n"
+    ">>> x = np.linspace(0, 30, 1000)\n"
+    ">>> fig, ax = plt.subplots()\n"
+    ">>> for parameter_set in parameters_list:\n"
+    "...     dfn, dfd, style = parameter_set\n"
+    "...     fdtrc_vals = fdtrc(dfn, dfd, x)\n"
+    "...     ax.plot(x, fdtrc_vals, label=rf\"$d_n={dfn},\\, d_d={dfd}$\",\n"
+    "...             ls=style)\n"
+    ">>> ax.legend()\n"
+    ">>> ax.set_xlabel(\"$x$\")\n"
+    ">>> ax.set_title(\"F distribution survival function\")\n"
+    ">>> plt.show()\n"
+    "\n"
+    "The F distribution is also available as `scipy.stats.f`. Using `fdtrc`\n"
+    "directly can be much faster than calling the ``sf`` method of\n"
+    "`scipy.stats.f`, especially for small arrays or individual values.\n"
+    "To get the same results one must use the following parametrization:\n"
+    "``stats.f(dfn, dfd).sf(x)=fdtrc(dfn, dfd, x)``.\n"
+    "\n"
+    ">>> from scipy.stats import f\n"
+    ">>> dfn, dfd = 1, 2\n"
+    ">>> x = 1\n"
+    ">>> fdtrc_res = fdtrc(dfn, dfd, x)  # this will often be faster than below\n"
+    ">>> f_dist_res = f(dfn, dfd).sf(x)\n"
+    ">>> f_dist_res == fdtrc_res  # test that results are equal\n"
+    "True")
+ufunc_fdtrc_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # loop registered for signature 0 (NPY_FLOAT row below)
+ufunc_fdtrc_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop registered for signature 1 (NPY_DOUBLE row below)
+ufunc_fdtrc_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..3]
+ufunc_fdtrc_types[1] = <char>NPY_FLOAT
+ufunc_fdtrc_types[2] = <char>NPY_FLOAT
+ufunc_fdtrc_types[3] = <char>NPY_FLOAT
+ufunc_fdtrc_types[4] = <char>NPY_DOUBLE  # signature 1 occupies types[4..7]
+ufunc_fdtrc_types[5] = <char>NPY_DOUBLE
+ufunc_fdtrc_types[6] = <char>NPY_DOUBLE
+ufunc_fdtrc_types[7] = <char>NPY_DOUBLE
+ufunc_fdtrc_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_f_sf_float  # kernel for loop 0
+ufunc_fdtrc_ptr[2*0+1] = <void*>(<char*>"fdtrc")  # name stored beside the kernel; presumably for error reporting in the loop -- TODO confirm
+ufunc_fdtrc_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_f_sf_double  # kernel for loop 1
+ufunc_fdtrc_ptr[2*1+1] = <void*>(<char*>"fdtrc")  # name stored beside the kernel
+ufunc_fdtrc_data[0] = &ufunc_fdtrc_ptr[2*0]  # loop 0 is handed the address of its pair
+ufunc_fdtrc_data[1] = &ufunc_fdtrc_ptr[2*1]  # loop 1 is handed the address of its pair
+fdtrc = np.PyUFunc_FromFuncAndData(ufunc_fdtrc_loops, ufunc_fdtrc_data, ufunc_fdtrc_types, 2, 3, 1, 0, 'fdtrc', ufunc_fdtrc_doc, 0)  # args after the tables: ntypes=2, nin=3, nout=1, identity=0 (NumPy C-API order)
+
+cdef np.PyUFuncGenericFunction ufunc_fdtri_loops[2]  # tables used to build the fdtri ufunc below: inner-loop functions, one per type signature
+cdef void *ufunc_fdtri_ptr[4]  # two (kernel pointer, name string) pairs, one per loop
+cdef void *ufunc_fdtri_data[2]  # per-loop pointer to that loop's pair in ufunc_fdtri_ptr
+cdef char ufunc_fdtri_types[8]  # 2 signatures x (3 inputs + 1 output) NumPy type codes
+cdef char *ufunc_fdtri_doc = (
+    "fdtri(dfn, dfd, p, out=None)\n"
+    "\n"
+    "The `p`-th quantile of the F-distribution.\n"
+    "\n"
+    "This function is the inverse of the F-distribution CDF, `fdtr`, returning\n"
+    "the `x` such that `fdtr(dfn, dfd, x) = p`.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "dfn : array_like\n"
+    "    First parameter (positive float).\n"
+    "dfd : array_like\n"
+    "    Second parameter (positive float).\n"
+    "p : array_like\n"
+    "    Cumulative probability, in [0, 1].\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "x : scalar or ndarray\n"
+    "    The quantile corresponding to `p`.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "fdtr : F distribution cumulative distribution function\n"
+    "fdtrc : F distribution survival function\n"
+    "scipy.stats.f : F distribution\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "Wrapper for a routine from the Boost Math C++ library [1]_. The\n"
+    "F distribution is also available as `scipy.stats.f`. Calling\n"
+    "`fdtri` directly can improve performance compared to the ``ppf``\n"
+    "method of `scipy.stats.f` (see last example below).\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] The Boost Developers. \"Boost C++ Libraries\". https://www.boost.org/.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "`fdtri` represents the inverse of the F distribution CDF which is\n"
+    "available as `fdtr`. Here, we calculate the CDF for ``df1=1``, ``df2=2``\n"
+    "at ``x=3``. `fdtri` then returns ``3`` given the same values for `df1`,\n"
+    "`df2` and the computed CDF value.\n"
+    "\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import fdtri, fdtr\n"
+    ">>> df1, df2 = 1, 2\n"
+    ">>> x = 3\n"
+    ">>> cdf_value =  fdtr(df1, df2, x)\n"
+    ">>> fdtri(df1, df2, cdf_value)\n"
+    "3.000000000000006\n"
+    "\n"
+    "Calculate the function at several points by providing a NumPy array for\n"
+    "`x`.\n"
+    "\n"
+    ">>> x = np.array([0.1, 0.4, 0.7])\n"
+    ">>> fdtri(1, 2, x)\n"
+    "array([0.02020202, 0.38095238, 1.92156863])\n"
+    "\n"
+    "Plot the function for several parameter sets.\n"
+    "\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> dfn_parameters = [50, 10, 1, 50]\n"
+    ">>> dfd_parameters = [0.5, 1, 1, 5]\n"
+    ">>> linestyles = ['solid', 'dashed', 'dotted', 'dashdot']\n"
+    ">>> parameters_list = list(zip(dfn_parameters, dfd_parameters,\n"
+    "...                            linestyles))\n"
+    ">>> x = np.linspace(0, 1, 1000)\n"
+    ">>> fig, ax = plt.subplots()\n"
+    ">>> for parameter_set in parameters_list:\n"
+    "...     dfn, dfd, style = parameter_set\n"
+    "...     fdtri_vals = fdtri(dfn, dfd, x)\n"
+    "...     ax.plot(x, fdtri_vals, label=rf\"$d_n={dfn},\\, d_d={dfd}$\",\n"
+    "...             ls=style)\n"
+    ">>> ax.legend()\n"
+    ">>> ax.set_xlabel(\"$x$\")\n"
+    ">>> title = \"F distribution inverse cumulative distribution function\"\n"
+    ">>> ax.set_title(title)\n"
+    ">>> ax.set_ylim(0, 30)\n"
+    ">>> plt.show()\n"
+    "\n"
+    "The F distribution is also available as `scipy.stats.f`. Using `fdtri`\n"
+    "directly can be much faster than calling the ``ppf`` method of\n"
+    "`scipy.stats.f`, especially for small arrays or individual values.\n"
+    "To get the same results one must use the following parametrization:\n"
+    "``stats.f(dfn, dfd).ppf(x)=fdtri(dfn, dfd, x)``.\n"
+    "\n"
+    ">>> from scipy.stats import f\n"
+    ">>> dfn, dfd = 1, 2\n"
+    ">>> x = 0.7\n"
+    ">>> fdtri_res = fdtri(dfn, dfd, x)  # this will often be faster than below\n"
+    ">>> f_dist_res = f(dfn, dfd).ppf(x)\n"
+    ">>> f_dist_res == fdtri_res  # test that results are equal\n"
+    "True")
+ufunc_fdtri_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # loop registered for signature 0 (NPY_FLOAT row below)
+ufunc_fdtri_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop registered for signature 1 (NPY_DOUBLE row below)
+ufunc_fdtri_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..3]
+ufunc_fdtri_types[1] = <char>NPY_FLOAT
+ufunc_fdtri_types[2] = <char>NPY_FLOAT
+ufunc_fdtri_types[3] = <char>NPY_FLOAT
+ufunc_fdtri_types[4] = <char>NPY_DOUBLE  # signature 1 occupies types[4..7]
+ufunc_fdtri_types[5] = <char>NPY_DOUBLE
+ufunc_fdtri_types[6] = <char>NPY_DOUBLE
+ufunc_fdtri_types[7] = <char>NPY_DOUBLE
+ufunc_fdtri_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_f_ppf_float  # kernel for loop 0
+ufunc_fdtri_ptr[2*0+1] = <void*>(<char*>"fdtri")  # name stored beside the kernel; presumably for error reporting in the loop -- TODO confirm
+ufunc_fdtri_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_f_ppf_double  # kernel for loop 1
+ufunc_fdtri_ptr[2*1+1] = <void*>(<char*>"fdtri")  # name stored beside the kernel
+ufunc_fdtri_data[0] = &ufunc_fdtri_ptr[2*0]  # loop 0 is handed the address of its pair
+ufunc_fdtri_data[1] = &ufunc_fdtri_ptr[2*1]  # loop 1 is handed the address of its pair
+fdtri = np.PyUFunc_FromFuncAndData(ufunc_fdtri_loops, ufunc_fdtri_data, ufunc_fdtri_types, 2, 3, 1, 0, 'fdtri', ufunc_fdtri_doc, 0)  # args after the tables: ntypes=2, nin=3, nout=1, identity=0 (NumPy C-API order)
+
+cdef np.PyUFuncGenericFunction ufunc_fdtridfd_loops[2]  # tables used to build the fdtridfd ufunc below: inner-loop functions, one per type signature
+cdef void *ufunc_fdtridfd_ptr[4]  # two (kernel pointer, name string) pairs, one per loop
+cdef void *ufunc_fdtridfd_data[2]  # per-loop pointer to that loop's pair in ufunc_fdtridfd_ptr
+cdef char ufunc_fdtridfd_types[8]  # 2 signatures x (3 inputs + 1 output) NumPy type codes
+cdef char *ufunc_fdtridfd_doc = (
+    "fdtridfd(dfn, p, x, out=None)\n"
+    "\n"
+    "Inverse to `fdtr` vs dfd\n"
+    "\n"
+    "Finds the F density argument dfd such that ``fdtr(dfn, dfd, x) == p``.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "dfn : array_like\n"
+    "    First parameter (positive float).\n"
+    "p : array_like\n"
+    "    Cumulative probability, in [0, 1].\n"
+    "x : array_like\n"
+    "    Argument (nonnegative float).\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "dfd : scalar or ndarray\n"
+    "    `dfd` such that ``fdtr(dfn, dfd, x) == p``.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "fdtr : F distribution cumulative distribution function\n"
+    "fdtrc : F distribution survival function\n"
+    "fdtri : F distribution quantile function\n"
+    "scipy.stats.f : F distribution\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "Compute the F distribution cumulative distribution function for one\n"
+    "parameter set.\n"
+    "\n"
+    ">>> from scipy.special import fdtridfd, fdtr\n"
+    ">>> dfn, dfd, x = 10, 5, 2\n"
+    ">>> cdf_value = fdtr(dfn, dfd, x)\n"
+    ">>> cdf_value\n"
+    "0.7700248806501017\n"
+    "\n"
+    "Verify that `fdtridfd` recovers the original value for `dfd`:\n"
+    "\n"
+    ">>> fdtridfd(dfn, cdf_value, x)\n"
+    "5.0")
+ufunc_fdtridfd_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f  # loop registered for signature 0 (NPY_FLOAT row below); name suggests it calls a double kernel -- TODO confirm
+ufunc_fdtridfd_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop registered for signature 1 (NPY_DOUBLE row below)
+ufunc_fdtridfd_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..3]
+ufunc_fdtridfd_types[1] = <char>NPY_FLOAT
+ufunc_fdtridfd_types[2] = <char>NPY_FLOAT
+ufunc_fdtridfd_types[3] = <char>NPY_FLOAT
+ufunc_fdtridfd_types[4] = <char>NPY_DOUBLE  # signature 1 occupies types[4..7]
+ufunc_fdtridfd_types[5] = <char>NPY_DOUBLE
+ufunc_fdtridfd_types[6] = <char>NPY_DOUBLE
+ufunc_fdtridfd_types[7] = <char>NPY_DOUBLE
+ufunc_fdtridfd_ptr[2*0] = <void*>_func_fdtridfd  # kernel for loop 0
+ufunc_fdtridfd_ptr[2*0+1] = <void*>(<char*>"fdtridfd")  # name stored beside the kernel; presumably for error reporting in the loop -- TODO confirm
+ufunc_fdtridfd_ptr[2*1] = <void*>_func_fdtridfd  # kernel for loop 1 (same function as loop 0)
+ufunc_fdtridfd_ptr[2*1+1] = <void*>(<char*>"fdtridfd")  # name stored beside the kernel
+ufunc_fdtridfd_data[0] = &ufunc_fdtridfd_ptr[2*0]  # loop 0 is handed the address of its pair
+ufunc_fdtridfd_data[1] = &ufunc_fdtridfd_ptr[2*1]  # loop 1 is handed the address of its pair
+fdtridfd = np.PyUFunc_FromFuncAndData(ufunc_fdtridfd_loops, ufunc_fdtridfd_data, ufunc_fdtridfd_types, 2, 3, 1, 0, 'fdtridfd', ufunc_fdtridfd_doc, 0)  # args after the tables: ntypes=2, nin=3, nout=1, identity=0 (NumPy C-API order)
+
+cdef np.PyUFuncGenericFunction ufunc_gdtr_loops[2]  # tables used to build the gdtr ufunc below: inner-loop functions, one per type signature
+cdef void *ufunc_gdtr_ptr[4]  # two (kernel pointer, name string) pairs, one per loop
+cdef void *ufunc_gdtr_data[2]  # per-loop pointer to that loop's pair in ufunc_gdtr_ptr
+cdef char ufunc_gdtr_types[8]  # 2 signatures x (3 inputs + 1 output) NumPy type codes
+cdef char *ufunc_gdtr_doc = (
+    "gdtr(a, b, x, out=None)\n"
+    "\n"
+    "Gamma distribution cumulative distribution function.\n"
+    "\n"
+    "Returns the integral from zero to `x` of the gamma probability density\n"
+    "function,\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    F = \\int_0^x \\frac{a^b}{\\Gamma(b)} t^{b-1} e^{-at}\\,dt,\n"
+    "\n"
+    "where :math:`\\Gamma` is the gamma function.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "a : array_like\n"
+    "    The rate parameter of the gamma distribution, sometimes denoted\n"
+    "    :math:`\\beta` (float).  It is also the reciprocal of the scale\n"
+    "    parameter :math:`\\theta`.\n"
+    "b : array_like\n"
+    "    The shape parameter of the gamma distribution, sometimes denoted\n"
+    "    :math:`\\alpha` (float).\n"
+    "x : array_like\n"
+    "    The quantile (upper limit of integration; float).\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "F : scalar or ndarray\n"
+    "    The CDF of the gamma distribution with parameters `a` and `b`\n"
+    "    evaluated at `x`.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "gdtrc : 1 - CDF of the gamma distribution.\n"
+    "scipy.stats.gamma: Gamma distribution\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "The evaluation is carried out using the relation to the incomplete gamma\n"
+    "integral (regularized gamma function).\n"
+    "\n"
+    "Wrapper for the Cephes [1]_ routine `gdtr`. Calling `gdtr` directly can\n"
+    "improve performance compared to the ``cdf`` method of `scipy.stats.gamma`\n"
+    "(see last example below).\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Cephes Mathematical Functions Library,\n"
+    "       http://www.netlib.org/cephes/\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "Compute the function for ``a=1``, ``b=2`` at ``x=5``.\n"
+    "\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import gdtr\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> gdtr(1., 2., 5.)\n"
+    "0.9595723180054873\n"
+    "\n"
+    "Compute the function for ``a=1`` and ``b=1`` at several points by\n"
+    "providing a NumPy array for `x`.\n"
+    "\n"
+    ">>> xvalues = np.array([1., 2., 3., 4])\n"
+    ">>> gdtr(1., 1., xvalues)\n"
+    "array([0.63212056, 0.86466472, 0.95021293, 0.98168436])\n"
+    "\n"
+    "`gdtr` can evaluate different parameter sets by providing arrays with\n"
+    "broadcasting compatible shapes for `a`, `b` and `x`. Here we compute the\n"
+    "function for three different `a` at four positions `x` and ``b=3``,\n"
+    "resulting in a 3x4 array.\n"
+    "\n"
+    ">>> a = np.array([[0.5], [1.5], [2.5]])\n"
+    ">>> x = np.array([1., 2., 3., 4])\n"
+    ">>> a.shape, x.shape\n"
+    "((3, 1), (4,))\n"
+    "\n"
+    ">>> gdtr(a, 3., x)\n"
+    "array([[0.01438768, 0.0803014 , 0.19115317, 0.32332358],\n"
+    "       [0.19115317, 0.57680992, 0.82642193, 0.9380312 ],\n"
+    "       [0.45618688, 0.87534798, 0.97974328, 0.9972306 ]])\n"
+    "\n"
+    "Plot the function for four different parameter sets.\n"
+    "\n"
+    ">>> a_parameters = [0.3, 1, 2, 6]\n"
+    ">>> b_parameters = [2, 10, 15, 20]\n"
+    ">>> linestyles = ['solid', 'dashed', 'dotted', 'dashdot']\n"
+    ">>> parameters_list = list(zip(a_parameters, b_parameters, linestyles))\n"
+    ">>> x = np.linspace(0, 30, 1000)\n"
+    ">>> fig, ax = plt.subplots()\n"
+    ">>> for parameter_set in parameters_list:\n"
+    "...     a, b, style = parameter_set\n"
+    "...     gdtr_vals = gdtr(a, b, x)\n"
+    "...     ax.plot(x, gdtr_vals, label=fr\"$a= {a},\\, b={b}$\", ls=style)\n"
+    ">>> ax.legend()\n"
+    ">>> ax.set_xlabel(\"$x$\")\n"
+    ">>> ax.set_title(\"Gamma distribution cumulative distribution function\")\n"
+    ">>> plt.show()\n"
+    "\n"
+    "The gamma distribution is also available as `scipy.stats.gamma`. Using\n"
+    "`gdtr` directly can be much faster than calling the ``cdf`` method of\n"
+    "`scipy.stats.gamma`, especially for small arrays or individual values.\n"
+    "To get the same results one must use the following parametrization:\n"
+    "``stats.gamma(b, scale=1/a).cdf(x)=gdtr(a, b, x)``.\n"
+    "\n"
+    ">>> from scipy.stats import gamma\n"
+    ">>> a = 2.\n"
+    ">>> b = 3\n"
+    ">>> x = 1.\n"
+    ">>> gdtr_result = gdtr(a, b, x)  # this will often be faster than below\n"
+    ">>> gamma_dist_result = gamma(b, scale=1/a).cdf(x)\n"
+    ">>> gdtr_result == gamma_dist_result  # test that results are equal\n"
+    "True")
+ufunc_gdtr_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f  # loop registered for signature 0 (NPY_FLOAT row below); name suggests it calls a double kernel -- TODO confirm
+ufunc_gdtr_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop registered for signature 1 (NPY_DOUBLE row below)
+ufunc_gdtr_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..3]
+ufunc_gdtr_types[1] = <char>NPY_FLOAT
+ufunc_gdtr_types[2] = <char>NPY_FLOAT
+ufunc_gdtr_types[3] = <char>NPY_FLOAT
+ufunc_gdtr_types[4] = <char>NPY_DOUBLE  # signature 1 occupies types[4..7]
+ufunc_gdtr_types[5] = <char>NPY_DOUBLE
+ufunc_gdtr_types[6] = <char>NPY_DOUBLE
+ufunc_gdtr_types[7] = <char>NPY_DOUBLE
+ufunc_gdtr_ptr[2*0] = <void*>_func_xsf_gdtr  # kernel for loop 0
+ufunc_gdtr_ptr[2*0+1] = <void*>(<char*>"gdtr")  # name stored beside the kernel; presumably for error reporting in the loop -- TODO confirm
+ufunc_gdtr_ptr[2*1] = <void*>_func_xsf_gdtr  # kernel for loop 1 (same function as loop 0)
+ufunc_gdtr_ptr[2*1+1] = <void*>(<char*>"gdtr")  # name stored beside the kernel
+ufunc_gdtr_data[0] = &ufunc_gdtr_ptr[2*0]  # loop 0 is handed the address of its pair
+ufunc_gdtr_data[1] = &ufunc_gdtr_ptr[2*1]  # loop 1 is handed the address of its pair
+gdtr = np.PyUFunc_FromFuncAndData(ufunc_gdtr_loops, ufunc_gdtr_data, ufunc_gdtr_types, 2, 3, 1, 0, 'gdtr', ufunc_gdtr_doc, 0)  # args after the tables: ntypes=2, nin=3, nout=1, identity=0 (NumPy C-API order)
+
+cdef np.PyUFuncGenericFunction ufunc_gdtrc_loops[2]  # tables used to build the gdtrc ufunc below: inner-loop functions, one per type signature
+cdef void *ufunc_gdtrc_ptr[4]  # two (kernel pointer, name string) pairs, one per loop
+cdef void *ufunc_gdtrc_data[2]  # per-loop pointer to that loop's pair in ufunc_gdtrc_ptr
+cdef char ufunc_gdtrc_types[8]  # 2 signatures x (3 inputs + 1 output) NumPy type codes
+cdef char *ufunc_gdtrc_doc = (
+    "gdtrc(a, b, x, out=None)\n"
+    "\n"
+    "Gamma distribution survival function.\n"
+    "\n"
+    "Integral from `x` to infinity of the gamma probability density function,\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    F = \\int_x^\\infty \\frac{a^b}{\\Gamma(b)} t^{b-1} e^{-at}\\,dt,\n"
+    "\n"
+    "where :math:`\\Gamma` is the gamma function.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "a : array_like\n"
+    "    The rate parameter of the gamma distribution, sometimes denoted\n"
+    "    :math:`\\beta` (float). It is also the reciprocal of the scale\n"
+    "    parameter :math:`\\theta`.\n"
+    "b : array_like\n"
+    "    The shape parameter of the gamma distribution, sometimes denoted\n"
+    "    :math:`\\alpha` (float).\n"
+    "x : array_like\n"
+    "    The quantile (lower limit of integration; float).\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "F : scalar or ndarray\n"
+    "    The survival function of the gamma distribution with parameters `a`\n"
+    "    and `b` evaluated at `x`.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "gdtr: Gamma distribution cumulative distribution function\n"
+    "scipy.stats.gamma: Gamma distribution\n"
+    "gdtrix\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "The evaluation is carried out using the relation to the incomplete gamma\n"
+    "integral (regularized gamma function).\n"
+    "\n"
+    "Wrapper for the Cephes [1]_ routine `gdtrc`. Calling `gdtrc` directly can\n"
+    "improve performance compared to the ``sf`` method of `scipy.stats.gamma`\n"
+    "(see last example below).\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Cephes Mathematical Functions Library,\n"
+    "       http://www.netlib.org/cephes/\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "Compute the function for ``a=1`` and ``b=2`` at ``x=5``.\n"
+    "\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import gdtrc\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> gdtrc(1., 2., 5.)\n"
+    "0.04042768199451279\n"
+    "\n"
+    "Compute the function for ``a=1``, ``b=1`` at several points by providing\n"
+    "a NumPy array for `x`.\n"
+    "\n"
+    ">>> xvalues = np.array([1., 2., 3., 4])\n"
+    ">>> gdtrc(1., 1., xvalues)\n"
+    "array([0.36787944, 0.13533528, 0.04978707, 0.01831564])\n"
+    "\n"
+    "`gdtrc` can evaluate different parameter sets by providing arrays with\n"
+    "broadcasting compatible shapes for `a`, `b` and `x`. Here we compute the\n"
+    "function for three different `a` at four positions `x` and ``b=3``,\n"
+    "resulting in a 3x4 array.\n"
+    "\n"
+    ">>> a = np.array([[0.5], [1.5], [2.5]])\n"
+    ">>> x = np.array([1., 2., 3., 4])\n"
+    ">>> a.shape, x.shape\n"
+    "((3, 1), (4,))\n"
+    "\n"
+    ">>> gdtrc(a, 3., x)\n"
+    "array([[0.98561232, 0.9196986 , 0.80884683, 0.67667642],\n"
+    "       [0.80884683, 0.42319008, 0.17357807, 0.0619688 ],\n"
+    "       [0.54381312, 0.12465202, 0.02025672, 0.0027694 ]])\n"
+    "\n"
+    "Plot the function for four different parameter sets.\n"
+    "\n"
+    ">>> a_parameters = [0.3, 1, 2, 6]\n"
+    ">>> b_parameters = [2, 10, 15, 20]\n"
+    ">>> linestyles = ['solid', 'dashed', 'dotted', 'dashdot']\n"
+    ">>> parameters_list = list(zip(a_parameters, b_parameters, linestyles))\n"
+    ">>> x = np.linspace(0, 30, 1000)\n"
+    ">>> fig, ax = plt.subplots()\n"
+    ">>> for parameter_set in parameters_list:\n"
+    "...     a, b, style = parameter_set\n"
+    "...     gdtrc_vals = gdtrc(a, b, x)\n"
+    "...     ax.plot(x, gdtrc_vals, label=fr\"$a= {a},\\, b={b}$\", ls=style)\n"
+    ">>> ax.legend()\n"
+    ">>> ax.set_xlabel(\"$x$\")\n"
+    ">>> ax.set_title(\"Gamma distribution survival function\")\n"
+    ">>> plt.show()\n"
+    "\n"
+    "The gamma distribution is also available as `scipy.stats.gamma`.\n"
+    "Using `gdtrc` directly can be much faster than calling the ``sf`` method\n"
+    "of `scipy.stats.gamma`, especially for small arrays or individual\n"
+    "values. To get the same results one must use the following parametrization:\n"
+    "``stats.gamma(b, scale=1/a).sf(x)=gdtrc(a, b, x)``.\n"
+    "\n"
+    ">>> from scipy.stats import gamma\n"
+    ">>> a = 2\n"
+    ">>> b = 3\n"
+    ">>> x = 1.\n"
+    ">>> gdtrc_result = gdtrc(a, b, x)  # this will often be faster than below\n"
+    ">>> gamma_dist_result = gamma(b, scale=1/a).sf(x)\n"
+    ">>> gdtrc_result == gamma_dist_result  # test that results are equal\n"
+    "True")
+ufunc_gdtrc_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f  # loop registered for signature 0 (NPY_FLOAT row below); name suggests it calls a double kernel -- TODO confirm
+ufunc_gdtrc_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop registered for signature 1 (NPY_DOUBLE row below)
+ufunc_gdtrc_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..3]
+ufunc_gdtrc_types[1] = <char>NPY_FLOAT
+ufunc_gdtrc_types[2] = <char>NPY_FLOAT
+ufunc_gdtrc_types[3] = <char>NPY_FLOAT
+ufunc_gdtrc_types[4] = <char>NPY_DOUBLE  # signature 1 occupies types[4..7]
+ufunc_gdtrc_types[5] = <char>NPY_DOUBLE
+ufunc_gdtrc_types[6] = <char>NPY_DOUBLE
+ufunc_gdtrc_types[7] = <char>NPY_DOUBLE
+ufunc_gdtrc_ptr[2*0] = <void*>_func_xsf_gdtrc  # kernel for loop 0
+ufunc_gdtrc_ptr[2*0+1] = <void*>(<char*>"gdtrc")  # name stored beside the kernel; presumably for error reporting in the loop -- TODO confirm
+ufunc_gdtrc_ptr[2*1] = <void*>_func_xsf_gdtrc  # kernel for loop 1 (same function as loop 0)
+ufunc_gdtrc_ptr[2*1+1] = <void*>(<char*>"gdtrc")  # name stored beside the kernel
+ufunc_gdtrc_data[0] = &ufunc_gdtrc_ptr[2*0]  # loop 0 is handed the address of its pair
+ufunc_gdtrc_data[1] = &ufunc_gdtrc_ptr[2*1]  # loop 1 is handed the address of its pair
+gdtrc = np.PyUFunc_FromFuncAndData(ufunc_gdtrc_loops, ufunc_gdtrc_data, ufunc_gdtrc_types, 2, 3, 1, 0, 'gdtrc', ufunc_gdtrc_doc, 0)  # args after the tables: ntypes=2, nin=3, nout=1, identity=0 (NumPy C-API order)
+
+cdef np.PyUFuncGenericFunction ufunc_gdtria_loops[2]  # tables used to build the gdtria ufunc below: inner-loop functions, one per type signature
+cdef void *ufunc_gdtria_ptr[4]  # two (kernel pointer, name string) pairs, one per loop
+cdef void *ufunc_gdtria_data[2]  # per-loop pointer to that loop's pair in ufunc_gdtria_ptr
+cdef char ufunc_gdtria_types[8]  # 2 signatures x (3 inputs + 1 output) NumPy type codes
+cdef char *ufunc_gdtria_doc = (
+    "gdtria(p, b, x, out=None)\n"
+    "\n"
+    "Inverse of `gdtr` vs a.\n"
+    "\n"
+    "Returns the inverse with respect to the parameter `a` of ``p =\n"
+    "gdtr(a, b, x)``, the cumulative distribution function of the gamma\n"
+    "distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "p : array_like\n"
+    "    Probability values.\n"
+    "b : array_like\n"
+    "    `b` parameter values of `gdtr(a, b, x)`. `b` is the \"shape\" parameter\n"
+    "    of the gamma distribution.\n"
+    "x : array_like\n"
+    "    Nonnegative real values, from the domain of the gamma distribution.\n"
+    "out : ndarray, optional\n"
+    "    If a fourth argument is given, it must be a numpy.ndarray whose size\n"
+    "    matches the broadcast result of `p`, `b` and `x`.  `out` is then the\n"
+    "    array returned by the function.\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "a : scalar or ndarray\n"
+    "    Values of the `a` parameter such that ``p = gdtr(a, b, x)``.  ``1/a``\n"
+    "    is the \"scale\" parameter of the gamma distribution.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "gdtr : CDF of the gamma distribution.\n"
+    "gdtrib : Inverse with respect to `b` of `gdtr(a, b, x)`.\n"
+    "gdtrix : Inverse with respect to `x` of `gdtr(a, b, x)`.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "Wrapper for the CDFLIB [1]_ Fortran routine `cdfgam`.\n"
+    "\n"
+    "The cumulative distribution function `p` is computed using a routine by\n"
+    "DiDonato and Morris [2]_. Computation of `a` involves a search for a value\n"
+    "that produces the desired value of `p`. The search relies on the\n"
+    "monotonicity of `p` with `a`.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Barry Brown, James Lovato, and Kathy Russell,\n"
+    "       CDFLIB: Library of Fortran Routines for Cumulative Distribution\n"
+    "       Functions, Inverses, and Other Parameters.\n"
+    ".. [2] DiDonato, A. R. and Morris, A. H.,\n"
+    "       Computation of the incomplete gamma function ratios and their\n"
+    "       inverse.  ACM Trans. Math. Softw. 12 (1986), 377-393.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "First evaluate `gdtr`.\n"
+    "\n"
+    ">>> from scipy.special import gdtr, gdtria\n"
+    ">>> p = gdtr(1.2, 3.4, 5.6)\n"
+    ">>> print(p)\n"
+    "0.94378087442\n"
+    "\n"
+    "Verify the inverse.\n"
+    "\n"
+    ">>> gdtria(p, 3.4, 5.6)\n"
+    "1.2")
+ufunc_gdtria_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f  # loop registered for signature 0 (NPY_FLOAT row below); name suggests it calls a double kernel -- TODO confirm
+ufunc_gdtria_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop registered for signature 1 (NPY_DOUBLE row below)
+ufunc_gdtria_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..3]
+ufunc_gdtria_types[1] = <char>NPY_FLOAT
+ufunc_gdtria_types[2] = <char>NPY_FLOAT
+ufunc_gdtria_types[3] = <char>NPY_FLOAT
+ufunc_gdtria_types[4] = <char>NPY_DOUBLE  # signature 1 occupies types[4..7]
+ufunc_gdtria_types[5] = <char>NPY_DOUBLE
+ufunc_gdtria_types[6] = <char>NPY_DOUBLE
+ufunc_gdtria_types[7] = <char>NPY_DOUBLE
+ufunc_gdtria_ptr[2*0] = <void*>_func_special_gdtria  # kernel for loop 0
+ufunc_gdtria_ptr[2*0+1] = <void*>(<char*>"gdtria")  # name stored beside the kernel; presumably for error reporting in the loop -- TODO confirm
+ufunc_gdtria_ptr[2*1] = <void*>_func_special_gdtria  # kernel for loop 1 (same function as loop 0)
+ufunc_gdtria_ptr[2*1+1] = <void*>(<char*>"gdtria")  # name stored beside the kernel
+ufunc_gdtria_data[0] = &ufunc_gdtria_ptr[2*0]  # loop 0 is handed the address of its pair
+ufunc_gdtria_data[1] = &ufunc_gdtria_ptr[2*1]  # loop 1 is handed the address of its pair
+gdtria = np.PyUFunc_FromFuncAndData(ufunc_gdtria_loops, ufunc_gdtria_data, ufunc_gdtria_types, 2, 3, 1, 0, 'gdtria', ufunc_gdtria_doc, 0)  # args after the tables: ntypes=2, nin=3, nout=1, identity=0 (NumPy C-API order)
+
+cdef np.PyUFuncGenericFunction ufunc_gdtrib_loops[2]  # tables used to build the gdtrib ufunc below: inner-loop functions, one per type signature
+cdef void *ufunc_gdtrib_ptr[4]  # two (kernel pointer, name string) pairs, one per loop
+cdef void *ufunc_gdtrib_data[2]  # per-loop pointer to that loop's pair in ufunc_gdtrib_ptr
+cdef char ufunc_gdtrib_types[8]  # 2 signatures x (3 inputs + 1 output) NumPy type codes
+cdef char *ufunc_gdtrib_doc = (
+    "gdtrib(a, p, x, out=None)\n"
+    "\n"
+    "Inverse of `gdtr` vs b.\n"
+    "\n"
+    "Returns the inverse with respect to the parameter `b` of ``p =\n"
+    "gdtr(a, b, x)``, the cumulative distribution function of the gamma\n"
+    "distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "a : array_like\n"
+    "    `a` parameter values of ``gdtr(a, b, x)``. ``1/a`` is the \"scale\"\n"
+    "    parameter of the gamma distribution.\n"
+    "p : array_like\n"
+    "    Probability values.\n"
+    "x : array_like\n"
+    "    Nonnegative real values, from the domain of the gamma distribution.\n"
+    "out : ndarray, optional\n"
+    "    If a fourth argument is given, it must be a numpy.ndarray whose size\n"
+    "    matches the broadcast result of `a`, `p` and `x`.  `out` is then the\n"
+    "    array returned by the function.\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "b : scalar or ndarray\n"
+    "    Values of the `b` parameter such that `p = gdtr(a, b, x)`.  `b` is\n"
+    "    the \"shape\" parameter of the gamma distribution.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "gdtr : CDF of the gamma distribution.\n"
+    "gdtria : Inverse with respect to `a` of `gdtr(a, b, x)`.\n"
+    "gdtrix : Inverse with respect to `x` of `gdtr(a, b, x)`.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "\n"
+    "The cumulative distribution function `p` is computed using the Cephes [1]_\n"
+    "routines `igam` and `igamc`. Computation of `b` involves a search for a value\n"
+    "that produces the desired value of `p` using Chandrupatla's bracketing\n"
+    "root finding algorithm [2]_.\n"
+    "\n"
+    "Note that there are some edge cases where `gdtrib` is extended by taking\n"
+    "limits where they are uniquely defined. In particular\n"
+    "``x == 0`` with ``p > 0`` and ``p == 0`` with ``x > 0``.\n"
+    "For these edge cases, a numerical result will be returned for\n"
+    "``gdtrib(a, p, x)`` even though ``gdtr(a, gdtrib(a, p, x), x)`` is\n"
+    "undefined.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Cephes Mathematical Functions Library,\n"
+    "       http://www.netlib.org/cephes/\n"
+    ".. [2] Chandrupatla, Tirupathi R.\n"
+    "       \"A new hybrid quadratic/bisection algorithm for finding the zero of a\n"
+    "       nonlinear function without using derivatives\".\n"
+    "       Advances in Engineering Software, 28(3), 145-149.\n"
+    "       https://doi.org/10.1016/s0965-9978(96)00051-8\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "First evaluate `gdtr`.\n"
+    "\n"
+    ">>> from scipy.special import gdtr, gdtrib\n"
+    ">>> p = gdtr(1.2, 3.4, 5.6)\n"
+    ">>> print(p)\n"
+    "0.94378087442\n"
+    "\n"
+    "Verify the inverse.\n"
+    "\n"
+    ">>> gdtrib(1.2, p, 5.6)\n"
+    "3.3999999999999995")
+ufunc_gdtrib_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f  # loop registered for signature 0 (NPY_FLOAT row below); name suggests it calls a double kernel -- TODO confirm
+ufunc_gdtrib_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop registered for signature 1 (NPY_DOUBLE row below)
+ufunc_gdtrib_types[0] = <char>NPY_FLOAT  # signature 0 occupies types[0..3]
+ufunc_gdtrib_types[1] = <char>NPY_FLOAT
+ufunc_gdtrib_types[2] = <char>NPY_FLOAT
+ufunc_gdtrib_types[3] = <char>NPY_FLOAT
+ufunc_gdtrib_types[4] = <char>NPY_DOUBLE  # signature 1 occupies types[4..7]
+ufunc_gdtrib_types[5] = <char>NPY_DOUBLE
+ufunc_gdtrib_types[6] = <char>NPY_DOUBLE
+ufunc_gdtrib_types[7] = <char>NPY_DOUBLE
+ufunc_gdtrib_ptr[2*0] = <void*>_func_xsf_gdtrib  # kernel for loop 0
+ufunc_gdtrib_ptr[2*0+1] = <void*>(<char*>"gdtrib")  # name stored beside the kernel; presumably for error reporting in the loop -- TODO confirm
+ufunc_gdtrib_ptr[2*1] = <void*>_func_xsf_gdtrib  # kernel for loop 1 (same function as loop 0)
+ufunc_gdtrib_ptr[2*1+1] = <void*>(<char*>"gdtrib")  # name stored beside the kernel
+ufunc_gdtrib_data[0] = &ufunc_gdtrib_ptr[2*0]  # loop 0 is handed the address of its pair
+ufunc_gdtrib_data[1] = &ufunc_gdtrib_ptr[2*1]  # loop 1 is handed the address of its pair
+gdtrib = np.PyUFunc_FromFuncAndData(ufunc_gdtrib_loops, ufunc_gdtrib_data, ufunc_gdtrib_types, 2, 3, 1, 0, 'gdtrib', ufunc_gdtrib_doc, 0)  # args after the tables: ntypes=2, nin=3, nout=1, identity=0 (NumPy C-API order)
+
+cdef np.PyUFuncGenericFunction ufunc_gdtrix_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc_gdtrix_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc_gdtrix_data[2]  # per-loop pointer into ufunc_gdtrix_ptr
+cdef char ufunc_gdtrix_types[8]  # type codes: 3 inputs + 1 output for each of the 2 loops
+cdef char *ufunc_gdtrix_doc = (  # docstring handed to PyUFunc_FromFuncAndData below
+    "gdtrix(a, b, p, out=None)\n"
+    "\n"
+    "Inverse of `gdtr` vs x.\n"
+    "\n"
+    "Returns the inverse with respect to the parameter `x` of ``p =\n"
+    "gdtr(a, b, x)``, the cumulative distribution function of the gamma\n"
+    "distribution. This is also known as the pth quantile of the\n"
+    "distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "a : array_like\n"
+    "    `a` parameter values of ``gdtr(a, b, x)``. ``1/a`` is the \"scale\"\n"
+    "    parameter of the gamma distribution.\n"
+    "b : array_like\n"
+    "    `b` parameter values of ``gdtr(a, b, x)``. `b` is the \"shape\" parameter\n"
+    "    of the gamma distribution.\n"
+    "p : array_like\n"
+    "    Probability values.\n"
+    "out : ndarray, optional\n"
+    "    If a fourth argument is given, it must be a numpy.ndarray whose size\n"
+    "    matches the broadcast result of `a`, `b` and `p`. `out` is then the\n"
+    "    array returned by the function.\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "x : scalar or ndarray\n"
+    "    Values of the `x` parameter such that `p = gdtr(a, b, x)`.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "gdtr : CDF of the gamma distribution.\n"
+    "gdtria : Inverse with respect to `a` of ``gdtr(a, b, x)``.\n"
+    "gdtrib : Inverse with respect to `b` of ``gdtr(a, b, x)``.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "Wrapper for the CDFLIB [1]_ Fortran routine `cdfgam`.\n"
+    "\n"
+    "The cumulative distribution function `p` is computed using a routine by\n"
+    "DiDonato and Morris [2]_. Computation of `x` involves a search for a value\n"
+    "that produces the desired value of `p`. The search relies on the\n"
+    "monotonicity of `p` with `x`.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Barry Brown, James Lovato, and Kathy Russell,\n"
+    "       CDFLIB: Library of Fortran Routines for Cumulative Distribution\n"
+    "       Functions, Inverses, and Other Parameters.\n"
+    ".. [2] DiDonato, A. R. and Morris, A. H.,\n"
+    "       Computation of the incomplete gamma function ratios and their\n"
+    "       inverse.  ACM Trans. Math. Softw. 12 (1986), 377-393.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "First evaluate `gdtr`.\n"
+    "\n"
+    ">>> from scipy.special import gdtr, gdtrix\n"
+    ">>> p = gdtr(1.2, 3.4, 5.6)\n"
+    ">>> print(p)\n"
+    "0.94378087442\n"
+    "\n"
+    "Verify the inverse.\n"
+    "\n"
+    ">>> gdtrix(1.2, 3.4, p)\n"
+    "5.5999999999999996")
+ufunc_gdtrix_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f  # loop 0: registered with the NPY_FLOAT signature
+ufunc_gdtrix_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop 1: registered with the NPY_DOUBLE signature
+ufunc_gdtrix_types[0] = <char>NPY_FLOAT  # entries 0-3: signature of loop 0
+ufunc_gdtrix_types[1] = <char>NPY_FLOAT
+ufunc_gdtrix_types[2] = <char>NPY_FLOAT
+ufunc_gdtrix_types[3] = <char>NPY_FLOAT
+ufunc_gdtrix_types[4] = <char>NPY_DOUBLE  # entries 4-7: signature of loop 1
+ufunc_gdtrix_types[5] = <char>NPY_DOUBLE
+ufunc_gdtrix_types[6] = <char>NPY_DOUBLE
+ufunc_gdtrix_types[7] = <char>NPY_DOUBLE
+ufunc_gdtrix_ptr[2*0] = <void*>_func_special_gdtrix  # both loops share the same kernel
+ufunc_gdtrix_ptr[2*0+1] = <void*>(<char*>"gdtrix")
+ufunc_gdtrix_ptr[2*1] = <void*>_func_special_gdtrix
+ufunc_gdtrix_ptr[2*1+1] = <void*>(<char*>"gdtrix")
+ufunc_gdtrix_data[0] = &ufunc_gdtrix_ptr[2*0]  # data[i] -> start of loop i's (kernel, name) pair
+ufunc_gdtrix_data[1] = &ufunc_gdtrix_ptr[2*1]
+gdtrix = np.PyUFunc_FromFuncAndData(ufunc_gdtrix_loops, ufunc_gdtrix_data, ufunc_gdtrix_types, 2, 3, 1, 0, 'gdtrix', ufunc_gdtrix_doc, 0)  # ntypes=2, nin=3, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc_huber_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc_huber_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc_huber_data[2]  # per-loop pointer into ufunc_huber_ptr
+cdef char ufunc_huber_types[6]  # type codes: 2 inputs + 1 output for each of the 2 loops
+cdef char *ufunc_huber_doc = (  # docstring handed to PyUFunc_FromFuncAndData below
+    "huber(delta, r, out=None)\n"
+    "\n"
+    "Huber loss function.\n"
+    "\n"
+    ".. math:: \\text{huber}(\\delta, r) = \\begin{cases} \\infty & \\delta < 0  \\\\\n"
+    "          \\frac{1}{2}r^2 & 0 \\le \\delta, | r | \\le \\delta \\\\\n"
+    "          \\delta ( |r| - \\frac{1}{2}\\delta ) & \\text{otherwise} \\end{cases}\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "delta : ndarray\n"
+    "    Input array, indicating the quadratic vs. linear loss changepoint.\n"
+    "r : ndarray\n"
+    "    Input array, possibly representing residuals.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    The computed Huber loss function values.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "pseudo_huber : smooth approximation of this function\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "`huber` is useful as a loss function in robust statistics or machine\n"
+    "learning to reduce the influence of outliers as compared to the common\n"
+    "squared error loss, residuals with a magnitude higher than `delta` are\n"
+    "not squared [1]_.\n"
+    "\n"
+    "Typically, `r` represents residuals, the difference\n"
+    "between a model prediction and data. Then, for :math:`|r|\\leq\\delta`,\n"
+    "`huber` resembles the squared error and for :math:`|r|>\\delta` the\n"
+    "absolute error. This way, the Huber loss often achieves\n"
+    "a fast convergence in model fitting for small residuals like the squared\n"
+    "error loss function and still reduces the influence of outliers\n"
+    "(:math:`|r|>\\delta`) like the absolute error loss. As :math:`\\delta` is\n"
+    "the cutoff between squared and absolute error regimes, it has\n"
+    "to be tuned carefully for each problem. `huber` is also\n"
+    "convex, making it suitable for gradient based optimization.\n"
+    "\n"
+    ".. versionadded:: 0.15.0\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Peter Huber. \"Robust Estimation of a Location Parameter\",\n"
+    "       1964. Annals of Mathematical Statistics. 35 (1): 73 - 101.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "Import all necessary modules.\n"
+    "\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import huber\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    "\n"
+    "Compute the function for ``delta=1`` at ``r=2``\n"
+    "\n"
+    ">>> huber(1., 2.)\n"
+    "1.5\n"
+    "\n"
+    "Compute the function for different `delta` by providing a NumPy array or\n"
+    "list for `delta`.\n"
+    "\n"
+    ">>> huber([1., 3., 5.], 4.)\n"
+    "array([3.5, 7.5, 8. ])\n"
+    "\n"
+    "Compute the function at different points by providing a NumPy array or\n"
+    "list for `r`.\n"
+    "\n"
+    ">>> huber(2., np.array([1., 1.5, 3.]))\n"
+    "array([0.5  , 1.125, 4.   ])\n"
+    "\n"
+    "The function can be calculated for different `delta` and `r` by\n"
+    "providing arrays for both with compatible shapes for broadcasting.\n"
+    "\n"
+    ">>> r = np.array([1., 2.5, 8., 10.])\n"
+    ">>> deltas = np.array([[1.], [5.], [9.]])\n"
+    ">>> print(r.shape, deltas.shape)\n"
+    "(4,) (3, 1)\n"
+    "\n"
+    ">>> huber(deltas, r)\n"
+    "array([[ 0.5  ,  2.   ,  7.5  ,  9.5  ],\n"
+    "       [ 0.5  ,  3.125, 27.5  , 37.5  ],\n"
+    "       [ 0.5  ,  3.125, 32.   , 49.5  ]])\n"
+    "\n"
+    "Plot the function for different `delta`.\n"
+    "\n"
+    ">>> x = np.linspace(-4, 4, 500)\n"
+    ">>> deltas = [1, 2, 3]\n"
+    ">>> linestyles = [\"dashed\", \"dotted\", \"dashdot\"]\n"
+    ">>> fig, ax = plt.subplots()\n"
+    ">>> combined_plot_parameters = list(zip(deltas, linestyles))\n"
+    ">>> for delta, style in combined_plot_parameters:\n"
+    "...     ax.plot(x, huber(delta, x), label=fr\"$\\delta={delta}$\", ls=style)\n"
+    ">>> ax.legend(loc=\"upper center\")\n"
+    ">>> ax.set_xlabel(\"$x$\")\n"
+    ">>> ax.set_title(r\"Huber loss function $h_{\\delta}(x)$\")\n"
+    ">>> ax.set_xlim(-4, 4)\n"
+    ">>> ax.set_ylim(0, 8)\n"
+    ">>> plt.show()")
+ufunc_huber_loops[0] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f  # loop 0: registered with the NPY_FLOAT signature
+ufunc_huber_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d  # loop 1: registered with the NPY_DOUBLE signature
+ufunc_huber_types[0] = <char>NPY_FLOAT  # entries 0-2: signature of loop 0
+ufunc_huber_types[1] = <char>NPY_FLOAT
+ufunc_huber_types[2] = <char>NPY_FLOAT
+ufunc_huber_types[3] = <char>NPY_DOUBLE  # entries 3-5: signature of loop 1
+ufunc_huber_types[4] = <char>NPY_DOUBLE
+ufunc_huber_types[5] = <char>NPY_DOUBLE
+ufunc_huber_ptr[2*0] = <void*>_func_huber  # both loops share the same kernel
+ufunc_huber_ptr[2*0+1] = <void*>(<char*>"huber")
+ufunc_huber_ptr[2*1] = <void*>_func_huber
+ufunc_huber_ptr[2*1+1] = <void*>(<char*>"huber")
+ufunc_huber_data[0] = &ufunc_huber_ptr[2*0]  # data[i] -> start of loop i's (kernel, name) pair
+ufunc_huber_data[1] = &ufunc_huber_ptr[2*1]
+huber = np.PyUFunc_FromFuncAndData(ufunc_huber_loops, ufunc_huber_data, ufunc_huber_types, 2, 2, 1, 0, 'huber', ufunc_huber_doc, 0)  # ntypes=2, nin=2, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc_hyp0f1_loops[4]  # one inner loop per registered type signature
+cdef void *ufunc_hyp0f1_ptr[8]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc_hyp0f1_data[4]  # per-loop pointer into ufunc_hyp0f1_ptr
+cdef char ufunc_hyp0f1_types[12]  # type codes: 2 inputs + 1 output for each of the 4 loops
+cdef char *ufunc_hyp0f1_doc = (  # docstring handed to PyUFunc_FromFuncAndData below
+    "hyp0f1(v, z, out=None)\n"
+    "\n"
+    "Confluent hypergeometric limit function 0F1.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "v : array_like\n"
+    "    Real-valued parameter\n"
+    "z : array_like\n"
+    "    Real- or complex-valued argument\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    The confluent hypergeometric limit function\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "This function is defined as:\n"
+    "\n"
+    ".. math:: _0F_1(v, z) = \\sum_{k=0}^{\\infty}\\frac{z^k}{(v)_k k!}.\n"
+    "\n"
+    "It's also the limit as :math:`q \\to \\infty` of :math:`_1F_1(q; v; z/q)`,\n"
+    "and satisfies the differential equation :math:`zf''(z) + vf'(z) =\n"
+    "f(z)`. See [1]_ for more information.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Wolfram MathWorld, \"Confluent Hypergeometric Limit Function\",\n"
+    "       http://mathworld.wolfram.com/ConfluentHypergeometricLimitFunction.html\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> import scipy.special as sc\n"
+    "\n"
+    "It is one when `z` is zero.\n"
+    "\n"
+    ">>> sc.hyp0f1(1, 0)\n"
+    "1.0\n"
+    "\n"
+    "It is the limit of the confluent hypergeometric function as `q`\n"
+    "goes to infinity.\n"
+    "\n"
+    ">>> q = np.array([1, 10, 100, 1000])\n"
+    ">>> v = 1\n"
+    ">>> z = 1\n"
+    ">>> sc.hyp1f1(q, v, z / q)\n"
+    "array([2.71828183, 2.31481985, 2.28303778, 2.27992985])\n"
+    ">>> sc.hyp0f1(v, z)\n"
+    "2.2795853023360673\n"
+    "\n"
+    "It is related to Bessel functions.\n"
+    "\n"
+    ">>> n = 1\n"
+    ">>> x = np.linspace(0, 1, 5)\n"
+    ">>> sc.jv(n, x)\n"
+    "array([0.        , 0.12402598, 0.24226846, 0.3492436 , 0.44005059])\n"
+    ">>> (0.5 * x)**n / sc.factorial(n) * sc.hyp0f1(n + 1, -0.25 * x**2)\n"
+    "array([0.        , 0.12402598, 0.24226846, 0.3492436 , 0.44005059])")
+ufunc_hyp0f1_loops[0] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f  # loops alternate real / complex, single precision first
+ufunc_hyp0f1_loops[1] = <np.PyUFuncGenericFunction>loop_D_dD__As_fF_F
+ufunc_hyp0f1_loops[2] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc_hyp0f1_loops[3] = <np.PyUFuncGenericFunction>loop_D_dD__As_dD_D
+ufunc_hyp0f1_types[0] = <char>NPY_FLOAT  # entries 0-2: signature of loop 0
+ufunc_hyp0f1_types[1] = <char>NPY_FLOAT
+ufunc_hyp0f1_types[2] = <char>NPY_FLOAT
+ufunc_hyp0f1_types[3] = <char>NPY_FLOAT  # entries 3-5: signature of loop 1
+ufunc_hyp0f1_types[4] = <char>NPY_CFLOAT
+ufunc_hyp0f1_types[5] = <char>NPY_CFLOAT
+ufunc_hyp0f1_types[6] = <char>NPY_DOUBLE  # entries 6-8: signature of loop 2
+ufunc_hyp0f1_types[7] = <char>NPY_DOUBLE
+ufunc_hyp0f1_types[8] = <char>NPY_DOUBLE
+ufunc_hyp0f1_types[9] = <char>NPY_DOUBLE  # entries 9-11: signature of loop 3
+ufunc_hyp0f1_types[10] = <char>NPY_CDOUBLE
+ufunc_hyp0f1_types[11] = <char>NPY_CDOUBLE
+ufunc_hyp0f1_ptr[2*0] = <void*>_func__hyp0f1_real  # real kernel for loops 0 and 2, complex kernel for loops 1 and 3
+ufunc_hyp0f1_ptr[2*0+1] = <void*>(<char*>"hyp0f1")
+ufunc_hyp0f1_ptr[2*1] = <void*>_func__hyp0f1_cmplx
+ufunc_hyp0f1_ptr[2*1+1] = <void*>(<char*>"hyp0f1")
+ufunc_hyp0f1_ptr[2*2] = <void*>_func__hyp0f1_real
+ufunc_hyp0f1_ptr[2*2+1] = <void*>(<char*>"hyp0f1")
+ufunc_hyp0f1_ptr[2*3] = <void*>_func__hyp0f1_cmplx
+ufunc_hyp0f1_ptr[2*3+1] = <void*>(<char*>"hyp0f1")
+ufunc_hyp0f1_data[0] = &ufunc_hyp0f1_ptr[2*0]  # data[i] -> start of loop i's (kernel, name) pair
+ufunc_hyp0f1_data[1] = &ufunc_hyp0f1_ptr[2*1]
+ufunc_hyp0f1_data[2] = &ufunc_hyp0f1_ptr[2*2]
+ufunc_hyp0f1_data[3] = &ufunc_hyp0f1_ptr[2*3]
+hyp0f1 = np.PyUFunc_FromFuncAndData(ufunc_hyp0f1_loops, ufunc_hyp0f1_data, ufunc_hyp0f1_types, 4, 2, 1, 0, 'hyp0f1', ufunc_hyp0f1_doc, 0)  # ntypes=4, nin=2, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc_hyp1f1_loops[4]  # inner-loop table, one slot per type signature
+cdef void *ufunc_hyp1f1_ptr[8]  # flat array of (kernel, name) pairs, two slots per loop
+cdef void *ufunc_hyp1f1_data[4]  # address of each loop's pair inside ufunc_hyp1f1_ptr
+cdef char ufunc_hyp1f1_types[16]  # 4 loops x (3 inputs + 1 output) type codes
+cdef char *ufunc_hyp1f1_doc = (  # doc text supplied to the ufunc constructor
+    "hyp1f1(a, b, x, out=None)\n"
+    "\n"
+    "Confluent hypergeometric function 1F1.\n"
+    "\n"
+    "The confluent hypergeometric function is defined by the series\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "   {}_1F_1(a; b; x) = \\sum_{k = 0}^\\infty \\frac{(a)_k}{(b)_k k!} x^k.\n"
+    "\n"
+    "See [DLMF]_ for more details. Here :math:`(\\cdot)_k` is the\n"
+    "Pochhammer symbol; see `poch`.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "a, b : array_like\n"
+    "    Real parameters\n"
+    "x : array_like\n"
+    "    Real or complex argument\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    Values of the confluent hypergeometric function\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "hyperu : another confluent hypergeometric function\n"
+    "hyp0f1 : confluent hypergeometric limit function\n"
+    "hyp2f1 : Gaussian hypergeometric function\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "For real values, this function uses the ``hyp1f1`` routine from the C++ Boost\n"
+    "library [2]_, for complex values a C translation of the specfun\n"
+    "Fortran library [3]_.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [DLMF] NIST Digital Library of Mathematical Functions\n"
+    "          https://dlmf.nist.gov/13.2#E2\n"
+    ".. [2] The Boost Developers. \"Boost C++ Libraries\". https://www.boost.org/.\n"
+    ".. [3] Zhang, Jin, \"Computation of Special Functions\", John Wiley\n"
+    "       and Sons, Inc, 1996.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> import scipy.special as sc\n"
+    "\n"
+    "It is one when `x` is zero:\n"
+    "\n"
+    ">>> sc.hyp1f1(0.5, 0.5, 0)\n"
+    "1.0\n"
+    "\n"
+    "It is singular when `b` is a nonpositive integer.\n"
+    "\n"
+    ">>> sc.hyp1f1(0.5, -1, 0)\n"
+    "inf\n"
+    "\n"
+    "It is a polynomial when `a` is a nonpositive integer.\n"
+    "\n"
+    ">>> a, b, x = -1, 0.5, np.array([1.0, 2.0, 3.0, 4.0])\n"
+    ">>> sc.hyp1f1(a, b, x)\n"
+    "array([-1., -3., -5., -7.])\n"
+    ">>> 1 + (a / b) * x\n"
+    "array([-1., -3., -5., -7.])\n"
+    "\n"
+    "It reduces to the exponential function when ``a = b``.\n"
+    "\n"
+    ">>> sc.hyp1f1(2, 2, [1, 2, 3, 4])\n"
+    "array([ 2.71828183,  7.3890561 , 20.08553692, 54.59815003])\n"
+    ">>> np.exp([1, 2, 3, 4])\n"
+    "array([ 2.71828183,  7.3890561 , 20.08553692, 54.59815003])")
+ufunc_hyp1f1_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f  # slot 0 pairs with types[0:4]
+ufunc_hyp1f1_loops[1] = <np.PyUFuncGenericFunction>loop_D_ddD__As_ffF_F  # slot 1 pairs with types[4:8]
+ufunc_hyp1f1_loops[2] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # slot 2 pairs with types[8:12]
+ufunc_hyp1f1_loops[3] = <np.PyUFuncGenericFunction>loop_D_ddD__As_ddD_D  # slot 3 pairs with types[12:16]
+ufunc_hyp1f1_types[0] = <char>NPY_FLOAT
+ufunc_hyp1f1_types[1] = <char>NPY_FLOAT
+ufunc_hyp1f1_types[2] = <char>NPY_FLOAT
+ufunc_hyp1f1_types[3] = <char>NPY_FLOAT
+ufunc_hyp1f1_types[4] = <char>NPY_FLOAT
+ufunc_hyp1f1_types[5] = <char>NPY_FLOAT
+ufunc_hyp1f1_types[6] = <char>NPY_CFLOAT
+ufunc_hyp1f1_types[7] = <char>NPY_CFLOAT
+ufunc_hyp1f1_types[8] = <char>NPY_DOUBLE
+ufunc_hyp1f1_types[9] = <char>NPY_DOUBLE
+ufunc_hyp1f1_types[10] = <char>NPY_DOUBLE
+ufunc_hyp1f1_types[11] = <char>NPY_DOUBLE
+ufunc_hyp1f1_types[12] = <char>NPY_DOUBLE
+ufunc_hyp1f1_types[13] = <char>NPY_DOUBLE
+ufunc_hyp1f1_types[14] = <char>NPY_CDOUBLE
+ufunc_hyp1f1_types[15] = <char>NPY_CDOUBLE
+ufunc_hyp1f1_ptr[0] = <void*>scipy.special._ufuncs_cxx._export_hyp1f1_double  # even slots: kernel pointer
+ufunc_hyp1f1_ptr[1] = <void*>(<char*>"hyp1f1")  # odd slots: name string
+ufunc_hyp1f1_ptr[2] = <void*>_func_chyp1f1_wrap
+ufunc_hyp1f1_ptr[3] = <void*>(<char*>"hyp1f1")
+ufunc_hyp1f1_ptr[4] = <void*>scipy.special._ufuncs_cxx._export_hyp1f1_double
+ufunc_hyp1f1_ptr[5] = <void*>(<char*>"hyp1f1")
+ufunc_hyp1f1_ptr[6] = <void*>_func_chyp1f1_wrap
+ufunc_hyp1f1_ptr[7] = <void*>(<char*>"hyp1f1")
+ufunc_hyp1f1_data[0] = &ufunc_hyp1f1_ptr[0]
+ufunc_hyp1f1_data[1] = &ufunc_hyp1f1_ptr[2]
+ufunc_hyp1f1_data[2] = &ufunc_hyp1f1_ptr[4]
+ufunc_hyp1f1_data[3] = &ufunc_hyp1f1_ptr[6]
+hyp1f1 = np.PyUFunc_FromFuncAndData(ufunc_hyp1f1_loops, ufunc_hyp1f1_data, ufunc_hyp1f1_types, 4, 3, 1, 0, 'hyp1f1', ufunc_hyp1f1_doc, 0)  # 4 signatures, 3 inputs, 1 output
+
+cdef np.PyUFuncGenericFunction ufunc_hyperu_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc_hyperu_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc_hyperu_data[2]  # per-loop pointer into ufunc_hyperu_ptr
+cdef char ufunc_hyperu_types[8]  # type codes: 3 inputs + 1 output for each of the 2 loops
+cdef char *ufunc_hyperu_doc = (  # docstring handed to PyUFunc_FromFuncAndData below
+    "hyperu(a, b, x, out=None)\n"
+    "\n"
+    "Confluent hypergeometric function U\n"
+    "\n"
+    "It is defined as the solution to the equation\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "   x \\frac{d^2w}{dx^2} + (b - x) \\frac{dw}{dx} - aw = 0\n"
+    "\n"
+    "which satisfies the property\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "   U(a, b, x) \\sim x^{-a}\n"
+    "\n"
+    "as :math:`x \\to \\infty`. See [DLMF]_ for more details.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "a, b : array_like\n"
+    "    Real-valued parameters\n"
+    "x : array_like\n"
+    "    Real-valued argument\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    Values of `U`\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [DLMF] NIST Digital Library of Mathematical Functions\n"
+    "          https://dlmf.nist.gov/13.2#E6\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> import scipy.special as sc\n"
+    "\n"
+    "It has a branch cut along the negative `x` axis.\n"
+    "\n"
+    ">>> x = np.linspace(-0.1, -10, 5)\n"
+    ">>> sc.hyperu(1, 1, x)\n"
+    "array([nan, nan, nan, nan, nan])\n"
+    "\n"
+    "It approaches zero as `x` goes to infinity.\n"
+    "\n"
+    ">>> x = np.array([1, 10, 100])\n"
+    ">>> sc.hyperu(1, 1, x)\n"
+    "array([0.59634736, 0.09156333, 0.00990194])\n"
+    "\n"
+    "It satisfies Kummer's transformation.\n"
+    "\n"
+    ">>> a, b, x = 2, 1, 1\n"
+    ">>> sc.hyperu(a, b, x)\n"
+    "0.1926947246463881\n"
+    ">>> x**(1 - b) * sc.hyperu(a - b + 1, 2 - b, x)\n"
+    "0.1926947246463881")
+ufunc_hyperu_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f  # loop 0: registered with the NPY_FLOAT signature
+ufunc_hyperu_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop 1: registered with the NPY_DOUBLE signature
+ufunc_hyperu_types[0] = <char>NPY_FLOAT  # entries 0-3: signature of loop 0
+ufunc_hyperu_types[1] = <char>NPY_FLOAT
+ufunc_hyperu_types[2] = <char>NPY_FLOAT
+ufunc_hyperu_types[3] = <char>NPY_FLOAT
+ufunc_hyperu_types[4] = <char>NPY_DOUBLE  # entries 4-7: signature of loop 1
+ufunc_hyperu_types[5] = <char>NPY_DOUBLE
+ufunc_hyperu_types[6] = <char>NPY_DOUBLE
+ufunc_hyperu_types[7] = <char>NPY_DOUBLE
+ufunc_hyperu_ptr[2*0] = <void*>_func_hyperu  # both loops share the same kernel
+ufunc_hyperu_ptr[2*0+1] = <void*>(<char*>"hyperu")
+ufunc_hyperu_ptr[2*1] = <void*>_func_hyperu
+ufunc_hyperu_ptr[2*1+1] = <void*>(<char*>"hyperu")
+ufunc_hyperu_data[0] = &ufunc_hyperu_ptr[2*0]  # data[i] -> start of loop i's (kernel, name) pair
+ufunc_hyperu_data[1] = &ufunc_hyperu_ptr[2*1]
+hyperu = np.PyUFunc_FromFuncAndData(ufunc_hyperu_loops, ufunc_hyperu_data, ufunc_hyperu_types, 2, 3, 1, 0, 'hyperu', ufunc_hyperu_doc, 0)  # ntypes=2, nin=3, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc_inv_boxcox_loops[2]  # inner-loop table, one slot per type signature
+cdef void *ufunc_inv_boxcox_ptr[4]  # flat array of (kernel, name) pairs, two slots per loop
+cdef void *ufunc_inv_boxcox_data[2]  # address of each loop's pair inside ufunc_inv_boxcox_ptr
+cdef char ufunc_inv_boxcox_types[6]  # 2 loops x (2 inputs + 1 output) type codes
+cdef char *ufunc_inv_boxcox_doc = (  # doc text supplied to the ufunc constructor
+    "inv_boxcox(y, lmbda, out=None)\n"
+    "\n"
+    "Compute the inverse of the Box-Cox transformation.\n"
+    "\n"
+    "Find ``x`` such that::\n"
+    "\n"
+    "    y = (x**lmbda - 1) / lmbda  if lmbda != 0\n"
+    "        log(x)                  if lmbda == 0\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "y : array_like\n"
+    "    Data to be transformed.\n"
+    "lmbda : array_like\n"
+    "    Power parameter of the Box-Cox transform.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "x : scalar or ndarray\n"
+    "    Transformed data.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "\n"
+    ".. versionadded:: 0.16.0\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> from scipy.special import boxcox, inv_boxcox\n"
+    ">>> y = boxcox([1, 4, 10], 2.5)\n"
+    ">>> inv_boxcox(y, 2.5)\n"
+    "array([1., 4., 10.])")
+ufunc_inv_boxcox_loops[0] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f  # slot 0 pairs with types[0:3]
+ufunc_inv_boxcox_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d  # slot 1 pairs with types[3:6]
+ufunc_inv_boxcox_types[0] = <char>NPY_FLOAT
+ufunc_inv_boxcox_types[1] = <char>NPY_FLOAT
+ufunc_inv_boxcox_types[2] = <char>NPY_FLOAT
+ufunc_inv_boxcox_types[3] = <char>NPY_DOUBLE
+ufunc_inv_boxcox_types[4] = <char>NPY_DOUBLE
+ufunc_inv_boxcox_types[5] = <char>NPY_DOUBLE
+ufunc_inv_boxcox_ptr[0] = <void*>_func_inv_boxcox  # even slots: kernel pointer
+ufunc_inv_boxcox_ptr[1] = <void*>(<char*>"inv_boxcox")  # odd slots: name string
+ufunc_inv_boxcox_ptr[2] = <void*>_func_inv_boxcox
+ufunc_inv_boxcox_ptr[3] = <void*>(<char*>"inv_boxcox")
+ufunc_inv_boxcox_data[0] = &ufunc_inv_boxcox_ptr[0]
+ufunc_inv_boxcox_data[1] = &ufunc_inv_boxcox_ptr[2]
+inv_boxcox = np.PyUFunc_FromFuncAndData(ufunc_inv_boxcox_loops, ufunc_inv_boxcox_data, ufunc_inv_boxcox_types, 2, 2, 1, 0, 'inv_boxcox', ufunc_inv_boxcox_doc, 0)  # 2 signatures, 2 inputs, 1 output
+
+cdef np.PyUFuncGenericFunction ufunc_inv_boxcox1p_loops[2]  # inner-loop table, one slot per type signature
+cdef void *ufunc_inv_boxcox1p_ptr[4]  # flat array of (kernel, name) pairs, two slots per loop
+cdef void *ufunc_inv_boxcox1p_data[2]  # address of each loop's pair inside ufunc_inv_boxcox1p_ptr
+cdef char ufunc_inv_boxcox1p_types[6]  # 2 loops x (2 inputs + 1 output) type codes
+cdef char *ufunc_inv_boxcox1p_doc = (  # doc text supplied to the ufunc constructor
+    "inv_boxcox1p(y, lmbda, out=None)\n"
+    "\n"
+    "Compute the inverse of the Box-Cox transformation.\n"
+    "\n"
+    "Find ``x`` such that::\n"
+    "\n"
+    "    y = ((1+x)**lmbda - 1) / lmbda  if lmbda != 0\n"
+    "        log(1+x)                    if lmbda == 0\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "y : array_like\n"
+    "    Data to be transformed.\n"
+    "lmbda : array_like\n"
+    "    Power parameter of the Box-Cox transform.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "x : scalar or ndarray\n"
+    "    Transformed data.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "\n"
+    ".. versionadded:: 0.16.0\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> from scipy.special import boxcox1p, inv_boxcox1p\n"
+    ">>> y = boxcox1p([1, 4, 10], 2.5)\n"
+    ">>> inv_boxcox1p(y, 2.5)\n"
+    "array([1., 4., 10.])")
+ufunc_inv_boxcox1p_loops[0] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f  # slot 0 pairs with types[0:3]
+ufunc_inv_boxcox1p_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d  # slot 1 pairs with types[3:6]
+ufunc_inv_boxcox1p_types[0] = <char>NPY_FLOAT
+ufunc_inv_boxcox1p_types[1] = <char>NPY_FLOAT
+ufunc_inv_boxcox1p_types[2] = <char>NPY_FLOAT
+ufunc_inv_boxcox1p_types[3] = <char>NPY_DOUBLE
+ufunc_inv_boxcox1p_types[4] = <char>NPY_DOUBLE
+ufunc_inv_boxcox1p_types[5] = <char>NPY_DOUBLE
+ufunc_inv_boxcox1p_ptr[0] = <void*>_func_inv_boxcox1p  # even slots: kernel pointer
+ufunc_inv_boxcox1p_ptr[1] = <void*>(<char*>"inv_boxcox1p")  # odd slots: name string
+ufunc_inv_boxcox1p_ptr[2] = <void*>_func_inv_boxcox1p
+ufunc_inv_boxcox1p_ptr[3] = <void*>(<char*>"inv_boxcox1p")
+ufunc_inv_boxcox1p_data[0] = &ufunc_inv_boxcox1p_ptr[0]
+ufunc_inv_boxcox1p_data[1] = &ufunc_inv_boxcox1p_ptr[2]
+inv_boxcox1p = np.PyUFunc_FromFuncAndData(ufunc_inv_boxcox1p_loops, ufunc_inv_boxcox1p_data, ufunc_inv_boxcox1p_types, 2, 2, 1, 0, 'inv_boxcox1p', ufunc_inv_boxcox1p_doc, 0)  # 2 signatures, 2 inputs, 1 output
+
+cdef np.PyUFuncGenericFunction ufunc_kl_div_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc_kl_div_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc_kl_div_data[2]  # per-loop pointer into ufunc_kl_div_ptr
+cdef char ufunc_kl_div_types[6]  # type codes: 2 inputs + 1 output for each of the 2 loops
+cdef char *ufunc_kl_div_doc = (  # docstring handed to PyUFunc_FromFuncAndData below
+    "kl_div(x, y, out=None)\n"
+    "\n"
+    "Elementwise function for computing Kullback-Leibler divergence.\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    \\mathrm{kl\\_div}(x, y) =\n"
+    "      \\begin{cases}\n"
+    "        x \\log(x / y) - x + y & x > 0, y > 0 \\\\\n"
+    "        y & x = 0, y \\ge 0 \\\\\n"
+    "        \\infty & \\text{otherwise}\n"
+    "      \\end{cases}\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x, y : array_like\n"
+    "    Real arguments\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    Values of the Kullback-Leibler divergence.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "entr, rel_entr, scipy.stats.entropy\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    ".. versionadded:: 0.15.0\n"
+    "\n"
+    "This function is non-negative and is jointly convex in `x` and `y`.\n"
+    "\n"
+    "The origin of this function is in convex programming; see [1]_ for\n"
+    "details. This is why the function contains the extra :math:`-x\n"
+    "+ y` terms over what might be expected from the Kullback-Leibler\n"
+    "divergence. For a version of the function without the extra terms,\n"
+    "see `rel_entr`.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Boyd, Stephen and Lieven Vandenberghe. *Convex optimization*.\n"
+    "       Cambridge University Press, 2004.\n"
+    "       :doi:`https://doi.org/10.1017/CBO9780511804441`")
+ufunc_kl_div_loops[0] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f  # loop 0: registered with the NPY_FLOAT signature
+ufunc_kl_div_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d  # loop 1: registered with the NPY_DOUBLE signature
+ufunc_kl_div_types[0] = <char>NPY_FLOAT  # entries 0-2: signature of loop 0
+ufunc_kl_div_types[1] = <char>NPY_FLOAT
+ufunc_kl_div_types[2] = <char>NPY_FLOAT
+ufunc_kl_div_types[3] = <char>NPY_DOUBLE  # entries 3-5: signature of loop 1
+ufunc_kl_div_types[4] = <char>NPY_DOUBLE
+ufunc_kl_div_types[5] = <char>NPY_DOUBLE
+ufunc_kl_div_ptr[2*0] = <void*>_func_kl_div  # both loops share the same kernel
+ufunc_kl_div_ptr[2*0+1] = <void*>(<char*>"kl_div")
+ufunc_kl_div_ptr[2*1] = <void*>_func_kl_div
+ufunc_kl_div_ptr[2*1+1] = <void*>(<char*>"kl_div")
+ufunc_kl_div_data[0] = &ufunc_kl_div_ptr[2*0]  # data[i] -> start of loop i's (kernel, name) pair
+ufunc_kl_div_data[1] = &ufunc_kl_div_ptr[2*1]
+kl_div = np.PyUFunc_FromFuncAndData(ufunc_kl_div_loops, ufunc_kl_div_data, ufunc_kl_div_types, 2, 2, 1, 0, 'kl_div', ufunc_kl_div_doc, 0)  # ntypes=2, nin=2, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc_kn_loops[3]  # one inner loop per registered type signature
+cdef void *ufunc_kn_ptr[6]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc_kn_data[3]  # per-loop pointer into ufunc_kn_ptr
+cdef char ufunc_kn_types[9]  # type codes: 2 inputs + 1 output for each of the 3 loops
+cdef char *ufunc_kn_doc = (  # docstring handed to PyUFunc_FromFuncAndData below
+    "kn(n, x, out=None)\n"
+    "\n"
+    "Modified Bessel function of the second kind of integer order `n`\n"
+    "\n"
+    "Returns the modified Bessel function of the second kind for integer order\n"
+    "`n` at real `x`.\n"
+    "\n"
+    "These are also sometimes called functions of the third kind, Basset\n"
+    "functions, or Macdonald functions.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "n : array_like of int\n"
+    "    Order of Bessel functions (floats will truncate with a warning)\n"
+    "x : array_like of float\n"
+    "    Argument at which to evaluate the Bessel functions\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results.\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    Value of the Modified Bessel function of the second kind,\n"
+    "    :math:`K_n(x)`.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "kv : Same function, but accepts real order and complex argument\n"
+    "kvp : Derivative of this function\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "Wrapper for AMOS [1]_ routine `zbesk`.  For a discussion of the\n"
+    "algorithm used, see [2]_ and the references therein.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Donald E. Amos, \"AMOS, A Portable Package for Bessel Functions\n"
+    "       of a Complex Argument and Nonnegative Order\",\n"
+    "       http://netlib.org/amos/\n"
+    ".. [2] Donald E. Amos, \"Algorithm 644: A portable package for Bessel\n"
+    "       functions of a complex argument and nonnegative order\", ACM\n"
+    "       TOMS Vol. 12 Issue 3, Sept. 1986, p. 265\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "Plot the function of several orders for real input:\n"
+    "\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import kn\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> x = np.linspace(0, 5, 1000)\n"
+    ">>> for N in range(6):\n"
+    "...     plt.plot(x, kn(N, x), label='$K_{}(x)$'.format(N))\n"
+    ">>> plt.ylim(0, 10)\n"
+    ">>> plt.legend()\n"
+    ">>> plt.title(r'Modified Bessel function of the second kind $K_n(x)$')\n"
+    ">>> plt.show()\n"
+    "\n"
+    "Calculate for a single value at multiple orders:\n"
+    "\n"
+    ">>> kn([4, 5, 6], 1)\n"
+    "array([   44.23241585,   360.9605896 ,  3653.83831186])")
+ufunc_kn_loops[0] = <np.PyUFuncGenericFunction>loop_d_pd__As_pd_d  # loop 0: the only one whose first input is NPY_INTP
+ufunc_kn_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+ufunc_kn_loops[2] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc_kn_types[0] = <char>NPY_INTP  # entries 0-2: signature of loop 0
+ufunc_kn_types[1] = <char>NPY_DOUBLE
+ufunc_kn_types[2] = <char>NPY_DOUBLE
+ufunc_kn_types[3] = <char>NPY_FLOAT  # entries 3-5: signature of loop 1
+ufunc_kn_types[4] = <char>NPY_FLOAT
+ufunc_kn_types[5] = <char>NPY_FLOAT
+ufunc_kn_types[6] = <char>NPY_DOUBLE  # entries 6-8: signature of loop 2
+ufunc_kn_types[7] = <char>NPY_DOUBLE
+ufunc_kn_types[8] = <char>NPY_DOUBLE
+ufunc_kn_ptr[2*0] = <void*>_func_special_cyl_bessel_k_int  # kernel for the NPY_INTP loop
+ufunc_kn_ptr[2*0+1] = <void*>(<char*>"kn")
+ufunc_kn_ptr[2*1] = <void*>_func_kn_unsafe  # loops 1 and 2 (floating-point first input) share this kernel
+ufunc_kn_ptr[2*1+1] = <void*>(<char*>"kn")
+ufunc_kn_ptr[2*2] = <void*>_func_kn_unsafe
+ufunc_kn_ptr[2*2+1] = <void*>(<char*>"kn")
+ufunc_kn_data[0] = &ufunc_kn_ptr[2*0]  # data[i] -> start of loop i's (kernel, name) pair
+ufunc_kn_data[1] = &ufunc_kn_ptr[2*1]
+ufunc_kn_data[2] = &ufunc_kn_ptr[2*2]
+kn = np.PyUFunc_FromFuncAndData(ufunc_kn_loops, ufunc_kn_data, ufunc_kn_types, 3, 2, 1, 0, 'kn', ufunc_kn_doc, 0)  # ntypes=3, nin=2, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc_kolmogi_loops[2]  # one inner loop per registered type signature
+cdef void *ufunc_kolmogi_ptr[4]  # (kernel pointer, name string) pair for each loop
+cdef void *ufunc_kolmogi_data[2]  # per-loop pointer into ufunc_kolmogi_ptr
+cdef char ufunc_kolmogi_types[4]  # type codes: 1 input + 1 output for each of the 2 loops
+cdef char *ufunc_kolmogi_doc = (  # docstring handed to PyUFunc_FromFuncAndData below
+    "kolmogi(p, out=None)\n"
+    "\n"
+    "Inverse Survival Function of Kolmogorov distribution\n"
+    "\n"
+    "It is the inverse function to `kolmogorov`.\n"
+    "Returns y such that ``kolmogorov(y) == p``.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "p : float array_like\n"
+    "    Probability\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    The value(s) of kolmogi(p)\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "kolmogorov : The Survival Function for the distribution\n"
+    "scipy.stats.kstwobign : Provides the functionality as a continuous distribution\n"
+    "smirnov, smirnovi : Functions for the one-sided distribution\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "`kolmogorov` is used by `stats.kstest` in the application of the\n"
+    "Kolmogorov-Smirnov Goodness of Fit test. For historical reasons this\n"
+    "function is exposed in `scipy.special`, but the recommended way to achieve\n"
+    "the most accurate CDF/SF/PDF/PPF/ISF computations is to use the\n"
+    "`stats.kstwobign` distribution.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> from scipy.special import kolmogi\n"
+    ">>> kolmogi([0, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0])\n"
+    "array([        inf,  1.22384787,  1.01918472,  0.82757356,  0.67644769,\n"
+    "        0.57117327,  0.        ])")
+ufunc_kolmogi_loops[0] = <np.PyUFuncGenericFunction>loop_d_d__As_f_f  # loop 0: registered with the NPY_FLOAT signature
+ufunc_kolmogi_loops[1] = <np.PyUFuncGenericFunction>loop_d_d__As_d_d  # loop 1: registered with the NPY_DOUBLE signature
+ufunc_kolmogi_types[0] = <char>NPY_FLOAT  # entries 0-1: signature of loop 0
+ufunc_kolmogi_types[1] = <char>NPY_FLOAT
+ufunc_kolmogi_types[2] = <char>NPY_DOUBLE  # entries 2-3: signature of loop 1
+ufunc_kolmogi_types[3] = <char>NPY_DOUBLE
+ufunc_kolmogi_ptr[2*0] = <void*>_func_xsf_kolmogi  # both loops share the same kernel
+ufunc_kolmogi_ptr[2*0+1] = <void*>(<char*>"kolmogi")
+ufunc_kolmogi_ptr[2*1] = <void*>_func_xsf_kolmogi
+ufunc_kolmogi_ptr[2*1+1] = <void*>(<char*>"kolmogi")
+ufunc_kolmogi_data[0] = &ufunc_kolmogi_ptr[2*0]  # data[i] -> start of loop i's (kernel, name) pair
+ufunc_kolmogi_data[1] = &ufunc_kolmogi_ptr[2*1]
+kolmogi = np.PyUFunc_FromFuncAndData(ufunc_kolmogi_loops, ufunc_kolmogi_data, ufunc_kolmogi_types, 2, 1, 1, 0, 'kolmogi', ufunc_kolmogi_doc, 0)  # ntypes=2, nin=1, nout=1
+
+cdef np.PyUFuncGenericFunction ufunc_kolmogorov_loops[2]  # tables for the `kolmogorov` ufunc: one inner-loop function per registered type signature
+cdef void *ufunc_kolmogorov_ptr[4]  # one (kernel function, name string) pair per loop
+cdef void *ufunc_kolmogorov_data[2]  # per-loop data pointers; each is set below to the address of that loop's pair in ufunc_kolmogorov_ptr
+cdef char ufunc_kolmogorov_types[4]  # NumPy type codes: (nin + nout) = 2 entries per loop
+cdef char *ufunc_kolmogorov_doc = (  # docstring text passed as the doc argument of PyUFunc_FromFuncAndData below
+    "kolmogorov(y, out=None)\n"
+    "\n"
+    "Complementary cumulative distribution (Survival Function) function of\n"
+    "Kolmogorov distribution.\n"
+    "\n"
+    "Returns the complementary cumulative distribution function of\n"
+    "Kolmogorov's limiting distribution (``D_n*\\sqrt(n)`` as n goes to infinity)\n"
+    "of a two-sided test for equality between an empirical and a theoretical\n"
+    "distribution. It is equal to the (limit as n->infinity of the)\n"
+    "probability that ``sqrt(n) * max absolute deviation > y``.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "y : float array_like\n"
+    "  Absolute deviation between the Empirical CDF (ECDF) and the target CDF,\n"
+    "  multiplied by sqrt(n).\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    The value(s) of kolmogorov(y)\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "kolmogi : The Inverse Survival Function for the distribution\n"
+    "scipy.stats.kstwobign : Provides the functionality as a continuous distribution\n"
+    "smirnov, smirnovi : Functions for the one-sided distribution\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "`kolmogorov` is used by `stats.kstest` in the application of the\n"
+    "Kolmogorov-Smirnov Goodness of Fit test. For historical reasons this\n"
+    "function is exposed in `scipy.special`, but the recommended way to achieve\n"
+    "the most accurate CDF/SF/PDF/PPF/ISF computations is to use the\n"
+    "`stats.kstwobign` distribution.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "Show the probability of a gap at least as big as 0, 0.5 and 1.0.\n"
+    "\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import kolmogorov\n"
+    ">>> from scipy.stats import kstwobign\n"
+    ">>> kolmogorov([0, 0.5, 1.0])\n"
+    "array([ 1.        ,  0.96394524,  0.26999967])\n"
+    "\n"
+    "Compare a sample of size 1000 drawn from a Laplace(0, 1) distribution against\n"
+    "the target distribution, a Normal(0, 1) distribution.\n"
+    "\n"
+    ">>> from scipy.stats import norm, laplace\n"
+    ">>> rng = np.random.default_rng()\n"
+    ">>> n = 1000\n"
+    ">>> lap01 = laplace(0, 1)\n"
+    ">>> x = np.sort(lap01.rvs(n, random_state=rng))\n"
+    ">>> np.mean(x), np.std(x)\n"
+    "(-0.05841730131499543, 1.3968109101997568)\n"
+    "\n"
+    "Construct the Empirical CDF and the K-S statistic Dn.\n"
+    "\n"
+    ">>> target = norm(0,1)  # Normal mean 0, stddev 1\n"
+    ">>> cdfs = target.cdf(x)\n"
+    ">>> ecdfs = np.arange(n+1, dtype=float)/n\n"
+    ">>> gaps = np.column_stack([cdfs - ecdfs[:n], ecdfs[1:] - cdfs])\n"
+    ">>> Dn = np.max(gaps)\n"
+    ">>> Kn = np.sqrt(n) * Dn\n"
+    ">>> print('Dn=%f, sqrt(n)*Dn=%f' % (Dn, Kn))\n"
+    "Dn=0.043363, sqrt(n)*Dn=1.371265\n"
+    ">>> print(chr(10).join(['For a sample of size n drawn from a N(0, 1) distribution:',\n"
+    "...   ' the approximate Kolmogorov probability that sqrt(n)*Dn>=%f is %f' %\n"
+    "...    (Kn, kolmogorov(Kn)),\n"
+    "...   ' the approximate Kolmogorov probability that sqrt(n)*Dn<=%f is %f' %\n"
+    "...    (Kn, kstwobign.cdf(Kn))]))\n"
+    "For a sample of size n drawn from a N(0, 1) distribution:\n"
+    " the approximate Kolmogorov probability that sqrt(n)*Dn>=1.371265 is 0.046533\n"
+    " the approximate Kolmogorov probability that sqrt(n)*Dn<=1.371265 is 0.953467\n"
+    "\n"
+    "Plot the Empirical CDF against the target N(0, 1) CDF.\n"
+    "\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> plt.step(np.concatenate([[-3], x]), ecdfs, where='post', label='Empirical CDF')\n"
+    ">>> x3 = np.linspace(-3, 3, 100)\n"
+    ">>> plt.plot(x3, target.cdf(x3), label='CDF for N(0, 1)')\n"
+    ">>> plt.ylim([0, 1]); plt.grid(True); plt.legend();\n"
+    ">>> # Add vertical lines marking Dn+ and Dn-\n"
+    ">>> iminus, iplus = np.argmax(gaps, axis=0)\n"
+    ">>> plt.vlines([x[iminus]], ecdfs[iminus], cdfs[iminus],\n"
+    "...            color='r', linestyle='dashed', lw=4)\n"
+    ">>> plt.vlines([x[iplus]], cdfs[iplus], ecdfs[iplus+1],\n"
+    "...            color='r', linestyle='dashed', lw=4)\n"
+    ">>> plt.show()")
+ufunc_kolmogorov_loops[0] = <np.PyUFuncGenericFunction>loop_d_d__As_f_f  # loop 0: registered below for float32 -> float32
+ufunc_kolmogorov_loops[1] = <np.PyUFuncGenericFunction>loop_d_d__As_d_d  # loop 1: registered below for float64 -> float64
+ufunc_kolmogorov_types[0] = <char>NPY_FLOAT  # loop 0 input
+ufunc_kolmogorov_types[1] = <char>NPY_FLOAT  # loop 0 output
+ufunc_kolmogorov_types[2] = <char>NPY_DOUBLE  # loop 1 input
+ufunc_kolmogorov_types[3] = <char>NPY_DOUBLE  # loop 1 output
+ufunc_kolmogorov_ptr[2*0] = <void*>_func_xsf_kolmogorov  # kernel for loop 0 (both loops share the same kernel)
+ufunc_kolmogorov_ptr[2*0+1] = <void*>(<char*>"kolmogorov")  # name string stored next to the kernel (presumably for error reporting -- confirm)
+ufunc_kolmogorov_ptr[2*1] = <void*>_func_xsf_kolmogorov  # kernel for loop 1
+ufunc_kolmogorov_ptr[2*1+1] = <void*>(<char*>"kolmogorov")
+ufunc_kolmogorov_data[0] = &ufunc_kolmogorov_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc_kolmogorov_data[1] = &ufunc_kolmogorov_ptr[2*1]  # loop 1 receives the address of its (kernel, name) pair
+kolmogorov = np.PyUFunc_FromFuncAndData(ufunc_kolmogorov_loops, ufunc_kolmogorov_data, ufunc_kolmogorov_types, 2, 1, 1, 0, 'kolmogorov', ufunc_kolmogorov_doc, 0)  # after the tables: ntypes=2, nin=1, nout=1, identity=0, name, doc, unused=0
+
+cdef np.PyUFuncGenericFunction ufunc_lpmv_loops[2]  # tables for the `lpmv` ufunc: one inner-loop function per registered type signature
+cdef void *ufunc_lpmv_ptr[4]  # one (kernel function, name string) pair per loop
+cdef void *ufunc_lpmv_data[2]  # per-loop data pointers; each is set below to the address of that loop's pair in ufunc_lpmv_ptr
+cdef char ufunc_lpmv_types[8]  # NumPy type codes: (nin + nout) = 4 entries per loop
+cdef char *ufunc_lpmv_doc = (  # docstring text passed as the doc argument of PyUFunc_FromFuncAndData below
+    "lpmv(m, v, x, out=None)\n"
+    "\n"
+    "Associated Legendre function of integer order and real degree.\n"
+    "\n"
+    "Defined as\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    P_v^m = (-1)^m (1 - x^2)^{m/2} \\frac{d^m}{dx^m} P_v(x)\n"
+    "\n"
+    "where\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    P_v = \\sum_{k = 0}^\\infty \\frac{(-v)_k (v + 1)_k}{(k!)^2}\n"
+    "            \\left(\\frac{1 - x}{2}\\right)^k\n"
+    "\n"
+    "is the Legendre function of the first kind. Here :math:`(\\cdot)_k`\n"
+    "is the Pochhammer symbol; see `poch`.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "m : array_like\n"
+    "    Order (int or float). If passed a float not equal to an\n"
+    "    integer the function returns NaN.\n"
+    "v : array_like\n"
+    "    Degree (float).\n"
+    "x : array_like\n"
+    "    Argument (float). Must have ``|x| <= 1``.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "pmv : scalar or ndarray\n"
+    "    Value of the associated Legendre function.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "Note that this implementation includes the Condon-Shortley phase.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Zhang, Jin, \"Computation of Special Functions\", John Wiley\n"
+    "       and Sons, Inc, 1996.")
+ufunc_lpmv_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f  # loop 0: registered below for three float32 inputs -> float32
+ufunc_lpmv_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop 1: registered below for three float64 inputs -> float64
+ufunc_lpmv_types[0] = <char>NPY_FLOAT  # loop 0 signature: entries 0-2 are the inputs, entry 3 the output
+ufunc_lpmv_types[1] = <char>NPY_FLOAT
+ufunc_lpmv_types[2] = <char>NPY_FLOAT
+ufunc_lpmv_types[3] = <char>NPY_FLOAT
+ufunc_lpmv_types[4] = <char>NPY_DOUBLE  # loop 1 signature: entries 4-6 are the inputs, entry 7 the output
+ufunc_lpmv_types[5] = <char>NPY_DOUBLE
+ufunc_lpmv_types[6] = <char>NPY_DOUBLE
+ufunc_lpmv_types[7] = <char>NPY_DOUBLE
+ufunc_lpmv_ptr[2*0] = <void*>_func_pmv_wrap  # kernel for loop 0 (both loops share the same kernel)
+ufunc_lpmv_ptr[2*0+1] = <void*>(<char*>"lpmv")  # name string stored next to the kernel (presumably for error reporting -- confirm)
+ufunc_lpmv_ptr[2*1] = <void*>_func_pmv_wrap  # kernel for loop 1
+ufunc_lpmv_ptr[2*1+1] = <void*>(<char*>"lpmv")
+ufunc_lpmv_data[0] = &ufunc_lpmv_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc_lpmv_data[1] = &ufunc_lpmv_ptr[2*1]  # loop 1 receives the address of its (kernel, name) pair
+lpmv = np.PyUFunc_FromFuncAndData(ufunc_lpmv_loops, ufunc_lpmv_data, ufunc_lpmv_types, 2, 3, 1, 0, 'lpmv', ufunc_lpmv_doc, 0)  # after the tables: ntypes=2, nin=3, nout=1, identity=0, name, doc, unused=0
+
+cdef np.PyUFuncGenericFunction ufunc_nbdtr_loops[3]  # tables for the `nbdtr` ufunc: one inner-loop function per registered type signature
+cdef void *ufunc_nbdtr_ptr[6]  # one (kernel function, name string) pair per loop
+cdef void *ufunc_nbdtr_data[3]  # per-loop data pointers; each is set below to the address of that loop's pair in ufunc_nbdtr_ptr
+cdef char ufunc_nbdtr_types[12]  # NumPy type codes: (nin + nout) = 4 entries per loop
+cdef char *ufunc_nbdtr_doc = (  # docstring text passed as the doc argument of PyUFunc_FromFuncAndData below
+    "nbdtr(k, n, p, out=None)\n"
+    "\n"
+    "Negative binomial cumulative distribution function.\n"
+    "\n"
+    "Returns the sum of the terms 0 through `k` of the negative binomial\n"
+    "distribution probability mass function,\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    F = \\sum_{j=0}^k {{n + j - 1}\\choose{j}} p^n (1 - p)^j.\n"
+    "\n"
+    "In a sequence of Bernoulli trials with individual success probabilities\n"
+    "`p`, this is the probability that `k` or fewer failures precede the nth\n"
+    "success.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "k : array_like\n"
+    "    The maximum number of allowed failures (nonnegative int).\n"
+    "n : array_like\n"
+    "    The target number of successes (positive int).\n"
+    "p : array_like\n"
+    "    Probability of success in a single event (float).\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "F : scalar or ndarray\n"
+    "    The probability of `k` or fewer failures before `n` successes in a\n"
+    "    sequence of events with individual success probability `p`.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "nbdtrc : Negative binomial survival function\n"
+    "nbdtrik : Negative binomial quantile function\n"
+    "scipy.stats.nbinom : Negative binomial distribution\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "If floating point values are passed for `k` or `n`, they will be truncated\n"
+    "to integers.\n"
+    "\n"
+    "The terms are not summed directly; instead the regularized incomplete beta\n"
+    "function is employed, according to the formula,\n"
+    "\n"
+    ".. math::\n"
+    "    \\mathrm{nbdtr}(k, n, p) = I_{p}(n, k + 1).\n"
+    "\n"
+    "Wrapper for the Cephes [1]_ routine `nbdtr`.\n"
+    "\n"
+    "The negative binomial distribution is also available as\n"
+    "`scipy.stats.nbinom`. Using `nbdtr` directly can improve performance\n"
+    "compared to the ``cdf`` method of `scipy.stats.nbinom` (see last example).\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Cephes Mathematical Functions Library,\n"
+    "       http://www.netlib.org/cephes/\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "Compute the function for ``k=10`` and ``n=5`` at ``p=0.5``.\n"
+    "\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import nbdtr\n"
+    ">>> nbdtr(10, 5, 0.5)\n"
+    "0.940765380859375\n"
+    "\n"
+    "Compute the function for ``n=10`` and ``p=0.5`` at several points by\n"
+    "providing a NumPy array or list for `k`.\n"
+    "\n"
+    ">>> nbdtr([5, 10, 15], 10, 0.5)\n"
+    "array([0.15087891, 0.58809853, 0.88523853])\n"
+    "\n"
+    "Plot the function for four different parameter sets.\n"
+    "\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> k = np.arange(130)\n"
+    ">>> n_parameters = [20, 20, 20, 80]\n"
+    ">>> p_parameters = [0.2, 0.5, 0.8, 0.5]\n"
+    ">>> linestyles = ['solid', 'dashed', 'dotted', 'dashdot']\n"
+    ">>> parameters_list = list(zip(p_parameters, n_parameters,\n"
+    "...                            linestyles))\n"
+    ">>> fig, ax = plt.subplots(figsize=(8, 8))\n"
+    ">>> for parameter_set in parameters_list:\n"
+    "...     p, n, style = parameter_set\n"
+    "...     nbdtr_vals = nbdtr(k, n, p)\n"
+    "...     ax.plot(k, nbdtr_vals, label=rf\"$n={n},\\, p={p}$\",\n"
+    "...             ls=style)\n"
+    ">>> ax.legend()\n"
+    ">>> ax.set_xlabel(\"$k$\")\n"
+    ">>> ax.set_title(\"Negative binomial cumulative distribution function\")\n"
+    ">>> plt.show()\n"
+    "\n"
+    "The negative binomial distribution is also available as\n"
+    "`scipy.stats.nbinom`. Using `nbdtr` directly can be much faster than\n"
+    "calling the ``cdf`` method of `scipy.stats.nbinom`, especially for small\n"
+    "arrays or individual values. To get the same results one must use the\n"
+    "following parametrization: ``nbinom(n, p).cdf(k)=nbdtr(k, n, p)``.\n"
+    "\n"
+    ">>> from scipy.stats import nbinom\n"
+    ">>> k, n, p = 5, 3, 0.5\n"
+    ">>> nbdtr_res = nbdtr(k, n, p)  # this will often be faster than below\n"
+    ">>> stats_res = nbinom(n, p).cdf(k)\n"
+    ">>> stats_res, nbdtr_res  # test that results are equal\n"
+    "(0.85546875, 0.85546875)\n"
+    "\n"
+    "`nbdtr` can evaluate different parameter sets by providing arrays with\n"
+    "shapes compatible for broadcasting for `k`, `n` and `p`. Here we compute\n"
+    "the function for three different `k` at four locations `p`, resulting in\n"
+    "a 3x4 array.\n"
+    "\n"
+    ">>> k = np.array([[5], [10], [15]])\n"
+    ">>> p = np.array([0.3, 0.5, 0.7, 0.9])\n"
+    ">>> k.shape, p.shape\n"
+    "((3, 1), (4,))\n"
+    "\n"
+    ">>> nbdtr(k, 5, p)\n"
+    "array([[0.15026833, 0.62304687, 0.95265101, 0.9998531 ],\n"
+    "       [0.48450894, 0.94076538, 0.99932777, 0.99999999],\n"
+    "       [0.76249222, 0.99409103, 0.99999445, 1.        ]])")
+ufunc_nbdtr_loops[0] = <np.PyUFuncGenericFunction>loop_d_ppd__As_ppd_d  # loop 0: registered below for (intp, intp, float64) -> float64
+ufunc_nbdtr_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f  # loop 1: registered below for three float32 inputs -> float32
+ufunc_nbdtr_loops[2] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop 2: registered below for three float64 inputs -> float64
+ufunc_nbdtr_types[0] = <char>NPY_INTP  # loop 0 signature: entries 0-2 are the inputs, entry 3 the output
+ufunc_nbdtr_types[1] = <char>NPY_INTP
+ufunc_nbdtr_types[2] = <char>NPY_DOUBLE
+ufunc_nbdtr_types[3] = <char>NPY_DOUBLE
+ufunc_nbdtr_types[4] = <char>NPY_FLOAT  # loop 1 signature: entries 4-6 are the inputs, entry 7 the output
+ufunc_nbdtr_types[5] = <char>NPY_FLOAT
+ufunc_nbdtr_types[6] = <char>NPY_FLOAT
+ufunc_nbdtr_types[7] = <char>NPY_FLOAT
+ufunc_nbdtr_types[8] = <char>NPY_DOUBLE  # loop 2 signature: entries 8-10 are the inputs, entry 11 the output
+ufunc_nbdtr_types[9] = <char>NPY_DOUBLE
+ufunc_nbdtr_types[10] = <char>NPY_DOUBLE
+ufunc_nbdtr_types[11] = <char>NPY_DOUBLE
+ufunc_nbdtr_ptr[2*0] = <void*>_func_cephes_nbdtr_wrap  # kernel for the integer-typed loop 0
+ufunc_nbdtr_ptr[2*0+1] = <void*>(<char*>"nbdtr")  # name string stored next to the kernel (presumably for error reporting -- confirm)
+ufunc_nbdtr_ptr[2*1] = <void*>_func_nbdtr_unsafe  # kernel for the all-float loops 1 and 2; "unsafe" presumably refers to the float -> int truncation the docstring mentions -- confirm
+ufunc_nbdtr_ptr[2*1+1] = <void*>(<char*>"nbdtr")
+ufunc_nbdtr_ptr[2*2] = <void*>_func_nbdtr_unsafe
+ufunc_nbdtr_ptr[2*2+1] = <void*>(<char*>"nbdtr")
+ufunc_nbdtr_data[0] = &ufunc_nbdtr_ptr[2*0]  # each loop receives the address of its own (kernel, name) pair
+ufunc_nbdtr_data[1] = &ufunc_nbdtr_ptr[2*1]
+ufunc_nbdtr_data[2] = &ufunc_nbdtr_ptr[2*2]
+nbdtr = np.PyUFunc_FromFuncAndData(ufunc_nbdtr_loops, ufunc_nbdtr_data, ufunc_nbdtr_types, 3, 3, 1, 0, 'nbdtr', ufunc_nbdtr_doc, 0)  # after the tables: ntypes=3, nin=3, nout=1, identity=0, name, doc, unused=0
+
+cdef np.PyUFuncGenericFunction ufunc_nbdtrc_loops[3]  # tables for the `nbdtrc` ufunc: one inner-loop function per registered type signature
+cdef void *ufunc_nbdtrc_ptr[6]  # one (kernel function, name string) pair per loop
+cdef void *ufunc_nbdtrc_data[3]  # per-loop data pointers; each is set below to the address of that loop's pair in ufunc_nbdtrc_ptr
+cdef char ufunc_nbdtrc_types[12]  # NumPy type codes: (nin + nout) = 4 entries per loop
+cdef char *ufunc_nbdtrc_doc = (  # docstring text passed as the doc argument of PyUFunc_FromFuncAndData below
+    "nbdtrc(k, n, p, out=None)\n"
+    "\n"
+    "Negative binomial survival function.\n"
+    "\n"
+    "Returns the sum of the terms `k + 1` to infinity of the negative binomial\n"
+    "distribution probability mass function,\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    F = \\sum_{j=k + 1}^\\infty {{n + j - 1}\\choose{j}} p^n (1 - p)^j.\n"
+    "\n"
+    "In a sequence of Bernoulli trials with individual success probabilities\n"
+    "`p`, this is the probability that more than `k` failures precede the nth\n"
+    "success.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "k : array_like\n"
+    "    The maximum number of allowed failures (nonnegative int).\n"
+    "n : array_like\n"
+    "    The target number of successes (positive int).\n"
+    "p : array_like\n"
+    "    Probability of success in a single event (float).\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "F : scalar or ndarray\n"
+    "    The probability of `k + 1` or more failures before `n` successes in a\n"
+    "    sequence of events with individual success probability `p`.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "nbdtr : Negative binomial cumulative distribution function\n"
+    "nbdtrik : Negative binomial percentile function\n"
+    "scipy.stats.nbinom : Negative binomial distribution\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "If floating point values are passed for `k` or `n`, they will be truncated\n"
+    "to integers.\n"
+    "\n"
+    "The terms are not summed directly; instead the regularized incomplete beta\n"
+    "function is employed, according to the formula,\n"
+    "\n"
+    ".. math::\n"
+    "    \\mathrm{nbdtrc}(k, n, p) = I_{1 - p}(k + 1, n).\n"
+    "\n"
+    "Wrapper for the Cephes [1]_ routine `nbdtrc`.\n"
+    "\n"
+    "The negative binomial distribution is also available as\n"
+    "`scipy.stats.nbinom`. Using `nbdtrc` directly can improve performance\n"
+    "compared to the ``sf`` method of `scipy.stats.nbinom` (see last example).\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Cephes Mathematical Functions Library,\n"
+    "       http://www.netlib.org/cephes/\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "Compute the function for ``k=10`` and ``n=5`` at ``p=0.5``.\n"
+    "\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import nbdtrc\n"
+    ">>> nbdtrc(10, 5, 0.5)\n"
+    "0.059234619140624986\n"
+    "\n"
+    "Compute the function for ``n=10`` and ``p=0.5`` at several points by\n"
+    "providing a NumPy array or list for `k`.\n"
+    "\n"
+    ">>> nbdtrc([5, 10, 15], 10, 0.5)\n"
+    "array([0.84912109, 0.41190147, 0.11476147])\n"
+    "\n"
+    "Plot the function for four different parameter sets.\n"
+    "\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> k = np.arange(130)\n"
+    ">>> n_parameters = [20, 20, 20, 80]\n"
+    ">>> p_parameters = [0.2, 0.5, 0.8, 0.5]\n"
+    ">>> linestyles = ['solid', 'dashed', 'dotted', 'dashdot']\n"
+    ">>> parameters_list = list(zip(p_parameters, n_parameters,\n"
+    "...                            linestyles))\n"
+    ">>> fig, ax = plt.subplots(figsize=(8, 8))\n"
+    ">>> for parameter_set in parameters_list:\n"
+    "...     p, n, style = parameter_set\n"
+    "...     nbdtrc_vals = nbdtrc(k, n, p)\n"
+    "...     ax.plot(k, nbdtrc_vals, label=rf\"$n={n},\\, p={p}$\",\n"
+    "...             ls=style)\n"
+    ">>> ax.legend()\n"
+    ">>> ax.set_xlabel(\"$k$\")\n"
+    ">>> ax.set_title(\"Negative binomial distribution survival function\")\n"
+    ">>> plt.show()\n"
+    "\n"
+    "The negative binomial distribution is also available as\n"
+    "`scipy.stats.nbinom`. Using `nbdtrc` directly can be much faster than\n"
+    "calling the ``sf`` method of `scipy.stats.nbinom`, especially for small\n"
+    "arrays or individual values. To get the same results one must use the\n"
+    "following parametrization: ``nbinom(n, p).sf(k)=nbdtrc(k, n, p)``.\n"
+    "\n"
+    ">>> from scipy.stats import nbinom\n"
+    ">>> k, n, p = 3, 5, 0.5\n"
+    ">>> nbdtr_res = nbdtrc(k, n, p)  # this will often be faster than below\n"
+    ">>> stats_res = nbinom(n, p).sf(k)\n"
+    ">>> stats_res, nbdtr_res  # test that results are equal\n"
+    "(0.6367187499999999, 0.6367187499999999)\n"
+    "\n"
+    "`nbdtrc` can evaluate different parameter sets by providing arrays with\n"
+    "shapes compatible for broadcasting for `k`, `n` and `p`. Here we compute\n"
+    "the function for three different `k` at four locations `p`, resulting in\n"
+    "a 3x4 array.\n"
+    "\n"
+    ">>> k = np.array([[5], [10], [15]])\n"
+    ">>> p = np.array([0.3, 0.5, 0.7, 0.9])\n"
+    ">>> k.shape, p.shape\n"
+    "((3, 1), (4,))\n"
+    "\n"
+    ">>> nbdtrc(k, 5, p)\n"
+    "array([[8.49731667e-01, 3.76953125e-01, 4.73489874e-02, 1.46902600e-04],\n"
+    "       [5.15491059e-01, 5.92346191e-02, 6.72234070e-04, 9.29610100e-09],\n"
+    "       [2.37507779e-01, 5.90896606e-03, 5.55025308e-06, 3.26346760e-13]])")
+ufunc_nbdtrc_loops[0] = <np.PyUFuncGenericFunction>loop_d_ppd__As_ppd_d  # loop 0: registered below for (intp, intp, float64) -> float64
+ufunc_nbdtrc_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f  # loop 1: registered below for three float32 inputs -> float32
+ufunc_nbdtrc_loops[2] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop 2: registered below for three float64 inputs -> float64
+ufunc_nbdtrc_types[0] = <char>NPY_INTP  # loop 0 signature: entries 0-2 are the inputs, entry 3 the output
+ufunc_nbdtrc_types[1] = <char>NPY_INTP
+ufunc_nbdtrc_types[2] = <char>NPY_DOUBLE
+ufunc_nbdtrc_types[3] = <char>NPY_DOUBLE
+ufunc_nbdtrc_types[4] = <char>NPY_FLOAT  # loop 1 signature: entries 4-6 are the inputs, entry 7 the output
+ufunc_nbdtrc_types[5] = <char>NPY_FLOAT
+ufunc_nbdtrc_types[6] = <char>NPY_FLOAT
+ufunc_nbdtrc_types[7] = <char>NPY_FLOAT
+ufunc_nbdtrc_types[8] = <char>NPY_DOUBLE  # loop 2 signature: entries 8-10 are the inputs, entry 11 the output
+ufunc_nbdtrc_types[9] = <char>NPY_DOUBLE
+ufunc_nbdtrc_types[10] = <char>NPY_DOUBLE
+ufunc_nbdtrc_types[11] = <char>NPY_DOUBLE
+ufunc_nbdtrc_ptr[2*0] = <void*>_func_cephes_nbdtrc_wrap  # kernel for the integer-typed loop 0
+ufunc_nbdtrc_ptr[2*0+1] = <void*>(<char*>"nbdtrc")  # name string stored next to the kernel (presumably for error reporting -- confirm)
+ufunc_nbdtrc_ptr[2*1] = <void*>_func_nbdtrc_unsafe  # kernel for the all-float loops 1 and 2; "unsafe" presumably refers to the float -> int truncation the docstring mentions -- confirm
+ufunc_nbdtrc_ptr[2*1+1] = <void*>(<char*>"nbdtrc")
+ufunc_nbdtrc_ptr[2*2] = <void*>_func_nbdtrc_unsafe
+ufunc_nbdtrc_ptr[2*2+1] = <void*>(<char*>"nbdtrc")
+ufunc_nbdtrc_data[0] = &ufunc_nbdtrc_ptr[2*0]  # each loop receives the address of its own (kernel, name) pair
+ufunc_nbdtrc_data[1] = &ufunc_nbdtrc_ptr[2*1]
+ufunc_nbdtrc_data[2] = &ufunc_nbdtrc_ptr[2*2]
+nbdtrc = np.PyUFunc_FromFuncAndData(ufunc_nbdtrc_loops, ufunc_nbdtrc_data, ufunc_nbdtrc_types, 3, 3, 1, 0, 'nbdtrc', ufunc_nbdtrc_doc, 0)  # after the tables: ntypes=3, nin=3, nout=1, identity=0, name, doc, unused=0
+
+cdef np.PyUFuncGenericFunction ufunc_nbdtri_loops[3]  # tables for the `nbdtri` ufunc: one inner-loop function per registered type signature
+cdef void *ufunc_nbdtri_ptr[6]  # one (kernel function, name string) pair per loop
+cdef void *ufunc_nbdtri_data[3]  # per-loop data pointers; each is set below to the address of that loop's pair in ufunc_nbdtri_ptr
+cdef char ufunc_nbdtri_types[12]  # NumPy type codes: (nin + nout) = 4 entries per loop
+cdef char *ufunc_nbdtri_doc = (  # docstring text passed as the doc argument of PyUFunc_FromFuncAndData below
+    "nbdtri(k, n, y, out=None)\n"
+    "\n"
+    "Returns the inverse with respect to the parameter `p` of\n"
+    "``y = nbdtr(k, n, p)``, the negative binomial cumulative distribution\n"
+    "function.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "k : array_like\n"
+    "    The maximum number of allowed failures (nonnegative int).\n"
+    "n : array_like\n"
+    "    The target number of successes (positive int).\n"
+    "y : array_like\n"
+    "    The probability of `k` or fewer failures before `n` successes (float).\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "p : scalar or ndarray\n"
+    "    Probability of success in a single event (float) such that\n"
+    "    `nbdtr(k, n, p) = y`.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "nbdtr : Cumulative distribution function of the negative binomial.\n"
+    "nbdtrc : Negative binomial survival function.\n"
+    "scipy.stats.nbinom : negative binomial distribution.\n"
+    "nbdtrik : Inverse with respect to `k` of `nbdtr(k, n, p)`.\n"
+    "nbdtrin : Inverse with respect to `n` of `nbdtr(k, n, p)`.\n"
+    "scipy.stats.nbinom : Negative binomial distribution\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "Wrapper for the Cephes [1]_ routine `nbdtri`.\n"
+    "\n"
+    "The negative binomial distribution is also available as\n"
+    "`scipy.stats.nbinom`. Using `nbdtri` directly can improve performance\n"
+    "compared to the ``ppf`` method of `scipy.stats.nbinom`.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Cephes Mathematical Functions Library,\n"
+    "       http://www.netlib.org/cephes/\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "`nbdtri` is the inverse of `nbdtr` with respect to `p`.\n"
+    "Up to floating point errors the following holds:\n"
+    "``nbdtri(k, n, nbdtr(k, n, p))=p``.\n"
+    "\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import nbdtri, nbdtr\n"
+    ">>> k, n, p = 5, 10, 0.2\n"
+    ">>> cdf_val = nbdtr(k, n, p)\n"
+    ">>> nbdtri(k, n, cdf_val)\n"
+    "0.20000000000000004\n"
+    "\n"
+    "Compute the function for ``k=3`` and ``n=5`` at several points by\n"
+    "providing a NumPy array or list for `y`.\n"
+    "\n"
+    ">>> y = np.array([0.1, 0.4, 0.8])\n"
+    ">>> nbdtri(3, 5, y)\n"
+    "array([0.34462319, 0.51653095, 0.69677416])\n"
+    "\n"
+    "Plot the function for four different parameter sets.\n"
+    "\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> n_parameters = [5, 20, 30, 30]\n"
+    ">>> k_parameters = [20, 20, 60, 80]\n"
+    ">>> linestyles = ['solid', 'dashed', 'dotted', 'dashdot']\n"
+    ">>> parameters_list = list(zip(n_parameters, k_parameters, linestyles))\n"
+    ">>> cdf_vals = np.linspace(0, 1, 1000)\n"
+    ">>> fig, ax = plt.subplots(figsize=(8, 8))\n"
+    ">>> for parameter_set in parameters_list:\n"
+    "...     n, k, style = parameter_set\n"
+    "...     nbdtri_vals = nbdtri(k, n, cdf_vals)\n"
+    "...     ax.plot(cdf_vals, nbdtri_vals, label=rf\"$k={k},\\ n={n}$\",\n"
+    "...             ls=style)\n"
+    ">>> ax.legend()\n"
+    ">>> ax.set_ylabel(\"$p$\")\n"
+    ">>> ax.set_xlabel(\"$CDF$\")\n"
+    ">>> title = \"nbdtri: inverse of negative binomial CDF with respect to $p$\"\n"
+    ">>> ax.set_title(title)\n"
+    ">>> plt.show()\n"
+    "\n"
+    "`nbdtri` can evaluate different parameter sets by providing arrays with\n"
+    "shapes compatible for broadcasting for `k`, `n` and `y`. Here we compute\n"
+    "the function for three different `k` at four probabilities `y`, resulting in\n"
+    "a 3x4 array.\n"
+    "\n"
+    ">>> k = np.array([[5], [10], [15]])\n"
+    ">>> y = np.array([0.3, 0.5, 0.7, 0.9])\n"
+    ">>> k.shape, y.shape\n"
+    "((3, 1), (4,))\n"
+    "\n"
+    ">>> nbdtri(k, 5, y)\n"
+    "array([[0.37258157, 0.45169416, 0.53249956, 0.64578407],\n"
+    "       [0.24588501, 0.30451981, 0.36778453, 0.46397088],\n"
+    "       [0.18362101, 0.22966758, 0.28054743, 0.36066188]])")
+ufunc_nbdtri_loops[0] = <np.PyUFuncGenericFunction>loop_d_ppd__As_ppd_d  # loop 0: registered below for (intp, intp, float64) -> float64
+ufunc_nbdtri_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f  # loop 1: registered below for three float32 inputs -> float32
+ufunc_nbdtri_loops[2] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop 2: registered below for three float64 inputs -> float64
+ufunc_nbdtri_types[0] = <char>NPY_INTP  # loop 0 signature: entries 0-2 are the inputs, entry 3 the output
+ufunc_nbdtri_types[1] = <char>NPY_INTP
+ufunc_nbdtri_types[2] = <char>NPY_DOUBLE
+ufunc_nbdtri_types[3] = <char>NPY_DOUBLE
+ufunc_nbdtri_types[4] = <char>NPY_FLOAT  # loop 1 signature: entries 4-6 are the inputs, entry 7 the output
+ufunc_nbdtri_types[5] = <char>NPY_FLOAT
+ufunc_nbdtri_types[6] = <char>NPY_FLOAT
+ufunc_nbdtri_types[7] = <char>NPY_FLOAT
+ufunc_nbdtri_types[8] = <char>NPY_DOUBLE  # loop 2 signature: entries 8-10 are the inputs, entry 11 the output
+ufunc_nbdtri_types[9] = <char>NPY_DOUBLE
+ufunc_nbdtri_types[10] = <char>NPY_DOUBLE
+ufunc_nbdtri_types[11] = <char>NPY_DOUBLE
+ufunc_nbdtri_ptr[2*0] = <void*>_func_cephes_nbdtri_wrap  # kernel for the integer-typed loop 0
+ufunc_nbdtri_ptr[2*0+1] = <void*>(<char*>"nbdtri")  # name string stored next to the kernel (presumably for error reporting -- confirm)
+ufunc_nbdtri_ptr[2*1] = <void*>_func_nbdtri_unsafe  # kernel for the all-float loops 1 and 2; "unsafe" presumably refers to float -> int truncation of k and n -- confirm
+ufunc_nbdtri_ptr[2*1+1] = <void*>(<char*>"nbdtri")
+ufunc_nbdtri_ptr[2*2] = <void*>_func_nbdtri_unsafe
+ufunc_nbdtri_ptr[2*2+1] = <void*>(<char*>"nbdtri")
+ufunc_nbdtri_data[0] = &ufunc_nbdtri_ptr[2*0]  # each loop receives the address of its own (kernel, name) pair
+ufunc_nbdtri_data[1] = &ufunc_nbdtri_ptr[2*1]
+ufunc_nbdtri_data[2] = &ufunc_nbdtri_ptr[2*2]
+nbdtri = np.PyUFunc_FromFuncAndData(ufunc_nbdtri_loops, ufunc_nbdtri_data, ufunc_nbdtri_types, 3, 3, 1, 0, 'nbdtri', ufunc_nbdtri_doc, 0)  # after the tables: ntypes=3, nin=3, nout=1, identity=0, name, doc, unused=0
+
+cdef np.PyUFuncGenericFunction ufunc_nbdtrik_loops[2]  # tables for the `nbdtrik` ufunc: one inner-loop function per registered type signature
+cdef void *ufunc_nbdtrik_ptr[4]  # one (kernel function, name string) pair per loop
+cdef void *ufunc_nbdtrik_data[2]  # per-loop data pointers; each is set below to the address of that loop's pair in ufunc_nbdtrik_ptr
+cdef char ufunc_nbdtrik_types[8]  # NumPy type codes: (nin + nout) = 4 entries per loop
+cdef char *ufunc_nbdtrik_doc = (  # docstring text passed as the doc argument of PyUFunc_FromFuncAndData below
+    "nbdtrik(y, n, p, out=None)\n"
+    "\n"
+    "Negative binomial percentile function.\n"
+    "\n"
+    "Returns the inverse with respect to the parameter `k` of\n"
+    "``y = nbdtr(k, n, p)``, the negative binomial cumulative distribution\n"
+    "function.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "y : array_like\n"
+    "    The probability of `k` or fewer failures before `n` successes (float).\n"
+    "n : array_like\n"
+    "    The target number of successes (positive int).\n"
+    "p : array_like\n"
+    "    Probability of success in a single event (float).\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "k : scalar or ndarray\n"
+    "    The maximum number of allowed failures such that `nbdtr(k, n, p) = y`.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "nbdtr : Cumulative distribution function of the negative binomial.\n"
+    "nbdtrc : Survival function of the negative binomial.\n"
+    "nbdtri : Inverse with respect to `p` of `nbdtr(k, n, p)`.\n"
+    "nbdtrin : Inverse with respect to `n` of `nbdtr(k, n, p)`.\n"
+    "scipy.stats.nbinom : Negative binomial distribution\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "Wrapper for the CDFLIB [1]_ Fortran routine `cdfnbn`.\n"
+    "\n"
+    "Formula 26.5.26 of [2]_ or [3]_,\n"
+    "\n"
+    ".. math::\n"
+    "    \\sum_{j=k + 1}^\\infty {{n + j - 1}\n"
+    "    \\choose{j}} p^n (1 - p)^j = I_{1 - p}(k + 1, n),\n"
+    "\n"
+    "is used to reduce calculation of the cumulative distribution function to\n"
+    "that of a regularized incomplete beta :math:`I`.\n"
+    "\n"
+    "Computation of `k` involves a search for a value that produces the desired\n"
+    "value of `y`.  The search relies on the monotonicity of `y` with `k`.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Barry Brown, James Lovato, and Kathy Russell,\n"
+    "       CDFLIB: Library of Fortran Routines for Cumulative Distribution\n"
+    "       Functions, Inverses, and Other Parameters.\n"
+    ".. [2] Milton Abramowitz and Irene A. Stegun, eds.\n"
+    "       Handbook of Mathematical Functions with Formulas,\n"
+    "       Graphs, and Mathematical Tables. New York: Dover, 1972.\n"
+    ".. [3] NIST Digital Library of Mathematical Functions\n"
+    "       https://dlmf.nist.gov/8.17.E24\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "Compute the negative binomial cumulative distribution function for an\n"
+    "exemplary parameter set.\n"
+    "\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import nbdtr, nbdtrik\n"
+    ">>> k, n, p = 5, 2, 0.5\n"
+    ">>> cdf_value = nbdtr(k, n, p)\n"
+    ">>> cdf_value\n"
+    "0.9375\n"
+    "\n"
+    "Verify that `nbdtrik` recovers the original value for `k`.\n"
+    "\n"
+    ">>> nbdtrik(cdf_value, n, p)\n"
+    "5.0\n"
+    "\n"
+    "Plot the function for different parameter sets.\n"
+    "\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> p_parameters = [0.2, 0.5, 0.7, 0.5]\n"
+    ">>> n_parameters = [30, 30, 30, 80]\n"
+    ">>> linestyles = ['solid', 'dashed', 'dotted', 'dashdot']\n"
+    ">>> parameters_list = list(zip(p_parameters, n_parameters, linestyles))\n"
+    ">>> cdf_vals = np.linspace(0, 1, 1000)\n"
+    ">>> fig, ax = plt.subplots(figsize=(8, 8))\n"
+    ">>> for parameter_set in parameters_list:\n"
+    "...     p, n, style = parameter_set\n"
+    "...     nbdtrik_vals = nbdtrik(cdf_vals, n, p)\n"
+    "...     ax.plot(cdf_vals, nbdtrik_vals, label=rf\"$n={n},\\ p={p}$\",\n"
+    "...             ls=style)\n"
+    ">>> ax.legend()\n"
+    ">>> ax.set_ylabel(\"$k$\")\n"
+    ">>> ax.set_xlabel(\"$CDF$\")\n"
+    ">>> ax.set_title(\"Negative binomial percentile function\")\n"
+    ">>> plt.show()\n"
+    "\n"
+    "The negative binomial distribution is also available as\n"
+    "`scipy.stats.nbinom`. The percentile function  method ``ppf``\n"
+    "returns the result of `nbdtrik` rounded up to integers:\n"
+    "\n"
+    ">>> from scipy.stats import nbinom\n"
+    ">>> q, n, p = 0.6, 5, 0.5\n"
+    ">>> nbinom.ppf(q, n, p), nbdtrik(q, n, p)\n"
+    "(5.0, 4.800428460273882)")
+ufunc_nbdtrik_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f  # loop 0: registered below for three float32 inputs -> float32
+ufunc_nbdtrik_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop 1: registered below for three float64 inputs -> float64
+ufunc_nbdtrik_types[0] = <char>NPY_FLOAT  # loop 0 signature: entries 0-2 are the inputs, entry 3 the output
+ufunc_nbdtrik_types[1] = <char>NPY_FLOAT
+ufunc_nbdtrik_types[2] = <char>NPY_FLOAT
+ufunc_nbdtrik_types[3] = <char>NPY_FLOAT
+ufunc_nbdtrik_types[4] = <char>NPY_DOUBLE  # loop 1 signature: entries 4-6 are the inputs, entry 7 the output
+ufunc_nbdtrik_types[5] = <char>NPY_DOUBLE
+ufunc_nbdtrik_types[6] = <char>NPY_DOUBLE
+ufunc_nbdtrik_types[7] = <char>NPY_DOUBLE
+ufunc_nbdtrik_ptr[2*0] = <void*>_func_nbdtrik  # kernel for loop 0 (both loops share the same kernel)
+ufunc_nbdtrik_ptr[2*0+1] = <void*>(<char*>"nbdtrik")  # name string stored next to the kernel (presumably for error reporting -- confirm)
+ufunc_nbdtrik_ptr[2*1] = <void*>_func_nbdtrik  # kernel for loop 1
+ufunc_nbdtrik_ptr[2*1+1] = <void*>(<char*>"nbdtrik")
+ufunc_nbdtrik_data[0] = &ufunc_nbdtrik_ptr[2*0]  # loop 0 receives the address of its (kernel, name) pair
+ufunc_nbdtrik_data[1] = &ufunc_nbdtrik_ptr[2*1]  # loop 1 receives the address of its (kernel, name) pair
+nbdtrik = np.PyUFunc_FromFuncAndData(ufunc_nbdtrik_loops, ufunc_nbdtrik_data, ufunc_nbdtrik_types, 2, 3, 1, 0, 'nbdtrik', ufunc_nbdtrik_doc, 0)  # after the tables: ntypes=2, nin=3, nout=1, identity=0, name, doc, unused=0
+
+cdef np.PyUFuncGenericFunction ufunc_nbdtrin_loops[2]  # nbdtrin ufunc tables: one inner-loop function per registered type signature
+cdef void *ufunc_nbdtrin_ptr[4]  # (kernel pointer, name string) pairs laid out flat: slots 2*i and 2*i+1 belong to loop i
+cdef void *ufunc_nbdtrin_data[2]  # one data pointer per loop, each set below to the address of that loop's pair
+cdef char ufunc_nbdtrin_types[8]  # NumPy type codes: 2 signatures x (3 inputs + 1 output)
+cdef char *ufunc_nbdtrin_doc = (  # text passed as the doc argument when the ufunc object is created below
+    "nbdtrin(k, y, p, out=None)\n"
+    "\n"
+    "Inverse of `nbdtr` vs `n`.\n"
+    "\n"
+    "Returns the inverse with respect to the parameter `n` of\n"
+    "``y = nbdtr(k, n, p)``, the negative binomial cumulative distribution\n"
+    "function.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "k : array_like\n"
+    "    The maximum number of allowed failures (nonnegative int).\n"
+    "y : array_like\n"
+    "    The probability of `k` or fewer failures before `n` successes (float).\n"
+    "p : array_like\n"
+    "    Probability of success in a single event (float).\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "n : scalar or ndarray\n"
+    "    The number of successes `n` such that `nbdtr(k, n, p) = y`.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "nbdtr : Cumulative distribution function of the negative binomial.\n"
+    "nbdtri : Inverse with respect to `p` of `nbdtr(k, n, p)`.\n"
+    "nbdtrik : Inverse with respect to `k` of `nbdtr(k, n, p)`.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "Wrapper for the CDFLIB [1]_ Fortran routine `cdfnbn`.\n"
+    "\n"
+    "Formula 26.5.26 of [2]_ or [3]_,\n"
+    "\n"
+    ".. math::\n"
+    "    \\sum_{j=k + 1}^\\infty {{n + j - 1}\n"
+    "    \\choose{j}} p^n (1 - p)^j = I_{1 - p}(k + 1, n),\n"
+    "\n"
+    "is used to reduce calculation of the cumulative distribution function to\n"
+    "that of a regularized incomplete beta :math:`I`.\n"
+    "\n"
+    "Computation of `n` involves a search for a value that produces the desired\n"
+    "value of `y`.  The search relies on the monotonicity of `y` with `n`.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Barry Brown, James Lovato, and Kathy Russell,\n"
+    "       CDFLIB: Library of Fortran Routines for Cumulative Distribution\n"
+    "       Functions, Inverses, and Other Parameters.\n"
+    ".. [2] Milton Abramowitz and Irene A. Stegun, eds.\n"
+    "       Handbook of Mathematical Functions with Formulas,\n"
+    "       Graphs, and Mathematical Tables. New York: Dover, 1972.\n"
+    ".. [3] NIST Digital Library of Mathematical Functions\n"
+    "       https://dlmf.nist.gov/8.17.E24\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "Compute the negative binomial cumulative distribution function for an\n"
+    "exemplary parameter set.\n"
+    "\n"
+    ">>> from scipy.special import nbdtr, nbdtrin\n"
+    ">>> k, n, p = 5, 2, 0.5\n"
+    ">>> cdf_value = nbdtr(k, n, p)\n"
+    ">>> cdf_value\n"
+    "0.9375\n"
+    "\n"
+    "Verify that `nbdtrin` recovers the original value for `n` up to floating\n"
+    "point accuracy.\n"
+    "\n"
+    ">>> nbdtrin(k, cdf_value, p)\n"
+    "1.999999999998137")
+ufunc_nbdtrin_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f  # loop for signature 0 (the NPY_FLOAT row of the types table)
+ufunc_nbdtrin_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop for signature 1 (the NPY_DOUBLE row)
+ufunc_nbdtrin_types[0] = <char>NPY_FLOAT  # signature 0: types[0..3] are all NPY_FLOAT
+ufunc_nbdtrin_types[1] = <char>NPY_FLOAT
+ufunc_nbdtrin_types[2] = <char>NPY_FLOAT
+ufunc_nbdtrin_types[3] = <char>NPY_FLOAT
+ufunc_nbdtrin_types[4] = <char>NPY_DOUBLE  # signature 1: types[4..7] are all NPY_DOUBLE
+ufunc_nbdtrin_types[5] = <char>NPY_DOUBLE
+ufunc_nbdtrin_types[6] = <char>NPY_DOUBLE
+ufunc_nbdtrin_types[7] = <char>NPY_DOUBLE
+ufunc_nbdtrin_ptr[2*0] = <void*>_func_nbdtrin  # both loops are given the same kernel, _func_nbdtrin
+ufunc_nbdtrin_ptr[2*0+1] = <void*>(<char*>"nbdtrin")  # ufunc name stored beside the kernel; NOTE(review): presumably read by the loop for error reporting -- confirm
+ufunc_nbdtrin_ptr[2*1] = <void*>_func_nbdtrin
+ufunc_nbdtrin_ptr[2*1+1] = <void*>(<char*>"nbdtrin")
+ufunc_nbdtrin_data[0] = &ufunc_nbdtrin_ptr[2*0]  # loop i receives &ptr[2*i] as its data argument
+ufunc_nbdtrin_data[1] = &ufunc_nbdtrin_ptr[2*1]
+nbdtrin = np.PyUFunc_FromFuncAndData(ufunc_nbdtrin_loops, ufunc_nbdtrin_data, ufunc_nbdtrin_types, 2, 3, 1, 0, 'nbdtrin', ufunc_nbdtrin_doc, 0)  # positional args after the tables: ntypes=2, nin=3, nout=1, identity=0, name, doc, unused=0
+
+cdef np.PyUFuncGenericFunction ufunc_ncfdtr_loops[2]  # ncfdtr ufunc tables: one inner-loop function per registered type signature
+cdef void *ufunc_ncfdtr_ptr[4]  # (kernel pointer, name string) pairs laid out flat: slots 2*i and 2*i+1 belong to loop i
+cdef void *ufunc_ncfdtr_data[2]  # one data pointer per loop, each set below to the address of that loop's pair
+cdef char ufunc_ncfdtr_types[10]  # NumPy type codes: 2 signatures x (4 inputs + 1 output)
+cdef char *ufunc_ncfdtr_doc = (  # text passed as the doc argument when the ufunc object is created below
+    "ncfdtr(dfn, dfd, nc, f, out=None)\n"
+    "\n"
+    "Cumulative distribution function of the non-central F distribution.\n"
+    "\n"
+    "The non-central F describes the distribution of,\n"
+    "\n"
+    ".. math::\n"
+    "    Z = \\frac{X/d_n}{Y/d_d}\n"
+    "\n"
+    "where :math:`X` and :math:`Y` are independently distributed, with\n"
+    ":math:`X` distributed non-central :math:`\\chi^2` with noncentrality\n"
+    "parameter `nc` and :math:`d_n` degrees of freedom, and :math:`Y`\n"
+    "distributed :math:`\\chi^2` with :math:`d_d` degrees of freedom.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "dfn : array_like\n"
+    "    Degrees of freedom of the numerator sum of squares.  Range (0, inf).\n"
+    "dfd : array_like\n"
+    "    Degrees of freedom of the denominator sum of squares.  Range (0, inf).\n"
+    "nc : array_like\n"
+    "    Noncentrality parameter.  Range [0, inf).\n"
+    "f : array_like\n"
+    "    Quantiles, i.e. the upper limit of integration.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "cdf : scalar or ndarray\n"
+    "    The calculated CDF.  If all inputs are scalar, the return will be a\n"
+    "    float.  Otherwise it will be an array.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "ncfdtri : Quantile function; inverse of `ncfdtr` with respect to `f`.\n"
+    "ncfdtridfd : Inverse of `ncfdtr` with respect to `dfd`.\n"
+    "ncfdtridfn : Inverse of `ncfdtr` with respect to `dfn`.\n"
+    "ncfdtrinc : Inverse of `ncfdtr` with respect to `nc`.\n"
+    "scipy.stats.ncf : Non-central F distribution.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "This function calculates the CDF of the non-central f distribution using\n"
+    "the Boost Math C++ library [1]_.\n"
+    "\n"
+    "The cumulative distribution function is computed using Formula 26.6.20 of\n"
+    "[2]_:\n"
+    "\n"
+    ".. math::\n"
+    "    F(d_n, d_d, n_c, f) = \\sum_{j=0}^\\infty e^{-n_c/2}\n"
+    "    \\frac{(n_c/2)^j}{j!} I_{x}(\\frac{d_n}{2} + j, \\frac{d_d}{2}),\n"
+    "\n"
+    "where :math:`I` is the regularized incomplete beta function, and\n"
+    ":math:`x = f d_n/(f d_n + d_d)`.\n"
+    "\n"
+    "Note that argument order of `ncfdtr` is different from that of the\n"
+    "similar ``cdf`` method of `scipy.stats.ncf`: `f` is the last\n"
+    "parameter of `ncfdtr` but the first parameter of ``scipy.stats.ncf.cdf``.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] The Boost Developers. \"Boost C++ Libraries\". https://www.boost.org/.\n"
+    ".. [2] Milton Abramowitz and Irene A. Stegun, eds.\n"
+    "       Handbook of Mathematical Functions with Formulas,\n"
+    "       Graphs, and Mathematical Tables. New York: Dover, 1972.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy import special\n"
+    ">>> from scipy import stats\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    "\n"
+    "Plot the CDF of the non-central F distribution, for nc=0.  Compare with the\n"
+    "F-distribution from scipy.stats:\n"
+    "\n"
+    ">>> x = np.linspace(-1, 8, num=500)\n"
+    ">>> dfn = 3\n"
+    ">>> dfd = 2\n"
+    ">>> ncf_stats = stats.f.cdf(x, dfn, dfd)\n"
+    ">>> ncf_special = special.ncfdtr(dfn, dfd, 0, x)\n"
+    "\n"
+    ">>> fig = plt.figure()\n"
+    ">>> ax = fig.add_subplot(111)\n"
+    ">>> ax.plot(x, ncf_stats, 'b-', lw=3)\n"
+    ">>> ax.plot(x, ncf_special, 'r-')\n"
+    ">>> plt.show()")
+ufunc_ncfdtr_loops[0] = <np.PyUFuncGenericFunction>loop_f_ffff__As_ffff_f  # loop for signature 0 (the NPY_FLOAT row of the types table)
+ufunc_ncfdtr_loops[1] = <np.PyUFuncGenericFunction>loop_d_dddd__As_dddd_d  # loop for signature 1 (the NPY_DOUBLE row)
+ufunc_ncfdtr_types[0] = <char>NPY_FLOAT  # signature 0: types[0..4] are all NPY_FLOAT
+ufunc_ncfdtr_types[1] = <char>NPY_FLOAT
+ufunc_ncfdtr_types[2] = <char>NPY_FLOAT
+ufunc_ncfdtr_types[3] = <char>NPY_FLOAT
+ufunc_ncfdtr_types[4] = <char>NPY_FLOAT
+ufunc_ncfdtr_types[5] = <char>NPY_DOUBLE  # signature 1: types[5..9] are all NPY_DOUBLE
+ufunc_ncfdtr_types[6] = <char>NPY_DOUBLE
+ufunc_ncfdtr_types[7] = <char>NPY_DOUBLE
+ufunc_ncfdtr_types[8] = <char>NPY_DOUBLE
+ufunc_ncfdtr_types[9] = <char>NPY_DOUBLE
+ufunc_ncfdtr_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_ncf_cdf_float  # loop 0 gets the *_float kernel exported by scipy.special._ufuncs_cxx
+ufunc_ncfdtr_ptr[2*0+1] = <void*>(<char*>"ncfdtr")  # ufunc name stored beside the kernel; NOTE(review): presumably read by the loop for error reporting -- confirm
+ufunc_ncfdtr_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_ncf_cdf_double  # loop 1 gets the *_double kernel
+ufunc_ncfdtr_ptr[2*1+1] = <void*>(<char*>"ncfdtr")
+ufunc_ncfdtr_data[0] = &ufunc_ncfdtr_ptr[2*0]  # loop i receives &ptr[2*i] as its data argument
+ufunc_ncfdtr_data[1] = &ufunc_ncfdtr_ptr[2*1]
+ncfdtr = np.PyUFunc_FromFuncAndData(ufunc_ncfdtr_loops, ufunc_ncfdtr_data, ufunc_ncfdtr_types, 2, 4, 1, 0, 'ncfdtr', ufunc_ncfdtr_doc, 0)  # positional args after the tables: ntypes=2, nin=4, nout=1, identity=0, name, doc, unused=0
+
+cdef np.PyUFuncGenericFunction ufunc_ncfdtri_loops[2]  # ncfdtri ufunc tables: one inner-loop function per registered type signature
+cdef void *ufunc_ncfdtri_ptr[4]  # (kernel pointer, name string) pairs laid out flat: slots 2*i and 2*i+1 belong to loop i
+cdef void *ufunc_ncfdtri_data[2]  # one data pointer per loop, each set below to the address of that loop's pair
+cdef char ufunc_ncfdtri_types[10]  # NumPy type codes: 2 signatures x (4 inputs + 1 output)
+cdef char *ufunc_ncfdtri_doc = (  # text passed as the doc argument when the ufunc object is created below
+    "ncfdtri(dfn, dfd, nc, p, out=None)\n"
+    "\n"
+    "Inverse with respect to `f` of the CDF of the non-central F distribution.\n"
+    "\n"
+    "See `ncfdtr` for more details.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "dfn : array_like\n"
+    "    Degrees of freedom of the numerator sum of squares.  Range (0, inf).\n"
+    "dfd : array_like\n"
+    "    Degrees of freedom of the denominator sum of squares.  Range (0, inf).\n"
+    "nc : array_like\n"
+    "    Noncentrality parameter.  Range [0, inf).\n"
+    "p : array_like\n"
+    "    Value of the cumulative distribution function.  Must be in the\n"
+    "    range [0, 1].\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "f : scalar or ndarray\n"
+    "    Quantiles, i.e., the upper limit of integration.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "ncfdtr : CDF of the non-central F distribution.\n"
+    "ncfdtridfd : Inverse of `ncfdtr` with respect to `dfd`.\n"
+    "ncfdtridfn : Inverse of `ncfdtr` with respect to `dfn`.\n"
+    "ncfdtrinc : Inverse of `ncfdtr` with respect to `nc`.\n"
+    "scipy.stats.ncf : Non-central F distribution.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "This function calculates the Quantile of the non-central f distribution\n"
+    "using the Boost Math C++ library [1]_.\n"
+    "\n"
+    "Note that argument order of `ncfdtri` is different from that of the\n"
+    "similar ``ppf`` method of `scipy.stats.ncf`. `p` is the last parameter\n"
+    "of `ncfdtri` but the first parameter of ``scipy.stats.ncf.ppf``.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] The Boost Developers. \"Boost C++ Libraries\". https://www.boost.org/.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> from scipy.special import ncfdtr, ncfdtri\n"
+    "\n"
+    "Compute the CDF for several values of `f`:\n"
+    "\n"
+    ">>> f = [0.5, 1, 1.5]\n"
+    ">>> p = ncfdtr(2, 3, 1.5, f)\n"
+    ">>> p\n"
+    "array([ 0.20782291,  0.36107392,  0.47345752])\n"
+    "\n"
+    "Compute the inverse.  We recover the values of `f`, as expected:\n"
+    "\n"
+    ">>> ncfdtri(2, 3, 1.5, p)\n"
+    "array([ 0.5,  1. ,  1.5])")
+ufunc_ncfdtri_loops[0] = <np.PyUFuncGenericFunction>loop_f_ffff__As_ffff_f  # loop for signature 0 (the NPY_FLOAT row of the types table)
+ufunc_ncfdtri_loops[1] = <np.PyUFuncGenericFunction>loop_d_dddd__As_dddd_d  # loop for signature 1 (the NPY_DOUBLE row)
+ufunc_ncfdtri_types[0] = <char>NPY_FLOAT  # signature 0: types[0..4] are all NPY_FLOAT
+ufunc_ncfdtri_types[1] = <char>NPY_FLOAT
+ufunc_ncfdtri_types[2] = <char>NPY_FLOAT
+ufunc_ncfdtri_types[3] = <char>NPY_FLOAT
+ufunc_ncfdtri_types[4] = <char>NPY_FLOAT
+ufunc_ncfdtri_types[5] = <char>NPY_DOUBLE  # signature 1: types[5..9] are all NPY_DOUBLE
+ufunc_ncfdtri_types[6] = <char>NPY_DOUBLE
+ufunc_ncfdtri_types[7] = <char>NPY_DOUBLE
+ufunc_ncfdtri_types[8] = <char>NPY_DOUBLE
+ufunc_ncfdtri_types[9] = <char>NPY_DOUBLE
+ufunc_ncfdtri_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_ncf_ppf_float  # loop 0 gets the *_float kernel exported by scipy.special._ufuncs_cxx
+ufunc_ncfdtri_ptr[2*0+1] = <void*>(<char*>"ncfdtri")  # ufunc name stored beside the kernel; NOTE(review): presumably read by the loop for error reporting -- confirm
+ufunc_ncfdtri_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_ncf_ppf_double  # loop 1 gets the *_double kernel
+ufunc_ncfdtri_ptr[2*1+1] = <void*>(<char*>"ncfdtri")
+ufunc_ncfdtri_data[0] = &ufunc_ncfdtri_ptr[2*0]  # loop i receives &ptr[2*i] as its data argument
+ufunc_ncfdtri_data[1] = &ufunc_ncfdtri_ptr[2*1]
+ncfdtri = np.PyUFunc_FromFuncAndData(ufunc_ncfdtri_loops, ufunc_ncfdtri_data, ufunc_ncfdtri_types, 2, 4, 1, 0, 'ncfdtri', ufunc_ncfdtri_doc, 0)  # positional args after the tables: ntypes=2, nin=4, nout=1, identity=0, name, doc, unused=0
+
+cdef np.PyUFuncGenericFunction ufunc_ncfdtridfd_loops[2]  # ncfdtridfd ufunc tables: one inner-loop function per registered type signature
+cdef void *ufunc_ncfdtridfd_ptr[4]  # (kernel pointer, name string) pairs laid out flat: slots 2*i and 2*i+1 belong to loop i
+cdef void *ufunc_ncfdtridfd_data[2]  # one data pointer per loop, each set below to the address of that loop's pair
+cdef char ufunc_ncfdtridfd_types[10]  # NumPy type codes: 2 signatures x (4 inputs + 1 output)
+cdef char *ufunc_ncfdtridfd_doc = (  # text passed as the doc argument when the ufunc object is created below
+    "ncfdtridfd(dfn, p, nc, f, out=None)\n"
+    "\n"
+    "Calculate degrees of freedom (denominator) for the noncentral F-distribution.\n"
+    "\n"
+    "This is the inverse with respect to `dfd` of `ncfdtr`.\n"
+    "See `ncfdtr` for more details.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "dfn : array_like\n"
+    "    Degrees of freedom of the numerator sum of squares.  Range (0, inf).\n"
+    "p : array_like\n"
+    "    Value of the cumulative distribution function.  Must be in the\n"
+    "    range [0, 1].\n"
+    "nc : array_like\n"
+    "    Noncentrality parameter.  Should be in range (0, 1e4).\n"
+    "f : array_like\n"
+    "    Quantiles, i.e., the upper limit of integration.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "dfd : scalar or ndarray\n"
+    "    Degrees of freedom of the denominator sum of squares.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "ncfdtr : CDF of the non-central F distribution.\n"
+    "ncfdtri : Quantile function; inverse of `ncfdtr` with respect to `f`.\n"
+    "ncfdtridfn : Inverse of `ncfdtr` with respect to `dfn`.\n"
+    "ncfdtrinc : Inverse of `ncfdtr` with respect to `nc`.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "The value of the cumulative noncentral F distribution is not necessarily\n"
+    "monotone in either degrees of freedom. There thus may be two values that\n"
+    "provide a given CDF value. This routine assumes monotonicity and will\n"
+    "find an arbitrary one of the two values.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> from scipy.special import ncfdtr, ncfdtridfd\n"
+    "\n"
+    "Compute the CDF for several values of `dfd`:\n"
+    "\n"
+    ">>> dfd = [1, 2, 3]\n"
+    ">>> p = ncfdtr(2, dfd, 0.25, 15)\n"
+    ">>> p\n"
+    "array([ 0.8097138 ,  0.93020416,  0.96787852])\n"
+    "\n"
+    "Compute the inverse.  We recover the values of `dfd`, as expected:\n"
+    "\n"
+    ">>> ncfdtridfd(2, p, 0.25, 15)\n"
+    "array([ 1.,  2.,  3.])")
+ufunc_ncfdtridfd_loops[0] = <np.PyUFuncGenericFunction>loop_d_dddd__As_ffff_f  # loop for signature 0 (the NPY_FLOAT row of the types table)
+ufunc_ncfdtridfd_loops[1] = <np.PyUFuncGenericFunction>loop_d_dddd__As_dddd_d  # loop for signature 1 (the NPY_DOUBLE row)
+ufunc_ncfdtridfd_types[0] = <char>NPY_FLOAT  # signature 0: types[0..4] are all NPY_FLOAT
+ufunc_ncfdtridfd_types[1] = <char>NPY_FLOAT
+ufunc_ncfdtridfd_types[2] = <char>NPY_FLOAT
+ufunc_ncfdtridfd_types[3] = <char>NPY_FLOAT
+ufunc_ncfdtridfd_types[4] = <char>NPY_FLOAT
+ufunc_ncfdtridfd_types[5] = <char>NPY_DOUBLE  # signature 1: types[5..9] are all NPY_DOUBLE
+ufunc_ncfdtridfd_types[6] = <char>NPY_DOUBLE
+ufunc_ncfdtridfd_types[7] = <char>NPY_DOUBLE
+ufunc_ncfdtridfd_types[8] = <char>NPY_DOUBLE
+ufunc_ncfdtridfd_types[9] = <char>NPY_DOUBLE
+ufunc_ncfdtridfd_ptr[2*0] = <void*>_func_ncfdtridfd  # both loops are given the same kernel, _func_ncfdtridfd
+ufunc_ncfdtridfd_ptr[2*0+1] = <void*>(<char*>"ncfdtridfd")  # ufunc name stored beside the kernel; NOTE(review): presumably read by the loop for error reporting -- confirm
+ufunc_ncfdtridfd_ptr[2*1] = <void*>_func_ncfdtridfd
+ufunc_ncfdtridfd_ptr[2*1+1] = <void*>(<char*>"ncfdtridfd")
+ufunc_ncfdtridfd_data[0] = &ufunc_ncfdtridfd_ptr[2*0]  # loop i receives &ptr[2*i] as its data argument
+ufunc_ncfdtridfd_data[1] = &ufunc_ncfdtridfd_ptr[2*1]
+ncfdtridfd = np.PyUFunc_FromFuncAndData(ufunc_ncfdtridfd_loops, ufunc_ncfdtridfd_data, ufunc_ncfdtridfd_types, 2, 4, 1, 0, 'ncfdtridfd', ufunc_ncfdtridfd_doc, 0)  # positional args after the tables: ntypes=2, nin=4, nout=1, identity=0, name, doc, unused=0
+
+cdef np.PyUFuncGenericFunction ufunc_ncfdtridfn_loops[2]  # ncfdtridfn ufunc tables: one inner-loop function per registered type signature
+cdef void *ufunc_ncfdtridfn_ptr[4]  # (kernel pointer, name string) pairs laid out flat: slots 2*i and 2*i+1 belong to loop i
+cdef void *ufunc_ncfdtridfn_data[2]  # one data pointer per loop, each set below to the address of that loop's pair
+cdef char ufunc_ncfdtridfn_types[10]  # NumPy type codes: 2 signatures x (4 inputs + 1 output)
+cdef char *ufunc_ncfdtridfn_doc = (  # text passed as the doc argument when the ufunc object is created below
+    "ncfdtridfn(p, dfd, nc, f, out=None)\n"
+    "\n"
+    "Calculate degrees of freedom (numerator) for the noncentral F-distribution.\n"
+    "\n"
+    "This is the inverse with respect to `dfn` of `ncfdtr`.\n"
+    "See `ncfdtr` for more details.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "p : array_like\n"
+    "    Value of the cumulative distribution function. Must be in the\n"
+    "    range [0, 1].\n"
+    "dfd : array_like\n"
+    "    Degrees of freedom of the denominator sum of squares. Range (0, inf).\n"
+    "nc : array_like\n"
+    "    Noncentrality parameter.  Should be in range (0, 1e4).\n"
+    "f : array_like\n"
+    "    Quantiles, i.e., the upper limit of integration.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "dfn : scalar or ndarray\n"
+    "    Degrees of freedom of the numerator sum of squares.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "ncfdtr : CDF of the non-central F distribution.\n"
+    "ncfdtri : Quantile function; inverse of `ncfdtr` with respect to `f`.\n"
+    "ncfdtridfd : Inverse of `ncfdtr` with respect to `dfd`.\n"
+    "ncfdtrinc : Inverse of `ncfdtr` with respect to `nc`.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "The value of the cumulative noncentral F distribution is not necessarily\n"
+    "monotone in either degrees of freedom. There thus may be two values that\n"
+    "provide a given CDF value. This routine assumes monotonicity and will\n"
+    "find an arbitrary one of the two values.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> from scipy.special import ncfdtr, ncfdtridfn\n"
+    "\n"
+    "Compute the CDF for several values of `dfn`:\n"
+    "\n"
+    ">>> dfn = [1, 2, 3]\n"
+    ">>> p = ncfdtr(dfn, 2, 0.25, 15)\n"
+    ">>> p\n"
+    "array([ 0.92562363,  0.93020416,  0.93188394])\n"
+    "\n"
+    "Compute the inverse. We recover the values of `dfn`, as expected:\n"
+    "\n"
+    ">>> ncfdtridfn(p, 2, 0.25, 15)\n"
+    "array([ 1.,  2.,  3.])")
+ufunc_ncfdtridfn_loops[0] = <np.PyUFuncGenericFunction>loop_d_dddd__As_ffff_f  # loop for signature 0 (the NPY_FLOAT row of the types table)
+ufunc_ncfdtridfn_loops[1] = <np.PyUFuncGenericFunction>loop_d_dddd__As_dddd_d  # loop for signature 1 (the NPY_DOUBLE row)
+ufunc_ncfdtridfn_types[0] = <char>NPY_FLOAT  # signature 0: types[0..4] are all NPY_FLOAT
+ufunc_ncfdtridfn_types[1] = <char>NPY_FLOAT
+ufunc_ncfdtridfn_types[2] = <char>NPY_FLOAT
+ufunc_ncfdtridfn_types[3] = <char>NPY_FLOAT
+ufunc_ncfdtridfn_types[4] = <char>NPY_FLOAT
+ufunc_ncfdtridfn_types[5] = <char>NPY_DOUBLE  # signature 1: types[5..9] are all NPY_DOUBLE
+ufunc_ncfdtridfn_types[6] = <char>NPY_DOUBLE
+ufunc_ncfdtridfn_types[7] = <char>NPY_DOUBLE
+ufunc_ncfdtridfn_types[8] = <char>NPY_DOUBLE
+ufunc_ncfdtridfn_types[9] = <char>NPY_DOUBLE
+ufunc_ncfdtridfn_ptr[2*0] = <void*>_func_ncfdtridfn  # both loops are given the same kernel, _func_ncfdtridfn
+ufunc_ncfdtridfn_ptr[2*0+1] = <void*>(<char*>"ncfdtridfn")  # ufunc name stored beside the kernel; NOTE(review): presumably read by the loop for error reporting -- confirm
+ufunc_ncfdtridfn_ptr[2*1] = <void*>_func_ncfdtridfn
+ufunc_ncfdtridfn_ptr[2*1+1] = <void*>(<char*>"ncfdtridfn")
+ufunc_ncfdtridfn_data[0] = &ufunc_ncfdtridfn_ptr[2*0]  # loop i receives &ptr[2*i] as its data argument
+ufunc_ncfdtridfn_data[1] = &ufunc_ncfdtridfn_ptr[2*1]
+ncfdtridfn = np.PyUFunc_FromFuncAndData(ufunc_ncfdtridfn_loops, ufunc_ncfdtridfn_data, ufunc_ncfdtridfn_types, 2, 4, 1, 0, 'ncfdtridfn', ufunc_ncfdtridfn_doc, 0)  # positional args after the tables: ntypes=2, nin=4, nout=1, identity=0, name, doc, unused=0
+
+cdef np.PyUFuncGenericFunction ufunc_ncfdtrinc_loops[2]  # ncfdtrinc ufunc tables: one inner-loop function per registered type signature
+cdef void *ufunc_ncfdtrinc_ptr[4]  # (kernel pointer, name string) pairs laid out flat: slots 2*i and 2*i+1 belong to loop i
+cdef void *ufunc_ncfdtrinc_data[2]  # one data pointer per loop, each set below to the address of that loop's pair
+cdef char ufunc_ncfdtrinc_types[10]  # NumPy type codes: 2 signatures x (4 inputs + 1 output)
+cdef char *ufunc_ncfdtrinc_doc = (  # text passed as the doc argument when the ufunc object is created below
+    "ncfdtrinc(dfn, dfd, p, f, out=None)\n"
+    "\n"
+    "Calculate non-centrality parameter for non-central F distribution.\n"
+    "\n"
+    "This is the inverse with respect to `nc` of `ncfdtr`.\n"
+    "See `ncfdtr` for more details.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "dfn : array_like\n"
+    "    Degrees of freedom of the numerator sum of squares. Range (0, inf).\n"
+    "dfd : array_like\n"
+    "    Degrees of freedom of the denominator sum of squares. Range (0, inf).\n"
+    "p : array_like\n"
+    "    Value of the cumulative distribution function. Must be in the\n"
+    "    range [0, 1].\n"
+    "f : array_like\n"
+    "    Quantiles, i.e., the upper limit of integration.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "nc : scalar or ndarray\n"
+    "    Noncentrality parameter.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "ncfdtr : CDF of the non-central F distribution.\n"
+    "ncfdtri : Quantile function; inverse of `ncfdtr` with respect to `f`.\n"
+    "ncfdtridfd : Inverse of `ncfdtr` with respect to `dfd`.\n"
+    "ncfdtridfn : Inverse of `ncfdtr` with respect to `dfn`.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> from scipy.special import ncfdtr, ncfdtrinc\n"
+    "\n"
+    "Compute the CDF for several values of `nc`:\n"
+    "\n"
+    ">>> nc = [0.5, 1.5, 2.0]\n"
+    ">>> p = ncfdtr(2, 3, nc, 15)\n"
+    ">>> p\n"
+    "array([ 0.96309246,  0.94327955,  0.93304098])\n"
+    "\n"
+    "Compute the inverse. We recover the values of `nc`, as expected:\n"
+    "\n"
+    ">>> ncfdtrinc(2, 3, p, 15)\n"
+    "array([ 0.5,  1.5,  2. ])")
+ufunc_ncfdtrinc_loops[0] = <np.PyUFuncGenericFunction>loop_d_dddd__As_ffff_f  # loop for signature 0 (the NPY_FLOAT row of the types table)
+ufunc_ncfdtrinc_loops[1] = <np.PyUFuncGenericFunction>loop_d_dddd__As_dddd_d  # loop for signature 1 (the NPY_DOUBLE row)
+ufunc_ncfdtrinc_types[0] = <char>NPY_FLOAT  # signature 0: types[0..4] are all NPY_FLOAT
+ufunc_ncfdtrinc_types[1] = <char>NPY_FLOAT
+ufunc_ncfdtrinc_types[2] = <char>NPY_FLOAT
+ufunc_ncfdtrinc_types[3] = <char>NPY_FLOAT
+ufunc_ncfdtrinc_types[4] = <char>NPY_FLOAT
+ufunc_ncfdtrinc_types[5] = <char>NPY_DOUBLE  # signature 1: types[5..9] are all NPY_DOUBLE
+ufunc_ncfdtrinc_types[6] = <char>NPY_DOUBLE
+ufunc_ncfdtrinc_types[7] = <char>NPY_DOUBLE
+ufunc_ncfdtrinc_types[8] = <char>NPY_DOUBLE
+ufunc_ncfdtrinc_types[9] = <char>NPY_DOUBLE
+ufunc_ncfdtrinc_ptr[2*0] = <void*>_func_ncfdtrinc  # both loops are given the same kernel, _func_ncfdtrinc
+ufunc_ncfdtrinc_ptr[2*0+1] = <void*>(<char*>"ncfdtrinc")  # ufunc name stored beside the kernel; NOTE(review): presumably read by the loop for error reporting -- confirm
+ufunc_ncfdtrinc_ptr[2*1] = <void*>_func_ncfdtrinc
+ufunc_ncfdtrinc_ptr[2*1+1] = <void*>(<char*>"ncfdtrinc")
+ufunc_ncfdtrinc_data[0] = &ufunc_ncfdtrinc_ptr[2*0]  # loop i receives &ptr[2*i] as its data argument
+ufunc_ncfdtrinc_data[1] = &ufunc_ncfdtrinc_ptr[2*1]
+ncfdtrinc = np.PyUFunc_FromFuncAndData(ufunc_ncfdtrinc_loops, ufunc_ncfdtrinc_data, ufunc_ncfdtrinc_types, 2, 4, 1, 0, 'ncfdtrinc', ufunc_ncfdtrinc_doc, 0)  # positional args after the tables: ntypes=2, nin=4, nout=1, identity=0, name, doc, unused=0
+
+cdef np.PyUFuncGenericFunction ufunc_nctdtr_loops[2]  # nctdtr ufunc tables: one inner-loop function per registered type signature
+cdef void *ufunc_nctdtr_ptr[4]  # (kernel pointer, name string) pairs laid out flat: slots 2*i and 2*i+1 belong to loop i
+cdef void *ufunc_nctdtr_data[2]  # one data pointer per loop, each set below to the address of that loop's pair
+cdef char ufunc_nctdtr_types[8]  # NumPy type codes: 2 signatures x (3 inputs + 1 output)
+cdef char *ufunc_nctdtr_doc = (  # text passed as the doc argument when the ufunc object is created below
+    "nctdtr(df, nc, t, out=None)\n"
+    "\n"
+    "Cumulative distribution function of the non-central `t` distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "df : array_like\n"
+    "    Degrees of freedom of the distribution. Should be in range (0, inf).\n"
+    "nc : array_like\n"
+    "    Noncentrality parameter.\n"
+    "t : array_like\n"
+    "    Quantiles, i.e., the upper limit of integration.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "cdf : scalar or ndarray\n"
+    "    The calculated CDF. If all inputs are scalar, the return will be a\n"
+    "    float. Otherwise, it will be an array.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "nctdtrit : Inverse CDF (iCDF) of the non-central t distribution.\n"
+    "nctdtridf : Calculate degrees of freedom, given CDF and iCDF values.\n"
+    "nctdtrinc : Calculate non-centrality parameter, given CDF and iCDF values.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "This function calculates the CDF of the non-central t distribution using\n"
+    "the Boost Math C++ library [1]_.\n"
+    "\n"
+    "Note that the argument order of `nctdtr` is different from that of the\n"
+    "similar ``cdf`` method of `scipy.stats.nct`: `t` is the last\n"
+    "parameter of `nctdtr` but the first parameter of ``scipy.stats.nct.cdf``.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] The Boost Developers. \"Boost C++ Libraries\". https://www.boost.org/.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy import special\n"
+    ">>> from scipy import stats\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    "\n"
+    "Plot the CDF of the non-central t distribution, for nc=0. Compare with the\n"
+    "t-distribution from scipy.stats:\n"
+    "\n"
+    ">>> x = np.linspace(-5, 5, num=500)\n"
+    ">>> df = 3\n"
+    ">>> nct_stats = stats.t.cdf(x, df)\n"
+    ">>> nct_special = special.nctdtr(df, 0, x)\n"
+    "\n"
+    ">>> fig = plt.figure()\n"
+    ">>> ax = fig.add_subplot(111)\n"
+    ">>> ax.plot(x, nct_stats, 'b-', lw=3)\n"
+    ">>> ax.plot(x, nct_special, 'r-')\n"
+    ">>> plt.show()")
+ufunc_nctdtr_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # loop for signature 0 (the NPY_FLOAT row of the types table)
+ufunc_nctdtr_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop for signature 1 (the NPY_DOUBLE row)
+ufunc_nctdtr_types[0] = <char>NPY_FLOAT  # signature 0: types[0..3] are all NPY_FLOAT
+ufunc_nctdtr_types[1] = <char>NPY_FLOAT
+ufunc_nctdtr_types[2] = <char>NPY_FLOAT
+ufunc_nctdtr_types[3] = <char>NPY_FLOAT
+ufunc_nctdtr_types[4] = <char>NPY_DOUBLE  # signature 1: types[4..7] are all NPY_DOUBLE
+ufunc_nctdtr_types[5] = <char>NPY_DOUBLE
+ufunc_nctdtr_types[6] = <char>NPY_DOUBLE
+ufunc_nctdtr_types[7] = <char>NPY_DOUBLE
+ufunc_nctdtr_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_nct_cdf_float  # loop 0 gets the *_float kernel exported by scipy.special._ufuncs_cxx
+ufunc_nctdtr_ptr[2*0+1] = <void*>(<char*>"nctdtr")  # ufunc name stored beside the kernel; NOTE(review): presumably read by the loop for error reporting -- confirm
+ufunc_nctdtr_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_nct_cdf_double  # loop 1 gets the *_double kernel
+ufunc_nctdtr_ptr[2*1+1] = <void*>(<char*>"nctdtr")
+ufunc_nctdtr_data[0] = &ufunc_nctdtr_ptr[2*0]  # loop i receives &ptr[2*i] as its data argument
+ufunc_nctdtr_data[1] = &ufunc_nctdtr_ptr[2*1]
+nctdtr = np.PyUFunc_FromFuncAndData(ufunc_nctdtr_loops, ufunc_nctdtr_data, ufunc_nctdtr_types, 2, 3, 1, 0, 'nctdtr', ufunc_nctdtr_doc, 0)  # positional args after the tables: ntypes=2, nin=3, nout=1, identity=0, name, doc, unused=0
+
+cdef np.PyUFuncGenericFunction ufunc_nctdtridf_loops[2]  # nctdtridf ufunc tables: one inner-loop function per registered type signature
+cdef void *ufunc_nctdtridf_ptr[4]  # (kernel pointer, name string) pairs laid out flat: slots 2*i and 2*i+1 belong to loop i
+cdef void *ufunc_nctdtridf_data[2]  # one data pointer per loop, each set below to the address of that loop's pair
+cdef char ufunc_nctdtridf_types[8]  # NumPy type codes: 2 signatures x (3 inputs + 1 output)
+cdef char *ufunc_nctdtridf_doc = (  # text passed as the doc argument when the ufunc object is created below
+    "nctdtridf(p, nc, t, out=None)\n"
+    "\n"
+    "Calculate degrees of freedom for non-central t distribution.\n"
+    "\n"
+    "See `nctdtr` for more details.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "p : array_like\n"
+    "    CDF values, in range (0, 1].\n"
+    "nc : array_like\n"
+    "    Noncentrality parameter. Should be in range (-1e6, 1e6).\n"
+    "t : array_like\n"
+    "    Quantiles, i.e., the upper limit of integration.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "df : scalar or ndarray\n"
+    "    The degrees of freedom. If all inputs are scalar, the return will be a\n"
+    "    float. Otherwise, it will be an array.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "nctdtr :  CDF of the non-central `t` distribution.\n"
+    "nctdtrit : Inverse CDF (iCDF) of the non-central t distribution.\n"
+    "nctdtrinc : Calculate non-centrality parameter, given CDF and iCDF values.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> from scipy.special import nctdtr, nctdtridf\n"
+    "\n"
+    "Compute the CDF for several values of `df`:\n"
+    "\n"
+    ">>> df = [1, 2, 3]\n"
+    ">>> p = nctdtr(df, 0.25, 1)\n"
+    ">>> p\n"
+    "array([0.67491974, 0.716464  , 0.73349456])\n"
+    "\n"
+    "Compute the inverse. We recover the values of `df`, as expected:\n"
+    "\n"
+    ">>> nctdtridf(p, 0.25, 1)\n"
+    "array([1., 2., 3.])")
+ufunc_nctdtridf_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f  # loop for signature 0 (the NPY_FLOAT row of the types table)
+ufunc_nctdtridf_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop for signature 1 (the NPY_DOUBLE row)
+ufunc_nctdtridf_types[0] = <char>NPY_FLOAT  # signature 0: types[0..3] are all NPY_FLOAT
+ufunc_nctdtridf_types[1] = <char>NPY_FLOAT
+ufunc_nctdtridf_types[2] = <char>NPY_FLOAT
+ufunc_nctdtridf_types[3] = <char>NPY_FLOAT
+ufunc_nctdtridf_types[4] = <char>NPY_DOUBLE  # signature 1: types[4..7] are all NPY_DOUBLE
+ufunc_nctdtridf_types[5] = <char>NPY_DOUBLE
+ufunc_nctdtridf_types[6] = <char>NPY_DOUBLE
+ufunc_nctdtridf_types[7] = <char>NPY_DOUBLE
+ufunc_nctdtridf_ptr[2*0] = <void*>_func_nctdtridf  # both loops are given the same kernel, _func_nctdtridf
+ufunc_nctdtridf_ptr[2*0+1] = <void*>(<char*>"nctdtridf")  # ufunc name stored beside the kernel; NOTE(review): presumably read by the loop for error reporting -- confirm
+ufunc_nctdtridf_ptr[2*1] = <void*>_func_nctdtridf
+ufunc_nctdtridf_ptr[2*1+1] = <void*>(<char*>"nctdtridf")
+ufunc_nctdtridf_data[0] = &ufunc_nctdtridf_ptr[2*0]  # loop i receives &ptr[2*i] as its data argument
+ufunc_nctdtridf_data[1] = &ufunc_nctdtridf_ptr[2*1]
+nctdtridf = np.PyUFunc_FromFuncAndData(ufunc_nctdtridf_loops, ufunc_nctdtridf_data, ufunc_nctdtridf_types, 2, 3, 1, 0, 'nctdtridf', ufunc_nctdtridf_doc, 0)  # positional args after the tables: ntypes=2, nin=3, nout=1, identity=0, name, doc, unused=0
+
+cdef np.PyUFuncGenericFunction ufunc_nctdtrinc_loops[2]  # nctdtrinc ufunc tables: one inner-loop function per registered type signature
+cdef void *ufunc_nctdtrinc_ptr[4]  # (kernel pointer, name string) pairs laid out flat: slots 2*i and 2*i+1 belong to loop i
+cdef void *ufunc_nctdtrinc_data[2]  # one data pointer per loop, each set below to the address of that loop's pair
+cdef char ufunc_nctdtrinc_types[8]  # NumPy type codes: 2 signatures x (3 inputs + 1 output)
+cdef char *ufunc_nctdtrinc_doc = (  # text passed as the doc argument when the ufunc object is created below
+    "nctdtrinc(df, p, t, out=None)\n"
+    "\n"
+    "Calculate non-centrality parameter for non-central t distribution.\n"
+    "\n"
+    "See `nctdtr` for more details.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "df : array_like\n"
+    "    Degrees of freedom of the distribution. Should be in range (0, inf).\n"
+    "p : array_like\n"
+    "    CDF values, in range (0, 1].\n"
+    "t : array_like\n"
+    "    Quantiles, i.e., the upper limit of integration.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "nc : scalar or ndarray\n"
+    "    Noncentrality parameter\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "nctdtr :  CDF of the non-central `t` distribution.\n"
+    "nctdtrit : Inverse CDF (iCDF) of the non-central t distribution.\n"
+    "nctdtridf : Calculate degrees of freedom, given CDF and iCDF values.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> from scipy.special import nctdtr, nctdtrinc\n"
+    "\n"
+    "Compute the CDF for several values of `nc`:\n"
+    "\n"
+    ">>> nc = [0.5, 1.5, 2.5]\n"
+    ">>> p = nctdtr(3, nc, 1.5)\n"
+    ">>> p\n"
+    "array([0.77569497, 0.45524533, 0.1668691 ])\n"
+    "\n"
+    "Compute the inverse. We recover the values of `nc`, as expected:\n"
+    "\n"
+    ">>> nctdtrinc(3, p, 1.5)\n"
+    "array([0.5, 1.5, 2.5])")
+ufunc_nctdtrinc_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f  # loop for signature 0 (the NPY_FLOAT row of the types table)
+ufunc_nctdtrinc_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop for signature 1 (the NPY_DOUBLE row)
+ufunc_nctdtrinc_types[0] = <char>NPY_FLOAT  # signature 0: types[0..3] are all NPY_FLOAT
+ufunc_nctdtrinc_types[1] = <char>NPY_FLOAT
+ufunc_nctdtrinc_types[2] = <char>NPY_FLOAT
+ufunc_nctdtrinc_types[3] = <char>NPY_FLOAT
+ufunc_nctdtrinc_types[4] = <char>NPY_DOUBLE  # signature 1: types[4..7] are all NPY_DOUBLE
+ufunc_nctdtrinc_types[5] = <char>NPY_DOUBLE
+ufunc_nctdtrinc_types[6] = <char>NPY_DOUBLE
+ufunc_nctdtrinc_types[7] = <char>NPY_DOUBLE
+ufunc_nctdtrinc_ptr[2*0] = <void*>_func_nctdtrinc  # both loops are given the same kernel, _func_nctdtrinc
+ufunc_nctdtrinc_ptr[2*0+1] = <void*>(<char*>"nctdtrinc")  # ufunc name stored beside the kernel; NOTE(review): presumably read by the loop for error reporting -- confirm
+ufunc_nctdtrinc_ptr[2*1] = <void*>_func_nctdtrinc
+ufunc_nctdtrinc_ptr[2*1+1] = <void*>(<char*>"nctdtrinc")
+ufunc_nctdtrinc_data[0] = &ufunc_nctdtrinc_ptr[2*0]  # loop i receives &ptr[2*i] as its data argument
+ufunc_nctdtrinc_data[1] = &ufunc_nctdtrinc_ptr[2*1]
+nctdtrinc = np.PyUFunc_FromFuncAndData(ufunc_nctdtrinc_loops, ufunc_nctdtrinc_data, ufunc_nctdtrinc_types, 2, 3, 1, 0, 'nctdtrinc', ufunc_nctdtrinc_doc, 0)  # positional args after the tables: ntypes=2, nin=3, nout=1, identity=0, name, doc, unused=0
+
+cdef np.PyUFuncGenericFunction ufunc_nctdtrit_loops[2]  # nctdtrit ufunc tables: one inner-loop function per registered type signature
+cdef void *ufunc_nctdtrit_ptr[4]  # (kernel pointer, name string) pairs laid out flat: slots 2*i and 2*i+1 belong to loop i
+cdef void *ufunc_nctdtrit_data[2]  # one data pointer per loop, each set below to the address of that loop's pair
+cdef char ufunc_nctdtrit_types[8]  # NumPy type codes: 2 signatures x (3 inputs + 1 output)
+cdef char *ufunc_nctdtrit_doc = (  # text passed as the doc argument when the ufunc object is created below
+    "nctdtrit(df, nc, p, out=None)\n"
+    "\n"
+    "Inverse cumulative distribution function of the non-central t distribution.\n"
+    "\n"
+    "See `nctdtr` for more details.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "df : array_like\n"
+    "    Degrees of freedom of the distribution. Should be in range (0, inf).\n"
+    "nc : array_like\n"
+    "    Noncentrality parameter.\n"
+    "p : array_like\n"
+    "    CDF values, in range (0, 1].\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "t : scalar or ndarray\n"
+    "    Quantiles\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "nctdtr :  CDF of the non-central `t` distribution.\n"
+    "nctdtridf : Calculate degrees of freedom, given CDF and iCDF values.\n"
+    "nctdtrinc : Calculate non-centrality parameter, given CDF and iCDF values.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "This function calculates the quantile of the non-central t distribution using\n"
+    "the Boost Math C++ library [1]_.\n"
+    "\n"
+    "Note that the argument order of `nctdtrit` is different from that of the\n"
+    "similar ``ppf`` method of `scipy.stats.nct`: `p` is the last\n"
+    "parameter of `nctdtrit` but the first parameter of ``scipy.stats.nct.ppf``.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] The Boost Developers. \"Boost C++ Libraries\". https://www.boost.org/.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> from scipy.special import nctdtr, nctdtrit\n"
+    "\n"
+    "Compute the CDF for several values of `t`:\n"
+    "\n"
+    ">>> t = [0.5, 1, 1.5]\n"
+    ">>> p = nctdtr(3, 1, t)\n"
+    ">>> p\n"
+    "array([0.29811049, 0.46922687, 0.6257559 ])\n"
+    "\n"
+    "Compute the inverse. We recover the values of `t`, as expected:\n"
+    "\n"
+    ">>> nctdtrit(3, 1, p)\n"
+    "array([0.5, 1. , 1.5])")
+ufunc_nctdtrit_loops[0] = <np.PyUFuncGenericFunction>loop_f_fff__As_fff_f  # loop for signature 0 (the NPY_FLOAT row of the types table)
+ufunc_nctdtrit_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d  # loop for signature 1 (the NPY_DOUBLE row)
+ufunc_nctdtrit_types[0] = <char>NPY_FLOAT  # signature 0: types[0..3] are all NPY_FLOAT
+ufunc_nctdtrit_types[1] = <char>NPY_FLOAT
+ufunc_nctdtrit_types[2] = <char>NPY_FLOAT
+ufunc_nctdtrit_types[3] = <char>NPY_FLOAT
+ufunc_nctdtrit_types[4] = <char>NPY_DOUBLE  # signature 1: types[4..7] are all NPY_DOUBLE
+ufunc_nctdtrit_types[5] = <char>NPY_DOUBLE
+ufunc_nctdtrit_types[6] = <char>NPY_DOUBLE
+ufunc_nctdtrit_types[7] = <char>NPY_DOUBLE
+ufunc_nctdtrit_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_nct_ppf_float  # loop 0 gets the *_float kernel exported by scipy.special._ufuncs_cxx
+ufunc_nctdtrit_ptr[2*0+1] = <void*>(<char*>"nctdtrit")  # ufunc name stored beside the kernel; NOTE(review): presumably read by the loop for error reporting -- confirm
+ufunc_nctdtrit_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_nct_ppf_double  # loop 1 gets the *_double kernel
+ufunc_nctdtrit_ptr[2*1+1] = <void*>(<char*>"nctdtrit")
+ufunc_nctdtrit_data[0] = &ufunc_nctdtrit_ptr[2*0]  # loop i receives &ptr[2*i] as its data argument
+ufunc_nctdtrit_data[1] = &ufunc_nctdtrit_ptr[2*1]
+nctdtrit = np.PyUFunc_FromFuncAndData(ufunc_nctdtrit_loops, ufunc_nctdtrit_data, ufunc_nctdtrit_types, 2, 3, 1, 0, 'nctdtrit', ufunc_nctdtrit_doc, 0)  # positional args after the tables: ntypes=2, nin=3, nout=1, identity=0, name, doc, unused=0
+
+# ndtri ufunc: static loop/data/type tables plus the docstring handed to
+# PyUFunc_FromFuncAndData. Two loops are registered (float32 and float64
+# signatures); both dispatch to the same kernel, _func_xsf_ndtri.
+cdef np.PyUFuncGenericFunction ufunc_ndtri_loops[2]
+cdef void *ufunc_ndtri_ptr[4]
+cdef void *ufunc_ndtri_data[2]
+cdef char ufunc_ndtri_types[4]
+cdef char *ufunc_ndtri_doc = (
+    "ndtri(y, out=None)\n"
+    "\n"
+    "Inverse of `ndtr` vs x\n"
+    "\n"
+    "Returns the argument x for which the area under the standard normal\n"
+    "probability density function (integrated from minus infinity to `x`)\n"
+    "is equal to y.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "y : array_like\n"
+    "    Probability\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "x : scalar or ndarray\n"
+    "    Value of x such that ``ndtr(x) == y``.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "ndtr : Standard normal cumulative probability distribution\n"
+    "ndtri_exp : Inverse of log_ndtr\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "`ndtri` is the percentile function of the standard normal distribution.\n"
+    "This means it returns the inverse of the cumulative density `ndtr`. First,\n"
+    "let us compute a cumulative density value.\n"
+    "\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import ndtri, ndtr\n"
+    ">>> cdf_val = ndtr(2)\n"
+    ">>> cdf_val\n"
+    "0.9772498680518208\n"
+    "\n"
+    "Verify that `ndtri` yields the original value for `x` up to floating point\n"
+    "errors.\n"
+    "\n"
+    ">>> ndtri(cdf_val)\n"
+    "2.0000000000000004\n"
+    "\n"
+    "Plot the function. For that purpose, we provide a NumPy array as argument.\n"
+    "\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> x = np.linspace(0.01, 1, 200)\n"
+    ">>> fig, ax = plt.subplots()\n"
+    ">>> ax.plot(x, ndtri(x))\n"
+    ">>> ax.set_title(\"Standard normal percentile function\")\n"
+    ">>> plt.show()")
+# Inner loops, one per registered signature.
+ufunc_ndtri_loops[0] = <np.PyUFuncGenericFunction>loop_d_d__As_f_f
+ufunc_ndtri_loops[1] = <np.PyUFuncGenericFunction>loop_d_d__As_d_d
+# Type codes, (input, output) per loop: float32 first, then float64.
+ufunc_ndtri_types[0] = <char>NPY_FLOAT
+ufunc_ndtri_types[1] = <char>NPY_FLOAT
+ufunc_ndtri_types[2] = <char>NPY_DOUBLE
+ufunc_ndtri_types[3] = <char>NPY_DOUBLE
+# Per-loop data slot: {kernel pointer, ufunc name}.
+ufunc_ndtri_ptr[2*0] = <void*>_func_xsf_ndtri
+ufunc_ndtri_ptr[2*0+1] = <void*>(<char*>"ndtri")
+ufunc_ndtri_ptr[2*1] = <void*>_func_xsf_ndtri
+ufunc_ndtri_ptr[2*1+1] = <void*>(<char*>"ndtri")
+ufunc_ndtri_data[0] = &ufunc_ndtri_ptr[2*0]
+ufunc_ndtri_data[1] = &ufunc_ndtri_ptr[2*1]
+# 2 loops, 1 input, 1 output.
+ndtri = np.PyUFunc_FromFuncAndData(ufunc_ndtri_loops, ufunc_ndtri_data, ufunc_ndtri_types, 2, 1, 1, 0, 'ndtri', ufunc_ndtri_doc, 0)
+
+# ndtri_exp ufunc. The four tables below are filled loop by loop and then
+# passed to PyUFunc_FromFuncAndData; both loops use _func_ndtri_exp as kernel.
+cdef np.PyUFuncGenericFunction ufunc_ndtri_exp_loops[2]
+cdef void *ufunc_ndtri_exp_ptr[4]
+cdef void *ufunc_ndtri_exp_data[2]
+cdef char ufunc_ndtri_exp_types[4]
+cdef char *ufunc_ndtri_exp_doc = (
+    "ndtri_exp(y, out=None)\n"
+    "\n"
+    "Inverse of `log_ndtr` vs x. Allows for greater precision than\n"
+    "`ndtri` composed with `numpy.exp` for very small values of y and for\n"
+    "y close to 0.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "y : array_like of float\n"
+    "    Function argument\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    Inverse of the log CDF of the standard normal distribution, evaluated\n"
+    "    at y.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "log_ndtr : log of the standard normal cumulative distribution function\n"
+    "ndtr : standard normal cumulative distribution function\n"
+    "ndtri : standard normal percentile function\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> import scipy.special as sc\n"
+    "\n"
+    "`ndtri_exp` agrees with the naive implementation when the latter does\n"
+    "not suffer from underflow.\n"
+    "\n"
+    ">>> sc.ndtri_exp(-1)\n"
+    "-0.33747496376420244\n"
+    ">>> sc.ndtri(np.exp(-1))\n"
+    "-0.33747496376420244\n"
+    "\n"
+    "For extreme values of y, the naive approach fails\n"
+    "\n"
+    ">>> sc.ndtri(np.exp(-800))\n"
+    "-inf\n"
+    ">>> sc.ndtri(np.exp(-1e-20))\n"
+    "inf\n"
+    "\n"
+    "whereas `ndtri_exp` is still able to compute the result to high precision.\n"
+    "\n"
+    ">>> sc.ndtri_exp(-800)\n"
+    "-39.88469483825668\n"
+    ">>> sc.ndtri_exp(-1e-20)\n"
+    "9.262340089798409")
+# Loop 0: float32 input -> float32 output.
+ufunc_ndtri_exp_types[0] = <char>NPY_FLOAT
+ufunc_ndtri_exp_types[1] = <char>NPY_FLOAT
+ufunc_ndtri_exp_ptr[0] = <void*>_func_ndtri_exp
+ufunc_ndtri_exp_ptr[1] = <void*>(<char*>"ndtri_exp")
+ufunc_ndtri_exp_data[0] = &ufunc_ndtri_exp_ptr[0]
+ufunc_ndtri_exp_loops[0] = <np.PyUFuncGenericFunction>loop_d_d__As_f_f
+# Loop 1: float64 input -> float64 output.
+ufunc_ndtri_exp_types[2] = <char>NPY_DOUBLE
+ufunc_ndtri_exp_types[3] = <char>NPY_DOUBLE
+ufunc_ndtri_exp_ptr[2] = <void*>_func_ndtri_exp
+ufunc_ndtri_exp_ptr[3] = <void*>(<char*>"ndtri_exp")
+ufunc_ndtri_exp_data[1] = &ufunc_ndtri_exp_ptr[2]
+ufunc_ndtri_exp_loops[1] = <np.PyUFuncGenericFunction>loop_d_d__As_d_d
+# Register: 2 loops, 1 input, 1 output.
+ndtri_exp = np.PyUFunc_FromFuncAndData(
+    ufunc_ndtri_exp_loops, ufunc_ndtri_exp_data, ufunc_ndtri_exp_types,
+    2, 1, 1, 0, 'ndtri_exp', ufunc_ndtri_exp_doc, 0)
+
+# nrdtrimn ufunc. Three inputs and one output per loop, so each loop owns
+# four consecutive entries of the type table; both loops call _func_nrdtrimn.
+cdef np.PyUFuncGenericFunction ufunc_nrdtrimn_loops[2]
+cdef void *ufunc_nrdtrimn_ptr[4]
+cdef void *ufunc_nrdtrimn_data[2]
+cdef char ufunc_nrdtrimn_types[8]
+cdef char *ufunc_nrdtrimn_doc = (
+    "nrdtrimn(p, std, x, out=None)\n"
+    "\n"
+    "Calculate mean of normal distribution given other params.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "p : array_like\n"
+    "    CDF values, in range (0, 1].\n"
+    "std : array_like\n"
+    "    Standard deviation.\n"
+    "x : array_like\n"
+    "    Quantiles, i.e. the upper limit of integration.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "mn : scalar or ndarray\n"
+    "    The mean of the normal distribution.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "scipy.stats.norm : Normal distribution\n"
+    "ndtr : Standard normal cumulative probability distribution\n"
+    "ndtri : Inverse of standard normal CDF with respect to quantile\n"
+    "nrdtrisd : Inverse of normal distribution CDF with respect to\n"
+    "           standard deviation\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "`nrdtrimn` can be used to recover the mean of a normal distribution\n"
+    "if we know the CDF value `p` for a given quantile `x` and the\n"
+    "standard deviation `std`. First, we calculate\n"
+    "the normal distribution CDF for an exemplary parameter set.\n"
+    "\n"
+    ">>> from scipy.stats import norm\n"
+    ">>> mean = 3.\n"
+    ">>> std = 2.\n"
+    ">>> x = 6.\n"
+    ">>> p = norm.cdf(x, loc=mean, scale=std)\n"
+    ">>> p\n"
+    "0.9331927987311419\n"
+    "\n"
+    "Verify that `nrdtrimn` returns the original value for `mean`.\n"
+    "\n"
+    ">>> from scipy.special import nrdtrimn\n"
+    ">>> nrdtrimn(p, std, x)\n"
+    "3.0000000000000004")
+# Loop 0: three float32 inputs -> float32 output.
+ufunc_nrdtrimn_types[0] = <char>NPY_FLOAT
+ufunc_nrdtrimn_types[1] = <char>NPY_FLOAT
+ufunc_nrdtrimn_types[2] = <char>NPY_FLOAT
+ufunc_nrdtrimn_types[3] = <char>NPY_FLOAT
+ufunc_nrdtrimn_ptr[0] = <void*>_func_nrdtrimn
+ufunc_nrdtrimn_ptr[1] = <void*>(<char*>"nrdtrimn")
+ufunc_nrdtrimn_data[0] = &ufunc_nrdtrimn_ptr[0]
+ufunc_nrdtrimn_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f
+# Loop 1: three float64 inputs -> float64 output.
+ufunc_nrdtrimn_types[4] = <char>NPY_DOUBLE
+ufunc_nrdtrimn_types[5] = <char>NPY_DOUBLE
+ufunc_nrdtrimn_types[6] = <char>NPY_DOUBLE
+ufunc_nrdtrimn_types[7] = <char>NPY_DOUBLE
+ufunc_nrdtrimn_ptr[2] = <void*>_func_nrdtrimn
+ufunc_nrdtrimn_ptr[3] = <void*>(<char*>"nrdtrimn")
+ufunc_nrdtrimn_data[1] = &ufunc_nrdtrimn_ptr[2]
+ufunc_nrdtrimn_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+# Register: 2 loops, 3 inputs, 1 output.
+nrdtrimn = np.PyUFunc_FromFuncAndData(
+    ufunc_nrdtrimn_loops, ufunc_nrdtrimn_data, ufunc_nrdtrimn_types,
+    2, 3, 1, 0, 'nrdtrimn', ufunc_nrdtrimn_doc, 0)
+
+# nrdtrisd ufunc: static loop/data/type tables plus the docstring handed to
+# PyUFunc_FromFuncAndData. Two loops (float32 and float64 signatures), both
+# dispatching to _func_nrdtrisd.
+cdef np.PyUFuncGenericFunction ufunc_nrdtrisd_loops[2]
+cdef void *ufunc_nrdtrisd_ptr[4]
+cdef void *ufunc_nrdtrisd_data[2]
+cdef char ufunc_nrdtrisd_types[8]
+cdef char *ufunc_nrdtrisd_doc = (
+    "nrdtrisd(mn, p, x, out=None)\n"
+    "\n"
+    "Calculate standard deviation of normal distribution given other params.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "mn : array_like\n"
+    "    The mean of the normal distribution.\n"
+    "p : array_like\n"
+    "    CDF values, in range (0, 1].\n"
+    "x : array_like\n"
+    "    Quantiles, i.e. the upper limit of integration.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "std : scalar or ndarray\n"
+    "    Standard deviation.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "scipy.stats.norm : Normal distribution\n"
+    "ndtr : Standard normal cumulative probability distribution\n"
+    "ndtri : Inverse of standard normal CDF with respect to quantile\n"
+    "nrdtrimn : Inverse of normal distribution CDF with respect to\n"
+    "           mean\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "`nrdtrisd` can be used to recover the standard deviation of a normal\n"
+    "distribution if we know the CDF value `p` for a given quantile `x` and\n"
+    "the mean `mn`. First, we calculate the normal distribution CDF for an\n"
+    "exemplary parameter set.\n"
+    "\n"
+    ">>> from scipy.stats import norm\n"
+    ">>> mean = 3.\n"
+    ">>> std = 2.\n"
+    ">>> x = 6.\n"
+    ">>> p = norm.cdf(x, loc=mean, scale=std)\n"
+    ">>> p\n"
+    "0.9331927987311419\n"
+    "\n"
+    "Verify that `nrdtrisd` returns the original value for `std`.\n"
+    "\n"
+    ">>> from scipy.special import nrdtrisd\n"
+    ">>> nrdtrisd(mean, p, x)\n"
+    "2.0000000000000004")
+# Inner loops, one per registered signature.
+ufunc_nrdtrisd_loops[0] = <np.PyUFuncGenericFunction>loop_d_ddd__As_fff_f
+ufunc_nrdtrisd_loops[1] = <np.PyUFuncGenericFunction>loop_d_ddd__As_ddd_d
+# Type codes, (in, in, in, out) per loop: float32 first, then float64.
+ufunc_nrdtrisd_types[0] = <char>NPY_FLOAT
+ufunc_nrdtrisd_types[1] = <char>NPY_FLOAT
+ufunc_nrdtrisd_types[2] = <char>NPY_FLOAT
+ufunc_nrdtrisd_types[3] = <char>NPY_FLOAT
+ufunc_nrdtrisd_types[4] = <char>NPY_DOUBLE
+ufunc_nrdtrisd_types[5] = <char>NPY_DOUBLE
+ufunc_nrdtrisd_types[6] = <char>NPY_DOUBLE
+ufunc_nrdtrisd_types[7] = <char>NPY_DOUBLE
+# Per-loop data slot: {kernel pointer, ufunc name}.
+ufunc_nrdtrisd_ptr[2*0] = <void*>_func_nrdtrisd
+ufunc_nrdtrisd_ptr[2*0+1] = <void*>(<char*>"nrdtrisd")
+ufunc_nrdtrisd_ptr[2*1] = <void*>_func_nrdtrisd
+ufunc_nrdtrisd_ptr[2*1+1] = <void*>(<char*>"nrdtrisd")
+ufunc_nrdtrisd_data[0] = &ufunc_nrdtrisd_ptr[2*0]
+ufunc_nrdtrisd_data[1] = &ufunc_nrdtrisd_ptr[2*1]
+# 2 loops, 3 inputs, 1 output.
+nrdtrisd = np.PyUFunc_FromFuncAndData(ufunc_nrdtrisd_loops, ufunc_nrdtrisd_data, ufunc_nrdtrisd_types, 2, 3, 1, 0, 'nrdtrisd', ufunc_nrdtrisd_doc, 0)
+
+# owens_t ufunc: static loop/data/type tables plus the docstring handed to
+# PyUFunc_FromFuncAndData. Two loops (float32 and float64 signatures), both
+# dispatching to _func_xsf_owens_t.
+cdef np.PyUFuncGenericFunction ufunc_owens_t_loops[2]
+cdef void *ufunc_owens_t_ptr[4]
+cdef void *ufunc_owens_t_data[2]
+cdef char ufunc_owens_t_types[6]
+cdef char *ufunc_owens_t_doc = (
+    "owens_t(h, a, out=None)\n"
+    "\n"
+    "Owen's T Function.\n"
+    "\n"
+    "The function T(h, a) gives the probability of the event\n"
+    "(X > h and 0 < Y < a * X) where X and Y are independent\n"
+    "standard normal random variables.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "h : array_like\n"
+    "    Input value.\n"
+    "a : array_like\n"
+    "    Input value.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "t : scalar or ndarray\n"
+    "    Probability of the event (X > h and 0 < Y < a * X),\n"
+    "    where X and Y are independent standard normal random variables.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] M. Patefield and D. Tandy, \"Fast and accurate calculation of\n"
+    "       Owen's T Function\", Journal of Statistical Software vol. 5,\n"
+    "       pp. 1-25, 2000.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> from scipy import special\n"
+    ">>> a = 3.5\n"
+    ">>> h = 0.78\n"
+    ">>> special.owens_t(h, a)\n"
+    "0.10877216734852274")
+# Inner loops, one per registered signature.
+ufunc_owens_t_loops[0] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+ufunc_owens_t_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+# Type codes, (in, in, out) per loop: float32 first, then float64.
+ufunc_owens_t_types[0] = <char>NPY_FLOAT
+ufunc_owens_t_types[1] = <char>NPY_FLOAT
+ufunc_owens_t_types[2] = <char>NPY_FLOAT
+ufunc_owens_t_types[3] = <char>NPY_DOUBLE
+ufunc_owens_t_types[4] = <char>NPY_DOUBLE
+ufunc_owens_t_types[5] = <char>NPY_DOUBLE
+# Per-loop data slot: {kernel pointer, ufunc name}.
+ufunc_owens_t_ptr[2*0] = <void*>_func_xsf_owens_t
+ufunc_owens_t_ptr[2*0+1] = <void*>(<char*>"owens_t")
+ufunc_owens_t_ptr[2*1] = <void*>_func_xsf_owens_t
+ufunc_owens_t_ptr[2*1+1] = <void*>(<char*>"owens_t")
+ufunc_owens_t_data[0] = &ufunc_owens_t_ptr[2*0]
+ufunc_owens_t_data[1] = &ufunc_owens_t_ptr[2*1]
+# 2 loops, 2 inputs, 1 output.
+owens_t = np.PyUFunc_FromFuncAndData(ufunc_owens_t_loops, ufunc_owens_t_data, ufunc_owens_t_types, 2, 2, 1, 0, 'owens_t', ufunc_owens_t_doc, 0)
+
+# pdtr ufunc. Two inputs and one output per loop, so each loop owns three
+# consecutive entries of the type table; both loops call _func_xsf_pdtr.
+cdef np.PyUFuncGenericFunction ufunc_pdtr_loops[2]
+cdef void *ufunc_pdtr_ptr[4]
+cdef void *ufunc_pdtr_data[2]
+cdef char ufunc_pdtr_types[6]
+cdef char *ufunc_pdtr_doc = (
+    "pdtr(k, m, out=None)\n"
+    "\n"
+    "Poisson cumulative distribution function.\n"
+    "\n"
+    "Defined as the probability that a Poisson-distributed random\n"
+    "variable with event rate :math:`m` is less than or equal to\n"
+    ":math:`k`. More concretely, this works out to be [1]_\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "   \\exp(-m) \\sum_{j = 0}^{\\lfloor{k}\\rfloor} \\frac{m^j}{j!}.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "k : array_like\n"
+    "    Number of occurrences (nonnegative, real)\n"
+    "m : array_like\n"
+    "    Shape parameter (nonnegative, real)\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    Values of the Poisson cumulative distribution function\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "pdtrc : Poisson survival function\n"
+    "pdtrik : inverse of `pdtr` with respect to `k`\n"
+    "pdtri : inverse of `pdtr` with respect to `m`\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] https://en.wikipedia.org/wiki/Poisson_distribution\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> import scipy.special as sc\n"
+    "\n"
+    "It is a cumulative distribution function, so it converges to 1\n"
+    "monotonically as `k` goes to infinity.\n"
+    "\n"
+    ">>> sc.pdtr([1, 10, 100, np.inf], 1)\n"
+    "array([0.73575888, 0.99999999, 1.        , 1.        ])\n"
+    "\n"
+    "It is discontinuous at integers and constant between integers.\n"
+    "\n"
+    ">>> sc.pdtr([1, 1.5, 1.9, 2], 1)\n"
+    "array([0.73575888, 0.73575888, 0.73575888, 0.9196986 ])")
+# Loop 0: two float32 inputs -> float32 output.
+ufunc_pdtr_types[0] = <char>NPY_FLOAT
+ufunc_pdtr_types[1] = <char>NPY_FLOAT
+ufunc_pdtr_types[2] = <char>NPY_FLOAT
+ufunc_pdtr_ptr[0] = <void*>_func_xsf_pdtr
+ufunc_pdtr_ptr[1] = <void*>(<char*>"pdtr")
+ufunc_pdtr_data[0] = &ufunc_pdtr_ptr[0]
+ufunc_pdtr_loops[0] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+# Loop 1: two float64 inputs -> float64 output.
+ufunc_pdtr_types[3] = <char>NPY_DOUBLE
+ufunc_pdtr_types[4] = <char>NPY_DOUBLE
+ufunc_pdtr_types[5] = <char>NPY_DOUBLE
+ufunc_pdtr_ptr[2] = <void*>_func_xsf_pdtr
+ufunc_pdtr_ptr[3] = <void*>(<char*>"pdtr")
+ufunc_pdtr_data[1] = &ufunc_pdtr_ptr[2]
+ufunc_pdtr_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+# Register: 2 loops, 2 inputs, 1 output.
+pdtr = np.PyUFunc_FromFuncAndData(
+    ufunc_pdtr_loops, ufunc_pdtr_data, ufunc_pdtr_types,
+    2, 2, 1, 0, 'pdtr', ufunc_pdtr_doc, 0)
+
+# pdtrc ufunc. Two inputs and one output per loop, so each loop owns three
+# consecutive entries of the type table; both loops call _func_xsf_pdtrc.
+cdef np.PyUFuncGenericFunction ufunc_pdtrc_loops[2]
+cdef void *ufunc_pdtrc_ptr[4]
+cdef void *ufunc_pdtrc_data[2]
+cdef char ufunc_pdtrc_types[6]
+cdef char *ufunc_pdtrc_doc = (
+    "pdtrc(k, m, out=None)\n"
+    "\n"
+    "Poisson survival function\n"
+    "\n"
+    "Returns the sum of the terms from k+1 to infinity of the Poisson\n"
+    "distribution: sum(exp(-m) * m**j / j!, j=k+1..inf) = gammainc(\n"
+    "k+1, m). Arguments must both be non-negative doubles.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "k : array_like\n"
+    "    Number of occurrences (nonnegative, real)\n"
+    "m : array_like\n"
+    "    Shape parameter (nonnegative, real)\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    Values of the Poisson survival function\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "pdtr : Poisson cumulative distribution function\n"
+    "pdtrik : inverse of `pdtr` with respect to `k`\n"
+    "pdtri : inverse of `pdtr` with respect to `m`\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> import scipy.special as sc\n"
+    "\n"
+    "It is a survival function, so it decreases to 0\n"
+    "monotonically as `k` goes to infinity.\n"
+    "\n"
+    ">>> k = np.array([1, 10, 100, np.inf])\n"
+    ">>> sc.pdtrc(k, 1)\n"
+    "array([2.64241118e-001, 1.00477664e-008, 3.94147589e-161, 0.00000000e+000])\n"
+    "\n"
+    "It can be expressed in terms of the lower incomplete gamma\n"
+    "function `gammainc`.\n"
+    "\n"
+    ">>> sc.gammainc(k + 1, 1)\n"
+    "array([2.64241118e-001, 1.00477664e-008, 3.94147589e-161, 0.00000000e+000])")
+# Loop 0: two float32 inputs -> float32 output.
+ufunc_pdtrc_types[0] = <char>NPY_FLOAT
+ufunc_pdtrc_types[1] = <char>NPY_FLOAT
+ufunc_pdtrc_types[2] = <char>NPY_FLOAT
+ufunc_pdtrc_ptr[0] = <void*>_func_xsf_pdtrc
+ufunc_pdtrc_ptr[1] = <void*>(<char*>"pdtrc")
+ufunc_pdtrc_data[0] = &ufunc_pdtrc_ptr[0]
+ufunc_pdtrc_loops[0] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+# Loop 1: two float64 inputs -> float64 output.
+ufunc_pdtrc_types[3] = <char>NPY_DOUBLE
+ufunc_pdtrc_types[4] = <char>NPY_DOUBLE
+ufunc_pdtrc_types[5] = <char>NPY_DOUBLE
+ufunc_pdtrc_ptr[2] = <void*>_func_xsf_pdtrc
+ufunc_pdtrc_ptr[3] = <void*>(<char*>"pdtrc")
+ufunc_pdtrc_data[1] = &ufunc_pdtrc_ptr[2]
+ufunc_pdtrc_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+# Register: 2 loops, 2 inputs, 1 output.
+pdtrc = np.PyUFunc_FromFuncAndData(
+    ufunc_pdtrc_loops, ufunc_pdtrc_data, ufunc_pdtrc_types,
+    2, 2, 1, 0, 'pdtrc', ufunc_pdtrc_doc, 0)
+
+# pdtri ufunc: static loop/data/type tables plus the docstring handed to
+# PyUFunc_FromFuncAndData. Three loops are registered: an (intp, float64)
+# signature bound to _func_cephes_pdtri_wrap, followed by all-float32 and
+# all-float64 signatures bound to _func_pdtri_unsafe.
+cdef np.PyUFuncGenericFunction ufunc_pdtri_loops[3]
+cdef void *ufunc_pdtri_ptr[6]
+cdef void *ufunc_pdtri_data[3]
+cdef char ufunc_pdtri_types[9]
+cdef char *ufunc_pdtri_doc = (
+    "pdtri(k, y, out=None)\n"
+    "\n"
+    "Inverse to `pdtr` vs m\n"
+    "\n"
+    "Returns the Poisson variable `m` such that the sum from 0 to `k` of\n"
+    "the Poisson density is equal to the given probability `y`:\n"
+    "calculated by ``gammainccinv(k + 1, y)``. `k` must be a nonnegative\n"
+    "integer and `y` between 0 and 1.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "k : array_like\n"
+    "    Number of occurrences (nonnegative, real)\n"
+    "y : array_like\n"
+    "    Probability\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    Values of the shape parameter `m` such that ``pdtr(k, m) = y``\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "pdtr : Poisson cumulative distribution function\n"
+    "pdtrc : Poisson survival function\n"
+    "pdtrik : inverse of `pdtr` with respect to `k`\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import scipy.special as sc\n"
+    "\n"
+    "Compute the CDF for several values of `m`:\n"
+    "\n"
+    ">>> m = [0.5, 1, 1.5]\n"
+    ">>> p = sc.pdtr(1, m)\n"
+    ">>> p\n"
+    "array([0.90979599, 0.73575888, 0.5578254 ])\n"
+    "\n"
+    "Compute the inverse. We recover the values of `m`, as expected:\n"
+    "\n"
+    ">>> sc.pdtri(1, p)\n"
+    "array([0.5, 1. , 1.5])")
+# Inner loops, one per registered signature.
+ufunc_pdtri_loops[0] = <np.PyUFuncGenericFunction>loop_d_pd__As_pd_d
+ufunc_pdtri_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+ufunc_pdtri_loops[2] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+# Type codes, (in, in, out) per loop.
+ufunc_pdtri_types[0] = <char>NPY_INTP
+ufunc_pdtri_types[1] = <char>NPY_DOUBLE
+ufunc_pdtri_types[2] = <char>NPY_DOUBLE
+ufunc_pdtri_types[3] = <char>NPY_FLOAT
+ufunc_pdtri_types[4] = <char>NPY_FLOAT
+ufunc_pdtri_types[5] = <char>NPY_FLOAT
+ufunc_pdtri_types[6] = <char>NPY_DOUBLE
+ufunc_pdtri_types[7] = <char>NPY_DOUBLE
+ufunc_pdtri_types[8] = <char>NPY_DOUBLE
+# Per-loop data slot: {kernel pointer, ufunc name}.
+ufunc_pdtri_ptr[2*0] = <void*>_func_cephes_pdtri_wrap
+ufunc_pdtri_ptr[2*0+1] = <void*>(<char*>"pdtri")
+ufunc_pdtri_ptr[2*1] = <void*>_func_pdtri_unsafe
+ufunc_pdtri_ptr[2*1+1] = <void*>(<char*>"pdtri")
+ufunc_pdtri_ptr[2*2] = <void*>_func_pdtri_unsafe
+ufunc_pdtri_ptr[2*2+1] = <void*>(<char*>"pdtri")
+ufunc_pdtri_data[0] = &ufunc_pdtri_ptr[2*0]
+ufunc_pdtri_data[1] = &ufunc_pdtri_ptr[2*1]
+ufunc_pdtri_data[2] = &ufunc_pdtri_ptr[2*2]
+# 3 loops, 2 inputs, 1 output.
+pdtri = np.PyUFunc_FromFuncAndData(ufunc_pdtri_loops, ufunc_pdtri_data, ufunc_pdtri_types, 3, 2, 1, 0, 'pdtri', ufunc_pdtri_doc, 0)
+
+# pdtrik ufunc. Unlike its neighbours, each loop has its own kernel: the
+# float32 loop uses _export_pdtrik_float and the float64 loop uses
+# _export_pdtrik_double, both taken from scipy.special._ufuncs_cxx.
+cdef np.PyUFuncGenericFunction ufunc_pdtrik_loops[2]
+cdef void *ufunc_pdtrik_ptr[4]
+cdef void *ufunc_pdtrik_data[2]
+cdef char ufunc_pdtrik_types[6]
+cdef char *ufunc_pdtrik_doc = (
+    "pdtrik(p, m, out=None)\n"
+    "\n"
+    "Inverse to `pdtr` vs `k`.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "p : array_like\n"
+    "    Probability\n"
+    "m : array_like\n"
+    "    Shape parameter (nonnegative, real)\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    The number of occurrences `k` such that ``pdtr(k, m) = p``\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "This function relies on the ``gamma_q_inva`` function from the Boost\n"
+    "Math C++ library [1]_.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] The Boost Developers. \"Boost C++ Libraries\". https://www.boost.org/.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "pdtr : Poisson cumulative distribution function\n"
+    "pdtrc : Poisson survival function\n"
+    "pdtri : inverse of `pdtr` with respect to `m`\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import scipy.special as sc\n"
+    "\n"
+    "Compute the CDF for several values of `k`:\n"
+    "\n"
+    ">>> k = [1, 2, 3]\n"
+    ">>> p = sc.pdtr(k, 2)\n"
+    ">>> p\n"
+    "array([0.40600585, 0.67667642, 0.85712346])\n"
+    "\n"
+    "Compute the inverse. We recover the values of `k`, as expected:\n"
+    "\n"
+    ">>> sc.pdtrik(p, 2)\n"
+    "array([1., 2., 3.])")
+# Loop 0: two float32 inputs -> float32 output.
+ufunc_pdtrik_types[0] = <char>NPY_FLOAT
+ufunc_pdtrik_types[1] = <char>NPY_FLOAT
+ufunc_pdtrik_types[2] = <char>NPY_FLOAT
+ufunc_pdtrik_ptr[0] = <void*>scipy.special._ufuncs_cxx._export_pdtrik_float
+ufunc_pdtrik_ptr[1] = <void*>(<char*>"pdtrik")
+ufunc_pdtrik_data[0] = &ufunc_pdtrik_ptr[0]
+ufunc_pdtrik_loops[0] = <np.PyUFuncGenericFunction>loop_f_ff__As_ff_f
+# Loop 1: two float64 inputs -> float64 output.
+ufunc_pdtrik_types[3] = <char>NPY_DOUBLE
+ufunc_pdtrik_types[4] = <char>NPY_DOUBLE
+ufunc_pdtrik_types[5] = <char>NPY_DOUBLE
+ufunc_pdtrik_ptr[2] = <void*>scipy.special._ufuncs_cxx._export_pdtrik_double
+ufunc_pdtrik_ptr[3] = <void*>(<char*>"pdtrik")
+ufunc_pdtrik_data[1] = &ufunc_pdtrik_ptr[2]
+ufunc_pdtrik_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+# Register: 2 loops, 2 inputs, 1 output.
+pdtrik = np.PyUFunc_FromFuncAndData(
+    ufunc_pdtrik_loops, ufunc_pdtrik_data, ufunc_pdtrik_types,
+    2, 2, 1, 0, 'pdtrik', ufunc_pdtrik_doc, 0)
+
+# poch ufunc. Two inputs and one output per loop, so each loop owns three
+# consecutive entries of the type table; both loops call _func_cephes_poch.
+cdef np.PyUFuncGenericFunction ufunc_poch_loops[2]
+cdef void *ufunc_poch_ptr[4]
+cdef void *ufunc_poch_data[2]
+cdef char ufunc_poch_types[6]
+cdef char *ufunc_poch_doc = (
+    "poch(z, m, out=None)\n"
+    "\n"
+    "Pochhammer symbol.\n"
+    "\n"
+    "The Pochhammer symbol (rising factorial) is defined as\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    (z)_m = \\frac{\\Gamma(z + m)}{\\Gamma(z)}\n"
+    "\n"
+    "For positive integer `m` it reads\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    (z)_m = z (z + 1) ... (z + m - 1)\n"
+    "\n"
+    "See [DLMF]_ for more details.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "z, m : array_like\n"
+    "    Real-valued arguments.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    The value of the function.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [DLMF] Nist, Digital Library of Mathematical Functions\n"
+    "    https://dlmf.nist.gov/5.2#iii\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import scipy.special as sc\n"
+    "\n"
+    "It is 1 when m is 0.\n"
+    "\n"
+    ">>> sc.poch([1, 2, 3, 4], 0)\n"
+    "array([1., 1., 1., 1.])\n"
+    "\n"
+    "For z equal to 1 it reduces to the factorial function.\n"
+    "\n"
+    ">>> sc.poch(1, 5)\n"
+    "120.0\n"
+    ">>> 1 * 2 * 3 * 4 * 5\n"
+    "120\n"
+    "\n"
+    "It can be expressed in terms of the gamma function.\n"
+    "\n"
+    ">>> z, m = 3.7, 2.1\n"
+    ">>> sc.poch(z, m)\n"
+    "20.529581933776953\n"
+    ">>> sc.gamma(z + m) / sc.gamma(z)\n"
+    "20.52958193377696")
+# Loop 0: two float32 inputs -> float32 output.
+ufunc_poch_types[0] = <char>NPY_FLOAT
+ufunc_poch_types[1] = <char>NPY_FLOAT
+ufunc_poch_types[2] = <char>NPY_FLOAT
+ufunc_poch_ptr[0] = <void*>_func_cephes_poch
+ufunc_poch_ptr[1] = <void*>(<char*>"poch")
+ufunc_poch_data[0] = &ufunc_poch_ptr[0]
+ufunc_poch_loops[0] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+# Loop 1: two float64 inputs -> float64 output.
+ufunc_poch_types[3] = <char>NPY_DOUBLE
+ufunc_poch_types[4] = <char>NPY_DOUBLE
+ufunc_poch_types[5] = <char>NPY_DOUBLE
+ufunc_poch_ptr[2] = <void*>_func_cephes_poch
+ufunc_poch_ptr[3] = <void*>(<char*>"poch")
+ufunc_poch_data[1] = &ufunc_poch_ptr[2]
+ufunc_poch_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+# Register: 2 loops, 2 inputs, 1 output.
+poch = np.PyUFunc_FromFuncAndData(
+    ufunc_poch_loops, ufunc_poch_data, ufunc_poch_types,
+    2, 2, 1, 0, 'poch', ufunc_poch_doc, 0)
+
+# powm1 ufunc: static loop/data/type tables plus the docstring handed to
+# PyUFunc_FromFuncAndData. Each loop has its own kernel: _export_powm1_float
+# for the float32 signature and _export_powm1_double for the float64 one.
+cdef np.PyUFuncGenericFunction ufunc_powm1_loops[2]
+cdef void *ufunc_powm1_ptr[4]
+cdef void *ufunc_powm1_data[2]
+cdef char ufunc_powm1_types[6]
+cdef char *ufunc_powm1_doc = (
+    "powm1(x, y, out=None)\n"
+    "\n"
+    "Computes ``x**y - 1``.\n"
+    "\n"
+    "This function is useful when `y` is near 0, or when `x` is near 1.\n"
+    "\n"
+    "The function is implemented for real types only (unlike ``numpy.power``,\n"
+    "which accepts complex inputs).\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    The base. Must be a real type (i.e. integer or float, not complex).\n"
+    "y : array_like\n"
+    "    The exponent. Must be a real type (i.e. integer or float, not complex).\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    Result of the calculation\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    ".. versionadded:: 1.10.0\n"
+    "\n"
+    "The underlying code is implemented for single precision and double\n"
+    "precision floats only.  Unlike `numpy.power`, integer inputs to\n"
+    "`powm1` are converted to floating point, and complex inputs are\n"
+    "not accepted.\n"
+    "\n"
+    "Note the following edge cases:\n"
+    "\n"
+    "* ``powm1(x, 0)`` returns 0 for any ``x``, including 0, ``inf``\n"
+    "  and ``nan``.\n"
+    "* ``powm1(1, y)`` returns 0 for any ``y``, including ``nan``\n"
+    "  and ``inf``.\n"
+    "\n"
+    "This function wraps the ``powm1`` routine from the\n"
+    "Boost Math C++ library [1]_.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] The Boost Developers. \"Boost C++ Libraries\". https://www.boost.org/.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import powm1\n"
+    "\n"
+    ">>> x = np.array([1.2, 10.0, 0.9999999975])\n"
+    ">>> y = np.array([1e-9, 1e-11, 0.1875])\n"
+    ">>> powm1(x, y)\n"
+    "array([ 1.82321557e-10,  2.30258509e-11, -4.68749998e-10])\n"
+    "\n"
+    "It can be verified that the relative errors in those results\n"
+    "are less than 2.5e-16.\n"
+    "\n"
+    "Compare that to the result of ``x**y - 1``, where the\n"
+    "relative errors are all larger than 8e-8:\n"
+    "\n"
+    ">>> x**y - 1\n"
+    "array([ 1.82321491e-10,  2.30258035e-11, -4.68750039e-10])")
+# Inner loops, one per registered signature.
+ufunc_powm1_loops[0] = <np.PyUFuncGenericFunction>loop_f_ff__As_ff_f
+ufunc_powm1_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+# Type codes, (in, in, out) per loop: float32 first, then float64.
+ufunc_powm1_types[0] = <char>NPY_FLOAT
+ufunc_powm1_types[1] = <char>NPY_FLOAT
+ufunc_powm1_types[2] = <char>NPY_FLOAT
+ufunc_powm1_types[3] = <char>NPY_DOUBLE
+ufunc_powm1_types[4] = <char>NPY_DOUBLE
+ufunc_powm1_types[5] = <char>NPY_DOUBLE
+# Per-loop data slot: {kernel pointer, ufunc name}.
+ufunc_powm1_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_powm1_float
+ufunc_powm1_ptr[2*0+1] = <void*>(<char*>"powm1")
+ufunc_powm1_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_powm1_double
+ufunc_powm1_ptr[2*1+1] = <void*>(<char*>"powm1")
+ufunc_powm1_data[0] = &ufunc_powm1_ptr[2*0]
+ufunc_powm1_data[1] = &ufunc_powm1_ptr[2*1]
+# 2 loops, 2 inputs, 1 output.
+powm1 = np.PyUFunc_FromFuncAndData(ufunc_powm1_loops, ufunc_powm1_data, ufunc_powm1_types, 2, 2, 1, 0, 'powm1', ufunc_powm1_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc_pseudo_huber_loops[2]
+cdef void *ufunc_pseudo_huber_ptr[4]
+cdef void *ufunc_pseudo_huber_data[2]
+cdef char ufunc_pseudo_huber_types[6]
+cdef char *ufunc_pseudo_huber_doc = (
+    "pseudo_huber(delta, r, out=None)\n"
+    "\n"
+    "Pseudo-Huber loss function.\n"
+    "\n"
+    ".. math:: \\mathrm{pseudo\\_huber}(\\delta, r) =\n"
+    "          \\delta^2 \\left( \\sqrt{ 1 + \\left( \\frac{r}{\\delta} \\right)^2 } - 1 \\right)\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "delta : array_like\n"
+    "    Input array, indicating the soft quadratic vs. linear loss changepoint.\n"
+    "r : array_like\n"
+    "    Input array, possibly representing residuals.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "res : scalar or ndarray\n"
+    "    The computed Pseudo-Huber loss function values.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "huber: Similar function which this function approximates\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "Like `huber`, `pseudo_huber` often serves as a robust loss function\n"
+    "in statistics or machine learning to reduce the influence of outliers.\n"
+    "Unlike `huber`, `pseudo_huber` is smooth.\n"
+    "\n"
+    "Typically, `r` represents residuals, the difference\n"
+    "between a model prediction and data. Then, for :math:`|r|\\leq\\delta`,\n"
+    "`pseudo_huber` resembles the squared error and for :math:`|r|>\\delta` the\n"
+    "absolute error. This way, the Pseudo-Huber loss often achieves\n"
+    "a fast convergence in model fitting for small residuals like the squared\n"
+    "error loss function and still reduces the influence of outliers\n"
+    "(:math:`|r|>\\delta`) like the absolute error loss. As :math:`\\delta` is\n"
+    "the cutoff between squared and absolute error regimes, it has\n"
+    "to be tuned carefully for each problem. `pseudo_huber` is also\n"
+    "convex, making it suitable for gradient based optimization. [1]_ [2]_\n"
+    "\n"
+    ".. versionadded:: 0.15.0\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Hartley, Zisserman, \"Multiple View Geometry in Computer Vision\".\n"
+    "       2003. Cambridge University Press. p. 619\n"
+    ".. [2] Charbonnier et al. \"Deterministic edge-preserving regularization\n"
+    "       in computed imaging\". 1997. IEEE Trans. Image Processing.\n"
+    "       6 (2): 298 - 311.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "Import all necessary modules.\n"
+    "\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import pseudo_huber, huber\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    "\n"
+    "Calculate the function for ``delta=1`` at ``r=2``.\n"
+    "\n"
+    ">>> pseudo_huber(1., 2.)\n"
+    "1.2360679774997898\n"
+    "\n"
+    "Calculate the function at ``r=2`` for different `delta` by providing\n"
+    "a list or NumPy array for `delta`.\n"
+    "\n"
+    ">>> pseudo_huber([1., 2., 4.], 3.)\n"
+    "array([2.16227766, 3.21110255, 4.        ])\n"
+    "\n"
+    "Calculate the function for ``delta=1`` at several points by providing\n"
+    "a list or NumPy array for `r`.\n"
+    "\n"
+    ">>> pseudo_huber(2., np.array([1., 1.5, 3., 4.]))\n"
+    "array([0.47213595, 1.        , 3.21110255, 4.94427191])\n"
+    "\n"
+    "The function can be calculated for different `delta` and `r` by\n"
+    "providing arrays for both with compatible shapes for broadcasting.\n"
+    "\n"
+    ">>> r = np.array([1., 2.5, 8., 10.])\n"
+    ">>> deltas = np.array([[1.], [5.], [9.]])\n"
+    ">>> print(r.shape, deltas.shape)\n"
+    "(4,) (3, 1)\n"
+    "\n"
+    ">>> pseudo_huber(deltas, r)\n"
+    "array([[ 0.41421356,  1.6925824 ,  7.06225775,  9.04987562],\n"
+    "       [ 0.49509757,  2.95084972, 22.16990566, 30.90169944],\n"
+    "       [ 0.49846624,  3.06693762, 27.37435121, 40.08261642]])\n"
+    "\n"
+    "Plot the function for different `delta`.\n"
+    "\n"
+    ">>> x = np.linspace(-4, 4, 500)\n"
+    ">>> deltas = [1, 2, 3]\n"
+    ">>> linestyles = [\"dashed\", \"dotted\", \"dashdot\"]\n"
+    ">>> fig, ax = plt.subplots()\n"
+    ">>> combined_plot_parameters = list(zip(deltas, linestyles))\n"
+    ">>> for delta, style in combined_plot_parameters:\n"
+    "...     ax.plot(x, pseudo_huber(delta, x), label=rf\"$\\delta={delta}$\",\n"
+    "...             ls=style)\n"
+    ">>> ax.legend(loc=\"upper center\")\n"
+    ">>> ax.set_xlabel(\"$x$\")\n"
+    ">>> ax.set_title(r\"Pseudo-Huber loss function $h_{\\delta}(x)$\")\n"
+    ">>> ax.set_xlim(-4, 4)\n"
+    ">>> ax.set_ylim(0, 8)\n"
+    ">>> plt.show()\n"
+    "\n"
+    "Finally, illustrate the difference between `huber` and `pseudo_huber` by\n"
+    "plotting them and their gradients with respect to `r`. The plot shows\n"
+    "that `pseudo_huber` is continuously differentiable while `huber` is not\n"
+    "at the points :math:`\\pm\\delta`.\n"
+    "\n"
+    ">>> def huber_grad(delta, x):\n"
+    "...     grad = np.copy(x)\n"
+    "...     linear_area = np.argwhere(np.abs(x) > delta)\n"
+    "...     grad[linear_area]=delta*np.sign(x[linear_area])\n"
+    "...     return grad\n"
+    ">>> def pseudo_huber_grad(delta, x):\n"
+    "...     return x* (1+(x/delta)**2)**(-0.5)\n"
+    ">>> x=np.linspace(-3, 3, 500)\n"
+    ">>> delta = 1.\n"
+    ">>> fig, ax = plt.subplots(figsize=(7, 7))\n"
+    ">>> ax.plot(x, huber(delta, x), label=\"Huber\", ls=\"dashed\")\n"
+    ">>> ax.plot(x, huber_grad(delta, x), label=\"Huber Gradient\", ls=\"dashdot\")\n"
+    ">>> ax.plot(x, pseudo_huber(delta, x), label=\"Pseudo-Huber\", ls=\"dotted\")\n"
+    ">>> ax.plot(x, pseudo_huber_grad(delta, x), label=\"Pseudo-Huber Gradient\",\n"
+    "...         ls=\"solid\")\n"
+    ">>> ax.legend(loc=\"upper center\")\n"
+    ">>> plt.show()")
+ufunc_pseudo_huber_loops[0] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+ufunc_pseudo_huber_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc_pseudo_huber_types[0] = <char>NPY_FLOAT
+ufunc_pseudo_huber_types[1] = <char>NPY_FLOAT
+ufunc_pseudo_huber_types[2] = <char>NPY_FLOAT
+ufunc_pseudo_huber_types[3] = <char>NPY_DOUBLE
+ufunc_pseudo_huber_types[4] = <char>NPY_DOUBLE
+ufunc_pseudo_huber_types[5] = <char>NPY_DOUBLE
+ufunc_pseudo_huber_ptr[2*0] = <void*>_func_pseudo_huber
+ufunc_pseudo_huber_ptr[2*0+1] = <void*>(<char*>"pseudo_huber")
+ufunc_pseudo_huber_ptr[2*1] = <void*>_func_pseudo_huber
+ufunc_pseudo_huber_ptr[2*1+1] = <void*>(<char*>"pseudo_huber")
+ufunc_pseudo_huber_data[0] = &ufunc_pseudo_huber_ptr[2*0]
+ufunc_pseudo_huber_data[1] = &ufunc_pseudo_huber_ptr[2*1]
+pseudo_huber = np.PyUFunc_FromFuncAndData(ufunc_pseudo_huber_loops, ufunc_pseudo_huber_data, ufunc_pseudo_huber_types, 2, 2, 1, 0, 'pseudo_huber', ufunc_pseudo_huber_doc, 0)
+
+# rel_entr: static tables and registration of the ufunc.
+#
+# Two inner loops are registered, each with 2 inputs and 1 output, so the
+# type table holds 3 entries per loop.  Each data slot points at a
+# (kernel pointer, name string) pair stored in the ptr table.
+# NOTE(review): presumably the generic loop reads that pair to call the
+# kernel and to label error reports -- confirm against the loop helpers.
+cdef char ufunc_rel_entr_types[6]
+cdef void *ufunc_rel_entr_ptr[4]
+cdef void *ufunc_rel_entr_data[2]
+cdef np.PyUFuncGenericFunction ufunc_rel_entr_loops[2]
+cdef char *ufunc_rel_entr_doc = (
+    "rel_entr(x, y, out=None)\n"
+    "\n"
+    "Elementwise function for computing relative entropy.\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    \\mathrm{rel\\_entr}(x, y) =\n"
+    "        \\begin{cases}\n"
+    "            x \\log(x / y) & x > 0, y > 0 \\\\\n"
+    "            0 & x = 0, y \\ge 0 \\\\\n"
+    "            \\infty & \\text{otherwise}\n"
+    "        \\end{cases}\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x, y : array_like\n"
+    "    Input arrays\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    Relative entropy of the inputs\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "entr, kl_div, scipy.stats.entropy\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    ".. versionadded:: 0.15.0\n"
+    "\n"
+    "This function is jointly convex in x and y.\n"
+    "\n"
+    "The origin of this function is in convex programming; see\n"
+    "[1]_. Given two discrete probability distributions :math:`p_1,\n"
+    "\\ldots, p_n` and :math:`q_1, \\ldots, q_n`, the definition of relative\n"
+    "entropy in the context of *information theory* is\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    \\sum_{i = 1}^n \\mathrm{rel\\_entr}(p_i, q_i).\n"
+    "\n"
+    "To compute the latter quantity, use `scipy.stats.entropy`.\n"
+    "\n"
+    "See [2]_ for details.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Boyd, Stephen and Lieven Vandenberghe. *Convex optimization*.\n"
+    "       Cambridge University Press, 2004.\n"
+    "       :doi:`https://doi.org/10.1017/CBO9780511804441`\n"
+    ".. [2] Kullback-Leibler divergence,\n"
+    "       https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence")
+
+# Loop 0: (NPY_FLOAT, NPY_FLOAT) -> NPY_FLOAT
+ufunc_rel_entr_types[0] = <char>NPY_FLOAT
+ufunc_rel_entr_types[1] = <char>NPY_FLOAT
+ufunc_rel_entr_types[2] = <char>NPY_FLOAT
+ufunc_rel_entr_ptr[0] = <void*>_func_rel_entr
+ufunc_rel_entr_ptr[1] = <void*>(<char*>"rel_entr")
+ufunc_rel_entr_data[0] = &ufunc_rel_entr_ptr[0]
+ufunc_rel_entr_loops[0] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+
+# Loop 1: (NPY_DOUBLE, NPY_DOUBLE) -> NPY_DOUBLE
+ufunc_rel_entr_types[3] = <char>NPY_DOUBLE
+ufunc_rel_entr_types[4] = <char>NPY_DOUBLE
+ufunc_rel_entr_types[5] = <char>NPY_DOUBLE
+ufunc_rel_entr_ptr[2] = <void*>_func_rel_entr
+ufunc_rel_entr_ptr[3] = <void*>(<char*>"rel_entr")
+ufunc_rel_entr_data[1] = &ufunc_rel_entr_ptr[2]
+ufunc_rel_entr_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+
+# Arguments after the three tables: ntypes=2, nin=2, nout=1, identity=0,
+# then name, docstring and the trailing unused slot.
+rel_entr = np.PyUFunc_FromFuncAndData(
+    ufunc_rel_entr_loops, ufunc_rel_entr_data, ufunc_rel_entr_types,
+    2, 2, 1, 0, 'rel_entr', ufunc_rel_entr_doc, 0)
+
+# round: static tables and registration of the ufunc.
+#
+# Two inner loops are registered, each with 1 input and 1 output, so the
+# type table holds 2 entries per loop.  Both loops share the same kernel,
+# `_func_cephes_round`; each data slot points at a (kernel pointer,
+# name string) pair stored in the ptr table.
+cdef char ufunc_round_types[4]
+cdef void *ufunc_round_ptr[4]
+cdef void *ufunc_round_data[2]
+cdef np.PyUFuncGenericFunction ufunc_round_loops[2]
+cdef char *ufunc_round_doc = (
+    "round(x, out=None)\n"
+    "\n"
+    "Round to the nearest integer.\n"
+    "\n"
+    "Returns the nearest integer to `x`.  If `x` ends in 0.5 exactly,\n"
+    "the nearest even integer is chosen.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real valued input.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results.\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    The nearest integers to the elements of `x`. The result is of\n"
+    "    floating type, not integer type.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import scipy.special as sc\n"
+    "\n"
+    "It rounds to even.\n"
+    "\n"
+    ">>> sc.round([0.5, 1.5])\n"
+    "array([0., 2.])")
+
+# Loop 0: NPY_FLOAT -> NPY_FLOAT
+ufunc_round_types[0] = <char>NPY_FLOAT
+ufunc_round_types[1] = <char>NPY_FLOAT
+ufunc_round_ptr[0] = <void*>_func_cephes_round
+ufunc_round_ptr[1] = <void*>(<char*>"round")
+ufunc_round_data[0] = &ufunc_round_ptr[0]
+ufunc_round_loops[0] = <np.PyUFuncGenericFunction>loop_d_d__As_f_f
+
+# Loop 1: NPY_DOUBLE -> NPY_DOUBLE
+ufunc_round_types[2] = <char>NPY_DOUBLE
+ufunc_round_types[3] = <char>NPY_DOUBLE
+ufunc_round_ptr[2] = <void*>_func_cephes_round
+ufunc_round_ptr[3] = <void*>(<char*>"round")
+ufunc_round_data[1] = &ufunc_round_ptr[2]
+ufunc_round_loops[1] = <np.PyUFuncGenericFunction>loop_d_d__As_d_d
+
+# Arguments after the three tables: ntypes=2, nin=1, nout=1, identity=0,
+# then name, docstring and the trailing unused slot.
+round = np.PyUFunc_FromFuncAndData(
+    ufunc_round_loops, ufunc_round_data, ufunc_round_types,
+    2, 1, 1, 0, 'round', ufunc_round_doc, 0)
+
+# shichi: static tables and registration of the ufunc.
+#
+# Four inner loops are registered, each with 1 input and 2 outputs, so the
+# type table holds 3 entries per loop.  The NPY_FLOAT/NPY_DOUBLE loops use
+# the `_func_xsf_shichi` kernel and the NPY_CFLOAT/NPY_CDOUBLE loops use
+# `_func_xsf_cshichi`.  Each data slot points at a (kernel pointer,
+# name string) pair stored in the ptr table.
+cdef np.PyUFuncGenericFunction ufunc_shichi_loops[4]
+cdef void *ufunc_shichi_ptr[8]
+cdef void *ufunc_shichi_data[4]
+cdef char ufunc_shichi_types[12]
+cdef char *ufunc_shichi_doc = (
+    "shichi(x, out=None)\n"
+    "\n"
+    "Hyperbolic sine and cosine integrals.\n"
+    "\n"
+    "The hyperbolic sine integral is\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "  \\int_0^x \\frac{\\sinh{t}}{t}dt\n"
+    "\n"
+    "and the hyperbolic cosine integral is\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "  \\gamma + \\log(x) + \\int_0^x \\frac{\\cosh{t} - 1}{t} dt\n"
+    "\n"
+    "where :math:`\\gamma` is Euler's constant and :math:`\\log` is the\n"
+    "principal branch of the logarithm [1]_ (see also [2]_).\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real or complex points at which to compute the hyperbolic sine\n"
+    "    and cosine integrals.\n"
+    "out : tuple of ndarray, optional\n"
+    "    Optional output arrays for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "shi : scalar or ndarray\n"
+    "    Hyperbolic sine integral at ``x``\n"
+    "chi : scalar or ndarray\n"
+    "    Hyperbolic cosine integral at ``x``\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "sici : Sine and cosine integrals.\n"
+    "exp1 : Exponential integral E1.\n"
+    "expi : Exponential integral Ei.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "For real arguments with ``x < 0``, ``chi`` is the real part of the\n"
+    "hyperbolic cosine integral. For such points ``chi(x)`` and\n"
+    "``chi(x + 0j)`` differ by the additive term ``1j*pi``.\n"
+    "\n"
+    "For real arguments the function is computed by calling Cephes'\n"
+    "[3]_ *shichi* routine. For complex arguments the algorithm is based\n"
+    "on Mpmath's [4]_ *shi* and *chi* routines.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Milton Abramowitz and Irene A. Stegun, eds.\n"
+    "       Handbook of Mathematical Functions with Formulas,\n"
+    "       Graphs, and Mathematical Tables. New York: Dover, 1972.\n"
+    "       (See Section 5.2.)\n"
+    ".. [2] NIST Digital Library of Mathematical Functions\n"
+    "       https://dlmf.nist.gov/6.2.E15 and https://dlmf.nist.gov/6.2.E16\n"
+    ".. [3] Cephes Mathematical Functions Library,\n"
+    "       http://www.netlib.org/cephes/\n"
+    ".. [4] Fredrik Johansson and others.\n"
+    "       \"mpmath: a Python library for arbitrary-precision floating-point\n"
+    "       arithmetic\" (Version 0.19) http://mpmath.org/\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> from scipy.special import shichi, sici\n"
+    "\n"
+    "`shichi` accepts real or complex input:\n"
+    "\n"
+    ">>> shichi(0.5)\n"
+    "(0.5069967498196671, -0.05277684495649357)\n"
+    ">>> shichi(0.5 + 2.5j)\n"
+    "((0.11772029666668238+1.831091777729851j),\n"
+    " (0.29912435887648825+1.7395351121166562j))\n"
+    "\n"
+    "The hyperbolic sine and cosine integrals Shi(z) and Chi(z) are\n"
+    "related to the sine and cosine integrals Si(z) and Ci(z) by\n"
+    "\n"
+    "* Shi(z) = -i*Si(i*z)\n"
+    "* Chi(z) = Ci(-i*z) + i*pi/2\n"
+    "\n"
+    ">>> z = 0.25 + 5j\n"
+    ">>> shi, chi = shichi(z)\n"
+    ">>> shi, -1j*sici(1j*z)[0]            # Should be the same.\n"
+    "((-0.04834719325101729+1.5469354086921228j),\n"
+    " (-0.04834719325101729+1.5469354086921228j))\n"
+    ">>> chi, sici(-1j*z)[1] + 1j*np.pi/2  # Should be the same.\n"
+    "((-0.19568708973868087+1.556276312103824j),\n"
+    " (-0.19568708973868087+1.556276312103824j))\n"
+    "\n"
+    "Plot the functions evaluated on the real axis:\n"
+    "\n"
+    ">>> xp = np.geomspace(1e-8, 4.0, 250)\n"
+    ">>> x = np.concatenate((-xp[::-1], xp))\n"
+    ">>> shi, chi = shichi(x)\n"
+    "\n"
+    ">>> fig, ax = plt.subplots()\n"
+    ">>> ax.plot(x, shi, label='Shi(x)')\n"
+    ">>> ax.plot(x, chi, '--', label='Chi(x)')\n"
+    ">>> ax.set_xlabel('x')\n"
+    ">>> ax.set_title('Hyperbolic Sine and Cosine Integrals')\n"
+    ">>> ax.legend(shadow=True, framealpha=1, loc='lower right')\n"
+    ">>> ax.grid(True)\n"
+    ">>> plt.show()")
+# One generic inner loop per registered type signature.
+ufunc_shichi_loops[0] = <np.PyUFuncGenericFunction>loop_i_d_dd_As_f_ff
+ufunc_shichi_loops[1] = <np.PyUFuncGenericFunction>loop_i_d_dd_As_d_dd
+ufunc_shichi_loops[2] = <np.PyUFuncGenericFunction>loop_i_D_DD_As_F_FF
+ufunc_shichi_loops[3] = <np.PyUFuncGenericFunction>loop_i_D_DD_As_D_DD
+# Type signatures, three entries per loop (1 input followed by 2 outputs).
+ufunc_shichi_types[0] = <char>NPY_FLOAT
+ufunc_shichi_types[1] = <char>NPY_FLOAT
+ufunc_shichi_types[2] = <char>NPY_FLOAT
+ufunc_shichi_types[3] = <char>NPY_DOUBLE
+ufunc_shichi_types[4] = <char>NPY_DOUBLE
+ufunc_shichi_types[5] = <char>NPY_DOUBLE
+ufunc_shichi_types[6] = <char>NPY_CFLOAT
+ufunc_shichi_types[7] = <char>NPY_CFLOAT
+ufunc_shichi_types[8] = <char>NPY_CFLOAT
+ufunc_shichi_types[9] = <char>NPY_CDOUBLE
+ufunc_shichi_types[10] = <char>NPY_CDOUBLE
+ufunc_shichi_types[11] = <char>NPY_CDOUBLE
+# (kernel pointer, name string) pairs, one pair per loop.
+ufunc_shichi_ptr[2*0] = <void*>_func_xsf_shichi
+ufunc_shichi_ptr[2*0+1] = <void*>(<char*>"shichi")
+ufunc_shichi_ptr[2*1] = <void*>_func_xsf_shichi
+ufunc_shichi_ptr[2*1+1] = <void*>(<char*>"shichi")
+ufunc_shichi_ptr[2*2] = <void*>_func_xsf_cshichi
+ufunc_shichi_ptr[2*2+1] = <void*>(<char*>"shichi")
+ufunc_shichi_ptr[2*3] = <void*>_func_xsf_cshichi
+ufunc_shichi_ptr[2*3+1] = <void*>(<char*>"shichi")
+# Per-loop user data: address of that loop's pair in the ptr table.
+ufunc_shichi_data[0] = &ufunc_shichi_ptr[2*0]
+ufunc_shichi_data[1] = &ufunc_shichi_ptr[2*1]
+ufunc_shichi_data[2] = &ufunc_shichi_ptr[2*2]
+ufunc_shichi_data[3] = &ufunc_shichi_ptr[2*3]
+# Arguments after the three tables: ntypes=4, nin=1, nout=2, identity=0.
+shichi = np.PyUFunc_FromFuncAndData(ufunc_shichi_loops, ufunc_shichi_data, ufunc_shichi_types, 4, 1, 2, 0, 'shichi', ufunc_shichi_doc, 0)
+
+# sici: static tables and registration of the ufunc.
+#
+# Four inner loops are registered, each with 1 input and 2 outputs, so the
+# type table holds 3 entries per loop.  The NPY_FLOAT/NPY_DOUBLE loops use
+# the `_func_xsf_sici` kernel and the NPY_CFLOAT/NPY_CDOUBLE loops use
+# `_func_xsf_csici`.  Each data slot points at a (kernel pointer,
+# name string) pair stored in the ptr table.
+cdef np.PyUFuncGenericFunction ufunc_sici_loops[4]
+cdef void *ufunc_sici_ptr[8]
+cdef void *ufunc_sici_data[4]
+cdef char ufunc_sici_types[12]
+cdef char *ufunc_sici_doc = (
+    "sici(x, out=None)\n"
+    "\n"
+    "Sine and cosine integrals.\n"
+    "\n"
+    "The sine integral is\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "  \\int_0^x \\frac{\\sin{t}}{t}dt\n"
+    "\n"
+    "and the cosine integral is\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "  \\gamma + \\log(x) + \\int_0^x \\frac{\\cos{t} - 1}{t}dt\n"
+    "\n"
+    "where :math:`\\gamma` is Euler's constant and :math:`\\log` is the\n"
+    "principal branch of the logarithm [1]_ (see also [2]_).\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x : array_like\n"
+    "    Real or complex points at which to compute the sine and cosine\n"
+    "    integrals.\n"
+    "out : tuple of ndarray, optional\n"
+    "    Optional output arrays for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "si : scalar or ndarray\n"
+    "    Sine integral at ``x``\n"
+    "ci : scalar or ndarray\n"
+    "    Cosine integral at ``x``\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "shichi : Hyperbolic sine and cosine integrals.\n"
+    "exp1 : Exponential integral E1.\n"
+    "expi : Exponential integral Ei.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "For real arguments with ``x < 0``, ``ci`` is the real part of the\n"
+    "cosine integral. For such points ``ci(x)`` and ``ci(x + 0j)``\n"
+    "differ by the additive term ``1j*pi``.\n"
+    "\n"
+    "For real arguments the function is computed by calling Cephes'\n"
+    "[3]_ *sici* routine. For complex arguments the algorithm is based\n"
+    "on Mpmath's [4]_ *si* and *ci* routines.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Milton Abramowitz and Irene A. Stegun, eds.\n"
+    "       Handbook of Mathematical Functions with Formulas,\n"
+    "       Graphs, and Mathematical Tables. New York: Dover, 1972.\n"
+    "       (See Section 5.2.)\n"
+    ".. [2] NIST Digital Library of Mathematical Functions\n"
+    "       https://dlmf.nist.gov/6.2.E9, https://dlmf.nist.gov/6.2.E12,\n"
+    "       and https://dlmf.nist.gov/6.2.E13\n"
+    ".. [3] Cephes Mathematical Functions Library,\n"
+    "       http://www.netlib.org/cephes/\n"
+    ".. [4] Fredrik Johansson and others.\n"
+    "       \"mpmath: a Python library for arbitrary-precision floating-point\n"
+    "       arithmetic\" (Version 0.19) http://mpmath.org/\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> from scipy.special import sici, exp1\n"
+    "\n"
+    "`sici` accepts real or complex input:\n"
+    "\n"
+    ">>> sici(2.5)\n"
+    "(1.7785201734438267, 0.2858711963653835)\n"
+    ">>> sici(2.5 + 3j)\n"
+    "((4.505735874563953+0.06863305018999577j),\n"
+    "(0.0793644206906966-2.935510262937543j))\n"
+    "\n"
+    "For z in the right half plane, the sine and cosine integrals are\n"
+    "related to the exponential integral E1 (implemented in SciPy as\n"
+    "`scipy.special.exp1`) by\n"
+    "\n"
+    "* Si(z) = (E1(i*z) - E1(-i*z))/2i + pi/2\n"
+    "* Ci(z) = -(E1(i*z) + E1(-i*z))/2\n"
+    "\n"
+    "See [1]_ (equations 5.2.21 and 5.2.23).\n"
+    "\n"
+    "We can verify these relations:\n"
+    "\n"
+    ">>> z = 2 - 3j\n"
+    ">>> sici(z)\n"
+    "((4.54751388956229-1.3991965806460565j),\n"
+    "(1.408292501520851+2.9836177420296055j))\n"
+    "\n"
+    ">>> (exp1(1j*z) - exp1(-1j*z))/2j + np.pi/2  # Same as sine integral\n"
+    "(4.54751388956229-1.3991965806460565j)\n"
+    "\n"
+    ">>> -(exp1(1j*z) + exp1(-1j*z))/2            # Same as cosine integral\n"
+    "(1.408292501520851+2.9836177420296055j)\n"
+    "\n"
+    "Plot the functions evaluated on the real axis; the dotted horizontal\n"
+    "lines are at pi/2 and -pi/2:\n"
+    "\n"
+    ">>> x = np.linspace(-16, 16, 150)\n"
+    ">>> si, ci = sici(x)\n"
+    "\n"
+    ">>> fig, ax = plt.subplots()\n"
+    ">>> ax.plot(x, si, label='Si(x)')\n"
+    ">>> ax.plot(x, ci, '--', label='Ci(x)')\n"
+    ">>> ax.legend(shadow=True, framealpha=1, loc='upper left')\n"
+    ">>> ax.set_xlabel('x')\n"
+    ">>> ax.set_title('Sine and Cosine Integrals')\n"
+    ">>> ax.axhline(np.pi/2, linestyle=':', alpha=0.5, color='k')\n"
+    ">>> ax.axhline(-np.pi/2, linestyle=':', alpha=0.5, color='k')\n"
+    ">>> ax.grid(True)\n"
+    ">>> plt.show()")
+# One generic inner loop per registered type signature.
+ufunc_sici_loops[0] = <np.PyUFuncGenericFunction>loop_i_d_dd_As_f_ff
+ufunc_sici_loops[1] = <np.PyUFuncGenericFunction>loop_i_d_dd_As_d_dd
+ufunc_sici_loops[2] = <np.PyUFuncGenericFunction>loop_i_D_DD_As_F_FF
+ufunc_sici_loops[3] = <np.PyUFuncGenericFunction>loop_i_D_DD_As_D_DD
+# Type signatures, three entries per loop (1 input followed by 2 outputs).
+ufunc_sici_types[0] = <char>NPY_FLOAT
+ufunc_sici_types[1] = <char>NPY_FLOAT
+ufunc_sici_types[2] = <char>NPY_FLOAT
+ufunc_sici_types[3] = <char>NPY_DOUBLE
+ufunc_sici_types[4] = <char>NPY_DOUBLE
+ufunc_sici_types[5] = <char>NPY_DOUBLE
+ufunc_sici_types[6] = <char>NPY_CFLOAT
+ufunc_sici_types[7] = <char>NPY_CFLOAT
+ufunc_sici_types[8] = <char>NPY_CFLOAT
+ufunc_sici_types[9] = <char>NPY_CDOUBLE
+ufunc_sici_types[10] = <char>NPY_CDOUBLE
+ufunc_sici_types[11] = <char>NPY_CDOUBLE
+# (kernel pointer, name string) pairs, one pair per loop.
+ufunc_sici_ptr[2*0] = <void*>_func_xsf_sici
+ufunc_sici_ptr[2*0+1] = <void*>(<char*>"sici")
+ufunc_sici_ptr[2*1] = <void*>_func_xsf_sici
+ufunc_sici_ptr[2*1+1] = <void*>(<char*>"sici")
+ufunc_sici_ptr[2*2] = <void*>_func_xsf_csici
+ufunc_sici_ptr[2*2+1] = <void*>(<char*>"sici")
+ufunc_sici_ptr[2*3] = <void*>_func_xsf_csici
+ufunc_sici_ptr[2*3+1] = <void*>(<char*>"sici")
+# Per-loop user data: address of that loop's pair in the ptr table.
+ufunc_sici_data[0] = &ufunc_sici_ptr[2*0]
+ufunc_sici_data[1] = &ufunc_sici_ptr[2*1]
+ufunc_sici_data[2] = &ufunc_sici_ptr[2*2]
+ufunc_sici_data[3] = &ufunc_sici_ptr[2*3]
+# Arguments after the three tables: ntypes=4, nin=1, nout=2, identity=0.
+sici = np.PyUFunc_FromFuncAndData(ufunc_sici_loops, ufunc_sici_data, ufunc_sici_types, 4, 1, 2, 0, 'sici', ufunc_sici_doc, 0)
+
+# smirnov: static tables and registration of the ufunc.
+#
+# Three inner loops are registered, each with 2 inputs and 1 output, so
+# the type table holds 3 entries per loop.  Loop 0 takes
+# (NPY_INTP, NPY_DOUBLE) and uses `_func_cephes_smirnov_wrap`; loops 1 and
+# 2 take all-NPY_FLOAT / all-NPY_DOUBLE arguments and use
+# `_func_smirnov_unsafe`.
+# NOTE(review): "unsafe" presumably refers to accepting a floating-point
+# `n` that is converted to an integer -- confirm against the helper.
+cdef np.PyUFuncGenericFunction ufunc_smirnov_loops[3]
+cdef void *ufunc_smirnov_ptr[6]
+cdef void *ufunc_smirnov_data[3]
+cdef char ufunc_smirnov_types[9]
+cdef char *ufunc_smirnov_doc = (
+    "smirnov(n, d, out=None)\n"
+    "\n"
+    "Kolmogorov-Smirnov complementary cumulative distribution function\n"
+    "\n"
+    "Returns the exact Kolmogorov-Smirnov complementary cumulative\n"
+    "distribution function (aka the Survival Function) of Dn+ (or Dn-)\n"
+    "for a one-sided test of equality between an empirical and a\n"
+    "theoretical distribution. It is equal to the probability that the\n"
+    "maximum difference between a theoretical distribution and an empirical\n"
+    "one based on `n` samples is greater than d.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "n : int\n"
+    "    Number of samples\n"
+    "d : float array_like\n"
+    "    Deviation between the Empirical CDF (ECDF) and the target CDF.\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    The value(s) of smirnov(n, d), Prob(Dn+ >= d) (Also Prob(Dn- >= d))\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "smirnovi : The Inverse Survival Function for the distribution\n"
+    "scipy.stats.ksone : Provides the functionality as a continuous distribution\n"
+    "kolmogorov, kolmogi : Functions for the two-sided distribution\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "`smirnov` is used by `stats.kstest` in the application of the\n"
+    "Kolmogorov-Smirnov Goodness of Fit test. For historical reasons this\n"
+    "function is exposed in `scipy.special`, but the recommended way to achieve\n"
+    "the most accurate CDF/SF/PDF/PPF/ISF computations is to use the\n"
+    "`stats.ksone` distribution.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import smirnov\n"
+    ">>> from scipy.stats import norm\n"
+    "\n"
+    "Show the probability of a gap at least as big as 0, 0.5 and 1.0 for a\n"
+    "sample of size 5.\n"
+    "\n"
+    ">>> smirnov(5, [0, 0.5, 1.0])\n"
+    "array([ 1.   ,  0.056,  0.   ])\n"
+    "\n"
+    "Compare a sample of size 5 against N(0, 1), the standard normal\n"
+    "distribution with mean 0 and standard deviation 1.\n"
+    "\n"
+    "`x` is the sample.\n"
+    "\n"
+    ">>> x = np.array([-1.392, -0.135, 0.114, 0.190, 1.82])\n"
+    "\n"
+    ">>> target = norm(0, 1)\n"
+    ">>> cdfs = target.cdf(x)\n"
+    ">>> cdfs\n"
+    "array([0.0819612 , 0.44630594, 0.5453811 , 0.57534543, 0.9656205 ])\n"
+    "\n"
+    "Construct the empirical CDF and the K-S statistics (Dn+, Dn-, Dn).\n"
+    "\n"
+    ">>> n = len(x)\n"
+    ">>> ecdfs = np.arange(n+1, dtype=float)/n\n"
+    ">>> cols = np.column_stack([x, ecdfs[1:], cdfs, cdfs - ecdfs[:n],\n"
+    "...                        ecdfs[1:] - cdfs])\n"
+    ">>> with np.printoptions(precision=3):\n"
+    "...    print(cols)\n"
+    "[[-1.392  0.2    0.082  0.082  0.118]\n"
+    " [-0.135  0.4    0.446  0.246 -0.046]\n"
+    " [ 0.114  0.6    0.545  0.145  0.055]\n"
+    " [ 0.19   0.8    0.575 -0.025  0.225]\n"
+    " [ 1.82   1.     0.966  0.166  0.034]]\n"
+    ">>> gaps = cols[:, -2:]\n"
+    ">>> Dnpm = np.max(gaps, axis=0)\n"
+    ">>> print(f'Dn-={Dnpm[0]:f}, Dn+={Dnpm[1]:f}')\n"
+    "Dn-=0.246306, Dn+=0.224655\n"
+    ">>> probs = smirnov(n, Dnpm)\n"
+    ">>> print(f'For a sample of size {n} drawn from N(0, 1):',\n"
+    "...       f' Smirnov n={n}: Prob(Dn- >= {Dnpm[0]:f}) = {probs[0]:.4f}',\n"
+    "...       f' Smirnov n={n}: Prob(Dn+ >= {Dnpm[1]:f}) = {probs[1]:.4f}',\n"
+    "...       sep='\\n')\n"
+    "For a sample of size 5 drawn from N(0, 1):\n"
+    " Smirnov n=5: Prob(Dn- >= 0.246306) = 0.4711\n"
+    " Smirnov n=5: Prob(Dn+ >= 0.224655) = 0.5245\n"
+    "\n"
+    "Plot the empirical CDF and the standard normal CDF.\n"
+    "\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> plt.step(np.concatenate(([-2.5], x, [2.5])),\n"
+    "...          np.concatenate((ecdfs, [1])),\n"
+    "...          where='post', label='Empirical CDF')\n"
+    ">>> xx = np.linspace(-2.5, 2.5, 100)\n"
+    ">>> plt.plot(xx, target.cdf(xx), '--', label='CDF for N(0, 1)')\n"
+    "\n"
+    "Add vertical lines marking Dn+ and Dn-.\n"
+    "\n"
+    ">>> iminus, iplus = np.argmax(gaps, axis=0)\n"
+    ">>> plt.vlines([x[iminus]], ecdfs[iminus], cdfs[iminus], color='r',\n"
+    "...            alpha=0.5, lw=4)\n"
+    ">>> plt.vlines([x[iplus]], cdfs[iplus], ecdfs[iplus+1], color='m',\n"
+    "...            alpha=0.5, lw=4)\n"
+    "\n"
+    ">>> plt.grid(True)\n"
+    ">>> plt.legend(framealpha=1, shadow=True)\n"
+    ">>> plt.show()")
+# One generic inner loop per registered type signature.
+ufunc_smirnov_loops[0] = <np.PyUFuncGenericFunction>loop_d_pd__As_pd_d
+ufunc_smirnov_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+ufunc_smirnov_loops[2] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+# Type signatures, three entries per loop (2 inputs followed by 1 output).
+ufunc_smirnov_types[0] = <char>NPY_INTP
+ufunc_smirnov_types[1] = <char>NPY_DOUBLE
+ufunc_smirnov_types[2] = <char>NPY_DOUBLE
+ufunc_smirnov_types[3] = <char>NPY_FLOAT
+ufunc_smirnov_types[4] = <char>NPY_FLOAT
+ufunc_smirnov_types[5] = <char>NPY_FLOAT
+ufunc_smirnov_types[6] = <char>NPY_DOUBLE
+ufunc_smirnov_types[7] = <char>NPY_DOUBLE
+ufunc_smirnov_types[8] = <char>NPY_DOUBLE
+# (kernel pointer, name string) pairs, one pair per loop.
+ufunc_smirnov_ptr[2*0] = <void*>_func_cephes_smirnov_wrap
+ufunc_smirnov_ptr[2*0+1] = <void*>(<char*>"smirnov")
+ufunc_smirnov_ptr[2*1] = <void*>_func_smirnov_unsafe
+ufunc_smirnov_ptr[2*1+1] = <void*>(<char*>"smirnov")
+ufunc_smirnov_ptr[2*2] = <void*>_func_smirnov_unsafe
+ufunc_smirnov_ptr[2*2+1] = <void*>(<char*>"smirnov")
+# Per-loop user data: address of that loop's pair in the ptr table.
+ufunc_smirnov_data[0] = &ufunc_smirnov_ptr[2*0]
+ufunc_smirnov_data[1] = &ufunc_smirnov_ptr[2*1]
+ufunc_smirnov_data[2] = &ufunc_smirnov_ptr[2*2]
+# Arguments after the three tables: ntypes=3, nin=2, nout=1, identity=0.
+smirnov = np.PyUFunc_FromFuncAndData(ufunc_smirnov_loops, ufunc_smirnov_data, ufunc_smirnov_types, 3, 2, 1, 0, 'smirnov', ufunc_smirnov_doc, 0)
+
+# smirnovi: static tables and registration of the ufunc.
+#
+# Three inner loops are registered, each with 2 inputs and 1 output, so
+# the type table holds 3 entries per loop.  Loop 0 takes
+# (NPY_INTP, NPY_DOUBLE) and uses `_func_cephes_smirnovi_wrap`; loops 1
+# and 2 take all-NPY_FLOAT / all-NPY_DOUBLE arguments and use
+# `_func_smirnovi_unsafe`.
+# NOTE(review): "unsafe" presumably refers to accepting a floating-point
+# `n` that is converted to an integer -- confirm against the helper.
+cdef np.PyUFuncGenericFunction ufunc_smirnovi_loops[3]
+cdef void *ufunc_smirnovi_ptr[6]
+cdef void *ufunc_smirnovi_data[3]
+cdef char ufunc_smirnovi_types[9]
+cdef char *ufunc_smirnovi_doc = (
+    "smirnovi(n, p, out=None)\n"
+    "\n"
+    "Inverse to `smirnov`\n"
+    "\n"
+    "Returns `d` such that ``smirnov(n, d) == p``, the critical value\n"
+    "corresponding to `p`.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "n : int\n"
+    "    Number of samples\n"
+    "p : float array_like\n"
+    "    Probability\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    The value(s) of smirnovi(n, p), the critical values.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "smirnov : The Survival Function (SF) for the distribution\n"
+    "scipy.stats.ksone : Provides the functionality as a continuous distribution\n"
+    "kolmogorov, kolmogi : Functions for the two-sided distribution\n"
+    "scipy.stats.kstwobign : Two-sided Kolmogorov-Smirnov distribution, large n\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "`smirnov` is used by `stats.kstest` in the application of the\n"
+    "Kolmogorov-Smirnov Goodness of Fit test. For historical reasons this\n"
+    "function is exposed in `scipy.special`, but the recommended way to achieve\n"
+    "the most accurate CDF/SF/PDF/PPF/ISF computations is to use the\n"
+    "`stats.ksone` distribution.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> from scipy.special import smirnovi, smirnov\n"
+    "\n"
+    ">>> n = 24\n"
+    ">>> deviations = [0.1, 0.2, 0.3]\n"
+    "\n"
+    "Use `smirnov` to compute the complementary CDF of the Smirnov\n"
+    "distribution for the given number of samples and deviations.\n"
+    "\n"
+    ">>> p = smirnov(n, deviations)\n"
+    ">>> p\n"
+    "array([0.58105083, 0.12826832, 0.01032231])\n"
+    "\n"
+    "The inverse function ``smirnovi(n, p)`` returns ``deviations``.\n"
+    "\n"
+    ">>> smirnovi(n, p)\n"
+    "array([0.1, 0.2, 0.3])")
+# One generic inner loop per registered type signature.
+ufunc_smirnovi_loops[0] = <np.PyUFuncGenericFunction>loop_d_pd__As_pd_d
+ufunc_smirnovi_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+ufunc_smirnovi_loops[2] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+# Type signatures, three entries per loop (2 inputs followed by 1 output).
+ufunc_smirnovi_types[0] = <char>NPY_INTP
+ufunc_smirnovi_types[1] = <char>NPY_DOUBLE
+ufunc_smirnovi_types[2] = <char>NPY_DOUBLE
+ufunc_smirnovi_types[3] = <char>NPY_FLOAT
+ufunc_smirnovi_types[4] = <char>NPY_FLOAT
+ufunc_smirnovi_types[5] = <char>NPY_FLOAT
+ufunc_smirnovi_types[6] = <char>NPY_DOUBLE
+ufunc_smirnovi_types[7] = <char>NPY_DOUBLE
+ufunc_smirnovi_types[8] = <char>NPY_DOUBLE
+# (kernel pointer, name string) pairs, one pair per loop.
+ufunc_smirnovi_ptr[2*0] = <void*>_func_cephes_smirnovi_wrap
+ufunc_smirnovi_ptr[2*0+1] = <void*>(<char*>"smirnovi")
+ufunc_smirnovi_ptr[2*1] = <void*>_func_smirnovi_unsafe
+ufunc_smirnovi_ptr[2*1+1] = <void*>(<char*>"smirnovi")
+ufunc_smirnovi_ptr[2*2] = <void*>_func_smirnovi_unsafe
+ufunc_smirnovi_ptr[2*2+1] = <void*>(<char*>"smirnovi")
+# Per-loop user data: address of that loop's pair in the ptr table.
+ufunc_smirnovi_data[0] = &ufunc_smirnovi_ptr[2*0]
+ufunc_smirnovi_data[1] = &ufunc_smirnovi_ptr[2*1]
+ufunc_smirnovi_data[2] = &ufunc_smirnovi_ptr[2*2]
+# Arguments after the three tables: ntypes=3, nin=2, nout=1, identity=0.
+smirnovi = np.PyUFunc_FromFuncAndData(ufunc_smirnovi_loops, ufunc_smirnovi_data, ufunc_smirnovi_types, 3, 2, 1, 0, 'smirnovi', ufunc_smirnovi_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc_spence_loops[4]
+cdef void *ufunc_spence_ptr[8]
+cdef void *ufunc_spence_data[4]
+cdef char ufunc_spence_types[8]
+cdef char *ufunc_spence_doc = (
+    "spence(z, out=None)\n"
+    "\n"
+    "Spence's function, also known as the dilogarithm.\n"
+    "\n"
+    "It is defined to be\n"
+    "\n"
+    ".. math::\n"
+    "  \\int_1^z \\frac{\\log(t)}{1 - t}dt\n"
+    "\n"
+    "for complex :math:`z`, where the contour of integration is taken\n"
+    "to avoid the branch cut of the logarithm. Spence's function is\n"
+    "analytic everywhere except the negative real axis where it has a\n"
+    "branch cut.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "z : array_like\n"
+    "    Points at which to evaluate Spence's function\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "s : scalar or ndarray\n"
+    "    Computed values of Spence's function\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "There is a different convention which defines Spence's function by\n"
+    "the integral\n"
+    "\n"
+    ".. math::\n"
+    "  -\\int_0^z \\frac{\\log(1 - t)}{t}dt;\n"
+    "\n"
+    "this is our ``spence(1 - z)``.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import spence\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    "\n"
+    "The function is defined for complex inputs:\n"
+    "\n"
+    ">>> spence([1-1j, 1.5+2j, 3j, -10-5j])\n"
+    "array([-0.20561676+0.91596559j, -0.86766909-1.39560134j,\n"
+    "       -0.59422064-2.49129918j, -1.14044398+6.80075924j])\n"
+    "\n"
+    "For complex inputs on the branch cut, which is the negative real axis,\n"
+    "the function returns the limit for ``z`` with positive imaginary part.\n"
+    "For example, in the following, note the sign change of the imaginary\n"
+    "part of the output for ``z = -2`` and ``z = -2 - 1e-8j``:\n"
+    "\n"
+    ">>> spence([-2 + 1e-8j, -2, -2 - 1e-8j])\n"
+    "array([2.32018041-3.45139229j, 2.32018042-3.4513923j ,\n"
+    "       2.32018041+3.45139229j])\n"
+    "\n"
+    "The function returns ``nan`` for real inputs on the branch cut:\n"
+    "\n"
+    ">>> spence(-1.5)\n"
+    "nan\n"
+    "\n"
+    "Verify some particular values: ``spence(0) = pi**2/6``,\n"
+    "``spence(1) = 0`` and ``spence(2) = -pi**2/12``.\n"
+    "\n"
+    ">>> spence([0, 1, 2])\n"
+    "array([ 1.64493407,  0.        , -0.82246703])\n"
+    ">>> np.pi**2/6, -np.pi**2/12\n"
+    "(1.6449340668482264, -0.8224670334241132)\n"
+    "\n"
+    "Verify the identity::\n"
+    "\n"
+    "    spence(z) + spence(1 - z) = pi**2/6 - log(z)*log(1 - z)\n"
+    "\n"
+    ">>> z = 3 + 4j\n"
+    ">>> spence(z) + spence(1 - z)\n"
+    "(-2.6523186143876067+1.8853470951513935j)\n"
+    ">>> np.pi**2/6 - np.log(z)*np.log(1 - z)\n"
+    "(-2.652318614387606+1.885347095151394j)\n"
+    "\n"
+    "Plot the function for positive real input.\n"
+    "\n"
+    ">>> fig, ax = plt.subplots()\n"
+    ">>> x = np.linspace(0, 6, 400)\n"
+    ">>> ax.plot(x, spence(x))\n"
+    ">>> ax.grid()\n"
+    ">>> ax.set_xlabel('x')\n"
+    ">>> ax.set_title('spence(x)')\n"
+    ">>> plt.show()")
+ufunc_spence_loops[0] = <np.PyUFuncGenericFunction>loop_d_d__As_f_f
+ufunc_spence_loops[1] = <np.PyUFuncGenericFunction>loop_d_d__As_d_d
+ufunc_spence_loops[2] = <np.PyUFuncGenericFunction>loop_D_D__As_F_F
+ufunc_spence_loops[3] = <np.PyUFuncGenericFunction>loop_D_D__As_D_D
+ufunc_spence_types[0] = <char>NPY_FLOAT
+ufunc_spence_types[1] = <char>NPY_FLOAT
+ufunc_spence_types[2] = <char>NPY_DOUBLE
+ufunc_spence_types[3] = <char>NPY_DOUBLE
+ufunc_spence_types[4] = <char>NPY_CFLOAT
+ufunc_spence_types[5] = <char>NPY_CFLOAT
+ufunc_spence_types[6] = <char>NPY_CDOUBLE
+ufunc_spence_types[7] = <char>NPY_CDOUBLE
+ufunc_spence_ptr[2*0] = <void*>_func_cephes_spence
+ufunc_spence_ptr[2*0+1] = <void*>(<char*>"spence")
+ufunc_spence_ptr[2*1] = <void*>_func_cephes_spence
+ufunc_spence_ptr[2*1+1] = <void*>(<char*>"spence")
+ufunc_spence_ptr[2*2] = <void*>_func_cspence
+ufunc_spence_ptr[2*2+1] = <void*>(<char*>"spence")
+ufunc_spence_ptr[2*3] = <void*>_func_cspence
+ufunc_spence_ptr[2*3+1] = <void*>(<char*>"spence")
+ufunc_spence_data[0] = &ufunc_spence_ptr[2*0]
+ufunc_spence_data[1] = &ufunc_spence_ptr[2*1]
+ufunc_spence_data[2] = &ufunc_spence_ptr[2*2]
+ufunc_spence_data[3] = &ufunc_spence_ptr[2*3]
+spence = np.PyUFunc_FromFuncAndData(ufunc_spence_loops, ufunc_spence_data, ufunc_spence_types, 4, 1, 1, 0, 'spence', ufunc_spence_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc_stdtr_loops[2]
+cdef void *ufunc_stdtr_ptr[4]
+cdef void *ufunc_stdtr_data[2]
+cdef char ufunc_stdtr_types[6]
+cdef char *ufunc_stdtr_doc = (
+    "stdtr(df, t, out=None)\n"
+    "\n"
+    "Student t distribution cumulative distribution function\n"
+    "\n"
+    "Returns the integral:\n"
+    "\n"
+    ".. math::\n"
+    "    \\frac{\\Gamma((df+1)/2)}{\\sqrt{\\pi df} \\Gamma(df/2)}\n"
+    "    \\int_{-\\infty}^t (1+x^2/df)^{-(df+1)/2}\\, dx\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "df : array_like\n"
+    "    Degrees of freedom\n"
+    "t : array_like\n"
+    "    Upper bound of the integral\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "scalar or ndarray\n"
+    "    Value of the Student t CDF at t\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "stdtridf : inverse of stdtr with respect to `df`\n"
+    "stdtrit : inverse of stdtr with respect to `t`\n"
+    "scipy.stats.t : student t distribution\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "The student t distribution is also available as `scipy.stats.t`.\n"
+    "Calling `stdtr` directly can improve performance compared to the\n"
+    "``cdf`` method of `scipy.stats.t` (see last example below).\n"
+    "\n"
+    "The function is computed using the Boost Math library [1]_, which\n"
+    "relies on the incomplete beta function.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Boost C++ Libraries, http://www.boost.org/\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "Calculate the function for ``df=3`` at ``t=1``.\n"
+    "\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import stdtr\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> stdtr(3, 1)\n"
+    "0.8044988905221148\n"
+    "\n"
+    "Plot the function for three different degrees of freedom.\n"
+    "\n"
+    ">>> x = np.linspace(-10, 10, 1000)\n"
+    ">>> fig, ax = plt.subplots()\n"
+    ">>> parameters = [(1, \"solid\"), (3, \"dashed\"), (10, \"dotted\")]\n"
+    ">>> for (df, linestyle) in parameters:\n"
+    "...     ax.plot(x, stdtr(df, x), ls=linestyle, label=f\"$df={df}$\")\n"
+    ">>> ax.legend()\n"
+    ">>> ax.set_title(\"Student t distribution cumulative distribution function\")\n"
+    ">>> plt.show()\n"
+    "\n"
+    "The function can be computed for several degrees of freedom at the same\n"
+    "time by providing a NumPy array or list for `df`:\n"
+    "\n"
+    ">>> stdtr([1, 2, 3], 1)\n"
+    "array([0.75      , 0.78867513, 0.80449889])\n"
+    "\n"
+    "It is possible to calculate the function at several points for several\n"
+    "different degrees of freedom simultaneously by providing arrays for `df`\n"
+    "and `t` with shapes compatible for broadcasting. Compute `stdtr` at\n"
+    "4 points for 3 different degrees of freedom resulting in an array of shape 3x4.\n"
+    "\n"
+    ">>> dfs = np.array([[1], [2], [3]])\n"
+    ">>> t = np.array([2, 4, 6, 8])\n"
+    ">>> dfs.shape, t.shape\n"
+    "((3, 1), (4,))\n"
+    "\n"
+    ">>> stdtr(dfs, t)\n"
+    "array([[0.85241638, 0.92202087, 0.94743154, 0.96041658],\n"
+    "       [0.90824829, 0.97140452, 0.98666426, 0.99236596],\n"
+    "       [0.93033702, 0.98599577, 0.99536364, 0.99796171]])\n"
+    "\n"
+    "The t distribution is also available as `scipy.stats.t`. Calling `stdtr`\n"
+    "directly can be much faster than calling the ``cdf`` method of\n"
+    "`scipy.stats.t`. To get the same results, one must use the following\n"
+    "parametrization: ``scipy.stats.t(df).cdf(x) = stdtr(df, x)``.\n"
+    "\n"
+    ">>> from scipy.stats import t\n"
+    ">>> df, x = 3, 1\n"
+    ">>> stdtr_result = stdtr(df, x)  # this can be faster than below\n"
+    ">>> stats_result = t(df).cdf(x)\n"
+    ">>> stats_result == stdtr_result  # test that results are equal\n"
+    "True")
+ufunc_stdtr_loops[0] = <np.PyUFuncGenericFunction>loop_f_ff__As_ff_f
+ufunc_stdtr_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc_stdtr_types[0] = <char>NPY_FLOAT
+ufunc_stdtr_types[1] = <char>NPY_FLOAT
+ufunc_stdtr_types[2] = <char>NPY_FLOAT
+ufunc_stdtr_types[3] = <char>NPY_DOUBLE
+ufunc_stdtr_types[4] = <char>NPY_DOUBLE
+ufunc_stdtr_types[5] = <char>NPY_DOUBLE
+ufunc_stdtr_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_t_cdf_float
+ufunc_stdtr_ptr[2*0+1] = <void*>(<char*>"stdtr")
+ufunc_stdtr_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_t_cdf_double
+ufunc_stdtr_ptr[2*1+1] = <void*>(<char*>"stdtr")
+ufunc_stdtr_data[0] = &ufunc_stdtr_ptr[2*0]
+ufunc_stdtr_data[1] = &ufunc_stdtr_ptr[2*1]
+stdtr = np.PyUFunc_FromFuncAndData(ufunc_stdtr_loops, ufunc_stdtr_data, ufunc_stdtr_types, 2, 2, 1, 0, 'stdtr', ufunc_stdtr_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc_stdtridf_loops[2]
+cdef void *ufunc_stdtridf_ptr[4]
+cdef void *ufunc_stdtridf_data[2]
+cdef char ufunc_stdtridf_types[6]
+cdef char *ufunc_stdtridf_doc = (
+    "stdtridf(p, t, out=None)\n"
+    "\n"
+    "Inverse of `stdtr` vs df\n"
+    "\n"
+    "Returns the argument df such that stdtr(df, t) is equal to `p`.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "p : array_like\n"
+    "    Probability\n"
+    "t : array_like\n"
+    "    Upper bound of the integral\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "df : scalar or ndarray\n"
+    "    Value of `df` such that ``stdtr(df, t) == p``\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "stdtr : Student t CDF\n"
+    "stdtrit : inverse of stdtr with respect to `t`\n"
+    "scipy.stats.t : Student t distribution\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "Compute the student t cumulative distribution function for one\n"
+    "parameter set.\n"
+    "\n"
+    ">>> from scipy.special import stdtr, stdtridf\n"
+    ">>> df, x = 5, 2\n"
+    ">>> cdf_value = stdtr(df, x)\n"
+    ">>> cdf_value\n"
+    "0.9490302605850709\n"
+    "\n"
+    "Verify that `stdtridf` recovers the original value for `df` given\n"
+    "the CDF value and `x`.\n"
+    "\n"
+    ">>> stdtridf(cdf_value, x)\n"
+    "5.0")
+ufunc_stdtridf_loops[0] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+ufunc_stdtridf_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc_stdtridf_types[0] = <char>NPY_FLOAT
+ufunc_stdtridf_types[1] = <char>NPY_FLOAT
+ufunc_stdtridf_types[2] = <char>NPY_FLOAT
+ufunc_stdtridf_types[3] = <char>NPY_DOUBLE
+ufunc_stdtridf_types[4] = <char>NPY_DOUBLE
+ufunc_stdtridf_types[5] = <char>NPY_DOUBLE
+ufunc_stdtridf_ptr[2*0] = <void*>_func_stdtridf
+ufunc_stdtridf_ptr[2*0+1] = <void*>(<char*>"stdtridf")
+ufunc_stdtridf_ptr[2*1] = <void*>_func_stdtridf
+ufunc_stdtridf_ptr[2*1+1] = <void*>(<char*>"stdtridf")
+ufunc_stdtridf_data[0] = &ufunc_stdtridf_ptr[2*0]
+ufunc_stdtridf_data[1] = &ufunc_stdtridf_ptr[2*1]
+stdtridf = np.PyUFunc_FromFuncAndData(ufunc_stdtridf_loops, ufunc_stdtridf_data, ufunc_stdtridf_types, 2, 2, 1, 0, 'stdtridf', ufunc_stdtridf_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc_stdtrit_loops[2]
+cdef void *ufunc_stdtrit_ptr[4]
+cdef void *ufunc_stdtrit_data[2]
+cdef char ufunc_stdtrit_types[6]
+cdef char *ufunc_stdtrit_doc = (
+    "stdtrit(df, p, out=None)\n"
+    "\n"
+    "The `p`-th quantile of the student t distribution.\n"
+    "\n"
+    "This function is the inverse of the student t distribution cumulative\n"
+    "distribution function (CDF), returning `t` such that `stdtr(df, t) = p`.\n"
+    "\n"
+    "Returns the argument `t` such that stdtr(df, t) is equal to `p`.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "df : array_like\n"
+    "    Degrees of freedom\n"
+    "p : array_like\n"
+    "    Probability\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "t : scalar or ndarray\n"
+    "    Value of `t` such that ``stdtr(df, t) == p``\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "stdtr : Student t CDF\n"
+    "stdtridf : inverse of stdtr with respect to `df`\n"
+    "scipy.stats.t : Student t distribution\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "The student t distribution is also available as `scipy.stats.t`. Calling\n"
+    "`stdtrit` directly can improve performance compared to the ``ppf``\n"
+    "method of `scipy.stats.t` (see last example below).\n"
+    "\n"
+    "The function is computed using the Boost Math library [1]_, which\n"
+    "relies on the incomplete beta function.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Boost C++ Libraries, http://www.boost.org/\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "`stdtrit` represents the inverse of the student t distribution CDF which\n"
+    "is available as `stdtr`. Here, we calculate the CDF for ``df=3`` at\n"
+    "``x=1``. `stdtrit` then returns ``1`` up to floating point errors\n"
+    "given the same value for `df` and the computed CDF value.\n"
+    "\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import stdtr, stdtrit\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> df = 3\n"
+    ">>> x = 1\n"
+    ">>> cdf_value = stdtr(df, x)\n"
+    ">>> stdtrit(df, cdf_value)\n"
+    "0.9999999994418539\n"
+    "\n"
+    "Plot the function for three different degrees of freedom.\n"
+    "\n"
+    ">>> x = np.linspace(0, 1, 1000)\n"
+    ">>> parameters = [(1, \"solid\"), (2, \"dashed\"), (5, \"dotted\")]\n"
+    ">>> fig, ax = plt.subplots()\n"
+    ">>> for (df, linestyle) in parameters:\n"
+    "...     ax.plot(x, stdtrit(df, x), ls=linestyle, label=f\"$df={df}$\")\n"
+    ">>> ax.legend()\n"
+    ">>> ax.set_ylim(-10, 10)\n"
+    ">>> ax.set_title(\"Student t distribution quantile function\")\n"
+    ">>> plt.show()\n"
+    "\n"
+    "The function can be computed for several degrees of freedom at the same\n"
+    "time by providing a NumPy array or list for `df`:\n"
+    "\n"
+    ">>> stdtrit([1, 2, 3], 0.7)\n"
+    "array([0.72654253, 0.6172134 , 0.58438973])\n"
+    "\n"
+    "It is possible to calculate the function at several points for several\n"
+    "different degrees of freedom simultaneously by providing arrays for `df`\n"
+    "and `p` with shapes compatible for broadcasting. Compute `stdtrit` at\n"
+    "4 points for 3 different degrees of freedom resulting in an array of shape 3x4.\n"
+    "\n"
+    ">>> dfs = np.array([[1], [2], [3]])\n"
+    ">>> p = np.array([0.2, 0.4, 0.7, 0.8])\n"
+    ">>> dfs.shape, p.shape\n"
+    "((3, 1), (4,))\n"
+    "\n"
+    ">>> stdtrit(dfs, p)\n"
+    "array([[-1.37638192, -0.3249197 ,  0.72654253,  1.37638192],\n"
+    "       [-1.06066017, -0.28867513,  0.6172134 ,  1.06066017],\n"
+    "       [-0.97847231, -0.27667066,  0.58438973,  0.97847231]])\n"
+    "\n"
+    "The t distribution is also available as `scipy.stats.t`. Calling `stdtrit`\n"
+    "directly can be much faster than calling the ``ppf`` method of\n"
+    "`scipy.stats.t`. To get the same results, one must use the following\n"
+    "parametrization: ``scipy.stats.t(df).ppf(x) = stdtrit(df, x)``.\n"
+    "\n"
+    ">>> from scipy.stats import t\n"
+    ">>> df, x = 3, 0.5\n"
+    ">>> stdtrit_result = stdtrit(df, x)  # this can be faster than below\n"
+    ">>> stats_result = t(df).ppf(x)\n"
+    ">>> stats_result == stdtrit_result  # test that results are equal\n"
+    "True")
+ufunc_stdtrit_loops[0] = <np.PyUFuncGenericFunction>loop_f_ff__As_ff_f
+ufunc_stdtrit_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc_stdtrit_types[0] = <char>NPY_FLOAT
+ufunc_stdtrit_types[1] = <char>NPY_FLOAT
+ufunc_stdtrit_types[2] = <char>NPY_FLOAT
+ufunc_stdtrit_types[3] = <char>NPY_DOUBLE
+ufunc_stdtrit_types[4] = <char>NPY_DOUBLE
+ufunc_stdtrit_types[5] = <char>NPY_DOUBLE
+ufunc_stdtrit_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_t_ppf_float
+ufunc_stdtrit_ptr[2*0+1] = <void*>(<char*>"stdtrit")
+ufunc_stdtrit_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_t_ppf_double
+ufunc_stdtrit_ptr[2*1+1] = <void*>(<char*>"stdtrit")
+ufunc_stdtrit_data[0] = &ufunc_stdtrit_ptr[2*0]
+ufunc_stdtrit_data[1] = &ufunc_stdtrit_ptr[2*1]
+stdtrit = np.PyUFunc_FromFuncAndData(ufunc_stdtrit_loops, ufunc_stdtrit_data, ufunc_stdtrit_types, 2, 2, 1, 0, 'stdtrit', ufunc_stdtrit_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc_tklmbda_loops[2]
+cdef void *ufunc_tklmbda_ptr[4]
+cdef void *ufunc_tklmbda_data[2]
+cdef char ufunc_tklmbda_types[6]
+cdef char *ufunc_tklmbda_doc = (
+    "tklmbda(x, lmbda, out=None)\n"
+    "\n"
+    "Cumulative distribution function of the Tukey lambda distribution.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "x, lmbda : array_like\n"
+    "    Parameters\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "cdf : scalar or ndarray\n"
+    "    Value of the Tukey lambda CDF\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "scipy.stats.tukeylambda : Tukey lambda distribution\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> from scipy.special import tklmbda, expit\n"
+    "\n"
+    "Compute the cumulative distribution function (CDF) of the Tukey lambda\n"
+    "distribution at several ``x`` values for `lmbda` = -1.5.\n"
+    "\n"
+    ">>> x = np.linspace(-2, 2, 9)\n"
+    ">>> x\n"
+    "array([-2. , -1.5, -1. , -0.5,  0. ,  0.5,  1. ,  1.5,  2. ])\n"
+    ">>> tklmbda(x, -1.5)\n"
+    "array([0.34688734, 0.3786554 , 0.41528805, 0.45629737, 0.5       ,\n"
+    "       0.54370263, 0.58471195, 0.6213446 , 0.65311266])\n"
+    "\n"
+    "When `lmbda` is 0, the function is the logistic sigmoid function,\n"
+    "which is implemented in `scipy.special` as `expit`.\n"
+    "\n"
+    ">>> tklmbda(x, 0)\n"
+    "array([0.11920292, 0.18242552, 0.26894142, 0.37754067, 0.5       ,\n"
+    "       0.62245933, 0.73105858, 0.81757448, 0.88079708])\n"
+    ">>> expit(x)\n"
+    "array([0.11920292, 0.18242552, 0.26894142, 0.37754067, 0.5       ,\n"
+    "       0.62245933, 0.73105858, 0.81757448, 0.88079708])\n"
+    "\n"
+    "When `lmbda` is 1, the Tukey lambda distribution is uniform on the\n"
+    "interval [-1, 1], so the CDF increases linearly.\n"
+    "\n"
+    ">>> t = np.linspace(-1, 1, 9)\n"
+    ">>> tklmbda(t, 1)\n"
+    "array([0.   , 0.125, 0.25 , 0.375, 0.5  , 0.625, 0.75 , 0.875, 1.   ])\n"
+    "\n"
+    "In the following, we generate plots for several values of `lmbda`.\n"
+    "\n"
+    "The first figure shows graphs for `lmbda` <= 0.\n"
+    "\n"
+    ">>> styles = ['-', '-.', '--', ':']\n"
+    ">>> fig, ax = plt.subplots()\n"
+    ">>> x = np.linspace(-12, 12, 500)\n"
+    ">>> for k, lmbda in enumerate([-1.0, -0.5, 0.0]):\n"
+    "...     y = tklmbda(x, lmbda)\n"
+    "...     ax.plot(x, y, styles[k], label=rf'$\\lambda$ = {lmbda:-4.1f}')\n"
+    "\n"
+    ">>> ax.set_title(r'tklmbda(x, $\\lambda$)')\n"
+    ">>> ax.set_xlabel('x')\n"
+    ">>> ax.legend(framealpha=1, shadow=True)\n"
+    ">>> ax.grid(True)\n"
+    "\n"
+    "The second figure shows graphs for `lmbda` > 0.  The dots in the\n"
+    "graphs show the bounds of the support of the distribution.\n"
+    "\n"
+    ">>> fig, ax = plt.subplots()\n"
+    ">>> x = np.linspace(-4.2, 4.2, 500)\n"
+    ">>> lmbdas = [0.25, 0.5, 1.0, 1.5]\n"
+    ">>> for k, lmbda in enumerate(lmbdas):\n"
+    "...     y = tklmbda(x, lmbda)\n"
+    "...     ax.plot(x, y, styles[k], label=fr'$\\lambda$ = {lmbda}')\n"
+    "\n"
+    ">>> ax.set_prop_cycle(None)\n"
+    ">>> for lmbda in lmbdas:\n"
+    "...     ax.plot([-1/lmbda, 1/lmbda], [0, 1], '.', ms=8)\n"
+    "\n"
+    ">>> ax.set_title(r'tklmbda(x, $\\lambda$)')\n"
+    ">>> ax.set_xlabel('x')\n"
+    ">>> ax.legend(framealpha=1, shadow=True)\n"
+    ">>> ax.grid(True)\n"
+    "\n"
+    ">>> plt.tight_layout()\n"
+    ">>> plt.show()\n"
+    "\n"
+    "The CDF of the Tukey lambda distribution is also implemented as the\n"
+    "``cdf`` method of `scipy.stats.tukeylambda`.  In the following,\n"
+    "``tukeylambda.cdf(x, -0.5)`` and ``tklmbda(x, -0.5)`` compute the\n"
+    "same values:\n"
+    "\n"
+    ">>> from scipy.stats import tukeylambda\n"
+    ">>> x = np.linspace(-2, 2, 9)\n"
+    "\n"
+    ">>> tukeylambda.cdf(x, -0.5)\n"
+    "array([0.21995157, 0.27093858, 0.33541677, 0.41328161, 0.5       ,\n"
+    "       0.58671839, 0.66458323, 0.72906142, 0.78004843])\n"
+    "\n"
+    ">>> tklmbda(x, -0.5)\n"
+    "array([0.21995157, 0.27093858, 0.33541677, 0.41328161, 0.5       ,\n"
+    "       0.58671839, 0.66458323, 0.72906142, 0.78004843])\n"
+    "\n"
+    "The implementation in ``tukeylambda`` also provides location and scale\n"
+    "parameters, and other methods such as ``pdf()`` (the probability\n"
+    "density function) and ``ppf()`` (the inverse of the CDF), so for\n"
+    "working with the Tukey lambda distribution, ``tukeylambda`` is more\n"
+    "generally useful.  The primary advantage of ``tklmbda`` is that it is\n"
+    "significantly faster than ``tukeylambda.cdf``.")
+ufunc_tklmbda_loops[0] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+ufunc_tklmbda_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc_tklmbda_types[0] = <char>NPY_FLOAT
+ufunc_tklmbda_types[1] = <char>NPY_FLOAT
+ufunc_tklmbda_types[2] = <char>NPY_FLOAT
+ufunc_tklmbda_types[3] = <char>NPY_DOUBLE
+ufunc_tklmbda_types[4] = <char>NPY_DOUBLE
+ufunc_tklmbda_types[5] = <char>NPY_DOUBLE
+ufunc_tklmbda_ptr[2*0] = <void*>_func_xsf_tukeylambdacdf
+ufunc_tklmbda_ptr[2*0+1] = <void*>(<char*>"tklmbda")
+ufunc_tklmbda_ptr[2*1] = <void*>_func_xsf_tukeylambdacdf
+ufunc_tklmbda_ptr[2*1+1] = <void*>(<char*>"tklmbda")
+ufunc_tklmbda_data[0] = &ufunc_tklmbda_ptr[2*0]
+ufunc_tklmbda_data[1] = &ufunc_tklmbda_ptr[2*1]
+tklmbda = np.PyUFunc_FromFuncAndData(ufunc_tklmbda_loops, ufunc_tklmbda_data, ufunc_tklmbda_types, 2, 2, 1, 0, 'tklmbda', ufunc_tklmbda_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc_wrightomega_loops[4]
+cdef void *ufunc_wrightomega_ptr[8]
+cdef void *ufunc_wrightomega_data[4]
+cdef char ufunc_wrightomega_types[8]
+cdef char *ufunc_wrightomega_doc = (
+    "wrightomega(z, out=None)\n"
+    "\n"
+    "Wright Omega function.\n"
+    "\n"
+    "Defined as the solution to\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    \\omega + \\log(\\omega) = z\n"
+    "\n"
+    "where :math:`\\log` is the principal branch of the complex logarithm.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "z : array_like\n"
+    "    Points at which to evaluate the Wright Omega function\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function values\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "omega : scalar or ndarray\n"
+    "    Values of the Wright Omega function\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "lambertw : The Lambert W function\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    ".. versionadded:: 0.19.0\n"
+    "\n"
+    "The function can also be defined as\n"
+    "\n"
+    ".. math::\n"
+    "\n"
+    "    \\omega(z) = W_{K(z)}(e^z)\n"
+    "\n"
+    "where :math:`K(z) = \\lceil (\\Im(z) - \\pi)/(2\\pi) \\rceil` is the\n"
+    "unwinding number and :math:`W` is the Lambert W function.\n"
+    "\n"
+    "The implementation here is taken from [1]_.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Lawrence, Corless, and Jeffrey, \"Algorithm 917: Complex\n"
+    "       Double-Precision Evaluation of the Wright :math:`\\omega`\n"
+    "       Function.\" ACM Transactions on Mathematical Software,\n"
+    "       2012. :doi:`10.1145/2168773.2168779`.\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> import numpy as np\n"
+    ">>> from scipy.special import wrightomega, lambertw\n"
+    "\n"
+    ">>> wrightomega([-2, -1, 0, 1, 2])\n"
+    "array([0.12002824, 0.27846454, 0.56714329, 1.        , 1.5571456 ])\n"
+    "\n"
+    "Complex input:\n"
+    "\n"
+    ">>> wrightomega(3 + 5j)\n"
+    "(1.5804428632097158+3.8213626783287937j)\n"
+    "\n"
+    "Verify that ``wrightomega(z)`` satisfies ``w + log(w) = z``:\n"
+    "\n"
+    ">>> w = -5 + 4j\n"
+    ">>> wrightomega(w + np.log(w))\n"
+    "(-5+4j)\n"
+    "\n"
+    "Verify the connection to ``lambertw``:\n"
+    "\n"
+    ">>> z = 0.5 + 3j\n"
+    ">>> wrightomega(z)\n"
+    "(0.0966015889280649+1.4937828458191993j)\n"
+    ">>> lambertw(np.exp(z))\n"
+    "(0.09660158892806493+1.4937828458191993j)\n"
+    "\n"
+    ">>> z = 0.5 + 4j\n"
+    ">>> wrightomega(z)\n"
+    "(-0.3362123489037213+2.282986001579032j)\n"
+    ">>> lambertw(np.exp(z), k=1)\n"
+    "(-0.33621234890372115+2.282986001579032j)")
+ufunc_wrightomega_loops[0] = <np.PyUFuncGenericFunction>loop_d_d__As_f_f
+ufunc_wrightomega_loops[1] = <np.PyUFuncGenericFunction>loop_d_d__As_d_d
+ufunc_wrightomega_loops[2] = <np.PyUFuncGenericFunction>loop_D_D__As_F_F
+ufunc_wrightomega_loops[3] = <np.PyUFuncGenericFunction>loop_D_D__As_D_D
+ufunc_wrightomega_types[0] = <char>NPY_FLOAT
+ufunc_wrightomega_types[1] = <char>NPY_FLOAT
+ufunc_wrightomega_types[2] = <char>NPY_DOUBLE
+ufunc_wrightomega_types[3] = <char>NPY_DOUBLE
+ufunc_wrightomega_types[4] = <char>NPY_CFLOAT
+ufunc_wrightomega_types[5] = <char>NPY_CFLOAT
+ufunc_wrightomega_types[6] = <char>NPY_CDOUBLE
+ufunc_wrightomega_types[7] = <char>NPY_CDOUBLE
+ufunc_wrightomega_ptr[2*0] = <void*>scipy.special._ufuncs_cxx._export_wrightomega_real
+ufunc_wrightomega_ptr[2*0+1] = <void*>(<char*>"wrightomega")
+ufunc_wrightomega_ptr[2*1] = <void*>scipy.special._ufuncs_cxx._export_wrightomega_real
+ufunc_wrightomega_ptr[2*1+1] = <void*>(<char*>"wrightomega")
+ufunc_wrightomega_ptr[2*2] = <void*>scipy.special._ufuncs_cxx._export_wrightomega
+ufunc_wrightomega_ptr[2*2+1] = <void*>(<char*>"wrightomega")
+ufunc_wrightomega_ptr[2*3] = <void*>scipy.special._ufuncs_cxx._export_wrightomega
+ufunc_wrightomega_ptr[2*3+1] = <void*>(<char*>"wrightomega")
+ufunc_wrightomega_data[0] = &ufunc_wrightomega_ptr[2*0]
+ufunc_wrightomega_data[1] = &ufunc_wrightomega_ptr[2*1]
+ufunc_wrightomega_data[2] = &ufunc_wrightomega_ptr[2*2]
+ufunc_wrightomega_data[3] = &ufunc_wrightomega_ptr[2*3]
+wrightomega = np.PyUFunc_FromFuncAndData(ufunc_wrightomega_loops, ufunc_wrightomega_data, ufunc_wrightomega_types, 4, 1, 1, 0, 'wrightomega', ufunc_wrightomega_doc, 0)
+
+cdef np.PyUFuncGenericFunction ufunc_yn_loops[3]
+cdef void *ufunc_yn_ptr[6]
+cdef void *ufunc_yn_data[3]
+cdef char ufunc_yn_types[9]
+cdef char *ufunc_yn_doc = (
+    "yn(n, x, out=None)\n"
+    "\n"
+    "Bessel function of the second kind of integer order and real argument.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "n : array_like\n"
+    "    Order (integer).\n"
+    "x : array_like\n"
+    "    Argument (float).\n"
+    "out : ndarray, optional\n"
+    "    Optional output array for the function results\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "Y : scalar or ndarray\n"
+    "    Value of the Bessel function, :math:`Y_n(x)`.\n"
+    "\n"
+    "See Also\n"
+    "--------\n"
+    "yv : For real order and real or complex argument.\n"
+    "y0 : faster implementation of this function for order 0\n"
+    "y1 : faster implementation of this function for order 1\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "Wrapper for the Cephes [1]_ routine `yn`.\n"
+    "\n"
+    "The function is evaluated by forward recurrence on `n`, starting with\n"
+    "values computed by the Cephes routines `y0` and `y1`. If ``n = 0`` or 1,\n"
+    "the routine for `y0` or `y1` is called directly.\n"
+    "\n"
+    "References\n"
+    "----------\n"
+    ".. [1] Cephes Mathematical Functions Library,\n"
+    "       http://www.netlib.org/cephes/\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    "Evaluate the function of order 0 at one point.\n"
+    "\n"
+    ">>> from scipy.special import yn\n"
+    ">>> yn(0, 1.)\n"
+    "0.08825696421567697\n"
+    "\n"
+    "Evaluate the function at one point for different orders.\n"
+    "\n"
+    ">>> yn(0, 1.), yn(1, 1.), yn(2, 1.)\n"
+    "(0.08825696421567697, -0.7812128213002888, -1.6506826068162546)\n"
+    "\n"
+    "The evaluation for different orders can be carried out in one call by\n"
+    "providing a list or NumPy array as argument for the `n` parameter:\n"
+    "\n"
+    ">>> yn([0, 1, 2], 1.)\n"
+    "array([ 0.08825696, -0.78121282, -1.65068261])\n"
+    "\n"
+    "Evaluate the function at several points for order 0 by providing an\n"
+    "array for `x`.\n"
+    "\n"
+    ">>> import numpy as np\n"
+    ">>> points = np.array([0.5, 3., 8.])\n"
+    ">>> yn(0, points)\n"
+    "array([-0.44451873,  0.37685001,  0.22352149])\n"
+    "\n"
+    "If `x` is an array, the order parameter `n` must be broadcastable to\n"
+    "the correct shape if different orders shall be computed in one call.\n"
+    "To calculate the orders 0 and 1 for a 1D array:\n"
+    "\n"
+    ">>> orders = np.array([[0], [1]])\n"
+    ">>> orders.shape\n"
+    "(2, 1)\n"
+    "\n"
+    ">>> yn(orders, points)\n"
+    "array([[-0.44451873,  0.37685001,  0.22352149],\n"
+    "       [-1.47147239,  0.32467442, -0.15806046]])\n"
+    "\n"
+    "Plot the functions of order 0 to 3 from 0 to 10.\n"
+    "\n"
+    ">>> import matplotlib.pyplot as plt\n"
+    ">>> fig, ax = plt.subplots()\n"
+    ">>> x = np.linspace(0., 10., 1000)\n"
+    ">>> for i in range(4):\n"
+    "...     ax.plot(x, yn(i, x), label=f'$Y_{i!r}$')\n"
+    ">>> ax.set_ylim(-3, 1)\n"
+    ">>> ax.legend()\n"
+    ">>> plt.show()")
+ufunc_yn_loops[0] = <np.PyUFuncGenericFunction>loop_d_pd__As_pd_d
+ufunc_yn_loops[1] = <np.PyUFuncGenericFunction>loop_d_dd__As_ff_f
+ufunc_yn_loops[2] = <np.PyUFuncGenericFunction>loop_d_dd__As_dd_d
+ufunc_yn_types[0] = <char>NPY_INTP
+ufunc_yn_types[1] = <char>NPY_DOUBLE
+ufunc_yn_types[2] = <char>NPY_DOUBLE
+ufunc_yn_types[3] = <char>NPY_FLOAT
+ufunc_yn_types[4] = <char>NPY_FLOAT
+ufunc_yn_types[5] = <char>NPY_FLOAT
+ufunc_yn_types[6] = <char>NPY_DOUBLE
+ufunc_yn_types[7] = <char>NPY_DOUBLE
+ufunc_yn_types[8] = <char>NPY_DOUBLE
+ufunc_yn_ptr[2*0] = <void*>_func_cephes_yn_wrap
+ufunc_yn_ptr[2*0+1] = <void*>(<char*>"yn")
+ufunc_yn_ptr[2*1] = <void*>_func_yn_unsafe
+ufunc_yn_ptr[2*1+1] = <void*>(<char*>"yn")
+ufunc_yn_ptr[2*2] = <void*>_func_yn_unsafe
+ufunc_yn_ptr[2*2+1] = <void*>(<char*>"yn")
+ufunc_yn_data[0] = &ufunc_yn_ptr[2*0]
+ufunc_yn_data[1] = &ufunc_yn_ptr[2*1]
+ufunc_yn_data[2] = &ufunc_yn_ptr[2*2]
+yn = np.PyUFunc_FromFuncAndData(ufunc_yn_loops, ufunc_yn_data, ufunc_yn_types, 3, 2, 1, 0, 'yn', ufunc_yn_doc, 0)
+
+from ._special_ufuncs import (_cospi, _gen_harmonic, _lambertw, _normalized_gen_harmonic, _scaled_exp1, _sinpi, _spherical_jn, _spherical_jn_d, _spherical_yn, _spherical_yn_d, _spherical_in, _spherical_in_d, _spherical_kn, _spherical_kn_d, airy, airye, bei, beip, ber, berp, binom, exp1, expi, expit, exprel, gamma, gammaln, hankel1, hankel1e, hankel2, hankel2e, hyp2f1, it2i0k0, it2j0y0, it2struve0, itairy, iti0k0, itj0y0, itmodstruve0, itstruve0, iv, _iv_ratio, _iv_ratio_c, ive, jv, jve, kei, keip, kelvin, ker, kerp, kv, kve, log_expit, log_wright_bessel, loggamma, logit, mathieu_a, mathieu_b, mathieu_cem, mathieu_modcem1, mathieu_modcem2, mathieu_modsem1, mathieu_modsem2, mathieu_sem, modfresnelm, modfresnelp, obl_ang1, obl_ang1_cv, obl_cv, obl_rad1, obl_rad1_cv, obl_rad2, obl_rad2_cv, pbdv, pbvv, pbwa, pro_ang1, pro_ang1_cv, pro_cv, pro_rad1, pro_rad1_cv, pro_rad2, pro_rad2_cv, psi, rgamma, wright_bessel, yv, yve, zetac, _zeta, sindg, cosdg, tandg, cotdg, i0, i0e, i1, i1e, k0, k0e, k1, k1e, y0, y1, j0, j1, struve, modstruve, beta, betaln, besselpoly, gammaln, gammasgn, cbrt, radian, cosm1, gammainc, gammaincinv, gammaincc, gammainccinv, fresnel, ellipe, ellipeinc, ellipk, ellipkinc, ellipkm1, ellipj, _riemann_zeta, erf, erfc, erfcx, erfi, voigt_profile, wofz, dawsn, ndtr, log_ndtr, exp2, exp10, expm1, log1p, xlogy, xlog1py, _log1pmx, _log1mexp)
+
+#
+# Aliases
+#
+jn = jv
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_ufuncs_cxx.pxd b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_ufuncs_cxx.pxd
new file mode 100644
index 0000000000000000000000000000000000000000..1ffce7aa3b649c4600bb50d2f5fe1ddac4d46096
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_ufuncs_cxx.pxd
@@ -0,0 +1,164 @@
+from . cimport sf_error
+cdef void _set_action(sf_error.sf_error_t, sf_error.sf_action_t) noexcept nogil
+cdef void *_export_beta_pdf_float
+cdef void *_export_beta_pdf_double
+cdef void *_export_beta_ppf_float
+cdef void *_export_beta_ppf_double
+cdef void *_export_binom_cdf_float
+cdef void *_export_binom_cdf_double
+cdef void *_export_binom_isf_float
+cdef void *_export_binom_isf_double
+cdef void *_export_binom_pmf_float
+cdef void *_export_binom_pmf_double
+cdef void *_export_binom_ppf_float
+cdef void *_export_binom_ppf_double
+cdef void *_export_binom_sf_float
+cdef void *_export_binom_sf_double
+cdef void *_export_cauchy_isf_float
+cdef void *_export_cauchy_isf_double
+cdef void *_export_cauchy_ppf_float
+cdef void *_export_cauchy_ppf_double
+cdef void *_export_hypergeom_cdf_float
+cdef void *_export_hypergeom_cdf_double
+cdef void *_export_hypergeom_mean_float
+cdef void *_export_hypergeom_mean_double
+cdef void *_export_hypergeom_pmf_float
+cdef void *_export_hypergeom_pmf_double
+cdef void *_export_hypergeom_sf_float
+cdef void *_export_hypergeom_sf_double
+cdef void *_export_hypergeom_skewness_float
+cdef void *_export_hypergeom_skewness_double
+cdef void *_export_hypergeom_variance_float
+cdef void *_export_hypergeom_variance_double
+cdef void *_export_invgauss_isf_float
+cdef void *_export_invgauss_isf_double
+cdef void *_export_invgauss_ppf_float
+cdef void *_export_invgauss_ppf_double
+cdef void *_export_landau_cdf_float
+cdef void *_export_landau_cdf_double
+cdef void *_export_landau_isf_float
+cdef void *_export_landau_isf_double
+cdef void *_export_landau_pdf_float
+cdef void *_export_landau_pdf_double
+cdef void *_export_landau_ppf_float
+cdef void *_export_landau_ppf_double
+cdef void *_export_landau_sf_float
+cdef void *_export_landau_sf_double
+cdef void *_export_nbinom_cdf_float
+cdef void *_export_nbinom_cdf_double
+cdef void *_export_nbinom_isf_float
+cdef void *_export_nbinom_isf_double
+cdef void *_export_nbinom_kurtosis_excess_float
+cdef void *_export_nbinom_kurtosis_excess_double
+cdef void *_export_nbinom_mean_float
+cdef void *_export_nbinom_mean_double
+cdef void *_export_nbinom_pmf_float
+cdef void *_export_nbinom_pmf_double
+cdef void *_export_nbinom_ppf_float
+cdef void *_export_nbinom_ppf_double
+cdef void *_export_nbinom_sf_float
+cdef void *_export_nbinom_sf_double
+cdef void *_export_nbinom_skewness_float
+cdef void *_export_nbinom_skewness_double
+cdef void *_export_nbinom_variance_float
+cdef void *_export_nbinom_variance_double
+cdef void *_export_ncf_isf_float
+cdef void *_export_ncf_isf_double
+cdef void *_export_ncf_kurtosis_excess_float
+cdef void *_export_ncf_kurtosis_excess_double
+cdef void *_export_ncf_mean_float
+cdef void *_export_ncf_mean_double
+cdef void *_export_ncf_pdf_float
+cdef void *_export_ncf_pdf_double
+cdef void *_export_ncf_sf_float
+cdef void *_export_ncf_sf_double
+cdef void *_export_ncf_skewness_float
+cdef void *_export_ncf_skewness_double
+cdef void *_export_ncf_variance_float
+cdef void *_export_ncf_variance_double
+cdef void *_export_nct_isf_float
+cdef void *_export_nct_isf_double
+cdef void *_export_nct_kurtosis_excess_float
+cdef void *_export_nct_kurtosis_excess_double
+cdef void *_export_nct_mean_float
+cdef void *_export_nct_mean_double
+cdef void *_export_nct_pdf_float
+cdef void *_export_nct_pdf_double
+cdef void *_export_nct_sf_float
+cdef void *_export_nct_sf_double
+cdef void *_export_nct_skewness_float
+cdef void *_export_nct_skewness_double
+cdef void *_export_nct_variance_float
+cdef void *_export_nct_variance_double
+cdef void *_export_ncx2_isf_float
+cdef void *_export_ncx2_isf_double
+cdef void *_export_ncx2_pdf_float
+cdef void *_export_ncx2_pdf_double
+cdef void *_export_ncx2_sf_float
+cdef void *_export_ncx2_sf_double
+cdef void *_export_skewnorm_cdf_float
+cdef void *_export_skewnorm_cdf_double
+cdef void *_export_skewnorm_isf_float
+cdef void *_export_skewnorm_isf_double
+cdef void *_export_skewnorm_ppf_float
+cdef void *_export_skewnorm_ppf_double
+cdef void *_export__stirling2_inexact
+cdef void *_export_ibeta_float
+cdef void *_export_ibeta_double
+cdef void *_export_ibetac_float
+cdef void *_export_ibetac_double
+cdef void *_export_ibetac_inv_float
+cdef void *_export_ibetac_inv_double
+cdef void *_export_ibeta_inv_float
+cdef void *_export_ibeta_inv_double
+cdef void *_export_ibeta_inva_float
+cdef void *_export_ibeta_inva_double
+cdef void *_export_ibeta_invb_float
+cdef void *_export_ibeta_invb_double
+cdef void *_export_chdtriv_float
+cdef void *_export_chdtriv_double
+cdef void *_export_ncx2_cdf_float
+cdef void *_export_ncx2_cdf_double
+cdef void *_export_ncx2_find_degrees_of_freedom_float
+cdef void *_export_ncx2_find_degrees_of_freedom_double
+cdef void *_export_ncx2_find_noncentrality_float
+cdef void *_export_ncx2_find_noncentrality_double
+cdef void *_export_ncx2_ppf_float
+cdef void *_export_ncx2_ppf_double
+cdef void *_export_fellint_RC
+cdef void *_export_cellint_RC
+cdef void *_export_fellint_RD
+cdef void *_export_cellint_RD
+cdef void *_export_fellint_RF
+cdef void *_export_cellint_RF
+cdef void *_export_fellint_RG
+cdef void *_export_cellint_RG
+cdef void *_export_fellint_RJ
+cdef void *_export_cellint_RJ
+cdef void *_export_erfinv_float
+cdef void *_export_erfinv_double
+cdef void *_export_f_cdf_float
+cdef void *_export_f_cdf_double
+cdef void *_export_f_sf_float
+cdef void *_export_f_sf_double
+cdef void *_export_f_ppf_float
+cdef void *_export_f_ppf_double
+cdef void *_export_hyp1f1_double
+cdef void *_export_ncf_cdf_float
+cdef void *_export_ncf_cdf_double
+cdef void *_export_ncf_ppf_float
+cdef void *_export_ncf_ppf_double
+cdef void *_export_nct_cdf_float
+cdef void *_export_nct_cdf_double
+cdef void *_export_nct_ppf_float
+cdef void *_export_nct_ppf_double
+cdef void *_export_pdtrik_float
+cdef void *_export_pdtrik_double
+cdef void *_export_powm1_float
+cdef void *_export_powm1_double
+cdef void *_export_t_cdf_float
+cdef void *_export_t_cdf_double
+cdef void *_export_t_ppf_float
+cdef void *_export_t_ppf_double
+cdef void *_export_wrightomega
+cdef void *_export_wrightomega_real
\ No newline at end of file
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_ufuncs_cxx.pyx b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_ufuncs_cxx.pyx
new file mode 100644
index 0000000000000000000000000000000000000000..8d9c6f860befef1a7d80313e00fdf5b1969c664d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_ufuncs_cxx.pyx
@@ -0,0 +1,493 @@
+# This file is automatically generated by _generate_pyx.py.
+# Do not edit manually!
+
+from libc.math cimport NAN
+
+include "_ufuncs_extra_code_common.pxi"
+
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_beta_pdf_float "beta_pdf_float"(float, float, float) noexcept nogil
+cdef void *_export_beta_pdf_float = <void*>_func_beta_pdf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_beta_pdf_double "beta_pdf_double"(double, double, double) noexcept nogil
+cdef void *_export_beta_pdf_double = <void*>_func_beta_pdf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_beta_ppf_float "beta_ppf_float"(float, float, float) noexcept nogil
+cdef void *_export_beta_ppf_float = <void*>_func_beta_ppf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_beta_ppf_double "beta_ppf_double"(double, double, double) noexcept nogil
+cdef void *_export_beta_ppf_double = <void*>_func_beta_ppf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_binom_cdf_float "binom_cdf_float"(float, float, float) noexcept nogil
+cdef void *_export_binom_cdf_float = <void*>_func_binom_cdf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_binom_cdf_double "binom_cdf_double"(double, double, double) noexcept nogil
+cdef void *_export_binom_cdf_double = <void*>_func_binom_cdf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_binom_isf_float "binom_isf_float"(float, float, float) noexcept nogil
+cdef void *_export_binom_isf_float = <void*>_func_binom_isf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_binom_isf_double "binom_isf_double"(double, double, double) noexcept nogil
+cdef void *_export_binom_isf_double = <void*>_func_binom_isf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_binom_pmf_float "binom_pmf_float"(float, float, float) noexcept nogil
+cdef void *_export_binom_pmf_float = <void*>_func_binom_pmf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_binom_pmf_double "binom_pmf_double"(double, double, double) noexcept nogil
+cdef void *_export_binom_pmf_double = <void*>_func_binom_pmf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_binom_ppf_float "binom_ppf_float"(float, float, float) noexcept nogil
+cdef void *_export_binom_ppf_float = <void*>_func_binom_ppf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_binom_ppf_double "binom_ppf_double"(double, double, double) noexcept nogil
+cdef void *_export_binom_ppf_double = <void*>_func_binom_ppf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_binom_sf_float "binom_sf_float"(float, float, float) noexcept nogil
+cdef void *_export_binom_sf_float = <void*>_func_binom_sf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_binom_sf_double "binom_sf_double"(double, double, double) noexcept nogil
+cdef void *_export_binom_sf_double = <void*>_func_binom_sf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_cauchy_isf_float "cauchy_isf_float"(float, float, float) noexcept nogil
+cdef void *_export_cauchy_isf_float = <void*>_func_cauchy_isf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_cauchy_isf_double "cauchy_isf_double"(double, double, double) noexcept nogil
+cdef void *_export_cauchy_isf_double = <void*>_func_cauchy_isf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_cauchy_ppf_float "cauchy_ppf_float"(float, float, float) noexcept nogil
+cdef void *_export_cauchy_ppf_float = <void*>_func_cauchy_ppf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_cauchy_ppf_double "cauchy_ppf_double"(double, double, double) noexcept nogil
+cdef void *_export_cauchy_ppf_double = <void*>_func_cauchy_ppf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_hypergeom_cdf_float "hypergeom_cdf_float"(float, float, float, float) noexcept nogil
+cdef void *_export_hypergeom_cdf_float = <void*>_func_hypergeom_cdf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_hypergeom_cdf_double "hypergeom_cdf_double"(double, double, double, double) noexcept nogil
+cdef void *_export_hypergeom_cdf_double = <void*>_func_hypergeom_cdf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_hypergeom_mean_float "hypergeom_mean_float"(float, float, float) noexcept nogil
+cdef void *_export_hypergeom_mean_float = <void*>_func_hypergeom_mean_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_hypergeom_mean_double "hypergeom_mean_double"(double, double, double) noexcept nogil
+cdef void *_export_hypergeom_mean_double = <void*>_func_hypergeom_mean_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_hypergeom_pmf_float "hypergeom_pmf_float"(float, float, float, float) noexcept nogil
+cdef void *_export_hypergeom_pmf_float = <void*>_func_hypergeom_pmf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_hypergeom_pmf_double "hypergeom_pmf_double"(double, double, double, double) noexcept nogil
+cdef void *_export_hypergeom_pmf_double = <void*>_func_hypergeom_pmf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_hypergeom_sf_float "hypergeom_sf_float"(float, float, float, float) noexcept nogil
+cdef void *_export_hypergeom_sf_float = <void*>_func_hypergeom_sf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_hypergeom_sf_double "hypergeom_sf_double"(double, double, double, double) noexcept nogil
+cdef void *_export_hypergeom_sf_double = <void*>_func_hypergeom_sf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_hypergeom_skewness_float "hypergeom_skewness_float"(float, float, float) noexcept nogil
+cdef void *_export_hypergeom_skewness_float = <void*>_func_hypergeom_skewness_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_hypergeom_skewness_double "hypergeom_skewness_double"(double, double, double) noexcept nogil
+cdef void *_export_hypergeom_skewness_double = <void*>_func_hypergeom_skewness_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_hypergeom_variance_float "hypergeom_variance_float"(float, float, float) noexcept nogil
+cdef void *_export_hypergeom_variance_float = <void*>_func_hypergeom_variance_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_hypergeom_variance_double "hypergeom_variance_double"(double, double, double) noexcept nogil
+cdef void *_export_hypergeom_variance_double = <void*>_func_hypergeom_variance_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_invgauss_isf_float "invgauss_isf_float"(float, float, float) noexcept nogil
+cdef void *_export_invgauss_isf_float = <void*>_func_invgauss_isf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_invgauss_isf_double "invgauss_isf_double"(double, double, double) noexcept nogil
+cdef void *_export_invgauss_isf_double = <void*>_func_invgauss_isf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_invgauss_ppf_float "invgauss_ppf_float"(float, float, float) noexcept nogil
+cdef void *_export_invgauss_ppf_float = <void*>_func_invgauss_ppf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_invgauss_ppf_double "invgauss_ppf_double"(double, double, double) noexcept nogil
+cdef void *_export_invgauss_ppf_double = <void*>_func_invgauss_ppf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_landau_cdf_float "landau_cdf_float"(float, float, float) noexcept nogil
+cdef void *_export_landau_cdf_float = <void*>_func_landau_cdf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_landau_cdf_double "landau_cdf_double"(double, double, double) noexcept nogil
+cdef void *_export_landau_cdf_double = <void*>_func_landau_cdf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_landau_isf_float "landau_isf_float"(float, float, float) noexcept nogil
+cdef void *_export_landau_isf_float = <void*>_func_landau_isf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_landau_isf_double "landau_isf_double"(double, double, double) noexcept nogil
+cdef void *_export_landau_isf_double = <void*>_func_landau_isf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_landau_pdf_float "landau_pdf_float"(float, float, float) noexcept nogil
+cdef void *_export_landau_pdf_float = <void*>_func_landau_pdf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_landau_pdf_double "landau_pdf_double"(double, double, double) noexcept nogil
+cdef void *_export_landau_pdf_double = <void*>_func_landau_pdf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_landau_ppf_float "landau_ppf_float"(float, float, float) noexcept nogil
+cdef void *_export_landau_ppf_float = <void*>_func_landau_ppf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_landau_ppf_double "landau_ppf_double"(double, double, double) noexcept nogil
+cdef void *_export_landau_ppf_double = <void*>_func_landau_ppf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_landau_sf_float "landau_sf_float"(float, float, float) noexcept nogil
+cdef void *_export_landau_sf_float = <void*>_func_landau_sf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_landau_sf_double "landau_sf_double"(double, double, double) noexcept nogil
+cdef void *_export_landau_sf_double = <void*>_func_landau_sf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_nbinom_cdf_float "nbinom_cdf_float"(float, float, float) noexcept nogil
+cdef void *_export_nbinom_cdf_float = <void*>_func_nbinom_cdf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_nbinom_cdf_double "nbinom_cdf_double"(double, double, double) noexcept nogil
+cdef void *_export_nbinom_cdf_double = <void*>_func_nbinom_cdf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_nbinom_isf_float "nbinom_isf_float"(float, float, float) noexcept nogil
+cdef void *_export_nbinom_isf_float = <void*>_func_nbinom_isf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_nbinom_isf_double "nbinom_isf_double"(double, double, double) noexcept nogil
+cdef void *_export_nbinom_isf_double = <void*>_func_nbinom_isf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_nbinom_kurtosis_excess_float "nbinom_kurtosis_excess_float"(float, float) noexcept nogil
+cdef void *_export_nbinom_kurtosis_excess_float = <void*>_func_nbinom_kurtosis_excess_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_nbinom_kurtosis_excess_double "nbinom_kurtosis_excess_double"(double, double) noexcept nogil
+cdef void *_export_nbinom_kurtosis_excess_double = <void*>_func_nbinom_kurtosis_excess_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_nbinom_mean_float "nbinom_mean_float"(float, float) noexcept nogil
+cdef void *_export_nbinom_mean_float = <void*>_func_nbinom_mean_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_nbinom_mean_double "nbinom_mean_double"(double, double) noexcept nogil
+cdef void *_export_nbinom_mean_double = <void*>_func_nbinom_mean_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_nbinom_pmf_float "nbinom_pmf_float"(float, float, float) noexcept nogil
+cdef void *_export_nbinom_pmf_float = <void*>_func_nbinom_pmf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_nbinom_pmf_double "nbinom_pmf_double"(double, double, double) noexcept nogil
+cdef void *_export_nbinom_pmf_double = <void*>_func_nbinom_pmf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_nbinom_ppf_float "nbinom_ppf_float"(float, float, float) noexcept nogil
+cdef void *_export_nbinom_ppf_float = <void*>_func_nbinom_ppf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_nbinom_ppf_double "nbinom_ppf_double"(double, double, double) noexcept nogil
+cdef void *_export_nbinom_ppf_double = <void*>_func_nbinom_ppf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_nbinom_sf_float "nbinom_sf_float"(float, float, float) noexcept nogil
+cdef void *_export_nbinom_sf_float = <void*>_func_nbinom_sf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_nbinom_sf_double "nbinom_sf_double"(double, double, double) noexcept nogil
+cdef void *_export_nbinom_sf_double = <void*>_func_nbinom_sf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_nbinom_skewness_float "nbinom_skewness_float"(float, float) noexcept nogil
+cdef void *_export_nbinom_skewness_float = <void*>_func_nbinom_skewness_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_nbinom_skewness_double "nbinom_skewness_double"(double, double) noexcept nogil
+cdef void *_export_nbinom_skewness_double = <void*>_func_nbinom_skewness_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_nbinom_variance_float "nbinom_variance_float"(float, float) noexcept nogil
+cdef void *_export_nbinom_variance_float = <void*>_func_nbinom_variance_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_nbinom_variance_double "nbinom_variance_double"(double, double) noexcept nogil
+cdef void *_export_nbinom_variance_double = <void*>_func_nbinom_variance_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_ncf_isf_float "ncf_isf_float"(float, float, float, float) noexcept nogil
+cdef void *_export_ncf_isf_float = <void*>_func_ncf_isf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_ncf_isf_double "ncf_isf_double"(double, double, double, double) noexcept nogil
+cdef void *_export_ncf_isf_double = <void*>_func_ncf_isf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_ncf_kurtosis_excess_float "ncf_kurtosis_excess_float"(float, float, float) noexcept nogil
+cdef void *_export_ncf_kurtosis_excess_float = <void*>_func_ncf_kurtosis_excess_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_ncf_kurtosis_excess_double "ncf_kurtosis_excess_double"(double, double, double) noexcept nogil
+cdef void *_export_ncf_kurtosis_excess_double = <void*>_func_ncf_kurtosis_excess_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_ncf_mean_float "ncf_mean_float"(float, float, float) noexcept nogil
+cdef void *_export_ncf_mean_float = <void*>_func_ncf_mean_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_ncf_mean_double "ncf_mean_double"(double, double, double) noexcept nogil
+cdef void *_export_ncf_mean_double = <void*>_func_ncf_mean_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_ncf_pdf_float "ncf_pdf_float"(float, float, float, float) noexcept nogil
+cdef void *_export_ncf_pdf_float = <void*>_func_ncf_pdf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_ncf_pdf_double "ncf_pdf_double"(double, double, double, double) noexcept nogil
+cdef void *_export_ncf_pdf_double = <void*>_func_ncf_pdf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_ncf_sf_float "ncf_sf_float"(float, float, float, float) noexcept nogil
+cdef void *_export_ncf_sf_float = <void*>_func_ncf_sf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_ncf_sf_double "ncf_sf_double"(double, double, double, double) noexcept nogil
+cdef void *_export_ncf_sf_double = <void*>_func_ncf_sf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_ncf_skewness_float "ncf_skewness_float"(float, float, float) noexcept nogil
+cdef void *_export_ncf_skewness_float = <void*>_func_ncf_skewness_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_ncf_skewness_double "ncf_skewness_double"(double, double, double) noexcept nogil
+cdef void *_export_ncf_skewness_double = <void*>_func_ncf_skewness_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_ncf_variance_float "ncf_variance_float"(float, float, float) noexcept nogil
+cdef void *_export_ncf_variance_float = <void*>_func_ncf_variance_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_ncf_variance_double "ncf_variance_double"(double, double, double) noexcept nogil
+cdef void *_export_ncf_variance_double = <void*>_func_ncf_variance_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_nct_isf_float "nct_isf_float"(float, float, float) noexcept nogil
+cdef void *_export_nct_isf_float = <void*>_func_nct_isf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_nct_isf_double "nct_isf_double"(double, double, double) noexcept nogil
+cdef void *_export_nct_isf_double = <void*>_func_nct_isf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_nct_kurtosis_excess_float "nct_kurtosis_excess_float"(float, float) noexcept nogil
+cdef void *_export_nct_kurtosis_excess_float = <void*>_func_nct_kurtosis_excess_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_nct_kurtosis_excess_double "nct_kurtosis_excess_double"(double, double) noexcept nogil
+cdef void *_export_nct_kurtosis_excess_double = <void*>_func_nct_kurtosis_excess_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_nct_mean_float "nct_mean_float"(float, float) noexcept nogil
+cdef void *_export_nct_mean_float = <void*>_func_nct_mean_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_nct_mean_double "nct_mean_double"(double, double) noexcept nogil
+cdef void *_export_nct_mean_double = <void*>_func_nct_mean_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_nct_pdf_float "nct_pdf_float"(float, float, float) noexcept nogil
+cdef void *_export_nct_pdf_float = <void*>_func_nct_pdf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_nct_pdf_double "nct_pdf_double"(double, double, double) noexcept nogil
+cdef void *_export_nct_pdf_double = <void*>_func_nct_pdf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_nct_sf_float "nct_sf_float"(float, float, float) noexcept nogil
+cdef void *_export_nct_sf_float = <void*>_func_nct_sf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_nct_sf_double "nct_sf_double"(double, double, double) noexcept nogil
+cdef void *_export_nct_sf_double = <void*>_func_nct_sf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_nct_skewness_float "nct_skewness_float"(float, float) noexcept nogil
+cdef void *_export_nct_skewness_float = <void*>_func_nct_skewness_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_nct_skewness_double "nct_skewness_double"(double, double) noexcept nogil
+cdef void *_export_nct_skewness_double = <void*>_func_nct_skewness_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_nct_variance_float "nct_variance_float"(float, float) noexcept nogil
+cdef void *_export_nct_variance_float = <void*>_func_nct_variance_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_nct_variance_double "nct_variance_double"(double, double) noexcept nogil
+cdef void *_export_nct_variance_double = <void*>_func_nct_variance_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_ncx2_isf_float "ncx2_isf_float"(float, float, float) noexcept nogil
+cdef void *_export_ncx2_isf_float = <void*>_func_ncx2_isf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_ncx2_isf_double "ncx2_isf_double"(double, double, double) noexcept nogil
+cdef void *_export_ncx2_isf_double = <void*>_func_ncx2_isf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_ncx2_pdf_float "ncx2_pdf_float"(float, float, float) noexcept nogil
+cdef void *_export_ncx2_pdf_float = <void*>_func_ncx2_pdf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_ncx2_pdf_double "ncx2_pdf_double"(double, double, double) noexcept nogil
+cdef void *_export_ncx2_pdf_double = <void*>_func_ncx2_pdf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_ncx2_sf_float "ncx2_sf_float"(float, float, float) noexcept nogil
+cdef void *_export_ncx2_sf_float = <void*>_func_ncx2_sf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_ncx2_sf_double "ncx2_sf_double"(double, double, double) noexcept nogil
+cdef void *_export_ncx2_sf_double = <void*>_func_ncx2_sf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_skewnorm_cdf_float "skewnorm_cdf_float"(float, float, float, float) noexcept nogil
+cdef void *_export_skewnorm_cdf_float = <void*>_func_skewnorm_cdf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_skewnorm_cdf_double "skewnorm_cdf_double"(double, double, double, double) noexcept nogil
+cdef void *_export_skewnorm_cdf_double = <void*>_func_skewnorm_cdf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_skewnorm_isf_float "skewnorm_isf_float"(float, float, float, float) noexcept nogil
+cdef void *_export_skewnorm_isf_float = <void*>_func_skewnorm_isf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_skewnorm_isf_double "skewnorm_isf_double"(double, double, double, double) noexcept nogil
+cdef void *_export_skewnorm_isf_double = <void*>_func_skewnorm_isf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_skewnorm_ppf_float "skewnorm_ppf_float"(float, float, float, float) noexcept nogil
+cdef void *_export_skewnorm_ppf_float = <void*>_func_skewnorm_ppf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_skewnorm_ppf_double "skewnorm_ppf_double"(double, double, double, double) noexcept nogil
+cdef void *_export_skewnorm_ppf_double = <void*>_func_skewnorm_ppf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func__stirling2_inexact "_stirling2_inexact"(double, double) noexcept nogil
+cdef void *_export__stirling2_inexact = <void*>_func__stirling2_inexact
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_ibeta_float "ibeta_float"(float, float, float) noexcept nogil
+cdef void *_export_ibeta_float = <void*>_func_ibeta_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_ibeta_double "ibeta_double"(double, double, double) noexcept nogil
+cdef void *_export_ibeta_double = <void*>_func_ibeta_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_ibetac_float "ibetac_float"(float, float, float) noexcept nogil
+cdef void *_export_ibetac_float = <void*>_func_ibetac_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_ibetac_double "ibetac_double"(double, double, double) noexcept nogil
+cdef void *_export_ibetac_double = <void*>_func_ibetac_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_ibetac_inv_float "ibetac_inv_float"(float, float, float) noexcept nogil
+cdef void *_export_ibetac_inv_float = <void*>_func_ibetac_inv_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_ibetac_inv_double "ibetac_inv_double"(double, double, double) noexcept nogil
+cdef void *_export_ibetac_inv_double = <void*>_func_ibetac_inv_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_ibeta_inv_float "ibeta_inv_float"(float, float, float) noexcept nogil
+cdef void *_export_ibeta_inv_float = <void*>_func_ibeta_inv_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_ibeta_inv_double "ibeta_inv_double"(double, double, double) noexcept nogil
+cdef void *_export_ibeta_inv_double = <void*>_func_ibeta_inv_double
+cdef extern from r"_ufuncs_cxx_defs.h":  # each extern block below takes one prototype from this header
+    cdef float _func_ibeta_inva_float "ibeta_inva_float"(float, float, float) noexcept nogil  # quoted string is the C name; _func_* is the Cython-side alias
+cdef void *_export_ibeta_inva_float = <void*>_func_ibeta_inva_float  # function address stored as void*; this extern/export pair repeats for every symbol
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_ibeta_inva_double "ibeta_inva_double"(double, double, double) noexcept nogil
+cdef void *_export_ibeta_inva_double = <void*>_func_ibeta_inva_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_ibeta_invb_float "ibeta_invb_float"(float, float, float) noexcept nogil
+cdef void *_export_ibeta_invb_float = <void*>_func_ibeta_invb_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_ibeta_invb_double "ibeta_invb_double"(double, double, double) noexcept nogil
+cdef void *_export_ibeta_invb_double = <void*>_func_ibeta_invb_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_chdtriv_float "chdtriv_float"(float, float) noexcept nogil
+cdef void *_export_chdtriv_float = <void*>_func_chdtriv_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_chdtriv_double "chdtriv_double"(double, double) noexcept nogil
+cdef void *_export_chdtriv_double = <void*>_func_chdtriv_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_ncx2_cdf_float "ncx2_cdf_float"(float, float, float) noexcept nogil
+cdef void *_export_ncx2_cdf_float = <void*>_func_ncx2_cdf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_ncx2_cdf_double "ncx2_cdf_double"(double, double, double) noexcept nogil
+cdef void *_export_ncx2_cdf_double = <void*>_func_ncx2_cdf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_ncx2_find_degrees_of_freedom_float "ncx2_find_degrees_of_freedom_float"(float, float, float) noexcept nogil
+cdef void *_export_ncx2_find_degrees_of_freedom_float = <void*>_func_ncx2_find_degrees_of_freedom_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_ncx2_find_degrees_of_freedom_double "ncx2_find_degrees_of_freedom_double"(double, double, double) noexcept nogil
+cdef void *_export_ncx2_find_degrees_of_freedom_double = <void*>_func_ncx2_find_degrees_of_freedom_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_ncx2_find_noncentrality_float "ncx2_find_noncentrality_float"(float, float, float) noexcept nogil
+cdef void *_export_ncx2_find_noncentrality_float = <void*>_func_ncx2_find_noncentrality_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_ncx2_find_noncentrality_double "ncx2_find_noncentrality_double"(double, double, double) noexcept nogil
+cdef void *_export_ncx2_find_noncentrality_double = <void*>_func_ncx2_find_noncentrality_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_ncx2_ppf_float "ncx2_ppf_float"(float, float, float) noexcept nogil
+cdef void *_export_ncx2_ppf_float = <void*>_func_ncx2_ppf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_ncx2_ppf_double "ncx2_ppf_double"(double, double, double) noexcept nogil
+cdef void *_export_ncx2_ppf_double = <void*>_func_ncx2_ppf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_fellint_RC "fellint_RC"(double, double) noexcept nogil  # fellint_* are the real (double) variants
+cdef void *_export_fellint_RC = <void*>_func_fellint_RC
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double complex _func_cellint_RC "cellint_RC"(double complex, double complex) noexcept nogil  # cellint_* are the double complex variants
+cdef void *_export_cellint_RC = <void*>_func_cellint_RC
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_fellint_RD "fellint_RD"(double, double, double) noexcept nogil
+cdef void *_export_fellint_RD = <void*>_func_fellint_RD
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double complex _func_cellint_RD "cellint_RD"(double complex, double complex, double complex) noexcept nogil
+cdef void *_export_cellint_RD = <void*>_func_cellint_RD
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_fellint_RF "fellint_RF"(double, double, double) noexcept nogil
+cdef void *_export_fellint_RF = <void*>_func_fellint_RF
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double complex _func_cellint_RF "cellint_RF"(double complex, double complex, double complex) noexcept nogil
+cdef void *_export_cellint_RF = <void*>_func_cellint_RF
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_fellint_RG "fellint_RG"(double, double, double) noexcept nogil
+cdef void *_export_fellint_RG = <void*>_func_fellint_RG
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double complex _func_cellint_RG "cellint_RG"(double complex, double complex, double complex) noexcept nogil
+cdef void *_export_cellint_RG = <void*>_func_cellint_RG
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_fellint_RJ "fellint_RJ"(double, double, double, double) noexcept nogil
+cdef void *_export_fellint_RJ = <void*>_func_fellint_RJ
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double complex _func_cellint_RJ "cellint_RJ"(double complex, double complex, double complex, double complex) noexcept nogil
+cdef void *_export_cellint_RJ = <void*>_func_cellint_RJ
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_erfinv_float "erfinv_float"(float) noexcept nogil
+cdef void *_export_erfinv_float = <void*>_func_erfinv_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_erfinv_double "erfinv_double"(double) noexcept nogil
+cdef void *_export_erfinv_double = <void*>_func_erfinv_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_f_cdf_float "f_cdf_float"(float, float, float) noexcept nogil
+cdef void *_export_f_cdf_float = <void*>_func_f_cdf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_f_cdf_double "f_cdf_double"(double, double, double) noexcept nogil
+cdef void *_export_f_cdf_double = <void*>_func_f_cdf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_f_sf_float "f_sf_float"(float, float, float) noexcept nogil
+cdef void *_export_f_sf_float = <void*>_func_f_sf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_f_sf_double "f_sf_double"(double, double, double) noexcept nogil
+cdef void *_export_f_sf_double = <void*>_func_f_sf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_f_ppf_float "f_ppf_float"(float, float, float) noexcept nogil
+cdef void *_export_f_ppf_float = <void*>_func_f_ppf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_f_ppf_double "f_ppf_double"(double, double, double) noexcept nogil
+cdef void *_export_f_ppf_double = <void*>_func_f_ppf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_hyp1f1_double "hyp1f1_double"(double, double, double) noexcept nogil  # no hyp1f1_float counterpart is declared in this part of the file
+cdef void *_export_hyp1f1_double = <void*>_func_hyp1f1_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_ncf_cdf_float "ncf_cdf_float"(float, float, float, float) noexcept nogil
+cdef void *_export_ncf_cdf_float = <void*>_func_ncf_cdf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_ncf_cdf_double "ncf_cdf_double"(double, double, double, double) noexcept nogil
+cdef void *_export_ncf_cdf_double = <void*>_func_ncf_cdf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_ncf_ppf_float "ncf_ppf_float"(float, float, float, float) noexcept nogil
+cdef void *_export_ncf_ppf_float = <void*>_func_ncf_ppf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_ncf_ppf_double "ncf_ppf_double"(double, double, double, double) noexcept nogil
+cdef void *_export_ncf_ppf_double = <void*>_func_ncf_ppf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_nct_cdf_float "nct_cdf_float"(float, float, float) noexcept nogil
+cdef void *_export_nct_cdf_float = <void*>_func_nct_cdf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_nct_cdf_double "nct_cdf_double"(double, double, double) noexcept nogil
+cdef void *_export_nct_cdf_double = <void*>_func_nct_cdf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_nct_ppf_float "nct_ppf_float"(float, float, float) noexcept nogil
+cdef void *_export_nct_ppf_float = <void*>_func_nct_ppf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_nct_ppf_double "nct_ppf_double"(double, double, double) noexcept nogil
+cdef void *_export_nct_ppf_double = <void*>_func_nct_ppf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_pdtrik_float "pdtrik_float"(float, float) noexcept nogil
+cdef void *_export_pdtrik_float = <void*>_func_pdtrik_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_pdtrik_double "pdtrik_double"(double, double) noexcept nogil
+cdef void *_export_pdtrik_double = <void*>_func_pdtrik_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_powm1_float "powm1_float"(float, float) noexcept nogil
+cdef void *_export_powm1_float = <void*>_func_powm1_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_powm1_double "powm1_double"(double, double) noexcept nogil
+cdef void *_export_powm1_double = <void*>_func_powm1_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_t_cdf_float "t_cdf_float"(float, float) noexcept nogil
+cdef void *_export_t_cdf_float = <void*>_func_t_cdf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_t_cdf_double "t_cdf_double"(double, double) noexcept nogil
+cdef void *_export_t_cdf_double = <void*>_func_t_cdf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef float _func_t_ppf_float "t_ppf_float"(float, float) noexcept nogil
+cdef void *_export_t_ppf_float = <void*>_func_t_ppf_float
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_t_ppf_double "t_ppf_double"(double, double) noexcept nogil
+cdef void *_export_t_ppf_double = <void*>_func_t_ppf_double
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double complex _func_wrightomega "wrightomega"(double complex) noexcept nogil  # complex-argument entry point
+cdef void *_export_wrightomega = <void*>_func_wrightomega
+cdef extern from r"_ufuncs_cxx_defs.h":
+    cdef double _func_wrightomega_real "wrightomega_real"(double) noexcept nogil  # separate entry point for real (double) arguments
+cdef void *_export_wrightomega_real = <void*>_func_wrightomega_real
\ No newline at end of file
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_ufuncs_cxx_defs.h b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_ufuncs_cxx_defs.h
new file mode 100644
index 0000000000000000000000000000000000000000..19b7dcb172283ca35a90d1225aa4047a1596bc58
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/_ufuncs_cxx_defs.h
@@ -0,0 +1,169 @@
+#ifndef UFUNCS_PROTO_H  /* include guard: everything up to the matching #endif is skipped on re-inclusion */
+#define UFUNCS_PROTO_H 1
+#include "boost_special_functions.h"  /* the npy_* types are not defined in this header; NOTE(review): presumably they arrive via this include chain -- confirm */
+npy_float beta_pdf_float(npy_float, npy_float, npy_float);  /* naming scheme: <name>_float uses npy_float, <name>_double uses npy_double */
+npy_double beta_pdf_double(npy_double, npy_double, npy_double);
+npy_float beta_ppf_float(npy_float, npy_float, npy_float);
+npy_double beta_ppf_double(npy_double, npy_double, npy_double);
+npy_float binom_cdf_float(npy_float, npy_float, npy_float);
+npy_double binom_cdf_double(npy_double, npy_double, npy_double);
+npy_float binom_isf_float(npy_float, npy_float, npy_float);
+npy_double binom_isf_double(npy_double, npy_double, npy_double);
+npy_float binom_pmf_float(npy_float, npy_float, npy_float);
+npy_double binom_pmf_double(npy_double, npy_double, npy_double);
+npy_float binom_ppf_float(npy_float, npy_float, npy_float);
+npy_double binom_ppf_double(npy_double, npy_double, npy_double);
+npy_float binom_sf_float(npy_float, npy_float, npy_float);
+npy_double binom_sf_double(npy_double, npy_double, npy_double);
+npy_float cauchy_isf_float(npy_float, npy_float, npy_float);
+npy_double cauchy_isf_double(npy_double, npy_double, npy_double);
+npy_float cauchy_ppf_float(npy_float, npy_float, npy_float);
+npy_double cauchy_ppf_double(npy_double, npy_double, npy_double);
+npy_float hypergeom_cdf_float(npy_float, npy_float, npy_float, npy_float);
+npy_double hypergeom_cdf_double(npy_double, npy_double, npy_double, npy_double);
+npy_float hypergeom_mean_float(npy_float, npy_float, npy_float);
+npy_double hypergeom_mean_double(npy_double, npy_double, npy_double);
+npy_float hypergeom_pmf_float(npy_float, npy_float, npy_float, npy_float);
+npy_double hypergeom_pmf_double(npy_double, npy_double, npy_double, npy_double);
+npy_float hypergeom_sf_float(npy_float, npy_float, npy_float, npy_float);
+npy_double hypergeom_sf_double(npy_double, npy_double, npy_double, npy_double);
+npy_float hypergeom_skewness_float(npy_float, npy_float, npy_float);
+npy_double hypergeom_skewness_double(npy_double, npy_double, npy_double);
+npy_float hypergeom_variance_float(npy_float, npy_float, npy_float);
+npy_double hypergeom_variance_double(npy_double, npy_double, npy_double);
+npy_float invgauss_isf_float(npy_float, npy_float, npy_float);
+npy_double invgauss_isf_double(npy_double, npy_double, npy_double);
+npy_float invgauss_ppf_float(npy_float, npy_float, npy_float);
+npy_double invgauss_ppf_double(npy_double, npy_double, npy_double);
+npy_float landau_cdf_float(npy_float, npy_float, npy_float);
+npy_double landau_cdf_double(npy_double, npy_double, npy_double);
+npy_float landau_isf_float(npy_float, npy_float, npy_float);
+npy_double landau_isf_double(npy_double, npy_double, npy_double);
+npy_float landau_pdf_float(npy_float, npy_float, npy_float);
+npy_double landau_pdf_double(npy_double, npy_double, npy_double);
+npy_float landau_ppf_float(npy_float, npy_float, npy_float);
+npy_double landau_ppf_double(npy_double, npy_double, npy_double);
+npy_float landau_sf_float(npy_float, npy_float, npy_float);
+npy_double landau_sf_double(npy_double, npy_double, npy_double);
+npy_float nbinom_cdf_float(npy_float, npy_float, npy_float);
+npy_double nbinom_cdf_double(npy_double, npy_double, npy_double);
+npy_float nbinom_isf_float(npy_float, npy_float, npy_float);
+npy_double nbinom_isf_double(npy_double, npy_double, npy_double);
+npy_float nbinom_kurtosis_excess_float(npy_float, npy_float);
+npy_double nbinom_kurtosis_excess_double(npy_double, npy_double);
+npy_float nbinom_mean_float(npy_float, npy_float);
+npy_double nbinom_mean_double(npy_double, npy_double);
+npy_float nbinom_pmf_float(npy_float, npy_float, npy_float);
+npy_double nbinom_pmf_double(npy_double, npy_double, npy_double);
+npy_float nbinom_ppf_float(npy_float, npy_float, npy_float);
+npy_double nbinom_ppf_double(npy_double, npy_double, npy_double);
+npy_float nbinom_sf_float(npy_float, npy_float, npy_float);
+npy_double nbinom_sf_double(npy_double, npy_double, npy_double);
+npy_float nbinom_skewness_float(npy_float, npy_float);
+npy_double nbinom_skewness_double(npy_double, npy_double);
+npy_float nbinom_variance_float(npy_float, npy_float);
+npy_double nbinom_variance_double(npy_double, npy_double);
+npy_float ncf_isf_float(npy_float, npy_float, npy_float, npy_float);
+npy_double ncf_isf_double(npy_double, npy_double, npy_double, npy_double);
+npy_float ncf_kurtosis_excess_float(npy_float, npy_float, npy_float);
+npy_double ncf_kurtosis_excess_double(npy_double, npy_double, npy_double);
+npy_float ncf_mean_float(npy_float, npy_float, npy_float);
+npy_double ncf_mean_double(npy_double, npy_double, npy_double);
+npy_float ncf_pdf_float(npy_float, npy_float, npy_float, npy_float);
+npy_double ncf_pdf_double(npy_double, npy_double, npy_double, npy_double);
+npy_float ncf_sf_float(npy_float, npy_float, npy_float, npy_float);
+npy_double ncf_sf_double(npy_double, npy_double, npy_double, npy_double);
+npy_float ncf_skewness_float(npy_float, npy_float, npy_float);
+npy_double ncf_skewness_double(npy_double, npy_double, npy_double);
+npy_float ncf_variance_float(npy_float, npy_float, npy_float);
+npy_double ncf_variance_double(npy_double, npy_double, npy_double);
+npy_float nct_isf_float(npy_float, npy_float, npy_float);
+npy_double nct_isf_double(npy_double, npy_double, npy_double);
+npy_float nct_kurtosis_excess_float(npy_float, npy_float);
+npy_double nct_kurtosis_excess_double(npy_double, npy_double);
+npy_float nct_mean_float(npy_float, npy_float);
+npy_double nct_mean_double(npy_double, npy_double);
+npy_float nct_pdf_float(npy_float, npy_float, npy_float);
+npy_double nct_pdf_double(npy_double, npy_double, npy_double);
+npy_float nct_sf_float(npy_float, npy_float, npy_float);
+npy_double nct_sf_double(npy_double, npy_double, npy_double);
+npy_float nct_skewness_float(npy_float, npy_float);
+npy_double nct_skewness_double(npy_double, npy_double);
+npy_float nct_variance_float(npy_float, npy_float);
+npy_double nct_variance_double(npy_double, npy_double);
+npy_float ncx2_isf_float(npy_float, npy_float, npy_float);
+npy_double ncx2_isf_double(npy_double, npy_double, npy_double);
+npy_float ncx2_pdf_float(npy_float, npy_float, npy_float);
+npy_double ncx2_pdf_double(npy_double, npy_double, npy_double);
+npy_float ncx2_sf_float(npy_float, npy_float, npy_float);
+npy_double ncx2_sf_double(npy_double, npy_double, npy_double);
+npy_float skewnorm_cdf_float(npy_float, npy_float, npy_float, npy_float);
+npy_double skewnorm_cdf_double(npy_double, npy_double, npy_double, npy_double);
+npy_float skewnorm_isf_float(npy_float, npy_float, npy_float, npy_float);
+npy_double skewnorm_isf_double(npy_double, npy_double, npy_double, npy_double);
+npy_float skewnorm_ppf_float(npy_float, npy_float, npy_float, npy_float);
+npy_double skewnorm_ppf_double(npy_double, npy_double, npy_double, npy_double);
+#include "stirling2.h"  /* NOTE(review): presumably backs _stirling2_inexact below -- confirm */
+npy_double _stirling2_inexact(npy_double, npy_double);
+npy_float ibeta_float(npy_float, npy_float, npy_float);
+npy_double ibeta_double(npy_double, npy_double, npy_double);
+npy_float ibetac_float(npy_float, npy_float, npy_float);
+npy_double ibetac_double(npy_double, npy_double, npy_double);
+npy_float ibetac_inv_float(npy_float, npy_float, npy_float);
+npy_double ibetac_inv_double(npy_double, npy_double, npy_double);
+npy_float ibeta_inv_float(npy_float, npy_float, npy_float);
+npy_double ibeta_inv_double(npy_double, npy_double, npy_double);
+npy_float ibeta_inva_float(npy_float, npy_float, npy_float);
+npy_double ibeta_inva_double(npy_double, npy_double, npy_double);
+npy_float ibeta_invb_float(npy_float, npy_float, npy_float);
+npy_double ibeta_invb_double(npy_double, npy_double, npy_double);
+npy_float chdtriv_float(npy_float, npy_float);
+npy_double chdtriv_double(npy_double, npy_double);
+npy_float ncx2_cdf_float(npy_float, npy_float, npy_float);
+npy_double ncx2_cdf_double(npy_double, npy_double, npy_double);
+npy_float ncx2_find_degrees_of_freedom_float(npy_float, npy_float, npy_float);
+npy_double ncx2_find_degrees_of_freedom_double(npy_double, npy_double, npy_double);
+npy_float ncx2_find_noncentrality_float(npy_float, npy_float, npy_float);
+npy_double ncx2_find_noncentrality_double(npy_double, npy_double, npy_double);
+npy_float ncx2_ppf_float(npy_float, npy_float, npy_float);
+npy_double ncx2_ppf_double(npy_double, npy_double, npy_double);
+#include "ellint_carlson_wrap.hh"  /* NOTE(review): the .hh suffix suggests a C++ header, so this file is presumably only included from C++ sources -- confirm */
+npy_double fellint_RC(npy_double, npy_double);  /* fellint_*: npy_double arguments and result */
+npy_cdouble cellint_RC(npy_cdouble, npy_cdouble);  /* cellint_*: npy_cdouble arguments and result */
+npy_double fellint_RD(npy_double, npy_double, npy_double);
+npy_cdouble cellint_RD(npy_cdouble, npy_cdouble, npy_cdouble);
+npy_double fellint_RF(npy_double, npy_double, npy_double);
+npy_cdouble cellint_RF(npy_cdouble, npy_cdouble, npy_cdouble);
+npy_double fellint_RG(npy_double, npy_double, npy_double);
+npy_cdouble cellint_RG(npy_cdouble, npy_cdouble, npy_cdouble);
+npy_double fellint_RJ(npy_double, npy_double, npy_double, npy_double);
+npy_cdouble cellint_RJ(npy_cdouble, npy_cdouble, npy_cdouble, npy_cdouble);
+npy_float erfinv_float(npy_float);
+npy_double erfinv_double(npy_double);
+npy_float f_cdf_float(npy_float, npy_float, npy_float);
+npy_double f_cdf_double(npy_double, npy_double, npy_double);
+npy_float f_sf_float(npy_float, npy_float, npy_float);
+npy_double f_sf_double(npy_double, npy_double, npy_double);
+npy_float f_ppf_float(npy_float, npy_float, npy_float);
+npy_double f_ppf_double(npy_double, npy_double, npy_double);
+npy_double hyp1f1_double(npy_double, npy_double, npy_double);  /* no hyp1f1_float prototype exists in this header */
+npy_float ncf_cdf_float(npy_float, npy_float, npy_float, npy_float);
+npy_double ncf_cdf_double(npy_double, npy_double, npy_double, npy_double);
+npy_float ncf_ppf_float(npy_float, npy_float, npy_float, npy_float);
+npy_double ncf_ppf_double(npy_double, npy_double, npy_double, npy_double);
+npy_float nct_cdf_float(npy_float, npy_float, npy_float);
+npy_double nct_cdf_double(npy_double, npy_double, npy_double);
+npy_float nct_ppf_float(npy_float, npy_float, npy_float);
+npy_double nct_ppf_double(npy_double, npy_double, npy_double);
+npy_float pdtrik_float(npy_float, npy_float);
+npy_double pdtrik_double(npy_double, npy_double);
+npy_float powm1_float(npy_float, npy_float);
+npy_double powm1_double(npy_double, npy_double);
+npy_float t_cdf_float(npy_float, npy_float);
+npy_double t_cdf_double(npy_double, npy_double);
+npy_float t_ppf_float(npy_float, npy_float);
+npy_double t_ppf_double(npy_double, npy_double);
+#include "_wright.h"  /* NOTE(review): presumably backs the two wrightomega prototypes below -- confirm */
+npy_cdouble wrightomega(npy_cdouble);
+npy_double wrightomega_real(npy_double);
+#endif  /* UFUNCS_PROTO_H */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/add_newdocs.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/add_newdocs.py
new file mode 100644
index 0000000000000000000000000000000000000000..5549717d35710d71655e42c836625cde9346bcc3
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/add_newdocs.py
@@ -0,0 +1,15 @@
+# This file is not meant for public use and will be removed in SciPy v2.0.0.
+
+from scipy._lib.deprecation import _sub_module_deprecation
+
+__all__: list[str] = []  # deliberately empty: this shim advertises no public names
+
+
+def __dir__():  # module-level hook (PEP 562): dir() on this module reports only what __all__ lists
+    return __all__  # hands back the module's list object itself, not a copy
+
+
+def __getattr__(name):  # module-level hook (PEP 562): called only when `name` is not found in the module namespace
+    return _sub_module_deprecation(sub_package="special", module="add_newdocs",
+                                   private_modules=["_add_newdocs"], all=__all__,
+                                   attribute=name)  # NOTE(review): helper presumably warns and resolves `name` via the private module -- confirm in scipy._lib.deprecation
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/basic.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/basic.py
new file mode 100644
index 0000000000000000000000000000000000000000..46c179a893c4c9022a7cdd77c8923ff5a012b0eb
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/basic.py
@@ -0,0 +1,84 @@
+# This file is not meant for public use and will be removed in SciPy v2.0.0.
+# Use the `scipy.special` namespace for importing the functions
+# included below.
+
+from scipy._lib.deprecation import _sub_module_deprecation
+
+
+__all__ = [  # noqa: F822
+    'ai_zeros',
+    'assoc_laguerre',
+    'bei_zeros',
+    'beip_zeros',
+    'ber_zeros',
+    'bernoulli',
+    'berp_zeros',
+    'bi_zeros',
+    'comb',
+    'digamma',
+    'diric',
+    'erf_zeros',
+    'euler',
+    'factorial',
+    'factorial2',
+    'factorialk',
+    'fresnel_zeros',
+    'fresnelc_zeros',
+    'fresnels_zeros',
+    'gamma',
+    'h1vp',
+    'h2vp',
+    'hankel1',
+    'hankel2',
+    'iv',
+    'ivp',
+    'jn_zeros',
+    'jnjnp_zeros',
+    'jnp_zeros',
+    'jnyn_zeros',
+    'jv',
+    'jvp',
+    'kei_zeros',
+    'keip_zeros',
+    'kelvin_zeros',
+    'ker_zeros',
+    'kerp_zeros',
+    'kv',
+    'kvp',
+    'lmbda',
+    'lqmn',
+    'lqn',
+    'mathieu_a',
+    'mathieu_b',
+    'mathieu_even_coef',
+    'mathieu_odd_coef',
+    'obl_cv_seq',
+    'pbdn_seq',
+    'pbdv_seq',
+    'pbvv_seq',
+    'perm',
+    'polygamma',
+    'pro_cv_seq',
+    'psi',
+    'riccati_jn',
+    'riccati_yn',
+    'sinc',
+    'y0_zeros',
+    'y1_zeros',
+    'y1p_zeros',
+    'yn_zeros',
+    'ynp_zeros',
+    'yv',
+    'yvp',
+    'zeta'
+]  # none of these names is defined in this module (hence the F822 suppression); lookups go through the module __getattr__
+
+
+def __dir__():  # module-level hook (PEP 562): dir() on this module reports only what __all__ lists
+    return __all__  # hands back the module's list object itself, not a copy
+
+
+def __getattr__(name):  # module-level hook (PEP 562): called only when `name` is not found in the module namespace
+    return _sub_module_deprecation(sub_package="special", module="basic",
+                                   private_modules=["_basic", "_ufuncs"], all=__all__,
+                                   attribute=name)  # NOTE(review): helper presumably warns and resolves `name` via the listed private modules -- confirm in scipy._lib.deprecation
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/cython_special.pxd b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/cython_special.pxd
new file mode 100644
index 0000000000000000000000000000000000000000..7a6fbae7a279708ab702ef544c7b17250f81f6c1
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/cython_special.pxd
@@ -0,0 +1,258 @@
+
+ctypedef fused number_t:  # fused type: a function using it is specialised once per member listed here
+    double complex
+    double
+
+cpdef number_t spherical_jn(Py_ssize_t n, number_t z, bint derivative=*) noexcept nogil  # "=*" marks an optional argument whose default lives in the implementation, not in this .pxd
+cpdef number_t spherical_yn(Py_ssize_t n, number_t z, bint derivative=*) noexcept nogil  # return type follows the type of z (both are number_t)
+cpdef number_t spherical_in(Py_ssize_t n, number_t z, bint derivative=*) noexcept nogil
+cpdef number_t spherical_kn(Py_ssize_t n, number_t z, bint derivative=*) noexcept nogil
+
+ctypedef fused Dd_number_t:  # same two members as number_t: double complex or double
+    double complex
+    double
+
+ctypedef fused df_number_t:  # real floating types only: double or float
+    double
+    float
+
+ctypedef fused dfg_number_t:  # df_number_t members plus long double
+    double
+    float
+    long double
+
+ctypedef fused dlp_number_t:  # mixes a floating type with two integer types: double, long or Py_ssize_t
+    double
+    long
+    Py_ssize_t
+
+cpdef double voigt_profile(double x0, double x1, double x2) noexcept nogil  # declarations only; every entry in this run is "noexcept nogil"
+cpdef double agm(double x0, double x1) noexcept nogil
+cdef void airy(Dd_number_t x0, Dd_number_t *y0, Dd_number_t *y1, Dd_number_t *y2, Dd_number_t *y3) noexcept nogil  # cdef + void return: NOTE(review) results are presumably written through the y* pointers -- confirm
+cdef void airye(Dd_number_t x0, Dd_number_t *y0, Dd_number_t *y1, Dd_number_t *y2, Dd_number_t *y3) noexcept nogil
+cpdef double bdtr(double x0, dlp_number_t x1, double x2) noexcept nogil  # only x1 is fused here; x0 and x2 are always double
+cpdef double bdtrc(double x0, dlp_number_t x1, double x2) noexcept nogil
+cpdef double bdtri(double x0, dlp_number_t x1, double x2) noexcept nogil
+cpdef double bdtrik(double x0, double x1, double x2) noexcept nogil
+cpdef double bdtrin(double x0, double x1, double x2) noexcept nogil
+cpdef double bei(double x0) noexcept nogil
+cpdef double beip(double x0) noexcept nogil
+cpdef double ber(double x0) noexcept nogil
+cpdef double berp(double x0) noexcept nogil
+cpdef double besselpoly(double x0, double x1, double x2) noexcept nogil
+cpdef double beta(double x0, double x1) noexcept nogil
+cpdef df_number_t betainc(df_number_t x0, df_number_t x1, df_number_t x2) noexcept nogil  # all arguments and the result share one df_number_t specialisation
+cpdef df_number_t betaincc(df_number_t x0, df_number_t x1, df_number_t x2) noexcept nogil
+cpdef df_number_t betaincinv(df_number_t x0, df_number_t x1, df_number_t x2) noexcept nogil
+cpdef df_number_t betainccinv(df_number_t x0, df_number_t x1, df_number_t x2) noexcept nogil
+cpdef double betaln(double x0, double x1) noexcept nogil
+cpdef double binom(double x0, double x1) noexcept nogil
+cpdef double boxcox(double x0, double x1) noexcept nogil
+cpdef double boxcox1p(double x0, double x1) noexcept nogil
+cpdef df_number_t btdtria(df_number_t x0, df_number_t x1, df_number_t x2) noexcept nogil
+cpdef df_number_t btdtrib(df_number_t x0, df_number_t x1, df_number_t x2) noexcept nogil
+cpdef double cbrt(double x0) noexcept nogil
+cpdef double chdtr(double x0, double x1) noexcept nogil
+cpdef double chdtrc(double x0, double x1) noexcept nogil
+cpdef double chdtri(double x0, double x1) noexcept nogil
+cpdef df_number_t chdtriv(df_number_t x0, df_number_t x1) noexcept nogil
+cpdef df_number_t chndtr(df_number_t x0, df_number_t x1, df_number_t x2) noexcept nogil
+cpdef df_number_t chndtridf(df_number_t x0, df_number_t x1, df_number_t x2) noexcept nogil
+cpdef df_number_t chndtrinc(df_number_t x0, df_number_t x1, df_number_t x2) noexcept nogil
+cpdef df_number_t chndtrix(df_number_t x0, df_number_t x1, df_number_t x2) noexcept nogil
+cpdef double cosdg(double x0) noexcept nogil
+cpdef double cosm1(double x0) noexcept nogil
+cpdef double cotdg(double x0) noexcept nogil
+cpdef Dd_number_t dawsn(Dd_number_t x0) noexcept nogil
+cpdef double ellipe(double x0) noexcept nogil
+cpdef double ellipeinc(double x0, double x1) noexcept nogil
+cdef void ellipj(double x0, double x1, double *y0, double *y1, double *y2, double *y3) noexcept nogil
+cpdef double ellipkinc(double x0, double x1) noexcept nogil
+cpdef double ellipkm1(double x0) noexcept nogil
+cpdef double ellipk(double x0) noexcept nogil
+cpdef Dd_number_t elliprc(Dd_number_t x0, Dd_number_t x1) noexcept nogil
+cpdef Dd_number_t elliprd(Dd_number_t x0, Dd_number_t x1, Dd_number_t x2) noexcept nogil
+cpdef Dd_number_t elliprf(Dd_number_t x0, Dd_number_t x1, Dd_number_t x2) noexcept nogil
+cpdef Dd_number_t elliprg(Dd_number_t x0, Dd_number_t x1, Dd_number_t x2) noexcept nogil
+cpdef Dd_number_t elliprj(Dd_number_t x0, Dd_number_t x1, Dd_number_t x2, Dd_number_t x3) noexcept nogil
+cpdef double entr(double x0) noexcept nogil
+cpdef Dd_number_t erf(Dd_number_t x0) noexcept nogil
+cpdef Dd_number_t erfc(Dd_number_t x0) noexcept nogil
+cpdef Dd_number_t erfcx(Dd_number_t x0) noexcept nogil
+cpdef Dd_number_t erfi(Dd_number_t x0) noexcept nogil
+cpdef df_number_t erfinv(df_number_t x0) noexcept nogil
+cpdef double erfcinv(double x0) noexcept nogil
+cpdef Dd_number_t eval_chebyc(dlp_number_t x0, Dd_number_t x1) noexcept nogil  # two independent fused types: x0 (dlp_number_t) and x1/result (Dd_number_t)
+cpdef Dd_number_t eval_chebys(dlp_number_t x0, Dd_number_t x1) noexcept nogil
+cpdef Dd_number_t eval_chebyt(dlp_number_t x0, Dd_number_t x1) noexcept nogil
+cpdef Dd_number_t eval_chebyu(dlp_number_t x0, Dd_number_t x1) noexcept nogil
+cpdef Dd_number_t eval_gegenbauer(dlp_number_t x0, double x1, Dd_number_t x2) noexcept nogil
+cpdef Dd_number_t eval_genlaguerre(dlp_number_t x0, double x1, Dd_number_t x2) noexcept nogil
+cpdef double eval_hermite(Py_ssize_t x0, double x1) noexcept nogil
+cpdef double eval_hermitenorm(Py_ssize_t x0, double x1) noexcept nogil
+cpdef Dd_number_t eval_jacobi(dlp_number_t x0, double x1, double x2, Dd_number_t x3) noexcept nogil
+cpdef Dd_number_t eval_laguerre(dlp_number_t x0, Dd_number_t x1) noexcept nogil
+cpdef Dd_number_t eval_legendre(dlp_number_t x0, Dd_number_t x1) noexcept nogil
+cpdef Dd_number_t eval_sh_chebyt(dlp_number_t x0, Dd_number_t x1) noexcept nogil
+cpdef Dd_number_t eval_sh_chebyu(dlp_number_t x0, Dd_number_t x1) noexcept nogil
+cpdef Dd_number_t eval_sh_jacobi(dlp_number_t x0, double x1, double x2, Dd_number_t x3) noexcept nogil
+cpdef Dd_number_t eval_sh_legendre(dlp_number_t x0, Dd_number_t x1) noexcept nogil
+cpdef Dd_number_t exp1(Dd_number_t x0) noexcept nogil
+cpdef double exp10(double x0) noexcept nogil
+cpdef double exp2(double x0) noexcept nogil
+cpdef Dd_number_t expi(Dd_number_t x0) noexcept nogil
+cpdef dfg_number_t expit(dfg_number_t x0) noexcept nogil
+cpdef Dd_number_t expm1(Dd_number_t x0) noexcept nogil
+cpdef double expn(dlp_number_t x0, double x1) noexcept nogil
+cpdef double exprel(double x0) noexcept nogil
+cpdef df_number_t fdtr(df_number_t x0, df_number_t x1, df_number_t x2) noexcept nogil
+cpdef df_number_t fdtrc(df_number_t x0, df_number_t x1, df_number_t x2) noexcept nogil
+cpdef df_number_t fdtri(df_number_t x0, df_number_t x1, df_number_t x2) noexcept nogil
+cpdef double fdtridfd(double x0, double x1, double x2) noexcept nogil
+cdef void fresnel(Dd_number_t x0, Dd_number_t *y0, Dd_number_t *y1) noexcept nogil
+cpdef Dd_number_t gamma(Dd_number_t x0) noexcept nogil
+cpdef double gammainc(double x0, double x1) noexcept nogil
+cpdef double gammaincc(double x0, double x1) noexcept nogil
+cpdef double gammainccinv(double x0, double x1) noexcept nogil
+cpdef double gammaincinv(double x0, double x1) noexcept nogil
+cpdef double gammaln(double x0) noexcept nogil
+cpdef double gammasgn(double x0) noexcept nogil
+cpdef double gdtr(double x0, double x1, double x2) noexcept nogil
+cpdef double gdtrc(double x0, double x1, double x2) noexcept nogil
+cpdef double gdtria(double x0, double x1, double x2) noexcept nogil
+cpdef double gdtrib(double x0, double x1, double x2) noexcept nogil
+cpdef double gdtrix(double x0, double x1, double x2) noexcept nogil
+cpdef double complex hankel1(double x0, double complex x1) noexcept nogil  # not fused: always double order, double complex argument and result
+cpdef double complex hankel1e(double x0, double complex x1) noexcept nogil
+cpdef double complex hankel2(double x0, double complex x1) noexcept nogil
+cpdef double complex hankel2e(double x0, double complex x1) noexcept nogil
+cpdef double huber(double x0, double x1) noexcept nogil
+cpdef Dd_number_t hyp0f1(double x0, Dd_number_t x1) noexcept nogil
+cpdef Dd_number_t hyp1f1(double x0, double x1, Dd_number_t x2) noexcept nogil
+cpdef Dd_number_t hyp2f1(double x0, double x1, double x2, Dd_number_t x3) noexcept nogil
+cpdef double hyperu(double x0, double x1, double x2) noexcept nogil
+cpdef double i0(double x0) noexcept nogil
+cpdef double i0e(double x0) noexcept nogil
+cpdef double i1(double x0) noexcept nogil
+cpdef double i1e(double x0) noexcept nogil
+cpdef double inv_boxcox(double x0, double x1) noexcept nogil
+cpdef double inv_boxcox1p(double x0, double x1) noexcept nogil
+cdef void it2i0k0(double x0, double *y0, double *y1) noexcept nogil
+cdef void it2j0y0(double x0, double *y0, double *y1) noexcept nogil
+cpdef double it2struve0(double x0) noexcept nogil
+cdef void itairy(double x0, double *y0, double *y1, double *y2, double *y3) noexcept nogil
+cdef void iti0k0(double x0, double *y0, double *y1) noexcept nogil
+cdef void itj0y0(double x0, double *y0, double *y1) noexcept nogil
+cpdef double itmodstruve0(double x0) noexcept nogil
+cpdef double itstruve0(double x0) noexcept nogil
+cpdef Dd_number_t iv(double x0, Dd_number_t x1) noexcept nogil
+cpdef Dd_number_t ive(double x0, Dd_number_t x1) noexcept nogil
+cpdef double j0(double x0) noexcept nogil
+cpdef double j1(double x0) noexcept nogil
+cpdef Dd_number_t jv(double x0, Dd_number_t x1) noexcept nogil
+cpdef Dd_number_t jve(double x0, Dd_number_t x1) noexcept nogil
+cpdef double k0(double x0) noexcept nogil
+cpdef double k0e(double x0) noexcept nogil
+cpdef double k1(double x0) noexcept nogil
+cpdef double k1e(double x0) noexcept nogil
+cpdef double kei(double x0) noexcept nogil
+cpdef double keip(double x0) noexcept nogil
+cdef void kelvin(double x0, double complex *y0, double complex *y1, double complex *y2, double complex *y3) noexcept nogil  # real input, four double complex pointer parameters
+cpdef double ker(double x0) noexcept nogil
+cpdef double kerp(double x0) noexcept nogil
+cpdef double kl_div(double x0, double x1) noexcept nogil
+cpdef double kn(dlp_number_t x0, double x1) noexcept nogil
+cpdef double kolmogi(double x0) noexcept nogil
+cpdef double kolmogorov(double x0) noexcept nogil
+cpdef Dd_number_t kv(double x0, Dd_number_t x1) noexcept nogil
+cpdef Dd_number_t kve(double x0, Dd_number_t x1) noexcept nogil
+cpdef Dd_number_t log1p(Dd_number_t x0) noexcept nogil
+cpdef dfg_number_t log_expit(dfg_number_t x0) noexcept nogil
+cpdef Dd_number_t log_ndtr(Dd_number_t x0) noexcept nogil
+cpdef Dd_number_t loggamma(Dd_number_t x0) noexcept nogil
+cpdef dfg_number_t logit(dfg_number_t x0) noexcept nogil
+cpdef double lpmv(double x0, double x1, double x2) noexcept nogil
+cpdef double mathieu_a(double x0, double x1) noexcept nogil
+cpdef double mathieu_b(double x0, double x1) noexcept nogil
+cdef void mathieu_cem(double x0, double x1, double x2, double *y0, double *y1) noexcept nogil
+cdef void mathieu_modcem1(double x0, double x1, double x2, double *y0, double *y1) noexcept nogil
+cdef void mathieu_modcem2(double x0, double x1, double x2, double *y0, double *y1) noexcept nogil
+cdef void mathieu_modsem1(double x0, double x1, double x2, double *y0, double *y1) noexcept nogil
+cdef void mathieu_modsem2(double x0, double x1, double x2, double *y0, double *y1) noexcept nogil
+cdef void mathieu_sem(double x0, double x1, double x2, double *y0, double *y1) noexcept nogil
+cdef void modfresnelm(double x0, double complex *y0, double complex *y1) noexcept nogil
+cdef void modfresnelp(double x0, double complex *y0, double complex *y1) noexcept nogil
+cpdef double modstruve(double x0, double x1) noexcept nogil
+cpdef double nbdtr(dlp_number_t x0, dlp_number_t x1, double x2) noexcept nogil  # x0 and x1 use the same fused type, so they are specialised together
+cpdef double nbdtrc(dlp_number_t x0, dlp_number_t x1, double x2) noexcept nogil
+cpdef double nbdtri(dlp_number_t x0, dlp_number_t x1, double x2) noexcept nogil
+cpdef double nbdtrik(double x0, double x1, double x2) noexcept nogil
+cpdef double nbdtrin(double x0, double x1, double x2) noexcept nogil
+cpdef df_number_t ncfdtr(df_number_t x0, df_number_t x1, df_number_t x2, df_number_t x3) noexcept nogil
+cpdef df_number_t ncfdtri(df_number_t x0, df_number_t x1, df_number_t x2, df_number_t x3) noexcept nogil
+cpdef double ncfdtridfd(double x0, double x1, double x2, double x3) noexcept nogil
+cpdef double ncfdtridfn(double x0, double x1, double x2, double x3) noexcept nogil
+cpdef double ncfdtrinc(double x0, double x1, double x2, double x3) noexcept nogil
+cpdef df_number_t nctdtr(df_number_t x0, df_number_t x1, df_number_t x2) noexcept nogil
+cpdef double nctdtridf(double x0, double x1, double x2) noexcept nogil
+cpdef double nctdtrinc(double x0, double x1, double x2) noexcept nogil
+cpdef df_number_t nctdtrit(df_number_t x0, df_number_t x1, df_number_t x2) noexcept nogil
+cpdef Dd_number_t ndtr(Dd_number_t x0) noexcept nogil
+cpdef double ndtri(double x0) noexcept nogil
+cpdef double nrdtrimn(double x0, double x1, double x2) noexcept nogil
+cpdef double nrdtrisd(double x0, double x1, double x2) noexcept nogil
+cdef void obl_ang1(double x0, double x1, double x2, double x3, double *y0, double *y1) noexcept nogil
+cdef void obl_ang1_cv(double x0, double x1, double x2, double x3, double x4, double *y0, double *y1) noexcept nogil
+cpdef double obl_cv(double x0, double x1, double x2) noexcept nogil
+cdef void obl_rad1(double x0, double x1, double x2, double x3, double *y0, double *y1) noexcept nogil
+cdef void obl_rad1_cv(double x0, double x1, double x2, double x3, double x4, double *y0, double *y1) noexcept nogil
+cdef void obl_rad2(double x0, double x1, double x2, double x3, double *y0, double *y1) noexcept nogil
+cdef void obl_rad2_cv(double x0, double x1, double x2, double x3, double x4, double *y0, double *y1) noexcept nogil
+cpdef double owens_t(double x0, double x1) noexcept nogil
+cdef void pbdv(double x0, double x1, double *y0, double *y1) noexcept nogil
+cdef void pbvv(double x0, double x1, double *y0, double *y1) noexcept nogil
+cdef void pbwa(double x0, double x1, double *y0, double *y1) noexcept nogil
+cpdef double pdtr(double x0, double x1) noexcept nogil
+cpdef double pdtrc(double x0, double x1) noexcept nogil
+cpdef double pdtri(dlp_number_t x0, double x1) noexcept nogil
+cpdef df_number_t pdtrik(df_number_t x0, df_number_t x1) noexcept nogil
+cpdef double poch(double x0, double x1) noexcept nogil
+cpdef df_number_t powm1(df_number_t x0, df_number_t x1) noexcept nogil
+cdef void pro_ang1(double x0, double x1, double x2, double x3, double *y0, double *y1) noexcept nogil
+cdef void pro_ang1_cv(double x0, double x1, double x2, double x3, double x4, double *y0, double *y1) noexcept nogil
+cpdef double pro_cv(double x0, double x1, double x2) noexcept nogil
+cdef void pro_rad1(double x0, double x1, double x2, double x3, double *y0, double *y1) noexcept nogil
+cdef void pro_rad1_cv(double x0, double x1, double x2, double x3, double x4, double *y0, double *y1) noexcept nogil
+cdef void pro_rad2(double x0, double x1, double x2, double x3, double *y0, double *y1) noexcept nogil
+cdef void pro_rad2_cv(double x0, double x1, double x2, double x3, double x4, double *y0, double *y1) noexcept nogil
+cpdef double pseudo_huber(double x0, double x1) noexcept nogil
+cpdef Dd_number_t psi(Dd_number_t x0) noexcept nogil
+cpdef double radian(double x0, double x1, double x2) noexcept nogil
+cpdef double rel_entr(double x0, double x1) noexcept nogil
+cpdef Dd_number_t rgamma(Dd_number_t x0) noexcept nogil
+cpdef double round(double x0) noexcept nogil
+cdef void shichi(Dd_number_t x0, Dd_number_t *y0, Dd_number_t *y1) noexcept nogil
+cdef void sici(Dd_number_t x0, Dd_number_t *y0, Dd_number_t *y1) noexcept nogil
+cpdef double sindg(double x0) noexcept nogil
+cpdef double smirnov(dlp_number_t x0, double x1) noexcept nogil
+cpdef double smirnovi(dlp_number_t x0, double x1) noexcept nogil
+cpdef Dd_number_t spence(Dd_number_t x0) noexcept nogil
+cpdef df_number_t stdtr(df_number_t x0, df_number_t x1) noexcept nogil
+cpdef double stdtridf(double x0, double x1) noexcept nogil
+cpdef df_number_t stdtrit(df_number_t x0, df_number_t x1) noexcept nogil
+cpdef double struve(double x0, double x1) noexcept nogil
+cpdef double tandg(double x0) noexcept nogil
+cpdef double tklmbda(double x0, double x1) noexcept nogil
+cpdef double complex wofz(double complex x0) noexcept nogil
+cpdef Dd_number_t wrightomega(Dd_number_t x0) noexcept nogil
+cpdef Dd_number_t xlog1py(Dd_number_t x0, Dd_number_t x1) noexcept nogil
+cpdef Dd_number_t xlogy(Dd_number_t x0, Dd_number_t x1) noexcept nogil
+cpdef double y0(double x0) noexcept nogil
+cpdef double y1(double x0) noexcept nogil
+cpdef double yn(dlp_number_t x0, double x1) noexcept nogil
+cpdef Dd_number_t yv(double x0, Dd_number_t x1) noexcept nogil
+cpdef Dd_number_t yve(double x0, Dd_number_t x1) noexcept nogil
+cpdef double zetac(double x0) noexcept nogil
+cpdef double wright_bessel(double x0, double x1, double x2) noexcept nogil
+cpdef double log_wright_bessel(double x0, double x1, double x2) noexcept nogil
+cpdef double ndtri_exp(double x0) noexcept nogil
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/cython_special.pyi b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/cython_special.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..024e962b10df8892631eaad20223f7fc8378ea83
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/cython_special.pyi
@@ -0,0 +1,3 @@
+from typing import Any
+
+def __getattr__(name) -> Any: ...  # catch-all stub: any attribute looked up on this module is typed as Any
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/orthogonal.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/orthogonal.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b13a08a96cb683d72a4a00d6962446e1779c88a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/orthogonal.py
@@ -0,0 +1,45 @@
+# This file is not meant for public use and will be removed in SciPy v2.0.0.
+# Use the `scipy.special` namespace for importing the functions
+# included below.
+
+from scipy._lib.deprecation import _sub_module_deprecation
+
+
+_polyfuns = ['legendre', 'chebyt', 'chebyu', 'chebyc', 'chebys',  # polynomial names; part of __all__
+             'jacobi', 'laguerre', 'genlaguerre', 'hermite',
+             'hermitenorm', 'gegenbauer', 'sh_legendre', 'sh_chebyt',
+             'sh_chebyu', 'sh_jacobi']
+
+# Correspondence between new (keys) and old (values) names of root functions
+_rootfuns_map = {'roots_legendre': 'p_roots',
+               'roots_chebyt': 't_roots',
+               'roots_chebyu': 'u_roots',
+               'roots_chebyc': 'c_roots',
+               'roots_chebys': 's_roots',
+               'roots_jacobi': 'j_roots',
+               'roots_laguerre': 'l_roots',
+               'roots_genlaguerre': 'la_roots',
+               'roots_hermite': 'h_roots',
+               'roots_hermitenorm': 'he_roots',
+               'roots_gegenbauer': 'cg_roots',
+               'roots_sh_legendre': 'ps_roots',
+               'roots_sh_chebyt': 'ts_roots',
+               'roots_sh_chebyu': 'us_roots',
+               'roots_sh_jacobi': 'js_roots'}
+
+
+__all__ = _polyfuns + list(_rootfuns_map.keys()) + [  # noqa: F822
+    'airy', 'p_roots', 't_roots', 'u_roots', 'c_roots', 's_roots',
+    'j_roots', 'l_roots', 'la_roots', 'h_roots', 'he_roots', 'cg_roots',
+    'ps_roots', 'ts_roots', 'us_roots', 'js_roots'
+]  # the old *_roots names are spelled out here; the new names come from the map's keys
+
+
+def __dir__():  # module-level hook (PEP 562): dir() on this module reports the names in __all__
+    return __all__  # the module-level list itself, not a copy
+
+
+def __getattr__(name):  # module-level hook (PEP 562): invoked for names not found on the module
+    return _sub_module_deprecation(sub_package="special", module="orthogonal",
+                                   private_modules=["_orthogonal"], all=__all__,
+                                   attribute=name)  # resolution/warning logic lives in scipy._lib.deprecation
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/spfun_stats.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/spfun_stats.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1e58487aaa547483c9f2531ac4efc2ad5e4795c
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/special/spfun_stats.py
@@ -0,0 +1,17 @@
+# This file is not meant for public use and will be removed in SciPy v2.0.0.
+# Use the `scipy.special` namespace for importing the functions
+# included below.
+
+from scipy._lib.deprecation import _sub_module_deprecation
+
+__all__ = ['multigammaln']  # noqa: F822
+
+
+def __dir__():  # module-level hook (PEP 562): dir() on this module reports the names in __all__
+    return __all__  # the module-level list itself, not a copy
+
+
+def __getattr__(name):  # module-level hook (PEP 562): invoked for names not found on the module
+    return _sub_module_deprecation(sub_package="special", module="spfun_stats",
+                                   private_modules=["_spfun_stats"], all=__all__,
+                                   attribute=name)  # resolution/warning logic lives in scipy._lib.deprecation
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b4b0d0a54fddc916bf26621db1075fc27407bf2
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/__init__.py
@@ -0,0 +1,672 @@
+"""
+.. _statsrefmanual:
+
+==========================================
+Statistical functions (:mod:`scipy.stats`)
+==========================================
+
+.. currentmodule:: scipy.stats
+
+This module contains a large number of probability distributions,
+summary and frequency statistics, correlation functions and statistical
+tests, masked statistics, kernel density estimation, quasi-Monte Carlo
+functionality, and more.
+
+Statistics is a very large area, and there are topics that are out of scope
+for SciPy and are covered by other packages. Some of the most important ones
+are:
+
+- `statsmodels <https://www.statsmodels.org/stable/index.html>`__:
+  regression, linear models, time series analysis, extensions to topics
+  also covered by ``scipy.stats``.
+- `Pandas <https://pandas.pydata.org/>`__: tabular data, time series
+  functionality, interfaces to other statistical languages.
+- `PyMC <https://docs.pymc.io/>`__: Bayesian statistical
+  modeling, probabilistic machine learning.
+- `scikit-learn <https://scikit-learn.org/>`__: classification, regression,
+  model selection.
+- `Seaborn <https://seaborn.pydata.org/>`__: statistical data visualization.
+- `rpy2 <https://rpy2.github.io/>`__: Python to R bridge.
+
+
+Probability distributions
+=========================
+
+Each univariate distribution is an instance of a subclass of `rv_continuous`
+(`rv_discrete` for discrete distributions):
+
+.. autosummary::
+   :toctree: generated/
+
+   rv_continuous
+   rv_discrete
+   rv_histogram
+
+Continuous distributions
+------------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   alpha             -- Alpha
+   anglit            -- Anglit
+   arcsine           -- Arcsine
+   argus             -- Argus
+   beta              -- Beta
+   betaprime         -- Beta Prime
+   bradford          -- Bradford
+   burr              -- Burr (Type III)
+   burr12            -- Burr (Type XII)
+   cauchy            -- Cauchy
+   chi               -- Chi
+   chi2              -- Chi-squared
+   cosine            -- Cosine
+   crystalball       -- Crystalball
+   dgamma            -- Double Gamma
+   dpareto_lognorm   -- Double Pareto Lognormal
+   dweibull          -- Double Weibull
+   erlang            -- Erlang
+   expon             -- Exponential
+   exponnorm         -- Exponentially Modified Normal
+   exponweib         -- Exponentiated Weibull
+   exponpow          -- Exponential Power
+   f                 -- F (Snedecor F)
+   fatiguelife       -- Fatigue Life (Birnbaum-Saunders)
+   fisk              -- Fisk
+   foldcauchy        -- Folded Cauchy
+   foldnorm          -- Folded Normal
+   genlogistic       -- Generalized Logistic
+   gennorm           -- Generalized normal
+   genpareto         -- Generalized Pareto
+   genexpon          -- Generalized Exponential
+   genextreme        -- Generalized Extreme Value
+   gausshyper        -- Gauss Hypergeometric
+   gamma             -- Gamma
+   gengamma          -- Generalized gamma
+   genhalflogistic   -- Generalized Half Logistic
+   genhyperbolic     -- Generalized Hyperbolic
+   geninvgauss       -- Generalized Inverse Gaussian
+   gibrat            -- Gibrat
+   gompertz          -- Gompertz (Truncated Gumbel)
+   gumbel_r          -- Right Sided Gumbel, Log-Weibull, Fisher-Tippett, Extreme Value Type I
+   gumbel_l          -- Left Sided Gumbel, etc.
+   halfcauchy        -- Half Cauchy
+   halflogistic      -- Half Logistic
+   halfnorm          -- Half Normal
+   halfgennorm       -- Generalized Half Normal
+   hypsecant         -- Hyperbolic Secant
+   invgamma          -- Inverse Gamma
+   invgauss          -- Inverse Gaussian
+   invweibull        -- Inverse Weibull
+   irwinhall         -- Irwin-Hall
+   jf_skew_t         -- Jones and Faddy Skew-T
+   johnsonsb         -- Johnson SB
+   johnsonsu         -- Johnson SU
+   kappa4            -- Kappa 4 parameter
+   kappa3            -- Kappa 3 parameter
+   ksone             -- Distribution of Kolmogorov-Smirnov one-sided test statistic
+   kstwo             -- Distribution of Kolmogorov-Smirnov two-sided test statistic
+   kstwobign         -- Limiting Distribution of scaled Kolmogorov-Smirnov two-sided test statistic.
+   landau            -- Landau
+   laplace           -- Laplace
+   laplace_asymmetric    -- Asymmetric Laplace
+   levy              -- Levy
+   levy_l
+   levy_stable
+   logistic          -- Logistic
+   loggamma          -- Log-Gamma
+   loglaplace        -- Log-Laplace (Log Double Exponential)
+   lognorm           -- Log-Normal
+   loguniform        -- Log-Uniform
+   lomax             -- Lomax (Pareto of the second kind)
+   maxwell           -- Maxwell
+   mielke            -- Mielke's Beta-Kappa
+   moyal             -- Moyal
+   nakagami          -- Nakagami
+   ncx2              -- Non-central chi-squared
+   ncf               -- Non-central F
+   nct               -- Non-central Student's T
+   norm              -- Normal (Gaussian)
+   norminvgauss      -- Normal Inverse Gaussian
+   pareto            -- Pareto
+   pearson3          -- Pearson type III
+   powerlaw          -- Power-function
+   powerlognorm      -- Power log normal
+   powernorm         -- Power normal
+   rdist             -- R-distribution
+   rayleigh          -- Rayleigh
+   rel_breitwigner   -- Relativistic Breit-Wigner
+   rice              -- Rice
+   recipinvgauss     -- Reciprocal Inverse Gaussian
+   semicircular      -- Semicircular
+   skewcauchy        -- Skew Cauchy
+   skewnorm          -- Skew normal
+   studentized_range    -- Studentized Range
+   t                 -- Student's T
+   trapezoid         -- Trapezoidal
+   triang            -- Triangular
+   truncexpon        -- Truncated Exponential
+   truncnorm         -- Truncated Normal
+   truncpareto       -- Truncated Pareto
+   truncweibull_min  -- Truncated minimum Weibull distribution
+   tukeylambda       -- Tukey-Lambda
+   uniform           -- Uniform
+   vonmises          -- Von-Mises (Circular)
+   vonmises_line     -- Von-Mises (Line)
+   wald              -- Wald
+   weibull_min       -- Minimum Weibull (see Frechet)
+   weibull_max       -- Maximum Weibull (see Frechet)
+   wrapcauchy        -- Wrapped Cauchy
+
+The ``fit`` method of the univariate continuous distributions uses
+maximum likelihood estimation to fit the distribution to a data set.
+The ``fit`` method can accept regular data or *censored data*.
+Censored data is represented with instances of the `CensoredData`
+class.
+
+.. autosummary::
+   :toctree: generated/
+
+   CensoredData
+
+
+Multivariate distributions
+--------------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   multivariate_normal    -- Multivariate normal distribution
+   matrix_normal          -- Matrix normal distribution
+   dirichlet              -- Dirichlet
+   dirichlet_multinomial  -- Dirichlet multinomial distribution
+   wishart                -- Wishart
+   invwishart             -- Inverse Wishart
+   multinomial            -- Multinomial distribution
+   special_ortho_group    -- SO(N) group
+   ortho_group            -- O(N) group
+   unitary_group          -- U(N) group
+   random_correlation     -- random correlation matrices
+   multivariate_t         -- Multivariate t-distribution
+   multivariate_hypergeom -- Multivariate hypergeometric distribution
+   normal_inverse_gamma   -- Normal-inverse-gamma distribution
+   random_table           -- Distribution of random tables with given marginals
+   uniform_direction      -- Uniform distribution on S(N-1)
+   vonmises_fisher        -- Von Mises-Fisher distribution
+   matrix_t               -- Matrix variate t distribution
+
+`scipy.stats.multivariate_normal` methods accept instances
+of the following class to represent the covariance.
+
+.. autosummary::
+   :toctree: generated/
+
+   Covariance             -- Representation of a covariance matrix
+
+
+Discrete distributions
+----------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   bernoulli                -- Bernoulli
+   betabinom                -- Beta-Binomial
+   betanbinom               -- Beta-Negative Binomial
+   binom                    -- Binomial
+   boltzmann                -- Boltzmann (Truncated Discrete Exponential)
+   dlaplace                 -- Discrete Laplacian
+   geom                     -- Geometric
+   hypergeom                -- Hypergeometric
+   logser                   -- Logarithmic (Log-Series, Series)
+   nbinom                   -- Negative Binomial
+   nchypergeom_fisher       -- Fisher's Noncentral Hypergeometric
+   nchypergeom_wallenius    -- Wallenius's Noncentral Hypergeometric
+   nhypergeom               -- Negative Hypergeometric
+   planck                   -- Planck (Discrete Exponential)
+   poisson                  -- Poisson
+   poisson_binom            -- Poisson Binomial
+   randint                  -- Discrete Uniform
+   skellam                  -- Skellam
+   yulesimon                -- Yule-Simon
+   zipf                     -- Zipf (Zeta)
+   zipfian                  -- Zipfian
+
+
+An overview of statistical functions is given below.  Many of these functions
+have a similar version in `scipy.stats.mstats` that works with masked arrays.
+
+Summary statistics
+==================
+
+.. autosummary::
+   :toctree: generated/
+
+   describe          -- Descriptive statistics
+   gmean             -- Geometric mean
+   hmean             -- Harmonic mean
+   pmean             -- Power mean
+   kurtosis          -- Fisher or Pearson kurtosis
+   mode              -- Modal value
+   moment            -- Central moment
+   lmoment
+   expectile         -- Expectile
+   skew              -- Skewness
+   kstat             --
+   kstatvar          --
+   tmean             -- Truncated arithmetic mean
+   tvar              -- Truncated variance
+   tmin              --
+   tmax              --
+   tstd              --
+   tsem              --
+   variation         -- Coefficient of variation
+   rankdata
+   tiecorrect
+   trim_mean
+   gstd              -- Geometric Standard Deviation
+   iqr
+   sem
+   bayes_mvs
+   mvsdist
+   entropy
+   differential_entropy
+   median_abs_deviation
+
+Frequency statistics
+====================
+
+.. autosummary::
+   :toctree: generated/
+
+   cumfreq
+   quantile
+   percentileofscore
+   scoreatpercentile
+   relfreq
+
+.. autosummary::
+   :toctree: generated/
+
+   binned_statistic     -- Compute a binned statistic for a set of data.
+   binned_statistic_2d  -- Compute a 2-D binned statistic for a set of data.
+   binned_statistic_dd  -- Compute a d-D binned statistic for a set of data.
+
+.. _hypotests:
+
+Hypothesis Tests and related functions
+======================================
+SciPy has many functions for performing hypothesis tests that return a
+test statistic and a p-value, and several of them return confidence intervals
+and/or other related information.
+
+The headings below are based on common uses of the functions within, but due to
+the wide variety of statistical procedures, any attempt at coarse-grained
+categorization will be imperfect. Also, note that tests within the same heading
+are not interchangeable in general (e.g. many have different distributional
+assumptions).
+
+One Sample Tests / Paired Sample Tests
+--------------------------------------
+One sample tests are typically used to assess whether a single sample was
+drawn from a specified distribution or a distribution with specified properties
+(e.g. zero mean).
+
+.. autosummary::
+   :toctree: generated/
+
+   ttest_1samp
+   binomtest
+   quantile_test
+   skewtest
+   kurtosistest
+   normaltest
+   jarque_bera
+   shapiro
+   anderson
+   cramervonmises
+   ks_1samp
+   goodness_of_fit
+   chisquare
+   power_divergence
+
+Paired sample tests are often used to assess whether two samples were drawn
+from the same distribution; they differ from the independent sample tests below
+in that each observation in one sample is treated as paired with a
+closely-related observation in the other sample (e.g. when environmental
+factors are controlled between observations within a pair but not among pairs).
+They can also be interpreted or used as one-sample tests (e.g. tests on the
+mean or median of *differences* between paired observations).
+
+.. autosummary::
+   :toctree: generated/
+
+   ttest_rel
+   wilcoxon
+
+Association/Correlation Tests
+-----------------------------
+
+These tests are often used to assess whether there is a relationship (e.g.
+linear) between paired observations in multiple samples or among the
+coordinates of multivariate observations.
+
+.. autosummary::
+   :toctree: generated/
+
+   linregress
+   pearsonr
+   spearmanrho
+   pointbiserialr
+   kendalltau
+   chatterjeexi
+   weightedtau
+   somersd
+   siegelslopes
+   theilslopes
+   page_trend_test
+   multiscale_graphcorr
+   spearmanr
+
+These association tests are designed to work with samples in the form of
+contingency tables. Supporting functions are available in `scipy.stats.contingency`.
+
+.. autosummary::
+   :toctree: generated/
+
+   chi2_contingency
+   fisher_exact
+   barnard_exact
+   boschloo_exact
+
+Independent Sample Tests
+------------------------
+Independent sample tests are typically used to assess whether multiple samples
+were independently drawn from the same distribution or different distributions
+with a shared property (e.g. equal means).
+
+Some tests are specifically for comparing two samples.
+
+.. autosummary::
+   :toctree: generated/
+
+   ttest_ind_from_stats
+   poisson_means_test
+   ttest_ind
+   mannwhitneyu
+   bws_test
+   ranksums
+   brunnermunzel
+   mood
+   ansari
+   cramervonmises_2samp
+   epps_singleton_2samp
+   ks_2samp
+   kstest
+
+Others are generalized to multiple samples.
+
+.. autosummary::
+   :toctree: generated/
+
+   f_oneway
+   tukey_hsd
+   dunnett
+   kruskal
+   alexandergovern
+   fligner
+   levene
+   bartlett
+   median_test
+   friedmanchisquare
+   anderson_ksamp
+
+Resampling and Monte Carlo Methods
+----------------------------------
+The following functions can reproduce the p-value and confidence interval
+results of most of the functions above, and often produce accurate results in a
+wider variety of conditions. They can also be used to perform hypothesis tests
+and generate confidence intervals for custom statistics. This flexibility comes
+at the cost of greater computational requirements and stochastic results.
+
+.. autosummary::
+   :toctree: generated/
+
+   monte_carlo_test
+   permutation_test
+   bootstrap
+   power
+
+Instances of the following object can be passed into some hypothesis test
+functions to perform a resampling or Monte Carlo version of the hypothesis
+test.
+
+.. autosummary::
+   :toctree: generated/
+
+   MonteCarloMethod
+   PermutationMethod
+   BootstrapMethod
+
+Multiple Hypothesis Testing and Meta-Analysis
+---------------------------------------------
+These functions are for assessing the results of individual tests as a whole.
+Functions for performing specific multiple hypothesis tests (e.g. post hoc
+tests) are listed above.
+
+.. autosummary::
+   :toctree: generated/
+
+   combine_pvalues
+   false_discovery_control
+
+
+The following functions are related to the tests above but do not belong in the
+above categories.
+
+Random Variables
+================
+
+.. autosummary::
+   :toctree: generated/
+
+   make_distribution
+   Normal
+   Logistic
+   Uniform
+   Binomial
+   Mixture
+   order_statistic
+   truncate
+   abs
+   exp
+   log
+
+Quasi-Monte Carlo
+=================
+
+.. toctree::
+   :maxdepth: 4
+
+   stats.qmc
+
+Contingency Tables
+==================
+
+.. toctree::
+   :maxdepth: 4
+
+   stats.contingency
+
+Masked statistics functions
+===========================
+
+.. toctree::
+
+   stats.mstats
+
+
+Other statistical functionality
+===============================
+
+Transformations
+---------------
+
+.. autosummary::
+   :toctree: generated/
+
+   boxcox
+   boxcox_normmax
+   boxcox_llf
+   yeojohnson
+   yeojohnson_normmax
+   yeojohnson_llf
+   obrientransform
+   sigmaclip
+   trimboth
+   trim1
+   zmap
+   zscore
+   gzscore
+
+Statistical distances
+---------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   wasserstein_distance
+   wasserstein_distance_nd
+   energy_distance
+
+Sampling
+--------
+
+.. toctree::
+   :maxdepth: 4
+
+   stats.sampling
+
+Fitting / Survival Analysis
+---------------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   fit
+   ecdf
+   logrank
+
+Directional statistical functions
+---------------------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   directional_stats
+   circmean
+   circvar
+   circstd
+
+Sensitivity Analysis
+--------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   sobol_indices
+
+Plot-tests
+----------
+
+.. autosummary::
+   :toctree: generated/
+
+   ppcc_max
+   ppcc_plot
+   probplot
+   boxcox_normplot
+   yeojohnson_normplot
+
+Univariate and multivariate kernel density estimation
+-----------------------------------------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   gaussian_kde
+
+Warnings / Errors used in :mod:`scipy.stats`
+--------------------------------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   DegenerateDataWarning
+   ConstantInputWarning
+   NearConstantInputWarning
+   FitError
+
+Result classes used in :mod:`scipy.stats`
+-----------------------------------------
+
+.. warning::
+
+    These classes are private, but they are included here because instances
+    of them are returned by other statistical functions. User import and
+    instantiation is not supported.
+
+.. toctree::
+   :maxdepth: 2
+
+   stats._result_classes
+
+"""  # noqa: E501
+
+from ._warnings_errors import (ConstantInputWarning, NearConstantInputWarning,
+                               DegenerateDataWarning, FitError)
+from ._stats_py import *
+from ._variation import variation
+from .distributions import *
+from ._morestats import *
+from ._multicomp import *
+from ._binomtest import binomtest
+from ._binned_statistic import *
+from ._kde import gaussian_kde
+from . import mstats
+from . import qmc
+from ._multivariate import *
+from . import contingency
+from .contingency import chi2_contingency
+from ._censored_data import CensoredData
+from ._resampling import (bootstrap, monte_carlo_test, permutation_test, power,
+                          MonteCarloMethod, PermutationMethod, BootstrapMethod)
+from ._entropy import *
+from ._hypotests import *
+from ._page_trend_test import page_trend_test
+from ._mannwhitneyu import mannwhitneyu
+from ._bws_test import bws_test
+from ._fit import fit, goodness_of_fit
+from ._covariance import Covariance
+from ._sensitivity_analysis import *
+from ._survival import *
+from ._distribution_infrastructure import (
+    make_distribution, Mixture, order_statistic, truncate, exp, log, abs
+)
+from ._new_distributions import Normal, Logistic, Uniform, Binomial
+from ._mgc import multiscale_graphcorr
+from ._correlation import chatterjeexi, spearmanrho
+from ._quantile import quantile
+
+
+# Deprecated namespaces, to be removed in v2.0.0
+from . import (
+    biasedurn, kde, morestats, mstats_basic, mstats_extras, mvn, stats
+)
+
+
+__all__ = [s for s in dir() if not s.startswith("_")]  # Export every name without a leading underscore (drops private names and dunders).
+
+from scipy._lib._testutils import PytestTester
+test = PytestTester(__name__)  # bound after __all__ was computed, so `test` is not listed in __all__
+del PytestTester  # drop the helper class from the module namespace once `test` exists
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_axis_nan_policy.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_axis_nan_policy.py
new file mode 100644
index 0000000000000000000000000000000000000000..d39e58be0d94dc72bd16385f4378aed9fb566db4
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_axis_nan_policy.py
@@ -0,0 +1,716 @@
+# Many scipy.stats functions support `axis` and `nan_policy` parameters.
+# When the two are combined, it can be tricky to get all the behavior just
+# right. This file contains utility functions useful for scipy.stats functions
+# that support `axis` and `nan_policy`, including a decorator that
+# automatically adds `axis` and `nan_policy` arguments to a function.
+
+import math
+import warnings
+import numpy as np
+from functools import wraps
+from scipy._lib._array_api import xp_ravel
+from scipy._lib._docscrape import FunctionDoc, Parameter
+from scipy._lib._util import _contains_nan, AxisError, _get_nan
+from scipy._lib._array_api import (array_namespace, is_numpy, xp_size, xp_copy,
+                                   xp_promote, is_lazy_array)
+import scipy._lib.array_api_extra as xpx
+
+import inspect
+
+# Messages emitted together with `SmallSampleWarning` by the wrapper built in
+# `_axis_nan_policy_factory` when `is_too_small` reports a too-small sample.
+
+# 1-D input, no NaNs were omitted.
+too_small_1d_not_omit = (
+    "One or more sample arguments is too small; all "
+    "returned values will be NaN. "
+    "See documentation for sample size requirements.")
+
+# 1-D input, samples became too small after `nan_policy='omit'` removed NaNs.
+too_small_1d_omit = (
+    "After omitting NaNs, one or more sample arguments "
+    "is too small; all returned values will be NaN. "
+    "See documentation for sample size requirements.")
+
+# N-D input containing an empty sample that is also too small.
+too_small_nd_not_omit = (
+    "All axis-slices of one or more sample arguments are "
+    "too small; all elements of returned arrays will be NaN. "
+    "See documentation for sample size requirements.")
+
+# N-D input, an axis-slice became too small after NaNs were omitted.
+too_small_nd_omit = (
+    "After omitting NaNs, one or more axis-slices of one "
+    "or more sample arguments is too small; corresponding "
+    "elements of returned arrays will be NaN. "
+    "See documentation for sample size requirements.")
+
+class SmallSampleWarning(RuntimeWarning):
+    """Warning category used with the ``too_small_*`` messages above."""
+    pass
+
+
+def _broadcast_arrays(arrays, axis=None, xp=None):
+    """
+    Broadcast shapes of arrays, ignoring incompatibility of specified axes
+    """
+    arrays = tuple(arrays)
+    if not arrays:
+        return arrays
+    xp = array_namespace(*arrays) if xp is None else xp
+    arrays = [xp.asarray(arr) for arr in arrays]
+    shapes = [arr.shape for arr in arrays]
+    new_shapes = _broadcast_shapes(shapes, axis)
+    if axis is None:
+        new_shapes = [new_shapes]*len(arrays)
+    return [xp.broadcast_to(array, new_shape)
+            for array, new_shape in zip(arrays, new_shapes)]
+
+
+def _broadcast_shapes(shapes, axis=None):
+    """
+    Broadcast shapes, ignoring incompatibility of specified axes
+    """
+    if not shapes:
+        return shapes
+
+    # input validation
+    if axis is not None:
+        axis = np.atleast_1d(axis)
+        message = '`axis` must be an integer, a tuple of integers, or `None`.'
+        try:
+            with np.errstate(invalid='ignore'):
+                axis_int = axis.astype(int)
+        except ValueError as e:
+            raise AxisError(message) from e
+        if not np.array_equal(axis_int, axis):
+            raise AxisError(message)
+        axis = axis_int
+
+    # First, ensure all shapes have same number of dimensions by prepending 1s.
+    n_dims = max([len(shape) for shape in shapes])
+    new_shapes = np.ones((len(shapes), n_dims), dtype=int)
+    for row, shape in zip(new_shapes, shapes):
+        row[len(row)-len(shape):] = shape  # can't use negative indices (-0:)
+
+    # Remove the shape elements of the axes to be ignored, but remember them.
+    if axis is not None:
+        axis[axis < 0] = n_dims + axis[axis < 0]
+        axis = np.sort(axis)
+        if axis[-1] >= n_dims or axis[0] < 0:
+            message = (f"`axis` is out of bounds "
+                       f"for array of dimension {n_dims}")
+            raise AxisError(message)
+
+        if len(np.unique(axis)) != len(axis):
+            raise AxisError("`axis` must contain only distinct elements")
+
+        removed_shapes = new_shapes[:, axis]
+        new_shapes = np.delete(new_shapes, axis, axis=1)
+
+    # If arrays are broadcastable, shape elements that are 1 may be replaced
+    # with a corresponding non-1 shape element. Assuming arrays are
+    # broadcastable, that final shape element can be found with:
+    new_shape = np.max(new_shapes, axis=0)
+    # except in case of an empty array:
+    new_shape *= new_shapes.all(axis=0)
+
+    # Among all arrays, there can only be one unique non-1 shape element.
+    # Therefore, if any non-1 shape element does not match what we found
+    # above, the arrays must not be broadcastable after all.
+    if np.any(~((new_shapes == 1) | (new_shapes == new_shape))):
+        raise ValueError("Array shapes are incompatible for broadcasting.")
+
+    if axis is not None:
+        # Add back the shape elements that were ignored
+        new_axis = axis - np.arange(len(axis))
+        new_shapes = [tuple(np.insert(new_shape, new_axis, removed_shape))
+                      for removed_shape in removed_shapes]
+        return new_shapes
+    else:
+        return tuple(new_shape)
+
+
+def _broadcast_array_shapes_remove_axis(arrays, axis=None):
+    """
+    Broadcast shapes of arrays, dropping specified axes
+
+    Given a sequence of arrays `arrays` and an integer or tuple `axis`, find
+    the shape of the broadcast result after consuming/dropping `axis`.
+    In other words, return output shape of a typical hypothesis test on
+    `arrays` vectorized along `axis`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats._axis_nan_policy import _broadcast_array_shapes_remove_axis
+    >>> a = np.zeros((5, 2, 1))
+    >>> b = np.zeros((9, 3))
+    >>> _broadcast_array_shapes_remove_axis((a, b), 1)
+    (5, 3)
+    """
+    # Note that here, `axis=None` means do not consume/drop any axes - _not_
+    # ravel arrays before broadcasting.
+    shapes = [arr.shape for arr in arrays]
+    return _broadcast_shapes_remove_axis(shapes, axis)
+
+
+def _broadcast_shapes_remove_axis(shapes, axis=None):
+    """
+    Broadcast shapes, dropping specified axes
+
+    Same as _broadcast_array_shapes_remove_axis, but given a sequence
+    of array shapes `shapes` instead of the arrays themselves.
+    """
+    shapes = _broadcast_shapes(shapes, axis)
+    shape = shapes[0]
+    if axis is not None:
+        shape = np.delete(shape, axis)
+    return tuple(shape)
+
+
+def _broadcast_concatenate(arrays, axis, paired=False, xp=None):
+    """Concatenate arrays along an axis with broadcasting."""
+    xp = array_namespace(*arrays) if xp is None else xp
+    arrays = _broadcast_arrays(arrays, axis if not paired else None, xp=xp)
+    res = xp.concat(arrays, axis=axis)
+    return res
+
+
+def _remove_nans(samples, paired, xp=None):
+    "Remove nans from paired or unpaired 1D samples"
+    # potential optimization: don't copy arrays that don't contain nans
+    xp = array_namespace(*samples)
+    if not paired:
+        return [sample[~xp.isnan(sample)] for sample in samples]
+
+    # for paired samples, we need to remove the whole pair when any part
+    # has a nan
+    nans = xp.isnan(samples[0])
+    for sample in samples[1:]:
+        nans = nans | xp.isnan(sample)
+    not_nans = ~nans
+    return [sample[not_nans] for sample in samples]
+
+
+def _remove_sentinel(samples, paired, sentinel):
+    "Remove sentinel values from paired or unpaired 1D samples"
+    # could consolidate with `_remove_nans`, but it's not quite as simple as
+    # passing `sentinel=np.nan` because `(np.nan == np.nan) is False`
+
+    # potential optimization: don't copy arrays that don't contain sentinel
+    if not paired:
+        return [sample[sample != sentinel] for sample in samples]
+
+    # for paired samples, we need to remove the whole pair when any part
+    # has a nan
+    sentinels = (samples[0] == sentinel)
+    for sample in samples[1:]:
+        sentinels = sentinels | (sample == sentinel)
+    not_sentinels = ~sentinels
+    return [sample[not_sentinels] for sample in samples]
+
+
+def _masked_arrays_2_sentinel_arrays(samples):
+    # masked arrays in `samples` are converted to regular arrays, and values
+    # corresponding with masked elements are replaced with a sentinel value
+
+    # return without modifying arrays if none have a mask
+    has_mask = False
+    for sample in samples:
+        mask = getattr(sample, 'mask', False)
+        has_mask = has_mask or np.any(mask)
+    if not has_mask:
+        return samples, None  # None means there is no sentinel value
+
+    # Choose a sentinel value. We can't use `np.nan`, because sentinel (masked)
+    # values are always omitted, but there are different nan policies.
+    dtype = np.result_type(*samples)
+    dtype = dtype if np.issubdtype(dtype, np.number) else np.float64
+    for i in range(len(samples)):
+        # Things get more complicated if the arrays are of different types.
+        # We could have different sentinel values for each array, but
+        # the purpose of this code is convenience, not efficiency.
+        samples[i] = samples[i].astype(dtype, copy=False)
+
+    inexact = np.issubdtype(dtype, np.inexact)
+    info = np.finfo if inexact else np.iinfo
+    max_possible, min_possible = info(dtype).max, info(dtype).min
+    nextafter = np.nextafter if inexact else (lambda x, _: x - 1)
+
+    sentinel = max_possible
+    # For simplicity, min_possible/np.infs are not candidate sentinel values
+    while sentinel > min_possible:
+        for sample in samples:
+            if np.any(sample == sentinel):  # choose a new sentinel value
+                sentinel = nextafter(sentinel, -np.inf)
+                break
+        else:  # when sentinel value is OK, break the while loop
+            break
+    else:
+        message = ("This function replaces masked elements with sentinel "
+                   "values, but the data contains all distinct values of this "
+                   "data type. Consider promoting the dtype to `np.float64`.")
+        raise ValueError(message)
+
+    # replace masked elements with sentinel value
+    out_samples = []
+    for sample in samples:
+        mask = getattr(sample, 'mask', None)
+        if mask is not None:  # turn all masked arrays into sentinel arrays
+            mask = np.broadcast_to(mask, sample.shape)
+            sample = sample.data.copy() if np.any(mask) else sample.data
+            sample = np.asarray(sample)  # `sample.data` could be a memoryview?
+            sample[mask] = sentinel
+        out_samples.append(sample)
+
+    return out_samples, sentinel
+
+
+def _check_empty_inputs(samples, axis, xp=None):
+    """
+    Check for empty sample; return appropriate output for a vectorized hypotest
+    """
+    xp = array_namespace(*samples) if xp is None else xp
+    # if none of the samples are empty, we need to perform the test
+    if not any(xp_size(sample) == 0 for sample in samples):
+        return None
+    # otherwise, the statistic and p-value will be either empty arrays or
+    # arrays with NaNs. Produce the appropriate array and return it.
+    output_shape = _broadcast_array_shapes_remove_axis(samples, axis)
+    NaN = _get_nan(*samples)
+    output = xp.full(output_shape, xp.nan, dtype=NaN.dtype)
+    return output
+
+
+def _add_reduced_axes(res, reduced_axes, keepdims, xp=np):
+    """
+    Add reduced axes back to all the arrays in the result object
+    if keepdims = True.
+    """
+    return ([xpx.expand_dims(output, axis=reduced_axes)
+             if not isinstance(output, int) else output for output in res]
+            if keepdims else res)
+
+
+# Standard docstring / signature entries for `axis`, `nan_policy`, `keepdims`
+# NOTE: `_name`, `_type` and `_desc` are scratch module-level names that are
+# rebound for each parameter in turn, so the order of the statements below
+# matters.
+_name = 'axis'
+_desc = (
+    """If an int, the axis of the input along which to compute the statistic.
+The statistic of each axis-slice (e.g. row) of the input will appear in a
+corresponding element of the output.
+If ``None``, the input will be raveled before computing the statistic."""
+    .split('\n'))
+
+
+# `_name` and `_desc` are captured as default values when this function is
+# defined, i.e. while they still describe `axis`; looking them up at call time
+# would pick up the values they are rebound to further down.
+def _get_axis_params(default_axis=0, _name=_name, _desc=_desc):  # bind NOW
+    """Return the docstring entry and the signature entry for `axis`.
+
+    Both reflect `default_axis`: the first is a docscrape `Parameter`, the
+    second a keyword-only `inspect.Parameter`.
+    """
+    _type = f"int or None, default: {default_axis}"
+    _axis_parameter_doc = Parameter(_name, _type, _desc)
+    _axis_parameter = inspect.Parameter(_name,
+                                        inspect.Parameter.KEYWORD_ONLY,
+                                        default=default_axis)
+    return _axis_parameter_doc, _axis_parameter
+
+
+# Docstring entry and keyword-only signature entry for `nan_policy`.
+_name = 'nan_policy'
+_type = "{'propagate', 'omit', 'raise'}"
+_desc = (
+    """Defines how to handle input NaNs.
+
+- ``propagate``: if a NaN is present in the axis slice (e.g. row) along
+  which the  statistic is computed, the corresponding entry of the output
+  will be NaN.
+- ``omit``: NaNs will be omitted when performing the calculation.
+  If insufficient data remains in the axis slice along which the
+  statistic is computed, the corresponding entry of the output will be
+  NaN.
+- ``raise``: if a NaN is present, a ``ValueError`` will be raised."""
+    .split('\n'))
+_nan_policy_parameter_doc = Parameter(_name, _type, _desc)
+_nan_policy_parameter = inspect.Parameter(_name,
+                                          inspect.Parameter.KEYWORD_ONLY,
+                                          default='propagate')
+
+# Docstring entry and keyword-only signature entry for `keepdims`.
+_name = 'keepdims'
+_type = "bool, default: False"
+_desc = (
+    """If this is set to True, the axes which are reduced are left
+in the result as dimensions with size one. With this option,
+the result will broadcast correctly against the input array."""
+    .split('\n'))
+_keepdims_parameter_doc = Parameter(_name, _type, _desc)
+_keepdims_parameter = inspect.Parameter(_name,
+                                        inspect.Parameter.KEYWORD_ONLY,
+                                        default=False)
+
+# Lines appended to the ``Notes`` section of every decorated function.
+_standard_note_addition = (
+    """\nBeginning in SciPy 1.9, ``np.matrix`` inputs (not recommended for new
+code) are converted to ``np.ndarray`` before the calculation is performed. In
+this case, the output will be a scalar or ``np.ndarray`` of appropriate shape
+rather than a 2D ``np.matrix``. Similarly, while masked elements of masked
+arrays are ignored, the output will be a scalar or ``np.ndarray`` rather than a
+masked array with ``mask=False``.""").split('\n')
+
+
+def _axis_nan_policy_factory(tuple_to_result, default_axis=0,
+                             n_samples=1, paired=False,
+                             result_to_tuple=None, too_small=0,
+                             n_outputs=2, kwd_samples=(), override=None):
+    """Factory for a wrapper that adds axis/nan_policy params to a function.
+
+    Parameters
+    ----------
+    tuple_to_result : callable
+        Callable that returns an object of the type returned by the function
+        being wrapped (e.g. the namedtuple or dataclass returned by a
+        statistical test) provided the separate components (e.g. statistic,
+        pvalue).
+    default_axis : int, default: 0
+        The default value of the axis argument. Standard is 0 except when
+        backwards compatibility demands otherwise (e.g. `None`).
+    n_samples : int or callable, default: 1
+        The number of data samples accepted by the function
+        (e.g. `mannwhitneyu`), a callable that accepts a dictionary of
+        parameters passed into the function and returns the number of data
+        samples (e.g. `wilcoxon`), or `None` to indicate an arbitrary number
+        of samples (e.g. `kruskal`).
+    paired : {False, True}
+        Whether the function being wrapped treats the samples as paired (i.e.
+        corresponding elements of each sample should be considered as different
+        components of the same sample.)
+    result_to_tuple : callable, optional
+        Function that unpacks the results of the function being wrapped into
+        a tuple. This is essentially the inverse of `tuple_to_result`. Default
+        is `None`, which is appropriate for statistical tests that return a
+        statistic, pvalue tuple (rather than, e.g., a non-iterable datalass).
+    too_small : int or callable, default: 0
+        The largest unnacceptably small sample for the function being wrapped.
+        For example, some functions require samples of size two or more or they
+        raise an error. This argument prevents the error from being raised when
+        input is not 1D and instead places a NaN in the corresponding element
+        of the result. If callable, it must accept a list of samples, axis,
+        and a dictionary of keyword arguments passed to the wrapper function as
+        arguments and return a bool indicating weather the samples passed are
+        too small.
+    n_outputs : int or callable, default: 2
+        The number of outputs produced by the function given 1d sample(s). For
+        example, hypothesis tests that return a namedtuple or result object
+        with attributes ``statistic`` and ``pvalue`` use the default
+        ``n_outputs=2``; summary statistics with scalar output use
+        ``n_outputs=1``. Alternatively, may be a callable that accepts a
+        dictionary of arguments passed into the wrapped function and returns
+        the number of outputs corresponding with those arguments.
+    kwd_samples : sequence, default: ()
+        The names of keyword parameters that should be treated as samples. For
+        example, `gmean` accepts as its first argument a sample `a` but
+        also `weights` as a fourth, optional keyword argument. In this case, we
+        use `n_samples=1` and kwd_samples=['weights'].
+    override : dict, default: {'vectorization': False, 'nan_propagation': True}
+        Pass a dictionary with ``'vectorization': True`` to ensure that the
+        decorator overrides the function's behavior for multimensional input.
+        Use ``'nan_propagation': False`` to ensure that the decorator does not
+        override the function's behavior for ``nan_policy='propagate'``.
+    """
+    # Specify which existing behaviors the decorator must override
+    temp = override or {}
+    override = {'vectorization': False,
+                'nan_propagation': True}
+    override.update(temp)
+
+    if result_to_tuple is None:
+        def result_to_tuple(res, _):
+            return res
+
+    if not callable(too_small):
+        def is_too_small(samples, *ts_args, axis=-1, **ts_kwargs):
+            for sample in samples:
+                if sample.shape[axis] <= too_small:
+                    return True
+            return False
+    else:
+        is_too_small = too_small
+
+    def axis_nan_policy_decorator(hypotest_fun_in):
+        @wraps(hypotest_fun_in)
+        def axis_nan_policy_wrapper(*args, _no_deco=False, **kwds):
+
+            if _no_deco:  # for testing, decorator does nothing
+                return hypotest_fun_in(*args, **kwds)
+
+            # For now, skip the decorator entirely if using array API. In the future,
+            # we'll probably want to use it for `keepdims`, `axis` tuples, etc.
+            if len(args) == 0:  # extract sample from `kwds` if there are no `args`
+                used_kwd_samples = list(set(kwds).intersection(set(kwd_samples)))
+                temp = used_kwd_samples[:1]
+            else:
+                temp = args[0]
+
+            if is_lazy_array(temp):
+                msg = ("Use of `nan_policy` and `keepdims` "
+                       "is incompatible with lazy arrays.")
+                if 'nan_policy' in kwds or 'keepdims' in kwds:
+                    raise NotImplementedError(msg)
+                return hypotest_fun_in(*args, **kwds)
+
+            # We need to be flexible about whether position or keyword
+            # arguments are used, but we need to make sure users don't pass
+            # both for the same parameter. To complicate matters, some
+            # functions accept samples with *args, and some functions already
+            # accept `axis` and `nan_policy` as positional arguments.
+            # The strategy is to make sure that there is no duplication
+            # between `args` and `kwds`, combine the two into `kwds`, then
+            # the samples, `nan_policy`, and `axis` from `kwds`, as they are
+            # dealt with separately.
+
+            # Check for intersection between positional and keyword args
+            params = list(inspect.signature(hypotest_fun_in).parameters)
+            if n_samples is None:
+                # Give unique names to each positional sample argument
+                # Note that *args can't be provided as a keyword argument
+                params = [f"arg{i}" for i in range(len(args))] + params[1:]
+
+            # raise if there are too many positional args
+            maxarg = (np.inf if inspect.getfullargspec(hypotest_fun_in).varargs
+                      else len(inspect.getfullargspec(hypotest_fun_in).args))
+            if len(args) > maxarg:  # let the function raise the right error
+                hypotest_fun_in(*args, **kwds)
+
+            # raise if multiple values passed for same parameter
+            d_args = dict(zip(params, args))
+            intersection = set(d_args) & set(kwds)
+            if intersection:  # let the function raise the right error
+                hypotest_fun_in(*args, **kwds)
+
+            # Consolidate other positional and keyword args into `kwds`
+            kwds.update(d_args)
+
+            # rename avoids UnboundLocalError
+            if callable(n_samples):
+                # Future refactoring idea: no need for callable n_samples.
+                # Just replace `n_samples` and `kwd_samples` with a single
+                # list of the names of all samples, and treat all of them
+                # as `kwd_samples` are treated below.
+                n_samp = n_samples(kwds)
+            else:
+                n_samp = n_samples or len(args)
+
+            # get the number of outputs
+            n_out = n_outputs  # rename to avoid UnboundLocalError
+            if callable(n_out):
+                n_out = n_out(kwds)
+
+            # If necessary, rearrange function signature: accept other samples
+            # as positional args right after the first n_samp args
+            kwd_samp = [name for name in kwd_samples
+                        if kwds.get(name, None) is not None]
+            n_kwd_samp = len(kwd_samp)
+            if not kwd_samp:
+                hypotest_fun_out = hypotest_fun_in
+            else:
+                def hypotest_fun_out(*samples, **kwds):
+                    new_kwds = dict(zip(kwd_samp, samples[n_samp:]))
+                    kwds.update(new_kwds)
+                    return hypotest_fun_in(*samples[:n_samp], **kwds)
+
+            # Extract the things we need here
+            try:  # if something is missing
+                samples = [kwds.pop(param) for param in (params[:n_samp] + kwd_samp)]
+                xp = array_namespace(*samples)
+                samples = xp_promote(*samples, xp=xp)
+                samples = (samples,) if not isinstance(samples, tuple) else samples
+                samples = [xpx.atleast_nd(sample, ndim=1) for sample in samples]
+            except KeyError:  # let the function raise the right error
+                # might need to revisit this if required arg is not a "sample"
+                hypotest_fun_in(*args, **kwds)
+            vectorized = True if 'axis' in params else False
+            vectorized = vectorized and not override['vectorization']
+            axis = kwds.pop('axis', default_axis)
+            nan_policy = kwds.pop('nan_policy', 'propagate')
+            keepdims = kwds.pop("keepdims", False)
+            del args  # avoid the possibility of passing both `args` and `kwds`
+
+            # convert masked arrays to regular arrays with sentinel values
+            sentinel = None
+            if is_numpy(xp):
+                samples, sentinel = _masked_arrays_2_sentinel_arrays(samples)
+
+            # standardize to always work along last axis
+            reduced_axes = axis
+            if axis is None:
+                if samples:
+                    # when axis=None, take the maximum of all dimensions since
+                    # all the dimensions are reduced.
+                    n_dims = max([xp.asarray(sample).ndim for sample in samples])
+                    reduced_axes = tuple(range(n_dims))
+                samples = [xp_ravel(sample) for sample in samples]
+            else:
+                # don't ignore any axes when broadcasting if paired
+                samples = _broadcast_arrays(samples, axis=axis if not paired else None)
+                axis = (axis,) if np.isscalar(axis) else axis
+                n_axes = len(axis)
+                # move all axes in `axis` to the end to be raveled
+                samples = [xp.moveaxis(sample, axis, tuple(range(-len(axis), 0)))
+                           for sample in samples]
+                shapes = [sample.shape for sample in samples]
+                # New shape is unchanged for all axes _not_ in `axis`
+                # At the end, we append the product of the shapes of the axes
+                # in `axis`. Appending -1 doesn't work for zero-size arrays!
+                new_shapes = [shape[:-n_axes] + (math.prod(shape[-n_axes:]),)
+                              for shape in shapes]
+                samples = [xp.reshape(sample, new_shape)
+                           for sample, new_shape in zip(samples, new_shapes)]
+            axis = -1  # work over the last axis
+
+            NaN = _get_nan(*samples) if samples else xp.nan
+
+            # if axis is not needed, just handle nan_policy and return
+            ndims = np.array([sample.ndim for sample in samples])  # NumPy OK for ndims
+            if np.all(ndims <= 1):
+                # Addresses nan_policy == "raise"
+                if nan_policy != 'propagate' or override['nan_propagation']:
+                    contains_nan = [_contains_nan(sample, nan_policy)
+                                    for sample in samples]
+                else:
+                    # Behave as though there are no NaNs (even if there are)
+                    contains_nan = [False] * len(samples)
+
+                # Addresses nan_policy == "propagate"
+                if any(contains_nan) and (nan_policy == 'propagate'
+                                          and override['nan_propagation']):
+                    res = xp.full(n_out, xp.nan, dtype=NaN.dtype)
+                    res = _add_reduced_axes(res, reduced_axes, keepdims)
+                    return tuple_to_result(*res)
+
+                # Addresses nan_policy == "omit"
+                too_small_msg = too_small_1d_not_omit
+                if any(contains_nan) and nan_policy == 'omit':
+                    # consider passing in contains_nan
+                    samples = _remove_nans(samples, paired)
+                    too_small_msg = too_small_1d_omit
+
+                if sentinel:
+                    samples = _remove_sentinel(samples, paired, sentinel)
+
+                if is_too_small(samples, kwds):
+                    warnings.warn(too_small_msg, SmallSampleWarning, stacklevel=2)
+                    res = xp.full(n_out, xp.nan, dtype=NaN.dtype)
+                    res = _add_reduced_axes(res, reduced_axes, keepdims)
+                    return tuple_to_result(*res)
+
+                res = hypotest_fun_out(*samples, **kwds)
+                res = result_to_tuple(res, n_out)
+                res = _add_reduced_axes(res, reduced_axes, keepdims)
+                return tuple_to_result(*res)
+
+            # check for empty input
+            empty_output = _check_empty_inputs(samples, axis, xp=xp)
+            # only return empty output if zero sized input is too small.
+            if (
+                empty_output is not None
+                and (is_too_small(samples, kwds) or xp_size(empty_output) == 0)
+            ):
+                if is_too_small(samples, kwds) and xp_size(empty_output) != 0:
+                    warnings.warn(too_small_nd_not_omit, SmallSampleWarning,
+                                  stacklevel=2)
+                res = [xp_copy(empty_output) for i in range(n_out)]
+                res = _add_reduced_axes(res, reduced_axes, keepdims)
+                return tuple_to_result(*res)
+
+            if not is_numpy(xp) and 'nan_policy' in kwds:
+                msg = ("Use of `nan_policy` is incompatible with multidimensional "
+                       "non-NumPy arrays.")
+                raise NotImplementedError(msg)
+
+            if not is_numpy(xp):
+                res = hypotest_fun_out(*samples, axis=axis, **kwds)
+                res = result_to_tuple(res, n_out)
+                res = _add_reduced_axes(res, reduced_axes, keepdims, xp=xp)
+                return tuple_to_result(*res)
+
+            # otherwise, concatenate all samples along axis, remembering where
+            # each separate sample begins
+            lengths = np.array([sample.shape[axis] for sample in samples])
+            split_indices = np.cumsum(lengths)
+            x = _broadcast_concatenate(samples, axis, paired=paired)
+
+            # Addresses nan_policy == "raise"
+            if nan_policy != 'propagate' or override['nan_propagation']:
+                contains_nan = _contains_nan(x, nan_policy)
+            else:
+                contains_nan = False  # behave like there are no NaNs
+
+            if vectorized and not contains_nan and not sentinel:
+                res = hypotest_fun_out(*samples, axis=axis, **kwds)
+                res = result_to_tuple(res, n_out)
+                res = _add_reduced_axes(res, reduced_axes, keepdims)
+                return tuple_to_result(*res)
+
+            # Addresses nan_policy == "omit"
+            if contains_nan and nan_policy == 'omit':
+                def hypotest_fun(x):
+                    samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
+                    samples = _remove_nans(samples, paired)
+                    if sentinel:
+                        samples = _remove_sentinel(samples, paired, sentinel)
+                    if is_too_small(samples, kwds):
+                        warnings.warn(too_small_nd_omit, SmallSampleWarning,
+                                      stacklevel=4)
+                        return np.full(n_out, NaN)
+                    return result_to_tuple(hypotest_fun_out(*samples, **kwds), n_out)
+
+            # Addresses nan_policy == "propagate"
+            elif (contains_nan and nan_policy == 'propagate'
+                  and override['nan_propagation']):
+                def hypotest_fun(x):
+                    if np.isnan(x).any():
+                        return np.full(n_out, NaN)
+
+                    samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
+                    if sentinel:
+                        samples = _remove_sentinel(samples, paired, sentinel)
+                    if is_too_small(samples, kwds):
+                        return np.full(n_out, NaN)
+                    return result_to_tuple(hypotest_fun_out(*samples, **kwds), n_out)
+
+            else:
+                def hypotest_fun(x):
+                    samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
+                    if sentinel:
+                        samples = _remove_sentinel(samples, paired, sentinel)
+                    if is_too_small(samples, kwds):
+                        return np.full(n_out, NaN)
+                    return result_to_tuple(hypotest_fun_out(*samples, **kwds), n_out)
+
+            x = np.moveaxis(x, axis, 0)
+            res = np.apply_along_axis(hypotest_fun, axis=0, arr=x)
+            res = _add_reduced_axes(res, reduced_axes, keepdims)
+            return tuple_to_result(*res)
+
+        _axis_parameter_doc, _axis_parameter = _get_axis_params(default_axis)
+        doc = FunctionDoc(axis_nan_policy_wrapper)
+        parameter_names = [param.name for param in doc['Parameters']]
+        if 'axis' in parameter_names:
+            doc['Parameters'][parameter_names.index('axis')] = (
+                _axis_parameter_doc)
+        else:
+            doc['Parameters'].append(_axis_parameter_doc)
+        if 'nan_policy' in parameter_names:
+            doc['Parameters'][parameter_names.index('nan_policy')] = (
+                _nan_policy_parameter_doc)
+        else:
+            doc['Parameters'].append(_nan_policy_parameter_doc)
+        if 'keepdims' in parameter_names:
+            doc['Parameters'][parameter_names.index('keepdims')] = (
+                _keepdims_parameter_doc)
+        else:
+            doc['Parameters'].append(_keepdims_parameter_doc)
+        doc['Notes'] += _standard_note_addition
+        doc = str(doc).split("\n", 1)[1].lstrip(" \n")  # remove signature
+        axis_nan_policy_wrapper.__doc__ = str(doc)
+
+        sig = inspect.signature(axis_nan_policy_wrapper)
+        parameters = sig.parameters
+        parameter_list = list(parameters.values())
+        if 'axis' not in parameters:
+            parameter_list.append(_axis_parameter)
+        if 'nan_policy' not in parameters:
+            parameter_list.append(_nan_policy_parameter)
+        if 'keepdims' not in parameters:
+            parameter_list.append(_keepdims_parameter)
+        sig = sig.replace(parameters=parameter_list)
+        axis_nan_policy_wrapper.__signature__ = sig
+
+        return axis_nan_policy_wrapper
+    return axis_nan_policy_decorator
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_biasedurn.pxd b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_biasedurn.pxd
new file mode 100644
index 0000000000000000000000000000000000000000..92785f08dbec30a4db286fcb85b42d7221e2228e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_biasedurn.pxd
@@ -0,0 +1,27 @@
+# C++ classes from stocc.h; `except +` converts C++ exceptions to Python ones.
+cdef extern from "biasedurn/stocc.h" nogil:  # declared usable without the GIL
+    cdef cppclass CFishersNCHypergeometric:
+        CFishersNCHypergeometric(int, int, int, double, double) except +
+        int mode()
+        double mean()
+        double variance()
+        double probability(int x)
+        double moments(double * mean, double * var)  # presumably fills *mean/*var; confirm in stocc.h
+
+    cdef cppclass CWalleniusNCHypergeometric:
+        CWalleniusNCHypergeometric() except +
+        CWalleniusNCHypergeometric(int, int, int, double, double) except +
+        int mode()
+        double mean()
+        double variance()
+        double probability(int x)
+        double moments(double * mean, double * var)  # presumably fills *mean/*var; confirm in stocc.h
+
+    cdef cppclass StochasticLib3:
+        StochasticLib3(int seed) except +
+        double Random() except +
+        void SetAccuracy(double accur)
+        int FishersNCHyp (int n, int m, int N, double odds) except +
+        int WalleniusNCHyp (int n, int m, int N, double odds) except +
+        double(*next_double)()  # function-pointer member
+        double(*next_normal)(const double m, const double s)  # function-pointer member
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_binned_statistic.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_binned_statistic.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca305dfc44e7641d6c97800381a16475bea5def7
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_binned_statistic.py
@@ -0,0 +1,800 @@
+import builtins
+from warnings import catch_warnings, simplefilter
+import numpy as np
+from operator import index
+from collections import namedtuple
+
+from scipy._lib._array_api import xp_capabilities
+
+__all__ = ['binned_statistic',
+           'binned_statistic_2d',
+           'binned_statistic_dd']
+
+
+BinnedStatisticResult = namedtuple('BinnedStatisticResult',  # 1-D result tuple
+                                   ('statistic', 'bin_edges', 'binnumber'))
+
+
+@xp_capabilities(np_only=True)
+def binned_statistic(x, values, statistic='mean',
+                     bins=10, range=None):
+    """
+    Compute a binned statistic for one or more sets of data.
+
+    This is a generalization of a histogram function.  A histogram divides
+    the space into bins, and returns the count of the number of points in
+    each bin.  This function allows the computation of the sum, mean, median,
+    or other statistic of the values (or set of values) within each bin.
+
+    Parameters
+    ----------
+    x : (N,) array_like
+        A sequence of values to be binned.
+    values : (N,) array_like or list of (N,) array_like
+        The data on which the statistic will be computed.  This must be
+        the same shape as `x`, or a set of sequences - each the same shape as
+        `x`.  If `values` is a set of sequences, the statistic will be computed
+        on each independently.
+    statistic : string or callable, optional
+        The statistic to compute (default is 'mean').
+        The following statistics are available:
+
+          * 'mean' : compute the mean of values for points within each bin.
+            Empty bins will be represented by NaN.
+          * 'std' : compute the standard deviation within each bin. This
+            is implicitly calculated with ddof=0.
+          * 'median' : compute the median of values for points within each
+            bin. Empty bins will be represented by NaN.
+          * 'count' : compute the count of points within each bin.  This is
+            identical to an unweighted histogram.  `values` array is not
+            referenced.
+          * 'sum' : compute the sum of values for points within each bin.
+            This is identical to a weighted histogram.
+          * 'min' : compute the minimum of values for points within each bin.
+            Empty bins will be represented by NaN.
+          * 'max' : compute the maximum of values for points within each bin.
+            Empty bins will be represented by NaN.
+          * function : a user-defined function which takes a 1D array of
+            values, and outputs a single numerical statistic. This function
+            will be called on the values in each bin.  Empty bins will be
+            represented by function([]), or NaN if this returns an error.
+
+    bins : int or sequence of scalars, optional
+        If `bins` is an int, it defines the number of equal-width bins in the
+        given range (10 by default).  If `bins` is a sequence, it defines the
+        bin edges, including the rightmost edge, allowing for non-uniform bin
+        widths.  Values in `x` that are smaller than the lowest bin edge are
+        assigned to bin number 0, values beyond the highest bin edge are
+        assigned to bin number ``len(bins)``.  If the bin edges are specified,
+        the number of bins will be ``nx = len(bins) - 1``.
+    range : (float, float) or [(float, float)], optional
+        The lower and upper range of the bins.  If not provided, range
+        is simply ``(x.min(), x.max())``.  Values outside the range are
+        ignored.
+
+    Returns
+    -------
+    statistic : array
+        The values of the selected statistic in each bin.
+    bin_edges : array of dtype float
+        Return the bin edges ``(length(statistic)+1)``.
+    binnumber : 1-D ndarray of ints
+        Indices of the bins (corresponding to `bin_edges`) in which each value
+        of `x` belongs.  Same length as `values`.  A binnumber of `i` means the
+        corresponding value is between (bin_edges[i-1], bin_edges[i]).
+
+    See Also
+    --------
+    numpy.digitize, numpy.histogram, binned_statistic_2d, binned_statistic_dd
+
+    Notes
+    -----
+    All but the last (righthand-most) bin are half-open.  In other words, if
+    `bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
+    but excluding 2) and the second ``[2, 3)``.  The last bin, however, is
+    ``[3, 4]``, which *includes* 4.
+
+    .. versionadded:: 0.11.0
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+
+    First some basic examples:
+
+    Create two evenly spaced bins in the range of the given sample, and sum the
+    corresponding values in each of those bins:
+
+    >>> values = [1.0, 1.0, 2.0, 1.5, 3.0]
+    >>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
+    BinnedStatisticResult(statistic=array([4. , 4.5]),
+            bin_edges=array([1., 4., 7.]), binnumber=array([1, 1, 1, 2, 2]))
+
+    Multiple arrays of values can also be passed.  The statistic is calculated
+    on each set independently:
+
+    >>> values = [[1.0, 1.0, 2.0, 1.5, 3.0], [2.0, 2.0, 4.0, 3.0, 6.0]]
+    >>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
+    BinnedStatisticResult(statistic=array([[4. , 4.5],
+           [8. , 9. ]]), bin_edges=array([1., 4., 7.]),
+           binnumber=array([1, 1, 1, 2, 2]))
+
+    >>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean',
+    ...                        bins=3)
+    BinnedStatisticResult(statistic=array([1., 2., 4.]),
+            bin_edges=array([1., 2., 3., 4.]),
+            binnumber=array([1, 2, 1, 2, 3]))
+
+    As a second example, we now generate some random data of sailing boat speed
+    as a function of wind speed, and then determine how fast our boat is for
+    certain wind speeds:
+
+    >>> rng = np.random.default_rng()
+    >>> windspeed = 8 * rng.random(500)
+    >>> boatspeed = .3 * windspeed**.5 + .2 * rng.random(500)
+    >>> bin_means, bin_edges, binnumber = stats.binned_statistic(windspeed,
+    ...                 boatspeed, statistic='median', bins=[1,2,3,4,5,6,7])
+    >>> plt.figure()
+    >>> plt.plot(windspeed, boatspeed, 'b.', label='raw data')
+    >>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=5,
+    ...            label='binned statistic of data')
+    >>> plt.legend()
+
+    Now we can use ``binnumber`` to select all datapoints with a windspeed
+    below 1:
+
+    >>> low_boatspeed = boatspeed[binnumber == 0]
+
+    As a final example, we will use ``bin_edges`` and ``binnumber`` to make a
+    plot of a distribution that shows the mean and distribution around that
+    mean per bin, on top of a regular histogram and the probability
+    distribution function:
+
+    >>> x = np.linspace(0, 5, num=500)
+    >>> x_pdf = stats.maxwell.pdf(x)
+    >>> samples = stats.maxwell.rvs(size=10000)
+
+    >>> bin_means, bin_edges, binnumber = stats.binned_statistic(x, x_pdf,
+    ...         statistic='mean', bins=25)
+    >>> bin_width = (bin_edges[1] - bin_edges[0])
+    >>> bin_centers = bin_edges[1:] - bin_width/2
+
+    >>> plt.figure()
+    >>> plt.hist(samples, bins=50, density=True, histtype='stepfilled',
+    ...          alpha=0.2, label='histogram of data')
+    >>> plt.plot(x, x_pdf, 'r-', label='analytical pdf')
+    >>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=2,
+    ...            label='binned statistic of data')
+    >>> plt.plot((binnumber - 0.5) * bin_width, x_pdf, 'g.', alpha=0.5)
+    >>> plt.legend(fontsize=10)
+    >>> plt.show()
+
+    """
+    # A scalar `bins` has no len() and is forwarded unchanged, as is a
+    # length-1 sequence; any other sequence is taken as the bin edges.
+    try:
+        n_spec = len(bins)
+    except TypeError:
+        n_spec = 1
+    if n_spec != 1:
+        bins = [np.asarray(bins, float)]
+    # A bare (lower, upper) pair is wrapped into a one-dimension list.
+    if range is not None and len(range) == 2:
+        range = [range]
+
+    stat, edges, binnumbers = binned_statistic_dd(
+        [x], values, statistic, bins, range)
+
+    return BinnedStatisticResult(stat, edges[0], binnumbers)
+
+
+BinnedStatistic2dResult = namedtuple('BinnedStatistic2dResult',
+                                     ('statistic', 'x_edge', 'y_edge',
+                                      'binnumber'))  # returned by binned_statistic_2d
+
+
+@xp_capabilities(np_only=True)
+def binned_statistic_2d(x, y, values, statistic='mean',
+                        bins=10, range=None, expand_binnumbers=False):
+    """
+    Compute a bidimensional binned statistic for one or more sets of data.
+
+    This is a generalization of a histogram2d function.  A histogram divides
+    the space into bins, and returns the count of the number of points in
+    each bin.  This function allows the computation of the sum, mean, median,
+    or other statistic of the values (or set of values) within each bin.
+
+    Parameters
+    ----------
+    x : (N,) array_like
+        A sequence of values to be binned along the first dimension.
+    y : (N,) array_like
+        A sequence of values to be binned along the second dimension.
+    values : (N,) array_like or list of (N,) array_like
+        The data on which the statistic will be computed.  This must be
+        the same shape as `x`, or a list of sequences - each with the same
+        shape as `x`.  If `values` is such a list, the statistic will be
+        computed on each independently.
+    statistic : string or callable, optional
+        The statistic to compute (default is 'mean').
+        The following statistics are available:
+
+          * 'mean' : compute the mean of values for points within each bin.
+            Empty bins will be represented by NaN.
+          * 'std' : compute the standard deviation within each bin. This
+            is implicitly calculated with ddof=0.
+          * 'median' : compute the median of values for points within each
+            bin. Empty bins will be represented by NaN.
+          * 'count' : compute the count of points within each bin.  This is
+            identical to an unweighted histogram.  `values` array is not
+            referenced.
+          * 'sum' : compute the sum of values for points within each bin.
+            This is identical to a weighted histogram.
+          * 'min' : compute the minimum of values for points within each bin.
+            Empty bins will be represented by NaN.
+          * 'max' : compute the maximum of values for points within each bin.
+            Empty bins will be represented by NaN.
+          * function : a user-defined function which takes a 1D array of
+            values, and outputs a single numerical statistic. This function
+            will be called on the values in each bin.  Empty bins will be
+            represented by function([]), or NaN if this returns an error.
+
+    bins : int or [int, int] or array_like or [array, array], optional
+        The bin specification:
+
+          * the number of bins for the two dimensions (nx = ny = bins),
+          * the number of bins in each dimension (nx, ny = bins),
+          * the bin edges for the two dimensions (x_edge = y_edge = bins),
+          * the bin edges in each dimension (x_edge, y_edge = bins).
+
+        If the bin edges are specified, the number of bins will be,
+        (nx = len(x_edge)-1, ny = len(y_edge)-1).
+
+    range : (2,2) array_like, optional
+        The leftmost and rightmost edges of the bins along each dimension
+        (if not specified explicitly in the `bins` parameters):
+        [[xmin, xmax], [ymin, ymax]]. All values outside of this range will be
+        considered outliers and not tallied in the histogram.
+    expand_binnumbers : bool, optional
+        'False' (default): the returned `binnumber` is a shape (N,) array of
+        linearized bin indices.
+        'True': the returned `binnumber` is 'unraveled' into a shape (2,N)
+        ndarray, where each row gives the bin numbers in the corresponding
+        dimension.
+        See the `binnumber` returned value, and the `Examples` section.
+
+        .. versionadded:: 0.17.0
+
+    Returns
+    -------
+    statistic : (nx, ny) ndarray
+        The values of the selected statistic in each two-dimensional bin.
+    x_edge : (nx + 1) ndarray
+        The bin edges along the first dimension.
+    y_edge : (ny + 1) ndarray
+        The bin edges along the second dimension.
+    binnumber : (N,) array of ints or (2,N) ndarray of ints
+        This assigns to each point ``(x[i], y[i])`` an integer that represents
+        the bin in which this observation falls.  The representation depends on
+        the `expand_binnumbers` argument.  See `Notes` for details.
+
+
+    See Also
+    --------
+    numpy.digitize, numpy.histogram2d, binned_statistic, binned_statistic_dd
+
+    Notes
+    -----
+    Binedges:
+    All but the last (righthand-most) bin are half-open.  In other words, if
+    `bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
+    but excluding 2) and the second ``[2, 3)``.  The last bin, however, is
+    ``[3, 4]``, which *includes* 4.
+
+    `binnumber`:
+    This returned argument assigns to each point ``(x[i], y[i])`` an integer
+    that represents the bin in which it belongs.  The representation depends on
+    the `expand_binnumbers` argument. If 'False' (default): The returned
+    `binnumber` is a shape (N,) array of linearized indices mapping each
+    point to its corresponding bin (using row-major ordering).
+    Note that the returned linearized bin indices are used for an array with
+    extra bins on the outer binedges to capture values outside of the defined
+    bin bounds.
+    If 'True': The returned `binnumber` is a shape (2,N) ndarray where
+    each row indicates bin placements for each dimension respectively.  In each
+    dimension, a binnumber of `i` means the corresponding value is between
+    (D_edge[i-1], D_edge[i]), where 'D' is either 'x' or 'y'.
+
+    .. versionadded:: 0.11.0
+
+    Examples
+    --------
+    >>> from scipy import stats
+
+    Calculate the counts with explicit bin-edges:
+
+    >>> x = [0.1, 0.1, 0.1, 0.6]
+    >>> y = [2.1, 2.6, 2.1, 2.1]
+    >>> binx = [0.0, 0.5, 1.0]
+    >>> biny = [2.0, 2.5, 3.0]
+    >>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny])
+    >>> ret.statistic
+    array([[2., 1.],
+           [1., 0.]])
+
+    The bin in which each sample is placed is given by the `binnumber`
+    returned parameter.  By default, these are the linearized bin indices:
+
+    >>> ret.binnumber
+    array([5, 6, 5, 9])
+
+    The bin indices can also be expanded into separate entries for each
+    dimension using the `expand_binnumbers` parameter:
+
+    >>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny],
+    ...                                 expand_binnumbers=True)
+    >>> ret.binnumber
+    array([[1, 1, 1, 2],
+           [1, 2, 1, 1]])
+
+    Which shows that the first three elements belong in the xbin 1, and the
+    fourth into xbin 2; and so on for y.
+
+    """
+
+    # This code is based on np.histogram2d
+    try:
+        n_spec = len(bins)
+    except TypeError:
+        n_spec = 1
+
+    if n_spec not in (1, 2):
+        # A single array of edges: use the same edges for both dimensions.
+        shared_edges = np.asarray(bins, float)
+        bins = [shared_edges, shared_edges]
+    stat, edges, binnumbers = binned_statistic_dd(
+        [x, y], values, statistic, bins, range,
+        expand_binnumbers=expand_binnumbers)
+
+    return BinnedStatistic2dResult(stat, edges[0], edges[1], binnumbers)
+
+
+BinnedStatisticddResult = namedtuple('BinnedStatisticddResult',
+                                     ('statistic', 'bin_edges',
+                                      'binnumber'))  # returned by binned_statistic_dd
+
+
+def _bincount(x, weights):
+    """`np.bincount` wrapper that also accepts complex-valued `weights`."""
+    if not np.iscomplexobj(weights):
+        return np.bincount(x, weights)
+
+    # Complex weights: bin the real and imaginary parts separately.
+    real_part = np.bincount(x, np.real(weights))
+    imag_part = np.bincount(x, np.imag(weights))
+    return real_part + imag_part*1j
+
+
+@xp_capabilities(np_only=True)
+def binned_statistic_dd(sample, values, statistic='mean',
+                        bins=10, range=None, expand_binnumbers=False,
+                        binned_statistic_result=None):
+    """
+    Compute a multidimensional binned statistic for a set of data.
+
+    This is a generalization of a histogramdd function.  A histogram divides
+    the space into bins, and returns the count of the number of points in
+    each bin.  This function allows the computation of the sum, mean, median,
+    or other statistic of the values within each bin.
+
+    Parameters
+    ----------
+    sample : array_like
+        Data to histogram passed as a sequence of D arrays of length N, or
+        as an (N,D) array.
+    values : (N,) array_like or list of (N,) array_like
+        The data on which the statistic will be computed.  This must have
+        the same length N as `sample`, or be a list of sequences - each of
+        length N.  If `values` is such a list, the statistic
+        will be computed on each independently.
+    statistic : string or callable, optional
+        The statistic to compute (default is 'mean').
+        The following statistics are available:
+
+          * 'mean' : compute the mean of values for points within each bin.
+            Empty bins will be represented by NaN.
+          * 'median' : compute the median of values for points within each
+            bin. Empty bins will be represented by NaN.
+          * 'count' : compute the count of points within each bin.  This is
+            identical to an unweighted histogram.  `values` array is not
+            referenced.
+          * 'sum' : compute the sum of values for points within each bin.
+            This is identical to a weighted histogram.
+          * 'std' : compute the standard deviation within each bin. This
+            is implicitly calculated with ddof=0. A bin holding a single
+            value gets a standard deviation of 0; empty bins will be
+            represented by NaN.
+          * 'min' : compute the minimum of values for points within each bin.
+            Empty bins will be represented by NaN.
+          * 'max' : compute the maximum of values for points within each bin.
+            Empty bins will be represented by NaN.
+          * function : a user-defined function which takes a 1D array of
+            values, and outputs a single numerical statistic. This function
+            will be called on the values in each bin.  Empty bins will be
+            represented by function([]), or NaN if this returns an error.
+
+    bins : sequence or positive int, optional
+        The bin specification must be in one of the following forms:
+
+          * A sequence of arrays describing the bin edges along each dimension.
+          * The number of bins for each dimension (nx, ny, ... = bins).
+          * The number of bins for all dimensions (nx = ny = ... = bins).
+    range : sequence, optional
+        A sequence of lower and upper bin edges to be used if the edges are
+        not given explicitly in `bins`. Defaults to the minimum and maximum
+        values along each dimension.
+    expand_binnumbers : bool, optional
+        'False' (default): the returned `binnumber` is a shape (N,) array of
+        linearized bin indices.
+        'True': the returned `binnumber` is 'unraveled' into a shape (D,N)
+        ndarray, where each row gives the bin numbers in the corresponding
+        dimension.
+        See the `binnumber` returned value, and the `Examples` section of
+        `binned_statistic_2d`.
+    binned_statistic_result : BinnedStatisticddResult, optional
+        Result of a previous call to the function in order to reuse bin edges
+        and bin numbers with new values and/or a different statistic.
+        To reuse bin numbers, `expand_binnumbers` must have been set to False
+        (the default).
+
+        .. versionadded:: 0.17.0
+
+    Returns
+    -------
+    statistic : ndarray, shape(nx1, nx2, nx3,...)
+        The values of the selected statistic in each bin.
+    bin_edges : list of ndarrays
+        A list of D arrays describing the (nxi + 1) bin edges for each
+        dimension.
+    binnumber : (N,) array of ints or (D,N) ndarray of ints
+        This assigns to each element of `sample` an integer that represents the
+        bin in which this observation falls.  The representation depends on the
+        `expand_binnumbers` argument.  See `Notes` for details.
+
+
+    See Also
+    --------
+    numpy.digitize, numpy.histogramdd, binned_statistic, binned_statistic_2d
+
+    Notes
+    -----
+    Binedges:
+    All but the last (righthand-most) bin are half-open in each dimension.  In
+    other words, if `bins` is ``[1, 2, 3, 4]``, then the first bin is
+    ``[1, 2)`` (including 1, but excluding 2) and the second ``[2, 3)``.  The
+    last bin, however, is ``[3, 4]``, which *includes* 4.
+
+    `binnumber`:
+    This returned argument assigns to each element of `sample` an integer that
+    represents the bin in which it belongs.  The representation depends on the
+    `expand_binnumbers` argument. If 'False' (default): The returned
+    `binnumber` is a shape (N,) array of linearized indices mapping each
+    element of `sample` to its corresponding bin (using row-major ordering).
+    If 'True': The returned `binnumber` is a shape (D,N) ndarray where
+    each row indicates bin placements for each dimension respectively.  In each
+    dimension, a binnumber of `i` means the corresponding value is between
+    (bin_edges[D][i-1], bin_edges[D][i]), for each dimension 'D'.
+
+    .. versionadded:: 0.11.0
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+    >>> from mpl_toolkits.mplot3d import Axes3D
+
+    Take an array of 600 (x, y) coordinates as an example.
+    `binned_statistic_dd` can handle arrays of higher dimension `D`. But a plot
+    of dimension `D+1` is required.
+
+    >>> mu = np.array([0., 1.])
+    >>> sigma = np.array([[1., -0.5],[-0.5, 1.5]])
+    >>> multinormal = stats.multivariate_normal(mu, sigma)
+    >>> data = multinormal.rvs(size=600, random_state=235412)
+    >>> data.shape
+    (600, 2)
+
+    Create bins and count how many arrays fall in each bin:
+
+    >>> N = 60
+    >>> x = np.linspace(-3, 3, N)
+    >>> y = np.linspace(-3, 4, N)
+    >>> ret = stats.binned_statistic_dd(data, np.arange(600), bins=[x, y],
+    ...                                 statistic='count')
+    >>> bincounts = ret.statistic
+
+    Set the volume and the location of bars:
+
+    >>> dx = x[1] - x[0]
+    >>> dy = y[1] - y[0]
+    >>> x, y = np.meshgrid(x[:-1]+dx/2, y[:-1]+dy/2)
+    >>> z = 0
+
+    >>> bincounts = bincounts.ravel()
+    >>> x = x.ravel()
+    >>> y = y.ravel()
+
+    >>> fig = plt.figure()
+    >>> ax = fig.add_subplot(111, projection='3d')
+    >>> with np.errstate(divide='ignore'):   # silence random axes3d warning
+    ...     ax.bar3d(x, y, z, dx, dy, bincounts)
+
+    Reuse bin numbers and bin edges with new values:
+
+    >>> ret2 = stats.binned_statistic_dd(data, -np.arange(600),
+    ...                                  binned_statistic_result=ret,
+    ...                                  statistic='mean')
+    """
+    known_stats = ['mean', 'median', 'count', 'sum', 'std', 'min', 'max']
+    if not callable(statistic) and statistic not in known_stats:
+        raise ValueError(f'invalid statistic {statistic!r}')
+
+    try:
+        bins = index(bins)
+    except TypeError:
+        # bins is not an integer
+        pass
+    # If bins was an integer-like object, now it is an actual Python int.
+
+    # NOTE: for _bin_edges(), see e.g. gh-11365
+    if isinstance(bins, int) and not np.isfinite(sample).all():
+        raise ValueError(f'{sample!r} contains non-finite values.')
+
+    # `Ndim` is the number of dimensions (e.g. `2` for `binned_statistic_2d`)
+    # `Dlen` is the number of points, i.e. the length along each dimension.
+    # This code is based on np.histogramdd
+    try:
+        # `sample` is already a 2-D array of shape (Dlen, Ndim).
+        Dlen, Ndim = sample.shape
+    except (AttributeError, ValueError):
+        # `sample` is a sequence of 1D arrays: stack and transpose it.
+        sample = np.atleast_2d(sample).T
+        Dlen, Ndim = sample.shape
+
+    # Store initial shape of `values` to preserve it in the output
+    values = np.asarray(values)
+    input_shape = list(values.shape)
+    # Make sure that `values` is 2D to iterate over rows
+    values = np.atleast_2d(values)
+    Vdim, Vlen = values.shape
+
+    # Make sure `values` match `sample`
+    if statistic != 'count' and Vlen != Dlen:
+        raise AttributeError('The number of `values` elements must match the '
+                             'length of each `sample` dimension.')
+
+    try:
+        M = len(bins)
+        if M != Ndim:
+            raise AttributeError('The dimension of bins must be equal '
+                                 'to the dimension of the sample x.')
+    except TypeError:
+        bins = Ndim * [bins]
+
+    if binned_statistic_result is None:
+        nbin, edges, dedges = _bin_edges(sample, bins, range)
+        binnumbers = _bin_numbers(sample, nbin, edges, dedges)
+    else:
+        edges = binned_statistic_result.bin_edges
+        nbin = np.array([len(edges[i]) + 1 for i in builtins.range(Ndim)])
+        # (above) len(edges) + 1 accounts for the two outlier bins
+        dedges = [np.diff(edges[i]) for i in builtins.range(Ndim)]
+        binnumbers = binned_statistic_result.binnumber
+
+    # Avoid overflow with double precision. Complex `values` -> `complex128`.
+    result_type = np.result_type(values, np.float64)
+    result = np.empty([Vdim, nbin.prod()], dtype=result_type)
+
+    if statistic in {'mean', np.mean}:
+        result.fill(np.nan)
+        flatcount = _bincount(binnumbers, None)
+        a = flatcount.nonzero()
+        for vv in builtins.range(Vdim):
+            flatsum = _bincount(binnumbers, values[vv])
+            result[vv, a] = flatsum[a] / flatcount[a]
+    elif statistic in {'std', np.std}:
+        result.fill(np.nan)
+        flatcount = _bincount(binnumbers, None)
+        a = flatcount.nonzero()
+        for vv in builtins.range(Vdim):
+            flatsum = _bincount(binnumbers, values[vv])
+            delta = values[vv] - flatsum[binnumbers] / flatcount[binnumbers]
+            std = np.sqrt(
+                _bincount(binnumbers, delta*np.conj(delta))[a] / flatcount[a]
+            )
+            result[vv, a] = std
+        result = np.real(result)
+    elif statistic == 'count':
+        result = np.empty([Vdim, nbin.prod()], dtype=np.float64)
+        result.fill(0)
+        flatcount = _bincount(binnumbers, None)
+        a = np.arange(len(flatcount))
+        result[:, a] = flatcount[np.newaxis, :]
+    elif statistic in {'sum', np.sum}:
+        result.fill(0)
+        for vv in builtins.range(Vdim):
+            flatsum = _bincount(binnumbers, values[vv])
+            a = np.arange(len(flatsum))
+            result[vv, a] = flatsum
+    elif statistic in {'median', np.median}:
+        result.fill(np.nan)
+        for vv in builtins.range(Vdim):
+            i = np.lexsort((values[vv], binnumbers))
+            _, j, counts = np.unique(binnumbers[i],
+                                     return_index=True, return_counts=True)
+            mid = j + (counts - 1) / 2
+            mid_a = values[vv, i][np.floor(mid).astype(int)]
+            mid_b = values[vv, i][np.ceil(mid).astype(int)]
+            medians = (mid_a + mid_b) / 2
+            result[vv, binnumbers[i][j]] = medians
+    elif statistic in {'min', np.min}:
+        result.fill(np.nan)
+        for vv in builtins.range(Vdim):
+            i = np.argsort(values[vv])[::-1]  # Reversed so the min is last
+            result[vv, binnumbers[i]] = values[vv, i]
+    elif statistic in {'max', np.max}:
+        result.fill(np.nan)
+        for vv in builtins.range(Vdim):
+            i = np.argsort(values[vv])
+            result[vv, binnumbers[i]] = values[vv, i]
+    elif callable(statistic):
+        with np.errstate(invalid='ignore'), catch_warnings():
+            simplefilter("ignore", RuntimeWarning)
+            try:
+                null = statistic([])
+            except Exception:
+                null = np.nan
+        if np.iscomplexobj(null):
+            result = result.astype(np.complex128)
+        result.fill(null)
+        try:
+            _calc_binned_statistic(
+                Vdim, binnumbers, result, values, statistic
+            )
+        except ValueError:
+            result = result.astype(np.complex128)
+            _calc_binned_statistic(
+                Vdim, binnumbers, result, values, statistic
+            )
+
+    # Shape into a proper matrix
+    result = result.reshape(np.append(Vdim, nbin))
+
+    # Remove outliers (indices 0 and -1 for each bin-dimension).
+    core = tuple([slice(None)] + Ndim * [slice(1, -1)])
+    result = result[core]
+
+    # Unravel binnumbers into an ndarray, each row the bins for each dimension
+    if expand_binnumbers and Ndim > 1:
+        binnumbers = np.asarray(np.unravel_index(binnumbers, nbin))
+
+    if np.any(result.shape[1:] != nbin - 2):
+        raise RuntimeError('Internal Shape Error')
+
+    # Reshape to have output (`result`) match input (`values`) shape
+    result = result.reshape(input_shape[:-1] + list(nbin-2))
+
+    return BinnedStatisticddResult(result, edges, binnumbers)
+
+
+def _calc_binned_statistic(Vdim, bin_numbers, result, values, stat_func):
+    """Fill ``result[row, b]`` in place with `stat_func` of the data in bin b."""
+    occupied = np.unique(bin_numbers)
+    for row in builtins.range(Vdim):
+        grouped = _create_binned_data(bin_numbers, occupied, values, row)
+        for b in occupied:
+            stat = stat_func(np.array(grouped[b]))
+            if not np.iscomplexobj(result) and np.iscomplexobj(stat):
+                raise ValueError("The statistic function returns complex ")
+            result[row, b] = stat
+
+
+def _create_binned_data(bin_numbers, unique_bin_numbers, values, vv):
+    """Group row `vv` of `values` by bin number.
+
+    Returns a dict that maps each entry of `unique_bin_numbers` to the
+    list of ``values[vv, k]`` for which ``bin_numbers[k]`` equals that
+    entry, in order of appearance.
+    """
+    grouped = {bin_id: [] for bin_id in unique_bin_numbers}
+    for k, bin_id in enumerate(bin_numbers):
+        grouped[bin_id].append(values[vv, k])
+    return grouped
+
+
+def _bin_edges(sample, bins=None, range=None):
+    """Build the bin edges for every dimension of `sample`.
+
+    Returns ``(nbin, edges, dedges)``: bin counts including the two outlier
+    bins, the per-dimension edge arrays, and the spacing between edges.
+    """
+    Dlen, Ndim = sample.shape
+
+    # Lower/upper limits per dimension; only used where `bins[i]` is a count.
+    if range is None:
+        smin = np.atleast_1d(np.array(sample.min(axis=0), float))
+        smax = np.atleast_1d(np.array(sample.max(axis=0), float))
+    else:
+        if len(range) != Ndim:
+            raise ValueError(
+                f"range given for {len(range)} dimensions; {Ndim} required")
+        smin = np.empty(Ndim)
+        smax = np.empty(Ndim)
+        for i in builtins.range(Ndim):
+            if range[i][1] < range[i][0]:
+                raise ValueError(
+                    f"In {f'dimension {i + 1} of ' if Ndim > 1 else ''}range,"
+                    " start must be <= stop")
+            smin[i], smax[i] = range[i]
+
+    # Widen zero-width limits by half a unit on each side.
+    degenerate = smin == smax
+    smin[degenerate] -= .5
+    smax[degenerate] += .5
+
+    # Preserve sample floating point precision in bin edges
+    if np.issubdtype(sample.dtype, np.floating):
+        edges_dtype = sample.dtype
+    else:
+        edges_dtype = float
+
+    nbin = np.empty(Ndim, int)    # bins per dimension, outlier bins included
+    edges = Ndim * [None]         # one 1-D array of bin edges per dimension
+    dedges = Ndim * [None]        # one 1-D array of edge spacings per dimension
+    for i in builtins.range(Ndim):
+        spec = bins[i]
+        if np.isscalar(spec):
+            nbin[i] = spec + 2  # +2 for outlier bins
+            edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1,
+                                   dtype=edges_dtype)
+        else:
+            edges[i] = np.asarray(spec, edges_dtype)
+            nbin[i] = len(edges[i]) + 1  # +1 for outlier bins
+        dedges[i] = np.diff(edges[i])
+
+    return np.asarray(nbin), edges, dedges
+
+
+def _bin_numbers(sample, nbin, edges, dedges):
+    """Return the linearized bin index of each row of `sample`.
+
+    Indices are computed per dimension and combined with `nbin` as shape.
+    """
+    Dlen, Ndim = sample.shape
+
+    per_dim = [np.digitize(sample[:, d], edges[d]) for d in range(Ndim)]
+
+    # `digitize` puts a value lying on an edge into the bin to its right.
+    # Values equal to the rightmost edge must land in the last bin instead
+    # of the outlier bin, so they are moved back by one below.
+    for d in range(Ndim):
+        # Find the rounding precision
+        smallest_gap = dedges[d].min()
+        if smallest_gap == 0:
+            raise ValueError('The smallest edge difference is numerically 0.')
+        decimal = int(-np.log10(smallest_gap)) + 6
+        column, last_edge = sample[:, d], edges[d][-1]
+        # Mask of the points lying on the rightmost edge (up to rounding).
+        on_last_edge = ((column >= last_edge) &
+                        (np.around(column, decimal) ==
+                         np.around(last_edge, decimal)))
+        # Shift these points one bin to the left.
+        per_dim[d][on_last_edge] -= 1
+
+    # Compute the sample indices in the flattened statistic matrix.
+    binnumbers = np.ravel_multi_index(per_dim, nbin)
+
+    return binnumbers
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_binomtest.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_binomtest.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6a13a6b5aa86009f9d04238c308beaa34cee606
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_binomtest.py
@@ -0,0 +1,377 @@
+from math import sqrt
+import numpy as np
+from scipy._lib._array_api import xp_capabilities
+from scipy._lib._util import _validate_int
+from scipy.optimize import brentq
+from scipy.special import ndtri
+from ._discrete_distns import binom
+from ._common import ConfidenceInterval
+
+
+class BinomTestResult:
+    """
+    Result of `scipy.stats.binomtest`.
+
+    Attributes
+    ----------
+    k : int
+        The number of successes (copied from `binomtest` input).
+    n : int
+        The number of trials (copied from `binomtest` input).
+    alternative : str
+        Indicates the alternative hypothesis specified in the input
+        to `binomtest`.  It will be one of ``'two-sided'``, ``'greater'``,
+        or ``'less'``.
+    statistic : float
+        The estimate of the proportion of successes.
+    pvalue : float
+        The p-value of the hypothesis test.
+
+    """
+    def __init__(self, k, n, alternative, statistic, pvalue):
+        self.k = k
+        self.n = n
+        self.alternative = alternative
+        self.statistic = statistic
+        self.pvalue = pvalue
+
+        # add alias for backward compatibility
+        self.proportion_estimate = statistic
+
+    def __repr__(self):
+        s = ("BinomTestResult("
+             f"k={self.k}, "
+             f"n={self.n}, "
+             f"alternative={self.alternative!r}, "
+             f"statistic={self.statistic}, "
+             f"pvalue={self.pvalue})")
+        return s
+
+    def proportion_ci(self, confidence_level=0.95, method='exact'):
+        """
+        Compute the confidence interval for ``statistic``.
+
+        Parameters
+        ----------
+        confidence_level : float, optional
+            Confidence level for the computed confidence interval
+            of the estimated proportion. Default is 0.95.
+        method : {'exact', 'wilson', 'wilsoncc'}, optional
+            Selects the method used to compute the confidence interval
+            for the estimate of the proportion:
+
+            'exact' :
+                Use the Clopper-Pearson exact method [1]_.
+            'wilson' :
+                Wilson's method, without continuity correction ([2]_, [3]_).
+            'wilsoncc' :
+                Wilson's method, with continuity correction ([2]_, [3]_).
+
+            Default is ``'exact'``.
+
+        Returns
+        -------
+        ci : ``ConfidenceInterval`` object
+            The object has attributes ``low`` and ``high`` that hold the
+            lower and upper bounds of the confidence interval.
+
+        References
+        ----------
+        .. [1] C. J. Clopper and E. S. Pearson, The use of confidence or
+               fiducial limits illustrated in the case of the binomial,
+               Biometrika, Vol. 26, No. 4, pp 404-413 (Dec. 1934).
+        .. [2] E. B. Wilson, Probable inference, the law of succession, and
+               statistical inference, J. Amer. Stat. Assoc., 22, pp 209-212
+               (1927).
+        .. [3] Robert G. Newcombe, Two-sided confidence intervals for the
+               single proportion: comparison of seven methods, Statistics
+               in Medicine, 17, pp 857-872 (1998).
+
+        Examples
+        --------
+        >>> from scipy.stats import binomtest
+        >>> result = binomtest(k=7, n=50, p=0.1)
+        >>> result.statistic
+        0.14
+        >>> result.proportion_ci()
+        ConfidenceInterval(low=0.05819170033997342, high=0.26739600249700846)
+        """
+        if method not in ('exact', 'wilson', 'wilsoncc'):
+            raise ValueError(f"method ('{method}') must be one of 'exact', "
+                             "'wilson' or 'wilsoncc'.")
+        if not (0 <= confidence_level <= 1):
+            raise ValueError(f'confidence_level ({confidence_level}) must be in '
+                             'the interval [0, 1].')
+        if method == 'exact':
+            low, high = _binom_exact_conf_int(self.k, self.n,
+                                              confidence_level,
+                                              self.alternative)
+        else:
+            # method is 'wilson' or 'wilsoncc'
+            low, high = _binom_wilson_conf_int(self.k, self.n,
+                                               confidence_level,
+                                               self.alternative,
+                                               correction=method == 'wilsoncc')
+        return ConfidenceInterval(low=low, high=high)
+
+
+def _findp(func):
+    try:
+        p = brentq(func, 0, 1)
+    except RuntimeError:
+        raise RuntimeError('numerical solver failed to converge when '
+                           'computing the confidence limits') from None
+    except ValueError as exc:
+        raise ValueError('brentq raised a ValueError; report this to the '
+                         'SciPy developers') from exc
+    return p
+
+
+def _binom_exact_conf_int(k, n, confidence_level, alternative):
+    """
+    Compute the exact confidence interval for the binomial test.
+
+    Returns the lower and upper confidence limits: plow, phigh
+    """
+    if alternative == 'two-sided':
+        alpha = (1 - confidence_level) / 2
+        if k == 0:
+            plow = 0.0
+        else:
+            plow = _findp(lambda p: binom.sf(k-1, n, p) - alpha)
+        if k == n:
+            phigh = 1.0
+        else:
+            phigh = _findp(lambda p: binom.cdf(k, n, p) - alpha)
+    elif alternative == 'less':
+        alpha = 1 - confidence_level
+        plow = 0.0
+        if k == n:
+            phigh = 1.0
+        else:
+            phigh = _findp(lambda p: binom.cdf(k, n, p) - alpha)
+    elif alternative == 'greater':
+        alpha = 1 - confidence_level
+        if k == 0:
+            plow = 0.0
+        else:
+            plow = _findp(lambda p: binom.sf(k-1, n, p) - alpha)
+        phigh = 1.0
+    return plow, phigh
+
+
+def _binom_wilson_conf_int(k, n, confidence_level, alternative, correction):
+    # This function assumes that the arguments have already been validated.
+    # In particular, `alternative` must be one of 'two-sided', 'less' or
+    # 'greater'.
+    p = k / n
+    if alternative == 'two-sided':
+        z = ndtri(0.5 + 0.5*confidence_level)
+    else:
+        z = ndtri(confidence_level)
+
+    # For reference, the formulas implemented here are from
+    # Newcombe (1998) (ref. [3] in the proportion_ci docstring).
+    denom = 2*(n + z**2)
+    center = (2*n*p + z**2)/denom
+    q = 1 - p
+    if correction:
+        if alternative == 'less' or k == 0:
+            lo = 0.0
+        else:
+            dlo = (1 + z*sqrt(z**2 - 2 - 1/n + 4*p*(n*q + 1))) / denom
+            lo = center - dlo
+        if alternative == 'greater' or k == n:
+            hi = 1.0
+        else:
+            dhi = (1 + z*sqrt(z**2 + 2 - 1/n + 4*p*(n*q - 1))) / denom
+            hi = center + dhi
+    else:
+        delta = z/denom * sqrt(4*n*p*q + z**2)
+        if alternative == 'less' or k == 0:
+            lo = 0.0
+        else:
+            lo = center - delta
+        if alternative == 'greater' or k == n:
+            hi = 1.0
+        else:
+            hi = center + delta
+
+    return lo, hi
+
+
+@xp_capabilities(np_only=True)
+def binomtest(k, n, p=0.5, alternative='two-sided'):
+    """
+    Perform a test that the probability of success is p.
+
+    The binomial test [1]_ is a test of the null hypothesis that the
+    probability of success in a Bernoulli experiment is `p`.
+
+    Details of the test can be found in many texts on statistics, such
+    as section 24.5 of [2]_.
+
+    Parameters
+    ----------
+    k : int
+        The number of successes.
+    n : int
+        The number of trials.
+    p : float, optional
+        The hypothesized probability of success, i.e. the expected
+        proportion of successes.  The value must be in the interval
+        ``0 <= p <= 1``. The default value is ``p = 0.5``.
+    alternative : {'two-sided', 'greater', 'less'}, optional
+        Indicates the alternative hypothesis. The default value is
+        'two-sided'.
+
+    Returns
+    -------
+    result : `~scipy.stats._result_classes.BinomTestResult` instance
+        The return value is an object with the following attributes:
+
+        k : int
+            The number of successes (copied from `binomtest` input).
+        n : int
+            The number of trials (copied from `binomtest` input).
+        alternative : str
+            Indicates the alternative hypothesis specified in the input
+            to `binomtest`.  It will be one of ``'two-sided'``, ``'greater'``,
+            or ``'less'``.
+        statistic : float
+            The estimate of the proportion of successes.
+        pvalue : float
+            The p-value of the hypothesis test.
+
+        The object has the following methods:
+
+        proportion_ci(confidence_level=0.95, method='exact') :
+            Compute the confidence interval for ``statistic``.
+
+    Notes
+    -----
+    .. versionadded:: 1.7.0
+
+    References
+    ----------
+    .. [1] Binomial test, https://en.wikipedia.org/wiki/Binomial_test
+    .. [2] Jerrold H. Zar, Biostatistical Analysis (fifth edition),
+           Prentice Hall, Upper Saddle River, New Jersey USA (2010)
+
+    Examples
+    --------
+    >>> from scipy.stats import binomtest
+
+    A car manufacturer claims that no more than 10% of their cars are unsafe.
+    15 cars are inspected for safety, 3 were found to be unsafe. Test the
+    manufacturer's claim:
+
+    >>> result = binomtest(3, n=15, p=0.1, alternative='greater')
+    >>> result.pvalue
+    0.18406106910639114
+
+    The null hypothesis cannot be rejected at the 5% level of significance
+    because the returned p-value is greater than the critical value of 5%.
+
+    The test statistic is equal to the estimated proportion, which is simply
+    ``3/15``:
+
+    >>> result.statistic
+    0.2
+
+    We can use the `proportion_ci()` method of the result to compute the
+    confidence interval of the estimate:
+
+    >>> result.proportion_ci(confidence_level=0.95)
+    ConfidenceInterval(low=0.05684686759024681, high=1.0)
+
+    """
+    k = _validate_int(k, 'k', minimum=0)
+    n = _validate_int(n, 'n', minimum=1)
+    if k > n:
+        raise ValueError(f'k ({k}) must not be greater than n ({n}).')
+
+    if not (0 <= p <= 1):
+        raise ValueError(f"p ({p}) must be in range [0,1]")
+
+    if alternative not in ('two-sided', 'less', 'greater'):
+        raise ValueError(f"alternative ('{alternative}') not recognized; \n"
+                         "must be 'two-sided', 'less' or 'greater'")
+    if alternative == 'less':
+        pval = binom.cdf(k, n, p)
+    elif alternative == 'greater':
+        pval = binom.sf(k-1, n, p)
+    else:
+        # alternative is 'two-sided'
+        d = binom.pmf(k, n, p)
+        rerr = 1 + 1e-7
+        if k == p * n:
+            # special case as shortcut, would also be handled by `else` below
+            pval = 1.
+        elif k < p * n:
+            ix = _binary_search_for_binom_tst(lambda x1: -binom.pmf(x1, n, p),
+                                              -d*rerr, np.ceil(p * n), n)
+            # y is the number of terms between mode and n that are <= d*rerr.
+            # ix gave us the first term where a(ix) <= d*rerr < a(ix-1).
+            # If the equality doesn't hold, y=n-ix. Otherwise, we need to
+            # include ix as well, since the equality holds. Note that the
+            # equality will hold only in very rare situations due to rerr.
+            y = n - ix + int(d*rerr == binom.pmf(ix, n, p))
+            pval = binom.cdf(k, n, p) + binom.sf(n - y, n, p)
+        else:
+            ix = _binary_search_for_binom_tst(lambda x1: binom.pmf(x1, n, p),
+                                              d*rerr, 0, np.floor(p * n))
+            # y is the number of terms between 0 and mode that are <= d*rerr.
+            # we need to add a 1 to account for the 0 index.
+            # For comparing this with old behavior, see
+            # tst_binary_srch_for_binom_tst method in test_morestats.
+            y = ix + 1
+            pval = binom.cdf(y-1, n, p) + binom.sf(k-1, n, p)
+
+        pval = min(1.0, pval)
+
+    result = BinomTestResult(k=k, n=n, alternative=alternative,
+                             statistic=k/n, pvalue=pval)
+    return result
+
+
+def _binary_search_for_binom_tst(a, d, lo, hi):
+    """
+    Conducts an implicit binary search on a function specified by `a`.
+
+    Meant to be used on the binomial PMF for the case of two-sided tests
+    to obtain the value on the other side of the mode where the tail
+    probability should be computed. The values on either side of
+    the mode are always in order, meaning binary search is applicable.
+
+    Parameters
+    ----------
+    a : callable
+      The function over which to perform binary search. Its values
+      for inputs lo and hi should be in ascending order.
+    d : float
+      The value to search for.
+    lo : int
+      The lower end of range to search.
+    hi : int
+      The higher end of the range to search.
+
+    Returns
+    -------
+    int
+      The index, i between lo and hi
+      such that a(i)<=d<a(i+1)
+    """
+    while lo < hi:
+        mid = lo + (hi-lo)//2
+        midval = a(mid)
+        if midval < d:
+            lo = mid+1
+        elif midval > d:
+            hi = mid-1
+        else:
+            return mid
+    if a(lo) <= d:
+        return lo
+    else:
+        return lo-1
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_bws_test.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_bws_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf9881cd1c88887b3c78ecf1a025756edb4686b2
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_bws_test.py
@@ -0,0 +1,179 @@
+import numpy as np
+from functools import partial
+from scipy import stats
+from scipy._lib._array_api import xp_capabilities
+
+
+def _bws_input_validation(x, y, alternative, method):
+    ''' Input validation and standardization for bws test'''
+    x, y = np.atleast_1d(x, y)
+    if x.ndim > 1 or y.ndim > 1:
+        raise ValueError('`x` and `y` must be exactly one-dimensional.')
+    if np.isnan(x).any() or np.isnan(y).any():
+        raise ValueError('`x` and `y` must not contain NaNs.')
+    if np.size(x) == 0 or np.size(y) == 0:
+        raise ValueError('`x` and `y` must be of nonzero size.')
+
+    z = stats.rankdata(np.concatenate((x, y)))
+    x, y = z[:len(x)], z[len(x):]
+
+    alternatives = {'two-sided', 'less', 'greater'}
+    alternative = alternative.lower()
+    if alternative not in alternatives:
+        raise ValueError(f'`alternative` must be one of {alternatives}.')
+
+    method = stats.PermutationMethod() if method is None else method
+    if not isinstance(method, stats.PermutationMethod):
+        raise ValueError('`method` must be an instance of '
+                         '`scipy.stats.PermutationMethod`')
+
+    return x, y, alternative, method
+
+
+def _bws_statistic(x, y, alternative, axis):
+    '''Compute the BWS test statistic for two independent samples'''
+    # Public function currently does not accept `axis`, but `permutation_test`
+    # uses `axis` to make vectorized call.
+
+    Ri, Hj = np.sort(x, axis=axis), np.sort(y, axis=axis)
+    n, m = Ri.shape[axis], Hj.shape[axis]
+    i, j = np.arange(1, n+1), np.arange(1, m+1)
+
+    Bx_num = Ri - (m + n)/n * i
+    By_num = Hj - (m + n)/m * j
+
+    if alternative == 'two-sided':
+        Bx_num *= Bx_num
+        By_num *= By_num
+    else:
+        Bx_num *= np.abs(Bx_num)
+        By_num *= np.abs(By_num)
+
+    Bx_den = i/(n+1) * (1 - i/(n+1)) * m*(m+n)/n
+    By_den = j/(m+1) * (1 - j/(m+1)) * n*(m+n)/m
+
+    Bx = 1/n * np.sum(Bx_num/Bx_den, axis=axis)
+    By = 1/m * np.sum(By_num/By_den, axis=axis)
+
+    B = (Bx + By) / 2 if alternative == 'two-sided' else (Bx - By) / 2
+
+    return B
+
+
+@xp_capabilities(np_only=True)
+def bws_test(x, y, *, alternative="two-sided", method=None):
+    r'''Perform the Baumgartner-Weiss-Schindler test on two independent samples.
+
+    The Baumgartner-Weiss-Schindler (BWS) test is a nonparametric test of 
+    the null hypothesis that the distribution underlying sample `x` 
+    is the same as the distribution underlying sample `y`. Unlike 
+    the Kolmogorov-Smirnov, Wilcoxon, and Cramer-Von Mises tests, 
+    the BWS test weights the integral by the variance of the difference
+    in cumulative distribution functions (CDFs), emphasizing the tails of the
+    distributions, which increases the power of the test in many applications.
+
+    Parameters
+    ----------
+    x, y : array-like
+        1-d arrays of samples.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis. Default is 'two-sided'.
+        Let *F(u)* and *G(u)* be the cumulative distribution functions of the
+        distributions underlying `x` and `y`, respectively. Then the following
+        alternative hypotheses are available:
+
+        * 'two-sided': the distributions are not equal, i.e. *F(u) ≠ G(u)* for
+          at least one *u*.
+        * 'less': the distribution underlying `x` is stochastically less than
+          the distribution underlying `y`, i.e. *F(u) >= G(u)* for all *u*.
+        * 'greater': the distribution underlying `x` is stochastically greater
+          than the distribution underlying `y`, i.e. *F(u) <= G(u)* for all
+          *u*.
+
+        Under a more restrictive set of assumptions, the alternative hypotheses
+        can be expressed in terms of the locations of the distributions;
+        see [2]_ section 5.1.
+    method : PermutationMethod, optional
+        Configures the method used to compute the p-value. The default is
+        the default `PermutationMethod` object.
+
+    Returns
+    -------
+    res : PermutationTestResult
+        An object with attributes:
+
+        statistic : float
+            The observed test statistic of the data.
+        pvalue : float
+            The p-value for the given alternative.
+        null_distribution : ndarray
+            The values of the test statistic generated under the null hypothesis.
+
+    See also
+    --------
+    scipy.stats.wilcoxon, scipy.stats.mannwhitneyu, scipy.stats.ttest_ind
+
+    Notes
+    -----
+    When ``alternative=='two-sided'``, the statistic is defined by the
+    equations given in [1]_ Section 2. This statistic is not appropriate for
+    one-sided alternatives; in that case, the statistic is the *negative* of
+    that given by the equations in [1]_ Section 2. Consequently, when the
+    distribution of the first sample is stochastically greater than that of the
+    second sample, the statistic will tend to be positive.
+
+    References
+    ----------
+    .. [1] Neuhäuser, M. (2005). Exact Tests Based on the
+           Baumgartner-Weiss-Schindler Statistic: A Survey. Statistical Papers,
+           46(1), 1-29.
+    .. [2] Fay, M. P., & Proschan, M. A. (2010). Wilcoxon-Mann-Whitney or t-test?
+           On assumptions for hypothesis tests and multiple interpretations of 
+           decision rules. Statistics surveys, 4, 1.
+
+    Examples
+    --------
+    We follow the example of table 3 in [1]_: Fourteen children were divided
+    randomly into two groups. Their ranks at performing a specific test are
+    as follows.
+
+    >>> import numpy as np
+    >>> x = [1, 2, 3, 4, 6, 7, 8]
+    >>> y = [5, 9, 10, 11, 12, 13, 14]
+
+    We use the BWS test to assess whether there is a statistically significant
+    difference between the two groups.
+    The null hypothesis is that there is no difference in the distributions of
+    performance between the two groups. We decide that a significance level of
+    1% is required to reject the null hypothesis in favor of the alternative
+    that the distributions are different.
+    Since the number of samples is very small, we can compare the observed test
+    statistic against the *exact* distribution of the test statistic under the
+    null hypothesis.
+
+    >>> from scipy.stats import bws_test
+    >>> res = bws_test(x, y)
+    >>> print(res.statistic)
+    5.132167152575315
+
+    This agrees with :math:`B = 5.132` reported in [1]_. The *p*-value produced
+    by `bws_test` also agrees with :math:`p = 0.0029` reported in [1]_.
+
+    >>> print(res.pvalue)
+    0.002913752913752914
+
+    Because the p-value is below our threshold of 1%, we take this as evidence
+    against the null hypothesis in favor of the alternative that there is a
+    difference in performance between the two groups.
+    '''
+
+    x, y, alternative, method = _bws_input_validation(x, y, alternative,
+                                                      method)
+    bws_statistic = partial(_bws_statistic, alternative=alternative)
+
+    permutation_alternative = 'less' if alternative == 'less' else 'greater'
+    res = stats.permutation_test((x, y), bws_statistic,
+                                 alternative=permutation_alternative,
+                                 **method._asdict())
+
+    return res
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_censored_data.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_censored_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6fee500f1d97db0bae9ebff26824d4d894c7f39
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_censored_data.py
@@ -0,0 +1,459 @@
+import numpy as np
+
+
+def _validate_1d(a, name, allow_inf=False):
+    if np.ndim(a) != 1:
+        raise ValueError(f'`{name}` must be a one-dimensional sequence.')
+    if np.isnan(a).any():
+        raise ValueError(f'`{name}` must not contain nan.')
+    if not allow_inf and np.isinf(a).any():
+        raise ValueError(f'`{name}` must contain only finite values.')
+
+
+def _validate_interval(interval):
+    interval = np.asarray(interval)
+    if interval.shape == (0,):
+        # The input was a sequence with length 0.
+        interval = interval.reshape((0, 2))
+    if interval.ndim != 2 or interval.shape[-1] != 2:
+        raise ValueError('`interval` must be a two-dimensional array with '
+                         'shape (m, 2), where m is the number of '
+                         'interval-censored values, but got shape '
+                         f'{interval.shape}')
+
+    if np.isnan(interval).any():
+        raise ValueError('`interval` must not contain nan.')
+    if np.isinf(interval).all(axis=1).any():
+        raise ValueError('In each row in `interval`, both values must not'
+                         ' be infinite.')
+    if (interval[:, 0] > interval[:, 1]).any():
+        raise ValueError('In each row of `interval`, the left value must not'
+                         ' exceed the right value.')
+
+    uncensored_mask = interval[:, 0] == interval[:, 1]
+    left_mask = np.isinf(interval[:, 0])
+    right_mask = np.isinf(interval[:, 1])
+    interval_mask = np.isfinite(interval).all(axis=1) & ~uncensored_mask
+
+    uncensored2 = interval[uncensored_mask, 0]
+    left2 = interval[left_mask, 1]
+    right2 = interval[right_mask, 0]
+    interval2 = interval[interval_mask]
+
+    return uncensored2, left2, right2, interval2
+
+
+def _validate_x_censored(x, censored):
+    x = np.asarray(x)
+    if x.ndim != 1:
+        raise ValueError('`x` must be one-dimensional.')
+    censored = np.asarray(censored)
+    if censored.ndim != 1:
+        raise ValueError('`censored` must be one-dimensional.')
+    if (~np.isfinite(x)).any():
+        raise ValueError('`x` must not contain nan or inf.')
+    if censored.size != x.size:
+        raise ValueError('`x` and `censored` must have the same length.')
+    return x, censored.astype(bool)
+
+
+class CensoredData:
+    """
+    Instances of this class represent censored data.
+
+    Instances may be passed to the ``fit`` method of continuous
+    univariate SciPy distributions for maximum likelihood estimation.
+    The *only* method of the univariate continuous distributions that
+    understands `CensoredData` is the ``fit`` method.  An instance of
+    `CensoredData` can not be passed to methods such as ``pdf`` and
+    ``cdf``.
+
+    An observation is said to be *censored* when the precise value is unknown,
+    but it has a known upper and/or lower bound.  The conventional terminology
+    is:
+
+    * left-censored: an observation is below a certain value but it is
+      unknown by how much.
+    * right-censored: an observation is above a certain value but it is
+      unknown by how much.
+    * interval-censored: an observation lies somewhere on an interval between
+      two values.
+
+    Left-, right-, and interval-censored data can be represented by
+    `CensoredData`.
+
+    For convenience, the class methods ``left_censored`` and
+    ``right_censored`` are provided to create a `CensoredData`
+    instance from a single one-dimensional array of measurements
+    and a corresponding boolean array to indicate which measurements
+    are censored.  The class method ``interval_censored`` accepts two
+    one-dimensional arrays that hold the lower and upper bounds of the
+    intervals.
+
+    Parameters
+    ----------
+    uncensored : array_like, 1D
+        Uncensored observations.
+    left : array_like, 1D
+        Left-censored observations.
+    right : array_like, 1D
+        Right-censored observations.
+    interval : array_like, 2D, with shape (m, 2)
+        Interval-censored observations.  Each row ``interval[k, :]``
+        represents the interval for the kth interval-censored observation.
+
+    Notes
+    -----
+    In the input array `interval`, the lower bound of the interval may
+    be ``-inf``, and the upper bound may be ``inf``, but at least one must be
+    finite. When the lower bound is ``-inf``, the row represents a left-
+    censored observation, and when the upper bound is ``inf``, the row
+    represents a right-censored observation.  If the length of an interval
+    is 0 (i.e. ``interval[k, 0] == interval[k, 1]``), the observation is
+    treated as uncensored.  So one can represent all the types of censored
+    and uncensored data in ``interval``, but it is generally more convenient
+    to use `uncensored`, `left` and `right` for uncensored, left-censored and
+    right-censored observations, respectively.
+
+    Examples
+    --------
+    In the most general case, a censored data set may contain values that
+    are left-censored, right-censored, interval-censored, and uncensored.
+    For example, here we create a data set with five observations.  Two
+    are uncensored (values 1 and 1.5), one is a left-censored observation
+    of 0, one is a right-censored observation of 10 and one is
+    interval-censored in the interval [2, 3].
+
+    >>> import numpy as np
+    >>> from scipy.stats import CensoredData
+    >>> data = CensoredData(uncensored=[1, 1.5], left=[0], right=[10],
+    ...                     interval=[[2, 3]])
+    >>> print(data)
+    CensoredData(5 values: 2 not censored, 1 left-censored,
+    1 right-censored, 1 interval-censored)
+
+    Equivalently,
+
+    >>> data = CensoredData(interval=[[1, 1],
+    ...                               [1.5, 1.5],
+    ...                               [-np.inf, 0],
+    ...                               [10, np.inf],
+    ...                               [2, 3]])
+    >>> print(data)
+    CensoredData(5 values: 2 not censored, 1 left-censored,
+    1 right-censored, 1 interval-censored)
+
+    A common case is to have a mix of uncensored observations and censored
+    observations that are all right-censored (or all left-censored). For
+    example, consider an experiment in which six devices are started at
+    various times and left running until they fail.  Assume that time is
+    measured in hours, and the experiment is stopped after 30 hours, even
+    if not all the devices have failed by that time.  We might end up with
+    data such as this::
+
+        Device  Start-time  Fail-time  Time-to-failure
+           1         0         13           13
+           2         2         24           22
+           3         5         22           17
+           4         8         23           15
+           5        10        ***          >20
+           6        12        ***          >18
+
+    Two of the devices had not failed when the experiment was stopped;
+    the observations of the time-to-failure for these two devices are
+    right-censored.  We can represent this data with
+
+    >>> data = CensoredData(uncensored=[13, 22, 17, 15], right=[20, 18])
+    >>> print(data)
+    CensoredData(6 values: 4 not censored, 2 right-censored)
+
+    Alternatively, we can use the method `CensoredData.right_censored` to
+    create a representation of this data.  The time-to-failure observations
+    are put in the list ``ttf``.  The ``censored`` list indicates which values
+    in ``ttf`` are censored.
+
+    >>> ttf = [13, 22, 17, 15, 20, 18]
+    >>> censored = [False, False, False, False, True, True]
+
+    Pass these lists to `CensoredData.right_censored` to create an
+    instance of `CensoredData`.
+
+    >>> data = CensoredData.right_censored(ttf, censored)
+    >>> print(data)
+    CensoredData(6 values: 4 not censored, 2 right-censored)
+
+    If the input data is interval censored and already stored in two
+    arrays, one holding the low end of the intervals and another
+    holding the high ends, the class method ``interval_censored`` can
+    be used to create the `CensoredData` instance.
+
+    This example creates an instance with four interval-censored values.
+    The intervals are [10, 11], [0.5, 1], [2, 3], and [12.5, 13.5].
+
+    >>> a = [10, 0.5, 2, 12.5]  # Low ends of the intervals
+    >>> b = [11, 1.0, 3, 13.5]  # High ends of the intervals
+    >>> data = CensoredData.interval_censored(low=a, high=b)
+    >>> print(data)
+    CensoredData(4 values: 0 not censored, 4 interval-censored)
+
+    Finally, we create and censor some data from the `weibull_min`
+    distribution, and then fit `weibull_min` to that data. We'll assume
+    that the location parameter is known to be 0.
+
+    >>> from scipy.stats import weibull_min
+    >>> rng = np.random.default_rng()
+
+    Create the random data set.
+
+    >>> x = weibull_min.rvs(2.5, loc=0, scale=30, size=250, random_state=rng)
+    >>> x[x > 40] = 40  # Right-censor values greater than or equal to 40.
+
+    Create the `CensoredData` instance with the `right_censored` method.
+    The censored values are those where the value is 40.
+
+    >>> data = CensoredData.right_censored(x, x == 40)
+    >>> print(data)
+    CensoredData(250 values: 215 not censored, 35 right-censored)
+
+    35 values have been right-censored.
+
+    Fit `weibull_min` to the censored data.  We expect the shape and scale
+    to be approximately 2.5 and 30, respectively.
+
+    >>> weibull_min.fit(data, floc=0)
+    (2.3575922823897315, 0, 30.40650074451254)
+
+    """
+
+    def __init__(self, uncensored=None, *, left=None, right=None,
+                 interval=None):
+        if uncensored is None:
+            uncensored = []
+        if left is None:
+            left = []
+        if right is None:
+            right = []
+        if interval is None:
+            interval = np.empty((0, 2))
+
+        _validate_1d(uncensored, 'uncensored')
+        _validate_1d(left, 'left')
+        _validate_1d(right, 'right')
+        uncensored2, left2, right2, interval2 = _validate_interval(interval)
+
+        self._uncensored = np.concatenate((uncensored, uncensored2))
+        self._left = np.concatenate((left, left2))
+        self._right = np.concatenate((right, right2))
+        # Note that by construction, the private attribute _interval
+        # will be a 2D array that contains only finite values representing
+        # intervals with nonzero but finite length.
+        self._interval = interval2
+
+    def __repr__(self):
+        uncensored_str = " ".join(np.array_repr(self._uncensored).split())
+        left_str = " ".join(np.array_repr(self._left).split())
+        right_str = " ".join(np.array_repr(self._right).split())
+        interval_str = " ".join(np.array_repr(self._interval).split())
+        return (f"CensoredData(uncensored={uncensored_str}, left={left_str}, "
+                f"right={right_str}, interval={interval_str})")
+
+    def __str__(self):
+        num_nc = len(self._uncensored)
+        num_lc = len(self._left)
+        num_rc = len(self._right)
+        num_ic = len(self._interval)
+        n = num_nc + num_lc + num_rc + num_ic
+        parts = [f'{num_nc} not censored']
+        if num_lc > 0:
+            parts.append(f'{num_lc} left-censored')
+        if num_rc > 0:
+            parts.append(f'{num_rc} right-censored')
+        if num_ic > 0:
+            parts.append(f'{num_ic} interval-censored')
+        return f'CensoredData({n} values: ' + ', '.join(parts) + ')'
+
+    # This is not a complete implementation of the arithmetic operators.
+    # All we need is subtracting a scalar and dividing by a scalar.
+
+    def __sub__(self, other):
+        return CensoredData(uncensored=self._uncensored - other,
+                            left=self._left - other,
+                            right=self._right - other,
+                            interval=self._interval - other)
+
+    def __truediv__(self, other):
+        return CensoredData(uncensored=self._uncensored / other,
+                            left=self._left / other,
+                            right=self._right / other,
+                            interval=self._interval / other)
+
+    def __len__(self):
+        """
+        The number of values (censored and not censored).
+        """
+        return (len(self._uncensored) + len(self._left) + len(self._right)
+                + len(self._interval))
+
+    def num_censored(self):
+        """
+        Number of censored values.
+        """
+        return len(self._left) + len(self._right) + len(self._interval)
+
+    @classmethod
+    def right_censored(cls, x, censored):
+        """
+        Create a `CensoredData` instance of right-censored data.
+
+        Parameters
+        ----------
+        x : array_like
+            `x` is the array of observed data or measurements.
+            `x` must be a one-dimensional sequence of finite numbers.
+        censored : array_like of bool
+            `censored` must be a one-dimensional sequence of boolean
+            values.  If ``censored[k]`` is True, the corresponding value
+            in `x` is right-censored.  That is, the value ``x[k]``
+            is the lower bound of the true (but unknown) value.
+
+        Returns
+        -------
+        data : `CensoredData`
+            An instance of `CensoredData` that represents the
+            collection of uncensored and right-censored values.
+
+        Examples
+        --------
+        >>> from scipy.stats import CensoredData
+
+        Two uncensored values (4 and 10) and two right-censored values
+        (24 and 25).
+
+        >>> data = CensoredData.right_censored([4, 10, 24, 25],
+        ...                                    [False, False, True, True])
+        >>> data
+        CensoredData(uncensored=array([ 4., 10.]),
+        left=array([], dtype=float64), right=array([24., 25.]),
+        interval=array([], shape=(0, 2), dtype=float64))
+        >>> print(data)
+        CensoredData(4 values: 2 not censored, 2 right-censored)
+        """
+        x, censored = _validate_x_censored(x, censored)
+        return cls(uncensored=x[~censored], right=x[censored])
+
+    @classmethod
+    def left_censored(cls, x, censored):
+        """
+        Create a `CensoredData` instance of left-censored data.
+
+        Parameters
+        ----------
+        x : array_like
+            `x` is the array of observed data or measurements.
+            `x` must be a one-dimensional sequence of finite numbers.
+        censored : array_like of bool
+            `censored` must be a one-dimensional sequence of boolean
+            values.  If ``censored[k]`` is True, the corresponding value
+            in `x` is left-censored.  That is, the value ``x[k]``
+            is the upper bound of the true (but unknown) value.
+
+        Returns
+        -------
+        data : `CensoredData`
+            An instance of `CensoredData` that represents the
+            collection of uncensored and left-censored values.
+
+        Examples
+        --------
+        >>> from scipy.stats import CensoredData
+
+        Two uncensored values (0.12 and 0.033) and two left-censored values
+        (both 1e-3).
+
+        >>> data = CensoredData.left_censored([0.12, 0.033, 1e-3, 1e-3],
+        ...                                   [False, False, True, True])
+        >>> data
+        CensoredData(uncensored=array([0.12 , 0.033]),
+        left=array([0.001, 0.001]), right=array([], dtype=float64),
+        interval=array([], shape=(0, 2), dtype=float64))
+        >>> print(data)
+        CensoredData(4 values: 2 not censored, 2 left-censored)
+        """
+        x, censored = _validate_x_censored(x, censored)
+        return cls(uncensored=x[~censored], left=x[censored])
+
+    @classmethod
+    def interval_censored(cls, low, high):
+        """
+        Create a `CensoredData` instance of interval-censored data.
+
+        This method is useful when all the data is interval-censored, and
+        the low and high ends of the intervals are already stored in
+        separate one-dimensional arrays.
+
+        Parameters
+        ----------
+        low : array_like
+            The one-dimensional array containing the low ends of the
+            intervals.
+        high : array_like
+            The one-dimensional array containing the high ends of the
+            intervals.
+
+        Returns
+        -------
+        data : `CensoredData`
+            An instance of `CensoredData` that represents the
+            collection of censored values.
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> from scipy.stats import CensoredData
+
+        ``a`` and ``b`` are the low and high ends of a collection of
+        interval-censored values.
+
+        >>> a = [0.5, 2.0, 3.0, 5.5]
+        >>> b = [1.0, 2.5, 3.5, 7.0]
+        >>> data = CensoredData.interval_censored(low=a, high=b)
+        >>> print(data)
+        CensoredData(4 values: 0 not censored, 4 interval-censored)
+        """
+        _validate_1d(low, 'low', allow_inf=True)
+        _validate_1d(high, 'high', allow_inf=True)
+        if len(low) != len(high):
+            raise ValueError('`low` and `high` must have the same length.')
+        interval = np.column_stack((low, high))
+        uncensored, left, right, interval = _validate_interval(interval)
+        return cls(uncensored=uncensored, left=left, right=right,
+                   interval=interval)
+
+    def _uncensor(self):
+        """
+        This function is used when a non-censored version of the data
+        is needed to create a rough estimate of the parameters of a
+        distribution via the method of moments or some similar method.
+        The data is "uncensored" by taking the given endpoints as the
+        data for the left- or right-censored data, and the mean for the
+        interval-censored data.
+        """
+        data = np.concatenate((self._uncensored, self._left, self._right,
+                               self._interval.mean(axis=1)))
+        return data
+
+    def _supported(self, a, b):
+        """
+        Return a subset of self containing the values that are in
+        (or overlap with) the interval (a, b).
+        """
+        uncensored = self._uncensored
+        uncensored = uncensored[(a < uncensored) & (uncensored < b)]
+        left = self._left
+        left = left[a < left]
+        right = self._right
+        right = right[right < b]
+        interval = self._interval
+        interval = interval[(a < interval[:, 1]) & (interval[:, 0] < b)]
+        return CensoredData(uncensored, left=left, right=right,
+                            interval=interval)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_common.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_common.py
new file mode 100644
index 0000000000000000000000000000000000000000..4011d425cc4afea3c7ee8937526b13f1f92b0850
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_common.py
@@ -0,0 +1,5 @@
+from collections import namedtuple
+
+
+ConfidenceInterval = namedtuple("ConfidenceInterval", ["low", "high"])
+ConfidenceInterval. __doc__ = "Class for confidence intervals."
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_constants.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..b539ce8146ebdbc8e08c66143461b04d742804f2
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_constants.py
@@ -0,0 +1,42 @@
+"""
+Statistics-related constants.
+
+"""
+import numpy as np
+
+
+# The difference between 1.0 and the next representable float larger than 1.0.
+_EPS = np.finfo(float).eps
+
+# The largest [in magnitude] usable floating value.
+_XMAX = np.finfo(float).max
+
+# The log of the largest usable floating value; useful for knowing
+# when exp(something) will overflow
+_LOGXMAX = np.log(_XMAX)
+
+# The smallest [in magnitude] usable (i.e. not subnormal) double precision
+# floating value.
+_XMIN = np.finfo(float).tiny
+
+# The log of the smallest [in magnitude] usable (i.e. not subnormal)
+# double precision floating value.
+_LOGXMIN = np.log(_XMIN)
+
+# -special.psi(1)
+_EULER = 0.577215664901532860606512090082402431042
+
+# special.zeta(3, 1)  Apery's constant
+_ZETA3 = 1.202056903159594285399738161511449990765
+
+# sqrt(pi)
+_SQRT_PI = 1.772453850905516027298167483341145182798
+
+# sqrt(2/pi)
+_SQRT_2_OVER_PI = 0.7978845608028654
+
+# log(pi)
+_LOG_PI = 1.1447298858494002
+
+# log(sqrt(2/pi))
+_LOG_SQRT_2_OVER_PI = -0.22579135264472744
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_continued_fraction.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_continued_fraction.py
new file mode 100644
index 0000000000000000000000000000000000000000..4966b1c99ed092ba3578aa7ca15b5ba8ea4bc6a9
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_continued_fraction.py
@@ -0,0 +1,387 @@
+import numpy as np
+
+from scipy._lib._array_api import (
+    array_namespace, xp_ravel, xp_copy, xp_promote
+)
+import scipy._lib._elementwise_iterative_method as eim
+from scipy._lib._util import _RichResult
+from scipy import special
+
+# Todo:
+# Avoid special-casing key 'n' in _lib._elementwise_iterative_method::_check_termination
+# Rearrange termination condition to allow absolute and relative tolerances?
+# Interpret/return |f_n - f_{n-1}| as an error estimate?
+# Return gracefully for size=0 arrays
+
+def _logaddexp(x, y, xp=None):
+    # logaddexp that supports complex numbers
+    xp = array_namespace(x, y) if xp is None else xp
+    x, y = xp.broadcast_arrays(x, y)
+    xy = xp.stack((x, y), axis=0)
+    return special.logsumexp(xy, axis=0)
+
+
+def _continued_fraction_iv(a, b, args, tolerances, maxiter, log):
+    # Input validation for `_continued_fraction`
+
+    if not callable(a) or not callable(b):
+        raise ValueError('`a` and `b` must be callable.')
+
+    if not np.iterable(args):
+        args = (args,)
+
+    # Call each callable once to determine namespace and dtypes
+    a0, b0 = a(0, *args), b(0, *args)
+    xp = array_namespace(a0, b0, *args)
+    a0, b0, *args = xp_promote(a0, b0, *args, force_floating=True, broadcast=True,
+                               xp=xp)
+    shape, dtype = a0.shape, a0.dtype
+    a0, b0, *args = (xp_ravel(arg) for arg in (a0, b0) + tuple(args))
+
+    tolerances = {} if tolerances is None else tolerances
+    eps = tolerances.get('eps', None)
+    tiny = tolerances.get('tiny', None)
+
+    # tolerances are floats, not arrays, so it's OK to use NumPy
+    message = ('`eps` and `tiny` must be (or represent the logarithm of) '
+               'finite, positive, real scalars.')
+    tols = np.asarray([eps if eps is not None else 1,
+                       tiny if tiny is not None else 1])
+    not_real = (not np.issubdtype(tols.dtype, np.number)
+                or np.issubdtype(tols.dtype, np.complexfloating))
+    not_positive = np.any(tols <= 0) if not log else False
+    not_finite = not np.all(np.isfinite(tols))
+    not_scalar = tols.shape != (2,)
+    if not_real or not_positive or not_finite or not_scalar:
+        raise ValueError(message)
+
+    maxiter_int = int(maxiter)
+    if maxiter != maxiter_int or maxiter < 0:
+        raise ValueError('`maxiter` must be a non-negative integer.')
+
+    if not isinstance(log, bool):
+        raise ValueError('`log` must be boolean.')
+
+    return a, b, args, eps, tiny, maxiter, log, a0, b0, shape, dtype, xp
+
+
+def _continued_fraction(a, b, *, args=(), tolerances=None, maxiter=100, log=False):
+    r"""Evaluate a generalized continued fraction numerically.
+
+    `_continued_fraction` iteratively evaluates convergents of a continued fraction
+    given coefficients returned by callables `a` and `b`. Iteration terminates when
+    `maxiter` terms have been evaluated or a termination criterion controlled by
+    `tolerances` is satisfied, and the final convergent is returned as the ``f``
+    attribute of the result object.
+
+    This function works elementwise when `args` contains (broadcastable) arrays.
+
+    Parameters
+    ----------
+    a, b : callable
+        Functions that provide the *numerator* and *denominator* coefficients of
+        the continued fraction, respectively.
+
+        The signature of each must be::
+
+            a(n: int, *argsj) -> ndarray
+
+        where ``n`` is the coefficient number and ``argsj`` is a tuple, which may
+        contain an arbitrary number of arrays of any shape. `a` and `b` must be
+        elementwise functions: each scalar element ``a(n, *argsj)[i]`` must equal
+        ``a(n, *[argj[i] for argj in argsj])`` for valid indices ``i``.
+        `a` and `b` must not mutate the arrays in ``argsj``.
+
+        The result shape is the broadcasted shape of ``a(0, *args)`` and
+        ``b(0, *args)``. The dtype used throughout computation is the result dtype
+        of these terms if it is a float, and the default float of the array library
+        otherwise. The numerical value of ``a(0, *args)`` is ignored, and
+        the value of the leading term ``b(0, *args)`` is the so-called "integer"
+        part of the continued fraction (although it need not be integral).
+
+    args : tuple of array_like, optional
+        Additional positional *array* arguments to be passed to `a` and `b`. Arrays
+        must be broadcastable with one another. If the coefficient callables
+        require additional arguments that are not broadcastable with one
+        another, wrap them with callables `a` and `b` such that `a` and `b` accept
+        only ``n`` and broadcastable array arguments.
+    tolerances : dictionary of floats, optional
+        Tolerances and numerical thresholds used by the algorithm. Currently,
+        valid keys of the dictionary are:
+
+        - ``eps`` - the convergence threshold of Lentz' algorithm
+        - ``tiny`` - not strictly a "tolerance", but a very small positive number
+          used to avoid division by zero
+
+        The default `eps` is the precision of the appropriate dtype, and the default
+        `tiny` is the precision squared. [1]_ advises that ``eps`` is "as small as
+        you like", but for most purposes, it should not be set smaller than the default
+        because it may prevent convergence of the algorithm. [1]_ also advises that
+        ``tiny`` should be less than typical values of ``eps * b(n)``, so the default
+        is a good choice unless the :math:`b_n` are very small. See [1]_ for details.
+    maxiter : int, default: 100
+        The maximum number of iterations of the algorithm to perform.
+    log : bool, default: False
+        If True, `a` and `b` return the (natural) logarithm of the terms, `tolerances`
+        contains the logarithm of the tolerances, and the result object reports the
+        logarithm of the convergent.
+
+    Returns
+    -------
+    res : _RichResult
+        An object similar to an instance of `scipy.optimize.OptimizeResult` with the
+        following attributes. The descriptions are written as though the values will
+        be scalars; however, if `a` and `b` return arrays, the outputs will be
+        arrays of the same shape.
+
+        success : bool array
+            ``True`` where the algorithm terminated successfully (status ``0``);
+            ``False`` otherwise.
+        status : int array
+            An integer representing the exit status of the algorithm.
+
+            - ``0`` : The algorithm converged to the specified tolerances.
+            - ``-2`` : The maximum number of iterations was reached.
+            - ``-3`` : A non-finite value was encountered.
+
+        f : float array
+            The convergent which satisfied a termination criterion.
+        nit : int array
+            The number of iterations of the algorithm that were performed.
+        nfev : int array
+            The number of terms that were evaluated.
+
+    Notes
+    -----
+    A generalized continued fraction is an expression of the form
+
+    .. math::
+
+        b_0 + \frac{a_1}{b_1 + \frac{a_2}{b_2 + \frac{a_3}{b_3 + \cdots}}}
+
+    Successive "convergents" approximate the infinitely recursive continued fraction
+    with a finite number of terms :math:`a_n` and :math:`b_n`, which are provided
+    by callables `a` and `b`, respectively. This implementation follows the modified
+    Lentz algorithm ([1]_, [2]_) to evaluate successive convergents until a
+    termination condition is satisfied.
+
+    References
+    ----------
+    .. [1] Press, William H., and Saul A. Teukolsky. "Evaluating continued fractions
+           and computing exponential integrals." Computers in Physics 2.5 (1988): 88-89.
+    .. [2] Lentz's algorithm. Wikipedia.
+           https://en.wikipedia.org/wiki/Lentz%27s_algorithm
+    .. [3] Continued fraction. Wikipedia.
+           https://en.wikipedia.org/wiki/Continued_fraction
+    .. [4] Generalized continued fraction. Wikipedia.
+           https://en.wikipedia.org/wiki/Generalized_continued_fraction
+
+    Examples
+    --------
+    The "simple continued fraction" of :math:`\pi` is given in [3]_ as
+
+    .. math::
+
+        3 + \frac{1}{7 + \frac{1}{15 + \frac{1}{1 + \cdots}}}
+
+    where the :math:`b_n` terms follow no obvious pattern:
+
+    >>> b = [3, 7, 15, 1, 292, 1, 1, 1, 2, 1, 3, 1]
+
+    and the :math:`a_n` terms are all :math:`1`.
+    In this case, all the terms have been precomputed, so we call `_continued_fraction`
+    with simple callables which simply return the precomputed coefficients:
+
+    >>> import numpy as np
+    >>> from scipy.stats._continued_fraction import _continued_fraction
+    >>> res = _continued_fraction(a=lambda n: 1, b=lambda n: b[n], maxiter=len(b) - 1)
+    >>> (res.f - np.pi) / np.pi
+    np.float64(7.067899292141148e-15)
+
+    A generalized continued fraction for :math:`\pi` is given by:
+
+    .. math::
+
+        3 + \frac{1^2}{6 + \frac{3^2}{6 + \frac{5^2}{6 + \cdots}}}
+
+    We define the coefficient callables as:
+
+    >>> def a(n):
+    ...     return (2*n - 1)**2
+    >>>
+    >>> def b(n):
+    ...     if n == 0:
+    ...         return 3
+    ...     else:
+    ...         return 6
+
+    Then the continued fraction can be evaluated as:
+
+    >>> res = _continued_fraction(a, b)
+    >>> res
+         success: False
+          status: -2
+               f: 3.1415924109719846
+             nit: 100
+            nfev: 101
+
+    Note that the requested tolerance was not reached within the (default)
+    maximum number of iterations because it converges very slowly.
+    An expression that converges more rapidly is expressed as the difference
+    between two continued fractions. We will compute both of them in one
+    vectorized call to `_continued_fraction`.
+
+    >>> u, v = 5, 239
+    >>>
+    >>> def a(n, a1, _):
+    ...     # The shape of the output must be the shape of the arguments
+    ...     shape = a1.shape
+    ...     if n == 0:
+    ...         return np.zeros(shape)
+    ...     elif n == 1:
+    ...         return a1
+    ...     else:
+    ...         return np.full(shape, (n-1)**2)
+    >>>
+    >>> def b(n, _, uv):
+    ...     shape = uv.shape
+    ...     if  n == 0:
+    ...         return np.zeros(shape)
+    ...     return np.full(shape, (2*n - 1)*uv)
+    >>>
+    >>> res = _continued_fraction(a, b, args=([16, 4], [u, v]))
+    >>> res
+         success: [ True  True]
+          status: [0 0]
+               f: [ 3.158e+00  1.674e-02]
+             nit: [10  4]
+            nfev: [11  5]
+
+    Note that the second term converged in fewer than half the number of iterations
+    as the first. The approximation of :math:`\pi` is the difference between the two:
+
+    >>> pi = res.f[0] - res.f[1]
+    >>> (pi - np.pi) / np.pi
+    np.float64(2.8271597168564594e-16)
+
+    If it is more efficient to compute the :math:`a_n` and :math:`b_n` terms together,
+    consider instantiating a class with a method that computes both terms and stores
+    the results in an attribute. Separate methods of the class retrieve the
+    coefficients, and these methods are passed to `_continued_fraction` as arguments
+    `a` and `b`. Similarly, if the coefficients can be computed recursively in terms of
+    previous coefficients, use a class to maintain state between callable evaluations.
+
+    """
+
+    res = _continued_fraction_iv(a, b, args, tolerances, maxiter, log)
+    a, b, args, eps, tiny, maxiter, log, a0, b0, shape, dtype, xp = res
+    callback = None  # don't want to test it, but easy to add later
+
+    # The EIM framework was designed for the case where there would
+    # be only one callable, and all arguments of the callable would be
+    # arrays. We're going a bit beyond that here, since we have two callables,
+    # and the first argument is an integer (the number of the term). Rather
+    # than complicate the framework, we wrap the user-provided callables to
+    # make this problem fit within the existing framework.
+
+    def a(n, *args, a=a):
+        n = int(xp.real(xp_ravel(n))[0])
+        return a(n, *args)
+
+    def b(n, *args, b=b):
+        n = int(xp.real(xp_ravel(n))[0])
+        return b(n, *args)
+
+    def func(n, *args):
+        return xp.stack((a(n, *args), b(n, *args)), axis=-1)
+
+    status = xp.full_like(a0, eim._EINPROGRESS, dtype=xp.int32)  # in progress
+    nit, nfev = 0, 1  # one function evaluation (per function) performed above
+    maxiter = 100 if maxiter is None else maxiter
+
+    # Quotations describing the algorithm are from [1]_
+    # "... as small as you like, say eps"
+    if eps is None:
+        eps = xp.finfo(dtype).eps if not log else np.log(xp.finfo(dtype).eps)
+
+    # "The parameter tiny should be less than typical values of eps |b_n|"
+    if tiny is None:
+        tiny = xp.finfo(dtype).eps**2 if not log else 2*np.log(xp.finfo(dtype).eps)
+
+    # "Set f0 and C0 to the value b0 or to tiny if b0=0. Set D0 = 0."
+    zero = -xp.inf if log else 0
+    fn = xp.where(b0 == zero, tiny, b0)
+    Cnm1 = xp_copy(fn)
+    Dnm1 = xp.full_like(fn, zero)
+
+    CnDn = xp.full_like(fn, xp.inf)
+
+    work = _RichResult(n=0, fn=fn, Cnm1=Cnm1, Dnm1=Dnm1, CnDn=CnDn,
+                       eps=eps, tiny=tiny,
+                       nit=nit, nfev=nfev, status=status)
+    res_work_pairs = [('status', 'status'), ('f', 'fn'),
+                      ('nit', 'nit'), ('nfev', 'nfev')]
+
+    def pre_func_eval(work):
+        work.n = xp.reshape(xp.asarray(work.n + 1), (-1,))
+        return work.n
+
+    def post_func_eval(n, ab, work):
+        an, bn = ab[..., 0], ab[..., 1]
+
+        zero = 0 if not log else -xp.inf
+
+        # "Set D_n = 1/(b_n + a_n D_{n-1}) or 1/tiny, if the denominator vanishes"
+        denominator = (bn + an*work.Dnm1 if not log
+                       else _logaddexp(bn, an + work.Dnm1, xp=xp))
+        denominator[denominator == zero] = tiny
+        Dn = (1/denominator if not log
+              else -denominator)
+
+        # "Set C_n = b_n + a_n / C_{n-1} (or =tiny, if the expression vanishes)"
+        Cn = (bn + an / work.Cnm1 if not log
+              else _logaddexp(bn, an - work.Cnm1, xp=xp))
+        Cn[Cn == zero] = tiny
+
+        # "and set f_n = f_{n-1} C_n D_n"
+        work.CnDn = (Cn * Dn if not log
+                     else Cn + Dn)
+        work.fn = (work.fn * work.CnDn if not log
+                   else work.fn + work.CnDn)
+
+
+        work.Cnm1, work.Dnm1 = Cn, Dn
+
+    def check_termination(work):
+        # Check for all terminal conditions and record statuses.
+        stop = xp.zeros_like(work.CnDn, dtype=xp.bool)
+
+        # "You quit when |D_n C_n - 1| is as small as you like, say eps"
+        pij = xp.full_like(work.CnDn, xp.pi*1j) if log else None
+        residual = (xp.abs(work.CnDn - 1) if not log
+                    else xp.real(_logaddexp(work.CnDn, pij, xp=xp)))
+        i = residual < work.eps
+        work.status[i] = eim._ECONVERGED
+        stop[i] = True
+
+        # Report failure where fn is not finite (-inf is allowed if `log`).
+        i = (~xp.isfinite(work.fn) if not log
+             else ~(xp.isfinite(work.fn) | (work.fn == -xp.inf)))
+        work.status[i] = eim._EVALUEERR
+        stop[i] = True
+
+        return stop
+
+    def post_termination_check(work):
+        pass
+
+    def customize_result(res, shape):
+        # Only needed pre-NEP 50
+        res['f'] = xp.asarray(res['f'], dtype=dtype)
+        res['f'] = res['f'][()] if res['f'].ndim == 0 else res['f']
+        return shape
+
+    return eim._loop(work, callback, shape, maxiter, func, args, dtype,
+                     pre_func_eval, post_func_eval, check_termination,
+                     post_termination_check, customize_result, res_work_pairs,
+                     xp=xp)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_continuous_distns.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_continuous_distns.py
new file mode 100644
index 0000000000000000000000000000000000000000..805b4b6e020937f7e2809addaa3a32decae07613
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_continuous_distns.py
@@ -0,0 +1,12543 @@
+#
+# Author:  Travis Oliphant  2002-2011 with contributions from
+#          SciPy Developers 2004-2011
+#
+import warnings
+from collections.abc import Iterable
+from functools import wraps, cached_property
+import ctypes
+import operator
+
+import numpy as np
+from numpy.polynomial import Polynomial
+from scipy.interpolate import BSpline
+from scipy._lib.doccer import (extend_notes_in_docstring,
+                               replace_notes_in_docstring,
+                               inherit_docstring_from)
+from scipy._lib._ccallback import LowLevelCallable
+from scipy import optimize
+from scipy import integrate
+import scipy.special as sc
+
+import scipy.special._ufuncs as scu
+from scipy._lib._util import _lazyselect
+import scipy._lib.array_api_extra as xpx
+from scipy._lib._array_api import xp_promote
+
+from . import _stats
+from ._tukeylambda_stats import (tukeylambda_variance as _tlvar,
+                                 tukeylambda_kurtosis as _tlkurt)
+from ._distn_infrastructure import (_vectorize_rvs_over_shapes,
+    get_distribution_names, _kurtosis, _isintegral,
+    rv_continuous, _skew, _get_fixed_fit_value, _check_shape, _ShapeInfo)
+from ._ksstats import kolmogn, kolmognp, kolmogni
+from ._constants import (_XMIN, _LOGXMIN, _EULER, _ZETA3, _SQRT_PI,
+                         _SQRT_2_OVER_PI, _LOG_PI, _LOG_SQRT_2_OVER_PI)
+from ._censored_data import CensoredData
+from scipy.optimize import root_scalar
+from scipy.stats._warnings_errors import FitError
+import scipy.stats as stats
+
+def _remove_optimizer_parameters(kwds):
+    """
+    Strip the keywords consumed by the generic fitting machinery.
+
+    The entries 'loc', 'scale', 'optimizer' and 'method' are discarded from
+    `kwds` when present.  Anything that is left over is treated as an error
+    and reported through a `TypeError` that shows the leftover entries.
+
+    Distributions whose ``fit`` override bypasses the default optimization
+    code call this to validate their keyword arguments.
+
+    `kwds` is modified in-place.
+    """
+    for ignored in ('loc', 'scale', 'optimizer', 'method'):
+        kwds.pop(ignored, None)
+    if not kwds:
+        return
+    raise TypeError(f"Unknown arguments: {kwds}.")
+
+
+def _call_super_mom(fun):
+    # Decorator for `fit` overrides that only implement maximum likelihood
+    # estimation on uncensored data.  Requests the override cannot serve
+    # (method == 'mm', or data that really contains censored values) are
+    # forwarded to the `fit` of the parent class instead.
+    @wraps(fun)
+    def wrapper(self, data, *args, **kwds):
+        is_censored_type = isinstance(data, CensoredData)
+        wants_mm = kwds.get('method', 'mle').lower() == 'mm'
+        if wants_mm or (is_censored_type and data.num_censored() > 0):
+            return super(type(self), self).fit(data, *args, **kwds)
+        if is_censored_type:
+            # A CensoredData instance without any censored values: hand the
+            # override the plain array of uncensored observations.
+            data = data._uncensored
+        return fun(self, data, *args, **kwds)
+
+    return wrapper
+
+
+def _get_left_bracket(fun, rbrack, lbrack=None):
+    """Find a left bracket for `root_scalar`.
+
+    Starting from an initial interval ``[lbrack, rbrack]``, the width of the
+    interval is doubled (moving only the left end) until `fun` has different
+    signs at the two ends.
+
+    Parameters
+    ----------
+    fun : callable
+        Scalar function whose root is to be bracketed.
+    rbrack : float
+        Right end of the bracket; it is never moved.
+    lbrack : float, optional
+        Initial guess for the left end.  Defaults to ``rbrack - 1``.
+
+    Returns
+    -------
+    lbrack : float
+        A left end such that ``fun(lbrack)`` and ``fun(rbrack)`` differ in
+        sign.
+
+    Raises
+    ------
+    FitSolverError
+        If the left end becomes infinite before a sign change is found.
+    """
+    # Test against None explicitly: a guess of 0 is a legitimate left bracket
+    # and must not be replaced by the default.
+    if lbrack is None:
+        lbrack = rbrack - 1
+    diff = rbrack - lbrack
+    if diff == 0:
+        # A zero-width interval cannot grow by doubling, which would make the
+        # loop below spin forever; fall back to the default unit width.
+        diff = 1
+        lbrack = rbrack - diff
+
+    msg = ("The solver could not find a bracket containing a "
+           "root to an MLE first order condition.")
+
+    # if there is no sign change in `fun` between the brackets, expand
+    # rbrack - lbrack until a sign change occurs
+    def interval_contains_root(lbrack, rbrack):
+        # return true if the signs disagree.
+        return np.sign(fun(lbrack)) != np.sign(fun(rbrack))
+
+    while not interval_contains_root(lbrack, rbrack):
+        diff *= 2
+        lbrack = rbrack - diff
+
+        if np.isinf(lbrack):
+            raise FitSolverError(msg)
+
+    return lbrack
+
+
+class ksone_gen(rv_continuous):
+    r"""Kolmogorov-Smirnov one-sided test statistic distribution.
+
+    This is the distribution of the one-sided Kolmogorov-Smirnov (KS)
+    statistics :math:`D_n^+` and :math:`D_n^-`
+    for a finite sample size ``n >= 1`` (the shape parameter).
+
+    %(before_notes)s
+
+    See Also
+    --------
+    kstwobign, kstwo, kstest
+
+    Notes
+    -----
+    :math:`D_n^+` and :math:`D_n^-` are given by
+
+    .. math::
+
+        D_n^+ &= \text{sup}_x (F_n(x) - F(x)),\\
+        D_n^- &= \text{sup}_x (F(x) - F_n(x)),\\
+
+    where :math:`F` is a continuous CDF and :math:`F_n` is an empirical CDF.
+    `ksone` describes the distribution under the null hypothesis of the KS test
+    that the empirical CDF corresponds to :math:`n` i.i.d. random variates
+    with CDF :math:`F`.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] Birnbaum, Z. W. and Tingey, F.H. "One-sided confidence contours
+       for probability distribution functions", The Annals of Mathematical
+       Statistics, 22(4), pp 592-596 (1951).
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import ksone
+    >>> import matplotlib.pyplot as plt
+    >>> fig, ax = plt.subplots(1, 1)
+
+    Display the probability density function (``pdf``):
+
+    >>> n = 1e+03
+    >>> x = np.linspace(ksone.ppf(0.01, n),
+    ...                 ksone.ppf(0.99, n), 100)
+    >>> ax.plot(x, ksone.pdf(x, n),
+    ...         'r-', lw=5, alpha=0.6, label='ksone pdf')
+
+    Alternatively, the distribution object can be called (as a function)
+    to fix the shape, location and scale parameters. This returns a "frozen"
+    RV object holding the given parameters fixed.
+
+    Freeze the distribution and display the frozen ``pdf``:
+
+    >>> rv = ksone(n)
+    >>> ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')
+    >>> ax.legend(loc='best', frameon=False)
+    >>> plt.show()
+
+    Check accuracy of ``cdf`` and ``ppf``:
+
+    >>> vals = ksone.ppf([0.001, 0.5, 0.999], n)
+    >>> np.allclose([0.001, 0.5, 0.999], ksone.cdf(vals, n))
+    True
+
+    """
+    def _argcheck(self, n):
+        # The sample size must be a whole number that is at least one.
+        is_whole = (n == np.round(n))
+        return (n >= 1) & is_whole
+
+    def _shape_info(self):
+        n_info = _ShapeInfo("n", True, (1, np.inf), (True, False))
+        return [n_info]
+
+    def _pdf(self, x, n):
+        # `_smirnovp` is negated to obtain the density (presumably it is the
+        # derivative of the survival function `smirnov`).
+        slope = scu._smirnovp(n, x)
+        return -slope
+
+    def _cdf(self, x, n):
+        cdf_values = scu._smirnovc(n, x)
+        return cdf_values
+
+    def _sf(self, x, n):
+        sf_values = sc.smirnov(n, x)
+        return sf_values
+
+    def _ppf(self, q, n):
+        quantiles = scu._smirnovci(n, q)
+        return quantiles
+
+    def _isf(self, q, n):
+        quantiles = sc.smirnovi(n, q)
+        return quantiles
+
+
+ksone = ksone_gen(a=0.0, b=1.0, name='ksone')
+
+
+class kstwo_gen(rv_continuous):
+    r"""Kolmogorov-Smirnov two-sided test statistic distribution.
+
+    This is the distribution of the two-sided Kolmogorov-Smirnov (KS)
+    statistic :math:`D_n` for a finite sample size ``n >= 1``
+    (the shape parameter).
+
+    %(before_notes)s
+
+    See Also
+    --------
+    kstwobign, ksone, kstest
+
+    Notes
+    -----
+    :math:`D_n` is given by
+
+    .. math::
+
+        D_n = \text{sup}_x |F_n(x) - F(x)|
+
+    where :math:`F` is a (continuous) CDF and :math:`F_n` is an empirical CDF.
+    `kstwo` describes the distribution under the null hypothesis of the KS test
+    that the empirical CDF corresponds to :math:`n` i.i.d. random variates
+    with CDF :math:`F`.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] Simard, R., L'Ecuyer, P. "Computing the Two-Sided
+       Kolmogorov-Smirnov Distribution",  Journal of Statistical Software,
+       Vol 39, 11, 1-18 (2011).
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import kstwo
+    >>> import matplotlib.pyplot as plt
+    >>> fig, ax = plt.subplots(1, 1)
+
+    Display the probability density function (``pdf``):
+
+    >>> n = 10
+    >>> x = np.linspace(kstwo.ppf(0.01, n),
+    ...                 kstwo.ppf(0.99, n), 100)
+    >>> ax.plot(x, kstwo.pdf(x, n),
+    ...         'r-', lw=5, alpha=0.6, label='kstwo pdf')
+
+    Alternatively, the distribution object can be called (as a function)
+    to fix the shape, location and scale parameters. This returns a "frozen"
+    RV object holding the given parameters fixed.
+
+    Freeze the distribution and display the frozen ``pdf``:
+
+    >>> rv = kstwo(n)
+    >>> ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')
+    >>> ax.legend(loc='best', frameon=False)
+    >>> plt.show()
+
+    Check accuracy of ``cdf`` and ``ppf``:
+
+    >>> vals = kstwo.ppf([0.001, 0.5, 0.999], n)
+    >>> np.allclose([0.001, 0.5, 0.999], kstwo.cdf(vals, n))
+    True
+
+    """
+    def _argcheck(self, n):
+        # The sample size must be a whole number that is at least one.
+        is_whole = (n == np.round(n))
+        return (n >= 1) & is_whole
+
+    def _shape_info(self):
+        n_info = _ShapeInfo("n", True, (1, np.inf), (True, False))
+        return [n_info]
+
+    def _get_support(self, n):
+        # The lower end of the support is 0.5/n; iterables are converted to
+        # arrays first so that the division is element-wise.
+        denom = np.asanyarray(n) if isinstance(n, Iterable) else n
+        lower = 0.5/denom
+        return (lower, 1.0)
+
+    def _pdf(self, x, n):
+        density = kolmognp(n, x)
+        return density
+
+    def _cdf(self, x, n):
+        cdf_values = kolmogn(n, x)
+        return cdf_values
+
+    def _sf(self, x, n):
+        # Same routine as the CDF, asked for the complementary probability.
+        sf_values = kolmogn(n, x, cdf=False)
+        return sf_values
+
+    def _ppf(self, q, n):
+        quantiles = kolmogni(n, q, cdf=True)
+        return quantiles
+
+    def _isf(self, q, n):
+        quantiles = kolmogni(n, q, cdf=False)
+        return quantiles
+
+
+# Use the pdf, (not the ppf) to compute moments
+kstwo = kstwo_gen(momtype=0, a=0.0, b=1.0, name='kstwo')
+
+
+class kstwobign_gen(rv_continuous):
+    r"""Limiting distribution of scaled Kolmogorov-Smirnov two-sided test statistic.
+
+    This is the asymptotic distribution of the two-sided Kolmogorov-Smirnov
+    statistic :math:`\sqrt{n} D_n` that measures the maximum absolute
+    distance of the theoretical (continuous) CDF from the empirical CDF.
+    (see `kstest`).
+
+    %(before_notes)s
+
+    See Also
+    --------
+    ksone, kstwo, kstest
+
+    Notes
+    -----
+    :math:`\sqrt{n} D_n` is given by
+
+    .. math::
+
+        D_n = \text{sup}_x |F_n(x) - F(x)|
+
+    where :math:`F` is a continuous CDF and :math:`F_n` is an empirical CDF.
+    `kstwobign`  describes the asymptotic distribution (i.e. the limit of
+    :math:`\sqrt{n} D_n`) under the null hypothesis of the KS test that the
+    empirical CDF corresponds to i.i.d. random variates with CDF :math:`F`.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] Feller, W. "On the Kolmogorov-Smirnov Limit Theorems for Empirical
+       Distributions",  Ann. Math. Statist. Vol 19, 177-189 (1948).
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        # The distribution has no shape parameters.
+        return []
+
+    def _pdf(self, x):
+        # `_kolmogp` is negated to obtain the density (presumably it is the
+        # derivative of the survival function `kolmogorov`).
+        slope = scu._kolmogp(x)
+        return -slope
+
+    def _cdf(self, x):
+        cdf_values = scu._kolmogc(x)
+        return cdf_values
+
+    def _sf(self, x):
+        sf_values = sc.kolmogorov(x)
+        return sf_values
+
+    def _ppf(self, q):
+        quantiles = scu._kolmogci(q)
+        return quantiles
+
+    def _isf(self, q):
+        quantiles = sc.kolmogi(q)
+        return quantiles
+
+
+kstwobign = kstwobign_gen(a=0.0, name='kstwobign')
+
+
+## Normal distribution
+
+# loc = mu, scale = std
+# Keep these implementations out of the class definition so they can be reused
+# by other distributions.
+_norm_pdf_C = np.sqrt(2*np.pi)
+_norm_pdf_logC = np.log(_norm_pdf_C)
+
+
+def _norm_pdf(x):
+    """Density of the standard normal distribution at `x`."""
+    exponent = -x**2/2.0
+    return np.exp(exponent) / _norm_pdf_C
+
+
+def _norm_logpdf(x):
+    """Logarithm of the standard normal density at `x`."""
+    log_kernel = -x**2 / 2.0
+    return log_kernel - _norm_pdf_logC
+
+
+def _norm_cdf(x):
+    """Standard normal CDF, delegated to `scipy.special.ndtr`."""
+    cdf_values = sc.ndtr(x)
+    return cdf_values
+
+
+def _norm_logcdf(x):
+    """Logarithm of the standard normal CDF via `scipy.special.log_ndtr`."""
+    log_cdf_values = sc.log_ndtr(x)
+    return log_cdf_values
+
+
+def _norm_ppf(q):
+    """Standard normal quantile function via `scipy.special.ndtri`."""
+    quantiles = sc.ndtri(q)
+    return quantiles
+
+
+def _norm_sf(x):
+    """Survival function, evaluated as the CDF at the mirrored point `-x`."""
+    return sc.ndtr(-x)
+
+
+def _norm_logsf(x):
+    """Log survival function, evaluated as the log CDF at `-x`."""
+    return sc.log_ndtr(-x)
+
+
+def _norm_isf(q):
+    """Inverse survival function, the negated quantile function."""
+    return -sc.ndtri(q)
+
+
+class norm_gen(rv_continuous):
+    r"""A normal continuous random variable.
+
+    The location (``loc``) keyword specifies the mean.
+    The scale (``scale``) keyword specifies the standard deviation.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `norm` is:
+
+    .. math::
+
+        f(x) = \frac{\exp(-x^2/2)}{\sqrt{2\pi}}
+
+    for a real number :math:`x`.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        # The distribution has no shape parameters.
+        return []
+
+    def _rvs(self, size=None, random_state=None):
+        draws = random_state.standard_normal(size)
+        return draws
+
+    # The evaluation methods below all delegate to the module-level
+    # `_norm_*` helpers, which other distributions reuse as well.
+
+    def _pdf(self, x):
+        # norm.pdf(x) = exp(-x**2/2)/sqrt(2*pi)
+        return _norm_pdf(x)
+
+    def _logpdf(self, x):
+        return _norm_logpdf(x)
+
+    def _cdf(self, x):
+        return _norm_cdf(x)
+
+    def _logcdf(self, x):
+        return _norm_logcdf(x)
+
+    def _sf(self, x):
+        return _norm_sf(x)
+
+    def _logsf(self, x):
+        return _norm_logsf(x)
+
+    def _ppf(self, q):
+        return _norm_ppf(q)
+
+    def _isf(self, q):
+        return _norm_isf(q)
+
+    def _stats(self):
+        # mean, variance, skewness, excess kurtosis
+        mean, variance, skewness, kurtosis = 0.0, 1.0, 0.0, 0.0
+        return mean, variance, skewness, kurtosis
+
+    def _entropy(self):
+        return 0.5*(np.log(2*np.pi)+1)
+
+    @_call_super_mom
+    @replace_notes_in_docstring(rv_continuous, notes="""\
+        For the normal distribution, method of moments and maximum likelihood
+        estimation give identical fits, and explicit formulas for the estimates
+        are available.
+        This function uses these explicit formulas for the maximum likelihood
+        estimation of the normal distribution parameters, so the
+        `optimizer` and `method` arguments are ignored.\n\n""")
+    def fit(self, data, **kwds):
+        fixed_loc = kwds.pop('floc', None)
+        fixed_scale = kwds.pop('fscale', None)
+
+        # Whatever is left must be one of the ignored optimizer keywords.
+        _remove_optimizer_parameters(kwds)
+
+        if not (fixed_loc is None or fixed_scale is None):
+            # Mirror `rv_continuous.fit`: with both parameters fixed there is
+            # nothing to estimate, so refuse rather than echo the inputs.
+            raise ValueError("All parameters fixed. There is nothing to "
+                             "optimize.")
+
+        data = np.asarray(data)
+        if not np.isfinite(data).all():
+            raise ValueError("The data contains non-finite values.")
+
+        # Closed-form estimates: the sample mean and the root mean squared
+        # deviation about the (estimated or fixed) location.
+        loc = data.mean() if fixed_loc is None else fixed_loc
+        if fixed_scale is None:
+            scale = np.sqrt(((data - loc)**2).mean())
+        else:
+            scale = fixed_scale
+
+        return loc, scale
+
+    def _munp(self, n):
+        """
+        Non-central moment of order `n` of the standard normal distribution,
+        for integer n >= 0.
+
+        See eq. 16 of https://arxiv.org/abs/1209.4340v2
+        """
+        if n == 0:
+            return 1.
+        if n % 2 != 0:
+            # odd moments vanish
+            return 0.
+        return sc.factorial2(int(n) - 1)
+
+
+norm = norm_gen(name='norm')
+
+
+class alpha_gen(rv_continuous):
+    r"""An alpha continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `alpha` ([1]_, [2]_) is:
+
+    .. math::
+
+        f(x, a) = \frac{1}{x^2 \Phi(a) \sqrt{2\pi}} *
+                  \exp(-\frac{1}{2} (a-1/x)^2)
+
+    where :math:`\Phi` is the normal CDF, :math:`x > 0`, and :math:`a > 0`.
+
+    `alpha` takes ``a`` as a shape parameter.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] Johnson, Kotz, and Balakrishnan, "Continuous Univariate
+           Distributions, Volume 1", Second Edition, John Wiley and Sons,
+           p. 173 (1994).
+    .. [2] Anthony A. Salvia, "Reliability applications of the Alpha
+           Distribution", IEEE Transactions on Reliability, Vol. R-34,
+           No. 3, pp. 251-252 (1985).
+
+    %(example)s
+
+    """
+    _support_mask = rv_continuous._open_support_mask
+
+    def _shape_info(self):
+        a_info = _ShapeInfo("a", False, (0, np.inf), (False, False))
+        return [a_info]
+
+    def _pdf(self, x, a):
+        # alpha.pdf(x, a) = 1/(x**2*Phi(a)*sqrt(2*pi)) * exp(-1/2 * (a-1/x)**2)
+        inv_x_sq = 1.0/(x**2)
+        return inv_x_sq/_norm_cdf(a)*_norm_pdf(a-1.0/x)
+
+    def _logpdf(self, x, a):
+        # Term-by-term logarithm of the density above.
+        log_inv_x_sq = -2*np.log(x)
+        log_kernel = _norm_logpdf(a-1.0/x)
+        log_normalizer = np.log(_norm_cdf(a))
+        return log_inv_x_sq + log_kernel - log_normalizer
+
+    def _cdf(self, x, a):
+        shifted = _norm_cdf(a-1.0/x)
+        return shifted / _norm_cdf(a)
+
+    def _ppf(self, q, a):
+        # Inverts the CDF expression used in `_cdf`.
+        denom = np.asarray(a - _norm_ppf(q*_norm_cdf(a)))
+        return 1.0/denom
+
+    def _stats(self, a):
+        # mean and variance are reported as infinite, skewness and kurtosis
+        # as undefined
+        return [np.inf, np.inf, np.nan, np.nan]
+
+
+alpha = alpha_gen(a=0.0, name='alpha')
+
+
+class anglit_gen(rv_continuous):
+    r"""An anglit continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `anglit` is:
+
+    .. math::
+
+        f(x) = \sin(2x + \pi/2) = \cos(2x)
+
+    for :math:`-\pi/4 \le x \le \pi/4`.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        # The distribution has no shape parameters.
+        return []
+
+    def _pdf(self, x):
+        # anglit.pdf(x) = sin(2*x + \pi/2) = cos(2*x)
+        doubled = 2*x
+        return np.cos(doubled)
+
+    def _cdf(self, x):
+        sine = np.sin(x+np.pi/4)
+        return sine**2.0
+
+    def _sf(self, x):
+        # Complement of `_cdf`: cos**2 in place of sin**2.
+        cosine = np.cos(x + np.pi / 4)
+        return cosine ** 2.0
+
+    def _ppf(self, q):
+        # Inverse of `_cdf`.
+        angle = np.arcsin(np.sqrt(q))
+        return angle-np.pi/4
+
+    def _stats(self):
+        # mean, variance, skewness, excess kurtosis
+        pi_sq = np.pi*np.pi
+        variance = pi_sq/16-0.5
+        kurtosis = -2*(np.pi**4 - 96)/(pi_sq-8)**2
+        return 0.0, variance, 0.0, kurtosis
+
+    def _entropy(self):
+        return 1-np.log(2)
+
+
+anglit = anglit_gen(a=-np.pi/4, b=np.pi/4, name='anglit')
+
+
+class arcsine_gen(rv_continuous):
+    r"""An arcsine continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `arcsine` is:
+
+    .. math::
+
+        f(x) = \frac{1}{\pi \sqrt{x (1-x)}}
+
+    for :math:`0 < x < 1`.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        # The distribution has no shape parameters.
+        return []
+
+    def _pdf(self, x):
+        # arcsine.pdf(x) = 1/(pi*sqrt(x*(1-x)))
+        # Division-by-zero warnings are silenced for the duration of the
+        # evaluation.
+        with np.errstate(divide='ignore'):
+            root = np.sqrt(x*(1-x))
+            return 1.0/np.pi/root
+
+    def _cdf(self, x):
+        angle = np.arcsin(np.sqrt(x))
+        return 2.0/np.pi*angle
+
+    def _ppf(self, q):
+        # Inverse of `_cdf`.
+        sine = np.sin(np.pi/2.0*q)
+        return sine**2.0
+
+    def _stats(self):
+        # mean, variance, skewness, excess kurtosis
+        return 0.5, 1.0/8, 0, -3.0/2.0
+
+    def _entropy(self):
+        # Hard-coded constant; numerically this equals log(pi/4).
+        return -0.24156447527049044468
+
+
+arcsine = arcsine_gen(a=0.0, b=1.0, name='arcsine')
+
+
+class FitDataError(ValueError):
+    """Raised when input data is inconsistent with fixed parameters."""
+    # Example: beta_gen.fit raises this when floc and fscale are both fixed
+    # and some data value lies outside the open interval
+    # (floc, floc+fscale).
+    def __init__(self, distr, lower, upper):
+        # The message is stored directly in `args`, as a one-element tuple.
+        message = (
+            "Invalid values in `data`.  Maximum likelihood "
+            f"estimation with {distr!r} requires that {lower!r} < "
+            f"(x - loc)/scale  < {upper!r} for each x in `data`."
+        )
+        self.args = (message,)
+
+
+class FitSolverError(FitError):
+    """
+    Raised when a solver fails to converge while fitting a distribution.
+    """
+    # Example: beta_gen.fit raises this when optimize.fsolve returns with
+    # ier != 1.
+    def __init__(self, mesg):
+        # Newlines are stripped from the solver message so that the final
+        # text stays on a single line.
+        prefix = "Solver for the MLE equations failed to converge: "
+        self.args = (prefix + mesg.replace('\n', ''),)
+
+
+def _beta_mle_a(a, b, n, s1):
+    # Score equation for the shape `a` of the beta distribution when `b` is
+    # known: the MLE of `a` is a zero of this function.  `n` is the number
+    # of data points and `s1` the sum of the logs of the data.
+    digamma_gap = -sc.psi(a + b) + sc.psi(a)
+    return s1 - n * digamma_gap
+
+
+def _beta_mle_ab(theta, n, s1, s2):
+    # Score equations for both beta shape parameters.  `theta` holds
+    # (a, b); a zero of the returned two-element list is a critical point
+    # of the likelihood, so solving for it yields the MLE of a and b.
+    # `n` is the number of data points, `s1` the sum of the logs of the
+    # data and `s2` the sum of the logs of 1 - data.
+    shape_a, shape_b = theta
+    psi_sum = sc.psi(shape_a + shape_b)
+    eq_a = s1 - n * (-psi_sum + sc.psi(shape_a))
+    eq_b = s2 - n * (-psi_sum + sc.psi(shape_b))
+    return [eq_a, eq_b]
+
+
+class beta_gen(rv_continuous):
+    r"""A beta continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `beta` is:
+
+    .. math::
+
+        f(x, a, b) = \frac{\Gamma(a+b) x^{a-1} (1-x)^{b-1}}
+                          {\Gamma(a) \Gamma(b)}
+
+    for :math:`0 <= x <= 1`, :math:`a > 0`, :math:`b > 0`, where
+    :math:`\Gamma` is the gamma function (`scipy.special.gamma`).
+
+    `beta` takes :math:`a` and :math:`b` as shape parameters.
+
+    This distribution uses routines from the Boost Math C++ library for
+    the computation of the ``pdf``, ``cdf``, ``ppf``, ``sf`` and ``isf``
+    methods. [1]_
+
+    Maximum likelihood estimates of parameters are only available when the location and
+    scale are fixed. When either of these parameters is free, ``beta.fit`` resorts to
+    numerical optimization, but this problem is unbounded: the location and scale may be
+    chosen to make the minimum and maximum elements of the data coincide with the
+    endpoints of the support, and the shape parameters may be chosen to make the PDF at
+    these points infinite. For best results, pass ``floc`` and ``fscale`` keyword
+    arguments to fix the location and scale, or use `scipy.stats.fit` with
+    ``method='mse'``.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        # Both shape parameters are real-valued and restricted to the open
+        # interval (0, inf).
+        ia = _ShapeInfo("a", False, (0, np.inf), (False, False))
+        ib = _ShapeInfo("b", False, (0, np.inf), (False, False))
+        return [ia, ib]
+
+    def _rvs(self, a, b, size=None, random_state=None):
+        # Delegate sampling to the generator's own beta sampler.
+        return random_state.beta(a, b, size)
+
+    def _pdf(self, x, a, b):
+        #                     gamma(a+b) * x**(a-1) * (1-x)**(b-1)
+        # beta.pdf(x, a, b) = ------------------------------------
+        #                              gamma(a)*gamma(b)
+        # Overflow warnings raised during the evaluation are suppressed.
+        with np.errstate(over='ignore'):
+            return scu._beta_pdf(x, a, b)
+
+    def _logpdf(self, x, a, b):
+        # (b-1)*log1p(-x) + (a-1)*log(x) - log(B(a, b)), written with
+        # `xlog1py`/`xlogy` for the two product terms.
+        lPx = sc.xlog1py(b - 1.0, -x) + sc.xlogy(a - 1.0, x)
+        lPx -= sc.betaln(a, b)
+        return lPx
+
+    def _cdf(self, x, a, b):
+        return sc.betainc(a, b, x)
+
+    def _sf(self, x, a, b):
+        return sc.betaincc(a, b, x)
+
+    def _isf(self, x, a, b):
+        # Here `x` is a survival probability, not a point of the support.
+        return sc.betainccinv(a, b, x)
+
+    def _ppf(self, q, a, b):
+        return scu._beta_ppf(q, a, b)
+
+    def _stats(self, a, b):
+        # Closed-form mean, variance, skewness and excess kurtosis.
+        a_plus_b = a + b
+        _beta_mean = a/a_plus_b
+        _beta_variance = a*b / (a_plus_b**2 * (a_plus_b + 1))
+        _beta_skewness = ((2 * (b - a) * np.sqrt(a_plus_b + 1)) /
+                          ((a_plus_b + 2) * np.sqrt(a * b)))
+        # The excess kurtosis is assembled from its numerator and
+        # denominator.
+        _beta_kurtosis_excess_n = 6 * ((a - b)**2 * (a_plus_b + 1) -
+                                       a * b * (a_plus_b + 2))
+        _beta_kurtosis_excess_d = a * b * (a_plus_b + 2) * (a_plus_b + 3)
+        _beta_kurtosis_excess = _beta_kurtosis_excess_n / _beta_kurtosis_excess_d
+        return (
+            _beta_mean,
+            _beta_variance,
+            _beta_skewness,
+            _beta_kurtosis_excess)
+
+    def _fitstart(self, data):
+        # Starting values for the generic fit: pick the shapes (a, b) whose
+        # theoretical skewness and excess kurtosis match those of the sample.
+        if isinstance(data, CensoredData):
+            data = data._uncensor()
+
+        g1 = _skew(data)
+        g2 = _kurtosis(data)
+
+        def func(x):
+            # Residuals between the theoretical and the sample skewness and
+            # kurtosis for the candidate shapes x = (a, b).
+            a, b = x
+            sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)
+            ku = a**3 - a**2*(2*b-1) + b**2*(b+1) - 2*a*b*(b+2)
+            ku /= a*b*(a+b+2)*(a+b+3)
+            ku *= 6
+            return [sk-g1, ku-g2]
+        # Solve the two equations starting from a = b = 1.
+        a, b = optimize.fsolve(func, (1.0, 1.0))
+        return super()._fitstart(data, args=(a, b))
+
+    @_call_super_mom
+    @extend_notes_in_docstring(rv_continuous, notes="""\
+        In the special case where `method="MLE"` and
+        both `floc` and `fscale` are given, a
+        `ValueError` is raised if any value `x` in `data` does not satisfy
+        `floc < x < floc + fscale`.\n\n""")
+    def fit(self, data, *args, **kwds):
+        # Override rv_continuous.fit, so we can more efficiently handle the
+        # case where floc and fscale are given.
+
+        floc = kwds.get('floc', None)
+        fscale = kwds.get('fscale', None)
+
+        if floc is None or fscale is None:
+            # do general fit
+            return super().fit(data, *args, **kwds)
+
+        # We already got these from kwds, so just pop them.
+        kwds.pop('floc', None)
+        kwds.pop('fscale', None)
+
+        # A fixed shape may be passed under any of the listed aliases.
+        f0 = _get_fixed_fit_value(kwds, ['f0', 'fa', 'fix_a'])
+        f1 = _get_fixed_fit_value(kwds, ['f1', 'fb', 'fix_b'])
+
+        _remove_optimizer_parameters(kwds)
+
+        if f0 is not None and f1 is not None:
+            # This check is for consistency with `rv_continuous.fit`.
+            raise ValueError("All parameters fixed. There is nothing to "
+                             "optimize.")
+
+        # Special case: loc and scale are constrained, so we are fitting
+        # just the shape parameters.  This can be done much more efficiently
+        # than the method used in `rv_continuous.fit`.  (See the subsection
+        # "Two unknown parameters" in the section "Maximum likelihood" of
+        # the Wikipedia article on the Beta distribution for the formulas.)
+
+        if not np.isfinite(data).all():
+            raise ValueError("The data contains non-finite values.")
+
+        # Normalize the data to the interval [0, 1].
+        data = (np.ravel(data) - floc) / fscale
+        # Values on or outside the boundary are rejected: the logarithms
+        # taken below require 0 < data < 1.
+        if np.any(data <= 0) or np.any(data >= 1):
+            raise FitDataError("beta", lower=floc, upper=floc + fscale)
+
+        xbar = data.mean()
+
+        if f0 is not None or f1 is not None:
+            # One of the shape parameters is fixed.
+
+            if f0 is not None:
+                # The shape parameter a is fixed, so swap the parameters
+                # and flip the data.  We always solve for `a`.  The result
+                # will be swapped back before returning.
+                b = f0
+                data = 1 - data
+                xbar = 1 - xbar
+            else:
+                b = f1
+
+            # Initial guess for a.  Use the formula for the mean of the beta
+            # distribution, E[x] = a / (a + b), to generate a reasonable
+            # starting point based on the mean of the data and the given
+            # value of b.
+            a = b * xbar / (1 - xbar)
+
+            # Compute the MLE for `a` by solving _beta_mle_a.
+            theta, info, ier, mesg = optimize.fsolve(
+                _beta_mle_a, a,
+                args=(b, len(data), np.log(data).sum()),
+                full_output=True
+            )
+            # Any `ier` other than 1 is treated as a solver failure.
+            if ier != 1:
+                raise FitSolverError(mesg=mesg)
+            a = theta[0]
+
+            if f0 is not None:
+                # The shape parameter a was fixed, so swap back the
+                # parameters.
+                a, b = b, a
+
+        else:
+            # Neither of the shape parameters is fixed.
+
+            # s1 and s2 are used in the extra arguments passed to _beta_mle_ab
+            # by optimize.fsolve.
+            s1 = np.log(data).sum()
+            s2 = sc.log1p(-data).sum()
+
+            # Use the "method of moments" to estimate the initial
+            # guess for a and b.
+            fac = xbar * (1 - xbar) / data.var(ddof=0) - 1
+            a = xbar * fac
+            b = (1 - xbar) * fac
+
+            # Compute the MLE for a and b by solving _beta_mle_ab.
+            theta, info, ier, mesg = optimize.fsolve(
+                _beta_mle_ab, [a, b],
+                args=(len(data), s1, s2),
+                full_output=True
+            )
+            if ier != 1:
+                raise FitSolverError(mesg=mesg)
+            a, b = theta
+
+        # Shapes first, then the location and scale that were given.
+        return a, b, floc, fscale
+
+    def _entropy(self, a, b):
+        # Four formulas are defined below; `_lazyselect` at the end chooses
+        # among them element-wise according to the magnitudes of `a` and `b`.
+        def regular(a, b):
+            # Direct formula in terms of `betaln` and the digamma function.
+            return (sc.betaln(a, b) - (a - 1) * sc.psi(a) -
+                    (b - 1) * sc.psi(b) + (a + b - 2) * sc.psi(a + b))
+
+        def asymptotic_ab_large(a, b):
+            # Series in inverse powers of a, b and a + b; selected when both
+            # parameters are very large.
+            sum_ab = a + b
+            log_term = 0.5 * (
+                np.log(2*np.pi) + np.log(a) + np.log(b) - 3*np.log(sum_ab) + 1
+            )
+            t1 = 110/sum_ab + 20*sum_ab**-2.0 + sum_ab**-3.0 - 2*sum_ab**-4.0
+            t2 = -50/a - 10*a**-2.0 - a**-3.0 + a**-4.0
+            t3 = -50/b - 10*b**-2.0 - b**-3.0 + b**-4.0
+            return log_term + (t1 + t2 + t3) / 120
+
+        def asymptotic_b_large(a, b):
+            # Selected when `b` is much larger than `a`: the `a`-dependent
+            # part (t1) is evaluated directly, while the rest is a series in
+            # inverse powers of b and a + b.
+            sum_ab = a + b
+            t1 = sc.gammaln(a) - (a - 1) * sc.psi(a)
+            t2 = (
+                - 1/(2*b) + 1/(12*b) - b**-2.0/12 - b**-3.0/120 + b**-4.0/120
+                + b**-5.0/252 - b**-6.0/252 + 1/sum_ab - 1/(12*sum_ab)
+                + sum_ab**-2.0/6 + sum_ab**-3.0/120 - sum_ab**-4.0/60
+                - sum_ab**-5.0/252 + sum_ab**-6.0/126
+            )
+            log_term = sum_ab*np.log1p(a/b) + np.log(b) - 2*np.log(sum_ab)
+            return t1 + t2 + log_term
+
+        def asymptotic_a_large(a, b):
+            # Same expansion with the roles of `a` and `b` exchanged.
+            return asymptotic_b_large(b, a)
+
+        def threshold_large(v):
+            # Size the other parameter must reach before the one-sided
+            # expansion is selected.  With j = floor(log10(v)) and
+            # d = floor(v / 10**j) + 2, the threshold is d * 10**(7 + j),
+            # except that v == 1 is assigned the fixed value 1000.
+            j = np.floor(np.log10(v))
+            d = np.floor(v / 10 ** j) + 2
+            return xpx.apply_where(v != 1.0, (d, j), lambda d_, j_: d_ * 10**(7 + j_),
+                                   fill_value=1000)
+
+        threshold_a = threshold_large(a)
+        threshold_b = threshold_large(b)
+        # The conditions below are listed in the same order as the formulas
+        # in the second argument.
+        return _lazyselect([(a >= 4.96e6) & (b >= 4.96e6),
+                            (a <= 4.9e6) & (b - a >= 1e6) & (b >= threshold_a),
+                            (b <= 4.9e6) & (a - b >= 1e6) & (a >= threshold_b),
+                            (a < 4.9e6) & (b < 4.9e6)
+                           ],
+                           [asymptotic_ab_large, asymptotic_b_large,
+                            asymptotic_a_large, regular],
+                           [a, b]
+        )
+
+
+beta = beta_gen(a=0.0, b=1.0, name='beta')
+
+
+class betaprime_gen(rv_continuous):
+    r"""A beta prime continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `betaprime` is:
+
+    .. math::
+
+        f(x, a, b) = \frac{x^{a-1} (1+x)^{-a-b}}{\beta(a, b)}
+
+    for :math:`x >= 0`, :math:`a > 0`, :math:`b > 0`, where
+    :math:`\beta(a, b)` is the beta function (see `scipy.special.beta`).
+
+    `betaprime` takes ``a`` and ``b`` as shape parameters.
+
+    The distribution is related to the `beta` distribution as follows:
+    If :math:`X` follows a beta distribution with parameters :math:`a, b`,
+    then :math:`Y = X/(1-X)` has a beta prime distribution with
+    parameters :math:`a, b` ([1]_).
+
+    The beta prime distribution is a reparametrized version of the
+    F distribution.  The beta prime distribution with shape parameters
+    ``a`` and ``b`` and ``scale = s`` is equivalent to the F distribution
+    with parameters ``d1 = 2*a``, ``d2 = 2*b`` and ``scale = (a/b)*s``.
+    For example,
+
+    >>> from scipy.stats import betaprime, f
+    >>> x = [1, 2, 5, 10]
+    >>> a = 12
+    >>> b = 5
+    >>> betaprime.pdf(x, a, b, scale=2)
+    array([0.00541179, 0.08331299, 0.14669185, 0.03150079])
+    >>> f.pdf(x, 2*a, 2*b, scale=(a/b)*2)
+    array([0.00541179, 0.08331299, 0.14669185, 0.03150079])
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] Beta prime distribution, Wikipedia,
+           https://en.wikipedia.org/wiki/Beta_prime_distribution
+
+    %(example)s
+
+    """
+    _support_mask = rv_continuous._open_support_mask
+
+    def _shape_info(self):
+        ia = _ShapeInfo("a", False, (0, np.inf), (False, False))
+        ib = _ShapeInfo("b", False, (0, np.inf), (False, False))
+        return [ia, ib]
+
+    def _rvs(self, a, b, size=None, random_state=None):
+        u1 = gamma.rvs(a, size=size, random_state=random_state)
+        u2 = gamma.rvs(b, size=size, random_state=random_state)
+        return u1 / u2
+
+    def _pdf(self, x, a, b):
+        # betaprime.pdf(x, a, b) = x**(a-1) * (1+x)**(-a-b) / beta(a, b)
+        return np.exp(self._logpdf(x, a, b))
+
+    def _logpdf(self, x, a, b):
+        return sc.xlogy(a - 1.0, x) - sc.xlog1py(a + b, x) - sc.betaln(a, b)
+
+    def _cdf(self, x, a, b):
+        # note: f2 is the direct way to compute the cdf if the relationship
+        # to the beta distribution is used.
+        # however, for very large x, x/(1+x) == 1. since the distribution
+        # has very fat tails if b is small, this can cause inaccurate results
+        # use the following relationship of the incomplete beta function:
+        # betainc(x, a, b) = 1 - betainc(1-x, b, a)
+        # see gh-17631
+        return xpx.apply_where(
+            x > 1, (x, a, b),
+            lambda x_, a_, b_: beta._sf(1 / (1 + x_), b_, a_),
+            lambda x_, a_, b_: beta._cdf(x_ / (1 + x_), a_, b_))
+
+    def _sf(self, x, a, b):
+        return xpx.apply_where(
+            x > 1, (x, a, b),
+            lambda x_, a_, b_: beta._cdf(1 / (1 + x_), b_, a_),
+            lambda x_, a_, b_: beta._sf(x_ / (1 + x_), a_, b_))
+
+    def _ppf(self, p, a, b):
+        # Broadcast the inputs to a common shape so that the boolean mask
+        # computed below can index p, a and b consistently.
+        p, a, b = np.broadcast_arrays(p, a, b)
+        # By default, compute the ppf by solving the following:
+        # p = beta._cdf(x/(1+x), a, b). This implies x = r/(1-r) with
+        # r = beta._ppf(p, a, b). This can cause numerical issues if r is
+        # very close to 1. In that case, invert the alternative expression of
+        # the cdf: p = beta._sf(1/(1+x), b, a).
+        r = stats.beta._ppf(p, a, b)
+        with np.errstate(divide='ignore'):
+            # r == 1 divides by zero here; such entries satisfy r > 0.9999 and
+            # are overwritten by the alternative expression below.
+            out = r / (1 - r)
+        rnear1 = r > 0.9999
+        if np.isscalar(r):
+            # Scalar result: replace the whole value rather than index into it.
+            if rnear1:
+                out = 1/stats.beta._isf(p, b, a) - 1
+        else:
+            # Array result: only the entries flagged by the mask are recomputed.
+            out[rnear1] = 1/stats.beta._isf(p[rnear1], b[rnear1], a[rnear1]) - 1
+        return out
+
+    def _munp(self, n, a, b):
+        return xpx.apply_where(
+            b > n, (a, b),
+            lambda a, b: np.prod([(a+i-1)/(b-i) for i in range(1, int(n)+1)], axis=0),
+            fill_value=np.inf)
+
+
+# Module-level instance of the distribution.
+# NOTE(review): ``a=0.0`` presumably sets the lower bound of the support,
+# matching the ``x >= 0`` domain in the class docstring -- confirm against
+# rv_continuous.
+betaprime = betaprime_gen(a=0.0, name='betaprime')
+
+
+class bradford_gen(rv_continuous):
+    r"""A Bradford continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `bradford` is:
+
+    .. math::
+
+        f(x, c) = \frac{c}{\log(1+c) (1+cx)}
+
+    for :math:`0 <= x <= 1` and :math:`c > 0`.
+
+    `bradford` takes ``c`` as a shape parameter for :math:`c`.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        # Single shape parameter ``c``, declared on the interval (0, inf).
+        return [_ShapeInfo("c", False, (0, np.inf), (False, False))]
+
+    def _pdf(self, x, c):
+        # bradford.pdf(x, c) = c / (k * (1+c*x))
+        # where k = log(1 + c), evaluated here with log1p.
+        return c / (c*x + 1.0) / sc.log1p(c)
+
+    def _cdf(self, x, c):
+        # bradford.cdf(x, c) = log(1 + c*x) / log(1 + c)
+        return sc.log1p(c*x) / sc.log1p(c)
+
+    def _ppf(self, q, c):
+        # Inverse of `_cdf`: q = log1p(c*x)/log1p(c) gives
+        # x = expm1(q*log1p(c)) / c.
+        return sc.expm1(q * sc.log1p(c)) / c
+
+    def _stats(self, c, moments='mv'):
+        # k is the same normalizing constant log(1 + c) used in `_pdf`.
+        k = np.log(1.0+c)
+        mu = (c-k)/(c*k)
+        mu2 = ((c+2.0)*k-2.0*c)/(2*c*k*k)
+        # g1 and g2 are only evaluated when requested through ``moments``
+        # ('s' and 'k' respectively); otherwise None is returned in their place.
+        g1 = None
+        g2 = None
+        if 's' in moments:
+            g1 = np.sqrt(2)*(12*c*c-9*c*k*(c+2)+2*k*k*(c*(c+3)+3))
+            g1 /= np.sqrt(c*(c*(k-2)+2*k))*(3*c*(k-2)+6*k)
+        if 'k' in moments:
+            g2 = (c**3*(k-3)*(k*(3*k-16)+24)+12*k*c*c*(k-4)*(k-3) +
+                  6*c*k*k*(3*k-14) + 12*k**3)
+            g2 /= 3*c*(c*(k-2)+2*k)**2
+        return mu, mu2, g1, g2
+
+    def _entropy(self, c):
+        # Closed form k/2 - log(c/k) with k = log(1 + c).
+        k = np.log(1+c)
+        return k/2.0 - np.log(c/k)
+
+
+# NOTE(review): ``a=0.0`` and ``b=1.0`` presumably set the support bounds,
+# matching the ``0 <= x <= 1`` domain in the docstring -- confirm against
+# rv_continuous.
+bradford = bradford_gen(a=0.0, b=1.0, name='bradford')
+
+
+class burr_gen(rv_continuous):
+    r"""A Burr (Type III) continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    fisk : a special case of either `burr` or `burr12` with ``d=1``
+    burr12 : Burr Type XII distribution
+    mielke : Mielke Beta-Kappa / Dagum distribution
+
+    Notes
+    -----
+    The probability density function for `burr` is:
+
+    .. math::
+
+        f(x; c, d) = c d \frac{x^{-c - 1}}
+                              {{(1 + x^{-c})}^{d + 1}}
+
+    for :math:`x >= 0` and :math:`c, d > 0`.
+
+    `burr` takes ``c`` and ``d`` as shape parameters for :math:`c` and
+    :math:`d`.
+
+    This is the PDF corresponding to the third CDF given in Burr's list;
+    specifically, it is equation (11) in Burr's paper [1]_. The distribution
+    is also commonly referred to as the Dagum distribution [2]_. If the
+    parameter :math:`c < 1` then the mean of the distribution does not
+    exist and if :math:`c < 2` the variance does not exist [2]_.
+    The PDF is finite at the left endpoint :math:`x = 0` if :math:`c * d >= 1`.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] Burr, I. W. "Cumulative frequency functions", Annals of
+       Mathematical Statistics, 13(2), pp 215-232 (1942).
+    .. [2] https://en.wikipedia.org/wiki/Dagum_distribution
+    .. [3] Kleiber, Christian. "A guide to the Dagum distributions."
+       Modeling Income Distributions and Lorenz Curves  pp 97-117 (2008).
+
+    %(example)s
+
+    """
+    # Do not set _support_mask to rv_continuous._open_support_mask
+    # Whether the left-hand endpoint is suitable for pdf evaluation is dependent
+    # on the values of c and d: if c*d >= 1, the pdf is finite, otherwise infinite.
+
+    def _shape_info(self):
+        # Two shape parameters, ``c`` and ``d``, both declared on (0, inf).
+        ic = _ShapeInfo("c", False, (0, np.inf), (False, False))
+        id = _ShapeInfo("d", False, (0, np.inf), (False, False))
+        return [ic, id]
+
+    def _pdf(self, x, c, d):
+        # burr.pdf(x, c, d) = c * d * x**(-c-1) * (1+x**(-c))**(-d-1)
+        # At x == 0 that form raises 0 to a negative power, so the first
+        # function rewrites the density in terms of x**c instead.  The factor
+        # (1 + x**c) equals 1 at x == 0, so its exponent is irrelevant there.
+        output = xpx.apply_where(
+            x == 0, (x, c, d),
+            lambda x_, c_, d_: c_ * d_ * (x_**(c_*d_-1)) / (1 + x_**c_),
+            lambda x_, c_, d_: (c_ * d_ * (x_ ** (-c_ - 1.0)) /
+                                ((1 + x_ ** (-c_)) ** (d_ + 1.0))))
+        # Unwrap a 0-d array result; leave higher-dimensional results as is.
+        return output[()] if output.ndim == 0 else output
+
+    def _logpdf(self, x, c, d):
+        # Same split as in `_pdf`: the x == 0 entries use the form written in
+        # terms of x**c, all other entries the form written in terms of x**(-c).
+        output = xpx.apply_where(
+            x == 0, (x, c, d),
+            lambda x_, c_, d_: (np.log(c_) + np.log(d_) + sc.xlogy(c_*d_ - 1, x_)
+                                - (d_+1) * sc.log1p(x_**(c_))),
+            lambda x_, c_, d_: (np.log(c_) + np.log(d_)
+                                + sc.xlogy(-c_ - 1, x_)
+                                - sc.xlog1py(d_+1, x_**(-c_))))
+        # Unwrap a 0-d array result; leave higher-dimensional results as is.
+        return output[()] if output.ndim == 0 else output
+
+    def _cdf(self, x, c, d):
+        # burr.cdf(x, c, d) = (1 + x**(-c))**(-d)
+        return (1 + x**(-c))**(-d)
+
+    def _logcdf(self, x, c, d):
+        # log of `_cdf`: -d * log(1 + x**(-c)), evaluated with log1p.
+        return sc.log1p(x**(-c)) * (-d)
+
+    def _sf(self, x, c, d):
+        # Survival function obtained by exponentiating `_logsf`.
+        return np.exp(self._logsf(x, c, d))
+
+    def _logsf(self, x, c, d):
+        # log(1 - cdf), with the cdf expression from `_cdf` inlined.
+        return np.log1p(- (1 + x**(-c))**(-d))
+
+    def _ppf(self, q, c, d):
+        # Inverse of `_cdf`: q = (1 + x**(-c))**(-d) gives
+        # x = (q**(-1/d) - 1)**(-1/c).
+        return (q**(-1.0/d) - 1)**(-1.0/c)
+
+    def _isf(self, q, c, d):
+        # Same inverse as `_ppf` with q replaced by 1 - q:
+        # _q = (-1/d) * log1p(-q) = log((1 - q)**(-1/d)), so expm1(_q) is
+        # (1 - q)**(-1/d) - 1.
+        _q = sc.xlog1py(-1.0 / d, -q)
+        return sc.expm1(_q) ** (-1.0 / c)
+
+    def _stats(self, c, d):
+        # nc holds k/c for k = 1..4 as a (4, 1) column, so a single beta call
+        # yields the first four raw moments.
+        nc = np.arange(1, 5).reshape(4,1) / c
+        # ek is the kth raw moment, e1 is the mean e2-e1**2 variance etc.
+        e1, e2, e3, e4 = sc.beta(d + nc, 1. - nc) * d
+        # Each statistic is reported only above its threshold on c
+        # (mu: c > 1, mu2: c > 2, g1: c > 3, g2: c > 4) and is nan otherwise.
+        mu = np.where(c > 1.0, e1, np.nan)
+        mu2_if_c = e2 - mu**2
+        mu2 = np.where(c > 2.0, mu2_if_c, np.nan)
+        g1 = xpx.apply_where(
+            c > 3.0, (e1, e2, e3, mu2_if_c),
+            lambda e1, e2, e3, mu2_if_c: ((e3 - 3*e2*e1 + 2*e1**3)
+                                           / np.sqrt((mu2_if_c)**3)),
+            fill_value=np.nan)
+        g2 = xpx.apply_where(
+            c > 4.0, (e1, e2, e3, e4, mu2_if_c),
+            lambda e1, e2, e3, e4, mu2_if_c: (
+                ((e4 - 4*e3*e1 + 6*e2*e1**2 - 3*e1**4) / mu2_if_c**2) - 3),
+            fill_value=np.nan)
+        # For a scalar ``c`` the results are unwrapped with ``.item()``.
+        if np.ndim(c) == 0:
+            return mu.item(), mu2.item(), g1.item(), g2.item()
+        return mu, mu2, g1, g2
+
+    def _munp(self, n, c, d):
+        # n-th raw moment, d * beta(1 - n/c, d + n/c).
+        def __munp(n, c, d):
+            nc = 1. * n / c
+            return d * sc.beta(1.0 - nc, d + nc)
+        n, c, d = np.asarray(n), np.asarray(c), np.asarray(d)
+        # Evaluate only where c > n and neither n nor d is NaN (``v == v`` is
+        # False for NaN); nan is returned everywhere else.
+        return xpx.apply_where((c > n) & (n == n) & (d == d),
+                               (n, c, d), __munp, fill_value=np.nan)
+
+
+# NOTE(review): ``a=0.0`` presumably sets the lower bound of the support,
+# matching the ``x >= 0`` domain in the docstring -- confirm against
+# rv_continuous.
+burr = burr_gen(a=0.0, name='burr')
+
+
+class burr12_gen(rv_continuous):
+    r"""A Burr (Type XII) continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    fisk : a special case of either `burr` or `burr12` with ``d=1``
+    burr : Burr Type III distribution
+
+    Notes
+    -----
+    The probability density function for `burr12` is:
+
+    .. math::
+
+        f(x; c, d) = c d \frac{x^{c-1}}
+                              {(1 + x^c)^{d + 1}}
+
+    for :math:`x >= 0` and :math:`c, d > 0`.
+
+    `burr12` takes ``c`` and ``d`` as shape parameters for :math:`c`
+    and :math:`d`.
+
+    This is the PDF corresponding to the twelfth CDF given in Burr's list;
+    specifically, it is equation (20) in Burr's paper [1]_.
+
+    %(after_notes)s
+
+    The Burr type 12 distribution is also sometimes referred to as
+    the Singh-Maddala distribution from NIST [2]_.
+
+    References
+    ----------
+    .. [1] Burr, I. W. "Cumulative frequency functions", Annals of
+       Mathematical Statistics, 13(2), pp 215-232 (1942).
+
+    .. [2] https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/b12pdf.htm
+
+    .. [3] "Burr distribution",
+       https://en.wikipedia.org/wiki/Burr_distribution
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        # Two shape parameters, ``c`` and ``d``, both declared on (0, inf).
+        ic = _ShapeInfo("c", False, (0, np.inf), (False, False))
+        id = _ShapeInfo("d", False, (0, np.inf), (False, False))
+        return [ic, id]
+
+    def _pdf(self, x, c, d):
+        # burr12.pdf(x, c, d) = c * d * x**(c-1) * (1+x**(c))**(-d-1)
+        # Obtained by exponentiating the log-density.
+        return np.exp(self._logpdf(x, c, d))
+
+    def _logpdf(self, x, c, d):
+        # log(c) + log(d) + (c - 1)*log(x) - (d + 1)*log(1 + x**c)
+        return np.log(c) + np.log(d) + sc.xlogy(c - 1, x) + sc.xlog1py(-d-1, x**c)
+
+    def _cdf(self, x, c, d):
+        # cdf = 1 - sf, written as -expm1(log(sf)).
+        return -sc.expm1(self._logsf(x, c, d))
+
+    def _logcdf(self, x, c, d):
+        # log(1 - (1 + x**c)**(-d))
+        return sc.log1p(-(1 + x**c)**(-d))
+
+    def _sf(self, x, c, d):
+        # Survival function obtained by exponentiating `_logsf`.
+        return np.exp(self._logsf(x, c, d))
+
+    def _logsf(self, x, c, d):
+        # log((1 + x**c)**(-d)) = -d * log1p(x**c)
+        return sc.xlog1py(-d, x**c)
+
+    def _ppf(self, q, c, d):
+        # The following is an implementation of
+        #   ((1 - q)**(-1.0/d) - 1)**(1.0/c)
+        # that does a better job handling small values of q.
+        return sc.expm1(-1/d * sc.log1p(-q))**(1/c)
+
+    def _isf(self, p, c, d):
+        # Same expression as `_ppf` with 1 - q replaced by p:
+        #   (p**(-1.0/d) - 1)**(1.0/c)
+        return sc.expm1(-1/d * np.log(p))**(1/c)
+
+    def _munp(self, n, c, d):
+        # n-th raw moment, d * beta(1 + n/c, d - n/c); it is evaluated only
+        # where c*d > n and nan is returned everywhere else.
+        def moment_if_exists(n, c, d):
+            nc = 1. * n / c
+            return d * sc.beta(1.0 + nc, d - nc)
+
+        return xpx.apply_where(c * d > n, (n, c, d), moment_if_exists,
+                               fill_value=np.nan)
+
+
+# NOTE(review): ``a=0.0`` presumably sets the lower bound of the support,
+# matching the ``x >= 0`` domain in the docstring -- confirm against
+# rv_continuous.
+burr12 = burr12_gen(a=0.0, name='burr12')
+
+
+class fisk_gen(burr_gen):
+    r"""A Fisk continuous random variable.
+
+    The Fisk distribution is also known as the log-logistic distribution.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    burr
+
+    Notes
+    -----
+    The probability density function for `fisk` is:
+
+    .. math::
+
+        f(x, c) = \frac{c x^{c-1}}
+                       {(1 + x^c)^2}
+
+    for :math:`x >= 0` and :math:`c > 0`.
+
+    Please note that the above expression can be transformed into the following
+    one, which is also commonly used:
+
+    .. math::
+
+        f(x, c) = \frac{c x^{-c-1}}
+                       {(1 + x^{-c})^2}
+
+    `fisk` takes ``c`` as a shape parameter for :math:`c`.
+
+    `fisk` is a special case of `burr` or `burr12` with ``d=1``.
+
+    Suppose ``X`` is a logistic random variable with location ``l``
+    and scale ``s``. Then ``Y = exp(X)`` is a Fisk (log-logistic)
+    random variable with ``scale = exp(l)`` and shape ``c = 1/s``.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    # Apart from `_shape_info` and `_entropy`, every method below delegates to
+    # the module-level `burr` instance with its second shape parameter fixed
+    # to d = 1.0.
+
+    def _shape_info(self):
+        # Only ``c`` remains as a shape parameter; ``d`` is fixed to 1.
+        return [_ShapeInfo("c", False, (0, np.inf), (False, False))]
+
+    def _pdf(self, x, c):
+        # fisk.pdf(x, c) = c * x**(-c-1) * (1 + x**(-c))**(-2)
+        return burr._pdf(x, c, 1.0)
+
+    def _cdf(self, x, c):
+        return burr._cdf(x, c, 1.0)
+
+    def _sf(self, x, c):
+        return burr._sf(x, c, 1.0)
+
+    def _logpdf(self, x, c):
+        # fisk.pdf(x, c) = c * x**(-c-1) * (1 + x**(-c))**(-2)
+        return burr._logpdf(x, c, 1.0)
+
+    def _logcdf(self, x, c):
+        return burr._logcdf(x, c, 1.0)
+
+    def _logsf(self, x, c):
+        return burr._logsf(x, c, 1.0)
+
+    def _ppf(self, x, c):
+        # Note: the first argument is the probability, despite being named x.
+        return burr._ppf(x, c, 1.0)
+
+    def _isf(self, q, c):
+        return burr._isf(q, c, 1.0)
+
+    def _munp(self, n, c):
+        return burr._munp(n, c, 1.0)
+
+    def _stats(self, c):
+        return burr._stats(c, 1.0)
+
+    def _entropy(self, c):
+        # Closed form implemented directly rather than through `burr`.
+        return 2 - np.log(c)
+
+
+# NOTE(review): ``a=0.0`` presumably sets the lower bound of the support,
+# matching the ``x >= 0`` domain in the docstring -- confirm against
+# rv_continuous.
+fisk = fisk_gen(a=0.0, name='fisk')
+
+
+class cauchy_gen(rv_continuous):
+    r"""A Cauchy continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `cauchy` is
+
+    .. math::
+
+        f(x) = \frac{1}{\pi (1 + x^2)}
+
+    for a real number :math:`x`.
+
+    This distribution uses routines from the Boost Math C++ library for
+    the computation of the ``ppf`` and ``isf`` methods. [1]_
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        # `cauchy` has no shape parameters.
+        return []
+
+    def _pdf(self, x):
+        # cauchy.pdf(x) = 1 / (pi * (1 + x**2))
+        # x*x may overflow for large |x|; the overflow warning is silenced.
+        with np.errstate(over='ignore'):
+            return 1.0/np.pi/(1.0+x*x)
+
+    def _logpdf(self, x):
+        # The formulas
+        #     log(1/(pi*(1 + x**2))) = -log(pi) - log(1 + x**2)
+        #                            = -log(pi) - log(x**2*(1 + 1/x**2))
+        #                            = -log(pi) - (2log(|x|) + log1p(1/x**2))
+        # are used here.
+        absx = np.abs(x)
+        # In the following apply_where, `f1` provides better precision than `f2`
+        # for small and moderate x, while `f2` avoids the overflow that can
+        # occur with absx**2.
+        return xpx.apply_where(
+            absx < 1, absx,
+            lambda absx: -_LOG_PI - np.log1p(absx**2),
+            lambda absx: (-_LOG_PI - (2*np.log(absx) + np.log1p((1/absx)**2))))
+
+    def _cdf(self, x):
+        # arctan2(1, -x) equals pi/2 + arctan(x), so this is
+        # 1/2 + arctan(x)/pi.
+        return np.arctan2(1, -x)/np.pi
+
+    def _ppf(self, q):
+        # NOTE(review): the trailing ``0, 1`` are presumably loc and scale of
+        # the standard distribution -- confirm against scu._cauchy_ppf.
+        return scu._cauchy_ppf(q, 0, 1)
+
+    def _sf(self, x):
+        # arctan2(1, x) equals pi/2 - arctan(x), so this is
+        # 1/2 - arctan(x)/pi.
+        return np.arctan2(1, x)/np.pi
+
+    def _isf(self, q):
+        # NOTE(review): the trailing ``0, 1`` are presumably loc and scale of
+        # the standard distribution -- confirm against scu._cauchy_isf.
+        return scu._cauchy_isf(q, 0, 1)
+
+    def _stats(self):
+        # All four statistics are reported as nan.
+        return np.nan, np.nan, np.nan, np.nan
+
+    def _entropy(self):
+        return np.log(4*np.pi)
+
+    def _fitstart(self, data, args=None):
+        # Initialize ML guesses using quartiles instead of moments.
+        # ``args`` is accepted but not used here.
+        if isinstance(data, CensoredData):
+            data = data._uncensor()
+        p25, p50, p75 = np.percentile(data, [25, 50, 75])
+        # Location guess: the median; scale guess: half the interquartile range.
+        return p50, (p75 - p25)/2
+
+
+cauchy = cauchy_gen(name='cauchy')
+
+
+class chi_gen(rv_continuous):
+    r"""A chi continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `chi` is:
+
+    .. math::
+
+        f(x, k) = \frac{1}{2^{k/2-1} \Gamma \left( k/2 \right)}
+                   x^{k-1} \exp \left( -x^2/2 \right)
+
+    for :math:`x >= 0` and :math:`k > 0` (degrees of freedom, denoted ``df``
+    in the implementation). :math:`\Gamma` is the gamma function
+    (`scipy.special.gamma`).
+
+    Special cases of `chi` are:
+
+        - ``chi(1, loc, scale)`` is equivalent to `halfnorm`
+        - ``chi(2, 0, scale)`` is equivalent to `rayleigh`
+        - ``chi(3, 0, scale)`` is equivalent to `maxwell`
+
+    `chi` takes ``df`` as a shape parameter.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        # Single shape parameter ``df``, declared on the interval (0, inf).
+        return [_ShapeInfo("df", False, (0, np.inf), (False, False))]
+
+    def _rvs(self, df, size=None, random_state=None):
+        # A chi variate is the square root of a chi-squared variate.
+        return np.sqrt(chi2.rvs(df, size=size, random_state=random_state))
+
+    def _pdf(self, x, df):
+        #                   x**(df-1) * exp(-x**2/2)
+        # chi.pdf(x, df) =  -------------------------
+        #                   2**(df/2-1) * gamma(df/2)
+        return np.exp(self._logpdf(x, df))
+
+    def _logpdf(self, x, df):
+        # ``l`` is the log of the normalizing constant:
+        # -log(2**(df/2-1) * gamma(df/2)).
+        l = np.log(2) - .5*np.log(2)*df - sc.gammaln(.5*df)
+        return l + sc.xlogy(df - 1., x) - .5*x**2
+
+    def _cdf(self, x, df):
+        # Regularized lower incomplete gamma function at (df/2, x**2/2).
+        return sc.gammainc(.5*df, .5*x**2)
+
+    def _sf(self, x, df):
+        # Regularized upper incomplete gamma function at (df/2, x**2/2).
+        return sc.gammaincc(.5*df, .5*x**2)
+
+    def _ppf(self, q, df):
+        # Inverse of `_cdf`: x**2/2 = gammaincinv(df/2, q).
+        return np.sqrt(2*sc.gammaincinv(.5*df, q))
+
+    def _isf(self, q, df):
+        # Inverse of `_sf`: x**2/2 = gammainccinv(df/2, q).
+        return np.sqrt(2*sc.gammainccinv(.5*df, q))
+
+    def _stats(self, df):
+        # poch(df/2, 1/2) = gamma(df/2 + 1/2) / gamma(df/2)
+        mu = np.sqrt(2) * sc.poch(0.5 * df, 0.5)
+        mu2 = df - mu*mu
+        g1 = (2*mu**3.0 + mu*(1-2*df))/np.asarray(np.power(mu2, 1.5))
+        g2 = 2*df*(1.0-df)-6*mu**4 + 4*mu**2 * (2*df-1)
+        g2 /= np.asarray(mu2**2.0)
+        return mu, mu2, g1, g2
+
+    def _entropy(self, df):
+        # Two formulas are used: the closed form for df < 300 and a series in
+        # inverse powers of df otherwise.
+
+        def regular_formula(df):
+            return (sc.gammaln(.5 * df)
+                    + 0.5 * (df - np.log(2) - (df - 1) * sc.digamma(0.5 * df)))
+
+        def asymptotic_formula(df):
+            return (0.5 + np.log(np.pi)/2 - (df**-1)/6 - (df**-2)/6
+                    - 4/45*(df**-3) + (df**-4)/15)
+
+        return xpx.apply_where(df < 300, df, regular_formula, asymptotic_formula)
+
+
+# NOTE(review): ``a=0.0`` presumably sets the lower bound of the support,
+# matching the ``x >= 0`` domain in the docstring -- confirm against
+# rv_continuous.
+chi = chi_gen(a=0.0, name='chi')
+
+
+class chi2_gen(rv_continuous):
+    r"""A chi-squared continuous random variable.
+
+    For the noncentral chi-square distribution, see `ncx2`.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    ncx2
+
+    Notes
+    -----
+    The probability density function for `chi2` is:
+
+    .. math::
+
+        f(x, k) = \frac{1}{2^{k/2} \Gamma \left( k/2 \right)}
+                   x^{k/2-1} \exp \left( -x/2 \right)
+
+    for :math:`x > 0`  and :math:`k > 0` (degrees of freedom, denoted ``df``
+    in the implementation).
+
+    `chi2` takes ``df`` as a shape parameter.
+
+    The chi-squared distribution is a special case of the gamma
+    distribution, with gamma parameters ``a = df/2``, ``loc = 0`` and
+    ``scale = 2``.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        # Single shape parameter ``df``, declared on the interval (0, inf).
+        return [_ShapeInfo("df", False, (0, np.inf), (False, False))]
+
+    def _rvs(self, df, size=None, random_state=None):
+        # Sample directly with the generator's chi-square sampler.
+        return random_state.chisquare(df, size)
+
+    def _pdf(self, x, df):
+        # chi2.pdf(x, df) = 1 / (2*gamma(df/2)) * (x/2)**(df/2-1) * exp(-x/2)
+        return np.exp(self._logpdf(x, df))
+
+    def _logpdf(self, x, df):
+        # (df/2 - 1)*log(x) - x/2 - log(gamma(df/2)) - (df/2)*log(2)
+        return sc.xlogy(df/2.-1, x) - x/2. - sc.gammaln(df/2.) - (np.log(2)*df)/2.
+
+    def _cdf(self, x, df):
+        return sc.chdtr(df, x)
+
+    def _sf(self, x, df):
+        return sc.chdtrc(df, x)
+
+    def _isf(self, p, df):
+        return sc.chdtri(df, p)
+
+    def _ppf(self, p, df):
+        # Uses the gamma relationship from the docstring (a = df/2, scale = 2).
+        return 2*sc.gammaincinv(df/2, p)
+
+    def _stats(self, df):
+        mu = df
+        mu2 = 2*df
+        g1 = 2*np.sqrt(2.0/df)
+        g2 = 12.0/df
+        return mu, mu2, g1, g2
+
+    def _entropy(self, df):
+        # Both formulas below are written in terms of half_df = df/2; the
+        # asymptotic one takes over for half_df >= 125.
+        half_df = 0.5 * df
+
+        def regular_formula(half_df):
+            return (half_df + np.log(2) + sc.gammaln(half_df) +
+                    (1 - half_df) * sc.psi(half_df))
+
+        def asymptotic_formula(half_df):
+            # plug in the above formula the following asymptotic
+            # expansions:
+            # ln(gamma(a)) ~ (a - 0.5) * ln(a) - a + 0.5 * ln(2 * pi) +
+            #                 1/(12 * a) - 1/(360 * a**3)
+            # psi(a) ~ ln(a) - 1/(2 * a) - 1/(12 * a**2) + 1/(120 * a**4)
+            # With h = 1/(2*a) this collapses to the nested polynomial below.
+            c = np.log(2) + 0.5*(1 + np.log(2*np.pi))
+            h = 0.5/half_df
+            return (h*(-2/3 + h*(-1/3 + h*(-4/45 + h/7.5))) +
+                    0.5*np.log(half_df) + c)
+
+        return xpx.apply_where(half_df < 125, half_df,
+                               regular_formula, asymptotic_formula)
+
+
+# NOTE(review): ``a=0.0`` presumably sets the lower bound of the support --
+# confirm against rv_continuous.
+chi2 = chi2_gen(a=0.0, name='chi2')
+
+
+class cosine_gen(rv_continuous):
+    r"""A cosine continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The cosine distribution is an approximation to the normal distribution.
+    The probability density function for `cosine` is:
+
+    .. math::
+
+        f(x) = \frac{1}{2\pi} (1+\cos(x))
+
+    for :math:`-\pi \le x \le \pi`.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        # `cosine` has no shape parameters.
+        return []
+
+    def _pdf(self, x):
+        # cosine.pdf(x) = 1/(2*pi) * (1+cos(x))
+        return 1.0/2/np.pi*(1+np.cos(x))
+
+    def _logpdf(self, x):
+        # log(1 + cos(x)) - log(2*pi).  Where cos(x) == -1 the density is zero,
+        # so -inf is filled in instead of evaluating log1p(-1).
+        c = np.cos(x)
+        return xpx.apply_where(c != -1, c,
+                               lambda c: np.log1p(c) - np.log(2*np.pi),
+                               fill_value=-np.inf)
+
+    def _cdf(self, x):
+        return scu._cosine_cdf(x)
+
+    def _sf(self, x):
+        # The density is symmetric about 0, so sf(x) = cdf(-x).
+        return scu._cosine_cdf(-x)
+
+    def _ppf(self, p):
+        return scu._cosine_invcdf(p)
+
+    def _isf(self, p):
+        # By the same symmetry, isf(p) = -ppf(p).
+        return -scu._cosine_invcdf(p)
+
+    def _stats(self):
+        # Constant statistics; the first and third entries are 0.0.
+        v = (np.pi * np.pi / 3.0) - 2.0
+        k = -6.0 * (np.pi**4 - 90) / (5.0 * (np.pi * np.pi - 6)**2)
+        return 0.0, v, 0.0, k
+
+    def _entropy(self):
+        return np.log(4*np.pi)-1.0
+
+
+# NOTE(review): ``a=-np.pi`` and ``b=np.pi`` presumably set the support bounds,
+# matching the domain in the docstring -- confirm against rv_continuous.
+cosine = cosine_gen(a=-np.pi, b=np.pi, name='cosine')
+
+
+class dgamma_gen(rv_continuous):
+    r"""A double gamma continuous random variable.
+
+    The double gamma distribution is also known as the reflected gamma
+    distribution [1]_.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `dgamma` is:
+
+    .. math::
+
+        f(x, a) = \frac{1}{2\Gamma(a)} |x|^{a-1} \exp(-|x|)
+
+    for a real number :math:`x` and :math:`a > 0`. :math:`\Gamma` is the
+    gamma function (`scipy.special.gamma`).
+
+    `dgamma` takes ``a`` as a shape parameter for :math:`a`.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] Johnson, Kotz, and Balakrishnan, "Continuous Univariate
+           Distributions, Volume 1", Second Edition, John Wiley and Sons
+           (1994).
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("a", False, (0, np.inf), (False, False))]
+
+    def _rvs(self, a, size=None, random_state=None):
+        u = random_state.uniform(size=size)
+        gm = gamma.rvs(a, size=size, random_state=random_state)
+        return gm * np.where(u >= 0.5, 1, -1)
+
+    def _pdf(self, x, a):
+        # dgamma.pdf(x, a) = 1 / (2*gamma(a)) * abs(x)**(a-1) * exp(-abs(x))
+        ax = abs(x)
+        return 1.0/(2*sc.gamma(a))*ax**(a-1.0) * np.exp(-ax)
+
+    def _logpdf(self, x, a):
+        ax = abs(x)
+        return sc.xlogy(a - 1.0, ax) - ax - np.log(2) - sc.gammaln(a)
+
+    def _cdf(self, x, a):
+        return np.where(x > 0,
+                        0.5 + 0.5*sc.gammainc(a, x),
+                        0.5*sc.gammaincc(a, -x))
+
+    def _sf(self, x, a):
+        return np.where(x > 0,
+                        0.5*sc.gammaincc(a, x),
+                        0.5 + 0.5*sc.gammainc(a, -x))
+
+    def _entropy(self, a):
+        return stats.gamma._entropy(a) - np.log(0.5)
+
+    def _ppf(self, q, a):
+        return np.where(q > 0.5,
+                        sc.gammaincinv(a, 2*q - 1),
+                        -sc.gammainccinv(a, 2*q))
+
+    def _isf(self, q, a):
+        return np.where(q > 0.5,
+                        -sc.gammaincinv(a, 2*q - 1),
+                        sc.gammainccinv(a, 2*q))
+
+    def _stats(self, a):
+        mu2 = a*(a+1.0)
+        return 0.0, mu2, 0.0, (a+2.0)*(a+3.0)/mu2-3.0
+
+
+dgamma = dgamma_gen(name='dgamma')
+
+
+class dpareto_lognorm_gen(rv_continuous):
+    r"""A double Pareto lognormal continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `dpareto_lognorm` is:
+
+    .. math::
+
+        f(x, \mu, \sigma, \alpha, \beta) =
+        \frac{\alpha \beta}{(\alpha + \beta) x}
+        \phi\left( \frac{\log x - \mu}{\sigma} \right)
+        \left( R(y_1) + R(y_2) \right)
+
+    where :math:`R(t) = \frac{1 - \Phi(t)}{\phi(t)}`,
+    :math:`\phi` and :math:`\Phi` are the normal PDF and CDF, respectively,
+    :math:`y_1 = \alpha \sigma - \frac{\log x - \mu}{\sigma}`,
+    and :math:`y_2 = \beta \sigma + \frac{\log x - \mu}{\sigma}`
+    for real numbers :math:`x` and :math:`\mu`, :math:`\sigma > 0`,
+    :math:`\alpha > 0`, and :math:`\beta > 0` [1]_.
+
+    `dpareto_lognorm` takes
+    ``u`` as a shape parameter for :math:`\mu`,
+    ``s`` as a shape parameter for :math:`\sigma`,
+    ``a`` as a shape parameter for :math:`\alpha`, and
+    ``b`` as a shape parameter for :math:`\beta`.
+
+    A random variable :math:`X` distributed according to the PDF above
+    can be represented as :math:`X = U \frac{V_1}{V_2}` where :math:`U`,
+    :math:`V_1`, and :math:`V_2` are independent, :math:`U` is lognormally
+    distributed such that :math:`\log U \sim N(\mu, \sigma^2)`, and
+    :math:`V_1` and :math:`V_2` follow Pareto distributions with parameters
+    :math:`\alpha` and :math:`\beta`, respectively [2]_.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] Hajargasht, Gholamreza, and William E. Griffiths. "Pareto-lognormal
+           distributions: Inequality, poverty, and estimation from grouped income
+           data." Economic Modelling 33 (2013): 593-604.
+    .. [2] Reed, William J., and Murray Jorgensen. "The double Pareto-lognormal
+           distribution - a new parametric model for size distributions."
+           Communications in Statistics - Theory and Methods 33.8 (2004): 1733-1753.
+
+    %(example)s
+
+    """
+    # Short aliases for the private normal-distribution methods of `norm`,
+    # named after the docstring symbols (phi / Phi, plus the complement Phic).
+    _logphi = norm._logpdf
+    _logPhi = norm._logcdf
+    _logPhic = norm._logsf
+    _phi = norm._pdf
+    _Phi = norm._cdf
+    _Phic = norm._sf
+
+    def _R(self, z):
+        # R(z) = (1 - Phi(z)) / phi(z), as defined in the class docstring.
+        return self._Phic(z) / self._phi(z)
+
+    def _logR(self, z):
+        # log of `_R`, computed from the log-sf and log-pdf.
+        return self._logPhic(z) - self._logphi(z)
+
+    def _shape_info(self):
+        # Four shape parameters: ``u`` on (-inf, inf); ``s``, ``a`` and ``b``
+        # on (0, inf).
+        return [_ShapeInfo("u", False, (-np.inf, np.inf), (False, False)),
+                _ShapeInfo("s", False, (0, np.inf), (False, False)),
+                _ShapeInfo("a", False, (0, np.inf), (False, False)),
+                _ShapeInfo("b", False, (0, np.inf), (False, False))]
+
+    def _argcheck(self, u, s, a, b):
+        # ``u`` is not constrained; the other three must be positive.
+        return (s > 0) & (a > 0) & (b > 0)
+
+    def _rvs(self, u, s, a, b, size=None, random_state=None):
+        # From [1] after Equation (12): "To generate pseudo-random
+        # deviates from the dPlN distribution, one can exponentiate
+        # pseudo-random deviates from NL generated using (6)."
+        Z = random_state.normal(u, s, size=size)
+        E1 = random_state.standard_exponential(size=size)
+        E2 = random_state.standard_exponential(size=size)
+        return np.exp(Z + E1 / a - E2 / b)
+
+    def _logpdf(self, x, u, s, a, b):
+        # Floating-point warnings are silenced because log(x) and the terms
+        # derived from it can be non-finite at the ends of the support.
+        with np.errstate(invalid='ignore', divide='ignore'):
+            log_y, m = np.log(x), u  # compare against [1] Eq. 1
+            z = (log_y - m) / s
+            x1 = a * s - z
+            x2 = b * s + z
+            out = np.asarray(np.log(a) + np.log(b) - np.log(a + b) - log_y)
+            out += self._logphi(z)
+            # log(R(x1) + R(x2)) computed from the two log-values.
+            out += np.logaddexp(self._logR(x1), self._logR(x2))
+        # The log-density is set to -inf at x == 0 and at infinite x.
+        out[(x == 0) | np.isinf(x)] = -np.inf
+        # ``out[()]`` unwraps a 0-d array; other arrays are returned as is.
+        return out[()]
+
+    def _logcdf(self, x, u, s, a, b):
+        with np.errstate(invalid='ignore', divide='ignore'):
+            log_y, m = np.log(x), u  # compare against [1] Eq. 2
+            z = (log_y - m) / s
+            x1 = a * s - z
+            x2 = b * s + z
+            t1 = self._logPhi(z)
+            t2 = self._logphi(z)
+            t3 = (np.log(b) + self._logR(x1))
+            t4 = (np.log(a) + self._logR(x2))
+            # ``one`` is a broadcast array of ones used as +/- weights below.
+            t1, t2, t3, t4, one = np.broadcast_arrays(t1, t2, t3, t4, 1)
+            # t3 can be smaller than t4, so we have to consider log of negative number
+            # This would be much simpler, but `return_sign` is available, so use it?
+            # t5 =  sc.logsumexp([t3, t4 + np.pi*1j])
+            # t5 = log|exp(t3) - exp(t4)| and ``sign`` is the sign of that
+            # difference.
+            t5, sign =  sc.logsumexp([t3, t4], b=[one, -one], axis=0, return_sign=True)
+            # out = log(exp(t1) - sign * exp(t2 + t5) / (a + b))
+            temp = [t1, t2 + t5 - np.log(a + b)]
+            out = np.asarray(sc.logsumexp(temp, b=[one, -one*sign], axis=0))
+        # The log-cdf is set to -inf at x == 0.
+        out[x == 0] = -np.inf
+        return out[()]
+
+    def _logsf(self, x, u, s, a, b):
+        # NOTE(review): scu._log1mexp presumably computes log(1 - exp(.)) --
+        # confirm against its definition.
+        return scu._log1mexp(self._logcdf(x, u, s, a, b))
+
+    # Infrastructure doesn't seem to do this, so...
+
+    def _pdf(self, x, u, s, a, b):
+        return np.exp(self._logpdf(x, u, s, a, b))
+
+    def _cdf(self, x, u, s, a, b):
+        return np.exp(self._logcdf(x, u, s, a, b))
+
+    def _sf(self, x, u, s, a, b):
+        return np.exp(self._logsf(x, u, s, a, b))
+
+    def _munp(self, n, u, s, a, b):
+        m, k = u, float(n)  # compare against [1] Eq. 6
+        out = (a * b) / ((a - k) * (b + k)) * np.exp(k * m + k ** 2 * s ** 2 / 2)
+        out = np.asarray(out)
+        # The moment of order k is reported as nan wherever a <= k.
+        out[a <= k] = np.nan
+        return out
+
+
+# NOTE(review): ``a=0`` presumably sets the lower bound of the support --
+# confirm against rv_continuous.
+dpareto_lognorm = dpareto_lognorm_gen(a=0, name='dpareto_lognorm')
+
+
+class dweibull_gen(rv_continuous):
+    r"""A double Weibull continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `dweibull` is given by
+
+    .. math::
+
+        f(x, c) = c / 2 |x|^{c-1} \exp(-|x|^c)
+
+    for a real number :math:`x` and :math:`c > 0`.
+
+    `dweibull` takes ``c`` as a shape parameter for :math:`c`.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        # Single shape parameter ``c``, declared on the interval (0, inf).
+        return [_ShapeInfo("c", False, (0, np.inf), (False, False))]
+
+    def _rvs(self, c, size=None, random_state=None):
+        # A weibull_min variate with a random sign: +1 when u >= 0.5,
+        # -1 otherwise.
+        u = random_state.uniform(size=size)
+        w = weibull_min.rvs(c, size=size, random_state=random_state)
+        return w * (np.where(u >= 0.5, 1, -1))
+
+    def _pdf(self, x, c):
+        # dweibull.pdf(x, c) = c / 2 * abs(x)**(c-1) * exp(-abs(x)**c)
+        ax = abs(x)
+        Px = c / 2.0 * ax**(c-1.0) * np.exp(-ax**c)
+        return Px
+
+    def _logpdf(self, x, c):
+        # log(c) - log(2) + (c - 1)*log|x| - |x|**c
+        ax = abs(x)
+        return np.log(c) - np.log(2.0) + sc.xlogy(c - 1.0, ax) - ax**c
+
+    def _cdf(self, x, c):
+        # Cx1 = 0.5*exp(-|x|**c) is returned as is for x <= 0 and as
+        # 1 - Cx1 for x > 0.
+        Cx1 = 0.5 * np.exp(-abs(x)**c)
+        return np.where(x > 0, 1 - Cx1, Cx1)
+
+    def _ppf(self, q, c):
+        # ``fac`` starts as twice the smaller of q and 1 - q; the magnitude of
+        # the quantile is (-log(fac))**(1/c), positive when q > 0.5 and
+        # negative otherwise.
+        fac = 2. * np.where(q <= 0.5, q, 1. - q)
+        fac = np.power(-np.log(fac), 1.0 / c)
+        return np.where(q > 0.5, fac, -fac)
+
+    def _sf(self, x, c):
+        # Half of the weibull_min survival function at |x| for x > 0, and its
+        # complement otherwise.
+        half_weibull_min_sf = 0.5 * stats.weibull_min._sf(np.abs(x), c)
+        return np.where(x > 0, half_weibull_min_sf, 1 - half_weibull_min_sf)
+
+    def _isf(self, q, c):
+        # Mirror image of `_ppf`, expressed through weibull_min's inverse
+        # survival function; the result is negative when q > 0.5.
+        double_q = 2. * np.where(q <= 0.5, q, 1. - q)
+        weibull_min_isf = stats.weibull_min._isf(double_q, c)
+        return np.where(q > 0.5, -weibull_min_isf, weibull_min_isf)
+
+    def _munp(self, n, c):
+        # The factor (1 - n % 2) is 0 for odd n and 1 for even n, so odd
+        # moments are zero and even moments are gamma(1 + n/c).
+        return (1 - (n % 2)) * sc.gamma(1.0 + 1.0 * n / c)
+
+    # since we know that all odd moments are zeros, return them at once.
+    # returning Nones from _stats makes the public stats call _munp
+    # so overall we're saving one or two gamma function evaluations here.
+    def _stats(self, c):
+        return 0, None, 0, None
+
+    def _entropy(self, c):
+        # Entropy of weibull_min plus log(2) (= -log(0.5)).
+        h = stats.weibull_min._entropy(c) - np.log(0.5)
+        return h
+
+
+dweibull = dweibull_gen(name='dweibull')
+
+
+class expon_gen(rv_continuous):
+    r"""An exponential continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `expon` is:
+
+    .. math::
+
+        f(x) = \exp(-x)
+
+    for :math:`x \ge 0`.
+
+    %(after_notes)s
+
+    A common parameterization for `expon` is in terms of the rate parameter
+    ``lambda``, such that ``pdf = lambda * exp(-lambda * x)``. This
+    parameterization corresponds to using ``scale = 1 / lambda``.
+
+    The exponential distribution is a special case of the gamma
+    distributions, with gamma shape parameter ``a = 1``.
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return []  # expon has no shape parameters
+
+    def _rvs(self, size=None, random_state=None):
+        return random_state.standard_exponential(size)
+
+    def _pdf(self, x):
+        # expon.pdf(x) = exp(-x)
+        return np.exp(-x)
+
+    def _logpdf(self, x):
+        return -x  # log(exp(-x))
+
+    def _cdf(self, x):
+        return -sc.expm1(-x)  # 1 - exp(-x), written with expm1
+
+    def _ppf(self, q):
+        return -sc.log1p(-q)  # -log(1 - q), written with log1p
+
+    def _sf(self, x):
+        return np.exp(-x)
+
+    def _logsf(self, x):
+        return -x
+
+    def _isf(self, q):
+        return -np.log(q)  # inverts _sf
+
+    def _stats(self):
+        return 1.0, 1.0, 2.0, 6.0  # mean, variance, skewness, excess kurtosis
+
+    def _entropy(self):
+        return 1.0
+
+    @_call_super_mom
+    @replace_notes_in_docstring(rv_continuous, notes="""\
+        When `method='MLE'`,
+        this function uses explicit formulas for the maximum likelihood
+        estimation of the exponential distribution parameters, so the
+        `optimizer`, `loc` and `scale` keyword arguments are
+        ignored.\n\n""")
+    def fit(self, data, *args, **kwds):
+        if len(args) > 0:  # positional extras are rejected outright
+            raise TypeError("Too many arguments.")
+
+        floc = kwds.pop('floc', None)  # user-fixed location, if any
+        fscale = kwds.pop('fscale', None)  # user-fixed scale, if any
+
+        _remove_optimizer_parameters(kwds)  # NOTE(review): helper defined elsewhere
+
+        if floc is not None and fscale is not None:
+            # This check is for consistency with `rv_continuous.fit`.
+            raise ValueError("All parameters fixed. There is nothing to "
+                             "optimize.")
+
+        data = np.asarray(data)
+
+        if not np.isfinite(data).all():  # nan and +/-inf are both rejected
+            raise ValueError("The data contains non-finite values.")
+
+        data_min = data.min()
+
+        if floc is None:
+            # ML estimate of the location is the minimum of the data.
+            loc = data_min
+        else:
+            loc = floc
+            if data_min < loc:
+                # There are values that are less than the specified loc.
+                raise FitDataError("expon", lower=floc, upper=np.inf)
+
+        if fscale is None:
+            # ML estimate of the scale is the shifted mean.
+            scale = data.mean() - loc
+        else:
+            scale = fscale
+
+        # We expect the return values to be floating point, so ensure it
+        # by explicitly converting to float.
+        return float(loc), float(scale)
+
+
+expon = expon_gen(a=0.0, name='expon')  # a=0.0: presumably the lower support bound
+
+
+class exponnorm_gen(rv_continuous):
+    r"""An exponentially modified Normal continuous random variable.
+
+    Also known as the exponentially modified Gaussian distribution [1]_.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `exponnorm` is:
+
+    .. math::
+
+        f(x, K) = \frac{1}{2K} \exp\left(\frac{1}{2 K^2} - x / K \right)
+                  \text{erfc}\left(-\frac{x - 1/K}{\sqrt{2}}\right)
+
+    where :math:`x` is a real number and :math:`K > 0`.
+
+    It can be thought of as the sum of a standard normal random variable
+    and an independent exponentially distributed random variable with rate
+    ``1/K``.
+
+    %(after_notes)s
+
+    An alternative parameterization of this distribution (for example, in
+    the Wikipedia article [1]_) involves three parameters, :math:`\mu`,
+    :math:`\lambda` and :math:`\sigma`.
+
+    In the present parameterization this corresponds to having ``loc`` and
+    ``scale`` equal to :math:`\mu` and :math:`\sigma`, respectively, and
+    shape parameter :math:`K = 1/(\sigma\lambda)`.
+
+    .. versionadded:: 0.16.0
+
+    References
+    ----------
+    .. [1] Exponentially modified Gaussian distribution, Wikipedia,
+           https://en.wikipedia.org/wiki/Exponentially_modified_Gaussian_distribution
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("K", False, (0, np.inf), (False, False))]
+
+    def _rvs(self, K, size=None, random_state=None):
+        expval = random_state.standard_exponential(size) * K  # exponential part, scaled by K
+        gval = random_state.standard_normal(size)  # standard normal part
+        return expval + gval
+
+    def _pdf(self, x, K):
+        return np.exp(self._logpdf(x, K))
+
+    def _logpdf(self, x, K):
+        invK = 1.0 / K
+        exparg = invK * (0.5 * invK - x)  # 1/(2 K**2) - x/K
+        return exparg + _norm_logcdf(x - invK) - np.log(K)
+
+    def _cdf(self, x, K):
+        invK = 1.0 / K
+        expval = invK * (0.5 * invK - x)  # same exponent as in _logpdf
+        logprod = expval + _norm_logcdf(x - invK)  # product kept in log space
+        return _norm_cdf(x) - np.exp(logprod)
+
+    def _sf(self, x, K):
+        invK = 1.0 / K
+        expval = invK * (0.5 * invK - x)
+        logprod = expval + _norm_logcdf(x - invK)
+        return _norm_cdf(-x) + np.exp(logprod)  # mirrors _cdf: _norm_cdf(-x), sign flipped
+
+    def _stats(self, K):
+        K2 = K * K
+        opK2 = 1.0 + K2  # 1 + K**2
+        skw = 2 * K**3 * opK2**(-1.5)
+        krt = 6.0 * K2 * K2 * opK2**(-2)
+        return K, opK2, skw, krt  # mean K, variance 1 + K**2, then skew and kurtosis terms
+
+
+exponnorm = exponnorm_gen(name='exponnorm')  # distribution instance; no a/b bounds passed
+
+
+def _pow1pm1(x, y):
+    """
+    Compute (1 + x)**y - 1.
+
+    Uses expm1 and xlog1py to avoid loss of precision when
+    (1 + x)**y is close to 1.
+
+    Note that the inverse of this function with respect to x is
+    ``_pow1pm1(x, 1/y)``.  That is, if
+
+        t = _pow1pm1(x, y)
+
+    then
+
+        x = _pow1pm1(t, 1/y)
+    """
+    return np.expm1(sc.xlog1py(y, x))  # expm1(y * log1p(x))
+
+
+class exponweib_gen(rv_continuous):
+    r"""An exponentiated Weibull continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    weibull_min, numpy.random.Generator.weibull
+
+    Notes
+    -----
+    The probability density function for `exponweib` is:
+
+    .. math::
+
+        f(x, a, c) = a c [1-\exp(-x^c)]^{a-1} \exp(-x^c) x^{c-1}
+
+    and its cumulative distribution function is:
+
+    .. math::
+
+        F(x, a, c) = [1-\exp(-x^c)]^a
+
+    for :math:`x > 0`, :math:`a > 0`, :math:`c > 0`.
+
+    `exponweib` takes :math:`a` and :math:`c` as shape parameters:
+
+    * :math:`a` is the exponentiation parameter,
+      with the special case :math:`a=1` corresponding to the
+      (non-exponentiated) Weibull distribution `weibull_min`.
+    * :math:`c` is the shape parameter of the non-exponentiated Weibull law.
+
+    %(after_notes)s
+
+    References
+    ----------
+    https://en.wikipedia.org/wiki/Exponentiated_Weibull_distribution
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        ia = _ShapeInfo("a", False, (0, np.inf), (False, False))
+        ic = _ShapeInfo("c", False, (0, np.inf), (False, False))
+        return [ia, ic]
+
+    def _pdf(self, x, a, c):
+        # exponweib.pdf(x, a, c) =
+        #     a * c * (1-exp(-x**c))**(a-1) * exp(-x**c)*x**(c-1)
+        return np.exp(self._logpdf(x, a, c))
+
+    def _logpdf(self, x, a, c):
+        negxc = -x**c  # -(x**c)
+        exm1c = -sc.expm1(negxc)  # 1 - exp(-x**c)
+        logp = (np.log(a) + np.log(c) + sc.xlogy(a - 1.0, exm1c) +
+                negxc + sc.xlogy(c - 1.0, x))
+        return logp
+
+    def _cdf(self, x, a, c):
+        exm1c = -sc.expm1(-x**c)  # 1 - exp(-x**c)
+        return exm1c**a
+
+    def _ppf(self, q, a, c):
+        return (-sc.log1p(-q**(1.0/a)))**np.asarray(1.0/c)  # inverts _cdf
+
+    def _sf(self, x, a, c):
+        return -_pow1pm1(-np.exp(-x**c), a)  # 1 - (1 - exp(-x**c))**a
+
+    def _isf(self, p, a, c):
+        return (-np.log(-_pow1pm1(-p, 1/a)))**(1/c)  # inverts _sf
+
+
+exponweib = exponweib_gen(a=0.0, name='exponweib')  # a=0.0 here is a constructor kwarg, not the shape a
+
+
+class exponpow_gen(rv_continuous):
+    r"""An exponential power continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `exponpow` is:
+
+    .. math::
+
+        f(x, b) = b x^{b-1} \exp(1 + x^b - \exp(x^b))
+
+    for :math:`x \ge 0`, :math:`b > 0`.  Note that this is a different
+    distribution from the exponential power distribution that is also known
+    under the names "generalized normal" or "generalized Gaussian".
+
+    `exponpow` takes ``b`` as a shape parameter for :math:`b`.
+
+    %(after_notes)s
+
+    References
+    ----------
+    http://www.math.wm.edu/~leemis/chart/UDR/PDFs/Exponentialpower.pdf
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("b", False, (0, np.inf), (False, False))]
+
+    def _pdf(self, x, b):
+        # exponpow.pdf(x, b) = b * x**(b-1) * exp(1 + x**b - exp(x**b))
+        return np.exp(self._logpdf(x, b))
+
+    def _logpdf(self, x, b):
+        xb = x**b
+        f = 1 + np.log(b) + sc.xlogy(b - 1.0, x) + xb - np.exp(xb)  # log of the pdf above
+        return f
+
+    def _cdf(self, x, b):
+        return -sc.expm1(-sc.expm1(x**b))  # 1 - exp(1 - exp(x**b))
+
+    def _sf(self, x, b):
+        return np.exp(-sc.expm1(x**b))  # exp(1 - exp(x**b))
+
+    def _isf(self, x, b):  # NOTE: here x is the probability argument, not a quantile
+        return (sc.log1p(-np.log(x)))**(1./b)  # inverts _sf
+
+    def _ppf(self, q, b):
+        return pow(sc.log1p(-sc.log1p(-q)), 1.0/b)  # inverts _cdf
+
+
+exponpow = exponpow_gen(a=0.0, name='exponpow')  # a=0.0: presumably the lower support bound
+
+
+class fatiguelife_gen(rv_continuous):
+    r"""A fatigue-life (Birnbaum-Saunders) continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `fatiguelife` is:
+
+    .. math::
+
+        f(x, c) = \frac{x+1}{2c\sqrt{2\pi x^3}} \exp(-\frac{(x-1)^2}{2x c^2})
+
+    for :math:`x >= 0` and :math:`c > 0`.
+
+    `fatiguelife` takes ``c`` as a shape parameter for :math:`c`.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] "Birnbaum-Saunders distribution",
+           https://en.wikipedia.org/wiki/Birnbaum-Saunders_distribution
+
+    %(example)s
+
+    """
+    _support_mask = rv_continuous._open_support_mask
+
+    def _shape_info(self):
+        return [_ShapeInfo("c", False, (0, np.inf), (False, False))]
+
+    def _rvs(self, c, size=None, random_state=None):
+        z = random_state.standard_normal(size)  # standard normal draws
+        x = 0.5*c*z
+        x2 = x*x
+        t = 1.0 + 2*x2 + 2*x*np.sqrt(1 + x2)  # equals (x + sqrt(1 + x2))**2
+        return t
+
+    def _pdf(self, x, c):
+        # fatiguelife.pdf(x, c) =
+        #     (x+1) / (2*c*sqrt(2*pi*x**3)) * exp(-(x-1)**2/(2*x*c**2))
+        return np.exp(self._logpdf(x, c))
+
+    def _logpdf(self, x, c):
+        return (np.log(x+1) - (x-1)**2 / (2.0*x*c**2) - np.log(2*c) -
+                0.5*(np.log(2*np.pi) + 3*np.log(x)))
+
+    def _cdf(self, x, c):
+        return _norm_cdf(1.0 / c * (np.sqrt(x) - 1.0/np.sqrt(x)))
+
+    def _ppf(self, q, c):
+        tmp = c * _norm_ppf(q)
+        return 0.25 * (tmp + np.sqrt(tmp**2 + 4))**2
+
+    def _sf(self, x, c):
+        return _norm_sf(1.0 / c * (np.sqrt(x) - 1.0/np.sqrt(x)))  # same argument as in _cdf
+
+    def _isf(self, q, c):
+        tmp = -c * _norm_ppf(q)  # same as _ppf with the sign of tmp flipped
+        return 0.25 * (tmp + np.sqrt(tmp**2 + 4))**2
+
+    def _stats(self, c):
+        # NB: the formula for kurtosis in wikipedia seems to have an error:
+        # it's 40, not 41. At least it disagrees with the one from Wolfram
+        # Alpha.  And the latter one, below, passes the tests, while the wiki
+        # one doesn't. So far I didn't have the guts to actually check the
+        # coefficients from the expressions for the raw moments.
+        c2 = c*c
+        mu = c2 / 2.0 + 1.0
+        den = 5.0 * c2 + 4.0  # shared by mu2, g1 and g2
+        mu2 = c2*den / 4.0
+        g1 = 4 * c * (11*c2 + 6.0) / np.power(den, 1.5)
+        g2 = 6 * c2 * (93*c2 + 40.0) / den**2.0  # the "40" discussed above
+        return mu, mu2, g1, g2
+
+
+fatiguelife = fatiguelife_gen(a=0.0, name='fatiguelife')  # a=0.0: presumably the lower support bound
+
+
+class foldcauchy_gen(rv_continuous):
+    r"""A folded Cauchy continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `foldcauchy` is:
+
+    .. math::
+
+        f(x, c) = \frac{1}{\pi (1+(x-c)^2)} + \frac{1}{\pi (1+(x+c)^2)}
+
+    for :math:`x \ge 0` and :math:`c \ge 0`.
+
+    `foldcauchy` takes ``c`` as a shape parameter for :math:`c`.
+
+    %(example)s
+
+    """
+    def _argcheck(self, c):
+        return c >= 0  # c == 0 is accepted
+
+    def _shape_info(self):
+        return [_ShapeInfo("c", False, (0, np.inf), (True, False))]
+
+    def _rvs(self, c, size=None, random_state=None):
+        return abs(cauchy.rvs(loc=c, size=size,  # |Cauchy draw located at c|
+                              random_state=random_state))
+
+    def _pdf(self, x, c):
+        # foldcauchy.pdf(x, c) = 1/(pi*(1+(x-c)**2)) + 1/(pi*(1+(x+c)**2))
+        return 1.0/np.pi*(1.0/(1+(x-c)**2) + 1.0/(1+(x+c)**2))
+
+    def _cdf(self, x, c):
+        return 1.0/np.pi*(np.arctan(x-c) + np.arctan(x+c))
+
+    def _sf(self, x, c):
+        # 1 - CDF(x, c) = 1 - (atan(x - c) + atan(x + c))/pi
+        #               = ((pi/2 - atan(x - c)) + (pi/2 - atan(x + c)))/pi
+        #               = (acot(x - c) + acot(x + c))/pi
+        #               = (atan2(1, x - c) + atan2(1, x + c))/pi
+        return (np.arctan2(1, x - c) + np.arctan2(1, x + c))/np.pi
+
+    def _stats(self, c):
+        return np.inf, np.inf, np.nan, np.nan  # inf for the first two slots, nan for the last two
+
+
+foldcauchy = foldcauchy_gen(a=0.0, name='foldcauchy')  # a=0.0: presumably the lower support bound
+
+
+class f_gen(rv_continuous):
+    r"""An F continuous random variable.
+
+    For the noncentral F distribution, see `ncf`.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    ncf
+
+    Notes
+    -----
+    The F distribution with :math:`df_1 > 0` and :math:`df_2 > 0` degrees of freedom is
+    the distribution of the ratio of two independent chi-squared distributions with
+    :math:`df_1` and :math:`df_2` degrees of freedom, after rescaling by
+    :math:`df_2 / df_1`.
+
+    The probability density function for `f` is:
+
+    .. math::
+
+        f(x, df_1, df_2) = \frac{df_2^{df_2/2} df_1^{df_1/2} x^{df_1 / 2-1}}
+                                {(df_2+df_1 x)^{(df_1+df_2)/2}
+                                 B(df_1/2, df_2/2)}
+
+    for :math:`x > 0`.
+
+    `f` accepts shape parameters ``dfn`` and ``dfd`` for :math:`df_1`, the degrees of
+    freedom of the chi-squared distribution in the numerator, and :math:`df_2`, the
+    degrees of freedom of the chi-squared distribution in the denominator, respectively.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        idfn = _ShapeInfo("dfn", False, (0, np.inf), (False, False))
+        idfd = _ShapeInfo("dfd", False, (0, np.inf), (False, False))
+        return [idfn, idfd]
+
+    def _rvs(self, dfn, dfd, size=None, random_state=None):
+        return random_state.f(dfn, dfd, size)
+
+    def _pdf(self, x, dfn, dfd):
+        #                      df2**(df2/2) * df1**(df1/2) * x**(df1/2-1)
+        # F.pdf(x, df1, df2) = --------------------------------------------
+        #                      (df2+df1*x)**((df1+df2)/2) * B(df1/2, df2/2)
+        return np.exp(self._logpdf(x, dfn, dfd))
+
+    def _logpdf(self, x, dfn, dfd):
+        n = 1.0 * dfn  # multiplying by 1.0 promotes integer input to float
+        m = 1.0 * dfd
+        lPx = (m/2 * np.log(m) + n/2 * np.log(n) + sc.xlogy(n/2 - 1, x)
+               - (((n+m)/2) * np.log(m + n*x) + sc.betaln(n/2, m/2)))
+        return lPx
+
+    def _cdf(self, x, dfn, dfd):
+        return sc.fdtr(dfn, dfd, x)
+
+    def _sf(self, x, dfn, dfd):
+        return sc.fdtrc(dfn, dfd, x)
+
+    def _ppf(self, q, dfn, dfd):
+        return sc.fdtri(dfn, dfd, q)
+
+    def _stats(self, dfn, dfd):
+        v1, v2 = 1. * dfn, 1. * dfd
+        v2_2, v2_4, v2_6, v2_8 = v2 - 2., v2 - 4., v2 - 6., v2 - 8.
+
+        mu = xpx.apply_where(  # inf unless dfd > 2
+            v2 > 2, (v2, v2_2),
+            lambda v2, v2_2: v2 / v2_2,
+            fill_value=np.inf)
+
+        mu2 = xpx.apply_where(  # inf unless dfd > 4
+            v2 > 4, (v1, v2, v2_2, v2_4),
+            lambda v1, v2, v2_2, v2_4:
+            2 * v2 * v2 * (v1 + v2_2) / (v1 * v2_2**2 * v2_4),
+            fill_value=np.inf)
+
+        g1 = xpx.apply_where(  # nan unless dfd > 6
+            v2 > 6, (v1, v2_2, v2_4, v2_6),
+            lambda v1, v2_2, v2_4, v2_6:
+            (2 * v1 + v2_2) / v2_6 * np.sqrt(v2_4 / (v1 * (v1 + v2_2))),
+            fill_value=np.nan)
+        g1 *= np.sqrt(8.)  # constant factor applied outside apply_where
+
+        g2 = xpx.apply_where(  # nan unless dfd > 8; reuses the scaled g1
+            v2 > 8, (g1, v2_6, v2_8),
+            lambda g1, v2_6, v2_8: (8 + g1 * g1 * v2_6) / v2_8,
+            fill_value=np.nan)
+        g2 *= 3. / 2.
+
+        return mu, mu2, g1, g2
+
+    def _entropy(self, dfn, dfd):
+        # the formula found in literature is incorrect. This one yields the
+        # same result as numerical integration using the generic entropy
+        # definition. This is also tested in tests/test_continuous_basic
+        half_dfn = 0.5 * dfn
+        half_dfd = 0.5 * dfd
+        half_sum = 0.5 * (dfn + dfd)
+
+        return (np.log(dfd) - np.log(dfn) + sc.betaln(half_dfn, half_dfd) +
+                (1 - half_dfn) * sc.psi(half_dfn) - (1 + half_dfd) *
+                sc.psi(half_dfd) + half_sum * sc.psi(half_sum))
+
+
+f = f_gen(a=0.0, name='f')  # a=0.0: presumably the lower support bound
+
+
+## Folded Normal
+##   abs(Z) where (Z is normal with mu=L and std=S so that c=abs(L)/S)
+##
+##  note: regress docs have scale parameter correct, but first parameter
+##    he gives is a shape parameter A = c * scale
+
+##  Half-normal is folded normal with shape-parameter c=0.
+
+class foldnorm_gen(rv_continuous):
+    r"""A folded normal continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `foldnorm` is:
+
+    .. math::
+
+        f(x, c) = \sqrt{2/\pi} cosh(c x) \exp(-\frac{x^2+c^2}{2})
+
+    for :math:`x \ge 0` and :math:`c \ge 0`.
+
+    `foldnorm` takes ``c`` as a shape parameter for :math:`c`.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _argcheck(self, c):
+        return c >= 0  # c == 0 is accepted
+
+    def _shape_info(self):
+        return [_ShapeInfo("c", False, (0, np.inf), (True, False))]
+
+    def _rvs(self, c, size=None, random_state=None):
+        return abs(random_state.standard_normal(size) + c)  # |Z + c| for standard normal Z
+
+    def _pdf(self, x, c):
+        # foldnormal.pdf(x, c) = sqrt(2/pi) * cosh(c*x) * exp(-(x**2+c**2)/2)
+        return _norm_pdf(x + c) + _norm_pdf(x-c)  # evaluated as a sum of two _norm_pdf terms
+
+    def _cdf(self, x, c):
+        sqrt_two = np.sqrt(2)
+        return 0.5 * (sc.erf((x - c)/sqrt_two) + sc.erf((x + c)/sqrt_two))
+
+    def _sf(self, x, c):
+        return _norm_sf(x - c) + _norm_sf(x + c)
+
+    def _stats(self, c):
+        # Regina C. Elandt, Technometrics 3, 551 (1961)
+        # https://www.jstor.org/stable/1266561
+        #
+        c2 = c*c
+        expfac = np.exp(-0.5*c2) / np.sqrt(2.*np.pi)  # exp(-c**2/2) / sqrt(2*pi)
+
+        mu = 2.*expfac + c * sc.erf(c/np.sqrt(2))
+        mu2 = c2 + 1 - mu*mu
+
+        g1 = 2. * (mu*mu*mu - c2*mu - expfac)
+        g1 /= np.power(mu2, 1.5)
+
+        g2 = c2 * (c2 + 6.) + 3 + 8.*expfac*mu
+        g2 += (2. * (c2 - 3.) - 3. * mu**2) * mu**2
+        g2 = g2 / mu2**2.0 - 3.  # the trailing -3 makes this an excess-type kurtosis
+
+        return mu, mu2, g1, g2
+
+
+foldnorm = foldnorm_gen(a=0.0, name='foldnorm')  # a=0.0: presumably the lower support bound
+
+
+class weibull_min_gen(rv_continuous):
+    r"""Weibull minimum continuous random variable.
+
+    The Weibull Minimum Extreme Value distribution, from extreme value theory
+    (Fisher-Gnedenko theorem), is also often simply called the Weibull
+    distribution. It arises as the limiting distribution of the rescaled
+    minimum of iid random variables.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    weibull_max, numpy.random.Generator.weibull, exponweib
+
+    Notes
+    -----
+    The probability density function for `weibull_min` is:
+
+    .. math::
+
+        f(x, c) = c x^{c-1} \exp(-x^c)
+
+    for :math:`x > 0`, :math:`c > 0`.
+
+    `weibull_min` takes ``c`` as a shape parameter for :math:`c`.
+    (named :math:`k` in Wikipedia article and :math:`a` in
+    ``numpy.random.weibull``).  Special shape values are :math:`c=1` and
+    :math:`c=2` where Weibull distribution reduces to the `expon` and
+    `rayleigh` distributions respectively.
+
+    Suppose ``X`` is an exponentially distributed random variable with
+    scale ``s``. Then ``Y = X**k`` is `weibull_min` distributed with shape
+    ``c = 1/k`` and scale ``s**k``.
+
+    %(after_notes)s
+
+    References
+    ----------
+    https://en.wikipedia.org/wiki/Weibull_distribution
+
+    https://en.wikipedia.org/wiki/Fisher-Tippett-Gnedenko_theorem
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("c", False, (0, np.inf), (False, False))]
+
+    def _pdf(self, x, c):
+        # weibull_min.pdf(x, c) = c * x**(c-1) * exp(-x**c)
+        return c*pow(x, c-1)*np.exp(-pow(x, c))
+
+    def _logpdf(self, x, c):
+        return np.log(c) + sc.xlogy(c - 1, x) - pow(x, c)
+
+    def _cdf(self, x, c):
+        return -sc.expm1(-pow(x, c))  # 1 - exp(-x**c), written with expm1
+
+    def _ppf(self, q, c):
+        return pow(-sc.log1p(-q), 1.0/c)  # inverts _cdf
+
+    def _sf(self, x, c):
+        return np.exp(self._logsf(x, c))
+
+    def _logsf(self, x, c):
+        return -pow(x, c)
+
+    def _isf(self, q, c):
+        return (-np.log(q))**(1/c)  # inverts _sf
+
+    def _munp(self, n, c):
+        return sc.gamma(1.0+n*1.0/c)  # gamma(1 + n/c)
+
+    def _entropy(self, c):
+        return -_EULER / c - np.log(c) + _EULER + 1
+
+    @extend_notes_in_docstring(rv_continuous, notes="""\
+        If ``method='mm'``, parameters fixed by the user are respected, and the
+        remaining parameters are used to match distribution and sample moments
+        where possible. For example, if the user fixes the location with
+        ``floc``, the parameters will only match the distribution skewness and
+        variance to the sample skewness and variance; no attempt will be made
+        to match the means or minimize a norm of the errors.
+        \n\n""")
+    def fit(self, data, *args, **kwds):
+
+        if isinstance(data, CensoredData):
+            if data.num_censored() == 0:
+                data = data._uncensor()  # no censored points: continue below
+            else:
+                return super().fit(data, *args, **kwds)  # censored data: inherited fit
+
+        if kwds.pop('superfit', False):  # caller asked for the inherited fit
+            return super().fit(data, *args, **kwds)
+
+        # this extracts fixed shape, location, and scale however they
+        # are specified, and also leaves them in `kwds`
+        data, fc, floc, fscale = _check_fit_input_parameters(self, data,
+                                                             args, kwds)
+        method = kwds.get("method", "mle").lower()  # "mle" when not given
+
+        # See https://en.wikipedia.org/wiki/Weibull_distribution#Moments for
+        # moment formulas.
+        def skew(c):
+            gamma1 = sc.gamma(1+1/c)
+            gamma2 = sc.gamma(1+2/c)
+            gamma3 = sc.gamma(1+3/c)
+            num = 2 * gamma1**3 - 3*gamma1*gamma2 + gamma3
+            den = (gamma2 - gamma1**2)**(3/2)
+            return num/den
+
+        # For c in [1e2, 3e4], population skewness appears to approach
+        # asymptote near -1.139, but past c > 3e4, skewness begins to vary
+        # wildly, and MoM won't provide a good guess. Get out early.
+        s = stats.skew(data)  # sample skewness
+        max_c = 1e4  # also the upper end of the root bracket below
+        s_min = skew(max_c)  # population skewness at c = max_c
+        if s < s_min and method != "mm" and fc is None and not args:
+            return super().fit(data, *args, **kwds)
+
+        # If method is method of moments, we don't need the user's guesses.
+        # Otherwise, extract the guesses from args and kwds.
+        if method == "mm":
+            c, loc, scale = None, None, None
+        else:
+            c = args[0] if len(args) else None  # positional guess for c
+            loc = kwds.pop('loc', None)
+            scale = kwds.pop('scale', None)
+
+        if fc is None and c is None:  # not fixed and no guess: use MoM
+            # Solve for c that matches sample distribution skewness to sample
+            # skewness.
+            # we start having numerical issues with `weibull_min` with
+            # parameters outside this range - and not just in this method.
+            # We could probably improve the situation by doing everything
+            # in the log space, but that is for another time.
+            c = root_scalar(lambda c: skew(c) - s, bracket=[0.02, max_c],
+                            method='bisect').root
+        elif fc is not None:  # fixed: use it
+            c = fc
+
+        if fscale is None and scale is None:
+            v = np.var(data)
+            scale = np.sqrt(v / (sc.gamma(1+2/c) - sc.gamma(1+1/c)**2))  # match the variance
+        elif fscale is not None:
+            scale = fscale
+
+        if floc is None and loc is None:
+            m = np.mean(data)
+            loc = m - scale*sc.gamma(1 + 1/c)  # match the mean
+        elif floc is not None:
+            loc = floc
+
+        if method == 'mm':
+            return c, loc, scale
+        else:
+            # At this point, parameter "guesses" may equal the fixed parameters
+            # in kwds. No harm in passing them as guesses, too.
+            return super().fit(data, c, loc=loc, scale=scale, **kwds)
+
+
+weibull_min = weibull_min_gen(a=0.0, name='weibull_min')  # a=0.0: presumably the lower support bound
+
+
+class truncweibull_min_gen(rv_continuous):
+    r"""A doubly truncated Weibull minimum continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    weibull_min, truncexpon
+
+    Notes
+    -----
+    The probability density function for `truncweibull_min` is:
+
+    .. math::
+
+        f(x, a, b, c) = \frac{c x^{c-1} \exp(-x^c)}{\exp(-a^c) - \exp(-b^c)}
+
+    for :math:`a < x <= b`, :math:`0 \le a < b` and :math:`c > 0`.
+
+    `truncweibull_min` takes :math:`a`, :math:`b`, and :math:`c` as shape
+    parameters.
+
+    Notice that the truncation values, :math:`a` and :math:`b`, are defined in
+    standardized form:
+
+    .. math::
+
+        a = (u_l - loc)/scale
+        b = (u_r - loc)/scale
+
+    where :math:`u_l` and :math:`u_r` are the specific left and right
+    truncation values, respectively. In other words, the support of the
+    distribution becomes :math:`(a*scale + loc) < x <= (b*scale + loc)` when
+    :math:`loc` and/or :math:`scale` are provided.
+
+    %(after_notes)s
+
+    References
+    ----------
+
+    .. [1] Rinne, H. "The Weibull Distribution: A Handbook". CRC Press (2009).
+
+    %(example)s
+
+    """
+    def _argcheck(self, c, a, b):
+        return (a >= 0.) & (b > a) & (c > 0.)  # elementwise &, so array inputs work
+
+    def _shape_info(self):
+        ic = _ShapeInfo("c", False, (0, np.inf), (False, False))
+        ia = _ShapeInfo("a", False, (0, np.inf), (True, False))
+        ib = _ShapeInfo("b", False, (0, np.inf), (False, False))
+        return [ic, ia, ib]  # note the order: c first, then a, b
+
+    def _fitstart(self, data):
+        # Arbitrary, but default a=b=c=1 is not valid
+        return super()._fitstart(data, args=(1, 0, 1))  # i.e. c=1, a=0, b=1
+
+    def _get_support(self, c, a, b):
+        return a, b  # the support endpoints are the shape parameters themselves
+
+    def _pdf(self, x, c, a, b):
+        denum = (np.exp(-pow(a, c)) - np.exp(-pow(b, c)))  # normalizer: exp(-a**c) - exp(-b**c)
+        return (c * pow(x, c-1) * np.exp(-pow(x, c))) / denum
+
+    def _logpdf(self, x, c, a, b):
+        logdenum = np.log(np.exp(-pow(a, c)) - np.exp(-pow(b, c)))  # log of the normalizer
+        return np.log(c) + sc.xlogy(c - 1, x) - pow(x, c) - logdenum
+
+    def _cdf(self, x, c, a, b):
+        num = (np.exp(-pow(a, c)) - np.exp(-pow(x, c)))
+        denum = (np.exp(-pow(a, c)) - np.exp(-pow(b, c)))
+        return num / denum
+
+    def _logcdf(self, x, c, a, b):
+        lognum = np.log(np.exp(-pow(a, c)) - np.exp(-pow(x, c)))
+        logdenum = np.log(np.exp(-pow(a, c)) - np.exp(-pow(b, c)))
+        return lognum - logdenum
+
+    def _sf(self, x, c, a, b):
+        num = (np.exp(-pow(x, c)) - np.exp(-pow(b, c)))
+        denum = (np.exp(-pow(a, c)) - np.exp(-pow(b, c)))
+        return num / denum
+
+    def _logsf(self, x, c, a, b):
+        lognum = np.log(np.exp(-pow(x, c)) - np.exp(-pow(b, c)))
+        logdenum = np.log(np.exp(-pow(a, c)) - np.exp(-pow(b, c)))
+        return lognum - logdenum
+
+    def _isf(self, q, c, a, b):  # same form as _ppf with a and b swapped
+        return pow(
+            -np.log((1 - q) * np.exp(-pow(b, c)) + q * np.exp(-pow(a, c))), 1/c
+            )
+
+    def _ppf(self, q, c, a, b):  # inverts _cdf
+        return pow(
+            -np.log((1 - q) * np.exp(-pow(a, c)) + q * np.exp(-pow(b, c))), 1/c
+            )
+
+    def _munp(self, n, c, a, b):
+        gamma_fun = sc.gamma(n/c + 1.) * (
+            sc.gammainc(n/c + 1., pow(b, c)) - sc.gammainc(n/c + 1., pow(a, c))
+            )
+        denum = (np.exp(-pow(a, c)) - np.exp(-pow(b, c)))  # same normalizer as in _pdf
+        return gamma_fun / denum
+
+
+truncweibull_min = truncweibull_min_gen(name='truncweibull_min')
+truncweibull_min._support = ('a', 'b')  # presumably names the shape params bounding the support - confirm
+
+
+class weibull_max_gen(rv_continuous):
+    r"""Weibull maximum continuous random variable.
+
+    The Weibull Maximum Extreme Value distribution, from extreme value theory
+    (Fisher-Gnedenko theorem), is the limiting distribution of rescaled
+    maximum of iid random variables. This is the distribution of -X
+    if X is from the `weibull_min` function.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    weibull_min
+
+    Notes
+    -----
+    The probability density function for `weibull_max` is:
+
+    .. math::
+
+        f(x, c) = c (-x)^{c-1} \exp(-(-x)^c)
+
+    for :math:`x < 0`, :math:`c > 0`.
+
+    `weibull_max` takes ``c`` as a shape parameter for :math:`c`.
+
+    %(after_notes)s
+
+    References
+    ----------
+    https://en.wikipedia.org/wiki/Weibull_distribution
+
+    https://en.wikipedia.org/wiki/Fisher-Tippett-Gnedenko_theorem
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("c", False, (0, np.inf), (False, False))]
+
+    def _pdf(self, x, c):
+        # weibull_max.pdf(x, c) = c * (-x)**(c-1) * exp(-(-x)**c)
+        return c*pow(-x, c-1)*np.exp(-pow(-x, c))
+
+    def _logpdf(self, x, c):
+        return np.log(c) + sc.xlogy(c-1, -x) - pow(-x, c)
+
+    def _cdf(self, x, c):
+        return np.exp(-pow(-x, c))
+
+    def _logcdf(self, x, c):
+        return -pow(-x, c)  # log of the _cdf expression
+
+    def _sf(self, x, c):
+        return -sc.expm1(-pow(-x, c))  # 1 - exp(-(-x)**c), written with expm1
+
+    def _ppf(self, q, c):
+        return -pow(-np.log(q), 1.0/c)  # inverts _cdf
+
+    def _munp(self, n, c):
+        val = sc.gamma(1.0+n*1.0/c)  # gamma(1 + n/c)
+        if int(n) % 2:  # odd moment order: negative sign
+            sgn = -1
+        else:
+            sgn = 1
+        return sgn * val
+
+    def _entropy(self, c):
+        return -_EULER / c - np.log(c) + _EULER + 1
+
+
+weibull_max = weibull_max_gen(b=0.0, name='weibull_max')  # b=0.0: presumably the upper support bound
+
+
+class genlogistic_gen(rv_continuous):
+    r"""A generalized logistic continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `genlogistic` is:
+
+    .. math::
+
+        f(x, c) = c \frac{\exp(-x)}
+                         {(1 + \exp(-x))^{c+1}}
+
+    for real :math:`x` and :math:`c > 0`. In literature, different
+    generalizations of the logistic distribution can be found. This is the type 1
+    generalized logistic distribution according to [1]_. It is also referred to
+    as the skew-logistic distribution [2]_.
+
+    `genlogistic` takes ``c`` as a shape parameter for :math:`c`.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] Johnson et al. "Continuous Univariate Distributions", Volume 2,
+           Wiley. 1995.
+    .. [2] "Generalized Logistic Distribution", Wikipedia,
+           https://en.wikipedia.org/wiki/Generalized_logistic_distribution
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("c", False, (0, np.inf), (False, False))]
+
+    def _pdf(self, x, c):
+        # genlogistic.pdf(x, c) = c * exp(-x) / (1 + exp(-x))**(c+1)
+        return np.exp(self._logpdf(x, c))
+
+    def _logpdf(self, x, c):
+        # Two mathematically equivalent expressions for log(pdf(x, c)):
+        #     log(pdf(x, c)) = log(c) - x - (c + 1)*log(1 + exp(-x))
+        #                    = log(c) + c*x - (c + 1)*log(1 + exp(x))
+        mult = -(c - 1) * (x < 0) - 1  # -c for x < 0, -1 otherwise
+        absx = np.abs(x)  # exp(-absx) never exceeds 1
+        return np.log(c) + mult*absx - (c+1) * sc.log1p(np.exp(-absx))
+
+    def _cdf(self, x, c):
+        Cx = (1+np.exp(-x))**(-c)
+        return Cx
+
+    def _logcdf(self, x, c):
+        return -c * np.log1p(np.exp(-x))  # log of the _cdf expression
+
+    def _ppf(self, q, c):
+        return -np.log(sc.powm1(q, -1.0/c))  # powm1(q, -1/c) is q**(-1/c) - 1
+
+    def _sf(self, x, c):
+        return -sc.expm1(self._logcdf(x, c))  # 1 - exp(logcdf)
+
+    def _isf(self, q, c):
+        return self._ppf(1 - q, c)
+
+    def _stats(self, c):
+        mu = _EULER + sc.psi(c)
+        mu2 = np.pi*np.pi/6.0 + sc.zeta(2, c)
+        g1 = -2*sc.zeta(3, c) + 2*_ZETA3
+        g1 /= np.power(mu2, 1.5)
+        g2 = np.pi**4/15.0 + 6*sc.zeta(4, c)
+        g2 /= mu2**2.0
+        return mu, mu2, g1, g2
+
+    def _entropy(self, c):
+        return xpx.apply_where(
+            c < 8e6, c,  # first lambda where c < 8e6, second lambda elsewhere
+            lambda c: -np.log(c) + sc.psi(c + 1) + _EULER + 1,
+            # asymptotic expansion: psi(c) ~ log(c) - 1 / (2 * c)
+            # a = -log(c) + psi(c + 1)
+            #   = -log(c) + psi(c) + 1 / c
+            #   ~ -log(c) + log(c) - 1 / (2 * c) + 1 / c
+            #   = 1 / (2 * c)
+            lambda c: 1 / (2 * c) + _EULER + 1)
+
+
+genlogistic = genlogistic_gen(name='genlogistic')
+
+
+class genpareto_gen(rv_continuous):
+    r"""A generalized Pareto continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `genpareto` is:
+
+    .. math::
+
+        f(x, c) = (1 + c x)^{-1 - 1/c}
+
+    defined for :math:`x \ge 0` if :math:`c \ge 0`, and for
+    :math:`0 \le x \le -1/c` if :math:`c < 0`.
+
+    `genpareto` takes ``c`` as a shape parameter for :math:`c`.
+
+    For :math:`c=0`, `genpareto` reduces to the exponential
+    distribution, `expon`:
+
+    .. math::
+
+        f(x, 0) = \exp(-x)
+
+    For :math:`c=-1`, `genpareto` is uniform on ``[0, 1]``:
+
+    .. math::
+
+        f(x, -1) = 1
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _argcheck(self, c):
+        return np.isfinite(c)  # any finite c is accepted, including 0 and c < 0
+
+    def _shape_info(self):
+        return [_ShapeInfo("c", False, (-np.inf, np.inf), (False, False))]
+
+    def _get_support(self, c):
+        c = np.asarray(c)
+        a = np.broadcast_arrays(self.a, c)[0].copy()  # lower end: self.a, shaped like c
+        b = xpx.apply_where(c < 0, c, lambda c: -1. / c,  # upper end: -1/c if c < 0
+                            fill_value=np.inf)  # ... and +inf otherwise
+        return a, b
+
+    def _pdf(self, x, c):
+        # genpareto.pdf(x, c) = (1 + c * x)**(-1 - 1/c)
+        return np.exp(self._logpdf(x, c))
+
+    def _logpdf(self, x, c):
+        return xpx.apply_where((x == x) & (c != 0), (x, c),  # x == x is False for NaN
+                               lambda x, c: -sc.xlog1py(c + 1., c*x) / c,  # (-1 - 1/c)*log1p(c*x)
+                               fill_value=-x)  # c == 0: log(exp(-x)); NaN x stays NaN
+
+    def _cdf(self, x, c):
+        return -sc.inv_boxcox1p(-x, -c)  # 1 - (1 + c*x)**(-1/c); 1 - exp(-x) at c == 0
+
+    def _sf(self, x, c):
+        return sc.inv_boxcox(-x, -c)  # (1 + c*x)**(-1/c); exp(-x) at c == 0
+
+    def _logsf(self, x, c):
+        return xpx.apply_where((x == x) & (c != 0), (x, c),  # x == x is False for NaN
+                               lambda x, c: -sc.log1p(c*x) / c,  # log((1 + c*x)**(-1/c))
+                               fill_value=-x)  # c == 0: log(exp(-x)); NaN x stays NaN
+
+    def _ppf(self, q, c):
+        return -sc.boxcox1p(-q, -c)  # ((1 - q)**(-c) - 1)/c; -log1p(-q) at c == 0
+
+    def _isf(self, q, c):
+        return -sc.boxcox(q, -c)  # (q**(-c) - 1)/c; -log(q) at c == 0
+
+    def _stats(self, c, moments='mv'):
+        m, v, s, k = None, None, None, None  # moments not requested stay None
+
+        if 'm' in moments:
+            m = xpx.apply_where(c < 1, c,  # mean: 1/(1 - c) for c < 1, else inf
+                                lambda xi: 1 / (1 - xi),
+                                fill_value=np.inf)
+
+        if 'v' in moments:
+            v = xpx.apply_where(c < 1/2, c,  # variance: nan for c >= 1/2
+                                lambda xi: 1 / (1 - xi)**2 / (1 - 2 * xi),
+                                fill_value=np.nan)
+
+        if 's' in moments:
+            s = xpx.apply_where(
+                c < 1/3, c,  # skewness: nan for c >= 1/3
+                lambda xi: 2 * (1 + xi) * np.sqrt(1 - 2*xi) / (1 - 3*xi),
+                fill_value=np.nan)
+
+        if 'k' in moments:
+            k = xpx.apply_where(
+                c < 1/4, c,  # kurtosis: nan for c >= 1/4
+                lambda xi: 3 * (1 - 2*xi) * (2*xi**2 + xi + 3)
+                           / (1 - 3*xi) / (1 - 4*xi) - 3,
+                fill_value=np.nan)
+
+        return m, v, s, k
+
+    def _munp(self, n, c):
+        def __munp(c):
+            val = 0.0
+            k = np.arange(0, n + 1)
+            for ki, cnk in zip(k, sc.comb(n, k)):  # alternating binomial sum over k = 0..n
+                val = val + cnk * (-1) ** ki / (1.0 - c * ki)
+            return np.where(c * n < 1, val * (-1.0 / c) ** n, np.inf)  # inf once c*n >= 1
+
+        return xpx.apply_where(c != 0, c, __munp, fill_value=sc.gamma(n + 1))  # c == 0: gamma(n + 1)
+
+    def _entropy(self, c):
+        return 1. + c
+
+
+genpareto = genpareto_gen(a=0.0, name='genpareto')  # module-level instance, self.a = 0.0
+
+
+class genexpon_gen(rv_continuous):
+    r"""A generalized exponential continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `genexpon` is:
+
+    .. math::
+
+        f(x, a, b, c) = (a + b (1 - \exp(-c x)))
+                        \exp(-a x - b x + \frac{b}{c}  (1-\exp(-c x)))
+
+    for :math:`x \ge 0`, :math:`a, b, c > 0`.
+
+    `genexpon` takes :math:`a`, :math:`b` and :math:`c` as shape parameters.
+
+    %(after_notes)s
+
+    References
+    ----------
+    H.K. Ryu, "An Extension of Marshall and Olkin's Bivariate Exponential
+    Distribution", Journal of the American Statistical Association, 1993.
+
+    N. Balakrishnan, Asit P. Basu (editors), *The Exponential Distribution:
+    Theory, Methods and Applications*, Gordon and Breach, 1995.
+    ISBN 10: 2884491929
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        ia = _ShapeInfo("a", False, (0, np.inf), (False, False))
+        ib = _ShapeInfo("b", False, (0, np.inf), (False, False))
+        ic = _ShapeInfo("c", False, (0, np.inf), (False, False))
+        return [ia, ib, ic]
+
+    def _pdf(self, x, a, b, c):
+        # genexpon.pdf(x, a, b, c) = (a + b * (1 - exp(-c*x))) * \
+        #                            exp(-a*x - b*x + b/c * (1-exp(-c*x)))
+        return (a + b*(-sc.expm1(-c*x)))*np.exp((-a-b)*x +  # -expm1(-c*x) = 1 - exp(-c*x)
+                                                b*(-sc.expm1(-c*x))/c)
+
+    def _logpdf(self, x, a, b, c):
+        return np.log(a+b*(-sc.expm1(-c*x))) + (-a-b)*x+b*(-sc.expm1(-c*x))/c  # log of _pdf
+
+    def _cdf(self, x, a, b, c):
+        return -sc.expm1((-a-b)*x + b*(-sc.expm1(-c*x))/c)  # 1 - exp(E), E as in _sf
+
+    def _ppf(self, p, a, b, c):
+        s = a + b
+        t = (b - c*np.log1p(-p))/s
+        return (t + sc.lambertw(-b/s * np.exp(-t)).real)/c  # real part of Lambert W
+
+    def _sf(self, x, a, b, c):
+        return np.exp((-a-b)*x + b*(-sc.expm1(-c*x))/c)  # exp(E), the exponent shared with _cdf
+
+    def _isf(self, p, a, b, c):
+        s = a + b
+        t = (b - c*np.log(p))/s  # as in _ppf, with log(p) in place of log1p(-p)
+        return (t + sc.lambertw(-b/s * np.exp(-t)).real)/c
+
+
+genexpon = genexpon_gen(a=0.0, name='genexpon')  # module-level instance
+
+
+class genextreme_gen(rv_continuous):
+    r"""A generalized extreme value continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    gumbel_r
+
+    Notes
+    -----
+    For :math:`c=0`, `genextreme` is equal to `gumbel_r` with
+    probability density function
+
+    .. math::
+
+        f(x) = \exp(-\exp(-x)) \exp(-x),
+
+    where :math:`-\infty < x < \infty`.
+
+    For :math:`c \ne 0`, the probability density function for `genextreme` is:
+
+    .. math::
+
+        f(x, c) = \exp(-(1-c x)^{1/c}) (1-c x)^{1/c-1},
+
+    where :math:`-\infty < x \le 1/c` if :math:`c > 0` and
+    :math:`1/c \le x < \infty` if :math:`c < 0`.
+
+    Note that several sources and software packages use the opposite
+    convention for the sign of the shape parameter :math:`c`.
+
+    `genextreme` takes ``c`` as a shape parameter for :math:`c`.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _argcheck(self, c):
+        return np.isfinite(c)  # any finite c is accepted, including 0 and c < 0
+
+    def _shape_info(self):
+        return [_ShapeInfo("c", False, (-np.inf, np.inf), (False, False))]
+
+    def _get_support(self, c):
+        _b = np.where(c > 0, 1.0 / np.maximum(c, _XMIN), np.inf)  # upper end 1/c if c > 0
+        _a = np.where(c < 0, 1.0 / np.minimum(c, -_XMIN), -np.inf)  # lower end 1/c if c < 0
+        return _a, _b  # the _XMIN clamps presumably avoid 1/0 in the unselected branch
+
+    def _loglogcdf(self, x, c):
+        # Returns log(-log(cdf(x, c)))
+        return xpx.apply_where(
+            (x == x) & (c != 0), (x, c),  # x == x is False for NaN
+            lambda x, c: sc.log1p(-c*x)/c,
+            fill_value=-x)  # c == 0 (and NaN x) fall back to -x
+
+    def _pdf(self, x, c):
+        # genextreme.pdf(x, c) =
+        #     exp(-exp(-x))*exp(-x),                    for c==0
+        #     exp(-(1-c*x)**(1/c))*(1-c*x)**(1/c-1),    for x \le 1/c, c > 0
+        return np.exp(self._logpdf(x, c))
+
+    def _logpdf(self, x, c):
+        # Suppress warnings 0 * inf
+        cx = xpx.apply_where((x == x) & (c != 0), (c, x),
+                             operator.mul, fill_value=0.0)
+        logex2 = sc.log1p(-cx)  # log(1 - c*x)
+        logpex2 = self._loglogcdf(x, c)
+        pex2 = np.exp(logpex2)  # -log(cdf), since logpex2 = log(-log(cdf))
+        # Handle special cases
+        np.putmask(logpex2, (c == 0) & (x == -np.inf), 0.0)
+        logpdf = xpx.apply_where(
+            ~((cx == 1) | (cx == -np.inf)),
+            (pex2, logpex2, logex2),
+            lambda pex2, lpex2, lex2: -pex2 + lpex2 - lex2,
+            fill_value=-np.inf)  # -inf where c*x is exactly 1 or -inf
+        np.putmask(logpdf, (c == 1) & (x == 1), 0.0)
+        return logpdf
+
+    def _logcdf(self, x, c):
+        return -np.exp(self._loglogcdf(x, c))  # log(cdf) = -exp(log(-log(cdf)))
+
+    def _cdf(self, x, c):
+        return np.exp(self._logcdf(x, c))
+
+    def _sf(self, x, c):
+        return -sc.expm1(self._logcdf(x, c))  # 1 - cdf, computed from the log-cdf
+
+    def _ppf(self, q, c):
+        x = -np.log(-np.log(q))  # quantile for c == 0
+        return xpx.apply_where(
+            (x == x) & (c != 0), (x, c),
+            lambda x, c: -sc.expm1(-c * x) / c,  # (1 - exp(-c*x))/c
+            fill_value=x)  # x is returned unchanged where c == 0 or x is NaN
+
+    def _isf(self, q, c):
+        x = -np.log(-sc.log1p(-q))  # as in _ppf, with log1p(-q) in place of log(q)
+        return xpx.apply_where(
+            (x == x) & (c != 0), (x, c),
+            lambda x, c: -sc.expm1(-c * x) / c,  # (1 - exp(-c*x))/c
+            fill_value=x)  # x is returned unchanged where c == 0 or x is NaN
+
+    def _stats(self, c):
+        def g(n):
+            return sc.gamma(n * c + 1)  # g(n) = gamma(n*c + 1)
+        g1 = g(1)
+        g2 = g(2)
+        g3 = g(3)
+        g4 = g(4)
+        g2mg12 = np.where(abs(c) < 1e-7, (c*np.pi)**2.0/6.0, g2-g1**2.0)  # g2 - g1**2, small-|c| form
+        def gam2k_f(c):
+            return sc.expm1(sc.gammaln(2.0*c+1.0)-2*sc.gammaln(c + 1.0))/c**2.0
+        gam2k = xpx.apply_where(abs(c) >= 1e-7, c, gam2k_f, fill_value=np.pi**2.0/6.0)
+        eps = 1e-14
+        def gamk_f(c):
+            return sc.expm1(sc.gammaln(c + 1))/c
+        gamk = xpx.apply_where(abs(c) >= eps, c, gamk_f, fill_value=-_EULER)  # |c| < eps: -_EULER
+
+        # mean
+        m = np.where(c < -1.0, np.nan, -gamk)  # nan for c < -1
+
+        # variance
+        v = np.where(c < -0.5, np.nan, g1**2.0*gam2k)  # nan for c < -1/2
+
+        # skewness
+        def sk1_eval(c, *args):
+            def sk1_eval_f(c, g1, g2, g3, g2mg12):
+                return np.sign(c)*(-g3 + (g2 + 2*g2mg12)*g1)/g2mg12**1.5
+            return xpx.apply_where(c >= -1./3, (c, *args),
+                                   sk1_eval_f, fill_value=np.nan)  # nan for c < -1/3
+
+        sk_fill = 12*np.sqrt(6)*_ZETA3/np.pi**3  # constant used where |c| <= eps**0.29
+        args = (g1, g2, g3, g2mg12)
+        sk = xpx.apply_where(abs(c) > eps**0.29, (c, *args),
+                             sk1_eval, fill_value=sk_fill)
+
+        # kurtosis
+        def ku1_eval(c, *args):
+            def ku1_eval_f(g1, g2, g3, g4, g2mg12):
+                return (g4 + (-4*g3 + 3*(g2 + g2mg12)*g1)*g1)/g2mg12**2 - 3
+            return xpx.apply_where(c >= -1./4, args, ku1_eval_f, fill_value=np.nan)  # c only masks
+
+        args = (g1, g2, g3, g4, g2mg12)
+        ku = xpx.apply_where(abs(c) > eps**0.23, (c, *args),
+                             ku1_eval, fill_value=12.0/5.0)  # 12/5 where |c| <= eps**0.23
+
+        return m, v, sk, ku
+
+    def _fitstart(self, data):
+        if isinstance(data, CensoredData):
+            data = data._uncensor()
+        # This is better than the default shape of (1,).
+        g = _skew(data)
+        if g < 0:
+            a = 0.5  # negative sample skewness: start from c = 0.5
+        else:
+            a = -0.5  # otherwise start from c = -0.5
+        return super()._fitstart(data, args=(a,))
+
+    def _munp(self, n, c):
+        k = np.arange(0, n+1)
+        vals = 1.0/c**n * np.sum(
+            sc.comb(n, k) * (-1)**k * sc.gamma(c*k + 1),
+            axis=0)
+        return np.where(c*n > -1, vals, np.inf)  # inf once c*n <= -1
+
+    def _entropy(self, c):
+        return _EULER*(1 - c) + 1
+
+
+genextreme = genextreme_gen(name='genextreme')  # module-level instance
+
+
+def _digammainv(y):
+    """Inverse of the digamma function (real positive arguments only).
+
+    This function is used in the `fit` method of `gamma_gen`.
+    The function uses either optimize.fsolve or optimize.newton
+    to solve `sc.digamma(x) - y = 0`.  There is probably room for
+    improvement, but currently it works over a wide range of y:
+
+    >>> import numpy as np
+    >>> rng = np.random.default_rng()
+    >>> y = 64*rng.standard_normal(1000000)
+    >>> y.min(), y.max()
+    (-311.43592651416662, 351.77388222276869)
+    >>> x = [_digammainv(t) for t in y]
+    >>> np.abs(sc.digamma(x) - y).max()
+    1.1368683772161603e-13
+
+    """
+    _em = 0.5772156649015328606065120  # Euler-Mascheroni constant
+
+    def func(x):
+        return sc.digamma(x) - y  # root of this function is the inverse at y
+
+    if y > -0.125:
+        x0 = np.exp(y) + 0.5
+        if y < 10:
+            # Some experimentation shows that newton reliably converges
+            # much faster than fsolve in this y range.  For larger y,
+            # newton sometimes fails to converge.
+            value = optimize.newton(func, x0, tol=1e-10)
+            return value
+    elif y > -3:
+        x0 = np.exp(y/2.332) + 0.08661
+    else:
+        x0 = 1.0 / (-y - _em)
+
+    value, info, ier, mesg = optimize.fsolve(func, x0, xtol=1e-11,
+                                             full_output=True)
+    if ier != 1:  # fsolve reports success with ier == 1
+        raise RuntimeError(f"_digammainv: fsolve failed, y = {y!r}")
+
+    return value[0]  # fsolve returns an array; take the single root
+
+
+## Gamma (Use MATLAB and MATHEMATICA (b=theta=scale, a=alpha=shape) definition)
+
+## gamma(a, loc, scale)  with a an integer is the Erlang distribution
+## gamma(1, loc, scale)  is the Exponential distribution
+## gamma(df/2, 0, 2) is the chi2 distribution with df degrees of freedom.
+
+class gamma_gen(rv_continuous):
+    r"""A gamma continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    erlang, expon
+
+    Notes
+    -----
+    The probability density function for `gamma` is:
+
+    .. math::
+
+        f(x, a) = \frac{x^{a-1} e^{-x}}{\Gamma(a)}
+
+    for :math:`x \ge 0`, :math:`a > 0`. Here :math:`\Gamma(a)` refers to the
+    gamma function.
+
+    `gamma` takes ``a`` as a shape parameter for :math:`a`.
+
+    When :math:`a` is an integer, `gamma` reduces to the Erlang
+    distribution, and when :math:`a=1` to the exponential distribution.
+
+    Gamma distributions are sometimes parameterized with two variables,
+    with a probability density function of:
+
+    .. math::
+
+        f(x, \alpha, \beta) =
+        \frac{\beta^\alpha x^{\alpha - 1} e^{-\beta x }}{\Gamma(\alpha)}
+
+    Note that this parameterization is equivalent to the above, with
+    ``scale = 1 / beta``.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("a", False, (0, np.inf), (False, False))]
+
+    def _rvs(self, a, size=None, random_state=None):
+        return random_state.standard_gamma(a, size)  # sampling delegated to random_state
+
+    def _pdf(self, x, a):
+        # gamma.pdf(x, a) = x**(a-1) * exp(-x) / gamma(a)
+        return np.exp(self._logpdf(x, a))
+
+    def _logpdf(self, x, a):
+        return sc.xlogy(a-1.0, x) - x - sc.gammaln(a)  # (a-1)*log(x) - x - log(gamma(a))
+
+    def _cdf(self, x, a):
+        return sc.gammainc(a, x)  # regularized lower incomplete gamma function
+
+    def _sf(self, x, a):
+        return sc.gammaincc(a, x)  # regularized upper incomplete gamma function
+
+    def _ppf(self, q, a):
+        return sc.gammaincinv(a, q)  # inverse of gammainc in its second argument
+
+    def _isf(self, q, a):
+        return sc.gammainccinv(a, q)  # inverse of gammaincc in its second argument
+
+    def _stats(self, a):
+        return a, a, 2.0/np.sqrt(a), 6.0/a  # 4-tuple, same layout as the sibling _stats
+
+    def _munp(self, n, a):
+        return sc.poch(a, n)  # poch(a, n) = gamma(a + n)/gamma(a)
+
+    def _entropy(self, a):
+
+        def regular_formula(a):
+            return sc.psi(a) * (1-a) + a + sc.gammaln(a)
+
+        def asymptotic_formula(a):
+            # plug in above formula the expansions:
+            # psi(a) ~ ln(a) - 1/2a - 1/12a^2 + 1/120a^4
+            # gammaln(a) ~ a * ln(a) - a - 1/2 * ln(a) + 1/2 ln(2 * pi) +
+            #              1/12a - 1/360a^3
+            return (0.5 * (1. + np.log(2*np.pi) + np.log(a)) - 1/(3 * a)
+                    - (a**-2.)/12 - (a**-3.)/90 + (a**-4.)/120)
+
+        return xpx.apply_where(a < 250, a, regular_formula, asymptotic_formula)  # asymptotic for a >= 250
+
+    def _fitstart(self, data):
+        # The skewness of the gamma distribution is `2 / np.sqrt(a)`.
+        # We invert that to estimate the shape `a` using the skewness
+        # of the data.  The formula is regularized with 1e-8 in the
+        # denominator to allow for degenerate data where the skewness
+        # is close to 0.
+        if isinstance(data, CensoredData):
+            data = data._uncensor()
+        sk = _skew(data)
+        a = 4 / (1e-8 + sk**2)
+        return super()._fitstart(data, args=(a,))
+
+    @extend_notes_in_docstring(rv_continuous, notes="""\
+        When the location is fixed by using the argument `floc`
+        and `method='MLE'`, this
+        function uses explicit formulas or solves a simpler numerical
+        problem than the full ML optimization problem.  So in that case,
+        the `optimizer`, `loc` and `scale` arguments are ignored.
+        \n\n""")
+    def fit(self, data, *args, **kwds):
+        floc = kwds.get('floc', None)
+        method = kwds.get('method', 'mle')
+
+        if (isinstance(data, CensoredData) or
+                floc is None and method.lower() != 'mm'):
+            # Censored data, or loc is free and the method is not 'mm':
+            # use the default fit method.
+            return super().fit(data, *args, **kwds)
+
+        # We already have this value, so just pop it from kwds.
+        kwds.pop('floc', None)
+
+        f0 = _get_fixed_fit_value(kwds, ['f0', 'fa', 'fix_a'])
+        fscale = kwds.pop('fscale', None)
+
+        _remove_optimizer_parameters(kwds)
+
+        if f0 is not None and floc is not None and fscale is not None:
+            # This check is for consistency with `rv_continuous.fit`.
+            # Without this check, this function would just return the
+            # parameters that were given.
+            raise ValueError("All parameters fixed. There is nothing to "
+                             "optimize.")
+
+        # Fixed location is handled by shifting the data.
+        data = np.asarray(data)
+
+        if not np.isfinite(data).all():
+            raise ValueError("The data contains non-finite values.")
+
+        # Use explicit formulas for mm (gh-19884)
+        if method.lower() == 'mm':
+            m1 = np.mean(data)
+            m2 = np.var(data)
+            m3 = np.mean((data - m1) ** 3)  # third central sample moment
+            a, loc, scale = f0, floc, fscale  # None marks a free parameter
+            # Three unknowns
+            if a is None and loc is None and scale is None:
+                scale = m3 / (2 * m2)
+            # Two unknowns
+            if loc is None and scale is None:
+                scale = np.sqrt(m2 / a)
+            if a is None and scale is None:
+                scale = m2 / (m1 - loc)
+            if a is None and loc is None:
+                a = m2 / (scale ** 2)
+            # One unknown
+            if a is None:
+                a = (m1 - loc) / scale
+            if loc is None:
+                loc = m1 - a * scale
+            if scale is None:
+                scale = (m1 - loc) / a
+            return a, loc, scale
+
+        # Special case: loc is fixed.
+
+        # NB: data == loc is ok if a >= 1; the below check is more strict.
+        if np.any(data <= floc):
+            raise FitDataError("gamma", lower=floc, upper=np.inf)
+
+        if floc != 0:
+            # Don't do the subtraction in-place, because `data` might be a
+            # view of the input array.
+            data = data - floc
+        xbar = data.mean()
+
+        # Three cases to handle:
+        # * shape and scale both free
+        # * shape fixed, scale free
+        # * shape free, scale fixed
+
+        if fscale is None:
+            # scale is free
+            if f0 is not None:
+                # shape is fixed
+                a = f0
+            else:
+                # shape and scale are both free.
+                # The MLE for the shape parameter `a` is the solution to:
+                # np.log(a) - sc.digamma(a) - np.log(xbar) +
+                #                             np.log(data).mean() = 0
+                s = np.log(xbar) - np.log(data).mean()
+                aest = (3-s + np.sqrt((s-3)**2 + 24*s)) / (12*s)  # closed-form initial estimate
+                xa = aest*(1-0.4)  # brentq bracket: [0.6*aest, 1.4*aest]
+                xb = aest*(1+0.4)
+                a = optimize.brentq(lambda a: np.log(a) - sc.digamma(a) - s,
+                                    xa, xb, disp=0)
+
+            # The MLE for the scale parameter is just the data mean
+            # divided by the shape parameter.
+            scale = xbar / a
+        else:
+            # scale is fixed, shape is free
+            # The MLE for the shape parameter `a` is the solution to:
+            # sc.digamma(a) - np.log(data).mean() + np.log(fscale) = 0
+            c = np.log(data).mean() - np.log(fscale)
+            a = _digammainv(c)
+            scale = fscale
+
+        return a, floc, scale
+
+
+gamma = gamma_gen(a=0.0, name='gamma')  # module-level instance
+
+
+class erlang_gen(gamma_gen):
+    """An Erlang continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    gamma
+
+    Notes
+    -----
+    The Erlang distribution is a special case of the Gamma distribution, with
+    the shape parameter `a` an integer.  Note that this restriction is not
+    enforced by `erlang`. It will, however, generate a warning the first time
+    a non-integer value is used for the shape parameter.
+
+    Refer to `gamma` for examples.
+
+    """
+
+    def _argcheck(self, a):
+        allint = np.all(np.floor(a) == a)
+        if not allint:
+            # An Erlang distribution shouldn't really have a non-integer
+            # shape parameter, so warn the user.
+            message = ('The shape parameter of the erlang distribution '
+                       f'has been given a non-integer value {a!r}.')
+            warnings.warn(message, RuntimeWarning, stacklevel=3)
+        return a > 0  # non-integer a only warns; validity is still just a > 0
+
+    def _shape_info(self):
+        return [_ShapeInfo("a", True, (1, np.inf), (True, False))]
+
+    def _fitstart(self, data):
+        # Override gamma_gen._fitstart so that an integer initial value is
+        # used.  (Also regularize the division, to avoid issues when
+        # _skew(data) is 0 or close to 0.)
+        if isinstance(data, CensoredData):
+            data = data._uncensor()
+        a = int(4.0 / (1e-8 + _skew(data)**2))  # NOTE(review): 0 when |skew| > 2 - confirm
+        return super(gamma_gen, self)._fitstart(data, args=(a,))  # skips gamma_gen._fitstart
+
+    # Trivial override of the fit method, so we can monkey-patch its
+    # docstring.
+    @extend_notes_in_docstring(rv_continuous, notes="""\
+        The Erlang distribution is generally defined to have integer values
+        for the shape parameter.  This is not enforced by the `erlang` class.
+        When fitting the distribution, it will generally return a non-integer
+        value for the shape parameter.  By using the keyword argument
+        `f0=<integer>`, the fit method can be constrained to fit the data to
+        a specific integer shape parameter.""")
+    def fit(self, data, *args, **kwds):
+        return super().fit(data, *args, **kwds)
+
+
+erlang = erlang_gen(a=0.0, name='erlang')  # module-level instance
+
+
+class gengamma_gen(rv_continuous):
+    r"""A generalized gamma continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    gamma, invgamma, weibull_min
+
+    Notes
+    -----
+    The probability density function for `gengamma` is ([1]_):
+
+    .. math::
+
+        f(x, a, c) = \frac{|c| x^{c a-1} \exp(-x^c)}{\Gamma(a)}
+
+    for :math:`x \ge 0`, :math:`a > 0`, and :math:`c \ne 0`.
+    :math:`\Gamma` is the gamma function (`scipy.special.gamma`).
+
+    `gengamma` takes :math:`a` and :math:`c` as shape parameters.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] E.W. Stacy, "A Generalization of the Gamma Distribution",
+       Annals of Mathematical Statistics, Vol 33(3), pp. 1187--1192.
+
+    %(example)s
+
+    """
+    def _argcheck(self, a, c):
+        return (a > 0) & (c != 0)  # negative c is allowed; only c == 0 is rejected
+
+    def _shape_info(self):
+        ia = _ShapeInfo("a", False, (0, np.inf), (False, False))
+        ic = _ShapeInfo("c", False, (-np.inf, np.inf), (False, False))
+        return [ia, ic]
+
+    def _pdf(self, x, a, c):
+        return np.exp(self._logpdf(x, a, c))
+
+    def _logpdf(self, x, a, c):
+        return xpx.apply_where(
+            (x != 0) | (c > 0), (x, c, a),  # formula is skipped only where x == 0 and c <= 0
+            lambda x, c, a: (np.log(abs(c)) + sc.xlogy(c*a - 1, x)
+                             - x**c - sc.gammaln(a)),
+            fill_value=-np.inf)  # ... where the log-density is set to -inf
+
+    def _cdf(self, x, a, c):
+        xc = x**c
+        val1 = sc.gammainc(a, xc)
+        val2 = sc.gammaincc(a, xc)
+        return np.where(c > 0, val1, val2)  # complementary function when c is not > 0
+
+    def _rvs(self, a, c, size=None, random_state=None):
+        r = random_state.standard_gamma(a, size=size)
+        return r**(1./c)  # standard gamma variate raised to the power 1/c
+
+    def _sf(self, x, a, c):
+        xc = x**c
+        val1 = sc.gammainc(a, xc)
+        val2 = sc.gammaincc(a, xc)
+        return np.where(c > 0, val2, val1)  # mirror image of the selection in _cdf
+
+    def _ppf(self, q, a, c):
+        val1 = sc.gammaincinv(a, q)
+        val2 = sc.gammainccinv(a, q)
+        return np.where(c > 0, val1, val2)**(1.0/c)  # invert on x**c, then undo the power
+
+    def _isf(self, q, a, c):
+        val1 = sc.gammaincinv(a, q)
+        val2 = sc.gammainccinv(a, q)
+        return np.where(c > 0, val2, val1)**(1.0/c)  # mirror image of the selection in _ppf
+
+    def _munp(self, n, a, c):
+        # Pochhammer symbol: sc.poch(a, n) = gamma(a+n)/gamma(a)
+        return sc.poch(a, n*1.0/c)
+
+    def _entropy(self, a, c):
+        def regular(a, c):
+            val = sc.psi(a)
+            A = a * (1 - val) + val / c
+            B = sc.gammaln(a) - np.log(abs(c))
+            h = A + B
+            return h
+
+        def asymptotic(a, c):
+            # using asymptotic expansions for gammaln and psi (see gh-18093)
+            return (norm._entropy() - np.log(a)/2
+                    - np.log(np.abs(c)) + (a**-1.)/6 - (a**-3.)/90
+                    + (np.log(a) - (a**-1.)/2 - (a**-2.)/12 + (a**-4.)/120)/c)
+
+        return xpx.apply_where(a >= 200, (a, c), asymptotic, regular)  # asymptotic for a >= 200
+
+
+gengamma = gengamma_gen(a=0.0, name='gengamma')  # module-level instance
+
+
+class genhalflogistic_gen(rv_continuous):
+    r"""A generalized half-logistic continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `genhalflogistic` is:
+
+    .. math::
+
+        f(x, c) = \frac{2 (1 - c x)^{1/c - 1}}{[1 + (1 - c x)^{1/c}]^2}
+
+    for :math:`0 \le x \le 1/c`, and :math:`c > 0`.
+
+    `genhalflogistic` takes ``c`` as a shape parameter for :math:`c`.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("c", False, (0, np.inf), (False, False))]
+
+    def _get_support(self, c):
+        return self.a, 1.0/c  # support is [self.a, 1/c]
+
+    def _pdf(self, x, c):
+        # genhalflogistic.pdf(x, c) =
+        #    2 * (1-c*x)**(1/c-1) / (1+(1-c*x)**(1/c))**2
+        limit = 1.0/c
+        tmp = np.asarray(1-c*x)
+        tmp0 = tmp**(limit-1)  # (1 - c*x)**(1/c - 1)
+        tmp2 = tmp0*tmp  # (1 - c*x)**(1/c)
+        return 2*tmp0 / (1+tmp2)**2
+
+    def _cdf(self, x, c):
+        limit = 1.0/c
+        tmp = np.asarray(1-c*x)
+        tmp2 = tmp**(limit)  # (1 - c*x)**(1/c)
+        return (1.0-tmp2) / (1+tmp2)
+
+    def _ppf(self, q, c):
+        return 1.0/c*(1-((1.0-q)/(1.0+q))**c)  # solves (1 - t)/(1 + t) = q, t = (1 - c*x)**(1/c)
+
+    def _entropy(self, c):
+        return 2 - (2*c+1)*np.log(2)
+
+
+genhalflogistic = genhalflogistic_gen(a=0.0, name='genhalflogistic')  # module-level instance
+
+
+class genhyperbolic_gen(rv_continuous):
+    r"""A generalized hyperbolic continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    t, norminvgauss, geninvgauss, laplace, cauchy
+
+    Notes
+    -----
+    The probability density function for `genhyperbolic` is:
+
+    .. math::
+
+        f(x, p, a, b) =
+            \frac{(a^2 - b^2)^{p/2}}
+            {\sqrt{2\pi}a^{p-1/2}
+            K_p\Big(\sqrt{a^2 - b^2}\Big)}
+            e^{bx} \times \frac{K_{p - 1/2}
+            (a \sqrt{1 + x^2})}
+            {(\sqrt{1 + x^2})^{1/2 - p}}
+
+    for :math:`x, p \in ( - \infty; \infty)`,
+    :math:`|b| < a` if :math:`p \ge 0`,
+    :math:`|b| \le a` if :math:`p < 0`.
+    :math:`K_{p}(.)` denotes the modified Bessel function of the second
+    kind and order :math:`p` (`scipy.special.kv`)
+
+    `genhyperbolic` takes ``p`` as a tail parameter,
+    ``a`` as a shape parameter,
+    ``b`` as a skewness parameter.
+
+    %(after_notes)s
+
+    The original parameterization of the Generalized Hyperbolic Distribution
+    is found in [1]_ as follows
+
+    .. math::
+
+        f(x, \lambda, \alpha, \beta, \delta, \mu) =
+           \frac{(\gamma/\delta)^\lambda}{\sqrt{2\pi}K_\lambda(\delta \gamma)}
+           e^{\beta (x - \mu)} \times \frac{K_{\lambda - 1/2}
+           (\alpha \sqrt{\delta^2 + (x - \mu)^2})}
+           {(\sqrt{\delta^2 + (x - \mu)^2} / \alpha)^{1/2 - \lambda}}
+
+    for :math:`x \in ( - \infty; \infty)`,
+    :math:`\gamma := \sqrt{\alpha^2 - \beta^2}`,
+    :math:`\lambda, \mu \in ( - \infty; \infty)`,
+    :math:`\delta \ge 0, |\beta| < \alpha` if :math:`\lambda \ge 0`,
+    :math:`\delta > 0, |\beta| \le \alpha` if :math:`\lambda < 0`.
+
+    The location-scale-based parameterization implemented in
+    SciPy is based on [2]_, where :math:`a = \alpha\delta`,
+    :math:`b = \beta\delta`, :math:`p = \lambda`,
+    :math:`scale=\delta` and :math:`loc=\mu`
+
+    Moments are implemented based on [3]_ and [4]_.
+
+    For the distributions that are a special case such as Student's t,
+    it is not recommended to rely on the implementation of genhyperbolic.
+    To avoid potential numerical problems and for performance reasons,
+    the methods of the specific distributions should be used.
+
+    References
+    ----------
+    .. [1] O. Barndorff-Nielsen, "Hyperbolic Distributions and Distributions
+       on Hyperbolae", Scandinavian Journal of Statistics, Vol. 5(3),
+       pp. 151-157, 1978. https://www.jstor.org/stable/4615705
+
+    .. [2] Eberlein E., Prause K. (2002) The Generalized Hyperbolic Model:
+        Financial Derivatives and Risk Measures. In: Geman H., Madan D.,
+        Pliska S.R., Vorst T. (eds) Mathematical Finance - Bachelier
+        Congress 2000. Springer Finance. Springer, Berlin, Heidelberg.
+        :doi:`10.1007/978-3-662-12429-1_12`
+
+    .. [3] Scott, David J, Würtz, Diethelm, Dong, Christine and Tran,
+       Thanh Tam, (2009), Moments of the generalized hyperbolic
+       distribution, MPRA Paper, University Library of Munich, Germany,
+       https://EconPapers.repec.org/RePEc:pra:mprapa:19081.
+
+    .. [4] E. Eberlein and E. A. von Hammerstein. Generalized hyperbolic
+       and inverse Gaussian distributions: Limiting cases and approximation
+       of processes. FDM Preprint 80, April 2003. University of Freiburg.
+       https://freidok.uni-freiburg.de/fedora/objects/freidok:7974/datastreams/FILE1/content
+
+    %(example)s
+
+    """
+
+    def _argcheck(self, p, a, b):
+        return (np.logical_and(np.abs(b) < a, p >= 0)  # p >= 0 requires |b| < a
+                | np.logical_and(np.abs(b) <= a, p < 0))  # p < 0 also allows |b| == a
+
+    def _shape_info(self):
+        ip = _ShapeInfo("p", False, (-np.inf, np.inf), (False, False))
+        ia = _ShapeInfo("a", False, (0, np.inf), (True, False))
+        ib = _ShapeInfo("b", False, (-np.inf, np.inf), (False, False))
+        return [ip, ia, ib]  # same order as the shape arguments (p, a, b)
+
+    def _fitstart(self, data):
+        # Arbitrary, but the default p = a = b = 1 is not valid; the
+        # distribution requires |b| < a if p >= 0.
+        return super()._fitstart(data, args=(1, 1, 0.5))  # start at (p, a, b) = (1, 1, 0.5)
+
+    def _logpdf(self, x, p, a, b):
+        # kve instead of kv works better for large values of p
+        # and smaller values of sqrt(a^2  - b^2)
+        @np.vectorize  # broadcasts the per-element call over array arguments
+        def _logpdf_single(x, p, a, b):
+            return _stats.genhyperbolic_logpdf(x, p, a, b)  # evaluated by the _stats helper
+
+        return _logpdf_single(x, p, a, b)
+
+    def _pdf(self, x, p, a, b):
+        # kve instead of kv works better for large values of p
+        # and smaller values of sqrt(a^2  - b^2)
+        @np.vectorize  # broadcasts the per-element call over array arguments
+        def _pdf_single(x, p, a, b):
+            return _stats.genhyperbolic_pdf(x, p, a, b)  # evaluated by the _stats helper
+
+        return _pdf_single(x, p, a, b)
+
+    # np.vectorize cannot take `otypes` when applied as a bare decorator, so
+    # wrap it in a lambda: the function is vectorized and `otypes` is still set.
+    @lambda func: np.vectorize(func, otypes=[np.float64])
+    @staticmethod
+    def _integrate_pdf(x0, x1, p, a, b):
+        """
+        Integrate the pdf of the genhyperbolic distribution from x0 to x1.
+        This is a private function used by _cdf() and _sf() only; either x0
+        will be -inf or x1 will be inf.  A NaN integral is returned as NaN.
+        """
+        user_data = np.array([p, a, b], float).ctypes.data_as(ctypes.c_void_p)
+        llc = LowLevelCallable.from_cython(_stats, '_genhyperbolic_pdf',
+                                           user_data)
+        d = np.sqrt((a + b)*(a - b))
+        mean = b/d * sc.kv(p + 1, d) / sc.kv(p, d)
+        epsrel = 1e-10
+        epsabs = 0
+        if x0 < mean < x1:
+            # If the interval includes the mean, integrate over [x0, mean]
+            # and [mean, x1] separately and add. With a single quad call and
+            # the non-infinite endpoint far in the tail, quad might return an
+            # incorrect result because it does not "see" the peak of the PDF.
+            intgrl = (integrate.quad(llc, x0, mean,
+                                     epsrel=epsrel, epsabs=epsabs)[0]
+                      + integrate.quad(llc, mean, x1,
+                                       epsrel=epsrel, epsabs=epsabs)[0])
+        else:
+            intgrl = integrate.quad(llc, x0, x1,
+                                    epsrel=epsrel, epsabs=epsabs)[0]
+        if np.isnan(intgrl):
+            msg = ("Infinite values encountered in scipy.special.kve. "
+                   "Values replaced by NaN to avoid incorrect results.")
+            warnings.warn(msg, RuntimeWarning, stacklevel=3)
+            # Python's min(1.0, nan) is 1.0, so clipping would hide the NaN.
+            return intgrl
+        return max(0.0, min(1.0, intgrl))
+
+    def _cdf(self, x, p, a, b):
+        return self._integrate_pdf(-np.inf, x, p, a, b)
+
+    def _sf(self, x, p, a, b):
+        return self._integrate_pdf(x, np.inf, p, a, b)
+
+    def _rvs(self, p, a, b, size=None, random_state=None):
+        # note: Y = b * V + sqrt(V) * X  has a
+        # generalized hyperbolic distribution
+        # if X is standard normal and V is
+        # geninvgauss(p = p, b = t2, scale = t3)
+        t1 = np.float_power(a, 2) - np.float_power(b, 2)
+        # b in the GIG
+        t2 = np.float_power(t1, 0.5)
+        # scale in the GIG
+        t3 = np.float_power(t1, - 0.5)
+        gig = geninvgauss.rvs(
+            p=p,
+            b=t2,
+            scale=t3,
+            size=size,
+            random_state=random_state
+            )
+        normst = norm.rvs(size=size, random_state=random_state)
+
+        return b * gig + np.sqrt(gig) * normst
+
+    def _stats(self, p, a, b):
+        # https://mpra.ub.uni-muenchen.de/19081/1/MPRA_paper_19081.pdf
+        # https://freidok.uni-freiburg.de/fedora/objects/freidok:7974/datastreams/FILE1/content
+        # standardized moments
+        p, a, b = np.broadcast_arrays(p, a, b)
+        t1 = np.float_power(a, 2) - np.float_power(b, 2)
+        t1 = np.float_power(t1, 0.5)
+        t2 = np.float_power(1, 2) * np.float_power(t1, - 1)
+        integers = np.linspace(0, 4, 5)
+        # make integers perpendicular to existing dimensions
+        integers = integers.reshape(integers.shape + (1,) * p.ndim)
+        b0, b1, b2, b3, b4 = sc.kv(p + integers, t1)
+        r1, r2, r3, r4 = (b / b0 for b in (b1, b2, b3, b4))
+
+        m = b * t2 * r1
+        v = (
+            t2 * r1 + np.float_power(b, 2) * np.float_power(t2, 2) *
+            (r2 - np.float_power(r1, 2))
+        )
+        m3e = (
+            np.float_power(b, 3) * np.float_power(t2, 3) *
+            (r3 - 3 * b2 * b1 * np.float_power(b0, -2) +
+             2 * np.float_power(r1, 3)) +
+            3 * b * np.float_power(t2, 2) *
+            (r2 - np.float_power(r1, 2))
+        )
+        s = m3e * np.float_power(v, - 3 / 2)
+        m4e = (
+            np.float_power(b, 4) * np.float_power(t2, 4) *
+            (r4 - 4 * b3 * b1 * np.float_power(b0, - 2) +
+             6 * b2 * np.float_power(b1, 2) * np.float_power(b0, - 3) -
+             3 * np.float_power(r1, 4)) +
+            np.float_power(b, 2) * np.float_power(t2, 3) *
+            (6 * r3 - 12 * b2 * b1 * np.float_power(b0, - 2) +
+             6 * np.float_power(r1, 3)) +
+            3 * np.float_power(t2, 2) * r2
+        )
+        k = m4e * np.float_power(v, -2) - 3
+
+        return m, v, s, k
+
+
+genhyperbolic = genhyperbolic_gen(name='genhyperbolic')
+
+
+class gompertz_gen(rv_continuous):
+    r"""A Gompertz (or truncated Gumbel) continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `gompertz` is:
+
+    .. math::
+
+        f(x, c) = c \exp(x) \exp(-c (e^x-1))
+
+    for :math:`x \ge 0`, :math:`c > 0`.
+
+    `gompertz` takes ``c`` as a shape parameter for :math:`c`.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("c", False, (0, np.inf), (False, False))]
+
+    def _pdf(self, x, c):
+        # gompertz.pdf(x, c) = c * exp(x) * exp(-c*(exp(x)-1))
+        return np.exp(self._logpdf(x, c))
+
+    def _logpdf(self, x, c):
+        return np.log(c) + x - c * sc.expm1(x)
+
+    def _cdf(self, x, c):
+        return -sc.expm1(-c * sc.expm1(x))
+
+    def _ppf(self, q, c):
+        return sc.log1p(-1.0 / c * sc.log1p(-q))
+
+    def _sf(self, x, c):
+        return np.exp(-c * sc.expm1(x))
+
+    def _isf(self, p, c):
+        return sc.log1p(-np.log(p)/c)
+
+    def _entropy(self, c):
+        return 1.0 - np.log(c) - sc._ufuncs._scaled_exp1(c)/c
+
+
+gompertz = gompertz_gen(a=0.0, name='gompertz')
+
+
+def _average_with_log_weights(x, logweights):
+    x = np.asarray(x)
+    logweights = np.asarray(logweights)
+    maxlogw = logweights.max()
+    weights = np.exp(logweights - maxlogw)
+    return np.average(x, weights=weights)
+
+
+class gumbel_r_gen(rv_continuous):
+    r"""A right-skewed Gumbel continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    gumbel_l, gompertz, genextreme
+
+    Notes
+    -----
+    The probability density function for `gumbel_r` is:
+
+    .. math::
+
+        f(x) = \exp(-(x + e^{-x}))
+
+    for real :math:`x`.
+
+    The Gumbel distribution is sometimes referred to as a type I Fisher-Tippett
+    distribution.  It is also related to the extreme value distribution,
+    log-Weibull and Gompertz distributions.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return []
+
+    def _pdf(self, x):
+        # gumbel_r.pdf(x) = exp(-(x + exp(-x)))
+        return np.exp(self._logpdf(x))
+
+    def _logpdf(self, x):
+        return -x - np.exp(-x)
+
+    def _cdf(self, x):
+        return np.exp(-np.exp(-x))
+
+    def _logcdf(self, x):
+        return -np.exp(-x)
+
+    def _ppf(self, q):
+        return -np.log(-np.log(q))
+
+    def _sf(self, x):
+        return -sc.expm1(-np.exp(-x))
+
+    def _isf(self, p):
+        return -np.log(-np.log1p(-p))
+
+    def _stats(self):
+        return _EULER, np.pi*np.pi/6.0, 12*np.sqrt(6)/np.pi**3 * _ZETA3, 12.0/5
+
+    def _entropy(self):
+        # https://en.wikipedia.org/wiki/Gumbel_distribution
+        return _EULER + 1.
+
+    @_call_super_mom
+    @inherit_docstring_from(rv_continuous)
+    def fit(self, data, *args, **kwds):
+        data, floc, fscale = _check_fit_input_parameters(self, data,
+                                                         args, kwds)
+
+        # By the method of maximum likelihood, the estimators of the
+        # location and scale are the roots of the equations defined in
+        # `func` and the value of the expression for `loc` that follows.
+        # The first `func` is a first order derivative of the log-likelihood
+        # equation and the second is from Source: Statistical Distributions,
+        # 3rd Edition. Evans, Hastings, and Peacock (2000), Page 101.
+
+        def get_loc_from_scale(scale):
+            return -scale * (sc.logsumexp(-data / scale) - np.log(len(data)))
+
+        if fscale is not None:
+            # if the scale is fixed, the location can be analytically
+            # determined.
+            scale = fscale
+            loc = get_loc_from_scale(scale)
+        else:
+            # A different function is solved depending on whether the location
+            # is fixed.
+            if floc is not None:
+                loc = floc
+
+                # equation to use if the location is fixed.
+                # note that one cannot use the equation in Evans, Hastings,
+                # and Peacock (2000) (since it assumes that the derivative of
+                # the log-likelihood w.r.t. loc is zero). however, it is easy
+                # to derive the MLE condition directly if loc is fixed
+                def func(scale):
+                    term1 = (loc - data) * np.exp((loc - data) / scale) + data
+                    term2 = len(data) * (loc + scale)
+                    return term1.sum() - term2
+            else:
+
+                # equation to use if both location and scale are free
+                def func(scale):
+                    sdata = -data / scale
+                    wavg = _average_with_log_weights(data, logweights=sdata)
+                    return data.mean() - wavg - scale
+
+            # set brackets for `root_scalar` to use when optimizing over the
+            # scale such that a root is likely between them. Use user supplied
+            # guess or default 1.
+            brack_start = kwds.get('scale', 1)
+            lbrack, rbrack = brack_start / 2, brack_start * 2
+
+            # if a root is not between the brackets, iteratively expand them
+            # until they include a sign change, checking after each bracket is
+            # modified.
+            def interval_contains_root(lbrack, rbrack):
+                # return true if the signs disagree.
+                return (np.sign(func(lbrack)) !=
+                        np.sign(func(rbrack)))
+            while (not interval_contains_root(lbrack, rbrack)
+                   and (lbrack > 0 or rbrack < np.inf)):
+                lbrack /= 2
+                rbrack *= 2
+
+            res = optimize.root_scalar(func, bracket=(lbrack, rbrack),
+                                       rtol=1e-14, xtol=1e-14)
+            scale = res.root
+            loc = floc if floc is not None else get_loc_from_scale(scale)
+        return loc, scale
+
+
+gumbel_r = gumbel_r_gen(name='gumbel_r')
+
+
+class gumbel_l_gen(rv_continuous):
+    r"""A left-skewed Gumbel continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    gumbel_r, gompertz, genextreme
+
+    Notes
+    -----
+    The probability density function for `gumbel_l` is:
+
+    .. math::
+
+        f(x) = \exp(x - e^x)
+
+    for real :math:`x`.
+
+    The Gumbel distribution is sometimes referred to as a type I Fisher-Tippett
+    distribution.  It is also related to the extreme value distribution,
+    log-Weibull and Gompertz distributions.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+
+    def _shape_info(self):
+        return []
+
+    def _pdf(self, x):
+        # gumbel_l.pdf(x) = exp(x - exp(x))
+        return np.exp(self._logpdf(x))
+
+    def _logpdf(self, x):
+        return x - np.exp(x)
+
+    def _cdf(self, x):
+        return -sc.expm1(-np.exp(x))
+
+    def _ppf(self, q):
+        return np.log(-sc.log1p(-q))
+
+    def _logsf(self, x):
+        return -np.exp(x)
+
+    def _sf(self, x):
+        return np.exp(-np.exp(x))
+
+    def _isf(self, x):
+        return np.log(-np.log(x))
+
+    def _stats(self):
+        return -_EULER, np.pi*np.pi/6.0, \
+               -12*np.sqrt(6)/np.pi**3 * _ZETA3, 12.0/5
+
+    def _entropy(self):
+        return _EULER + 1.
+
+    @_call_super_mom
+    @inherit_docstring_from(rv_continuous)
+    def fit(self, data, *args, **kwds):
+        # The fit method of `gumbel_r` can be used for this distribution with
+        # small modifications. The process to do this is
+        # 1. pass the sign negated data into `gumbel_r.fit`
+        #    - if the location is fixed, it should also be negated.
+        # 2. negate the sign of the resulting location, leaving the scale
+        #    unmodified.
+        # `gumbel_r.fit` holds necessary input checks.
+
+        if kwds.get('floc') is not None:
+            kwds['floc'] = -kwds['floc']
+        loc_r, scale_r, = gumbel_r.fit(-np.asarray(data), *args, **kwds)
+        return -loc_r, scale_r
+
+
+gumbel_l = gumbel_l_gen(name='gumbel_l')
+
+
+class halfcauchy_gen(rv_continuous):
+    r"""A Half-Cauchy continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `halfcauchy` is:
+
+    .. math::
+
+        f(x) = \frac{2}{\pi (1 + x^2)}
+
+    for :math:`x \ge 0`.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return []
+
+    def _pdf(self, x):
+        # halfcauchy.pdf(x) = 2 / (pi * (1 + x**2))
+        return 2.0/np.pi/(1.0+x*x)
+
+    def _logpdf(self, x):
+        return np.log(2.0/np.pi) - sc.log1p(x*x)
+
+    def _cdf(self, x):
+        return 2.0/np.pi*np.arctan(x)
+
+    def _ppf(self, q):
+        return np.tan(np.pi/2*q)
+
+    def _sf(self, x):
+        return 2.0/np.pi * np.arctan2(1, x)
+
+    def _isf(self, p):
+        return 1.0/np.tan(np.pi*p/2)
+
+    def _stats(self):
+        return np.inf, np.inf, np.nan, np.nan
+
+    def _entropy(self):
+        return np.log(2*np.pi)
+
+    @_call_super_mom
+    @inherit_docstring_from(rv_continuous)
+    def fit(self, data, *args, **kwds):
+        if kwds.pop('superfit', False):
+            return super().fit(data, *args, **kwds)
+
+        data, floc, fscale = _check_fit_input_parameters(self, data,
+                                                         args, kwds)
+
+        # location is independent from the scale
+        data_min = np.min(data)
+        if floc is not None:
+            if data_min < floc:
+                # There are values that are less than the specified loc.
+                raise FitDataError("halfcauchy", lower=floc, upper=np.inf)
+            loc = floc
+        else:
+            # if not provided, location MLE is the minimal data point
+            loc = data_min
+
+        # find scale
+        def find_scale(loc, data):
+            shifted_data = data - loc
+            n = data.size
+            shifted_data_squared = np.square(shifted_data)
+
+            def fun_to_solve(scale):
+                denominator = scale**2 + shifted_data_squared
+                return 2 * np.sum(shifted_data_squared/denominator) - n
+
+            small = np.finfo(1.0).tiny**0.5  # avoid underflow
+            res = root_scalar(fun_to_solve, bracket=(small, np.max(shifted_data)))
+            return res.root
+
+        if fscale is not None:
+            scale = fscale
+        else:
+            scale = find_scale(loc, data)
+
+        return loc, scale
+
+
+halfcauchy = halfcauchy_gen(a=0.0, name='halfcauchy')
+
+
+class halflogistic_gen(rv_continuous):
+    r"""A half-logistic continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `halflogistic` is:
+
+    .. math::
+
+        f(x) = \frac{ 2 e^{-x} }{ (1+e^{-x})^2 }
+             = \frac{1}{2} \text{sech}(x/2)^2
+
+    for :math:`x \ge 0`.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] Asgharzadeh et al (2011). "Comparisons of Methods of Estimation for the
+           Half-Logistic Distribution". Selcuk J. Appl. Math. 93-108.
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return []
+
+    def _pdf(self, x):
+        # halflogistic.pdf(x) = 2 * exp(-x) / (1+exp(-x))**2
+        #                     = 1/2 * sech(x/2)**2
+        return np.exp(self._logpdf(x))
+
+    def _logpdf(self, x):
+        return np.log(2) - x - 2. * sc.log1p(np.exp(-x))
+
+    def _cdf(self, x):
+        return np.tanh(x/2.0)
+
+    def _ppf(self, q):
+        return 2*np.arctanh(q)
+
+    def _sf(self, x):
+        return 2 * sc.expit(-x)
+
+    def _isf(self, q):
+        return xpx.apply_where(q < 0.5, q,
+                               lambda q: -sc.logit(0.5 * q),
+                               lambda q: 2*np.arctanh(1 - q))
+
+    def _munp(self, n):
+        if n == 0:
+            return 1  # otherwise returns NaN
+        if n == 1:
+            return 2*np.log(2)
+        if n == 2:
+            return np.pi*np.pi/3.0
+        if n == 3:
+            return 9*_ZETA3
+        if n == 4:
+            return 7*np.pi**4 / 15.0
+        return 2*(1-pow(2.0, 1-n))*sc.gamma(n+1)*sc.zeta(n, 1)
+
+    def _entropy(self):
+        return 2-np.log(2)
+
+    @_call_super_mom
+    @inherit_docstring_from(rv_continuous)
+    def fit(self, data, *args, **kwds):
+        if kwds.pop('superfit', False):
+            return super().fit(data, *args, **kwds)
+
+        data, floc, fscale = _check_fit_input_parameters(self, data,
+                                                         args, kwds)
+
+        def find_scale(data, loc):
+            # scale is the solution of a fixed-point problem ([1] 2.6)
+            # use approximate MLE as starting point ([1] 3.1)
+            n_observations = data.shape[0]
+            sorted_data = np.sort(data, axis=0)
+            p = np.arange(1, n_observations + 1)/(n_observations + 1)
+            q = 1 - p
+            pp1 = 1 + p
+            alpha = p - 0.5 * q * pp1 * np.log(pp1 / q)
+            beta = 0.5 * q * pp1
+            sorted_data = sorted_data - loc
+            B = 2 * np.sum(alpha[1:] * sorted_data[1:])
+            C = 2 * np.sum(beta[1:] * sorted_data[1:]**2)
+            # starting guess
+            scale = ((B + np.sqrt(B**2 + 8 * n_observations * C))
+                    /(4 * n_observations))
+
+            # relative tolerance of the fixed-point iteration
+            rtol = 1e-8
+            relative_residual = 1
+            shifted_mean = sorted_data.mean()  # y_mean - y_min
+
+            # find the fixed point by repeated application of eq. (2.6)
+            # simplify as
+            # exp(-x) / (1 + exp(-x)) = 1 / (1 + exp(x))
+            #                         = expit(-x)
+            while relative_residual > rtol:
+                sum_term = sorted_data * sc.expit(-sorted_data/scale)
+                scale_new = shifted_mean - 2/n_observations * sum_term.sum()
+                relative_residual = abs((scale - scale_new)/scale)
+                scale = scale_new
+            return scale
+
+        # location is independent from the scale
+        data_min = np.min(data)
+        if floc is not None:
+            if data_min < floc:
+                # There are values that are less than the specified loc.
+                raise FitDataError("halflogistic", lower=floc, upper=np.inf)
+            loc = floc
+        else:
+            # if not provided, location MLE is the minimal data point
+            loc = data_min
+
+        # scale depends on location
+        scale = fscale if fscale is not None else find_scale(data, loc)
+
+        return loc, scale
+
+
+halflogistic = halflogistic_gen(a=0.0, name='halflogistic')
+
+
+class halfnorm_gen(rv_continuous):
+    r"""A half-normal continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `halfnorm` is:
+
+    .. math::
+
+        f(x) = \sqrt{2/\pi} \exp(-x^2 / 2)
+
+    for :math:`x \ge 0`.
+
+    `halfnorm` is a special case of `chi` with ``df=1``.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return []
+
+    def _rvs(self, size=None, random_state=None):
+        return abs(random_state.standard_normal(size=size))
+
+    def _pdf(self, x):
+        # halfnorm.pdf(x) = sqrt(2/pi) * exp(-x**2/2)
+        return np.sqrt(2.0/np.pi)*np.exp(-x*x/2.0)
+
+    def _logpdf(self, x):
+        return 0.5 * np.log(2.0/np.pi) - x*x/2.0
+
+    def _cdf(self, x):
+        return sc.erf(x / np.sqrt(2))
+
+    def _ppf(self, q):
+        return _norm_ppf((1+q)/2.0)
+
+    def _sf(self, x):
+        return 2 * _norm_sf(x)
+
+    def _isf(self, p):
+        return _norm_isf(p/2)
+
+    def _stats(self):
+        return (np.sqrt(2.0/np.pi),
+                1-2.0/np.pi,
+                np.sqrt(2)*(4-np.pi)/(np.pi-2)**1.5,
+                8*(np.pi-3)/(np.pi-2)**2)
+
+    def _entropy(self):
+        return 0.5*np.log(np.pi/2.0)+0.5
+
+    @_call_super_mom
+    @inherit_docstring_from(rv_continuous)
+    def fit(self, data, *args, **kwds):
+        if kwds.pop('superfit', False):
+            return super().fit(data, *args, **kwds)
+
+        data, floc, fscale = _check_fit_input_parameters(self, data,
+                                                         args, kwds)
+
+        data_min = np.min(data)
+
+        if floc is not None:
+            if data_min < floc:
+                # There are values that are less than the specified loc.
+                raise FitDataError("halfnorm", lower=floc, upper=np.inf)
+            loc = floc
+        else:
+            loc = data_min
+
+        if fscale is not None:
+            scale = fscale
+        else:
+            scale = stats.moment(data, order=2, center=loc)**0.5
+
+        return loc, scale
+
+
+halfnorm = halfnorm_gen(a=0.0, name='halfnorm')
+
+
+class hypsecant_gen(rv_continuous):
+    r"""A hyperbolic secant continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `hypsecant` is:
+
+    .. math::
+
+        f(x) = \frac{1}{\pi} \text{sech}(x)
+
+    for a real number :math:`x`.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return []
+
+    def _pdf(self, x):
+        # hypsecant.pdf(x) = 1/pi * sech(x)
+        return 1.0/(np.pi*np.cosh(x))
+
+    def _cdf(self, x):
+        return 2.0/np.pi*np.arctan(np.exp(x))
+
+    def _ppf(self, q):
+        return np.log(np.tan(np.pi*q/2.0))
+
+    def _sf(self, x):
+        return 2.0/np.pi*np.arctan(np.exp(-x))
+
+    def _isf(self, q):
+        return -np.log(np.tan(np.pi*q/2.0))
+
+    def _stats(self):
+        return 0, np.pi*np.pi/4, 0, 2
+
+    def _entropy(self):
+        return np.log(2*np.pi)
+
+
+hypsecant = hypsecant_gen(name='hypsecant')
+
+
+class gausshyper_gen(rv_continuous):
+    r"""A Gauss hypergeometric continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `gausshyper` is:
+
+    .. math::
+
+        f(x, a, b, c, z) = C x^{a-1} (1-x)^{b-1} (1+zx)^{-c}
+
+    for :math:`0 \le x \le 1`, :math:`a,b > 0`, :math:`c` a real number,
+    :math:`z > -1`, and :math:`C = \frac{1}{B(a, b) F[2, 1](c, a; a+b; -z)}`.
+    :math:`F[2, 1]` is the Gauss hypergeometric function
+    `scipy.special.hyp2f1`.
+
+    `gausshyper` takes :math:`a`, :math:`b`, :math:`c` and :math:`z` as shape
+    parameters.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] Armero, C., and M. J. Bayarri. "Prior Assessments for Prediction in
+           Queues." *Journal of the Royal Statistical Society*. Series D (The
+           Statistician) 43, no. 1 (1994): 139-53. doi:10.2307/2348939
+
+    %(example)s
+
+    """
+
+    def _argcheck(self, a, b, c, z):
+        # z > -1 per gh-10134
+        return (a > 0) & (b > 0) & (c == c) & (z > -1)
+
+    def _shape_info(self):
+        ia = _ShapeInfo("a", False, (0, np.inf), (False, False))
+        ib = _ShapeInfo("b", False, (0, np.inf), (False, False))
+        ic = _ShapeInfo("c", False, (-np.inf, np.inf), (False, False))
+        iz = _ShapeInfo("z", False, (-1, np.inf), (False, False))
+        return [ia, ib, ic, iz]
+
+    def _pdf(self, x, a, b, c, z):
+        normalization_constant = sc.beta(a, b) * sc.hyp2f1(c, a, a + b, -z)
+        return (1./normalization_constant * x**(a - 1.) * (1. - x)**(b - 1.0)
+                / (1.0 + z*x)**c)
+
+    def _munp(self, n, a, b, c, z):
+        fac = sc.beta(n+a, b) / sc.beta(a, b)
+        num = sc.hyp2f1(c, a+n, a+b+n, -z)
+        den = sc.hyp2f1(c, a, a+b, -z)
+        return fac*num / den
+
+
+gausshyper = gausshyper_gen(a=0.0, b=1.0, name='gausshyper')
+
+
+class invgamma_gen(rv_continuous):
+    r"""An inverted gamma continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `invgamma` is:
+
+    .. math::
+
+        f(x, a) = \frac{x^{-a-1}}{\Gamma(a)} \exp(-\frac{1}{x})
+
+    for :math:`x \ge 0`, :math:`a > 0`. :math:`\Gamma` is the gamma function
+    (`scipy.special.gamma`).
+
+    `invgamma` takes ``a`` as a shape parameter for :math:`a`.
+
+    `invgamma` is a special case of `gengamma` with ``c=-1``, and it is a
+    different parameterization of the scaled inverse chi-squared distribution.
+    Specifically, if the scaled inverse chi-squared distribution is
+    parameterized with degrees of freedom :math:`\nu` and scaling parameter
+    :math:`\tau^2`, then it can be modeled using `invgamma` with
+    ``a=`` :math:`\nu/2` and ``scale=`` :math:`\nu \tau^2/2`.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    _support_mask = rv_continuous._open_support_mask
+
+    def _shape_info(self):
+        return [_ShapeInfo("a", False, (0, np.inf), (False, False))]
+
+    def _pdf(self, x, a):
+        # invgamma.pdf(x, a) = x**(-a-1) / gamma(a) * exp(-1/x)
+        return np.exp(self._logpdf(x, a))
+
+    def _logpdf(self, x, a):
+        return -(a+1) * np.log(x) - sc.gammaln(a) - 1.0/x
+
+    def _cdf(self, x, a):
+        return sc.gammaincc(a, 1.0 / x)
+
+    def _ppf(self, q, a):
+        return 1.0 / sc.gammainccinv(a, q)
+
+    def _sf(self, x, a):
+        return sc.gammainc(a, 1.0 / x)
+
+    def _isf(self, q, a):
+        return 1.0 / sc.gammaincinv(a, q)
+
+    def _stats(self, a, moments='mvsk'):
+        m1 = xpx.apply_where(a > 1, a,
+                             lambda x: 1. / (x - 1.),
+                             fill_value=np.inf)
+        m2 = xpx.apply_where(a > 2, a,
+                             lambda x: 1. / (x - 1.)**2 / (x - 2.),
+                             fill_value=np.inf)
+
+        g1, g2 = None, None
+        if 's' in moments:
+            g1 = xpx.apply_where(a > 3, a,
+                                 lambda x: 4. * np.sqrt(x - 2.) / (x - 3.),
+                                 fill_value=np.nan)
+        if 'k' in moments:
+            g2 = xpx.apply_where(a > 4, a,
+                                 lambda x: 6. * (5. * x - 11.) / (x - 3.) / (x - 4.),
+                                 fill_value=np.nan)
+
+        return m1, m2, g1, g2
+
+    def _entropy(self, a):
+        def regular(a):
+            h = a - (a + 1.0) * sc.psi(a) + sc.gammaln(a)
+            return h
+
+        def asymptotic(a):
+            # gammaln(a) ~ a * ln(a) - a - 0.5 * ln(a) + 0.5 * ln(2 * pi)
+            # psi(a) ~ ln(a) - 1 / (2 * a)
+            h = ((1 - 3*np.log(a) + np.log(2) + np.log(np.pi))/2
+                 + 2/3*a**-1. + a**-2./12 - a**-3./90 - a**-4./120)
+            return h
+
+        h = xpx.apply_where(a >= 200, a, asymptotic, regular)
+        return h
+
+
+invgamma = invgamma_gen(a=0.0, name='invgamma')
+
+
+class invgauss_gen(rv_continuous):
+    r"""An inverse Gaussian continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `invgauss` is:
+
+    .. math::
+
+        f(x; \mu) = \frac{1}{\sqrt{2 \pi x^3}}
+                    \exp\left(-\frac{(x-\mu)^2}{2 \mu^2 x}\right)
+
+    for :math:`x \ge 0` and :math:`\mu > 0`.
+
+    `invgauss` takes ``mu`` as a shape parameter for :math:`\mu`.
+
+    %(after_notes)s
+
+    A common shape-scale parameterization of the inverse Gaussian distribution
+    has density
+
+    .. math::
+
+        f(x; \nu, \lambda) = \sqrt{\frac{\lambda}{2 \pi x^3}}
+                    \exp\left( -\frac{\lambda(x-\nu)^2}{2 \nu^2 x}\right)
+
+    Using ``nu`` for :math:`\nu` and ``lam`` for :math:`\lambda`, this
+    parameterization is equivalent to the one above with ``mu = nu/lam``,
+    ``loc = 0``, and ``scale = lam``.
+
+    This distribution uses routines from the Boost Math C++ library for
+    the computation of the ``ppf`` and ``isf`` methods. [1]_
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    %(example)s
+
+    """
+    _support_mask = rv_continuous._open_support_mask
+
+    def _shape_info(self):
+        return [_ShapeInfo("mu", False, (0, np.inf), (False, False))]
+
+    def _rvs(self, mu, size=None, random_state=None):
+        return random_state.wald(mu, 1.0, size=size)
+
+    def _pdf(self, x, mu):
+        # invgauss.pdf(x, mu) =
+        #                  1 / sqrt(2*pi*x**3) * exp(-(x-mu)**2/(2*x*mu**2))
+        return 1.0/np.sqrt(2*np.pi*x**3.0)*np.exp(-1.0/(2*x)*(x/mu - 1)**2)
+
+    def _logpdf(self, x, mu):
+        return -0.5*np.log(2*np.pi) - 1.5*np.log(x) - (x/mu - 1)**2/(2*x)
+
+    # approach adapted from equations in
+    # https://journal.r-project.org/archive/2016-1/giner-smyth.pdf,
+    # not R code. see gh-13616
+
+    def _logcdf(self, x, mu):
+        fac = 1 / np.sqrt(x)
+        a = _norm_logcdf(fac * (x/mu - 1))
+        b = 2 / mu + _norm_logcdf(-fac * (x/mu + 1))
+        return a + np.log1p(np.exp(b - a))
+
+    def _logsf(self, x, mu):
+        fac = 1 / np.sqrt(x)
+        a = _norm_logsf(fac * (x/mu - 1))
+        b = 2 / mu + _norm_logcdf(-fac * (x/mu + 1))
+        return a + np.log1p(-np.exp(b - a))
+
+    def _sf(self, x, mu):
+        return np.exp(self._logsf(x, mu))
+
+    def _cdf(self, x, mu):
+        return np.exp(self._logcdf(x, mu))
+
+    def _ppf(self, x, mu):
+        with np.errstate(divide='ignore', over='ignore', invalid='ignore'):
+            x, mu = np.broadcast_arrays(x, mu)
+            ppf = np.asarray(scu._invgauss_ppf(x, mu, 1))
+            i_wt = x > 0.5  # "wrong tail" - sometimes too inaccurate
+            ppf[i_wt] = scu._invgauss_isf(1-x[i_wt], mu[i_wt], 1)
+            i_nan = np.isnan(ppf)
+            ppf[i_nan] = super()._ppf(x[i_nan], mu[i_nan])
+        return ppf
+
+    def _isf(self, x, mu):
+        with np.errstate(divide='ignore', over='ignore', invalid='ignore'):
+            x, mu = np.broadcast_arrays(x, mu)
+            isf = np.asarray(scu._invgauss_isf(x, mu, 1))  # ndarray, as in _ppf
+            i_wt = x > 0.5  # "wrong tail" - sometimes too inaccurate
+            isf[i_wt] = scu._invgauss_ppf(1-x[i_wt], mu[i_wt], 1)
+            i_nan = np.isnan(isf)
+            isf[i_nan] = super()._isf(x[i_nan], mu[i_nan])
+        return isf
+
+    def _stats(self, mu):
+        return mu, mu**3.0, 3*np.sqrt(mu), 15*mu
+
+    @inherit_docstring_from(rv_continuous)
+    def fit(self, data, *args, **kwds):
+        method = kwds.get('method', 'mle')
+
+        if (isinstance(data, CensoredData) or isinstance(self, wald_gen)
+                or method.lower() == 'mm'):
+            return super().fit(data, *args, **kwds)
+
+        data, fshape_s, floc, fscale = _check_fit_input_parameters(self, data,
+                                                                   args, kwds)
+        # Source: Statistical Distributions, 3rd Edition. Evans, Hastings,
+        # and Peacock (2000), Page 121. Their shape parameter is equivalent to
+        # SciPy's with the conversion `fshape_s = fshape / scale`.
+        #
+        # The closed-form MLE formulas are used only when `loc` is fixed and
+        # `mu` is free.  The remaining cases are handled as follows:
+        # - `loc` is not fixed: fall back on the superclass fit method.
+        # - `mu` is fixed: fall back on the superclass fit method.
+        # - `loc` is fixed but shifting the data by it produces negative
+        #   values: raise a `FitDataError`.
+        # (Censored data, `wald` instances and method='mm' were already
+        # delegated to the superclass above.)
+        if floc is None or fshape_s is not None:
+            return super().fit(data, *args, **kwds)
+        elif np.any(data - floc < 0):
+            raise FitDataError("invgauss", lower=0, upper=np.inf)
+        else:
+            data = data - floc
+            fshape_n = np.mean(data)  # sample mean of the shifted data
+            if fscale is None:
+                fscale = len(data) / (np.sum(data ** -1 - fshape_n ** -1))
+            fshape_s = fshape_n / fscale  # conversion described in the note above
+        return fshape_s, floc, fscale
+
+    def _entropy(self, mu):
+        """
+        Entropy per eq. 9 of https://moser-isi.ethz.ch/docs/papers/smos-2012-10.pdf
+        """
+        # log_term = log(2*pi*e*mu**3), expanded so that mu**3 is never formed
+        #          = 1 + log(2*pi) + 3 * log(mu)
+        log_term = 1. + np.log(2 * np.pi) + 3 * np.log(mu)
+        # exp_term = exp(2/mu) * exp1(2/mu), written with the scaled helper as
+        #          = _scaled_exp1(2/mu) / (2/mu)
+        two_over_mu = 2/mu
+        exp_term = sc._ufuncs._scaled_exp1(two_over_mu)/two_over_mu
+        return 0.5 * log_term - 1.5 * exp_term
+
+
+invgauss = invgauss_gen(a=0.0, name='invgauss')  # a: presumably lower support bound
+
+
+class geninvgauss_gen(rv_continuous):
+    r"""A Generalized Inverse Gaussian continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `geninvgauss` is:
+
+    .. math::
+
+        f(x, p, b) = x^{p-1} \exp(-b (x + 1/x) / 2) / (2 K_p(b))
+
+    where ``x > 0``, `p` is a real number and ``b > 0``\([1]_).
+    :math:`K_p` is the modified Bessel function of second kind of order `p`
+    (`scipy.special.kv`).
+
+    %(after_notes)s
+
+    The inverse Gaussian distribution `stats.invgauss(mu)` is a special case of
+    `geninvgauss` with ``p = -1/2``, ``b = 1 / mu`` and ``scale = mu``.
+
+    Generating random variates is challenging for this distribution. The
+    implementation is based on [2]_.
+
+    References
+    ----------
+    .. [1] O. Barndorff-Nielsen, P. Blaesild, C. Halgreen, "First hitting time
+       models for the generalized inverse gaussian distribution",
+       Stochastic Processes and their Applications 7, pp. 49--54, 1978.
+
+    .. [2] W. Hoermann and J. Leydold, "Generating generalized inverse Gaussian
+       random variates", Statistics and Computing, 24(4), p. 547--557, 2014.
+
+    %(example)s
+
+    """
+    def _argcheck(self, p, b):
+        return (p == p) & (b > 0)  # p == p is False only for NaN
+
+    def _shape_info(self):
+        ip = _ShapeInfo("p", False, (-np.inf, np.inf), (False, False))
+        ib = _ShapeInfo("b", False, (0, np.inf), (False, False))
+        return [ip, ib]
+
+    def _logpdf(self, x, p, b):
+        # kve instead of kv works better for large values of b
+        # warn if kve produces infinite values and replace by nan
+        # otherwise c = -inf and the results are often incorrect
+        def logpdf_single(x, p, b):
+            return _stats.geninvgauss_logpdf(x, p, b)
+
+        logpdf_single = np.vectorize(logpdf_single, otypes=[np.float64])
+
+        z = logpdf_single(x, p, b)
+        if np.isnan(z).any():
+            msg = ("Infinite values encountered in scipy.special.kve(p, b). "
+                   "Values replaced by NaN to avoid incorrect results.")
+            warnings.warn(msg, RuntimeWarning, stacklevel=3)
+        return z
+
+    def _pdf(self, x, p, b):
+        # relying on logpdf avoids overflow of x**(p-1) for large x and p
+        return np.exp(self._logpdf(x, p, b))
+
+    def _cdf(self, x, p, b):
+        _a, _b = self._get_support(p, b)  # only _a is used, as the lower limit
+
+        def _cdf_single(x, p, b):
+            user_data = np.array([p, b], float).ctypes.data_as(ctypes.c_void_p)
+            llc = LowLevelCallable.from_cython(_stats, '_geninvgauss_pdf',
+                                               user_data)
+
+            return integrate.quad(llc, _a, x)[0]  # [0]: the integral estimate
+
+        _cdf_single = np.vectorize(_cdf_single, otypes=[np.float64])
+
+        return _cdf_single(x, p, b)
+
+    def _logquasipdf(self, x, p, b):
+        # log of the quasi-density (w/o normalizing constant) used in _rvs
+        return xpx.apply_where(x > 0, (x, p, b),
+                               lambda x, p, b: (p - 1)*np.log(x) - b*(x + 1/x)/2,
+                               fill_value=-np.inf)
+
+    def _rvs(self, p, b, size=None, random_state=None):
+        # if p and b are scalar, use _rvs_scalar, otherwise need to create
+        # output by iterating over parameters
+        if np.isscalar(p) and np.isscalar(b):
+            out = self._rvs_scalar(p, b, size, random_state)
+        elif p.size == 1 and b.size == 1:
+            out = self._rvs_scalar(p.item(), b.item(), size, random_state)
+        else:
+            # When this method is called, size will be a (possibly empty)
+            # tuple of integers.  It will not be None; if `size=None` is passed
+            # to `rvs()`, size will be the empty tuple ().
+
+            p, b = np.broadcast_arrays(p, b)
+            # p and b now have the same shape.
+
+            # `shp` is the shape of the blocks of random variates that are
+            # generated for each combination of parameters associated with
+            # broadcasting p and b.
+            # bc is a tuple the same length as size.  The values
+            # in bc are bools.  If bc[j] is True, it means that
+            # entire axis is filled in for a given combination of the
+            # broadcast arguments.
+            shp, bc = _check_shape(p.shape, size)
+
+            # `numsamples` is the total number of variates to be generated
+            # for each combination of the input arguments.
+            numsamples = int(np.prod(shp))
+
+            # `out` is the array to be returned.  It is filled in the
+            # loop below.
+            out = np.empty(size)
+
+            it = np.nditer([p, b],
+                           flags=['multi_index'],
+                           op_flags=[['readonly'], ['readonly']])
+            while not it.finished:
+                # Convert the iterator's multi_index into an index into the
+                # `out` array where the call to _rvs_scalar() will be stored.
+                # Where bc is True, we use a full slice; otherwise we use the
+                # index value from it.multi_index.  len(it.multi_index) might
+                # be less than len(bc), and in that case we want to align these
+                # two sequences to the right, so the loop variable j runs from
+                # -len(size) to 0.  This doesn't cause an IndexError, as
+                # bc[j] will be True in those cases where it.multi_index[j]
+                # would cause an IndexError.
+                idx = tuple((it.multi_index[j] if not bc[j] else slice(None))
+                            for j in range(-len(size), 0))
+                out[idx] = self._rvs_scalar(it[0], it[1], numsamples,
+                                            random_state).reshape(shp)
+                it.iternext()
+
+        if size == ():  # scalar request: return a Python scalar via .item()
+            out = out.item()
+        return out
+
+    def _rvs_scalar(self, p, b, numsamples, random_state):
+        # following [2], the quasi-pdf is used instead of the pdf for the
+        # generation of rvs
+        invert_res = False
+        if not numsamples:  # falsy (e.g. the empty tuple): draw a single variate
+            numsamples = 1
+        if p < 0:
+            # note: if X is geninvgauss(p, b), then 1/X is geninvgauss(-p, b)
+            p = -p
+            invert_res = True
+        m = self._mode(p, b)
+
+        # determine method to be used following [2]
+        ratio_unif = True
+        if p >= 1 or b > 1:
+            # ratio of uniforms with mode shift below
+            mode_shift = True
+        elif b >= min(0.5, 2 * np.sqrt(1 - p) / 3):
+            # ratio of uniforms without mode shift below
+            mode_shift = False
+        else:
+            # new algorithm in [2]
+            ratio_unif = False
+
+        # prepare sampling of rvs
+        size1d = tuple(np.atleast_1d(numsamples))
+        N = np.prod(size1d)  # number of rvs needed, reshape upon return
+        x = np.zeros(N)  # buffer that receives the accepted variates
+        simulated = 0  # number of accepted variates so far
+
+        if ratio_unif:
+            # use ratio of uniforms method
+            if mode_shift:
+                a2 = -2 * (p + 1) / b - m
+                a1 = 2 * m * (p - 1) / b - 1
+                # find roots of x**3 + a2*x**2 + a1*x + m (Cardano's formula)
+                p1 = a1 - a2**2 / 3
+                q1 = 2 * a2**3 / 27 - a2 * a1 / 3 + m
+                phi = np.arccos(-q1 * np.sqrt(-27 / p1**3) / 2)
+                s1 = -np.sqrt(-4 * p1 / 3)
+                root1 = s1 * np.cos(phi / 3 + np.pi / 3) - a2 / 3
+                root2 = -s1 * np.cos(phi / 3) - a2 / 3
+                # root3 = s1 * np.cos(phi / 3 - np.pi / 3) - a2 / 3
+
+                # if g is the quasipdf, rescale: g(x) / g(m) which we can write
+                # as exp(log(g(x)) - log(g(m))). This is important
+                # since for large values of p and b, g cannot be evaluated.
+                # denote the rescaled quasipdf by h
+                lm = self._logquasipdf(m, p, b)
+                d1 = self._logquasipdf(root1, p, b) - lm
+                d2 = self._logquasipdf(root2, p, b) - lm
+                # compute the bounding rectangle w.r.t. h. Note that
+                # np.exp(0.5*d1) = np.sqrt(g(root1)/g(m)) = np.sqrt(h(root1))
+                vmin = (root1 - m) * np.exp(0.5 * d1)
+                vmax = (root2 - m) * np.exp(0.5 * d2)
+                umax = 1  # umax = sqrt(h(m)) = 1
+
+                def logqpdf(x):
+                    return self._logquasipdf(x, p, b) - lm
+
+                c = m
+            else:
+                # ratio of uniforms without mode shift
+                # compute np.sqrt(quasipdf(m))
+                umax = np.exp(0.5*self._logquasipdf(m, p, b))
+                xplus = ((1 + p) + np.sqrt((1 + p)**2 + b**2))/b
+                vmin = 0
+                # compute xplus * np.sqrt(quasipdf(xplus))
+                vmax = xplus * np.exp(0.5 * self._logquasipdf(xplus, p, b))
+                c = 0
+
+                def logqpdf(x):
+                    return self._logquasipdf(x, p, b)
+
+            if vmin >= vmax:
+                raise ValueError("vmin must be smaller than vmax.")
+            if umax <= 0:
+                raise ValueError("umax must be positive.")
+
+            i = 1  # batch counter, used only by the failure check below
+            while simulated < N:
+                k = N - simulated
+                # simulate uniform rvs on [0, umax] and [vmin, vmax]
+                u = umax * random_state.uniform(size=k)
+                v = random_state.uniform(size=k)
+                v = vmin + (vmax - vmin) * v
+                rvs = v / u + c
+                # rewrite acceptance condition u**2 <= pdf(rvs) by taking logs
+                accept = (2*np.log(u) <= logqpdf(rvs))
+                num_accept = np.sum(accept)
+                if num_accept > 0:
+                    x[simulated:(simulated + num_accept)] = rvs[accept]
+                    simulated += num_accept
+
+                if (simulated == 0) and (i*N >= 50000):
+                    msg = ("Not a single random variate could be generated "
+                           f"in {i*N} attempts. Sampling does not appear to "
+                           "work for the provided parameters.")
+                    raise RuntimeError(msg)
+                i += 1
+        else:
+            # use new algorithm in [2]
+            x0 = b / (1 - p)
+            xs = np.max((x0, 2 / b))
+            k1 = np.exp(self._logquasipdf(m, p, b))
+            A1 = k1 * x0
+            if x0 < 2 / b:
+                k2 = np.exp(-b)
+                if p > 0:
+                    A2 = k2 * ((2 / b)**p - x0**p) / p
+                else:
+                    A2 = k2 * np.log(2 / b**2)
+            else:
+                k2, A2 = 0, 0
+            k3 = xs**(p - 1)
+            A3 = 2 * k3 * np.exp(-xs * b / 2) / b
+            A = A1 + A2 + A3
+
+            # [2]: rejection constant is < 2.73; so expected runtime is finite
+            while simulated < N:
+                k = N - simulated
+                h, rvs = np.zeros(k), np.zeros(k)
+                # simulate u uniform on [0, 1] and v uniform on [0, A]
+                u = random_state.uniform(size=k)
+                v = A * random_state.uniform(size=k)
+                cond1 = v <= A1
+                cond2 = np.logical_not(cond1) & (v <= A1 + A2)
+                cond3 = np.logical_not(cond1 | cond2)
+                # subdomain (0, x0)
+                rvs[cond1] = x0 * v[cond1] / A1
+                h[cond1] = k1
+                # subdomain (x0, 2 / b)
+                if p > 0:
+                    rvs[cond2] = (x0**p + (v[cond2] - A1) * p / k2)**(1 / p)
+                else:
+                    rvs[cond2] = b * np.exp((v[cond2] - A1) * np.exp(b))
+                h[cond2] = k2 * rvs[cond2]**(p - 1)
+                # subdomain (xs, infinity)
+                z = np.exp(-xs * b / 2) - b * (v[cond3] - A1 - A2) / (2 * k3)
+                rvs[cond3] = -2 / b * np.log(z)
+                h[cond3] = k3 * np.exp(-rvs[cond3] * b / 2)
+                # apply rejection method
+                accept = (np.log(u * h) <= self._logquasipdf(rvs, p, b))
+                num_accept = np.sum(accept)  # vectorized count, as in the loop above
+                if num_accept > 0:
+                    x[simulated:(simulated + num_accept)] = rvs[accept]
+                    simulated += num_accept
+
+        rvs = np.reshape(x, size1d)
+        if invert_res:  # p was negated above, so map X back to 1/X
+            rvs = 1 / rvs
+        return rvs
+
+    def _mode(self, p, b):
+        # distinguish cases to avoid catastrophic cancellation (see [2])
+        if p < 1:
+            return b / (np.sqrt((p - 1)**2 + b**2) + 1 - p)
+        else:
+            return (np.sqrt((1 - p)**2 + b**2) - (1 - p)) / b
+
+    def _munp(self, n, p, b):
+        num = sc.kve(p + n, b)
+        denom = sc.kve(p, b)
+        inf_vals = np.isinf(num) | np.isinf(denom)
+        if inf_vals.any():
+            msg = ("Infinite values encountered in the moment calculation "
+                   "involving scipy.special.kve. Values replaced by NaN to "
+                   "avoid incorrect results.")
+            warnings.warn(msg, RuntimeWarning, stacklevel=3)
+            m = np.full_like(num, np.nan, dtype=np.float64)  # NaN where infinite
+            m[~inf_vals] = num[~inf_vals] / denom[~inf_vals]
+        else:
+            m = num / denom
+        return m
+
+
+geninvgauss = geninvgauss_gen(a=0.0, name="geninvgauss")  # a: presumably lower bound
+
+
+class norminvgauss_gen(rv_continuous):
+    r"""A Normal Inverse Gaussian continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `norminvgauss` is:
+
+    .. math::
+
+        f(x, a, b) = \frac{a \, K_1(a \sqrt{1 + x^2})}{\pi \sqrt{1 + x^2}} \,
+                     \exp(\sqrt{a^2 - b^2} + b x)
+
+    where :math:`x` is a real number, the parameter :math:`a` is the tail
+    heaviness and :math:`b` is the asymmetry parameter satisfying
+    :math:`a > 0` and :math:`|b| < a`.
+    :math:`K_1` is the modified Bessel function of second kind
+    (`scipy.special.k1`).
+
+    %(after_notes)s
+
+    A normal inverse Gaussian random variable `Y` with parameters `a` and `b`
+    can be expressed as a normal mean-variance mixture:
+    ``Y = b * V + sqrt(V) * X`` where `X` is ``norm(0,1)`` and `V` is
+    ``invgauss(mu=1/sqrt(a**2 - b**2))``. This representation is used
+    to generate random variates.
+
+    Another common parametrization of the distribution (see Equation 2.1 in
+    [2]_) is given by the following expression of the pdf:
+
+    .. math::
+
+        g(x, \alpha, \beta, \delta, \mu) =
+        \frac{\alpha\delta K_1\left(\alpha\sqrt{\delta^2 + (x - \mu)^2}\right)}
+        {\pi \sqrt{\delta^2 + (x - \mu)^2}} \,
+        e^{\delta \sqrt{\alpha^2 - \beta^2} + \beta (x - \mu)}
+
+    In SciPy, this corresponds to
+    :math:`a=\alpha \delta, b=\beta \delta, \text{loc}=\mu, \text{scale}=\delta`.
+
+    References
+    ----------
+    .. [1] O. Barndorff-Nielsen, "Hyperbolic Distributions and Distributions on
+           Hyperbolae", Scandinavian Journal of Statistics, Vol. 5(3),
+           pp. 151-157, 1978.
+
+    .. [2] O. Barndorff-Nielsen, "Normal Inverse Gaussian Distributions and
+           Stochastic Volatility Modelling", Scandinavian Journal of
+           Statistics, Vol. 24, pp. 1-13, 1997.
+
+    %(example)s
+
+    """
+    _support_mask = rv_continuous._open_support_mask
+
+    def _argcheck(self, a, b):
+        return (a > 0) & (np.absolute(b) < a)  # strict: |b| == a is rejected
+
+    def _shape_info(self):
+        ia = _ShapeInfo("a", False, (0, np.inf), (False, False))
+        ib = _ShapeInfo("b", False, (-np.inf, np.inf), (False, False))
+        return [ia, ib]
+
+    def _fitstart(self, data):
+        # Arbitrary, but the default a = b = 1 is not valid; the distribution
+        # requires |b| < a.
+        return super()._fitstart(data, args=(1, 0.5))
+
+    def _pdf(self, x, a, b):
+        gamma = np.sqrt(a**2 - b**2)
+        fac1 = a / np.pi
+        sq = np.hypot(1, x)  # reduce overflows
+        return fac1 * sc.k1e(a * sq) * np.exp(b*x - a*sq + gamma) / sq
+
+    def _sf(self, x, a, b):
+        if np.isscalar(x):
+            # If x is a scalar, then so are a and b.
+            return integrate.quad(self._pdf, x, np.inf, args=(a, b))[0]
+        else:
+            a = np.atleast_1d(a)
+            b = np.atleast_1d(b)
+            result = []
+            for (x0, a0, b0) in zip(x, a, b):  # one quadrature per element
+                result.append(integrate.quad(self._pdf, x0, np.inf,
+                                             args=(a0, b0))[0])
+            return np.array(result)
+
+    def _isf(self, q, a, b):
+        def _isf_scalar(q, a, b):
+
+            def eq(x, a, b, q):
+                # Solve eq(x, a, b, q) = 0 for x to obtain x = isf(q, a, b).
+                return self._sf(x, a, b) - q
+
+            # Find a bracketing interval for the root.
+            # Start at the mean, and double the length of the interval
+            # each iteration until there is a sign change in eq.
+            xm = self.mean(a, b)
+            em = eq(xm, a, b, q)
+            if em == 0:
+                # Unlikely, but might as well check.
+                return xm
+            if em > 0:
+                delta = 1
+                left = xm
+                right = xm + delta
+                while eq(right, a, b, q) > 0:
+                    delta = 2*delta
+                    right = xm + delta
+            else:
+                # em < 0
+                delta = 1
+                right = xm
+                left = xm - delta
+                while eq(left, a, b, q) < 0:
+                    delta = 2*delta
+                    left = xm - delta
+            result = optimize.brentq(eq, left, right, args=(a, b, q),
+                                     xtol=self.xtol)
+            return result
+
+        if np.isscalar(q):
+            return _isf_scalar(q, a, b)
+        else:
+            result = []
+            for (q0, a0, b0) in zip(q, a, b):  # a, b must be iterable here
+                result.append(_isf_scalar(q0, a0, b0))
+            return np.array(result)
+
+    def _rvs(self, a, b, size=None, random_state=None):
+        # note: Y = b * V + sqrt(V) * X is norminvgauss(a, b) if X is standard
+        # normal and V is invgauss(mu=1/sqrt(a**2 - b**2))
+        gamma = np.sqrt(a**2 - b**2)
+        ig = invgauss.rvs(mu=1/gamma, size=size, random_state=random_state)
+        return b * ig + np.sqrt(ig) * norm.rvs(size=size,
+                                               random_state=random_state)
+
+    def _stats(self, a, b):
+        gamma = np.sqrt(a**2 - b**2)
+        mean = b / gamma
+        variance = a**2 / gamma**3
+        skewness = 3.0 * b / (a * np.sqrt(gamma))
+        kurtosis = 3.0 * (1 + 4 * b**2 / a**2) / gamma
+        return mean, variance, skewness, kurtosis
+
+
+norminvgauss = norminvgauss_gen(name="norminvgauss")  # module-level instance
+
+
+class invweibull_gen(rv_continuous):
+    """An inverted Weibull continuous random variable.
+
+    This distribution is also known as the Fréchet distribution or the
+    type II extreme value distribution.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `invweibull` is:
+
+    .. math::
+
+        f(x, c) = c x^{-c-1} \\exp(-x^{-c})
+
+    for :math:`x > 0`, :math:`c > 0`.
+
+    `invweibull` takes ``c`` as a shape parameter for :math:`c`.
+
+    %(after_notes)s
+
+    References
+    ----------
+    F.R.S. de Gusmao, E.M.M Ortega and G.M. Cordeiro, "The generalized inverse
+    Weibull distribution", Stat. Papers, vol. 52, pp. 591-619, 2011.
+
+    %(example)s
+
+    """
+    _support_mask = rv_continuous._open_support_mask
+
+    def _shape_info(self):
+        return [_ShapeInfo("c", False, (0, np.inf), (False, False))]
+
+    def _pdf(self, x, c):
+        # invweibull.pdf(x, c) = c * x**(-c-1) * exp(-x**(-c))
+        xc1 = np.power(x, -c - 1.0)
+        xc2 = np.power(x, -c)
+        xc2 = np.exp(-xc2)
+        return c * xc1 * xc2
+
+    def _cdf(self, x, c):
+        xc1 = np.power(x, -c)
+        return np.exp(-xc1)  # cdf(x) = exp(-x**(-c))
+
+    def _sf(self, x, c):
+        return -np.expm1(-x**-c)  # 1 - exp(-x**(-c)), written with expm1
+
+    def _ppf(self, q, c):
+        return np.power(-np.log(q), -1.0/c)  # solves q = exp(-x**(-c)) for x
+
+    def _isf(self, p, c):
+        return (-np.log1p(-p))**(-1/c)  # same as _ppf with q = 1 - p, via log1p
+
+    def _munp(self, n, c):
+        return sc.gamma(1 - n / c)
+
+    def _entropy(self, c):
+        return 1+_EULER + _EULER / c - np.log(c)  # _EULER: presumably Euler's gamma
+
+    def _fitstart(self, data, args=None):
+        # invweibull requires c > 1 for the first moment to exist, so use 2.0
+        args = (2.0,) if args is None else args
+        return super()._fitstart(data, args=args)
+
+
+invweibull = invweibull_gen(a=0, name='invweibull')  # a: presumably lower bound
+
+
+class jf_skew_t_gen(rv_continuous):
+    r"""Jones and Faddy skew-t distribution.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `jf_skew_t` is:
+
+    .. math::
+
+        f(x; a, b) = C_{a,b}^{-1}
+                    \left(1+\frac{x}{\left(a+b+x^2\right)^{1/2}}\right)^{a+1/2}
+                    \left(1-\frac{x}{\left(a+b+x^2\right)^{1/2}}\right)^{b+1/2}
+
+    for real numbers :math:`a>0` and :math:`b>0`, where
+    :math:`C_{a,b} = 2^{a+b-1}B(a,b)(a+b)^{1/2}`, and :math:`B` denotes the
+    beta function (`scipy.special.beta`).
+
+    When :math:`a<b`, the distribution is negatively skewed, and when
+    :math:`a>b`, the distribution is positively skewed. If :math:`a=b`, then
+    we recover the `t` distribution with :math:`2a` degrees of freedom.
+
+    `jf_skew_t` takes :math:`a` and :math:`b` as shape parameters.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] M.C. Jones and M.J. Faddy. "A skew extension of the t distribution,
+           with applications" *Journal of the Royal Statistical Society*.
+           Series B (Statistical Methodology) 65, no. 1 (2003): 159-174.
+           :doi:`10.1111/1467-9868.00378`
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        ia = _ShapeInfo("a", False, (0, np.inf), (False, False))
+        ib = _ShapeInfo("b", False, (0, np.inf), (False, False))
+        return [ia, ib]
+
+    def _pdf(self, x, a, b):
+        c = 2 ** (a + b - 1) * sc.beta(a, b) * np.sqrt(a + b)  # C_{a,b} above
+        d1 = (1 + x / np.sqrt(a + b + x ** 2)) ** (a + 0.5)
+        d2 = (1 - x / np.sqrt(a + b + x ** 2)) ** (b + 0.5)
+        return d1 * d2 / c
+
+    def _rvs(self, a, b, size=None, random_state=None):
+        d1 = random_state.beta(a, b, size)  # beta variate, transformed as in _ppf
+        d2 = (2 * d1 - 1) * np.sqrt(a + b)
+        d3 = 2 * np.sqrt(d1 * (1 - d1))
+        return d2 / d3
+
+    def _cdf(self, x, a, b):
+        y = (1 + x / np.sqrt(a + b + x ** 2)) * 0.5  # maps real x into (0, 1)
+        return sc.betainc(a, b, y)
+
+    def _sf(self, x, a, b):
+        y = (1 + x / np.sqrt(a + b + x ** 2)) * 0.5  # same transform as in _cdf
+        return sc.betaincc(a, b, y)
+
+    def _ppf(self, q, a, b):
+        d1 = beta.ppf(q, a, b)  # beta quantile, then mapped back to the real line
+        d2 = (2 * d1 - 1) * np.sqrt(a + b)
+        d3 = 2 * np.sqrt(d1 * (1 - d1))
+        return d2 / d3
+
+    def _munp(self, n, a, b):
+        """Returns the n-th moment(s) where all the following hold:
+
+        - n >= 0
+        - a > n / 2
+        - b > n / 2
+
+        The result is np.nan in all other cases.
+        """
+        def nth_moment(n_k, a_k, b_k):
+            """Computes E[T^(n_k)] where T is skew-t distributed with
+            parameters a_k and b_k.
+            """
+            num = (a_k + b_k) ** (0.5 * n_k)
+            denom = 2 ** n_k * sc.beta(a_k, b_k)
+
+            indices = np.arange(n_k + 1)
+            sgn = np.where(indices % 2 > 0, -1, 1)  # -1 at odd indices, else +1
+            d = sc.beta(a_k + 0.5 * n_k - indices, b_k - 0.5 * n_k + indices)
+            sum_terms = sc.comb(n_k, indices) * sgn * d
+
+            return num / denom * sum_terms.sum()
+
+        nth_moment_valid = (a > 0.5 * n) & (b > 0.5 * n) & (n >= 0)
+        return xpx.apply_where(
+            nth_moment_valid,
+            (n, a, b),
+            np.vectorize(nth_moment, otypes=[np.float64]),
+            fill_value=np.nan,
+        )
+
+
+jf_skew_t = jf_skew_t_gen(name='jf_skew_t')  # module-level instance
+
+
+class johnsonsb_gen(rv_continuous):
+    r"""A Johnson SB continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    johnsonsu
+
+    Notes
+    -----
+    The probability density function for `johnsonsb` is:
+
+    .. math::
+
+        f(x, a, b) = \frac{b}{x(1-x)}  \phi(a + b \log \frac{x}{1-x} )
+
+    where :math:`x`, :math:`a`, and :math:`b` are real scalars; :math:`b > 0`
+    and :math:`x \in [0,1]`.  :math:`\phi` is the pdf of the normal
+    distribution.
+
+    `johnsonsb` takes :math:`a` and :math:`b` as shape parameters.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    _support_mask = rv_continuous._open_support_mask
+
+    def _argcheck(self, a, b):
+        return (b > 0) & (a == a)  # a == a is False only for NaN
+
+    def _shape_info(self):
+        ia = _ShapeInfo("a", False, (-np.inf, np.inf), (False, False))
+        ib = _ShapeInfo("b", False, (0, np.inf), (False, False))
+        return [ia, ib]
+
+    def _pdf(self, x, a, b):
+        # johnsonsb.pdf(x, a, b) = b / (x*(1-x)) * phi(a + b * log(x/(1-x)))
+        trm = _norm_pdf(a + b*sc.logit(x))
+        return b*1.0/(x*(1-x))*trm
+
+    def _cdf(self, x, a, b):
+        return _norm_cdf(a + b*sc.logit(x))  # normal cdf of z = a + b*logit(x)
+
+    def _ppf(self, q, a, b):
+        return sc.expit(1.0 / b * (_norm_ppf(q) - a))  # solves z = a + b*logit(x)
+
+    def _sf(self, x, a, b):
+        return _norm_sf(a + b*sc.logit(x))  # normal sf of the same z as in _cdf
+
+    def _isf(self, q, a, b):
+        return sc.expit(1.0 / b * (_norm_isf(q) - a))  # as _ppf, with the normal isf
+
+
+johnsonsb = johnsonsb_gen(a=0.0, b=1.0, name='johnsonsb')  # a, b: presumably support
+
+
+class johnsonsu_gen(rv_continuous):
+    r"""A Johnson SU continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    johnsonsb
+
+    Notes
+    -----
+    The probability density function for `johnsonsu` is:
+
+    .. math::
+
+        f(x, a, b) = \frac{b}{\sqrt{x^2 + 1}}
+                     \phi(a + b \log(x + \sqrt{x^2 + 1}))
+
+    where :math:`x`, :math:`a`, and :math:`b` are real scalars; :math:`b > 0`.
+    :math:`\phi` is the pdf of the normal distribution.
+
+    `johnsonsu` takes :math:`a` and :math:`b` as shape parameters.
+
+    The first four central moments are calculated according to the formulas
+    in [1]_.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] Taylor Enterprises. "Johnson Family of Distributions".
+       https://variation.com/wp-content/distribution_analyzer_help/hs126.htm
+
+    %(example)s
+
+    """
+    def _argcheck(self, a, b):
+        return (b > 0) & (a == a)  # a == a is False only for NaN
+
+    def _shape_info(self):
+        ia = _ShapeInfo("a", False, (-np.inf, np.inf), (False, False))
+        ib = _ShapeInfo("b", False, (0, np.inf), (False, False))
+        return [ia, ib]
+
+    def _pdf(self, x, a, b):
+        # johnsonsu.pdf(x, a, b) = b / sqrt(x**2 + 1) *
+        #                          phi(a + b * log(x + sqrt(x**2 + 1)))
+        x2 = x*x
+        trm = _norm_pdf(a + b * np.arcsinh(x))  # arcsinh(x) = log(x + sqrt(x**2 + 1))
+        return b*1.0/np.sqrt(x2+1.0)*trm
+
+    def _cdf(self, x, a, b):
+        return _norm_cdf(a + b * np.arcsinh(x))  # normal cdf of z = a + b*arcsinh(x)
+
+    def _ppf(self, q, a, b):
+        return np.sinh((_norm_ppf(q) - a) / b)  # solves z = a + b*arcsinh(x) for x
+
+    def _sf(self, x, a, b):
+        return _norm_sf(a + b * np.arcsinh(x))  # normal sf of the same z as in _cdf
+
+    def _isf(self, x, a, b):
+        return np.sinh((_norm_isf(x) - a) / b)  # as _ppf, with the normal isf
+
+    def _stats(self, a, b, moments='mv'):
+        # Naive implementation of first and second moment to address gh-18071.
+        # https://variation.com/wp-content/distribution_analyzer_help/hs126.htm
+        # Numerical improvements left to future enhancements.
+        mu, mu2, g1, g2 = None, None, None, None  # stay None unless requested
+
+        bn2 = b**-2.  # 1 / b**2
+        expbn2 = np.exp(bn2)
+        a_b = a / b
+
+        if 'm' in moments:
+            mu = -expbn2**0.5 * np.sinh(a_b)
+        if 'v' in moments:
+            mu2 = 0.5*sc.expm1(bn2)*(expbn2*np.cosh(2*a_b) + 1)
+        if 's' in moments:
+            t1 = expbn2**.5 * sc.expm1(bn2)**0.5
+            t2 = 3*np.sinh(a_b)
+            t3 = expbn2 * (expbn2 + 2) * np.sinh(3*a_b)
+            denom = np.sqrt(2) * (1 + expbn2 * np.cosh(2*a_b))**(3/2)
+            g1 = -t1 * (t2 + t3) / denom
+        if 'k' in moments:
+            t1 = 3 + 6*expbn2
+            t2 = 4*expbn2**2 * (expbn2 + 2) * np.cosh(2*a_b)
+            t3 = expbn2**2 * np.cosh(4*a_b)
+            t4 = -3 + 3*expbn2**2 + 2*expbn2**3 + expbn2**4
+            denom = 2*(1 + expbn2*np.cosh(2*a_b))**2
+            g2 = (t1 + t2 + t3*t4) / denom - 3
+        return mu, mu2, g1, g2
+
+
+johnsonsu = johnsonsu_gen(name='johnsonsu')  # module-level instance
+
+
+class landau_gen(rv_continuous):
+    r"""A Landau continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `landau` ([1]_, [2]_) is:
+
+    .. math::
+
+        f(x) = \frac{1}{\pi}\int_0^\infty \exp(-t \log t - xt)\sin(\pi t) dt
+
+    for a real number :math:`x`.
+
+    %(after_notes)s
+
+    Often (e.g. [2]_), the Landau distribution is parameterized in terms of a
+    location parameter :math:`\mu` and scale parameter :math:`c`, the latter of
+    which *also* introduces a location shift. If ``mu`` and ``c`` are used to
+    represent these parameters, this corresponds with SciPy's parameterization
+    with ``loc = mu + 2*c / np.pi * np.log(c)`` and ``scale = c``.
+
+    This distribution uses routines from the Boost Math C++ library for
+    the computation of the ``pdf``, ``cdf``, ``ppf``, ``sf`` and ``isf``
+    methods. [4]_
+
+    References
+    ----------
+    .. [1] Landau, L. (1944). "On the energy loss of fast particles by
+           ionization". J. Phys. (USSR). 8: 201.
+    .. [2] "Landau Distribution", Wikipedia,
+           https://en.wikipedia.org/wiki/Landau_distribution
+    .. [3] Chambers, J. M., Mallows, C. L., & Stuck, B. (1976).
+           "A method for simulating stable random variables."
+           Journal of the American Statistical Association, 71(354), 340-344.
+    .. [4] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+    .. [5] Yoshimura, T. "Numerical Evaluation and High Precision Approximation
+           Formula for Landau Distribution".
+           :doi:`10.36227/techrxiv.171822215.53612870/v2`
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return []
+
+    def _entropy(self):
+        # Computed with mpmath - see gh-19145
+        return 2.37263644000448182
+
+    def _pdf(self, x):
+        return scu._landau_pdf(x, 0, 1)
+
+    def _cdf(self, x):
+        return scu._landau_cdf(x, 0, 1)
+
+    def _sf(self, x):
+        return scu._landau_sf(x, 0, 1)
+
+    def _ppf(self, p):
+        return scu._landau_ppf(p, 0, 1)
+
+    def _isf(self, p):
+        return scu._landau_isf(p, 0, 1)
+
+    def _stats(self):
+        return np.nan, np.nan, np.nan, np.nan  # all four statistics reported as NaN
+
+    def _munp(self, n):
+        return np.nan if n > 0 else 1  # NaN for every n > 0; 1 otherwise
+
+    def _fitstart(self, data, args=None):
+        # Initialize ML guesses using quartiles instead of moments.
+        if isinstance(data, CensoredData):
+            data = data._uncensor()
+        p25, p50, p75 = np.percentile(data, [25, 50, 75])
+        return p50, (p75 - p25)/2  # median and half the IQR; presumably (loc, scale)
+
+    def _rvs(self, size=None, random_state=None):
+        # Method from https://www.jstor.org/stable/2285309 (presumably [3]), Eq. 2.4
+        pi_2 = np.pi / 2
+        U = random_state.uniform(-np.pi / 2, np.pi / 2, size=size)
+        W = random_state.standard_exponential(size=size)
+        S = 2 / np.pi * ((pi_2 + U) * np.tan(U)
+                         - np.log((pi_2 * W * np.cos(U)) / (pi_2 + U)))
+        return S
+
+
+# Module-level distribution instance, registered under the name 'landau'.
+landau = landau_gen(name='landau')
+
+
+class laplace_gen(rv_continuous):
+    r"""A Laplace continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `laplace` is
+
+    .. math::
+
+        f(x) = \frac{1}{2} \exp(-|x|)
+
+    for a real number :math:`x`.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return []
+
+    def _rvs(self, size=None, random_state=None):
+        return random_state.laplace(0, 1, size=size)
+
+    def _pdf(self, x):
+        # laplace.pdf(x) = 1/2 * exp(-abs(x))
+        return 0.5*np.exp(-abs(x))
+
+    def _cdf(self, x):
+        with np.errstate(over='ignore'):
+            return np.where(x > 0, 1.0 - 0.5*np.exp(-x), 0.5*np.exp(x))
+
+    def _sf(self, x):
+        # By symmetry...
+        return self._cdf(-x)
+
+    def _ppf(self, q):
+        return np.where(q > 0.5, -np.log(2*(1-q)), np.log(2*q))
+
+    def _isf(self, q):
+        # By symmetry...
+        return -self._ppf(q)
+
+    def _stats(self):
+        return 0, 2, 0, 3
+
+    def _entropy(self):
+        return np.log(2)+1
+
+    @_call_super_mom
+    @replace_notes_in_docstring(rv_continuous, notes="""\
+        This function uses explicit formulas for the maximum likelihood
+        estimation of the Laplace distribution parameters, so the keyword
+        arguments `loc`, `scale`, and `optimizer` are ignored.\n\n""")
+    def fit(self, data, *args, **kwds):
+        data, floc, fscale = _check_fit_input_parameters(self, data,
+                                                         args, kwds)
+
+        # Source: Statistical Distributions, 3rd Edition. Evans, Hastings,
+        # and Peacock (2000), Page 124
+
+        if floc is None:
+            floc = np.median(data)
+
+        if fscale is None:
+            fscale = (np.sum(np.abs(data - floc))) / len(data)
+
+        return floc, fscale
+
+
+# Module-level distribution instance, registered under the name 'laplace'.
+laplace = laplace_gen(name='laplace')
+
+
+class laplace_asymmetric_gen(rv_continuous):
+    r"""An asymmetric Laplace continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    laplace : Laplace distribution
+
+    Notes
+    -----
+    The probability density function for `laplace_asymmetric` is
+
+    .. math::
+
+       f(x, \kappa) &= \frac{1}{\kappa+\kappa^{-1}}\exp(-x\kappa),\quad x\ge0\\
+                    &= \frac{1}{\kappa+\kappa^{-1}}\exp(x/\kappa),\quad x<0\\
+
+    for :math:`-\infty < x < \infty`, :math:`\kappa > 0`.
+
+    `laplace_asymmetric` takes ``kappa`` as a shape parameter for
+    :math:`\kappa`. For :math:`\kappa = 1`, it is identical to a
+    Laplace distribution.
+
+    %(after_notes)s
+
+    Note that the scale parameter of some references is the reciprocal of
+    SciPy's ``scale``. For example, :math:`\lambda = 1/2` in the
+    parameterization of [1]_ is equivalent to ``scale = 2`` with
+    `laplace_asymmetric`.
+
+    References
+    ----------
+    .. [1] "Asymmetric Laplace distribution", Wikipedia
+            https://en.wikipedia.org/wiki/Asymmetric_Laplace_distribution
+
+    .. [2] Kozubowski TJ and Podgórski K. A Multivariate and
+           Asymmetric Generalization of Laplace Distribution,
+           Computational Statistics 15, 531--540 (2000).
+           :doi:`10.1007/PL00022717`
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("kappa", False, (0, np.inf), (False, False))]
+
+    def _pdf(self, x, kappa):
+        return np.exp(self._logpdf(x, kappa))
+
+    def _logpdf(self, x, kappa):
+        kapinv = 1/kappa
+        lPx = x * np.where(x >= 0, -kappa, kapinv)
+        lPx -= np.log(kappa+kapinv)
+        return lPx
+
+    def _cdf(self, x, kappa):
+        kapinv = 1/kappa
+        kappkapinv = kappa+kapinv
+        return np.where(x >= 0,
+                        1 - np.exp(-x*kappa)*(kapinv/kappkapinv),
+                        np.exp(x*kapinv)*(kappa/kappkapinv))
+
+    def _sf(self, x, kappa):
+        kapinv = 1/kappa
+        kappkapinv = kappa+kapinv
+        return np.where(x >= 0,
+                        np.exp(-x*kappa)*(kapinv/kappkapinv),
+                        1 - np.exp(x*kapinv)*(kappa/kappkapinv))
+
+    def _ppf(self, q, kappa):
+        kapinv = 1/kappa
+        kappkapinv = kappa+kapinv
+        return np.where(q >= kappa/kappkapinv,
+                        -np.log((1 - q)*kappkapinv*kappa)*kapinv,
+                        np.log(q*kappkapinv/kappa)*kappa)
+
+    def _isf(self, q, kappa):
+        kapinv = 1/kappa
+        kappkapinv = kappa+kapinv
+        return np.where(q <= kapinv/kappkapinv,
+                        -np.log(q*kappkapinv*kappa)*kapinv,
+                        np.log((1 - q)*kappkapinv/kappa)*kappa)
+
+    def _stats(self, kappa):
+        kapinv = 1/kappa
+        mn = kapinv - kappa
+        var = kapinv*kapinv + kappa*kappa
+        g1 = 2.0*(1-np.power(kappa, 6))/np.power(1+np.power(kappa, 4), 1.5)
+        g2 = 6.0*(1+np.power(kappa, 8))/np.power(1+np.power(kappa, 4), 2)
+        return mn, var, g1, g2
+
+    def _entropy(self, kappa):
+        return 1 + np.log(kappa+1/kappa)
+
+
+# Module-level distribution instance, registered as 'laplace_asymmetric'.
+laplace_asymmetric = laplace_asymmetric_gen(name='laplace_asymmetric')
+
+
+def _check_fit_input_parameters(dist, data, args, kwds):
+    if not isinstance(data, CensoredData):
+        data = np.asarray(data)
+
+    floc = kwds.get('floc', None)
+    fscale = kwds.get('fscale', None)
+
+    num_shapes = len(dist.shapes.split(",")) if dist.shapes else 0
+    fshape_keys = []
+    fshapes = []
+
+    # user has many options for fixing the shape, so here we standardize it
+    # into 'f' + the number of the shape.
+    # Adapted from `_reduce_func` in `_distn_infrastructure.py`:
+    if dist.shapes:
+        shapes = dist.shapes.replace(',', ' ').split()
+        for j, s in enumerate(shapes):
+            key = 'f' + str(j)
+            names = [key, 'f' + s, 'fix_' + s]
+            val = _get_fixed_fit_value(kwds, names)
+            fshape_keys.append(key)
+            fshapes.append(val)
+            if val is not None:
+                kwds[key] = val
+
+    # determine if there are any unknown arguments in kwds
+    known_keys = {'loc', 'scale', 'optimizer', 'method',
+                  'floc', 'fscale', *fshape_keys}
+    unknown_keys = set(kwds).difference(known_keys)
+    if unknown_keys:
+        raise TypeError(f"Unknown keyword arguments: {unknown_keys}.")
+
+    if len(args) > num_shapes:
+        raise TypeError("Too many positional arguments.")
+
+    if None not in {floc, fscale, *fshapes}:
+        # This check is for consistency with `rv_continuous.fit`.
+        # Without this check, this function would just return the
+        # parameters that were given.
+        raise RuntimeError("All parameters fixed. There is nothing to "
+                           "optimize.")
+
+    uncensored = data._uncensor() if isinstance(data, CensoredData) else data
+    if not np.isfinite(uncensored).all():
+        raise ValueError("The data contains non-finite values.")
+
+    return (data, *fshapes, floc, fscale)
+
+
+class levy_gen(rv_continuous):
+    r"""A Levy continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    levy_stable, levy_l
+
+    Notes
+    -----
+    The probability density function for `levy` is:
+
+    .. math::
+
+        f(x) = \frac{1}{\sqrt{2\pi x^3}} \exp\left(-\frac{1}{2x}\right)
+
+    for :math:`x > 0`.
+
+    This is the same as the Levy-stable distribution with :math:`a=1/2` and
+    :math:`b=1`.
+
+    %(after_notes)s
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import levy
+    >>> import matplotlib.pyplot as plt
+    >>> fig, ax = plt.subplots(1, 1)
+
+    Calculate the first four moments:
+
+    >>> mean, var, skew, kurt = levy.stats(moments='mvsk')
+
+    Display the probability density function (``pdf``):
+
+    >>> # `levy` is very heavy-tailed.
+    >>> # To show a nice plot, let's cut off the upper 40 percent.
+    >>> a, b = levy.ppf(0), levy.ppf(0.6)
+    >>> x = np.linspace(a, b, 100)
+    >>> ax.plot(x, levy.pdf(x),
+    ...        'r-', lw=5, alpha=0.6, label='levy pdf')
+
+    Alternatively, the distribution object can be called (as a function)
+    to fix the shape, location and scale parameters. This returns a "frozen"
+    RV object holding the given parameters fixed.
+
+    Freeze the distribution and display the frozen ``pdf``:
+
+    >>> rv = levy()
+    >>> ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')
+
+    Check accuracy of ``cdf`` and ``ppf``:
+
+    >>> vals = levy.ppf([0.001, 0.5, 0.999])
+    >>> np.allclose([0.001, 0.5, 0.999], levy.cdf(vals))
+    True
+
+    Generate random numbers:
+
+    >>> r = levy.rvs(size=1000)
+
+    And compare the histogram:
+
+    >>> # manual binning to ignore the tail
+    >>> bins = np.concatenate((np.linspace(a, b, 20), [np.max(r)]))
+    >>> ax.hist(r, bins=bins, density=True, histtype='stepfilled', alpha=0.2)
+    >>> ax.set_xlim([x[0], x[-1]])
+    >>> ax.legend(loc='best', frameon=False)
+    >>> plt.show()
+
+    """
+    _support_mask = rv_continuous._open_support_mask
+
+    def _shape_info(self):
+        return []
+
+    def _pdf(self, x):
+        # levy.pdf(x) = 1 / (x * sqrt(2*pi*x)) * exp(-1/(2*x))
+        return 1 / np.sqrt(2*np.pi*x) / x * np.exp(-1/(2*x))
+
+    def _cdf(self, x):
+        # Equivalent to 2*norm.sf(np.sqrt(1/x))
+        return sc.erfc(np.sqrt(0.5 / x))
+
+    def _sf(self, x):
+        return sc.erf(np.sqrt(0.5 / x))
+
+    def _ppf(self, q):
+        # Equivalent to 1.0/(norm.isf(q/2)**2) or 0.5/(erfcinv(q)**2)
+        val = _norm_isf(q/2)
+        return 1.0 / (val * val)
+
+    def _isf(self, p):
+        return 1/(2*sc.erfinv(p)**2)
+
+    def _stats(self):
+        return np.inf, np.inf, np.nan, np.nan
+
+
+# Module-level distribution instance; `a=0.0` is forwarded to the
+# rv_continuous constructor (presumably the lower support bound - confirm).
+levy = levy_gen(a=0.0, name="levy")
+
+
+class levy_l_gen(rv_continuous):
+    r"""A left-skewed Levy continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    levy, levy_stable
+
+    Notes
+    -----
+    The probability density function for `levy_l` is:
+
+    .. math::
+        f(x) = \frac{1}{|x| \sqrt{2\pi |x|}} \exp{ \left(-\frac{1}{2|x|} \right)}
+
+    for :math:`x < 0`.
+
+    This is the same as the Levy-stable distribution with :math:`a=1/2` and
+    :math:`b=-1`.
+
+    %(after_notes)s
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import levy_l
+    >>> import matplotlib.pyplot as plt
+    >>> fig, ax = plt.subplots(1, 1)
+
+    Calculate the first four moments:
+
+    >>> mean, var, skew, kurt = levy_l.stats(moments='mvsk')
+
+    Display the probability density function (``pdf``):
+
+    >>> # `levy_l` is very heavy-tailed.
+    >>> # To show a nice plot, let's cut off the lower 40 percent.
+    >>> a, b = levy_l.ppf(0.4), levy_l.ppf(1)
+    >>> x = np.linspace(a, b, 100)
+    >>> ax.plot(x, levy_l.pdf(x),
+    ...        'r-', lw=5, alpha=0.6, label='levy_l pdf')
+
+    Alternatively, the distribution object can be called (as a function)
+    to fix the shape, location and scale parameters. This returns a "frozen"
+    RV object holding the given parameters fixed.
+
+    Freeze the distribution and display the frozen ``pdf``:
+
+    >>> rv = levy_l()
+    >>> ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')
+
+    Check accuracy of ``cdf`` and ``ppf``:
+
+    >>> vals = levy_l.ppf([0.001, 0.5, 0.999])
+    >>> np.allclose([0.001, 0.5, 0.999], levy_l.cdf(vals))
+    True
+
+    Generate random numbers:
+
+    >>> r = levy_l.rvs(size=1000)
+
+    And compare the histogram:
+
+    >>> # manual binning to ignore the tail
+    >>> bins = np.concatenate(([np.min(r)], np.linspace(a, b, 20)))
+    >>> ax.hist(r, bins=bins, density=True, histtype='stepfilled', alpha=0.2)
+    >>> ax.set_xlim([x[0], x[-1]])
+    >>> ax.legend(loc='best', frameon=False)
+    >>> plt.show()
+
+    """
+    _support_mask = rv_continuous._open_support_mask
+
+    def _shape_info(self):
+        return []
+
+    def _pdf(self, x):
+        # levy_l.pdf(x) = 1 / (abs(x) * sqrt(2*pi*abs(x))) * exp(-1/(2*abs(x)))
+        ax = abs(x)
+        return 1/np.sqrt(2*np.pi*ax)/ax*np.exp(-1/(2*ax))
+
+    def _cdf(self, x):
+        ax = abs(x)
+        return 2 * _norm_cdf(1 / np.sqrt(ax)) - 1
+
+    def _sf(self, x):
+        ax = abs(x)
+        return 2 * _norm_sf(1 / np.sqrt(ax))
+
+    def _ppf(self, q):
+        val = _norm_ppf((q + 1.0) / 2)
+        return -1.0 / (val * val)
+
+    def _isf(self, p):
+        return -1/_norm_isf(p/2)**2
+
+    def _stats(self):
+        return np.inf, np.inf, np.nan, np.nan
+
+
+# Module-level distribution instance; `b=0.0` is forwarded to the
+# rv_continuous constructor (presumably the upper support bound - confirm).
+levy_l = levy_l_gen(b=0.0, name="levy_l")
+
+
+class logistic_gen(rv_continuous):
+    r"""A logistic (or Sech-squared) continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `logistic` is:
+
+    .. math::
+
+        f(x) = \frac{\exp(-x)}
+                    {(1+\exp(-x))^2}
+
+    `logistic` is a special case of `genlogistic` with ``c=1``.
+
+    Remark that the survival function (``logistic.sf``) is equal to the
+    Fermi-Dirac distribution describing fermionic statistics.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return []
+
+    def _rvs(self, size=None, random_state=None):
+        return random_state.logistic(size=size)
+
+    def _pdf(self, x):
+        # logistic.pdf(x) = exp(-x) / (1+exp(-x))**2
+        return np.exp(self._logpdf(x))
+
+    def _logpdf(self, x):
+        y = -np.abs(x)
+        return y - 2. * sc.log1p(np.exp(y))
+
+    def _cdf(self, x):
+        return sc.expit(x)
+
+    def _logcdf(self, x):
+        return sc.log_expit(x)
+
+    def _ppf(self, q):
+        return sc.logit(q)
+
+    def _sf(self, x):
+        return sc.expit(-x)
+
+    def _logsf(self, x):
+        return sc.log_expit(-x)
+
+    def _isf(self, q):
+        return -sc.logit(q)
+
+    def _stats(self):
+        return 0, np.pi*np.pi/3.0, 0, 6.0/5.0
+
+    def _entropy(self):
+        # https://en.wikipedia.org/wiki/Logistic_distribution
+        return 2.0
+
+    @_call_super_mom
+    @inherit_docstring_from(rv_continuous)
+    def fit(self, data, *args, **kwds):
+        if kwds.pop('superfit', False):
+            return super().fit(data, *args, **kwds)
+
+        data, floc, fscale = _check_fit_input_parameters(self, data,
+                                                         args, kwds)
+        n = len(data)
+
+        # rv_continuous provided guesses
+        loc, scale = self._fitstart(data)
+        # these are trumped by user-provided guesses
+        loc, scale = kwds.get('loc', loc), kwds.get('scale', scale)
+
+        # the maximum likelihood estimators `a` and `b` of the location and
+        # scale parameters are roots of the two equations described in `func`.
+        # Source: Statistical Distributions, 3rd Edition. Evans, Hastings, and
+        # Peacock (2000), Page 130
+
+        def dl_dloc(loc, scale=fscale):
+            c = (data - loc) / scale
+            return np.sum(sc.expit(c)) - n/2
+
+        def dl_dscale(scale, loc=floc):
+            c = (data - loc) / scale
+            return np.sum(c*np.tanh(c/2)) - n
+
+        def func(params):
+            loc, scale = params
+            return dl_dloc(loc, scale), dl_dscale(scale, loc)
+
+        if fscale is not None and floc is None:
+            res = optimize.root(dl_dloc, (loc,))
+            loc = res.x[0]
+            scale = fscale
+        elif floc is not None and fscale is None:
+            res = optimize.root(dl_dscale, (scale,))
+            scale = res.x[0]
+            loc = floc
+        else:
+            res = optimize.root(func, (loc, scale))
+            loc, scale = res.x
+
+        # Note: gh-18176 reported data for which the reported MLE had
+        # `scale < 0`. To fix the bug, we return abs(scale). This is OK because
+        # `dl_dscale` and `dl_dloc` are even and odd functions of `scale`,
+        # respectively, so if `-scale` is a solution, so is `scale`.
+        scale = abs(scale)
+        return ((loc, scale) if res.success
+                else super().fit(data, *args, **kwds))
+
+
+# Module-level distribution instance, registered under the name 'logistic'.
+logistic = logistic_gen(name='logistic')
+
+
+class loggamma_gen(rv_continuous):
+    r"""A log gamma continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `loggamma` is:
+
+    .. math::
+
+        f(x, c) = \frac{\exp(c x - \exp(x))}
+                       {\Gamma(c)}
+
+    for all :math:`x, c > 0`. Here, :math:`\Gamma` is the
+    gamma function (`scipy.special.gamma`).
+
+    `loggamma` takes ``c`` as a shape parameter for :math:`c`.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+
+    def _shape_info(self):
+        return [_ShapeInfo("c", False, (0, np.inf), (False, False))]
+
+    def _rvs(self, c, size=None, random_state=None):
+        # Use the property of the gamma distribution Gamma(c)
+        #    Gamma(c) ~ Gamma(c + 1)*U**(1/c),
+        # where U is uniform on [0, 1]. (See, e.g.,
+        # G. Marsaglia and W.W. Tsang, "A simple method for generating gamma
+        # variables", https://doi.org/10.1145/358407.358414)
+        # So
+        #    log(Gamma(c)) ~ log(Gamma(c + 1)) + log(U)/c
+        # Generating a sample with this formulation is a bit slower
+        # than the more obvious log(Gamma(c)), but it avoids loss
+        # of precision when c << 1.
+        return (np.log(random_state.gamma(c + 1, size=size))
+                + np.log(random_state.uniform(size=size))/c)
+
+    def _pdf(self, x, c):
+        # loggamma.pdf(x, c) = exp(c*x-exp(x)) / gamma(c)
+        return np.exp(c*x-np.exp(x)-sc.gammaln(c))
+
+    def _logpdf(self, x, c):
+        return c*x - np.exp(x) - sc.gammaln(c)
+
+    def _cdf(self, x, c):
+        # This function is gammainc(c, exp(x)), where gammainc(c, z) is
+        # the regularized incomplete gamma function.
+        # The first term in a series expansion of gamminc(c, z) is
+        # z**c/Gamma(c+1); see 6.5.29 of Abramowitz & Stegun (and refer
+        # back to 6.5.1, 6.5.2 and 6.5.4 for the relevant notation).
+        # This can also be found in the wikipedia article
+        # https://en.wikipedia.org/wiki/Incomplete_gamma_function.
+        # Here we use that formula when x is sufficiently negative that
+        # exp(x) will result in subnormal numbers and lose precision.
+        # We evaluate the log of the expression first to allow the possible
+        # cancellation of the terms in the division, and then exponentiate.
+        # That is,
+        #     exp(x)**c/Gamma(c+1) = exp(log(exp(x)**c/Gamma(c+1)))
+        #                          = exp(c*x - gammaln(c+1))
+        return xpx.apply_where(
+            x < _LOGXMIN, (x, c),
+            lambda x, c: np.exp(c*x - sc.gammaln(c+1)),
+            lambda x, c: sc.gammainc(c, np.exp(x)))
+
+    def _ppf(self, q, c):
+        # The expression used when g < _XMIN inverts the one term expansion
+        # given in the comments of _cdf().
+        g = sc.gammaincinv(c, q)
+        return xpx.apply_where(
+            g < _XMIN, (g, q, c),
+            lambda g, q, c: (np.log(q) + sc.gammaln(c+1))/c,
+            lambda g, q, c: np.log(g))
+
+    def _sf(self, x, c):
+        # See the comments for _cdf() for how x < _LOGXMIN is handled.
+        return xpx.apply_where(
+            x < _LOGXMIN, (x, c),
+            lambda x, c: -np.expm1(c*x - sc.gammaln(c+1)),
+            lambda x, c: sc.gammaincc(c, np.exp(x)))
+
+    def _isf(self, q, c):
+        # The expression used when g < _XMIN inverts the complement of
+        # the one term expansion given in the comments of _cdf().
+        g = sc.gammainccinv(c, q)
+        return xpx.apply_where(
+            g < _XMIN, (g, q, c),
+            lambda g, q, c: (np.log1p(-q) + sc.gammaln(c+1))/c,
+            lambda g, q, c: np.log(g))
+
+    def _stats(self, c):
+        # See, for example, "A Statistical Study of Log-Gamma Distribution", by
+        # Ping Shing Chan (thesis, McMaster University, 1993).
+        mean = sc.digamma(c)
+        var = sc.polygamma(1, c)
+        skewness = sc.polygamma(2, c) / np.power(var, 1.5)
+        excess_kurtosis = sc.polygamma(3, c) / (var*var)
+        return mean, var, skewness, excess_kurtosis
+
+    def _entropy(self, c):
+        def regular(c):
+            h = sc.gammaln(c) - c * sc.digamma(c) + c
+            return h
+
+        def asymptotic(c):
+            # using asymptotic expansions for gammaln and psi (see gh-18093)
+            term = -0.5*np.log(c) + c**-1./6 - c**-3./90 + c**-5./210
+            h = norm._entropy() + term
+            return h
+
+        return xpx.apply_where(c >= 45, c, asymptotic, regular)
+
+
+# Module-level distribution instance, registered under the name 'loggamma'.
+loggamma = loggamma_gen(name='loggamma')
+
+
+class loglaplace_gen(rv_continuous):
+    r"""A log-Laplace continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `loglaplace` is:
+
+    .. math::
+
+        f(x, c) = \begin{cases}\frac{c}{2} x^{ c-1}  &\text{for } 0 < x < 1\\
+                               \frac{c}{2} x^{-c-1}  &\text{for } x \ge 1
+                  \end{cases}
+
+    for :math:`c > 0`.
+
+    `loglaplace` takes ``c`` as a shape parameter for :math:`c`.
+
+    %(after_notes)s
+
+    Suppose a random variable ``X`` follows the Laplace distribution with
+    location ``a`` and scale ``b``.  Then ``Y = exp(X)`` follows the
+    log-Laplace distribution with ``c = 1 / b`` and ``scale = exp(a)``.
+
+    References
+    ----------
+    T.J. Kozubowski and K. Podgorski, "A log-Laplace growth rate model",
+    The Mathematical Scientist, vol. 28, pp. 49-60, 2003.
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("c", False, (0, np.inf), (False, False))]
+
+    def _pdf(self, x, c):
+        # loglaplace.pdf(x, c) = c / 2 * x**(c-1),   for 0 < x < 1
+        #                      = c / 2 * x**(-c-1),  for x >= 1
+        cd2 = c/2.0
+        c = np.where(x < 1, c, -c)
+        return cd2*x**(c-1)
+
+    def _cdf(self, x, c):
+        return np.where(x < 1, 0.5*x**c, 1-0.5*x**(-c))
+
+    def _sf(self, x, c):
+        return np.where(x < 1, 1 - 0.5*x**c, 0.5*x**(-c))
+
+    def _ppf(self, q, c):
+        return np.where(q < 0.5, (2.0*q)**(1.0/c), (2*(1.0-q))**(-1.0/c))
+
+    def _isf(self, q, c):
+        return np.where(q > 0.5, (2.0*(1.0 - q))**(1.0/c), (2*q)**(-1.0/c))
+
+    def _munp(self, n, c):
+        with np.errstate(divide='ignore'):
+            c2, n2 = c**2, n**2
+            return np.where(n2 < c2, c2 / (c2 - n2), np.inf)
+
+    def _entropy(self, c):
+        return np.log(2.0/c) + 1.0
+
+    @_call_super_mom
+    @inherit_docstring_from(rv_continuous)
+    def fit(self, data, *args, **kwds):
+        data, fc, floc, fscale = _check_fit_input_parameters(self, data,
+                                                             args, kwds)
+
+        # Specialize MLE only when location is known.
+        if floc is None:
+            return super(type(self), self).fit(data, *args, **kwds)
+
+        # Raise an error if any observation has zero likelihood.
+        if np.any(data <= floc):
+            raise FitDataError("loglaplace", lower=floc, upper=np.inf)
+
+        # Remove location from data.
+        if floc != 0:
+            data = data - floc
+
+        # When location is zero, the log-Laplace distribution is related to
+        # the Laplace distribution in that if X ~ Laplace(loc=a, scale=b),
+        # then Y = exp(X) ~ LogLaplace(c=1/b, loc=0, scale=exp(a)).  It can
+        # be shown that the MLE for Y is the same as the MLE for X = ln(Y).
+        # Therefore, we reuse the formulas from laplace.fit() and transform
+        # the result back into log-laplace's parameter space.
+        a, b = laplace.fit(np.log(data),
+                           floc=np.log(fscale) if fscale is not None else None,
+                           fscale=1/fc if fc is not None else None,
+                           method='mle')
+        loc = floc
+        scale = np.exp(a) if fscale is None else fscale
+        c = 1 / b if fc is None else fc
+        return c, loc, scale
+
+# Module-level distribution instance; `a=0.0` is forwarded to the
+# rv_continuous constructor (presumably the lower support bound - confirm).
+loglaplace = loglaplace_gen(a=0.0, name='loglaplace')
+
+
+def _lognorm_logpdf(x, s):
+    return xpx.apply_where(
+        x != 0, (x, s),
+        lambda x, s: (-np.log(x)**2 / (2 * s**2)
+                      - np.log(s * x * np.sqrt(2 * np.pi))),
+        fill_value=-np.inf)
+
+
+class lognorm_gen(rv_continuous):
+    r"""A lognormal continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `lognorm` is:
+
+    .. math::
+
+        f(x, s) = \frac{1}{s x \sqrt{2\pi}}
+                  \exp\left(-\frac{\log^2(x)}{2s^2}\right)
+
+    for :math:`x > 0`, :math:`s > 0`.
+
+    `lognorm` takes ``s`` as a shape parameter for :math:`s`.
+
+    %(after_notes)s
+
+    Suppose a normally distributed random variable ``X`` has  mean ``mu`` and
+    standard deviation ``sigma``. Then ``Y = exp(X)`` is lognormally
+    distributed with ``s = sigma`` and ``scale = exp(mu)``.
+
+    %(example)s
+
+    The logarithm of a log-normally distributed random variable is
+    normally distributed:
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy import stats
+    >>> fig, ax = plt.subplots(1, 1)
+    >>> mu, sigma = 2, 0.5
+    >>> X = stats.norm(loc=mu, scale=sigma)
+    >>> Y = stats.lognorm(s=sigma, scale=np.exp(mu))
+    >>> x = np.linspace(*X.interval(0.999))
+    >>> y = Y.rvs(size=10000)
+    >>> ax.plot(x, X.pdf(x), label='X (pdf)')
+    >>> ax.hist(np.log(y), density=True, bins=x, label='log(Y) (histogram)')
+    >>> ax.legend()
+    >>> plt.show()
+
+    """
+    # Reuse the open-interval support mask defined on rv_continuous.
+    _support_mask = rv_continuous._open_support_mask
+
+    def _shape_info(self):
+        # One shape parameter, `s`, declared with domain bounds (0, inf).
+        return [_ShapeInfo("s", False, (0, np.inf), (False, False))]
+
+    def _rvs(self, s, size=None, random_state=None):
+        # exp of `s` times a standard normal draw.
+        return np.exp(s * random_state.standard_normal(size))
+
+    def _pdf(self, x, s):
+        # lognorm.pdf(x, s) = 1 / (s*x*sqrt(2*pi)) * exp(-1/2*(log(x)/s)**2)
+        return np.exp(self._logpdf(x, s))
+
+    def _logpdf(self, x, s):
+        return _lognorm_logpdf(x, s)
+
+    # The distribution functions below are the normal-distribution helpers
+    # evaluated at log(x)/s, and their inverses map back through exp.
+    def _cdf(self, x, s):
+        return _norm_cdf(np.log(x) / s)
+
+    def _logcdf(self, x, s):
+        return _norm_logcdf(np.log(x) / s)
+
+    def _ppf(self, q, s):
+        return np.exp(s * _norm_ppf(q))
+
+    def _sf(self, x, s):
+        return _norm_sf(np.log(x) / s)
+
+    def _logsf(self, x, s):
+        return _norm_logsf(np.log(x) / s)
+
+    def _isf(self, q, s):
+        return np.exp(s * _norm_isf(q))
+
+    def _stats(self, s):
+        # All four statistics are expressed through p = exp(s**2).
+        p = np.exp(s*s)
+        mu = np.sqrt(p)
+        mu2 = p*(p-1)
+        g1 = np.sqrt(p-1)*(2+p)
+        # polyval evaluates p**4 + 2*p**3 + 3*p**2 - 6.
+        g2 = np.polyval([1, 2, 3, 0, -6.0], p)
+        return mu, mu2, g1, g2
+
+    def _entropy(self, s):
+        return 0.5 * (1 + np.log(2*np.pi) + 2 * np.log(s))
+
+    @_call_super_mom
+    @extend_notes_in_docstring(rv_continuous, notes="""\
+        When `method='MLE'` and
+        the location parameter is fixed by using the `floc` argument,
+        this function uses explicit formulas for the maximum likelihood
+        estimation of the log-normal shape and scale parameters, so the
+        `optimizer`, `loc` and `scale` keyword arguments are ignored.
+        If the location is free, a likelihood maximum is found by
+        setting its partial derivative wrt to location to 0, and
+        solving by substituting the analytical expressions of shape
+        and scale (or provided parameters).
+        See, e.g., equation 3.1 in
+        A. Clifford Cohen & Betty Jones Whitten (1980)
+        Estimation in the Three-Parameter Lognormal Distribution,
+        Journal of the American Statistical Association, 75:370, 399-404
+        https://doi.org/10.2307/2287466
+        \n\n""")
+    def fit(self, data, *args, **kwds):
+        # `superfit=True` bypasses the specialised solver below and goes
+        # straight to the parent class's fit.
+        if kwds.pop('superfit', False):
+            return super().fit(data, *args, **kwds)
+
+        parameters = _check_fit_input_parameters(self, data, args, kwds)
+        data, fshape, floc, fscale = parameters
+        data_min = np.min(data)
+
+        def get_shape_scale(loc):
+            # Calculate maximum likelihood scale and shape with analytical
+            # formulas unless provided by the user
+            # `lndata` is only defined when at least one of shape/scale has
+            # to be estimated; the `or` expressions below short-circuit
+            # before touching it when both are fixed.
+            # NOTE(review): because of `or`, a falsy fixed value (e.g. 0) is
+            # treated the same as "not provided".
+            if fshape is None or fscale is None:
+                lndata = np.log(data - loc)
+            scale = fscale or np.exp(lndata.mean())
+            shape = fshape or np.sqrt(np.mean((lndata - np.log(scale))**2))
+            return shape, scale
+
+        def dL_dLoc(loc):
+            # Derivative of (positive) LL w.r.t. loc
+            shape, scale = get_shape_scale(loc)
+            shifted = data - loc
+            return np.sum((1 + np.log(shifted/scale)/shape**2)/shifted)
+
+        def ll(loc):
+            # (Positive) log-likelihood
+            shape, scale = get_shape_scale(loc)
+            return -self.nnlf((shape, loc, scale), data)
+
+        if floc is None:
+            # The location must be less than the minimum of the data.
+            # Back off a bit to avoid numerical issues.
+            spacing = np.spacing(data_min)
+            rbrack = data_min - spacing
+
+            # Find the right end of the bracket by successive doubling of the
+            # distance to data_min. We're interested in a maximum LL, so the
+            # slope dL_dLoc_rbrack should be negative at the right end.
+            # optimization for later: share shape, scale
+            dL_dLoc_rbrack = dL_dLoc(rbrack)
+            # Log-likelihood at the initial candidate `data_min - spacing`;
+            # compared against the root's log-likelihood at the end.
+            ll_rbrack = ll(rbrack)
+            delta = 2 * spacing  # 2 * (data_min - rbrack)
+            # The loop ends once the slope drops below -1e-6, or once the
+            # comparison fails because the slope became NaN; the latter case
+            # is caught by the finiteness check right after the loop.
+            while dL_dLoc_rbrack >= -1e-6:
+                rbrack = data_min - delta
+                dL_dLoc_rbrack = dL_dLoc(rbrack)
+                delta *= 2
+
+            if not np.isfinite(rbrack) or not np.isfinite(dL_dLoc_rbrack):
+                # If we never find a negative slope, either we missed it or the
+                # slope is always positive. It's usually the latter,
+                # which means
+                # loc = data_min - spacing
+                # But sometimes when shape and/or scale are fixed there are
+                # other issues, so be cautious.
+                return super().fit(data, *args, **kwds)
+
+            # Now find the left end of the bracket. Guess is `rbrack-1`
+            # unless that is too small of a difference to resolve. Double
+            # the size of the interval until the left end is found.
+            lbrack = np.minimum(np.nextafter(rbrack, -np.inf), rbrack-1)
+            dL_dLoc_lbrack = dL_dLoc(lbrack)
+            delta = 2 * (rbrack - lbrack)
+            # Keep widening while both ends have the same slope sign, i.e.
+            # while the interval does not yet bracket a sign change.
+            while (np.isfinite(lbrack) and np.isfinite(dL_dLoc_lbrack)
+                   and np.sign(dL_dLoc_lbrack) == np.sign(dL_dLoc_rbrack)):
+                lbrack = rbrack - delta
+                dL_dLoc_lbrack = dL_dLoc(lbrack)
+                delta *= 2
+
+            # I don't recall observing this, but just in case...
+            if not np.isfinite(lbrack) or not np.isfinite(dL_dLoc_lbrack):
+                return super().fit(data, *args, **kwds)
+
+            # If we have a valid bracket, find the root
+            res = root_scalar(dL_dLoc, bracket=(lbrack, rbrack))
+            if not res.converged:
+                return super().fit(data, *args, **kwds)
+
+            # If the slope was positive near the minimum of the data,
+            # the maximum LL could be there instead of at the root. Compare
+            # the LL of the two points to decide.
+            ll_root = ll(res.root)
+            loc = res.root if ll_root > ll_rbrack else data_min-spacing
+
+        else:
+            # A fixed location must lie strictly below every observation.
+            if floc >= data_min:
+                raise FitDataError("lognorm", lower=0., upper=np.inf)
+            loc = floc
+
+        shape, scale = get_shape_scale(loc)
+        # Hand over to the parent class's fit if the analytical result is
+        # not a valid parameter set.
+        if not (self._argcheck(shape) and scale > 0):
+            return super().fit(data, *args, **kwds)
+        return shape, loc, scale
+
+
+lognorm = lognorm_gen(a=0.0, name='lognorm')
+
+
+class gibrat_gen(rv_continuous):
+    r"""A Gibrat continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `gibrat` is:
+
+    .. math::
+
+        f(x) = \frac{1}{x \sqrt{2\pi}} \exp(-\frac{1}{2} (\log(x))^2)
+
+    for :math:`x >= 0`.
+
+    `gibrat` is a special case of `lognorm` with ``s=1``.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    _support_mask = rv_continuous._open_support_mask
+
+    def _shape_info(self):
+        return []  # no shape parameters
+
+    def _rvs(self, size=None, random_state=None):
+        return np.exp(random_state.standard_normal(size))  # exp(N(0, 1))
+
+    def _pdf(self, x):
+        # gibrat.pdf(x) = 1/(x*sqrt(2*pi)) * exp(-1/2*(log(x))**2)
+        return np.exp(self._logpdf(x))
+
+    def _logpdf(self, x):
+        return _lognorm_logpdf(x, 1.0)  # lognorm log-density with s fixed at 1
+
+    def _cdf(self, x):
+        return _norm_cdf(np.log(x))
+
+    def _ppf(self, q):
+        return np.exp(_norm_ppf(q))
+
+    def _sf(self, x):
+        return _norm_sf(np.log(x))
+
+    def _isf(self, p):
+        return np.exp(_norm_isf(p))
+
+    def _stats(self):
+        p = np.e  # every statistic below is a closed form in p = e
+        mu = np.sqrt(p)
+        mu2 = p * (p - 1)
+        g1 = np.sqrt(p - 1) * (2 + p)
+        g2 = np.polyval([1, 2, 3, 0, -6.0], p)  # p**4 + 2*p**3 + 3*p**2 - 6
+        return mu, mu2, g1, g2
+
+    def _entropy(self):
+        return 0.5 * np.log(2 * np.pi) + 0.5
+
+
+gibrat = gibrat_gen(a=0.0, name='gibrat')
+
+
+class maxwell_gen(rv_continuous):
+    r"""A Maxwell continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    A special case of a `chi` distribution, with ``df=3``, ``loc=0.0``,
+    and given ``scale = a``, where ``a`` is the parameter used in the
+    Mathworld description [1]_.
+
+    The probability density function for `maxwell` is:
+
+    .. math::
+
+        f(x) = \sqrt{2/\pi}x^2 \exp(-x^2/2)
+
+    for :math:`x >= 0`.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] http://mathworld.wolfram.com/MaxwellDistribution.html
+
+    %(example)s
+    """
+    def _shape_info(self):
+        return []  # no shape parameters
+
+    def _rvs(self, size=None, random_state=None):
+        return chi.rvs(3.0, size=size, random_state=random_state)  # chi, df=3
+
+    def _pdf(self, x):
+        # maxwell.pdf(x) = sqrt(2/pi)x**2 * exp(-x**2/2)
+        return _SQRT_2_OVER_PI*x*x*np.exp(-x*x/2.0)
+
+    def _logpdf(self, x):
+        # Allow x=0 without 'divide by zero' warnings
+        with np.errstate(divide='ignore'):
+            return _LOG_SQRT_2_OVER_PI + 2*np.log(x) - 0.5*x*x
+
+    def _cdf(self, x):
+        return sc.gammainc(1.5, x*x/2.0)
+
+    def _ppf(self, q):
+        return np.sqrt(2*sc.gammaincinv(1.5, q))  # inverts _cdf
+
+    def _sf(self, x):
+        return sc.gammaincc(1.5, x*x/2.0)
+
+    def _isf(self, q):
+        return np.sqrt(2*sc.gammainccinv(1.5, q))  # inverts _sf
+
+    def _stats(self):
+        val = 3*np.pi-8  # shared by the denominators of the last two entries
+        return (2*np.sqrt(2.0/np.pi),
+                3-8/np.pi,
+                np.sqrt(2)*(32-10*np.pi)/val**1.5,
+                (-12*np.pi*np.pi + 160*np.pi - 384) / val**2.0)
+
+    def _entropy(self):
+        return _EULER + 0.5*np.log(2*np.pi)-0.5
+
+
+maxwell = maxwell_gen(a=0.0, name='maxwell')
+
+
+class mielke_gen(rv_continuous):
+    r"""A Mielke Beta-Kappa / Dagum continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `mielke` is:
+
+    .. math::
+
+        f(x, k, s) = \frac{k x^{k-1}}{(1+x^s)^{1+k/s}}
+
+    for :math:`x > 0` and :math:`k, s > 0`. The distribution is sometimes
+    called Dagum distribution ([2]_). It was already defined in [3]_, called
+    a Burr Type III distribution (`burr` with parameters ``c=s`` and
+    ``d=k/s``).
+
+    `mielke` takes ``k`` and ``s`` as shape parameters.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] Mielke, P.W., 1973 "Another Family of Distributions for Describing
+           and Analyzing Precipitation Data." J. Appl. Meteor., 12, 275-280
+    .. [2] Dagum, C., 1977 "A new model for personal income distribution."
+           Economie Appliquee, 33, 327-367.
+    .. [3] Burr, I. W. "Cumulative frequency functions", Annals of
+           Mathematical Statistics, 13(2), pp 215-232 (1942).
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        ik = _ShapeInfo("k", False, (0, np.inf), (False, False))
+        i_s = _ShapeInfo("s", False, (0, np.inf), (False, False))
+        return [ik, i_s]
+
+    def _pdf(self, x, k, s):
+        return k*x**(k-1.0) / (1.0+x**s)**(1.0+k*1.0/s)
+
+    def _logpdf(self, x, k, s):
+        # Allow x=0 without 'divide by zero' warnings.
+        with np.errstate(divide='ignore'):
+            return np.log(k) + np.log(x)*(k - 1) - np.log1p(x**s)*(1 + k/s)
+
+    def _cdf(self, x, k, s):
+        return x**k / (1.0+x**s)**(k*1.0/s)
+
+    def _ppf(self, q, k, s):
+        qsk = pow(q, s*1.0/k)  # q**(s/k), i.e. x**s / (1 + x**s) per _cdf
+        return pow(qsk/(1.0-qsk), 1.0/s)
+
+    def _munp(self, n, k, s):
+        def nth_moment(n, k, s):
+            # n-th moment is defined for -k < n < s; n >= s is filled with inf
+            return sc.gamma((k+n)/s)*sc.gamma(1-n/s)/sc.gamma(k/s)
+
+        return xpx.apply_where(n < s, (n, k, s), nth_moment, fill_value=np.inf)
+
+
+mielke = mielke_gen(a=0.0, name='mielke')
+
+
+class kappa4_gen(rv_continuous):
+    r"""Kappa 4 parameter distribution.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for kappa4 is:
+
+    .. math::
+
+        f(x, h, k) = (1 - k x)^{1/k - 1} (1 - h (1 - k x)^{1/k})^{1/h-1}
+
+    if :math:`h` and :math:`k` are not equal to 0.
+
+    If :math:`h` or :math:`k` are zero then the pdf can be simplified:
+
+    :math:`h = 0` and :math:`k \neq 0`::
+
+        kappa4.pdf(x, h, k) = (1.0 - k*x)**(1.0/k - 1.0)*
+                              exp(-(1.0 - k*x)**(1.0/k))
+
+    :math:`h \neq 0` and :math:`k = 0`::
+
+        kappa4.pdf(x, h, k) = exp(-x)*(1.0 - h*exp(-x))**(1.0/h - 1.0)
+
+    :math:`h = 0` and :math:`k = 0`::
+
+        kappa4.pdf(x, h, k) = exp(-x)*exp(-exp(-x))
+
+    kappa4 takes :math:`h` and :math:`k` as shape parameters.
+
+    The kappa4 distribution returns other distributions when certain
+    :math:`h` and :math:`k` values are used.
+
+    +------+-------------+----------------+------------------+
+    | h    | k=0.0       | k=1.0          | -inf<=k<=inf     |
+    +======+=============+================+==================+
+    | -1.0 | Logistic    |                | Generalized      |
+    |      |             |                | Logistic(1)      |
+    |      |             |                |                  |
+    |      | logistic(x) |                |                  |
+    +------+-------------+----------------+------------------+
+    |  0.0 | Gumbel      | Reverse        | Generalized      |
+    |      |             | Exponential(2) | Extreme Value    |
+    |      |             |                |                  |
+    |      | gumbel_r(x) |                | genextreme(x, k) |
+    +------+-------------+----------------+------------------+
+    |  1.0 | Exponential | Uniform        | Generalized      |
+    |      |             |                | Pareto           |
+    |      |             |                |                  |
+    |      | expon(x)    | uniform(x)     | genpareto(x, -k) |
+    +------+-------------+----------------+------------------+
+
+    (1) There are at least five generalized logistic distributions.
+        Four are described here:
+        https://en.wikipedia.org/wiki/Generalized_logistic_distribution
+        The "fifth" one is the one kappa4 should match which currently
+        isn't implemented in scipy:
+        https://en.wikipedia.org/wiki/Talk:Generalized_logistic_distribution
+        https://www.mathwave.com/help/easyfit/html/analyses/distributions/gen_logistic.html
+    (2) This distribution is currently not in scipy.
+
+    References
+    ----------
+    J.C. Finney, "Optimization of a Skewed Logistic Distribution With Respect
+    to the Kolmogorov-Smirnov Test", A Dissertation Submitted to the Graduate
+    Faculty of the Louisiana State University and Agricultural and Mechanical
+    College, (August, 2004),
+    https://digitalcommons.lsu.edu/gradschool_dissertations/3672
+
+    J.R.M. Hosking, "The four-parameter kappa distribution". IBM J. Res.
+    Develop. 38 (3), 251-258 (1994).
+
+    B. Kumphon, A. Kaew-Man, P. Seenoi, "A Rainfall Distribution for the Lampao
+    Site in the Chi River Basin, Thailand", Journal of Water Resource and
+    Protection, vol. 4, 866-869, (2012).
+    :doi:`10.4236/jwarp.2012.410101`
+
+    C. Winchester, "On Estimation of the Four-Parameter Kappa Distribution", A
+    Thesis Submitted to Dalhousie University, Halifax, Nova Scotia, (March
+    2000).
+    http://www.nlc-bnc.ca/obj/s4/f2/dsk2/ftp01/MQ57336.pdf
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _argcheck(self, h, k):
+        shape = np.broadcast_arrays(h, k)[0].shape
+        return np.full(shape, fill_value=True)  # every (h, k) pair is accepted
+
+    def _shape_info(self):
+        ih = _ShapeInfo("h", False, (-np.inf, np.inf), (False, False))
+        ik = _ShapeInfo("k", False, (-np.inf, np.inf), (False, False))
+        return [ih, ik]
+
+    def _get_support(self, h, k):
+        condlist = [np.logical_and(h > 0, k > 0),
+                    np.logical_and(h > 0, k == 0),
+                    np.logical_and(h > 0, k < 0),
+                    np.logical_and(h <= 0, k > 0),
+                    np.logical_and(h <= 0, k == 0),
+                    np.logical_and(h <= 0, k < 0)]
+
+        def f0(h, k):
+            return (1.0 - np.float_power(h, -k))/k
+
+        def f1(h, k):
+            return np.log(h)
+
+        def f3(h, k):
+            a = np.empty(np.shape(h))
+            a[:] = -np.inf
+            return a
+
+        def f5(h, k):
+            return 1.0/k
+
+        _a = _lazyselect(condlist,
+                         [f0, f1, f0, f3, f3, f5],
+                         [h, k],
+                         default=np.nan)
+
+        def f0(h, k):  # rebinds f0; `_a` above was built with the earlier f0
+            return 1.0/k
+
+        def f1(h, k):  # rebinds f1 as well; only `_b` below uses this version
+            a = np.empty(np.shape(h))
+            a[:] = np.inf
+            return a
+
+        _b = _lazyselect(condlist,
+                         [f0, f1, f1, f0, f1, f1],
+                         [h, k],
+                         default=np.nan)
+        return _a, _b
+
+    def _pdf(self, x, h, k):
+        # kappa4.pdf(x, h, k) = (1.0 - k*x)**(1.0/k - 1.0)*
+        #                       (1.0 - h*(1.0 - k*x)**(1.0/k))**(1.0/h-1)
+        return np.exp(self._logpdf(x, h, k))
+
+    def _logpdf(self, x, h, k):
+        condlist = [np.logical_and(h != 0, k != 0),
+                    np.logical_and(h == 0, k != 0),
+                    np.logical_and(h != 0, k == 0),
+                    np.logical_and(h == 0, k == 0)]
+
+        def f0(x, h, k):
+            '''pdf = (1.0 - k*x)**(1.0/k - 1.0)*(
+                      1.0 - h*(1.0 - k*x)**(1.0/k))**(1.0/h-1.0)
+               logpdf = ...
+            '''
+            return (sc.xlog1py(1.0/k - 1.0, -k*x) +
+                    sc.xlog1py(1.0/h - 1.0, -h*(1.0 - k*x)**(1.0/k)))
+
+        def f1(x, h, k):
+            '''pdf = (1.0 - k*x)**(1.0/k - 1.0)*np.exp(-(
+                      1.0 - k*x)**(1.0/k))
+               logpdf = ...
+            '''
+            return sc.xlog1py(1.0/k - 1.0, -k*x) - (1.0 - k*x)**(1.0/k)
+
+        def f2(x, h, k):
+            '''pdf = np.exp(-x)*(1.0 - h*np.exp(-x))**(1.0/h - 1.0)
+               logpdf = ...
+            '''
+            return -x + sc.xlog1py(1.0/h - 1.0, -h*np.exp(-x))
+
+        def f3(x, h, k):
+            '''pdf = np.exp(-x-np.exp(-x))
+               logpdf = ...
+            '''
+            return -x - np.exp(-x)
+
+        return _lazyselect(condlist,
+                           [f0, f1, f2, f3],
+                           [x, h, k],
+                           default=np.nan)
+
+    def _cdf(self, x, h, k):
+        return np.exp(self._logcdf(x, h, k))
+
+    def _logcdf(self, x, h, k):
+        condlist = [np.logical_and(h != 0, k != 0),
+                    np.logical_and(h == 0, k != 0),
+                    np.logical_and(h != 0, k == 0),
+                    np.logical_and(h == 0, k == 0)]
+
+        def f0(x, h, k):
+            '''cdf = (1.0 - h*(1.0 - k*x)**(1.0/k))**(1.0/h)
+               logcdf = ...
+            '''
+            return (1.0/h)*sc.log1p(-h*(1.0 - k*x)**(1.0/k))
+
+        def f1(x, h, k):
+            '''cdf = np.exp(-(1.0 - k*x)**(1.0/k))
+               logcdf = ...
+            '''
+            return -(1.0 - k*x)**(1.0/k)
+
+        def f2(x, h, k):
+            '''cdf = (1.0 - h*np.exp(-x))**(1.0/h)
+               logcdf = ...
+            '''
+            return (1.0/h)*sc.log1p(-h*np.exp(-x))
+
+        def f3(x, h, k):
+            '''cdf = np.exp(-np.exp(-x))
+               logcdf = ...
+            '''
+            return -np.exp(-x)
+
+        return _lazyselect(condlist,
+                           [f0, f1, f2, f3],
+                           [x, h, k],
+                           default=np.nan)
+
+    def _ppf(self, q, h, k):
+        condlist = [np.logical_and(h != 0, k != 0),
+                    np.logical_and(h == 0, k != 0),
+                    np.logical_and(h != 0, k == 0),
+                    np.logical_and(h == 0, k == 0)]
+
+        def f0(q, h, k):
+            return 1.0/k*(1.0 - ((1.0 - (q**h))/h)**k)
+
+        def f1(q, h, k):
+            return 1.0/k*(1.0 - (-np.log(q))**k)
+
+        def f2(q, h, k):
+            '''ppf = -np.log((1.0 - (q**h))/h)
+            '''
+            return -sc.log1p(-(q**h)) + np.log(h)
+
+        def f3(q, h, k):
+            return -np.log(-np.log(q))
+
+        return _lazyselect(condlist,
+                           [f0, f1, f2, f3],
+                           [q, h, k],
+                           default=np.nan)
+
+    def _get_stats_info(self, h, k):
+        condlist = [
+            np.logical_and(h < 0, k >= 0),
+            k < 0,
+        ]
+
+        def f0(h, k):
+            return (-1.0/h*k).astype(int)  # NOTE(review): (-1/h)*k, not -1/(h*k)?
+
+        def f1(h, k):
+            return (-1.0/k).astype(int)
+
+        return _lazyselect(condlist, [f0, f1], [h, k], default=5)
+
+    def _stats(self, h, k):
+        maxr = self._get_stats_info(h, k)
+        outputs = [None if np.any(r < maxr) else np.nan for r in range(1, 5)]
+        return outputs[:]  # None where some r < maxr, nan otherwise (r = 1..4)
+
+    def _mom1_sc(self, m, *args):
+        maxr = self._get_stats_info(args[0], args[1])  # args are (h, k)
+        if m >= maxr:
+            return np.nan  # orders at or above maxr are reported as nan
+        return integrate.quad(self._mom_integ1, 0, 1, args=(m,)+args)[0]
+
+
+kappa4 = kappa4_gen(name='kappa4')
+
+
+class kappa3_gen(rv_continuous):
+    r"""Kappa 3 parameter distribution.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `kappa3` is:
+
+    .. math::
+
+        f(x, a) = a (a + x^a)^{-(a + 1)/a}
+
+    for :math:`x > 0` and :math:`a > 0`.
+
+    `kappa3` takes ``a`` as a shape parameter for :math:`a`.
+
+    References
+    ----------
+    P.W. Mielke and E.S. Johnson, "Three-Parameter Kappa Distribution Maximum
+    Likelihood and Likelihood Ratio Tests", Methods in Weather Research,
+    701-707, (September, 1973),
+    :doi:`10.1175/1520-0493(1973)101<0701:TKDMLE>2.3.CO;2`
+
+    B. Kumphon, "Maximum Entropy and Maximum Likelihood Estimation for the
+    Three-Parameter Kappa Distribution", Open Journal of Statistics, vol 2,
+    415-419 (2012), :doi:`10.4236/ojs.2012.24050`
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("a", False, (0, np.inf), (False, False))]
+
+    def _pdf(self, x, a):
+        # kappa3.pdf(x, a) = a*(a + x**a)**(-(a + 1)/a),     for x > 0
+        return a*(a + x**a)**(-1.0/a-1)
+
+    def _cdf(self, x, a):
+        return x*(a + x**a)**(-1.0/a)
+
+    def _sf(self, x, a):
+        x, a = np.broadcast_arrays(x, a)  # some code paths pass scalars
+        sf = super()._sf(x, a)
+
+        # When the SF is small, another formulation is typically more accurate.
+        # However, it blows up for large `a`, so use it only if it also returns
+        # a small value of the SF.
+        cutoff = 0.01
+        i = sf < cutoff  # entries where the inherited SF is below the cutoff
+        sf2 = -sc.expm1(sc.xlog1py(-1.0 / a[i], a[i] * x[i]**-a[i]))
+        i2 = sf2 > cutoff  # alternative disagrees (not small): treat as bad
+        sf2[i2] = sf[i][i2]  # replace bad values with original values
+
+        sf[i] = sf2
+        return sf
+
+    def _ppf(self, q, a):
+        return (a/(q**-a - 1.0))**(1.0/a)
+
+    def _isf(self, q, a):
+        lg = sc.xlog1py(-a, -q)  # -a*log1p(-q), i.e. log((1 - q)**-a)
+        denom = sc.expm1(lg)  # (1 - q)**-a - 1, matching the _ppf denominator
+        return (a / denom)**(1.0 / a)
+
+    def _stats(self, a):
+        outputs = [None if np.any(i < a) else np.nan for i in range(1, 5)]
+        return outputs[:]  # None where some i < a, nan otherwise (i = 1..4)
+
+    def _mom1_sc(self, m, *args):
+        if np.any(m >= args[0]):  # args[0] is the shape parameter a
+            return np.nan
+        return integrate.quad(self._mom_integ1, 0, 1, args=(m,)+args)[0]
+
+
+kappa3 = kappa3_gen(a=0.0, name='kappa3')
+
+
+class moyal_gen(rv_continuous):
+    r"""A Moyal continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `moyal` is:
+
+    .. math::
+
+        f(x) = \exp(-(x + \exp(-x))/2) / \sqrt{2\pi}
+
+    for a real number :math:`x`.
+
+    %(after_notes)s
+
+    This distribution has utility in high-energy physics and radiation
+    detection. It describes the energy loss of a charged relativistic
+    particle due to ionization of the medium [1]_. It also provides an
+    approximation for the Landau distribution. For an in-depth description
+    see [2]_. For additional description, see [3]_.
+
+    References
+    ----------
+    .. [1] J.E. Moyal, "XXX. Theory of ionization fluctuations",
+           The London, Edinburgh, and Dublin Philosophical Magazine
+           and Journal of Science, vol 46, 263-280, (1955).
+           :doi:`10.1080/14786440308521076` (gated)
+    .. [2] G. Cordeiro et al., "The beta Moyal: A useful skew distribution",
+           International Journal of Research and Reviews in Applied Sciences,
+           vol 10, 171-192, (2012).
+           https://www.arpapress.com/files/volumes/vol10issue2/ijrras_10_2_02.pdf
+    .. [3] C. Walck, "Handbook on Statistical Distributions for
+           Experimentalists; International Report SUF-PFY/96-01", Chapter 26,
+           University of Stockholm: Stockholm, Sweden, (2007).
+           http://www.stat.rice.edu/~dobelman/textfiles/DistributionsHandbook.pdf
+
+    .. versionadded:: 1.1.0
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return []  # no shape parameters
+
+    def _rvs(self, size=None, random_state=None):
+        u1 = gamma.rvs(a=0.5, scale=2, size=size,
+                       random_state=random_state)
+        return -np.log(u1)  # negative log of the gamma(a=0.5, scale=2) draw
+
+    def _pdf(self, x):
+        return np.exp(-0.5 * (x + np.exp(-x))) / np.sqrt(2*np.pi)
+
+    def _cdf(self, x):
+        return sc.erfc(np.exp(-0.5 * x) / np.sqrt(2))
+
+    def _sf(self, x):
+        return sc.erf(np.exp(-0.5 * x) / np.sqrt(2))  # 1 - erfc(...) of _cdf
+
+    def _ppf(self, x):
+        return -np.log(2 * sc.erfcinv(x)**2)  # inverts _cdf; x is a probability
+
+    def _stats(self):
+        mu = np.log(2) + np.euler_gamma
+        mu2 = np.pi**2 / 2
+        g1 = 28 * np.sqrt(2) * sc.zeta(3) / np.pi**3
+        g2 = 4.
+        return mu, mu2, g1, g2
+
+    def _munp(self, n):
+        if n == 1.0:
+            return np.log(2) + np.euler_gamma
+        elif n == 2.0:
+            return np.pi**2 / 2 + (np.log(2) + np.euler_gamma)**2
+        elif n == 3.0:
+            tmp1 = 1.5 * np.pi**2 * (np.log(2)+np.euler_gamma)
+            tmp2 = (np.log(2)+np.euler_gamma)**3
+            tmp3 = 14 * sc.zeta(3)
+            return tmp1 + tmp2 + tmp3
+        elif n == 4.0:
+            tmp1 = 4 * 14 * sc.zeta(3) * (np.log(2) + np.euler_gamma)
+            tmp2 = 3 * np.pi**2 * (np.log(2) + np.euler_gamma)**2
+            tmp3 = (np.log(2) + np.euler_gamma)**4
+            tmp4 = 7 * np.pi**4 / 4
+            return tmp1 + tmp2 + tmp3 + tmp4
+        else:
+            # Only orders 1-4 have a closed form above; any other order is
+            # delegated to the inherited `_mom1_sc`.
+            return self._mom1_sc(n)
+
+
+moyal = moyal_gen(name="moyal")
+
+
+class nakagami_gen(rv_continuous):
+    r"""A Nakagami continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `nakagami` is:
+
+    .. math::
+
+        f(x, \nu) = \frac{2 \nu^\nu}{\Gamma(\nu)} x^{2\nu-1} \exp(-\nu x^2)
+
+    for :math:`x >= 0`, :math:`\nu > 0`. The distribution was introduced in
+    [2]_, see also [1]_ for further information.
+
+    `nakagami` takes ``nu`` as a shape parameter for :math:`\nu`.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] "Nakagami distribution", Wikipedia
+           https://en.wikipedia.org/wiki/Nakagami_distribution
+    .. [2] M. Nakagami, "The m-distribution - A general formula of intensity
+           distribution of rapid fading", Statistical methods in radio wave
+           propagation, Pergamon Press, 1960, 3-36.
+           :doi:`10.1016/B978-0-08-009306-2.50005-4`
+
+    %(example)s
+
+    """
+    def _argcheck(self, nu):
+        return nu > 0
+
+    def _shape_info(self):
+        return [_ShapeInfo("nu", False, (0, np.inf), (False, False))]
+
+    def _pdf(self, x, nu):
+        return np.exp(self._logpdf(x, nu))
+
+    def _logpdf(self, x, nu):
+        # nakagami.pdf(x, nu) = 2 * nu**nu / gamma(nu) *
+        #                       x**(2*nu-1) * exp(-nu*x**2)
+        return (np.log(2) + sc.xlogy(nu, nu) - sc.gammaln(nu) +
+                sc.xlogy(2*nu - 1, x) - nu*x**2)
+
+    def _cdf(self, x, nu):
+        return sc.gammainc(nu, nu*x*x)
+
+    def _ppf(self, q, nu):
+        return np.sqrt(1.0/nu*sc.gammaincinv(nu, q))  # inverts _cdf
+
+    def _sf(self, x, nu):
+        return sc.gammaincc(nu, nu*x*x)
+
+    def _isf(self, p, nu):
+        return np.sqrt(1/nu * sc.gammainccinv(nu, p))  # inverts _sf
+
+    def _stats(self, nu):
+        mu = sc.poch(nu, 0.5)/np.sqrt(nu)
+        mu2 = 1.0-mu*mu
+        g1 = mu * (1 - 4*nu*mu2) / 2.0 / nu / np.power(mu2, 1.5)
+        g2 = -6*mu**4*nu + (8*nu-2)*mu**2-2*nu + 1
+        g2 /= nu*mu2**2.0
+        return mu, mu2, g1, g2
+
+    def _entropy(self, nu):
+        shape = np.shape(nu)
+        # because somehow this isn't taken care of by the infrastructure...
+        nu = np.atleast_1d(nu)
+        A = sc.gammaln(nu)
+        B = nu - (nu - 0.5) * sc.digamma(nu)
+        C = -0.5 * np.log(nu) - np.log(2)
+        h = A + B + C
+        # This is the asymptotic sum of A and B (see gh-17868)
+        norm_entropy = stats.norm._entropy()
+        # Above, this is lost to rounding error for large nu, so use the
+        # asymptotic sum when the approximation becomes accurate
+        i = nu > 5e4  # roundoff error ~ approximation error
+        # -1 / (12 * nu) is the O(1/nu) term; see gh-17929
+        h[i] = C[i] + norm_entropy - 1/(12*nu[i])
+        return h.reshape(shape)[()]
+
+    def _rvs(self, nu, size=None, random_state=None):
+        # this relationship can be found in [1] or by a direct calculation
+        return np.sqrt(random_state.standard_gamma(nu, size=size) / nu)
+
+    def _fitstart(self, data, args=None):
+        if isinstance(data, CensoredData):
+            data = data._uncensor()
+        if args is None:
+            args = (1.0,) * self.numargs  # default every shape guess to 1.0
+        # Analytically justified estimates
+        # see: https://docs.scipy.org/doc/scipy/reference/tutorial/stats/continuous_nakagami.html
+        loc = np.min(data)  # smallest observation
+        scale = np.sqrt(np.sum((data - loc)**2) / len(data))  # RMS offset from loc
+        return args + (loc, scale)
+
+
+nakagami = nakagami_gen(a=0.0, name="nakagami")
+
+
+# The function name ncx2 is an abbreviation for noncentral chi squared.
+def _ncx2_log_pdf(x, df, nc):
+    # We use (xs**2 + ns**2)/2 = (xs - ns)**2/2  + xs*ns, and include the
+    # factor of exp(-xs*ns) into the ive function to improve numerical
+    # stability at large values of xs. See also `rice.pdf`.
+    df2 = df/2.0 - 1.0  # (df - 2)/2: the Bessel order passed to `ive` below
+    xs, ns = np.sqrt(x), np.sqrt(nc)
+    res = sc.xlogy(df2/2.0, x/nc) - 0.5*(xs - ns)**2
+    corr = sc.ive(df2, xs*ns) / 2.0  # the /2 carries the pdf's leading 1/2
+    # Return res + np.log(corr) avoiding np.log(0)
+    return xpx.apply_where(
+        corr > 0,
+        (res, corr),
+        lambda r, c: r + np.log(c),
+        fill_value=-np.inf)
+
+
+class ncx2_gen(rv_continuous):
+    r"""A non-central chi-squared continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `ncx2` is:
+
+    .. math::
+
+        f(x, k, \lambda) = \frac{1}{2} \exp(-(\lambda+x)/2)
+            (x/\lambda)^{(k-2)/4}  I_{(k-2)/2}(\sqrt{\lambda x})
+
+    for :math:`x >= 0`, :math:`k > 0` and :math:`\lambda \ge 0`.
+    :math:`k` specifies the degrees of freedom (denoted ``df`` in the
+    implementation) and :math:`\lambda` is the non-centrality parameter
+    (denoted ``nc`` in the implementation). :math:`I_\nu` denotes the
+    modified Bessel function of the first kind of order :math:`\nu`
+    (`scipy.special.iv`).
+
+    `ncx2` takes ``df`` and ``nc`` as shape parameters.
+
+    This distribution uses routines from the Boost Math C++ library for
+    the computation of the ``pdf``, ``cdf``, ``ppf``, ``sf`` and ``isf``
+    methods. [1]_
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    %(example)s
+
+    """
+    def _argcheck(self, df, nc):
+        return (df > 0) & np.isfinite(df) & (nc >= 0)
+
+    def _shape_info(self):
+        idf = _ShapeInfo("df", False, (0, np.inf), (False, False))
+        inc = _ShapeInfo("nc", False, (0, np.inf), (True, False))
+        return [idf, inc]
+
+    def _rvs(self, df, nc, size=None, random_state=None):
+        return random_state.noncentral_chisquare(df, nc, size)
+
+    def _logpdf(self, x, df, nc):  # nc == 0 is delegated to central chi2
+        return xpx.apply_where(nc != 0, (x, df, nc), _ncx2_log_pdf,
+                               lambda x, df, _: chi2._logpdf(x, df))
+
+    def _pdf(self, x, df, nc):  # nc == 0 is delegated to central chi2
+        with np.errstate(over='ignore'):  # see gh-17432
+            return xpx.apply_where(nc != 0, (x, df, nc), scu._ncx2_pdf,
+                                   lambda x, df, _: chi2._pdf(x, df))
+
+    def _cdf(self, x, df, nc):  # nc == 0 is delegated to central chi2
+        with np.errstate(over='ignore'):  # see gh-17432
+            return xpx.apply_where(nc != 0, (x, df, nc), sc.chndtr,
+                                   lambda x, df, _: chi2._cdf(x, df))
+
+    def _ppf(self, q, df, nc):  # nc == 0 is delegated to central chi2
+        with np.errstate(over='ignore'):  # see gh-17432
+            return xpx.apply_where(nc != 0, (q, df, nc), sc.chndtrix,
+                                   lambda x, df, _: chi2._ppf(x, df))
+
+    def _sf(self, x, df, nc):  # nc == 0 is delegated to central chi2
+        with np.errstate(over='ignore'):  # see gh-17432
+            return xpx.apply_where(nc != 0, (x, df, nc), scu._ncx2_sf,
+                                   lambda x, df, _: chi2._sf(x, df))
+
+    def _isf(self, x, df, nc):  # nc == 0 is delegated to central chi2
+        with np.errstate(over='ignore'):  # see gh-17432
+            return xpx.apply_where(nc != 0, (x, df, nc), scu._ncx2_isf,
+                                   lambda x, df, _: chi2._isf(x, df))
+
+    def _stats(self, df, nc):
+        _ncx2_mean = df + nc
+        def k_plus_cl(k, l, c):  # helper: k + c*l, called with (df, nc, c)
+            return k + c*l
+        _ncx2_variance =  2.0 * k_plus_cl(df, nc, 2.0)
+        _ncx2_skewness = (np.sqrt(8.0) * k_plus_cl(df, nc, 3) /
+                          np.sqrt(k_plus_cl(df, nc, 2.0)**3))
+        _ncx2_kurtosis_excess = (12.0 * k_plus_cl(df, nc, 4.0) /
+                                 k_plus_cl(df, nc, 2.0)**2)
+        return (
+            _ncx2_mean,
+            _ncx2_variance,
+            _ncx2_skewness,
+            _ncx2_kurtosis_excess,
+        )
+
+
+ncx2 = ncx2_gen(a=0.0, name='ncx2')
+
+
+class ncf_gen(rv_continuous):
+    r"""A non-central F distribution continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    scipy.stats.f : Fisher distribution
+
+    Notes
+    -----
+    The probability density function for `ncf` is:
+
+    .. math::
+
+        f(x, n_1, n_2, \lambda) =
+            \exp\left(\frac{\lambda}{2} +
+                      \lambda n_1 \frac{x}{2(n_1 x + n_2)}
+                \right)
+            n_1^{n_1/2} n_2^{n_2/2} x^{n_1/2 - 1} \\
+            (n_2 + n_1 x)^{-(n_1 + n_2)/2}
+            \gamma(n_1/2) \gamma(1 + n_2/2) \\
+            \frac{L^{\frac{n_1}{2}-1}_{n_2/2}
+                \left(-\lambda n_1 \frac{x}{2(n_1 x + n_2)}\right)}
+            {B(n_1/2, n_2/2)
+                \gamma\left(\frac{n_1 + n_2}{2}\right)}
+
+    for :math:`n_1, n_2 > 0`, :math:`\lambda \ge 0`.  Here :math:`n_1` is the
+    degrees of freedom in the numerator, :math:`n_2` the degrees of freedom in
+    the denominator, :math:`\lambda` the non-centrality parameter,
+    :math:`\gamma` is the logarithm of the Gamma function, :math:`L_n^k` is a
+    generalized Laguerre polynomial and :math:`B` is the beta function.
+
+    `ncf` takes ``dfn``, ``dfd`` and ``nc`` as shape parameters. If ``nc=0``,
+    the distribution becomes equivalent to the Fisher distribution.
+
+    This distribution uses routines from the Boost Math C++ library for
+    the computation of the ``pdf``, ``cdf``, ``ppf``, ``stats``, ``sf`` and
+    ``isf`` methods. [1]_
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    %(example)s
+
+    """
+    def _argcheck(self, dfn, dfd, nc):
+        return (dfn > 0) & (dfd > 0) & (nc >= 0)
+
+    def _shape_info(self):
+        idf1 = _ShapeInfo("dfn", False, (0, np.inf), (False, False))
+        idf2 = _ShapeInfo("dfd", False, (0, np.inf), (False, False))
+        inc = _ShapeInfo("nc", False, (0, np.inf), (True, False))
+        return [idf1, idf2, inc]
+
+    def _rvs(self, dfn, dfd, nc, size=None, random_state=None):
+        return random_state.noncentral_f(dfn, dfd, nc, size)
+
+    def _pdf(self, x, dfn, dfd, nc):
+        return scu._ncf_pdf(x, dfn, dfd, nc)
+
+    def _cdf(self, x, dfn, dfd, nc):
+        return sc.ncfdtr(dfn, dfd, nc, x)  # note the argument order: x is last
+
+    def _ppf(self, q, dfn, dfd, nc):
+        with np.errstate(over='ignore'):  # see gh-17432
+            return sc.ncfdtri(dfn, dfd, nc, q)  # q is last, mirroring _cdf
+
+    def _sf(self, x, dfn, dfd, nc):
+        return scu._ncf_sf(x, dfn, dfd, nc)
+
+    def _isf(self, x, dfn, dfd, nc):
+        with np.errstate(over='ignore'):  # see gh-17432
+            return scu._ncf_isf(x, dfn, dfd, nc)
+
+    # # Produces bogus values as written - maybe it's close, though?
+    # def _munp(self, n, dfn, dfd, nc):
+    #     val = (dfn * 1.0/dfd)**n
+    #     term = sc.gammaln(n+0.5*dfn) + sc.gammaln(0.5*dfd-n) - sc.gammaln(dfd*0.5)
+    #     val *= np.exp(-nc / 2.0+term)
+    #     val *= sc.hyp1f1(n+0.5*dfn, 0.5*dfn, 0.5*nc)
+    #     return val
+
+    def _stats(self, dfn, dfd, nc, moments='mv'):  # g1/g2 only when requested
+        mu = scu._ncf_mean(dfn, dfd, nc)
+        mu2 = scu._ncf_variance(dfn, dfd, nc)
+        g1 = scu._ncf_skewness(dfn, dfd, nc) if 's' in moments else None
+        g2 = scu._ncf_kurtosis_excess(  # isn't really excess kurtosis!
+            dfn, dfd, nc) - 3 if 'k' in moments else None
+        # Mathematica: Kurtosis[NoncentralFRatioDistribution[27, 27, 0.415784417992261]]
+        return mu, mu2, g1, g2
+
+
+ncf = ncf_gen(a=0.0, name='ncf')  # `a` is presumably the lower support bound (TODO confirm)
+
+
+class t_gen(rv_continuous):
+    r"""A Student's t continuous random variable.
+
+    For the noncentral t distribution, see `nct`.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    nct
+
+    Notes
+    -----
+    The probability density function for `t` is:
+
+    .. math::
+
+        f(x, \nu) = \frac{\Gamma((\nu+1)/2)}
+                        {\sqrt{\pi \nu} \Gamma(\nu/2)}
+                    (1+x^2/\nu)^{-(\nu+1)/2}
+
+    where :math:`x` is a real number and the degrees of freedom parameter
+    :math:`\nu` (denoted ``df`` in the implementation) satisfies
+    :math:`\nu > 0`. :math:`\Gamma` is the gamma function
+    (`scipy.special.gamma`).
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("df", False, (0, np.inf), (False, False))]
+
+    def _rvs(self, df, size=None, random_state=None):
+        return random_state.standard_t(df, size=size)
+
+    def _pdf(self, x, df):
+        return xpx.apply_where(
+            df == np.inf, (x, df),  # df == inf is handled by the normal pdf
+            lambda x, df: norm._pdf(x),
+            lambda x, df: np.exp(self._logpdf(x, df)))
+
+    def _logpdf(self, x, df):
+
+        def t_logpdf(x, df):
+            return (np.log(sc.poch(0.5 * df, 0.5))  # Gamma((df+1)/2)/Gamma(df/2)
+                    - 0.5 * (np.log(df) + np.log(np.pi))
+                    - (df + 1)/2*np.log1p(x * x/df))
+
+        def norm_logpdf(x, df):
+            return norm._logpdf(x)  # df is accepted but not used
+
+        return xpx.apply_where(df == np.inf, (x, df), norm_logpdf, t_logpdf)
+
+    def _cdf(self, x, df):
+        return sc.stdtr(df, x)
+
+    def _sf(self, x, df):
+        return sc.stdtr(df, -x)  # sf(x) = cdf(-x): the density is even in x
+
+    def _ppf(self, q, df):
+        return sc.stdtrit(df, q)
+
+    def _isf(self, q, df):
+        return -sc.stdtrit(df, q)  # isf(q) = -ppf(q): the density is even in x
+
+    def _stats(self, df):
+        # infinite df -> normal distribution (0.0, 1.0, 0.0, 0.0)
+        infinite_df = np.isposinf(df)
+
+        mu = np.where(df > 1, 0.0, np.inf)  # mean: 0 where df > 1, inf elsewhere
+
+        condlist = ((df > 1) & (df <= 2),
+                    (df > 2) & np.isfinite(df),
+                    infinite_df)
+        choicelist = (lambda df: np.broadcast_to(np.inf, df.shape),
+                      lambda df: df / (df-2.0),
+                      lambda df: np.broadcast_to(1, df.shape))
+        mu2 = _lazyselect(condlist, choicelist, (df,), np.nan)  # nan presumably fills df <= 1
+
+        g1 = np.where(df > 3, 0.0, np.nan)  # third entry: 0 where df > 3, nan elsewhere
+
+        condlist = ((df > 2) & (df <= 4),
+                    (df > 4) & np.isfinite(df),
+                    infinite_df)
+        choicelist = (lambda df: np.broadcast_to(np.inf, df.shape),
+                      lambda df: 6.0 / (df-4.0),
+                      lambda df: np.broadcast_to(0, df.shape))
+        g2 = _lazyselect(condlist, choicelist, (df,), np.nan)  # nan presumably fills df <= 2
+
+        return mu, mu2, g1, g2
+
+    def _entropy(self, df):
+        if df == np.inf:  # NOTE(review): this truth test presumably assumes a scalar df
+            return norm._entropy()
+
+        def regular(df):
+            half = df/2
+            half1 = (df + 1)/2
+            return (half1*(sc.digamma(half1) - sc.digamma(half))
+                    + np.log(np.sqrt(df)*sc.beta(half, 0.5)))
+
+        def asymptotic(df):
+            # Formula from Wolfram Alpha:
+            # "asymptotic expansion (d+1)/2 * (digamma((d+1)/2) - digamma(d/2))
+            #  + log(sqrt(d) * beta(d/2, 1/2))"
+            h = (norm._entropy() + 1/df + (df**-2.)/4 - (df**-3.)/6
+                 - (df**-4.)/8 + 3/10*(df**-5.) + (df**-6.)/4)
+            return h
+
+        return xpx.apply_where(df >= 100, df, asymptotic, regular)  # series for df >= 100
+
+
+t = t_gen(name='t')  # module-level instance of the generator class defined above
+
+
+class nct_gen(rv_continuous):
+    r"""A non-central Student's t continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    If :math:`Y` is a standard normal random variable and :math:`V` is
+    an independent chi-square random variable (`chi2`) with :math:`k` degrees
+    of freedom, then
+
+    .. math::
+
+        X = \frac{Y + c}{\sqrt{V/k}}
+
+    has a non-central Student's t distribution on the real line.
+    The degrees of freedom parameter :math:`k` (denoted ``df`` in the
+    implementation) satisfies :math:`k > 0` and the noncentrality parameter
+    :math:`c` (denoted ``nc`` in the implementation) is a real number.
+
+    This distribution uses routines from the Boost Math C++ library for
+    the computation of the ``pdf``, ``cdf``, ``ppf``, ``sf`` and ``isf``
+    methods. [1]_
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    %(example)s
+
+    """
+    def _argcheck(self, df, nc):
+        return (df > 0) & (nc == nc)  # nc == nc is False only for NaN
+
+    def _shape_info(self):
+        idf = _ShapeInfo("df", False, (0, np.inf), (False, False))
+        inc = _ShapeInfo("nc", False, (-np.inf, np.inf), (False, False))
+        return [idf, inc]
+
+    def _rvs(self, df, nc, size=None, random_state=None):
+        n = norm.rvs(loc=nc, size=size, random_state=random_state)  # Y + c
+        c2 = chi2.rvs(df, size=size, random_state=random_state)  # V
+        return n * np.sqrt(df) / np.sqrt(c2)  # (Y + c) / sqrt(V / k), as in the docstring
+
+    def _pdf(self, x, df, nc):
+        return scu._nct_pdf(x, df, nc)
+
+    def _cdf(self, x, df, nc):
+        return sc.nctdtr(df, nc, x)
+
+    def _ppf(self, q, df, nc):
+        return sc.nctdtrit(df, nc, q)
+
+    def _sf(self, x, df, nc):
+        with np.errstate(over='ignore'):  # see gh-17432
+            return np.clip(scu._nct_sf(x, df, nc), 0, 1)  # clamp the result into [0, 1]
+
+    def _isf(self, x, df, nc):
+        with np.errstate(over='ignore'):  # see gh-17432
+            return scu._nct_isf(x, df, nc)
+
+    def _stats(self, df, nc, moments='mv'):
+        mu = scu._nct_mean(df, nc)
+        mu2 = scu._nct_variance(df, nc)
+        g1 = scu._nct_skewness(df, nc) if 's' in moments else None  # only on request
+        g2 = scu._nct_kurtosis_excess(df, nc) if 'k' in moments else None  # only on request
+        return mu, mu2, g1, g2
+
+
+nct = nct_gen(name="nct")  # module-level instance of the generator class defined above
+
+
+class pareto_gen(rv_continuous):
+    r"""A Pareto continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `pareto` is:
+
+    .. math::
+
+        f(x, b) = \frac{b}{x^{b+1}}
+
+    for :math:`x \ge 1`, :math:`b > 0`.
+
+    `pareto` takes ``b`` as a shape parameter for :math:`b`.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("b", False, (0, np.inf), (False, False))]
+
+    def _pdf(self, x, b):
+        # pareto.pdf(x, b) = b / x**(b+1)
+        return b * x**(-b-1)
+
+    def _cdf(self, x, b):
+        return 1 - x**(-b)
+
+    def _ppf(self, q, b):
+        return pow(1-q, -1.0/b)
+
+    def _sf(self, x, b):
+        return x**(-b)
+
+    def _isf(self, q, b):
+        return np.power(q, -1.0 / b)
+
+    def _stats(self, b, moments='mv'):
+        mu, mu2, g1, g2 = None, None, None, None
+        if 'm' in moments:
+            mask = b > 1  # mean is filled in only where b > 1; inf elsewhere
+            bt = np.extract(mask, b)
+            mu = np.full(np.shape(b), fill_value=np.inf)
+            np.place(mu, mask, bt / (bt-1.0))
+        if 'v' in moments:
+            mask = b > 2  # variance is filled in only where b > 2; inf elsewhere
+            bt = np.extract(mask, b)
+            mu2 = np.full(np.shape(b), fill_value=np.inf)
+            np.place(mu2, mask, bt / (bt-2.0) / (bt-1.0)**2)
+        if 's' in moments:
+            mask = b > 3  # skewness is filled in only where b > 3; nan elsewhere
+            bt = np.extract(mask, b)
+            g1 = np.full(np.shape(b), fill_value=np.nan)
+            vals = 2 * (bt + 1.0) * np.sqrt(bt - 2.0) / ((bt - 3.0) * np.sqrt(bt))
+            np.place(g1, mask, vals)
+        if 'k' in moments:
+            mask = b > 4  # kurtosis is filled in only where b > 4; nan elsewhere
+            bt = np.extract(mask, b)
+            g2 = np.full(np.shape(b), fill_value=np.nan)
+            vals = (6.0*np.polyval([1.0, 1.0, -6, -2], bt) /
+                    np.polyval([1.0, -7.0, 12.0, 0.0], bt))
+            np.place(g2, mask, vals)
+        return mu, mu2, g1, g2
+
+    def _entropy(self, b):
+        return 1 + 1.0/b - np.log(b)
+
+    @_call_super_mom
+    @inherit_docstring_from(rv_continuous)
+    def fit(self, data, *args, **kwds):
+        parameters = _check_fit_input_parameters(self, data, args, kwds)
+        data, fshape, floc, fscale = parameters
+
+        # ensure that any fixed parameters don't violate constraints of the
+        # distribution before continuing.
+        if floc is not None and np.min(data) - floc < (fscale or 0):
+            raise FitDataError("pareto", lower=1, upper=np.inf)
+
+        ndata = data.shape[0]
+
+        def get_shape(scale, location):
+            # The first-order necessary condition on `shape` can be solved in
+            # closed form
+            return ndata / np.sum(np.log((data - location) / scale))
+
+        if floc is fscale is None:  # chained `is`: both floc and fscale are None
+            # The support of the distribution is `(x - loc)/scale >= 1`.
+            # The method of Lagrange multipliers turns this constraint
+            # into an equation that can be solved numerically.
+            # See gh-12545 for details.
+
+            def dL_dScale(shape, scale):
+                # The partial derivative of the log-likelihood function w.r.t.
+                # the scale.
+                return ndata * shape / scale
+
+            def dL_dLocation(shape, location):
+                # The partial derivative of the log-likelihood function w.r.t.
+                # the location.
+                return (shape + 1) * np.sum(1 / (data - location))
+
+            def fun_to_solve(scale):
+                # optimize the scale by setting the partial derivatives
+                # w.r.t. location and scale equal and solving.
+                location = np.min(data) - scale
+                shape = fshape or get_shape(scale, location)  # closed form if fshape is falsy
+                return dL_dLocation(shape, location) - dL_dScale(shape, scale)
+
+            def interval_contains_root(lbrack, rbrack):
+                # return true if the signs disagree.
+                return (np.sign(fun_to_solve(lbrack)) !=
+                        np.sign(fun_to_solve(rbrack)))
+
+            # set brackets for `root_scalar` to use when optimizing over the
+            # scale such that a root is likely between them. Use user supplied
+            # guess or default 1.
+            brack_start = float(kwds.get('scale', 1))
+            lbrack, rbrack = brack_start / 2, brack_start * 2
+            # if a root is not between the brackets, iteratively expand them
+            # until they include a sign change, checking after each bracket is
+            # modified.
+            while (not interval_contains_root(lbrack, rbrack)
+                   and (lbrack > 0 or rbrack < np.inf)):
+                lbrack /= 2
+                rbrack *= 2
+            res = root_scalar(fun_to_solve, bracket=[lbrack, rbrack])
+            if res.converged:
+                scale = res.root
+                loc = np.min(data) - scale
+                shape = fshape or get_shape(scale, loc)
+
+                # The Pareto distribution requires that its parameters satisfy
+                # the condition `fscale + floc <= min(data)`. However, to
+                # avoid numerical issues, we require that `fscale + floc`
+                # is strictly less than `min(data)`. If this condition
+                # is not satisfied, reduce the scale with `np.nextafter` to
+                # ensure that data does not fall outside of the support.
+                if not (scale + loc) < np.min(data):
+                    scale = np.min(data) - loc
+                    scale = np.nextafter(scale, 0)
+                return shape, loc, scale
+            else:  # NOTE(review): the fallback below does not forward *args -- confirm intended
+                return super().fit(data, **kwds)
+        elif floc is None:
+            loc = np.min(data) - fscale  # scale is fixed: loc + scale == min(data)
+        else:
+            loc = floc
+        # Source: Evans, Hastings, and Peacock (2000), Statistical
+        # Distributions, 3rd. Ed., John Wiley and Sons. Page 149.
+        scale = fscale or np.min(data) - loc
+        shape = fshape or get_shape(scale, loc)
+        return shape, loc, scale
+
+
+pareto = pareto_gen(a=1.0, name="pareto")  # a=1.0 matches the docstring's x >= 1 (presumably the lower bound)
+
+
+class lomax_gen(rv_continuous):
+    r"""A Lomax (Pareto of the second kind) continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `lomax` is:
+
+    .. math::
+
+        f(x, c) = \frac{c}{(1+x)^{c+1}}
+
+    for :math:`x \ge 0`, :math:`c > 0`.
+
+    `lomax` takes ``c`` as a shape parameter for :math:`c`.
+
+    `lomax` is a special case of `pareto` with ``loc=-1.0``.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("c", False, (0, np.inf), (False, False))]
+
+    def _pdf(self, x, c):
+        # lomax.pdf(x, c) = c / (1+x)**(c+1)
+        return c*1.0/(1.0+x)**(c+1.0)
+
+    def _logpdf(self, x, c):
+        return np.log(c) - (c+1)*sc.log1p(x)
+
+    def _cdf(self, x, c):
+        return -sc.expm1(-c*sc.log1p(x))  # 1 - (1+x)**(-c) via expm1/log1p
+
+    def _sf(self, x, c):
+        return np.exp(-c*sc.log1p(x))  # (1+x)**(-c)
+
+    def _logsf(self, x, c):
+        return -c*sc.log1p(x)
+
+    def _ppf(self, q, c):
+        return sc.expm1(-sc.log1p(-q)/c)  # (1-q)**(-1/c) - 1 via expm1/log1p
+
+    def _isf(self, q, c):
+        return q**(-1.0 / c) - 1
+
+    def _stats(self, c):
+        mu, mu2, g1, g2 = pareto.stats(c, loc=-1.0, moments='mvsk')  # pareto shifted by -1
+        return mu, mu2, g1, g2
+
+    def _entropy(self, c):
+        return 1+1.0/c-np.log(c)
+
+
+lomax = lomax_gen(a=0.0, name="lomax")  # a=0.0 matches the docstring's x >= 0 (presumably the lower bound)
+
+
+class pearson3_gen(rv_continuous):
+    r"""A Pearson type III continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `pearson3` is:
+
+    .. math::
+
+        f(x, \kappa) = \frac{|\beta|}{\Gamma(\alpha)}
+                       (\beta (x - \zeta))^{\alpha - 1}
+                       \exp(-\beta (x - \zeta))
+
+    where:
+
+    .. math::
+
+            \beta = \frac{2}{\kappa}
+
+            \alpha = \beta^2 = \frac{4}{\kappa^2}
+
+            \zeta = -\frac{\alpha}{\beta} = -\beta
+
+    :math:`\Gamma` is the gamma function (`scipy.special.gamma`).
+    Pass the skew :math:`\kappa` into `pearson3` as the shape parameter
+    ``skew``.
+
+    %(after_notes)s
+
+    %(example)s
+
+    References
+    ----------
+    R.W. Vogel and D.E. McMartin, "Probability Plot Goodness-of-Fit and
+    Skewness Estimation Procedures for the Pearson Type 3 Distribution", Water
+    Resources Research, Vol.27, 3149-3158 (1991).
+
+    L.R. Salvosa, "Tables of Pearson's Type III Function", Ann. Math. Statist.,
+    Vol.1, 191-198 (1930).
+
+    "Using Modern Computing Tools to Fit the Pearson Type III Distribution to
+    Aviation Loads Data", Office of Aviation Research (2003).
+
+    """
+    def _preprocess(self, x, skew):
+        # The real 'loc' and 'scale' are handled in the calling pdf(...). The
+        # local variables 'loc' and 'scale' below are set to the defaults
+        # just to keep them as part of the equations for
+        # documentation.
+        loc = 0.0
+        scale = 1.0
+
+        # If |skew| is small, callers use the normal-distribution formulas
+        # instead. The divide between pearson3 and norm was found by brute
+        # force and is approximately a skew of 0.000016.  No one, I hope,
+        # would actually use a skew value even close to this small.
+        norm2pearson_transition = 0.000016
+
+        ans, x, skew = np.broadcast_arrays(1.0, x, skew)
+        ans = ans.copy()  # independent output buffer (broadcast results are views)
+
+        # mask is True where skew is small enough to use the normal approx.
+        mask = np.absolute(skew) < norm2pearson_transition
+        invmask = ~mask
+
+        beta = 2.0 / (skew[invmask] * scale)  # computed for the non-normal entries only
+        alpha = (scale * beta)**2
+        zeta = loc - alpha / beta
+
+        transx = beta * (x[invmask] - zeta)
+        return ans, x, transx, mask, invmask, beta, alpha, zeta
+
+    def _argcheck(self, skew):
+        # The _argcheck function in rv_continuous only allows positive
+        # arguments.  The skew argument for pearson3 can be zero (which is
+        # handled via the normal branch in the methods below) or negative.
+        # So accept every finite skew.
+        return np.isfinite(skew)
+
+    def _shape_info(self):
+        return [_ShapeInfo("skew", False, (-np.inf, np.inf), (False, False))]
+
+    def _stats(self, skew):
+        m = 0.0
+        v = 1.0
+        s = skew
+        k = 1.5*skew**2
+        return m, v, s, k
+
+    def _pdf(self, x, skew):
+        # pearson3.pdf(x, skew) = abs(beta) / gamma(alpha) *
+        #     (beta * (x - zeta))**(alpha - 1) * exp(-beta*(x - zeta))
+        # Do the calculation in _logpdf since it helps to limit
+        # overflow/underflow problems
+        ans = np.exp(self._logpdf(x, skew))
+        if ans.ndim == 0:  # 0-d result: cannot be index-assigned below
+            if np.isnan(ans):
+                return 0.0
+            return ans
+        ans[np.isnan(ans)] = 0.0  # NaN densities are reported as zero
+        return ans
+
+    def _logpdf(self, x, skew):
+        #   PEARSON3 logpdf                           GAMMA logpdf
+        #   np.log(abs(beta))
+        # + (alpha - 1)*np.log(beta*(x - zeta))       + (a - 1)*np.log(x)
+        # - beta*(x - zeta)                           - x
+        # - sc.gammaln(alpha)                         - sc.gammaln(a)
+        ans, x, transx, mask, invmask, beta, alpha, _ = (
+            self._preprocess(x, skew))
+
+        ans[mask] = np.log(_norm_pdf(x[mask]))
+        # use logpdf instead of _logpdf to fix issue mentioned in gh-12640
+        # (_logpdf does not return correct result for alpha = 1)
+        ans[invmask] = np.log(abs(beta)) + gamma.logpdf(transx, alpha)
+        return ans
+
+    def _cdf(self, x, skew):
+        ans, x, transx, mask, invmask, _, alpha, _ = (
+            self._preprocess(x, skew))
+
+        ans[mask] = _norm_cdf(x[mask])
+
+        skew = np.broadcast_to(skew, invmask.shape)
+        invmask1a = np.logical_and(invmask, skew > 0)  # indexes the full-size `ans`
+        invmask1b = skew[invmask] > 0  # indexes the already-compressed transx/alpha
+        # use cdf instead of _cdf to fix issue mentioned in gh-12640
+        # (_cdf produces NaNs for inputs outside support)
+        ans[invmask1a] = gamma.cdf(transx[invmask1b], alpha[invmask1b])
+
+        # The gamma._cdf approach wasn't working with negative skew.
+        # Note that multiplying the skew by -1 reflects about x=0.
+        # So instead of evaluating the CDF with negative skew at x,
+        # evaluate the SF with positive skew at -x.
+        invmask2a = np.logical_and(invmask, skew < 0)
+        invmask2b = skew[invmask] < 0
+        # gamma._sf produces NaNs when transx < 0, so use gamma.sf
+        ans[invmask2a] = gamma.sf(transx[invmask2b], alpha[invmask2b])
+
+        return ans
+
+    def _sf(self, x, skew):
+        ans, x, transx, mask, invmask, _, alpha, _ = (
+            self._preprocess(x, skew))
+
+        ans[mask] = _norm_sf(x[mask])
+
+        skew = np.broadcast_to(skew, invmask.shape)
+        invmask1a = np.logical_and(invmask, skew > 0)
+        invmask1b = skew[invmask] > 0
+        ans[invmask1a] = gamma.sf(transx[invmask1b], alpha[invmask1b])
+
+        invmask2a = np.logical_and(invmask, skew < 0)
+        invmask2b = skew[invmask] < 0
+        ans[invmask2a] = gamma.cdf(transx[invmask2b], alpha[invmask2b])  # mirror of _cdf
+
+        return ans
+
+    def _rvs(self, skew, size=None, random_state=None):
+        skew = np.broadcast_to(skew, size)
+        ans, _, _, mask, invmask, beta, alpha, zeta = (
+            self._preprocess([0], skew))  # x=[0] is a placeholder; only the masks/params are used
+
+        nsmall = mask.sum()
+        nbig = mask.size - nsmall
+        ans[mask] = random_state.standard_normal(nsmall)
+        ans[invmask] = random_state.standard_gamma(alpha, nbig)/beta + zeta
+
+        if size == ():  # empty-tuple size: unwrap the length-1 buffer built from x=[0]
+            ans = ans[0]
+        return ans
+
+    def _ppf(self, q, skew):
+        ans, q, _, mask, invmask, beta, alpha, zeta = (
+            self._preprocess(q, skew))
+        ans[mask] = _norm_ppf(q[mask])
+        q = q[invmask]  # boolean indexing copies, so the flip below leaves the input alone
+        q[beta < 0] = 1 - q[beta < 0]  # for negative skew; see gh-17050
+        ans[invmask] = sc.gammaincinv(alpha, q)/beta + zeta
+        return ans
+
+    @_call_super_mom
+    @extend_notes_in_docstring(rv_continuous, notes="""\
+        Note that method of moments (`method='MM'`) is not
+        available for this distribution.\n\n""")
+    def fit(self, data, *args, **kwds):
+        if kwds.get("method", None) == 'MM':
+            raise NotImplementedError("Fit `method='MM'` is not available for "
+                                      "the Pearson3 distribution. Please try "
+                                      "the default `method='MLE'`.")
+        else:  # zero-arg super(): super(type(self), self) would recurse in a subclass
+            return super().fit(data, *args, **kwds)
+
+
+pearson3 = pearson3_gen(name="pearson3")  # module-level instance of the generator class defined above
+
+
+class powerlaw_gen(rv_continuous):
+    r"""A power-function continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    pareto
+
+    Notes
+    -----
+    The probability density function for `powerlaw` is:
+
+    .. math::
+
+        f(x, a) = a x^{a-1}
+
+    for :math:`0 \le x \le 1`, :math:`a > 0`.
+
+    `powerlaw` takes ``a`` as a shape parameter for :math:`a`.
+
+    %(after_notes)s
+
+    For example, the support of `powerlaw` can be adjusted from the default
+    interval ``[0, 1]`` to the interval ``[c, c+d]`` by setting ``loc=c`` and
+    ``scale=d``. For a power-law distribution with infinite support, see
+    `pareto`. For a power-law distribution described by PDF:
+
+    .. math::
+
+        f(x; a, l, h) = \frac{a}{h^a - l^a} x^{a-1}
+
+    with :math:`a \neq 0` and :math:`0 < l < x < h`, see `truncpareto`.
+
+    `powerlaw` is a special case of `beta` with ``b=1``.
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("a", False, (0, np.inf), (False, False))]
+
+    def _pdf(self, x, a):
+        # powerlaw.pdf(x, a) = a * x**(a-1)
+        return a*x**(a-1.0)
+
+    def _logpdf(self, x, a):
+        return np.log(a) + sc.xlogy(a - 1, x)
+
+    def _cdf(self, x, a):
+        return x**(a*1.0)
+
+    def _logcdf(self, x, a):
+        return a*np.log(x)
+
+    def _ppf(self, q, a):
+        return pow(q, 1.0/a)
+
+    def _sf(self, p, a):
+        return -sc.powm1(p, a)  # powm1(p, a) is p**a - 1, so this is 1 - p**a
+
+    def _munp(self, n, a):
+        # The following expression is correct for all real n (provided a > 0).
+        return a / (a + n)
+
+    def _stats(self, a):
+        return (a / (a + 1.0),
+                a / (a + 2.0) / (a + 1.0) ** 2,
+                -2.0 * ((a - 1.0) / (a + 3.0)) * np.sqrt((a + 2.0) / a),
+                6 * np.polyval([1, -1, -6, 2], a) / (a * (a + 3.0) * (a + 4)))
+
+    def _entropy(self, a):
+        return 1 - 1.0/a - np.log(a)
+
+    def _support_mask(self, x, a):
+        return (super()._support_mask(x, a)
+                & ((x != 0) | (a >= 1)))  # x == 0 is in-support only when a >= 1
+
+    @_call_super_mom
+    @extend_notes_in_docstring(rv_continuous, notes="""\
+        Notes specifically for ``powerlaw.fit``: If the location is a free
+        parameter and the value returned for the shape parameter is less than
+        one, the true maximum likelihood approaches infinity. This causes
+        numerical difficulties, and the resulting estimates are approximate.
+        \n\n""")
+    def fit(self, data, *args, **kwds):
+        # Summary of the strategy:
+        #
+        # 1) If the scale and location are fixed, return the shape according
+        #    to a formula.
+        #
+        # 2) If the scale is fixed, there are two possibilities for the other
+        #    parameters - one corresponding with shape less than one, and
+        #    another with shape greater than one. Calculate both, and return
+        #    whichever has the better log-likelihood.
+        #
+        # At this point, the scale is known to be free.
+        #
+        # 3) If the location is fixed, return the scale and shape according to
+        #    formulas (or, if the shape is fixed, the fixed shape).
+        #
+        # At this point, the location and scale are both free. There are
+        # separate equations depending on whether the shape is less than one or
+        # greater than one.
+        #
+        # 4a) If the shape is less than one, there are formulas for shape,
+        #     location, and scale.
+        # 4b) If the shape is greater than one, there are formulas for shape
+        #     and scale, but there is a condition for location to be solved
+        #     numerically.
+        #
+        # If the shape is fixed and less than one, we use 4a.
+        # If the shape is fixed and greater than one, we use 4b.
+        # If the shape is also free, we calculate fits using both 4a and 4b
+        # and choose the one that results in a better log-likelihood.
+        #
+        # In many cases, `np.nextafter` is used to avoid numerical
+        # issues.
+        if kwds.pop('superfit', False):  # caller explicitly asked for the parent class's fit
+            return super().fit(data, *args, **kwds)
+
+        if len(np.unique(data)) == 1:  # all observations identical: defer to the parent's fit
+            return super().fit(data, *args, **kwds)
+
+        data, fshape, floc, fscale = _check_fit_input_parameters(self, data,
+                                                                 args, kwds)
+        penalized_nllf_args = [data, (self._fitstart(data),)]
+        penalized_nllf = self._reduce_func(penalized_nllf_args, {})[1]
+
+        # ensure that any fixed parameters don't violate constraints of the
+        # distribution before continuing. The support of the distribution
+        # is `0 < (x - loc)/scale < 1`.
+        if floc is not None:
+            if not data.min() > floc:
+                raise FitDataError('powerlaw', 0, 1)
+            if fscale is not None and not data.max() <= floc + fscale:
+                raise FitDataError('powerlaw', 0, 1)
+
+        if fscale is not None:
+            if fscale <= 0:
+                raise ValueError("Negative or zero `fscale` is outside the "
+                                 "range allowed by the distribution.")
+            if fscale <= np.ptp(data):
+                msg = "`fscale` must be greater than the range of data."
+                raise ValueError(msg)
+
+        def get_shape(data, loc, scale):
+            # The first-order necessary condition on `shape` can be solved in
+            # closed form. It can be used no matter the assumption of the
+            # value of the shape.
+            N = len(data)
+            return - N / (np.sum(np.log(data - loc)) - N*np.log(scale))
+
+        def get_scale(data, loc):
+            # analytical solution for `scale` based on the location.
+            # It can be used no matter the assumption of the value of the
+            # shape.
+            return data.max() - loc
+
+        # 1) The location and scale are both fixed. Analytically determine the
+        # shape.
+        if fscale is not None and floc is not None:
+            return get_shape(data, floc, fscale), floc, fscale
+
+        # 2) The scale is fixed. There are two possibilities for the other
+        # parameters. Choose the option with better log-likelihood.
+        if fscale is not None:
+            # using `data.min()` as the optimal location
+            loc_lt1 = np.nextafter(data.min(), -np.inf)
+            shape_lt1 = fshape or get_shape(data, loc_lt1, fscale)
+            ll_lt1 = penalized_nllf((shape_lt1, loc_lt1, fscale), data)
+
+            # using `data.max() - scale` as the optimal location
+            loc_gt1 = np.nextafter(data.max() - fscale, np.inf)
+            shape_gt1 = fshape or get_shape(data, loc_gt1, fscale)
+            ll_gt1 = penalized_nllf((shape_gt1, loc_gt1, fscale), data)
+
+            if ll_lt1 < ll_gt1:  # presumably negative log-likelihoods, so lower is better
+                return shape_lt1, loc_lt1, fscale
+            else:
+                return shape_gt1, loc_gt1, fscale
+
+        # 3) The location is fixed. Return the analytical scale and the
+        # analytical (or fixed) shape.
+        if floc is not None:
+            scale = get_scale(data, floc)
+            shape = fshape or get_shape(data, floc, scale)
+            return shape, floc, scale
+
+        # 4) Location and scale are both free
+        # 4a) Use formulas that assume `shape <= 1`.
+
+        def fit_loc_scale_w_shape_lt_1():
+            loc = np.nextafter(data.min(), -np.inf)
+            if np.abs(loc) < np.finfo(loc.dtype).tiny:  # push |loc| up to the dtype's `tiny`
+                loc = np.sign(loc) * np.finfo(loc.dtype).tiny
+            scale = np.nextafter(get_scale(data, loc), np.inf)
+            shape = fshape or get_shape(data, loc, scale)
+            return shape, loc, scale
+
+        # 4b) Fit under the assumption that `shape > 1`. The support
+        # of the distribution is `(x - loc)/scale <= 1`. The method of Lagrange
+        # multipliers turns this constraint into the condition that
+        # dL_dScale - dL_dLocation must be zero, which is solved numerically.
+        # (Alternatively, substitute the constraint into the objective
+        # function before deriving the likelihood equation for location.)
+
+        def dL_dScale(data, shape, scale):
+            # The partial derivative of the log-likelihood function w.r.t.
+            # the scale.
+            return -data.shape[0] * shape / scale
+
+        def dL_dLocation(data, shape, loc):
+            # The partial derivative of the log-likelihood function w.r.t.
+            # the location.
+            return (shape - 1) * np.sum(1 / (loc - data))  # -1/(data-loc)
+
+        def dL_dLocation_star(loc):
+            # The derivative of the log-likelihood function w.r.t.
+            # the location, given optimal shape and scale
+            scale = np.nextafter(get_scale(data, loc), -np.inf)
+            shape = fshape or get_shape(data, loc, scale)
+            return dL_dLocation(data, shape, loc)
+
+        def fun_to_solve(loc):
+            # optimize the location by setting the partial derivatives
+            # w.r.t. location and scale equal and solving.
+            scale = np.nextafter(get_scale(data, loc), -np.inf)
+            shape = fshape or get_shape(data, loc, scale)
+            return (dL_dScale(data, shape, scale)
+                    - dL_dLocation(data, shape, loc))
+
+        def fit_loc_scale_w_shape_gt_1():
+            # set brackets for `root_scalar` to use when optimizing over the
+            # location such that a root is likely between them.
+            rbrack = np.nextafter(data.min(), -np.inf)
+
+            # if the sign of `dL_dLocation_star` is positive at rbrack,
+            # we're not going to find the root we're looking for
+            delta = (data.min() - rbrack)
+            while dL_dLocation_star(rbrack) > 0:
+                rbrack = data.min() - delta
+                delta *= 2  # double the offset below data.min() on each pass
+
+            def interval_contains_root(lbrack, rbrack):
+                # Check if the interval (lbrack, rbrack) contains the root.
+                return (np.sign(fun_to_solve(lbrack))
+                        != np.sign(fun_to_solve(rbrack)))
+
+            lbrack = rbrack - 1
+
+            # if the sign doesn't change between the brackets, move the left
+            # bracket until it does. (The right bracket remains fixed at the
+            # maximum permissible value.)
+            i = 1.0
+            while (not interval_contains_root(lbrack, rbrack)
+                   and lbrack != -np.inf):
+                lbrack = (data.min() - i)
+                i *= 2
+
+            root = optimize.root_scalar(fun_to_solve, bracket=(lbrack, rbrack))
+
+            loc = np.nextafter(root.root, -np.inf)
+            scale = np.nextafter(get_scale(data, loc), np.inf)
+            shape = fshape or get_shape(data, loc, scale)
+            return shape, loc, scale
+
+        # Shape is fixed - choose 4a or 4b accordingly.
+        if fshape is not None and fshape <= 1:
+            return fit_loc_scale_w_shape_lt_1()
+        elif fshape is not None and fshape > 1:
+            return fit_loc_scale_w_shape_gt_1()
+
+        # Shape is free
+        fit_shape_lt1 = fit_loc_scale_w_shape_lt_1()
+        ll_lt1 = self.nnlf(fit_shape_lt1, data)
+
+        fit_shape_gt1 = fit_loc_scale_w_shape_gt_1()
+        ll_gt1 = self.nnlf(fit_shape_gt1, data)
+
+        if ll_lt1 <= ll_gt1 and fit_shape_lt1[0] <= 1:
+            return fit_shape_lt1
+        elif ll_lt1 > ll_gt1 and fit_shape_gt1[0] > 1:
+            return fit_shape_gt1
+        else:  # neither candidate is consistent with its own shape assumption
+            return super().fit(data, *args, **kwds)
+
+
+powerlaw = powerlaw_gen(a=0.0, b=1.0, name="powerlaw")
+
+
+class powerlognorm_gen(rv_continuous):
+    r"""A power log-normal continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `powerlognorm` is:
+
+    .. math::
+
+        f(x, c, s) = \frac{c}{x s} \phi(\log(x)/s)
+                     (\Phi(-\log(x)/s))^{c-1}
+
+    where :math:`\phi` is the normal pdf, and :math:`\Phi` is the normal cdf,
+    and :math:`x > 0`, :math:`s, c > 0`.
+
+    `powerlognorm` takes :math:`c` and :math:`s` as shape parameters.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    _support_mask = rv_continuous._open_support_mask
+
+    def _shape_info(self):
+        ic = _ShapeInfo("c", False, (0, np.inf), (False, False))
+        i_s = _ShapeInfo("s", False, (0, np.inf), (False, False))
+        return [ic, i_s]
+
+    def _pdf(self, x, c, s):
+        return np.exp(self._logpdf(x, c, s))
+
+    def _logpdf(self, x, c, s):
+        return (np.log(c) - np.log(x) - np.log(s) +
+                _norm_logpdf(np.log(x) / s) +
+                _norm_logcdf(-np.log(x) / s) * (c - 1.))
+
+    def _cdf(self, x, c, s):
+        return -sc.expm1(self._logsf(x, c, s))
+
+    def _ppf(self, q, c, s):
+        return self._isf(1 - q, c, s)
+
+    def _sf(self, x, c, s):
+        return np.exp(self._logsf(x, c, s))
+
+    def _logsf(self, x, c, s):
+        return _norm_logcdf(-np.log(x) / s) * c
+
+    def _isf(self, q, c, s):
+        return np.exp(-_norm_ppf(q**(1/c)) * s)
+
+
+powerlognorm = powerlognorm_gen(a=0.0, name="powerlognorm")
+
+
+class powernorm_gen(rv_continuous):
+    r"""A power normal continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `powernorm` is:
+
+    .. math::
+
+        f(x, c) = c \phi(x) (\Phi(-x))^{c-1}
+
+    where :math:`\phi` is the normal pdf, :math:`\Phi` is the normal cdf,
+    :math:`x` is any real, and :math:`c > 0` [1]_.
+
+    `powernorm` takes ``c`` as a shape parameter for :math:`c`.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] NIST Engineering Statistics Handbook, Section 1.3.6.6.13,
+           https://www.itl.nist.gov/div898/handbook//eda/section3/eda366d.htm
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("c", False, (0, np.inf), (False, False))]
+
+    def _pdf(self, x, c):
+        # powernorm.pdf(x, c) = c * phi(x) * (Phi(-x))**(c-1)
+        return c*_norm_pdf(x) * (_norm_cdf(-x)**(c-1.0))
+
+    def _logpdf(self, x, c):
+        return np.log(c) + _norm_logpdf(x) + (c-1)*_norm_logcdf(-x)
+
+    def _cdf(self, x, c):
+        return -sc.expm1(self._logsf(x, c))
+
+    def _ppf(self, q, c):
+        return -_norm_ppf(pow(1.0 - q, 1.0 / c))
+
+    def _sf(self, x, c):
+        return np.exp(self._logsf(x, c))
+
+    def _logsf(self, x, c):
+        return c * _norm_logcdf(-x)
+
+    def _isf(self, q, c):
+        return -_norm_ppf(np.exp(np.log(q) / c))
+
+
+powernorm = powernorm_gen(name='powernorm')
+
+
+class rdist_gen(rv_continuous):
+    r"""An R-distributed (symmetric beta) continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `rdist` is:
+
+    .. math::
+
+        f(x, c) = \frac{(1-x^2)^{c/2-1}}{B(1/2, c/2)}
+
+    for :math:`-1 \le x \le 1`, :math:`c > 0`. `rdist` is also called the
+    symmetric beta distribution: if B has a `beta` distribution with
+    parameters (c/2, c/2), then X = 2*B - 1 follows a R-distribution with
+    parameter c.
+
+    `rdist` takes ``c`` as a shape parameter for :math:`c`.
+
+    This distribution includes the following distribution kernels as
+    special cases::
+
+        c = 2:  uniform
+        c = 3:  `semicircular`
+        c = 4:  Epanechnikov (parabolic)
+        c = 6:  quartic (biweight)
+        c = 8:  triweight
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("c", False, (0, np.inf), (False, False))]
+
+    # use relation to the beta distribution for pdf, cdf, etc
+    def _pdf(self, x, c):
+        return np.exp(self._logpdf(x, c))
+
+    def _logpdf(self, x, c):
+        return -np.log(2) + beta._logpdf((x + 1)/2, c/2, c/2)
+
+    def _cdf(self, x, c):
+        return beta._cdf((x + 1)/2, c/2, c/2)
+
+    def _sf(self, x, c):
+        return beta._sf((x + 1)/2, c/2, c/2)
+
+    def _ppf(self, q, c):
+        return 2*beta._ppf(q, c/2, c/2) - 1
+
+    def _rvs(self, c, size=None, random_state=None):
+        return 2 * random_state.beta(c/2, c/2, size) - 1
+
+    def _munp(self, n, c):
+        numerator = (1 - (n % 2)) * sc.beta((n + 1.0) / 2, c / 2.0)
+        return numerator / sc.beta(1. / 2, c / 2.)
+
+
+rdist = rdist_gen(a=-1.0, b=1.0, name="rdist")
+
+
+class rayleigh_gen(rv_continuous):
+    r"""A Rayleigh continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `rayleigh` is:
+
+    .. math::
+
+        f(x) = x \exp(-x^2/2)
+
+    for :math:`x \ge 0`.
+
+    `rayleigh` is a special case of `chi` with ``df=2``.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    _support_mask = rv_continuous._open_support_mask
+
+    def _shape_info(self):
+        return []
+
+    def _rvs(self, size=None, random_state=None):
+        return chi.rvs(2, size=size, random_state=random_state)
+
+    def _pdf(self, r):
+        # rayleigh.pdf(r) = r * exp(-r**2/2)
+        return np.exp(self._logpdf(r))
+
+    def _logpdf(self, r):
+        return np.log(r) - 0.5 * r * r
+
+    def _cdf(self, r):
+        return -sc.expm1(-0.5 * r**2)
+
+    def _ppf(self, q):
+        return np.sqrt(-2 * sc.log1p(-q))
+
+    def _sf(self, r):
+        return np.exp(self._logsf(r))
+
+    def _logsf(self, r):
+        return -0.5 * r * r
+
+    def _isf(self, q):
+        return np.sqrt(-2 * np.log(q))
+
+    def _stats(self):
+        val = 4 - np.pi
+        return (np.sqrt(np.pi/2),
+                val/2,
+                2*(np.pi-3)*np.sqrt(np.pi)/val**1.5,
+                6*np.pi/val-16/val**2)
+
+    def _entropy(self):
+        return _EULER/2.0 + 1 - 0.5*np.log(2)
+
+    @_call_super_mom
+    @extend_notes_in_docstring(rv_continuous, notes="""\
+        Notes specifically for ``rayleigh.fit``: If the location is fixed with
+        the `floc` parameter, this method uses an analytical formula to find
+        the scale.  Otherwise, this function uses a numerical root finder on
+        the first order conditions of the log-likelihood function to find the
+        MLE.  Only the (optional) `loc` parameter is used as the initial guess
+        for the root finder; the `scale` parameter and any other parameters
+        for the optimizer are ignored.\n\n""")
+    def fit(self, data, *args, **kwds):
+        if kwds.pop('superfit', False):
+            return super().fit(data, *args, **kwds)
+        data, floc, fscale = _check_fit_input_parameters(self, data,
+                                                         args, kwds)
+
+        def scale_mle(loc):
+            # Source: Statistical Distributions, 3rd Edition. Evans, Hastings,
+            # and Peacock (2000), Page 175
+            return (np.sum((data - loc) ** 2) / (2 * len(data))) ** .5
+
+        def loc_mle(loc):
+            # This implicit equation for `loc` is used when
+            # both `loc` and `scale` are free.
+            xm = data - loc
+            s1 = xm.sum()
+            s2 = (xm**2).sum()
+            s3 = (1/xm).sum()
+            return s1 - s2/(2*len(data))*s3
+
+        def loc_mle_scale_fixed(loc, scale=fscale):
+            # This implicit equation for `loc` is used when
+            # `scale` is fixed but `loc` is not.
+            xm = data - loc
+            return xm.sum() - scale**2 * (1/xm).sum()
+
+        if floc is not None:
+            # `loc` is fixed, analytically determine `scale`.
+            if np.any(data - floc <= 0):
+                raise FitDataError("rayleigh", lower=1, upper=np.inf)
+            else:
+                return floc, scale_mle(floc)
+
+        # Account for user provided guess of `loc`.
+        loc0 = kwds.get('loc')
+        if loc0 is None:
+            # Use _fitstart to estimate loc; ignore the returned scale.
+            loc0 = self._fitstart(data)[0]
+
+        fun = loc_mle if fscale is None else loc_mle_scale_fixed
+        rbrack = np.nextafter(np.min(data), -np.inf)
+        lbrack = _get_left_bracket(fun, rbrack)
+        res = optimize.root_scalar(fun, bracket=(lbrack, rbrack))
+        if not res.converged:
+            raise FitSolverError(res.flag)
+        loc = res.root
+        scale = fscale or scale_mle(loc)
+        return loc, scale
+
+
+rayleigh = rayleigh_gen(a=0.0, name="rayleigh")
+
+
+class reciprocal_gen(rv_continuous):
+    r"""A loguniform or reciprocal continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for this class is:
+
+    .. math::
+
+        f(x, a, b) = \frac{1}{x \log(b/a)}
+
+    for :math:`a \le x \le b`, :math:`b > a > 0`. This class takes
+    :math:`a` and :math:`b` as shape parameters.
+
+    %(after_notes)s
+
+    %(example)s
+
+    This doesn't show the equal probability of ``0.01``, ``0.1`` and
+    ``1``. This is best when the x-axis is log-scaled:
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> fig, ax = plt.subplots(1, 1)
+    >>> ax.hist(np.log10(r))
+    >>> ax.set_ylabel("Frequency")
+    >>> ax.set_xlabel("Value of random variable")
+    >>> ax.xaxis.set_major_locator(plt.FixedLocator([-2, -1, 0]))
+    >>> ticks = ["$10^{{ {} }}$".format(i) for i in [-2, -1, 0]]
+    >>> ax.set_xticklabels(ticks)  # doctest: +SKIP
+    >>> plt.show()
+
+    This random variable will be log-uniform regardless of the base chosen for
+    ``a`` and ``b``. Let's specify with base ``2`` instead:
+
+    >>> rvs = %(name)s(2**-2, 2**0).rvs(size=1000)
+
+    Values of ``1/4``, ``1/2`` and ``1`` are equally likely with this random
+    variable.  Here's the histogram:
+
+    >>> fig, ax = plt.subplots(1, 1)
+    >>> ax.hist(np.log2(rvs))
+    >>> ax.set_ylabel("Frequency")
+    >>> ax.set_xlabel("Value of random variable")
+    >>> ax.xaxis.set_major_locator(plt.FixedLocator([-2, -1, 0]))
+    >>> ticks = ["$2^{{ {} }}$".format(i) for i in [-2, -1, 0]]
+    >>> ax.set_xticklabels(ticks)  # doctest: +SKIP
+    >>> plt.show()
+
+    """
+    def _argcheck(self, a, b):
+        return (a > 0) & (b > a)
+
+    def _shape_info(self):
+        ia = _ShapeInfo("a", False, (0, np.inf), (False, False))
+        ib = _ShapeInfo("b", False, (0, np.inf), (False, False))
+        return [ia, ib]
+
+    def _fitstart(self, data):
+        if isinstance(data, CensoredData):
+            data = data._uncensor()
+        # Reasonable, since support is [a, b]
+        return super()._fitstart(data, args=(np.min(data), np.max(data)))
+
+    def _get_support(self, a, b):
+        return a, b
+
+    def _pdf(self, x, a, b):
+        # reciprocal.pdf(x, a, b) = 1 / (x*(log(b) - log(a)))
+        return np.exp(self._logpdf(x, a, b))
+
+    def _logpdf(self, x, a, b):
+        return -np.log(x) - np.log(np.log(b) - np.log(a))
+
+    def _cdf(self, x, a, b):
+        return (np.log(x)-np.log(a)) / (np.log(b) - np.log(a))
+
+    def _ppf(self, q, a, b):
+        return np.exp(np.log(a) + q*(np.log(b) - np.log(a)))
+
+    def _munp(self, n, a, b):
+        if n == 0:
+            return 1.0
+        t1 = 1 / (np.log(b) - np.log(a)) / n
+        t2 = np.real(np.exp(_log_diff(n * np.log(b), n*np.log(a))))
+        return t1 * t2
+
+    def _entropy(self, a, b):
+        return 0.5*(np.log(a) + np.log(b)) + np.log(np.log(b) - np.log(a))
+
+    fit_note = """\
+        `loguniform`/`reciprocal` is over-parameterized. `fit` automatically
+         fixes `scale` to 1 unless `fscale` is provided by the user.\n\n"""
+
+    @extend_notes_in_docstring(rv_continuous, notes=fit_note)
+    def fit(self, data, *args, **kwds):
+        fscale = kwds.pop('fscale', 1)
+        return super().fit(data, *args, fscale=fscale, **kwds)
+
+    # Details related to the decision of not defining
+    # the survival function for this distribution can be
+    # found in the PR: https://github.com/scipy/scipy/pull/18614
+
+
+loguniform = reciprocal_gen(name="loguniform")
+reciprocal = reciprocal_gen(name="reciprocal")
+loguniform._support = ('a', 'b')
+reciprocal._support = ('a', 'b')
+
+
+class rice_gen(rv_continuous):
+    r"""A Rice continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `rice` is:
+
+    .. math::
+
+        f(x, b) = x \exp(- \frac{x^2 + b^2}{2}) I_0(x b)
+
+    for :math:`x >= 0`, :math:`b > 0`. :math:`I_0` is the modified Bessel
+    function of order zero (`scipy.special.i0`).
+
+    `rice` takes ``b`` as a shape parameter for :math:`b`.
+
+    %(after_notes)s
+
+    The Rice distribution describes the length, :math:`r`, of a 2-D vector with
+    components :math:`(U+u, V+v)`, where :math:`U, V` are constant, :math:`u,
+    v` are independent Gaussian random variables with standard deviation
+    :math:`s`.  Let :math:`R = \sqrt{U^2 + V^2}`. Then the pdf of :math:`r` is
+    ``rice.pdf(x, R/s, scale=s)``.
+
+    %(example)s
+
+    """
+    def _argcheck(self, b):
+        return b >= 0
+
+    def _shape_info(self):
+        return [_ShapeInfo("b", False, (0, np.inf), (True, False))]
+
+    def _rvs(self, b, size=None, random_state=None):
+        # https://en.wikipedia.org/wiki/Rice_distribution
+        t = b/np.sqrt(2) + random_state.standard_normal(size=(2,) + size)
+        return np.sqrt((t*t).sum(axis=0))
+
+    def _cdf(self, x, b):
+        return sc.chndtr(np.square(x), 2, np.square(b))
+
+    def _ppf(self, q, b):
+        return np.sqrt(sc.chndtrix(q, 2, np.square(b)))
+
+    def _pdf(self, x, b):
+        # rice.pdf(x, b) = x * exp(-(x**2+b**2)/2) * I[0](x*b)
+        #
+        # We use (x**2 + b**2)/2 = ((x-b)**2)/2 + xb.
+        # The factor of np.exp(-xb) is then included in the i0e function
+        # in place of the modified Bessel function, i0, improving
+        # numerical stability for large values of xb.
+        return x * np.exp(-(x-b)*(x-b)/2.0) * sc.i0e(x*b)
+
+    def _munp(self, n, b):
+        nd2 = n/2.0
+        n1 = 1 + nd2
+        b2 = b*b/2.0
+        return (2.0**(nd2) * np.exp(-b2) * sc.gamma(n1) *
+                sc.hyp1f1(n1, 1, b2))
+
+
+rice = rice_gen(a=0.0, name="rice")
+
+class irwinhall_gen(rv_continuous):
+    r"""An Irwin-Hall (Uniform Sum) continuous random variable.
+
+    An `Irwin-Hall <https://en.wikipedia.org/wiki/Irwin-Hall_distribution/>`_
+    continuous random variable is the sum of :math:`n` independent
+    standard uniform random variables [1]_ [2]_.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    Applications include `Rao's Spacing Test
+    <https://jammalam.faculty.pstat.ucsb.edu/html/favorite/test.htm>`_,
+    a more powerful alternative to the Rayleigh test
+    when the data are not unimodal, and radar [3]_.
+
+    Conveniently, the pdf and cdf are the :math:`n`-fold convolution of
+    the ones for the standard uniform distribution, which is also the
+    definition of the cardinal B-splines of degree :math:`n-1`
+    having knots evenly spaced from :math:`1` to :math:`n` [4]_ [5]_.
+
+    The Bates distribution, which represents the *mean* of statistically
+    independent, uniformly distributed random variables, is simply the
+    Irwin-Hall distribution scaled by :math:`1/n`. For example, the frozen
+    distribution ``bates = irwinhall(10, scale=1/10)`` represents the
+    distribution of the mean of 10 uniformly distributed random variables.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] P. Hall, "The distribution of means for samples of size N drawn
+            from a population in which the variate takes values between 0 and 1,
+            all such values being equally probable",
+            Biometrika, Volume 19, Issue 3-4, December 1927, Pages 240-244,
+            :doi:`10.1093/biomet/19.3-4.240`.
+    .. [2] J. O. Irwin, "On the frequency distribution of the means of samples
+            from a population having any law of frequency with finite moments,
+            with special reference to Pearson's Type II",
+            Biometrika, Volume 19, Issue 3-4, December 1927, Pages 225-239,
+            :doi:`10.1093/biomet/19.3-4.225`.
+    .. [3] K. Buchanan, T. Adeyemi, C. Flores-Molina, S. Wheeland and D. Overturf,
+            "Sidelobe behavior and bandwidth characteristics
+            of distributed antenna arrays,"
+            2018 United States National Committee of
+            URSI National Radio Science Meeting (USNC-URSI NRSM),
+            Boulder, CO, USA, 2018, pp. 1-2.
+            https://www.usnc-ursi-archive.org/nrsm/2018/papers/B15-9.pdf.
+    .. [4] Amos Ron, "Lecture 1: Cardinal B-splines and convolution operators", p. 1
+            https://pages.cs.wisc.edu/~deboor/887/lec1new.pdf.
+    .. [5] Trefethen, N. (2012, July). B-splines and convolution. Chebfun.
+            Retrieved April 30, 2024, from http://www.chebfun.org/examples/approx/BSplineConv.html.
+
+    %(example)s
+    """  # noqa: E501
+
+    @replace_notes_in_docstring(rv_continuous, notes="""\
+        Raises a ``NotImplementedError`` for the Irwin-Hall distribution because
+        the generic `fit` implementation is unreliable and no custom implementation
+        is available. Consider using `scipy.stats.fit`.\n\n""")
+    def fit(self, data, *args, **kwds):
+        # Fitting is deliberately unsupported: every call raises, regardless
+        # of the arguments supplied.
+        fit_notes = ("The generic `fit` implementation is unreliable for this "
+                     "distribution, and no custom implementation is available. "
+                     "Consider using `scipy.stats.fit`.")
+        raise NotImplementedError(fit_notes)
+
+    def _argcheck(self, n):
+        # `n` must be positive, integral (per `_isintegral`) and real-valued.
+        return (n > 0) & _isintegral(n) & np.isrealobj(n)
+
+    def _get_support(self, n):
+        # The support runs from 0 to `n`.
+        return 0, n
+
+    def _shape_info(self):
+        return [_ShapeInfo("n", True, (1, np.inf), (True, False))]
+
+    def _munp(self, order, n):
+        # see https://link.springer.com/content/pdf/10.1007/s10959-020-01050-9.pdf
+        # page 640, with m=n, j=n+order
+        def vmunp(order, n):
+            # Scalar kernel: ratio of a Stirling number of the second kind
+            # to a binomial coefficient, both requested with `exact=True`.
+            n = np.asarray(n, dtype=np.int64)
+            return (sc.stirling2(n+order, n, exact=True)
+                    / sc.comb(n+order, n, exact=True))
+
+        # exact rationals, but we convert to float anyway
+        return np.vectorize(vmunp, otypes=[np.float64])(order, n)
+
+    @staticmethod
+    def _cardbspl(n):
+        # B-spline basis element on the integer knots 0, 1, ..., n.
+        t = np.arange(n+1)
+        return BSpline.basis_element(t)
+
+    def _pdf(self, x, n):
+        # The B-spline is rebuilt for each (x, n) pair; `np.vectorize`
+        # applies the scalar kernel elementwise.
+        def vpdf(x, n):
+            return self._cardbspl(n)(x)
+        return np.vectorize(vpdf, otypes=[np.float64])(x, n)
+
+    def _cdf(self, x, n):
+        # The cdf is the antiderivative of the B-spline, evaluated at `x`.
+        def vcdf(x, n):
+            return self._cardbspl(n).antiderivative()(x)
+        return np.vectorize(vcdf, otypes=[np.float64])(x, n)
+
+    def _sf(self, x, n):
+        # Same antiderivative as `_cdf`, evaluated at the reflected point
+        # `n - x` (presumably relying on the density's symmetry about n/2;
+        # confirm against the B-spline definition).
+        def vsf(x, n):
+            return self._cardbspl(n).antiderivative()(n-x)
+        return np.vectorize(vsf, otypes=[np.float64])(x, n)
+
+    def _rvs(self, n, size=None, random_state=None, *args):
+        @_vectorize_rvs_over_shapes
+        def _rvs1(n, size=None, random_state=None):
+            # Draw `n` uniform variates per requested output element and sum
+            # them along the leading axis.
+            n = np.floor(n).astype(int)
+            usize = (n,) if size is None else (n, *size)
+            return random_state.uniform(size=usize).sum(axis=0)
+        return _rvs1(n, size=size, random_state=random_state)
+
+    def _stats(self, n):
+        # mgf = ((exp(t) - 1)/t)**n
+        # m'th derivative follows from the generalized Leibniz rule
+        # Moments follow directly from the definition as the sum of n iid unif(0,1)
+        # and the summation rules for moments of a sum of iid random variables
+        # E(IH((n))) = n*E(U(0,1)) = n/2
+        # Var(IH((n))) = n*Var(U(0,1)) = n/12
+        # Skew(IH((n))) = Skew(U(0,1))/sqrt(n) = 0
+        # Kurt(IH((n))) = Kurt(U(0,1))/n = -6/(5*n) -- Fisher's excess kurtosis
+        # See e.g. https://en.wikipedia.org/wiki/Irwin%E2%80%93Hall_distribution
+
+        # Returned in the order (mean, variance, skewness, excess kurtosis),
+        # as listed in the comments above.
+        return n/2, n/12, 0, -6/(5*n)
+
+
+# Distribution instance; its support is recorded as running from 0.0 to the
+# shape parameter named 'n'.
+irwinhall = irwinhall_gen(name="irwinhall")
+irwinhall._support = (0.0, 'n')
+
+
+class recipinvgauss_gen(rv_continuous):
+    r"""A reciprocal inverse Gaussian continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `recipinvgauss` is:
+
+    .. math::
+
+        f(x, \mu) = \frac{1}{\sqrt{2\pi x}}
+                    \exp\left(\frac{-(1-\mu x)^2}{2\mu^2x}\right)
+
+    for :math:`x \ge 0`.
+
+    `recipinvgauss` takes ``mu`` as a shape parameter for :math:`\mu`.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("mu", False, (0, np.inf), (False, False))]
+
+    def _pdf(self, x, mu):
+        # recipinvgauss.pdf(x, mu) =
+        #                     1/sqrt(2*pi*x) * exp(-(1-mu*x)**2/(2*x*mu**2))
+        return np.exp(self._logpdf(x, mu))
+
+    def _logpdf(self, x, mu):
+        return xpx.apply_where(
+            x > 0, (x, mu),
+            lambda x, mu: (-(1 - mu*x)**2.0 / (2*x*mu**2.0)
+                           - 0.5*np.log(2*np.pi*x)),
+            fill_value=-np.inf)
+
+    def _cdf(self, x, mu):
+        trm1 = 1.0/mu - x
+        trm2 = 1.0/mu + x
+        isqx = 1.0/np.sqrt(x)
+        return _norm_cdf(-isqx*trm1) - np.exp(2.0/mu)*_norm_cdf(-isqx*trm2)
+
+    def _sf(self, x, mu):
+        trm1 = 1.0/mu - x
+        trm2 = 1.0/mu + x
+        isqx = 1.0/np.sqrt(x)
+        return _norm_cdf(isqx*trm1) + np.exp(2.0/mu)*_norm_cdf(-isqx*trm2)
+
+    def _rvs(self, mu, size=None, random_state=None):
+        return 1.0/random_state.wald(mu, 1.0, size=size)
+
+
+recipinvgauss = recipinvgauss_gen(a=0.0, name='recipinvgauss')
+
+
+class semicircular_gen(rv_continuous):
+    r"""A semicircular continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    rdist
+
+    Notes
+    -----
+    The probability density function for `semicircular` is:
+
+    .. math::
+
+        f(x) = \frac{2}{\pi} \sqrt{1-x^2}
+
+    for :math:`-1 \le x \le 1`.
+
+    The distribution is a special case of `rdist` with ``c = 3``.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] "Wigner semicircle distribution",
+           https://en.wikipedia.org/wiki/Wigner_semicircle_distribution
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return []
+
+    def _pdf(self, x):
+        return 2.0/np.pi*np.sqrt(1-x*x)
+
+    def _logpdf(self, x):
+        return np.log(2/np.pi) + 0.5*sc.log1p(-x*x)
+
+    def _cdf(self, x):
+        return 0.5+1.0/np.pi*(x*np.sqrt(1-x*x) + np.arcsin(x))
+
+    def _ppf(self, q):
+        return rdist._ppf(q, 3)
+
+    def _rvs(self, size=None, random_state=None):
+        # generate values uniformly distributed on the area under the pdf
+        # (semi-circle) by randomly generating the radius and angle
+        r = np.sqrt(random_state.uniform(size=size))
+        a = np.cos(np.pi * random_state.uniform(size=size))
+        return r * a
+
+    def _stats(self):
+        return 0, 0.25, 0, -1.0
+
+    def _entropy(self):
+        return 0.64472988584940017414
+
+
+semicircular = semicircular_gen(a=-1.0, b=1.0, name="semicircular")
+
+
+class skewcauchy_gen(rv_continuous):
+    r"""A skewed Cauchy random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    cauchy : Cauchy distribution
+
+    Notes
+    -----
+
+    The probability density function for `skewcauchy` is:
+
+    .. math::
+
+        f(x) = \frac{1}{\pi \left(\frac{x^2}{\left(a\, \text{sign}(x) + 1
+                                                   \right)^2} + 1 \right)}
+
+    for a real number :math:`x` and skewness parameter :math:`-1 < a < 1`.
+
+    When :math:`a=0`, the distribution reduces to the usual Cauchy
+    distribution.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] "Skewed generalized *t* distribution", Wikipedia
+       https://en.wikipedia.org/wiki/Skewed_generalized_t_distribution#Skewed_Cauchy_distribution
+
+    %(example)s
+
+    """
+    def _argcheck(self, a):
+        return np.abs(a) < 1
+
+    def _shape_info(self):
+        return [_ShapeInfo("a", False, (-1.0, 1.0), (False, False))]
+
+    def _pdf(self, x, a):
+        return 1 / (np.pi * (x**2 / (a * np.sign(x) + 1)**2 + 1))
+
+    def _cdf(self, x, a):
+        return np.where(x <= 0,
+                        (1 - a) / 2 + (1 - a) / np.pi * np.arctan(x / (1 - a)),
+                        (1 - a) / 2 + (1 + a) / np.pi * np.arctan(x / (1 + a)))
+
+    def _ppf(self, x, a):
+        i = x < self._cdf(0, a)
+        return np.where(i,
+                        np.tan(np.pi / (1 - a) * (x - (1 - a) / 2)) * (1 - a),
+                        np.tan(np.pi / (1 + a) * (x - (1 - a) / 2)) * (1 + a))
+
+    def _stats(self, a, moments='mvsk'):
+        return np.nan, np.nan, np.nan, np.nan
+
+    def _fitstart(self, data):
+        # Use 0 as the initial guess of the skewness shape parameter.
+        # For the location and scale, estimate using the median and
+        # quartiles.
+        if isinstance(data, CensoredData):
+            data = data._uncensor()
+        p25, p50, p75 = np.percentile(data, [25, 50, 75])
+        return 0.0, p50, (p75 - p25)/2
+
+
+skewcauchy = skewcauchy_gen(name='skewcauchy')
+
+
+class skewnorm_gen(rv_continuous):
+    r"""A skew-normal random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The pdf is::
+
+        skewnorm.pdf(x, a) = 2 * norm.pdf(x) * norm.cdf(a*x)
+
+    `skewnorm` takes a real number :math:`a` as a skewness parameter
+    When ``a = 0`` the distribution is identical to a normal distribution
+    (`norm`). `rvs` implements the method of [1]_.
+
+    This distribution uses routines from the Boost Math C++ library for
+    the computation of ``cdf``, ``ppf`` and ``isf`` methods. [2]_
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] A. Azzalini and A. Capitanio (1999). Statistical applications of
+        the multivariate skew-normal distribution. J. Roy. Statist. Soc.,
+        B 61, 579-602. :arxiv:`0911.2093`
+    .. [2] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    %(example)s
+
+    """
+    def _argcheck(self, a):
+        return np.isfinite(a)
+
+    def _shape_info(self):
+        return [_ShapeInfo("a", False, (-np.inf, np.inf), (False, False))]
+
+    def _pdf(self, x, a):
+        return xpx.apply_where(
+            a == 0, (x, a),
+            lambda x, a: _norm_pdf(x),
+            lambda x, a: 2.*_norm_pdf(x)*_norm_cdf(a*x))
+
+    def _logpdf(self, x, a):
+        return xpx.apply_where(
+            a == 0, (x, a),
+            lambda x, a: _norm_logpdf(x),
+            lambda x, a: np.log(2)+_norm_logpdf(x)+_norm_logcdf(a*x))
+
+    def _cdf(self, x, a):
+        a = np.atleast_1d(a)
+        cdf = scu._skewnorm_cdf(x, 0.0, 1.0, a)
+        # for some reason, a isn't broadcasted if some of x are invalid
+        a = np.broadcast_to(a, cdf.shape)
+        # Boost is not accurate in left tail when a > 0
+        i_small_cdf = (cdf < 1e-6) & (a > 0)
+        cdf[i_small_cdf] = super()._cdf(x[i_small_cdf], a[i_small_cdf])
+        return np.clip(cdf, 0, 1)
+
+    def _ppf(self, x, a):
+        return scu._skewnorm_ppf(x, 0.0, 1.0, a)
+
+    def _sf(self, x, a):
+        # Boost's SF is implemented this way. Use whatever customizations
+        # we made in the _cdf.
+        return self._cdf(-x, -a)
+
+    def _isf(self, x, a):
+        return scu._skewnorm_isf(x, 0.0, 1.0, a)
+
+    def _rvs(self, a, size=None, random_state=None):
+        u0 = random_state.normal(size=size)
+        v = random_state.normal(size=size)
+        d = a/np.sqrt(1 + a**2)
+        u1 = d*u0 + v*np.sqrt(1 - d**2)
+        return np.where(u0 >= 0, u1, -u1)
+
+    def _stats(self, a, moments='mvsk'):
+        output = [None, None, None, None]
+        const = np.sqrt(2/np.pi) * a/np.sqrt(1 + a**2)
+
+        if 'm' in moments:
+            output[0] = const
+        if 'v' in moments:
+            output[1] = 1 - const**2
+        if 's' in moments:
+            output[2] = ((4 - np.pi)/2) * (const/np.sqrt(1 - const**2))**3
+        if 'k' in moments:
+            output[3] = (2*(np.pi - 3)) * (const**4/(1 - const**2)**2)
+
+        return output
+
+    # For odd order, the each noncentral moment of the skew-normal distribution
+    # with location 0 and scale 1 can be expressed as a polynomial in delta,
+    # where delta = a/sqrt(1 + a**2) and `a` is the skew-normal shape
+    # parameter.  The dictionary _skewnorm_odd_moments defines those
+    # polynomials for orders up to 19.  The dict is implemented as a cached
+    # property to reduce the impact of the creation of the dict on import time.
+    @cached_property
+    def _skewnorm_odd_moments(self):
+        skewnorm_odd_moments = {
+            1: Polynomial([1]),
+            3: Polynomial([3, -1]),
+            5: Polynomial([15, -10, 3]),
+            7: Polynomial([105, -105, 63, -15]),
+            9: Polynomial([945, -1260, 1134, -540, 105]),
+            11: Polynomial([10395, -17325, 20790, -14850, 5775, -945]),
+            13: Polynomial([135135, -270270, 405405, -386100, 225225, -73710,
+                            10395]),
+            15: Polynomial([2027025, -4729725, 8513505, -10135125, 7882875,
+                            -3869775, 1091475, -135135]),
+            17: Polynomial([34459425, -91891800, 192972780, -275675400,
+                            268017750, -175429800, 74220300, -18378360,
+                            2027025]),
+            19: Polynomial([654729075, -1964187225, 4714049340, -7856748900,
+                            9166207050, -7499623950, 4230557100, -1571349780,
+                            346621275, -34459425]),
+        }
+        return skewnorm_odd_moments
+
+    def _munp(self, order, a):
+        if order % 2:
+            if order > 19:
+                raise NotImplementedError("skewnorm noncentral moments not "
+                                          "implemented for odd orders greater "
+                                          "than 19.")
+            # Use the precomputed polynomials that were derived from the
+            # moment generating function.
+            delta = a/np.sqrt(1 + a**2)
+            return (delta * self._skewnorm_odd_moments[order](delta**2)
+                    * _SQRT_2_OVER_PI)
+        else:
+            # For even order, the moment is just (order-1)!!, where !! is the
+            # notation for the double factorial; for an odd integer m, m!! is
+            # m*(m-2)*...*3*1.
+            # We could use special.factorial2, but we know the argument is odd,
+            # so avoid the overhead of that function and compute the result
+            # directly here.
+            return sc.gamma((order + 1)/2) * 2**(order/2) / _SQRT_PI
+
+    @extend_notes_in_docstring(rv_continuous, notes="""\
+        If ``method='mm'``, parameters fixed by the user are respected, and the
+        remaining parameters are used to match distribution and sample moments
+        where possible. For example, if the user fixes the location with
+        ``floc``, the parameters will only match the distribution skewness and
+        variance to the sample skewness and variance; no attempt will be made
+        to match the means or minimize a norm of the errors.
+        Note that the maximum possible skewness magnitude of a
+        `scipy.stats.skewnorm` distribution is approximately 0.9952717; if the
+        magnitude of the data's sample skewness exceeds this, the returned
+        shape parameter ``a`` will be infinite.
+        \n\n""")
+    def fit(self, data, *args, **kwds):
+        if kwds.pop("superfit", False):  # explicit request for generic fit
+            return super().fit(data, *args, **kwds)
+        if isinstance(data, CensoredData):
+            if data.num_censored() == 0:
+                data = data._uncensor()  # nothing censored: use plain data
+            else:
+                return super().fit(data, *args, **kwds)
+
+        # This extracts the fixed shape, location, and scale however they
+        # are specified, and also leaves them in `kwds`.
+        data, fa, floc, fscale = _check_fit_input_parameters(self, data,
+                                                             args, kwds)
+        method = kwds.get("method", "mle").lower()
+
+        # See https://en.wikipedia.org/wiki/Skew_normal_distribution for
+        # moment formulas.
+        def skew_d(d):  # skewness in terms of delta
+            return (4-np.pi)/2 * ((d * np.sqrt(2 / np.pi))**3
+                                  / (1 - 2*d**2 / np.pi)**(3/2))
+
+        def d_skew(skew):  # delta in terms of skewness
+            s_23 = np.abs(skew)**(2/3)
+            return np.sign(skew) * np.sqrt(
+                np.pi/2 * s_23 / (s_23 + ((4 - np.pi)/2)**(2/3))
+            )
+
+        # If method is method of moments, we don't need the user's guesses.
+        # Otherwise, extract the guesses from args and kwds.
+        if method == "mm":
+            a, loc, scale = None, None, None
+        else:
+            a = args[0] if len(args) else None
+            loc = kwds.pop('loc', None)
+            scale = kwds.pop('scale', None)
+
+        if fa is None and a is None:  # not fixed and no guess: use MoM
+            # Solve for a that matches sample distribution skewness to sample
+            # skewness.
+            s = stats.skew(data)  # sample skewness
+            if method == 'mle':
+                # For MLE initial conditions, clip skewness to a large but
+                # reasonable value in case the data skewness is out-of-range.
+                s = np.clip(s, -0.99, 0.99)
+            else:
+                s_max = skew_d(1)  # skewness attained at |delta| = 1
+                s = np.clip(s, -s_max, s_max)
+            d = d_skew(s)
+            with np.errstate(divide='ignore'):  # |d| == 1 gives infinite a
+                a = np.sqrt(np.divide(d**2, (1-d**2)))*np.sign(s)
+        else:
+            a = fa if fa is not None else a  # a fixed value wins over a guess
+            d = a / np.sqrt(1 + a**2)  # delta for the fixed/guessed a
+
+        if fscale is None and scale is None:
+            v = np.var(data)
+            scale = np.sqrt(v / (1 - 2*d**2/np.pi))  # match sample variance
+        elif fscale is not None:
+            scale = fscale
+
+        if floc is None and loc is None:
+            m = np.mean(data)
+            loc = m - scale*d*np.sqrt(2/np.pi)  # match sample mean
+        elif floc is not None:
+            loc = floc
+
+        if method == 'mm':
+            return a, loc, scale
+        else:
+            # At this point, parameter "guesses" may equal the fixed parameters
+            # in kwds. No harm in passing them as guesses, too.
+            return super().fit(data, a, loc=loc, scale=scale, **kwds)
+
+
+skewnorm = skewnorm_gen(name='skewnorm')  # module-level distribution instance
+
+
+class trapezoid_gen(rv_continuous):
+    r"""A trapezoidal continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The trapezoidal distribution can be represented with an up-sloping line
+    from ``loc`` to ``(loc + c*scale)``, then constant to ``(loc + d*scale)``
+    and then downsloping from ``(loc + d*scale)`` to ``(loc+scale)``.  This
+    defines the trapezoid base from ``loc`` to ``(loc+scale)`` and the flat
+    top from ``c`` to ``d`` proportional to the position along the base
+    with ``0 <= c <= d <= 1``.  When ``c=d``, this is equivalent to `triang`
+    with the same values for `loc`, `scale` and `c`.
+    The method of [1]_ is used for computing moments.
+
+    `trapezoid` takes :math:`c` and :math:`d` as shape parameters.
+
+    %(after_notes)s
+
+    The standard form is in the range [0, 1] with the flat top on [c, d].
+    The location parameter shifts the start to `loc`.
+    The scale parameter changes the width from 1 to `scale`.
+
+    %(example)s
+
+    References
+    ----------
+    .. [1] Kacker, R.N. and Lawrence, J.F. (2007). Trapezoidal and triangular
+       distributions for Type B evaluation of standard uncertainty.
+       Metrologia 44, 117-127. :doi:`10.1088/0026-1394/44/2/003`
+
+
+    """
+    def _argcheck(self, c, d):
+        return (c >= 0) & (c <= 1) & (d >= 0) & (d <= 1) & (d >= c)
+
+    def _shape_info(self):
+        ic = _ShapeInfo("c", False, (0, 1.0), (True, True))
+        id = _ShapeInfo("d", False, (0, 1.0), (True, True))
+        return [ic, id]
+
+    def _pdf(self, x, c, d):
+        u = 2 / (d-c+1)  # height of the flat top
+
+        return _lazyselect([x < c,
+                            (c <= x) & (x <= d),
+                            x > d],
+                           [lambda x, c, d, u: u * x / c,
+                            lambda x, c, d, u: u,
+                            lambda x, c, d, u: u * (1-x) / (1-d)],
+                           (x, c, d, u))
+
+    def _cdf(self, x, c, d):
+        return _lazyselect([x < c,
+                            (c <= x) & (x <= d),
+                            x > d],
+                           [lambda x, c, d: x**2 / c / (d-c+1),
+                            lambda x, c, d: (c + 2 * (x-c)) / (d-c+1),
+                            lambda x, c, d: 1-((1-x) ** 2
+                                               / (d-c+1) / (1-d))],
+                           (x, c, d))
+
+    def _ppf(self, q, c, d):
+        qc, qd = self._cdf(c, c, d), self._cdf(d, c, d)  # cdf at the breaks
+        condlist = [q < qc, q <= qd, q > qd]
+        choicelist = [np.sqrt(q * c * (1 + d - c)),
+                      0.5 * q * (1 + d - c) + 0.5 * c,
+                      1 - np.sqrt((1 - q) * (d - c + 1) * (1 - d))]
+        return np.select(condlist, choicelist)
+
+    def _munp(self, n, c, d):
+        # Using the parameterization from Kacker, 2007, with
+        # a=bottom left, c=top left, d=top right, b=bottom right, then
+        #     E[X^n] = h/(n+1)/(n+2) [(b^{n+2}-d^{n+2})/(b-d)
+        #                             - (c^{n+2} - a^{n+2})/(c-a)]
+        # with h = 2/((b-a) - (d-c)). The corresponding parameterization
+        # in scipy, has a'=loc, c'=loc+c*scale, d'=loc+d*scale, b'=loc+scale,
+        # which for standard form reduces to a'=0, b'=1, c'=c, d'=d.
+        # Substituting into E[X^n] gives the bd' term (`dc_term` below) as
+        # (1 - d^{n+2})/(1 - d) and the ac' term (`ab_term` below) as c^{n+1}
+        # for the standard form. The bd' term has numerical difficulties near
+        # d=1, so replace (1 - d^{n+2})/(1-d) with expm1((n+2)*log(d))/(d-1).
+        # Testing with n=18 for c=(1e-30,1-eps) shows that this is stable.
+        # We still require an explicit test for d=1 to prevent divide by zero,
+        # and now a test for d=0 to prevent log(0).
+        ab_term = c**(n+1)
+        dc_term = _lazyselect(
+            [d == 0.0, (0.0 < d) & (d < 1.0), d == 1.0],
+            [lambda d: 1.0,
+             lambda d: np.expm1((n+2) * np.log(d)) / (d-1.0),
+             lambda d: n+2],
+            [d])
+        val = 2.0 / (1.0+d-c) * (dc_term - ab_term) / ((n+1) * (n+2))
+        return val
+
+    def _entropy(self, c, d):
+        # Using the parameterization from Wikipedia (van Dorp, 2003)
+        # with a=bottom left, c=top left, d=top right, b=bottom right
+        # gives a'=loc, b'=loc+c*scale, c'=loc+d*scale, d'=loc+scale,
+        # which for loc=0, scale=1 is a'=0, b'=c, c'=d, d'=1.
+        # Substituting into the entropy formula from Wikipedia gives
+        # the following result.
+        return 0.5 * (1.0-d+c) / (1.0+d-c) + np.log(0.5 * (1.0+d-c))
+
+    def _fitstart(self, data, args=None):
+        # Arbitrary, but c=d=1 fails due to being on edge of bounds
+        if args is None:
+            args = (0.33, 0.66)
+        return super()._fitstart(data, args=args)
+
+
+trapezoid = trapezoid_gen(a=0.0, b=1.0, name="trapezoid")  # standard [0, 1]
+
+
+class triang_gen(rv_continuous):
+    r"""A triangular continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The triangular distribution can be represented with an up-sloping line from
+    ``loc`` to ``(loc + c*scale)`` and then downsloping for ``(loc + c*scale)``
+    to ``(loc + scale)``.
+
+    `triang` takes ``c`` as a shape parameter for :math:`0 \le c \le 1`.
+
+    %(after_notes)s
+
+    The standard form is in the range [0, 1] with c the mode.
+    The location parameter shifts the start to `loc`.
+    The scale parameter changes the width from 1 to `scale`.
+
+    %(example)s
+
+    """
+    def _rvs(self, c, size=None, random_state=None):
+        return random_state.triangular(0, c, 1, size)  # left=0, mode=c, right=1
+
+    def _argcheck(self, c):
+        return (c >= 0) & (c <= 1)
+
+    def _shape_info(self):
+        return [_ShapeInfo("c", False, (0, 1.0), (True, True))]
+
+    def _pdf(self, x, c):
+        # 0: edge case where c=0
+        # 1: generalised case for x < c, don't use x <= c, as it doesn't cope
+        #    with c = 0.
+        # 2: generalised case for x >= c, but doesn't cope with c = 1
+        # 3: edge case where c=1
+        r = _lazyselect([c == 0,
+                         x < c,
+                         (x >= c) & (c != 1),
+                         c == 1],
+                        [lambda x, c: 2 - 2 * x,
+                         lambda x, c: 2 * x / c,
+                         lambda x, c: 2 * (1 - x) / (1 - c),
+                         lambda x, c: 2 * x],
+                        (x, c))
+        return r
+
+    def _cdf(self, x, c):
+        r = _lazyselect([c == 0,  # same four cases as in _pdf
+                         x < c,
+                         (x >= c) & (c != 1),
+                         c == 1],
+                        [lambda x, c: 2*x - x*x,
+                         lambda x, c: x * x / c,
+                         lambda x, c: (x*x - 2*x + c) / (c-1),
+                         lambda x, c: x * x],
+                        (x, c))
+        return r
+
+    def _ppf(self, q, c):
+        return np.where(q < c, np.sqrt(c * q), 1-np.sqrt((1-c) * (1-q)))
+
+    def _stats(self, c):
+        return ((c+1.0)/3.0,  # presumably (mean, variance, skew, kurtosis)
+                (1.0-c+c*c)/18,
+                np.sqrt(2)*(2*c-1)*(c+1)*(c-2) / (5*np.power((1.0-c+c*c), 1.5)),
+                -3.0/5.0)
+
+    def _entropy(self, c):
+        return 0.5-np.log(2)  # the same value for every c
+
+
+triang = triang_gen(a=0.0, b=1.0, name="triang")  # standard support [0, 1]
+
+
+class truncexpon_gen(rv_continuous):
+    r"""A truncated exponential continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `truncexpon` is:
+
+    .. math::
+
+        f(x, b) = \frac{\exp(-x)}{1 - \exp(-b)}
+
+    for :math:`0 <= x <= b`.
+
+    `truncexpon` takes ``b`` as a shape parameter for :math:`b`.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("b", False, (0, np.inf), (False, False))]
+
+    def _get_support(self, b):
+        return self.a, b  # the upper end of the support is the shape b
+
+    def _pdf(self, x, b):
+        # truncexpon.pdf(x, b) = exp(-x) / (1-exp(-b))
+        return np.exp(-x)/(-sc.expm1(-b))  # -expm1(-b) == 1 - exp(-b)
+
+    def _logpdf(self, x, b):
+        return -x - np.log(-sc.expm1(-b))
+
+    def _cdf(self, x, b):
+        return sc.expm1(-x)/sc.expm1(-b)  # (1 - exp(-x)) / (1 - exp(-b))
+
+    def _ppf(self, q, b):
+        return -sc.log1p(q*sc.expm1(-b))  # inverts _cdf
+
+    def _sf(self, x, b):
+        return (np.exp(-b) - np.exp(-x))/sc.expm1(-b)
+
+    def _isf(self, q, b):
+        return -np.log(np.exp(-b) - q * sc.expm1(-b))  # inverts _sf
+
+    def _munp(self, n, b):
+        # wrong answer with formula, same as in continuous.pdf
+        # return sc.gamma(n+1)-sc.gammainc(1+n, b)
+        if n == 1:
+            return (1-(b+1)*np.exp(-b))/(-sc.expm1(-b))
+        elif n == 2:
+            return 2*(1-0.5*(b*b+2*b+2)*np.exp(-b))/(-sc.expm1(-b))
+        else:
+            # return generic for higher moments
+            return super()._munp(n, b)
+
+    def _entropy(self, b):
+        eB = np.exp(b)
+        return np.log(eB-1)+(1+eB*(b-1.0))/(1.0-eB)
+
+
+truncexpon = truncexpon_gen(a=0.0, name='truncexpon')
+truncexpon._support = (0.0, 'b')  # support is [0, b], as in _get_support
+
+
+# Compute log(p + q) from log(p) and log(q) alone via the logsumexp trick.
+def _log_sum(log_p, log_q):
+    return sc.logsumexp([log_p, log_q], axis=0)  # stacks on a new axis 0
+
+
+# log(p - q) from log(p), log(q): as _log_sum, using -exp(x) = exp(x + πi)
+def _log_diff(log_p, log_q):
+    return sc.logsumexp([log_p, log_q+np.pi*1j], axis=0)  # complex inputs
+
+
+def _log_gauss_mass(a, b):
+    """Log of Gaussian probability mass within an interval"""
+    a, b = np.broadcast_arrays(a, b)
+
+    # Calculations in right tail are inaccurate, so we'll exploit the
+    # symmetry and work only in the left tail
+    case_left = b <= 0  # interval lies at or to the left of 0
+    case_right = a > 0  # interval lies strictly to the right of 0
+    case_central = ~(case_left | case_right)  # every remaining element
+
+    def mass_case_left(a, b):
+        return _log_diff(_norm_logcdf(b), _norm_logcdf(a))
+
+    def mass_case_right(a, b):
+        return mass_case_left(-b, -a)  # mirror the interval into the left tail
+
+    def mass_case_central(a, b):
+        # Previously, this was implemented as:
+        # left_mass = mass_case_left(a, 0)
+        # right_mass = mass_case_right(0, b)
+        # return _log_sum(left_mass, right_mass)
+        # Catastrophic cancellation occurs as np.exp(log_mass) approaches 1.
+        # Correct for this with an alternative formulation.
+        # We're not concerned with underflow here: if only one term
+        # underflows, it was insignificant; if both terms underflow,
+        # the result can't accurately be represented in logspace anyway
+        # because sc.log1p(x) ~ x for small x.
+        return sc.log1p(-_norm_cdf(a) - _norm_cdf(-b))
+
+    # _lazyselect did not work here, so each case is filled via its mask
+    out = np.full_like(a, fill_value=np.nan, dtype=np.complex128)
+    if a[case_left].size:
+        out[case_left] = mass_case_left(a[case_left], b[case_left])
+    if a[case_right].size:
+        out[case_right] = mass_case_right(a[case_right], b[case_right])
+    if a[case_central].size:
+        out[case_central] = mass_case_central(a[case_central], b[case_central])
+    return np.real(out)  # discard ~0j
+
+
+class truncnorm_gen(rv_continuous):
+    r"""A truncated normal continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    This distribution is the normal distribution centered on ``loc`` (default
+    0), with standard deviation ``scale`` (default 1), and truncated at ``a``
+    and ``b`` *standard deviations* from ``loc``. For arbitrary ``loc`` and
+    ``scale``, ``a`` and ``b`` are *not* the abscissae at which the shifted
+    and scaled distribution is truncated.
+
+    .. note::
+        If ``a_trunc`` and ``b_trunc`` are the abscissae at which we wish
+        to truncate the distribution (as opposed to the number of standard
+        deviations from ``loc``), then we can calculate the distribution
+        parameters ``a`` and ``b`` as follows::
+
+            a, b = (a_trunc - loc) / scale, (b_trunc - loc) / scale
+
+        This is a common point of confusion. For additional clarification,
+        please see the example below.
+
+    %(example)s
+
+    In the examples above, ``loc=0`` and ``scale=1``, so the plot is truncated
+    at ``a`` on the left and ``b`` on the right. However, suppose we were to
+    produce the same histogram with ``loc = 1`` and ``scale=0.5``.
+
+    >>> loc, scale = 1, 0.5
+    >>> rv = truncnorm(a, b, loc=loc, scale=scale)
+    >>> x = np.linspace(truncnorm.ppf(0.01, a, b),
+    ...                 truncnorm.ppf(0.99, a, b), 100)
+    >>> r = rv.rvs(size=1000)
+
+    >>> fig, ax = plt.subplots(1, 1)
+    >>> ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')
+    >>> ax.hist(r, density=True, bins='auto', histtype='stepfilled', alpha=0.2)
+    >>> ax.set_xlim(a, b)
+    >>> ax.legend(loc='best', frameon=False)
+    >>> plt.show()
+
+    Note that the distribution no longer appears to be truncated at
+    abscissae ``a`` and ``b``. That is because the *standard* normal
+    distribution is first truncated at ``a`` and ``b``, *then* the resulting
+    distribution is scaled by ``scale`` and shifted by ``loc``. If we instead
+    want the shifted and scaled distribution to be truncated at ``a`` and
+    ``b``, we need to transform these values before passing them as the
+    distribution parameters.
+
+    >>> a_transformed, b_transformed = (a - loc) / scale, (b - loc) / scale
+    >>> rv = truncnorm(a_transformed, b_transformed, loc=loc, scale=scale)
+    >>> x = np.linspace(truncnorm.ppf(0.01, a, b),
+    ...                 truncnorm.ppf(0.99, a, b), 100)
+    >>> r = rv.rvs(size=10000)
+
+    >>> fig, ax = plt.subplots(1, 1)
+    >>> ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')
+    >>> ax.hist(r, density=True, bins='auto', histtype='stepfilled', alpha=0.2)
+    >>> ax.set_xlim(a-0.1, b+0.1)
+    >>> ax.legend(loc='best', frameon=False)
+    >>> plt.show()
+    """
+
+    def _argcheck(self, a, b):
+        return a < b  # the truncation interval must be nonempty
+
+    def _shape_info(self):
+        ia = _ShapeInfo("a", False, (-np.inf, np.inf), (True, False))
+        ib = _ShapeInfo("b", False, (-np.inf, np.inf), (False, True))
+        return [ia, ib]
+
+    def _fitstart(self, data):
+        # Reasonable, since support is [a, b]
+        if isinstance(data, CensoredData):
+            data = data._uncensor()
+        return super()._fitstart(data, args=(np.min(data), np.max(data)))
+
+    def _get_support(self, a, b):
+        return a, b
+
+    def _pdf(self, x, a, b):
+        return np.exp(self._logpdf(x, a, b))
+
+    def _logpdf(self, x, a, b):
+        return _norm_logpdf(x) - _log_gauss_mass(a, b)
+
+    def _cdf(self, x, a, b):
+        return np.exp(self._logcdf(x, a, b))
+
+    def _logcdf(self, x, a, b):
+        x, a, b = np.broadcast_arrays(x, a, b)
+        logcdf = np.asarray(_log_gauss_mass(a, x) - _log_gauss_mass(a, b))
+        i = logcdf > -0.1  # avoid catastrophic cancellation
+        if np.any(i):  # cdf close to 1: recompute as log1p(-sf)
+            logcdf[i] = np.log1p(-np.exp(self._logsf(x[i], a[i], b[i])))
+        return logcdf
+
+    def _sf(self, x, a, b):
+        return np.exp(self._logsf(x, a, b))
+
+    def _logsf(self, x, a, b):
+        x, a, b = np.broadcast_arrays(x, a, b)
+        logsf = np.asarray(_log_gauss_mass(x, b) - _log_gauss_mass(a, b))
+        i = logsf > -0.1  # avoid catastrophic cancellation
+        if np.any(i):  # sf close to 1: recompute as log1p(-cdf)
+            logsf[i] = np.log1p(-np.exp(self._logcdf(x[i], a[i], b[i])))
+        return logsf
+
+    def _entropy(self, a, b):
+        A = _norm_cdf(a)
+        B = _norm_cdf(b)
+        Z = B - A  # normalizing mass of [a, b]
+        C = np.log(np.sqrt(2 * np.pi * np.e) * Z)
+        D = (a * _norm_pdf(a) - b * _norm_pdf(b)) / (2 * Z)
+        h = C + D
+        return h
+
+    def _ppf(self, q, a, b):
+        q, a, b = np.broadcast_arrays(q, a, b)
+
+        case_left = a < 0
+        case_right = ~case_left
+
+        def ppf_left(q, a, b):  # Phi(x) = Phi(a) + q * mass(a, b)
+            log_Phi_x = _log_sum(_norm_logcdf(a),
+                                 np.log(q) + _log_gauss_mass(a, b))
+            return sc.ndtri_exp(log_Phi_x)
+
+        def ppf_right(q, a, b):  # Phi(-x) = Phi(-b) + (1 - q) * mass(a, b)
+            log_Phi_x = _log_sum(_norm_logcdf(-b),
+                                 np.log1p(-q) + _log_gauss_mass(a, b))
+            return -sc.ndtri_exp(log_Phi_x)
+
+        out = np.empty_like(q)
+
+        q_left = q[case_left]
+        q_right = q[case_right]
+
+        if q_left.size:
+            out[case_left] = ppf_left(q_left, a[case_left], b[case_left])
+        if q_right.size:
+            out[case_right] = ppf_right(q_right, a[case_right], b[case_right])
+
+        return out
+
+    def _isf(self, q, a, b):
+        # Mostly copy-paste of _ppf, but I think this is simpler than combining
+        q, a, b = np.broadcast_arrays(q, a, b)
+
+        case_left = b < 0
+        case_right = ~case_left
+
+        def isf_left(q, a, b):  # Phi(x) = Phi(b) - q * mass(a, b)
+            log_Phi_x = _log_diff(_norm_logcdf(b),
+                                  np.log(q) + _log_gauss_mass(a, b))
+            return sc.ndtri_exp(np.real(log_Phi_x))
+
+        def isf_right(q, a, b):  # Phi(-x) = Phi(-a) - (1 - q) * mass(a, b)
+            log_Phi_x = _log_diff(_norm_logcdf(-a),
+                                  np.log1p(-q) + _log_gauss_mass(a, b))
+            return -sc.ndtri_exp(np.real(log_Phi_x))
+
+        out = np.empty_like(q)
+
+        q_left = q[case_left]
+        q_right = q[case_right]
+
+        if q_left.size:
+            out[case_left] = isf_left(q_left, a[case_left], b[case_left])
+        if q_right.size:
+            out[case_right] = isf_right(q_right, a[case_right], b[case_right])
+
+        return out
+
+    def _munp(self, n, a, b):
+        def n_th_moment(n, a, b):
+            """
+            Returns n-th moment. Defined only if n >= 0.
+            Function cannot broadcast due to the loop over n
+            """
+            ab = np.asarray([a, b])
+            pA, pB = self._pdf(ab, a, b)
+            probs = np.asarray([pA, -pB])
+            cond = probs != 0
+            moments = [0, 1]  # seeds m_{-1} = 0 and m_0 = 1 for the recurrence
+            for k in range(1, n+1):
+                # a or b might be infinite, and the corresponding pdf value
+                # is 0 in that case, but nan is returned for the
+                # multiplication.  However, as b->infinity,  pdf(b)*b**k -> 0.
+                # So it is safe to use xpx.apply_where to avoid the nan.
+                vals = xpx.apply_where(cond, (probs, ab),
+                                       lambda x, y: x * y**(k-1),
+                                       fill_value=0)
+                mk = np.sum(vals) + (k-1) * moments[-2]
+                moments.append(mk)
+            return moments[-1]
+
+        return xpx.apply_where((n >= 0) & (a == a) & (b == b), (n, a, b),
+                               np.vectorize(n_th_moment, otypes=[np.float64]),
+                               fill_value=np.nan)  # nan if n < 0 or a/b is nan
+
+    def _stats(self, a, b, moments='mv'):
+        pA, pB = self.pdf(np.array([a, b]), a, b)  # density at both end points
+
+        def _truncnorm_stats_scalar(a, b, pA, pB):
+            ab = np.asarray([a, b])
+            m1 = pA - pB
+            mu = m1
+            # use xpx.apply_where to avoid nan (See detailed comment in _munp)
+            probs = np.asarray([pA, -pB])
+            cond = probs != 0
+            vals = xpx.apply_where(cond, (probs, ab), lambda x, y: x*y,
+                                   fill_value=0)
+            m2 = 1 + np.sum(vals)
+            vals = xpx.apply_where(cond, (probs, ab - mu), lambda x, y: x*y,
+                                   fill_value=0)
+            # mu2 = m2 - mu**2, but not as numerically stable as:
+            # mu2 = (a-mu)*pA - (b-mu)*pB + 1
+            mu2 = 1 + np.sum(vals)
+            vals = xpx.apply_where(cond, (probs, ab), lambda x, y: x*y**2,
+                                   fill_value=0)
+            m3 = 2*m1 + np.sum(vals)
+            vals = xpx.apply_where(cond, (probs, ab), lambda x, y: x*y**3,
+                                   fill_value=0)
+            m4 = 3*m2 + np.sum(vals)
+
+            mu3 = m3 + m1 * (-3*m2 + 2*m1**2)
+            g1 = mu3 / np.power(mu2, 1.5)
+            mu4 = m4 + m1*(-4*m3 + 3*m1*(2*m2 - m1**2))
+            g2 = mu4 / mu2**2 - 3
+            return mu, mu2, g1, g2
+
+        _truncnorm_stats = np.vectorize(_truncnorm_stats_scalar)
+        return _truncnorm_stats(a, b, pA, pB)
+
+
+truncnorm = truncnorm_gen(name='truncnorm', momtype=1)
+truncnorm._support = ('a', 'b')  # bounds are the shapes, as in _get_support
+
+
+class truncpareto_gen(rv_continuous):
+    r"""An upper truncated Pareto continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    pareto : Pareto distribution
+
+    Notes
+    -----
+    The probability density function for `truncpareto` is:
+
+    .. math::
+
+        f(x, b, c) = \frac{b}{1 - c^{-b}} \frac{1}{x^{b+1}}
+
+    for :math:`b \neq 0`, :math:`c > 1` and :math:`1 \le x \le c`.
+
+    `truncpareto` takes `b` and `c` as shape parameters for :math:`b` and
+    :math:`c`.
+
+    Notice that the upper truncation value :math:`c` is defined in
+    standardized form so that random values of an unscaled, unshifted variable
+    are within the range ``[1, c]``.
+    If ``u_r`` is the upper bound to a scaled and/or shifted variable,
+    then ``c = (u_r - loc) / scale``. In other words, the support of the
+    distribution becomes ``(scale + loc) <= x <= (c*scale + loc)`` when
+    `scale` and/or `loc` are provided.
+
+    The ``fit`` method assumes that :math:`b` is positive; it does not produce
+    good results when the data is more consistent with negative :math:`b`.
+
+    `truncpareto` can also be used to model a general power law distribution
+    with PDF:
+
+    .. math::
+
+        f(x; a, l, h) = \frac{a}{h^a - l^a} x^{a-1}
+
+    for :math:`a \neq 0` and :math:`0 < l < x < h`. Suppose :math:`a`,
+    :math:`l`, and :math:`h` are represented in code as ``a``, ``l``, and
+    ``h``, respectively. In this case, use `truncpareto` with parameters
+    ``b = -a``, ``c = h / l``, ``scale = l``, and ``loc = 0``.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] Burroughs, S. M., and Tebbens S. F.
+        "Upper-truncated power laws in natural systems."
+        Pure and Applied Geophysics 158.4 (2001): 741-757.
+
+    %(example)s
+
+    """
+
+    def _shape_info(self):
+        ib = _ShapeInfo("b", False, (-np.inf, np.inf), (False, False))
+        ic = _ShapeInfo("c", False, (1.0, np.inf), (False, False))
+        return [ib, ic]
+
+    def _argcheck(self, b, c):
+        return (b != 0.) & (c > 1.)  # elementwise: b nonzero and c above 1
+
+    def _get_support(self, b, c):
+        return self.a, c  # lower bound from the instance, upper bound is c
+
+    def _pdf(self, x, b, c):
+        # here and below, promote to float to avoid int to negative int power
+        x, b, c = xp_promote(x, b, c, force_floating=True, xp=np)
+        return b * x**-(b+1) / (1 - 1/c**b)  # b / (1 - c**-b) / x**(b+1)
+
+    def _logpdf(self, x, b, c):  # b > 0: _logpdf_pos_b; else generic fallback
+        x, b, c = xp_promote(x, b, c, force_floating=True, xp=np)
+        return xpx.apply_where(b > 0, (x, b, c), self._logpdf_pos_b, super()._logpdf)
+
+    def _logpdf_pos_b(self, x, b, c):  # -expm1(-b*log(c)) == 1 - c**-b
+        return np.log(b) - np.log(-np.expm1(-b*np.log(c))) - (b+1)*np.log(x)
+
+    def _cdf(self, x, b, c):
+        x, b, c = xp_promote(x, b, c, force_floating=True, xp=np)
+        return (1 - x**-b) / (1 - 1/c**b)  # equals 0 at x = 1 and 1 at x = c
+
+    def _logcdf(self, x, b, c):  # b > 0: _logcdf_pos_b; else generic fallback
+        x, b, c = xp_promote(x, b, c, force_floating=True, xp=np)
+        return xpx.apply_where(b > 0, (x, b, c), self._logcdf_pos_b, super()._logcdf)
+
+    def _logcdf_pos_b(self, x, b, c):  # log of _cdf written with log1p
+        return np.log1p(-x**-b) - np.log1p(-1/c**b)
+
+    def _ppf(self, q, b, c):
+        q, b, c = xp_promote(q, b, c, force_floating=True, xp=np)
+        return pow(1 - (1 - 1/c**b)*q, -1/b)  # solves _cdf(x) == q for x
+
+    def _sf(self, x, b, c):
+        x, b, c = xp_promote(x, b, c, force_floating=True, xp=np)
+        return (x**-b - 1/c**b) / (1 - 1/c**b)  # equals 1 at x = 1, 0 at x = c
+
+    def _logsf(self, x, b, c):  # b > 0: _logsf_pos_b; else generic fallback
+        x, b, c = xp_promote(x, b, c, force_floating=True, xp=np)
+        return xpx.apply_where(b > 0, (x, b, c), self._logsf_pos_b, super()._logsf)
+
+    def _logsf_pos_b(self, x, b, c):  # log of the _sf expression
+        return np.log(x**-b - 1/c**b) - np.log1p(-1/c**b)
+
+    def _isf(self, q, b, c):
+        q, b, c = xp_promote(q, b, c, force_floating=True, xp=np)
+        return pow(1/c**b + (1 - 1/c**b)*q, -1/b)  # solves _sf(x) == q for x
+
+    def _entropy(self, b, c):  # closed-form expression in b and c
+        return -(np.log(b/(1 - 1/c**b))
+                 + (b+1)*(np.log(c)/(c**b - 1) - 1/b))
+
+    def _munp(self, n, b, c):
+        n, b, c = xp_promote(n, b, c, force_floating=True, xp=np)
+        if (n == b).all():
+            return b*np.log(c) / (1 - 1/c**b)
+        else:
+            return b / (b-n) * (c**b - c**n) / (c**b - 1)
+
+    def _fitstart(self, data):
+        if isinstance(data, CensoredData):
+            data = data._uncensor()
+        b, loc, scale = pareto.fit(data)
+        c = (max(data) - loc)/scale
+        return b, c, loc, scale
+
+    @_call_super_mom
+    @inherit_docstring_from(rv_continuous)
+    def fit(self, data, *args, **kwds):
+        if kwds.pop("superfit", False):
+            return super().fit(data, *args, **kwds)
+
+        def log_mean(x):
+            return np.mean(np.log(x))
+
+        def harm_mean(x):
+            return 1/np.mean(1/x)
+
+        def get_b(c, loc, scale):
+            u = (data-loc)/scale
+            harm_m = harm_mean(u)
+            log_m = log_mean(u)
+            quot = (harm_m-1)/log_m
+            return (1 - (quot-1) / (quot - (1 - 1/c)*harm_m/np.log(c)))/log_m
+
+        def get_c(loc, scale):
+            return (mx - loc)/scale
+
+        def get_loc(fc, fscale):
+            if fscale:  # (fscale and fc) or (fscale and not fc)
+                loc = mn - fscale
+                return loc
+            if fc:
+                loc = (fc*mn - mx)/(fc - 1)
+                return loc
+
+        def get_scale(loc):
+            return mn - loc
+
+        # Functions used for optimisation; partial derivatives of
+        # the Lagrangian, set to equal 0.
+
+        def dL_dLoc(loc, b_=None):
+            # Partial derivative wrt location.
+            # Optimised upon when no parameters, or only b, are fixed.
+            scale = get_scale(loc)
+            c = get_c(loc, scale)
+            b = get_b(c, loc, scale) if b_ is None else b_
+            harm_m = harm_mean((data - loc)/scale)
+            return 1 - (1 + (c - 1)/(c**(b+1) - c)) * (1 - 1/(b+1)) * harm_m
+
+        def dL_dB(b, logc, logm):
+            # Partial derivative wrt b.
+            # Optimised upon whenever at least one parameter but b is fixed,
+            # and b is free.
+            return b - np.log1p(b*logc / (1 - b*logm)) / logc
+
+        def fallback(data, *args, **kwargs):
+            # Should any issue arise, default to the general fit method.
+            return super(truncpareto_gen, self).fit(data, *args, **kwargs)
+
+        parameters = _check_fit_input_parameters(self, data, args, kwds)
+        data, fb, fc, floc, fscale = parameters
+        mn, mx = data.min(), data.max()
+        mn_inf = np.nextafter(mn, -np.inf)
+
+        if (fb is not None
+                and fc is not None
+                and floc is not None
+                and fscale is not None):
+            raise ValueError("All parameters fixed."
+                             "There is nothing to optimize.")
+        elif fc is None and floc is None and fscale is None:
+            if fb is None:
+                def cond_b(loc):
+                    # b is positive only if this function is positive
+                    scale = get_scale(loc)
+                    c = get_c(loc, scale)
+                    harm_m = harm_mean((data - loc)/scale)
+                    return (1 + 1/(c-1)) * np.log(c) / harm_m - 1
+
+                # This gives an upper bound on loc allowing for a positive b.
+                # Iteratively look for a bracket for root_scalar.
+                mn_inf = np.nextafter(mn, -np.inf)
+                rbrack = mn_inf
+                i = 0
+                lbrack = rbrack - 1
+                while ((lbrack > -np.inf)
+                       and (cond_b(lbrack)*cond_b(rbrack) >= 0)):
+                    i += 1
+                    lbrack = rbrack - np.power(2., i)
+                if not lbrack > -np.inf:
+                    return fallback(data, *args, **kwds)
+                res = root_scalar(cond_b, bracket=(lbrack, rbrack))
+                if not res.converged:
+                    return fallback(data, *args, **kwds)
+
+                # Determine the MLE for loc.
+                # Iteratively look for a bracket for root_scalar.
+                rbrack = res.root - 1e-3  # grad_loc is numerically ill-behaved
+                lbrack = rbrack - 1
+                i = 0
+                while ((lbrack > -np.inf)
+                       and (dL_dLoc(lbrack)*dL_dLoc(rbrack) >= 0)):
+                    i += 1
+                    lbrack = rbrack - np.power(2., i)
+                if not lbrack > -np.inf:
+                    return fallback(data, *args, **kwds)
+                res = root_scalar(dL_dLoc, bracket=(lbrack, rbrack))
+                if not res.converged:
+                    return fallback(data, *args, **kwds)
+                loc = res.root
+                scale = get_scale(loc)
+                c = get_c(loc, scale)
+                b = get_b(c, loc, scale)
+
+                std_data = (data - loc)/scale
+                # The expression of b relies on b being bounded above.
+                up_bound_b = min(1/log_mean(std_data),
+                                 1/(harm_mean(std_data)-1))
+                if not (b < up_bound_b):
+                    return fallback(data, *args, **kwds)
+            else:
+                # We know b is positive (or a FitError will be triggered)
+                # so we let loc get close to min(data).
+                rbrack = mn_inf
+                lbrack = mn_inf - 1
+                i = 0
+                # Iteratively look for a bracket for root_scalar.
+                while (lbrack > -np.inf
+                       and (dL_dLoc(lbrack, fb)
+                            * dL_dLoc(rbrack, fb) >= 0)):
+                    i += 1
+                    lbrack = rbrack - 2**i
+                if not lbrack > -np.inf:
+                    return fallback(data, *args, **kwds)
+                res = root_scalar(dL_dLoc, (fb,),
+                                  bracket=(lbrack, rbrack))
+                if not res.converged:
+                    return fallback(data, *args, **kwds)
+                loc = res.root
+                scale = get_scale(loc)
+                c = get_c(loc, scale)
+                b = fb
+        else:
+            # At least one of the parameters determining the support is fixed;
+            # the others then have analytical expressions from the constraints.
+            # The completely determined case (fixed c, loc and scale)
+            # has to be checked for not overflowing the support.
+            # If not fixed, b has to be determined numerically.
+            loc = floc if floc is not None else get_loc(fc, fscale)
+            scale = fscale or get_scale(loc)
+            c = fc or get_c(loc, scale)
+
+            # Unscaled, translated values should be positive when the location
+            # is fixed. If it is not the case, we end up with negative `scale`
+            # and `c`, which would trigger a FitError before exiting the
+            # method.
+            if floc is not None and data.min() - floc < 0:
+                raise FitDataError("truncpareto", lower=1, upper=c)
+
+            # Standardised values should be within the distribution support
+            # when all parameters controlling it are fixed. Otherwise,
+            # `fc` is overridden by `c` determined from `floc` and `fscale` when
+            # raising the exception.
+            if fc and (floc is not None) and fscale:
+                if data.max() > fc*fscale + floc:
+                    raise FitDataError("truncpareto", lower=1,
+                                       upper=get_c(loc, scale))
+
+            # The other constraints should be automatically satisfied
+            # from the analytical expressions of the parameters.
+            # If fc or fscale are respectively less than one or less than 0,
+            # a FitError is triggered before exiting the method.
+
+            if fb is None:
+                std_data = (data - loc)/scale
+                logm = log_mean(std_data)
+                logc = np.log(c)
+                # Condition for a positive root to exist.
+                if not (2*logm < logc):
+                    return fallback(data, *args, **kwds)
+
+                lbrack = 1/logm + 1/(logm - logc)
+                rbrack = np.nextafter(1/logm, 0)
+                try:
+                    res = root_scalar(dL_dB, (logc, logm),
+                                      bracket=(lbrack, rbrack))
+                    # we should then never get there
+                    if not res.converged:
+                        return fallback(data, *args, **kwds)
+                    b = res.root
+                except ValueError:
+                    b = rbrack
+            else:
+                b = fb
+
+        # The distribution requires that `scale+loc <= data <= c*scale+loc`.
+        # To avoid numerical issues, some tuning may be necessary.
+        # We adjust `scale` to satisfy the lower bound, and we adjust
+        # `c` to satisfy the upper bound.
+        if not (scale+loc) < mn:
+            if fscale:
+                loc = np.nextafter(loc, -np.inf)
+            else:
+                scale = get_scale(loc)
+                scale = np.nextafter(scale, 0)
+        if not (c*scale+loc) > mx:
+            c = get_c(loc, scale)
+            c = np.nextafter(c, np.inf)
+
+        if not (np.all(self._argcheck(b, c)) and (scale > 0)):
+            return fallback(data, *args, **kwds)
+
+        params_override = b, c, loc, scale
+        if floc is None and fscale is None:
+            # Based on testing in gh-16782, the following methods are only
+            # reliable if either `floc` or `fscale` are provided. They are
+            # fast, though, so might as well see if they are better than the
+            # generic method.
+            params_super = fallback(data, *args, **kwds)
+            nllf_override = self.nnlf(params_override, data)
+            nllf_super = self.nnlf(params_super, data)
+            if nllf_super < nllf_override:
+                return params_super
+
+        return params_override
+
+
+truncpareto = truncpareto_gen(a=1.0, name='truncpareto')
+truncpareto._support = (1.0, 'c')
+
+
+class tukeylambda_gen(rv_continuous):
+    r"""A Tukey-Lambda continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    A flexible distribution, able to represent and interpolate between the
+    following distributions:
+
+    - Cauchy                (:math:`\lambda = -1`)
+    - logistic              (:math:`\lambda = 0`)
+    - approx Normal         (:math:`\lambda = 0.14`)
+    - uniform from -1 to 1  (:math:`\lambda = 1`)
+
+    `tukeylambda` takes a real number :math:`\lambda` (denoted ``lam``
+    in the implementation) as a shape parameter.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    _support_mask = rv_continuous._open_support_mask
+
+    def _argcheck(self, lam):
+        return np.isfinite(lam)
+
+    def _shape_info(self):
+        return [_ShapeInfo("lam", False, (-np.inf, np.inf), (False, False))]
+
+    def _get_support(self, lam):
+        b = xpx.apply_where(lam > 0, lam,
+                            lambda lam: 1/lam,
+                            fill_value=np.inf)
+        return -b, b
+
+    def _pdf(self, x, lam):
+        Fx = np.asarray(sc.tklmbda(x, lam))
+        Px = Fx**(lam-1.0) + (np.asarray(1-Fx))**(lam-1.0)
+        with np.errstate(divide='ignore'):
+            Px = 1.0/np.asarray(Px)
+            return np.where((lam <= 0) | (abs(x) < 1.0/np.asarray(lam)), Px, 0.0)
+
+    def _cdf(self, x, lam):
+        return sc.tklmbda(x, lam)
+
+    def _ppf(self, q, lam):
+        return sc.boxcox(q, lam) - sc.boxcox1p(-q, lam)
+
+    def _stats(self, lam):
+        return 0, _tlvar(lam), 0, _tlkurt(lam)
+
+    def _entropy(self, lam):
+        def integ(p):
+            return np.log(pow(p, lam-1)+pow(1-p, lam-1))
+        return integrate.quad(integ, 0, 1)[0]
+
+
+tukeylambda = tukeylambda_gen(name='tukeylambda')
+
+
+class FitUniformFixedScaleDataError(FitDataError):
+    def __init__(self, ptp, fscale):  # ptp: np.ptp(data); fscale: fixed scale
+        self.args = (  # 1-tuple; a bare str would be split into characters
+            "Invalid values in `data`.  Maximum likelihood estimation with "
+            "the uniform distribution and fixed scale requires that "
+            f"np.ptp(data) <= fscale, but np.ptp(data) = {ptp} and "
+            f"fscale = {fscale}.",
+        )
+
+
+class uniform_gen(rv_continuous):
+    r"""A uniform continuous random variable.
+
+    In the standard form, the distribution is uniform on ``[0, 1]``. Using
+    the parameters ``loc`` and ``scale``, one obtains the uniform distribution
+    on ``[loc, loc + scale]``.
+
+    %(before_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return []
+
+    def _rvs(self, size=None, random_state=None):
+        return random_state.uniform(0.0, 1.0, size)
+
+    def _pdf(self, x):
+        return 1.0*(x == x)
+
+    def _cdf(self, x):
+        return x
+
+    def _ppf(self, q):
+        return q
+
+    def _stats(self):
+        return 0.5, 1.0/12, 0, -1.2
+
+    def _entropy(self):
+        return 0.0
+
+    @_call_super_mom
+    def fit(self, data, *args, **kwds):
+        """
+        Maximum likelihood estimate for the location and scale parameters.
+
+        `uniform.fit` uses only the following parameters.  Because exact
+        formulas are used, the parameters related to optimization that are
+        available in the `fit` method of other distributions are ignored
+        here.  The only positional argument accepted is `data`.
+
+        Parameters
+        ----------
+        data : array_like
+            Data to use in calculating the maximum likelihood estimate.
+        floc : float, optional
+            Hold the location parameter fixed to the specified value.
+        fscale : float, optional
+            Hold the scale parameter fixed to the specified value.
+
+        Returns
+        -------
+        loc, scale : float
+            Maximum likelihood estimates for the location and scale.
+
+        Notes
+        -----
+        An error is raised if `floc` is given and any values in `data` are
+        less than `floc`, or if `fscale` is given and `fscale` is less
+        than ``data.max() - data.min()``.  An error is also raised if both
+        `floc` and `fscale` are given.
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> from scipy.stats import uniform
+
+        We'll fit the uniform distribution to `x`:
+
+        >>> x = np.array([2, 2.5, 3.1, 9.5, 13.0])
+
+        For a uniform distribution MLE, the location is the minimum of the
+        data, and the scale is the maximum minus the minimum.
+
+        >>> loc, scale = uniform.fit(x)
+        >>> loc
+        2.0
+        >>> scale
+        11.0
+
+        If we know the data comes from a uniform distribution where the support
+        starts at 0, we can use ``floc=0``:
+
+        >>> loc, scale = uniform.fit(x, floc=0)
+        >>> loc
+        0.0
+        >>> scale
+        13.0
+
+        Alternatively, if we know the length of the support is 12, we can use
+        ``fscale=12``:
+
+        >>> loc, scale = uniform.fit(x, fscale=12)
+        >>> loc
+        1.5
+        >>> scale
+        12.0
+
+        In that last example, the support interval is [1.5, 13.5].  This
+        solution is not unique.  For example, the distribution with ``loc=2``
+        and ``scale=12`` has the same likelihood as the one above.  When
+        `fscale` is given and it is larger than ``data.max() - data.min()``,
+        the parameters returned by the `fit` method center the support over
+        the interval ``[data.min(), data.max()]``.
+
+        """
+        if len(args) > 0:
+            raise TypeError("Too many arguments.")
+
+        floc = kwds.pop('floc', None)
+        fscale = kwds.pop('fscale', None)
+
+        _remove_optimizer_parameters(kwds)
+
+        if floc is not None and fscale is not None:
+            # This check is for consistency with `rv_continuous.fit`.
+            raise ValueError("All parameters fixed. There is nothing to "
+                             "optimize.")
+
+        data = np.asarray(data)
+
+        if not np.isfinite(data).all():
+            raise ValueError("The data contains non-finite values.")
+
+        # MLE for the uniform distribution
+        # --------------------------------
+        # The PDF is
+        #
+        #     f(x, loc, scale) = {1/scale  for loc <= x <= loc + scale
+        #                        {0        otherwise}
+        #
+        # The likelihood function is
+        #     L(x, loc, scale) = (1/scale)**n
+        # where n is len(x), assuming loc <= x <= loc + scale for all x.
+        # The log-likelihood is
+        #     l(x, loc, scale) = -n*log(scale)
+        # The log-likelihood is maximized by making scale as small as possible,
+        # while keeping loc <= x <= loc + scale.   So if neither loc nor scale
+        # are fixed, the log-likelihood is maximized by choosing
+        #     loc = x.min()
+        #     scale = np.ptp(x)
+        # If loc is fixed, it must be less than or equal to x.min(), and then
+        # the scale is
+        #     scale = x.max() - loc
+        # If scale is fixed, it must not be less than np.ptp(x).  If scale is
+        # greater than np.ptp(x), the solution is not unique.  Note that the
+        # likelihood does not depend on loc, except for the requirement that
+        # loc <= x <= loc + scale.  All choices of loc for which
+        #     x.max() - scale <= loc <= x.min()
+        # have the same log-likelihood.  In this case, we choose loc such that
+        # the support is centered over the interval [data.min(), data.max()]:
+        #     loc = x.min() - 0.5*(scale - np.ptp(x))
+
+        if fscale is None:
+            # scale is not fixed.
+            if floc is None:
+                # loc is not fixed, scale is not fixed.
+                loc = data.min()
+                scale = np.ptp(data)
+            else:
+                # loc is fixed, scale is not fixed.
+                loc = floc
+                scale = data.max() - loc
+                if data.min() < loc:
+                    raise FitDataError("uniform", lower=loc, upper=loc + scale)
+        else:
+            # loc is not fixed, scale is fixed.
+            ptp = np.ptp(data)
+            if ptp > fscale:
+                raise FitUniformFixedScaleDataError(ptp=ptp, fscale=fscale)
+            # If ptp < fscale, the ML estimate is not unique; see the comments
+            # above.  We choose the distribution for which the support is
+            # centered over the interval [data.min(), data.max()].
+            loc = data.min() - 0.5*(fscale - ptp)
+            scale = fscale
+
+        # We expect the return values to be floating point, so ensure it
+        # by explicitly converting to float.
+        return float(loc), float(scale)
+
+
+uniform = uniform_gen(a=0.0, b=1.0, name='uniform')
+
+
+class vonmises_gen(rv_continuous):
+    r"""A Von Mises continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    scipy.stats.vonmises_fisher : Von-Mises Fisher distribution on a
+                                  hypersphere
+
+    Notes
+    -----
+    The probability density function for `vonmises` and `vonmises_line` is:
+
+    .. math::
+
+        f(x, \kappa) = \frac{ \exp(\kappa \cos(x)) }{ 2 \pi I_0(\kappa) }
+
+    for :math:`-\pi \le x \le \pi`, :math:`\kappa \ge 0`. :math:`I_0` is the
+    modified Bessel function of order zero (`scipy.special.i0`).
+
+    `vonmises` is a circular distribution which does not restrict the
+    distribution to a fixed interval. Currently, there is no circular
+    distribution framework in SciPy. The ``cdf`` is implemented such that
+    ``cdf(x + 2*np.pi) == cdf(x) + 1``.
+
+    `vonmises_line` is the same distribution, defined on :math:`[-\pi, \pi]`
+    on the real line. This is a regular (i.e. non-circular) distribution.
+
+    Note about distribution parameters: `vonmises` and `vonmises_line` take
+    ``kappa`` as a shape parameter (concentration) and ``loc`` as the location
+    (circular mean). A ``scale`` parameter is accepted but does not have any
+    effect.
+
+    Examples
+    --------
+    Import the necessary modules.
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.stats import vonmises
+
+    Define distribution parameters.
+
+    >>> loc = 0.5 * np.pi  # circular mean
+    >>> kappa = 1  # concentration
+
+    Compute the probability density at ``x=0`` via the ``pdf`` method.
+
+    >>> vonmises.pdf(0, loc=loc, kappa=kappa)
+    0.12570826359722018
+
+    Verify that the percentile function ``ppf`` inverts the cumulative
+    distribution function ``cdf`` up to floating point accuracy.
+
+    >>> x = 1
+    >>> cdf_value = vonmises.cdf(x, loc=loc, kappa=kappa)
+    >>> ppf_value = vonmises.ppf(cdf_value, loc=loc, kappa=kappa)
+    >>> x, cdf_value, ppf_value
+    (1, 0.31489339900904967, 1.0000000000000004)
+
+    Draw 1000 random variates by calling the ``rvs`` method.
+
+    >>> sample_size = 1000
+    >>> sample = vonmises(loc=loc, kappa=kappa).rvs(sample_size)
+
+    Plot the von Mises density on a Cartesian and polar grid to emphasize
+    that it is a circular distribution.
+
+    >>> fig = plt.figure(figsize=(12, 6))
+    >>> left = plt.subplot(121)
+    >>> right = plt.subplot(122, projection='polar')
+    >>> x = np.linspace(-np.pi, np.pi, 500)
+    >>> vonmises_pdf = vonmises.pdf(x, loc=loc, kappa=kappa)
+    >>> ticks = [0, 0.15, 0.3]
+
+    The left image contains the Cartesian plot.
+
+    >>> left.plot(x, vonmises_pdf)
+    >>> left.set_yticks(ticks)
+    >>> number_of_bins = int(np.sqrt(sample_size))
+    >>> left.hist(sample, density=True, bins=number_of_bins)
+    >>> left.set_title("Cartesian plot")
+    >>> left.set_xlim(-np.pi, np.pi)
+    >>> left.grid(True)
+
+    The right image contains the polar plot.
+
+    >>> right.plot(x, vonmises_pdf, label="PDF")
+    >>> right.set_yticks(ticks)
+    >>> right.hist(sample, density=True, bins=number_of_bins,
+    ...            label="Histogram")
+    >>> right.set_title("Polar plot")
+    >>> right.legend(bbox_to_anchor=(0.15, 1.06))
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("kappa", False, (0, np.inf), (True, False))]
+
+    def _argcheck(self, kappa):
+        return kappa >= 0
+
+    def _rvs(self, kappa, size=None, random_state=None):
+        return random_state.vonmises(0.0, kappa, size=size)
+
+    @inherit_docstring_from(rv_continuous)
+    def rvs(self, *args, **kwds):
+        rvs = super().rvs(*args, **kwds)
+        return np.mod(rvs + np.pi, 2*np.pi) - np.pi
+
+    def _pdf(self, x, kappa):
+        # vonmises.pdf(x, kappa) = exp(kappa * cos(x)) / (2*pi*I[0](kappa))
+        #                        = exp(kappa * (cos(x) - 1)) /
+        #                          (2*pi*exp(-kappa)*I[0](kappa))
+        #                        = exp(kappa * cosm1(x)) / (2*pi*i0e(kappa))
+        return np.exp(kappa*sc.cosm1(x)) / (2*np.pi*sc.i0e(kappa))
+
+    def _logpdf(self, x, kappa):
+        # vonmises.pdf(x, kappa) = exp(kappa * cosm1(x)) / (2*pi*i0e(kappa))
+        return kappa * sc.cosm1(x) - np.log(2*np.pi) - np.log(sc.i0e(kappa))
+
+    def _cdf(self, x, kappa):
+        return _stats.von_mises_cdf(kappa, x)
+
+    def _stats_skip(self, kappa):
+        return 0, None, 0, None
+
+    def _entropy(self, kappa):
+        # vonmises.entropy(kappa) = -kappa * I[1](kappa) / I[0](kappa) +
+        #                           log(2 * np.pi * I[0](kappa))
+        #                         = -kappa * I[1](kappa) * exp(-kappa) /
+        #                           (I[0](kappa) * exp(-kappa)) +
+        #                           log(2 * np.pi *
+        #                           I[0](kappa) * exp(-kappa) / exp(-kappa))
+        #                         = -kappa * sc.i1e(kappa) / sc.i0e(kappa) +
+        #                           log(2 * np.pi * i0e(kappa)) + kappa
+        return (-kappa * sc.i1e(kappa) / sc.i0e(kappa) +
+                np.log(2 * np.pi * sc.i0e(kappa)) + kappa)
+
+    @extend_notes_in_docstring(rv_continuous, notes="""\
+        The default limits of integration are endpoints of the interval
+        of width ``2*pi`` centered at `loc` (e.g. ``[-pi, pi]`` when
+        ``loc=0``).\n\n""")
+    def expect(self, func=None, args=(), loc=0, scale=1, lb=None, ub=None,
+               conditional=False, **kwds):
+        _a, _b = -np.pi, np.pi
+
+        if lb is None:
+            lb = loc + _a
+        if ub is None:
+            ub = loc + _b
+
+        return super().expect(func, args, loc,
+                              scale, lb, ub, conditional, **kwds)
+
+    @_call_super_mom
+    @extend_notes_in_docstring(rv_continuous, notes="""\
+        Fit data is assumed to represent angles and will be wrapped onto the
+        unit circle. `f0` and `fscale` are ignored; the returned shape is
+        always the maximum likelihood estimate and the scale is always
+        1. Initial guesses are ignored.\n\n""")
+    def fit(self, data, *args, **kwds):
+        if kwds.pop('superfit', False):
+            return super().fit(data, *args, **kwds)
+
+        data, fshape, floc, fscale = _check_fit_input_parameters(self, data,
+                                                                 args, kwds)
+        if self.a == -np.pi:
+            # vonmises line case, here the default fit method will be used
+            return super().fit(data, *args, **kwds)
+
+        # wrap data to interval [0, 2*pi)
+        data = np.mod(data, 2 * np.pi)
+
+        def find_mu(data):
+            return stats.circmean(data)
+
+        def find_kappa(data, loc):
+            # Usually, sources list the following as the equation to solve for
+            # the MLE of the shape parameter:
+            # r = I[1](kappa)/I[0](kappa), where r = mean resultant length
+            # This is valid when the location is the MLE of location.
+            # More generally, when the location may be fixed at an arbitrary
+            # value, r should be defined as follows:
+            r = np.sum(np.cos(loc - data))/len(data)
+            # See gh-18128 for more information.
+
+            # The function r[0](kappa) := I[1](kappa)/I[0](kappa) is monotonic
+            # increasing from r[0](0) = 0 to r[0](+inf) = 1.  The partial
+            # derivative of the log likelihood function with respect to kappa
+            # is monotonic decreasing in kappa.
+            if r == 1:
+                # All observations are (almost) equal to the mean.  Return
+                # some large kappa such that r[0](kappa) = 1.0 numerically.
+                return 1e16
+            elif r > 0:
+                def solve_for_kappa(kappa):
+                    return sc.i1e(kappa)/sc.i0e(kappa) - r
+
+                # The bounds of the root of r[0](kappa) = r are derived from
+                # selected bounds of r[0](x) given in [1, Eq. 11 & 16].  See
+                # gh-20102 for details.
+                #
+                # [1] Amos, D. E. (1973).  Computation of Modified Bessel
+                #     Functions and Their Ratios.  Mathematics of Computation,
+                #     28(125): 239-251.
+                lower_bound = r/(1-r)/(1+r)
+                upper_bound = 2*lower_bound
+
+                # The bounds are violated numerically for certain values of r,
+                # where solve_for_kappa evaluated at the bounds have the same
+                # sign.  This indicates numerical imprecision of i1e()/i0e().
+                # Return the violated bound in this case as it's more accurate.
+                if solve_for_kappa(lower_bound) >= 0:
+                    return lower_bound
+                elif solve_for_kappa(upper_bound) <= 0:
+                    return upper_bound
+                else:
+                    root_res = root_scalar(solve_for_kappa, method="brentq",
+                                           bracket=(lower_bound, upper_bound))
+                    return root_res.root
+            else:
+                # if the provided floc is very far from the circular mean,
+                # the mean resultant length r can become negative.
+                # In that case, the equation
+                # I[1](kappa)/I[0](kappa) = r does not have a solution.
+                # The maximum likelihood kappa is then 0 which practically
+                # results in the uniform distribution on the circle. As
+                # vonmises is defined for kappa > 0, return instead the
+                # smallest positive normal floating point value.
+                # See gh-18190 for more information
+                return np.finfo(float).tiny
+
+        # location likelihood equation has a solution independent of kappa
+        loc = floc if floc is not None else find_mu(data)
+        # shape likelihood equation depends on location
+        shape = fshape if fshape is not None else find_kappa(data, loc)
+
+        loc = np.mod(loc + np.pi, 2 * np.pi) - np.pi  # ensure in [-pi, pi]
+        return shape, loc, 1  # scale is not handled
+
+
+vonmises = vonmises_gen(name='vonmises')
+vonmises_line = vonmises_gen(a=-np.pi, b=np.pi, name='vonmises_line')
+
+
+class wald_gen(invgauss_gen):
+    r"""A Wald continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `wald` is:
+
+    .. math::
+
+        f(x) = \frac{1}{\sqrt{2\pi x^3}} \exp(- \frac{ (x-1)^2 }{ 2x })
+
+    for :math:`x >= 0`.
+
+    `wald` is a special case of `invgauss` with ``mu=1``.
+
+    %(after_notes)s
+
+    %(example)s
+    """
+    _support_mask = rv_continuous._open_support_mask
+
+    def _shape_info(self):
+        return []
+
+    def _rvs(self, size=None, random_state=None):
+        return random_state.wald(1.0, 1.0, size=size)
+
+    def _pdf(self, x):
+        # wald.pdf(x) = 1/sqrt(2*pi*x**3) * exp(-(x-1)**2/(2*x))
+        return invgauss._pdf(x, 1.0)
+
+    def _cdf(self, x):
+        return invgauss._cdf(x, 1.0)
+
+    def _sf(self, x):
+        return invgauss._sf(x, 1.0)
+
+    def _ppf(self, x):
+        return invgauss._ppf(x, 1.0)
+
+    def _isf(self, x):
+        return invgauss._isf(x, 1.0)
+
+    def _logpdf(self, x):
+        return invgauss._logpdf(x, 1.0)
+
+    def _logcdf(self, x):
+        return invgauss._logcdf(x, 1.0)
+
+    def _logsf(self, x):
+        return invgauss._logsf(x, 1.0)
+
+    def _stats(self):
+        return 1.0, 1.0, 3.0, 15.0
+
+    def _entropy(self):
+        return invgauss._entropy(1.0)
+
+
+wald = wald_gen(a=0.0, name="wald")
+
+
+class wrapcauchy_gen(rv_continuous):
+    r"""A wrapped Cauchy continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `wrapcauchy` is:
+
+    .. math::
+
+        f(x, c) = \frac{1-c^2}{2\pi (1+c^2 - 2c \cos(x))}
+
+    for :math:`0 \le x \le 2\pi`, :math:`0 < c < 1`.
+
+    `wrapcauchy` takes ``c`` as a shape parameter for :math:`c`.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _argcheck(self, c):
+        return (c > 0) & (c < 1)
+
+    def _shape_info(self):
+        return [_ShapeInfo("c", False, (0, 1), (False, False))]
+
+    def _pdf(self, x, c):
+        # wrapcauchy.pdf(x, c) = (1-c**2) / (2*pi*(1+c**2-2*c*cos(x)))
+        return (1.0-c*c)/(2*np.pi*(1+c*c-2*c*np.cos(x)))
+
+    def _cdf(self, x, c):
+
+        def f1(x, cr):
+            # CDF for 0 <= x < pi
+            return 1/np.pi * np.arctan(cr*np.tan(x/2))
+
+        def f2(x, cr):
+            # CDF for pi <= x <= 2*pi
+            return 1 - 1/np.pi * np.arctan(cr*np.tan((2*np.pi - x)/2))
+
+        cr = (1 + c)/(1 - c)
+        return xpx.apply_where(x < np.pi, (x, cr), f1, f2)
+
+    def _ppf(self, q, c):
+        val = (1.0-c)/(1.0+c)
+        rcq = 2*np.arctan(val*np.tan(np.pi*q))
+        rcmq = 2*np.pi-2*np.arctan(val*np.tan(np.pi*(1-q)))
+        return np.where(q < 1.0/2, rcq, rcmq)
+
+    def _entropy(self, c):
+        return np.log(2*np.pi*(1-c*c))
+
+    def _fitstart(self, data):
+        # Use 0.5 as the initial guess of the shape parameter.
+        # For the location and scale, use the minimum and
+        # peak-to-peak/(2*pi), respectively.
+        if isinstance(data, CensoredData):
+            data = data._uncensor()
+        return 0.5, np.min(data), np.ptp(data)/(2*np.pi)
+
+    @inherit_docstring_from(rv_continuous)
+    def rvs(self, *args, **kwds):
+        rvs = super().rvs(*args, **kwds)
+        return np.mod(rvs, 2*np.pi)
+
+wrapcauchy = wrapcauchy_gen(a=0.0, b=2*np.pi, name='wrapcauchy')
+
+
+class gennorm_gen(rv_continuous):
+    r"""A generalized normal continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    laplace : Laplace distribution
+    norm : normal distribution
+
+    Notes
+    -----
+    The probability density function for `gennorm` is [1]_:
+
+    .. math::
+
+        f(x, \beta) = \frac{\beta}{2 \Gamma(1/\beta)} \exp(-|x|^\beta),
+
+    where :math:`x` is a real number, :math:`\beta > 0` and
+    :math:`\Gamma` is the gamma function (`scipy.special.gamma`).
+
+    `gennorm` takes ``beta`` as a shape parameter for :math:`\beta`.
+    For :math:`\beta = 1`, it is identical to a Laplace distribution.
+    For :math:`\beta = 2`, it is identical to a normal distribution
+    (with ``scale=1/sqrt(2)``).
+
+    References
+    ----------
+
+    .. [1] "Generalized normal distribution, Version 1",
+           https://en.wikipedia.org/wiki/Generalized_normal_distribution#Version_1
+
+    .. [2] Nardon, Martina, and Paolo Pianca. "Simulation techniques for
+           generalized Gaussian densities." Journal of Statistical
+           Computation and Simulation 79.11 (2009): 1317-1329
+
+    .. [3] Wicklin, Rick. "Simulate data from a generalized Gaussian
+           distribution" in The DO Loop blog, September 21, 2016,
+           https://blogs.sas.com/content/iml/2016/09/21/simulate-generalized-gaussian-sas.html
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("beta", False, (0, np.inf), (False, False))]
+
+    def _pdf(self, x, beta):
+        return np.exp(self._logpdf(x, beta))
+
+    def _logpdf(self, x, beta):
+        return np.log(0.5*beta) - sc.gammaln(1.0/beta) - abs(x)**beta
+
+    def _cdf(self, x, beta):
+        c = 0.5 * np.sign(x)
+        # evaluating (.5 + c) first prevents numerical cancellation
+        return (0.5 + c) - c * sc.gammaincc(1.0/beta, abs(x)**beta)
+
+    def _ppf(self, x, beta):
+        c = np.sign(x - 0.5)
+        # evaluating (1. + c) first prevents numerical cancellation
+        return c * sc.gammainccinv(1.0/beta, (1.0 + c) - 2.0*c*x)**(1.0/beta)
+
+    def _sf(self, x, beta):
+        return self._cdf(-x, beta)
+
+    def _isf(self, x, beta):
+        return -self._ppf(x, beta)
+
+    def _munp(self, n, beta):
+        if n == 0:
+            return 1.
+        if n % 2 == 0:
+            c1, cn = sc.gammaln([1.0/beta, (n + 1.0)/beta])
+            return np.exp(cn - c1)
+        else:
+            return 0.
+
+    def _stats(self, beta):
+        c1, c3, c5 = sc.gammaln([1.0/beta, 3.0/beta, 5.0/beta])
+        return 0., np.exp(c3 - c1), 0., np.exp(c5 + c1 - 2.0*c3) - 3.
+
+    def _entropy(self, beta):
+        return 1. / beta - np.log(.5 * beta) + sc.gammaln(1. / beta)
+
+    def _rvs(self, beta, size=None, random_state=None):
+        # see [2]_ for the algorithm
+        # see [3]_ for reference implementation in SAS
+        z = random_state.gamma(1/beta, size=size)
+        y = z ** (1/beta)
+        # convert y to array to ensure masking support
+        y = np.asarray(y)
+        mask = random_state.random(size=y.shape) < 0.5
+        y[mask] = -y[mask]
+        return y
+
+
+gennorm = gennorm_gen(name='gennorm')
+
+
+class halfgennorm_gen(rv_continuous):
+    r"""The upper half of a generalized normal continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    gennorm : generalized normal distribution
+    expon : exponential distribution
+    halfnorm : half normal distribution
+
+    Notes
+    -----
+    The probability density function for `halfgennorm` is:
+
+    .. math::
+
+        f(x, \beta) = \frac{\beta}{\Gamma(1/\beta)} \exp(-|x|^\beta)
+
+    for :math:`x, \beta > 0`. :math:`\Gamma` is the gamma function
+    (`scipy.special.gamma`).
+
+    `halfgennorm` takes ``beta`` as a shape parameter for :math:`\beta`.
+    For :math:`\beta = 1`, it is identical to an exponential distribution.
+    For :math:`\beta = 2`, it is identical to a half normal distribution
+    (with ``scale=1/sqrt(2)``).
+
+    References
+    ----------
+
+    .. [1] "Generalized normal distribution, Version 1",
+           https://en.wikipedia.org/wiki/Generalized_normal_distribution#Version_1
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        # Single shape parameter ``beta`` with domain bounds (0, inf).
+        beta_info = _ShapeInfo("beta", False, (0, np.inf), (False, False))
+        return [beta_info]
+
+    def _pdf(self, x, beta):
+        # pdf(x, beta) = beta / gamma(1/beta) * exp(-x**beta), obtained by
+        # exponentiating the log-density.
+        log_density = self._logpdf(x, beta)
+        return np.exp(log_density)
+
+    def _logpdf(self, x, beta):
+        # x-independent part: log(beta) - log(Gamma(1/beta)).
+        log_norm = np.log(beta) - sc.gammaln(1.0/beta)
+        return log_norm - x**beta
+
+    def _cdf(self, x, beta):
+        # Regularized lower incomplete gamma function at x**beta.
+        inv_beta = 1.0/beta
+        return sc.gammainc(inv_beta, x**beta)
+
+    def _ppf(self, x, beta):
+        # Invert `_cdf`: undo the incomplete gamma, then the power.
+        inv_beta = 1.0/beta
+        return sc.gammaincinv(inv_beta, x)**inv_beta
+
+    def _sf(self, x, beta):
+        # Regularized upper incomplete gamma function at x**beta.
+        inv_beta = 1.0/beta
+        return sc.gammaincc(inv_beta, x**beta)
+
+    def _isf(self, x, beta):
+        # Invert `_sf`: undo the incomplete gamma, then the power.
+        inv_beta = 1.0/beta
+        return sc.gammainccinv(inv_beta, x)**inv_beta
+
+    def _entropy(self, beta):
+        # Closed form: 1/beta - log(beta) + log(Gamma(1/beta)).
+        inv_beta = 1.0/beta
+        return inv_beta - np.log(beta) + sc.gammaln(inv_beta)
+
+
+# Module-level instance; ``a=0`` sets the lower end of the support.
+halfgennorm = halfgennorm_gen(a=0, name='halfgennorm')
+
+
+class crystalball_gen(rv_continuous):
+    r"""
+    Crystalball distribution
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `crystalball` is:
+
+    .. math::
+
+        f(x, \beta, m) =  \begin{cases}
+                            N \exp(-x^2 / 2),  &\text{for } x > -\beta\\
+                            N A (B - x)^{-m}  &\text{for } x \le -\beta
+                          \end{cases}
+
+    where :math:`A = (m / |\beta|)^m  \exp(-\beta^2 / 2)`,
+    :math:`B = m/|\beta| - |\beta|` and :math:`N` is a normalisation constant.
+
+    `crystalball` takes :math:`\beta > 0` and :math:`m > 1` as shape
+    parameters.  :math:`\beta` defines the point where the pdf changes
+    from a power-law to a Gaussian distribution.  :math:`m` is the power
+    of the power-law tail.
+
+    %(after_notes)s
+
+    .. versionadded:: 0.19.0
+
+    References
+    ----------
+    .. [1] "Crystal Ball Function",
+           https://en.wikipedia.org/wiki/Crystal_Ball_function
+
+    %(example)s
+    """
+    def _argcheck(self, beta, m):
+        """
+        Shape parameter bounds are m > 1 and beta > 0.
+        """
+        return (m > 1) & (beta > 0)
+
+    def _shape_info(self):
+        ibeta = _ShapeInfo("beta", False, (0, np.inf), (False, False))
+        im = _ShapeInfo("m", False, (1, np.inf), (False, False))
+        return [ibeta, im]
+
+    def _fitstart(self, data):
+        # Arbitrary, but the default m=1 is not valid
+        return super()._fitstart(data, args=(1, 1.5))
+
+    def _pdf(self, x, beta, m):
+        """
+        Return PDF of the crystalball function.
+
+                                            --
+                                           | exp(-x**2 / 2),  for x > -beta
+        crystalball.pdf(x, beta, m) =  N * |
+                                           | A * (B - x)**(-m), for x <= -beta
+                                            --
+        """
+        # N is the normalisation constant multiplying both branches.
+        N = 1.0 / (m/beta / (m-1) * np.exp(-beta**2 / 2.0) +
+                   _norm_pdf_C * _norm_cdf(beta))
+
+        # Gaussian branch, used where x > -beta.
+        def rhs(x, beta, m):
+            return np.exp(-x**2 / 2)
+
+        # Power-law branch, used where x <= -beta.
+        def lhs(x, beta, m):
+            return ((m/beta)**m * np.exp(-beta**2 / 2.0) *
+                    (m/beta - beta - x)**(-m))
+
+        return N * xpx.apply_where(x > -beta, (x, beta, m), rhs, lhs)
+
+    def _logpdf(self, x, beta, m):
+        """
+        Return the log of the PDF of the crystalball function.
+        """
+        N = 1.0 / (m/beta / (m-1) * np.exp(-beta**2 / 2.0) +
+                   _norm_pdf_C * _norm_cdf(beta))
+
+        # Log of the Gaussian branch, used where x > -beta.
+        def rhs(x, beta, m):
+            return -x**2/2
+
+        # Log of the power-law branch, used where x <= -beta.
+        def lhs(x, beta, m):
+            return m*np.log(m/beta) - beta**2/2 - m*np.log(m/beta - beta - x)
+
+        return np.log(N) + xpx.apply_where(x > -beta, (x, beta, m), rhs, lhs)
+
+    def _cdf(self, x, beta, m):
+        """
+        Return CDF of the crystalball function
+        """
+        N = 1.0 / (m/beta / (m-1) * np.exp(-beta**2 / 2.0) +
+                   _norm_pdf_C * _norm_cdf(beta))
+
+        # For x > -beta: the full power-law tail mass plus the Gaussian
+        # contribution accumulated between -beta and x.
+        def rhs(x, beta, m):
+            return ((m/beta) * np.exp(-beta**2 / 2.0) / (m-1) +
+                    _norm_pdf_C * (_norm_cdf(x) - _norm_cdf(-beta)))
+
+        # For x <= -beta: closed-form integral of the power-law branch.
+        def lhs(x, beta, m):
+            return ((m/beta)**m * np.exp(-beta**2 / 2.0) *
+                    (m/beta - beta - x)**(-m+1) / (m-1))
+
+        return N * xpx.apply_where(x > -beta, (x, beta, m), rhs, lhs)
+
+    def _sf(self, x, beta, m):
+        """
+        Survival function of the crystalball distribution.
+        """
+
+        # For x > -beta the survival function is computed directly from the
+        # normal survival function rather than as 1 - cdf.
+        def rhs(x, beta, m):
+            # M is the same as 1/N used elsewhere.
+            M = m/beta/(m - 1)*np.exp(-beta**2/2) + _norm_pdf_C*_norm_cdf(beta)
+            return _norm_pdf_C*_norm_sf(x)/M
+
+        def lhs(x, beta, m):
+            # Default behavior is OK in the left tail of the SF.
+            return 1 - self._cdf(x, beta, m)
+
+        return xpx.apply_where(x > -beta, (x, beta, m), rhs, lhs)
+
+    def _ppf(self, p, beta, m):
+        N = 1.0 / (m/beta / (m-1) * np.exp(-beta**2 / 2.0) +
+                   _norm_pdf_C * _norm_cdf(beta))
+        # pbeta equals the CDF evaluated at the switch point x = -beta; it
+        # decides which branch has to be inverted for a given p.
+        pbeta = N * (m/beta) * np.exp(-beta**2/2) / (m - 1)
+
+        # Inverse of the power-law branch of the CDF (p < pbeta).
+        def ppf_less(p, beta, m):
+            eb2 = np.exp(-beta**2/2)
+            C = (m/beta) * eb2 / (m-1)
+            N = 1/(C + _norm_pdf_C * _norm_cdf(beta))
+            return (m/beta - beta -
+                    ((m - 1)*(m/beta)**(-m)/eb2*p/N)**(1/(1-m)))
+
+        # Inverse of the Gaussian branch of the CDF (p >= pbeta).
+        def ppf_greater(p, beta, m):
+            eb2 = np.exp(-beta**2/2)
+            C = (m/beta) * eb2 / (m-1)
+            N = 1/(C + _norm_pdf_C * _norm_cdf(beta))
+            return _norm_ppf(_norm_cdf(-beta) + (1/_norm_pdf_C)*(p/N - C))
+
+        return xpx.apply_where(p < pbeta, (p, beta, m), ppf_less, ppf_greater)
+
+    def _munp(self, n, beta, m):
+        """
+        Returns the n-th non-central moment of the crystalball function.
+        """
+        N = 1.0 / (m/beta / (m-1) * np.exp(-beta**2 / 2.0) +
+                   _norm_pdf_C * _norm_cdf(beta))
+
+        def n_th_moment(n, beta, m):
+            """
+            Returns n-th moment. Defined only if n+1 < m
+            Function cannot broadcast due to the loop over n
+            """
+            A = (m/beta)**m * np.exp(-beta**2 / 2.0)
+            B = m/beta - beta
+            # rhs: contribution of the Gaussian branch.
+            rhs = (2**((n-1)/2.0) * sc.gamma((n+1)/2) *
+                   (1.0 + (-1)**n * sc.gammainc((n+1)/2, beta**2 / 2)))
+            # lhs: contribution of the power-law branch, accumulated as a
+            # binomial sum over k = 0..n.
+            lhs = np.zeros(rhs.shape)
+            for k in range(int(n) + 1):
+                lhs += (sc.binom(n, k) * B**(n-k) * (-1)**k / (m - k - 1) *
+                        (m/beta)**(-m + k + 1))
+            return A * lhs + rhs
+
+        # n_th_moment is wrapped in np.vectorize because it cannot broadcast;
+        # entries where n + 1 >= m are filled with inf.
+        return N * xpx.apply_where(n + 1 < m, (n, beta, m),
+                                   np.vectorize(n_th_moment, otypes=[np.float64]),
+                                   fill_value=np.inf)
+
+
+crystalball = crystalball_gen(name='crystalball', longname="A Crystalball Function")
+
+
+def _argus_phi(chi):
+    """
+    Helper shared by the pdf, sf and moment calculations of the argus
+    distribution.
+
+    For every x > 0 the identity
+    gammainc(1.5, x**2/2) = 2 * (_norm_cdf(x) - x * _norm_pdf(x) - 0.5)
+    holds; it follows from writing the cdf of Gamma(1.5) as
+    erf(sqrt(x)) - 2*sqrt(x)*exp(-x)/sqrt(Pi).
+    The gammainc form is used instead of the usual definition because it is
+    more precise for small chi.
+    """
+    half_chi_sq = chi**2/2
+    return sc.gammainc(1.5, half_chi_sq) / 2
+
+
+class argus_gen(rv_continuous):
+    r"""
+    Argus distribution
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `argus` is:
+
+    .. math::
+
+        f(x, \chi) = \frac{\chi^3}{\sqrt{2\pi} \Psi(\chi)} x \sqrt{1-x^2}
+                     \exp(-\chi^2 (1 - x^2)/2)
+
+    for :math:`0 < x < 1` and :math:`\chi > 0`, where
+
+    .. math::
+
+        \Psi(\chi) = \Phi(\chi) - \chi \phi(\chi) - 1/2
+
+    with :math:`\Phi` and :math:`\phi` being the CDF and PDF of a standard
+    normal distribution, respectively.
+
+    `argus` takes :math:`\chi` as a shape parameter. Details about sampling
+    from the ARGUS distribution can be found in [2]_.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] "ARGUS distribution",
+           https://en.wikipedia.org/wiki/ARGUS_distribution
+    .. [2] Christoph Baumgarten "Random variate generation by fast numerical
+           inversion in the varying parameter case." Research in Statistics,
+           vol. 1, 2023. :doi:`10.1080/27684520.2023.2279060`
+
+    .. versionadded:: 0.19.0
+
+    %(example)s
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("chi", False, (0, np.inf), (False, False))]
+
+    def _logpdf(self, x, chi):
+        # for x = 0 or 1, logpdf returns -np.inf
+        with np.errstate(divide='ignore'):
+            y = 1.0 - x*x
+            # A collects the terms that do not depend on x.
+            A = 3*np.log(chi) - _norm_pdf_logC - np.log(_argus_phi(chi))
+            return A + np.log(x) + 0.5*np.log1p(-x*x) - chi**2 * y / 2
+
+    def _pdf(self, x, chi):
+        return np.exp(self._logpdf(x, chi))
+
+    def _cdf(self, x, chi):
+        return 1.0 - self._sf(x, chi)
+
+    def _sf(self, x, chi):
+        # (1 - x)*(1 + x) is 1 - x**2 written in factored form.
+        return _argus_phi(chi * np.sqrt((1 - x)*(1 + x))) / _argus_phi(chi)
+
+    def _rvs(self, chi, size=None, random_state=None):
+        chi = np.asarray(chi)
+        if chi.size == 1:
+            # A single chi value: one call produces all requested samples.
+            out = self._rvs_scalar(chi, numsamples=size,
+                                   random_state=random_state)
+        else:
+            # Several chi values: sample separately for each entry of chi and
+            # write the result into the matching slice of the output.
+            shp, bc = _check_shape(chi.shape, size)
+            numsamples = int(np.prod(shp))
+            out = np.empty(size)
+            it = np.nditer([chi],
+                           flags=['multi_index'],
+                           op_flags=[['readonly']])
+            while not it.finished:
+                idx = tuple((it.multi_index[j] if not bc[j] else slice(None))
+                            for j in range(-len(size), 0))
+                r = self._rvs_scalar(it[0], numsamples=numsamples,
+                                     random_state=random_state)
+                out[idx] = r.reshape(shp)
+                it.iternext()
+
+        if size == ():
+            # unwrap the 0-d result
+            out = out[()]
+        return out
+
+    def _rvs_scalar(self, chi, numsamples=None, random_state=None):
+        # if chi <= 1.8:
+        # use rejection method, see Devroye:
+        # Non-Uniform Random Variate Generation, 1986, section II.3.2.
+        # write: PDF f(x) = c * g(x) * h(x), where
+        # h is [0,1]-valued and g is a density
+        # we use two ways to write f
+        #
+        # Case 1:
+        # write g(x) = 3*x*sqrt(1-x**2), h(x) = exp(-chi**2 (1-x**2) / 2)
+        # If X has a distribution with density g its ppf G_inv is given by:
+        # G_inv(u) = np.sqrt(1 - u**(2/3))
+        #
+        # Case 2:
+        # g(x) = chi**2 * x * exp(-chi**2 * (1-x**2)/2) / (1 - exp(-chi**2 /2))
+        # h(x) = sqrt(1 - x**2), 0 <= x <= 1
+        # one can show that
+        # G_inv(u) = np.sqrt(2*np.log(u*(np.exp(chi**2/2)-1)+1))/chi
+        #          = np.sqrt(1 + 2*np.log(np.exp(-chi**2/2)*(1-u)+u)/chi**2)
+        # the latter expression is used for precision with small chi
+        #
+        # In both cases, the inverse cdf of g can be written analytically, and
+        # we can apply the rejection method:
+        #
+        # REPEAT
+        #    Generate U uniformly distributed on [0, 1]
+        #    Generate X with density g (e.g. via inverse transform sampling:
+        #    X = G_inv(V) with V uniformly distributed on [0, 1])
+        # UNTIL U <= h(X)
+        # RETURN X
+        #
+        # We use case 1 for chi <= 0.5 as it maintains precision for small chi
+        # and case 2 for 0.5 < chi <= 1.8 due to its speed for moderate chi.
+        #
+        # if chi > 1.8:
+        # use relation to the Gamma distribution: if X is ARGUS with parameter
+        # chi, then Y = chi**2 * (1 - X**2) / 2 has density proportional to
+        # sqrt(u) * exp(-u) on [0, chi**2 / 2], i.e. a Gamma(3/2) distribution
+        # conditioned on [0, chi**2 / 2]. Therefore, to sample X from the
+        # ARGUS distribution, we sample Y from the gamma distribution, keeping
+        # only samples on [0, chi**2 / 2], and apply the inverse
+        # transformation X = (1 - 2*Y/chi**2)**(1/2). Since we only
+        # look at chi > 1.8, gamma(1.5).cdf(chi**2/2) is large enough such
+        # that Y falls in the interval [0, chi**2 / 2] with a high
+        # probability: stats.gamma(1.5).cdf(1.8**2/2) = 0.644...
+        #
+        # The points to switch between the different methods are determined
+        # by a comparison of the runtime of the different methods. However,
+        # the runtime is platform-dependent. The implemented values should
+        # ensure a good overall performance and are supported by an analysis
+        # of the rejection constants of different methods.
+
+        size1d = tuple(np.atleast_1d(numsamples))
+        N = int(np.prod(size1d))
+        # x collects accepted samples; `simulated` counts how many slots are
+        # already filled. Each loop iteration draws only the missing number.
+        x = np.zeros(N)
+        simulated = 0
+        chi2 = chi * chi
+        if chi <= 0.5:
+            d = -chi2 / 2
+            while simulated < N:
+                k = N - simulated
+                u = random_state.uniform(size=k)
+                v = random_state.uniform(size=k)
+                z = v**(2/3)
+                # acceptance condition: u <= h(G_inv(v)). This simplifies to
+                accept = (np.log(u) <= d * z)
+                num_accept = np.sum(accept)
+                if num_accept > 0:
+                    # we still need to transform z=v**(2/3) to X = G_inv(v)
+                    rvs = np.sqrt(1 - z[accept])
+                    x[simulated:(simulated + num_accept)] = rvs
+                    simulated += num_accept
+        elif chi <= 1.8:
+            echi = np.exp(-chi2 / 2)
+            while simulated < N:
+                k = N - simulated
+                u = random_state.uniform(size=k)
+                v = random_state.uniform(size=k)
+                z = 2 * np.log(echi * (1 - v) + v) / chi2
+                # as in case one, simplify u <= h(G_inv(v)) and then transform
+                # z to the target distribution X = G_inv(v)
+                accept = (u**2 + z <= 0)
+                num_accept = np.sum(accept)
+                if num_accept > 0:
+                    rvs = np.sqrt(1 + z[accept])
+                    x[simulated:(simulated + num_accept)] = rvs
+                    simulated += num_accept
+        else:
+            # conditional Gamma for chi > 1.8
+            while simulated < N:
+                k = N - simulated
+                g = random_state.standard_gamma(1.5, size=k)
+                accept = (g <= chi2 / 2)
+                num_accept = np.sum(accept)
+                if num_accept > 0:
+                    x[simulated:(simulated + num_accept)] = g[accept]
+                    simulated += num_accept
+            # map the accepted gamma variates Y back to X
+            x = np.sqrt(1 - 2 * x / chi2)
+
+        return np.reshape(x, size1d)
+
+    def _stats(self, chi):
+        # need to ensure that dtype is float
+        # otherwise the mask below does not work for integers
+        chi = np.asarray(chi, dtype=float)
+        phi = _argus_phi(chi)
+        # m is the first moment
+        m = np.sqrt(np.pi/8) * chi * sc.ive(1, chi**2/4) / phi
+        # compute second moment, use Taylor expansion for small chi (<= 0.1)
+        mu2 = np.empty_like(chi)
+        mask = chi > 0.1
+        c = chi[mask]
+        mu2[mask] = 1 - 3 / c**2 + c * _norm_pdf(c) / phi[mask]
+        c = chi[~mask]
+        # polynomial coefficients, highest degree first (np.polyval order)
+        coef = [-358/65690625, 0, -94/1010625, 0, 2/2625, 0, 6/175, 0, 0.4]
+        mu2[~mask] = np.polyval(coef, c)
+        # the last two entries are returned as None
+        return m, mu2 - m**2, None, None
+
+
+argus = argus_gen(name='argus', longname="An Argus Function", a=0.0, b=1.0)
+
+
+class rv_histogram(rv_continuous):
+    """
+    Generates a distribution given by a histogram.
+    This is useful to generate a template distribution from a binned
+    datasample.
+
+    As a subclass of the `rv_continuous` class, `rv_histogram` inherits from it
+    a collection of generic methods (see `rv_continuous` for the full list),
+    and implements them based on the properties of the provided binned
+    datasample.
+
+    Parameters
+    ----------
+    histogram : tuple of array_like
+        Tuple containing two array_like objects.
+        The first containing the content of n bins,
+        the second containing the (n+1) bin boundaries.
+        In particular, the return value of `numpy.histogram` is accepted.
+
+    density : bool, optional
+        If False, assumes the histogram is proportional to counts per bin;
+        otherwise, assumes it is proportional to a density.
+        For constant bin widths, these are equivalent, but the distinction
+        is important when bin widths vary (see Notes).
+        If None (default), sets ``density=True`` for backwards compatibility,
+        but warns if the bin widths are variable. Set `density` explicitly
+        to silence the warning.
+
+        .. versionadded:: 1.10.0
+
+    Notes
+    -----
+    When a histogram has unequal bin widths, there is a distinction between
+    histograms that are proportional to counts per bin and histograms that are
+    proportional to probability density over a bin. If `numpy.histogram` is
+    called with its default ``density=False``, the resulting histogram is the
+    number of counts per bin, so ``density=False`` should be passed to
+    `rv_histogram`. If `numpy.histogram` is called with ``density=True``, the
+    resulting histogram is in terms of probability density, so ``density=True``
+    should be passed to `rv_histogram`. To avoid warnings, always pass
+    ``density`` explicitly when the input histogram has unequal bin widths.
+
+    There are no additional shape parameters except for the loc and scale.
+    The pdf is defined as a stepwise function from the provided histogram.
+    The cdf is a linear interpolation of the pdf.
+
+    .. versionadded:: 0.19.0
+
+    Examples
+    --------
+
+    Create a scipy.stats distribution from a numpy histogram
+
+    >>> import scipy.stats
+    >>> import numpy as np
+    >>> data = scipy.stats.norm.rvs(size=100000, loc=0, scale=1.5,
+    ...                             random_state=123)
+    >>> hist = np.histogram(data, bins=100)
+    >>> hist_dist = scipy.stats.rv_histogram(hist, density=False)
+
+    Behaves like an ordinary scipy rv_continuous distribution
+
+    >>> hist_dist.pdf(1.0)
+    0.20538577847618705
+    >>> hist_dist.cdf(2.0)
+    0.90818568543056499
+
+    PDF is zero above (below) the highest (lowest) bin of the histogram,
+    defined by the max (min) of the original dataset
+
+    >>> hist_dist.pdf(np.max(data))
+    0.0
+    >>> hist_dist.cdf(np.max(data))
+    1.0
+    >>> hist_dist.pdf(np.min(data))
+    7.7591907244498314e-05
+    >>> hist_dist.cdf(np.min(data))
+    0.0
+
+    PDF and CDF follow the histogram
+
+    >>> import matplotlib.pyplot as plt
+    >>> X = np.linspace(-5.0, 5.0, 100)
+    >>> fig, ax = plt.subplots()
+    >>> ax.set_title("PDF from Template")
+    >>> ax.hist(data, density=True, bins=100)
+    >>> ax.plot(X, hist_dist.pdf(X), label='PDF')
+    >>> ax.plot(X, hist_dist.cdf(X), label='CDF')
+    >>> ax.legend()
+    >>> fig.show()
+
+    """
+    _support_mask = rv_continuous._support_mask
+
+    def __init__(self, histogram, *args, density=None, **kwargs):
+        """
+        Create a new distribution using the given histogram
+
+        Parameters
+        ----------
+        histogram : tuple of array_like
+            Tuple containing two array_like objects.
+            The first containing the content of n bins,
+            the second containing the (n+1) bin boundaries.
+            In particular, the return value of np.histogram is accepted.
+        density : bool, optional
+            If False, assumes the histogram is proportional to counts per bin;
+            otherwise, assumes it is proportional to a density.
+            For constant bin widths, these are equivalent.
+            If None (default), sets ``density=True`` for backward
+            compatibility, but warns if the bin widths are variable. Set
+            `density` explicitly to silence the warning.
+
+        Raises
+        ------
+        ValueError
+            If `histogram` does not have length 2, or if the number of bin
+            contents and bin boundaries are not n and n+1.
+        """
+        # Keep the raw constructor arguments so `_updated_ctor_param` can
+        # hand them back unchanged.
+        self._histogram = histogram
+        self._density = density
+        if len(histogram) != 2:
+            raise ValueError("Expected length 2 for parameter histogram")
+        self._hpdf = np.asarray(histogram[0])
+        self._hbins = np.asarray(histogram[1])
+        if len(self._hpdf) + 1 != len(self._hbins):
+            raise ValueError("Number of elements in histogram content "
+                             "and histogram boundaries do not match, "
+                             "expected n and n+1.")
+        self._hbin_widths = self._hbins[1:] - self._hbins[:-1]
+        bins_vary = not np.allclose(self._hbin_widths, self._hbin_widths[0])
+        if density is None and bins_vary:
+            # The two literals are concatenated, so the first one must end
+            # with a space to keep the sentences apart.
+            message = ("Bin widths are not constant. Assuming `density=True`. "
+                       "Specify `density` explicitly to silence this warning.")
+            warnings.warn(message, RuntimeWarning, stacklevel=2)
+            density = True
+        elif not density:
+            # Counts per bin (or unspecified with constant widths): convert
+            # to a density by dividing by the bin widths.
+            self._hpdf = self._hpdf / self._hbin_widths
+
+        # Normalise so that the histogram integrates to one.
+        self._hpdf = self._hpdf / float(np.sum(self._hpdf * self._hbin_widths))
+        self._hcdf = np.cumsum(self._hpdf * self._hbin_widths)
+        # Pad with zeros: the density is zero outside the outermost bin
+        # edges and the CDF starts at zero at the first edge.
+        self._hpdf = np.hstack([0.0, self._hpdf, 0.0])
+        self._hcdf = np.hstack([0.0, self._hcdf])
+        # Set support
+        kwargs['a'] = self.a = self._hbins[0]
+        kwargs['b'] = self.b = self._hbins[-1]
+        super().__init__(*args, **kwargs)
+
+    def _pdf(self, x):
+        """
+        PDF of the histogram
+        """
+        # searchsorted maps x to an index into the zero-padded density array.
+        return self._hpdf[np.searchsorted(self._hbins, x, side='right')]
+
+    def _cdf(self, x):
+        """
+        CDF calculated from the histogram
+        """
+        # Linear interpolation of the cumulative values at the bin edges.
+        return np.interp(x, self._hbins, self._hcdf)
+
+    def _ppf(self, x):
+        """
+        Percentile function calculated from the histogram
+        """
+        # Same interpolation as `_cdf` with the two axes swapped.
+        return np.interp(x, self._hcdf, self._hbins)
+
+    def _munp(self, n):
+        """Compute the n-th non-central moment."""
+        # Integral of x**n over each bin, weighted by that bin's density.
+        integrals = (self._hbins[1:]**(n+1) - self._hbins[:-1]**(n+1)) / (n+1)
+        return np.sum(self._hpdf[1:-1] * integrals)
+
+    def _entropy(self):
+        """Compute entropy of distribution"""
+        hpdf = self._hpdf[1:-1]
+        # Empty bins use the fill value 0.0 in place of log(0).
+        res = xpx.apply_where(hpdf > 0.0, hpdf, np.log, fill_value=0.0)
+        return -np.sum(hpdf * res * self._hbin_widths)
+
+    def _updated_ctor_param(self):
+        """
+        Set the histogram as additional constructor argument
+        """
+        dct = super()._updated_ctor_param()
+        dct['histogram'] = self._histogram
+        dct['density'] = self._density
+        return dct
+
+
+class studentized_range_gen(rv_continuous):
+    r"""A studentized range continuous random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    t: Student's t distribution
+
+    Notes
+    -----
+    The probability density function for `studentized_range` is:
+
+    .. math::
+
+         f(x; k, \nu) = \frac{k(k-1)\nu^{\nu/2}}{\Gamma(\nu/2)
+                        2^{\nu/2-1}} \int_{0}^{\infty} \int_{-\infty}^{\infty}
+                        s^{\nu} e^{-\nu s^2/2} \phi(z) \phi(sx + z)
+                        [\Phi(sx + z) - \Phi(z)]^{k-2} \,dz \,ds
+
+    for :math:`x ≥ 0`, :math:`k > 1`, and :math:`\nu > 0`.
+
+    `studentized_range` takes ``k`` for :math:`k` and ``df`` for :math:`\nu`
+    as shape parameters.
+
+    When :math:`\nu` exceeds 100,000, an asymptotic approximation (infinite
+    degrees of freedom) is used to compute the cumulative distribution
+    function [4]_ and probability distribution function.
+
+    %(after_notes)s
+
+    References
+    ----------
+
+    .. [1] "Studentized range distribution",
+           https://en.wikipedia.org/wiki/Studentized_range_distribution
+    .. [2] Batista, Ben Dêivide, et al. "Externally Studentized Normal Midrange
+           Distribution." Ciência e Agrotecnologia, vol. 41, no. 4, 2017, pp.
+           378-389., doi:10.1590/1413-70542017414047716.
+    .. [3] Harter, H. Leon. "Tables of Range and Studentized Range." The Annals
+           of Mathematical Statistics, vol. 31, no. 4, 1960, pp. 1122-1147.
+           JSTOR, www.jstor.org/stable/2237810. Accessed 18 Feb. 2021.
+    .. [4] Lund, R. E., and J. R. Lund. "Algorithm AS 190: Probabilities and
+           Upper Quantiles for the Studentized Range." Journal of the Royal
+           Statistical Society. Series C (Applied Statistics), vol. 32, no. 2,
+           1983, pp. 204-210. JSTOR, www.jstor.org/stable/2347300. Accessed 18
+           Feb. 2021.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import studentized_range
+    >>> import matplotlib.pyplot as plt
+    >>> fig, ax = plt.subplots(1, 1)
+
+    Display the probability density function (``pdf``):
+
+    >>> k, df = 3, 10
+    >>> x = np.linspace(studentized_range.ppf(0.01, k, df),
+    ...                 studentized_range.ppf(0.99, k, df), 100)
+    >>> ax.plot(x, studentized_range.pdf(x, k, df),
+    ...         'r-', lw=5, alpha=0.6, label='studentized_range pdf')
+
+    Alternatively, the distribution object can be called (as a function)
+    to fix the shape, location and scale parameters. This returns a "frozen"
+    RV object holding the given parameters fixed.
+
+    Freeze the distribution and display the frozen ``pdf``:
+
+    >>> rv = studentized_range(k, df)
+    >>> ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')
+
+    Check accuracy of ``cdf`` and ``ppf``:
+
+    >>> vals = studentized_range.ppf([0.001, 0.5, 0.999], k, df)
+    >>> np.allclose([0.001, 0.5, 0.999], studentized_range.cdf(vals, k, df))
+    True
+
+    Rather than using (``studentized_range.rvs``) to generate random variates,
+    which is very slow for this distribution, we can approximate the inverse
+    CDF using an interpolator, and then perform inverse transform sampling
+    with this approximate inverse CDF.
+
+    This distribution has an infinite but thin right tail, so we focus our
+    attention on the leftmost 99.9 percent.
+
+    >>> a, b = studentized_range.ppf([0, .999], k, df)
+    >>> a, b
+    0, 7.41058083802274
+
+    >>> from scipy.interpolate import interp1d
+    >>> rng = np.random.default_rng()
+    >>> xs = np.linspace(a, b, 50)
+    >>> cdf = studentized_range.cdf(xs, k, df)
+    >>> # Create an interpolant of the inverse CDF
+    >>> ppf = interp1d(cdf, xs, fill_value='extrapolate')
+    >>> # Perform inverse transform sampling using the interpolant
+    >>> r = ppf(rng.uniform(size=1000))
+
+    And compare the histogram:
+
+    >>> ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
+    >>> ax.legend(loc='best', frameon=False)
+    >>> plt.show()
+
+    """
+
+    def _argcheck(self, k, df):
+        # Valid shape parameters: k > 1 and df > 0.
+        return (k > 1) & (df > 0)
+
+    def _shape_info(self):
+        ik = _ShapeInfo("k", False, (1, np.inf), (False, False))
+        idf = _ShapeInfo("df", False, (0, np.inf), (False, False))
+        return [ik, idf]
+
+    def _fitstart(self, data):
+        # Default is k=1, but that is not a valid value of the parameter.
+        return super()._fitstart(data, args=(2, 1))
+
+    def _munp(self, K, k, df):
+        # K is the order of the moment; k and df are the shape parameters.
+        cython_symbol = '_studentized_range_moment'
+        _a, _b = self._get_support()
+        # all three of these are used to create a numpy array so they must
+        # be the same shape.
+
+        def _single_moment(K, k, df):
+            log_const = _stats._studentized_range_pdf_logconst(k, df)
+            arg = [K, k, df, log_const]
+            # The parameters are packed into a float array and handed to the
+            # compiled integrand as an opaque pointer.
+            usr_data = np.array(arg, float).ctypes.data_as(ctypes.c_void_p)
+
+            llc = LowLevelCallable.from_cython(_stats, cython_symbol, usr_data)
+
+            # One range per integration variable; the last one spans the
+            # support of the distribution.
+            ranges = [(-np.inf, np.inf), (0, np.inf), (_a, _b)]
+            opts = dict(epsabs=1e-11, epsrel=1e-12)
+
+            # nquad returns a tuple whose first entry is the integral value.
+            return integrate.nquad(llc, ranges=ranges, opts=opts)[0]
+
+        # frompyfunc applies the scalar routine elementwise; the object
+        # result is converted to float64 and unwrapped if 0-d.
+        ufunc = np.frompyfunc(_single_moment, 3, 1)
+        return np.asarray(ufunc(K, k, df), dtype=np.float64)[()]
+
+    def _pdf(self, x, k, df):
+
+        def _single_pdf(q, k, df):
+            # The infinite form of the PDF is derived from the infinite
+            # CDF.
+            if df < 100000:
+                # finite df: double integral
+                cython_symbol = '_studentized_range_pdf'
+                log_const = _stats._studentized_range_pdf_logconst(k, df)
+                arg = [q, k, df, log_const]
+                usr_data = np.array(arg, float).ctypes.data_as(ctypes.c_void_p)
+                ranges = [(-np.inf, np.inf), (0, np.inf)]
+
+            else:
+                # large df: asymptotic single integral
+                cython_symbol = '_studentized_range_pdf_asymptotic'
+                arg = [q, k]
+                usr_data = np.array(arg, float).ctypes.data_as(ctypes.c_void_p)
+                ranges = [(-np.inf, np.inf)]
+
+            llc = LowLevelCallable.from_cython(_stats, cython_symbol, usr_data)
+            opts = dict(epsabs=1e-11, epsrel=1e-12)
+            return integrate.nquad(llc, ranges=ranges, opts=opts)[0]
+
+        ufunc = np.frompyfunc(_single_pdf, 3, 1)
+        return np.asarray(ufunc(x, k, df), dtype=np.float64)[()]
+
+    def _cdf(self, x, k, df):
+
+        def _single_cdf(q, k, df):
+            # "When the degrees of freedom V are infinite the probability
+            # integral takes [on a] simpler form," and a single asymptotic
+            # integral is evaluated rather than the standard double integral.
+            # (Lund, Lund, page 205)
+            if df < 100000:
+                cython_symbol = '_studentized_range_cdf'
+                log_const = _stats._studentized_range_cdf_logconst(k, df)
+                arg = [q, k, df, log_const]
+                usr_data = np.array(arg, float).ctypes.data_as(ctypes.c_void_p)
+                ranges = [(-np.inf, np.inf), (0, np.inf)]
+
+            else:
+                cython_symbol = '_studentized_range_cdf_asymptotic'
+                arg = [q, k]
+                usr_data = np.array(arg, float).ctypes.data_as(ctypes.c_void_p)
+                ranges = [(-np.inf, np.inf)]
+
+            llc = LowLevelCallable.from_cython(_stats, cython_symbol, usr_data)
+            opts = dict(epsabs=1e-11, epsrel=1e-12)
+            return integrate.nquad(llc, ranges=ranges, opts=opts)[0]
+
+        ufunc = np.frompyfunc(_single_cdf, 3, 1)
+
+        # clip p-values to ensure they are in [0, 1].
+        return np.clip(np.asarray(ufunc(x, k, df), dtype=np.float64)[()], 0, 1)
+
+
+studentized_range = studentized_range_gen(name='studentized_range', a=0,
+                                          b=np.inf)
+
+
+class rel_breitwigner_gen(rv_continuous):
+    r"""A relativistic Breit-Wigner random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    cauchy: Cauchy distribution, also known as the Breit-Wigner distribution.
+
+    Notes
+    -----
+
+    The probability density function for `rel_breitwigner` is
+
+    .. math::
+
+        f(x, \rho) = \frac{k}{(x^2 - \rho^2)^2 + \rho^2}
+
+    where
+
+    .. math::
+        k = \frac{2\sqrt{2}\rho^2\sqrt{\rho^2 + 1}}
+            {\pi\sqrt{\rho^2 + \rho\sqrt{\rho^2 + 1}}}
+
+    The relativistic Breit-Wigner distribution is used in high energy physics
+    to model resonances [1]_. It gives the uncertainty in the invariant mass,
+    :math:`M` [2]_, of a resonance with characteristic mass :math:`M_0` and
+    decay-width :math:`\Gamma`, where :math:`M`, :math:`M_0` and :math:`\Gamma`
+    are expressed in natural units. In SciPy's parametrization, the shape
+    parameter :math:`\rho` is equal to :math:`M_0/\Gamma` and takes values in
+    :math:`(0, \infty)`.
+
+    Equivalently, the relativistic Breit-Wigner distribution is said to give
+    the uncertainty in the center-of-mass energy :math:`E_{\text{cm}}`. In
+    natural units, the speed of light :math:`c` is equal to 1 and the invariant
+    mass :math:`M` is equal to the rest energy :math:`Mc^2`. In the
+    center-of-mass frame, the rest energy is equal to the total energy [3]_.
+
+    %(after_notes)s
+
+    :math:`\rho = M_0/\Gamma` and :math:`\Gamma` is the scale parameter. For
+    example, if one seeks to model the :math:`Z^0` boson with :math:`M_0
+    \approx 91.1876 \text{ GeV}` and :math:`\Gamma \approx 2.4952\text{ GeV}`
+    [4]_ one can set ``rho=91.1876/2.4952`` and ``scale=2.4952``.
+
+    To ensure a physically meaningful result when using the `fit` method, one
+    should set ``floc=0`` to fix the location parameter to 0.
+
+    References
+    ----------
+    .. [1] Relativistic Breit-Wigner distribution, Wikipedia,
+           https://en.wikipedia.org/wiki/Relativistic_Breit-Wigner_distribution
+    .. [2] Invariant mass, Wikipedia,
+           https://en.wikipedia.org/wiki/Invariant_mass
+    .. [3] Center-of-momentum frame, Wikipedia,
+           https://en.wikipedia.org/wiki/Center-of-momentum_frame
+    .. [4] M. Tanabashi et al. (Particle Data Group) Phys. Rev. D 98, 030001 -
+           Published 17 August 2018
+
+    %(example)s
+
+    """
+    def _argcheck(self, rho):
+        return rho > 0
+
+    def _shape_info(self):
+        return [_ShapeInfo("rho", False, (0, np.inf), (False, False))]
+
+    def _pdf(self, x, rho):
+        # C = k / rho**2
+        C = np.sqrt(
+            2 * (1 + 1/rho**2) / (1 + np.sqrt(1 + 1/rho**2))
+        ) * 2 / np.pi
+        with np.errstate(over='ignore'):
+            return C / (((x - rho)*(x + rho)/rho)**2 + 1)
+
+    def _cdf(self, x, rho):
+        # C = k / (2 * rho**2) / np.sqrt(1 + 1/rho**2)
+        C = np.sqrt(2/(1 + np.sqrt(1 + 1/rho**2)))/np.pi
+        result = (
+            np.sqrt(-1 + 1j/rho)
+            * np.arctan(x/np.sqrt(-rho*(rho + 1j)))
+        )
+        result = C * 2 * np.imag(result)
+        # Sometimes above formula produces values greater than 1.
+        return np.clip(result, None, 1)
+
+    def _munp(self, n, rho):
+        if n == 0:
+            return 1.
+        if n == 1:
+            # C = k / (2 * rho)
+            C = np.sqrt(
+                2 * (1 + 1/rho**2) / (1 + np.sqrt(1 + 1/rho**2))
+            ) / np.pi * rho
+            return C * (np.pi/2 + np.arctan(rho))
+        if n == 2:
+            # C = pi * k / (4 * rho)
+            C = np.sqrt(
+                (1 + 1/rho**2) / (2 * (1 + np.sqrt(1 + 1/rho**2)))
+            ) * rho
+            result = (1 - rho * 1j) / np.sqrt(-1 - 1j/rho)
+            return 2 * C * np.real(result)
+        else:
+            return np.inf
+
+    def _stats(self, rho):
+        # Returning None from stats makes public stats use _munp.
+        # nan values will be omitted from public stats. Skew and
+        # kurtosis are actually infinite.
+        return None, None, np.nan, np.nan
+
+    @inherit_docstring_from(rv_continuous)
+    def fit(self, data, *args, **kwds):
+        # Override rv_continuous.fit to better handle case where floc is set.
+        data, _, floc, fscale = _check_fit_input_parameters(
+            self, data, args, kwds
+        )
+
+        censored = isinstance(data, CensoredData)
+        if censored:
+            if data.num_censored() == 0:
+                # There are no censored values in data, so replace the
+                # CensoredData instance with a regular array.
+                data = data._uncensored
+                censored = False
+
+        if floc is None or censored:
+            return super().fit(data, *args, **kwds)
+
+        if fscale is None:
+            # The interquartile range approximates the scale parameter gamma.
+            # The median approximates rho * gamma.
+            p25, p50, p75 = np.quantile(data - floc, [0.25, 0.5, 0.75])
+            scale_0 = p75 - p25
+            rho_0 = p50 / scale_0
+            if not args:
+                args = [rho_0]
+            if "scale" not in kwds:
+                kwds["scale"] = scale_0
+        else:
+            M_0 = np.median(data - floc)
+            rho_0 = M_0 / fscale
+            if not args:
+                args = [rho_0]
+        return super().fit(data, *args, **kwds)
+
+
+rel_breitwigner = rel_breitwigner_gen(a=0.0, name="rel_breitwigner")
+
+
+# Collect names of classes and objects in this module.
+pairs = list(globals().copy().items())
+_distn_names, _distn_gen_names = get_distribution_names(pairs, rv_continuous)
+
+__all__ = _distn_names + _distn_gen_names + ['rv_histogram']
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_correlation.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_correlation.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b6d4d51418c2a0e6a97d1b577cb73d58d203b19
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_correlation.py
@@ -0,0 +1,385 @@
+import math
+from scipy import stats
+from scipy._lib._array_api import (xp_capabilities, array_namespace, xp_promote,
+                                   xp_result_type)
+from scipy.stats._stats_py import _SimpleNormal, SignificanceResult, _get_pvalue
+from scipy.stats._axis_nan_policy import _axis_nan_policy_factory
+
+
+__all__ = ['chatterjeexi', 'spearmanrho']
+
+
+def _xi_statistic(x, y, y_continuous, xp):
+    # Compute xi correlation statistic
+
+    # `axis=-1` is guaranteed by _axis_nan_policy decorator
+    n = x.shape[-1]
+
+    # "Rearrange the data as (X(1), Y(1)), . . . ,(X(n), Y(n))
+    # such that X(1) ≤ ··· ≤ X(n)"
+    j = xp.argsort(x, axis=-1)
+    j, y = xp.broadcast_arrays(j, y)
+    y = xp.take_along_axis(y, j, axis=-1)
+
+    # "Let ri be the rank of Y(i), that is, the number of j such that Y(j) ≤ Y(i)"
+    r = stats.rankdata(y, method='max', axis=-1)
+    # " additionally define li to be the number of j such that Y(j) ≥ Y(i)"
+    # Could probably compute this from r, but that can be an enhancement
+    l = stats.rankdata(-y, method='max', axis=-1)
+    r, l = xp.astype(r, x.dtype), xp.astype(l, x.dtype)
+
+    num = xp.sum(xp.abs(xp.diff(r, axis=-1)), axis=-1)
+    if y_continuous:  # [1] Eq. 1.1
+        statistic = 1 - 3 * num / (n ** 2 - 1)
+    else:  # [1] Eq. 1.2
+        den = 2 * xp.sum((n - l) * l, axis=-1)
+        statistic = 1 - n * num / den
+
+    return statistic, r, l
+
+
+def _xi_std(r, l, y_continuous, xp):
+    # Compute asymptotic standard deviation of xi under null hypothesis of independence
+
+    # `axis=-1` is guaranteed by _axis_nan_policy decorator
+    n = r.shape[-1]
+
+    # "Suppose that X and Y are independent and Y is continuous. Then
+    # √n·ξn(X, Y) → N(0, 2/5) in distribution as n → ∞"
+    if y_continuous:  # [1] Theorem 2.1
+        return xp.asarray(math.sqrt(2 / 5) / math.sqrt(n), dtype=r.dtype)
+
+    # "Suppose that X and Y are independent. Then √n·ξn(X, Y)
+    # converges to N(0, τ²) in distribution as n → ∞
+    # [1] Eq. 2.2 and surrounding math
+    i = xp.arange(1, n + 1, dtype=r.dtype)
+    u = xp.sort(r, axis=-1)
+    v = xp.cumulative_sum(u, axis=-1)
+    an = 1 / n**4 * xp.sum((2*n - 2*i + 1) * u**2, axis=-1)
+    bn = 1 / n**5 * xp.sum((v + (n - i)*u)**2, axis=-1)
+    cn = 1 / n**3 * xp.sum((2*n - 2*i + 1) * u, axis=-1)
+    dn = 1 / n**3 * xp.sum((l * (n - l)), axis=-1)
+    tau2 = (an - 2*bn + cn**2) / dn**2
+
+    return xp.sqrt(tau2) / math.sqrt(n)
+
+
+def _chatterjeexi_iv(y_continuous, method):
+    # Input validation for `chatterjeexi`
+    # x, y, `axis` input validation taken care of by decorator
+
+    if y_continuous not in {True, False}:
+        raise ValueError('`y_continuous` must be boolean.')
+
+    if not isinstance(method, stats.PermutationMethod):
+        method = method.lower()
+        message = "`method` must be 'asymptotic' or a `PermutationMethod` instance."
+        if method != 'asymptotic':
+            raise ValueError(message)
+
+    return y_continuous, method
+
+
+def _unpack(res, _):
+    return res.statistic, res.pvalue
+
+
+@xp_capabilities(skip_backends=[('dask.array', 'no take_along_axis'),
+                                ('cupy', 'no rankdata (xp.repeats limitation)')])
+@_axis_nan_policy_factory(SignificanceResult, paired=True, n_samples=2,
+                          result_to_tuple=_unpack, n_outputs=2, too_small=1)
+def chatterjeexi(x, y, *, axis=0, y_continuous=False, method='asymptotic'):
+    r"""Compute the xi correlation and perform a test of independence
+
+    The xi correlation coefficient is a measure of association between two
+    variables; the value tends to be close to zero when the variables are
+    independent and close to 1 when there is a strong association. Unlike
+    other correlation coefficients, the xi correlation is effective even
+    when the association is not monotonic.
+
+    Parameters
+    ----------
+    x, y : array-like
+        The samples: corresponding observations of the independent and
+        dependent variable. The (N-d) arrays must be broadcastable.
+    axis : int, default: 0
+        Axis along which to perform the test.
+    method : 'asymptotic' or `PermutationMethod` instance, optional
+        Selects the method used to calculate the *p*-value.
+        Default is 'asymptotic'. The following options are available.
+
+        * ``'asymptotic'``: compares the standardized test statistic
+          against the normal distribution.
+        * `PermutationMethod` instance. In this case, the p-value
+          is computed using `permutation_test` with the provided
+          configuration options and other appropriate settings.
+
+    y_continuous : bool, default: False
+        Whether `y` is assumed to be drawn from a continuous distribution.
+        If `y` is drawn from a continuous distribution, results are valid
+        whether this is assumed or not, but enabling this assumption will
+        result in faster computation and typically produce similar results.
+
+    Returns
+    -------
+    res : SignificanceResult
+        An object containing attributes:
+
+        statistic : float
+            The xi correlation statistic.
+        pvalue : float
+            The associated *p*-value: the probability of a statistic at least as
+            high as the observed value under the null hypothesis of independence.
+
+    See Also
+    --------
+    scipy.stats.pearsonr, scipy.stats.spearmanr, scipy.stats.kendalltau
+
+    Notes
+    -----
+    There is currently no special handling of ties in `x`; they are broken arbitrarily
+    by the implementation. [1]_ recommends: "if there are ties among the Xi's, then
+    choose an increasing rearrangement as above by breaking ties uniformly at random."
+    This is easily accomplished by adding a small amount of random noise to `x`; see
+    examples.
+
+    [1]_ notes that the statistic is not symmetric in `x` and `y` *by design*:
+    "...we may want to understand if :math:`Y` is a function of :math:`X`, and not
+    just if one of the variables is a function of the other." See [1]_ Remark 1.
+
+    References
+    ----------
+    .. [1] Chatterjee, Sourav. "A new coefficient of correlation." Journal of
+           the American Statistical Association 116.536 (2021): 2009-2022.
+           :doi:`10.1080/01621459.2020.1758115`.
+
+    Examples
+    --------
+    Generate perfectly correlated data, and observe that the xi correlation is
+    nearly 1.0.
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng(348932549825235)
+    >>> x = rng.uniform(0, 10, size=100)
+    >>> y = np.sin(x)
+    >>> res = stats.chatterjeexi(x, y)
+    >>> res.statistic
+    np.float64(0.9012901290129013)
+
+    The probability of observing such a high value of the statistic under the
+    null hypothesis of independence is very low.
+
+    >>> res.pvalue
+    np.float64(2.2206974648177804e-46)
+
+    As noise is introduced, the correlation coefficient decreases.
+
+    >>> noise = rng.normal(scale=[[0.1], [0.5], [1]], size=(3, 100))
+    >>> res = stats.chatterjeexi(x, y + noise, axis=-1)
+    >>> res.statistic
+    array([0.79507951, 0.41824182, 0.16651665])
+
+    Because the distribution of `y` is continuous, it is valid to pass
+    ``y_continuous=True``. The statistic is identical, and the p-value
+    (not shown) is only slightly different.
+
+    >>> stats.chatterjeexi(x, y + noise, y_continuous=True, axis=-1).statistic
+    array([0.79507951, 0.41824182, 0.16651665])
+
+    Consider a case in which there are ties in `x`.
+
+    >>> x = rng.integers(10, size=1000)
+    >>> y = rng.integers(10, size=1000)
+
+    [1]_ recommends breaking the ties uniformly at random.
+
+    >>> d = rng.uniform(1e-5, size=x.size)
+    >>> res = stats.chatterjeexi(x + d, y)
+    >>> res.statistic
+    -0.029919991638798438
+
+    Since this gives a randomized estimate of the statistic, [1]_ also suggests
+    considering the average over all possibilities of breaking ties. This is
+    computationally infeasible when there are many ties, but a randomized estimate of
+    *this* quantity can be obtained by considering many random possibilities of breaking
+    ties.
+
+    >>> d = rng.uniform(1e-5, size=(9999, x.size))
+    >>> res = stats.chatterjeexi(x + d, y, axis=1)
+    >>> np.mean(res.statistic)
+    0.001186895213756626
+
+    """
+    xp = array_namespace(x, y)
+
+    # x, y, `axis` input validation taken care of by decorator
+    # In fact, `axis` is guaranteed to be -1
+    y_continuous, method = _chatterjeexi_iv(y_continuous, method)
+    x, y = xp_promote(x, y, force_floating=True, xp=xp)
+
+    # A highly negative statistic is possible, e.g.
+    # x = np.arange(100.), y = (x % 2 == 0)
+    # Unclear whether we should expose `alternative`, though.
+    alternative = 'greater'
+
+    if method == 'asymptotic':
+        xi, r, l = _xi_statistic(x, y, y_continuous, xp=xp)
+        std = _xi_std(r, l, y_continuous, xp=xp)
+        norm = _SimpleNormal()
+        pvalue = _get_pvalue(xi / std, norm, alternative=alternative, xp=xp)
+    elif isinstance(method, stats.PermutationMethod):
+        res = stats.permutation_test(
+            # Could be faster if we just permuted the ranks; for now, keep it simple.
+            data=(y,),
+            statistic=lambda y, axis: _xi_statistic(x, y, y_continuous, xp=xp)[0],
+            alternative=alternative, permutation_type='pairings', **method._asdict(),
+            axis=-1)  # `axis=-1` is guaranteed by _axis_nan_policy decorator
+
+        xi, pvalue = res.statistic, res.pvalue
+
+    xi = xi[()] if xi.ndim == 0 else xi
+    pvalue = pvalue[()] if pvalue.ndim == 0 else pvalue
+    return SignificanceResult(xi, pvalue)
+
+
+@xp_capabilities(cpu_only=True, exceptions=['jax.numpy'],
+    skip_backends=[('dask.array', 'not supported by rankdata (take_along_axis)')]
+)
+@_axis_nan_policy_factory(SignificanceResult, paired=True, n_samples=2,
+                          result_to_tuple=_unpack, n_outputs=2, too_small=1)
+def spearmanrho(x, y, /, *, alternative='two-sided', method=None, axis=0):
+    r"""Calculate a Spearman rho correlation coefficient with associated p-value.
+
+    The Spearman rank-order correlation coefficient is a nonparametric measure
+    of the monotonicity of the relationship between two datasets.
+    Like other correlation coefficients, it varies between -1 and +1 with 0
+    implying no correlation. Coefficients of -1 or +1 are associated with an exact
+    monotonic relationship.  Positive correlations indicate that as `x` increases,
+    so does `y`; negative correlations indicate that as `x` increases, `y` decreases.
+    The p-value is the probability of an uncorrelated system producing datasets
+    with a Spearman correlation at least as extreme as the one computed from the
+    observed dataset.
+
+    Parameters
+    ----------
+    x, y : array-like
+        The samples: corresponding observations of the independent and
+        dependent variable. The (N-d) arrays must be broadcastable.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis. Default is 'two-sided'.
+        The following options are available:
+
+        * 'two-sided': the correlation is nonzero
+        * 'less': the correlation is negative (less than zero)
+        * 'greater':  the correlation is positive (greater than zero)
+
+    method : ResamplingMethod, optional
+        Defines the method used to compute the p-value. If `method` is an
+        instance of `PermutationMethod`/`MonteCarloMethod`, the p-value is
+        computed using
+        `scipy.stats.permutation_test`/`scipy.stats.monte_carlo_test` with the
+        provided configuration options and other appropriate settings.
+        Otherwise, the p-value is computed using an asymptotic approximation of
+        the null distribution.
+    axis : int or None, optional
+        If axis=0 (default), then each column represents a variable, with
+        observations in the rows. If axis=1, the relationship is transposed:
+        each row represents a variable, while the columns contain observations.
+        If axis=None, then both arrays will be raveled.
+        Like other `scipy.stats` functions, `axis` is interpreted after the
+        arrays are broadcasted.
+
+    Returns
+    -------
+    res : SignificanceResult
+        An object containing attributes:
+
+        statistic : floating point array or NumPy scalar
+            Spearman correlation coefficient
+        pvalue : floating point array or NumPy scalar
+            The p-value - the probability of realizing such an extreme statistic
+            value under the null hypothesis that two samples have no ordinal
+            correlation. See `alternative` above for alternative hypotheses.
+
+    Warns
+    -----
+    `~scipy.stats.ConstantInputWarning`
+        Raised if an input is a constant array.  The correlation coefficient
+        is not defined in this case, so ``np.nan`` is returned.
+
+    Notes
+    -----
+    `spearmanrho` was created to make improvements to SciPy's implementation of
+    the Spearman correlation test without making backward-incompatible changes
+    to `spearmanr`. Advantages of `spearmanrho` over `spearmanr` include:
+
+    - `spearmanrho` follows standard array broadcasting rules.
+    - `spearmanrho` is compatible with some non-NumPy arrays.
+    - `spearmanrho` can compute exact p-values, even in the presence of ties,
+      when an appropriate instance of `PermutationMethod` is provided via the
+      `method` argument.
+
+    References
+    ----------
+    .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
+       Probability and Statistics Tables and Formulae. Chapman & Hall: New
+       York. 2000.
+       Section  14.7
+    .. [2] Kendall, M. G. and Stuart, A. (1973).
+       The Advanced Theory of Statistics, Volume 2: Inference and Relationship.
+       Griffin. 1973.
+       Section 31.18
+
+    Examples
+    --------
+    Univariate samples, approximate p-value.
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> x = [1, 2, 3, 4, 5]
+    >>> y = [5, 6, 7, 8, 7]
+    >>> res = stats.spearmanrho(x, y)
+    >>> res.statistic
+    np.float64(0.8207826816681233)
+    >>> res.pvalue
+    np.float64(0.08858700531354405)
+
+    Univariate samples, exact p-value.
+
+    >>> res = stats.spearmanrho(x, y, method=stats.PermutationMethod())
+    >>> res.statistic
+    np.float64(0.8207826816681233)
+    >>> res.pvalue
+    np.float64(0.13333333333333333)
+
+    Batch of univariate samples, one vectorized call.
+
+    >>> rng = np.random.default_rng(98145152315484)
+    >>> x2 = rng.standard_normal((2, 100))
+    >>> y2 = rng.standard_normal((2, 100))
+    >>> res = stats.spearmanrho(x2, y2, axis=-1)
+    >>> res.statistic
+    array([ 0.16585659, -0.12151215])
+    >>> res.pvalue
+    array([0.0991155 , 0.22846869])
+
+    Bivariate samples using standard broadcasting rules.
+
+    >>> res = stats.spearmanrho(x2[np.newaxis, :], x2[:, np.newaxis], axis=-1)
+    >>> res.statistic
+    array([[ 1.        , -0.14670267],
+           [-0.14670267,  1.        ]])
+    >>> res.pvalue
+    array([[0.        , 0.14526128],
+           [0.14526128, 0.        ]])
+
+    """
+    xp = array_namespace(x, y)
+    dtype = xp_result_type(x, y, force_floating=True, xp=xp)
+    rx = stats.rankdata(x, axis=axis)
+    ry = stats.rankdata(y, axis=axis)
+    rx = xp.astype(rx, dtype, copy=False)
+    ry = xp.astype(ry, dtype, copy=False)
+    res = stats.pearsonr(rx, ry, method=method, alternative=alternative, axis=axis)
+    return SignificanceResult(res.statistic, res.pvalue)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_covariance.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_covariance.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b5e81b4071d9a4973f29ec5630f0282deaeda2a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_covariance.py
@@ -0,0 +1,649 @@
+from functools import cached_property
+from types import GenericAlias
+
+import numpy as np
+from scipy import linalg
+from scipy.stats import _multivariate
+
+
+__all__ = ["Covariance"]
+
+
+class Covariance:
+    """
+    Representation of a covariance matrix
+
+    Calculations involving covariance matrices (e.g. data whitening,
+    multivariate normal function evaluation) are often performed more
+    efficiently using a decomposition of the covariance matrix instead of the
+    covariance matrix itself. This class allows the user to construct an
+    object representing a covariance matrix using any of several
+    decompositions and perform calculations using a common interface.
+
+    .. note::
+
+        The `Covariance` class cannot be instantiated directly. Instead, use
+        one of the factory methods (e.g. `Covariance.from_diagonal`).
+
+    Examples
+    --------
+    The `Covariance` class is used by calling one of its
+    factory methods to create a `Covariance` object, then passing that
+    representation of the `Covariance` matrix as a shape parameter of a
+    multivariate distribution.
+
+    For instance, the multivariate normal distribution can accept an array
+    representing a covariance matrix:
+
+    >>> from scipy import stats
+    >>> import numpy as np
+    >>> d = [1, 2, 3]
+    >>> A = np.diag(d)  # a diagonal covariance matrix
+    >>> x = [4, -2, 5]  # a point of interest
+    >>> dist = stats.multivariate_normal(mean=[0, 0, 0], cov=A)
+    >>> dist.pdf(x)
+    4.9595685102808205e-08
+
+    but the calculations are performed in a very generic way that does not
+    take advantage of any special properties of the covariance matrix. Because
+    our covariance matrix is diagonal, we can use ``Covariance.from_diagonal``
+    to create an object representing the covariance matrix, and
+    `multivariate_normal` can use this to compute the probability density
+    function more efficiently.
+
+    >>> cov = stats.Covariance.from_diagonal(d)
+    >>> dist = stats.multivariate_normal(mean=[0, 0, 0], cov=cov)
+    >>> dist.pdf(x)
+    4.9595685102808205e-08
+
+    """
+
+    # generic type compatibility with scipy-stubs
+    __class_getitem__ = classmethod(GenericAlias)
+
+    def __init__(self):
+        message = ("The `Covariance` class cannot be instantiated directly. "
+                   "Please use one of the factory methods "
+                   "(e.g. `Covariance.from_diagonal`).")
+        raise NotImplementedError(message)
+
+    @staticmethod
+    def from_diagonal(diagonal):
+        r"""
+        Return a representation of a covariance matrix from its diagonal.
+
+        Parameters
+        ----------
+        diagonal : array_like
+            The diagonal elements of a diagonal matrix.
+
+        Notes
+        -----
+        Let the diagonal elements of a diagonal covariance matrix :math:`D` be
+        stored in the vector :math:`d`.
+
+        When all elements of :math:`d` are strictly positive, whitening of a
+        data point :math:`x` is performed by computing
+        :math:`x \cdot d^{-1/2}`, where the inverse square root can be taken
+        element-wise.
+        :math:`\log\det{D}` is calculated as :math:`\sum(\log{d})`,
+        where the :math:`\log` operation is performed element-wise.
+
+        This `Covariance` class supports singular covariance matrices. When
+        computing ``_log_pdet``, non-positive elements of :math:`d` are
+        ignored. Whitening is not well defined when the point to be whitened
+        does not lie in the span of the columns of the covariance matrix. The
+        convention taken here is to treat the inverse square root of
+        non-positive elements of :math:`d` as zeros.
+
+        Examples
+        --------
+        Prepare a symmetric positive definite covariance matrix ``A`` and a
+        data point ``x``.
+
+        >>> import numpy as np
+        >>> from scipy import stats
+        >>> rng = np.random.default_rng()
+        >>> n = 5
+        >>> A = np.diag(rng.random(n))
+        >>> x = rng.random(size=n)
+
+        Extract the diagonal from ``A`` and create the `Covariance` object.
+
+        >>> d = np.diag(A)
+        >>> cov = stats.Covariance.from_diagonal(d)
+
+        Compare the functionality of the `Covariance` object against
+        reference implementations.
+
+        >>> res = cov.whiten(x)
+        >>> ref = np.diag(d**-0.5) @ x
+        >>> np.allclose(res, ref)
+        True
+        >>> res = cov.log_pdet
+        >>> ref = np.linalg.slogdet(A)[-1]
+        >>> np.allclose(res, ref)
+        True
+
+        """
+        return CovViaDiagonal(diagonal)
+
+    @staticmethod
+    def from_precision(precision, covariance=None):
+        r"""
+        Return a representation of a covariance from its precision matrix.
+
+        Parameters
+        ----------
+        precision : array_like
+            The precision matrix; that is, the inverse of a square, symmetric,
+            positive definite covariance matrix.
+        covariance : array_like, optional
+            The square, symmetric, positive definite covariance matrix. If not
+            provided, this may need to be calculated (e.g. to evaluate the
+            cumulative distribution function of
+            `scipy.stats.multivariate_normal`) by inverting `precision`.
+
+        Notes
+        -----
+        Let the covariance matrix be :math:`A`, its precision matrix be
+        :math:`P = A^{-1}`, and :math:`L` be the lower Cholesky factor such
+        that :math:`L L^T = P`.
+        Whitening of a data point :math:`x` is performed by computing
+        :math:`x^T L`. :math:`\log\det{A}` is calculated as
+        :math:`-2tr(\log{L})`, where the :math:`\log` operation is performed
+        element-wise.
+
+        This `Covariance` class does not support singular covariance matrices
+        because the precision matrix does not exist for a singular covariance
+        matrix.
+
+        Examples
+        --------
+        Prepare a symmetric positive definite precision matrix ``P`` and a
+        data point ``x``. (If the precision matrix is not already available,
+        consider the other factory methods of the `Covariance` class.)
+
+        >>> import numpy as np
+        >>> from scipy import stats
+        >>> rng = np.random.default_rng()
+        >>> n = 5
+        >>> P = rng.random(size=(n, n))
+        >>> P = P @ P.T  # a precision matrix must be positive definite
+        >>> x = rng.random(size=n)
+
+        Create the `Covariance` object.
+
+        >>> cov = stats.Covariance.from_precision(P)
+
+        Compare the functionality of the `Covariance` object against
+        reference implementations.
+
+        >>> res = cov.whiten(x)
+        >>> ref = x @ np.linalg.cholesky(P)
+        >>> np.allclose(res, ref)
+        True
+        >>> res = cov.log_pdet
+        >>> ref = -np.linalg.slogdet(P)[-1]
+        >>> np.allclose(res, ref)
+        True
+
+        """
+        return CovViaPrecision(precision, covariance)
+
+    @staticmethod
+    def from_cholesky(cholesky):
+        r"""
+        Representation of a covariance provided via the (lower) Cholesky factor
+
+        Parameters
+        ----------
+        cholesky : array_like
+            The lower triangular Cholesky factor of the covariance matrix.
+
+        Notes
+        -----
+        Let the covariance matrix be :math:`A` and :math:`L` be the lower
+        Cholesky factor such that :math:`L L^T = A`.
+        Whitening of a data point :math:`x` is performed by computing
+        :math:`L^{-1} x`. :math:`\log\det{A}` is calculated as
+        :math:`2tr(\log{L})`, where the :math:`\log` operation is performed
+        element-wise.
+
+        This `Covariance` class does not support singular covariance matrices
+        because the Cholesky decomposition does not exist for a singular
+        covariance matrix.
+
+        Examples
+        --------
+        Prepare a symmetric positive definite covariance matrix ``A`` and a
+        data point ``x``.
+
+        >>> import numpy as np
+        >>> from scipy import stats
+        >>> rng = np.random.default_rng()
+        >>> n = 5
+        >>> A = rng.random(size=(n, n))
+        >>> A = A @ A.T  # make the covariance symmetric positive definite
+        >>> x = rng.random(size=n)
+
+        Perform the Cholesky decomposition of ``A`` and create the
+        `Covariance` object.
+
+        >>> L = np.linalg.cholesky(A)
+        >>> cov = stats.Covariance.from_cholesky(L)
+
+        Compare the functionality of the `Covariance` object against
+        reference implementations.
+
+        >>> from scipy.linalg import solve_triangular
+        >>> res = cov.whiten(x)
+        >>> ref = solve_triangular(L, x, lower=True)
+        >>> np.allclose(res, ref)
+        True
+        >>> res = cov.log_pdet
+        >>> ref = np.linalg.slogdet(A)[-1]
+        >>> np.allclose(res, ref)
+        True
+
+        """
+        return CovViaCholesky(cholesky)
+
+    @staticmethod
+    def from_eigendecomposition(eigendecomposition):
+        r"""
+        Representation of a covariance provided via eigendecomposition
+
+        Parameters
+        ----------
+        eigendecomposition : sequence
+            A sequence (nominally a tuple) containing the eigenvalue and
+            eigenvector arrays as computed by `scipy.linalg.eigh` or
+            `numpy.linalg.eigh`.
+
+        Notes
+        -----
+        Let the covariance matrix be :math:`A`, let :math:`V` be the matrix of
+        eigenvectors, and let :math:`W` be the diagonal matrix of eigenvalues
+        such that :math:`V W V^T = A`.
+
+        When all of the eigenvalues are strictly positive, whitening of a
+        data point :math:`x` is performed by computing
+        :math:`x^T (V W^{-1/2})`, where the inverse square root can be taken
+        element-wise.
+        :math:`\log\det{A}` is calculated as :math:`tr(\log{W})`,
+        where the :math:`\log` operation is performed element-wise.
+
+        This `Covariance` class supports singular covariance matrices. When
+        computing ``_log_pdet``, non-positive eigenvalues are ignored.
+        Whitening is not well defined when the point to be whitened
+        does not lie in the span of the columns of the covariance matrix. The
+        convention taken here is to treat the inverse square root of
+        non-positive eigenvalues as zeros.
+
+        Examples
+        --------
+        Prepare a symmetric positive definite covariance matrix ``A`` and a
+        data point ``x``.
+
+        >>> import numpy as np
+        >>> from scipy import stats
+        >>> rng = np.random.default_rng()
+        >>> n = 5
+        >>> A = rng.random(size=(n, n))
+        >>> A = A @ A.T  # make the covariance symmetric positive definite
+        >>> x = rng.random(size=n)
+
+        Perform the eigendecomposition of ``A`` and create the `Covariance`
+        object.
+
+        >>> w, v = np.linalg.eigh(A)
+        >>> cov = stats.Covariance.from_eigendecomposition((w, v))
+
+        Compare the functionality of the `Covariance` object against
+        reference implementations.
+
+        >>> res = cov.whiten(x)
+        >>> ref = x @ (v @ np.diag(w**-0.5))
+        >>> np.allclose(res, ref)
+        True
+        >>> res = cov.log_pdet
+        >>> ref = np.linalg.slogdet(A)[-1]
+        >>> np.allclose(res, ref)
+        True
+
+        """
+        return CovViaEigendecomposition(eigendecomposition)
+
+    def whiten(self, x):
+        """
+        Perform a whitening transformation on data.
+
+        "Whitening" ("white" as in "white noise", in which each frequency has
+        equal magnitude) transforms a set of random variables into a new set of
+        random variables with unit-diagonal covariance. When a whitening
+        transform is applied to a sample of points distributed according to
+        a multivariate normal distribution with zero mean, the covariance of
+        the transformed sample is approximately the identity matrix.
+
+        Parameters
+        ----------
+        x : array_like
+            An array of points. The last dimension must correspond with the
+            dimensionality of the space, i.e., the number of columns in the
+            covariance matrix.
+
+        Returns
+        -------
+        x_ : array_like
+            The transformed array of points.
+
+        References
+        ----------
+        .. [1] "Whitening Transformation". Wikipedia.
+               https://en.wikipedia.org/wiki/Whitening_transformation
+        .. [2] Novak, Lukas, and Miroslav Vorechovsky. "Generalization of
+               coloring linear transformation". Transactions of VSB 18.2
+               (2018): 31-35. :doi:`10.31490/tces-2018-0013`
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> from scipy import stats
+        >>> rng = np.random.default_rng()
+        >>> n = 3
+        >>> A = rng.random(size=(n, n))
+        >>> cov_array = A @ A.T  # make matrix symmetric positive definite
+        >>> precision = np.linalg.inv(cov_array)
+        >>> cov_object = stats.Covariance.from_precision(precision)
+        >>> x = rng.multivariate_normal(np.zeros(n), cov_array, size=(10000))
+        >>> x_ = cov_object.whiten(x)
+        >>> np.cov(x_, rowvar=False)  # near-identity covariance
+        array([[0.97862122, 0.00893147, 0.02430451],
+               [0.00893147, 0.96719062, 0.02201312],
+               [0.02430451, 0.02201312, 0.99206881]])
+
+        """
+        return self._whiten(np.asarray(x))
+
+    def colorize(self, x):
+        """
+        Perform a colorizing transformation on data.
+
+        "Colorizing" ("color" as in "colored noise", in which different
+        frequencies may have different magnitudes) transforms a set of
+        uncorrelated random variables into a new set of random variables with
+        the desired covariance. When a coloring transform is applied to a
+        sample of points distributed according to a multivariate normal
+        distribution with identity covariance and zero mean, the covariance of
+        the transformed sample is approximately the covariance matrix used
+        in the coloring transform.
+
+        Parameters
+        ----------
+        x : array_like
+            An array of points. The last dimension must correspond with the
+            dimensionality of the space, i.e., the number of columns in the
+            covariance matrix.
+
+        Returns
+        -------
+        x_ : array_like
+            The transformed array of points.
+
+        References
+        ----------
+        .. [1] "Whitening Transformation". Wikipedia.
+               https://en.wikipedia.org/wiki/Whitening_transformation
+        .. [2] Novak, Lukas, and Miroslav Vorechovsky. "Generalization of
+               coloring linear transformation". Transactions of VSB 18.2
+               (2018): 31-35. :doi:`10.31490/tces-2018-0013`
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> from scipy import stats
+        >>> rng = np.random.default_rng(1638083107694713882823079058616272161)
+        >>> n = 3
+        >>> A = rng.random(size=(n, n))
+        >>> cov_array = A @ A.T  # make matrix symmetric positive definite
+        >>> cholesky = np.linalg.cholesky(cov_array)
+        >>> cov_object = stats.Covariance.from_cholesky(cholesky)
+        >>> x = rng.multivariate_normal(np.zeros(n), np.eye(n), size=(10000))
+        >>> x_ = cov_object.colorize(x)
+        >>> cov_data = np.cov(x_, rowvar=False)
+        >>> np.allclose(cov_data, cov_array, rtol=3e-2)
+        True
+        """
+        return self._colorize(np.asarray(x))
+
+    @property
+    def log_pdet(self):
+        """
+        Log of the pseudo-determinant of the covariance matrix
+        """
+        return np.array(self._log_pdet, dtype=float)[()]
+
+    @property
+    def rank(self):
+        """
+        Rank of the covariance matrix
+        """
+        return np.array(self._rank, dtype=int)[()]
+
+    @property
+    def covariance(self):
+        """
+        Explicit representation of the covariance matrix
+        """
+        return self._covariance
+
+    @property
+    def shape(self):
+        """
+        Shape of the covariance array
+        """
+        return self._shape
+
+    def _validate_matrix(self, A, name):
+        A = np.atleast_2d(A)
+        m, n = A.shape[-2:]
+        if m != n or A.ndim != 2 or not (np.issubdtype(A.dtype, np.integer) or
+                                         np.issubdtype(A.dtype, np.floating)):
+            message = (f"The input `{name}` must be a square, "
+                       "two-dimensional array of real numbers.")
+            raise ValueError(message)
+        return A
+
+    def _validate_vector(self, A, name):
+        A = np.atleast_1d(A)
+        if A.ndim != 1 or not (np.issubdtype(A.dtype, np.integer) or
+                               np.issubdtype(A.dtype, np.floating)):
+            message = (f"The input `{name}` must be a one-dimensional array "
+                       "of real numbers.")
+            raise ValueError(message)
+        return A
+
+
+class CovViaPrecision(Covariance):
+
+    __class_getitem__ = None
+
+    def __init__(self, precision, covariance=None):
+        precision = self._validate_matrix(precision, 'precision')
+        if covariance is not None:
+            covariance = self._validate_matrix(covariance, 'covariance')
+            message = "`precision.shape` must equal `covariance.shape`."
+            if precision.shape != covariance.shape:
+                raise ValueError(message)
+
+        self._chol_P = np.linalg.cholesky(precision)
+        self._log_pdet = -2*np.log(np.diag(self._chol_P)).sum(axis=-1)
+        self._rank = precision.shape[-1]  # must be full rank if invertible
+        self._precision = precision
+        self._cov_matrix = covariance
+        self._shape = precision.shape
+        self._allow_singular = False
+
+    def _whiten(self, x):
+        return x @ self._chol_P
+
+    @cached_property
+    def _covariance(self):
+        n = self._shape[-1]
+        return (linalg.cho_solve((self._chol_P, True), np.eye(n))
+                if self._cov_matrix is None else self._cov_matrix)
+
+    def _colorize(self, x):
+        m = x.T.shape[0]
+        res = linalg.solve_triangular(self._chol_P.T, x.T.reshape(m, -1), lower=False)
+        return res.reshape(x.T.shape).T
+
+
+def _dot_diag(x, d):
+    # If d were a full diagonal matrix, x @ d would always do what we want.
+    # Special treatment is needed for n-dimensional `d` in which each row
+    # includes only the diagonal elements of a covariance matrix.
+    return x * d if x.ndim < 2 else x * np.expand_dims(d, -2)
+
+
+class CovViaDiagonal(Covariance):
+
+    def __init__(self, diagonal):
+        diagonal = self._validate_vector(diagonal, 'diagonal')
+
+        i_zero = diagonal <= 0
+        positive_diagonal = np.array(diagonal, dtype=np.float64)
+
+        positive_diagonal[i_zero] = 1  # ones don't affect determinant
+        self._log_pdet = np.sum(np.log(positive_diagonal), axis=-1)
+
+        psuedo_reciprocals = 1 / np.sqrt(positive_diagonal)
+        psuedo_reciprocals[i_zero] = 0
+
+        self._sqrt_diagonal = np.sqrt(diagonal)
+        self._LP = psuedo_reciprocals
+        self._rank = positive_diagonal.shape[-1] - i_zero.sum(axis=-1)
+        self._covariance = np.apply_along_axis(np.diag, -1, diagonal)
+        self._i_zero = i_zero
+        self._shape = self._covariance.shape
+        self._allow_singular = True
+
+    def _whiten(self, x):
+        return _dot_diag(x, self._LP)
+
+    def _colorize(self, x):
+        return _dot_diag(x, self._sqrt_diagonal)
+
+    def _support_mask(self, x):
+        """
+        Check whether x lies in the support of the distribution.
+        """
+        return ~np.any(_dot_diag(x, self._i_zero), axis=-1)
+
+
+class CovViaCholesky(Covariance):
+
+    __class_getitem__ = None
+
+    def __init__(self, cholesky):
+        L = self._validate_matrix(cholesky, 'cholesky')
+
+        self._factor = L
+        self._log_pdet = 2*np.log(np.diag(self._factor)).sum(axis=-1)
+        self._rank = L.shape[-1]  # must be full rank for cholesky
+        self._shape = L.shape
+        self._allow_singular = False
+
+    @cached_property
+    def _covariance(self):
+        return self._factor @ self._factor.T
+
+    def _whiten(self, x):
+        m = x.T.shape[0]
+        res = linalg.solve_triangular(self._factor, x.T.reshape(m, -1), lower=True)
+        return res.reshape(x.T.shape).T
+
+    def _colorize(self, x):
+        return x @ self._factor.T
+
+
+class CovViaEigendecomposition(Covariance):
+
+    __class_getitem__ = None
+
+    def __init__(self, eigendecomposition):
+        eigenvalues, eigenvectors = eigendecomposition
+        eigenvalues = self._validate_vector(eigenvalues, 'eigenvalues')
+        eigenvectors = self._validate_matrix(eigenvectors, 'eigenvectors')
+        message = ("The shapes of `eigenvalues` and `eigenvectors` "
+                   "must be compatible.")
+        try:
+            eigenvalues = np.expand_dims(eigenvalues, -2)
+            eigenvectors, eigenvalues = np.broadcast_arrays(eigenvectors,
+                                                            eigenvalues)
+            eigenvalues = eigenvalues[..., 0, :]
+        except ValueError:
+            raise ValueError(message)
+
+        i_zero = eigenvalues <= 0
+        positive_eigenvalues = np.array(eigenvalues, dtype=np.float64)
+
+        positive_eigenvalues[i_zero] = 1  # ones don't affect determinant
+        self._log_pdet = np.sum(np.log(positive_eigenvalues), axis=-1)
+
+        psuedo_reciprocals = 1 / np.sqrt(positive_eigenvalues)
+        psuedo_reciprocals[i_zero] = 0
+
+        self._LP = eigenvectors * psuedo_reciprocals
+        self._LA = eigenvectors * np.sqrt(eigenvalues)
+        self._rank = positive_eigenvalues.shape[-1] - i_zero.sum(axis=-1)
+        self._w = eigenvalues
+        self._v = eigenvectors
+        self._shape = eigenvectors.shape
+        self._null_basis = eigenvectors * i_zero
+        # This is only used for `_support_mask`, not to decide whether
+        # the covariance is singular or not.
+        self._eps = _multivariate._eigvalsh_to_eps(eigenvalues) * 10**3
+        self._allow_singular = True
+
+    def _whiten(self, x):
+        return x @ self._LP
+
+    def _colorize(self, x):
+        return x @ self._LA.T
+
+    @cached_property
+    def _covariance(self):
+        return (self._v * self._w) @ self._v.T
+
+    def _support_mask(self, x):
+        """
+        Check whether x lies in the support of the distribution.
+        """
+        residual = np.linalg.norm(x @ self._null_basis, axis=-1)
+        in_support = residual < self._eps
+        return in_support
+
+
+class CovViaPSD(Covariance):
+    """
+    Representation of a covariance provided via an instance of _PSD
+    """
+
+    __class_getitem__ = None
+
+    def __init__(self, psd):
+        self._LP = psd.U
+        self._log_pdet = psd.log_pdet
+        self._rank = psd.rank
+        self._covariance = psd._M
+        self._shape = psd._M.shape
+        self._psd = psd
+        self._allow_singular = False  # by default
+
+    def _whiten(self, x):
+        return x @ self._LP
+
+    def _support_mask(self, x):
+        return self._psd._support_mask(x)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_crosstab.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_crosstab.py
new file mode 100644
index 0000000000000000000000000000000000000000..e938eacad04467068985aacc7134248ab6ec44a6
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_crosstab.py
@@ -0,0 +1,204 @@
+import numpy as np
+from scipy.sparse import coo_matrix
+from scipy._lib._bunch import _make_tuple_bunch
+
+
+CrosstabResult = _make_tuple_bunch(
+    "CrosstabResult", ["elements", "count"]
+)
+
+
+def crosstab(*args, levels=None, sparse=False):
+    """
+    Return table of counts for each possible unique combination in ``*args``.
+
+    When ``len(args) > 1``, the array computed by this function is
+    often referred to as a *contingency table* [1]_.
+
+    The arguments must be sequences with the same length.  The second return
+    value, `count`, is an integer array with ``len(args)`` dimensions.  If
+    `levels` is None, the shape of `count` is ``(n0, n1, ...)``, where ``nk``
+    is the number of unique elements in ``args[k]``.
+
+    Parameters
+    ----------
+    *args : sequences
+        A sequence of sequences whose unique aligned elements are to be
+        counted.  The sequences in args must all be the same length.
+    levels : sequence, optional
+        If `levels` is given, it must be a sequence that is the same length as
+        `args`.  Each element in `levels` is either a sequence or None.  If it
+        is a sequence, it gives the values in the corresponding sequence in
+        `args` that are to be counted.  If any value in the sequences in `args`
+        does not occur in the corresponding sequence in `levels`, that value
+        is ignored and not counted in the returned array `count`.  The default
+        value of `levels` for ``args[i]`` is ``np.unique(args[i])``.
+    sparse : bool, optional
+        If True, return a sparse matrix.  The matrix will be an instance of
+        the `scipy.sparse.coo_matrix` class.  Because SciPy's sparse matrices
+        must be 2-d, only two input sequences are allowed when `sparse` is
+        True.  Default is False.
+
+    Returns
+    -------
+    res : CrosstabResult
+        An object containing the following attributes:
+
+        elements : tuple of numpy.ndarrays.
+            Tuple of length ``len(args)`` containing the arrays of elements
+            that are counted in `count`.  These can be interpreted as the
+            labels of the corresponding dimensions of `count`. If `levels` was
+            given, then if ``levels[i]`` is not None, ``elements[i]`` will
+            hold the values given in ``levels[i]``.
+        count : numpy.ndarray or scipy.sparse.coo_matrix
+            Counts of the unique elements in ``zip(*args)``, stored in an
+            array. Also known as a *contingency table* when ``len(args) > 1``.
+
+    See Also
+    --------
+    numpy.unique
+
+    Notes
+    -----
+    .. versionadded:: 1.7.0
+
+    References
+    ----------
+    .. [1] "Contingency table", http://en.wikipedia.org/wiki/Contingency_table
+
+    Examples
+    --------
+    >>> from scipy.stats.contingency import crosstab
+
+    Given the lists `a` and `x`, create a contingency table that counts the
+    frequencies of the corresponding pairs.
+
+    >>> a = ['A', 'B', 'A', 'A', 'B', 'B', 'A', 'A', 'B', 'B']
+    >>> x = ['X', 'X', 'X', 'Y', 'Z', 'Z', 'Y', 'Y', 'Z', 'Z']
+    >>> res = crosstab(a, x)
+    >>> avals, xvals = res.elements
+    >>> avals
+    array(['A', 'B'], dtype='<U1')
+    >>> xvals
+    array(['X', 'Y', 'Z'], dtype='<U1')
+    >>> res.count
+    array([[2, 3, 0],
+           [1, 0, 4]])
+
+    So ``('A', 'X')`` occurs twice, ``('A', 'Y')`` occurs three times, etc.
+
+    Higher dimensional contingency tables can be created.
+
+    >>> p = [0, 0, 0, 0, 1, 1, 1, 0, 0, 1]
+    >>> res = crosstab(a, x, p)
+    >>> res.count
+    array([[[2, 0],
+            [2, 1],
+            [0, 0]],
+           [[1, 0],
+            [0, 0],
+            [1, 3]]])
+    >>> res.count.shape
+    (2, 3, 2)
+
+    The values to be counted can be set by using the `levels` argument.
+    It allows the elements of interest in each input sequence to be
+    given explicitly instead of finding the unique elements of the sequence.
+
+    For example, suppose one of the arguments is an array containing the
+    answers to a survey question, with integer values 1 to 4.  Even if the
+    value 1 does not occur in the data, we want an entry for it in the table.
+
+    >>> q1 = [2, 3, 3, 2, 4, 4, 2, 3, 4, 4, 4, 3, 3, 3, 4]  # 1 does not occur.
+    >>> q2 = [4, 4, 2, 2, 2, 4, 1, 1, 2, 2, 4, 2, 2, 2, 4]  # 3 does not occur.
+    >>> options = [1, 2, 3, 4]
+    >>> res = crosstab(q1, q2, levels=(options, options))
+    >>> res.count
+    array([[0, 0, 0, 0],
+           [1, 1, 0, 1],
+           [1, 4, 0, 1],
+           [0, 3, 0, 3]])
+
+    If `levels` is given, but an element of `levels` is None, the unique values
+    of the corresponding argument are used. For example,
+
+    >>> res = crosstab(q1, q2, levels=(None, options))
+    >>> res.elements
+    [array([2, 3, 4]), [1, 2, 3, 4]]
+    >>> res.count
+    array([[1, 1, 0, 1],
+           [1, 4, 0, 1],
+           [0, 3, 0, 3]])
+
+    If we want to ignore the pairs where 4 occurs in ``q2``, we can
+    give just the values [1, 2] to `levels`, and the 4 will be ignored:
+
+    >>> res = crosstab(q1, q2, levels=(None, [1, 2]))
+    >>> res.elements
+    [array([2, 3, 4]), [1, 2]]
+    >>> res.count
+    array([[1, 1],
+           [1, 4],
+           [0, 3]])
+
+    Finally, let's repeat the first example, but return a sparse matrix:
+
+    >>> res = crosstab(a, x, sparse=True)
+    >>> res.count
+    <COOrdinate sparse matrix of dtype 'int64'
+        with 4 stored elements and shape (2, 3)>
+    >>> res.count.toarray()
+    array([[2, 3, 0],
+           [1, 0, 4]])
+
+    """
+    nargs = len(args)
+    if nargs == 0:
+        raise TypeError("At least one input sequence is required.")
+
+    len0 = len(args[0])
+    if not all(len(a) == len0 for a in args[1:]):
+        raise ValueError("All input sequences must have the same length.")
+
+    if sparse and nargs != 2:
+        raise ValueError("When `sparse` is True, only two input sequences "
+                         "are allowed.")
+
+    if levels is None:
+        # Call np.unique with return_inverse=True on each argument.
+        actual_levels, indices = zip(*[np.unique(a, return_inverse=True)
+                                       for a in args])
+    else:
+        # `levels` is not None...
+        if len(levels) != nargs:
+            raise ValueError('len(levels) must equal the number of input '
+                             'sequences')
+
+        args = [np.asarray(arg) for arg in args]
+        mask = np.zeros((nargs, len0), dtype=np.bool_)
+        inv = np.zeros((nargs, len0), dtype=np.intp)
+        actual_levels = []
+        for k, (levels_list, arg) in enumerate(zip(levels, args)):
+            if levels_list is None:
+                levels_list, inv[k, :] = np.unique(arg, return_inverse=True)
+                mask[k, :] = True
+            else:
+                q = arg == np.asarray(levels_list).reshape(-1, 1)
+                mask[k, :] = np.any(q, axis=0)
+                qnz = q.T.nonzero()
+                inv[k, qnz[0]] = qnz[1]
+            actual_levels.append(levels_list)
+
+        mask_all = mask.all(axis=0)
+        indices = tuple(inv[:, mask_all])
+
+    if sparse:
+        count = coo_matrix((np.ones(len(indices[0]), dtype=int),
+                            (indices[0], indices[1])))
+        count.sum_duplicates()
+    else:
+        shape = [len(u) for u in actual_levels]
+        count = np.zeros(shape, dtype=int)
+        np.add.at(count, indices, 1)
+
+    return CrosstabResult(actual_levels, count)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_discrete_distns.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_discrete_distns.py
new file mode 100644
index 0000000000000000000000000000000000000000..03e441ade12975b8a3f6a0c67627b61cf0dc12c2
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_discrete_distns.py
@@ -0,0 +1,2096 @@
+#
+# Author:  Travis Oliphant  2002-2011 with contributions from
+#          SciPy Developers 2004-2011
+#
+from functools import partial
+
+from scipy import special
+from scipy.special import entr, logsumexp, betaln, gammaln as gamln
+import scipy.special._ufuncs as scu
+from scipy._lib._util import rng_integers
+import scipy._lib.array_api_extra as xpx
+from scipy.interpolate import interp1d
+
+from numpy import floor, ceil, log, exp, sqrt, log1p, expm1, tanh, cosh, sinh
+
+import numpy as np
+
+from ._distn_infrastructure import (rv_discrete, get_distribution_names,
+                                    _vectorize_rvs_over_shapes,
+                                    _ShapeInfo, _isintegral,
+                                    rv_discrete_frozen)
+from ._biasedurn import (_PyFishersNCHypergeometric,
+                         _PyWalleniusNCHypergeometric,
+                         _PyStochasticLib3)
+from ._stats_pythran import _poisson_binom
+
+
+class binom_gen(rv_discrete):
+    r"""A binomial discrete random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability mass function for `binom` is:
+
+    .. math::
+
+       f(k) = \binom{n}{k} p^k (1-p)^{n-k}
+
+    for :math:`k \in \{0, 1, \dots, n\}`, :math:`0 \leq p \leq 1`
+
+    `binom` takes :math:`n` and :math:`p` as shape parameters,
+    where :math:`p` is the probability of a single success
+    and :math:`1-p` is the probability of a single failure.
+
+    This distribution uses routines from the Boost Math C++ library for
+    the computation of the ``pmf``, ``cdf``, ``sf``, ``ppf`` and ``isf``
+    methods. [1]_
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    %(example)s
+
+    See Also
+    --------
+    hypergeom, nbinom, nhypergeom
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("n", True, (0, np.inf), (True, False)),
+                _ShapeInfo("p", False, (0, 1), (True, True))]
+
+    def _rvs(self, n, p, size=None, random_state=None):
+        return random_state.binomial(n, p, size)
+
+    def _argcheck(self, n, p):
+        return (n >= 0) & _isintegral(n) & (p >= 0) & (p <= 1)
+
+    def _get_support(self, n, p):
+        return self.a, n
+
+    def _logpmf(self, x, n, p):
+        k = floor(x)
+        combiln = (gamln(n+1) - (gamln(k+1) + gamln(n-k+1)))
+        return combiln + special.xlogy(k, p) + special.xlog1py(n-k, -p)
+
+    def _pmf(self, x, n, p):
+        # binom.pmf(k) = choose(n, k) * p**k * (1-p)**(n-k)
+        return scu._binom_pmf(x, n, p)
+
+    def _cdf(self, x, n, p):
+        k = floor(x)
+        return scu._binom_cdf(k, n, p)
+
+    def _sf(self, x, n, p):
+        k = floor(x)
+        return scu._binom_sf(k, n, p)
+
+    def _isf(self, x, n, p):
+        return scu._binom_isf(x, n, p)
+
+    def _ppf(self, q, n, p):
+        return scu._binom_ppf(q, n, p)
+
+    def _stats(self, n, p, moments='mv'):
+        mu = n * p
+        var = mu - n * np.square(p)
+        g1, g2 = None, None
+        if 's' in moments:
+            pq = p - np.square(p)
+            npq_sqrt = np.sqrt(n * pq)
+            t1 = np.reciprocal(npq_sqrt)
+            t2 = (2.0 * p) / npq_sqrt
+            g1 = t1 - t2
+        if 'k' in moments:
+            pq = p - np.square(p)
+            npq = n * pq
+            t1 = np.reciprocal(npq)
+            t2 = 6.0/n
+            g2 = t1 - t2
+        return mu, var, g1, g2
+
+    def _entropy(self, n, p):
+        k = np.r_[0:n + 1]
+        vals = self._pmf(k, n, p)
+        return np.sum(entr(vals), axis=0)
+
+
+binom = binom_gen(name='binom')
+
+
+class bernoulli_gen(binom_gen):
+    r"""A Bernoulli discrete random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability mass function for `bernoulli` is:
+
+    .. math::
+
+       f(k) = \begin{cases}1-p  &\text{if } k = 0\\
+                           p    &\text{if } k = 1\end{cases}
+
+    for :math:`k` in :math:`\{0, 1\}`, :math:`0 \leq p \leq 1`
+
+    `bernoulli` takes :math:`p` as shape parameter,
+    where :math:`p` is the probability of a single success
+    and :math:`1-p` is the probability of a single failure.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("p", False, (0, 1), (True, True))]
+
+    def _rvs(self, p, size=None, random_state=None):
+        return binom_gen._rvs(self, 1, p, size=size, random_state=random_state)
+
+    def _argcheck(self, p):
+        return (p >= 0) & (p <= 1)
+
+    def _get_support(self, p):
+        # Overrides binom_gen._get_support!
+        return self.a, self.b
+
+    def _logpmf(self, x, p):
+        return binom._logpmf(x, 1, p)
+
+    def _pmf(self, x, p):
+        # bernoulli.pmf(k) = 1-p  if k = 0
+        #                  = p    if k = 1
+        return binom._pmf(x, 1, p)
+
+    def _cdf(self, x, p):
+        return binom._cdf(x, 1, p)
+
+    def _sf(self, x, p):
+        return binom._sf(x, 1, p)
+
+    def _isf(self, x, p):
+        return binom._isf(x, 1, p)
+
+    def _ppf(self, q, p):
+        return binom._ppf(q, 1, p)
+
+    def _stats(self, p):
+        return binom._stats(1, p)
+
+    def _entropy(self, p):
+        return entr(p) + entr(1-p)
+
+
+bernoulli = bernoulli_gen(b=1, name='bernoulli')
+
+
+class betabinom_gen(rv_discrete):
+    r"""A beta-binomial discrete random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The beta-binomial distribution is a binomial distribution with a
+    probability of success `p` that follows a beta distribution.
+
+    The probability mass function for `betabinom` is:
+
+    .. math::
+
+       f(k) = \binom{n}{k} \frac{B(k + a, n - k + b)}{B(a, b)}
+
+    for :math:`k \in \{0, 1, \dots, n\}`, :math:`n \geq 0`, :math:`a > 0`,
+    :math:`b > 0`, where :math:`B(a, b)` is the beta function.
+
+    `betabinom` takes :math:`n`, :math:`a`, and :math:`b` as shape parameters.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] https://en.wikipedia.org/wiki/Beta-binomial_distribution
+
+    .. versionadded:: 1.4.0
+
+    See Also
+    --------
+    beta, binom
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("n", True, (0, np.inf), (True, False)),
+                _ShapeInfo("a", False, (0, np.inf), (False, False)),
+                _ShapeInfo("b", False, (0, np.inf), (False, False))]
+
+    def _rvs(self, n, a, b, size=None, random_state=None):
+        p = random_state.beta(a, b, size)
+        return random_state.binomial(n, p, size)
+
+    def _get_support(self, n, a, b):
+        return 0, n
+
+    def _argcheck(self, n, a, b):
+        return (n >= 0) & _isintegral(n) & (a > 0) & (b > 0)
+
+    def _logpmf(self, x, n, a, b):
+        k = floor(x)
+        combiln = -log(n + 1) - betaln(n - k + 1, k + 1)
+        return combiln + betaln(k + a, n - k + b) - betaln(a, b)
+
+    def _pmf(self, x, n, a, b):
+        return exp(self._logpmf(x, n, a, b))
+
+    def _stats(self, n, a, b, moments='mv'):
+        e_p = a / (a + b)
+        e_q = 1 - e_p
+        mu = n * e_p
+        var = n * (a + b + n) * e_p * e_q / (a + b + 1)
+        g1, g2 = None, None
+        if 's' in moments:
+            g1 = 1.0 / sqrt(var)
+            g1 *= (a + b + 2 * n) * (b - a)
+            g1 /= (a + b + 2) * (a + b)
+        if 'k' in moments:
+            g2 = (a + b).astype(e_p.dtype)
+            g2 *= (a + b - 1 + 6 * n)
+            g2 += 3 * a * b * (n - 2)
+            g2 += 6 * n ** 2
+            g2 -= 3 * e_p * b * n * (6 - n)
+            g2 -= 18 * e_p * e_q * n ** 2
+            g2 *= (a + b) ** 2 * (1 + a + b)
+            g2 /= (n * a * b * (a + b + 2) * (a + b + 3) * (a + b + n))
+            g2 -= 3
+        return mu, var, g1, g2
+
+
+betabinom = betabinom_gen(name='betabinom')
+
+
+class nbinom_gen(rv_discrete):
+    r"""A negative binomial discrete random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The negative binomial distribution describes a sequence of i.i.d. Bernoulli
+    trials, repeated until a predefined, non-random number of successes occurs.
+
+    The probability mass function of the number of failures for `nbinom` is:
+
+    .. math::
+
+       f(k) = \binom{k+n-1}{n-1} p^n (1-p)^k
+
+    for :math:`k \ge 0`, :math:`0 < p \leq 1`
+
+    `nbinom` takes :math:`n` and :math:`p` as shape parameters where :math:`n`
+    is the number of successes, :math:`p` is the probability of a single
+    success, and :math:`1-p` is the probability of a single failure.
+
+    Another common parameterization of the negative binomial distribution is
+    in terms of the mean number of failures :math:`\mu` to achieve :math:`n`
+    successes. The mean :math:`\mu` is related to the probability of success
+    as
+
+    .. math::
+
+       p = \frac{n}{n + \mu}
+
+    The number of successes :math:`n` may also be specified in terms of a
+    "dispersion", "heterogeneity", or "aggregation" parameter :math:`\alpha`,
+    which relates the mean :math:`\mu` to the variance :math:`\sigma^2`,
+    e.g. :math:`\sigma^2 = \mu + \alpha \mu^2`. Regardless of the convention
+    used for :math:`\alpha`,
+
+    .. math::
+
+       p &= \frac{\mu}{\sigma^2} \\
+       n &= \frac{\mu^2}{\sigma^2 - \mu}
+
+    This distribution uses routines from the Boost Math C++ library for
+    the computation of the ``pmf``, ``cdf``, ``sf``, ``ppf``, ``isf``
+    and ``stats`` methods. [1]_
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    %(example)s
+
+    See Also
+    --------
+    hypergeom, binom, nhypergeom
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("n", True, (0, np.inf), (True, False)),
+                _ShapeInfo("p", False, (0, 1), (True, True))]
+
+    def _rvs(self, n, p, size=None, random_state=None):
+        return random_state.negative_binomial(n, p, size)
+
+    def _argcheck(self, n, p):
+        return (n > 0) & (p > 0) & (p <= 1)
+
+    def _pmf(self, x, n, p):
+        # nbinom.pmf(k) = choose(k+n-1, n-1) * p**n * (1-p)**k
+        return scu._nbinom_pmf(x, n, p)
+
+    def _logpmf(self, x, n, p):
+        coeff = gamln(n+x) - gamln(x+1) - gamln(n)
+        return coeff + n*log(p) + special.xlog1py(x, -p)
+
+    def _cdf(self, x, n, p):
+        k = floor(x)
+        return scu._nbinom_cdf(k, n, p)
+
+    def _logcdf(self, x, n, p):
+        k = floor(x)
+        k, n, p = np.broadcast_arrays(k, n, p)
+        cdf = self._cdf(k, n, p)
+        cond = cdf > 0.5
+        def f1(k, n, p):
+            return np.log1p(-special.betainc(k + 1, n, 1 - p))
+
+        # do calc in place
+        logcdf = cdf
+        with np.errstate(divide='ignore'):
+            logcdf[cond] = f1(k[cond], n[cond], p[cond])
+            logcdf[~cond] = np.log(cdf[~cond])
+        return logcdf
+
+    def _sf(self, x, n, p):
+        k = floor(x)
+        return scu._nbinom_sf(k, n, p)
+
+    def _isf(self, x, n, p):
+        with np.errstate(over='ignore'):  # see gh-17432
+            return scu._nbinom_isf(x, n, p)
+
+    def _ppf(self, q, n, p):
+        with np.errstate(over='ignore'):  # see gh-17432
+            return scu._nbinom_ppf(q, n, p)
+
+    def _stats(self, n, p):
+        return (
+            scu._nbinom_mean(n, p),
+            scu._nbinom_variance(n, p),
+            scu._nbinom_skewness(n, p),
+            scu._nbinom_kurtosis_excess(n, p),
+        )
+
+
+# Module-level instance of `nbinom_gen`, constructed with name 'nbinom'.
+nbinom = nbinom_gen(name='nbinom')
+
+
+class betanbinom_gen(rv_discrete):
+    r"""A beta-negative-binomial discrete random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The beta-negative-binomial distribution is a negative binomial
+    distribution with a probability of success `p` that follows a
+    beta distribution.
+
+    The probability mass function for `betanbinom` is:
+
+    .. math::
+
+       f(k) = \binom{n + k - 1}{k} \frac{B(a + n, b + k)}{B(a, b)}
+
+    for :math:`k \ge 0`, :math:`n \geq 0`, :math:`a > 0`,
+    :math:`b > 0`, where :math:`B(a, b)` is the beta function.
+
+    `betanbinom` takes :math:`n`, :math:`a`, and :math:`b` as shape parameters.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] https://en.wikipedia.org/wiki/Beta_negative_binomial_distribution
+
+    .. versionadded:: 1.12.0
+
+    See Also
+    --------
+    betabinom : Beta binomial distribution
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("n", True, (0, np.inf), (True, False)),
+                _ShapeInfo("a", False, (0, np.inf), (False, False)),
+                _ShapeInfo("b", False, (0, np.inf), (False, False))]
+
+    def _rvs(self, n, a, b, size=None, random_state=None):
+        p = random_state.beta(a, b, size)
+        return random_state.negative_binomial(n, p, size)
+
+    def _argcheck(self, n, a, b):
+        return (n >= 0) & _isintegral(n) & (a > 0) & (b > 0)
+
+    def _logpmf(self, x, n, a, b):
+        k = floor(x)
+        combiln = -np.log(n + k) - betaln(n, k + 1)
+        return combiln + betaln(a + n, b + k) - betaln(a, b)
+
+    def _pmf(self, x, n, a, b):
+        return exp(self._logpmf(x, n, a, b))
+
+    def _stats(self, n, a, b, moments='mv'):
+        # reference: Wolfram Alpha input
+        # BetaNegativeBinomialDistribution[a, b, n]
+        def mean(n, a, b):
+            return n * b / (a - 1.)
+        mu = xpx.apply_where(a > 1, (n, a, b), mean, fill_value=np.inf)
+        def var(n, a, b):
+            return (n * b * (n + a - 1.) * (a + b - 1.)
+                    / ((a - 2.) * (a - 1.)**2.))
+        var = xpx.apply_where(a > 2, (n, a, b), var, fill_value=np.inf)
+        g1, g2 = None, None
+        def skew(n, a, b):
+            return ((2 * n + a - 1.) * (2 * b + a - 1.)
+                    / (a - 3.) / sqrt(n * b * (n + a - 1.) * (b + a - 1.)
+                    / (a - 2.)))
+        if 's' in moments:
+            g1 = xpx.apply_where(a > 3, (n, a, b), skew, fill_value=np.inf)
+        def kurtosis(n, a, b):
+            term = (a - 2.)
+            term_2 = ((a - 1.)**2. * (a**2. + a * (6 * b - 1.)
+                      + 6. * (b - 1.) * b)
+                      + 3. * n**2. * ((a + 5.) * b**2. + (a + 5.)
+                      * (a - 1.) * b + 2. * (a - 1.)**2)
+                      + 3 * (a - 1.) * n
+                      * ((a + 5.) * b**2. + (a + 5.) * (a - 1.) * b
+                      + 2. * (a - 1.)**2.))
+            denominator = ((a - 4.) * (a - 3.) * b * n
+                           * (a + b - 1.) * (a + n - 1.))
+            # Wolfram Alpha uses Pearson kurtosis, so we subtract 3 to get
+            # scipy's Fisher kurtosis
+            return term * term_2 / denominator - 3.
+        if 'k' in moments:
+            g2 = xpx.apply_where(a > 4, (n, a, b), kurtosis, fill_value=np.inf)
+        return mu, var, g1, g2
+
+
+# Module-level instance of `betanbinom_gen`, constructed with name 'betanbinom'.
+betanbinom = betanbinom_gen(name='betanbinom')
+
+
+class geom_gen(rv_discrete):
+    r"""A geometric discrete random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability mass function for `geom` is:
+
+    .. math::
+
+        f(k) = (1-p)^{k-1} p
+
+    for :math:`k \ge 1`, :math:`0 < p \leq 1`
+
+    `geom` takes :math:`p` as shape parameter,
+    where :math:`p` is the probability of a single success
+    and :math:`1-p` is the probability of a single failure.
+
+    Note that when drawing random samples, the probability of observations that exceed
+    ``np.iinfo(np.int64).max`` increases rapidly as $p$ decreases below $10^{-17}$. For
+    $p < 10^{-20}$, almost all observations would exceed the maximum ``int64``; however,
+    the output dtype is always ``int64``, so these values are clipped to the maximum.
+
+    %(after_notes)s
+
+    See Also
+    --------
+    planck
+
+    %(example)s
+
+    """
+
+    def _shape_info(self):
+        return [_ShapeInfo("p", False, (0, 1), (True, True))]
+
+    def _rvs(self, p, size=None, random_state=None):
+        res = random_state.geometric(p, size=size)
+        # RandomState.geometric can wrap around to negative values; make behavior
+        # consistent with Generator.geometric by replacing with maximum integer.
+        max_int = np.iinfo(res.dtype).max
+        return np.where(res < 0, max_int, res)
+
+    def _argcheck(self, p):
+        return (p <= 1) & (p > 0)
+
+    def _pmf(self, k, p):
+        return np.power(1-p, k-1) * p
+
+    def _logpmf(self, k, p):
+        return special.xlog1py(k - 1, -p) + log(p)
+
+    def _cdf(self, x, p):
+        k = floor(x)
+        return -expm1(log1p(-p)*k)
+
+    def _sf(self, x, p):
+        return np.exp(self._logsf(x, p))
+
+    def _logsf(self, x, p):
+        k = floor(x)
+        return k*log1p(-p)
+
+    def _ppf(self, q, p):
+        vals = ceil(log1p(-q) / log1p(-p))
+        temp = self._cdf(vals-1, p)
+        return np.where((temp >= q) & (vals > 0), vals-1, vals)
+
+    def _stats(self, p):
+        mu = 1.0/p
+        qr = 1.0-p
+        var = qr / p / p
+        g1 = (2.0-p) / sqrt(qr)
+        g2 = np.polyval([1, -6, 6], p)/(1.0-p)
+        return mu, var, g1, g2
+
+    def _entropy(self, p):
+        return -np.log(p) - np.log1p(-p) * (1.0-p) / p
+
+
+# Module-level instance of `geom_gen`.  ``a=1`` is passed to the constructor;
+# presumably this is the lower support bound (the pmf is documented for k >= 1).
+geom = geom_gen(a=1, name='geom', longname="A geometric")
+
+
+class hypergeom_gen(rv_discrete):
+    r"""A hypergeometric discrete random variable.
+
+    The hypergeometric distribution models drawing objects from a bin.
+    `M` is the total number of objects, `n` is total number of Type I objects.
+    The random variate represents the number of Type I objects in `N` drawn
+    without replacement from the total population.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The symbols used to denote the shape parameters (`M`, `n`, and `N`) are not
+    universally accepted.  See the Examples for a clarification of the
+    definitions used here.
+
+    The probability mass function is defined as,
+
+    .. math:: p(k, M, n, N) = \frac{\binom{n}{k} \binom{M - n}{N - k}}
+                                   {\binom{M}{N}}
+
+    for :math:`k \in [\max(0, N - M + n), \min(n, N)]`, where the binomial
+    coefficients are defined as,
+
+    .. math:: \binom{n}{k} \equiv \frac{n!}{k! (n - k)!}.
+
+    This distribution uses routines from the Boost Math C++ library for
+    the computation of the ``pmf``, ``cdf``, ``sf`` and ``stats`` methods. [1]_
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] The Boost Developers. "Boost C++ Libraries". https://www.boost.org/.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import hypergeom
+    >>> import matplotlib.pyplot as plt
+
+    Suppose we have a collection of 20 animals, of which 7 are dogs.  Then if
+    we want to know the probability of finding a given number of dogs if we
+    choose at random 12 of the 20 animals, we can initialize a frozen
+    distribution and plot the probability mass function:
+
+    >>> [M, n, N] = [20, 7, 12]
+    >>> rv = hypergeom(M, n, N)
+    >>> x = np.arange(0, n+1)
+    >>> pmf_dogs = rv.pmf(x)
+
+    >>> fig = plt.figure()
+    >>> ax = fig.add_subplot(111)
+    >>> ax.plot(x, pmf_dogs, 'bo')
+    >>> ax.vlines(x, 0, pmf_dogs, lw=2)
+    >>> ax.set_xlabel('# of dogs in our group of chosen animals')
+    >>> ax.set_ylabel('hypergeom PMF')
+    >>> plt.show()
+
+    Instead of using a frozen distribution we can also use `hypergeom`
+    methods directly.  To for example obtain the cumulative distribution
+    function, use:
+
+    >>> prb = hypergeom.cdf(x, M, n, N)
+
+    And to generate random numbers:
+
+    >>> R = hypergeom.rvs(M, n, N, size=10)
+
+    See Also
+    --------
+    nhypergeom, binom, nbinom
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("M", True, (0, np.inf), (True, False)),
+                _ShapeInfo("n", True, (0, np.inf), (True, False)),
+                _ShapeInfo("N", True, (0, np.inf), (True, False))]
+
+    def _rvs(self, M, n, N, size=None, random_state=None):
+        return random_state.hypergeometric(n, M-n, N, size=size)
+
+    def _get_support(self, M, n, N):
+        return np.maximum(N-(M-n), 0), np.minimum(n, N)
+
+    def _argcheck(self, M, n, N):
+        cond = (M > 0) & (n >= 0) & (N >= 0)
+        cond &= (n <= M) & (N <= M)
+        cond &= _isintegral(M) & _isintegral(n) & _isintegral(N)
+        return cond
+
+    def _logpmf(self, k, M, n, N):
+        tot, good = M, n
+        bad = tot - good
+        result = (betaln(good+1, 1) + betaln(bad+1, 1) + betaln(tot-N+1, N+1) -
+                  betaln(k+1, good-k+1) - betaln(N-k+1, bad-N+k+1) -
+                  betaln(tot+1, 1))
+        return result
+
+    def _pmf(self, k, M, n, N):
+        return scu._hypergeom_pmf(k, n, N, M)
+
+    def _cdf(self, k, M, n, N):
+        return scu._hypergeom_cdf(k, n, N, M)
+
+    def _stats(self, M, n, N):
+        M, n, N = 1. * M, 1. * n, 1. * N
+        m = M - n
+
+        # Boost kurtosis_excess doesn't return the same as the value
+        # computed here.
+        g2 = M * (M + 1) - 6. * N * (M - N) - 6. * n * m
+        g2 *= (M - 1) * M * M
+        g2 += 6. * n * N * (M - N) * m * (5. * M - 6)
+        g2 /= n * N * (M - N) * m * (M - 2.) * (M - 3.)
+        return (
+            scu._hypergeom_mean(n, N, M),
+            scu._hypergeom_variance(n, N, M),
+            scu._hypergeom_skewness(n, N, M),
+            g2,
+        )
+
+    def _entropy(self, M, n, N):
+        k = np.r_[N - (M - n):min(n, N) + 1]
+        vals = self.pmf(k, M, n, N)
+        return np.sum(entr(vals), axis=0)
+
+    def _sf(self, k, M, n, N):
+        return scu._hypergeom_sf(k, n, N, M)
+
+    def _logsf(self, k, M, n, N):
+        res = []
+        for quant, tot, good, draw in zip(*np.broadcast_arrays(k, M, n, N)):
+            if (quant + 0.5) * (tot + 0.5) < (good - 0.5) * (draw - 0.5):
+                # Less terms to sum if we calculate log(1-cdf)
+                res.append(log1p(-exp(self.logcdf(quant, tot, good, draw))))
+            else:
+                # Integration over probability mass function using logsumexp
+                k2 = np.arange(quant + 1, draw + 1)
+                res.append(logsumexp(self._logpmf(k2, tot, good, draw)))
+        return np.asarray(res)
+
+    def _logcdf(self, k, M, n, N):
+        res = []
+        for quant, tot, good, draw in zip(*np.broadcast_arrays(k, M, n, N)):
+            if (quant + 0.5) * (tot + 0.5) > (good - 0.5) * (draw - 0.5):
+                # Less terms to sum if we calculate log(1-sf)
+                res.append(log1p(-exp(self.logsf(quant, tot, good, draw))))
+            else:
+                # Integration over probability mass function using logsumexp
+                k2 = np.arange(0, quant + 1)
+                res.append(logsumexp(self._logpmf(k2, tot, good, draw)))
+        return np.asarray(res)
+
+
+# Module-level instance of `hypergeom_gen`, constructed with name 'hypergeom'.
+hypergeom = hypergeom_gen(name='hypergeom')
+
+
+class nhypergeom_gen(rv_discrete):
+    r"""A negative hypergeometric discrete random variable.
+
+    Consider a box containing :math:`M` balls:, :math:`n` red and
+    :math:`M-n` blue. We randomly sample balls from the box, one
+    at a time and *without* replacement, until we have picked :math:`r`
+    blue balls. `nhypergeom` is the distribution of the number of
+    red balls :math:`k` we have picked.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The symbols used to denote the shape parameters (`M`, `n`, and `r`) are not
+    universally accepted. See the Examples for a clarification of the
+    definitions used here.
+
+    The probability mass function is defined as,
+
+    .. math:: f(k; M, n, r) = \frac{{{k+r-1}\choose{k}}{{M-r-k}\choose{n-k}}}
+                                   {{M \choose n}}
+
+    for :math:`k \in [0, n]`, :math:`n \in [0, M]`, :math:`r \in [0, M-n]`,
+    and the binomial coefficient is:
+
+    .. math:: \binom{n}{k} \equiv \frac{n!}{k! (n - k)!}.
+
+    It is equivalent to observing :math:`k` successes in :math:`k+r-1`
+    samples with :math:`k+r`'th sample being a failure. The former
+    can be modelled as a hypergeometric distribution. The probability
+    of the latter is simply the number of failures remaining
+    :math:`M-n-(r-1)` divided by the size of the remaining population
+    :math:`M-(k+r-1)`. This relationship can be shown as:
+
+    .. math:: NHG(k;M,n,r) = HG(k;M,n,k+r-1)\frac{(M-n-(r-1))}{(M-(k+r-1))}
+
+    where :math:`NHG` is probability mass function (PMF) of the
+    negative hypergeometric distribution and :math:`HG` is the
+    PMF of the hypergeometric distribution.
+
+    %(after_notes)s
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import nhypergeom
+    >>> import matplotlib.pyplot as plt
+
+    Suppose we have a collection of 20 animals, of which 7 are dogs.
+    Then if we want to know the probability of finding a given number
+    of dogs (successes) in a sample with exactly 12 animals that
+    aren't dogs (failures), we can initialize a frozen distribution
+    and plot the probability mass function:
+
+    >>> M, n, r = [20, 7, 12]
+    >>> rv = nhypergeom(M, n, r)
+    >>> x = np.arange(0, n+2)
+    >>> pmf_dogs = rv.pmf(x)
+
+    >>> fig = plt.figure()
+    >>> ax = fig.add_subplot(111)
+    >>> ax.plot(x, pmf_dogs, 'bo')
+    >>> ax.vlines(x, 0, pmf_dogs, lw=2)
+    >>> ax.set_xlabel('# of dogs in our group with given 12 failures')
+    >>> ax.set_ylabel('nhypergeom PMF')
+    >>> plt.show()
+
+    Instead of using a frozen distribution we can also use `nhypergeom`
+    methods directly.  To for example obtain the probability mass
+    function, use:
+
+    >>> prb = nhypergeom.pmf(x, M, n, r)
+
+    And to generate random numbers:
+
+    >>> R = nhypergeom.rvs(M, n, r, size=10)
+
+    To verify the relationship between `hypergeom` and `nhypergeom`, use:
+
+    >>> from scipy.stats import hypergeom, nhypergeom
+    >>> M, n, r = 45, 13, 8
+    >>> k = 6
+    >>> nhypergeom.pmf(k, M, n, r)
+    0.06180776620271643
+    >>> hypergeom.pmf(k, M, n, k+r-1) * (M - n - (r-1)) / (M - (k+r-1))
+    0.06180776620271644
+
+    See Also
+    --------
+    hypergeom, binom, nbinom
+
+    References
+    ----------
+    .. [1] Negative Hypergeometric Distribution on Wikipedia
+           https://en.wikipedia.org/wiki/Negative_hypergeometric_distribution
+
+    .. [2] Negative Hypergeometric Distribution from
+           http://www.math.wm.edu/~leemis/chart/UDR/PDFs/Negativehypergeometric.pdf
+
+    """
+
+    def _shape_info(self):
+        return [_ShapeInfo("M", True, (0, np.inf), (True, False)),
+                _ShapeInfo("n", True, (0, np.inf), (True, False)),
+                _ShapeInfo("r", True, (0, np.inf), (True, False))]
+
+    def _get_support(self, M, n, r):
+        return 0, n
+
+    def _argcheck(self, M, n, r):
+        cond = (n >= 0) & (n <= M) & (r >= 0) & (r <= M-n)
+        cond &= _isintegral(M) & _isintegral(n) & _isintegral(r)
+        return cond
+
+    def _rvs(self, M, n, r, size=None, random_state=None):
+
+        @_vectorize_rvs_over_shapes
+        def _rvs1(M, n, r, size, random_state):
+            # invert cdf by calculating all values in support, scalar M, n, r
+            a, b = self.support(M, n, r)
+            ks = np.arange(a, b+1)
+            cdf = self.cdf(ks, M, n, r)
+            ppf = interp1d(cdf, ks, kind='next', fill_value='extrapolate')
+            rvs = ppf(random_state.uniform(size=size)).astype(int)
+            if size is None:
+                return rvs.item()
+            return rvs
+
+        return _rvs1(M, n, r, size=size, random_state=random_state)
+
+    def _logpmf(self, k, M, n, r):
+        return xpx.apply_where(
+            (r != 0) | (k != 0), (k, M, n, r),
+            lambda k, M, n, r:
+                (-betaln(k+1, r) + betaln(k+r, 1)
+                 - betaln(n-k+1, M-r-n+1) + betaln(M-r-k+1, 1)
+                 + betaln(n+1, M-n+1) - betaln(M+1, 1)),
+            fill_value=0.0)
+
+    def _pmf(self, k, M, n, r):
+        # same as the following but numerically more precise
+        # return comb(k+r-1, k) * comb(M-r-k, n-k) / comb(M, n)
+        return exp(self._logpmf(k, M, n, r))
+
+    def _stats(self, M, n, r):
+        # Promote the datatype to at least float
+        # mu = rn / (M-n+1)
+        M, n, r = 1.*M, 1.*n, 1.*r
+        mu = r*n / (M-n+1)
+
+        var = r*(M+1)*n / ((M-n+1)*(M-n+2)) * (1 - r / (M-n+1))
+
+        # The skew and kurtosis are mathematically
+        # intractable so return `None`. See [2]_.
+        g1, g2 = None, None
+        return mu, var, g1, g2
+
+
+# Module-level instance of `nhypergeom_gen`, constructed with name 'nhypergeom'.
+nhypergeom = nhypergeom_gen(name='nhypergeom')
+
+
+# FIXME: Fails _cdfvec
+class logser_gen(rv_discrete):
+    r"""A Logarithmic (Log-Series, Series) discrete random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability mass function for `logser` is:
+
+    .. math::
+
+        f(k) = - \frac{p^k}{k \log(1-p)}
+
+    for :math:`k \ge 1`, :math:`0 < p < 1`
+
+    `logser` takes :math:`p` as shape parameter,
+    where :math:`p` is the probability of a single success
+    and :math:`1-p` is the probability of a single failure.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+
+    def _shape_info(self):
+        return [_ShapeInfo("p", False, (0, 1), (True, True))]
+
+    def _rvs(self, p, size=None, random_state=None):
+        # looks wrong for p>0.5, too few k=1
+        # trying to use generic is worse, no k=1 at all
+        return random_state.logseries(p, size=size)
+
+    def _argcheck(self, p):
+        return (p > 0) & (p < 1)
+
+    def _pmf(self, k, p):
+        # logser.pmf(k) = - p**k / (k*log(1-p))
+        return -np.power(p, k) * 1.0 / k / special.log1p(-p)
+
+    def _sf(self, k, p):
+        tiny = 1e-100
+        # Ideally, this is the unregularized beta function with `b=0`. We don't have
+        # an unregularized beta function (yet, although we could get it from Boost),
+        # and neither technically support `b=0` - despite the function being accurate
+        # for `b` super close to zero. See https://github.com/scipy/scipy/issues/3890.
+        return -special.betainc(k+1, tiny, p) * special.beta(k+1, tiny) / np.log1p(-p)
+
+    def _stats(self, p):
+        r = special.log1p(-p)
+        mu = p / (p - 1.0) / r
+        mu2p = -p / r / (p - 1.0)**2
+        var = mu2p - mu*mu
+        mu3p = -p / r * (1.0+p) / (1.0 - p)**3
+        mu3 = mu3p - 3*mu*mu2p + 2*mu**3
+        g1 = mu3 / np.power(var, 1.5)
+
+        mu4p = -p / r * (
+            1.0 / (p-1)**2 - 6*p / (p - 1)**3 + 6*p*p / (p-1)**4)
+        mu4 = mu4p - 4*mu3p*mu + 6*mu2p*mu*mu - 3*mu**4
+        g2 = mu4 / var**2 - 3.0
+        return mu, var, g1, g2
+
+
+# Module-level instance of `logser_gen`.  ``a=1`` is passed to the constructor;
+# presumably this is the lower support bound (the pmf is documented for k >= 1).
+logser = logser_gen(a=1, name='logser', longname='A logarithmic')
+
+
+class poisson_gen(rv_discrete):
+    r"""A Poisson discrete random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability mass function for `poisson` is:
+
+    .. math::
+
+        f(k) = \exp(-\mu) \frac{\mu^k}{k!}
+
+    for :math:`k \ge 0`.
+
+    `poisson` takes :math:`\mu \geq 0` as shape parameter.
+    When :math:`\mu = 0`, the ``pmf`` method
+    returns ``1.0`` at quantile :math:`k = 0`.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+
+    def _shape_info(self):
+        return [_ShapeInfo("mu", False, (0, np.inf), (True, False))]
+
+    # Override rv_discrete._argcheck to allow mu=0.
+    def _argcheck(self, mu):
+        return mu >= 0
+
+    def _rvs(self, mu, size=None, random_state=None):
+        return random_state.poisson(mu, size)
+
+    def _logpmf(self, k, mu):
+        Pk = special.xlogy(k, mu) - gamln(k + 1) - mu
+        return Pk
+
+    def _pmf(self, k, mu):
+        # poisson.pmf(k) = exp(-mu) * mu**k / k!
+        return exp(self._logpmf(k, mu))
+
+    def _cdf(self, x, mu):
+        k = floor(x)
+        return special.pdtr(k, mu)
+
+    def _sf(self, x, mu):
+        k = floor(x)
+        return special.pdtrc(k, mu)
+
+    def _ppf(self, q, mu):
+        vals = ceil(special.pdtrik(q, mu))
+        vals1 = np.maximum(vals - 1, 0)
+        temp = special.pdtr(vals1, mu)
+        return np.where(temp >= q, vals1, vals)
+
+    def _stats(self, mu):
+        var = mu
+        tmp = np.asarray(mu)
+        mu_nonzero = tmp > 0
+        g1 = xpx.apply_where(mu_nonzero, tmp, lambda x: sqrt(1.0/x), fill_value=np.inf)
+        g2 = xpx.apply_where(mu_nonzero, tmp, lambda x: 1.0/x, fill_value=np.inf)
+        return mu, var, g1, g2
+
+
+# Module-level instance of `poisson_gen`, constructed with name "poisson".
+poisson = poisson_gen(name="poisson", longname='A Poisson')
+
+
+class planck_gen(rv_discrete):
+    r"""A Planck discrete exponential random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability mass function for `planck` is:
+
+    .. math::
+
+        f(k) = (1-\exp(-\lambda)) \exp(-\lambda k)
+
+    for :math:`k \ge 0` and :math:`\lambda > 0`.
+
+    `planck` takes :math:`\lambda` as shape parameter. The Planck distribution
+    can be written as a geometric distribution (`geom`) with
+    :math:`p = 1 - \exp(-\lambda)` shifted by ``loc = -1``.
+
+    %(after_notes)s
+
+    See Also
+    --------
+    geom
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("lambda_", False, (0, np.inf), (False, False))]
+
+    def _argcheck(self, lambda_):
+        return lambda_ > 0
+
+    def _pmf(self, k, lambda_):
+        return -expm1(-lambda_)*exp(-lambda_*k)
+
+    def _cdf(self, x, lambda_):
+        k = floor(x)
+        return -expm1(-lambda_*(k+1))
+
+    def _sf(self, x, lambda_):
+        return exp(self._logsf(x, lambda_))
+
+    def _logsf(self, x, lambda_):
+        k = floor(x)
+        return -lambda_*(k+1)
+
+    def _ppf(self, q, lambda_):
+        vals = ceil(-1.0/lambda_ * log1p(-q)-1)
+        vals1 = (vals-1).clip(*(self._get_support(lambda_)))
+        temp = self._cdf(vals1, lambda_)
+        return np.where(temp >= q, vals1, vals)
+
+    def _rvs(self, lambda_, size=None, random_state=None):
+        # use relation to geometric distribution for sampling
+        p = -expm1(-lambda_)
+        return random_state.geometric(p, size=size) - 1.0
+
+    def _stats(self, lambda_):
+        mu = 1/expm1(lambda_)
+        var = exp(-lambda_)/(expm1(-lambda_))**2
+        g1 = 2*cosh(lambda_/2.0)
+        g2 = 4+2*cosh(lambda_)
+        return mu, var, g1, g2
+
+    def _entropy(self, lambda_):
+        C = -expm1(-lambda_)
+        return lambda_*exp(-lambda_)/C - log(C)
+
+
+# Module-level instance of `planck_gen`.  ``a=0`` is passed to the constructor;
+# presumably this is the lower support bound (the pmf is documented for k >= 0).
+planck = planck_gen(a=0, name='planck', longname='A discrete exponential ')
+
+
+class boltzmann_gen(rv_discrete):
+    r"""A Boltzmann (Truncated Discrete Exponential) random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability mass function for `boltzmann` is:
+
+    .. math::
+
+        f(k) = (1-\exp(-\lambda)) \exp(-\lambda k) / (1-\exp(-\lambda N))
+
+    for :math:`k = 0,..., N-1`.
+
+    `boltzmann` takes :math:`\lambda > 0` and :math:`N > 0` as shape parameters.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        return [_ShapeInfo("lambda_", False, (0, np.inf), (False, False)),
+                _ShapeInfo("N", True, (0, np.inf), (False, False))]
+
+    def _argcheck(self, lambda_, N):
+        return (lambda_ > 0) & (N > 0) & _isintegral(N)
+
+    def _get_support(self, lambda_, N):
+        return self.a, N - 1
+
+    def _pmf(self, k, lambda_, N):
+        # boltzmann.pmf(k) =
+        #               (1-exp(-lambda_)*exp(-lambda_*k)/(1-exp(-lambda_*N))
+        fact = (1-exp(-lambda_))/(1-exp(-lambda_*N))
+        return fact*exp(-lambda_*k)
+
+    def _cdf(self, x, lambda_, N):
+        k = floor(x)
+        return (1-exp(-lambda_*(k+1)))/(1-exp(-lambda_*N))
+
+    def _ppf(self, q, lambda_, N):
+        qnew = q*(1-exp(-lambda_*N))
+        vals = ceil(-1.0/lambda_ * log(1-qnew)-1)
+        vals1 = (vals-1).clip(0.0, np.inf)
+        temp = self._cdf(vals1, lambda_, N)
+        return np.where(temp >= q, vals1, vals)
+
+    def _stats(self, lambda_, N):
+        z = exp(-lambda_)
+        zN = exp(-lambda_*N)
+        mu = z/(1.0-z)-N*zN/(1-zN)
+        var = z/(1.0-z)**2 - N*N*zN/(1-zN)**2
+        trm = (1-zN)/(1-z)
+        trm2 = (z*trm**2 - N*N*zN)
+        g1 = z*(1+z)*trm**3 - N**3*zN*(1+zN)
+        g1 = g1 / trm2**(1.5)
+        g2 = z*(1+4*z+z*z)*trm**4 - N**4 * zN*(1+4*zN+zN*zN)
+        g2 = g2 / trm2 / trm2
+        return mu, var, g1, g2
+
+
+# Module-level instance of `boltzmann_gen`.  ``a=0`` is passed to the
+# constructor; `_get_support` returns ``self.a`` as the lower support bound.
+boltzmann = boltzmann_gen(name='boltzmann', a=0,
+                          longname='A truncated discrete exponential ')
+
+
+class randint_gen(rv_discrete):
+    r"""A uniform discrete random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability mass function for `randint` is:
+
+    .. math::
+
+        f(k) = \frac{1}{\texttt{high} - \texttt{low}}
+
+    for :math:`k \in \{\texttt{low}, \dots, \texttt{high} - 1\}`.
+
+    `randint` takes :math:`\texttt{low}` and :math:`\texttt{high}` as shape
+    parameters.
+
+    %(after_notes)s
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import randint
+    >>> import matplotlib.pyplot as plt
+    >>> fig, ax = plt.subplots(1, 1)
+
+    Calculate the first four moments:
+
+    >>> low, high = 7, 31
+    >>> mean, var, skew, kurt = randint.stats(low, high, moments='mvsk')
+
+    Display the probability mass function (``pmf``):
+
+    >>> x = np.arange(low - 5, high + 5)
+    >>> ax.plot(x, randint.pmf(x, low, high), 'bo', ms=8, label='randint pmf')
+    >>> ax.vlines(x, 0, randint.pmf(x, low, high), colors='b', lw=5, alpha=0.5)
+
+    Alternatively, the distribution object can be called (as a function) to
+    fix the shape and location. This returns a "frozen" RV object holding the
+    given parameters fixed.
+
+    Freeze the distribution and display the frozen ``pmf``:
+
+    >>> rv = randint(low, high)
+    >>> ax.vlines(x, 0, rv.pmf(x), colors='k', linestyles='-',
+    ...           lw=1, label='frozen pmf')
+    >>> ax.legend(loc='lower center')
+    >>> plt.show()
+
+    Check the relationship between the cumulative distribution function
+    (``cdf``) and its inverse, the percent point function (``ppf``):
+
+    >>> q = np.arange(low, high)
+    >>> p = randint.cdf(q, low, high)
+    >>> np.allclose(q, randint.ppf(p, low, high))
+    True
+
+    Generate random numbers:
+
+    >>> r = randint.rvs(low, high, size=1000)
+
+    """
+
+    def _shape_info(self):
+        return [_ShapeInfo("low", True, (-np.inf, np.inf), (False, False)),
+                _ShapeInfo("high", True, (-np.inf, np.inf), (False, False))]
+
+    def _argcheck(self, low, high):
+        return (high > low) & _isintegral(low) & _isintegral(high)
+
+    def _get_support(self, low, high):
+        return low, high-1
+
+    def _pmf(self, k, low, high):
+        # randint.pmf(k) = 1./(high - low)
+        p = np.ones_like(k) / (np.asarray(high, dtype=np.int64) - low)
+        return np.where((k >= low) & (k < high), p, 0.)
+
+    def _cdf(self, x, low, high):
+        k = floor(x)
+        return (k - low + 1.) / (high - low)
+
+    def _ppf(self, q, low, high):
+        vals = ceil(q * (high - low) + low) - 1
+        vals1 = (vals - 1).clip(low, high)
+        temp = self._cdf(vals1, low, high)
+        return np.where(temp >= q, vals1, vals)
+
+    def _stats(self, low, high):
+        m2, m1 = np.asarray(high), np.asarray(low)
+        mu = (m2 + m1 - 1.0) / 2
+        d = m2 - m1
+        var = (d*d - 1) / 12.0
+        g1 = 0.0
+        g2 = -6.0/5.0 * (d*d + 1.0) / (d*d - 1.0)
+        return mu, var, g1, g2
+
+    def _rvs(self, low, high, size=None, random_state=None):
+        """An array of *size* random integers >= ``low`` and < ``high``."""
+        if np.asarray(low).size == 1 and np.asarray(high).size == 1:
+            # no need to vectorize in that case
+            return rng_integers(random_state, low, high, size=size)
+
+        if size is not None:
+            # NumPy's RandomState.randint() doesn't broadcast its arguments.
+            # Use `broadcast_to()` to extend the shapes of low and high
+            # up to size.  Then we can use the numpy.vectorize'd
+            # randint without needing to pass it a `size` argument.
+            low = np.broadcast_to(low, size)
+            high = np.broadcast_to(high, size)
+        randint = np.vectorize(partial(rng_integers, random_state),
+                               otypes=[np.dtype(int)])
+        return randint(low, high)
+
+    def _entropy(self, low, high):
+        return log(high - low)
+
+
+# Module-level instance of `randint_gen`, constructed with name 'randint'.
+randint = randint_gen(name='randint', longname='A discrete uniform '
+                      '(random integer)')
+
+
+# FIXME: problems sampling.
+class zipf_gen(rv_discrete):
+    r"""A Zipf (Zeta) discrete random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    zipfian
+
+    Notes
+    -----
+    The probability mass function for `zipf` is:
+
+    .. math::
+
+        f(k, a) = \frac{1}{\zeta(a) k^a}
+
+    for :math:`k \ge 1`, :math:`a > 1`.
+
+    `zipf` takes :math:`a > 1` as shape parameter. :math:`\zeta` is the
+    Riemann zeta function (`scipy.special.zeta`)
+
+    The Zipf distribution is also known as the zeta distribution, which is
+    a special case of the Zipfian distribution (`zipfian`).
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] "Zeta Distribution", Wikipedia,
+           https://en.wikipedia.org/wiki/Zeta_distribution
+
+    %(example)s
+
+    Confirm that `zipf` is the large `n` limit of `zipfian`.
+
+    >>> import numpy as np
+    >>> from scipy.stats import zipf, zipfian
+    >>> k = np.arange(11)
+    >>> np.allclose(zipf.pmf(k, a), zipfian.pmf(k, a, n=10000000))
+    True
+
+    """
+
+    def _shape_info(self):
+        # One real-valued shape `a`, restricted to the open interval (1, inf).
+        shape_a = _ShapeInfo("a", False, (1, np.inf), (False, False))
+        return [shape_a]
+
+    def _rvs(self, a, size=None, random_state=None):
+        # Sampling is delegated to the `zipf` method of `random_state`.
+        draws = random_state.zipf(a, size=size)
+        return draws
+
+    def _argcheck(self, a):
+        # Valid only for shape parameters strictly greater than one.
+        is_valid = a > 1
+        return is_valid
+
+    def _pmf(self, k, a):
+        # zipf.pmf(k, a) = 1/(zeta(a) * k**a)
+        k_float = k.astype(np.float64)
+        inv_zeta = 1.0 / special.zeta(a, 1)
+        return inv_zeta * k_float**-a
+
+    def _munp(self, n, a):
+        # The n-th raw moment is a ratio of zeta values where a > n + 1;
+        # everywhere else the result is filled with inf.
+        def zeta_ratio(a, n):
+            return special.zeta(a - n, 1) / special.zeta(a, 1)
+
+        return xpx.apply_where(a > n + 1, (a, n), zeta_ratio,
+                               fill_value=np.inf)
+
+
+zipf = zipf_gen(a=1, name='zipf', longname='A Zipf')
+
+
+class zipfian_gen(rv_discrete):
+    r"""A Zipfian discrete random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    zipf
+
+    Notes
+    -----
+    The probability mass function for `zipfian` is:
+
+    .. math::
+
+        f(k, a, n) = \frac{1}{H_{n,a} k^a}
+
+    for :math:`k \in \{1, 2, \dots, n-1, n\}`, :math:`a \ge 0`,
+    :math:`n \in \{1, 2, 3, \dots\}`.
+
+    `zipfian` takes :math:`a` and :math:`n` as shape parameters.
+    :math:`H_{n,a}` is the :math:`n`:sup:`th` generalized harmonic
+    number of order :math:`a`.
+
+    The SciPy implementation of this distribution requires :math:`1 \le n \le 2^{53}`.
+    For larger values of :math:`n`, the `zipfian` methods (`pmf`, `cdf`, `mean`, etc.)
+    will return `nan`.
+
+    When :math:`a > 1`, the Zipfian distribution reduces to the Zipf (zeta)
+    distribution as :math:`n \rightarrow \infty`.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] "Zipf's Law", Wikipedia, https://en.wikipedia.org/wiki/Zipf's_law
+    .. [2] Larry Leemis, "Zipf Distribution", Univariate Distribution
+           Relationships. http://www.math.wm.edu/~leemis/chart/UDR/PDFs/Zipf.pdf
+
+    %(example)s
+
+    Confirm that `zipfian` reduces to `zipf` for large `n`, ``a > 1``.
+
+    >>> import numpy as np
+    >>> from scipy.stats import zipf, zipfian
+    >>> k = np.arange(11)
+    >>> np.allclose(zipfian.pmf(k, a=3.5, n=10000000), zipf.pmf(k, a=3.5))
+    True
+
+    """
+
+    def _shape_info(self):
+        # `a` is real-valued on [0, inf); `n` is integer-valued on (0, inf).
+        shape_a = _ShapeInfo("a", False, (0, np.inf), (True, False))
+        shape_n = _ShapeInfo("n", True, (0, np.inf), (False, False))
+        return [shape_a, shape_n]
+
+    def _argcheck(self, a, n):
+        # The upper bound on n is for practical numerical reasons.  The numerical
+        # methods for computing the PMF, CDF and SF involve sums over range(1, n+1)
+        # when a <= 1, so there is no way they can be computed for extremely large
+        # n--even 2**53 is ridiculously large for those calculations.  The bound is
+        # also required to ensure that the loops compute the sums correctly when
+        # the inputs are double precision instead of integers.
+        #
+        # n may be an integer or a float, but the value must be an integer in the
+        # range 1 <= n <= 2**53.  n is clipped to the accepted range before the
+        # cast to integer to avoid warnings generated by an input such as n=1e100.
+        # The extra `np.asarray()` wrapper avoids the error that arises with an
+        # input such as `n=2**100`.
+        clipped_int = np.asarray(np.clip(n, 1, 2**53)).astype(dtype=np.int64)
+        return (a >= 0) & (n == clipped_int)
+
+    def _get_support(self, a, n):
+        # Support runs from 1 up to floor(n).
+        upper = np.floor(n)
+        return 1, upper
+
+    def _pmf(self, k, a, n):
+        # Single-term normalized harmonic sum: from floor(k) to floor(k).
+        k_floor = np.floor(k)
+        n_floor = np.floor(n)
+        return scu._normalized_gen_harmonic(k_floor, k_floor, n_floor, a)
+
+    def _cdf(self, k, a, n):
+        # Normalized harmonic sum over 1..floor(k).
+        k_floor = np.floor(k)
+        n_floor = np.floor(n)
+        return scu._normalized_gen_harmonic(1, k_floor, n_floor, a)
+
+    def _sf(self, k, a, n):
+        # Normalized harmonic sum over floor(k)+1..floor(n).
+        k_floor = np.floor(k)
+        n_floor = np.floor(n)
+        return scu._normalized_gen_harmonic(k_floor + 1, n_floor, n_floor, a)
+
+    def _stats(self, a, n):
+        # see http://www.math.wm.edu/~leemis/chart/UDR/PDFs/Zipf.pdf
+        n_floor = np.floor(n)
+        # Generalized harmonic numbers of orders a, a-1, ..., a-4.
+        h0 = scu._gen_harmonic(n_floor, a)
+        h1 = scu._gen_harmonic(n_floor, a-1)
+        h2 = scu._gen_harmonic(n_floor, a-2)
+        h3 = scu._gen_harmonic(n_floor, a-3)
+        h4 = scu._gen_harmonic(n_floor, a-4)
+
+        mean = h1/h0
+        # The variance numerator is kept separately; the kurtosis reuses it.
+        var_num = (h2*h0 - h1**2)
+        var_den = h0**2
+        var = var_num / var_den
+
+        skew_num = (h3/h0 - 3*h1*h2/h0**2 + 2*h1**3/h0**3)
+        skew = skew_num/var**(3/2)
+
+        kurt_num = (h0**3*h4 - 4*h0**2*h1*h3 + 6*h0*h1**2*h2
+                    - 3*h1**4)
+        excess_kurt = kurt_num / var_num**2 - 3
+        return mean, var, skew, excess_kurt
+
+
+zipfian = zipfian_gen(a=1, name='zipfian', longname='A Zipfian')
+
+
+class dlaplace_gen(rv_discrete):
+    r"""A  Laplacian discrete random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability mass function for `dlaplace` is:
+
+    .. math::
+
+        f(k) = \tanh(a/2) \exp(-a |k|)
+
+    for integers :math:`k` and :math:`a > 0`.
+
+    `dlaplace` takes :math:`a` as shape parameter.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+
+    def _shape_info(self):
+        # One real-valued shape `a` on the open interval (0, inf).
+        shape_a = _ShapeInfo("a", False, (0, np.inf), (False, False))
+        return [shape_a]
+
+    def _pmf(self, k, a):
+        # dlaplace.pmf(k) = tanh(a/2) * exp(-a*abs(k))
+        normalizer = tanh(a/2.0)
+        decay = exp(-a * abs(k))
+        return normalizer * decay
+
+    def _cdf(self, x, a):
+        k = floor(x)
+
+        # Branch applied where floor(x) >= 0.
+        def nonnegative_side(k, a):
+            return 1.0 - exp(-a * k) / (exp(a) + 1)
+
+        # Branch applied where floor(x) < 0.
+        def negative_side(k, a):
+            return exp(a * (k + 1)) / (exp(a) + 1)
+
+        return xpx.apply_where(k >= 0, (k, a), nonnegative_side, negative_side)
+
+    def _ppf(self, q, a):
+        const = 1 + exp(a)
+        threshold = 1.0 / (1 + exp(-a))
+        below = log(q*const) / a - 1
+        above = -log((1-q) * const) / a
+        candidate = ceil(np.where(q < threshold, below, above))
+        # Step back by one wherever the CDF at the smaller value already
+        # reaches q.
+        one_less = candidate - 1
+        return np.where(self._cdf(one_less, a) >= q, one_less, candidate)
+
+    def _stats(self, a):
+        ea = exp(a)
+        variance = 2.*ea/(ea-1.)**2
+        fourth_moment = 2.*ea*(ea**2+10.*ea+1.) / (ea-1.)**4
+        excess_kurtosis = fourth_moment/variance**2 - 3.
+        # Mean and skewness are returned as exact zeros.
+        return 0., variance, 0., excess_kurtosis
+
+    def _entropy(self, a):
+        first_term = a / sinh(a)
+        return first_term - log(tanh(a/2.0))
+
+    def _rvs(self, a, size=None, random_state=None):
+        # The discrete Laplace is equivalent to the two-sided geometric
+        # distribution with PMF:
+        #   f(k) = (1 - alpha)/(1 + alpha) * alpha^abs(k)
+        #   Reference:
+        #     https://www.sciencedirect.com/science/
+        #     article/abs/pii/S0378375804003519
+        # Furthermore, the two-sided geometric distribution is
+        # equivalent to the difference between two iid geometric
+        # distributions.
+        #   Reference (page 179):
+        #     https://pdfs.semanticscholar.org/61b3/
+        #     b99f466815808fd0d03f5d2791eea8b541a1.pdf
+        # Thus, we can leverage the following:
+        #   1) alpha = e^-a
+        #   2) probability_of_success = 1 - alpha (Bernoulli trial)
+        # `-expm1(-a)` evaluates 1 - e^-a.
+        p_success = -np.expm1(-np.asarray(a))
+        first = random_state.geometric(p_success, size=size)
+        second = random_state.geometric(p_success, size=size)
+        return first - second
+
+
+dlaplace = dlaplace_gen(a=-np.inf,
+                        name='dlaplace', longname='A discrete Laplacian')
+
+
+class poisson_binom_gen(rv_discrete):
+    r"""A Poisson Binomial discrete random variable.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    binom
+
+    Notes
+    -----
+    The probability mass function for `poisson_binom` is:
+
+    .. math::
+
+     f(k; p_1, p_2, ..., p_n) = \sum_{A \in F_k} \prod_{i \in A} p_i \prod_{j \in A^C} 1 - p_j
+
+    where :math:`k \in \{0, 1, \dots, n-1, n\}`, :math:`F_k` is the set of all
+    subsets of :math:`k` integers that can be selected from :math:`\{0, 1, \dots, n-1, n\}`,
+    and :math:`A^C` is the complement of a set :math:`A`.
+
+    `poisson_binom` accepts a single array argument ``p`` for shape parameters
+    :math:`0 ≤ p_i ≤ 1`, where the last axis corresponds with the index :math:`i` and
+    any others are for batch dimensions. Broadcasting behaves according to the usual
+    rules except that the last axis of ``p`` is ignored. Instances of this class do
+    not support serialization/unserialization.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] "Poisson binomial distribution", Wikipedia,
+           https://en.wikipedia.org/wiki/Poisson_binomial_distribution
+    .. [2] Biscarri, William, Sihai Dave Zhao, and Robert J. Brunner. "A simple and
+           fast method for computing the Poisson binomial distribution function".
+           Computational Statistics & Data Analysis 122 (2018) 92-100.
+           :doi:`10.1016/j.csda.2018.01.007`
+
+    %(example)s
+
+    """  # noqa: E501
+    def _shape_info(self):
+        # No shape metadata is provided: fitting is not implemented for this
+        # distribution, so an empty list is returned rather than raising.
+        return []
+
+    def _argcheck(self, *args):
+        # Every p_i must lie in [0, 1]; reduce over the stacked p_i axis.
+        probs = np.stack(args, axis=0)
+        return np.all((0 <= probs) & (probs <= 1), axis=0)
+
+    def _rvs(self, *args, size=None, random_state=None):
+        # convenient to work along the last axis here to avoid interference with `size`
+        probs = np.stack(args, axis=-1)
+        # Size passed by the user is the *shape of the returned array*, so it won't
+        # contain the length of the last axis of p.
+        if size is None:
+            requested = probs.shape
+        elif np.isscalar(size):
+            requested = (size, 1)
+        else:
+            requested = tuple(size) + (1,)
+        full_shape = np.broadcast_shapes(probs.shape, requested)
+        # One Bernoulli draw per p_i, summed over the p_i axis.
+        trials = bernoulli._rvs(probs, size=full_shape, random_state=random_state)
+        return trials.sum(axis=-1)
+
+    def _get_support(self, *args):
+        # Support is 0..n, with n the number of probabilities supplied.
+        n_probs = len(args)
+        return 0, n_probs
+
+    def _pmf(self, k, *args):
+        k_int = np.atleast_1d(k).astype(np.int64)
+        k_bcast, *p_bcast = np.broadcast_arrays(k_int, *args)
+        probs = np.asarray(p_bcast, dtype=np.float64)
+        return _poisson_binom(k_bcast, probs, 'pmf')
+
+    def _cdf(self, k, *args):
+        k_int = np.atleast_1d(k).astype(np.int64)
+        k_bcast, *p_bcast = np.broadcast_arrays(k_int, *args)
+        probs = np.asarray(p_bcast, dtype=np.float64)
+        return _poisson_binom(k_bcast, probs, 'cdf')
+
+    def _stats(self, *args, **kwds):
+        # Mean is sum(p_i), variance is sum(p_i * (1 - p_i)); the skewness and
+        # kurtosis slots are returned as None.
+        probs = np.stack(args, axis=0)
+        mean = np.sum(probs, axis=0)
+        var = np.sum(probs * (1-probs), axis=0)
+        return (mean, var, None, None)
+
+    def __call__(self, *args, **kwds):
+        # Freezing returns the dedicated frozen class instead of the generic one.
+        return poisson_binomial_frozen(self, *args, **kwds)
+
+
+poisson_binom = poisson_binom_gen(name='poisson_binom', longname='A Poisson binomial',
+                                  shapes='p')
+
+# The _parse_args methods don't work with vector-valued shape parameters, so we rewrite
+# them. Note that `p` is accepted as an array with the index `i` of `p_i` corresponding
+# with the last axis; we return it as a tuple (p_1, p_2, ..., p_n) so that it looks
+# like `n` scalar (or arrays of scalar-valued) shape parameters to the infrastructure.
+
+def _parse_args_rvs(self, p, loc=0, size=None):
+    """Split `p` along its last axis; return ``(shapes, loc, 1.0, size)``."""
+    shapes = tuple(np.moveaxis(p, -1, 0))
+    return shapes, loc, 1.0, size
+
+def _parse_args_stats(self, p, loc=0, moments='mv'):
+    """Split `p` along its last axis; return ``(shapes, loc, 1.0, moments)``."""
+    shapes = tuple(np.moveaxis(p, -1, 0))
+    return shapes, loc, 1.0, moments
+
+def _parse_args(self, p, loc=0):
+    """Split `p` along its last axis; return ``(shapes, loc, 1.0)``."""
+    shapes = tuple(np.moveaxis(p, -1, 0))
+    return shapes, loc, 1.0
+
+# The infrastructure manually binds these methods to the instance, so
+# we can only override them by manually binding them, too.
+# `function.__get__(obj, cls)` yields the function bound to `obj`; each
+# assignment below therefore stores a bound method on the `poisson_binom`
+# instance itself rather than on the class.
+_pb_obj, _pb_cls = poisson_binom, poisson_binom_gen  # shorter names (for PEP8)
+poisson_binom._parse_args_rvs = _parse_args_rvs.__get__(_pb_obj, _pb_cls)
+poisson_binom._parse_args_stats = _parse_args_stats.__get__(_pb_obj, _pb_cls)
+poisson_binom._parse_args = _parse_args.__get__(_pb_obj, _pb_cls)
+
+class poisson_binomial_frozen(rv_discrete_frozen):
+    """Frozen `poisson_binom` distribution.
+
+    Differs from the generic frozen class only in that the new distribution
+    instance gets the vector-aware `_parse_args*` overrides, and in how
+    `expect` forwards the stored arguments.
+    """
+    # copied from rv_frozen; we just need to bind the `_parse_args` methods
+    def __init__(self, dist, *args, **kwds):                        # verbatim
+        self.args = args                                            # verbatim
+        self.kwds = kwds                                            # verbatim
+
+        # create a new instance                                     # verbatim
+        self.dist = dist.__class__(**dist._updated_ctor_param())    # verbatim
+
+        # Here is the only modification: bind the overrides to the *new*
+        # instance (not to the module-level singleton) so that `self.dist`
+        # owns its own parsers.
+        dist_cls = type(self.dist)
+        self.dist._parse_args_rvs = _parse_args_rvs.__get__(self.dist, dist_cls)
+        self.dist._parse_args_stats = _parse_args_stats.__get__(self.dist, dist_cls)
+        self.dist._parse_args = _parse_args.__get__(self.dist, dist_cls)
+
+        shapes, _, _ = self.dist._parse_args(*args, **kwds)         # verbatim
+        self.a, self.b = self.dist._get_support(*shapes)            # verbatim
+
+    def expect(self, func=None, lb=None, ub=None, conditional=False, **kwds):
+        """Forward to ``self.dist.expect`` with the frozen arguments.
+
+        Only `loc` is taken from the parsed arguments; the parsed shapes and
+        scale are not needed here.
+        """
+        _, loc, _ = self.dist._parse_args(*self.args, **self.kwds)
+        # Here's the modification: we pass all args (including `loc`) into the `args`
+        # parameter of `expect` so the shape only goes through `_parse_args` once.
+        return self.dist.expect(func, self.args, loc, lb, ub, conditional, **kwds)
+
+
+class skellam_gen(rv_discrete):
+    r"""A  Skellam discrete random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    Probability distribution of the difference of two correlated or
+    uncorrelated Poisson random variables.
+
+    Let :math:`k_1` and :math:`k_2` be two Poisson-distributed r.v. with
+    expected values :math:`\lambda_1` and :math:`\lambda_2`. Then,
+    :math:`k_1 - k_2` follows a Skellam distribution with parameters
+    :math:`\mu_1 = \lambda_1 - \rho \sqrt{\lambda_1 \lambda_2}` and
+    :math:`\mu_2 = \lambda_2 - \rho \sqrt{\lambda_1 \lambda_2}`, where
+    :math:`\rho` is the correlation coefficient between :math:`k_1` and
+    :math:`k_2`. If the two Poisson-distributed r.v. are independent then
+    :math:`\rho = 0`.
+
+    Parameters :math:`\mu_1` and :math:`\mu_2` must be strictly positive.
+
+    For details see: https://en.wikipedia.org/wiki/Skellam_distribution
+
+    `skellam` takes :math:`\mu_1` and :math:`\mu_2` as shape parameters.
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        # Both shapes are real-valued on the open interval (0, inf).
+        shape_mu1 = _ShapeInfo("mu1", False, (0, np.inf), (False, False))
+        shape_mu2 = _ShapeInfo("mu2", False, (0, np.inf), (False, False))
+        return [shape_mu1, shape_mu2]
+
+    def _rvs(self, mu1, mu2, size=None, random_state=None):
+        # Difference of two Poisson draws; the `mu1` draw is taken first.
+        first = random_state.poisson(mu1, size)
+        second = random_state.poisson(mu2, size)
+        return first - second
+
+    def _pmf(self, x, mu1, mu2):
+        with np.errstate(over='ignore'):  # see gh-17432
+            negative_side = scu._ncx2_pdf(2*mu2, 2*(1-x), 2*mu1)*2
+            nonnegative_side = scu._ncx2_pdf(2*mu1, 2*(1+x), 2*mu2)*2
+            # ncx2.pdf() returns nan's for extremely low probabilities
+            pmf_vals = np.where(x < 0, negative_side, nonnegative_side)
+        return pmf_vals
+
+    def _cdf(self, x, mu1, mu2):
+        k = floor(x)
+        with np.errstate(over='ignore'):  # see gh-17432
+            negative_side = special.chndtr(2*mu2, -2*k, 2*mu1)
+            nonnegative_side = scu._ncx2_sf(2*mu1, 2*(k+1), 2*mu2)
+            cdf_vals = np.where(k < 0, negative_side, nonnegative_side)
+        return cdf_vals
+
+    def _stats(self, mu1, mu2):
+        mean = mu1 - mu2
+        var = mu1 + mu2
+        skew = mean / sqrt((var)**3)
+        excess_kurtosis = 1 / var
+        return mean, var, skew, excess_kurtosis
+
+
+skellam = skellam_gen(a=-np.inf, name="skellam", longname='A Skellam')
+
+
+class yulesimon_gen(rv_discrete):
+    r"""A Yule-Simon discrete random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+
+    The probability mass function for the `yulesimon` is:
+
+    .. math::
+
+        f(k) =  \alpha B(k, \alpha+1)
+
+    for :math:`k=1,2,3,...`, where :math:`\alpha>0`.
+    Here :math:`B` refers to the `scipy.special.beta` function.
+
+    The sampling of random variates is based on pg 553, Section 6.3 of [1]_.
+    Our notation maps to the referenced logic via :math:`\alpha=a-1`.
+
+    For details see the wikipedia entry [2]_.
+
+    References
+    ----------
+    .. [1] Devroye, Luc. "Non-uniform Random Variate Generation",
+         (1986) Springer, New York.
+
+    .. [2] https://en.wikipedia.org/wiki/Yule-Simon_distribution
+
+    %(after_notes)s
+
+    %(example)s
+
+    """
+    def _shape_info(self):
+        # One real-valued shape `alpha` on the open interval (0, inf).
+        shape_alpha = _ShapeInfo("alpha", False, (0, np.inf), (False, False))
+        return [shape_alpha]
+
+    def _rvs(self, alpha, size=None, random_state=None):
+        # Two standard-exponential draws are combined into one variate.
+        exp_first = random_state.standard_exponential(size)
+        exp_second = random_state.standard_exponential(size)
+        denominator = log1p(-exp(-exp_second / alpha))
+        return ceil(-exp_first / denominator)
+
+    def _pmf(self, x, alpha):
+        beta_term = special.beta(x, alpha + 1)
+        return alpha * beta_term
+
+    def _argcheck(self, alpha):
+        is_valid = (alpha > 0)
+        return is_valid
+
+    def _logpmf(self, x, alpha):
+        log_beta = special.betaln(x, alpha + 1)
+        return log(alpha) + log_beta
+
+    def _cdf(self, x, alpha):
+        tail = x * special.beta(x, alpha + 1)
+        return 1 - tail
+
+    def _sf(self, x, alpha):
+        beta_term = special.beta(x, alpha + 1)
+        return x * beta_term
+
+    def _logsf(self, x, alpha):
+        log_beta = special.betaln(x, alpha + 1)
+        return log(x) + log_beta
+
+    def _stats(self, alpha):
+        # Each statistic uses its closed form above a threshold on alpha, inf
+        # in an intermediate band, and (except the mean) nan below that band.
+        mean = np.where(alpha <= 1, np.inf, alpha / (alpha - 1))
+
+        var_finite = alpha**2 / ((alpha - 2.0) * (alpha - 1)**2)
+        var = np.where(alpha > 2, var_finite, np.inf)
+        var = np.where(alpha <= 1, np.nan, var)
+
+        skew_finite = sqrt(alpha - 2) * (alpha + 1)**2 / (alpha * (alpha - 3))
+        skew = np.where(alpha > 3, skew_finite, np.inf)
+        skew = np.where(alpha <= 2, np.nan, skew)
+
+        kurt_finite = alpha + 3 + ((11 * alpha**3 - 49 * alpha - 22) /
+                                   (alpha * (alpha - 4) * (alpha - 3)))
+        kurt = np.where(alpha > 4, kurt_finite, np.inf)
+        kurt = np.where(alpha <= 2, np.nan, kurt)
+        return mean, var, skew, kurt
+
+
+yulesimon = yulesimon_gen(name='yulesimon', a=1)
+
+
+class _nchypergeom_gen(rv_discrete):
+    r"""A noncentral hypergeometric discrete random variable.
+
+    For subclassing by nchypergeom_fisher_gen and nchypergeom_wallenius_gen.
+
+    """
+
+    # Subclasses set these: `rvs_name` is the sampler attribute looked up on
+    # `_PyStochasticLib3` in `_rvs`; `dist` is called in `_pmf` and `_stats`.
+    rvs_name = None
+    dist = None
+
+    def _shape_info(self):
+        # M, n and N are integer-valued on [0, inf); odds is real on (0, inf).
+        integer_shapes = [_ShapeInfo(name, True, (0, np.inf), (True, False))
+                          for name in ("M", "n", "N")]
+        odds_shape = _ShapeInfo("odds", False, (0, np.inf), (False, False))
+        return integer_shapes + [odds_shape]
+
+    def _get_support(self, M, n, N, odds):
+        # follow Wikipedia notation
+        total, type1, draws = M, n, N
+        type2 = total - type1
+        lower = np.maximum(0, draws - type2)
+        upper = np.minimum(draws, type1)
+        return lower, upper
+
+    def _argcheck(self, M, n, N, odds):
+        M, n, N, odds = (np.asarray(value) for value in (M, n, N, odds))
+
+        def nonnegative_integer(value):
+            # Not nan, integral, and >= 0.
+            return ((~np.isnan(value)) & (value.astype(int) == value)
+                    & (value >= 0))
+
+        counts_ok = (nonnegative_integer(M) & nonnegative_integer(n)
+                     & nonnegative_integer(N))
+        return counts_ok & (odds > 0) & (N <= M) & (n <= M)
+
+    def _rvs(self, M, n, N, odds, size=None, random_state=None):
+
+        @_vectorize_rvs_over_shapes
+        def _rvs1(M, n, N, odds, size, random_state):
+            if np.isnan(M) | np.isnan(n) | np.isnan(N):
+                return np.full(size, np.nan)
+            n_samples = np.prod(size)
+            sampler = getattr(_PyStochasticLib3(), self.rvs_name)
+            # Note the argument order: (N, n, M), not (M, n, N).
+            flat = sampler(N, n, M, odds, n_samples, random_state)
+            return flat.reshape(size)
+
+        return _rvs1(M, n, N, odds, size=size, random_state=random_state)
+
+    def _pmf(self, x, M, n, N, odds):
+
+        x, M, n, N, odds = np.broadcast_arrays(x, M, n, N, odds)
+        if x.size == 0:  # np.vectorize doesn't work with zero size input
+            return np.empty_like(x)
+
+        @np.vectorize
+        def _pmf1(x, M, n, N, odds):
+            if np.isnan(x) | np.isnan(M) | np.isnan(n) | np.isnan(N):
+                return np.nan
+            return self.dist(N, n, M, odds, 1e-12).probability(x)
+
+        return _pmf1(x, M, n, N, odds)
+
+    def _stats(self, M, n, N, odds, moments='mv'):
+
+        @np.vectorize
+        def _moments1(M, n, N, odds):
+            if np.isnan(M) | np.isnan(n) | np.isnan(N):
+                return np.nan, np.nan
+            return self.dist(N, n, M, odds, 1e-12).moments()
+
+        # Mean and variance are computed together when either is requested;
+        # skewness and kurtosis are always returned as None.
+        if "m" in moments or "v" in moments:
+            mean, var = _moments1(M, n, N, odds)
+        else:
+            mean, var = None, None
+        return mean, var, None, None
+
+
+class nchypergeom_fisher_gen(_nchypergeom_gen):
+    r"""A Fisher's noncentral hypergeometric discrete random variable.
+
+    Fisher's noncentral hypergeometric distribution models drawing objects of
+    two types from a bin. `M` is the total number of objects, `n` is the
+    number of Type I objects, and `odds` is the odds ratio: the odds of
+    selecting a Type I object rather than a Type II object when there is only
+    one object of each type.
+    The random variate represents the number of Type I objects drawn if we
+    take a handful of objects from the bin at once and find out afterwards
+    that we took `N` objects.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    nchypergeom_wallenius, hypergeom, nhypergeom
+
+    Notes
+    -----
+    Let mathematical symbols :math:`N`, :math:`n`, and :math:`M` correspond
+    with parameters `N`, `n`, and `M` (respectively) as defined above.
+
+    The probability mass function is defined as
+
+    .. math::
+
+        p(x; M, n, N, \omega) =
+        \frac{\binom{n}{x}\binom{M - n}{N-x}\omega^x}{P_0},
+
+    for
+    :math:`x \in [x_l, x_u]`,
+    :math:`M \in {\mathbb N}`,
+    :math:`n \in [0, M]`,
+    :math:`N \in [0, M]`,
+    :math:`\omega > 0`,
+    where
+    :math:`x_l = \max(0, N - (M - n))`,
+    :math:`x_u = \min(N, n)`,
+
+    .. math::
+
+        P_0 = \sum_{y=x_l}^{x_u} \binom{n}{y}\binom{M - n}{N-y}\omega^y,
+
+    and the binomial coefficients are defined as
+
+    .. math:: \binom{n}{k} \equiv \frac{n!}{k! (n - k)!}.
+
+    `nchypergeom_fisher` uses the BiasedUrn package by Agner Fog with
+    permission for it to be distributed under SciPy's license.
+
+    The symbols used to denote the shape parameters (`N`, `n`, and `M`) are not
+    universally accepted; they are chosen for consistency with `hypergeom`.
+
+    Note that Fisher's noncentral hypergeometric distribution is distinct
+    from Wallenius' noncentral hypergeometric distribution, which models
+    drawing a pre-determined `N` objects from a bin one by one.
+    When the odds ratio is unity, however, both distributions reduce to the
+    ordinary hypergeometric distribution.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] Agner Fog, "Biased Urn Theory".
+           https://cran.r-project.org/web/packages/BiasedUrn/vignettes/UrnTheory.pdf
+
+    .. [2] "Fisher's noncentral hypergeometric distribution", Wikipedia,
+           https://en.wikipedia.org/wiki/Fisher's_noncentral_hypergeometric_distribution
+
+    %(example)s
+
+    """
+
+    # Attribute name that the inherited `_rvs` looks up (via `getattr`) on a
+    # `_PyStochasticLib3` instance to obtain the sampler.
+    rvs_name = "rvs_fisher"
+    # Class that the inherited `_pmf` and `_stats` call as
+    # `self.dist(N, n, M, odds, 1e-12)` before querying probabilities/moments.
+    dist = _PyFishersNCHypergeometric
+
+
+nchypergeom_fisher = nchypergeom_fisher_gen(
+    name='nchypergeom_fisher',
+    longname="A Fisher's noncentral hypergeometric")
+
+
+class nchypergeom_wallenius_gen(_nchypergeom_gen):
+    r"""A Wallenius' noncentral hypergeometric discrete random variable.
+
+    Wallenius' noncentral hypergeometric distribution models drawing objects of
+    two types from a bin. `M` is the total number of objects, `n` is the
+    number of Type I objects, and `odds` is the odds ratio: the odds of
+    selecting a Type I object rather than a Type II object when there is only
+    one object of each type.
+    The random variate represents the number of Type I objects drawn if we
+    draw a pre-determined `N` objects from a bin one by one.
+
+    %(before_notes)s
+
+    See Also
+    --------
+    nchypergeom_fisher, hypergeom, nhypergeom
+
+    Notes
+    -----
+    Let mathematical symbols :math:`N`, :math:`n`, and :math:`M` correspond
+    with parameters `N`, `n`, and `M` (respectively) as defined above.
+
+    The probability mass function is defined as
+
+    .. math::
+
+        p(x; N, n, M) = \binom{n}{x} \binom{M - n}{N-x}
+        \int_0^1 \left(1-t^{\omega/D}\right)^x\left(1-t^{1/D}\right)^{N-x} dt
+
+    for
+    :math:`x \in [x_l, x_u]`,
+    :math:`M \in {\mathbb N}`,
+    :math:`n \in [0, M]`,
+    :math:`N \in [0, M]`,
+    :math:`\omega > 0`,
+    where
+    :math:`x_l = \max(0, N - (M - n))`,
+    :math:`x_u = \min(N, n)`,
+
+    .. math::
+
+        D = \omega(n - x) + ((M - n)-(N-x)),
+
+    and the binomial coefficients are defined as
+
+    .. math:: \binom{n}{k} \equiv \frac{n!}{k! (n - k)!}.
+
+    `nchypergeom_wallenius` uses the BiasedUrn package by Agner Fog with
+    permission for it to be distributed under SciPy's license.
+
+    The symbols used to denote the shape parameters (`N`, `n`, and `M`) are not
+    universally accepted; they are chosen for consistency with `hypergeom`.
+
+    Note that Wallenius' noncentral hypergeometric distribution is distinct
+    from Fisher's noncentral hypergeometric distribution, which models
+    taking a handful of objects from the bin at once and finding out
+    afterwards that `N` objects were taken.
+    When the odds ratio is unity, however, both distributions reduce to the
+    ordinary hypergeometric distribution.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] Agner Fog, "Biased Urn Theory".
+           https://cran.r-project.org/web/packages/BiasedUrn/vignettes/UrnTheory.pdf
+
+    .. [2] "Wallenius' noncentral hypergeometric distribution", Wikipedia,
+           https://en.wikipedia.org/wiki/Wallenius'_noncentral_hypergeometric_distribution
+
+    %(example)s
+
+    """
+
+    # Attribute name that the inherited `_rvs` looks up (via `getattr`) on a
+    # `_PyStochasticLib3` instance to obtain the sampler.
+    rvs_name = "rvs_wallenius"
+    # Class that the inherited `_pmf` and `_stats` call as
+    # `self.dist(N, n, M, odds, 1e-12)` before querying probabilities/moments.
+    dist = _PyWalleniusNCHypergeometric
+
+
+nchypergeom_wallenius = nchypergeom_wallenius_gen(
+    name='nchypergeom_wallenius',
+    longname="A Wallenius' noncentral hypergeometric")
+
+
+# Collect names of classes and objects in this module.
+# `pairs` is a list of (name, object) tuples taken from a copy of the module
+# globals at this point, so it does not track names defined after this line.
+pairs = list(globals().copy().items())
+# NOTE(review): `get_distribution_names` is defined elsewhere; presumably it
+# picks out the `rv_discrete` instances and their generator classes -- confirm.
+_distn_names, _distn_gen_names = get_distribution_names(pairs, rv_discrete)
+
+# The module's public API is the concatenation of the two collected lists.
+__all__ = _distn_names + _distn_gen_names
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_distn_infrastructure.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_distn_infrastructure.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b2c263185fad9aa2dc4af1349c3a006afed4676
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_distn_infrastructure.py
@@ -0,0 +1,4228 @@
+#
+# Author:  Travis Oliphant  2002-2011 with contributions from
+#          SciPy Developers 2004-2011
+#
+from scipy._lib._util import getfullargspec_no_self as _getfullargspec
+
+import sys
+import keyword
+import re
+import types
+import warnings
+from itertools import zip_longest
+
+from scipy._lib import doccer
+from ._distr_params import distcont, distdiscrete
+from scipy._lib._util import check_random_state
+import scipy._lib.array_api_extra as xpx
+
+from scipy.special import comb, entr
+
+
+# for root finding for continuous distribution ppf, and maximum likelihood
+# estimation
+from scipy import optimize
+
+# for functions of continuous distributions (e.g. moments, entropy, cdf)
+from scipy import integrate
+
+# to approximate the pdf of a continuous distribution given its cdf
+from scipy.stats._finite_differences import _derivative
+
+# for scipy.stats.entropy. Attempts to import just that function or file
+# have caused import problems
+from scipy import stats
+
+from numpy import (arange, putmask, ones, shape, ndarray, zeros, floor,
+                   logical_and, log, sqrt, place, argmax, vectorize, asarray,
+                   nan, inf, isinf, empty)
+
+import numpy as np
+from ._constants import _XMAX, _LOGXMAX
+from ._censored_data import CensoredData
+from scipy.stats._warnings_errors import FitError
+
+# These are the docstring parts used for substitution in specific
+# distribution docstrings
+
+docheaders = {'methods': """\nMethods\n-------\n""",
+              'notes': """\nNotes\n-----\n""",
+              'examples': """\nExamples\n--------\n"""}
+
+_doc_rvs = """\
+rvs(%(shapes)s, loc=0, scale=1, size=1, random_state=None)
+    Random variates.
+"""
+_doc_pdf = """\
+pdf(x, %(shapes)s, loc=0, scale=1)
+    Probability density function.
+"""
+_doc_logpdf = """\
+logpdf(x, %(shapes)s, loc=0, scale=1)
+    Log of the probability density function.
+"""
+_doc_pmf = """\
+pmf(k, %(shapes)s, loc=0, scale=1)
+    Probability mass function.
+"""
+_doc_logpmf = """\
+logpmf(k, %(shapes)s, loc=0, scale=1)
+    Log of the probability mass function.
+"""
+_doc_cdf = """\
+cdf(x, %(shapes)s, loc=0, scale=1)
+    Cumulative distribution function.
+"""
+_doc_logcdf = """\
+logcdf(x, %(shapes)s, loc=0, scale=1)
+    Log of the cumulative distribution function.
+"""
+_doc_sf = """\
+sf(x, %(shapes)s, loc=0, scale=1)
+    Survival function  (also defined as ``1 - cdf``, but `sf` is sometimes more accurate).
+"""  # noqa: E501
+_doc_logsf = """\
+logsf(x, %(shapes)s, loc=0, scale=1)
+    Log of the survival function.
+"""
+_doc_ppf = """\
+ppf(q, %(shapes)s, loc=0, scale=1)
+    Percent point function (inverse of ``cdf`` --- percentiles).
+"""
+_doc_isf = """\
+isf(q, %(shapes)s, loc=0, scale=1)
+    Inverse survival function (inverse of ``sf``).
+"""
+_doc_moment = """\
+moment(order, %(shapes)s, loc=0, scale=1)
+    Non-central moment of the specified order.
+"""
+_doc_stats = """\
+stats(%(shapes)s, loc=0, scale=1, moments='mv')
+    Mean('m'), variance('v'), skew('s'), and/or kurtosis('k').
+"""
+_doc_entropy = """\
+entropy(%(shapes)s, loc=0, scale=1)
+    (Differential) entropy of the RV.
+"""
+_doc_fit = """\
+fit(data)
+    Parameter estimates for generic data.
+    See `scipy.stats.rv_continuous.fit <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.rv_continuous.fit.html#scipy.stats.rv_continuous.fit>`__ for detailed documentation of the
+    keyword arguments.
+"""  # noqa: E501
+_doc_expect = """\
+expect(func, args=(%(shapes_)s), loc=0, scale=1, lb=None, ub=None, conditional=False, **kwds)
+    Expected value of a function (of one argument) with respect to the distribution.
+"""  # noqa: E501
+_doc_expect_discrete = """\
+expect(func, args=(%(shapes_)s), loc=0, lb=None, ub=None, conditional=False)
+    Expected value of a function (of one argument) with respect to the distribution.
+"""
+_doc_median = """\
+median(%(shapes)s, loc=0, scale=1)
+    Median of the distribution.
+"""
+_doc_mean = """\
+mean(%(shapes)s, loc=0, scale=1)
+    Mean of the distribution.
+"""
+_doc_var = """\
+var(%(shapes)s, loc=0, scale=1)
+    Variance of the distribution.
+"""
+_doc_std = """\
+std(%(shapes)s, loc=0, scale=1)
+    Standard deviation of the distribution.
+"""
+_doc_interval = """\
+interval(confidence, %(shapes)s, loc=0, scale=1)
+    Confidence interval with equal areas around the median.
+"""
+_doc_allmethods = ''.join([docheaders['methods'], _doc_rvs, _doc_pdf,
+                           _doc_logpdf, _doc_cdf, _doc_logcdf, _doc_sf,
+                           _doc_logsf, _doc_ppf, _doc_isf, _doc_moment,
+                           _doc_stats, _doc_entropy, _doc_fit,
+                           _doc_expect, _doc_median,
+                           _doc_mean, _doc_var, _doc_std, _doc_interval])
+
+_doc_default_longsummary = """\
+As an instance of the `rv_continuous` class, `%(name)s` object inherits from it
+a collection of generic methods (see below for the full list),
+and completes them with details specific for this particular distribution.
+"""
+
+_doc_default_frozen_note = """
+Alternatively, the object may be called (as a function) to fix the shape,
+location, and scale parameters returning a "frozen" continuous RV object:
+
+rv = %(name)s(%(shapes)s, loc=0, scale=1)
+    - Frozen RV object with the same methods but holding the given shape,
+      location, and scale fixed.
+"""
+_doc_default_example = """\
+Examples
+--------
+>>> import numpy as np
+>>> from scipy.stats import %(name)s
+>>> import matplotlib.pyplot as plt
+>>> fig, ax = plt.subplots(1, 1)
+
+Get the support:
+
+%(set_vals_stmt)s
+>>> lb, ub = %(name)s.support(%(shapes)s)
+
+Calculate the first four moments:
+
+>>> mean, var, skew, kurt = %(name)s.stats(%(shapes)s, moments='mvsk')
+
+Display the probability density function (``pdf``):
+
+>>> x = np.linspace(%(name)s.ppf(0.01, %(shapes)s),
+...                 %(name)s.ppf(0.99, %(shapes)s), 100)
+>>> ax.plot(x, %(name)s.pdf(x, %(shapes)s),
+...        'r-', lw=5, alpha=0.6, label='%(name)s pdf')
+
+Alternatively, the distribution object can be called (as a function)
+to fix the shape, location and scale parameters. This returns a "frozen"
+RV object holding the given parameters fixed.
+
+Freeze the distribution and display the frozen ``pdf``:
+
+>>> rv = %(name)s(%(shapes)s)
+>>> ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')
+
+Check accuracy of ``cdf`` and ``ppf``:
+
+>>> vals = %(name)s.ppf([0.001, 0.5, 0.999], %(shapes)s)
+>>> np.allclose([0.001, 0.5, 0.999], %(name)s.cdf(vals, %(shapes)s))
+True
+
+Generate random numbers:
+
+>>> r = %(name)s.rvs(%(shapes)s, size=1000)
+
+And compare the histogram:
+
+>>> ax.hist(r, density=True, bins='auto', histtype='stepfilled', alpha=0.2)
+>>> ax.set_xlim([x[0], x[-1]])
+>>> ax.legend(loc='best', frameon=False)
+>>> plt.show()
+
+"""
+
+_doc_default_locscale = """\
+The probability density above is defined in the "standardized" form. To shift
+and/or scale the distribution use the ``loc`` and ``scale`` parameters.
+Specifically, ``%(name)s.pdf(x, %(shapes)s, loc, scale)`` is identically
+equivalent to ``%(name)s.pdf(y, %(shapes)s) / scale`` with
+``y = (x - loc) / scale``. Note that shifting the location of a distribution
+does not make it a "noncentral" distribution; noncentral generalizations of
+some distributions are available in separate classes.
+"""
+
+_doc_default = ''.join([_doc_default_longsummary,
+                        _doc_allmethods,
+                        '\n',
+                        _doc_default_example])
+
+_doc_default_before_notes = ''.join([_doc_default_longsummary,
+                                     _doc_allmethods])
+
+docdict = {
+    'rvs': _doc_rvs,
+    'pdf': _doc_pdf,
+    'logpdf': _doc_logpdf,
+    'cdf': _doc_cdf,
+    'logcdf': _doc_logcdf,
+    'sf': _doc_sf,
+    'logsf': _doc_logsf,
+    'ppf': _doc_ppf,
+    'isf': _doc_isf,
+    'stats': _doc_stats,
+    'entropy': _doc_entropy,
+    'fit': _doc_fit,
+    'moment': _doc_moment,
+    'expect': _doc_expect,
+    'interval': _doc_interval,
+    'mean': _doc_mean,
+    'std': _doc_std,
+    'var': _doc_var,
+    'median': _doc_median,
+    'allmethods': _doc_allmethods,
+    'longsummary': _doc_default_longsummary,
+    'frozennote': _doc_default_frozen_note,
+    'example': _doc_default_example,
+    'default': _doc_default,
+    'before_notes': _doc_default_before_notes,
+    'after_notes': _doc_default_locscale
+}
+
+# Reuse common content between continuous and discrete docs, change some
+# minor bits.
+# Start from a shallow copy so the continuous `docdict` is left untouched.
+docdict_discrete = docdict.copy()
+
+# Discrete-only entries, plus the discrete flavour of `expect`.
+docdict_discrete['pmf'] = _doc_pmf
+docdict_discrete['logpmf'] = _doc_logpmf
+docdict_discrete['expect'] = _doc_expect_discrete
+# Order matters: it is the order in which the method summaries are joined
+# into the discrete 'allmethods' text below.
+_doc_disc_methods = ['rvs', 'pmf', 'logpmf', 'cdf', 'logcdf', 'sf', 'logsf',
+                     'ppf', 'isf', 'stats', 'entropy', 'expect', 'median',
+                     'mean', 'var', 'std', 'interval']
+# Strip the ``scale`` argument from every listed method signature.
+for obj in _doc_disc_methods:
+    docdict_discrete[obj] = docdict_discrete[obj].replace(', scale=1', '')
+
+# These summaries were written with ``x`` as the variable name; the discrete
+# docs use ``k`` instead.
+_doc_disc_methods_err_varname = ['cdf', 'logcdf', 'sf', 'logsf']
+for obj in _doc_disc_methods_err_varname:
+    docdict_discrete[obj] = docdict_discrete[obj].replace('(x, ', '(k, ')
+
+# The density entries copied from the continuous docdict are dropped.
+docdict_discrete.pop('pdf')
+docdict_discrete.pop('logpdf')
+
+# Rebuild the methods overview from the edited entries (this rebinds the
+# module-level `_doc_allmethods` name that the continuous docs used earlier).
+_doc_allmethods = ''.join([docdict_discrete[obj] for obj in _doc_disc_methods])
+docdict_discrete['allmethods'] = docheaders['methods'] + _doc_allmethods
+
+docdict_discrete['longsummary'] = _doc_default_longsummary.replace(
+    'rv_continuous', 'rv_discrete')
+
+_doc_default_frozen_note = """
+Alternatively, the object may be called (as a function) to fix the shape and
+location parameters returning a "frozen" discrete RV object:
+
+rv = %(name)s(%(shapes)s, loc=0)
+    - Frozen RV object with the same methods but holding the given shape and
+      location fixed.
+"""
+docdict_discrete['frozennote'] = _doc_default_frozen_note
+
+_doc_default_discrete_example = """\
+Examples
+--------
+>>> import numpy as np
+>>> from scipy.stats import %(name)s
+>>> import matplotlib.pyplot as plt
+>>> fig, ax = plt.subplots(1, 1)
+
+Get the support:
+
+%(set_vals_stmt)s
+>>> lb, ub = %(name)s.support(%(shapes)s)
+
+Calculate the first four moments:
+
+>>> mean, var, skew, kurt = %(name)s.stats(%(shapes)s, moments='mvsk')
+
+Display the probability mass function (``pmf``):
+
+>>> x = np.arange(%(name)s.ppf(0.01, %(shapes)s),
+...               %(name)s.ppf(0.99, %(shapes)s))
+>>> ax.plot(x, %(name)s.pmf(x, %(shapes)s), 'bo', ms=8, label='%(name)s pmf')
+>>> ax.vlines(x, 0, %(name)s.pmf(x, %(shapes)s), colors='b', lw=5, alpha=0.5)
+
+Alternatively, the distribution object can be called (as a function)
+to fix the shape and location. This returns a "frozen" RV object holding
+the given parameters fixed.
+
+Freeze the distribution and display the frozen ``pmf``:
+
+>>> rv = %(name)s(%(shapes)s)
+>>> ax.vlines(x, 0, rv.pmf(x), colors='k', linestyles='-', lw=1,
+...         label='frozen pmf')
+>>> ax.legend(loc='best', frameon=False)
+>>> plt.show()
+
+Check accuracy of ``cdf`` and ``ppf``:
+
+>>> prob = %(name)s.cdf(x, %(shapes)s)
+>>> np.allclose(x, %(name)s.ppf(prob, %(shapes)s))
+True
+
+Generate random numbers:
+
+>>> r = %(name)s.rvs(%(shapes)s, size=1000)
+"""
+
+
+_doc_default_discrete_locscale = """\
+The probability mass function above is defined in the "standardized" form.
+To shift distribution use the ``loc`` parameter.
+Specifically, ``%(name)s.pmf(k, %(shapes)s, loc)`` is identically
+equivalent to ``%(name)s.pmf(k - loc, %(shapes)s)``.
+"""
+
+docdict_discrete['example'] = _doc_default_discrete_example
+docdict_discrete['after_notes'] = _doc_default_discrete_locscale
+
+_doc_default_before_notes = ''.join([docdict_discrete['longsummary'],
+                                     docdict_discrete['allmethods']])
+docdict_discrete['before_notes'] = _doc_default_before_notes
+
+_doc_default_disc = ''.join([docdict_discrete['longsummary'],
+                             docdict_discrete['allmethods'],
+                             docdict_discrete['frozennote'],
+                             docdict_discrete['example']])
+docdict_discrete['default'] = _doc_default_disc
+
+# clean up all the separate docstring elements, we do not need them anymore
+for obj in [s for s in dir() if s.startswith('_doc_')]:
+    exec('del ' + obj)
+del obj
+
+
+def _moment(data, n, mu=None):
+    if mu is None:
+        mu = data.mean()
+    return ((data - mu)**n).mean()
+
+
+def _moment_from_stats(n, mu, mu2, g1, g2, moment_func, args):
+    if (n == 0):
+        return 1.0
+    elif (n == 1):
+        if mu is None:
+            val = moment_func(1, *args)
+        else:
+            val = mu
+    elif (n == 2):
+        if mu2 is None or mu is None:
+            val = moment_func(2, *args)
+        else:
+            val = mu2 + mu*mu
+    elif (n == 3):
+        if g1 is None or mu2 is None or mu is None:
+            val = moment_func(3, *args)
+        else:
+            mu3 = g1 * np.power(mu2, 1.5)  # 3rd central moment
+            val = mu3+3*mu*mu2+mu*mu*mu  # 3rd non-central moment
+    elif (n == 4):
+        if g1 is None or g2 is None or mu2 is None or mu is None:
+            val = moment_func(4, *args)
+        else:
+            mu4 = (g2+3.0)*(mu2**2.0)  # 4th central moment
+            mu3 = g1*np.power(mu2, 1.5)  # 3rd central moment
+            val = mu4+4*mu*mu3+6*mu*mu*mu2+mu*mu*mu*mu
+    else:
+        val = moment_func(n, *args)
+
+    return val
+
+
+def _skew(data):
+    """
+    skew is third central moment / variance**(1.5)
+    """
+    data = np.ravel(data)
+    mu = data.mean()
+    m2 = ((data - mu)**2).mean()
+    m3 = ((data - mu)**3).mean()
+    return m3 / np.power(m2, 1.5)
+
+
+def _kurtosis(data):
+    """Fisher's excess kurtosis is fourth central moment / variance**2 - 3."""
+    data = np.ravel(data)
+    mu = data.mean()
+    m2 = ((data - mu)**2).mean()
+    m4 = ((data - mu)**4).mean()
+    return m4 / m2**2 - 3
+
+def _vectorize_rvs_over_shapes(_rvs1):
+    """Decorator that vectorizes _rvs method to work on ndarray shapes.
+
+    Parameters
+    ----------
+    _rvs1 : callable
+        Sampler for one set of scalar shape arguments. The wrapper calls it
+        as ``_rvs1(*scalar_args, size, random_state)``.
+
+    Returns
+    -------
+    _rvs : callable
+        Function with signature ``_rvs(*args, size, random_state)``, where
+        `size` and `random_state` are keyword-only.
+    """
+    # _rvs1 must be a _function_ that accepts _scalar_ args as positional
+    # arguments, followed by `size` and `random_state` (the wrapper below
+    # passes both of them positionally).
+    # _rvs1 must return a random variate array with shape `size`. If `size` is
+    # None, _rvs1 must return a scalar.
+    # When applied to _rvs1, this decorator broadcasts ndarray args
+    # and loops over them, calling _rvs1 for each set of scalar args.
+    # For usage example, see _nchypergeom_gen
+    def _rvs(*args, size, random_state):
+        # NOTE(review): `_check_shape` is defined elsewhere; from its use
+        # here it appears to return the size to request from `_rvs1` and a
+        # per-axis flag marking the output axes filled by `_rvs1` - confirm.
+        _rvs1_size, _rvs1_indices = _check_shape(args[0].shape, size)
+
+        size = np.array(size)
+        _rvs1_size = np.array(_rvs1_size)
+        _rvs1_indices = np.array(_rvs1_indices)
+
+        if np.all(_rvs1_indices):  # all args are scalars
+            return _rvs1(*args, size, random_state)
+
+        out = np.empty(size)
+
+        # out.shape can mix dimensions associated with arg_shape and _rvs1_size
+        # Sort them to arg_shape + _rvs1_size for easy indexing of dimensions
+        # corresponding with the different sets of scalar args
+        j0 = np.arange(out.ndim)
+        j1 = np.hstack((j0[~_rvs1_indices], j0[_rvs1_indices]))
+        out = np.moveaxis(out, j1, j0)
+
+        # One `_rvs1` call per index over the axes not flagged in
+        # `_rvs1_indices`.
+        for i in np.ndindex(*size[~_rvs1_indices]):
+            # arg can be squeezed because singleton dimensions will be
+            # associated with _rvs1_size, not arg_shape per _check_shape
+            out[i] = _rvs1(*[np.squeeze(arg)[i] for arg in args],
+                           _rvs1_size, random_state)
+
+        return np.moveaxis(out, j0, j1)  # move axes back before returning
+    return _rvs
+
+
+def _fit_determine_optimizer(optimizer):
+    if not callable(optimizer) and isinstance(optimizer, str):
+        if not optimizer.startswith('fmin_'):
+            optimizer = "fmin_"+optimizer
+        if optimizer == 'fmin_':
+            optimizer = 'fmin'
+        try:
+            optimizer = getattr(optimize, optimizer)
+        except AttributeError as e:
+            raise ValueError(f"{optimizer} is not a valid optimizer") from e
+    return optimizer
+
+def _isintegral(x):
+    return x == np.round(x)
+
+def _sum_finite(x):
+    """
+    For a 1D array x, return a tuple containing the sum of the
+    finite values of x and the number of nonfinite values.
+
+    This is a utility function used when evaluating the negative
+    loglikelihood for a distribution and an array of samples.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats._distn_infrastructure import _sum_finite
+    >>> tot, nbad = _sum_finite(np.array([-2, -np.inf, 5, 1]))
+    >>> tot
+    4.0
+    >>> nbad
+    1
+    """
+    finite_x = np.isfinite(x)
+    bad_count = finite_x.size - np.count_nonzero(finite_x)
+    return np.sum(x[finite_x]), bad_count
+
+
+# Frozen RV class
+class rv_frozen:
+
+    # generic type compatibility with scipy-stubs
+    __class_getitem__ = classmethod(types.GenericAlias)
+
+    def __init__(self, dist, *args, **kwds):
+        self.args = args
+        self.kwds = kwds
+
+        # create a new instance
+        self.dist = dist.__class__(**dist._updated_ctor_param())
+
+        shapes, _, _ = self.dist._parse_args(*args, **kwds)
+        self.a, self.b = self.dist._get_support(*shapes)
+
+    @property
+    def random_state(self):
+        return self.dist._random_state
+
+    @random_state.setter
+    def random_state(self, seed):
+        self.dist._random_state = check_random_state(seed)
+
+    def cdf(self, x):
+        return self.dist.cdf(x, *self.args, **self.kwds)
+
+    def logcdf(self, x):
+        return self.dist.logcdf(x, *self.args, **self.kwds)
+
+    def ppf(self, q):
+        return self.dist.ppf(q, *self.args, **self.kwds)
+
+    def isf(self, q):
+        return self.dist.isf(q, *self.args, **self.kwds)
+
+    def rvs(self, size=None, random_state=None):
+        kwds = self.kwds.copy()
+        kwds.update({'size': size, 'random_state': random_state})
+        return self.dist.rvs(*self.args, **kwds)
+
+    def sf(self, x):
+        return self.dist.sf(x, *self.args, **self.kwds)
+
+    def logsf(self, x):
+        return self.dist.logsf(x, *self.args, **self.kwds)
+
+    def stats(self, moments='mv'):
+        kwds = self.kwds.copy()
+        kwds.update({'moments': moments})
+        return self.dist.stats(*self.args, **kwds)
+
+    def median(self):
+        return self.dist.median(*self.args, **self.kwds)
+
+    def mean(self):
+        return self.dist.mean(*self.args, **self.kwds)
+
+    def var(self):
+        return self.dist.var(*self.args, **self.kwds)
+
+    def std(self):
+        return self.dist.std(*self.args, **self.kwds)
+
+    def moment(self, order=None):
+        return self.dist.moment(order, *self.args, **self.kwds)
+
+    def entropy(self):
+        return self.dist.entropy(*self.args, **self.kwds)
+
+    def interval(self, confidence=None):
+        return self.dist.interval(confidence, *self.args, **self.kwds)
+
+    def expect(self, func=None, lb=None, ub=None, conditional=False, **kwds):
+        # expect method only accepts shape parameters as positional args
+        # hence convert self.args, self.kwds, also loc/scale
+        # See the .expect method docstrings for the meaning of
+        # other parameters.
+        a, loc, scale = self.dist._parse_args(*self.args, **self.kwds)
+        if isinstance(self.dist, rv_discrete):
+            return self.dist.expect(func, a, loc, lb, ub, conditional, **kwds)
+        else:
+            return self.dist.expect(func, a, loc, scale, lb, ub,
+                                    conditional, **kwds)
+
+    def support(self):
+        return self.dist.support(*self.args, **self.kwds)
+
+
+class rv_discrete_frozen(rv_frozen):
+
+    def pmf(self, k):
+        return self.dist.pmf(k, *self.args, **self.kwds)
+
+    def logpmf(self, k):  # No error
+        return self.dist.logpmf(k, *self.args, **self.kwds)
+
+
+class rv_continuous_frozen(rv_frozen):
+
+    def pdf(self, x):
+        return self.dist.pdf(x, *self.args, **self.kwds)
+
+    def logpdf(self, x):
+        return self.dist.logpdf(x, *self.args, **self.kwds)
+
+
+def argsreduce(cond, *args):
+    """Clean arguments to:
+
+    1. Ensure all arguments are iterable (arrays of dimension at least one
+    2. If cond != True and size > 1, ravel(args[i]) where ravel(condition) is
+       True, in 1D.
+
+    Return list of processed arguments.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats._distn_infrastructure import argsreduce
+    >>> rng = np.random.default_rng()
+    >>> A = rng.random((4, 5))
+    >>> B = 2
+    >>> C = rng.random((1, 5))
+    >>> cond = np.ones(A.shape)
+    >>> [A1, B1, C1] = argsreduce(cond, A, B, C)
+    >>> A1.shape
+    (4, 5)
+    >>> B1.shape
+    (1,)
+    >>> C1.shape
+    (1, 5)
+    >>> cond[2,:] = 0
+    >>> [A1, B1, C1] = argsreduce(cond, A, B, C)
+    >>> A1.shape
+    (15,)
+    >>> B1.shape
+    (1,)
+    >>> C1.shape
+    (15,)
+
+    """
+    # some distributions assume arguments are iterable.
+    newargs = np.atleast_1d(*args)
+
+    # np.atleast_1d returns an array if only one argument, or a list of arrays
+    # if more than one argument.
+    if not isinstance(newargs, (list | tuple)):
+        newargs = (newargs,)
+
+    if np.all(cond):
+        # broadcast arrays with cond
+        *newargs, cond = np.broadcast_arrays(*newargs, cond)
+        return [arg.ravel() for arg in newargs]
+
+    s = cond.shape
+    # np.extract returns flattened arrays, which are not broadcastable together
+    # unless they are either the same size or size == 1.
+    return [(arg if np.size(arg) == 1
+            else np.extract(cond, np.broadcast_to(arg, s)))
+            for arg in newargs]
+
+
+# Source-code template for the three argument-parsing methods that every
+# distribution instance gets. `rv_generic._construct_argparser` fills in the
+# %(shape_arg_str)s, %(locscale_in)s and %(locscale_out)s placeholders, and
+# `rv_generic._attach_argparser_methods` exec's the result and binds the
+# functions to the instance.
+parse_arg_template = """
+def _parse_args(self, %(shape_arg_str)s %(locscale_in)s):
+    return (%(shape_arg_str)s), %(locscale_out)s
+
+def _parse_args_rvs(self, %(shape_arg_str)s %(locscale_in)s, size=None):
+    return self._argcheck_rvs(%(shape_arg_str)s %(locscale_out)s, size=size)
+
+def _parse_args_stats(self, %(shape_arg_str)s %(locscale_in)s, moments='mv'):
+    return (%(shape_arg_str)s), %(locscale_out)s, moments
+"""
+
+
+class rv_generic:
+    """Class which encapsulates common functionality between rv_discrete
+    and rv_continuous.
+
+    """
+
+    def __init__(self, seed=None):
+        super().__init__()
+
+        # figure out if _stats signature has 'moments' keyword
+        sig = _getfullargspec(self._stats)
+        self._stats_has_moments = ((sig.varkw is not None) or
+                                   ('moments' in sig.args) or
+                                   ('moments' in sig.kwonlyargs))
+        self._random_state = check_random_state(seed)
+
+    @property
+    def random_state(self):
+        """Get or set the generator object for generating random variates.
+
+        If `random_state` is None (or `np.random`), the
+        `numpy.random.RandomState` singleton is used.
+        If `random_state` is an int, a new ``RandomState`` instance is used,
+        seeded with `random_state`.
+        If `random_state` is already a ``Generator`` or ``RandomState``
+        instance, that instance is used.
+
+        """
+        return self._random_state
+
+    @random_state.setter
+    def random_state(self, seed):
+        self._random_state = check_random_state(seed)
+
+    def __setstate__(self, state):
+        # Restore the instance from pickled `state`. The first attempt treats
+        # `state` as the instance ``__dict__``; if that raises ValueError the
+        # state is read as the old ``(_ctor_param, random_state)`` layout.
+        try:
+            self.__dict__.update(state)
+            # attaches the dynamically created methods on each instance.
+            # if a subclass overrides rv_generic.__setstate__, or implements
+            # its own _attach_methods, then it must make sure that
+            # _attach_argparser_methods is called.
+            self._attach_methods()
+        except ValueError:
+            # reconstitute an old pickle scipy<1.6, that contains
+            # (_ctor_param, random_state) as state
+            self._ctor_param = state[0]
+            self._random_state = state[1]
+            # NOTE(review): __init__ is re-run without arguments here;
+            # presumably subclasses rebuild themselves from _ctor_param -
+            # confirm against the subclass __init__ implementations.
+            self.__init__()
+
+    def _attach_methods(self):
+        """Attaches dynamically created methods to the rv_* instance.
+
+        This method must be overridden by subclasses, and must itself call
+         _attach_argparser_methods. This method is called in __init__ in
+         subclasses, and in __setstate__
+        """
+        raise NotImplementedError
+
+    def _attach_argparser_methods(self):
+        """
+        Generates the argument-parsing functions dynamically and attaches
+        them to the instance.
+
+        Should be called from `_attach_methods`, typically in __init__ and
+        during unpickling (__setstate__)
+        """
+        ns = {}
+        exec(self._parse_arg_template, ns)
+        # NB: attach to the instance, not class
+        for name in ['_parse_args', '_parse_args_stats', '_parse_args_rvs']:
+            setattr(self, name, types.MethodType(ns[name], self))
+
+    def _construct_argparser(
+            self, meths_to_inspect, locscale_in, locscale_out):
+        """Construct the parser string for the shape arguments.
+
+        This method should be called in __init__ of a class for each
+        distribution. It creates the `_parse_arg_template` attribute that is
+        then used by `_attach_argparser_methods` to dynamically create and
+        attach the `_parse_args`, `_parse_args_stats`, `_parse_args_rvs`
+        methods to the instance.
+
+        If self.shapes is a non-empty string, interprets it as a
+        comma-separated list of shape parameters.
+
+        Otherwise inspects the call signatures of `meths_to_inspect`
+        and constructs the argument-parsing functions from these.
+        In this case also sets `shapes` and `numargs`.
+
+        Parameters
+        ----------
+        meths_to_inspect : iterable of callables
+            Methods whose signatures are inspected when `self.shapes` is
+            empty.
+        locscale_in : str
+            Text substituted for ``%(locscale_in)s`` in `parse_arg_template`
+            (the parameter list of the generated functions).
+        locscale_out : str
+            Text substituted for ``%(locscale_out)s`` in `parse_arg_template`
+            (the values the generated functions return).
+
+        Raises
+        ------
+        TypeError
+            If `self.shapes` is truthy but not a string, or if the inspected
+            signatures use ``*args``, ``**kwds``, keyword-only arguments or
+            defaults, or disagree with each other.
+        SyntaxError
+            If a name in `self.shapes` is a Python keyword or not a valid
+            identifier.
+        """
+
+        if self.shapes:
+            # sanitize the user-supplied shapes
+            if not isinstance(self.shapes, str):
+                raise TypeError('shapes must be a string.')
+
+            # commas and whitespace are both accepted as separators
+            shapes = self.shapes.replace(',', ' ').split()
+
+            # the names end up in exec'd source, so they must be plain
+            # identifiers
+            for field in shapes:
+                if keyword.iskeyword(field):
+                    raise SyntaxError('keywords cannot be used as shapes.')
+                if not re.match('^[_a-zA-Z][_a-zA-Z0-9]*$', field):
+                    raise SyntaxError(
+                        'shapes must be valid python identifiers')
+        else:
+            # find out the call signatures (_pdf, _cdf etc), deduce shape
+            # arguments. Generic methods only have 'self, x', any further args
+            # are shapes.
+            shapes_list = []
+            for meth in meths_to_inspect:
+                shapes_args = _getfullargspec(meth)  # NB does not contain self
+                args = shapes_args.args[1:]       # peel off 'x', too
+
+                # the checks below only run for methods that declare at
+                # least one argument after 'x'
+                if args:
+                    shapes_list.append(args)
+
+                    # *args or **kwargs are not allowed w/automatic shapes
+                    if shapes_args.varargs is not None:
+                        raise TypeError(
+                            '*args are not allowed w/out explicit shapes')
+                    if shapes_args.varkw is not None:
+                        raise TypeError(
+                            '**kwds are not allowed w/out explicit shapes')
+                    if shapes_args.kwonlyargs:
+                        raise TypeError(
+                            'kwonly args are not allowed w/out explicit shapes')
+                    if shapes_args.defaults is not None:
+                        raise TypeError('defaults are not allowed for shapes')
+
+            if shapes_list:
+                shapes = shapes_list[0]
+
+                # make sure the signatures are consistent
+                for item in shapes_list:
+                    if item != shapes:
+                        raise TypeError('Shape arguments are inconsistent.')
+            else:
+                shapes = []
+
+        # have the arguments, construct the method from template
+        # (the trailing ', ' separates the shapes from the loc/scale text)
+        shapes_str = ', '.join(shapes) + ', ' if shapes else ''  # NB: not None
+        dct = dict(shape_arg_str=shapes_str,
+                   locscale_in=locscale_in,
+                   locscale_out=locscale_out,
+                   )
+
+        # this string is used by _attach_argparser_methods
+        self._parse_arg_template = parse_arg_template % dct
+
+        # normalise `shapes` to a ', '-joined string, or None when empty
+        self.shapes = ', '.join(shapes) if shapes else None
+        if not hasattr(self, 'numargs'):
+            # allows more general subclassing with *args
+            self.numargs = len(shapes)
+
+    def _construct_doc(self, docdict, shapes_vals=None):
+        """Construct the instance docstring with string substitutions.
+
+        Parameters
+        ----------
+        docdict : dict
+            Template fragments keyed by placeholder name; a copy is extended
+            with 'name', 'shapes', 'shapes_', 'vals' and 'set_vals_stmt'.
+        shapes_vals : sequence, optional
+            Example values for the shape parameters, used to fill 'vals'.
+
+        Raises
+        ------
+        Exception
+            If ``doccer.docformat`` raises TypeError during substitution.
+        """
+        if sys.flags.optimize > 1:
+            # if run with -OO, docstrings are stripped
+            # see https://docs.python.org/3/using/cmdline.html#cmdoption-OO
+            return
+        tempdict = docdict.copy()
+        tempdict['name'] = self.name or 'distname'
+        tempdict['shapes'] = self.shapes or ''
+
+        if shapes_vals is None:
+            shapes_vals = ()
+        # values that do not support the '.3g' format fall back to plain
+        # formatting
+        try:
+            vals = ', '.join(f'{val:.3g}' for val in shapes_vals)
+        except TypeError:
+            vals = ', '.join(f'{val}' for val in shapes_vals)
+        tempdict['vals'] = vals
+
+        # 'shapes_' is the form used inside ``args=(...)``; a single shape
+        # gets a trailing comma so the text reads as a one-element tuple
+        tempdict['shapes_'] = self.shapes or ''
+        if self.shapes and self.numargs == 1:
+            tempdict['shapes_'] += ','
+
+        if self.shapes:
+            tempdict['set_vals_stmt'] = f'>>> {self.shapes} = {vals}'
+        else:
+            tempdict['set_vals_stmt'] = ''
+
+        if self.shapes is None:
+            # remove shapes from call parameters if there are none
+            for item in ['default', 'before_notes']:
+                tempdict[item] = tempdict[item].replace(
+                    "\n%(shapes)s : array_like\n    shape parameters", "")
+        # Two substitution passes: presumably the second one fills the
+        # placeholders contained in the fragments inserted by the first.
+        for i in range(2):
+            if self.shapes is None:
+                # necessary because we use %(shapes)s in two forms (w w/o ", ")
+                self.__doc__ = self.__doc__.replace("%(shapes)s, ", "")
+            try:
+                self.__doc__ = doccer.docformat(self.__doc__, tempdict)
+            except TypeError as e:
+                raise Exception("Unable to construct docstring for "
+                                f"distribution \"{self.name}\": {repr(e)}") from e
+
+        # correct for empty shapes
+        self.__doc__ = self.__doc__.replace('(, ', '(').replace(', )', ')')
+
+    def _construct_default_doc(self, longname=None,
+                               docdict=None, discrete='continuous'):
+        """Construct instance docstring from the default template."""
+        if sys.flags.optimize > 1:
+            # if run with -OO, docstrings are stripped
+            # see https://docs.python.org/3/using/cmdline.html#cmdoption-OO
+            return
+        if longname is None:
+            longname = 'A'
+        self.__doc__ = ''.join([f'{longname} {discrete} random variable.',
+                                '\n\n%(before_notes)s\n', docheaders['notes'],
+                                '\n%(example)s'])
+        self._construct_doc(docdict)
+
+    def freeze(self, *args, **kwds):
+        """Freeze the distribution for the given arguments.
+
+        Parameters
+        ----------
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution.  Should include all
+            the non-optional arguments, may include ``loc`` and ``scale``.
+
+        Returns
+        -------
+        rv_frozen : rv_frozen instance
+            The frozen distribution.
+
+        """
+        if isinstance(self, rv_continuous):
+            return rv_continuous_frozen(self, *args, **kwds)
+        else:
+            return rv_discrete_frozen(self, *args, **kwds)
+
+    # Calling the distribution object is shorthand for `freeze`.
+    def __call__(self, *args, **kwds):
+        return self.freeze(*args, **kwds)
+    # Reuse the docstring of `freeze` for the call form.
+    __call__.__doc__ = freeze.__doc__
+
+    # The actual calculation functions (no basic checking need be done)
+    # If these are defined, the others won't be looked at.
+    # Otherwise, the other set can be defined.
+    def _stats(self, *args, **kwds):
+        return None, None, None, None
+
+    # Noncentral moments (also known as the moment about the origin).
+    # Expressed in LaTeX, munp would be $\mu'_{n}$, i.e. "mu-sub-n-prime".
+    # The primed mu is a widely used notation for the noncentral moment.
+    def _munp(self, n, *args):
+        # Silence floating point warnings from integration.
+        with np.errstate(all='ignore'):
+            vals = self.generic_moment(n, *args)
+        return vals
+
+    def _argcheck_rvs(self, *args, **kwargs):
+        """Broadcast the `rvs` parameters and validate `size` against them.
+
+        Parameters
+        ----------
+        *args
+            The shape parameters (if any) followed by the location and the
+            scale, as a flat tuple.
+        size : int or tuple of ints, optional
+            Requested output shape; the only keyword that is read.
+
+        Returns
+        -------
+        param_bcast : list of ndarray
+            Broadcast shape parameters (all but the last two arrays).
+        loc_bcast, scale_bcast : ndarray
+            Broadcast location and scale.
+        size_ : tuple
+            `size` as a tuple, left-padded with 1s to the number of
+            dimensions of the broadcast parameters; the broadcast shape
+            itself when `size` is None.
+
+        Raises
+        ------
+        ValueError
+            If `size` is incompatible with the broadcast parameter shape.
+        """
+        # Handle broadcasting and size validation of the rvs method.
+        # Subclasses should not have to override this method.
+        # The rule is that if `size` is not None, then `size` gives the
+        # shape of the result (integer values of `size` are treated as
+        # tuples with length 1; i.e. `size=3` is the same as `size=(3,)`.)
+        #
+        # `args` is expected to contain the shape parameters (if any), the
+        # location and the scale in a flat tuple (e.g. if there are two
+        # shape parameters `a` and `b`, `args` will be `(a, b, loc, scale)`).
+        # The only keyword argument expected is 'size'.
+        size = kwargs.get('size', None)
+        all_bcast = np.broadcast_arrays(*args)
+
+        def squeeze_left(a):
+            # strip leading length-1 axes one at a time
+            while a.ndim > 0 and a.shape[0] == 1:
+                a = a[0]
+            return a
+
+        # Eliminate trivial leading dimensions.  In the convention
+        # used by numpy's random variate generators, trivial leading
+        # dimensions are effectively ignored.  In other words, when `size`
+        # is given, trivial leading dimensions of the broadcast parameters
+        # in excess of the number of dimensions  in size are ignored, e.g.
+        #   >>> np.random.normal([[1, 3, 5]], [[[[0.01]]]], size=3)
+        #   array([ 1.00104267,  3.00422496,  4.99799278])
+        # If `size` is not given, the exact broadcast shape is preserved:
+        #   >>> np.random.normal([[1, 3, 5]], [[[[0.01]]]])
+        #   array([[[[ 1.00862899,  3.00061431,  4.99867122]]]])
+        #
+        all_bcast = [squeeze_left(a) for a in all_bcast]
+        # every array shares the same broadcast shape, so the first one is
+        # representative
+        bcast_shape = all_bcast[0].shape
+        bcast_ndim = all_bcast[0].ndim
+
+        if size is None:
+            size_ = bcast_shape
+        else:
+            size_ = tuple(np.atleast_1d(size))
+
+        # Check compatibility of size_ with the broadcast shape of all
+        # the parameters.  This check is intended to be consistent with
+        # how the numpy random variate generators (e.g. np.random.normal,
+        # np.random.beta) handle their arguments.   The rule is that, if size
+        # is given, it determines the shape of the output.  Broadcasting
+        # can't change the output size.
+
+        # This is the standard broadcasting convention of extending the
+        # shape with fewer dimensions with enough dimensions of length 1
+        # so that the two shapes have the same number of dimensions.
+        ndiff = bcast_ndim - len(size_)
+        if ndiff < 0:
+            bcast_shape = (1,)*(-ndiff) + bcast_shape
+        elif ndiff > 0:
+            size_ = (1,)*ndiff + size_
+
+        # This compatibility test is not standard.  In "regular" broadcasting,
+        # two shapes are compatible if for each dimension, the lengths are the
+        # same or one of the lengths is 1.  Here, each dimension of
+        # bcast_shape must either have length 1 or have exactly the
+        # corresponding length in size_.
+        ok = all([bcdim == 1 or bcdim == szdim
+                  for (bcdim, szdim) in zip(bcast_shape, size_)])
+        if not ok:
+            raise ValueError("size does not match the broadcast shape of "
+                             f"the parameters. {size}, {size_}, {bcast_shape}")
+
+        # the last two entries are loc and scale; anything before them is a
+        # shape parameter
+        param_bcast = all_bcast[:-2]
+        loc_bcast = all_bcast[-2]
+        scale_bcast = all_bcast[-1]
+
+        return param_bcast, loc_bcast, scale_bcast, size_
+
+    # These are the methods you must define (standard form functions)
+    # NB: generic _pdf, _logpdf, _cdf are different for
+    # rv_continuous and rv_discrete hence are defined in there
+    def _argcheck(self, *args):
+        """Default check for correct values on args and keywords.
+
+        Returns condition array of 1's where arguments are correct and
+         0's where they are not.
+
+        """
+        cond = 1
+        for arg in args:
+            cond = logical_and(cond, (asarray(arg) > 0))
+        return cond
+
+    def _get_support(self, *args, **kwargs):
+        """Return the support of the (unscaled, unshifted) distribution.
+
+        *Must* be overridden by distributions which have support dependent
+        upon the shape parameters of the distribution.  Any such override
+        *must not* set or change any of the class members, as these members
+        are shared amongst all instances of the distribution.
+
+        Parameters
+        ----------
+        arg1, arg2, ... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information).
+
+        Returns
+        -------
+        a, b : numeric (float, or int or +/-np.inf)
+            end-points of the distribution's support for the specified
+            shape parameters.
+        """
+        return self.a, self.b
+
+    def _support_mask(self, x, *args):
+        a, b = self._get_support(*args)
+        with np.errstate(invalid='ignore'):
+            return (a <= x) & (x <= b)
+
+    def _open_support_mask(self, x, *args):
+        a, b = self._get_support(*args)
+        with np.errstate(invalid='ignore'):
+            return (a < x) & (x < b)
+
+    def _rvs(self, *args, size=None, random_state=None):
+        # This method must handle size being a tuple, and it must
+        # properly broadcast *args and size.  size might be
+        # an empty tuple, which means a scalar random variate is to be
+        # generated.
+
+        # Use basic inverse cdf algorithm for RV generation as default.
+        U = random_state.uniform(size=size)
+        Y = self._ppf(U, *args)
+        return Y
+
+    def _logcdf(self, x, *args):
+        with np.errstate(divide='ignore'):
+            return log(self._cdf(x, *args))
+
+    def _sf(self, x, *args):
+        return 1.0-self._cdf(x, *args)
+
+    def _logsf(self, x, *args):
+        with np.errstate(divide='ignore'):
+            return log(self._sf(x, *args))
+
+    def _ppf(self, q, *args):
+        return self._ppfvec(q, *args)
+
+    def _isf(self, q, *args):
+        return self._ppf(1.0-q, *args)  # use correct _ppf for subclasses
+
+    # These are actually called, and should not be overwritten if you
+    # want to keep error checking.
+    def rvs(self, *args, **kwds):
+        """Random variates of given type.
+
+        Parameters
+        ----------
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information).
+        loc : array_like, optional
+            Location parameter (default=0).
+        scale : array_like, optional
+            Scale parameter (default=1).
+        size : int or tuple of ints, optional
+            Defining number of random variates (default is 1).
+        random_state : {None, int, `numpy.random.Generator`,
+                        `numpy.random.RandomState`}, optional
+
+            If `random_state` is None (or `np.random`), the
+            `numpy.random.RandomState` singleton is used.
+            If `random_state` is an int, a new ``RandomState`` instance is
+            used, seeded with `random_state`.
+            If `random_state` is already a ``Generator`` or ``RandomState``
+            instance, that instance is used.
+
+        Returns
+        -------
+        rvs : ndarray or scalar
+            Random variates of given `size`.
+
+        """
+        discrete = kwds.pop('discrete', None)
+        rndm = kwds.pop('random_state', None)
+        args, loc, scale, size = self._parse_args_rvs(*args, **kwds)
+        cond = logical_and(self._argcheck(*args), (scale >= 0))
+        if not np.all(cond):
+            message = ("Domain error in arguments. The `scale` parameter must "
+                       "be positive for all distributions, and many "
+                       "distributions have restrictions on shape parameters. "
+                       f"Please see the `scipy.stats.{self.name}` "
+                       "documentation for details.")
+            raise ValueError(message)
+
+        if np.all(scale == 0):
+            return loc*ones(size, 'd')
+
+        # extra gymnastics needed for a custom random_state
+        if rndm is not None:
+            random_state_saved = self._random_state
+            random_state = check_random_state(rndm)
+        else:
+            random_state = self._random_state
+
+        vals = self._rvs(*args, size=size, random_state=random_state)
+
+        vals = vals * scale + loc
+
+        # do not forget to restore the _random_state
+        if rndm is not None:
+            self._random_state = random_state_saved
+
+        # Cast to int if discrete
+        if discrete and not isinstance(self, rv_sample):
+            if size == ():
+                vals = int(vals)
+            else:
+                vals = vals.astype(np.int64)
+
+        return vals
+
+    def stats(self, *args, **kwds):
+        """Some statistics of the given RV.
+
+        Parameters
+        ----------
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information)
+        loc : array_like, optional
+            location parameter (default=0)
+        scale : array_like, optional (continuous RVs only)
+            scale parameter (default=1)
+        moments : str, optional
+            composed of letters ['mvsk'] defining which moments to compute:
+            'm' = mean,
+            'v' = variance,
+            's' = (Fisher's) skew,
+            'k' = (Fisher's) kurtosis.
+            (default is 'mv')
+
+        Returns
+        -------
+        stats : sequence
+            of requested moments.  When exactly one moment is produced it is
+            returned on its own rather than wrapped in a tuple.  Entries whose
+            parameters are invalid hold ``self.badvalue``.
+
+        """
+        args, loc, scale, moments = self._parse_args_stats(*args, **kwds)
+        # scale = 1 by construction for discrete RVs
+        loc, scale = map(asarray, (loc, scale))
+        args = tuple(map(asarray, args))
+        # Valid entries need shape parameters accepted by _argcheck, a
+        # strictly positive scale, and a non-NaN loc (``loc == loc`` is False
+        # only where loc is NaN).
+        cond = self._argcheck(*args) & (scale > 0) & (loc == loc)
+        output = []
+        # Template result: badvalue everywhere, overwritten where cond holds.
+        default = np.full(shape(cond), fill_value=self.badvalue)
+
+        # Use only entries that are valid in calculation
+        if np.any(cond):
+            goodargs = argsreduce(cond, *(args+(scale, loc)))
+            scale, loc, goodargs = goodargs[-2], goodargs[-1], goodargs[:-2]
+
+            # _stats may return None for any of mu, mu2, g1, g2; each missing
+            # quantity is rebuilt below from the non-central moments that
+            # _munp provides.
+            if self._stats_has_moments:
+                mu, mu2, g1, g2 = self._stats(*goodargs,
+                                              **{'moments': moments})
+            else:
+                mu, mu2, g1, g2 = self._stats(*goodargs)
+
+            if 'm' in moments:
+                if mu is None:
+                    mu = self._munp(1, *goodargs)
+                out0 = default.copy()
+                # mean of loc + scale*Y
+                place(out0, cond, mu * scale + loc)
+                output.append(out0)
+
+            if 'v' in moments:
+                if mu2 is None:
+                    mu2p = self._munp(2, *goodargs)
+                    if mu is None:
+                        mu = self._munp(1, *goodargs)
+                    # if mean is inf then var is also inf
+                    with np.errstate(invalid='ignore'):
+                        mu2 = np.where(~np.isinf(mu), mu2p - mu**2, np.inf)
+                out0 = default.copy()
+                # variance scales with scale**2 and does not depend on loc
+                place(out0, cond, mu2 * scale * scale)
+                output.append(out0)
+
+            if 's' in moments:
+                if g1 is None:
+                    mu3p = self._munp(3, *goodargs)
+                    if mu is None:
+                        mu = self._munp(1, *goodargs)
+                    if mu2 is None:
+                        mu2p = self._munp(2, *goodargs)
+                        with np.errstate(invalid='ignore'):
+                            mu2 = mu2p - mu * mu
+                    with np.errstate(invalid='ignore'):
+                        # third central moment: mu3p - 3*mu*mu2 - mu**3
+                        mu3 = (-mu*mu - 3*mu2)*mu + mu3p
+                        g1 = mu3 / np.power(mu2, 1.5)
+                out0 = default.copy()
+                # skewness is written out without any loc/scale adjustment
+                place(out0, cond, g1)
+                output.append(out0)
+
+            if 'k' in moments:
+                if g2 is None:
+                    mu4p = self._munp(4, *goodargs)
+                    if mu is None:
+                        mu = self._munp(1, *goodargs)
+                    if mu2 is None:
+                        mu2p = self._munp(2, *goodargs)
+                        with np.errstate(invalid='ignore'):
+                            mu2 = mu2p - mu * mu
+                    # Recover the third central moment either from a known
+                    # skewness or, failing that, from the raw third moment.
+                    if g1 is None:
+                        mu3 = None
+                    else:
+                        # (mu2**1.5) breaks down for nan and inf
+                        mu3 = g1 * np.power(mu2, 1.5)
+                    if mu3 is None:
+                        mu3p = self._munp(3, *goodargs)
+                        with np.errstate(invalid='ignore'):
+                            mu3 = (-mu * mu - 3 * mu2) * mu + mu3p
+                    with np.errstate(invalid='ignore'):
+                        # fourth central moment:
+                        # mu4p - 4*mu*mu3 - 6*mu**2*mu2 - mu**4
+                        mu4 = ((-mu**2 - 6*mu2) * mu - 4*mu3)*mu + mu4p
+                        # subtracting 3 gives the excess (Fisher) kurtosis
+                        g2 = mu4 / mu2**2.0 - 3.0
+                out0 = default.copy()
+                place(out0, cond, g2)
+                output.append(out0)
+        else:  # no valid args
+            # one badvalue-filled array per character in `moments`
+            output = [default.copy() for _ in moments]
+
+        # Indexing with () turns 0-d arrays into scalars and leaves arrays of
+        # higher dimension unchanged.
+        output = [out[()] for out in output]
+        if len(output) == 1:
+            return output[0]
+        else:
+            return tuple(output)
+
+    def entropy(self, *args, **kwds):
+        """Differential entropy of the RV.
+
+        Parameters
+        ----------
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information).
+        loc : array_like, optional
+            Location parameter (default=0).
+        scale : array_like, optional  (continuous distributions only).
+            Scale parameter (default=1).
+
+        Notes
+        -----
+        Entropy is defined base `e`:
+
+        >>> import numpy as np
+        >>> from scipy.stats._distn_infrastructure import rv_discrete
+        >>> drv = rv_discrete(values=((0, 1), (0.5, 0.5)))
+        >>> np.allclose(drv.entropy(), np.log(2.0))
+        True
+
+        """
+        args, loc, scale = self._parse_args(*args, **kwds)
+        # NB: for discrete distributions scale=1 by construction in _parse_args
+        loc, scale = map(asarray, (loc, scale))
+        args = tuple(map(asarray, args))
+        cond0 = self._argcheck(*args) & (scale > 0) & (loc == loc)
+        output = zeros(shape(cond0), 'd')
+        place(output, (1-cond0), self.badvalue)
+        goodargs = argsreduce(cond0, scale, *args)
+        goodscale = goodargs[0]
+        goodargs = goodargs[1:]
+        place(output, cond0, self.vecentropy(*goodargs) + log(goodscale))
+        return output[()]
+
+    def moment(self, order, *args, **kwds):
+        """non-central moment of distribution of specified order.
+
+        Parameters
+        ----------
+        order : int, order >= 1
+            Order of moment.
+        arg1, arg2, arg3,... : float
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information).
+        loc : array_like, optional
+            location parameter (default=0)
+        scale : array_like, optional
+            scale parameter (default=1)
+
+        """
+        n = order
+        shapes, loc, scale = self._parse_args(*args, **kwds)
+        args = np.broadcast_arrays(*(*shapes, loc, scale))
+        *shapes, loc, scale = args
+
+        i0 = np.logical_and(self._argcheck(*shapes), scale > 0)
+        i1 = np.logical_and(i0, loc == 0)
+        i2 = np.logical_and(i0, loc != 0)
+
+        args = argsreduce(i0, *shapes, loc, scale)
+        *shapes, loc, scale = args
+
+        if (floor(n) != n):
+            raise ValueError("Moment must be an integer.")
+        if (n < 0):
+            raise ValueError("Moment must be positive.")
+        mu, mu2, g1, g2 = None, None, None, None
+        if (n > 0) and (n < 5):
+            if self._stats_has_moments:
+                mdict = {'moments': {1: 'm', 2: 'v', 3: 'vs', 4: 'mvsk'}[n]}
+            else:
+                mdict = {}
+            mu, mu2, g1, g2 = self._stats(*shapes, **mdict)
+        val = np.empty(loc.shape)  # val needs to be indexed by loc
+        val[...] = _moment_from_stats(n, mu, mu2, g1, g2, self._munp, shapes)
+
+        # Convert to transformed  X = L + S*Y
+        # E[X^n] = E[(L+S*Y)^n] = L^n sum(comb(n, k)*(S/L)^k E[Y^k], k=0...n)
+        result = zeros(i0.shape)
+        place(result, ~i0, self.badvalue)
+
+        if i1.any():
+            res1 = scale[loc == 0]**n * val[loc == 0]
+            place(result, i1, res1)
+
+        if i2.any():
+            mom = [mu, mu2, g1, g2]
+            arrs = [i for i in mom if i is not None]
+            idx = [i for i in range(4) if mom[i] is not None]
+            if any(idx):
+                arrs = argsreduce(loc != 0, *arrs)
+                j = 0
+                for i in idx:
+                    mom[i] = arrs[j]
+                    j += 1
+            mu, mu2, g1, g2 = mom
+            args = argsreduce(loc != 0, *shapes, loc, scale, val)
+            *shapes, loc, scale, val = args
+
+            res2 = zeros(loc.shape, dtype='d')
+            fac = scale / loc
+            for k in range(n):
+                valk = _moment_from_stats(k, mu, mu2, g1, g2, self._munp,
+                                          shapes)
+                res2 += comb(n, k, exact=True)*fac**k * valk
+            res2 += fac**n * val
+            res2 *= loc**n
+            place(result, i2, res2)
+
+        return result[()]
+
+    def median(self, *args, **kwds):
+        """Median of the distribution.
+
+        Parameters
+        ----------
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information)
+        loc : array_like, optional
+            Location parameter, Default is 0.
+        scale : array_like, optional
+            Scale parameter, Default is 1.
+
+        Returns
+        -------
+        median : float
+            The median of the distribution.
+
+        See Also
+        --------
+        rv_discrete.ppf
+            Inverse of the CDF
+
+        """
+        return self.ppf(0.5, *args, **kwds)
+
+    def mean(self, *args, **kwds):
+        """Mean of the distribution.
+
+        Parameters
+        ----------
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information)
+        loc : array_like, optional
+            location parameter (default=0)
+        scale : array_like, optional
+            scale parameter (default=1)
+
+        Returns
+        -------
+        mean : float
+            the mean of the distribution
+
+        """
+        kwds['moments'] = 'm'
+        res = self.stats(*args, **kwds)
+        if isinstance(res, ndarray) and res.ndim == 0:
+            return res[()]
+        return res
+
+    def var(self, *args, **kwds):
+        """Variance of the distribution.
+
+        Parameters
+        ----------
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information)
+        loc : array_like, optional
+            location parameter (default=0)
+        scale : array_like, optional
+            scale parameter (default=1)
+
+        Returns
+        -------
+        var : float
+            the variance of the distribution
+
+        """
+        kwds['moments'] = 'v'
+        res = self.stats(*args, **kwds)
+        if isinstance(res, ndarray) and res.ndim == 0:
+            return res[()]
+        return res
+
+    def std(self, *args, **kwds):
+        """Standard deviation of the distribution.
+
+        Parameters
+        ----------
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information)
+        loc : array_like, optional
+            location parameter (default=0)
+        scale : array_like, optional
+            scale parameter (default=1)
+
+        Returns
+        -------
+        std : float
+            standard deviation of the distribution
+
+        """
+        kwds['moments'] = 'v'
+        res = sqrt(self.stats(*args, **kwds))
+        return res
+
+    def interval(self, confidence, *args, **kwds):
+        """Confidence interval with equal areas around the median.
+
+        Parameters
+        ----------
+        confidence : array_like of float
+            Probability that an rv will be drawn from the returned range.
+            Each value should be in the range [0, 1].
+        arg1, arg2, ... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information).
+        loc : array_like, optional
+            location parameter, Default is 0.
+        scale : array_like, optional
+            scale parameter, Default is 1.
+
+        Returns
+        -------
+        a, b : ndarray of float
+            end-points of range that contain ``100 * alpha %`` of the rv's
+            possible values.
+
+        Notes
+        -----
+        This is implemented as ``ppf([p_tail, 1-p_tail])``, where
+        ``ppf`` is the inverse cumulative distribution function and
+        ``p_tail = (1-confidence)/2``. Suppose ``[c, d]`` is the support of a
+        discrete distribution; then ``ppf([0, 1]) == (c-1, d)``. Therefore,
+        when ``confidence=1`` and the distribution is discrete, the left end
+        of the interval will be beyond the support of the distribution.
+        For discrete distributions, the interval will limit the probability
+        in each tail to be less than or equal to ``p_tail`` (usually
+        strictly less).
+
+        """
+        alpha = confidence
+
+        alpha = asarray(alpha)
+        if np.any((alpha > 1) | (alpha < 0)):
+            raise ValueError("alpha must be between 0 and 1 inclusive")
+        q1 = (1.0-alpha)/2
+        q2 = (1.0+alpha)/2
+        a = self.ppf(q1, *args, **kwds)
+        b = self.ppf(q2, *args, **kwds)
+        return a, b
+
+    def support(self, *args, **kwargs):
+        """Support of the distribution.
+
+        Parameters
+        ----------
+        arg1, arg2, ... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information).
+        loc : array_like, optional
+            location parameter, Default is 0.
+        scale : array_like, optional
+            scale parameter, Default is 1.
+
+        Returns
+        -------
+        a, b : array_like
+            end-points of the distribution's support.
+
+        """
+        args, loc, scale = self._parse_args(*args, **kwargs)
+        arrs = np.broadcast_arrays(*args, loc, scale)
+        args, loc, scale = arrs[:-2], arrs[-2], arrs[-1]
+        cond = self._argcheck(*args) & (scale > 0)
+        _a, _b = self._get_support(*args)
+        if cond.all():
+            return _a * scale + loc, _b * scale + loc
+        elif cond.ndim == 0:
+            return self.badvalue, self.badvalue
+        # promote bounds to at least float to fill in the badvalue
+        _a, _b = np.asarray(_a).astype('d'), np.asarray(_b).astype('d')
+        out_a, out_b = _a * scale + loc, _b * scale + loc
+        place(out_a, 1-cond, self.badvalue)
+        place(out_b, 1-cond, self.badvalue)
+        return out_a, out_b
+
+    def nnlf(self, theta, x):
+        """Negative loglikelihood function.
+        Notes
+        -----
+        This is ``-sum(log pdf(x, theta), axis=0)`` where `theta` are the
+        parameters (including loc and scale).
+        """
+        loc, scale, args = self._unpack_loc_scale(theta)
+        if not self._argcheck(*args) or scale <= 0:
+            return inf
+        x = (asarray(x)-loc) / scale
+        n_log_scale = len(x) * log(scale)
+        if np.any(~self._support_mask(x, *args)):
+            return inf
+        return self._nnlf(x, *args) + n_log_scale
+
+    def _nnlf(self, x, *args):
+        return -np.sum(self._logpxf(x, *args), axis=0)
+
+    def _nlff_and_penalty(self, x, args, log_fitfun):
+        # negative log fit function
+        #
+        # Returns ``-sum(log_fitfun(x, *args), axis=0)``.  Points of `x`
+        # outside the support and points where `log_fitfun` is not finite are
+        # left out of the sum; instead each such "bad" point adds a fixed
+        # penalty of ``log(_XMAX) * 100`` to the result.
+
+        # True where x lies outside the support
+        cond0 = ~self._support_mask(x, *args)
+        n_bad = np.count_nonzero(cond0, axis=0)
+        if n_bad > 0:
+            # evaluate log_fitfun on the in-support points only
+            x = argsreduce(~cond0, x)[0]
+        logff = log_fitfun(x, *args)
+        finite_logff = np.isfinite(logff)
+        # non-finite log values also count as bad points
+        n_bad += np.sum(~finite_logff, axis=0)
+        if n_bad > 0:
+            # _XMAX is defined outside this method
+            # NOTE(review): presumably a large float constant -- confirm
+            penalty = n_bad * log(_XMAX) * 100
+            return -np.sum(logff[finite_logff], axis=0) + penalty
+        return -np.sum(logff, axis=0)
+
+    def _penalized_nnlf(self, theta, x):
+        """Penalized negative loglikelihood function.
+        i.e., - sum (log pdf(x, theta), axis=0) + penalty
+        where theta are the parameters (including loc and scale)
+        """
+        loc, scale, args = self._unpack_loc_scale(theta)
+        if not self._argcheck(*args) or scale <= 0:
+            return inf
+        x = asarray((x-loc) / scale)
+        n_log_scale = len(x) * log(scale)
+        return self._nlff_and_penalty(x, args, self._logpxf) + n_log_scale
+
+    def _penalized_nlpsf(self, theta, x):
+        """Penalized negative log product spacing function.
+        i.e., - sum (log (diff (cdf (x, theta))), axis=0) + penalty
+        where theta are the parameters (including loc and scale)
+        Follows reference [1] of scipy.stats.fit
+
+        Returns ``inf`` when the shape parameters are rejected by
+        ``_argcheck`` or when ``scale <= 0``.
+        """
+        loc, scale, args = self._unpack_loc_scale(theta)
+        if not self._argcheck(*args) or scale <= 0:
+            return inf
+        # sort first, then standardize
+        x = (np.sort(x) - loc)/scale
+
+        def log_psf(x, *args):
+            # Collapse repeated observations: x becomes the distinct values
+            # and lj their multiplicities.
+            x, lj = np.unique(x, return_counts=True)  # fast for sorted x
+            cdf_data = self._cdf(x, *args) if x.size else []
+            # Unless the data are non-empty and the CDF at the largest point
+            # already reaches 1, close the spacings with a final CDF value of
+            # 1 and give that extra spacing a count of 1.
+            if not (x.size and 1 - cdf_data[-1] <= 0):
+                cdf = np.concatenate(([0], cdf_data, [1]))
+                lj = np.concatenate((lj, [1]))
+            else:
+                cdf = np.concatenate(([0], cdf_data))
+            # here we could use logcdf w/ logsumexp trick to take differences,
+            # but in the context of the method, it seems unlikely to matter
+            # each spacing is divided by its count and weighted by that count
+            return lj * np.log(np.diff(cdf) / lj)
+
+        return self._nlff_and_penalty(x, args, log_psf)
+
+
+class _ShapeInfo:
+    def __init__(self, name, integrality=False, domain=(-np.inf, np.inf),
+                 inclusive=(True, True)):
+        self.name = name
+        self.integrality = integrality
+        self.endpoints = domain
+        self.inclusive = inclusive
+
+        domain = list(domain)
+        if np.isfinite(domain[0]) and not inclusive[0]:
+            domain[0] = np.nextafter(domain[0], np.inf)
+        if np.isfinite(domain[1]) and not inclusive[1]:
+            domain[1] = np.nextafter(domain[1], -np.inf)
+        self.domain = domain
+
+
+def _get_fixed_fit_value(kwds, names):
+    """
+    Given names such as ``['f0', 'fa', 'fix_a']``, check that there is
+    at most one non-None value in `kwds` associated with those names.
+    Return that value, or None if none of the names occur in `kwds`.
+    As a side effect, all occurrences of those names in `kwds` are
+    removed.
+    """
+    vals = [(name, kwds.pop(name)) for name in names if name in kwds]
+    if len(vals) > 1:
+        repeated = [name for name, val in vals]
+        raise ValueError("fit method got multiple keyword arguments to "
+                         "specify the same fixed parameter: " +
+                         ', '.join(repeated))
+    return vals[0][1] if vals else None
+
+
+#  continuous random variables: implement maybe later
+#
+#  hf  --- Hazard Function (PDF / SF)
+#  chf  --- Cumulative hazard function (-log(SF))
+#  psf --- Probability sparsity function (reciprocal of the pdf) in
+#                units of percent-point-function (as a function of q).
+#                Also, the derivative of the percent-point function.
+
+
+class rv_continuous(rv_generic):
+    """A generic continuous random variable class meant for subclassing.
+
+    `rv_continuous` is a base class to construct specific distribution classes
+    and instances for continuous random variables. It cannot be used
+    directly as a distribution.
+
+    Parameters
+    ----------
+    momtype : int, optional
+        The type of generic moment calculation to use: 0 for pdf, 1 (default)
+        for ppf.
+    a : float, optional
+        Lower bound of the support of the distribution, default is minus
+        infinity.
+    b : float, optional
+        Upper bound of the support of the distribution, default is plus
+        infinity.
+    xtol : float, optional
+        The tolerance for fixed point calculation for generic ppf.
+    badvalue : float, optional
+        The value in a result array that marks an entry for which
+        some argument restriction is violated, default is np.nan.
+    name : str, optional
+        The name of the instance. This string is used to construct the default
+        example for distributions.
+    longname : str, optional
+        This string is used as part of the first line of the docstring returned
+        when a subclass has no docstring of its own. Note: `longname` exists
+        for backwards compatibility, do not use for new subclasses.
+    shapes : str, optional
+        The shape of the distribution. For example ``"m, n"`` for a
+        distribution that takes two integers as the two shape arguments for all
+        its methods. If not provided, shape parameters will be inferred from
+        the signature of the private methods, ``_pdf`` and ``_cdf`` of the
+        instance.
+    seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
+        If `seed` is None (or `np.random`), the `numpy.random.RandomState`
+        singleton is used.
+        If `seed` is an int, a new ``RandomState`` instance is used,
+        seeded with `seed`.
+        If `seed` is already a ``Generator`` or ``RandomState`` instance then
+        that instance is used.
+
+    Attributes
+    ----------
+    a, b : float, optional
+        Lower/upper bound of the support of the unshifted/unscaled distribution.
+        This value is unaffected by the `loc` and `scale` parameters.
+        To calculate the support of the shifted/scaled distribution,
+        use the `support` method.
+
+    Methods
+    -------
+    rvs
+    pdf
+    logpdf
+    cdf
+    logcdf
+    sf
+    logsf
+    ppf
+    isf
+    moment
+    stats
+    entropy
+    expect
+    median
+    mean
+    std
+    var
+    interval
+    __call__
+    fit
+    fit_loc_scale
+    nnlf
+    support
+
+    Notes
+    -----
+    Public methods of an instance of a distribution class (e.g., ``pdf``,
+    ``cdf``) check their arguments and pass valid arguments to private,
+    computational methods (``_pdf``, ``_cdf``). For ``pdf(x)``, ``x`` is valid
+    if it is within the support of the distribution.
+    Whether a shape parameter is valid is decided by an ``_argcheck`` method
+    (which defaults to checking that its arguments are strictly positive.)
+
+    **Subclassing**
+
+    New random variables can be defined by subclassing the `rv_continuous` class
+    and re-defining at least the ``_pdf`` or the ``_cdf`` method (normalized
+    to location 0 and scale 1).
+
+    If positive argument checking is not correct for your RV
+    then you will also need to re-define the ``_argcheck`` method.
+
+    For most of the scipy.stats distributions, the support interval doesn't
+    depend on the shape parameters. ``x`` being in the support interval is
+    equivalent to ``self.a <= x <= self.b``.  If either of the endpoints of
+    the support do depend on the shape parameters, then
+    i) the distribution must implement the ``_get_support`` method; and
+    ii) those dependent endpoints must be omitted from the distribution's
+    call to the ``rv_continuous`` initializer.
+
+    Correct, but potentially slow defaults exist for the remaining
+    methods but for speed and/or accuracy you can over-ride::
+
+      _logpdf, _cdf, _logcdf, _ppf, _rvs, _isf, _sf, _logsf
+
+    The default method ``_rvs`` relies on the inverse of the cdf, ``_ppf``,
+    applied to a uniform random variate. In order to generate random variates
+    efficiently, either the default ``_ppf`` needs to be overwritten (e.g.
+    if the inverse cdf can be expressed in an explicit form) or a sampling
+    method needs to be implemented in a custom ``_rvs`` method.
+
+    If possible, you should override ``_isf``, ``_sf`` or ``_logsf``.
+    The main reason would be to improve numerical accuracy: for example,
+    the survival function ``_sf`` is computed as ``1 - _cdf`` which can
+    result in loss of precision if ``_cdf(x)`` is close to one.
+
+    **Methods that can be overwritten by subclasses**
+    ::
+
+      _rvs
+      _pdf
+      _cdf
+      _sf
+      _ppf
+      _isf
+      _stats
+      _munp
+      _entropy
+      _argcheck
+      _get_support
+
+    There are additional (internal and private) generic methods that can
+    be useful for cross-checking and for debugging, but might not work in all
+    cases when directly called.
+
+    A note on ``shapes``: subclasses need not specify them explicitly. In this
+    case, `shapes` will be automatically deduced from the signatures of the
+    overridden methods (`pdf`, `cdf` etc).
+    If, for some reason, you prefer to avoid relying on introspection, you can
+    specify ``shapes`` explicitly as an argument to the instance constructor.
+
+
+    **Frozen Distributions**
+
+    Normally, you must provide shape parameters (and, optionally, location and
+    scale parameters) to each call of a method of a distribution.
+
+    Alternatively, the object may be called (as a function) to fix the shape,
+    location, and scale parameters returning a "frozen" continuous RV object:
+
+    rv = generic(<shape(s)>, loc=0, scale=1)
+        `rv_frozen` object with the same methods but holding the given shape,
+        location, and scale fixed
+
+    **Statistics**
+
+    Statistics are computed using numerical integration by default.
+    For speed you can redefine this using ``_stats``:
+
+     - take shape parameters and return mu, mu2, g1, g2
+     - If you can't compute one of these, return it as None
+     - Can also be defined with a keyword argument ``moments``, which is a
+       string composed of "m", "v", "s", and/or "k".
+       Only the components appearing in string should be computed and
+       returned in the order "m", "v", "s", or "k"  with missing values
+       returned as None.
+
+    Alternatively, you can override ``_munp``, which takes ``n`` and shape
+    parameters and returns the n-th non-central moment of the distribution.
+
+    **Deepcopying / Pickling**
+
+    If a distribution or frozen distribution is deepcopied (pickled/unpickled,
+    etc.), any underlying random number generator is deepcopied with it. An
+    implication is that if a distribution relies on the singleton RandomState
+    before copying, it will rely on a copy of that random state after copying,
+    and ``np.random.seed`` will no longer control the state.
+
+    Examples
+    --------
+    To create a new Gaussian distribution, we would do the following:
+
+    >>> from scipy.stats import rv_continuous
+    >>> class gaussian_gen(rv_continuous):
+    ...     "Gaussian distribution"
+    ...     def _pdf(self, x):
+    ...         return np.exp(-x**2 / 2.) / np.sqrt(2.0 * np.pi)
+    >>> gaussian = gaussian_gen(name='gaussian')
+
+    ``scipy.stats`` distributions are *instances*, so here we subclass
+    `rv_continuous` and create an instance. With this, we now have
+    a fully functional distribution with all relevant methods automagically
+    generated by the framework.
+
+    Note that above we defined a standard normal distribution, with zero mean
+    and unit variance. Shifting and scaling of the distribution can be done
+    by using ``loc`` and ``scale`` parameters: ``gaussian.pdf(x, loc, scale)``
+    essentially computes ``y = (x - loc) / scale`` and
+    ``gaussian._pdf(y) / scale``.
+
+    """
+
+    def __init__(self, momtype=1, a=None, b=None, xtol=1e-14,
+                 badvalue=None, name=None, longname=None,
+                 shapes=None, seed=None):
+
+        super().__init__(seed)
+
+        # save the ctor parameters, cf generic freeze
+        self._ctor_param = dict(
+            momtype=momtype, a=a, b=b, xtol=xtol,
+            badvalue=badvalue, name=name, longname=longname,
+            shapes=shapes, seed=seed)
+
+        if badvalue is None:
+            badvalue = nan
+        if name is None:
+            name = 'Distribution'
+        self.badvalue = badvalue
+        self.name = name
+        self.a = a
+        self.b = b
+        if a is None:
+            self.a = -inf
+        if b is None:
+            self.b = inf
+        self.xtol = xtol
+        self.moment_type = momtype
+        self.shapes = shapes
+
+        self._construct_argparser(meths_to_inspect=[self._pdf, self._cdf],
+                                  locscale_in='loc=0, scale=1',
+                                  locscale_out='loc, scale')
+        self._attach_methods()
+
+        if longname is None:
+            if name[0] in ['aeiouAEIOU']:
+                hstr = "An "
+            else:
+                hstr = "A "
+            longname = hstr + name
+
+        if sys.flags.optimize < 2:
+            # Skip adding docstrings if interpreter is run with -OO
+            if self.__doc__ is None:
+                self._construct_default_doc(longname=longname,
+                                            docdict=docdict,
+                                            discrete='continuous')
+            else:
+                dct = dict(distcont)
+                self._construct_doc(docdict, dct.get(self.name))
+
+    def __getstate__(self):
+        dct = self.__dict__.copy()
+
+        # these methods will be remade in __setstate__
+        # _random_state attribute is taken care of by rv_generic
+        attrs = ["_parse_args", "_parse_args_stats", "_parse_args_rvs",
+                 "_cdfvec", "_ppfvec", "vecentropy", "generic_moment"]
+        [dct.pop(attr, None) for attr in attrs]
+        return dct
+
+    def _attach_methods(self):
+        """
+        Attaches dynamically created methods to the rv_continuous instance.
+        """
+        # _attach_methods is responsible for calling _attach_argparser_methods
+        self._attach_argparser_methods()
+
+        # nin correction
+        self._ppfvec = vectorize(self._ppf_single, otypes='d')
+        self._ppfvec.nin = self.numargs + 1
+        self.vecentropy = vectorize(self._entropy, otypes='d')
+        self._cdfvec = vectorize(self._cdf_single, otypes='d')
+        self._cdfvec.nin = self.numargs + 1
+
+        if self.moment_type == 0:
+            self.generic_moment = vectorize(self._mom0_sc, otypes='d')
+        else:
+            self.generic_moment = vectorize(self._mom1_sc, otypes='d')
+        # Because of the *args argument of _mom0_sc, vectorize cannot count the
+        # number of arguments correctly.
+        self.generic_moment.nin = self.numargs + 1
+
+    def _updated_ctor_param(self):
+        """Return the current version of _ctor_param, possibly updated by user.
+
+        Used by freezing.
+        Keep this in sync with the signature of __init__.
+        """
+        dct = self._ctor_param.copy()
+        dct['a'] = self.a
+        dct['b'] = self.b
+        dct['xtol'] = self.xtol
+        dct['badvalue'] = self.badvalue
+        dct['name'] = self.name
+        dct['shapes'] = self.shapes
+        return dct
+
+    def _ppf_to_solve(self, x, q, *args):
+        return self.cdf(*(x, )+args)-q
+
+    def _ppf_single(self, q, *args):
+        factor = 10.
+        left, right = self._get_support(*args)
+
+        if np.isinf(left):
+            left = min(-factor, right)
+            while self._ppf_to_solve(left, q, *args) > 0.:
+                left, right = left * factor, left
+            # left is now such that cdf(left) <= q
+            # if right has changed, then cdf(right) > q
+
+        if np.isinf(right):
+            right = max(factor, left)
+            while self._ppf_to_solve(right, q, *args) < 0.:
+                left, right = right, right * factor
+            # right is now such that cdf(right) >= q
+
+        return optimize.brentq(self._ppf_to_solve,
+                               left, right, args=(q,)+args, xtol=self.xtol)
+
+    # moment from definition
+    def _mom_integ0(self, x, m, *args):
+        return x**m * self.pdf(x, *args)
+
+    def _mom0_sc(self, m, *args):
+        _a, _b = self._get_support(*args)
+        return integrate.quad(self._mom_integ0, _a, _b,
+                              args=(m,)+args)[0]
+
+    # moment calculated using ppf
+    def _mom_integ1(self, q, m, *args):
+        return (self.ppf(q, *args))**m
+
+    def _mom1_sc(self, m, *args):
+        return integrate.quad(self._mom_integ1, 0, 1, args=(m,)+args)[0]
+
+    def _pdf(self, x, *args):
+        return _derivative(self._cdf, x, dx=1e-5, args=args, order=5)
+
+    # Could also define any of these
+    def _logpdf(self, x, *args):
+        p = self._pdf(x, *args)
+        with np.errstate(divide='ignore'):
+            return log(p)
+
+    def _logpxf(self, x, *args):
+        # continuous distributions have PDF, discrete have PMF, but sometimes
+        # the distinction doesn't matter. This lets us use `_logpxf` for both
+        # discrete and continuous distributions.
+        return self._logpdf(x, *args)
+
+    def _cdf_single(self, x, *args):
+        _a, _b = self._get_support(*args)
+        return integrate.quad(self._pdf, _a, x, args=args)[0]
+
+    def _cdf(self, x, *args):
+        return self._cdfvec(x, *args)
+
+    def _logcdf(self, x, *args):
+        median = self._ppf(0.5, *args)
+        with np.errstate(divide='ignore'):
+            return xpx.apply_where(
+                x < median, (x,) + args,
+                lambda x, *args: np.log(self._cdf(x, *args)),
+                lambda x, *args: np.log1p(-self._sf(x, *args)))
+
+    def _logsf(self, x, *args):
+        median = self._ppf(0.5, *args)
+        with np.errstate(divide='ignore'):
+            return xpx.apply_where(
+                x > median, (x,) + args,
+                lambda x, *args: np.log(self._sf(x, *args)),
+                lambda x, *args: np.log1p(-self._cdf(x, *args)))
+
+    # generic _argcheck, _sf, _ppf, _isf, _rvs are defined
+    # in rv_generic
+
+    def pdf(self, x, *args, **kwds):
+        """Probability density function at x of the given RV.
+
+        Parameters
+        ----------
+        x : array_like
+            quantiles
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information)
+        loc : array_like, optional
+            location parameter (default=0)
+        scale : array_like, optional
+            scale parameter (default=1)
+
+        Returns
+        -------
+        pdf : ndarray
+            Probability density function evaluated at x
+
+        """
+        # Separate the shape parameters from loc and scale.
+        args, loc, scale = self._parse_args(*args, **kwds)
+        x, loc, scale = map(asarray, (x, loc, scale))
+        args = tuple(map(asarray, args))
+        # Standardize x, promoting its dtype with float64.
+        dtyp = np.promote_types(x.dtype, np.float64)
+        x = np.asarray((x - loc)/scale, dtype=dtyp)
+        # cond0: parameters pass `_argcheck` and scale is positive.
+        # cond1: standardized x passes the support mask.
+        cond0 = self._argcheck(*args) & (scale > 0)
+        cond1 = self._support_mask(x, *args) & (scale > 0)
+        cond = cond0 & cond1
+        # Start from 0 everywhere, then mark entries with invalid parameters
+        # or NaN x with `badvalue` (the sum acts as an elementwise "or").
+        output = zeros(shape(cond), dtyp)
+        putmask(output, (1-cond0)+np.isnan(x), self.badvalue)
+        if np.any(cond):
+            # Call `_pdf` only on the arguments reduced by `cond`; scale is
+            # reduced alongside so the result can be divided by it.
+            goodargs = argsreduce(cond, *((x,)+args+(scale,)))
+            scale, goodargs = goodargs[-1], goodargs[:-1]
+            place(output, cond, self._pdf(*goodargs) / scale)
+        # Unwrap a 0-d array into a scalar.
+        if output.ndim == 0:
+            return output[()]
+        return output
+
+    def logpdf(self, x, *args, **kwds):
+        """Log of the probability density function at x of the given RV.
+
+        This uses a more numerically accurate calculation if available.
+
+        Parameters
+        ----------
+        x : array_like
+            quantiles
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information)
+        loc : array_like, optional
+            location parameter (default=0)
+        scale : array_like, optional
+            scale parameter (default=1)
+
+        Returns
+        -------
+        logpdf : array_like
+            Log of the probability density function evaluated at x
+
+        """
+        # Separate the shape parameters from loc and scale.
+        args, loc, scale = self._parse_args(*args, **kwds)
+        x, loc, scale = map(asarray, (x, loc, scale))
+        args = tuple(map(asarray, args))
+        # Standardize x, promoting its dtype with float64.
+        dtyp = np.promote_types(x.dtype, np.float64)
+        x = np.asarray((x - loc)/scale, dtype=dtyp)
+        # cond0: parameters pass `_argcheck` and scale is positive.
+        # cond1: standardized x passes the support mask.
+        cond0 = self._argcheck(*args) & (scale > 0)
+        cond1 = self._support_mask(x, *args) & (scale > 0)
+        cond = cond0 & cond1
+        # Start from -inf everywhere, then mark entries with invalid
+        # parameters or NaN x with `badvalue`.
+        output = empty(shape(cond), dtyp)
+        output.fill(-inf)
+        putmask(output, (1-cond0)+np.isnan(x), self.badvalue)
+        if np.any(cond):
+            # Call `_logpdf` only on the arguments reduced by `cond`;
+            # dividing the density by scale becomes subtracting log(scale).
+            goodargs = argsreduce(cond, *((x,)+args+(scale,)))
+            scale, goodargs = goodargs[-1], goodargs[:-1]
+            place(output, cond, self._logpdf(*goodargs) - log(scale))
+        # Unwrap a 0-d array into a scalar.
+        if output.ndim == 0:
+            return output[()]
+        return output
+
+    def cdf(self, x, *args, **kwds):
+        """
+        Cumulative distribution function of the given RV.
+
+        Parameters
+        ----------
+        x : array_like
+            quantiles
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information)
+        loc : array_like, optional
+            location parameter (default=0)
+        scale : array_like, optional
+            scale parameter (default=1)
+
+        Returns
+        -------
+        cdf : ndarray
+            Cumulative distribution function evaluated at `x`
+
+        """
+        # Separate the shape parameters from loc and scale.
+        args, loc, scale = self._parse_args(*args, **kwds)
+        x, loc, scale = map(asarray, (x, loc, scale))
+        args = tuple(map(asarray, args))
+        # Support endpoints for these shape parameters.
+        _a, _b = self._get_support(*args)
+        # Standardize x, promoting its dtype with float64.
+        dtyp = np.promote_types(x.dtype, np.float64)
+        x = np.asarray((x - loc)/scale, dtype=dtyp)
+        # cond0: parameters pass `_argcheck` and scale is positive.
+        # cond1: standardized x passes the open-support mask.
+        # cond2: x is at or beyond the upper endpoint (result is 1).
+        cond0 = self._argcheck(*args) & (scale > 0)
+        cond1 = self._open_support_mask(x, *args) & (scale > 0)
+        cond2 = (x >= np.asarray(_b)) & cond0
+        cond = cond0 & cond1
+        # Later writes win: 0 everywhere, then `badvalue` for invalid
+        # parameters or NaN x, then 1 at/above the upper endpoint, then the
+        # computed values where `cond` holds.
+        output = zeros(shape(cond), dtyp)
+        place(output, (1-cond0)+np.isnan(x), self.badvalue)
+        place(output, cond2, 1.0)
+        if np.any(cond):  # call only if at least 1 entry
+            goodargs = argsreduce(cond, *((x,)+args))
+            place(output, cond, self._cdf(*goodargs))
+        # Unwrap a 0-d array into a scalar.
+        if output.ndim == 0:
+            return output[()]
+        return output
+
+    def logcdf(self, x, *args, **kwds):
+        """Log of the cumulative distribution function at x of the given RV.
+
+        Parameters
+        ----------
+        x : array_like
+            quantiles
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information)
+        loc : array_like, optional
+            location parameter (default=0)
+        scale : array_like, optional
+            scale parameter (default=1)
+
+        Returns
+        -------
+        logcdf : array_like
+            Log of the cumulative distribution function evaluated at x
+
+        """
+        # Separate the shape parameters from loc and scale.
+        args, loc, scale = self._parse_args(*args, **kwds)
+        x, loc, scale = map(asarray, (x, loc, scale))
+        args = tuple(map(asarray, args))
+        # Support endpoints for these shape parameters.
+        _a, _b = self._get_support(*args)
+        # Standardize x, promoting its dtype with float64.
+        dtyp = np.promote_types(x.dtype, np.float64)
+        x = np.asarray((x - loc)/scale, dtype=dtyp)
+        # cond0: parameters pass `_argcheck` and scale is positive.
+        # cond1: standardized x passes the open-support mask.
+        # cond2: x is at or beyond the upper endpoint (log-cdf is 0).
+        cond0 = self._argcheck(*args) & (scale > 0)
+        cond1 = self._open_support_mask(x, *args) & (scale > 0)
+        cond2 = (x >= _b) & cond0
+        cond = cond0 & cond1
+        # Later writes win: -inf everywhere, then `badvalue`, then 0 at/above
+        # the upper endpoint, then the computed values where `cond` holds.
+        output = empty(shape(cond), dtyp)
+        output.fill(-inf)
+        # `cond1 == cond1` is all-True with cond1's shape; multiplying by it
+        # presumably just broadcasts the mask to that shape -- TODO confirm.
+        place(output, (1-cond0)*(cond1 == cond1)+np.isnan(x), self.badvalue)
+        place(output, cond2, 0.0)
+        if np.any(cond):  # call only if at least 1 entry
+            goodargs = argsreduce(cond, *((x,)+args))
+            place(output, cond, self._logcdf(*goodargs))
+        # Unwrap a 0-d array into a scalar.
+        if output.ndim == 0:
+            return output[()]
+        return output
+
+    def sf(self, x, *args, **kwds):
+        """Survival function (1 - `cdf`) at x of the given RV.
+
+        Parameters
+        ----------
+        x : array_like
+            quantiles
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information)
+        loc : array_like, optional
+            location parameter (default=0)
+        scale : array_like, optional
+            scale parameter (default=1)
+
+        Returns
+        -------
+        sf : array_like
+            Survival function evaluated at x
+
+        """
+        # Separate the shape parameters from loc and scale.
+        args, loc, scale = self._parse_args(*args, **kwds)
+        x, loc, scale = map(asarray, (x, loc, scale))
+        args = tuple(map(asarray, args))
+        # Support endpoints for these shape parameters.
+        _a, _b = self._get_support(*args)
+        # Standardize x, promoting its dtype with float64.
+        dtyp = np.promote_types(x.dtype, np.float64)
+        x = np.asarray((x - loc)/scale, dtype=dtyp)
+        # cond0: parameters pass `_argcheck` and scale is positive.
+        # cond1: standardized x passes the open-support mask.
+        # cond2: x is at or below the lower endpoint (result is 1).
+        cond0 = self._argcheck(*args) & (scale > 0)
+        cond1 = self._open_support_mask(x, *args) & (scale > 0)
+        cond2 = cond0 & (x <= _a)
+        cond = cond0 & cond1
+        # Later writes win: 0 everywhere, then `badvalue` for invalid
+        # parameters or NaN x, then 1 at/below the lower endpoint, then the
+        # computed values where `cond` holds.
+        output = zeros(shape(cond), dtyp)
+        place(output, (1-cond0)+np.isnan(x), self.badvalue)
+        place(output, cond2, 1.0)
+        if np.any(cond):
+            goodargs = argsreduce(cond, *((x,)+args))
+            place(output, cond, self._sf(*goodargs))
+        # Unwrap a 0-d array into a scalar.
+        if output.ndim == 0:
+            return output[()]
+        return output
+
+    def logsf(self, x, *args, **kwds):
+        """Log of the survival function of the given RV.
+
+        Returns the log of the "survival function," defined as (1 - `cdf`),
+        evaluated at `x`.
+
+        Parameters
+        ----------
+        x : array_like
+            quantiles
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information)
+        loc : array_like, optional
+            location parameter (default=0)
+        scale : array_like, optional
+            scale parameter (default=1)
+
+        Returns
+        -------
+        logsf : ndarray
+            Log of the survival function evaluated at `x`.
+
+        """
+        # Separate the shape parameters from loc and scale.
+        args, loc, scale = self._parse_args(*args, **kwds)
+        x, loc, scale = map(asarray, (x, loc, scale))
+        args = tuple(map(asarray, args))
+        # Support endpoints for these shape parameters.
+        _a, _b = self._get_support(*args)
+        # Standardize x, promoting its dtype with float64.
+        dtyp = np.promote_types(x.dtype, np.float64)
+        x = np.asarray((x - loc)/scale, dtype=dtyp)
+        # cond0: parameters pass `_argcheck` and scale is positive.
+        # cond1: standardized x passes the open-support mask.
+        # cond2: x is at or below the lower endpoint (log-sf is 0).
+        cond0 = self._argcheck(*args) & (scale > 0)
+        cond1 = self._open_support_mask(x, *args) & (scale > 0)
+        cond2 = cond0 & (x <= _a)
+        cond = cond0 & cond1
+        # Later writes win: -inf everywhere, then `badvalue`, then 0 at/below
+        # the lower endpoint, then the computed values where `cond` holds.
+        output = empty(shape(cond), dtyp)
+        output.fill(-inf)
+        place(output, (1-cond0)+np.isnan(x), self.badvalue)
+        place(output, cond2, 0.0)
+        if np.any(cond):
+            goodargs = argsreduce(cond, *((x,)+args))
+            place(output, cond, self._logsf(*goodargs))
+        # Unwrap a 0-d array into a scalar.
+        if output.ndim == 0:
+            return output[()]
+        return output
+
+    def ppf(self, q, *args, **kwds):
+        """Percent point function (inverse of `cdf`) at q of the given RV.
+
+        Parameters
+        ----------
+        q : array_like
+            lower tail probability
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information)
+        loc : array_like, optional
+            location parameter (default=0)
+        scale : array_like, optional
+            scale parameter (default=1)
+
+        Returns
+        -------
+        x : array_like
+            quantile corresponding to the lower tail probability q.
+
+        """
+        # Separate the shape parameters from loc and scale.
+        args, loc, scale = self._parse_args(*args, **kwds)
+        q, loc, scale = map(asarray, (q, loc, scale))
+        args = tuple(map(asarray, args))
+        # Support endpoints for these shape parameters.
+        _a, _b = self._get_support(*args)
+        # cond0: valid parameters; `loc == loc` is False where loc is NaN.
+        # cond1: q strictly inside (0, 1); cond2/cond3: q exactly 0 / 1.
+        cond0 = self._argcheck(*args) & (scale > 0) & (loc == loc)
+        cond1 = (0 < q) & (q < 1)
+        cond2 = cond0 & (q == 0)
+        cond3 = cond0 & (q == 1)
+        cond = cond0 & cond1
+        # Everything not overwritten below stays at `badvalue`.
+        output = np.full(shape(cond), fill_value=self.badvalue)
+
+        # q == 0 maps to the lower end of the shifted/scaled support and
+        # q == 1 to the upper end.
+        lower_bound = _a * scale + loc
+        upper_bound = _b * scale + loc
+        place(output, cond2, argsreduce(cond2, lower_bound)[0])
+        place(output, cond3, argsreduce(cond3, upper_bound)[0])
+
+        if np.any(cond):  # call only if at least 1 entry
+            # Call `_ppf` on the arguments reduced by `cond`, then undo the
+            # standardization with the equally reduced scale and loc.
+            goodargs = argsreduce(cond, *((q,)+args+(scale, loc)))
+            scale, loc, goodargs = goodargs[-2], goodargs[-1], goodargs[:-2]
+            place(output, cond, self._ppf(*goodargs) * scale + loc)
+        # Unwrap a 0-d array into a scalar.
+        if output.ndim == 0:
+            return output[()]
+        return output
+
+    def isf(self, q, *args, **kwds):
+        """Inverse survival function (inverse of `sf`) at q of the given RV.
+
+        Parameters
+        ----------
+        q : array_like
+            upper tail probability
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information)
+        loc : array_like, optional
+            location parameter (default=0)
+        scale : array_like, optional
+            scale parameter (default=1)
+
+        Returns
+        -------
+        x : ndarray or scalar
+            Quantile corresponding to the upper tail probability q.
+
+        """
+        # Separate the shape parameters from loc and scale.
+        args, loc, scale = self._parse_args(*args, **kwds)
+        q, loc, scale = map(asarray, (q, loc, scale))
+        args = tuple(map(asarray, args))
+        # Support endpoints for these shape parameters.
+        _a, _b = self._get_support(*args)
+        # cond0: valid parameters; `loc == loc` is False where loc is NaN.
+        # cond1: q strictly inside (0, 1); cond2/cond3: q exactly 1 / 0.
+        cond0 = self._argcheck(*args) & (scale > 0) & (loc == loc)
+        cond1 = (0 < q) & (q < 1)
+        cond2 = cond0 & (q == 1)
+        cond3 = cond0 & (q == 0)
+        cond = cond0 & cond1
+        # Everything not overwritten below stays at `badvalue`.
+        output = np.full(shape(cond), fill_value=self.badvalue)
+
+        # q == 1 maps to the lower end of the shifted/scaled support and
+        # q == 0 to the upper end.
+        lower_bound = _a * scale + loc
+        upper_bound = _b * scale + loc
+        place(output, cond2, argsreduce(cond2, lower_bound)[0])
+        place(output, cond3, argsreduce(cond3, upper_bound)[0])
+
+        if np.any(cond):
+            # Call `_isf` on the arguments reduced by `cond`, then undo the
+            # standardization with the equally reduced scale and loc.
+            goodargs = argsreduce(cond, *((q,)+args+(scale, loc)))
+            scale, loc, goodargs = goodargs[-2], goodargs[-1], goodargs[:-2]
+            place(output, cond, self._isf(*goodargs) * scale + loc)
+        # Unwrap a 0-d array into a scalar.
+        if output.ndim == 0:
+            return output[()]
+        return output
+
+    def _unpack_loc_scale(self, theta):
+        try:
+            loc = theta[-2]
+            scale = theta[-1]
+            args = tuple(theta[:-2])
+        except IndexError as e:
+            raise ValueError("Not enough input arguments.") from e
+        return loc, scale, args
+
+    def _nnlf_and_penalty(self, x, args):
+        """
+        Compute the penalized negative log-likelihood for the
+        "standardized" data (i.e. already shifted by loc and
+        scaled by scale) for the shape parameters in `args`.
+
+        `x` can be a 1D numpy array or a CensoredData instance.
+        """
+        if isinstance(x, CensoredData):
+            # Filter out the data that is not in the support.
+            xs = x._supported(*self._get_support(*args))
+            n_bad = len(x) - len(xs)
+            i1, i2 = xs._interval.T
+            terms = [
+                # logpdf of the noncensored data.
+                self._logpdf(xs._uncensored, *args),
+                # logcdf of the left-censored data.
+                self._logcdf(xs._left, *args),
+                # logsf of the right-censored data.
+                self._logsf(xs._right, *args),
+                # log of probability of the interval-censored data.
+                np.log(self._delta_cdf(i1, i2, *args)),
+            ]
+        else:
+            cond0 = ~self._support_mask(x, *args)
+            n_bad = np.count_nonzero(cond0)
+            if n_bad > 0:
+                x = argsreduce(~cond0, x)[0]
+            terms = [self._logpdf(x, *args)]
+
+        totals, bad_counts = zip(*[_sum_finite(term) for term in terms])
+        total = sum(totals)
+        n_bad += sum(bad_counts)
+
+        return -total + n_bad * _LOGXMAX * 100
+
+    def _penalized_nnlf(self, theta, x):
+        """Penalized negative loglikelihood function.
+
+        i.e., - sum (log pdf(x, theta), axis=0) + penalty
+        where theta are the parameters (including loc and scale)
+        """
+        loc, scale, args = self._unpack_loc_scale(theta)
+        if not self._argcheck(*args) or scale <= 0:
+            return inf
+        if isinstance(x, CensoredData):
+            x = (x - loc) / scale
+            n_log_scale = (len(x) - x.num_censored()) * log(scale)
+        else:
+            x = (x - loc) / scale
+            n_log_scale = len(x) * log(scale)
+
+        return self._nnlf_and_penalty(x, args) + n_log_scale
+
+    def _fitstart(self, data, args=None):
+        """Starting point for fit (shape arguments + loc + scale)."""
+        if args is None:
+            args = (1.0,)*self.numargs
+        loc, scale = self._fit_loc_scale_support(data, *args)
+        return args + (loc, scale)
+
+    def _reduce_func(self, args, kwds, data=None):
+        """
+        Return the (possibly reduced) function to optimize in order to find MLE
+        estimates for the .fit method.
+        """
+        # Convert fixed shape parameters to the standard numeric form: e.g. for
+        # stats.beta, shapes='a, b'. To fix `a`, the caller can give a value
+        # for `f0`, `fa` or 'fix_a'.  The following converts the latter two
+        # into the first (numeric) form.
+        shapes = []
+        if self.shapes:
+            shapes = self.shapes.replace(',', ' ').split()
+            for j, s in enumerate(shapes):
+                key = 'f' + str(j)
+                names = [key, 'f' + s, 'fix_' + s]
+                val = _get_fixed_fit_value(kwds, names)
+                if val is not None:
+                    kwds[key] = val
+
+        args = list(args)
+        Nargs = len(args)
+        fixedn = []
+        names = [f'f{n}' for n in range(Nargs - 2)] + ['floc', 'fscale']
+        x0 = []
+        for n, key in enumerate(names):
+            if key in kwds:
+                fixedn.append(n)
+                args[n] = kwds.pop(key)
+            else:
+                x0.append(args[n])
+
+        methods = {"mle", "mm"}
+        method = kwds.pop('method', "mle").lower()
+        if method == "mm":
+            n_params = len(shapes) + 2 - len(fixedn)
+            exponents = (np.arange(1, n_params+1))[:, np.newaxis]
+            data_moments = np.sum(data[None, :]**exponents/len(data), axis=1)
+
+            def objective(theta, x):
+                return self._moment_error(theta, x, data_moments)
+
+        elif method == "mle":
+            objective = self._penalized_nnlf
+        else:
+            raise ValueError(f"Method '{method}' not available; "
+                             f"must be one of {methods}")
+
+        if len(fixedn) == 0:
+            func = objective
+            restore = None
+        else:
+            if len(fixedn) == Nargs:
+                raise ValueError(
+                    "All parameters fixed. There is nothing to optimize.")
+
+            def restore(args, theta):
+                # Replace with theta for all numbers not in fixedn
+                # This allows the non-fixed values to vary, but
+                #  we still call self.nnlf with all parameters.
+                i = 0
+                for n in range(Nargs):
+                    if n not in fixedn:
+                        args[n] = theta[i]
+                        i += 1
+                return args
+
+            def func(theta, x):
+                newtheta = restore(args[:], theta)
+                return objective(newtheta, x)
+
+        return x0, func, restore, args
+
+    def _moment_error(self, theta, x, data_moments):
+        loc, scale, args = self._unpack_loc_scale(theta)
+        if not self._argcheck(*args) or scale <= 0:
+            return inf
+
+        dist_moments = np.array([self.moment(i+1, *args, loc=loc, scale=scale)
+                                 for i in range(len(data_moments))])
+        if np.any(np.isnan(dist_moments)):
+            raise ValueError("Method of moments encountered a non-finite "
+                             "distribution moment and cannot continue. "
+                             "Consider trying method='MLE'.")
+
+        return (((data_moments - dist_moments) /
+                 np.maximum(np.abs(data_moments), 1e-8))**2).sum()
+
+    def fit(self, data, *args, **kwds):
+        r"""
+        Return estimates of shape (if applicable), location, and scale
+        parameters from data. The default estimation method is Maximum
+        Likelihood Estimation (MLE), but Method of Moments (MM)
+        is also available.
+
+        Starting estimates for the fit are given by input arguments;
+        for any arguments not provided with starting estimates,
+        ``self._fitstart(data)`` is called to generate such.
+
+        One can hold some parameters fixed to specific values by passing in
+        keyword arguments ``f0``, ``f1``, ..., ``fn`` (for shape parameters)
+        and ``floc`` and ``fscale`` (for location and scale parameters,
+        respectively).
+
+        Parameters
+        ----------
+        data : array_like or `CensoredData` instance
+            Data to use in estimating the distribution parameters.
+        arg1, arg2, arg3,... : floats, optional
+            Starting value(s) for any shape-characterizing arguments (those not
+            provided will be determined by a call to ``_fitstart(data)``).
+            No default value.
+        **kwds : floats, optional
+            - `loc`: initial guess of the distribution's location parameter.
+            - `scale`: initial guess of the distribution's scale parameter.
+
+            Special keyword arguments are recognized as holding certain
+            parameters fixed:
+
+            - f0, ..., fn : hold respective shape parameters fixed.
+              Alternatively, shape parameters to fix can be specified by name.
+              For example, if ``self.shapes == "a, b"``, ``fa`` and ``fix_a``
+              are equivalent to ``f0``, and ``fb`` and ``fix_b`` are
+              equivalent to ``f1``.
+
+            - floc : hold location parameter fixed to specified value.
+
+            - fscale : hold scale parameter fixed to specified value.
+
+            - optimizer : The optimizer to use.  The optimizer must take
+              ``func`` and starting position as the first two arguments,
+              plus ``args`` (for extra arguments to pass to the
+              function to be optimized) and ``disp``.
+              The ``fit`` method calls the optimizer with ``disp=0`` to suppress output.
+              The optimizer must return the estimated parameters.
+
+            - method : The method to use. The default is "MLE" (Maximum
+              Likelihood Estimate); "MM" (Method of Moments)
+              is also available.
+
+        Raises
+        ------
+        TypeError, ValueError
+            If an input is invalid
+        `~scipy.stats.FitError`
+            If fitting fails or the fit produced would be invalid
+
+        Returns
+        -------
+        parameter_tuple : tuple of floats
+            Estimates for any shape parameters (if applicable), followed by
+            those for location and scale. For most random variables, shape
+            statistics will be returned, but there are exceptions (e.g.
+            ``norm``).
+
+        Notes
+        -----
+        With ``method="MLE"`` (default), the fit is computed by minimizing
+        the negative log-likelihood function. A large, finite penalty
+        (rather than infinite negative log-likelihood) is applied for
+        observations beyond the support of the distribution.
+
+        With ``method="MM"``, the fit is computed by minimizing the L2 norm
+        of the relative errors between the first *k* raw (about zero) data
+        moments and the corresponding distribution moments, where *k* is the
+        number of non-fixed parameters.
+        More precisely, the objective function is::
+
+            (((data_moments - dist_moments)
+              / np.maximum(np.abs(data_moments), 1e-8))**2).sum()
+
+        where the constant ``1e-8`` avoids division by zero in case of
+        vanishing data moments. Typically, this error norm can be reduced to
+        zero.
+        Note that the standard method of moments can produce parameters for
+        which some data are outside the support of the fitted distribution;
+        this implementation does nothing to prevent this.
+
+        For either method,
+        the returned answer is not guaranteed to be globally optimal; it
+        may only be locally optimal, or the optimization may fail altogether.
+        If the data contain any of ``np.nan``, ``np.inf``, or ``-np.inf``,
+        the `fit` method will raise a ``RuntimeError``.
+
+        When passing a ``CensoredData`` instance to ``data``, the log-likelihood
+        function is defined as:
+
+        .. math::
+
+            l(\pmb{\theta}; k) & = \sum
+                                    \log(f(k_u; \pmb{\theta}))
+                                + \sum
+                                    \log(F(k_l; \pmb{\theta})) \\
+                                & + \sum
+                                    \log(1 - F(k_r; \pmb{\theta})) \\
+                                & + \sum
+                                    \log(F(k_{\text{high}, i}; \pmb{\theta})
+                                    - F(k_{\text{low}, i}; \pmb{\theta}))
+
+        where :math:`f` and :math:`F` are the pdf and cdf, respectively, of the
+        function being fitted, :math:`\pmb{\theta}` is the parameter vector,
+        :math:`u` are the indices of uncensored observations,
+        :math:`l` are the indices of left-censored observations,
+        :math:`r` are the indices of right-censored observations,
+        subscripts "low"/"high" denote endpoints of interval-censored observations, and
+        :math:`i` are the indices of interval-censored observations.
+
+        Examples
+        --------
+
+        Generate some data to fit: draw random variates from the `beta`
+        distribution
+
+        >>> import numpy as np
+        >>> from scipy.stats import beta
+        >>> a, b = 1., 2.
+        >>> rng = np.random.default_rng(172786373191770012695001057628748821561)
+        >>> x = beta.rvs(a, b, size=1000, random_state=rng)
+
+        Now we can fit all four parameters (``a``, ``b``, ``loc`` and
+        ``scale``):
+
+        >>> a1, b1, loc1, scale1 = beta.fit(x)
+        >>> a1, b1, loc1, scale1
+        (1.0198945204435628, 1.9484708982737828, 4.372241314917588e-05, 0.9979078845964814)
+
+        The fit can be done also using a custom optimizer:
+
+        >>> from scipy.optimize import minimize
+        >>> def custom_optimizer(func, x0, args=(), disp=0):
+        ...     res = minimize(func, x0, args, method="slsqp", options={"disp": disp})
+        ...     if res.success:
+        ...         return res.x
+        ...     raise RuntimeError('optimization routine failed')
+        >>> a1, b1, loc1, scale1 = beta.fit(x, method="MLE", optimizer=custom_optimizer)
+        >>> a1, b1, loc1, scale1
+        (1.0198821087258905, 1.948484145914738, 4.3705304486881485e-05, 0.9979104663953395)
+
+        We can also use some prior knowledge about the dataset: let's keep
+        ``loc`` and ``scale`` fixed:
+
+        >>> a1, b1, loc1, scale1 = beta.fit(x, floc=0, fscale=1)
+        >>> loc1, scale1
+        (0, 1)
+
+        We can also keep shape parameters fixed by using ``f``-keywords. To
+        keep the zero-th shape parameter ``a`` equal 1, use ``f0=1`` or,
+        equivalently, ``fa=1``:
+
+        >>> a1, b1, loc1, scale1 = beta.fit(x, fa=1, floc=0, fscale=1)
+        >>> a1
+        1
+
+        Not all distributions return estimates for the shape parameters.
+        ``norm`` for example just returns estimates for location and scale:
+
+        >>> from scipy.stats import norm
+        >>> x = norm.rvs(a, b, size=1000, random_state=123)
+        >>> loc1, scale1 = norm.fit(x)
+        >>> loc1, scale1
+        (0.92087172783841631, 2.0015750750324668)
+        """ # noqa: E501
+        method = kwds.get('method', "mle").lower()
+
+        censored = isinstance(data, CensoredData)
+        if censored:
+            if method != 'mle':
+                raise ValueError('For censored data, the method must'
+                                 ' be "MLE".')
+            if data.num_censored() == 0:
+                # There are no censored values in data, so replace the
+                # CensoredData instance with a regular array.
+                data = data._uncensored
+                censored = False
+
+        Narg = len(args)
+        if Narg > self.numargs:
+            raise TypeError("Too many input arguments.")
+
+        # Check the finiteness of data only if data is not an instance of
+        # CensoredData.  The arrays in a CensoredData instance have already
+        # been validated.
+        if not censored:
+            # Note: `ravel()` is called for backwards compatibility.
+            data = np.asarray(data).ravel()
+            if not np.isfinite(data).all():
+                raise ValueError("The data contains non-finite values.")
+
+        start = [None]*2
+        if (Narg < self.numargs) or not ('loc' in kwds and
+                                         'scale' in kwds):
+            # get distribution specific starting locations
+            start = self._fitstart(data)
+            args += start[Narg:-2]
+        loc = kwds.pop('loc', start[-2])
+        scale = kwds.pop('scale', start[-1])
+        args += (loc, scale)
+        x0, func, restore, args = self._reduce_func(args, kwds, data=data)
+        optimizer = kwds.pop('optimizer', optimize.fmin)
+        # convert string to function in scipy.optimize
+        optimizer = _fit_determine_optimizer(optimizer)
+        # by now kwds must be empty, since everybody took what they needed
+        if kwds:
+            raise TypeError(f"Unknown arguments: {kwds}.")
+
+        # In some cases, method of moments can be done with fsolve/root
+        # instead of an optimizer, but sometimes no solution exists,
+        # especially when the user fixes parameters. Minimizing the sum
+        # of squares of the error generalizes to these cases.
+        vals = optimizer(func, x0, args=(data,), disp=0)
+        obj = func(vals, data)
+
+        if restore is not None:
+            vals = restore(args, vals)
+        vals = tuple(vals)
+
+        loc, scale, shapes = self._unpack_loc_scale(vals)
+        if not (np.all(self._argcheck(*shapes)) and scale > 0):
+            raise FitError("Optimization converged to parameters that are "
+                           "outside the range allowed by the distribution.")
+
+        if method == 'mm':
+            if not np.isfinite(obj):
+                raise FitError("Optimization failed: either a data moment "
+                               "or fitted distribution moment is "
+                               "non-finite.")
+
+        return vals
+
+    def _fit_loc_scale_support(self, data, *args):
+        """Estimate loc and scale parameters from data accounting for support.
+
+        Parameters
+        ----------
+        data : array_like
+            Data to fit.
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information).
+
+        Returns
+        -------
+        Lhat : float
+            Estimated location parameter for the data.
+        Shat : float
+            Estimated scale parameter for the data.
+
+        """
+        if isinstance(data, CensoredData):
+            # For this estimate, "uncensor" the data by taking the
+            # given endpoints as the data for the left- or right-censored
+            # data, and the mean for the interval-censored data.
+            data = data._uncensor()
+        else:
+            data = np.asarray(data)
+
+        # Estimate location and scale according to the method of moments.
+        loc_hat, scale_hat = self.fit_loc_scale(data, *args)
+
+        # Compute the support according to the shape parameters.
+        self._argcheck(*args)
+        _a, _b = self._get_support(*args)
+        a, b = _a, _b
+        support_width = b - a
+
+        # If the support is empty then return the moment-based estimates.
+        if support_width <= 0:
+            return loc_hat, scale_hat
+
+        # Compute the proposed support according to the loc and scale
+        # estimates.
+        a_hat = loc_hat + a * scale_hat
+        b_hat = loc_hat + b * scale_hat
+
+        # Use the moment-based estimates if they are compatible with the data.
+        data_a = np.min(data)
+        data_b = np.max(data)
+        if a_hat < data_a and data_b < b_hat:
+            return loc_hat, scale_hat
+
+        # Otherwise find other estimates that are compatible with the data.
+        data_width = data_b - data_a
+        rel_margin = 0.1
+        margin = data_width * rel_margin
+
+        # For a finite interval, both the location and scale
+        # should have interesting values.
+        if support_width < np.inf:
+            loc_hat = (data_a - a) - margin
+            scale_hat = (data_width + 2 * margin) / support_width
+            return loc_hat, scale_hat
+
+        # For a one-sided interval, use only an interesting location parameter.
+        if a > -np.inf:
+            return (data_a - a) - margin, 1
+        elif b < np.inf:
+            return (data_b - b) + margin, 1
+        else:
+            raise RuntimeError
+
+    def fit_loc_scale(self, data, *args):
+        """
+        Estimate loc and scale parameters from data using 1st and 2nd moments.
+
+        Parameters
+        ----------
+        data : array_like
+            Data to fit.
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information).
+
+        Returns
+        -------
+        Lhat : float
+            Estimated location parameter for the data.
+        Shat : float
+            Estimated scale parameter for the data.
+
+        """
+        mu, mu2 = self.stats(*args, **{'moments': 'mv'})
+        tmp = asarray(data)
+        muhat = tmp.mean()
+        mu2hat = tmp.var()
+        Shat = sqrt(mu2hat / mu2)
+        with np.errstate(invalid='ignore'):
+            Lhat = muhat - Shat*mu
+        if not np.isfinite(Lhat):
+            Lhat = 0
+        if not (np.isfinite(Shat) and (0 < Shat)):
+            Shat = 1
+        return Lhat, Shat
+
+    def _entropy(self, *args):
+        def integ(x):
+            val = self._pdf(x, *args)
+            return entr(val)
+
+        # upper limit is often inf, so suppress warnings when integrating
+        _a, _b = self._get_support(*args)
+        with np.errstate(over='ignore'):
+            h = integrate.quad(integ, _a, _b)[0]
+
+        if not np.isnan(h):
+            return h
+        else:
+            # try with different limits if integration problems
+            low, upp = self.ppf([1e-10, 1. - 1e-10], *args)
+            if np.isinf(_b):
+                upper = upp
+            else:
+                upper = _b
+            if np.isinf(_a):
+                lower = low
+            else:
+                lower = _a
+            return integrate.quad(integ, lower, upper)[0]
+
+    def expect(self, func=None, args=(), loc=0, scale=1, lb=None, ub=None,
+               conditional=False, **kwds):
+        """Calculate expected value of a function with respect to the
+        distribution by numerical integration.
+
+        The expected value of a function ``f(x)`` with respect to a
+        distribution ``dist`` is defined as::
+
+                    ub
+            E[f(x)] = Integral(f(x) * dist.pdf(x)),
+                    lb
+
+        where ``ub`` and ``lb`` are arguments and ``x`` has the ``dist.pdf(x)``
+        distribution. If the bounds ``lb`` and ``ub`` correspond to the
+        support of the distribution, e.g. ``[-inf, inf]`` in the default
+        case, then the integral is the unrestricted expectation of ``f(x)``.
+        Also, the function ``f(x)`` may be defined such that ``f(x)`` is ``0``
+        outside a finite interval in which case the expectation is
+        calculated within the finite range ``[lb, ub]``.
+
+        Parameters
+        ----------
+        func : callable, optional
+            Function for which integral is calculated. Takes only one argument.
+            The default is the identity mapping f(x) = x.
+        args : tuple, optional
+            Shape parameters of the distribution.
+        loc : float, optional
+            Location parameter (default=0).
+        scale : float, optional
+            Scale parameter (default=1).
+        lb, ub : scalar, optional
+            Lower and upper bound for integration. Default is set to the
+            support of the distribution.
+        conditional : bool, optional
+            If True, the integral is corrected by the conditional probability
+            of the integration interval.  The return value is the expectation
+            of the function, conditional on being in the given interval.
+            Default is False.
+
+        Additional keyword arguments are passed to the integration routine.
+
+        Returns
+        -------
+        expect : float
+            The calculated expected value.
+
+        Notes
+        -----
+        The integration behavior of this function is inherited from
+        `scipy.integrate.quad`. Neither this function nor
+        `scipy.integrate.quad` can verify whether the integral exists or is
+        finite. For example ``cauchy(0).mean()`` returns ``np.nan`` and
+        ``cauchy(0).expect()`` returns ``0.0``.
+
+        Likewise, the accuracy of results is not verified by the function.
+        `scipy.integrate.quad` is typically reliable for integrals that are
+        numerically favorable, but it is not guaranteed to converge
+        to a correct value for all possible intervals and integrands. This
+        function is provided for convenience; for critical applications,
+        check results against other integration methods.
+
+        The function is not vectorized.
+
+        Examples
+        --------
+
+        To understand the effect of the bounds of integration consider
+
+        >>> from scipy.stats import expon
+        >>> expon(1).expect(lambda x: 1, lb=0.0, ub=2.0)
+        0.6321205588285578
+
+        This is close to
+
+        >>> expon(1).cdf(2.0) - expon(1).cdf(0.0)
+        0.6321205588285577
+
+        If ``conditional=True``
+
+        >>> expon(1).expect(lambda x: 1, lb=0.0, ub=2.0, conditional=True)
+        1.0000000000000002
+
+        The slight deviation from 1 is due to numerical integration.
+
+        The integrand can be treated as a complex-valued function
+        by passing ``complex_func=True`` to `scipy.integrate.quad` .
+
+        >>> import numpy as np
+        >>> from scipy.stats import vonmises
+        >>> res = vonmises(loc=2, kappa=1).expect(lambda x: np.exp(1j*x),
+        ...                                       complex_func=True)
+        >>> res
+        (-0.18576377217422957+0.40590124735052263j)
+
+        >>> np.angle(res)  # location of the (circular) distribution
+        2.0
+
+        """
+        lockwds = {'loc': loc,
+                   'scale': scale}
+        self._argcheck(*args)
+        _a, _b = self._get_support(*args)
+        if func is None:
+            def fun(x, *args):
+                return x * self.pdf(x, *args, **lockwds)
+        else:
+            def fun(x, *args):
+                return func(x) * self.pdf(x, *args, **lockwds)
+        if lb is None:
+            lb = loc + _a * scale
+        if ub is None:
+            ub = loc + _b * scale
+
+        cdf_bounds = self.cdf([lb, ub], *args, **lockwds)
+        invfac = cdf_bounds[1] - cdf_bounds[0]
+
+        kwds['args'] = args
+
+        # split interval to help integrator w/ infinite support; see gh-8928
+        alpha = 0.05  # split body from tails at probability mass `alpha`
+        inner_bounds = np.array([alpha, 1-alpha])
+        cdf_inner_bounds = cdf_bounds[0] + invfac * inner_bounds
+        c, d = loc + self._ppf(cdf_inner_bounds, *args) * scale
+
+        # Do not silence warnings from integration.
+        lbc = integrate.quad(fun, lb, c, **kwds)[0]
+        cd = integrate.quad(fun, c, d, **kwds)[0]
+        dub = integrate.quad(fun, d, ub, **kwds)[0]
+        vals = (lbc + cd + dub)
+
+        if conditional:
+            vals /= invfac
+        return np.array(vals)[()]  # make it a numpy scalar like other methods
+
+    def _param_info(self):
+        shape_info = self._shape_info()
+        loc_info = _ShapeInfo("loc", False, (-np.inf, np.inf), (False, False))
+        scale_info = _ShapeInfo("scale", False, (0, np.inf), (False, False))
+        param_info = shape_info + [loc_info, scale_info]
+        return param_info
+
+    # For now, _delta_cdf is a private method.
+    def _delta_cdf(self, x1, x2, *args, loc=0, scale=1):
+        """
+        Compute CDF(x2) - CDF(x1).
+
+        Where x1 is greater than the median, compute SF(x1) - SF(x2),
+        otherwise compute CDF(x2) - CDF(x1).
+
+        This function is only useful if `dist.sf(x, ...)` has an implementation
+        that is numerically more accurate than `1 - dist.cdf(x, ...)`.
+        """
+        cdf1 = self.cdf(x1, *args, loc=loc, scale=scale)
+        # Possible optimizations (needs investigation-these might not be
+        # better):
+        # * Use xpx.apply_where instead of np.where
+        # * Instead of cdf1 > 0.5, compare x1 to the median.
+        result = np.where(cdf1 > 0.5,
+                          (self.sf(x1, *args, loc=loc, scale=scale)
+                           - self.sf(x2, *args, loc=loc, scale=scale)),
+                          self.cdf(x2, *args, loc=loc, scale=scale) - cdf1)
+        if result.ndim == 0:
+            result = result[()]
+        return result
+
+
+# Helpers for the discrete distributions
+def _drv2_moment(self, n, *args):
+    """Non-central moment of discrete distribution."""
+    def fun(x):
+        return np.power(x, n) * self._pmf(x, *args)
+
+    _a, _b = self._get_support(*args)
+    return _expect(fun, _a, _b, self._ppf(0.5, *args), self.inc)
+
+
+def _drv2_ppfsingle(self, q, *args):  # Use basic bisection algorithm
+    _a, _b = self._get_support(*args)
+    b = _b
+    a = _a
+
+    step = 10
+    if isinf(b):            # Be sure ending point is > q
+        b = float(max(100*q, 10))
+        while 1:
+            if b >= _b:
+                qb = 1.0
+                break
+            qb = self._cdf(b, *args)
+            if (qb < q):
+                b += step
+                step *= 2
+            else:
+                break
+    else:
+        qb = 1.0
+
+    step = 10
+    if isinf(a):    # be sure starting point < q
+        a = float(min(-100*q, -10))
+        while 1:
+            if a <= _a:
+                qb = 0.0
+                break
+            qa = self._cdf(a, *args)
+            if (qa > q):
+                a -= step
+                step *= 2
+            else:
+                break
+    else:
+        qa = self._cdf(a, *args)
+
+    if np.isinf(a) or np.isinf(b):
+        message = "Arguments that bracket the requested quantile could not be found."
+        raise RuntimeError(message)
+
+    # maximum number of bisections within the normal float64s
+    # maxiter = int(np.log2(finfo.max) - np.log2(finfo.smallest_normal))
+    maxiter = 2046
+    for i in range(maxiter):
+        if (qa == q):
+            return a
+        if (qb == q):
+            return b
+        if b <= a+1:
+            if qa > q:
+                return a
+            else:
+                return b
+        c = int((a+b)/2.0)
+        qc = self._cdf(c, *args)
+        if (qc < q):
+            if a != c:
+                a = c
+            else:
+                raise RuntimeError('updating stopped, endless loop')
+            qa = qc
+        elif (qc > q):
+            if b != c:
+                b = c
+            else:
+                raise RuntimeError('updating stopped, endless loop')
+            qb = qc
+        else:
+            return c
+
+
+# Must over-ride one of _pmf or _cdf or pass in
+#  x_k, p(x_k) lists in initialization
+
+
+class rv_discrete(rv_generic):
+    """A generic discrete random variable class meant for subclassing.
+
+    `rv_discrete` is a base class to construct specific distribution classes
+    and instances for discrete random variables. It can also be used
+    to construct an arbitrary distribution defined by a list of support
+    points and corresponding probabilities.
+
+    Parameters
+    ----------
+    a : float, optional
+        Lower bound of the support of the distribution, default: 0
+    b : float, optional
+        Upper bound of the support of the distribution, default: plus infinity
+    moment_tol : float, optional
+        The tolerance for the generic calculation of moments.
+    values : tuple of two array_like, optional
+        ``(xk, pk)`` where ``xk`` are integers and ``pk`` are the non-zero
+        probabilities between 0 and 1 with ``sum(pk) = 1``. ``xk``
+        and ``pk`` must have the same shape, and ``xk`` must be unique.
+    inc : integer, optional
+        Increment for the support of the distribution.
+        Default is 1. (other values have not been tested)
+    badvalue : float, optional
+        The value placed in result arrays wherever some argument
+        restriction is violated, default is np.nan.
+    name : str, optional
+        The name of the instance. This string is used to construct the default
+        example for distributions.
+    longname : str, optional
+        This string is used as part of the first line of the docstring returned
+        when a subclass has no docstring of its own. Note: `longname` exists
+        for backwards compatibility, do not use for new subclasses.
+    shapes : str, optional
+        The shape of the distribution. For example "m, n" for a distribution
+        that takes two integers as the two shape arguments for all its methods
+        If not provided, shape parameters will be inferred from
+        the signatures of the private methods, ``_pmf`` and ``_cdf`` of
+        the instance.
+    seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
+        If `seed` is None (or `np.random`), the `numpy.random.RandomState`
+        singleton is used.
+        If `seed` is an int, a new ``RandomState`` instance is used,
+        seeded with `seed`.
+        If `seed` is already a ``Generator`` or ``RandomState`` instance then
+        that instance is used.
+
+    Attributes
+    ----------
+    a, b : float, optional
+        Lower/upper bound of the support of the unshifted/unscaled distribution.
+        This value is unaffected by the `loc` and `scale` parameters.
+        To calculate the support of the shifted/scaled distribution,
+        use the `support` method.
+
+    Methods
+    -------
+    rvs
+    pmf
+    logpmf
+    cdf
+    logcdf
+    sf
+    logsf
+    ppf
+    isf
+    moment
+    stats
+    entropy
+    expect
+    median
+    mean
+    std
+    var
+    interval
+    __call__
+    support
+
+    Notes
+    -----
+    This class is similar to `rv_continuous`. Whether a shape parameter is
+    valid is decided by an ``_argcheck`` method (which defaults to checking
+    that its arguments are strictly positive.)
+    The main differences are as follows.
+
+    - The support of the distribution is a set of integers.
+    - Instead of the probability density function, ``pdf`` (and the
+      corresponding private ``_pdf``), this class defines the
+      *probability mass function*, `pmf` (and the corresponding
+      private ``_pmf``.)
+    - There is no ``scale`` parameter.
+    - The default implementations of methods (e.g. ``_cdf``) are not designed
+      for distributions with support that is unbounded below (i.e.
+      ``a=-np.inf``), so they must be overridden.
+
+    To create a new discrete distribution, we would do the following:
+
+    >>> from scipy.stats import rv_discrete
+    >>> class poisson_gen(rv_discrete):
+    ...     "Poisson distribution"
+    ...     def _pmf(self, k, mu):
+    ...         return exp(-mu) * mu**k / factorial(k)
+
+    and create an instance::
+
+    >>> poisson = poisson_gen(name="poisson")
+
+    Note that above we defined the Poisson distribution in the standard form.
+    Shifting the distribution can be done by providing the ``loc`` parameter
+    to the methods of the instance. For example, ``poisson.pmf(x, mu, loc)``
+    delegates the work to ``poisson._pmf(x-loc, mu)``.
+
+    **Discrete distributions from a list of probabilities**
+
+    Alternatively, you can construct an arbitrary discrete rv defined
+    on a finite set of values ``xk`` with ``Prob{X=xk} = pk`` by using the
+    ``values`` keyword argument to the `rv_discrete` constructor.
+
+    **Deepcopying / Pickling**
+
+    If a distribution or frozen distribution is deepcopied (pickled/unpickled,
+    etc.), any underlying random number generator is deepcopied with it. An
+    implication is that if a distribution relies on the singleton RandomState
+    before copying, it will rely on a copy of that random state after copying,
+    and ``np.random.seed`` will no longer control the state.
+
+    Examples
+    --------
+    Custom made discrete distribution:
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> xk = np.arange(7)
+    >>> pk = (0.1, 0.2, 0.3, 0.1, 0.1, 0.0, 0.2)
+    >>> custm = stats.rv_discrete(name='custm', values=(xk, pk))
+    >>>
+    >>> import matplotlib.pyplot as plt
+    >>> fig, ax = plt.subplots(1, 1)
+    >>> ax.plot(xk, custm.pmf(xk), 'ro', ms=12, mec='r')
+    >>> ax.vlines(xk, 0, custm.pmf(xk), colors='r', lw=4)
+    >>> plt.show()
+
+    Random number generation:
+
+    >>> R = custm.rvs(size=100)
+
+    """
+    def __new__(cls, a=0, b=inf, name=None, badvalue=None,
+                moment_tol=1e-8, values=None, inc=1, longname=None,
+                shapes=None, seed=None):
+
+        if values is not None:
+            # dispatch to a subclass
+            return super().__new__(rv_sample)
+        else:
+            # business as usual
+            return super().__new__(cls)
+
+    def __init__(self, a=0, b=inf, name=None, badvalue=None,
+                 moment_tol=1e-8, values=None, inc=1, longname=None,
+                 shapes=None, seed=None):
+
+        super().__init__(seed)
+
+        # cf generic freeze
+        self._ctor_param = dict(
+            a=a, b=b, name=name, badvalue=badvalue,
+            moment_tol=moment_tol, values=values, inc=inc,
+            longname=longname, shapes=shapes, seed=seed)
+
+        if badvalue is None:
+            badvalue = nan
+        self.badvalue = badvalue
+        self.a = a
+        self.b = b
+        self.moment_tol = moment_tol
+        self.inc = inc
+        self.shapes = shapes
+
+        if values is not None:
+            raise ValueError("rv_discrete.__init__(..., values != None, ...)")
+
+        self._construct_argparser(meths_to_inspect=[self._pmf, self._cdf],
+                                  locscale_in='loc=0',
+                                  # scale=1 for discrete RVs
+                                  locscale_out='loc, 1')
+        self._attach_methods()
+        self._construct_docstrings(name, longname)
+
+    def __getstate__(self):
+        dct = self.__dict__.copy()
+        # these methods will be remade in __setstate__
+        attrs = ["_parse_args", "_parse_args_stats", "_parse_args_rvs",
+                 "_cdfvec", "_ppfvec", "generic_moment"]
+        [dct.pop(attr, None) for attr in attrs]
+        return dct
+
+    def _attach_methods(self):
+        """Attaches dynamically created methods to the rv_discrete instance."""
+        self._cdfvec = vectorize(self._cdf_single, otypes='d')
+        self.vecentropy = vectorize(self._entropy)
+
+        # _attach_methods is responsible for calling _attach_argparser_methods
+        self._attach_argparser_methods()
+
+        # nin correction needs to be after we know numargs
+        # correct nin for generic moment vectorization
+        _vec_generic_moment = vectorize(_drv2_moment, otypes='d')
+        _vec_generic_moment.nin = self.numargs + 2
+        self.generic_moment = types.MethodType(_vec_generic_moment, self)
+
+        # correct nin for ppf vectorization
+        _vppf = vectorize(_drv2_ppfsingle, otypes='d')
+        _vppf.nin = self.numargs + 2
+        self._ppfvec = types.MethodType(_vppf, self)
+
+        # now that self.numargs is defined, we can adjust nin
+        self._cdfvec.nin = self.numargs + 1
+
+    def _construct_docstrings(self, name, longname):
+        if name is None:
+            name = 'Distribution'
+        self.name = name
+
+        # generate docstring for subclass instances
+        if longname is None:
+            if name[0] in ['aeiouAEIOU']:
+                hstr = "An "
+            else:
+                hstr = "A "
+            longname = hstr + name
+
+        if sys.flags.optimize < 2:
+            # Skip adding docstrings if interpreter is run with -OO
+            if self.__doc__ is None:
+                self._construct_default_doc(longname=longname,
+                                            docdict=docdict_discrete,
+                                            discrete='discrete')
+            else:
+                dct = dict(distdiscrete)
+                self._construct_doc(docdict_discrete, dct.get(self.name))
+
+            # discrete RV do not have the scale parameter, remove it
+            self.__doc__ = self.__doc__.replace(
+                '\n    scale : array_like, '
+                'optional\n        scale parameter (default=1)', '')
+
+    def _updated_ctor_param(self):
+        """Return the current version of _ctor_param, possibly updated by user.
+
+        Used by freezing.
+        Keep this in sync with the signature of __init__.
+        """
+        dct = self._ctor_param.copy()
+        dct['a'] = self.a
+        dct['b'] = self.b
+        dct['badvalue'] = self.badvalue
+        dct['moment_tol'] = self.moment_tol
+        dct['inc'] = self.inc
+        dct['name'] = self.name
+        dct['shapes'] = self.shapes
+        return dct
+
+    def _nonzero(self, k, *args):
+        return floor(k) == k
+
+    def _pmf(self, k, *args):
+        return self._cdf(k, *args) - self._cdf(k-1, *args)
+
+    def _logpmf(self, k, *args):
+        with np.errstate(divide='ignore'):
+            return log(self._pmf(k, *args))
+
+    def _logpxf(self, k, *args):
+        # continuous distributions have PDF, discrete have PMF, but sometimes
+        # the distinction doesn't matter. This lets us use `_logpxf` for both
+        # discrete and continuous distributions.
+        return self._logpmf(k, *args)
+
+    def _unpack_loc_scale(self, theta):
+        try:
+            loc = theta[-1]
+            scale = 1
+            args = tuple(theta[:-1])
+        except IndexError as e:
+            raise ValueError("Not enough input arguments.") from e
+        return loc, scale, args
+
+    def _cdf_single(self, k, *args):
+        _a, _b = self._get_support(*args)
+        m = arange(int(_a), k+1)
+        return np.sum(self._pmf(m, *args), axis=0)
+
+    def _cdf(self, x, *args):
+        k = floor(x).astype(np.float64)
+        return self._cdfvec(k, *args)
+
+    # generic _logcdf, _sf, _logsf, _ppf, _isf, _rvs defined in rv_generic
+
+    def rvs(self, *args, **kwargs):
+        """Random variates of given type.
+
+        Parameters
+        ----------
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information).
+        loc : array_like, optional
+            Location parameter (default=0).
+        size : int or tuple of ints, optional
+            Defining number of random variates (Default is 1). Note that `size`
+            has to be given as keyword, not as positional argument.
+        random_state : {None, int, `numpy.random.Generator`,
+                        `numpy.random.RandomState`}, optional
+
+            If `random_state` is None (or `np.random`), the
+            `numpy.random.RandomState` singleton is used.
+            If `random_state` is an int, a new ``RandomState`` instance is
+            used, seeded with `random_state`.
+            If `random_state` is already a ``Generator`` or ``RandomState``
+            instance, that instance is used.
+
+        Returns
+        -------
+        rvs : ndarray or scalar
+            Random variates of given `size`.
+
+        """
+        kwargs['discrete'] = True
+        return super().rvs(*args, **kwargs)
+
+    def pmf(self, k, *args, **kwds):
+        """Probability mass function at k of the given RV.
+
+        Parameters
+        ----------
+        k : array_like
+            Quantiles.
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information)
+        loc : array_like, optional
+            Location parameter (default=0).
+
+        Returns
+        -------
+        pmf : array_like
+            Probability mass function evaluated at k
+
+        """
+        args, loc, _ = self._parse_args(*args, **kwds)
+        k, loc = map(asarray, (k, loc))
+        args = tuple(map(asarray, args))
+        _a, _b = self._get_support(*args)
+        k = asarray(k-loc)
+        cond0 = self._argcheck(*args)
+        cond1 = (k >= _a) & (k <= _b)
+        if not isinstance(self, rv_sample):
+            cond1 = cond1 & self._nonzero(k, *args)
+        cond = cond0 & cond1
+        output = zeros(shape(cond), 'd')
+        place(output, (1-cond0) + np.isnan(k), self.badvalue)
+        if np.any(cond):
+            goodargs = argsreduce(cond, *((k,)+args))
+            place(output, cond, np.clip(self._pmf(*goodargs), 0, 1))
+        if output.ndim == 0:
+            return output[()]
+        return output
+
+    def logpmf(self, k, *args, **kwds):
+        """Log of the probability mass function at k of the given RV.
+
+        Parameters
+        ----------
+        k : array_like
+            Quantiles.
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information).
+        loc : array_like, optional
+            Location parameter. Default is 0.
+
+        Returns
+        -------
+        logpmf : array_like
+            Log of the probability mass function evaluated at k. Entries for
+            which `_argcheck` rejects the shape parameters, or for which `k`
+            is NaN, are set to ``self.badvalue``; remaining entries that do
+            not satisfy the support condition are ``-inf``.
+
+        """
+        args, loc, _ = self._parse_args(*args, **kwds)
+        k, loc = map(asarray, (k, loc))
+        args = tuple(map(asarray, args))
+        _a, _b = self._get_support(*args)
+        # work with the unshifted variable from here on
+        k = asarray(k-loc)
+        # cond0: shape parameters pass _argcheck
+        # cond1: k lies within the closed support [_a, _b]
+        cond0 = self._argcheck(*args)
+        cond1 = (k >= _a) & (k <= _b)
+        if not isinstance(self, rv_sample):
+            cond1 = cond1 & self._nonzero(k, *args)
+        cond = cond0 & cond1
+        # start from log(0) = -inf everywhere
+        output = empty(shape(cond), 'd')
+        output.fill(-inf)
+        place(output, (1-cond0) + np.isnan(k), self.badvalue)
+        if np.any(cond):
+            # evaluate _logpmf on the selected entries only
+            goodargs = argsreduce(cond, *((k,)+args))
+            place(output, cond, self._logpmf(*goodargs))
+        # unwrap a 0-d array into a scalar
+        if output.ndim == 0:
+            return output[()]
+        return output
+
+    def cdf(self, k, *args, **kwds):
+        """Cumulative distribution function of the given RV.
+
+        Parameters
+        ----------
+        k : array_like, int
+            Quantiles.
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information).
+        loc : array_like, optional
+            Location parameter (default=0).
+
+        Returns
+        -------
+        cdf : ndarray
+            Cumulative distribution function evaluated at `k`. Entries for
+            which `_argcheck` rejects the shape parameters, or for which `k`
+            is NaN, are set to ``self.badvalue``.
+
+        """
+        args, loc, _ = self._parse_args(*args, **kwds)
+        k, loc = map(asarray, (k, loc))
+        args = tuple(map(asarray, args))
+        _a, _b = self._get_support(*args)
+        # work with the unshifted variable from here on
+        k = asarray(k-loc)
+        # cond0: shape parameters pass _argcheck
+        # cond1: _a <= k < _b, where _cdf has to be evaluated
+        # cond2: k at or beyond the upper end of the support -> cdf is 1
+        # cond3: k == -inf -> cdf is 0
+        cond0 = self._argcheck(*args)
+        cond1 = (k >= _a) & (k < _b)
+        cond2 = (k >= _b)
+        cond3 = np.isneginf(k)
+        cond = cond0 & cond1 & np.isfinite(k)
+
+        output = zeros(shape(cond), 'd')
+        # `cond0 == cond0` is all-True with the shape of cond0; multiplying by
+        # it broadcasts the k-shaped masks against the shape parameters
+        place(output, cond2*(cond0 == cond0), 1.0)
+        place(output, cond3*(cond0 == cond0), 0.0)
+        # written last so that invalid parameters / NaN override the fills above
+        place(output, (1-cond0) + np.isnan(k), self.badvalue)
+
+        if np.any(cond):
+            # evaluate _cdf on the selected entries only and clip to [0, 1]
+            goodargs = argsreduce(cond, *((k,)+args))
+            place(output, cond, np.clip(self._cdf(*goodargs), 0, 1))
+        # unwrap a 0-d array into a scalar
+        if output.ndim == 0:
+            return output[()]
+        return output
+
+    def logcdf(self, k, *args, **kwds):
+        """Log of the cumulative distribution function at k of the given RV.
+
+        Parameters
+        ----------
+        k : array_like, int
+            Quantiles.
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information).
+        loc : array_like, optional
+            Location parameter (default=0).
+
+        Returns
+        -------
+        logcdf : array_like
+            Log of the cumulative distribution function evaluated at k.
+            Entries for which `_argcheck` rejects the shape parameters, or
+            for which `k` is NaN, are set to ``self.badvalue``.
+
+        """
+        args, loc, _ = self._parse_args(*args, **kwds)
+        k, loc = map(asarray, (k, loc))
+        args = tuple(map(asarray, args))
+        _a, _b = self._get_support(*args)
+        # work with the unshifted variable from here on
+        k = asarray(k-loc)
+        # cond0: shape parameters pass _argcheck
+        # cond1: _a <= k < _b, where _logcdf has to be evaluated
+        # cond2: k at or beyond the upper end of the support -> log(1) = 0
+        cond0 = self._argcheck(*args)
+        cond1 = (k >= _a) & (k < _b)
+        cond2 = (k >= _b)
+        cond = cond0 & cond1
+        # start from log(0) = -inf everywhere
+        output = empty(shape(cond), 'd')
+        output.fill(-inf)
+        # `cond0 == cond0` is all-True with the shape of cond0; multiplying by
+        # it broadcasts the k-shaped mask against the shape parameters
+        place(output, cond2*(cond0 == cond0), 0.0)
+        # badvalue is written last (as in `cdf`) so that invalid shape
+        # parameters are not overwritten by the log(1) = 0 fill above
+        place(output, (1-cond0) + np.isnan(k), self.badvalue)
+
+        if np.any(cond):
+            # evaluate _logcdf on the selected entries only
+            goodargs = argsreduce(cond, *((k,)+args))
+            place(output, cond, self._logcdf(*goodargs))
+        # unwrap a 0-d array into a scalar
+        if output.ndim == 0:
+            return output[()]
+        return output
+
+    def sf(self, k, *args, **kwds):
+        """Survival function (1 - `cdf`) at k of the given RV.
+
+        Parameters
+        ----------
+        k : array_like
+            Quantiles.
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information).
+        loc : array_like, optional
+            Location parameter (default=0).
+
+        Returns
+        -------
+        sf : array_like
+            Survival function evaluated at k. Entries for which `_argcheck`
+            rejects the shape parameters, or for which `k` is NaN, are set
+            to ``self.badvalue``.
+
+        """
+        args, loc, _ = self._parse_args(*args, **kwds)
+        k, loc = map(asarray, (k, loc))
+        args = tuple(map(asarray, args))
+        _a, _b = self._get_support(*args)
+        # work with the unshifted variable from here on
+        k = asarray(k-loc)
+        # cond0: shape parameters pass _argcheck
+        # cond1: _a <= k < _b, where _sf has to be evaluated
+        # cond2: valid parameters and k below the support (or -inf) -> sf is 1
+        cond0 = self._argcheck(*args)
+        cond1 = (k >= _a) & (k < _b)
+        cond2 = ((k < _a) | np.isneginf(k)) & cond0
+        cond = cond0 & cond1 & np.isfinite(k)
+        # start from 0 everywhere (the value for k >= _b)
+        output = zeros(shape(cond), 'd')
+        place(output, (1-cond0) + np.isnan(k), self.badvalue)
+        # cond2 already includes cond0, so this cannot overwrite a badvalue
+        # caused by invalid shape parameters
+        place(output, cond2, 1.0)
+        if np.any(cond):
+            # evaluate _sf on the selected entries only and clip to [0, 1]
+            goodargs = argsreduce(cond, *((k,)+args))
+            place(output, cond, np.clip(self._sf(*goodargs), 0, 1))
+        # unwrap a 0-d array into a scalar
+        if output.ndim == 0:
+            return output[()]
+        return output
+
+    def logsf(self, k, *args, **kwds):
+        """Log of the survival function of the given RV.
+
+        Returns the log of the "survival function," defined as 1 - `cdf`,
+        evaluated at `k`.
+
+        Parameters
+        ----------
+        k : array_like
+            Quantiles.
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information).
+        loc : array_like, optional
+            Location parameter (default=0).
+
+        Returns
+        -------
+        logsf : ndarray
+            Log of the survival function evaluated at `k`. Entries for which
+            `_argcheck` rejects the shape parameters, or for which `k` is
+            NaN, are set to ``self.badvalue``.
+
+        """
+        args, loc, _ = self._parse_args(*args, **kwds)
+        k, loc = map(asarray, (k, loc))
+        args = tuple(map(asarray, args))
+        _a, _b = self._get_support(*args)
+        # work with the unshifted variable from here on
+        k = asarray(k-loc)
+        # cond0: shape parameters pass _argcheck
+        # cond1: _a <= k < _b, where _logsf has to be evaluated
+        # cond2: valid parameters and k below the support -> log(1) = 0
+        cond0 = self._argcheck(*args)
+        cond1 = (k >= _a) & (k < _b)
+        cond2 = (k < _a) & cond0
+        cond = cond0 & cond1
+        # start from log(0) = -inf everywhere (the value for k >= _b)
+        output = empty(shape(cond), 'd')
+        output.fill(-inf)
+        place(output, (1-cond0) + np.isnan(k), self.badvalue)
+        # cond2 already includes cond0, so this cannot overwrite a badvalue
+        # caused by invalid shape parameters
+        place(output, cond2, 0.0)
+        if np.any(cond):
+            # evaluate _logsf on the selected entries only
+            goodargs = argsreduce(cond, *((k,)+args))
+            place(output, cond, self._logsf(*goodargs))
+        # unwrap a 0-d array into a scalar
+        if output.ndim == 0:
+            return output[()]
+        return output
+
+    def ppf(self, q, *args, **kwds):
+        """Percent point function (inverse of `cdf`) at q of the given RV.
+
+        Parameters
+        ----------
+        q : array_like
+            Lower tail probability.
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information).
+        loc : array_like, optional
+            Location parameter (default=0).
+
+        Returns
+        -------
+        k : array_like
+            Quantile corresponding to the lower tail probability, q.
+            Entries with invalid shape parameters, a NaN `loc`, or `q`
+            outside ``[0, 1]`` are set to ``self.badvalue``.
+
+        Notes
+        -----
+        For discrete distributions, the `cdf` is not strictly invertible. By convention,
+        this method returns the minimum value `k` for which the `cdf` at `k` is at
+        least `q`. There is one exception:  the `ppf` of ``0`` is ``a-1``,
+        where ``a`` is the left endpoint of the support.
+
+        """
+        args, loc, _ = self._parse_args(*args, **kwds)
+        q, loc = map(asarray, (q, loc))
+        args = tuple(map(asarray, args))
+        _a, _b = self._get_support(*args)
+        # `loc == loc` is False exactly where loc is NaN, so a NaN location
+        # is treated like an invalid shape parameter
+        cond0 = self._argcheck(*args) & (loc == loc)
+        # cond1: q strictly inside (0, 1), where _ppf has to be evaluated
+        # cond2 / cond3: the end points q == 0 and q == 1 with valid parameters
+        cond1 = (q > 0) & (q < 1)
+        cond2 = (q == 0) & cond0
+        cond3 = (q == 1) & cond0
+        cond = cond0 & cond1
+        # everything not overwritten below keeps self.badvalue
+        output = np.full(shape(cond), fill_value=self.badvalue, dtype='d')
+        # output type 'd' to handle nan and inf
+
+        # end points: q == 0 maps to a-1 (shifted by loc), q == 1 maps to b
+        place(output, cond2, argsreduce(cond2, _a-1 + loc)[0])
+        place(output, cond3, argsreduce(cond3, _b + loc)[0])
+        if np.any(cond):
+            # loc is reduced together with q and the shapes, then split off
+            # again so it can be added back to the unshifted quantiles
+            goodargs = argsreduce(cond, *((q,)+args+(loc,)))
+            loc, goodargs = goodargs[-1], goodargs[:-1]
+            place(output, cond, self._ppf(*goodargs) + loc)
+
+        # unwrap a 0-d array into a scalar
+        if output.ndim == 0:
+            return output[()]
+        return output
+
+    def isf(self, q, *args, **kwds):
+        """Inverse survival function (inverse of `sf`) at q of the given RV.
+
+        Parameters
+        ----------
+        q : array_like
+            Upper tail probability.
+        arg1, arg2, arg3,... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information).
+        loc : array_like, optional
+            Location parameter (default=0).
+
+        Returns
+        -------
+        k : ndarray or scalar
+            Quantile corresponding to the upper tail probability, q.
+            Entries with invalid shape parameters, a NaN `loc`, or `q`
+            outside ``[0, 1]`` are set to ``self.badvalue``.
+
+        Notes
+        -----
+        For discrete distributions, the `sf` is not strictly invertible. By convention,
+        this method returns the minimum value `k` for which the `sf` at `k` is
+        no greater than `q`. There is one exception: the `isf` of ``1`` is ``a-1``,
+        where ``a`` is the left endpoint of the support.
+
+        """
+        args, loc, _ = self._parse_args(*args, **kwds)
+        q, loc = map(asarray, (q, loc))
+        args = tuple(map(asarray, args))
+        _a, _b = self._get_support(*args)
+        # `loc == loc` is False exactly where loc is NaN, so a NaN location
+        # is treated like an invalid shape parameter
+        cond0 = self._argcheck(*args) & (loc == loc)
+        # cond1: q strictly inside (0, 1), where _isf has to be evaluated
+        # cond2 / cond3: the end points q == 1 and q == 0 with valid parameters
+        cond1 = (q > 0) & (q < 1)
+        cond2 = (q == 1) & cond0
+        cond3 = (q == 0) & cond0
+        cond = cond0 & cond1
+
+        # same problem as with ppf; copied from ppf and changed
+        # everything not overwritten below keeps self.badvalue
+        output = np.full(shape(cond), fill_value=self.badvalue, dtype='d')
+        # output type 'd' to handle nan and inf
+        # end points: q == 1 maps to a-1 (shifted by loc), q == 0 maps to b
+        lower_bound = _a - 1 + loc
+        upper_bound = _b + loc
+        place(output, cond2, argsreduce(cond2, lower_bound)[0])
+        place(output, cond3, argsreduce(cond3, upper_bound)[0])
+
+        # call place only if at least 1 valid argument
+        if np.any(cond):
+            # loc is reduced together with q and the shapes, then split off
+            # again so it can be added back to the unshifted quantiles
+            goodargs = argsreduce(cond, *((q,)+args+(loc,)))
+            loc, goodargs = goodargs[-1], goodargs[:-1]
+            # PB same as ticket 766
+            place(output, cond, self._isf(*goodargs) + loc)
+
+        # unwrap a 0-d array into a scalar
+        if output.ndim == 0:
+            return output[()]
+        return output
+
+    def _entropy(self, *args):
+        """Entropy of the distribution for the given shape parameters.
+
+        When the instance carries an explicit ``pk`` attribute the entropy is
+        computed from it directly; otherwise ``entr(pmf)`` is summed over the
+        support, starting from the median.
+        """
+        if hasattr(self, 'pk'):
+            return stats.entropy(self.pk)
+
+        lower, upper = self._get_support(*args)
+
+        def summand(x):
+            return entr(self._pmf(x, *args))
+
+        median = self._ppf(0.5, *args)
+        return _expect(summand, lower, upper, median, self.inc)
+
+    def expect(self, func=None, args=(), loc=0, lb=None, ub=None,
+               conditional=False, maxcount=1000, tolerance=1e-10, chunksize=32):
+        """
+        Expected value of a function with respect to the distribution,
+        computed for a discrete distribution by numerical summation.
+
+        Parameters
+        ----------
+        func : callable, optional
+            Function for which the expectation value is calculated.
+            Takes only one argument.
+            The default is the identity mapping f(k) = k.
+        args : tuple, optional
+            Shape parameters of the distribution.
+        loc : float, optional
+            Location parameter.
+            Default is 0.
+        lb, ub : int, optional
+            Lower and upper bound for the summation, default is set to the
+            support of the distribution, inclusive (``lb <= k <= ub``).
+        conditional : bool, optional
+            If true then the expectation is corrected by the conditional
+            probability of the summation interval. The return value is the
+            expectation of the function, `func`, conditional on being in
+            the given interval (k such that ``lb <= k <= ub``).
+            Default is False.
+        maxcount : int, optional
+            Maximal number of terms to evaluate (to avoid an endless loop for
+            an infinite sum). Default is 1000.
+        tolerance : float, optional
+            Absolute tolerance for the summation. Default is 1e-10.
+        chunksize : int, optional
+            Iterate over the support of a distribution in chunks of this size.
+            Default is 32.
+
+        Returns
+        -------
+        expect : float
+            Expected value.
+
+        Notes
+        -----
+        For heavy-tailed distributions, the expected value may or
+        may not exist, depending on the function, `func`. If it does exist,
+        but the sum converges slowly, the accuracy of the result may be
+        rather low. For instance, for ``zipf(4)``, accuracy for mean,
+        variance in example is only 1e-5.
+        Increasing `maxcount` and/or `chunksize` may improve the result,
+        but may also make zipf very slow.
+
+        The function is not vectorized.
+
+        """
+        # Although `args` is just the shape parameters, `poisson_binom` needs this
+        # to split the vector-valued shape into a tuple of separate shapes
+        shape_args, _, _ = self._parse_args(*args)
+
+        # with no `func`, the integrand reduces to (x + loc) * pmf(x)
+        transform = (lambda v: v) if func is None else func
+
+        def fun(x):
+            # loc and shape_args are taken from the enclosing scope; the
+            # summation runs over the unshifted variable, hence `x+loc`
+            return transform(x+loc)*self._pmf(x, *shape_args)
+
+        # user-supplied bounds are converted to the unshifted distribution;
+        # missing bounds default to the support end points
+        support_lo, support_hi = self._get_support(*shape_args)
+        lb = support_lo if lb is None else lb - loc
+        ub = support_hi if ub is None else ub - loc
+
+        # probability mass of [lb, ub], used to normalise a conditional result
+        if conditional:
+            invfac = self.sf(lb-1, *shape_args) - self.sf(ub, *shape_args)
+        else:
+            invfac = 1.0
+
+        if isinstance(self, rv_sample):
+            return self._expect(fun, lb, ub) / invfac
+
+        # iterate over the support, starting from the median
+        median = self._ppf(0.5, *shape_args)
+        total = _expect(fun, lb, ub, median, self.inc, maxcount, tolerance,
+                        chunksize)
+        return total / invfac
+
+    def _param_info(self):
+        """Return the shape-parameter info list followed by an entry for ``loc``."""
+        infos = self._shape_info()
+        return infos + [
+            _ShapeInfo("loc", True, (-np.inf, np.inf), (False, False)),
+        ]
+
+
+def _expect(fun, lb, ub, x0, inc, maxcount=1000, tolerance=1e-10,
+            chunksize=32):
+    """Helper for computing the expectation value of `fun`.
+
+    Sums ``fun`` over ``lb, lb+inc, ..., ub``. A support spanning no more
+    than `chunksize` is summed in one go; otherwise the sum proceeds in
+    chunks, first upwards from `x0` and then downwards from ``x0 - 1``.
+    Each direction stops once a chunk contributes less than
+    ``tolerance * chunk.size`` in absolute value. A `RuntimeWarning` is
+    issued when more than `maxcount` terms have been evaluated without
+    meeting that criterion.
+    """
+    # short-circuit if the support size is small enough
+    if (ub - lb) <= chunksize:
+        return np.sum(fun(np.arange(lb, ub+1, inc)))
+
+    # otherwise, iterate starting from x0, clamped into [lb, ub]
+    if x0 < lb:
+        x0 = lb
+    if x0 > ub:
+        x0 = ub
+
+    n_terms = 0
+    total = 0.
+
+    # iterate over [x0, ub] inclusive
+    for chunk in _iter_chunked(x0, ub+1, chunksize=chunksize, inc=inc):
+        n_terms += chunk.size
+        contribution = np.sum(fun(chunk))
+        total += contribution
+        if abs(contribution) < tolerance * chunk.size:
+            break
+        if n_terms > maxcount:
+            # give up entirely: the downward pass is skipped as well
+            warnings.warn('expect(): sum did not converge',
+                          RuntimeWarning, stacklevel=3)
+            return total
+
+    # iterate over [lb, x0)
+    for chunk in _iter_chunked(x0-1, lb-1, chunksize=chunksize, inc=-inc):
+        n_terms += chunk.size
+        contribution = np.sum(fun(chunk))
+        total += contribution
+        if abs(contribution) < tolerance * chunk.size:
+            break
+        if n_terms > maxcount:
+            warnings.warn('expect(): sum did not converge',
+                          RuntimeWarning, stacklevel=3)
+            break
+
+    return total
+
+
+def _iter_chunked(x0, x1, chunksize=4, inc=1):
+    """Yield arrays covering x0 (inclusive) to x1 (exclusive) in steps of inc.
+
+    Each yielded array holds at most `chunksize` points.
+    x0 must be finite, x1 need not be. In the latter case, the iterator is
+    infinite.
+    Handles both x0 < x1 and x0 > x1. In the latter case, iterates downwards
+    (make sure to set inc < 0.)
+
+    >>> from scipy.stats._distn_infrastructure import _iter_chunked
+    >>> [x for x in _iter_chunked(2, 5, inc=2)]
+    [array([2, 4])]
+    >>> [x for x in _iter_chunked(2, 11, inc=2)]
+    [array([2, 4, 6, 8]), array([10])]
+    >>> [x for x in _iter_chunked(2, -5, inc=-2)]
+    [array([ 2,  0, -2, -4])]
+    >>> [x for x in _iter_chunked(2, -9, inc=-2)]
+    [array([ 2,  0, -2, -4]), array([-6, -8])]
+
+    """
+    if inc == 0:
+        raise ValueError('Cannot increment by zero.')
+    if chunksize <= 0:
+        raise ValueError(f'Chunk size must be positive; got {chunksize}.')
+
+    direction = 1 if inc > 0 else -1
+    # distance covered by one full chunk
+    max_span = abs(chunksize * inc)
+
+    cursor = np.copy(x0)
+    # the sign of inc makes this test valid for both directions
+    while (cursor - x1) * inc < 0:
+        # the last chunk is shortened so that it never reaches x1
+        advance = min(max_span, abs(cursor - x1)) * direction
+        chunk = np.arange(cursor, cursor + advance, inc)
+        cursor += advance
+        yield chunk
+
+
+class rv_sample(rv_discrete):
+    """A 'sample' discrete distribution defined by the support and values.
+
+    The ctor ignores most of the arguments, only needs the `values` argument.
+    `values` is a pair ``(xk, pk)`` of equally shaped array_likes holding the
+    support points and their probabilities; `pk` must be non-negative and
+    sum to 1, and `xk` must not contain duplicates.
+    """
+
+    def __init__(self, a=0, b=inf, name=None, badvalue=None,
+                 moment_tol=1e-8, values=None, inc=1, longname=None,
+                 shapes=None, seed=None):
+
+        # deliberately start the MRO lookup *after* rv_discrete, so that
+        # rv_discrete.__init__ itself is not run
+        super(rv_discrete, self).__init__(seed)
+
+        if values is None:
+            raise ValueError("rv_sample.__init__(..., values=None,...)")
+
+        # cf generic freeze
+        self._ctor_param = dict(
+            a=a, b=b, name=name, badvalue=badvalue,
+            moment_tol=moment_tol, values=values, inc=inc,
+            longname=longname, shapes=shapes, seed=seed)
+
+        if badvalue is None:
+            badvalue = nan
+        self.badvalue = badvalue
+        self.moment_tol = moment_tol
+        self.inc = inc
+        self.shapes = shapes
+        self.vecentropy = self._entropy
+
+        xk, pk = values
+
+        if np.shape(xk) != np.shape(pk):
+            raise ValueError("xk and pk must have the same shape.")
+        if np.less(pk, 0.0).any():
+            raise ValueError("All elements of pk must be non-negative.")
+        if not np.allclose(np.sum(pk), 1):
+            raise ValueError("The sum of provided pk is not 1.")
+        if len(set(np.ravel(xk))) != np.size(xk):
+            raise ValueError("xk may not contain duplicate values.")
+
+        # store the flattened support in increasing order, with the
+        # probabilities permuted to match
+        indx = np.argsort(np.ravel(xk))
+        self.xk = np.take(np.ravel(xk), indx, 0)
+        self.pk = np.take(np.ravel(pk), indx, 0)
+        # the `a` and `b` arguments are ignored: the support end points are
+        # the smallest and largest sample points
+        self.a = self.xk[0]
+        self.b = self.xk[-1]
+
+        # cumulative probabilities, used by _cdf and _ppf
+        self.qvals = np.cumsum(self.pk, axis=0)
+
+        self.shapes = ' '   # bypass inspection
+
+        self._construct_argparser(meths_to_inspect=[self._pmf],
+                                  locscale_in='loc=0',
+                                  # scale=1 for discrete RVs
+                                  locscale_out='loc, 1')
+
+        self._attach_methods()
+
+        self._construct_docstrings(name, longname)
+
+    def __getstate__(self):
+        """Return a copy of the instance dict without the argparser methods."""
+        dct = self.__dict__.copy()
+
+        # these methods will be remade in rv_generic.__setstate__,
+        # which calls rv_generic._attach_methods
+        for attr in ("_parse_args", "_parse_args_stats", "_parse_args_rvs"):
+            dct.pop(attr, None)
+
+        return dct
+
+    def _attach_methods(self):
+        """Attaches dynamically created argparser methods."""
+        self._attach_argparser_methods()
+
+    def _get_support(self, *args):
+        """Return the support of the (unscaled, unshifted) distribution.
+
+        Parameters
+        ----------
+        arg1, arg2, ... : array_like
+            The shape parameter(s) for the distribution (see docstring of the
+            instance object for more information). They are ignored here.
+
+        Returns
+        -------
+        a, b : numeric (float, or int or +/-np.inf)
+            end-points of the distribution's support.
+        """
+        return self.a, self.b
+
+    def _pmf(self, x):
+        # pick pk[i] where x equals xk[i]; 0 where x matches no support point
+        return np.select([x == k for k in self.xk],
+                         [np.broadcast_arrays(p, x)[0] for p in self.pk], 0)
+
+    def _cdf(self, x):
+        # `[..., None]` (as in _ppf) appends the axis along which x is
+        # compared against every support point, whatever the ndim of x
+        xx, xxk = np.broadcast_arrays(x[..., None], self.xk)
+        # index of the last support point <= x; when no support point
+        # exceeds x, argmax returns 0 and the index wraps to -1, i.e. the
+        # final cumulative value
+        indx = np.argmax(xxk > xx, axis=-1) - 1
+        return self.qvals[indx]
+
+    def _ppf(self, q):
+        # first support point whose cumulative probability reaches q
+        qq, sqq = np.broadcast_arrays(q[..., None], self.qvals)
+        indx = argmax(sqq >= qq, axis=-1)
+        return self.xk[indx]
+
+    def _rvs(self, size=None, random_state=None):
+        # Need to define it explicitly, otherwise .rvs() with size=None
+        # fails due to explicit broadcasting in _ppf
+        U = random_state.uniform(size=size)
+        if size is None:
+            U = np.array(U, ndmin=1)
+            Y = self._ppf(U)[0]
+        else:
+            Y = self._ppf(U)
+        return Y
+
+    def _entropy(self):
+        return stats.entropy(self.pk)
+
+    def generic_moment(self, n):
+        """Return the n-th raw moment(s), ``sum(xk**n * pk)``."""
+        n = asarray(n)
+        return np.sum(self.xk**n[np.newaxis, ...] * self.pk, axis=0)
+
+    def _expect(self, fun, lb, ub, *args, **kwds):
+        # ignore all args, just do a brute force summation
+        supp = self.xk[(lb <= self.xk) & (self.xk <= ub)]
+        vals = fun(supp)
+        return np.sum(vals)
+
+
+def _check_shape(argshape, size):
+    """
+    This is a utility function used by `_rvs()` in the class geninvgauss_gen.
+    It compares the tuple argshape to the tuple size.
+
+    Parameters
+    ----------
+    argshape : tuple of integers
+        Shape of the arguments.
+    size : tuple of integers or integer
+        Size argument of rvs(). A bare integer ``n`` is treated as ``(n,)``.
+
+    Returns
+    -------
+    The function returns two tuples, scalar_shape and bc.
+
+    scalar_shape : tuple
+        Shape to which the 1-d array of random variates returned by
+        _rvs_scalar() is converted when it is copied into the
+        output array of _rvs().
+
+    bc : tuple of booleans
+        bc is a tuple the same length as size. bc[j] is True if the data
+        associated with that index is generated in one call of _rvs_scalar().
+
+    """
+    # the docstring allows a bare integer, which cannot be reversed/sliced
+    if np.ndim(size) == 0:
+        size = (size,)
+
+    scalar_shape = []
+    bc = []
+    # walk both shapes from the trailing dimension, padding the shorter
+    # one with 1
+    for argdim, sizedim in zip_longest(argshape[::-1], size[::-1],
+                                       fillvalue=1):
+        if sizedim > argdim or (argdim == sizedim == 1):
+            scalar_shape.append(sizedim)
+            bc.append(True)
+        else:
+            bc.append(False)
+    return tuple(scalar_shape[::-1]), tuple(bc[::-1])
+
+
+def get_distribution_names(namespace_pairs, rv_base_class):
+    """Collect names of statistical distributions and their generators.
+
+    Parameters
+    ----------
+    namespace_pairs : sequence
+        A snapshot of (name, value) pairs in the namespace of a module.
+    rv_base_class : class
+        The base class of random variable generator classes in a module.
+
+    Returns
+    -------
+    distn_names : list of strings
+        Names of the statistical distributions.
+    distn_gen_names : list of strings
+        Names of the generators of the statistical distributions.
+        Note that these are not simply the names of the statistical
+        distributions, with a _gen suffix added.
+
+    """
+    distn_names = []
+    distn_gen_names = []
+    for name, value in namespace_pairs:
+        # private names are never reported
+        if name.startswith('_'):
+            continue
+        # `issubclass` raises TypeError for non-class values, so make sure
+        # that a `*_gen` name is actually bound to a class first
+        if (name.endswith('_gen') and isinstance(value, type)
+                and issubclass(value, rv_base_class)):
+            distn_gen_names.append(name)
+        if isinstance(value, rv_base_class):
+            distn_names.append(name)
+    return distn_names, distn_gen_names
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_distr_params.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_distr_params.py
new file mode 100644
index 0000000000000000000000000000000000000000..3234bae358fffd18b70a008a00110c6f3e348530
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_distr_params.py
@@ -0,0 +1,300 @@
+"""
+Sane parameters for stats.distributions.
+"""
+import numpy as np
+
+# Table of ``[distribution name, tuple of shape parameters]`` pairs.
+# The same name may appear more than once with different shapes.
+# NOTE(review): judging by the name and entries, these are the continuous
+# distributions of scipy.stats -- confirm against the consumers of this table.
+distcont = [
+    ['alpha', (3.5704770516650459,)],
+    ['anglit', ()],
+    ['arcsine', ()],
+    ['argus', (1.0,)],
+    ['beta', (2.3098496451481823, 0.62687954300963677)],
+    ['betaprime', (5, 6)],
+    ['bradford', (0.29891359763170633,)],
+    ['burr', (10.5, 4.3)],
+    ['burr12', (10, 4)],
+    ['cauchy', ()],
+    ['chi', (78,)],
+    ['chi2', (55,)],
+    ['cosine', ()],
+    ['crystalball', (2.0, 3.0)],
+    ['dgamma', (1.1023326088288166,)],
+    ['dpareto_lognorm', (3, 1.2, 1.5, 2)],
+    ['dweibull', (2.0685080649914673,)],
+    ['erlang', (10,)],
+    ['expon', ()],
+    ['exponnorm', (1.5,)],
+    ['exponpow', (2.697119160358469,)],
+    ['exponweib', (2.8923945291034436, 1.9505288745913174)],
+    ['f', (29, 18)],
+    ['fatiguelife', (29,)],   # correction numargs = 1
+    ['fisk', (3.0857548622253179,)],
+    ['foldcauchy', (4.7164673455831894,)],
+    ['foldnorm', (1.9521253373555869,)],
+    ['gamma', (1.9932305483800778,)],
+    ['gausshyper', (13.763771604130699, 3.1189636648681431,
+                    2.5145980350183019, 5.1811649903971615)],  # veryslow
+    ['genexpon', (9.1325976465418908, 16.231956600590632, 3.2819552690843983)],
+    ['genextreme', (-0.1,)],
+    ['gengamma', (4.4162385429431925, 3.1193091679242761)],
+    ['gengamma', (4.4162385429431925, -3.1193091679242761)],
+    ['genhalflogistic', (0.77274727809929322,)],
+    ['genhyperbolic', (0.5, 1.5, -0.5,)],
+    ['geninvgauss', (2.3, 1.5)],
+    ['genlogistic', (0.41192440799679475,)],
+    ['gennorm', (1.2988442399460265,)],
+    ['halfgennorm', (0.6748054997000371,)],
+    ['genpareto', (0.1,)],   # use case with finite moments
+    ['gibrat', ()],
+    ['gompertz', (0.94743713075105251,)],
+    ['gumbel_l', ()],
+    ['gumbel_r', ()],
+    ['halfcauchy', ()],
+    ['halflogistic', ()],
+    ['halfnorm', ()],
+    ['hypsecant', ()],
+    ['invgamma', (4.0668996136993067,)],
+    ['invgauss', (0.14546264555347513,)],
+    ['invweibull', (10.58,)],
+    ['irwinhall', (10,)],
+    ['jf_skew_t', (8, 4)],
+    ['johnsonsb', (4.3172675099141058, 3.1837781130785063)],
+    ['johnsonsu', (2.554395574161155, 2.2482281679651965)],
+    ['kappa4', (0.0, 0.0)],
+    ['kappa4', (-0.1, 0.1)],
+    ['kappa4', (0.0, 0.1)],
+    ['kappa4', (0.1, 0.0)],
+    ['kappa3', (1.0,)],
+    ['ksone', (1000,)],  # replace 22 by 100 to avoid failing range, ticket 956
+    ['kstwo', (10,)],
+    ['kstwobign', ()],
+    ['landau', ()],
+    ['laplace', ()],
+    ['laplace_asymmetric', (2,)],
+    ['levy', ()],
+    ['levy_l', ()],
+    ['levy_stable', (1.8, -0.5)],
+    ['loggamma', (0.41411931826052117,)],
+    ['logistic', ()],
+    ['loglaplace', (3.2505926592051435,)],
+    ['lognorm', (0.95368226960575331,)],
+    ['loguniform', (0.01, 1.25)],
+    ['lomax', (1.8771398388773268,)],
+    ['maxwell', ()],
+    ['mielke', (10.4, 4.6)],
+    ['moyal', ()],
+    ['nakagami', (4.9673794866666237,)],
+    ['ncf', (27, 27, 0.41578441799226107)],
+    ['nct', (14, 0.24045031331198066)],
+    ['ncx2', (21, 1.0560465975116415)],
+    ['norm', ()],
+    ['norminvgauss', (1.25, 0.5)],
+    ['pareto', (2.621716532144454,)],
+    ['pearson3', (0.1,)],
+    ['pearson3', (-2,)],
+    ['powerlaw', (1.6591133289905851,)],
+    ['powerlaw', (0.6591133289905851,)],
+    ['powerlognorm', (2.1413923530064087, 0.44639540782048337)],
+    ['powernorm', (4.4453652254590779,)],
+    ['rayleigh', ()],
+    ['rdist', (1.6,)],
+    ['recipinvgauss', (0.63004267809369119,)],
+    ['reciprocal', (0.01, 1.25)],
+    ['rel_breitwigner', (36.545206797050334, )],
+    ['rice', (0.7749725210111873,)],
+    ['semicircular', ()],
+    ['skewcauchy', (0.5,)],
+    ['skewnorm', (4.0,)],
+    ['studentized_range', (3.0, 10.0)],
+    ['t', (2.7433514990818093,)],
+    ['trapezoid', (0.2, 0.8)],
+    ['triang', (0.15785029824528218,)],
+    ['truncexpon', (4.6907725456810478,)],
+    ['truncnorm', (-1.0978730080013919, 2.7306754109031979)],
+    ['truncnorm', (0.1, 2.)],
+    ['truncpareto', (1.8, 5.3)],
+    ['truncpareto', (2, 5)],
+    ['truncpareto', (-2, 5)],
+    ['truncweibull_min', (2.5, 0.25, 1.75)],
+    ['tukeylambda', (3.1321477856738267,)],
+    ['uniform', ()],
+    ['vonmises', (3.9939042581071398,)],
+    ['vonmises_line', (3.9939042581071398,)],
+    ['wald', ()],
+    ['weibull_max', (2.8687961709100187,)],
+    ['weibull_min', (1.7866166930421596,)],
+    ['wrapcauchy', (0.031071279018614728,)]
+]
+
+
+# Table of ``[distribution name, tuple of shape parameters]`` pairs.
+# The same name may appear more than once with different shapes.
+# NOTE(review): judging by the name and entries, these are the discrete
+# distributions of scipy.stats -- confirm against the consumers of this table.
+distdiscrete = [
+    ['bernoulli',(0.3,)],
+    ['betabinom', (5, 2.3, 0.63)],
+    ['betanbinom', (5, 9.3, 1)],
+    ['binom', (5, 0.4)],
+    ['boltzmann',(1.4, 19)],
+    ['dlaplace', (0.8,)],  # 0.5
+    ['geom', (0.5,)],
+    ['hypergeom',(30, 12, 6)],
+    ['hypergeom',(21,3,12)],  # numpy.random (3,18,12) numpy ticket:921
+    ['hypergeom',(21,18,11)],  # numpy.random (18,3,11) numpy ticket:921
+    ['nchypergeom_fisher', (140, 80, 60, 0.5)],
+    ['nchypergeom_wallenius', (140, 80, 60, 0.5)],
+    ['logser', (0.6,)],  # re-enabled, numpy ticket:921
+    ['nbinom', (0.4, 0.4)],  # from tickets: 583
+    ['nbinom', (5, 0.5)],
+    ['planck', (0.51,)],   # 4.1
+    ['poisson', (0.6,)],
+    ['poisson_binom', ([0.1, 0.6, 0.7, 0.8],)],
+    ['randint', (7, 31)],
+    ['skellam', (15, 8)],
+    ['zipf', (6.6,)],
+    ['zipfian', (0.75, 15)],
+    ['zipfian', (1.25, 10)],
+    ['yulesimon', (11.0,)],
+    ['nhypergeom', (20, 7, 1)]
+]
+
+
+# Table of ``[distribution name, tuple of shape parameters]`` pairs, in the
+# same layout as `distdiscrete`, but with deliberately invalid shapes.
+invdistdiscrete = [
+    # In each of the following, at least one shape parameter is invalid
+    ['hypergeom', (3, 3, 4)],
+    ['nhypergeom', (5, 2, 8)],
+    ['nchypergeom_fisher', (3, 3, 4, 1)],
+    ['nchypergeom_wallenius', (3, 3, 4, 1)],
+    ['bernoulli', (1.5, )],
+    ['binom', (10, 1.5)],
+    ['betabinom', (10, -0.4, -0.5)],
+    ['betanbinom', (10, -0.4, -0.5)],
+    ['boltzmann', (-1, 4)],
+    ['dlaplace', (-0.5, )],
+    ['geom', (1.5, )],
+    ['logser', (1.5, )],
+    ['nbinom', (10, 1.5)],
+    ['planck', (-0.5, )],
+    ['poisson', (-0.5, )],
+    ['poisson_binom', ([-1, 2, 0.5],)],
+    ['randint', (5, 2)],
+    ['skellam', (-5, -2)],
+    ['zipf', (-2, )],
+    ['yulesimon', (-2, )],
+    ['zipfian', (-0.75, 15)]
+]
+
+
+# Table of ``[distribution name, tuple of shape parameters]`` pairs, in the
+# same layout as `distcont`, but with deliberately invalid shapes.
+# NOTE(review): entries with an empty tuple (e.g. 'anglit', 'norm') have no
+# shape parameters, so the statement below cannot apply to them literally --
+# confirm how consumers of this table treat those entries.
+invdistcont = [
+    # In each of the following, at least one shape parameter is invalid
+    ['alpha', (-1, )],
+    ['anglit', ()],
+    ['arcsine', ()],
+    ['argus', (-1, )],
+    ['beta', (-2, 2)],
+    ['betaprime', (-2, 2)],
+    ['bradford', (-1, )],
+    ['burr', (-1, 1)],
+    ['burr12', (-1, 1)],
+    ['cauchy', ()],
+    ['chi', (-1, )],
+    ['chi2', (-1, )],
+    ['cosine', ()],
+    ['crystalball', (-1, 2)],
+    ['dgamma', (-1, )],
+    ['dpareto_lognorm', (3, -1.2, 1.5, 2)],
+    ['dweibull', (-1, )],
+    ['erlang', (-1, )],
+    ['expon', ()],
+    ['exponnorm', (-1, )],
+    ['exponweib', (1, -1)],
+    ['exponpow', (-1, )],
+    ['f', (10, -10)],
+    ['fatiguelife', (-1, )],
+    ['fisk', (-1, )],
+    ['foldcauchy', (-1, )],
+    ['foldnorm', (-1, )],
+    ['genlogistic', (-1, )],
+    ['gennorm', (-1, )],
+    ['genpareto', (np.inf, )],
+    ['genexpon', (1, 2, -3)],
+    ['genextreme', (np.inf, )],
+    ['genhyperbolic', (0.5, -0.5, -1.5,)],
+    ['gausshyper', (1, 2, 3, -4)],
+    ['gamma', (-1, )],
+    ['gengamma', (-1, 0)],
+    ['genhalflogistic', (-1, )],
+    ['geninvgauss', (1, 0)],
+    ['gibrat', ()],
+    ['gompertz', (-1, )],
+    ['gumbel_r', ()],
+    ['gumbel_l', ()],
+    ['halfcauchy', ()],
+    ['halflogistic', ()],
+    ['halfnorm', ()],
+    ['halfgennorm', (-1, )],
+    ['hypsecant', ()],
+    ['invgamma', (-1, )],
+    ['invgauss', (-1, )],
+    ['invweibull', (-1, )],
+    ['irwinhall', (-1,)],
+    ['irwinhall', (0,)],
+    ['irwinhall', (2.5,)],
+    ['jf_skew_t', (-1, 0)],
+    ['johnsonsb', (1, -2)],
+    ['johnsonsu', (1, -2)],
+    ['kappa4', (np.nan, 0)],
+    ['kappa3', (-1, )],
+    ['ksone', (-1, )],
+    ['kstwo', (-1, )],
+    ['kstwobign', ()],
+    ['landau', ()],
+    ['laplace', ()],
+    ['laplace_asymmetric', (-1, )],
+    ['levy', ()],
+    ['levy_l', ()],
+    ['levy_stable', (-1, 1)],
+    ['logistic', ()],
+    ['loggamma', (-1, )],
+    ['loglaplace', (-1, )],
+    ['lognorm', (-1, )],
+    ['loguniform', (10, 5)],
+    ['lomax', (-1, )],
+    ['maxwell', ()],
+    ['mielke', (1, -2)],
+    ['moyal', ()],
+    ['nakagami', (-1, )],
+    ['ncx2', (-1, 2)],
+    ['ncf', (10, 20, -1)],
+    ['nct', (-1, 2)],
+    ['norm', ()],
+    ['norminvgauss', (5, -10)],
+    ['pareto', (-1, )],
+    ['pearson3', (np.nan, )],
+    ['powerlaw', (-1, )],
+    ['powerlognorm', (1, -2)],
+    ['powernorm', (-1, )],
+    ['rdist', (-1, )],
+    ['rayleigh', ()],
+    ['rice', (-1, )],
+    ['recipinvgauss', (-1, )],
+    ['semicircular', ()],
+    ['skewnorm', (np.inf, )],
+    ['studentized_range', (-1, 1)],
+    ['rel_breitwigner', (-2, )],
+    ['t', (-1, )],
+    ['trapezoid', (0, 2)],
+    ['triang', (2, )],
+    ['truncexpon', (-1, )],
+    ['truncnorm', (10, 5)],
+    ['truncpareto', (-1, -5)],
+    ['truncpareto', (1.8, .5)],
+    ['truncweibull_min', (-2.5, 0.25, 1.75)],
+    ['tukeylambda', (np.nan, )],
+    ['uniform', ()],
+    ['vonmises', (-1, )],
+    ['vonmises_line', (-1, )],
+    ['wald', ()],
+    ['weibull_min', (-1, )],
+    ['weibull_max', (-1, )],
+    ['wrapcauchy', (2, )],
+    ['reciprocal', (15, 10)],
+    ['skewcauchy', (2, )]
+]
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_distribution_infrastructure.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_distribution_infrastructure.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ca262116a0d8842579881c75f9ea65264f73941
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_distribution_infrastructure.py
@@ -0,0 +1,5818 @@
+import functools
+from abc import ABC, abstractmethod
+from functools import cached_property
+from types import GenericAlias
+import inspect
+import math
+
+import numpy as np
+from numpy import inf
+
+from scipy._lib._array_api import xp_capabilities, xp_promote
+from scipy._lib._util import _rng_spawn, _RichResult
+from scipy._lib._docscrape import ClassDoc, NumpyDocString
+import scipy._lib.array_api_extra as xpx
+from scipy import special, stats
+from scipy.special._ufuncs import _log1mexp
+from scipy.integrate import tanhsinh as _tanhsinh, nsum
+from scipy.optimize._bracket import _bracket_root, _bracket_minimum
+from scipy.optimize._chandrupatla import _chandrupatla, _chandrupatla_minimize
+from scipy.stats._probability_distribution import _ProbabilityDistribution
+from scipy.stats import qmc
+
+# in case we need to distinguish between None and not specified
+# Typically this is used to determine whether the tolerance has been set by the
+# user and make a decision about which method to use to evaluate a distribution
+# function. Sometimes, the logic does not consider the value of the tolerance,
+# only whether this has been defined or not. This is not intended to be the
+# best possible logic; the intent is to establish the structure, which can
+# be refined in follow-up work.
+# See https://github.com/scipy/scipy/pull/21050#discussion_r1714195433.
+_null = object()
+def _isnull(x):  # True for None or any bare `object()` instance, such as `_null`
+    return type(x) is object or x is None
+
+__all__ = ['make_distribution', 'Mixture', 'order_statistic',
+           'truncate', 'abs', 'exp', 'log']
+
+# Could add other policies for broadcasting and edge/out-of-bounds case handling
+# For instance, when edge case handling is known not to be needed, it's much
+# faster to turn it off, but it might still be nice to have array conversion
+# and shaping done so the user doesn't need to be so careful.
+_SKIP_ALL = "skip_all"
+# Other cache policies would be useful, too.
+_NO_CACHE = "no_cache"
+
+# TODO:
+#  Test sample dtypes
+#  Add dtype kwarg (especially for distributions with no parameters)
+#  When drawing endpoint/out-of-bounds values of a parameter, draw them from
+#   the endpoints/out-of-bounds region of the full `domain`, not `typical`.
+#  Distributions without shape parameters probably need to accept a `dtype` parameter;
+#    right now they default to float64. If we have them default to float16, they will
+#    need to determine result_type when input is not float16 (overhead).
+#  Test _solve_bounded bracket logic, and decide what to do about warnings
+#  Get test coverage to 100%
+#  Raise when distribution method returns wrong shape/dtype?
+#  Consider ensuring everything is at least 1D for calculations? Would avoid needing
+#    to sprinkle `np.asarray` throughout due to indiscriminate conversion of 0D arrays
+#    to scalars
+#  Break up `test_basic`: test each method separately
+#  Fix `sample` for QMCEngine (implementation does not match documentation)
+#  When a parameter is invalid, set only the offending parameter to NaN (if possible)?
+#  `_tanhsinh` special case when there are no abscissae between the limits
+#    example: cdf of uniform between 1.0 and np.nextafter(1.0, np.inf)
+#  check behavior of moment methods when moments are undefined/infinite -
+#    basically OK but needs tests
+#  investigate use of median
+#  implement symmetric distribution
+#  implement composite distribution
+#  implement wrapped distribution
+#  profile/optimize
+#  general cleanup (choose keyword-only parameters)
+#  compare old/new distribution timing
+#  make video
+#  add array API support
+#  why does dist.ilogcdf(-100) not converge to bound? Check solver response to inf
+#  _chandrupatla_minimize should not report xm = fm = NaN when it fails
+#  integrate `logmoment` into `moment`? (Not hard, but enough time and code
+#   complexity to wait for reviewer feedback before adding.)
+#  Eliminate bracket_root error "`min <= a < b <= max` must be True"
+#  Test repr?
+#  use `median` information to improve integration? In some cases this will
+#   speed things up. If it's not needed, it may be about twice as slow. I think
+#   it should depend on the accuracy setting.
+#  in tests, check reference value against that produced using np.vectorize?
+#  add `axis` to `ks_1samp`
+#  User tips for faster execution:
+#  - pass NumPy arrays
+#  - pass inputs of floating point type (not integers)
+#  - prefer NumPy scalars or 0d arrays over other size 1 arrays
+#  - pass no invalid parameters and disable invalid parameter checks with iv_profile
+#  - provide a Generator if you're going to do sampling
+#  add options for drawing parameters: log-spacing
+#  accuracy benchmark suite
+#  Should caches be attributes so we can more easily ensure that they are not
+#   modified when caching is turned off?
+#  Make ShiftedScaledDistribution more efficient - only process underlying
+#   distribution parameters as necessary.
+#  Reconsider `all_inclusive`
+#  Should process_parameters update kwargs rather than returning? Should we
+#   update parameters rather than setting to what process_parameters returns?
+
+# Questions:
+# 1.  I override `__getattr__` so that distribution parameters can be read as
+#     attributes. We don't want users to try to change them.
+#     - To prevent replacements (dist.a = b), I could override `__setattr__`.
+#     - To prevent in-place modifications, `__getattr__` could return a copy,
+#       or it could set the WRITEABLE flag of the array to false.
+#     Which should I do?
+# 2.  `cache_policy` is supported in several methods where I imagine it being
+#     useful, but it needs to be tested. Before doing that:
+#     - What should the default value be?
+#     - What should the other values be?
+#     Or should we just eliminate this policy?
+# 3.  `validation_policy` is supported in a few places, but it should be checked for
+#     consistency. I have the same questions as for `cache_policy`.
+# 4.  `tol` is currently notional. I think there needs to be a way to set
+#     separate `atol` and `rtol`. Some ways I imagine it being used:
+#     - Values can be passed to iterative functions (quadrature, root-finder).
+#     - To control which "method" of a distribution function is used. For
+#       example, if `atol` is set to `1e-12`, it may be acceptable to compute
+#       the complementary CDF as 1 - CDF even when CDF is nearly 1; otherwise,
+#       a (potentially more time-consuming) method would need to be used.
+#     I'm looking for unified suggestions for the interface, not ad hoc ideas
+#     for using tolerances. Suppose the user wants to have more control over
+#     the tolerances used for each method - how do they specify it? It would
+#     probably be easiest for the user if they could pass tolerances into each
+#     method, but it's easiest for us if they can only set it as a property of
+#     the class. Perhaps a dictionary of tolerance settings?
+# 5.  I also envision that accuracy estimates should be reported to the user
+#     somehow. I think my preference would be to return a subclass of an array
+#     with an `error` attribute - yes, really. But this is unlikely to be
+#     popular, so what are other ideas? Again, we need a unified vision here,
+#     not just pointing out difficulties (not all errors are known or easy
+#     to estimate, what to do when errors could compound, etc.).
+# 6.  The term "method" is used to refer to public instance functions,
+#     private instance functions, the "method" string argument, and the means
+#     of calculating the desired quantity (represented by the string argument).
+#     For the sake of disambiguation, shall I rename the "method" string to
+#     "strategy" and refer to the means of calculating the quantity as the
+#     "strategy"?
+
+# Originally, I planned to filter out invalid distribution parameters;
+# distribution implementation functions would always work with "compressed",
+# 1D arrays containing only valid distribution parameters. There are two
+# problems with this:
+# - This essentially requires copying all arrays, even if there is only a
+#   single invalid parameter combination. This is expensive. Then, to output
+#   the original size data to the user, we need to "decompress" the arrays
+#   and fill in the NaNs, so more copying. Unless we branch the code when
+#   there are no invalid data, these copies happen even in the normal case,
+#   where there are no invalid parameter combinations. We should not incur
+#   all this overhead in the normal case.
+# - For methods that accept arguments other than distribution parameters, the
+#   user will pass in arrays that are broadcastable with the original arrays,
+#   not the compressed arrays. This means that this same sort of invalid
+#   value detection needs to be repeated every time one of these methods is
+#   called.
+# The much simpler solution is to keep the data uncompressed but to replace
+# the invalid parameters and arguments with NaNs (and only if some are
+# invalid). With this approach, the copying happens only if/when it is
+# needed. Most functions involved in stats distribution calculations don't
+# mind NaNs; they just return NaN. The behavior "If x_i is NaN, the result
+# is NaN" is explicit in the array API. So this should be fine.
+#
+# Currently, I am still leaving the parameters and function arguments
+# in their broadcasted shapes rather than, say, raveling. The intent
+# is to avoid back and forth reshaping. If authors of distributions have
+# trouble dealing with N-D arrays, we can reconsider this.
+#
+# Another important decision is that the *private* methods must accept
+# the distribution parameters as inputs rather than relying on these
+# cached properties directly (although the public methods typically pass
+# the cached values to the private methods). This is because the elementwise
+# algorithms for quadrature, differentiation, root-finding, and minimization
+# prefer that the input functions are strictly elementwise in the sense
+# that the value output for a given input element does not depend on the
+# shape of the input or that element's location within the input array.
+# When the computation has converged for an element, it is removed from
+# the computation entirely. As a result, the shape of the arrays passed to
+# the function will almost never be broadcastable with the shape of the
+# cached parameter arrays.
+#
+# I've sprinkled in some optimizations for scalars and same-shape/type arrays
+# throughout. The biggest time sinks before were:
+# - broadcast_arrays
+# - result_type
+# - issubdtype
+# It is much faster to check whether these are necessary than to do them.
+
+
+class _Domain(ABC):
+    r""" Representation of the applicable domain of a parameter or variable.
+
+    A `_Domain` object is responsible for storing information about the
+    domain of a parameter or variable, determining whether a value is within
+    the domain (`contains`), and providing a text/mathematical representation
+    of itself (`__str__`). Because the domain of a parameter/variable can have
+    a complicated relationship with other parameters and variables of a
+    distribution, `_Domain` itself does not try to represent all possibilities;
+    in fact, it has no implementation and is meant for subclassing.
+
+    Attributes
+    ----------
+    symbols : dict
+        A map from special numerical values to symbols for use in `__str__`
+
+    Methods
+    -------
+    contains(x)
+        Determine whether the argument is contained within the domain (True)
+        or not (False). Used for input validation.
+    get_numerical_endpoints(x)
+        Gets the numerical values of the domain endpoints, which may have been
+        defined symbolically or through a callable.
+    __str__()
+        Returns a text representation of the domain (e.g. ``[0, b)``).
+        Used for generating documentation.
+
+    """
+    symbols = {np.inf: r"\infty", -np.inf: r"-\infty", np.pi: r"\pi", -np.pi: r"-\pi"}
+
+    # generic type compatibility with scipy-stubs
+    __class_getitem__ = classmethod(GenericAlias)
+
+    @abstractmethod
+    def contains(self, x):
+        raise NotImplementedError()
+
+    @abstractmethod
+    def draw(self, n):  # overridden in `_Interval` with extra arguments
+        raise NotImplementedError()
+
+    @abstractmethod
+    def get_numerical_endpoints(self, x):  # `x` is `parameter_values` in `_Interval`
+        raise NotImplementedError()
+
+    @abstractmethod
+    def __str__(self):
+        raise NotImplementedError()
+
+
+class _Interval(_Domain):
+    r""" Representation of an interval defined by two endpoints.
+
+    Each endpoint may be a finite scalar, positive or negative infinity, or
+    be given by a single parameter. The domain may include the endpoints or
+    not.
+
+    This class still does not provide an implementation of the __str__ method,
+    so it is meant for subclassing (e.g. a subclass for domains on the real
+    line).
+
+    Attributes
+    ----------
+    symbols : dict
+        Inherited. A map from special values to symbols for use in `__str__`.
+    endpoints : 2-tuple of float(s) and/or str(s) and/or callable(s).
+        A tuple with two values. Each may be either a float (the numerical
+        value of the endpoints of the domain), a string (the name of the
+        parameters that will define the endpoint), or a callable taking the
+        parameters used to define the endpoints of the domain as keyword only
+        arguments and returning a numerical value for the endpoint.
+    inclusive : 2-tuple of bools
+        A tuple with two boolean values; each indicates whether the
+        corresponding endpoint is included within the domain or not.
+
+    Methods
+    -------
+    define_parameters(*parameters)
+        Records any parameters used to define the endpoints of the domain
+    get_numerical_endpoints(parameter_values)
+        Gets the numerical values of the domain endpoints, which may have been
+        defined symbolically or through a callable.
+    contains(item, parameter_values)
+        Determines whether the argument is contained within the domain
+    draw(n, type_, min, max, squeezed_base_shape, rng=None)
+        Draws random values based on the domain.
+
+    """
+    def __init__(self, endpoints=(-inf, inf), inclusive=(False, False)):
+        self.symbols = super().symbols.copy()
+        a, b = endpoints
+        self.endpoints = np.asarray(a)[()], np.asarray(b)[()]
+        self.inclusive = inclusive
+
+    def define_parameters(self, *parameters):
+        r""" Records any parameters used to define the endpoints of the domain.
+
+        Adds the keyword name of each parameter and its text representation
+        to the `symbols` attribute as key:value pairs.
+        For instance, a parameter may be passed into a distribution's
+        initializer using the keyword `log_a`, and the corresponding
+        string representation may be '\log(a)'. To form the text
+        representation of the domain for use in documentation, the
+        _Domain object needs to map from the keyword name used in the code
+        to the string representation.
+
+        Returns None, but updates the `symbols` attribute.
+
+        Parameters
+        ----------
+        *parameters : _Parameter objects
+            Parameters that may define the endpoints of the domain.
+
+        """
+        new_symbols = {param.name: param.symbol for param in parameters}
+        self.symbols.update(new_symbols)
+
+    def get_numerical_endpoints(self, parameter_values):
+        r""" Get the numerical values of the domain endpoints.
+
+        Domain endpoints may be defined symbolically or through a callable.
+        This returns numerical values of the endpoints given numerical values for
+        any variables.
+
+        Parameters
+        ----------
+        parameter_values : dict
+            A dictionary that maps between string variable names and numerical
+            values of parameters, which may define the endpoints.
+
+        Returns
+        -------
+        a, b : ndarray
+            Numerical values of the endpoints
+
+        """
+        a, b = self.endpoints
+        # If `a` (`b`) is a string - the name of the parameter that defines
+        # the endpoint of the domain - then corresponding numerical values
+        # will be found in the `parameter_values` dictionary.
+        # If a callable, it will be executed with `parameter_values` passed as
+        # keyword arguments, and it will return the numerical values.
+        # Otherwise, it is itself the array of numerical values of the endpoint.
+        try:
+            if callable(a):
+                a = a(**parameter_values)
+            else:
+                a = np.asarray(parameter_values.get(a, a))
+            if callable(b):
+                b = b(**parameter_values)
+            else:
+                b = np.asarray(parameter_values.get(b, b))
+        except TypeError as e:
+            message = ("The endpoints of the distribution are defined by "
+                       "parameters, but their values were not provided. When "
+                       f"using a private method of {self.__class__}, pass "
+                       "all required distribution parameters as keyword "
+                       "arguments.")
+            raise TypeError(message) from e
+        # Floating point types are used for even integer parameters.
+        # Convert to float here to ensure consistency throughout framework.
+        a, b = xp_promote(a, b, force_floating=True, xp=np)
+        return a, b
+
+    def contains(self, item, parameter_values=None):
+        r"""Determine whether the argument is contained within the domain.
+
+        Parameters
+        ----------
+        item : ndarray
+            The argument
+        parameter_values : dict
+            A dictionary that maps between string variable names and numerical
+            values of parameters, which may define the endpoints.
+
+        Returns
+        -------
+        out : bool
+            True if `item` is within the domain; False otherwise.
+
+        """
+        parameter_values = parameter_values or {}
+        # if self.all_inclusive:
+        #     # Returning a 0d value here makes things much faster.
+        #     # I'm not sure if it's safe, though. If it causes a bug someday,
+        #     # I guess it wasn't.
+        #     # Even if there is no bug because of the shape, it is incorrect for
+        #     # `contains` to return True when there are invalid (e.g. NaN)
+        #     # parameters.
+        #     return np.asarray(True)
+
+        a, b = self.get_numerical_endpoints(parameter_values)
+        left_inclusive, right_inclusive = self.inclusive
+
+        in_left = item >= a if left_inclusive else item > a
+        in_right = item <= b if right_inclusive else item < b
+        return in_left & in_right
+
+    def draw(self, n, type_, min, max, squeezed_base_shape, rng=None):
+        r""" Draw random values from the domain.
+
+        Parameters
+        ----------
+        n : int
+            The number of values to be drawn from the domain.
+        type_ : str
+            A string indicating whether the values are
+
+            - strictly within the domain ('in'),
+            - at one of the two endpoints ('on'),
+            - strictly outside the domain ('out'), or
+            - NaN ('nan').
+        min, max : ndarray
+            The endpoints of the domain.
+        squeezed_base_shape : tuple of ints
+            See _RealParameter.draw.
+        rng : np.Generator
+            The Generator used for drawing random values.
+
+        """
+        rng = np.random.default_rng(rng)
+
+        def ints(*args, **kwargs): return rng.integers(*args, **kwargs, endpoint=True)
+        uniform = rng.uniform if isinstance(self, _RealInterval) else ints
+
+        # get copies of min and max with no nans so that uniform doesn't fail
+        min_nn, max_nn = min.copy(), max.copy()
+        i = np.isnan(min_nn) | np.isnan(max_nn)
+        min_nn[i] = 0
+        max_nn[i] = 1
+
+        shape = (n,) + squeezed_base_shape
+
+        if type_ == 'in':
+            z = uniform(min_nn, max_nn, size=shape)
+
+        elif type_ == 'on':
+            z_on_shape = shape
+            z = np.ones(z_on_shape)
+            i = rng.random(size=n) < 0.5
+            z[i] = min
+            z[~i] = max
+
+        elif type_ == 'out':
+            z = min_nn - uniform(1, 5, size=shape)   # 1, 5 is arbitrary; we just want
+            zr = max_nn + uniform(1, 5, size=shape)  # some numbers outside domain
+            i = rng.random(size=n) < 0.5
+            z[i] = zr[i]
+
+        elif type_ == 'nan':
+            z = np.full(shape, np.nan)
+
+        return z
+
+
+class _RealInterval(_Interval):
+    r""" Represents a simply-connected subset of the real line; i.e., an interval
+
+    Completes the implementation of the `_Interval` class for intervals
+    on the real line.
+
+    Methods
+    -------
+    define_parameters(*parameters)
+        (Inherited) Records any parameters used to define the endpoints of the
+        domain.
+    get_numerical_endpoints(parameter_values)
+        (Inherited) Gets the numerical values of the domain endpoints, which
+        may have been defined symbolically.
+    contains(item, parameter_values)
+        (Inherited) Determines whether the argument is contained within the
+        domain
+    __str__()
+        Returns a string representation of the domain, e.g. "[a, b)".
+    """
+
+    def __str__(self):
+        a, b = self.endpoints
+        a, b = self._get_endpoint_str(a, "f1"), self._get_endpoint_str(b, "f2")
+        left_inclusive, right_inclusive = self.inclusive
+        left = "[" if left_inclusive else "("
+        right = "]" if right_inclusive else ")"
+
+        return f"{left}{a}, {b}{right}"
+
+    def _get_endpoint_str(self, endpoint, funcname):
+        if callable(endpoint):
+            if endpoint.__doc__ is not None:  # prefer the callable's docstring
+                return endpoint.__doc__
+            params = inspect.signature(endpoint).parameters.values()
+            params = [
+                p.name for p in params if p.kind == inspect.Parameter.KEYWORD_ONLY
+            ]
+            return f"{funcname}({','.join(params)})"  # e.g. "f1(a,b)"
+        return self.symbols.get(endpoint, f"{endpoint}")  # symbol if registered
+
+
+class _IntegerInterval(_Interval):
+    r""" Represents an interval of integers
+
+    Completes the implementation of the `_Interval` class for simple
+    domains on the integers.
+
+    Methods
+    -------
+    define_parameters(*parameters)
+        (Inherited) Records any parameters used to define the endpoints of the
+        domain.
+    get_numerical_endpoints(parameter_values)
+        (Inherited) Gets the numerical values of the domain endpoints, which
+        may have been defined symbolically.
+    contains(item, parameter_values)
+        (Overridden) Determines whether the argument is contained within the
+        domain
+    draw(n, type_, min, max, squeezed_base_shape, rng=None)
+        (Inherited) Draws random values based on the domain.
+    __str__()
+        Returns a string representation of the domain, e.g. "{a, a+1, ..., b-1, b}".
+
+    """
+    def contains(self, item, parameter_values=None):
+        super_contains = super().contains(item, parameter_values)
+        integral = (item == np.round(item))  # True where `item` is integer-valued
+        return super_contains & integral
+
+    def __str__(self):
+        a, b = self.endpoints
+        a = self.symbols.get(a, a)
+        b = self.symbols.get(b, b)
+
+        a_str, b_str = isinstance(a, str), isinstance(b, str)
+        a_inf = a == r"-\infty" if a_str else np.isinf(a)
+        b_inf = b == r"\infty" if b_str else np.isinf(b)
+
+        # This doesn't work well for cases where ``a`` is a floating point
+        # number large enough that ``nextafter(a, inf) > a + 1``, and
+        # similarly for ``b`` and nextafter(b, -inf). There may not be any
+        # distributions fit for SciPy where we would actually need to handle these
+        # cases though.
+        ap1 = f"{a} + 1" if a_str else f"{a + 1}"
+        bm1 = f"{b} - 1" if b_str else f"{b - 1}"
+
+        if not a_str and not b_str:  # numeric endpoints: list small sets in full
+            gap = b - a
+            if gap == 3:
+                return f"\\{{{a}, {ap1}, {bm1}, {b}\\}}"
+            if gap == 2:
+                return f"\\{{{a}, {ap1}, {b}\\}}"
+            if gap == 1:
+                return f"\\{{{a}, {b}\\}}"
+            if gap == 0:
+                return f"\\{{{a}\\}}"
+
+        if not a_inf and b_inf:
+            ap2 = f"{a} + 2" if a_str else f"{a + 2}"
+            return f"\\{{{a}, {ap1}, {ap2}, ...\\}}"
+        if a_inf and not b_inf:
+            bm2 = f"{b} - 2" if b_str else f"{b - 2}"
+            return f"\\{{{b}, {bm1}, {bm2}, ...\\}}"
+        if a_inf and b_inf:
+            return "\\{..., -2, -1, 0, 1, 2, ...\\}"
+
+        return f"\\{{{a}, {ap1}, ..., {bm1}, {b}\\}}"
+
+
+class _Parameter(ABC):
+    r""" Representation of a distribution parameter or variable.
+
+    A `_Parameter` object is responsible for storing information about a
+    parameter or variable, providing input validation/standardization of
+    values passed for that parameter, providing a text/mathematical
+    representation of the parameter for the documentation (`__str__`), and
+    drawing random values of itself for testing and benchmarking. It does
+    not provide a complete implementation of this functionality and is meant
+    for subclassing.
+
+    Attributes
+    ----------
+    name : str
+        The keyword used to pass numerical values of the parameter into the
+        initializer of the distribution
+    symbol : str
+        The text representation of the variable in the documentation. May
+        include LaTeX.
+    domain : _Domain
+        The domain of the parameter for which the distribution is valid.
+    typical : 2-tuple of floats or strings (consider making a _Domain)
+        Defines the endpoints of a typical range of values of the parameter.
+        Used for sampling.
+
+    Methods
+    -------
+    __str__():
+        Returns a string description of the variable for use in documentation,
+        including the keyword used to represent it in code, the symbol used to
+        represent it mathemtatically, and a description of the valid domain.
+    draw(size, *, rng, domain, proportions)
+        Draws random values of the parameter. Proportions of values within
+        the valid domain, on the endpoints of the domain, outside the domain,
+        and having value NaN are specified by `proportions`.
+    validate(x):
+        Validates and standardizes the argument for use as numerical values
+        of the parameter.
+
+   """
+
+    # generic type compatibility with scipy-stubs
+    __class_getitem__ = classmethod(GenericAlias)
+
+    def __init__(self, name, *, domain, symbol=None, typical=None):
+        self.name = name
+        self.symbol = symbol or name
+        self.domain = domain
+        if typical is not None and not isinstance(typical, _Domain):
+            typical = domain.__class__(typical)
+        self.typical = typical or domain
+
+    def __str__(self):
+        r""" String representation of the parameter for use in documentation."""
+        return f"`{self.name}` for :math:`{self.symbol} \\in {str(self.domain)}`"
+
+    def draw(self, size=None, *, rng=None, region='domain', proportions=None,
+             parameter_values=None):
+        r""" Draw random values of the parameter for use in testing.
+
+        Parameters
+        ----------
+        size : tuple of ints
+            The shape of the array of valid values to be drawn.
+        rng : np.Generator
+            The Generator used for drawing random values.
+        region : str
+            The region of the `_Parameter` from which to draw. Default is
+            "domain" (the *full* domain); alternative is "typical". An
+            enhancement would give a way to interpolate between the two.
+        proportions : tuple of numbers
+            A tuple of four non-negative numbers that indicate the expected
+            relative proportion of elements that:
+
+            - are strictly within the domain,
+            - are at one of the two endpoints,
+            - are strictly outside the domain, and
+            - are NaN,
+
+            respectively. Default is (1, 0, 0, 0). The number of elements in
+            each category is drawn from the multinomial distribution with
+            `np.prod(size)` as the number of trials and `proportions` as the
+            event probabilities. The values in `proportions` are automatically
+            normalized to sum to 1.
+        parameter_values : dict
+            Map between the names of parameters (that define the endpoints of
+            `typical`) and numerical values (arrays).
+
+        """
+        parameter_values = parameter_values or {}
+        domain = self.domain
+        proportions = (1, 0, 0, 0) if proportions is None else proportions
+
+        pvals = proportions / np.sum(proportions)
+
+        a, b = domain.get_numerical_endpoints(parameter_values)
+        a, b = np.broadcast_arrays(a, b)
+
+        base_shape = a.shape
+        extended_shape = np.broadcast_shapes(size, base_shape)
+        n_extended = np.prod(extended_shape)
+        n_base = np.prod(base_shape)
+        n = int(n_extended / n_base) if n_extended else 0
+
+        rng = np.random.default_rng(rng)
+        n_in, n_on, n_out, n_nan = rng.multinomial(n, pvals)
+
+        # `min` and `max` can have singleton dimensions that correspond with
+        # non-singleton dimensions in `size`. We need to be careful to avoid
+        # shuffling results (e.g. a value that was generated for the domain
+        # [min[i], max[i]] ends up at index j). To avoid this:
+        # - Squeeze the singleton dimensions out of `min`/`max`. Squeezing is
+        #   often not the right thing to do, but here is equivalent to moving
+        #   all the dimensions that are singleton in `min`/`max` (which may be
+        #   non-singleton in the result) to the left. This is what we want.
+        # - Now all the non-singleton dimensions of the result are on the left.
+        #   Ravel them to a single dimension of length `n`, which is now along
+        #   the 0th axis.
+        # - Reshape the 0th axis back to the required dimensions, and move
+        #   these axes back to their original places.
+        base_shape_padded = ((1,)*(len(extended_shape) - len(base_shape))
+                             + base_shape)
+        base_singletons = np.where(np.asarray(base_shape_padded)==1)[0]
+        new_base_singletons = tuple(range(len(base_singletons)))
+        # Base singleton dimensions are going to get expanded to these lengths
+        shape_expansion = np.asarray(extended_shape)[base_singletons]
+
+        # assert(np.prod(shape_expansion) == n)  # check understanding
+        # min = np.reshape(min, base_shape_padded)
+        # max = np.reshape(max, base_shape_padded)
+        # min = np.moveaxis(min, base_singletons, new_base_singletons)
+        # max = np.moveaxis(max, base_singletons, new_base_singletons)
+        # squeezed_base_shape = max.shape[len(base_singletons):]
+        # assert np.all(min.reshape(squeezed_base_shape) == min.squeeze())
+        # assert np.all(max.reshape(squeezed_base_shape) == max.squeeze())
+
+        # min = np.maximum(a, _fiinfo(a).min/10) if np.any(np.isinf(a)) else a
+        # max = np.minimum(b, _fiinfo(b).max/10) if np.any(np.isinf(b)) else b
+        min = np.asarray(a.squeeze())
+        max = np.asarray(b.squeeze())
+        squeezed_base_shape = max.shape
+
+        if region == 'typical':
+            typical = self.typical
+            a, b = typical.get_numerical_endpoints(parameter_values)
+            a, b = np.broadcast_arrays(a, b)
+            min_here = np.asarray(a.squeeze())
+            max_here = np.asarray(b.squeeze())
+            z_in = typical.draw(n_in, 'in', min_here, max_here, squeezed_base_shape,
+                                rng=rng)
+        else:
+            z_in = domain.draw(n_in, 'in', min, max, squeezed_base_shape, rng=rng)
+        z_on = domain.draw(n_on, 'on', min, max, squeezed_base_shape, rng=rng)
+        z_out = domain.draw(n_out, 'out', min, max, squeezed_base_shape, rng=rng)
+        z_nan= domain.draw(n_nan, 'nan', min, max, squeezed_base_shape, rng=rng)
+
+        z = np.concatenate((z_in, z_on, z_out, z_nan), axis=0)
+        z = rng.permuted(z, axis=0)
+
+        z = np.reshape(z, tuple(shape_expansion) + squeezed_base_shape)
+        z = np.moveaxis(z, new_base_singletons, base_singletons)
+        return z
+
+    @abstractmethod
+    def validate(self, arr):
+        r""" Input validation/standardization of numerical values of a parameter.
+
+        Abstract hook: this base implementation always raises
+        `NotImplementedError`; concrete subclasses provide the behavior.
+
+        NOTE(review): `_RealParameter.validate` accepts an additional
+        `parameter_values` argument that this signature does not declare --
+        confirm which signature callers are expected to use.
+
+        """
+        raise NotImplementedError()
+
+
+class _RealParameter(_Parameter):
+    r""" Represents a real-valued parameter.
+
+    Implements the remaining methods of _Parameter for real parameters.
+    All attributes are inherited.
+
+    """
+    def validate(self, arr, parameter_values):
+        r""" Input validation/standardization of numerical values of a parameter.
+
+        Checks whether elements of the argument `arr` are reals, ensuring that
+        the dtype reflects this. Also produces a logical array that indicates
+        which elements meet the requirements.
+
+        Parameters
+        ----------
+        arr : ndarray
+            The argument array to be validated and standardized.
+        parameter_values : dict
+            Map of parameter names to parameter value arrays.
+
+        Returns
+        -------
+        arr : ndarray
+            The argument array that has been validated and standardized
+            (converted to an appropriate dtype, if necessary).
+        dtype : NumPy dtype
+            The appropriate floating point dtype of the parameter.
+        valid : boolean ndarray
+            Logical array indicating which elements are valid (True) and
+            which are not (False). The arrays of all distribution parameters
+            will be broadcasted, and elements for which any parameter value
+            does not meet the requirements will be replaced with NaN.
+
+        """
+        arr = np.asarray(arr)
+
+        valid_dtype = None
+        # minor optimization - fast track the most common types to avoid
+        # overhead of np.issubdtype. Checking for `in {...}` doesn't work : /
+        if arr.dtype == np.float64 or arr.dtype == np.float32:
+            pass
+        elif arr.dtype == np.int32 or arr.dtype == np.int64:
+            arr = np.asarray(arr, dtype=np.float64)
+        elif np.issubdtype(arr.dtype, np.floating):
+            pass
+        elif np.issubdtype(arr.dtype, np.integer):
+            arr = np.asarray(arr, dtype=np.float64)
+        else:
+            message = f"Parameter `{self.name}` must be of real dtype."
+            raise TypeError(message)
+
+        valid = self.domain.contains(arr, parameter_values)
+        valid = valid & valid_dtype if valid_dtype is not None else valid
+
+        return arr[()], arr.dtype, valid
+
+
+class _Parameterization:
+    r""" Represents a parameterization of a distribution.
+
+    Distributions can have multiple parameterizations. A `_Parameterization`
+    object is responsible for recording the parameters used by the
+    parameterization, checking whether keyword arguments passed to the
+    distribution match the parameterization, and performing input validation
+    of the numerical values of these parameters.
+
+    Attributes
+    ----------
+    parameters : dict
+        String names (of keyword arguments) and the corresponding _Parameters.
+
+    Methods
+    -------
+    __len__()
+        Returns the number of parameters in the parameterization.
+    __str__()
+        Returns a string representation of the parameterization.
+    copy
+        Returns a copy of the parameterization. This is needed for transformed
+        distributions that add parameters to the parameterization.
+    matches(parameters)
+        Checks whether the keyword arguments match the parameterization.
+    validation(parameter_values)
+        Input validation / standardization of parameterization. Validates the
+        numerical values of all parameters.
+    draw(sizes, rng, proportions, region)
+        Draw random values of all parameters of the parameterization for use
+        in testing.
+    """
+    def __init__(self, *parameters):
+        # Keyed by parameter name; insertion order follows the argument order.
+        self.parameters = {param.name: param for param in parameters}
+
+    def __len__(self):
+        return len(self.parameters)
+
+    def copy(self):
+        # New container holding the same parameter objects (not deep-copied).
+        return _Parameterization(*self.parameters.values())
+
+    def matches(self, parameters):
+        r""" Checks whether the keyword arguments match the parameterization.
+
+        Parameters
+        ----------
+        parameters : set
+            Set of names of parameters passed into the distribution as keyword
+            arguments.
+
+        Returns
+        -------
+        out : bool
+            True if the keyword arguments names match the names of the
+            parameters of this parameterization.
+        """
+        return parameters == set(self.parameters.keys())
+
+    def validation(self, parameter_values):
+        r""" Input validation / standardization of parameterization.
+
+        Each entry of `parameter_values` is replaced in place by its
+        validated/standardized array.
+
+        Parameters
+        ----------
+        parameter_values : dict
+            The keyword arguments passed as parameter values to the
+            distribution.
+
+        Returns
+        -------
+        all_valid : ndarray
+            Logical array indicating the elements of the broadcasted arrays
+            for which all parameter values are valid.
+        dtype : dtype
+            The common dtype of the parameter arrays. This will determine
+            the dtype of the output of distribution methods.
+        """
+        all_valid = True
+        dtypes = set()  # avoid np.result_type if there's only one type
+        for name, arr in parameter_values.items():
+            parameter = self.parameters[name]
+            arr, dtype, valid = parameter.validate(arr, parameter_values)
+            dtypes.add(dtype)
+            all_valid = all_valid & valid
+            # Only existing keys are reassigned, so the dict does not change
+            # size while it is being iterated.
+            parameter_values[name] = arr
+        # NOTE(review): `arr` is only bound if the loop ran at least once;
+        # this assumes `parameter_values` is non-empty -- confirm with callers.
+        dtype = arr.dtype if len(dtypes)==1 else np.result_type(*list(dtypes))
+
+        return all_valid, dtype
+
+    def __str__(self):
+        r"""Returns a string representation of the parameterization."""
+        messages = [str(param) for name, param in self.parameters.items()]
+        return ", ".join(messages)
+
+    def draw(self, sizes=None, rng=None, proportions=None, region='domain'):
+        r"""Draw random values of all parameters for use in testing.
+
+        Parameters
+        ----------
+        sizes : iterable of shape tuples
+            The size of the array to be generated for each parameter in the
+            parameterization. Note that the order of sizes is arbitrary; the
+            size of the array generated for a specific parameter is not
+            controlled individually as written.
+        rng : NumPy Generator
+            The generator used to draw random values.
+        proportions : tuple
+            A tuple of four non-negative numbers that indicate the expected
+            relative proportion of elements that are within the parameter's
+            domain, are on the boundary of the parameter's domain, are outside
+            the parameter's domain, and have value NaN. For more information,
+            see the `draw` method of the _Parameter subclasses.
+        region : str
+            The region of the `_Parameter` domain from which to draw. Default
+            is "domain" (the *full* domain); alternative is "typical".
+
+        Returns
+        -------
+        parameter_values : dict (string: array)
+            A dictionary of parameter name/value pairs.
+        """
+        # ENH: be smart about the order. The domains of some parameters
+        # depend on others. If the relationship is simple (e.g. a < b < c),
+        # we can draw values in order a, b, c.
+        parameter_values = {}
+
+        # A missing, empty, or single (non-nested) size is reused for every
+        # parameter.
+        if sizes is None or not len(sizes) or not np.iterable(sizes[0]):
+            sizes = [sizes]*len(self.parameters)
+
+        # Values drawn so far are passed along so that later parameters can
+        # see the values of earlier ones.
+        for size, param in zip(sizes, self.parameters.values()):
+            parameter_values[param.name] = param.draw(
+                size, rng=rng, proportions=proportions,
+                parameter_values=parameter_values,
+                region=region
+            )
+
+        return parameter_values
+
+
+def _set_invalid_nan(f):
+    # Wrapper for input / output validation and standardization of distribution
+    # functions that accept either the quantile or percentile as an argument:
+    # logpdf, pdf
+    # logpmf, pmf
+    # logcdf, cdf
+    # logccdf, ccdf
+    # ilogcdf, icdf
+    # ilogccdf, iccdf
+    # Arguments that are outside the required range are replaced by NaN before
+    # passing them into the underlying function. The corresponding outputs
+    # are replaced by the appropriate value before being returned to the user.
+    # For example, when the argument of `cdf` exceeds the right end of the
+    # distribution's support, the wrapper replaces the argument with NaN,
+    # ignores the output of the underlying function, and returns 1.0. It also
+    # ensures that output is of the appropriate shape and dtype.
+
+    endpoints = {'icdf': (0, 1), 'iccdf': (0, 1),
+                 'ilogcdf': (-np.inf, 0), 'ilogccdf': (-np.inf, 0)}
+    replacements = {'logpdf': (-inf, -inf), 'pdf': (0, 0),
+                    'logpmf': (-inf, -inf), 'pmf': (0, 0),
+                    '_logcdf1': (-inf, 0), '_logccdf1': (0, -inf),
+                    '_cdf1': (0, 1), '_ccdf1': (1, 0)}
+    replace_strict = {'pdf', 'logpdf', 'pmf', 'logpmf'}
+    replace_exact = {'icdf', 'iccdf', 'ilogcdf', 'ilogccdf'}
+    clip = {'_cdf1', '_ccdf1'}
+    clip_log = {'_logcdf1', '_logccdf1'}
+    # relevant to discrete distributions only
+    replace_non_integral = {'pmf', 'logpmf', 'pdf', 'logpdf'}
+
+    @functools.wraps(f)
+    def filtered(self, x, *args, **kwargs):
+        if self.validation_policy == _SKIP_ALL:
+            return f(self, x, *args, **kwargs)
+
+        method_name = f.__name__
+        x = np.asarray(x)
+        dtype = self._dtype
+        shape = self._shape
+        discrete = isinstance(self, DiscreteDistribution)
+        keep_low_endpoint = discrete and method_name in {'_cdf1', '_logcdf1',
+                                                         '_ccdf1', '_logccdf1'}
+
+        # Ensure that argument is at least as precise as distribution
+        # parameters, which are already at least floats. This will avoid issues
+        # with raising integers to negative integer powers and failure to replace
+        # invalid integers with NaNs.
+        if x.dtype != dtype:
+            dtype = np.result_type(x.dtype, dtype)
+            x = np.asarray(x, dtype=dtype)
+
+        # Broadcasting is slow. Do it only if necessary.
+        if not x.shape == shape:
+            try:
+                shape = np.broadcast_shapes(x.shape, shape)
+                x = np.broadcast_to(x, shape)
+                # Should we broadcast the distribution parameters to this shape, too?
+            except ValueError as e:
+                message = (
+                    f"The argument provided to `{self.__class__.__name__}"
+                    f".{method_name}` cannot be be broadcast to the same "
+                    "shape as the distribution parameters.")
+                raise ValueError(message) from e
+
+        low, high = endpoints.get(method_name, self.support())
+
+        # Check for arguments outside of domain. They'll be replaced with NaNs,
+        # and the result will be set to the appropriate value.
+        left_inc, right_inc = self._variable.domain.inclusive
+        mask_low = (x < low if (method_name in replace_strict and left_inc)
+                    or keep_low_endpoint else x <= low)
+        mask_high = (x > high if (method_name in replace_strict and right_inc)
+                     else x >= high)
+        mask_invalid = (mask_low | mask_high)
+        any_invalid = (mask_invalid if mask_invalid.shape == ()
+                       else np.any(mask_invalid))
+
+        # Check for arguments at domain endpoints, whether they
+        # are part of the domain or not.
+        any_endpoint = False
+        if method_name in replace_exact:
+            mask_low_endpoint = (x == low)
+            mask_high_endpoint = (x == high)
+            mask_endpoint = (mask_low_endpoint | mask_high_endpoint)
+            any_endpoint = (mask_endpoint if mask_endpoint.shape == ()
+                            else np.any(mask_endpoint))
+
+        # Check for non-integral arguments to PMF method
+        # or PDF of a discrete distribution.
+        any_non_integral = False
+        if discrete and method_name in replace_non_integral:
+            mask_non_integral = (x != np.floor(x))
+            any_non_integral = (mask_non_integral if mask_non_integral.shape == ()
+                                else np.any(mask_non_integral))
+
+        # Set out-of-domain arguments to NaN. The result will be set to the
+        # appropriate value later.
+        if any_invalid:
+            x = np.array(x, dtype=dtype, copy=True)
+            x[mask_invalid] = np.nan
+
+        res = np.asarray(f(self, x, *args, **kwargs))
+
+        # Ensure that the result is the correct dtype and shape,
+        # copying (only once) if necessary.
+        res_needs_copy = False
+        if res.dtype != dtype:
+            dtype = np.result_type(dtype, self._dtype)
+            res_needs_copy = True
+
+        if res.shape != shape:  # faster to check first
+            res = np.broadcast_to(res, self._shape)
+            res_needs_copy = (res_needs_copy or any_invalid
+                              or any_endpoint or any_non_integral)
+
+        if res_needs_copy:
+            res = np.array(res, dtype=dtype, copy=True)
+
+        # For non-integral arguments to PMF (and PDF of discrete distribution)
+        # replace with zero.
+        if any_non_integral:
+            zero = -np.inf if method_name in {'logpmf', 'logpdf'} else 0
+            res[mask_non_integral & ~np.isnan(res)] = zero
+
+        # For arguments outside the function domain, replace results
+        if any_invalid:
+            replace_low, replace_high = (
+                replacements.get(method_name, (np.nan, np.nan)))
+            res[mask_low] = replace_low
+            res[mask_high] = replace_high
+
+        # For arguments at the endpoints of the domain, replace results
+        if any_endpoint:
+            a, b = self.support()
+            if a.shape != shape:
+                a = np.array(np.broadcast_to(a, shape), copy=True)
+                b = np.array(np.broadcast_to(b, shape), copy=True)
+
+            replace_low_endpoint = (
+                b[mask_low_endpoint] if method_name.endswith('ccdf')
+                else a[mask_low_endpoint])
+            replace_high_endpoint = (
+                a[mask_high_endpoint] if method_name.endswith('ccdf')
+                else b[mask_high_endpoint])
+
+            if not keep_low_endpoint:
+                res[mask_low_endpoint] = replace_low_endpoint
+            res[mask_high_endpoint] = replace_high_endpoint
+
+        # Clip probabilities to [0, 1]
+        if method_name in clip:
+            res = np.clip(res, 0., 1.)
+        elif method_name in clip_log:
+            res = res.real  # exp(res) > 0
+            res = np.clip(res, None, 0.)  # exp(res) < 1
+
+        return res[()]
+
+    return filtered
+
+
+def _set_invalid_nan_property(f):
+    # Wrapper for input / output validation and standardization of distribution
+    # functions that represent properties of the distribution itself:
+    # logentropy, entropy
+    # median, mode
+    # moment
+    # It ensures that the output is of the correct shape and dtype and that
+    # there are NaNs wherever the distribution parameters were invalid.
+
+    @functools.wraps(f)
+    def filtered(self, *args, **kwargs):
+        if self.validation_policy == _SKIP_ALL:
+            return f(self, *args, **kwargs)
+
+        res = f(self, *args, **kwargs)
+        if res is None:
+            # message could be more appropriate
+            raise NotImplementedError(self._not_implemented)
+
+        res = np.asarray(res)
+        needs_copy = False
+        dtype = res.dtype
+
+        if dtype != self._dtype:  # this won't work for logmoments (complex)
+            dtype = np.result_type(dtype, self._dtype)
+            needs_copy = True
+
+        if res.shape != self._shape:  # faster to check first
+            res = np.broadcast_to(res, self._shape)
+            needs_copy = needs_copy or self._any_invalid
+
+        if needs_copy:
+            res = res.astype(dtype=dtype, copy=True)
+
+        if self._any_invalid:
+            # may be redundant when quadrature is used, but not necessarily
+            # when formulas are used.
+            res[self._invalid] = np.nan
+
+        return res[()]
+
+    return filtered
+
+
+def _dispatch(f):
+    # For each public method (instance function) of a distribution (e.g. ccdf),
+    # there may be several ways ("method"s) that it can be computed (e.g. a
+    # formula, as the complement of the CDF, or via numerical integration).
+    # Each "method" is implemented by a different private method (instance
+    # function).
+    # This wrapper calls the appropriate private method based on the public
+    # method and any specified `method` keyword option.
+    # - If `method` is specified as a string (by the user), the appropriate
+    #   private method is called.
+    # - If `method` is None:
+    #   - The appropriate private method for the public method is looked up
+    #     in a cache.
+    #   - If the cache does not have an entry for the public method, the
+    #     appropriate "dispatch " function is called to determine which method
+    #     is most appropriate given the available private methods and
+    #     settings (e.g. tolerance).
+
+    @functools.wraps(f)
+    def wrapped(self, *args, method=None, **kwargs):
+        func_name = f.__name__
+        method = method or self._method_cache.get(func_name, None)
+        if callable(method):
+            pass
+        elif method is not None:
+            method = 'logexp' if method == 'log/exp' else method
+            method_name = func_name.replace('dispatch', method)
+            method = getattr(self, method_name)
+        else:
+            method = f(self, *args, method=method, **kwargs)
+            if func_name != '_sample_dispatch' and self.cache_policy != _NO_CACHE:
+                self._method_cache[func_name] = method
+
+        try:
+            return method(*args, **kwargs)
+        except KeyError as e:
+            raise NotImplementedError(self._not_implemented) from e
+
+    return wrapped
+
+
+def _cdf2_input_validation(f):
+    # Wrapper that does the job of `_set_invalid_nan` when `cdf` or `logcdf`
+    # is called with two quantile arguments.
+    # Let's keep it simple; no special cases for speed right now.
+    # The strategy is a bit different than for 1-arg `cdf` (and other methods
+    # covered by `_set_invalid_nan`). For 1-arg `cdf`, elements of `x` that
+    # are outside (or at the edge of) the support get replaced by `nan`,
+    # and then the results get replaced by the appropriate value (0 or 1).
+    # We *could* do something similar, dispatching to `_cdf1` in these
+    # cases. That would be a bit more robust, but it would also be quite
+    # a bit more complex, since we'd have to do different things when
+    # just `x` is out of bounds, when just `y` is out of bounds, and when
+    # both are out of bounds.
+    # I'm not going to do that right now. Instead, simply replace values
+    # outside the support by those at the edge of the support. Here, we also
+    # omit some of the optimizations that make `_set_invalid_nan` faster for
+    # simple arguments (e.g. float64 scalars).
+    # The behavior is selected by `f.__name__`: `_cdf2`, `_ccdf2`, `_logcdf2`;
+    # any other name takes the final `else` branches below.
+
+    @functools.wraps(f)
+    def wrapped(self, x, y, *args, **kwargs):
+        func_name = f.__name__
+
+        low, high = self.support()
+        x, y, low, high = np.broadcast_arrays(x, y, low, high)
+        dtype = np.result_type(x.dtype, y.dtype, self._dtype)
+        # yes, copy to avoid modifying input arrays
+        x, y = x.astype(dtype, copy=True), y.astype(dtype, copy=True)
+
+        # Swap arguments to ensure that x < y, and replace
+        # out-of domain arguments with domain endpoints. We'll
+        # transform the result later.
+        i_swap = y < x
+        # Boolean-mask indexing on the right-hand side produces copies, so
+        # the tuple assignment swaps without aliasing.
+        x[i_swap], y[i_swap] = y[i_swap], x[i_swap]
+        # Clamp both arguments into [low, high]
+        i = x < low
+        x[i] = low[i]
+        i = y < low
+        y[i] = low[i]
+        i = x > high
+        x[i] = high[i]
+        i = y > high
+        y[i] = high[i]
+
+        res = f(self, x, y, *args, **kwargs)
+
+        # Clipping probability to [0, 1]
+        if func_name in {'_cdf2', '_ccdf2'}:
+            res = np.clip(res, 0., 1.)
+        else:
+            res = np.clip(res, None, 0.)  # exp(res) < 1
+
+        # Transform the result to account for swapped argument order
+        res = np.asarray(res)
+        if func_name == '_cdf2':
+            # swapping the arguments negates the probability mass
+            res[i_swap] *= -1.
+        elif func_name == '_ccdf2':
+            # complement of the negated mass: 1 - (-(1 - res)) = 2 - res
+            res[i_swap] *= -1
+            res[i_swap] += 2.
+        elif func_name == '_logcdf2':
+            # log of a negative number: add pi*1j to the imaginary part.
+            # Promote to complex only if something was actually swapped.
+            res = np.asarray(res + 0j) if np.any(i_swap) else res
+            res[i_swap] = res[i_swap] + np.pi*1j
+        else:
+            # res[i_swap] is always positive and less than 1, so it's
+            # safe to ensure that the result is real
+            res[i_swap] = _logexpxmexpy(np.log(2), res[i_swap]).real
+        return res[()]
+
+    return wrapped
+
+
+def _fiinfo(x):
+    if np.issubdtype(x.dtype, np.inexact):
+        return np.finfo(x.dtype)
+    else:
+        return np.iinfo(x)
+
+
+def _kwargs2args(f, args=None, kwargs=None):
+    # Wraps a function that accepts a primary argument `x`, secondary
+    # arguments `args`, and secondary keyward arguments `kwargs` such that the
+    # wrapper accepts only `x` and `args`. The keyword arguments are extracted
+    # from `args` passed into the wrapper, and these are passed to the
+    # underlying function as `kwargs`.
+    # This is a temporary workaround until the scalar algorithms `_tanhsinh`,
+    # `_chandrupatla`, etc., support `kwargs` or can operate with compressing
+    # arguments to the callable.
+    args = args or []
+    kwargs = kwargs or {}
+    names = list(kwargs.keys())
+    n_args = len(args)
+
+    def wrapped(x, *args):
+        return f(x, *args[:n_args], **dict(zip(names, args[n_args:])))
+
+    args = tuple(args) + tuple(kwargs.values())
+
+    return wrapped, args
+
+
+def _logexpxmexpy(x, y):
+    """ Compute the log of the difference of the exponentials of two arguments.
+
+    Avoids over/underflow, but does not prevent loss of precision otherwise.
+
+    That is, returns ``log(exp(x) - exp(y))``, evaluated as a log-sum-exp of
+    `x` and ``y + pi*1j``. Elements where ``x == y`` are set to ``-inf``.
+    """
+    # TODO: properly avoid NaN when y is negative infinity
+    # TODO: silence warning with taking log of complex nan
+    # TODO: deal with x == y better
+    # Replace -inf (real part) in `y` by the most negative finite value of
+    # its dtype; work on a copy so the caller's array is not modified.
+    i = np.isneginf(np.real(y))
+    if np.any(i):
+        y = np.asarray(y.copy())
+        y[i] = np.finfo(y.dtype).min
+    x, y = np.broadcast_arrays(x, y)
+    # Adding pi*1j to `y` flips the sign of exp(y), turning the sum into a
+    # difference.
+    res = np.asarray(special.logsumexp([x, y+np.pi*1j], axis=0))
+    i = (x == y)
+    res[i] = -np.inf
+    return res
+
+
+def _guess_bracket(xmin, xmax):
+    a = np.full_like(xmin, -1.0)
+    b = np.ones_like(xmax)
+
+    i = np.isfinite(xmin) & np.isfinite(xmax)
+    a[i] = xmin[i]
+    b[i] = xmax[i]
+
+    i = np.isfinite(xmin) & ~np.isfinite(xmax)
+    a[i] = xmin[i]
+    b[i] = xmin[i] + 1
+
+    i = np.isfinite(xmax) & ~np.isfinite(xmin)
+    a[i] = xmax[i] - 1
+    b[i] = xmax[i]
+
+    return a, b
+
+
+def _log_real_standardize(x):
+    """Standardizes the (complex) logarithm of a real number.
+
+    The logarithm of a real number may be represented by a complex number with
+    imaginary part that is a multiple of pi*1j. Even multiples correspond with
+    a positive real and odd multiples correspond with a negative real.
+
+    Given a logarithm of a real number `x`, this function returns an equivalent
+    representation in a standard form: the log of a positive real has imaginary
+    part `0` and the log of a negative real has imaginary part `pi`.
+
+    """
+    shape = x.shape
+    x = np.atleast_1d(x)
+    real = np.real(x).astype(x.dtype)
+    complex = np.imag(x)
+    y = real
+    negative = np.exp(complex*1j) < 0.5
+    y[negative] = y[negative] + np.pi * 1j
+    return y.reshape(shape)[()]
+
+
+def _combine_docs(dist_family, *, include_examples=True):
+    fields = set(NumpyDocString.sections)
+    fields.remove('index')
+    if not include_examples:
+        fields.remove('Examples')
+
+    doc = ClassDoc(dist_family)
+    superdoc = ClassDoc(UnivariateDistribution)
+    for field in fields:
+        if field in {"Methods", "Attributes"}:
+            doc[field] = superdoc[field]
+        elif field in {"Summary"}:
+            pass
+        elif field == "Extended Summary":
+            doc[field].append(_generate_domain_support(dist_family))
+        elif field == 'Examples':
+            doc[field] = [_generate_example(dist_family)]
+        else:
+            doc[field] += superdoc[field]
+    return str(doc)
+
+
+def _generate_domain_support(dist_family):
+    n_parameterizations = len(dist_family._parameterizations)
+
+    domain = f"\nfor :math:`x \\in {dist_family._variable.domain}`.\n"
+
+    if n_parameterizations == 0:
+        support = """
+        This class accepts no distribution parameters.
+        """
+    elif n_parameterizations == 1:
+        support = f"""
+        This class accepts one parameterization:
+        {str(dist_family._parameterizations[0])}.
+        """
+    else:
+        number = {2: 'two', 3: 'three', 4: 'four', 5: 'five'}[
+            n_parameterizations]
+        parameterizations = [f"- {str(p)}" for p in
+                             dist_family._parameterizations]
+        parameterizations = "\n".join(parameterizations)
+        support = f"""
+        This class accepts {number} parameterizations:
+
+        {parameterizations}
+        """
+    support = "\n".join([line.lstrip() for line in support.split("\n")][1:])
+    return domain + support
+
+
+def _generate_example(dist_family):
+    # Generate the text of the "Examples" docstring section for
+    # `dist_family`. A concrete instance is drawn with a fixed seed, and the
+    # outputs shown in the example are produced by actually calling its
+    # methods while the text is assembled.
+    n_parameters = dist_family._num_parameters(0)
+    shapes = [()] * n_parameters
+    rng = np.random.default_rng(615681484984984)
+    i = 0  # index of the parameterization used for the example
+    dist = dist_family._draw(shapes, rng=rng, i_parameterization=i)
+
+    # NOTE(review): this second generator is not referenced again within
+    # this function.
+    rng = np.random.default_rng(2354873452)
+    name = dist_family.__name__
+    if n_parameters:
+        parameter_names = list(dist._parameterizations[i].parameters)
+        # Round the drawn values so the displayed instantiation is short;
+        # `X` is rebuilt from the rounded values so outputs match the text.
+        parameter_values = [round(getattr(dist, name), 2) for name in
+                            parameter_names]
+        name_values = [f"{name}={value}" for name, value in
+                       zip(parameter_names, parameter_values)]
+        instantiation = f"{name}({', '.join(name_values)})"
+        attributes = ", ".join([f"X.{param}" for param in dist._parameters])
+        X = dist_family(**dict(zip(parameter_names, parameter_values)))
+    else:
+        instantiation = f"{name}()"
+        X = dist
+
+    # Arguments at which the example evaluates the distribution functions
+    p = 0.32
+    x = round(X.icdf(p), 2)
+    y = round(X.icdf(2 * p), 2)  # noqa: F841
+
+    example = f"""
+    To use the distribution class, it must be instantiated using keyword
+    parameters corresponding with one of the accepted parameterizations.
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy import stats
+    >>> from scipy.stats import {name}
+    >>> X = {instantiation}
+
+    For convenience, the ``plot`` method can be used to visualize the density
+    and other functions of the distribution.
+
+    >>> X.plot()
+    >>> plt.show()
+
+    The support of the underlying distribution is available using the ``support``
+    method.
+
+    >>> X.support()
+    {X.support()}
+    """
+
+    if n_parameters:
+        example += f"""
+        The numerical values of parameters associated with all parameterizations
+        are available as attributes.
+
+        >>> {attributes}
+        {tuple(X._parameters.values())}
+        """
+
+    example += f"""
+    To evaluate the probability density/mass function of the underlying distribution
+    at argument ``x={x}``:
+
+    >>> x = {x}
+    >>> X.pdf(x), X.pmf(x)
+    {X.pdf(x), X.pmf(x)}
+
+    The cumulative distribution function, its complement, and the logarithm
+    of these functions are evaluated similarly.
+
+    >>> np.allclose(np.exp(X.logccdf(x)), 1 - X.cdf(x))
+    True
+    """
+
+    # When two-arg CDF is implemented for DiscreteDistribution, consider removing
+    # the special-casing here.
+    if issubclass(dist_family, ContinuousDistribution):
+        example_continuous = f"""
+    The inverse of these functions with respect to the argument ``x`` is also
+    available.
+
+    >>> logp = np.log(1 - X.ccdf(x))
+    >>> np.allclose(X.ilogcdf(logp), x)
+    True
+
+    Note that distribution functions and their logarithms also have two-argument
+    versions for working with the probability mass between two arguments. The
+    result tends to be more accurate than the naive implementation because it avoids
+    subtractive cancellation.
+
+    >>> y = {y}
+    >>> np.allclose(X.ccdf(x, y), 1 - (X.cdf(y) - X.cdf(x)))
+    True
+        """
+        example += example_continuous
+
+    example += f"""
+    There are methods for computing measures of central tendency,
+    dispersion, higher moments, and entropy.
+
+    >>> X.mean(), X.median(), X.mode()
+    {X.mean(), X.median(), X.mode()}
+
+    >>> X.variance(), X.standard_deviation()
+    {X.variance(), X.standard_deviation()}
+
+    >>> X.skewness(), X.kurtosis()
+    {X.skewness(), X.kurtosis()}
+
+    >>> np.allclose(X.moment(order=6, kind='standardized'),
+    ...             X.moment(order=6, kind='central') / X.variance()**3)
+    True
+    """
+
+    # When logentropy is implemented for DiscreteDistribution, remove special-casing
+    if issubclass(dist_family, ContinuousDistribution):
+        example += """
+    >>> np.allclose(np.exp(X.logentropy()), X.entropy())
+    True
+        """
+    else:
+        example += f"""
+        >>> X.entropy()
+        {X.entropy()}
+        """
+
+    example += f"""
+    Pseudo-random samples can be drawn from
+    the underlying distribution using ``sample``.
+
+    >>> X.sample(shape=(4,))
+    {repr(X.sample(shape=(4,)))}  # may vary
+    """
+    # remove the indentation due to use of block quote within function;
+    # eliminate blank first line
+    example = "\n".join([line.lstrip() for line in example.split("\n")][1:])
+    return example
+
+
+class UnivariateDistribution(_ProbabilityDistribution):
+    r""" Class that represents a continuous statistical distribution.
+
+    Parameters
+    ----------
+    tol : positive float, optional
+        The desired relative tolerance of calculations. Left unspecified,
+        calculations may be faster; when provided, calculations may be
+        more likely to meet the desired accuracy.
+    validation_policy : {None, "skip_all"}
+        Specifies the level of input validation to perform. Left unspecified,
+        input validation is performed to ensure appropriate behavior in edge
+        case (e.g. parameters out of domain, argument outside of distribution
+        support, etc.) and improve consistency of output dtype, shape, etc.
+        Pass ``'skip_all'`` to avoid the computational overhead of these
+        checks when rough edges are acceptable.
+    cache_policy : {None, "no_cache"}
+        Specifies the extent to which intermediate results are cached. Left
+        unspecified, intermediate results of some calculations (e.g. distribution
+        support, moments, etc.) are cached to improve performance of future
+        calculations. Pass ``'no_cache'`` to reduce memory reserved by the class
+        instance.
+
+    Attributes
+    ----------
+    All parameters are available as attributes.
+
+    Methods
+    -------
+    support
+
+    plot
+
+    sample
+
+    moment
+
+    mean
+    median
+    mode
+
+    variance
+    standard_deviation
+
+    skewness
+    kurtosis
+
+    pdf
+    logpdf
+
+    cdf
+    icdf
+    ccdf
+    iccdf
+
+    logcdf
+    ilogcdf
+    logccdf
+    ilogccdf
+
+    entropy
+    logentropy
+
+    See Also
+    --------
+    :ref:`rv_infrastructure` : Tutorial
+
+    Notes
+    -----
+    The following abbreviations are used throughout the documentation.
+
+    - PDF: probability density function
+    - CDF: cumulative distribution function
+    - CCDF: complementary CDF
+    - entropy: differential entropy
+    - log-*F*: logarithm of *F* (e.g. log-CDF)
+    - inverse *F*: inverse function of *F* (e.g. inverse CDF)
+
+    The API documentation is written to describe the API, not to serve as
+    a statistical reference. Effort is made to be correct at the level
+    required to use the functionality, not to be mathematically rigorous.
+    For example, continuity and differentiability may be implicitly assumed.
+    For precise mathematical definitions, consult your preferred mathematical
+    text.
+
+    """
+    __array_priority__ = 1
+    _parameterizations = []  # type: ignore[var-annotated]
+
+    ### Initialization
+
+    def __init__(self, *, tol=_null, validation_policy=None, cache_policy=None,
+                 **parameters):
+        self.tol = tol
+        self.validation_policy = validation_policy
+        self.cache_policy = cache_policy
+        self._not_implemented = (
+            f"`{self.__class__.__name__}` does not provide an accurate "
+            "implementation of the required method. Consider leaving "
+            "`method` and `tol` unspecified to use another implementation."
+        )
+        self._original_parameters = {}
+        # We may want to override the `__init__` method with parameters so
+        # IDEs can suggest parameter names. If there are multiple parameterizations,
+        # we'll need the default values of parameters to be None; this will
+        # filter out the parameters that were not actually specified by the user.
+        parameters = {key: val for key, val in
+                      sorted(parameters.items()) if val is not None}
+        self._update_parameters(**parameters)
+
+    def _update_parameters(self, *, validation_policy=None, **params):
+        r""" Update the numerical values of distribution parameters.
+
+        Parameters
+        ----------
+        **params : array_like
+            Desired numerical values of the distribution parameters. Any or all
+            of the parameters initially used to instantiate the distribution
+            may be modified. Parameters used in alternative parameterizations
+            are not accepted.
+
+        validation_policy : str
+            When truthy, takes precedence over the instance's
+            ``validation_policy`` for this call. If the effective policy
+            equals ``_SKIP_ALL``, the parameters are passed directly to
+            `_process_parameters`; identification of the parameterization,
+            broadcasting and validation are all skipped.
+        """
+
+        # `parameters` and `original_parameters` are two names for the SAME
+        # new dict, so the `update` below is visible through both. The name
+        # `parameters` is rebound in the branches below; `original_parameters`
+        # keeps referring to this merged dict.
+        parameters = original_parameters = self._original_parameters.copy()
+        parameters.update(**params)
+        # Defaults used when validation is skipped or when the family takes no
+        # parameters: nothing invalid, scalar shape, float64.
+        parameterization = None
+        self._invalid = np.asarray(False)
+        self._any_invalid = False
+        self._shape = tuple()
+        self._ndim = 0
+        self._size = 1
+        self._dtype = np.float64
+
+        if (validation_policy or self.validation_policy) == _SKIP_ALL:
+            parameters = self._process_parameters(**parameters)
+        elif not len(self._parameterizations):
+            # The family declares no parameterizations, so any parameter is
+            # an error.
+            if parameters:
+                message = (f"The `{self.__class__.__name__}` distribution "
+                           "family does not accept parameters, but parameters "
+                           f"`{set(parameters)}` were provided.")
+                raise ValueError(message)
+        else:
+            # This is default behavior, which re-runs all parameter validations
+            # even when only a single parameter is modified. For many
+            # distributions, the domain of a parameter doesn't depend on other
+            # parameters, so parameters could safely be modified without
+            # re-validating all other parameters. To handle these cases more
+            # efficiently, we could allow the developer  to override this
+            # behavior.
+
+            # Currently the user can only update the original parameterization.
+            # Even though that parameterization is already known,
+            # `_identify_parameterization` is called to produce a nice error
+            # message if the user passes other values. To be a little more
+            # efficient, we could detect whether the values passed are
+            # consistent with the original parameterization rather than finding
+            # it from scratch. However, we might want other parameterizations
+            # to be accepted, which would require other changes, so I didn't
+            # optimize this.
+
+            parameterization = self._identify_parameterization(parameters)
+            parameters, shape, size, ndim = self._broadcast(parameters)
+            parameters, invalid, any_invalid, dtype = (
+                self._validate(parameterization, parameters))
+            parameters = self._process_parameters(**parameters)
+
+            self._invalid = invalid
+            self._any_invalid = any_invalid
+            self._shape = shape
+            self._size = size
+            self._ndim = ndim
+            self._dtype = dtype
+
+        # Cached values were computed for the previous parameter values.
+        self.reset_cache()
+        self._parameters = parameters
+        self._parameterization = parameterization
+        self._original_parameters = original_parameters
+        for name in self._parameters.keys():
+            # Make parameters properties of the class; return values from the instance
+            # Note that the property is installed on the class (not the
+            # instance) and only if the class has no attribute of that name.
+            # `name_=name` binds the current name as a default argument so
+            # each property reads its own parameter.
+            if hasattr(self.__class__, name):
+                continue
+            setattr(self.__class__, name, property(lambda self_, name_=name:
+                                                   self_._parameters[name_].copy()[()]))
+
+    def reset_cache(self):
+        r""" Clear all cached values.
+
+        To improve the speed of some calculations, the distribution's support
+        and moments are cached.
+
+        This function is called automatically whenever the distribution
+        parameters are updated.
+
+        """
+        # We could offer finer control over what is cleared.
+        # For simplicity, these will still exist even if cache_policy is
+        # NO_CACHE; they just won't be populated. This allows caching to be
+        # turned on and off easily.
+        self._moment_raw_cache = {}
+        self._moment_central_cache = {}
+        self._moment_standardized_cache = {}
+        self._support_cache = None
+        self._method_cache = {}
+        self._constant_cache = None
+
+    def _identify_parameterization(self, parameters):
+        # Determine whether a `parameters` dictionary matches is consistent
+        # with one of the parameterizations of the distribution. If so,
+        # return that parameterization object; if not, raise an error.
+        #
+        # I've come back to this a few times wanting to avoid this explicit
+        # loop. I've considered several possibilities, but they've all been a
+        # little unusual. For example, we could override `_eq_` so we can
+        # use _parameterizations.index() to retrieve the parameterization,
+        # or the user could put the parameterizations in a dictionary so we
+        # could look them up with a key (e.g. frozenset of parameter names).
+        # I haven't been sure enough of these approaches to implement them.
+        parameter_names_set = set(parameters)
+
+        for parameterization in self._parameterizations:
+            if parameterization.matches(parameter_names_set):
+                break
+        else:
+            if not parameter_names_set:
+                message = (f"The `{self.__class__.__name__}` distribution "
+                           "family requires parameters, but none were "
+                           "provided.")
+            else:
+                parameter_names = self._get_parameter_str(parameters)
+                message = (f"The provided parameters `{parameter_names}` "
+                           "do not match a supported parameterization of the "
+                           f"`{self.__class__.__name__}` distribution family.")
+            raise ValueError(message)
+
+        return parameterization
+
+    def _broadcast(self, parameters):
+        # Broadcast the distribution parameters to the same shape. If the
+        # arrays are not broadcastable, raise a meaningful error.
+        #
+        # We always make sure that the parameters *are* the same shape
+        # and not just broadcastable. Users can access parameters as
+        # attributes, and I think they should see the arrays as the same shape.
+        # More importantly, arrays should be the same shape before logical
+        # indexing operations, which are needed in infrastructure code when
+        # there are invalid parameters, and may be needed in
+        # distribution-specific code. We don't want developers to need to
+        # broadcast in implementation functions.
+
+        # It's much faster to check whether broadcasting is necessary than to
+        # broadcast when it's not necessary.
+        parameter_vals = [np.asarray(parameter)
+                          for parameter in parameters.values()]
+        parameter_shapes = set(parameter.shape for parameter in parameter_vals)
+        if len(parameter_shapes) == 1:
+            return (parameters, parameter_vals[0].shape,
+                    parameter_vals[0].size, parameter_vals[0].ndim)
+
+        try:
+            parameter_vals = np.broadcast_arrays(*parameter_vals)
+        except ValueError as e:
+            parameter_names = self._get_parameter_str(parameters)
+            message = (f"The parameters `{parameter_names}` provided to the "
+                       f"`{self.__class__.__name__}` distribution family "
+                       "cannot be broadcast to the same shape.")
+            raise ValueError(message) from e
+        return (dict(zip(parameters.keys(), parameter_vals)),
+                parameter_vals[0].shape,
+                parameter_vals[0].size,
+                parameter_vals[0].ndim)
+
+    def _validate(self, parameterization, parameters):
+        # Broadcasts distribution parameter arrays and converts them to a
+        # consistent dtype. Replaces invalid parameters with `np.nan`.
+        # Returns the validated parameters, a boolean mask indicated *which*
+        # elements are invalid, a boolean scalar indicating whether *any*
+        # are invalid (to skip special treatments if none are invalid), and
+        # the common dtype.
+        valid, dtype = parameterization.validation(parameters)
+        invalid = ~valid
+        any_invalid = invalid if invalid.shape == () else np.any(invalid)
+        # If necessary, make the arrays contiguous and replace invalid with NaN
+        if any_invalid:
+            for parameter_name in parameters:
+                parameters[parameter_name] = np.copy(
+                    parameters[parameter_name])
+                parameters[parameter_name][invalid] = np.nan
+
+        return parameters, invalid, any_invalid, dtype
+
+    def _process_parameters(self, **params):
+        r""" Process and cache distribution parameters for reuse.
+
+        This is intended to be overridden by subclasses. It allows distribution
+        authors to pre-process parameters for re-use. For instance, when a user
+        parameterizes a LogUniform distribution with `a` and `b`, it makes
+        sense to calculate `log(a)` and `log(b)` because these values will be
+        used in almost all distribution methods. The dictionary returned by
+        this method is passed to all private methods that calculate functions
+        of the distribution.
+
+        The base implementation performs no processing: it returns the keyword
+        arguments it received as a dictionary.
+        """
+        return params
+
+    def _get_parameter_str(self, parameters):
+        # Get a string representation of the parameters like "{a, b, c}".
+        return f"{{{', '.join(parameters.keys())}}}"
+
+    def _copy_parameterization(self):
+        self._parameterizations = self._parameterizations.copy()
+        for i in range(len(self._parameterizations)):
+            self._parameterizations[i] = self._parameterizations[i].copy()
+
+    ### Attributes
+
+    # `tol` attribute is just notional right now. See Question 4 above.
+    @property
+    def tol(self):
+        r"""positive float:
+        The desired relative tolerance of calculations. Left unspecified,
+        calculations may be faster; when provided, calculations may be
+        more likely to meet the desired accuracy.
+        """
+        # `_tol` is written only by the setter: it holds either the value the
+        # setter recognized as null (via `_isnull`) or a validated scalar.
+        return self._tol
+
+    @tol.setter
+    def tol(self, tol):
+        if _isnull(tol):
+            self._tol = tol
+            return
+
+        tol = np.asarray(tol)
+        if (tol.shape != () or not tol > 0 or  # catches NaNs
+                not np.issubdtype(tol.dtype, np.floating)):
+            message = (f"Attribute `tol` of `{self.__class__.__name__}` must "
+                       "be a positive float, if specified.")
+            raise ValueError(message)
+        self._tol = tol[()]
+
+    @property
+    def cache_policy(self):
+        r"""{None, "no_cache"}:
+        Specifies the extent to which intermediate results are cached. Left
+        unspecified, intermediate results of some calculations (e.g. distribution
+        support, moments, etc.) are cached to improve performance of future
+        calculations. Pass ``'no_cache'`` to reduce memory reserved by the class
+        instance.
+        """
+        # The stored value has been normalized by the setter: it is either
+        # None or a lower-case string.
+        return self._cache_policy
+
+    @cache_policy.setter
+    def cache_policy(self, cache_policy):
+        cache_policy = str(cache_policy).lower() if cache_policy is not None else None
+        cache_policies = {None, 'no_cache'}
+        if cache_policy not in cache_policies:
+            message = (f"Attribute `cache_policy` of `{self.__class__.__name__}` "
+                       f"must be one of {cache_policies}, if specified.")
+            raise ValueError(message)
+        self._cache_policy = cache_policy
+
+    @property
+    def validation_policy(self):
+        r"""{None, "skip_all"}:
+        Specifies the level of input validation to perform. Left unspecified,
+        input validation is performed to ensure appropriate behavior in edge
+        case (e.g. parameters out of domain, argument outside of distribution
+        support, etc.) and improve consistency of output dtype, shape, etc.
+        Use ``'skip_all'`` to avoid the computational overhead of these
+        checks when rough edges are acceptable.
+        """
+        # The stored value has been normalized by the setter: it is either
+        # None or a lower-case string.
+        return self._validation_policy
+
+    @validation_policy.setter
+    def validation_policy(self, validation_policy):
+        validation_policy = (str(validation_policy).lower()
+                             if validation_policy is not None else None)
+        iv_policies = {None, 'skip_all'}
+        if validation_policy not in iv_policies:
+            message = (f"Attribute `validation_policy` of `{self.__class__.__name__}` "
+                       f"must be one of {iv_policies}, if specified.")
+            raise ValueError(message)
+        self._validation_policy = validation_policy
+
+    ### Other magic methods
+
+    def __repr__(self):
+        r""" Returns a string representation of the distribution.
+
+        Includes the name of the distribution family, the names of the
+        parameters and the `repr` of each of their values.
+
+
+        """
+        class_name = self.__class__.__name__
+        parameters = list(self._original_parameters.items())
+        info = []
+        with np.printoptions(threshold=10):
+            str_parameters = [f"{symbol}={repr(value)}" for symbol, value in parameters]
+        str_parameters = f"{', '.join(str_parameters)}"
+        info.append(str_parameters)
+        return f"{class_name}({', '.join(info)})"
+
+    def __str__(self):
+        class_name = self.__class__.__name__
+        parameters = list(self._original_parameters.items())
+        info = []
+        with np.printoptions(threshold=10):
+            str_parameters = [f"{symbol}={str(value)}" for symbol, value in parameters]
+        str_parameters = f"{', '.join(str_parameters)}"
+        info.append(str_parameters)
+        return f"{class_name}({', '.join(info)})"
+
+    def __add__(self, loc):
+        # `X + loc`: wrap this distribution in a `ShiftedScaledDistribution`
+        # with the given `loc`.
+        return ShiftedScaledDistribution(self, loc=loc)
+
+    def __sub__(self, loc):
+        # `X - loc`: same wrapper as `__add__`, with the sign of `loc` flipped.
+        return ShiftedScaledDistribution(self, loc=-loc)
+
+    def __mul__(self, scale):
+        # `X * scale`: wrap this distribution in a `ShiftedScaledDistribution`
+        # with the given `scale`.
+        return ShiftedScaledDistribution(self, scale=scale)
+
+    def __truediv__(self, scale):
+        # `X / scale`: expressed as multiplication by the reciprocal.
+        return ShiftedScaledDistribution(self, scale=1/scale)
+
+    def __pow__(self, other):
+        if not np.isscalar(other) or other <= 0 or other != int(other):
+            message = ("Raising a random variable to the power of an argument is only "
+                       "implemented when the argument is a positive integer.")
+            raise NotImplementedError(message)
+
+        # Fill in repr_pattern with the repr of self before taking abs.
+        # Avoids having unnecessary abs in the repr.
+        with np.printoptions(threshold=10):
+            repr_pattern = f"({repr(self)})**{repr(other)}"
+            str_pattern = f"({str(self)})**{str(other)}"
+        X = abs(self) if other % 2 == 0 else self
+
+        funcs = dict(g=lambda u: u**other, repr_pattern=repr_pattern,
+                     str_pattern=str_pattern,
+                     h=lambda u: np.sign(u) * np.abs(u)**(1 / other),
+                     dh=lambda u: 1/other * np.abs(u)**(1/other - 1))
+
+        return MonotonicTransformedDistribution(X, **funcs, increasing=True)
+
+    def __radd__(self, other):
+        # `other + X`: delegates to `__add__` with the operands swapped.
+        return self.__add__(other)
+
+    def __rsub__(self, other):
+        # `other - X` is computed as `(-X) + other`.
+        return self.__neg__().__add__(other)
+
+    def __rmul__(self, other):
+        # `other * X`: delegates to `__mul__` with the operands swapped.
+        return self.__mul__(other)
+
+    def __rtruediv__(self, other):
+        a, b = self.support()
+        with np.printoptions(threshold=10):
+            funcs = dict(g=lambda u: 1 / u,
+                         repr_pattern=f"{repr(other)}/({repr(self)})",
+                         str_pattern=f"{str(other)}/({str(self)})",
+                         h=lambda u: 1 / u, dh=lambda u: 1 / u ** 2)
+        if np.all(a >= 0) or np.all(b <= 0):
+            out = MonotonicTransformedDistribution(self, **funcs, increasing=False)
+        else:
+            message = ("Division by a random variable is only implemented "
+                       "when the support is either non-negative or non-positive.")
+            raise NotImplementedError(message)
+        if np.all(other == 1):
+            return out
+        else:
+            return out * other
+
+    def __rpow__(self, other):
+        with np.printoptions(threshold=10):
+            funcs = dict(g=lambda u: other**u,
+                         h=lambda u: np.log(u) / np.log(other),
+                         dh=lambda u: 1 / np.abs(u * np.log(other)),
+                         repr_pattern=f"{repr(other)}**({repr(self)})",
+                         str_pattern=f"{str(other)}**({str(self)})",)
+
+        if not np.isscalar(other) or other <= 0 or other == 1:
+            message = ("Raising an argument to the power of a random variable is only "
+                       "implemented when the argument is a positive scalar other than "
+                       "1.")
+            raise NotImplementedError(message)
+
+        if other > 1:
+            return MonotonicTransformedDistribution(self, **funcs, increasing=True)
+        else:
+            return MonotonicTransformedDistribution(self, **funcs, increasing=False)
+
+    def __neg__(self):
+        # `-X` is expressed as multiplication by -1 (see `__mul__`).
+        return self * -1
+
+    def __abs__(self):
+        # `abs(X)`: wrap this distribution in a `FoldedDistribution`.
+        return FoldedDistribution(self)
+
+    ### Utilities
+
+    ## Input validation
+
+    def _validate_order_kind(self, order, kind, kinds):
+        # Yet another integer validating function. Unlike others in SciPy, it
+        # Is quite flexible about what is allowed as an integer, and it
+        # raises a distribution-specific error message to facilitate
+        # identification of the source of the error.
+        if self.validation_policy == _SKIP_ALL:
+            return order
+
+        order = np.asarray(order, dtype=self._dtype)[()]
+        message = (f"Argument `order` of `{self.__class__.__name__}.moment` "
+                   "must be a finite, positive integer.")
+        try:
+            order_int = round(order.item())
+            # If this fails for any reason (e.g. it's an array, it's infinite)
+            # it's not a valid `order`.
+        except Exception as e:
+            raise ValueError(message) from e
+
+        if order_int <0 or order_int != order:
+            raise ValueError(message)
+
+        message = (f"Argument `kind` of `{self.__class__.__name__}.moment` "
+                   f"must be one of {set(kinds)}.")
+        if kind.lower() not in kinds:
+            raise ValueError(message)
+
+        return order
+
+    def _preserve_type(self, x):
+        x = np.asarray(x)
+        if x.dtype != self._dtype:
+            x = x.astype(self._dtype)
+        return x[()]
+
+    ## Testing
+
+    @classmethod
+    def _draw(cls, sizes=None, rng=None, i_parameterization=None,
+              proportions=None):
+        r""" Draw a specific (fully-defined) distribution from the family.
+
+        See _Parameterization.draw for documentation details.
+        """
+        rng = np.random.default_rng(rng)
+        if len(cls._parameterizations) == 0:
+            return cls()
+        if i_parameterization is None:
+            n = cls._num_parameterizations()
+            i_parameterization = rng.integers(0, max(0, n - 1), endpoint=True)
+
+        parameterization = cls._parameterizations[i_parameterization]
+        parameters = parameterization.draw(sizes, rng, proportions=proportions,
+                                           region='typical')
+        return cls(**parameters)
+
+    @classmethod
+    def _num_parameterizations(cls):
+        # Returns the number of parameterizations accepted by the family,
+        # i.e. the length of the class-level `_parameterizations` sequence.
+        return len(cls._parameterizations)
+
+    @classmethod
+    def _num_parameters(cls, i_parameterization=0):
+        # Returns the number of parameters used in the specified
+        # parameterization.
+        return (0 if not cls._num_parameterizations()
+                else len(cls._parameterizations[i_parameterization]))
+
+    ## Algorithms
+
+    def _quadrature(self, integrand, limits=None, args=None,
+                    params=None, log=False):
+        # Performs numerical integration of an integrand between limits.
+        # Much of this should be added to `_tanhsinh`.
+        a, b = self._support(**params) if limits is None else limits
+        a, b = np.broadcast_arrays(a, b)
+        if not a.size:
+            # maybe need to figure out result type from a, b
+            return np.empty(a.shape, dtype=self._dtype)
+        args = [] if args is None else args
+        params = {} if params is None else params
+        f, args = _kwargs2args(integrand, args=args, kwargs=params)
+        args = np.broadcast_arrays(*args)
+        # If we know the median or mean, consider breaking up the interval
+        rtol = None if _isnull(self.tol) else self.tol
+        # For now, we ignore the status, but I want to return the error
+        # estimate - see question 5 at the top.
+        if isinstance(self, ContinuousDistribution):
+            res = _tanhsinh(f, a, b, args=args, log=log, rtol=rtol)
+            return res.integral
+        else:
+            res = nsum(f, a, b, args=args, log=log, tolerances=dict(rtol=rtol)).sum
+            res = np.asarray(res)
+            # The result should be nan when parameters are nan, so need to special
+            # case this.
+            cond = np.isnan(params.popitem()[1]) if params else np.True_
+            cond = np.broadcast_to(cond, a.shape)
+            res[(a > b)] = -np.inf if log else 0  # fix in nsum?
+            res[cond] = np.nan
+
+            return res[()]
+
+    def _solve_bounded(self, f, p, *, bounds=None, params=None, xatol=None):
+        # Finds the argument of a function that produces the desired output.
+        # Much of this should be added to _bracket_root / _chandrupatla.
+        xmin, xmax = self._support(**params) if bounds is None else bounds
+        params = {} if params is None else params
+
+        p, xmin, xmax = np.broadcast_arrays(p, xmin, xmax)
+        if not p.size:
+            # might need to figure out result type based on p
+            res = _RichResult()
+            empty = np.empty(p.shape, dtype=self._dtype)
+            res.xl, res.x, res.xr = empty, empty, empty
+            res.fl, res.fr = empty, empty
+
+        def f2(x, _p, **kwargs):  # named `_p` to avoid conflict with shape `p`
+            return f(x, **kwargs) - _p
+
+        f3, args = _kwargs2args(f2, args=[p], kwargs=params)
+        # If we know the median or mean, should use it
+
+        # Any operations between 0d array and a scalar produces a scalar, so...
+        shape = xmin.shape
+        xmin, xmax = np.atleast_1d(xmin, xmax)
+
+        xl0, xr0 = _guess_bracket(xmin, xmax)
+        xmin = xmin.reshape(shape)
+        xmax = xmax.reshape(shape)
+        xl0 = xl0.reshape(shape)
+        xr0 = xr0.reshape(shape)
+
+        res = _bracket_root(f3, xl0=xl0, xr0=xr0, xmin=xmin, xmax=xmax, args=args)
+        # For now, we ignore the status, but I want to use the bracket width
+        # as an error estimate - see question 5 at the top.
+
+        xrtol = None if _isnull(self.tol) else self.tol
+        xatol = None if xatol is None else xatol
+        tolerances = dict(xrtol=xrtol, xatol=xatol, fatol=0, frtol=0)
+        return _chandrupatla(f3, a=res.xl, b=res.xr, args=args, **tolerances)
+
+    ## Other
+
+    def _overrides(self, method_name):
+        # Determines whether a class overrides a specified method.
+        # Returns True if the attribute `method_name` looked up on the class of
+        # `self` is a different object than the one looked up on
+        # `UnivariateDistribution`; returns False if they are the same object
+        # (which includes the case in which neither class has the attribute).
+
+        # Sometimes we use `_overrides` to check whether a certain method is overridden
+        # and if so, call it. This raises the question of why we don't do the more
+        # obvious thing: restructure so that if the private method is overridden,
+        # Python will call it instead of the inherited version automatically. The short
+        # answer is that there are multiple ways a user might wish to evaluate a method,
+        # and simply overriding the method with a formula is not always the best option.
+        # For more complete discussion of the considerations, see:
+        # https://github.com/scipy/scipy/pull/21050#discussion_r1707798901
+        method = getattr(self.__class__, method_name, None)
+        super_method = getattr(UnivariateDistribution, method_name, None)
+        return method is not super_method
+
+    ### Distribution properties
+    # The following "distribution properties" are exposed via a public method
+    # that accepts only options (not distribution parameters or quantile/
+    # percentile argument).
+    # support
+    # logentropy, entropy,
+    # median, mode, mean,
+    # variance, standard_deviation
+    # skewness, kurtosis
+    # Common options are:
+    # method - a string that indicates which method should be used to compute
+    #          the quantity (e.g. a formula or numerical integration).
+    # Input/output validation is provided by the `_set_invalid_nan_property`
+    # decorator. These are the methods meant to be called by users.
+    #
+    # Each public method calls a private "dispatch" method that
+    # determines which "method" (strategy for calculating the desired quantity)
+    # to use by default and, via the `@_dispatch` decorator, calls the
+    # method and computes the result.
+    # Dispatch methods always accept:
+    # method - as passed from the public method
+    # params - a dictionary of distribution shape parameters passed by
+    #          the public method.
+    # Dispatch methods accept `params` rather than relying on the state of the
+    # object because iterative algorithms like `_tanhsinh` and `_chandrupatla`
+    # need their callable to follow a strict elementwise protocol: each element
+    # of the output is determined solely by the values of the inputs at the
+    # corresponding location. The public methods do not satisfy this protocol
+    # because they do not accept the parameters as arguments, producing an
+    # output that generally has a different shape than that of the input. Also,
+    # by calling "dispatch" methods rather than the public methods, the
+    # iterative algorithms avoid the overhead of input validation.
+    #
+    # Each dispatch method can designate the responsibility of computing
+    # the required value to any of several "implementation" methods. These
+    # methods accept only `**params`, the parameter dictionary passed from
+    # the public method via the dispatch method. We separate the implementation
+    # methods from the dispatch methods for the sake of simplicity (via
+    # compartmentalization) and to allow subclasses to override certain
+    # implementation methods (typically only the "formula" methods). The names
+    # of implementation methods are combinations of the public method name and
+    # the name of the "method" (strategy for calculating the desired quantity)
+    # string. (In fact, the name of the implementation method is calculated
+    # from these two strings in the `_dispatch` decorator.) Common method
+    # strings are:
+    # formula - distribution-specific analytical expressions to be implemented
+    #           by subclasses.
+    # log/exp - Compute the log of a number and then exponentiate it or vice
+    #           versa.
+    # quadrature - Compute the value via numerical integration.
+    #
+    # The default method (strategy) is determined based on what implementation
+    # methods are available and the error tolerance of the user. Typically,
+    # a formula is always used if available. We fall back to "log/exp" if a
+    # formula for the logarithm or exponential of the quantity is available,
+    # and we use quadrature otherwise.
+
+    def support(self):
+        # If this were a `cached_property`, we couldn't update the value
+        # when the distribution parameters change.
+        # Caching is important, though, because calls to _support take a few
+        # microseconds even when `a` and `b` are already the same shape.
+        if self._support_cache is not None:
+            return self._support_cache
+
+        a, b = self._support(**self._parameters)
+        if a.shape != self._shape:
+            a = np.broadcast_to(a, self._shape)
+        if b.shape != self._shape:
+            b = np.broadcast_to(b, self._shape)
+
+        if self._any_invalid:
+            a, b = np.asarray(a).copy(), np.asarray(b).copy()
+            a[self._invalid], b[self._invalid] = np.nan, np.nan
+            a, b = a[()], b[()]
+
+        support = (a, b)
+
+        if self.cache_policy != _NO_CACHE:
+            self._support_cache = support
+
+        return support
+
+    def _support(self, **params):
+        # Computes the support given distribution parameters
+        a, b = self._variable.domain.get_numerical_endpoints(params)
+        if len(params):
+            # the parameters should all be of the same dtype and shape at this point
+            vals = list(params.values())
+            shape = vals[0].shape
+            a = np.broadcast_to(a, shape) if a.shape != shape else a
+            b = np.broadcast_to(b, shape) if b.shape != shape else b
+        return self._preserve_type(a), self._preserve_type(b)
+
+    @_set_invalid_nan_property
+    def logentropy(self, *, method=None):
+        return self._logentropy_dispatch(method=method, **self._parameters) + 0j
+
+    @_dispatch
+    def _logentropy_dispatch(self, method=None, **params):
+        if self._overrides('_logentropy_formula'):
+            method = self._logentropy_formula
+        elif self._overrides('_entropy_formula'):
+            method = self._logentropy_logexp_safe
+        else:
+            method = self._logentropy_quadrature
+        return method
+
+    def _logentropy_formula(self, **params):
+        raise NotImplementedError(self._not_implemented)
+
+    def _logentropy_logexp(self, **params):
+        res = np.log(self._entropy_dispatch(**params)+0j)
+        return _log_real_standardize(res)
+
+    def _logentropy_logexp_safe(self, **params):
+        out = self._logentropy_logexp(**params)
+        mask = np.isinf(out.real)
+        if np.any(mask):
+            params_mask = {key:val[mask] for key, val in params.items()}
+            out = np.asarray(out)
+            out[mask] = self._logentropy_quadrature(**params_mask)
+        return out[()]
+
+    def _logentropy_quadrature(self, **params):
+        def logintegrand(x, **params):
+            logpxf = self._logpxf_dispatch(x, **params)
+            return logpxf + np.log(0j+logpxf)
+        res = self._quadrature(logintegrand, params=params, log=True)
+        return _log_real_standardize(res + np.pi*1j)
+
+    @_set_invalid_nan_property
+    def entropy(self, *, method=None):
+        return self._entropy_dispatch(method=method, **self._parameters)
+
+    @_dispatch
+    def _entropy_dispatch(self, method=None, **params):
+        if self._overrides('_entropy_formula'):
+            method = self._entropy_formula
+        elif self._overrides('_logentropy_formula'):
+            method = self._entropy_logexp
+        else:
+            method = self._entropy_quadrature
+        return method
+
+    def _entropy_formula(self, **params):
+        raise NotImplementedError(self._not_implemented)
+
+    def _entropy_logexp(self, **params):
+        return np.real(np.exp(self._logentropy_dispatch(**params)))
+
+    def _entropy_quadrature(self, **params):
+        # Compute the entropy by passing the integrand -pxf * log(pxf) to
+        # `self._quadrature`.
+        def integrand(x, **params):
+            pxf = self._pxf_dispatch(x, **params)
+            logpxf = self._logpxf_dispatch(x, **params)
+            # NOTE(review): `np.asarray` does not copy an existing ndarray,
+            # so `temp` may alias `pxf` and the assignment below would then
+            # overwrite `pxf` in place - presumably harmless because `pxf`
+            # is freshly computed; confirm.
+            temp = np.asarray(pxf)
+            i = (pxf != 0)  # 0 * inf -> nan; should be 0
+            # Where pxf == 0 the element keeps its value of 0.
+            temp[i] = -pxf[i]*logpxf[i]
+            return temp
+        return self._quadrature(integrand, params=params)
+
+    @_set_invalid_nan_property
+    def median(self, *, method=None):
+        return self._median_dispatch(method=method, **self._parameters)
+
+    @_dispatch
+    def _median_dispatch(self, method=None, **params):
+        if self._overrides('_median_formula'):
+            method = self._median_formula
+        else:
+            method = self._median_icdf
+        return method
+
+    def _median_formula(self, **params):
+        raise NotImplementedError(self._not_implemented)
+
+    def _median_icdf(self, **params):
+        return self._icdf_dispatch(np.asarray(0.5, dtype=self._dtype), **params)
+
+    @_set_invalid_nan_property
+    def mode(self, *, method=None):
+        return self._mode_dispatch(method=method, **self._parameters)
+
+    @_dispatch
+    def _mode_dispatch(self, method=None, **params):
+        # We could add a method that looks for a critical point with
+        # differentiation and the root finder
+        if self._overrides('_mode_formula'):
+            method = self._mode_formula
+        else:
+            method = self._mode_optimization
+        return method
+
+    def _mode_formula(self, **params):
+        raise NotImplementedError(self._not_implemented)
+
+    def _mode_optimization(self, xatol=None, **params):
+        # Locate the mode numerically by minimizing the negated PDF/PMF,
+        # starting the bracket search from the median.
+        if not self._size:
+            # Nothing to optimize: return an empty array of the right
+            # shape and dtype.
+            return np.empty(self._shape, dtype=self._dtype)
+
+        a, b = self._support(**params)
+        m = self._median_dispatch(**params)
+
+        # Convert the keyword-parameter callable into one taking positional
+        # `args`, as the bracket/minimize helpers are called with `args=`.
+        f, args = _kwargs2args(lambda x, **params: -self._pxf_dispatch(x, **params),
+                               args=(), kwargs=params)
+        res_b = _bracket_minimum(f, m, xmin=a, xmax=b, args=args)
+        res = _chandrupatla_minimize(f, res_b.xl, res_b.xm, res_b.xr,
+                                     args=args, xatol=xatol)
+        mode = np.asarray(res.x)
+        # NOTE(review): status -1 presumably means the bracket search ran
+        # into an endpoint of the support - confirm against
+        # `_bracket_minimum`.
+        mode_at_boundary = res_b.status == -1
+        # Decide which endpoint by comparing the objective at the bracket's
+        # left/right points with its middle point.
+        mode_at_left = mode_at_boundary & (res_b.fl <= res_b.fm)
+        mode_at_right = mode_at_boundary & (res_b.fr < res_b.fm)
+        mode[mode_at_left] = a[mode_at_left]
+        mode[mode_at_right] = b[mode_at_right]
+        # `[()]` turns a 0-d array back into a scalar.
+        return mode[()]
+
+    def mean(self, *, method=None):
+        return self.moment(1, kind='raw', method=method)
+
+    def variance(self, *, method=None):
+        return self.moment(2, kind='central', method=method)
+
+    def standard_deviation(self, *, method=None):
+        return np.sqrt(self.variance(method=method))
+
+    def skewness(self, *, method=None):
+        return self.moment(3, kind='standardized', method=method)
+
+    def kurtosis(self, *, method=None, convention='non-excess'):
+        conventions = {'non-excess', 'excess'}
+        message = (f'Parameter `convention` of `{self.__class__.__name__}.kurtosis` '
+                   f"must be one of {conventions}.")
+        convention = convention.lower()
+        if convention not in conventions:
+            raise ValueError(message)
+        k = self.moment(4, kind='standardized', method=method)
+        return k - 3 if convention == 'excess' else k
+
+    ### Distribution functions
+    # The following functions related to the distribution PDF and CDF are
+    # exposed via a public method that accepts one positional argument - the
+    # quantile - and keyword options (but not distribution parameters).
+    # logpdf, pdf
+    # logcdf, cdf
+    # logccdf, ccdf
+    # The `logcdf` and `cdf` functions can also be called with two positional
+    # arguments - lower and upper quantiles - and they return the probability
+    # mass (integral of the PDF) between them. The 2-arg versions of `logccdf`
+    # and `ccdf` return the complement of this quantity.
+    # All the (1-arg) cumulative distribution functions have inverse
+    # functions, which accept one positional argument - the percentile.
+    # ilogcdf, icdf
+    # ilogccdf, iccdf
+    # Common keyword options include:
+    # method - a string that indicates which method should be used to compute
+    #          the quantity (e.g. a formula or numerical integration).
+    # Tolerance options should be added.
+    # Input/output validation is provided by the `_set_invalid_nan`
+    # decorator. These are the methods meant to be called by users.
+    #
+    # Each public method calls a private "dispatch" method that
+    # determines which "method" (strategy for calculating the desired quantity)
+    # to use by default and, via the `@_dispatch` decorator, calls the
+    # method and computes the result.
+    # Each dispatch method can designate the responsibility of computing
+    # the required value to any of several "implementation" methods. These
+    # methods accept only `**params`, the parameter dictionary passed from
+    # the public method via the dispatch method.
+    # See the note corresponding with the "Distribution Parameters" for more
+    # information.
+
+    ## Probability Density/Mass Functions
+
+    @_set_invalid_nan
+    def logpdf(self, x, /, *, method=None):
+        return self._logpdf_dispatch(x, method=method, **self._parameters)
+
+    @_dispatch
+    def _logpdf_dispatch(self, x, *, method=None, **params):
+        if self._overrides('_logpdf_formula'):
+            method = self._logpdf_formula
+        elif _isnull(self.tol):  # ensure that developers override _logpdf
+            method = self._logpdf_logexp
+        return method
+
+    def _logpdf_formula(self, x, **params):
+        raise NotImplementedError(self._not_implemented)
+
+    def _logpdf_logexp(self, x, **params):
+        return np.log(self._pdf_dispatch(x, **params))
+
+    @_set_invalid_nan
+    def pdf(self, x, /, *, method=None):
+        return self._pdf_dispatch(x, method=method, **self._parameters)
+
+    @_dispatch
+    def _pdf_dispatch(self, x, *, method=None, **params):
+        if self._overrides('_pdf_formula'):
+            method = self._pdf_formula
+        else:
+            method = self._pdf_logexp
+        return method
+
+    def _pdf_formula(self, x, **params):
+        raise NotImplementedError(self._not_implemented)
+
+    def _pdf_logexp(self, x, **params):
+        return np.exp(self._logpdf_dispatch(x, **params))
+
+    @_set_invalid_nan
+    def logpmf(self, x, /, *, method=None):
+        return self._logpmf_dispatch(x, method=method, **self._parameters)
+
+    @_dispatch
+    def _logpmf_dispatch(self, x, *, method=None, **params):
+        if self._overrides('_logpmf_formula'):
+            method = self._logpmf_formula
+        elif _isnull(self.tol):  # ensure that developers override _logpmf
+            method = self._logpmf_logexp
+        return method
+
+    def _logpmf_formula(self, x, **params):
+        raise NotImplementedError(self._not_implemented)
+
+    def _logpmf_logexp(self, x, **params):
+        with np.errstate(divide='ignore'):
+            return np.log(self._pmf_dispatch(x, **params))
+
+    @_set_invalid_nan
+    def pmf(self, x, /, *, method=None):
+        return self._pmf_dispatch(x, method=method, **self._parameters)
+
+    @_dispatch
+    def _pmf_dispatch(self, x, *, method=None, **params):
+        if self._overrides('_pmf_formula'):
+            method = self._pmf_formula
+        else:
+            method = self._pmf_logexp
+        return method
+
+    def _pmf_formula(self, x, **params):
+        raise NotImplementedError(self._not_implemented)
+
+    def _pmf_logexp(self, x, **params):
+        return np.exp(self._logpmf_dispatch(x, **params))
+
+    ## Cumulative Distribution Functions
+
+    def logcdf(self, x, y=None, /, *, method=None):
+        if y is None:
+            return self._logcdf1(x, method=method)
+        else:
+            return self._logcdf2(x, y, method=method)
+
+    @_cdf2_input_validation
+    def _logcdf2(self, x, y, *, method):
+        out = self._logcdf2_dispatch(x, y, method=method, **self._parameters)
+        return (out + 0j) if not np.issubdtype(out.dtype, np.complexfloating) else out
+
+    @_dispatch
+    def _logcdf2_dispatch(self, x, y, *, method=None, **params):
+        # dtype is complex if any x > y, else real
+        # Should revisit this logic.
+        if self._overrides('_logcdf2_formula'):
+            method = self._logcdf2_formula
+        elif (self._overrides('_logcdf_formula')
+              or self._overrides('_logccdf_formula')):
+            method = self._logcdf2_subtraction
+        elif (self._overrides('_cdf_formula')
+              or self._overrides('_ccdf_formula')):
+            method = self._logcdf2_logexp_safe
+        else:
+            method = self._logcdf2_quadrature
+        return method
+
+    def _logcdf2_formula(self, x, y, **params):
+        raise NotImplementedError(self._not_implemented)
+
+    def _logcdf2_subtraction(self, x, y, **params):
+        # Compute the log of the probability mass between `x` and `y` from
+        # the 1-arg log-CDF and log-CCDF, choosing per element whichever
+        # difference is formed from the smaller quantities.
+        flip_sign = x > y  # some results will be negative
+        # Order the limits; the sign is restored at the end.
+        x, y = np.minimum(x, y), np.maximum(x, y)
+        logcdf_x = self._logcdf_dispatch(x, **params)
+        logcdf_y = self._logcdf_dispatch(y, **params)
+        logccdf_x = self._logccdf_dispatch(x, **params)
+        logccdf_y = self._logccdf_dispatch(y, **params)
+        # Left case: both log-CDFs are below -1. Right case: both log-CCDFs
+        # are below -1. Everything else is "central".
+        case_left = (logcdf_x < -1) & (logcdf_y < -1)
+        case_right = (logccdf_x < -1) & (logccdf_y < -1)
+        case_central = ~(case_left | case_right)
+        # Default (kept for the left case): log(cdf(y) - cdf(x)).
+        log_mass = _logexpxmexpy(logcdf_y, logcdf_x)
+        # Right case: log(ccdf(x) - ccdf(y)).
+        log_mass[case_right] = _logexpxmexpy(logccdf_x, logccdf_y)[case_right]
+        # Central case: log(1 - (cdf(x) + ccdf(y))).
+        log_tail = np.logaddexp(logcdf_x, logccdf_y)[case_central]
+        log_mass[case_central] = _log1mexp(log_tail)
+        # Adding i*pi in log space negates the mass where x > y.
+        log_mass[flip_sign] += np.pi * 1j
+        # Return only the real part when no element was flipped.
+        return log_mass[()] if np.any(flip_sign) else log_mass.real[()]
+
+    def _logcdf2_logexp(self, x, y, **params):
+        expres = self._cdf2_dispatch(x, y, **params)
+        expres = expres + 0j if np.any(x > y) else expres
+        return np.log(expres)
+
+    def _logcdf2_logexp_safe(self, x, y, **params):
+        out = self._logcdf2_logexp(x, y, **params)
+        mask = np.isinf(out.real)
+        if np.any(mask):
+            params_mask = {key: np.broadcast_to(val, mask.shape)[mask]
+                           for key, val in params.items()}
+            out = np.asarray(out)
+            out[mask] = self._logcdf2_quadrature(x[mask], y[mask], **params_mask)
+        return out[()]
+
+    def _logcdf2_quadrature(self, x, y, **params):
+        logres = self._quadrature(self._logpxf_dispatch, limits=(x, y),
+                                  log=True, params=params)
+        return logres
+
+    @_set_invalid_nan
+    def _logcdf1(self, x, *, method=None):
+        return self._logcdf_dispatch(x, method=method, **self._parameters)
+
+    @_dispatch
+    def _logcdf_dispatch(self, x, *, method=None, **params):
+        if self._overrides('_logcdf_formula'):
+            method = self._logcdf_formula
+        elif self._overrides('_logccdf_formula'):
+            method = self._logcdf_complement
+        elif self._overrides('_cdf_formula'):
+            method = self._logcdf_logexp_safe
+        else:
+            method = self._logcdf_quadrature
+        return method
+
+    def _logcdf_formula(self, x, **params):
+        raise NotImplementedError(self._not_implemented)
+
+    def _logcdf_complement(self, x, **params):
+        return _log1mexp(self._logccdf_dispatch(x, **params))
+
+    def _logcdf_logexp(self, x, **params):
+        return np.log(self._cdf_dispatch(x, **params))
+
+    def _logcdf_logexp_safe(self, x, **params):
+        out = self._logcdf_logexp(x, **params)
+        mask = np.isinf(out)
+        if np.any(mask):
+            params_mask = {key:np.broadcast_to(val, mask.shape)[mask]
+                           for key, val in params.items()}
+            out = np.asarray(out)
+            out[mask] = self._logcdf_quadrature(x[mask], **params_mask)
+        return out[()]
+
+    def _logcdf_quadrature(self, x, **params):
+        a, _ = self._support(**params)
+        return self._quadrature(self._logpxf_dispatch, limits=(a, x),
+                                params=params, log=True)
+
+    def cdf(self, x, y=None, /, *, method=None):
+        if y is None:
+            return self._cdf1(x, method=method)
+        else:
+            return self._cdf2(x, y, method=method)
+
+    @_cdf2_input_validation
+    def _cdf2(self, x, y, *, method):
+        return self._cdf2_dispatch(x, y, method=method, **self._parameters)
+
+    @_dispatch
+    def _cdf2_dispatch(self, x, y, *, method=None, **params):
+        # Should revisit this logic.
+        if self._overrides('_cdf2_formula'):
+            method = self._cdf2_formula
+        elif (self._overrides('_logcdf_formula')
+              or self._overrides('_logccdf_formula')):
+            method = self._cdf2_logexp
+        elif self._overrides('_cdf_formula') or self._overrides('_ccdf_formula'):
+            method = self._cdf2_subtraction_safe
+        else:
+            method = self._cdf2_quadrature
+        return method
+
+    def _cdf2_formula(self, x, y, **params):
+        raise NotImplementedError(self._not_implemented)
+
+    def _cdf2_logexp(self, x, y, **params):
+        return np.real(np.exp(self._logcdf2_dispatch(x, y, **params)))
+
+    def _cdf2_subtraction(self, x, y, **params):
+        # Improvements:
+        # Lazy evaluation of cdf/ccdf only where needed
+        # Stack x and y to reduce function calls?
+        cdf_x = self._cdf_dispatch(x, **params)
+        cdf_y = self._cdf_dispatch(y, **params)
+        ccdf_x = self._ccdf_dispatch(x, **params)
+        ccdf_y = self._ccdf_dispatch(y, **params)
+        i = (ccdf_x < 0.5) & (ccdf_y < 0.5)
+        return np.where(i, ccdf_x-ccdf_y, cdf_y-cdf_x)
+
+    def _cdf2_subtraction_safe(self, x, y, **params):
+        cdf_x = self._cdf_dispatch(x, **params)
+        cdf_y = self._cdf_dispatch(y, **params)
+        ccdf_x = self._ccdf_dispatch(x, **params)
+        ccdf_y = self._ccdf_dispatch(y, **params)
+        i = (ccdf_x < 0.5) & (ccdf_y < 0.5)
+        out = np.where(i, ccdf_x-ccdf_y, cdf_y-cdf_x)
+
+        eps = np.finfo(self._dtype).eps
+        tol = self.tol if not _isnull(self.tol) else np.sqrt(eps)
+
+        cdf_max = np.maximum(cdf_x, cdf_y)
+        ccdf_max = np.maximum(ccdf_x, ccdf_y)
+        spacing = np.spacing(np.where(i, ccdf_max, cdf_max))
+        mask = np.abs(tol * out) < spacing
+
+        if np.any(mask):
+            params_mask = {key: np.broadcast_to(val, mask.shape)[mask]
+                           for key, val in params.items()}
+            out = np.asarray(out)
+            out[mask] = self._cdf2_quadrature(x[mask], y[mask], **params_mask)
+        return out[()]
+
+    def _cdf2_quadrature(self, x, y, **params):
+        return self._quadrature(self._pxf_dispatch, limits=(x, y), params=params)
+
+    @_set_invalid_nan
+    def _cdf1(self, x, *, method):
+        return self._cdf_dispatch(x, method=method, **self._parameters)
+
+    @_dispatch
+    def _cdf_dispatch(self, x, *, method=None, **params):
+        if self._overrides('_cdf_formula'):
+            method = self._cdf_formula
+        elif self._overrides('_logcdf_formula'):
+            method = self._cdf_logexp
+        elif self._overrides('_ccdf_formula'):
+            method = self._cdf_complement_safe
+        else:
+            method = self._cdf_quadrature
+        return method
+
+    def _cdf_formula(self, x, **params):
+        raise NotImplementedError(self._not_implemented)
+
+    def _cdf_logexp(self, x, **params):
+        return np.exp(self._logcdf_dispatch(x, **params))
+
+    def _cdf_complement(self, x, **params):
+        return 1 - self._ccdf_dispatch(x, **params)
+
+    def _cdf_complement_safe(self, x, **params):
+        ccdf = self._ccdf_dispatch(x, **params)
+        out = 1 - ccdf
+        eps = np.finfo(self._dtype).eps
+        tol = self.tol if not _isnull(self.tol) else np.sqrt(eps)
+        mask = tol * out < np.spacing(ccdf)
+        if np.any(mask):
+            params_mask = {key: np.broadcast_to(val, mask.shape)[mask]
+                           for key, val in params.items()}
+            out = np.asarray(out)
+            out[mask] = self._cdf_quadrature(x[mask], *params_mask)
+        return out[()]
+
+    def _cdf_quadrature(self, x, **params):
+        a, _ = self._support(**params)
+        return self._quadrature(self._pxf_dispatch, limits=(a, x),
+                                params=params)
+
+    def logccdf(self, x, y=None, /, *, method=None):
+        if y is None:
+            return self._logccdf1(x, method=method)
+        else:
+            return self._logccdf2(x, y, method=method)
+
+    @_cdf2_input_validation
+    def _logccdf2(self, x, y, *, method):
+        return self._logccdf2_dispatch(x, y, method=method, **self._parameters)
+
+    @_dispatch
+    def _logccdf2_dispatch(self, x, y, *, method=None, **params):
+        # if _logccdf2_formula exists, we could use the complement
+        # if _ccdf2_formula exists, we could use log/exp
+        if self._overrides('_logccdf2_formula'):
+            method = self._logccdf2_formula
+        else:
+            method = self._logccdf2_addition
+        return method
+
+    def _logccdf2_formula(self, x, y, **params):
+        raise NotImplementedError(self._not_implemented)
+
+    def _logccdf2_addition(self, x, y, **params):
+        logcdf_x = self._logcdf_dispatch(x, **params)
+        logccdf_y = self._logccdf_dispatch(y, **params)
+        return special.logsumexp([logcdf_x, logccdf_y], axis=0)
+
+    @_set_invalid_nan
+    def _logccdf1(self, x, *, method=None):
+        return self._logccdf_dispatch(x, method=method, **self._parameters)
+
+    @_dispatch
+    def _logccdf_dispatch(self, x, method=None, **params):
+        if self._overrides('_logccdf_formula'):
+            method = self._logccdf_formula
+        elif self._overrides('_logcdf_formula'):
+            method = self._logccdf_complement
+        elif self._overrides('_ccdf_formula'):
+            method = self._logccdf_logexp_safe
+        else:
+            method = self._logccdf_quadrature
+        return method
+
+    def _logccdf_formula(self, x, **params):
+        raise NotImplementedError(self._not_implemented)
+
+    def _logccdf_complement(self, x, **params):
+        return _log1mexp(self._logcdf_dispatch(x, **params))
+
+    def _logccdf_logexp(self, x, **params):
+        return np.log(self._ccdf_dispatch(x, **params))
+
+    def _logccdf_logexp_safe(self, x, **params):
+        out = self._logccdf_logexp(x, **params)
+        mask = np.isinf(out)
+        if np.any(mask):
+            params_mask = {key: np.broadcast_to(val, mask.shape)[mask]
+                           for key, val in params.items()}
+            out = np.asarray(out)
+            out[mask] = self._logccdf_quadrature(x[mask], **params_mask)
+        return out[()]
+
+    def _logccdf_quadrature(self, x, **params):
+        _, b = self._support(**params)
+        return self._quadrature(self._logpxf_dispatch, limits=(x, b),
+                                params=params, log=True)
+
+    def ccdf(self, x, y=None, /, *, method=None):
+        if y is None:
+            return self._ccdf1(x, method=method)
+        else:
+            return self._ccdf2(x, y, method=method)
+
+    @_cdf2_input_validation
+    def _ccdf2(self, x, y, *, method):
+        return self._ccdf2_dispatch(x, y, method=method, **self._parameters)
+
+    @_dispatch
+    def _ccdf2_dispatch(self, x, y, *, method=None, **params):
+        if self._overrides('_ccdf2_formula'):
+            method = self._ccdf2_formula
+        else:
+            method = self._ccdf2_addition
+        return method
+
+    def _ccdf2_formula(self, x, y, **params):
+        raise NotImplementedError(self._not_implemented)
+
+    def _ccdf2_addition(self, x, y, **params):
+        cdf_x = self._cdf_dispatch(x, **params)
+        ccdf_y = self._ccdf_dispatch(y, **params)
+        # even if x > y, cdf(x, y) + ccdf(x,y) sums to 1
+        return cdf_x + ccdf_y
+
+    @_set_invalid_nan
+    def _ccdf1(self, x, *, method):
+        return self._ccdf_dispatch(x, method=method, **self._parameters)
+
+    @_dispatch
+    def _ccdf_dispatch(self, x, method=None, **params):
+        if self._overrides('_ccdf_formula'):
+            method = self._ccdf_formula
+        elif self._overrides('_logccdf_formula'):
+            method = self._ccdf_logexp
+        elif self._overrides('_cdf_formula'):
+            method = self._ccdf_complement_safe
+        else:
+            method = self._ccdf_quadrature
+        return method
+
+    def _ccdf_formula(self, x, **params):
+        raise NotImplementedError(self._not_implemented)
+
+    def _ccdf_logexp(self, x, **params):
+        return np.exp(self._logccdf_dispatch(x, **params))
+
+    def _ccdf_complement(self, x, **params):
+        return 1 - self._cdf_dispatch(x, **params)
+
+    def _ccdf_complement_safe(self, x, **params):
+        cdf = self._cdf_dispatch(x, **params)
+        out = 1 - cdf
+        eps = np.finfo(self._dtype).eps
+        tol = self.tol if not _isnull(self.tol) else np.sqrt(eps)
+        mask = tol * out < np.spacing(cdf)
+        if np.any(mask):
+            params_mask = {key: np.broadcast_to(val, mask.shape)[mask]
+                           for key, val in params.items()}
+            out = np.asarray(out)
+            out[mask] = self._ccdf_quadrature(x[mask], **params_mask)
+        return out[()]
+
+    def _ccdf_quadrature(self, x, **params):
+        _, b = self._support(**params)
+        return self._quadrature(self._pxf_dispatch, limits=(x, b),
+                                params=params)
+
+    ## Inverse cumulative distribution functions
+
+    @_set_invalid_nan
+    def ilogcdf(self, logp, /, *, method=None):
+        return self._ilogcdf_dispatch(logp, method=method, **self._parameters)
+
+    @_dispatch
+    def _ilogcdf_dispatch(self, x, method=None, **params):
+        if self._overrides('_ilogcdf_formula'):
+            method = self._ilogcdf_formula
+        elif self._overrides('_ilogccdf_formula'):
+            method = self._ilogcdf_complement
+        else:
+            method = self._ilogcdf_inversion
+        return method
+
+    def _ilogcdf_formula(self, x, **params):
+        raise NotImplementedError(self._not_implemented)
+
+    def _ilogcdf_complement(self, x, **params):
+        return self._ilogccdf_dispatch(_log1mexp(x), **params)
+
+    def _ilogcdf_inversion(self, x, **params):
+        return self._solve_bounded_continuous(self._logcdf_dispatch, x, params=params)
+
+    @_set_invalid_nan
+    def icdf(self, p, /, *, method=None):
+        return self._icdf_dispatch(p, method=method, **self._parameters)
+
+    @_dispatch
+    def _icdf_dispatch(self, x, method=None, **params):
+        if self._overrides('_icdf_formula'):
+            method = self._icdf_formula
+        elif self._overrides('_iccdf_formula'):
+            method = self._icdf_complement_safe
+        else:
+            method = self._icdf_inversion
+        return method
+
+    def _icdf_formula(self, x, **params):
+        raise NotImplementedError(self._not_implemented)
+
+    def _icdf_complement(self, x, **params):
+        return self._iccdf_dispatch(1 - x, **params)
+
+    def _icdf_complement_safe(self, x, **params):
+        out = self._icdf_complement(x, **params)
+        eps = np.finfo(self._dtype).eps
+        tol = self.tol if not _isnull(self.tol) else np.sqrt(eps)
+        mask = tol * x < np.spacing(1 - x)
+        if np.any(mask):
+            params_mask = {key: np.broadcast_to(val, mask.shape)[mask]
+                           for key, val in params.items()}
+            out = np.asarray(out)
+            out[mask] = self._icdf_inversion(x[mask], *params_mask)
+        return out[()]
+
+    def _icdf_inversion(self, x, **params):
+        return self._solve_bounded_continuous(self._cdf_dispatch, x, params=params)
+
+    @_set_invalid_nan
+    def ilogccdf(self, logp, /, *, method=None):
+        return self._ilogccdf_dispatch(logp, method=method, **self._parameters)
+
+    @_dispatch
+    def _ilogccdf_dispatch(self, x, method=None, **params):
+        if self._overrides('_ilogccdf_formula'):
+            method = self._ilogccdf_formula
+        elif self._overrides('_ilogcdf_formula'):
+            method = self._ilogccdf_complement
+        else:
+            method = self._ilogccdf_inversion
+        return method
+
+    def _ilogccdf_formula(self, x, **params):
+        raise NotImplementedError(self._not_implemented)
+
+    def _ilogccdf_complement(self, x, **params):
+        return self._ilogcdf_dispatch(_log1mexp(x), **params)
+
+    def _ilogccdf_inversion(self, x, **params):
+        return self._solve_bounded_continuous(self._logccdf_dispatch, x, params=params)
+
+    @_set_invalid_nan
+    def iccdf(self, p, /, *, method=None):
+        return self._iccdf_dispatch(p, method=method, **self._parameters)
+
+    @_dispatch
+    def _iccdf_dispatch(self, x, method=None, **params):
+        if self._overrides('_iccdf_formula'):
+            method = self._iccdf_formula
+        elif self._overrides('_icdf_formula'):
+            method = self._iccdf_complement_safe
+        else:
+            method = self._iccdf_inversion
+        return method
+
+    def _iccdf_formula(self, x, **params):
+        raise NotImplementedError(self._not_implemented)
+
+    def _iccdf_complement(self, x, **params):
+        return self._icdf_dispatch(1 - x, **params)
+
+    def _iccdf_complement_safe(self, x, **params):
+        out = self._iccdf_complement(x, **params)
+        eps = np.finfo(self._dtype).eps
+        tol = self.tol if not _isnull(self.tol) else np.sqrt(eps)
+        mask = tol * x < np.spacing(1 - x)
+        if np.any(mask):
+            params_mask = {key: np.broadcast_to(val, mask.shape)[mask]
+                           for key, val in params.items()}
+            out = np.asarray(out)
+            out[mask] = self._iccdf_inversion(x[mask], *params_mask)
+        return out[()]
+
+    def _iccdf_inversion(self, x, **params):
+        return self._solve_bounded_continuous(self._ccdf_dispatch, x, params=params)
+
+    ### Sampling Functions
+    # The following functions for drawing samples from the distribution are
+    # exposed via a public method that accepts one positional argument - the
+    # shape of the sample - and keyword options (but not distribution
+    # parameters).
+    # sample
+    # ~~qmc_sample~~ built into sample now
+    #
+    # Common keyword options include:
+    # method - a string that indicates which method should be used to compute
+    #          the quantity (e.g. a formula or numerical integration).
+    # rng - the NumPy Generator/SciPy QMCEngine object to use for drawing numbers.
+    #
+    # Input/output validation is included in each function, since there is
+    # little code to be shared.
+    # These are the methods meant to be called by users.
+    #
+    # Each public method calls a private "dispatch" method that
+    # determines which "method" (strategy for calculating the desired quantity)
+    # to use by default and, via the `@_dispatch` decorator, calls the
+    # method and computes the result.
+    # Each dispatch method can designate the responsibility of sampling to any
+    # of several "implementation" methods. These methods accept only
+    # `**params`, the parameter dictionary passed from the public method via
+    # the "dispatch" method.
+    # See the note corresponding with the "Distribution Parameters" for more
+    # information.
+
+    # TODO:
+    #  - should we accept a QRNG with `d != 1`?
+    def sample(self, shape=(), *, method=None, rng=None):
+        # needs output validation to ensure that developer returns correct
+        # dtype and shape
+        sample_shape = (shape,) if not np.iterable(shape) else tuple(shape)
+        full_shape = sample_shape + self._shape
+        rng = np.random.default_rng(rng) if not isinstance(rng, qmc.QMCEngine) else rng
+        res = self._sample_dispatch(full_shape, method=method, rng=rng,
+                                    **self._parameters)
+
+        return res.astype(self._dtype, copy=False)
+
+    @_dispatch
+    def _sample_dispatch(self, full_shape, *, method, rng, **params):
+        # make sure that tests catch if sample is 0d array
+        if self._overrides('_sample_formula') and not isinstance(rng, qmc.QMCEngine):
+            method = self._sample_formula
+        else:
+            method = self._sample_inverse_transform
+        return method
+
+    def _sample_formula(self, full_shape, *, rng, **params):
+        raise NotImplementedError(self._not_implemented)
+
+    def _sample_inverse_transform(self, full_shape, *, rng, **params):
+        if isinstance(rng, qmc.QMCEngine):
+            uniform = self._qmc_uniform(full_shape, qrng=rng, **params)
+        else:
+            uniform = rng.random(size=full_shape, dtype=self._dtype)
+        return self._icdf_dispatch(uniform, **params)
+
+    def _qmc_uniform(self, full_shape, *, qrng, **params):
+        # Generate QMC uniform sample(s) on unit interval with specified shape;
+        # if `sample_shape != ()`, then each slice along axis 0 is independent.
+
+        # The sample shape is whatever precedes the distribution's own shape.
+        n_sample_dims = len(full_shape) - len(self._shape)
+        sample_shape = full_shape[:n_sample_dims]
+
+        # Length of each low-discrepancy sequence and the number of
+        # independent sequences needed.
+        if sample_shape:
+            seq_length = sample_shape[0]
+            n_sequences = math.prod(full_shape[1:])
+        else:
+            seq_length = 1
+            n_sequences = math.prod(full_shape)
+
+        # Each independent sequence needs its own engine of the same class as
+        # `qrng`, seeded with its own spawned RNG. (With scramble=False the
+        # separate RNGs are not really needed, but there is no special code
+        # path for that.)
+        child_rngs = _rng_spawn(qrng.rng, n_sequences)
+        engine_type = qrng.__class__
+        engine_kwargs = dict(d=1, scramble=qrng.scramble,
+                             optimization=qrng._optimization)
+        if isinstance(qrng, qmc.Sobol):
+            engine_kwargs['bits'] = qrng.bits
+
+        # Draw one uniform low-discrepancy sequence per spawned RNG.
+        item_shape = seq_length if sample_shape else ()
+        sequences = []
+        for child_rng in child_rngs:
+            engine = engine_type(seed=child_rng, **engine_kwargs)
+            values = engine.random(seq_length)
+            sequences.append(values.reshape(item_shape)[()])
+
+        # Put the sequence axis first and restore the requested shape.
+        if sequences:
+            stacked = np.moveaxis(np.stack(sequences), -1, 0)
+        else:
+            stacked = np.asarray([])
+        return stacked.reshape(full_shape)
+
+    ### Moments
+    # The `moment` method accepts two positional arguments - the order and kind
+    # (raw, central, or standard) of the moment - and a keyword option:
+    # method - a string that indicates which method should be used to compute
+    #          the quantity (e.g. a formula or numerical integration).
+    # Like the distribution properties, input/output validation is provided by
+    # the `_set_invalid_nan_property` decorator.
+    #
+    # Unlike most public methods above, `moment` dispatches to one of three
+    # private methods - one for each 'kind'. Like most *public* methods above,
+    # each of these private methods calls a private "dispatch" method that
+    # determines which "method" (strategy for calculating the desired quantity)
+    # to use. Also, each dispatch method can delegate the responsibility of
+    # computing the moment to one of several "implementation" methods.
+    # Unlike the dispatch methods above, however, the `@_dispatch` decorator
+    # is not used, and both logic and method calls are included in the function
+    # itself.
+    # Instead of determining which method will be used based solely on the
+    # implementation methods available and calling only the corresponding
+    # implementation method, *all* the implementation methods are called
+    # in sequence until one returns the desired information. When an
+    # implementation method cannot provide the requested information, it
+    # returns the object None (which is distinct from arrays with NaNs or infs,
+    # which are valid values of moments).
+    # The reason for this approach is that although formulae for the first
+    # few moments of a distribution may be found, general formulae that work
+    # for all orders are not always easy to find. This approach allows the
+    # developer to write "formula" implementation functions that return the
+    # desired moment when it is available and None otherwise.
+    #
+    # Note that the first implementation method called is a cache. This is
+    # important because lower-order moments are often needed to compute
+    # higher moments from formulae, so we eliminate redundant calculations
+    # when moments of several orders are needed.
+
+    @cached_property
+    def _moment_methods(self):
+        # Names of the strategies the moment dispatch methods try when the
+        # caller does not request a specific `method`. Note that
+        # 'quadrature_icdf', which the raw and central dispatch methods also
+        # recognize, is not part of this set.
+        return {'cache', 'formula', 'transform',
+                'normalize', 'general', 'quadrature'}
+
+    @property
+    def _zero(self):
+        # First entry of `_constants()`, i.e. of `_preserve_type([0, 1])`.
+        return self._constants()[0]
+
+    @property
+    def _one(self):
+        # Second entry of `_constants()`, i.e. of `_preserve_type([0, 1])`.
+        return self._constants()[1]
+
+    def _constants(self):
+        """Return `_preserve_type([0, 1])`, reusing the stored copy if any.
+
+        The freshly computed value is stored in `_constant_cache` unless
+        `cache_policy` is `_NO_CACHE`.
+        """
+        stored = self._constant_cache
+        if stored is not None:
+            return stored
+
+        zero_and_one = self._preserve_type([0, 1])
+        if self.cache_policy != _NO_CACHE:
+            self._constant_cache = zero_and_one
+        return zero_and_one
+
+    @_set_invalid_nan_property
+    def moment(self, order=1, kind='raw', *, method=None):
+        # Map every supported `kind` onto the private method that computes it.
+        implementations = dict(raw=self._moment_raw,
+                               central=self._moment_central,
+                               standardized=self._moment_standardized)
+        # Validate before the lookup so that an unsupported `kind` is
+        # handled by `_validate_order_kind` rather than by the dict lookup.
+        order = self._validate_order_kind(order, kind, implementations)
+        return implementations[kind](order, method=method)
+
+    def _moment_raw(self, order=1, *, method=None):
+        """Raw distribution moment about the origin."""
+        # Consider exposing the point about which moments are taken as an
+        # option. This is easy to support, since `_moment_transform_center`
+        # does all the work.
+        if method is None:
+            # no preference: allow every default strategy
+            methods = self._moment_methods
+        else:
+            methods = {method}
+        return self._moment_raw_dispatch(order, methods=methods, **self._parameters)
+
+    def _moment_raw_dispatch(self, order, *, methods, **params):
+        # Try the strategies named in `methods` in the fixed order below and
+        # stop at the first one that yields something other than None.
+        # Returns None when no permitted strategy produced a value.
+        moment = None
+
+        if 'cache' in methods:
+            moment = self._moment_raw_cache.get(order, None)
+
+        if moment is None and 'formula' in methods:
+            moment = self._moment_raw_formula(order, **params)
+
+        # 'transform' is only attempted for orders greater than one.
+        if moment is None and 'transform' in methods and order > 1:
+            moment = self._moment_raw_transform(order, **params)
+
+        if moment is None and 'general' in methods:
+            moment = self._moment_raw_general(order, **params)
+
+        # Raw moments are taken about zero, hence `center=self._zero`.
+        if moment is None and 'quadrature' in methods:
+            moment = self._moment_from_pxf(order, center=self._zero, **params)
+
+        if moment is None and 'quadrature_icdf' in methods:
+            moment = self._moment_integrate_icdf(order, center=self._zero, **params)
+
+        # Remember the result for later calls unless caching is disabled.
+        if moment is not None and self.cache_policy != _NO_CACHE:
+            self._moment_raw_cache[order] = moment
+
+        return moment
+
+    def _moment_raw_formula(self, order, **params):
+        # Default: no formula available. `_moment_raw_dispatch` treats None
+        # as "try the next strategy".
+        return None
+
+    def _moment_raw_transform(self, order, **params):
+        """Raw moment computed from central moments and the mean.
+
+        Returns None as soon as any required central moment, or the mean,
+        is unavailable from the strategies permitted here.
+        """
+        # Central moments of orders 0..order.
+        central_methods = {'cache', 'formula', 'normalize', 'general'}
+        central_moments = []
+        for k in range(int(order) + 1):
+            central_k = self._moment_central_dispatch(
+                order=k, methods=central_methods, **params)
+            if central_k is None:
+                return None
+            central_moments.append(central_k)
+
+        # Doesn't make sense to get the mean by "transform", since that's
+        # how we got here. Questionable whether 'quadrature' should be here.
+        mean = self._moment_raw_dispatch(
+            self._one, methods={'cache', 'formula', 'quadrature'}, **params)
+        if mean is None:
+            return None
+
+        # Re-center from the mean to zero.
+        return self._moment_transform_center(order, central_moments, mean, self._zero)
+
+    def _moment_raw_general(self, order, **params):
+        # This is the only general formula for a raw moment of a probability
+        # distribution: the moment of order zero is one.
+        if order == 0:
+            return self._one
+        return None
+
+    def _moment_central(self, order=1, *, method=None):
+        """Distribution moment about the mean."""
+        if method is None:
+            # no preference: allow every default strategy
+            methods = self._moment_methods
+        else:
+            methods = {method}
+        return self._moment_central_dispatch(order, methods=methods, **self._parameters)
+
+    def _moment_central_dispatch(self, order, *, methods, **params):
+        # Try the strategies named in `methods` in the fixed order below and
+        # stop at the first one that yields something other than None.
+        # Returns None when no permitted strategy produced a value.
+        moment = None
+
+        if 'cache' in methods:
+            moment = self._moment_central_cache.get(order, None)
+
+        if moment is None and 'formula' in methods:
+            moment = self._moment_central_formula(order, **params)
+
+        if moment is None and 'transform' in methods:
+            moment = self._moment_central_transform(order, **params)
+
+        # 'normalize' is only attempted for orders greater than two.
+        if moment is None and 'normalize' in methods and order > 2:
+            moment = self._moment_central_normalize(order, **params)
+
+        if moment is None and 'general' in methods:
+            moment = self._moment_central_general(order, **params)
+
+        # Both quadrature strategies integrate about the mean, which is
+        # obtained with every default strategy allowed.
+        if moment is None and 'quadrature' in methods:
+            mean = self._moment_raw_dispatch(self._one, **params,
+                                             methods=self._moment_methods)
+            moment = self._moment_from_pxf(order, center=mean, **params)
+
+        if moment is None and 'quadrature_icdf' in methods:
+            mean = self._moment_raw_dispatch(self._one, **params,
+                                             methods=self._moment_methods)
+            moment = self._moment_integrate_icdf(order, center=mean, **params)
+
+        # Remember the result for later calls unless caching is disabled.
+        if moment is not None and self.cache_policy != _NO_CACHE:
+            self._moment_central_cache[order] = moment
+
+        return moment
+
+    def _moment_central_formula(self, order, **params):
+        # Default: no formula available. `_moment_central_dispatch` treats
+        # None as "try the next strategy".
+        return None
+
+    def _moment_central_transform(self, order, **params):
+        """Central moment computed from raw moments and the mean.
+
+        Returns None as soon as any required raw moment is unavailable from
+        the strategies permitted here.
+        """
+        # Raw moments of orders 0..order.
+        raw_methods = {'cache', 'formula', 'general'}
+        raw_moments = []
+        for k in range(int(order) + 1):
+            raw_k = self._moment_raw_dispatch(order=k, methods=raw_methods, **params)
+            if raw_k is None:
+                return None
+            raw_moments.append(raw_k)
+
+        # The mean may come from any default strategy.
+        mean = self._moment_raw_dispatch(self._one, **params,
+                                         methods=self._moment_methods)
+
+        # Re-center from zero to the mean.
+        return self._moment_transform_center(order, raw_moments, self._zero, mean)
+
+    def _moment_central_normalize(self, order, **params):
+        """Central moment as ``standardized_moment * variance**(order/2)``.
+
+        Returns None when the standardized moment is unavailable from the
+        cache, formula, or general strategies.
+        """
+        standardized = self._moment_standardized_dispatch(
+            order, methods={'cache', 'formula', 'general'}, **params)
+        if standardized is None:
+            return None
+        variance = self._moment_central_dispatch(2, **params,
+                                                 methods=self._moment_methods)
+        return standardized*variance**(order/2)
+
+    def _moment_central_general(self, order, **params):
+        # Central moments known without any distribution-specific
+        # information: order 0 is one and order 1 is zero.
+        known = {0: self._one, 1: self._zero}
+        if order in known:
+            return known[order]
+        return None
+
+    def _moment_standardized(self, order=1, *, method=None):
+        """Standardized distribution moment."""
+        if method is None:
+            # no preference: allow every default strategy
+            methods = self._moment_methods
+        else:
+            methods = {method}
+        return self._moment_standardized_dispatch(order, methods=methods,
+                                                  **self._parameters)
+
+    def _moment_standardized_dispatch(self, order, *, methods, **params):
+        # Try the strategies named in `methods` in the fixed order below and
+        # stop at the first one that yields something other than None.
+        # Returns None when no permitted strategy produced a value.
+        moment = None
+
+        if 'cache' in methods:
+            moment = self._moment_standardized_cache.get(order, None)
+
+        if moment is None and 'formula' in methods:
+            moment = self._moment_standardized_formula(order, **params)
+
+        # 'normalize' is attempted twice: here without quadrature, and again
+        # after 'general' with quadrature as the source of the central moment.
+        if moment is None and 'normalize' in methods:
+            moment = self._moment_standardized_normalize(order, False, **params)
+
+        if moment is None and 'general' in methods:
+            moment = self._moment_standardized_general(order, **params)
+
+        if moment is None and 'normalize' in methods:
+            moment = self._moment_standardized_normalize(order, True, **params)
+
+        # Remember the result for later calls unless caching is disabled.
+        if moment is not None and self.cache_policy != _NO_CACHE:
+            self._moment_standardized_cache[order] = moment
+
+        return moment
+
+    def _moment_standardized_formula(self, order, **params):
+        # Default: no formula available. `_moment_standardized_dispatch`
+        # treats None as "try the next strategy".
+        return None
+
+    def _moment_standardized_normalize(self, order, use_quadrature, **params):
+        """Standardized moment as ``central_moment / variance**(order/2)``.
+
+        `use_quadrature` selects how the central moment may be obtained:
+        by quadrature only, or from the cache, formula, or transform
+        strategies. Returns None when the central moment is unavailable.
+        """
+        if use_quadrature:
+            methods = {'quadrature'}
+        else:
+            methods = {'cache', 'formula', 'transform'}
+
+        central = self._moment_central_dispatch(order, methods=methods, **params)
+        if central is None:
+            return None
+
+        variance = self._moment_central_dispatch(2, **params,
+                                                 methods=self._moment_methods)
+        return central/variance**(order/2)
+
+    def _moment_standardized_general(self, order, **params):
+        # Standardized moments known without any distribution-specific
+        # information: orders 0 and 2 are one, order 1 is zero.
+        known = {0: self._one, 1: self._zero, 2: self._one}
+        if order in known:
+            return known[order]
+        return None
+
+    def _moment_from_pxf(self, order, center, **params):
+        """Moment about `center` via `_quadrature` of ``pxf(x)*(x-center)**order``."""
+        def integrand(x, order, center, **params):
+            density = self._pxf_dispatch(x, **params)
+            weight = (x-center)**order
+            return density*weight
+
+        return self._quadrature(integrand, args=(order, center), params=params)
+
+    def _moment_integrate_icdf(self, order, center, **params):
+        """Moment about `center` via `_quadrature` of ``(icdf(p)-center)**order``.
+
+        The integration limits passed to `_quadrature` are 0 and 1.
+        """
+        def integrand(x, order, center, **params):
+            quantile = self._icdf_dispatch(x, **params)
+            return (quantile-center)**order
+
+        unit_interval = (0., 1.)
+        return self._quadrature(integrand, limits=unit_interval,
+                                args=(order, center), params=params)
+
+    def _moment_transform_center(self, order, moment_as, a, b):
+        """Convert moments about `a` into the moment of `order` about `b`.
+
+        `moment_as` holds the moments about `a` for orders ``0..order``. The
+        result is ``sum_i binom(order, i) * moment_as[i] * (a-b)**(order-i)``.
+        """
+        broadcasted = np.broadcast_arrays(a, b, *moment_as)
+        a, b = broadcasted[0], broadcasted[1]
+        moments = list(broadcasted[2:])
+
+        # Column of summation indices, orthogonal to the broadcast axes.
+        index_shape = [-1] + [1]*a.ndim
+        k = self._preserve_type(np.arange(order+1).reshape(index_shape))
+        binomials = special.binom(order, k)
+
+        with np.errstate(invalid='ignore'):  # can happen with infinite moment
+            terms = binomials*moments*(a-b)**(order-k)
+            return np.sum(terms, axis=0)
+
+    def _logmoment(self, order=1, *, logcenter=None, standardized=False):
+        """Logarithm of a moment, evaluated with `_logmoment_quad`.
+
+        Parameters
+        ----------
+        order : number, optional
+            Order of the moment.
+        logcenter : optional
+            Logarithm of the point about which the moment is taken. When
+            omitted, the log of the mean is computed and used.
+        standardized : bool, optional
+            If truthy, ``log(variance) * (order/2)`` is subtracted from the
+            result, where the variance is taken about the mean.
+
+        Returns
+        -------
+        The value returned by `_logmoment_quad`, adjusted as described above
+        when `standardized` is truthy.
+        """
+        # make this private until it is worked into moment
+
+        # The log-mean is needed both as the default center and for the
+        # variance used when standardizing. Test `standardized` by truth value
+        # here, exactly as below, so that flags such as `np.True_` do not leave
+        # `logmean` unset while still requesting standardization.
+        if logcenter is None or standardized:
+            logmean = self._logmoment_quad(self._one, -np.inf, **self._parameters)
+        else:
+            logmean = None
+
+        logcenter = logmean if logcenter is None else logcenter
+        res = self._logmoment_quad(order, logcenter, **self._parameters)
+        if standardized:
+            logvar = self._logmoment_quad(2, logmean, **self._parameters)
+            res = res - logvar * (order/2)
+        return res
+
+    def _logmoment_quad(self, order, logcenter, **params):
+        """Log of a moment via log-space `_quadrature` of the log-integrand."""
+        def logintegrand(x, order, logcenter, **params):
+            log_density = self._logpxf_dispatch(x, **params)
+            log_deviation = _logexpxmexpy(np.log(x + 0j), logcenter)
+            ## Known issue: if logx == logcenter, `_logexpxmexpy` returns
+            ## (-inf + 0j); multiplying by order produces (-inf + nan j) - bad.
+            ## The logmoment tests are skipped, so this might not need fixing
+            ## now, but if they are ever run, this might help:
+            # logx = np.log(x+0j)
+            # out = np.asarray(logpdf + order*_logexpxmexpy(logx, logcenter))
+            # i = (logx == logcenter)
+            # out[i] = logpdf[i]
+            # return out
+            return log_density + order * log_deviation
+
+        return self._quadrature(logintegrand, args=(order, logcenter),
+                                params=params, log=True)
+
+    ### Convenience
+
+    def plot(self, x='x', y=None, *, t=None, ax=None):
+        r"""Plot a function of the distribution.
+
+        Convenience function for quick visualization of the distribution
+        underlying the random variable.
+
+        Parameters
+        ----------
+        x, y : str, optional
+            String indicating the quantities to be used as the abscissa and
+            ordinate (horizontal and vertical coordinates), respectively.
+            Defaults are ``'x'`` (the domain of the random variable) and either
+            ``'pdf'`` (the probability density function) (continuous) or
+            ``'pmf'`` (the probability mass function) (discrete).
+            Valid values are:
+            'x', 'pdf', 'pmf', 'cdf', 'ccdf', 'icdf', 'iccdf', 'logpdf', 'logpmf',
+            'logcdf', 'logccdf', 'ilogcdf', 'ilogccdf'.
+        t : 3-tuple of (str, float, float), optional
+            Tuple indicating the limits within which the quantities are plotted.
+            The default is ``('cdf', 0.0005, 0.9995)`` if the domain is infinite,
+            indicating that the central 99.9% of the distribution is to be shown;
+            otherwise, endpoints of the support are used where they are finite.
+            Valid values are:
+            'x', 'cdf', 'ccdf', 'icdf', 'iccdf', 'logcdf', 'logccdf',
+            'ilogcdf', 'ilogccdf'.
+        ax : `matplotlib.axes`, optional
+            Axes on which to generate the plot. If not provided, use the
+            current axes.
+
+        Returns
+        -------
+        ax : `matplotlib.axes`
+            Axes on which the plot was generated.
+            The plot can be customized by manipulating this object.
+
+        Raises
+        ------
+        ValueError
+            If `x`, `y`, or the first element of `t` is not one of the valid
+            values, if any distribution parameter is invalid, if the
+            parameters have more than one dimension, or if the quantile
+            limits implied by `t` are not finite.
+        ModuleNotFoundError
+            If `matplotlib` is not installed.
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> import numpy as np
+        >>> import matplotlib.pyplot as plt
+        >>> from scipy import stats
+        >>> X = stats.Normal(mu=1., sigma=2.)
+
+        Plot the PDF over the central 99.9% of the distribution.
+        Compare against a histogram of a random sample.
+
+        >>> ax = X.plot()
+        >>> sample = X.sample(10000)
+        >>> ax.hist(sample, density=True, bins=50, alpha=0.5)
+        >>> plt.show()
+
+        Plot ``logpdf(x)`` as a function of ``x`` in the left tail,
+        where the log of the CDF is between -10 and ``np.log(0.5)``.
+
+        >>> X.plot('x', 'logpdf', t=('logcdf', -10, np.log(0.5)))
+        >>> plt.show()
+
+        Plot the PDF of the normal distribution as a function of the
+        CDF for various values of the scale parameter.
+
+        >>> X = stats.Normal(mu=0., sigma=[0.5, 1., 2])
+        >>> X.plot('cdf', 'pdf')
+        >>> plt.show()
+
+        """
+
+        # Strategy: given t limits, get quantile limits. Form grid of
+        # quantiles, compute requested x and y at quantiles, and plot.
+        # Currently, the grid of quantiles is always linearly spaced.
+        # Instead of always computing linearly-spaced quantiles, it
+        # would be better to choose:
+        # a) quantiles or probabilities
+        # b) linearly or logarithmically spaced
+        # based on the specified `t`.
+        # TODO:
+        # - smart spacing of points
+        # - when the parameters of the distribution are an array,
+        #   use the full range of abscissae for all curves
+
+        discrete = isinstance(self, DiscreteDistribution)
+        t_is_quantile = {'x', 'icdf', 'iccdf', 'ilogcdf', 'ilogccdf'}
+        t_is_probability = {'cdf', 'ccdf', 'logcdf', 'logccdf'}
+        valid_t = t_is_quantile.union(t_is_probability)
+        valid_xy = valid_t.union({'pdf', 'logpdf', 'pmf', 'logpmf'})
+        y_default = 'pmf' if discrete else 'pdf'
+        y = y_default if y is None else y
+
+        ndim = self._ndim
+        x_name, y_name = x, y
+        t_name = 'cdf' if t is None else t[0]
+
+        # Default limits: the support endpoints where finite, otherwise the
+        # 0.0005 / 0.9995 probability levels.
+        a, b = self.support()
+        tliml_default = 0 if np.all(np.isfinite(a)) else 0.0005
+        tliml = tliml_default if t is None else t[1]
+        tlimr_default = 1 if np.all(np.isfinite(b)) else 0.9995
+        tlimr = tlimr_default if t is None else t[2]
+        tlim = np.asarray([tliml, tlimr])
+        tlim = tlim[:, np.newaxis] if ndim else tlim
+
+        # pdf/logpdf are not valid for `t` because we can't easily invert them
+        message = (f'Argument `t` of `{self.__class__.__name__}.plot` '
+                   f'must be one of {valid_t}')
+        if t_name not in valid_t:
+            raise ValueError(message)
+
+        message = (f'Argument `x` of `{self.__class__.__name__}.plot` '
+                   f'must be one of {valid_xy}')
+        if x_name not in valid_xy:
+            raise ValueError(message)
+
+        message = (f'Argument `y` of `{self.__class__.__name__}.plot` '
+                   f'must be one of {valid_xy}')
+        if y_name not in valid_xy:
+            raise ValueError(message)
+
+        # This could just be a warning
+        message = (f'`{self.__class__.__name__}.plot` was called on a random '
+                   'variable with at least one invalid shape parameters. When '
+                   'a parameter is invalid, no plot can be shown.')
+        if self._any_invalid:
+            raise ValueError(message)
+
+        # We could automatically ravel, but do we want to? For now, raise.
+        message = ("To use `plot`, distribution parameters must be "
+                   "scalars or arrays with one or fewer dimensions.")
+        if ndim > 1:
+            raise ValueError(message)
+
+        # matplotlib is imported only here, after argument validation.
+        try:
+            import matplotlib.pyplot as plt  # noqa: F401, E402
+        except ModuleNotFoundError as exc:
+            message = ("`matplotlib` must be installed to use "
+                       f"`{self.__class__.__name__}.plot`.")
+            raise ModuleNotFoundError(message) from exc
+        ax = plt.gca() if ax is None else ax
+
+        # get quantile limits given t limits
+        qlim = tlim if t_name in t_is_quantile else getattr(self, 'i'+t_name)(tlim)
+
+        message = (f"`{self.__class__.__name__}.plot` received invalid input for `t`: "
+                   f"calling {'i'+t_name}({tlim}) produced {qlim}.")
+        if not np.all(np.isfinite(qlim)):
+            raise ValueError(message)
+
+        # form quantile grid
+        if discrete and x_name in t_is_quantile:
+            # should probably aggregate for large ranges
+            q = np.arange(np.min(qlim[0]), np.max(qlim[1]) + 1)
+            q = q[:, np.newaxis] if ndim else q
+        else:
+            grid = np.linspace(0, 1, 300)
+            grid = grid[:, np.newaxis] if ndim else grid
+            q = qlim[0] + (qlim[1] - qlim[0]) * grid
+            q = np.round(q) if discrete else q
+
+        # compute requested x and y at quantile grid
+        x = q if x_name in t_is_quantile else getattr(self, x_name)(q)
+        y = q if y_name in t_is_quantile else getattr(self, y_name)(q)
+
+        # make plot
+        x, y = np.broadcast_arrays(x.T, np.atleast_2d(y.T))
+        for xi, yi in zip(x, y):  # plot is vectorized, but bar/step don't seem to be
+            if discrete and x_name in t_is_quantile and y_name == 'pmf':
+                # should this just be a step plot, too?
+                ax.bar(xi, yi, alpha=np.sqrt(1/y.shape[0]))  # alpha heuristic
+            elif discrete and x_name in t_is_quantile:
+                values = yi
+                edges = np.concatenate((xi, [xi[-1]+1]))
+                ax.stairs(values, edges, baseline=None)
+            else:
+                ax.plot(xi, yi)
+        ax.set_xlabel(f"${x_name}$")
+        ax.set_ylabel(f"${y_name}$")
+        ax.set_title(str(self))
+
+        # only need a legend if distribution has parameters
+        if len(self._parameters):
+            label = []
+            parameters = self._parameterization.parameters
+            param_names = list(parameters)
+            param_arrays = [np.atleast_1d(self._parameters[pname])
+                            for pname in param_names]
+            # one legend entry per set of parameter values
+            for param_vals in zip(*param_arrays):
+                assignments = [f"${parameters[name].symbol}$ = {val:.4g}"
+                               for name, val in zip(param_names, param_vals)]
+                label.append(", ".join(assignments))
+            ax.legend(label)
+
+        return ax
+
+
+    ### Fitting
+    # All methods above treat the distribution parameters as fixed, and the
+    # variable argument may be a quantile or probability. The fitting functions
+    # are fundamentally different because the quantiles (often observations)
+    # are considered to be fixed, and the distribution parameters are the
+    # variables. In a sense, they are like an inverse of the sampling
+    # functions.
+    #
+    # At first glance, it would seem ideal for `fit` to be a classmethod,
+    # called like `LogUniform.fit(sample=sample)`.
+    # I tried this. I insisted on it for a while. But if `fit` is a
+    # classmethod, it cannot call instance methods. If we want to support MLE,
+    # MPS, MoM, MoLM, then we end up with most of the distribution functions
+    # above needing to be classmethods, too. All state information, such as
+    # tolerances and the underlying distribution of `ShiftedScaledDistribution`
+    # and `OrderStatisticDistribution`, would need to be passed into all
+    # methods. And I'm not really sure how we would call `fit` as a
+    # classmethod of a transformed distribution - maybe
+    # ShiftedScaledDistribution.fit would accept the class of the
+    # shifted/scaled distribution as an argument?
+    #
+    # In any case, it was a conscious decision for the infrastructure to
+    # treat the parameters as "fixed" and the quantile/percentile arguments
+    # as "variable". There are a lot of advantages to this structure, and I
+    # don't think the fact that a few methods reverse the fixed and variable
+    # quantities should make us question that choice. It can still accommodate
+    # these methods reasonably efficiently.
+
+
+class ContinuousDistribution(UnivariateDistribution):
+    # Univariate distribution whose generic "pxf" hooks resolve to the PDF
+    # and whose PMF formulas are fixed (zero mass, log-mass of -inf).
+
+    def _overrides(self, method_name):
+        # The PMF formulas below are always available, so they are reported
+        # as overridden; every other name is decided by the parent class.
+        always_available = {'_logpmf_formula', '_pmf_formula'}
+        return method_name in always_available or super()._overrides(method_name)
+
+    def _pmf_formula(self, x, **params):
+        # zeros with the shape and dtype of `x`
+        return np.zeros_like(x)
+
+    def _logpmf_formula(self, x, **params):
+        # -inf with the shape and dtype of `x`
+        return np.full_like(x, -np.inf)
+
+    def _pxf_dispatch(self, x, *, method=None, **params):
+        # "pxf" means the PDF for this class
+        return self._pdf_dispatch(x, method=method, **params)
+
+    def _logpxf_dispatch(self, x, *, method=None, **params):
+        # "logpxf" means the log-PDF for this class
+        return self._logpdf_dispatch(x, method=method, **params)
+
+    def _solve_bounded_continuous(self, func, p, params, xatol=None):
+        # Only the `x` attribute of the `_solve_bounded` result is returned.
+        result = self._solve_bounded(func, p, params=params, xatol=xatol)
+        return result.x
+
+
+class DiscreteDistribution(UnivariateDistribution):
+    def _overrides(self, method_name):
+        # The PDF formulas are always reported as overridden for this class;
+        # every other name is decided by the parent class.
+        always_available = {'_logpdf_formula', '_pdf_formula'}
+        return method_name in always_available or super()._overrides(method_name)
+
+    def _logpdf_formula(self, x, **params):
+        # Returns inf everywhere, except NaN where `x` (or the first
+        # parameter, when there is one) is NaN.
+        invalid = np.isnan(x)
+        if params:
+            first_param = next(iter(params.values()))
+            invalid = invalid | np.isnan(first_param)
+        return np.where(invalid, np.nan, np.inf)
+
+    def _pdf_formula(self, x, **params):
+        # Returns inf everywhere, except NaN where `x` (or the first
+        # parameter, when there is one) is NaN.
+        invalid = np.isnan(x)
+        if params:
+            first_param = next(iter(params.values()))
+            invalid = invalid | np.isnan(first_param)
+        return np.where(invalid, np.nan, np.inf)
+
+    def _pxf_dispatch(self, x, *, method=None, **params):
+        # "pxf" means the PMF for this class
+        return self._pmf_dispatch(x, method=method, **params)
+
+    def _logpxf_dispatch(self, x, *, method=None, **params):
+        # "logpxf" means the log-PMF for this class
+        return self._logpmf_dispatch(x, method=method, **params)
+
+    def _cdf_quadrature(self, x, **params):
+        # Delegate to the parent implementation at floor(x).
+        return super()._cdf_quadrature(np.floor(x), **params)
+
+    def _logcdf_quadrature(self, x, **params):
+        # Delegate to the parent implementation at floor(x).
+        return super()._logcdf_quadrature(np.floor(x), **params)
+
+    def _ccdf_quadrature(self, x, **params):
+        # Delegate to the parent implementation at floor(x + 1).
+        # NOTE(review): presumably the shift excludes the mass at floor(x)
+        # from the complementary sum - confirm against the parent method.
+        return super()._ccdf_quadrature(np.floor(x + 1), **params)
+
+    def _logccdf_quadrature(self, x, **params):
+        # Delegate to the parent implementation at floor(x + 1).
+        # NOTE(review): presumably the shift excludes the mass at floor(x)
+        # from the complementary sum - confirm against the parent method.
+        return super()._logccdf_quadrature(np.floor(x + 1), **params)
+
+    def _cdf2(self, x, y, *, method):
+        # The two-argument form is not available for discrete distributions.
+        message = ("Two argument cdf functions are currently only supported for "
+                   "continuous distributions.")
+        raise NotImplementedError(message)
+
+    def _ccdf2(self, x, y, *, method):
+        # The two-argument form is not available for discrete distributions.
+        message = ("Two argument cdf functions are currently only supported for "
+                   "continuous distributions.")
+        raise NotImplementedError(message)
+
+    def _logcdf2(self, x, y, *, method):
+        # The two-argument form is not available for discrete distributions.
+        message = ("Two argument cdf functions are currently only supported for "
+                   "continuous distributions.")
+        raise NotImplementedError(message)
+
+    def _logccdf2(self, x, y, *, method):
+        # The two-argument form is not available for discrete distributions.
+        message = ("Two argument cdf functions are currently only supported for "
+                   "continuous distributions.")
+        raise NotImplementedError(message)
+
+    def _solve_bounded_discrete(self, func, p, params, comp):
+        # Invert a discrete CDF-like function: `func` is called as
+        # `func(x, **params)`, `p` is the target value, and `comp(func(x), p)`
+        # is the comparison the solution must satisfy. Returns an array of
+        # candidate solutions (NaN wherever `func` evaluates to NaN).
+        #
+        # We're trying to solve one of these two problems:
+        # a) find the smallest integer x* within the support s.t. F(x*) >= p
+        # b) find the smallest integer x* within the support s.t. G(x*) = 1 - F(x*) <= p
+        # Our approach is to solve a continuous version of the problem that narrows the
+        # solution down to an integer x s.t. either x* = x or x* = x + 1. At the end,
+        # we'll choose between them.
+
+        # First, solve func(x) == p where func is a continuous, monotone interpolant
+        # of either the monotone increasing F or monotone decreasing G.
+        res = self._solve_bounded(func, p, params=params, xatol=0.9)
+        # Here, `_solve_bounded` can terminate for one of three reasons:
+        # 1. `func(res.x) == p` (`fatol = 0` is satisfied),
+        # 2. `res.xl` and `res.xr` bracket the root and `|res.xr - res.xl| <= xatol`, or
+        # 3. There is no solution within the support.
+        # There are several possible strategies for using `res.xl`, `res.x`, and/or
+        # `res.xr` to find a solution to the original, discrete problem. Here is ours.
+
+        # Consider case 2a. Because F is an increasing function, we know
+        # that F(xr) >= p (and F(xl) <= p), so F(floor(xr) + 1) >= p.
+        # F(floor(xr)) *may* be >= p, but we can't know until we evaluate it.
+        # F(floor(xr) - 1) < p (strictly) because floor(xr) - 1 < xl and F decreases
+        # monotonically as the argument decreases. So we choose x = floor(xr), and
+        # later we'll choose between x* = x and x* = x + 1.
+        x = np.asarray(np.floor(res.xr))
+        # This is also suitable for case 2b. Because G is a *decreasing* function, we
+        # know that G(xr) <= p (and G(xl) >= p), so G(floor(xr) + 1) <= p.
+        # G(floor(xr)) *may* be <= p, but we can't know until we evaluate it.
+        # G(floor(xr) - 1) > p (strictly) because floor(xr) - 1 < xl and G increases
+        # as the argument decreases. So we would still want to choose x = floor(xr), and
+        # later we'll choose between x* = x and x* = x + 1.
+
+        # Now we consider case 1a/b. In this case, `res.x` solved the equation
+        # *exactly*, so the algorithm may have terminated before the bracket is tight
+        # enough to rely on `res.xr`. If `res.x` happens to be integral, `res.x` is
+        # the solution to the discrete problem, and floor(res.x) == res.x, so
+        # floor(res.x) is the solution to the discrete problem. If not:
+        # a) F(floor(res.x)) < p (strictly) and F(floor(res.x) + 1) > p (strictly). So
+        #    floor(res.x) + 1 is the solution to the discrete problem.
+        # b) G(floor(res.x)) > p (strictly) and G(floor(res.x) + 1) < p (strictly). So
+        #    floor(res.x) + 1 is again the solution to the discrete problem.
+        # Either way, we can choose x = floor(res.x), and at the end we'll choose
+        # between x* = x and x* = x + 1.
+        mask = res.fun == 0
+        x[mask] = np.floor(res.x[mask])
+
+        # For case 3, let xmin be the left endpoint of the support, and note that in
+        # general, F(xmin) > 0 and G(xmin) < 1. Therefore it is possible that:
+        # a) F(x) > p for all x in the support (e.g. because p ~ 0)
+        # b) G(x) < p for all x in the support (e.g. because p ~ 1)
+        # In these cases, `_solve_bounded` would fail to find a root of the continuous
+        # equation above, but the solution to the original, discrete problem is the left
+        # endpoint of the support.
+        # This case is handled before we get to this function; otherwise,
+        # `_solve_bounded` may spin its wheels for a long time in vain.
+
+        # Now, we choose between x* = x and x* = x + 1: if func(x) satisfies the
+        # comparison `comp` (>= for cdf, <= for ccdf), the solution is x* = x;
+        # otherwise the solution must be x* = x + 1.
+        f = func(x, **params)
+        x = np.where(comp(f, p), x, x + 1.0)
+        x[np.isnan(f)] = np.nan  # needed? why would func(x) be NaN within support?
+
+        return x
+
+    def _base_discrete_inversion(self, p, func, comp, /, **params):
+        # Shared driver for the discrete inverse functions. With F the CDF and
+        # G the CCDF, icdf(p) is the minimum integer x* within the support such
+        # that F(x*) >= p, and iccdf(p) is the minimum integer x* within the
+        # support such that G(x*) <= p. `func` is the function being inverted
+        # and `comp` is the comparison that the solution must satisfy.
+
+        # Where the comparison already holds at the left endpoint of the
+        # support, the left endpoint is the solution.
+        # (See rationale in `_solve_bounded_discrete`.)
+        left, right = self._support(**params)
+        p, left, _ = np.broadcast_arrays(p, left, right)
+        solved_at_left = comp(func(left, **params), p)
+
+        def invert(p, *values):
+            # `apply_where` hands the selected parameter values over
+            # positionally; pair them with their names again before solving.
+            selected = dict(zip(params.keys(), values))
+            return self._solve_bounded_discrete(func, p, params=selected, comp=comp)
+
+        # Perform the inversion only for the elements that still need it.
+        args = (p, *params.values())
+        out = xpx.apply_where(~solved_at_left, args, invert, fill_value=left)
+
+        # A NaN probability may still have produced a finite value above, so
+        # those elements are overwritten with NaN explicitly.
+        out[np.isnan(p)] = np.nan
+        return out[()]
+
+    def _icdf_inversion(self, x, **params):
+        # Invert the CDF: the solution must satisfy cdf(x*) >= x.
+        cdf, satisfied = self._cdf_dispatch, np.greater_equal
+        return self._base_discrete_inversion(x, cdf, satisfied, **params)
+
+    def _ilogcdf_inversion(self, x, **params):
+        # Invert the log-CDF: the solution must satisfy logcdf(x*) >= x.
+        logcdf, satisfied = self._logcdf_dispatch, np.greater_equal
+        return self._base_discrete_inversion(x, logcdf, satisfied, **params)
+
+    def _iccdf_inversion(self, x, **params):
+        # Invert the CCDF: the solution must satisfy ccdf(x*) <= x.
+        ccdf, satisfied = self._ccdf_dispatch, np.less_equal
+        return self._base_discrete_inversion(x, ccdf, satisfied, **params)
+
+    def _ilogccdf_inversion(self, x, **params):
+        # Invert the log-CCDF: the solution must satisfy logccdf(x*) <= x.
+        logccdf, satisfied = self._logccdf_dispatch, np.less_equal
+        return self._base_discrete_inversion(x, logccdf, satisfied, **params)
+
+    def _mode_optimization(self, **params):
+        # The parent class optimizes over real arguments; here the result is
+        # refined to an integer. Because the optimizer is run with
+        # `xatol=0.5`, the value it returns may sit on the wrong side of the
+        # nearest integer, so three integer candidates are compared: the
+        # integer just below the result, the integer just above it, and the
+        # integer one step past the rounded result on the side away from the
+        # result.
+        x_cont = super()._mode_optimization(xatol=0.5, **params)
+        lower, upper = self.support()
+        below = np.floor(x_cont)
+        above = np.ceil(x_cont)
+        rounded = np.round(x_cont)
+        # The extra candidate is clipped so that it stays within the support.
+        # After clipping it may coincide with `below` or `above`, in which case
+        # one PMF evaluation is redundant; that is accepted for simplicity.
+        step = np.copysign(1, rounded - x_cont)
+        beyond = np.clip(rounded + step, lower, upper)
+        candidates = [below, beyond, above]
+        # Evaluate the PMF at all candidates and keep, elementwise, the one
+        # with the greatest mass.
+        pmf = self._pmf_dispatch(np.stack(candidates), **params)
+        best = np.argmax(pmf, axis=0)
+        return np.choose(best, candidates)
+
+    def _logentropy_quadrature(self, **params):
+        def logintegrand(x, **params):
+            # The entropy summand is -pmf*log(pmf), so its logarithm is
+            # logpmf + log(logpmf) + pi*j. Since 0 <= pmf <= 1, logpmf is
+            # never positive and log(logpmf) = log(-logpmf) + pi*j; the two
+            # imaginary parts "cancel", as expected for a summand that is
+            # positive. What remains is logpmf + log(-logpmf).
+            logpmf = self._logpmf_dispatch(x, **params)
+            finite = np.isfinite(logpmf)
+            log_summand = logpmf + np.log(-logpmf)
+            # Where logpmf is not finite, the log-summand is set to -inf.
+            return np.where(finite, log_summand, -np.inf)
+
+        return self._quadrature(logintegrand, params=params, log=True)
+
+
+# Special case the names of some new-style distributions in `make_distribution`.
+# Keys are the `name` attribute of the old-style distribution; values are the
+# names used in the `repr`/`str` of the generated class. Names that are not
+# listed here fall back to ``dist.name.capitalize()``.
+_distribution_names = {
+    # Continuous
+    'argus': 'ARGUS',
+    'betaprime': 'BetaPrime',
+    'chi2': 'ChiSquared',
+    'crystalball': 'CrystalBall',
+    'dgamma': 'DoubleGamma',
+    'dweibull': 'DoubleWeibull',
+    'expon': 'Exponential',
+    'exponnorm': 'ExponentiallyModifiedNormal',
+    'exponweib': 'ExponentialWeibull',
+    'exponpow': 'ExponentialPower',
+    'fatiguelife': 'FatigueLife',
+    'foldcauchy': 'FoldedCauchy',
+    'foldnorm': 'FoldedNormal',
+    'genlogistic': 'GeneralizedLogistic',
+    'gennorm': 'GeneralizedNormal',
+    'genpareto': 'GeneralizedPareto',
+    'genexpon': 'GeneralizedExponential',
+    'genextreme': 'GeneralizedExtremeValue',
+    'gausshyper': 'GaussHypergeometric',
+    'gengamma': 'GeneralizedGamma',
+    'genhalflogistic': 'GeneralizedHalfLogistic',
+    'geninvgauss': 'GeneralizedInverseGaussian',
+    'gumbel_r': 'Gumbel',
+    'gumbel_l': 'ReflectedGumbel',
+    'halfcauchy': 'HalfCauchy',
+    'halflogistic': 'HalfLogistic',
+    'halfnorm': 'HalfNormal',
+    'halfgennorm': 'HalfGeneralizedNormal',
+    'hypsecant': 'HyperbolicSecant',
+    'invgamma': 'InverseGamma',
+    'invgauss': 'InverseGaussian',
+    'invweibull': 'InverseWeibull',
+    'irwinhall': 'IrwinHall',
+    'jf_skew_t': 'JonesFaddySkewT',
+    'johnsonsb': 'JohnsonSB',
+    'johnsonsu': 'JohnsonSU',
+    'ksone': 'KSOneSided',
+    'kstwo': 'KSTwoSided',
+    'kstwobign': 'KSTwoSidedAsymptotic',
+    'laplace_asymmetric': 'LaplaceAsymmetric',
+    'levy_l': 'LevyLeft',
+    'levy_stable': 'LevyStable',
+    'loggamma': 'ExpGamma',  # really the Exponential Gamma Distribution
+    'loglaplace': 'LogLaplace',
+    'lognorm': 'LogNormal',
+    'loguniform': 'LogUniform',
+    'ncx2': 'NoncentralChiSquared',
+    'nct': 'NoncentralT',
+    'norm': 'Normal',
+    'norminvgauss': 'NormalInverseGaussian',
+    'powerlaw': 'PowerLaw',
+    'powernorm': 'PowerNormal',
+    'rdist': 'R',
+    'rel_breitwigner': 'RelativisticBreitWigner',
+    'recipinvgauss': 'ReciprocalInverseGaussian',
+    'reciprocal': 'LogUniform',
+    'semicircular': 'SemiCircular',
+    'skewcauchy': 'SkewCauchy',
+    'skewnorm': 'SkewNormal',
+    'studentized_range': 'StudentizedRange',
+    't': 'StudentT',
+    'trapezoid': 'Trapezoidal',
+    'triang': 'Triangular',
+    'truncexpon': 'TruncatedExponential',
+    'truncnorm': 'TruncatedNormal',
+    'truncpareto': 'TruncatedPareto',
+    'truncweibull_min': 'TruncatedWeibull',
+    'tukeylambda': 'TukeyLambda',
+    'vonmises_line': 'VonMisesLine',
+    'weibull_min': 'Weibull',
+    'weibull_max': 'ReflectedWeibull',
+    'wrapcauchy': 'WrappedCauchyLine',
+    # Discrete
+    'betabinom': 'BetaBinomial',
+    'betanbinom': 'BetaNegativeBinomial',
+    'dlaplace': 'LaplaceDiscrete',
+    'geom': 'Geometric',
+    'hypergeom': 'Hypergeometric',
+    'logser': 'LogarithmicSeries',
+    'nbinom': 'NegativeBinomial',
+    'nchypergeom_fisher': 'NoncentralHypergeometricFisher',
+    'nchypergeom_wallenius': 'NoncentralHypergeometricWallenius',
+    'nhypergeom': 'NegativeHypergeometric',
+    'poisson_binom': 'PoissonBinomial',
+    'randint': 'UniformDiscrete',
+    'yulesimon': 'YuleSimon',
+    'zipf': 'Zeta',
+}
+
+
+# beta, genextreme, gengamma, t, tukeylambda need work for 1D arrays
+@xp_capabilities(np_only=True)
+def make_distribution(dist):
+    """Generate a `UnivariateDistribution` class from a compatible object
+
+    The argument may be an instance of `rv_continuous`, an instance of
+    `rv_discrete`, or an instance of another class that satisfies the interface
+    described below.
+
+    The returned value is a `ContinuousDistribution` subclass if the input is an
+    instance of `rv_continuous` or a `DiscreteDistribution` subclass if the input
+    is an instance of `rv_discrete`. Like any subclass of `UnivariateDistribution`,
+    it must be instantiated (i.e. by passing all shape parameters as keyword
+    arguments) before use. Once instantiated, the resulting object will have the
+    same interface as any other instance of `UnivariateDistribution`; e.g.,
+    `scipy.stats.Normal`, `scipy.stats.Binomial`.
+
+    .. note::
+
+        `make_distribution` does not work perfectly with all instances of
+        `rv_continuous` and `rv_discrete`. Known failures include `levy_stable`,
+        `vonmises`, `hypergeom`, `nchypergeom_fisher`, `nchypergeom_wallenius`,
+        and `poisson_binom`. Some methods of some distributions will not support
+        array shape parameters.
+
+    Parameters
+    ----------
+    dist : `rv_continuous`
+        Instance of `rv_continuous`, `rv_discrete`, or an instance of any class with
+        the following attributes:
+
+        __make_distribution_version__ : str
+            A string containing the version number of SciPy in which this interface
+            is defined. The preferred interface may change in future SciPy versions,
+            in which case support for an old interface version may be deprecated
+            and eventually removed.
+        parameters : dict or tuple
+            If a dictionary, each key is the name of a parameter,
+            and the corresponding value is either a dictionary or tuple.
+            If the value is a dictionary, it may have the following items, with default
+            values used for entries which aren't present.
+
+            endpoints : tuple, default: (-inf, inf)
+                A tuple defining the lower and upper endpoints of the domain of the
+                parameter; allowable values are floats, the name (string) of another
+                parameter, or a callable taking parameters as keyword only
+                arguments and returning the numerical value of an endpoint for
+                given parameter values.
+
+            inclusive : tuple of bool, default: (False, False)
+                A tuple specifying whether the endpoints are included within the domain
+                of the parameter.
+
+            typical : tuple, default: ``endpoints``
+                Defining endpoints of a typical range of values of a parameter. Can be
+                used for sampling parameter values for testing. Behaves like the
+                ``endpoints`` tuple above, and should define a subinterval of the
+                domain given by ``endpoints``.
+
+            A tuple value ``(a, b)`` associated to a key in the ``parameters``
+            dictionary is equivalent to ``{endpoints: (a, b)}``.
+
+            Custom distributions with multiple parameterizations can be defined by
+            having the ``parameters`` attribute be a tuple of dictionaries with
+            the structure described above. In this case, ``dist``\'s class must also
+            define a method ``process_parameters`` to map between the different
+            parameterizations. It must take all parameters from all parameterizations
+            as optional keyword arguments and return a dictionary mapping parameters to
+            values, filling in values from other parameterizations using values from
+            the supplied parameterization. See example.
+
+        support : dict or tuple
+            A dictionary describing the support of the distribution or a tuple
+            describing the endpoints of the support. This behaves identically to
+            the values of the parameters dict described above, except that the key
+            ``typical`` is ignored.
+
+        The class **must** also define a ``pdf`` method and **may** define methods
+        ``logentropy``, ``entropy``, ``median``, ``mode``, ``logpdf``,
+        ``logcdf``, ``cdf``, ``logccdf``, ``ccdf``,
+        ``ilogcdf``, ``icdf``, ``ilogccdf``, ``iccdf``,
+        ``moment``, and ``sample``.
+        If defined, these methods must accept the parameters of the distribution as
+        keyword arguments and also accept any positional-only arguments accepted by
+        the corresponding method of `ContinuousDistribution`.
+        When multiple parameterizations are defined, these methods must accept
+        all parameters from all parameterizations. The ``moment`` method
+        must accept the ``order`` and ``kind`` arguments by position or keyword, but
+        may return ``None`` if a formula is not available for the arguments; in this
+        case, the infrastructure will fall back to a default implementation. The
+        ``sample`` method must accept ``shape`` by position or keyword, but contrary
+        to the public method of the same name, the argument it receives will be the
+        *full* shape of the output array - that is, the shape passed to the public
+        method prepended to the broadcasted shape of random variable parameters.
+
+    Returns
+    -------
+    CustomDistribution : `UnivariateDistribution`
+        A subclass of `UnivariateDistribution` corresponding with `dist`. The
+        initializer requires all shape parameters to be passed as keyword arguments
+        (using the same names as the instance of `rv_continuous`/`rv_discrete`).
+
+    Raises
+    ------
+    NotImplementedError
+        If `dist` is one of the distributions listed as known failures above.
+    ValueError
+        If `dist` is neither an instance of `rv_continuous`/`rv_discrete` nor an
+        object whose ``__make_distribution_version__`` is at least 1.16.
+
+    Notes
+    -----
+    The documentation of `UnivariateDistribution` is not rendered. See below for
+    an example of how to instantiate the class (i.e. pass all shape parameters of
+    `dist` to the initializer as keyword arguments). Documentation of all methods
+    is identical to that of `scipy.stats.Normal`. Use ``help`` on the returned
+    class or its methods for more information.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy import stats
+    >>> from scipy import special
+
+    Create a `ContinuousDistribution` from `scipy.stats.loguniform`.
+
+    >>> LogUniform = stats.make_distribution(stats.loguniform)
+    >>> X = LogUniform(a=1.0, b=3.0)
+    >>> np.isclose((X + 0.25).median(), stats.loguniform.ppf(0.5, 1, 3, loc=0.25))
+    np.True_
+    >>> X.plot()
+    >>> sample = X.sample(10000, rng=np.random.default_rng())
+    >>> plt.hist(sample, density=True, bins=30)
+    >>> plt.legend(('pdf', 'histogram'))
+    >>> plt.show()
+
+    Create a custom distribution.
+
+    >>> class MyLogUniform:
+    ...     @property
+    ...     def __make_distribution_version__(self):
+    ...         return "1.16.0"
+    ...
+    ...     @property
+    ...     def parameters(self):
+    ...         return {'a': {'endpoints': (0, np.inf),
+    ...                       'inclusive': (False, False)},
+    ...                 'b': {'endpoints': ('a', np.inf),
+    ...                       'inclusive': (False, False)}}
+    ...
+    ...     @property
+    ...     def support(self):
+    ...         return {'endpoints': ('a', 'b'), 'inclusive': (True, True)}
+    ...
+    ...     def pdf(self, x, a, b):
+    ...         return 1 / (x * (np.log(b)- np.log(a)))
+    >>>
+    >>> MyLogUniform = stats.make_distribution(MyLogUniform())
+    >>> Y = MyLogUniform(a=1.0, b=3.0)
+    >>> np.isclose(Y.cdf(2.), X.cdf(2.))
+    np.True_
+
+    Create a custom distribution with variable support.
+
+    >>> class MyUniformCube:
+    ...     @property
+    ...     def __make_distribution_version__(self):
+    ...         return "1.16.0"
+    ...
+    ...     @property
+    ...     def parameters(self):
+    ...         return {"a": (-np.inf, np.inf),
+    ...                 "b": {'endpoints':('a', np.inf), 'inclusive':(True, False)}}
+    ...
+    ...     @property
+    ...     def support(self):
+    ...         def left(*, a, b):
+    ...             return a**3
+    ...
+    ...         def right(*, a, b):
+    ...             return b**3
+    ...         return (left, right)
+    ...
+    ...     def pdf(self, x, *, a, b):
+    ...         return 1 / (3*(b - a)*np.cbrt(x)**2)
+    ...
+    ...     def cdf(self, x, *, a, b):
+    ...         return (np.cbrt(x) - a) / (b - a)
+    >>>
+    >>> MyUniformCube = stats.make_distribution(MyUniformCube())
+    >>> X = MyUniformCube(a=-2, b=2)
+    >>> Y = stats.Uniform(a=-2, b=2)**3
+    >>> X.support()
+    (-8.0, 8.0)
+    >>> np.isclose(X.cdf(2.1), Y.cdf(2.1))
+    np.True_
+
+    Create a custom distribution with multiple parameterizations. Here we create a
+    custom version of the beta distribution that has an alternative parameterization
+    in terms of the mean ``mu`` and a dispersion parameter ``nu``.
+
+    >>> class MyBeta:
+    ...     @property
+    ...     def __make_distribution_version__(self):
+    ...         return "1.16.0"
+    ...
+    ...     @property
+    ...     def parameters(self):
+    ...         return ({"a": (0, np.inf), "b": (0, np.inf)},
+    ...                 {"mu": (0, 1), "nu": (0, np.inf)})
+    ...
+    ...     def process_parameters(self, a=None, b=None, mu=None, nu=None):
+    ...         if a is not None and b is not None:
+    ...             nu = a + b
+    ...             mu = a / nu
+    ...         else:
+    ...             a = mu * nu
+    ...             b = nu - a
+    ...         return dict(a=a, b=b, mu=mu, nu=nu)
+    ...
+    ...     @property
+    ...     def support(self):
+    ...         return {'endpoints': (0, 1)}
+    ...
+    ...     def pdf(self, x, a, b, mu, nu):
+    ...         return special._ufuncs._beta_pdf(x, a, b)
+    ...
+    ...     def cdf(self, x, a, b, mu, nu):
+    ...         return special.betainc(a, b, x)
+    >>>
+    >>> MyBeta = stats.make_distribution(MyBeta())
+    >>> X = MyBeta(a=2.0, b=2.0)
+    >>> Y = MyBeta(mu=0.5, nu=4.0)
+    >>> np.isclose(X.pdf(0.3), Y.pdf(0.3))
+    np.True_
+
+    """
+    # Compare by identity rather than by set membership: membership would hash
+    # `dist`, which fails for custom objects that are not hashable.
+    unsupported = (stats.levy_stable, stats.vonmises, stats.hypergeom,
+                   stats.nchypergeom_fisher, stats.nchypergeom_wallenius,
+                   stats.poisson_binom)
+    if any(dist is unsupported_dist for unsupported_dist in unsupported):
+        raise NotImplementedError(f"`{dist.name}` is not supported.")
+
+    if isinstance(dist, stats.rv_continuous | stats.rv_discrete):
+        return _make_distribution_rv_generic(dist)
+
+    version = getattr(dist, "__make_distribution_version__", "0.0.0")
+    try:
+        # Compare (major, minor) numerically. Compared as strings, "1.9.0"
+        # would sort after "1.16.0" and "1.100.0" would sort before it.
+        supported = tuple(int(part) for part in version.split(".")[:2]) >= (1, 16)
+    except (AttributeError, ValueError):
+        # Not a plain dotted-integer string; keep the raw comparison.
+        supported = version >= "1.16.0"
+
+    if supported:
+        return _make_distribution_custom(dist)
+
+    message = ("The argument must be an instance of `rv_continuous`, "
+               "`rv_discrete`, or an instance of a class with attribute "
+               "`__make_distribution_version__ >= 1.16`.")
+    raise ValueError(message)
+
+def _make_distribution_rv_generic(dist):
+    # Build a `ContinuousDistribution`/`DiscreteDistribution` subclass from an
+    # `rv_continuous`/`rv_discrete` instance `dist`. Shape parameters become
+    # the parameters of the new class, and methods that `dist`'s class
+    # overrides are attached to the new class under the `_*_formula` names.
+    parameters = []
+    # NOTE: `names` is filled in below but not read again in this function.
+    names = []
+    support = getattr(dist, '_support', (dist.a, dist.b))
+    # One `_RealParameter` per shape parameter reported by `dist`.
+    for shape_info in dist._shape_info():
+        domain = _RealInterval(endpoints=shape_info.endpoints,
+                               inclusive=shape_info.inclusive)
+        param = _RealParameter(shape_info.name, domain=domain)
+        parameters.append(param)
+        names.append(shape_info.name)
+
+    # Name shown in `repr`/`str`; defaults to the capitalized old name.
+    repr_str = _distribution_names.get(dist.name, dist.name.capitalize())
+    if isinstance(dist, stats.rv_continuous):
+        old_class, new_class = stats.rv_continuous, ContinuousDistribution
+    else:
+        old_class, new_class = stats.rv_discrete, DiscreteDistribution
+
+    def _overrides(method_name):
+        # True if the attribute found on `dist`'s class is not the very same
+        # object as the one found on the generic base class.
+        return (getattr(dist.__class__, method_name, None)
+                is not getattr(old_class, method_name, None))
+
+    if _overrides("_get_support"):
+        # The support depends on the shape parameters: use callables that
+        # query `dist` for each endpoint.
+        def left(**parameter_values):
+            a, _ = dist._get_support(**parameter_values)
+            return np.asarray(a)[()]
+
+        def right(**parameter_values):
+            _, b = dist._get_support(**parameter_values)
+            return np.asarray(b)[()]
+
+        endpoints = (left, right)
+    else:
+        endpoints = support
+
+    _x_support = _RealInterval(endpoints=endpoints, inclusive=(True, True))
+    _x_param = _RealParameter('x', domain=_x_support, typical=(-1, 1))
+
+    class CustomDistribution(new_class):
+        _parameterizations = ([_Parameterization(*parameters)] if parameters
+                              else [])
+        _variable = _x_param
+
+        __class_getitem__ = None
+
+        # Show `repr_str` in place of the generic class name.
+        def __repr__(self):
+            s = super().__repr__()
+            return s.replace('CustomDistribution', repr_str)
+
+        def __str__(self):
+            s = super().__str__()
+            return s.replace('CustomDistribution', repr_str)
+
+    # The functions below are defined unconditionally but only attached to
+    # `CustomDistribution` further down, when `dist`'s class overrides the
+    # old-style method that they wrap.
+    def _sample_formula(self, full_shape=(), *, rng=None, **kwargs):
+        return dist._rvs(size=full_shape, random_state=rng, **kwargs)
+
+    def _moment_raw_formula(self, order, **kwargs):
+        return dist._munp(int(order), **kwargs)
+
+    def _moment_raw_formula_1(self, order, **kwargs):
+        # Only order 1 is served, from the first entry of `dist._stats`.
+        if order != 1:
+            return None
+        return dist._stats(**kwargs)[0]
+
+    def _moment_central_formula(self, order, **kwargs):
+        # Only order 2 is served, from the second entry of `dist._stats`.
+        if order != 2:
+            return None
+        return dist._stats(**kwargs)[1]
+
+    def _moment_standard_formula(self, order, **kwargs):
+        # Orders 3 and 4 come from the third and fourth entries of
+        # `dist._stats`; 3 is added to the order-4 entry unless it is None.
+        if order == 3:
+            if dist._stats_has_moments:
+                kwargs['moments'] = 's'
+            return dist._stats(**kwargs)[int(order - 1)]
+        elif order == 4:
+            if dist._stats_has_moments:
+                kwargs['moments'] = 'k'
+            k = dist._stats(**kwargs)[int(order - 1)]
+            return k if k is None else k + 3
+        else:
+            return None
+
+    # Old-style private method name -> new-style formula attribute name.
+    methods = {'_logpdf': '_logpdf_formula',
+               '_pdf': '_pdf_formula',
+               '_logpmf': '_logpmf_formula',
+               '_pmf': '_pmf_formula',
+               '_logcdf': '_logcdf_formula',
+               '_cdf': '_cdf_formula',
+               '_logsf': '_logccdf_formula',
+               '_sf': '_ccdf_formula',
+               '_ppf': '_icdf_formula',
+               '_isf': '_iccdf_formula',
+               '_entropy': '_entropy_formula',
+               '_median': '_median_formula'}
+
+    # These are not desirable overrides for the new infrastructure
+    skip_override = {'norminvgauss': {'_sf', '_isf'}}
+
+    for old_method, new_method in methods.items():
+        if dist.name in skip_override and old_method in skip_override[dist.name]:
+            continue
+        # If method of old distribution overrides generic implementation...
+        method = getattr(dist.__class__, old_method, None)
+        super_method = getattr(old_class, old_method, None)
+        if method is not super_method:
+            # Make it an attribute of the new object with the new name
+            setattr(CustomDistribution, new_method, getattr(dist, old_method))
+
+    if _overrides('_munp'):
+        CustomDistribution._moment_raw_formula = _moment_raw_formula
+
+    if _overrides('_rvs'):
+        CustomDistribution._sample_formula = _sample_formula
+
+    if _overrides('_stats'):
+        CustomDistribution._moment_standardized_formula = _moment_standard_formula
+        # The `_stats`-based raw/central formulas are used only when `_munp`
+        # is not overridden.
+        if not _overrides('_munp'):
+            CustomDistribution._moment_raw_formula = _moment_raw_formula_1
+            CustomDistribution._moment_central_formula = _moment_central_formula
+
+    # Assemble the class docstring from the generated documentation.
+    support_etc = _combine_docs(CustomDistribution, include_examples=False).lstrip()
+    docs = [
+        f"This class represents `scipy.stats.{dist.name}` as a subclass of "
+        f"`{new_class}`.",
+        f"The `repr`/`str` of class instances is `{repr_str}`.",
+        f"The PDF of the distribution is defined {support_etc}"
+    ]
+    CustomDistribution.__doc__ = ("\n".join(docs))
+
+    return CustomDistribution
+
+
+def _get_domain_info(info):
+    """Split a domain specification into interval arguments and ``typical``.
+
+    Parameters
+    ----------
+    info : tuple or dict
+        Either a tuple of endpoints or a dictionary of domain information that
+        may contain a ``"typical"`` entry.
+
+    Returns
+    -------
+    domain_info : dict
+        A new dictionary without the ``"typical"`` key. A tuple `info` is
+        wrapped as ``{"endpoints": info}``.
+    typical : object
+        The value stored under ``"typical"``, or ``None`` if there is none.
+    """
+    # Work on a copy so the caller's dictionary is left intact; popping from
+    # `info` itself would silently drop "typical" from the user's object.
+    domain_info = {"endpoints": info} if isinstance(info, tuple) else dict(info)
+    typical = domain_info.pop("typical", None)
+    return domain_info, typical
+
+
+def _make_distribution_custom(dist):
+    # Build a `ContinuousDistribution` subclass from a user-defined object
+    # `dist` exposing `parameters`, `support` and, optionally, methods such as
+    # `pdf`, `cdf`, `moment`, `sample` and `process_parameters`.
+
+    # A single parameterization (dict) is treated as a 1-tuple of dicts.
+    dist_parameters = (
+        dist.parameters if isinstance(dist.parameters, tuple) else (dist.parameters, )
+    )
+    parameterizations = []
+    for parameterization in dist_parameters:
+        # The attribute name ``parameters`` appears reasonable from a user facing
+        # perspective, but there is a little tension here with the internal. It's
+        # important to keep in mind that the ``parameters`` attribute in a
+        # user-created custom distribution specifies ``_parameterizations`` within
+        # the infrastructure.
+        parameters = []
+
+        for name, info in parameterization.items():
+            domain_info, typical = _get_domain_info(info)
+            domain = _RealInterval(**domain_info)
+            param = _RealParameter(name, domain=domain, typical=typical)
+            parameters.append(param)
+        # NOTE(review): an empty parameterization appends a bare list `[]`
+        # rather than being skipped, so `_parameterizations` can contain a
+        # non-`_Parameterization` element - confirm the infrastructure
+        # tolerates this for parameter-free custom distributions.
+        parameterizations.append(_Parameterization(*parameters) if parameters else [])
+
+    # The `typical` entry of the support specification is discarded.
+    domain_info, _ = _get_domain_info(dist.support)
+    _x_support = _RealInterval(**domain_info)
+    _x_param = _RealParameter('x', domain=_x_support)
+    repr_str = dist.__class__.__name__
+
+    class CustomDistribution(ContinuousDistribution):
+        _parameterizations = parameterizations
+        _variable = _x_param
+
+        # Show the user's class name in place of the generic class name.
+        def __repr__(self):
+            s = super().__repr__()
+            return s.replace('CustomDistribution', repr_str)
+
+        def __str__(self):
+            s = super().__str__()
+            return s.replace('CustomDistribution', repr_str)
+
+    # Public method names that, if present on `dist`, are attached to the new
+    # class as `_<name>_formula`.
+    methods = {'sample', 'logentropy', 'entropy',
+               'median', 'mode', 'logpdf', 'pdf',
+               'logcdf2', 'logcdf', 'cdf2', 'cdf',
+               'logccdf2', 'logccdf', 'ccdf2', 'ccdf',
+               'ilogcdf', 'icdf', 'ilogccdf', 'iccdf'}
+
+    for method in methods:
+        if hasattr(dist, method):
+            # Make it an attribute of the new object with the new name
+            new_method = f"_{method}_formula"
+            setattr(CustomDistribution, new_method, getattr(dist, method))
+
+    if hasattr(dist, 'moment'):
+        # A single user-defined `moment` method serves all three kinds of
+        # moment; the kind is passed through the `kind` keyword.
+        def _moment_raw_formula(self, order, **kwargs):
+            return dist.moment(order, kind='raw', **kwargs)
+
+        def _moment_central_formula(self, order, **kwargs):
+            return dist.moment(order, kind='central', **kwargs)
+
+        def _moment_standardized_formula(self, order, **kwargs):
+            return dist.moment(order, kind='standardized', **kwargs)
+
+        CustomDistribution._moment_raw_formula = _moment_raw_formula
+        CustomDistribution._moment_central_formula = _moment_central_formula
+        CustomDistribution._moment_standardized_formula = _moment_standardized_formula
+
+    if hasattr(dist, 'process_parameters'):
+        setattr(
+            CustomDistribution,
+            "_process_parameters",
+            getattr(dist, "process_parameters")
+        )
+
+    # Assemble the class docstring from the generated documentation.
+    support_etc = _combine_docs(CustomDistribution, include_examples=False).lstrip()
+    docs = [
+        f"This class represents `{repr_str}` as a subclass of "
+        "`ContinuousDistribution`.",
+        f"The PDF of the distribution is defined {support_etc}"
+    ]
+    CustomDistribution.__doc__ = ("\n".join(docs))
+
+    return CustomDistribution
+
+
+# Rough sketch of how we might shift/scale distributions. The purpose of
+# making it a separate class is for
+# a) simplicity of the ContinuousDistribution class and
+# b) avoiding the requirement that every distribution accept loc/scale.
+# The simplicity of ContinuousDistribution is important, because there are
+# several other distribution transformations to be supported; e.g., truncation,
+# wrapping, folding, and doubling. We wouldn't want to cram all of this
+# into the `ContinuousDistribution` class. Also, the order of the composition
+# matters (e.g. truncate then shift/scale or vice versa). It's easier to
+# accommodate different orders if the transformation is built up from
+# components rather than all built into `ContinuousDistribution`.
+
+def _shift_scale_distribution_function_2arg(func):
+    # Decorator for two-argument methods of a shifted/scaled distribution.
+    # The decorated function is used only for its name: the method of the
+    # same name on the underlying distribution is called on the transformed
+    # arguments, in both argument orders, and `sign` selects which result
+    # is returned.
+    def wrapped(self, x, y, *args, loc, scale, sign, **kwargs):
+        base_method = getattr(self._dist, func.__name__)
+
+        # Both orders are always evaluated, although only one is needed per
+        # element. Correct results first; optimize later.
+        x_std = self._transform(x, loc, scale)
+        y_std = self._transform(y, loc, scale)
+        forward = base_method(x_std, y_std, *args, **kwargs)
+        swapped = base_method(y_std, x_std, *args, **kwargs)
+        selected = np.where(sign, forward, swapped)
+        return np.real_if_close(selected)[()]
+
+    return wrapped
+
+def _shift_scale_distribution_function(func):
+    # Decorator for one-argument distribution functions of a shifted/scaled
+    # distribution. The decorated function is used only for its name. Both
+    # the method of that name and its complement are evaluated on the
+    # transformed argument; `sign` selects which one is returned.
+    complement = {'_logcdf_dispatch': '_logccdf_dispatch',
+                  '_cdf_dispatch': '_ccdf_dispatch',
+                  '_logccdf_dispatch': '_logcdf_dispatch',
+                  '_ccdf_dispatch': '_cdf_dispatch'}
+
+    def wrapped(self, x, *args, loc, scale, sign, **kwargs):
+        name = func.__name__
+        direct = getattr(self._dist, name)
+        mirrored = getattr(self._dist, complement[name])
+
+        # Both functions are always evaluated, although only one is needed
+        # per element. Correct results first; optimize later.
+        x_std = self._transform(x, loc, scale)
+        direct_values = direct(x_std, *args, **kwargs)
+        mirrored_values = mirrored(x_std, *args, **kwargs)
+        return np.where(sign, direct_values, mirrored_values)[()]
+
+    return wrapped
+
+def _shift_scale_inverse_function(func):
+    # Decorator for inverse distribution functions of a shifted/scaled
+    # distribution. The decorated function is used only for its name. Both
+    # the inverse of that name and its complement are evaluated, their
+    # results are passed through the inverse transformation, and `sign`
+    # selects which one is returned.
+    complement = {'_ilogcdf_dispatch': '_ilogccdf_dispatch',
+                  '_icdf_dispatch': '_iccdf_dispatch',
+                  '_ilogccdf_dispatch': '_ilogcdf_dispatch',
+                  '_iccdf_dispatch': '_icdf_dispatch'}
+
+    def wrapped(self, p, *args, loc, scale, sign, **kwargs):
+        name = func.__name__
+        direct = getattr(self._dist, name)
+        mirrored = getattr(self._dist, complement[name])
+
+        # Both inverses are always evaluated, although only one is needed
+        # per element. Correct results first; optimize later.
+        direct_std = direct(p, *args, **kwargs)
+        direct_values = self._itransform(direct_std, loc, scale)
+        mirrored_std = mirrored(p, *args, **kwargs)
+        mirrored_values = self._itransform(mirrored_std, loc, scale)
+        return np.where(sign, direct_values, mirrored_values)[()]
+
+    return wrapped
+
+
+class TransformedDistribution(ContinuousDistribution):
+    # Base class for distributions defined as a transformation of another
+    # `ContinuousDistribution` `X`, which is stored as `self._dist`.
+    # `__repr__` and `__str__` raise `NotImplementedError` here.
+    def __init__(self, X, /, *args, **kwargs):
+        if not isinstance(X, ContinuousDistribution):
+            message = "Transformations are currently only supported for continuous RVs."
+            raise NotImplementedError(message)
+        self._copy_parameterization()
+        self._variable = X._variable
+        self._dist = X
+        if X._parameterization:
+            # Add standard distribution parameters to our parameterization
+            dist_parameters = X._parameterization.parameters
+            set_params = set(dist_parameters)
+            # Make sure there is at least one parameterization to extend.
+            if not self._parameterizations:
+                self._parameterizations.append(_Parameterization())
+            for parameterization in self._parameterizations:
+                # Refuse to merge if a parameter name of `X` is already taken
+                # by a parameter of the transformation itself.
+                if set_params.intersection(parameterization.parameters):
+                    message = (f"One or more of the parameters of {X} has "
+                               "the same name as a parameter of "
+                               f"{self.__class__.__name__}. Name collisions "
+                               "create ambiguities and are not supported.")
+                    raise ValueError(message)
+                parameterization.parameters.update(dist_parameters)
+        super().__init__(*args, **kwargs)
+
+    def _overrides(self, method_name):
+        # A method counts as overridden if either the wrapped distribution or
+        # this class overrides it.
+        return (self._dist._overrides(method_name)
+                or super()._overrides(method_name))
+
+    def reset_cache(self):
+        # Reset the cache of the wrapped distribution as well as our own.
+        self._dist.reset_cache()
+        super().reset_cache()
+
+    def _update_parameters(self, *, validation_policy=None, **params):
+        # maybe broadcast everything before processing?
+        parameters = {}
+        # There may be some issues with _original_parameters
+        # We only want to update with _dist._original_parameters during
+        # initialization. After that, we want to start with
+        # self._original_parameters.
+        parameters.update(self._dist._original_parameters)
+        # Explicitly passed values take precedence over those of `_dist`.
+        parameters.update(params)
+        super()._update_parameters(validation_policy=validation_policy, **parameters)
+
+    def _process_parameters(self, **params):
+        # Parameter processing is delegated to the wrapped distribution.
+        return self._dist._process_parameters(**params)
+
+    def __repr__(self):
+        raise NotImplementedError()
+
+    def __str__(self):
+        raise NotImplementedError()
+
+
+class TruncatedDistribution(TransformedDistribution):
+    """Truncated distribution."""
+    # TODO:
+    # - consider avoiding catastropic cancellation by using appropriate tail
+    # - if the mode of `_dist` is within the support, it's still the mode
+    # - rejection sampling might be more efficient than inverse transform
+
+    _lb_domain = _RealInterval(endpoints=(-inf, 'ub'), inclusive=(True, False))
+    _lb_param = _RealParameter('lb', symbol=r'b_l',
+                                domain=_lb_domain, typical=(0.1, 0.2))
+
+    _ub_domain = _RealInterval(endpoints=('lb', inf), inclusive=(False, True))
+    _ub_param = _RealParameter('ub', symbol=r'b_u',
+                                  domain=_ub_domain, typical=(0.8, 0.9))
+
+    _parameterizations = [_Parameterization(_lb_param, _ub_param),
+                          _Parameterization(_lb_param),
+                          _Parameterization(_ub_param)]
+
+    def __init__(self, X, /, *args, lb=-np.inf, ub=np.inf, **kwargs):
+        return super().__init__(X, *args, lb=lb, ub=ub, **kwargs)
+
+    def _process_parameters(self, lb=None, ub=None, **params):
+        lb = lb if lb is not None else np.full_like(lb, -np.inf)[()]
+        ub = ub if ub is not None else np.full_like(ub, np.inf)[()]
+        parameters = self._dist._process_parameters(**params)
+        a, b = self._support(lb=lb, ub=ub, **parameters)
+        logmass = self._dist._logcdf2_dispatch(a, b, **parameters)
+        parameters.update(dict(lb=lb, ub=ub, _a=a, _b=b, logmass=logmass))
+        return parameters
+
+    def _support(self, lb, ub, **params):
+        a, b = self._dist._support(**params)
+        return np.maximum(a, lb), np.minimum(b, ub)
+
+    def _overrides(self, method_name):
+        return False
+
+    def _logpdf_dispatch(self, x, *args, lb, ub, _a, _b, logmass, **params):
+        logpdf = self._dist._logpdf_dispatch(x, *args, **params)
+        return logpdf - logmass
+
+    def _logcdf_dispatch(self, x, *args, lb, ub, _a, _b, logmass, **params):
+        logcdf = self._dist._logcdf2_dispatch(_a, x, *args, **params)
+        # of course, if this result is small we could compute with the other tail
+        return logcdf - logmass
+
+    def _logccdf_dispatch(self, x, *args, lb, ub, _a, _b, logmass, **params):
+        logccdf = self._dist._logcdf2_dispatch(x, _b, *args, **params)
+        return logccdf - logmass
+
+    def _logcdf2_dispatch(self, x, y, *args, lb, ub, _a, _b, logmass, **params):
+        logcdf2 = self._dist._logcdf2_dispatch(x, y, *args, **params)
+        return logcdf2 - logmass
+
+    def _ilogcdf_dispatch(self, logp, *args, lb, ub, _a, _b, logmass, **params):
+        log_Fa = self._dist._logcdf_dispatch(_a, *args, **params)
+        logp_adjusted = np.logaddexp(log_Fa, logp + logmass)
+        return self._dist._ilogcdf_dispatch(logp_adjusted, *args, **params)
+
+    def _ilogccdf_dispatch(self, logp, *args, lb, ub, _a, _b, logmass, **params):
+        log_cFb = self._dist._logccdf_dispatch(_b, *args, **params)
+        logp_adjusted = np.logaddexp(log_cFb, logp + logmass)
+        return self._dist._ilogccdf_dispatch(logp_adjusted, *args, **params)
+
+    def _icdf_dispatch(self, p, *args, lb, ub, _a, _b, logmass, **params):
+        Fa = self._dist._cdf_dispatch(_a, *args, **params)
+        p_adjusted = Fa + p*np.exp(logmass)
+        return self._dist._icdf_dispatch(p_adjusted, *args, **params)
+
+    def _iccdf_dispatch(self, p, *args, lb, ub, _a, _b, logmass, **params):
+        cFb = self._dist._ccdf_dispatch(_b, *args, **params)
+        p_adjusted = cFb + p*np.exp(logmass)
+        return self._dist._iccdf_dispatch(p_adjusted, *args, **params)
+
+    def __repr__(self):
+        with np.printoptions(threshold=10):
+            return (f"truncate({repr(self._dist)}, "
+                    f"lb={repr(self.lb)}, ub={repr(self.ub)})")
+
+    def __str__(self):
+        with np.printoptions(threshold=10):
+            return (f"truncate({str(self._dist)}, "
+                    f"lb={str(self.lb)}, ub={str(self.ub)})")
+
+
+@xp_capabilities(np_only=True)
+def truncate(X, lb=-np.inf, ub=np.inf):
+    """Truncate the support of a random variable.
+
+    Given a random variable `X`, `truncate` returns a random variable with
+    support truncated to the interval between `lb` and `ub`. The underlying
+    probability density function is normalized accordingly.
+
+    Parameters
+    ----------
+    X : `ContinuousDistribution`
+        The random variable to be truncated.
+    lb, ub : float array-like, optional
+        The lower and upper truncation points, respectively (defaults: ``-inf``,
+        ``inf``). Must be broadcastable with one another and the shape of `X`.
+
+    Returns
+    -------
+    X : `ContinuousDistribution`
+        The truncated random variable.
+
+    References
+    ----------
+    .. [1] "Truncated Distribution". *Wikipedia*.
+           https://en.wikipedia.org/wiki/Truncated_distribution
+
+    Examples
+    --------
+    Compare against `scipy.stats.truncnorm`, which truncates a standard normal,
+    *then* shifts and scales it.
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy import stats
+    >>> loc, scale, lb, ub = 1, 2, -2, 2
+    >>> X = stats.truncnorm(lb, ub, loc, scale)
+    >>> Y = scale * stats.truncate(stats.Normal(), lb, ub) + loc
+    >>> x = np.linspace(-3, 5, 300)
+    >>> plt.plot(x, X.pdf(x), '-', label='X')
+    >>> plt.plot(x, Y.pdf(x), '--', label='Y')
+    >>> plt.xlabel('x')
+    >>> plt.ylabel('PDF')
+    >>> plt.title('Truncated, then Shifted/Scaled Normal')
+    >>> plt.legend()
+    >>> plt.show()
+
+    However, suppose we wish to shift and scale a normal random variable,
+    then truncate its support to given values. This is straightforward with
+    `truncate`.
+
+    >>> Z = stats.truncate(scale * stats.Normal() + loc, lb, ub)
+    >>> Z.plot()
+    >>> plt.show()
+
+    Furthermore, `truncate` can be applied to any random variable:
+
+    >>> Rayleigh = stats.make_distribution(stats.rayleigh)
+    >>> W = stats.truncate(Rayleigh(), lb=0.5, ub=3)
+    >>> W.plot()
+    >>> plt.show()
+
+    """
+    return TruncatedDistribution(X, lb=lb, ub=ub)
+
+
+class ShiftedScaledDistribution(TransformedDistribution):
+    """Distribution with a standard shift/scale transformation."""
+    # Unclear whether infinite loc/scale will work reasonably in all cases
+    _loc_domain = _RealInterval(endpoints=(-inf, inf), inclusive=(True, True))
+    _loc_param = _RealParameter('loc', symbol=r'\mu',
+                                domain=_loc_domain, typical=(1, 2))
+
+    _scale_domain = _RealInterval(endpoints=(-inf, inf), inclusive=(True, True))
+    _scale_param = _RealParameter('scale', symbol=r'\sigma',
+                                  domain=_scale_domain, typical=(0.1, 10))
+
+    _parameterizations = [_Parameterization(_loc_param, _scale_param),
+                          _Parameterization(_loc_param),
+                          _Parameterization(_scale_param)]
+
+    def _process_parameters(self, loc=None, scale=None, **params):
+        loc = loc if loc is not None else np.zeros_like(scale)[()]
+        scale = scale if scale is not None else np.ones_like(loc)[()]
+        sign = scale > 0
+        parameters = self._dist._process_parameters(**params)
+        parameters.update(dict(loc=loc, scale=scale, sign=sign))
+        return parameters
+
+    def _transform(self, x, loc, scale, **kwargs):
+        return (x - loc)/scale
+
+    def _itransform(self, x, loc, scale, **kwargs):
+        return x * scale + loc
+
+    def _support(self, loc, scale, sign, **params):
+        # Add shortcut for infinite support?
+        a, b = self._dist._support(**params)
+        a, b = self._itransform(a, loc, scale), self._itransform(b, loc, scale)
+        return np.where(sign, a, b)[()], np.where(sign, b, a)[()]
+
+    def __repr__(self):
+        with np.printoptions(threshold=10):
+            result =  f"{repr(self.scale)}*{repr(self._dist)}"
+            if not self.loc.ndim and self.loc < 0:
+                result += f" - {repr(-self.loc)}"
+            elif (np.any(self.loc != 0)
+                  or not np.can_cast(self.loc.dtype, self.scale.dtype)):
+                # We don't want to hide a zero array loc if it can cause
+                # a type promotion.
+                result += f" + {repr(self.loc)}"
+        return result
+
+    def __str__(self):
+        with np.printoptions(threshold=10):
+            result =  f"{str(self.scale)}*{str(self._dist)}"
+            if not self.loc.ndim and self.loc < 0:
+                result += f" - {str(-self.loc)}"
+            elif (np.any(self.loc != 0)
+                  or not np.can_cast(self.loc.dtype, self.scale.dtype)):
+                # We don't want to hide a zero array loc if it can cause
+                # a type promotion.
+                result += f" + {str(self.loc)}"
+        return result
+
+    # Here, we override all the `_dispatch` methods rather than the public
+    # methods or _function methods. Why not the public methods?
+    # If we were to override the public methods, then other
+    # TransformedDistribution classes (which could transform a
+    # ShiftedScaledDistribution) would need to call the public methods of
+    # ShiftedScaledDistribution, which would run the input validation again.
+    # Why not the _function methods? For distributions that rely on the
+    # default implementation of methods (e.g. `quadrature`, `inversion`),
+    # the implementation would "see" the location and scale like other
+    # distribution parameters, so they could affect the accuracy of the
+    # calculations. I think it is cleaner if `loc` and `scale` do not affect
+    # the underlying calculations at all.
+
+    def _entropy_dispatch(self, *args, loc, scale, sign, **params):
+        return (self._dist._entropy_dispatch(*args, **params)
+                + np.log(np.abs(scale)))
+
+    def _logentropy_dispatch(self, *args, loc, scale, sign, **params):
+        lH0 = self._dist._logentropy_dispatch(*args, **params)
+        lls = np.log(np.log(np.abs(scale))+0j)
+        return special.logsumexp(np.broadcast_arrays(lH0, lls), axis=0)
+
+    def _median_dispatch(self, *, method, loc, scale, sign, **params):
+        raw = self._dist._median_dispatch(method=method, **params)
+        return self._itransform(raw, loc, scale)
+
+    def _mode_dispatch(self, *, method, loc, scale, sign, **params):
+        raw = self._dist._mode_dispatch(method=method, **params)
+        return self._itransform(raw, loc, scale)
+
+    def _logpdf_dispatch(self, x, *args, loc, scale, sign, **params):
+        x = self._transform(x, loc, scale)
+        logpdf = self._dist._logpdf_dispatch(x, *args, **params)
+        return logpdf - np.log(np.abs(scale))
+
+    def _pdf_dispatch(self, x, *args, loc, scale, sign, **params):
+        x = self._transform(x, loc, scale)
+        pdf = self._dist._pdf_dispatch(x, *args, **params)
+        return pdf / np.abs(scale)
+
+    def _logpmf_dispatch(self, x, *args, loc, scale, sign, **params):
+        x = self._transform(x, loc, scale)
+        logpmf = self._dist._logpmf_dispatch(x, *args, **params)
+        return logpmf - np.log(np.abs(scale))
+
+    def _pmf_dispatch(self, x, *args, loc, scale, sign, **params):
+        x = self._transform(x, loc, scale)
+        pmf = self._dist._pmf_dispatch(x, *args, **params)
+        return pmf / np.abs(scale)
+
+    def _logpxf_dispatch(self, x, *args, loc, scale, sign, **params):
+        x = self._transform(x, loc, scale)
+        logpxf = self._dist._logpxf_dispatch(x, *args, **params)
+        return logpxf - np.log(np.abs(scale))
+
+    def _pxf_dispatch(self, x, *args, loc, scale, sign, **params):
+        x = self._transform(x, loc, scale)
+        pxf = self._dist._pxf_dispatch(x, *args, **params)
+        return pxf / np.abs(scale)
+
+    # Placeholder bodies: the decorators presumably supply the actual implementations.
+    @_shift_scale_distribution_function
+    def _logcdf_dispatch(self, x, *, method=None, **params):
+        pass
+
+    @_shift_scale_distribution_function
+    def _cdf_dispatch(self, x, *, method=None, **params):
+        pass
+
+    @_shift_scale_distribution_function
+    def _logccdf_dispatch(self, x, *, method=None, **params):
+        pass
+
+    @_shift_scale_distribution_function
+    def _ccdf_dispatch(self, x, *, method=None, **params):
+        pass
+
+    @_shift_scale_distribution_function_2arg
+    def _logcdf2_dispatch(self, x, y, *, method=None, **params):
+        pass
+
+    @_shift_scale_distribution_function_2arg
+    def _cdf2_dispatch(self, x, y, *, method=None, **params):
+        pass
+
+    @_shift_scale_distribution_function_2arg
+    def _logccdf2_dispatch(self, x, y, *, method=None, **params):
+        pass
+
+    @_shift_scale_distribution_function_2arg
+    def _ccdf2_dispatch(self, x, y, *, method=None, **params):
+        pass
+
+    @_shift_scale_inverse_function
+    def _ilogcdf_dispatch(self, x, *, method=None, **params):
+        pass
+
+    @_shift_scale_inverse_function
+    def _icdf_dispatch(self, x, *, method=None, **params):
+        pass
+
+    @_shift_scale_inverse_function
+    def _ilogccdf_dispatch(self, x, *, method=None, **params):
+        pass
+
+    @_shift_scale_inverse_function
+    def _iccdf_dispatch(self, x, *, method=None, **params):
+        pass
+
+    def _moment_standardized_dispatch(self, order, *, loc, scale, sign, methods,
+                                      **params):
+        res = (self._dist._moment_standardized_dispatch(
+            order, methods=methods, **params))
+        return None if res is None else res * np.sign(scale)**order
+
+    def _moment_central_dispatch(self, order, *, loc, scale, sign, methods,
+                                 **params):
+        res = (self._dist._moment_central_dispatch(
+            order, methods=methods, **params))
+        return None if res is None else res * scale**order
+
+    def _moment_raw_dispatch(self, order, *, loc, scale, sign, methods,
+                             **params):
+        raw_moments = []
+        methods_highest_order = methods
+        for i in range(int(order) + 1):
+            methods = (self._moment_methods if i < order
+                       else methods_highest_order)
+            raw = self._dist._moment_raw_dispatch(i, methods=methods, **params)
+            if raw is None:
+                return None
+            moment_i = raw * scale**i
+            raw_moments.append(moment_i)
+
+        return self._moment_transform_center(
+            order, raw_moments, loc, self._zero)
+
+    def _sample_dispatch(self, full_shape, *,
+                         rng, loc, scale, sign, method, **params):
+        rvs = self._dist._sample_dispatch(full_shape, method=method, rng=rng, **params)
+        return self._itransform(rvs, loc=loc, scale=scale, sign=sign, **params)
+
+    def __add__(self, loc):
+        return ShiftedScaledDistribution(self._dist, loc=self.loc + loc,
+                                         scale=self.scale)
+
+    def __sub__(self, loc):
+        return ShiftedScaledDistribution(self._dist, loc=self.loc - loc,
+                                         scale=self.scale)
+
+    def __mul__(self, scale):
+        return ShiftedScaledDistribution(self._dist,
+                                         loc=self.loc * scale,
+                                         scale=self.scale * scale)
+
+    def __truediv__(self, scale):
+        return ShiftedScaledDistribution(self._dist,
+                                         loc=self.loc / scale,
+                                         scale=self.scale / scale)
+
+
+class OrderStatisticDistribution(TransformedDistribution):
+    r"""Probability distribution of an order statistic
+
+    An instance of this class represents a random variable that follows the
+    distribution underlying the :math:`r^{\text{th}}` order statistic of a
+    sample of :math:`n` observations of a random variable :math:`X`.
+
+    Parameters
+    ----------
+    dist : `ContinuousDistribution`
+        The random variable :math:`X`
+    n : array_like
+        The (integer) sample size :math:`n`
+    r : array_like
+        The (integer) rank of the order statistic :math:`r`
+
+
+    Notes
+    -----
+    If we make :math:`n` observations of a continuous random variable
+    :math:`X` and sort them in increasing order
+    :math:`X_{(1)}, \dots, X_{(r)}, \dots, X_{(n)}`,
+    :math:`X_{(r)}` is known as the :math:`r^{\text{th}}` order statistic.
+
+    If the PDF, CDF, and CCDF underlying :math:`X` are denoted :math:`f`,
+    :math:`F`, and :math:`F'`, respectively, then the PDF underlying
+    :math:`X_{(r)}` is given by:
+
+    .. math::
+
+        f_r(x) = \frac{n!}{(r-1)! (n-r)!} f(x) F(x)^{r-1} F'(x)^{n - r}
+
+    The CDF and other methods of the distribution underlying :math:`X_{(r)}`
+    are calculated using the fact that :math:`X = F^{-1}(U)`, where :math:`U` is
+    a standard uniform random variable, and that the order statistics of
+    observations of `U` follow a beta distribution, :math:`B(r, n - r + 1)`.
+
+    References
+    ----------
+    .. [1] Order statistic. *Wikipedia*. https://en.wikipedia.org/wiki/Order_statistic
+
+    Examples
+    --------
+    Suppose we are interested in order statistics of samples of size five drawn
+    from the standard normal distribution. Plot the PDF underlying the fourth
+    order statistic and compare with a normalized histogram from simulation.
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy import stats
+    >>> from scipy.stats._distribution_infrastructure import OrderStatisticDistribution
+    >>>
+    >>> X = stats.Normal()
+    >>> data = X.sample(shape=(10000, 5))
+    >>> ranks = np.sort(data, axis=1)
+    >>> Y = OrderStatisticDistribution(X, r=4, n=5)
+    >>>
+    >>> ax = plt.gca()
+    >>> Y.plot(ax=ax)
+    >>> ax.hist(ranks[:, 3], density=True, bins=30)
+    >>> plt.show()
+
+    """
+
+    # These can be restricted to _IntegerInterval/_IntegerParameter in a separate
+    # PR if desired.
+    _r_domain = _RealInterval(endpoints=(1, 'n'), inclusive=(True, True))
+    _r_param = _RealParameter('r', domain=_r_domain, typical=(1, 2))
+
+    _n_domain = _RealInterval(endpoints=(1, np.inf), inclusive=(True, True))
+    _n_param = _RealParameter('n', domain=_n_domain, typical=(1, 4))
+
+    _r_domain.define_parameters(_n_param)
+
+    _parameterizations = [_Parameterization(_r_param, _n_param)]
+
+    def __init__(self, dist, /, *args, r, n, **kwargs):
+        super().__init__(dist, *args, r=r, n=n, **kwargs)
+
+    def _support(self, *args, r, n, **kwargs):
+        return self._dist._support(*args, **kwargs)
+
+    def _process_parameters(self, r=None, n=None, **params):
+        parameters = self._dist._process_parameters(**params)
+        parameters.update(dict(r=r, n=n))
+        return parameters
+
+    def _overrides(self, method_name):
+        return method_name in {'_logpdf_formula', '_pdf_formula',
+                               '_cdf_formula', '_ccdf_formula',
+                               '_icdf_formula', '_iccdf_formula'}
+
+    def _logpdf_formula(self, x, r, n, **kwargs):
+        log_factor = special.betaln(r, n - r + 1)
+        log_fX = self._dist._logpdf_dispatch(x, **kwargs)
+        # log-methods sometimes use complex dtype with 0 imaginary component,
+        # but `_tanhsinh` doesn't accept complex limits of integration; take `real`.
+        log_FX = self._dist._logcdf_dispatch(x.real, **kwargs)
+        log_cFX = self._dist._logccdf_dispatch(x.real, **kwargs)
+        # This can be problematic when (r - 1)|(n-r) = 0 and `log_FX`|log_cFX = -inf
+        # The PDF in these cases is 0^0, so these should be replaced with log(1)=0
+        # return log_fX + (r-1)*log_FX + (n-r)*log_cFX - log_factor
+        rm1_log_FX = np.where((r - 1 == 0) & np.isneginf(log_FX), 0, (r-1)*log_FX)
+        nmr_log_cFX = np.where((n - r == 0) & np.isneginf(log_cFX), 0, (n-r)*log_cFX)
+        return log_fX + rm1_log_FX + nmr_log_cFX - log_factor
+
+    def _pdf_formula(self, x, r, n, **kwargs):
+        # 1 / factor = factorial(n) / (factorial(r-1) * factorial(n-r))
+        factor = special.beta(r, n - r + 1)
+        fX = self._dist._pdf_dispatch(x, **kwargs)
+        FX = self._dist._cdf_dispatch(x, **kwargs)
+        cFX = self._dist._ccdf_dispatch(x, **kwargs)
+        return fX * FX**(r-1) * cFX**(n-r) / factor
+
+    def _cdf_formula(self, x, r, n, **kwargs):
+        x_ = self._dist._cdf_dispatch(x, **kwargs)
+        return special.betainc(r, n-r+1, x_)
+
+    def _ccdf_formula(self, x, r, n, **kwargs):
+        x_ = self._dist._cdf_dispatch(x, **kwargs)
+        return special.betaincc(r, n-r+1, x_)
+
+    def _icdf_formula(self, p, r, n, **kwargs):
+        p_ = special.betaincinv(r, n-r+1, p)
+        return self._dist._icdf_dispatch(p_, **kwargs)
+
+    def _iccdf_formula(self, p, r, n, **kwargs):
+        p_ = special.betainccinv(r, n-r+1, p)
+        return self._dist._icdf_dispatch(p_, **kwargs)
+
+    def __repr__(self):
+        with np.printoptions(threshold=10):
+            return (f"order_statistic({repr(self._dist)}, r={repr(self.r)}, "
+                    f"n={repr(self.n)})")
+
+    def __str__(self):
+        with np.printoptions(threshold=10):
+            return (f"order_statistic({str(self._dist)}, r={str(self.r)}, "
+                    f"n={str(self.n)})")
+
+
+@xp_capabilities(np_only=True)
+def order_statistic(X, /, *, r, n):
+    r"""Probability distribution of an order statistic
+
+    Returns a random variable that follows the distribution underlying the
+    :math:`r^{\text{th}}` order statistic of a sample of :math:`n`
+    observations of a random variable :math:`X`.
+
+    Parameters
+    ----------
+    X : `ContinuousDistribution`
+        The random variable :math:`X`
+    r : array_like
+        The (positive integer) rank of the order statistic :math:`r`
+    n : array_like
+        The (positive integer) sample size :math:`n`
+
+    Returns
+    -------
+    Y : `ContinuousDistribution`
+        A random variable that follows the distribution of the prescribed
+        order statistic.
+
+    Notes
+    -----
+    If we make :math:`n` observations of a continuous random variable
+    :math:`X` and sort them in increasing order
+    :math:`X_{(1)}, \dots, X_{(r)}, \dots, X_{(n)}`,
+    :math:`X_{(r)}` is known as the :math:`r^{\text{th}}` order statistic.
+
+    If the PDF, CDF, and CCDF underlying math:`X` are denoted :math:`f`,
+    :math:`F`, and :math:`F'`, respectively, then the PDF underlying
+    math:`X_{(r)}` is given by:
+
+    .. math::
+
+        f_r(x) = \frac{n!}{(r-1)! (n-r)!} f(x) F(x)^{r-1} F'(x)^{n - r}
+
+    The CDF and other methods of the distribution underlying :math:`X_{(r)}`
+    are calculated using the fact that :math:`X = F^{-1}(U)`, where :math:`U` is
+    a standard uniform random variable, and that the order statistics of
+    observations of `U` follow a beta distribution, :math:`B(r, n - r + 1)`.
+
+    References
+    ----------
+    .. [1] Order statistic. *Wikipedia*. https://en.wikipedia.org/wiki/Order_statistic
+
+    Examples
+    --------
+    Suppose we are interested in order statistics of samples of size five drawn
+    from the standard normal distribution. Plot the PDF underlying each
+    order statistic and compare with a normalized histogram from simulation.
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy import stats
+    >>>
+    >>> X = stats.Normal()
+    >>> data = X.sample(shape=(10000, 5))
+    >>> sorted = np.sort(data, axis=1)
+    >>> Y = stats.order_statistic(X, r=[1, 2, 3, 4, 5], n=5)
+    >>>
+    >>> ax = plt.gca()
+    >>> colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
+    >>> for i in range(5):
+    ...     y = sorted[:, i]
+    ...     ax.hist(y, density=True, bins=30, alpha=0.1, color=colors[i])
+    >>> Y.plot(ax=ax)
+    >>> plt.show()
+
+    """
+    r, n = np.asarray(r), np.asarray(n)
+    if np.any((r != np.floor(r)) | (r < 0)) or np.any((n != np.floor(n)) | (n < 0)):
+        message = "`r` and `n` must contain only positive integers."
+        raise ValueError(message)
+    return OrderStatisticDistribution(X, r=r, n=n)
+
+
+class Mixture(_ProbabilityDistribution):
+    r"""Representation of a mixture distribution.
+
+    A mixture distribution is the distribution of a random variable
+    defined in the following way: first, a random variable is selected
+    from `components` according to the probabilities given by `weights`, then
+    the selected random variable is realized.
+
+    Parameters
+    ----------
+    components : sequence of `ContinuousDistribution`
+        The underlying instances of `ContinuousDistribution`.
+        All must have scalar shape parameters (if any); e.g., the `pdf` evaluated
+        at a scalar argument must return a scalar.
+    weights : sequence of floats, optional
+        The corresponding probabilities of selecting each random variable.
+        Must be non-negative and sum to one. The default behavior is to weight
+        all components equally.
+
+    Attributes
+    ----------
+    components : sequence of `ContinuousDistribution`
+        The underlying instances of `ContinuousDistribution`.
+    weights : ndarray
+        The corresponding probabilities of selecting each random variable.
+
+    Methods
+    -------
+    support
+
+    sample
+
+    moment
+
+    mean
+    median
+    mode
+
+    variance
+    standard_deviation
+
+    skewness
+    kurtosis
+
+    pdf
+    logpdf
+
+    cdf
+    icdf
+    ccdf
+    iccdf
+
+    logcdf
+    ilogcdf
+    logccdf
+    ilogccdf
+
+    entropy
+
+    Notes
+    -----
+    The following abbreviations are used throughout the documentation.
+
+    - PDF: probability density function
+    - CDF: cumulative distribution function
+    - CCDF: complementary CDF
+    - entropy: differential entropy
+    - log-*F*: logarithm of *F* (e.g. log-CDF)
+    - inverse *F*: inverse function of *F* (e.g. inverse CDF)
+
+    References
+    ----------
+    .. [1] Mixture distribution, *Wikipedia*,
+           https://en.wikipedia.org/wiki/Mixture_distribution
+
+
+    Examples
+    --------
+    A mixture of normal distributions:
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+    >>> X1 = stats.Normal(mu=-2, sigma=1)
+    >>> X2 = stats.Normal(mu=2, sigma=1)
+    >>> mixture = stats.Mixture([X1, X2], weights=[0.4, 0.6])
+    >>> print(f'mean: {mixture.mean():.2f}, '
+    ...       f'median: {mixture.median():.2f}, '
+    ...       f'mode: {mixture.mode():.2f}')
+    mean: 0.40, median: 1.04, mode: 2.00
+    >>> x = np.linspace(-10, 10, 300)
+    >>> plt.plot(x, mixture.pdf(x))
+    >>> plt.title('PDF of normal distribution mixture')
+    >>> plt.show()
+
+    """
+    # Todo:
+    # Add support for array shapes, weights
+
+    def _input_validation(self, components, weights):
+        if len(components) == 0:
+            message = ("`components` must contain at least one random variable.")
+            raise ValueError(message)
+
+        for var in components:
+            # will generalize to other kinds of distributions when there
+            # *are* other kinds of distributions
+            if not isinstance(var, ContinuousDistribution):
+                message = ("Each element of `components` must be an instance of "
+                           "`ContinuousDistribution`.")
+                raise ValueError(message)
+            if not var._shape == ():
+                message = "All elements of `components` must have scalar shapes."
+                raise ValueError(message)
+
+        if weights is None:
+            return components, weights
+
+        weights = np.asarray(weights)
+        if weights.shape != (len(components),):
+            message = "`components` and `weights` must have the same length."
+            raise ValueError(message)
+
+        if not np.issubdtype(weights.dtype, np.inexact):
+            message = "`weights` must have floating point dtype."
+            raise ValueError(message)
+
+        if not np.isclose(np.sum(weights), 1.0):
+            message = "`weights` must sum to 1.0."
+            raise ValueError(message)
+
+        if not np.all(weights >= 0):
+            message = "All `weights` must be non-negative."
+            raise ValueError(message)
+
+        return components, weights
+
+    def __init__(self, components, *, weights=None):
+        components, weights = self._input_validation(components, weights)
+        n = len(components)
+        dtype = np.result_type(*(var._dtype for var in components))
+        self._shape = np.broadcast_shapes(*(var._shape for var in components))
+        self._dtype, self._components = dtype, components
+        self._weights = np.full(n, 1/n, dtype=dtype) if weights is None else weights
+        self.validation_policy = None
+
+    @property
+    def components(self):
+        return list(self._components)
+
+    @property
+    def weights(self):
+        return self._weights.copy()
+
+    def _full(self, val, *args):
+        args = [np.asarray(arg) for arg in args]
+        dtype = np.result_type(self._dtype, *(arg.dtype for arg in args))
+        shape = np.broadcast_shapes(self._shape, *(arg.shape for arg in args))
+        return np.full(shape, val, dtype=dtype)
+
+    def _sum(self, fun, *args):
+        out = self._full(0, *args)
+        for var, weight in zip(self._components, self._weights):
+            out += getattr(var, fun)(*args) * weight
+        return out[()]
+
+    def _logsum(self, fun, *args):
+        out = self._full(-np.inf, *args)
+        for var, log_weight in zip(self._components, np.log(self._weights)):
+            np.logaddexp(out, getattr(var, fun)(*args) + log_weight, out=out)
+        return out[()]
+
+    def support(self):
+        a = self._full(np.inf)
+        b = self._full(-np.inf)
+        for var in self._components:
+            a = np.minimum(a, var.support()[0])
+            b = np.maximum(b, var.support()[1])
+        return a, b
+
+    def _raise_if_method(self, method):
+        if method is not None:
+            raise NotImplementedError("`method` not implemented for this distribution.")
+
+    def logentropy(self, *, method=None):
+        self._raise_if_method(method)
+        def log_integrand(x):
+            # `x` passed by `_tanhsinh` will be of complex dtype because
+            # `log_integrand` returns complex values, but the imaginary
+            # component is always zero. Extract the real part because
+            # `logpdf` uses `logaddexp`, which fails for complex input.
+            return self.logpdf(x.real) + np.log(self.logpdf(x.real) + 0j)
+
+        res = _tanhsinh(log_integrand, *self.support(), log=True).integral
+        return _log_real_standardize(res + np.pi*1j)
+
+    def entropy(self, *, method=None):
+        self._raise_if_method(method)
+        return _tanhsinh(lambda x: -self.pdf(x) * self.logpdf(x),
+                         *self.support()).integral
+
+    def mode(self, *, method=None):
+        self._raise_if_method(method)
+        a, b = self.support()
+        def f(x): return -self.pdf(x)
+        res = _bracket_minimum(f, 1., xmin=a, xmax=b)
+        res = _chandrupatla_minimize(f, res.xl, res.xm, res.xr)
+        return res.x
+
+    def median(self, *, method=None):
+        self._raise_if_method(method)
+        return self.icdf(0.5)
+
+    def mean(self, *, method=None):
+        self._raise_if_method(method)
+        return self._sum('mean')
+
+    def variance(self, *, method=None):
+        self._raise_if_method(method)
+        return self._moment_central(2)
+
+    def standard_deviation(self, *, method=None):
+        self._raise_if_method(method)
+        return self.variance()**0.5
+
+    def skewness(self, *, method=None):
+        self._raise_if_method(method)
+        return self._moment_standardized(3)
+
+    def kurtosis(self, *, method=None):
+        self._raise_if_method(method)
+        return self._moment_standardized(4)
+
+    def moment(self, order=1, kind='raw', *, method=None):
+        self._raise_if_method(method)
+        kinds = {'raw': self._moment_raw,
+                 'central': self._moment_central,
+                 'standardized': self._moment_standardized}
+        order = ContinuousDistribution._validate_order_kind(self, order, kind, kinds)
+        moment_kind = kinds[kind]
+        return moment_kind(order)
+
+    def _moment_raw(self, order):
+        out = self._full(0)
+        for var, weight in zip(self._components, self._weights):
+            out += var.moment(order, kind='raw') * weight
+        return out[()]
+
+    def _moment_central(self, order):
+        order = int(order)
+        out = self._full(0)
+        for var, weight in zip(self._components, self._weights):
+            moment_as = [var.moment(order, kind='central')
+                         for order in range(order + 1)]
+            a, b = var.mean(), self.mean()
+            moment = var._moment_transform_center(order, moment_as, a, b)
+            out += moment * weight
+        return out[()]
+
+    def _moment_standardized(self, order):
+        return self._moment_central(order) / self.standard_deviation()**order
+
+    def pdf(self, x, /, *, method=None):
+        self._raise_if_method(method)
+        return self._sum('pdf', x)
+
+    def logpdf(self, x, /, *, method=None):
+        self._raise_if_method(method)
+        return self._logsum('logpdf', x)
+
+    def pmf(self, x, /, *, method=None):
+        self._raise_if_method(method)
+        return self._sum('pmf', x)
+
+    def logpmf(self, x, /, *, method=None):
+        self._raise_if_method(method)
+        return self._logsum('logpmf', x)
+
+    def cdf(self, x, y=None, /, *, method=None):
+        self._raise_if_method(method)
+        args = (x,) if y is None else (x, y)
+        return self._sum('cdf', *args)
+
+    def logcdf(self, x, y=None, /, *, method=None):
+        self._raise_if_method(method)
+        args = (x,) if y is None else (x, y)
+        return self._logsum('logcdf', *args)
+
+    def ccdf(self, x, y=None, /, *, method=None):
+        self._raise_if_method(method)
+        args = (x,) if y is None else (x, y)
+        return self._sum('ccdf', *args)
+
+    def logccdf(self, x, y=None, /, *, method=None):
+        self._raise_if_method(method)
+        args = (x,) if y is None else (x, y)
+        return self._logsum('logccdf', *args)
+
+    def _invert(self, fun, p):
+        xmin, xmax = self.support()
+        fun = getattr(self, fun)
+        f = lambda x, p: fun(x) - p  # noqa: E731 is silly
+        xl0, xr0 = _guess_bracket(xmin, xmax)
+        res = _bracket_root(f, xl0=xl0, xr0=xr0, xmin=xmin, xmax=xmax, args=(p,))
+        return _chandrupatla(f, a=res.xl, b=res.xr, args=(p,)).x
+
+    def icdf(self, p, /, *, method=None):
+        self._raise_if_method(method)
+        return self._invert('cdf', p)
+
+    def iccdf(self, p, /, *, method=None):
+        self._raise_if_method(method)
+        return self._invert('ccdf', p)
+
+    def ilogcdf(self, p, /, *, method=None):
+        self._raise_if_method(method)
+        return self._invert('logcdf', p)
+
+    def ilogccdf(self, p, /, *, method=None):
+        self._raise_if_method(method)
+        return self._invert('logccdf', p)
+
+    def sample(self, shape=(), *, rng=None, method=None):
+        self._raise_if_method(method)
+        rng = np.random.default_rng(rng)
+        size = np.prod(np.atleast_1d(shape))
+        ns = rng.multinomial(size, self._weights)
+        x = [var.sample(shape=n, rng=rng) for n, var in zip(ns, self._components)]
+        x = np.reshape(rng.permuted(np.concatenate(x)), shape)
+        return x[()]
+
+    def __repr__(self):
+        result = "Mixture(\n"
+        result += "    [\n"
+        with np.printoptions(threshold=10):
+            for component in self.components:
+                result += f"        {repr(component)},\n"
+            result += "    ],\n"
+            result += f"    weights={repr(self.weights)},\n"
+        result += ")"
+        return result
+
+    def __str__(self):
+        result = "Mixture(\n"
+        result += "    [\n"
+        with np.printoptions(threshold=10):
+            for component in self.components:
+                result += f"        {str(component)},\n"
+            result += "    ],\n"
+            result += f"    weights={str(self.weights)},\n"
+        result += ")"
+        return result
+
+
+class MonotonicTransformedDistribution(TransformedDistribution):
+    r"""Distribution underlying a strictly monotonic function of a random variable
+
+    Given a random variable :math:`X`; a strictly monotonic function
+    :math:`g(u)`, its inverse :math:`h(u) = g^{-1}(u)`, and the derivative magnitude
+    :math:`|h'(u)| = \left| \frac{dh(u)}{du} \right|`, define the distribution
+    underlying the random variable :math:`Y = g(X)`.
+
+    Parameters
+    ----------
+    X : `ContinuousDistribution`
+        The random variable :math:`X`.
+    g, h, dh : callable
+        Elementwise functions representing the mathematical functions
+        :math:`g(u)`, :math:`h(u)`, and :math:`|h'(u)|`
+    logdh : callable, optional
+        Elementwise function representing :math:`\log(|h'(u)|)`.
+        The default is ``lambda u: np.log(dh(u))``, but providing
+        a custom implementation may avoid over/underflow.
+    increasing : bool, optional
+        Whether the function is strictly increasing (True, default)
+        or strictly decreasing (False).
+    repr_pattern : str, optional
+        A string pattern for determining the __repr__. The __repr__
+        for X will be substituted into the position where `***` appears.
+        For example:
+            ``"exp(***)"`` for the repr of an exponentially transformed
+            distribution
+        The default is ``f"{g.__name__}(***)"``.
+    str_pattern : str, optional
+        A string pattern for determining `__str__`. The `__str__`
+        for X will be substituted into the position where `***` appears.
+        For example:
+            ``"exp(***)"`` for the str of an exponentially transformed
+            distribution
+        The default is the value `repr_pattern` takes.
+    """
+
+    def __init__(self, X, /, *args, g, h, dh, logdh=None,
+                 increasing=True, repr_pattern=None,
+                 str_pattern=None, **kwargs):
+        super().__init__(X, *args, **kwargs)
+        self._g = g
+        self._h = h
+        self._dh = dh
+        self._logdh = (logdh if logdh is not None
+                       else lambda u: np.log(dh(u)))
+        # Bind the CDF-family dispatchers of the underlying distribution.
+        # For an increasing `g` each function maps to its namesake; for a
+        # decreasing `g` every function is swapped with its complement.
+        if increasing:
+            self._xdf = self._dist._cdf_dispatch
+            self._cxdf = self._dist._ccdf_dispatch
+            self._ixdf = self._dist._icdf_dispatch
+            self._icxdf = self._dist._iccdf_dispatch
+            self._logxdf = self._dist._logcdf_dispatch
+            self._logcxdf = self._dist._logccdf_dispatch
+            self._ilogxdf = self._dist._ilogcdf_dispatch
+            self._ilogcxdf = self._dist._ilogccdf_dispatch
+        else:
+            self._xdf = self._dist._ccdf_dispatch
+            self._cxdf = self._dist._cdf_dispatch
+            self._ixdf = self._dist._iccdf_dispatch
+            self._icxdf = self._dist._icdf_dispatch
+            self._logxdf = self._dist._logccdf_dispatch
+            self._logcxdf = self._dist._logcdf_dispatch
+            self._ilogxdf = self._dist._ilogccdf_dispatch
+            self._ilogcxdf = self._dist._ilogcdf_dispatch
+        self._increasing = increasing
+        # `g.__name__` is only consulted when no `repr_pattern` is supplied.
+        self._repr_pattern = repr_pattern or f"{g.__name__}(***)"
+        self._str_pattern = str_pattern or self._repr_pattern
+
+    def __repr__(self):
+        # Substitute the repr of the underlying distribution for `***`.
+        with np.printoptions(threshold=10):
+            return self._repr_pattern.replace("***", repr(self._dist))
+
+    def __str__(self):
+        # Substitute the str of the underlying distribution for `***`.
+        with np.printoptions(threshold=10):
+            return self._str_pattern.replace("***", str(self._dist))
+
+    def _overrides(self, method_name):
+        # Do not use the generic overrides of TransformedDistribution
+        return False
+
+    def _support(self, **params):
+        # Map the endpoints of the underlying support through `g`; the
+        # endpoints swap places when `g` is decreasing.
+        a, b = self._dist._support(**params)
+        # For reciprocal transformation, we want this zero to become -inf
+        b = np.where(b==0, np.asarray("-0", dtype=b.dtype), b)
+        with np.errstate(divide='ignore'):
+            if self._increasing:
+                return self._g(a), self._g(b)
+            else:
+                return self._g(b), self._g(a)
+
+    def _logpdf_dispatch(self, x, *args, **params):
+        # Change of variables in log space: logpdf_X(h(x)) + log|h'(x)|.
+        return self._dist._logpdf_dispatch(self._h(x), *args, **params) + self._logdh(x)
+
+    def _pdf_dispatch(self, x, *args, **params):
+        # Change of variables: pdf_X(h(x)) * |h'(x)|.
+        return self._dist._pdf_dispatch(self._h(x), *args, **params) * self._dh(x)
+
+    # The CDF-family methods below evaluate the dispatcher bound in `__init__`
+    # at `h(x)`; the inverse methods apply `g` to the bound inverse dispatcher.
+
+    def _logcdf_dispatch(self, x, *args, **params):
+        return self._logxdf(self._h(x), *args, **params)
+
+    def _cdf_dispatch(self, x, *args, **params):
+        return self._xdf(self._h(x), *args, **params)
+
+    def _logccdf_dispatch(self, x, *args, **params):
+        return self._logcxdf(self._h(x), *args, **params)
+
+    def _ccdf_dispatch(self, x, *args, **params):
+        return self._cxdf(self._h(x), *args, **params)
+
+    def _ilogcdf_dispatch(self, p, *args, **params):
+        return self._g(self._ilogxdf(p, *args, **params))
+
+    def _icdf_dispatch(self, p, *args, **params):
+        return self._g(self._ixdf(p, *args, **params))
+
+    def _ilogccdf_dispatch(self, p, *args, **params):
+        return self._g(self._ilogcxdf(p, *args, **params))
+
+    def _iccdf_dispatch(self, p, *args, **params):
+        return self._g(self._icxdf(p, *args, **params))
+
+    def _sample_dispatch(self, full_shape, *, method, rng, **params):
+        # Sample the underlying distribution, then transform the draws by `g`.
+        rvs = self._dist._sample_dispatch(full_shape, method=method, rng=rng, **params)
+        return self._g(rvs)
+
+
+class FoldedDistribution(TransformedDistribution):
+    r"""Distribution underlying the absolute value of a random variable
+
+    Given a random variable :math:`X`; define the distribution
+    underlying the random variable :math:`Y = |X|`.
+
+    Parameters
+    ----------
+    X : `ContinuousDistribution`
+        The random variable :math:`X`.
+
+    Returns
+    -------
+    Y : `ContinuousDistribution`
+        The random variable :math:`Y = |X|`
+
+    """
+    # Many enhancements are possible if distribution is symmetric. Start
+    # with the general case; enhance later.
+
+    def __init__(self, X, /, *args, **kwargs):
+        super().__init__(X, *args, **kwargs)
+        # I think we need to allow `_support` to define whether the endpoints
+        # are inclusive or not. In the meantime, it's best to ensure that the lower
+        # endpoint (typically 0 for folded distribution) is inclusive so PDF evaluates
+        # correctly at that point.
+        self._variable.domain.inclusive = (True, self._variable.domain.inclusive[1])
+
+    def _overrides(self, method_name):
+        # Do not use the generic overrides of TransformedDistribution
+        return False
+
+    def _support(self, **params):
+        a, b = self._dist._support(**params)
+        a_, b_ = np.abs(a), np.abs(b)
+        a_, b_ = np.minimum(a_, b_), np.maximum(a_, b_)
+        i = (a < 0) & (b > 0)
+        a_ = np.asarray(a_)
+        a_[i] = 0
+        return a_[()], b_[()]
+
+    def _logpdf_dispatch(self, x, *args, method=None, **params):
+        x = np.abs(x)
+        right = self._dist._logpdf_dispatch(x, *args, method=method, **params)
+        left = self._dist._logpdf_dispatch(-x, *args, method=method, **params)
+        left = np.asarray(left)
+        right = np.asarray(right)
+        a, b = self._dist._support(**params)
+        left[-x < a] = -np.inf
+        right[x > b] = -np.inf
+        logpdfs = np.stack([left, right])
+        return special.logsumexp(logpdfs, axis=0)
+
+    def _pdf_dispatch(self, x, *args, method=None, **params):
+        x = np.abs(x)
+        right = self._dist._pdf_dispatch(x, *args, method=method, **params)
+        left = self._dist._pdf_dispatch(-x, *args, method=method, **params)
+        left = np.asarray(left)
+        right = np.asarray(right)
+        a, b = self._dist._support(**params)
+        left[-x < a] = 0
+        right[x > b] = 0
+        return left + right
+
+    def _logcdf_dispatch(self, x, *args, method=None, **params):
+        x = np.abs(x)
+        a, b = self._dist._support(**params)
+        xl = np.maximum(-x, a)
+        xr = np.minimum(x, b)
+        return self._dist._logcdf2_dispatch(xl, xr, *args, method=method, **params).real
+
+    def _cdf_dispatch(self, x, *args, method=None, **params):
+        x = np.abs(x)
+        a, b = self._dist._support(**params)
+        xl = np.maximum(-x, a)
+        xr = np.minimum(x, b)
+        return self._dist._cdf2_dispatch(xl, xr, *args, **params)
+
+    def _logccdf_dispatch(self, x, *args, method=None, **params):
+        x = np.abs(x)
+        a, b = self._dist._support(**params)
+        xl = np.maximum(-x, a)
+        xr = np.minimum(x, b)
+        return self._dist._logccdf2_dispatch(xl, xr, *args, method=method,
+                                             **params).real
+
+    def _ccdf_dispatch(self, x, *args, method=None, **params):
+        x = np.abs(x)
+        a, b = self._dist._support(**params)
+        xl = np.maximum(-x, a)
+        xr = np.minimum(x, b)
+        return self._dist._ccdf2_dispatch(xl, xr, *args, method=method, **params)
+
+    def _sample_dispatch(self, full_shape, *, method, rng, **params):
+        rvs = self._dist._sample_dispatch(full_shape, method=method, rng=rng, **params)
+        return np.abs(rvs)
+
+    def __repr__(self):
+        with np.printoptions(threshold=10):
+            return f"abs({repr(self._dist)})"
+
+    def __str__(self):
+        with np.printoptions(threshold=10):
+            return f"abs({str(self._dist)})"
+
+
+@xp_capabilities(np_only=True)
+def abs(X, /):
+    r"""Absolute value of a random variable
+
+    Parameters
+    ----------
+    X : `ContinuousDistribution`
+        The random variable :math:`X`.
+
+    Returns
+    -------
+    Y : `ContinuousDistribution`
+        A random variable :math:`Y = |X|`.
+
+    Examples
+    --------
+    Suppose we have a normally distributed random variable :math:`X`:
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> X = stats.Normal()
+
+    We wish to have a random variable :math:`Y` distributed according to
+    the folded normal distribution; that is, a random variable :math:`|X|`.
+
+    >>> Y = stats.abs(X)
+
+    The PDF of the distribution in the left half plane is "folded" over to
+    the right half plane. Because the normal PDF is symmetric, the resulting
+    PDF is zero for negative arguments and doubled for positive arguments.
+
+    >>> import matplotlib.pyplot as plt
+    >>> x = np.linspace(0, 5, 300)
+    >>> ax = plt.gca()
+    >>> Y.plot(x='x', y='pdf', t=('x', -1, 5), ax=ax)
+    >>> plt.plot(x, 2 * X.pdf(x), '--')
+    >>> plt.legend(('PDF of `Y`', 'Doubled PDF of `X`'))
+    >>> plt.show()
+
+    """
+    # `FoldedDistribution` represents the distribution of Y = |X|.
+    return FoldedDistribution(X)
+
+
+@xp_capabilities(np_only=True)
+def exp(X, /):
+    r"""Natural exponential of a random variable
+
+    Parameters
+    ----------
+    X : `ContinuousDistribution`
+        The random variable :math:`X`.
+
+    Returns
+    -------
+    Y : `ContinuousDistribution`
+        A random variable :math:`Y = \exp(X)`.
+
+    Examples
+    --------
+    Suppose we have a normally distributed random variable :math:`X`:
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> X = stats.Normal()
+
+    We wish to have a lognormally distributed random variable :math:`Y`,
+    a random variable whose natural logarithm is :math:`X`.
+    If :math:`X` is to be the natural logarithm of :math:`Y`, then we
+    must take :math:`Y` to be the natural exponential of :math:`X`.
+
+    >>> Y = stats.exp(X)
+
+    To demonstrate that ``X`` represents the logarithm of ``Y``,
+    we plot a normalized histogram of the logarithm of observations of
+    ``Y`` against the PDF underlying ``X``.
+
+    >>> import matplotlib.pyplot as plt
+    >>> rng = np.random.default_rng(435383595582522)
+    >>> y = Y.sample(shape=10000, rng=rng)
+    >>> ax = plt.gca()
+    >>> ax.hist(np.log(y), bins=50, density=True)
+    >>> X.plot(ax=ax)
+    >>> plt.legend(('PDF of `X`', 'histogram of `log(y)`'))
+    >>> plt.show()
+
+    """
+    # g = exp has inverse h = log, whose derivative magnitude is 1/u.
+    # `logdh` is given in closed form (-log(u)) rather than left to the
+    # default ``np.log(dh(u))``.
+    return MonotonicTransformedDistribution(X, g=np.exp, h=np.log, dh=lambda u: 1 / u,
+                                            logdh=lambda u: -np.log(u))
+
+
+@xp_capabilities(np_only=True)
+def log(X, /):
+    r"""Natural logarithm of a non-negative random variable
+
+    Parameters
+    ----------
+    X : `ContinuousDistribution`
+        The random variable :math:`X` with positive support.
+
+    Returns
+    -------
+    Y : `ContinuousDistribution`
+        A random variable :math:`Y = \log(X)`.
+
+    Examples
+    --------
+    Suppose we have a gamma distributed random variable :math:`X`:
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> Gamma = stats.make_distribution(stats.gamma)
+    >>> X = Gamma(a=1.0)
+
+    We wish to have an exp-gamma distributed random variable :math:`Y`,
+    a random variable whose natural exponential is :math:`X`.
+    If :math:`X` is to be the natural exponential of :math:`Y`, then we
+    must take :math:`Y` to be the natural logarithm of :math:`X`.
+
+    >>> Y = stats.log(X)
+
+    To demonstrate that ``X`` represents the exponential of ``Y``,
+    we plot a normalized histogram of the exponential of observations of
+    ``Y`` against the PDF underlying ``X``.
+
+    >>> import matplotlib.pyplot as plt
+    >>> rng = np.random.default_rng(435383595582522)
+    >>> y = Y.sample(shape=10000, rng=rng)
+    >>> ax = plt.gca()
+    >>> ax.hist(np.exp(y), bins=50, density=True)
+    >>> X.plot(ax=ax)
+    >>> plt.legend(('PDF of `X`', 'histogram of `exp(y)`'))
+    >>> plt.show()
+
+    """
+    if np.any(X.support()[0] < 0):
+        message = ("The logarithm of a random variable is only implemented when the "
+                   "support is non-negative.")
+        raise NotImplementedError(message)
+    return MonotonicTransformedDistribution(X, g=np.log, h=np.exp, dh=np.exp,
+                                            logdh=lambda u: u)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_entropy.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_entropy.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1fb3e6659153867cf7f7bb11e9e665127d4cc7d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_entropy.py
@@ -0,0 +1,435 @@
+"""
+Created on Fri Apr  2 09:06:05 2021
+
+@author: matth
+"""
+
+import math
+import numpy as np
+from scipy import special
+from ._axis_nan_policy import _axis_nan_policy_factory
+from scipy._lib._array_api import (array_namespace, xp_promote, xp_device,
+                                   is_marray, _share_masks, xp_capabilities)
+
+__all__ = ['entropy', 'differential_entropy']
+
+
+@xp_capabilities()
+@_axis_nan_policy_factory(
+    lambda x: x,
+    n_samples=lambda kwgs: (
+        2 if ("qk" in kwgs and kwgs["qk"] is not None)
+        else 1
+    ),
+    n_outputs=1, result_to_tuple=lambda x, _: (x,), paired=True,
+    too_small=-1  # entropy doesn't have too small inputs
+)
+def entropy(pk: np.typing.ArrayLike,
+            qk: np.typing.ArrayLike | None = None,
+            base: float | None = None,
+            axis: int = 0
+            ) -> np.number | np.ndarray:
+    """
+    Calculate the Shannon entropy/relative entropy of given distribution(s).
+
+    If only probabilities `pk` are given, the Shannon entropy is calculated as
+    ``H = -sum(pk * log(pk))``.
+
+    If `qk` is not None, then compute the relative entropy
+    ``D = sum(pk * log(pk / qk))``. This quantity is also known
+    as the Kullback-Leibler divergence.
+
+    This routine will normalize `pk` and `qk` if they don't sum to 1.
+
+    Parameters
+    ----------
+    pk : array_like
+        Defines the (discrete) distribution. Along each axis-slice of ``pk``,
+        element ``i`` is the (possibly unnormalized) probability of event
+        ``i``.
+    qk : array_like, optional
+        Sequence against which the relative entropy is computed. Should be in
+        the same format as `pk`.
+    base : float, optional
+        The logarithmic base to use, defaults to ``e`` (natural logarithm).
+    axis : int, optional
+        The axis along which the entropy is calculated. Default is 0.
+
+    Returns
+    -------
+    S : {float, array_like}
+        The calculated entropy.
+
+    Notes
+    -----
+    Informally, the Shannon entropy quantifies the expected uncertainty
+    inherent in the possible outcomes of a discrete random variable.
+    For example,
+    if messages consisting of sequences of symbols from a set are to be
+    encoded and transmitted over a noiseless channel, then the Shannon entropy
+    ``H(pk)`` gives a tight lower bound for the average number of units of
+    information needed per symbol if the symbols occur with frequencies
+    governed by the discrete distribution `pk` [1]_. The choice of base
+    determines the choice of units; e.g., ``e`` for nats, ``2`` for bits, etc.
+
+    The relative entropy, ``D(pk|qk)``, quantifies the increase in the average
+    number of units of information needed per symbol if the encoding is
+    optimized for the probability distribution `qk` instead of the true
+    distribution `pk`. Informally, the relative entropy quantifies the expected
+    excess in surprise experienced if one believes the true distribution is
+    `qk` when it is actually `pk`.
+
+    A related quantity, the cross entropy ``CE(pk, qk)``, satisfies the
+    equation ``CE(pk, qk) = H(pk) + D(pk|qk)`` and can also be calculated with
+    the formula ``CE = -sum(pk * log(qk))``. It gives the average
+    number of units of information needed per symbol if an encoding is
+    optimized for the probability distribution `qk` when the true distribution
+    is `pk`. It is not computed directly by `entropy`, but it can be computed
+    using two calls to the function (see Examples).
+
+    See [2]_ for more information.
+
+    References
+    ----------
+    .. [1] Shannon, C.E. (1948), A Mathematical Theory of Communication.
+           Bell System Technical Journal, 27: 379-423.
+           https://doi.org/10.1002/j.1538-7305.1948.tb01338.x
+    .. [2] Thomas M. Cover and Joy A. Thomas. 2006. Elements of Information
+           Theory (Wiley Series in Telecommunications and Signal Processing).
+           Wiley-Interscience, USA.
+
+
+    Examples
+    --------
+    The outcome of a fair coin is the most uncertain:
+
+    >>> import numpy as np
+    >>> from scipy.stats import entropy
+    >>> base = 2  # work in units of bits
+    >>> pk = np.array([1/2, 1/2])  # fair coin
+    >>> H = entropy(pk, base=base)
+    >>> H
+    1.0
+    >>> H == -np.sum(pk * np.log(pk)) / np.log(base)
+    True
+
+    The outcome of a biased coin is less uncertain:
+
+    >>> qk = np.array([9/10, 1/10])  # biased coin
+    >>> entropy(qk, base=base)
+    0.46899559358928117
+
+    The relative entropy between the fair coin and biased coin is calculated
+    as:
+
+    >>> D = entropy(pk, qk, base=base)
+    >>> D
+    0.7369655941662062
+    >>> np.isclose(D, np.sum(pk * np.log(pk/qk)) / np.log(base), rtol=4e-16, atol=0)
+    True
+
+    The cross entropy can be calculated as the sum of the entropy and
+    relative entropy:
+
+    >>> CE = entropy(pk, base=base) + entropy(pk, qk, base=base)
+    >>> CE
+    1.736965594166206
+    >>> CE == -np.sum(pk * np.log(qk)) / np.log(base)
+    True
+
+    """
+    if base is not None and base <= 0:
+        raise ValueError("`base` must be a positive number or `None`.")
+
+    xp = array_namespace(pk, qk)
+    pk, qk = xp_promote(pk, qk, broadcast=True, xp=xp)
+
+    # Normalize along `axis`; "invalid value" floating-point warnings raised
+    # by these divisions are suppressed.
+    with np.errstate(invalid='ignore'):
+        if qk is not None:
+            pk, qk = _share_masks(pk, qk, xp=xp)
+            qk = qk / xp.sum(qk, axis=axis, keepdims=True)
+        pk = pk / xp.sum(pk, axis=axis, keepdims=True)
+
+    # `vec` holds the elementwise terms that are summed along `axis` below.
+    if qk is None:
+        vec = special.entr(pk)
+    else:
+        if is_marray(xp):  # compensate for mdhaber/marray#97
+            vec = special.rel_entr(pk.data, qk.data)  # type: ignore[union-attr]
+            vec = xp.asarray(vec, mask=pk.mask)  #  type: ignore[union-attr]
+        else:
+            vec = special.rel_entr(pk, qk)
+
+    S = xp.sum(vec, axis=axis)
+    # Convert from natural-log units to the requested logarithmic base.
+    if base is not None:
+        S /= math.log(base)
+    return S
+
+
+def _differential_entropy_is_too_small(samples, kwargs, axis=-1):
+    values = samples[0]
+    n = values.shape[axis]
+    window_length = kwargs.get("window_length")
+    if window_length is None:
+        window_length = math.floor(math.sqrt(n) + 0.5)
+    if not 2 <= 2 * window_length < n:
+        return True
+    return False
+
+
+@xp_capabilities()
+@_axis_nan_policy_factory(
+    lambda x: x, n_outputs=1, result_to_tuple=lambda x, _: (x,),
+    too_small=_differential_entropy_is_too_small
+)
+def differential_entropy(
+    values: np.typing.ArrayLike,
+    *,
+    window_length: int | None = None,
+    base: float | None = None,
+    axis: int = 0,
+    method: str = "auto",
+) -> np.number | np.ndarray:
+    r"""Given a sample of a distribution, estimate the differential entropy.
+
+    Several estimation methods are available using the `method` parameter. By
+    default, a method is selected based on the size of the sample.
+
+    Parameters
+    ----------
+    values : sequence
+        Sample from a continuous distribution.
+    window_length : int, optional
+        Window length for computing Vasicek estimate. Must be an integer
+        between 1 and half of the sample size. If ``None`` (the default), it
+        uses the heuristic value
+
+        .. math::
+            \left \lfloor \sqrt{n} + 0.5 \right \rfloor
+
+        where :math:`n` is the sample size. This heuristic was originally
+        proposed in [2]_ and has become common in the literature.
+    base : float, optional
+        The logarithmic base to use, defaults to ``e`` (natural logarithm).
+    axis : int, optional
+        The axis along which the differential entropy is calculated.
+        Default is 0.
+    method : {'vasicek', 'van es', 'ebrahimi', 'correa', 'auto'}, optional
+        The method used to estimate the differential entropy from the sample.
+        Default is ``'auto'``.  See Notes for more information.
+
+    Returns
+    -------
+    entropy : float
+        The calculated differential entropy.
+
+    Notes
+    -----
+    This function will converge to the true differential entropy in the limit
+
+    .. math::
+        n \to \infty, \quad m \to \infty, \quad \frac{m}{n} \to 0
+
+    The optimal choice of ``window_length`` for a given sample size depends on
+    the (unknown) distribution. Typically, the smoother the density of the
+    distribution, the larger the optimal value of ``window_length`` [1]_.
+
+    The following options are available for the `method` parameter.
+
+    * ``'vasicek'`` uses the estimator presented in [1]_. This is
+      one of the first and most influential estimators of differential entropy.
+    * ``'van es'`` uses the bias-corrected estimator presented in [3]_, which
+      is not only consistent but, under some conditions, asymptotically normal.
+    * ``'ebrahimi'`` uses an estimator presented in [4]_, which was shown
+      in simulation to have smaller bias and mean squared error than
+      the Vasicek estimator.
+    * ``'correa'`` uses the estimator presented in [5]_ based on local linear
+      regression. In a simulation study, it had consistently smaller mean
+      square error than the Vasicek estimator, but it is more expensive to
+      compute.
+    * ``'auto'`` selects the method automatically (default). Currently,
+      this selects ``'van es'`` for very small samples (<=10), ``'ebrahimi'``
+      for moderate sample sizes (11-1000), and ``'vasicek'`` for larger
+      samples, but this behavior is subject to change in future versions.
+
+    All estimators are implemented as described in [6]_.
+
+    References
+    ----------
+    .. [1] Vasicek, O. (1976). A test for normality based on sample entropy.
+           Journal of the Royal Statistical Society:
+           Series B (Methodological), 38(1), 54-59.
+    .. [2] Grzegorzewski, P., & Wieczorkowski, R. (1999). Entropy-based
+           goodness-of-fit test for exponentiality. Communications in
+           Statistics-Theory and Methods, 28(5), 1183-1202.
+    .. [3] Van Es, B. (1992). Estimating functionals related to a density by a
+           class of statistics based on spacings. Scandinavian Journal of
+           Statistics, 61-72.
+    .. [4] Ebrahimi, N., Pflughoeft, K., & Soofi, E. S. (1994). Two measures
+           of sample entropy. Statistics & Probability Letters, 20(3), 225-234.
+    .. [5] Correa, J. C. (1995). A new estimator of entropy. Communications
+           in Statistics-Theory and Methods, 24(10), 2439-2449.
+    .. [6] Noughabi, H. A. (2015). Entropy Estimation Using Numerical Methods.
+           Annals of Data Science, 2(2), 231-241.
+           https://link.springer.com/article/10.1007/s40745-015-0045-9
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import differential_entropy, norm
+
+    Entropy of a standard normal distribution:
+
+    >>> rng = np.random.default_rng()
+    >>> values = rng.standard_normal(100)
+    >>> differential_entropy(values)
+    1.3407817436640392
+
+    Compare with the true entropy:
+
+    >>> float(norm.entropy())
+    1.4189385332046727
+
+    For several sample sizes between 5 and 1000, compare the accuracy of
+    the ``'vasicek'``, ``'van es'``, and ``'ebrahimi'`` methods. Specifically,
+    compare the root mean squared error (over 1000 trials) between the estimate
+    and the true differential entropy of the distribution.
+
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+    >>>
+    >>>
+    >>> def rmse(res, expected):
+    ...     '''Root mean squared error'''
+    ...     return np.sqrt(np.mean((res - expected)**2))
+    >>>
+    >>>
+    >>> a, b = np.log10(5), np.log10(1000)
+    >>> ns = np.round(np.logspace(a, b, 10)).astype(int)
+    >>> reps = 1000  # number of repetitions for each sample size
+    >>> expected = stats.expon.entropy()
+    >>>
+    >>> method_errors = {'vasicek': [], 'van es': [], 'ebrahimi': []}
+    >>> for method in method_errors:
+    ...     for n in ns:
+    ...        rvs = stats.expon.rvs(size=(reps, n), random_state=rng)
+    ...        res = stats.differential_entropy(rvs, method=method, axis=-1)
+    ...        error = rmse(res, expected)
+    ...        method_errors[method].append(error)
+    >>>
+    >>> for method, errors in method_errors.items():
+    ...     plt.loglog(ns, errors, label=method)
+    >>>
+    >>> plt.legend()
+    >>> plt.xlabel('sample size')
+    >>> plt.ylabel('RMSE (1000 trials)')
+    >>> plt.title('Entropy Estimator Error (Exponential Distribution)')
+
+    """
+    xp = array_namespace(values)
+    values = xp_promote(values, force_floating=True, xp=xp)
+    # Work along the last axis from here on.
+    values = xp.moveaxis(values, axis, -1)
+    n = values.shape[-1]  # type: ignore[union-attr]
+
+    if window_length is None:
+        window_length = math.floor(math.sqrt(n) + 0.5)
+
+    if not 2 <= 2 * window_length < n:
+        raise ValueError(
+            f"Window length ({window_length}) must be positive and less "
+            f"than half the sample size ({n}).",
+        )
+
+    if base is not None and base <= 0:
+        raise ValueError("`base` must be a positive number or `None`.")
+
+    sorted_data = xp.sort(values, axis=-1)
+
+    # The "auto" entry only makes "auto" pass the membership check below;
+    # the estimator actually used for it is chosen from `n` afterwards.
+    methods = {"vasicek": _vasicek_entropy,
+               "van es": _van_es_entropy,
+               "correa": _correa_entropy,
+               "ebrahimi": _ebrahimi_entropy,
+               "auto": _vasicek_entropy}
+    method = method.lower()
+    if method not in methods:
+        message = f"`method` must be one of {set(methods)}"
+        raise ValueError(message)
+
+    if method == "auto":
+        if n <= 10:
+            method = 'van es'
+        elif n <= 1000:
+            method = 'ebrahimi'
+        else:
+            method = 'vasicek'
+
+    res = methods[method](sorted_data, window_length, xp=xp)
+
+    # Convert from natural-log units to the requested logarithmic base.
+    if base is not None:
+        res /= math.log(base)
+
+    # avoid dtype changes due to data-apis/array-api-compat#152
+    # can be removed when data-apis/array-api-compat#152 is resolved
+    return xp.astype(res, values.dtype)  # type: ignore[union-attr]
+
+
+def _pad_along_last_axis(X, m, *, xp):
+    """Pad the last axis with `m` copies of its first and last entries."""
+    # Scales a bit better than the method in _vasicek_like_entropy. The edges
+    # are taken as length-1 slices (`:1`, not `0`) to keep the last axis.
+    pad_shape = X.shape[:-1] + (m,)
+    edges = [xp.broadcast_to(e, pad_shape) for e in (X[..., :1], X[..., -1:])]
+    return xp.concat((edges[0], X, edges[1]), axis=-1)
+
+
+def _vasicek_entropy(X, m, *, xp):
+    """Vasicek entropy estimate along the last axis ([6], Eq. 1.3)."""
+    n_obs = X.shape[-1]
+    padded = _pad_along_last_axis(X, m, xp=xp)
+    # Differences between entries 2*m apart; padding leaves n_obs of them.
+    spacings = padded[..., 2 * m:] - padded[..., :-2 * m]
+    return xp.mean(xp.log(n_obs/(2*m) * spacings), axis=-1)
+
+
+def _van_es_entropy(X, m, *, xp):
+    """van Es entropy estimate along the last axis (HVE_mn in [6])."""
+    # [6] gives no equation number; its formula has a typo, and the log that
+    # belongs within the summation is included here.
+    n_obs = X.shape[-1]
+    spacings = X[..., m:] - X[..., :-m]
+    mean_log = 1/(n_obs-m) * xp.sum(xp.log((n_obs+1)/m * spacings), axis=-1)
+    k = xp.arange(m, n_obs+1, dtype=mean_log.dtype, device=xp_device(X))
+    return mean_log + xp.sum(1/k) + math.log(m) - math.log(n_obs+1)
+
+
+def _ebrahimi_entropy(X, m, *, xp):
+    """Ebrahimi entropy estimate along the last axis (HE_mn in [6])."""
+    # [6] gives no equation number for this estimator.
+    n_obs = X.shape[-1]
+    padded = _pad_along_last_axis(X, m, xp=xp)
+    spacings = padded[..., 2 * m:] - padded[..., :-2 * m]
+
+    # Per-position weight: 2 in the interior, ramping linearly down to 1
+    # over the first and last `m` positions (`pos` is the 1-based index).
+    pos = xp.arange(1, n_obs+1, dtype=padded.dtype, device=xp_device(padded))
+    weight = xp.where(pos <= m, 1 + (pos - 1)/m, 2.)
+    weight = xp.where(pos >= n_obs - m + 1, 1 + (n_obs - pos)/m, weight)
+
+    return xp.mean(xp.log(n_obs * spacings / (weight * m)), axis=-1)
+
+
+def _correa_entropy(X, m, *, xp):
+    """Correa entropy estimate along the last axis (HC_mn in [6])."""
+    # [6] gives no equation number for this estimator.
+    n_obs = X.shape[-1]
+    padded = _pad_along_last_axis(X, m, xp=xp)
+
+    # Row r of `cols` holds, for every 1-based position i, the 0-based index
+    # into `padded` of the neighbour at offset `offsets[r]` (from -m to m).
+    offsets = xp.arange(-m, m+1, device=xp_device(padded))[:, None]
+    cols = xp.arange(1, n_obs+1, device=xp_device(padded)) + offsets + m - 1
+
+    window = padded[..., cols]
+    centered = window - xp.mean(window, axis=-2, keepdims=True)
+    numer = xp.sum(centered*offsets, axis=-2)
+    denom = n_obs*xp.sum(centered**2, axis=-2)
+    return -xp.mean(xp.log(numer/denom), axis=-1)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_finite_differences.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_finite_differences.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ec91654c4e95f73f32edc5aa357a4e3f80fff1f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_finite_differences.py
@@ -0,0 +1,145 @@
+from numpy import arange, newaxis, hstack, prod, array
+from scipy import linalg
+
+
+def _central_diff_weights(Np, ndiv=1):
+    """
+    Return weights for an Np-point central derivative.
+
+    Assumes equally-spaced function points. If the weights are in the
+    vector ``w`` and ``ho = Np // 2``, the derivative estimate is
+    ``(w[0]*f(x - ho*dx) + ... + w[-1]*f(x + ho*dx)) / dx**ndiv``.
+
+    Parameters
+    ----------
+    Np : int
+        Number of points for the central derivative. Must be odd.
+    ndiv : int, optional
+        Order of the derivative to approximate. Default is 1.
+
+    Returns
+    -------
+    w : ndarray
+        Weights for an Np-point central derivative. Its size is `Np`.
+
+    Raises
+    ------
+    ValueError
+        If `Np` is even or smaller than ``ndiv + 1``.
+
+    Notes
+    -----
+    Can be inaccurate for a large number of points.
+
+    Examples
+    --------
+    We can calculate a derivative value of a function.
+
+    >>> def f(x):
+    ...     return 2 * x**2 + 3
+    >>> x = 3.0 # derivative point
+    >>> h = 0.1 # differential step
+    >>> Np = 3 # point number for central derivative
+    >>> weights = _central_diff_weights(Np) # weights for first derivative
+    >>> vals = [f(x + (i - Np//2) * h) for i in range(Np)]
+    >>> round(float(sum(w * v for (w, v) in zip(weights, vals))/h), 6)
+    12.0
+
+    This matches the analytical solution: f'(x) = 4x, so f'(3) = 12.
+
+    References
+    ----------
+    .. [1] https://en.wikipedia.org/wiki/Finite_difference
+    """
+    if Np < ndiv + 1:
+        raise ValueError(
+            "Number of points must be at least the derivative order + 1."
+        )
+    if Np % 2 == 0:
+        raise ValueError("The number of points must be odd.")
+
+    half = Np >> 1
+    nodes = arange(-half, half + 1.0)[:, newaxis]
+    # Column k holds nodes**k; row `ndiv` of the inverse, times ndiv!, is w.
+    powers = hstack([nodes**0.0] + [nodes**k for k in range(1, Np)])
+    factorial = prod(arange(1, ndiv + 1), axis=0)
+    return factorial * linalg.inv(powers)[ndiv]
+
+
+def _derivative(func, x0, dx=1.0, n=1, args=(), order=3):
+    """
+    Find the nth derivative of a function at a point.
+
+    Given a function, use a central difference formula with spacing `dx` to
+    compute the nth derivative at `x0`.
+
+    Parameters
+    ----------
+    func : function
+        Input function; it is called as ``func(x, *args)``.
+    x0 : float
+        The point at which the nth derivative is found.
+    dx : float, optional
+        Spacing between sample points. Default is 1.0.
+    n : int, optional
+        Order of the derivative. Default is 1.
+    args : tuple, optional
+        Extra positional arguments passed to `func`.
+    order : int, optional
+        Number of points to use; odd and at least ``n + 1``. Default is 3.
+
+    Returns
+    -------
+    val
+        Central-difference estimate of the nth derivative at `x0`.
+
+    Raises
+    ------
+    ValueError
+        If `order` is even or smaller than ``n + 1``.
+
+    Notes
+    -----
+    Decreasing the step size too small can result in round-off error.
+
+    Examples
+    --------
+    >>> def f(x):
+    ...     return x**3 + x**2
+    >>> _derivative(f, 1.0, dx=1e-6)
+    4.9999999999217337
+    """
+    if order < n + 1:
+        raise ValueError(
+            "'order' (the number of points used to compute the derivative), "
+            "must be at least the derivative order 'n' + 1."
+        )
+    if order % 2 == 0:
+        raise ValueError(
+            "'order' (the number of points used to compute the derivative) "
+            "must be odd."
+        )
+    # Tabulated for n=1 and 2 and low-order for speed; else solve for weights.
+    ndiv = next((d for d in (1, 2) if n == d), n)  # canonical int for 1, 2
+    stencils = {
+        (1, 3): ([-1, 0, 1], 2.0),
+        (1, 5): ([1, -8, 0, 8, -1], 12.0),
+        (1, 7): ([-1, 9, -45, 0, 45, -9, 1], 60.0),
+        (1, 9): ([3, -32, 168, -672, 0, 672, -168, 32, -3], 840.0),
+        (2, 3): ([1, -2.0, 1], 1.0),
+        (2, 5): ([-1, 16, -30, 16, -1], 12.0),
+        (2, 7): ([2, -27, 270, -490, 270, -27, 2], 180.0),
+        (2, 9): ([-9, 128, -1008, 8064, -14350, 8064, -1008, 128, -9],
+                 5040.0),
+    }
+    if (ndiv, order) in stencils:
+        numerators, denominator = stencils[(ndiv, order)]
+        weights = array(numerators) / denominator
+    else:
+        weights = _central_diff_weights(order, ndiv)
+    half = order >> 1
+    total = sum(
+        (weights[k] * func(x0 + (k - half) * dx, *args) for k in range(order)),
+        0.0,
+    )
+    return total / prod((dx,) * n, axis=0)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_fit.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_fit.py
new file mode 100644
index 0000000000000000000000000000000000000000..c60b187dab4eef492706317bbcaf985c9914608e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_fit.py
@@ -0,0 +1,1349 @@
+import warnings
+from collections import namedtuple
+import numpy as np
+from scipy import optimize, stats
+from scipy._lib._array_api import xp_capabilities
+from scipy._lib._util import check_random_state, _transition_to_rng
+
+
+def _combine_bounds(name, user_bounds, shape_domain, integral):
+    """Intersect user bounds for `name` with the distribution's domain.
+
+    Returns the ``(lower, upper)`` intersection of `user_bounds` and
+    `shape_domain`. Raises ``ValueError`` if `user_bounds` is reversed, if
+    the intersection is empty (for an `integral` parameter: holds no
+    integer), or if either end of the intersection is not finite.
+    """
+    user_bounds = np.atleast_1d(user_bounds)
+    user_lo, user_hi = user_bounds[0], user_bounds[1]
+    if user_lo > user_hi:
+        raise ValueError(f"There are no values for `{name}` on the interval "
+                         f"{list(user_bounds)}.")
+    lower = max(user_lo, shape_domain[0])
+    upper = min(user_hi, shape_domain[1])
+    # An integral parameter needs a whole number inside [lower, upper].
+    if integral:
+        kind, is_empty = "integer values", np.ceil(lower) > np.floor(upper)
+    else:
+        kind, is_empty = "values", lower > upper
+    if is_empty:
+        raise ValueError(f"There are no {kind} for `{name}` on the interval "
+                         "defined by the user-provided bounds and the domain "
+                         "of the distribution.")
+
+    if not np.all(np.isfinite((lower, upper))):
+        raise ValueError(
+            f"The intersection of user-provided bounds for `{name}` "
+            "and the domain of the distribution is not finite. Please "
+            f"provide finite bounds for shape `{name}` in `bounds`.")
+    return lower, upper
+
+
+class FitResult:
+    r"""Result of fitting a discrete or continuous distribution to data
+
+    Attributes
+    ----------
+    params : namedtuple
+        A namedtuple containing the maximum likelihood estimates of the
+        shape parameters, location, and (if applicable) scale of the
+        distribution.
+    success : bool or None
+        Whether the optimizer considered the optimization to terminate
+        successfully or not; ``None`` if the optimizer did not report it.
+    message : str or None
+        Any status message provided by the optimizer.
+
+    """
+
+    def __init__(self, dist, data, discrete, res):
+        """Store the fit of `dist` to `data` found by the optimizer (`res`)."""
+        self._dist = dist
+        self._data = data
+        self.discrete = discrete
+        # Prefer `pmf`; fall back to `pdf` when `dist` has no (truthy) `pmf`.
+        self.pxf = getattr(dist, "pmf", None) or getattr(dist, "pdf", None)
+
+        shape_names = [] if dist.shapes is None else dist.shapes.split(", ")
+        tail = ['loc'] if discrete else ['loc', 'scale']
+        FitParams = namedtuple('FitParams', shape_names + tail)
+        self.params = FitParams(*res.x)
+
+        # Optimizer can report success even when nllf is infinite. `success`
+        # is optional on `res`, so a result without it must not raise here.
+        if getattr(res, "success", None) and not np.isfinite(self.nllf()):
+            res.success = False
+            res.message = ("Optimization converged to parameter values that "
+                           "are inconsistent with the data.")
+        self.success = getattr(res, "success", None)
+        self.message = getattr(res, "message", None)
+
+    def __repr__(self):
+        """List `params`, `success` and `message`, omitting ``None`` values."""
+        keys = ["params", "success", "message"]
+        m = max(map(len, keys)) + 1
+        return '\n'.join([key.rjust(m) + ': ' + repr(getattr(self, key))
+                          for key in keys if getattr(self, key) is not None])
+
+    def nllf(self, params=None, data=None):
+        """Negative log-likelihood function
+
+        Evaluates the negative of the log-likelihood function of the provided
+        data at the provided parameters.
+
+        Parameters
+        ----------
+        params : tuple, optional
+            The shape parameters, location, and (if applicable) scale of the
+            distribution as a single tuple. Default is the maximum likelihood
+            estimates (``self.params``).
+        data : array_like, optional
+            The data for which the log-likelihood function is to be evaluated.
+            Default is the data to which the distribution was fit.
+
+        Returns
+        -------
+        nllf : float
+            The negative of the log-likelihood function.
+
+        """
+        params = params if params is not None else self.params
+        data = data if data is not None else self._data
+        return self._dist.nnlf(theta=params, x=data)
+
+    def plot(self, ax=None, *, plot_type="hist"):
+        """Visually compare the data against the fitted distribution.
+
+        Available only if `matplotlib` is installed; otherwise raises
+        ``ModuleNotFoundError``. An unknown `plot_type` raises ``ValueError``.
+
+        Parameters
+        ----------
+        ax : `matplotlib.axes.Axes`, optional
+            Axes object to draw the plot onto, otherwise uses the current Axes.
+        plot_type : {"hist", "qq", "pp", "cdf"}, optional
+            Type of plot to draw (case-insensitive). Options include:
+
+            - "hist" (alias "histogram"): Superposes the PDF/PMF of the
+              fitted distribution over a normalized histogram of the data.
+            - "qq": Scatter plot of theoretical quantiles against the
+              empirical quantiles. Specifically, the x-coordinates are the
+              values of the fitted distribution PPF evaluated at the
+              percentiles ``(np.arange(1, n+1) - 0.5)/n``, where ``n`` is the
+              number of data points, and the y-coordinates are the sorted
+              data points.
+            - "pp": Scatter plot of theoretical percentiles against the
+              observed percentiles. Specifically, the x-coordinates are the
+              percentiles ``(np.arange(1, n+1) - 0.5)/n``, where ``n`` is
+              the number of data points, and the y-coordinates are the values
+              of the fitted distribution CDF evaluated at the sorted
+              data points.
+            - "cdf": Superposes the CDF of the fitted distribution over the
+              empirical CDF. Specifically, the x-coordinates of the empirical
+              CDF are the sorted data points, and the y-coordinates are the
+              percentiles ``(np.arange(1, n+1) - 0.5)/n``, where ``n`` is
+              the number of data points.
+
+        Returns
+        -------
+        ax : `matplotlib.axes.Axes`
+            The matplotlib Axes object on which the plot was drawn.
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> from scipy import stats
+        >>> import matplotlib.pyplot as plt  # matplotlib must be installed
+        >>> rng = np.random.default_rng()
+        >>> data = stats.nbinom(5, 0.5).rvs(size=1000, random_state=rng)
+        >>> bounds = [(0, 30), (0, 1)]
+        >>> res = stats.fit(stats.nbinom, data, bounds)
+        >>> ax = res.plot()  # save matplotlib Axes object
+
+        The returned Axes object can be used to customize the plot.
+
+        >>> ax.set_xlabel('number of trials')  # customize axis label
+        >>> ax.get_children()[0].set_linewidth(5)  # customize line widths
+        >>> ax.legend()
+        >>> plt.show()
+        """
+        try:
+            import matplotlib  # noqa: F401
+        except ModuleNotFoundError as exc:
+            message = "matplotlib must be installed to use method `plot`."
+            raise ModuleNotFoundError(message) from exc
+
+        plots = {'histogram': self._hist_plot, 'qq': self._qq_plot,
+                 'pp': self._pp_plot, 'cdf': self._cdf_plot,
+                 'hist': self._hist_plot}
+        if plot_type.lower() not in plots:
+            message = f"`plot_type` must be one of {set(plots.keys())}"
+            raise ValueError(message)
+        plot = plots[plot_type.lower()]
+
+        if ax is None:
+            import matplotlib.pyplot as plt
+            ax = plt.gca()
+
+        return plot(ax=ax, fit_params=np.atleast_1d(self.params))
+
+    def _hist_plot(self, ax, fit_params):
+        """Plot the fitted PDF/PMF over a histogram of the data (continuous
+        samples of 50 or fewer points are drawn as zero-height markers)."""
+        from matplotlib.ticker import MaxNLocator
+        support = self._dist.support(*fit_params)
+        lb = support[0] if np.isfinite(support[0]) else min(self._data)
+        ub = support[1] if np.isfinite(support[1]) else max(self._data)
+        pxf = "PMF" if self.discrete else "PDF"
+
+        if self.discrete:
+            x = np.arange(lb, ub + 2)
+            y = self.pxf(x, *fit_params)
+            ax.vlines(x[:-1], 0, y[:-1], label='Fitted Distribution PMF',
+                      color='C0')
+            options = dict(density=True, bins=x, align='left', color='C1')
+            ax.xaxis.set_major_locator(MaxNLocator(integer=True))
+            ax.set_xlabel('k')
+            ax.set_ylabel('PMF')
+        else:
+            x = np.linspace(lb, ub, 200)
+            y = self.pxf(x, *fit_params)
+            ax.plot(x, y, '--', label='Fitted Distribution PDF', color='C0')
+            options = dict(density=True, bins=50, align='mid', color='C1')
+            ax.set_xlabel('x')
+            ax.set_ylabel('PDF')
+
+        if len(self._data) > 50 or self.discrete:
+            ax.hist(self._data, label="Histogram of Data", **options)
+        else:
+            ax.plot(self._data, np.zeros_like(self._data), "*",
+                    label='Data', color='C1')
+        ax.set_title(rf"Fitted $\tt {self._dist.name}$ {pxf} and Histogram")
+        ax.legend(*ax.get_legend_handles_labels())
+        return ax
+
+    def _qp_plot(self, ax, fit_params, qq):
+        """Draw a Q-Q (if `qq`) or P-P plot and its reference on `ax`."""
+        data = np.sort(self._data)
+        ps = self._plotting_positions(len(self._data))
+        if qq:
+            qp = "Quantiles"
+            plot_type = 'Q-Q'
+            x = self._dist.ppf(ps, *fit_params)
+            y = data
+        else:
+            qp = "Percentiles"
+            plot_type = 'P-P'
+            x = ps
+            y = self._dist.cdf(data, *fit_params)
+
+        ax.plot(x, y, '.', label=f'Fitted Distribution {plot_type}',
+                color='C0', zorder=1)
+        xlim = ax.get_xlim()
+        ylim = ax.get_ylim()
+        lim = [min(xlim[0], ylim[0]), max(xlim[1], ylim[1])]
+        if not qq:
+            lim = max(lim[0], 0), min(lim[1], 1)
+
+        if self.discrete and qq:
+            q_min, q_max = int(lim[0]), int(lim[1]+1)
+            q_ideal = np.arange(q_min, q_max)
+            ax.plot(q_ideal, q_ideal, 'o', label='Reference', color='k',
+                    alpha=0.25, markerfacecolor='none', clip_on=True)
+        elif self.discrete and not qq:
+            # The intent of this is to match the plot that would be produced
+            # if x were continuous on [0, 1] and y were cdf(ppf(x)). Sampling
+            # x = np.linspace(0, 1, 1000) approximates it but may look poor
+            # when zooming in. The vertical portions mark where transitions
+            # occur when the data completely obscures the horizontal portions.
+            p_min, p_max = lim
+            a, b = self._dist.support(*fit_params)
+            p_min = max(p_min, 0 if np.isfinite(a) else 1e-3)
+            p_max = min(p_max, 1 if np.isfinite(b) else 1-1e-3)
+            q_min, q_max = self._dist.ppf([p_min, p_max], *fit_params)
+            qs = np.arange(q_min-1, q_max+1)
+            ps = self._dist.cdf(qs, *fit_params)
+            ax.step(ps, ps, '-', label='Reference', color='k', alpha=0.25,
+                    clip_on=True)
+        else:
+            ax.plot(lim, lim, '-', label='Reference', color='k', alpha=0.25,
+                    clip_on=True)
+
+        ax.set_xlim(lim)
+        ax.set_ylim(lim)
+        ax.set_xlabel(rf"Fitted $\tt {self._dist.name}$ Theoretical {qp}")
+        ax.set_ylabel(f"Data {qp}")
+        ax.set_title(rf"Fitted $\tt {self._dist.name}$ {plot_type} Plot")
+        ax.legend(*ax.get_legend_handles_labels())
+        ax.set_aspect('equal')
+        return ax
+
+    def _qq_plot(self, **kwargs):
+        """Q-Q plot; see `_qp_plot`."""
+        return self._qp_plot(qq=True, **kwargs)
+
+    def _pp_plot(self, **kwargs):
+        """P-P plot; see `_qp_plot`."""
+        return self._qp_plot(qq=False, **kwargs)
+
+    def _plotting_positions(self, n, a=.5):
+        """Return ``(k - a)/(n + 1 - 2*a)`` for ``k = 1, ..., n``."""
+        # See https://en.wikipedia.org/wiki/Q%E2%80%93Q_plot#Plotting_positions
+        k = np.arange(1, n+1)
+        return (k-a) / (n + 1 - 2*a)
+
+    def _cdf_plot(self, ax, fit_params):
+        """Overlay the fitted CDF on the empirical CDF of the data on `ax`."""
+        data = np.sort(self._data)
+        ecdf = self._plotting_positions(len(self._data))
+        ls = '--' if len(np.unique(data)) < 30 else '.'
+        xlabel = 'k' if self.discrete else 'x'
+        ax.step(data, ecdf, ls, label='Empirical CDF', color='C1', zorder=0)
+
+        xlim = ax.get_xlim()
+        q = np.linspace(*xlim, 300)
+        tcdf = self._dist.cdf(q, *fit_params)
+        ax.plot(q, tcdf, label='Fitted Distribution CDF', color='C0', zorder=1)
+        ax.set_xlim(xlim)
+        ax.set_ylim(0, 1)
+        ax.set_xlabel(xlabel)
+        ax.set_ylabel("CDF")
+        ax.set_title(rf"Fitted $\tt {self._dist.name}$ and Empirical CDF")
+        handles, labels = ax.get_legend_handles_labels()
+        ax.legend(handles[::-1], labels[::-1])
+        return ax
+
+
+@xp_capabilities(out_of_scope=True)
+def fit(dist, data, bounds=None, *, guess=None, method='mle',
+        optimizer=optimize.differential_evolution):
+    r"""Fit a discrete or continuous distribution to data
+
+    Given a distribution, data, and bounds on the parameters of the
+    distribution, return maximum likelihood estimates of the parameters.
+
+    Parameters
+    ----------
+    dist : `scipy.stats.rv_continuous` or `scipy.stats.rv_discrete`
+        The object representing the distribution to be fit to the data.
+    data : 1D array_like
+        The data to which the distribution is to be fit. If the data contain
+        any of ``np.nan``, ``np.inf``, or -``np.inf``, the fit method will
+        raise a ``ValueError``.
+    bounds : dict or sequence of tuples, optional
+        If a dictionary, each key is the name of a parameter of the
+        distribution, and the corresponding value is a tuple containing the
+        lower and upper bound on that parameter.  If the distribution is
+        defined only for a finite range of values of that parameter, no entry
+        for that parameter is required; e.g., some distributions have
+        parameters which must be on the interval [0, 1]. Bounds for parameters
+        location (``loc``) and scale (``scale``) are optional; by default,
+        they are fixed to 0 and 1, respectively.
+
+        If a sequence, element *i* is a tuple containing the lower and upper
+        bound on the *i*\ th parameter of the distribution. In this case,
+        bounds for *all* distribution shape parameters must be provided.
+        Optionally, bounds for location and scale may follow the
+        distribution shape parameters.
+
+        If a shape is to be held fixed (e.g. if it is known), the
+        lower and upper bounds may be equal. If a user-provided lower or upper
+        bound is beyond a bound of the domain for which the distribution is
+        defined, the bound of the distribution's domain will replace the
+        user-provided value. Similarly, parameters which must be integral
+        will be constrained to integral values within the user-provided bounds.
+    guess : dict or array_like, optional
+        If a dictionary, each key is the name of a parameter of the
+        distribution, and the corresponding value is a guess for the value
+        of the parameter.
+
+        If a sequence, element *i* is a guess for the *i*\ th parameter of the
+        distribution. In this case, guesses for *all* distribution shape
+        parameters must be provided.
+
+        If `guess` is not provided, guesses for the decision variables will
+        not be passed to the optimizer. If `guess` is provided, guesses for
+        any missing parameters will be set at the mean of the lower and
+        upper bounds. Guesses for parameters which must be integral will be
+        rounded to integral values, and guesses that lie outside the
+        intersection of the user-provided bounds and the domain of the
+        distribution will be clipped.
+    method : {'mle', 'mse'}
+        With ``method="mle"`` (default), the fit is computed by minimizing
+        the negative log-likelihood function. A large, finite penalty
+        (rather than infinite negative log-likelihood) is applied for
+        observations beyond the support of the distribution.
+        With ``method="mse"``, the fit is computed by minimizing
+        the negative log-product spacing function. The same penalty is applied
+        for observations beyond the support. We follow the approach of [1]_,
+        which is generalized for samples with repeated observations.
+    optimizer : callable, optional
+        `optimizer` is a callable that accepts the following positional
+        argument.
+
+        fun : callable
+            The objective function to be optimized. `fun` accepts one argument
+            ``x``, candidate shape parameters of the distribution, and returns
+            the objective function value given ``x``, `dist`, and the provided
+            `data`.
+            The job of `optimizer` is to find values of the decision variables
+            that minimizes `fun`.
+
+        `optimizer` must also accept the following keyword argument.
+
+        bounds : sequence of tuples
+            The bounds on values of the decision variables; each element will
+            be a tuple containing the lower and upper bound on a decision
+            variable.
+
+        If `guess` is provided, `optimizer` must also accept the following
+        keyword argument.
+
+        x0 : array_like
+            The guesses for each decision variable.
+
+        If the distribution has any shape parameters that must be integral or
+        if the distribution is discrete and the location parameter is not
+        fixed, `optimizer` must also accept the following keyword argument.
+
+        integrality : array_like of bools
+            For each decision variable, True if the decision variable
+            must be constrained to integer values and False if the decision
+            variable is continuous.
+
+        `optimizer` must return an object, such as an instance of
+        `scipy.optimize.OptimizeResult`, which holds the optimal values of
+        the decision variables in an attribute ``x``. If attributes
+        ``fun``, ``status``, or ``message`` are provided, they will be
+        included in the result object returned by `fit`.
+
+    Returns
+    -------
+    result : `~scipy.stats._result_classes.FitResult`
+        An object with the following fields.
+
+        params : namedtuple
+            A namedtuple containing the maximum likelihood estimates of the
+            shape parameters, location, and (if applicable) scale of the
+            distribution.
+        success : bool or None
+            Whether the optimizer considered the optimization to terminate
+            successfully or not.
+        message : str or None
+            Any status message provided by the optimizer.
+
+        The object has the following method:
+
+        nllf(params=None, data=None)
+            By default, the negative log-likelihood function at the fitted
+            `params` for the given `data`. Accepts a tuple containing
+            alternative shapes, location, and scale of the distribution and
+            an array of alternative data.
+
+        plot(ax=None)
+            Superposes the PDF/PMF of the fitted distribution over a normalized
+            histogram of the data.
+
+    See Also
+    --------
+    rv_continuous,  rv_discrete
+
+    Notes
+    -----
+    Optimization is more likely to converge to the maximum likelihood estimate
+    when the user provides tight bounds containing the maximum likelihood
+    estimate. For example, when fitting a binomial distribution to data, the
+    number of experiments underlying each sample may be known, in which case
+    the corresponding shape parameter ``n`` can be fixed.
+
+    References
+    ----------
+    .. [1] Shao, Yongzhao, and Marjorie G. Hahn. "Maximum product of spacings
+           method: a unified formulation with illustration of strong
+           consistency." Illinois Journal of Mathematics 43.3 (1999): 489-499.
+
+    Examples
+    --------
+    Suppose we wish to fit a distribution to the following data.
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng()
+    >>> dist = stats.nbinom
+    >>> shapes = (5, 0.5)
+    >>> data = dist.rvs(*shapes, size=1000, random_state=rng)
+
+    Suppose we do not know how the data were generated, but we suspect that
+    it follows a negative binomial distribution with parameters *n* and *p*\.
+    (See `scipy.stats.nbinom`.) We believe that the parameter *n* was fewer
+    than 30, and we know that the parameter *p* must lie on the interval
+    [0, 1]. We record this information in a variable `bounds` and pass
+    this information to `fit`.
+
+    >>> bounds = [(0, 30), (0, 1)]
+    >>> res = stats.fit(dist, data, bounds)
+
+    `fit` searches within the user-specified `bounds` for the
+    values that best match the data (in the sense of maximum likelihood
+    estimation). In this case, it found shape values similar to those
+    from which the data were actually generated.
+
+    >>> res.params
+    FitParams(n=5.0, p=0.5028157644634368, loc=0.0)  # may vary
+
+    We can visualize the results by superposing the probability mass function
+    of the distribution (with the shapes fit to the data) over a normalized
+    histogram of the data.
+
+    >>> import matplotlib.pyplot as plt  # matplotlib must be installed to plot
+    >>> res.plot()
+    >>> plt.show()
+
+    Note that the estimate for *n* was exactly integral; this is because
+    the domain of the `nbinom` PMF includes only integral *n*, and the `nbinom`
+    object "knows" that. `nbinom` also knows that the shape *p* must be a
+    value between 0 and 1. In such a case - when the domain of the distribution
+    with respect to a parameter is finite - we are not required to specify
+    bounds for the parameter.
+
+    >>> bounds = {'n': (0, 30)}  # omit parameter p using a `dict`
+    >>> res2 = stats.fit(dist, data, bounds)
+    >>> res2.params
+    FitParams(n=5.0, p=0.5016492009232932, loc=0.0)  # may vary
+
+    If we wish to force the distribution to be fit with *n* fixed at 6, we can
+    set both the lower and upper bounds on *n* to 6. Note, however, that the
+    value of the objective function being optimized is typically worse (higher)
+    in this case.
+
+    >>> bounds = {'n': (6, 6)}  # fix parameter `n`
+    >>> res3 = stats.fit(dist, data, bounds)
+    >>> res3.params
+    FitParams(n=6.0, p=0.5486556076755706, loc=0.0)  # may vary
+    >>> res3.nllf() > res.nllf()
+    True  # may vary
+
+    Note that the numerical results of the previous examples are typical, but
+    they may vary because the default optimizer used by `fit`,
+    `scipy.optimize.differential_evolution`, is stochastic. However, we can
+    customize the settings used by the optimizer to ensure reproducibility -
+    or even use a different optimizer entirely - using the `optimizer`
+    parameter.
+
+    >>> from scipy.optimize import differential_evolution
+    >>> rng = np.random.default_rng(767585560716548)
+    >>> def optimizer(fun, bounds, *, integrality):
+    ...     return differential_evolution(fun, bounds, strategy='best2bin',
+    ...                                   rng=rng, integrality=integrality)
+    >>> bounds = [(0, 30), (0, 1)]
+    >>> res4 = stats.fit(dist, data, bounds, optimizer=optimizer)
+    >>> res4.params
+    FitParams(n=5.0, p=0.5015183149259951, loc=0.0)
+
+    """
+    # --- Input Validation / Standardization --- #
+    user_bounds = bounds
+    user_guess = guess
+
+    # distribution input validation and information collection
+    if hasattr(dist, "pdf"):  # can't use isinstance for types
+        default_bounds = {'loc': (0, 0), 'scale': (1, 1)}
+        discrete = False
+    elif hasattr(dist, "pmf"):
+        default_bounds = {'loc': (0, 0)}
+        discrete = True
+    else:
+        message = ("`dist` must be an instance of `rv_continuous` "
+                   "or `rv_discrete.`")
+        raise ValueError(message)
+
+    try:
+        param_info = dist._param_info()
+    except AttributeError as e:
+        message = (f"Distribution `{dist.name}` is not yet supported by "
+                   "`scipy.stats.fit` because shape information has "
+                   "not been defined.")
+        raise ValueError(message) from e
+
+    # data input validation
+    data = np.asarray(data)
+    if data.ndim != 1:
+        message = "`data` must be exactly one-dimensional."
+        raise ValueError(message)
+    if not (np.issubdtype(data.dtype, np.number)
+            and np.all(np.isfinite(data))):
+        message = "All elements of `data` must be finite numbers."
+        raise ValueError(message)
+
+    # bounds input validation and information collection
+    n_params = len(param_info)
+    n_shapes = n_params - (1 if discrete else 2)
+    param_list = [param.name for param in param_info]
+    param_names = ", ".join(param_list)
+    shape_names = ", ".join(param_list[:n_shapes])
+
+    if user_bounds is None:
+        user_bounds = {}
+
+    if isinstance(user_bounds, dict):
+        default_bounds.update(user_bounds)
+        user_bounds = default_bounds
+        user_bounds_array = np.empty((n_params, 2))
+        for i in range(n_params):
+            param_name = param_info[i].name
+            user_bound = user_bounds.pop(param_name, None)
+            if user_bound is None:
+                user_bound = param_info[i].domain
+            user_bounds_array[i] = user_bound
+        if user_bounds:
+            message = ("Bounds provided for the following unrecognized "
+                       f"parameters will be ignored: {set(user_bounds)}")
+            warnings.warn(message, RuntimeWarning, stacklevel=2)
+
+    else:
+        try:
+            user_bounds = np.asarray(user_bounds, dtype=float)
+            if user_bounds.size == 0:
+                user_bounds = np.empty((0, 2))
+        except ValueError as e:
+            message = ("Each element of a `bounds` sequence must be a tuple "
+                       "containing two elements: the lower and upper bound of "
+                       "a distribution parameter.")
+            raise ValueError(message) from e
+        if (user_bounds.ndim != 2 or user_bounds.shape[1] != 2):
+            message = ("Each element of `bounds` must be a tuple specifying "
+                       "the lower and upper bounds of a shape parameter")
+            raise ValueError(message)
+        if user_bounds.shape[0] < n_shapes:
+            message = (f"A `bounds` sequence must contain at least {n_shapes} "
+                       "elements: tuples specifying the lower and upper "
+                       f"bounds of all shape parameters {shape_names}.")
+            raise ValueError(message)
+        if user_bounds.shape[0] > n_params:
+            message = ("A `bounds` sequence may not contain more than "
+                       f"{n_params} elements: tuples specifying the lower and "
+                       "upper bounds of distribution parameters "
+                       f"{param_names}.")
+            raise ValueError(message)
+
+        user_bounds_array = np.empty((n_params, 2))
+        user_bounds_array[n_shapes:] = list(default_bounds.values())
+        user_bounds_array[:len(user_bounds)] = user_bounds
+
+    user_bounds = user_bounds_array
+    validated_bounds = []
+    for i in range(n_params):
+        name = param_info[i].name
+        user_bound = user_bounds_array[i]
+        param_domain = param_info[i].domain
+        integral = param_info[i].integrality
+        combined = _combine_bounds(name, user_bound, param_domain, integral)
+        validated_bounds.append(combined)
+
+    bounds = np.asarray(validated_bounds)
+    integrality = [param.integrality for param in param_info]
+
+    # guess input validation
+
+    if user_guess is None:
+        guess_array = None
+    elif isinstance(user_guess, dict):
+        default_guess = {param.name: np.mean(bound)
+                         for param, bound in zip(param_info, bounds)}
+        unrecognized = set(user_guess) - set(default_guess)
+        if unrecognized:
+            message = ("Guesses provided for the following unrecognized "
+                       f"parameters will be ignored: {unrecognized}")
+            warnings.warn(message, RuntimeWarning, stacklevel=2)
+        default_guess.update(user_guess)
+
+        message = ("Each element of `guess` must be a scalar "
+                   "guess for a distribution parameter.")
+        try:
+            guess_array = np.asarray([default_guess[param.name]
+                                      for param in param_info], dtype=float)
+        except ValueError as e:
+            raise ValueError(message) from e
+
+    else:
+        message = ("Each element of `guess` must be a scalar "
+                   "guess for a distribution parameter.")
+        try:
+            user_guess = np.asarray(user_guess, dtype=float)
+        except ValueError as e:
+            raise ValueError(message) from e
+        if user_guess.ndim != 1:
+            raise ValueError(message)
+        if user_guess.shape[0] < n_shapes:
+            message = (f"A `guess` sequence must contain at least {n_shapes} "
+                       "elements: scalar guesses for the distribution shape "
+                       f"parameters {shape_names}.")
+            raise ValueError(message)
+        if user_guess.shape[0] > n_params:
+            message = ("A `guess` sequence may not contain more than "
+                       f"{n_params} elements: scalar guesses for the "
+                       f"distribution parameters {param_names}.")
+            raise ValueError(message)
+
+        guess_array = np.mean(bounds, axis=1)
+        guess_array[:len(user_guess)] = user_guess
+
+    if guess_array is not None:
+        guess_rounded = guess_array.copy()
+
+        guess_rounded[integrality] = np.round(guess_rounded[integrality])
+        rounded = np.where(guess_rounded != guess_array)[0]
+        for i in rounded:
+            message = (f"Guess for parameter `{param_info[i].name}` "
+                       f"rounded from {guess_array[i]} to {guess_rounded[i]}.")
+            warnings.warn(message, RuntimeWarning, stacklevel=2)
+
+        guess_clipped = np.clip(guess_rounded, bounds[:, 0], bounds[:, 1])
+        clipped = np.where(guess_clipped != guess_rounded)[0]
+        for i in clipped:
+            message = (f"Guess for parameter `{param_info[i].name}` "
+                       f"clipped from {guess_rounded[i]} to "
+                       f"{guess_clipped[i]}.")
+            warnings.warn(message, RuntimeWarning, stacklevel=2)
+
+        guess = guess_clipped
+    else:
+        guess = None
+
+    # --- Fitting --- #
+    def nllf(free_params, data=data):  # bind data NOW
+        with np.errstate(invalid='ignore', divide='ignore'):
+            return dist._penalized_nnlf(free_params, data)
+
+    def nlpsf(free_params, data=data):  # bind data NOW
+        with np.errstate(invalid='ignore', divide='ignore'):
+            return dist._penalized_nlpsf(free_params, data)
+
+    methods = {'mle': nllf, 'mse': nlpsf}
+    objective = methods[method.lower()]
+
+    with np.errstate(invalid='ignore', divide='ignore'):
+        kwds = {}
+        if bounds is not None:
+            kwds['bounds'] = bounds
+        if np.any(integrality):
+            kwds['integrality'] = integrality
+        if guess is not None:
+            kwds['x0'] = guess
+        res = optimizer(objective, **kwds)
+
+    return FitResult(dist, data, discrete, res)
+
+
+GoodnessOfFitResult = namedtuple('GoodnessOfFitResult',
+                                 ('fit_result', 'statistic', 'pvalue',
+                                  'null_distribution'))  # see goodness_of_fit
+
+
+@xp_capabilities(out_of_scope=True)
+@_transition_to_rng('random_state')
+def goodness_of_fit(dist, data, *, known_params=None, fit_params=None,
+                    guessed_params=None, statistic='ad', n_mc_samples=9999,
+                    rng=None):
+    r"""
+    Perform a goodness of fit test comparing data to a distribution family.
+
+    Given a distribution family and data, perform a test of the null hypothesis
+    that the data were drawn from a distribution in that family. Any known
+    parameters of the distribution may be specified. Remaining parameters of
+    the distribution will be fit to the data, and the p-value of the test
+    is computed accordingly. Several statistics for comparing the distribution
+    to data are available.
+
+    Parameters
+    ----------
+    dist : `scipy.stats.rv_continuous`
+        The object representing the distribution family under the null
+        hypothesis.
+    data : 1D array_like
+        Finite, uncensored data to be tested.
+    known_params : dict, optional
+        A dictionary containing name-value pairs of known distribution
+        parameters. Monte Carlo samples are randomly drawn from the
+        null-hypothesized distribution with these values of the parameters.
+        Before the statistic is evaluated for the observed `data` and each
+        Monte Carlo sample, only remaining unknown parameters of the
+        null-hypothesized distribution family are fit to the samples; the
+        known parameters are held fixed. If all parameters of the distribution
+        family are known, then the step of fitting the distribution family to
+        each sample is omitted.
+    fit_params : dict, optional
+        A dictionary containing name-value pairs of distribution parameters
+        that have already been fit to the data, e.g. using `scipy.stats.fit`
+        or the ``fit`` method of `dist`. Monte Carlo samples are drawn from the
+        null-hypothesized distribution with these specified values of the
+        parameter. However, these and all other unknown parameters of the
+        null-hypothesized distribution family are always fit to the sample,
+        whether that is the observed `data` or a Monte Carlo sample, before
+        the statistic is evaluated.
+    guessed_params : dict, optional
+        A dictionary containing name-value pairs of distribution parameters
+        which have been guessed. These parameters are always considered as
+        free parameters and are fit both to the provided `data` as well as
+        to the Monte Carlo samples drawn from the null-hypothesized
+        distribution. The purpose of these `guessed_params` is to be used as
+        initial values for the numerical fitting procedure.
+    statistic : {"ad", "ks", "cvm", "filliben"} or callable, optional
+        The statistic used to compare data to a distribution after fitting
+        unknown parameters of the distribution family to the data. The
+        Anderson-Darling ("ad") [1]_, Kolmogorov-Smirnov ("ks") [1]_,
+        Cramer-von Mises ("cvm") [1]_, and Filliben ("filliben") [7]_
+        statistics are available.  Alternatively, a callable with signature
+        ``(dist, data, axis)`` may be supplied to compute the statistic. Here
+        ``dist`` is a frozen distribution object (potentially with array
+        parameters), ``data`` is an array of Monte Carlo samples (of
+        compatible shape), and ``axis`` is the axis of ``data`` along which
+        the statistic must be computed.
+    n_mc_samples : int, default: 9999
+        The number of Monte Carlo samples drawn from the null hypothesized
+        distribution to form the null distribution of the statistic. The
+        sample size of each is the same as the given `data`.
+    rng : `numpy.random.Generator`, optional
+        Pseudorandom number generator state. When `rng` is None, a new
+        `numpy.random.Generator` is created using entropy from the
+        operating system. Types other than `numpy.random.Generator` are
+        passed to `numpy.random.default_rng` to instantiate a ``Generator``.
+
+    Returns
+    -------
+    res : GoodnessOfFitResult
+        An object with the following attributes.
+
+        fit_result : `~scipy.stats._result_classes.FitResult`
+            An object representing the fit of the provided `dist` to `data`.
+            This object includes the values of distribution family parameters
+            that fully define the null-hypothesized distribution, that is,
+            the distribution from which Monte Carlo samples are drawn.
+        statistic : float
+            The value of the statistic comparing provided `data` to the
+            null-hypothesized distribution.
+        pvalue : float
+            The proportion of elements in the null distribution with
+            statistic values at least as extreme as the statistic value of the
+            provided `data`.
+        null_distribution : ndarray
+            The value of the statistic for each Monte Carlo sample
+            drawn from the null-hypothesized distribution.
+
+    Notes
+    -----
+    This is a generalized Monte Carlo goodness-of-fit procedure, special cases
+    of which correspond with various Anderson-Darling tests, Lilliefors' test,
+    etc. The test is described in [2]_, [3]_, and [4]_ as a parametric
+    bootstrap test. This is a Monte Carlo test in which parameters that
+    specify the distribution from which samples are drawn have been estimated
+    from the data. We describe the test using "Monte Carlo" rather than
+    "parametric bootstrap" throughout to avoid confusion with the more familiar
+    nonparametric bootstrap, and describe how the test is performed below.
+
+    *Traditional goodness of fit tests*
+
+    Traditionally, critical values corresponding with a fixed set of
+    significance levels are pre-calculated using Monte Carlo methods. Users
+    perform the test by calculating the value of the test statistic only for
+    their observed `data` and comparing this value to tabulated critical
+    values. This practice is not very flexible, as tables are not available for
+    all distributions and combinations of known and unknown parameter values.
+    Also, results can be inaccurate when critical values are interpolated from
+    limited tabulated data to correspond with the user's sample size and
+    fitted parameter values. To overcome these shortcomings, this function
+    allows the user to perform the Monte Carlo trials adapted to their
+    particular data.
+
+    *Algorithmic overview*
+
+    In brief, this routine executes the following steps:
+
+      1. Fit unknown parameters to the given `data`, thereby forming the
+         "null-hypothesized" distribution, and compute the statistic of
+         this pair of data and distribution.
+      2. Draw random samples from this null-hypothesized distribution.
+      3. Fit the unknown parameters to each random sample.
+      4. Calculate the statistic between each sample and the distribution that
+         has been fit to the sample.
+      5. Compare the value of the statistic corresponding with `data` from (1)
+         against the values of the statistic corresponding with the random
+         samples from (4). The p-value is the proportion of samples with a
+         statistic value greater than or equal to the statistic of the observed
+         data.
+
+    In more detail, the steps are as follows.
+
+    First, any unknown parameters of the distribution family specified by
+    `dist` are fit to the provided `data` using maximum likelihood estimation.
+    (One exception is the normal distribution with unknown location and scale:
+    we use the bias-corrected standard deviation ``np.std(data, ddof=1)`` for
+    the scale as recommended in [1]_.)
+    These values of the parameters specify a particular member of the
+    distribution family referred to as the "null-hypothesized distribution",
+    that is, the distribution from which the data were sampled under the null
+    hypothesis. The `statistic`, which compares data to a distribution, is
+    computed between `data` and the null-hypothesized distribution.
+
+    Next, many (specifically `n_mc_samples`) new samples, each containing the
+    same number of observations as `data`, are drawn from the
+    null-hypothesized distribution. All unknown parameters of the distribution
+    family `dist` are fit to *each resample*, and the `statistic` is computed
+    between each sample and its corresponding fitted distribution. These
+    values of the statistic form the Monte Carlo null distribution (not to be
+    confused with the "null-hypothesized distribution" above).
+
+    The p-value of the test is the proportion of statistic values in the Monte
+    Carlo null distribution that are at least as extreme as the statistic value
+    of the provided `data`. More precisely, the p-value is given by
+
+    .. math::
+
+        p = \frac{b + 1}
+                 {m + 1}
+
+    where :math:`b` is the number of statistic values in the Monte Carlo null
+    distribution that are greater than or equal to the statistic value
+    calculated for `data`, and :math:`m` is the number of elements in the
+    Monte Carlo null distribution (`n_mc_samples`). The addition of :math:`1`
+    to the numerator and denominator can be thought of as including the
+    value of the statistic corresponding with `data` in the null distribution,
+    but a more formal explanation is given in [5]_.
+
+    *Limitations*
+
+    The test can be very slow for some distribution families because unknown
+    parameters of the distribution family must be fit to each of the Monte
+    Carlo samples, and for most distributions in SciPy, distribution fitting
+    is performed via numerical optimization.
+
+    *Anti-Pattern*
+
+    For this reason, it may be tempting
+    to treat parameters of the distribution pre-fit to `data` (by the user)
+    as though they were `known_params`, as specification of all parameters of
+    the distribution precludes the need to fit the distribution to each Monte
+    Carlo sample. (This is essentially how the original Kolmogorov-Smirnov
+    test is performed.) Although such a test can provide evidence against the
+    null hypothesis, the test is conservative in the sense that small p-values
+    will tend to (greatly) *overestimate* the probability of making a type I
+    error (that is, rejecting the null hypothesis although it is true), and the
+    power of the test is low (that is, it is less likely to reject the null
+    hypothesis even when the null hypothesis is false).
+    This is because the Monte Carlo samples are less likely to agree with the
+    null-hypothesized distribution as well as `data`. This tends to increase
+    the values of the statistic recorded in the null distribution, so that a
+    larger number of them exceed the value of statistic for `data`, thereby
+    inflating the p-value.
+
+    References
+    ----------
+    .. [1] M. A. Stephens (1974). "EDF Statistics for Goodness of Fit and
+           Some Comparisons." Journal of the American Statistical Association,
+           Vol. 69, pp. 730-737.
+    .. [2] W. Stute, W. G. Manteiga, and M. P. Quindimil (1993).
+           "Bootstrap based goodness-of-fit-tests." Metrika 40.1: 243-256.
+    .. [3] C. Genest, & B Rémillard. (2008). "Validity of the parametric
+           bootstrap for goodness-of-fit testing in semiparametric models."
+           Annales de l'IHP Probabilités et statistiques. Vol. 44. No. 6.
+    .. [4] I. Kojadinovic and J. Yan (2012). "Goodness-of-fit testing based on
+           a weighted bootstrap: A fast large-sample alternative to the
+           parametric bootstrap." Canadian Journal of Statistics 40.3: 480-500.
+    .. [5] B. Phipson and G. K. Smyth (2010). "Permutation P-values Should
+           Never Be Zero: Calculating Exact P-values When Permutations Are
+           Randomly Drawn." Statistical Applications in Genetics and Molecular
+           Biology 9.1.
+    .. [6] H. W. Lilliefors (1967). "On the Kolmogorov-Smirnov test for
+           normality with mean and variance unknown." Journal of the American
+           statistical Association 62.318: 399-402.
+    .. [7] Filliben, James J. "The probability plot correlation coefficient
+           test for normality." Technometrics 17.1 (1975): 111-117.
+
+    Examples
+    --------
+    A well-known test of the null hypothesis that data were drawn from a
+    given distribution is the Kolmogorov-Smirnov (KS) test, available in SciPy
+    as `scipy.stats.ks_1samp`. Suppose we wish to test whether the following
+    data:
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng(1638083107694713882823079058616272161)
+    >>> x = stats.uniform.rvs(size=75, random_state=rng)
+
+    were sampled from a normal distribution. To perform a KS test, the
+    empirical distribution function of the observed data will be compared
+    against the (theoretical) cumulative distribution function of a normal
+    distribution. Of course, to do this, the normal distribution under the null
+    hypothesis must be fully specified. This is commonly done by first fitting
+    the ``loc`` and ``scale`` parameters of the distribution to the observed
+    data, then performing the test.
+
+    >>> loc, scale = np.mean(x), np.std(x, ddof=1)
+    >>> cdf = stats.norm(loc, scale).cdf
+    >>> stats.ks_1samp(x, cdf)
+    KstestResult(statistic=0.1119257570456813,
+                 pvalue=0.2827756409939257,
+                 statistic_location=0.7751845155861765,
+                 statistic_sign=-1)
+
+    An advantage of the KS-test is that the p-value - the probability of
+    obtaining a value of the test statistic under the null hypothesis as
+    extreme as the value obtained from the observed data - can be calculated
+    exactly and efficiently. `goodness_of_fit` can only approximate these
+    results.
+
+    >>> known_params = {'loc': loc, 'scale': scale}
+    >>> res = stats.goodness_of_fit(stats.norm, x, known_params=known_params,
+    ...                             statistic='ks', rng=rng)
+    >>> res.statistic, res.pvalue
+    (0.1119257570456813, 0.2788)
+
+    The statistic matches exactly, but the p-value is estimated by forming
+    a "Monte Carlo null distribution", that is, by explicitly drawing random
+    samples from `scipy.stats.norm` with the provided parameters and
+    calculating the statistic for each. The fraction of these statistic values
+    at least as extreme as ``res.statistic`` approximates the exact p-value
+    calculated by `scipy.stats.ks_1samp`.
+
+    However, in many cases, we would prefer to test only that the data were
+    sampled from one of *any* member of the normal distribution family, not
+    specifically from the normal distribution with the location and scale
+    fitted to the observed sample. In this case, Lilliefors [6]_ argued that
+    the KS test is far too conservative (that is, the p-value overstates
+    the actual probability of rejecting a true null hypothesis) and thus lacks
+    power - the ability to reject the null hypothesis when the null hypothesis
+    is actually false.
+    Indeed, our p-value above is approximately 0.28, which is far too large
+    to reject the null hypothesis at any common significance level.
+
+    Consider why this might be. Note that in the KS test above, the statistic
+    always compares data against the CDF of a normal distribution fitted to the
+    *observed data*. This tends to reduce the value of the statistic for the
+    observed data, but it is "unfair" when computing the statistic for other
+    samples, such as those we randomly draw to form the Monte Carlo null
+    distribution. It is easy to correct for this: whenever we compute the KS
+    statistic of a sample, we use the CDF of a normal distribution fitted
+    to *that sample*. The null distribution in this case has not been
+    calculated exactly and is typically approximated using Monte Carlo methods
+    as described above. This is where `goodness_of_fit` excels.
+
+    >>> res = stats.goodness_of_fit(stats.norm, x, statistic='ks', rng=rng)
+    >>> res.statistic, res.pvalue
+    (0.1119257570456813, 0.0196)
+
+    Indeed, this p-value is much smaller, and small enough to (correctly)
+    reject the null hypothesis at common significance levels, including 5% and
+    2.5%.
+
+    However, the KS statistic is not very sensitive to all deviations from
+    normality. The original advantage of the KS statistic was the ability
+    to compute the null distribution theoretically, but a more sensitive
+    statistic - resulting in a higher test power - can be used now that we can
+    approximate the null distribution
+    computationally. The Anderson-Darling statistic [1]_ tends to be more
+    sensitive, and critical values of this statistic have been tabulated
+    for various significance levels and sample sizes using Monte Carlo methods.
+
+    >>> res = stats.anderson(x, 'norm', method='interpolate')
+    >>> print(res.statistic)
+    1.2139573337497467
+    >>> print(res.pvalue)
+    0.01
+
+    Here, the observed value of the statistic exceeds the critical value
+    corresponding with a 1% significance level. This tells us that the p-value
+    of the observed data is 1% or less, but what is it? `goodness_of_fit` can
+    estimate it directly.
+
+    >>> res = stats.goodness_of_fit(stats.norm, x, statistic='ad', rng=rng)
+    >>> res.statistic, res.pvalue
+    (1.2139573337497467, 0.0034)
+
+    A further advantage is that use of `goodness_of_fit` is not limited to
+    a particular set of distributions or conditions on which parameters
+    are known versus which must be estimated from data. Instead,
+    `goodness_of_fit` can estimate p-values relatively quickly for any
+    distribution with a sufficiently fast and reliable ``fit`` method. For
+    instance, here we perform a goodness of fit test using the Cramer-von Mises
+    statistic against the Rayleigh distribution with known location and unknown
+    scale.
+
+    >>> rng = np.random.default_rng()
+    >>> x = stats.chi(df=2.2, loc=0, scale=2).rvs(size=1000, random_state=rng)
+    >>> res = stats.goodness_of_fit(stats.rayleigh, x, statistic='cvm',
+    ...                             known_params={'loc': 0}, rng=rng)
+
+    This executes fairly quickly, but to check the reliability of the ``fit``
+    method, we should inspect the fit result.
+
+    >>> res.fit_result  # location is as specified, and scale is reasonable
+      params: FitParams(loc=0.0, scale=2.1026719844231243)
+     success: True
+     message: 'The fit was performed successfully.'
+    >>> import matplotlib.pyplot as plt  # matplotlib must be installed to plot
+    >>> res.fit_result.plot()
+    >>> plt.show()
+
+    If the distribution is not fit to the observed data as well as possible,
+    the test may not control the type I error rate, that is, the chance of
+    rejecting the null hypothesis even when it is true.
+
+    We should also look for extreme outliers in the null distribution that
+    may be caused by unreliable fitting. These do not necessarily invalidate
+    the result, but they tend to reduce the test's power.
+
+    >>> _, ax = plt.subplots()
+    >>> ax.hist(np.log10(res.null_distribution))
+    >>> ax.set_xlabel("log10 of CVM statistic under the null hypothesis")
+    >>> ax.set_ylabel("Frequency")
+    >>> ax.set_title("Histogram of the Monte Carlo null distribution")
+    >>> plt.show()
+
+    This plot seems reassuring.
+
+    If the ``fit`` method is working reliably, and if the distribution of the test
+    statistic is not particularly sensitive to the values of the fitted
+    parameters, then the p-value provided by `goodness_of_fit` is expected to
+    be a good approximation.
+
+    >>> res.statistic, res.pvalue
+    (0.2231991510248692, 0.0525)
+
+    """
+    args = _gof_iv(dist, data, known_params, fit_params, guessed_params,
+                   statistic, n_mc_samples, rng)
+    (dist, data, fixed_nhd_params, fixed_rfd_params, guessed_nhd_params,
+     guessed_rfd_params, statistic, n_mc_samples_int, rng) = args
+
+    # Fit null hypothesis distribution to data
+    nhd_fit_fun = _get_fit_fun(dist, data, guessed_nhd_params,
+                               fixed_nhd_params)
+    nhd_vals = nhd_fit_fun(data)
+    nhd_dist = dist(*nhd_vals)
+
+    def rvs(size):
+        return nhd_dist.rvs(size=size, random_state=rng)
+
+    # Define statistic
+    fit_fun = _get_fit_fun(dist, data, guessed_rfd_params, fixed_rfd_params)
+    if callable(statistic):
+        compare_fun = statistic
+    else:
+        compare_fun = _compare_dict[statistic]
+    alternative = getattr(compare_fun, 'alternative', 'greater')
+
+    def statistic_fun(data, axis):
+        # Make things simple by always working along the last axis.
+        data = np.moveaxis(data, axis, -1)
+        rfd_vals = fit_fun(data)
+        rfd_dist = dist(*rfd_vals)
+        return compare_fun(rfd_dist, data, axis=-1)
+
+    res = stats.monte_carlo_test(data, rvs, statistic_fun, vectorized=True,
+                                 n_resamples=n_mc_samples, axis=-1,
+                                 alternative=alternative)
+    opt_res = optimize.OptimizeResult()
+    opt_res.success = True
+    opt_res.message = "The fit was performed successfully."
+    opt_res.x = nhd_vals  # parameter values fit to the observed `data`
+    # Only continuous distributions for now, hence discrete=False
+    # There's no fundamental limitation; it's just that we're not using
+    # stats.fit, discrete distributions don't have `fit` method, and
+    # we haven't written any vectorized fit functions for a discrete
+    # distribution yet.
+    return GoodnessOfFitResult(FitResult(dist, data, False, opt_res),
+                               res.statistic, res.pvalue,
+                               res.null_distribution)
+
+
+def _get_fit_fun(dist, data, guessed_params, fixed_params):
+    # Build `fit_fun(data)` -> fitted parameter values; outer `data` is unused.
+    shape_names = [] if dist.shapes is None else dist.shapes.split(", ")
+    param_names = shape_names + ['loc', 'scale']
+    fparam_names = ['f'+name for name in param_names]
+    all_fixed = not set(fparam_names).difference(fixed_params)  # all keys given
+    guessed_shapes = [guessed_params.pop(x, None)  # NB: mutates `guessed_params`
+                      for x in shape_names if x in guessed_params]
+
+    if all_fixed:
+        def fit_fun(data):
+            return [fixed_params[name] for name in fparam_names]
+    # Use the vectorized fit function when one is registered for `dist`
+    elif dist in _fit_funs:
+        def fit_fun(data):
+            params = _fit_funs[dist](data, **fixed_params)
+            params = np.asarray(np.broadcast_arrays(*params))
+            if params.ndim > 1:
+                params = params[..., np.newaxis]  # append a trailing axis
+            return params
+    else:  # generic fallback: call `dist.fit` on each 1-D slice
+        def fit_fun_1d(data):
+            return dist.fit(data, *guessed_shapes, **guessed_params,
+                            **fixed_params)
+
+        def fit_fun(data):
+            params = np.apply_along_axis(fit_fun_1d, axis=-1, arr=data)
+            if params.ndim > 1:
+                params = params.T[..., np.newaxis]
+            return params
+
+    return fit_fun
+
+
+# Vectorized fitting functions. These are to accept ND `data` in which each
+# row (slice along last axis) is a sample to fit and scalar fixed parameters.
+# They return a tuple of shape parameter arrays, each of shape data.shape[:-1].
+def _fit_norm(data, floc=None, fscale=None):
+    """Return (loc, scale) of a normal fit to each last-axis slice of `data`.
+
+    `floc` and `fscale`, when not None, are returned unchanged.
+    """
+    if floc is None:  # free location -> sample mean; free scale -> ddof=1
+        scale = np.std(data, ddof=1, axis=-1) if fscale is None else fscale
+        return np.mean(data, axis=-1), scale
+    if fscale is None:  # fixed location: RMS deviation about `floc`
+        return floc, np.sqrt(((data - floc)**2).mean(axis=-1))
+    return floc, fscale
+
+
+_fit_funs = {stats.norm: _fit_norm}  # type: ignore[attr-defined]
+
+
+# Vectorized goodness of fit statistic functions. These accept a frozen
+# distribution object and `data` in which each row (slice along last axis) is
+# a sample.
+
+
+def _anderson_darling(dist, data, axis):
+    """Anderson-Darling statistic of each last-axis slice (`axis` is unused)."""
+    n_obs = data.shape[-1]
+    ordered = np.sort(data, axis=-1)
+    log_terms = dist.logcdf(ordered) + dist.logsf(ordered[..., ::-1])
+    weights = (2*np.arange(1, n_obs+1) - 1)/n_obs
+    return -n_obs - np.sum(weights * log_terms, axis=-1)
+
+
+def _compute_dplus(cdfvals):  # adapted from _stats_py before gh-17062
+    upper_steps = np.arange(1.0, cdfvals.shape[-1] + 1) / cdfvals.shape[-1]
+    return np.max(upper_steps - cdfvals, axis=-1)  # max of i/n - cdf, i=1..n
+
+
+def _compute_dminus(cdfvals):
+    lower_steps = np.arange(0.0, cdfvals.shape[-1]) / cdfvals.shape[-1]
+    return np.max(cdfvals - lower_steps, axis=-1)  # max of cdf - (i-1)/n
+
+
+def _kolmogorov_smirnov(dist, data, axis=-1):
+    """Return max(D+, D-) for the samples of `data` taken along `axis`."""
+    # Sort along `axis`, then put that axis last for the reductions below.
+    cdf_sorted = np.moveaxis(dist.cdf(np.sort(data, axis=axis)), axis, -1)
+    d_plus = _compute_dplus(cdf_sorted)  # helpers reduce over the last axis
+    d_minus = _compute_dminus(cdf_sorted)
+    return np.maximum(d_plus, d_minus)
+
+
+def _corr(X, M):
+    # Correlation coefficient r along the last axis, simplified and
+    # vectorized as we need it. See [7] Equation (2). Lemma 1/2 are only for
+    # distributions symmetric about 0.
+    dX = X - X.mean(axis=-1, keepdims=True)  # deviations from the slice means
+    dM = M - M.mean(axis=-1, keepdims=True)
+    cross = np.sum(dX * dM, axis=-1)
+    scale = np.sqrt(np.sum(dX**2, axis=-1) * np.sum(dM**2, axis=-1))
+    return cross/scale
+
+
+def _filliben(dist, data, axis):
+    # Samples are slices along the last axis; the `axis` argument is unused.
+    # [7] Section 8 # 1: sort each sample
+    X = np.sort(data, axis=-1)
+    # [7] Section 8 # 2: medians of the uniform order statistics
+    n = data.shape[-1]
+    k = np.arange(1, n+1)
+    # Filliben used an approximation for the uniform distribution order
+    # statistic medians.
+    # m = (k - .3175)/(n + 0.365)
+    # m[-1] = 0.5**(1/n)
+    # m[0] = 1 - m[-1]
+    # We can just as easily use the (theoretically) exact values. See e.g.
+    # https://en.wikipedia.org/wiki/Order_statistic
+    # "Order statistics sampled from a uniform distribution"
+    m = stats.beta(k, n + 1 - k).median()
+
+    # [7] Section 8 # 3: evaluate the percent point function of `dist` there
+    M = dist.ppf(m)
+
+    # [7] Section 8 # 4: correlate the sorted data with those values
+    return _corr(X, M)
+_filliben.alternative = 'less'  # type: ignore[attr-defined]
+
+
+def _cramer_von_mises(dist, data, axis):
+    """Cramer-von Mises statistic of each last-axis slice (`axis` is unused)."""
+    n_obs = data.shape[-1]
+    cdf_sorted = dist.cdf(np.sort(data, axis=-1))
+    midpoints = (2*np.arange(1, n_obs+1) - 1)/(2*n_obs)
+    sq_dev = (midpoints - cdf_sorted)**2
+    return 1 / (12*n_obs) + np.sum(sq_dev, axis=-1)
+
+
+_compare_dict = {"ad": _anderson_darling, "ks": _kolmogorov_smirnov,
+                 "cvm": _cramer_von_mises, "filliben": _filliben}
+
+
+def _gof_iv(dist, data, known_params, fit_params, guessed_params, statistic,
+            n_mc_samples, rng):
+    """Validate and standardize the inputs of the goodness-of-fit test.
+
+    Returns the distribution, the data as a 1-D float array, the fixed and
+    guessed parameter dictionaries (`*_nhd_*` for the null distribution,
+    `*_rfd_*` for fits to resamples), the statistic, the integer number of
+    Monte Carlo samples, and the output of `check_random_state(rng)`.
+    """
+    if not isinstance(dist, stats.rv_continuous):
+        raise TypeError("`dist` must be a (non-frozen) instance of "
+                        "`stats.rv_continuous`.")
+
+    data = np.asarray(data, dtype=float)
+    if data.ndim != 1:
+        raise ValueError("`data` must be a one-dimensional array of numbers.")
+
+    # Validation of the key/value pairs is left to the `fit` method; here
+    # a missing (falsy) argument is only replaced by an empty dictionary.
+    known_params = known_params or {}
+    fit_params = fit_params or {}
+    guessed_params = guessed_params or {}
+
+    # Same parameters keyed by "f" + name (e.g. "floc")
+    known_params_f = {("f"+k): v for k, v in known_params.items()}
+    fit_params_f = {("f"+k): v for k, v in fit_params.items()}
+
+    # Values of parameters of the null distribution family with which
+    # resamples are drawn; `fit_params` entries override known ones.
+    fixed_nhd_params = {**known_params_f, **fit_params_f}
+
+    # Parameters held fixed when fitting the family to resamples
+    fixed_rfd_params = dict(known_params_f)
+
+    # Guesses used when fitting the distribution family to the original data
+    guessed_nhd_params = guessed_params.copy()
+
+    # Guesses used when fitting the distribution family to resamples;
+    # entries of `guessed_params` take precedence over `fit_params`.
+    guessed_rfd_params = fit_params.copy()
+    guessed_rfd_params.update(guessed_params)
+
+    if not callable(statistic):
+        statistic = statistic.lower()
+        statistics = {'ad', 'ks', 'cvm', 'filliben'}
+        if statistic not in statistics:
+            raise ValueError(f"`statistic` must be one of {statistics}.")
+
+    n_mc_samples_int = int(n_mc_samples)
+    if n_mc_samples_int != n_mc_samples:
+        raise TypeError("`n_mc_samples` must be an integer.")
+
+    return (dist, data, fixed_nhd_params, fixed_rfd_params, guessed_nhd_params,
+            guessed_rfd_params, statistic, n_mc_samples_int,
+            check_random_state(rng))
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_hypotests.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_hypotests.py
new file mode 100644
index 0000000000000000000000000000000000000000..52898fe24af398ced1a33056af4983ef78689b96
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_hypotests.py
@@ -0,0 +1,2160 @@
+from collections import namedtuple
+from dataclasses import dataclass
+import math
+import numpy as np
+import warnings
+from itertools import combinations
+import scipy.stats
+from scipy.optimize import shgo
+from . import distributions
+from ._common import ConfidenceInterval
+from ._continuous_distns import norm
+from scipy._lib._array_api import (xp_capabilities, array_namespace, xp_size,
+                                   xp_promote, xp_result_type, xp_copy, is_numpy)
+import scipy._lib.array_api_extra as xpx
+from scipy.special import gamma, kv, gammaln
+from scipy.fft import ifft
+from ._stats_pythran import _a_ij_Aij_Dij2
+from ._stats_pythran import (
+    _concordant_pairs as _P, _discordant_pairs as _Q
+)
+from ._axis_nan_policy import _axis_nan_policy_factory
+from scipy.stats import _stats_py
+
+__all__ = ['epps_singleton_2samp', 'cramervonmises', 'somersd',
+           'barnard_exact', 'boschloo_exact', 'cramervonmises_2samp',
+           'tukey_hsd', 'poisson_means_test']
+
+Epps_Singleton_2sampResult = namedtuple('Epps_Singleton_2sampResult',
+                                        ('statistic', 'pvalue'))
+
+
+@xp_capabilities(skip_backends=[("dask.array", "lazy -> no _axis_nan_policy"),
+                                ("jax.numpy", "lazy -> no _axis_nan_policy")])
+@_axis_nan_policy_factory(Epps_Singleton_2sampResult, n_samples=2, too_small=4)
+def epps_singleton_2samp(x, y, t=(0.4, 0.8), *, axis=0):
+    """Compute the Epps-Singleton (ES) test statistic.
+
+    Test the null hypothesis that two samples have the same underlying
+    probability distribution.
+
+    Parameters
+    ----------
+    x, y : array-like
+        The two samples of observations to be tested. Input must not have more
+        than one dimension. Samples can have different lengths, but both
+        must have at least five observations.
+    t : array-like, optional
+        The points (t1, ..., tn) where the empirical characteristic function is
+        to be evaluated. It should be positive distinct numbers. The default
+        value (0.4, 0.8) is proposed in [1]_. Input must not have more than
+        one dimension.
+    axis : int or tuple of ints, default: 0
+        If an int or tuple of ints, the axis or axes of the input along which
+        to compute the statistic. The statistic of each axis-slice (e.g. row)
+        of the input will appear in a corresponding element of the output.
+        If ``None``, the input will be raveled before computing the statistic.
+
+    Returns
+    -------
+    statistic : float
+        The test statistic.
+    pvalue : float
+        The associated p-value based on the asymptotic chi2-distribution.
+
+    See Also
+    --------
+    ks_2samp, anderson_ksamp
+
+    Notes
+    -----
+    Testing whether two samples are generated by the same underlying
+    distribution is a classical question in statistics. A widely used test is
+    the Kolmogorov-Smirnov (KS) test which relies on the empirical
+    distribution function. Epps and Singleton introduce a test based on the
+    empirical characteristic function in [1]_.
+
+    One advantage of the ES test compared to the KS test is that it does
+    not assume a continuous distribution. In [1]_, the authors conclude
+    that the test also has a higher power than the KS test in many
+    examples. They recommend the use of the ES test for discrete samples as
+    well as continuous samples with at least 25 observations each, whereas
+    `anderson_ksamp` is recommended for smaller sample sizes in the
+    continuous case.
+
+    The p-value is computed from the asymptotic distribution of the test
+    statistic which follows a `chi2` distribution. If the sample size of both
+    `x` and `y` is below 25, the small sample correction proposed in [1]_ is
+    applied to the test statistic.
+
+    The default values of `t` are determined in [1]_ by considering
+    various distributions and finding good values that lead to a high power
+    of the test in general. Table III in [1]_ gives the optimal values for
+    the distributions tested in that study. The values of `t` are scaled by
+    the semi-interquartile range in the implementation, see [1]_.
+
+    References
+    ----------
+    .. [1] T. W. Epps and K. J. Singleton, "An omnibus test for the two-sample
+       problem using the empirical characteristic function", Journal of
+       Statistical Computation and Simulation 26, p. 177--203, 1986.
+
+    .. [2] S. J. Goerg and J. Kaiser, "Nonparametric testing of distributions
+       - the Epps-Singleton two-sample test using the empirical characteristic
+       function", The Stata Journal 9(3), p. 454--465, 2009.
+
+    """
+    xp = array_namespace(x, y)
+    # x and y are converted to arrays by the decorator
+    # and `axis` is guaranteed to be -1.
+    x, y = xp_promote(x, y, force_floating=True, xp=xp)
+    t = xp.asarray(t, dtype=x.dtype)
+    # check if x and y are valid inputs
+    nx, ny = x.shape[-1], y.shape[-1]
+    if (nx < 5) or (ny < 5):  # only used by test_axis_nan_policy
+        raise ValueError('x and y should have at least 5 elements, but len(x) '
+                         f'= {nx} and len(y) = {ny}.')
+    n = nx + ny
+
+    # check if t is valid
+    if t.ndim > 1:
+        raise ValueError(f't must be 1d, but t.ndim equals {t.ndim}.')
+    if xp.any(t <= 0):
+        raise ValueError('t must contain positive elements only.')
+
+    # Previously, non-finite input caused an error in linalg functions.
+    # To prevent an issue in one slice from halting the calculation, replace non-finite
+    # values with a harmless one, and replace results with NaN at the end.
+    i_x = ~xp.isfinite(x)
+    i_y = ~xp.isfinite(y)
+    # Ideally we would avoid copying all data here; see
+    # discussion in data-apis/array-api-extra#506.
+    x = xp.where(i_x, 1., x)
+    y = xp.where(i_y, 1., y)
+    invalid_result = xp.any(i_x, axis=-1) | xp.any(i_y, axis=-1)
+
+    # rescale t with semi-iqr as proposed in [1]; import iqr here to avoid
+    # circular import
+    from scipy.stats import iqr
+    sigma = iqr(xp.concat((x, y), axis=-1), axis=-1, keepdims=True) / 2
+    ts = xp.reshape(t, (-1,) + (1,)*x.ndim) / sigma
+
+    # covariance estimation of ES test
+    gx = xp.concat((xp.cos(ts*x), xp.sin(ts*x)), axis=0)
+    gy = xp.concat((xp.cos(ts*y), xp.sin(ts*y)), axis=0)
+    gx, gy = xp.moveaxis(gx, 0, -2), xp.moveaxis(gy, 0, -2)
+    cov_x = xpx.cov(gx) * (nx-1)/nx  # the test uses biased cov-estimate
+    cov_y = xpx.cov(gy) * (ny-1)/ny
+    cov_x, cov_y = xp.astype(cov_x, x.dtype), xp.astype(cov_y, y.dtype)
+    est_cov = (n/nx)*cov_x + (n/ny)*cov_y
+    est_cov_inv = xp.linalg.pinv(est_cov)
+    r = xp.asarray(xp.linalg.matrix_rank(est_cov_inv), dtype=est_cov_inv.dtype)
+    if xp.any(r < 2*xp_size(t)):
+        warnings.warn('Estimated covariance matrix does not have full rank. '
+                      'This indicates a bad choice of the input t and the '
+                      'test might not be consistent.', # see p. 183 in [1]_
+                      stacklevel=2)
+
+    # compute test statistic w distributed asympt. as chisquare with df=r
+    g_diff = xp.mean(gx, axis=-1, keepdims=True) - xp.mean(gy, axis=-1, keepdims=True)
+    w = n*xp.matmul(xp.matrix_transpose(g_diff), xp.matmul(est_cov_inv, g_diff))
+    w = w[..., 0, 0]
+
+    # apply the small-sample correction when both samples have < 25 elements
+    if (max(nx, ny) < 25):
+        corr = 1.0/(1.0 + n**(-0.45) + 10.1*(nx**(-1.7) + ny**(-1.7)))
+        w *= corr
+
+    chi2 = _stats_py._SimpleChi2(r)
+    p = _stats_py._get_pvalue(w, chi2, alternative='greater', symmetric=False, xp=xp)
+
+    w = xpx.at(w)[invalid_result].set(xp.nan)
+    p = xpx.at(p)[invalid_result].set(xp.nan)
+    w = w[()] if w.ndim == 0 else w
+    p = p[()] if p.ndim == 0 else p
+    return Epps_Singleton_2sampResult(w, p)
+
+
+@xp_capabilities(np_only=True)
+def poisson_means_test(k1, n1, k2, n2, *, diff=0, alternative='two-sided'):
+    r"""
+    Performs the Poisson means test, AKA the "E-test".
+
+    This is a test of the null hypothesis that the difference between means of
+    two Poisson distributions is `diff`. The samples are provided as the
+    number of events `k1` and `k2` observed within measurement intervals
+    (e.g. of time, space, number of observations) of sizes `n1` and `n2`.
+
+    Parameters
+    ----------
+    k1 : int
+        Number of events observed from distribution 1.
+    n1 : float
+        Size of sample from distribution 1.
+    k2 : int
+        Number of events observed from distribution 2.
+    n2 : float
+        Size of sample from distribution 2.
+    diff : float, default=0
+        The hypothesized (non-negative) difference in means between the
+        distributions underlying the samples.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis.
+        The following options are available (default is 'two-sided'):
+
+          * 'two-sided': the difference between distribution means is not
+            equal to `diff`
+          * 'less': the difference between distribution means is less than
+            `diff`
+          * 'greater': the difference between distribution means is greater
+            than `diff`
+
+    Returns
+    -------
+    statistic : float
+        The test statistic (see [1]_ equation 3.3).
+    pvalue : float
+        The probability of achieving such an extreme value of the test
+        statistic under the null hypothesis.
+
+    Notes
+    -----
+
+    Let:
+
+    .. math:: X_1 \sim \mbox{Poisson}(\mathtt{n1}\lambda_1)
+
+    be a random variable independent of
+
+    .. math:: X_2  \sim \mbox{Poisson}(\mathtt{n2}\lambda_2)
+
+    and let ``k1`` and ``k2`` be the observed values of :math:`X_1`
+    and :math:`X_2`, respectively. Then `poisson_means_test` uses the number
+    of observed events ``k1`` and ``k2`` from samples of size ``n1`` and
+    ``n2``, respectively, to test the null hypothesis that
+
+    .. math::
+       H_0: \lambda_1 - \lambda_2 = \mathtt{diff}
+
+    A benefit of the E-test is that it has good power for small sample sizes,
+    which can reduce sampling costs [1]_. It has been evaluated and determined
+    to be more powerful than the comparable C-test, sometimes referred to as
+    the Poisson exact test.
+
+    References
+    ----------
+    .. [1]  Krishnamoorthy, K., & Thomson, J. (2004). A more powerful test for
+       comparing two Poisson means. Journal of Statistical Planning and
+       Inference, 119(1), 23-35.
+
+    .. [2]  Przyborowski, J., & Wilenski, H. (1940). Homogeneity of results in
+       testing samples from Poisson series: With an application to testing
+       clover seed for dodder. Biometrika, 31(3/4), 313-323.
+
+    Examples
+    --------
+
+    Suppose that a gardener wishes to test the number of dodder (weed) seeds
+    in a sack of clover seeds that they buy from a seed company. It has
+    previously been established that the number of dodder seeds in clover
+    follows the Poisson distribution.
+
+    A 100 gram sample is drawn from the sack before being shipped to the
+    gardener. The sample is analyzed, and it is found to contain no dodder
+    seeds; that is, `k1` is 0. However, upon arrival, the gardener draws
+    another 100 gram sample from the sack. This time, three dodder seeds are
+    found in the sample; that is, `k2` is 3. The gardener would like to
+    know if the difference is significant and not due to chance. The
+    null hypothesis is that the difference between the two samples is merely
+    due to chance, or that :math:`\lambda_1 - \lambda_2 = \mathtt{diff}`
+    where :math:`\mathtt{diff} = 0`. The alternative hypothesis is that the
+    difference is not due to chance, or :math:`\lambda_1 - \lambda_2 \ne 0`.
+    The gardener selects a significance level of 5% to reject the null
+    hypothesis in favor of the alternative [2]_.
+
+    >>> import scipy.stats as stats
+    >>> res = stats.poisson_means_test(0, 100, 3, 100)
+    >>> res.statistic, res.pvalue
+    (-1.7320508075688772, 0.08837900929018157)
+
+    The p-value is .088, indicating a near 9% chance of observing a value of
+    the test statistic at least this extreme under the null hypothesis. This
+    exceeds 5%, so the gardener does not reject the null hypothesis as the
+    difference cannot be regarded as significant at this level.
+    """
+    _poisson_means_test_iv(k1, n1, k2, n2, diff, alternative)
+    alternative = alternative.lower()  # validation above is case-insensitive
+
+    # "for a given k_1 and k_2, an estimate of \lambda_2 is given by" [1] (3.4)
+    lmbd_hat2 = ((k1 + k2) / (n1 + n2) - diff * n1 / (n1 + n2))
+
+    # "\hat{\lambda_{2k}} may be less than or equal to zero ... and in this
+    # case the null hypothesis cannot be rejected ... [and] it is not necessary
+    # to compute the p-value". [1] page 26 below eq. (3.6).
+    if lmbd_hat2 <= 0:
+        return _stats_py.SignificanceResult(0, 1)
+
+    # The unbiased variance estimate [1] (3.2)
+    var = k1 / (n1 ** 2) + k2 / (n2 ** 2)
+
+    # The _observed_ pivot statistic from the input. It follows the
+    # unnumbered equation following equation (3.3) This is used later in
+    # comparison with the computed pivot statistics in an indicator function.
+    t_k1k2 = (k1 / n1 - k2 / n2 - diff) / np.sqrt(var)
+
+    # Equation (3.5) of [1] is lengthy, so it is broken into several parts,
+    # beginning here. Note that the probability mass function of poisson is
+    # exp(-\mu)*\mu^k/k!, and it is called with shape \mu, denoted here as
+    # nlmbd_hat*. The strategy for evaluating the double summation in
+    # (3.5) is to create two arrays of the values of the two products inside
+    # the summation and then broadcast them together into a matrix, and then
+    # sum across the entire matrix.
+
+    # Compute constants (as seen in the first and second separated products in
+    # (3.5).). (This is the shape (\mu) parameter of the poisson distribution.)
+    nlmbd_hat1 = n1 * (lmbd_hat2 + diff)
+    nlmbd_hat2 = n2 * lmbd_hat2
+
+    # Determine summation bounds for tail ends of distribution rather than
+    # summing to infinity. `x1*` is for the outer sum and `x2*` is the inner
+    # sum.
+    x1_lb, x1_ub = distributions.poisson.ppf([1e-10, 1 - 1e-16], nlmbd_hat1)
+    x2_lb, x2_ub = distributions.poisson.ppf([1e-10, 1 - 1e-16], nlmbd_hat2)
+
+    # Construct arrays to function as the x_1 and x_2 counters on the summation
+    # in (3.5). `x1` is in columns and `x2` is in rows to allow for
+    # broadcasting.
+    x1 = np.arange(x1_lb, x1_ub + 1)
+    x2 = np.arange(x2_lb, x2_ub + 1)[:, None]
+
+    # These are the two products in equation (3.5) with `prob_x1` being the
+    # first (left side) and `prob_x2` being the second (right side). (To
+    # make as clear as possible: the 1st contains a "+ d" term, the 2nd does
+    # not.)
+    prob_x1 = distributions.poisson.pmf(x1, nlmbd_hat1)
+    prob_x2 = distributions.poisson.pmf(x2, nlmbd_hat2)
+
+    # compute constants for use in the "pivot statistic" per the
+    # unnumbered equation following (3.3).
+    lmbd_x1 = x1 / n1
+    lmbd_x2 = x2 / n2
+    lmbds_diff = lmbd_x1 - lmbd_x2 - diff
+    var_x1x2 = lmbd_x1 / n1 + lmbd_x2 / n2
+
+    # This is the 'pivot statistic' for use in the indicator of the summation
+    # (left side of "I[.]").
+    with np.errstate(invalid='ignore', divide='ignore'):
+        t_x1x2 = lmbds_diff / np.sqrt(var_x1x2)
+
+    # `[indicator]` implements the "I[.] ... the indicator function" per
+    # the paragraph following equation (3.5).
+    if alternative == 'two-sided':
+        indicator = np.abs(t_x1x2) >= np.abs(t_k1k2)
+    elif alternative == 'less':
+        indicator = t_x1x2 <= t_k1k2
+    else:
+        indicator = t_x1x2 >= t_k1k2
+
+    # Multiply all combinations of the products together, exclude terms
+    # based on the `indicator` and then sum. (3.5)
+    pvalue = np.sum((prob_x1 * prob_x2)[indicator])
+    return _stats_py.SignificanceResult(t_k1k2, pvalue)
+
+
+def _poisson_means_test_iv(k1, n1, k2, n2, diff, alternative):
+    """Check the types and values of the inputs to `poisson_means_test`.
+
+    Returns nothing; raises TypeError for non-integral counts and ValueError
+    for out-of-range values or an unrecognized `alternative`.
+    """
+    if k1 != int(k1) or k2 != int(k2):
+        raise TypeError('`k1` and `k2` must be integers.')
+    if k1 < 0 or k2 < 0:
+        raise ValueError('`k1` and `k2` must be greater than or equal to 0.')
+    if n1 <= 0 or n2 <= 0:
+        raise ValueError('`n1` and `n2` must be greater than 0.')
+    if diff < 0:
+        raise ValueError('diff must be greater than or equal to 0.')
+    # The membership test below ignores the case of `alternative`.
+    alternatives = {'two-sided', 'less', 'greater'}
+    if alternative.lower() not in alternatives:
+        raise ValueError(f"Alternative must be one of '{alternatives}'.")
+
+
+class CramerVonMisesResult:
+    """Container for the `statistic` and `pvalue` attributes of a result."""
+    def __init__(self, statistic, pvalue):
+        self.statistic, self.pvalue = statistic, pvalue
+
+    def __repr__(self):
+        cls_name = type(self).__name__
+        return f"{cls_name}(statistic={self.statistic}, pvalue={self.pvalue})"
+
+
+def _psi1_mod(x, *, xp=None):
+    """
+    psi1 is defined in equation 1.10 in Csörgő, S. and Faraway, J. (1996).
+    This implements a modified version by excluding the term V(x) / 12
+    (here: _cdf_cvm_inf(x) / 12) to avoid evaluating _cdf_cvm_inf(x)
+    twice in _cdf_cvm.
+
+    Implementation based on MAPLE code of Julian Faraway and R code of the
+    function pCvM in the package goftest (v1.1.1), permission granted
+    by Adrian Baddeley. Main difference in the implementation: the code
+    here keeps adding terms of the series until the terms are small enough.
+    """
+    xp = array_namespace(x) if xp is None else xp
+
+    def _ed2(y):  # combines kv(1/4, z) and kv(3/4, z) with z = y**2 / 4
+        z = y**2 / 4
+        z_ = np.asarray(z)  # `kv` is evaluated on a NumPy copy of `z`
+        b = xp.asarray(kv(1/4, z_) + kv(3/4, z_))
+        return xp.exp(-z) * (y/2)**(3/2) * b / math.sqrt(np.pi)
+
+    def _ed3(y):  # combines kv(1/4, z), kv(3/4, z), kv(5/4, z); z = y**2 / 4
+        z = y**2 / 4
+        z_ = np.asarray(z)
+        c = xp.exp(-z) / math.sqrt(np.pi)
+        kv_terms = xp.asarray(2*kv(1/4, z_)
+                              + 3*kv(3/4, z_) - kv(5/4, z_))
+        return c * (y/2)**(5/2) * kv_terms
+
+    def _Ak(k, x):  # k-th series term, before the -1/(pi * gamma(k + 1)) factor
+        m = 2*k + 1
+        sx = 2 * xp.sqrt(x)
+        y1 = x**(3/4)
+        y2 = x**(5/4)
+
+        gamma_kp1_2 = float(gamma(k + 1 / 2))
+        gamma_kp3_2 = float(gamma(k + 3 / 2))
+
+        e1 = m * gamma_kp1_2 * _ed2((4 * k + 3)/sx) / (9 * y1)
+        e2 = gamma_kp1_2 * _ed3((4 * k + 1) / sx) / (72 * y2)
+        e3 = 2 * (m + 2) * gamma_kp3_2 * _ed3((4 * k + 5) / sx) / (12 * y2)
+        e4 = 7 * m * gamma_kp1_2 * _ed2((4 * k + 1) / sx) / (144 * y1)
+        e5 = 7 * m * gamma_kp1_2 * _ed2((4 * k + 5) / sx) / (144 * y1)
+
+        return e1 + e2 + e3 + e4 + e5
+
+    x = xp.asarray(x)
+    tot = xp.zeros_like(x)
+    cond = xp.ones_like(x, dtype=xp.bool)  # elements whose series is still summed
+    k = 0
+    while xp.any(cond):
+        gamma_kp1 = float(gamma(k + 1))
+        z = -_Ak(k, x[cond]) / (xp.pi * gamma_kp1)
+        tot = xpx.at(tot)[cond].set(tot[cond] + z)
+        # Elements stay active while their last term is >= 1e-7. For float32
+        # the tolerance may need adjusting, or the algorithm may be unsuitable.
+        cond = xpx.at(cond)[xp_copy(cond)].set(xp.abs(z) >= 1e-7)
+        k += 1
+
+    return tot
+
+
+def _cdf_cvm_inf(x, *, xp=None):
+    """
+    Calculate the cdf of the Cramér-von Mises statistic (infinite sample size).
+
+    See equation 1.2 in Csörgő, S. and Faraway, J. (1996).
+
+    Implementation based on MAPLE code of Julian Faraway and R code of the
+    function pCvM in the package goftest (v1.1.1), permission granted
+    by Adrian Baddeley. Main difference in the implementation: the code
+    here keeps adding terms of the series until the terms are small enough.
+
+    The function is not expected to be accurate for large values of x, say
+    x > 4, when the cdf is very close to 1.
+    """
+    xp = array_namespace(x) if xp is None else xp
+    x = xp.asarray(x)
+
+    def term(x, k):
+        # see Csörgő and Faraway (1996), second line of (1.3)
+        u = math.exp(gammaln(k + 0.5) - gammaln(k+1)) / (xp.pi**1.5 * xp.sqrt(x))
+        y = 4*k + 1
+        q = y**2 / (16*x)
+        b = xp.asarray(kv(0.25, np.asarray(q)), dtype=u.dtype)  # not automatic?
+        return u * math.sqrt(y) * xp.exp(-q) * b
+
+    tot = xp.zeros_like(x, dtype=x.dtype)
+    cond = xp.ones_like(x, dtype=xp.bool)  # elements whose series is still summed
+    k = 0
+    while xp.any(cond):
+        z = term(x[cond], k)
+        # functional form of `tot[cond] = tot[cond] + z`
+        tot = xpx.at(tot)[cond].add(z)
+        # functional form of `cond[cond] = np.abs(z) >= 1e-7`
+        cond = xpx.at(cond)[xp_copy(cond)].set(xp.abs(z) >= 1e-7)  # torch needs copy
+        k += 1
+
+    return tot
+
+
+def _cdf_cvm(x, n=None, *, xp=None):
+    """
+    Calculate the cdf of the Cramér-von Mises statistic for a finite sample
+    size n. If `n` is None, use the asymptotic cdf (n=inf).
+
+    See equation 1.8 in Csörgő, S. and Faraway, J. (1996) for finite samples,
+    1.2 for the asymptotic cdf.
+
+    The function is not expected to be accurate for large values of x, say
+    x > 2, when the cdf is very close to 1 and it might return values > 1
+    in that case, e.g. _cdf_cvm(2.0, 12) = 1.0000027556716846. Moreover, it
+    is not accurate for small values of n, especially close to the bounds of
+    the distribution's domain, [1/(12*n), n/3], where the value jumps to 0
+    and 1, respectively. These are limitations of the approximation by Csörgő
+    and Faraway (1996) implemented in this function.
+    """
+    xp = array_namespace(x) if xp is None else xp
+    x = xp.asarray(x)
+
+    if n is None:
+        y = _cdf_cvm_inf(x, xp=xp)
+    else:
+        # support of the test statistic is [1/(12*n), n/3], see 1.1 in [2]
+        y = xp.zeros_like(x, dtype=x.dtype)
+        sup = (1./(12*n) < x) & (x < n/3.)
+        # note: _psi1_mod does not include the term _cdf_cvm_inf(x) / 12
+        # therefore, we need to add it here
+        y = xpx.at(y)[sup].set(_cdf_cvm_inf(x[sup], xp=xp) * (1 + 1./(12*n))
+                               + _psi1_mod(x[sup], xp=xp) / n)
+        y = xpx.at(y)[x >= n/3].set(1.)
+
+    return y[()] if y.ndim == 0 else y
+
+
+def _cvm_result_to_tuple(res, _):  # the second argument is ignored
+    return (res.statistic, res.pvalue)
+
+
+@xp_capabilities(cpu_only=True,  # needs special function `kv`
+                 skip_backends=[('dask.array', 'typical dask issues')], jax_jit=False)
+@_axis_nan_policy_factory(CramerVonMisesResult, n_samples=1, too_small=1,
+                          result_to_tuple=_cvm_result_to_tuple)
+def cramervonmises(rvs, cdf, args=(), *, axis=0):
+    r"""Perform the one-sample Cramér-von Mises test for goodness of fit.
+
+    This performs a test of the goodness of fit of a cumulative distribution
+    function (cdf) :math:`F` compared to the empirical distribution function
+    :math:`F_n` of observed random variates :math:`X_1, ..., X_n` that are
+    assumed to be independent and identically distributed ([1]_).
+    The null hypothesis is that the :math:`X_i` have cumulative distribution
+    :math:`F`.
+
+    The test statistic :math:`T` is defined as in [1]_, where :math:`\omega^2`
+    is the Cramér-von Mises criterion and :math:`x_i` are the observed values.
+
+    .. math::
+        T = n\omega^2 =
+        \frac{1}{12n} + \sum_{i=1}^n \left[ \frac{2i-1}{2n} - F(x_i) \right]^2
+
+    Parameters
+    ----------
+    rvs : array_like
+        A 1-D array of observed values of the random variables :math:`X_i`.
+        The sample must contain at least two observations.
+    cdf : str or callable
+        The cumulative distribution function :math:`F` to test the
+        observations against. If a string, it should be the name of a
+        distribution in `scipy.stats`. If a callable, that callable is used
+        to calculate the cdf: ``cdf(x, *args) -> float``.
+    args : tuple, optional
+        Distribution parameters. These are assumed to be known; see Notes.
+    axis : int or tuple of ints, default: 0
+        If an int or tuple of ints, the axis or axes of the input along which
+        to compute the statistic. The statistic of each axis-slice (e.g. row)
+        of the input will appear in a corresponding element of the output.
+        If ``None``, the input will be raveled before computing the statistic.
+
+    Returns
+    -------
+    res : object with attributes
+        statistic : float
+            Cramér-von Mises statistic :math:`T`.
+        pvalue : float
+            The p-value.
+
+    See Also
+    --------
+    kstest, cramervonmises_2samp
+
+    Notes
+    -----
+    .. versionadded:: 1.6.0
+
+    The p-value relies on the approximation given by equation 1.8 in [2]_.
+    It is important to keep in mind that the p-value is only accurate if
+    one tests a simple hypothesis, i.e. the parameters of the reference
+    distribution are known. If the parameters are estimated from the data
+    (composite hypothesis), the computed p-value is not reliable.
+
+    References
+    ----------
+    .. [1] Cramér-von Mises criterion, Wikipedia,
+           https://en.wikipedia.org/wiki/Cram%C3%A9r%E2%80%93von_Mises_criterion
+    .. [2] Csörgő, S. and Faraway, J. (1996). The Exact and Asymptotic
+           Distribution of Cramér-von Mises Statistics. Journal of the
+           Royal Statistical Society, pp. 221-234.
+
+    Examples
+    --------
+
+    Suppose we wish to test whether data generated by ``scipy.stats.norm.rvs``
+    were, in fact, drawn from the standard normal distribution. We choose a
+    significance level of ``alpha=0.05``.
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng(165417232101553420507139617764912913465)
+    >>> x = stats.norm.rvs(size=500, random_state=rng)
+    >>> res = stats.cramervonmises(x, 'norm')
+    >>> res.statistic, res.pvalue
+    (0.1072085112565724, 0.5508482238203407)
+
+    The p-value exceeds our chosen significance level, so we do not
+    reject the null hypothesis that the observed sample is drawn from the
+    standard normal distribution.
+
+    Now suppose we wish to check whether the same samples shifted by 2.1 are
+    consistent with being drawn from a normal distribution with a mean of 2.
+
+    >>> y = x + 2.1
+    >>> res = stats.cramervonmises(y, 'norm', args=(2,))
+    >>> res.statistic, res.pvalue
+    (0.8364446265294695, 0.00596286797008283)
+
+    Here we have used the `args` keyword to specify the mean (``loc``)
+    of the normal distribution to test the data against. This is equivalent
+    to the following, in which we create a frozen normal distribution with
+    mean 2, then pass its ``cdf`` method as an argument.
+
+    >>> frozen_dist = stats.norm(loc=2)
+    >>> res = stats.cramervonmises(y, frozen_dist.cdf)
+    >>> res.statistic, res.pvalue
+    (0.8364446265294695, 0.00596286797008283)
+
+    In either case, we would reject the null hypothesis that the observed
+    sample is drawn from a normal distribution with a mean of 2 (and default
+    variance of 1) because the p-value is less than our chosen
+    significance level.
+
+    """
+    # `_axis_nan_policy` decorator ensures `axis=-1`
+    xp = array_namespace(rvs)
+
+    if isinstance(cdf, str) and is_numpy(xp):
+        cdf = getattr(distributions, cdf).cdf
+    elif isinstance(cdf, str):
+        message = "`cdf` must be a callable if `rvs` is a non-NumPy array."
+        raise ValueError(message)
+
+    n = rvs.shape[-1]
+    if n <= 1:  # only needed for `test_axis_nan_policy.py`; not user-facing
+        raise ValueError('The sample must contain at least two observations.')
+
+    rvs, n = xp_promote(rvs, n, force_floating=True, xp=xp)
+    vals = xp.sort(rvs, axis=-1)
+    cdfvals = cdf(vals, *args)
+
+    u = (2*xp.arange(1, n+1, dtype=n.dtype) - 1)/(2*n)
+    w = 1/(12*n) + xp.sum((u - cdfvals)**2, axis=-1)
+
+    # avoid small negative values that can occur due to the approximation
+    p = xp.clip(1. - _cdf_cvm(w, n), 0., None)
+
+    return CramerVonMisesResult(statistic=w, pvalue=p)
+
+
+def _get_wilcoxon_distr(n):
+    """
+    Probability distribution of the Wilcoxon signed-rank statistic r_plus (sum
+    of ranks of positive differences).
+    Returns an array with the probabilities of all the possible ranks
+    r = 0, ..., n*(n+1)/2
+    """
+    c = np.ones(1, dtype=np.float64)
+    for k in range(1, n + 1):
+        prev_c = c
+        c = np.zeros(k * (k + 1) // 2 + 1, dtype=np.float64)
+        m = len(prev_c)
+        c[:m] = prev_c * 0.5
+        c[-m:] += prev_c * 0.5
+    return c
+
+
+def _get_wilcoxon_distr2(n):
+    """
+    Probability distribution of the Wilcoxon signed-rank statistic r_plus (sum
+    of ranks of positive differences).
+    Returns an array with the probabilities of all the possible ranks
+    r = 0, ..., n*(n+1)/2
+    This is a slower reference function
+    References
+    ----------
+    .. [1] Harris T, Hardin JW. Exact Wilcoxon Signed-Rank and Wilcoxon
+        Mann-Whitney Ranksum Tests. The Stata Journal. 2013;13(2):337-343.
+    """
+    ai = np.arange(1, n+1)[:, None]
+    t = n*(n+1)/2
+    q = 2*t
+    j = np.arange(q)
+    theta = 2*np.pi/q*j
+    phi_sp = np.prod(np.cos(theta*ai), axis=0)
+    phi_s = np.exp(1j*theta*t) * phi_sp
+    p = np.real(ifft(phi_s))
+    res = np.zeros(int(t)+1)
+    res[:-1:] = p[::2]
+    res[0] /= 2
+    res[-1] = res[0]
+    return res
+
+
+def _tau_b(A):
+    """Calculate Kendall's tau-b and p-value from contingency table."""
+    # See [2] 2.2 and 4.2
+
+    # contingency table must be truly 2D
+    if A.shape[0] == 1 or A.shape[1] == 1:
+        return np.nan, np.nan
+
+    NA = A.sum()
+    PA = _P(A)
+    QA = _Q(A)
+    Sri2 = (A.sum(axis=1)**2).sum()
+    Scj2 = (A.sum(axis=0)**2).sum()
+    denominator = (NA**2 - Sri2)*(NA**2 - Scj2)
+
+    tau = (PA-QA)/(denominator)**0.5
+
+    numerator = 4*(_a_ij_Aij_Dij2(A) - (PA - QA)**2 / NA)
+    s02_tau_b = numerator/denominator
+    if s02_tau_b == 0:  # Avoid divide by zero
+        return tau, 0
+    Z = tau/s02_tau_b**0.5
+    p = 2*norm.sf(abs(Z))  # 2-sided p-value
+
+    return tau, p
+
+
+def _somers_d(A, alternative='two-sided'):
+    """Calculate Somers' D and p-value from contingency table."""
+    # See [3] page 1740
+
+    # contingency table must be truly 2D
+    if A.shape[0] <= 1 or A.shape[1] <= 1:
+        return np.nan, np.nan
+
+    NA = A.sum()
+    NA2 = NA**2
+    PA = _P(A)
+    QA = _Q(A)
+    Sri2 = (A.sum(axis=1)**2).sum()
+
+    d = (PA - QA)/(NA2 - Sri2)
+
+    S = _a_ij_Aij_Dij2(A) - (PA-QA)**2/NA
+
+    with np.errstate(divide='ignore'):
+        Z = (PA - QA)/(4*(S))**0.5
+
+    norm = _stats_py._SimpleNormal()
+    p = _stats_py._get_pvalue(Z, norm, alternative, xp=np)
+
+    return d, p
+
+
+@dataclass
+class SomersDResult:
+    statistic: float
+    pvalue: float
+    table: np.ndarray
+
+
+@xp_capabilities(np_only=True)
+def somersd(x, y=None, alternative='two-sided'):
+    r"""Calculates Somers' D, an asymmetric measure of ordinal association.
+
+    Like Kendall's :math:`\tau`, Somers' :math:`D` is a measure of the
+    correspondence between two rankings. Both statistics consider the
+    difference between the number of concordant and discordant pairs in two
+    rankings :math:`X` and :math:`Y`, and both are normalized such that values
+    close to 1 indicate strong agreement and values close to -1 indicate
+    strong disagreement. They differ in how they are normalized. To show the
+    relationship, Somers' :math:`D` can be defined in terms of Kendall's
+    :math:`\tau_a`:
+
+    .. math::
+        D(Y|X) = \frac{\tau_a(X, Y)}{\tau_a(X, X)}
+
+    Suppose the first ranking :math:`X` has :math:`r` distinct ranks and the
+    second ranking :math:`Y` has :math:`s` distinct ranks. These two lists of
+    :math:`n` rankings can also be viewed as an :math:`r \times s` contingency
+    table in which element :math:`i, j` is the number of rank pairs with rank
+    :math:`i` in ranking :math:`X` and rank :math:`j` in ranking :math:`Y`.
+    Accordingly, `somersd` also allows the input data to be supplied as a
+    single, 2D contingency table instead of as two separate, 1D rankings.
+
+    Note that the definition of Somers' :math:`D` is asymmetric: in general,
+    :math:`D(Y|X) \neq D(X|Y)`. ``somersd(x, y)`` calculates Somers'
+    :math:`D(Y|X)`: the "row" variable :math:`X` is treated as an independent
+    variable, and the "column" variable :math:`Y` is dependent. For Somers'
+    :math:`D(X|Y)`, swap the input lists or transpose the input table.
+
+    Parameters
+    ----------
+    x : array_like
+        1D array of rankings, treated as the (row) independent variable.
+        Alternatively, a 2D contingency table.
+    y : array_like, optional
+        If `x` is a 1D array of rankings, `y` is a 1D array of rankings of the
+        same length, treated as the (column) dependent variable.
+        If `x` is 2D, `y` is ignored.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis. Default is 'two-sided'.
+        The following options are available:
+        * 'two-sided': the rank correlation is nonzero
+        * 'less': the rank correlation is negative (less than zero)
+        * 'greater':  the rank correlation is positive (greater than zero)
+
+    Returns
+    -------
+    res : SomersDResult
+        A `SomersDResult` object with the following fields:
+
+            statistic : float
+               The Somers' :math:`D` statistic.
+            pvalue : float
+               The p-value for a hypothesis test whose null
+               hypothesis is an absence of association, :math:`D=0`.
+               See notes for more information.
+            table : 2D array
+               The contingency table formed from rankings `x` and `y` (or the
+               provided contingency table, if `x` is a 2D array)
+
+    See Also
+    --------
+    kendalltau : Calculates Kendall's tau, another correlation measure.
+    weightedtau : Computes a weighted version of Kendall's tau.
+    spearmanr : Calculates a Spearman rank-order correlation coefficient.
+    pearsonr : Calculates a Pearson correlation coefficient.
+
+    Notes
+    -----
+    This function follows the contingency table approach of [2]_ and
+    [3]_. *p*-values are computed based on an asymptotic approximation of
+    the test statistic distribution under the null hypothesis :math:`D=0`.
+
+    Theoretically, hypothesis tests based on Kendall's :math:`\tau` and Somers'
+    :math:`D` should be identical.
+    However, the *p*-values returned by `kendalltau` are based
+    on the null hypothesis of *independence* between :math:`X` and :math:`Y`
+    (i.e. the population from which pairs in :math:`X` and :math:`Y` are
+    sampled contains equal numbers of all possible pairs), which is more
+    specific than the null hypothesis :math:`D=0` used here. If the null
+    hypothesis of independence is desired, it is acceptable to use the
+    *p*-value returned by `kendalltau` with the statistic returned by
+    `somersd` and vice versa. For more information, see [2]_.
+
+    Contingency tables are formatted according to the convention used by
+    SAS and R: the first ranking supplied (``x``) is the "row" variable, and
+    the second ranking supplied (``y``) is the "column" variable. This is
+    opposite the convention of Somers' original paper [1]_.
+
+    References
+    ----------
+    .. [1] Robert H. Somers, "A New Asymmetric Measure of Association for
+           Ordinal Variables", *American Sociological Review*, Vol. 27, No. 6,
+           pp. 799--811, 1962.
+
+    .. [2] Morton B. Brown and Jacqueline K. Benedetti, "Sampling Behavior of
+           Tests for Correlation in Two-Way Contingency Tables", *Journal of
+           the American Statistical Association* Vol. 72, No. 358, pp.
+           309--315, 1977.
+
+    .. [3] SAS Institute, Inc., "The FREQ Procedure (Book Excerpt)",
+           *SAS/STAT 9.2 User's Guide, Second Edition*, SAS Publishing, 2009.
+
+    .. [4] Laerd Statistics, "Somers' d using SPSS Statistics", *SPSS
+           Statistics Tutorials and Statistical Guides*,
+           https://statistics.laerd.com/spss-tutorials/somers-d-using-spss-statistics.php,
+           Accessed July 31, 2020.
+
+    Examples
+    --------
+    We calculate Somers' D for the example given in [4]_, in which a hotel
+    chain owner seeks to determine the association between hotel room
+    cleanliness and customer satisfaction. The independent variable, hotel
+    room cleanliness, is ranked on an ordinal scale: "below average (1)",
+    "average (2)", or "above average (3)". The dependent variable, customer
+    satisfaction, is ranked on a second scale: "very dissatisfied (1)",
+    "moderately dissatisfied (2)", "neither dissatisfied nor satisfied (3)",
+    "moderately satisfied (4)", or "very satisfied (5)". 189 customers
+    respond to the survey, and the results are cast into a contingency table
+    with the hotel room cleanliness as the "row" variable and customer
+    satisfaction as the "column" variable.
+
+    +-----+-----+-----+-----+-----+-----+
+    |     | (1) | (2) | (3) | (4) | (5) |
+    +=====+=====+=====+=====+=====+=====+
+    | (1) | 27  | 25  | 14  | 7   | 0   |
+    +-----+-----+-----+-----+-----+-----+
+    | (2) | 7   | 14  | 18  | 35  | 12  |
+    +-----+-----+-----+-----+-----+-----+
+    | (3) | 1   | 3   | 2   | 7   | 17  |
+    +-----+-----+-----+-----+-----+-----+
+
+    For example, 27 customers assigned their room a cleanliness ranking of
+    "below average (1)" and a corresponding satisfaction of "very
+    dissatisfied (1)". We perform the analysis as follows.
+
+    >>> from scipy.stats import somersd
+    >>> table = [[27, 25, 14, 7, 0], [7, 14, 18, 35, 12], [1, 3, 2, 7, 17]]
+    >>> res = somersd(table)
+    >>> res.statistic
+    0.6032766111513396
+    >>> res.pvalue
+    1.0007091191074533e-27
+
+    The value of the Somers' D statistic is approximately 0.6, indicating
+    a positive correlation between room cleanliness and customer satisfaction
+    in the sample.
+    The *p*-value is very small, indicating a very small probability of
+    observing such an extreme value of the statistic under the null
+    hypothesis that the statistic of the entire population (from which
+    our sample of 189 customers is drawn) is zero. This supports the
+    alternative hypothesis that the true value of Somers' D for the population
+    is nonzero.
+
+    """
+    x, y = np.array(x), np.array(y)
+    if x.ndim == 1:
+        if x.size != y.size:
+            raise ValueError("Rankings must be of equal length.")
+        table = scipy.stats.contingency.crosstab(x, y)[1]
+    elif x.ndim == 2:
+        if np.any(x < 0):
+            raise ValueError("All elements of the contingency table must be "
+                             "non-negative.")
+        if np.any(x != x.astype(int)):
+            raise ValueError("All elements of the contingency table must be "
+                             "integer.")
+        if x.nonzero()[0].size < 2:
+            raise ValueError("At least two elements of the contingency table "
+                             "must be nonzero.")
+        table = x
+    else:
+        raise ValueError("x must be either a 1D or 2D array")
+    # The table type is converted to a float to avoid an integer overflow
+    d, p = _somers_d(table.astype(float), alternative)
+
+    # add alias for consistency with other correlation functions
+    res = SomersDResult(d, p, table)
+    res.correlation = d
+    return res
+
+
+# This could be combined with `_all_partitions` in `_resampling.py`
+def _all_partitions(nx, ny):
+    """
+    Partition a set of indices into two fixed-length sets in all possible ways
+
+    Partition a set of indices 0 ... nx + ny - 1 into two sets of length nx and
+    ny in all possible ways (ignoring order of elements).
+    """
+    z = np.arange(nx+ny)
+    for c in combinations(z, nx):
+        x = np.array(c)
+        mask = np.ones(nx+ny, bool)
+        mask[x] = False
+        y = z[mask]
+        yield x, y
+
+
+def _compute_log_combinations(n):
+    """Compute all log combination of C(n, k)."""
+    gammaln_arr = gammaln(np.arange(n + 1) + 1)
+    return gammaln(n + 1) - gammaln_arr - gammaln_arr[::-1]
+
+
+@dataclass
+class BarnardExactResult:
+    statistic: float
+    pvalue: float
+
+
+@xp_capabilities(np_only=True)
+def barnard_exact(table, alternative="two-sided", pooled=True, n=32):
+    r"""Perform a Barnard exact test on a 2x2 contingency table.
+
+    Parameters
+    ----------
+    table : array_like of ints
+        A 2x2 contingency table.  Elements should be non-negative integers.
+
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the null and alternative hypotheses. Default is 'two-sided'.
+        Please see explanations in the Notes section below.
+
+    pooled : bool, optional
+        Whether to compute score statistic with pooled variance (as in
+        Student's t-test, for example) or unpooled variance (as in Welch's
+        t-test). Default is ``True``.
+
+    n : int, optional
+        Number of sampling points used in the construction of the sampling
+        method. Note that this argument will automatically be converted to
+        the next higher power of 2 since `scipy.stats.qmc.Sobol` is used to
+        select sample points. Default is 32. Must be positive. In most cases,
+        32 points is enough to reach good precision. More points come at
+        a performance cost.
+
+    Returns
+    -------
+    ber : BarnardExactResult
+        A result object with the following attributes.
+
+        statistic : float
+            The Wald statistic with pooled or unpooled variance, depending
+            on the user choice of `pooled`.
+
+        pvalue : float
+            P-value, the probability of obtaining a distribution at least as
+            extreme as the one that was actually observed, assuming that the
+            null hypothesis is true.
+
+    See Also
+    --------
+    chi2_contingency : Chi-square test of independence of variables in a
+        contingency table.
+    fisher_exact : Fisher exact test on a 2x2 contingency table.
+    boschloo_exact : Boschloo's exact test on a 2x2 contingency table,
+        which is a uniformly more powerful alternative to Fisher's exact test.
+
+    Notes
+    -----
+    Barnard's test is an exact test used in the analysis of contingency
+    tables. It examines the association of two categorical variables, and
+    is a more powerful alternative to Fisher's exact test
+    for 2x2 contingency tables.
+
+    Let's define :math:`X_0` a 2x2 matrix representing the observed sample,
+    where each column stores the binomial experiment, as in the example
+    below. Let's also define :math:`p_1, p_2` the theoretical binomial
+    probabilities for  :math:`x_{11}` and :math:`x_{12}`. When using
+    Barnard exact test, we can assert three different null hypotheses:
+
+    - :math:`H_0 : p_1 \geq p_2` versus :math:`H_1 : p_1 < p_2`,
+      with `alternative` = "less"
+
+    - :math:`H_0 : p_1 \leq p_2` versus :math:`H_1 : p_1 > p_2`,
+      with `alternative` = "greater"
+
+    - :math:`H_0 : p_1 = p_2` versus :math:`H_1 : p_1 \neq p_2`,
+      with `alternative` = "two-sided" (default one)
+
+    In order to compute Barnard's exact test, we are using the Wald
+    statistic [3]_ with pooled or unpooled variance.
+    Under the default assumption that both variances are equal
+    (``pooled = True``), the statistic is computed as:
+
+    .. math::
+
+        T(X) = \frac{
+            \hat{p}_1 - \hat{p}_2
+        }{
+            \sqrt{
+                \hat{p}(1 - \hat{p})
+                (\frac{1}{c_1} +
+                \frac{1}{c_2})
+            }
+        }
+
+    with :math:`\hat{p}_1, \hat{p}_2` and :math:`\hat{p}` the estimator of
+    :math:`p_1, p_2` and :math:`p`, the latter being the combined probability,
+    given the assumption that :math:`p_1 = p_2`.
+
+    If this assumption is invalid (``pooled = False``), the statistic is:
+
+    .. math::
+
+        T(X) = \frac{
+            \hat{p}_1 - \hat{p}_2
+        }{
+            \sqrt{
+                \frac{\hat{p}_1 (1 - \hat{p}_1)}{c_1} +
+                \frac{\hat{p}_2 (1 - \hat{p}_2)}{c_2}
+            }
+        }
+
+    The p-value is then computed as:
+
+    .. math::
+
+        \sum
+            \binom{c_1}{x_{11}}
+            \binom{c_2}{x_{12}}
+            \pi^{x_{11} + x_{12}}
+            (1 - \pi)^{t - x_{11} - x_{12}}
+
+    where the sum is over all  2x2 contingency tables :math:`X` such that:
+    * :math:`T(X) \leq T(X_0)` when `alternative` = "less",
+    * :math:`T(X) \geq T(X_0)` when `alternative` = "greater", or
+    * :math:`T(X) \geq |T(X_0)|` when `alternative` = "two-sided".
+    Above, :math:`c_1, c_2` are the sum of the columns 1 and 2,
+    and :math:`t` is the total (sum of the four elements of the sample).
+
+    The returned p-value is the maximum p-value taken over the nuisance
+    parameter :math:`\pi`, where :math:`0 \leq \pi \leq 1`.
+
+    This function's complexity is :math:`O(n c_1 c_2)`, where `n` is the
+    number of sample points.
+
+    References
+    ----------
+    .. [1] Barnard, G. A. "Significance Tests for 2x2 Tables". *Biometrika*.
+           34.1/2 (1947): 123-138. :doi:`dpgkg3`
+
+    .. [2] Mehta, Cyrus R., and Pralay Senchaudhuri. "Conditional versus
+           unconditional exact tests for comparing two binomials."
+           *Cytel Software Corporation* 675 (2003): 1-5.
+
+    .. [3] "Wald Test". *Wikipedia*. https://en.wikipedia.org/wiki/Wald_test
+
+    Examples
+    --------
+    An example use of Barnard's test is presented in [2]_.
+
+        Consider the following example of a vaccine efficacy study
+        (Chan, 1998). In a randomized clinical trial of 30 subjects, 15 were
+        inoculated with a recombinant DNA influenza vaccine and the 15 were
+        inoculated with a placebo. Twelve of the 15 subjects in the placebo
+        group (80%) eventually became infected with influenza whereas for the
+        vaccine group, only 7 of the 15 subjects (47%) became infected. The
+        data are tabulated as a 2 x 2 table::
+
+                Vaccine  Placebo
+            Yes     7        12
+            No      8        3
+
+    When working with statistical hypothesis testing, we usually use a
+    threshold probability or significance level upon which we decide
+    to reject the null hypothesis :math:`H_0`. Suppose we choose the common
+    significance level of 5%.
+
+    Our alternative hypothesis is that the vaccine will lower the chance of
+    becoming infected with the virus; that is, the probability :math:`p_1` of
+    catching the virus with the vaccine will be *less than* the probability
+    :math:`p_2` of catching the virus without the vaccine.  Therefore, we call
+    `barnard_exact` with the ``alternative="less"`` option:
+
+    >>> import scipy.stats as stats
+    >>> res = stats.barnard_exact([[7, 12], [8, 3]], alternative="less")
+    >>> res.statistic
+    -1.894
+    >>> res.pvalue
+    0.03407
+
+    Under the null hypothesis that the vaccine will not lower the chance of
+    becoming infected, the probability of obtaining test results at least as
+    extreme as the observed data is approximately 3.4%. Since this p-value is
+    less than our chosen significance level, we have evidence to reject
+    :math:`H_0` in favor of the alternative.
+
+    Suppose we had used Fisher's exact test instead:
+
+    >>> _, pvalue = stats.fisher_exact([[7, 12], [8, 3]], alternative="less")
+    >>> pvalue
+    0.0640
+
+    With the same threshold significance of 5%, we would not have been able
+    to reject the null hypothesis in favor of the alternative. As stated in
+    [2]_, Barnard's test is uniformly more powerful than Fisher's exact test
+    because Barnard's test does not condition on any margin. Fisher's test
+    should only be used when both sets of marginals are fixed.
+
+    """
+    if n <= 0:
+        raise ValueError(
+            "Number of points `n` must be strictly positive, "
+            f"found {n!r}"
+        )
+
+    table = np.asarray(table, dtype=np.int64)
+
+    if not table.shape == (2, 2):
+        raise ValueError("The input `table` must be of shape (2, 2).")
+
+    if np.any(table < 0):
+        raise ValueError("All values in `table` must be nonnegative.")
+
+    if 0 in table.sum(axis=0):
+        # If both values in column are zero, the p-value is 1 and
+        # the score's statistic is NaN.
+        return BarnardExactResult(np.nan, 1.0)
+
+    total_col_1, total_col_2 = table.sum(axis=0)
+
+    x1 = np.arange(total_col_1 + 1, dtype=np.int64).reshape(-1, 1)
+    x2 = np.arange(total_col_2 + 1, dtype=np.int64).reshape(1, -1)
+
+    # We need to calculate the wald statistics for each combination of x1 and
+    # x2.
+    p1, p2 = x1 / total_col_1, x2 / total_col_2
+
+    if pooled:
+        p = (x1 + x2) / (total_col_1 + total_col_2)
+        variances = p * (1 - p) * (1 / total_col_1 + 1 / total_col_2)
+    else:
+        variances = p1 * (1 - p1) / total_col_1 + p2 * (1 - p2) / total_col_2
+
+    # To avoid warning when dividing by 0
+    with np.errstate(divide="ignore", invalid="ignore"):
+        wald_statistic = np.divide((p1 - p2), np.sqrt(variances))
+
+    wald_statistic[p1 == p2] = 0  # Removing NaN values
+
+    wald_stat_obs = wald_statistic[table[0, 0], table[0, 1]]
+
+    if alternative == "two-sided":
+        index_arr = np.abs(wald_statistic) >= abs(wald_stat_obs)
+    elif alternative == "less":
+        index_arr = wald_statistic <= wald_stat_obs
+    elif alternative == "greater":
+        index_arr = wald_statistic >= wald_stat_obs
+    else:
+        msg = (
+            "`alternative` should be one of {'two-sided', 'less', 'greater'},"
+            f" found {alternative!r}"
+        )
+        raise ValueError(msg)
+
+    x1_sum_x2 = x1 + x2
+
+    x1_log_comb = _compute_log_combinations(total_col_1)
+    x2_log_comb = _compute_log_combinations(total_col_2)
+    x1_sum_x2_log_comb = x1_log_comb[x1] + x2_log_comb[x2]
+
+    result = shgo(
+        _get_binomial_log_p_value_with_nuisance_param,
+        args=(x1_sum_x2, x1_sum_x2_log_comb, index_arr),
+        bounds=((0, 1),),
+        n=n,
+        sampling_method="sobol",
+    )
+
+    # result.fun is the negative log pvalue and therefore needs to be
+    # changed before return
+    p_value = np.clip(np.exp(-result.fun), a_min=0, a_max=1)
+    return BarnardExactResult(wald_stat_obs, p_value)
+
+
+@dataclass
+class BoschlooExactResult:
+    statistic: float
+    pvalue: float
+
+
+@xp_capabilities(np_only=True)
+def boschloo_exact(table, alternative="two-sided", n=32):
+    r"""Perform Boschloo's exact test on a 2x2 contingency table.
+
+    Parameters
+    ----------
+    table : array_like of ints
+        A 2x2 contingency table.  Elements should be non-negative integers.
+
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the null and alternative hypotheses. Default is 'two-sided'.
+        Please see explanations in the Notes section below.
+
+    n : int, optional
+        Number of sampling points used in the construction of the sampling
+        method. Note that this argument will automatically be converted to
+        the next higher power of 2 since `scipy.stats.qmc.Sobol` is used to
+        select sample points. Default is 32. Must be positive. In most cases,
+        32 points is enough to reach good precision. More points come at
+        a performance cost.
+
+    Returns
+    -------
+    ber : BoschlooExactResult
+        A result object with the following attributes.
+
+        statistic : float
+            The statistic used in Boschloo's test; that is, the p-value
+            from Fisher's exact test.
+
+        pvalue : float
+            P-value, the probability of obtaining a distribution at least as
+            extreme as the one that was actually observed, assuming that the
+            null hypothesis is true.
+
+    See Also
+    --------
+    chi2_contingency : Chi-square test of independence of variables in a
+        contingency table.
+    fisher_exact : Fisher exact test on a 2x2 contingency table.
+    barnard_exact : Barnard's exact test, which is a more powerful alternative
+        to Fisher's exact test for 2x2 contingency tables.
+
+    Notes
+    -----
+    Boschloo's test is an exact test used in the analysis of contingency
+    tables. It examines the association of two categorical variables, and
+    is a uniformly more powerful alternative to Fisher's exact test
+    for 2x2 contingency tables.
+
+    Boschloo's exact test uses the p-value of Fisher's exact test as a
+    statistic, and Boschloo's p-value is the probability under the null
+    hypothesis of observing such an extreme value of this statistic.
+
+    Let's define :math:`X_0` a 2x2 matrix representing the observed sample,
+    where each column stores the binomial experiment, as in the example
+    below. Let's also define :math:`p_1, p_2` the theoretical binomial
+    probabilities for  :math:`x_{11}` and :math:`x_{12}`. When using
+    Boschloo exact test, we can assert three different alternative hypotheses:
+
+    - :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 < p_2`,
+      with `alternative` = "less"
+
+    - :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 > p_2`,
+      with `alternative` = "greater"
+
+    - :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 \neq p_2`,
+      with `alternative` = "two-sided" (default)
+
+    There are multiple conventions for computing a two-sided p-value when the
+    null distribution is asymmetric. Here, we apply the convention that the
+    p-value of a two-sided test is twice the minimum of the p-values of the
+    one-sided tests (clipped to 1.0). Note that `fisher_exact` follows a
+    different convention, so for a given `table`, the statistic reported by
+    `boschloo_exact` may differ from the p-value reported by `fisher_exact`
+    when ``alternative='two-sided'``.
+
+    .. versionadded:: 1.7.0
+
+    References
+    ----------
+    .. [1] R.D. Boschloo. "Raised conditional level of significance for the
+       2 x 2-table when testing the equality of two probabilities",
+       Statistica Neerlandica, 24(1), 1970
+
+    .. [2] "Boschloo's test", Wikipedia,
+       https://en.wikipedia.org/wiki/Boschloo%27s_test
+
+    .. [3] Lise M. Saari et al. "Employee attitudes and job satisfaction",
+       Human Resource Management, 43(4), 395-407, 2004,
+       :doi:`10.1002/hrm.20032`.
+
+    Examples
+    --------
+    In the following example, we consider the article "Employee
+    attitudes and job satisfaction" [3]_
+    which reports the results of a survey from 63 scientists and 117 college
+    professors. Of the 63 scientists, 31 said they were very satisfied with
+    their jobs, whereas 74 of the college professors were very satisfied
+    with their work. Is this significant evidence that college
+    professors are happier with their work than scientists?
+    The following table summarizes the data mentioned above::
+
+                         college professors   scientists
+        Very Satisfied   74                     31
+        Dissatisfied     43                     32
+
+    When working with statistical hypothesis testing, we usually use a
+    threshold probability or significance level upon which we decide
+    to reject the null hypothesis :math:`H_0`. Suppose we choose the common
+    significance level of 5%.
+
+    Our alternative hypothesis is that college professors are truly more
+    satisfied with their work than scientists. Therefore, we expect
+    :math:`p_1` the proportion of very satisfied college professors to be
+    greater than :math:`p_2`, the proportion of very satisfied scientists.
+    We thus call `boschloo_exact` with the ``alternative="greater"`` option:
+
+    >>> import scipy.stats as stats
+    >>> res = stats.boschloo_exact([[74, 31], [43, 32]], alternative="greater")
+    >>> res.statistic
+    0.0483
+    >>> res.pvalue
+    0.0355
+
+    Under the null hypothesis that scientists are happier in their work than
+    college professors, the probability of obtaining test
+    results at least as extreme as the observed data is approximately 3.55%.
+    Since this p-value is less than our chosen significance level, we have
+    evidence to reject :math:`H_0` in favor of the alternative hypothesis.
+
+    """
+    hypergeom = distributions.hypergeom
+
+    if n <= 0:
+        raise ValueError(
+            "Number of points `n` must be strictly positive,"
+            f" found {n!r}"
+        )
+
+    table = np.asarray(table, dtype=np.int64)
+
+    if not table.shape == (2, 2):
+        raise ValueError("The input `table` must be of shape (2, 2).")
+
+    if np.any(table < 0):
+        raise ValueError("All values in `table` must be nonnegative.")
+
+    if 0 in table.sum(axis=0):
+        # If both values in a column are zero, the test is undefined:
+        # both the statistic and the p-value are reported as NaN.
+        return BoschlooExactResult(np.nan, np.nan)
+
+    total_col_1, total_col_2 = table.sum(axis=0)
+    total = total_col_1 + total_col_2
+    x1 = np.arange(total_col_1 + 1, dtype=np.int64).reshape(1, -1)
+    x2 = np.arange(total_col_2 + 1, dtype=np.int64).reshape(-1, 1)
+    x1_sum_x2 = x1 + x2
+
+    if alternative == 'less':
+        pvalues = hypergeom.cdf(x1, total, x1_sum_x2, total_col_1).T
+    elif alternative == 'greater':
+        # Same formula as the 'less' case, but with the second column.
+        pvalues = hypergeom.cdf(x2, total, x1_sum_x2, total_col_2).T
+    elif alternative == 'two-sided':
+        boschloo_less = boschloo_exact(table, alternative="less", n=n)
+        boschloo_greater = boschloo_exact(table, alternative="greater", n=n)
+
+        res = (
+            boschloo_less if boschloo_less.pvalue < boschloo_greater.pvalue
+            else boschloo_greater
+        )
+
+        # Two-sided p-value is defined as twice the minimum of the one-sided
+        # p-values
+        pvalue = np.clip(2 * res.pvalue, a_min=0, a_max=1)
+        return BoschlooExactResult(res.statistic, pvalue)
+    else:
+        msg = (
+            f"`alternative` should be one of {'two-sided', 'less', 'greater'},"
+            f" found {alternative!r}"
+        )
+        raise ValueError(msg)
+
+    fisher_stat = pvalues[table[0, 0], table[0, 1]]
+
+    # fisher_stat * (1+1e-13) guards us from small numerical error. It is
+    # equivalent to np.isclose with relative tol of 1e-13 and absolute tol of 0
+    # For more throughout explanations, see gh-14178
+    index_arr = pvalues <= fisher_stat * (1+1e-13)
+
+    x1, x2, x1_sum_x2 = x1.T, x2.T, x1_sum_x2.T
+    x1_log_comb = _compute_log_combinations(total_col_1)
+    x2_log_comb = _compute_log_combinations(total_col_2)
+    x1_sum_x2_log_comb = x1_log_comb[x1] + x2_log_comb[x2]
+
+    result = shgo(
+        _get_binomial_log_p_value_with_nuisance_param,
+        args=(x1_sum_x2, x1_sum_x2_log_comb, index_arr),
+        bounds=((0, 1),),
+        n=n,
+        sampling_method="sobol",
+    )
+
+    # result.fun is the negative log pvalue and therefore needs to be
+    # changed before return
+    p_value = np.clip(np.exp(-result.fun), a_min=0, a_max=1)
+    return BoschlooExactResult(fisher_stat, p_value)
+
+
+def _get_binomial_log_p_value_with_nuisance_param(
+    nuisance_param, x1_sum_x2, x1_sum_x2_log_comb, index_arr
+):
+    r"""
+    Compute the negative log p-value for one value of the nuisance
+    parameter, considering a 2x2 sample space.
+
+    Parameters
+    ----------
+    nuisance_param : float
+        nuisance parameter used in the computation of the maximisation of
+        the p-value. Must be between 0 and 1
+
+    x1_sum_x2 : ndarray
+        2-D array holding the sum of x1 and x2 for every candidate table
+
+    x1_sum_x2_log_comb : ndarray
+        sum of the log combination of x1 and x2
+
+    index_arr : ndarray of boolean
+        mask selecting the candidate tables whose probabilities are summed
+
+    Returns
+    -------
+    neg_log_pvalue : float
+        Negative logarithm of the p-value obtained for `nuisance_param`;
+        the maximisation over the nuisance parameter is left to the caller
+
+    Notes
+    -----
+    Both Barnard's test and Boschloo's test iterate over a nuisance parameter
+    :math:`\pi \in [0, 1]` to find the maximum p-value. To search for this
+    maximum, this function returns the negative log p-value with respect to
+    the nuisance parameter passed in `nuisance_param`. This negative log
+    p-value is then minimised with `shgo`; the minimum of the negative log
+    p-value corresponds to the maximum p-value.
+
+    Also, the combinations used in the p-values' computation formula are
+    received in log form (`x1_sum_x2_log_comb`); they are presumably computed
+    with `gammaln`, which is more tolerant of large values than
+    `scipy.special.comb`. For a small loss of precision, performance is
+    improved a lot.
+    """
+    t1, t2 = x1_sum_x2.shape
+    n = t1 + t2 - 2  # assumes shape (n1 + 1, n2 + 1), so that n == n1 + n2
+    with np.errstate(divide="ignore", invalid="ignore"):
+        log_nuisance = np.log(
+            nuisance_param,
+            out=np.zeros_like(nuisance_param),
+            where=nuisance_param >= 0,
+        )
+        log_1_minus_nuisance = np.log(
+            1 - nuisance_param,
+            out=np.zeros_like(nuisance_param),
+            where=1 - nuisance_param >= 0,
+        )
+
+        nuisance_power_x1_x2 = log_nuisance * x1_sum_x2
+        nuisance_power_x1_x2[(x1_sum_x2 == 0)[:, :]] = 0  # 0 * log(0) -> 0
+
+        nuisance_power_n_minus_x1_x2 = log_1_minus_nuisance * (n - x1_sum_x2)
+        nuisance_power_n_minus_x1_x2[(x1_sum_x2 == n)[:, :]] = 0  # same guard
+
+        tmp_log_values_arr = (
+            x1_sum_x2_log_comb
+            + nuisance_power_x1_x2
+            + nuisance_power_n_minus_x1_x2
+        )
+
+    tmp_values_from_index = tmp_log_values_arr[index_arr]
+
+    # Log-sum-exp trick: the log values are shifted by their maximum before
+    # exponentiating, so that the largest term is exp(0) and cannot underflow
+    max_value = tmp_values_from_index.max()
+
+    # For better precision the p-value is kept in log space. The p-value
+    # lies in [0, 1]; taking its log maps that interval onto [-inf, 0],
+    # which gives the optimiser a much wider range to work with.
+    # NOTE: `log_probs` holds the summed (shifted) probabilities, not logs
+    with np.errstate(divide="ignore", invalid="ignore"):
+        log_probs = np.exp(tmp_values_from_index - max_value).sum()
+        log_pvalue = max_value + np.log(
+            log_probs,
+            out=np.full_like(log_probs, -np.inf),
+            where=log_probs > 0,
+        )
+
+    # Since shgo finds a minimum, minus log pvalue is returned
+    return -log_pvalue
+
+
+@np.vectorize(otypes=[np.float64])  # applied element-wise over the inputs
+def _pval_cvm_2samp_exact(s, m, n):
+    """
+    Compute the exact p-value of the Cramer-von Mises two-sample test
+    for a given value `s` of the statistic (`u` in `cramervonmises_2samp`).
+    `m` and `n` are the sizes of the samples; the result is a float64.
+
+    [1] Y. Xiao, A. Gordon, and A. Yakovlev, "A C++ Program for
+        the Cramér-Von Mises Two-Sample Test", J. Stat. Soft.,
+        vol. 17, no. 8, pp. 1-15, Dec. 2006.
+    [2] T. W. Anderson "On the Distribution of the Two-Sample Cramer-von Mises
+        Criterion," The Annals of Mathematical Statistics, Ann. Math. Statist.
+        33(3), 1148-1159, (September, 1962)
+    """
+
+    # [1, p. 3]
+    lcm = np.lcm(m, n)
+    # [1, p. 4], below eq. 3
+    a = lcm // m
+    b = lcm // n
+    # Combine Eq. 9 in [2] with Eq. 2 in [1] and solve for $\zeta$
+    # Hint: `s` is $U$ in [2], and $T_2$ in [1] is $T$ in [2]
+    mn = m * n
+    zeta = lcm ** 2 * (m + n) * (6 * s - mn * (4 * mn - 1)) // (6 * mn ** 2)
+
+    # bound maximum value that may appear in `gs` (remember both rows!)
+    zeta_bound = lcm**2 * (m + n)  # bound elements in row 1
+    combinations = math.comb(m + n, m)  # sum of row 2
+    max_gs = max(zeta_bound, combinations)
+    dtype = np.min_scalar_type(max_gs)  # smallest dtype able to hold max_gs
+
+    # the frequency table of $g_{u, v}^+$ defined in [1, p. 6]
+    gs = ([np.array([[0], [1]], dtype=dtype)]
+          + [np.empty((2, 0), dtype=dtype) for _ in range(m)])
+    for u in range(n + 1):
+        next_gs = []
+        tmp = np.empty((2, 0), dtype=dtype)
+        for v, g in enumerate(gs):
+            # Calculate g recursively with eq. 11 in [1]. Even though it
+            # doesn't look like it, this also does 12/13 (all of Algorithm 1).
+            vi, i0, i1 = np.intersect1d(tmp[0], g[0], return_indices=True)
+            tmp = np.concatenate([
+                np.stack([vi, tmp[1, i0] + g[1, i1]]),
+                np.delete(tmp, i0, 1),
+                np.delete(g, i1, 1)
+            ], 1)
+            res = (a * v - b * u) ** 2
+            tmp[0] += res.astype(dtype)
+            next_gs.append(tmp)
+        gs = next_gs
+    value, freq = gs[m]  # row 0: statistic values, row 1: their frequencies
+    return np.float64(np.sum(freq[value >= zeta]) / combinations)
+
+
+def _pval_cvm_2samp_asymptotic(t, N, nx, ny, k, *, xp):
+    # mean and variance of T (eq. 11, 14 in [2]); caller passes N=nx+ny, k=nx*ny
+    et = (1 + 1 / N) / 6
+    vt = (N + 1) * (4 * k * N - 3 * (nx ** 2 + ny ** 2) - 2 * k)
+    vt = vt / (45 * N ** 2 * 4 * k)
+
+    # compute the normalized statistic (eq. 15 in [2])
+    tn = 1 / 6 + (t - et) / math.sqrt(45 * vt)
+
+    # approximate the distribution of tn with the limiting distribution
+    # of the one-sample test statistic
+    # if tn < 0.003, _cdf_cvm_inf(tn) < 1.28*1e-18, so return 1.0 directly
+    p = xpx.apply_where(tn >= 0.003,
+                        (tn,),
+                        lambda tn: xp.clip(1. - _cdf_cvm_inf(tn, xp=xp), 0.),
+                        fill_value = 1.)
+    return p
+
+
+@xp_capabilities(skip_backends=[('cupy', 'needs rankdata'),
+                                ('dask.array', 'needs rankdata')],
+                 cpu_only=True, jax_jit=False)
+@_axis_nan_policy_factory(CramerVonMisesResult, n_samples=2, too_small=1,
+                          result_to_tuple=_cvm_result_to_tuple)
+def cramervonmises_2samp(x, y, method='auto', *, axis=0):
+    r"""Perform the two-sample Cramér-von Mises test for goodness of fit.
+
+    This is the two-sample version of the Cramér-von Mises test ([1]_):
+    for two independent samples :math:`X_1, ..., X_n` and
+    :math:`Y_1, ..., Y_m`, the null hypothesis is that the samples
+    come from the same (unspecified) continuous distribution.
+
+    The test statistic :math:`T` is defined as in [1]_:
+
+    .. math::
+        T = \frac{nm}{n+m}\omega^2 =
+        \frac{U}{n m (n+m)} - \frac{4 m n - 1}{6(m+n)}
+
+    where :math:`U` is defined as below, and :math:`\omega^2` is the Cramér-von
+    Mises criterion. The function :math:`r(\cdot)` here denotes the rank of the
+    observed values :math:`x_i` and :math:`y_j` within the pooled sample of size
+    :math:`n + m`, with ties assigned mid-rank values:
+
+    .. math::
+        U = n \sum_{i=1}^n (r(x_i)-i)^2 + m \sum_{j=1}^m (r(y_j)-j)^2
+
+    Parameters
+    ----------
+    x : array_like
+        A 1-D array of observed values of the random variables :math:`X_i`.
+        Must contain at least two observations.
+    y : array_like
+        A 1-D array of observed values of the random variables :math:`Y_i`.
+        Must contain at least two observations.
+    method : {'auto', 'asymptotic', 'exact'}, optional
+        The method used to compute the p-value, see Notes for details.
+        The default is 'auto'.
+    axis : int or tuple of ints, default: 0
+        If an int or tuple of ints, the axis or axes of the input along which
+        to compute the statistic. The statistic of each axis-slice (e.g. row)
+        of the input will appear in a corresponding element of the output.
+        If ``None``, the input will be raveled before computing the statistic.
+
+    Returns
+    -------
+    res : object with attributes
+        statistic : float
+            Cramér-von Mises statistic :math:`T`.
+        pvalue : float
+            The p-value.
+
+    See Also
+    --------
+    cramervonmises, anderson_ksamp, epps_singleton_2samp, ks_2samp
+
+    Notes
+    -----
+    .. versionadded:: 1.7.0
+
+    The statistic is computed according to equation 9 in [2]_. The
+    calculation of the p-value depends on the keyword `method`:
+
+    - ``asymptotic``: The p-value is approximated by using the limiting
+      distribution of the test statistic.
+    - ``exact``: The exact p-value is computed by enumerating all
+      possible combinations of the test statistic, see [2]_.
+
+    If ``method='auto'``, the exact approach is used
+    if both samples contain 20 or fewer observations,
+    otherwise the asymptotic distribution is used.
+
+    If the underlying distribution is not continuous, the p-value is likely to
+    be conservative (Section 6.2 in [3]_). When ranking the data to compute
+    the test statistic, midranks are used if there are ties.
+
+    References
+    ----------
+    .. [1] https://en.wikipedia.org/wiki/Cramer-von_Mises_criterion
+    .. [2] Anderson, T.W. (1962). On the distribution of the two-sample
+           Cramer-von-Mises criterion. The Annals of Mathematical
+           Statistics, pp. 1148-1159.
+    .. [3] Conover, W.J., Practical Nonparametric Statistics, 1971.
+
+    Examples
+    --------
+
+    Suppose we wish to test whether two samples generated by
+    ``scipy.stats.norm.rvs`` have the same distribution. We choose a
+    significance level of alpha=0.05.
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng()
+    >>> x = stats.norm.rvs(size=100, random_state=rng)
+    >>> y = stats.norm.rvs(size=70, random_state=rng)
+    >>> res = stats.cramervonmises_2samp(x, y)
+    >>> res.statistic, res.pvalue
+    (0.29376470588235293, 0.1412873014573014)
+
+    The p-value exceeds our chosen significance level, so we do not
+    reject the null hypothesis that the observed samples are drawn from the
+    same distribution.
+
+    For small sample sizes, one can compute the exact p-values:
+
+    >>> x = stats.norm.rvs(size=7, random_state=rng)
+    >>> y = stats.t.rvs(df=2, size=6, random_state=rng)
+    >>> res = stats.cramervonmises_2samp(x, y, method='exact')
+    >>> res.statistic, res.pvalue
+    (0.197802197802198, 0.31643356643356646)
+
+    The p-value based on the asymptotic distribution is a good approximation
+    even though the sample size is small.
+
+    >>> res = stats.cramervonmises_2samp(x, y, method='asymptotic')
+    >>> res.statistic, res.pvalue
+    (0.197802197802198, 0.2966041181527128)
+
+    Independent of the method, one would not reject the null hypothesis at the
+    chosen significance level in this example.
+
+    """
+    xp = array_namespace(x, y)
+    nx = x.shape[-1]
+    ny = y.shape[-1]
+
+    if nx <= 1 or ny <= 1:  # only needed for testing / `test_axis_nan_policy`
+        raise ValueError('x and y must contain at least two observations.')
+    if method not in ['auto', 'exact', 'asymptotic']:
+        raise ValueError('method must be either auto, exact or asymptotic.')
+
+    if method == 'auto':
+        if max(nx, ny) > 20:  # exact enumeration only when both sizes <= 20
+            method = 'asymptotic'
+        else:
+            method = 'exact'
+
+    # axis=-1 is guaranteed by _axis_nan_policy decorator
+    xa = xp.sort(x, axis=-1)
+    ya = xp.sort(y, axis=-1)
+
+    # get ranks of x and y in the pooled sample
+    z = xp.concat([xa, ya], axis=-1)
+    # in case of ties, use midrank (see [1])
+    r = scipy.stats.rankdata(z, method='average', axis=-1)
+    dtype = xp_result_type(x, y, force_floating=True, xp=xp)
+    r = xp.astype(r, dtype, copy=False)
+    rx = r[..., :nx]  # pooled ranks of the sorted x values
+    ry = r[..., nx:]  # pooled ranks of the sorted y values
+
+    # compute U (eq. 10 in [2])
+    u = (nx * xp.sum((rx - xp.arange(1, nx+1, dtype=dtype))**2, axis=-1)
+         + ny * xp.sum((ry - xp.arange(1, ny+1, dtype=dtype))**2, axis=-1))
+
+    # compute T (eq. 9 in [2])
+    k, N = nx*ny, nx + ny
+    t = u / (k*N) - (4*k - 1)/(6*N)
+
+    if method == 'exact':
+        p = xp.asarray(_pval_cvm_2samp_exact(np.asarray(u), nx, ny), dtype=dtype)
+    else:
+        p = _pval_cvm_2samp_asymptotic(t, N, nx, ny, k, xp=xp)
+
+    t = t[()] if t.ndim == 0 else t  # 0-d results are indexed with ()
+    p = p[()] if p.ndim == 0 else p
+    return CramerVonMisesResult(statistic=t, pvalue=p)
+
+
+class TukeyHSDResult:
+    """Result of `scipy.stats.tukey_hsd`.
+
+    Attributes
+    ----------
+    statistic : float ndarray
+        The computed statistic of the test for each comparison. The element
+        at index ``(i, j)`` is the statistic for the comparison between groups
+        ``i`` and ``j``.
+    pvalue : float ndarray
+        The associated p-value from the studentized range distribution. The
+        element at index ``(i, j)`` is the p-value for the comparison
+        between groups ``i`` and ``j``.
+
+    Notes
+    -----
+    The string representation of this object displays the most recently
+    calculated confidence interval, and if none have been previously
+    calculated, it will evaluate ``confidence_interval()``.
+
+    References
+    ----------
+    .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1. Tukey's
+           Method."
+           https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm,
+           28 November 2020.
+    .. [2] P. A. Games and J. F. Howell, "Pairwise Multiple Comparison Procedures
+           with Unequal N's and/or Variances: A Monte Carlo Study," Journal of
+           Educational Statistics, vol. 1, no. 2, pp. 113-125, Jun. 1976,
+           doi: https://doi.org/10.3102/10769986001002113.
+    """
+
+    def __init__(self, statistic, pvalue, _ntreatments, _df, _stand_err):
+        self.statistic = statistic
+        self.pvalue = pvalue
+        self._ntreatments = _ntreatments
+        self._df = _df
+        self._stand_err = _stand_err
+        self._ci = None  # most recently computed ConfidenceInterval, if any
+        self._ci_cl = None  # confidence level that `_ci` was computed for
+
+    def __str__(self):
+        # Note: `__str__` prints the confidence intervals from the most
+        # recent call to `confidence_interval`. If it has not been called,
+        # it will be called with the default CL of .95.
+        if self._ci is None:
+            self.confidence_interval(confidence_level=.95)
+        s = ("Pairwise Group Comparisons"
+             f" ({self._ci_cl*100:.1f}% Confidence Interval)\n")
+        s += "Comparison  Statistic  p-value  Lower CI  Upper CI\n"
+        for i, j in np.ndindex(self.pvalue.shape):
+            if i != j:  # skip the comparison of a group with itself
+                s += (f" ({i} - {j}) {self.statistic[i, j]:>10.3f}"
+                      f"{self.pvalue[i, j]:>10.3f}"
+                      f"{self._ci.low[i, j]:>10.3f}"
+                      f"{self._ci.high[i, j]:>10.3f}\n")
+        return s
+
+    def confidence_interval(self, confidence_level=.95):
+        """Compute the confidence interval for the specified confidence level.
+
+        Parameters
+        ----------
+        confidence_level : float, optional
+            Confidence level for the computed confidence interval
+            of each pairwise comparison. Must be in (0, 1). Default is .95.
+
+        Returns
+        -------
+        ci : ``ConfidenceInterval`` object
+            The object has attributes ``low`` and ``high`` that hold the
+            lower and upper bounds of the confidence intervals for each
+            comparison. The high and low values are accessible for each
+            comparison at index ``(i, j)`` between groups ``i`` and ``j``.
+
+        References
+        ----------
+        .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1.
+               Tukey's Method."
+               https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm,
+               28 November 2020.
+        .. [2] P. A. Games and J. F. Howell, "Pairwise Multiple Comparison Procedures
+               with Unequal N's and/or Variances: A Monte Carlo Study," Journal of
+               Educational Statistics, vol. 1, no. 2, pp. 113-125, Jun. 1976,
+               doi: https://doi.org/10.3102/10769986001002113.
+
+        Examples
+        --------
+        >>> from scipy.stats import tukey_hsd
+        >>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9]
+        >>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1]
+        >>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8]
+        >>> result = tukey_hsd(group0, group1, group2)
+        >>> ci = result.confidence_interval()
+        >>> ci.low
+        array([[-3.649159, -8.249159, -3.909159],
+               [ 0.950841, -3.649159,  0.690841],
+               [-3.389159, -7.989159, -3.649159]])
+        >>> ci.high
+        array([[ 3.649159, -0.950841,  3.389159],
+               [ 8.249159,  3.649159,  7.989159],
+               [ 3.909159, -0.690841,  3.649159]])
+        """
+        # return the cached CI if the supplied confidence level matches that
+        # of the previously computed CI (this happens before validation).
+        if (self._ci is not None and self._ci_cl is not None and
+                confidence_level == self._ci_cl):
+            return self._ci
+
+        if not 0 < confidence_level < 1:
+            raise ValueError("Confidence level must be between 0 and 1.")
+        # determine the critical value of the studentized range using the
+        # appropriate confidence level, number of treatments, and degrees
+        # of freedom. See [1] "Confidence limits for Tukey's method" / [2] p.117
+        # "H0 was rejected if...". Note that in the cases of unequal sample sizes,
+        # there will be a criterion for each group comparison.
+        params = (confidence_level, self._ntreatments, self._df)
+        srd = distributions.studentized_range.ppf(*params)
+        # also called maximum critical value, the confidence_radius is the
+        # studentized range critical value * the square root of mean square
+        # error over the sample size.
+        confidence_radius = srd * self._stand_err
+        # the confidence limits are determined by the
+        # `mean_differences` +- `confidence_radius`
+        upper_conf = self.statistic + confidence_radius
+        lower_conf = self.statistic - confidence_radius
+        self._ci = ConfidenceInterval(low=lower_conf, high=upper_conf)
+        self._ci_cl = confidence_level
+        return self._ci
+
+
+def _tukey_hsd_iv(args, equal_var):  # input validation for `tukey_hsd`
+    if (len(args)) < 2:  # at least two samples are needed for a comparison
+        raise ValueError("There must be more than 1 treatment.")
+    if not isinstance(equal_var, bool):
+        raise TypeError("Expected a boolean value for 'equal_var'")
+    args = [np.asarray(arg) for arg in args]  # returned to the caller as arrays
+    for arg in args:
+        if arg.ndim != 1:
+            raise ValueError("Input samples must be one-dimensional.")
+        if arg.size <= 1:
+            raise ValueError("Input sample size must be greater than one.")
+        if np.isinf(arg).any():  # NOTE: only +/-inf is rejected; NaN passes
+            raise ValueError("Input samples must be finite.")
+    return args
+
+
+@xp_capabilities(np_only=True)
+def tukey_hsd(*args, equal_var=True):
+    """Perform Tukey's HSD test for equality of means over multiple treatments.
+
+    Tukey's honestly significant difference (HSD) test performs pairwise
+    comparison of means for a set of samples. Whereas ANOVA (e.g. `f_oneway`)
+    assesses whether the true means underlying each sample are identical,
+    Tukey's HSD is a post hoc test used to compare the mean of each sample
+    to the mean of each other sample.
+
+    The null hypothesis is that the distributions underlying the samples all
+    have the same mean. The test statistic, which is computed for every
+    possible pairing of samples, is simply the difference between the sample
+    means. For each pair, the p-value is the probability under the null
+    hypothesis (and other assumptions; see notes) of observing such an extreme
+    value of the statistic, considering that many pairwise comparisons are
+    being performed. Confidence intervals for the difference between each pair
+    of means are also available.
+
+    Parameters
+    ----------
+    sample1, sample2, ... : array_like
+        The sample measurements for each group. There must be at least
+        two arguments.
+    equal_var : bool, optional
+        If True (default) and equal sample size, perform Tukey-HSD test [6]_.
+        If True and unequal sample size, perform Tukey-Kramer test [4]_.
+        If False, perform Games-Howell test [7]_, which does not assume equal variances.
+
+    Returns
+    -------
+    result : `~scipy.stats._result_classes.TukeyHSDResult` instance
+        The return value is an object with the following attributes:
+
+        statistic : float ndarray
+            The computed statistic of the test for each comparison. The element
+            at index ``(i, j)`` is the statistic for the comparison between
+            groups ``i`` and ``j``.
+        pvalue : float ndarray
+            The computed p-value of the test for each comparison. The element
+            at index ``(i, j)`` is the p-value for the comparison between
+            groups ``i`` and ``j``.
+
+        The object has the following methods:
+
+        confidence_interval(confidence_level=0.95):
+            Compute the confidence interval for the specified confidence level.
+
+    See Also
+    --------
+    dunnett : performs comparison of means against a control group.
+
+    Notes
+    -----
+    The use of this test relies on several assumptions.
+
+    1. The observations are independent within and among groups.
+    2. The observations within each group are normally distributed.
+    3. The distributions from which the samples are drawn have the same finite
+       variance (not assumed when ``equal_var=False``).
+
+    The original formulation of the test was for samples of equal size drawn from
+    populations assumed to have equal variances [6]_. In case of unequal sample sizes,
+    the test uses the Tukey-Kramer method [4]_. When equal variances are not assumed
+    (``equal_var=False``), the test uses the Games-Howell method [7]_.
+
+    References
+    ----------
+    .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1. Tukey's
+           Method."
+           https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm,
+           28 November 2020.
+    .. [2] Abdi, Herve & Williams, Lynne. (2021). "Tukey's Honestly Significant
+           Difference (HSD) Test."
+           https://personal.utdallas.edu/~herve/abdi-HSD2010-pretty.pdf
+    .. [3] "One-Way ANOVA Using SAS PROC ANOVA & PROC GLM." SAS
+           Tutorials, 2007, www.stattutorials.com/SAS/TUTORIAL-PROC-GLM.htm.
+    .. [4] Kramer, Clyde Young. "Extension of Multiple Range Tests to Group
+           Means with Unequal Numbers of Replications." Biometrics, vol. 12,
+           no. 3, 1956, pp. 307-310. JSTOR, www.jstor.org/stable/3001469.
+           Accessed 25 May 2021.
+    .. [5] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.3.3.
+           The ANOVA table and tests of hypotheses about means"
+           https://www.itl.nist.gov/div898/handbook/prc/section4/prc433.htm,
+           2 June 2021.
+    .. [6] Tukey, John W. "Comparing Individual Means in the Analysis of
+           Variance." Biometrics, vol. 5, no. 2, 1949, pp. 99-114. JSTOR,
+           www.jstor.org/stable/3001913. Accessed 14 June 2021.
+    .. [7] P. A. Games and J. F. Howell, "Pairwise Multiple Comparison Procedures
+           with Unequal N's and/or Variances: A Monte Carlo Study," Journal of
+           Educational Statistics, vol. 1, no. 2, pp. 113-125, Jun. 1976,
+           doi: https://doi.org/10.3102/10769986001002113.
+
+
+    Examples
+    --------
+    Here are some data comparing the time to relief of three brands of
+    headache medicine, reported in minutes. Data adapted from [3]_.
+
+    >>> import numpy as np
+    >>> from scipy.stats import tukey_hsd
+    >>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9]
+    >>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1]
+    >>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8]
+
+    We would like to see if the means between any of the groups are
+    significantly different. First, visually examine a box and whisker plot.
+
+    >>> import matplotlib.pyplot as plt
+    >>> fig, ax = plt.subplots(1, 1)
+    >>> ax.boxplot([group0, group1, group2])
+    >>> ax.set_xticklabels(["group0", "group1", "group2"]) # doctest: +SKIP
+    >>> ax.set_ylabel("mean") # doctest: +SKIP
+    >>> plt.show()
+
+    From the box and whisker plot, we can see overlap in the interquartile
+    ranges of group 1 with group 2 and group 3, but we can apply the ``tukey_hsd``
+    test to determine if the difference between means is significant. We
+    set a significance level of .05 to reject the null hypothesis.
+
+    >>> res = tukey_hsd(group0, group1, group2)
+    >>> print(res)
+    Pairwise Group Comparisons (95.0% Confidence Interval)
+    Comparison  Statistic  p-value  Lower CI  Upper CI
+     (0 - 1)     -4.600     0.014    -8.249    -0.951
+     (0 - 2)     -0.260     0.980    -3.909     3.389
+     (1 - 0)      4.600     0.014     0.951     8.249
+     (1 - 2)      4.340     0.020     0.691     7.989
+     (2 - 0)      0.260     0.980    -3.389     3.909
+     (2 - 1)     -4.340     0.020    -7.989    -0.691
+
+    The null hypothesis is that each group has the same mean. The p-value for
+    comparisons between ``group0`` and ``group1`` as well as ``group1`` and
+    ``group2`` do not exceed .05, so we reject the null hypothesis that they
+    have the same means. The p-value of the comparison between ``group0``
+    and ``group2`` exceeds .05, so we do not reject the null hypothesis that
+    there is not a significant difference between their means.
+
+    We can also compute the confidence interval associated with our chosen
+    confidence level.
+
+    >>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9]
+    >>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1]
+    >>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8]
+    >>> res = tukey_hsd(group0, group1, group2)
+    >>> conf = res.confidence_interval(confidence_level=.99)
+    >>> for ((i, j), l) in np.ndenumerate(conf.low):
+    ...     # filter out self comparisons
+    ...     if i != j:
+    ...         h = conf.high[i,j]
+    ...         print(f"({i} - {j}) {l:>6.3f} {h:>6.3f}")
+    (0 - 1) -9.480  0.280
+    (0 - 2) -5.140  4.620
+    (1 - 0) -0.280  9.480
+    (1 - 2) -0.540  9.220
+    (2 - 0) -4.620  5.140
+    (2 - 1) -9.220  0.540
+    """
+    args = _tukey_hsd_iv(args, equal_var)
+    ntreatments = len(args)
+    means = np.asarray([np.mean(arg) for arg in args])
+    nsamples_treatments = np.asarray([a.size for a in args])
+    nobs = np.sum(nsamples_treatments)
+    vars_ = np.asarray([np.var(arg, ddof=1) for arg in args])
+
+    if equal_var:
+        # determine mean square error [5]. Note that this is sometimes called
+        # mean square error within.
+        mse = (np.sum(vars_ * (nsamples_treatments - 1)) / (nobs - ntreatments))
+
+        # The calculation of the standard error differs when treatments differ in
+        # size. See ("Unequal sample sizes")[1].
+        if np.unique(nsamples_treatments).size == 1:
+            # all input groups are the same length, so only one value needs to be
+            # calculated [1].
+            normalize = 2 / nsamples_treatments[0]
+        else:
+            # to compare groups of differing sizes, we must compute a variance
+            # value for each individual comparison. Use broadcasting to get the
+            # resulting matrix. [3], verified against [4] (page 308).
+            normalize = 1 / nsamples_treatments + 1 / nsamples_treatments[None].T
+
+        # the standard error is used in the computation of the tukey criterion and
+        # finding the p-values.
+        stand_err = np.sqrt(normalize * mse / 2)
+        df = nobs - ntreatments
+    else:
+        # `stand_err` is the denominator of the Behrens-Fisher statistic ($v$)
+        # with a factor of $\sqrt{2}$. Compare [7] p.116 "t-solution rejects H0 if...",
+        # [7] p. 117 "H0 was rejected", and definition of `t_stat` below.
+        sj2_nj = vars_ / nsamples_treatments
+        si2_ni = sj2_nj[:, np.newaxis]
+        stand_err = np.sqrt(si2_ni + sj2_nj) / 2**0.5
+
+        # `df` is the Welch degree of freedom $\nu$.
+        # See [7] p. 116 "and the degrees of freedom, $\nu$, are given by...".
+        njm1 = nsamples_treatments - 1
+        nim1 = njm1[:, np.newaxis]
+        df = (si2_ni + sj2_nj)**2 / (si2_ni**2 / nim1 + sj2_nj**2 / njm1)
+
+    # the mean difference is the test statistic.
+    mean_differences = means[None].T - means
+
+    # Calculate the t-statistic to use within the survival function of the
+    # studentized range to get the p-value.
+    t_stat = np.abs(mean_differences) / stand_err
+
+    params = t_stat, ntreatments, df
+    pvalues = distributions.studentized_range.sf(*params)
+
+    return TukeyHSDResult(mean_differences, pvalues, ntreatments,
+                          df, stand_err)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_kde.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_kde.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcb66b00446a885feed7ae78ba04897d98892266
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_kde.py
@@ -0,0 +1,733 @@
+#-------------------------------------------------------------------------------
+#
+#  Define classes for (uni/multi)-variate kernel density estimation.
+#
+#  Currently, only Gaussian kernels are implemented.
+#
+#  Written by: Robert Kern
+#
+#  Date: 2004-08-09
+#
+#  Modified: 2005-02-10 by Robert Kern.
+#              Contributed to SciPy
+#            2005-10-07 by Robert Kern.
+#              Some fixes to match the new scipy_core
+#
+#  Copyright 2004-2005 by Enthought, Inc.
+#
+#-------------------------------------------------------------------------------
+
+# SciPy imports.
+from scipy import linalg, special
+from scipy._lib._util import check_random_state, np_vecdot
+
+from numpy import (asarray, atleast_2d, reshape, zeros, newaxis, exp, pi,
+                   sqrt, ravel, power, atleast_1d, squeeze, sum, transpose,
+                   ones, cov)
+import numpy as np
+
+# Local imports.
+from ._stats import gaussian_kernel_estimate, gaussian_kernel_estimate_log
+from ._multivariate import multivariate_normal
+
+__all__ = ['gaussian_kde']
+
+
+class gaussian_kde:
+    """Representation of a kernel-density estimate using Gaussian kernels.
+
+    Kernel density estimation is a way to estimate the probability density
+    function (PDF) of a random variable in a non-parametric way.
+    `gaussian_kde` works for both uni-variate and multi-variate data.   It
+    includes automatic bandwidth determination.  The estimation works best for
+    a unimodal distribution; bimodal or multi-modal distributions tend to be
+    oversmoothed.
+
+    Parameters
+    ----------
+    dataset : array_like
+        Datapoints to estimate from. In case of univariate data this is a 1-D
+        array, otherwise a 2-D array with shape (# of dims, # of data).
+    bw_method : str, scalar or callable, optional
+        The method used to calculate the bandwidth factor.  This can be
+        'scott', 'silverman', a scalar constant or a callable.  If a scalar,
+        this will be used directly as `factor`.  If a callable, it should
+        take a `gaussian_kde` instance as only parameter and return a scalar.
+        If None (default), 'scott' is used.  See Notes for more details.
+    weights : array_like, optional
+        Weights of datapoints. This must be a 1-D array of length ``n``.
+        If None (default), the samples are assumed to be equally weighted.
+
+    Attributes
+    ----------
+    dataset : ndarray
+        The dataset with which `gaussian_kde` was initialized.
+    d : int
+        Number of dimensions.
+    n : int
+        Number of datapoints.
+    neff : float
+        Effective number of datapoints.
+
+        .. versionadded:: 1.2.0
+    factor : float
+        The bandwidth factor obtained from `covariance_factor`.
+    covariance : ndarray
+        The kernel covariance matrix; this is the data covariance matrix
+        multiplied by the square of the bandwidth factor, e.g.
+        ``np.cov(dataset) * factor**2``.
+    inv_cov : ndarray
+        The inverse of `covariance`.
+
+    Methods
+    -------
+    evaluate
+    __call__
+    integrate_gaussian
+    integrate_box_1d
+    integrate_box
+    integrate_kde
+    pdf
+    logpdf
+    resample
+    set_bandwidth
+    covariance_factor
+    marginal
+
+    Notes
+    -----
+    Bandwidth selection strongly influences the estimate obtained from the KDE
+    (much more so than the actual shape of the kernel).  Bandwidth selection
+    can be done by a "rule of thumb", by cross-validation, by "plug-in
+    methods" or by other means; see [3]_, [4]_ for reviews.  `gaussian_kde`
+    uses a rule of thumb, the default is Scott's Rule.
+
+    Scott's Rule [1]_, implemented as `scotts_factor`, is::
+
+        n**(-1./(d+4)),
+
+    with ``n`` the number of data points and ``d`` the number of dimensions.
+    In the case of unequally weighted points, `scotts_factor` becomes::
+
+        neff**(-1./(d+4)),
+
+    with ``neff`` the effective number of datapoints.
+    Silverman's suggestion for *multivariate* data [2]_, implemented as
+    `silverman_factor`, is::
+
+        (n * (d + 2) / 4.)**(-1. / (d + 4)).
+
+    or in the case of unequally weighted points::
+
+        (neff * (d + 2) / 4.)**(-1. / (d + 4)).
+
+    Note that this is not the same as "Silverman's rule of thumb" [6]_, which
+    may be more robust in the univariate case; see documentation of the
+    ``set_bandwidth`` method for implementing a custom bandwidth rule.
+
+    Good general descriptions of kernel density estimation can be found in [1]_
+    and [2]_, the mathematics for this multi-dimensional implementation can be
+    found in [1]_.
+
+    With a set of weighted samples, the effective number of datapoints ``neff``
+    is defined by::
+
+        neff = sum(weights)^2 / sum(weights^2)
+
+    as detailed in [5]_.
+
+    `gaussian_kde` does not currently support data that lies in a
+    lower-dimensional subspace of the space in which it is expressed. For such
+    data, consider performing principal component analysis / dimensionality
+    reduction and using `gaussian_kde` with the transformed data.
+
+    References
+    ----------
+    .. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and
+           Visualization", John Wiley & Sons, New York, Chichester, 1992.
+    .. [2] B.W. Silverman, "Density Estimation for Statistics and Data
+           Analysis", Vol. 26, Monographs on Statistics and Applied Probability,
+           Chapman and Hall, London, 1986.
+    .. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A
+           Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993.
+    .. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel
+           conditional density estimation", Computational Statistics & Data
+           Analysis, Vol. 36, pp. 279-298, 2001.
+    .. [5] Gray P. G., 1969, Journal of the Royal Statistical Society.
+           Series A (General), 132, 272
+    .. [6] Kernel density estimation. *Wikipedia.*
+           https://en.wikipedia.org/wiki/Kernel_density_estimation
+
+    Examples
+    --------
+    Generate some random two-dimensional data:
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> def measure(n):
+    ...     "Measurement model, return two coupled measurements."
+    ...     m1 = np.random.normal(size=n)
+    ...     m2 = np.random.normal(scale=0.5, size=n)
+    ...     return m1+m2, m1-m2
+
+    >>> m1, m2 = measure(2000)
+    >>> xmin = m1.min()
+    >>> xmax = m1.max()
+    >>> ymin = m2.min()
+    >>> ymax = m2.max()
+
+    Perform a kernel density estimate on the data:
+
+    >>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
+    >>> positions = np.vstack([X.ravel(), Y.ravel()])
+    >>> values = np.vstack([m1, m2])
+    >>> kernel = stats.gaussian_kde(values)
+    >>> Z = np.reshape(kernel(positions).T, X.shape)
+
+    Plot the results:
+
+    >>> import matplotlib.pyplot as plt
+    >>> fig, ax = plt.subplots()
+    >>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
+    ...           extent=[xmin, xmax, ymin, ymax])
+    >>> ax.plot(m1, m2, 'k.', markersize=2)
+    >>> ax.set_xlim([xmin, xmax])
+    >>> ax.set_ylim([ymin, ymax])
+    >>> plt.show()
+
+    Compare against manual KDE at a point:
+
+    >>> point = [1, 2]
+    >>> mean = values.T
+    >>> cov = kernel.factor**2 * np.cov(values)
+    >>> X = stats.multivariate_normal(cov=cov)
+    >>> res = kernel.pdf(point)
+    >>> ref = X.pdf(point - mean).sum() / len(mean)
+    >>> np.allclose(res, ref)
+    True
+    """
+    def __init__(self, dataset, bw_method=None, weights=None):
+        self.dataset = atleast_2d(asarray(dataset))
+        if not self.dataset.size > 1:
+            raise ValueError("`dataset` input should have multiple elements.")
+
+        self.d, self.n = self.dataset.shape
+
+        if weights is not None:
+            self._weights = atleast_1d(weights).astype(float)
+            self._weights /= sum(self._weights)
+            if self.weights.ndim != 1:
+                raise ValueError("`weights` input should be one-dimensional.")
+            if len(self._weights) != self.n:
+                raise ValueError("`weights` input should be of length n")
+            self._neff = 1/np_vecdot(self._weights, self._weights)
+
+        # This can be converted to a warning once gh-10205 is resolved
+        if self.d > self.n:
+            msg = ("Number of dimensions is greater than number of samples. "
+                   "This results in a singular data covariance matrix, which "
+                   "cannot be treated using the algorithms implemented in "
+                   "`gaussian_kde`. Note that `gaussian_kde` interprets each "
+                   "*column* of `dataset` to be a point; consider transposing "
+                   "the input to `dataset`.")
+            raise ValueError(msg)
+
+        try:
+            self.set_bandwidth(bw_method=bw_method)
+        except linalg.LinAlgError as e:
+            msg = ("The data appears to lie in a lower-dimensional subspace "
+                   "of the space in which it is expressed. This has resulted "
+                   "in a singular data covariance matrix, which cannot be "
+                   "treated using the algorithms implemented in "
+                   "`gaussian_kde`. Consider performing principal component "
+                   "analysis / dimensionality reduction and using "
+                   "`gaussian_kde` with the transformed data.")
+            raise linalg.LinAlgError(msg) from e
+
+    def evaluate(self, points):
+        """Evaluate the estimated pdf on a set of points.
+
+        Parameters
+        ----------
+        points : (# of dimensions, # of points)-array
+            Alternatively, a (# of dimensions,) vector can be passed in and
+            treated as a single point.
+
+        Returns
+        -------
+        values : (# of points,)-array
+            The values at each point.
+
+        Raises
+        ------
+        ValueError : if the dimensionality of the input points is different than
+                     the dimensionality of the KDE.
+
+        """
+        points = atleast_2d(asarray(points))
+
+        d, m = points.shape
+        if d != self.d:
+            if d == 1 and m == self.d:
+                # points was passed in as a row vector
+                points = reshape(points, (self.d, 1))
+                m = 1
+            else:
+                msg = (f"points have dimension {d}, "
+                       f"dataset has dimension {self.d}")
+                raise ValueError(msg)
+
+        output_dtype, spec = _get_output_dtype(self.covariance, points)
+        result = gaussian_kernel_estimate[spec](
+            self.dataset.T, self.weights[:, None],
+            points.T, self.cho_cov, output_dtype)
+
+        return result[:, 0]
+
+    __call__ = evaluate
+
+    def integrate_gaussian(self, mean, cov):
+        """
+        Multiply estimated density by a multivariate Gaussian and integrate
+        over the whole space.
+
+        Parameters
+        ----------
+        mean : array_like
+            A 1-D array, specifying the mean of the Gaussian.
+        cov : array_like
+            A 2-D array, specifying the covariance matrix of the Gaussian.
+
+        Returns
+        -------
+        result : scalar
+            The value of the integral.
+
+        Raises
+        ------
+        ValueError
+            If the mean or covariance of the input Gaussian differs from
+            the KDE's dimensionality.
+
+        """
+        mean = atleast_1d(squeeze(mean))
+        cov = atleast_2d(cov)
+
+        if mean.shape != (self.d,):
+            raise ValueError(f"mean does not have dimension {self.d}")
+        if cov.shape != (self.d, self.d):
+            raise ValueError(f"covariance does not have dimension {self.d}")
+
+        # make mean a column vector
+        mean = mean[:, newaxis]
+
+        sum_cov = self.covariance + cov
+
+        # This will raise LinAlgError if the new cov matrix is not s.p.d
+        # cho_factor returns (ndarray, bool) where bool is a flag for whether
+        # or not ndarray is upper or lower triangular
+        sum_cov_chol = linalg.cho_factor(sum_cov)
+
+        diff = self.dataset - mean
+        tdiff = linalg.cho_solve(sum_cov_chol, diff)
+
+        sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
+        norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
+
+        energies = np_vecdot(diff, tdiff, axis=0) / 2.0
+        result = np_vecdot(exp(-energies), self.weights, axis=0) / norm_const
+
+        return result
+
+    def integrate_box_1d(self, low, high):
+        """
+        Computes the integral of a 1D pdf between two bounds.
+
+        Parameters
+        ----------
+        low : scalar
+            Lower bound of integration.
+        high : scalar
+            Upper bound of integration.
+
+        Returns
+        -------
+        value : scalar
+            The result of the integral.
+
+        Raises
+        ------
+        ValueError
+            If the KDE is over more than one dimension.
+
+        """
+        if self.d != 1:
+            raise ValueError("integrate_box_1d() only handles 1D pdfs")
+
+        stdev = ravel(sqrt(self.covariance))[0]
+
+        normalized_low = ravel((low - self.dataset) / stdev)
+        normalized_high = ravel((high - self.dataset) / stdev)
+
+        delta = special.ndtr(normalized_high) - special.ndtr(normalized_low)
+        value = np_vecdot(self.weights, delta)
+        return value
+
+    def integrate_box(self, low_bounds, high_bounds, maxpts=None, *, rng=None):
+        """Computes the integral of a pdf over a rectangular interval.
+
+        Parameters
+        ----------
+        low_bounds : array_like
+            A 1-D array containing the lower bounds of integration.
+        high_bounds : array_like
+            A 1-D array containing the upper bounds of integration.
+        maxpts : int, optional
+            The maximum number of points to use for integration.
+        rng : `numpy.random.Generator`, optional
+            Pseudorandom number generator state. When `rng` is None, a new
+            generator is created using entropy from the operating system. Types
+            other than `numpy.random.Generator` are passed to
+            `numpy.random.default_rng` to instantiate a ``Generator``.
+
+        Returns
+        -------
+        value : scalar
+            The result of the integral.
+
+        """
+        low, high = low_bounds - self.dataset.T, high_bounds - self.dataset.T
+        values = multivariate_normal.cdf(
+            high, lower_limit=low, cov=self.covariance, maxpts=maxpts,
+            rng=rng
+        )
+        return np_vecdot(values, self.weights, axis=-1)
+
+    def integrate_kde(self, other):
+        """
+        Computes the integral of the product of this kernel density estimate
+        with another.
+
+        Parameters
+        ----------
+        other : gaussian_kde instance
+            The other kde.
+
+        Returns
+        -------
+        value : scalar
+            The result of the integral.
+
+        Raises
+        ------
+        ValueError
+            If the KDEs have different dimensionality.
+
+        """
+        if other.d != self.d:
+            raise ValueError("KDEs are not the same dimensionality")
+
+        # we want to iterate over the smallest number of points
+        if other.n < self.n:
+            small = other
+            large = self
+        else:
+            small = self
+            large = other
+
+        sum_cov = small.covariance + large.covariance
+        sum_cov_chol = linalg.cho_factor(sum_cov)
+        result = 0.0
+        for i in range(small.n):
+            mean = small.dataset[:, i, newaxis]
+            diff = large.dataset - mean
+            tdiff = linalg.cho_solve(sum_cov_chol, diff)
+
+            energies = np_vecdot(diff, tdiff, axis=0) / 2.0
+            result += np_vecdot(exp(-energies), large.weights, axis=0)*small.weights[i]
+
+        sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
+        norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
+
+        result /= norm_const
+
+        return result
+
+    def resample(self, size=None, seed=None):
+        """Randomly sample a dataset from the estimated pdf.
+
+        Parameters
+        ----------
+        size : int, optional
+            The number of samples to draw.  If not provided, then the size is
+            the same as the effective number of samples in the underlying
+            dataset.
+        seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
+            If `seed` is None (or `np.random`), the `numpy.random.RandomState`
+            singleton is used.
+            If `seed` is an int, a new ``RandomState`` instance is used,
+            seeded with `seed`.
+            If `seed` is already a ``Generator`` or ``RandomState`` instance then
+            that instance is used.
+
+        Returns
+        -------
+        resample : (self.d, `size`) ndarray
+            The sampled dataset.
+
+        """ # numpy/numpydoc#87  # noqa: E501
+        if size is None:
+            size = int(self.neff)
+
+        random_state = check_random_state(seed)
+        norm = transpose(random_state.multivariate_normal(
+            zeros((self.d,), float), self.covariance, size=size
+        ))
+        indices = random_state.choice(self.n, size=size, p=self.weights)
+        means = self.dataset[:, indices]
+
+        return means + norm
+
+    def scotts_factor(self):
+        """Compute Scott's factor.
+
+        Returns
+        -------
+        s : float
+            Scott's factor.
+        """
+        return power(self.neff, -1./(self.d+4))
+
+    def silverman_factor(self):
+        """Compute the Silverman factor.
+
+        Returns
+        -------
+        s : float
+            The silverman factor.
+        """
+        return power(self.neff*(self.d+2.0)/4.0, -1./(self.d+4))
+
+    #  Default method to calculate bandwidth, can be overwritten by subclass
+    covariance_factor = scotts_factor
+    covariance_factor.__doc__ = """Computes the bandwidth factor `factor`.
+        The default is `scotts_factor`.  A subclass can overwrite this
+        method to provide a different method, or set it through a call to
+        `set_bandwidth`."""
+
+    def set_bandwidth(self, bw_method=None):
+        """Compute the bandwidth factor with given method.
+
+        The new bandwidth calculated after a call to `set_bandwidth` is used
+        for subsequent evaluations of the estimated density.
+
+        Parameters
+        ----------
+        bw_method : str, scalar or callable, optional
+            The method used to calculate the bandwidth factor.  This can be
+            'scott', 'silverman', a scalar constant or a callable.  If a
+            scalar, this will be used directly as `factor`.  If a callable,
+            it should take a `gaussian_kde` instance as only parameter and
+            return a scalar.  If None (default), nothing happens; the current
+            `covariance_factor` method is kept.
+
+        Notes
+        -----
+        .. versionadded:: 0.11
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> import scipy.stats as stats
+        >>> x1 = np.array([-7, -5, 1, 4, 5.])
+        >>> kde = stats.gaussian_kde(x1)
+        >>> xs = np.linspace(-10, 10, num=50)
+        >>> y1 = kde(xs)
+        >>> kde.set_bandwidth(bw_method='silverman')
+        >>> y2 = kde(xs)
+        >>> kde.set_bandwidth(bw_method=kde.factor / 3.)
+        >>> y3 = kde(xs)
+
+        >>> import matplotlib.pyplot as plt
+        >>> fig, ax = plt.subplots()
+        >>> ax.plot(x1, np.full(x1.shape, 1 / (4. * x1.size)), 'bo',
+        ...         label='Data points (rescaled)')
+        >>> ax.plot(xs, y1, label='Scott (default)')
+        >>> ax.plot(xs, y2, label='Silverman')
+        >>> ax.plot(xs, y3, label='Const (1/3 * Silverman)')
+        >>> ax.legend()
+        >>> plt.show()
+
+        """
+        if bw_method is None:
+            pass
+        elif bw_method == 'scott':
+            self.covariance_factor = self.scotts_factor
+        elif bw_method == 'silverman':
+            self.covariance_factor = self.silverman_factor
+        elif np.isscalar(bw_method) and not isinstance(bw_method, str):
+            self._bw_method = 'use constant'
+            self.covariance_factor = lambda: bw_method
+        elif callable(bw_method):
+            self._bw_method = bw_method
+            self.covariance_factor = lambda: self._bw_method(self)
+        else:
+            msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
+                  "or a callable."
+            raise ValueError(msg)
+
+        self._compute_covariance()
+
+    def _compute_covariance(self):
+        """Computes the covariance matrix for each Gaussian kernel using
+        covariance_factor().
+        """
+        self.factor = self.covariance_factor()
+        # Cache covariance and Cholesky decomp of covariance
+        if not hasattr(self, '_data_cho_cov'):
+            self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
+                                               bias=False,
+                                               aweights=self.weights))
+            self._data_cho_cov = linalg.cholesky(self._data_covariance,
+                                                 lower=True)
+
+        self.covariance = self._data_covariance * self.factor**2
+        self.cho_cov = (self._data_cho_cov * self.factor).astype(np.float64)
+        self.log_det = 2*np.log(np.diag(self.cho_cov
+                                        * np.sqrt(2*pi))).sum()
+
+    @property
+    def inv_cov(self):
+        # Re-compute from scratch each time because I'm not sure how this is
+        # used in the wild. (Perhaps users change the `dataset`, since it's
+        # not a private attribute?) `_compute_covariance` used to recalculate
+        # all these, so we'll recalculate everything now that this is
+        # a property.
+        self.factor = self.covariance_factor()
+        self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
+                                           bias=False, aweights=self.weights))
+        return linalg.inv(self._data_covariance) / self.factor**2
+
+    def pdf(self, x):
+        """
+        Evaluate the estimated pdf on a provided set of points.
+
+        Notes
+        -----
+        This is an alias for `gaussian_kde.evaluate`.  See the ``evaluate``
+        docstring for more details.
+
+        """
+        return self.evaluate(x)
+
+    def logpdf(self, x):
+        """
+        Evaluate the log of the estimated pdf on a provided set of points.
+        """
+        points = atleast_2d(x)
+
+        d, m = points.shape
+        if d != self.d:
+            if d == 1 and m == self.d:
+                # points was passed in as a row vector
+                points = reshape(points, (self.d, 1))
+                m = 1
+            else:
+                msg = (f"points have dimension {d}, "
+                       f"dataset has dimension {self.d}")
+                raise ValueError(msg)
+
+        output_dtype, spec = _get_output_dtype(self.covariance, points)
+        result = gaussian_kernel_estimate_log[spec](
+            self.dataset.T, self.weights[:, None],
+            points.T, self.cho_cov, output_dtype)
+
+        return result[:, 0]
+
+    def marginal(self, dimensions):
+        """Return a marginal KDE distribution
+
+        Parameters
+        ----------
+        dimensions : int or 1-d array_like
+            The dimensions of the multivariate distribution corresponding
+            with the marginal variables, that is, the indices of the dimensions
+            that are being retained. The other dimensions are marginalized out.
+
+        Returns
+        -------
+        marginal_kde : gaussian_kde
+            An object representing the marginal distribution.
+
+        Notes
+        -----
+        .. versionadded:: 1.10.0
+
+        """
+
+        dims = np.atleast_1d(dimensions)
+
+        if not np.issubdtype(dims.dtype, np.integer):
+            msg = ("Elements of `dimensions` must be integers - the indices "
+                   "of the marginal variables being retained.")
+            raise ValueError(msg)
+
+        n = len(self.dataset)  # number of dimensions
+        original_dims = dims.copy()
+
+        dims[dims < 0] = n + dims[dims < 0]
+
+        if len(np.unique(dims)) != len(dims):
+            msg = ("All elements of `dimensions` must be unique.")
+            raise ValueError(msg)
+
+        i_invalid = (dims < 0) | (dims >= n)
+        if np.any(i_invalid):
+            msg = (f"Dimensions {original_dims[i_invalid]} are invalid "
+                   f"for a distribution in {n} dimensions.")
+            raise ValueError(msg)
+
+        dataset = self.dataset[dims]
+        weights = self.weights
+
+        return gaussian_kde(dataset, bw_method=self.covariance_factor(),
+                            weights=weights)
+
+    @property
+    def weights(self):
+        try:
+            return self._weights
+        except AttributeError:
+            self._weights = ones(self.n)/self.n
+            return self._weights
+
+    @property
+    def neff(self):
+        try:
+            return self._neff
+        except AttributeError:
+            self._neff = 1/np_vecdot(self.weights, self.weights)
+            return self._neff
+
+
+def _get_output_dtype(covariance, points):
+    """
+    Calculates the output dtype and the "spec" (=C type name).
+
+    This was necessary in order to deal with the fused types in the Cython
+    routine `gaussian_kernel_estimate`. See gh-10824 for details.
+    """
+    output_dtype = np.common_type(covariance, points)
+    itemsize = np.dtype(output_dtype).itemsize
+    if itemsize == 4:
+        spec = 'float'
+    elif itemsize == 8:
+        spec = 'double'
+    elif itemsize in (12, 16):
+        spec = 'long double'
+    else:
+        raise ValueError(
+                f"{output_dtype} has unexpected item size: {itemsize}"
+            )
+
+    return output_dtype, spec
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_ksstats.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_ksstats.py
new file mode 100644
index 0000000000000000000000000000000000000000..8996745601edda71f5fde87b92ce9075579d5ded
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_ksstats.py
@@ -0,0 +1,600 @@
+# Compute the two-sided one-sample Kolmogorov-Smirnov Prob(Dn <= d) where:
+#    D_n = sup_x{|F_n(x) - F(x)|},
+#    F_n(x) is the empirical CDF for a sample of size n {x_i: i=1,...,n},
+#    F(x) is the CDF of a probability distribution.
+#
+# Exact methods:
+# Prob(D_n >= d) can be computed via a matrix algorithm of Durbin[1]
+#   or a recursion algorithm due to Pomeranz[2].
+# Marsaglia, Tsang & Wang[3] gave a computation-efficient way to perform
+#   the Durbin algorithm.
+#   D_n >= d <==>  D_n+ >= d or D_n- >= d (the one-sided K-S statistics), hence
+#   Prob(D_n >= d) = 2*Prob(D_n+ >= d) - Prob(D_n+ >= d and D_n- >= d).
+#   For d > 0.5, the latter intersection probability is 0.
+#
+# Approximate methods:
+# For d close to 0.5, ignoring that intersection term may still give a
+#   reasonable approximation.
+# Li-Chien[4] and Korolyuk[5] gave an asymptotic formula extending
+# Kolmogorov's initial asymptotic, suitable for large d. (See
+#   scipy.special.kolmogorov for that asymptotic)
+# Pelz-Good[6] used the functional equation for Jacobi theta functions to
+#   transform the Li-Chien/Korolyuk formula to produce a computational formula
+#   suitable for small d.
+#
+# Simard and L'Ecuyer[7] provided an algorithm to decide when to use each of
+#   the above approaches and it is that which is used here.
+#
+# Other approaches:
+# Carvalho[8] optimizes Durbin's matrix algorithm for large values of d.
+# Moscovich and Nadler[9] use FFTs to compute the convolutions.
+
+# References:
+# [1] Durbin J (1968).
+#     "The Probability that the Sample Distribution Function Lies Between Two
+#     Parallel Straight Lines."
+#     Annals of Mathematical Statistics, 39, 398-411.
+# [2] Pomeranz J (1974).
+#     "Exact Cumulative Distribution of the Kolmogorov-Smirnov Statistic for
+#     Small Samples (Algorithm 487)."
+#     Communications of the ACM, 17(12), 703-704.
+# [3] Marsaglia G, Tsang WW, Wang J (2003).
+#     "Evaluating Kolmogorov's Distribution."
+#     Journal of Statistical Software, 8(18), 1-4.
+# [4] LI-CHIEN, C. (1956).
+#     "On the exact distribution of the statistics of A. N. Kolmogorov and
+#     their asymptotic expansion."
+#     Acta Mathematica Sinica, 6, 55-81.
+# [5] KOROLYUK, V. S. (1960).
+#     "Asymptotic analysis of the distribution of the maximum deviation in
+#     the Bernoulli scheme."
+#     Theor. Probability Appl., 4, 339-366.
+# [6] Pelz W, Good IJ (1976).
+#     "Approximating the Lower Tail-areas of the Kolmogorov-Smirnov One-sample
+#     Statistic."
+#     Journal of the Royal Statistical Society, Series B, 38(2), 152-156.
+#  [7] Simard, R., L'Ecuyer, P. (2011)
+#     "Computing the Two-Sided Kolmogorov-Smirnov Distribution",
+#     Journal of Statistical Software, Vol 39, 11, 1-18.
+#  [8] Carvalho, Luis (2015)
+#     "An Improved Evaluation of Kolmogorov's Distribution"
+#     Journal of Statistical Software, Code Snippets; Vol 65(3), 1-8.
+#  [9] Amit Moscovich, Boaz Nadler (2017)
+#     "Fast calculation of boundary crossing probabilities for Poisson
+#     processes",
+#     Statistics & Probability Letters, Vol 123, 177-182.
+
+
+import numpy as np
+import scipy.special
+import scipy.special._ufuncs as scu
+from scipy.stats._finite_differences import _derivative
+
+_E128 = 128  # binary exponent step used when rescaling intermediate results
+_EP128 = np.ldexp(np.longdouble(1), _E128)  # 2**128 as a long double
+_EM128 = np.ldexp(np.longdouble(1), -_E128)  # 2**-128 as a long double
+
+_SQRT2PI = np.sqrt(2 * np.pi)
+_LOG_2PI = np.log(2 * np.pi)
+_MIN_LOG = -708  # _kolmogn_PelzGood treats the CDF as 0 when log(q) is below this
+_SQRT3 = np.sqrt(3)
+_PI_SQUARED = np.pi ** 2
+_PI_FOUR = np.pi ** 4
+_PI_SIX = np.pi ** 6
+
+# [Lifted from _loggamma.pxd.] If B_m are the Bernoulli numbers, then Stirling
+# coeffs are B_{2j}/(2j)/(2j-1) for j=8,...1 (highest order first, for np.polyval).
+_STIRLING_COEFFS = [-2.955065359477124183e-2, 6.4102564102564102564e-3,
+                    -1.9175269175269175269e-3, 8.4175084175084175084e-4,
+                    -5.952380952380952381e-4, 7.9365079365079365079e-4,
+                    -2.7777777777777777778e-3, 8.3333333333333333333e-2]
+
+
+def _log_nfactorial_div_n_pow_n(n):
+    # Computes log(n! / n**n)
+    #    = log((n-1)! / n**(n-1))
+    # Uses Stirling's approximation, but removes n*log(n) up-front to
+    # avoid subtractive cancellation.
+    #    = log(n)/2 - n + log(sqrt(2pi)) + sum B_{2j}/(2j)/(2j-1)/n**(2j-1)
+    rn = 1.0/n  # the correction series is rn times a polynomial in rn**2
+    return np.log(n)/2 - n + _LOG_2PI/2 + rn * np.polyval(_STIRLING_COEFFS, rn/n)
+
+
+def _clip_prob(p):
+    """Return `p` limited to the closed interval [0, 1] (via ``np.clip``)."""
+    return np.clip(p, a_min=0.0, a_max=1.0)
+
+
+def _select_and_clip_prob(cdfprob, sfprob, cdf=True):
+    """Return `cdfprob` where `cdf` is true, else `sfprob`, clipped to 0<=p<=1."""
+    chosen = np.where(cdf, cdfprob, sfprob)  # element-wise pick of CDF or SF
+    return _clip_prob(chosen)
+
+
+def _kolmogn_DMTW(n, d, cdf=True):
+    r"""Computes Pr(D_n <= d), or the SF 1 - Pr(D_n <= d) if `cdf` is false,
+    using the MTW approach to the Durbin matrix algorithm; clipped to [0, 1].
+
+    Durbin (1968); Marsaglia, Tsang, Wang (2003). [1], [3].
+    """
+    # Write d = (k-h)/n, where k is positive integer and 0 <= h < 1
+    # Generate initial matrix H of size m*m where m=(2k-1)
+    # Compute k-th row of (n!/n^n) * H^n, scaling intermediate results.
+    # Requires memory O(m^2) and computation O(m^2 log(n)).
+    # Most suitable for small m.
+    # Trivial cases first: d >= 1 gives CDF 1, and n*d <= 0.5 gives CDF 0.
+    if d >= 1.0:
+        return _select_and_clip_prob(1.0, 0.0, cdf)
+    nd = n * d
+    if nd <= 0.5:
+        return _select_and_clip_prob(0.0, 1.0, cdf)
+    k = int(np.ceil(nd))
+    h = k - nd
+    m = 2 * k - 1
+
+    H = np.zeros([m, m])
+
+    # Initialize: v is first column (and last row) of H
+    #  v[j] = (1-h^(j+1))/(j+1)!  (except for v[-1])
+    #  w[j] = 1/(j)!
+    # Only entry [k-1, k-1] of the n-th power of H is read at the end.
+    intm = np.arange(1, m + 1)
+    v = 1.0 - h ** intm
+    w = np.empty(m)
+    fac = 1.0
+    for j in intm:
+        w[j - 1] = fac
+        fac /= j  # This might underflow.  Isn't a problem.
+        v[j - 1] *= fac
+    tt = max(2 * h - 1.0, 0)**m - 2*h**m
+    v[-1] = (1.0 + tt) * fac  # = (1 - 2h^m + max(2h-1, 0)^m)/m!
+    # Column i (i >= 1) holds w from row i-1 downwards; H stays 0 above that.
+    for i in range(1, m):
+        H[i - 1:, i] = w[:m - i + 1]
+    H[:, 0] = v
+    H[-1, :] = np.flip(v, axis=0)
+    # Binary exponentiation: H is squared each pass, Hpwr picks up the set bits.
+    Hpwr = np.eye(np.shape(H)[0])  # Holds intermediate powers of H
+    nn = n
+    expnt = 0  # Scaling of Hpwr
+    Hexpnt = 0  # Scaling of H
+    while nn > 0:
+        if nn % 2:
+            Hpwr = np.matmul(Hpwr, H)
+            expnt += Hexpnt
+        H = np.matmul(H, H)
+        Hexpnt *= 2
+        # Scale as needed: divide H by 2**128 and record that in Hexpnt.
+        if np.abs(H[k - 1, k - 1]) > _EP128:
+            H /= _EP128
+            Hexpnt += _E128
+        nn = nn // 2
+
+    p = Hpwr[k - 1, k - 1]  # central entry, since m = 2k - 1
+
+    # Multiply by n!/n^n, one factor i/n at a time, rescaling if p gets tiny
+    for i in range(1, n + 1):
+        p = i * p / n
+        if np.abs(p) < _EM128:
+            p *= _EP128
+            expnt -= _E128
+
+    # unscale: apply the accumulated power of two in a single step
+    if expnt != 0:
+        p = np.ldexp(p, expnt)
+
+    return _select_and_clip_prob(p, 1.0-p, cdf)
+
+
+def _pomeranz_compute_j1j2(i, n, ll, ceilf, roundf):
+    """Return (start, end) for row i: start floored at 0, end capped at n."""
+    half, parity = divmod(i + 1, 2)  # i + 1 = 2*half + parity
+    if i == 0:
+        lo, hi = -ll - ceilf - 1, ll + ceilf - 1
+    elif parity:  # i is even (and nonzero)
+        lo, hi = half - 1 - ll - 1, half + ll + roundf - 1
+    elif half == n + 1:  # i is odd with (i + 1)/2 == n + 1
+        lo, hi = n - ll - ceilf - 1, n + ll + ceilf - 1
+    else:  # any other odd i
+        lo, hi = half - 1 - ll - roundf - 1, half + ll - 1 + ceilf - 1
+
+    # Shift the lower end up by two, then clamp both ends.
+    start = max(lo + 2, 0)
+    end = min(hi, n)
+    return start, end
+
+
+def _kolmogn_Pomeranz(n, x, cdf=True):
+    r"""Computes Pr(D_n <= x) (or the SF, if `cdf` is false) using the Pomeranz
+    recursion algorithm; the result is clipped to [0, 1].
+    Pomeranz (1974) [2]
+    """
+
+    # V is n*(2n+2) matrix.
+    # Each row is convolution of the previous row and probabilities from a
+    #  Poisson distribution.
+    # Desired CDF probability is n! V[n-1, 2n+1]  (final entry in final row).
+    # Only two rows are needed at any given stage:
+    #  - Call them V0 and V1.
+    #  - Swap each iteration
+    # Only a few (contiguous) entries in each row can be non-zero.
+    #  - Keep track of start and end (j1 and j2 below)
+    #  - V0s and V1s track the start in the two rows
+    # Scale intermediate results as needed.
+    # Only a few different Poisson distributions can occur
+    t = n * x
+    ll = int(np.floor(t))
+    f = 1.0 * (t - ll)  # fractional part of t
+    g = min(f, 1.0 - f)
+    ceilf = (1 if f > 0 else 0)  # ceil(f), as 0 <= f < 1
+    roundf = (1 if f > 0.5 else 0)  # f rounded to nearest; exactly 0.5 gives 0
+    npwrs = 2 * (ll + 1)    # Maximum number of powers needed in convolutions
+    gpower = np.empty(npwrs)  # gpower = (g/n)^m/m!
+    twogpower = np.empty(npwrs)  # twogpower = (2g/n)^m/m!
+    onem2gpower = np.empty(npwrs)  # onem2gpower = ((1-2g)/n)^m/m!
+    # gpower etc are *almost* Poisson probs, just missing normalizing factor.
+
+    gpower[0] = 1.0
+    twogpower[0] = 1.0
+    onem2gpower[0] = 1.0
+    expnt = 0  # power-of-two scaling applied so far; undone at the end
+    g_over_n, two_g_over_n, one_minus_two_g_over_n = g/n, 2*g/n, (1 - 2*g)/n
+    for m in range(1, npwrs):
+        gpower[m] = gpower[m - 1] * g_over_n / m
+        twogpower[m] = twogpower[m - 1] * two_g_over_n / m
+        onem2gpower[m] = onem2gpower[m - 1] * one_minus_two_g_over_n / m
+    # Two work rows of length npwrs, swapped (not copied) on every iteration.
+    V0 = np.zeros([npwrs])
+    V1 = np.zeros([npwrs])
+    V1[0] = 1  # first row
+    V0s, V1s = 0, 0  # start indices of the two rows
+
+    j1, j2 = _pomeranz_compute_j1j2(0, n, ll, ceilf, roundf)
+    for i in range(1, 2 * n + 2):
+        # Preserve j1, V1, V1s, V0s from last iteration
+        k1 = j1
+        V0, V1 = V1, V0
+        V0s, V1s = V1s, V0s
+        V1.fill(0.0)
+        j1, j2 = _pomeranz_compute_j1j2(i, n, ll, ceilf, roundf)
+        if i == 1 or i == 2 * n + 1:  # first and last rows use the g/n powers
+            pwrs = gpower
+        else:
+            pwrs = (twogpower if i % 2 else onem2gpower)
+        ln2 = j2 - k1 + 1
+        if ln2 > 0:
+            conv = np.convolve(V0[k1 - V0s:k1 - V0s + ln2], pwrs[:ln2])
+            conv_start = j1 - k1  # First index to use from conv
+            conv_len = j2 - j1 + 1  # Number of entries to use from conv
+            V1[:conv_len] = conv[conv_start:conv_start + conv_len]
+            # Scale to avoid underflow.
+            if 0 < np.max(V1) < _EM128:
+                V1 *= _EP128
+                expnt -= _E128
+            V1s = V0s + j1 - k1
+
+    # multiply by n!, rescaling downwards whenever the value gets too large
+    ans = V1[n - V1s]
+    for m in range(1, n + 1):
+        if np.abs(ans) > _EP128:
+            ans *= _EM128
+            expnt += _E128
+        ans *= m
+
+    # Undo any intermediate scaling
+    if expnt != 0:
+        ans = np.ldexp(ans, expnt)
+    ans = _select_and_clip_prob(ans, 1.0 - ans, cdf)
+    return ans
+
+
+def _kolmogn_PelzGood(n, x, cdf=True):
+    """Computes the Pelz-Good approximation to Prob(Dn <= x) with 0<=x<=1.
+
+    Start with Li-Chien, Korolyuk approximation:
+        Prob(Dn <= x) ~ K0(z) + K1(z)/sqrt(n) + K2(z)/n + K3(z)/n**1.5
+    where z = x*sqrt(n).
+    Transform each K_(z) using Jacobi theta functions into a form suitable
+    for small z.  If `cdf` is false, 1 minus that sum (the SF) is returned.
+    Pelz-Good (1976). [6]
+    """
+    if x <= 0.0:
+        return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
+    if x >= 1.0:
+        return _select_and_clip_prob(1.0, 0.0, cdf=cdf)
+
+    z = np.sqrt(n) * x
+    zsquared, zthree, zfour, zsix = z**2, z**3, z**4, z**6
+
+    qlog = -_PI_SQUARED / 8 / zsquared  # log of q = exp(-pi^2/(8 z^2))
+    if qlog < _MIN_LOG:  # z ~ 0.041743441416853426
+        return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
+
+    q = np.exp(qlog)
+
+    # Coefficients of terms in the sums for K1, K2 and K3
+    k1a = -zsquared
+    k1b = _PI_SQUARED / 4
+
+    k2a = 6 * zsix + 2 * zfour
+    k2b = (2 * zfour - 5 * zsquared) * _PI_SQUARED / 4
+    k2c = _PI_FOUR * (1 - 2 * zsquared) / 16
+
+    k3d = _PI_SIX * (5 - 30 * zsquared) / 64
+    k3c = _PI_FOUR * (-60 * zsquared + 212 * zfour) / 16
+    k3b = _PI_SQUARED * (135 * zfour - 96 * zsix) / 4
+    k3a = -30 * zsix - 90 * z**8
+
+    K0to3 = np.zeros(4)  # running sums for K0, K1, K2, K3
+    # Use a Horner scheme to evaluate sum c_i q^(i^2)
+    # Reduces to a sum over odd integers.
+    maxk = int(np.ceil(16 * z / np.pi))  # number of series terms kept
+    for k in range(maxk, 0, -1):
+        m = 2 * k - 1
+        msquared, mfour, msix = m**2, m**4, m**6
+        qpower = np.power(q, 8 * k)  # 8k = (2k+1)^2 - (2k-1)^2
+        coeffs = np.array([1.0,
+                           k1a + k1b*msquared,
+                           k2a + k2b*msquared + k2c*mfour,
+                           k3a + k3b*msquared + k3c*mfour + k3d*msix])
+        K0to3 *= qpower
+        K0to3 += coeffs
+    K0to3 *= q  # after this, the term for odd m carries the factor q^(m^2)
+    K0to3 *= _SQRT2PI
+    # z**10 > 0 as z > 0.04
+    K0to3 /= np.array([z, 6 * zfour, 72 * z**7, 6480 * z**10])
+
+    # Now do the other sum over the other terms, all integers k
+    # K_2:  (pi^2 k^2) q^(k^2),
+    # K_3:  (3pi^2 k^2 z^2 - pi^4 k^4)*q^(k^2)
+    # Don't expect much subtractive cancellation so use direct calculation
+    q = np.exp(-_PI_SQUARED / 2 / zsquared)  # note: q is redefined here
+    ks = np.arange(maxk, 0, -1)
+    ksquared = ks ** 2
+    sqrt3z = _SQRT3 * z
+    kspi = np.pi * ks
+    qpwers = q ** ksquared
+    k2extra = np.sum(ksquared * qpwers)
+    k2extra *= _PI_SQUARED * _SQRT2PI/(-36 * zthree)
+    K0to3[2] += k2extra
+    k3extra = np.sum((sqrt3z + kspi) * (sqrt3z - kspi) * ksquared * qpwers)
+    k3extra *= _PI_SQUARED * _SQRT2PI/(216 * zsix)
+    K0to3[3] += k3extra
+    powers_of_n = np.power(n * 1.0, np.arange(len(K0to3)) / 2.0)  # n^(j/2)
+    K0to3 /= powers_of_n
+
+    if not cdf:  # SF = 1 - CDF: negate every term, then add 1 to the first
+        K0to3 *= -1
+        K0to3[0] += 1
+
+    Ksum = sum(K0to3)
+    return Ksum  # not clipped here; _kolmogn clips the value it gets back
+
+
+def _kolmogn(n, x, cdf=True):
+    """Computes the CDF(or SF) for the two-sided Kolmogorov-Smirnov statistic.
+
+    x must be of type float, n of type integer.
+    Returns nan for NaN, non-integral or non-positive n; else a value in [0, 1].
+    Simard & L'Ecuyer (2011) [7].
+    """
+    if np.isnan(n):
+        return n  # Keep the same type of nan
+    if int(n) != n or n <= 0:
+        return np.nan
+    if x >= 1.0:
+        return _select_and_clip_prob(1.0, 0.0, cdf=cdf)
+    if x <= 0.0:
+        return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
+    t = n * x
+    if t <= 1.0:  # Ruben-Gambino: 1/2n <= x <= 1/n
+        if t <= 0.5:
+            return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
+        if n <= 140:  # CDF = n!/n**n * (2t-1)**n, formed as a product
+            prob = np.prod(np.arange(1, n+1) * (1.0/n) * (2*t - 1))
+        else:  # same quantity, evaluated in log space for large n
+            prob = np.exp(_log_nfactorial_div_n_pow_n(n) + n * np.log(2*t-1))
+        return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
+    if t >= n - 1:  # Ruben-Gambino
+        prob = 2 * (1.0 - x)**n
+        return _select_and_clip_prob(1 - prob, prob, cdf=cdf)
+    if x >= 0.5:  # Exact: 2 * smirnov
+        prob = 2 * scipy.special.smirnov(n, x)
+        return _select_and_clip_prob(1.0 - prob, prob, cdf=cdf)
+    # Remaining cases (1/n < x < 0.5): pick the algorithm from n and n*x**2.
+    nxsquared = t * x
+    if n <= 140:
+        if nxsquared <= 0.754693:
+            prob = _kolmogn_DMTW(n, x, cdf=True)
+            return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
+        if nxsquared <= 4:
+            prob = _kolmogn_Pomeranz(n, x, cdf=True)
+            return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
+        # Now use Miller approximation of 2*smirnov
+        prob = 2 * scipy.special.smirnov(n, x)
+        return _select_and_clip_prob(1.0 - prob, prob, cdf=cdf)
+
+    # Split CDF and SF as they have different cutoffs on nxsquared.
+    if not cdf:
+        if nxsquared >= 370.0:
+            return 0.0
+        if nxsquared >= 2.2:
+            prob = 2 * scipy.special.smirnov(n, x)
+            return _clip_prob(prob)
+        # Fall through and compute the SF as 1.0-CDF
+    if nxsquared >= 18.0:
+        cdfprob = 1.0
+    elif n <= 100000 and n * x**1.5 <= 1.4:
+        cdfprob = _kolmogn_DMTW(n, x, cdf=True)
+    else:
+        cdfprob = _kolmogn_PelzGood(n, x, cdf=True)
+    return _select_and_clip_prob(cdfprob, 1.0 - cdfprob, cdf=cdf)
+
+
+def _kolmogn_p(n, x):
+    """Computes the PDF for the two-sided Kolmogorov-Smirnov statistic.
+
+    x must be of type float, n of type integer (any other n gives nan).
+    """
+    if np.isnan(n):
+        return n  # Keep the same type of nan
+    if int(n) != n or n <= 0:
+        return np.nan
+    if x >= 1.0 or x <= 0:
+        return 0
+    t = n * x
+    if t <= 1.0:
+        # Ruben-Gambino: n!/n^n * (2t-1)^n -> 2 n!/n^n * n^2 * (2t-1)^(n-1)
+        if t <= 0.5:
+            return 0.0
+        if n <= 140:  # prd = n!/n^n * (2t-1)^(n-1), as a product of n-1 factors
+            prd = np.prod(np.arange(1, n) * (1.0 / n) * (2 * t - 1))
+        else:
+            prd = np.exp(_log_nfactorial_div_n_pow_n(n) + (n-1) * np.log(2 * t - 1))
+        return prd * 2 * n**2
+    if t >= n - 1:
+        # Ruben-Gambino : 1-2(1-x)**n -> 2n*(1-x)**(n-1)
+        return 2 * (1.0 - x) ** (n-1) * n
+    if x >= 0.5:  # NOTE(review): scipy.stats is not imported by this module itself
+        return 2 * scipy.stats.ksone.pdf(x, n)
+
+    # Otherwise differentiate the CDF numerically.  Just take a small delta.
+    # Ideally x +/- delta would stay within [i/n, (i+1)/n] for some integer i,
+    # as the CDF is a piecewise degree n polynomial.
+    # It has knots at 1/n, 2/n, ... (n-1)/n
+    # and is not a C-infinity function at the knots
+    delta = x / 2.0**16
+    delta = min(delta, x - 1.0/n)  # x > 1/n here, since t > 1
+    delta = min(delta, 0.5 - x)  # x < 0.5 here
+
+    def _kk(_x):
+        return kolmogn(n, _x)
+
+    return _derivative(_kk, x, dx=delta, order=5)
+
+
+def _kolmogni(n, p, q):
+    """Computes the PPF/ISF of kolmogn.
+
+    n of type integer, n>= 1 (nan is returned for any other n)
+    p is the CDF, q the SF, p+q=1
+    """
+    if np.isnan(n):
+        return n  # Keep the same type of nan
+    if int(n) != n or n <= 0:
+        return np.nan
+    if p <= 0:
+        return 1.0/n
+    if q <= 0:
+        return 1.0
+    delta = np.exp((np.log(p) - scipy.special.loggamma(n+1))/n)  # (p/n!)**(1/n)
+    if delta <= 1.0/n:  # inverts the n*x <= 1 formula n!/n**n * (2nx-1)**n
+        return (delta + 1.0 / n) / 2
+    x = -np.expm1(np.log(q/2.0)/n)  # 1 - (q/2)**(1/n), inverts SF = 2(1-x)**n
+    if x >= 1 - 1.0/n:
+        return x
+    x1 = scu._kolmogci(p)/np.sqrt(n)  # presumably the asymptotic quantile - TODO confirm
+    x1 = min(x1, 1.0 - 1.0/n)
+    # Otherwise solve _kolmogn(n, x) == p for x in [1/n, x1] with brentq.
+    def _f(x):
+        return _kolmogn(n, x) - p
+    # NOTE(review): scipy.optimize is not imported by this module itself.
+    return scipy.optimize.brentq(_f, 1.0/n, x1, xtol=1e-14)
+
+
+def kolmogn(n, x, cdf=True):
+    """Computes the CDF for the two-sided Kolmogorov-Smirnov distribution.
+
+    The two-sided Kolmogorov-Smirnov distribution has as its CDF Pr(D_n <= x),
+    for a sample of size n drawn from a distribution with CDF F(t), where
+    :math:`D_n = sup_t |F_n(t) - F(t)|`, and
+    :math:`F_n(t)` is the Empirical Cumulative Distribution Function of the sample.
+
+    Parameters
+    ----------
+    n : integer, array_like
+        the number of samples
+    x : float, array_like
+        The K-S statistic, float between 0 and 1
+    cdf : bool, optional
+        whether to compute the CDF(default=true) or the SF.
+
+    Returns
+    -------
+    cdf : ndarray
+        CDF (or SF if cdf is False) at the specified locations.
+
+    The return value has shape the result of numpy broadcasting n and x.
+    """
+    # Broadcast n, x and cdf together; the trailing None operand is the output.
+    it = np.nditer([n, x, cdf, None], flags=['zerosize_ok'],
+                   op_dtypes=[None, np.float64, np.bool_, np.float64])
+    for n_i, x_i, cdf_i, out in it:
+        if np.isnan(n_i):
+            out[...] = n_i  # a NaN sample size is passed through unchanged
+        elif int(n_i) != n_i:
+            raise ValueError(f'n is not integral: {n_i}')
+        else:
+            out[...] = _kolmogn(int(n_i), x_i, cdf=cdf_i)
+    return it.operands[-1]
+
+
+def kolmognp(n, x):
+    """Computes the PDF for the two-sided Kolmogorov-Smirnov distribution.
+
+    Parameters
+    ----------
+    n : integer, array_like
+        the number of samples
+    x : float, array_like
+        The K-S statistic, float between 0 and 1
+
+    Returns
+    -------
+    pdf : ndarray
+        The PDF at the specified locations, as float64 (NaN where n is NaN).
+
+    The return value has shape the result of numpy broadcasting n and x.
+    """
+    it = np.nditer([n, x, None], flags=['zerosize_ok'],  # accept empty inputs
+                   op_dtypes=[None, None, np.float64])  # never an integer output
+    for _n, _x, z in it:
+        if np.isnan(_n):
+            z[...] = _n
+        elif int(_n) != _n:
+            raise ValueError(f'n is not integral: {_n}')
+        else:
+            z[...] = _kolmogn_p(int(_n), _x)
+    return it.operands[-1]
+
+
+def kolmogni(n, q, cdf=True):
+    """Computes the PPF(or ISF) for the two-sided Kolmogorov-Smirnov distribution.
+
+    Parameters
+    ----------
+    n : integer, array_like
+        the number of samples
+    q : float, array_like
+        Probabilities, float between 0 and 1
+    cdf : bool, optional
+        whether to compute the PPF(default=true) or the ISF.
+
+    Returns
+    -------
+    ppf : ndarray
+        PPF (or ISF if cdf is False) at the specified locations, as float64
+
+    The return value has shape the result of numpy broadcasting n and q.
+    """
+    it = np.nditer([n, q, cdf, None], flags=['zerosize_ok'],  # accept empty inputs
+                   op_dtypes=[None, None, None, np.float64])  # no int truncation
+    for _n, _q, _cdf, z in it:
+        if np.isnan(_n):
+            z[...] = _n
+            continue
+        if int(_n) != _n:
+            raise ValueError(f'n is not integral: {_n}')
+        _pcdf, _psf = (_q, 1-_q) if _cdf else (1-_q, _q)
+        z[...] = _kolmogni(int(_n), _pcdf, _psf)
+    return it.operands[-1]
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_mannwhitneyu.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_mannwhitneyu.py
new file mode 100644
index 0000000000000000000000000000000000000000..053ffe8a264f1ef1e570456ecf7a6cb011a8f52c
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_mannwhitneyu.py
@@ -0,0 +1,490 @@
+import threading
+import numpy as np
+from collections import namedtuple
+from scipy._lib._array_api import array_namespace, xp_capabilities, xp_size, xp_promote
+from scipy._lib import array_api_extra as xpx
+from scipy import special
+from scipy import stats
+from scipy.stats._stats_py import _rankdata
+from ._axis_nan_policy import _axis_nan_policy_factory, _broadcast_concatenate
+
+class _MWU:
+    '''Null distribution of the Mann-Whitney U statistic for sizes (n1, n2).'''
+
+    def __init__(self, n1, n2):
+        self._reset(n1, n2)
+
+    def set_shapes(self, n1, n2):
+        '''Store the sizes as n1 <= n2; clear cached arrays if they changed.'''
+        small, large = min(n1, n2), max(n1, n2)
+        if (small, large) != (self.n1, self.n2):
+            self.n1 = small
+            self.n2 = large
+            # Empty caches; they are rebuilt on demand by the build_* methods.
+            self.s_array = np.zeros(0, dtype=int)
+            self.configurations = np.zeros(0, dtype=np.uint64)
+
+    def reset(self):
+        self._reset(self.n1, self.n2)
+
+    def _reset(self, n1, n2):
+        # Clearing the sizes first forces set_shapes to drop the caches.
+        self.n1 = self.n2 = None
+        self.set_shapes(n1, n2)
+
+    def pmf(self, k):
+        '''Probability mass function at the integer index(es) `k`.
+
+        The frequencies are tabulated only up to ``max(k)``.
+        '''
+        # In practice, `pmf` is never called with k > n1*n2/2.  If it were,
+        # the symmetry of the distribution could be exploited here by
+        # replacing k with n1*n2 - k wherever that is smaller.
+        largest = np.max(k)
+        freqs = self.build_u_freqs_array(largest)
+        return freqs[k]
+
+    def cdf(self, k):
+        '''Cumulative distribution function (includes the mass at `k`).'''
+
+        # In practice, `cdf` is never called with k > n1*n2/2.
+        # If it were, we'd exploit symmetry here rather than in `sf`
+        largest = np.max(k)
+        running_total = np.cumsum(self.build_u_freqs_array(largest))
+        return running_total[k]
+
+    def sf(self, k):
+        '''Survival function P(U >= k); like `cdf`, it includes the mass at k.'''
+        # The p-value is calculated from the SF and should include the mass
+        # at k, so counting that mass in both CDF and SF is desirable.
+        # The distribution is symmetric, so always sum from the left: where k
+        # lies below its complement use 1 - cdf(k) + pmf(k); elsewhere use
+        # the cdf at the complement n1*n2 - k.
+        mirrored = np.asarray(self.n1*self.n2 - k)
+        low = k < mirrored
+        use_low = np.any(low)
+        if use_low:
+            mirrored[low] = k[low]
+        out = np.asarray(self.cdf(mirrored))
+        if use_low:
+            out[low] = 1. - out[low] + self.pmf(mirrored[low])
+        return out[()]
+
+    # build_sigma_array and build_u_freqs_array adapted from code
+    # by @toobaz with permission. Thanks to @andreasloe for the suggestion.
+    # See https://github.com/scipy/scipy/pull/4933#issuecomment-1898082691
+    def build_sigma_array(self, a):
+        '''Return sigma[1..a], the signed divisor sums used by the recursion.
+
+        sigma[j] adds every divisor d of j with 1 <= d <= n1 and subtracts
+        every divisor d of j with n2 < d <= n1 + n2.  The full array
+        (including the unused index 0) is cached on the instance.
+        '''
+        n1, n2 = self.n1, self.n2
+        if a + 1 <= self.s_array.size:
+            return self.s_array[1:a+1]
+
+        sigma = np.zeros(a + 1, dtype=int)
+        signed_divisors = (
+            (1, np.arange(1, n1 + 1)),               # \epsilon_d = 1
+            (-1, np.arange(n2 + 1, n2 + n1 + 1)),    # \epsilon_d = -1
+        )
+        for sign, divisors in signed_divisors:
+            for d in divisors:
+                # All multiples of d up to a, except 0:
+                sigma[np.arange(d, a + 1, d)] += sign * d
+        self.s_array = sigma
+        return sigma[1:]  # index 0 is not needed
+
+    def build_u_freqs_array(self, maxu):
+        """
+        Return the probabilities of U = 0, 1, ..., maxu as an array.
+
+        Assumes n1 <= n2 and maxu <= n1 * n2 / 2.  The unnormalised counts
+        are cached on the instance and reused when they already cover maxu.
+        """
+        n1, n2 = self.n1, self.n2
+        total = special.binom(n1 + n2, n1)  # normalising constant
+
+        if maxu + 1 <= self.configurations.size:
+            return self.configurations[:maxu + 1] / total
+
+        sigma = self.build_sigma_array(maxu)  # sigma[0] here is sigma_1
+        uint_max = np.iinfo(np.uint64).max
+
+        # Start working with ints, for maximum precision and efficiency.
+        # There is exactly one way to have U=0.
+        counts = np.zeros(maxu + 1, dtype=np.uint64)
+        counts[0] = 1
+        counts_are_uint = True
+
+        for u in np.arange(1, maxu + 1):
+            # Recursion: u * counts[u] = sum over j < u of counts[j] * sigma_(u-j)
+            next_count = np.dot(counts[:u], sigma[u - 1::-1]) / u
+            if counts_are_uint and next_count > uint_max:
+                # The numbers got too big for uint64, so switch to floats
+                # from here on.  Using floats from the beginning would have
+                # lost precision (and Python long ints would be unbearably
+                # slow).
+                counts = counts.astype(float)
+                counts_are_uint = False
+            counts[u] = next_count
+
+        self.configurations = counts
+        return counts / total
+
+
+# Maintain state for faster repeat calls to `mannwhitneyu`.
+# _MWU() is calculated once per thread and stored as an attribute on
+# this thread-local variable inside mannwhitneyu().
+_mwu_state = threading.local()  # attributes set on it are separate per thread
+
+
+def _get_mwu_z(U, n1, n2, t, continuity=True, *, xp):
+    '''Standardize U: (U - mean [- 0.5]) / tie-corrected standard deviation.'''
+    # Follows mannwhitneyu [2]
+    n = n1 + n2
+    mu = n1 * n2 / 2
+
+    # Tie correction according to [2], "Normal approximation and tie correction"
+    # "A more computationally-efficient form..."
+    tie_term = xp.sum(t**3 - t, axis=-1)
+    variance = n1*n2/12 * ((n + 1) - tie_term/(n*(n-1)))
+    s = xp.sqrt(variance)
+
+    # Continuity correction.
+    # Because SF is always used to calculate the p-value, we can always
+    # _subtract_ 0.5 for the continuity correction. This always increases the
+    # p-value to account for the rest of the probability mass _at_ q = U.
+    numerator = U - mu
+    if continuity:
+        numerator -= 0.5
+
+    # no problem evaluating the norm SF at an infinity, so a division by
+    # s == 0 is allowed to produce inf/nan with its warnings silenced
+    with np.errstate(divide='ignore', invalid='ignore'):
+        return numerator / s
+
+
+def _mwu_input_validation(x, y, use_continuity, alternative, axis, method):
+    '''Validate the arguments of mannwhitneyu; return them standardized.'''
+    xp = array_namespace(x, y)
+    # Samples: at least 1-D, no NaNs, nonzero size, then passed to xp_promote.
+    x = xpx.atleast_nd(x, ndim=1)
+    y = xpx.atleast_nd(y, ndim=1)
+    if xp.any(xp.isnan(x)) or xp.any(xp.isnan(y)):
+        raise ValueError('`x` and `y` must not contain NaNs.')
+    if xp_size(x) == 0 or xp_size(y) == 0:
+        raise ValueError('`x` and `y` must be of nonzero size.')
+    x, y = xp_promote(x, y, force_floating=True, xp=xp)
+
+    bools = {True, False}
+    if use_continuity not in bools:
+        raise ValueError(f'`use_continuity` must be one of {bools}.')
+
+    alternative = alternative.lower()
+    alternatives = {"two-sided", "less", "greater"}
+    if alternative not in alternatives:
+        raise ValueError(f'`alternative` must be one of {alternatives}.')
+
+    axis_int = int(axis)
+    if axis != axis_int:
+        raise ValueError('`axis` must be an integer.')
+
+    if not isinstance(method, stats.PermutationMethod):
+        method = method.lower()
+        methods = {"asymptotic", "exact", "auto"}
+        if method not in methods:
+            raise ValueError(f'`method` must be one of {methods}.')
+    return x, y, use_continuity, alternative, axis_int, method, xp
+
+
+def _mwu_choose_method(n1, n2, ties):
+    """Pick method 'asymptotic' or 'exact' from the sample sizes and ties.
+
+    'asymptotic' is chosen when both sample sizes exceed 8 or when `ties`
+    is truthy; 'exact' is chosen otherwise.
+    """
+    both_large = n1 > 8 and n2 > 8
+    # Short-circuit: `ties` is only consulted when a sample is small.
+    if both_large or ties:
+        return "asymptotic"
+
+    return "exact"
+
+
+MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic', 'pvalue'))  # passed to _axis_nan_policy_factory below
+
+
+@xp_capabilities(cpu_only=True,  # exact calculation only implemented in NumPy
+                 skip_backends=[('cupy', 'needs rankdata'),
+                                ('dask.array', 'needs rankdata')],
+                 jax_jit=False)
+@_axis_nan_policy_factory(MannwhitneyuResult, n_samples=2)
+def mannwhitneyu(x, y, use_continuity=True, alternative="two-sided",
+                 axis=0, method="auto"):
+    r'''Perform the Mann-Whitney U rank test on two independent samples.
+
+    The Mann-Whitney U test is a nonparametric test of the null hypothesis
+    that the distribution underlying sample `x` is the same as the
+    distribution underlying sample `y`. It is often used as a test of
+    difference in location between distributions.
+
+    Parameters
+    ----------
+    x, y : array-like
+        N-d arrays of samples. The arrays must be broadcastable except along
+        the dimension given by `axis`.
+    use_continuity : bool, optional
+            Whether a continuity correction (1/2) should be applied.
+            Default is True when `method` is ``'asymptotic'``; has no effect
+            otherwise.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis. Default is 'two-sided'.
+        Let *SX(u)* and *SY(u)* be the survival functions of the
+        distributions underlying `x` and `y`, respectively. Then the following
+        alternative hypotheses are available:
+
+        * 'two-sided': the distributions are not equal, i.e. *SX(u) ≠ SY(u)* for
+          at least one *u*.
+        * 'less': the distribution underlying `x` is stochastically less
+          than the distribution underlying `y`, i.e. *SX(u) < SY(u)* for all *u*.
+        * 'greater': the distribution underlying `x` is stochastically greater
+          than the distribution underlying `y`, i.e. *SX(u) > SY(u)* for all *u*.
+
+        Under a more restrictive set of assumptions, the alternative hypotheses
+        can be expressed in terms of the locations of the distributions;
+        see [5]_ section 5.1.
+    axis : int, optional
+        Axis along which to perform the test. Default is 0.
+    method : {'auto', 'asymptotic', 'exact'} or `PermutationMethod` instance, optional
+        Selects the method used to calculate the *p*-value.
+        Default is 'auto'. The following options are available.
+
+        * ``'asymptotic'``: compares the standardized test statistic
+          against the normal distribution, correcting for ties.
+        * ``'exact'``: computes the exact *p*-value by comparing the observed
+          :math:`U` statistic against the exact distribution of the :math:`U`
+          statistic under the null hypothesis. No correction is made for ties.
+        * ``'auto'``: chooses ``'exact'`` when the size of one of the samples
+          is less than or equal to 8 and there are no ties;
+          chooses ``'asymptotic'`` otherwise.
+        * `PermutationMethod` instance. In this case, the p-value
+          is computed using `permutation_test` with the provided
+          configuration options and other appropriate settings.
+
+    Returns
+    -------
+    res : MannwhitneyuResult
+        An object containing attributes:
+
+        statistic : float
+            The Mann-Whitney U statistic corresponding with sample `x`. See
+            Notes for the test statistic corresponding with sample `y`.
+        pvalue : float
+            The associated *p*-value for the chosen `alternative`.
+
+    Notes
+    -----
+    If ``U1`` is the statistic corresponding with sample `x`, then the
+    statistic corresponding with sample `y` is
+    ``U2 = x.shape[axis] * y.shape[axis] - U1``.
+
+    `mannwhitneyu` is for independent samples. For related / paired samples,
+    consider `scipy.stats.wilcoxon`.
+
+    `method` ``'exact'`` is recommended when there are no ties and when either
+    sample size is less than 8 [1]_. The implementation follows the algorithm
+    reported in [3]_.
+    Note that the exact method is *not* corrected for ties, but
+    `mannwhitneyu` will not raise errors or warnings if there are ties in the
+    data. If there are ties and either samples is small (fewer than ~10
+    observations), consider passing an instance of `PermutationMethod`
+    as the `method` to perform a permutation test.
+
+    The Mann-Whitney U test is a non-parametric version of the t-test for
+    independent samples. When the means of samples from the populations
+    are normally distributed, consider `scipy.stats.ttest_ind`.
+
+    See Also
+    --------
+    scipy.stats.wilcoxon, scipy.stats.ranksums, scipy.stats.ttest_ind
+
+    References
+    ----------
+    .. [1] H.B. Mann and D.R. Whitney, "On a test of whether one of two random
+           variables is stochastically larger than the other", The Annals of
+           Mathematical Statistics, Vol. 18, pp. 50-60, 1947.
+    .. [2] Mann-Whitney U Test, Wikipedia,
+           http://en.wikipedia.org/wiki/Mann-Whitney_U_test
+    .. [3] Andreas Löffler,
+           "Über eine Partition der nat. Zahlen und ihr Anwendung beim U-Test",
+           Wiss. Z. Univ. Halle, XXXII'83 pp. 87-89.
+    .. [4] Rosie Shier, "Statistics: 2.3 The Mann-Whitney U Test", Mathematics
+           Learning Support Centre, 2004.
+    .. [5] Michael P. Fay and Michael A. Proschan. "Wilcoxon-Mann-Whitney
+           or t-test? On assumptions for hypothesis tests and multiple \
+           interpretations of decision rules." Statistics surveys, Vol. 4, pp.
+           1-39, 2010. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2857732/
+
+    Examples
+    --------
+    We follow the example from [4]_: nine randomly sampled young adults were
+    diagnosed with type II diabetes at the ages below.
+
+    >>> males = [19, 22, 16, 29, 24]
+    >>> females = [20, 11, 17, 12]
+
+    We use the Mann-Whitney U test to assess whether there is a statistically
+    significant difference in the diagnosis age of males and females.
+    The null hypothesis is that the distribution of male diagnosis ages is
+    the same as the distribution of female diagnosis ages. We decide
+    that a confidence level of 95% is required to reject the null hypothesis
+    in favor of the alternative that the distributions are different.
+    Since the number of samples is very small and there are no ties in the
+    data, we can compare the observed test statistic against the *exact*
+    distribution of the test statistic under the null hypothesis.
+
+    >>> from scipy.stats import mannwhitneyu
+    >>> U1, p = mannwhitneyu(males, females, method="exact")
+    >>> print(U1)
+    17.0
+
+    `mannwhitneyu` always reports the statistic associated with the first
+    sample, which, in this case, is males. This agrees with :math:`U_M = 17`
+    reported in [4]_. The statistic associated with the second statistic
+    can be calculated:
+
+    >>> nx, ny = len(males), len(females)
+    >>> U2 = nx*ny - U1
+    >>> print(U2)
+    3.0
+
+    This agrees with :math:`U_F = 3` reported in [4]_. The two-sided
+    *p*-value can be calculated from either statistic, and the value produced
+    by `mannwhitneyu` agrees with :math:`p = 0.11` reported in [4]_.
+
+    >>> print(p)
+    0.1111111111111111
+
+    The exact distribution of the test statistic is asymptotically normal, so
+    the example continues by comparing the exact *p*-value against the
+    *p*-value produced using the normal approximation.
+
+    >>> _, pnorm = mannwhitneyu(males, females, method="asymptotic")
+    >>> print(pnorm)
+    0.11134688653314041
+
+    Here `mannwhitneyu`'s reported *p*-value appears to conflict with the
+    value :math:`p = 0.09` given in [4]_. The reason is that [4]_
+    does not apply the continuity correction performed by `mannwhitneyu`;
+    `mannwhitneyu` reduces the distance between the test statistic and the
+    mean :math:`\mu = n_x n_y / 2` by 0.5 to correct for the fact that the
+    discrete statistic is being compared against a continuous distribution.
+    Here, the :math:`U` statistic used is less than the mean, so we reduce
+    the distance by adding 0.5 in the numerator.
+
+    >>> import numpy as np
+    >>> from scipy.stats import norm
+    >>> U = min(U1, U2)
+    >>> N = nx + ny
+    >>> z = (U - nx*ny/2 + 0.5) / np.sqrt(nx*ny * (N + 1)/ 12)
+    >>> p = 2 * norm.cdf(z)  # use CDF to get p-value from smaller statistic
+    >>> print(p)
+    0.11134688653314041
+
+    If desired, we can disable the continuity correction to get a result
+    that agrees with that reported in [4]_.
+
+    >>> _, pnorm = mannwhitneyu(males, females, use_continuity=False,
+    ...                         method="asymptotic")
+    >>> print(pnorm)
+    0.0864107329737
+
+    Regardless of whether we perform an exact or asymptotic test, the
+    probability of the test statistic being as extreme or more extreme by
+    chance exceeds 5%, so we do not consider the results statistically
+    significant.
+
+    Suppose that, before seeing the data, we had hypothesized that females
+    would tend to be diagnosed at a younger age than males.
+    In that case, it would be natural to provide the female ages as the
+    first input, and we would have performed a one-sided test using
+    ``alternative = 'less'``: females are diagnosed at an age that is
+    stochastically less than that of males.
+
+    >>> res = mannwhitneyu(females, males, alternative="less", method="exact")
+    >>> print(res)
+    MannwhitneyuResult(statistic=3.0, pvalue=0.05555555555555555)
+
+    Again, the probability of getting a sufficiently low value of the
+    test statistic by chance under the null hypothesis is greater than 5%,
+    so we do not reject the null hypothesis in favor of our alternative.
+
+    If it is reasonable to assume that the means of samples from the
+    populations are normally distributed, we could have used a t-test to
+    perform the analysis.
+
+    >>> from scipy.stats import ttest_ind
+    >>> res = ttest_ind(females, males, alternative="less")
+    >>> print(res)
+    TtestResult(statistic=-2.239334696520584,
+                pvalue=0.030068441095757924,
+                df=7.0)
+
+    Under this assumption, the *p*-value would be low enough to reject the
+    null hypothesis in favor of the alternative.
+
+    '''
+
+    x, y, use_continuity, alternative, axis_int, method, xp = (
+        _mwu_input_validation(x, y, use_continuity, alternative, axis, method))
+
+    xy = _broadcast_concatenate((x, y), axis)
+
+    n1, n2 = x.shape[-1], y.shape[-1]  # _axis_nan_policy decorator ensures axis=-1
+
+    # Follows [2]
+    ranks, t = _rankdata(xy, 'average', return_ties=True)  # method 2, step 1
+    ranks = xp.astype(ranks, x.dtype, copy=False)
+    t = xp.astype(t, x.dtype, copy=False)
+    R1 = xp.sum(ranks[..., :n1], axis=-1)                  # method 2, step 2
+    U1 = R1 - n1*(n1+1)/2                                  # method 2, step 3
+    U2 = n1 * n2 - U1                                      # as U1 + U2 = n1 * n2
+
+    if alternative == "greater":
+        U, f = U1, 1  # U is the statistic to use for p-value, f is a factor
+    elif alternative == "less":
+        U, f = U2, 1  # Due to symmetry, use SF of U2 rather than CDF of U1
+    else:
+        U, f = xp.maximum(U1, U2), 2  # multiply SF by two for two-sided test
+
+    if method == "auto":
+        method = _mwu_choose_method(n1, n2, xp.any(t > 1))
+
+    if method == "exact":
+        if not hasattr(_mwu_state, 's'):
+            _mwu_state.s = _MWU(0, 0)
+        _mwu_state.s.set_shapes(n1, n2)
+        p = xp.asarray(_mwu_state.s.sf(np.asarray(U, np.int64)), dtype=x.dtype)
+    elif method == "asymptotic":
+        z = _get_mwu_z(U, n1, n2, t, continuity=use_continuity, xp=xp)
+        p = special.ndtr(-z)
+    else:  # `PermutationMethod` instance (already validated)
+        def statistic(x, y, axis):
+            return mannwhitneyu(x, y, use_continuity=use_continuity,
+                                alternative=alternative, axis=axis,
+                                method="asymptotic").statistic
+
+        res = stats.permutation_test((x, y), statistic, axis=axis,
+                                     **method._asdict(), alternative=alternative)
+        p = res.pvalue
+        f = 1
+
+    p *= f
+
+    # Ensure that test statistic is not greater than 1
+    # This could happen for exact test when U = m*n/2
+    p = xp.clip(p, 0., 1.)
+
+    return MannwhitneyuResult(U1, p)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_mgc.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_mgc.py
new file mode 100644
index 0000000000000000000000000000000000000000..35580c36cd487a53cb4df2e214ae85ac00d28873
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_mgc.py
@@ -0,0 +1,552 @@
+import warnings
+import numpy as np
+
+from scipy._lib._array_api import xp_capabilities
+from scipy._lib._util import check_random_state, MapWrapper, rng_integers, _contains_nan
+from scipy._lib._bunch import _make_tuple_bunch
+from scipy.spatial.distance import cdist
+from scipy.ndimage import _measurements
+
+from ._stats import _local_correlations  # type: ignore[import-not-found]
+from . import distributions
+
+__all__ = ['multiscale_graphcorr']
+
+# FROM MGCPY: https://github.com/neurodata/mgcpy
+
+
+class _ParallelP:
+    """Callable helper that computes the MGC statistic for one permutation."""
+
+    def __init__(self, x, y, random_states):
+        self.x = x
+        self.y = y
+        self.random_states = random_states
+
+    def __call__(self, index):
+        order = self.random_states[index].permutation(self.y.shape[0])
+        permy = self.y[order][:, order]
+
+        # statistic for `y` with rows and columns permuted by the same order
+        perm_stat = _mgc_stat(self.x, permy)[0]
+
+        return perm_stat
+
+
+def _perm_test(x, y, stat, reps=1000, workers=-1, random_state=None):
+    r"""Helper function that estimates the permutation p-value.
+
+    Parameters
+    ----------
+    x, y : ndarray
+        Inputs to `_mgc_stat`; `y` is permuted along rows and columns.
+    stat : float
+        The sample test statistic.
+    reps : int, optional
+        The number of replications used to estimate the null when using the
+        permutation test. The default is 1000 replications.
+    workers : int or map-like callable, optional
+        If `workers` is an int the population is subdivided into `workers`
+        sections and evaluated in parallel (uses
+        `multiprocessing.Pool <multiprocessing>`). Supply `-1` to use all cores
+        available to the Process. Alternatively supply a map-like callable,
+        such as `multiprocessing.Pool.map` for evaluating the population in
+        parallel. This evaluation is carried out as `workers(func, iterable)`.
+        Requires that `func` be pickleable.
+    random_state : {None, int, `numpy.random.Generator`,
+                    `numpy.random.RandomState`}, optional
+
+        If `random_state` is None (or `np.random`), the
+        `numpy.random.RandomState` singleton is used.
+        If `random_state` is an int, a new ``RandomState`` instance is used,
+        seeded with `random_state`.
+        If `random_state` is already a ``Generator`` or ``RandomState``
+        instance then that instance is used.
+
+    Returns
+    -------
+    pvalue : float
+        The sample test p-value.
+    null_dist : ndarray
+        The approximated null distribution (one statistic per replication).
+
+    """
+    # generate seeds for each rep (change to new parallel random number
+    # capabilities in numpy >= 1.17+)
+    random_state = check_random_state(random_state)
+    random_states = [np.random.RandomState(rng_integers(random_state, 1 << 32,
+                     size=4, dtype=np.uint32)) for _ in range(reps)]
+
+    # parallelizes with specified workers over number of reps and set seeds
+    parallelp = _ParallelP(x=x, y=y, random_states=random_states)
+    with MapWrapper(workers) as mapwrapper:
+        null_dist = np.array(list(mapwrapper(parallelp, range(reps))))
+
+    # permutation p-value with the +1 correction (so it is never exactly zero)
+    pvalue = (1 + (null_dist >= stat).sum()) / (1 + reps)
+
+    return pvalue, null_dist
+
+
+def _euclidean_dist(x):
+    return cdist(x, x)  # pairwise distances among the rows of x
+
+
+MGCResult = _make_tuple_bunch('MGCResult',
+                              ['statistic', 'pvalue', 'mgc_dict'], [])
+
+
+@xp_capabilities(np_only=True)
+def multiscale_graphcorr(x, y, compute_distance=_euclidean_dist, reps=1000,
+                         workers=1, is_twosamp=False, random_state=None):
+    r"""Computes the Multiscale Graph Correlation (MGC) test statistic.
+
+    Specifically, for each point, MGC finds the :math:`k`-nearest neighbors for
+    one property (e.g. cloud density), and the :math:`l`-nearest neighbors for
+    the other property (e.g. grass wetness) [1]_. This pair :math:`(k, l)` is
+    called the "scale". A priori, however, it is not known which scales will be
+    most informative. So, MGC computes all distance pairs, and then efficiently
+    computes the distance correlations for all scales. The local correlations
+    illustrate which scales are relatively informative about the relationship.
+    The key, therefore, to successfully discover and decipher relationships
+    between disparate data modalities is to adaptively determine which scales
+    are the most informative, and the geometric implication for the most
+    informative scales. Doing so not only provides an estimate of whether the
+    modalities are related, but also provides insight into how the
+    determination was made. This is especially important in high-dimensional
+    data, where simple visualizations do not reveal relationships to the
+    unaided human eye. Characterizations of this implementation in particular
+    have been derived from and benchmarked in [2]_.
+
+    Parameters
+    ----------
+    x, y : ndarray
+        If ``x`` and ``y`` have shapes ``(n, p)`` and ``(n, q)`` where `n` is
+        the number of samples and `p` and `q` are the number of dimensions,
+        then the MGC independence test will be run.  Alternatively, ``x`` and
+        ``y`` can have shapes ``(n, n)`` if they are distance or similarity
+        matrices, and ``compute_distance`` must be set to ``None``. If ``x``
+        and ``y`` have shapes ``(n, p)`` and ``(m, p)``, an unpaired
+        two-sample MGC test will be run.
+    compute_distance : callable, optional
+        A function that computes the distance or similarity among the samples
+        within each data matrix. Set to ``None`` if ``x`` and ``y`` are
+        already distance matrices. The default uses the euclidean norm metric.
+        If you are calling a custom function, either create the distance
+        matrix before-hand or create a function of the form
+        ``compute_distance(x)`` where `x` is the data matrix for which
+        pairwise distances are calculated.
+    reps : int, optional
+        The number of replications used to estimate the null when using the
+        permutation test. The default is ``1000``.
+    workers : int or map-like callable, optional
+        If ``workers`` is an int the population is subdivided into ``workers``
+        sections and evaluated in parallel (uses ``multiprocessing.Pool
+        <multiprocessing>``). Supply ``-1`` to use all cores available to the
+        Process. Alternatively supply a map-like callable, such as
+        ``multiprocessing.Pool.map`` for evaluating the p-value in parallel.
+        This evaluation is carried out as ``workers(func, iterable)``.
+        Requires that `func` be pickleable. The default is ``1``.
+    is_twosamp : bool, optional
+        If `True`, a two sample test will be run. If ``x`` and ``y`` have
+        shapes ``(n, p)`` and ``(m, p)``, this option will be overridden and
+        set to ``True``. Set to ``True`` if ``x`` and ``y`` both have shapes
+        ``(n, p)`` and a two sample test is desired. The default is ``False``.
+        Note that this will not run if inputs are distance matrices.
+    random_state : {None, int, `numpy.random.Generator`,
+                    `numpy.random.RandomState`}, optional
+
+        If `random_state` is None (or `np.random`), the
+        `numpy.random.RandomState` singleton is used.
+        If `random_state` is an int, a new ``RandomState`` instance is used,
+        seeded with `random_state`.
+        If `random_state` is already a ``Generator`` or ``RandomState``
+        instance then that instance is used.
+
+    Returns
+    -------
+    res : MGCResult
+        An object containing attributes:
+
+        statistic : float
+            The sample MGC test statistic within ``[-1, 1]``.
+        pvalue : float
+            The p-value obtained via permutation.
+        mgc_dict : dict
+            Contains additional useful results:
+
+                - mgc_map : ndarray
+                    A 2D representation of the latent geometry of the
+                    relationship.
+                - opt_scale : (int, int)
+                    The estimated optimal scale as a ``(x, y)`` pair.
+                - null_dist : ndarray
+                    The null distribution derived from the permuted matrices.
+
+    See Also
+    --------
+    pearsonr : Pearson correlation coefficient and p-value for testing
+               non-correlation.
+    kendalltau : Calculates Kendall's tau.
+    spearmanr : Calculates a Spearman rank-order correlation coefficient.
+
+    Notes
+    -----
+    A description of the process of MGC and applications on neuroscience data
+    can be found in [1]_. It is performed using the following steps:
+
+    #. Two distance matrices :math:`D^X` and :math:`D^Y` are computed and
+       modified to be mean zero columnwise. This results in two
+       :math:`n \times n` distance matrices :math:`A` and :math:`B` (the
+       centering and unbiased modification) [3]_.
+
+    #. For all values :math:`k` and :math:`l` from :math:`1, ..., n`,
+
+       * The :math:`k`-nearest neighbor and :math:`l`-nearest neighbor graphs
+         are calculated for each property. Here, :math:`G_k (i, j)` indicates
+         the :math:`k`-smallest values of the :math:`i`-th row of :math:`A`
+         and :math:`H_l (i, j)` indicates the :math:`l`-smallest values of
+         the :math:`i`-th row of :math:`B`
+
+       * Let :math:`\circ` denote the entry-wise matrix product, then local
+         correlations are summed and normalized using the following statistic:
+
+    .. math::
+
+        c^{kl} = \frac{\sum_{ij} A G_k B H_l}
+                      {\sqrt{\sum_{ij} A^2 G_k \times \sum_{ij} B^2 H_l}}
+
+    #. The MGC test statistic is the smoothed optimal local correlation of
+       :math:`\{ c^{kl} \}`. Denote the smoothing operation as :math:`R(\cdot)`
+       (which essentially sets all isolated large correlations to 0 and keeps
+       connected large correlations the same as before, see [3]_). MGC is,
+
+    .. math::
+
+        MGC_n (x, y) = \max_{(k, l)} R \left(c^{kl} \left( x_n, y_n \right)
+                                                    \right)
+
+    The test statistic returns a value between :math:`(-1, 1)` since it is
+    normalized.
+
+    The p-value returned is calculated using a permutation test. This process
+    is completed by first randomly permuting :math:`y` to estimate the null
+    distribution and then calculating the probability of observing a test
+    statistic, under the null, at least as extreme as the observed test
+    statistic.
+
+    MGC requires at least 5 samples to run with reliable results. It can also
+    handle high-dimensional data sets.
+    In addition, by manipulating the input data matrices, the two-sample
+    testing problem can be reduced to the independence testing problem [4]_.
+    Given sample data :math:`U` and :math:`V` of sizes :math:`p \times n` and
+    :math:`p \times m`, data matrices :math:`X` and :math:`Y` can be created as
+    follows:
+
+    .. math::
+
+        X = [U | V] \in \mathcal{R}^{p \times (n + m)}
+        Y = [0_{1 \times n} | 1_{1 \times m}] \in \mathcal{R}^{(n + m)}
+
+    Then, the MGC statistic can be calculated as normal. This methodology can
+    be extended to similar tests such as distance correlation [4]_.
+
+    .. versionadded:: 1.4.0
+
+    References
+    ----------
+    .. [1] Vogelstein, J. T., Bridgeford, E. W., Wang, Q., Priebe, C. E.,
+           Maggioni, M., & Shen, C. (2019). Discovering and deciphering
+           relationships across disparate data modalities. ELife.
+    .. [2] Panda, S., Palaniappan, S., Xiong, J., Swaminathan, A.,
+           Ramachandran, S., Bridgeford, E. W., ... Vogelstein, J. T. (2019).
+           mgcpy: A Comprehensive High Dimensional Independence Testing Python
+           Package. :arXiv:`1907.02088`
+    .. [3] Shen, C., Priebe, C.E., & Vogelstein, J. T. (2019). From distance
+           correlation to multiscale graph correlation. Journal of the American
+           Statistical Association.
+    .. [4] Shen, C. & Vogelstein, J. T. (2018). The Exact Equivalence of
+           Distance and Kernel Methods for Hypothesis Testing.
+           :arXiv:`1806.05514`
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import multiscale_graphcorr
+    >>> x = np.arange(100)
+    >>> y = x
+    >>> res = multiscale_graphcorr(x, y)
+    >>> res.statistic, res.pvalue
+    (1.0, 0.001)
+
+    To run an unpaired two-sample test,
+
+    >>> x = np.arange(100)
+    >>> y = np.arange(79)
+    >>> res = multiscale_graphcorr(x, y)
+    >>> res.statistic, res.pvalue  # doctest: +SKIP
+    (0.033258146255703246, 0.023)
+
+    or, if the shapes of the inputs are the same,
+
+    >>> x = np.arange(100)
+    >>> y = x
+    >>> res = multiscale_graphcorr(x, y, is_twosamp=True)
+    >>> res.statistic, res.pvalue  # doctest: +SKIP
+    (-0.008021809890200488, 1.0)
+
+    """
+    if not isinstance(x, np.ndarray) or not isinstance(y, np.ndarray):
+        raise ValueError("x and y must be ndarrays")
+
+    # convert arrays of type (n,) to (n, 1)
+    if x.ndim == 1:
+        x = x[:, np.newaxis]
+    elif x.ndim != 2:
+        raise ValueError(f"Expected a 2-D array `x`, found shape {x.shape}")
+    if y.ndim == 1:
+        y = y[:, np.newaxis]
+    elif y.ndim != 2:
+        raise ValueError(f"Expected a 2-D array `y`, found shape {y.shape}")
+
+    nx, px = x.shape
+    ny, py = y.shape
+
+    # check for NaNs
+    _contains_nan(x, nan_policy='raise')
+    _contains_nan(y, nan_policy='raise')
+
+    # check for positive or negative infinity and raise error
+    if np.sum(np.isinf(x)) > 0 or np.sum(np.isinf(y)) > 0:
+        raise ValueError("Inputs contain infinities")
+
+    if nx != ny:
+        if px == py:
+            # differing sample counts, same dimension: force two-sample test
+            is_twosamp = True
+        else:
+            raise ValueError("Shape mismatch, x and y must have shape [n, p] "
+                             "and [n, q] or have shape [n, p] and [m, p].")
+
+    if nx < 5 or ny < 5:
+        raise ValueError("MGC requires at least 5 samples to give reasonable "
+                         "results.")
+
+    # convert x and y to float
+    x = x.astype(np.float64)
+    y = y.astype(np.float64)
+
+    # check that compute_distance is either a callable or None
+    if not callable(compute_distance) and compute_distance is not None:
+        raise ValueError("Compute_distance must be a function.")
+
+    # reps must be a non-negative int; NOTE: 0 passes this check even though
+    # the message says "greater than 0". Values under 1000 only warn.
+    if not isinstance(reps, int) or reps < 0:
+        raise ValueError("Number of reps must be an integer greater than 0.")
+    elif reps < 1000:
+        msg = ("The number of replications is low (under 1000), and p-value "
+               "calculations may be unreliable. Use the p-value result, with "
+               "caution!")
+        warnings.warn(msg, RuntimeWarning, stacklevel=2)
+
+    if is_twosamp:
+        if compute_distance is None:
+            raise ValueError("Cannot run if inputs are distance matrices")
+        x, y = _two_sample_transform(x, y)
+
+    if compute_distance is not None:
+        # compute distance matrices for x and y
+        x = compute_distance(x)
+        y = compute_distance(y)
+
+    # calculate MGC stat
+    stat, stat_dict = _mgc_stat(x, y)
+    stat_mgc_map = stat_dict["stat_mgc_map"]
+    opt_scale = stat_dict["opt_scale"]
+
+    # calculate permutation MGC p-value
+    pvalue, null_dist = _perm_test(x, y, stat, reps=reps, workers=workers,
+                                   random_state=random_state)
+
+    # save all stats (other than stat/p-value) in dictionary
+    mgc_dict = {"mgc_map": stat_mgc_map,
+                "opt_scale": opt_scale,
+                "null_dist": null_dist}
+
+    # create result object with alias for backward compatibility
+    res = MGCResult(stat, pvalue, mgc_dict)
+    res.stat = stat
+    return res
+
+
+def _mgc_stat(distx, disty):
+    r"""Helper function that calculates the MGC stat. See above for use.
+
+    Parameters
+    ----------
+    distx, disty : ndarray
+        `distx` and `disty` have shapes ``(n, p)`` and ``(n, q)`` or
+        ``(n, n)`` and ``(n, n)``
+        if distance matrices.
+
+    Returns
+    -------
+    stat : float
+        The sample MGC test statistic within ``[-1, 1]``.
+    stat_dict : dict
+        Contains additional useful returns with the following
+        keys:
+
+            - stat_mgc_map : ndarray
+                MGC-map of the statistics.
+            - opt_scale : int or list of int
+                ``m * n`` for a one-row/one-column map, else a ``[k, l]`` pair.
+
+    """
+    # calculate MGC map and optimal scale
+    stat_mgc_map = _local_correlations(distx, disty, global_corr='mgc')
+
+    n, m = stat_mgc_map.shape
+    if m == 1 or n == 1:
+        # the global scale is the statistic calculated at maximal nearest
+        # neighbors, i.e. the last row and last column of the (n, m) map.
+        # There is not enough local scale to search over, so use it directly
+        stat = stat_mgc_map[n - 1][m - 1]
+        opt_scale = m * n
+    else:
+        samp_size = len(distx) - 1
+
+        # threshold to find connected region of significant local correlations
+        sig_connect = _threshold_mgc_map(stat_mgc_map, samp_size)
+
+        # maximum within the significant region
+        stat, opt_scale = _smooth_mgc_map(sig_connect, stat_mgc_map)
+
+    stat_dict = {"stat_mgc_map": stat_mgc_map,
+                 "opt_scale": opt_scale}
+
+    return stat, stat_dict
+
+
+def _threshold_mgc_map(stat_mgc_map, samp_size):
+    r"""
+    Finds a connected region of significance in the MGC-map by thresholding.
+
+    Parameters
+    ----------
+    stat_mgc_map : ndarray
+        All local correlations within ``[-1,1]``.
+    samp_size : int
+        Sample size used in the threshold; `_mgc_stat` passes ``len(distx) - 1``.
+
+    Returns
+    -------
+    sig_connect : ndarray
+        A boolean matrix, True in the significant region; ``[[False]]`` if none.
+
+    """
+    m, n = stat_mgc_map.shape
+
+    # 0.02 is simply an empirical threshold, this can be set to 0.01 or 0.05
+    # with varying levels of performance. Threshold is based on a beta
+    # approximation.
+    per_sig = 1 - (0.02 / samp_size)  # Percentile to consider as significant
+    threshold = samp_size * (samp_size - 3)/4 - 1/2  # Beta approximation
+    threshold = distributions.beta.ppf(per_sig, threshold, threshold) * 2 - 1
+
+    # the global scale is the statistic calculated at maximal nearest
+    # neighbors. Threshold is the maximum on the global and local scales
+    threshold = max(threshold, stat_mgc_map[m - 1][n - 1])
+
+    # find the largest connected component of significant correlations
+    sig_connect = stat_mgc_map > threshold
+    if np.sum(sig_connect) > 0:
+        sig_connect, _ = _measurements.label(sig_connect)
+        _, label_counts = np.unique(sig_connect, return_counts=True)
+
+        # skip the first element in label_counts, as it is count(zeros)
+        max_label = np.argmax(label_counts[1:]) + 1
+        sig_connect = sig_connect == max_label
+    else:
+        sig_connect = np.array([[False]])
+
+    return sig_connect
+
+
+def _smooth_mgc_map(sig_connect, stat_mgc_map):
+    """Finds the smoothed maximal within the significant region R.
+
+    If area of R is too small it returns the last local correlation. Otherwise,
+    returns the maximum within R, provided it is not below that correlation.
+
+    Parameters
+    ----------
+    sig_connect : ndarray
+        A binary matrix with 1's indicating the significant region.
+    stat_mgc_map : ndarray
+        All local correlations within ``[-1, 1]``.
+
+    Returns
+    -------
+    stat : float
+        The sample MGC statistic within ``[-1, 1]``.
+    opt_scale : list of int
+        The estimated optimal scale as a 1-based ``[x, y]`` pair.
+
+    """
+    m, n = stat_mgc_map.shape
+
+    # the global scale is the statistic calculated at maximal nearest
+    # neighbors. By default, statistic and optimal scale are global.
+    stat = stat_mgc_map[m - 1][n - 1]
+    opt_scale = [m, n]
+
+    if np.linalg.norm(sig_connect) != 0:
+        # proceed only when the connected region's area is sufficiently large
+        # 0.02 is simply an empirical threshold, this can be set to 0.01 or 0.05
+        # with varying levels of performance
+        if np.sum(sig_connect) >= np.ceil(0.02 * max(m, n)) * min(m, n):
+            max_corr = max(stat_mgc_map[sig_connect])
+
+            # find all scales within significant_connected_region that maximize
+            # the local correlation
+            max_corr_index = np.where((stat_mgc_map >= max_corr) & sig_connect)
+
+            if max_corr >= stat:
+                stat = max_corr
+
+                k, l = max_corr_index
+                one_d_indices = k * n + l  # 2D to 1D indexing
+                k = np.max(one_d_indices) // n
+                l = np.max(one_d_indices) % n
+                opt_scale = [k+1, l+1]  # adding 1s to match R indexing
+
+    return stat, opt_scale
+
+
+def _two_sample_transform(u, v):
+    """Helper function that concatenates u and v for two sample MGC stat.
+
+    See above for use.
+
+    Parameters
+    ----------
+    u, v : ndarray
+        `u` and `v` have shapes ``(n, p)`` and ``(m, p)``.
+
+    Returns
+    -------
+    x : ndarray
+        Concatenate `u` and `v` along the ``axis = 0``. `x` thus has shape
+        ``(n + m, p)``.
+    y : ndarray
+        Label matrix for `x` where 0 refers to samples that come from `u` and
+        1 refers to samples that come from `v`. `y` thus has shape ``(n + m, 1)``.
+
+    """
+    nx = u.shape[0]
+    ny = v.shape[0]
+    x = np.concatenate([u, v], axis=0)
+    y = np.concatenate([np.zeros(nx), np.ones(ny)], axis=0).reshape(-1, 1)
+    return x, y
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_morestats.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_morestats.py
new file mode 100644
index 0000000000000000000000000000000000000000..f32cf59103de4cac866df1d0de824ae2a480d21a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_morestats.py
@@ -0,0 +1,4862 @@
+import itertools
+import math
+import warnings
+import threading
+from collections import namedtuple
+
+import numpy as np
+from numpy import (isscalar, log, around, zeros,
+                   arange, sort, amin, amax, sqrt, array,
+                   pi, exp, ravel, count_nonzero)
+
+from scipy import optimize, special, interpolate, stats
+from scipy._lib._bunch import _make_tuple_bunch
+from scipy._lib._util import _rename_parameter, _contains_nan, _get_nan
+from scipy._lib.deprecation import _NoValue
+import scipy._lib.array_api_extra as xpx
+
+from scipy._lib._array_api import (
+    array_namespace,
+    is_marray,
+    xp_capabilities,
+    is_numpy,
+    is_jax,
+    is_dask,
+    xp_size,
+    xp_vector_norm,
+    xp_promote,
+    xp_result_type,
+    xp_device,
+    xp_ravel,
+    _length_nonmasked,
+)
+
+from ._ansari_swilk_statistics import gscale, swilk
+from . import _stats_py, _wilcoxon
+from ._fit import FitResult
+from ._stats_py import (_get_pvalue, SignificanceResult,  # noqa:F401
+                        _SimpleNormal, _SimpleChi2, _SimpleF)
+from .contingency import chi2_contingency
+from . import distributions
+from ._distn_infrastructure import rv_generic
+from ._axis_nan_policy import _axis_nan_policy_factory, _broadcast_arrays
+
+
+__all__ = ['mvsdist',
+           'bayes_mvs', 'kstat', 'kstatvar', 'probplot', 'ppcc_max', 'ppcc_plot',
+           'boxcox_llf', 'boxcox', 'boxcox_normmax', 'boxcox_normplot',
+           'shapiro', 'anderson', 'ansari', 'bartlett', 'levene',
+           'fligner', 'mood', 'wilcoxon', 'median_test',
+           'circmean', 'circvar', 'circstd', 'anderson_ksamp',
+           'yeojohnson_llf', 'yeojohnson', 'yeojohnson_normmax',
+           'yeojohnson_normplot', 'directional_stats',
+           'false_discovery_control'
+           ]
+
+
+Mean = namedtuple('Mean', ('statistic', 'minmax'))
+Variance = namedtuple('Variance', ('statistic', 'minmax'))
+Std_dev = namedtuple('Std_dev', ('statistic', 'minmax'))
+
+
+@xp_capabilities(np_only=True)
+def bayes_mvs(data, alpha=0.90):
+    r"""
+    Bayesian confidence intervals for the mean, var, and std.
+
+    Parameters
+    ----------
+    data : array_like
+        Input data, if multi-dimensional it is flattened to 1-D by `bayes_mvs`.
+        Requires 2 or more data points.
+    alpha : float, optional
+        Probability that the returned confidence interval contains
+        the true parameter.
+
+    Returns
+    -------
+    mean_cntr, var_cntr, std_cntr : tuple
+        The three results are for the mean, variance and standard deviation,
+        respectively.  Each result is a tuple of the form::
+
+            (center, (lower, upper))
+
+        with ``center`` the mean of the conditional pdf of the value given the
+        data, and ``(lower, upper)`` a confidence interval, centered on the
+        median, containing the estimate to a probability ``alpha``.
+
+    See Also
+    --------
+    mvsdist
+
+    Notes
+    -----
+    Each tuple of mean, variance, and standard deviation estimates represent
+    the (center, (lower, upper)) with center the mean of the conditional pdf
+    of the value given the data and (lower, upper) is a confidence interval
+    centered on the median, containing the estimate to a probability
+    ``alpha``.
+
+    Converts data to 1-D and assumes all data has the same mean and variance.
+    Uses Jeffreys' prior for variance and std.
+
+    Equivalent to ``tuple((x.mean(), x.interval(alpha)) for x in mvsdist(data))``
+
+    References
+    ----------
+    T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and
+    standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278,
+    2006.
+
+    Examples
+    --------
+    First a basic example to demonstrate the outputs:
+
+    >>> from scipy import stats
+    >>> data = [6, 9, 12, 7, 8, 8, 13]
+    >>> mean, var, std = stats.bayes_mvs(data)
+    >>> mean
+    Mean(statistic=9.0, minmax=(7.103650222612533, 10.896349777387467))
+    >>> var
+    Variance(statistic=10.0, minmax=(3.176724206, 24.45910382))
+    >>> std
+    Std_dev(statistic=2.9724954732045084,
+            minmax=(1.7823367265645143, 4.945614605014631))
+
+    Now we generate some normally distributed random data, and get estimates of
+    mean and standard deviation with 95% confidence intervals for those
+    estimates:
+
+    >>> n_samples = 100000
+    >>> data = stats.norm.rvs(size=n_samples)
+    >>> res_mean, res_var, res_std = stats.bayes_mvs(data, alpha=0.95)
+
+    >>> import matplotlib.pyplot as plt
+    >>> fig = plt.figure()
+    >>> ax = fig.add_subplot(111)
+    >>> ax.hist(data, bins=100, density=True, label='Histogram of data')
+    >>> ax.vlines(res_mean.statistic, 0, 0.5, colors='r', label='Estimated mean')
+    >>> ax.axvspan(res_mean.minmax[0],res_mean.minmax[1], facecolor='r',
+    ...            alpha=0.2, label=r'Estimated mean (95% limits)')
+    >>> ax.vlines(res_std.statistic, 0, 0.5, colors='g', label='Estimated scale')
+    >>> ax.axvspan(res_std.minmax[0],res_std.minmax[1], facecolor='g', alpha=0.2,
+    ...            label=r'Estimated scale (95% limits)')
+
+    >>> ax.legend(fontsize=10)
+    >>> ax.set_xlim([-4, 4])
+    >>> ax.set_ylim([0, 0.5])
+    >>> plt.show()
+
+    """
+    m, v, s = mvsdist(data)
+    if alpha >= 1 or alpha <= 0:
+        raise ValueError(f"0 < alpha < 1 is required, but {alpha=} was given.")
+
+    m_res = Mean(m.mean(), m.interval(alpha))
+    v_res = Variance(v.mean(), v.interval(alpha))
+    s_res = Std_dev(s.mean(), s.interval(alpha))
+
+    return m_res, v_res, s_res
+
+
+@xp_capabilities(np_only=True)
+def mvsdist(data):
+    """
+    'Frozen' distributions for mean, variance, and standard deviation of data.
+
+    Parameters
+    ----------
+    data : array_like
+        Input array. Converted to 1-D using ravel.
+        Requires 2 or more data-points.
+
+    Returns
+    -------
+    mdist : "frozen" distribution object
+        Distribution object representing the mean of the data.
+    vdist : "frozen" distribution object
+        Distribution object representing the variance of the data.
+    sdist : "frozen" distribution object
+        Distribution object representing the standard deviation of the data.
+
+    See Also
+    --------
+    bayes_mvs
+
+    Notes
+    -----
+    The return value from ``bayes_mvs(data)`` is equivalent to
+    ``tuple((x.mean(), x.interval(0.90)) for x in mvsdist(data))``.
+
+    In other words, calling ``<dist>.mean()`` and ``<dist>.interval(0.90)``
+    on the three distribution objects returned from this function will give
+    the same results that are returned from `bayes_mvs`.
+
+    References
+    ----------
+    T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and
+    standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278,
+    2006.
+
+    Examples
+    --------
+    >>> from scipy import stats
+    >>> data = [6, 9, 12, 7, 8, 8, 13]
+    >>> mean, var, std = stats.mvsdist(data)
+
+    We now have frozen distribution objects "mean", "var" and "std" that we can
+    examine:
+
+    >>> mean.mean()
+    9.0
+    >>> mean.interval(0.95)
+    (6.6120585482655692, 11.387941451734431)
+    >>> mean.std()
+    1.1952286093343936
+
+    """
+    x = ravel(data)
+    n = len(x)
+    if n < 2:
+        raise ValueError("Need at least 2 data-points.")
+    xbar = x.mean()
+    C = x.var()
+    if n > 1000:  # gaussian approximations for large n
+        mdist = distributions.norm(loc=xbar, scale=math.sqrt(C / n))
+        sdist = distributions.norm(loc=math.sqrt(C), scale=math.sqrt(C / (2. * n)))
+        vdist = distributions.norm(loc=C, scale=math.sqrt(2.0 / n) * C)
+    else:
+        nm1 = n - 1
+        fac = n * C / 2.
+        val = nm1 / 2.
+        mdist = distributions.t(nm1, loc=xbar, scale=math.sqrt(C / nm1))
+        sdist = distributions.gengamma(val, -2, scale=math.sqrt(fac))
+        vdist = distributions.invgamma(val, scale=fac)
+    return mdist, vdist, sdist
+
+
+@xp_capabilities()
+@_axis_nan_policy_factory(
+    lambda x: x, result_to_tuple=lambda x, _: (x,), n_outputs=1, default_axis=None
+)
+def kstat(data, n=2, *, axis=None):
+    r"""
+    Return the `n` th k-statistic ( ``1<=n<=4`` so far).
+
+    The `n` th k-statistic ``k_n`` is the unique symmetric unbiased estimator of the
+    `n` th cumulant :math:`\kappa_n` [1]_ [2]_.
+
+    Parameters
+    ----------
+    data : array_like
+        Input array.
+    n : int, {1, 2, 3, 4}, optional
+        Default is equal to 2.
+    axis : int or None, default: None
+        If an int, the axis of the input along which to compute the statistic.
+        The statistic of each axis-slice (e.g. row) of the input will appear
+        in a corresponding element of the output. If ``None``, the input will
+        be raveled before computing the statistic.
+
+    Returns
+    -------
+    kstat : float
+        The `n` th k-statistic.
+
+    See Also
+    --------
+    kstatvar : Returns an unbiased estimator of the variance of the k-statistic
+    moment : Returns the n-th central moment about the mean for a sample.
+
+    Notes
+    -----
+    For a sample size :math:`n`, the first few k-statistics are given by
+
+    .. math::
+
+        k_1 &= \frac{S_1}{n}, \\
+        k_2 &= \frac{nS_2 - S_1^2}{n(n-1)}, \\
+        k_3 &= \frac{2S_1^3 - 3nS_1S_2 + n^2S_3}{n(n-1)(n-2)}, \\
+        k_4 &= \frac{-6S_1^4 + 12nS_1^2S_2 - 3n(n-1)S_2^2 - 4n(n+1)S_1S_3
+        + n^2(n+1)S_4}{n (n-1)(n-2)(n-3)},
+
+    where
+
+    .. math::
+
+        S_r \equiv \sum_{i=1}^n X_i^r,
+
+    and :math:`X_i` is the :math:`i` th data point.
+
+    References
+    ----------
+    .. [1] http://mathworld.wolfram.com/k-Statistic.html
+
+    .. [2] http://mathworld.wolfram.com/Cumulant.html
+
+    Examples
+    --------
+    >>> from scipy import stats
+    >>> from numpy.random import default_rng
+    >>> rng = default_rng()
+
+    As sample size increases, `n`-th moment and `n`-th k-statistic converge to the
+    same number (although they aren't identical). In the case of the normal
+    distribution, they converge to zero.
+
+    >>> for i in range(2,8):
+    ...     x = rng.normal(size=10**i)
+    ...     m, k = stats.moment(x, 3), stats.kstat(x, 3)
+    ...     print(f"{i=}: {m=:.3g}, {k=:.3g}, {(m-k)=:.3g}")
+    i=2: m=-0.631, k=-0.651, (m-k)=0.0194  # random
+    i=3: m=0.0282, k=0.0283, (m-k)=-8.49e-05
+    i=4: m=-0.0454, k=-0.0454, (m-k)=1.36e-05
+    i=5: m=7.53e-05, k=7.53e-05, (m-k)=-2.26e-09
+    i=6: m=0.00166, k=0.00166, (m-k)=-4.99e-09
+    i=7: m=-2.88e-06, k=-2.88e-06, (m-k)=8.63e-13
+    """
+    xp = array_namespace(data)
+    data = xp.asarray(data)
+    if n > 4 or n < 1:
+        raise ValueError("k-statistics only supported for 1<=n<=4")
+    n = int(n)
+    if axis is None:
+        data = xp.reshape(data, (-1,))
+        axis = 0
+
+    N = _length_nonmasked(data, axis, xp=xp)
+
+    S = [None] + [xp.sum(data**k, axis=axis) for k in range(1, n + 1)]
+    if n == 1:
+        return S[1] * 1.0/N
+    elif n == 2:
+        return (N*S[2] - S[1]**2.0) / (N*(N - 1.0))
+    elif n == 3:
+        return (2*S[1]**3 - 3*N*S[1]*S[2] + N*N*S[3]) / (N*(N - 1.0)*(N - 2.0))
+    elif n == 4:
+        return ((-6*S[1]**4 + 12*N*S[1]**2 * S[2] - 3*N*(N-1.0)*S[2]**2 -
+                 4*N*(N+1)*S[1]*S[3] + N*N*(N+1)*S[4]) /
+                (N*(N-1.0)*(N-2.0)*(N-3.0)))
+    else:
+        raise ValueError("Should not be here.")
+
+
+@xp_capabilities()
+@_axis_nan_policy_factory(
+    lambda x: x, result_to_tuple=lambda x, _: (x,), n_outputs=1, default_axis=None
+)
+def kstatvar(data, n=2, *, axis=None):
+    r"""Return an unbiased estimator of the variance of the k-statistic.
+
+    See `kstat` and [1]_ for more details about the k-statistic.
+
+    Parameters
+    ----------
+    data : array_like
+        Input array.
+    n : int, {1, 2}, optional
+        Default is equal to 2.
+    axis : int or None, default: None
+        If an int, the axis of the input along which to compute the statistic.
+        The statistic of each axis-slice (e.g. row) of the input will appear
+        in a corresponding element of the output. If ``None``, the input will
+        be raveled before computing the statistic.
+
+    Returns
+    -------
+    kstatvar : float
+        The `n` th k-statistic variance.
+
+    See Also
+    --------
+    kstat : Returns the n-th k-statistic.
+    moment : Returns the n-th central moment about the mean for a sample.
+
+    Notes
+    -----
+    Unbiased estimators of the variances of the first two k-statistics are given by
+
+    .. math::
+
+        \mathrm{var}(k_1) &= \frac{k_2}{n}, \\
+        \mathrm{var}(k_2) &= \frac{2k_2^2n + (n-1)k_4}{n(n + 1)}.
+
+    References
+    ----------
+    .. [1] http://mathworld.wolfram.com/k-Statistic.html
+
+    """  # noqa: E501
+    xp = array_namespace(data)
+    data = xp.asarray(data)
+    if axis is None:
+        data = xp.reshape(data, (-1,))
+        axis = 0
+    N = _length_nonmasked(data, axis, xp=xp)
+
+    if n == 1:
+        return kstat(data, n=2, axis=axis, _no_deco=True) * 1.0/N
+    elif n == 2:
+        k2 = kstat(data, n=2, axis=axis, _no_deco=True)
+        k4 = kstat(data, n=4, axis=axis, _no_deco=True)
+        return (2*N*k2**2 + (N-1)*k4) / (N*(N+1))
+    else:
+        raise ValueError("Only n=1 or n=2 supported.")
+
+
+def _calc_uniform_order_statistic_medians(n):
+    """Approximations of uniform order statistic medians.
+
+    Parameters
+    ----------
+    n : int
+        Sample size.
+
+    Returns
+    -------
+    v : 1d float array
+        Approximations of the order statistic medians.
+
+    References
+    ----------
+    .. [1] James J. Filliben, "The Probability Plot Correlation Coefficient
+           Test for Normality", Technometrics, Vol. 17, pp. 111-117, 1975.
+
+    Examples
+    --------
+    Order statistics of the uniform distribution on the unit interval
+    are marginally distributed according to beta distributions.
+    The expectations of these order statistic are evenly spaced across
+    the interval, but the distributions are skewed in a way that
+    pushes the medians slightly towards the endpoints of the unit interval:
+
+    >>> import numpy as np
+    >>> n = 4
+    >>> k = np.arange(1, n+1)
+    >>> from scipy.stats import beta
+    >>> a = k
+    >>> b = n-k+1
+    >>> beta.mean(a, b)
+    array([0.2, 0.4, 0.6, 0.8])
+    >>> beta.median(a, b)
+    array([0.15910358, 0.38572757, 0.61427243, 0.84089642])
+
+    The Filliben approximation uses the exact medians of the smallest
+    and greatest order statistics, and the remaining medians are approximated
+    by points spread evenly across a sub-interval of the unit interval:
+
+    >>> from scipy.stats._morestats import _calc_uniform_order_statistic_medians
+    >>> _calc_uniform_order_statistic_medians(n)
+    array([0.15910358, 0.38545246, 0.61454754, 0.84089642])
+
+    This plot shows the skewed distributions of the order statistics
+    of a sample of size four from a uniform distribution on the unit interval:
+
+    >>> import matplotlib.pyplot as plt
+    >>> x = np.linspace(0.0, 1.0, num=50, endpoint=True)
+    >>> pdfs = [beta.pdf(x, a[i], b[i]) for i in range(n)]
+    >>> plt.figure()
+    >>> plt.plot(x, pdfs[0], x, pdfs[1], x, pdfs[2], x, pdfs[3])
+
+    """
+    v = np.empty(n, dtype=np.float64)
+    v[-1] = 0.5**(1.0 / n)
+    v[0] = 1 - v[-1]
+    i = np.arange(2, n)
+    v[1:-1] = (i - 0.3175) / (n + 0.365)
+    return v
+
+
+def _parse_dist_kw(dist, enforce_subclass=True):
+    """Parse `dist` keyword.
+
+    Parameters
+    ----------
+    dist : str or stats.distributions instance.
+        Several functions take `dist` as a keyword, hence this utility
+        function.
+    enforce_subclass : bool, optional
+        If True (default), `dist` needs to be a
+        `_distn_infrastructure.rv_generic` instance.
+        It can sometimes be useful to set this keyword to False, if a function
+        wants to accept objects that just look somewhat like such an instance
+        (for example, they have a ``ppf`` method).
+
+    """
+    if isinstance(dist, rv_generic):
+        pass
+    elif isinstance(dist, str):
+        try:
+            dist = getattr(distributions, dist)
+        except AttributeError as e:
+            raise ValueError(f"{dist} is not a valid distribution name") from e
+    elif enforce_subclass:
+        msg = ("`dist` should be a stats.distributions instance or a string "
+               "with the name of such a distribution.")
+        raise ValueError(msg)
+
+    return dist
+
+
+def _add_axis_labels_title(plot, xlabel, ylabel, title):
+    """Helper function to add axes labels and a title to stats plots."""
+    try:
+        if hasattr(plot, 'set_title'):
+            # Matplotlib Axes instance or something that looks like it
+            plot.set_title(title)
+            plot.set_xlabel(xlabel)
+            plot.set_ylabel(ylabel)
+        else:
+            # matplotlib.pyplot module
+            plot.title(title)
+            plot.xlabel(xlabel)
+            plot.ylabel(ylabel)
+    except Exception:
+        # Not an MPL object or something that looks (enough) like it.
+        # Don't crash on adding labels or title
+        pass
+
+
+@xp_capabilities(np_only=True)
+def probplot(x, sparams=(), dist='norm', fit=True, plot=None, rvalue=False):
+    """
+    Calculate quantiles for a probability plot, and optionally show the plot.
+
+    Generates a probability plot of sample data against the quantiles of a
+    specified theoretical distribution (the normal distribution by default).
+    `probplot` optionally calculates a best-fit line for the data and plots the
+    results using Matplotlib or a given plot function.
+
+    Parameters
+    ----------
+    x : array_like
+        Sample/response data from which `probplot` creates the plot.
+    sparams : tuple, optional
+        Distribution-specific shape parameters (shape parameters plus location
+        and scale).
+    dist : str or stats.distributions instance, optional
+        Distribution or distribution function name. The default is 'norm' for a
+        normal probability plot.  Objects that look enough like a
+        stats.distributions instance (i.e. they have a ``ppf`` method) are also
+        accepted.
+    fit : bool, optional
+        Fit a least-squares regression (best-fit) line to the sample data if
+        True (default).
+    plot : object, optional
+        If given, plots the quantiles.
+        If given and `fit` is True, also plots the least squares fit.
+        `plot` is an object that has to have methods "plot" and "text".
+        The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
+        or a custom object with the same methods.
+        Default is None, which means that no plot is created.
+    rvalue : bool, optional
+        If `plot` is provided and `fit` is True, setting `rvalue` to True
+        includes the coefficient of determination on the plot.
+        Default is False.
+
+    Returns
+    -------
+    (osm, osr) : tuple of ndarrays
+        Tuple of theoretical quantiles (osm, or order statistic medians) and
+        ordered responses (osr).  `osr` is simply sorted input `x`.
+        For details on how `osm` is calculated see the Notes section.
+    (slope, intercept, r) : tuple of floats, optional
+        Tuple  containing the result of the least-squares fit, if that is
+        performed by `probplot`. `r` is the square root of the coefficient of
+        determination.  If ``fit=False``, this tuple is not returned
+        (regardless of `plot`).
+
+    Notes
+    -----
+    Even if `plot` is given, the figure is not shown or saved by `probplot`;
+    ``plt.show()`` or ``plt.savefig('figname.png')`` should be used after
+    calling `probplot`.
+
+    `probplot` generates a probability plot, which should not be confused with
+    a Q-Q or a P-P plot.  Statsmodels has more extensive functionality of this
+    type, see ``statsmodels.api.ProbPlot``.
+
+    The formula used for the theoretical quantiles (horizontal axis of the
+    probability plot) is Filliben's estimate::
+
+        quantiles = dist.ppf(val), for
+
+                0.5**(1/n),                  for i = n
+          val = (i - 0.3175) / (n + 0.365),  for i = 2, ..., n-1
+                1 - 0.5**(1/n),              for i = 1
+
+    where ``i`` indicates the i-th ordered value and ``n`` is the total number
+    of values.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+    >>> nsample = 100
+    >>> rng = np.random.default_rng()
+
+    A t distribution with small degrees of freedom:
+
+    >>> ax1 = plt.subplot(221)
+    >>> x = stats.t.rvs(3, size=nsample, random_state=rng)
+    >>> res = stats.probplot(x, plot=plt)
+
+    A t distribution with larger degrees of freedom:
+
+    >>> ax2 = plt.subplot(222)
+    >>> x = stats.t.rvs(25, size=nsample, random_state=rng)
+    >>> res = stats.probplot(x, plot=plt)
+
+    A mixture of two normal distributions with broadcasting:
+
+    >>> ax3 = plt.subplot(223)
+    >>> x = stats.norm.rvs(loc=[0,5], scale=[1,1.5],
+    ...                    size=(nsample//2,2), random_state=rng).ravel()
+    >>> res = stats.probplot(x, plot=plt)
+
+    A standard normal distribution:
+
+    >>> ax4 = plt.subplot(224)
+    >>> x = stats.norm.rvs(loc=0, scale=1, size=nsample, random_state=rng)
+    >>> res = stats.probplot(x, plot=plt)
+
+    Produce a new figure with a loggamma distribution, using the ``dist`` and
+    ``sparams`` keywords:
+
+    >>> fig = plt.figure()
+    >>> ax = fig.add_subplot(111)
+    >>> x = stats.loggamma.rvs(c=2.5, size=500, random_state=rng)
+    >>> res = stats.probplot(x, dist=stats.loggamma, sparams=(2.5,), plot=ax)
+    >>> ax.set_title("Probplot for loggamma dist with shape parameter 2.5")
+
+    Show the results with Matplotlib:
+
+    >>> plt.show()
+
+    """
+    x = np.asarray(x)
+    if x.size == 0:
+        if fit:
+            return (x, x), (np.nan, np.nan, 0.0)
+        else:
+            return x, x
+
+    osm_uniform = _calc_uniform_order_statistic_medians(len(x))
+    dist = _parse_dist_kw(dist, enforce_subclass=False)
+    if sparams is None:
+        sparams = ()
+    if isscalar(sparams):
+        sparams = (sparams,)
+    if not isinstance(sparams, tuple):
+        sparams = tuple(sparams)
+
+    osm = dist.ppf(osm_uniform, *sparams)
+    osr = sort(x)
+    if fit:
+        # perform a linear least squares fit.
+        slope, intercept, r, prob, _ = _stats_py.linregress(osm, osr)
+
+    if plot is not None:
+        plot.plot(osm, osr, 'bo')
+        if fit:
+            plot.plot(osm, slope*osm + intercept, 'r-')
+        _add_axis_labels_title(plot, xlabel='Theoretical quantiles',
+                               ylabel='Ordered Values',
+                               title='Probability Plot')
+
+        # Add R^2 value to the plot as text
+        if fit and rvalue:
+            xmin = amin(osm)
+            xmax = amax(osm)
+            ymin = amin(x)
+            ymax = amax(x)
+            posx = xmin + 0.70 * (xmax - xmin)
+            posy = ymin + 0.01 * (ymax - ymin)
+            plot.text(posx, posy, f"$R^2={r ** 2:1.4f}$")
+
+    if fit:
+        return (osm, osr), (slope, intercept, r)
+    else:
+        return osm, osr
+
+
+@xp_capabilities(np_only=True)
+def ppcc_max(x, brack=(0.0, 1.0), dist='tukeylambda'):
+    """Calculate the shape parameter that maximizes the PPCC.
+
+    The probability plot correlation coefficient (PPCC) plot can be used
+    to determine the optimal shape parameter for a one-parameter family
+    of distributions. ``ppcc_max`` returns the shape parameter that would
+    maximize the probability plot correlation coefficient for the given
+    data to a one-parameter family of distributions.
+
+    Parameters
+    ----------
+    x : array_like
+        Input array.
+    brack : tuple, optional
+        Triple (a,b,c) where (a<b<c). If bracket consists of two numbers (a, c)
+        then they are assumed to be a starting interval for a downhill bracket
+        search (see `scipy.optimize.brent`).
+    dist : str or stats.distributions instance, optional
+        Distribution or distribution function name.  Objects that look enough
+        like a stats.distributions instance (i.e. they have a ``ppf`` method)
+        are also accepted.  The default is ``'tukeylambda'``.
+
+    Returns
+    -------
+    shape_value : float
+        The shape parameter at which the probability plot correlation
+        coefficient reaches its max value.
+
+    See Also
+    --------
+    ppcc_plot, probplot, boxcox
+
+    Notes
+    -----
+    The brack keyword serves as a starting point which is useful in corner
+    cases. One can use a plot to obtain a rough visual estimate of the location
+    for the maximum to start the search near it.
+
+    References
+    ----------
+    .. [1] J.J. Filliben, "The Probability Plot Correlation Coefficient Test
+           for Normality", Technometrics, Vol. 17, pp. 111-117, 1975.
+    .. [2] Engineering Statistics Handbook, NIST/SEMATECH,
+           https://www.itl.nist.gov/div898/handbook/eda/section3/ppccplot.htm
+
+    Examples
+    --------
+    First we generate some random data from a Weibull distribution
+    with shape parameter 2.5:
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+    >>> rng = np.random.default_rng()
+    >>> c = 2.5
+    >>> x = stats.weibull_min.rvs(c, scale=4, size=2000, random_state=rng)
+
+    Generate the PPCC plot for this data with the Weibull distribution.
+
+    >>> fig, ax = plt.subplots(figsize=(8, 6))
+    >>> res = stats.ppcc_plot(x, c/2, 2*c, dist='weibull_min', plot=ax)
+
+    We calculate the value where the shape should reach its maximum and a
+    red line is drawn there. The line should coincide with the highest
+    point in the PPCC graph.
+
+    >>> cmax = stats.ppcc_max(x, brack=(c/2, 2*c), dist='weibull_min')
+    >>> ax.axvline(cmax, color='r')
+    >>> plt.show()
+
+    """
+    dist = _parse_dist_kw(dist)
+    osm_uniform = _calc_uniform_order_statistic_medians(len(x))
+    osr = sort(x)
+
+    # this function computes the x-axis values of the probability plot
+    #  and computes a linear regression (including the correlation)
+    #  and returns 1-r so that a minimization function maximizes the
+    #  correlation
+    def tempfunc(shape, mi, yvals, func):
+        xvals = func(mi, shape)
+        r, prob = _stats_py.pearsonr(xvals, yvals)
+        return 1 - r
+
+    return optimize.brent(tempfunc, brack=brack,
+                          args=(osm_uniform, osr, dist.ppf))
+
+
+@xp_capabilities(np_only=True)
+def ppcc_plot(x, a, b, dist='tukeylambda', plot=None, N=80):
+    """Calculate and optionally plot probability plot correlation coefficient.
+
+    The probability plot correlation coefficient (PPCC) plot can be used to
+    determine the optimal shape parameter for a one-parameter family of
+    distributions.  It cannot be used for distributions without shape
+    parameters
+    (like the normal distribution) or with multiple shape parameters.
+
+    By default a Tukey-Lambda distribution (`stats.tukeylambda`) is used. A
+    Tukey-Lambda PPCC plot interpolates from long-tailed to short-tailed
+    distributions via an approximately normal one, and is therefore
+    particularly useful in practice.
+
+    Parameters
+    ----------
+    x : array_like
+        Input array.
+    a, b : scalar
+        Lower and upper bounds of the shape parameter to use.
+    dist : str or stats.distributions instance, optional
+        Distribution or distribution function name.  Objects that look enough
+        like a stats.distributions instance (i.e. they have a ``ppf`` method)
+        are also accepted.  The default is ``'tukeylambda'``.
+    plot : object, optional
+        If given, plots PPCC against the shape parameter.
+        `plot` is an object that has to have methods "plot" and "text".
+        The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
+        or a custom object with the same methods.
+        Default is None, which means that no plot is created.
+    N : int, optional
+        Number of points on the horizontal axis (equally distributed from
+        `a` to `b`).
+
+    Returns
+    -------
+    svals : ndarray
+        The shape values for which `ppcc` was calculated.
+    ppcc : ndarray
+        The calculated probability plot correlation coefficient values.
+
+    See Also
+    --------
+    ppcc_max, probplot, boxcox_normplot, tukeylambda
+
+    References
+    ----------
+    J.J. Filliben, "The Probability Plot Correlation Coefficient Test for
+    Normality", Technometrics, Vol. 17, pp. 111-117, 1975.
+
+    Examples
+    --------
+    First we generate some random data from a Weibull distribution
+    with shape parameter 2.5, and plot the histogram of the data:
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+    >>> rng = np.random.default_rng()
+    >>> c = 2.5
+    >>> x = stats.weibull_min.rvs(c, scale=4, size=2000, random_state=rng)
+
+    Take a look at the histogram of the data.
+
+    >>> fig1, ax = plt.subplots(figsize=(9, 4))
+    >>> ax.hist(x, bins=50)
+    >>> ax.set_title('Histogram of x')
+    >>> plt.show()
+
+    Now we explore this data with a PPCC plot as well as the related
+    probability plot and Box-Cox normplot.  A red line is drawn where we
+    expect the PPCC value to be maximal (at the shape parameter ``c``
+    used above):
+
+    >>> fig2 = plt.figure(figsize=(12, 4))
+    >>> ax1 = fig2.add_subplot(1, 3, 1)
+    >>> ax2 = fig2.add_subplot(1, 3, 2)
+    >>> ax3 = fig2.add_subplot(1, 3, 3)
+    >>> res = stats.probplot(x, plot=ax1)
+    >>> res = stats.boxcox_normplot(x, -4, 4, plot=ax2)
+    >>> res = stats.ppcc_plot(x, c/2, 2*c, dist='weibull_min', plot=ax3)
+    >>> ax3.axvline(c, color='r')
+    >>> plt.show()
+
+    """
+    if b <= a:
+        raise ValueError("`b` has to be larger than `a`.")
+
+    svals = np.linspace(a, b, num=N)
+    ppcc = np.empty_like(svals)
+    for k, sval in enumerate(svals):
+        _, r2 = probplot(x, sval, dist=dist, fit=True)
+        ppcc[k] = r2[-1]
+
+    if plot is not None:
+        plot.plot(svals, ppcc, 'x')
+        _add_axis_labels_title(plot, xlabel='Shape Values',
+                               ylabel='Prob Plot Corr. Coef.',
+                               title=f'({dist}) PPCC Plot')
+
+    return svals, ppcc
+
+
+def _log_mean(logx, axis):
+    # compute log of mean of x from log(x)
+    return (
+        special.logsumexp(logx, axis=axis, keepdims=True)
+        - math.log(logx.shape[axis])
+    )
+
+
+def _log_var(logx, xp, axis):
+    # compute log of variance of x from log(x)
+    logmean = xp.broadcast_to(_log_mean(logx, axis=axis), logx.shape)
+    ones = xp.ones_like(logx)
+    logxmu, _ = special.logsumexp(xp.stack((logx, logmean), axis=0), axis=0,
+                                  b=xp.stack((ones, -ones), axis=0), return_sign=True)
+    return special.logsumexp(2 * logxmu, axis=axis) - math.log(logx.shape[axis])
+
+
+@xp_capabilities()
+def boxcox_llf(lmb, data, *, axis=0, keepdims=False, nan_policy='propagate'):
+    r"""The boxcox log-likelihood function.
+
+    Parameters
+    ----------
+    lmb : scalar
+        Parameter for Box-Cox transformation.  See `boxcox` for details.
+    data : array_like
+        Data to calculate Box-Cox log-likelihood for.  If `data` is
+        multi-dimensional, the log-likelihood is calculated along `axis`
+        (the first axis by default).
+    axis : int, default: 0
+        If an int, the axis of the input along which to compute the statistic.
+        The statistic of each axis-slice (e.g. row) of the input will appear in a
+        corresponding element of the output.
+        If ``None``, the input will be raveled before computing the statistic.
+    nan_policy : {'propagate', 'omit', 'raise'}
+        Defines how to handle input NaNs.
+
+        - ``propagate``: if a NaN is present in the axis slice (e.g. row) along
+          which the  statistic is computed, the corresponding entry of the output
+          will be NaN.
+        - ``omit``: NaNs will be omitted when performing the calculation.
+          If insufficient data remains in the axis slice along which the
+          statistic is computed, the corresponding entry of the output will be
+          NaN.
+        - ``raise``: if a NaN is present, a ``ValueError`` will be raised.
+    keepdims : bool, default: False
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the input array.
+
+    Returns
+    -------
+    llf : float or ndarray
+        Box-Cox log-likelihood of `data` given `lmb`.  A float for 1-D `data`,
+        an array otherwise.
+
+    See Also
+    --------
+    boxcox, probplot, boxcox_normplot, boxcox_normmax
+
+    Notes
+    -----
+    The Box-Cox log-likelihood function :math:`l` is defined here as
+
+    .. math::
+
+        l = (\lambda - 1) \sum_i^N \log(x_i) -
+              \frac{N}{2} \log\left(\sum_i^N (y_i - \bar{y})^2 / N\right),
+
+    where :math:`N` is the number of data points ``data`` and :math:`y` is the Box-Cox
+    transformed input data.
+    This corresponds to the *profile log-likelihood* of the original data :math:`x`
+    with some constant terms dropped.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+    >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes
+
+    Generate some random variates and calculate Box-Cox log-likelihood values
+    for them for a range of ``lmbda`` values:
+
+    >>> rng = np.random.default_rng()
+    >>> x = stats.loggamma.rvs(5, loc=10, size=1000, random_state=rng)
+    >>> lmbdas = np.linspace(-2, 10)
+    >>> llf = np.zeros(lmbdas.shape, dtype=float)
+    >>> for ii, lmbda in enumerate(lmbdas):
+    ...     llf[ii] = stats.boxcox_llf(lmbda, x)
+
+    Also find the optimal lmbda value with `boxcox`:
+
+    >>> x_most_normal, lmbda_optimal = stats.boxcox(x)
+
+    Plot the log-likelihood as function of lmbda.  Add the optimal lmbda as a
+    horizontal line to check that that's really the optimum:
+
+    >>> fig = plt.figure()
+    >>> ax = fig.add_subplot(111)
+    >>> ax.plot(lmbdas, llf, 'b.-')
+    >>> ax.axhline(stats.boxcox_llf(lmbda_optimal, x), color='r')
+    >>> ax.set_xlabel('lmbda parameter')
+    >>> ax.set_ylabel('Box-Cox log-likelihood')
+
+    Now add some probability plots to show that where the log-likelihood is
+    maximized the data transformed with `boxcox` looks closest to normal:
+
+    >>> locs = [3, 10, 4]  # 'lower left', 'center', 'lower right'
+    >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs):
+    ...     xt = stats.boxcox(x, lmbda=lmbda)
+    ...     (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt)
+    ...     ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc)
+    ...     ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-')
+    ...     ax_inset.set_xticklabels([])
+    ...     ax_inset.set_yticklabels([])
+    ...     ax_inset.set_title(r'$\lambda=%1.2f$' % lmbda)
+
+    >>> plt.show()
+
+    """
+    # _axis_nan_policy decorator does not currently support these for lazy arrays.
+    # We want to run tests with lazy backends, so don't pass the arguments explicitly
+    # unless necessary.
+    kwargs = {}
+    if keepdims is not False:
+        kwargs['keepdims'] = keepdims
+    if nan_policy != 'propagate':
+        kwargs['nan_policy'] = nan_policy
+    # The actual computation lives in the decorated private implementation.
+    return _boxcox_llf(data, lmb=lmb, axis=axis, **kwargs)
+
+
+@_axis_nan_policy_factory(lambda x: x, n_outputs=1, default_axis=0,
+                          result_to_tuple=lambda x, _: (x,))
+def _boxcox_llf(data, axis=0, *, lmb):
+    """Compute the Box-Cox log-likelihood of `data` for parameter `lmb`.
+
+    Implementation behind `boxcox_llf`. Returns NaN (via `_get_nan`) when
+    `data` has no elements along `axis`.
+    """
+    xp = array_namespace(data)
+    dtype = xp_result_type(lmb, data, force_floating=True, xp=xp)
+    data = xp.asarray(data, dtype=dtype)
+    N = data.shape[axis]
+    if N == 0:
+        return _get_nan(data, xp=xp)
+
+    logdata = xp.log(data)
+
+    # Compute the variance of the transformed data.
+    if lmb == 0:
+        # For lmb == 0 the variance is taken of log(data) directly.
+        logvar = xp.log(xp.var(logdata, axis=axis))
+    else:
+        # Transform without the constant offset 1/lmb.  The offset does
+        # not affect the variance, and the subtraction of the offset can
+        # lead to loss of precision.
+        # Division by lmb can be factored out to enhance numerical stability.
+        logx = lmb * logdata
+        # log(var(x**lmb / lmb)) = log(var(x**lmb)) - 2*log(|lmb|)
+        logvar = _log_var(logx, xp, axis) - 2 * math.log(abs(lmb))
+
+    res = (lmb - 1) * xp.sum(logdata, axis=axis) - N/2 * logvar
+    res = xp.astype(res, data.dtype, copy=False)  # compensate for NumPy <2.0
+    # Index a 0-d result with an empty tuple; higher-dimensional results are
+    # returned as they are.
+    res = res[()] if res.ndim == 0 else res
+    return res
+
+
+def _boxcox_conf_interval(x, lmax, alpha):
+    """Return the ``(lower, upper)`` confidence limits for the Box-Cox lambda.
+
+    The limits are the two roots of ``boxcox_llf(lmbda, x) - target`` on
+    either side of `lmax`, where ``target`` is the log-likelihood at `lmax`
+    minus half the chi-squared quantile ``chi2.ppf(1 - alpha, 1)``.
+
+    Raises
+    ------
+    RuntimeError
+        If the step counter reaches 500 while bracketing a root on either
+        side of `lmax`.
+    """
+    # Need to find the lambda for which
+    #  f(x,lmbda) >= f(x,lmax) - 0.5*chi^2_alpha;1
+    fac = 0.5 * distributions.chi2.ppf(1 - alpha, 1)
+    target = boxcox_llf(lmax, x) - fac
+
+    def rootfunc(lmbda, data, target):
+        return boxcox_llf(lmbda, data) - target
+
+    # Find positive endpoint of interval in which answer is to be found
+    # (start 0.5 above lmax and walk outwards in steps of 0.1 until the
+    # function is no longer positive).
+    newlm = lmax + 0.5
+    N = 0
+    while (rootfunc(newlm, x, target) > 0.0) and (N < 500):
+        newlm += 0.1
+        N += 1
+
+    # NOTE(review): N == 500 is treated as failure even if the loop stopped
+    # because the function value became non-positive on that very step.
+    if N == 500:
+        raise RuntimeError("Could not find endpoint.")
+
+    lmplus = optimize.brentq(rootfunc, lmax, newlm, args=(x, target))
+
+    # Now find negative interval in the same way
+    newlm = lmax - 0.5
+    N = 0
+    while (rootfunc(newlm, x, target) > 0.0) and (N < 500):
+        newlm -= 0.1
+        N += 1
+
+    if N == 500:
+        raise RuntimeError("Could not find endpoint.")
+
+    lmminus = optimize.brentq(rootfunc, newlm, lmax, args=(x, target))
+    return lmminus, lmplus
+
+
+@xp_capabilities(np_only=True)
+def boxcox(x, lmbda=None, alpha=None, optimizer=None):
+    r"""Return a dataset transformed by a Box-Cox power transformation.
+
+    Parameters
+    ----------
+    x : ndarray
+        Input array to be transformed.
+
+        If `lmbda` is not None, this is an alias of
+        `scipy.special.boxcox`.
+        Returns nan if ``x < 0``; returns -inf if ``x == 0 and lmbda < 0``.
+
+        If `lmbda` is None, array must be positive, 1-dimensional, and
+        non-constant.
+
+    lmbda : scalar, optional
+        If `lmbda` is None (default), find the value of `lmbda` that maximizes
+        the log-likelihood function and return it as the second output
+        argument.
+
+        If `lmbda` is not None, do the transformation for that value.
+
+    alpha : float, optional
+        If `lmbda` is None and `alpha` is not None, return the
+        ``100 * (1-alpha)%`` confidence  interval for `lmbda` as the third
+        output argument. Must be between 0.0 and 1.0.
+        The default is None, in which case no confidence interval is
+        computed.
+
+        If `lmbda` is not None, `alpha` is ignored.
+    optimizer : callable, optional
+        If `lmbda` is None, `optimizer` is the scalar optimizer used to find
+        the value of `lmbda` that minimizes the negative log-likelihood
+        function. `optimizer` is a callable that accepts one argument:
+
+        fun : callable
+            The objective function, which evaluates the negative
+            log-likelihood function at a provided value of `lmbda`
+
+        and returns an object, such as an instance of
+        `scipy.optimize.OptimizeResult`, which holds the optimal value of
+        `lmbda` in an attribute `x`.
+
+        See the example in `boxcox_normmax` or the documentation of
+        `scipy.optimize.minimize_scalar` for more information.
+
+        If `lmbda` is not None, `optimizer` is ignored.
+
+    Returns
+    -------
+    boxcox : ndarray
+        Box-Cox power transformed array.
+        If `lmbda` is None and `x` is empty, the empty input array is the
+        only value returned.
+    maxlog : float, optional
+        If the `lmbda` parameter is None, the second returned argument is
+        the `lmbda` that maximizes the log-likelihood function.
+    (min_ci, max_ci) : tuple of float, optional
+        If `lmbda` parameter is None and `alpha` is not None, this returned
+        tuple of floats represents the minimum and maximum confidence limits
+        given `alpha`.
+
+    Raises
+    ------
+    ValueError
+        If `lmbda` is None and `x` is not 1-dimensional, is constant, or
+        contains values that are not strictly positive.
+
+    See Also
+    --------
+    probplot, boxcox_normplot, boxcox_normmax, boxcox_llf
+
+    Notes
+    -----
+    The Box-Cox transform is given by:
+
+    .. math::
+
+        y =
+        \begin{cases}
+          \frac{x^\lambda - 1}{\lambda}, &\text{for } \lambda \neq 0 \\
+          \log(x),                       &\text{for } \lambda = 0
+        \end{cases}
+
+    `boxcox` requires the input data to be positive.  Sometimes a Box-Cox
+    transformation provides a shift parameter to achieve this; `boxcox` does
+    not.  Such a shift parameter is equivalent to adding a positive constant to
+    `x` before calling `boxcox`.
+
+    The confidence limits returned when `alpha` is provided give the interval
+    where:
+
+    .. math::
+
+        l(\hat{\lambda}) - l(\lambda) < \frac{1}{2}\chi^2(1 - \alpha, 1),
+
+    with :math:`l` the log-likelihood function and :math:`\chi^2` the chi-squared
+    function.
+
+    References
+    ----------
+    G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the
+    Royal Statistical Society B, 26, 211-252 (1964).
+
+    Examples
+    --------
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+
+    We generate some random variates from a non-normal distribution and make a
+    probability plot for it, to show it is non-normal in the tails:
+
+    >>> fig = plt.figure()
+    >>> ax1 = fig.add_subplot(211)
+    >>> x = stats.loggamma.rvs(5, size=500) + 5
+    >>> prob = stats.probplot(x, dist=stats.norm, plot=ax1)
+    >>> ax1.set_xlabel('')
+    >>> ax1.set_title('Probplot against normal distribution')
+
+    We now use `boxcox` to transform the data so it's closest to normal:
+
+    >>> ax2 = fig.add_subplot(212)
+    >>> xt, _ = stats.boxcox(x)
+    >>> prob = stats.probplot(xt, dist=stats.norm, plot=ax2)
+    >>> ax2.set_title('Probplot after Box-Cox transformation')
+
+    >>> plt.show()
+
+    """
+    x = np.asarray(x)
+
+    if lmbda is not None:  # single transformation
+        return special.boxcox(x, lmbda)
+
+    if x.ndim != 1:
+        raise ValueError("Data must be 1-dimensional.")
+
+    # Empty input: there is nothing to optimize, so only the array is returned
+    # (no `lmbda`, no confidence interval).
+    if x.size == 0:
+        return x
+
+    if np.all(x == x[0]):
+        raise ValueError("Data must not be constant.")
+
+    if np.any(x <= 0):
+        raise ValueError("Data must be positive.")
+
+    # If lmbda=None, find the lmbda that maximizes the log-likelihood function.
+    lmax = boxcox_normmax(x, method='mle', optimizer=optimizer)
+    # Re-enter with the fitted lmbda to take the single-transformation path.
+    y = boxcox(x, lmax)
+
+    if alpha is None:
+        return y, lmax
+    else:
+        # Find confidence interval
+        interval = _boxcox_conf_interval(x, lmax, alpha)
+        return y, lmax, interval
+
+
+def _boxcox_inv_lmbda(x, y):
+    """Return the `lmbda` for which the Box-Cox transform of `x` equals `y`.
+
+    Evaluated in closed form with the ``k=-1`` branch of the Lambert W
+    function; only the real part of the result is returned.
+    """
+    # compute lmbda given x and y for Box-Cox transformation
+    num = special.lambertw(-(x ** (-1 / y)) * np.log(x) / y, k=-1)
+    return np.real(-num / np.log(x) - 1 / y)
+
+
+class _BigFloat:
+    """Sentinel type for the default value of `ymax` in `boxcox_normmax`.
+
+    Its repr is ``BIG_FLOAT``, which is what appears wherever the default
+    value is printed.
+    """
+
+    def __repr__(self):
+        return "BIG_FLOAT"
+
+
+# Single shared instance; `boxcox_normmax` detects the default with an ``is``
+# comparison against this object.
+_BigFloat_singleton = _BigFloat()
+
+
+@xp_capabilities(np_only=True)
+def boxcox_normmax(
+    x, brack=None, method='pearsonr', optimizer=None, *, ymax=_BigFloat_singleton
+):
+    """Compute optimal Box-Cox transform parameter for input data.
+
+    Parameters
+    ----------
+    x : array_like
+        Input array. All entries must be positive, finite, real numbers.
+    brack : 2-tuple, optional, default (-2.0, 2.0)
+         The starting interval for a downhill bracket search for the default
+         `optimize.brent` solver. Note that this is in most cases not
+         critical; the final result is allowed to be outside this bracket.
+         If `optimizer` is passed, `brack` must be None.
+    method : str, optional
+        The method to determine the optimal transform parameter (`boxcox`
+        ``lmbda`` parameter). Options are:
+
+        'pearsonr'  (default)
+            Maximizes the Pearson correlation coefficient between
+            ``y = boxcox(x)`` and the expected values for ``y`` if `x` would be
+            normally-distributed.
+
+        'mle'
+            Maximizes the log-likelihood `boxcox_llf`.  This is the method used
+            in `boxcox`.
+
+        'all'
+            Use all optimization methods available, and return all results.
+            Useful to compare different methods.
+    optimizer : callable, optional
+        `optimizer` is a callable that accepts one argument:
+
+        fun : callable
+            The objective function to be minimized. `fun` accepts one argument,
+            the Box-Cox transform parameter `lmbda`, and returns the value of
+            the function (e.g., the negative log-likelihood) at the provided
+            argument. The job of `optimizer` is to find the value of `lmbda`
+            that *minimizes* `fun`.
+
+        and returns an object, such as an instance of
+        `scipy.optimize.OptimizeResult`, which holds the optimal value of
+        `lmbda` in an attribute `x`.
+
+        See the example below or the documentation of
+        `scipy.optimize.minimize_scalar` for more information.
+    ymax : float, optional
+        The unconstrained optimal transform parameter may cause Box-Cox
+        transformed data to have extreme magnitude or even overflow.
+        This parameter constrains MLE optimization such that the magnitude
+        of the transformed `x` does not exceed `ymax`. The default is
+        the largest finite value of the floating-point dtype of `x`
+        (``float64`` if `x` is not of a floating-point dtype) divided by a
+        safety factor of 10000. If set to infinity,
+        `boxcox_normmax` returns the unconstrained optimal lambda.
+        Ignored when ``method='pearsonr'``.
+
+    Returns
+    -------
+    maxlog : float or ndarray
+        The optimal transform parameter found.  An array instead of a scalar
+        for ``method='all'``.
+
+    See Also
+    --------
+    boxcox, boxcox_llf, boxcox_normplot, scipy.optimize.minimize_scalar
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+
+    We can generate some data and determine the optimal ``lmbda`` in various
+    ways:
+
+    >>> rng = np.random.default_rng()
+    >>> x = stats.loggamma.rvs(5, size=30, random_state=rng) + 5
+    >>> y, lmax_mle = stats.boxcox(x)
+    >>> lmax_pearsonr = stats.boxcox_normmax(x)
+
+    >>> lmax_mle
+    2.217563431465757
+    >>> lmax_pearsonr
+    2.238318660200961
+    >>> stats.boxcox_normmax(x, method='all')
+    array([2.23831866, 2.21756343])
+
+    >>> fig = plt.figure()
+    >>> ax = fig.add_subplot(111)
+    >>> prob = stats.boxcox_normplot(x, -10, 10, plot=ax)
+    >>> ax.axvline(lmax_mle, color='r')
+    >>> ax.axvline(lmax_pearsonr, color='g', ls='--')
+
+    >>> plt.show()
+
+    Alternatively, we can define our own `optimizer` function. Suppose we
+    are only interested in values of `lmbda` on the interval [6, 7], we
+    want to use `scipy.optimize.minimize_scalar` with ``method='bounded'``,
+    and we want to use tighter tolerances when optimizing the log-likelihood
+    function. To do this, we define a function that accepts positional argument
+    `fun` and uses `scipy.optimize.minimize_scalar` to minimize `fun` subject
+    to the provided bounds and tolerances:
+
+    >>> from scipy import optimize
+    >>> options = {'xatol': 1e-12}  # absolute tolerance on `x`
+    >>> def optimizer(fun):
+    ...     return optimize.minimize_scalar(fun, bounds=(6, 7),
+    ...                                     method="bounded", options=options)
+    >>> stats.boxcox_normmax(x, optimizer=optimizer)
+    6.000000000
+    """
+    x = np.asarray(x)
+
+    # NOTE(review): this check accepts zeros (``x >= 0``) although the error
+    # message asks for "positive" numbers -- confirm which is intended.
+    if not np.all(np.isfinite(x) & (x >= 0)):
+        message = ("The `x` argument of `boxcox_normmax` must contain "
+                   "only positive, finite, real numbers.")
+        raise ValueError(message)
+
+    # `end_msg` is the tail of the warning issued below when the optimum has to
+    # be constrained; it differs for the default and a user-supplied `ymax`.
+    end_msg = "exceed specified `ymax`."
+    if ymax is _BigFloat_singleton:
+        dtype = x.dtype if np.issubdtype(x.dtype, np.floating) else np.float64
+        # 10000 is a safety factor because `special.boxcox` overflows prematurely.
+        ymax = np.finfo(dtype).max / 10000
+        end_msg = f"overflow in {dtype}."
+    elif ymax <= 0:
+        raise ValueError("`ymax` must be strictly positive")
+
+    # If optimizer is not given, define default 'brent' optimizer.
+    if optimizer is None:
+
+        # Set default value for `brack`.
+        if brack is None:
+            brack = (-2.0, 2.0)
+
+        def _optimizer(func, args):
+            return optimize.brent(func, args=args, brack=brack)
+
+    # Otherwise check optimizer.
+    else:
+        if not callable(optimizer):
+            raise ValueError("`optimizer` must be a callable")
+
+        if brack is not None:
+            raise ValueError("`brack` must be None if `optimizer` is given")
+
+        # `optimizer` is expected to return a `OptimizeResult` object, we here
+        # get the solution to the optimization problem.
+        def _optimizer(func, args):
+            def func_wrapped(x):
+                return func(x, *args)
+            # A result without an `x` attribute yields None, which is turned
+            # into a ValueError further down.
+            return getattr(optimizer(func_wrapped), 'x', None)
+
+    def _pearsonr(x):
+        osm_uniform = _calc_uniform_order_statistic_medians(len(x))
+        xvals = distributions.norm.ppf(osm_uniform)
+
+        def _eval_pearsonr(lmbda, xvals, samps):
+            # This function computes the x-axis values of the probability plot
+            # and computes a linear regression (including the correlation) and
+            # returns ``1 - r`` so that a minimization function maximizes the
+            # correlation.
+            y = boxcox(samps, lmbda)
+            yvals = np.sort(y)
+            r, prob = _stats_py.pearsonr(xvals, yvals)
+            return 1 - r
+
+        return _optimizer(_eval_pearsonr, args=(xvals, x))
+
+    def _mle(x):
+        def _eval_mle(lmb, data):
+            # function to minimize
+            return -boxcox_llf(lmb, data)
+
+        return _optimizer(_eval_mle, args=(x,))
+
+    def _all(x):
+        # Index 0 holds the 'pearsonr' result, index 1 the 'mle' result.
+        maxlog = np.empty(2, dtype=float)
+        maxlog[0] = _pearsonr(x)
+        maxlog[1] = _mle(x)
+        return maxlog
+
+    methods = {'pearsonr': _pearsonr,
+               'mle': _mle,
+               'all': _all}
+    if method not in methods.keys():
+        raise ValueError(f"Method {method} not recognized.")
+
+    optimfunc = methods[method]
+
+    res = optimfunc(x)
+
+    if res is None:
+        message = ("The `optimizer` argument of `boxcox_normmax` must return "
+                   "an object containing the optimal `lmbda` in attribute `x`.")
+        raise ValueError(message)
+    elif not np.isinf(ymax):  # adjust the final lambda
+        # NOTE(review): this branch does not look at `method`, so the `ymax`
+        # constraint is also applied to a 'pearsonr' result even though the
+        # docstring says `ymax` is ignored for that method -- confirm.
+        # x > 1, boxcox(x) > 0; x < 1, boxcox(x) < 0
+        xmax, xmin = np.max(x), np.min(x)
+        # `x_treme` is the data point whose transform has the largest magnitude.
+        if xmin >= 1:
+            x_treme = xmax
+        elif xmax <= 1:
+            x_treme = xmin
+        else:  # xmin < 1 < xmax
+            indicator = special.boxcox(xmax, res) > abs(special.boxcox(xmin, res))
+            if isinstance(res, np.ndarray):
+                indicator = indicator[1]  # select corresponds with 'mle'
+            x_treme = xmax if indicator else xmin
+
+        mask = abs(special.boxcox(x_treme, res)) > ymax
+        if np.any(mask):
+            message = (
+                f"The optimal lambda is {res}, but the returned lambda is the "
+                f"constrained optimum to ensure that the maximum or the minimum "
+                f"of the transformed data does not " + end_msg
+            )
+            warnings.warn(message, stacklevel=2)
+
+            # Return the constrained lambda to ensure the transformation
+            # does not cause overflow or exceed specified `ymax`
+            constrained_res = _boxcox_inv_lmbda(x_treme, ymax * np.sign(x_treme - 1))
+
+            # For method='all' only the entries that exceed `ymax` are replaced.
+            if isinstance(res, np.ndarray):
+                res[mask] = constrained_res
+            else:
+                res = constrained_res
+    return res
+
+
+def _normplot(method, x, la, lb, plot=None, N=80):
+    """Compute parameters for a Box-Cox or Yeo-Johnson normality plot,
+    optionally show it.
+
+    `method` selects the transform: ``'boxcox'`` uses `boxcox`, any other
+    value uses `yeojohnson`. Empty input is returned unchanged (a single
+    array rather than a ``(lmbdas, ppcc)`` pair).
+
+    See `boxcox_normplot` or `yeojohnson_normplot` for details.
+    """
+
+    if method == 'boxcox':
+        title = 'Box-Cox Normality Plot'
+        transform_func = boxcox
+    else:
+        title = 'Yeo-Johnson Normality Plot'
+        transform_func = yeojohnson
+
+    x = np.asarray(x)
+    # The empty-input shortcut comes before the validation of `la` and `lb`.
+    if x.size == 0:
+        return x
+
+    if lb <= la:
+        raise ValueError("`lb` has to be larger than `la`.")
+
+    if method == 'boxcox' and np.any(x <= 0):
+        raise ValueError("Data must be positive.")
+
+    lmbdas = np.linspace(la, lb, num=N)
+    # Zero-filled output array with the same shape and dtype as `lmbdas`.
+    ppcc = lmbdas * 0.0
+    for i, val in enumerate(lmbdas):
+        # Determine for each lmbda the square root of correlation coefficient
+        # of transformed x
+        z = transform_func(x, lmbda=val)
+        _, (_, _, r) = probplot(z, dist='norm', fit=True)
+        ppcc[i] = r
+
+    if plot is not None:
+        plot.plot(lmbdas, ppcc, 'x')
+        _add_axis_labels_title(plot, xlabel='$\\lambda$',
+                               ylabel='Prob Plot Corr. Coef.',
+                               title=title)
+
+    return lmbdas, ppcc
+
+
+@xp_capabilities(np_only=True)
+def boxcox_normplot(x, la, lb, plot=None, N=80):
+    """Compute parameters for a Box-Cox normality plot, optionally show it.
+
+    A Box-Cox normality plot shows graphically what the best transformation
+    parameter is to use in `boxcox` to obtain a distribution that is close
+    to normal.
+
+    Parameters
+    ----------
+    x : array_like
+        Input array.
+    la, lb : scalar
+        The lower and upper bounds for the ``lmbda`` values to pass to `boxcox`
+        for Box-Cox transformations.  These are also the limits of the
+        horizontal axis of the plot if that is generated.
+    plot : object, optional
+        If given, plots the probability plot correlation coefficients
+        against the ``lmbda`` values.
+        `plot` is an object that has to have methods "plot" and "text".
+        The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
+        or a custom object with the same methods.
+        Default is None, which means that no plot is created.
+    N : int, optional
+        Number of points on the horizontal axis (equally distributed from
+        `la` to `lb`).
+
+    Returns
+    -------
+    lmbdas : ndarray
+        The ``lmbda`` values for which a Box-Cox transform was done.
+    ppcc : ndarray
+        Probability Plot Correlation Coefficient, as obtained from `probplot`
+        when fitting the Box-Cox transformed input `x` against a normal
+        distribution.
+
+    See Also
+    --------
+    probplot, boxcox, boxcox_normmax, boxcox_llf, ppcc_max
+
+    Notes
+    -----
+    Even if `plot` is given, the figure is not shown or saved by
+    `boxcox_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')``
+    should be used after calling `boxcox_normplot`.
+
+    Examples
+    --------
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+
+    Generate some non-normally distributed data, and create a Box-Cox plot:
+
+    >>> x = stats.loggamma.rvs(5, size=500) + 5
+    >>> fig = plt.figure()
+    >>> ax = fig.add_subplot(111)
+    >>> prob = stats.boxcox_normplot(x, -20, 20, plot=ax)
+
+    Determine and plot the optimal ``lmbda`` to transform ``x`` and plot it in
+    the same plot:
+
+    >>> _, maxlog = stats.boxcox(x)
+    >>> ax.axvline(maxlog, color='r')
+
+    >>> plt.show()
+
+    """
+    # All of the work is shared with the Yeo-Johnson variant in `_normplot`.
+    return _normplot('boxcox', x, la, lb, plot, N)
+
+
+@xp_capabilities(np_only=True)
+def yeojohnson(x, lmbda=None):
+    r"""Return a dataset transformed by a Yeo-Johnson power transformation.
+
+    Parameters
+    ----------
+    x : ndarray
+        Input array.  Should be 1-dimensional.
+    lmbda : float, optional
+        If ``lmbda`` is ``None``, find the lambda that maximizes the
+        log-likelihood function and return it as the second output argument.
+        Otherwise the transformation is done for the given value.
+
+    Returns
+    -------
+    yeojohnson: ndarray
+        Yeo-Johnson power transformed array.
+        If `x` is empty, the empty input array is the only value returned.
+    maxlog : float, optional
+        If the `lmbda` parameter is None, the second returned argument is
+        the lambda that maximizes the log-likelihood function.
+
+    Raises
+    ------
+    ValueError
+        If `x` has a complex dtype.
+
+    See Also
+    --------
+    probplot, yeojohnson_normplot, yeojohnson_normmax, yeojohnson_llf, boxcox
+
+    Notes
+    -----
+    The Yeo-Johnson transform is given by:
+
+    .. math::
+
+        y =
+        \begin{cases}
+        \frac{(x + 1)^\lambda - 1}{\lambda},
+        &\text{for } x \geq 0, \lambda \neq 0
+        \\
+        \log(x + 1),
+        &\text{for } x \geq 0, \lambda = 0
+        \\
+        -\frac{(-x + 1)^{2 - \lambda} - 1}{2 - \lambda},
+        &\text{for } x < 0, \lambda \neq 2
+        \\
+        -\log(-x + 1),
+        &\text{for } x < 0, \lambda = 2
+        \end{cases}
+
+    Unlike `boxcox`, `yeojohnson` does not require the input data to be
+    positive.
+
+    .. versionadded:: 1.2.0
+
+
+    References
+    ----------
+    I. Yeo and R.A. Johnson, "A New Family of Power Transformations to
+    Improve Normality or Symmetry", Biometrika 87.4 (2000): 954-959.
+
+
+    Examples
+    --------
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+
+    We generate some random variates from a non-normal distribution and make a
+    probability plot for it, to show it is non-normal in the tails:
+
+    >>> fig = plt.figure()
+    >>> ax1 = fig.add_subplot(211)
+    >>> x = stats.loggamma.rvs(5, size=500) + 5
+    >>> prob = stats.probplot(x, dist=stats.norm, plot=ax1)
+    >>> ax1.set_xlabel('')
+    >>> ax1.set_title('Probplot against normal distribution')
+
+    We now use `yeojohnson` to transform the data so it's closest to normal:
+
+    >>> ax2 = fig.add_subplot(212)
+    >>> xt, lmbda = stats.yeojohnson(x)
+    >>> prob = stats.probplot(xt, dist=stats.norm, plot=ax2)
+    >>> ax2.set_title('Probplot after Yeo-Johnson transformation')
+
+    >>> plt.show()
+
+    """
+    x = np.asarray(x)
+    # Empty input is returned as is, without a `lmbda`, whatever `lmbda` is.
+    if x.size == 0:
+        return x
+
+    if np.issubdtype(x.dtype, np.complexfloating):
+        raise ValueError('Yeo-Johnson transformation is not defined for '
+                         'complex numbers.')
+
+    # Integer input is promoted to float64 before it is transformed.
+    if np.issubdtype(x.dtype, np.integer):
+        x = x.astype(np.float64, copy=False)
+
+    if lmbda is not None:
+        return _yeojohnson_transform(x, lmbda)
+
+    # if lmbda=None, find the lmbda that maximizes the log-likelihood function.
+    lmax = yeojohnson_normmax(x)
+    y = _yeojohnson_transform(x, lmax)
+
+    return y, lmax
+
+
+def _yeojohnson_transform(x, lmbda, xp=None):
+    """Returns `x` transformed by the Yeo-Johnson power transform with given
+    parameter `lmbda`.
+
+    `xp` is the array namespace to use; when None it is taken from `x`.
+    The special cases ``lmbda == 0`` and ``lmbda == 2`` are selected with a
+    tolerance of one machine epsilon of the result dtype.
+    """
+    xp = array_namespace(x) if xp is None else xp
+    dtype = xp_result_type(x, lmbda, force_floating=True, xp=xp)
+    eps = xp.finfo(dtype).eps
+    out = xp.zeros_like(x, dtype=dtype)
+    pos = x >= 0  # binary mask
+
+    # when x >= 0
+    if abs(lmbda) < eps:
+        # lmbda == 0 (within eps): log(x + 1)
+        out = xpx.at(out)[pos].set(xp.log1p(x[pos]))
+    else:  # lmbda != 0
+        # more stable version of: ((x + 1) ** lmbda - 1) / lmbda
+        out = xpx.at(out)[pos].set(xp.expm1(lmbda * xp.log1p(x[pos])) / lmbda)
+
+    # when x < 0
+    if abs(lmbda - 2) > eps:
+        # expm1/log1p form of: -((-x + 1) ** (2 - lmbda) - 1) / (2 - lmbda)
+        out = xpx.at(out)[~pos].set(
+            -xp.expm1((2 - lmbda) * xp.log1p(-x[~pos])) / (2 - lmbda))
+    else:  # lmbda == 2
+        out = xpx.at(out)[~pos].set(-xp.log1p(-x[~pos]))
+
+    return out
+
+
+@xp_capabilities(skip_backends=[("dask.array", "Dask can't broadcast nan shapes")])
+def yeojohnson_llf(lmb, data, *, axis=0, nan_policy='propagate', keepdims=False):
+    r"""The Yeo-Johnson log-likelihood function.
+
+    Parameters
+    ----------
+    lmb : scalar
+        Parameter for Yeo-Johnson transformation. See `yeojohnson` for
+        details.
+    data : array_like
+        Data to calculate Yeo-Johnson log-likelihood for.
+    axis : int, default: 0
+        If an int, the axis of the input along which to compute the statistic.
+        The statistic of each axis-slice (e.g. row) of the input will appear in a
+        corresponding element of the output.
+        If ``None``, the input will be raveled before computing the statistic.
+    nan_policy : {'propagate', 'omit', 'raise'}
+        Defines how to handle input NaNs.
+
+        - ``propagate``: if a NaN is present in the axis slice (e.g. row) along
+          which the  statistic is computed, the corresponding entry of the output
+          will be NaN.
+        - ``omit``: NaNs will be omitted when performing the calculation.
+          If insufficient data remains in the axis slice along which the
+          statistic is computed, the corresponding entry of the output will be
+          NaN.
+        - ``raise``: if a NaN is present, a ``ValueError`` will be raised.
+    keepdims : bool, default: False
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the input array.
+
+    Returns
+    -------
+    llf : float
+        Yeo-Johnson log-likelihood of `data` given `lmb`.
+
+    See Also
+    --------
+    yeojohnson, probplot, yeojohnson_normplot, yeojohnson_normmax
+
+    Notes
+    -----
+    The Yeo-Johnson log-likelihood function :math:`l` is defined here as
+
+    .. math::
+
+        l = -\frac{N}{2} \log(\hat{\sigma}^2) + (\lambda - 1)
+              \sum_i^N \text{sign}(x_i) \log(|x_i| + 1)
+
+    where :math:`N` is the number of data points :math:`x`=``data`` and
+    :math:`\hat{\sigma}^2` is the estimated variance of the Yeo-Johnson transformed
+    input data :math:`x`.
+    This corresponds to the *profile log-likelihood* of the original data :math:`x`
+    with some constant terms dropped.
+
+    .. versionadded:: 1.2.0
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+    >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes
+
+    Generate some random variates and calculate Yeo-Johnson log-likelihood
+    values for them for a range of ``lmbda`` values:
+
+    >>> x = stats.loggamma.rvs(5, loc=10, size=1000)
+    >>> lmbdas = np.linspace(-2, 10)
+    >>> llf = np.zeros(lmbdas.shape, dtype=float)
+    >>> for ii, lmbda in enumerate(lmbdas):
+    ...     llf[ii] = stats.yeojohnson_llf(lmbda, x)
+
+    Also find the optimal lmbda value with `yeojohnson`:
+
+    >>> x_most_normal, lmbda_optimal = stats.yeojohnson(x)
+
+    Plot the log-likelihood as function of lmbda.  Add the optimal lmbda as a
+    horizontal line to check that that's really the optimum:
+
+    >>> fig = plt.figure()
+    >>> ax = fig.add_subplot(111)
+    >>> ax.plot(lmbdas, llf, 'b.-')
+    >>> ax.axhline(stats.yeojohnson_llf(lmbda_optimal, x), color='r')
+    >>> ax.set_xlabel('lmbda parameter')
+    >>> ax.set_ylabel('Yeo-Johnson log-likelihood')
+
+    Now add some probability plots to show that where the log-likelihood is
+    maximized the data transformed with `yeojohnson` looks closest to normal:
+
+    >>> locs = [3, 10, 4]  # 'lower left', 'center', 'lower right'
+    >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs):
+    ...     xt = stats.yeojohnson(x, lmbda=lmbda)
+    ...     (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt)
+    ...     ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc)
+    ...     ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-')
+    ...     ax_inset.set_xticklabels([])
+    ...     ax_inset.set_yticklabels([])
+    ...     ax_inset.set_title(r'$\lambda=%1.2f$' % lmbda)
+
+    >>> plt.show()
+
+    """
+    # _axis_nan_policy decorator does not currently support these for lazy arrays.
+    # We want to run tests with lazy backends, so don't pass the arguments explicitly
+    # unless necessary.
+    kwargs = {}
+    if keepdims is not False:
+        kwargs['keepdims'] = keepdims
+    if nan_policy != 'propagate':
+        kwargs['nan_policy'] = nan_policy
+    res = _yeojohnson_llf(data, lmb=lmb, axis=axis, **kwargs)
+    # Index a 0-d result with an empty tuple; other results are returned as is.
+    return res[()] if res.ndim == 0 else res
+
+
+@_axis_nan_policy_factory(lambda x: x, n_outputs=1, default_axis=0,
+                          result_to_tuple=lambda x, _: (x,))
+def _yeojohnson_llf(data, *, lmb, axis=0):
+    """Evaluate the Yeo-Johnson profile log-likelihood of `data` along `axis`.
+
+    The value returned is
+    ``-n/2 * log(var(y)) + (lmb - 1) * sum(sign(x) * log1p(|x|))``, where ``y``
+    is `data` after the Yeo-Johnson transform with parameter `lmb` and ``n`` is
+    the number of observations along `axis`.
+    """
+    xp = array_namespace(data)
+    n_obs = data.shape[axis]
+
+    transformed = _yeojohnson_transform(data, lmb, xp=xp)
+    variance = xp.var(transformed, axis=axis)
+
+    # Taking the log of a vanishing variance would raise a RuntimeWarning, so
+    # the log is only evaluated where the variance is at least the smallest
+    # normal float; everywhere else the log-variance is set to -inf.
+    has_normal_variance = variance >= xp.finfo(variance.dtype).smallest_normal
+    log_variance = xpx.apply_where(
+        has_normal_variance, (variance,), xp.log, fill_value=-xp.inf
+    )
+
+    jacobian_sum = xp.sum(xp.sign(data) * xp.log1p(xp.abs(data)), axis=axis)
+    return -n_obs / 2 * log_variance + (lmb - 1) * jacobian_sum
+
+
+@xp_capabilities(np_only=True)
+def yeojohnson_normmax(x, brack=None):
+    """Compute optimal Yeo-Johnson transform parameter.
+
+    Compute optimal Yeo-Johnson transform parameter for input data, using
+    maximum likelihood estimation.
+
+    Parameters
+    ----------
+    x : array_like
+        Input array.
+    brack : 2-tuple, optional
+        The starting interval for a downhill bracket search with
+        `optimize.brent`. Note that this is in most cases not critical; the
+        final result is allowed to be outside this bracket. If None,
+        `optimize.fminbound` is used with bounds that avoid overflow.
+
+    Returns
+    -------
+    maxlog : float
+        The optimal transform parameter found. If every element of `x` is
+        zero, ``1.0`` is returned without running an optimizer.
+
+    Raises
+    ------
+    ValueError
+        If `x` contains non-finite values.
+
+    See Also
+    --------
+    yeojohnson, yeojohnson_llf, yeojohnson_normplot
+
+    Notes
+    -----
+    .. versionadded:: 1.2.0
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+
+    Generate some data and determine optimal ``lmbda``
+
+    >>> rng = np.random.default_rng()
+    >>> x = stats.loggamma.rvs(5, size=30, random_state=rng) + 5
+    >>> lmax = stats.yeojohnson_normmax(x)
+
+    >>> fig = plt.figure()
+    >>> ax = fig.add_subplot(111)
+    >>> prob = stats.yeojohnson_normplot(x, -10, 10, plot=ax)
+    >>> ax.axvline(lmax, color='r')
+
+    >>> plt.show()
+
+    """
+    def _neg_llf(lmbda, data):
+        # Objective handed to the minimizers: the negated log-likelihood.
+        llf = np.asarray(yeojohnson_llf(lmbda, data))
+        # reject likelihoods that are inf which are likely due to small
+        # variance in the transformed space
+        llf[np.isinf(llf)] = -np.inf
+        return -llf
+
+    with np.errstate(invalid='ignore'):
+        # Convert up front so that the element-wise checks below also work for
+        # lists and tuples: for a plain list, ``x == 0`` is a single ``False``
+        # rather than an element-wise comparison, which used to bypass the
+        # all-zeros shortcut.
+        x = np.asarray(x)
+        if not np.all(np.isfinite(x)):
+            raise ValueError('Yeo-Johnson input must be finite.')
+        if np.all(x == 0):
+            return 1.0
+        if brack is not None:
+            return optimize.brent(_neg_llf, brack=brack, args=(x,))
+        dtype = x.dtype if np.issubdtype(x.dtype, np.floating) else np.float64
+        # Allow values up to 20 times the maximum observed value to be safely
+        # transformed without over- or underflow.
+        log1p_max_x = np.log1p(20 * np.max(np.abs(x)))
+        # Use half of floating point's exponent range to allow safe computation
+        # of the variance of the transformed data.
+        log_eps = np.log(np.finfo(dtype).eps)
+        log_tiny_float = (np.log(np.finfo(dtype).tiny) - log_eps) / 2
+        log_max_float = (np.log(np.finfo(dtype).max) + log_eps) / 2
+        # Compute the bounds by approximating the inverse of the Yeo-Johnson
+        # transform on the smallest and largest floating point exponents, given
+        # the largest data we expect to observe. See [1] for further details.
+        # [1] https://github.com/scipy/scipy/pull/18852#issuecomment-1630286174
+        lb = log_tiny_float / log1p_max_x
+        ub = log_max_float / log1p_max_x
+        # Convert the bounds if all or some of the data is negative.
+        if np.all(x < 0):
+            lb, ub = 2 - ub, 2 - lb
+        elif np.any(x < 0):
+            lb, ub = max(2 - ub, lb), min(2 - lb, ub)
+        # Match `optimize.brent`'s tolerance.
+        tol_brent = 1.48e-08
+        return optimize.fminbound(_neg_llf, lb, ub, args=(x,), xtol=tol_brent)
+
+
+@xp_capabilities(np_only=True)
+def yeojohnson_normplot(x, la, lb, plot=None, N=80):
+    """Compute parameters for a Yeo-Johnson normality plot, optionally show it.
+
+    A Yeo-Johnson normality plot shows graphically what the best
+    transformation parameter is to use in `yeojohnson` to obtain a
+    distribution that is close to normal.
+
+    Parameters
+    ----------
+    x : array_like
+        Input array.
+    la, lb : scalar
+        The lower and upper bounds for the ``lmbda`` values to pass to
+        `yeojohnson` for Yeo-Johnson transformations. These are also the
+        limits of the horizontal axis of the plot if that is generated.
+    plot : object, optional
+        If given, plots the quantiles and least squares fit.
+        `plot` is an object that has to have methods "plot" and "text".
+        The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
+        or a custom object with the same methods.
+        Default is None, which means that no plot is created.
+    N : int, optional
+        Number of points on the horizontal axis (equally distributed from
+        `la` to `lb`).
+
+    Returns
+    -------
+    lmbdas : ndarray
+        The ``lmbda`` values for which a Yeo-Johnson transform was done.
+    ppcc : ndarray
+        Probability Plot Correlation Coefficient, as obtained from `probplot`
+        when fitting the Yeo-Johnson transformed input `x` against a normal
+        distribution.
+
+    See Also
+    --------
+    probplot, yeojohnson, yeojohnson_normmax, yeojohnson_llf, ppcc_max
+
+    Notes
+    -----
+    Even if `plot` is given, the figure is not shown or saved by
+    `yeojohnson_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')``
+    should be used after calling `yeojohnson_normplot`.
+
+    .. versionadded:: 1.2.0
+
+    Examples
+    --------
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+
+    Generate some non-normally distributed data, and create a Yeo-Johnson plot:
+
+    >>> x = stats.loggamma.rvs(5, size=500) + 5
+    >>> fig = plt.figure()
+    >>> ax = fig.add_subplot(111)
+    >>> prob = stats.yeojohnson_normplot(x, -20, 20, plot=ax)
+
+    Determine and plot the optimal ``lmbda`` to transform ``x`` and plot it in
+    the same plot:
+
+    >>> _, maxlog = stats.yeojohnson(x)
+    >>> ax.axvline(maxlog, color='r')
+
+    >>> plt.show()
+
+    """
+    # All of the work is delegated to the shared `_normplot` helper, which is
+    # told to use the Yeo-Johnson method by name.
+    return _normplot('yeojohnson', x, la, lb, plot, N)
+
+
+# Result container returned by `shapiro`: (statistic, pvalue).
+ShapiroResult = namedtuple('ShapiroResult', ('statistic', 'pvalue'))
+
+
+@xp_capabilities(np_only=True)
+@_axis_nan_policy_factory(ShapiroResult, n_samples=1, too_small=2, default_axis=None)
+def shapiro(x):
+    r"""Perform the Shapiro-Wilk test for normality.
+
+    The Shapiro-Wilk test tests the null hypothesis that the
+    data was drawn from a normal distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Array of sample data. Must contain at least three observations.
+
+    Returns
+    -------
+    statistic : float
+        The test statistic.
+    pvalue : float
+        The p-value for the hypothesis test.
+
+    See Also
+    --------
+    anderson : The Anderson-Darling test for normality
+    kstest : The Kolmogorov-Smirnov test for goodness of fit.
+    :ref:`hypothesis_shapiro` : Extended example
+
+    Notes
+    -----
+    The algorithm used is described in [4]_ but censoring parameters as
+    described are not implemented. For N > 5000 the W test statistic is
+    accurate, but the p-value may not be.
+
+    References
+    ----------
+    .. [1] https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm
+           :doi:`10.18434/M32189`
+    .. [2] Shapiro, S. S. & Wilk, M.B, "An analysis of variance test for
+           normality (complete samples)", Biometrika, 1965, Vol. 52,
+           pp. 591-611, :doi:`10.2307/2333709`
+    .. [3] Razali, N. M. & Wah, Y. B., "Power comparisons of Shapiro-Wilk,
+           Kolmogorov-Smirnov, Lilliefors and Anderson-Darling tests", Journal
+           of Statistical Modeling and Analytics, 2011, Vol. 2, pp. 21-33.
+    .. [4] Royston P., "Remark AS R94: A Remark on Algorithm AS 181: The
+           W-test for Normality", 1995, Applied Statistics, Vol. 44,
+           :doi:`10.2307/2986146`
+
+    Examples
+    --------
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng()
+    >>> x = stats.norm.rvs(loc=5, scale=3, size=100, random_state=rng)
+    >>> shapiro_test = stats.shapiro(x)
+    >>> shapiro_test
+    ShapiroResult(statistic=0.9813305735588074, pvalue=0.16855233907699585)
+    >>> shapiro_test.statistic
+    0.9813305735588074
+    >>> shapiro_test.pvalue
+    0.16855233907699585
+
+    For a more detailed example, see :ref:`hypothesis_shapiro`.
+    """
+    # Flatten to a 1-D float64 array; `astype` makes a copy of the input.
+    x = np.ravel(x).astype(np.float64)
+
+    N = len(x)
+    if N < 3:
+        raise ValueError("Data must be at least length 3.")
+
+    # Work buffer of length N//2 handed to `swilk` together with `init = 0`.
+    # NOTE(review): presumably `swilk` fills `a` with the test coefficients
+    # when `init` is 0 -- confirm against the `swilk` implementation.
+    a = zeros(N//2, dtype=np.float64)
+    init = 0
+
+    y = sort(x)
+    # NOTE(review): `x` is not sorted at this point, so `x[N//2]` is the
+    # element in the middle *position* of the input, not necessarily the
+    # median of `y`; see gh-15777 for why a shift is applied at all.
+    y -= x[N//2]  # subtract the median (or a nearby value); see gh-15777
+
+    w, pw, ifault = swilk(y, a, init)
+    # Any fault code other than 0 or 2 is reported as a range-zero warning.
+    if ifault not in [0, 2]:
+        warnings.warn("scipy.stats.shapiro: Input data has range zero. The"
+                      " results may not be accurate.", stacklevel=2)
+    if N > 5000:
+        warnings.warn("scipy.stats.shapiro: For N > 5000, computed p-value "
+                      f"may not be accurate. Current N is {N}.",
+                      stacklevel=2)
+
+    # `w` and `pw` are always Python floats, which are double precision.
+    # We want to ensure that they are NumPy floats, so until dtypes are
+    # respected, we can explicitly convert each to float64 (faster than
+    # `np.array([w, pw])`).
+    return ShapiroResult(np.float64(w), np.float64(pw))
+
+
+# Tabulated critical values of the Anderson-Darling statistic used by
+# `anderson`. Bracketed numbers refer to the References section of the
+# `anderson` docstring.
+# Values from [8]
+_Avals_norm = array([0.561, 0.631, 0.752, 0.873, 1.035])
+_Avals_expon = array([0.916, 1.062, 1.321, 1.591, 1.959])
+# From Stephens, M A, "Goodness of Fit for the Extreme Value Distribution",
+#             Biometrika, Vol. 64, Issue 3, Dec. 1977, pp 583-588.
+_Avals_gumbel = array([0.474, 0.637, 0.757, 0.877, 1.038])
+# From Stephens, M A, "Tests of Fit for the Logistic Distribution Based
+#             on the Empirical Distribution Function.", Biometrika,
+#             Vol. 66, Issue 3, Dec. 1979, pp 591-595.
+_Avals_logistic = array([0.426, 0.563, 0.660, 0.769, 0.906, 1.010])
+# From Richard A. Lockhart and Michael A. Stephens "Estimation and Tests of
+#             Fit for the Three-Parameter Weibull Distribution"
+#             Journal of the Royal Statistical Society.Series B(Methodological)
+#             Vol. 56, No. 3 (1994), pp. 491-500, table 1. Keys are c*100
+# One row per value of `c` in `_cvals_weibull` (0, 0.05, ..., 0.5); one column
+# per entry of the `sig` array used in the 'weibull_min' branch of `anderson`.
+# NOTE(review): the second entry of the next-to-last row (0.469) breaks the
+# otherwise smooth progression of that column (..., 0.438, 0.448, 0.469,
+# 0.472) -- verify against the published table.
+_Avals_weibull = [[0.292, 0.395, 0.467, 0.522, 0.617, 0.711, 0.836, 0.931],
+                  [0.295, 0.399, 0.471, 0.527, 0.623, 0.719, 0.845, 0.941],
+                  [0.298, 0.403, 0.476, 0.534, 0.631, 0.728, 0.856, 0.954],
+                  [0.301, 0.408, 0.483, 0.541, 0.640, 0.738, 0.869, 0.969],
+                  [0.305, 0.414, 0.490, 0.549, 0.650, 0.751, 0.885, 0.986],
+                  [0.309, 0.421, 0.498, 0.559, 0.662, 0.765, 0.902, 1.007],
+                  [0.314, 0.429, 0.508, 0.570, 0.676, 0.782, 0.923, 1.030],
+                  [0.320, 0.438, 0.519, 0.583, 0.692, 0.802, 0.947, 1.057],
+                  [0.327, 0.448, 0.532, 0.598, 0.711, 0.824, 0.974, 1.089],
+                  [0.334, 0.469, 0.547, 0.615, 0.732, 0.850, 1.006, 1.125],
+                  [0.342, 0.472, 0.563, 0.636, 0.757, 0.879, 1.043, 1.167]]
+_Avals_weibull = np.array(_Avals_weibull)
+_cvals_weibull = np.linspace(0, 0.5, 11)
+# Linear interpolation of the table rows in `c`. Because `bounds_error` is
+# False, a `c` outside [0, 0.5] does not raise; the last row (c = 0.5) is
+# returned for it instead.
+_get_As_weibull = interpolate.interp1d(_cvals_weibull, _Avals_weibull.T,
+                                       kind='linear', bounds_error=False,
+                                       fill_value=_Avals_weibull[-1])
+
+
+def _weibull_fit_check(params, x):
+    """Validate and polish a three-parameter Weibull maximum likelihood fit.
+
+    Parameters
+    ----------
+    params : sequence of three floats
+        Shape, location and scale of the initial fit, in that order. The
+        shape is called ``m`` here to be consistent with [7] and to avoid
+        confusion with ``c`` as defined in [7].
+    x : ndarray
+        The data the distribution was fit to.
+
+    Returns
+    -------
+    m, u, s : float
+        Shape and location solving the first order necessary conditions of
+        the MLE, and the scale implied by them ([7] Equation 7).
+
+    Raises
+    ------
+    ValueError
+        If the initial location coincides with ``min(x)``, if the initial
+        shape is below 1, or if the first order conditions cannot be solved.
+    """
+    n_obs = len(x)
+    shape0, loc0, _ = params  # the scale is recomputed from the refined fit
+
+    def first_order_conditions(shape_loc):
+        # Partial derivatives of the NLLF with the scale at its optimum:
+        # w.r.t. shape ([7] Equation 5) and w.r.t. location ([7] Equation 6).
+        m, u = shape_loc
+        xu = x - u
+        d_shape = (1/m - (xu**m*np.log(xu)).sum()/(xu**m).sum()
+                   + np.log(xu).sum()/n_obs)
+        d_loc = (m-1)/m*(xu**-1).sum() - n_obs*(xu**(m-1)).sum()/(xu**m).sum()
+        return [d_shape, d_loc]
+
+    suggestion = ("Maximum likelihood estimation is known to be challenging "
+                  "for the three-parameter Weibull distribution. Consider "
+                  "performing a custom goodness-of-fit test using "
+                  "`scipy.stats.monte_carlo_test`.")
+
+    # The critical values provided by [7] don't seem to control the Type I
+    # error rate when the fit sits on this boundary, so refuse to continue.
+    # NOTE(review): the check is `shape < 1` while the message below says
+    # "less than 2" -- confirm which of the two is intended.
+    if np.allclose(loc0, np.min(x)) or shape0 < 1:
+        raise ValueError(
+            "Maximum likelihood estimation has converged to "
+            "a solution in which the location is equal to the minimum "
+            "of the data, the shape parameter is less than 2, or both. "
+            "The table of critical values in [7] does not "
+            "include this case. " + suggestion
+        )
+
+    try:
+        # Refine the MLE / verify that the first-order necessary conditions
+        # hold; floating point overflow or invalid operations during the
+        # solve are turned into exceptions and handled below.
+        with np.errstate(over='raise', invalid='raise'):
+            res = optimize.root(first_order_conditions, params[:-1])
+
+        if not res.success:
+            raise ValueError(
+                "Solution of MLE first-order conditions failed: "
+                f"{res.message}. `anderson` cannot continue. " + suggestion
+            )
+
+    except (FloatingPointError, ValueError) as e:
+        message = ("An error occurred while fitting the Weibull distribution "
+                   "to the data, so `anderson` cannot continue. " + suggestion)
+        raise ValueError(message) from e
+
+    m, u = res.x
+    # Scale expressed through shape and location; see [7] Equation 7.
+    s = ((x-u)**m/n_obs).sum()**(1/m)
+    return m, u, s
+
+
+# Result type returned by `anderson` when `method` is not given. It is built
+# with `_make_tuple_bunch` from two name lists: 'statistic',
+# 'critical_values' and 'significance_level' in the first, 'fit_result' in
+# the second.
+AndersonResult = _make_tuple_bunch('AndersonResult',
+                                   ['statistic', 'critical_values',
+                                    'significance_level'], ['fit_result'])
+
+
+# Text of the FutureWarning emitted by `anderson` when `method` is not given.
+# The literal is wrapped for readability only: every newline is replaced by a
+# single space, so its lines must not carry leading or trailing whitespace.
+# The attribute names quoted here match the fields of `AndersonResult`.
+_anderson_warning_message = (
+"""As of SciPy 1.17, users must choose a p-value calculation method by providing the
+`method` parameter. `method='interpolate'` interpolates the p-value from pre-calculated
+tables; `method` may also be an instance of `MonteCarloMethod` to approximate the
+p-value via Monte Carlo simulation. When `method` is specified, the result object will
+include a `pvalue` attribute and not attributes `critical_values`,
+`significance_level`, or `fit_result`. Beginning in 1.19.0, these other attributes will
+no longer be available, and a p-value will always be computed according to one of the
+available `method` options.""".replace('\n', ' '))
+
+
+@xp_capabilities(np_only=True)
+def anderson(x, dist='norm', *, method=None):
+    """Anderson-Darling test for data coming from a particular distribution.
+
+    The Anderson-Darling test tests the null hypothesis that a sample is
+    drawn from a population that follows a particular distribution.
+    For the Anderson-Darling test, the critical values depend on
+    which distribution is being tested against.  This function works
+    for normal, exponential, logistic, weibull_min, or Gumbel (Extreme Value
+    Type I) distributions.
+
+    Parameters
+    ----------
+    x : array_like
+        Array of sample data.
+    dist : {'norm', 'expon', 'logistic', 'gumbel', 'gumbel_l', 'gumbel_r', 'extreme1', 'weibull_min'}, optional
+        The type of distribution to test against.  The default is 'norm'.
+        The names 'extreme1', 'gumbel_l' and 'gumbel' are synonyms for the
+        same distribution.
+    method : str or instance of `MonteCarloMethod`, optional
+        Defines the method used to compute the p-value.
+        If `method` is ``"interpolate"``, the p-value is interpolated from
+        pre-calculated tables.
+        If `method` is an instance of `MonteCarloMethod`, the p-value is computed using
+        `scipy.stats.monte_carlo_test` with the provided configuration options and other
+        appropriate settings.
+
+        .. versionadded:: 1.17.0
+            If `method` is not specified, `anderson` will emit a ``FutureWarning``
+            specifying that the user must opt into a p-value calculation method.
+            When `method` is specified, the object returned will include a ``pvalue``
+            attribute, but no ``critical_values``, ``significance_level``, or
+            ``fit_result`` attributes. Beginning in 1.19.0, these other attributes will
+            no longer be available, and a p-value will always be computed according to
+            one of the available `method` options.
+
+    Returns
+    -------
+    result : AndersonResult
+        If `method` is provided, this is an object with the following attributes:
+
+        statistic : float
+            The Anderson-Darling test statistic.
+        pvalue: float
+            The p-value corresponding with the test statistic, calculated according to
+            the specified `method`.
+
+        If `method` is unspecified, this is an object with the following attributes:
+
+        statistic : float
+            The Anderson-Darling test statistic.
+        critical_values : list
+            The critical values for this distribution.
+        significance_level : list
+            The significance levels for the corresponding critical values
+            in percents.  The function returns critical values for a
+            differing set of significance levels depending on the
+            distribution that is being tested against.
+        fit_result : `~scipy.stats._result_classes.FitResult`
+            An object containing the results of fitting the distribution to
+            the data.
+
+        .. deprecated:: 1.17.0
+            The tuple-unpacking behavior of the return object and attributes
+            ``critical_values``, ``significance_level``, and ``fit_result`` are
+            deprecated. Beginning in SciPy 1.19.0, these features will no longer be
+            available, and the object returned will have attributes ``statistic`` and
+            ``pvalue``.
+
+    See Also
+    --------
+    kstest : The Kolmogorov-Smirnov test for goodness-of-fit.
+
+    Notes
+    -----
+    Critical values provided when `method` is unspecified are for the following
+    significance levels:
+
+    normal/exponential
+        15%, 10%, 5%, 2.5%, 1%
+    logistic
+        25%, 10%, 5%, 2.5%, 1%, 0.5%
+    gumbel_l / gumbel_r
+        25%, 10%, 5%, 2.5%, 1%
+    weibull_min
+        50%, 25%, 15%, 10%, 5%, 2.5%, 1%, 0.5%
+
+    If the returned statistic is larger than these critical values then
+    for the corresponding significance level, the null hypothesis that
+    the data come from the chosen distribution can be rejected.
+    The returned statistic is referred to as 'A2' in the references.
+
+    For `weibull_min`, maximum likelihood estimation is known to be
+    challenging. If the test returns successfully, then the first order
+    conditions for a maximum likelihood estimate have been verified and
+    the critical values correspond relatively well to the significance levels,
+    provided that the sample is sufficiently large (>10 observations [7]).
+    However, for some data - especially data with no left tail - `anderson`
+    is likely to result in an error message. In this case, consider
+    performing a custom goodness of fit test using
+    `scipy.stats.monte_carlo_test`.
+
+    References
+    ----------
+    .. [1] https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm
+    .. [2] Stephens, M. A. (1974). EDF Statistics for Goodness of Fit and
+           Some Comparisons, Journal of the American Statistical Association,
+           Vol. 69, pp. 730-737.
+    .. [3] Stephens, M. A. (1976). Asymptotic Results for Goodness-of-Fit
+           Statistics with Unknown Parameters, Annals of Statistics, Vol. 4,
+           pp. 357-369.
+    .. [4] Stephens, M. A. (1977). Goodness of Fit for the Extreme Value
+           Distribution, Biometrika, Vol. 64, pp. 583-588.
+    .. [5] Stephens, M. A. (1977). Goodness of Fit with Special Reference
+           to Tests for Exponentiality , Technical Report No. 262,
+           Department of Statistics, Stanford University, Stanford, CA.
+    .. [6] Stephens, M. A. (1979). Tests of Fit for the Logistic Distribution
+           Based on the Empirical Distribution Function, Biometrika, Vol. 66,
+           pp. 591-595.
+    .. [7] Richard A. Lockhart and Michael A. Stephens "Estimation and Tests of
+           Fit for the Three-Parameter Weibull Distribution"
+           Journal of the Royal Statistical Society.Series B(Methodological)
+           Vol. 56, No. 3 (1994), pp. 491-500, Table 0.
+    .. [8] D'Agostino, Ralph B. (1986). "Tests for the Normal Distribution".
+           In: Goodness-of-Fit Techniques. Ed. by Ralph B. D'Agostino and
+           Michael A. Stephens. New York: Marcel Dekker, pp. 122-141. ISBN:
+           0-8247-7487-6.
+
+    Examples
+    --------
+    Test the null hypothesis that a random sample was drawn from a normal
+    distribution (with unspecified mean and standard deviation).
+
+    >>> import numpy as np
+    >>> from scipy.stats import anderson
+    >>> rng = np.random.default_rng(9781234521)
+    >>> data = rng.random(size=35)
+    >>> res = anderson(data, dist='norm', method='interpolate')
+    >>> res.statistic
+    np.float64(0.9887620209957291)
+    >>> res.pvalue
+    np.float64(0.012111200538380142)
+
+    The p-value is approximately 0.012, so the null hypothesis may be rejected
+    at a significance level of 2.5%, but not at a significance level of 1%.
+
+    """ # numpy/numpydoc#87  # noqa: E501
+    # Normalize the distribution name; 'extreme1' and 'gumbel' are aliases of
+    # 'gumbel_l'.
+    dist = dist.lower()
+    if dist in {'extreme1', 'gumbel'}:
+        dist = 'gumbel_l'
+    dists = {'norm', 'expon', 'gumbel_l',
+             'gumbel_r', 'logistic', 'weibull_min'}
+
+    if dist not in dists:
+        raise ValueError(f"Invalid distribution; dist must be in {dists}.")
+    y = sort(x)
+    xbar = np.mean(x, axis=0)
+    N = len(y)
+    # Each branch below fits the distribution, evaluates the fitted log-CDF
+    # and log-SF at the sorted data, and selects the matching table of
+    # significance levels (`sig`) and critical values (`critical`).
+    if dist == 'norm':
+        s = np.std(x, ddof=1, axis=0)
+        w = (y - xbar) / s
+        fit_params = xbar, s
+        logcdf = distributions.norm.logcdf(w)
+        logsf = distributions.norm.logsf(w)
+        sig = array([15, 10, 5, 2.5, 1])
+        critical = around(_Avals_norm / (1.0 + 0.75/N + 2.25/N/N), 3)
+    elif dist == 'expon':
+        # Location is fixed at 0; the scale is the sample mean.
+        w = y / xbar
+        fit_params = 0, xbar
+        logcdf = distributions.expon.logcdf(w)
+        logsf = distributions.expon.logsf(w)
+        sig = array([15, 10, 5, 2.5, 1])
+        critical = around(_Avals_expon / (1.0 + 0.6/N), 3)
+    elif dist == 'logistic':
+        def rootfunc(ab, xj, N):
+            # System of two equations in (a, b) = (location, scale) that is
+            # handed to `fsolve` below.
+            a, b = ab
+            tmp = (xj - a) / b
+            tmp2 = exp(tmp)
+            val = [np.sum(1.0/(1+tmp2), axis=0) - 0.5*N,
+                   np.sum(tmp*(1.0-tmp2)/(1+tmp2), axis=0) + N]
+            return array(val)
+
+        # Start the solver from the sample mean and standard deviation.
+        sol0 = array([xbar, np.std(x, ddof=1, axis=0)])
+        sol = optimize.fsolve(rootfunc, sol0, args=(x, N), xtol=1e-5)
+        w = (y - sol[0]) / sol[1]
+        fit_params = sol
+        logcdf = distributions.logistic.logcdf(w)
+        logsf = distributions.logistic.logsf(w)
+        sig = array([25, 10, 5, 2.5, 1, 0.5])
+        critical = around(_Avals_logistic / (1.0 + 0.25/N), 3)
+    elif dist == 'gumbel_r':
+        # `xbar` is reused here for the fitted location, not the sample mean.
+        xbar, s = distributions.gumbel_r.fit(x)
+        w = (y - xbar) / s
+        fit_params = xbar, s
+        logcdf = distributions.gumbel_r.logcdf(w)
+        logsf = distributions.gumbel_r.logsf(w)
+        sig = array([25, 10, 5, 2.5, 1])
+        critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3)
+    elif dist == 'gumbel_l':
+        # `xbar` is reused here for the fitted location, not the sample mean.
+        xbar, s = distributions.gumbel_l.fit(x)
+        w = (y - xbar) / s
+        fit_params = xbar, s
+        logcdf = distributions.gumbel_l.logcdf(w)
+        logsf = distributions.gumbel_l.logsf(w)
+        sig = array([25, 10, 5, 2.5, 1])
+        critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3)
+    elif dist == 'weibull_min':
+        message = ("Critical values of the test statistic are given for the "
+                   "asymptotic distribution. These may not be accurate for "
+                   "samples with fewer than 10 observations. Consider using "
+                   "`scipy.stats.monte_carlo_test`.")
+        if N < 10:
+            warnings.warn(message, stacklevel=2)
+        # [7] writes our 'c' as 'm', and they write `c = 1/m`. Use their names.
+        m, loc, scale = distributions.weibull_min.fit(y)
+        m, loc, scale = _weibull_fit_check((m, loc, scale), y)
+        fit_params = m, loc, scale
+        logcdf = stats.weibull_min(*fit_params).logcdf(y)
+        logsf = stats.weibull_min(*fit_params).logsf(y)
+        c = 1 / m  # m and c are as used in [7]
+        # Unlike the other branches, `sig` holds fractions that are one minus
+        # the significance level; the 'interpolate' path below converts them
+        # with `1 - sig` instead of dividing by 100.
+        sig = array([0.5, 0.75, 0.85, 0.9, 0.95, 0.975, 0.99, 0.995])
+        critical = _get_As_weibull(c)
+        # Goodness-of-fit tests should only be used to provide evidence
+        # _against_ the null hypothesis. Be conservative and round up.
+        critical = np.round(critical + 0.0005, decimals=3)
+
+    # Anderson-Darling statistic computed from the fitted log-CDF and the
+    # reversed log-SF of the sorted observations.
+    i = arange(1, N + 1)
+    A2 = -N - np.sum((2*i - 1.0) / N * (logcdf + logsf[::-1]), axis=0)
+
+    # FitResult initializer expects an optimize result, so let's work with it
+    message = '`anderson` successfully fit the distribution to the data.'
+    res = optimize.OptimizeResult(success=True, message=message)
+    res.x = np.array(fit_params)
+    fit_result = FitResult(getattr(distributions, dist), y,
+                           discrete=False, res=res)
+
+    # Legacy behavior: no p-value, return the tables and the fit instead.
+    if method is None:
+        warnings.warn(_anderson_warning_message, FutureWarning, stacklevel=2)
+        return AndersonResult(A2, critical, sig, fit_result=fit_result)
+
+    if method == 'interpolate':
+        # Bring `sig` onto a 0-1 significance scale, then interpolate
+        # linearly between the tabulated critical values.
+        sig = 1 - sig if dist == 'weibull_min' else sig / 100
+        pvalue = np.interp(A2, critical, sig)
+    elif isinstance(method, stats.MonteCarloMethod):
+        pvalue = _anderson_simulate_pvalue(x, dist, method)
+    else:
+        message = ("`method` must be either 'interpolate' or "
+                   "an instance of `MonteCarloMethod`.")
+        raise ValueError(message)
+    return SignificanceResult(statistic=A2, pvalue=pvalue)
+
+
+def _anderson_simulate_pvalue(x, dist, method):
+    """Approximate the Anderson-Darling p-value by Monte Carlo simulation.
+
+    The options stored on the `MonteCarloMethod` instance `method` are
+    forwarded to `scipy.stats.goodness_of_fit` (with ``statistic='ad'``), and
+    the p-value of that test is returned. A truthy ``rvs`` or ``batch``
+    option is dropped with a ``UserWarning``; ``n_resamples`` is passed on as
+    ``n_mc_samples``. For ``dist == 'expon'`` the location is held at 0.
+    """
+    ignored_template = (
+        "The `___` attribute of a `MonteCarloMethod` object passed as the "
+        "`method` parameter of `scipy.stats.anderson` is ignored."
+    )
+
+    options = method._asdict()
+    for attr in ('rvs', 'batch'):
+        # Always remove the option; only complain when it was actually set.
+        if options.pop(attr, False):
+            warnings.warn(ignored_template.replace('___', attr),
+                          UserWarning, stacklevel=3)
+    options['n_mc_samples'] = options.pop('n_resamples')
+
+    fixed = {'known_params': {'loc': 0}} if dist == 'expon' else {}
+    family = getattr(stats, dist)
+    outcome = stats.goodness_of_fit(family, x, statistic='ad', **fixed, **options)
+    return outcome.pvalue
+
+
+def _anderson_ksamp_continuous(samples, Z, Zstar, k, n, N):
+    """Evaluate the k-sample statistic A2kN, equation 3 of Scholz & Stephens.
+
+    Parameters
+    ----------
+    samples : sequence of 1-D array_like
+        The individual samples.
+    Z : array_like
+        All observations pooled together, in sorted order.
+    Zstar : array_like
+        Sorted unique observations. Not used by this variant; accepted so
+        that all variants share one signature.
+    k : int
+        Number of samples.
+    n : array_like
+        Size of each sample.
+    N : int
+        Total number of observations.
+
+    Returns
+    -------
+    A2kN : float
+        The A2kN statistic of Scholz and Stephens 1987.
+
+    """
+    ranks = np.arange(1, N)
+    weights = ranks * (N - ranks)
+    # The largest pooled observation is left out, matching ranks 1..N-1.
+    pooled = Z[:-1]
+
+    total = 0.
+    for idx in range(k):
+        size = n[idx]
+        # Number of observations of this sample that are <= each pooled value.
+        counts = np.sort(samples[idx]).searchsorted(pooled, side='right')
+        total += ((N*counts - ranks*size)**2 / weights).sum() / size
+    return total / N
+
+
+def _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N):
+    """Evaluate the midrank statistic A2akN, equation 7 of Scholz & Stephens.
+
+    Parameters
+    ----------
+    samples : sequence of 1-D array_like
+        The individual samples.
+    Z : array_like
+        All observations pooled together, in sorted order.
+    Zstar : array_like
+        Sorted unique observations.
+    k : int
+        Number of samples.
+    n : array_like
+        Size of each sample.
+    N : int
+        Total number of observations.
+
+    Returns
+    -------
+    A2akN : float
+        The A2akN statistic of Scholz and Stephens 1987.
+
+    """
+    first_pos = Z.searchsorted(Zstar, 'left')
+    # Multiplicity of each unique value in the pooled sample; when every
+    # observation is distinct this is simply 1.
+    if N == Zstar.size:
+        mult = 1.
+    else:
+        mult = Z.searchsorted(Zstar, 'right') - first_pos
+    mid = first_pos + mult / 2.
+    denom = mid*(N - mid) - N*mult/4.
+
+    total = 0.
+    for idx in range(k):
+        ordered = np.sort(samples[idx])
+        upper = ordered.searchsorted(Zstar, side='right')
+        ties = upper - ordered.searchsorted(Zstar, 'left')
+        # Count of sample values <= each unique value, with observations
+        # equal to it counted as one half.
+        counts = upper.astype(float) - ties / 2.
+        terms = mult / float(N) * (N*counts - mid*n[idx])**2 / denom
+        total += terms.sum() / n[idx]
+    return total * ((N - 1.) / N)
+
+
+def _anderson_ksamp_right(samples, Z, Zstar, k, n, N):
+    """Evaluate the statistic A2kN, equation 6 of Scholz & Stephens.
+
+    Parameters
+    ----------
+    samples : sequence of 1-D array_like
+        The individual samples.
+    Z : array_like
+        All observations pooled together, in sorted order.
+    Zstar : array_like
+        Sorted unique observations.
+    k : int
+        Number of samples.
+    n : array_like
+        Size of each sample.
+    N : int
+        Total number of observations.
+
+    Returns
+    -------
+    A2kN : float
+        The A2kN statistic of Scholz and Stephens 1987.
+
+    """
+    # The largest unique value is excluded from every lookup below.
+    knots = Zstar[:-1]
+    # Multiplicity of each remaining unique value in the pooled sample and
+    # the running total of those multiplicities.
+    mult = Z.searchsorted(knots, 'right') - Z.searchsorted(knots, 'left')
+    cum = mult.cumsum()
+    denom = cum * (N - cum)
+
+    total = 0.
+    for idx in range(k):
+        ordered = np.sort(samples[idx])
+        counts = ordered.searchsorted(knots, side='right')
+        terms = mult / float(N) * (N * counts - cum * n[idx])**2 / denom
+        total += terms.sum() / n[idx]
+    return total
+
+
+Anderson_ksampResult = _make_tuple_bunch(
+    'Anderson_ksampResult',
+    ['statistic', 'critical_values', 'pvalue'], []
+)  # returned by anderson_ksamp when `variant` is left unspecified
+
+
+@xp_capabilities(np_only=True)
+def anderson_ksamp(samples, midrank=_NoValue, *, variant=_NoValue, method=None):
+    """The Anderson-Darling test for k-samples.
+
+    The k-sample Anderson-Darling test is a modification of the
+    one-sample Anderson-Darling test. It tests the null hypothesis
+    that k-samples are drawn from the same population without having
+    to specify the distribution function of that population. The
+    critical values depend on the number of samples.
+
+    Parameters
+    ----------
+    samples : sequence of 1-D array_like
+        Array of sample data in arrays.
+    midrank : bool, optional
+        Variant of Anderson-Darling test which is computed. Default
+        (True) is the midrank test applicable to continuous and
+        discrete populations. If False, the right side empirical
+        distribution is used.
+
+        .. deprecated:: 1.17.0
+            Use parameter `variant` instead.
+    variant : {'midrank', 'right', 'continuous'}
+        Variant of Anderson-Darling test to be computed. ``'midrank'`` is applicable
+        to both continuous and discrete populations. ``'right'`` and ``'continuous'``
+        perform alternative versions of the test for discrete and continuous
+        populations, respectively.
+        When `variant` is specified, the return object will not be unpackable as a
+        tuple, and only attributes ``statistic`` and ``pvalue`` will be present.
+    method : PermutationMethod, optional
+        Defines the method used to compute the p-value. If `method` is an
+        instance of `PermutationMethod`, the p-value is computed using
+        `scipy.stats.permutation_test` with the provided configuration options
+        and other appropriate settings. Otherwise, the p-value is interpolated
+        from tabulated values.
+
+    Returns
+    -------
+    res : Anderson_ksampResult
+        An object containing attributes:
+
+        statistic : float
+            Normalized k-sample Anderson-Darling test statistic.
+        critical_values : array
+            The critical values for significance levels 25%, 10%, 5%, 2.5%, 1%,
+            0.5%, 0.1%.
+
+            .. deprecated:: 1.17.0
+                 Present only when `variant` is unspecified.
+
+        pvalue : float
+            The approximate p-value of the test. If `method` is not
+            provided, the value is floored / capped at 0.1% / 25%.
+
+    Raises
+    ------
+    ValueError
+        If fewer than 2 samples are provided, a sample is empty, or no
+        distinct observations are in the samples.
+
+    See Also
+    --------
+    ks_2samp : 2 sample Kolmogorov-Smirnov test
+    anderson : 1 sample Anderson-Darling test
+
+    Notes
+    -----
+    [1]_ defines three versions of the k-sample Anderson-Darling test:
+    one for continuous distributions and two for discrete
+    distributions, in which ties between samples may occur. The
+    default of this routine is to compute the version based on the
+    midrank empirical distribution function. This test is applicable
+    to continuous and discrete data. If `variant` is set to ``'right'``, the
+    right side empirical distribution is used for a test for discrete
+    data; if `variant` is ``'continuous'``, the same test statistic and p-value are
+    computed for data with no ties, but with less computation. According to [1]_,
+    the two discrete test statistics differ only slightly if a few collisions due
+    to round-off errors occur in the test not adjusted for ties between samples.
+
+    The critical values corresponding to the significance levels from 0.01
+    to 0.25 are taken from [1]_. p-values are floored / capped
+    at 0.1% / 25%. Since the range of critical values might be extended in
+    future releases, it is recommended not to test ``p == 0.25``, but rather
+    ``p >= 0.25`` (analogously for the lower bound).
+
+    .. versionadded:: 0.14.0
+
+    References
+    ----------
+    .. [1] Scholz, F. W and Stephens, M. A. (1987), K-Sample
+           Anderson-Darling Tests, Journal of the American Statistical
+           Association, Vol. 82, pp. 918-924.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng(44925884305279435)
+    >>> res = stats.anderson_ksamp([rng.normal(size=50), rng.normal(loc=0.5, size=30)],
+    ...                            variant='midrank')
+    >>> res.statistic, res.pvalue
+    (3.4444310693448936, 0.013106682406720973)
+
+    The null hypothesis that the two random samples come from the same
+    distribution can be rejected at the 5% level because the returned
+    p-value is less than 0.05, but not at the 1% level.
+
+    >>> samples = [rng.normal(size=50), rng.normal(size=30),
+    ...            rng.normal(size=20)]
+    >>> res = stats.anderson_ksamp(samples, variant='continuous')
+    >>> res.statistic, res.pvalue
+    (-0.6309662273193832, 0.25)
+
+    As we might expect, the null hypothesis cannot be rejected here for three samples
+    from an identical distribution. The reported p-value (25%) has been capped at the
+    maximum value for which pre-computed p-values are available.
+
+    In such cases where the p-value is capped or when sample sizes are
+    small, a permutation test may be more accurate.
+
+    >>> method = stats.PermutationMethod(n_resamples=9999, random_state=rng)
+    >>> res = stats.anderson_ksamp(samples, variant='continuous', method=method)
+    >>> res.pvalue
+    0.699
+
+    """
+    k = len(samples)
+    if (k < 2):
+        raise ValueError("anderson_ksamp needs at least two samples")
+
+    samples = list(map(np.asarray, samples))
+    Z = np.sort(np.hstack(samples))
+    N = Z.size
+    Zstar = np.unique(Z)
+    if Zstar.size < 2:
+        raise ValueError("anderson_ksamp needs more than one distinct "
+                         "observation")
+
+    n = np.array([sample.size for sample in samples])
+    if np.any(n == 0):
+        raise ValueError("anderson_ksamp encountered sample without "
+                         "observations")
+
+    if variant is _NoValue or midrank is not _NoValue:  # sentinel: compare by identity
+        message = ("Parameter `variant` has been introduced to replace `midrank`; "
+                   "`midrank` will be removed in SciPy 1.19.0. Specify `variant` to "
+                   "silence this warning. Note that the returned object will no longer "
+                   "be unpackable as a tuple, and `critical_values` will be omitted.")
+        warnings.warn(message, category=UserWarning, stacklevel=2)
+
+    return_critical_values = False  # legacy result only when `variant` is omitted
+    if variant is _NoValue:
+        return_critical_values = True
+        variant = 'midrank' if midrank else 'right'
+
+    if variant == 'midrank':
+        A2kN_fun = _anderson_ksamp_midrank
+    elif variant == 'right':
+        A2kN_fun = _anderson_ksamp_right
+    elif variant == 'continuous':
+        A2kN_fun = _anderson_ksamp_continuous
+    else:
+        message = "`variant` must be one of 'midrank', 'right', or 'continuous'."
+        raise ValueError(message)
+
+    A2kN = A2kN_fun(samples, Z, Zstar, k, n, N)
+
+    def statistic(*samples):
+        return A2kN_fun(samples, Z, Zstar, k, n, N)
+
+    if method is not None:
+        res = stats.permutation_test(samples, statistic, **method._asdict(),
+                                     alternative='greater')
+
+    H = (1. / n).sum()
+    hs_cs = (1. / arange(N - 1, 1, -1)).cumsum()
+    h = hs_cs[-1] + 1
+    g = (hs_cs / arange(2, N)).sum()
+
+    a = (4*g - 6) * (k - 1) + (10 - 6*g)*H
+    b = (2*g - 4)*k**2 + 8*h*k + (2*g - 14*h - 4)*H - 8*h + 4*g - 6
+    c = (6*h + 2*g - 2)*k**2 + (4*h - 4*g + 6)*k + (2*h - 6)*H + 4*h
+    d = (2*h + 6)*k**2 - 4*h*k
+    sigmasq = (a*N**3 + b*N**2 + c*N + d) / ((N - 1.) * (N - 2.) * (N - 3.))
+    m = k - 1  # centering constant for A2kN
+    A2 = (A2kN - m) / math.sqrt(sigmasq)  # standardized statistic
+
+    # The b_i values are the interpolation coefficients from Table 2
+    # of Scholz and Stephens 1987
+    b0 = np.array([0.675, 1.281, 1.645, 1.96, 2.326, 2.573, 3.085])
+    b1 = np.array([-0.245, 0.25, 0.678, 1.149, 1.822, 2.364, 3.615])
+    b2 = np.array([-0.105, -0.305, -0.362, -0.391, -0.396, -0.345, -0.154])
+    critical = b0 + b1 / math.sqrt(m) + b2 / m
+
+    sig = np.array([0.25, 0.1, 0.05, 0.025, 0.01, 0.005, 0.001])
+
+    if A2 < critical.min() and method is None:
+        p = sig.max()
+        msg = (f"p-value capped: true value larger than {p}. Consider "
+               "specifying `method` "
+               "(e.g. `method=stats.PermutationMethod()`.)")
+        warnings.warn(msg, stacklevel=2)
+    elif A2 > critical.max() and method is None:
+        p = sig.min()
+        msg = (f"p-value floored: true value smaller than {p}. Consider "
+               "specifying `method` "
+               "(e.g. `method=stats.PermutationMethod()`.)")
+        warnings.warn(msg, stacklevel=2)
+    elif method is None:
+        # quadratic fit of log(significance level) against the critical values
+        pf = np.polyfit(critical, log(sig), 2)
+        p = math.exp(np.polyval(pf, A2))
+    else:  # `method` was given, so `res` holds the permutation test result
+        p = res.pvalue
+
+    if return_critical_values:
+        # create result object with alias for backward compatibility
+        res = Anderson_ksampResult(A2, critical, p)
+        res.significance_level = p
+    else:
+        res = SignificanceResult(statistic=A2, pvalue=p)
+
+    return res
+
+
+
+AnsariResult = namedtuple('AnsariResult', ('statistic', 'pvalue'))  # ansari result
+
+
+class _ABW:
+    """Distribution of Ansari-Bradley W-statistic under the null hypothesis."""
+    # TODO: calculate exact distribution considering ties
+    # Summing over at most half of the frequencies would be possible, but so
+    # far that has not seemed worth the extra complexity.
+
+    def __init__(self):
+        """Start with an empty cache; `_recalc` fills it on first use."""
+        self.n = self.m = None
+        self.astart = None
+        self.freqs = self.total = None
+
+    def _recalc(self, n, m):
+        """Recompute the exact distribution unless (n, m) is already cached."""
+        if not (n != self.n or m != self.m):
+            return
+        self.n, self.m = n, m
+        # distribution is NOT symmetric when m + n is odd
+        # n is len(x), m is len(y), and ratio of scales is defined x/y
+        lowest, counts, _ = gscale(n, m)
+        self.astart = lowest  # minimum value of statistic
+        # Exact null frequencies are integer counts (precise); they are held
+        # as float64 so that summing them cannot overflow.
+        self.freqs = counts.astype(np.float64)
+        # probability mass is self.freqs / self.total
+        self.total = self.freqs.sum()  # could calculate from m and n
+
+    def pmf(self, k, n, m):
+        """Probability mass function."""
+        self._recalc(n, m)
+        # With ties k can be fractional; `floor` makes the PMF at k = 12.5
+        # the same as at k = 12.
+        offset = np.floor(k - self.astart).astype(int)
+        return self.freqs[offset] / self.total
+
+    def cdf(self, k, n, m):
+        """Cumulative distribution function."""
+        self._recalc(n, m)
+        # The tie-free null distribution is only approximate for fractional k:
+        # `ceil` keeps the next frequency, so the CDF is not underestimated.
+        last = np.ceil(k - self.astart).astype(int)
+        head = self.freqs[:last+1]
+        return head.sum() / self.total
+
+    def sf(self, k, n, m):
+        """Survival function."""
+        self._recalc(n, m)
+        # The tie-free null distribution is only approximate for fractional k:
+        # `floor` keeps the previous frequency, so the SF is not underestimated.
+        first = np.floor(k - self.astart).astype(int)
+        tail = self.freqs[first:]
+        return tail.sum() / self.total
+
+
+# Maintain state for faster repeat calls to ansari when it takes the exact
+# p-value path: an _ABW() instance is created once per thread inside ansari()
+# and stored as attribute `a` on this thread-local variable.
+_abw_state = threading.local()
+
+
+@xp_capabilities(cpu_only=True, jax_jit=False,    # p-value is Cython
+                 skip_backends=[('dask.array', 'no rankdata')])
+@_axis_nan_policy_factory(AnsariResult, n_samples=2)
+def ansari(x, y, alternative='two-sided', *, axis=0):
+    """Perform the Ansari-Bradley test for equal scale parameters.
+
+    The Ansari-Bradley test ([1]_, [2]_) is a non-parametric test
+    for the equality of the scale parameter of the distributions
+    from which two samples were drawn. The null hypothesis states that
+    the ratio of the scale of the distribution underlying `x` to the scale
+    of the distribution underlying `y` is 1.
+
+    Parameters
+    ----------
+    x, y : array_like
+        Arrays of sample data.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis. Default is 'two-sided'.
+        The following options are available:
+
+        * 'two-sided': the ratio of scales is not equal to 1.
+        * 'less': the ratio of scales is less than 1.
+        * 'greater': the ratio of scales is greater than 1.
+
+        .. versionadded:: 1.7.0
+    axis : int or tuple of ints, default: 0
+        If an int or tuple of ints, the axis or axes of the input along which
+        to compute the statistic. The statistic of each axis-slice (e.g. row)
+        of the input will appear in a corresponding element of the output.
+        If ``None``, the input will be raveled before computing the statistic.
+
+    Returns
+    -------
+    statistic : float
+        The Ansari-Bradley test statistic.
+    pvalue : float
+        The p-value of the hypothesis test.
+
+    See Also
+    --------
+    fligner : A non-parametric test for the equality of k variances
+    mood : A non-parametric test for the equality of two scale parameters
+
+    Notes
+    -----
+    The p-value given is exact when the sample sizes are both less than
+    55 and there are no ties, otherwise a normal approximation for the
+    p-value is used.
+
+    References
+    ----------
+    .. [1] Ansari, A. R. and Bradley, R. A. (1960) Rank-sum tests for
+           dispersions, Annals of Mathematical Statistics, 31, 1174-1189.
+    .. [2] Sprent, Peter and N.C. Smeeton.  Applied nonparametric
+           statistical methods.  3rd ed. Chapman and Hall/CRC. 2001.
+           Section 5.8.2.
+    .. [3] Nathaniel E. Helwig "Nonparametric Dispersion and Equality
+           Tests" at http://users.stat.umn.edu/~helwig/notes/npde-Notes.pdf
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import ansari
+    >>> rng = np.random.default_rng()
+
+    For these examples, we'll create three random data sets.  The first
+    two, with sizes 35 and 25, are drawn from a normal distribution with
+    mean 0 and standard deviation 2.  The third data set has size 25 and
+    is drawn from a normal distribution with standard deviation 1.25.
+
+    >>> x1 = rng.normal(loc=0, scale=2, size=35)
+    >>> x2 = rng.normal(loc=0, scale=2, size=25)
+    >>> x3 = rng.normal(loc=0, scale=1.25, size=25)
+
+    First we apply `ansari` to `x1` and `x2`.  These samples are drawn
+    from the same distribution, so we expect the Ansari-Bradley test
+    should not lead us to conclude that the scales of the distributions
+    are different.
+
+    >>> ansari(x1, x2)
+    AnsariResult(statistic=541.0, pvalue=0.9762532927399098)
+
+    With a p-value close to 1, we cannot conclude that there is a
+    significant difference in the scales (as expected).
+
+    Now apply the test to `x1` and `x3`:
+
+    >>> ansari(x1, x3)
+    AnsariResult(statistic=425.0, pvalue=0.0003087020407974518)
+
+    The probability of observing such an extreme value of the statistic
+    under the null hypothesis of equal scales is only 0.03087%. We take this
+    as evidence against the null hypothesis in favor of the alternative:
+    the scales of the distributions from which the samples were drawn
+    are not equal.
+
+    We can use the `alternative` parameter to perform a one-tailed test.
+    In the above example, the scale of `x1` is greater than `x3` and so
+    the ratio of scales of `x1` and `x3` is greater than 1. This means
+    that the p-value when ``alternative='greater'`` should be near 0 and
+    hence we should be able to reject the null hypothesis:
+
+    >>> ansari(x1, x3, alternative='greater')
+    AnsariResult(statistic=425.0, pvalue=0.0001543510203987259)
+
+    As we can see, the p-value is indeed quite low. Use of
+    ``alternative='less'`` should thus yield a large p-value:
+
+    >>> ansari(x1, x3, alternative='less')
+    AnsariResult(statistic=425.0, pvalue=0.9998643258449039)
+
+    """
+    xp = array_namespace(x, y)
+    dtype = xp_result_type(x, y, force_floating=True, xp=xp)
+
+    if alternative not in {'two-sided', 'greater', 'less'}:
+        raise ValueError("'alternative' must be 'two-sided',"
+                         " 'greater', or 'less'.")
+
+    if not hasattr(_abw_state, 'a'):  # first call on this thread
+        _abw_state.a = _ABW()
+
+    # _axis_nan_policy decorator guarantees that axis=-1
+    n = x.shape[-1]
+    m = y.shape[-1]
+    if m < 1:  # needed by test_axis_nan_policy; not user-facing
+        raise ValueError("Not enough other observations.")
+    if n < 1:
+        raise ValueError("Not enough test observations.")
+
+    N = m + n
+    xy = xp.concat([x, y], axis=-1)  # combine
+    rank, t = _stats_py._rankdata(xy, method='average', return_ties=True)
+    rank, t = xp.astype(rank, dtype), xp.astype(t, dtype)
+    symrank = xp.minimum(rank, N - rank + 1)  # rank counted from the nearer end
+    AB = xp.sum(symrank[..., :n], axis=-1)  # sum over the x observations only
+    repeats = xp.any(t > 1)  # in theory we could branch for each slice separately
+    exact = ((m < 55) and (n < 55) and not repeats)
+    if exact:
+        # np.vectorize converts to NumPy here, and we convert back to the result
+        # type before returning
+        cdf = np.vectorize(_abw_state.a.cdf, otypes=[np.float64])
+        sf = np.vectorize(_abw_state.a.sf, otypes=[np.float64])
+        if alternative == 'two-sided':
+            pval = 2.0 * np.minimum(cdf(AB, n, m),  # may exceed 1; clipped below
+                                    sf(AB, n, m))
+        elif alternative == 'greater':
+            # AB statistic is _smaller_ when ratio of scales is larger,
+            # so this is the opposite of the usual calculation
+            pval = cdf(AB, n, m)
+        else:
+            pval = sf(AB, n, m)
+        pval = xp.clip(xp.asarray(pval, dtype=dtype), max=1.0)
+        AB = AB[()] if AB.ndim == 0 else AB
+        pval = pval[()] if pval.ndim == 0 else pval
+        return AnsariResult(AB, pval)
+
+    mnAB = (n * (N + 1.0) ** 2 / 4.0 / N) if N % 2 else (n * (N + 2.0) / 4.0)
+
+    if repeats:   # adjust variance estimates
+        # sum of squared symmetric ranks over the pooled (x and y) sample
+        fac = xp.sum(symrank**2, axis=-1)
+        if N % 2:  # N odd
+            varAB = m * n * (16*N*fac - (N+1)**4) / (16.0 * N**2 * (N-1))
+        else:  # N even
+            varAB = m * n * (16*fac - N*(N+2)**2) / (16.0 * N * (N-1))
+    else:
+        # no ties: closed-form variance for the normal approximation
+        if N % 2:  # N odd
+            varAB = n * m * (N + 1.0) * (3 + N ** 2) / (48.0 * N ** 2)
+        else:
+            varAB = m * n * (N + 2) * (N - 2.0) / 48 / (N - 1.0)
+        varAB = xp.asarray(varAB, dtype=dtype)
+
+    # Small values of AB indicate larger dispersion for the x sample.
+    # Large values of AB indicate larger dispersion for the y sample.
+    # This is opposite to the way we define the ratio of scales. see [1]_.
+    z = (mnAB - AB) / xp.sqrt(varAB)
+    pvalue = _get_pvalue(z, _SimpleNormal(), alternative, xp=xp)
+    AB = AB[()] if AB.ndim == 0 else AB
+    pvalue = pvalue[()] if pvalue.ndim == 0 else pvalue
+    return AnsariResult(AB, pvalue)
+
+
+BartlettResult = namedtuple('BartlettResult', ('statistic', 'pvalue'))  # bartlett
+
+@xp_capabilities()
+@_axis_nan_policy_factory(BartlettResult, n_samples=None)
+def bartlett(*samples, axis=0):
+    r"""Perform Bartlett's test for equal variances.
+
+    Bartlett's test tests the null hypothesis that all input samples
+    are from populations with equal variances.  For samples
+    from significantly non-normal populations, Levene's test
+    `levene` is more robust.
+
+    Parameters
+    ----------
+    sample1, sample2, ... : array_like
+        Arrays of sample data. The statistic is computed along `axis`;
+        the samples may have different lengths.
+
+    Returns
+    -------
+    statistic : float
+        The test statistic.
+    pvalue : float
+        The p-value of the test.
+
+    See Also
+    --------
+    fligner : A non-parametric test for the equality of k variances
+    levene : A robust parametric test for equality of k variances
+    :ref:`hypothesis_bartlett` : Extended example
+
+    Notes
+    -----
+    Conover et al. (1981) examine many of the existing parametric and
+    nonparametric tests by extensive simulations and they conclude that the
+    tests proposed by Fligner and Killeen (1976) and Levene (1960) appear to be
+    superior in terms of robustness of departures from normality and power
+    ([3]_).
+
+    References
+    ----------
+    .. [1]  https://www.itl.nist.gov/div898/handbook/eda/section3/eda357.htm
+    .. [2]  Snedecor, George W. and Cochran, William G. (1989), Statistical
+              Methods, Eighth Edition, Iowa State University Press.
+    .. [3] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and
+           Hypothesis Testing based on Quadratic Inference Function. Technical
+           Report #99-03, Center for Likelihood Studies, Pennsylvania State
+           University.
+    .. [4] Bartlett, M. S. (1937). Properties of Sufficiency and Statistical
+           Tests. Proceedings of the Royal Society of London. Series A,
+           Mathematical and Physical Sciences, Vol. 160, No.901, pp. 268-282.
+
+    Examples
+    --------
+
+    Test whether the lists `a`, `b` and `c` come from populations
+    with equal variances.
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99]
+    >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05]
+    >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98]
+    >>> stat, p = stats.bartlett(a, b, c)
+    >>> p
+    1.1254782518834628e-05
+
+    The very small p-value suggests that the populations do not have equal
+    variances.
+
+    This is not surprising, given that the sample variance of `b` is much
+    larger than that of `a` and `c`:
+
+    >>> [np.var(x, ddof=1) for x in [a, b, c]]
+    [0.007054444444444413, 0.13073888888888888, 0.008890000000000002]
+
+    For a more detailed example, see :ref:`hypothesis_bartlett`.
+    """
+    xp = array_namespace(*samples)
+
+    k = len(samples)
+    if k < 2:
+        raise ValueError("Must enter at least two input sample vectors.")
+
+    if axis is None:
+        samples = [xp_ravel(sample) for sample in samples]
+    else:
+        samples = _broadcast_arrays(samples, axis=axis, xp=xp)
+        samples = [xp.moveaxis(sample, axis, -1) for sample in samples]
+
+    Ni = [xp.asarray(_length_nonmasked(sample, axis=-1, xp=xp),
+                     dtype=sample.dtype, device=xp_device(sample))
+          for sample in samples]
+    Ni = [xp.broadcast_to(N, samples[0].shape[:-1]) for N in Ni]
+    ssq = [xp.var(sample, correction=1, axis=-1) for sample in samples]
+    Ni = [arr[xp.newaxis, ...] for arr in Ni]  # new leading axis: one per sample
+    ssq = [arr[xp.newaxis, ...] for arr in ssq]
+    Ni = xp.concat(Ni, axis=0)
+    Ni = xpx.at(Ni)[Ni == 0].set(xp.nan)  # a sample size of 0 becomes NaN
+    ssq = xp.concat(ssq, axis=0)
+    dtype = Ni.dtype
+    Ntot = xp.sum(Ni, axis=0)  # total number of observations
+    spsq = xp.sum((Ni - 1)*ssq, axis=0, dtype=dtype) / (Ntot - k)  # pooled variance
+    numer = ((Ntot - k) * xp.log(spsq)
+             - xp.sum((Ni - 1)*xp.log(ssq), axis=0, dtype=dtype))
+    denom = (1 + 1/(3*(k - 1))
+             * ((xp.sum(1/(Ni - 1), axis=0)) - 1/(Ntot - k)))
+    T = numer / denom
+
+    chi2 = _SimpleChi2(xp.asarray(k-1, dtype=dtype, device=xp_device(T)))
+    pvalue = _get_pvalue(T, chi2, alternative='greater', symmetric=False, xp=xp)
+
+    T = xp.clip(T, min=0., max=xp.inf)  # clipped only after the p-value is computed
+    T = T[()] if T.ndim == 0 else T
+    pvalue = pvalue[()] if pvalue.ndim == 0 else pvalue
+
+    return BartlettResult(T, pvalue)
+
+
+LeveneResult = namedtuple('LeveneResult', ('statistic', 'pvalue'))  # levene result
+
+
+@xp_capabilities(cpu_only=True, exceptions=['cupy'])
+@_axis_nan_policy_factory(LeveneResult, n_samples=None)
+def levene(*samples, center='median', proportiontocut=0.05, axis=0):
+    r"""Perform Levene test for equal variances.
+
+    The Levene test tests the null hypothesis that all input samples
+    are from populations with equal variances.  Levene's test is an
+    alternative to Bartlett's test `bartlett` in the case where
+    there are significant deviations from normality.
+
+    Parameters
+    ----------
+    sample1, sample2, ... : array_like
+        The sample data, possibly with different lengths.
+    center : {'mean', 'median', 'trimmed'}, optional
+        Which statistics to use to center data points within each sample.  Default
+        is 'median'.
+    proportiontocut : float, optional
+        When `center` is 'trimmed', this gives the proportion of data points
+        to cut from each end. (See `scipy.stats.trim_mean`.)
+        Default is 0.05.
+    axis : int or tuple of ints, default: 0
+        If an int or tuple of ints, the axis or axes of the input along which
+        to compute the statistic. The statistic of each axis-slice (e.g. row)
+        of the input will appear in a corresponding element of the output.
+        If ``None``, the input will be raveled before computing the statistic.
+
+    Returns
+    -------
+    statistic : float
+        The test statistic.
+    pvalue : float
+        The p-value for the test.
+
+    See Also
+    --------
+    fligner : A non-parametric test for the equality of k variances
+    bartlett : A parametric test for equality of k variances in normal samples
+    :ref:`hypothesis_levene` : Extended example
+
+    Notes
+    -----
+    Three variations of Levene's test are possible.  The possibilities
+    and their recommended usages are:
+
+    * 'median' : Recommended for skewed (non-normal) distributions.
+    * 'mean' : Recommended for symmetric, moderate-tailed distributions.
+    * 'trimmed' : Recommended for heavy-tailed distributions.
+
+    The test version using the mean was proposed in the original article
+    of Levene ([2]_) while the median and trimmed mean have been studied by
+    Brown and Forsythe ([3]_), sometimes also referred to as Brown-Forsythe
+    test.
+
+    References
+    ----------
+    .. [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm
+    .. [2] Levene, H. (1960). In Contributions to Probability and Statistics:
+           Essays in Honor of Harold Hotelling, I. Olkin et al. eds.,
+           Stanford University Press, pp. 278-292.
+    .. [3] Brown, M. B. and Forsythe, A. B. (1974), Journal of the American
+           Statistical Association, 69, 364-367
+
+    Examples
+    --------
+
+    Test whether the lists `a`, `b` and `c` come from populations
+    with equal variances.
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99]
+    >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05]
+    >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98]
+    >>> stat, p = stats.levene(a, b, c)
+    >>> p
+    0.002431505967249681
+
+    The small p-value suggests that the populations do not have equal
+    variances.
+
+    This is not surprising, given that the sample variance of `b` is much
+    larger than that of `a` and `c`:
+
+    >>> [np.var(x, ddof=1) for x in [a, b, c]]
+    [0.007054444444444413, 0.13073888888888888, 0.008890000000000002]
+
+    For a more detailed example, see :ref:`hypothesis_levene`.
+    """
+    xp = array_namespace(*samples)
+
+    if center not in ('mean', 'median', 'trimmed'):
+        raise ValueError("center must be 'mean', 'median' or 'trimmed'.")
+
+    n_groups = len(samples)
+    if n_groups < 2:
+        raise ValueError("Must provide at least two samples.")
+
+    # One location estimator per `center` option; each keeps a trailing
+    # length-1 axis so that its result broadcasts against the sample.
+    def _median(x):
+        if is_numpy(xp) or is_dask(xp):
+            return xp.median(x, axis=-1, keepdims=True)
+        return stats.quantile(x, 0.5, axis=-1, keepdims=True)
+
+    def _mean(x):
+        return xp.mean(x, axis=-1, keepdims=True)
+
+    def _trimmed(x):
+        # keepdims=True doesn't currently work for lazy arrays
+        return _stats_py.trim_mean(x, proportiontocut, axis=-1)[..., xp.newaxis]
+
+    if center == 'median':
+        locate = _median
+    elif center == 'mean':
+        locate = _mean
+    else:  # center == 'trimmed'
+        locate = _trimmed
+
+    sizes = [sample.shape[-1] for sample in samples]
+    n_total = sum(sizes)
+    # Z_ij: absolute deviation of each observation from its sample's center
+    deviations = [xp.abs(sample - locate(sample)) for sample in samples]
+    # Zbar_i (per-sample mean deviation) and Zbar (size-weighted grand mean)
+    group_means = [xp.mean(dev, axis=-1, keepdims=True) for dev in deviations]
+    grand_mean = sum(size * mean
+                     for size, mean in zip(sizes, group_means)) / n_total
+
+    # W = (dfd * between-sample sum of squares) / (dfn * within-sample one)
+    dfd = n_total - n_groups
+    dfn = n_groups - 1.0
+    between = sum(size * (mean - grand_mean)**2
+                  for size, mean in zip(sizes, group_means))
+    within = sum(xp.sum((dev - mean)**2, axis=-1, keepdims=True)
+                 for dev, mean in zip(deviations, group_means))
+    W = xp.squeeze((dfd * between) / (dfn * within), axis=-1)
+    dfn = xp.asarray(dfn, dtype=W.dtype)
+    dfd = xp.asarray(dfd, dtype=W.dtype)
+    pval = _get_pvalue(W, _SimpleF(dfn, dfd), 'greater', xp=xp)
+    return LeveneResult(W[()], pval[()])
+
+
+FlignerResult = namedtuple('FlignerResult', ('statistic', 'pvalue'))  # for fligner
+
+
+@xp_capabilities(skip_backends=[('dask.array', 'no rankdata'),
+                                ('cupy', 'no rankdata')], jax_jit=False)
+@_axis_nan_policy_factory(FlignerResult, n_samples=None)
+def fligner(*samples, center='median', proportiontocut=0.05, axis=0):
+    r"""Perform Fligner-Killeen test for equality of variance.
+
+    Fligner's test tests the null hypothesis that all input samples
+    are from populations with equal variances.  Fligner-Killeen's test is
+    distribution free when populations are identical [2]_.
+
+    Parameters
+    ----------
+    sample1, sample2, ... : array_like
+        Arrays of sample data.  Need not be the same length.
+    center : {'mean', 'median', 'trimmed'}, optional
+        Which statistics to use to center data points within each sample. Default
+        is 'median'.
+    proportiontocut : float, optional
+        When `center` is 'trimmed', this gives the proportion of data points
+        to cut from each end. (See `scipy.stats.trim_mean`.)
+        Default is 0.05.
+    axis : int or tuple of ints, default: 0
+        If an int or tuple of ints, the axis or axes of the input along which
+        to compute the statistic. The statistic of each axis-slice (e.g. row)
+        of the input will appear in a corresponding element of the output.
+        If ``None``, the input will be raveled before computing the statistic.
+
+    Returns
+    -------
+    statistic : float
+        The test statistic.
+    pvalue : float
+        The p-value for the hypothesis test.
+
+    Raises
+    ------
+    ValueError
+        If `center` is not one of 'mean', 'median' or 'trimmed', or if fewer
+        than two samples are provided.
+
+    See Also
+    --------
+    bartlett : A parametric test for equality of k variances in normal samples
+    levene : A robust parametric test for equality of k variances
+    :ref:`hypothesis_fligner` : Extended example
+
+    Notes
+    -----
+    As with Levene's test there are three variants of Fligner's test that
+    differ by the measure of central tendency used in the test.  See `levene`
+    for more information.
+
+    Conover et al. (1981) examine many of the existing parametric and
+    nonparametric tests by extensive simulations and they conclude that the
+    tests proposed by Fligner and Killeen (1976) and Levene (1960) appear to be
+    superior in terms of robustness of departures from normality and power
+    [3]_.
+
+    References
+    ----------
+    .. [1] Qu, A., Lindsay, B. G., and Li, B. (2000). Improving generalized
+           estimating equations using quadratic inference functions.
+           Biometrika, 87(4), 823-836.
+           :doi:`10.1093/biomet/87.4.823`
+    .. [2] Fligner, M.A. and Killeen, T.J. (1976). Distribution-free two-sample
+           tests for scale. Journal of the American Statistical Association.
+           71(353), 210-213.
+    .. [3] Conover, W. J., Johnson, M. E. and Johnson M. M. (1981). A
+           comparative study of tests for homogeneity of variances, with
+           applications to the outer continental shelf bidding data.
+           Technometrics, 23(4), 351-361.
+
+    Examples
+    --------
+
+    >>> import numpy as np
+    >>> from scipy import stats
+
+    Test whether the lists `a`, `b` and `c` come from populations
+    with equal variances.
+
+    >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99]
+    >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05]
+    >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98]
+    >>> stat, p = stats.fligner(a, b, c)
+    >>> p
+    0.00450826080004775
+
+    The small p-value suggests that the populations do not have equal
+    variances.
+
+    This is not surprising, given that the sample variance of `b` is much
+    larger than that of `a` and `c`:
+
+    >>> [np.var(x, ddof=1) for x in [a, b, c]]
+    [0.007054444444444413, 0.13073888888888888, 0.008890000000000002]
+
+    For a more detailed example, see :ref:`hypothesis_fligner`.
+    """
+    xp = array_namespace(*samples)
+
+    if center not in ['mean', 'median', 'trimmed']:
+        raise ValueError("center must be 'mean', 'median' or 'trimmed'.")
+
+    k = len(samples)
+    if k < 2:
+        raise ValueError("Must provide at least two samples.")
+
+    # NOTE(review): `force_floating=True` presumably promotes all samples to a
+    # common floating dtype; the dtype of the first sample is reused below.
+    samples = xp_promote(*samples, force_floating=True, xp=xp)
+    dtype = samples[0].dtype
+
+    # Handle empty input
+    # (a single empty sample is enough to make both outputs NaN)
+    for sample in samples:
+        if sample.size == 0:
+            NaN = _get_nan(*samples, xp=xp)
+            return FlignerResult(NaN, NaN)
+
+    # `func` computes the per-sample center along the last axis and keeps that
+    # axis (length 1) so the result broadcasts against the sample.
+    if center == 'median':
+
+        def func(x):
+            # Use the namespace's own `median` for NumPy/Dask; every other
+            # backend goes through `stats.quantile` at 0.5.
+            return (xp.median(x, axis=-1, keepdims=True)
+                    if (is_numpy(xp) or is_dask(xp))
+                    else stats.quantile(x, 0.5, axis=-1, keepdims=True))
+
+    elif center == 'mean':
+
+        def func(x):
+            return xp.mean(x, axis=-1, keepdims=True)
+
+    else:  # center == 'trimmed'
+
+        def func(x):
+            # keepdims=True doesn't currently work for lazy arrays
+            # so the reduced axis is re-added by indexing with `xp.newaxis`.
+            return _stats_py.trim_mean(x, proportiontocut, axis=-1)[..., xp.newaxis]
+
+    # Per-sample sizes along the last axis and the pooled size.
+    ni = [sample.shape[-1] for sample in samples]
+    N = sum(ni)
+
+    # Implementation follows [3] pg 355 F-K.
+    # Absolute deviations from each sample's center are pooled along the last
+    # axis, ranked jointly (ties get average ranks), and mapped to scores
+    # a_Ni = ndtri(0.5 + rank / (2 * (N + 1))).
+    Xibar = [func(sample) for sample in samples]
+    Xij_Xibar = [xp.abs(sample - Xibar_) for sample, Xibar_ in zip(samples, Xibar)]
+    Xij_Xibar = xp.concat(Xij_Xibar, axis=-1)
+    ranks = _stats_py._rankdata(Xij_Xibar, method='average', xp=xp)
+    ranks = xp.astype(ranks, dtype)
+    a_Ni = special.ndtri(ranks / (2*(N + 1.0)) + 0.5)
+
+    # [3] Equation 2.1
+    # `splits` holds the cumulative offsets (starting at 0) used to cut the
+    # pooled scores back into one segment per sample.
+    splits = list(itertools.accumulate(ni, initial=0))
+    Ai = [a_Ni[..., i:j] for i, j in zip(splits[:-1], splits[1:])]
+    Aibar = [xp.mean(Ai_, axis=-1) for Ai_ in Ai]
+    abar = xp.mean(a_Ni, axis=-1)
+    V2 = xp.var(a_Ni, axis=-1, correction=1)
+    statistic = sum(ni_ * (Aibar_ - abar)**2 for ni_, Aibar_ in zip(ni, Aibar)) / V2
+
+    # The p-value is the 'greater' tail of `_SimpleChi2` parameterized by k - 1.
+    chi2 = _SimpleChi2(xp.asarray(k-1, dtype=dtype))
+    pval = _get_pvalue(statistic, chi2, alternative='greater', symmetric=False, xp=xp)
+    return FlignerResult(statistic, pval)
+
+
+def _mood_statistic_with_ties(x, y, t, m, n, N, xp):
+    # Compute the z-score of Mood's squared-rank statistic when the pooled
+    # sample contains ties, following Mielke (1967).
+    #
+    # As called from `mood`: `x` and `y` are the two samples (last axis is the
+    # sample axis), `t` is the tie-count array returned by `_rankdata` for the
+    # concatenation of `x` and `y`, `m` and `n` are the sample lengths,
+    # `N = m + n`, and `xp` is the array namespace.
+    # Returns `(T - E_0_T) / sqrt(varM)`, reduced over the last axis.
+
+    # First equation of "Mood's Squared Rank Test", Mielke pg 313
+    E_0_T = m * (N * N - 1) / 12
+
+    # m, n, N, t, and S are defined in the second paragraph of Mielke pg 312
+    # The only difference is that our `t` has zeros interspersed with the relevant
+    # numbers to keep the array rectangular, but these terms add nothing to the sum.
+    S = xp.cumulative_sum(t, include_initial=True, axis=-1)
+    S_i, S_i_m1 = S[..., 1:], S[..., :-1]
+    # Second equation of "Mood's Squared Rank Test", Mielke pg 313
+    varM = (m * n * (N + 1.0) * (N**2 - 4) / 180
+            - m * n / (180 * N * (N - 1))
+            * xp.sum(t * (t ** 2 - 1) * (t ** 2 - 4 + (15 * (N - S_i - S_i_m1) ** 2)),
+                     axis=-1))
+
+    # There is a formula for Phi (`phi` in code) in terms of t, S, and Psi(I) at the
+    # bottom of Mielke pg 312. Psi(I) = [I - (N+1)/2]^2 is defined (with a mistake in
+    # the location of the ^2) at the beginning of "Mood's Squared Rank Test" (pg 313).
+    # To vectorize this calculation, let c = (N + 1) / 2, so Psi(I) = I^2 - 2*c*I + c^2.
+    # We sum each of these three parts of Psi separately using formulas for sums from a
+    # to b (inclusive) of terms I^2, I, and 1 where I takes on successive integers.
+    def sum_I2(a, b=None):
+        # One argument: sum of I^2 for I = 1..a; two arguments: for I = a..b.
+        return (a * (a + 1) * (2 * a + 1) / 6 if b is None
+                else sum_I2(b) - sum_I2(a) + a**2)
+
+    def sum_I(a, b=None):
+        # One argument: sum of I for I = 1..a; two arguments: for I = a..b.
+        return (a * (a + 1) / 2 if b is None
+                else sum_I(b) - sum_I(a) + a)
+
+    def sum_1(a, b):
+        # Number of integers in a..b (inclusive).
+        return (b - a) + 1
+
+    with np.errstate(invalid='ignore', divide='ignore'):
+        # NOTE: from here on the names `sum_I2`, `sum_I` and `sum_1` are rebound
+        # from the helper functions above to their computed results.
+        sum_I2 = sum_I2(S_i_m1 + 1, S_i)
+        sum_I = sum_I(S_i_m1 + 1, S_i)
+        sum_1 = sum_1(S_i_m1 + 1, S_i)
+        c = (N + 1) / 2
+        phi = (sum_I2 - 2*c*sum_I + sum_1*c**2) / t
+
+    phi = xpx.at(phi)[t == 0].set(0.)  # where t = 0 we get NaNs; eliminate them
+
+    # Mielke pg 312 defines `a` as the count of elements in sample `x` for each of the
+    # unique values in the combined sample. The tricky thing is getting these to line
+    # up with the locations of nonzero elements in `t`/`phi`.
+    x = xp.sort(x, axis=-1)
+    xy = xp.concat((x, y), axis=-1)
+    i = xp.argsort(xy, stable=True, axis=-1)
+    _, a = _stats_py._rankdata(x, method='average', return_ties=True)
+    a = xp.astype(a, phi.dtype)
+
+    # Pad `a` with `n` zeros (one slot per element of `y`) so it has the pooled
+    # length, then reorder it with the stable sort order of the pooled sample.
+    zeros = xp.zeros(a.shape[:-1] + (n,), dtype=a.dtype)
+    a = xp.concat((a, zeros), axis=-1)
+    a = xp.take_along_axis(a, i, axis=-1)
+
+    # Mielke pg 312 defines test statistic `T` as the inner product `a` and `phi`
+    T = xp.vecdot(a, phi, axis=-1)
+
+    return (T - E_0_T) / xp.sqrt(varM)
+
+
+def _mood_statistic_no_ties(r, m, n, N, xp):
+    rx = r[..., :m]
+    M = xp.sum((rx - (N + 1.0) / 2) ** 2, axis=-1)
+    E_0_T = m * (N * N - 1.0) / 12
+    varM = m * n * (N + 1.0) * (N + 2) * (N - 2) / 180
+    return (M - E_0_T) / math.sqrt(varM)
+
+
+def _mood_too_small(samples, kwargs, axis=-1):
+    x, y = samples
+    m = x.shape[axis]
+    n = y.shape[axis]
+    N = m + n
+    return N < 3
+
+
+@xp_capabilities(skip_backends=[('cupy', 'no rankdata'), ('dask.array', 'no rankdata')])
+@_axis_nan_policy_factory(SignificanceResult, n_samples=2, too_small=_mood_too_small)
+def mood(x, y, axis=0, alternative="two-sided"):
+    """Perform Mood's test for equal scale parameters.
+
+    Mood's two-sample test for scale parameters is a non-parametric
+    test for the null hypothesis that two samples are drawn from the
+    same distribution with the same scale parameter.
+
+    Parameters
+    ----------
+    x, y : array_like
+        Arrays of sample data. There must be at least three observations
+        total.
+    axis : int, optional
+        The axis along which the samples are tested.  `x` and `y` can be of
+        different length along `axis`.
+        If `axis` is None, `x` and `y` are flattened and the test is done on
+        all values in the flattened arrays.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis. Default is 'two-sided'.
+        The following options are available:
+
+        * 'two-sided': the scales of the distributions underlying `x` and `y`
+          are different.
+        * 'less': the scale of the distribution underlying `x` is less than
+          the scale of the distribution underlying `y`.
+        * 'greater': the scale of the distribution underlying `x` is greater
+          than the scale of the distribution underlying `y`.
+
+        .. versionadded:: 1.7.0
+
+    Returns
+    -------
+    res : SignificanceResult
+        An object containing attributes:
+
+        statistic : scalar or ndarray
+            The z-score for the hypothesis test.  For 1-D inputs a scalar is
+            returned.
+        pvalue : scalar or ndarray
+            The p-value for the hypothesis test.
+
+    See Also
+    --------
+    fligner : A non-parametric test for the equality of k variances
+    ansari : A non-parametric test for the equality of 2 variances
+    bartlett : A parametric test for equality of k variances in normal samples
+    levene : A parametric test for equality of k variances
+
+    Notes
+    -----
+    The data are assumed to be drawn from probability distributions ``f(x)``
+    and ``f(x/s) / s`` respectively, for some probability density function f.
+    The null hypothesis is that ``s == 1``.
+
+    For multi-dimensional arrays, if the inputs are of shapes
+    ``(n0, n1, n2, n3)``  and ``(n0, m1, n2, n3)``, then if ``axis=1``, the
+    resulting z and p values will have shape ``(n0, n2, n3)``.  Note that
+    ``n1`` and ``m1`` don't have to be equal, but the other dimensions do.
+
+    References
+    ----------
+    [1] Mielke, Paul W. "Note on Some Squared Rank Tests with Existing Ties."
+        Technometrics, vol. 9, no. 2, 1967, pp. 312-14. JSTOR,
+        https://doi.org/10.2307/1266427. Accessed 18 May 2022.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng()
+    >>> x2 = rng.standard_normal((2, 45, 6, 7))
+    >>> x1 = rng.standard_normal((2, 30, 6, 7))
+    >>> res = stats.mood(x1, x2, axis=1)
+    >>> res.pvalue.shape
+    (2, 6, 7)
+
+    Find the number of points where the difference in scale is not significant:
+
+    >>> (res.pvalue > 0.1).sum()
+    78
+
+    Perform the test with different scales:
+
+    >>> x1 = rng.standard_normal((2, 30))
+    >>> x2 = rng.standard_normal((2, 35)) * 10.0
+    >>> stats.mood(x1, x2, axis=1)
+    SignificanceResult(statistic=array([-5.76174136, -6.12650783]),
+                       pvalue=array([8.32505043e-09, 8.98287869e-10]))
+
+    """
+    xp = array_namespace(x, y)
+    x, y = xp_promote(x, y, force_floating=True, xp=xp)
+    dtype = x.dtype
+
+    # _axis_nan_policy decorator ensures axis=-1
+    xy = xp.concat((x, y), axis=-1)
+
+    m = x.shape[-1]
+    n = y.shape[-1]
+    N = m + n
+
+    if m == 0 or n == 0 or N < 3:  # only needed for test_axis_nan_policy
+        NaN = _get_nan(x, y, xp=xp)
+        return SignificanceResult(NaN, NaN)
+
+    # determine if any of the samples contain ties
+    # `t` represents ties within `xy` (the counterpart `a`, ties within `x`,
+    # is computed inside `_mood_statistic_with_ties`); `r` holds the ranks.
+    r, t = _stats_py._rankdata(xy, method='average', return_ties=True)
+    r, t = xp.asarray(r, dtype=dtype), xp.asarray(t, dtype=dtype)
+
+    # A tie count above 1 anywhere switches to the tie-corrected statistic.
+    if xp.any(t > 1):
+        z = _mood_statistic_with_ties(x, y, t, m, n, N, xp=xp)
+    else:
+        z = _mood_statistic_no_ties(r, m, n, N, xp=xp)
+
+    pval = _get_pvalue(z, _SimpleNormal(), alternative, xp=xp)
+
+    # Index 0-d results with `[()]` so they are returned as scalars.
+    z = z[()] if z.ndim == 0 else z
+    pval = pval[()] if pval.ndim == 0 else pval
+    return SignificanceResult(z, pval)
+
+
+# Result type for `wilcoxon` with the fields `statistic` and `pvalue`;
+# `wilcoxon_result_object` may additionally attach a `zstatistic` attribute.
+WilcoxonResult = _make_tuple_bunch('WilcoxonResult', ['statistic', 'pvalue'])
+
+
+def wilcoxon_result_unpacker(res, _):
+    if hasattr(res, 'zstatistic'):
+        return res.statistic, res.pvalue, res.zstatistic
+    else:
+        return res.statistic, res.pvalue
+
+
+def wilcoxon_result_object(statistic, pvalue, zstatistic=None):
+    res = WilcoxonResult(statistic, pvalue)
+    if zstatistic is not None:
+        res.zstatistic = zstatistic
+    return res
+
+
+def wilcoxon_outputs(kwds):
+    method = kwds.get('method', 'auto')
+    if method == 'asymptotic':
+        return 3
+    return 2
+
+
+@xp_capabilities(skip_backends=[("dask.array", "no rankdata"),
+                                ("cupy", "no rankdata")],
+                jax_jit=False, cpu_only=True)  # null distribution is CPU only
+@_rename_parameter("mode", "method")
+@_axis_nan_policy_factory(
+    wilcoxon_result_object, paired=True,
+    n_samples=lambda kwds: 2 if kwds.get('y', None) is not None else 1,
+    result_to_tuple=wilcoxon_result_unpacker, n_outputs=wilcoxon_outputs,
+)
+def wilcoxon(x, y=None, zero_method="wilcox", correction=False,
+             alternative="two-sided", method='auto', *, axis=0):
+    """Calculate the Wilcoxon signed-rank test.
+
+    The Wilcoxon signed-rank test tests the null hypothesis that two
+    related paired samples come from the same distribution. In particular,
+    it tests whether the distribution of the differences ``x - y`` is symmetric
+    about zero. It is a non-parametric version of the paired T-test.
+
+    Parameters
+    ----------
+    x : array_like
+        Either the first set of measurements (in which case ``y`` is the second
+        set of measurements), or the differences between two sets of
+        measurements (in which case ``y`` is not to be specified.)  Must be
+        one-dimensional.
+    y : array_like, optional
+        Either the second set of measurements (if ``x`` is the first set of
+        measurements), or not specified (if ``x`` is the differences between
+        two sets of measurements.)  Must be one-dimensional.
+
+        .. warning::
+            When `y` is provided, `wilcoxon` calculates the test statistic
+            based on the ranks of the absolute values of ``d = x - y``.
+            Roundoff error in the subtraction can result in elements of ``d``
+            being assigned different ranks even when they would be tied with
+            exact arithmetic. Rather than passing `x` and `y` separately,
+            consider computing the difference ``x - y``, rounding as needed to
+            ensure that only truly unique elements are numerically distinct,
+            and passing the result as `x`, leaving `y` at the default (None).
+
+    zero_method : {"wilcox", "pratt", "zsplit"}, optional
+        There are different conventions for handling pairs of observations
+        with equal values ("zero-differences", or "zeros").
+
+        * "wilcox": Discards all zero-differences (default); see [4]_.
+        * "pratt": Includes zero-differences in the ranking process,
+          but drops the ranks of the zeros (more conservative); see [3]_.
+          In this case, the normal approximation is adjusted as in [5]_.
+        * "zsplit": Includes zero-differences in the ranking process and
+          splits the zero rank between positive and negative ones.
+
+    correction : bool, optional
+        If True, apply continuity correction by adjusting the Wilcoxon rank
+        statistic by 0.5 towards the mean value when computing the
+        z-statistic if a normal approximation is used.  Default is False.
+    alternative : {"two-sided", "greater", "less"}, optional
+        Defines the alternative hypothesis. Default is 'two-sided'.
+        In the following, let ``d`` represent the difference between the paired
+        samples: ``d = x - y`` if both ``x`` and ``y`` are provided, or
+        ``d = x`` otherwise.
+
+        * 'two-sided': the distribution underlying ``d`` is not symmetric
+          about zero.
+        * 'less': the distribution underlying ``d`` is stochastically less
+          than a distribution symmetric about zero.
+        * 'greater': the distribution underlying ``d`` is stochastically
+          greater than a distribution symmetric about zero.
+
+    method : {"auto", "exact", "asymptotic"} or `PermutationMethod` instance, optional
+        Method to calculate the p-value, see Notes. Default is "auto".
+
+    axis : int or None, default: 0
+        If an int, the axis of the input along which to compute the statistic.
+        The statistic of each axis-slice (e.g. row) of the input will appear
+        in a corresponding element of the output. If ``None``, the input will
+        be raveled before computing the statistic.
+
+    Returns
+    -------
+    An object with the following attributes.
+
+    statistic : array_like
+        If `alternative` is "two-sided", the sum of the ranks of the
+        differences above or below zero, whichever is smaller.
+        Otherwise the sum of the ranks of the differences above zero.
+    pvalue : array_like
+        The p-value for the test depending on `alternative` and `method`.
+    zstatistic : array_like
+        When ``method = 'asymptotic'``, this is the normalized z-statistic::
+
+            z = (T - mn - d) / se
+
+        where ``T`` is `statistic` as defined above, ``mn`` is the mean of the
+        distribution under the null hypothesis, ``d`` is a continuity
+        correction, and ``se`` is the standard error.
+        When ``method != 'asymptotic'``, this attribute is not available.
+
+    See Also
+    --------
+    kruskal, mannwhitneyu
+
+    Notes
+    -----
+    In the following, let ``d`` represent the difference between the paired
+    samples: ``d = x - y`` if both ``x`` and ``y`` are provided, or ``d = x``
+    otherwise. Assume that all elements of ``d`` are independent and
+    identically distributed observations, and all are distinct and nonzero.
+
+    - When ``len(d)`` is sufficiently large, the null distribution of the
+      normalized test statistic (`zstatistic` above) is approximately normal,
+      and ``method = 'asymptotic'`` can be used to compute the p-value.
+
+    - When ``len(d)`` is small, the normal approximation may not be accurate,
+      and ``method='exact'`` is preferred (at the cost of additional
+      execution time).
+
+    - The default, ``method='auto'``, selects between the two:
+      ``method='exact'`` is used when ``len(d) <= 50``, and
+      ``method='asymptotic'`` is used otherwise.
+
+    The presence of "ties" (i.e. not all elements of ``d`` are unique) or
+    "zeros" (i.e. elements of ``d`` are zero) changes the null distribution
+    of the test statistic, and ``method='exact'`` no longer calculates
+    the exact p-value. If ``method='asymptotic'``, the z-statistic is adjusted
+    for more accurate comparison against the standard normal, but still,
+    for finite sample sizes, the standard normal is only an approximation of
+    the true null distribution of the z-statistic. For such situations, the
+    `method` parameter also accepts instances of `PermutationMethod`. In this
+    case, the p-value is computed using `permutation_test` with the provided
+    configuration options and other appropriate settings.
+
+    The presence of ties and zeros affects the resolution of ``method='auto'``
+    accordingly: exhaustive permutations are performed when ``len(d) <= 13``,
+    and the asymptotic method is used otherwise. Note that the asymptotic
+    method may not be very accurate even for ``len(d) > 14``; the threshold
+    was chosen as a compromise between execution time and accuracy under the
+    constraint that the results must be deterministic. Consider providing an
+    instance of `PermutationMethod` method manually, choosing the
+    ``n_resamples`` parameter to balance time constraints and accuracy
+    requirements.
+
+    Please also note that in the edge case that all elements of ``d`` are zero,
+    the p-value relying on the normal approximation cannot be computed (NaN)
+    if ``zero_method='wilcox'`` or ``zero_method='pratt'``.
+
+    References
+    ----------
+    .. [1] https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test
+    .. [2] Conover, W.J., Practical Nonparametric Statistics, 1971.
+    .. [3] Pratt, J.W., Remarks on Zeros and Ties in the Wilcoxon Signed
+       Rank Procedures, Journal of the American Statistical Association,
+       Vol. 54, 1959, pp. 655-667. :doi:`10.1080/01621459.1959.10501526`
+    .. [4] Wilcoxon, F., Individual Comparisons by Ranking Methods,
+       Biometrics Bulletin, Vol. 1, 1945, pp. 80-83. :doi:`10.2307/3001968`
+    .. [5] Cureton, E.E., The Normal Approximation to the Signed-Rank
+       Sampling Distribution When Zero Differences are Present,
+       Journal of the American Statistical Association, Vol. 62, 1967,
+       pp. 1068-1069. :doi:`10.1080/01621459.1967.10500917`
+
+    Examples
+    --------
+    In [4]_, the differences in height between cross- and self-fertilized
+    corn plants is given as follows:
+
+    >>> d = [6, 8, 14, 16, 23, 24, 28, 29, 41, -48, 49, 56, 60, -67, 75]
+
+    Cross-fertilized plants appear to be higher. To test the null
+    hypothesis that there is no height difference, we can apply the
+    two-sided test:
+
+    >>> from scipy.stats import wilcoxon
+    >>> res = wilcoxon(d)
+    >>> res.statistic, res.pvalue
+    (24.0, 0.041259765625)
+
+    Hence, we would reject the null hypothesis at a confidence level of 5%,
+    concluding that there is a difference in height between the groups.
+    To confirm that the median of the differences can be assumed to be
+    positive, we use:
+
+    >>> res = wilcoxon(d, alternative='greater')
+    >>> res.statistic, res.pvalue
+    (96.0, 0.0206298828125)
+
+    This shows that the null hypothesis that the median is negative can be
+    rejected at a confidence level of 5% in favor of the alternative that
+    the median is greater than zero. The p-values above are exact. Using the
+    normal approximation gives very similar values:
+
+    >>> res = wilcoxon(d, method='asymptotic')
+    >>> res.statistic, res.pvalue
+    (24.0, 0.04088813291185591)
+
+    Note that the statistic changed to 96 in the one-sided case (the sum
+    of ranks of positive differences) whereas it is 24 in the two-sided
+    case (the minimum of sum of ranks above and below zero).
+
+    In the example above, the differences in height between paired plants are
+    provided to `wilcoxon` directly. Alternatively, `wilcoxon` accepts two
+    samples of equal length, calculates the differences between paired
+    elements, then performs the test. Consider the samples ``x`` and ``y``:
+
+    >>> import numpy as np
+    >>> x = np.array([0.5, 0.825, 0.375, 0.5])
+    >>> y = np.array([0.525, 0.775, 0.325, 0.55])
+    >>> res = wilcoxon(x, y, alternative='greater')
+    >>> res
+    WilcoxonResult(statistic=5.0, pvalue=0.5625)
+
+    Note that had we calculated the differences by hand, the test would have
+    produced different results:
+
+    >>> d = [-0.025, 0.05, 0.05, -0.05]
+    >>> ref = wilcoxon(d, alternative='greater')
+    >>> ref
+    WilcoxonResult(statistic=6.0, pvalue=0.5)
+
+    The substantial difference is due to roundoff error in the results of
+    ``x-y``:
+
+    >>> d - (x-y)
+    array([2.08166817e-17, 6.93889390e-17, 1.38777878e-17, 4.16333634e-17])
+
+    Even though we expected all the elements of ``(x-y)[1:]`` to have the same
+    magnitude ``0.05``, they have slightly different magnitudes in practice,
+    and therefore are assigned different ranks in the test. Before performing
+    the test, consider calculating ``d`` and adjusting it as necessary to
+    ensure that theoretically identical values are not numerically distinct.
+    For example:
+
+    >>> d2 = np.around(x - y, decimals=3)
+    >>> wilcoxon(d2, alternative='greater')
+    WilcoxonResult(statistic=6.0, pvalue=0.5)
+
+    """
+    # replace approx by asymptotic to ensure backwards compatibility
+    if method == "approx":
+        method = "asymptotic"
+    # All remaining work is delegated to `_wilcoxon._wilcoxon_nd`.
+    return _wilcoxon._wilcoxon_nd(x, y, zero_method, correction, alternative,
+                                  method, axis)
+
+
+# Result type returned by `median_test`, with the fields `statistic`, `pvalue`,
+# `median` and `table`.
+# NOTE(review): the trailing empty list is presumably the set of extra
+# (non-tuple) attribute names for `_make_tuple_bunch` — confirm.
+MedianTestResult = _make_tuple_bunch(
+    'MedianTestResult',
+    ['statistic', 'pvalue', 'median', 'table'], []
+)
+
+
+@xp_capabilities(np_only=True)
+def median_test(*samples, ties='below', correction=True, lambda_=1,
+                nan_policy='propagate'):
+    """Perform a Mood's median test.
+
+    Test that two or more samples come from populations with the same median.
+
+    Let ``n = len(samples)`` be the number of samples.  The "grand median" of
+    all the data is computed, and a contingency table is formed by
+    classifying the values in each sample as being above or below the grand
+    median.  The contingency table, along with `correction` and `lambda_`,
+    are passed to `scipy.stats.chi2_contingency` to compute the test statistic
+    and p-value.
+
+    Parameters
+    ----------
+    sample1, sample2, ... : array_like
+        The set of samples.  There must be at least two samples.
+        Each sample must be a one-dimensional sequence containing at least
+        one value.  The samples are not required to have the same length.
+    ties : str, optional
+        Determines how values equal to the grand median are classified in
+        the contingency table.  The string must be one of::
+
+            "below":
+                Values equal to the grand median are counted as "below".
+            "above":
+                Values equal to the grand median are counted as "above".
+            "ignore":
+                Values equal to the grand median are not counted.
+
+        The default is "below".
+    correction : bool, optional
+        If True, *and* there are just two samples, apply Yates' correction
+        for continuity when computing the test statistic associated with
+        the contingency table.  Default is True.
+    lambda_ : float or str, optional
+        By default, the statistic computed in this test is Pearson's
+        chi-squared statistic.  `lambda_` allows a statistic from the
+        Cressie-Read power divergence family to be used instead.  See
+        `power_divergence` for details.
+        Default is 1 (Pearson's chi-squared statistic).
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan. 'propagate' returns nan,
+        'raise' throws an error, 'omit' performs the calculations ignoring nan
+        values. Default is 'propagate'.
+
+    Returns
+    -------
+    res : MedianTestResult
+        An object containing attributes:
+
+        statistic : float
+            The test statistic.  The statistic that is returned is determined
+            by `lambda_`.  The default is Pearson's chi-squared statistic.
+        pvalue : float
+            The p-value of the test.
+        median : float
+            The grand median.
+        table : ndarray
+            The contingency table.  The shape of the table is (2, n), where
+            n is the number of samples.  The first row holds the counts of the
+            values above the grand median, and the second row holds the counts
+            of the values below the grand median.  The table allows further
+            analysis with, for example, `scipy.stats.chi2_contingency`, or with
+            `scipy.stats.fisher_exact` if there are two samples, without having
+            to recompute the table.  If ``nan_policy`` is "propagate" and there
+            are nans in the input, the return value for ``table`` is ``None``.
+
+    Raises
+    ------
+    ValueError
+        If fewer than two samples are given, `ties` is not a valid option,
+        a sample is empty or not one-dimensional, all counted values fall on
+        one side of the grand median, or (with ``ties="ignore"``) every value
+        of some sample equals the grand median.
+
+    See Also
+    --------
+    kruskal : Compute the Kruskal-Wallis H-test for independent samples.
+    mannwhitneyu : Computes the Mann-Whitney rank test on samples x and y.
+
+    Notes
+    -----
+    .. versionadded:: 0.15.0
+
+    References
+    ----------
+    .. [1] Mood, A. M., Introduction to the Theory of Statistics. McGraw-Hill
+        (1950), pp. 394-399.
+    .. [2] Zar, J. H., Biostatistical Analysis, 5th ed. Prentice Hall (2010).
+        See Sections 8.12 and 10.15.
+
+    Examples
+    --------
+    A biologist runs an experiment in which there are three groups of plants.
+    Group 1 has 16 plants, group 2 has 15 plants, and group 3 has 17 plants.
+    Each plant produces a number of seeds.  The seed counts for each group
+    are::
+
+        Group 1: 10 14 14 18 20 22 24 25 31 31 32 39 43 43 48 49
+        Group 2: 28 30 31 33 34 35 36 40 44 55 57 61 91 92 99
+        Group 3:  0  3  9 22 23 25 25 33 34 34 40 45 46 48 62 67 84
+
+    The following code applies Mood's median test to these samples.
+
+    >>> g1 = [10, 14, 14, 18, 20, 22, 24, 25, 31, 31, 32, 39, 43, 43, 48, 49]
+    >>> g2 = [28, 30, 31, 33, 34, 35, 36, 40, 44, 55, 57, 61, 91, 92, 99]
+    >>> g3 = [0, 3, 9, 22, 23, 25, 25, 33, 34, 34, 40, 45, 46, 48, 62, 67, 84]
+    >>> from scipy.stats import median_test
+    >>> res = median_test(g1, g2, g3)
+
+    The median is
+
+    >>> res.median
+    34.0
+
+    and the contingency table is
+
+    >>> res.table
+    array([[ 5, 10,  7],
+           [11,  5, 10]])
+
+    `p` is too large to conclude that the medians are not the same:
+
+    >>> res.pvalue
+    0.12609082774093244
+
+    The "G-test" can be performed by passing ``lambda_="log-likelihood"`` to
+    `median_test`.
+
+    >>> res = median_test(g1, g2, g3, lambda_="log-likelihood")
+    >>> res.pvalue
+    0.12224779737117837
+
+    The median occurs several times in the data, so we'll get a different
+    result if, for example, ``ties="above"`` is used:
+
+    >>> res = median_test(g1, g2, g3, ties="above")
+    >>> res.pvalue
+    0.063873276069553273
+
+    >>> res.table
+    array([[ 5, 11,  9],
+           [11,  4,  8]])
+
+    This example demonstrates that if the data set is not large and there
+    are values equal to the median, the p-value can be sensitive to the
+    choice of `ties`.
+
+    """
+    if len(samples) < 2:
+        raise ValueError('median_test requires two or more samples.')
+
+    ties_options = ['below', 'above', 'ignore']
+    if ties not in ties_options:
+        # `str(ties_options)[1:-1]` renders the list without its brackets.
+        raise ValueError(f"invalid 'ties' option '{ties}'; 'ties' must be one "
+                         f"of: {str(ties_options)[1:-1]}")
+
+    data = [np.asarray(sample) for sample in samples]
+
+    # Validate the sizes and shapes of the arguments.
+    # (sample numbers in the messages are 1-based)
+    for k, d in enumerate(data):
+        if d.size == 0:
+            raise ValueError(f"Sample {k + 1} is empty. All samples must "
+                             f"contain at least one value.")
+        if d.ndim != 1:
+            raise ValueError(f"Sample {k + 1} has {d.ndim} dimensions. "
+                             f"All samples must be one-dimensional sequences.")
+
+    # Pool all observations; NaN handling is decided on the pooled data.
+    cdata = np.concatenate(data)
+    contains_nan = _contains_nan(cdata, nan_policy)
+    if nan_policy == 'propagate' and contains_nan:
+        return MedianTestResult(np.nan, np.nan, np.nan, None)
+
+    # Past this point any remaining NaNs are excluded from the computation.
+    if contains_nan:
+        grand_median = np.median(cdata[~np.isnan(cdata)])
+    else:
+        grand_median = np.median(cdata)
+    # When the minimum version of numpy supported by scipy is 1.9.0,
+    # the above if/else statement can be replaced by the single line:
+    #     grand_median = np.nanmedian(cdata)
+    # NOTE(review): this condition looks long satisfied; consider simplifying.
+
+    # Create the contingency table.
+    # Row 0 counts values above the grand median, row 1 values below it;
+    # there is one column per sample.
+    table = np.zeros((2, len(data)), dtype=np.int64)
+    for k, sample in enumerate(data):
+        sample = sample[~np.isnan(sample)]
+
+        nabove = count_nonzero(sample > grand_median)
+        nbelow = count_nonzero(sample < grand_median)
+        nequal = sample.size - (nabove + nbelow)
+        table[0, k] += nabove
+        table[1, k] += nbelow
+        # Values equal to the grand median go where `ties` says; with
+        # "ignore" they are not counted at all.
+        if ties == "below":
+            table[1, k] += nequal
+        elif ties == "above":
+            table[0, k] += nequal
+
+    # Check that no row or column of the table is all zero.
+    # Such a table can not be given to chi2_contingency, because it would have
+    # a zero in the table of expected frequencies.
+    rowsums = table.sum(axis=1)
+    if rowsums[0] == 0:
+        raise ValueError(f"All values are below the grand median ({grand_median}).")
+    if rowsums[1] == 0:
+        raise ValueError(f"All values are above the grand median ({grand_median}).")
+    if ties == "ignore":
+        # We already checked that each sample has at least one value, but it
+        # is possible that all those values equal the grand median.  If `ties`
+        # is "ignore", that would result in a column of zeros in `table`.  We
+        # check for that case here.
+        zero_cols = np.nonzero((table == 0).all(axis=0))[0]
+        if len(zero_cols) > 0:
+            raise ValueError(
+                f"All values in sample {zero_cols[0] + 1} are equal to the grand "
+                f"median ({grand_median!r}), so they are ignored, resulting in an "
+                f"empty sample."
+            )
+
+    # Only the statistic and p-value from `chi2_contingency` are returned;
+    # `dof` and `expected` are unused.
+    stat, p, dof, expected = chi2_contingency(table, lambda_=lambda_,
+                                              correction=correction)
+    return MedianTestResult(stat, p, grand_median, table)
+
+
+def _circfuncs_common(samples, period, xp=None):
+    # Shared setup for circmean/circvar/circstd: returns the promoted samples
+    # together with the sine and cosine of the samples rescaled to radians.
+    if xp is None:
+        xp = array_namespace(samples)
+    samples = xp_promote(samples, force_floating=True, xp=xp)
+
+    # One full `period` corresponds to 2*pi radians.
+    radians = samples * ((2.0 * pi) / period)
+    sines, cosines = xp.sin(radians), xp.cos(radians)
+
+    return samples, sines, cosines
+
+
+@xp_capabilities()
+@_axis_nan_policy_factory(
+    lambda x: x, n_outputs=1, default_axis=None,
+    result_to_tuple=lambda x, _: (x,)
+)
+def circmean(samples, high=2*pi, low=0, axis=None, nan_policy='propagate'):
+    r"""Compute the circular mean of a sample of angle observations.
+
+    Given :math:`n` angle observations :math:`x_1, \cdots, x_n` measured in
+    radians, their *circular mean* is defined by ([1]_, Eq. 2.2.4)
+
+    .. math::
+
+       \mathrm{Arg} \left( \frac{1}{n} \sum_{k=1}^n e^{i x_k} \right)
+
+    where :math:`i` is the imaginary unit and :math:`\mathop{\mathrm{Arg}} z`
+    gives the principal value of the argument of complex number :math:`z`,
+    restricted to the range :math:`[0,2\pi]` by default.  :math:`z` in the
+    above expression is known as the `mean resultant vector`.
+
+    Parameters
+    ----------
+    samples : array_like
+        Input array of angle observations.  The value of a full angle is
+        equal to ``(high - low)``.
+    high : float, optional
+        Upper boundary of the principal value of an angle.  Default is ``2*pi``.
+    low : float, optional
+        Lower boundary of the principal value of an angle.  Default is ``0``.
+
+    Returns
+    -------
+    circmean : float
+        Circular mean, restricted to the range ``[low, high]``.
+
+        If the mean resultant vector is zero, an input-dependent,
+        implementation-defined number between ``[low, high]`` is returned.
+        If the input array is empty, ``np.nan`` is returned.
+
+    See Also
+    --------
+    circstd : Circular standard deviation.
+    circvar : Circular variance.
+
+    References
+    ----------
+    .. [1] Mardia, K. V. and Jupp, P. E. *Directional Statistics*.
+           John Wiley & Sons, 1999.
+
+    Examples
+    --------
+    For readability, all angles are printed out in degrees.
+
+    >>> import numpy as np
+    >>> from scipy.stats import circmean
+    >>> import matplotlib.pyplot as plt
+    >>> angles = np.deg2rad(np.array([20, 30, 330]))
+    >>> cmean = circmean(angles)
+    >>> np.rad2deg(cmean)
+    7.294976657784009
+
+    >>> mean = angles.mean()
+    >>> np.rad2deg(mean)
+    126.66666666666666
+
+    Plot and compare the circular mean against the arithmetic mean.
+
+    >>> plt.plot(np.cos(np.linspace(0, 2*np.pi, 500)),
+    ...          np.sin(np.linspace(0, 2*np.pi, 500)),
+    ...          c='k')
+    >>> plt.scatter(np.cos(angles), np.sin(angles), c='k')
+    >>> plt.scatter(np.cos(cmean), np.sin(cmean), c='b',
+    ...             label='circmean')
+    >>> plt.scatter(np.cos(mean), np.sin(mean), c='r', label='mean')
+    >>> plt.legend()
+    >>> plt.axis('equal')
+    >>> plt.show()
+
+    """
+    xp = array_namespace(samples)
+    if xp_size(samples) == 0:
+        # Needed for non-NumPy arrays to get the appropriate NaN result:
+        # atan2(0, 0) is apparently 0 even though it is undefined.
+        return xp.mean(samples, axis=axis)
+
+    full_turn = high - low
+    _, sines, cosines = _circfuncs_common(samples, full_turn, xp=xp)
+    angle = xp.atan2(xp.sum(sines, axis=axis), xp.sum(cosines, axis=axis))
+    if angle.ndim == 0:
+        angle = angle[()]  # 0-d result: index with ()
+    # Rescale from radians to input units, wrap with `% period`, shift by `low`.
+    return (angle * (full_turn / (2.0 * pi)) - low) % full_turn + low
+
+
+@xp_capabilities()
+@_axis_nan_policy_factory(
+    lambda x: x, n_outputs=1, default_axis=None,
+    result_to_tuple=lambda x, _: (x,)
+)
+def circvar(samples, high=2*pi, low=0, axis=None, nan_policy='propagate'):
+    r"""Compute the circular variance of a sample of angle observations.
+
+    Given :math:`n` angle observations :math:`x_1, \cdots, x_n` measured in
+    radians, their *circular variance* is defined by ([2]_, Eq. 2.3.3)
+
+    .. math::
+
+       1 - \left| \frac{1}{n} \sum_{k=1}^n e^{i x_k} \right|
+
+    where :math:`i` is the imaginary unit and :math:`|z|` gives the length
+    of the complex number :math:`z`.  :math:`|z|` in the above expression
+    is known as the `mean resultant length`.
+
+    Parameters
+    ----------
+    samples : array_like
+        Input array of angle observations.  The value of a full angle is
+        equal to ``(high - low)``.
+    high : float, optional
+        Upper boundary of the principal value of an angle.  Default is ``2*pi``.
+    low : float, optional
+        Lower boundary of the principal value of an angle.  Default is ``0``.
+
+    Returns
+    -------
+    circvar : float
+        Circular variance.  The returned value is in the range ``[0, 1]``,
+        where ``0`` indicates no variance and ``1`` indicates large variance.
+
+        If the input array is empty, ``np.nan`` is returned.
+
+    See Also
+    --------
+    circmean : Circular mean.
+    circstd : Circular standard deviation.
+
+    Notes
+    -----
+    In the limit of small angles, the circular variance is close to
+    half the 'linear' variance if measured in radians.
+
+    References
+    ----------
+    .. [1] Fisher, N.I. *Statistical analysis of circular data*. Cambridge
+           University Press, 1993.
+    .. [2] Mardia, K. V. and Jupp, P. E. *Directional Statistics*.
+           John Wiley & Sons, 1999.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import circvar
+    >>> import matplotlib.pyplot as plt
+    >>> samples_1 = np.array([0.072, -0.158, 0.077, 0.108, 0.286,
+    ...                       0.133, -0.473, -0.001, -0.348, 0.131])
+    >>> samples_2 = np.array([0.111, -0.879, 0.078, 0.733, 0.421,
+    ...                       0.104, -0.136, -0.867,  0.012,  0.105])
+    >>> circvar_1 = circvar(samples_1)
+    >>> circvar_2 = circvar(samples_2)
+
+    Plot the samples.
+
+    >>> fig, (left, right) = plt.subplots(ncols=2)
+    >>> for image in (left, right):
+    ...     image.plot(np.cos(np.linspace(0, 2*np.pi, 500)),
+    ...                np.sin(np.linspace(0, 2*np.pi, 500)),
+    ...                c='k')
+    ...     image.axis('equal')
+    ...     image.axis('off')
+    >>> left.scatter(np.cos(samples_1), np.sin(samples_1), c='k', s=15)
+    >>> left.set_title(f"circular variance: {np.round(circvar_1, 2)!r}")
+    >>> right.scatter(np.cos(samples_2), np.sin(samples_2), c='k', s=15)
+    >>> right.set_title(f"circular variance: {np.round(circvar_2, 2)!r}")
+    >>> plt.show()
+
+    """
+    xp = array_namespace(samples)
+    _, sines, cosines = _circfuncs_common(samples, high - low, xp=xp)
+    mean_sin = xp.mean(sines, axis=axis)
+    mean_cos = xp.mean(cosines, axis=axis)
+
+    # Length of the mean resultant vector.  Rounding errors can push it
+    # slightly above 1, so it is capped there to keep the variance
+    # non-negative.
+    resultant_length = (mean_sin**2. + mean_cos**2.)**0.5
+    resultant_length = xp.clip(resultant_length, max=1.)
+    return 1. - resultant_length
+
+
+@xp_capabilities()
+@_axis_nan_policy_factory(
+    lambda x: x, n_outputs=1, default_axis=None,
+    result_to_tuple=lambda x, _: (x,)
+)
+def circstd(samples, high=2*pi, low=0, axis=None, nan_policy='propagate', *,
+            normalize=False):
+    r"""
+    Compute the circular standard deviation of a sample of angle observations.
+
+    Given :math:`n` angle observations :math:`x_1, \cdots, x_n` measured in
+    radians, their `circular standard deviation` is defined by
+    ([2]_, Eq. 2.3.11)
+
+    .. math::
+
+       \sqrt{ -2 \log \left| \frac{1}{n} \sum_{k=1}^n e^{i x_k} \right| }
+
+    where :math:`i` is the imaginary unit and :math:`|z|` gives the length
+    of the complex number :math:`z`.  :math:`|z|` in the above expression
+    is known as the `mean resultant length`.
+
+    Parameters
+    ----------
+    samples : array_like
+        Input array of angle observations.  The value of a full angle is
+        equal to ``(high - low)``.
+    high : float, optional
+        Upper boundary of the principal value of an angle.  Default is ``2*pi``.
+    low : float, optional
+        Lower boundary of the principal value of an angle.  Default is ``0``.
+    normalize : boolean, optional
+        If ``False`` (the default), the return value is computed from the
+        above formula with the input scaled by ``(2*pi)/(high-low)`` and
+        the output scaled (back) by ``(high-low)/(2*pi)``.  If ``True``,
+        the output is not scaled and is returned directly.
+
+    Returns
+    -------
+    circstd : float
+        Circular standard deviation, optionally normalized.
+
+        If the input array is empty, ``np.nan`` is returned.
+
+    See Also
+    --------
+    circmean : Circular mean.
+    circvar : Circular variance.
+
+    Notes
+    -----
+    In the limit of small angles, the circular standard deviation is close
+    to the 'linear' standard deviation if ``normalize`` is ``False``.
+
+    References
+    ----------
+    .. [1] Mardia, K. V. (1972). 2. In *Statistics of Directional Data*
+       (pp. 18-24). Academic Press. :doi:`10.1016/C2013-0-07425-7`.
+    .. [2] Mardia, K. V. and Jupp, P. E. *Directional Statistics*.
+           John Wiley & Sons, 1999.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import circstd
+    >>> import matplotlib.pyplot as plt
+    >>> samples_1 = np.array([0.072, -0.158, 0.077, 0.108, 0.286,
+    ...                       0.133, -0.473, -0.001, -0.348, 0.131])
+    >>> samples_2 = np.array([0.111, -0.879, 0.078, 0.733, 0.421,
+    ...                       0.104, -0.136, -0.867,  0.012,  0.105])
+    >>> circstd_1 = circstd(samples_1)
+    >>> circstd_2 = circstd(samples_2)
+
+    Plot the samples.
+
+    >>> fig, (left, right) = plt.subplots(ncols=2)
+    >>> for image in (left, right):
+    ...     image.plot(np.cos(np.linspace(0, 2*np.pi, 500)),
+    ...                np.sin(np.linspace(0, 2*np.pi, 500)),
+    ...                c='k')
+    ...     image.axis('equal')
+    ...     image.axis('off')
+    >>> left.scatter(np.cos(samples_1), np.sin(samples_1), c='k', s=15)
+    >>> left.set_title(f"circular std: {np.round(circstd_1, 2)!r}")
+    >>> right.plot(np.cos(np.linspace(0, 2*np.pi, 500)),
+    ...            np.sin(np.linspace(0, 2*np.pi, 500)),
+    ...            c='k')
+    >>> right.scatter(np.cos(samples_2), np.sin(samples_2), c='k', s=15)
+    >>> right.set_title(f"circular std: {np.round(circstd_2, 2)!r}")
+    >>> plt.show()
+
+    """
+    xp = array_namespace(samples)
+    full_turn = high - low
+    _, sines, cosines = _circfuncs_common(samples, full_turn, xp=xp)
+    mean_sin = xp.mean(sines, axis=axis)  # [1] (2.2.3)
+    mean_cos = xp.mean(cosines, axis=axis)  # [1] (2.2.3)
+    # Mean resultant length, [1] (2.2.4); capped at 1 because rounding
+    # errors can push the computed value slightly above it.
+    R = xp.clip((mean_sin**2. + mean_cos**2.)**0.5, max=1.)
+    # Adding 0.0 turns the -0.0 that torch.pow returns for R == 1 into 0.0.
+    res = (-2*xp.log(R))**0.5 + 0.0
+    if not normalize:
+        res *= full_turn/(2.*pi)  # [1] (2.3.14) w/ (2.3.7)
+    return res
+
+
+class DirectionalStats:  # result object built by `directional_stats`
+    def __init__(self, mean_direction, mean_resultant_length):
+        self.mean_direction = mean_direction
+        self.mean_resultant_length = mean_resultant_length
+
+    def __repr__(self):
+        mean, mrl = self.mean_direction, self.mean_resultant_length
+        return f"DirectionalStats(mean_direction={mean}, mean_resultant_length={mrl})"
+
+
+@xp_capabilities()
+def directional_stats(samples, *, axis=0, normalize=True):
+    """
+    Computes sample statistics for directional data.
+
+    Computes the directional mean (also called the mean direction vector) and
+    mean resultant length of a sample of vectors.
+
+    The directional mean is a measure of "preferred direction" of vector data.
+    It is analogous to the sample mean, but it is for use when the length of
+    the data is irrelevant (e.g. unit vectors).
+
+    The mean resultant length is a value between 0 and 1 used to quantify the
+    dispersion of directional data: the smaller the mean resultant length, the
+    greater the dispersion. Several definitions of directional variance
+    involving the mean resultant length are given in [1]_ and [2]_.
+
+    Parameters
+    ----------
+    samples : array_like
+        Input array. Must be at least two-dimensional, and the last axis of the
+        input must correspond with the dimensionality of the vector space.
+        When the input is exactly two dimensional, this means that each row
+        of the data is a vector observation.
+    axis : int, default: 0
+        Axis along which the directional mean is computed.
+    normalize : boolean, default: True
+        If True, normalize the input to ensure that each observation is a
+        unit vector. If the observations are already unit vectors, consider
+        setting this to False to avoid unnecessary computation.
+
+    Returns
+    -------
+    res : DirectionalStats
+        An object containing attributes:
+
+        mean_direction : ndarray
+            Directional mean.
+        mean_resultant_length : ndarray
+            The mean resultant length [1]_.
+
+    See Also
+    --------
+    circmean: circular mean; i.e. directional mean for 2D *angles*
+    circvar: circular variance; i.e. directional variance for 2D *angles*
+
+    Notes
+    -----
+    This uses a definition of directional mean from [1]_.
+    Assuming the observations are unit vectors, the calculation is as follows.
+
+    .. code-block:: python
+
+        mean = samples.mean(axis=0)
+        mean_resultant_length = np.linalg.norm(mean)
+        mean_direction = mean / mean_resultant_length
+
+    This definition is appropriate for *directional* data (i.e. vector data
+    for which the magnitude of each observation is irrelevant) but not
+    for *axial* data (i.e. vector data for which the magnitude and *sign* of
+    each observation is irrelevant).
+
+    Several definitions of directional variance involving the mean resultant
+    length ``R`` have been proposed, including ``1 - R`` [1]_, ``1 - R**2``
+    [2]_, and ``2 * (1 - R)`` [2]_. Rather than choosing one, this function
+    returns ``R`` as attribute `mean_resultant_length` so the user can compute
+    their preferred measure of dispersion.
+
+    References
+    ----------
+    .. [1] Mardia, Jupp. (2000). *Directional Statistics*
+       (p. 163). Wiley.
+
+    .. [2] https://en.wikipedia.org/wiki/Directional_statistics
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import directional_stats
+    >>> data = np.array([[3, 4],    # first observation, 2D vector space
+    ...                  [6, -8]])  # second observation
+    >>> dirstats = directional_stats(data)
+    >>> dirstats.mean_direction
+    array([1., 0.])
+
+    In contrast, the regular sample mean of the vectors would be influenced
+    by the magnitude of each observation. Furthermore, the result would not be
+    a unit vector.
+
+    >>> data.mean(axis=0)
+    array([4.5, -2.])
+
+    An exemplary use case for `directional_stats` is to find a *meaningful*
+    center for a set of observations on a sphere, e.g. geographical locations.
+
+    >>> data = np.array([[0.8660254, 0.5, 0.],
+    ...                  [0.8660254, -0.5, 0.]])
+    >>> dirstats = directional_stats(data)
+    >>> dirstats.mean_direction
+    array([1., 0., 0.])
+
+    The regular sample mean on the other hand yields a result which does not
+    lie on the surface of the sphere.
+
+    >>> data.mean(axis=0)
+    array([0.8660254, 0., 0.])
+
+    The function also returns the mean resultant length, which
+    can be used to calculate a directional variance. For example, using the
+    definition ``Var(z) = 1 - R`` from [2]_ where ``R`` is the
+    mean resultant length, we can calculate the directional variance of the
+    vectors in the above example as:
+
+    >>> 1 - dirstats.mean_resultant_length
+    0.13397459716167093
+    """
+    xp = array_namespace(samples)
+    samples = xp.asarray(samples)
+    if samples.ndim < 2:
+        raise ValueError("samples must at least be two-dimensional. "
+                         f"Instead samples has shape: {tuple(samples.shape)}")
+    # Put the observation axis first; norms are taken over the last axis.
+    samples = xp.moveaxis(samples, axis, 0)
+
+    if is_marray(xp):
+        # Reduce the mask over the last axis (any component masked).
+        mask_xp = array_namespace(samples.mask)
+        vector_mask = mask_xp.any(samples.mask, axis=-1, keepdims=True)
+        samples = xp.asarray(samples.data, mask=vector_mask)
+
+    if normalize:
+        samples = samples / xp_vector_norm(samples, axis=-1, keepdims=True, xp=xp)
+    mean = xp.mean(samples, axis=0)
+    length = xp_vector_norm(mean, axis=-1, keepdims=True, xp=xp)
+    direction = mean / length
+    length = xp.squeeze(length, axis=-1)
+    length = length[()] if length.ndim == 0 else length
+    return DirectionalStats(direction, length)
+
+
+@xp_capabilities(skip_backends=[('dask.array', "no take_along_axis")], jax_jit=False)
+def false_discovery_control(ps, *, axis=0, method='bh'):
+    """Adjust p-values to control the false discovery rate.
+
+    The false discovery rate (FDR) is the expected proportion of rejected null
+    hypotheses that are actually true.
+    If the null hypothesis is rejected when the *adjusted* p-value falls below
+    a specified level, the false discovery rate is controlled at that level.
+
+    Parameters
+    ----------
+    ps : 1D array_like
+        The p-values to adjust. Elements must be real numbers between 0 and 1.
+    axis : int
+        The axis along which to perform the adjustment. The adjustment is
+        performed independently along each axis-slice. If `axis` is None, `ps`
+        is raveled before performing the adjustment.
+    method : {'bh', 'by'}
+        The false discovery rate control procedure to apply: ``'bh'`` is for
+        Benjamini-Hochberg [1]_ (Eq. 1), ``'by'`` is for Benjamini-Yekutieli
+        [2]_ (Theorem 1.3). The latter is more conservative, but it is
+        guaranteed to control the FDR even when the p-values are not from
+        independent tests.
+
+    Returns
+    -------
+    ps_adjusted : array_like
+        The adjusted p-values. If the null hypothesis is rejected where these
+        fall below a specified level, the false discovery rate is controlled
+        at that level.
+
+    See Also
+    --------
+    combine_pvalues
+    statsmodels.stats.multitest.multipletests
+
+    Notes
+    -----
+    In multiple hypothesis testing, false discovery control procedures tend to
+    offer higher power than familywise error rate control procedures (e.g.
+    Bonferroni correction [1]_).
+
+    If the p-values correspond with independent tests (or tests with
+    "positive regression dependencies" [2]_), rejecting null hypotheses
+    corresponding with Benjamini-Hochberg-adjusted p-values below :math:`q`
+    controls the false discovery rate at a level less than or equal to
+    :math:`q m_0 / m`, where :math:`m_0` is the number of true null hypotheses
+    and :math:`m` is the total number of null hypotheses tested. The same is
+    true even for dependent tests when the p-values are adjusted according to
+    the more conservative Benjamini-Yekutieli procedure.
+
+    The adjusted p-values produced by this function are comparable to those
+    produced by the R function ``p.adjust`` and the statsmodels function
+    `statsmodels.stats.multitest.multipletests`. Please consider the latter
+    for more advanced methods of multiple comparison correction.
+
+    References
+    ----------
+    .. [1] Benjamini, Yoav, and Yosef Hochberg. "Controlling the false
+           discovery rate: a practical and powerful approach to multiple
+           testing." Journal of the Royal statistical society: series B
+           (Methodological) 57.1 (1995): 289-300.
+
+    .. [2] Benjamini, Yoav, and Daniel Yekutieli. "The control of the false
+           discovery rate in multiple testing under dependency." Annals of
+           statistics (2001): 1165-1188.
+
+    .. [3] TileStats. FDR - Benjamini-Hochberg explained - Youtube.
+           https://www.youtube.com/watch?v=rZKa4tW2NKs.
+
+    .. [4] Neuhaus, Karl-Ludwig, et al. "Improved thrombolysis in acute
+           myocardial infarction with front-loaded administration of alteplase:
+           results of the rt-PA-APSAC patency study (TAPS)." Journal of the
+           American College of Cardiology 19.5 (1992): 885-891.
+
+    Examples
+    --------
+    We follow the example from [1]_.
+
+        Thrombolysis with recombinant tissue-type plasminogen activator (rt-PA)
+        and anisoylated plasminogen streptokinase activator (APSAC) in
+        myocardial infarction has been proved to reduce mortality. [4]_
+        investigated the effects of a new front-loaded administration of rt-PA
+        versus those obtained with a standard regimen of APSAC, in a randomized
+        multicentre trial in 421 patients with acute myocardial infarction.
+
+    There were four families of hypotheses tested in the study, the last of
+    which was "cardiac and other events after the start of thrombolitic
+    treatment". FDR control may be desired in this family of hypotheses
+    because it would not be appropriate to conclude that the front-loaded
+    treatment is better if it is merely equivalent to the previous treatment.
+
+    The p-values corresponding with the 15 hypotheses in this family were
+
+    >>> ps = [0.0001, 0.0004, 0.0019, 0.0095, 0.0201, 0.0278, 0.0298, 0.0344,
+    ...       0.0459, 0.3240, 0.4262, 0.5719, 0.6528, 0.7590, 1.000]
+
+    If the chosen significance level is 0.05, we may be tempted to reject the
+    null hypotheses for the tests corresponding with the first nine p-values,
+    as the first nine p-values fall below the chosen significance level.
+    However, this would ignore the problem of "multiplicity": if we fail to
+    correct for the fact that multiple comparisons are being performed, we
+    are more likely to incorrectly reject true null hypotheses.
+
+    One approach to the multiplicity problem is to control the family-wise
+    error rate (FWER), that is, the rate at which the null hypothesis is
+    rejected when it is actually true. A common procedure of this kind is the
+    Bonferroni correction [1]_.  We begin by multiplying the p-values by the
+    number of hypotheses tested.
+
+    >>> import numpy as np
+    >>> np.array(ps) * len(ps)
+    array([1.5000e-03, 6.0000e-03, 2.8500e-02, 1.4250e-01, 3.0150e-01,
+           4.1700e-01, 4.4700e-01, 5.1600e-01, 6.8850e-01, 4.8600e+00,
+           6.3930e+00, 8.5785e+00, 9.7920e+00, 1.1385e+01, 1.5000e+01])
+
+    To control the FWER at 5%, we reject only the hypotheses corresponding
+    with adjusted p-values less than 0.05. In this case, only the hypotheses
+    corresponding with the first three p-values can be rejected. According to
+    [1]_, these three hypotheses concerned "allergic reaction" and "two
+    different aspects of bleeding."
+
+    An alternative approach is to control the false discovery rate: the
+    expected fraction of rejected null hypotheses that are actually true. The
+    advantage of this approach is that it typically affords greater power: an
+    increased rate of rejecting the null hypothesis when it is indeed false. To
+    control the false discovery rate at 5%, we apply the Benjamini-Hochberg
+    p-value adjustment.
+
+    >>> from scipy import stats
+    >>> stats.false_discovery_control(ps)
+    array([0.0015    , 0.003     , 0.0095    , 0.035625  , 0.0603    ,
+           0.06385714, 0.06385714, 0.0645    , 0.0765    , 0.486     ,
+           0.58118182, 0.714875  , 0.75323077, 0.81321429, 1.        ])
+
+    Now, the first *four* adjusted p-values fall below 0.05, so we would reject
+    the null hypotheses corresponding with these *four* p-values. Rejection
+    of the fourth null hypothesis was particularly important to the original
+    study as it led to the conclusion that the new treatment had a
+    "substantially lower in-hospital mortality rate."
+
+    For simplicity of exposition, the p-values in the example above were given in
+    sorted order, but this is not required; `false_discovery_control` returns
+    adjusted p-values in order corresponding with the input `ps`.
+
+    >>> stats.false_discovery_control([0.5, 0.6, 0.1, 0.001])
+    array([0.6  , 0.6  , 0.2  , 0.004])
+
+    """
+    xp = array_namespace(ps)
+
+    # Input Validation and Special Cases
+    ps = xp.asarray(ps)
+
+    ps_in_range = (xp.isdtype(ps.dtype, ("integral", "real floating"))
+                   and xp.all(ps == xp.clip(ps, 0., 1.)))
+    if not ps_in_range:
+        raise ValueError("`ps` must include only numbers between 0 and 1.")
+
+    methods = {'bh', 'by'}
+    if method.lower() not in methods:
+        raise ValueError(f"Unrecognized `method` '{method}'. "
+                         f"Method must be one of {methods}.")
+    method = method.lower()
+
+    if axis is None:
+        axis = 0
+        ps = xp_ravel(ps)
+
+    axis = np.asarray(axis)[()]  # use of NumPy for input validation is OK
+    if not np.issubdtype(axis.dtype, np.integer) or axis.size != 1:
+        raise ValueError("`axis` must be an integer or `None`")
+    axis = int(axis)
+
+    if xp_size(ps) <= 1 or ps.shape[axis] <= 1:
+        return ps[()] if ps.ndim == 0 else ps
+
+    ps = xp.moveaxis(ps, axis, -1)
+    m = ps.shape[-1]
+
+    # Main Algorithm
+    # Equivalent to the ideas of [1] and [2], except that this adjusts the
+    # p-values as described in [3]. The results are similar to those produced
+    # by R's p.adjust.
+
+    # "Let [ps] be the ordered observed p-values..."
+    order = xp.argsort(ps, axis=-1)
+    ps = xp.take_along_axis(ps, order, axis=-1)  # this copies ps
+
+    # Equation 1 of [1] rearranged to reject when p is less than specified q
+    i = xp.arange(1, m+1, dtype=ps.dtype, device=xp_device(ps))
+    # ps *= m / i
+    ps = xpx.at(ps)[...].multiply(m / i)
+
+    # Theorem 1.3 of [2]
+    if method == 'by':
+        # ps *= np.sum(1 / i)
+        ps = xpx.at(ps)[...].multiply(xp.sum(1 / i))
+
+    # accounts for rejecting all null hypotheses i for i < k, where k is
+    # defined in Eq. 1 of either [1] or [2]. See [3]. Starting with the index j
+    # of the second to last element, we replace element j with element j+1 if
+    # the latter is smaller.
+    if is_numpy(xp):
+        np.minimum.accumulate(ps[..., ::-1], out=ps[..., ::-1], axis=-1)
+    else:
+        # `m` is the length of the last axis, along which the p-values are sorted
+        for j in range(m-2, -1, -1):
+            # ps[..., j] = xp.minimum(ps[..., j], ps[..., j+1])
+            ps = xpx.at(ps)[..., j].set(xp.minimum(ps[..., j], ps[..., j+1]))
+
+    # Restore original order of axes and data
+    ps = _reorder_along_axis(ps, order, axis=-1, xp=xp)
+    ps = xp.moveaxis(ps, -1, axis)
+
+    return xp.clip(ps, 0., 1.)
+
+
+def _reorder_along_axis(x, i, *, axis, xp):
+    # Undo a sort: place x[k] at position i[k] along `axis` (`i` = argsort order).
+    if is_jax(xp):
+        return xp.put_along_axis(x, i, values=x, axis=axis, inplace=False)
+    if hasattr(xp, 'put_along_axis'):
+        xp.put_along_axis(x, i, values=x.copy(), axis=axis)  # writes into `x`
+        return x
+    return xp.take_along_axis(x, xp.argsort(i, axis=axis), axis=axis)  # inverse perm
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_mstats_basic.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_mstats_basic.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5c64a1a5fac8dc81529e4a6c9ba0f07f4454252
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_mstats_basic.py
@@ -0,0 +1,3657 @@
+"""
+An extension of scipy.stats._stats_py to support masked arrays
+
+"""
+# Original author (2007): Pierre GF Gerard-Marchant
+
+
+__all__ = ['argstoarray',
+           'count_tied_groups',
+           'describe',
+           'f_oneway', 'find_repeats','friedmanchisquare',
+           'kendalltau','kendalltau_seasonal','kruskal','kruskalwallis',
+           'ks_twosamp', 'ks_2samp', 'kurtosis', 'kurtosistest',
+           'ks_1samp', 'kstest',
+           'linregress',
+           'mannwhitneyu', 'meppf','mode','moment','mquantiles','msign',
+           'normaltest',
+           'obrientransform',
+           'pearsonr','plotting_positions','pointbiserialr',
+           'rankdata',
+           'scoreatpercentile','sem',
+           'sen_seasonal_slopes','skew','skewtest','spearmanr',
+           'siegelslopes', 'theilslopes',
+           'tmax','tmean','tmin','trim','trimboth',
+           'trimtail','trima','trimr','trimmed_mean','trimmed_std',
+           'trimmed_stde','trimmed_var','tsem','ttest_1samp','ttest_onesamp',
+           'ttest_ind','ttest_rel','tvar',
+           'variation',
+           'winsorize',
+           'brunnermunzel',
+           ]
+
+import numpy as np
+from numpy import ndarray
+import numpy.ma as ma
+from numpy.ma import masked, nomask
+import math
+
+import itertools
+import warnings
+from collections import namedtuple
+
+from . import distributions
+from scipy._lib._util import _rename_parameter, _contains_nan, _dedent_for_py313
+from scipy._lib._bunch import _make_tuple_bunch
+import scipy.special as special
+import scipy.stats._stats_py
+import scipy.stats._stats_py as _stats_py
+
+from ._stats_mstats_common import (
+        _find_repeats,
+        theilslopes as stats_theilslopes,
+        siegelslopes as stats_siegelslopes
+        )
+
+
+def _chk_asarray(a, axis):
+    """Coerce `a` to a masked array, raveling it when `axis` is None.
+
+    Returns the array together with the axis to operate along: 0 for
+    raveled input, `axis` unchanged otherwise.
+    """
+    if axis is None:
+        return ma.ravel(ma.asanyarray(a)), 0
+    return ma.asanyarray(a), axis
+
+
+def _chk2_asarray(a, b, axis):
+    """Coerce `a` and `b` to masked arrays, raveling both if `axis` is None.
+
+    Returns both arrays followed by the axis to operate along, which is 0
+    for raveled input and `axis` unchanged otherwise.
+    """
+    a, b = ma.asanyarray(a), ma.asanyarray(b)
+    if axis is None:
+        return ma.ravel(a), ma.ravel(b), 0
+    return a, b, axis
+
+
+def _chk_size(a, b):
+    """Return `a`, `b` as masked arrays plus their common size, or raise."""
+    a, b = ma.asanyarray(a), ma.asanyarray(b)
+    size_a, size_b = a.size, b.size
+    if size_a != size_b:
+        raise ValueError("The size of the input array should match!"
+                         f" ({size_a} <> {size_b})")
+    return a, b, size_a
+
+
+def _ttest_finish(df, t, alternative):
+    """Turn a t statistic into a p-value for the requested alternative.
+
+    Shared tail end of the t-test functions. 0-d results are unwrapped
+    to scalars via ``[()]``.
+    """
+    if alternative not in ('less', 'greater', 'two-sided'):
+        raise ValueError("alternative must be "
+                         "'less', 'greater' or 'two-sided'")
+
+    # ``stdtr`` (Student t CDF) is called directly to preserve masked arrays
+    stdtr = special._ufuncs.stdtr
+    if alternative == 'two-sided':
+        pval = stdtr(df, -np.abs(t))*2
+    else:
+        pval = stdtr(df, t if alternative == 'less' else -t)
+
+    t = t[()] if t.ndim == 0 else t
+    pval = pval[()] if pval.ndim == 0 else pval
+    return t, pval
+
+
+def argstoarray(*args):
+    """
+    Construct a 2D masked array from a group of sequences.
+
+    Shorter sequences are padded with masked (missing) values up to the
+    length of the longest sequence.
+
+    Parameters
+    ----------
+    *args : sequences
+        Group of sequences.
+
+    Returns
+    -------
+    argstoarray : MaskedArray
+        A ( `m` x `n` ) masked array, where `m` is the number of arguments and
+        `n` the length of the longest argument.
+
+    Notes
+    -----
+    A single argument that is not an `ndarray` is taken to be the 2D input
+    itself. Non-finite values (NaN, inf) are masked in the result.
+
+    Examples
+    --------
+    A 2D masked array constructed from a group of sequences is returned.
+
+    >>> from scipy.stats.mstats import argstoarray
+    >>> argstoarray([1, 2, 3], [4, 5, 6])
+    masked_array(
+     data=[[1.0, 2.0, 3.0],
+           [4.0, 5.0, 6.0]],
+     mask=[[False, False, False],
+           [False, False, False]],
+     fill_value=1e+20)
+
+    The returned masked array is filled with missing values when the lengths
+    of the sequences differ.
+
+    >>> argstoarray([1, 3], [4, 5, 6])
+    masked_array(
+     data=[[1.0, 3.0, --],
+           [4.0, 5.0, 6.0]],
+     mask=[[False, False,  True],
+           [False, False, False]],
+     fill_value=1e+20)
+
+    """
+    if len(args) == 1 and not isinstance(args[0], ndarray):
+        output = ma.asarray(args[0])
+        if output.ndim != 2:
+            raise ValueError("The input should be 2D")
+    else:
+        # Rows start out fully masked and are filled in from the left.
+        width = max(len(seq) for seq in args)
+        output = ma.array(np.empty((len(args), width), dtype=float), mask=True)
+        for row, seq in enumerate(args):
+            output[row, :len(seq)] = seq
+
+    output[~np.isfinite(output._data)] = masked  # mask NaN and +/-inf
+    return output
+
+
+def find_repeats(arr):
+    """Find the values occurring more than once in arr, and their counts.
+
+    The input is cast to float64. Masked values are discarded.
+
+    Parameters
+    ----------
+    arr : sequence
+        Input array. The array is flattened if it is not 1D.
+
+    Returns
+    -------
+    repeats : ndarray
+        Array of the values that occur more than once.
+    counts : ndarray
+        Number of occurrences of each repeated value.
+
+    Examples
+    --------
+    >>> from scipy.stats import mstats
+    >>> mstats.find_repeats([2, 1, 2, 3, 2, 2, 5])
+    (array([2.]), array([4]))
+
+    In the above example, 2 repeats 4 times.
+
+    >>> mstats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]])
+    (array([4., 5.]), array([2, 2]))
+
+    In the above example, both 4 and 5 repeat 2 times.
+
+    """
+    # ``ma.compressed`` can return a reference to the input's memory instead
+    # of a new array, so take a copy whenever the two might share memory.
+    values = np.asarray(ma.compressed(arr), dtype=np.float64)
+    try:
+        shares_memory = np.may_share_memory(values, arr)
+    except AttributeError:
+        # numpy < 1.8.2 raised on np.may_share_memory([], []); newer
+        # releases correctly return False, which is what we assume here.
+        shares_memory = False
+    if shares_memory:
+        values = values.copy()
+    return _find_repeats(values)
+
+
+def count_tied_groups(x, use_missing=False):
+    """
+    Count the groups of tied values, keyed by group size.
+
+    Parameters
+    ----------
+    x : sequence
+        Sequence of data on which to count the ties.
+    use_missing : bool, optional
+        Whether to consider missing values as tied.
+
+    Returns
+    -------
+    count_tied_groups : dict
+        Returns a dictionary (nb of ties: nb of groups).
+
+    Examples
+    --------
+    >>> from scipy.stats import mstats
+    >>> import numpy as np
+    >>> z = [0, 0, 0, 2, 2, 2, 3, 3, 4, 5, 6]
+    >>> mstats.count_tied_groups(z)
+    {2: 1, 3: 2}
+
+    In the above example, the ties were 0 (3x), 2 (3x) and 3 (2x).
+
+    >>> z = np.ma.array([0, 0, 1, 2, 2, 2, 3, 3, 4, 5, 6])
+    >>> mstats.count_tied_groups(z)
+    {2: 2, 3: 1}
+    >>> z[[1,-1]] = np.ma.masked
+    >>> mstats.count_tied_groups(z, use_missing=True)
+    {2: 2, 3: 1}
+
+    """
+    n_missing = ma.getmask(x).sum()
+    # Work on a private copy of the unmasked values.
+    values = ma.compressed(x).copy()
+    ties, counts = find_repeats(values)
+    groups = {}
+    if len(ties):
+        # Every distinct tie size occurs at least once ...
+        groups = dict.fromkeys(np.unique(counts), 1)
+        # ... and sizes shared by several groups get their real multiplicity.
+        groups.update(zip(*find_repeats(counts)))
+
+    if n_missing and use_missing:
+        # The masked values form one additional tied group.
+        groups[n_missing] = groups.get(n_missing, 0) + 1
+
+    return groups
+
+
+def rankdata(data, axis=None, use_missing=False):
+    """Return the rank (also known as order statistic) of each data point
+    along the given axis.
+
+    If some values are tied, their rank is averaged.
+    If some values are masked, their rank is set to 0 if use_missing is False,
+    or set to the average rank of the unmasked values if use_missing is True.
+
+    Parameters
+    ----------
+    data : sequence
+        Input data. The data is transformed to a masked array
+    axis : {None,int}, optional
+        Axis along which to perform the ranking.
+        If None, the data is ranked as one flattened sequence and the
+        ranks are returned in the shape of the input.
+    use_missing : bool, optional
+        Whether the masked values have a rank of 0 (False) or equal to the
+        average rank of the unmasked values (True).
+
+    """
+    def _rank1d(data, use_missing=False):
+        # Rank one 1-D masked array; ties share the mean of their ranks.
+        n_valid = data.count()
+        order = data.argsort()  # the code relies on masked entries last
+        ranks = np.empty(data.size, dtype=float)
+        ranks[order[:n_valid]] = np.arange(1, n_valid + 1)
+        # Masked entries: average rank of the valid data, or 0.
+        ranks[order[n_valid:]] = (n_valid + 1) / 2. if use_missing else 0
+
+        tied_values, _ = find_repeats(data.copy())
+        for value in tied_values:
+            tied = (data == value).filled(False)
+            ranks[tied] = ranks[tied].mean()
+        return ranks
+
+    data = ma.array(data, copy=False)
+    if axis is not None:
+        # Rank every 1-D slice along `axis`; the result is a plain ndarray.
+        ranked = ma.apply_along_axis(_rank1d, axis, data, use_missing)
+        return ranked.view(ndarray)
+    if data.ndim > 1:
+        # axis=None: rank the flattened data, then restore the shape.
+        flat_ranks = _rank1d(data.ravel(), use_missing)
+        return flat_ranks.reshape(data.shape)
+    return _rank1d(data, use_missing)
+
+
+ModeResult = namedtuple('ModeResult', ('mode', 'count'))
+
+
+def mode(a, axis=0):
+    """
+    Return the modal (most common) value of the passed array and its count.
+
+    Parameters
+    ----------
+    a : array_like
+        n-dimensional array of which to find mode(s).
+    axis : int or None, optional
+        Axis along which to operate. Default is 0. If None, compute over
+        the whole array `a`.
+
+    Returns
+    -------
+    mode : ndarray
+        Array of modal values; the reduced axis is kept with length 1.
+    count : ndarray
+        Array of counts for each mode, with the same shape as `mode`.
+
+    Notes
+    -----
+    Masked values are not counted. See also `scipy.stats.mode`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> from scipy.stats import mstats
+    >>> m_arr = np.ma.array([1, 1, 0, 0, 0, 0], mask=[0, 0, 1, 1, 1, 0])
+    >>> mstats.mode(m_arr)  # note that most zeros are masked
+    ModeResult(mode=array([1.]), count=array([2.]))
+
+    """
+    return _mode(a, axis=axis, keepdims=True)
+
+
+def _mode(a, axis=0, keepdims=True):
+    """Implementation of `mode` with a private `keepdims` switch.
+
+    `keepdims` is deliberately not exposed by the public `mstats.mode`.
+    With `keepdims` true (or None) the reduced axis is kept with length 1;
+    otherwise it is dropped. `axis=None` operates on the raveled input.
+    """
+    # `_chk_asarray` ravels for axis=None and returns axis 0: never None below
+    a, axis = _chk_asarray(a, axis)
+
+    def _mode1D(a):
+        # (mode, count) of one 1-D slice; masked values are ignored.
+        rep, cnt = find_repeats(a)
+        if not cnt.ndim:
+            return (0, 0)
+        if cnt.size:
+            return (rep[cnt.argmax()], cnt.max())
+        return (a.min(), 1)
+
+    # `output` stacks modes (index 0) and counts (index 1) along `axis`.
+    output = ma.apply_along_axis(_mode1D, axis, a)
+    if keepdims is None or keepdims:
+        newshape = list(a.shape)
+        newshape[axis] = 1
+        index = [slice(None)] * output.ndim
+        index[axis] = 0
+        modes = output[tuple(index)].reshape(newshape)
+        index[axis] = 1
+        counts = output[tuple(index)].reshape(newshape)
+        return ModeResult(modes, counts)
+    return ModeResult(*np.moveaxis(output, axis, 0))
+
+
+def _betai(a, b, x):
+    # Regularized incomplete beta function, with x capped at 1.0.
+    x = np.asanyarray(x)
+    return special.betainc(a, b, ma.where(x < 1.0, x, 1.0))
+
+
+def msign(x):
+    """Return the elementwise sign of `x`, with 0 where `x` is masked."""
+    return ma.filled(np.sign(x), fill_value=0)
+
+
+def pearsonr(x, y):
+    r"""
+    Pearson correlation coefficient and p-value for testing non-correlation.
+
+    The Pearson correlation coefficient [1]_ measures the linear relationship
+    between two datasets.  The calculation of the p-value relies on the
+    assumption that each dataset is normally distributed.  (See Kowalski [3]_
+    for a discussion of the effects of non-normality of the input on the
+    distribution of the correlation coefficient.)  Like other correlation
+    coefficients, this one varies between -1 and +1 with 0 implying no
+    correlation. Correlations of -1 or +1 imply an exact linear relationship.
+
+    Parameters
+    ----------
+    x : (N,) array_like
+        Input array.
+    y : (N,) array_like
+        Input array.
+
+    Returns
+    -------
+    r : float
+        Pearson's correlation coefficient.
+    p-value : float
+        Two-tailed p-value.
+
+    Warns
+    -----
+    `~scipy.stats.ConstantInputWarning`
+        Raised if an input is a constant array.  The correlation coefficient
+        is not defined in this case, so ``np.nan`` is returned.
+
+    `~scipy.stats.NearConstantInputWarning`
+        Raised if an input is "nearly" constant.  The array ``x`` is considered
+        nearly constant if ``norm(x - mean(x)) < 1e-13 * abs(mean(x))``.
+        Numerical errors in the calculation ``x - mean(x)`` in this case might
+        result in an inaccurate calculation of r.
+
+    See Also
+    --------
+    spearmanr : Spearman rank-order correlation coefficient.
+    kendalltau : Kendall's tau, a correlation measure for ordinal data.
+
+    Notes
+    -----
+    The correlation coefficient is calculated as follows:
+
+    .. math::
+
+        r = \frac{\sum (x - m_x) (y - m_y)}
+                 {\sqrt{\sum (x - m_x)^2 \sum (y - m_y)^2}}
+
+    where :math:`m_x` is the mean of the vector x and :math:`m_y` is
+    the mean of the vector y.
+
+    Under the assumption that x and y are drawn from
+    independent normal distributions (so the population correlation coefficient
+    is 0), the probability density function of the sample correlation
+    coefficient r is ([1]_, [2]_):
+
+    .. math::
+
+        f(r) = \frac{{(1-r^2)}^{n/2-2}}{\mathrm{B}(\frac{1}{2},\frac{n}{2}-1)}
+
+    where n is the number of samples, and B is the beta function.  This
+    is sometimes referred to as the exact distribution of r.  This is
+    the distribution that is used in `pearsonr` to compute the p-value.
+    The distribution is a beta distribution on the interval [-1, 1],
+    with equal shape parameters a = b = n/2 - 1.  In terms of SciPy's
+    implementation of the beta distribution, the distribution of r is::
+
+        dist = scipy.stats.beta(n/2 - 1, n/2 - 1, loc=-1, scale=2)
+
+    The p-value returned by `pearsonr` is a two-sided p-value. The p-value
+    roughly indicates the probability of an uncorrelated system
+    producing datasets that have a Pearson correlation at least as extreme
+    as the one computed from these datasets. More precisely, for a
+    given sample with correlation coefficient r, the p-value is
+    the probability that abs(r') of a random sample x' and y' drawn from
+    the population with zero correlation would be greater than or equal
+    to abs(r). In terms of the object ``dist`` shown above, the p-value
+    for a given r and length n can be computed as::
+
+        p = 2*dist.cdf(-abs(r))
+
+    When n is 2, the above continuous distribution is not well-defined.
+    One can interpret the limit of the beta distribution as the shape
+    parameters a and b approach a = b = 0 as a discrete distribution with
+    equal probability masses at r = 1 and r = -1.  More directly, one
+    can observe that, given the data x = [x1, x2] and y = [y1, y2], and
+    assuming x1 != x2 and y1 != y2, the only possible values for r are 1
+    and -1.  Because abs(r') for any sample x' and y' with length 2 will
+    be 1, the two-sided p-value for a sample of length 2 is always 1.
+
+    References
+    ----------
+    .. [1] "Pearson correlation coefficient", Wikipedia,
+           https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
+    .. [2] Student, "Probable error of a correlation coefficient",
+           Biometrika, Volume 6, Issue 2-3, 1 September 1908, pp. 302-310.
+    .. [3] C. J. Kowalski, "On the Effects of Non-Normality on the Distribution
+           of the Sample Product-Moment Correlation Coefficient"
+           Journal of the Royal Statistical Society. Series C (Applied
+           Statistics), Vol. 21, No. 1 (1972), pp. 1-12.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> from scipy.stats import mstats
+    >>> mstats.pearsonr([1, 2, 3, 4, 5], [10, 9, 2.5, 6, 4])
+    (-0.7426106572325057, 0.1505558088534455)
+
+    There is a linear dependence between x and y if y = a + b*x + e, where
+    a,b are constants and e is a random error term, assumed to be independent
+    of x. For simplicity, assume that x is standard normal, a=0, b=1 and let
+    e follow a normal distribution with mean zero and standard deviation s>0.
+
+    >>> s = 0.5
+    >>> x = stats.norm.rvs(size=500)
+    >>> e = stats.norm.rvs(scale=s, size=500)
+    >>> y = x + e
+    >>> mstats.pearsonr(x, y)
+    (0.9029601878969703, 8.428978827629898e-185) # may vary
+
+    This should be close to the exact value given by
+
+    >>> 1/np.sqrt(1 + s**2)
+    0.8944271909999159
+
+    For s=0.5, we observe a high level of correlation. In general, a large
+    variance of the noise reduces the correlation, while the correlation
+    approaches one as the variance of the error goes to zero.
+
+    It is important to keep in mind that no correlation does not imply
+    independence unless (x, y) is jointly normal. Correlation can even be zero
+    when there is a very simple dependence structure: if X follows a
+    standard normal distribution, let y = abs(x). Note that the correlation
+    between x and y is zero. Indeed, since the expectation of x is zero,
+    cov(x, y) = E[x*y]. By definition, this equals E[x*abs(x)] which is zero
+    by symmetry. The following lines of code illustrate this observation:
+
+    >>> y = np.abs(x)
+    >>> mstats.pearsonr(x, y)
+    (-0.016172891856853524, 0.7182823678751942) # may vary
+
+    A non-zero correlation coefficient can be misleading. For example, if X has
+    a standard normal distribution, define y = x if x < 0 and y = 0 otherwise.
+    A simple calculation shows that corr(x, y) = sqrt(2/Pi) = 0.797...,
+    implying a high level of correlation:
+
+    >>> y = np.where(x < 0, x, 0)
+    >>> mstats.pearsonr(x, y)
+    (0.8537091583771509, 3.183461621422181e-143) # may vary
+
+    This is unintuitive since there is no dependence of x and y if x is larger
+    than zero which happens in about half of the cases if we sample x and y.
+    """
+    x, y, n_total = _chk_size(x, y)
+    x, y = x.ravel(), y.ravel()
+    # An observation is dropped when it is masked in either input.
+    joint_mask = ma.mask_or(ma.getmask(x), ma.getmask(y))
+    n_valid = n_total - joint_mask.sum()
+    if n_valid < 2:
+        # Fewer than two complete pairs: the result is undefined.
+        return (masked, masked)
+
+    x_valid = ma.masked_array(x, mask=joint_mask).compressed()
+    y_valid = ma.masked_array(y, mask=joint_mask).compressed()
+    return scipy.stats._stats_py.pearsonr(x_valid, y_valid)
+
+
+def spearmanr(x, y=None, use_ties=True, axis=None, nan_policy='propagate',
+              alternative='two-sided'):
+    """
+    Calculates a Spearman rank-order correlation coefficient and the p-value
+    to test for non-correlation.
+
+    The Spearman correlation is a nonparametric measure of the monotonic
+    relationship between two datasets. Unlike the Pearson correlation, the
+    Spearman correlation does not assume that both datasets are normally
+    distributed. Like other correlation coefficients, this one varies
+    between -1 and +1 with 0 implying no correlation. Correlations of -1 or
+    +1 imply a monotonic relationship. Positive correlations imply that
+    as `x` increases, so does `y`. Negative correlations imply that as `x`
+    increases, `y` decreases.
+
+    Missing values are discarded pair-wise: if a value is missing in `x`, the
+    corresponding value in `y` is masked.
+
+    The p-value roughly indicates the probability of an uncorrelated system
+    producing datasets that have a Spearman correlation at least as extreme
+    as the one computed from these datasets. The p-values are not entirely
+    reliable but are probably reasonable for datasets larger than 500 or so.
+
+    Parameters
+    ----------
+    x, y : 1D or 2D array_like, y is optional
+        One or two 1-D or 2-D arrays containing multiple variables and
+        observations. When these are 1-D, each represents a vector of
+        observations of a single variable. For the behavior in the 2-D case,
+        see under ``axis``, below.
+    use_ties : bool, optional
+        DO NOT USE.  Does not do anything, keyword is only left in place for
+        backwards compatibility reasons.
+    axis : int or None, optional
+        If axis=0, then each column represents a variable, with
+        observations in the rows. If axis=1, the relationship is transposed:
+        each row represents a variable, while the columns contain observations.
+        If axis=None (default), then both arrays will be raveled.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan. 'propagate' returns nan,
+        'raise' throws an error, 'omit' performs the calculations ignoring nan
+        values. Default is 'propagate'.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis. Default is 'two-sided'.
+        The following options are available:
+
+        * 'two-sided': the correlation is nonzero
+        * 'less': the correlation is negative (less than zero)
+        * 'greater':  the correlation is positive (greater than zero)
+
+        .. versionadded:: 1.7.0
+
+    Returns
+    -------
+    res : SignificanceResult
+        An object containing attributes:
+
+        statistic : float or ndarray (2-D square)
+            Spearman correlation matrix or correlation coefficient (if only 2
+            variables are given as parameters). Correlation matrix is square
+            with length equal to total number of variables (columns or rows) in
+            ``x`` and ``y`` combined.
+        pvalue : float
+            The p-value for a hypothesis test whose null hypothesis
+            is that two sets of data are linearly uncorrelated. See
+            `alternative` above for alternative hypotheses. `pvalue` has the
+            same shape as `statistic`.
+
+    References
+    ----------
+    [CRCProbStat2000] section 14.7
+
+    """
+    if not use_ties:
+        raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")
+
+    # Always returns a masked array, raveled if axis=None
+    x, axisout = _chk_asarray(x, axis)
+    if y is not None:
+        # Deal only with 2-D `x` case.
+        y, _ = _chk_asarray(y, axis)
+        if axisout == 0:
+            x = ma.column_stack((x, y))
+        else:
+            x = ma.vstack((x, y))
+
+    if axisout == 1:
+        # To simplify the code that follows (always use `n_obs, n_vars` shape)
+        x = x.T
+
+    if nan_policy == 'omit':
+        x = ma.masked_invalid(x)
+
+    def _spearmanr_2cols(x):
+        # Mask the same observations for all variables, and then drop those
+        # observations (can't leave them masked, rankdata is weird).
+        x = ma.mask_rowcols(x, axis=0)
+        x = x[~x.mask.any(axis=1), :]
+
+        # No nonzero data left (e.g. a column was all NaN/Inf and got dropped)
+        if not np.any(x.data):
+            res = scipy.stats._stats_py.SignificanceResult(np.nan, np.nan)
+            res.correlation = np.nan
+            return res
+
+        m = ma.getmask(x)
+        n_obs = x.shape[0]
+        dof = n_obs - 2 - int(m.sum(axis=0)[0])
+        if dof < 0:
+            raise ValueError("The input must have at least 3 entries!")
+
+        # Gets the ranks and the correlation matrix of the ranks
+        x_ranked = rankdata(x, axis=0)
+        rs = ma.corrcoef(x_ranked, rowvar=False).data
+
+        # rs can have elements equal to 1, so avoid zero division warnings
+        with np.errstate(divide='ignore'):
+            # clip the small negative values possibly caused by rounding
+            # errors before taking the square root
+            t = rs * np.sqrt((dof / ((rs+1.0) * (1.0-rs))).clip(0))
+
+        t, prob = _ttest_finish(dof, t, alternative)
+
+        # For backwards compatibility, return scalars when comparing 2 columns
+        if rs.shape == (2, 2):
+            res = scipy.stats._stats_py.SignificanceResult(rs[1, 0],
+                                                           prob[1, 0])
+            res.correlation = rs[1, 0]
+            return res
+        else:
+            res = scipy.stats._stats_py.SignificanceResult(rs, prob)
+            res.correlation = rs
+            return res
+
+    # Need to do this per pair of variables, otherwise the dropped observations
+    # in a third column mess up the result for a pair.
+    n_vars = x.shape[1]
+    if n_vars == 2:
+        return _spearmanr_2cols(x)
+    else:
+        rs = np.ones((n_vars, n_vars), dtype=float)
+        prob = np.zeros((n_vars, n_vars), dtype=float)
+        for var1 in range(n_vars - 1):
+            for var2 in range(var1+1, n_vars):
+                result = _spearmanr_2cols(x[:, [var1, var2]])
+                rs[var1, var2] = result.correlation
+                rs[var2, var1] = result.correlation
+                prob[var1, var2] = result.pvalue
+                prob[var2, var1] = result.pvalue
+
+        res = scipy.stats._stats_py.SignificanceResult(rs, prob)
+        res.correlation = rs
+        return res
+
+
+def _kendall_p_exact(n, c, alternative='two-sided'):
+    """Exact p-value for Kendall's tau given `n` and concordant count `c`."""
+    # Use the fact that distribution is symmetric: always calculate a CDF in
+    # the left tail.
+    # This will be the one-sided p-value if `c` is on the side of
+    # the null distribution predicted by the alternative hypothesis.
+    # The two-sided p-value will be twice this value.
+    # If `c` is on the other side of the null distribution, we'll need to
+    # take the complement and add back the probability mass at `c`.
+    in_right_tail = (c >= (n*(n-1))//2 - c)
+    alternative_greater = (alternative == 'greater')
+    c = int(min(c, (n*(n-1))//2 - c))  # reflect into the left tail
+
+    # Exact p-value, see Maurice G. Kendall, "Rank Correlation Methods"
+    # (4th Edition), Charles Griffin & Co., 1970.
+    if n <= 0:
+        raise ValueError(f'n ({n}) must be positive')
+    elif c < 0 or 4*c > n*(n-1):
+        raise ValueError(f'c ({c}) must satisfy 0 <= 4c <= n(n-1) = {n*(n-1)}.')
+    elif n == 1:
+        prob = 1.0
+        p_mass_at_c = 1
+    elif n == 2:
+        prob = 1.0
+        p_mass_at_c = 0.5
+    elif c == 0:
+        prob = 2.0/math.factorial(n) if n < 171 else 0.0
+        p_mass_at_c = prob/2
+    elif c == 1:
+        prob = 2.0/math.factorial(n-1) if n < 172 else 0.0
+        p_mass_at_c = (n-1)/math.factorial(n)
+    elif 4*c == n*(n-1) and alternative == 'two-sided':
+        # I'm sure there's a simple formula for p_mass_at_c in this
+        # case, but I don't know it. Use generic formula for one-sided p-value.
+        prob = 1.0
+    elif n < 171:
+        new = np.zeros(c+1)
+        new[0:2] = 1.0
+        for j in range(3,n+1):
+            new = np.cumsum(new)
+            if j <= c:
+                new[j:] -= new[:c+1-j]
+        prob = 2.0*np.sum(new)/math.factorial(n)
+        p_mass_at_c = new[-1]/math.factorial(n)
+    else:
+        new = np.zeros(c+1)
+        new[0:2] = 1.0
+        for j in range(3, n+1):
+            new = np.cumsum(new)/j
+            if j <= c:
+                new[j:] -= new[:c+1-j]
+        prob = np.sum(new)
+        p_mass_at_c = new[-1]/2
+
+    if alternative != 'two-sided':
+        # if the tail that `c` fell in and the alternative agree,
+        # one-sided p-value is half the two-sided p-value
+        if in_right_tail == alternative_greater:
+            prob /= 2
+        else:
+            prob = 1 - prob/2 + p_mass_at_c
+
+    prob = np.clip(prob, 0, 1)
+
+    return prob
+
+
+def kendalltau(x, y, use_ties=True, use_missing=False, method='auto',
+               alternative='two-sided'):
+    """
+    Computes Kendall's rank correlation tau on two variables *x* and *y*.
+
+    Parameters
+    ----------
+    x : sequence
+        First data list (for example, time).
+    y : sequence
+        Second data list.
+    use_ties : {True, False}, optional
+        Whether ties correction should be performed.
+    use_missing : {False, True}, optional
+        Whether missing data should be allocated a rank of 0 (False) or the
+        average rank (True)
+    method : {'auto', 'asymptotic', 'exact'}, optional
+        Defines which method is used to calculate the p-value [1]_.
+        'asymptotic' uses a normal approximation valid for large samples.
+        'exact' computes the exact p-value, but can only be used if no ties
+        are present. As the sample size increases, the 'exact' computation
+        time may grow and the result may lose some precision.
+        'auto' is the default and selects the appropriate
+        method based on a trade-off between speed and accuracy.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis. Default is 'two-sided'.
+        The following options are available:
+
+        * 'two-sided': the rank correlation is nonzero
+        * 'less': the rank correlation is negative (less than zero)
+        * 'greater':  the rank correlation is positive (greater than zero)
+
+    Returns
+    -------
+    res : SignificanceResult
+        An object containing attributes:
+
+        statistic : float
+           The tau statistic (NaN with fewer than 2 valid pairs).
+        pvalue : float
+           The p-value for a hypothesis test whose null hypothesis is
+           an absence of association, tau = 0.
+
+    References
+    ----------
+    .. [1] Maurice G. Kendall, "Rank Correlation Methods" (4th Edition),
+           Charles Griffin & Co., 1970.
+
+    """
+    (x, y, n) = _chk_size(x, y)
+    (x, y) = (x.flatten(), y.flatten())
+    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
+    if m is not nomask:
+        x = ma.array(x, mask=m, copy=True)
+        y = ma.array(y, mask=m, copy=True)
+        # need int() here, otherwise numpy defaults to 32 bit
+        # integer on all Windows architectures, causing overflow.
+        # int() will keep it infinite precision.
+        n -= int(m.sum())
+
+    if n < 2:
+        res = scipy.stats._stats_py.SignificanceResult(np.nan, np.nan)
+        res.correlation = np.nan
+        return res
+    # Ranks of 0 (masked data when use_missing is False) are masked out.
+    rx = ma.masked_equal(rankdata(x, use_missing=use_missing), 0)
+    ry = ma.masked_equal(rankdata(y, use_missing=use_missing), 0)
+    idx = rx.argsort()  # order the pairs by the rank of x
+    (rx, ry) = (rx[idx], ry[idx])
+    C = np.sum([((ry[i+1:] > ry[i]) * (rx[i+1:] > rx[i])).filled(0).sum()
+                for i in range(len(ry)-1)], dtype=float)
+    D = np.sum([((ry[i+1:] < ry[i])*(rx[i+1:] > rx[i])).filled(0).sum()
+                for i in range(len(ry)-1)], dtype=float)
+    xties = count_tied_groups(x)  # {tie size: nb of groups}
+    yties = count_tied_groups(y)
+    if use_ties:
+        corr_x = np.sum([v*k*(k-1) for (k,v) in xties.items()], dtype=float)
+        corr_y = np.sum([v*k*(k-1) for (k,v) in yties.items()], dtype=float)
+        denom = ma.sqrt((n*(n-1)-corr_x)/2. * (n*(n-1)-corr_y)/2.)
+    else:
+        denom = n*(n-1)/2.
+    tau = (C-D) / denom
+
+    if method == 'exact' and (xties or yties):
+        raise ValueError("Ties found, exact method cannot be used.")
+
+    if method == 'auto':
+        if (not xties and not yties) and (n <= 33 or min(C, n*(n-1)/2.0-C) <= 1):
+            method = 'exact'
+        else:
+            method = 'asymptotic'
+
+    if not xties and not yties and method == 'exact':
+        prob = _kendall_p_exact(n, C, alternative)
+
+    elif method == 'asymptotic':
+        var_s = n*(n-1)*(2*n+5)
+        if use_ties:
+            var_s -= np.sum([v*k*(k-1)*(2*k+5)*1. for (k,v) in xties.items()])
+            var_s -= np.sum([v*k*(k-1)*(2*k+5)*1. for (k,v) in yties.items()])
+            v1 = (np.sum([v*k*(k-1) for (k, v) in xties.items()], dtype=float) *
+                  np.sum([v*k*(k-1) for (k, v) in yties.items()], dtype=float))
+            v1 /= 2.*n*(n-1)
+            if n > 2:
+                v2 = np.sum([v*k*(k-1)*(k-2) for (k,v) in xties.items()],
+                            dtype=float) * \
+                     np.sum([v*k*(k-1)*(k-2) for (k,v) in yties.items()],
+                            dtype=float)
+                v2 /= 9.*n*(n-1)*(n-2)
+            else:
+                v2 = 0
+        else:
+            v1 = v2 = 0
+
+        var_s /= 18.
+        var_s += (v1 + v2)
+        z = (C-D)/np.sqrt(var_s)
+        prob = scipy.stats._stats_py._get_pvalue(z, distributions.norm, alternative)
+    else:
+        raise ValueError("Unknown method "+str(method)+" specified, please "
+                         "use auto, exact or asymptotic.")
+
+    res = scipy.stats._stats_py.SignificanceResult(tau[()], prob[()])
+    res.correlation = tau
+    return res
+
+
+def kendalltau_seasonal(x):
+    """
+    Computes a multivariate Kendall's rank correlation tau, for seasonal data.
+
+    Parameters
+    ----------
+    x : 2-D ndarray
+        Array of seasonal data, with seasons in columns.
+
+    Returns
+    -------
+    output : dict
+        Taus, p-values and chi-square statistics, keyed by name.
+
+    """
+    x = ma.array(x, subok=True, copy=False, ndmin=2)
+    (n,m) = x.shape
+    n_p = x.count(0)  # number of unmasked values in each column
+    # Sum msign(x[i:] - x[i]) over all start rows i, separately per column.
+    S_szn = sum(msign(x[i:]-x[i]).sum(0) for i in range(n))
+    S_tot = S_szn.sum()
+    n_tot = x.count()
+    ties = count_tied_groups(x.compressed())
+    corr_ties = sum(v*k*(k-1) for (k,v) in ties.items())
+    denom_tot = ma.sqrt(1.*n_tot*(n_tot-1)*(n_tot*(n_tot-1)-corr_ties))/2.
+    R = rankdata(x, axis=0, use_missing=True)
+    K = ma.empty((m,m), dtype=int)
+    covmat = ma.empty((m,m), dtype=float)
+    denom_szn = ma.empty(m, dtype=float)
+    for j in range(m):
+        ties_j = count_tied_groups(x[:,j].compressed())
+        corr_j = sum(v*k*(k-1) for (k,v) in ties_j.items())
+        cmb = n_p[j]*(n_p[j]-1)
+        for k in range(j,m,1):
+            K[j,k] = sum(msign((x[i:,j]-x[i,j])*(x[i:,k]-x[i,k])).sum()
+                         for i in range(n))
+            covmat[j,k] = (K[j,k] + 4*(R[:,j]*R[:,k]).sum() -
+                           n*(n_p[j]+1)*(n_p[k]+1))/3.
+            K[k,j] = K[j,k]  # only k >= j is computed; mirror into k < j
+            covmat[k,j] = covmat[j,k]
+        denom_szn[j] = ma.sqrt(cmb*(cmb-corr_j)) / 2.
+    var_szn = covmat.diagonal()  # diagonal entries act as per-column variances
+    # Each z is msign(S) * (|S| - 1) / sqrt(variance).
+    z_szn = msign(S_szn) * (abs(S_szn)-1) / ma.sqrt(var_szn)
+    z_tot_ind = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(var_szn.sum())
+    z_tot_dep = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(covmat.sum())
+    # p-values are erfc(|z| / sqrt(2)).
+    prob_szn = special.erfc(abs(z_szn.data)/np.sqrt(2))
+    prob_tot_ind = special.erfc(abs(z_tot_ind)/np.sqrt(2))
+    prob_tot_dep = special.erfc(abs(z_tot_dep)/np.sqrt(2))
+    chi2_tot = (z_szn*z_szn).sum()
+    chi2_trd = m * z_szn.mean()**2
+    output = {'seasonal tau': S_szn/denom_szn,
+              'global tau': S_tot/denom_tot,
+              'global tau (alt)': S_tot/denom_szn.sum(),
+              'seasonal p-value': prob_szn,
+              'global p-value (indep)': prob_tot_ind,
+              'global p-value (dep)': prob_tot_dep,
+              'chi2 total': chi2_tot,
+              'chi2 trend': chi2_trd,
+              }
+    return output
+
+
+PointbiserialrResult = namedtuple('PointbiserialrResult',
+                                  ['correlation', 'pvalue'])
+
+
+def pointbiserialr(x, y):
+    """Compute the point biserial correlation coefficient and its p-value.
+
+    Parameters
+    ----------
+    x : array_like of bools
+        Binary input; it is cast to ``bool`` before use.
+    y : array_like
+        Continuous input; it is cast to ``float`` before use.
+
+    Returns
+    -------
+    correlation : float
+        R value
+    pvalue : float
+        2-tailed p-value
+
+    Notes
+    -----
+    Missing values are handled pair-wise: a position that is masked or
+    invalid in either `x` or `y` is dropped from both.
+
+    For more details on `pointbiserialr`, see `scipy.stats.pointbiserialr`.
+
+    """
+    x = ma.fix_invalid(x, copy=True).astype(bool)
+    y = ma.fix_invalid(y, copy=True).astype(float)
+    # Pair-wise deletion: keep only the positions valid in both inputs.
+    joint_mask = ma.mask_or(ma.getmask(x), ma.getmask(y))
+    if joint_mask is not nomask:
+        keep = np.logical_not(joint_mask)
+        x, y = x[keep], y[keep]
+
+    n = len(x)
+    df = n-2
+
+    # phat: share of True entries in x.
+    phat = x.sum() / float(n)
+    # Group means of y, False group first and True group second.
+    mean_false = y[~x].mean()
+    mean_true = y[x].mean()
+    spread = np.sqrt(phat * (1-phat))
+    rpb = (mean_true - mean_false)*spread / y.std()
+
+    # Convert r to a t statistic, then to a p-value through _betai.
+    t = rpb*ma.sqrt(df/(1.0-rpb**2))
+    prob = _betai(0.5*df, 0.5, df/(df+t*t))
+
+    return PointbiserialrResult(rpb, prob)
+
+
+def linregress(x, y=None):
+    r"""
+    Calculate a linear least-squares regression for two sets of measurements.
+
+    Parameters
+    ----------
+    x, y : array_like
+        Two sets of measurements.  Both arrays should have the same length N.  If
+        only `x` is given (and ``y=None``), then it must be a two-dimensional
+        array where one dimension has length 2.  The two sets of measurements
+        are then found by splitting the array along the length-2 dimension. In
+        the case where ``y=None`` and `x` is a 2xN array, ``linregress(x)`` is
+        equivalent to ``linregress(x[0], x[1])``.
+
+    Returns
+    -------
+    result : ``LinregressResult`` instance
+        The return value is an object with the following attributes:
+
+        slope : float
+            Slope of the regression line.
+        intercept : float
+            Intercept of the regression line.
+        rvalue : float
+            The Pearson correlation coefficient. The square of ``rvalue``
+            is equal to the coefficient of determination.
+        pvalue : float
+            The p-value for a hypothesis test whose null hypothesis is
+            that the slope is zero, using Wald Test with t-distribution of
+            the test statistic. This function takes no `alternative`
+            argument; the underlying routine's default is used.
+        stderr : float
+            Standard error of the estimated slope (gradient), under the
+            assumption of residual normality.
+        intercept_stderr : float
+            Standard error of the estimated intercept, under the assumption
+            of residual normality.
+
+    See Also
+    --------
+    scipy.optimize.curve_fit :
+        Use non-linear least squares to fit a function to data.
+    scipy.optimize.leastsq :
+        Minimize the sum of squares of a set of equations.
+
+    Notes
+    -----
+    Missing values are considered pair-wise: if a value is missing in `x`,
+    the corresponding value in `y` is masked, and vice versa.
+
+    For compatibility with older versions of SciPy, the return value acts
+    like a ``namedtuple`` of length 5, with fields ``slope``, ``intercept``,
+    ``rvalue``, ``pvalue`` and ``stderr``, so one can continue to write::
+
+        slope, intercept, r, p, se = linregress(x, y)
+
+    With that style, however, the standard error of the intercept is not
+    available.  To have access to all the computed values, including the
+    standard error of the intercept, use the return value as an object
+    with attributes, e.g.::
+
+        result = linregress(x, y)
+        print(result.intercept, result.intercept_stderr)
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng()
+
+    Generate some data:
+
+    >>> x = rng.random(10)
+    >>> y = 1.6*x + rng.random(10)
+
+    Perform the linear regression:
+
+    >>> res = stats.mstats.linregress(x, y)
+
+    Coefficient of determination (R-squared):
+
+    >>> print(f"R-squared: {res.rvalue**2:.6f}")
+    R-squared: 0.717533
+
+    Plot the data along with the fitted line:
+
+    >>> plt.plot(x, y, 'o', label='original data')
+    >>> plt.plot(x, res.intercept + res.slope*x, 'r', label='fitted line')
+    >>> plt.legend()
+    >>> plt.show()
+
+    Calculate 95% confidence interval on slope and intercept:
+
+    >>> # Two-sided inverse Students t-distribution
+    >>> # p - probability, df - degrees of freedom
+    >>> from scipy.stats import t
+    >>> tinv = lambda p, df: abs(t.ppf(p/2, df))
+
+    >>> ts = tinv(0.05, len(x)-2)
+    >>> print(f"slope (95%): {res.slope:.6f} +/- {ts*res.stderr:.6f}")
+    slope (95%): 1.453392 +/- 0.743465
+    >>> print(f"intercept (95%): {res.intercept:.6f}"
+    ...       f" +/- {ts*res.intercept_stderr:.6f}")
+    intercept (95%): 0.616950 +/- 0.544475
+
+    """
+    x = ma.array(x)
+    if y is not None:
+        y = ma.array(y)
+    elif x.shape[0] == 2:
+        # The two variables are stored as the two rows of `x`.
+        x, y = x
+    elif x.shape[1] == 2:
+        # The two variables are stored as the two columns of `x`.
+        x, y = x.T
+    else:
+        raise ValueError("If only `x` is given as input, "
+                         "it has to be of shape (2, N) or (N, 2), "
+                         f"provided shape was {x.shape}")
+
+    x, y = x.flatten(), y.flatten()
+
+    if np.amax(x) == np.amin(x) and len(x) > 1:
+        raise ValueError("Cannot calculate a linear regression "
+                         "if all x values are identical")
+
+    # Pair-wise handling of missing data: a point is dropped when either
+    # of its coordinates is masked.
+    bad = ma.mask_or(ma.getmask(x), ma.getmask(y), shrink=False)
+    if bad is nomask:
+        return _stats_py.linregress(x.data, y.data)
+
+    x = ma.array(x, mask=bad)
+    y = ma.array(y, mask=bad)
+    good = ~bad
+    if np.any(good):
+        return _stats_py.linregress(x.data[good], y.data[good])
+
+    # All data is masked
+    return _stats_py.LinregressResult(slope=None, intercept=None,
+                                      rvalue=None, pvalue=None,
+                                      stderr=None, intercept_stderr=None)
+
+
+def theilslopes(y, x=None, alpha=0.95, method='separate'):
+    r"""
+    Computes the Theil-Sen estimator for a set of points (x, y).
+
+    `theilslopes` implements a method for robust linear regression.  It
+    computes the slope as the median of all slopes between paired values.
+
+    Parameters
+    ----------
+    y : array_like
+        Dependent variable.
+    x : array_like or None, optional
+        Independent variable. If None, use ``arange(len(y))`` instead.
+    alpha : float, optional
+        Confidence degree between 0 and 1. Default is 95% confidence.
+        Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
+        interpreted as "find the 90% confidence interval".
+    method : {'joint', 'separate'}, optional
+        How the estimate of the intercept is computed. Supported values:
+
+            * 'joint': Uses np.median(y - slope * x) as intercept.
+            * 'separate': Uses np.median(y) - slope * np.median(x)
+                          as intercept.
+
+        The default is 'separate'.
+
+        .. versionadded:: 1.8.0
+
+    Returns
+    -------
+    result : ``TheilslopesResult`` instance
+        The return value is an object with the following attributes:
+
+        slope : float
+            Theil slope.
+        intercept : float
+            Intercept of the Theil line.
+        low_slope : float
+            Lower bound of the confidence interval on `slope`.
+        high_slope : float
+            Upper bound of the confidence interval on `slope`.
+
+    See Also
+    --------
+    siegelslopes : a similar technique using repeated medians
+
+    Notes
+    -----
+    Points masked in either `x` or `y` are discarded before the fit.
+
+    For more details on `theilslopes`, see `scipy.stats.theilslopes`.
+
+    """
+    y = ma.asarray(y).flatten()
+    if x is not None:
+        x = ma.asarray(x).flatten()
+        if len(x) != len(y):
+            raise ValueError(f"Incompatible lengths ! ({len(y)}<>{len(x)})")
+    else:
+        x = ma.arange(len(y), dtype=float)
+
+    # Give both arrays the combined mask so that masked points drop pair-wise
+    joint = ma.mask_or(ma.getmask(x), ma.getmask(y))
+    x._mask = joint
+    y._mask = joint
+    y_clean = y.compressed()
+    x_clean = x.compressed().astype(float)
+    return stats_theilslopes(y_clean, x_clean, alpha=alpha, method=method)
+
+
+def siegelslopes(y, x=None, method="hierarchical"):
+    r"""
+    Computes the Siegel estimator for a set of points (x, y).
+
+    `siegelslopes` implements a method for robust linear regression using
+    repeated medians to fit a line to the points (x, y). The method is
+    robust to outliers with an asymptotic breakdown point of 50%.
+
+    Parameters
+    ----------
+    y : array_like
+        Dependent variable.
+    x : array_like or None, optional
+        Independent variable. If None, use ``arange(len(y))`` instead.
+    method : {'hierarchical', 'separate'}
+        If 'hierarchical', estimate the intercept using the estimated
+        slope ``slope`` (default option).
+        If 'separate', estimate the intercept independent of the estimated
+        slope. See `scipy.stats.siegelslopes` for details.
+
+    Returns
+    -------
+    result : ``SiegelslopesResult`` instance
+        The return value is an object with the following attributes:
+
+        slope : float
+            Estimate of the slope of the regression line.
+        intercept : float
+            Estimate of the intercept of the regression line.
+
+    See Also
+    --------
+    theilslopes : a similar technique without repeated medians
+
+    Notes
+    -----
+    Points masked in either `x` or `y` are discarded before the fit.
+    For more details on `siegelslopes`, see `scipy.stats.siegelslopes`.
+
+    """
+    y = ma.asarray(y).ravel()
+    if x is not None:
+        x = ma.asarray(x).ravel()
+        if len(x) != len(y):
+            raise ValueError(f"Incompatible lengths ! ({len(y)}<>{len(x)})")
+    else:
+        x = ma.arange(len(y), dtype=float)
+
+    # Give both arrays the combined mask so that masked points drop pair-wise
+    joint = ma.mask_or(ma.getmask(x), ma.getmask(y))
+    x._mask = joint
+    y._mask = joint
+    y_clean = y.compressed()
+    x_clean = x.compressed().astype(float)
+    return stats_siegelslopes(y_clean, x_clean, method=method)
+
+
+SenSeasonalSlopesResult = _make_tuple_bunch(
+    'SenSeasonalSlopesResult', ['intra_slope', 'inter_slope'])
+
+
+def sen_seasonal_slopes(x):
+    r"""
+    Computes seasonal Theil-Sen and Kendall slope estimators.
+
+    The seasonal generalization of Sen's slope computes the slopes between all
+    pairs of values within a "season" (column) of a 2D array. It returns an
+    array containing the median of these "within-season" slopes for each
+    season (the Theil-Sen slope estimator of each season), and it returns the
+    median of the within-season slopes across all seasons (the seasonal Kendall
+    slope estimator).
+
+    Parameters
+    ----------
+    x : 2D array_like
+        Each column of `x` contains measurements of the dependent variable
+        within a season. The independent variable (usually time) of each season
+        is assumed to be ``np.arange(x.shape[0])``.
+
+    Returns
+    -------
+    result : ``SenSeasonalSlopesResult`` instance
+        The return value is an object with the following attributes:
+
+        intra_slope : ndarray
+            For each season, the Theil-Sen slope estimator: the median of
+            within-season slopes.
+        inter_slope : float
+            The seasonal Kendall slope estimator: the median of within-season
+            slopes *across all* seasons.
+
+    See Also
+    --------
+    theilslopes : the analogous function for non-seasonal data
+    scipy.stats.theilslopes : non-seasonal slopes for non-masked arrays
+
+    Notes
+    -----
+    The slopes :math:`d_{ijk}` within season :math:`i` are:
+
+    .. math::
+
+        d_{ijk} = \frac{x_{ij} - x_{ik}}
+                            {j - k}
+
+    for pairs of distinct integer indices :math:`j, k` of :math:`x`.
+
+    Element :math:`i` of the returned `intra_slope` array is the median of the
+    :math:`d_{ijk}` over all :math:`j < k`; this is the Theil-Sen slope
+    estimator of season :math:`i`. The returned `inter_slope` value, better
+    known as the seasonal Kendall slope estimator, is the median of the
+    :math:`d_{ijk}` over all :math:`i, j, k`.
+
+    References
+    ----------
+    .. [1] Hirsch, Robert M., James R. Slack, and Richard A. Smith.
+           "Techniques of trend analysis for monthly water quality data."
+           *Water Resources Research* 18.1 (1982): 107-121.
+
+    Examples
+    --------
+    Suppose we have 100 observations of a dependent variable for each of four
+    seasons:
+
+    >>> import numpy as np
+    >>> rng = np.random.default_rng()
+    >>> x = rng.random(size=(100, 4))
+
+    We compute the seasonal slopes as:
+
+    >>> from scipy import stats
+    >>> intra_slope, inter_slope = stats.mstats.sen_seasonal_slopes(x)
+
+    If we define a function to compute all slopes between observations within
+    a season:
+
+    >>> def dijk(yi):
+    ...     n = len(yi)
+    ...     x = np.arange(n)
+    ...     dy = yi - yi[:, np.newaxis]
+    ...     dx = x - x[:, np.newaxis]
+    ...     # we only want unique pairs of distinct indices
+    ...     mask = np.triu(np.ones((n, n), dtype=bool), k=1)
+    ...     return dy[mask]/dx[mask]
+
+    then element ``i`` of ``intra_slope`` is the median of ``dijk(x[:, i])``:
+
+    >>> i = 2
+    >>> np.allclose(np.median(dijk(x[:, i])), intra_slope[i])
+    True
+
+    and ``inter_slope`` is the median of the values returned by ``dijk`` for
+    all seasons:
+
+    >>> all_slopes = np.concatenate([dijk(x[:, i]) for i in range(x.shape[1])])
+    >>> np.allclose(np.median(all_slopes), inter_slope)
+    True
+
+    Because the data are randomly generated, we would expect the median slopes
+    to be nearly zero both within and across all seasons, and indeed they are:
+
+    >>> intra_slope.data
+    array([ 0.00124504, -0.00277761, -0.00221245, -0.00036338])
+    >>> inter_slope
+    -0.0010511779872922058
+
+    """
+    x = ma.array(x, subok=True, copy=False, ndmin=2)
+    n_rows, _ = x.shape
+    # One stack of slopes per start row i: (x[later rows] - x[i]) / row lag
+    per_start = [(x[i+1:] - x[i]) / np.arange(1, n_rows - i)[:, None]
+                 for i in range(n_rows)]
+    all_slopes = ma.vstack(per_start)
+    return SenSeasonalSlopesResult(ma.median(all_slopes, axis=0),
+                                   ma.median(all_slopes, axis=None))
+
+
+Ttest_1sampResult = namedtuple('Ttest_1sampResult', ('statistic', 'pvalue'))
+
+
+def ttest_1samp(a, popmean, axis=0, alternative='two-sided'):
+    """
+    Calculates the T-test for the mean of ONE group of scores.
+
+    Parameters
+    ----------
+    a : array_like
+        sample observation
+    popmean : float or array_like
+        expected value in null hypothesis, if array_like then it must have the
+        same shape as `a` excluding the axis dimension
+    axis : int or None, optional
+        Axis along which to compute test. If None, compute over the whole
+        array `a`.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis.
+        The following options are available (default is 'two-sided'):
+
+        * 'two-sided': the mean of the underlying distribution of the sample
+          is different than the given population mean (`popmean`)
+        * 'less': the mean of the underlying distribution of the sample is
+          less than the given population mean (`popmean`)
+        * 'greater': the mean of the underlying distribution of the sample is
+          greater than the given population mean (`popmean`)
+
+        .. versionadded:: 1.7.0
+
+    Returns
+    -------
+    statistic : float or array
+        t-statistic (NaN when `a` is empty)
+    pvalue : float or array
+        The p-value (NaN when `a` is empty)
+
+    Notes
+    -----
+    For more details on `ttest_1samp`, see `scipy.stats.ttest_1samp`.
+
+    """
+    a, axis = _chk_asarray(a, axis)
+    if a.size == 0:
+        # Same result type as the non-empty path, so attribute access works.
+        return Ttest_1sampResult(np.nan, np.nan)
+    x = a.mean(axis=axis)
+    v = a.var(axis=axis, ddof=1)
+    n = a.count(axis=axis)
+    # force df to be an array for masked division not to throw a warning
+    df = ma.asanyarray(n - 1.0)
+    svar = ((n - 1.0) * v) / df
+    with np.errstate(divide='ignore', invalid='ignore'):
+        t = (x - popmean) / ma.sqrt(svar / n)
+
+    t, prob = _ttest_finish(df, t, alternative)
+    return Ttest_1sampResult(t, prob)
+
+
+ttest_onesamp = ttest_1samp
+
+
+Ttest_indResult = namedtuple('Ttest_indResult', ['statistic', 'pvalue'])
+
+
+def ttest_ind(a, b, axis=0, equal_var=True, alternative='two-sided'):
+    """
+    Calculates the T-test for the means of TWO INDEPENDENT samples of scores.
+
+    Parameters
+    ----------
+    a, b : array_like
+        The arrays must have the same shape, except in the dimension
+        corresponding to `axis` (the first, by default).
+    axis : int or None, optional
+        Axis along which to compute test. If None, compute over the whole
+        arrays, `a`, and `b`.
+    equal_var : bool, optional
+        If True, perform a standard independent 2 sample test that assumes equal
+        population variances.
+        If False, perform Welch's t-test, which does not assume equal population
+        variance.
+
+        .. versionadded:: 0.17.0
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis.
+        The following options are available (default is 'two-sided'):
+
+        * 'two-sided': the means of the distributions underlying the samples
+          are unequal.
+        * 'less': the mean of the distribution underlying the first sample
+          is less than the mean of the distribution underlying the second
+          sample.
+        * 'greater': the mean of the distribution underlying the first
+          sample is greater than the mean of the distribution underlying
+          the second sample.
+
+        .. versionadded:: 1.7.0
+
+    Returns
+    -------
+    statistic : float or array
+        The calculated t-statistic (NaN when either input is empty).
+    pvalue : float or array
+        The p-value (NaN when either input is empty).
+
+    Notes
+    -----
+    For more details on `ttest_ind`, see `scipy.stats.ttest_ind`.
+
+    """
+    a, b, axis = _chk2_asarray(a, b, axis)
+    if not (a.size and b.size):
+        return Ttest_indResult(np.nan, np.nan)
+
+    mean_a, mean_b = a.mean(axis), b.mean(axis)
+    var_a = a.var(axis=axis, ddof=1)
+    var_b = b.var(axis=axis, ddof=1)
+    n_a, n_b = a.count(axis), b.count(axis)
+
+    if not equal_var:
+        # Welch's test: each sample contributes its own variance of the mean.
+        vn_a = var_a/n_a
+        vn_b = var_b/n_b
+        with np.errstate(divide='ignore', invalid='ignore'):
+            df = ((vn_a + vn_b)**2
+                  / (vn_a**2 / (n_a - 1) + vn_b**2 / (n_b - 1)))
+        # If df is undefined, variances are zero.
+        # It doesn't matter what df is as long as it is not NaN.
+        df = np.where(np.isnan(df), 1, df)
+        denom = ma.sqrt(vn_a + vn_b)
+    else:
+        # force df to be an array for masked division not to throw a warning
+        df = ma.asanyarray(n_a + n_b - 2.0)
+        pooled = ((n_a-1)*var_a+(n_b-1)*var_b) / df
+        denom = ma.sqrt(pooled*(1.0/n_a + 1.0/n_b))  # n-D computation here!
+
+    with np.errstate(divide='ignore', invalid='ignore'):
+        t = (mean_a-mean_b) / denom
+    t, prob = _ttest_finish(df, t, alternative)
+    return Ttest_indResult(t, prob)
+
+
+Ttest_relResult = namedtuple('Ttest_relResult', ['statistic', 'pvalue'])
+
+
+def ttest_rel(a, b, axis=0, alternative='two-sided'):
+    """
+    Calculates the T-test on TWO RELATED samples of scores, a and b.
+
+    Parameters
+    ----------
+    a, b : array_like
+        The arrays must have the same shape.
+    axis : int or None, optional
+        Axis along which to compute test. If None, compute over the whole
+        arrays, `a`, and `b`.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis.
+        The following options are available (default is 'two-sided'):
+
+        * 'two-sided': the means of the distributions underlying the samples
+          are unequal.
+        * 'less': the mean of the distribution underlying the first sample
+          is less than the mean of the distribution underlying the second
+          sample.
+        * 'greater': the mean of the distribution underlying the first
+          sample is greater than the mean of the distribution underlying
+          the second sample.
+
+        .. versionadded:: 1.7.0
+
+    Returns
+    -------
+    statistic : float or array
+        t-statistic (NaN when either input is empty)
+    pvalue : float or array
+        The p-value for the chosen `alternative` (NaN on empty input)
+
+    Notes
+    -----
+    For more details on `ttest_rel`, see `scipy.stats.ttest_rel`.
+
+    """
+    a, b, axis = _chk2_asarray(a, b, axis)
+    if len(a) != len(b):
+        raise ValueError('unequal length arrays')
+    if not (a.size and b.size):
+        return Ttest_relResult(np.nan, np.nan)
+
+    n_obs = a.count(axis)
+    dof = ma.asanyarray(n_obs-1.0)
+    # The statistic is built from the element-wise differences a - b.
+    diff = (a-b).astype('d')
+    mean_diff = diff.mean(axis)
+    std_err = ma.sqrt(diff.var(axis=axis, ddof=1) / n_obs)
+
+    with np.errstate(divide='ignore', invalid='ignore'):
+        t = mean_diff / std_err
+
+    t, prob = _ttest_finish(dof, t, alternative)
+    return Ttest_relResult(t, prob)
+
+
+MannwhitneyuResult = namedtuple('MannwhitneyuResult',
+                                ['statistic', 'pvalue'])
+
+
+def mannwhitneyu(x,y, use_continuity=True):
+    """
+    Computes the Mann-Whitney statistic
+
+    Missing values in `x` and/or `y` are discarded.
+
+    Parameters
+    ----------
+    x : sequence
+        Input
+    y : sequence
+        Input
+    use_continuity : {True, False}, optional
+        Whether a continuity correction (1/2.) should be taken into account.
+
+    Returns
+    -------
+    statistic : float
+        The minimum of the Mann-Whitney statistics
+    pvalue : float
+        Approximate two-sided p-value assuming a normal distribution.
+
+    """
+    x = ma.asarray(x).compressed().view(ndarray)
+    y = ma.asarray(y).compressed().view(ndarray)
+    ranks = rankdata(np.concatenate([x,y]))
+    nx, ny = len(x), len(y)
+    nt = nx + ny
+    pairs = nx*ny
+    # U from the rank sum of the first sample; `u` is the smaller statistic.
+    U = ranks[:nx].sum() - nx*(nx+1)/2.
+    U = max(U, pairs - U)
+    u = pairs - U
+
+    # Normal approximation: mean of U and its tie-corrected variance.
+    mu = pairs/2.
+    tie_term = sum(v*(k**3-k) for (k,v) in count_tied_groups(ranks).items())
+    sigsq = (nt**3 - nt)/12.
+    sigsq -= tie_term/12.
+    sigsq *= pairs/float(nt*(nt-1))
+
+    num = (U - 1/2. - mu) if use_continuity else (U - mu)
+    z = num / ma.sqrt(sigsq)
+    prob = special.erfc(abs(z)/np.sqrt(2))
+    return MannwhitneyuResult(u, prob)
+
+
+KruskalResult = namedtuple('KruskalResult', ['statistic', 'pvalue'])
+
+
+def kruskal(*args):
+    """Compute the Kruskal-Wallis H-test for independent samples.
+
+    Parameters
+    ----------
+    sample1, sample2, ... : array_like
+       Two or more arrays with the sample measurements.
+
+    Returns
+    -------
+    statistic : float
+       The Kruskal-Wallis H statistic, corrected for ties
+    pvalue : float
+       The p-value, assuming that H has a chi square distribution
+
+    Raises
+    ------
+    ValueError
+       If the tie correction factor is zero ('All numbers are identical').
+
+    Notes
+    -----
+    For more details on `kruskal`, see `scipy.stats.kruskal`.
+
+    Examples
+    --------
+    >>> from scipy.stats.mstats import kruskal
+
+    Battery charge durations from random samples of three brands:
+
+    >>> a = [6.3, 5.4, 5.7, 5.2, 5.0]
+    >>> b = [6.9, 7.0, 6.1, 7.9]
+    >>> c = [7.2, 6.9, 6.1, 6.5]
+
+    Test the hypothesis that the distribution functions for all of the brands'
+    durations are identical. Use 5% level of significance.
+
+    >>> kruskal(a, b, c)
+    KruskalResult(statistic=7.113812154696133, pvalue=0.028526948491942164)
+
+    The null hypothesis is rejected at the 5% level of significance
+    because the returned p-value is less than the critical value of 5%.
+    """
+    samples = argstoarray(*args)
+    ranks = ma.masked_equal(rankdata(samples, use_missing=False), 0)
+    rank_sums = ranks.sum(-1)
+    group_sizes = ranks.count(-1)
+    ntot = ranks.count()
+    H = 12./(ntot*(ntot+1)) * (rank_sums**2/group_sizes).sum() - 3*(ntot+1)
+
+    # Tie correction
+    tie_sum = sum(v*(k**3-k) for (k,v) in count_tied_groups(ranks).items())
+    T = 1. - tie_sum/float(ntot**3-ntot)
+    if T == 0:
+        raise ValueError('All numbers are identical in kruskal')
+    H /= T
+
+    prob = distributions.chi2.sf(H, len(samples) - 1)
+    return KruskalResult(H, prob)
+
+
+kruskalwallis = kruskal
+
+
+@_rename_parameter("mode", "method")
+def ks_1samp(x, cdf, args=(), alternative="two-sided", method='auto'):
+    """
+    Computes the Kolmogorov-Smirnov test on one sample of masked values.
+
+    Missing values in `x` are discarded.
+
+    Parameters
+    ----------
+    x : array_like
+        a 1-D array of observations of random variables.
+    cdf : str or callable
+        If a string, it should be the name of a distribution in `scipy.stats`.
+        If a callable, that callable is used to calculate the cdf.
+    args : tuple, sequence, optional
+        Distribution parameters, used if `cdf` is a string.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Indicates the alternative hypothesis.  Default is 'two-sided'.
+        Strings starting with 't', 'g' or 'l' (any case) select these values.
+    method : {'auto', 'exact', 'asymp'}, optional
+        Method used to calculate the p-value (default is 'auto'):
+
+          * 'auto' : use 'exact' for small size arrays, 'asymp' for large
+          * 'exact' : use approximation to exact distribution of test statistic
+          * 'asymp' : use asymptotic distribution of test statistic
+
+    Returns
+    -------
+    d : float
+        Value of the Kolmogorov Smirnov test
+    p : float
+        Corresponding p-value.
+
+    """
+    aliases = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}
+    alternative = aliases.get(alternative.lower()[0], alternative)
+    return scipy.stats._stats_py.ks_1samp(
+        x, cdf, args=args, alternative=alternative, method=method)
+
+
+@_rename_parameter("mode", "method")
+def ks_2samp(data1, data2, alternative="two-sided", method='auto'):
+    """
+    Computes the Kolmogorov-Smirnov test on two samples.
+
+    Missing values in `data1` and/or `data2` are discarded.
+
+    Parameters
+    ----------
+    data1 : array_like
+        First data set
+    data2 : array_like
+        Second data set
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Indicates the alternative hypothesis.  Default is 'two-sided'.
+        Strings starting with 't', 'g' or 'l' (any case) select these values.
+    method : {'auto', 'exact', 'asymp'}, optional
+        Method used to calculate the p-value (default is 'auto'):
+
+          * 'auto' : use 'exact' for small size arrays, 'asymp' for large
+          * 'exact' : use approximation to exact distribution of test statistic
+          * 'asymp' : use asymptotic distribution of test statistic
+
+    Returns
+    -------
+    d : float
+        Value of the Kolmogorov Smirnov test
+    p : float
+        Corresponding p-value.
+
+    """
+    # A plain alias (ks_2samp = scipy.stats._stats_py.ks_2samp) would be
+    # simpler, but the circular dependencies between _mstats_basic and stats
+    # prevent that, so the call is forwarded at run time instead.
+
+    aliases = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}
+    alternative = aliases.get(alternative.lower()[0], alternative)
+    return scipy.stats._stats_py.ks_2samp(
+        data1, data2, alternative=alternative, method=method)
+
+
+ks_twosamp = ks_2samp
+
+
+@_rename_parameter("mode", "method")
+def kstest(data1, data2, args=(), alternative='two-sided', method='auto'):
+    """Kolmogorov-Smirnov test; delegates to `scipy.stats._stats_py.kstest`.
+
+    Parameters
+    ----------
+    data1 : array_like
+    data2 : str, callable or array_like
+    args : tuple, sequence, optional
+        Distribution parameters, used if `data1` or `data2` are strings.
+    alternative : str, as documented in stats.kstest
+    method : str, as documented in stats.kstest
+
+    Returns
+    -------
+    tuple of (K-S statistic, probability)
+
+    """
+    return scipy.stats._stats_py.kstest(data1, data2, args,
+                                        alternative=alternative, method=method)
+
+
+def trima(a, limits=None, inclusive=(True,True)):
+    """
+    Trims an array by masking the data outside some given limits.
+
+    Returns a masked version of the input array.
+
+    Parameters
+    ----------
+    a : array_like
+        Input array.
+    limits : {None, tuple}, optional
+        Tuple of (lower limit, upper limit) in absolute values.
+        Values of the input array lower (greater) than the lower (upper) limit
+        will be masked.  A limit is None indicates an open interval.
+    inclusive : (bool, bool) tuple, optional
+        Tuple of (lower flag, upper flag), indicating whether values exactly
+        equal to the lower (upper) limit are allowed.
+
+    Examples
+    --------
+    >>> from scipy.stats.mstats import trima
+    >>> import numpy as np
+
+    >>> a = np.arange(10)
+
+    The interval is left-closed and right-open, i.e., `[2, 8)`.
+    Trim the array by keeping only values in the interval.
+
+    >>> trima(a, limits=(2, 8), inclusive=(True, False))
+    masked_array(data=[--, --, 2, 3, 4, 5, 6, 7, --, --],
+                 mask=[ True,  True, False, False, False, False, False, False,
+                        True,  True],
+           fill_value=999999)
+
+    """
+    a = ma.asarray(a)
+    a.unshare_mask()
+    if (limits is None) or (limits == (None, None)):
+        return a
+
+    (lower_lim, upper_lim) = limits
+    (lower_in, upper_in) = inclusive
+    condition = False
+    if lower_lim is not None:
+        if lower_in:
+            condition |= (a < lower_lim)
+        else:
+            condition |= (a <= lower_lim)
+
+    if upper_lim is not None:
+        if upper_in:
+            condition |= (a > upper_lim)
+        else:
+            condition |= (a >= upper_lim)
+
+    a[condition.filled(True)] = masked
+    return a
+
+
+def trimr(a, limits=None, inclusive=(True, True), axis=None):
+    """
+    Trims an array by masking some proportion of the data on each end.
+    Returns a masked version of the input array.
+
+    Parameters
+    ----------
+    a : sequence
+        Input array.
+    limits : {None, tuple}, optional
+        Tuple of the proportions to cut on each side of the array, with respect
+        to the number of unmasked data, as floats between 0. and 1.
+        Noting n the number of unmasked data before trimming, the
+        (n*limits[0])th smallest data and the (n*limits[1])th largest data are
+        masked, and the total number of unmasked data after trimming is
+        n*(1.-sum(limits)).  The value of one limit can be set to None to
+        indicate an open interval.
+    inclusive : {(True,True) tuple}, optional
+        Tuple of flags indicating whether the number of data being masked on
+        the left (right) end should be truncated (True) or rounded (False) to
+        integers.
+    axis : {None,int}, optional
+        Axis along which to trim. If None, the whole array is trimmed, but its
+        shape is maintained.
+
+    """
+    def _trimr1D(a, low_limit, up_limit, low_inclusive, up_inclusive):
+        n = a.count()  # number of unmasked values
+        idx = a.argsort()  # positions of the values in sorted order
+        if low_limit:  # None and 0 both leave the low end untouched
+            if low_inclusive:
+                lowidx = int(low_limit*n)  # truncate
+            else:
+                lowidx = int(np.round(low_limit*n))  # round to nearest
+            a[idx[:lowidx]] = masked
+        if up_limit is not None:
+            if up_inclusive:
+                upidx = n - int(n*up_limit)  # truncate
+            else:
+                upidx = n - int(np.round(n*up_limit))  # round to nearest
+            a[idx[upidx:]] = masked
+        return a
+
+    a = ma.asarray(a)
+    a.unshare_mask()  # give `a` its own mask before entries are masked
+    if limits is None:
+        return a
+
+    # Check the limits
+    (lolim, uplim) = limits
+    errmsg = "The proportion to cut from the %s should be between 0. and 1."
+    if lolim is not None:
+        if lolim > 1. or lolim < 0:
+            raise ValueError(errmsg % 'beginning' + f"(got {lolim})")
+    if uplim is not None:
+        if uplim > 1. or uplim < 0:
+            raise ValueError(errmsg % 'end' + f"(got {uplim})")
+
+    (loinc, upinc) = inclusive
+
+    if axis is None:
+        shp = a.shape  # trim over all values at once, then restore the shape
+        return _trimr1D(a.ravel(),lolim,uplim,loinc,upinc).reshape(shp)
+    else:
+        return ma.apply_along_axis(_trimr1D, axis, a, lolim,uplim,loinc,upinc)
+
+
+trimdoc = _dedent_for_py313("""
+    Parameters
+    ----------
+    a : sequence
+        Input array
+    limits : {None, tuple}, optional
+        If `relative` is False, tuple (lower limit, upper limit) in absolute values.
+        Values of the input array lower (greater) than the lower (upper) limit are
+        masked.
+
+        If `relative` is True, tuple (lower percentage, upper percentage) to cut
+        on each side of the  array, with respect to the number of unmasked data.
+
+        Noting n the number of unmasked data before trimming, the (n*limits[0])th
+        smallest data and the (n*limits[1])th largest data are masked, and the
+        total number of unmasked data after trimming is n*(1.-sum(limits))
+        In each case, the value of one limit can be set to None to indicate an
+        open interval.
+
+        If limits is None, no trimming is performed
+    inclusive : {(bool, bool) tuple}, optional
+        If `relative` is False, tuple indicating whether values exactly equal
+        to the absolute limits are allowed.
+        If `relative` is True, tuple indicating whether the number of data
+        being masked on each side should be rounded (True) or truncated
+        (False).
+    relative : bool, optional
+        Whether to consider the limits as absolute values (False) or proportions
+        to cut (True).
+    axis : int, optional
+        Axis along which to trim.""")
+
+
+def trim(a, limits=None, inclusive=(True,True), relative=False, axis=None):
+    """
+    Trims an array by masking the data outside some given limits.
+
+    Returns a masked version of the input array.
+
+    %s
+
+    Examples
+    --------
+    >>> from scipy.stats.mstats import trim
+    >>> z = [ 1, 2, 3, 4, 5, 6, 7, 8, 9,10]
+    >>> print(trim(z,(3,8)))
+    [-- -- 3 4 5 6 7 8 -- --]
+    >>> print(trim(z,(0.1,0.2),relative=True))
+    [-- 2 3 4 5 6 7 8 -- --]
+
+    """
+    if relative:  # limits are proportions of the unmasked data
+        return trimr(a, limits=limits, inclusive=inclusive, axis=axis)
+    else:  # limits are absolute values; `axis` is not used on this path
+        return trima(a, limits=limits, inclusive=inclusive)
+
+
+if trim.__doc__:  # skip when docstrings are unavailable (__doc__ is None)
+    trim.__doc__ = trim.__doc__ % trimdoc  # fill the %s placeholder above
+
+
+def trimboth(data, proportiontocut=0.2, inclusive=(True,True), axis=None):
+    """
+    Trims the smallest and largest data values.
+
+    Trims the `data` by masking the ``int(proportiontocut * n)`` smallest and
+    ``int(proportiontocut * n)`` largest values of data along the given axis,
+    where n is the number of unmasked values before trimming.
+
+    Parameters
+    ----------
+    data : ndarray
+        Data to trim.
+    proportiontocut : float, optional
+        Proportion of trimming (as a float between 0 and 1).
+        If n is the number of unmasked values before trimming, the number of
+        values after trimming is ``(1 - 2*proportiontocut) * n``.
+        Default is 0.2.
+    inclusive : {(bool, bool) tuple}, optional
+        Tuple indicating whether the number of data being masked on each side
+        should be truncated (True) or rounded (False).
+    axis : int, optional
+        Axis along which to perform the trimming.
+        If None, the whole array is trimmed, but its shape is maintained.
+
+    """
+    return trimr(data, limits=(proportiontocut,proportiontocut),
+                 inclusive=inclusive, axis=axis)
+
+
+def trimtail(data, proportiontocut=0.2, tail='left', inclusive=(True,True),
+             axis=None):
+    """
+    Trims the data by masking values from one tail.
+
+    Parameters
+    ----------
+    data : array_like
+        Data to trim.
+    proportiontocut : float, optional
+        Proportion of trimming. If n is the number of unmasked values
+        before trimming, the number of values after trimming is
+        ``(1 - proportiontocut) * n``.  Default is 0.2.
+    tail : {'left','right'}, optional
+        If 'left' the `proportiontocut` lowest values will be masked.
+        If 'right' the `proportiontocut` highest values will be masked.
+        Default is 'left'.
+    inclusive : {(bool, bool) tuple}, optional
+        Tuple indicating whether the number of data being masked on each side
+        should be truncated (True) or rounded (False).  Default is
+        (True, True).
+    axis : int, optional
+        Axis along which to perform the trimming.
+        If None, the whole array is trimmed, keeping its shape.  Default is None.
+
+    Returns
+    -------
+    trimtail : ndarray
+        Returned array of same shape as `data` with masked tail values.
+
+    """
+    tail = str(tail).lower()[0]  # only the first letter is inspected
+    if tail == 'l':
+        limits = (proportiontocut,None)
+    elif tail == 'r':
+        limits = (None, proportiontocut)
+    else:
+        raise TypeError("The tail argument should be in ('left','right')")
+
+    return trimr(data, limits=limits, axis=axis, inclusive=inclusive)
+
+
+trim1 = trimtail  # second public name bound to the same function
+
+
+def trimmed_mean(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
+                 axis=None):
+    """Returns the trimmed mean of the data along the given axis.
+
+    %s
+
+    """
+    if (not isinstance(limits,tuple)) and isinstance(limits,float):
+        limits = (limits, limits)  # a bare float is used for both sides
+    if relative:  # proportions to cut -> trimr; absolute limits -> trima
+        return trimr(a,limits=limits,inclusive=inclusive,axis=axis).mean(axis=axis)
+    else:
+        return trima(a,limits=limits,inclusive=inclusive).mean(axis=axis)
+
+
+if trimmed_mean.__doc__:  # skip when docstrings are unavailable
+    trimmed_mean.__doc__ = trimmed_mean.__doc__ % trimdoc  # fill the %s above
+
+
+def trimmed_var(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
+                axis=None, ddof=0):
+    """Returns the trimmed variance of the data along the given axis.
+
+    %s
+    ddof : {0,integer}, optional
+        Means Delta Degrees of Freedom. The denominator used during computations
+        is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un-
+        biased estimate of the variance.
+
+    """
+    if (not isinstance(limits,tuple)) and isinstance(limits,float):
+        limits = (limits, limits)  # a bare float is used for both sides
+    if relative:  # proportions to cut -> trimr; absolute limits -> trima
+        out = trimr(a,limits=limits, inclusive=inclusive,axis=axis)
+    else:
+        out = trima(a,limits=limits,inclusive=inclusive)
+
+    return out.var(axis=axis, ddof=ddof)  # variance of what is left unmasked
+
+
+if trimmed_var.__doc__:  # skip when docstrings are unavailable
+    trimmed_var.__doc__ = trimmed_var.__doc__ % trimdoc  # fill the %s above
+
+
+def trimmed_std(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
+                axis=None, ddof=0):
+    """Returns the trimmed standard deviation of the data along the given axis.
+
+    %s
+    ddof : {0,integer}, optional
+        Means Delta Degrees of Freedom. The denominator used during computations
+        is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un-
+        biased estimate of the variance.
+
+    """
+    if (not isinstance(limits,tuple)) and isinstance(limits,float):
+        limits = (limits, limits)  # a bare float is used for both sides
+    if relative:  # proportions to cut -> trimr; absolute limits -> trima
+        out = trimr(a,limits=limits,inclusive=inclusive,axis=axis)
+    else:
+        out = trima(a,limits=limits,inclusive=inclusive)
+    return out.std(axis=axis,ddof=ddof)  # std of what is left unmasked
+
+
+if trimmed_std.__doc__:  # skip when docstrings are unavailable
+    trimmed_std.__doc__ = trimmed_std.__doc__ % trimdoc  # fill the %s above
+
+
+def trimmed_stde(a, limits=(0.1,0.1), inclusive=(1,1), axis=None):
+    """
+    Returns the standard error of the trimmed mean along the given axis.
+
+    Parameters
+    ----------
+    a : sequence
+        Input array
+    limits : {(0.1,0.1), tuple of float}, optional
+        tuple (lower percentage, upper percentage) to cut  on each side of the
+        array, with respect to the number of unmasked data.
+
+        If n is the number of unmasked data before trimming, the values
+        smaller than ``n * limits[0]`` and the values larger than
+        ``n * `limits[1]`` are masked, and the total number of unmasked
+        data after trimming is ``n * (1.-sum(limits))``.  In each case,
+        the value of one limit can be set to None to indicate an open interval.
+        If `limits` is None, no trimming is performed.
+    inclusive : {(bool, bool) tuple} optional
+        Tuple indicating whether the number of data being masked on each side
+        should be rounded (True) or truncated (False).
+    axis : int, optional
+        Axis along which to trim.
+
+    Returns
+    -------
+    trimmed_stde : scalar or ndarray
+
+    """
+    def _trimmed_stde_1D(a, low_limit, up_limit, low_inclusive, up_inclusive):
+        "Returns the standard error of the trimmed mean for a 1D input data."
+        n = a.count()
+        idx = a.argsort()
+        if low_limit:
+            if low_inclusive:
+                lowidx = int(low_limit*n)
+            else:
+                lowidx = np.round(low_limit*n)
+            a[idx[:lowidx]] = masked
+        if up_limit is not None:
+            if up_inclusive:
+                upidx = n - int(n*up_limit)
+            else:
+                upidx = n - np.round(n*up_limit)
+            a[idx[upidx:]] = masked
+        a[idx[:lowidx]] = a[idx[lowidx]]
+        a[idx[upidx:]] = a[idx[upidx-1]]
+        winstd = a.std(ddof=1)
+        return winstd / ((1-low_limit-up_limit)*np.sqrt(len(a)))
+
+    a = ma.array(a, copy=True, subok=True)
+    a.unshare_mask()
+    if limits is None:
+        return a.std(axis=axis,ddof=1)/ma.sqrt(a.count(axis))
+    if (not isinstance(limits,tuple)) and isinstance(limits,float):
+        limits = (limits, limits)
+
+    # Check the limits
+    (lolim, uplim) = limits
+    errmsg = "The proportion to cut from the %s should be between 0. and 1."
+    if lolim is not None:
+        if lolim > 1. or lolim < 0:
+            raise ValueError(errmsg % 'beginning' + f"(got {lolim})")
+    if uplim is not None:
+        if uplim > 1. or uplim < 0:
+            raise ValueError(errmsg % 'end' + f"(got {uplim})")
+
+    (loinc, upinc) = inclusive
+    if (axis is None):
+        return _trimmed_stde_1D(a.ravel(),lolim,uplim,loinc,upinc)
+    else:
+        if a.ndim > 2:
+            raise ValueError(f"Array 'a' must be at most two dimensional, "
+                             f"but got a.ndim = {a.ndim}")
+        return ma.apply_along_axis(_trimmed_stde_1D, axis, a,
+                                   lolim,uplim,loinc,upinc)
+
+
+def _mask_to_limits(a, limits, inclusive):
+    """Mask an array for values outside of given limits.
+
+    This is primarily a utility function.
+
+    Parameters
+    ----------
+    a : array
+    limits : (float or None, float or None)
+        A tuple consisting of the (lower limit, upper limit).  Values in the
+        input array less than the lower limit or greater than the upper limit
+        will be masked out. None implies no limit.
+    inclusive : (bool, bool)
+        A tuple consisting of the (lower flag, upper flag).  These flags
+        determine whether values exactly equal to lower or upper are allowed.
+
+    Returns
+    -------
+    A MaskedArray.
+
+    Raises
+    ------
+    A ValueError if there are no values within the given limits.
+    """
+    lower_limit, upper_limit = limits
+    lower_include, upper_include = inclusive
+    am = ma.MaskedArray(a)
+    if lower_limit is not None:
+        if lower_include:
+            am = ma.masked_less(am, lower_limit)  # the limit itself is kept
+        else:
+            am = ma.masked_less_equal(am, lower_limit)  # the limit is masked too
+
+    if upper_limit is not None:
+        if upper_include:
+            am = ma.masked_greater(am, upper_limit)  # the limit itself is kept
+        else:
+            am = ma.masked_greater_equal(am, upper_limit)  # the limit is masked too
+
+    if am.count() == 0:  # every value ended up masked
+        raise ValueError("No array values within given limits")
+
+    return am
+
+
+def tmean(a, limits=None, inclusive=(True, True), axis=None):
+    """
+    Compute the trimmed mean.
+
+    Parameters
+    ----------
+    a : array_like
+        Array of values.
+    limits : None or (lower limit, upper limit), optional
+        Values in the input array less than the lower limit or greater than the
+        upper limit will be ignored.  When limits is None (default), then all
+        values are used.  Either of the limit values in the tuple can also be
+        None representing a half-open interval.
+    inclusive : (bool, bool), optional
+        A tuple consisting of the (lower flag, upper flag).  These flags
+        determine whether values exactly equal to the lower or upper limits
+        are included.  The default value is (True, True).
+    axis : int or None, optional
+        Axis along which to operate. If None, compute over the
+        whole array. Default is None.
+
+    Returns
+    -------
+    tmean : float or MaskedArray
+
+    Notes
+    -----
+    For more details on `tmean`, see `scipy.stats.tmean`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import mstats
+    >>> a = np.array([[6, 8, 3, 0],
+    ...               [3, 9, 1, 2],
+    ...               [8, 7, 8, 2],
+    ...               [5, 6, 0, 2],
+    ...               [4, 5, 5, 2]])
+    ...
+    ...
+    >>> mstats.tmean(a, (2,5))
+    3.3
+    >>> mstats.tmean(a, (2,5), axis=0)
+    masked_array(data=[4.0, 5.0, 4.0, 2.0],
+                 mask=[False, False, False, False],
+           fill_value=1e+20)
+
+    """
+    return trima(a, limits=limits, inclusive=inclusive).mean(axis=axis)
+
+
+def tvar(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
+    """
+    Compute the trimmed variance
+
+    This function computes the sample variance of an array of values,
+    while ignoring values which are outside of given `limits`.
+
+    Parameters
+    ----------
+    a : array_like
+        Array of values.
+    limits : None or (lower limit, upper limit), optional
+        Values in the input array less than the lower limit or greater than the
+        upper limit will be ignored. When limits is None, then all values are
+        used. Either of the limit values in the tuple can also be None
+        representing a half-open interval.  The default value is None.
+    inclusive : (bool, bool), optional
+        A tuple consisting of the (lower flag, upper flag).  These flags
+        determine whether values exactly equal to the lower or upper limits
+        are included.  The default value is (True, True).
+    axis : int or None, optional
+        Axis along which to operate. If None, compute over the
+        whole array. Default is zero.
+    ddof : int, optional
+        Delta degrees of freedom. Default is 1.
+
+    Returns
+    -------
+    tvar : float
+        Trimmed variance.
+
+    Notes
+    -----
+    For more details on `tvar`, see `scipy.stats.tvar`.
+
+    """
+    a = a.astype(float).ravel()
+    if limits is None:
+        n = (~a.mask).sum()  # todo: better way to do that?
+        return np.ma.var(a) * n/(n-1.)
+    am = _mask_to_limits(a, limits=limits, inclusive=inclusive)
+
+    return np.ma.var(am, axis=axis, ddof=ddof)
+
+
+def tmin(a, lowerlimit=None, axis=0, inclusive=True):
+    """
+    Compute the trimmed minimum
+
+    Parameters
+    ----------
+    a : array_like
+        array of values
+    lowerlimit : None or float, optional
+        Values in the input array less than the given limit will be ignored.
+        When lowerlimit is None, then all values are used. The default value
+        is None.
+    axis : int or None, optional
+        Axis along which to operate. Default is 0. If None, compute over the
+        whole array `a`.
+    inclusive : {True, False}, optional
+        This flag determines whether values exactly equal to the lower limit
+        are included.  The default value is True.
+
+    Returns
+    -------
+    tmin : float, int or ndarray
+
+    Notes
+    -----
+    For more details on `tmin`, see `scipy.stats.tmin`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import mstats
+    >>> a = np.array([[6, 8, 3, 0],
+    ...               [3, 2, 1, 2],
+    ...               [8, 1, 8, 2],
+    ...               [5, 3, 0, 2],
+    ...               [4, 7, 5, 2]])
+    ...
+    >>> mstats.tmin(a, 5)
+    masked_array(data=[5, 7, 5, --],
+                 mask=[False, False, False,  True],
+           fill_value=999999)
+
+    """
+    a, axis = _chk_asarray(a, axis)
+    am = trima(a, (lowerlimit, None), (inclusive, False))  # no upper limit
+    return ma.minimum.reduce(am, axis)
+
+
+def tmax(a, upperlimit=None, axis=0, inclusive=True):
+    """
+    Compute the trimmed maximum
+
+    This function computes the maximum value of an array along a given axis,
+    while ignoring values larger than a specified upper limit.
+
+    Parameters
+    ----------
+    a : array_like
+        array of values
+    upperlimit : None or float, optional
+        Values in the input array greater than the given limit will be ignored.
+        When upperlimit is None, then all values are used. The default value
+        is None.
+    axis : int or None, optional
+        Axis along which to operate. Default is 0. If None, compute over the
+        whole array `a`.
+    inclusive : {True, False}, optional
+        This flag determines whether values exactly equal to the upper limit
+        are included.  The default value is True.
+
+    Returns
+    -------
+    tmax : float, int or ndarray
+
+    Notes
+    -----
+    For more details on `tmax`, see `scipy.stats.tmax`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import mstats
+    >>> a = np.array([[6, 8, 3, 0],
+    ...               [3, 9, 1, 2],
+    ...               [8, 7, 8, 2],
+    ...               [5, 6, 0, 2],
+    ...               [4, 5, 5, 2]])
+    ...
+    ...
+    >>> mstats.tmax(a, 4)
+    masked_array(data=[4, --, 3, 2],
+                 mask=[False,  True, False, False],
+           fill_value=999999)
+
+    """
+    a, axis = _chk_asarray(a, axis)
+    am = trima(a, (None, upperlimit), (False, inclusive))  # no lower limit
+    return ma.maximum.reduce(am, axis)
+
+
+def tsem(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
+    """
+    Compute the trimmed standard error of the mean.
+
+    This function finds the standard error of the mean for given
+    values, ignoring values outside the given `limits`.
+
+    Parameters
+    ----------
+    a : array_like
+        array of values
+    limits : None or (lower limit, upper limit), optional
+        Values in the input array less than the lower limit or greater than the
+        upper limit will be ignored. When limits is None, then all values are
+        used. Either of the limit values in the tuple can also be None
+        representing a half-open interval.  The default value is None.
+    inclusive : (bool, bool), optional
+        A tuple consisting of the (lower flag, upper flag).  These flags
+        determine whether values exactly equal to the lower or upper limits
+        are included.  The default value is (True, True).
+    axis : int or None, optional
+        Axis along which to operate. If None, compute over the
+        whole array. Default is zero.
+    ddof : int, optional
+        Delta degrees of freedom. Default is 1.
+
+    Returns
+    -------
+    tsem : float
+
+    Notes
+    -----
+    For more details on `tsem`, see `scipy.stats.tsem`.
+
+    """
+    a = ma.asarray(a).ravel()  # the input is flattened before anything else
+    if limits is None:
+        n = float(a.count())  # number of unmasked values
+        return a.std(axis=axis, ddof=ddof)/ma.sqrt(n)
+
+    am = trima(a.ravel(), limits, inclusive)
+    sd = np.sqrt(am.var(axis=axis, ddof=ddof))
+    return sd / np.sqrt(am.count())  # count of values left after trimming
+
+
+def winsorize(a, limits=None, inclusive=(True, True), inplace=False,
+              axis=None, nan_policy='propagate'):
+    """Returns a Winsorized version of the input array.
+
+    The (limits[0])th lowest values are set to the (limits[0])th percentile,
+    and the (limits[1])th highest values are set to the (1 - limits[1])th
+    percentile.
+    Masked values are skipped.
+
+
+    Parameters
+    ----------
+    a : sequence
+        Input array.
+    limits : {None, tuple of float}, optional
+        Tuple of the percentages to cut on each side of the array, with respect
+        to the number of unmasked data, as floats between 0. and 1.
+        Noting n the number of unmasked data before trimming, the
+        (n*limits[0])th smallest data and the (n*limits[1])th largest data are
+        masked, and the total number of unmasked data after trimming
+        is n*(1.-sum(limits)) The value of one limit can be set to None to
+        indicate an open interval.
+    inclusive : {(True, True) tuple}, optional
+        Tuple indicating whether the number of data being masked on each side
+        should be truncated (True) or rounded (False).
+    inplace : {False, True}, optional
+        Whether to winsorize in place (True) or to use a copy (False)
+    axis : {None, int}, optional
+        Axis along which to trim. If None, the whole array is trimmed, but its
+        shape is maintained.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan.
+        The following options are available (default is 'propagate'):
+
+          * 'propagate': allows nan values and may overwrite or propagate them
+          * 'raise': throws an error
+          * 'omit': performs the calculations ignoring nan values
+
+    Notes
+    -----
+    This function is applied to reduce the effect of possibly spurious outliers
+    by limiting the extreme values.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats.mstats import winsorize
+
+    A shuffled array contains integers from 1 to 10.
+
+    >>> a = np.array([10, 4, 9, 8, 5, 3, 7, 2, 1, 6])
+
+    The 10% of the lowest value (i.e., ``1``) and the 20% of the highest
+    values (i.e., ``9`` and ``10``) are replaced.
+
+    >>> winsorize(a, limits=[0.1, 0.2])
+    masked_array(data=[8, 4, 8, 8, 5, 3, 7, 2, 2, 6],
+                 mask=False,
+           fill_value=999999)
+
+    """
+    def _winsorize1D(a, low_limit, up_limit, low_include, up_include,
+                     contains_nan, nan_policy):
+        n = a.count()
+        idx = a.argsort()
+        if contains_nan:
+            nan_count = np.count_nonzero(np.isnan(a))
+        if low_limit:
+            if low_include:
+                lowidx = int(low_limit * n)
+            else:
+                lowidx = np.round(low_limit * n).astype(int)
+            if contains_nan and nan_policy == 'omit':
+                lowidx = min(lowidx, n-nan_count-1)
+            a[idx[:lowidx]] = a[idx[lowidx]]
+        if up_limit is not None:
+            if up_include:
+                upidx = n - int(n * up_limit)
+            else:
+                upidx = n - np.round(n * up_limit).astype(int)
+            if contains_nan and nan_policy == 'omit':
+                a[idx[upidx:-nan_count]] = a[idx[upidx - 1]]
+            else:
+                a[idx[upidx:]] = a[idx[upidx - 1]]
+        return a
+
+    contains_nan = _contains_nan(a, nan_policy)
+    # We are going to modify a: better make a copy
+    a = ma.array(a, copy=np.logical_not(inplace))
+
+    if limits is None:
+        return a
+    if (not isinstance(limits, tuple)) and isinstance(limits, float):
+        limits = (limits, limits)
+
+    # Check the limits
+    (lolim, uplim) = limits
+    errmsg = "The proportion to cut from the %s should be between 0. and 1."
+    if lolim is not None:
+        if lolim > 1. or lolim < 0:
+            raise ValueError(errmsg % 'beginning' + f"(got {lolim})")
+    if uplim is not None:
+        if uplim > 1. or uplim < 0:
+            raise ValueError(errmsg % 'end' + f"(got {uplim})")
+
+    (loinc, upinc) = inclusive
+
+    if axis is None:
+        shp = a.shape
+        return _winsorize1D(a.ravel(), lolim, uplim, loinc, upinc,
+                            contains_nan, nan_policy).reshape(shp)
+    else:
+        return ma.apply_along_axis(_winsorize1D, axis, a, lolim, uplim, loinc,
+                                   upinc, contains_nan, nan_policy)
+
+
+def moment(a, moment=1, axis=0):
+    """
+    Calculates the nth moment about the mean for a sample.
+
+    Parameters
+    ----------
+    a : array_like
+       data
+    moment : int or sequence of int, optional
+       order of central moment that is returned
+    axis : int or None, optional
+       Axis along which the central moment is computed. Default is 0.
+       If None, compute over the whole array `a`.
+
+    Returns
+    -------
+    n-th central moment : ndarray or float
+       The appropriate moment along the given axis or over all values if axis
+       is None. The denominator for the moment calculation is the number of
+       observations, no degrees of freedom correction is done.
+
+    Notes
+    -----
+    For more details about `moment`, see `scipy.stats.moment`.
+
+    """
+    a, axis = _chk_asarray(a, axis)
+    if a.size == 0:
+        moment_shape = list(a.shape)
+        del moment_shape[axis]  # shape of the result: `a` without `axis`
+        dtype = a.dtype.type if a.dtype.kind in 'fc' else np.float64
+        # empty array, return nan(s) with shape matching `moment`
+        out_shape = (moment_shape if np.isscalar(moment)
+                     else [len(moment)] + moment_shape)
+        if len(out_shape) == 0:
+            return dtype(np.nan)
+        else:
+            return ma.array(np.full(out_shape, np.nan, dtype=dtype))
+
+    # for array_like moment input, return a value for each.
+    if not np.isscalar(moment):
+        mean = a.mean(axis, keepdims=True)  # computed once, shared by all orders
+        mmnt = [_moment(a, i, axis, mean=mean) for i in moment]
+        return ma.array(mmnt)
+    else:
+        return _moment(a, moment, axis)
+
+
+# Moment with optional pre-computed mean, equal to a.mean(axis, keepdims=True)
+def _moment(a, moment, axis, *, mean=None):
+    if np.abs(moment - np.round(moment)) > 0:  # reject non-integer orders
+        raise ValueError("All moment parameters must be integers")
+
+    if moment == 0 or moment == 1:
+        # By definition the zeroth moment about the mean is 1, and the first
+        # moment is 0.
+        shape = list(a.shape)
+        del shape[axis]  # shape of the result: `a` without `axis`
+        dtype = a.dtype.type if a.dtype.kind in 'fc' else np.float64
+
+        if len(shape) == 0:
+            return dtype(1.0 if moment == 0 else 0.0)
+        else:
+            return (ma.ones(shape, dtype=dtype) if moment == 0
+                    else ma.zeros(shape, dtype=dtype))
+    else:
+        # Exponentiation by squares: form exponent sequence
+        n_list = [moment]
+        current_n = moment
+        while current_n > 2:  # halve (dropping the remainder) down to 1 or 2
+            if current_n % 2:
+                current_n = (current_n-1)/2
+            else:
+                current_n /= 2
+            n_list.append(current_n)
+
+        # Starting point for exponentiation by squares
+        mean = a.mean(axis, keepdims=True) if mean is None else mean
+        a_zero_mean = a - mean
+        if n_list[-1] == 1:
+            s = a_zero_mean.copy()
+        else:
+            s = a_zero_mean**2
+
+        # Perform multiplications
+        for n in n_list[-2::-1]:  # walk the exponents back up to `moment`
+            s = s**2
+            if n % 2:  # odd exponent needs one extra factor
+                s *= a_zero_mean
+        return s.mean(axis)
+
+
+def variation(a, axis=0, ddof=0):
+    """
+    Compute the coefficient of variation.
+
+    The coefficient of variation is the standard deviation divided by the
+    mean.  This function is equivalent to::
+
+        np.std(a, axis=axis, ddof=ddof) / np.mean(a, axis=axis)
+
+    The default for ``ddof`` is 0, but many definitions of the coefficient
+    of variation use the square root of the unbiased sample variance
+    for the sample standard deviation, which corresponds to ``ddof=1``.
+
+    Parameters
+    ----------
+    a : array_like
+        Input array.
+    axis : int or None, optional
+        Axis along which to calculate the coefficient of variation. Default
+        is 0. If None, compute over the whole array `a`.
+    ddof : int, optional
+        Delta degrees of freedom.  Default is 0.
+
+    Returns
+    -------
+    variation : ndarray
+        The calculated variation along the requested axis.
+
+    Notes
+    -----
+    For more details about `variation`, see `scipy.stats.variation`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats.mstats import variation
+    >>> a = np.array([2,8,4])
+    >>> variation(a)
+    0.5345224838248487
+    >>> b = np.array([2,8,3,4])
+    >>> c = np.ma.masked_array(b, mask=[0,0,1,0])
+    >>> variation(c)
+    0.5345224838248487
+
+    In the example above, it can be seen that this works the same as
+    `scipy.stats.variation` except 'stats.mstats.variation' ignores masked
+    array elements.
+
+    """
+    a, axis = _chk_asarray(a, axis)
+    return a.std(axis, ddof=ddof)/a.mean(axis)
+
+
+def skew(a, axis=0, bias=True):
+    """
+    Computes the skewness of a data set.
+
+    Parameters
+    ----------
+    a : ndarray
+        Input data.
+    axis : int or None, optional
+        Axis along which skewness is calculated. Default is 0.
+        If None, compute over the whole array `a`.
+    bias : bool, optional
+        If False, then the calculations are corrected for statistical bias.
+
+    Returns
+    -------
+    skewness : ndarray
+        The skewness of values along an axis, returning 0 where all values are
+        equal.
+
+    Notes
+    -----
+    For more details about `skew`, see `scipy.stats.skew`.
+    The bias correction is only applied where more than two values are counted.
+    """
+    a, axis = _chk_asarray(a,axis)
+    mean = a.mean(axis, keepdims=True)
+    m2 = _moment(a, 2, axis, mean=mean)  # second central moment
+    m3 = _moment(a, 3, axis, mean=mean)  # third central moment
+    zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2)
+    with np.errstate(all='ignore'):  # m2 may be 0; silence floating-point warnings
+        vals = ma.where(zero, 0, m3 / m2**1.5)  # 0 where the variance is negligible
+
+    if not bias and zero is not ma.masked and m2 is not ma.masked:
+        n = a.count(axis)
+        can_correct = ~zero & (n > 2)  # the correction divides by (n - 2)
+        if can_correct.any():
+            n = np.extract(can_correct, n)
+            m2 = np.extract(can_correct, m2)
+            m3 = np.extract(can_correct, m3)
+            nval = ma.sqrt((n-1.0)*n)/(n-2.0)*m3/m2**1.5  # bias-corrected estimate
+            np.place(vals, can_correct, nval)  # overwrite only the correctable entries
+    return vals
+
+
+def kurtosis(a, axis=0, fisher=True, bias=True):
+    """
+    Computes the kurtosis (Fisher or Pearson) of a dataset.
+
+    Kurtosis is the fourth central moment divided by the square of the
+    variance. If Fisher's definition is used, then 3.0 is subtracted from
+    the result to give 0.0 for a normal distribution.
+
+    If bias is False then the kurtosis is calculated using k statistics to
+    eliminate bias coming from biased moment estimators.
+
+    Use `kurtosistest` to see if result is close enough to normal.
+
+    Parameters
+    ----------
+    a : array
+        data for which the kurtosis is calculated
+    axis : int or None, optional
+        Axis along which the kurtosis is calculated. Default is 0.
+        If None, compute over the whole array `a`.
+    fisher : bool, optional
+        If True, Fisher's definition is used (normal ==> 0.0). If False,
+        Pearson's definition is used (normal ==> 3.0).
+    bias : bool, optional
+        If False, then the calculations are corrected for statistical bias.
+
+    Returns
+    -------
+    kurtosis : array
+        The kurtosis of values along an axis. If all values are equal,
+        return -3 for Fisher's definition and 0 for Pearson's definition.
+
+    Notes
+    -----
+    For more details about `kurtosis`, see `scipy.stats.kurtosis`.
+    The bias correction is only applied where more than three values are counted.
+    """
+    a, axis = _chk_asarray(a, axis)
+    mean = a.mean(axis, keepdims=True)
+    m2 = _moment(a, 2, axis, mean=mean)  # second central moment
+    m4 = _moment(a, 4, axis, mean=mean)  # fourth central moment
+    zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2)
+    with np.errstate(all='ignore'):  # m2 may be 0; silence floating-point warnings
+        vals = ma.where(zero, 0, m4 / m2**2.0)  # 0 where the variance is negligible
+
+    if not bias and zero is not ma.masked and m2 is not ma.masked:
+        n = a.count(axis)
+        can_correct = ~zero & (n > 3)  # the correction divides by (n-2)*(n-3)
+        if can_correct.any():
+            n = np.extract(can_correct, n)
+            m2 = np.extract(can_correct, m2)
+            m4 = np.extract(can_correct, m4)
+            nval = 1.0/(n-2)/(n-3)*((n*n-1.0)*m4/m2**2.0-3*(n-1)**2.0)
+            np.place(vals, can_correct, nval+3.0)  # +3.0 keeps vals on the Pearson scale
+    if fisher:
+        return vals - 3
+    else:
+        return vals
+
+
+DescribeResult = namedtuple('DescribeResult', ('nobs', 'minmax', 'mean',
+                                               'variance', 'skewness',
+                                               'kurtosis'))
+
+
+def describe(a, axis=0, ddof=0, bias=True):
+    """
+    Compute several descriptive statistics of the passed array.
+
+    The results are returned together as a `DescribeResult` named tuple,
+    in the field order listed under Returns.
+
+    Parameters
+    ----------
+    a : array_like
+        Data array
+    axis : int or None, optional
+        Axis along which to calculate statistics. Default 0. If None,
+        compute over the whole array `a`.
+    ddof : int, optional
+        Delta degrees of freedom used for the variance (default 0); note
+        that this default differs from the one in `scipy.stats.describe`.
+    bias : bool, optional
+        If False, then the skewness and kurtosis calculations are corrected for
+        statistical bias.
+
+    Returns
+    -------
+    nobs : int
+        Size of the data (discarding missing values).
+    minmax : (int, int)
+        Minimum and maximum.
+    mean : float
+        Arithmetic mean.
+    variance : float
+        Variance computed with the given `ddof`.
+    skewness : float
+        Skewness, bias-corrected only when `bias` is False.
+    kurtosis : float
+        Fisher kurtosis, bias-corrected only when `bias` is False.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats.mstats import describe
+    >>> ma = np.ma.array(range(6), mask=[0, 0, 0, 1, 1, 1])
+    >>> describe(ma)
+    DescribeResult(nobs=np.int64(3), minmax=(masked_array(data=0,
+                 mask=False,
+           fill_value=999999), masked_array(data=2,
+                 mask=False,
+           fill_value=999999)), mean=np.float64(1.0),
+           variance=np.float64(0.6666666666666666),
+           skewness=masked_array(data=0., mask=False, fill_value=1e+20),
+            kurtosis=np.float64(-1.5))
+
+    """
+    a, axis = _chk_asarray(a, axis)
+    lowest = ma.minimum.reduce(a, axis=axis)
+    highest = ma.maximum.reduce(a, axis=axis)
+    return DescribeResult(
+        nobs=a.count(axis),
+        minmax=(lowest, highest),
+        mean=a.mean(axis),
+        variance=a.var(axis, ddof=ddof),
+        skewness=skew(a, axis, bias=bias),
+        kurtosis=kurtosis(a, axis, bias=bias),
+    )
+
+
+def stde_median(data, axis=None):
+    """Returns the McKean-Schrader estimate of the standard error of the sample
+    median along the given axis. Masked values are discarded.
+
+    Parameters
+    ----------
+    data : ndarray
+        Input data; at most two-dimensional when `axis` is given.
+    axis : {None,int}, optional
+        Axis along which to compute the standard error.
+        If None, the input array is first flattened.
+
+    """
+    def _stdemed_1D(data):
+        data = np.sort(data.compressed())  # drop masked values, then sort
+        n = len(data)
+        z = 2.5758293035489004  # NOTE(review): looks like norm.ppf(0.995) — confirm
+        k = int(np.round((n+1)/2. - z * np.sqrt(n/4.),0))  # NOTE(review): k can round to 0 for small n — confirm
+        return ((data[n-k] - data[k-1])/(2.*z))
+
+    data = ma.array(data, copy=False, subok=True)
+    if (axis is None):
+        return _stdemed_1D(data)
+    else:
+        if data.ndim > 2:
+            raise ValueError(f"Array 'data' must be at most two dimensional, "
+                             f"but got data.ndim = {data.ndim}")
+        return ma.apply_along_axis(_stdemed_1D, axis, data)  # one estimate per 1-D slice
+
+
+SkewtestResult = namedtuple('SkewtestResult', ('statistic', 'pvalue'))
+
+
+def skewtest(a, axis=0, alternative='two-sided'):
+    """
+    Tests whether the skew is different from the normal distribution.
+
+    Parameters
+    ----------
+    a : array_like
+        The data to be tested
+    axis : int or None, optional
+       Axis along which statistics are calculated. Default is 0.
+       If None, compute over the whole array `a`.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis. Default is 'two-sided'.
+        The following options are available:
+
+        * 'two-sided': the skewness of the distribution underlying the sample
+          is different from that of the normal distribution (i.e. 0)
+        * 'less': the skewness of the distribution underlying the sample
+          is less than that of the normal distribution
+        * 'greater': the skewness of the distribution underlying the sample
+          is greater than that of the normal distribution
+
+        .. versionadded:: 1.7.0
+
+    Returns
+    -------
+    statistic : array_like
+        The computed z-score for this test.
+    pvalue : array_like
+        A p-value for the hypothesis test
+
+    Notes
+    -----
+    For more details about `skewtest`, see `scipy.stats.skewtest`.
+    A ValueError is raised when fewer than 8 values are counted along `axis`.
+    """
+    a, axis = _chk_asarray(a, axis)
+    if axis is None:
+        a = a.ravel()
+        axis = 0
+    b2 = skew(a,axis)
+    n = a.count(axis)
+    if np.min(n) < 8:  # the smallest per-slice count decides
+        raise ValueError(f"skewtest is not valid with less than 8 samples; "
+                         f"{np.min(n)} samples were given.")
+
+    y = b2 * ma.sqrt(((n+1)*(n+3)) / (6.0*(n-2)))
+    beta2 = (3.0*(n*n+27*n-70)*(n+1)*(n+3)) / ((n-2.0)*(n+5)*(n+7)*(n+9))
+    W2 = -1 + ma.sqrt(2*(beta2-1))
+    delta = 1/ma.sqrt(0.5*ma.log(W2))
+    alpha = ma.sqrt(2.0/(W2-1))
+    y = ma.where(y == 0, 1, y)  # NOTE(review): exact-zero y is replaced by 1 — confirm intent
+    Z = delta*ma.log(y/alpha + ma.sqrt((y/alpha)**2+1))
+    pvalue = scipy.stats._stats_py._get_pvalue(Z, distributions.norm, alternative)
+
+    return SkewtestResult(Z[()], pvalue[()])  # [()] turns 0-d arrays into scalars
+
+
+KurtosistestResult = namedtuple('KurtosistestResult', ('statistic', 'pvalue'))
+
+
+def kurtosistest(a, axis=0, alternative='two-sided'):
+    """
+    Tests whether a dataset has normal kurtosis
+
+    Parameters
+    ----------
+    a : array_like
+        array of the sample data
+    axis : int or None, optional
+       Axis along which to compute test. Default is 0. If None,
+       compute over the whole array `a`.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis.
+        The following options are available (default is 'two-sided'):
+
+        * 'two-sided': the kurtosis of the distribution underlying the sample
+          is different from that of the normal distribution
+        * 'less': the kurtosis of the distribution underlying the sample
+          is less than that of the normal distribution
+        * 'greater': the kurtosis of the distribution underlying the sample
+          is greater than that of the normal distribution
+
+        .. versionadded:: 1.7.0
+
+    Returns
+    -------
+    statistic : array_like
+        The computed z-score for this test.
+    pvalue : array_like
+        The p-value for the hypothesis test
+
+    Notes
+    -----
+    For more details about `kurtosistest`, see `scipy.stats.kurtosistest`.
+    Raises ValueError below 5 counted observations and warns below 20.
+    """
+    a, axis = _chk_asarray(a, axis)
+    n = a.count(axis=axis)
+    if np.min(n) < 5:
+        raise ValueError(f"kurtosistest requires at least 5 observations; "
+                         f"{np.min(n)} observations were given.")
+    if np.min(n) < 20:  # still computed, but flagged to the caller
+        warnings.warn(f"kurtosistest only valid for n>=20 ... continuing "
+                      f"anyway, n={np.min(n)}", stacklevel=2)
+
+    b2 = kurtosis(a, axis, fisher=False)  # Pearson kurtosis
+    E = 3.0*(n-1) / (n+1)
+    varb2 = 24.0*n*(n-2.)*(n-3) / ((n+1)*(n+1.)*(n+3)*(n+5))
+    x = (b2-E)/ma.sqrt(varb2)  # b2 standardised by E and varb2
+    sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5)) /
+                                                        (n*(n-2)*(n-3)))
+    A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2)))
+    term1 = 1 - 2./(9.0*A)
+    denom = 1 + x*ma.sqrt(2/(A-4.0))
+    if np.ma.isMaskedArray(denom):
+        # For multi-dimensional array input
+        denom[denom == 0.0] = masked  # mask zeros so they are not divided by below
+    elif denom == 0.0:
+        denom = masked
+
+    term2 = np.ma.where(denom > 0, ma.power((1-2.0/A)/denom, 1/3.0),
+                        -ma.power(-(1-2.0/A)/denom, 1/3.0))  # real cube root for either sign of denom
+    Z = (term1 - term2) / np.sqrt(2/(9.0*A))
+    pvalue = scipy.stats._stats_py._get_pvalue(Z, distributions.norm, alternative)
+
+    return KurtosistestResult(Z[()], pvalue[()])  # [()] turns 0-d arrays into scalars
+
+
+NormaltestResult = namedtuple('NormaltestResult', ('statistic', 'pvalue'))
+
+
+def normaltest(a, axis=0):
+    """
+    Test whether a sample departs from a normal distribution.
+
+    Parameters
+    ----------
+    a : array_like
+        The array containing the data to be tested.
+    axis : int or None, optional
+        Axis along which to compute the test. Default is 0. If None,
+        compute over the whole array `a`.
+
+    Returns
+    -------
+    statistic : float or array
+        ``s^2 + k^2``, the sum of the squared z-scores returned by
+        `skewtest` (``s``) and `kurtosistest` (``k``).
+    pvalue : float or array
+        Survival function of a chi-squared distribution (2 dof) at `statistic`.
+
+    Notes
+    -----
+    For more details about `normaltest`, see `scipy.stats.normaltest`.
+
+    """
+    a, axis = _chk_asarray(a, axis)
+    z_skew = skewtest(a, axis)[0]
+    z_kurt = kurtosistest(a, axis)[0]
+    statistic = z_skew*z_skew + z_kurt*z_kurt
+    pvalue = distributions.chi2.sf(statistic, 2)
+    return NormaltestResult(statistic, pvalue)
+
+
+def mquantiles(a, prob=(.25, .5, .75), alphap=.4, betap=.4, axis=None,
+               limit=()):
+    """
+    Computes empirical quantiles for a data array.
+
+    Sample quantiles are defined by ``Q(p) = (1-gamma)*x[j] + gamma*x[j+1]``,
+    where ``x[j]`` is the j-th order statistic, and gamma is a function of
+    ``j = floor(n*p + m)``, ``m = alphap + p*(1 - alphap - betap)`` and
+    ``g = n*p + m - j``.
+
+    Reinterpreting the above equations to compare to **R** leads to the
+    equation: ``p(k) = (k - alphap)/(n + 1 - alphap - betap)``
+
+    Typical values of (alphap,betap) are:
+        - (0,1)    : ``p(k) = k/n`` : linear interpolation of cdf
+          (**R** type 4)
+        - (.5,.5)  : ``p(k) = (k - 1/2.)/n`` : piecewise linear function
+          (**R** type 5)
+        - (0,0)    : ``p(k) = k/(n+1)`` :
+          (**R** type 6)
+        - (1,1)    : ``p(k) = (k-1)/(n-1)``: p(k) = mode[F(x[k])].
+          (**R** type 7, **R** default)
+        - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``: Then p(k) ~ median[F(x[k])].
+          The resulting quantile estimates are approximately median-unbiased
+          regardless of the distribution of x.
+          (**R** type 8)
+        - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``: Blom.
+          The resulting quantile estimates are approximately unbiased
+          if x is normally distributed
+          (**R** type 9)
+        - (.4,.4)  : approximately quantile unbiased (Cunnane)
+        - (.35,.35): APL, used with PWM
+
+    Parameters
+    ----------
+    a : array_like
+        Input data, as a sequence or array of dimension at most 2.
+    prob : array_like, optional
+        List of quantiles to compute.
+    alphap : float, optional
+        Plotting positions parameter, default is 0.4.
+    betap : float, optional
+        Plotting positions parameter, default is 0.4.
+    axis : int, optional
+        Axis along which to compute the quantiles.
+        If None (default), the input array is first flattened.
+    limit : tuple, optional
+        Tuple of (lower, upper) values.
+        Values of `a` outside this open interval are ignored.
+
+    Returns
+    -------
+    mquantiles : MaskedArray
+        An array containing the calculated quantiles.
+
+    Notes
+    -----
+    This formulation is very similar to **R** except the calculation of
+    ``m`` from ``alphap`` and ``betap``, where in **R** ``m`` is defined
+    with each type.
+
+    References
+    ----------
+    .. [1] *R* statistical software: https://www.r-project.org/
+    .. [2] *R* ``quantile`` function:
+            http://stat.ethz.ch/R-manual/R-devel/library/stats/html/quantile.html
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats.mstats import mquantiles
+    >>> a = np.array([6., 47., 49., 15., 42., 41., 7., 39., 43., 40., 36.])
+    >>> mquantiles(a)
+    array([ 19.2,  40. ,  42.8])
+
+    Using a 2D array, specifying axis and limit.
+
+    >>> data = np.array([[   6.,    7.,    1.],
+    ...                  [  47.,   15.,    2.],
+    ...                  [  49.,   36.,    3.],
+    ...                  [  15.,   39.,    4.],
+    ...                  [  42.,   40., -999.],
+    ...                  [  41.,   41., -999.],
+    ...                  [   7., -999., -999.],
+    ...                  [  39., -999., -999.],
+    ...                  [  43., -999., -999.],
+    ...                  [  40., -999., -999.],
+    ...                  [  36., -999., -999.]])
+    >>> print(mquantiles(data, axis=0, limit=(0, 50)))
+    [[19.2  14.6   1.45]
+     [40.   37.5   2.5 ]
+     [42.8  40.05  3.55]]
+
+    >>> data[:, 2] = -999.
+    >>> print(mquantiles(data, axis=0, limit=(0, 50)))
+    [[19.200000000000003 14.6 --]
+     [40.0 37.5 --]
+     [42.800000000000004 40.05 --]]
+
+    """
+    def _quantiles1D(data,m,p):
+        x = np.sort(data.compressed())  # unmasked values in ascending order
+        n = len(x)
+        if n == 0:  # nothing unmasked: every quantile is masked
+            return ma.array(np.empty(len(p), dtype=float), mask=True)
+        elif n == 1:  # a single value: every quantile equals it
+            return ma.array(np.resize(x, p.shape), mask=nomask)
+        aleph = (n*p + m)
+        k = np.floor(aleph.clip(1, n-1)).astype(int)  # 1-based index of the lower order statistic
+        gamma = (aleph-k).clip(0,1)  # interpolation weight, kept within [0, 1]
+        return (1.-gamma)*x[(k-1).tolist()] + gamma*x[k.tolist()]
+
+    data = ma.array(a, copy=False)  # NOTE(review): copy=False may alias `a` — confirm limit masking cannot leak
+    if data.ndim > 2:
+        raise TypeError("Array should be 2D at most !")
+
+    if limit:
+        condition = (limit[0] < data) & (data < limit[1])
+        data[~condition.filled(True)] = masked  # mask values outside the open interval
+
+    p = np.atleast_1d(np.asarray(prob))
+    m = alphap + p*(1.-alphap-betap)
+    # Computes quantiles along axis (or globally)
+    if (axis is None):
+        return _quantiles1D(data, m, p)
+
+    return ma.apply_along_axis(_quantiles1D, axis, data, m, p)
+
+
+def scoreatpercentile(data, per, limit=(), alphap=.4, betap=.4):
+    """Return the score at percentile `per` of `data` (``per=50`` is the median).
+
+    Thin wrapper around `mquantiles` evaluated along axis 0 at ``per/100``.
+    Raises ValueError when `per` is below 0 or above 100.
+    """
+    out_of_range = (per < 0) or (per > 100.)
+    if out_of_range:
+        raise ValueError(f"The percentile should be between 0. and 100. ! (got {per})")
+    quantiles = mquantiles(data, prob=[per/100.], alphap=alphap, betap=betap,
+                           limit=limit, axis=0)
+    return quantiles.squeeze()
+
+
+def plotting_positions(data, alpha=0.4, beta=0.4):
+    """
+    Returns plotting positions (or empirical percentile points) for the data.
+
+    Plotting positions are defined as ``(i-alpha)/(n+1-alpha-beta)``, where:
+        - i is the rank of each value among the unmasked values
+        - n is the number of unmasked values
+        - `alpha` and `beta` are two parameters.
+
+    Typical values for `alpha` and `beta` are:
+        - (0,1)    : ``p(k) = k/n``, linear interpolation of cdf (R, type 4)
+        - (.5,.5)  : ``p(k) = (k-1/2.)/n``, piecewise linear function
+          (R, type 5)
+        - (0,0)    : ``p(k) = k/(n+1)``, Weibull (R type 6)
+        - (1,1)    : ``p(k) = (k-1)/(n-1)``, in this case,
+          ``p(k) = mode[F(x[k])]``. That's R default (R type 7)
+        - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``, then
+          ``p(k) ~ median[F(x[k])]``.
+          The resulting quantile estimates are approximately median-unbiased
+          regardless of the distribution of x. (R type 8)
+        - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``, Blom.
+          The resulting quantile estimates are approximately unbiased
+          if x is normally distributed (R type 9)
+        - (.4,.4)  : approximately quantile unbiased (Cunnane)
+        - (.35,.35): APL, used with PWM
+        - (.3175, .3175): used in scipy.stats.probplot
+
+    Parameters
+    ----------
+    data : array_like
+        Input data, as a sequence or array of dimension at most 2.
+    alpha : float, optional
+        Plotting positions parameter. Default is 0.4.
+    beta : float, optional
+        Plotting positions parameter. Default is 0.4.
+
+    Returns
+    -------
+    positions : MaskedArray
+        The calculated plotting positions, as a 1-D array.
+
+    """
+    data = ma.array(data, copy=False).reshape(1,-1)  # view the input as a single row
+    n = data.count()  # number of unmasked values
+    plpos = np.empty(data.size, dtype=float)
+    plpos[n:] = 0  # np.empty is uninitialised; zero the trailing slots
+    plpos[data.argsort(axis=None)[:n]] = ((np.arange(1, n+1) - alpha) /
+                                          (n + 1.0 - alpha - beta))  # ranks 1..n -> positions
+    return ma.array(plpos, mask=data._mask)  # result carries the input's mask
+
+
+meppf = plotting_positions  # alias of plotting_positions
+
+
+def obrientransform(*args):
+    """
+    Computes a transform on input data (any number of columns).  Used to
+    test for homogeneity of variance prior to running one-way stats.  Each
+    array in ``*args`` is one level of a factor.  If an `f_oneway()` run on
+    the transformed data is found significant, variances are unequal.   From
+    Maxwell and Delaney, p.112.
+
+    Returns: transformed data for use in an ANOVA
+    """
+    data = argstoarray(*args).T  # NOTE(review): presumably one group per column after .T — confirm
+    v = data.var(axis=0,ddof=1)
+    m = data.mean(0)
+    n = data.count(0).astype(float)
+    # result = ((n-1.5)*n*(a-m)**2 - 0.5*v*(n-1))/((n-1)*(n-2))
+    data -= m  # the in-place steps below build the formula above
+    data **= 2
+    data *= (n-1.5)*n
+    data -= 0.5*v*(n-1)
+    data /= (n-1.)*(n-2.)
+    if not ma.allclose(v,data.mean(0)):  # transformed means must reproduce the variances
+        raise ValueError("Lack of convergence in obrientransform.")
+
+    return data
+
+
+def sem(a, axis=0, ddof=1):
+    """
+    Compute the standard error of the mean of the input array.
+
+    This quantity is sometimes also called the standard error of measurement.
+
+    Parameters
+    ----------
+    a : array_like
+        Array of values whose standard error is returned.
+    axis : int or None, optional
+        If axis is None, ravel `a` first. If axis is an integer, this will be
+        the axis over which to operate. Defaults to 0.
+    ddof : int, optional
+        Delta degrees-of-freedom. How many degrees of freedom to adjust
+        for bias in limited samples relative to the population estimate
+        of variance. Defaults to 1.
+
+    Returns
+    -------
+    s : ndarray or float
+        The standard deviation along `axis` (computed with `ddof`) divided by
+        the square root of the number of values counted along that axis.
+
+    Notes
+    -----
+    The default value for `ddof` changed in scipy 0.15.0 to be consistent with
+    `scipy.stats.sem` as well as with the most common definition used (like in
+    the R documentation).
+
+    Examples
+    --------
+    Find standard error along the first axis:
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> a = np.arange(20).reshape(5,4)
+    >>> print(stats.mstats.sem(a))
+    [2.8284271247461903 2.8284271247461903 2.8284271247461903
+     2.8284271247461903]
+
+    Find standard error across the whole array, using n degrees of freedom:
+
+    >>> print(stats.mstats.sem(a, axis=None, ddof=0))
+    1.2893796958227628
+
+    """
+    arr, axis = _chk_asarray(a, axis)
+    spread = arr.std(axis=axis, ddof=ddof)
+    nobs = arr.count(axis=axis)
+    return spread / ma.sqrt(nobs)
+
+
+F_onewayResult = namedtuple('F_onewayResult', ('statistic', 'pvalue'))
+
+
+def f_oneway(*args):
+    """
+    Perform a one-way ANOVA on any number of groups and return the F-value
+    together with its probability.  From Heiman, pp.394-7.
+
+    Usage: ``f_oneway(*args)``, where ``*args`` is 2 or more arrays,
+    one per treatment group.
+
+    Returns
+    -------
+    statistic : float
+        The computed F-value of the test.
+    pvalue : float
+        The associated p-value from the F-distribution.
+
+    """
+    # Stack the arguments into one array in which each row is a group.
+    groups = argstoarray(*args)
+    n_groups = len(groups)
+    n_total = groups.count()
+    # Sums of squares: total, between groups, and (by difference) within groups.
+    ss_total = (groups**2).sum() - (groups.sum())**2/float(n_total)
+    group_offsets = groups.mean(-1) - groups.mean()
+    ss_between = (groups.count(-1) * group_offsets**2).sum()
+    ss_within = ss_total - ss_between
+    df_between = n_groups - 1
+    df_within = n_total - n_groups
+    # F is the ratio of the between-group to the within-group mean square.
+    f = (ss_between/float(df_between)) / (ss_within/float(df_within))
+    pvalue = special.fdtrc(df_between, df_within, f)  # equivalent to stats.f.sf
+    return F_onewayResult(f, pvalue)
+
+
+FriedmanchisquareResult = namedtuple('FriedmanchisquareResult',
+                                     ('statistic', 'pvalue'))
+
+
+def friedmanchisquare(*args):
+    """Friedman Chi-Square is a non-parametric, one-way within-subjects ANOVA.
+    This function calculates the Friedman Chi-square test for repeated measures
+    and returns the result, along with the associated probability value.
+
+    Each input is considered a given group. Ideally, the number of treatments
+    among each group should be equal. If this is not the case, only the first
+    n treatments are taken into account, where n is the number of treatments
+    of the smallest group.
+    If a group has some missing values, the corresponding treatments are masked
+    in the other groups.
+    The test statistic is corrected for ties.
+
+    Masked values in one group are propagated to the other groups.
+
+    Returns
+    -------
+    statistic : float
+        The test statistic.
+    pvalue : float
+        The associated p-value.
+
+    """
+    data = argstoarray(*args).astype(float)
+    k = len(data)  # number of groups
+    if k < 3:
+        raise ValueError(f"Less than 3 groups ({k}): the Friedman test "
+                         f"is NOT appropriate.")
+
+    ranked = ma.masked_values(rankdata(data, axis=0), 0)  # rank 0 is treated as missing
+    if ranked._mask is not nomask:
+        ranked = ma.mask_cols(ranked)  # a masked entry masks its whole column
+        ranked = ranked.compressed().reshape(k,-1).view(ndarray)
+    else:
+        ranked = ranked._data
+    (k,n) = ranked.shape
+    # Ties correction
+    repeats = [find_repeats(row) for row in ranked.T]
+    ties = np.array([y for x, y in repeats if x.size > 0])
+    tie_correction = 1 - (ties**3-ties).sum()/float(n*(k**3-k))
+
+    ssbg = np.sum((ranked.sum(-1) - n*(k+1)/2.)**2)
+    chisq = ssbg * 12./(n*k*(k+1)) * 1./tie_correction
+
+    return FriedmanchisquareResult(chisq,
+                                   distributions.chi2.sf(chisq, k-1))  # chi-squared with k-1 dof
+
+
+BrunnerMunzelResult = namedtuple('BrunnerMunzelResult', ('statistic', 'pvalue'))
+
+
+def brunnermunzel(x, y, alternative="two-sided", distribution="t"):
+    """
+    Compute the Brunner-Munzel test on samples x and y.
+
+    Any missing values in `x` and/or `y` are discarded.
+
+    The Brunner-Munzel test is a nonparametric test of the null hypothesis that
+    when values are taken one by one from each group, the probabilities of
+    getting large values in both groups are equal.
+    Unlike the Wilcoxon-Mann-Whitney's U test, this does not require the
+    assumption of equivariance of two groups. Note that this does not assume
+    the distributions are same. This test works on two independent samples,
+    which may have different sizes.
+
+    Parameters
+    ----------
+    x, y : array_like
+        Array of samples, should be one-dimensional.
+    alternative : 'less', 'two-sided', or 'greater', optional
+        Whether to get the p-value for the one-sided hypothesis ('less'
+        or 'greater') or for the two-sided hypothesis ('two-sided').
+        Default value is 'two-sided'.
+    distribution : 't' or 'normal', optional
+        Whether to get the p-value by t-distribution or by standard normal
+        distribution.
+        Default value is 't'.
+
+    Returns
+    -------
+    statistic : float
+        The Brunner-Munzel W statistic.
+    pvalue : float
+        p-value from the t or standard normal distribution (per `distribution`);
+        one-sided or two-sided, depending on the choice of `alternative`.
+
+    See Also
+    --------
+    mannwhitneyu : Mann-Whitney rank test on two samples.
+
+    Notes
+    -----
+    For more details on `brunnermunzel`, see `scipy.stats.brunnermunzel`.
+
+    Examples
+    --------
+    >>> from scipy.stats.mstats import brunnermunzel
+    >>> import numpy as np
+    >>> x1 = [1, 2, np.nan, np.nan, 1, 1, 1, 1, 1, 1, 2, 4, 1, 1]
+    >>> x2 = [3, 3, 4, 3, 1, 2, 3, 1, 1, 5, 4]
+    >>> brunnermunzel(x1, x2)
+    BrunnerMunzelResult(statistic=1.4723186918922935, pvalue=0.15479415300426624)  # may vary
+
+    """  # noqa: E501
+    x = ma.asarray(x).compressed().view(ndarray)  # drop masked values
+    y = ma.asarray(y).compressed().view(ndarray)
+    nx = len(x)
+    ny = len(y)
+    if nx == 0 or ny == 0:  # an empty sample gives a NaN result
+        return BrunnerMunzelResult(np.nan, np.nan)
+    rankc = rankdata(np.concatenate((x,y)))  # ranks within the pooled sample
+    rankcx = rankc[0:nx]
+    rankcy = rankc[nx:nx+ny]
+    rankcx_mean = np.mean(rankcx)
+    rankcy_mean = np.mean(rankcy)
+    rankx = rankdata(x)  # ranks within each sample on its own
+    ranky = rankdata(y)
+    rankx_mean = np.mean(rankx)
+    ranky_mean = np.mean(ranky)
+
+    Sx = np.sum(np.power(rankcx - rankx - rankcx_mean + rankx_mean, 2.0))
+    Sx /= nx - 1  # NOTE(review): divides by zero when nx == 1 — confirm callers
+    Sy = np.sum(np.power(rankcy - ranky - rankcy_mean + ranky_mean, 2.0))
+    Sy /= ny - 1
+
+    wbfn = nx * ny * (rankcy_mean - rankcx_mean)
+    wbfn /= (nx + ny) * np.sqrt(nx * Sx + ny * Sy)
+
+    if distribution == "t":
+        df_numer = np.power(nx * Sx + ny * Sy, 2.0)
+        df_denom = np.power(nx * Sx, 2.0) / (nx - 1)
+        df_denom += np.power(ny * Sy, 2.0) / (ny - 1)
+        df = df_numer / df_denom  # degrees of freedom for the t reference distribution
+        p = distributions.t.cdf(wbfn, df)
+    elif distribution == "normal":
+        p = distributions.norm.cdf(wbfn)
+    else:
+        raise ValueError(
+            "distribution should be 't' or 'normal'")
+
+    if alternative == "greater":
+        pass  # p already holds cdf(wbfn)
+    elif alternative == "less":
+        p = 1 - p
+    elif alternative == "two-sided":
+        p = 2 * np.min([p, 1-p])  # twice the smaller tail
+    else:
+        raise ValueError(
+            "alternative should be 'less', 'greater' or 'two-sided'")
+
+    return BrunnerMunzelResult(wbfn, p)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_mstats_extras.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_mstats_extras.py
new file mode 100644
index 0000000000000000000000000000000000000000..8de23c79e5e2bd8aa375ab8e1712a50a0edfd863
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_mstats_extras.py
@@ -0,0 +1,521 @@
+"""
+Additional statistics functions with support for masked arrays.
+
+"""
+
+# Original author (2007): Pierre GF Gerard-Marchant
+
+
+__all__ = ['compare_medians_ms',
+           'hdquantiles', 'hdmedian', 'hdquantiles_sd',
+           'idealfourths',
+           'median_cihs','mjci','mquantiles_cimj',
+           'rsh',
+           'trimmed_mean_ci',]
+
+
+import numpy as np
+from numpy import float64, ndarray
+
+import numpy.ma as ma
+from numpy.ma import MaskedArray
+
+from . import _mstats_basic as mstats
+
+from scipy.stats.distributions import norm, beta, t, binom
+
+
+def hdquantiles(data, prob=(.25, .5, .75), axis=None, var=False,):
+    """
+    Estimate quantiles with the Harrell-Davis method.
+
+    Each estimate is a weighted linear combination of the order statistics,
+    with weights given by increments of a beta distribution function.
+
+    Parameters
+    ----------
+    data : array_like
+        Input data; masked values are ignored.
+    prob : sequence, optional
+        Probabilities at which the quantiles are estimated.
+    axis : int or None, optional
+        Axis along which to compute the quantiles. If None, use a flattened
+        array. With an explicit axis, `data` may be at most 2D.
+    var : bool, optional
+        Whether to also return the variance of each estimate.
+
+    Returns
+    -------
+    hdquantiles : MaskedArray
+        A (p,) array of quantiles (if `var` is False), or a (2,p) array of
+        quantiles and variances (if `var` is True), where ``p`` is the
+        number of quantiles. Invalid (nan) entries are masked.
+
+    See Also
+    --------
+    hdquantiles_sd
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats.mstats import hdquantiles
+    >>>
+    >>> # Sample data
+    >>> data = np.array([1.2, 2.5, 3.7, 4.0, 5.1, 6.3, 7.0, 8.2, 9.4])
+    >>>
+    >>> # Probabilities at which to compute quantiles
+    >>> probabilities = [0.25, 0.5, 0.75]
+    >>>
+    >>> # Compute Harrell-Davis quantile estimates
+    >>> quantile_estimates = hdquantiles(data, prob=probabilities)
+    >>>
+    >>> # Display the quantile estimates
+    >>> for i, quantile in enumerate(probabilities):
+    ...     print(f"{int(quantile * 100)}th percentile: {quantile_estimates[i]}")
+    25th percentile: 3.1505820231763066 # may vary
+    50th percentile: 5.194344084883956
+    75th percentile: 7.430626414674935
+
+    """
+    def _harrell_davis_1d(values, probs, with_var):
+        "Harrell-Davis estimates for one 1D slice; all nan below 2 points."
+        ordered = np.squeeze(np.sort(values.compressed().view(ndarray)))
+        # Use `.size` rather than `len`: squeezing may leave a numpy scalar
+        count = ordered.size
+
+        out = np.empty((2,len(probs)), float64)
+        if count < 2:
+            out.flat = np.nan
+            return out if with_var else out[0]
+
+        grid = np.arange(count+1) / float(count)
+        for (col,q) in enumerate(probs):
+            # weights: increments of a beta distribution function on the grid
+            cdf_vals = beta.cdf(grid, (count+1)*q, (count+1)*(1-q))
+            weights = cdf_vals[1:] - cdf_vals[:-1]
+            estimate = np.dot(weights, ordered)
+            out[0,col] = estimate
+            out[1,col] = np.dot(weights, (ordered-estimate)**2)
+
+        # probabilities of exactly 0 or 1 map to the sample extremes
+        at_zero = (probs == 0)
+        at_one = (probs == 1)
+        out[0, at_zero] = ordered[0]
+        out[0, at_one] = ordered[-1]
+        if not with_var:
+            return out[0]
+        out[1, at_zero] = out[1, at_one] = np.nan
+        return out
+    # Normalize the inputs: float64 masked data, 1D array of probabilities
+    data = ma.array(data, copy=False, dtype=float64)
+    p = np.atleast_1d(np.asarray(prob))
+    # A flattened or 1D input is handled in a single pass
+    if (axis is None) or (data.ndim == 1):
+        return ma.fix_invalid(_harrell_davis_1d(data, p, var), copy=False)
+    # Otherwise the estimator is applied slice by slice (2D at most)
+    if data.ndim > 2:
+        raise ValueError(f"Array 'data' must be at most two dimensional, "
+                         f"but got data.ndim = {data.ndim}")
+    result = ma.apply_along_axis(_harrell_davis_1d, axis, data, p, var)
+
+    return ma.fix_invalid(result, copy=False)
+
+
+def hdmedian(data, axis=-1, var=False):
+    """
+    Harrell-Davis estimate of the median along the given axis.
+
+    Parameters
+    ----------
+    data : ndarray
+        Data array.
+    axis : int, optional
+        Axis along which the median is estimated. If None, use a flattened
+        array.
+    var : bool, optional
+        Whether to also return the variance of the estimate.
+
+    Returns
+    -------
+    hdmedian : MaskedArray
+        The `hdquantiles` result for the single probability 0.5, with
+        length-one dimensions squeezed out.  If ``var=True`` the variance
+        is included, e.g. for a 1-D input the result holds two values.
+
+    """
+    median_estimate = hdquantiles(data, prob=[0.5], axis=axis, var=var)
+    return median_estimate.squeeze()
+
+
+def hdquantiles_sd(data, prob=(.25, .5, .75), axis=None):
+    """
+    The standard error of the Harrell-Davis quantile estimates by jackknife.
+
+    Parameters
+    ----------
+    data : array_like
+        Data array.
+    prob : sequence, optional
+        Sequence of quantiles to compute.
+    axis : int, optional
+        Axis along which to compute the quantiles. If None, use a flattened
+        array.
+
+    Returns
+    -------
+    hdquantiles_sd : MaskedArray
+        Standard error of the estimates (masked if fewer than 2 points).
+
+    See Also
+    --------
+    hdquantiles
+
+    """
+    def _hdsd_1D(data, prob):
+        "Computes the std error for 1D arrays. Returns nan below 2 points."
+        xsorted = np.sort(data.compressed())
+        n = len(xsorted)
+
+        hdsd = np.empty(len(prob), float64)
+        if n < 2:
+            hdsd.flat = np.nan
+            return hdsd  # stop here: the code below divides by n-1
+        vv = np.arange(n) / float(n-1)
+        betacdf = beta.cdf
+
+        for (i,p) in enumerate(prob):
+            _w = betacdf(vv, n*p, n*(1-p))
+            w = _w[1:] - _w[:-1]
+            # cumulative sum of weights and data points if
+            # ith point is left out for jackknife
+            mx_ = np.zeros_like(xsorted)
+            mx_[1:] = np.cumsum(w * xsorted[:-1])
+            # similar but from the right
+            mx_[:-1] += np.cumsum(w[::-1] * xsorted[:0:-1])[::-1]
+            hdsd[i] = np.sqrt(mx_.var() * (n - 1))
+        return hdsd
+
+    # Initialization & checks
+    data = ma.array(data, copy=False, dtype=float64)
+    p = np.atleast_1d(np.asarray(prob))
+    # Computes quantiles along axis (or globally)
+    if (axis is None):
+        result = _hdsd_1D(data, p)
+    else:
+        if data.ndim > 2:
+            raise ValueError(f"Array 'data' must be at most two dimensional, "
+                             f"but got data.ndim = {data.ndim}")
+        result = ma.apply_along_axis(_hdsd_1D, axis, data, p)
+
+    return ma.fix_invalid(result, copy=False).ravel()
+
+
+def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
+                    alpha=0.05, axis=None):
+    """
+    Confidence interval of the trimmed mean along the given axis.
+
+    Parameters
+    ----------
+    data : array_like
+        Input data.
+    limits : {None, tuple}, optional
+        None or a two item tuple.
+        Tuple of the percentages to cut on each side of the array, with respect
+        to the number of unmasked data, as floats between 0. and 1. If ``n``
+        is the number of unmasked data before trimming, then
+        (``n * limits[0]``)th smallest data and (``n * limits[1]``)th
+        largest data are masked.  The total number of unmasked data after
+        trimming is ``n * (1. - sum(limits))``.
+        The value of one limit can be set to None to indicate an open interval.
+
+        Defaults to (0.2, 0.2).
+    inclusive : (2,) tuple of boolean, optional
+        Passed unchanged to the trimming routines, together with `limits`
+        and `axis`.
+        Tuple indicating whether the number of data being
+        masked on each side should be rounded (True) or truncated (False).
+
+        Defaults to (True, True).
+    alpha : float, optional
+        The interval uses the ``1 - alpha/2`` quantile of Student's t.
+
+        Defaults to 0.05.
+    axis : int, optional
+        Axis along which to cut. If None, uses a flattened version of `data`.
+
+        Defaults to None.
+
+    Returns
+    -------
+    trimmed_mean_ci : (2,) ndarray
+        The lower and upper confidence limits of the trimmed mean.
+
+    """
+    data = ma.array(data, copy=False)
+    trim_args = dict(limits=limits, inclusive=inclusive, axis=axis)
+    kept = mstats.trimr(data, **trim_args)
+    center = kept.mean(axis)
+    critical = t.ppf(1-alpha/2., kept.count(axis) - 1)
+    margin = critical * mstats.trimmed_stde(data, **trim_args)
+    return np.array((center - margin, center + margin))
+
+
+def mjci(data, prob=(0.25, 0.5, 0.75), axis=None):
+    """
+    Maritz-Jarrett estimates of the standard error of selected
+    experimental quantiles of the data.
+
+    Parameters
+    ----------
+    data : ndarray
+        Data array; masked values are ignored. At most two dimensional.
+    prob : sequence, optional
+        Sequence of quantiles whose standard error is estimated.
+    axis : int or None, optional
+        Axis along which to compute the estimates. If None, use a flattened
+        array.
+
+    """
+    def _maritz_jarrett_1d(values, probs):
+        "Maritz-Jarrett standard errors for one 1D slice."
+        ordered = np.sort(values.compressed())
+        n = ordered.size
+        # integer rank of each quantile: p*n + 0.5 truncated to int
+        ranks = (np.array(probs) * n + 0.5).astype(int)
+        upper = np.arange(1,n+1, dtype=float64) / n
+        lower = upper - 1./n
+        squares = ordered**2
+
+        errors = np.empty(len(ranks), float64)
+        for (i,m) in enumerate(ranks):
+            weights = beta.cdf(upper,m-1,n-m) - beta.cdf(lower,m-1,n-m)
+            first = np.dot(weights,ordered)
+            second = np.dot(weights,squares)
+            errors[i] = np.sqrt(second - first**2)
+        return errors
+
+    data = ma.array(data, copy=False)
+    if data.ndim > 2:
+        raise ValueError(f"Array 'data' must be at most two dimensional, "
+                         f"but got data.ndim = {data.ndim}")
+    p = np.atleast_1d(np.asarray(prob))
+    # A flattened input needs a single pass; otherwise go slice by slice
+    if axis is None:
+        return _maritz_jarrett_1d(data, p)
+    return ma.apply_along_axis(_maritz_jarrett_1d, axis, data, p)
+
+
+def mquantiles_cimj(data, prob=(0.25, 0.50, 0.75), alpha=0.05, axis=None):
+    """
+    Confidence interval for the selected quantiles of the data, based on
+    the Maritz-Jarrett standard errors and a normal approximation.
+
+    Parameters
+    ----------
+    data : ndarray
+        Data array.
+    prob : sequence, optional
+        Sequence of quantiles to compute.
+    alpha : float, optional
+        Level of the intervals; `alpha` and ``1 - alpha`` are equivalent.
+    axis : int or None, optional
+        Axis along which to compute the quantiles.
+        If None, use a flattened array.
+
+    Returns
+    -------
+    ci_lower : ndarray
+        The lower boundaries of the confidence interval.  Of the same length as
+        `prob`.
+    ci_upper : ndarray
+        The upper boundaries of the confidence interval.  Of the same length as
+        `prob`.
+
+    """
+    tail = min(alpha, 1 - alpha)
+    quantiles = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
+    # normal-approximation half width from the Maritz-Jarrett standard errors
+    spread = norm.ppf(1 - tail/2.) * mjci(data, prob, axis=axis)
+    return (quantiles - spread, quantiles + spread)
+
+
+def median_cihs(data, alpha=0.05, axis=None):
+    """
+    Computes the alpha-level confidence interval for the median of the data.
+
+    Uses the Hettmasperger-Sheather method.
+
+    Parameters
+    ----------
+    data : array_like
+        Input data. Masked values are discarded. The input should be 1D only,
+        or `axis` should be set to None.
+    alpha : float, optional
+        Level of the interval; `alpha` and ``1 - alpha`` give the same result.
+    axis : int or None, optional
+        Axis along which to compute the quantiles. If None, use a flattened
+        array.
+
+    Returns
+    -------
+    median_cihs
+        Lower and upper limits of the interval (a 2-tuple if `axis` is None).
+
+    """
+    def _cihs_1D(data, alpha):
+        data = np.sort(data.compressed())  # unmasked values, ascending
+        n = len(data)
+        alpha = min(alpha, 1-alpha)  # fold alpha into the lower half
+        k = int(binom._ppf(alpha/2., n, 0.5))  # binomial(n, 0.5) quantile
+        gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
+        if gk < 1-alpha:  # coverage below 1-alpha: step k down and recompute
+            k -= 1
+            gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
+        gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)
+        I = (gk - 1 + alpha)/(gk - gkk)
+        lambd = (n-k) * I / float(k + (n-2*k)*I)  # interpolation weight
+        lims = (lambd*data[k] + (1-lambd)*data[k-1],  # mix of sorted neighbours
+                lambd*data[n-k-1] + (1-lambd)*data[n-k])
+        return lims
+    data = ma.array(data, copy=False)
+    # Computes quantiles along axis (or globally)
+    if (axis is None):
+        result = _cihs_1D(data, alpha)
+    else:
+        if data.ndim > 2:
+            raise ValueError(f"Array 'data' must be at most two dimensional, "
+                             f"but got data.ndim = {data.ndim}")
+        result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)
+
+    return result
+
+
+def compare_medians_ms(group_1, group_2, axis=None):
+    """
+    Compares the medians from two independent groups along the given axis.
+
+    The absolute difference of the medians is standardized with the
+    McKean-Schrader estimate of the standard error of the medians.
+
+    Parameters
+    ----------
+    group_1 : array_like
+        First dataset.  Has to be of size >=7.
+    group_2 : array_like
+        Second dataset.  Has to be of size >=7.
+    axis : int, optional
+        Axis along which the medians are estimated. If None, the arrays are
+        flattened.  If `axis` is not None, then `group_1` and `group_2`
+        should have the same shape.
+
+    Returns
+    -------
+    compare_medians_ms : {float, ndarray}
+        If `axis` is None, then returns a float, otherwise returns a 1-D
+        ndarray of floats with a length equal to the length of `group_1`
+        along `axis`.
+
+    Examples
+    --------
+
+    >>> from scipy import stats
+    >>> a = [1, 2, 3, 4, 5, 6, 7]
+    >>> b = [8, 9, 10, 11, 12, 13, 14]
+    >>> stats.mstats.compare_medians_ms(a, b, axis=None)
+    1.0693225866553746e-05
+
+    The function is vectorized to compute along a given axis.
+
+    >>> import numpy as np
+    >>> rng = np.random.default_rng()
+    >>> x = rng.random(size=(3, 7))
+    >>> y = rng.random(size=(3, 8))
+    >>> stats.mstats.compare_medians_ms(x, y, axis=1)
+    array([0.36908985, 0.36092538, 0.2765313 ])
+
+    References
+    ----------
+    .. [1] McKean, Joseph W., and Ronald M. Schrader. "A comparison of methods
+       for studentizing the sample median." Communications in
+       Statistics-Simulation and Computation 13.6 (1984): 751-773.
+
+    """
+    gap = np.abs(ma.median(group_1, axis=axis) - ma.median(group_2, axis=axis))
+    se_1 = mstats.stde_median(group_1, axis=axis)
+    se_2 = mstats.stde_median(group_2, axis=axis)
+    # upper normal tail probability of the standardized median gap
+    return 1 - norm.cdf(gap / ma.sqrt(se_1**2 + se_2**2))
+
+
+def idealfourths(data, axis=None):
+    """
+    Estimate the lower and upper quartiles.
+
+    Uses the ideal fourths algorithm.
+
+    Parameters
+    ----------
+    data : array_like
+        Input array.
+    axis : int, optional
+        Axis along which the quartiles are estimated. If None, the arrays are
+        flattened.
+
+    Returns
+    -------
+    idealfourths : {list of floats, masked array}
+        The two internal values that divide `data` into four parts using
+        the ideal fourths algorithm, either for the flattened array
+        (if `axis` is None) or along `axis` of `data`.
+
+    """
+    def _fourths_1d(sorted_slice):
+        "Ideal fourths of one sorted 1D slice; nan pair below 3 points."
+        values = sorted_slice.compressed()
+        n = len(values)
+        if n < 3:
+            return [np.nan,np.nan]
+        (pos,frac) = divmod(n/4. + 5/12.,1)
+        pos = int(pos)
+        mirror = n - pos
+        lower = (1-frac)*values[pos-1] + frac*values[pos]
+        upper = (1-frac)*values[mirror] + frac*values[mirror-1]
+        return [lower, upper]
+    data = ma.sort(data, axis=axis).view(MaskedArray)
+    if axis is not None:
+        return ma.apply_along_axis(_fourths_1d, axis, data)
+    return _fourths_1d(data)
+
+
+def rsh(data, points=None):
+    """
+    Evaluates Rosenblatt's shifted histogram estimators for each data point.
+
+    Rosenblatt's estimator is a centered finite-difference approximation to the
+    derivative of the empirical cumulative distribution function.
+
+    Parameters
+    ----------
+    data : sequence
+        Input data, must be 1-D. Masked values are ignored.
+    points : sequence or None, optional
+        Points at which the shifted histogram is evaluated.
+        If None, the data themselves are used.
+
+    """
+    data = ma.array(data, copy=False)
+    points = data if points is None else np.atleast_1d(np.asarray(points))
+
+    if data.ndim != 1:
+        raise AttributeError("The input array should be 1D only !")
+
+    n = data.count()
+    # half width: 1.2 * (distance between the ideal fourths) / n**(1/5)
+    fourths = idealfourths(data, axis=None)
+    width = 1.2 * (fourths[-1]-fourths[0]) / n**(1./5)
+    # count the data falling within `width` of each evaluation point
+    column = data[:,None]
+    below_hi = (column <= points[None,:] + width).sum(0)
+    below_lo = (column < points[None,:] - width).sum(0)
+    return (below_hi-below_lo) / (2.*n*width)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_multicomp.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_multicomp.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a1dd3c1d8e809a62097a21b35821b37391dd6f2
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_multicomp.py
@@ -0,0 +1,451 @@
+import warnings
+from collections.abc import Sequence
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Literal
+
+import numpy as np
+
+from scipy import stats
+from scipy.optimize import minimize_scalar
+from scipy.stats._common import ConfidenceInterval
+from scipy.stats._qmc import check_random_state
+from scipy.stats._stats_py import _var
+from scipy._lib._array_api import xp_capabilities
+from scipy._lib._util import _transition_to_rng, DecimalNumber, SeedType
+
+
+if TYPE_CHECKING:
+    import numpy.typing as npt
+
+
+__all__ = [
+    'dunnett'
+]
+
+
+@dataclass
+class DunnettResult:
+    """Result object returned by `scipy.stats.dunnett`.
+
+    Attributes
+    ----------
+    statistic : float ndarray
+        The computed statistic of the test for each comparison. The element
+        at index ``i`` is the statistic for the comparison between
+        groups ``i`` and the control.
+    pvalue : float ndarray
+        The computed p-value of the test for each comparison. The element
+        at index ``i`` is the p-value for the comparison between
+        group ``i`` and the control.
+    """
+    statistic: np.ndarray
+    pvalue: np.ndarray
+    _alternative: Literal['two-sided', 'less', 'greater'] = field(repr=False)
+    _rho: np.ndarray = field(repr=False)
+    _df: int = field(repr=False)
+    _std: float = field(repr=False)
+    _mean_samples: np.ndarray = field(repr=False)
+    _mean_control: np.ndarray = field(repr=False)
+    _n_samples: np.ndarray = field(repr=False)
+    _n_control: int = field(repr=False)
+    _rng: SeedType = field(repr=False)
+    _ci: ConfidenceInterval | None = field(default=None, repr=False)  # cached
+    _ci_cl: DecimalNumber | None = field(default=None, repr=False)  # its level
+
+    def __str__(self):
+        # Note: `__str__` prints the confidence intervals from the most
+        # recent call to `confidence_interval`. If it has not been called,
+        # it will be called with the default CL of .95.
+        if self._ci is None:
+            self.confidence_interval(confidence_level=.95)
+        s = (
+            "Dunnett's test"
+            f" ({self._ci_cl*100:.1f}% Confidence Interval)\n"
+            "Comparison               Statistic  p-value  Lower CI  Upper CI\n"
+        )
+        for i in range(self.pvalue.size):  # one table row per comparison
+            s += (f" (Sample {i} - Control) {self.statistic[i]:>10.3f}"
+                  f"{self.pvalue[i]:>10.3f}"
+                  f"{self._ci.low[i]:>10.3f}"
+                  f"{self._ci.high[i]:>10.3f}\n")
+
+        return s
+
+    def _allowance(
+        self, confidence_level: DecimalNumber = 0.95, tol: DecimalNumber = 1e-3
+    ) -> float:
+        """Allowance.
+
+        It is the quantity to add/subtract from the observed difference
+        between the means of observed groups and the mean of the control
+        group. The result gives confidence limits.
+
+        Parameters
+        ----------
+        confidence_level : float, optional
+            Confidence level for the computed confidence interval.
+            Default is .95.
+        tol : float, optional
+            A tolerance for numerical optimization: the allowance will produce
+            a confidence within ``10*tol*(1 - confidence_level)`` of the
+            specified level, or a warning will be emitted. Tight tolerances
+            may be impractical due to noisy evaluation of the objective.
+            Default is 1e-3.
+
+        Returns
+        -------
+        allowance : float
+            Allowance around the mean.
+        """
+        alpha = 1 - confidence_level  # target value for the p-value
+
+        def pvalue_from_stat(statistic):
+            statistic = np.array(statistic)
+            sf = _pvalue_dunnett(
+                rho=self._rho, df=self._df,
+                statistic=statistic, alternative=self._alternative,
+                rng=self._rng
+            )
+            return abs(sf - alpha)/alpha  # relative distance from the target
+
+        # Evaluation of `pvalue_from_stat` is noisy due to the use of RQMC to
+        # evaluate `multivariate_t.cdf`. `minimize_scalar` is not designed
+        # to tolerate a noisy objective function and may fail to find the
+        # minimum accurately. We mitigate this possibility with the validation
+        # step below, but implementation of a noise-tolerant root finder or
+        # minimizer would be a welcome enhancement. See gh-18150.
+        res = minimize_scalar(pvalue_from_stat, method='brent', tol=tol)
+        critical_value = res.x  # statistic at which the objective is smallest
+
+        # validation
+        # tol*10 because tol=1e-3 means we tolerate a 1% change at most
+        if res.success is False or res.fun >= tol*10:
+            warnings.warn(
+                "Computation of the confidence interval did not converge to "
+                "the desired level. The confidence level corresponding with "
+                f"the returned interval is approximately {alpha*(1+res.fun)}.",
+                stacklevel=3
+            )
+
+        # From [1] p. 1101 between (1) and (3)
+        allowance = critical_value*self._std*np.sqrt(
+            1/self._n_samples + 1/self._n_control
+        )
+        return abs(allowance)  # magnitude only, whatever the minimizer's sign
+
+    def confidence_interval(
+        self, confidence_level: DecimalNumber = 0.95
+    ) -> ConfidenceInterval:
+        """Compute the confidence interval for the specified confidence level.
+
+        Parameters
+        ----------
+        confidence_level : float, optional
+            Confidence level for the computed confidence interval.
+            Default is .95.
+
+        Returns
+        -------
+        ci : ``ConfidenceInterval`` object
+            The object has attributes ``low`` and ``high`` that hold the
+            lower and upper bounds of the confidence intervals for each
+            comparison. The high and low values are accessible for each
+            comparison at index ``i`` for each group ``i``.
+
+        """
+        # check to see if the supplied confidence level matches that of the
+        # previously computed CI.
+        if (self._ci is not None) and (confidence_level == self._ci_cl):
+            return self._ci
+
+        if not (0 < confidence_level < 1):
+            raise ValueError("Confidence level must be between 0 and 1.")
+
+        allowance = self._allowance(confidence_level=confidence_level)
+        diff_means = self._mean_samples - self._mean_control
+
+        low = diff_means-allowance
+        high = diff_means+allowance
+
+        if self._alternative == 'greater':
+            high = [np.inf] * len(diff_means)  # one-sided: no upper bound
+        elif self._alternative == 'less':
+            low = [-np.inf] * len(diff_means)  # one-sided: no lower bound
+
+        self._ci_cl = confidence_level  # remembered for the cache check above
+        self._ci = ConfidenceInterval(
+            low=low,
+            high=high
+        )
+        return self._ci
+
+
+@xp_capabilities(np_only=True)
+@_transition_to_rng('random_state', replace_doc=False)
+def dunnett(
+    *samples: "npt.ArrayLike",  # noqa: D417
+    control: "npt.ArrayLike",
+    alternative: Literal['two-sided', 'less', 'greater'] = "two-sided",
+    rng: SeedType = None
+) -> DunnettResult:
+    """Dunnett's test: multiple comparisons of means against a control group.
+
+    This is an implementation of Dunnett's original, single-step test as
+    described in [1]_.
+
+    Parameters
+    ----------
+    sample1, sample2, ... : 1D array_like
+        The sample measurements for each experimental group.
+    control : 1D array_like
+        The sample measurements for the control group.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis.
+
+        The null hypothesis is that the means of the distributions underlying
+        the samples and control are equal. The following alternative
+        hypotheses are available (default is 'two-sided'):
+
+        * 'two-sided': the means of the distributions underlying the samples
+          and control are unequal.
+        * 'less': the means of the distributions underlying the samples
+          are less than the mean of the distribution underlying the control.
+        * 'greater': the means of the distributions underlying the
+          samples are greater than the mean of the distribution underlying
+          the control.
+    rng : `numpy.random.Generator`, optional
+        Pseudorandom number generator state. When `rng` is None, a new
+        `numpy.random.Generator` is created using entropy from the
+        operating system. Types other than `numpy.random.Generator` are
+        passed to `numpy.random.default_rng` to instantiate a ``Generator``.
+
+        .. versionchanged:: 1.15.0
+
+            As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
+            transition from use of `numpy.random.RandomState` to
+            `numpy.random.Generator`, this keyword was changed from `random_state` to
+            `rng`. For an interim period, both keywords will continue to work, although
+            only one may be specified at a time. After the interim period, function
+            calls using the `random_state` keyword will emit warnings. Following a
+            deprecation period, the `random_state` keyword will be removed.
+
+    Returns
+    -------
+    res : `~scipy.stats._result_classes.DunnettResult`
+        An object containing attributes:
+
+        statistic : float ndarray
+            The computed statistic of the test for each comparison. The element
+            at index ``i`` is the statistic for the comparison between
+            groups ``i`` and the control.
+        pvalue : float ndarray
+            The computed p-value of the test for each comparison. The element
+            at index ``i`` is the p-value for the comparison between
+            group ``i`` and the control.
+
+        And the following method:
+
+        confidence_interval(confidence_level=0.95) :
+            Compute the difference in means of the groups
+            with the control +- the allowance.
+
+    See Also
+    --------
+    tukey_hsd : performs pairwise comparison of means.
+    :ref:`hypothesis_dunnett` : Extended example
+
+    Notes
+    -----
+    Like the independent-sample t-test, Dunnett's test [1]_ is used to make
+    inferences about the means of distributions from which samples were drawn.
+    However, when multiple t-tests are performed at a fixed significance level,
+    the "family-wise error rate" - the probability of incorrectly rejecting the
+    null hypothesis in at least one test - will exceed the significance level.
+    Dunnett's test is designed to perform multiple comparisons while
+    controlling the family-wise error rate.
+
+    Dunnett's test compares the means of multiple experimental groups
+    against a single control group. Tukey's Honestly Significant Difference Test
+    is another multiple-comparison test that controls the family-wise error
+    rate, but `tukey_hsd` performs *all* pairwise comparisons between groups.
+    When pairwise comparisons between experimental groups are not needed,
+    Dunnett's test is preferable due to its higher power.
+
+    The use of this test relies on several assumptions.
+
+    1. The observations are independent within and among groups.
+    2. The observations within each group are normally distributed.
+    3. The distributions from which the samples are drawn have the same finite
+       variance.
+
+    References
+    ----------
+    .. [1] Dunnett, Charles W. (1955) "A Multiple Comparison Procedure for
+           Comparing Several Treatments with a Control." Journal of the American
+           Statistical Association, 50:272, 1096-1121,
+           :doi:`10.1080/01621459.1955.10501294`
+    .. [2] Thomson, M. L., & Short, M. D. (1969). Mucociliary function in
+           health, chronic obstructive airway disease, and asbestosis. Journal
+           of applied physiology, 26(5), 535-539.
+           :doi:`10.1152/jappl.1969.26.5.535`
+
+    Examples
+    --------
+    We'll use data from [2]_, Table 1. The null hypothesis is that the means of
+    the distributions underlying the samples and control are equal.
+
+    First, we test that the means of the distributions underlying the samples
+    and control are unequal (``alternative='two-sided'``, the default).
+
+    >>> import numpy as np
+    >>> from scipy.stats import dunnett
+    >>> samples = [[3.8, 2.7, 4.0, 2.4], [2.8, 3.4, 3.7, 2.2, 2.0]]
+    >>> control = [2.9, 3.0, 2.5, 2.6, 3.2]
+    >>> res = dunnett(*samples, control=control)
+    >>> res.statistic
+    array([ 0.90874545, -0.05007117])
+    >>> res.pvalue
+    array([0.58325114, 0.99819341])
+
+    Now, we test that the means of the distributions underlying the samples are
+    greater than the mean of the distribution underlying the control.
+
+    >>> res = dunnett(*samples, control=control, alternative='greater')
+    >>> res.statistic
+    array([ 0.90874545, -0.05007117])
+    >>> res.pvalue
+    array([0.30230596, 0.69115597])
+
+    For a more detailed example, see :ref:`hypothesis_dunnett`.
+    """
+    samples_, control_, rng = _iv_dunnett(  # validate, convert to arrays
+        samples=samples, control=control,
+        alternative=alternative, rng=rng
+    )
+
+    rho, df, n_group, n_samples, n_control = _params_dunnett(  # sizes, d.f.
+        samples=samples_, control=control_
+    )
+
+    statistic, std, mean_control, mean_samples = _statistic_dunnett(
+        samples_, control_, df, n_samples, n_control
+    )
+
+    pvalue = _pvalue_dunnett(  # from the multivariate t distribution
+        rho=rho, df=df, statistic=statistic, alternative=alternative, rng=rng
+    )
+
+    return DunnettResult(  # private fields feed `confidence_interval` later
+        statistic=statistic, pvalue=pvalue,
+        _alternative=alternative,
+        _rho=rho, _df=df, _std=std,
+        _mean_samples=mean_samples,
+        _mean_control=mean_control,
+        _n_samples=n_samples,
+        _n_control=n_control,
+        _rng=rng
+    )
+
+
+def _iv_dunnett(
+    samples: Sequence["npt.ArrayLike"],
+    control: "npt.ArrayLike",
+    alternative: Literal['two-sided', 'less', 'greater'],
+    rng: SeedType
+) -> tuple[list[np.ndarray], np.ndarray, SeedType]:
+    """Validate and convert the inputs of Dunnett's test."""
+    rng = check_random_state(rng)
+
+    if alternative not in {'two-sided', 'less', 'greater'}:
+        raise ValueError(
+            "alternative must be 'less', 'greater' or 'two-sided'"
+        )
+
+    control = np.asarray(control)
+    samples_ = list(map(np.asarray, samples))
+
+    # every group, control last, must be at most 1D and non-empty;
+    # within a group the dimensionality is checked before the size
+    for group in (*samples_, control):
+        if group.ndim > 1:
+            raise ValueError(
+                "Control and samples groups must be 1D arrays"
+            )
+        if group.size < 1:
+            raise ValueError(
+                "Control and samples groups must have at least 1 observation"
+            )
+
+    return samples_, control, rng
+
+
+def _params_dunnett(
+    samples: list[np.ndarray], control: np.ndarray
+) -> tuple[np.ndarray, int, int, np.ndarray, int]:
+    """Group sizes, degrees of freedom and correlation for Dunnett's test.
+
+    The degrees of freedom are the total number of observations minus the
+    number of groups, the control group included.
+    """
+    n_control = control.size
+    n_groups = len(samples)
+    n_samples = np.array([sample.size for sample in samples])
+
+    # From [1] p. 1100: d.f. = (sum N) - (p + 1), the sum running over
+    # all groups including the control
+    n_total = n_samples.sum() + n_control
+    df = n_total - n_groups - 1
+
+    # From [1] p. 1103: rho_ij = 1/sqrt((N0/Ni + 1)(N0/Nj + 1)), rho_ii = 1
+    ratio = n_control/n_samples + 1
+    rho = 1/np.sqrt(ratio[:, np.newaxis] * ratio[np.newaxis, :])
+    np.fill_diagonal(rho, 1)
+
+    return rho, df, n_groups, n_samples, n_control
+
+
+def _statistic_dunnett(
+    samples: list[np.ndarray], control: np.ndarray, df: int,
+    n_samples: np.ndarray, n_control: int
+) -> tuple[np.ndarray, float, np.ndarray, np.ndarray]:
+    """Compute the statistic of Dunnett's test.
+
+    Follows the original single-step test from [1].
+    """
+    mean_control = np.mean(control)
+    mean_samples = np.array([np.mean(sample) for sample in samples])
+
+    # Variance estimate s^2 from [1] Eq. 1, over all groups (control first)
+    groups = [control] + samples
+    centers = np.concatenate([[mean_control], mean_samples])
+    weighted = [_var(group, mean=center)*group.size
+                for group, center in zip(groups, centers)]
+    std = np.sqrt(np.sum(weighted) / df)
+
+    # z score inferred from [1] unlabeled equation after Eq. 1
+    scale = np.sqrt(1/n_samples + 1/n_control)
+    z = (mean_samples - mean_control) / scale
+    return z / std, std, mean_control, mean_samples
+
+
+def _pvalue_dunnett(
+    rho: np.ndarray, df: int, statistic: np.ndarray,
+    alternative: Literal['two-sided', 'less', 'greater'],
+    rng: SeedType = None
+) -> np.ndarray:
+    """Compute p-values from the multivariate t-distribution.
+
+    The distribution has shape matrix `rho` and `df` degrees of freedom.
+    """
+    column = statistic.reshape(-1, 1)
+    mvt = stats.multivariate_t(shape=rho, df=df, seed=rng)
+
+    # pick the cdf limits; the p-value is one minus the enclosed probability
+    if alternative == "two-sided":
+        upper = abs(column)
+        lower = -upper
+    elif alternative == "greater":
+        upper, lower = column, -np.inf
+    else:
+        upper, lower = np.inf, column
+    return np.atleast_1d(1 - mvt.cdf(upper, lower_limit=lower))
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_multivariate.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_multivariate.py
new file mode 100644
index 0000000000000000000000000000000000000000..dee76048248fdc9ed8316ca681ed806949383839
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_multivariate.py
@@ -0,0 +1,8049 @@
+#
+# Author: Joris Vankerschaver 2013
+#
+import math
+import warnings
+import threading
+import types
+import numpy as np
+import scipy.linalg
+from scipy._lib import doccer
+from scipy.special import (gammaln, psi, multigammaln, xlogy, entr, betaln,
+                           ive, loggamma)
+from scipy import special
+import scipy._lib.array_api_extra as xpx
+from scipy._lib._util import check_random_state
+from scipy.linalg.blas import drot, get_blas_funcs
+from ._continuous_distns import norm, invgamma
+from ._discrete_distns import binom
+from . import _covariance, _rcont
+from ._qmvnt import _qmvt, _qmvn, _qauto
+from ._morestats import directional_stats
+from scipy.optimize import root_scalar
+
+# Public names exported by this module.
+__all__ = ['multivariate_normal',
+           'matrix_normal',
+           'dirichlet',
+           'dirichlet_multinomial',
+           'wishart',
+           'invwishart',
+           'multinomial',
+           'special_ortho_group',
+           'ortho_group',
+           'random_correlation',
+           'unitary_group',
+           'multivariate_t',
+           'multivariate_hypergeom',
+           'random_table',
+           'uniform_direction',
+           'vonmises_fisher',
+           'normal_inverse_gamma',
+           'matrix_t']
+
+# Logarithmic constants computed once at import time.
+_LOG_2PI = np.log(2 * np.pi)
+_LOG_2 = np.log(2)
+_LOG_PI = np.log(np.pi)
+# Module-level lock object.
+# NOTE(review): no use of it is visible in this part of the file -
+# confirm what it is meant to guard.
+MVN_LOCK = threading.Lock()
+
+
+# Docstring fragment describing the `seed` parameter. It is stored under
+# the '_doc_random_state' key of the docdicts defined later in the file.
+_doc_random_state = """\
+seed : {None, int, np.random.RandomState, np.random.Generator}, optional
+    Used for drawing random variates.
+    If `seed` is `None`, the `~np.random.RandomState` singleton is used.
+    If `seed` is an int, a new ``RandomState`` instance is used, seeded
+    with seed.
+    If `seed` is already a ``RandomState`` or ``Generator`` instance,
+    then that object is used.
+    Default is `None`.
+"""
+
+
+def _squeeze_output(out):
+    """
+    Drop all length-one axes from `out`; if nothing but a single element
+    remains, return it as a scalar instead of a 0-d array.
+    """
+    squeezed = out.squeeze()
+    # Indexing a 0-d array with an empty tuple extracts the scalar.
+    return squeezed[()] if squeezed.ndim == 0 else squeezed
+
+
+def _eigvalsh_to_eps(spectrum, cond=None, rcond=None):
+    """Return the magnitude below which eigenvalues count as negligible.
+
+    Gives the linear algebra helpers in this module one shared notion of
+    whether a Hermitian matrix is numerically singular and what its
+    numerical rank is. Intended to be compatible with scipy.linalg.pinvh.
+
+    Parameters
+    ----------
+    spectrum : 1d ndarray
+        Array of eigenvalues of a Hermitian matrix.
+    cond, rcond : float, optional
+        Relative cutoff for small eigenvalues; `rcond` takes precedence
+        when it is not None. Eigenvalues smaller than the cutoff times
+        the largest eigenvalue magnitude are considered zero.
+        If None or -1, a cutoff based on machine precision is used.
+
+    Returns
+    -------
+    eps : float
+        Magnitude cutoff for numerical negligibility.
+
+    """
+    cutoff = cond if rcond is None else rcond
+    if cutoff in [None, -1]:
+        # Default: dtype-dependent multiple of machine epsilon.
+        kind = spectrum.dtype.char.lower()
+        scale = {'f': 1E3, 'd': 1E6}[kind]
+        cutoff = scale * np.finfo(kind).eps
+    return cutoff * np.max(abs(spectrum))
+
+
+def _pinv_1d(v, eps=1e-5):
+    """Element-wise pseudo-inverse of a sequence of numbers.
+
+    Parameters
+    ----------
+    v : iterable of numbers
+        Typically a vector of eigenvalues or singular values.
+    eps : float
+        Entries whose magnitude does not exceed `eps` are mapped to zero.
+
+    Returns
+    -------
+    v_pinv : 1d float ndarray
+        Reciprocals of the non-negligible entries, zero elsewhere.
+
+    """
+    inverted = []
+    for value in v:
+        negligible = abs(value) <= eps
+        inverted.append(0 if negligible else 1/value)
+    return np.array(inverted, dtype=float)
+
+
+def _validate_marginal_input(dimensions, multivariate_dims):
+    """Validate and normalize the indices of the variables to retain.
+
+    Parameters
+    ----------
+    dimensions : int or array_like of int
+        Indices of the marginal variables being retained. Negative
+        indices count from the end.
+    multivariate_dims : int
+        Number of dimensions of multivariate distribution.
+
+    Returns
+    -------
+    dims : ndarray
+        Copy of `dimensions` as an array with at least one axis, with
+        negative indices converted to non-negative ones.
+
+    Raises
+    ------
+    ValueError
+        If `dimensions` is empty, is not of integer dtype, contains
+        duplicates (after normalization) or contains out-of-range indices.
+    """
+    # Work on a copy so that the caller's array is never modified.
+    dims = np.atleast_1d(np.copy(dimensions))
+
+    if len(dims) == 0:
+        raise ValueError("Cannot marginalize all dimensions.")
+
+    if not np.issubdtype(dims.dtype, np.integer):
+        raise ValueError(
+            "Elements of `dimensions` must be integers - the indices "
+            "of the marginal variables being retained.")
+
+    # Keep the indices as supplied for use in the error message below.
+    requested = np.copy(dims)
+
+    negative = dims < 0
+    dims[negative] += multivariate_dims
+
+    if len(dims) != len(np.unique(dims)):
+        raise ValueError("All elements of `dimensions` must be unique.")
+
+    out_of_range = (dims < 0) | (dims >= multivariate_dims)
+    if np.any(out_of_range):
+        raise ValueError(
+            f"Dimensions {requested[out_of_range]} are invalid "
+            f"for a distribution in {multivariate_dims} dimensions.")
+
+    return dims
+
+
+class _PSD:
+    """
+    Coordinated quantities derived from a symmetric positive semidefinite
+    matrix.
+
+    A single call to eigh provides everything needed for the
+    pseudoinverse, the logarithm of the pseudo-determinant and the rank,
+    instead of one decomposition per quantity. Because all of them share
+    one threshold for deciding which eigenvalues are negligibly small,
+    the results are mutually compatible. They are designed to coordinate
+    with scipy.linalg.pinvh() but not necessarily with np.linalg.det()
+    or with np.linalg.matrix_rank().
+
+    Parameters
+    ----------
+    M : array_like
+        Symmetric positive semidefinite matrix (2-D).
+    cond, rcond : float, optional
+        Cutoff for small eigenvalues.
+        Singular values smaller than rcond * largest_eigenvalue are
+        considered zero.
+        If None or -1, suitable machine precision is used.
+    lower : bool, optional
+        Whether the pertinent array data is taken from the lower
+        or upper triangle of M. (Default: lower)
+    check_finite : bool, optional
+        Whether to check that the input matrices contain only finite
+        numbers. Disabling may give a performance gain, but may result
+        in problems (crashes, non-termination) if the inputs do contain
+        infinities or NaNs.
+    allow_singular : bool, optional
+        Whether to allow a singular matrix.  (Default: True)
+
+    Notes
+    -----
+    The arguments are similar to those of scipy.linalg.pinvh().
+
+    """
+
+    def __init__(self, M, cond=None, rcond=None, lower=True,
+                 check_finite=True, allow_singular=True):
+        self._M = np.asarray(M)
+
+        # eigh performs the array conversion, the finiteness check and
+        # the check that the matrix is square.
+        eigvals, eigvecs = scipy.linalg.eigh(M, lower=lower,
+                                             check_finite=check_finite)
+
+        tol = _eigvalsh_to_eps(eigvals, cond, rcond)
+        if np.min(eigvals) < -tol:
+            raise ValueError(
+                "The input matrix must be symmetric positive semidefinite.")
+
+        positive = eigvals[eigvals > tol]
+        if not allow_singular and len(positive) < len(eigvals):
+            raise np.linalg.LinAlgError(
+                "When `allow_singular is False`, the input matrix must be "
+                "symmetric positive definite.")
+
+        # Tolerance and basis (eigenvectors of the negligible
+        # eigenvalues) used when testing membership in the support.
+        self.eps = 1e3*tol
+        self.V = eigvecs[:, eigvals <= tol]
+
+        # Quantities computed eagerly.
+        self.rank = len(positive)
+        self.U = np.multiply(eigvecs, np.sqrt(_pinv_1d(eigvals, tol)))
+        self.log_pdet = np.sum(np.log(positive))
+
+        # The pseudoinverse is only built when first requested.
+        self._pinv = None
+
+    def _support_mask(self, x):
+        """
+        Return a boolean mask telling which points of `x` lie in the
+        support, i.e. whose projection onto `self.V` has norm below
+        `self.eps`.
+        """
+        return np.linalg.norm(x @ self.V, axis=-1) < self.eps
+
+    @property
+    def pinv(self):
+        """Pseudoinverse ``U @ U.T``, computed on first access and cached."""
+        cached = self._pinv
+        if cached is None:
+            cached = np.dot(self.U, self.U.T)
+            self._pinv = cached
+        return cached
+
+
+class multi_rv_generic:
+    """
+    Base class holding the functionality shared by all multivariate
+    distributions: management of the random number generator.
+    """
+
+    def __init__(self, seed=None):
+        super().__init__()
+        self._random_state = check_random_state(seed)
+
+    @property
+    def random_state(self):
+        """ Get or set the Generator object for generating random variates.
+
+        If `seed` is None (or `np.random`), the `numpy.random.RandomState`
+        singleton is used.
+        If `seed` is an int, a new ``RandomState`` instance is used,
+        seeded with `seed`.
+        If `seed` is already a ``Generator`` or ``RandomState`` instance then
+        that instance is used.
+
+        """
+        return self._random_state
+
+    @random_state.setter
+    def random_state(self, seed):
+        self._random_state = check_random_state(seed)
+
+    def _get_random_state(self, random_state):
+        """Return the generator to use for one call.
+
+        The instance's own generator is used when `random_state` is None;
+        otherwise `random_state` is passed through `check_random_state`.
+        """
+        if random_state is None:
+            return self._random_state
+        return check_random_state(random_state)
+
+
+class multi_rv_frozen:
+    """
+    Base class holding the functionality shared by all frozen
+    multivariate distributions.
+
+    The random state is not stored on the frozen object itself; reads and
+    writes are forwarded to the wrapped distribution ``self._dist``.
+    """
+
+    # generic type compatibility with scipy-stubs
+    __class_getitem__ = classmethod(types.GenericAlias)
+
+    @property
+    def random_state(self):
+        """Random state of the underlying distribution object."""
+        underlying = self._dist
+        return underlying._random_state
+
+    @random_state.setter
+    def random_state(self, seed):
+        validated = check_random_state(seed)
+        self._dist._random_state = validated
+
+
+# Docstring fragments for the multivariate normal distribution. They are
+# collected in the two dictionaries below, keyed by placeholder name; the
+# docstrings in this file reference those names as ``%(name)s``.
+
+# Description of the `mean`, `cov` and `allow_singular` parameters.
+_mvn_doc_default_callparams = """\
+mean : array_like, default: ``[0]``
+    Mean of the distribution.
+cov : array_like or `Covariance`, default: ``[1]``
+    Symmetric positive (semi)definite covariance matrix of the distribution.
+allow_singular : bool, default: ``False``
+    Whether to allow a singular covariance matrix. This is ignored if `cov` is
+    a `Covariance` object.
+"""
+
+# Note on the accepted forms of `mean` and `cov`.
+_mvn_doc_callparams_note = """\
+Setting the parameter `mean` to `None` is equivalent to having `mean`
+be the zero-vector. The parameter `cov` can be a scalar, in which case
+the covariance matrix is the identity times that value, a vector of
+diagonal entries for the covariance matrix, a two-dimensional array_like,
+or a `Covariance` object.
+"""
+
+# Replacement for the parameter description in the "noparams" dictionary:
+# empty, so nothing is inserted for that placeholder.
+_mvn_doc_frozen_callparams = ""
+
+# Replacement for the parameter note in the "noparams" dictionary.
+_mvn_doc_frozen_callparams_note = """\
+See class definition for a detailed description of parameters."""
+
+# Substitutions that include the full parameter descriptions; passed to
+# `doccer.docformat` in `multivariate_normal_gen.__init__`.
+mvn_docdict_params = {
+    '_mvn_doc_default_callparams': _mvn_doc_default_callparams,
+    '_mvn_doc_callparams_note': _mvn_doc_callparams_note,
+    '_doc_random_state': _doc_random_state
+}
+
+# Same keys, but with the parameter descriptions replaced by the "frozen"
+# fragments above.
+# NOTE(review): presumably used for the frozen distribution's docstrings -
+# the consumer is not visible in this part of the file.
+mvn_docdict_noparams = {
+    '_mvn_doc_default_callparams': _mvn_doc_frozen_callparams,
+    '_mvn_doc_callparams_note': _mvn_doc_frozen_callparams_note,
+    '_doc_random_state': _doc_random_state
+}
+
+
+class multivariate_normal_gen(multi_rv_generic):
+    r"""A multivariate normal random variable.
+
+    The `mean` keyword specifies the mean. The `cov` keyword specifies the
+    covariance matrix.
+
+    Methods
+    -------
+    pdf(x, mean=None, cov=1, allow_singular=False)
+        Probability density function.
+    logpdf(x, mean=None, cov=1, allow_singular=False)
+        Log of the probability density function.
+    cdf(x, mean=None, cov=1, allow_singular=False, maxpts=1000000*dim, abseps=1e-5, releps=1e-5, lower_limit=None)
+        Cumulative distribution function.
+    logcdf(x, mean=None, cov=1, allow_singular=False, maxpts=1000000*dim, abseps=1e-5, releps=1e-5)
+        Log of the cumulative distribution function.
+    rvs(mean=None, cov=1, size=1, random_state=None)
+        Draw random samples from a multivariate normal distribution.
+    entropy(mean=None, cov=1)
+        Compute the differential entropy of the multivariate normal.
+    marginal(dimensions, mean=None, cov=1, allow_singular=False)
+        Return a marginal multivariate normal distribution.
+    fit(x, fix_mean=None, fix_cov=None)
+        Fit a multivariate normal distribution to data.
+
+    Parameters
+    ----------
+    %(_mvn_doc_default_callparams)s
+    %(_doc_random_state)s
+
+    Notes
+    -----
+    %(_mvn_doc_callparams_note)s
+
+    The covariance matrix `cov` may be an instance of a subclass of
+    `Covariance`, e.g. `scipy.stats.CovViaPrecision`. If so, `allow_singular`
+    is ignored.
+
+    Otherwise, `cov` must be a symmetric positive semidefinite
+    matrix when `allow_singular` is True; it must be (strictly) positive
+    definite when `allow_singular` is False.
+    Symmetry is not checked; only the lower triangular portion is used.
+    The determinant and inverse of `cov` are computed
+    as the pseudo-determinant and pseudo-inverse, respectively, so
+    that `cov` does not need to have full rank.
+
+    The probability density function for `multivariate_normal` is
+
+    .. math::
+
+        f(x) = \frac{1}{\sqrt{(2 \pi)^k \det \Sigma}}
+               \exp\left( -\frac{1}{2} (x - \mu)^T \Sigma^{-1} (x - \mu) \right),
+
+    where :math:`\mu` is the mean, :math:`\Sigma` the covariance matrix,
+    :math:`k` the rank of :math:`\Sigma`. In case of singular :math:`\Sigma`,
+    SciPy extends this definition according to [1]_.
+
+    .. versionadded:: 0.14.0
+
+    References
+    ----------
+    .. [1] Multivariate Normal Distribution - Degenerate Case, Wikipedia,
+           https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Degenerate_case
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.stats import multivariate_normal
+
+    >>> x = np.linspace(0, 5, 10, endpoint=False)
+    >>> y = multivariate_normal.pdf(x, mean=2.5, cov=0.5); y
+    array([ 0.00108914,  0.01033349,  0.05946514,  0.20755375,  0.43939129,
+            0.56418958,  0.43939129,  0.20755375,  0.05946514,  0.01033349])
+    >>> fig1 = plt.figure()
+    >>> ax = fig1.add_subplot(111)
+    >>> ax.plot(x, y)
+    >>> plt.show()
+
+    Alternatively, the object may be called (as a function) to fix the mean
+    and covariance parameters, returning a "frozen" multivariate normal
+    random variable:
+
+    >>> rv = multivariate_normal(mean=None, cov=1, allow_singular=False)
+    >>> # Frozen object with the same methods but holding the given
+    >>> # mean and covariance fixed.
+
+    The input quantiles can be any shape of array, as long as the last
+    axis labels the components.  This allows us for instance to
+    display the frozen pdf for a non-isotropic random variable in 2D as
+    follows:
+
+    >>> x, y = np.mgrid[-1:1:.01, -1:1:.01]
+    >>> pos = np.dstack((x, y))
+    >>> rv = multivariate_normal([0.5, -0.2], [[2.0, 0.3], [0.3, 0.5]])
+    >>> fig2 = plt.figure()
+    >>> ax2 = fig2.add_subplot(111)
+    >>> ax2.contourf(x, y, rv.pdf(pos))
+
+    """  # noqa: E501
+
+    def __init__(self, seed=None):
+        """Set up the random state and expand the docstring placeholders."""
+        super().__init__(seed)
+        # Fill the placeholders of the class docstring using the
+        # substitutions in `mvn_docdict_params`.
+        template = self.__doc__
+        self.__doc__ = doccer.docformat(template, mvn_docdict_params)
+
+    def __call__(self, mean=None, cov=1, allow_singular=False, seed=None, **kwds):
+        """Return a frozen multivariate normal distribution.
+
+        All arguments are forwarded to `multivariate_normal_frozen`; see
+        that class for more information.
+        """
+        frozen_kwargs = dict(allow_singular=allow_singular, seed=seed, **kwds)
+        return multivariate_normal_frozen(mean, cov, **frozen_kwargs)
+
+    def _process_parameters(self, mean, cov, allow_singular=True):
+        """
+        Determine the dimensionality from the mean or the covariance and
+        return ``(dim, mean, cov_object)``, where ``cov_object`` satisfies
+        the `Covariance` interface.
+        """
+        if isinstance(cov, _covariance.Covariance):
+            return self._process_parameters_Covariance(mean, cov)
+
+        # Plain arrays were accepted as `cov` before the `Covariance`
+        # classes existed. The original input validation is applied to
+        # them so that the behavior for plain arrays is not disturbed.
+        dim, mean, cov = self._process_parameters_psd(None, mean, cov)
+
+        # The validated array is always processed with `_PSD` and wrapped
+        # in `CovViaPSD`, which satisfies the `Covariance` interface, so
+        # the methods need no branching on the kind of `cov` they were
+        # given.
+        cov_object = _covariance.CovViaPSD(
+            _PSD(cov, allow_singular=allow_singular))
+        return dim, mean, cov_object
+
+    def _process_parameters_Covariance(self, mean, cov):
+        """Process parameters when `cov` is a `Covariance`-like object.
+
+        The dimensionality is taken from the last entry of ``cov.shape``.
+        `mean` (a single zero if None) is broadcast to that length.
+
+        Returns
+        -------
+        dim, mean, cov
+            The dimensionality, the broadcast mean, and `cov` unchanged.
+
+        Raises
+        ------
+        ValueError
+            If `mean` cannot be broadcast to shape ``(dim,)``.
+        """
+        dim = cov.shape[-1]
+        mean = np.array([0.]) if mean is None else mean
+        # The first piece ends with a space so that the concatenated
+        # message reads "... dimensions, and so ...".
+        message = (f"`cov` represents a covariance matrix in {dim} "
+                   f"dimensions, and so `mean` must be broadcastable to "
+                   f"shape {(dim,)}")
+        try:
+            mean = np.broadcast_to(mean, dim)
+        except ValueError as e:
+            raise ValueError(message) from e
+        return dim, mean, cov
+
+    def _process_parameters_psd(self, dim, mean, cov):
+        """Validate array-valued `mean` and `cov`; expand them to full form.
+
+        When `dim` is None it is inferred from `mean` (its size) or, if
+        `mean` is None as well, from `cov`. A missing mean becomes a zero
+        vector and a missing covariance becomes 1. A scalar covariance is
+        expanded to a multiple of the identity and a vector to a diagonal
+        matrix.
+
+        Returns
+        -------
+        dim, mean, cov
+            The dimensionality, the mean as a float vector of length
+            `dim`, and the covariance as a float matrix.
+
+        Raises
+        ------
+        ValueError
+            If `dim` is not a scalar or if the shapes are inconsistent.
+        """
+        # Establish the dimensionality.
+        if dim is not None:
+            if not np.isscalar(dim):
+                raise ValueError("Dimension of random variable must be "
+                                 "a scalar.")
+        elif mean is not None:
+            mean = np.asarray(mean, dtype=float)
+            dim = mean.size
+        elif cov is None:
+            dim = 1
+        else:
+            cov = np.asarray(cov, dtype=float)
+            dim = 1 if cov.ndim < 2 else cov.shape[0]
+
+        # Substitute defaults and convert both parameters to float arrays.
+        mean = np.asarray(np.zeros(dim) if mean is None else mean,
+                          dtype=float)
+        cov = np.asarray(1.0 if cov is None else cov, dtype=float)
+
+        if dim == 1:
+            mean = mean.reshape(1)
+            cov = cov.reshape(1, 1)
+
+        if mean.ndim != 1 or mean.shape[0] != dim:
+            raise ValueError(f"Array 'mean' must be a vector of length {dim}.")
+
+        if cov.ndim > 2:
+            raise ValueError(f"Array 'cov' must be at most two-dimensional, "
+                             f"but cov.ndim = {cov.ndim}")
+        if cov.ndim == 2 and cov.shape != (dim, dim):
+            n_rows, n_cols = cov.shape
+            if n_rows == n_cols:
+                msg = (f"Dimension mismatch: array 'cov' is of shape {cov.shape}, "
+                       f"but 'mean' is a vector of length {len(mean)}.")
+            else:
+                msg = ("Array 'cov' must be square if it is two dimensional,"
+                       f" but cov.shape = {str(cov.shape)}.")
+            raise ValueError(msg)
+
+        # Expand the compact forms of the covariance.
+        if cov.ndim == 0:
+            cov = cov * np.eye(dim)
+        elif cov.ndim == 1:
+            cov = np.diag(cov)
+
+        return dim, mean, cov
+
+    def _process_quantiles(self, x, dim):
+        """
+        Convert `x` to a float array in which the last axis labels the
+        components of each data point.
+
+        A scalar becomes an array with one entry. A one-dimensional input
+        is treated as many points when `dim` is 1 and as a single point
+        otherwise. Arrays with more axes are returned as converted.
+        """
+        x = np.asarray(x, dtype=float)
+
+        if x.ndim == 0:
+            return x[np.newaxis]
+        if x.ndim == 1:
+            return x[:, np.newaxis] if dim == 1 else x[np.newaxis, :]
+        return x
+
+    def _logpdf(self, x, mean, cov_object):
+        """Log of the multivariate normal probability density function.
+
+        Parameters
+        ----------
+        x : ndarray
+            Points at which to evaluate the log of the probability
+            density function
+        mean : ndarray
+            Mean of the distribution
+        cov_object : Covariance
+            An object representing the Covariance matrix
+
+        Notes
+        -----
+        No argument checking is performed here, so this method should not
+        be called directly; use 'logpdf' instead.
+
+        """
+        log_pdet = cov_object.log_pdet
+        rank = cov_object.rank
+        deviation = x - mean
+        if deviation.ndim > 1:
+            # Add a trailing axis so that both quantities broadcast
+            # against the per-point Mahalanobis term.
+            log_pdet = log_pdet[..., np.newaxis]
+            rank = rank[..., np.newaxis]
+        whitened = cov_object.whiten(deviation)
+        # Squared Mahalanobis distance of every point.
+        maha = np.sum(np.square(whitened), axis=-1)
+        return -0.5 * (rank * _LOG_2PI + log_pdet + maha)
+
+    def logpdf(self, x, mean=None, cov=1, allow_singular=False):
+        """Log of the multivariate normal probability density function.
+
+        Parameters
+        ----------
+        x : array_like
+            Quantiles, with the last axis of `x` denoting the components.
+        %(_mvn_doc_default_callparams)s
+
+        Returns
+        -------
+        pdf : ndarray or scalar
+            Log of the probability density function evaluated at `x`.
+            Points outside the support of a rank-deficient covariance
+            are assigned ``-inf``.
+
+        Notes
+        -----
+        %(_mvn_doc_callparams_note)s
+
+        """
+        dim, mean, cov_object = self._process_parameters(mean, cov,
+                                                         allow_singular)
+        x = self._process_quantiles(x, dim)
+        out = self._logpdf(x, mean, cov_object)
+        if np.any(cov_object.rank < dim):
+            # Rank-deficient covariance: mask the points outside the
+            # support.
+            in_support = cov_object._support_mask(x - mean)
+            out[~in_support] = -np.inf
+        return _squeeze_output(out)
+
+    def pdf(self, x, mean=None, cov=1, allow_singular=False):
+        """Multivariate normal probability density function.
+
+        Parameters
+        ----------
+        x : array_like
+            Quantiles, with the last axis of `x` denoting the components.
+        %(_mvn_doc_default_callparams)s
+
+        Returns
+        -------
+        pdf : ndarray or scalar
+            Probability density function evaluated at `x`. Points outside
+            the support of a rank-deficient covariance are assigned zero.
+
+        Notes
+        -----
+        %(_mvn_doc_callparams_note)s
+
+        """
+        dim, mean, cov_object = self._process_parameters(mean, cov,
+                                                         allow_singular)
+        x = self._process_quantiles(x, dim)
+        log_density = self._logpdf(x, mean, cov_object)
+        out = np.exp(log_density)
+        if np.any(cov_object.rank < dim):
+            # Rank-deficient covariance: mask the points outside the
+            # support.
+            in_support = cov_object._support_mask(x - mean)
+            out[~in_support] = 0.0
+        return _squeeze_output(out)
+
+    def _cdf(self, x, mean, cov, maxpts, abseps, releps, lower_limit, rng):
+        """Multivariate normal cumulative distribution function.
+
+        Integrates the density over the box between `lower_limit` and `x`
+        for every point along the last axis, using `_qauto` with `_qmvn`.
+
+        Parameters
+        ----------
+        x : ndarray
+            Points at which to evaluate the cumulative distribution function.
+        mean : ndarray
+            Mean of the distribution
+        cov : array_like
+            Covariance matrix of the distribution
+        maxpts : integer
+            The maximum number of points to use for integration
+            (passed to `_qauto` as ``limit``).
+        abseps : float
+            Absolute error tolerance (passed to `_qauto` as ``error``).
+        releps : float
+            Relative error tolerance. Accepted as part of the signature
+            but not referenced in the body of this method.
+        lower_limit : array_like, optional
+            Lower limit of integration of the cumulative distribution function.
+            Default is negative infinity. Must be broadcastable with `x`.
+        rng : Generator
+            an instance of ``np.random.Generator``, which is used internally
+            for QMC integration.
+
+        Returns
+        -------
+        out : ndarray or scalar
+            Signed integral for each point, passed through `_squeeze_output`.
+
+        Notes
+        -----
+        As this function does no argument checking, it should not be
+        called directly; use 'cdf' instead.
+
+
+        .. versionadded:: 1.0.0
+
+        """
+        # Without an explicit lower limit, integrate from -inf in every
+        # component.
+        lower = (np.full(mean.shape, -np.inf)
+                 if lower_limit is None else lower_limit)
+        # In 2d, _mvn.mvnun accepts input in which `lower` bound elements
+        # are greater than `x`. Not so in other dimensions. Fix this by
+        # ensuring that lower bounds are indeed lower when passed, then
+        # set signs of resulting CDF manually.
+        # NOTE(review): the comment above names `_mvn.mvnun`, whereas the
+        # integration below goes through `_qauto`/`_qmvn`; the swap is
+        # applied regardless.
+        b, a = np.broadcast_arrays(x, lower)
+        b, a = b - mean, a - mean  # _qmvn only accepts zero mean
+        # Components in which the upper limit lies below the lower limit.
+        i_swap = b < a
+        signs = (-1)**(i_swap.sum(axis=-1))  # odd # of swaps -> negative
+        # Copy before the in-place swap: broadcast results may be views.
+        a, b = a.copy(), b.copy()
+        a[i_swap], b[i_swap] = b[i_swap], a[i_swap]
+        n = x.shape[-1]
+        # Pack lower and upper limits side by side along the last axis so
+        # that one 1-d slice carries both to `func1d`.
+        limits = np.concatenate((a, b), axis=-1)
+
+        # qmvn expects 1-d arguments, so process points sequentially
+        # XXX: if cov.ndim == 2 and limits.ndim == 1, can avoid apply_along_axis
+        def func1d(limits):
+            # First `n` entries are the lower limits, the rest the upper.
+            # res0 = _qmvn(maxpts, cov, limits[:n], limits[n:], rng)[0]
+            res = _qauto(_qmvn, cov, limits[:n], limits[n:],
+                         rng, error=abseps, limit=maxpts, n_batches=10)
+            # Only the first element of the result is used.
+            return np.squeeze(res[0])
+
+        # Restore the sign removed by swapping the limits.
+        out = np.apply_along_axis(func1d, -1, limits) * signs
+        return _squeeze_output(out)
+
+    def logcdf(self, x, mean=None, cov=1, allow_singular=False, maxpts=None,
+               abseps=1e-5, releps=1e-5, *, lower_limit=None, rng=None):
+        """Log of the multivariate normal cumulative distribution function.
+
+        Parameters
+        ----------
+        x : array_like
+            Quantiles, with the last axis of `x` denoting the components.
+        %(_mvn_doc_default_callparams)s
+        maxpts : integer, optional
+            The maximum number of points to use for integration
+            (default ``1000000*dim``)
+        abseps : float, optional
+            Absolute error tolerance (default 1e-5)
+        releps : float, optional
+            Relative error tolerance (default 1e-5)
+        lower_limit : array_like, optional
+            Lower limit of integration of the cumulative distribution function.
+            Default is negative infinity. Must be broadcastable with `x`.
+        rng : Generator, optional
+            an instance of ``np.random.Generator``, which is used internally
+            for QMC integration.
+
+        Returns
+        -------
+        cdf : ndarray or scalar
+            Log of the cumulative distribution function evaluated at `x`.
+            The result is complex when any computed value is negative.
+
+        Notes
+        -----
+        %(_mvn_doc_callparams_note)s
+
+        .. versionadded:: 1.0.0
+
+        """
+        dim, mean, cov_object = self._process_parameters(mean, cov,
+                                                         allow_singular)
+        cov = cov_object.covariance
+        x = self._process_quantiles(x, dim)
+        maxpts = maxpts or 1000000 * dim
+        rng = self._get_random_state(rng)
+
+        cdf = self._cdf(x, mean, cov, maxpts, abseps, releps, lower_limit, rng)
+        if np.any(cdf < 0):
+            # The cdf can be negative if a lower limit is greater than the
+            # upper limit; the log of a negative real is complex, so
+            # promote the values first.
+            cdf = cdf + 0j
+        return np.log(cdf)
+
+    def cdf(self, x, mean=None, cov=1, allow_singular=False, maxpts=None,
+            abseps=1e-5, releps=1e-5, *, lower_limit=None, rng=None):
+        """Multivariate normal cumulative distribution function.
+
+        Parameters
+        ----------
+        x : array_like
+            Quantiles, with the last axis of `x` denoting the components.
+        %(_mvn_doc_default_callparams)s
+        maxpts : integer, optional
+            The maximum number of points to use for integration
+            (default ``1000000*dim``)
+        abseps : float, optional
+            Absolute error tolerance (default 1e-5)
+        releps : float, optional
+            Relative error tolerance (default 1e-5)
+        lower_limit : array_like, optional
+            Lower limit of integration of the cumulative distribution function.
+            Default is negative infinity. Must be broadcastable with `x`.
+        rng : Generator, optional
+            an instance of ``np.random.Generator``, which is used internally
+            for QMC integration.
+
+        Returns
+        -------
+        cdf : ndarray or scalar
+            Cumulative distribution function evaluated at `x`
+
+        Notes
+        -----
+        %(_mvn_doc_callparams_note)s
+
+        .. versionadded:: 1.0.0
+
+        """
+        dim, mean, cov_object = self._process_parameters(mean, cov,
+                                                         allow_singular)
+        cov = cov_object.covariance
+        x = self._process_quantiles(x, dim)
+        # A missing (or zero) point budget falls back to the default.
+        maxpts = maxpts or 1000000 * dim
+        rng = self._get_random_state(rng)
+        return self._cdf(x, mean, cov, maxpts, abseps, releps, lower_limit,
+                         rng)
+
+    def rvs(self, mean=None, cov=1, size=1, random_state=None):
+        """Draw random samples from a multivariate normal distribution.
+
+        Parameters
+        ----------
+        %(_mvn_doc_default_callparams)s
+        size : integer, optional
+            Number of samples to draw (default 1).
+        %(_doc_random_state)s
+
+        Returns
+        -------
+        rvs : ndarray or scalar
+            Random variates of size (`size`, `N`), where `N` is the
+            dimension of the random variable.
+
+        Notes
+        -----
+        %(_mvn_doc_callparams_note)s
+
+        """
+        dim, mean, cov_object = self._process_parameters(mean, cov)
+        random_state = self._get_random_state(random_state)
+
+        if not isinstance(cov_object, _covariance.CovViaPSD):
+            # General `Covariance` object: draw standard normal variates
+            # and let the object colorize them.
+            if not size:
+                size = tuple()
+            if not np.iterable(size):
+                size = (size,)
+            sample_shape = tuple(size) + (cov_object.shape[-1],)
+            standard_normal = random_state.normal(size=sample_shape)
+            return mean + cov_object.colorize(standard_normal)
+
+        # Covariance that came from a plain array: delegate the sampling
+        # to the generator's own multivariate normal routine.
+        draws = random_state.multivariate_normal(
+            mean, cov_object.covariance, size)
+        return _squeeze_output(draws)
+
+    def entropy(self, mean=None, cov=1):
+        """Compute the differential entropy of the multivariate normal.
+
+        Parameters
+        ----------
+        %(_mvn_doc_default_callparams)s
+
+        Returns
+        -------
+        h : scalar
+            Entropy of the multivariate normal distribution
+
+        Notes
+        -----
+        %(_mvn_doc_callparams_note)s
+
+        """
+        # Only the covariance object enters the formula.
+        _, _, cov_object = self._process_parameters(mean, cov)
+        rank_term = cov_object.rank * (_LOG_2PI + 1)
+        return 0.5 * (rank_term + cov_object.log_pdet)
+
+    def fit(self, x, fix_mean=None, fix_cov=None):
+        """Fit a multivariate normal distribution to data.
+
+        Parameters
+        ----------
+        x : ndarray (m, n)
+            Data the distribution is fitted to. Must have two axes.
+            The first axis of length `m` represents the number of vectors
+            the distribution is fitted to. The second axis of length `n`
+            determines the dimensionality of the fitted distribution.
+        fix_mean : ndarray(n, )
+            Fixed mean vector. Must have length `n`. If given, it is
+            returned as the mean instead of an estimate.
+        fix_cov: ndarray (n, n)
+            Fixed covariance matrix. Must have shape ``(n, n)``. If given,
+            it is returned as the covariance instead of an estimate.
+
+        Returns
+        -------
+        mean : ndarray (n, )
+            Maximum likelihood estimate of the mean vector
+        cov : ndarray (n, n)
+            Maximum likelihood estimate of the covariance matrix
+
+        Raises
+        ------
+        ValueError
+            If `x` is not two-dimensional, if `fix_mean` or `fix_cov` has
+            the wrong shape, or if `fix_cov` is not positive semidefinite.
+
+        """
+        # input validation for data to be fitted
+        x = np.asarray(x)
+        if x.ndim != 2:
+            raise ValueError("`x` must be two-dimensional.")
+
+        n_vectors, dim = x.shape
+
+        # parameter estimation
+        # reference: https://home.ttic.edu/~shubhendu/Slides/Estimation.pdf
+        if fix_mean is None:
+            mean = x.mean(axis=0)
+        else:
+            mean = np.atleast_1d(fix_mean)
+            if mean.shape != (dim, ):
+                raise ValueError(
+                    "`fix_mean` must be a one-dimensional array the same "
+                    "length as the dimensionality of the vectors `x`.")
+
+        if fix_cov is None:
+            # Average of the outer products of the centered vectors
+            # (normalized by the number of vectors).
+            deviations = x - mean
+            cov = deviations.T @ deviations / n_vectors
+        else:
+            cov = np.atleast_2d(fix_cov)
+            # validate shape
+            if cov.shape != (dim, dim):
+                raise ValueError(
+                    "`fix_cov` must be a two-dimensional square array "
+                    "of same side length as the dimensionality of the "
+                    "vectors `x`.")
+            # validate positive semidefiniteness
+            # a trimmed down copy from _PSD
+            eigvals, _ = scipy.linalg.eigh(cov, lower=True,
+                                           check_finite=True)
+            tol = _eigvalsh_to_eps(eigvals)
+            if np.min(eigvals) < -tol:
+                raise ValueError(
+                    "`fix_cov` must be symmetric positive semidefinite.")
+        return mean, cov
+
+    def marginal(self, dimensions, mean=None, cov=1, allow_singular=False):
+        """Build the marginal distribution over a subset of dimensions.
+
+        Parameters
+        ----------
+        dimensions : int or 1-d array_like
+            The dimensions of the multivariate distribution corresponding
+            with the marginal variables, that is, the indices of the dimensions
+            that are being retained. The other dimensions are marginalized out.
+        %(_mvn_doc_default_callparams)s
+
+        Returns
+        -------
+        marginal_multivariate_normal : multivariate_normal_frozen
+            An object representing the marginal distribution.
+
+        Notes
+        -----
+        %(_mvn_doc_callparams_note)s
+        """
+        n, full_mean, cov_object = self._process_parameters(
+            mean, cov, allow_singular)
+        kept = _validate_marginal_input(dimensions, n)
+
+        # Keep the selected entries of the mean and the matching rows and
+        # columns of the covariance matrix.
+        sub_mean = full_mean[kept]
+        sub_cov = cov_object.covariance[np.ix_(kept, kept)]
+
+        return multivariate_normal_frozen(sub_mean, sub_cov, allow_singular)
+
+# Module-level instance of `multivariate_normal_gen`, created with the
+# generator's default constructor arguments.
+multivariate_normal = multivariate_normal_gen()
+
+
+class multivariate_normal_frozen(multi_rv_frozen):
+    # Frozen multivariate normal: the parameters are processed once in
+    # `__init__` and every method delegates to a private
+    # `multivariate_normal_gen` instance held in `self._dist`.
+    # The docstrings of logpdf, pdf, logcdf, cdf and rvs are assigned at
+    # module level after the class body, so none are written inline here.
+    __class_getitem__ = None
+
+    def __init__(self, mean=None, cov=1, allow_singular=False, seed=None,
+                 maxpts=None, abseps=1e-5, releps=1e-5):
+        """Create a frozen multivariate normal distribution.
+
+        Parameters
+        ----------
+        mean : array_like, default: ``[0]``
+            Mean of the distribution.
+        cov : array_like, default: ``[1]``
+            Symmetric positive (semi)definite covariance matrix of the
+            distribution.
+        allow_singular : bool, default: ``False``
+            Whether to allow a singular covariance matrix.
+        seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
+            If `seed` is None (or `np.random`), the `numpy.random.RandomState`
+            singleton is used.
+            If `seed` is an int, a new ``RandomState`` instance is used,
+            seeded with `seed`.
+            If `seed` is already a ``Generator`` or ``RandomState`` instance
+            then that instance is used.
+        maxpts : integer, optional
+            The maximum number of points to use for integration of the
+            cumulative distribution function (default ``1000000*dim``)
+        abseps : float, optional
+            Absolute error tolerance for the cumulative distribution function
+            (default 1e-5)
+        releps : float, optional
+            Relative error tolerance for the cumulative distribution function
+            (default 1e-5)
+
+        Examples
+        --------
+        When called with the default parameters, this will create a 1D random
+        variable with mean 0 and covariance 1:
+
+        >>> from scipy.stats import multivariate_normal
+        >>> r = multivariate_normal()
+        >>> r.mean
+        array([ 0.])
+        >>> r.cov
+        array([[1.]])
+
+        """ # numpy/numpydoc#87  # noqa: E501
+        dist = multivariate_normal_gen(seed)
+        self._dist = dist
+        dim, mean, cov_object = dist._process_parameters(
+            mean, cov, allow_singular)
+        self.dim = dim
+        self.mean = mean
+        self.cov_object = cov_object
+        # The covariance object may itself permit singular matrices even
+        # when the caller did not ask for it explicitly.
+        self.allow_singular = allow_singular or cov_object._allow_singular
+        # A falsy `maxpts` (None or 0) selects the dimension-scaled default.
+        self.maxpts = maxpts if maxpts else 1000000 * dim
+        self.abseps = abseps
+        self.releps = releps
+
+    @property
+    def cov(self):
+        # Covariance matrix exposed by the stored covariance object.
+        return self.cov_object.covariance
+
+    def logpdf(self, x):
+        dist = self._dist
+        cov_object = self.cov_object
+        x = dist._process_quantiles(x, self.dim)
+        out = dist._logpdf(x, self.mean, cov_object)
+        if np.any(cov_object.rank < self.dim):
+            # Rank-deficient covariance: points rejected by the support
+            # mask get a log-density of -inf.
+            outside = ~cov_object._support_mask(x-self.mean)
+            out[outside] = -np.inf
+        return _squeeze_output(out)
+
+    def pdf(self, x):
+        log_density = self.logpdf(x)
+        return np.exp(log_density)
+
+    def logcdf(self, x, *, lower_limit=None, rng=None):
+        cdf = self.cdf(x, lower_limit=lower_limit, rng=rng)
+        # the log of a negative real is complex, and cdf can be negative
+        # if lower limit is greater than upper limit
+        if np.any(cdf < 0):
+            cdf = cdf + 0j
+        return np.log(cdf)
+
+    def cdf(self, x, *, lower_limit=None, rng=None):
+        dist = self._dist
+        x = dist._process_quantiles(x, self.dim)
+        rng = dist._get_random_state(rng)
+        out = dist._cdf(x, self.mean, self.cov_object.covariance,
+                        self.maxpts, self.abseps, self.releps,
+                        lower_limit, rng)
+        return _squeeze_output(out)
+
+    def rvs(self, size=1, random_state=None):
+        mean, cov_object = self.mean, self.cov_object
+        return self._dist.rvs(mean, cov_object, size, random_state)
+
+    def entropy(self):
+        """Return the differential entropy of the multivariate normal.
+
+        Returns
+        -------
+        h : scalar
+            Entropy of the multivariate normal distribution
+
+        """
+        cov_object = self.cov_object
+        log_pdet = cov_object.log_pdet
+        rank = cov_object.rank
+        return 0.5 * (rank * (_LOG_2PI + 1) + log_pdet)
+
+    def marginal(self, dimensions):
+        # Delegates to the generator with the frozen mean, covariance
+        # object and `allow_singular` flag.
+        dist = self._dist
+        return dist.marginal(dimensions, self.mean,
+                             self.cov_object, self.allow_singular)
+
+# Set frozen generator docstrings from corresponding docstrings in
+# multivariate_normal_gen and fill in default strings in class docstrings
+for name in ['logpdf', 'pdf', 'logcdf', 'cdf', 'rvs']:
+    method = multivariate_normal_gen.__dict__[name]
+    method_frozen = multivariate_normal_frozen.__dict__[name]
+    # Order matters: the frozen docstring is built from the generator
+    # method's docstring as it stands on entry to this iteration, and only
+    # afterwards is the generator's own docstring overwritten with the
+    # version formatted against `mvn_docdict_params`.
+    method_frozen.__doc__ = doccer.docformat(method.__doc__,
+                                             mvn_docdict_noparams)
+    method.__doc__ = doccer.docformat(method.__doc__, mvn_docdict_params)
+
+# Shared docstring fragments for the matrix normal distribution. They are
+# substituted for the ``%(...)s`` placeholders of the same name via the two
+# dictionaries defined below.
+_matnorm_doc_default_callparams = """\
+mean : array_like, optional
+    Mean of the distribution (default: `None`)
+rowcov : array_like, optional
+    Among-row covariance matrix of the distribution (default: ``1``)
+colcov : array_like, optional
+    Among-column covariance matrix of the distribution (default: ``1``)
+"""
+
+_matnorm_doc_callparams_note = """\
+If `mean` is set to `None` then a matrix of zeros is used for the mean.
+The dimensions of this matrix are inferred from the shape of `rowcov` and
+`colcov`, if these are provided, or set to ``1`` if ambiguous.
+
+`rowcov` and `colcov` can be two-dimensional array_likes specifying the
+covariance matrices directly. Alternatively, a one-dimensional array will
+be interpreted as the entries of a diagonal matrix, and a scalar or
+zero-dimensional array will be interpreted as this value times the
+identity matrix.
+"""
+
+# The frozen variants carry no parameter text.
+_matnorm_doc_frozen_callparams = ""
+
+_matnorm_doc_frozen_callparams_note = """\
+See class definition for a detailed description of parameters."""
+
+# Substitutions that include the parameter descriptions.
+matnorm_docdict_params = {
+    '_matnorm_doc_default_callparams': _matnorm_doc_default_callparams,
+    '_matnorm_doc_callparams_note': _matnorm_doc_callparams_note,
+    '_doc_random_state': _doc_random_state
+}
+
+# Substitutions that omit the parameter descriptions.
+matnorm_docdict_noparams = {
+    '_matnorm_doc_default_callparams': _matnorm_doc_frozen_callparams,
+    '_matnorm_doc_callparams_note': _matnorm_doc_frozen_callparams_note,
+    '_doc_random_state': _doc_random_state
+}
+
+
+class matrix_normal_gen(multi_rv_generic):
+    r"""A matrix normal random variable.
+
+    The `mean` keyword specifies the mean. The `rowcov` keyword specifies the
+    among-row covariance matrix. The `colcov` keyword specifies the
+    among-column covariance matrix.
+
+    Methods
+    -------
+    pdf(X, mean=None, rowcov=1, colcov=1)
+        Probability density function.
+    logpdf(X, mean=None, rowcov=1, colcov=1)
+        Log of the probability density function.
+    rvs(mean=None, rowcov=1, colcov=1, size=1, random_state=None)
+        Draw random samples.
+    entropy(rowcov=1, colcov=1)
+        Differential entropy.
+
+    Parameters
+    ----------
+    %(_matnorm_doc_default_callparams)s
+    %(_doc_random_state)s
+
+    Notes
+    -----
+    %(_matnorm_doc_callparams_note)s
+
+    The covariance matrices specified by `rowcov` and `colcov` must be
+    (symmetric) positive definite. If the samples in `X` are
+    :math:`m \times n`, then `rowcov` must be :math:`m \times m` and
+    `colcov` must be :math:`n \times n`. `mean` must be the same shape as `X`.
+
+    The probability density function for `matrix_normal` is
+
+    .. math::
+
+        f(X) = (2 \pi)^{-\frac{mn}{2}}|U|^{-\frac{n}{2}} |V|^{-\frac{m}{2}}
+               \exp\left( -\frac{1}{2} \mathrm{Tr}\left[ U^{-1} (X-M) V^{-1}
+               (X-M)^T \right] \right),
+
+    where :math:`M` is the mean, :math:`U` the among-row covariance matrix,
+    :math:`V` the among-column covariance matrix.
+
+    The `allow_singular` behaviour of the `multivariate_normal`
+    distribution is not currently supported. Covariance matrices must be
+    full rank.
+
+    The `matrix_normal` distribution is closely related to the
+    `multivariate_normal` distribution. Specifically, :math:`\mathrm{Vec}(X)`
+    (the vector formed by concatenating the columns  of :math:`X`) has a
+    multivariate normal distribution with mean :math:`\mathrm{Vec}(M)`
+    and covariance :math:`V \otimes U` (where :math:`\otimes` is the Kronecker
+    product). Sampling and pdf evaluation are
+    :math:`\mathcal{O}(m^3 + n^3 + m^2 n + m n^2)` for the matrix normal, but
+    :math:`\mathcal{O}(m^3 n^3)` for the equivalent multivariate normal,
+    making this equivalent form algorithmically inefficient.
+
+    .. versionadded:: 0.17.0
+
+    Examples
+    --------
+
+    >>> import numpy as np
+    >>> from scipy.stats import matrix_normal
+
+    >>> M = np.arange(6).reshape(3,2); M
+    array([[0, 1],
+           [2, 3],
+           [4, 5]])
+    >>> U = np.diag([1,2,3]); U
+    array([[1, 0, 0],
+           [0, 2, 0],
+           [0, 0, 3]])
+    >>> V = 0.3*np.identity(2); V
+    array([[ 0.3,  0. ],
+           [ 0. ,  0.3]])
+    >>> X = M + 0.1; X
+    array([[ 0.1,  1.1],
+           [ 2.1,  3.1],
+           [ 4.1,  5.1]])
+    >>> matrix_normal.pdf(X, mean=M, rowcov=U, colcov=V)
+    0.023410202050005054
+
+    >>> # Equivalent multivariate normal
+    >>> from scipy.stats import multivariate_normal
+    >>> vectorised_X = X.T.flatten()
+    >>> equiv_mean = M.T.flatten()
+    >>> equiv_cov = np.kron(V,U)
+    >>> multivariate_normal.pdf(vectorised_X, mean=equiv_mean, cov=equiv_cov)
+    0.023410202050005054
+
+    Alternatively, the object may be called (as a function) to fix the mean
+    and covariance parameters, returning a "frozen" matrix normal
+    random variable:
+
+    >>> rv = matrix_normal(mean=None, rowcov=1, colcov=1)
+    >>> # Frozen object with the same methods but holding the given
+    >>> # mean and covariance fixed.
+
+    """
+
+    def __init__(self, seed=None):
+        """Initialise the generator and expand the docstring template.
+
+        `seed` is forwarded unchanged to the base-class initialiser. The
+        docstring is then formatted against `matnorm_docdict_params` and
+        stored on the instance.
+        """
+        super().__init__(seed)
+        expanded_doc = doccer.docformat(self.__doc__, matnorm_docdict_params)
+        self.__doc__ = expanded_doc
+
+    def __call__(self, mean=None, rowcov=1, colcov=1, seed=None):
+        """Return a frozen matrix normal distribution.
+
+        All arguments are handed on to `matrix_normal_frozen`; see that
+        class for more information.
+
+        """
+        frozen = matrix_normal_frozen(mean=mean, rowcov=rowcov,
+                                      colcov=colcov, seed=seed)
+        return frozen
+
+    def _process_parameters(self, mean, rowcov, colcov):
+        """Validate the parameters and bring them to a canonical form.
+
+        Infer dimensionality from mean or covariance matrices. Handle
+        defaults. Ensure compatible dimensions.
+
+        Parameters
+        ----------
+        mean : array_like or None
+            Mean matrix; must be two-dimensional with no zero-length axis.
+            If None, a matrix of zeros of shape ``(numrows, numcols)`` is
+            used.
+        rowcov, colcov : array_like
+            Among-row / among-column covariance. A scalar is multiplied by
+            an identity matrix sized from `mean` (or ``1 x 1`` when `mean`
+            is None), a 1-D array is placed on a diagonal, and a 2-D array
+            is used as is.
+
+        Returns
+        -------
+        dims : tuple
+            ``(numrows, numcols)``.
+        mean, rowcov, colcov : ndarray
+            The parameters as float arrays; both covariances are 2-D.
+
+        Raises
+        ------
+        ValueError
+            If `mean` is not 2-D or has a zero-length axis, if a covariance
+            is not square 2-D or is empty, or if the shapes of `mean` and
+            the covariances disagree.
+        """
+
+        # Process mean
+        if mean is not None:
+            mean = np.asarray(mean, dtype=float)
+            meanshape = mean.shape
+            if len(meanshape) != 2:
+                raise ValueError("Array `mean` must be two dimensional.")
+            # `meanshape` is a tuple, so comparing it to 0 with ``==`` is a
+            # plain (always False) tuple/int comparison; membership is the
+            # test that actually detects a zero-length axis.
+            if 0 in meanshape:
+                raise ValueError("Array `mean` has invalid shape.")
+
+        # Process among-row covariance
+        rowcov = np.asarray(rowcov, dtype=float)
+        if rowcov.ndim == 0:
+            if mean is not None:
+                rowcov = rowcov * np.identity(meanshape[0])
+            else:
+                rowcov = rowcov * np.identity(1)
+        elif rowcov.ndim == 1:
+            rowcov = np.diag(rowcov)
+        rowshape = rowcov.shape
+        if len(rowshape) != 2:
+            raise ValueError("`rowcov` must be a scalar or a 2D array.")
+        if rowshape[0] != rowshape[1]:
+            raise ValueError("Array `rowcov` must be square.")
+        if rowshape[0] == 0:
+            raise ValueError("Array `rowcov` has invalid shape.")
+        numrows = rowshape[0]
+
+        # Process among-column covariance
+        colcov = np.asarray(colcov, dtype=float)
+        if colcov.ndim == 0:
+            if mean is not None:
+                colcov = colcov * np.identity(meanshape[1])
+            else:
+                colcov = colcov * np.identity(1)
+        elif colcov.ndim == 1:
+            colcov = np.diag(colcov)
+        colshape = colcov.shape
+        if len(colshape) != 2:
+            raise ValueError("`colcov` must be a scalar or a 2D array.")
+        if colshape[0] != colshape[1]:
+            raise ValueError("Array `colcov` must be square.")
+        if colshape[0] == 0:
+            raise ValueError("Array `colcov` has invalid shape.")
+        numcols = colshape[0]
+
+        # Ensure mean and covariances compatible
+        if mean is not None:
+            if meanshape[0] != numrows:
+                raise ValueError("Arrays `mean` and `rowcov` must have the "
+                                 "same number of rows.")
+            if meanshape[1] != numcols:
+                raise ValueError("Arrays `mean` and `colcov` must have the "
+                                 "same number of columns.")
+        else:
+            mean = np.zeros((numrows, numcols))
+
+        dims = (numrows, numcols)
+
+        return dims, mean, rowcov, colcov
+
+    def _process_quantiles(self, X, dims):
+        """
+        Convert the quantiles to a float array whose last two axes index the
+        components of each data point; a single 2-D matrix gains a leading
+        axis of length one. Raise ValueError if the trailing two axes do not
+        equal `dims`.
+        """
+        quantiles = np.asarray(X, dtype=float)
+        if quantiles.ndim == 2:
+            quantiles = quantiles[np.newaxis, :]
+        trailing_shape = quantiles.shape[-2:]
+        if trailing_shape != dims:
+            raise ValueError("The shape of array `X` is not compatible "
+                             "with the distribution parameters.")
+        return quantiles
+
+    def _logpdf(self, dims, X, mean, row_prec_rt, log_det_rowcov,
+                col_prec_rt, log_det_colcov):
+        """Evaluate the log-density of the matrix normal without validation.
+
+        Parameters
+        ----------
+        dims : tuple
+            Dimensions of the matrix variates
+        X : ndarray
+            Points at which to evaluate the log of the probability
+            density function
+        mean : ndarray
+            Mean of the distribution
+        row_prec_rt : ndarray
+            A decomposition such that np.dot(row_prec_rt, row_prec_rt.T)
+            is the inverse of the among-row covariance matrix
+        log_det_rowcov : float
+            Logarithm of the determinant of the among-row covariance matrix
+        col_prec_rt : ndarray
+            A decomposition such that np.dot(col_prec_rt, col_prec_rt.T)
+            is the inverse of the among-column covariance matrix
+        log_det_colcov : float
+            Logarithm of the determinant of the among-column covariance matrix
+
+        Notes
+        -----
+        As this function does no argument checking, it should not be
+        called directly; use 'logpdf' instead.
+
+        """
+        n_rows, n_cols = dims
+        # Move the column axis of the deviations to the front so that the
+        # row factor can be applied with `dot` and the column factor with
+        # `tensordot` over a single axis.
+        deviation = np.moveaxis(X-mean, -1, 0)
+        row_scaled = np.dot(deviation, row_prec_rt)
+        whitened = np.tensordot(col_prec_rt.T, row_scaled, 1)
+        # Sum of squares over the last and the first axis of the result.
+        maha = np.sum(np.sum(np.square(whitened), axis=-1), axis=0)
+        log_norm = (n_rows*n_cols*_LOG_2PI + n_cols*log_det_rowcov
+                    + n_rows*log_det_colcov)
+        return -0.5 * (log_norm + maha)
+
+    def logpdf(self, X, mean=None, rowcov=1, colcov=1):
+        """Evaluate the log of the matrix normal density at `X`.
+
+        Parameters
+        ----------
+        X : array_like
+            Quantiles, with the last two axes of `X` denoting the components.
+        %(_matnorm_doc_default_callparams)s
+
+        Returns
+        -------
+        logpdf : ndarray
+            Log of the probability density function evaluated at `X`
+
+        Notes
+        -----
+        %(_matnorm_doc_callparams_note)s
+
+        """
+        params = self._process_parameters(mean, rowcov, colcov)
+        dims, mean, rowcov, colcov = params
+        X = self._process_quantiles(X, dims)
+        # Both covariances are decomposed with singular matrices disallowed.
+        row_psd = _PSD(rowcov, allow_singular=False)
+        col_psd = _PSD(colcov, allow_singular=False)
+        log_density = self._logpdf(dims, X, mean,
+                                   row_prec_rt=row_psd.U,
+                                   log_det_rowcov=row_psd.log_pdet,
+                                   col_prec_rt=col_psd.U,
+                                   log_det_colcov=col_psd.log_pdet)
+        return _squeeze_output(log_density)
+
+    def pdf(self, X, mean=None, rowcov=1, colcov=1):
+        """Evaluate the matrix normal density at `X`.
+
+        Parameters
+        ----------
+        X : array_like
+            Quantiles, with the last two axes of `X` denoting the components.
+        %(_matnorm_doc_default_callparams)s
+
+        Returns
+        -------
+        pdf : ndarray
+            Probability density function evaluated at `X`
+
+        Notes
+        -----
+        %(_matnorm_doc_callparams_note)s
+
+        """
+        # The density is the exponential of the log-density.
+        log_density = self.logpdf(X, mean=mean, rowcov=rowcov, colcov=colcov)
+        return np.exp(log_density)
+
+    def rvs(self, mean=None, rowcov=1, colcov=1, size=1, random_state=None):
+        """Sample random matrices from a matrix normal distribution.
+
+        Parameters
+        ----------
+        %(_matnorm_doc_default_callparams)s
+        size : integer, optional
+            Number of samples to draw (default 1).
+        %(_doc_random_state)s
+
+        Returns
+        -------
+        rvs : ndarray or scalar
+            Random variates of size (`size`, `dims`), where `dims` is the
+            dimension of the random matrices.
+
+        Notes
+        -----
+        %(_matnorm_doc_callparams_note)s
+
+        """
+        size = int(size)
+        params = self._process_parameters(mean, rowcov, colcov)
+        dims, mean, rowcov, colcov = params
+        n_rows, n_cols = dims
+        row_factor = scipy.linalg.cholesky(rowcov, lower=True)
+        col_factor = scipy.linalg.cholesky(colcov, lower=True)
+        rng = self._get_random_state(random_state)
+        # We aren't generating standard normal variates with size=(size,
+        # dims[0], dims[1]) directly to ensure random variates remain backwards
+        # compatible. See https://github.com/scipy/scipy/pull/12312 for more
+        # details.
+        raw = rng.standard_normal(size=(n_cols, size, n_rows))
+        std_norm = raw.transpose(1, 2, 0)
+        coloured = np.einsum('jp,ipq,kq->ijk',
+                             row_factor, std_norm, col_factor,
+                             optimize=True)
+        samples = mean + coloured
+        if size == 1:
+            # A single draw is returned with the shape of the mean itself.
+            samples = samples.reshape(mean.shape)
+        return samples
+
+    def entropy(self, rowcov=1, colcov=1):
+        """Compute the differential entropy of the matrix normal.
+
+        Parameters
+        ----------
+        rowcov : array_like, optional
+            Among-row covariance matrix of the distribution (default: ``1``)
+        colcov : array_like, optional
+            Among-column covariance matrix of the distribution (default: ``1``)
+
+        Returns
+        -------
+        entropy : float
+            Entropy of the distribution
+
+        Notes
+        -----
+        %(_matnorm_doc_callparams_note)s
+
+        """
+        # Pass ``mean=None`` so that `_process_parameters` derives the
+        # dimensions from the covariances alone. Reading ``.shape`` off the
+        # raw arguments would fail for the scalar defaults and for plain
+        # lists, neither of which has a ``shape`` attribute.
+        dims, _, rowcov, colcov = self._process_parameters(None,
+                                                           rowcov,
+                                                           colcov)
+        rowpsd = _PSD(rowcov, allow_singular=False)
+        colpsd = _PSD(colcov, allow_singular=False)
+
+        return self._entropy(dims, rowpsd.log_pdet, colpsd.log_pdet)
+
+    def _entropy(self, dims, row_cov_logdet, col_cov_logdet):
+        # Entropy from the dimensions and the two covariance log-determinants:
+        # a constant term plus each log-determinant weighted by half the size
+        # of the opposite axis. No argument checking is done here.
+        n_rows, n_cols = dims
+        const_term = 0.5 * n_rows * n_cols * (1 + _LOG_2PI)
+        row_term = 0.5 * n_cols * row_cov_logdet
+        col_term = 0.5 * n_rows * col_cov_logdet
+        return const_term + row_term + col_term
+
+
+# Module-level instance of `matrix_normal_gen`, created with the default
+# ``seed=None``.
+matrix_normal = matrix_normal_gen()
+
+
+class matrix_normal_frozen(multi_rv_frozen):
+    """
+    Create a frozen matrix normal distribution.
+
+    Parameters
+    ----------
+    %(_matnorm_doc_default_callparams)s
+    seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
+        If `seed` is `None` the `~np.random.RandomState` singleton is used.
+        If `seed` is an int, a new ``RandomState`` instance is used, seeded
+        with seed.
+        If `seed` is already a ``RandomState`` or ``Generator`` instance,
+        then that object is used.
+        Default is `None`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import matrix_normal
+
+    >>> distn = matrix_normal(mean=np.zeros((3,3)))
+    >>> X = distn.rvs(); X
+    array([[-0.02976962,  0.93339138, -0.09663178],
+           [ 0.67405524,  0.28250467, -0.93308929],
+           [-0.31144782,  0.74535536,  1.30412916]])
+    >>> distn.pdf(X)
+    2.5160642368346784e-05
+    >>> distn.logpdf(X)
+    -10.590229595124615
+    """
+    __class_getitem__ = None
+
+    def __init__(self, mean=None, rowcov=1, colcov=1, seed=None):
+        # Process the parameters once and decompose both covariances up
+        # front (singular matrices disallowed) so the methods can reuse them.
+        dist = matrix_normal_gen(seed)
+        self._dist = dist
+        dims, mean, rowcov, colcov = dist._process_parameters(
+            mean, rowcov, colcov)
+        self.dims = dims
+        self.mean = mean
+        self.rowcov = rowcov
+        self.colcov = colcov
+        self.rowpsd = _PSD(rowcov, allow_singular=False)
+        self.colpsd = _PSD(colcov, allow_singular=False)
+
+    # The docstrings of the four methods below are assigned at module level
+    # after the class body.
+
+    def logpdf(self, X):
+        dist = self._dist
+        row_psd, col_psd = self.rowpsd, self.colpsd
+        X = dist._process_quantiles(X, self.dims)
+        log_density = dist._logpdf(self.dims, X, self.mean, row_psd.U,
+                                   row_psd.log_pdet, col_psd.U,
+                                   col_psd.log_pdet)
+        return _squeeze_output(log_density)
+
+    def pdf(self, X):
+        log_density = self.logpdf(X)
+        return np.exp(log_density)
+
+    def rvs(self, size=1, random_state=None):
+        dist = self._dist
+        return dist.rvs(self.mean, self.rowcov, self.colcov, size,
+                        random_state)
+
+    def entropy(self):
+        row_logdet = self.rowpsd.log_pdet
+        col_logdet = self.colpsd.log_pdet
+        return self._dist._entropy(self.dims, row_logdet, col_logdet)
+
+# Set frozen generator docstrings from corresponding docstrings in
+# matrix_normal_gen and fill in default strings in class docstrings
+for name in ['logpdf', 'pdf', 'rvs', 'entropy']:
+    method = matrix_normal_gen.__dict__[name]
+    method_frozen = matrix_normal_frozen.__dict__[name]
+    # Order matters: the frozen docstring is built from the generator
+    # method's docstring as it stands on entry to this iteration, and only
+    # afterwards is the generator's own docstring overwritten with the
+    # version formatted against `matnorm_docdict_params`.
+    method_frozen.__doc__ = doccer.docformat(method.__doc__,
+                                             matnorm_docdict_noparams)
+    method.__doc__ = doccer.docformat(method.__doc__, matnorm_docdict_params)
+
+
+# Shared docstring fragments for the matrix t distribution. They are
+# substituted for the ``%(...)s`` placeholders of the same name via the two
+# dictionaries defined below.
+_matt_doc_default_callparams = """\
+mean : array_like, optional
+    Mean of the distribution (default: `None`)
+row_spread : array_like, optional
+    Row-wise 2nd order raw central moment matrix (default: ``1``)
+col_spread : array_like, optional
+    Column-wise 2nd order raw central moment matrix (default: ``1``)
+df : scalar, optional
+    Degrees of freedom (default: ``1``)
+"""
+
+_matt_doc_callparams_note = """\
+If `mean` is set to `None` then a matrix of zeros is used for the mean.
+The dimensions of this matrix are inferred from the shape of `row_spread` and
+`col_spread`, if these are provided, or set to ``1`` if ambiguous.
+
+`row_spread` and `col_spread` can be two-dimensional array_likes specifying the
+spread matrices directly. Alternatively, a one-dimensional array will
+be interpreted as the entries of a diagonal matrix, and a scalar or
+zero-dimensional array will be interpreted as this value times the
+identity matrix.
+"""
+
+# The frozen variants carry no parameter text.
+_matt_doc_frozen_callparams = ""
+
+_matt_doc_frozen_callparams_note = """\
+See class definition for a detailed description of parameters."""
+
+# Substitutions that include the parameter descriptions.
+matrix_t_docdict_params = {
+    "_matt_doc_default_callparams": _matt_doc_default_callparams,
+    "_matt_doc_callparams_note": _matt_doc_callparams_note,
+    "_doc_random_state": _doc_random_state,
+}
+
+# Substitutions that omit the parameter descriptions.
+matrix_t_docdict_noparams = {
+    "_matt_doc_default_callparams": _matt_doc_frozen_callparams,
+    "_matt_doc_callparams_note": _matt_doc_frozen_callparams_note,
+    "_doc_random_state": _doc_random_state,
+}
+
+
+class matrix_t_gen(multi_rv_generic):
+    r"""A matrix t-random variable.
+
+    The `mean` keyword specifies the mean.
+    The `row_spread` keyword specifies the row-wise spread matrix.
+    The `col_spread` keyword specifies the column-wise spread matrix.
+
+    Methods
+    -------
+    pdf(x, mean=None, row_spread=None, col_spread=None)
+        Probability density function.
+    logpdf(x, mean=None, row_spread=None, col_spread=None)
+        Log of the probability density function.
+    rvs(mean=None, row_spread=1, col_spread=1, df=1, size=1, random_state=None)
+        Draw random samples.
+
+    Parameters
+    ----------
+    %(_matt_doc_default_callparams)s
+    %(_doc_random_state)s
+
+    Notes
+    -----
+    %(_matt_doc_callparams_note)s
+
+    The spread matrices specified by `row_spread` and `col_spread` must be
+    (symmetric) positive definite. If the samples in `X` have shape `(m,n)`
+    then `row_spread` must have shape `(m,m)` and `col_spread` must have shape `(n,n)`.
+    Spread matrices must be full rank.
+
+    The probability density function for `matrix_t` is
+
+    .. math::
+
+        f(X \vert \mathrm{M}, \Sigma, \Omega, \mathrm{df}) =
+        \frac{
+            \Gamma_n \left(
+                \frac{\mathrm{df} + m + n - 1}{2}
+            \right)
+            \left(
+            \det \left(
+                I_n + (X - \mathrm{M})^T \Sigma^{-1} (X - \mathrm{M}) \Omega^{-1}
+            \right)
+        \right)^{ -\frac{\mathrm{df} + m + n - 1}{2} }
+        }{
+            \Gamma_n \left(
+                \frac{\mathrm{df} + n - 1}{2}
+            \right)
+            \pi^{mn / 2}
+            \left( \det \Sigma \right)^{n/2}
+            \left( \det \Omega \right)^{m/2}
+        }
+
+    or, alternatively,
+
+    .. math::
+
+        f(X \vert \mathrm{M}, \Sigma, \Omega, \mathrm{df}) =
+        \frac{
+            \Gamma_m \left(
+                \frac{\mathrm{df} + m + n - 1}{2}
+            \right)
+            \left(
+                \det \left(
+                    I_m + \Sigma^{-1} (X - \mathrm{M}) \Omega^{-1} (X - \mathrm{M})^T
+                    \right)
+            \right)^{ -\frac{\mathrm{df} + m + n - 1}{2} }
+        }{
+            \Gamma_m \left(
+                \frac{\mathrm{df} + m - 1}{2}
+            \right)
+            \pi^{mn / 2}
+            \left( \det \Sigma \right)^{n/2}
+            \left( \det \Omega \right)^{m/2}
+        }
+
+    where :math:`\mathrm{M}` is the mean,
+    :math:`\Sigma` is the row-wise spread matrix,
+    :math:`\Omega` is the column-wise spread matrix,
+    :math:`\mathrm{df}` is the degrees of freedom,
+    and :math:`\Gamma_n` is the multivariate gamma function.
+
+    These equivalent formulations come from the identity
+
+    .. math::
+
+        \det\left( I_m + A B \right) = \det\left( I_n + B A \right)
+
+    for :math:`m \times n` arrays :math:`A` and :math:`B^T`
+    and the fact that
+    :math:`\gamma_n(\mathrm{df} + m) / \gamma_n(\mathrm{df})`
+    is equal to
+    :math:`\gamma_m(\mathrm{df} + n) / \gamma_m(\mathrm{df})`,
+    where
+
+    .. math::
+
+        \gamma_m(\mathrm{df}) = 2^{m(m-1)/2}
+        \Gamma_m\left( (\mathrm{df} + m - 1) / 2 \right)
+
+    denotes a normalized multivariate gamma function.
+
+    When :math:`\mathrm{df} = 1` this distribution is known as the matrix
+    variate Cauchy.
+
+    .. versionadded:: 1.17.0
+
+    References
+    ----------
+    .. [1] Gupta, A.K., & Nagar, D.K. (2000). Matrix Variate Distributions (1st ed.).
+           Chapman and Hall/CRC.
+
+    Examples
+    --------
+
+    >>> import numpy as np
+    >>> from scipy.stats import matrix_t
+    >>> M = np.arange(6).reshape(3,2)
+    >>> M
+    array([[0, 1],
+           [2, 3],
+           [4, 5]])
+    >>> Sigma = np.diag([1,2,3])
+    >>> Sigma
+    array([[1, 0, 0],
+           [0, 2, 0],
+           [0, 0, 3]])
+    >>> Omega = 0.3*np.identity(2)
+    >>> Omega
+    array([[ 0.3,  0. ],
+           [ 0. ,  0.3]])
+    >>> X = M + 0.1
+    >>> X
+    array([[ 0.1,  1.1],
+           [ 2.1,  3.1],
+           [ 4.1,  5.1]])
+    >>> df = 3
+    >>> matrix_t.pdf(X, mean=M, row_spread=Sigma, col_spread=Omega, df=df)
+    0.9972880280135796
+
+    Alternatively, the object may be called (as a function) to fix the mean
+    and spread parameters, returning a "frozen" matrix t
+    random variable:
+
+    >>> rv = matrix_t(mean=None, row_spread=1, col_spread=1, df=1)
+    >>> # Frozen object with the same methods but holding the given
+    >>> # mean and spreads and degrees of freedom fixed.
+    """
+
+    def __init__(self, seed=None):
+        """Initialise the generator and expand the docstring template.
+
+        `seed` is forwarded unchanged to the base-class initialiser. The
+        docstring is then formatted against `matrix_t_docdict_params` and
+        stored on the instance.
+        """
+        super().__init__(seed)
+        expanded_doc = scipy._lib.doccer.docformat(
+            self.__doc__, matrix_t_docdict_params
+        )
+        self.__doc__ = expanded_doc
+
+    def __call__(self, mean=None, row_spread=1, col_spread=1, df=None, seed=None):
+        """Create a frozen matrix t distribution.
+
+        All five arguments are forwarded positionally, in signature order,
+        to `matrix_t_frozen`.
+
+        See `matrix_t_frozen` for more information.
+        """
+        # NOTE(review): `df` defaults to None here, while the shared
+        # parameter text documents a default of ``1`` — confirm that the
+        # frozen class / parameter processing maps None to that default.
+        return matrix_t_frozen(mean, row_spread, col_spread, df, seed)
+
+    def _process_parameters(self, mean, row_spread, col_spread, df):
+        """
+        Infer dimensionality from mean or covariance matrices.
+        Handle defaults. Ensure conformality.
+
+        Parameters
+        ----------
+        mean : ndarray, shape (m,n)
+            Mean of the distribution
+        row_spread : ndarray, shape (m,m)
+            Row-wise spread matrix
+        col_spread : ndarray, shape (n,n)
+            Column-wise spread matrix
+        df : float
+            Degrees of freedom
+        """
+
+        # Process mean
+        if mean is not None:
+            mean = np.asarray(mean, dtype=float)
+            meanshape = mean.shape
+            if 0 in meanshape:
+                raise ValueError("Array `mean` has invalid shape.")
+            if len(meanshape) != 2:
+                raise ValueError("Array `mean` must be 2D.")
+
+        # Process row-wise spread
+        row_spread = np.asarray(row_spread, dtype=float)
+        if row_spread.ndim == 0:
+            if mean is not None:
+                row_spread = row_spread * np.identity(meanshape[0])
+            else:
+                row_spread = row_spread * np.identity(1)
+        elif row_spread.ndim == 1:
+            row_spread = np.diag(row_spread)
+        rowshape = row_spread.shape
+        if 0 in rowshape:
+            raise ValueError("Array `row_spread` has invalid shape.")
+        if len(rowshape) != 2:
+            raise ValueError("Array `row_spread` must be a scalar or a 2D array.")
+        if rowshape[0] != rowshape[1]:
+            raise ValueError("Array `row_spread` must be square.")
+        numrows = rowshape[0]
+
+        # Process column-wise spread
+        col_spread = np.asarray(col_spread, dtype=float)
+        if col_spread.ndim == 0:
+            if mean is not None:
+                col_spread = col_spread * np.identity(meanshape[1])
+            else:
+                col_spread = col_spread * np.identity(1)
+        elif col_spread.ndim == 1:
+            col_spread = np.diag(col_spread)
+        colshape = col_spread.shape
+        if 0 in colshape:
+            raise ValueError("Array `col_spread` has invalid shape.")
+        if len(colshape) != 2:
+            raise ValueError("Array `col_spread` must be a scalar or a 2D array.")
+        if colshape[0] != colshape[1]:
+            raise ValueError("Array `col_spread` must be square.")
+        numcols = colshape[0]
+
+        # Ensure mean and spreads are conformal
+        if mean is not None:
+            if meanshape[0] != numrows:
+                raise ValueError(
+                    "Arrays `mean` and `row_spread` must have the same number of rows."
+                )
+            if meanshape[1] != numcols:
+                raise ValueError(
+                    "Arrays `mean` and `col_spread` must have the same number "
+                    "of columns."
+                )
+        else:
+            mean = np.zeros((numrows, numcols))
+
+        dims = (numrows, numcols)
+
+        if df is None:
+            df = 1  # default to matrix variate Cauchy
+        elif not np.isscalar(df):
+            raise ValueError("Degrees of freedom must be a scalar.")
+        elif df <= 0:
+            raise ValueError("Degrees of freedom must be positive.")
+
+        return dims, mean, row_spread, col_spread, df
+
+    def _process_quantiles(self, X, dims):
+        """
+        Adjust quantiles array so that the last two axes label the components
+        of each data point.
+        """
+        X = np.asarray(X, dtype=float)
+        if X.ndim == 2:
+            X = X[np.newaxis, :]
+        if X.shape[-2:] != dims:
+            raise ValueError(
+                "The shape of array `X` is not conformal with "
+                "the distribution parameters."
+            )
+        return X
+
+    def _logpdf(
+        self,
+        dims,
+        X,
+        mean,
+        df,
+        invrow_spread,
+        invcol_spread,
+        logdetrow_spread,
+        logdetcol_spread,
+    ):
+        """
+        Log of the matrix t probability density function.
+
+        Parameters
+        ----------
+        dims : tuple
+            Dimensions of the matrix variates
+        X : ndarray, shape (m,n) (equal to `dims`)
+            Points at which to evaluate the log of the probability density function
+        mean : ndarray, shape (m,n)
+            Mean of the distribution
+        df : float
+            Degrees-of-freedom parameter
+        invrow_spread : ndarray, shape (m,m)
+            Inverse of the row-wise spread matrix
+        invcol_spread : ndarray, shape (n,n)
+            Inverse of the column-wise spread matrix
+        logdetrow_spread : float
+            Log-determinant of the row-wise spread matrix
+        logdetcol_spread : float
+            Log-determinant of the column-wise spread matrix
+
+        Notes
+        -----
+        As this function does no argument checking, it should not be
+        called directly; use `logpdf` instead.
+        """
+        m, n = dims
+        X_shape = X.shape
+        if X.ndim > 3:
+            X = X.reshape(-1, m, n)
+        X_centered = X - mean[np.newaxis, ...]
+        det_arg = np.identity(n) + np.einsum(
+            "nij,njk,nkl,nlp->nip",
+            X_centered.transpose(0, 2, 1),
+            invrow_spread[np.newaxis, ...],
+            X_centered,
+            invcol_spread[np.newaxis, ...],
+            optimize=True,
+        )
+        _, logdet = np.linalg.slogdet(det_arg)
+        log_d_mn = -((df + m + n - 1) / 2) * logdet
+        log_f_mn = (
+            scipy.special.multigammaln((df + m + n - 1) / 2, n)
+            - scipy.special.multigammaln((df + n - 1) / 2, n)
+            - (m * n / 2) * _LOG_PI
+            - (n / 2) * logdetrow_spread
+            - (m / 2) * logdetcol_spread
+        )
+        retval = log_d_mn + log_f_mn
+        if len(X_shape) > 3:
+            retval = retval.reshape(X_shape[:-2])
+        return retval
+
+    def logpdf(self, X, mean=None, row_spread=1, col_spread=1, df=1):
+        """Log of the matrix t probability density function.
+
+        Parameters
+        ----------
+        X : array_like
+            Quantiles, with the last two axes of `X` denoting the components.
+        %(_matt_doc_default_callparams)s
+
+        Returns
+        -------
+        logpdf : ndarray
+            Log of the probability density function evaluated at `X`
+
+        Notes
+        -----
+        %(_matt_doc_callparams_note)s
+
+        Examples
+        --------
+
+        >>> import numpy as np
+        >>> from scipy.stats import matrix_t
+        >>> M = np.arange(6).reshape(3,2); M
+        array([[0, 1],
+               [2, 3],
+               [4, 5]])
+        >>> Sigma = np.diag([1,2,3]); Sigma
+        array([[1, 0, 0],
+               [0, 2, 0],
+               [0, 0, 3]])
+        >>> Omega = 0.3*np.identity(2); Omega
+        array([[ 0.3,  0. ],
+               [ 0. ,  0.3]])
+        >>> X = M + 0.1; X
+        array([[ 0.1,  1.1],
+               [ 2.1,  3.1],
+               [ 4.1,  5.1]])
+        >>> df = 3; df
+        3
+        >>> matrix_t.logpdf(X, mean=M, row_spread=Sigma, col_spread=Omega, df=df)
+        -0.002715656044664061
+        """
+        dims, mean, row_spread, col_spread, df = self._process_parameters(
+            mean, row_spread, col_spread, df
+        )
+        X = self._process_quantiles(X, dims)
+        rowpsd = _PSD(row_spread, allow_singular=False)
+        colpsd = _PSD(col_spread, allow_singular=False)
+        invrow_spread = rowpsd.pinv
+        invcol_spread = colpsd.pinv
+        logdetrow_spread = rowpsd.log_pdet
+        logdetcol_spread = colpsd.log_pdet
+        out = self._logpdf(
+            dims,
+            X,
+            mean,
+            df,
+            invrow_spread,
+            invcol_spread,
+            logdetrow_spread,
+            logdetcol_spread,
+        )
+        return _squeeze_output(out)
+
+    def pdf(self, X, mean=None, row_spread=1, col_spread=1, df=1):
+        """Matrix t probability density function.
+
+        Parameters
+        ----------
+        X : array_like
+            Quantiles, with the last two axes of `X` denoting the components.
+        %(_matt_doc_default_callparams)s
+
+        Returns
+        -------
+        pdf : ndarray
+            Probability density function evaluated at `X`
+
+        Notes
+        -----
+        %(_matt_doc_callparams_note)s
+
+        Examples
+        --------
+
+        >>> import numpy as np
+        >>> from scipy.stats import matrix_t
+        >>> M = np.arange(6).reshape(3,2); M
+        array([[0, 1],
+               [2, 3],
+               [4, 5]])
+        >>> Sigma = np.diag([1,2,3]); Sigma
+        array([[1, 0, 0],
+               [0, 2, 0],
+               [0, 0, 3]])
+        >>> Omega = 0.3*np.identity(2); Omega
+        array([[ 0.3,  0. ],
+               [ 0. ,  0.3]])
+        >>> X = M + 0.1; X
+        array([[ 0.1,  1.1],
+               [ 2.1,  3.1],
+               [ 4.1,  5.1]])
+        >>> df = 3; df
+        3
+        >>> matrix_t.pdf(X, mean=M, row_spread=Sigma, col_spread=Omega, df=df)
+        0.9972880280135796
+        """
+        return np.exp(self.logpdf(X, mean, row_spread, col_spread, df))
+
+    def rvs(
+        self, mean=None, row_spread=1, col_spread=1, df=1, size=1, random_state=None
+    ) -> np.ndarray:
+        """Draw random samples from a matrix t distribution.
+
+        Parameters
+        ----------
+        %(_matt_doc_default_callparams)s
+        size : integer, optional
+            Number of samples to draw (default 1).
+        %(_doc_random_state)s
+
+        Returns
+        -------
+        rvs : ndarray or scalar
+            Random variates of size (`size`, `dims`), where `dims` is the
+            dimension of the random matrices.
+
+        Notes
+        -----
+        %(_matt_doc_callparams_note)s
+
+        This method takes advantage of the two equivalent expressions of the
+        probability density function. It samples a Cholesky factor of a
+        random variate of the appropriate inverse Wishart distribution using
+        the smaller of the row/column dimensions.
+        """
+        size = int(size)
+        dims, mean, row_spread, col_spread, df = self._process_parameters(
+            mean, row_spread, col_spread, df
+        )
+        random_state = self._get_random_state(random_state)
+        # see scipy.stats.matrix_normal.rvs
+        std_norm = random_state.standard_normal(
+            size=(dims[1], size, dims[0])
+        ).transpose(1, 2, 0)
+        if dims[0] <= dims[1]:
+            rowchol = _cholesky_invwishart_rvs(df, row_spread, size, random_state)
+            colchol = scipy.linalg.cholesky(col_spread, lower=True)[np.newaxis, ...]
+        else:
+            rowchol = scipy.linalg.cholesky(row_spread, lower=True)[np.newaxis, ...]
+            colchol = _cholesky_invwishart_rvs(df, col_spread, size, random_state)
+        t_raw = np.einsum("ijp,ipq,ikq->ijk", rowchol, std_norm, colchol, optimize=True)
+        t_centered = mean[np.newaxis, ...] + t_raw
+        if size == 1:
+            t_centered = t_centered.reshape(mean.shape)
+        return t_centered
+
+
+matrix_t = matrix_t_gen()
+
+
+class matrix_t_frozen:
+    def __init__(self, mean, row_spread, col_spread, df, seed=None):
+        self._dist = matrix_t_gen(seed)
+        self.dims, self.mean, self.row_spread, self.col_spread, self.df = (
+            self._dist._process_parameters(mean, row_spread, col_spread, df)
+        )
+        self._random_state = np.random.RandomState(seed)
+        self.rowpsd = _PSD(self.row_spread, allow_singular=False)
+        self.colpsd = _PSD(self.col_spread, allow_singular=False)
+
+    def logpdf(self, X):
+        X = self._dist._process_quantiles(X, self.dims)
+        rowpsd = _PSD(self.row_spread, allow_singular=False)
+        colpsd = _PSD(self.col_spread, allow_singular=False)
+        invrow_spread = rowpsd.pinv
+        invcol_spread = colpsd.pinv
+        logdetrow_spread = rowpsd.log_pdet
+        logdetcol_spread = colpsd.log_pdet
+        out = self._dist._logpdf(
+            self.dims,
+            X,
+            self.mean,
+            self.df,
+            invrow_spread,
+            invcol_spread,
+            logdetrow_spread,
+            logdetcol_spread,
+        )
+        return _squeeze_output(out)
+
+    def pdf(self, X):
+        return np.exp(self.logpdf(X))
+
+    def rvs(self, size=1, random_state=None):
+        return self._dist.rvs(
+            self.mean, self.row_spread, self.col_spread, self.df, size, random_state
+        )
+
+
+# Set frozen generator docstrings from corresponding docstrings in
+# matrix_t_gen and fill in default strings in class docstrings
+for name in ["logpdf", "pdf", "rvs"]:
+    method = matrix_t_gen.__dict__[name]
+    method_frozen = matrix_t_frozen.__dict__[name]
+    method_frozen.__doc__ = scipy._lib.doccer.docformat(
+        method.__doc__, matrix_t_docdict_noparams
+    )
+    method.__doc__ = scipy._lib.doccer.docformat(
+        method.__doc__, matrix_t_docdict_params
+    )
+
+
+
+def _cholesky_invwishart_rvs(
+    df: float, scale: np.ndarray, size: int, random_state: np.random.Generator
+) -> np.ndarray:
+    r"""Samples the lower Cholesky factor of a matrix following an inverse
+    Wishart distribution.
+
+    Notes
+    -----
+    Intended to be used *as a step in the process* for computing random variates
+    of a matrix t distribution :math:`\mathcal{T}_{m,n}` by appealing to its
+    alternative form as a matrix mixture
+    .. math::
+        \mathcal{T}_{m,n}( \mathrm{df}, \mathrm{M}, \Sigma, \Omega )
+        = \mathcal{N}_{m,n}(
+            \mathrm{M},
+            \mathcal{W}^{-1}_m(\mathrm{df} + m - 1, \Sigma),
+            \Omega
+            )
+        = \mathcal{N}_{m,n}(
+            \mathrm{M},
+            \Sigma,
+            \mathcal{W}^{-1}_n(\mathrm{df} + n - 1, \Omega)
+            )
+    where :math:`\mathcal{N}_{m,n}` is a matrix normal distribution
+    and :math:`\mathcal{W}^{-1}_d` is an inverse Wishart distribution.
+    Accordingly, the degrees of freedom adjustment
+    :math:`\mathrm{df} \to \mathrm{df} + d - 1`
+    occurs in the scope of this function.
+    """
+    df_iw = df + scale.shape[0] - 1
+    iw_samples = scipy.stats.invwishart.rvs(df_iw, scale, size, random_state)
+    if size == 1:
+        iw_samples = iw_samples[np.newaxis, ...]
+    chol_samples = np.empty_like(iw_samples)
+    for idx in range(size):
+        chol_samples[idx] = scipy.linalg.cholesky(
+            iw_samples[idx], lower=True, check_finite=False
+        ).reshape(iw_samples.shape[1:])
+    return chol_samples.reshape((size, *scale.shape))
+
+
+_dirichlet_doc_default_callparams = """\
+alpha : array_like
+    The concentration parameters. The number of entries determines the
+    dimensionality of the distribution.
+"""
+_dirichlet_doc_frozen_callparams = ""
+
+_dirichlet_doc_frozen_callparams_note = """\
+See class definition for a detailed description of parameters."""
+
+dirichlet_docdict_params = {
+    '_dirichlet_doc_default_callparams': _dirichlet_doc_default_callparams,
+    '_doc_random_state': _doc_random_state
+}
+
+dirichlet_docdict_noparams = {
+    '_dirichlet_doc_default_callparams': _dirichlet_doc_frozen_callparams,
+    '_doc_random_state': _doc_random_state
+}
+
+
+def _dirichlet_check_parameters(alpha):
+    alpha = np.asarray(alpha)
+    if np.min(alpha) <= 0:
+        raise ValueError("All parameters must be greater than 0")
+    elif alpha.ndim != 1:
+        raise ValueError("Parameter vector 'a' must be one dimensional, "
+                         f"but a.shape = {alpha.shape}.")
+    return alpha
+
+
+def _dirichlet_check_input(alpha, x):
+    x = np.asarray(x)
+
+    if x.shape[0] + 1 != alpha.shape[0] and x.shape[0] != alpha.shape[0]:
+        raise ValueError("Vector 'x' must have either the same number "
+                         "of entries as, or one entry fewer than, "
+                         f"parameter vector 'a', but alpha.shape = {alpha.shape} "
+                         f"and x.shape = {x.shape}.")
+
+    if x.shape[0] != alpha.shape[0]:
+        xk = np.array([1 - np.sum(x, 0)])
+        if xk.ndim == 1:
+            x = np.append(x, xk)
+        elif xk.ndim == 2:
+            x = np.vstack((x, xk))
+        else:
+            raise ValueError("The input must be one dimensional or a two "
+                             "dimensional matrix containing the entries.")
+
+    if np.min(x) < 0:
+        raise ValueError("Each entry in 'x' must be greater than or equal "
+                         "to zero.")
+
+    if np.max(x) > 1:
+        raise ValueError("Each entry in 'x' must be smaller or equal one.")
+
+    # Check x_i > 0 or alpha_i > 1
+    xeq0 = (x == 0)
+    alphalt1 = (alpha < 1)
+    if x.shape != alpha.shape:
+        alphalt1 = np.repeat(alphalt1, x.shape[-1], axis=-1).reshape(x.shape)
+    chk = np.logical_and(xeq0, alphalt1)
+
+    if np.sum(chk):
+        raise ValueError("Each entry in 'x' must be greater than zero if its "
+                         "alpha is less than one.")
+
+    if (np.abs(np.sum(x, 0) - 1.0) > 10e-10).any():
+        raise ValueError("The input vector 'x' must lie within the normal "
+                         f"simplex. but np.sum(x, 0) = {np.sum(x, 0)}.")
+
+    return x
+
+
+def _lnB(alpha):
+    r"""Internal helper function to compute the log of the useful quotient.
+
+    .. math::
+
+        B(\alpha) = \frac{\prod_{i=1}^{K}\Gamma(\alpha_i)}
+                         {\Gamma\left(\sum_{i=1}^{K} \alpha_i \right)}
+
+    Parameters
+    ----------
+    %(_dirichlet_doc_default_callparams)s
+
+    Returns
+    -------
+    B : scalar
+        Helper quotient, internal use only
+
+    """
+    return np.sum(gammaln(alpha)) - gammaln(np.sum(alpha))
+
+
+class dirichlet_gen(multi_rv_generic):
+    r"""A Dirichlet random variable.
+
+    The ``alpha`` keyword specifies the concentration parameters of the
+    distribution.
+
+    .. versionadded:: 0.15.0
+
+    Methods
+    -------
+    pdf(x, alpha)
+        Probability density function.
+    logpdf(x, alpha)
+        Log of the probability density function.
+    rvs(alpha, size=1, random_state=None)
+        Draw random samples from a Dirichlet distribution.
+    mean(alpha)
+        The mean of the Dirichlet distribution
+    var(alpha)
+        The variance of the Dirichlet distribution
+    cov(alpha)
+        The covariance of the Dirichlet distribution
+    entropy(alpha)
+        Compute the differential entropy of the Dirichlet distribution.
+
+    Parameters
+    ----------
+    %(_dirichlet_doc_default_callparams)s
+    %(_doc_random_state)s
+
+    Notes
+    -----
+    Each :math:`\alpha` entry must be positive. The distribution has only
+    support on the simplex defined by
+
+    .. math::
+        \sum_{i=1}^{K} x_i = 1
+
+    where :math:`0 < x_i < 1`.
+
+    If the quantiles don't lie within the simplex, a ValueError is raised.
+
+    The probability density function for `dirichlet` is
+
+    .. math::
+
+        f(x) = \frac{1}{\mathrm{B}(\boldsymbol\alpha)} \prod_{i=1}^K x_i^{\alpha_i - 1}
+
+    where
+
+    .. math::
+
+        \mathrm{B}(\boldsymbol\alpha) = \frac{\prod_{i=1}^K \Gamma(\alpha_i)}
+                                     {\Gamma\bigl(\sum_{i=1}^K \alpha_i\bigr)}
+
+    and :math:`\boldsymbol\alpha=(\alpha_1,\ldots,\alpha_K)`, the
+    concentration parameters and :math:`K` is the dimension of the space
+    where :math:`x` takes values.
+
+    Note that the `dirichlet` interface is somewhat inconsistent.
+    The array returned by the rvs function is transposed
+    with respect to the format expected by the pdf and logpdf.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import dirichlet
+
+    Generate a dirichlet random variable
+
+    >>> quantiles = np.array([0.2, 0.2, 0.6])  # specify quantiles
+    >>> alpha = np.array([0.4, 5, 15])  # specify concentration parameters
+    >>> dirichlet.pdf(quantiles, alpha)
+    0.2843831684937255
+
+    The same PDF but following a log scale
+
+    >>> dirichlet.logpdf(quantiles, alpha)
+    -1.2574327653159187
+
+    Once we specify the dirichlet distribution
+    we can then calculate quantities of interest
+
+    >>> dirichlet.mean(alpha)  # get the mean of the distribution
+    array([0.01960784, 0.24509804, 0.73529412])
+    >>> dirichlet.var(alpha) # get variance
+    array([0.00089829, 0.00864603, 0.00909517])
+    >>> dirichlet.entropy(alpha)  # calculate the differential entropy
+    -4.3280162474082715
+
+    We can also return random samples from the distribution
+
+    >>> dirichlet.rvs(alpha, size=1, random_state=1)
+    array([[0.00766178, 0.24670518, 0.74563305]])
+    >>> dirichlet.rvs(alpha, size=2, random_state=2)
+    array([[0.01639427, 0.1292273 , 0.85437844],
+           [0.00156917, 0.19033695, 0.80809388]])
+
+    Alternatively, the object may be called (as a function) to fix
+    concentration parameters, returning a "frozen" Dirichlet
+    random variable:
+
+    >>> rv = dirichlet(alpha)
+    >>> # Frozen object with the same methods but holding the given
+    >>> # concentration parameters fixed.
+
+    """
+
+    def __init__(self, seed=None):
+        super().__init__(seed)
+        self.__doc__ = doccer.docformat(self.__doc__, dirichlet_docdict_params)
+
+    def __call__(self, alpha, seed=None):
+        return dirichlet_frozen(alpha, seed=seed)
+
+    def _logpdf(self, x, alpha):
+        """Log of the Dirichlet probability density function.
+
+        Parameters
+        ----------
+        x : ndarray
+            Points at which to evaluate the log of the probability
+            density function
+        %(_dirichlet_doc_default_callparams)s
+
+        Notes
+        -----
+        As this function does no argument checking, it should not be
+        called directly; use 'logpdf' instead.
+
+        """
+        lnB = _lnB(alpha)
+        return - lnB + np.sum((xlogy(alpha - 1, x.T)).T, 0)
+
+    def logpdf(self, x, alpha):
+        """Log of the Dirichlet probability density function.
+
+        Parameters
+        ----------
+        x : array_like
+            Quantiles, with the first axis of `x` denoting the components.
+        %(_dirichlet_doc_default_callparams)s
+
+        Returns
+        -------
+        logpdf : ndarray or scalar
+            Log of the probability density function evaluated at `x`.
+
+        """
+        alpha = _dirichlet_check_parameters(alpha)
+        x = _dirichlet_check_input(alpha, x)
+
+        out = self._logpdf(x, alpha)
+        return _squeeze_output(out)
+
+    def pdf(self, x, alpha):
+        """The Dirichlet probability density function.
+
+        Parameters
+        ----------
+        x : array_like
+            Quantiles, with the first axis of `x` denoting the components.
+        %(_dirichlet_doc_default_callparams)s
+
+        Returns
+        -------
+        pdf : ndarray or scalar
+            The probability density function evaluated at `x`.
+
+        """
+        alpha = _dirichlet_check_parameters(alpha)
+        x = _dirichlet_check_input(alpha, x)
+
+        out = np.exp(self._logpdf(x, alpha))
+        return _squeeze_output(out)
+
+    def mean(self, alpha):
+        """Mean of the Dirichlet distribution.
+
+        Parameters
+        ----------
+        %(_dirichlet_doc_default_callparams)s
+
+        Returns
+        -------
+        mu : ndarray or scalar
+            Mean of the Dirichlet distribution.
+
+        """
+        alpha = _dirichlet_check_parameters(alpha)
+
+        out = alpha / (np.sum(alpha))
+        return _squeeze_output(out)
+
+    def var(self, alpha):
+        """Variance of the Dirichlet distribution.
+
+        Parameters
+        ----------
+        %(_dirichlet_doc_default_callparams)s
+
+        Returns
+        -------
+        v : ndarray or scalar
+            Variance of the Dirichlet distribution.
+
+        """
+
+        alpha = _dirichlet_check_parameters(alpha)
+
+        alpha0 = np.sum(alpha)
+        out = (alpha * (alpha0 - alpha)) / ((alpha0 * alpha0) * (alpha0 + 1))
+        return _squeeze_output(out)
+
+    def cov(self, alpha):
+        """Covariance matrix of the Dirichlet distribution.
+
+        Parameters
+        ----------
+        %(_dirichlet_doc_default_callparams)s
+
+        Returns
+        -------
+        cov : ndarray
+            The covariance matrix of the distribution.
+        """
+
+        alpha = _dirichlet_check_parameters(alpha)
+        alpha0 = np.sum(alpha)
+        a = alpha / alpha0
+
+        cov = (np.diag(a) - np.outer(a, a)) / (alpha0 + 1)
+        return _squeeze_output(cov)
+
+    def entropy(self, alpha):
+        """
+        Differential entropy of the Dirichlet distribution.
+
+        Parameters
+        ----------
+        %(_dirichlet_doc_default_callparams)s
+
+        Returns
+        -------
+        h : scalar
+            Entropy of the Dirichlet distribution
+
+        """
+
+        alpha = _dirichlet_check_parameters(alpha)
+
+        alpha0 = np.sum(alpha)
+        lnB = _lnB(alpha)
+        K = alpha.shape[0]
+
+        out = lnB + (alpha0 - K) * scipy.special.psi(alpha0) - np.sum(
+            (alpha - 1) * scipy.special.psi(alpha))
+        return _squeeze_output(out)
+
+    def rvs(self, alpha, size=1, random_state=None):
+        """
+        Draw random samples from a Dirichlet distribution.
+
+        Parameters
+        ----------
+        %(_dirichlet_doc_default_callparams)s
+        size : int, optional
+            Number of samples to draw (default 1).
+        %(_doc_random_state)s
+
+        Returns
+        -------
+        rvs : ndarray or scalar
+            Random variates of size (`size`, `N`), where `N` is the
+            dimension of the random variable.
+
+        """
+        alpha = _dirichlet_check_parameters(alpha)
+        random_state = self._get_random_state(random_state)
+        return random_state.dirichlet(alpha, size=size)
+
+
+dirichlet = dirichlet_gen()
+
+
+class dirichlet_frozen(multi_rv_frozen):
+    __class_getitem__ = None
+
+    def __init__(self, alpha, seed=None):
+        self.alpha = _dirichlet_check_parameters(alpha)
+        self._dist = dirichlet_gen(seed)
+
+    def logpdf(self, x):
+        return self._dist.logpdf(x, self.alpha)
+
+    def pdf(self, x):
+        return self._dist.pdf(x, self.alpha)
+
+    def mean(self):
+        return self._dist.mean(self.alpha)
+
+    def var(self):
+        return self._dist.var(self.alpha)
+
+    def cov(self):
+        return self._dist.cov(self.alpha)
+
+    def entropy(self):
+        return self._dist.entropy(self.alpha)
+
+    def rvs(self, size=1, random_state=None):
+        return self._dist.rvs(self.alpha, size, random_state)
+
+
+# Set frozen generator docstrings from corresponding docstrings in
+# dirichlet_gen and fill in default strings in class docstrings
+for name in ['logpdf', 'pdf', 'rvs', 'mean', 'var', 'cov', 'entropy']:
+    method = dirichlet_gen.__dict__[name]
+    method_frozen = dirichlet_frozen.__dict__[name]
+    method_frozen.__doc__ = doccer.docformat(
+        method.__doc__, dirichlet_docdict_noparams)
+    method.__doc__ = doccer.docformat(method.__doc__, dirichlet_docdict_params)
+
+
+_wishart_doc_default_callparams = """\
+df : int
+    Degrees of freedom, must be greater than or equal to dimension of the
+    scale matrix
+scale : array_like
+    Symmetric positive definite scale matrix of the distribution
+"""
+
+_wishart_doc_callparams_note = ""
+
+_wishart_doc_frozen_callparams = ""
+
+_wishart_doc_frozen_callparams_note = """\
+See class definition for a detailed description of parameters."""
+
+wishart_docdict_params = {
+    '_doc_default_callparams': _wishart_doc_default_callparams,
+    '_doc_callparams_note': _wishart_doc_callparams_note,
+    '_doc_random_state': _doc_random_state
+}
+
+wishart_docdict_noparams = {
+    '_doc_default_callparams': _wishart_doc_frozen_callparams,
+    '_doc_callparams_note': _wishart_doc_frozen_callparams_note,
+    '_doc_random_state': _doc_random_state
+}
+
+
+class wishart_gen(multi_rv_generic):
+    r"""A Wishart random variable.
+
+    The `df` keyword specifies the degrees of freedom. The `scale` keyword
+    specifies the scale matrix, which must be symmetric and positive definite.
+    In this context, the scale matrix is often interpreted in terms of a
+    multivariate normal precision matrix (the inverse of the covariance
+    matrix). These arguments must satisfy the relationship
+    ``df > scale.ndim - 1``, but see notes on using the `rvs` method with
+    ``df < scale.ndim``.
+
+    Methods
+    -------
+    pdf(x, df, scale)
+        Probability density function.
+    logpdf(x, df, scale)
+        Log of the probability density function.
+    rvs(df, scale, size=1, random_state=None)
+        Draw random samples from a Wishart distribution.
+    entropy()
+        Compute the differential entropy of the Wishart distribution.
+
+    Parameters
+    ----------
+    %(_doc_default_callparams)s
+    %(_doc_random_state)s
+
+    Raises
+    ------
+    scipy.linalg.LinAlgError
+        If the scale matrix `scale` is not positive definite.
+
+    See Also
+    --------
+    invwishart, chi2
+
+    Notes
+    -----
+    %(_doc_callparams_note)s
+
+    The scale matrix `scale` must be a symmetric positive definite
+    matrix. Singular matrices, including the symmetric positive semi-definite
+    case, are not supported. Symmetry is not checked; only the lower triangular
+    portion is used.
+
+    The Wishart distribution is often denoted
+
+    .. math::
+
+        W_p(\nu, \Sigma)
+
+    where :math:`\nu` is the degrees of freedom and :math:`\Sigma` is the
+    :math:`p \times p` scale matrix.
+
+    The probability density function for `wishart` has support over positive
+    definite matrices :math:`S`; if :math:`S \sim W_p(\nu, \Sigma)`, then
+    its PDF is given by:
+
+    .. math::
+
+        f(S) = \frac{|S|^{\frac{\nu - p - 1}{2}}}{2^{ \frac{\nu p}{2} }
+               |\Sigma|^\frac{\nu}{2} \Gamma_p \left ( \frac{\nu}{2} \right )}
+               \exp\left( -tr(\Sigma^{-1} S) / 2 \right)
+
+    If :math:`S \sim W_p(\nu, \Sigma)` (Wishart) then
+    :math:`S^{-1} \sim W_p^{-1}(\nu, \Sigma^{-1})` (inverse Wishart).
+
+    If the scale matrix is 1-dimensional and equal to one, then the Wishart
+    distribution :math:`W_1(\nu, 1)` collapses to the :math:`\chi^2(\nu)`
+    distribution.
+
+    The algorithm [2]_ implemented by the `rvs` method may
+    produce numerically singular matrices with :math:`p - 1 < \nu < p`; the
+    user may wish to check for this condition and generate replacement samples
+    as necessary.
+
+
+    .. versionadded:: 0.16.0
+
+    References
+    ----------
+    .. [1] M.L. Eaton, "Multivariate Statistics: A Vector Space Approach",
+           Wiley, 1983.
+    .. [2] W.B. Smith and R.R. Hocking, "Algorithm AS 53: Wishart Variate
+           Generator", Applied Statistics, vol. 21, pp. 341-345, 1972.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.stats import wishart, chi2
+    >>> x = np.linspace(1e-5, 8, 100)
+    >>> w = wishart.pdf(x, df=3, scale=1); w[:5]
+    array([ 0.00126156,  0.10892176,  0.14793434,  0.17400548,  0.1929669 ])
+    >>> c = chi2.pdf(x, 3); c[:5]
+    array([ 0.00126156,  0.10892176,  0.14793434,  0.17400548,  0.1929669 ])
+    >>> plt.plot(x, w)
+    >>> plt.show()
+
+    The input quantiles can be any shape of array, as long as the last
+    axis labels the components.
+
+    Alternatively, the object may be called (as a function) to fix the degrees
+    of freedom and scale parameters, returning a "frozen" Wishart random
+    variable:
+
+    >>> rv = wishart(df=1, scale=1)
+    >>> # Frozen object with the same methods but holding the given
+    >>> # degrees of freedom and scale fixed.
+
+    """
+
+    def __init__(self, seed=None):
+        super().__init__(seed)
+        self.__doc__ = doccer.docformat(self.__doc__, wishart_docdict_params)
+
+    def __call__(self, df=None, scale=None, seed=None):
+        """Create a frozen Wishart distribution.
+
+        See `wishart_frozen` for more information.
+        """
+        return wishart_frozen(df, scale, seed)
+
+    def _process_parameters(self, df, scale):
+        """Validate `df` and `scale` and bring them into canonical form.
+
+        Parameters
+        ----------
+        df : scalar or None
+            Degrees of freedom. ``None`` selects the dimension of the scale
+            matrix.
+        scale : array_like or None
+            Scale matrix. ``None`` selects ``1.0``. A 0-d input becomes a
+            1 x 1 matrix and a 1-d input becomes a diagonal matrix.
+
+        Returns
+        -------
+        dim : int
+            Number of rows (and columns) of the scale matrix.
+        df : scalar
+            Degrees of freedom.
+        scale : ndarray
+            Two-dimensional square float array.
+
+        Raises
+        ------
+        ValueError
+            If `scale` is two-dimensional but not square or has more than
+            two dimensions, if `df` is not a scalar, or if
+            ``df <= dim - 1``.
+        """
+        if scale is None:
+            scale = 1.0
+        scale = np.asarray(scale, dtype=float)
+
+        if scale.ndim == 0:
+            scale = scale[np.newaxis, np.newaxis]
+        elif scale.ndim == 1:
+            scale = np.diag(scale)
+        elif scale.ndim == 2 and not scale.shape[0] == scale.shape[1]:
+            # The value reported is the array's ``shape`` attribute, so the
+            # message names it as such.
+            raise ValueError("Array 'scale' must be square if it is two dimensional,"
+                             f" but scale.shape = {str(scale.shape)}.")
+        elif scale.ndim > 2:
+            raise ValueError("Array 'scale' must be at most two-dimensional, "
+                             f"but scale.ndim = {scale.ndim}")
+
+        dim = scale.shape[0]
+
+        if df is None:
+            df = dim
+        elif not np.isscalar(df):
+            raise ValueError("Degrees of freedom must be a scalar.")
+        elif df <= dim - 1:
+            raise ValueError("Degrees of freedom must be greater than the "
+                             "dimension of scale matrix minus 1.")
+
+        return dim, df, scale
+
+    def _process_quantiles(self, x, dim):
+        """
+        Coerce the quantiles to a float array of shape ``(dim, dim, k)`` so
+        that the last axis labels the components of each data point.
+
+        A 0-d input is expanded to a multiple of the identity, a 1-d input is
+        read as ``k`` scalars when ``dim == 1`` and as a diagonal otherwise,
+        and a 2-d input is treated as a single matrix. A ``ValueError`` is
+        raised for non-square input, for more than three dimensions, and for
+        a leading shape other than ``(dim, dim)``.
+        """
+        x = np.asarray(x, dtype=float)
+        ndim = x.ndim
+
+        if ndim == 0:
+            x = x * np.eye(dim)[:, :, np.newaxis]
+        elif ndim == 1:
+            if dim == 1:
+                x = x[np.newaxis, np.newaxis, :]
+            else:
+                x = np.diag(x)[:, :, np.newaxis]
+        elif ndim == 2:
+            if x.shape[0] != x.shape[1]:
+                raise ValueError(
+                    "Quantiles must be square if they are two dimensional,"
+                    f" but x.shape = {str(x.shape)}.")
+            x = x[:, :, np.newaxis]
+        elif ndim == 3:
+            if x.shape[0] != x.shape[1]:
+                raise ValueError(
+                    "Quantiles must be square in the first two dimensions "
+                    f"if they are three dimensional, but x.shape = {str(x.shape)}.")
+        else:
+            raise ValueError("Quantiles must be at most two-dimensional with an "
+                             "additional dimension for multiple components, "
+                             f"but x.ndim = {x.ndim}")
+
+        # At this point x is three-dimensional; its leading two axes must
+        # agree with the scale matrix.
+        leading = x.shape[0:2]
+        if leading != (dim, dim):
+            raise ValueError('Quantiles have incompatible dimensions: should'
+                             f' be {(dim, dim)}, got {x.shape[0:2]}.')
+
+        return x
+
+    def _process_size(self, size):
+        """Normalise a `size` argument into a sample count and a shape.
+
+        Parameters
+        ----------
+        size : integer or iterable of integers
+            Requested sample shape.
+
+        Returns
+        -------
+        n : integer
+            Product of the entries of `size`.
+        shape : tuple
+            `size` as a tuple (a scalar becomes a one-element tuple).
+
+        Raises
+        ------
+        ValueError
+            If `size` has more than one dimension.
+        """
+        size = np.asarray(size)
+
+        if size.ndim == 0:
+            size = size[np.newaxis]
+        elif size.ndim > 1:
+            # The message is labelled ``size.ndim``, so report the number of
+            # dimensions rather than the array contents.
+            raise ValueError('Size must be an integer or tuple of integers;'
+                 ' thus must have dimension <= 1.'
+                 f' Got size.ndim = {size.ndim}')
+        n = size.prod()
+        shape = tuple(size)
+
+        return n, shape
+
+    def _logpdf(self, x, dim, df, scale, log_det_scale, C):
+        """Evaluate the Wishart log-density at a stack of matrices.
+
+        Parameters
+        ----------
+        x : ndarray
+            Points at which to evaluate the log of the probability
+            density function; matrices are stacked along the last axis.
+        dim : int
+            Dimension of the scale matrix
+        df : int
+            Degrees of freedom
+        scale : ndarray
+            Scale matrix
+        log_det_scale : float
+            Logarithm of the determinant of the scale matrix
+        C : ndarray
+            Cholesky factorization of the scale matrix, lower triangular.
+
+        Notes
+        -----
+        As this function does no argument checking, it should not be
+        called directly; use 'logpdf' instead.
+
+        """
+        n_points = x.shape[-1]
+        log_det_x = np.empty(n_points)
+        tr_scale_inv_x = np.empty(n_points)
+
+        for i in range(n_points):
+            x_i = x[:, :, i]
+            # log(det(x_i)), obtained from the Cholesky factor of x_i
+            log_det_x[i] = self._cholesky_logdet(x_i)[1]
+            # tr(scale^{-1} x_i), solved with the Cholesky factor of scale
+            scale_inv_x_i = scipy.linalg.cho_solve((C, True), x_i)
+            tr_scale_inv_x[i] = scale_inv_x_i.trace()
+
+        # Log PDF: data-dependent kernel minus the normalising constant
+        kernel = 0.5 * (df - dim - 1) * log_det_x - 0.5 * tr_scale_inv_x
+        log_norm = (0.5 * df * dim * _LOG_2 + 0.5 * df * log_det_scale +
+                    multigammaln(0.5*df, dim))
+
+        return kernel - log_norm
+
+    def logpdf(self, x, df, scale):
+        """Log of the Wishart probability density function.
+
+        Parameters
+        ----------
+        x : array_like
+            Quantiles, with the last axis of `x` denoting the components.
+            Each quantile must be a symmetric positive definite matrix.
+        %(_doc_default_callparams)s
+
+        Returns
+        -------
+        logpdf : ndarray
+            Log of the probability density function evaluated at `x`
+
+        Notes
+        -----
+        %(_doc_callparams_note)s
+
+        """
+        dim, df, scale = self._process_parameters(df, scale)
+        quantiles = self._process_quantiles(x, dim)
+
+        # One Cholesky factorisation of scale yields both the factor and
+        # log(det(scale)).
+        chol_scale, log_det_scale = self._cholesky_logdet(scale)
+
+        log_density = self._logpdf(quantiles, dim, df, scale,
+                                   log_det_scale, chol_scale)
+        return _squeeze_output(log_density)
+
+    def pdf(self, x, df, scale):
+        """Wishart probability density function.
+
+        Parameters
+        ----------
+        x : array_like
+            Quantiles, with the last axis of `x` denoting the components.
+            Each quantile must be a symmetric positive definite matrix.
+        %(_doc_default_callparams)s
+
+        Returns
+        -------
+        pdf : ndarray
+            Probability density function evaluated at `x`
+
+        Notes
+        -----
+        %(_doc_callparams_note)s
+
+        """
+        # The density is evaluated in log space and exponentiated at the end.
+        log_density = self.logpdf(x, df, scale)
+        return np.exp(log_density)
+
+    def _mean(self, dim, df, scale):
+        """Mean of the Wishart distribution, ``df * scale``.
+
+        Parameters
+        ----------
+        dim : int
+            Dimension of the scale matrix
+        %(_doc_default_callparams)s
+
+        Notes
+        -----
+        As this function does no argument checking, it should not be
+        called directly; use 'mean' instead.
+
+        """
+        mean_matrix = df * scale
+        return mean_matrix
+
+    def mean(self, df, scale):
+        """Mean of the Wishart distribution.
+
+        Parameters
+        ----------
+        %(_doc_default_callparams)s
+
+        Returns
+        -------
+        mean : float
+            The mean of the distribution
+        """
+        dim, df, scale = self._process_parameters(df, scale)
+        return _squeeze_output(self._mean(dim, df, scale))
+
+    def _mode(self, dim, df, scale):
+        """Mode of the Wishart distribution, or None when it is undefined.
+
+        Parameters
+        ----------
+        dim : int
+            Dimension of the scale matrix
+        %(_doc_default_callparams)s
+
+        Notes
+        -----
+        As this function does no argument checking, it should not be
+        called directly; use 'mode' instead.
+
+        """
+        # Written as a negated ">=" so that any df failing the comparison
+        # takes the None branch.
+        if not df >= dim + 1:
+            return None
+        return (df-dim-1) * scale
+
+    def mode(self, df, scale):
+        """Mode of the Wishart distribution
+
+        Only valid if the degrees of freedom are greater than the dimension of
+        the scale matrix.
+
+        Parameters
+        ----------
+        %(_doc_default_callparams)s
+
+        Returns
+        -------
+        mode : float or None
+            The Mode of the distribution
+        """
+        dim, df, scale = self._process_parameters(df, scale)
+        result = self._mode(dim, df, scale)
+        if result is None:
+            # No mode exists for these parameters.
+            return result
+        return _squeeze_output(result)
+
+    def _var(self, dim, df, scale):
+        """Element-wise variance of the Wishart distribution.
+
+        Entry ``(i, j)`` of the result is
+        ``df * (scale[i, j]**2 + scale[i, i] * scale[j, j])``.
+
+        Parameters
+        ----------
+        dim : int
+            Dimension of the scale matrix
+        %(_doc_default_callparams)s
+
+        Notes
+        -----
+        As this function does no argument checking, it should not be
+        called directly; use 'var' instead.
+
+        """
+        diagonal = scale.diagonal()  # 1 x dim array
+        # Start from a fresh array so the caller's scale matrix is not
+        # modified by the in-place updates below.
+        variance = np.square(scale)
+        variance += np.outer(diagonal, diagonal)
+        variance *= df
+        return variance
+
+    def var(self, df, scale):
+        """Variance of the Wishart distribution.
+
+        Parameters
+        ----------
+        %(_doc_default_callparams)s
+
+        Returns
+        -------
+        var : float
+            The variance of the distribution
+        """
+        dim, df, scale = self._process_parameters(df, scale)
+        return _squeeze_output(self._var(dim, df, scale))
+
+    def _standard_rvs(self, n, shape, dim, df, random_state):
+        """
+        Build lower-triangular matrices with normal draws below the diagonal
+        and square roots of chi-square draws on the diagonal.
+
+        Parameters
+        ----------
+        n : integer
+            Number of variates to generate
+        shape : iterable
+            Shape of the variates to generate
+        dim : int
+            Dimension of the scale matrix
+        df : int
+            Degrees of freedom
+        random_state : object
+            Source of randomness; its ``normal`` and ``chisquare`` methods
+            are called, in that order.
+
+        Returns
+        -------
+        A : ndarray
+            Array of shape ``shape + (dim, dim)``; entries above the diagonal
+            are zero.
+
+        Notes
+        -----
+        As this function does no argument checking, it should not be
+        called directly; use 'rvs' instead.
+
+        """
+        # Random normal variates for off-diagonal elements
+        n_tril = dim * (dim-1) // 2
+        flat_normals = random_state.normal(size=n*n_tril)
+        covariances = flat_normals.reshape(shape+(n_tril,))
+
+        # Random chi-square variates for diagonal elements; the i-th diagonal
+        # entry uses df - i degrees of freedom.
+        chi_roots = [random_state.chisquare(df-(i+1)+1, size=n)**0.5
+                     for i in range(dim)]
+        stacked = np.r_[chi_roots].reshape((dim,) + shape[::-1])
+        variances = stacked.T
+
+        # Create the A matri(ces) - lower triangular
+        A = np.zeros(shape + (dim, dim))
+        leading = (slice(None, None, None),) * len(shape)
+
+        # Input the covariances
+        A[leading + np.tril_indices(dim, k=-1)] = covariances
+
+        # Input the variances
+        A[leading + np.diag_indices(dim)] = variances
+
+        return A
+
+    def _rvs(self, n, shape, dim, df, C, random_state):
+        """Draw random samples from a Wishart distribution.
+
+        Parameters
+        ----------
+        n : integer
+            Number of variates to generate
+        shape : iterable
+            Shape of the variates to generate
+        dim : int
+            Dimension of the scale matrix
+        df : int
+            Degrees of freedom
+        C : ndarray
+            Cholesky factorization of the scale matrix, lower triangular.
+        %(_doc_random_state)s
+
+        Notes
+        -----
+        As this function does no argument checking, it should not be
+        called directly; use 'rvs' instead.
+
+        """
+        random_state = self._get_random_state(random_state)
+        # Each slice of `samples` starts out as a lower triangular Cholesky
+        # factor A of a matrix B = A A' with B ~ W(df, I).
+        samples = self._standard_rvs(n, shape, dim, df, random_state)
+
+        # Replace every factor A by C A A' C', which is ~ W(df, scale).
+        # Note: with B = A A' this is C B C', C being the lower triangular
+        #       Cholesky factor of the scale matrix. That appears to conflict
+        #       with [1]_, which suggests D' B D with D lower triangular.
+        #       However, the intended construction is the Bartlett (1933)
+        #       representation L A A' L' with L lower triangular, so reading
+        #       D' as upper triangular is either a typo in or a misreading
+        #       of [1]_.
+        for index in np.ndindex(shape):
+            scaled_factor = C.dot(samples[index])
+            samples[index] = scaled_factor.dot(scaled_factor.T)
+
+        return samples
+
+    def rvs(self, df, scale, size=1, random_state=None):
+        """Draw random samples from a Wishart distribution.
+
+        Parameters
+        ----------
+        %(_doc_default_callparams)s
+        size : integer or iterable of integers, optional
+            Number of samples to draw (default 1).
+        %(_doc_random_state)s
+
+        Returns
+        -------
+        rvs : ndarray
+            Random variates of shape (`size`) + (``dim``, ``dim``), where
+            ``dim`` is the dimension of the scale matrix.
+
+        Notes
+        -----
+        %(_doc_callparams_note)s
+
+        """
+        n, shape = self._process_size(size)
+        dim, df, scale = self._process_parameters(df, scale)
+
+        # Lower triangular Cholesky factor of the scale matrix
+        chol_scale = scipy.linalg.cholesky(scale, lower=True)
+
+        draws = self._rvs(n, shape, dim, df, chol_scale, random_state)
+        return _squeeze_output(draws)
+
+    def _entropy(self, dim, df, log_det_scale):
+        """Compute the differential entropy of the Wishart.
+
+        Parameters
+        ----------
+        dim : int
+            Dimension of the scale matrix
+        df : int
+            Degrees of freedom
+        log_det_scale : float
+            Logarithm of the determinant of the scale matrix
+
+        Notes
+        -----
+        As this function does no argument checking, it should not be
+        called directly; use 'entropy' instead.
+
+        """
+        # Digamma evaluated at (df + 1 - j) / 2 for j = 1, ..., dim
+        digammas = [psi(0.5*(df + 1 - (i+1))) for i in range(dim)]
+
+        scale_term = 0.5 * (dim+1) * log_det_scale
+        log2_term = 0.5 * dim * (dim+1) * _LOG_2
+        gamma_term = multigammaln(0.5*df, dim)
+        digamma_term = 0.5 * (df - dim - 1) * np.sum(digammas)
+        df_term = 0.5 * df * dim
+
+        # The terms are combined left to right in this fixed order.
+        return scale_term + log2_term + gamma_term - digamma_term + df_term
+
+    def entropy(self, df, scale):
+        """Compute the differential entropy of the Wishart.
+
+        Parameters
+        ----------
+        %(_doc_default_callparams)s
+
+        Returns
+        -------
+        h : scalar
+            Entropy of the Wishart distribution
+
+        Notes
+        -----
+        %(_doc_callparams_note)s
+
+        """
+        dim, df, scale = self._process_parameters(df, scale)
+        # Only log(det(scale)) is needed here; the Cholesky factor itself
+        # is discarded.
+        log_det_scale = self._cholesky_logdet(scale)[1]
+        return self._entropy(dim, df, log_det_scale)
+
+    def _cholesky_logdet(self, scale):
+        """Compute the Cholesky decomposition and ``log(det(scale))``.
+
+        Parameters
+        ----------
+        scale : ndarray
+            Scale matrix.
+
+        Returns
+        -------
+        c_decomp : ndarray
+            The lower triangular Cholesky decomposition of `scale`.
+        logdet : scalar
+            The log of the determinant of `scale`.
+
+        Notes
+        -----
+        This computation of ``logdet`` is equivalent to
+        ``np.linalg.slogdet(scale)``.  It is ~2x faster though.
+
+        """
+        lower_factor = scipy.linalg.cholesky(scale, lower=True)
+        # det(scale) is the squared product of the factor's diagonal, hence
+        # the factor of two in log space.
+        log_diagonal = np.log(lower_factor.diagonal())
+        return lower_factor, 2 * np.sum(log_diagonal)
+
+
+# Module-level generator instance; calling it with `df` and `scale` returns
+# a frozen Wishart distribution.
+wishart = wishart_gen()
+
+
+class wishart_frozen(multi_rv_frozen):
+    """Create a frozen Wishart distribution.
+
+    The parameters are validated once at construction time, and the Cholesky
+    factor and log-determinant of the scale matrix are cached for reuse by
+    every method.
+
+    Parameters
+    ----------
+    df : array_like
+        Degrees of freedom of the distribution
+    scale : array_like
+        Scale matrix of the distribution
+    seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
+        If `seed` is None (or `np.random`), the `numpy.random.RandomState`
+        singleton is used.
+        If `seed` is an int, a new ``RandomState`` instance is used,
+        seeded with `seed`.
+        If `seed` is already a ``Generator`` or ``RandomState`` instance then
+        that instance is used.
+
+    """
+    __class_getitem__ = None
+
+    def __init__(self, df, scale, seed=None):
+        dist = wishart_gen(seed)
+        self._dist = dist
+        self.dim, self.df, self.scale = dist._process_parameters(df, scale)
+        # Factor the scale matrix once; logpdf, rvs and entropy reuse it.
+        self.C, self.log_det_scale = dist._cholesky_logdet(self.scale)
+
+    def logpdf(self, x):
+        quantiles = self._dist._process_quantiles(x, self.dim)
+        log_density = self._dist._logpdf(quantiles, self.dim, self.df,
+                                         self.scale, self.log_det_scale,
+                                         self.C)
+        return _squeeze_output(log_density)
+
+    def pdf(self, x):
+        log_density = self.logpdf(x)
+        return np.exp(log_density)
+
+    def mean(self):
+        return _squeeze_output(
+            self._dist._mean(self.dim, self.df, self.scale))
+
+    def mode(self):
+        result = self._dist._mode(self.dim, self.df, self.scale)
+        if result is None:
+            # The underlying distribution reports no mode.
+            return result
+        return _squeeze_output(result)
+
+    def var(self):
+        return _squeeze_output(
+            self._dist._var(self.dim, self.df, self.scale))
+
+    def rvs(self, size=1, random_state=None):
+        n, shape = self._dist._process_size(size)
+        draws = self._dist._rvs(n, shape, self.dim, self.df,
+                                self.C, random_state)
+        return _squeeze_output(draws)
+
+    def entropy(self):
+        return self._dist._entropy(self.dim, self.df, self.log_det_scale)
+
+
+# Derive the frozen-method docstrings from the generator's docstring
+# templates, then expand the templates on the generator methods themselves.
+for name in ('logpdf', 'pdf', 'mean', 'mode', 'var', 'rvs', 'entropy'):
+    method = vars(wishart_gen)[name]
+    method_frozen = vars(wishart_frozen)[name]
+    # The frozen docstring has to be built first, while method.__doc__ still
+    # holds the unexpanded template.
+    method_frozen.__doc__ = doccer.docformat(
+        method.__doc__, wishart_docdict_noparams)
+    method.__doc__ = doccer.docformat(method.__doc__, wishart_docdict_params)
+
+
+class invwishart_gen(wishart_gen):
+    r"""An inverse Wishart random variable.
+
+    The `df` keyword specifies the degrees of freedom. The `scale` keyword
+    specifies the scale matrix, which must be symmetric and positive definite.
+    In this context, the scale matrix is often interpreted in terms of a
+    multivariate normal covariance matrix.
+
+    Methods
+    -------
+    pdf(x, df, scale)
+        Probability density function.
+    logpdf(x, df, scale)
+        Log of the probability density function.
+    rvs(df, scale, size=1, random_state=None)
+        Draw random samples from an inverse Wishart distribution.
+    entropy(df, scale)
+        Differential entropy of the distribution.
+
+    Parameters
+    ----------
+    %(_doc_default_callparams)s
+    %(_doc_random_state)s
+
+    Raises
+    ------
+    scipy.linalg.LinAlgError
+        If the scale matrix `scale` is not positive definite.
+
+    See Also
+    --------
+    wishart
+
+    Notes
+    -----
+    %(_doc_callparams_note)s
+
+    The scale matrix `scale` must be a symmetric positive definite
+    matrix. Singular matrices, including the symmetric positive semi-definite
+    case, are not supported. Symmetry is not checked; only the lower triangular
+    portion is used.
+
+    The inverse Wishart distribution is often denoted
+
+    .. math::
+
+        W_p^{-1}(\nu, \Psi)
+
+    where :math:`\nu` is the degrees of freedom and :math:`\Psi` is the
+    :math:`p \times p` scale matrix.
+
+    The probability density function for `invwishart` has support over positive
+    definite matrices :math:`S`; if :math:`S \sim W^{-1}_p(\nu, \Sigma)`,
+    then its PDF is given by:
+
+    .. math::
+
+        f(S) = \frac{|\Sigma|^\frac{\nu}{2}}{2^{ \frac{\nu p}{2} }
+               |S|^{\frac{\nu + p + 1}{2}} \Gamma_p \left(\frac{\nu}{2} \right)}
+               \exp\left( -tr(\Sigma S^{-1}) / 2 \right)
+
+    If :math:`S \sim W_p^{-1}(\nu, \Psi)` (inverse Wishart) then
+    :math:`S^{-1} \sim W_p(\nu, \Psi^{-1})` (Wishart).
+
+    If the scale matrix is 1-dimensional and equal to one, then the inverse
+    Wishart distribution :math:`W_1^{-1}(\nu, 1)` collapses to the
+    inverse Gamma distribution with parameters shape = :math:`\frac{\nu}{2}`
+    and scale = :math:`\frac{1}{2}`.
+
+    Instead of inverting a randomly generated Wishart matrix as described in [2],
+    here the algorithm in [4] is used to directly generate a random inverse-Wishart
+    matrix without inversion.
+
+    .. versionadded:: 0.16.0
+
+    References
+    ----------
+    .. [1] M.L. Eaton, "Multivariate Statistics: A Vector Space Approach",
+           Wiley, 1983.
+    .. [2] M.C. Jones, "Generating Inverse Wishart Matrices", Communications
+           in Statistics - Simulation and Computation, vol. 14.2, pp.511-514,
+           1985.
+    .. [3] Gupta, M. and Srivastava, S. "Parametric Bayesian Estimation of
+           Differential Entropy and Relative Entropy". Entropy 12, 818 - 843.
+           2010.
+    .. [4] S.D. Axen, "Efficiently generating inverse-Wishart matrices and
+           their Cholesky factors", :arXiv:`2310.15884v1`. 2023.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.stats import invwishart, invgamma
+    >>> x = np.linspace(0.01, 1, 100)
+    >>> iw = invwishart.pdf(x, df=6, scale=1)
+    >>> iw[:3]
+    array([  1.20546865e-15,   5.42497807e-06,   4.45813929e-03])
+    >>> ig = invgamma.pdf(x, 6/2., scale=1./2)
+    >>> ig[:3]
+    array([  1.20546865e-15,   5.42497807e-06,   4.45813929e-03])
+    >>> plt.plot(x, iw)
+    >>> plt.show()
+
+    The input quantiles can be any shape of array, as long as the last
+    axis labels the components.
+
+    Alternatively, the object may be called (as a function) to fix the degrees
+    of freedom and scale parameters, returning a "frozen" inverse Wishart
+    random variable:
+
+    >>> rv = invwishart(df=1, scale=1)
+    >>> # Frozen object with the same methods but holding the given
+    >>> # degrees of freedom and scale fixed.
+
+    """
+
+    def __init__(self, seed=None):
+        super().__init__(seed)
+        # Expand the placeholders of the class docstring with the shared
+        # Wishart parameter descriptions.
+        template = self.__doc__
+        self.__doc__ = doccer.docformat(template, wishart_docdict_params)
+
+    def __call__(self, df=None, scale=None, seed=None):
+        """Create a frozen inverse Wishart distribution.
+
+        See `invwishart_frozen` for more information.
+
+        """
+        frozen = invwishart_frozen(df=df, scale=scale, seed=seed)
+        return frozen
+
+    def _logpdf(self, x, dim, df, log_det_scale, C):
+        """Log of the inverse Wishart probability density function.
+
+        Parameters
+        ----------
+        x : ndarray
+            Points at which to evaluate the log of the probability
+            density function; matrices are stacked along the last axis.
+        dim : int
+            Dimension of the scale matrix
+        df : int
+            Degrees of freedom
+        log_det_scale : float
+            Logarithm of the determinant of the scale matrix
+        C : ndarray
+            Cholesky factorization of the scale matrix, lower triangular.
+
+        Notes
+        -----
+        As this function does no argument checking, it should not be
+        called directly; use 'logpdf' instead.
+
+        """
+        # Retrieve log(det(x)) and tr(scale x^{-1}) for every matrix
+        n_points = x.shape[-1]
+        log_det_x = np.empty(n_points)
+        tr_scale_x_inv = np.empty(n_points)
+        trsm = get_blas_funcs('trsm', (x,))
+        if dim <= 1:
+            # Scalar case: both quantities are available in closed form.
+            log_det_x[:] = np.log(x[0, 0])
+            tr_scale_x_inv[:] = C[0, 0]**2 / x[0, 0]
+        else:
+            for i in range(n_points):
+                Cx, log_det_x[i] = self._cholesky_logdet(x[:, :, i])
+                # Triangular solve with the Cholesky factor of x[:, :, i];
+                # the squared Frobenius norm of the solution is the trace.
+                solved = trsm(1., Cx, C, side=0, lower=True)
+                tr_scale_x_inv[i] = np.linalg.norm(solved)**2
+
+        # Log PDF
+        numerator = 0.5 * df * log_det_scale - 0.5 * tr_scale_x_inv
+        denominator = (0.5 * df * dim * _LOG_2 +
+                       0.5 * (df + dim + 1) * log_det_x)
+
+        return numerator - denominator - multigammaln(0.5*df, dim)
+
+    def logpdf(self, x, df, scale):
+        """Log of the inverse Wishart probability density function.
+
+        Parameters
+        ----------
+        x : array_like
+            Quantiles, with the last axis of `x` denoting the components.
+            Each quantile must be a symmetric positive definite matrix.
+        %(_doc_default_callparams)s
+
+        Returns
+        -------
+        logpdf : ndarray
+            Log of the probability density function evaluated at `x`
+
+        Notes
+        -----
+        %(_doc_callparams_note)s
+
+        """
+        dim, df, scale = self._process_parameters(df, scale)
+        quantiles = self._process_quantiles(x, dim)
+        chol_scale, log_det_scale = self._cholesky_logdet(scale)
+        log_density = self._logpdf(quantiles, dim, df, log_det_scale,
+                                   chol_scale)
+        return _squeeze_output(log_density)
+
+    def pdf(self, x, df, scale):
+        """Inverse Wishart probability density function.
+
+        Parameters
+        ----------
+        x : array_like
+            Quantiles, with the last axis of `x` denoting the components.
+            Each quantile must be a symmetric positive definite matrix.
+        %(_doc_default_callparams)s
+
+        Returns
+        -------
+        pdf : ndarray
+            Probability density function evaluated at `x`
+
+        Notes
+        -----
+        %(_doc_callparams_note)s
+
+        """
+        log_density = self.logpdf(x, df, scale)
+        return np.exp(log_density)
+
+    def _mean(self, dim, df, scale):
+        """Mean of the inverse Wishart distribution.
+
+        Parameters
+        ----------
+        dim : int
+            Dimension of the scale matrix
+        %(_doc_default_callparams)s
+
+        Notes
+        -----
+        As this function does no argument checking, it should not be
+        called directly; use 'mean' instead.
+
+        """
+        # The mean is only returned for df > dim + 1.
+        return scale / (df - dim - 1) if df > dim + 1 else None
+
+    def mean(self, df, scale):
+        """Mean of the inverse Wishart distribution.
+
+        Only valid if the degrees of freedom are greater than the dimension of
+        the scale matrix plus one.
+
+        Parameters
+        ----------
+        %(_doc_default_callparams)s
+
+        Returns
+        -------
+        mean : float or None
+            The mean of the distribution
+
+        """
+        dim, df, scale = self._process_parameters(df, scale)
+        result = self._mean(dim, df, scale)
+        if result is None:
+            return result
+        return _squeeze_output(result)
+
+    def _mode(self, dim, df, scale):
+        """Mode of the inverse Wishart distribution.
+
+        Parameters
+        ----------
+        dim : int
+            Dimension of the scale matrix
+        %(_doc_default_callparams)s
+
+        Notes
+        -----
+        As this function does no argument checking, it should not be
+        called directly; use 'mode' instead.
+
+        """
+        denominator = df + dim + 1
+        return scale / denominator
+
+    def mode(self, df, scale):
+        """Mode of the inverse Wishart distribution.
+
+        Parameters
+        ----------
+        %(_doc_default_callparams)s
+
+        Returns
+        -------
+        mode : float
+            The Mode of the distribution
+
+        """
+        dim, df, scale = self._process_parameters(df, scale)
+        return _squeeze_output(self._mode(dim, df, scale))
+
+    def _var(self, dim, df, scale):
+        """Variance of the inverse Wishart distribution.
+
+        Parameters
+        ----------
+        dim : int
+            Dimension of the scale matrix
+        %(_doc_default_callparams)s
+
+        Notes
+        -----
+        As this function does no argument checking, it should not be
+        called directly; use 'var' instead.
+
+        """
+        # The variance is only returned for df > dim + 3.
+        if not df > dim + 3:
+            return None
+
+        excess = df - dim
+        diagonal = scale.diagonal()  # 1 x dim array
+        variance = (excess + 1) * scale**2
+        variance += (excess - 1) * np.outer(diagonal, diagonal)
+        variance /= excess * (excess - 1)**2 * (excess - 3)
+        return variance
+
+    def var(self, df, scale):
+        """Variance of the inverse Wishart distribution.
+
+        Only valid if the degrees of freedom are greater than the dimension of
+        the scale matrix plus three.
+
+        Parameters
+        ----------
+        %(_doc_default_callparams)s
+
+        Returns
+        -------
+        var : float or None
+            The variance of the distribution
+        """
+        dim, df, scale = self._process_parameters(df, scale)
+        result = self._var(dim, df, scale)
+        if result is None:
+            return result
+        return _squeeze_output(result)
+
+    def _inv_standard_rvs(self, n, shape, dim, df, random_state):
+        """
+        Parameters
+        ----------
+        n : integer
+            Number of variates to generate
+        shape : iterable
+            Shape of the variates to generate
+        dim : int
+            Dimension of the scale matrix
+        df : int
+            Degrees of freedom
+        random_state : object
+            Source of randomness; its ``normal`` and ``chisquare`` methods
+            are called, in that order.
+
+        Returns
+        -------
+        A : ndarray
+            Random variates of shape (`shape`) + (``dim``, ``dim``).
+            Each slice `A[..., :, :]` is lower-triangular, and its
+            inverse is the lower Cholesky factor of a draw from
+            `invwishart(df, np.eye(dim))`.
+
+        Notes
+        -----
+        As this function does no argument checking, it should not be
+        called directly; use 'rvs' instead.
+
+        """
+        A = np.zeros(shape + (dim, dim))
+
+        # Random normal variates for off-diagonal elements
+        n_tril = dim * (dim-1) // 2
+        lower_rows, lower_cols = np.tril_indices(dim, k=-1)
+        normals = random_state.normal(size=shape + (n_tril,))
+        A[..., lower_rows, lower_cols] = normals
+
+        # Random chi variates for diagonal elements; entry k of the diagonal
+        # uses (df - dim + 1) + k degrees of freedom.
+        diag = np.arange(dim)
+        chi_dfs = (df - dim + 1) + diag
+        chi_squares = random_state.chisquare(df=chi_dfs, size=shape + (dim,))
+        A[..., diag, diag] = chi_squares**0.5
+
+        return A
+
+    def _rvs(self, n, shape, dim, df, C, random_state):
+        """Draw random samples from an inverse Wishart distribution.
+
+        Parameters
+        ----------
+        n : integer
+            Number of variates to generate
+        shape : iterable
+            Shape of the variates to generate
+        dim : int
+            Dimension of the scale matrix
+        df : int
+            Degrees of freedom
+        C : ndarray
+            Cholesky factorization of the scale matrix, lower triangular.
+        %(_doc_random_state)s
+
+        Notes
+        -----
+        As this function does no argument checking, it should not be
+        called directly; use 'rvs' instead.
+
+        """
+        random_state = self._get_random_state(random_state)
+        # Get random draws A such that inv(A) ~ iW(df, I)
+        A = self._inv_standard_rvs(n, shape, dim, df, random_state)
+
+        # Calculate SA = (CA)'^{-1} (CA)^{-1} ~ iW(df, scale)
+        trsm, trmm = get_blas_funcs(('trsm', 'trmm'), (A,))
+
+        for index in np.ndindex(A.shape[:-2]):
+            if dim > 1:
+                # Get CA = C A^{-1} via triangular solver
+                CA = trsm(1., A[index], C, side=1, lower=True)
+                # get SA, overwriting the factor with the finished sample
+                A[index] = trmm(1., CA, CA, side=1, lower=True, trans_a=True)
+            else:
+                # Scalar case: no triangular solve is needed.
+                A[index][0, 0] = (C[0, 0] / A[index][0, 0])**2
+
+        return A
+
+    def rvs(self, df, scale, size=1, random_state=None):
+        """Draw random samples from an inverse Wishart distribution.
+
+        Parameters
+        ----------
+        %(_doc_default_callparams)s
+        size : integer or iterable of integers, optional
+            Number of samples to draw (default 1).
+        %(_doc_random_state)s
+
+        Returns
+        -------
+        rvs : ndarray
+            Random variates of shape (`size`) + (``dim``, ``dim``), where
+            ``dim`` is the dimension of the scale matrix.
+
+        Notes
+        -----
+        %(_doc_callparams_note)s
+
+        """
+        n, shape = self._process_size(size)
+        dim, df, scale = self._process_parameters(df, scale)
+
+        # Lower triangular Cholesky factor of the scale matrix
+        chol_scale = scipy.linalg.cholesky(scale, lower=True)
+
+        draws = self._rvs(n, shape, dim, df, chol_scale, random_state)
+        return _squeeze_output(draws)
+
+    def _entropy(self, dim, df, log_det_scale):
+        # reference: eq. (17) from ref. 3
+        # Digamma is evaluated at (df - dim + i) / 2 for i = 1, ..., dim.
+        eval_points = np.asarray(
+            [0.5 * (df - dim + i) for i in range(1, dim + 1)])
+        digamma_sum = psi(eval_points).sum()
+
+        gamma_term = multigammaln(0.5 * df, dim)
+        df_term = 0.5 * dim * df
+        scale_term = 0.5 * (dim + 1) * (log_det_scale - _LOG_2)
+        digamma_term = 0.5 * (df + dim + 1) * digamma_sum
+
+        # The terms are combined left to right in this fixed order.
+        return gamma_term + df_term + scale_term - digamma_term
+
+    def entropy(self, df, scale):
+        """Compute the differential entropy of the inverse Wishart.
+
+        `df` and `scale` are validated in the same way as for the other
+        public methods before the entropy is evaluated.
+        """
+        dim, df, scale = self._process_parameters(df, scale)
+        # Only log(det(scale)) is needed; the Cholesky factor is discarded.
+        log_det_scale = self._cholesky_logdet(scale)[1]
+        return self._entropy(dim, df, log_det_scale)
+
+
+# Module-level `invwishart_gen` instance, constructed with no arguments.
+invwishart = invwishart_gen()
+
+
+class invwishart_frozen(multi_rv_frozen):
+    __class_getitem__ = None
+
+    def __init__(self, df, scale, seed=None):
+        """Create a frozen inverse Wishart distribution.
+
+        Parameters
+        ----------
+        df : array_like
+            Degrees of freedom of the distribution
+        scale : array_like
+            Scale matrix of the distribution
+        seed : {None, int, `numpy.random.Generator`}, optional
+            If `seed` is None the `numpy.random.Generator` singleton is used.
+            If `seed` is an int, a new ``Generator`` instance is used,
+            seeded with `seed`.
+            If `seed` is already a ``Generator`` instance then that instance is
+            used.
+
+        """
+        self._dist = invwishart_gen(seed)
+        self.dim, self.df, self.scale = self._dist._process_parameters(
+            df, scale
+        )
+
+        # Get the determinant via Cholesky factorization
+        self.C = scipy.linalg.cholesky(self.scale, lower=True)
+        self.log_det_scale = 2 * np.sum(np.log(self.C.diagonal()))
+
+    def logpdf(self, x):
+        x = self._dist._process_quantiles(x, self.dim)
+        out = self._dist._logpdf(x, self.dim, self.df,
+                                 self.log_det_scale, self.C)
+        return _squeeze_output(out)
+
+    def pdf(self, x):
+        return np.exp(self.logpdf(x))
+
+    def mean(self):
+        out = self._dist._mean(self.dim, self.df, self.scale)
+        return _squeeze_output(out) if out is not None else out
+
+    def mode(self):
+        out = self._dist._mode(self.dim, self.df, self.scale)
+        return _squeeze_output(out)
+
+    def var(self):
+        out = self._dist._var(self.dim, self.df, self.scale)
+        return _squeeze_output(out) if out is not None else out
+
+    def rvs(self, size=1, random_state=None):
+        n, shape = self._dist._process_size(size)
+
+        out = self._dist._rvs(n, shape, self.dim, self.df,
+                              self.C, random_state)
+
+        return _squeeze_output(out)
+
+    def entropy(self):
+        return self._dist._entropy(self.dim, self.df, self.log_det_scale)
+
+
+# Set frozen generator docstrings from corresponding docstrings in
+# inverse Wishart and fill in default strings in class docstrings
+for name in ['logpdf', 'pdf', 'mean', 'mode', 'var', 'rvs']:
+    method = invwishart_gen.__dict__[name]
+    method_frozen = wishart_frozen.__dict__[name]
+    method_frozen.__doc__ = doccer.docformat(
+        method.__doc__, wishart_docdict_noparams)
+    method.__doc__ = doccer.docformat(method.__doc__, wishart_docdict_params)
+
+# Text substituted for %(_doc_default_callparams)s when docstrings are
+# formatted with `multinomial_docdict_params`.
+_multinomial_doc_default_callparams = """\
+n : int
+    Number of trials
+p : array_like
+    Probability of a trial falling into each category; should sum to 1
+"""
+
+# Text substituted for %(_doc_callparams_note)s when docstrings are
+# formatted with `multinomial_docdict_params`.
+_multinomial_doc_callparams_note = """\
+`n` should be a nonnegative integer. Each element of `p` should be in the
+interval :math:`[0,1]` and the elements should sum to 1. If they do not sum to
+1, the last element of the `p` array is not used and is replaced with the
+remaining probability left over from the earlier elements.
+"""
+
+# Frozen variants: the parameter list is empty and the note only points
+# back to the class definition.
+_multinomial_doc_frozen_callparams = ""
+
+_multinomial_doc_frozen_callparams_note = """\
+See class definition for a detailed description of parameters."""
+
+# Substitution table used for the `multinomial_gen` class and method
+# docstrings.
+multinomial_docdict_params = {
+    '_doc_default_callparams': _multinomial_doc_default_callparams,
+    '_doc_callparams_note': _multinomial_doc_callparams_note,
+    '_doc_random_state': _doc_random_state
+}
+
+# Substitution table used for the `multinomial_frozen` method docstrings.
+multinomial_docdict_noparams = {
+    '_doc_default_callparams': _multinomial_doc_frozen_callparams,
+    '_doc_callparams_note': _multinomial_doc_frozen_callparams_note,
+    '_doc_random_state': _doc_random_state
+}
+
+
+class multinomial_gen(multi_rv_generic):
+    r"""A multinomial random variable.
+
+    Methods
+    -------
+    pmf(x, n, p)
+        Probability mass function.
+    logpmf(x, n, p)
+        Log of the probability mass function.
+    rvs(n, p, size=1, random_state=None)
+        Draw random samples from a multinomial distribution.
+    entropy(n, p)
+        Compute the entropy of the multinomial distribution.
+    cov(n, p)
+        Compute the covariance matrix of the multinomial distribution.
+
+    Parameters
+    ----------
+    %(_doc_default_callparams)s
+    %(_doc_random_state)s
+
+    Notes
+    -----
+    %(_doc_callparams_note)s
+
+    The probability mass function for `multinomial` is
+
+    .. math::
+
+        f(x) = \frac{n!}{x_1! \cdots x_k!} p_1^{x_1} \cdots p_k^{x_k},
+
+    supported on :math:`x=(x_1, \ldots, x_k)` where each :math:`x_i` is a
+    nonnegative integer and their sum is :math:`n`.
+
+    .. versionadded:: 0.19.0
+
+    Examples
+    --------
+
+    >>> from scipy.stats import multinomial
+    >>> rv = multinomial(8, [0.3, 0.2, 0.5])
+    >>> rv.pmf([1, 3, 4])
+    0.042000000000000072
+
+    The multinomial distribution for :math:`k=2` is identical to the
+    corresponding binomial distribution (tiny numerical differences
+    notwithstanding):
+
+    >>> from scipy.stats import binom
+    >>> multinomial.pmf([3, 4], n=7, p=[0.4, 0.6])
+    0.29030399999999973
+    >>> binom.pmf(3, 7, 0.4)
+    0.29030400000000012
+
+    The functions ``pmf``, ``logpmf``, ``entropy``, and ``cov`` support
+    broadcasting, under the convention that the vector parameters (``x`` and
+    ``p``) are interpreted as if each row along the last axis is a single
+    object. For instance:
+
+    >>> multinomial.pmf([[3, 4], [3, 5]], n=[7, 8], p=[.3, .7])
+    array([0.2268945,  0.25412184])
+
+    Here, ``x.shape == (2, 2)``, ``n.shape == (2,)``, and ``p.shape == (2,)``,
+    but following the rules mentioned above they behave as if the rows
+    ``[3, 4]`` and ``[3, 5]`` in ``x`` and ``[.3, .7]`` in ``p`` were a single
+    object, and as if we had ``x.shape = (2,)``, ``n.shape = (2,)``, and
+    ``p.shape = ()``. To obtain the individual elements without broadcasting,
+    we would do this:
+
+    >>> multinomial.pmf([3, 4], n=7, p=[.3, .7])
+    0.2268945
+    >>> multinomial.pmf([3, 5], 8, p=[.3, .7])
+    0.25412184
+
+    This broadcasting also works for ``cov``, where the output objects are
+    square matrices of size ``p.shape[-1]``. For example:
+
+    >>> multinomial.cov([4, 5], [[.3, .7], [.4, .6]])
+    array([[[ 0.84, -0.84],
+            [-0.84,  0.84]],
+           [[ 1.2 , -1.2 ],
+            [-1.2 ,  1.2 ]]])
+
+    In this example, ``n.shape == (2,)`` and ``p.shape == (2, 2)``, and
+    following the rules above, these broadcast as if ``p.shape == (2,)``.
+    Thus the result should also be of shape ``(2,)``, but since each output is
+    a :math:`2 \times 2` matrix, the result in fact has shape ``(2, 2, 2)``,
+    where ``result[0]`` is equal to ``multinomial.cov(n=4, p=[.3, .7])`` and
+    ``result[1]`` is equal to ``multinomial.cov(n=5, p=[.4, .6])``.
+
+    Alternatively, the object may be called (as a function) to fix the `n` and
+    `p` parameters, returning a "frozen" multinomial random variable:
+
+    >>> rv = multinomial(n=7, p=[.3, .7])
+    >>> # Frozen object with the same methods but holding the given
+    >>> # degrees of freedom and scale fixed.
+
+    See also
+    --------
+    scipy.stats.binom : The binomial distribution.
+    numpy.random.Generator.multinomial : Sampling from the multinomial distribution.
+    scipy.stats.multivariate_hypergeom :
+        The multivariate hypergeometric distribution.
+    """
+
+    def __init__(self, seed=None):
+        super().__init__(seed)
+        self.__doc__ = \
+            doccer.docformat(self.__doc__, multinomial_docdict_params)
+
+    def __call__(self, n, p, seed=None):
+        """Create a frozen multinomial distribution.
+
+        See `multinomial_frozen` for more information.
+        """
+        return multinomial_frozen(n, p, seed)
+
+    def _process_parameters(self, n, p):
+        """Returns: n_, p_, npcond.
+
+        n_ and p_ are arrays of the correct shape; npcond is a boolean array
+        flagging values out of the domain.
+        """
+        eps = np.finfo(np.result_type(np.asarray(p), np.float32)).eps * 10
+        p = np.array(p, dtype=np.float64, copy=True)
+        p_adjusted = 1. - p[..., :-1].sum(axis=-1)
+        # only make adjustment when it's significant
+        i_adjusted = np.abs(1 - p.sum(axis=-1)) > eps
+        p[i_adjusted, -1] = p_adjusted[i_adjusted]
+
+        if np.any(i_adjusted):
+            message = ("Some rows of `p` do not sum to 1.0 within tolerance of "
+                       f"{eps=}. Currently, the last element of these rows is adjusted "
+                       "to compensate, but this condition will produce NaNs "
+                       "beginning in SciPy 1.18.0. Please ensure that rows of `p` sum "
+                       "to 1.0 to avoid futher disruption.")
+            warnings.warn(message, FutureWarning, stacklevel=3)
+
+        # true for bad p
+        pcond = np.any(p < 0, axis=-1)
+        pcond |= np.any(p > 1, axis=-1)
+
+        n = np.array(n, dtype=int, copy=True)
+
+        # true for bad n
+        ncond = n < 0
+
+        return n, p, ncond | pcond
+
+    def _process_quantiles(self, x, n, p):
+        """Returns: x_, xcond.
+
+        x_ is an int array; xcond is a boolean array flagging values out of the
+        domain.
+        """
+        xx = np.asarray(x, dtype=int)
+
+        if xx.ndim == 0:
+            raise ValueError("x must be an array.")
+
+        if xx.size != 0 and not xx.shape[-1] == p.shape[-1]:
+            raise ValueError(f"Size of each quantile should be size of p: "
+                             f"received {xx.shape[-1]}, but expected "
+                             f"{p.shape[-1]}.")
+
+        # true for x out of the domain
+        cond = np.any(xx != x, axis=-1)
+        cond |= np.any(xx < 0, axis=-1)
+        cond = cond | (np.sum(xx, axis=-1) != n)
+
+        return xx, cond
+
+    def _checkresult(self, result, cond, bad_value):
+        result = np.asarray(result)
+
+        if cond.ndim != 0:
+            result[cond] = bad_value
+        elif cond:
+            if result.ndim == 0:
+                return bad_value
+            result[...] = bad_value
+        return result
+
+    def _logpmf(self, x, n, p):
+        return gammaln(n+1) + np.sum(xlogy(x, p) - gammaln(x+1), axis=-1)
+
+    def logpmf(self, x, n, p):
+        """Log of the Multinomial probability mass function.
+
+        Parameters
+        ----------
+        x : array_like
+            Quantiles, with the last axis of `x` denoting the components.
+        %(_doc_default_callparams)s
+
+        Returns
+        -------
+        logpmf : ndarray or scalar
+            Log of the probability mass function evaluated at `x`
+
+        Notes
+        -----
+        %(_doc_callparams_note)s
+        """
+        n, p, npcond = self._process_parameters(n, p)
+        x, xcond = self._process_quantiles(x, n, p)
+
+        result = self._logpmf(x, n, p)
+
+        # replace values for which x was out of the domain; broadcast
+        # xcond to the right shape
+        xcond_ = xcond | np.zeros(npcond.shape, dtype=np.bool_)
+        result = self._checkresult(result, xcond_, -np.inf)
+
+        # replace values bad for n or p; broadcast npcond to the right shape
+        npcond_ = npcond | np.zeros(xcond.shape, dtype=np.bool_)
+        return self._checkresult(result, npcond_, np.nan)
+
+    def pmf(self, x, n, p):
+        """Multinomial probability mass function.
+
+        Parameters
+        ----------
+        x : array_like
+            Quantiles, with the last axis of `x` denoting the components.
+        %(_doc_default_callparams)s
+
+        Returns
+        -------
+        pmf : ndarray or scalar
+            Probability density function evaluated at `x`
+
+        Notes
+        -----
+        %(_doc_callparams_note)s
+        """
+        return np.exp(self.logpmf(x, n, p))
+
+    def mean(self, n, p):
+        """Mean of the Multinomial distribution.
+
+        Parameters
+        ----------
+        %(_doc_default_callparams)s
+
+        Returns
+        -------
+        mean : float
+            The mean of the distribution
+        """
+        n, p, npcond = self._process_parameters(n, p)
+        result = n[..., np.newaxis]*p
+        return self._checkresult(result, npcond, np.nan)
+
+    def cov(self, n, p):
+        """Covariance matrix of the multinomial distribution.
+
+        Parameters
+        ----------
+        %(_doc_default_callparams)s
+
+        Returns
+        -------
+        cov : ndarray
+            The covariance matrix of the distribution
+        """
+        n, p, npcond = self._process_parameters(n, p)
+
+        nn = n[..., np.newaxis, np.newaxis]
+        result = nn * np.einsum('...j,...k->...jk', -p, p)
+
+        # change the diagonal
+        for i in range(p.shape[-1]):
+            result[..., i, i] += n*p[..., i]
+
+        return self._checkresult(result, npcond, np.nan)
+
+    def entropy(self, n, p):
+        r"""Compute the entropy of the multinomial distribution.
+
+        The entropy is computed using this expression:
+
+        .. math::
+
+            f(x) = - \log n! - n\sum_{i=1}^k p_i \log p_i +
+            \sum_{i=1}^k \sum_{x=0}^n \binom n x p_i^x(1-p_i)^{n-x} \log x!
+
+        Parameters
+        ----------
+        %(_doc_default_callparams)s
+
+        Returns
+        -------
+        h : scalar
+            Entropy of the Multinomial distribution
+
+        Notes
+        -----
+        %(_doc_callparams_note)s
+        """
+        n, p, npcond = self._process_parameters(n, p)
+
+        x = np.r_[1:np.max(n)+1]
+
+        term1 = n*np.sum(entr(p), axis=-1)
+        term1 -= gammaln(n+1)
+
+        n = n[..., np.newaxis]
+        new_axes_needed = max(p.ndim, n.ndim) - x.ndim + 1
+        new_shape = x.shape + (1,)*new_axes_needed
+        x = x.reshape(new_shape)
+
+        term2 = np.sum(binom.pmf(x, n, p)*gammaln(x+1),
+                       axis=(-1, -1-new_axes_needed))
+
+        return self._checkresult(term1 + term2, npcond, np.nan)
+
+    def rvs(self, n, p, size=None, random_state=None):
+        """Draw random samples from a Multinomial distribution.
+
+        Parameters
+        ----------
+        %(_doc_default_callparams)s
+        size : integer or iterable of integers, optional
+            Number of samples to draw (default 1).
+        %(_doc_random_state)s
+
+        Returns
+        -------
+        rvs : ndarray or scalar
+            Random variates of shape (`size`, `len(p)`)
+
+        Notes
+        -----
+        %(_doc_callparams_note)s
+        """
+        n, p, npcond = self._process_parameters(n, p)
+        random_state = self._get_random_state(random_state)
+        return random_state.multinomial(n, p, size)
+
+
+# Module-level `multinomial_gen` instance, constructed with no arguments.
+multinomial = multinomial_gen()
+
+
+class multinomial_frozen(multi_rv_frozen):
+    r"""Create a frozen Multinomial distribution.
+
+    Parameters
+    ----------
+    n : int
+        number of trials
+    p: array_like
+        probability of a trial falling into each category; should sum to 1
+    seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
+        If `seed` is None (or `np.random`), the `numpy.random.RandomState`
+        singleton is used.
+        If `seed` is an int, a new ``RandomState`` instance is used,
+        seeded with `seed`.
+        If `seed` is already a ``Generator`` or ``RandomState`` instance then
+        that instance is used.
+    """
+    def __init__(self, n, p, seed=None):
+        self._dist = multinomial_gen(seed)
+        self.n, self.p, self.npcond = self._dist._process_parameters(n, p)
+
+        # monkey patch self._dist
+        def _process_parameters(n, p):
+            return self.n, self.p, self.npcond
+
+        self._dist._process_parameters = _process_parameters
+
+    def logpmf(self, x):
+        return self._dist.logpmf(x, self.n, self.p)
+
+    def pmf(self, x):
+        return self._dist.pmf(x, self.n, self.p)
+
+    def mean(self):
+        return self._dist.mean(self.n, self.p)
+
+    def cov(self):
+        return self._dist.cov(self.n, self.p)
+
+    def entropy(self):
+        return self._dist.entropy(self.n, self.p)
+
+    def rvs(self, size=1, random_state=None):
+        return self._dist.rvs(self.n, self.p, size, random_state)
+
+
+# Set frozen generator docstrings from corresponding docstrings in
+# multinomial and fill in default strings in class docstrings
+for name in ['logpmf', 'pmf', 'mean', 'cov', 'rvs']:
+    method = multinomial_gen.__dict__[name]
+    method_frozen = multinomial_frozen.__dict__[name]
+    method_frozen.__doc__ = doccer.docformat(
+        method.__doc__, multinomial_docdict_noparams)
+    method.__doc__ = doccer.docformat(method.__doc__,
+                                      multinomial_docdict_params)
+
+
+class special_ortho_group_gen(multi_rv_generic):
+    r"""A Special Orthogonal matrix (SO(N)) random variable.
+
+    Return a random rotation matrix, drawn from the Haar distribution
+    (the only uniform distribution on SO(N)) with a determinant of +1.
+
+    The `dim` keyword specifies the dimension N.
+
+    Methods
+    -------
+    rvs(dim=None, size=1, random_state=None)
+        Draw random samples from SO(N).
+
+    Parameters
+    ----------
+    dim : scalar
+        Dimension of matrices
+    seed : {None, int, np.random.RandomState, np.random.Generator}, optional
+        Used for drawing random variates.
+        If `seed` is `None`, the `~np.random.RandomState` singleton is used.
+        If `seed` is an int, a new ``RandomState`` instance is used, seeded
+        with seed.
+        If `seed` is already a ``RandomState`` or ``Generator`` instance,
+        then that object is used.
+        Default is `None`.
+
+    Notes
+    -----
+    The ``rvs`` method returns a random rotation matrix drawn from the Haar
+    distribution, the only uniform distribution on SO(N). The algorithm generates
+    a Haar-distributed orthogonal matrix in O(N) using the ``rvs`` method of
+    `ortho_group`, then adjusts the matrix to ensure that the determinant is +1.
+
+    For a random rotation in three dimensions, see
+    `scipy.spatial.transform.Rotation.random`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import special_ortho_group
+    >>> x = special_ortho_group.rvs(3)
+
+    >>> np.dot(x, x.T)
+    array([[  1.00000000e+00,   1.13231364e-17,  -2.86852790e-16],
+           [  1.13231364e-17,   1.00000000e+00,  -1.46845020e-16],
+           [ -2.86852790e-16,  -1.46845020e-16,   1.00000000e+00]])
+
+    >>> import scipy.linalg
+    >>> scipy.linalg.det(x)
+    1.0
+
+    This generates one random matrix from SO(3). It is orthogonal and
+    has a determinant of 1.
+
+    Alternatively, the object may be called (as a function) to fix the `dim`
+    parameter, returning a "frozen" special_ortho_group random variable:
+
+    >>> rv = special_ortho_group(5)
+    >>> # Frozen object with the same methods but holding the
+    >>> # dimension parameter fixed.
+
+    See Also
+    --------
+    ortho_group, scipy.spatial.transform.Rotation.random
+
+    """
+
+    def __init__(self, seed=None):
+        super().__init__(seed)
+        self.__doc__ = doccer.docformat(self.__doc__)
+
+    def __call__(self, dim=None, seed=None):
+        """Create a frozen SO(N) distribution.
+
+        See `special_ortho_group_frozen` for more information.
+        """
+        return special_ortho_group_frozen(dim, seed=seed)
+
+    def _process_parameters(self, dim):
+        """Dimension N must be specified; it cannot be inferred."""
+        if dim is None or not np.isscalar(dim) or dim < 0 or dim != int(dim):
+            raise ValueError("""Dimension of rotation must be specified,
+                                and must be a scalar nonnegative integer.""")
+
+        return dim
+
+    def rvs(self, dim, size=1, random_state=None):
+        """Draw random samples from SO(N).
+
+        Parameters
+        ----------
+        dim : integer
+            Dimension of rotation space (N).
+        size : integer, optional
+            Number of samples to draw (default 1).
+
+        Returns
+        -------
+        rvs : ndarray or scalar
+            Random size N-dimensional matrices, dimension (size, dim, dim)
+
+        """
+        random_state = self._get_random_state(random_state)
+
+        q = ortho_group.rvs(dim, size, random_state)
+        dets = np.linalg.det(q)
+        if dim:
+            q[..., 0, :] /= dets[..., np.newaxis]
+        return q
+
+# Module-level `special_ortho_group_gen` instance, constructed with no arguments.
+special_ortho_group = special_ortho_group_gen()
+
+
+class special_ortho_group_frozen(multi_rv_frozen):
+    __class_getitem__ = None
+
+    def __init__(self, dim=None, seed=None):
+        """Create a frozen SO(N) distribution.
+
+        Parameters
+        ----------
+        dim : scalar
+            Dimension of matrices
+        seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
+            If `seed` is None (or `np.random`), the `numpy.random.RandomState`
+            singleton is used.
+            If `seed` is an int, a new ``RandomState`` instance is used,
+            seeded with `seed`.
+            If `seed` is already a ``Generator`` or ``RandomState`` instance
+            then that instance is used.
+
+        Examples
+        --------
+        >>> from scipy.stats import special_ortho_group
+        >>> g = special_ortho_group(5)
+        >>> x = g.rvs()
+
+        """ # numpy/numpydoc#87  # noqa: E501
+        self._dist = special_ortho_group_gen(seed)
+        self.dim = self._dist._process_parameters(dim)
+
+    def rvs(self, size=1, random_state=None):
+        return self._dist.rvs(self.dim, size, random_state)
+
+
+class ortho_group_gen(multi_rv_generic):
+    r"""An Orthogonal matrix (O(N)) random variable.
+
+    Return a random orthogonal matrix, drawn from the O(N) Haar
+    distribution (the only uniform distribution on O(N)).
+
+    The `dim` keyword specifies the dimension N.
+
+    Methods
+    -------
+    rvs(dim=None, size=1, random_state=None)
+        Draw random samples from O(N).
+
+    Parameters
+    ----------
+    dim : scalar
+        Dimension of matrices
+    seed : {None, int, np.random.RandomState, np.random.Generator}, optional
+        Used for drawing random variates.
+        If `seed` is `None`, the `~np.random.RandomState` singleton is used.
+        If `seed` is an int, a new ``RandomState`` instance is used, seeded
+        with seed.
+        If `seed` is already a ``RandomState`` or ``Generator`` instance,
+        then that object is used.
+        Default is `None`.
+
+    Notes
+    -----
+    This class is closely related to `special_ortho_group`.
+
+    Some care is taken to avoid numerical error, as per the paper by Mezzadri.
+
+    References
+    ----------
+    .. [1] F. Mezzadri, "How to generate random matrices from the classical
+           compact groups", :arXiv:`math-ph/0609050v2`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import ortho_group
+    >>> x = ortho_group.rvs(3)
+
+    >>> np.dot(x, x.T)
+    array([[  1.00000000e+00,   1.13231364e-17,  -2.86852790e-16],
+           [  1.13231364e-17,   1.00000000e+00,  -1.46845020e-16],
+           [ -2.86852790e-16,  -1.46845020e-16,   1.00000000e+00]])
+
+    >>> import scipy.linalg
+    >>> np.fabs(scipy.linalg.det(x))
+    1.0
+
+    This generates one random matrix from O(3). It is orthogonal and
+    has a determinant of +1 or -1.
+
+    Alternatively, the object may be called (as a function) to fix the `dim`
+    parameter, returning a "frozen" ortho_group random variable:
+
+    >>> rv = ortho_group(5)
+    >>> # Frozen object with the same methods but holding the
+    >>> # dimension parameter fixed.
+
+    See Also
+    --------
+    special_ortho_group
+    """
+
+    def __init__(self, seed=None):
+        super().__init__(seed)
+        self.__doc__ = doccer.docformat(self.__doc__)
+
+    def __call__(self, dim=None, seed=None):
+        """Create a frozen O(N) distribution.
+
+        See `ortho_group_frozen` for more information.
+        """
+        return ortho_group_frozen(dim, seed=seed)
+
+    def _process_parameters(self, dim):
+        """Dimension N must be specified; it cannot be inferred."""
+        if dim is None or not np.isscalar(dim) or dim < 0 or dim != int(dim):
+            raise ValueError("Dimension of rotation must be specified,"
+                             "and must be a scalar nonnegative integer.")
+
+        return dim
+
+    def rvs(self, dim, size=1, random_state=None):
+        """Draw random samples from O(N).
+
+        Parameters
+        ----------
+        dim : integer
+            Dimension of rotation space (N).
+        size : integer, optional
+            Number of samples to draw (default 1).
+
+        Returns
+        -------
+        rvs : ndarray or scalar
+            Random size N-dimensional matrices, dimension (size, dim, dim)
+
+        """
+        random_state = self._get_random_state(random_state)
+
+        size = int(size)
+
+        dim = self._process_parameters(dim)
+
+        size = (size,) if size > 1 else ()
+        z = random_state.normal(size=size + (dim, dim))
+        q, r = np.linalg.qr(z)
+        # The last two dimensions are the rows and columns of R matrices.
+        # Extract the diagonals. Note that this eliminates a dimension.
+        d = r.diagonal(offset=0, axis1=-2, axis2=-1)
+        # Add back a dimension for proper broadcasting: we're dividing
+        # each row of each R matrix by the diagonal of the R matrix.
+        q *= (d/abs(d))[..., np.newaxis, :]  # to broadcast properly
+        return q
+
+
+# Module-level `ortho_group_gen` instance, constructed with no arguments.
+ortho_group = ortho_group_gen()
+
+
+class ortho_group_frozen(multi_rv_frozen):
+    __class_getitem__ = None
+
+    def __init__(self, dim=None, seed=None):
+        """Create a frozen O(N) distribution.
+
+        Parameters
+        ----------
+        dim : scalar
+            Dimension of matrices
+        seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
+            If `seed` is None (or `np.random`), the `numpy.random.RandomState`
+            singleton is used.
+            If `seed` is an int, a new ``RandomState`` instance is used,
+            seeded with `seed`.
+            If `seed` is already a ``Generator`` or ``RandomState`` instance
+            then that instance is used.
+
+        Examples
+        --------
+        >>> from scipy.stats import ortho_group
+        >>> g = ortho_group(5)
+        >>> x = g.rvs()
+
+        """ # numpy/numpydoc#87  # noqa: E501
+        self._dist = ortho_group_gen(seed)
+        self.dim = self._dist._process_parameters(dim)
+
+    def rvs(self, size=1, random_state=None):
+        return self._dist.rvs(self.dim, size, random_state)
+
+
+class random_correlation_gen(multi_rv_generic):
+    r"""A random correlation matrix.
+
+    Return a random correlation matrix, given a vector of eigenvalues.
+    The returned matrix is symmetric positive semidefinite with unit diagonal.
+
+    The `eigs` keyword specifies the eigenvalues of the correlation matrix,
+    and implies the dimension.
+
+    Methods
+    -------
+    rvs(eigs=None, random_state=None)
+        Draw random correlation matrices, all with eigenvalues eigs.
+
+    Parameters
+    ----------
+    eigs : 1d ndarray
+        Eigenvalues of correlation matrix. All eigenvalues need to be non-negative and
+        need to sum to the number of eigenvalues.
+    seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
+        If `seed` is None (or `np.random`), the `numpy.random.RandomState`
+        singleton is used.
+        If `seed` is an int, a new ``RandomState`` instance is used,
+        seeded with `seed`.
+        If `seed` is already a ``Generator`` or ``RandomState`` instance
+        then that instance is used.
+    tol : float, optional
+        Tolerance for input parameter checks
+    diag_tol : float, optional
+        Tolerance for deviation of the diagonal of the resulting
+        matrix. Default: 1e-7
+
+    Raises
+    ------
+    RuntimeError
+        Floating point error prevented generating a valid correlation
+        matrix.
+
+    Returns
+    -------
+    rvs : ndarray or scalar
+        Random size N-dimensional matrices, dimension (size, dim, dim),
+        each having eigenvalues eigs.
+
+    Notes
+    -----
+
+    Generates a random correlation matrix following a numerically stable
+    algorithm spelled out by Davies & Higham. This algorithm uses a single O(N)
+    similarity transformation to construct a symmetric positive semi-definite
+    matrix, and applies a series of Givens rotations to scale it to have ones
+    on the diagonal.
+
+    References
+    ----------
+
+    .. [1] Davies, Philip I; Higham, Nicholas J; "Numerically stable generation
+           of correlation matrices and their factors", BIT 2000, Vol. 40,
+           No. 4, pp. 640-651
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import random_correlation
+    >>> rng = np.random.default_rng()
+    >>> x = random_correlation.rvs((.5, .8, 1.2, 1.5), random_state=rng)
+    >>> x
+    array([[ 1.        , -0.02423399,  0.03130519,  0.4946965 ],
+           [-0.02423399,  1.        ,  0.20334736,  0.04039817],
+           [ 0.03130519,  0.20334736,  1.        ,  0.02694275],
+           [ 0.4946965 ,  0.04039817,  0.02694275,  1.        ]])
+    >>> import scipy.linalg
+    >>> e, v = scipy.linalg.eigh(x)
+    >>> e
+    array([ 0.5,  0.8,  1.2,  1.5])
+
+    """
+
+    def __init__(self, seed=None):
+        super().__init__(seed)
+        self.__doc__ = doccer.docformat(self.__doc__)
+
+    def __call__(self, eigs, seed=None, tol=1e-13, diag_tol=1e-7):
+        """Create a frozen random correlation matrix.
+
+        See `random_correlation_frozen` for more information.
+        """
+        return random_correlation_frozen(eigs, seed=seed, tol=tol,
+                                         diag_tol=diag_tol)
+
+    def _process_parameters(self, eigs, tol):
+        """Validate the requested eigenvalues.
+
+        Parameters
+        ----------
+        eigs : array_like
+            Candidate eigenvalues; converted to a 1-D float array.
+        tol : float
+            Tolerance used both for the "sum equals dimension" check and for
+            the non-negativity check.
+
+        Returns
+        -------
+        dim : int
+            Number of eigenvalues.
+        eigs : ndarray
+            The eigenvalues as a float array.
+
+        Raises
+        ------
+        ValueError
+            If `eigs` is not a vector of length greater than one, if its sum
+            is not within `tol` of its length (this includes NaN sums), or if
+            any entry is below ``-tol``.
+        """
+        eigs = np.asarray(eigs, dtype=float)
+        dim = eigs.size
+
+        # For a 1-D array ``shape[0]`` always equals ``size``, so checking
+        # the rank and the length is sufficient.
+        if eigs.ndim != 1 or dim <= 1:
+            raise ValueError("Array 'eigs' must be a vector of length "
+                             "greater than 1.")
+
+        # Written as ``not (... <= tol)`` on purpose: a NaN sum (from NaN or
+        # opposite-signed infinite eigenvalues) compares False against any
+        # tolerance, so a plain ``> tol`` test would silently accept it.
+        if not np.fabs(np.sum(eigs) - dim) <= tol:
+            raise ValueError("Sum of eigenvalues must equal dimensionality.")
+
+        if np.any(eigs < -tol):
+            raise ValueError("All eigenvalues must be non-negative.")
+
+        return dim, eigs
+
+    def _givens_to_1(self, aii, ajj, aij):
+        """Compute the (c, s) pair of a 2x2 rotation that puts a 1 on the diagonal.
+
+        The input describes the symmetric 2x2 matrix
+        M = [ aii aij ; aij ajj ].
+
+        The rotation has the form g = [ c s ; -s c ]; only c and s are
+        returned.
+
+        Forming b = g.T M g yields bii = 1 as long as tr(M) - det(M) >= 1
+        and no floating point trouble occurs. Otherwise some other valid
+        rotation is returned. When tr(M) == 2, bjj = 1 as well.
+
+        """
+        aii_off = aii - 1.
+        ajj_off = ajj - 1.
+
+        # ajj is already 1: swap aii and ajj instead of dividing by zero.
+        if ajj_off == 0:
+            return 0., 1.
+
+        # Clamp at zero so rounding cannot produce a negative radicand.
+        root = math.sqrt(max(aij**2 - aii_off*ajj_off, 0))
+
+        # Give the root the sign of aij so the sum below does not cancel [1].
+        t = (aij + math.copysign(root, aij)) / ajj_off
+        c = 1. / math.sqrt(1. + t*t)
+        # c == 0 means 1/sqrt(1 + t*t) underflowed; use s = 1 in that case.
+        s = 1.0 if c == 0 else c*t
+        return c, s
+
+    def _to_corr(self, m):
+        """
+        Given a psd matrix m, rotate to put one's on the diagonal, turning it
+        into a correlation matrix.  This also requires the trace equal the
+        dimensionality. Note: modifies input matrix
+
+        Parameters
+        ----------
+        m : ndarray
+            Square, C-contiguous ``float64`` matrix; it is overwritten.
+
+        Returns
+        -------
+        m : ndarray
+            The same array object that was passed in.
+
+        Raises
+        ------
+        ValueError
+            If `m` is not C-contiguous, not ``float64`` or not square.
+        """
+        # Check requirements for in-place Givens
+        if not (m.flags.c_contiguous and m.dtype == np.float64 and
+                m.shape[0] == m.shape[1]):
+            raise ValueError()
+
+        d = m.shape[0]
+        for i in range(d-1):
+            # Pick a partner index j > i whose diagonal entry lies on the
+            # other side of 1 from m[i, i]. If the inner loop finishes
+            # without a `break`, j is simply left at d-1.
+            if m[i, i] == 1:
+                continue
+            elif m[i, i] > 1:
+                for j in range(i+1, d):
+                    if m[j, j] < 1:
+                        break
+            else:
+                for j in range(i+1, d):
+                    if m[j, j] > 1:
+                        break
+
+            c, s = self._givens_to_1(m[i, i], m[j, j], m[i, j])
+
+            # Use BLAS to apply Givens rotations in-place. Equivalent to:
+            # g = np.eye(d)
+            # g[i, i] = g[j,j] = c
+            # g[j, i] = -s; g[i, j] = s
+            # m = np.dot(g.T, np.dot(m, g))
+            # NOTE(review): `drot` is defined outside this view; presumably
+            # the BLAS plane-rotation routine - confirm against the imports.
+            mv = m.ravel()
+            # First call: offsets i*d and j*d with unit stride address rows
+            # i and j of the flattened C-ordered matrix.
+            drot(mv, mv, c, -s, n=d,
+                 offx=i*d, incx=1, offy=j*d, incy=1,
+                 overwrite_x=True, overwrite_y=True)
+            # Second call: offsets i and j with stride d address columns
+            # i and j.
+            drot(mv, mv, c, -s, n=d,
+                 offx=i, incx=d, offy=j, incy=d,
+                 overwrite_x=True, overwrite_y=True)
+
+        return m
+
+    def rvs(self, eigs, random_state=None, tol=1e-13, diag_tol=1e-7):
+        """Draw a random correlation matrix with the given eigenvalues.
+
+        Parameters
+        ----------
+        eigs : 1d ndarray
+            Eigenvalues of correlation matrix
+        random_state : optional
+            Passed to ``self._get_random_state`` to obtain the generator
+            used for the random orthogonal matrix.
+        tol : float, optional
+            Tolerance for input parameter checks
+        diag_tol : float, optional
+            Tolerance for deviation of the diagonal of the resulting
+            matrix. Default: 1e-7
+
+        Raises
+        ------
+        RuntimeError
+            Floating point error prevented generating a valid correlation
+            matrix.
+
+        Returns
+        -------
+        rvs : ndarray
+            Square matrix with one row per eigenvalue, built as
+            Q diag(eigs) Q^T and then rotated to unit diagonal.
+
+        """
+        dim, eigs = self._process_parameters(eigs, tol=tol)
+        rng = self._get_random_state(random_state)
+
+        # Similarity transform of diag(eigs) by a random orthogonal matrix;
+        # this fixes the spectrum (and hence the trace) of the result.
+        q = ortho_group.rvs(dim, random_state=rng)
+        spectrum = np.diag(eigs)
+        m = np.dot(np.dot(q, spectrum), q.T)
+
+        # Givens rotations move the diagonal to ones.
+        m = self._to_corr(m)
+
+        worst_deviation = abs(m.diagonal() - 1).max()
+        if worst_deviation > diag_tol:
+            raise RuntimeError("Failed to generate a valid correlation matrix")
+
+        return m
+
+
+# Module-level instance of `random_correlation_gen`, created with the
+# default seed.
+random_correlation = random_correlation_gen()
+
+
+class random_correlation_frozen(multi_rv_frozen):
+    __class_getitem__ = None
+
+    def __init__(self, eigs, seed=None, tol=1e-13, diag_tol=1e-7):
+        """Create a frozen random correlation matrix distribution.
+
+        The eigenvalues are validated once here and stored, together with
+        both tolerances, for later calls to `rvs`.
+
+        Parameters
+        ----------
+        eigs : 1d ndarray
+            Eigenvalues of correlation matrix
+        seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
+            If `seed` is None (or `np.random`), the `numpy.random.RandomState`
+            singleton is used.
+            If `seed` is an int, a new ``RandomState`` instance is used,
+            seeded with `seed`.
+            If `seed` is already a ``Generator`` or ``RandomState`` instance
+            then that instance is used.
+        tol : float, optional
+            Tolerance for input parameter checks
+        diag_tol : float, optional
+            Tolerance for deviation of the diagonal of the resulting
+            matrix. Default: 1e-7
+
+        Raises
+        ------
+        ValueError
+            If `eigs` fails the checks in
+            `random_correlation_gen._process_parameters`.
+        """ # numpy/numpydoc#87  # noqa: E501
+
+        self.tol = tol
+        self.diag_tol = diag_tol
+        self._dist = random_correlation_gen(seed)
+        validated = self._dist._process_parameters(eigs, tol=self.tol)
+        self.eigs = validated[1]
+
+    def rvs(self, random_state=None):
+        # Delegate to the generator with the frozen eigenvalues/tolerances.
+        frozen_kwargs = dict(tol=self.tol, diag_tol=self.diag_tol)
+        return self._dist.rvs(self.eigs, random_state=random_state,
+                              **frozen_kwargs)
+
+
+class unitary_group_gen(multi_rv_generic):
+    r"""A matrix-valued U(N) random variable.
+
+    Return a random unitary matrix.
+
+    The `dim` keyword specifies the dimension N.
+
+    Methods
+    -------
+    rvs(dim, size=1, random_state=None)
+        Draw random samples from U(N).
+
+    Parameters
+    ----------
+    dim : scalar
+        Dimension of matrices.
+    seed : {None, int, np.random.RandomState, np.random.Generator}, optional
+        Used for drawing random variates.
+        If `seed` is `None`, the `~np.random.RandomState` singleton is used.
+        If `seed` is an int, a new ``RandomState`` instance is used, seeded
+        with seed.
+        If `seed` is already a ``RandomState`` or ``Generator`` instance,
+        then that object is used.
+        Default is `None`.
+
+    Notes
+    -----
+    This class is similar to `ortho_group`.
+
+    References
+    ----------
+    .. [1] F. Mezzadri, "How to generate random matrices from the classical
+           compact groups", :arXiv:`math-ph/0609050v2`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import unitary_group
+    >>> x = unitary_group.rvs(3)
+
+    >>> np.dot(x, x.conj().T)
+    array([[  1.00000000e+00,   1.13231364e-17,  -2.86852790e-16],
+           [  1.13231364e-17,   1.00000000e+00,  -1.46845020e-16],
+           [ -2.86852790e-16,  -1.46845020e-16,   1.00000000e+00]])  # may vary
+
+    This generates one random matrix from U(3). The dot product confirms that
+    it is unitary up to machine precision.
+
+    Alternatively, the object may be called (as a function) to fix the `dim`
+    parameter, return a "frozen" unitary_group random variable:
+
+    >>> rv = unitary_group(5)
+
+    See Also
+    --------
+    ortho_group
+
+    """
+
+    def __init__(self, seed=None):
+        """Initialise the generator and post-process the class docstring."""
+        super().__init__(seed)
+        self.__doc__ = doccer.docformat(self.__doc__)
+
+    def __call__(self, dim=None, seed=None):
+        """Create a frozen (U(N)) n-dimensional unitary matrix distribution.
+
+        See `unitary_group_frozen` for more information.
+        """
+        return unitary_group_frozen(dim, seed=seed)
+
+    def _process_parameters(self, dim):
+        """Dimension N must be specified; it cannot be inferred.
+
+        Returns
+        -------
+        dim : int
+            The validated dimension as a built-in ``int``.
+
+        Raises
+        ------
+        ValueError
+            If `dim` is ``None``, not a scalar, negative or not integral.
+        """
+        if dim is None or not np.isscalar(dim) or dim < 0 or dim != int(dim):
+            raise ValueError("Dimension of rotation must be specified, "
+                             "and must be a scalar nonnegative integer.")
+
+        # An integral float such as 3.0 passes the check above; coerce it so
+        # it can be used as an array extent when sampling.
+        return int(dim)
+
+    def rvs(self, dim, size=1, random_state=None):
+        """Draw random samples from U(N).
+
+        Parameters
+        ----------
+        dim : integer
+            Dimension of space (N).
+        size : integer, optional
+            Number of samples to draw (default 1).
+        random_state : optional
+            Passed to ``self._get_random_state`` to obtain the generator
+            used for the normal draws.
+
+        Returns
+        -------
+        rvs : ndarray or scalar
+            Random size N-dimensional matrices, dimension (size, dim, dim).
+            The leading axis is omitted unless ``size > 1``.
+
+        """
+        random_state = self._get_random_state(random_state)
+
+        size = int(size)
+
+        dim = self._process_parameters(dim)
+
+        # Only add a leading sample axis when more than one draw is wanted.
+        size = (size,) if size > 1 else ()
+        # Complex matrix whose real and imaginary parts are independent
+        # normal draws, scaled by 1/sqrt(2).
+        z = 1/math.sqrt(2)*(random_state.normal(size=size + (dim, dim)) +
+                            1j*random_state.normal(size=size + (dim, dim)))
+        q, r = np.linalg.qr(z)
+        # The last two dimensions are the rows and columns of R matrices.
+        # Extract the diagonals. Note that this eliminates a dimension.
+        d = r.diagonal(offset=0, axis1=-2, axis2=-1)
+        # Multiply each column of Q by the unit-modulus phase of the
+        # matching diagonal entry of R; the new axis makes the phases
+        # broadcast across rows.
+        q *= (d/abs(d))[..., np.newaxis, :]  # to broadcast properly
+        return q
+
+
+# Module-level instance of `unitary_group_gen`, created with the default
+# seed.
+unitary_group = unitary_group_gen()
+
+
+class unitary_group_frozen(multi_rv_frozen):
+    __class_getitem__ = None
+
+    def __init__(self, dim=None, seed=None):
+        """Create a frozen (U(N)) n-dimensional unitary matrix distribution.
+
+        The dimension is validated once here and reused by `rvs`.
+
+        Parameters
+        ----------
+        dim : scalar
+            Dimension of matrices
+        seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
+            If `seed` is None (or `np.random`), the `numpy.random.RandomState`
+            singleton is used.
+            If `seed` is an int, a new ``RandomState`` instance is used,
+            seeded with `seed`.
+            If `seed` is already a ``Generator`` or ``RandomState`` instance
+            then that instance is used.
+
+        Examples
+        --------
+        >>> from scipy.stats import unitary_group
+        >>> x = unitary_group(3)
+        >>> x.rvs()
+
+        """ # numpy/numpydoc#87  # noqa: E501
+        generator = unitary_group_gen(seed)
+        self._dist = generator
+        self.dim = generator._process_parameters(dim)
+
+    def rvs(self, size=1, random_state=None):
+        # Delegate to the generator using the frozen dimension.
+        return self._dist.rvs(self.dim, size=size, random_state=random_state)
+
+
+# Docstring fragments for the multivariate t-distribution. They are
+# substituted for the ``%(name)s`` placeholders in method docstrings through
+# the two dictionaries defined below.
+
+# Parameter descriptions shared by the generator's methods.
+_mvt_doc_default_callparams = """\
+loc : array_like, optional
+    Location of the distribution. (default ``0``)
+shape : array_like, optional
+    Positive semidefinite matrix of the distribution. (default ``1``)
+df : float, optional
+    Degrees of freedom of the distribution; must be greater than zero.
+    If ``np.inf`` then results are multivariate normal. The default is ``1``.
+allow_singular : bool, optional
+    Whether to allow a singular matrix. (default ``False``)
+"""
+
+# Note on accepted forms of `loc` and `shape`.
+_mvt_doc_callparams_note = """\
+Setting the parameter `loc` to ``None`` is equivalent to having `loc`
+be the zero-vector. The parameter `shape` can be a scalar, in which case
+the shape matrix is the identity times that value, a vector of
+diagonal entries for the shape matrix, or a two-dimensional array_like.
+"""
+
+# Short replacement note used for frozen-distribution docstrings.
+_mvt_doc_frozen_callparams_note = """\
+See class definition for a detailed description of parameters."""
+
+# Substitutions for docstrings of methods that take the distribution
+# parameters explicitly.
+mvt_docdict_params = {
+    '_mvt_doc_default_callparams': _mvt_doc_default_callparams,
+    '_mvt_doc_callparams_note': _mvt_doc_callparams_note,
+    '_doc_random_state': _doc_random_state
+}
+
+# Substitutions for frozen-method docstrings: the parameter list is blanked
+# out and the note is replaced by the short frozen note.
+mvt_docdict_noparams = {
+    '_mvt_doc_default_callparams': "",
+    '_mvt_doc_callparams_note': _mvt_doc_frozen_callparams_note,
+    '_doc_random_state': _doc_random_state
+}
+
+
+class multivariate_t_gen(multi_rv_generic):
+    r"""A multivariate t-distributed random variable.
+
+    The `loc` parameter specifies the location. The `shape` parameter specifies
+    the positive semidefinite shape matrix. The `df` parameter specifies the
+    degrees of freedom.
+
+    In addition to calling the methods below, the object itself may be called
+    as a function to fix the location, shape matrix, and degrees of freedom
+    parameters, returning a "frozen" multivariate t-distributed random
+    variable.
+
+    Methods
+    -------
+    pdf(x, loc=None, shape=1, df=1, allow_singular=False)
+        Probability density function.
+    logpdf(x, loc=None, shape=1, df=1, allow_singular=False)
+        Log of the probability density function.
+    cdf(x, loc=None, shape=1, df=1, allow_singular=False, *,
+        maxpts=None, lower_limit=None, random_state=None)
+        Cumulative distribution function.
+    rvs(loc=None, shape=1, df=1, size=1, random_state=None)
+        Draw random samples from a multivariate t-distribution.
+    entropy(loc=None, shape=1, df=1)
+        Differential entropy of a multivariate t-distribution.
+    marginal(dimensions, loc=None, shape=1, df=1, allow_singular=False)
+        Return a marginal multivariate t-distribution.
+
+    Parameters
+    ----------
+    %(_mvt_doc_default_callparams)s
+    %(_doc_random_state)s
+
+    Notes
+    -----
+    %(_mvt_doc_callparams_note)s
+    The matrix `shape` must be a (symmetric) positive semidefinite matrix. The
+    determinant and inverse of `shape` are computed as the pseudo-determinant
+    and pseudo-inverse, respectively, so that `shape` does not need to have
+    full rank.
+
+    The probability density function for `multivariate_t` is
+
+    .. math::
+
+        f(x) = \frac{\Gamma((\nu + p)/2)}{\Gamma(\nu/2)\nu^{p/2}\pi^{p/2}|\Sigma|^{1/2}}
+               \left[1 + \frac{1}{\nu} (\mathbf{x} - \boldsymbol{\mu})^{\top}
+               \boldsymbol{\Sigma}^{-1}
+               (\mathbf{x} - \boldsymbol{\mu}) \right]^{-(\nu + p)/2},
+
+    where :math:`p` is the dimension of :math:`\mathbf{x}`,
+    :math:`\boldsymbol{\mu}` is the :math:`p`-dimensional location,
+    :math:`\boldsymbol{\Sigma}` the :math:`p \times p`-dimensional shape
+    matrix, and :math:`\nu` is the degrees of freedom.
+
+    .. versionadded:: 1.6.0
+
+    References
+    ----------
+    .. [1] Arellano-Valle et al. "Shannon Entropy and Mutual Information for
+           Multivariate Skew-Elliptical Distributions". Scandinavian Journal
+           of Statistics. Vol. 40, issue 1.
+
+    Examples
+    --------
+    The object may be called (as a function) to fix the `loc`, `shape`,
+    `df`, and `allow_singular` parameters, returning a "frozen"
+    multivariate_t random variable:
+
+    >>> import numpy as np
+    >>> from scipy.stats import multivariate_t
+    >>> rv = multivariate_t([1.0, -0.5], [[2.1, 0.3], [0.3, 1.5]], df=2)
+    >>> # Frozen object with the same methods but holding the given location,
+    >>> # scale, and degrees of freedom fixed.
+
+    Create a contour plot of the PDF.
+
+    >>> import matplotlib.pyplot as plt
+    >>> x, y = np.mgrid[-1:3:.01, -2:1.5:.01]
+    >>> pos = np.dstack((x, y))
+    >>> fig, ax = plt.subplots(1, 1)
+    >>> ax.set_aspect('equal')
+    >>> plt.contourf(x, y, rv.pdf(pos))
+
+    """
+
+    def __init__(self, seed=None):
+        """Initialize a multivariate t-distributed random variable.
+
+        Besides calling the parent initialiser, this fills in the class
+        docstring placeholders and stores a random state built from `seed`.
+
+        Parameters
+        ----------
+        seed : Random state.
+
+        """
+        super().__init__(seed)
+        formatted_doc = doccer.docformat(self.__doc__, mvt_docdict_params)
+        self.__doc__ = formatted_doc
+        self._random_state = check_random_state(seed)
+
+    def __call__(self, loc=None, shape=1, df=1, allow_singular=False,
+                 seed=None):
+        """Create a frozen multivariate t-distribution.
+
+        When `df` equals ``np.inf`` a frozen multivariate normal with
+        ``mean=loc`` and ``cov=shape`` is returned instead.
+
+        See `multivariate_t_frozen` for parameters.
+        """
+        is_gaussian_limit = (df == np.inf)
+        if not is_gaussian_limit:
+            return multivariate_t_frozen(loc=loc, shape=shape, df=df,
+                                         allow_singular=allow_singular,
+                                         seed=seed)
+        return multivariate_normal_frozen(mean=loc, cov=shape,
+                                          allow_singular=allow_singular,
+                                          seed=seed)
+
+    def pdf(self, x, loc=None, shape=1, df=1, allow_singular=False):
+        """Multivariate t-distribution probability density function.
+
+        Parameters
+        ----------
+        x : array_like
+            Points at which to evaluate the probability density function.
+        %(_mvt_doc_default_callparams)s
+
+        Returns
+        -------
+        pdf : Probability density function evaluated at `x`.
+
+        Examples
+        --------
+        >>> from scipy.stats import multivariate_t
+        >>> x = [0.4, 5]
+        >>> loc = [0, 1]
+        >>> shape = [[1, 0.1], [0.1, 1]]
+        >>> df = 7
+        >>> multivariate_t.pdf(x, loc, shape, df)
+        0.00075713
+
+        """
+        dim, loc, shape, df = self._process_parameters(loc, shape, df)
+        x = self._process_quantiles(x, dim)
+        shape_info = _PSD(shape, allow_singular=allow_singular)
+        # `_logpdf` hands an infinite `df` over to the multivariate normal,
+        # which needs a covariance object; without it that branch would
+        # receive ``None``. Built the same way as in `logpdf`.
+        cov_object = _covariance.CovViaPSD(shape_info)
+        logpdf = self._logpdf(x, loc, shape_info.U, shape_info.log_pdet, df,
+                              dim, shape_info.rank, cov_object)
+        return np.exp(logpdf)
+
+    def logpdf(self, x, loc=None, shape=1, df=1):
+        """Log of the multivariate t-distribution probability density function.
+
+        Parameters
+        ----------
+        x : array_like
+            Points at which to evaluate the log of the probability density
+            function.
+        %(_mvt_doc_default_callparams)s
+
+        Returns
+        -------
+        logpdf : Log of the probability density function evaluated at `x`.
+
+        Examples
+        --------
+        >>> from scipy.stats import multivariate_t
+        >>> x = [0.4, 5]
+        >>> loc = [0, 1]
+        >>> shape = [[1, 0.1], [0.1, 1]]
+        >>> df = 7
+        >>> multivariate_t.logpdf(x, loc, shape, df)
+        -7.1859802
+
+        See Also
+        --------
+        pdf : Probability density function.
+
+        """
+        dim, loc, shape, df = self._process_parameters(loc, shape, df)
+        quantiles = self._process_quantiles(x, dim)
+
+        psd = _PSD(shape)
+        # Only consulted by `_logpdf` when `df` is infinite.
+        normal_cov = _covariance.CovViaPSD(psd)
+
+        return self._logpdf(quantiles, loc, psd.U, psd.log_pdet, df, dim,
+                            psd.rank, normal_cov)
+
+    def _logpdf(self, x, loc, prec_U, log_pdet, df, dim, rank, cov_object=None):
+        """Utility method `pdf`, `logpdf` for parameters.
+
+        Parameters
+        ----------
+        x : ndarray
+            Points at which to evaluate the log of the probability density
+            function.
+        loc : ndarray
+            Location of the distribution.
+        prec_U : ndarray
+            A decomposition such that `np.dot(prec_U, prec_U.T)` is the inverse
+            of the shape matrix.
+        log_pdet : float
+            Logarithm of the determinant of the shape matrix.
+        df : float
+            Degrees of freedom of the distribution.
+        dim : int
+            Dimension of the quantiles x.
+        rank : int
+            Rank of the shape matrix.
+        cov_object : optional
+            Only used when `df` equals ``np.inf``; it is then forwarded to
+            ``multivariate_normal._logpdf``.
+
+        Notes
+        -----
+        As this function does no argument checking, it should not be called
+        directly; use 'logpdf' instead.
+
+        """
+        # Infinite degrees of freedom: defer to the multivariate normal.
+        if df == np.inf:
+            return multivariate_normal._logpdf(x, loc, cov_object)
+
+        centered = x - loc
+        # Squared Mahalanobis-type distance computed through `prec_U`.
+        maha = np.square(np.dot(centered, prec_U)).sum(axis=-1)
+
+        half_sum = 0.5 * (df + dim)
+        log_gamma_num = gammaln(half_sum)
+        log_gamma_den = gammaln(0.5 * df)
+        log_scale = dim/2. * np.log(df * np.pi)
+        half_log_pdet = 0.5 * log_pdet
+        log_kernel = -half_sum * np.log(1 + (1./df) * maha)
+
+        return _squeeze_output(
+            log_gamma_num - log_gamma_den - log_scale - half_log_pdet
+            + log_kernel
+        )
+
+    def _cdf(self, x, loc, shape, df, dim, maxpts=None, lower_limit=None,
+             random_state=None):
+        """Evaluate the CDF for already-processed distribution parameters.
+
+        Parameters
+        ----------
+        x : array_like
+            Upper integration limits; passed through `_process_quantiles`.
+        loc : ndarray
+            Location vector; subtracted from both integration limits.
+        shape : ndarray
+            Shape matrix handed to `_qmvt`.
+        df : float
+            Degrees of freedom handed to `_qmvt`.
+        dim : int
+            Dimension of the distribution.
+        maxpts : int, optional
+            Falls back to ``1000 * dim`` when falsy (``None`` or ``0``).
+        lower_limit : array_like, optional
+            Lower integration limits; negative infinity when ``None``.
+        random_state : optional
+            When not ``None`` it is converted with `check_random_state`;
+            otherwise ``self._random_state`` is used.
+
+        Returns
+        -------
+        Result of ``_squeeze_output`` applied to the signed integrals.
+        """
+
+        # All of this -  random state validation, maxpts, apply_along_axis,
+        # etc. needs to go in this private method unless we want
+        # frozen distribution's `cdf` method to duplicate it or call `cdf`,
+        # which would require re-processing parameters
+        if random_state is not None:
+            rng = check_random_state(random_state)
+        else:
+            rng = self._random_state
+
+        if not maxpts:
+            maxpts = 1000 * dim
+
+        x = self._process_quantiles(x, dim)
+        lower_limit = (np.full(loc.shape, -np.inf)
+                       if lower_limit is None else lower_limit)
+
+        # remove the mean
+        x, lower_limit = x - loc, lower_limit - loc
+
+        # Where an upper limit lies below its lower limit the two are
+        # swapped, and the sign of the result is flipped once per swap.
+        b, a = np.broadcast_arrays(x, lower_limit)
+        i_swap = b < a
+        signs = (-1)**(i_swap.sum(axis=-1))  # odd # of swaps -> negative
+        # Copy before the in-place swap; the arrays above come straight
+        # from `np.broadcast_arrays`.
+        a, b = a.copy(), b.copy()
+        a[i_swap], b[i_swap] = b[i_swap], a[i_swap]
+        n = x.shape[-1]
+        # Pack lower and upper limits side by side so `apply_along_axis`
+        # can hand one point's limits to `func1d` as a single 1-D slice.
+        limits = np.concatenate((a, b), axis=-1)
+
+        def func1d(limits):
+            a, b = limits[:n], limits[n:]
+            # NOTE(review): `_qmvt` is defined outside this view; element 0
+            # of its result is presumably the probability estimate - confirm.
+            return _qmvt(maxpts, df, shape, a, b, rng)[0]
+
+        res = np.apply_along_axis(func1d, -1, limits) * signs
+        # Fixing the output shape for existing distributions is a separate
+        # issue. For now, let's keep this consistent with pdf.
+        return _squeeze_output(res)
+
+    def cdf(self, x, loc=None, shape=1, df=1, allow_singular=False, *,
+            maxpts=None, lower_limit=None, random_state=None):
+        """Multivariate t-distribution cumulative distribution function.
+
+        Parameters
+        ----------
+        x : array_like
+            Points at which to evaluate the cumulative distribution function.
+        %(_mvt_doc_default_callparams)s
+        maxpts : int, optional
+            Maximum number of points to use for integration. The default is
+            1000 times the number of dimensions.
+        lower_limit : array_like, optional
+            Lower limit of integration of the cumulative distribution function.
+            Default is negative infinity. Must be broadcastable with `x`.
+        %(_doc_random_state)s
+
+        Returns
+        -------
+        cdf : ndarray or scalar
+            Cumulative distribution function evaluated at `x`.
+
+        Examples
+        --------
+        >>> from scipy.stats import multivariate_t
+        >>> x = [0.4, 5]
+        >>> loc = [0, 1]
+        >>> shape = [[1, 0.1], [0.1, 1]]
+        >>> df = 7
+        >>> multivariate_t.cdf(x, loc, shape, df)
+        0.64798491
+
+        """
+        dim, loc, shape, df = self._process_parameters(loc, shape, df)
+        # Validate the shape matrix via `_PSD` and use the matrix it stores.
+        shape_matrix = _PSD(shape, allow_singular=allow_singular)._M
+
+        return self._cdf(x, loc, shape_matrix, df, dim, maxpts,
+                         lower_limit, random_state)
+
+    def _entropy(self, dim, df=1, shape=1):
+        """Differential entropy for already-processed parameters.
+
+        For ``df == np.inf`` the entropy of a multivariate normal with
+        covariance `shape` is returned. Otherwise the closed-form expression
+        in `regular` is used, except where `df` reaches the threshold
+        computed below, in which case the series in `asymptotic` is used.
+        """
+        if df == np.inf:
+            return multivariate_normal(None, cov=shape).entropy()
+
+        shape_info = _PSD(shape)
+        # Half the log pseudo-determinant of the shape matrix; added to
+        # both formulas below.
+        shape_term = 0.5 * shape_info.log_pdet
+
+        def regular(dim, df):
+            # Closed form in terms of log-gamma and digamma functions.
+            halfsum = 0.5 * (dim + df)
+            half_df = 0.5 * df
+            return (
+                -gammaln(halfsum) + gammaln(half_df)
+                + 0.5 * dim * np.log(df * np.pi) + halfsum
+                * (psi(halfsum) - psi(half_df))
+                + shape_term
+            )
+
+        def asymptotic(dim, df):
+            # Formula from Wolfram Alpha:
+            # "asymptotic expansion -gammaln((m+d)/2) + gammaln(d/2) + (m*log(d*pi))/2
+            #  + ((m+d)/2) * (digamma((m+d)/2) - digamma(d/2))"
+            return (
+                dim * norm._entropy() + dim / df
+                - dim * (dim - 2) * df**-2.0 / 4
+                + dim**2 * (dim - 2) * df**-3.0 / 6
+                + dim * (-3 * dim**3 + 8 * dim**2 - 8) * df**-4.0 / 24
+                + dim**2 * (3 * dim**3 - 10 * dim**2 + 16) * df**-5.0 / 30
+                + shape_term
+            )[()]
+
+        # preserves ~12 digits accuracy up to at least `dim=1e5`. See gh-18465.
+        threshold = dim * 100 * 4 / (np.log(dim) + 1)
+        # NOTE(review): `xpx.apply_where` is defined outside this view;
+        # presumably it evaluates `asymptotic` where the condition holds and
+        # `regular` elsewhere - confirm against its documentation.
+        return xpx.apply_where(df >= threshold, (dim, df), asymptotic, regular)
+
+    def entropy(self, loc=None, shape=1, df=1):
+        """Calculate the differential entropy of a multivariate
+        t-distribution.
+
+        The `loc` argument is accepted but not used in the computation.
+
+        Parameters
+        ----------
+        %(_mvt_doc_default_callparams)s
+
+        Returns
+        -------
+        h : float
+            Differential entropy
+
+        """
+        # `None` is passed for the location on purpose; only the dimension,
+        # shape matrix and degrees of freedom are needed below.
+        processed = self._process_parameters(None, shape, df)
+        dim, _, shape, df = processed
+        return self._entropy(dim, df, shape)
+
+    def rvs(self, loc=None, shape=1, df=1, size=1, random_state=None):
+        """Draw random samples from a multivariate t-distribution.
+
+        Parameters
+        ----------
+        %(_mvt_doc_default_callparams)s
+        size : integer, optional
+            Number of samples to draw (default 1).
+        %(_doc_random_state)s
+
+        Returns
+        -------
+        rvs : ndarray or scalar
+            Random variates of size (`size`, `P`), where `P` is the
+            dimension of the random variable.
+
+        Examples
+        --------
+        >>> from scipy.stats import multivariate_t
+        >>> x = [0.4, 5]
+        >>> loc = [0, 1]
+        >>> shape = [[1, 0.1], [0.1, 1]]
+        >>> df = 7
+        >>> multivariate_t.rvs(loc, shape, df)
+        array([[0.93477495, 3.00408716]])
+
+        """
+        # For implementation details, see equation (3):
+        #
+        #    Hofert, "On Sampling from the Multivariate t Distribution", 2013
+        #     http://rjournal.github.io/archive/2013-2/hofert.pdf
+        #
+        dim, loc, shape, df = self._process_parameters(loc, shape, df)
+        rng = (self._random_state if random_state is None
+               else check_random_state(random_state))
+
+        # Chi-square mixing variable divided by `df`; with infinite `df`
+        # it is fixed at one, leaving plain normal samples.
+        mixing = (np.ones(size) if np.isinf(df)
+                  else rng.chisquare(df, size=size) / df)
+
+        gaussian = rng.multivariate_normal(np.zeros(dim), shape, size=size)
+        # The trailing axis lets one mixing value scale a whole sample.
+        return _squeeze_output(loc + gaussian / np.sqrt(mixing)[..., None])
+
+    def _process_quantiles(self, x, dim):
+        """
+        Convert `x` to a float array whose last axis indexes the components
+        of each data point.
+
+        A 0-d input becomes a length-1 vector. A 1-d input is read as many
+        scalar points when ``dim == 1`` and as one point otherwise. Inputs
+        with two or more axes are returned unchanged (after conversion).
+        """
+        x = np.asarray(x, dtype=float)
+        if x.ndim == 0:
+            return x[np.newaxis]
+        if x.ndim == 1:
+            return x[:, np.newaxis] if dim == 1 else x[np.newaxis, :]
+        return x
+
+    def _process_parameters(self, loc, shape, df):
+        """
+        Infer dimensionality from location array and shape matrix, handle
+        defaults, and ensure compatible dimensions.
+
+        Parameters
+        ----------
+        loc : array_like or None
+            Location; ``None`` means a zero vector.
+        shape : array_like or None
+            Shape matrix given as a scalar, a vector of diagonal entries or
+            a square matrix; ``None`` means the identity.
+        df : float or None
+            Degrees of freedom; ``None`` means ``1``.
+
+        Returns
+        -------
+        dim : int
+        loc : ndarray of shape (dim,)
+        shape : ndarray of shape (dim, dim)
+        df : float
+
+        Raises
+        ------
+        ValueError
+            If `loc` is not a vector of length `dim`, if `shape` is not
+            compatible with `dim`, or if `df` is NaN or not positive.
+        """
+        if loc is None and shape is None:
+            loc = np.asarray(0, dtype=float)
+            shape = np.asarray(1, dtype=float)
+            dim = 1
+        elif loc is None:
+            # Dimension comes from the shape matrix alone.
+            shape = np.asarray(shape, dtype=float)
+            if shape.ndim < 2:
+                dim = 1
+            else:
+                dim = shape.shape[0]
+            loc = np.zeros(dim)
+        elif shape is None:
+            loc = np.asarray(loc, dtype=float)
+            dim = loc.size
+            shape = np.eye(dim)
+        else:
+            shape = np.asarray(shape, dtype=float)
+            loc = np.asarray(loc, dtype=float)
+            dim = loc.size
+
+        if dim == 1:
+            # Promote scalars to a length-1 vector and a 1x1 matrix.
+            loc = loc.reshape(1)
+            shape = shape.reshape(1, 1)
+
+        if loc.ndim != 1 or loc.shape[0] != dim:
+            raise ValueError(f"Array 'loc' must be a vector of length {dim}.")
+        if shape.ndim == 0:
+            shape = shape * np.eye(dim)
+        elif shape.ndim == 1:
+            shape = np.diag(shape)
+        elif shape.ndim == 2 and shape.shape != (dim, dim):
+            # The messages name the argument `shape`, which is what callers
+            # of this distribution actually pass.
+            rows, cols = shape.shape
+            if rows != cols:
+                msg = ("Array 'shape' must be square if it is two "
+                       f"dimensional, but shape.shape = {shape.shape}.")
+            else:
+                msg = ("Dimension mismatch: array 'shape' is of shape "
+                       f"{shape.shape}, but 'loc' is a vector of length "
+                       f"{len(loc)}.")
+            raise ValueError(msg)
+        elif shape.ndim > 2:
+            raise ValueError("Array 'shape' must be at most two-dimensional, "
+                             f"but shape.ndim = {shape.ndim}")
+
+        # Process degrees of freedom.
+        if df is None:
+            df = 1
+        elif df <= 0:
+            raise ValueError("'df' must be greater than zero.")
+        elif np.isnan(df):
+            # NaN compares False against zero, so it needs its own check.
+            raise ValueError("'df' is 'nan' but must be greater than zero or 'np.inf'.")
+
+        return dim, loc, shape, df
+
+    def marginal(self, dimensions, loc=None, shape=1, df=1, allow_singular=False):
+        """Return a marginal multivariate t-distribution.
+
+        Parameters
+        ----------
+        dimensions : int or 1-d array_like
+            The dimensions of the multivariate t corresponding
+            with the marginal variables, that is, the indices of the dimensions
+            that are being retained. The other dimensions are marginalized out.
+        loc : array_like, optional
+            Location of the distribution. (default ``0``)
+        shape : array_like, optional
+            Positive semidefinite matrix of the distribution. (default ``1``)
+        df : float, optional
+            Degrees of freedom of the distribution; must be greater than zero.
+            The default is ``1``.
+        allow_singular : bool, optional
+            Whether to allow a singular matrix. (default ``False``)
+
+        Returns
+        -------
+        marginal_multivariate_t : multivariate_t_frozen
+            An object representing the marginal t-distribution.
+
+        Notes
+        -----
+        The parameter descriptions are written out in full here because this
+        docstring is not passed through the placeholder substitution that
+        the other methods of this class receive.
+        """
+        n, loc, shape, df = self._process_parameters(loc, shape, df)
+        dims = _validate_marginal_input(dimensions, n)
+
+        # Keep only the retained components of the location vector and the
+        # matching rows/columns of the shape matrix.
+        loc = loc[dims]
+        shape = shape[np.ix_(dims, dims)]
+
+        return multivariate_t_frozen(loc, shape, df, allow_singular)
+
+
+class multivariate_t_frozen(multi_rv_frozen):
+    __class_getitem__ = None
+
+    def __init__(self, loc=None, shape=1, df=1, allow_singular=False,
+                 seed=None):
+        """Create a frozen multivariate t distribution.
+
+        Parameters
+        ----------
+        loc : array_like, optional
+            Location of the distribution. (default ``0``)
+        shape : array_like, optional
+            Positive semidefinite matrix of the distribution. (default ``1``)
+        df : float, optional
+            Degrees of freedom of the distribution; must be greater than zero.
+            The default is ``1``.
+        allow_singular : bool, optional
+            Whether to allow a singular matrix. (default ``False``)
+        seed : optional
+            Passed to the underlying `multivariate_t_gen` instance.
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> from scipy.stats import multivariate_t
+        >>> loc = np.zeros(3)
+        >>> shape = np.eye(3)
+        >>> df = 10
+        >>> dist = multivariate_t(loc, shape, df)
+        >>> dist.rvs()
+        array([[ 0.81412036, -1.53612361,  0.42199647]])
+        >>> dist.pdf([1, 1, 1])
+        array([0.01237803])
+
+        """
+        self._dist = multivariate_t_gen(seed)
+        dim, loc, shape, df = self._dist._process_parameters(loc, shape, df)
+        self.dim, self.loc, self.shape, self.df = dim, loc, shape, df
+        # Decomposition of the shape matrix, computed once and reused by
+        # `logpdf`/`pdf`.
+        self.shape_info = _PSD(shape, allow_singular=allow_singular)
+        self.allow_singular = allow_singular
+
+    def logpdf(self, x):
+        x = self._dist._process_quantiles(x, self.dim)
+        U = self.shape_info.U
+        log_pdet = self.shape_info.log_pdet
+        # `_logpdf` defers to the multivariate normal when `df` is infinite
+        # and needs a covariance object for that branch. A frozen object
+        # with infinite `df` can arise from direct construction or from
+        # `multivariate_t_gen.marginal`.
+        cov_object = (_covariance.CovViaPSD(self.shape_info)
+                      if self.df == np.inf else None)
+        return self._dist._logpdf(x, self.loc, U, log_pdet, self.df, self.dim,
+                                  self.shape_info.rank, cov_object)
+
+    def cdf(self, x, *, maxpts=None, lower_limit=None, random_state=None):
+        x = self._dist._process_quantiles(x, self.dim)
+        return self._dist._cdf(x, self.loc, self.shape, self.df, self.dim,
+                               maxpts, lower_limit, random_state)
+
+    def pdf(self, x):
+        return np.exp(self.logpdf(x))
+
+    def rvs(self, size=1, random_state=None):
+        return self._dist.rvs(loc=self.loc,
+                              shape=self.shape,
+                              df=self.df,
+                              size=size,
+                              random_state=random_state)
+
+    def entropy(self):
+        return self._dist._entropy(self.dim, self.df, self.shape)
+
+    def marginal(self, dimensions):
+        """Return the marginal distribution over the given dimensions.
+
+        Delegates to `multivariate_t_gen.marginal` with the frozen
+        parameters.
+        """
+        return self._dist.marginal(dimensions, self.loc,
+                                   self.shape, self.df, self.allow_singular)
+
+# Module-level generator instance.
+multivariate_t = multivariate_t_gen()
+
+
+# Set frozen generator docstrings from corresponding docstrings in
+# multivariate_t_gen and fill in default strings in class docstrings
+#
+# NOTE: `marginal` is not in this list, so neither the generator's nor the
+# frozen class's `marginal` docstring is touched by this loop.
+for name in ['logpdf', 'pdf', 'rvs', 'cdf', 'entropy']:
+    method = multivariate_t_gen.__dict__[name]
+    method_frozen = multivariate_t_frozen.__dict__[name]
+    # The frozen docstring must be rendered first: the next statement
+    # overwrites the unformatted template stored on the generator method.
+    method_frozen.__doc__ = doccer.docformat(method.__doc__,
+                                             mvt_docdict_noparams)
+    method.__doc__ = doccer.docformat(method.__doc__, mvt_docdict_params)
+
+
+# Docstring fragments for the multivariate hypergeometric distribution.
+# They are substituted into the ``%(...)s`` placeholders of the class and
+# method docstrings through the two dictionaries defined below.
+
+# Parameter descriptions used in the generator's docstrings.
+_mhg_doc_default_callparams = """\
+m : array_like
+    The number of each type of object in the population.
+    That is, :math:`m[i]` is the number of objects of
+    type :math:`i`.
+n : array_like
+    The number of samples taken from the population.
+"""
+
+# "Notes" text used in the generator's docstrings.
+_mhg_doc_callparams_note = """\
+`m` must be an array of positive integers. If the quantile
+:math:`i` contains values out of the range :math:`[0, m_i]`
+where :math:`m_i` is the number of objects of type :math:`i`
+in the population or if the parameters are inconsistent with one
+another (e.g. ``x.sum() != n``), methods return the appropriate
+value (e.g. ``0`` for ``pmf``). If `m` or `n` contain negative
+values, the result will contain ``nan`` there.
+"""
+
+# Frozen variant: the parameter descriptions are replaced by an empty string.
+_mhg_doc_frozen_callparams = ""
+
+# Frozen variant of the "Notes" text.
+_mhg_doc_frozen_callparams_note = """\
+See class definition for a detailed description of parameters."""
+
+# Substitutions for the generator (full parameter documentation).
+mhg_docdict_params = {
+    '_doc_default_callparams': _mhg_doc_default_callparams,
+    '_doc_callparams_note': _mhg_doc_callparams_note,
+    '_doc_random_state': _doc_random_state
+}
+
+# Substitutions for the frozen distribution (parameters already fixed).
+mhg_docdict_noparams = {
+    '_doc_default_callparams': _mhg_doc_frozen_callparams,
+    '_doc_callparams_note': _mhg_doc_frozen_callparams_note,
+    '_doc_random_state': _doc_random_state
+}
+
+
+class multivariate_hypergeom_gen(multi_rv_generic):
+    r"""A multivariate hypergeometric random variable.
+
+    Methods
+    -------
+    pmf(x, m, n)
+        Probability mass function.
+    logpmf(x, m, n)
+        Log of the probability mass function.
+    rvs(m, n, size=1, random_state=None)
+        Draw random samples from a multivariate hypergeometric
+        distribution.
+    mean(m, n)
+        Mean of the multivariate hypergeometric distribution.
+    var(m, n)
+        Variance of the multivariate hypergeometric distribution.
+    cov(m, n)
+        Compute the covariance matrix of the multivariate
+        hypergeometric distribution.
+
+    Parameters
+    ----------
+    %(_doc_default_callparams)s
+    %(_doc_random_state)s
+
+    Notes
+    -----
+    %(_doc_callparams_note)s
+
+    The probability mass function for `multivariate_hypergeom` is
+
+    .. math::
+
+        P(X_1 = x_1, X_2 = x_2, \ldots, X_k = x_k) = \frac{\binom{m_1}{x_1}
+        \binom{m_2}{x_2} \cdots \binom{m_k}{x_k}}{\binom{M}{n}}, \\ \quad
+        (x_1, x_2, \ldots, x_k) \in \mathbb{N}^k \text{ with }
+        \sum_{i=1}^k x_i = n
+
+    where :math:`m_i` are the number of objects of type :math:`i`, :math:`M`
+    is the total number of objects in the population (sum of all the
+    :math:`m_i`), and :math:`n` is the size of the sample to be taken
+    from the population.
+
+    .. versionadded:: 1.6.0
+
+    Examples
+    --------
+    To evaluate the probability mass function of the multivariate
+    hypergeometric distribution, with a dichotomous population of size
+    :math:`10` and :math:`20`, at a sample of size :math:`12` with
+    :math:`8` objects of the first type and :math:`4` objects of the
+    second type, use:
+
+    >>> from scipy.stats import multivariate_hypergeom
+    >>> multivariate_hypergeom.pmf(x=[8, 4], m=[10, 20], n=12)
+    0.0025207176631464523
+
+    The `multivariate_hypergeom` distribution is identical to the
+    corresponding `hypergeom` distribution (tiny numerical differences
+    notwithstanding) when only two types (good and bad) of objects
+    are present in the population as in the example above. Consider
+    another example for a comparison with the hypergeometric distribution:
+
+    >>> from scipy.stats import hypergeom
+    >>> multivariate_hypergeom.pmf(x=[3, 1], m=[10, 5], n=4)
+    0.4395604395604395
+    >>> hypergeom.pmf(k=3, M=15, n=4, N=10)
+    0.43956043956044005
+
+    The functions ``pmf``, ``logpmf``, ``mean``, ``var``, ``cov``, and ``rvs``
+    support broadcasting, under the convention that the vector parameters
+    (``x``, ``m``, and ``n``) are interpreted as if each row along the last
+    axis is a single object. For instance, we can combine the previous two
+    calls to `multivariate_hypergeom` as
+
+    >>> multivariate_hypergeom.pmf(x=[[8, 4], [3, 1]], m=[[10, 20], [10, 5]],
+    ...                            n=[12, 4])
+    array([0.00252072, 0.43956044])
+
+    This broadcasting also works for ``cov``, where the output objects are
+    square matrices of size ``m.shape[-1]``. For example:
+
+    >>> multivariate_hypergeom.cov(m=[[7, 9], [10, 15]], n=[8, 12])
+    array([[[ 1.05, -1.05],
+            [-1.05,  1.05]],
+           [[ 1.56, -1.56],
+            [-1.56,  1.56]]])
+
+    That is, ``result[0]`` is equal to
+    ``multivariate_hypergeom.cov(m=[7, 9], n=8)`` and ``result[1]`` is equal
+    to ``multivariate_hypergeom.cov(m=[10, 15], n=12)``.
+
+    Alternatively, the object may be called (as a function) to fix the `m`
+    and `n` parameters, returning a "frozen" multivariate hypergeometric
+    random variable.
+
+    >>> rv = multivariate_hypergeom(m=[10, 20], n=12)
+    >>> rv.pmf(x=[8, 4])
+    0.0025207176631464523
+
+    See Also
+    --------
+    scipy.stats.hypergeom : The hypergeometric distribution.
+    scipy.stats.multinomial : The multinomial distribution.
+
+    References
+    ----------
+    .. [1] The Multivariate Hypergeometric Distribution,
+           http://www.randomservices.org/random/urn/MultiHypergeometric.html
+    .. [2] Thomas J. Sargent and John Stachurski, 2020,
+           Multivariate Hypergeometric Distribution
+           https://python.quantecon.org/multi_hyper.html
+    """
+    def __init__(self, seed=None):
+        super().__init__(seed)
+        self.__doc__ = doccer.docformat(self.__doc__, mhg_docdict_params)
+
+    def __call__(self, m, n, seed=None):
+        """Create a frozen multivariate_hypergeom distribution.
+
+        See `multivariate_hypergeom_frozen` for more information.
+        """
+        return multivariate_hypergeom_frozen(m, n, seed=seed)
+
+    def _process_parameters(self, m, n):
+        m = np.asarray(m)
+        n = np.asarray(n)
+        if m.size == 0:
+            m = m.astype(int)
+        if n.size == 0:
+            n = n.astype(int)
+        if not np.issubdtype(m.dtype, np.integer):
+            raise TypeError("'m' must an array of integers.")
+        if not np.issubdtype(n.dtype, np.integer):
+            raise TypeError("'n' must an array of integers.")
+        if m.ndim == 0:
+            raise ValueError("'m' must be an array with"
+                             " at least one dimension.")
+
+        # check for empty arrays
+        if m.size != 0:
+            n = n[..., np.newaxis]
+
+        m, n = np.broadcast_arrays(m, n)
+
+        # check for empty arrays
+        if m.size != 0:
+            n = n[..., 0]
+
+        mcond = m < 0
+
+        M = m.sum(axis=-1)
+
+        ncond = (n < 0) | (n > M)
+        return M, m, n, mcond, ncond, np.any(mcond, axis=-1) | ncond
+
+    def _process_quantiles(self, x, M, m, n):
+        x = np.asarray(x)
+        if not np.issubdtype(x.dtype, np.integer):
+            raise TypeError("'x' must an array of integers.")
+        if x.ndim == 0:
+            raise ValueError("'x' must be an array with"
+                             " at least one dimension.")
+        if not x.shape[-1] == m.shape[-1]:
+            raise ValueError(f"Size of each quantile must be size of 'm': "
+                             f"received {x.shape[-1]}, "
+                             f"but expected {m.shape[-1]}.")
+
+        # check for empty arrays
+        if m.size != 0:
+            n = n[..., np.newaxis]
+            M = M[..., np.newaxis]
+
+        x, m, n, M = np.broadcast_arrays(x, m, n, M)
+
+        # check for empty arrays
+        if m.size != 0:
+            n, M = n[..., 0], M[..., 0]
+
+        xcond = (x < 0) | (x > m)
+        return (x, M, m, n, xcond,
+                np.any(xcond, axis=-1) | (x.sum(axis=-1) != n))
+
+    def _checkresult(self, result, cond, bad_value):
+        result = np.asarray(result)
+        if cond.ndim != 0:
+            result[cond] = bad_value
+        elif cond:
+            return bad_value
+        if result.ndim == 0:
+            return result[()]
+        return result
+
+    def _logpmf(self, x, M, m, n, mxcond, ncond):
+        # This equation of the pmf comes from the relation,
+        # n combine r = beta(n+1, 1) / beta(r+1, n-r+1)
+        num = np.zeros_like(m, dtype=np.float64)
+        den = np.zeros_like(n, dtype=np.float64)
+        m, x = m[~mxcond], x[~mxcond]
+        M, n = M[~ncond], n[~ncond]
+        num[~mxcond] = (betaln(m+1, 1) - betaln(x+1, m-x+1))
+        den[~ncond] = (betaln(M+1, 1) - betaln(n+1, M-n+1))
+        num[mxcond] = np.nan
+        den[ncond] = np.nan
+        num = num.sum(axis=-1)
+        return num - den
+
+    def logpmf(self, x, m, n):
+        """Log of the multivariate hypergeometric probability mass function.
+
+        Parameters
+        ----------
+        x : array_like
+            Quantiles, with the last axis of `x` denoting the components.
+        %(_doc_default_callparams)s
+
+        Returns
+        -------
+        logpmf : ndarray or scalar
+            Log of the probability mass function evaluated at `x`
+
+        Notes
+        -----
+        %(_doc_callparams_note)s
+        """
+        M, m, n, mcond, ncond, mncond = self._process_parameters(m, n)
+        (x, M, m, n, xcond,
+         xcond_reduced) = self._process_quantiles(x, M, m, n)
+        mxcond = mcond | xcond
+        ncond = ncond | np.zeros(n.shape, dtype=np.bool_)
+
+        result = self._logpmf(x, M, m, n, mxcond, ncond)
+
+        # replace values for which x was out of the domain; broadcast
+        # xcond to the right shape
+        xcond_ = xcond_reduced | np.zeros(mncond.shape, dtype=np.bool_)
+        result = self._checkresult(result, xcond_, -np.inf)
+
+        # replace values bad for n or m; broadcast
+        # mncond to the right shape
+        mncond_ = mncond | np.zeros(xcond_reduced.shape, dtype=np.bool_)
+        return self._checkresult(result, mncond_, np.nan)
+
+    def pmf(self, x, m, n):
+        """Multivariate hypergeometric probability mass function.
+
+        Parameters
+        ----------
+        x : array_like
+            Quantiles, with the last axis of `x` denoting the components.
+        %(_doc_default_callparams)s
+
+        Returns
+        -------
+        pmf : ndarray or scalar
+            Probability density function evaluated at `x`
+
+        Notes
+        -----
+        %(_doc_callparams_note)s
+        """
+        out = np.exp(self.logpmf(x, m, n))
+        return out
+
+    def mean(self, m, n):
+        """Mean of the multivariate hypergeometric distribution.
+
+        Parameters
+        ----------
+        %(_doc_default_callparams)s
+
+        Returns
+        -------
+        mean : array_like or scalar
+            The mean of the distribution
+        """
+        M, m, n, _, _, mncond = self._process_parameters(m, n)
+        # check for empty arrays
+        if m.size != 0:
+            M, n = M[..., np.newaxis], n[..., np.newaxis]
+        cond = (M == 0)
+        M = np.ma.masked_array(M, mask=cond)
+        mu = n*(m/M)
+        if m.size != 0:
+            mncond = (mncond[..., np.newaxis] |
+                      np.zeros(mu.shape, dtype=np.bool_))
+        return self._checkresult(mu, mncond, np.nan)
+
+    def var(self, m, n):
+        """Variance of the multivariate hypergeometric distribution.
+
+        Parameters
+        ----------
+        %(_doc_default_callparams)s
+
+        Returns
+        -------
+        array_like
+            The variances of the components of the distribution.  This is
+            the diagonal of the covariance matrix of the distribution
+        """
+        M, m, n, _, _, mncond = self._process_parameters(m, n)
+        # check for empty arrays
+        if m.size != 0:
+            M, n = M[..., np.newaxis], n[..., np.newaxis]
+        cond = (M == 0) & (M-1 == 0)
+        M = np.ma.masked_array(M, mask=cond)
+        output = n * m/M * (M-m)/M * (M-n)/(M-1)
+        if m.size != 0:
+            mncond = (mncond[..., np.newaxis] |
+                      np.zeros(output.shape, dtype=np.bool_))
+        return self._checkresult(output, mncond, np.nan)
+
+    def cov(self, m, n):
+        """Covariance matrix of the multivariate hypergeometric distribution.
+
+        Parameters
+        ----------
+        %(_doc_default_callparams)s
+
+        Returns
+        -------
+        cov : array_like
+            The covariance matrix of the distribution
+        """
+        # see [1]_ for the formula and [2]_ for implementation
+        # cov( x_i,x_j ) = -n * (M-n)/(M-1) * (K_i*K_j) / (M**2)
+        M, m, n, _, _, mncond = self._process_parameters(m, n)
+        # check for empty arrays
+        if m.size != 0:
+            M = M[..., np.newaxis, np.newaxis]
+            n = n[..., np.newaxis, np.newaxis]
+        cond = (M == 0) & (M-1 == 0)
+        M = np.ma.masked_array(M, mask=cond)
+        output = (-n * (M-n)/(M-1) *
+                  np.einsum("...i,...j->...ij", m, m) / (M**2))
+        # check for empty arrays
+        if m.size != 0:
+            M, n = M[..., 0, 0], n[..., 0, 0]
+            cond = cond[..., 0, 0]
+        dim = m.shape[-1]
+        # diagonal entries need to be computed differently
+        for i in range(dim):
+            output[..., i, i] = (n * (M-n) * m[..., i]*(M-m[..., i]))
+            output[..., i, i] = output[..., i, i] / (M-1)
+            output[..., i, i] = output[..., i, i] / (M**2)
+        if m.size != 0:
+            mncond = (mncond[..., np.newaxis, np.newaxis] |
+                      np.zeros(output.shape, dtype=np.bool_))
+        return self._checkresult(output, mncond, np.nan)
+
+    def rvs(self, m, n, size=None, random_state=None):
+        """Draw random samples from a multivariate hypergeometric distribution.
+
+        Parameters
+        ----------
+        %(_doc_default_callparams)s
+        size : integer or iterable of integers, optional
+            Number of samples to draw. Default is ``None``, in which case a
+            single variate is returned as an array with shape ``m.shape``.
+        %(_doc_random_state)s
+
+        Returns
+        -------
+        rvs : array_like
+            Random variates of shape ``size`` or ``m.shape``
+            (if ``size=None``).
+
+        Notes
+        -----
+        %(_doc_callparams_note)s
+
+        Also note that NumPy's `multivariate_hypergeometric` sampler is not
+        used as it doesn't support broadcasting.
+        """
+        M, m, n, _, _, _ = self._process_parameters(m, n)
+
+        random_state = self._get_random_state(random_state)
+
+        if size is not None and isinstance(size, int):
+            size = (size, )
+
+        if size is None:
+            rvs = np.empty(m.shape, dtype=m.dtype)
+        else:
+            rvs = np.empty(size + (m.shape[-1], ), dtype=m.dtype)
+        rem = M
+
+        # This sampler has been taken from numpy gh-13794
+        # https://github.com/numpy/numpy/pull/13794
+        for c in range(m.shape[-1] - 1):
+            rem = rem - m[..., c]
+            n0mask = n == 0
+            rvs[..., c] = (~n0mask *
+                           random_state.hypergeometric(m[..., c],
+                                                       rem + n0mask,
+                                                       n + n0mask,
+                                                       size=size))
+            n = n - rvs[..., c]
+        rvs[..., m.shape[-1] - 1] = n
+
+        return rvs
+
+
+# Module-level generator instance.
+multivariate_hypergeom = multivariate_hypergeom_gen()
+
+
+class multivariate_hypergeom_frozen(multi_rv_frozen):
+    def __init__(self, m, n, seed=None):
+        self._dist = multivariate_hypergeom_gen(seed)
+        (self.M, self.m, self.n,
+         self.mcond, self.ncond,
+         self.mncond) = self._dist._process_parameters(m, n)
+
+        # monkey patch self._dist
+        def _process_parameters(m, n):
+            return (self.M, self.m, self.n,
+                    self.mcond, self.ncond,
+                    self.mncond)
+        self._dist._process_parameters = _process_parameters
+
+    def logpmf(self, x):
+        return self._dist.logpmf(x, self.m, self.n)
+
+    def pmf(self, x):
+        return self._dist.pmf(x, self.m, self.n)
+
+    def mean(self):
+        return self._dist.mean(self.m, self.n)
+
+    def var(self):
+        return self._dist.var(self.m, self.n)
+
+    def cov(self):
+        return self._dist.cov(self.m, self.n)
+
+    def rvs(self, size=1, random_state=None):
+        return self._dist.rvs(self.m, self.n,
+                              size=size,
+                              random_state=random_state)
+
+
+# Set frozen generator docstrings from corresponding docstrings in
+# multivariate_hypergeom and fill in default strings in class docstrings
+for name in ['logpmf', 'pmf', 'mean', 'var', 'cov', 'rvs']:
+    method = multivariate_hypergeom_gen.__dict__[name]
+    method_frozen = multivariate_hypergeom_frozen.__dict__[name]
+    # Render the frozen docstring first: the statement after it replaces
+    # the unformatted template stored on the generator method.
+    method_frozen.__doc__ = doccer.docformat(
+        method.__doc__, mhg_docdict_noparams)
+    method.__doc__ = doccer.docformat(method.__doc__,
+                                      mhg_docdict_params)
+
+
+class random_table_gen(multi_rv_generic):
+    r"""Contingency tables from independent samples with fixed marginal sums.
+
+    This is the distribution of random tables with given row and column vector
+    sums. This distribution represents the set of random tables under the null
+    hypothesis that rows and columns are independent. It is used in hypothesis
+    tests of independence.
+
+    Because of assumed independence, the expected frequency of each table
+    element can be computed from the row and column sums, so that the
+    distribution is completely determined by these two vectors.
+
+    Methods
+    -------
+    logpmf(x, row, col)
+        Log-probability of table `x` to occur in the distribution.
+    pmf(x, row, col)
+        Probability of table `x` to occur in the distribution.
+    mean(row, col)
+        Mean table.
+    rvs(row, col, size=None, method=None, random_state=None)
+        Draw random tables with given row and column vector sums.
+
+    Parameters
+    ----------
+    %(_doc_row_col)s
+    %(_doc_random_state)s
+
+    Notes
+    -----
+    %(_doc_row_col_note)s
+
+    Random elements from the distribution are generated either with Boyett's
+    [1]_ or Patefield's algorithm [2]_. Boyett's algorithm has
+    O(N) time and space complexity, where N is the total sum of entries in the
+    table. Patefield's algorithm has O(K x log(N)) time complexity, where K is
+    the number of cells in the table and requires only a small constant work
+    space. By default, the `rvs` method selects the fastest algorithm based on
+    the input, but you can specify the algorithm with the keyword `method`.
+    Allowed values are "boyett" and "patefield".
+
+    .. versionadded:: 1.10.0
+
+    Examples
+    --------
+    >>> from scipy.stats import random_table
+
+    >>> row = [1, 5]
+    >>> col = [2, 3, 1]
+    >>> random_table.mean(row, col)
+    array([[0.33333333, 0.5       , 0.16666667],
+           [1.66666667, 2.5       , 0.83333333]])
+
+    Alternatively, the object may be called (as a function) to fix the row
+    and column vector sums, returning a "frozen" distribution.
+
+    >>> dist = random_table(row, col)
+    >>> dist.rvs(random_state=123)
+    array([[1, 0, 0],
+           [1, 3, 1]])
+
+    References
+    ----------
+    .. [1] J. Boyett, AS 144 Appl. Statist. 28 (1979) 329-332
+    .. [2] W.M. Patefield, AS 159 Appl. Statist. 30 (1981) 91-97
+    """
+
+    def __init__(self, seed=None):
+        super().__init__(seed)
+
+    def __call__(self, row, col, *, seed=None):
+        """Create a frozen distribution of tables with given marginals.
+
+        See `random_table_frozen` for more information.
+        """
+        return random_table_frozen(row, col, seed=seed)
+
+    def logpmf(self, x, row, col):
+        """Log-probability of table to occur in the distribution.
+
+        Parameters
+        ----------
+        %(_doc_x)s
+        %(_doc_row_col)s
+
+        Returns
+        -------
+        logpmf : ndarray or scalar
+            Log of the probability mass function evaluated at `x`.
+
+        Raises
+        ------
+        ValueError
+            If `x` has fewer than two dimensions, contains non-integral or
+            negative values, or its last two dimensions do not agree with
+            the lengths of `row` and `col`.
+
+        Notes
+        -----
+        %(_doc_row_col_note)s
+
+        If row and column marginals of `x` do not match `row` and `col`,
+        negative infinity is returned.
+
+        Examples
+        --------
+        >>> from scipy.stats import random_table
+        >>> import numpy as np
+
+        >>> x = [[1, 5, 1], [2, 3, 1]]
+        >>> row = np.sum(x, axis=1)
+        >>> col = np.sum(x, axis=0)
+        >>> random_table.logpmf(x, row, col)
+        -1.6306401200847027
+
+        Alternatively, the object may be called (as a function) to fix the row
+        and column vector sums, returning a "frozen" distribution.
+
+        >>> d = random_table(row, col)
+        >>> d.logpmf(x)
+        -1.6306401200847027
+        """
+        r, c, n = self._process_parameters(row, col)
+        x = np.asarray(x)
+
+        if x.ndim < 2:
+            raise ValueError("`x` must be at least two-dimensional")
+
+        # Non-integer dtypes are accepted as long as every value is integral.
+        dtype_is_int = np.issubdtype(x.dtype, np.integer)
+        # presumably the errstate guard silences warnings raised when
+        # non-finite values are cast to int -- TODO confirm
+        with np.errstate(invalid='ignore'):
+            if not dtype_is_int and not np.all(x.astype(int) == x):
+                raise ValueError("`x` must contain only integral values")
+
+        # x does not contain NaN if we arrive here
+        if np.any(x < 0):
+            raise ValueError("`x` must contain only non-negative values")
+
+        # Row and column sums of every table held in the last two axes.
+        r2 = np.sum(x, axis=-1)
+        c2 = np.sum(x, axis=-2)
+
+        if r2.shape[-1] != len(r):
+            raise ValueError("shape of `x` must agree with `row`")
+
+        if c2.shape[-1] != len(c):
+            raise ValueError("shape of `x` must agree with `col`")
+
+        # One result per table; zero-dimensional when `x` is a single table.
+        res = np.empty(x.shape[:-2])
+
+        # Tables whose marginals match `row` and `col`.
+        mask = np.all(r2 == r, axis=-1) & np.all(c2 == c, axis=-1)
+
+        def lnfac(x):
+            # log(x!) expressed through the log-gamma function
+            return gammaln(x + 1)
+
+        res[mask] = (np.sum(lnfac(r), axis=-1) + np.sum(lnfac(c), axis=-1)
+                     - lnfac(n) - np.sum(lnfac(x[mask]), axis=(-1, -2)))
+        res[~mask] = -np.inf
+
+        # Indexing with an empty tuple turns a 0-d array into a scalar.
+        return res[()]
+
+    def pmf(self, x, row, col):
+        """Probability of table to occur in the distribution.
+
+        Parameters
+        ----------
+        %(_doc_x)s
+        %(_doc_row_col)s
+
+        Returns
+        -------
+        pmf : ndarray or scalar
+            Probability mass function evaluated at `x`.
+
+        Notes
+        -----
+        %(_doc_row_col_note)s
+
+        If row and column marginals of `x` do not match `row` and `col`,
+        zero is returned.
+
+        Examples
+        --------
+        >>> from scipy.stats import random_table
+        >>> import numpy as np
+
+        >>> x = [[1, 5, 1], [2, 3, 1]]
+        >>> row = np.sum(x, axis=1)
+        >>> col = np.sum(x, axis=0)
+        >>> random_table.pmf(x, row, col)
+        0.19580419580419592
+
+        Alternatively, the object may be called (as a function) to fix the row
+        and column vector sums, returning a "frozen" distribution.
+
+        >>> d = random_table(row, col)
+        >>> d.pmf(x)
+        0.19580419580419592
+        """
+        return np.exp(self.logpmf(x, row, col))
+
+    def mean(self, row, col):
+        """Mean of distribution of conditional tables.
+        %(_doc_mean_params)s
+
+        Returns
+        -------
+        mean: ndarray
+            Mean of the distribution.
+
+        Notes
+        -----
+        %(_doc_row_col_note)s
+
+        Examples
+        --------
+        >>> from scipy.stats import random_table
+
+        >>> row = [1, 5]
+        >>> col = [2, 3, 1]
+        >>> random_table.mean(row, col)
+        array([[0.33333333, 0.5       , 0.16666667],
+               [1.66666667, 2.5       , 0.83333333]])
+
+        Alternatively, the object may be called (as a function) to fix the row
+        and column vector sums, returning a "frozen" distribution.
+
+        >>> d = random_table(row, col)
+        >>> d.mean()
+        array([[0.33333333, 0.5       , 0.16666667],
+               [1.66666667, 2.5       , 0.83333333]])
+        """
+        r, c, n = self._process_parameters(row, col)
+        # Entry (i, j) of the mean table is r[i] * c[j] / n.
+        return np.outer(r, c) / n
+
+    def rvs(self, row, col, *, size=None, method=None, random_state=None):
+        """Draw random tables with fixed column and row marginals.
+
+        Parameters
+        ----------
+        %(_doc_row_col)s
+        size : integer or sequence of integers, optional
+            Number of samples to draw. If None (default), a single table
+            is drawn.
+        method : str, optional
+            Which method to use, "boyett" or "patefield". If None (default),
+            selects the fastest method for this input.
+        %(_doc_random_state)s
+
+        Returns
+        -------
+        rvs : ndarray
+            Random 2D tables. The shape is (`len(row)`, `len(col)`) if
+            `size` is None, otherwise the dimensions given by `size`
+            followed by (`len(row)`, `len(col)`).
+
+        Notes
+        -----
+        %(_doc_row_col_note)s
+
+        Examples
+        --------
+        >>> from scipy.stats import random_table
+
+        >>> row = [1, 5]
+        >>> col = [2, 3, 1]
+        >>> random_table.rvs(row, col, random_state=123)
+        array([[1., 0., 0.],
+               [1., 3., 1.]])
+
+        Alternatively, the object may be called (as a function) to fix the row
+        and column vector sums, returning a "frozen" distribution.
+
+        >>> d = random_table(row, col)
+        >>> d.rvs(random_state=123)
+        array([[1., 0., 0.],
+               [1., 3., 1.]])
+        """
+        r, c, n = self._process_parameters(row, col)
+        # `size` is the flat number of tables to draw, `shape` the shape of
+        # the returned array.
+        size, shape = self._process_size_shape(size, r, c)
+
+        random_state = self._get_random_state(random_state)
+        meth = self._process_rvs_method(method, r, c, n)
+
+        return meth(r, c, n, size, random_state).reshape(shape)
+
+    @staticmethod
+    def _process_parameters(row, col):
+        """
+        Check that row and column vectors are one-dimensional, that they do
+        not contain negative or non-integer entries, and that the sums over
+        both vectors are equal.
+
+        Returns the int64 copies of `row` and `col` and their common sum.
+        Raises ValueError when one of the checks fails.
+        """
+        r = np.array(row, dtype=np.int64, copy=True)
+        c = np.array(col, dtype=np.int64, copy=True)
+
+        if np.ndim(r) != 1:
+            raise ValueError("`row` must be one-dimensional")
+        if np.ndim(c) != 1:
+            raise ValueError("`col` must be one-dimensional")
+
+        if np.any(r < 0):
+            raise ValueError("each element of `row` must be non-negative")
+        if np.any(c < 0):
+            raise ValueError("each element of `col` must be non-negative")
+
+        n = np.sum(r)
+        if n != np.sum(c):
+            raise ValueError("sums over `row` and `col` must be equal")
+
+        # The int64 copies equal the inputs only if nothing was changed by
+        # the integer conversion above.
+        if not np.all(r == np.asarray(row)):
+            raise ValueError("each element of `row` must be an integer")
+        if not np.all(c == np.asarray(col)):
+            raise ValueError("each element of `col` must be an integer")
+
+        return r, c, n
+
+    @staticmethod
+    def _process_size_shape(size, r, c):
+        """
+        Compute the number of samples to be drawn and the shape of the output
+
+        With `size` None a single table of shape ``(len(r), len(c))`` is
+        requested. Otherwise the number of samples is the product of the
+        entries of `size` and the output shape is `size` followed by the
+        table shape. Raises ValueError for non-integer or negative `size`.
+        """
+        shape = (len(r), len(c))
+
+        if size is None:
+            return 1, shape
+
+        size = np.atleast_1d(size)
+        if not np.issubdtype(size.dtype, np.integer) or np.any(size < 0):
+            raise ValueError("`size` must be a non-negative integer or `None`")
+
+        return np.prod(size), tuple(size) + shape
+
+    @classmethod
+    def _process_rvs_method(cls, method, r, c, n):
+        # Map the `method` keyword to the sampling function.
+        # NOTE: `_rvs_select` is evaluated while the dict is built, i.e. also
+        # when `method` is given explicitly.
+        known_methods = {
+            None: cls._rvs_select(r, c, n),
+            "boyett": cls._rvs_boyett,
+            "patefield": cls._rvs_patefield,
+        }
+        try:
+            return known_methods[method]
+        except KeyError:
+            raise ValueError(f"'{method}' not recognized, "
+                             f"must be one of {set(known_methods)}")
+
+    @classmethod
+    def _rvs_select(cls, r, c, n):
+        # Choose the sampler from the table total `n` and the cell count.
+        fac = 1.0  # benchmarks show that this value is about 1
+        k = len(r) * len(c)  # number of cells
+        # n + 1 guards against failure if n == 0
+        if n > fac * np.log(n + 1) * k:
+            return cls._rvs_patefield
+        return cls._rvs_boyett
+
+    @staticmethod
+    def _rvs_boyett(row, col, ntot, size, random_state):
+        # Thin wrapper around `_rcont.rvs_rcont1`.
+        return _rcont.rvs_rcont1(row, col, ntot, size, random_state)
+
+    @staticmethod
+    def _rvs_patefield(row, col, ntot, size, random_state):
+        # Thin wrapper around `_rcont.rvs_rcont2`.
+        return _rcont.rvs_rcont2(row, col, ntot, size, random_state)
+
+
+# Module-level generator instance.
+random_table = random_table_gen()
+
+
+class random_table_frozen(multi_rv_frozen):
+    __class_getitem__ = None
+
+    def __init__(self, row, col, *, seed=None):
+        self._dist = random_table_gen(seed)
+        self._params = self._dist._process_parameters(row, col)
+
+        # monkey patch self._dist
+        def _process_parameters(r, c):
+            return self._params
+        self._dist._process_parameters = _process_parameters
+
+    def logpmf(self, x):
+        return self._dist.logpmf(x, None, None)
+
+    def pmf(self, x):
+        return self._dist.pmf(x, None, None)
+
+    def mean(self):
+        return self._dist.mean(None, None)
+
+    def rvs(self, size=None, method=None, random_state=None):
+        # optimisations are possible here
+        return self._dist.rvs(None, None, size=size, method=method,
+                              random_state=random_state)
+
+
+# Docstring fragments for the random table distribution. They are
+# substituted into the ``%(...)s`` placeholders of the class and method
+# docstrings through the dictionaries defined below.
+
+# Descriptions of the `row` and `col` parameters.
+_ctab_doc_row_col = """\
+row : array_like
+    Sum of table entries in each row.
+col : array_like
+    Sum of table entries in each column."""
+
+# Description of the `x` parameter.
+_ctab_doc_x = """\
+x : array-like
+   Two-dimensional table of non-negative integers, or a
+   multi-dimensional array with the last two dimensions
+   corresponding with the tables."""
+
+# "Notes" text about the requirements on `row` and `col`.
+_ctab_doc_row_col_note = """\
+The row and column vectors must be one-dimensional, not empty,
+and each sum up to the same value. They cannot contain negative
+or noninteger entries."""
+
+# Complete "Parameters" section, built from the `row`/`col` descriptions.
+_ctab_doc_mean_params = f"""
+Parameters
+----------
+{_ctab_doc_row_col}"""
+
+# Frozen variant of the "Notes" text.
+_ctab_doc_row_col_note_frozen = """\
+See class definition for a detailed description of parameters."""
+
+# Substitutions for the generator docstrings.
+_ctab_docdict = {
+    "_doc_random_state": _doc_random_state,
+    "_doc_row_col": _ctab_doc_row_col,
+    "_doc_x": _ctab_doc_x,
+    "_doc_mean_params": _ctab_doc_mean_params,
+    "_doc_row_col_note": _ctab_doc_row_col_note,
+}
+
+# Substitutions for the frozen docstrings: the `row`/`col` descriptions are
+# blanked out and the note is replaced by the frozen variant.
+_ctab_docdict_frozen = _ctab_docdict.copy()
+_ctab_docdict_frozen.update({
+    "_doc_row_col": "",
+    "_doc_mean_params": "",
+    "_doc_row_col_note": _ctab_doc_row_col_note_frozen,
+})
+
+
+def _docfill(obj, docdict, template=None):
+    """Set ``obj.__doc__`` to a docstring formatted with `docdict`.
+
+    The text handed to `doccer.docformat` is `template` when it is truthy,
+    otherwise the current ``obj.__doc__``.
+    """
+    source = template if template else obj.__doc__
+    obj.__doc__ = doccer.docformat(source, docdict)
+
+
+# Set frozen generator docstrings from corresponding docstrings in
+# random_table and fill in default strings in class docstrings
+_docfill(random_table_gen, _ctab_docdict)
+for name in ['logpmf', 'pmf', 'mean', 'rvs']:
+    # Look the functions up in the class `__dict__` so that `__doc__` is
+    # assigned on the plain function objects defined in each class body.
+    method = random_table_gen.__dict__[name]
+    method_frozen = random_table_frozen.__dict__[name]
+    # Order matters: the frozen method is filled from the still-unformatted
+    # generic docstring before that docstring is itself overwritten.
+    _docfill(method_frozen, _ctab_docdict_frozen, method.__doc__)
+    _docfill(method, _ctab_docdict)
+
+
+class uniform_direction_gen(multi_rv_generic):
+    r"""A vector-valued uniform direction.
+
+    Return a random direction (unit vector). The `dim` keyword specifies
+    the dimensionality of the space.
+
+    Methods
+    -------
+    rvs(dim, size=None, random_state=None)
+        Draw random directions.
+
+    Parameters
+    ----------
+    dim : scalar
+        Dimension of directions.
+    seed : {None, int, `numpy.random.Generator`,
+            `numpy.random.RandomState`}, optional
+
+        Used for drawing random variates.
+        If `seed` is `None`, the `~np.random.RandomState` singleton is used.
+        If `seed` is an int, a new ``RandomState`` instance is used, seeded
+        with seed.
+        If `seed` is already a ``RandomState`` or ``Generator`` instance,
+        then that object is used.
+        Default is `None`.
+
+    Notes
+    -----
+    This distribution generates unit vectors uniformly distributed on
+    the surface of a hypersphere. These can be interpreted as random
+    directions.
+    For example, if `dim` is 3, 3D vectors from the surface of :math:`S^2`
+    will be sampled.
+
+    References
+    ----------
+    .. [1] Marsaglia, G. (1972). "Choosing a Point from the Surface of a
+           Sphere". Annals of Mathematical Statistics. 43 (2): 645-646.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import uniform_direction
+    >>> x = uniform_direction.rvs(3)
+    >>> np.linalg.norm(x)
+    1.0
+
+    This generates one random direction, a vector on the surface of
+    :math:`S^2`.
+
+    Alternatively, the object may be called (as a function) to return a frozen
+    distribution with fixed `dim` parameter. Here,
+    we create a `uniform_direction` with ``dim=3`` and draw 5 observations.
+    The samples are then arranged in an array of shape 5x3.
+
+    >>> rng = np.random.default_rng()
+    >>> uniform_sphere_dist = uniform_direction(3)
+    >>> unit_vectors = uniform_sphere_dist.rvs(5, random_state=rng)
+    >>> unit_vectors
+    array([[ 0.56688642, -0.1332634 , -0.81294566],
+           [-0.427126  , -0.74779278,  0.50830044],
+           [ 0.3793989 ,  0.92346629,  0.05715323],
+           [ 0.36428383, -0.92449076, -0.11231259],
+           [-0.27733285,  0.94410968, -0.17816678]])
+    """
+
+    def __init__(self, seed=None):
+        super().__init__(seed)
+        self.__doc__ = doccer.docformat(self.__doc__)
+
+    def __call__(self, dim=None, seed=None):
+        """Create a frozen n-dimensional uniform direction distribution.
+
+        See `uniform_direction` for more information.
+        """
+        return uniform_direction_frozen(dim, seed=seed)
+
+    def _process_parameters(self, dim):
+        """Validate `dim` and return it as a built-in ``int``.
+
+        Dimension N must be specified; it cannot be inferred.
+
+        Raises
+        ------
+        ValueError
+            If `dim` is ``None``, is not a scalar, is smaller than 1, is not
+            integer-valued, or cannot be compared with or converted to an
+            integer at all (for example NaN, infinity or a string).
+        """
+        message = ("Dimension of vector must be specified, "
+                   "and must be an integer greater than 0.")
+        if dim is None or not np.isscalar(dim):
+            raise ValueError(message)
+
+        try:
+            is_valid = bool(dim >= 1 and dim == int(dim))
+        except (TypeError, ValueError, OverflowError):
+            # Strings and complex numbers do not support the comparison and
+            # ``int()`` rejects infinities; report all of them uniformly
+            # instead of leaking the low-level exception.
+            is_valid = False
+        if not is_valid:
+            raise ValueError(message)
+
+        return int(dim)
+
+    def rvs(self, dim, size=None, random_state=None):
+        """Draw random samples from S(N-1).
+
+        Parameters
+        ----------
+        dim : integer
+            Dimension of space (N).
+        size : int or tuple of ints, optional
+            Given a shape of, for example, (m,n,k), m*n*k samples are
+            generated, and packed in an m-by-n-by-k arrangement.
+            Because each sample is N-dimensional, the output shape
+            is (m,n,k,N). If no shape is specified, a single (N-D)
+            sample is returned.
+        random_state : {None, int, `numpy.random.Generator`,
+                        `numpy.random.RandomState`}, optional
+
+            Pseudorandom number generator state used to draw the samples.
+
+            If `random_state` is ``None`` (or `np.random`), the
+            `numpy.random.RandomState` singleton is used.
+            If `random_state` is an int, a new ``RandomState`` instance is
+            used, seeded with `random_state`.
+            If `random_state` is already a ``Generator`` or ``RandomState``
+            instance then that instance is used.
+
+        Returns
+        -------
+        rvs : ndarray
+            Random direction vectors
+
+        Raises
+        ------
+        ValueError
+            If `dim` is not a positive integer.
+
+        """
+        random_state = self._get_random_state(random_state)
+        if size is None:
+            # An empty integer shape yields a single sample of shape (N,).
+            size = np.array([], dtype=int)
+        size = np.atleast_1d(size)
+
+        dim = self._process_parameters(dim)
+
+        samples = _sample_uniform_direction(dim, size, random_state)
+        return samples
+
+
+# Module-level `uniform_direction_gen` instance built with default arguments.
+uniform_direction = uniform_direction_gen()
+
+
+class uniform_direction_frozen(multi_rv_frozen):
+    def __init__(self, dim=None, seed=None):
+        """Create a frozen n-dimensional uniform direction distribution.
+
+        Parameters
+        ----------
+        dim : int
+            Dimension of the direction vectors. It is validated by
+            ``uniform_direction_gen._process_parameters`` and stored as
+            ``self.dim``.
+        seed : {None, int, `numpy.random.Generator`,
+                `numpy.random.RandomState`}, optional
+
+            If `seed` is None (or `np.random`), the `numpy.random.RandomState`
+            singleton is used.
+            If `seed` is an int, a new ``RandomState`` instance is used,
+            seeded with `seed`.
+            If `seed` is already a ``Generator`` or ``RandomState`` instance
+            then that instance is used.
+
+        Examples
+        --------
+        >>> from scipy.stats import uniform_direction
+        >>> x = uniform_direction(3)
+        >>> x.rvs()
+
+        """
+        generator = uniform_direction_gen(seed)
+        self._dist = generator
+        self.dim = generator._process_parameters(dim)
+
+    def rvs(self, size=None, random_state=None):
+        # Forward to the wrapped generator with the stored dimension.
+        dist, dim = self._dist, self.dim
+        return dist.rvs(dim, size, random_state)
+
+
+def _sample_uniform_direction(dim, size, random_state):
+    """
+    Private method to generate uniform directions.
+
+    Standard normal variates of shape ``(*size, dim)`` are drawn from
+    `random_state` and each vector along the last axis is divided by its
+    Euclidean norm.
+    Reference: Marsaglia, G. (1972). "Choosing a Point from the Surface of a
+               Sphere". Annals of Mathematical Statistics. 43 (2): 645-646.
+    """
+    shape = np.append(size, dim)
+    gaussian = random_state.standard_normal(shape)
+    lengths = np.linalg.norm(gaussian, axis=-1, keepdims=True)
+    return gaussian / lengths
+
+
+# Description of the `alpha` and `n` parameters; stored in
+# `dirichlet_mn_docdict_params` below.
+_dirichlet_mn_doc_default_callparams = """\
+alpha : array_like
+    The concentration parameters. The number of entries along the last axis
+    determines the dimensionality of the distribution. Each entry must be
+    strictly positive.
+n : int or array_like
+    The number of trials. Each element must be a non-negative integer.
+"""
+
+# Empty replacement for the parameter description; stored in
+# `dirichlet_mn_docdict_noparams` below.
+_dirichlet_mn_doc_frozen_callparams = ""
+
+# NOTE(review): not referenced by any code visible in this part of the file.
+_dirichlet_mn_doc_frozen_callparams_note = """\
+See class definition for a detailed description of parameters."""
+
+# Substitutions that include the `alpha`/`n` parameter description.
+dirichlet_mn_docdict_params = dict([
+    ('_dirichlet_mn_doc_default_callparams',
+     _dirichlet_mn_doc_default_callparams),
+    ('_doc_random_state', _doc_random_state),
+])
+
+# Same keys, with the parameter description replaced by the frozen variant.
+dirichlet_mn_docdict_noparams = dict([
+    ('_dirichlet_mn_doc_default_callparams',
+     _dirichlet_mn_doc_frozen_callparams),
+    ('_doc_random_state', _doc_random_state),
+])
+
+
+def _dirichlet_multinomial_check_parameters(alpha, n, x=None):
+    """Validate and broadcast Dirichlet multinomial arguments.
+
+    Returns ``(alpha, sum_alpha, n)``, with `x` appended as a fourth element
+    when it is given. ``sum_alpha`` is the sum of `alpha` over its last axis,
+    broadcast against `n`; `n` and `x` are returned as their floored values.
+
+    Raises
+    ------
+    ValueError
+        If `x` and `alpha` cannot be broadcast together, if `x` or `n`
+        contain negative or non-integer values, or if `alpha` contains
+        non-positive values.
+    """
+    alpha = np.asarray(alpha)
+    n = np.asarray(n)
+    has_x = x is not None
+
+    if has_x:
+        # Broadcasting also converts `x` and `alpha` to arrays. NumPy's own
+        # error for incompatible shapes is chained to a clearer message.
+        try:
+            x, alpha = np.broadcast_arrays(x, alpha)
+        except ValueError as e:
+            raise ValueError("`x` and `alpha` must be broadcastable.") from e
+
+        x_floor = np.floor(x)
+        if (x < 0).any() or (x != x_floor).any():
+            raise ValueError("`x` must contain only non-negative integers.")
+        x = x_floor
+
+    if (alpha <= 0).any():
+        raise ValueError("`alpha` must contain only positive values.")
+
+    n_floor = np.floor(n)
+    if (n < 0).any() or (n != n_floor).any():
+        raise ValueError("`n` must be a non-negative integer.")
+
+    sum_alpha, n = np.broadcast_arrays(alpha.sum(axis=-1), n_floor)
+
+    if has_x:
+        return alpha, sum_alpha, n, x
+    return alpha, sum_alpha, n
+
+
+class dirichlet_multinomial_gen(multi_rv_generic):
+    r"""A Dirichlet multinomial random variable.
+
+    The Dirichlet multinomial distribution is a compound probability
+    distribution: it is the multinomial distribution with number of trials
+    `n` and class probabilities ``p`` randomly sampled from a Dirichlet
+    distribution with concentration parameters ``alpha``.
+
+    Methods
+    -------
+    logpmf(x, alpha, n)
+        Log of the probability mass function.
+    pmf(x, alpha, n)
+        Probability mass function.
+    mean(alpha, n)
+        Mean of the Dirichlet multinomial distribution.
+    var(alpha, n)
+        Variance of the Dirichlet multinomial distribution.
+    cov(alpha, n)
+        The covariance of the Dirichlet multinomial distribution.
+
+    Parameters
+    ----------
+    %(_dirichlet_mn_doc_default_callparams)s
+    %(_doc_random_state)s
+
+    See Also
+    --------
+    scipy.stats.dirichlet : The dirichlet distribution.
+    scipy.stats.multinomial : The multinomial distribution.
+
+    References
+    ----------
+    .. [1] Dirichlet-multinomial distribution, Wikipedia,
+           https://www.wikipedia.org/wiki/Dirichlet-multinomial_distribution
+
+    Examples
+    --------
+    >>> from scipy.stats import dirichlet_multinomial
+
+    Get the PMF
+
+    >>> n = 6  # number of trials
+    >>> alpha = [3, 4, 5]  # concentration parameters
+    >>> x = [1, 2, 3]  # counts
+    >>> dirichlet_multinomial.pmf(x, alpha, n)
+    0.08484162895927604
+
+    If the sum of category counts does not equal the number of trials,
+    the probability mass is zero.
+
+    >>> dirichlet_multinomial.pmf(x, alpha, n=7)
+    0.0
+
+    Get the log of the PMF
+
+    >>> dirichlet_multinomial.logpmf(x, alpha, n)
+    -2.4669689491013327
+
+    Get the mean
+
+    >>> dirichlet_multinomial.mean(alpha, n)
+    array([1.5, 2. , 2.5])
+
+    Get the variance
+
+    >>> dirichlet_multinomial.var(alpha, n)
+    array([1.55769231, 1.84615385, 2.01923077])
+
+    Get the covariance
+
+    >>> dirichlet_multinomial.cov(alpha, n)
+    array([[ 1.55769231, -0.69230769, -0.86538462],
+           [-0.69230769,  1.84615385, -1.15384615],
+           [-0.86538462, -1.15384615,  2.01923077]])
+
+    Alternatively, the object may be called (as a function) to fix the
+    `alpha` and `n` parameters, returning a "frozen" Dirichlet multinomial
+    random variable.
+
+    >>> dm = dirichlet_multinomial(alpha, n)
+    >>> dm.pmf(x)
+    0.08484162895927579
+
+    All methods are fully vectorized. Each element of `x` and `alpha` is
+    a vector (along the last axis), each element of `n` is an
+    integer (scalar), and the result is computed element-wise.
+
+    >>> x = [[1, 2, 3], [4, 5, 6]]
+    >>> alpha = [[1, 2, 3], [4, 5, 6]]
+    >>> n = [6, 15]
+    >>> dirichlet_multinomial.pmf(x, alpha, n)
+    array([0.06493506, 0.02626937])
+
+    >>> dirichlet_multinomial.cov(alpha, n).shape  # both covariance matrices
+    (2, 3, 3)
+
+    Broadcasting according to standard NumPy conventions is supported. Here,
+    we have four sets of concentration parameters (each a two element vector)
+    for each of three numbers of trials (each a scalar).
+
+    >>> alpha = [[3, 4], [4, 5], [5, 6], [6, 7]]
+    >>> n = [[6], [7], [8]]
+    >>> dirichlet_multinomial.mean(alpha, n).shape
+    (3, 4, 2)
+
+    """
+    def __init__(self, seed=None):
+        super().__init__(seed)
+        # Fill the placeholders of the class docstring for this instance.
+        self.__doc__ = doccer.docformat(self.__doc__,
+                                        dirichlet_mn_docdict_params)
+
+    def __call__(self, alpha, n, seed=None):
+        """Create a frozen Dirichlet multinomial distribution.
+
+        The arguments are passed on to `dirichlet_multinomial_frozen`.
+        """
+        return dirichlet_multinomial_frozen(alpha, n, seed=seed)
+
+    def logpmf(self, x, alpha, n):
+        """The log of the probability mass function.
+
+        Parameters
+        ----------
+        x : ndarray
+            Category counts (non-negative integers). Must be broadcastable
+            with shape parameter ``alpha``. If multidimensional, the last axis
+            must correspond with the categories.
+        %(_dirichlet_mn_doc_default_callparams)s
+
+        Returns
+        -------
+        out : ndarray or scalar
+            Log of the probability mass function.
+
+        """
+
+        a, Sa, n, x = _dirichlet_multinomial_check_parameters(alpha, n, x)
+
+        # Terms that depend only on `n` and the sum of `alpha` ...
+        out = np.asarray(loggamma(Sa) + loggamma(n + 1) - loggamma(n + Sa))
+        # ... plus the per-category terms, summed over the last axis.
+        out += (loggamma(x + a) - (loggamma(a) + loggamma(x + 1))).sum(axis=-1)
+        # Counts that do not add up to the number of trials have zero mass.
+        np.place(out, n != x.sum(axis=-1), -np.inf)
+        # Indexing with an empty tuple turns a 0-d array into a scalar.
+        return out[()]
+
+    def pmf(self, x, alpha, n):
+        """Probability mass function for a Dirichlet multinomial distribution.
+
+        Parameters
+        ----------
+        x : ndarray
+            Category counts (non-negative integers). Must be broadcastable
+            with shape parameter ``alpha``. If multidimensional, the last axis
+            must correspond with the categories.
+        %(_dirichlet_mn_doc_default_callparams)s
+
+        Returns
+        -------
+        out : ndarray or scalar
+            Probability mass function.
+
+        """
+        return np.exp(self.logpmf(x, alpha, n))
+
+    def mean(self, alpha, n):
+        """Mean of a Dirichlet multinomial distribution.
+
+        Parameters
+        ----------
+        %(_dirichlet_mn_doc_default_callparams)s
+
+        Returns
+        -------
+        out : ndarray
+            Mean of a Dirichlet multinomial distribution.
+
+        """
+        a, Sa, n = _dirichlet_multinomial_check_parameters(alpha, n)
+        # Append a trailing axis so `n` and `Sa` broadcast against `a`.
+        n, Sa = n[..., np.newaxis], Sa[..., np.newaxis]
+        return n * a / Sa
+
+    def var(self, alpha, n):
+        """The variance of the Dirichlet multinomial distribution.
+
+        Parameters
+        ----------
+        %(_dirichlet_mn_doc_default_callparams)s
+
+        Returns
+        -------
+        out : array_like
+            The variances of the components of the distribution. This is
+            the diagonal of the covariance matrix of the distribution.
+
+        """
+        a, Sa, n = _dirichlet_multinomial_check_parameters(alpha, n)
+        # Append a trailing axis so `n` and `Sa` broadcast against `a`.
+        n, Sa = n[..., np.newaxis], Sa[..., np.newaxis]
+        return n * a / Sa * (1 - a/Sa) * (n + Sa) / (1 + Sa)
+
+    def cov(self, alpha, n):
+        """Covariance matrix of a Dirichlet multinomial distribution.
+
+        Parameters
+        ----------
+        %(_dirichlet_mn_doc_default_callparams)s
+
+        Returns
+        -------
+        out : array_like
+            The covariance matrix of the distribution.
+
+        """
+        a, Sa, n = _dirichlet_multinomial_check_parameters(alpha, n)
+        # Use this instance rather than the module-level singleton so that
+        # the method does not depend on a name defined after the class and
+        # honours `var` overrides in subclasses.
+        var = self.var(a, n)
+
+        # Two trailing axes so `n` and `Sa` broadcast against the matrices.
+        n, Sa = n[..., np.newaxis, np.newaxis], Sa[..., np.newaxis, np.newaxis]
+        # Outer product of `alpha` with itself along the last axis.
+        aiaj = a[..., :, np.newaxis] * a[..., np.newaxis, :]
+        cov = -n * aiaj / Sa ** 2 * (n + Sa) / (1 + Sa)
+
+        # The off-diagonal formula is wrong on the diagonal; overwrite it
+        # with the variances.
+        ii = np.arange(cov.shape[-1])
+        cov[..., ii, ii] = var
+        return cov
+
+
+# Module-level `dirichlet_multinomial_gen` instance built with default
+# arguments.
+dirichlet_multinomial = dirichlet_multinomial_gen()
+
+
+class dirichlet_multinomial_frozen(multi_rv_frozen):
+    # Stores validated `alpha` and `n` and forwards every call to a private
+    # `dirichlet_multinomial_gen` instance. The method docstrings are
+    # assigned after the class definition, so none are written here.
+    def __init__(self, alpha, n, seed=None):
+        checked = _dirichlet_multinomial_check_parameters(alpha, n)
+        # The middle element (the sum of `alpha`) is not stored.
+        self.alpha, _, self.n = checked
+        self._dist = dirichlet_multinomial_gen(seed)
+
+    def logpmf(self, x):
+        alpha, n = self.alpha, self.n
+        return self._dist.logpmf(x, alpha, n)
+
+    def pmf(self, x):
+        alpha, n = self.alpha, self.n
+        return self._dist.pmf(x, alpha, n)
+
+    def mean(self):
+        alpha, n = self.alpha, self.n
+        return self._dist.mean(alpha, n)
+
+    def var(self):
+        alpha, n = self.alpha, self.n
+        return self._dist.var(alpha, n)
+
+    def cov(self):
+        alpha, n = self.alpha, self.n
+        return self._dist.cov(alpha, n)
+
+
+# Set frozen generator docstrings from corresponding docstrings in
+# dirichlet_multinomial and fill in default strings in class docstrings.
+for name in ['logpmf', 'pmf', 'mean', 'var', 'cov']:
+    # Look the functions up in the class `__dict__` so that `__doc__` is
+    # assigned on the plain function objects defined in each class body.
+    method = dirichlet_multinomial_gen.__dict__[name]
+    method_frozen = dirichlet_multinomial_frozen.__dict__[name]
+    # Order matters: the frozen docstring is built from the still-unformatted
+    # generic docstring before that docstring is itself overwritten.
+    method_frozen.__doc__ = doccer.docformat(
+        method.__doc__, dirichlet_mn_docdict_noparams)
+    method.__doc__ = doccer.docformat(method.__doc__,
+                                      dirichlet_mn_docdict_params)
+
+
+class vonmises_fisher_gen(multi_rv_generic):
+    r"""A von Mises-Fisher variable.
+
+    The `mu` keyword specifies the mean direction vector. The `kappa` keyword
+    specifies the concentration parameter.
+
+    Methods
+    -------
+    pdf(x, mu=None, kappa=1)
+        Probability density function.
+    logpdf(x, mu=None, kappa=1)
+        Log of the probability density function.
+    rvs(mu=None, kappa=1, size=1, random_state=None)
+        Draw random samples from a von Mises-Fisher distribution.
+    entropy(mu=None, kappa=1)
+        Compute the differential entropy of the von Mises-Fisher distribution.
+    fit(data)
+        Fit a von Mises-Fisher distribution to data.
+
+    Parameters
+    ----------
+    mu : array_like
+        Mean direction of the distribution. Must be a one-dimensional unit
+        vector of norm 1.
+    kappa : float
+        Concentration parameter. Must be positive.
+    seed : {None, int, np.random.RandomState, np.random.Generator}, optional
+        Used for drawing random variates.
+        If `seed` is `None`, the `~np.random.RandomState` singleton is used.
+        If `seed` is an int, a new ``RandomState`` instance is used, seeded
+        with seed.
+        If `seed` is already a ``RandomState`` or ``Generator`` instance,
+        then that object is used.
+        Default is `None`.
+
+    See Also
+    --------
+    scipy.stats.vonmises : Von-Mises Fisher distribution in 2D on a circle
+    uniform_direction : uniform distribution on the surface of a hypersphere
+
+    Notes
+    -----
+    The von Mises-Fisher distribution is a directional distribution on the
+    surface of the unit hypersphere. The probability density
+    function of a unit vector :math:`\mathbf{x}` is
+
+    .. math::
+
+        f(\mathbf{x}) = \frac{\kappa^{d/2-1}}{(2\pi)^{d/2}I_{d/2-1}(\kappa)}
+               \exp\left(\kappa \mathbf{\mu}^T\mathbf{x}\right),
+
+    where :math:`\mathbf{\mu}` is the mean direction, :math:`\kappa` the
+    concentration parameter, :math:`d` the dimension and :math:`I` the
+    modified Bessel function of the first kind. As :math:`\mu` represents
+    a direction, it must be a unit vector or in other words, a point
+    on the hypersphere: :math:`\mathbf{\mu}\in S^{d-1}`. :math:`\kappa` is a
+    concentration parameter, which means that it must be positive
+    (:math:`\kappa>0`) and that the distribution becomes more narrow with
+    increasing :math:`\kappa`. In that sense, the reciprocal value
+    :math:`1/\kappa` resembles the variance parameter of the normal
+    distribution.
+
+    The von Mises-Fisher distribution often serves as an analogue of the
+    normal distribution on the sphere. Intuitively, for unit vectors, a
+    useful distance measure is given by the angle :math:`\alpha` between
+    them. This is exactly what the scalar product
+    :math:`\mathbf{\mu}^T\mathbf{x}=\cos(\alpha)` in the
+    von Mises-Fisher probability density function describes: the angle
+    between the mean direction :math:`\mathbf{\mu}` and the vector
+    :math:`\mathbf{x}`. The larger the angle between them, the smaller the
+    probability to observe :math:`\mathbf{x}` for this particular mean
+    direction :math:`\mathbf{\mu}`.
+
+    In dimensions 2 and 3, specialized algorithms are used for fast sampling
+    [2]_, [3]_. For dimensions of 4 or higher the rejection sampling algorithm
+    described in [4]_ is utilized. This implementation is partially based on
+    the geomstats package [5]_, [6]_.
+
+    .. versionadded:: 1.11
+
+    References
+    ----------
+    .. [1] Von Mises-Fisher distribution, Wikipedia,
+           https://en.wikipedia.org/wiki/Von_Mises%E2%80%93Fisher_distribution
+    .. [2] Mardia, K., and Jupp, P. Directional statistics. Wiley, 2000.
+    .. [3] J. Wenzel. Numerically stable sampling of the von Mises Fisher
+           distribution on S2.
+           https://www.mitsuba-renderer.org/~wenzel/files/vmf.pdf
+    .. [4] Wood, A. Simulation of the von mises fisher distribution.
+           Communications in statistics-simulation and computation 23,
+           1 (1994), 157-164. https://doi.org/10.1080/03610919408813161
+    .. [5] geomstats, Github. MIT License. Accessed: 06.01.2023.
+           https://github.com/geomstats/geomstats
+    .. [6] Miolane, N. et al. Geomstats:  A Python Package for Riemannian
+           Geometry in Machine Learning. Journal of Machine Learning Research
+           21 (2020). http://jmlr.org/papers/v21/19-027.html
+
+    Examples
+    --------
+    **Visualization of the probability density**
+
+    Plot the probability density in three dimensions for increasing
+    concentration parameter. The density is calculated by the ``pdf``
+    method.
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.stats import vonmises_fisher
+    >>> from matplotlib.colors import Normalize
+    >>> n_grid = 100
+    >>> u = np.linspace(0, np.pi, n_grid)
+    >>> v = np.linspace(0, 2 * np.pi, n_grid)
+    >>> u_grid, v_grid = np.meshgrid(u, v)
+    >>> vertices = np.stack([np.cos(v_grid) * np.sin(u_grid),
+    ...                      np.sin(v_grid) * np.sin(u_grid),
+    ...                      np.cos(u_grid)],
+    ...                     axis=2)
+    >>> x = np.outer(np.cos(v), np.sin(u))
+    >>> y = np.outer(np.sin(v), np.sin(u))
+    >>> z = np.outer(np.ones_like(u), np.cos(u))
+    >>> def plot_vmf_density(ax, x, y, z, vertices, mu, kappa):
+    ...     vmf = vonmises_fisher(mu, kappa)
+    ...     pdf_values = vmf.pdf(vertices)
+    ...     pdfnorm = Normalize(vmin=pdf_values.min(), vmax=pdf_values.max())
+    ...     ax.plot_surface(x, y, z, rstride=1, cstride=1,
+    ...                     facecolors=plt.cm.viridis(pdfnorm(pdf_values)),
+    ...                     linewidth=0)
+    ...     ax.set_aspect('equal')
+    ...     ax.view_init(azim=-130, elev=0)
+    ...     ax.axis('off')
+    ...     ax.set_title(rf"$\kappa={kappa}$")
+    >>> fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(9, 4),
+    ...                          subplot_kw={"projection": "3d"})
+    >>> left, middle, right = axes
+    >>> mu = np.array([-np.sqrt(0.5), -np.sqrt(0.5), 0])
+    >>> plot_vmf_density(left, x, y, z, vertices, mu, 5)
+    >>> plot_vmf_density(middle, x, y, z, vertices, mu, 20)
+    >>> plot_vmf_density(right, x, y, z, vertices, mu, 100)
+    >>> plt.subplots_adjust(top=1, bottom=0.0, left=0.0, right=1.0, wspace=0.)
+    >>> plt.show()
+
+    As we increase the concentration parameter, the points are getting more
+    clustered together around the mean direction.
+
+    **Sampling**
+
+    Draw 5 samples from the distribution using the ``rvs`` method resulting
+    in a 5x3 array.
+
+    >>> rng = np.random.default_rng()
+    >>> mu = np.array([0, 0, 1])
+    >>> samples = vonmises_fisher(mu, 20).rvs(5, random_state=rng)
+    >>> samples
+    array([[ 0.3884594 , -0.32482588,  0.86231516],
+           [ 0.00611366, -0.09878289,  0.99509023],
+           [-0.04154772, -0.01637135,  0.99900239],
+           [-0.14613735,  0.12553507,  0.98126695],
+           [-0.04429884, -0.23474054,  0.97104814]])
+
+    These samples are unit vectors on the sphere :math:`S^2`. To verify,
+    let us calculate their euclidean norms:
+
+    >>> np.linalg.norm(samples, axis=1)
+    array([1., 1., 1., 1., 1.])
+
+    Plot 20 observations drawn from the von Mises-Fisher distribution for
+    increasing concentration parameter :math:`\kappa`. The red dot highlights
+    the mean direction :math:`\mu`.
+
+    >>> def plot_vmf_samples(ax, x, y, z, mu, kappa):
+    ...     vmf = vonmises_fisher(mu, kappa)
+    ...     samples = vmf.rvs(20)
+    ...     ax.plot_surface(x, y, z, rstride=1, cstride=1, linewidth=0,
+    ...                     alpha=0.2)
+    ...     ax.scatter(samples[:, 0], samples[:, 1], samples[:, 2], c='k', s=5)
+    ...     ax.scatter(mu[0], mu[1], mu[2], c='r', s=30)
+    ...     ax.set_aspect('equal')
+    ...     ax.view_init(azim=-130, elev=0)
+    ...     ax.axis('off')
+    ...     ax.set_title(rf"$\kappa={kappa}$")
+    >>> mu = np.array([-np.sqrt(0.5), -np.sqrt(0.5), 0])
+    >>> fig, axes = plt.subplots(nrows=1, ncols=3,
+    ...                          subplot_kw={"projection": "3d"},
+    ...                          figsize=(9, 4))
+    >>> left, middle, right = axes
+    >>> plot_vmf_samples(left, x, y, z, mu, 5)
+    >>> plot_vmf_samples(middle, x, y, z, mu, 20)
+    >>> plot_vmf_samples(right, x, y, z, mu, 100)
+    >>> plt.subplots_adjust(top=1, bottom=0.0, left=0.0,
+    ...                     right=1.0, wspace=0.)
+    >>> plt.show()
+
+    The plots show that with increasing concentration :math:`\kappa` the
+    resulting samples are centered more closely around the mean direction.
+
+    **Fitting the distribution parameters**
+
+    The distribution can be fitted to data using the ``fit`` method returning
+    the estimated parameters. As a toy example let's fit the distribution to
+    samples drawn from a known von Mises-Fisher distribution.
+
+    >>> mu, kappa = np.array([0, 0, 1]), 20
+    >>> samples = vonmises_fisher(mu, kappa).rvs(1000, random_state=rng)
+    >>> mu_fit, kappa_fit = vonmises_fisher.fit(samples)
+    >>> mu_fit, kappa_fit
+    (array([0.01126519, 0.01044501, 0.99988199]), 19.306398751730995)
+
+    We see that the estimated parameters `mu_fit` and `kappa_fit` are
+    very close to the ground truth parameters.
+
+    """
+    def __init__(self, seed=None):
+        """Initialize the generator; `seed` is forwarded to the base class."""
+        super().__init__(seed)
+
+    def __call__(self, mu=None, kappa=1, seed=None):
+        """Return a frozen von Mises-Fisher distribution.
+
+        The arguments are handed to `vonmises_fisher_frozen`; see that
+        class for more information.
+        """
+        frozen = vonmises_fisher_frozen(mu, kappa, seed=seed)
+        return frozen
+
+    def _process_parameters(self, mu, kappa):
+        """
+        Validate `mu` and `kappa` and return ``(dim, mu, kappa)``.
+
+        `mu` is converted to an array and must be a unit vector with at most
+        one dimension and at least two entries; `dim` is its size. `kappa`
+        must be a strictly positive scalar and is returned unchanged.
+        """
+        mu = np.asarray(mu)
+        if mu.ndim > 1:
+            raise ValueError("'mu' must have one-dimensional shape.")
+        mu_norm = np.linalg.norm(mu)
+        if not np.allclose(mu_norm, 1.):
+            raise ValueError("'mu' must be a unit vector of norm 1.")
+        dim = mu.size
+        if dim <= 1:
+            raise ValueError("'mu' must have at least two entries.")
+
+        # Same message for a non-scalar and for a negative `kappa`.
+        if not np.isscalar(kappa):
+            raise ValueError("'kappa' must be a positive scalar.")
+        if kappa < 0:
+            raise ValueError("'kappa' must be a positive scalar.")
+        if float(kappa) == 0.:
+            raise ValueError("For 'kappa=0' the von Mises-Fisher distribution "
+                             "becomes the uniform distribution on the sphere "
+                             "surface. Consider using "
+                             "'scipy.stats.uniform_direction' instead.")
+
+        return dim, mu, kappa
+
+    def _check_data_vs_dist(self, x, dim):
+        """Raise ``ValueError`` unless `x` holds `dim`-dimensional unit
+        vectors along its last axis."""
+        last_axis_length = x.shape[-1]
+        if last_axis_length != dim:
+            raise ValueError("The dimensionality of the last axis of 'x' must "
+                             "match the dimensionality of the "
+                             "von Mises Fisher distribution.")
+        norms = np.linalg.norm(x, axis=-1)
+        if not np.allclose(norms, 1.):
+            msg = "'x' must be unit vectors of norm 1 along last dimension."
+            raise ValueError(msg)
+
+    def _log_norm_factor(self, dim, kappa):
+        # The normalization factor is
+        # c = kappa**(dim/2-1) / ((2*pi)**(dim/2) * I[dim/2-1](kappa))
+        # Multiplying numerator and denominator by exp(-kappa) and using
+        # ive[v](kappa) = I[v](kappa)*exp(-kappa) gives
+        # c = kappa**(dim/2-1)*exp(-kappa) /
+        #     ((2*pi)**(dim/2) * ive[dim/2-1](kappa))
+        # so that
+        # log c = (dim/2-1)*log(kappa) - dim/2*log(2*pi)
+        #         - log(ive[dim/2-1](kappa)) - kappa
+        halfdim = 0.5 * dim
+        power_term = 0.5 * (dim - 2)*np.log(kappa)
+        twopi_term = halfdim * _LOG_2PI
+        bessel_term = np.log(ive(halfdim - 1, kappa))
+        return power_term - twopi_term - bessel_term - kappa
+
+    def _logpdf(self, x, dim, mu, kappa):
+        """Log of the von Mises-Fisher probability density function.
+
+        `dim`, `mu` and `kappa` are used as given, without validation, so
+        this function should not be called directly; use 'logpdf' instead.
+        Only `x` is checked against `dim`.
+
+        """
+        points = np.asarray(x)
+        self._check_data_vs_dist(points, dim)
+        # Scalar product of `mu` with every vector along the last axis.
+        cosines = np.einsum('i,...i->...', mu, points)
+        log_norm = self._log_norm_factor(dim, kappa)
+        return log_norm + kappa * cosines
+
+    def logpdf(self, x, mu=None, kappa=1):
+        """Log of the von Mises-Fisher probability density function.
+
+        Parameters
+        ----------
+        x : array_like
+            Points at which to evaluate the log of the probability
+            density function. The last axis of `x` must correspond
+            to unit vectors of the same dimensionality as the distribution.
+        mu : array_like, default: None
+            Mean direction of the distribution. Must be a one-dimensional
+            unit vector.
+        kappa : float, default: 1
+            Concentration parameter. Must be positive.
+
+        Returns
+        -------
+        logpdf : ndarray or scalar
+            Log of the probability density function evaluated at `x`.
+
+        """
+        # Validate the parameters first, then evaluate without re-checking.
+        checked = self._process_parameters(mu, kappa)
+        dim, mu, kappa = checked
+        return self._logpdf(x, dim, mu, kappa)
+
+    def pdf(self, x, mu=None, kappa=1):
+        """Von Mises-Fisher probability density function.
+
+        Parameters
+        ----------
+        x : array_like
+            Points at which to evaluate the probability
+            density function. The last axis of `x` must correspond
+            to unit vectors of the same dimensionality as the distribution.
+        mu : array_like, default: None
+            Mean direction of the distribution. Must be a one-dimensional
+            unit vector.
+        kappa : float, default: 1
+            Concentration parameter. Must be positive.
+
+        Returns
+        -------
+        pdf : ndarray or scalar
+            Probability density function evaluated at `x`.
+
+        """
+        # Validate the parameters, evaluate the log-density and
+        # exponentiate it.
+        checked = self._process_parameters(mu, kappa)
+        dim, mu, kappa = checked
+        log_density = self._logpdf(x, dim, mu, kappa)
+        return np.exp(log_density)
+
+    def _rvs_2d(self, mu, kappa, size, random_state):
+        """
+        Sample the two-dimensional case through angles.
+
+        In 2D, the von Mises-Fisher distribution reduces to the
+        von Mises distribution. Angles are drawn with
+        ``random_state.vonmises`` around the angle of `mu` and converted to
+        ``(cos, sin)`` pairs stacked along a new last axis.
+
+        """
+        mean_angle = np.arctan2(mu[1], mu[0])
+        angles = random_state.vonmises(mean_angle, kappa, size=size)
+        coordinates = [np.cos(angles), np.sin(angles)]
+        return np.stack(coordinates, axis=-1)
+
+    def _rvs_3d(self, kappa, size, random_state):
+        """
+        Generate samples from a von Mises-Fisher distribution
+        with mu = [1, 0, 0] and kappa. Samples then have to be
+        rotated towards the desired mean direction mu.
+        When `size` is None a single sample is drawn and the result is
+        squeezed.
+        Reference: https://www.mitsuba-renderer.org/~wenzel/files/vmf.pdf
+
+        """
+        sample_size = 1 if size is None else size
+
+        # x coordinate acc. to the equation from section 3.1 of the
+        # reference, computed from uniform variates
+        uniform = random_state.random(sample_size)
+        x = 1. + np.log(uniform + (1. - uniform) * np.exp(-2 * kappa))/kappa
+
+        # (y, z) is a uniformly drawn 2D direction scaled so that
+        # (x, y, z) has unit length
+        radius = np.sqrt(1. - np.square(x))
+        circle = _sample_uniform_direction(2, sample_size, random_state)
+        y = radius * circle[..., 0]
+        z = radius * circle[..., 1]
+        samples = np.stack([x, y, z], axis=-1)
+        return np.squeeze(samples) if size is None else samples
+
+    def _rejection_sampling(self, dim, kappa, size, random_state):
+        """
+        Generate samples from an n-dimensional von Mises-Fisher distribution
+        with mu = [1, 0, ..., 0] and kappa via rejection sampling.
+        Samples then have to be rotated towards the desired mean direction mu.
+        Reference: https://doi.org/10.1080/03610919408813161
+        """
+        dim_minus_one = dim - 1
+        # calculate number of requested samples
+        if size is not None:
+            if not np.iterable(size):
+                size = (size, )
+            n_samples = math.prod(size)
+        else:
+            n_samples = 1
+        # calculate envelope for rejection sampler (eq. 4)
+        sqrt = np.sqrt(4 * kappa ** 2. + dim_minus_one ** 2)
+        envelop_param = (-2 * kappa + sqrt) / dim_minus_one
+        if envelop_param == 0:
+            # the regular formula suffers from loss of precision for high
+            # kappa. This can only be detected by checking for 0 here.
+            # Workaround: expansion for sqrt variable
+            # https://www.wolframalpha.com/input?i=sqrt%284*x%5E2%2Bd%5E2%29
+            # e = (-2 * k + sqrt(k**2 + d**2)) / d
+            #   ~ (-2 * k + 2 * k + d**2/(4 * k) - d**4/(64 * k**3)) / d
+            #   = d/(4 * k) - d**3/(64 * k**3)
+            envelop_param = (dim_minus_one/4 * kappa**-1.
+                             - dim_minus_one**3/64 * kappa**-3.)
+        # reference step 0
+        node = (1. - envelop_param) / (1. + envelop_param)
+        # t = ln(1 - ((1-x)/(1+x))**2)
+        #   = ln(4 * x / (1+x)**2)
+        #   = ln(4) + ln(x) - 2*log1p(x)
+        correction = (kappa * node + dim_minus_one
+                      * (np.log(4) + np.log(envelop_param)
+                      - 2 * np.log1p(envelop_param)))
+        n_accepted = 0
+        x = np.zeros((n_samples, ))
+        halfdim = 0.5 * dim_minus_one
+        # main loop
+        while n_accepted < n_samples:
+            # generate candidates acc. to reference step 1
+            sym_beta = random_state.beta(halfdim, halfdim,
+                                         size=n_samples - n_accepted)
+            coord_x = (1 - (1 + envelop_param) * sym_beta) / (
+                1 - (1 - envelop_param) * sym_beta)
+            # accept or reject: reference step 2
+            # reformulation for numerical stability:
+            # t = ln(1 - (1-x)/(1+x) * y)
+            #   = ln((1 + x - y +x*y)/(1 +x))
+            accept_tol = random_state.random(n_samples - n_accepted)
+            criterion = (
+                kappa * coord_x
+                + dim_minus_one * (np.log((1 + envelop_param - coord_x
+                + coord_x * envelop_param) / (1 + envelop_param)))
+                - correction) > np.log(accept_tol)
+            accepted_iter = np.sum(criterion)
+            x[n_accepted:n_accepted + accepted_iter] = coord_x[criterion]
+            n_accepted += accepted_iter
+        # concatenate x and remaining coordinates: step 3
+        coord_rest = _sample_uniform_direction(dim_minus_one, n_accepted,
+                                               random_state)
+        coord_rest = np.einsum(
+            '...,...i->...i', np.sqrt(1 - x ** 2), coord_rest)
+        samples = np.concatenate([x[..., None], coord_rest], axis=1)
+        # reshape output to (size, dim)
+        if size is not None:
+            samples = samples.reshape(size + (dim, ))
+        else:
+            samples = np.squeeze(samples)
+        return samples
+
+    def _rotate_samples(self, samples, mu, dim):
+        """A QR decomposition is used to find the rotation that maps the
+        north pole (1, 0,...,0) to the vector mu. This rotation is then
+        applied to all samples.
+
+        Parameters
+        ----------
+        samples: array_like, shape = [..., n]
+        mu : array-like, shape=[n, ]
+            Point to parametrise the rotation.
+
+        Returns
+        -------
+        samples : rotated samples
+
+        """
+        base_point = np.zeros((dim, ))
+        base_point[0] = 1.
+        embedded = np.concatenate([mu[None, :], np.zeros((dim - 1, dim))])
+        rotmatrix, _ = np.linalg.qr(np.transpose(embedded))
+        if np.allclose(np.matmul(rotmatrix, base_point[:, None])[:, 0], mu):
+            rotsign = 1
+        else:
+            rotsign = -1
+
+        # apply rotation
+        samples = np.einsum('ij,...j->...i', rotmatrix, samples) * rotsign
+        return samples
+
+    def _rvs(self, dim, mu, kappa, size, random_state):
+        if dim == 2:
+            samples = self._rvs_2d(mu, kappa, size, random_state)
+        elif dim == 3:
+            samples = self._rvs_3d(kappa, size, random_state)
+        else:
+            samples = self._rejection_sampling(dim, kappa, size,
+                                               random_state)
+
+        if dim != 2:
+            samples = self._rotate_samples(samples, mu, dim)
+        return samples
+
+    def rvs(self, mu=None, kappa=1, size=1, random_state=None):
+        """Draw random samples from a von Mises-Fisher distribution.
+
+        Parameters
+        ----------
+        mu : array_like
+            Mean direction of the distribution. Must be a one-dimensional unit
+            vector of norm 1.
+        kappa : float
+            Concentration parameter. Must be positive.
+        size : int or tuple of ints, optional
+            Given a shape of, for example, (m,n,k), m*n*k samples are
+            generated, and packed in an m-by-n-by-k arrangement.
+            Because each sample is N-dimensional, the output shape
+            is (m,n,k,N). If no shape is specified, a single (N-D)
+            sample is returned.
+        random_state : {None, int, np.random.RandomState, np.random.Generator},
+                        optional
+            Used for drawing random variates.
+            If `seed` is `None`, the `~np.random.RandomState` singleton is used.
+            If `seed` is an int, a new ``RandomState`` instance is used, seeded
+            with seed.
+            If `seed` is already a ``RandomState`` or ``Generator`` instance,
+            then that object is used.
+            Default is `None`.
+
+        Returns
+        -------
+        rvs : ndarray
+            Random variates of shape (`size`, `N`), where `N` is the
+            dimension of the distribution.
+
+        """
+        dim, mu, kappa = self._process_parameters(mu, kappa)
+        random_state = self._get_random_state(random_state)
+        samples = self._rvs(dim, mu, kappa, size, random_state)
+        return samples
+
+    def _entropy(self, dim, kappa):
+        halfdim = 0.5 * dim
+        return (-self._log_norm_factor(dim, kappa) - kappa *
+                ive(halfdim, kappa) / ive(halfdim - 1, kappa))
+
+    def entropy(self, mu=None, kappa=1):
+        """Compute the differential entropy of the von Mises-Fisher
+        distribution.
+
+        Parameters
+        ----------
+        mu : array_like, default: None
+            Mean direction of the distribution. Must be a one-dimensional unit
+            vector of norm 1.
+        kappa : float, default: 1
+            Concentration parameter. Must be positive.
+
+        Returns
+        -------
+        h : scalar
+            Entropy of the von Mises-Fisher distribution.
+
+        """
+        dim, _, kappa = self._process_parameters(mu, kappa)
+        return self._entropy(dim, kappa)
+
+    def fit(self, x):
+        """Fit the von Mises-Fisher distribution to data.
+
+        Parameters
+        ----------
+        x : array-like
+            Data the distribution is fitted to. Must be two dimensional.
+            The second axis of `x` must be unit vectors of norm 1 and
+            determine the dimensionality of the fitted
+            von Mises-Fisher distribution.
+
+        Returns
+        -------
+        mu : ndarray
+            Estimated mean direction.
+        kappa : float
+            Estimated concentration parameter.
+
+        """
+        # validate input data
+        x = np.asarray(x)
+        if x.ndim != 2:
+            raise ValueError("'x' must be two dimensional.")
+        if not np.allclose(np.linalg.norm(x, axis=-1), 1.):
+            msg = "'x' must be unit vectors of norm 1 along last dimension."
+            raise ValueError(msg)
+        dim = x.shape[-1]
+
+        # mu is simply the directional mean
+        dirstats = directional_stats(x)
+        mu = dirstats.mean_direction
+        r = dirstats.mean_resultant_length
+
+        # kappa is the solution to the equation:
+        # r = I[dim/2](kappa) / I[dim/2 -1](kappa)
+        #   = I[dim/2](kappa) * exp(-kappa) / I[dim/2 -1](kappa) * exp(-kappa)
+        #   = ive(dim/2, kappa) / ive(dim/2 -1, kappa)
+
+        halfdim = 0.5 * dim
+
+        def solve_for_kappa(kappa):
+            bessel_vals = ive([halfdim, halfdim - 1], kappa)
+            return bessel_vals[0]/bessel_vals[1] - r
+
+        root_res = root_scalar(solve_for_kappa, method="brentq",
+                               bracket=(1e-8, 1e9))
+        kappa = root_res.root
+        return mu, kappa
+
+
+vonmises_fisher = vonmises_fisher_gen()  # module-level generator instance
+
+
+class vonmises_fisher_frozen(multi_rv_frozen):
+    def __init__(self, mu=None, kappa=1, seed=None):
+        """Create a frozen von Mises-Fisher distribution.
+
+        Parameters
+        ----------
+        mu : array_like, default: None
+            Mean direction of the distribution.
+        kappa : float, default: 1
+            Concentration parameter. Must be positive.
+        seed : {None, int, `numpy.random.Generator`,
+                `numpy.random.RandomState`}, optional
+            If `seed` is None (or `np.random`), the `numpy.random.RandomState`
+            singleton is used.
+            If `seed` is an int, a new ``RandomState`` instance is used,
+            seeded with `seed`.
+            If `seed` is already a ``Generator`` or ``RandomState`` instance
+            then that instance is used.
+
+        """
+        self._dist = vonmises_fisher_gen(seed)
+        self.dim, self.mu, self.kappa = (
+            self._dist._process_parameters(mu, kappa)
+        )
+
+    def logpdf(self, x):
+        """
+        Parameters
+        ----------
+        x : array_like
+            Points at which to evaluate the log of the probability
+            density function. The last axis of `x` must correspond
+            to unit vectors of the same dimensionality as the distribution.
+
+        Returns
+        -------
+        logpdf : ndarray or scalar
+            Log of probability density function evaluated at `x`.
+
+        """
+        return self._dist._logpdf(x, self.dim, self.mu, self.kappa)
+
+    def pdf(self, x):
+        """
+        Parameters
+        ----------
+        x : array_like
+            Points at which to evaluate the log of the probability
+            density function. The last axis of `x` must correspond
+            to unit vectors of the same dimensionality as the distribution.
+
+        Returns
+        -------
+        pdf : ndarray or scalar
+            Probability density function evaluated at `x`.
+
+        """
+        return np.exp(self.logpdf(x))
+
+    def rvs(self, size=1, random_state=None):
+        """Draw random variates from the Von Mises-Fisher distribution.
+
+        Parameters
+        ----------
+        size : int or tuple of ints, optional
+            Given a shape of, for example, (m,n,k), m*n*k samples are
+            generated, and packed in an m-by-n-by-k arrangement.
+            Because each sample is N-dimensional, the output shape
+            is (m,n,k,N). If no shape is specified, a single (N-D)
+            sample is returned.
+        random_state : {None, int, `numpy.random.Generator`,
+                        `numpy.random.RandomState`}, optional
+            If `seed` is None (or `np.random`), the `numpy.random.RandomState`
+            singleton is used.
+            If `seed` is an int, a new ``RandomState`` instance is used,
+            seeded with `seed`.
+            If `seed` is already a ``Generator`` or ``RandomState`` instance
+            then that instance is used.
+
+        Returns
+        -------
+        rvs : ndarray or scalar
+            Random variates of size (`size`, `N`), where `N` is the
+            dimension of the distribution.
+
+        """
+        random_state = self._dist._get_random_state(random_state)
+        return self._dist._rvs(self.dim, self.mu, self.kappa, size,
+                               random_state)
+
+    def entropy(self):
+        """
+        Calculate the differential entropy of the von Mises-Fisher
+        distribution.
+
+        Returns
+        -------
+        h: float
+            Entropy of the Von Mises-Fisher distribution.
+
+        """
+        return self._dist._entropy(self.dim, self.kappa)
+
+
+class normal_inverse_gamma_gen(multi_rv_generic):
+    r"""Normal-inverse-gamma distribution.
+
+    The normal-inverse-gamma distribution is the conjugate prior of a normal
+    distribution with unknown mean and variance.
+
+    Methods
+    -------
+    pdf(x, s2, mu=0, lmbda=1, a=1, b=1)
+        Probability density function.
+    logpdf(x, s2, mu=0, lmbda=1, a=1, b=1)
+        Log of the probability density function.
+    mean(mu=0, lmbda=1, a=1, b=1)
+        Distribution mean.
+    var(mu=0, lmbda=1, a=1, b=1)
+        Distribution variance.
+    rvs(mu=0, lmbda=1, a=1, b=1, size=None, random_state=None)
+        Draw random samples.
+
+    Parameters
+    ----------
+    mu, lmbda, a, b  : array_like
+        Shape parameters of the distribution. See notes.
+    seed : {None, int, np.random.RandomState, np.random.Generator}, optional
+        Used for drawing random variates.
+        If `seed` is `None`, the `~np.random.RandomState` singleton is used.
+        If `seed` is an int, a new ``RandomState`` instance is used, seeded
+        with seed.
+        If `seed` is already a ``RandomState`` or ``Generator`` instance,
+        then that object is used.
+        Default is `None`.
+
+    See Also
+    --------
+    norm
+    invgamma
+
+    Notes
+    -----
+
+    The probability density function of `normal_inverse_gamma` is:
+
+    .. math::
+
+        f(x, \sigma^2; \mu, \lambda, \alpha, \beta) =
+            \frac{\sqrt{\lambda}}{\sqrt{2 \pi \sigma^2}}
+            \frac{\beta^\alpha}{\Gamma(\alpha)}
+            \left( \frac{1}{\sigma^2} \right)^{\alpha + 1}
+            \exp \left(- \frac{2 \beta + \lambda (x - \mu)^2} {2 \sigma^2} \right)
+
+    where all parameters are real and finite, and :math:`\sigma^2 > 0`,
+    :math:`\lambda > 0`, :math:`\alpha > 0`, and :math:`\beta > 0`.
+
+    Methods ``normal_inverse_gamma.pdf`` and ``normal_inverse_gamma.logpdf``
+    accept `x` and `s2` for arguments :math:`x` and :math:`\sigma^2`.
+    All methods accept `mu`, `lmbda`, `a`, and `b` for shape parameters
+    :math:`\mu`, :math:`\lambda`, :math:`\alpha`, and :math:`\beta`,
+    respectively.
+
+    .. versionadded:: 1.15
+
+    References
+    ----------
+    .. [1] Normal-inverse-gamma distribution, Wikipedia,
+           https://en.wikipedia.org/wiki/Normal-inverse-gamma_distribution
+
+    Examples
+    --------
+    Suppose we wish to investigate the relationship between the
+    normal-inverse-gamma distribution and the inverse gamma distribution.
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+    >>> rng = np.random.default_rng(527484872345)
+    >>> mu, lmbda, a, b = 0, 1, 20, 20
+    >>> norm_inv_gamma = stats.normal_inverse_gamma(mu, lmbda, a, b)
+    >>> inv_gamma = stats.invgamma(a, scale=b)
+
+    One approach is to compare the distribution of the `s2` elements of
+    random variates against the PDF of an inverse gamma distribution.
+
+    >>> _, s2 = norm_inv_gamma.rvs(size=10000, random_state=rng)
+    >>> bins = np.linspace(s2.min(), s2.max(), 50)
+    >>> plt.hist(s2, bins=bins, density=True, label='Frequency density')
+    >>> s2 = np.linspace(s2.min(), s2.max(), 300)
+    >>> plt.plot(s2, inv_gamma.pdf(s2), label='PDF')
+    >>> plt.xlabel(r'$\sigma^2$')
+    >>> plt.ylabel('Frequency density / PMF')
+    >>> plt.show()
+
+    Similarly, we can compare the marginal distribution of `s2` against
+    an inverse gamma distribution.
+
+    >>> from scipy.integrate import quad_vec
+    >>> from scipy import integrate
+    >>> s2 = np.linspace(0.5, 3, 6)
+    >>> res = quad_vec(lambda x: norm_inv_gamma.pdf(x, s2), -np.inf, np.inf)[0]
+    >>> np.allclose(res, inv_gamma.pdf(s2))
+    True
+
+    The sample mean is comparable to the mean of the distribution.
+
+    >>> x, s2 = norm_inv_gamma.rvs(size=10000, random_state=rng)
+    >>> x.mean(), s2.mean()
+    (np.float64(-0.005254750127304425), np.float64(1.050438111436508))
+    >>> norm_inv_gamma.mean()
+    (np.float64(0.0), np.float64(1.0526315789473684))
+
+    Similarly, for the variance:
+
+    >>> x.var(ddof=1), s2.var(ddof=1)
+    (np.float64(1.0546150578185023), np.float64(0.061829865266330754))
+    >>> norm_inv_gamma.var()
+    (np.float64(1.0526315789473684), np.float64(0.061557402277623886))
+
+    """
+    def rvs(self, mu=0, lmbda=1, a=1, b=1, size=None, random_state=None):
+        """Draw random samples from the distribution.
+
+        Parameters
+        ----------
+        mu, lmbda, a, b : array_like, optional
+            Shape parameters. `lmbda`, `a`, and `b` must be greater
+            than zero.
+        size : int or tuple of ints, optional
+            Shape of samples to draw.
+        random_state : {None, int, np.random.RandomState, np.random.Generator}, optional
+            Used for drawing random variates.
+            If `random_state` is `None`, the `~np.random.RandomState` singleton is used.
+            If `random_state` is an int, a new ``RandomState`` instance is used, seeded
+            with `random_state`.
+            If `random_state` is already a ``RandomState`` or ``Generator`` instance,
+            then that object is used.
+            Default is `None`.
+
+        Returns
+        -------
+        x, s2 : ndarray
+            Random variates.
+
+        """
+        random_state = self._get_random_state(random_state)
+        s2 = invgamma(a, scale=b).rvs(size=size, random_state=random_state)
+        scale = (s2 / lmbda)**0.5
+        x = norm(loc=mu, scale=scale).rvs(size=size, random_state=random_state)
+        dtype = np.result_type(1.0, mu, lmbda, a, b)
+        return x.astype(dtype), s2.astype(dtype)
+
+    def _logpdf(self, x, s2, mu, lmbda, a, b):
+        t1 = 0.5 * (np.log(lmbda) - np.log(2 * np.pi * s2))
+        t2 = a*np.log(b) - special.gammaln(a).astype(a.dtype)
+        t3 = -(a + 1) * np.log(s2)
+        t4 = -(2*b + lmbda*(x - mu)**2) / (2*s2)
+        return t1 + t2 + t3 + t4
+
+    def logpdf(self, x, s2, mu=0, lmbda=1, a=1, b=1):
+        """Log of the probability density function.
+
+        Parameters
+        ----------
+        x, s2 : array_like
+            Arguments. `s2` must be greater than zero.
+        mu, lmbda, a, b : array_like, optional
+            Shape parameters. `lmbda`, `a`, and `b` must be greater
+            than zero.
+
+        Returns
+        -------
+        logpdf : ndarray or scalar
+            Log of the probability density function.
+
+        """
+        invalid, args = self._process_parameters_pdf(x, s2, mu, lmbda, a, b)
+        s2 = args[1]
+        # Keep it simple for now; lazyselect later, perhaps.
+        with np.errstate(all='ignore'):
+            logpdf = np.asarray(self._logpdf(*args))
+        logpdf[s2 <= 0] = -np.inf
+        logpdf[invalid] = np.nan
+        return logpdf[()]
+
+    def _pdf(self, x, s2, mu, lmbda, a, b):
+        t1 = np.sqrt(lmbda / (2 * np.pi * s2))
+        t2 = b**a / special.gamma(a).astype(a.dtype)
+        t3 = (1 / s2)**(a + 1)
+        t4 = np.exp(-(2*b + lmbda*(x - mu)**2) / (2*s2))
+        return t1 * t2 * t3 * t4
+
+    def pdf(self, x, s2, mu=0, lmbda=1, a=1, b=1):
+        """The probability density function.
+
+        Parameters
+        ----------
+        x, s2 : array_like
+            Arguments. `s2` must be greater than zero.
+        mu, lmbda, a, b : array_like, optional
+            Shape parameters. `lmbda`, `a`, and `b` must be greater
+            than zero.
+
+        Returns
+        -------
+        logpdf : ndarray or scalar
+            The probability density function.
+
+        """
+        invalid, args = self._process_parameters_pdf(x, s2, mu, lmbda, a, b)
+        s2 = args[1]
+        # Keep it simple for now; lazyselect later, perhaps.
+        with np.errstate(all='ignore'):
+            pdf = np.asarray(self._pdf(*args))
+        pdf[s2 <= 0] = 0
+        pdf[invalid] = np.nan
+        return pdf[()]
+
+    def mean(self, mu=0, lmbda=1, a=1, b=1):
+        """The mean of the distribution.
+
+        Parameters
+        ----------
+        mu, lmbda, a, b : array_like, optional
+            Shape parameters. `lmbda` and `b` must be greater
+            than zero, and `a` must be greater than one.
+
+        Returns
+        -------
+        x, s2 : ndarray
+            The mean of the distribution.
+
+        """
+        invalid, args = self._process_shapes(mu, lmbda, a, b)
+        mu, lmbda, a, b = args
+        invalid |= ~(a > 1)
+        mean_x = np.asarray(mu).copy()
+        mean_s2 = np.asarray(b / (a - 1))
+        mean_x[invalid] = np.nan
+        mean_s2[invalid] = np.nan
+        return mean_x[()], mean_s2[()]
+
+    def var(self, mu=0, lmbda=1, a=1, b=1):
+        """The variance of the distribution.
+
+        Parameters
+        ----------
+        mu, lmbda, a, b : array_like, optional
+            Shape parameters. `lmbda` and `b` must be greater
+            than zero, and `a` must be greater than two.
+
+        Returns
+        -------
+        x, s2 : ndarray
+            The variance of the distribution.
+
+        """
+        invalid, args = self._process_shapes(mu, lmbda, a, b)
+        mu, lmbda, a, b = args
+        invalid_x = invalid | ~(a > 1)
+        invalid_s2 = invalid | ~(a > 2)
+        var_x = b / ((a - 1) * lmbda)
+        var_s2 = b**2 / ((a - 1)**2 * (a - 2))
+        var_x, var_s2 = np.asarray(var_x), np.asarray(var_s2)
+        var_x[invalid_x] = np.nan
+        var_s2[invalid_s2] = np.nan
+        return var_x[()], var_s2[()]
+
+    def _process_parameters_pdf(self, x, s2, mu, lmbda, a, b):
+        args = np.broadcast_arrays(x, s2, mu, lmbda, a, b)
+        dtype = np.result_type(1.0, *(arg.dtype for arg in args))
+        args = [arg.astype(dtype, copy=False) for arg in args]
+        x, s2, mu, lmbda, a, b = args
+        invalid = ~((lmbda > 0) & (a > 0) & (b > 0))
+        return invalid, args
+
+    def _process_shapes(self, mu, lmbda, a, b):
+        args = np.broadcast_arrays(mu, lmbda, a, b)
+        dtype = np.result_type(1.0, *(arg.dtype for arg in args))
+        args = [arg.astype(dtype, copy=False) for arg in args]
+        mu, lmbda, a, b = args
+        invalid = ~((lmbda > 0) & (a > 0) & (b > 0))
+        return invalid, args
+
+    def __call__(self, mu=0, lmbda=1, a=1, b=1, seed=None):
+        return normal_inverse_gamma_frozen(mu, lmbda, a, b, seed=seed)
+
+
+normal_inverse_gamma = normal_inverse_gamma_gen()  # module-level instance
+
+
+class normal_inverse_gamma_frozen(multi_rv_frozen):
+    """Normal-inverse-gamma distribution with fixed shape parameters."""
+    def __init__(self, mu=0, lmbda=1, a=1, b=1, seed=None):
+        self._dist = normal_inverse_gamma_gen(seed)
+        self._shapes = mu, lmbda, a, b  # forwarded to every method call
+
+    def logpdf(self, x, s2):
+        return self._dist.logpdf(x, s2, *self._shapes)
+
+    def pdf(self, x, s2):
+        return self._dist.pdf(x, s2, *self._shapes)
+
+    def mean(self):
+        return self._dist.mean(*self._shapes)
+
+    def var(self):
+        return self._dist.var(*self._shapes)
+
+    def rvs(self, size=None, random_state=None):
+        return self._dist.rvs(*self._shapes, size=size, random_state=random_state)
+
+
+# Copy each method docstring of normal_inverse_gamma_gen onto the frozen
+# class (formatted with mvn_docdict_noparams), then reformat the original.
+for name in ['logpdf', 'pdf', 'mean', 'var', 'rvs']:
+    method = normal_inverse_gamma_gen.__dict__[name]
+    method_frozen = normal_inverse_gamma_frozen.__dict__[name]
+    method_frozen.__doc__ = doccer.docformat(method.__doc__,
+                                             mvn_docdict_noparams)
+    method.__doc__ = doccer.docformat(method.__doc__, mvn_docdict_params)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_new_distributions.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_new_distributions.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4da18bd1a956a502a7cb254ef703a5d70e619c7
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_new_distributions.py
@@ -0,0 +1,540 @@
+import sys
+
+import numpy as np
+from numpy import inf
+
+from scipy._lib import array_api_extra as xpx
+from scipy import special
+from scipy.special import _ufuncs as scu
+from scipy.stats._distribution_infrastructure import (
+    ContinuousDistribution, DiscreteDistribution, _RealInterval, _IntegerInterval,
+    _RealParameter, _Parameterization, _combine_docs)
+
+__all__ = ['Normal', 'Logistic', 'Uniform', 'Binomial']
+
+
+class Normal(ContinuousDistribution):
+    r"""Normal distribution with prescribed mean and standard deviation.
+
+    The probability density function of the normal distribution is:
+
+    .. math::
+
+        f(x) = \frac{1}{\sigma \sqrt{2 \pi}} \exp {
+            \left( -\frac{1}{2}\left( \frac{x - \mu}{\sigma} \right)^2 \right)}
+
+    """
+    # `ShiftedScaledDistribution` allows this to be generated automatically from
+    # an instance of `StandardNormal`, but the normal distribution is so frequently
+    # used that it's worth a bit of code duplication to get better performance.
+    _mu_domain = _RealInterval(endpoints=(-inf, inf))
+    _sigma_domain = _RealInterval(endpoints=(0, inf))
+    _x_support = _RealInterval(endpoints=(-inf, inf))
+
+    _mu_param = _RealParameter('mu',  symbol=r'\mu', domain=_mu_domain,
+                               typical=(-1, 1))
+    _sigma_param = _RealParameter('sigma', symbol=r'\sigma', domain=_sigma_domain,
+                                  typical=(0.5, 1.5))
+    _x_param = _RealParameter('x', domain=_x_support, typical=(-1, 1))
+
+    _parameterizations = [_Parameterization(_mu_param, _sigma_param)]
+
+    _variable = _x_param
+    _normalization = 1/np.sqrt(2*np.pi)  # prefactor of the standard normal pdf
+    _log_normalization = np.log(2*np.pi)/2  # log(sqrt(2*pi))
+
+    def __new__(cls, mu=None, sigma=None, **kwargs):
+        if mu is None and sigma is None:  # no parameters: build a StandardNormal
+            return super().__new__(StandardNormal)
+        return super().__new__(cls)
+
+    def __init__(self, *, mu=0., sigma=1., **kwargs):
+        super().__init__(mu=mu, sigma=sigma, **kwargs)
+
+    # The formulas below standardize x to (x - mu)/sigma and reuse StandardNormal.
+    def _logpdf_formula(self, x, *, mu, sigma, **kwargs):
+        return StandardNormal._logpdf_formula(self, (x - mu)/sigma) - np.log(sigma)
+
+    def _pdf_formula(self, x, *, mu, sigma, **kwargs):
+        return StandardNormal._pdf_formula(self, (x - mu)/sigma) / sigma
+
+    def _logcdf_formula(self, x, *, mu, sigma, **kwargs):
+        return StandardNormal._logcdf_formula(self, (x - mu)/sigma)
+
+    def _cdf_formula(self, x, *, mu, sigma, **kwargs):
+        return StandardNormal._cdf_formula(self, (x - mu)/sigma)
+
+    def _logccdf_formula(self, x, *, mu, sigma, **kwargs):
+        return StandardNormal._logccdf_formula(self, (x - mu)/sigma)
+
+    def _ccdf_formula(self, x, *, mu, sigma, **kwargs):
+        return StandardNormal._ccdf_formula(self, (x - mu)/sigma)
+
+    # The inverse functions map a standard normal quantile back via z*sigma + mu.
+    def _icdf_formula(self, x, *, mu, sigma, **kwargs):
+        return StandardNormal._icdf_formula(self, x) * sigma + mu
+
+    def _ilogcdf_formula(self, x, *, mu, sigma, **kwargs):
+        return StandardNormal._ilogcdf_formula(self, x) * sigma + mu
+
+    def _iccdf_formula(self, x, *, mu, sigma, **kwargs):
+        return StandardNormal._iccdf_formula(self, x) * sigma + mu
+
+    def _ilogccdf_formula(self, x, *, mu, sigma, **kwargs):
+        return StandardNormal._ilogccdf_formula(self, x) * sigma + mu
+
+    def _entropy_formula(self, *, mu, sigma, **kwargs):
+        return StandardNormal._entropy_formula(self) + np.log(abs(sigma))
+
+    def _logentropy_formula(self, *, mu, sigma, **kwargs):
+        lH0 = StandardNormal._logentropy_formula(self)
+        with np.errstate(divide='ignore'):
+            # sigma = 1 -> log(sigma) = 0 -> log(log(sigma)) = -inf
+            # Silence the unnecessary runtime warning
+            lls = np.log(np.log(abs(sigma))+0j)  # complex: log(sigma) < 0 if sigma < 1
+        return special.logsumexp(np.broadcast_arrays(lH0, lls), axis=0)
+
+    def _median_formula(self, *, mu, sigma, **kwargs):
+        return mu
+
+    def _mode_formula(self, *, mu, sigma, **kwargs):
+        return mu
+
+    def _moment_raw_formula(self, order, *, mu, sigma, **kwargs):
+        if order == 0:
+            return np.ones_like(mu)
+        elif order == 1:
+            return mu
+        else:
+            return None  # no formula given here for any other order
+    _moment_raw_formula.orders = [0, 1]  # type: ignore[attr-defined]
+
+    def _moment_central_formula(self, order, *, mu, sigma, **kwargs):
+        if order == 0:
+            return np.ones_like(mu)
+        elif order % 2:  # odd central moments are zero
+            return np.zeros_like(mu)
+        else:
+            # exact is faster (and obviously more accurate) for reasonable orders
+            return sigma**order * special.factorial2(int(order) - 1, exact=True)
+
+    def _sample_formula(self, full_shape, rng, *, mu, sigma, **kwargs):
+        return rng.normal(loc=mu, scale=sigma, size=full_shape)[()]  # 0-d -> scalar
+
+
+def _log_diff(log_p, log_q):  # log(exp(log_p) - exp(log_q)); the pi*1j term negates
+    return special.logsumexp([log_p, log_q+np.pi*1j], axis=0)
+
+
+class StandardNormal(Normal):
+    r"""Standard normal distribution.
+
+    The probability density function of the standard normal distribution is:
+
+    .. math::
+
+        f(x) = \frac{1}{\sqrt{2 \pi}} \exp \left( -\frac{1}{2} x^2 \right)
+
+    """
+    _x_support = _RealInterval(endpoints=(-inf, inf))
+    _x_param = _RealParameter('x', domain=_x_support, typical=(-5, 5))
+    _variable = _x_param
+    _parameterizations = []  # no free parameters
+    _normalization = 1/np.sqrt(2*np.pi)
+    _log_normalization = np.log(2*np.pi)/2
+    mu = np.float64(0.)  # fixed class-level values instead of parameters
+    sigma = np.float64(1.)
+
+    def __init__(self, **kwargs):
+        ContinuousDistribution.__init__(self, **kwargs)  # skips Normal.__init__
+
+    def _logpdf_formula(self, x, **kwargs):
+        return -(self._log_normalization + x**2/2)
+
+    def _pdf_formula(self, x, **kwargs):
+        return self._normalization * np.exp(-x**2/2)
+
+    def _logcdf_formula(self, x, **kwargs):
+        return special.log_ndtr(x)
+
+    def _cdf_formula(self, x, **kwargs):
+        return special.ndtr(x)
+
+    def _logccdf_formula(self, x, **kwargs):
+        return special.log_ndtr(-x)  # complement via symmetry about zero
+
+    def _ccdf_formula(self, x, **kwargs):
+        return special.ndtr(-x)  # complement via symmetry about zero
+
+    def _icdf_formula(self, x, **kwargs):
+        return special.ndtri(x)
+
+    def _ilogcdf_formula(self, x, **kwargs):
+        return special.ndtri_exp(x)
+
+    def _iccdf_formula(self, x, **kwargs):
+        return -special.ndtri(x)
+
+    def _ilogccdf_formula(self, x, **kwargs):
+        return -special.ndtri_exp(x)
+
+    def _entropy_formula(self, **kwargs):
+        return (1 + np.log(2*np.pi))/2
+
+    def _logentropy_formula(self, **kwargs):
+        return np.log1p(np.log(2*np.pi)) - np.log(2)  # log of _entropy_formula()
+
+    def _median_formula(self, **kwargs):
+        return 0
+
+    def _mode_formula(self, **kwargs):
+        return 0
+
+    def _moment_raw_formula(self, order, **kwargs):
+        raw_moments = {0: 1, 1: 0, 2: 1, 3: 0, 4: 3, 5: 0}  # tabulated to order 5
+        return raw_moments.get(order, None)
+
+    def _moment_central_formula(self, order, **kwargs):
+        return self._moment_raw_formula(order, **kwargs)  # mean is 0
+
+    def _moment_standardized_formula(self, order, **kwargs):
+        return self._moment_raw_formula(order, **kwargs)  # mean 0, std 1
+
+    def _sample_formula(self, full_shape, rng, **kwargs):
+        return rng.normal(size=full_shape)[()]  # [()] turns a 0-d array into a scalar
+
+
+class Logistic(ContinuousDistribution):
+    r"""Standard logistic distribution.
+
+    The probability density function of the standard logistic distribution is:
+
+    .. math::
+
+        f(x) = \frac{1}{\left( e^{x / 2} + e^{-x / 2} \right)^2}
+
+    """
+    _x_support = _RealInterval(endpoints=(-inf, inf))
+    _variable = _x_param = _RealParameter('x', domain=_x_support, typical=(-9, 9))
+    _parameterizations = ()  # no free parameters
+
+    _scale = np.pi / np.sqrt(3)  # divisor used to standardize the moments below
+
+    def _logpdf_formula(self, x, **kwargs):
+        y = -np.abs(x)  # the density is even in x; y <= 0 keeps exp(y) <= 1
+        return y - 2 * special.log1p(np.exp(y))
+
+    def _pdf_formula(self, x, **kwargs):
+        # f(x) = sech(x / 2)**2 / 4
+        return (.5 / np.cosh(x / 2))**2
+
+    def _logcdf_formula(self, x, **kwargs):
+        return special.log_expit(x)
+
+    def _cdf_formula(self, x, **kwargs):
+        return special.expit(x)
+
+    def _logccdf_formula(self, x, **kwargs):
+        return special.log_expit(-x)
+
+    def _ccdf_formula(self, x, **kwargs):
+        return special.expit(-x)
+
+    def _icdf_formula(self, x, **kwargs):
+        return special.logit(x)
+
+    def _iccdf_formula(self, x, **kwargs):
+        return -special.logit(x)
+
+    def _entropy_formula(self, **kwargs):
+        return 2.0
+
+    def _logentropy_formula(self, **kwargs):
+        return np.log(2)  # log of the entropy value 2.0
+
+    def _median_formula(self, **kwargs):
+        return 0
+
+    def _mode_formula(self, **kwargs):
+        return 0
+
+    def _moment_raw_formula(self, order, **kwargs):
+        n = int(order)
+        if n % 2:  # odd orders
+            return 0.0
+        return np.pi**n * abs((2**n - 2) * float(special.bernoulli(n)[-1]))
+
+    def _moment_central_formula(self, order, **kwargs):
+        return self._moment_raw_formula(order, **kwargs)  # raw moment of order 1 is 0
+
+    def _moment_standardized_formula(self, order, **kwargs):
+        return self._moment_raw_formula(order, **kwargs) / self._scale**order
+
+    def _sample_formula(self, full_shape, rng, **kwargs):
+        return rng.logistic(size=full_shape)[()]  # [()] turns a 0-d array into a scalar
+
+
+# currently for testing only
+class _LogUniform(ContinuousDistribution):
+    r"""Log-uniform distribution.
+
+    The probability density function of the log-uniform distribution is:
+
+    .. math::
+
+        f(x; a, b) = \frac{1}
+                          {x (\log(b) - \log(a))}
+
+    If :math:`\log(X)` is a random variable that follows a uniform distribution
+    between :math:`\log(a)` and :math:`\log(b)`, then :math:`X` is log-uniformly
+    distributed with shape parameters :math:`a` and :math:`b`.
+
+    """
+
+    _a_domain = _RealInterval(endpoints=(0, inf))
+    _b_domain = _RealInterval(endpoints=('a', inf))  # lower endpoint is parameter a
+    _log_a_domain = _RealInterval(endpoints=(-inf, inf))
+    _log_b_domain = _RealInterval(endpoints=('log_a', inf))
+    _x_support = _RealInterval(endpoints=('a', 'b'), inclusive=(True, True))
+
+    _a_param = _RealParameter('a', domain=_a_domain, typical=(1e-3, 0.9))
+    _b_param = _RealParameter('b', domain=_b_domain, typical=(1.1, 1e3))
+    _log_a_param = _RealParameter('log_a', symbol=r'\log(a)',
+                                  domain=_log_a_domain, typical=(-3, -0.1))
+    _log_b_param = _RealParameter('log_b', symbol=r'\log(b)',
+                                  domain=_log_b_domain, typical=(0.1, 3))
+    _x_param = _RealParameter('x', domain=_x_support, typical=('a', 'b'))
+
+    _b_domain.define_parameters(_a_param)  # bind the string endpoint 'a'
+    _log_b_domain.define_parameters(_log_a_param)
+    _x_support.define_parameters(_a_param, _b_param)
+
+    _parameterizations = [_Parameterization(_log_a_param, _log_b_param),
+                          _Parameterization(_a_param, _b_param)]
+    _variable = _x_param
+
+    def __init__(self, *, a=None, b=None, log_a=None, log_b=None, **kwargs):
+        super().__init__(a=a, b=b, log_a=log_a, log_b=log_b, **kwargs)
+
+    def _process_parameters(self, a=None, b=None, log_a=None, log_b=None, **kwargs):
+        a = np.exp(log_a) if a is None else a  # derive whichever form was omitted
+        b = np.exp(log_b) if b is None else b
+        log_a = np.log(a) if log_a is None else log_a
+        log_b = np.log(b) if log_b is None else log_b
+        kwargs.update(dict(a=a, b=b, log_a=log_a, log_b=log_b))
+        return kwargs
+
+    # def _logpdf_formula(self, x, *, log_a, log_b, **kwargs):
+    #     return -np.log(x) - np.log(log_b - log_a)
+
+    def _pdf_formula(self, x, *, log_a, log_b, **kwargs):
+        return ((log_b - log_a)*x)**-1
+
+    # def _cdf_formula(self, x, *, log_a, log_b, **kwargs):
+    #     return (np.log(x) - log_a)/(log_b - log_a)
+
+    def _moment_raw_formula(self, order, log_a, log_b, **kwargs):
+        if order == 0:
+            return self._one
+        t1 = self._one / (log_b - log_a) / order
+        t2 = np.real(np.exp(_log_diff(order * log_b, order * log_a)))  # b**k - a**k
+        return t1 * t2
+
+
+class Uniform(ContinuousDistribution):
+    r"""Uniform distribution.
+
+    The probability density function of the uniform distribution is:
+
+    .. math::
+
+        f(x; a, b) = \frac{1}
+                          {b - a}
+
+    """
+
+    _a_domain = _RealInterval(endpoints=(-inf, inf))
+    _b_domain = _RealInterval(endpoints=('a', inf))  # lower endpoint is parameter a
+    _x_support = _RealInterval(endpoints=('a', 'b'), inclusive=(True, True))
+
+    _a_param = _RealParameter('a', domain=_a_domain, typical=(1e-3, 0.9))
+    _b_param = _RealParameter('b', domain=_b_domain, typical=(1.1, 1e3))
+    _x_param = _RealParameter('x', domain=_x_support, typical=('a', 'b'))
+
+    _b_domain.define_parameters(_a_param)  # bind the string endpoint 'a'
+    _x_support.define_parameters(_a_param, _b_param)
+
+    _parameterizations = [_Parameterization(_a_param, _b_param)]
+    _variable = _x_param
+
+    def __init__(self, *, a=None, b=None, **kwargs):
+        super().__init__(a=a, b=b, **kwargs)
+
+    def _process_parameters(self, a=None, b=None, ab=None, **kwargs):
+        ab = b - a  # width of the support; any `ab` passed in is overwritten
+        kwargs.update(dict(a=a, b=b, ab=ab))
+        return kwargs
+
+    def _logpdf_formula(self, x, *, ab, **kwargs):
+        return np.where(np.isnan(x), np.nan, -np.log(ab))  # NaN in x stays NaN
+
+    def _pdf_formula(self, x, *, ab, **kwargs):
+        return np.where(np.isnan(x), np.nan, 1/ab)  # NaN in x stays NaN
+
+    def _logcdf_formula(self, x, *, a, ab, **kwargs):
+        with np.errstate(divide='ignore'):  # silence log(0) when x == a
+            return np.log(x - a) - np.log(ab)
+
+    def _cdf_formula(self, x, *, a, ab, **kwargs):
+        return (x - a) / ab
+
+    def _logccdf_formula(self, x, *, b, ab, **kwargs):
+        with np.errstate(divide='ignore'):  # silence log(0) when x == b
+            return np.log(b - x) - np.log(ab)
+
+    def _ccdf_formula(self, x, *, b, ab, **kwargs):
+        return (b - x) / ab
+
+    def _icdf_formula(self, p, *, a, ab, **kwargs):
+        return a + ab*p
+
+    def _iccdf_formula(self, p, *, b, ab, **kwargs):
+        return b - ab*p
+
+    def _entropy_formula(self, *, ab, **kwargs):
+        return np.log(ab)
+
+    def _mode_formula(self, *, a, b, ab, **kwargs):
+        return a + 0.5*ab  # midpoint of [a, b]
+
+    def _median_formula(self, *, a, b, ab, **kwargs):
+        return a + 0.5*ab  # midpoint of [a, b]
+
+    def _moment_raw_formula(self, order, a, b, ab, **kwargs):
+        np1 = order + 1
+        return (b**np1 - a**np1) / (np1 * ab)
+
+    def _moment_central_formula(self, order, ab, **kwargs):
+        return ab**2/12 if order == 2 else None  # only order 2 has a formula here
+
+    _moment_central_formula.orders = [2]  # type: ignore[attr-defined]
+
+    def _sample_formula(self, full_shape, rng, a, b, ab, **kwargs):
+        try:
+            return rng.uniform(a, b, size=full_shape)[()]
+        except OverflowError:  # happens when there are NaNs
+            return rng.uniform(0, 1, size=full_shape)*ab + a  # scale unit draws instead
+
+
+class _Gamma(ContinuousDistribution):
+    # Gamma distribution for testing only
+    _a_domain = _RealInterval(endpoints=(0, inf))
+    _x_support = _RealInterval(endpoints=(0, inf), inclusive=(False, False))
+
+    _a_param = _RealParameter('a', domain=_a_domain, typical=(0.1, 10))
+    _x_param = _RealParameter('x', domain=_x_support, typical=(0.1, 10))
+
+    _parameterizations = [_Parameterization(_a_param)]
+    _variable = _x_param
+
+    def _pdf_formula(self, x, *, a, **kwargs):  # the only formula this class defines
+        return x ** (a - 1) * np.exp(-x) / special.gamma(a)
+
+
+class Binomial(DiscreteDistribution):
+    r"""Binomial distribution with prescribed success probability and number of trials
+
+    The probability mass function of the binomial distribution is:
+
+    .. math::
+
+        f(x) = {n \choose x} p^x (1 - p)^{n-x}
+
+    """
+    _n_domain = _IntegerInterval(endpoints=(0, inf), inclusive=(False, False))
+    _p_domain = _RealInterval(endpoints=(0, 1), inclusive=(False, False))
+    _x_support = _IntegerInterval(endpoints=(0, 'n'), inclusive=(True, True))
+
+    _n_param = _RealParameter('n', domain=_n_domain, typical=(10, 20))
+    _p_param = _RealParameter('p', domain=_p_domain, typical=(0.25, 0.75))
+    _x_param = _RealParameter('x', domain=_x_support, typical=(0, 10))
+
+    _parameterizations = [_Parameterization(_n_param, _p_param)]
+    _variable = _x_param
+
+    def __init__(self, *, n, p, **kwargs):
+        super().__init__(n=n, p=p, **kwargs)
+
+    def _pmf_formula(self, x, *, n, p, **kwargs):
+        return scu._binom_pmf(x, n, p)
+
+    def _logpmf_formula(self, x, *, n, p, **kwargs):
+        # This implementation is from the ``scipy.stats.binom`` and could be improved
+        # by using a more numerically sound implementation of the absolute value of
+        # the binomial coefficient.
+        combiln = (
+            special.gammaln(n+1) - (special.gammaln(x+1) + special.gammaln(n-x+1))
+        )
+        return combiln + special.xlogy(x, p) + special.xlog1py(n-x, -p)
+
+    def _cdf_formula(self, x, *, n, p, **kwargs):
+        return scu._binom_cdf(x, n, p)
+
+    def _logcdf_formula(self, x, *, n, p, **kwargs):
+        # todo: add this strategy to infrastructure more generally, but allow dist
+        #   author to specify threshold other than median in case median is expensive
+        median = self._icdf_formula(0.5, n=n, p=p)
+        return xpx.apply_where(x < median, (x, n, p),
+            lambda *args: np.log(scu._binom_cdf(*args)),  # below the median: log(cdf)
+            lambda *args: np.log1p(-scu._binom_sf(*args))  # otherwise: log(1 - sf)
+        )
+
+    def _ccdf_formula(self, x, *, n, p, **kwargs):
+        return scu._binom_sf(x, n, p)
+
+    def _logccdf_formula(self, x, *, n, p, **kwargs):
+        median = self._icdf_formula(0.5, n=n, p=p)
+        return xpx.apply_where(x < median, (x, n, p),
+            lambda *args: np.log1p(-scu._binom_cdf(*args)),  # below the median
+            lambda *args: np.log(scu._binom_sf(*args))  # otherwise: log(sf)
+        )
+
+    def _icdf_formula(self, x, *, n, p, **kwargs):
+        return scu._binom_ppf(x, n, p)
+
+    def _iccdf_formula(self, x, *, n, p, **kwargs):
+        return scu._binom_isf(x, n, p)
+
+    def _mode_formula(self, *, n, p, **kwargs):
+        # https://en.wikipedia.org/wiki/Binomial_distribution#Mode
+        mode = np.floor((n+1)*p)
+        mode = np.where(p == 1, mode - 1, mode)  # floor((n+1)*p) is n+1 at p == 1
+        return mode[()]
+
+    def _moment_raw_formula(self, order, *, n, p, **kwargs):
+        # https://en.wikipedia.org/wiki/Binomial_distribution#Higher_moments
+        if order == 1:
+            return n*p
+        if order == 2:
+            return n*p*(1 - p + n*p)
+        return None  # no formula given here for other orders
+    _moment_raw_formula.orders = [1, 2]  # type: ignore[attr-defined]
+
+    def _moment_central_formula(self, order, *, n, p, **kwargs):
+        # https://en.wikipedia.org/wiki/Binomial_distribution#Higher_moments
+        if order == 1:
+            return np.zeros_like(n)
+        if order == 2:
+            return n*p*(1 - p)
+        if order == 3:
+            return n*p*(1 - p)*(1 - 2*p)
+        if order == 4:
+            return n*p*(1 - p)*(1 + (3*n - 6)*p*(1 - p))
+        return None  # no formula given here for other orders
+    _moment_central_formula.orders = [1, 2, 3, 4]  # type: ignore[attr-defined]
+
+
+# Each distribution class defines only the summary and the start of the extended
+# summary of its docstring. The loop below replaces the docstring of every class
+# named in `__all__` with the result of `_combine_docs` for that class.
+_module = sys.modules[__name__].__dict__
+for dist_name in __all__:
+    _module[dist_name].__doc__ = _combine_docs(_module[dist_name])
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_odds_ratio.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_odds_ratio.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc593f5adc9a700c618c721b6c37c801809d868b
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_odds_ratio.py
@@ -0,0 +1,466 @@
+import numpy as np
+
+from scipy.special import ndtri
+from scipy.optimize import brentq
+from ._discrete_distns import nchypergeom_fisher
+from ._common import ConfidenceInterval
+
+
+def _sample_odds_ratio(table):
+    """
+    Sample odds ratio ``a*d/(b*c)`` of the 2x2 table ``[[a, b], [c, d]]``.
+
+    If ``b`` or ``c`` is zero, the result is nan when ``a`` or ``d`` is also
+    zero and inf otherwise.
+    """
+    # `table` is indexed as a 2x2 numpy array.
+    a, b = table[0, 0], table[0, 1]
+    c, d = table[1, 0], table[1, 1]
+    if c > 0 and b > 0:
+        return a * d / (c * b)
+    if a == 0 or d == 0:
+        return np.nan
+    return np.inf
+
+
+def _solve(func):
+    """
+    Return the root of `func`, i.e. the value nc with func(nc) = 0.
+
+    `func` must be an increasing function of its single argument.
+    """
+    # The unknown is called `nc` rather than `x` because every function
+    # passed in here is solved for a noncentrality parameter.
+    start = 1.0
+    f_start = func(start)
+    if f_start == 0:
+        return start
+
+    # Move away from `start` by repeated doubling or halving until func
+    # stops having the sign it had at `start`; the last two points
+    # visited then bracket the root.
+    step = 2.0
+    if f_start > 0:
+        # Positive at the start: the root lies below, so halve.
+        lo = start / step
+        while func(lo) > 0:
+            lo /= step
+        hi = step*lo
+    else:
+        # Otherwise the root lies above, so double.
+        hi = start * step
+        while func(hi) < 0:
+            hi *= step
+        lo = hi/step
+
+    # Refine the root inside the bracket [lo, hi].
+    return brentq(func, lo, hi, xtol=1e-13)
+
+
+def _nc_hypergeom_mean_inverse(x, M, n, N):
+    """
+    Find the noncentrality parameter at which Fisher's noncentral
+    hypergeometric distribution with parameters M, n and N has mean x.
+
+    For a 2x2 contingency table, x is table[0, 0], M the grand total,
+    n the sum of row 0 and N the sum of column 0.
+    """
+    return _solve(lambda nc: nchypergeom_fisher.mean(M, n, N, nc) - x)
+
+
+def _hypergeom_params_from_table(table):
+    # Returns (x, M, n, N), named as in stats.hypergeom and
+    # stats.nchypergeom_fisher: x is table[0, 0], M the grand total,
+    # n the sum of row 0 and N the sum of column 0.
+    top_left = table[0, 0]
+    grand_total = table.sum()
+    row0_total, col0_total = table[0].sum(), table[:, 0].sum()
+    return top_left, grand_total, row0_total, col0_total
+
+
+def _ci_upper(table, alpha):
+    """
+    Upper confidence limit for the conditional odds ratio of `table`.
+    """
+    # An infinite sample odds ratio leaves the interval unbounded above.
+    if _sample_odds_ratio(table) == np.inf:
+        return np.inf
+
+    params = _hypergeom_params_from_table(table)
+
+    # _solve needs an increasing function, but the cdf decreases as nc
+    # grows, so the negated cdf is what gets solved.
+    return _solve(lambda nc: -nchypergeom_fisher.cdf(*params, nc) + alpha)
+
+
+def _ci_lower(table, alpha):
+    """
+    Lower confidence limit for the conditional odds ratio of `table`.
+    """
+    # A zero sample odds ratio pins the lower limit at zero.
+    if _sample_odds_ratio(table) == 0:
+        return 0
+
+    x, M, n, N = _hypergeom_params_from_table(table)
+    # The survival function is evaluated one step below x.
+    return _solve(lambda nc: nchypergeom_fisher.sf(x - 1, M, n, N, nc) - alpha)
+
+
+def _conditional_oddsratio(table):
+    """
+    Conditional maximum likelihood estimate of the odds ratio of a 2x2 table.
+
+    Returns 0 or inf when table[0, 0] sits at the lower or upper end of the
+    support of the matching noncentral hypergeometric distribution.
+    """
+    x, M, n, N = _hypergeom_params_from_table(table)
+    # The support depends only on M, n and N, not on the noncentrality
+    # parameter, so any value (here 1) can be used to look it up.
+    support_min, support_max = nchypergeom_fisher.support(M, n, N, 1)
+
+    # At an endpoint of the support the odds ratio is known to be 0 or
+    # inf, so no root finding is needed.
+    if x == support_min:
+        return 0
+    if x == support_max:
+        return np.inf
+
+    # Otherwise the estimate is the nc that makes the distribution's mean x.
+    return _nc_hypergeom_mean_inverse(x, M, n, N)
+
+
+def _conditional_oddsratio_ci(table, confidence_level=0.95,
+                              alternative='two-sided'):
+    """
+    Exact conditional confidence interval ``(lower, upper)`` for the odds ratio.
+
+    Any `alternative` other than 'two-sided' or 'less' is handled as 'greater'.
+    """
+    if alternative == 'two-sided':
+        # Split the total error probability evenly between the two tails.
+        tail = 0.5*(1 - confidence_level)
+        return _ci_lower(table, tail), _ci_upper(table, tail)
+
+    tail = 1 - confidence_level
+    if alternative == 'less':
+        # One-sided interval bounded below by 0.
+        return 0.0, _ci_upper(table, tail)
+    # One-sided interval unbounded above.
+    return _ci_lower(table, tail), np.inf
+
+
+def _sample_odds_ratio_ci(table, confidence_level=0.95,
+                          alternative='two-sided'):
+    """
+    Normal-approximation confidence interval for the sample odds ratio.
+    """
+    log_or = np.log(_sample_odds_ratio(table))
+    # Standard error of the log odds ratio: sqrt(1/a + 1/b + 1/c + 1/d).
+    se = np.sqrt((1/table).sum())
+
+    # Any value other than 'less' or 'greater' is treated as 'two-sided',
+    # which puts half of the error probability in each tail.
+    one_sided = alternative == 'less' or alternative == 'greater'
+    z = ndtri(confidence_level if one_sided else 0.5*confidence_level + 0.5)
+    margin = z*se
+
+    # The open side of a one-sided interval becomes exp(-inf) = 0 or
+    # exp(inf) = inf when mapped back from the log scale.
+    loglow = -np.inf if alternative == 'less' else log_or - margin
+    loghigh = np.inf if alternative == 'greater' else log_or + margin
+    return np.exp(loglow), np.exp(loghigh)
+
+
+class OddsRatioResult:
+    """
+    Result of `scipy.stats.contingency.odds_ratio`.  See the
+    docstring for `odds_ratio` for more details.
+
+    Attributes
+    ----------
+    statistic : float
+        The computed odds ratio.
+
+        * If `kind` is ``'sample'``, this is the sample (or unconditional)
+          estimate, given by
+          ``table[0, 0]*table[1, 1]/(table[0, 1]*table[1, 0])``.
+        * If `kind` is ``'conditional'``, this is the conditional
+          maximum likelihood estimate for the odds ratio. It is
+          the noncentrality parameter of Fisher's noncentral
+          hypergeometric distribution with the same hypergeometric
+          parameters as `table` and whose mean is ``table[0, 0]``.
+
+    Methods
+    -------
+    confidence_interval :
+        Confidence interval for the odds ratio.
+    """
+
+    def __init__(self, _table, _kind, statistic):
+        # for now, no need to make _table and _kind public, since this sort of
+        # information is returned in very few `scipy.stats` results
+        self._table = _table
+        self._kind = _kind
+        self.statistic = statistic
+
+    def __repr__(self):
+        return f"OddsRatioResult(statistic={self.statistic})"
+
+    def confidence_interval(self, confidence_level=0.95,
+                            alternative='two-sided'):
+        """
+        Confidence interval for the odds ratio.
+
+        Parameters
+        ----------
+        confidence_level : float
+            Desired confidence level for the confidence interval.
+            The value must be given as a fraction between 0 and 1.
+            Default is 0.95 (meaning 95%).
+
+        alternative : {'two-sided', 'less', 'greater'}, optional
+            The alternative hypothesis of the hypothesis test to which the
+            confidence interval corresponds. That is, suppose the null
+            hypothesis is that the true odds ratio equals ``OR`` and the
+            confidence interval is ``(low, high)``. Then the following options
+            for `alternative` are available (default is 'two-sided'):
+
+            * 'two-sided': the true odds ratio is not equal to ``OR``. There
+              is evidence against the null hypothesis at the chosen
+              `confidence_level` if ``high < OR`` or ``low > OR``.
+            * 'less': the true odds ratio is less than ``OR``. The ``low`` end
+              of the confidence interval is 0, and there is evidence against
+              the null hypothesis at the chosen `confidence_level` if
+              ``high < OR``.
+            * 'greater': the true odds ratio is greater than ``OR``.  The
+              ``high`` end of the confidence interval is ``np.inf``, and there
+              is evidence against the null hypothesis at the chosen
+              `confidence_level` if ``low > OR``.
+
+        Returns
+        -------
+        ci : ``ConfidenceInterval`` instance
+            The confidence interval, represented as an object with
+            attributes ``low`` and ``high``.
+
+        Notes
+        -----
+        When `kind` is ``'conditional'``, the limits of the confidence
+        interval are the conditional "exact confidence limits" as described
+        by Fisher [1]_. The conditional odds ratio and confidence interval are
+        also discussed in Section 4.1.2 of the text by Sahai and Khurshid [2]_.
+
+        When `kind` is ``'sample'``, the confidence interval is computed
+        under the assumption that the logarithm of the odds ratio is normally
+        distributed with standard error given by::
+
+            se = sqrt(1/a + 1/b + 1/c + 1/d)
+
+        where ``a``, ``b``, ``c`` and ``d`` are the elements of the
+        contingency table.  (See, for example, [2]_, section 3.1.3.2,
+        or [3]_, section 2.3.3).
+
+        References
+        ----------
+        .. [1] R. A. Fisher (1935), The logic of inductive inference,
+               Journal of the Royal Statistical Society, Vol. 98, No. 1,
+               pp. 39-82.
+        .. [2] H. Sahai and A. Khurshid (1996), Statistics in Epidemiology:
+               Methods, Techniques, and Applications, CRC Press LLC, Boca
+               Raton, Florida.
+        .. [3] Alan Agresti, An Introduction to Categorical Data Analysis
+               (second edition), Wiley, Hoboken, NJ, USA (2007).
+        """
+        if alternative not in ['two-sided', 'less', 'greater']:
+            raise ValueError("`alternative` must be 'two-sided', 'less' or "
+                             "'greater'.")
+
+        if confidence_level < 0 or confidence_level > 1:
+            raise ValueError('confidence_level must be between 0 and 1')
+
+        if self._kind == 'conditional':
+            ci = self._conditional_odds_ratio_ci(confidence_level, alternative)
+        else:  # any other kind is handled as 'sample'
+            ci = self._sample_odds_ratio_ci(confidence_level, alternative)
+        return ci
+
+    def _conditional_odds_ratio_ci(self, confidence_level=0.95,
+                                   alternative='two-sided'):
+        """
+        Confidence interval for the conditional odds ratio.
+        """
+
+        table = self._table
+        if 0 in table.sum(axis=0) or 0 in table.sum(axis=1):
+            # If both values in a row or column are zero, the p-value is 1,
+            # the odds ratio is NaN and the confidence interval is (0, inf).
+            ci = (0, np.inf)
+        else:
+            ci = _conditional_oddsratio_ci(table,
+                                           confidence_level=confidence_level,
+                                           alternative=alternative)
+        return ConfidenceInterval(low=ci[0], high=ci[1])
+
+    def _sample_odds_ratio_ci(self, confidence_level=0.95,
+                              alternative='two-sided'):
+        """
+        Confidence interval for the sample odds ratio.
+        """
+        if confidence_level < 0 or confidence_level > 1:  # as in confidence_interval
+            raise ValueError('confidence_level must be between 0 and 1')
+
+        table = self._table
+        if 0 in table.sum(axis=0) or 0 in table.sum(axis=1):
+            # If both values in a row or column are zero, the p-value is 1,
+            # the odds ratio is NaN and the confidence interval is (0, inf).
+            ci = (0, np.inf)
+        else:  # resolves to the module-level function of the same name
+            ci = _sample_odds_ratio_ci(table,
+                                       confidence_level=confidence_level,
+                                       alternative=alternative)
+        return ConfidenceInterval(low=ci[0], high=ci[1])
+
+
+def odds_ratio(table, *, kind='conditional'):
+    r"""
+    Compute the odds ratio for a 2x2 contingency table.
+
+    Parameters
+    ----------
+    table : array_like of ints
+        A 2x2 contingency table.  Elements must be non-negative integers.
+    kind : str, optional
+        Which kind of odds ratio to compute, either the sample
+        odds ratio (``kind='sample'``) or the conditional odds ratio
+        (``kind='conditional'``).  Default is ``'conditional'``.
+
+    Returns
+    -------
+    result : `~scipy.stats._result_classes.OddsRatioResult` instance
+        The returned object has two computed attributes:
+
+        statistic : float
+            * If `kind` is ``'sample'``, this is the sample (or unconditional)
+              estimate, given by
+              ``table[0, 0]*table[1, 1]/(table[0, 1]*table[1, 0])``.
+            * If `kind` is ``'conditional'``, this is the conditional
+              maximum likelihood estimate for the odds ratio. It is
+              the noncentrality parameter of Fisher's noncentral
+              hypergeometric distribution with the same hypergeometric
+              parameters as `table` and whose mean is ``table[0, 0]``.
+
+        The object has the method `confidence_interval` that computes
+        the confidence interval of the odds ratio.
+
+    See Also
+    --------
+    scipy.stats.fisher_exact
+    relative_risk
+    :ref:`hypothesis_odds_ratio` : Extended example
+
+    Notes
+    -----
+    The conditional odds ratio was discussed by Fisher (see "Example 1"
+    of [1]_).  Texts that cover the odds ratio include [2]_ and [3]_.
+
+    .. versionadded:: 1.10.0
+
+    References
+    ----------
+    .. [1] R. A. Fisher (1935), The logic of inductive inference,
+           Journal of the Royal Statistical Society, Vol. 98, No. 1,
+           pp. 39-82.
+    .. [2] Breslow NE, Day NE (1980). Statistical methods in cancer research.
+           Volume I - The analysis of case-control studies. IARC Sci Publ.
+           (32):5-338. PMID: 7216345. (See section 4.2.)
+    .. [3] H. Sahai and A. Khurshid (1996), Statistics in Epidemiology:
+           Methods, Techniques, and Applications, CRC Press LLC, Boca
+           Raton, Florida.
+
+    Examples
+    --------
+    In epidemiology, individuals are classified as "exposed" or
+    "unexposed" to some factor or treatment. If the occurrence of some
+    illness is under study, those who have the illness are often
+    classified as "cases", and those without it are "noncases".  The
+    counts of the occurrences of these classes give a contingency
+    table::
+
+                    exposed    unexposed
+        cases          a           b
+        noncases       c           d
+
+    The sample odds ratio may be written ``(a/c) / (b/d)``.  ``a/c`` can
+    be interpreted as the odds of a case occurring in the exposed group,
+    and ``b/d`` as the odds of a case occurring in the unexposed group.
+    The sample odds ratio is the ratio of these odds.  If the odds ratio
+    is greater than 1, it suggests that there is a positive association
+    between being exposed and being a case.
+
+    Interchanging the rows or columns of the contingency table inverts
+    the odds ratio, so it is important to understand the meaning of labels
+    given to the rows and columns of the table when interpreting the
+    odds ratio.
+
+    Consider a hypothetical example where it is hypothesized that exposure to a
+    certain chemical is associated with increased occurrence of a certain
+    disease. Suppose we have the following table for a collection of 552 people::
+
+                exposed unexposed
+        cases        7       15
+        noncases    58      472
+
+    The question we ask is "Is exposure to the chemical associated with
+    increased risk of the disease?"
+
+    Compute the odds ratio:
+
+    >>> from scipy.stats.contingency import odds_ratio
+    >>> res = odds_ratio([[7, 15], [58, 472]])
+    >>> res.statistic
+    3.7836687705553493
+
+    For this sample, the odds of getting the disease for those who have been
+    exposed to the chemical are almost 3.8 times that of those who have not been
+    exposed.
+
+    We can compute the 95% confidence interval for the odds ratio:
+
+    >>> res.confidence_interval(confidence_level=0.95)
+    ConfidenceInterval(low=1.2514829132266785, high=10.363493716701269)
+
+    The 95% confidence interval for the conditional odds ratio is approximately
+    (1.25, 10.4).
+
+    For a more detailed example, see :ref:`hypothesis_odds_ratio`.
+    """
+    # Validate `kind` before looking at `table`.
+    if kind not in ('conditional', 'sample'):
+        raise ValueError("`kind` must be 'conditional' or 'sample'.")
+
+    counts = np.asarray(table)
+
+    if counts.shape != (2, 2):
+        raise ValueError(f"Invalid shape {counts.shape}. The input `table` "
+                         "must be of shape (2, 2).")
+
+    # Only integer dtypes are accepted; the counts are then cast to int64.
+    if not np.issubdtype(counts.dtype, np.integer):
+        raise ValueError("`table` must be an array of integers, but got "
+                         f"type {counts.dtype}")
+    counts = counts.astype(np.int64)
+
+    if np.any(counts < 0):
+        raise ValueError("All values in `table` must be nonnegative.")
+
+    if 0 in counts.sum(axis=0) or 0 in counts.sum(axis=1):
+        # A row or column sums to zero: no estimator is called and NaN is
+        # reported as the statistic.
+        estimate = np.nan
+    elif kind == 'sample':
+        estimate = _sample_odds_ratio(counts)
+    else:  # kind is 'conditional'
+        estimate = _conditional_oddsratio(counts)
+
+    # The validated int64 table and `kind` travel with the result object.
+    return OddsRatioResult(_table=counts, _kind=kind, statistic=estimate)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_page_trend_test.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_page_trend_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b80ee2ae22a323c3e85ed12a9ef1644f865a57a0
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_page_trend_test.py
@@ -0,0 +1,488 @@
+from dataclasses import dataclass
+from itertools import permutations
+import math
+import threading
+
+import numpy as np
+
+from ._continuous_distns import norm
+from scipy._lib._array_api import xp_capabilities
+import scipy.stats
+
+
+@dataclass
+class PageTrendTestResult:
+    statistic: float  # Page's L statistic
+    pvalue: float  # p-value associated with `statistic`
+    method: str  # method used to compute `pvalue`: 'asymptotic' or 'exact'
+
+
+@xp_capabilities(np_only=True)
+def page_trend_test(data, ranked=False, predicted_ranks=None, method='auto'):
+    r"""
+    Perform Page's Test, a measure of trend in observations between treatments.
+
+    Page's Test (also known as Page's :math:`L` test) is useful when:
+
+    * there are :math:`n \geq 3` treatments,
+    * :math:`m \geq 2` subjects are observed for each treatment, and
+    * the observations are hypothesized to have a particular order.
+
+    Specifically, the test considers the null hypothesis that
+
+    .. math::
+
+        m_1 = m_2 = m_3 \cdots = m_n,
+
+    where :math:`m_j` is the mean of the observed quantity under treatment
+    :math:`j`, against the alternative hypothesis that
+
+    .. math::
+
+        m_1 \leq m_2 \leq m_3 \leq \cdots \leq m_n,
+
+    where at least one inequality is strict.
+
+    As noted by [4]_, Page's :math:`L` test has greater statistical power than
+    the Friedman test against the alternative that there is a difference in
+    trend, as Friedman's test only considers a difference in the means of the
+    observations without considering their order. Whereas Spearman :math:`\rho`
+    considers the correlation between the ranked observations of two variables
+    (e.g. the airspeed velocity of a swallow vs. the weight of the coconut it
+    carries), Page's :math:`L` is concerned with a trend in an observation
+    (e.g. the airspeed velocity of a swallow) across several distinct
+    treatments (e.g. carrying each of five coconuts of different weight) even
+    as the observation is repeated with multiple subjects (e.g. one European
+    swallow and one African swallow).
+
+    Parameters
+    ----------
+    data : array-like
+        A :math:`m \times n` array; the element in row :math:`i` and
+        column :math:`j` is the observation corresponding with subject
+        :math:`i` and treatment :math:`j`. By default, the columns are
+        assumed to be arranged in order of increasing predicted mean.
+
+    ranked : boolean, optional
+        By default, `data` is assumed to be observations rather than ranks;
+        it will be ranked with `scipy.stats.rankdata` along ``axis=1``. If
+        `data` is provided in the form of ranks, pass argument ``True``.
+
+    predicted_ranks : array-like, optional
+        The predicted ranks of the column means. If not specified,
+        the columns are assumed to be arranged in order of increasing
+        predicted mean, so the default `predicted_ranks` are
+        :math:`[1, 2, \dots, n-1, n]`.
+
+    method : {'auto', 'asymptotic', 'exact'}, optional
+        Selects the method used to calculate the *p*-value. The following
+        options are available.
+
+        * 'auto': selects between 'exact' and 'asymptotic' to
+          achieve reasonably accurate results in reasonable time (default)
+        * 'asymptotic': compares the standardized test statistic against
+          the normal distribution
+        * 'exact': computes the exact *p*-value by comparing the observed
+          :math:`L` statistic against those realized by all possible
+          permutations of ranks (under the null hypothesis that each
+          permutation is equally likely)
+
+    Returns
+    -------
+    res : PageTrendTestResult
+        An object containing attributes:
+
+        statistic : float
+            Page's :math:`L` test statistic.
+        pvalue : float
+            The associated *p*-value
+        method : {'asymptotic', 'exact'}
+            The method used to compute the *p*-value
+
+    See Also
+    --------
+    rankdata, friedmanchisquare, spearmanr
+
+    Notes
+    -----
+    As noted in [1]_, "the :math:`n` 'treatments' could just as well represent
+    :math:`n` objects or events or performances or persons or trials ranked."
+    Similarly, the :math:`m` 'subjects' could equally stand for :math:`m`
+    "groupings by ability or some other control variable, or judges doing
+    the ranking, or random replications of some other sort."
+
+    The procedure for calculating the :math:`L` statistic, adapted from
+    [1]_, is:
+
+    1. "Predetermine with careful logic the appropriate hypotheses
+       concerning the predicted ordering of the experimental results.
+       If no reasonable basis for ordering any treatments is known, the
+       :math:`L` test is not appropriate."
+    2. "As in other experiments, determine at what level of confidence
+       you will reject the null hypothesis that there is no agreement of
+       experimental results with the monotonic hypothesis."
+    3. "Cast the experimental material into a two-way table of :math:`n`
+       columns (treatments, objects ranked, conditions) and :math:`m`
+       rows (subjects, replication groups, levels of control variables)."
+    4. "When experimental observations are recorded, rank them across each
+       row", e.g. ``ranks = scipy.stats.rankdata(data, axis=1)``.
+    5. "Add the ranks in each column", e.g.
+       ``colsums = np.sum(ranks, axis=0)``.
+    6. "Multiply each sum of ranks by the predicted rank for that same
+       column", e.g. ``products = predicted_ranks * colsums``.
+    7. "Sum all such products", e.g. ``L = products.sum()``.
+
+    [1]_ continues by suggesting use of the standardized statistic
+
+    .. math::
+
+        \chi_L^2 = \frac{\left[12L-3mn(n+1)^2\right]^2}{mn^2(n^2-1)(n+1)}
+
+    "which is distributed approximately as chi-square with 1 degree of
+    freedom. The ordinary use of :math:`\chi^2` tables would be
+    equivalent to a two-sided test of agreement. If a one-sided test
+    is desired, *as will almost always be the case*, the probability
+    discovered in the chi-square table should be *halved*."
+
+    However, this standardized statistic does not distinguish between the
+    observed values being well correlated with the predicted ranks and being
+    *anti*-correlated with the predicted ranks. Instead, we follow [2]_
+    and calculate the standardized statistic
+
+    .. math::
+
+        \Lambda = \frac{L - E_0}{\sqrt{V_0}},
+
+    where :math:`E_0 = \frac{1}{4} mn(n+1)^2` and
+    :math:`V_0 = \frac{1}{144} mn^2(n+1)(n^2-1)`, "which is asymptotically
+    normal under the null hypothesis".
+
+    The *p*-value for ``method='exact'`` is generated by comparing the observed
+    value of :math:`L` against the :math:`L` values generated for all
+    :math:`(n!)^m` possible permutations of ranks. The calculation is performed
+    using the recursive method of [5]_.
+
+    The *p*-values are not adjusted for the possibility of ties. When
+    ties are present, the reported ``'exact'`` *p*-values may be somewhat
+    larger (i.e. more conservative) than the true *p*-value [2]_. The
+    ``'asymptotic'`` *p*-values, however, tend to be smaller (i.e. less
+    conservative) than the ``'exact'`` *p*-values.
+
+    References
+    ----------
+    .. [1] Ellis Batten Page, "Ordered hypotheses for multiple treatments:
+       a significant test for linear ranks", *Journal of the American
+       Statistical Association* 58(301), p. 216--230, 1963.
+
+    .. [2] Markus Neuhauser, *Nonparametric Statistical Test: A computational
+       approach*, CRC Press, p. 150--152, 2012.
+
+    .. [3] Statext LLC, "Page's L Trend Test - Easy Statistics", *Statext -
+       Statistics Study*, https://www.statext.com/practice/PageTrendTest03.php,
+       Accessed July 12, 2020.
+
+    .. [4] "Page's Trend Test", *Wikipedia*, WikimediaFoundation,
+       https://en.wikipedia.org/wiki/Page%27s_trend_test,
+       Accessed July 12, 2020.
+
+    .. [5] Robert E. Odeh, "The exact distribution of Page's L-statistic in
+       the two-way layout", *Communications in Statistics - Simulation and
+       Computation*,  6(1), p. 49--61, 1977.
+
+    Examples
+    --------
+    We use the example from [3]_: 10 students are asked to rate three
+    teaching methods - tutorial, lecture, and seminar - on a scale of 1-5,
+    with 1 being the lowest and 5 being the highest. We have decided that
+    a confidence level of 99% is required to reject the null hypothesis in
+    favor of our alternative: that the seminar will have the highest ratings
+    and the tutorial will have the lowest. Initially, the data have been
+    tabulated with each row representing an individual student's ratings of
+    the three methods in the following order: tutorial, lecture, seminar.
+
+    >>> table = [[3, 4, 3],
+    ...          [2, 2, 4],
+    ...          [3, 3, 5],
+    ...          [1, 3, 2],
+    ...          [2, 3, 2],
+    ...          [2, 4, 5],
+    ...          [1, 2, 4],
+    ...          [3, 4, 4],
+    ...          [2, 4, 5],
+    ...          [1, 3, 4]]
+
+    Because the tutorial is hypothesized to have the lowest ratings, the
+    column corresponding with tutorial rankings should be first; the seminar
+    is hypothesized to have the highest ratings, so its column should be last.
+    Since the columns are already arranged in this order of increasing
+    predicted mean, we can pass the table directly into `page_trend_test`.
+
+    >>> from scipy.stats import page_trend_test
+    >>> res = page_trend_test(table)
+    >>> res
+    PageTrendTestResult(statistic=133.5, pvalue=0.0018191161948127822,
+                        method='exact')
+
+    This *p*-value indicates that there is a 0.1819% chance that
+    the :math:`L` statistic would reach such an extreme value under the null
+    hypothesis. Because 0.1819% is less than 1%, we have evidence to reject
+    the null hypothesis in favor of our alternative at a 99% confidence level.
+
+    The value of the :math:`L` statistic is 133.5. To check this manually,
+    we rank the data such that high scores correspond with high ranks, settling
+    ties with an average rank:
+
+    >>> from scipy.stats import rankdata
+    >>> ranks = rankdata(table, axis=1)
+    >>> ranks
+    array([[1.5, 3. , 1.5],
+           [1.5, 1.5, 3. ],
+           [1.5, 1.5, 3. ],
+           [1. , 3. , 2. ],
+           [1.5, 3. , 1.5],
+           [1. , 2. , 3. ],
+           [1. , 2. , 3. ],
+           [1. , 2.5, 2.5],
+           [1. , 2. , 3. ],
+           [1. , 2. , 3. ]])
+
+    We add the ranks within each column, multiply the sums by the
+    predicted ranks, and sum the products.
+
+    >>> import numpy as np
+    >>> m, n = ranks.shape
+    >>> predicted_ranks = np.arange(1, n+1)
+    >>> L = (predicted_ranks * np.sum(ranks, axis=0)).sum()
+    >>> res.statistic == L
+    True
+
+    As presented in [3]_, the asymptotic approximation of the *p*-value is the
+    survival function of the normal distribution evaluated at the standardized
+    test statistic:
+
+    >>> from scipy.stats import norm
+    >>> E0 = (m*n*(n+1)**2)/4
+    >>> V0 = (m*n**2*(n+1)*(n**2-1))/144
+    >>> Lambda = (L-E0)/np.sqrt(V0)
+    >>> p = norm.sf(Lambda)
+    >>> p
+    0.0012693433690751756
+
+    This does not precisely match the *p*-value reported by `page_trend_test`
+    above. The asymptotic distribution is not very accurate, nor conservative,
+    for :math:`m \leq 12` and :math:`n \leq 8`, so `page_trend_test` chose to
+    use ``method='exact'`` based on the dimensions of the table and the
+    recommendations in Page's original paper [1]_. To override
+    `page_trend_test`'s choice, provide the `method` argument.
+
+    >>> res = page_trend_test(table, method="asymptotic")
+    >>> res
+    PageTrendTestResult(statistic=133.5, pvalue=0.0012693433690751756,
+                        method='asymptotic')
+
+    If the data are already ranked, we can pass in the ``ranks`` instead of
+    the ``table`` to save computation time.
+
+    >>> res = page_trend_test(ranks,             # ranks of data
+    ...                       ranked=True,       # data is already ranked
+    ...                       )
+    >>> res
+    PageTrendTestResult(statistic=133.5, pvalue=0.0018191161948127822,
+                        method='exact')
+
+    Suppose the raw data had been tabulated in an order different from the
+    order of predicted means, say lecture, seminar, tutorial.
+
+    >>> table = np.asarray(table)[:, [1, 2, 0]]
+
+    Since the arrangement of this table is not consistent with the assumed
+    ordering, we can either rearrange the table or provide the
+    `predicted_ranks`. Remembering that the lecture is predicted
+    to have the middle rank, the seminar the highest, and tutorial the lowest,
+    we pass:
+
+    >>> res = page_trend_test(table,             # data as originally tabulated
+    ...                       predicted_ranks=[2, 3, 1],  # our predicted order
+    ...                       )
+    >>> res
+    PageTrendTestResult(statistic=133.5, pvalue=0.0018191161948127822,
+                        method='exact')
+
+    """
+    if not hasattr(_pagel_state, 'state'):
+        _pagel_state.state = _PageL()
+
+    # Possible values of the method parameter and the corresponding function
+    # used to evaluate the p value
+    methods = {"asymptotic": _l_p_asymptotic,
+               "exact": _l_p_exact,
+               "auto": None}
+    if method not in methods:
+        raise ValueError(f"`method` must be in {set(methods)}")
+
+    ranks = np.asarray(data)
+    if ranks.ndim != 2:  # TODO: relax this to accept 3d arrays?
+        raise ValueError("`data` must be a 2d array.")
+
+    m, n = ranks.shape
+    if m < 2 or n < 3:
+        raise ValueError("Page's L is only appropriate for data with two "
+                         "or more rows and three or more columns.")
+
+    if np.any(np.isnan(data)):
+        raise ValueError("`data` contains NaNs, which cannot be ranked "
+                         "meaningfully")
+
+    # Validate `ranked` before it is used to decide whether to rank `data`
+    if not isinstance(ranked, bool):
+        raise TypeError("`ranked` must be boolean.")
+
+    if ranked:
+        # Only a basic check on whether data is ranked. Checking that the data
+        # is properly ranked could take as much time as ranking it.
+        if not (ranks.min() >= 1 and ranks.max() <= ranks.shape[1]):
+            raise ValueError("`data` is not properly ranked. Rank the data or "
+                             "pass `ranked=False`.")
+    else:
+        ranks = scipy.stats.rankdata(data, axis=-1)
+
+    # generate predicted ranks if not provided, ensure valid NumPy array
+    if predicted_ranks is None:
+        predicted_ranks = np.arange(1, n+1)
+    else:
+        predicted_ranks = np.asarray(predicted_ranks)
+        if (predicted_ranks.ndim != 1 or  # 0-d and 2-d input is invalid
+                (set(predicted_ranks) != set(range(1, n+1)) or
+                 len(predicted_ranks) != n)):
+            raise ValueError(f"`predicted_ranks` must include each integer "
+                             f"from 1 to {n} (the number of columns in "
+                             f"`data`) exactly once.")
+
+    # Calculate the L statistic
+    L = _l_vectorized(ranks, predicted_ranks)
+
+    # Calculate the p-value
+    if method == "auto":
+        method = _choose_method(ranks)
+    p_fun = methods[method]  # get the function corresponding with the method
+    p = p_fun(L, m, n)
+
+    page_result = PageTrendTestResult(statistic=L, pvalue=p, method=method)
+    return page_result
+
+
+def _choose_method(ranks):
+    '''Select 'exact' or 'asymptotic' from the dimensions of `ranks`'''
+    n_rows, n_cols = ranks.shape
+    # Size limits as in [1], [4]; any table beyond them is handled by
+    # the asymptotic method.
+    small_enough = (n_cols <= 8 and n_rows <= 20
+                    and (n_rows <= 12 or n_cols <= 3))
+    return "exact" if small_enough else "asymptotic"
+
+
+def _l_vectorized(ranks, predicted_ranks):
+    '''Compute Page's L statistic for each page of a 3d array'''
+    weighted = predicted_ranks * ranks.sum(axis=-2, keepdims=True)
+    totals = weighted.sum(axis=-1)
+    if totals.size == 1:
+        return totals[0]
+    return totals.ravel()
+
+
+def _l_p_asymptotic(L, m, n):
+    '''Approximate the p-value of Page's L with the normal distribution'''
+    # [1] would instead halve a chi-square tail probability:
+    # chi_L = (12*L - 3*m*n*(n+1)**2)**2/(m*n**2*(n**2-1)*(n+1))
+    # p = chi2.sf(chi_L, df=1, loc=0, scale=1)/2
+    # but that is insensitive to the direction of the hypothesized ranking
+
+    # Mean and variance of L under the null hypothesis; see [2] page 151
+    null_mean = (m*n*(n+1)**2)/4
+    null_var = (m*n**2*(n+1)*(n**2-1))/144
+    z = (L-null_mean)/np.sqrt(null_var)
+    # One-sided "greater" test: the probability that the L statistic
+    # under H0 would be greater than the observed L statistic is the
+    # normal survival function at the standardized statistic
+    return norm.sf(z)
+
+
+def _l_p_exact(L, m, n):
+    '''Compute the exact p-value of Page's L via the per-thread `_PageL`'''
+    # [1] names the table dimensions (m, n) while [5] names them (n, k);
+    # the exact-calculation code follows [5], so `m` is passed as its `n`.
+    l_int, n_rows, k_cols = int(L), int(m), int(n)
+    _pagel_state.state.set_k(k_cols)
+    return _pagel_state.state.sf(l_int, n_rows)
+
+
+class _PageL:
+    '''Exact distribution of Page's L, cached between `page_trend_test` calls'''
+
+    def __init__(self):
+        '''Create the empty cache, indexed as ``all_pmfs[n][k][l]``'''
+        self.all_pmfs = {}
+
+    def set_k(self, k):
+        '''Store `k`; calculate lower and upper limits of L for single row'''
+        self.k = k
+        # See [5] top of page 52
+        self.a, self.b = (k*(k+1)*(k+2))//6, (k*(k+1)*(2*k+1))//6
+
+    def sf(self, l, n):
+        '''Sum of `pmf` over L values from `l` through ``n*b``, inclusive'''
+        ps = [self.pmf(l, n) for l in range(l, n*self.b + 1)]
+        return np.sum(ps)
+
+    def p_l_k_1(self):
+        '''Relative frequency of each L value over all possible single rows'''
+
+        # See [5] Equation (6)
+        ranks = range(1, self.k+1)
+        # generate all possible rows of length k
+        rank_perms = np.array(list(permutations(ranks)))
+        # compute Page's L for all possible rows
+        Ls = (ranks*rank_perms).sum(axis=1)
+        # count occurrences of each integer L value from a through b
+        counts = np.histogram(Ls, np.arange(self.a-0.5, self.b+1.5))[0]
+        # factorial(k) is number of possible permutations
+        return counts/math.factorial(self.k)
+
+    def pmf(self, l, n):
+        '''Recursive function to evaluate p(l, k, n); see [5] Equation 1'''
+
+        # make sure the nested cache for this (n, k) pair exists
+        if n not in self.all_pmfs:
+            self.all_pmfs[n] = {}
+        if self.k not in self.all_pmfs[n]:
+            self.all_pmfs[n][self.k] = {}
+
+        # Cache results to avoid repeating calculation. Initially this was
+        # written with lru_cache, but this seems faster? Also, we could add
+        # an option to save this for future lookup.
+        if l in self.all_pmfs[n][self.k]:
+            return self.all_pmfs[n][self.k][l]
+
+        if n == 1:
+            ps = self.p_l_k_1()  # [5] Equation 6
+            ls = range(self.a, self.b+1)
+            # not fast, but the whole single-row table is stored in one go
+            self.all_pmfs[n][self.k] = {l: p for l, p in zip(ls, ps)}
+            return self.all_pmfs[n][self.k][l]
+
+        p = 0
+        low = max(l-(n-1)*self.b, self.a)  # [5] Equation 2
+        high = min(l-(n-1)*self.a, self.b)
+
+        # [5] Equation 1: combine one row (value t) with the other n-1 rows
+        for t in range(low, high+1):
+            p1 = self.pmf(l-t, n-1)
+            p2 = self.pmf(t, 1)
+            p += p1*p2
+        self.all_pmfs[n][self.k][l] = p
+        return p
+
+
+# Maintain state for faster repeat calls to page_trend_test w/ method='exact'.
+# A _PageL() instance is created once per thread and stored as the `state`
+# attribute of this thread-local variable inside page_trend_test().
+_pagel_state = threading.local()
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_probability_distribution.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_probability_distribution.py
new file mode 100644
index 0000000000000000000000000000000000000000..8bcba74170a9e760e9077bcf7dfbc438ce216e75
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_probability_distribution.py
@@ -0,0 +1,1969 @@
+# Temporary file separated from _distribution_infrastructure.py
+# to simplify the diff during PR review.
+from abc import ABC, abstractmethod
+from types import GenericAlias
+
+class _ProbabilityDistribution(ABC):
+
+    # generic type compatibility with scipy-stubs
+    __class_getitem__ = classmethod(GenericAlias)
+
+    @abstractmethod
+    def support(self):
+        r"""Support of the random variable
+
+        The support of a random variable is the set of all possible outcomes;
+        i.e., the subset of the domain of argument :math:`x` for which
+        the probability density function :math:`f(x)` is nonzero.
+
+        This function returns lower and upper bounds of the support.
+
+        Returns
+        -------
+        out : tuple of Array
+            The lower and upper bounds of the support.
+
+        See Also
+        --------
+        pdf
+
+        References
+        ----------
+        .. [1] Support (mathematics), *Wikipedia*,
+               https://en.wikipedia.org/wiki/Support_(mathematics)
+
+        Notes
+        -----
+        Suppose a continuous probability distribution has support ``(l, r)``.
+        The following table summarizes the value returned by several
+        methods when the argument is outside the support.
+
+        +----------------+---------------------+---------------------+
+        | Method         | Value for ``x < l`` | Value for ``x > r`` |
+        +================+=====================+=====================+
+        | ``pdf(x)``     | 0                   | 0                   |
+        +----------------+---------------------+---------------------+
+        | ``logpdf(x)``  | -inf                | -inf                |
+        +----------------+---------------------+---------------------+
+        | ``cdf(x)``     | 0                   | 1                   |
+        +----------------+---------------------+---------------------+
+        | ``logcdf(x)``  | -inf                | 0                   |
+        +----------------+---------------------+---------------------+
+        | ``ccdf(x)``    | 1                   | 0                   |
+        +----------------+---------------------+---------------------+
+        | ``logccdf(x)`` | 0                   | -inf                |
+        +----------------+---------------------+---------------------+
+
+        For discrete distributions, the same table is applicable with
+        ``pmf`` and ``logpmf`` substituted for ``pdf`` and ``logpdf``.
+
+        For the ``cdf`` and related methods of continuous distributions, the
+        inequality need not be strict; i.e. the tabulated value is returned
+        when the method is evaluated *at* the corresponding boundary.
+
+        The following table summarizes the value returned by the inverse
+        methods for arguments ``0`` and ``1``, whether the distribution
+        is continuous or discrete.
+
+        +-------------+-----------+-----------+
+        | Method      | ``x = 0`` | ``x = 1`` |
+        +=============+===========+===========+
+        | ``icdf(x)`` | ``l``     | ``r``     |
+        +-------------+-----------+-----------+
+        | ``iccdf(x)``| ``r``     | ``l``     |
+        +-------------+-----------+-----------+
+
+        For the inverse log-functions, the same values are returned
+        for ``x = log(0)`` and ``x = log(1)``. All inverse functions return
+        ``nan`` when evaluated at an argument outside the domain ``0`` to ``1``.
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> from scipy import stats
+        >>> X = stats.Uniform(a=-0.5, b=0.5)
+
+        Retrieve the support of the distribution:
+
+        >>> X.support()
+        (-0.5, 0.5)
+
+        For a distribution with infinite support,
+
+        >>> X = stats.Normal()
+        >>> X.support()
+        (-inf, inf)
+
+        Due to underflow, the numerical value returned by the PDF may be zero
+        even for arguments within the support, even if the true value is
+        nonzero. In such cases, the log-PDF may be useful.
+
+        >>> X.pdf([-100., 100.])
+        array([0., 0.])
+        >>> X.logpdf([-100., 100.])
+        array([-5000.91893853, -5000.91893853])
+
+        Use cases for the log-CDF and related methods are analogous.
+
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def sample(self, shape, *, method, rng):
+        r"""Random sample from the distribution.
+
+        Parameters
+        ----------
+        shape : tuple of ints, default: ()
+            The shape of the sample to draw. If the parameters of the distribution
+            underlying the random variable are arrays of shape ``param_shape``,
+            the output array will be of shape ``shape + param_shape``.
+        method : {None, 'formula', 'inverse_transform'}
+            The strategy used to produce the sample. By default (``None``),
+            the infrastructure chooses between the following options,
+            listed in order of precedence.
+
+            - ``'formula'``: an implementation specific to the distribution
+            - ``'inverse_transform'``: generate a uniformly distributed sample and
+              return the inverse CDF at these arguments.
+
+            Not all `method` options are available for all distributions.
+            If the selected `method` is not available, a ``NotImplementedError``
+            will be raised.
+        rng : `numpy.random.Generator` or `scipy.stats.QMCEngine`, optional
+            Pseudo- or quasi-random number generator state. When `rng` is None,
+            a new `numpy.random.Generator` is created using entropy from the
+            operating system. Types other than `numpy.random.Generator` and
+            `scipy.stats.QMCEngine` are passed to `numpy.random.default_rng`
+            to instantiate a ``Generator``.
+
+            If `rng` is an instance of `scipy.stats.QMCEngine` configured to use
+            scrambling and `shape` is not empty, then each slice along the zeroth
+            axis of the result is a "quasi-independent", low-discrepancy sequence;
+            that is, they are distinct sequences that can be treated as statistically
+            independent for most practical purposes. Separate calls to `sample`
+            produce new quasi-independent, low-discrepancy sequences.
+
+        References
+        ----------
+        .. [1] Sampling (statistics), *Wikipedia*,
+               https://en.wikipedia.org/wiki/Sampling_(statistics)
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> import numpy as np
+        >>> from scipy import stats
+        >>> X = stats.Uniform(a=0., b=1.)
+
+        Generate a pseudorandom sample:
+
+        >>> x = X.sample((1000, 1))
+        >>> octiles = (np.arange(8) + 1) / 8
+        >>> np.count_nonzero(x <= octiles, axis=0)
+        array([ 148,  263,  387,  516,  636,  751,  865, 1000])  # may vary
+
+        >>> X = stats.Uniform(a=np.zeros((3, 1)), b=np.ones(2))
+        >>> X.a.shape
+        (3, 2)
+        >>> x = X.sample(shape=(5, 4))
+        >>> x.shape
+        (5, 4, 3, 2)
+
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def moment(self, order, kind, *, method):
+        r"""Raw, central, or standard moment of positive integer order.
+
+        In terms of probability density function :math:`f(x)` and support
+        :math:`\chi`, the "raw" moment (about the origin) of order :math:`n` of
+        a continuous random variable :math:`X` is:
+
+        .. math::
+
+            \mu'_n(X) = \int_{\chi} x^n f(x) dx
+
+        The "central" moment is the raw moment taken about the mean,
+        :math:`\mu = \mu'_1`:
+
+        .. math::
+
+            \mu_n(X) = \int_{\chi} (x - \mu) ^n f(x) dx
+
+        The "standardized" moment is the central moment normalized by the
+        :math:`n^\text{th}` power of the standard deviation
+        :math:`\sigma = \sqrt{\mu_2}` to produce a scale invariant quantity:
+
+        .. math::
+
+            \tilde{\mu}_n(X) = \frac{\mu_n(X)}
+                                    {\sigma^n}
+
+        The definitions for discrete random variables are analogous, with
+        sums over the support replacing the integrals.
+
+        Parameters
+        ----------
+        order : int
+            The integer order of the moment; i.e. :math:`n` in the formulae above.
+        kind : {'raw', 'central', 'standardized'}
+            Whether to return the raw (default), central, or standardized moment
+            defined above.
+        method : {None, 'formula', 'general', 'transform', 'normalize', 'quadrature', 'cache'}
+            The strategy used to evaluate the moment. By default (``None``),
+            the infrastructure chooses between the following options,
+            listed in order of precedence.
+
+            - ``'cache'``: use the value of the moment most recently calculated
+              via another method
+            - ``'formula'``: use a formula for the moment itself
+            - ``'general'``: use a general result that is true for all distributions
+              with finite moments; for instance, the zeroth raw moment is
+              identically 1
+            - ``'transform'``: transform a raw moment to a central moment or
+              vice versa (see Notes)
+            - ``'normalize'``: normalize a central moment to get a standardized
+              moment or vice versa
+            - ``'quadrature'``: numerically integrate (or, in the discrete case, sum)
+              according to the definition
+
+            Not all `method` options are available for all orders, kinds, and
+            distributions. If the selected `method` is not available, a
+            ``NotImplementedError`` will be raised.
+
+        Returns
+        -------
+        out : array
+            The moment of the random variable of the specified order and kind.
+
+        See Also
+        --------
+        pdf
+        mean
+        variance
+        standard_deviation
+        skewness
+        kurtosis
+
+        Notes
+        -----
+        Not all distributions have finite moments of all orders; moments of some
+        orders may be undefined or infinite. If a formula for the moment is not
+        specifically implemented for the chosen distribution, SciPy will attempt
+        to compute the moment via a generic method, which may yield a finite
+        result where none exists. This is not a critical bug, but an opportunity
+        for an enhancement.
+
+        The definition of a raw moment in the summary is specific to the raw moment
+        about the origin. The raw moment about any point :math:`a` is:
+
+        .. math::
+
+            E[(X-a)^n] = \int_{\chi} (x-a)^n f(x) dx
+
+        In this notation, a raw moment about the origin is :math:`\mu'_n = E[x^n]`,
+        and a central moment is :math:`\mu_n = E[(x-\mu)^n]`, where :math:`\mu`
+        is the first raw moment; i.e. the mean.
+
+        The ``'transform'`` method takes advantage of the following relationships
+        between moments taken about different points :math:`a` and :math:`b`.
+
+        .. math::
+
+            E[(X-b)^n] =  \sum_{i=0}^n E[(X-a)^i] {n \choose i} (a - b)^{n-i}
+
+        For instance, to transform the raw moment to the central moment, we let
+        :math:`b = \mu` and :math:`a = 0`.
+
+        The distribution infrastructure provides flexibility for distribution
+        authors to implement separate formulas for raw moments, central moments,
+        and standardized moments of any order. By default, the moment of the
+        desired order and kind is evaluated from the formula if such a formula
+        is available; if not, the infrastructure uses any formulas that are
+        available rather than resorting directly to numerical integration.
+        For instance, if formulas for the first three raw moments are
+        available and the third standardized moment is desired, the
+        infrastructure will evaluate the raw moments and perform the transforms
+        and standardization required. The decision tree is somewhat complex,
+        but the strategy for obtaining a moment of a given order and kind
+        (possibly as an intermediate step due to the recursive nature of the
+        transform formula above) roughly follows this order of priority:
+
+        #. Use cache (if a moment of the same order and kind has been calculated)
+        #. Use formula (if available)
+        #. Transform between raw and central moment and/or normalize to convert
+           between central and standardized moments (if efficient)
+        #. Use a generic result true for most distributions (if available)
+        #. Use quadrature
+
+        References
+        ----------
+        .. [1] Moment, *Wikipedia*,
+               https://en.wikipedia.org/wiki/Moment_(mathematics)
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> from scipy import stats
+        >>> X = stats.Normal(mu=1., sigma=2.)
+
+        Evaluate the first raw moment:
+
+        >>> X.moment(order=1, kind='raw')
+        1.0
+        >>> X.moment(order=1, kind='raw') == X.mean() == X.mu
+        True
+
+        Evaluate the second central moment:
+
+        >>> X.moment(order=2, kind='central')
+        4.0
+        >>> X.moment(order=2, kind='central') == X.variance() == X.sigma**2
+        True
+
+        Evaluate the fourth standardized moment:
+
+        >>> X.moment(order=4, kind='standardized')
+        3.0
+        >>> X.moment(order=4, kind='standardized') == X.kurtosis(convention='non-excess')
+        True
+
+        """  # noqa:E501
+        raise NotImplementedError()
+
+    @abstractmethod
+    def mean(self, *, method):
+        r"""Mean (raw first moment about the origin)
+
+        Parameters
+        ----------
+        method : {None, 'formula', 'transform', 'quadrature', 'cache'}
+            Method used to calculate the raw first moment. Not
+            all methods are available for all distributions. See
+            `moment` for details.
+
+        See Also
+        --------
+        moment
+        median
+        mode
+
+        References
+        ----------
+        .. [1] Mean, *Wikipedia*,
+               https://en.wikipedia.org/wiki/Mean#Mean_of_a_probability_distribution
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> from scipy import stats
+        >>> X = stats.Normal(mu=1., sigma=2.)
+
+        Evaluate the mean:
+
+        >>> X.mean()
+        1.0
+        >>> X.mean() == X.moment(order=1, kind='raw') == X.mu
+        True
+
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def median(self, *, method):
+        r"""Median (50th percentile)
+
+        If a continuous random variable :math:`X` has probability :math:`0.5` of
+        taking on a value less than :math:`m`, then :math:`m` is the median.
+
+        More generally, a median is a value :math:`m` for which:
+
+        .. math::
+
+            P(X ≤ m) ≥ 0.5 \text{ and } P(X ≥ m) ≥ 0.5
+
+        For discrete random variables, the median may not be unique, in which
+        case the smallest value satisfying the definition is reported.
+
+        Parameters
+        ----------
+        method : {None, 'formula', 'icdf'}
+            The strategy used to evaluate the median.
+            By default (``None``), the infrastructure chooses between the
+            following options, listed in order of precedence.
+
+            - ``'formula'``: use a formula for the median
+            - ``'icdf'``: evaluate the inverse CDF of 0.5
+
+            Not all `method` options are available for all distributions.
+            If the selected `method` is not available, a ``NotImplementedError``
+            will be raised.
+
+        Returns
+        -------
+        out : array
+            The median
+
+        See Also
+        --------
+        mean
+        mode
+        icdf
+
+        References
+        ----------
+        .. [1] Median, *Wikipedia*,
+               https://en.wikipedia.org/wiki/Median#Probability_distributions
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> from scipy import stats
+        >>> X = stats.Uniform(a=0., b=10.)
+
+        Compute the median:
+
+        >>> X.median()
+        np.float64(5.0)
+        >>> X.median() == X.icdf(0.5) == X.iccdf(0.5)
+        True
+
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def mode(self, *, method):
+        r"""Mode (most likely value)
+
+        Informally, the mode is a value that a random variable has the highest
+        probability (density) of assuming. That is, the mode is the element of
+        the support :math:`\chi` that maximizes the probability density (or mass,
+        for discrete random variables) function :math:`f(x)`:
+
+        .. math::
+
+            \text{mode} = \arg\max_{x \in \chi} f(x)
+
+        Parameters
+        ----------
+        method : {None, 'formula', 'optimization'}
+            The strategy used to evaluate the mode.
+            By default (``None``), the infrastructure chooses between the
+            following options, listed in order of precedence.
+
+            - ``'formula'``: use a formula for the mode
+            - ``'optimization'``: numerically maximize the PDF/PMF
+
+            Not all `method` options are available for all distributions.
+            If the selected `method` is not available, a ``NotImplementedError``
+            will be raised.
+
+        Returns
+        -------
+        out : array
+            The mode
+
+        See Also
+        --------
+        mean
+        median
+        pdf
+
+        Notes
+        -----
+        For some distributions
+
+        #. the mode is not unique (e.g. the uniform distribution);
+        #. the PDF has one or more singularities, and it is debatable whether
+           a singularity is considered to be in the domain and called the mode
+           (e.g. the gamma distribution with shape parameter less than 1); and/or
+        #. the probability density function may have one or more local maxima
+           that are not a global maximum (e.g. mixture distributions).
+
+        In such cases, `mode` will
+
+        #. return a single value,
+        #. consider the mode to occur at a singularity, and/or
+        #. return a local maximum which may or may not be a global maximum.
+
+        If a formula for the mode is not specifically implemented for the
+        chosen distribution, SciPy will attempt to compute the mode
+        numerically, which may not meet the user's preferred definition of a
+        mode. In such cases, the user is encouraged to subclass the
+        distribution and override ``mode``.
+
+        References
+        ----------
+        .. [1] Mode (statistics), *Wikipedia*,
+               https://en.wikipedia.org/wiki/Mode_(statistics)
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> from scipy import stats
+        >>> X = stats.Normal(mu=1., sigma=2.)
+
+        Evaluate the mode:
+
+        >>> X.mode()
+        1.0
+
+        If the mode is not uniquely defined, ``mode`` nonetheless returns a
+        single value.
+
+        >>> X = stats.Uniform(a=0., b=1.)
+        >>> X.mode()
+        0.5
+
+        If this choice does not satisfy your requirements, subclass the
+        distribution and override ``mode``:
+
+        >>> class BetterUniform(stats.Uniform):
+        ...     def mode(self):
+        ...         return self.b
+        >>> X = BetterUniform(a=0., b=1.)
+        >>> X.mode()
+        1.0
+
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def variance(self, *, method):
+        r"""Variance (central second moment)
+
+        Parameters
+        ----------
+        method : {None, 'formula', 'transform', 'normalize', 'quadrature', 'cache'}
+            Method used to calculate the central second moment. Not
+            all methods are available for all distributions. See
+            `moment` for details.
+
+        See Also
+        --------
+        moment
+        standard_deviation
+        mean
+
+        References
+        ----------
+        .. [1] Variance, *Wikipedia*,
+               https://en.wikipedia.org/wiki/Variance#Absolutely_continuous_random_variable
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> from scipy import stats
+        >>> X = stats.Normal(mu=1., sigma=2.)
+
+        Evaluate the variance:
+
+        >>> X.variance()
+        4.0
+        >>> X.variance() == X.moment(order=2, kind='central') == X.sigma**2
+        True
+
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def standard_deviation(self, *, method):
+        r"""Standard deviation (square root of the second central moment)
+
+        Parameters
+        ----------
+        method : {None, 'formula', 'transform', 'normalize', 'quadrature', 'cache'}
+            Method used to calculate the central second moment. Not
+            all methods are available for all distributions. See
+            `moment` for details.
+
+        See Also
+        --------
+        variance
+        mean
+        moment
+
+        References
+        ----------
+        .. [1] Standard deviation, *Wikipedia*,
+               https://en.wikipedia.org/wiki/Standard_deviation#Definition_of_population_values
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> from scipy import stats
+        >>> X = stats.Normal(mu=1., sigma=2.)
+
+        Evaluate the standard deviation:
+
+        >>> X.standard_deviation()
+        2.0
+        >>> X.standard_deviation() == X.moment(order=2, kind='central')**0.5 == X.sigma
+        True
+
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def skewness(self, *, method):
+        r"""Skewness (standardized third moment)
+
+        Parameters
+        ----------
+        method : {None, 'formula', 'general', 'transform', 'normalize', 'cache'}
+            Method used to calculate the standardized third moment. Not
+            all methods are available for all distributions. See
+            `moment` for details.
+
+        See Also
+        --------
+        moment
+        mean
+        variance
+
+        References
+        ----------
+        .. [1] Skewness, *Wikipedia*,
+               https://en.wikipedia.org/wiki/Skewness
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> from scipy import stats
+        >>> X = stats.Normal(mu=1., sigma=2.)
+
+        Evaluate the skewness:
+
+        >>> X.skewness()
+        0.0
+        >>> X.skewness() == X.moment(order=3, kind='standardized')
+        True
+
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def kurtosis(self, *, method):
+        r"""Kurtosis (standardized fourth moment)
+
+        By default, this is the standardized fourth moment, also known as the
+        "non-excess" or "Pearson" kurtosis (e.g. the kurtosis of the normal
+        distribution is 3). The "excess" or "Fisher" kurtosis (the standardized
+        fourth moment minus 3) is available via the `convention` parameter.
+
+        Parameters
+        ----------
+        method : {None, 'formula', 'general', 'transform', 'normalize', 'cache'}
+            Method used to calculate the standardized fourth moment. Not
+            all methods are available for all distributions. See
+            `moment` for details.
+        convention : {'non-excess', 'excess'}
+            Two distinct conventions are available:
+
+            - ``'non-excess'``: the standardized fourth moment (Pearson's kurtosis)
+            - ``'excess'``: the standardized fourth moment minus 3 (Fisher's kurtosis)
+
+            The default is ``'non-excess'``.
+
+        See Also
+        --------
+        moment
+        mean
+        variance
+
+        References
+        ----------
+        .. [1] Kurtosis, *Wikipedia*,
+               https://en.wikipedia.org/wiki/Kurtosis
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> from scipy import stats
+        >>> X = stats.Normal(mu=1., sigma=2.)
+
+        Evaluate the kurtosis:
+
+        >>> X.kurtosis()
+        3.0
+        >>> (X.kurtosis()
+        ...  == X.kurtosis(convention='excess') + 3.
+        ...  == X.moment(order=4, kind='standardized'))
+        True
+
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def pdf(self, x, /, *, method):
+        r"""Probability density function
+
+        The probability density function ("PDF"), denoted :math:`f(x)`, is the
+        probability *per unit length* that the random variable will assume the
+        value :math:`x`. Mathematically, it can be defined as the derivative
+        of the cumulative distribution function :math:`F(x)`:
+
+        .. math::
+
+            f(x) = \frac{d}{dx} F(x)
+
+        `pdf` accepts `x` for :math:`x`.
+
+        Parameters
+        ----------
+        x : array_like
+            The argument of the PDF.
+        method : {None, 'formula', 'logexp'}
+            The strategy used to evaluate the PDF. By default (``None``), the
+            infrastructure chooses between the following options, listed in
+            order of precedence.
+
+            - ``'formula'``: use a formula for the PDF itself
+            - ``'logexp'``: evaluate the log-PDF and exponentiate
+
+            Not all `method` options are available for all distributions.
+            If the selected `method` is not available, a ``NotImplementedError``
+            will be raised.
+
+        Returns
+        -------
+        out : array
+            The PDF evaluated at the argument `x`.
+
+        See Also
+        --------
+        cdf
+        logpdf
+
+        Notes
+        -----
+        Suppose a continuous probability distribution has support :math:`[l, r]`.
+        By definition of the support, the PDF evaluates to its minimum value
+        of :math:`0` outside the support; i.e. for :math:`x < l` or
+        :math:`x > r`. The maximum of the PDF may be less than or greater than
+        :math:`1`; since the value is a probability *density*, only its integral
+        over the support must equal :math:`1`.
+
+        For discrete distributions, `pdf` returns ``inf`` at supported points
+        and ``0`` elsewhere.
+
+        References
+        ----------
+        .. [1] Probability density function, *Wikipedia*,
+               https://en.wikipedia.org/wiki/Probability_density_function
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> from scipy import stats
+        >>> X = stats.Uniform(a=-1., b=1.)
+
+        Evaluate the PDF at the desired argument:
+
+        >>> X.pdf(0.25)
+        0.5
+
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def logpdf(self, x, /, *, method):
+        r"""Log of the probability density function
+
+        The probability density function ("PDF"), denoted :math:`f(x)`, is the
+        probability *per unit length* that the random variable will assume the
+        value :math:`x`. Mathematically, it can be defined as the derivative
+        of the cumulative distribution function :math:`F(x)`:
+
+        .. math::
+
+            f(x) = \frac{d}{dx} F(x)
+
+        `logpdf` computes the logarithm of the probability density function
+        ("log-PDF"), :math:`\log(f(x))`, but it may be numerically favorable
+        compared to the naive implementation (computing :math:`f(x)` and
+        taking the logarithm).
+
+        `logpdf` accepts `x` for :math:`x`.
+
+        Parameters
+        ----------
+        x : array_like
+            The argument of the log-PDF.
+        method : {None, 'formula', 'logexp'}
+            The strategy used to evaluate the log-PDF. By default (``None``), the
+            infrastructure chooses between the following options, listed in order
+            of precedence.
+
+            - ``'formula'``: use a formula for the log-PDF itself
+            - ``'logexp'``: evaluate the PDF and take its logarithm
+
+            Not all `method` options are available for all distributions.
+            If the selected `method` is not available, a ``NotImplementedError``
+            will be raised.
+
+        Returns
+        -------
+        out : array
+            The log-PDF evaluated at the argument `x`.
+
+        See Also
+        --------
+        pdf
+        logcdf
+
+        Notes
+        -----
+        Suppose a continuous probability distribution has support :math:`[l, r]`.
+        By definition of the support, the log-PDF evaluates to its minimum value
+        of :math:`-\infty` (i.e. :math:`\log(0)`) outside the support; i.e. for
+        :math:`x < l` or :math:`x > r`. The maximum of the log-PDF may be less
+        than or greater than :math:`\log(1) = 0` because the maximum of the PDF
+        can be any positive real.
+
+        For distributions with infinite support, it is common for `pdf` to return
+        a value of ``0`` when the argument is theoretically within the support;
+        this can occur because the true value of the PDF is too small to be
+        represented by the chosen dtype. The log-PDF, however, will often be finite
+        (not ``-inf``) over a much larger domain. Consequently, it may be preferred
+        to work with the logarithms of probabilities and probability densities to
+        avoid underflow.
+
+        For discrete distributions, `logpdf` returns ``inf`` at supported points and
+        ``-inf`` (``log(0)``) elsewhere.
+
+        References
+        ----------
+        .. [1] Probability density function, *Wikipedia*,
+               https://en.wikipedia.org/wiki/Probability_density_function
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> import numpy as np
+        >>> from scipy import stats
+        >>> X = stats.Uniform(a=-1.0, b=1.0)
+
+        Evaluate the log-PDF at the desired argument:
+
+        >>> X.logpdf(0.5)
+        -0.6931471805599453
+        >>> np.allclose(X.logpdf(0.5), np.log(X.pdf(0.5)))
+        True
+
+        """
+        raise NotImplementedError()
+
+    def pmf(self, x, /, *, method=None):
+        r"""Probability mass function
+
+        The probability mass function ("PMF"), denoted :math:`f(x)`, is the
+        probability that the random variable :math:`X` will assume the value :math:`x`.
+
+        .. math::
+
+            f(x) = P(X = x)
+
+        `pmf` accepts `x` for :math:`x`.
+
+        Parameters
+        ----------
+        x : array_like
+            The argument of the PMF.
+        method : {None, 'formula', 'logexp'}
+            The strategy used to evaluate the PMF. By default (``None``), the
+            infrastructure chooses between the following options, listed in
+            order of precedence.
+
+            - ``'formula'``: use a formula for the PMF itself
+            - ``'logexp'``: evaluate the log-PMF and exponentiate
+
+            Not all `method` options are available for all distributions.
+            If the selected `method` is not available, a ``NotImplementedError``
+            will be raised.
+
+        Returns
+        -------
+        out : array
+            The PMF evaluated at the argument `x`.
+
+        See Also
+        --------
+        cdf
+        logpmf
+
+        Notes
+        -----
+        Suppose a discrete probability distribution has support over the integers
+        :math:`\{l, l+1, ..., r-1, r\}`.
+        By definition of the support, the PMF evaluates to its minimum value
+        of :math:`0` for non-integral :math:`x` and for :math:`x` outside the support;
+        i.e. for :math:`x < l` or :math:`x > r`.
+
+        For continuous distributions, `pmf` returns ``0`` at all real arguments.
+
+        References
+        ----------
+        .. [1] Probability mass function, *Wikipedia*,
+               https://en.wikipedia.org/wiki/Probability_mass_function
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> from scipy import stats
+        >>> X = stats.Binomial(n=10, p=0.5)
+
+        Evaluate the PMF at the desired argument:
+
+        >>> X.pmf(5)
+        np.float64(0.24609375)
+
+        """
+        raise NotImplementedError()
+
+    def logpmf(self, x, /, *, method=None):
+        r"""Log of the probability mass function
+
+        The probability mass function ("PMF"), denoted :math:`f(x)`, is the
+        probability that the random variable :math:`X` will assume the value :math:`x`.
+
+        .. math::
+
+            f(x) = P(X = x)
+
+        `logpmf` computes the logarithm of the probability mass function
+        ("log-PMF"), :math:`\log(f(x))`, but it may be numerically favorable
+        compared to the naive implementation (computing :math:`f(x)` and
+        taking the logarithm).
+
+        `logpmf` accepts `x` for :math:`x`.
+
+        Parameters
+        ----------
+        x : array_like
+            The argument of the log-PMF.
+        method : {None, 'formula', 'logexp'}
+            The strategy used to evaluate the log-PMF. By default (``None``), the
+            infrastructure chooses between the following options, listed in order
+            of precedence.
+
+            - ``'formula'``: use a formula for the log-PMF itself
+            - ``'logexp'``: evaluate the PMF and take its logarithm
+
+            Not all `method` options are available for all distributions.
+            If the selected `method` is not available, a ``NotImplementedError``
+            will be raised.
+
+        Returns
+        -------
+        out : array
+            The log-PMF evaluated at the argument `x`.
+
+        See Also
+        --------
+        pmf
+        logcdf
+
+        Notes
+        -----
+        Suppose a discrete probability distribution has support over the integers
+        :math:`\{l, l+1, ..., r-1, r\}`.
+        By definition of the support, the log-PMF evaluates to its minimum value
+        of :math:`-\infty` (i.e. :math:`\log(0)`) for non-integral :math:`x` and
+        for :math:`x` outside the support; i.e. for :math:`x < l` or :math:`x > r`.
+
+        For distributions with infinite support, it is common for `pmf` to return
+        a value of ``0`` when the argument is theoretically within the support;
+        this can occur because the true value of the PMF is too small to be
+        represented by the chosen dtype. The log-PMF, however, will often be finite
+        (not ``-inf``) over a much larger domain. Consequently, it may be preferred
+        to work with the logarithms of probabilities and probability densities to
+        avoid underflow.
+
+        References
+        ----------
+        .. [1] Probability mass function, *Wikipedia*,
+               https://en.wikipedia.org/wiki/Probability_mass_function
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> import numpy as np
+        >>> from scipy import stats
+        >>> X = stats.Binomial(n=10, p=0.5)
+
+        Evaluate the log-PMF at the desired argument:
+
+        >>> X.logpmf(5)
+        np.float64(-1.4020427180880297)
+        >>> np.allclose(X.logpmf(5), np.log(X.pmf(5)))
+        True
+
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def cdf(self, x, y, /, *, method):
+        r"""Cumulative distribution function
+
+        The cumulative distribution function ("CDF"), denoted :math:`F(x)`, is
+        the probability the random variable :math:`X` will assume a value
+        less than or equal to :math:`x`:
+
+        .. math::
+
+            F(x) = P(X ≤ x)
+
+        A two-argument variant of this function is also defined as the
+        probability the random variable :math:`X` will assume a value between
+        :math:`x` and :math:`y`.
+
+        .. math::
+
+            F(x, y) = P(x ≤ X ≤ y)
+
+        `cdf` accepts `x` for :math:`x` and `y` for :math:`y`.
+
+        Parameters
+        ----------
+        x, y : array_like
+            The arguments of the CDF. `x` is required; `y` is optional.
+        method : {None, 'formula', 'logexp', 'complement', 'quadrature', 'subtraction'}
+            The strategy used to evaluate the CDF.
+            By default (``None``), the one-argument form of the function
+            chooses between the following options, listed in order of precedence.
+
+            - ``'formula'``: use a formula for the CDF itself
+            - ``'logexp'``: evaluate the log-CDF and exponentiate
+            - ``'complement'``: evaluate the CCDF and take the complement
+            - ``'quadrature'``: numerically integrate the PDF (or, in the discrete
+              case, sum the PMF)
+
+            In place of ``'complement'``, the two-argument form accepts:
+
+            - ``'subtraction'``: compute the CDF at each argument and take
+              the difference.
+
+            Not all `method` options are available for all distributions.
+            If the selected `method` is not available, a ``NotImplementedError``
+            will be raised.
+
+        Returns
+        -------
+        out : array
+            The CDF evaluated at the provided argument(s).
+
+        See Also
+        --------
+        logcdf
+        ccdf
+
+        Notes
+        -----
+        Suppose a continuous probability distribution has support :math:`[l, r]`.
+        The CDF :math:`F(x)` is related to the probability density function
+        :math:`f(x)` by:
+
+        .. math::
+
+            F(x) = \int_l^x f(u) du
+
+        The two argument version is:
+
+        .. math::
+
+            F(x, y) = \int_x^y f(u) du = F(y) - F(x)
+
+        The CDF evaluates to its minimum value of :math:`0` for :math:`x ≤ l`
+        and its maximum value of :math:`1` for :math:`x ≥ r`.
+
+        Suppose a discrete probability distribution has support :math:`[l, r]`.
+        The CDF :math:`F(x)` is related to the probability mass function
+        :math:`f(x)` by:
+
+        .. math::
+
+            F(x) = \sum_{u=l}^{\lfloor x \rfloor} f(u)
+
+        The CDF evaluates to its minimum value of :math:`0` for :math:`x < l`
+        and its maximum value of :math:`1` for :math:`x ≥ r`.
+
+        The CDF is also known simply as the "distribution function".
+
+        References
+        ----------
+        .. [1] Cumulative distribution function, *Wikipedia*,
+               https://en.wikipedia.org/wiki/Cumulative_distribution_function
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> from scipy import stats
+        >>> X = stats.Uniform(a=-0.5, b=0.5)
+
+        Evaluate the CDF at the desired argument:
+
+        >>> X.cdf(0.25)
+        0.75
+
+        Evaluate the cumulative probability between two arguments:
+
+        >>> X.cdf(-0.25, 0.25) == X.cdf(0.25) - X.cdf(-0.25)
+        True
+
+        """  # noqa: E501
+        raise NotImplementedError()
+
+    @abstractmethod
+    def icdf(self, p, /, *, method):
+        r"""Inverse of the cumulative distribution function.
+
+        For monotonic continuous distributions, the inverse of the cumulative
+        distribution function ("inverse CDF"), denoted :math:`F^{-1}(p)`, is the
+        argument :math:`x` for which the cumulative distribution function
+        :math:`F(x)` evaluates to :math:`p`.
+
+        .. math::
+
+            F^{-1}(p) = x \quad \text{s.t.} \quad F(x) = p
+
+        When a strict "inverse" of the cumulative distribution function does not
+        exist (e.g. discrete random variables), the "inverse CDF" is defined by
+        convention as the smallest value within the support :math:`\chi` for which
+        :math:`F(x)` is at least :math:`p`.
+
+        .. math::
+
+            F^{-1}(p) = \min_{x \in \chi} x \quad \text{s.t.} \quad F(x) ≥ p
+
+        `icdf` accepts `p` for :math:`p \in [0, 1]`.
+
+        Parameters
+        ----------
+        p : array_like
+            The argument of the inverse CDF.
+        method : {None, 'formula', 'complement', 'inversion'}
+            The strategy used to evaluate the inverse CDF.
+            By default (``None``), the infrastructure chooses between the
+            following options, listed in order of precedence.
+
+            - ``'formula'``: use a formula for the inverse CDF itself
+            - ``'complement'``: evaluate the inverse CCDF at the
+              complement of `p`
+            - ``'inversion'``: solve numerically for the argument at which the
+              CDF is equal to `p`
+
+            Not all `method` options are available for all distributions.
+            If the selected `method` is not available, a ``NotImplementedError``
+            will be raised.
+
+        Returns
+        -------
+        out : array
+            The inverse CDF evaluated at the provided argument.
+
+        See Also
+        --------
+        cdf
+        ilogcdf
+
+        Notes
+        -----
+        Suppose a probability distribution has support :math:`[l, r]`. The
+        inverse CDF returns its minimum value of :math:`l` at :math:`p = 0`
+        and its maximum value of :math:`r` at :math:`p = 1`. Because the CDF
+        has range :math:`[0, 1]`, the inverse CDF is only defined on the
+        domain :math:`[0, 1]`; for :math:`p < 0` and :math:`p > 1`, `icdf`
+        returns ``nan``.
+
+        The inverse CDF is also known as the quantile function, percentile function,
+        and percent-point function.
+
+        References
+        ----------
+        .. [1] Quantile function, *Wikipedia*,
+               https://en.wikipedia.org/wiki/Quantile_function
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> import numpy as np
+        >>> from scipy import stats
+        >>> X = stats.Uniform(a=-0.5, b=0.5)
+
+        Evaluate the inverse CDF at the desired argument:
+
+        >>> X.icdf(0.25)
+        -0.25
+        >>> np.allclose(X.cdf(X.icdf(0.25)), 0.25)
+        True
+
+        This function returns NaN when the argument is outside the domain.
+
+        >>> X.icdf([-0.1, 0, 1, 1.1])
+        array([ nan, -0.5,  0.5,  nan])
+
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def ccdf(self, x, y, /, *, method):
+        r"""Complementary cumulative distribution function
+
+        The complementary cumulative distribution function ("CCDF"), denoted
+        :math:`G(x)`, is the complement of the cumulative distribution function
+        :math:`F(x)`; i.e., probability the random variable :math:`X` will
+        assume a value greater than :math:`x`:
+
+        .. math::
+
+            G(x) = 1 - F(x) = P(X > x)
+
+        A two-argument variant of this function is:
+
+        .. math::
+
+            G(x, y) = 1 - F(x, y) = P(X < x \text{ or } X > y)
+
+        `ccdf` accepts `x` for :math:`x` and `y` for :math:`y`.
+
+        Parameters
+        ----------
+        x, y : array_like
+            The arguments of the CCDF. `x` is required; `y` is optional.
+        method : {None, 'formula', 'logexp', 'complement', 'quadrature', 'addition'}
+            The strategy used to evaluate the CCDF.
+            By default (``None``), the infrastructure chooses between the
+            following options, listed in order of precedence.
+
+            - ``'formula'``: use a formula for the CCDF itself
+            - ``'logexp'``: evaluate the log-CCDF and exponentiate
+            - ``'complement'``: evaluate the CDF and take the complement
+            - ``'quadrature'``: numerically integrate the PDF (or, in the discrete
+              case, sum the PMF)
+
+            The two-argument form chooses between:
+
+            - ``'formula'``: use a formula for the CCDF itself
+            - ``'addition'``: compute the CDF at `x` and the CCDF at `y`, then add
+
+            Not all `method` options are available for all distributions.
+            If the selected `method` is not available, a ``NotImplementedError``
+            will be raised.
+
+        Returns
+        -------
+        out : array
+            The CCDF evaluated at the provided argument(s).
+
+        See Also
+        --------
+        cdf
+        logccdf
+
+        Notes
+        -----
+        Suppose a continuous probability distribution has support :math:`[l, r]`.
+        The CCDF :math:`G(x)` is related to the probability density function
+        :math:`f(x)` by:
+
+        .. math::
+
+            G(x) = \int_x^r f(u) du
+
+        The two argument version is:
+
+        .. math::
+
+            G(x, y) = \int_l^x f(u) du + \int_y^r f(u) du
+
+        The CCDF returns its minimum value of :math:`0` for :math:`x ≥ r`
+        and its maximum value of :math:`1` for :math:`x ≤ l`.
+
+        Suppose a discrete probability distribution has support :math:`[l, r]`.
+        The CCDF :math:`G(x)` is related to the probability mass function
+        :math:`f(x)` by:
+
+        .. math::
+
+            G(x) = \sum_{u=\lfloor x + 1 \rfloor}^{r} f(u)
+
+        The CCDF evaluates to its minimum value of :math:`0` for :math:`x ≥ r`
+        and its maximum value of :math:`1` for :math:`x < l`.
+
+        The CCDF is also known as the "survival function".
+
+        References
+        ----------
+        .. [1] Cumulative distribution function, *Wikipedia*,
+               https://en.wikipedia.org/wiki/Cumulative_distribution_function#Derived_functions
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> import numpy as np
+        >>> from scipy import stats
+        >>> X = stats.Uniform(a=-0.5, b=0.5)
+
+        Evaluate the CCDF at the desired argument:
+
+        >>> X.ccdf(0.25)
+        0.25
+        >>> np.allclose(X.ccdf(0.25), 1-X.cdf(0.25))
+        True
+
+        Evaluate the complement of the cumulative probability between two arguments:
+
+        >>> X.ccdf(-0.25, 0.25) == X.cdf(-0.25) + X.ccdf(0.25)
+        True
+
+        """  # noqa: E501
+        raise NotImplementedError()
+
+    @abstractmethod
+    def iccdf(self, p, /, *, method):
+        r"""Inverse complementary cumulative distribution function.
+
+        The inverse complementary cumulative distribution function ("inverse CCDF"),
+        denoted :math:`G^{-1}(p)`, is the argument :math:`x` for which the
+        complementary cumulative distribution function :math:`G(x)` evaluates to
+        :math:`p`.
+
+        .. math::
+
+            G^{-1}(p) = x \quad \text{s.t.} \quad G(x) = p
+
+        When a strict "inverse" of the complementary cumulative distribution function
+        does not exist (e.g. discrete random variables), the "inverse CCDF" is defined
+        by convention as the smallest value within the support :math:`\chi` for which
+        :math:`G(x)` is no greater than :math:`p`.
+
+        .. math::
+
+            G^{-1}(p) = \min_{x \in \chi} x \quad \text{s.t.} \quad G(x) ≤ p
+
+        `iccdf` accepts `p` for :math:`p \in [0, 1]`.
+
+        Parameters
+        ----------
+        p : array_like
+            The argument of the inverse CCDF.
+        method : {None, 'formula', 'complement', 'inversion'}
+            The strategy used to evaluate the inverse CCDF.
+            By default (``None``), the infrastructure chooses between the
+            following options, listed in order of precedence.
+
+            - ``'formula'``: use a formula for the inverse CCDF itself
+            - ``'complement'``: evaluate the inverse CDF at the
+              complement of `p`
+            - ``'inversion'``: solve numerically for the argument at which the
+              CCDF is equal to `p`
+
+            Not all `method` options are available for all distributions.
+            If the selected `method` is not available, a ``NotImplementedError``
+            will be raised.
+
+        Returns
+        -------
+        out : array
+            The inverse CCDF evaluated at the provided argument.
+
+        Notes
+        -----
+        Suppose a probability distribution has support :math:`[l, r]`. The
+        inverse CCDF returns its minimum value of :math:`l` at :math:`p = 1`
+        and its maximum value of :math:`r` at :math:`p = 0`. Because the CCDF
+        has range :math:`[0, 1]`, the inverse CCDF is only defined on the
+        domain :math:`[0, 1]`; for :math:`p < 0` and :math:`p > 1`, ``iccdf``
+        returns ``nan``.
+
+        See Also
+        --------
+        icdf
+        ilogccdf
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> import numpy as np
+        >>> from scipy import stats
+        >>> X = stats.Uniform(a=-0.5, b=0.5)
+
+        Evaluate the inverse CCDF at the desired argument:
+
+        >>> X.iccdf(0.25)
+        0.25
+        >>> np.allclose(X.iccdf(0.25), X.icdf(1-0.25))
+        True
+
+        This function returns NaN when the argument is outside the domain.
+
+        >>> X.iccdf([-0.1, 0, 1, 1.1])
+        array([ nan,  0.5, -0.5,  nan])
+
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def logcdf(self, x, y, /, *, method):
+        r"""Log of the cumulative distribution function
+
+        The cumulative distribution function ("CDF"), denoted :math:`F(x)`, is
+        the probability the random variable :math:`X` will assume a value
+        less than or equal to :math:`x`:
+
+        .. math::
+
+            F(x) = P(X ≤ x)
+
+        A two-argument variant of this function is also defined as the
+        probability the random variable :math:`X` will assume a value between
+        :math:`x` and :math:`y`.
+
+        .. math::
+
+            F(x, y) = P(x ≤ X ≤ y)
+
+        `logcdf` computes the logarithm of the cumulative distribution function
+        ("log-CDF"), :math:`\log(F(x))`/:math:`\log(F(x, y))`, but it may be
+        numerically favorable compared to the naive implementation (computing
+        the CDF and taking the logarithm).
+
+        `logcdf` accepts `x` for :math:`x` and `y` for :math:`y`.
+
+        Parameters
+        ----------
+        x, y : array_like
+            The arguments of the log-CDF. `x` is required; `y` is optional.
+        method : {None, 'formula', 'logexp', 'complement', 'quadrature', 'subtraction'}
+            The strategy used to evaluate the log-CDF.
+            By default (``None``), the one-argument form of the function
+            chooses between the following options, listed in order of precedence.
+
+            - ``'formula'``: use a formula for the log-CDF itself
+            - ``'logexp'``: evaluate the CDF and take the logarithm
+            - ``'complement'``: evaluate the log-CCDF and take the
+              logarithmic complement (see Notes)
+            - ``'quadrature'``: numerically log-integrate the log-PDF (or, in the
+              discrete case, log-sum the log-PMF)
+
+            In place of ``'complement'``, the two-argument form accepts:
+
+            - ``'subtraction'``: compute the log-CDF at each argument and take
+              the logarithmic difference (see Notes)
+
+            Not all `method` options are available for all distributions.
+            If the selected `method` is not available, a ``NotImplementedError``
+            will be raised.
+
+        Returns
+        -------
+        out : array
+            The log-CDF evaluated at the provided argument(s).
+
+        See Also
+        --------
+        cdf
+        logccdf
+
+        Notes
+        -----
+        Suppose a continuous probability distribution has support :math:`[l, r]`.
+        The log-CDF evaluates to its minimum value of :math:`\log(0) = -\infty`
+        for :math:`x ≤ l` and its maximum value of :math:`\log(1) = 0` for
+        :math:`x ≥ r`. An analogous statement can be made for discrete distributions,
+        but the inequality governing the minimum value is strict.
+
+        For distributions with infinite support, it is common for
+        `cdf` to return a value of ``0`` when the argument
+        is theoretically within the support; this can occur because the true value
+        of the CDF is too small to be represented by the chosen dtype. `logcdf`,
+        however, will often return a finite (not ``-inf``) result over a much larger
+        domain. Similarly, `logcdf` may provide a strictly negative result with
+        arguments for which `cdf` would return ``1.0``. Consequently, it may be
+        preferred to work with the logarithms of probabilities to avoid underflow
+        and related limitations of floating point numbers.
+
+        The "logarithmic complement" of a number :math:`z` is mathematically
+        equivalent to :math:`\log(1-\exp(z))`, but it is computed to avoid loss
+        of precision when :math:`\exp(z)` is nearly :math:`0` or :math:`1`.
+        Similarly, the term "logarithmic difference" of :math:`w` and :math:`z`
+        is used here to mean :math:`\log(\exp(w)-\exp(z))`.
+
+        If ``y < x``, the CDF is negative, and therefore the log-CDF
+        is complex with imaginary part :math:`\pi`. For
+        consistency, the result of this function always has complex dtype
+        when `y` is provided, regardless of the value of the imaginary part.
+
+        References
+        ----------
+        .. [1] Cumulative distribution function, *Wikipedia*,
+               https://en.wikipedia.org/wiki/Cumulative_distribution_function
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> import numpy as np
+        >>> from scipy import stats
+        >>> X = stats.Uniform(a=-0.5, b=0.5)
+
+        Evaluate the log-CDF at the desired argument:
+
+        >>> X.logcdf(0.25)
+        -0.287682072451781
+        >>> np.allclose(X.logcdf(0.), np.log(X.cdf(0.)))
+        True
+
+        """  # noqa: E501
+        raise NotImplementedError()
+
+    @abstractmethod
+    def ilogcdf(self, logp, /, *, method):
+        r"""Inverse of the logarithm of the cumulative distribution function.
+
+        The inverse of the logarithm of the cumulative distribution function
+        ("inverse log-CDF") is the argument :math:`x` for which the logarithm
+        of the cumulative distribution function :math:`\log(F(x))` evaluates
+        to :math:`\log(p)`.
+
+        Mathematically, it is equivalent to :math:`F^{-1}(\exp(y))`, where
+        :math:`y = \log(p)`, but it may be numerically favorable compared to
+        the naive implementation (computing :math:`p = \exp(y)`, then
+        :math:`F^{-1}(p)`).
+
+        `ilogcdf` accepts `logp` for :math:`\log(p) ≤ 0`.
+
+        Parameters
+        ----------
+        logp : array_like
+            The argument of the inverse log-CDF.
+        method : {None, 'formula', 'complement', 'inversion'}
+            The strategy used to evaluate the inverse log-CDF.
+            By default (``None``), the infrastructure chooses between the
+            following options, listed in order of precedence.
+
+            - ``'formula'``: use a formula for the inverse log-CDF itself
+            - ``'complement'``: evaluate the inverse log-CCDF at the
+              logarithmic complement of `logp` (see Notes)
+            - ``'inversion'``: solve numerically for the argument at which the
+              log-CDF is equal to `logp`
+
+            Not all `method` options are available for all distributions.
+            If the selected `method` is not available, a ``NotImplementedError``
+            will be raised.
+
+        Returns
+        -------
+        out : array
+            The inverse log-CDF evaluated at the provided argument.
+
+        See Also
+        --------
+        icdf
+        logcdf
+
+        Notes
+        -----
+        Suppose a probability distribution has support :math:`[l, r]`.
+        The inverse log-CDF returns its minimum value of :math:`l` at
+        :math:`\log(p) = \log(0) = -\infty` and its maximum value of :math:`r` at
+        :math:`\log(p) = \log(1) = 0`. Because the log-CDF has range
+        :math:`[-\infty, 0]`, the inverse log-CDF is only defined on the
+        negative reals; for :math:`\log(p) > 0`, `ilogcdf` returns ``nan``.
+
+        Occasionally, it is needed to find the argument of the CDF for which
+        the resulting probability is very close to ``0`` or ``1`` - too close to
+        represent accurately with floating point arithmetic. In many cases,
+        however, the *logarithm* of this resulting probability may be
+        represented in floating point arithmetic, in which case this function
+        may be used to find the argument of the CDF for which the *logarithm*
+        of the resulting probability is :math:`y = \log(p)`.
+
+        The "logarithmic complement" of a number :math:`z` is mathematically
+        equivalent to :math:`\log(1-\exp(z))`, but it is computed to avoid loss
+        of precision when :math:`\exp(z)` is nearly :math:`0` or :math:`1`.
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> import numpy as np
+        >>> from scipy import stats
+        >>> X = stats.Uniform(a=-0.5, b=0.5)
+
+        Evaluate the inverse log-CDF at the desired argument:
+
+        >>> X.ilogcdf(-0.25)
+        0.2788007830714034
+        >>> np.allclose(X.ilogcdf(-0.25), X.icdf(np.exp(-0.25)))
+        True
+
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def logccdf(self, x, y, /, *, method):
+        r"""Log of the complementary cumulative distribution function
+
+        The complementary cumulative distribution function ("CCDF"), denoted
+        :math:`G(x)` is the complement of the cumulative distribution function
+        :math:`F(x)`; i.e., probability the random variable :math:`X` will
+        assume a value greater than :math:`x`:
+
+        .. math::
+
+            G(x) = 1 - F(x) = P(X > x)
+
+        A two-argument variant of this function is:
+
+        .. math::
+
+            G(x, y) = 1 - F(x, y) = P(X < x \quad \text{or} \quad X > y)
+
+        `logccdf` computes the logarithm of the complementary cumulative
+        distribution function ("log-CCDF"), :math:`\log(G(x))`/:math:`\log(G(x, y))`,
+        but it may be numerically favorable compared to the naive implementation
+        (computing the CCDF and taking the logarithm).
+
+        `logccdf` accepts `x` for :math:`x` and `y` for :math:`y`.
+
+        Parameters
+        ----------
+        x, y : array_like
+            The arguments of the log-CCDF. `x` is required; `y` is optional.
+        method : {None, 'formula', 'logexp', 'complement', 'quadrature', 'addition'}
+            The strategy used to evaluate the log-CCDF.
+            By default (``None``), the one-argument form of the function
+            chooses between the following options, listed in order of precedence.
+
+            - ``'formula'``: use a formula for the log-CCDF itself
+            - ``'logexp'``: evaluate the CCDF and take the logarithm
+            - ``'complement'``: evaluate the log-CDF and take the
+              logarithmic complement (see Notes)
+            - ``'quadrature'``: numerically log-integrate the log-PDF (or, in the
+              discrete case, log-sum the log-PMF)
+
+            The two-argument form chooses between:
+
+            - ``'formula'``: use a formula for the log-CCDF itself
+            - ``'addition'``: compute the log-CDF at `x` and the log-CCDF at `y`,
+              then take the logarithmic sum (see Notes)
+
+            Not all `method` options are available for all distributions.
+            If the selected `method` is not available, a ``NotImplementedError``
+            will be raised.
+
+        Returns
+        -------
+        out : array
+            The log-CCDF evaluated at the provided argument(s).
+
+        See Also
+        --------
+        ccdf
+        logcdf
+
+        Notes
+        -----
+        Suppose a continuous probability distribution has support :math:`[l, r]`.
+        The log-CCDF returns its minimum value of :math:`\log(0)=-\infty` for
+        :math:`x ≥ r` and its maximum value of :math:`\log(1) = 0` for
+        :math:`x ≤ l`. An analogous statement can be made for discrete distributions,
+        but the inequality governing the maximum value is strict.
+
+        For distributions with infinite support, it is common for
+        `ccdf` to return a value of ``0`` when the argument
+        is theoretically within the support; this can occur because the true value
+        of the CCDF is too small to be represented by the chosen dtype. The log
+        of the CCDF, however, will often be finite (not ``-inf``) over a much larger
+        domain. Similarly, `logccdf` may provide a strictly negative result with
+        arguments for which `ccdf` would return ``1.0``. Consequently, it may be
+        preferred to work with the logarithms of probabilities to avoid underflow
+        and related limitations of floating point numbers.
+
+        The "logarithmic complement" of a number :math:`z` is mathematically
+        equivalent to :math:`\log(1-\exp(z))`, but it is computed to avoid loss
+        of precision when :math:`\exp(z)` is nearly :math:`0` or :math:`1`.
+        Similarly, the term "logarithmic sum" of :math:`w` and :math:`z`
+        is used here to mean :math:`\log(\exp(w)+\exp(z))`, AKA
+        :math:`\text{LogSumExp}(w, z)`.
+
+        References
+        ----------
+        .. [1] Cumulative distribution function, *Wikipedia*,
+               https://en.wikipedia.org/wiki/Cumulative_distribution_function#Derived_functions
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> import numpy as np
+        >>> from scipy import stats
+        >>> X = stats.Uniform(a=-0.5, b=0.5)
+
+        Evaluate the log-CCDF at the desired argument:
+
+        >>> X.logccdf(0.25)
+        -1.3862943611198906
+        >>> np.allclose(X.logccdf(0.), np.log(X.ccdf(0.)))
+        True
+
+        """  # noqa: E501
+        raise NotImplementedError()
+
+    @abstractmethod
+    def ilogccdf(self, logp, /, *, method):
+        r"""Inverse of the log of the complementary cumulative distribution function.
+
+        The inverse of the logarithm of the complementary cumulative distribution
+        function ("inverse log-CCDF") is the argument :math:`x` for which the logarithm
+        of the complementary cumulative distribution function :math:`\log(G(x))`
+        evaluates to :math:`\log(p)`.
+
+        Mathematically, it is equivalent to :math:`G^{-1}(\exp(y))`, where
+        :math:`y = \log(p)`, but it may be numerically favorable compared to the naive
+        implementation (computing :math:`p = \exp(y)`, then :math:`G^{-1}(p)`).
+
+        `ilogccdf` accepts `logp` for :math:`\log(p) ≤ 0`.
+
+        Parameters
+        ----------
+        logp : array_like
+            The argument of the inverse log-CCDF.
+        method : {None, 'formula', 'complement', 'inversion'}
+            The strategy used to evaluate the inverse log-CCDF.
+            By default (``None``), the infrastructure chooses between the
+            following options, listed in order of precedence.
+
+            - ``'formula'``: use a formula for the inverse log-CCDF itself
+            - ``'complement'``: evaluate the inverse log-CDF at the
+              logarithmic complement of `logp` (see Notes)
+            - ``'inversion'``: solve numerically for the argument at which the
+              log-CCDF is equal to `logp`
+
+            Not all `method` options are available for all distributions.
+            If the selected `method` is not available, a ``NotImplementedError``
+            will be raised.
+
+        Returns
+        -------
+        out : array
+            The inverse log-CCDF evaluated at the provided argument.
+
+        Notes
+        -----
+        Suppose a probability distribution has support :math:`[l, r]`. The
+        inverse log-CCDF returns its minimum value of :math:`l` at
+        :math:`\log(p) = \log(1) = 0` and its maximum value of :math:`r` at
+        :math:`\log(p) = \log(0) = -\infty`. Because the log-CCDF has range
+        :math:`[-\infty, 0]`, the inverse log-CDF is only defined on the
+        negative reals; for :math:`\log(p) > 0`, `ilogccdf` returns ``nan``.
+
+        Occasionally, it is needed to find the argument of the CCDF for which
+        the resulting probability is very close to ``0`` or ``1`` - too close to
+        represent accurately with floating point arithmetic. In many cases,
+        however, the *logarithm* of this resulting probability may be
+        represented in floating point arithmetic, in which case this function
+        may be used to find the argument of the CCDF for which the *logarithm*
+        of the resulting probability is :math:`y = \log(p)`.
+
+        The "logarithmic complement" of a number :math:`z` is mathematically
+        equivalent to :math:`\log(1-\exp(z))`, but it is computed to avoid loss
+        of precision when :math:`\exp(z)` is nearly :math:`0` or :math:`1`.
+
+        See Also
+        --------
+        iccdf
+        ilogccdf
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> import numpy as np
+        >>> from scipy import stats
+        >>> X = stats.Uniform(a=-0.5, b=0.5)
+
+        Evaluate the inverse log-CCDF at the desired argument:
+
+        >>> X.ilogccdf(-0.25)
+        -0.2788007830714034
+        >>> np.allclose(X.ilogccdf(-0.25), X.iccdf(np.exp(-0.25)))
+        True
+
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def logentropy(self, *, method):
+        r"""Logarithm of the differential entropy.
+
+        In terms of probability density function :math:`f(x)` and support
+        :math:`\chi`, the differential entropy (or simply "entropy") of a
+        continuous random variable :math:`X` is:
+
+        .. math::
+
+            h(X) = - \int_{\chi} f(x) \log f(x) dx
+
+        The definition for a discrete random variable is analogous, with the PMF
+        replacing the PDF and a sum over the support replacing the integral.
+
+        `logentropy` computes the logarithm of the differential entropy
+        ("log-entropy"), :math:`\log(h(X))`, but it may be numerically favorable
+        compared to the naive implementation (computing :math:`h(X)` then
+        taking the logarithm).
+
+        Parameters
+        ----------
+        method : {None, 'formula', 'logexp', 'quadrature'}
+            The strategy used to evaluate the log-entropy. By default
+            (``None``), the infrastructure chooses between the following options,
+            listed in order of precedence.
+
+            - ``'formula'``: use a formula for the log-entropy itself
+            - ``'logexp'``: evaluate the entropy and take the logarithm
+            - ``'quadrature'``: numerically log-integrate (or, in the discrete
+              case, log-sum) the logarithm of the entropy integrand (summand)
+
+            Not all `method` options are available for all distributions.
+            If the selected `method` is not available, a ``NotImplementedError``
+            will be raised.
+
+        Returns
+        -------
+        out : array
+            The log-entropy.
+
+        See Also
+        --------
+        entropy
+        logpdf
+
+        Notes
+        -----
+        The differential entropy of a continuous distribution can be negative.
+        In this case, the log-entropy is complex with imaginary part :math:`\pi`.
+        For consistency, the result of this function always has complex dtype,
+        regardless of the value of the imaginary part.
+
+        References
+        ----------
+        .. [1] Differential entropy, *Wikipedia*,
+               https://en.wikipedia.org/wiki/Differential_entropy
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> import numpy as np
+        >>> from scipy import stats
+        >>> X = stats.Uniform(a=-1., b=1.)
+
+        Evaluate the log-entropy:
+
+        >>> X.logentropy()
+        (-0.3665129205816642+0j)
+        >>> np.allclose(np.exp(X.logentropy()), X.entropy())
+        True
+
+        For a random variable with negative entropy, the log-entropy has an
+        imaginary part equal to `np.pi`.
+
+        >>> X = stats.Uniform(a=-.1, b=.1)
+        >>> X.entropy(), X.logentropy()
+        (-1.6094379124341007, (0.4758849953271105+3.141592653589793j))
+
+        """
+        # Abstract stub: concrete distributions must override this method.
+        raise NotImplementedError()
+
+    @abstractmethod
+    def entropy(self, *, method):
+        r"""Differential entropy.
+
+        In terms of probability density function :math:`f(x)` and support
+        :math:`\chi`, the differential entropy (or simply "entropy") of a
+        continuous random variable :math:`X` is:
+
+        .. math::
+
+            h(X) = - \int_{\chi} f(x) \log f(x) dx
+
+        The definition for a discrete random variable is analogous, with the
+        PMF replacing the PDF and a sum over the support replacing the integral.
+
+        Parameters
+        ----------
+        method : {None, 'formula', 'logexp', 'quadrature'}
+            The strategy used to evaluate the entropy. By default (``None``),
+            the infrastructure chooses between the following options, listed
+            in order of precedence.
+
+            - ``'formula'``: use a formula for the entropy itself
+            - ``'logexp'``: evaluate the log-entropy and exponentiate
+            - ``'quadrature'``: numerically integrate (or, in the discrete
+              case, sum) the entropy integrand (summand)
+
+            Not all `method` options are available for all distributions.
+            If the selected `method` is not available, a ``NotImplementedError``
+            will be raised.
+
+        Returns
+        -------
+        out : array
+            The entropy of the random variable.
+
+        See Also
+        --------
+        logentropy
+        pdf
+
+        Notes
+        -----
+        This function calculates the entropy using the natural logarithm; i.e.
+        the logarithm with base :math:`e`. Consequently, the value is expressed
+        in (dimensionless) "units" of nats. To convert the entropy to different
+        units (i.e. corresponding with a different base), divide the result by
+        the natural logarithm of the desired base.
+
+        References
+        ----------
+        .. [1] Differential entropy, *Wikipedia*,
+               https://en.wikipedia.org/wiki/Differential_entropy
+
+        Examples
+        --------
+        Instantiate a distribution with the desired parameters:
+
+        >>> from scipy import stats
+        >>> X = stats.Uniform(a=-1., b=1.)
+
+        Evaluate the entropy:
+
+        >>> X.entropy()
+        0.6931471805599454
+
+        """
+        # Abstract stub: concrete distributions must override this method.
+        raise NotImplementedError()
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_qmc.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_qmc.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae29bea808232f57fefaf39487c1a141b4033545
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_qmc.py
@@ -0,0 +1,2956 @@
+"""Quasi-Monte Carlo engines and helpers."""
+import copy
+import math
+import numbers
+import os
+import warnings
+from abc import ABC, abstractmethod
+from functools import partial
+from typing import (
+    ClassVar,
+    Literal,
+    overload,
+    TYPE_CHECKING,
+)
+from collections.abc import Callable
+
+import numpy as np
+
+from scipy._lib._util import DecimalNumber, GeneratorType, IntNumber, SeedType
+
+if TYPE_CHECKING:
+    import numpy.typing as npt
+
+import scipy.stats as stats
+from scipy._lib._util import rng_integers, _rng_spawn, _transition_to_rng
+from scipy.sparse.csgraph import minimum_spanning_tree
+from scipy.spatial import distance, Voronoi
+from scipy.special import gammainc
+from ._sobol import (
+    _initialize_v, _cscramble, _fill_p_cumulative, _draw, _fast_forward,
+    _categorize, _MAXDIM
+)
+from ._qmc_cy import (
+    _cy_wrapper_centered_discrepancy,
+    _cy_wrapper_wrap_around_discrepancy,
+    _cy_wrapper_mixture_discrepancy,
+    _cy_wrapper_l2_star_discrepancy,
+    _cy_wrapper_update_discrepancy,
+    _cy_van_der_corput_scrambled,
+    _cy_van_der_corput,
+)
+
+
+__all__ = ['scale', 'discrepancy', 'geometric_discrepancy', 'update_discrepancy',
+           'QMCEngine', 'Sobol', 'Halton', 'LatinHypercube', 'PoissonDisk',
+           'MultinomialQMC', 'MultivariateNormalQMC']
+
+
+@overload
+def check_random_state(seed: IntNumber | None = ...) -> np.random.Generator:
+    ...
+
+
+@overload
+def check_random_state(seed: GeneratorType) -> GeneratorType:
+    ...
+
+
+# Local variant of scipy._lib._util.check_random_state.
+# It is scheduled for removal once the SPEC 7 transition is finished,
+# which is why the argument keeps its legacy name `seed`.
+def check_random_state(seed=None):
+    """Coerce `seed` into a NumPy random number generator object.
+
+    Parameters
+    ----------
+    seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
+        ``None`` or an integer is forwarded to ``np.random.default_rng`` to
+        build a fresh `numpy.random.Generator`.
+        A ``Generator`` or ``RandomState`` instance is returned as is.
+
+    Returns
+    -------
+    seed : {`numpy.random.Generator`, `numpy.random.RandomState`}
+        Random number generator.
+
+    Raises
+    ------
+    ValueError
+        If `seed` is none of the accepted types.
+
+    """
+    # An already-constructed RNG is handed back untouched.
+    if isinstance(seed, np.random.RandomState | np.random.Generator):
+        return seed
+
+    # None (OS entropy) or any integer type seeds a new Generator.
+    if seed is None or isinstance(seed, numbers.Integral | np.integer):
+        return np.random.default_rng(seed)
+
+    raise ValueError(f'{seed!r} cannot be used to seed a'
+                     ' numpy.random.Generator instance')
+
+
+def scale(
+    sample: "npt.ArrayLike",
+    l_bounds: "npt.ArrayLike",
+    u_bounds: "npt.ArrayLike",
+    *,
+    reverse: bool = False
+) -> np.ndarray:
+    r"""Rescale a sample between the unit hypercube and arbitrary bounds.
+
+    A sample in :math:`[0, 1)` is mapped to :math:`[a, b), b>a`, where
+    :math:`a` are the lower bounds and :math:`b` the upper bounds, through
+    the transformation:
+
+    .. math::
+
+        (b - a) \cdot \text{sample} + a
+
+    Parameters
+    ----------
+    sample : array_like (n, d)
+        Sample to scale.
+    l_bounds, u_bounds : array_like (d,)
+        Lower and upper bounds (resp. :math:`a`, :math:`b`) of transformed
+        data. If `reverse` is True, range of the original data to transform
+        to the unit hypercube.
+    reverse : bool, optional
+        Apply the inverse transformation, i.e. map a sample lying within the
+        bounds back to the unit hypercube. Default is False.
+
+    Returns
+    -------
+    sample : array_like (n, d)
+        Scaled sample.
+
+    Raises
+    ------
+    ValueError
+        If `sample` is not 2D, if it lies outside the unit hypercube
+        (forward direction), or outside the bounds (reverse direction).
+
+    Examples
+    --------
+    Transform 3 samples in the unit hypercube to bounds:
+
+    >>> from scipy.stats import qmc
+    >>> l_bounds = [-2, 0]
+    >>> u_bounds = [6, 5]
+    >>> sample = [[0.5 , 0.75],
+    ...           [0.5 , 0.5],
+    ...           [0.75, 0.25]]
+    >>> sample_scaled = qmc.scale(sample, l_bounds, u_bounds)
+    >>> sample_scaled
+    array([[2.  , 3.75],
+           [2.  , 2.5 ],
+           [4.  , 1.25]])
+
+    And convert back to the unit hypercube:
+
+    >>> sample_ = qmc.scale(sample_scaled, l_bounds, u_bounds, reverse=True)
+    >>> sample_
+    array([[0.5 , 0.75],
+           [0.5 , 0.5 ],
+           [0.75, 0.25]])
+
+    """
+    sample = np.asarray(sample)
+
+    # The number of columns is needed to validate the bounds, so the
+    # dimensionality check has to come first.
+    if sample.ndim != 2:
+        raise ValueError('Sample is not a 2D array')
+
+    lower, upper = _validate_bounds(
+        l_bounds=l_bounds, u_bounds=u_bounds, d=sample.shape[1]
+    )
+
+    if reverse:
+        # Bounds -> unit hypercube: every point must lie within the bounds.
+        within_bounds = np.all(sample >= lower) and np.all(sample <= upper)
+        if not within_bounds:
+            raise ValueError('Sample is out of bounds')
+        return (sample - lower) / (upper - lower)
+
+    # Unit hypercube -> bounds: every point must lie within [0, 1].
+    if (sample.max() > 1.) or (sample.min() < 0.):
+        raise ValueError('Sample is not in unit hypercube')
+    return sample * (upper - lower) + lower
+
+
+def _ensure_in_unit_hypercube(sample: "npt.ArrayLike") -> np.ndarray:
+    """Validate that `sample` is a 2D set of points inside the unit hypercube.
+
+    Parameters
+    ----------
+    sample : array_like (n, d)
+        A 2D array of points.
+
+    Returns
+    -------
+    np.ndarray
+        `sample` converted to a C-ordered ``float64`` array.
+
+    Raises
+    ------
+    ValueError
+        If the input is not a 2D array or contains points outside of
+        a unit hypercube.
+    """
+    points = np.asarray(sample, dtype=np.float64, order="C")
+
+    if points.ndim != 2:
+        raise ValueError("Sample is not a 2D array")
+
+    # Comparing the global extrema against [0, 1] covers every coordinate.
+    outside = (points.max() > 1.) or (points.min() < 0.)
+    if outside:
+        raise ValueError("Sample is not in unit hypercube")
+
+    return points
+
+
+def discrepancy(
+        sample: "npt.ArrayLike",
+        *,
+        iterative: bool = False,
+        method: Literal["CD", "WD", "MD", "L2-star"] = "CD",
+        workers: IntNumber = 1) -> float:
+    """Discrepancy of a given sample.
+
+    Parameters
+    ----------
+    sample : array_like (n, d)
+        The sample to compute the discrepancy from.
+    iterative : bool, optional
+        Must be False if not using it for updating the discrepancy.
+        Default is False. Refer to the notes for more details.
+    method : str, optional
+        Type of discrepancy, can be ``CD``, ``WD``, ``MD`` or ``L2-star``.
+        Refer to the notes for more details. Default is ``CD``.
+    workers : int, optional
+        Number of workers to use for parallel processing. If -1 is given all
+        CPU threads are used. Default is 1.
+
+    Returns
+    -------
+    discrepancy : float
+        Discrepancy.
+
+    Raises
+    ------
+    ValueError
+        If `method` is not one of the supported discrepancy types.
+
+    See Also
+    --------
+    geometric_discrepancy
+
+    Notes
+    -----
+    The discrepancy is a uniformity criterion used to assess the space filling
+    of a number of samples in a hypercube. A discrepancy quantifies the
+    distance between the continuous uniform distribution on a hypercube and the
+    discrete uniform distribution on :math:`n` distinct sample points.
+
+    The lower the value is, the better the coverage of the parameter space is.
+
+    For a collection of subsets of the hypercube, the discrepancy is the
+    difference between the fraction of sample points in one of those
+    subsets and the volume of that subset. There are different definitions of
+    discrepancy corresponding to different collections of subsets. Some
+    versions take a root mean square difference over subsets instead of
+    a maximum.
+
+    A measure of uniformity is reasonable if it satisfies the following
+    criteria [1]_:
+
+    1. It is invariant under permuting factors and/or runs.
+    2. It is invariant under rotation of the coordinates.
+    3. It can measure not only uniformity of the sample over the hypercube,
+       but also the projection uniformity of the sample over non-empty
+       subset of lower dimension hypercubes.
+    4. There is some reasonable geometric meaning.
+    5. It is easy to compute.
+    6. It satisfies the Koksma-Hlawka-like inequality.
+    7. It is consistent with other criteria in experimental design.
+
+    Four methods are available:
+
+    * ``CD``: Centered Discrepancy - subspace involves a corner of the
+      hypercube
+    * ``WD``: Wrap-around Discrepancy - subspace can wrap around bounds
+    * ``MD``: Mixture Discrepancy - mix between CD/WD covering more criteria
+    * ``L2-star``: L2-star discrepancy - like CD BUT variant to rotation
+
+    Methods ``CD``, ``WD``, and ``MD`` implement the right hand side of equations
+    9, 10, and 18 of [2]_, respectively; the square root is not taken. On the
+    other hand, ``L2-star`` computes the quantity given by equation 10 of
+    [3]_ as implemented by subsequent equations; the square root is taken.
+
+    Lastly, using ``iterative=True``, it is possible to compute the
+    discrepancy as if we had :math:`n+1` samples. This is useful if we want
+    to add a point to a sampling and check the candidate which would give the
+    lowest discrepancy. Then you could just update the discrepancy with
+    each candidate using `update_discrepancy`. This method is faster than
+    computing the discrepancy for a large number of candidates.
+
+    References
+    ----------
+    .. [1] Fang et al. "Design and modeling for computer experiments".
+       Computer Science and Data Analysis Series, 2006.
+    .. [2] Zhou Y.-D. et al. "Mixture discrepancy for quasi-random point sets."
+       Journal of Complexity, 29 (3-4) , pp. 283-301, 2013.
+    .. [3] T. T. Warnock. "Computational investigations of low discrepancy
+       point sets." Applications of Number Theory to Numerical
+       Analysis, Academic Press, pp. 319-343, 1972.
+
+    Examples
+    --------
+    Calculate the quality of the sample using the discrepancy:
+
+    >>> import numpy as np
+    >>> from scipy.stats import qmc
+    >>> space = np.array([[1, 3], [2, 6], [3, 2], [4, 5], [5, 1], [6, 4]])
+    >>> l_bounds = [0.5, 0.5]
+    >>> u_bounds = [6.5, 6.5]
+    >>> space = qmc.scale(space, l_bounds, u_bounds, reverse=True)
+    >>> space
+    array([[0.08333333, 0.41666667],
+           [0.25      , 0.91666667],
+           [0.41666667, 0.25      ],
+           [0.58333333, 0.75      ],
+           [0.75      , 0.08333333],
+           [0.91666667, 0.58333333]])
+    >>> qmc.discrepancy(space)
+    0.008142039609053464
+
+    We can also compute iteratively the ``CD`` discrepancy by using
+    ``iterative=True``.
+
+    >>> disc_init = qmc.discrepancy(space[:-1], iterative=True)
+    >>> disc_init
+    0.04769081147119336
+    >>> qmc.update_discrepancy(space[-1], space[:-1], disc_init)
+    0.008142039609053513
+
+    """
+    # Inputs are validated before the method name is looked at.
+    sample = _ensure_in_unit_hypercube(sample)
+    workers = _validate_workers(workers)
+
+    # Method name -> compiled routine implementing it.
+    dispatch = {
+        "CD": _cy_wrapper_centered_discrepancy,
+        "WD": _cy_wrapper_wrap_around_discrepancy,
+        "MD": _cy_wrapper_mixture_discrepancy,
+        "L2-star": _cy_wrapper_l2_star_discrepancy,
+    }
+
+    compute = dispatch.get(method)
+    if compute is None:
+        raise ValueError(f"{method!r} is not a valid method. It must be one of"
+                         f" {set(dispatch)!r}")
+
+    return compute(sample, iterative, workers=workers)
+
+
+def geometric_discrepancy(
+        sample: "npt.ArrayLike",
+        method: Literal["mindist", "mst"] = "mindist",
+        metric: str = "euclidean") -> float:
+    """Discrepancy of a given sample based on its geometric properties.
+
+    Parameters
+    ----------
+    sample : array_like (n, d)
+        The sample to compute the discrepancy from.
+    method : {"mindist", "mst"}, optional
+        The method to use. One of ``mindist`` for minimum distance (default)
+        or ``mst`` for minimum spanning tree.
+    metric : str or callable, optional
+        The distance metric to use. See the documentation
+        for `scipy.spatial.distance.pdist` for the available metrics and
+        the default.
+
+    Returns
+    -------
+    discrepancy : float
+        Discrepancy (higher values correspond to greater sample uniformity).
+
+    Raises
+    ------
+    ValueError
+        If `sample` holds fewer than two points or `method` is unknown.
+
+    Warns
+    -----
+    UserWarning
+        If at least one pairwise distance is exactly zero, i.e. the sample
+        contains duplicate points.
+
+    See Also
+    --------
+    discrepancy
+
+    Notes
+    -----
+    The discrepancy can serve as a simple measure of quality of a random sample.
+    This measure is based on the geometric properties of the distribution of points
+    in the sample, such as the minimum distance between any pair of points, or
+    the mean edge length in a minimum spanning tree.
+
+    The higher the value is, the better the coverage of the parameter space is.
+    Note that this is different from `scipy.stats.qmc.discrepancy`, where lower
+    values correspond to higher quality of the sample.
+
+    Also note that when comparing different sampling strategies using this function,
+    the sample size must be kept constant.
+
+    It is possible to calculate two metrics from the minimum spanning tree:
+    the mean edge length and the standard deviation of edges lengths. Using
+    both metrics offers a better picture of uniformity than either metric alone,
+    with higher mean and lower standard deviation being preferable (see [1]_
+    for a brief discussion). This function currently only calculates the mean
+    edge length.
+
+    References
+    ----------
+    .. [1] Franco J. et al. "Minimum Spanning Tree: A new approach to assess the quality
+       of the design of computer experiments." Chemometrics and Intelligent Laboratory
+       Systems, 97 (2), pp. 164-169, 2009.
+
+    Examples
+    --------
+    Calculate the quality of the sample using the minimum euclidean distance
+    (the defaults):
+
+    >>> import numpy as np
+    >>> from scipy.stats import qmc
+    >>> rng = np.random.default_rng(191468432622931918890291693003068437394)
+    >>> sample = qmc.LatinHypercube(d=2, rng=rng).random(50)
+    >>> qmc.geometric_discrepancy(sample)
+    0.03708161435687876
+
+    Calculate the quality using the mean edge length in the minimum
+    spanning tree:
+
+    >>> qmc.geometric_discrepancy(sample, method='mst')
+    0.1105149978798376
+
+    Display the minimum spanning tree and the points with
+    the smallest distance:
+
+    >>> import matplotlib.pyplot as plt
+    >>> from matplotlib.lines import Line2D
+    >>> from scipy.sparse.csgraph import minimum_spanning_tree
+    >>> from scipy.spatial.distance import pdist, squareform
+    >>> dist = pdist(sample)
+    >>> mst = minimum_spanning_tree(squareform(dist))
+    >>> edges = np.where(mst.toarray() > 0)
+    >>> edges = np.asarray(edges).T
+    >>> min_dist = np.min(dist)
+    >>> min_idx = np.argwhere(squareform(dist) == min_dist)[0]
+    >>> fig, ax = plt.subplots(figsize=(10, 5))
+    >>> _ = ax.set(aspect='equal', xlabel=r'$x_1$', ylabel=r'$x_2$',
+    ...            xlim=[0, 1], ylim=[0, 1])
+    >>> for edge in edges:
+    ...     ax.plot(sample[edge, 0], sample[edge, 1], c='k')
+    >>> ax.scatter(sample[:, 0], sample[:, 1])
+    >>> ax.add_patch(plt.Circle(sample[min_idx[0]], min_dist, color='red', fill=False))
+    >>> markers = [
+    ...     Line2D([0], [0], marker='o', lw=0, label='Sample points'),
+    ...     Line2D([0], [0], color='k', label='Minimum spanning tree'),
+    ...     Line2D([0], [0], marker='o', lw=0, markerfacecolor='w', markeredgecolor='r',
+    ...            label='Minimum point-to-point distance'),
+    ... ]
+    >>> ax.legend(handles=markers, loc='center left', bbox_to_anchor=(1, 0.5));
+    >>> plt.show()
+
+    """
+    sample = _ensure_in_unit_hypercube(sample)
+    if sample.shape[0] < 2:
+        raise ValueError("Sample must contain at least two points")
+
+    # Condensed vector of all pairwise distances.
+    pairwise = distance.pdist(sample, metric=metric)  # type: ignore[call-overload]
+
+    has_duplicates = np.any(pairwise == 0.0)
+    if has_duplicates:
+        warnings.warn("Sample contains duplicate points.", stacklevel=2)
+
+    if method == "mindist":
+        # Zero distances (duplicates) are excluded from the minimum.
+        return np.min(pairwise[pairwise.nonzero()])
+
+    if method == "mst":
+        tree = minimum_spanning_tree(distance.squareform(pairwise))
+        edge_lengths = tree[tree.nonzero()]
+        # TODO consider returning both the mean and the standard deviation
+        # see [1] for a discussion
+        return np.mean(edge_lengths)
+
+    raise ValueError(f"{method!r} is not a valid method. "
+                     f"It must be one of {{'mindist', 'mst'}}")
+
+
+def update_discrepancy(
+        x_new: "npt.ArrayLike",
+        sample: "npt.ArrayLike",
+        initial_disc: DecimalNumber) -> float:
+    """Update the centered discrepancy with a new sample.
+
+    Parameters
+    ----------
+    x_new : array_like (d,)
+        The new point to add in `sample`. It must be one-dimensional, with
+        as many entries as `sample` has columns.
+    sample : array_like (n, d)
+        The initial sample.
+    initial_disc : float
+        Centered discrepancy of the `sample`.
+
+    Returns
+    -------
+    discrepancy : float
+        Centered discrepancy of the sample composed of `x_new` and `sample`.
+
+    Raises
+    ------
+    ValueError
+        If `sample` is not 2D, `x_new` is not 1D, either lies outside the
+        unit hypercube, or their dimensions do not match.
+
+    Examples
+    --------
+    We can also compute iteratively the discrepancy by using
+    ``iterative=True``.
+
+    >>> import numpy as np
+    >>> from scipy.stats import qmc
+    >>> space = np.array([[1, 3], [2, 6], [3, 2], [4, 5], [5, 1], [6, 4]])
+    >>> l_bounds = [0.5, 0.5]
+    >>> u_bounds = [6.5, 6.5]
+    >>> space = qmc.scale(space, l_bounds, u_bounds, reverse=True)
+    >>> disc_init = qmc.discrepancy(space[:-1], iterative=True)
+    >>> disc_init
+    0.04769081147119336
+    >>> qmc.update_discrepancy(space[-1], space[:-1], disc_init)
+    0.008142039609053513
+
+    """
+    points = np.asarray(sample, dtype=np.float64, order="C")
+    candidate = np.asarray(x_new, dtype=np.float64, order="C")
+
+    # The existing sample must be a 2D array inside the unit hypercube.
+    if points.ndim != 2:
+        raise ValueError('Sample is not a 2D array')
+
+    if (points.max() > 1.) or (points.min() < 0.):
+        raise ValueError('Sample is not in unit hypercube')
+
+    # The candidate must be a single 1D point inside the unit hypercube.
+    if candidate.ndim != 1:
+        raise ValueError('x_new is not a 1D array')
+
+    candidate_in_cube = np.all(candidate >= 0) and np.all(candidate <= 1)
+    if not candidate_in_cube:
+        raise ValueError('x_new is not in unit hypercube')
+
+    # Same number of coordinates as the sample has columns.
+    if candidate.shape[0] != points.shape[1]:
+        raise ValueError("x_new and sample must be broadcastable")
+
+    return _cy_wrapper_update_discrepancy(candidate, points, initial_disc)
+
+
+def _perturb_discrepancy(sample: np.ndarray, i1: int, i2: int, k: int,
+                         disc: float):
+    """Centered discrepancy after an elementary perturbation of a LHS.
+
+    An elementary perturbation consists of an exchange of coordinates between
+    two points: ``sample[i1, k] <-> sample[i2, k]``. By construction,
+    this operation conserves the LHS properties.
+
+    The discrepancy is updated incrementally from `disc` using only the
+    terms that involve rows `i1` and `i2`; `sample` itself is not modified.
+
+    Parameters
+    ----------
+    sample : array_like (n, d)
+        The sample (before perturbation) to compute the discrepancy from.
+    i1 : int
+        The first row of the elementary perturbation.
+    i2 : int
+        The second row of the elementary perturbation.
+    k : int
+        The column of the elementary perturbation.
+    disc : float
+        Centered discrepancy of the design before perturbation.
+
+    Returns
+    -------
+    discrepancy : float
+        Centered discrepancy of the design after perturbation.
+
+    References
+    ----------
+    .. [1] Jin et al. "An efficient algorithm for constructing optimal design
+       of computer experiments", Journal of Statistical Planning and
+       Inference, 2005.
+
+    """
+    n = sample.shape[0]
+
+    # Coordinates shifted so that the hypercube is centered on the origin.
+    z_ij = sample - 0.5
+
+    # Eq (19)
+    # Cross terms between row i1 (resp. i2) and every row j: one value per
+    # row, obtained as a product over the d columns.
+    c_i1j = (1. / n ** 2.
+             * np.prod(0.5 * (2. + abs(z_ij[i1, :])
+                              + abs(z_ij) - abs(z_ij[i1, :] - z_ij)), axis=1))
+    c_i2j = (1. / n ** 2.
+             * np.prod(0.5 * (2. + abs(z_ij[i2, :])
+                              + abs(z_ij) - abs(z_ij[i2, :] - z_ij)), axis=1))
+
+    # Eq (20)
+    # Diagonal (self) terms of rows i1 and i2 before the exchange.
+    c_i1i1 = (1. / n ** 2 * np.prod(1 + abs(z_ij[i1, :]))
+              - 2. / n * np.prod(1. + 0.5 * abs(z_ij[i1, :])
+                                 - 0.5 * z_ij[i1, :] ** 2))
+    c_i2i2 = (1. / n ** 2 * np.prod(1 + abs(z_ij[i2, :]))
+              - 2. / n * np.prod(1. + 0.5 * abs(z_ij[i2, :])
+                                 - 0.5 * z_ij[i2, :] ** 2))
+
+    # Eq (22), typo in the article in the denominator i2 -> i1
+    # Ratio of the column-k factors after/before the exchange, one per row.
+    num = (2 + abs(z_ij[i2, k]) + abs(z_ij[:, k])
+           - abs(z_ij[i2, k] - z_ij[:, k]))
+    denum = (2 + abs(z_ij[i1, k]) + abs(z_ij[:, k])
+             - abs(z_ij[i1, k] - z_ij[:, k]))
+    gamma = num / denum
+
+    # Eq (23)
+    c_p_i1j = gamma * c_i1j
+    # Eq (24)
+    c_p_i2j = c_i2j / gamma
+
+    # Scalar correction factors for the diagonal terms, built from the two
+    # exchanged coordinates in column k.
+    alpha = (1 + abs(z_ij[i2, k])) / (1 + abs(z_ij[i1, k]))
+    beta = (2 - abs(z_ij[i2, k])) / (2 - abs(z_ij[i1, k]))
+
+    g_i1 = np.prod(1. + abs(z_ij[i1, :]))
+    g_i2 = np.prod(1. + abs(z_ij[i2, :]))
+    h_i1 = np.prod(1. + 0.5 * abs(z_ij[i1, :]) - 0.5 * (z_ij[i1, :] ** 2))
+    h_i2 = np.prod(1. + 0.5 * abs(z_ij[i2, :]) - 0.5 * (z_ij[i2, :] ** 2))
+
+    # Eq (25), typo in the article g is missing
+    c_p_i1i1 = ((g_i1 * alpha) / (n ** 2) - 2. * alpha * beta * h_i1 / n)
+    # Eq (26), typo in the article n ** 2
+    c_p_i2i2 = ((g_i2 / ((n ** 2) * alpha)) - (2. * h_i2 / (n * alpha * beta)))
+
+    # Eq (26)
+    # NOTE(review): this label repeats the equation number used just above;
+    # confirm the intended equation against the article.
+    # Change in the cross terms of rows i1 and i2 caused by the exchange.
+    sum_ = c_p_i1j - c_i1j + c_p_i2j - c_i2j
+
+    # Rows i1 and i2 themselves are excluded from the cross-term sum; their
+    # diagonal contributions are handled separately below.
+    mask = np.ones(n, dtype=bool)
+    mask[[i1, i2]] = False
+    sum_ = sum(sum_[mask])
+
+    # Previous discrepancy plus the change in both diagonal terms and twice
+    # the change in the cross terms.
+    disc_ep = (disc + c_p_i1i1 - c_i1i1 + c_p_i2i2 - c_i2i2 + 2 * sum_)
+
+    return disc_ep
+
+
+def primes_from_2_to(n: int) -> np.ndarray:
+    """Prime numbers from 2 to *n*.
+
+    Parameters
+    ----------
+    n : int
+        Sup bound with ``n >= 6``.
+
+    Returns
+    -------
+    primes : ndarray
+        Primes in ``2 <= p < n``.
+
+    Notes
+    -----
+    Taken from [1]_ by P.T. Roy, written consent given on 23.04.2021
+    by the original author, Bruno Astrolino, for free use in SciPy under
+    the 3-clause BSD.
+
+    The sieve only stores candidates that are not multiples of 2 or 3:
+    slot ``j`` stands for the number ``(3 * j + 1) | 1``.
+
+    References
+    ----------
+    .. [1] `StackOverflow <https://stackoverflow.com/questions/2068372>`_.
+
+    """
+    is_candidate = np.ones(n // 3 + (n % 6 == 2), dtype=bool)
+    last_slot = int(n ** 0.5) // 3
+
+    for slot in range(1, last_slot + 1):
+        k = (3 * slot + 1) | 1
+        stride = 2 * k
+        # Two strided passes knock out the multiples of k that are present
+        # in the compressed sieve.
+        is_candidate[(k * k) // 3::stride] = False
+        is_candidate[(k * (k - 2 * (slot & 1) + 4)) // 3::stride] = False
+
+    # Slot 0 is dropped; 2 and 3 are prepended explicitly.
+    surviving_slots = np.nonzero(is_candidate)[0][1:]
+    return np.r_[2, 3, ((3 * surviving_slots + 1) | 1)]
+
+
+def n_primes(n: IntNumber) -> list[int]:
+    """List of the n-first prime numbers.
+
+    The first 168 primes (all primes below 1000) come from a hard-coded
+    table. Larger requests fall back to `primes_from_2_to`, sieving up to a
+    bound that grows by 1000 until enough primes are found.
+
+    Parameters
+    ----------
+    n : int
+        Number of prime numbers wanted.
+
+    Returns
+    -------
+    primes : list(int)
+        List of primes. The result is a plain Python list for every `n`,
+        including when the sieve fallback is used.
+
+    """
+    primes = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59,
+              61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127,
+              131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193,
+              197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269,
+              271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349,
+              353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431,
+              433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503,
+              509, 521, 523, 541, 547, 557, 563, 569, 571, 577, 587, 593, 599,
+              601, 607, 613, 617, 619, 631, 641, 643, 647, 653, 659, 661, 673,
+              677, 683, 691, 701, 709, 719, 727, 733, 739, 743, 751, 757, 761,
+              769, 773, 787, 797, 809, 811, 821, 823, 827, 829, 839, 853, 857,
+              859, 863, 877, 881, 883, 887, 907, 911, 919, 929, 937, 941, 947,
+              953, 967, 971, 977, 983, 991, 997][:n]
+
+    if len(primes) < n:
+        # The table is too short: sieve with an increasing upper bound
+        # until at least `n` primes are available.
+        big_number = 2000
+        while True:
+            sieved = primes_from_2_to(big_number)[:n]  # type: ignore
+            if len(sieved) == n:
+                # The sieve yields a NumPy array; convert it so that the
+                # return type matches the table branch and the annotation.
+                primes = sieved.tolist()
+                break
+            big_number += 1000
+
+    return primes
+
+
+def _van_der_corput_permutations(
+    base: IntNumber, *, rng: SeedType = None
+) -> np.ndarray:
+    """Permutations for scrambling a Van der Corput sequence.
+
+    Parameters
+    ----------
+    base : int
+        Base of the sequence.
+    rng : `numpy.random.Generator`, optional
+        Pseudorandom number generator state. When `rng` is None, a new
+        `numpy.random.Generator` is created using entropy from the
+        operating system. Types other than `numpy.random.Generator` are
+        passed to `numpy.random.default_rng` to instantiate a ``Generator``.
+
+        .. versionchanged:: 1.15.0
+
+            As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
+            transition from use of `numpy.random.RandomState` to
+            `numpy.random.Generator`, this keyword was changed from `seed` to
+            `rng`. During the transition, the behavior documented above is not
+            accurate; see `check_random_state` for actual behavior. After the
+            transition, this admonition can be removed.
+
+    Returns
+    -------
+    permutations : array_like
+        Permutation indices.
+
+    Notes
+    -----
+    In Algorithm 1 of Owen 2017, a permutation of `np.arange(base)` is
+    created for each positive integer `k` such that ``1 - base**-k < 1``
+    using floating-point arithmetic. For double precision floats, the
+    condition ``1 - base**-k < 1`` can also be written as ``base**-k >
+    2**-54``, which makes it more apparent how many permutations we need
+    to create.
+    """
+    rng = check_random_state(rng)
+    count = math.ceil(54 / math.log2(base)) - 1
+    permutations = np.repeat(np.arange(base)[None], count, axis=0)
+    for perm in permutations:
+        rng.shuffle(perm)
+
+    return permutations
+
+
+def van_der_corput(
+        n: IntNumber,
+        base: IntNumber = 2,
+        *,
+        start_index: IntNumber = 0,
+        scramble: bool = False,
+        permutations: "npt.ArrayLike | None" = None,
+        rng: SeedType = None,
+        workers: IntNumber = 1) -> np.ndarray:
+    """Van der Corput sequence.
+
+    Pseudo-random number generator based on a b-adic expansion.
+
+    Scrambling uses permutations of the remainders (see [1]_). Multiple
+    permutations are applied to construct a point. The sequence of
+    permutations has to be the same for all points of the sequence.
+
+    Parameters
+    ----------
+    n : int
+        Number of element of the sequence.
+    base : int, optional
+        Base of the sequence. Default is 2.
+    start_index : int, optional
+        Index to start the sequence from. Default is 0.
+    scramble : bool, optional
+        If True, use Owen scrambling. Otherwise no scrambling is done.
+        Default is True.
+    permutations : array_like, optional
+        Permutations used for scrambling.
+    rng : `numpy.random.Generator`, optional
+        Pseudorandom number generator state. When `rng` is None, a new
+        `numpy.random.Generator` is created using entropy from the
+        operating system. Types other than `numpy.random.Generator` are
+        passed to `numpy.random.default_rng` to instantiate a ``Generator``.
+    workers : int, optional
+        Number of workers to use for parallel processing. If -1 is
+        given all CPU threads are used. Default is 1.
+
+    Returns
+    -------
+    sequence : list (n,)
+        Sequence of Van der Corput.
+
+    References
+    ----------
+    .. [1] A. B. Owen. "A randomized Halton algorithm in R",
+       :arxiv:`1706.02808`, 2017.
+
+    """
+    if base < 2:
+        raise ValueError("'base' must be at least 2")
+
+    if scramble:
+        if permutations is None:
+            permutations = _van_der_corput_permutations(
+                base=base, rng=rng
+            )
+        else:
+            permutations = np.asarray(permutations)
+
+        permutations = permutations.astype(np.int64)
+        return _cy_van_der_corput_scrambled(n, base, start_index,
+                                            permutations, workers)
+
+    else:
+        return _cy_van_der_corput(n, base, start_index, workers)
+
+
+class QMCEngine(ABC):
+    """A generic Quasi-Monte Carlo sampler class meant for subclassing.
+
+    QMCEngine is a base class to construct a specific Quasi-Monte Carlo
+    sampler. It cannot be used directly as a sampler.
+
+    Parameters
+    ----------
+    d : int
+        Dimension of the parameter space.
+    optimization : {None, "random-cd", "lloyd"}, optional
+        Whether to use an optimization scheme to improve the quality after
+        sampling. Note that this is a post-processing step that does not
+        guarantee that all properties of the sample will be conserved.
+        Default is None.
+
+        * ``random-cd``: random permutations of coordinates to lower the
+          centered discrepancy. The best sample based on the centered
+          discrepancy is constantly updated. Centered discrepancy-based
+          sampling shows better space-filling robustness toward 2D and 3D
+          subprojections compared to using other discrepancy measures.
+        * ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm.
+          The process converges to equally spaced samples.
+
+        .. versionadded:: 1.10.0
+
+    rng : `numpy.random.Generator`, optional
+        Pseudorandom number generator state. When `rng` is None, a new
+        `numpy.random.Generator` is created using entropy from the
+        operating system. Types other than `numpy.random.Generator` are
+        passed to `numpy.random.default_rng` to instantiate a ``Generator``.
+
+        .. versionchanged:: 1.15.0
+
+            As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
+            transition from use of `numpy.random.RandomState` to
+            `numpy.random.Generator`, this keyword was changed from `seed` to
+            `rng`. For an interim period, both keywords will continue to work, although
+            only one may be specified at a time. After the interim period, function
+            calls using the `seed` keyword will emit warnings. Following a
+            deprecation period, the `seed` keyword will be removed.
+
+    Notes
+    -----
+    By convention samples are distributed over the half-open interval
+    ``[0, 1)``. Instances of the class can access the attributes: ``d`` for
+    the dimension; and ``rng`` for the random number generator.
+
+    **Subclassing**
+
+    When subclassing `QMCEngine` to create a new sampler,  ``__init__`` and
+    ``random`` must be redefined.
+
+    * ``__init__(d, rng=None)``: at least fix the dimension. If the sampler
+      does not take advantage of a ``rng`` (deterministic methods like
+      Halton), this parameter can be omitted.
+    * ``_random(n, *, workers=1)``: draw ``n`` from the engine. ``workers``
+      is used for parallelism. See `Halton` for example.
+
+    Optionally, two other methods can be overwritten by subclasses:
+
+    * ``reset``: Reset the engine to its original state.
+    * ``fast_forward``: If the sequence is deterministic (like Halton
+      sequence), then ``fast_forward(n)`` is skipping the ``n`` first draw.
+
+    Examples
+    --------
+    To create a random sampler based on ``np.random.random``, we would do the
+    following:
+
+    >>> from scipy.stats import qmc
+    >>> class RandomEngine(qmc.QMCEngine):
+    ...     def __init__(self, d, rng=None):
+    ...         super().__init__(d=d, rng=rng)
+    ...
+    ...
+    ...     def _random(self, n=1, *, workers=1):
+    ...         return self.rng.random((n, self.d))
+    ...
+    ...
+    ...     def reset(self):
+    ...         super().__init__(d=self.d, rng=self.rng_seed)
+    ...         return self
+    ...
+    ...
+    ...     def fast_forward(self, n):
+    ...         self.random(n)
+    ...         return self
+
+    After subclassing `QMCEngine` to define the sampling strategy we want to
+    use, we can create an instance to sample from.
+
+    >>> engine = RandomEngine(2)
+    >>> engine.random(5)
+    array([[0.22733602, 0.31675834],  # random
+           [0.79736546, 0.67625467],
+           [0.39110955, 0.33281393],
+           [0.59830875, 0.18673419],
+           [0.67275604, 0.94180287]])
+
+    We can also reset the state of the generator and resample again.
+
+    >>> _ = engine.reset()
+    >>> engine.random(5)
+    array([[0.22733602, 0.31675834],  # random
+           [0.79736546, 0.67625467],
+           [0.39110955, 0.33281393],
+           [0.59830875, 0.18673419],
+           [0.67275604, 0.94180287]])
+
+    """
+
+    @abstractmethod
+    @_transition_to_rng('seed', replace_doc=False)
+    def __init__(
+        self,
+        d: IntNumber,
+        *,
+        optimization: Literal["random-cd", "lloyd"] | None = None,
+        rng: SeedType = None
+    ) -> None:
+        self._initialize(d, optimization=optimization, rng=rng)
+
+    # During SPEC 7 transition:
+    # `__init__` has to be wrapped with @_transition_to_rng decorator
+    # because it is public. Subclasses previously called `__init__`
+    # directly, but this was problematic because arguments passed to
+    # subclass `__init__` as `seed` would get passed to superclass
+    # `__init__` as `rng`, rejecting `RandomState` arguments.
+    def _initialize(
+        self,
+        d: IntNumber,
+        *,
+        optimization: Literal["random-cd", "lloyd"] | None = None,
+        rng: SeedType = None
+    ) -> None:
+        if not np.issubdtype(type(d), np.integer) or d < 0:
+            raise ValueError('d must be a non-negative integer value')
+
+        self.d = d
+
+        if isinstance(rng, np.random.Generator):
+            # Spawn a Generator that we can own and reset.
+            self.rng = _rng_spawn(rng, 1)[0]
+        else:
+            # Create our instance of Generator, does not need spawning
+            # Also catch RandomState which cannot be spawned
+            self.rng = check_random_state(rng)
+        self.rng_seed = copy.deepcopy(self.rng)
+
+        self.num_generated = 0
+
+        config = {
+            # random-cd
+            "n_nochange": 100,
+            "n_iters": 10_000,
+            "rng": self.rng,
+
+            # lloyd
+            "tol": 1e-5,
+            "maxiter": 10,
+            "qhull_options": None,
+        }
+        self._optimization = optimization
+        self.optimization_method = _select_optimizer(optimization, config)
+
+    @abstractmethod
+    def _random(
+        self, n: IntNumber = 1, *, workers: IntNumber = 1
+    ) -> np.ndarray:
+        ...
+
+    def random(
+        self, n: IntNumber = 1, *, workers: IntNumber = 1
+    ) -> np.ndarray:
+        """Draw `n` in the half-open interval ``[0, 1)``.
+
+        Parameters
+        ----------
+        n : int, optional
+            Number of samples to generate in the parameter space.
+            Default is 1.
+        workers : int, optional
+            Only supported with `Halton`.
+            Number of workers to use for parallel processing. If -1 is
+            given all CPU threads are used. Default is 1. It becomes faster
+            than one worker for `n` greater than :math:`10^3`.
+
+        Returns
+        -------
+        sample : array_like (n, d)
+            QMC sample.
+
+        """
+        sample = self._random(n, workers=workers)
+        if self.optimization_method is not None:
+            sample = self.optimization_method(sample)
+
+        self.num_generated += n
+        return sample
+
+    def integers(
+        self,
+        l_bounds: "npt.ArrayLike",
+        *,
+        u_bounds: "npt.ArrayLike | None" = None,
+        n: IntNumber = 1,
+        endpoint: bool = False,
+        workers: IntNumber = 1
+    ) -> np.ndarray:
+        r"""
+        Draw `n` integers from `l_bounds` (inclusive) to `u_bounds`
+        (exclusive), or if endpoint=True, `l_bounds` (inclusive) to
+        `u_bounds` (inclusive).
+
+        Parameters
+        ----------
+        l_bounds : int or array-like of ints
+            Lowest (signed) integers to be drawn (unless ``u_bounds=None``,
+            in which case this parameter is 0 and this value is used for
+            `u_bounds`).
+        u_bounds : int or array-like of ints, optional
+            If provided, one above the largest (signed) integer to be drawn
+            (see above for behavior if ``u_bounds=None``).
+            If array-like, must contain integer values.
+        n : int, optional
+            Number of samples to generate in the parameter space.
+            Default is 1.
+        endpoint : bool, optional
+            If true, sample from the interval ``[l_bounds, u_bounds]`` instead
+            of the default ``[l_bounds, u_bounds)``. Defaults is False.
+        workers : int, optional
+            Number of workers to use for parallel processing. If -1 is
+            given all CPU threads are used. Only supported when using `Halton`
+            Default is 1.
+
+        Returns
+        -------
+        sample : array_like (n, d)
+            QMC sample.
+
+        Notes
+        -----
+        It is safe to just use the same ``[0, 1)`` to integer mapping
+        with QMC that you would use with MC. You still get unbiasedness,
+        a strong law of large numbers, an asymptotically infinite variance
+        reduction and a finite sample variance bound.
+
+        To convert a sample from :math:`[0, 1)` to :math:`[a, b), b>a`,
+        with :math:`a` the lower bounds and :math:`b` the upper bounds,
+        the following transformation is used:
+
+        .. math::
+
+            \text{floor}((b - a) \cdot \text{sample} + a)
+
+        """
+        if u_bounds is None:
+            u_bounds = l_bounds
+            l_bounds = 0
+
+        u_bounds = np.atleast_1d(u_bounds)
+        l_bounds = np.atleast_1d(l_bounds)
+
+        if endpoint:
+            u_bounds = u_bounds + 1
+
+        if (not np.issubdtype(l_bounds.dtype, np.integer) or
+                not np.issubdtype(u_bounds.dtype, np.integer)):
+            message = ("'u_bounds' and 'l_bounds' must be integers or"
+                       " array-like of integers")
+            raise ValueError(message)
+
+        if isinstance(self, Halton):
+            sample = self.random(n=n, workers=workers)
+        else:
+            sample = self.random(n=n)
+
+        sample = scale(sample, l_bounds=l_bounds, u_bounds=u_bounds)
+        sample = np.floor(sample).astype(np.int64)
+
+        return sample
+
+    def reset(self) -> "QMCEngine":
+        """Reset the engine to base state.
+
+        Returns
+        -------
+        engine : QMCEngine
+            Engine reset to its base state.
+
+        """
+        rng = copy.deepcopy(self.rng_seed)
+        self.rng = check_random_state(rng)
+        self.num_generated = 0
+        return self
+
+    def fast_forward(self, n: IntNumber) -> "QMCEngine":
+        """Fast-forward the sequence by `n` positions.
+
+        Parameters
+        ----------
+        n : int
+            Number of points to skip in the sequence.
+
+        Returns
+        -------
+        engine : QMCEngine
+            Engine reset to its base state.
+
+        """
+        self.random(n=n)
+        return self
+
+
+class Halton(QMCEngine):
+    """Halton sequence.
+
+    Pseudo-random number generator that generalize the Van der Corput sequence
+    for multiple dimensions. The Halton sequence uses the base-two Van der
+    Corput sequence for the first dimension, base-three for its second and
+    base-:math:`p` for its :math:`n`-dimension, with :math:`p` the
+    :math:`n`'th prime.
+
+    Parameters
+    ----------
+    d : int
+        Dimension of the parameter space.
+    scramble : bool, optional
+        If True, use random scrambling from [2]_. Otherwise no scrambling
+        is done.
+        Default is True.
+    optimization : {None, "random-cd", "lloyd"}, optional
+        Whether to use an optimization scheme to improve the quality after
+        sampling. Note that this is a post-processing step that does not
+        guarantee that all properties of the sample will be conserved.
+        Default is None.
+
+        * ``random-cd``: random permutations of coordinates to lower the
+          centered discrepancy. The best sample based on the centered
+          discrepancy is constantly updated. Centered discrepancy-based
+          sampling shows better space-filling robustness toward 2D and 3D
+          subprojections compared to using other discrepancy measures.
+        * ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm.
+          The process converges to equally spaced samples.
+
+        .. versionadded:: 1.10.0
+
+    rng : `numpy.random.Generator`, optional
+        Pseudorandom number generator state. When `rng` is None, a new
+        `numpy.random.Generator` is created using entropy from the
+        operating system. Types other than `numpy.random.Generator` are
+        passed to `numpy.random.default_rng` to instantiate a ``Generator``.
+
+        .. versionchanged:: 1.15.0
+
+            As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
+            transition from use of `numpy.random.RandomState` to
+            `numpy.random.Generator`, this keyword was changed from `seed` to
+            `rng`. For an interim period, both keywords will continue to work, although
+            only one may be specified at a time. After the interim period, function
+            calls using the `seed` keyword will emit warnings. Following a
+            deprecation period, the `seed` keyword will be removed.
+
+    Notes
+    -----
+    The Halton sequence has severe striping artifacts for even modestly
+    large dimensions. These can be ameliorated by scrambling. Scrambling
+    also supports replication-based error estimates and extends
+    applicability to unbounded integrands.
+
+    References
+    ----------
+    .. [1] Halton, "On the efficiency of certain quasi-random sequences of
+       points in evaluating multi-dimensional integrals", Numerische
+       Mathematik, 1960.
+    .. [2] A. B. Owen. "A randomized Halton algorithm in R",
+       :arxiv:`1706.02808`, 2017.
+
+    Examples
+    --------
+    Generate samples from a low discrepancy sequence of Halton.
+
+    >>> from scipy.stats import qmc
+    >>> sampler = qmc.Halton(d=2, scramble=False)
+    >>> sample = sampler.random(n=5)
+    >>> sample
+    array([[0.        , 0.        ],
+           [0.5       , 0.33333333],
+           [0.25      , 0.66666667],
+           [0.75      , 0.11111111],
+           [0.125     , 0.44444444]])
+
+    Compute the quality of the sample using the discrepancy criterion.
+
+    >>> qmc.discrepancy(sample)
+    0.088893711419753
+
+    If some wants to continue an existing design, extra points can be obtained
+    by calling again `random`. Alternatively, you can skip some points like:
+
+    >>> _ = sampler.fast_forward(5)
+    >>> sample_continued = sampler.random(n=5)
+    >>> sample_continued
+    array([[0.3125    , 0.37037037],
+           [0.8125    , 0.7037037 ],
+           [0.1875    , 0.14814815],
+           [0.6875    , 0.48148148],
+           [0.4375    , 0.81481481]])
+
+    Finally, samples can be scaled to bounds.
+
+    >>> l_bounds = [0, 2]
+    >>> u_bounds = [10, 5]
+    >>> qmc.scale(sample_continued, l_bounds, u_bounds)
+    array([[3.125     , 3.11111111],
+           [8.125     , 4.11111111],
+           [1.875     , 2.44444444],
+           [6.875     , 3.44444444],
+           [4.375     , 4.44444444]])
+
+    """
+    @_transition_to_rng('seed', replace_doc=False)
+    def __init__(
+        self, d: IntNumber, *, scramble: bool = True,
+        optimization: Literal["random-cd", "lloyd"] | None = None,
+        rng: SeedType = None
+    ) -> None:
+        # Used in `scipy.integrate.qmc_quad`
+        self._init_quad = {'d': d, 'scramble': True,
+                           'optimization': optimization}
+        super()._initialize(d=d, optimization=optimization, rng=rng)
+
+        # important to have ``type(bdim) == int`` for performance reason
+        self.base = [int(bdim) for bdim in n_primes(d)]
+        self.scramble = scramble
+
+        self._initialize_permutations()
+
+    def _initialize_permutations(self) -> None:
+        """Initialize permutations for all Van der Corput sequences.
+
+        Permutations are only needed for scrambling.
+        """
+        self._permutations: list = [None] * len(self.base)
+        if self.scramble:
+            for i, bdim in enumerate(self.base):
+                permutations = _van_der_corput_permutations(
+                    base=bdim, rng=self.rng
+                )
+
+                self._permutations[i] = permutations
+
+    def _random(
+        self, n: IntNumber = 1, *, workers: IntNumber = 1
+    ) -> np.ndarray:
+        """Draw `n` in the half-open interval ``[0, 1)``.
+
+        Parameters
+        ----------
+        n : int, optional
+            Number of samples to generate in the parameter space. Default is 1.
+        workers : int, optional
+            Number of workers to use for parallel processing. If -1 is
+            given all CPU threads are used. Default is 1. It becomes faster
+            than one worker for `n` greater than :math:`10^3`.
+
+        Returns
+        -------
+        sample : array_like (n, d)
+            QMC sample.
+
+        """
+        workers = _validate_workers(workers)
+        # Generate a sample using a Van der Corput sequence per dimension.
+        sample = [van_der_corput(n, bdim, start_index=self.num_generated,
+                                 scramble=self.scramble,
+                                 permutations=self._permutations[i],
+                                 workers=workers)
+                  for i, bdim in enumerate(self.base)]
+
+        return np.array(sample).T.reshape(n, self.d)
+
+
+class LatinHypercube(QMCEngine):
+    r"""Latin hypercube sampling (LHS).
+
+    A Latin hypercube sample [1]_ generates :math:`n` points in
+    :math:`[0,1)^{d}`. Each univariate marginal distribution is stratified,
+    placing exactly one point in :math:`[j/n, (j+1)/n)` for
+    :math:`j=0,1,...,n-1`. They are still applicable when :math:`n << d`.
+
+    Parameters
+    ----------
+    d : int
+        Dimension of the parameter space.
+    scramble : bool, optional
+        When False, center samples within cells of a multi-dimensional grid.
+        Otherwise, samples are randomly placed within cells of the grid.
+
+        .. note::
+            Setting ``scramble=False`` does not ensure deterministic output.
+            For that, use the `rng` parameter.
+
+        Default is True.
+
+        .. versionadded:: 1.10.0
+
+    optimization : {None, "random-cd", "lloyd"}, optional
+        Whether to use an optimization scheme to improve the quality after
+        sampling. Note that this is a post-processing step that does not
+        guarantee that all properties of the sample will be conserved.
+        Default is None.
+
+        * ``random-cd``: random permutations of coordinates to lower the
+          centered discrepancy. The best sample based on the centered
+          discrepancy is constantly updated. Centered discrepancy-based
+          sampling shows better space-filling robustness toward 2D and 3D
+          subprojections compared to using other discrepancy measures.
+        * ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm.
+          The process converges to equally spaced samples.
+
+        .. versionadded:: 1.8.0
+        .. versionchanged:: 1.10.0
+            Add ``lloyd``.
+
+    strength : {1, 2}, optional
+        Strength of the LHS. ``strength=1`` produces a plain LHS while
+        ``strength=2`` produces an orthogonal array based LHS of strength 2
+        [7]_, [8]_. In that case, only ``n=p**2`` points can be sampled,
+        with ``p`` a prime number. It also constrains ``d <= p + 1``.
+        Default is 1.
+
+        .. versionadded:: 1.8.0
+
+    rng : `numpy.random.Generator`, optional
+        Pseudorandom number generator state. When `rng` is None, a new
+        `numpy.random.Generator` is created using entropy from the
+        operating system. Types other than `numpy.random.Generator` are
+        passed to `numpy.random.default_rng` to instantiate a ``Generator``.
+
+        .. versionchanged:: 1.15.0
+
+            As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
+            transition from use of `numpy.random.RandomState` to
+            `numpy.random.Generator`, this keyword was changed from `seed` to
+            `rng`. For an interim period, both keywords will continue to work, although
+            only one may be specified at a time. After the interim period, function
+            calls using the `seed` keyword will emit warnings. Following a
+            deprecation period, the `seed` keyword will be removed.
+
+    See Also
+    --------
+    :ref:`quasi-monte-carlo`
+
+    Notes
+    -----
+
+    When LHS is used for integrating a function :math:`f` over :math:`n`,
+    LHS is extremely effective on integrands that are nearly additive [2]_.
+    With a LHS of :math:`n` points, the variance of the integral is always
+    lower than plain MC on :math:`n-1` points [3]_. There is a central limit
+    theorem for LHS on the mean and variance of the integral [4]_, but not
+    necessarily for optimized LHS due to the randomization.
+
+    :math:`A` is called an orthogonal array of strength :math:`t` if in each
+    n-row-by-t-column submatrix of :math:`A`: all :math:`p^t` possible
+    distinct rows occur the same number of times. The elements of :math:`A`
+    are in the set :math:`\{0, 1, ..., p-1\}`, also called symbols.
+    The constraint that :math:`p` must be a prime number is to allow modular
+    arithmetic. Increasing strength adds some symmetry to the sub-projections
+    of a sample. With strength 2, samples are symmetric along the diagonals of
+    2D sub-projections. This may be undesirable, but on the other hand, the
+    sample dispersion is improved.
+
+    Strength 1 (plain LHS) brings an advantage over strength 0 (MC) and
+    strength 2 is a useful increment over strength 1. Going to strength 3 is
+    a smaller increment and scrambled QMC like Sobol', Halton are more
+    performant [7]_.
+
+    To create a LHS of strength 2, the orthogonal array :math:`A` is
+    randomized by applying a random, bijective map of the set of symbols onto
+    itself. For example, in column 0, all 0s might become 2; in column 1,
+    all 0s might become 1, etc.
+    Then, for each column :math:`i` and symbol :math:`j`, we add a plain,
+    one-dimensional LHS of size :math:`p` to the subarray where
+    :math:`A^i = j`. The resulting matrix is finally divided by :math:`p`.
+
+    References
+    ----------
+    .. [1] Mckay et al., "A Comparison of Three Methods for Selecting Values
+       of Input Variables in the Analysis of Output from a Computer Code."
+       Technometrics, 1979.
+    .. [2] M. Stein, "Large sample properties of simulations using Latin
+       hypercube sampling." Technometrics 29, no. 2: 143-151, 1987.
+    .. [3] A. B. Owen, "Monte Carlo variance of scrambled net quadrature."
+       SIAM Journal on Numerical Analysis 34, no. 5: 1884-1910, 1997
+    .. [4]  Loh, W.-L. "On Latin hypercube sampling." The annals of statistics
+       24, no. 5: 2058-2080, 1996.
+    .. [5] Fang et al. "Design and modeling for computer experiments".
+       Computer Science and Data Analysis Series, 2006.
+    .. [6] Damblin et al., "Numerical studies of space filling designs:
+       optimization of Latin Hypercube Samples and subprojection properties."
+       Journal of Simulation, 2013.
+    .. [7] A. B. Owen , "Orthogonal arrays for computer experiments,
+       integration and visualization." Statistica Sinica, 1992.
+    .. [8] B. Tang, "Orthogonal Array-Based Latin Hypercubes."
+       Journal of the American Statistical Association, 1993.
+    .. [9] Seaholm, Susan K. et al. (1988). Latin hypercube sampling and the
+       sensitivity analysis of a Monte Carlo epidemic model. Int J Biomed
+       Comput, 23(1-2), 97-112. :doi:`10.1016/0020-7101(88)90067-0`
+
+    Examples
+    --------
+    Generate samples from a Latin hypercube generator.
+
+    >>> from scipy.stats import qmc
+    >>> sampler = qmc.LatinHypercube(d=2)
+    >>> sample = sampler.random(n=5)
+    >>> sample
+    array([[0.1545328 , 0.53664833], # random
+            [0.84052691, 0.06474907],
+            [0.52177809, 0.93343721],
+            [0.68033825, 0.36265316],
+            [0.26544879, 0.61163943]])
+
+    Compute the quality of the sample using the discrepancy criterion.
+
+    >>> qmc.discrepancy(sample)
+    0.0196... # random
+
+    Samples can be scaled to bounds.
+
+    >>> l_bounds = [0, 2]
+    >>> u_bounds = [10, 5]
+    >>> qmc.scale(sample, l_bounds, u_bounds)
+    array([[1.54532796, 3.609945 ], # random
+            [8.40526909, 2.1942472 ],
+            [5.2177809 , 4.80031164],
+            [6.80338249, 3.08795949],
+            [2.65448791, 3.83491828]])
+
+    Below are other examples showing alternative ways to construct LHS with
+    even better coverage of the space.
+
+    Using a base LHS as a baseline.
+
+    >>> sampler = qmc.LatinHypercube(d=2)
+    >>> sample = sampler.random(n=5)
+    >>> qmc.discrepancy(sample)
+    0.0196...  # random
+
+    Use the `optimization` keyword argument to produce a LHS with
+    lower discrepancy at higher computational cost.
+
+    >>> sampler = qmc.LatinHypercube(d=2, optimization="random-cd")
+    >>> sample = sampler.random(n=5)
+    >>> qmc.discrepancy(sample)
+    0.0176...  # random
+
+    Use the `strength` keyword argument to produce an orthogonal array based
+    LHS of strength 2. In this case, the number of sample points must be the
+    square of a prime number.
+
+    >>> sampler = qmc.LatinHypercube(d=2, strength=2)
+    >>> sample = sampler.random(n=9)
+    >>> qmc.discrepancy(sample)
+    0.00526...  # random
+
+    Options could be combined to produce an optimized centered
+    orthogonal array based LHS. After optimization, the result would not
+    be guaranteed to be of strength 2.
+
+    **Real-world example**
+
+    In [9]_, a Latin Hypercube sampling (LHS) strategy was used to sample a
+    parameter space to study the importance of each parameter of an epidemic
+    model. Such analysis is also called a sensitivity analysis.
+
+    Since the dimensionality of the problem is high (6), it is computationally
+    expensive to cover the space. When numerical experiments are costly, QMC
+    enables analysis that may not be possible if using a grid.
+
+    The six parameters of the model represented the probability of illness,
+    the probability of withdrawal, and four contact probabilities. The
+    authors assumed uniform distributions for all parameters and generated
+    50 samples.
+
+    Using `scipy.stats.qmc.LatinHypercube` to replicate the protocol,
+    the first step is to create a sample in the unit hypercube:
+
+    >>> from scipy.stats import qmc
+    >>> sampler = qmc.LatinHypercube(d=6)
+    >>> sample = sampler.random(n=50)
+
+    Then the sample can be scaled to the appropriate bounds:
+
+    >>> l_bounds = [0.000125, 0.01, 0.0025, 0.05, 0.47, 0.7]
+    >>> u_bounds = [0.000375, 0.03, 0.0075, 0.15, 0.87, 0.9]
+    >>> sample_scaled = qmc.scale(sample, l_bounds, u_bounds)
+
+    Such a sample was used to run the model 50 times, and a polynomial
+    response surface was constructed. This allowed the authors to study the
+    relative importance of each parameter across the range of possibilities
+    of every other parameter.
+
+    In this computer experiment, they showed a 14-fold reduction in the
+    number of samples required to maintain an error below 2% on their
+    response surface when compared to a grid sampling.
+
+    """
+
+    @_transition_to_rng('seed', replace_doc=False)
+    def __init__(
+        self,
+        d: IntNumber,
+        *,
+        scramble: bool = True,
+        strength: int = 1,
+        optimization: Literal["random-cd", "lloyd"] | None = None,
+        rng: SeedType = None
+    ) -> None:
+        # Constructor arguments recorded for `scipy.integrate.qmc_quad`.
+        # 'scramble' is recorded as True whatever `scramble` was passed.
+        self._init_quad = dict(
+            d=d, scramble=True, strength=strength, optimization=optimization
+        )
+        super()._initialize(d=d, rng=rng, optimization=optimization)
+        self.scramble = scramble
+
+        # Sampling routine associated with each supported strength.
+        samplers_by_strength = {
+            1: self._random_lhs,
+            2: self._random_oa_lhs,
+        }
+
+        try:
+            self.lhs_method: Callable = samplers_by_strength[strength]
+        except KeyError as exc:
+            # Unknown strength: report the accepted values and keep the
+            # original lookup failure as the cause.
+            raise ValueError(
+                f"{strength!r} is not a valid strength. It must be one"
+                f" of {set(samplers_by_strength)!r}"
+            ) from exc
+
+    def _random(
+        self, n: IntNumber = 1, *, workers: IntNumber = 1
+    ) -> np.ndarray:
+        """Draw `n` points with the sampler selected at construction.
+
+        `workers` is not used by this implementation.
+        """
+        return self.lhs_method(n)
+
+    def _random_lhs(self, n: IntNumber = 1) -> np.ndarray:
+        """Base LHS algorithm.
+
+        Every dimension gets its own random permutation of the strata
+        ``1..n``; each point is then placed inside its stratum, either at
+        the centre (``scramble=False``) or at a uniformly drawn offset.
+        """
+        # Offsets are drawn before the permutations so that the random
+        # stream is consumed in a fixed order.
+        if self.scramble:
+            offsets: np.ndarray | float = self.rng.uniform(size=(n, self.d))
+        else:
+            offsets = 0.5
+
+        # One row per dimension, each shuffled in place independently.
+        strata = np.tile(np.arange(1, n + 1), (self.d, 1))
+        for row in strata:
+            self.rng.shuffle(row)
+
+        return (strata.T - offsets) / n
+
+    def _random_oa_lhs(self, n: IntNumber = 4) -> np.ndarray:
+        """Orthogonal array based LHS of strength 2.
+
+        Parameters
+        ----------
+        n : int, optional
+            Number of points. Must be the square of a prime ``p`` with
+            ``d <= p + 1``. Default is 4.
+
+        Returns
+        -------
+        sample : np.ndarray (n, d)
+            The first ``d`` columns of the constructed OA-based LHS.
+
+        Raises
+        ------
+        ValueError
+            If `n` is not the square of a prime number, or if ``d > p + 1``.
+
+        """
+        p = np.sqrt(n).astype(int)
+        n_row = p**2
+        n_col = p + 1
+
+        primes = primes_from_2_to(p + 1)
+        if p not in primes or n != n_row:
+            raise ValueError(
+                "n is not the square of a prime number. Close"
+                f" values are {primes[-2:]**2}"
+            )
+        # NOTE(review): this check accepts ``d == p + 1``, i.e.
+        # ``n == (d-1)**2``, whereas the message states a strict inequality.
+        if self.d > p + 1:
+            raise ValueError("n is too small for d. Must be n > (d-1)**2")
+
+        oa_sample = np.zeros(shape=(n_row, n_col), dtype=int)
+
+        # OA of strength 2
+        # The first two columns enumerate every pair of levels in
+        # ``{0, ..., p-1}``; each remaining column is a linear combination
+        # of those two, reduced modulo p.
+        arrays = np.tile(np.arange(p), (2, 1))
+        oa_sample[:, :2] = np.stack(np.meshgrid(*arrays),
+                                    axis=-1).reshape(-1, 2)
+        for p_ in range(1, p):
+            oa_sample[:, 2+p_-1] = np.mod(oa_sample[:, 0]
+                                          + p_*oa_sample[:, 1], p)
+
+        # scramble the OA
+        # Each column gets its own random relabelling of the p levels.
+        oa_sample_ = np.empty(shape=(n_row, n_col), dtype=int)
+        for j in range(n_col):
+            perms = self.rng.permutation(p)
+            oa_sample_[:, j] = perms[oa_sample[:, j]]
+
+        oa_sample = oa_sample_
+        # following is making a scrambled OA into an OA-LHS
+        oa_lhs_sample = np.zeros(shape=(n_row, n_col))
+        # 1-D LHS engine sharing this engine's generator and scramble flag.
+        lhs_engine = LatinHypercube(d=1, scramble=self.scramble, strength=1,
+                                    rng=self.rng)  # type: QMCEngine
+        for j in range(n_col):
+            for k in range(p):
+                # Rows of column j at level k; a fresh 1-D LHS of p points
+                # is added to them elementwise, so exactly p such rows are
+                # expected.
+                idx = oa_sample[:, j] == k
+                lhs = lhs_engine.random(p).flatten()
+                oa_lhs_sample[:, j][idx] = lhs + oa_sample[:, j][idx]
+
+        # Values so far lie in [0, p]; rescale to the unit interval.
+        oa_lhs_sample /= p
+
+        # p + 1 columns were built; keep only the d requested dimensions.
+        return oa_lhs_sample[:, :self.d]
+
+
+class Sobol(QMCEngine):
+    """Engine for generating (scrambled) Sobol' sequences.
+
+    Sobol' sequences are low-discrepancy, quasi-random numbers. Points
+    can be drawn using two methods:
+
+    * `random_base2`: safely draw :math:`n=2^m` points. This method
+      guarantees the balance properties of the sequence.
+    * `random`: draw an arbitrary number of points from the
+      sequence. See warning below.
+
+    Parameters
+    ----------
+    d : int
+        Dimensionality of the sequence. Max dimensionality is 21201.
+    scramble : bool, optional
+        If True, use LMS+shift scrambling. Otherwise, no scrambling is done.
+        Default is True.
+    bits : int, optional
+        Number of bits of the generator. Controls the maximum number of
+        points that can be generated, which is ``2**bits``. Maximal value
+        is 64. It does not correspond to the return type, which is always
+        ``np.float64`` to prevent points from repeating themselves.
+        Default is None, which for backward compatibility, corresponds to 30.
+
+        .. versionadded:: 1.9.0
+    optimization : {None, "random-cd", "lloyd"}, optional
+        Whether to use an optimization scheme to improve the quality after
+        sampling. Note that this is a post-processing step that does not
+        guarantee that all properties of the sample will be conserved.
+        Default is None.
+
+        * ``random-cd``: random permutations of coordinates to lower the
+          centered discrepancy. The best sample based on the centered
+          discrepancy is constantly updated. Centered discrepancy-based
+          sampling shows better space-filling robustness toward 2D and 3D
+          subprojections compared to using other discrepancy measures.
+        * ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm.
+          The process converges to equally spaced samples.
+
+        .. versionadded:: 1.10.0
+
+    rng : `numpy.random.Generator`, optional
+        Pseudorandom number generator state. When `rng` is None, a new
+        `numpy.random.Generator` is created using entropy from the
+        operating system. Types other than `numpy.random.Generator` are
+        passed to `numpy.random.default_rng` to instantiate a ``Generator``.
+
+        .. versionchanged:: 1.15.0
+
+            As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
+            transition from use of `numpy.random.RandomState` to
+            `numpy.random.Generator`, this keyword was changed from `seed` to
+            `rng`. For an interim period, both keywords will continue to work, although
+            only one may be specified at a time. After the interim period, function
+            calls using the `seed` keyword will emit warnings. Following a
+            deprecation period, the `seed` keyword will be removed.
+
+    Notes
+    -----
+    Sobol' sequences [1]_ provide :math:`n=2^m` low discrepancy points in
+    :math:`[0,1)^{d}`. Scrambling them [3]_ makes them suitable for singular
+    integrands, provides a means of error estimation, and can improve their
+    rate of convergence. The scrambling strategy which is implemented is a
+    (left) linear matrix scramble (LMS) followed by a digital random shift
+    (LMS+shift) [2]_.
+
+    There are many versions of Sobol' sequences depending on their
+    'direction numbers'. This code uses direction numbers from [4]_. Hence,
+    the maximum number of dimensions is 21201. The direction numbers have
+    been precomputed with search criterion 6 and can be retrieved at
+    https://web.maths.unsw.edu.au/~fkuo/sobol/.
+
+    .. warning::
+
+       Sobol' sequences are a quadrature rule and they lose their balance
+       properties if one uses a sample size that is not a power of 2, or skips
+       the first point, or thins the sequence [5]_.
+
+       If :math:`n=2^m` points are not enough then one should take :math:`2^M`
+       points for :math:`M>m`. When scrambling, the number R of independent
+       replicates does not have to be a power of 2.
+
+       Sobol' sequences are generated to some number :math:`B` of bits.
+       After :math:`2^B` points have been generated, the sequence would
+       repeat. Hence, an error is raised.
+       The number of bits can be controlled with the parameter `bits`.
+
+    References
+    ----------
+    .. [1] I. M. Sobol', "The distribution of points in a cube and the accurate
+       evaluation of integrals." Zh. Vychisl. Mat. i Mat. Phys., 7:784-802,
+       1967.
+    .. [2] J. Matousek, "On the L2-discrepancy for anchored boxes."
+       J. of Complexity 14, 527-556, 1998.
+    .. [3] Art B. Owen, "Scrambling Sobol and Niederreiter-Xing points."
+       Journal of Complexity, 14(4):466-489, December 1998.
+    .. [4] S. Joe and F. Y. Kuo, "Constructing sobol sequences with better
+       two-dimensional projections." SIAM Journal on Scientific Computing,
+       30(5):2635-2654, 2008.
+    .. [5] Art B. Owen, "On dropping the first Sobol' point."
+       :arxiv:`2008.08051`, 2020.
+
+    Examples
+    --------
+    Generate samples from a low discrepancy sequence of Sobol'.
+
+    >>> from scipy.stats import qmc
+    >>> sampler = qmc.Sobol(d=2, scramble=False)
+    >>> sample = sampler.random_base2(m=3)
+    >>> sample
+    array([[0.   , 0.   ],
+           [0.5  , 0.5  ],
+           [0.75 , 0.25 ],
+           [0.25 , 0.75 ],
+           [0.375, 0.375],
+           [0.875, 0.875],
+           [0.625, 0.125],
+           [0.125, 0.625]])
+
+    Compute the quality of the sample using the discrepancy criterion.
+
+    >>> qmc.discrepancy(sample)
+    0.013882107204860938
+
+    To continue an existing design, extra points can be obtained
+    by calling again `random_base2`. Alternatively, you can skip some
+    points like:
+
+    >>> _ = sampler.reset()
+    >>> _ = sampler.fast_forward(4)
+    >>> sample_continued = sampler.random_base2(m=2)
+    >>> sample_continued
+    array([[0.375, 0.375],
+           [0.875, 0.875],
+           [0.625, 0.125],
+           [0.125, 0.625]])
+
+    Finally, samples can be scaled to bounds.
+
+    >>> l_bounds = [0, 2]
+    >>> u_bounds = [10, 5]
+    >>> qmc.scale(sample_continued, l_bounds, u_bounds)
+    array([[3.75 , 3.125],
+           [8.75 , 4.625],
+           [6.25 , 2.375],
+           [1.25 , 3.875]])
+
+    """
+
+    MAXDIM: ClassVar[int] = _MAXDIM
+
+    @_transition_to_rng('seed', replace_doc=False)
+    def __init__(
+        self, d: IntNumber, *, scramble: bool = True,
+        bits: IntNumber | None = None, rng: SeedType = None,
+        optimization: Literal["random-cd", "lloyd"] | None = None
+    ) -> None:
+        # Used in `scipy.integrate.qmc_quad`
+        # ('scramble' is recorded as True whatever `scramble` was passed).
+        self._init_quad = {'d': d, 'scramble': True, 'bits': bits,
+                           'optimization': optimization}
+
+        super()._initialize(d=d, optimization=optimization, rng=rng)
+        if d > self.MAXDIM:
+            raise ValueError(
+                f"Maximum supported dimensionality is {self.MAXDIM}."
+            )
+
+        self.bits = bits
+        self.dtype_i: type
+        self.scramble = scramble
+
+        if self.bits is None:
+            self.bits = 30
+
+        # Pick the narrowest unsigned integer type able to hold `bits` bits.
+        if self.bits <= 32:
+            self.dtype_i = np.uint32
+        elif 32 < self.bits <= 64:
+            self.dtype_i = np.uint64
+        else:
+            raise ValueError("Maximum supported 'bits' is 64")
+
+        self.maxn = 2**self.bits
+
+        # v is d x maxbit matrix
+        self._sv: np.ndarray = np.zeros((d, self.bits), dtype=self.dtype_i)
+        _initialize_v(self._sv, dim=d, bits=self.bits)
+
+        if not scramble:
+            self._shift: np.ndarray = np.zeros(d, dtype=self.dtype_i)
+        else:
+            # scramble self._shift and self._sv
+            self._scramble()
+
+        # Working state of the generator; `reset` restores it from `_shift`.
+        self._quasi = self._shift.copy()
+
+        # normalization constant with the largest possible number
+        # calculate in Python to not overflow int with 2**64
+        self._scale = 1.0 / 2 ** self.bits
+
+        self._first_point = (self._quasi * self._scale).reshape(1, -1)
+        # explicit casting to float64
+        self._first_point = self._first_point.astype(np.float64)
+
+    def _scramble(self) -> None:
+        """Scramble the sequence using LMS+shift."""
+        # Generate shift vector
+        self._shift = np.dot(
+            rng_integers(self.rng, 2, size=(self.d, self.bits),
+                         dtype=self.dtype_i),
+            2 ** np.arange(self.bits, dtype=self.dtype_i),
+        )
+        # Generate lower triangular matrices (stacked across dimensions)
+        ltm = np.tril(rng_integers(self.rng, 2,
+                                   size=(self.d, self.bits, self.bits),
+                                   dtype=self.dtype_i))
+        _cscramble(
+            dim=self.d, bits=self.bits,  # type: ignore[arg-type]
+            ltm=ltm, sv=self._sv
+        )
+
+    def _random(
+        self, n: IntNumber = 1, *, workers: IntNumber = 1
+    ) -> np.ndarray:
+        """Draw next point(s) in the Sobol' sequence.
+
+        Parameters
+        ----------
+        n : int, optional
+            Number of samples to generate in the parameter space. Default is 1.
+
+        Returns
+        -------
+        sample : array_like (n, d)
+            Sobol' sample.
+
+        Raises
+        ------
+        ValueError
+            If the total number of points drawn would exceed ``2**bits``.
+
+        Warns
+        -----
+        UserWarning
+            If these are the first points drawn and `n` is not a power of 2.
+
+        """
+        sample: np.ndarray = np.empty((n, self.d), dtype=np.float64)
+
+        if n == 0:
+            return sample
+
+        total_n = self.num_generated + n
+        if total_n > self.maxn:
+            msg = (
+                f"At most 2**{self.bits}={self.maxn} distinct points can be "
+                f"generated. {self.num_generated} points have been previously "
+                f"generated, then: n={self.num_generated}+{n}={total_n}. "
+            )
+            if self.bits != 64:
+                msg += "Consider increasing `bits`."
+            raise ValueError(msg)
+
+        if self.num_generated == 0:
+            # verify n is a power of 2
+            if not (n & (n - 1) == 0):
+                warnings.warn("The balance properties of Sobol' points require"
+                              " n to be a power of 2.", stacklevel=3)
+
+            if n == 1:
+                # Hand out a copy: returning the cached array itself would
+                # let callers modify the engine's first point in place.
+                sample = self._first_point.copy()
+            else:
+                # The first point is precomputed, so only n - 1 new points
+                # are drawn and the first point is prepended.
+                _draw(
+                    n=n - 1, num_gen=self.num_generated, dim=self.d,
+                    scale=self._scale, sv=self._sv, quasi=self._quasi,
+                    sample=sample
+                )
+                sample = np.concatenate(
+                    [self._first_point, sample]
+                )[:n]
+        else:
+            _draw(
+                n=n, num_gen=self.num_generated - 1, dim=self.d,
+                scale=self._scale, sv=self._sv, quasi=self._quasi,
+                sample=sample
+            )
+
+        return sample
+
+    def random_base2(self, m: IntNumber) -> np.ndarray:
+        """Draw point(s) from the Sobol' sequence.
+
+        This function draws :math:`n=2^m` points in the parameter space
+        ensuring the balance properties of the sequence.
+
+        Parameters
+        ----------
+        m : int
+            Logarithm in base 2 of the number of samples; i.e., n = 2^m.
+
+        Returns
+        -------
+        sample : array_like (n, d)
+            Sobol' sample.
+
+        Raises
+        ------
+        ValueError
+            If the number of points already generated plus ``2**m`` is not
+            a power of 2.
+
+        """
+        n = 2 ** m
+
+        total_n = self.num_generated + n
+        if not (total_n & (total_n - 1) == 0):
+            raise ValueError('The balance properties of Sobol\' points require '
+                             f'n to be a power of 2. {self.num_generated} points '
+                             'have been previously generated, then: '
+                             f'n={self.num_generated}+2**{m}={total_n}. '
+                             'If you still want to do this, the function '
+                             '\'Sobol.random()\' can be used.'
+                             )
+
+        return self.random(n)
+
+    def reset(self) -> "Sobol":
+        """Reset the engine to base state.
+
+        Returns
+        -------
+        engine : Sobol
+            Engine reset to its base state.
+
+        """
+        super().reset()
+        self._quasi = self._shift.copy()
+        return self
+
+    def fast_forward(self, n: IntNumber) -> "Sobol":
+        """Fast-forward the sequence by `n` positions.
+
+        Parameters
+        ----------
+        n : int
+            Number of points to skip in the sequence.
+
+        Returns
+        -------
+        engine : Sobol
+            The fast-forwarded engine.
+
+        """
+        if n == 0:
+            # Nothing to skip. This also avoids handing ``n - 1 == -1`` to
+            # `_fast_forward` when no point has been generated yet.
+            return self
+
+        if self.num_generated == 0:
+            # The first point is precomputed, so one fewer step is needed.
+            _fast_forward(
+                n=n - 1, num_gen=self.num_generated, dim=self.d,
+                sv=self._sv, quasi=self._quasi
+            )
+        else:
+            _fast_forward(
+                n=n, num_gen=self.num_generated - 1, dim=self.d,
+                sv=self._sv, quasi=self._quasi
+            )
+        self.num_generated += n
+        return self
+
+
+class PoissonDisk(QMCEngine):
+    """Poisson disk sampling.
+
+    Parameters
+    ----------
+    d : int
+        Dimension of the parameter space.
+    radius : float
+        Minimal distance to keep between points when sampling new candidates.
+    hypersphere : {"volume", "surface"}, optional
+        Sampling strategy to generate potential candidates to be added in the
+        final sample. Default is "volume".
+
+        * ``volume``: original Bridson algorithm as described in [1]_.
+          New candidates are sampled *within* the hypersphere.
+        * ``surface``: only sample the surface of the hypersphere.
+    ncandidates : int
+        Number of candidates to sample per iteration. More candidates result
+        in a denser sampling as more candidates can be accepted per iteration.
+    optimization : {None, "random-cd", "lloyd"}, optional
+        Whether to use an optimization scheme to improve the quality after
+        sampling. Note that this is a post-processing step that does not
+        guarantee that all properties of the sample will be conserved.
+        Default is None.
+
+        * ``random-cd``: random permutations of coordinates to lower the
+          centered discrepancy. The best sample based on the centered
+          discrepancy is constantly updated. Centered discrepancy-based
+          sampling shows better space-filling robustness toward 2D and 3D
+          subprojections compared to using other discrepancy measures.
+        * ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm.
+          The process converges to equally spaced samples.
+
+        .. versionadded:: 1.10.0
+
+    rng : `numpy.random.Generator`, optional
+        Pseudorandom number generator state. When `rng` is None, a new
+        `numpy.random.Generator` is created using entropy from the
+        operating system. Types other than `numpy.random.Generator` are
+        passed to `numpy.random.default_rng` to instantiate a ``Generator``.
+
+        .. versionchanged:: 1.15.0
+
+            As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
+            transition from use of `numpy.random.RandomState` to
+            `numpy.random.Generator`, this keyword was changed from `seed` to
+            `rng`. For an interim period, both keywords will continue to work, although
+            only one may be specified at a time. After the interim period, function
+            calls using the `seed` keyword will emit warnings. Following a
+            deprecation period, the `seed` keyword will be removed.
+
+    l_bounds, u_bounds : array_like (d,)
+        Lower and upper bounds of target sample data.
+
+    Notes
+    -----
+    Poisson disk sampling is an iterative sampling strategy. Starting from
+    a seed sample, `ncandidates` are sampled in the hypersphere
+    surrounding the seed. Candidates below a certain `radius` or outside the
+    domain are rejected. New samples are added in a pool of sample seed. The
+    process stops when the pool is empty or when the number of required
+    samples is reached.
+
+    The maximum number of points that a sample can contain is directly linked
+    to the `radius`. As the dimension of the space increases, a higher radius
+    spreads the points further and helps overcome the curse of dimensionality.
+    See the :ref:`quasi monte carlo tutorial <quasi-monte-carlo>` for more
+    details.
+
+    .. warning::
+
+       The algorithm is more suitable for low dimensions and sampling size
+       due to its iterative nature and memory requirements.
+       Selecting a small radius with a high dimension would
+       mean that the space could contain more samples than using lower
+       dimension or a bigger radius.
+
+    Some code taken from [2]_, written consent given on 31.03.2021
+    by the original author, Shamis, for free use in SciPy under
+    the 3-clause BSD.
+
+    References
+    ----------
+    .. [1] Robert Bridson, "Fast Poisson Disk Sampling in Arbitrary
+       Dimensions." SIGGRAPH, 2007.
+    .. [2] `StackOverflow <https://stackoverflow.com/questions/66047540>`__.
+
+    Examples
+    --------
+    Generate a 2D sample using a `radius` of 0.2.
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from matplotlib.collections import PatchCollection
+    >>> from scipy.stats import qmc
+    >>>
+    >>> rng = np.random.default_rng()
+    >>> radius = 0.2
+    >>> engine = qmc.PoissonDisk(d=2, radius=radius, rng=rng)
+    >>> sample = engine.random(20)
+
+    Visualizing the 2D sample and showing that no points are closer than
+    `radius`. ``radius/2`` is used to visualize non-intersecting circles.
+    If two samples are exactly at `radius` from each other, then their circle
+    of radius ``radius/2`` will touch.
+
+    >>> fig, ax = plt.subplots()
+    >>> _ = ax.scatter(sample[:, 0], sample[:, 1])
+    >>> circles = [plt.Circle((xi, yi), radius=radius/2, fill=False)
+    ...            for xi, yi in sample]
+    >>> collection = PatchCollection(circles, match_original=True)
+    >>> ax.add_collection(collection)
+    >>> _ = ax.set(aspect='equal', xlabel=r'$x_1$', ylabel=r'$x_2$',
+    ...            xlim=[0, 1], ylim=[0, 1])
+    >>> plt.show()
+
+    Such visualization can be seen as circle packing: how many circles can
+    we put in the space. It is an NP-hard problem. The method `fill_space`
+    can be used to add samples until no more samples can be added. This is
+    a hard problem and parameters may need to be adjusted manually. Beware of
+    the dimension: as the dimensionality increases, the number of samples
+    required to fill the space increases exponentially
+    (curse-of-dimensionality).
+
+    """
+
+    @_transition_to_rng('seed', replace_doc=False)
+    def __init__(
+        self,
+        d: IntNumber,
+        *,
+        radius: DecimalNumber = 0.05,
+        hypersphere: Literal["volume", "surface"] = "volume",
+        ncandidates: IntNumber = 30,
+        optimization: Literal["random-cd", "lloyd"] | None = None,
+        rng: SeedType = None,
+        l_bounds: "npt.ArrayLike | None" = None,
+        u_bounds: "npt.ArrayLike | None" = None,
+    ) -> None:
+        # Used in `scipy.integrate.qmc_quad`
+        self._init_quad = {'d': d, 'radius': radius,
+                           'hypersphere': hypersphere,
+                           'ncandidates': ncandidates,
+                           'optimization': optimization}
+        super()._initialize(d=d, optimization=optimization, rng=rng)
+
+        hypersphere_sample = {
+            "volume": self._hypersphere_volume_sample,
+            "surface": self._hypersphere_surface_sample
+        }
+
+        try:
+            self.hypersphere_method = hypersphere_sample[hypersphere]
+        except KeyError as exc:
+            message = (
+                f"{hypersphere!r} is not a valid hypersphere sampling"
+                f" method. It must be one of {set(hypersphere_sample)!r}")
+            raise ValueError(message) from exc
+
+        # size of the sphere from which the samples are drawn relative to the
+        # size of a disk (radius)
+        # for the surface sampler, all new points are almost exactly 1 radius
+        # away from at least one existing sample +eps to avoid rejection
+        self.radius_factor = 2 if hypersphere == "volume" else 1.001
+        self.radius = radius
+        self.radius_squared = self.radius**2
+
+        # sample to generate per iteration in the hypersphere around center
+        self.ncandidates = ncandidates
+
+        if u_bounds is None:
+            u_bounds = np.ones(d)
+        if l_bounds is None:
+            l_bounds = np.zeros(d)
+        self.l_bounds, self.u_bounds = _validate_bounds(
+            l_bounds=l_bounds, u_bounds=u_bounds, d=int(d)
+        )
+
+        # Background grid used to look up nearby samples: cells have a
+        # diagonal of length `radius`.
+        with np.errstate(divide='ignore'):
+            self.cell_size = self.radius / np.sqrt(self.d)
+            self.grid_size = (
+                np.ceil((self.u_bounds - self.l_bounds) / self.cell_size)
+            ).astype(int)
+
+        self._initialize_grid_pool()
+
+    def _initialize_grid_pool(self):
+        """Sampling pool and sample grid."""
+        self.sample_pool = []
+        # Positions of cells
+        # n-dim value for each grid cell
+        self.sample_grid = np.empty(
+            np.append(self.grid_size, self.d),
+            dtype=np.float32
+        )
+        # Initialise empty cells with NaNs
+        self.sample_grid.fill(np.nan)
+
+    def _random(
+        self, n: IntNumber = 1, *, workers: IntNumber = 1
+    ) -> np.ndarray:
+        """Draw `n` samples in the interval ``[l_bounds, u_bounds]``.
+
+        Note that it can return fewer samples if the space is full.
+        See the note section of the class.
+
+        Parameters
+        ----------
+        n : int, optional
+            Number of samples to generate in the parameter space. Default is 1.
+
+        Returns
+        -------
+        sample : array_like (n, d)
+            QMC sample.
+
+        """
+        if n == 0 or self.d == 0:
+            return np.empty((n, self.d))
+
+        def cell_index(point: np.ndarray) -> np.ndarray:
+            """Index of the grid cell holding `point`."""
+            idx = ((point - self.l_bounds) / self.cell_size).astype(int)
+            # A coordinate lying exactly on an upper bound maps one past the
+            # last cell when the extent is a whole number of cells. Such
+            # points pass `in_limits`, so fold them into the last cell.
+            return np.minimum(idx, self.grid_size - 1)
+
+        def in_limits(sample: np.ndarray) -> bool:
+            """True unless a coordinate lies strictly outside the bounds."""
+            outside = (sample > self.u_bounds) | (sample < self.l_bounds)
+            return not np.any(outside)
+
+        def in_neighborhood(candidate: np.ndarray, width: int = 2) -> bool:
+            """
+            Check if there are samples closer than ``radius_squared`` to the
+            `candidate` sample.
+
+            Only the cells within `width` cells of the candidate's own cell
+            are inspected.
+            """
+            indices = cell_index(candidate)
+            ind_min = np.maximum(indices - width, 0)
+            ind_max = np.minimum(indices + width + 1, self.grid_size)
+
+            # Check if the center cell is empty
+            if not np.isnan(self.sample_grid[tuple(indices)][0]):
+                return True
+
+            window = tuple(
+                slice(ind_min[i], ind_max[i]) for i in range(self.d)
+            )
+
+            # guards against: invalid value encountered in less as we are
+            # comparing with nan and returns False. Which is wanted.
+            with np.errstate(invalid='ignore'):
+                sq_dist = np.sum(
+                    np.square(candidate - self.sample_grid[window]),
+                    axis=self.d
+                )
+                return bool(np.any(sq_dist < self.radius_squared))
+
+        def add_sample(candidate: np.ndarray) -> None:
+            """Record `candidate` in the pool, the grid and the output."""
+            self.sample_pool.append(candidate)
+            self.sample_grid[tuple(cell_index(candidate))] = candidate
+            curr_sample.append(candidate)
+
+        curr_sample: list[np.ndarray] = []
+
+        if len(self.sample_pool) == 0:
+            # the pool is being initialized with a single random sample
+            add_sample(self.rng.uniform(self.l_bounds, self.u_bounds))
+            num_drawn = 1
+        else:
+            num_drawn = 0
+
+        # exhaust sample pool to have up to n sample
+        while self.sample_pool and num_drawn < n:
+            # select a sample from the available pool
+            idx_center = rng_integers(self.rng, len(self.sample_pool))
+            center = self.sample_pool[idx_center]
+            del self.sample_pool[idx_center]
+
+            # generate candidates around the center sample
+            candidates = self.hypersphere_method(
+                center, self.radius * self.radius_factor, self.ncandidates
+            )
+
+            # keep candidates that satisfy some conditions
+            for candidate in candidates:
+                if in_limits(candidate) and not in_neighborhood(candidate):
+                    add_sample(candidate)
+
+                    num_drawn += 1
+                    if num_drawn >= n:
+                        break
+
+        self.num_generated += num_drawn
+        return np.array(curr_sample)
+
+    def fill_space(self) -> np.ndarray:
+        """Draw ``n`` samples in the interval ``[l_bounds, u_bounds]``.
+
+        Unlike `random`, this method will try to add points until
+        the space is full. Depending on `ncandidates` (and to a lesser extent
+        other parameters), some empty areas can still be present in the sample.
+
+        .. warning::
+
+           This can be extremely slow in high dimensions or if the
+           ``radius`` is very small with respect to the dimensionality.
+
+        Returns
+        -------
+        sample : array_like (n, d)
+            QMC sample.
+
+        """
+        return self.random(np.inf)  # type: ignore[arg-type]
+
+    def reset(self) -> "PoissonDisk":
+        """Reset the engine to base state.
+
+        Returns
+        -------
+        engine : PoissonDisk
+            Engine reset to its base state.
+
+        """
+        super().reset()
+        self._initialize_grid_pool()
+        return self
+
+    def _hypersphere_volume_sample(
+        self, center: np.ndarray, radius: DecimalNumber,
+        candidates: IntNumber = 1
+    ) -> np.ndarray:
+        """Uniform sampling within hypersphere."""
+        # should remove samples within r/2
+        x = self.rng.standard_normal(size=(candidates, self.d))
+        ssq = np.sum(x**2, axis=1)
+        # Per-candidate factor applied to the normal draw `x`.
+        fr = radius * gammainc(self.d/2, ssq/2)**(1/self.d) / np.sqrt(ssq)
+        # Broadcast the factor across the d coordinates of each candidate.
+        p = center + x * fr.reshape(-1, 1)
+        return p
+
+    def _hypersphere_surface_sample(
+        self, center: np.ndarray, radius: DecimalNumber,
+        candidates: IntNumber = 1
+    ) -> np.ndarray:
+        """Uniform sampling on the hypersphere's surface."""
+        vec = self.rng.standard_normal(size=(candidates, self.d))
+        # Normalise each direction to unit length, then scale by `radius`.
+        vec /= np.linalg.norm(vec, axis=1)[:, None]
+        p = center + np.multiply(vec, radius)
+        return p
+
+
+class MultivariateNormalQMC:
+    r"""QMC sampling from a multivariate Normal :math:`N(\mu, \Sigma)`.
+
+    Parameters
+    ----------
+    mean : array_like (d,)
+        The mean vector. Where ``d`` is the dimension.
+    cov : array_like (d, d), optional
+        The covariance matrix. If omitted, use `cov_root` instead.
+        If both `cov` and `cov_root` are omitted, use the identity matrix.
+    cov_root : array_like (d, d'), optional
+        A root decomposition of the covariance matrix, where ``d'`` may be less
+        than ``d`` if the covariance is not full rank. If omitted, use `cov`.
+    inv_transform : bool, optional
+        If True, use inverse transform instead of Box-Muller. Default is True.
+    engine : QMCEngine, optional
+        Quasi-Monte Carlo engine sampler. If None, `Sobol` is used.
+    rng : `numpy.random.Generator`, optional
+        Pseudorandom number generator state. When `rng` is None, a new
+        `numpy.random.Generator` is created using entropy from the
+        operating system. Types other than `numpy.random.Generator` are
+        passed to `numpy.random.default_rng` to instantiate a ``Generator``.
+
+        .. versionchanged:: 1.15.0
+
+            As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
+            transition from use of `numpy.random.RandomState` to
+            `numpy.random.Generator`, this keyword was changed from `seed` to
+            `rng`. For an interim period, both keywords will continue to work, although
+            only one may be specified at a time. After the interim period, function
+            calls using the `seed` keyword will emit warnings. Following a
+            deprecation period, the `seed` keyword will be removed.
+
+    Examples
+    --------
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.stats import qmc
+    >>> dist = qmc.MultivariateNormalQMC(mean=[0, 5], cov=[[1, 0], [0, 1]])
+    >>> sample = dist.random(512)
+    >>> _ = plt.scatter(sample[:, 0], sample[:, 1])
+    >>> plt.show()
+
+    """
+
+    @_transition_to_rng('seed', replace_doc=False)
+    def __init__(
+            self,
+            mean: "npt.ArrayLike",
+            cov: "npt.ArrayLike | None" = None,
+            *,
+            cov_root: "npt.ArrayLike | None" = None,
+            inv_transform: bool = True,
+            engine: QMCEngine | None = None,
+            rng: SeedType = None,
+    ) -> None:
+        mean = np.asarray(np.atleast_1d(mean))
+        d = mean.shape[0]
+        if cov is not None:
+            # covariance matrix provided
+            cov = np.asarray(np.atleast_2d(cov))
+            # check for square/symmetric cov matrix and mean vector has the
+            # same d
+            if not mean.shape[0] == cov.shape[0]:
+                raise ValueError("Dimension mismatch between mean and "
+                                 "covariance.")
+            if not np.allclose(cov, cov.transpose()):
+                raise ValueError("Covariance matrix is not symmetric.")
+            # compute Cholesky decomp; if it fails, do the eigen decomposition
+            try:
+                cov_root = np.linalg.cholesky(cov).transpose()
+            except np.linalg.LinAlgError:
+                eigval, eigvec = np.linalg.eigh(cov)
+                if not np.all(eigval >= -1.0e-8):
+                    raise ValueError("Covariance matrix not PSD.")
+                eigval = np.clip(eigval, 0.0, None)
+                cov_root = (eigvec * np.sqrt(eigval)).transpose()
+        elif cov_root is not None:
+            # root decomposition provided
+            cov_root = np.atleast_2d(cov_root)
+            if not mean.shape[0] == cov_root.shape[0]:
+                raise ValueError("Dimension mismatch between mean and "
+                                 "covariance.")
+        else:
+            # corresponds to identity covariance matrix
+            cov_root = None
+
+        self._inv_transform = inv_transform
+
+        if not inv_transform:
+            # to apply Box-Muller, we need an even number of dimensions
+            engine_dim = 2 * math.ceil(d / 2)
+        else:
+            engine_dim = d
+        if engine is None:
+            # Need this during SPEC 7 transition to prevent `RandomState`
+            # from being passed via `rng`.
+            kwarg = "seed" if isinstance(rng, np.random.RandomState) else "rng"
+            kwargs = {kwarg: rng}
+            self.engine = Sobol(
+                d=engine_dim, scramble=True, bits=30, **kwargs
+            )  # type: QMCEngine
+        elif isinstance(engine, QMCEngine):
+            if engine.d != engine_dim:
+                raise ValueError("Dimension of `engine` must be consistent"
+                                 " with dimensions of mean and covariance."
+                                 " If `inv_transform` is False, it must be"
+                                 " an even number.")
+            self.engine = engine
+        else:
+            raise ValueError("`engine` must be an instance of "
+                             "`scipy.stats.qmc.QMCEngine` or `None`.")
+
+        self._mean = mean
+        self._corr_matrix = cov_root
+
+        self._d = d
+
+    def random(self, n: IntNumber = 1) -> np.ndarray:
+        """Draw `n` QMC samples from the multivariate Normal.
+
+        Parameters
+        ----------
+        n : int, optional
+            Number of samples to generate in the parameter space. Default is 1.
+
+        Returns
+        -------
+        sample : array_like (n, d)
+            Sample.
+
+        """
+        base_samples = self._standard_normal_samples(n)
+        return self._correlate(base_samples)
+
+    def _correlate(self, base_samples: np.ndarray) -> np.ndarray:
+        if self._corr_matrix is not None:
+            return base_samples @ self._corr_matrix + self._mean
+        else:
+            # avoid multiplying with identity here
+            return base_samples + self._mean
+
+    def _standard_normal_samples(self, n: IntNumber = 1) -> np.ndarray:
+        """Draw `n` QMC samples from the standard Normal :math:`N(0, I_d)`.
+
+        Parameters
+        ----------
+        n : int, optional
+            Number of samples to generate in the parameter space. Default is 1.
+
+        Returns
+        -------
+        sample : array_like (n, d)
+            Sample.
+
+        """
+        # get base samples
+        samples = self.engine.random(n)
+        if self._inv_transform:
+            # apply inverse transform
+            # (values too close to 0/1 result in inf values)
+            return stats.norm.ppf(0.5 + (1 - 1e-10) * (samples - 0.5))  # type: ignore[attr-defined]  # noqa: E501
+        else:
+            # apply Box-Muller transform (note: indexes starting from 1)
+            even = np.arange(0, samples.shape[-1], 2)
+            Rs = np.sqrt(-2 * np.log(samples[:, even]))
+            thetas = 2 * math.pi * samples[:, 1 + even]
+            cos = np.cos(thetas)
+            sin = np.sin(thetas)
+            transf_samples = np.stack([Rs * cos, Rs * sin],
+                                      -1).reshape(n, -1)
+            # make sure we only return the number of dimension requested
+            return transf_samples[:, : self._d]
+
+
+class MultinomialQMC:
+    r"""QMC sampling from a multinomial distribution.
+
+    Parameters
+    ----------
+    pvals : array_like (k,)
+        Vector of probabilities of size ``k``, where ``k`` is the number
+        of categories. Elements must be non-negative and sum to 1.
+    n_trials : int
+        Number of trials.
+    engine : QMCEngine, optional
+        Quasi-Monte Carlo engine sampler. If None, `Sobol` is used.
+    rng : `numpy.random.Generator`, optional
+        Pseudorandom number generator state. When `rng` is None, a new
+        `numpy.random.Generator` is created using entropy from the
+        operating system. Types other than `numpy.random.Generator` are
+        passed to `numpy.random.default_rng` to instantiate a ``Generator``.
+
+        .. versionchanged:: 1.15.0
+
+            As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
+            transition from use of `numpy.random.RandomState` to
+            `numpy.random.Generator`, this keyword was changed from `seed` to
+            `rng`. For an interim period, both keywords will continue to work, although
+            only one may be specified at a time. After the interim period, function
+            calls using the `seed` keyword will emit warnings. Following a
+            deprecation period, the `seed` keyword will be removed.
+
+    Examples
+    --------
+    Let's define 3 categories and for a given sample, the sum of the trials
+    of each category is 10. The number of trials per category is determined
+    by the `pvals` associated to each category.
+    Then, we sample this distribution 64 times.
+
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.stats import qmc
+    >>> dist = qmc.MultinomialQMC(
+    ...     pvals=[0.2, 0.4, 0.4], n_trials=10, engine=qmc.Halton(d=1)
+    ... )
+    >>> sample = dist.random(64)
+
+    We can plot the sample and verify that the median number of trials
+    for each category follows the `pvals`. That would be
+    ``pvals * n_trials = [2, 4, 4]``.
+
+    >>> fig, ax = plt.subplots()
+    >>> ax.yaxis.get_major_locator().set_params(integer=True)
+    >>> _ = ax.boxplot(sample)
+    >>> ax.set(xlabel="Categories", ylabel="Trials")
+    >>> plt.show()
+
+    """
+
+    @_transition_to_rng('seed', replace_doc=False)
+    def __init__(
+        self,
+        pvals: "npt.ArrayLike",
+        n_trials: IntNumber,
+        *,
+        engine: QMCEngine | None = None,
+        rng: SeedType = None,
+    ) -> None:
+        self.pvals = np.atleast_1d(np.asarray(pvals))
+        if np.min(pvals) < 0:
+            raise ValueError('Elements of pvals must be non-negative.')
+        if not np.isclose(np.sum(pvals), 1):
+            raise ValueError('Elements of pvals must sum to 1.')
+        self.n_trials = n_trials
+        if engine is None:
+            # Need this during SPEC 7 transition to prevent `RandomState`
+            # from being passed via `rng`.
+            kwarg = "seed" if isinstance(rng, np.random.RandomState) else "rng"
+            kwargs = {kwarg: rng}
+            self.engine = Sobol(
+                d=1, scramble=True, bits=30, **kwargs
+            )  # type: QMCEngine
+        elif isinstance(engine, QMCEngine):
+            if engine.d != 1:
+                raise ValueError("Dimension of `engine` must be 1.")
+            self.engine = engine
+        else:
+            raise ValueError("`engine` must be an instance of "
+                             "`scipy.stats.qmc.QMCEngine` or `None`.")
+
+    def random(self, n: IntNumber = 1) -> np.ndarray:
+        """Draw `n` QMC samples from the multinomial distribution.
+
+        Parameters
+        ----------
+        n : int, optional
+            Number of samples to generate in the parameter space. Default is 1.
+
+        Returns
+        -------
+        samples : array_like (n, k)
+            Sample, with ``k = len(pvals)`` the number of categories.
+
+        """
+        sample = np.empty((n, len(self.pvals)))
+        for i in range(n):
+            base_draws = self.engine.random(self.n_trials).ravel()
+            p_cumulative = np.empty_like(self.pvals, dtype=float)
+            _fill_p_cumulative(np.array(self.pvals, dtype=float), p_cumulative)
+            sample_ = np.zeros_like(self.pvals, dtype=np.intp)
+            _categorize(base_draws, p_cumulative, sample_)
+            sample[i] = sample_
+        return sample
+
+
+def _select_optimizer(
+    optimization: Literal["random-cd", "lloyd"] | None, config: dict
+) -> Callable | None:
+    """A factory for optimization methods."""
+    optimization_method: dict[str, Callable] = {
+        "random-cd": _random_cd,
+        "lloyd": _lloyd_centroidal_voronoi_tessellation
+    }
+
+    optimizer: partial | None
+    if optimization is not None:
+        try:
+            optimization = optimization.lower()  # type: ignore[assignment]
+            optimizer_ = optimization_method[optimization]
+        except KeyError as exc:
+            message = (f"{optimization!r} is not a valid optimization"
+                       f" method. It must be one of"
+                       f" {set(optimization_method)!r}")
+            raise ValueError(message) from exc
+
+        # config
+        optimizer = partial(optimizer_, **config)
+    else:
+        optimizer = None
+
+    return optimizer
+
+
+def _random_cd(
+    best_sample: np.ndarray, n_iters: int, n_nochange: int, rng: GeneratorType,
+    **kwargs: dict
+) -> np.ndarray:
+    """Optimal LHS on CD.
+
+    Create a base LHS and do random permutations of coordinates to
+    lower the centered discrepancy.
+    Because it starts with a normal LHS, it also works with the
+    `scramble` keyword argument.
+
+    Two stopping criteria are used to stop the algorithm: at most,
+    `n_iters` iterations are performed; or if there is no improvement
+    for `n_nochange` consecutive iterations.
+    """
+    del kwargs  # only use keywords which are defined, needed by factory
+
+    n, d = best_sample.shape
+
+    if d == 0 or n == 0:
+        return np.empty((n, d))
+
+    if d == 1 or n == 1:
+        # discrepancy measures are invariant under permuting factors and runs
+        return best_sample
+
+    best_disc = discrepancy(best_sample)
+
+    bounds = ([0, d - 1],
+              [0, n - 1],
+              [0, n - 1])
+
+    n_nochange_ = 0
+    n_iters_ = 0
+    while n_nochange_ < n_nochange and n_iters_ < n_iters:
+        n_iters_ += 1
+
+        col = rng_integers(rng, *bounds[0], endpoint=True)  # type: ignore[misc]
+        row_1 = rng_integers(rng, *bounds[1], endpoint=True)  # type: ignore[misc]
+        row_2 = rng_integers(rng, *bounds[2], endpoint=True)  # type: ignore[misc]
+        disc = _perturb_discrepancy(best_sample,
+                                    row_1, row_2, col,
+                                    best_disc)
+        if disc < best_disc:
+            best_sample[row_1, col], best_sample[row_2, col] = (
+                best_sample[row_2, col], best_sample[row_1, col])
+
+            best_disc = disc
+            n_nochange_ = 0
+        else:
+            n_nochange_ += 1
+
+    return best_sample
+
+
+def _l1_norm(sample: np.ndarray) -> float:
+    return distance.pdist(sample, 'cityblock').min()
+
+
+def _lloyd_iteration(
+    sample: np.ndarray,
+    decay: float,
+    qhull_options: str
+) -> np.ndarray:
+    """Lloyd-Max algorithm iteration.
+
+    Based on the implementation of Stéfan van der Walt:
+
+    https://github.com/stefanv/lloyd
+
+    which is:
+
+        Copyright (c) 2021-04-21 Stéfan van der Walt
+        https://github.com/stefanv/lloyd
+        MIT License
+
+    Parameters
+    ----------
+    sample : array_like (n, d)
+        The sample to iterate on.
+    decay : float
+        Relaxation decay. A positive value would move the samples toward
+        their centroid, and negative value would move them away.
+        1 would move the samples to their centroid.
+    qhull_options : str
+        Additional options to pass to Qhull. See Qhull manual
+        for details. (Default: "Qbb Qc Qz QJ Qx" for ndim > 4 and
+        "Qbb Qc Qz QJ" otherwise.)
+
+    Returns
+    -------
+    sample : array_like (n, d)
+        The sample after an iteration of Lloyd's algorithm.
+
+    """
+    new_sample = np.empty_like(sample)
+
+    voronoi = Voronoi(sample, qhull_options=qhull_options)
+
+    for ii, idx in enumerate(voronoi.point_region):
+        # the region is a series of indices into voronoi.vertices
+        # remove vertices at infinity, designated by index -1
+        region = [i for i in voronoi.regions[idx] if i != -1]
+
+        # get the vertices for this region
+        verts = voronoi.vertices[region]
+
+        # clipping would be wrong, we need to intersect
+        # verts = np.clip(verts, 0, 1)
+
+        # move samples towards centroids:
+        # Centroid in n-D is the mean for uniformly distributed nodes
+        # of a geometry.
+        centroid = np.mean(verts, axis=0)
+        new_sample[ii] = sample[ii] + (centroid - sample[ii]) * decay
+
+    # only update sample to centroid within the region
+    is_valid = np.all(np.logical_and(new_sample >= 0, new_sample <= 1), axis=1)
+    sample[is_valid] = new_sample[is_valid]
+
+    return sample
+
+
+def _lloyd_centroidal_voronoi_tessellation(
+    sample: "npt.ArrayLike",
+    *,
+    tol: DecimalNumber = 1e-5,
+    maxiter: IntNumber = 10,
+    qhull_options: str | None = None,
+    **kwargs: dict
+) -> np.ndarray:
+    """Approximate Centroidal Voronoi Tessellation.
+
+    Perturb samples in N-dimensions using Lloyd-Max algorithm.
+
+    Parameters
+    ----------
+    sample : array_like (n, d)
+        The sample to iterate on. With ``n`` the number of samples and ``d``
+        the dimension. Samples must be in :math:`[0, 1]^d`, with ``d>=2``.
+    tol : float, optional
+        Tolerance for termination. If the min of the L1-norm over the samples
+        changes less than `tol`, it stops the algorithm. Default is 1e-5.
+    maxiter : int, optional
+        Maximum number of iterations. It will stop the algorithm even if
+        `tol` is above the threshold.
+        Too many iterations tend to cluster the samples as a hypersphere.
+        Default is 10.
+    qhull_options : str, optional
+        Additional options to pass to Qhull. See Qhull manual
+        for details. (Default: "Qbb Qc Qz QJ Qx" for ndim > 4 and
+        "Qbb Qc Qz QJ" otherwise.)
+
+    Returns
+    -------
+    sample : array_like (n, d)
+        The sample after being processed by Lloyd-Max algorithm.
+
+    Notes
+    -----
+    Lloyd-Max algorithm is an iterative process with the purpose of improving
+    the dispersion of samples. For given sample: (i) compute a Voronoi
+    Tessellation; (ii) find the centroid of each Voronoi cell; (iii) move the
+    samples toward the centroid of their respective cell. See [1]_, [2]_.
+
+    A relaxation factor is used to control how fast samples can move at each
+    iteration. This factor starts at 1.9 and decreases towards 1 over
+    `maxiter` iterations, following an exponential decay.
+
+    The process converges to equally spaced samples. It implies that measures
+    like the discrepancy could suffer from too many iterations. On the other
+    hand, L1 and L2 distances should improve. This is especially true with
+    QMC methods which tend to favor the discrepancy over other criteria.
+
+    .. note::
+
+        The current implementation does not intersect the Voronoi Tessellation
+        with the boundaries. This implies that for a low number of samples,
+        empirically below 20, no Voronoi cell is touching the boundaries.
+        Hence, samples cannot be moved close to the boundaries.
+
+        Further improvements could consider the samples at infinity so that
+        all boundaries are segments of some Voronoi cells. This would fix
+        the computation of the centroid position.
+
+    .. warning::
+
+       The Voronoi Tessellation step is expensive and quickly becomes
+       intractable with dimensions as low as 10 even for a sample
+       of size as low as 1000.
+
+    .. versionadded:: 1.9.0
+
+    References
+    ----------
+    .. [1] Lloyd. "Least Squares Quantization in PCM".
+       IEEE Transactions on Information Theory, 1982.
+    .. [2] Max J. "Quantizing for minimum distortion".
+       IEEE Transactions on Information Theory, 1960.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.spatial import distance
+    >>> from scipy.stats._qmc import _lloyd_centroidal_voronoi_tessellation
+    >>> rng = np.random.default_rng()
+    >>> sample = rng.random((128, 2))
+
+    .. note::
+
+        The samples need to be in :math:`[0, 1]^d`. `scipy.stats.qmc.scale`
+        can be used to scale the samples from their
+        original bounds to :math:`[0, 1]^d`. And back to their original bounds.
+
+    Compute the quality of the sample using the L1 criterion.
+
+    >>> def l1_norm(sample):
+    ...    return distance.pdist(sample, 'cityblock').min()
+
+    >>> l1_norm(sample)
+    0.00161...  # random
+
+    Now process the sample using Lloyd's algorithm and check the improvement
+    on the L1. The value should increase.
+
+    >>> sample = _lloyd_centroidal_voronoi_tessellation(sample)
+    >>> l1_norm(sample)
+    0.0278...  # random
+
+    """
+    del kwargs  # only use keywords which are defined, needed by factory
+
+    sample = np.asarray(sample).copy()
+
+    if not sample.ndim == 2:
+        raise ValueError('`sample` is not a 2D array')
+
+    if not sample.shape[1] >= 2:
+        raise ValueError('`sample` dimension is not >= 2')
+
+    # Checking that sample is within the hypercube
+    if (sample.max() > 1.) or (sample.min() < 0.):
+        raise ValueError('`sample` is not in unit hypercube')
+
+    if qhull_options is None:
+        qhull_options = 'Qbb Qc Qz QJ'
+
+        if sample.shape[1] >= 5:
+            qhull_options += ' Qx'
+
+    # Fit an exponential to be 1.9 at 0 and 1 at `maxiter`.
+    # The decay is used for relaxation.
+    # analytical solution for y=exp(-maxiter/x) - 0.1
+    root = -maxiter / np.log(0.1)
+    decay = [np.exp(-x / root)+0.9 for x in range(maxiter)]
+
+    l1_old = _l1_norm(sample=sample)
+    for i in range(maxiter):
+        sample = _lloyd_iteration(
+                sample=sample, decay=decay[i],
+                qhull_options=qhull_options,
+        )
+
+        l1_new = _l1_norm(sample=sample)
+
+        if abs(l1_new - l1_old) < tol:
+            break
+        else:
+            l1_old = l1_new
+
+    return sample
+
+
+def _validate_workers(workers: IntNumber = 1) -> IntNumber:
+    """Validate `workers` based on platform and value.
+
+    Parameters
+    ----------
+    workers : int, optional
+        Number of workers to use for parallel processing. If -1 is
+        given all CPU threads are used. Default is 1.
+
+    Returns
+    -------
+    workers : int
+        Number of CPUs used by the algorithm.
+
+    """
+    workers = int(workers)
+    if workers == -1:
+        workers = os.cpu_count()  # type: ignore[assignment]
+        if workers is None:
+            raise NotImplementedError(
+                "Cannot determine the number of cpus using os.cpu_count(), "
+                "cannot use -1 for the number of workers"
+            )
+    elif workers <= 0:
+        raise ValueError(f"Invalid number of workers: {workers}, must be -1 "
+                         "or > 0")
+
+    return workers
+
+
+def _validate_bounds(
+    l_bounds: "npt.ArrayLike", u_bounds: "npt.ArrayLike", d: int
+) -> "tuple[npt.NDArray[np.generic], npt.NDArray[np.generic]]":
+    """Bounds input validation.
+
+    Parameters
+    ----------
+    l_bounds, u_bounds : array_like (d,)
+        Lower and upper bounds.
+    d : int
+        Dimension to use for broadcasting.
+
+    Returns
+    -------
+    l_bounds, u_bounds : array_like (d,)
+        Lower and upper bounds.
+
+    """
+    try:
+        lower = np.broadcast_to(l_bounds, d)
+        upper = np.broadcast_to(u_bounds, d)
+    except ValueError as exc:
+        msg = ("'l_bounds' and 'u_bounds' must be broadcastable and respect"
+               " the sample dimension")
+        raise ValueError(msg) from exc
+
+    if not np.all(lower < upper):
+        raise ValueError("Bounds are not consistent 'l_bounds' < 'u_bounds'")
+
+    return lower, upper
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_qmc_cy.pyi b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_qmc_cy.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..1006385a43179478a9a4a32ae5f825aa5b8b35c4
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_qmc_cy.pyi
@@ -0,0 +1,54 @@
+import numpy as np
+from scipy._lib._util import DecimalNumber, IntNumber
+
+
+def _cy_wrapper_centered_discrepancy(
+        sample: np.ndarray, 
+        iterative: bool, 
+        workers: IntNumber,
+) -> float: ...
+
+
+def _cy_wrapper_wrap_around_discrepancy(
+        sample: np.ndarray,
+        iterative: bool, 
+        workers: IntNumber,
+) -> float: ...
+
+
+def _cy_wrapper_mixture_discrepancy(
+        sample: np.ndarray,
+        iterative: bool, 
+        workers: IntNumber,
+) -> float: ...
+
+
+def _cy_wrapper_l2_star_discrepancy(
+        sample: np.ndarray,
+        iterative: bool,
+        workers: IntNumber,
+) -> float: ...
+
+
+def _cy_wrapper_update_discrepancy(
+        x_new_view: np.ndarray,
+        sample_view: np.ndarray,
+        initial_disc: DecimalNumber,
+) -> float: ...
+
+
+def _cy_van_der_corput(
+        n: IntNumber,
+        base: IntNumber,
+        start_index: IntNumber,
+        workers: IntNumber,
+) -> np.ndarray: ...
+
+
+def _cy_van_der_corput_scrambled(
+        n: IntNumber,
+        base: IntNumber,
+        start_index: IntNumber,
+        permutations: np.ndarray,
+        workers: IntNumber,
+) -> np.ndarray: ...
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_qmvnt.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_qmvnt.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f0e4b9307536cfbe20536b5cc66f6d19f243105
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_qmvnt.py
@@ -0,0 +1,475 @@
+# Integration of multivariate normal and t distributions.
+
+# Adapted from the MATLAB original implementations by Dr. Alan Genz.
+
+#     http://www.math.wsu.edu/faculty/genz/software/software.html
+
+# Copyright (C) 2013, Alan Genz,  All rights reserved.
+# Python implementation is copyright (C) 2022, Robert Kern,  All rights
+# reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided the following conditions are met:
+#   1. Redistributions of source code must retain the above copyright
+#      notice, this list of conditions and the following disclaimer.
+#   2. Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in
+#      the documentation and/or other materials provided with the
+#      distribution.
+#   3. The contributor name(s) may not be used to endorse or promote
+#      products derived from this software without specific prior
+#      written permission.
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import math
+import numpy as np
+
+from scipy.fft import fft, ifft
+from scipy.special import ndtr as phi, ndtri as phinv
+from scipy.stats._qmc import primes_from_2_to
+from scipy.stats._stats_pythran import _bvnu
+
+from ._qmvnt_cy import _qmvn_inner, _qmvt_inner
+
+
+def _factorize_int(n):
+    """Return a sorted list of the unique prime factors of a positive integer.
+    """
+    # NOTE: There are much faster ways to do this, but this isn't terrible.
+    factors = set()
+    for p in primes_from_2_to(int(np.sqrt(n)) + 1):
+        while not (n % p):
+            factors.add(p)
+            n //= p
+        if n == 1:
+            break
+    if n != 1:
+        factors.add(n)
+    return sorted(factors)
+
+
+def _primitive_root(p):
+    """Compute a primitive root of the prime number `p`.
+
+    Used in the CBC lattice construction.
+
+    References
+    ----------
+    .. [1] https://en.wikipedia.org/wiki/Primitive_root_modulo_n
+    """
+    # p is prime
+    pm = p - 1
+    factors = _factorize_int(pm)
+    n = len(factors)
+    r = 2
+    k = 0
+    while k < n:
+        d = pm // factors[k]
+        # pow() doesn't like numpy scalar types.
+        rd = pow(int(r), int(d), int(p))
+        if rd == 1:
+            r += 1
+            k = 0
+        else:
+            k += 1
+    return r
+
+
+def _cbc_lattice(n_dim, n_qmc_samples):
+    """Compute a QMC lattice generator using a Fast CBC construction.
+
+    Parameters
+    ----------
+    n_dim : int > 0
+        The number of dimensions for the lattice.
+    n_qmc_samples : int > 0
+        The desired number of QMC samples. This will be rounded down to the
+        nearest prime to enable the CBC construction.
+
+    Returns
+    -------
+    q : float array : shape=(n_dim,)
+        The lattice generator vector. All values are in the open interval
+        ``(0, 1)``.
+    actual_n_qmc_samples : int
+        The prime number of QMC samples that must be used with this lattice,
+        no more, no less.
+
+    References
+    ----------
+    .. [1] Nuyens, D. and Cools, R. "Fast Component-by-Component Construction,
+           a Reprise for Different Kernels", In H. Niederreiter and D. Talay,
+           editors, Monte-Carlo and Quasi-Monte Carlo Methods 2004,
+           Springer-Verlag, 2006, 371-385.
+    """
+    # Round down to the nearest prime number.
+    primes = primes_from_2_to(n_qmc_samples + 1)
+    n_qmc_samples = primes[-1]
+
+    bt = np.ones(n_dim)
+    gm = np.hstack([1.0, 0.8 ** np.arange(n_dim - 1)])
+    q = 1
+    w = 0
+    z = np.arange(1, n_dim + 1)
+    m = (n_qmc_samples - 1) // 2
+    g = _primitive_root(n_qmc_samples)
+    # Slightly faster way to compute perm[j] = pow(g, j, n_qmc_samples)
+    # Shame that we don't have modulo pow() implemented as a ufunc.
+    perm = np.ones(m, dtype=int)
+    for j in range(m - 1):
+        perm[j + 1] = (g * perm[j]) % n_qmc_samples
+    perm = np.minimum(n_qmc_samples - perm, perm)
+    pn = perm / n_qmc_samples
+    c = pn * pn - pn + 1.0 / 6
+    fc = fft(c)
+    for s in range(1, n_dim):
+        reordered = np.hstack([
+            c[:w+1][::-1],
+            c[w+1:m][::-1],
+        ])
+        q = q * (bt[s-1] + gm[s-1] * reordered)
+        w = ifft(fc * fft(q)).real.argmin()
+        z[s] = perm[w]
+    q = z / n_qmc_samples
+    return q, n_qmc_samples
+
+
+def _qauto(func, covar, low, high, rng, error=1e-3, limit=10_000, **kwds):
+    """Automatically rerun the integration to get the required error bound.
+
+    Parameters
+    ----------
+    func : callable
+        Either :func:`_qmvn` or :func:`_qmvt`.
+    covar, low, high : array
+        As specified in :func:`_qmvn` and :func:`_qmvt`.
+    rng : Generator
+        Random number generator, forwarded to `func` as its ``rng`` argument.
+    error : float > 0
+        The desired error bound.
+    limit : int > 0
+        The rough limit of the number of integration points to consider. The
+        integration will stop looping once this limit has been *exceeded*.
+    **kwds :
+        Other keyword arguments to pass to `func`. When using :func:`_qmvt`, be
+        sure to include ``nu=`` as one of these.
+
+    Returns
+    -------
+    prob : float
+        The estimated probability mass within the bounds.
+    est_error : float
+        3 times the standard error of the batch estimates.
+    n_samples : int
+        The number of integration points actually used.
+    """
+    n = len(covar)
+    n_samples = 0
+    if n == 1:
+        prob = phi(high / covar**0.5) - phi(low / covar**0.5)
+        # More or less
+        est_error = 1e-15
+    elif n == 2:
+        prob = _bvn(low, high, covar)
+        est_error = 1e-15
+    else:
+        mi = min(limit, n * 1000)
+        prob = 0.0
+        est_error = 1.0
+        ei = 0.0
+        while est_error > error and n_samples < limit:
+            mi = round(np.sqrt(2) * mi)
+            pi, ei, ni = func(mi, covar, low, high, rng=rng, **kwds)
+            n_samples += ni
+            wt = 1.0 / (1 + (ei / est_error)**2)
+            prob += wt * (pi - prob)
+            est_error = np.sqrt(wt) * ei
+    return prob, est_error, n_samples
+
+
+def _qmvn(m, covar, low, high, rng, lattice='cbc', n_batches=10):
+    """Multivariate normal integration over box bounds.
+
+    Parameters
+    ----------
+    m : int > n_batches
+        The number of points to sample. This number will be divided into
+        `n_batches` batches that apply random offsets of the sampling lattice
+        for each batch in order to estimate the error.
+    covar : (n, n) float array
+        Possibly singular, positive semidefinite symmetric covariance matrix.
+    low, high : (n,) float array
+        The low and high integration bounds.
+    rng : Generator
+        Random number generator; ``rng.random`` supplies the per-batch random numbers.
+    lattice : 'cbc' or callable
+        The type of lattice rule to use to construct the integration points.
+    n_batches : int > 0, optional
+        The number of QMC batches to apply.
+
+    Returns
+    -------
+    prob : float
+        The estimated probability mass within the bounds.
+    est_error : float
+        3 times the standard error of the batch estimates.
+    """
+    cho, lo, hi = _permuted_cholesky(covar, low, high)
+    if not cho.flags.c_contiguous:
+        # qmvn_inner expects contiguous buffers
+        cho = cho.copy()
+
+    n = cho.shape[0]
+    q, n_qmc_samples = _cbc_lattice(n - 1, max(m // n_batches, 1))
+    rndm = rng.random(size=(n_batches, n))
+
+    prob, est_error, n_samples = _qmvn_inner(
+        q, rndm, int(n_qmc_samples), int(n_batches), cho, lo, hi
+    )
+    return prob, est_error, n_samples
+
+
+# Note: this function is not currently used or tested by any SciPy code. It is
+# included in this file to facilitate the resolution of gh-8367, gh-16142, and
+# possibly gh-14286, but must be reviewed and tested before use.
+def _mvn_qmc_integrand(covar, low, high, use_tent=False):
+    """Transform the multivariate normal integration into a QMC integrand over
+    a unit hypercube.
+
+    The dimensionality of the resulting hypercube integration domain is one
+    less than the dimensionality of the original integrand. Note that this
+    transformation subsumes the integration bounds in order to account for
+    infinite bounds. The QMC integration one does with the returned integrand
+    should be on the unit hypercube.
+
+    Parameters
+    ----------
+    covar : (n, n) float array
+        Possibly singular, positive semidefinite symmetric covariance matrix.
+    low, high : (n,) float array
+        The low and high integration bounds.
+    use_tent : bool, optional
+        If True, then use tent periodization. Only helpful for lattice rules.
+
+    Returns
+    -------
+    integrand : Callable[[NDArray], NDArray]
+        The QMC-integrable integrand. It takes an
+        ``(n_qmc_samples, ndim_integrand)`` array of QMC samples in the unit
+        hypercube and returns the ``(n_qmc_samples,)`` evaluations at these
+        QMC points.
+    ndim_integrand : int
+        The dimensionality of the integrand. Equal to ``n-1``.
+    """
+    cho, lo, hi = _permuted_cholesky(covar, low, high)
+    n = cho.shape[0]
+    ndim_integrand = n - 1
+    ct = cho[0, 0]
+    c = phi(lo[0] / ct)
+    d = phi(hi[0] / ct)
+    ci = c
+    dci = d - ci
+
+    def integrand(*zs):
+        ndim_qmc = len(zs)
+        n_qmc_samples = len(np.atleast_1d(zs[0]))
+        assert ndim_qmc == ndim_integrand
+        y = np.zeros((ndim_qmc, n_qmc_samples))
+        c = np.full(n_qmc_samples, ci)
+        dc = np.full(n_qmc_samples, dci)
+        pv = dc.copy()
+        for i in range(1, n):
+            if use_tent:
+                # Tent periodization transform.
+                x = abs(2 * zs[i-1] - 1)
+            else:
+                x = zs[i-1]
+            y[i - 1, :] = phinv(c + x * dc)
+            s = cho[i, :i] @ y[:i, :]
+            ct = cho[i, i]
+            c = phi((lo[i] - s) / ct)
+            d = phi((hi[i] - s) / ct)
+            dc = d - c
+            pv = pv * dc
+        return pv
+
+    return integrand, ndim_integrand
+
+
+def _qmvt(m, nu, covar, low, high, rng, lattice='cbc', n_batches=10):
+    """Multivariate t integration over box bounds.
+
+    Parameters
+    ----------
+    m : int > n_batches
+        The number of points to sample. This number will be divided into
+        `n_batches` batches that apply random offsets of the sampling lattice
+        for each batch in order to estimate the error.
+    nu : float >= 0
+        The shape parameter of the multivariate t distribution.
+    covar : (n, n) float array
+        Possibly singular, positive semidefinite symmetric covariance matrix.
+    low, high : (n,) float array
+        The low and high integration bounds.
+    rng : Generator
+        Random number generator; ``rng.random`` supplies the per-batch random numbers.
+    lattice : 'cbc' or callable
+        The type of lattice rule to use to construct the integration points.
+    n_batches : int > 0, optional
+        The number of QMC batches to apply.
+
+    Returns
+    -------
+    prob : float
+        The estimated probability mass within the bounds.
+    est_error : float
+        3 times the standard error of the batch estimates.
+    n_samples : int
+        The number of samples actually used.
+    """
+    sn = max(1.0, np.sqrt(nu))
+    low = np.asarray(low, dtype=np.float64)
+    high = np.asarray(high, dtype=np.float64)
+    cho, lo, hi = _permuted_cholesky(covar, low / sn, high / sn)
+    n = cho.shape[0]
+    q, n_qmc_samples = _cbc_lattice(n, max(m // n_batches, 1))
+    rndm = rng.random(size=(n_batches, n))
+    prob, est_error, n_samples = _qmvt_inner(
+        q, rndm, int(n_qmc_samples), int(n_batches), cho, lo, hi, float(nu)
+    )
+    return prob, est_error, n_samples
+
+
+def _permuted_cholesky(covar, low, high, tol=1e-10):
+    """Compute a scaled, permuted Cholesky factor, with integration bounds.
+
+    The scaling and permuting of the dimensions accomplishes part of the
+    transformation of the original integration problem into a more numerically
+    tractable form. The lower-triangular Cholesky factor will then be used in
+    the subsequent integration. The integration bounds will be scaled and
+    permuted as well.
+
+    Parameters
+    ----------
+    covar : (n, n) float array
+        Possibly singular, positive semidefinite symmetric covariance matrix.
+    low, high : (n,) float array
+        The low and high integration bounds.
+    tol : float, optional
+        The singularity tolerance.
+
+    Returns
+    -------
+    cho : (n, n) float array
+        Lower Cholesky factor, scaled and permuted.
+    new_low, new_high : (n,) float array
+        The scaled and permuted low and high integration bounds.
+    """
+    # Make copies for outputting.
+    cho = np.array(covar, dtype=np.float64)
+    new_lo = np.array(low, dtype=np.float64)
+    new_hi = np.array(high, dtype=np.float64)
+    n = cho.shape[0]
+    if cho.shape != (n, n):
+        raise ValueError("expected a square symmetric array")
+    if new_lo.shape != (n,) or new_hi.shape != (n,):
+        raise ValueError(
+            "expected integration boundaries the same dimensions "
+            "as the covariance matrix"
+        )
+    # Scale by the sqrt of the diagonal.
+    dc = np.sqrt(np.maximum(np.diag(cho), 0.0))
+    # But don't divide by 0.
+    dc[dc == 0.0] = 1.0
+    new_lo /= dc
+    new_hi /= dc
+    cho /= dc
+    cho /= dc[:, np.newaxis]
+
+    y = np.zeros(n)
+    sqtp = np.sqrt(2 * np.pi)
+    for k in range(n):
+        epk = (k + 1) * tol
+        im = k
+        ck = 0.0
+        dem = 1.0
+        s = 0.0
+        lo_m = 0.0
+        hi_m = 0.0
+        for i in range(k, n):
+            if cho[i, i] > tol:
+                ci = np.sqrt(cho[i, i])
+                if i > 0:
+                    s = cho[i, :k] @ y[:k]
+                lo_i = (new_lo[i] - s) / ci
+                hi_i = (new_hi[i] - s) / ci
+                de = phi(hi_i) - phi(lo_i)
+                if de <= dem:
+                    ck = ci
+                    dem = de
+                    lo_m = lo_i
+                    hi_m = hi_i
+                    im = i
+        if im > k:
+            # Swap im and k
+            cho[im, im] = cho[k, k]
+            _swap_slices(cho, np.s_[im, :k], np.s_[k, :k])
+            _swap_slices(cho, np.s_[im + 1:, im], np.s_[im + 1:, k])
+            _swap_slices(cho, np.s_[k + 1:im, k], np.s_[im, k + 1:im])
+            _swap_slices(new_lo, k, im)
+            _swap_slices(new_hi, k, im)
+        if ck > epk:
+            cho[k, k] = ck
+            cho[k, k + 1:] = 0.0
+            for i in range(k + 1, n):
+                cho[i, k] /= ck
+                cho[i, k + 1:i + 1] -= cho[i, k] * cho[k + 1:i + 1, k]
+            if abs(dem) > tol:
+                y[k] = ((np.exp(-lo_m * lo_m / 2) - np.exp(-hi_m * hi_m / 2)) /
+                        (sqtp * dem))
+            else:
+                y[k] = (lo_m + hi_m) / 2
+                if lo_m < -10:
+                    y[k] = hi_m
+                elif hi_m > 10:
+                    y[k] = lo_m
+            cho[k, :k + 1] /= ck
+            new_lo[k] /= ck
+            new_hi[k] /= ck
+        else:
+            cho[k:, k] = 0.0
+            y[k] = (new_lo[k] + new_hi[k]) / 2
+    return cho, new_lo, new_hi
+
+
+def _swap_slices(x, slc1, slc2):
+    t = x[slc1].copy()
+    x[slc1] = x[slc2].copy()
+    x[slc2] = t
+
+
+def _bvn(a, b, A):
+    # covariance matrix is written [[s1**2, rho*s1*s2], [rho*s1*s2, s2**2]]
+    # e.g. https://en.wikipedia.org/wiki/Multivariate_normal_distribution
+    # therefore, s12 = rho*s1*s2 -> rho = s12/(s1*s2)
+    s1 = math.sqrt(A[0, 0])
+    s2 = math.sqrt(A[1, 1])
+    s12 = A[0, 1]
+    r = s12 / (s1 * s2)
+    # the x and y coordinates seem to be normalized by the standard devs
+    xl, xu = a[0] / s1, b[0] / s1
+    yl, yu = a[1] / s2, b[1] / s2
+    p = _bvnu(xl, yl, r) - _bvnu(xu, yl, r) - _bvnu(xl, yu, r) + _bvnu(xu, yu, r)
+    p = max( 0., min( p, 1. ) )
+    return p
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_quantile.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_quantile.py
new file mode 100644
index 0000000000000000000000000000000000000000..591e35bd236d7bb07ccc8e16428122a2c0826d54
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_quantile.py
@@ -0,0 +1,522 @@
+import math
+import numpy as np
+from scipy.special import betainc
+from scipy._lib._array_api import (
+    xp_capabilities,
+    xp_ravel,
+    array_namespace,
+    xp_promote,
+    xp_device,
+    _length_nonmasked,
+    is_torch,
+)
+import scipy._lib.array_api_extra as xpx
+from scipy.stats._axis_nan_policy import _broadcast_arrays, _contains_nan
+
+
+def _quantile_iv(x, p, method, axis, nan_policy, keepdims, weights):
+    xp = array_namespace(x, p, weights)
+
+    if not xp.isdtype(xp.asarray(x).dtype, ('integral', 'real floating')):
+        raise ValueError("`x` must have real dtype.")
+
+    if not xp.isdtype(xp.asarray(p).dtype, 'real floating'):
+        raise ValueError("`p` must have real floating dtype.")
+
+    if not (weights is None
+            or xp.isdtype(xp.asarray(weights).dtype, ('integral', 'real floating'))):
+        raise ValueError("`weights` must have real dtype.")
+
+    x, p, weights = xp_promote(x, p, weights, force_floating=True, xp=xp)
+    p = xp.asarray(p, device=xp_device(x))
+    dtype = x.dtype
+
+    axis_none = axis is None
+    ndim = max(x.ndim, p.ndim)
+    if axis_none:
+        x = xp_ravel(x)
+        p = xp_ravel(p)
+        axis = 0
+    elif np.iterable(axis) or int(axis) != axis:
+        message = "`axis` must be an integer or None."
+        raise ValueError(message)
+    elif (axis >= ndim) or (axis < -ndim):
+        message = "`axis` is not compatible with the shapes of the inputs."
+        raise ValueError(message)
+    axis = int(axis)
+
+    methods = {'inverted_cdf', 'averaged_inverted_cdf', 'closest_observation',
+               'hazen', 'interpolated_inverted_cdf', 'linear',
+               'median_unbiased', 'normal_unbiased', 'weibull',
+               'harrell-davis', '_lower', '_midpoint', '_higher', '_nearest',
+               'round_outward', 'round_inward', 'round_nearest'}
+    if method not in methods:
+        message = f"`method` must be one of {methods}"
+        raise ValueError(message)
+
+    no_weights = {'_lower', '_midpoint', '_higher', '_nearest', 'harrell-davis',
+                  'round_nearest', 'round_inward', 'round_outward'}
+    if weights is not None and method in no_weights:
+        message = f"`method='{method}'` does not support `weights`."
+        raise ValueError(message)
+
+    contains_nans = _contains_nan(x, nan_policy, xp_omit_okay=True, xp=xp)
+
+    if keepdims not in {None, True, False}:
+        message = "If specified, `keepdims` must be True or False."
+        raise ValueError(message)
+
+    # If data has length zero along `axis`, the result will be an array of NaNs just
+    # as if the data had length 1 along axis and were filled with NaNs. This is treated
+    # naturally below whether `nan_policy` is `'propagate'` or `'omit'`.
+    if x.shape[axis] == 0:
+        shape = list(x.shape)
+        shape[axis] = 1
+        x = xp.full(shape, xp.nan, dtype=dtype, device=xp_device(x))
+
+    if weights is None:
+        y = xp.sort(x, axis=axis, stable=False)
+        y, p = _broadcast_arrays((y, p), axis=axis)
+        n_zero_weight = None
+    else:
+        x, weights = xp.broadcast_arrays(x, weights)
+        i_zero_weight = (weights == 0)
+        n_zero_weight = xp.count_nonzero(i_zero_weight, axis=axis, keepdims=True)
+        x = xpx.at(x)[i_zero_weight].set(xp.inf, copy=True)
+        i_y = xp.argsort(x, axis=axis, stable=False)
+        y = xp.take_along_axis(x, i_y, axis=axis)
+        weights = xp.take_along_axis(weights, i_y, axis=axis)
+        y, p, weights, i_y, n_zero_weight = _broadcast_arrays(
+            (y, p, weights, i_y, n_zero_weight), axis=axis)
+
+    if (keepdims is False) and (p.shape[axis] != 1):
+        message = "`keepdims` may be False only if the length of `p` along `axis` is 1."
+        raise ValueError(message)
+    keepdims = (p.shape[axis] != 1) if keepdims is None else keepdims
+
+    y = xp.moveaxis(y, axis, -1)
+    p = xp.moveaxis(p, axis, -1)
+    weights = weights if weights is None else xp.moveaxis(weights, axis, -1)
+    n_zero_weight = (n_zero_weight if n_zero_weight is None
+                     else xp.moveaxis(n_zero_weight, axis, -1))
+
+    n = _length_nonmasked(y, -1, xp=xp, keepdims=True)
+    n = n if n_zero_weight is None else n - n_zero_weight
+
+    if contains_nans:
+        nans = xp.isnan(y)
+
+        # Note that if length along `axis` were 0 to begin with,
+        # it is now length 1 and filled with NaNs.
+        if nan_policy == 'propagate':
+            nan_out = xp.any(nans, axis=-1)
+        else:  # 'omit'
+            n_int = n - xp.count_nonzero(nans, axis=-1, keepdims=True)
+            n = xp.astype(n_int, dtype)
+            # NaNs are produced only if slice is empty after removing NaNs
+            nan_out = xp.any(n == 0, axis=-1)
+            n = xpx.at(n, nan_out).set(y.shape[-1])  # avoids pytorch/pytorch#146211
+
+        if xp.any(nan_out):
+            y = xp.asarray(y, copy=True)  # ensure writable
+            y = xpx.at(y, nan_out).set(xp.nan)
+        elif xp.any(nans) and method == 'harrell-davis':
+            y = xp.asarray(y, copy=True)  # ensure writable
+            y = xpx.at(y, nans).set(0)  # any non-nan will prevent NaN from propagating
+
+    n = xp.asarray(n, dtype=dtype, device=xp_device(y))
+
+    p_mask = (p > 1) | (p < 0) | xp.isnan(p)
+    if xp.any(p_mask):
+        p = xp.asarray(p, copy=True)
+        p = xpx.at(p, p_mask).set(0.5)  # these get NaN-ed out at the end
+
+    return (y, p, method, axis, nan_policy, keepdims,
+            n, axis_none, ndim, p_mask, weights, xp)
+
+
+@xp_capabilities(skip_backends=[("dask.array", "No take_along_axis yet.")],
+                 jax_jit=False)
+def quantile(x, p, *, method='linear', axis=0, nan_policy='propagate', keepdims=None,
+             weights=None):
+    """
+    Compute the p-th quantile of the data along the specified axis.
+
+    Parameters
+    ----------
+    x : array_like of real numbers
+        Data array.
+    p : array_like of float
+        Probability or sequence of probabilities of the quantiles to compute.
+        Values must be between 0 and 1 (inclusive).
+        While `numpy.quantile` can only compute quantiles according to the Cartesian
+        product of the first two arguments, this function enables calculation of
+        quantiles at different probabilities for each axis slice by following
+        broadcasting rules like those of `scipy.stats` reducing functions.
+        See `axis`, `keepdims`, and the examples.
+    method : str, default: 'linear'
+        The method to use for estimating the quantile.
+        The available options, numbered as they appear in [1]_, are:
+
+        1. 'inverted_cdf'
+        2. 'averaged_inverted_cdf'
+        3. 'closest_observation'
+        4. 'interpolated_inverted_cdf'
+        5. 'hazen'
+        6. 'weibull'
+        7. 'linear'  (default)
+        8. 'median_unbiased'
+        9. 'normal_unbiased'
+
+        'harrell-davis' is also available to compute the quantile estimate
+        according to [2]_.
+
+        'round_outward', 'round_inward', and 'round_nearest' are available for use
+        in trimming and winsorizing data.
+
+        See Notes for details.
+    axis : int or None, default: 0
+        Axis along which the quantiles are computed.
+        ``None`` ravels both `x` and `p` before performing the calculation,
+        without checking whether the original shapes were compatible.
+        As in other `scipy.stats` functions, a positive integer `axis` is resolved
+        after prepending 1s to the shape of `x` or `p` as needed until the two arrays
+        have the same dimensionality. When providing `x` and `p` with different
+        dimensionality, consider using negative `axis` integers for clarity.
+    nan_policy : str, default: 'propagate'
+        Defines how to handle NaNs in the input data `x`.
+
+        - ``propagate``: if a NaN is present in the axis slice (e.g. row) along
+          which the statistic is computed, the corresponding slice of the output
+          will contain NaN(s).
+        - ``omit``: NaNs will be omitted when performing the calculation.
+          If insufficient data remains in the axis slice along which the
+          statistic is computed, the corresponding slice of the output will
+          contain NaN(s).
+        - ``raise``: if a NaN is present, a ``ValueError`` will be raised.
+
+        NaN (or out-of-range) values in `p` do not raise; those outputs are NaN.
+    keepdims : bool, optional
+        Consider the case in which `x` is 1-D and `p` is a scalar: the quantile
+        is a reducing statistic, and the default behavior is to return a scalar.
+        If `keepdims` is set to True, the axis will not be reduced away, and the
+        result will be a 1-D array with one element.
+
+        The general case is more subtle, since multiple quantiles may be
+        requested for each axis-slice of `x`. For instance, if both `x` and `p`
+        are 1-D and ``p.size > 1``, no axis can be reduced away; there must be an
+        axis to contain the number of quantiles given by ``p.size``. Therefore:
+
+        - By default, the axis will be reduced away if possible (i.e. if there is
+          exactly one element of `p` per axis-slice of `x`).
+        - If `keepdims` is set to True, the axis will not be reduced away.
+        - If `keepdims` is set to False, the axis will be reduced away
+          if possible, and an error will be raised otherwise.
+    weights : array_like of finite, non-negative real numbers
+        Frequency weights; e.g., for counting number weights,
+        ``quantile(x, p, weights=weights)`` is equivalent to
+        ``quantile(np.repeat(x, weights), p)``. Values other than finite counting
+        numbers are accepted, but may not have valid statistical interpretations.
+        Not compatible with ``method='harrell-davis'`` or those that begin with
+        ``'round_'``.
+
+    Returns
+    -------
+    quantile : scalar or ndarray
+        The resulting quantile(s). The dtype is the result dtype of `x` and `p`.
+
+    See Also
+    --------
+    numpy.quantile
+    :ref:`outliers`
+
+    Notes
+    -----
+    Given a sample `x` from an underlying distribution, `quantile` provides a
+    nonparametric estimate of the inverse cumulative distribution function.
+
+    By default, this is done by interpolating between adjacent elements in
+    ``y``, a sorted copy of `x`::
+
+        (1-g)*y[j] + g*y[j+1]
+
+    where the index ``j`` and coefficient ``g`` are the integral and
+    fractional components of ``p * (n-1)``, and ``n`` is the number of
+    elements in the sample.
+
+    This is a special case of Equation 1 of H&F [1]_. More generally,
+
+    - ``j = (p*n + m - 1) // 1``, and
+    - ``g = (p*n + m - 1) % 1``,
+
+    where ``m`` may be defined according to several different conventions.
+    The preferred convention may be selected using the ``method`` parameter:
+
+    =============================== =============== ===============
+    ``method``                      number in H&F   ``m``
+    =============================== =============== ===============
+    ``interpolated_inverted_cdf``   4               ``0``
+    ``hazen``                       5               ``1/2``
+    ``weibull``                     6               ``p``
+    ``linear`` (default)            7               ``1 - p``
+    ``median_unbiased``             8               ``p/3 + 1/3``
+    ``normal_unbiased``             9               ``p/4 + 3/8``
+    =============================== =============== ===============
+
+    Note that indices ``j`` and ``j + 1`` are clipped to the range ``0`` to
+    ``n - 1`` when the results of the formula would be outside the allowed
+    range of non-negative indices. When ``j`` is clipped to zero, ``g`` is
+    set to zero as well. The ``-1`` in the formulas for ``j`` and ``g``
+    accounts for Python's 0-based indexing.
+
+    The table above includes only the estimators from [1]_ that are continuous
+    functions of probability `p` (estimators 4-9). SciPy also provides the
+    three discontinuous estimators from [1]_ (estimators 1-3), where ``j`` is
+    defined as above, ``m`` is defined as follows, and ``g`` is ``0`` when
+    ``index = p*n + m - 1`` is less than ``0`` and otherwise is defined below.
+
+    1. ``inverted_cdf``: ``m = 0`` and ``g = int(index - j > 0)``
+    2. ``averaged_inverted_cdf``: ``m = 0`` and
+       ``g = (1 + int(index - j > 0)) / 2``
+    3. ``closest_observation``: ``m = -1/2`` and
+       ``g = 1 - int((index == j) & (j%2 == 1))``
+
+    Note that for methods ``inverted_cdf`` and ``averaged_inverted_cdf``, only the
+    relative proportions of tied observations (and relative weights) affect the
+    results; for all other methods, the total number of observations (and absolute
+    weights) matter.
+
+    A different strategy for computing quantiles from [2]_, ``method='harrell-davis'``,
+    uses a weighted combination of all elements. The weights are computed as:
+
+    .. math::
+
+        w_{n, i} = I_{i/n}(a, b) - I_{(i - 1)/n}(a, b)
+
+    where :math:`n` is the number of elements in the sample,
+    :math:`i` are the indices :math:`1, 2, ..., n-1, n` of the sorted elements,
+    :math:`a = p (n + 1)`, :math:`b = (1 - p)(n + 1)`,
+    :math:`p` is the probability of the quantile, and
+    :math:`I` is the regularized, lower incomplete beta function
+    (`scipy.special.betainc`).
+
+    ``method='round_nearest'`` is equivalent to indexing ``y[j]``, where::
+
+        j = int(np.round(p*n) if p < 0.5 else np.round(n*p - 1))
+
+    This is useful when winsorizing data: replacing ``p*n`` of the most extreme
+    observations with the next most extreme observation. ``method='round_outward'``
+    adjusts the direction of rounding to winsorize fewer elements::
+
+        j = int(np.floor(p*n) if p < 0.5 else np.ceil(n*p - 1))
+
+    and ``method='round_inward'`` rounds to winsorize more elements::
+
+        j = int(np.ceil(p*n) if p < 0.5 else np.floor(n*p - 1))
+
+    These methods are also useful for trimming data: removing ``p*n`` of the most
+    extreme observations. See :ref:`outliers` for example applications.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> x = np.asarray([[10, 8, 7, 5, 4],
+    ...                 [0, 1, 2, 3, 5]])
+
+    Take the median of each row.
+
+    >>> stats.quantile(x, 0.5, axis=-1)
+    array([7.,  2.])
+
+    Take a different quantile for each row.
+
+    >>> stats.quantile(x, [[0.25], [0.75]], axis=-1, keepdims=True)
+    array([[5.],
+           [3.]])
+
+    Take multiple quantiles for each row.
+
+    >>> stats.quantile(x, [0.25, 0.75], axis=-1)
+    array([[5., 8.],
+           [1., 3.]])
+
+    Take different quantiles for each row.
+
+    >>> p = np.asarray([[0.25, 0.75],
+    ...                 [0.5, 1.0]])
+    >>> stats.quantile(x, p, axis=-1)
+    array([[5., 8.],
+           [2., 5.]])
+
+    Take different quantiles for each column.
+
+    >>> stats.quantile(x.T, p.T, axis=0)
+    array([[5., 2.],
+           [8., 5.]])
+
+    References
+    ----------
+    .. [1] R. J. Hyndman and Y. Fan,
+       "Sample quantiles in statistical packages,"
+       The American Statistician, 50(4), pp. 361-365, 1996
+    .. [2] Harrell, Frank E., and C. E. Davis.
+       "A new distribution-free quantile estimator."
+       Biometrika 69.3 (1982): 635-640.
+
+    """
+    # Input validation / standardization
+
+    temp = _quantile_iv(x, p, method, axis, nan_policy, keepdims, weights)
+    (y, p, method, axis, nan_policy, keepdims,
+     n, axis_none, ndim, p_mask, weights, xp) = temp
+
+    if method in {'inverted_cdf', 'averaged_inverted_cdf', 'closest_observation',
+                  'hazen', 'interpolated_inverted_cdf', 'linear',
+                  'median_unbiased', 'normal_unbiased', 'weibull'}:
+        res = _quantile_hf(y, p, n, method, weights, xp)
+    elif method in {'harrell-davis'}:
+        res = _quantile_hd(y, p, n, xp)
+    elif method in {'_lower', '_midpoint', '_higher', '_nearest'}:
+        res = _quantile_bc(y, p, n, method, xp)
+    else:  # method.startswith('round'):
+        res = _quantile_winsor(y, p, n, method, xp)
+
+    res = xpx.at(res, p_mask).set(xp.nan)
+
+    # Reshape per axis/keepdims
+    if axis_none and keepdims:
+        shape = (1,)*(ndim - 1) + res.shape
+        res = xp.reshape(res, shape)
+        axis = -1
+
+    res = xp.moveaxis(res, -1, axis)
+
+    if not keepdims:
+        res = xp.squeeze(res, axis=axis)
+
+    return res[()] if res.ndim == 0 else res
+
+
+def _quantile_hf(y, p, n, method, weights, xp):
+    ms = dict(inverted_cdf=0, averaged_inverted_cdf=0, closest_observation=-0.5,
+              interpolated_inverted_cdf=0, hazen=0.5, weibull=p, linear=1 - p,
+              median_unbiased=p/3 + 1/3, normal_unbiased=p/4 + 3/8)
+    m = ms[method]
+
+    if weights is None:
+        jg = p * n + m
+        jp1 = jg // 1
+        j = jp1 - 1
+    else:
+        cumulative_weights = xp.cumulative_sum(weights, axis=-1)
+        n_int = xp.asarray(n, dtype=xp.int64)
+        n_int = xp.broadcast_to(n_int, cumulative_weights.shape[:-1] + (1,))
+        total_weight = xp.take_along_axis(cumulative_weights, n_int-1, axis=-1)
+        jg = p * total_weight + m
+        jp1 = _xp_searchsorted(cumulative_weights, jg, side='right')
+        j = _xp_searchsorted(cumulative_weights, jg-1, side='right')
+        j, jp1 = xp.astype(j, y.dtype), xp.astype(jp1, y.dtype)
+
+    g = jg % 1
+    if method == 'inverted_cdf':
+        g = xp.astype((g > 0), jg.dtype)
+    elif method == 'averaged_inverted_cdf':
+        g = (1 + xp.astype((g > 0), jg.dtype)) / 2
+    elif method == 'closest_observation':
+        g = (1 - xp.astype((g == 0) & (j % 2 == 1), jg.dtype))
+    if method in {'inverted_cdf', 'averaged_inverted_cdf', 'closest_observation'}:
+        g = xp.asarray(g)
+        g = xpx.at(g, jg < 0).set(0)
+
+    g = xpx.at(g)[j < 0].set(0)
+    j = xp.clip(j, 0., n - 1)
+    jp1 = xp.clip(jp1, 0., n - 1)
+
+    return ((1 - g) * xp.take_along_axis(y, xp.astype(j, xp.int64), axis=-1)
+            + g * xp.take_along_axis(y, xp.astype(jp1, xp.int64), axis=-1))
+
+
+def _quantile_hd(y, p, n, xp):
+    # RE axis handling: We need to perform a reducing operation over rows of `y` for
+    # each element in the corresponding row of `p` (a la Cartesian product). Strategy:
+    # move rows of `p` to an axis at the front that is orthogonal to all the rest,
+    # perform the reducing operation over the last axis, then move the front axis back
+    # to the end.
+    p = xp.moveaxis(p, -1, 0)[..., xp.newaxis]
+    a = p * (n + 1)
+    b = (1 - p) * (n + 1)
+    i = xp.arange(y.shape[-1] + 1, dtype=y.dtype, device=xp_device(y))
+    w = betainc(a, b, i / n)
+    w = w[..., 1:] - w[..., :-1]
+    w = xpx.at(w, xp.isnan(w)).set(0)
+    res = xp.vecdot(w, y, axis=-1)
+    return xp.moveaxis(res, 0, -1)
+
+
+def _quantile_winsor(y, p, n, method, xp):
+    ops = dict(round_outward=(xp.floor, xp.ceil),
+               round_inward=(xp.ceil, xp.floor),
+               round_nearest=(xp.round, xp.round))
+    op_left, op_right = ops[method]
+    j = xp.where(p < 0.5, op_left(p*n), op_right(n*p - 1))
+    return xp.take_along_axis(y, xp.astype(j, xp.int64), axis=-1)
+
+
+def _quantile_bc(y, p, n, method, xp):
+    # Methods retained for backward compatibility. NumPy documentation is not
+    # quite right about what these methods do: if `p * (n - 1)` is integral,
+    # that is used as the index. See numpy/numpy#28910.
+    ij = p * (n - 1)
+    if method == '_midpoint':
+        return (xp.take_along_axis(y, xp.astype(xp.floor(ij), xp.int64), axis=-1)
+                + xp.take_along_axis(y, xp.astype(xp.ceil(ij), xp.int64), axis=-1)) / 2
+    elif method == '_lower':
+        k = xp.floor(ij)
+    elif method == '_higher':
+        k = xp.ceil(ij)
+    elif method == '_nearest':
+        k = xp.round(ij)
+    return xp.take_along_axis(y, xp.astype(k, xp.int64), axis=-1)
+
+
+@xp_capabilities(skip_backends=[("dask.array", "No take_along_axis yet.")])
+def _xp_searchsorted(x, y, *, side='left', xp=None):
+    # Vectorized xp.searchsorted. Search is performed along last axis. The shape of the
+    # output is that of `y`, broadcasting the batch dimensions with those of `x` if
+    # necessary.
+    xp = array_namespace(x, y) if xp is None else xp
+    xp_default_int = xp.asarray(1).dtype
+    y_0d = xp.asarray(y).ndim == 0
+    x, y = _broadcast_arrays((x, y), axis=-1, xp=xp)
+    x_1d = x.ndim <= 1
+
+    if x_1d or is_torch(xp):
+        y = xp.reshape(y, ()) if (y_0d and x_1d) else y
+        out = xp.searchsorted(x, y, side=side)
+        out = xp.astype(out, xp_default_int, copy=False)
+        return out
+
+    a = xp.full(y.shape, 0, device=xp_device(x))
+
+    if x.shape[-1] == 0:
+        return a
+
+    n = xp.count_nonzero(~xp.isnan(x), axis=-1, keepdims=True)
+    b = xp.broadcast_to(n, y.shape)
+
+    compare = xp.less_equal if side == 'left' else xp.less
+
+    # while xp.any(b - a > 1):
+    # refactored to for loop with ~log2(n) iterations for JAX JIT
+    for i in range(int(math.log2(x.shape[-1])) + 1):
+        c = (a + b) // 2
+        x0 = xp.take_along_axis(x, c, axis=-1)
+        j = compare(y, x0)
+        b = xp.where(j, c, b)
+        a = xp.where(j, a, c)
+
+    out = xp.where(compare(y, xp.min(x, axis=-1, keepdims=True)), 0, b)
+    out = xp.where(xp.isnan(y), x.shape[-1], out) if side == 'right' else out
+    out = xp.astype(out, xp_default_int, copy=False)
+    return out
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_relative_risk.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_relative_risk.py
new file mode 100644
index 0000000000000000000000000000000000000000..51525fd28adb37c72b12106450e4178c786091b2
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_relative_risk.py
@@ -0,0 +1,263 @@
+import operator
+from dataclasses import dataclass
+import numpy as np
+from scipy.special import ndtri
+from ._common import ConfidenceInterval
+
+
+def _validate_int(n, bound, name):  # -> `n` as an int; requires integer n >= bound
+    complaint = f'{name} must be an integer not less than {bound}, but got {n!r}'
+    try:
+        checked = operator.index(n)
+    except TypeError:
+        raise TypeError(complaint) from None
+    if checked < bound:
+        raise ValueError(complaint)
+    return checked
+
+
+@dataclass
+class RelativeRiskResult:
+    """
+    Result of `scipy.stats.contingency.relative_risk`.
+
+    Attributes
+    ----------
+    relative_risk : float
+        This is::
+
+            (exposed_cases/exposed_total) / (control_cases/control_total)
+
+    exposed_cases : int
+        The count of "cases" (i.e. occurrences of disease or another event
+        of interest) in the sample of "exposed" individuals.
+    exposed_total : int
+        The size of the sample of "exposed" individuals.
+    control_cases : int
+        The count of "cases" in the sample of "control" (non-exposed)
+        individuals.
+    control_total : int
+        The size of the sample of "control" individuals.
+
+    Methods
+    -------
+    confidence_interval :
+        Compute the confidence interval for the relative risk estimate.
+    """
+
+    relative_risk: float
+    exposed_cases: int
+    exposed_total: int
+    control_cases: int
+    control_total: int
+
+    def confidence_interval(self, confidence_level=0.95):
+        """
+        Compute the confidence interval for the relative risk.
+
+        The confidence interval is computed using the Katz method
+        (i.e. "Method C" of [1]_; see also [2]_, section 3.1.2).
+
+        Parameters
+        ----------
+        confidence_level : float, optional
+            The confidence level to use for the confidence interval. It
+            must lie in the interval [0, 1], otherwise a ``ValueError`` is
+            raised. Default is 0.95.
+
+        Returns
+        -------
+        ci : ConfidenceInterval instance
+            An object whose attributes ``low`` and ``high`` hold the
+            bounds of the confidence interval.
+
+        References
+        ----------
+        .. [1] D. Katz, J. Baptista, S. P. Azen and M. C. Pike, "Obtaining
+               confidence intervals for the risk ratio in cohort studies",
+               Biometrics, 34, 469-474 (1978).
+        .. [2] Hardeo Sahai and Anwer Khurshid, Statistics in Epidemiology,
+               CRC Press LLC, Boca Raton, FL, USA (1996).
+
+        Examples
+        --------
+        >>> from scipy.stats.contingency import relative_risk
+        >>> result = relative_risk(exposed_cases=10, exposed_total=75,
+        ...                        control_cases=12, control_total=225)
+        >>> result.relative_risk
+        2.5
+        >>> result.confidence_interval()
+        ConfidenceInterval(low=1.1261564003469628, high=5.549850800541033)
+        """
+        in_unit_interval = 0 <= confidence_level <= 1
+        if not in_unit_interval:
+            raise ValueError('confidence_level must be in the interval [0, 1].')
+
+        # Tables with no cases in one or both groups are degenerate; the bounds
+        # returned for them follow the R function riskratio (epitools library).
+        no_exposed_cases = self.exposed_cases == 0
+        no_control_cases = self.control_cases == 0
+        if no_exposed_cases and no_control_cases:
+            # The relative risk is nan.
+            return ConfidenceInterval(low=np.nan, high=np.nan)
+        if no_exposed_cases:
+            # The relative risk is 0.
+            return ConfidenceInterval(low=0.0, high=np.nan)
+        if no_control_cases:
+            # The relative risk is inf.
+            return ConfidenceInterval(low=np.nan, high=np.inf)
+
+        # Standard normal quantile for a two-sided interval at this level.
+        tail_mass = 1 - confidence_level
+        quantile = ndtri(1 - tail_mass/2)
+
+        # Katz: the variance of log(rr) is estimated as
+        #   1/exposed_cases - 1/exposed_total + 1/control_cases - 1/control_total
+        # and the bounds are rr*exp(-/+ quantile*sqrt(variance)).
+        log_rr_variance = (1/self.exposed_cases - 1/self.exposed_total +
+                           1/self.control_cases - 1/self.control_total)
+        half_width = quantile*np.sqrt(log_rr_variance)
+        point_estimate = self.relative_risk
+        return ConfidenceInterval(low=point_estimate*np.exp(-half_width),
+                                  high=point_estimate*np.exp(half_width))
+
+
+def relative_risk(exposed_cases, exposed_total, control_cases, control_total):
+    """
+    Compute the relative risk (also known as the risk ratio).
+
+    This function computes the relative risk associated with a 2x2
+    contingency table ([1]_, section 2.2.3; [2]_, section 3.1.2). Rather
+    than a table, it takes the individual numbers used in the calculation
+    as separate parameters, which avoids any ambiguity about which row or
+    column of the table holds the "exposed" cases and which holds the
+    "control" cases.  Unlike, say, the odds ratio, the relative risk is not
+    invariant under an interchange of the rows or columns.
+
+    Parameters
+    ----------
+    exposed_cases : nonnegative int
+        The number of "cases" (i.e. occurrence of disease or other event
+        of interest) among the sample of "exposed" individuals.
+    exposed_total : positive int
+        The total number of "exposed" individuals in the sample.
+    control_cases : nonnegative int
+        The number of "cases" among the sample of "control" or non-exposed
+        individuals.
+    control_total : positive int
+        The total number of "control" individuals in the sample.
+
+    Returns
+    -------
+    result : instance of `~scipy.stats._result_classes.RelativeRiskResult`
+        The object has the float attribute ``relative_risk``, which is::
+
+            rr = (exposed_cases/exposed_total) / (control_cases/control_total)
+
+        When `exposed_cases` is 0, ``rr`` is 0.0; when `control_cases` is 0,
+        ``rr`` is ``inf``; when both are 0, ``rr`` is ``nan``.  The object
+        also has the method ``confidence_interval`` to compute the confidence
+        interval of the relative risk for a given confidence level.
+
+    See Also
+    --------
+    odds_ratio
+
+    Notes
+    -----
+    The R package epitools has the function `riskratio`, which accepts
+    a table with the following layout::
+
+                        disease=0   disease=1
+        exposed=0 (ref)    n00         n01
+        exposed=1          n10         n11
+
+    With a 2x2 table in the above format, the estimate of the CI is
+    computed by `riskratio` when the argument method="wald" is given,
+    or with the function `riskratio.wald`.
+
+    For example, in a test of the incidence of lung cancer among a
+    sample of smokers and nonsmokers, the "exposed" category would
+    correspond to "is a smoker" and the "disease" category would
+    correspond to "has or had lung cancer".
+
+    To pass the same data to ``relative_risk``, use::
+
+        relative_risk(n11, n10 + n11, n01, n00 + n01)
+
+    .. versionadded:: 1.7.0
+
+    References
+    ----------
+    .. [1] Alan Agresti, An Introduction to Categorical Data Analysis
+           (second edition), Wiley, Hoboken, NJ, USA (2007).
+    .. [2] Hardeo Sahai and Anwer Khurshid, Statistics in Epidemiology,
+           CRC Press LLC, Boca Raton, FL, USA (1996).
+
+    Examples
+    --------
+    >>> from scipy.stats.contingency import relative_risk
+
+    This example is from Example 3.1 of [2]_.  The results of a heart
+    disease study are summarized in the following table::
+
+                 High CAT   Low CAT    Total
+                 --------   -------    -----
+        CHD         27         44        71
+        No CHD      95        443       538
+
+        Total      122        487       609
+
+    CHD is coronary heart disease, and CAT refers to the level of
+    circulating catecholamine.  CAT is the "exposure" variable, and
+    high CAT is the "exposed" category. So the data from the table
+    to be passed to ``relative_risk`` is::
+
+        exposed_cases = 27
+        exposed_total = 122
+        control_cases = 44
+        control_total = 487
+
+    >>> result = relative_risk(27, 122, 44, 487)
+    >>> result.relative_risk
+    2.4495156482861398
+
+    Find the confidence interval for the relative risk.
+
+    >>> result.confidence_interval(confidence_level=0.95)
+    ConfidenceInterval(low=1.5836990926700116, high=3.7886786315466354)
+
+    The interval does not contain 1, so the data supports the statement
+    that high CAT is associated with greater risk of CHD.
+    """
+    # The estimate itself is simple arithmetic.  The nontrivial work lives in
+    # the `confidence_interval` method of the RelativeRiskResult class.
+    n_exposed_cases = _validate_int(exposed_cases, 0, "exposed_cases")
+    n_exposed_total = _validate_int(exposed_total, 1, "exposed_total")
+    n_control_cases = _validate_int(control_cases, 0, "control_cases")
+    n_control_total = _validate_int(control_total, 1, "control_total")
+
+    if n_exposed_cases > n_exposed_total:
+        raise ValueError('exposed_cases must not exceed exposed_total.')
+    if n_control_cases > n_control_total:
+        raise ValueError('control_cases must not exceed control_total.')
+
+    # A zero case count is settled by the branches, so nothing divides by zero.
+    if n_exposed_cases and n_control_cases:
+        exposed_risk = n_exposed_cases / n_exposed_total
+        control_risk = n_control_cases / n_control_total
+        estimate = exposed_risk / control_risk
+    elif n_control_cases:
+        # No exposed cases only: 0/nonzero.
+        estimate = 0.0
+    elif n_exposed_cases:
+        # No control cases only: nonzero/0.
+        estimate = np.inf
+    else:
+        # No cases in either group: 0/0.
+        estimate = np.nan
+    return RelativeRiskResult(relative_risk=estimate,
+                              exposed_cases=n_exposed_cases,
+                              exposed_total=n_exposed_total,
+                              control_cases=n_control_cases,
+                              control_total=n_control_total)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_resampling.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_resampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..af3a072202e7acc8fda5c7a36d85c7a34d9f683a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_resampling.py
@@ -0,0 +1,2417 @@
+import warnings
+import numpy as np
+from itertools import combinations, permutations, product, accumulate
+from collections.abc import Sequence
+from dataclasses import dataclass, field
+import inspect
+import math
+
+from scipy._lib._util import (check_random_state, _rename_parameter, rng_integers,
+                              _transition_to_rng)
+from scipy._lib._array_api import (
+    array_namespace,
+    is_numpy,
+    is_array_api_strict,
+    xp_capabilities,
+    xp_result_type,
+    xp_size,
+    xp_device,
+    xp_swapaxes
+)
+from scipy._lib import array_api_extra as xpx
+from scipy.special import ndtr, ndtri
+from scipy import stats
+
+from ._common import ConfidenceInterval
+from ._axis_nan_policy import _broadcast_concatenate, _broadcast_arrays
+from ._warnings_errors import DegenerateDataWarning
+
+__all__ = ['bootstrap', 'monte_carlo_test', 'permutation_test']
+
+
+def _vectorize_statistic(statistic):
+    """Vectorize an n-sample statistic so that it accepts an `axis` keyword."""
+    # Strategy: join the samples into one array and let np.apply_along_axis
+    # walk it; a little cleaner than np.nditer, at the cost of copying data.
+    def stat_nd(*data, axis=0):
+        # positions at which each 1-D slice is cut back into separate samples
+        cut_points = np.cumsum([sample.shape[axis] for sample in data])[:-1]
+
+        def stat_1d(joined_slice):
+            pieces = np.split(joined_slice, cut_points)
+            return statistic(*pieces)
+
+        joined = _broadcast_concatenate(data, axis)
+        # With the working axis in position 0, new dimensions in the output of
+        # `statistic` are _prepended_. ("This axis is removed, and replaced
+        # with new dimensions...")
+        joined = np.moveaxis(joined, axis, 0)
+        result = np.apply_along_axis(stat_1d, 0, joined)
+        return result[()]
+    return stat_nd
+
+
+def _jackknife_resample(sample, batch=None, *, xp):
+    """Yield leave-one-out resamples of `sample` in batches (one-sample only)."""
+    n_obs = sample.shape[-1]
+    step = batch or n_obs
+    positions = np.arange(n_obs)
+
+    for start in range(0, n_obs, step):
+        # observations start:start+rows are the ones removed in this batch
+        rows = min(step, n_obs - start)
+
+        # boolean mask: row r keeps every observation except number start + r
+        keep = np.ones((rows, n_obs), dtype=bool)
+        np.fill_diagonal(keep[:, start:start+rows], False)
+        index = np.broadcast_to(positions, (rows, n_obs))[keep]
+        index = index.reshape((rows, n_obs-1))
+
+        index = xp.asarray(index, device=xp_device(sample))
+        yield _get_from_last_axis(sample, index, xp=xp)
+
+
+def _bootstrap_resample(sample, n_resamples=None, rng=None, *, xp):
+    """Bootstrap resample `sample` along its last axis."""
+    n_obs = sample.shape[-1]
+    shape = (n_resamples, n_obs)
+
+    # every row holds the indices of one random resample of the observations
+    draws = rng_integers(rng, 0, n_obs, shape)
+    draws = xp.asarray(draws, device=xp_device(sample))
+    return _get_from_last_axis(sample, draws, xp=xp)
+
+
+def _get_from_last_axis(sample, i, xp):
+    # Pick entries from the last axis of `sample` using index array `i`.
+    if is_array_api_strict(xp):
+        # Emulates `sample[..., i]` as used in `bootstrap`. Assumes i.ndim <= 2.
+        source = xp.expand_dims(sample, axis=-2) if i.ndim == 2 else sample
+        source, index = _broadcast_arrays((source, i), axis=-1, xp=xp)
+        return xp.take_along_axis(source, index, axis=-1)
+
+    return sample[..., i]
+
+
+def _percentile_of_score(a, score, axis, xp):
+    """Vectorized, pared-down take on `scipy.stats.percentileofscore`.
+
+    Follows the ``kind='mean'`` logic of `stats.percentileofscore`, except
+    that the result is a fraction in [0, 1] rather than a percentage.
+    """
+    n_values = a.shape[axis]
+    n_strictly_below = xp.count_nonzero(a < score, axis=axis)
+    n_at_or_below = xp.count_nonzero(a <= score, axis=axis)
+    tally = xp.astype(n_strictly_below + n_at_or_below, score.dtype)
+    return tally / (2 * n_values)
+
+
+def _bca_interval(data, statistic, axis, alpha, theta_hat_b, batch, xp):
+    """Bias-corrected and accelerated interval levels `alpha_1` and `alpha_2`."""
+    # closely follows [1] 14.3 and 15.4 (Eq. 15.36)
+
+    # calculate z0_hat: `ndtri` of the percentile of `theta_hat` in `theta_hat_b`
+    theta_hat = xp.expand_dims(statistic(*data, axis=axis), axis=-1)
+    percentile = _percentile_of_score(theta_hat_b, theta_hat, axis=-1, xp=xp)
+    percentile = xp.asarray(percentile, dtype=theta_hat.dtype,
+                            device=xp_device(theta_hat))
+    z0_hat = ndtri(percentile)
+
+    # calculate a_hat from jackknife values of the statistic
+    theta_hat_ji = []  # j is for sample of data, i is for jackknife resample
+    for j, sample in enumerate(data):
+        # _jackknife_resample will add an axis prior to the last axis that
+        # corresponds with the different jackknife resamples. Do the same for
+        # each sample of the data to ensure broadcastability. We need to
+        # create a copy of the list containing the samples anyway, so do this
+        # in the loop to simplify the code. This is not the bottleneck...
+        samples = [xp.expand_dims(sample, axis=-2) for sample in data]
+        theta_hat_i = []
+        for jackknife_sample in _jackknife_resample(sample, batch, xp=xp):
+            samples[j] = jackknife_sample
+            broadcasted = _broadcast_arrays(samples, axis=-1, xp=xp)
+            theta_hat_i.append(statistic(*broadcasted, axis=-1))
+        theta_hat_ji.append(theta_hat_i)
+    # join the batches so each sample has one array of jackknife statistics
+    theta_hat_ji = [xp.concat(theta_hat_i, axis=-1)
+                    for theta_hat_i in theta_hat_ji]
+
+    # CuPy array dtype is unstable in 1.13.6 when divided by large Python int:
+    # (cp.ones(1, dtype=cp.float32)/10000).dtype   # float32
+    # (cp.ones(1, dtype=cp.float32)/100000).dtype  # float64
+    # Solution for now is to make the divisor a Python float
+    # Looks like this will be fixed in 1.14.0
+    n_j = [float(theta_hat_i.shape[-1]) for theta_hat_i in theta_hat_ji]
+
+    theta_hat_j_dot = [xp.mean(theta_hat_i, axis=-1, keepdims=True)
+                       for theta_hat_i in theta_hat_ji]
+    # U_ji: (n - 1) * (mean of the jackknife statistics - each jackknife statistic)
+    U_ji = [(n - 1) * (theta_hat_dot - theta_hat_i)
+            for theta_hat_dot, theta_hat_i, n
+            in zip(theta_hat_j_dot, theta_hat_ji, n_j)]
+
+    nums = [xp.sum(U_i**3, axis=-1)/n**3 for U_i, n in zip(U_ji, n_j)]
+    dens = [xp.sum(U_i**2, axis=-1)/n**2 for U_i, n in zip(U_ji, n_j)]
+    a_hat = 1/6 * sum(nums) / sum(dens)**(3/2)
+
+    # calculate alpha_1, alpha_2 by passing the adjusted z-values through `ndtr`
+    # `float` because dtype of ndtri output (float64) should not promote other vals
+    z_alpha = float(ndtri(alpha))
+    z_1alpha = -z_alpha
+    num1 = z0_hat + z_alpha
+    alpha_1 = ndtr(z0_hat + num1/(1 - a_hat*num1))
+    num2 = z0_hat + z_1alpha
+    alpha_2 = ndtr(z0_hat + num2/(1 - a_hat*num2))
+    return alpha_1, alpha_2, a_hat  # return a_hat for testing
+
+
+def _bootstrap_iv(data, statistic, vectorized, paired, axis, confidence_level,
+                  alternative, n_resamples, batch, method, bootstrap_result,
+                  rng):
+    """Input validation and standardization for `bootstrap`."""
+    xp = array_namespace(*data)
+
+    if vectorized not in {True, False, None}:
+        raise ValueError("`vectorized` must be `True`, `False`, or `None`.")
+    # `None` means "detect": vectorized iff `statistic` has an `axis` parameter
+    if vectorized is None:
+        vectorized = 'axis' in inspect.signature(statistic).parameters
+    # NOTE(review): the message below says `func`; the parameter is `statistic`
+    if not vectorized:
+        if not is_numpy(xp):
+            message = (f"When using array library {xp.__name__}, `func` must be "
+                       "vectorized and accept argument `axis`.")
+            raise TypeError(message)
+        # wrap the non-vectorized statistic so that it accepts `axis`
+        statistic = _vectorize_statistic(statistic)
+
+    axis_int = int(axis)
+    if axis != axis_int:
+        raise ValueError("`axis` must be an integer.")
+
+    n_samples = 0
+    try:
+        n_samples = len(data)
+    except TypeError:
+        raise ValueError("`data` must be a sequence of samples.")
+
+    if n_samples == 0:
+        raise ValueError("`data` must contain at least one sample.")
+
+    data = _broadcast_arrays(data, axis_int, xp=xp)
+    # move the working axis last; each sample needs at least two observations
+    data_iv = []
+    for sample in data:
+        if sample.shape[axis_int] <= 1:
+            raise ValueError("each sample in `data` must contain two or more "
+                             "observations along `axis`.")
+        sample = xp.moveaxis(sample, axis_int, -1)
+        data_iv.append(sample)
+
+    if paired not in {True, False}:
+        raise ValueError("`paired` must be `True` or `False`.")
+
+    if paired:
+        n = data_iv[0].shape[-1]
+        for sample in data_iv[1:]:
+            if sample.shape[-1] != n:
+                message = ("When `paired is True`, all samples must have the "
+                           "same length along `axis`")
+                raise ValueError(message)
+
+        # to generate the bootstrap distribution for paired-sample statistics,
+        # resample the indices of the observations
+        def statistic(i, axis=-1, data=data_iv, unpaired_statistic=statistic):
+            # equivalent to: data = [sample[..., i] for sample in data]
+            data = [_get_from_last_axis(sample, i, xp=xp) for sample in data]
+            return unpaired_statistic(*data, axis=axis)
+        # the "data" to be resampled is now the array of observation indices
+        data_iv = [xp.arange(n)]
+
+    confidence_level_float = float(confidence_level)
+
+    alternative = alternative.lower()
+    alternatives = {'two-sided', 'less', 'greater'}
+    if alternative not in alternatives:
+        raise ValueError(f"`alternative` must be one of {alternatives}")
+
+    n_resamples_int = int(n_resamples)
+    if n_resamples != n_resamples_int or n_resamples_int < 0:
+        raise ValueError("`n_resamples` must be a non-negative integer.")
+    # `batch=None` is passed through; anything else must be a positive integer
+    if batch is None:
+        batch_iv = batch
+    else:
+        batch_iv = int(batch)
+        if batch != batch_iv or batch_iv <= 0:
+            raise ValueError("`batch` must be a positive integer or None.")
+
+    methods = {'percentile', 'basic', 'bca'}
+    method = method.lower()
+    if method not in methods:
+        raise ValueError(f"`method` must be in {methods}")
+    # NOTE(review): the message below ends with ' rather than a closing backtick
+    message = "`bootstrap_result` must have attribute `bootstrap_distribution'"
+    if (bootstrap_result is not None
+            and not hasattr(bootstrap_result, "bootstrap_distribution")):
+        raise ValueError(message)
+    # there must be something to work with: a prior distribution or new resamples
+    message = ("Either `bootstrap_result.bootstrap_distribution.size` or "
+               "`n_resamples` must be positive.")
+    if ((not bootstrap_result or
+         not xp_size(bootstrap_result.bootstrap_distribution))
+            and n_resamples_int == 0):
+        raise ValueError(message)
+
+    rng = check_random_state(rng)
+    # the returned `statistic` may be a wrapper (vectorized and/or paired)
+    return (data_iv, statistic, vectorized, paired, axis_int,
+            confidence_level_float, alternative, n_resamples_int, batch_iv,
+            method, bootstrap_result, rng, xp)
+
+
+@dataclass
+class BootstrapResult:
+    """Result object returned by `scipy.stats.bootstrap`.
+
+    Attributes
+    ----------
+    confidence_interval : ConfidenceInterval
+        The bootstrap confidence interval as an instance of
+        `collections.namedtuple` with attributes `low` and `high`.
+    bootstrap_distribution : ndarray
+        The bootstrap distribution, that is, the value of `statistic` for
+        each resample. The last dimension indexes the resamples
+        (e.g. ``res.bootstrap_distribution.shape[-1] == n_resamples``).
+    standard_error : float or ndarray
+        The bootstrap standard error, that is, the sample standard
+        deviation (``ddof=1``) of the bootstrap distribution.
+
+    """
+    confidence_interval: ConfidenceInterval
+    bootstrap_distribution: np.ndarray
+    standard_error: float | np.ndarray
+
+
+@xp_capabilities(skip_backends=[("jax.numpy", "Incompatible with `quantile`."),
+                                ("dask.array", "Dask doesn't have take_along_axis.")])
+@_transition_to_rng('random_state')
+def bootstrap(data, statistic, *, n_resamples=9999, batch=None,
+              vectorized=None, paired=False, axis=0, confidence_level=0.95,
+              alternative='two-sided', method='BCa', bootstrap_result=None,
+              rng=None):
+    r"""
+    Compute a two-sided bootstrap confidence interval of a statistic.
+
+    When `method` is ``'percentile'`` and `alternative` is ``'two-sided'``,
+    a bootstrap confidence interval is computed according to the following
+    procedure.
+
+    1. Resample the data: for each sample in `data` and for each of
+       `n_resamples`, take a random sample of the original sample
+       (with replacement) of the same size as the original sample.
+
+    2. Compute the bootstrap distribution of the statistic: for each set of
+       resamples, compute the test statistic.
+
+    3. Determine the confidence interval: find the interval of the bootstrap
+       distribution that is
+
+       - symmetric about the median and
+       - contains `confidence_level` of the resampled statistic values.
+
+    While the ``'percentile'`` method is the most intuitive, it is rarely
+    used in practice. Two more common methods are available, ``'basic'``
+    ('reverse percentile') and ``'BCa'`` ('bias-corrected and accelerated');
+    they differ in how step 3 is performed.
+
+    If the samples in `data` are  taken at random from their respective
+    distributions :math:`n` times, the confidence interval returned by
+    `bootstrap` will contain the true value of the statistic for those
+    distributions approximately `confidence_level`:math:`\, \times \, n` times.
+
+    Parameters
+    ----------
+    data : sequence of array-like
+         Each element of `data` is a sample containing scalar observations from an
+         underlying distribution. Elements of `data` must be broadcastable to the
+         same shape (with the possible exception of the dimension specified by `axis`).
+    statistic : callable
+        Statistic for which the confidence interval is to be calculated.
+        `statistic` must be a callable that accepts ``len(data)`` samples
+        as separate arguments and returns the resulting statistic.
+        If `vectorized` is set ``True``,
+        `statistic` must also accept a keyword argument `axis` and be
+        vectorized to compute the statistic along the provided `axis`.
+    n_resamples : int, default: ``9999``
+        The number of resamples performed to form the bootstrap distribution
+        of the statistic.
+    batch : int, optional
+        The number of resamples to process in each vectorized call to
+        `statistic`. Memory usage is O( `batch` * ``n`` ), where ``n`` is the
+        sample size. Default is ``None``, in which case ``batch = n_resamples``
+        (or ``batch = max(n_resamples, n)`` for ``method='BCa'``).
+    vectorized : bool, optional
+        If `vectorized` is set ``False``, `statistic` will not be passed
+        keyword argument `axis` and is expected to calculate the statistic
+        only for 1D samples. If ``True``, `statistic` will be passed keyword
+        argument `axis` and is expected to calculate the statistic along `axis`
+        when passed an ND sample array. If ``None`` (default), `vectorized`
+        will be set ``True`` if ``axis`` is a parameter of `statistic`. Use of
+        a vectorized statistic typically reduces computation time.
+    paired : bool, default: ``False``
+        Whether the statistic treats corresponding elements of the samples
+        in `data` as paired. If True, `bootstrap` resamples an array of
+        *indices* and uses the same indices for all arrays in `data`; otherwise,
+        `bootstrap` independently resamples the elements of each array.
+    axis : int, default: ``0``
+        The axis of the samples in `data` along which the `statistic` is
+        calculated.
+    confidence_level : float, default: ``0.95``
+        The confidence level of the confidence interval.
+    alternative : {'two-sided', 'less', 'greater'}, default: ``'two-sided'``
+        Choose ``'two-sided'`` (default) for a two-sided confidence interval,
+        ``'less'`` for a one-sided confidence interval with the lower bound
+        at ``-np.inf``, and ``'greater'`` for a one-sided confidence interval
+        with the upper bound at ``np.inf``. The other bound of the one-sided
+        confidence intervals is the same as that of a two-sided confidence
+        interval with `confidence_level` twice as far from 1.0; e.g. the upper
+        bound of a 95% ``'less'``  confidence interval is the same as the upper
+        bound of a 90% ``'two-sided'`` confidence interval.
+    method : {'percentile', 'basic', 'bca'}, default: ``'BCa'``
+        Whether to return the 'percentile' bootstrap confidence interval
+        (``'percentile'``), the 'basic' (AKA 'reverse') bootstrap confidence
+        interval (``'basic'``), or the bias-corrected and accelerated bootstrap
+        confidence interval (``'BCa'``).
+    bootstrap_result : BootstrapResult, optional
+        Provide the result object returned by a previous call to `bootstrap`
+        to include the previous bootstrap distribution in the new bootstrap
+        distribution. This can be used, for example, to change
+        `confidence_level`, change `method`, or see the effect of performing
+        additional resampling without repeating computations.
+    rng : `numpy.random.Generator`, optional
+        Pseudorandom number generator state. When `rng` is None, a new
+        `numpy.random.Generator` is created using entropy from the
+        operating system. Types other than `numpy.random.Generator` are
+        passed to `numpy.random.default_rng` to instantiate a ``Generator``.
+
+    Returns
+    -------
+    res : BootstrapResult
+        An object with attributes:
+
+        confidence_interval : ConfidenceInterval
+            The bootstrap confidence interval as an instance of
+            `collections.namedtuple` with attributes `low` and `high`.
+        bootstrap_distribution : ndarray
+            The bootstrap distribution, that is, the value of `statistic` for
+            each resample. The last dimension corresponds with the resamples
+            (e.g. ``res.bootstrap_distribution.shape[-1] == n_resamples``).
+        standard_error : float or ndarray
+            The bootstrap standard error, that is, the sample standard
+            deviation of the bootstrap distribution.
+
+    Warns
+    -----
+    `~scipy.stats.DegenerateDataWarning`
+        Generated when ``method='BCa'`` and the bootstrap distribution is
+        degenerate (e.g. all elements are identical).
+
+    Notes
+    -----
+    Elements of the confidence interval may be NaN for ``method='BCa'`` if
+    the bootstrap distribution is degenerate (e.g. all elements are identical).
+    In this case, consider using another `method` or inspecting `data` for
+    indications that other analysis may be more appropriate (e.g. all
+    observations are identical).
+
+    References
+    ----------
+    .. [1] B. Efron and R. J. Tibshirani, An Introduction to the Bootstrap,
+       Chapman & Hall/CRC, Boca Raton, FL, USA (1993)
+    .. [2] Nathaniel E. Helwig, "Bootstrap Confidence Intervals",
+       http://users.stat.umn.edu/~helwig/notes/bootci-Notes.pdf
+    .. [3] Bootstrapping (statistics), Wikipedia,
+       https://en.wikipedia.org/wiki/Bootstrapping_%28statistics%29
+
+    Examples
+    --------
+    Suppose we have sampled data from an unknown distribution.
+
+    >>> import numpy as np
+    >>> rng = np.random.default_rng()
+    >>> from scipy.stats import norm
+    >>> dist = norm(loc=2, scale=4)  # our "unknown" distribution
+    >>> data = dist.rvs(size=100, random_state=rng)
+
+    We are interested in the standard deviation of the distribution.
+
+    >>> std_true = dist.std()      # the true value of the statistic
+    >>> print(std_true)
+    4.0
+    >>> std_sample = np.std(data)  # the sample statistic
+    >>> print(std_sample)
+    3.9460644295563863
+
+    The bootstrap is used to approximate the variability we would expect if we
+    were to repeatedly sample from the unknown distribution and calculate the
+    statistic of the sample each time. It does this by repeatedly resampling
+    values *from the original sample* with replacement and calculating the
+    statistic of each resample. This results in a "bootstrap distribution" of
+    the statistic.
+
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy.stats import bootstrap
+    >>> data = (data,)  # samples must be in a sequence
+    >>> res = bootstrap(data, np.std, confidence_level=0.9, rng=rng)
+    >>> fig, ax = plt.subplots()
+    >>> ax.hist(res.bootstrap_distribution, bins=25)
+    >>> ax.set_title('Bootstrap Distribution')
+    >>> ax.set_xlabel('statistic value')
+    >>> ax.set_ylabel('frequency')
+    >>> plt.show()
+
+    The standard error quantifies this variability. It is calculated as the
+    standard deviation of the bootstrap distribution.
+
+    >>> res.standard_error
+    0.24427002125829136
+    >>> res.standard_error == np.std(res.bootstrap_distribution, ddof=1)
+    True
+
+    The bootstrap distribution of the statistic is often approximately normal
+    with scale equal to the standard error.
+
+    >>> x = np.linspace(3, 5)
+    >>> pdf = norm.pdf(x, loc=std_sample, scale=res.standard_error)
+    >>> fig, ax = plt.subplots()
+    >>> ax.hist(res.bootstrap_distribution, bins=25, density=True)
+    >>> ax.plot(x, pdf)
+    >>> ax.set_title('Normal Approximation of the Bootstrap Distribution')
+    >>> ax.set_xlabel('statistic value')
+    >>> ax.set_ylabel('pdf')
+    >>> plt.show()
+
+    This suggests that we could construct a 90% confidence interval on the
+    statistic based on quantiles of this normal distribution.
+
+    >>> norm.interval(0.9, loc=std_sample, scale=res.standard_error)
+    (3.5442759991341726, 4.3478528599786)
+
+    Due to central limit theorem, this normal approximation is accurate for a
+    variety of statistics and distributions underlying the samples; however,
+    the approximation is not reliable in all cases. Because `bootstrap` is
+    designed to work with arbitrary underlying distributions and statistics,
+    it uses more advanced techniques to generate an accurate confidence
+    interval.
+
+    >>> print(res.confidence_interval)
+    ConfidenceInterval(low=3.57655333533867, high=4.382043696342881)
+
+    If we sample from the original distribution 100 times and form a bootstrap
+    confidence interval for each sample, the confidence interval
+    contains the true value of the statistic approximately 90% of the time.
+
+    >>> n_trials = 100
+    >>> ci_contains_true_std = 0
+    >>> for i in range(n_trials):
+    ...    data = (dist.rvs(size=100, random_state=rng),)
+    ...    res = bootstrap(data, np.std, confidence_level=0.9,
+    ...                    n_resamples=999, rng=rng)
+    ...    ci = res.confidence_interval
+    ...    if ci[0] < std_true < ci[1]:
+    ...        ci_contains_true_std += 1
+    >>> print(ci_contains_true_std)
+    88
+
+    Rather than writing a loop, we can also determine the confidence intervals
+    for all 100 samples at once.
+
+    >>> data = (dist.rvs(size=(n_trials, 100), random_state=rng),)
+    >>> res = bootstrap(data, np.std, axis=-1, confidence_level=0.9,
+    ...                 n_resamples=999, rng=rng)
+    >>> ci_l, ci_u = res.confidence_interval
+
+    Here, `ci_l` and `ci_u` contain the confidence interval for each of the
+    ``n_trials = 100`` samples.
+
+    >>> print(ci_l[:5])
+    [3.86401283 3.33304394 3.52474647 3.54160981 3.80569252]
+    >>> print(ci_u[:5])
+    [4.80217409 4.18143252 4.39734707 4.37549713 4.72843584]
+
+    And again, approximately 90% contain the true value, ``std_true = 4``.
+
+    >>> print(np.sum((ci_l < std_true) & (std_true < ci_u)))
+    93
+
+    `bootstrap` can also be used to estimate confidence intervals of
+    multi-sample statistics. For example, to get a confidence interval
+    for the difference between means, we write a function that accepts
+    two sample arguments and returns only the statistic. The use of the
+    ``axis`` argument ensures that all mean calculations are perform in
+    a single vectorized call, which is faster than looping over pairs
+    of resamples in Python.
+
+    >>> def my_statistic(sample1, sample2, axis=-1):
+    ...     mean1 = np.mean(sample1, axis=axis)
+    ...     mean2 = np.mean(sample2, axis=axis)
+    ...     return mean1 - mean2
+
+    Here, we use the 'percentile' method with the default 95% confidence level.
+
+    >>> sample1 = norm.rvs(scale=1, size=100, random_state=rng)
+    >>> sample2 = norm.rvs(scale=2, size=100, random_state=rng)
+    >>> data = (sample1, sample2)
+    >>> res = bootstrap(data, my_statistic, method='basic', rng=rng)
+    >>> print(my_statistic(sample1, sample2))
+    0.16661030792089523
+    >>> print(res.confidence_interval)
+    ConfidenceInterval(low=-0.29087973240818693, high=0.6371338699912273)
+
+    The bootstrap estimate of the standard error is also available.
+
+    >>> print(res.standard_error)
+    0.238323948262459
+
+    Paired-sample statistics work, too. For example, consider the Pearson
+    correlation coefficient.
+
+    >>> from scipy.stats import pearsonr
+    >>> n = 100
+    >>> x = np.linspace(0, 10, n)
+    >>> y = x + rng.uniform(size=n)
+    >>> print(pearsonr(x, y)[0])  # element 0 is the statistic
+    0.9954306665125647
+
+    We wrap `pearsonr` so that it returns only the statistic, ensuring
+    that we use the `axis` argument because it is available.
+
+    >>> def my_statistic(x, y, axis=-1):
+    ...     return pearsonr(x, y, axis=axis)[0]
+
+    We call `bootstrap` using ``paired=True``.
+
+    >>> res = bootstrap((x, y), my_statistic, paired=True, rng=rng)
+    >>> print(res.confidence_interval)
+    ConfidenceInterval(low=0.9941504301315878, high=0.996377412215445)
+
+    The result object can be passed back into `bootstrap` to perform additional
+    resampling:
+
+    >>> len(res.bootstrap_distribution)
+    9999
+    >>> res = bootstrap((x, y), my_statistic, paired=True,
+    ...                 n_resamples=1000, rng=rng,
+    ...                 bootstrap_result=res)
+    >>> len(res.bootstrap_distribution)
+    10999
+
+    or to change the confidence interval options:
+
+    >>> res2 = bootstrap((x, y), my_statistic, paired=True,
+    ...                  n_resamples=0, rng=rng, bootstrap_result=res,
+    ...                  method='percentile', confidence_level=0.9)
+    >>> np.testing.assert_equal(res2.bootstrap_distribution,
+    ...                         res.bootstrap_distribution)
+    >>> res.confidence_interval
+    ConfidenceInterval(low=0.9941574828235082, high=0.9963781698210212)
+
+    without repeating computation of the original bootstrap distribution.
+
+    """
+    # Input validation
+    args = _bootstrap_iv(data, statistic, vectorized, paired, axis,
+                         confidence_level, alternative, n_resamples, batch,
+                         method, bootstrap_result, rng)
+    (data, statistic, vectorized, paired, axis, confidence_level,
+     alternative, n_resamples, batch, method, bootstrap_result,
+     rng, xp) = args
+
+    theta_hat_b = ([] if bootstrap_result is None
+                   else [bootstrap_result.bootstrap_distribution])
+
+    batch_nominal = batch or n_resamples or 1
+
+    for k in range(0, n_resamples, batch_nominal):
+        batch_actual = min(batch_nominal, n_resamples-k)
+        # Generate resamples
+        resampled_data = []
+        for sample in data:
+            resample = _bootstrap_resample(sample, n_resamples=batch_actual,
+                                           rng=rng, xp=xp)
+            resampled_data.append(resample)
+
+        # Compute bootstrap distribution of statistic
+        theta_hat_b.append(statistic(*resampled_data, axis=-1))
+    theta_hat_b = xp.concat(theta_hat_b, axis=-1)
+
+    # Calculate percentile interval
+    alpha = ((1 - confidence_level)/2 if alternative == 'two-sided'
+             else (1 - confidence_level))
+    if method == 'bca':
+        interval = _bca_interval(data, statistic, axis=-1, alpha=alpha,
+                                 theta_hat_b=theta_hat_b, batch=batch, xp=xp)[:2]
+    else:
+        alpha = xp.asarray(alpha, dtype=theta_hat_b.dtype,
+                           device=xp_device(theta_hat_b))
+        interval = alpha, 1 - alpha
+
+    # Calculate confidence interval of statistic
+    interval = xp.stack(interval, axis=-1)
+    ci = stats.quantile(theta_hat_b, interval, axis=-1)
+    if xp.any(xp.isnan(ci)):
+        msg = (
+            "The BCa confidence interval cannot be calculated. "
+            "This problem is known to occur when the distribution "
+            "is degenerate or the statistic is np.min."
+        )
+        warnings.warn(DegenerateDataWarning(msg), stacklevel=2)
+
+    ci_l = ci[..., 0]
+    ci_u = ci[..., 1]
+
+    if method == 'basic':  # see [3]
+        theta_hat = statistic(*data, axis=-1)
+        ci_l, ci_u = 2*theta_hat - ci_u, 2*theta_hat - ci_l
+
+    if alternative == 'less':
+        ci_l = xp.full_like(ci_l, -xp.inf)
+    elif alternative == 'greater':
+        ci_u = xp.full_like(ci_u, xp.inf)
+
+    standard_error = xp.std(theta_hat_b, correction=1, axis=-1)
+
+    ci_l = ci_l[()] if ci_l.ndim == 0 else ci_l
+    ci_u = ci_u[()] if ci_u.ndim == 0 else ci_u
+    standard_error = standard_error[()] if standard_error.ndim == 0 else standard_error
+
+    return BootstrapResult(confidence_interval=ConfidenceInterval(ci_l, ci_u),
+                           bootstrap_distribution=theta_hat_b,
+                           standard_error=standard_error)
+
+
+def _monte_carlo_test_iv(data, rvs, statistic, vectorized, n_resamples,
+                         batch, alternative, axis):
+    """Input validation for `monte_carlo_test`."""
+    axis_int = int(axis)
+    if axis != axis_int:
+        raise ValueError("`axis` must be an integer.")
+
+    if vectorized not in {True, False, None}:
+        raise ValueError("`vectorized` must be `True`, `False`, or `None`.")
+
+    if not isinstance(rvs, Sequence):
+        rvs = (rvs,)
+        data = (data,)
+    for rvs_i in rvs:
+        if not callable(rvs_i):
+            raise TypeError("`rvs` must be callable or sequence of callables.")
+
+    # At this point, `data` should be a sequence
+    # If it isn't, the user passed a sequence for `rvs` but not `data`
+    message = "If `rvs` is a sequence, `len(rvs)` must equal `len(data)`."
+    try:
+        len(data)
+    except TypeError as e:
+        raise ValueError(message) from e
+    if not len(rvs) == len(data):
+        raise ValueError(message)
+
+    if not callable(statistic):
+        raise TypeError("`statistic` must be callable.")
+
+    if vectorized is None:
+        try:
+            signature = inspect.signature(statistic).parameters
+        except ValueError as e:
+            message = (f"Signature inspection of {statistic=} failed; "
+                       "pass `vectorize` explicitly.")
+            raise ValueError(message) from e
+        vectorized = 'axis' in signature
+
+    xp = array_namespace(*data)
+    dtype = xp_result_type(*data, force_floating=True, xp=xp)
+
+    if not vectorized:
+        if is_numpy(xp):
+            statistic_vectorized = _vectorize_statistic(statistic)
+        else:
+            message = ("`statistic` must be vectorized (i.e. support an `axis` "
+                       f"argument) when `data` contains {xp.__name__} arrays.")
+            raise ValueError(message)
+    else:
+        statistic_vectorized = statistic
+
+    data = _broadcast_arrays(data, axis, xp=xp)
+    data_iv = []
+    for sample in data:
+        sample = xp.broadcast_to(sample, (1,)) if sample.ndim == 0 else sample
+        sample = xp.moveaxis(sample, axis_int, -1)
+        data_iv.append(sample)
+
+    n_resamples_int = int(n_resamples)
+    if n_resamples != n_resamples_int or n_resamples_int <= 0:
+        raise ValueError("`n_resamples` must be a positive integer.")
+
+    if batch is None:
+        batch_iv = batch
+    else:
+        batch_iv = int(batch)
+        if batch != batch_iv or batch_iv <= 0:
+            raise ValueError("`batch` must be a positive integer or None.")
+
+    alternatives = {'two-sided', 'greater', 'less'}
+    alternative = alternative.lower()
+    if alternative not in alternatives:
+        raise ValueError(f"`alternative` must be in {alternatives}")
+
+    return (data_iv, rvs, statistic_vectorized, vectorized, n_resamples_int,
+            batch_iv, alternative, axis_int, dtype, xp)
+
+
+@dataclass
+class MonteCarloTestResult:
+    """Result object returned by `scipy.stats.monte_carlo_test`.
+
+    Attributes
+    ----------
+    statistic : float or ndarray
+        The observed test statistic of the sample.
+    pvalue : float or ndarray
+        The p-value for the given alternative.
+    null_distribution : ndarray
+        The values of the test statistic generated under the null
+        hypothesis.
+    """
+    # NOTE: `monte_carlo_test` constructs this object positionally, so the
+    # order of the fields below is part of the interface.
+    statistic: float | np.ndarray
+    pvalue: float | np.ndarray
+    null_distribution: np.ndarray
+
+
+@xp_capabilities()
+@_rename_parameter('sample', 'data')
+def monte_carlo_test(data, rvs, statistic, *, vectorized=None,
+                     n_resamples=9999, batch=None, alternative="two-sided",
+                     axis=0):
+    r"""Perform a Monte Carlo hypothesis test.
+
+    `data` contains a sample or a sequence of one or more samples. `rvs`
+    specifies the distribution(s) of the sample(s) in `data` under the null
+    hypothesis. The value of `statistic` for the given `data` is compared
+    against a Monte Carlo null distribution: the value of the statistic for
+    each of `n_resamples` sets of samples generated using `rvs`. This gives
+    the p-value, the probability of observing such an extreme value of the
+    test statistic under the null hypothesis.
+
+    Parameters
+    ----------
+    data : array-like or sequence of array-like
+        An array or sequence of arrays of observations.
+    rvs : callable or tuple of callables
+        A callable or sequence of callables that generates random variates
+        under the null hypothesis. Each element of `rvs` must be a callable
+        that accepts keyword argument ``size`` (e.g. ``rvs(size=(m, n))``) and
+        returns an N-d array sample of that shape. If `rvs` is a sequence, the
+        number of callables in `rvs` must match the number of samples in
+        `data`, i.e. ``len(rvs) == len(data)``. If `rvs` is a single callable,
+        `data` is treated as a single sample.
+    statistic : callable
+        Statistic for which the p-value of the hypothesis test is to be
+        calculated. `statistic` must be a callable that accepts a sample
+        (e.g. ``statistic(sample)``) or ``len(rvs)`` separate samples (e.g.
+        ``statistic(samples1, sample2)`` if `rvs` contains two callables and
+        `data` contains two samples) and returns the resulting statistic.
+        If `vectorized` is set ``True``, `statistic` must also accept a keyword
+        argument `axis` and be vectorized to compute the statistic along the
+        provided `axis` of the samples in `data`.
+    vectorized : bool, optional
+        If `vectorized` is set ``False``, `statistic` will not be passed
+        keyword argument `axis` and is expected to calculate the statistic
+        only for 1D samples. If ``True``, `statistic` will be passed keyword
+        argument `axis` and is expected to calculate the statistic along `axis`
+        when passed ND sample arrays. If ``None`` (default), `vectorized`
+        will be set ``True`` if ``axis`` is a parameter of `statistic`. Use of
+        a vectorized statistic typically reduces computation time.
+    n_resamples : int, default: 9999
+        Number of samples drawn from each of the callables of `rvs`.
+        Equivalently, the number statistic values under the null hypothesis
+        used as the Monte Carlo null distribution.
+    batch : int, optional
+        The number of Monte Carlo samples to process in each call to
+        `statistic`. Memory usage is O( `batch` * ``sample.size[axis]`` ). Default
+        is ``None``, in which case `batch` equals `n_resamples`.
+    alternative : {'two-sided', 'less', 'greater'}
+        The alternative hypothesis for which the p-value is calculated.
+        For each alternative, the p-value is defined as follows.
+
+        - ``'greater'`` : the percentage of the null distribution that is
+          greater than or equal to the observed value of the test statistic.
+        - ``'less'`` : the percentage of the null distribution that is
+          less than or equal to the observed value of the test statistic.
+        - ``'two-sided'`` : twice the smaller of the p-values above.
+
+    axis : int, default: 0
+        The axis of `data` (or each sample within `data`) over which to
+        calculate the statistic.
+
+    Returns
+    -------
+    res : MonteCarloTestResult
+        An object with attributes:
+
+        statistic : float or ndarray
+            The test statistic of the observed `data`.
+        pvalue : float or ndarray
+            The p-value for the given alternative.
+        null_distribution : ndarray
+            The values of the test statistic generated under the null
+            hypothesis.
+
+    .. warning::
+        The p-value is calculated by counting the elements of the null
+        distribution that are as extreme or more extreme than the observed
+        value of the statistic. Due to the use of finite precision arithmetic,
+        some statistic functions return numerically distinct values when the
+        theoretical values would be exactly equal. In some cases, this could
+        lead to a large error in the calculated p-value. `monte_carlo_test`
+        guards against this by considering elements in the null distribution
+        that are "close" (within a relative tolerance of 100 times the
+        floating point epsilon of inexact dtypes) to the observed
+        value of the test statistic as equal to the observed value of the
+        test statistic. However, the user is advised to inspect the null
+        distribution to assess whether this method of comparison is
+        appropriate, and if not, calculate the p-value manually.
+
+    References
+    ----------
+
+    .. [1] B. Phipson and G. K. Smyth. "Permutation P-values Should Never Be
+       Zero: Calculating Exact P-values When Permutations Are Randomly Drawn."
+       Statistical Applications in Genetics and Molecular Biology 9.1 (2010).
+
+    Examples
+    --------
+
+    Suppose we wish to test whether a small sample has been drawn from a normal
+    distribution. We decide that we will use the skew of the sample as a
+    test statistic, and we will consider a p-value of 0.05 to be statistically
+    significant.
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> def statistic(x, axis):
+    ...     return stats.skew(x, axis)
+
+    After collecting our data, we calculate the observed value of the test
+    statistic.
+
+    >>> rng = np.random.default_rng()
+    >>> x = stats.skewnorm.rvs(a=1, size=50, random_state=rng)
+    >>> statistic(x, axis=0)
+    0.12457412450240658
+
+    To determine the probability of observing such an extreme value of the
+    skewness by chance if the sample were drawn from the normal distribution,
+    we can perform a Monte Carlo hypothesis test. The test will draw many
+    samples at random from their normal distribution, calculate the skewness
+    of each sample, and compare our original skewness against this
+    distribution to determine an approximate p-value.
+
+    >>> from scipy.stats import monte_carlo_test
+    >>> # because our statistic is vectorized, we pass `vectorized=True`
+    >>> rvs = lambda size: stats.norm.rvs(size=size, random_state=rng)
+    >>> res = monte_carlo_test(x, rvs, statistic, vectorized=True)
+    >>> print(res.statistic)
+    0.12457412450240658
+    >>> print(res.pvalue)
+    0.7012
+
+    The probability of obtaining a test statistic less than or equal to the
+    observed value under the null hypothesis is ~70%. This is greater than
+    our chosen threshold of 5%, so we cannot consider this to be significant
+    evidence against the null hypothesis.
+
+    Note that this p-value essentially matches that of
+    `scipy.stats.skewtest`, which relies on an asymptotic distribution of a
+    test statistic based on the sample skewness.
+
+    >>> stats.skewtest(x).pvalue
+    0.6892046027110614
+
+    This asymptotic approximation is not valid for small sample sizes, but
+    `monte_carlo_test` can be used with samples of any size.
+
+    >>> x = stats.skewnorm.rvs(a=1, size=7, random_state=rng)
+    >>> # stats.skewtest(x) would produce an error due to small sample
+    >>> res = monte_carlo_test(x, rvs, statistic, vectorized=True)
+
+    The Monte Carlo distribution of the test statistic is provided for
+    further investigation.
+
+    >>> import matplotlib.pyplot as plt
+    >>> fig, ax = plt.subplots()
+    >>> ax.hist(res.null_distribution, bins=50)
+    >>> ax.set_title("Monte Carlo distribution of test statistic")
+    >>> ax.set_xlabel("Value of Statistic")
+    >>> ax.set_ylabel("Frequency")
+    >>> plt.show()
+
+    """
+    args = _monte_carlo_test_iv(data, rvs, statistic, vectorized,
+                                n_resamples, batch, alternative, axis)
+    (data, rvs, statistic, vectorized, n_resamples,
+     batch, alternative, axis, dtype, xp) = args
+
+    # Some statistics return plain floats; ensure they're at least a NumPy float
+    observed = xp.asarray(statistic(*data, axis=-1))
+    observed = observed[()] if observed.ndim == 0 else observed
+
+    n_observations = [sample.shape[-1] for sample in data]
+    batch_nominal = batch or n_resamples
+    null_distribution = []
+    for k in range(0, n_resamples, batch_nominal):
+        batch_actual = min(batch_nominal, n_resamples - k)
+        resamples = [rvs_i(size=(batch_actual, n_observations_i))
+                     for rvs_i, n_observations_i in zip(rvs, n_observations)]
+        null_distribution.append(statistic(*resamples, axis=-1))
+    null_distribution = xp.concat(null_distribution)
+    null_distribution = xp.reshape(null_distribution, (-1,) + (1,)*observed.ndim)
+
+    # relative tolerance for detecting numerically distinct but
+    # theoretically equal values in the null distribution
+    eps =  (0 if not xp.isdtype(observed.dtype, ('real floating'))
+            else xp.finfo(observed.dtype).eps*100)
+    gamma = xp.abs(eps * observed)
+
+    def less(null_distribution, observed):
+        cmps = null_distribution <= observed + gamma
+        cmps = xp.asarray(cmps, dtype=dtype)
+        pvalues = (xp.sum(cmps, axis=0, dtype=dtype) + 1.) / (n_resamples + 1.)
+        return pvalues
+
+    def greater(null_distribution, observed):
+        cmps = null_distribution >= observed - gamma
+        cmps = xp.asarray(cmps, dtype=dtype)
+        pvalues = (xp.sum(cmps, axis=0, dtype=dtype) + 1.) / (n_resamples + 1.)
+        return pvalues
+
+    def two_sided(null_distribution, observed):
+        pvalues_less = less(null_distribution, observed)
+        pvalues_greater = greater(null_distribution, observed)
+        pvalues = xp.minimum(pvalues_less, pvalues_greater) * 2
+        return pvalues
+
+    compare = {"less": less,
+               "greater": greater,
+               "two-sided": two_sided}
+
+    pvalues = compare[alternative](null_distribution, observed)
+    pvalues = xp.clip(pvalues, 0., 1.)
+
+    return MonteCarloTestResult(observed, pvalues, null_distribution)
+
+
+@dataclass
+class PowerResult:
+    """Result object returned by `scipy.stats.power`.
+
+    Attributes
+    ----------
+    power : float or ndarray
+        The estimated power.
+    pvalues : float or ndarray
+        The simulated p-values.
+    """
+    # Estimated power of the test
+    power: float | np.ndarray
+    # Simulated p-values
+    pvalues: float | np.ndarray
+
+
+def _wrap_kwargs(fun):
+    """Wrap callable to accept arbitrary kwargs and ignore unused ones"""
+
+    try:
+        keys = set(inspect.signature(fun).parameters.keys())
+    except ValueError:
+        # NumPy Generator methods can't be inspected
+        keys = {'size'}
+
+    # Set keys=keys/fun=fun to avoid late binding gotcha
+    def wrapped_rvs_i(*args, keys=keys, fun=fun, **all_kwargs):
+        kwargs = {key: val for key, val in all_kwargs.items()
+                  if key in keys}
+        return fun(*args, **kwargs)
+    return wrapped_rvs_i
+
+
+def _power_iv(rvs, test, n_observations, significance, vectorized,
+              n_resamples, batch, kwargs):
+    """Input validation for `monte_carlo_test`."""
+    if vectorized not in {True, False, None}:
+        raise ValueError("`vectorized` must be `True`, `False`, or `None`.")
+
+    if not isinstance(rvs, Sequence):
+        rvs = (rvs,)
+        n_observations = (n_observations,)
+    for rvs_i in rvs:
+        if not callable(rvs_i):
+            raise TypeError("`rvs` must be callable or sequence of callables.")
+
+    if not len(rvs) == len(n_observations):
+        message = ("If `rvs` is a sequence, `len(rvs)` "
+                   "must equal `len(n_observations)`.")
+        raise ValueError(message)
+
+    kwargs = dict() if kwargs is None else kwargs
+    if not isinstance(kwargs, dict):
+        raise TypeError("`kwargs` must be a dictionary that maps keywords to arrays.")
+
+    vals = kwargs.values()
+    keys = kwargs.keys()
+
+    xp = array_namespace(*n_observations, significance, *vals)
+
+    significance = xp.asarray(significance)
+    if (not xp.isdtype(significance.dtype, "real floating")
+            or xp.min(significance) < 0 or xp.max(significance) > 1):
+        raise ValueError("`significance` must contain floats between 0 and 1.")
+
+    # Wrap callables to ignore unused keyword arguments
+    wrapped_rvs = [_wrap_kwargs(rvs_i) for rvs_i in rvs]
+
+    # Broadcast, then ravel nobs/kwarg combinations. In the end,
+    # `nobs` and `vals` have shape (# of combinations, number of variables)
+    # todo: find a better way to do this without combining arrays
+    tmp = xp.stack(xp.broadcast_arrays(*n_observations, *vals))
+    shape = tmp.shape
+    if tmp.ndim == 1:
+        tmp = xp.expand_dims(tmp, axis=0)
+    else:
+        tmp = xp.reshape(tmp, (shape[0], -1)).T
+    nobs, vals = tmp[:, :len(rvs)], tmp[:, len(rvs):]
+    integer_dtype = xp_result_type(*n_observations, xp=xp)
+    nobs = xp.astype(nobs, integer_dtype)
+
+    if not callable(test):
+        raise TypeError("`test` must be callable.")
+
+    if vectorized is None:
+        vectorized = 'axis' in inspect.signature(test).parameters
+
+    test_vectorized = test
+    if not vectorized:
+        if not is_numpy(xp):
+            message = (f"When using array library {xp.__name__}, `test` must be "
+                       "be vectorized and accept argument `axis`.")
+            raise TypeError(message)
+
+        test_vectorized = _vectorize_statistic(test)
+
+    # Wrap `test` function to ignore unused kwargs
+    test_vectorized = _wrap_kwargs(test_vectorized)
+
+    n_resamples_int = int(n_resamples)
+    if n_resamples != n_resamples_int or n_resamples_int <= 0:
+        raise ValueError("`n_resamples` must be a positive integer.")
+
+    if batch is None:
+        batch_iv = batch
+    else:
+        batch_iv = int(batch)
+        if batch != batch_iv or batch_iv <= 0:
+            raise ValueError("`batch` must be a positive integer or None.")
+
+    return (wrapped_rvs, test_vectorized, nobs, significance, vectorized,
+            n_resamples_int, batch_iv, vals, keys, shape[1:], xp)
+
+
+@xp_capabilities(allow_dask_compute=True, jax_jit=False)
+def power(test, rvs, n_observations, *, significance=0.01, vectorized=None,
+          n_resamples=10000, batch=None, kwargs=None):
+    r"""Simulate the power of a hypothesis test under an alternative hypothesis.
+
+    Parameters
+    ----------
+    test : callable
+        Hypothesis test for which the power is to be simulated.
+        `test` must be a callable that accepts a sample (e.g. ``test(sample)``)
+        or ``len(rvs)`` separate samples (e.g. ``test(samples1, sample2)`` if
+        `rvs` contains two callables and `n_observations` contains two values)
+        and returns the p-value of the test.
+        If `vectorized` is set to ``True``, `test` must also accept a keyword
+        argument `axis` and be vectorized to perform the test along the
+        provided `axis` of the samples.
+        Any callable from `scipy.stats` with an `axis` argument that returns an
+        object with a `pvalue` attribute is also acceptable.
+    rvs : callable or tuple of callables
+        A callable or sequence of callables that generate(s) random variates
+        under the alternative hypothesis. Each element of `rvs` must accept
+        keyword argument ``size`` (e.g. ``rvs(size=(m, n))``) and return an
+        N-d array of that shape. If `rvs` is a sequence, the number of callables
+        in `rvs` must match the number of elements of `n_observations`, i.e.
+        ``len(rvs) == len(n_observations)``. If `rvs` is a single callable,
+        `n_observations` is treated as a single element.
+    n_observations : tuple of ints or tuple of integer arrays
+        If a sequence of ints, each is the sizes of a sample to be passed to `test`.
+        If a sequence of integer arrays, the power is simulated for each
+        set of corresponding sample sizes. See Examples.
+    significance : float or array_like of floats, default: 0.01
+        The threshold for significance; i.e., the p-value below which the
+        hypothesis test results will be considered as evidence against the null
+        hypothesis. Equivalently, the acceptable rate of Type I error under
+        the null hypothesis. If an array, the power is simulated for each
+        significance threshold.
+    kwargs : dict, optional
+        Keyword arguments to be passed to `rvs` and/or `test` callables.
+        Introspection is used to determine which keyword arguments may be
+        passed to each callable.
+        The value corresponding with each keyword must be an array.
+        Arrays must be broadcastable with one another and with each array in
+        `n_observations`. The power is simulated for each set of corresponding
+        sample sizes and arguments. See Examples.
+    vectorized : bool, optional
+        If `vectorized` is set to ``False``, `test` will not be passed keyword
+        argument `axis` and is expected to perform the test only for 1D samples.
+        If ``True``, `test` will be passed keyword argument `axis` and is
+        expected to perform the test along `axis` when passed N-D sample arrays.
+        If ``None`` (default), `vectorized` will be set ``True`` if ``axis`` is
+        a parameter of `test`. Use of a vectorized test typically reduces
+        computation time.
+    n_resamples : int, default: 10000
+        Number of samples drawn from each of the callables of `rvs`.
+        Equivalently, the number tests performed under the alternative
+        hypothesis to approximate the power.
+    batch : int, optional
+        The number of samples to process in each call to `test`. Memory usage is
+        proportional to the product of `batch` and the largest sample size. Default
+        is ``None``, in which case `batch` equals `n_resamples`.
+
+    Returns
+    -------
+    res : PowerResult
+        An object with attributes:
+
+        power : float or ndarray
+            The estimated power against the alternative.
+        pvalues : ndarray
+            The p-values observed under the alternative hypothesis.
+
+    Notes
+    -----
+    The power is simulated as follows:
+
+    - Draw many random samples (or sets of samples), each of the size(s)
+      specified by `n_observations`, under the alternative specified by
+      `rvs`.
+    - For each sample (or set of samples), compute the p-value according to
+      `test`. These p-values are recorded in the ``pvalues`` attribute of
+      the result object.
+    - Compute the proportion of p-values that are less than the `significance`
+      level. This is the power recorded in the ``power`` attribute of the
+      result object.
+
+    Suppose that `significance` is an array with shape ``shape1``, the elements
+    of `kwargs` and `n_observations` are mutually broadcastable to shape ``shape2``,
+    and `test` returns an array of p-values of shape ``shape3``. Then the result
+    object ``power`` attribute will be of shape ``shape1 + shape2 + shape3``, and
+    the ``pvalues`` attribute will be of shape ``shape2 + shape3 + (n_resamples,)``.
+
+    Examples
+    --------
+    Suppose we wish to simulate the power of the independent sample t-test
+    under the following conditions:
+
+    - The first sample has 10 observations drawn from a normal distribution
+      with mean 0.
+    - The second sample has 12 observations drawn from a normal distribution
+      with mean 1.0.
+    - The threshold on p-values for significance is 0.05.
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng(2549598345528)
+    >>>
+    >>> test = stats.ttest_ind
+    >>> n_observations = (10, 12)
+    >>> rvs1 = rng.normal
+    >>> rvs2 = lambda size: rng.normal(loc=1, size=size)
+    >>> rvs = (rvs1, rvs2)
+    >>> res = stats.power(test, rvs, n_observations, significance=0.05)
+    >>> res.power
+    0.6116
+
+    With samples of size 10 and 12, respectively, the power of the t-test
+    with a significance threshold of 0.05 is approximately 60% under the chosen
+    alternative. We can investigate the effect of sample size on the power
+    by passing sample size arrays.
+
+    >>> import matplotlib.pyplot as plt
+    >>> nobs_x = np.arange(5, 21)
+    >>> nobs_y = nobs_x
+    >>> n_observations = (nobs_x, nobs_y)
+    >>> res = stats.power(test, rvs, n_observations, significance=0.05)
+    >>> ax = plt.subplot()
+    >>> ax.plot(nobs_x, res.power)
+    >>> ax.set_xlabel('Sample Size')
+    >>> ax.set_ylabel('Simulated Power')
+    >>> ax.set_title('Simulated Power of `ttest_ind` with Equal Sample Sizes')
+    >>> plt.show()
+
+    Alternatively, we can investigate the impact that effect size has on the power.
+    In this case, the effect size is the location of the distribution underlying
+    the second sample.
+
+    >>> n_observations = (10, 12)
+    >>> loc = np.linspace(0, 1, 20)
+    >>> rvs2 = lambda size, loc: rng.normal(loc=loc, size=size)
+    >>> rvs = (rvs1, rvs2)
+    >>> res = stats.power(test, rvs, n_observations, significance=0.05,
+    ...                   kwargs={'loc': loc})
+    >>> ax = plt.subplot()
+    >>> ax.plot(loc, res.power)
+    >>> ax.set_xlabel('Effect Size')
+    >>> ax.set_ylabel('Simulated Power')
+    >>> ax.set_title('Simulated Power of `ttest_ind`, Varying Effect Size')
+    >>> plt.show()
+
+    We can also use `power` to estimate the Type I error rate (also referred to by the
+    ambiguous term "size") of a test and assess whether it matches the nominal level.
+    For example, the null hypothesis of `jarque_bera` is that the sample was drawn from
+    a distribution with the same skewness and kurtosis as the normal distribution. To
+    estimate the Type I error rate, we can consider the null hypothesis to be a true
+    *alternative* hypothesis and calculate the power.
+
+    >>> test = stats.jarque_bera
+    >>> n_observations = 10
+    >>> rvs = rng.normal
+    >>> significance = np.linspace(0.0001, 0.1, 1000)
+    >>> res = stats.power(test, rvs, n_observations, significance=significance)
+    >>> size = res.power
+
+    As shown below, the Type I error rate of the test is far below the nominal level
+    for such a small sample, as mentioned in its documentation.
+
+    >>> ax = plt.subplot()
+    >>> ax.plot(significance, size)
+    >>> ax.plot([0, 0.1], [0, 0.1], '--')
+    >>> ax.set_xlabel('nominal significance level')
+    >>> ax.set_ylabel('estimated test size (Type I error rate)')
+    >>> ax.set_title('Estimated test size vs nominal significance level')
+    >>> ax.set_aspect('equal', 'box')
+    >>> ax.legend(('`jarque_bera`', 'ideal test'))
+    >>> plt.show()
+
+    As one might expect from such a conservative test, the power is quite low with
+    respect to some alternatives. For example, the power of the test under the
+    alternative that the sample was drawn from the Laplace distribution may not
+    be much greater than the Type I error rate.
+
+    >>> rvs = rng.laplace
+    >>> significance = np.linspace(0.0001, 0.1, 1000)
+    >>> res = stats.power(test, rvs, n_observations, significance=0.05)
+    >>> print(res.power)
+    0.0587
+
+    This is not a mistake in SciPy's implementation; it is simply due to the fact
+    that the null distribution of the test statistic is derived under the assumption
+    that the sample size is large (i.e. approaches infinity), and this asymptotic
+    approximation is not accurate for small samples. In such cases, resampling
+    and Monte Carlo methods (e.g. `permutation_test`, `goodness_of_fit`,
+    `monte_carlo_test`) may be more appropriate.
+
+    """
+    tmp = _power_iv(rvs, test, n_observations, significance,
+                    vectorized, n_resamples, batch, kwargs)
+    (rvs, test, nobs, significance,
+     vectorized, n_resamples, batch, args, kwds, shape, xp) = tmp
+
+    batch_nominal = batch or n_resamples
+    pvalues = []  # results of various nobs/kwargs combinations
+    for i in range(nobs.shape[0]):
+        nobs_i, args_i = nobs[i, ...], args[i, ...]
+        kwargs_i = dict(zip(kwds, args_i))
+        pvalues_i = []  # results of batches; fixed nobs/kwargs combination
+        for k in range(0, n_resamples, batch_nominal):
+            batch_actual = min(batch_nominal, n_resamples - k)
+            resamples = [rvs_j(size=(batch_actual, int(nobs_ij)), **kwargs_i)
+                         for rvs_j, nobs_ij in zip(rvs, nobs_i)]
+            res = test(*resamples, **kwargs_i, axis=-1)
+            p = getattr(res, 'pvalue', res)
+            pvalues_i.append(p)
+        # Concatenate results from batches
+        pvalues_i = xp.concat(pvalues_i, axis=-1)
+        pvalues.append(pvalues_i)
+    # `test` can return result with array of p-values
+    shape += pvalues_i.shape[:-1]
+    # Concatenate results from various nobs/kwargs combinations
+    pvalues = xp.concat(pvalues, axis=0)
+    # nobs/kwargs arrays were raveled to single axis; unravel
+    pvalues = xp.reshape(pvalues, shape + (-1,))
+    if significance.ndim > 0:
+        newdims = tuple(range(significance.ndim, pvalues.ndim + significance.ndim))
+        significance = xpx.expand_dims(significance, axis=newdims)
+
+    float_dtype = xp_result_type(significance, pvalues, xp=xp)
+    powers = xp.mean(xp.astype(pvalues < significance, float_dtype), axis=-1)
+
+    return PowerResult(power=powers, pvalues=pvalues)
+
+
+@dataclass
+class PermutationTestResult:
+    """Result object returned by `scipy.stats.permutation_test`.
+
+    Attributes
+    ----------
+    statistic : float or ndarray
+        The observed test statistic of the data.
+    pvalue : float or ndarray
+        The p-value for the given alternative.
+    null_distribution : ndarray
+        The values of the test statistic generated under the null
+        hypothesis.
+    """
+    # Plain data container: the `dataclass` decorator generates `__init__`,
+    # `__repr__` and `__eq__` from the annotated fields below.
+    statistic: float | np.ndarray
+    pvalue: float | np.ndarray
+    null_distribution: np.ndarray
+
+
+def _all_partitions_concatenated(ns):
+    """
+    Yield every partition of pooled indices into groups of the given sizes.
+
+    `ns` is a sequence of ints (it is indexed and sliced) holding the size
+    of each group. Each yielded integer array lists the indices
+    ``0 .. sum(ns) - 1`` group after group: the first ``ns[0]`` entries are
+    the indices assigned to the first group, the next ``ns[1]`` entries
+    those assigned to the second group, and so on.
+    """
+    def split_off(pool, size):
+        # Every way of choosing `size` members of `pool`, paired with
+        # whatever is left over.
+        for chosen in combinations(pool, size):
+            picked = set(chosen)
+            yield [picked, pool - picked]
+
+    def split_all(pool, sizes):
+        # Peel off one group at a time; once no sizes remain, what is left
+        # of `pool` becomes the final group.
+        if len(sizes) > 0:
+            for picked, leftover in split_off(pool, sizes[0]):
+                for tail in split_all(leftover, sizes[1:]):
+                    yield [picked] + tail
+        else:
+            yield [pool]
+
+    everything = set(range(np.sum(ns)))
+    for groups in split_all(everything, ns[:]):
+        # The cast restores an integer dtype: concatenating with an empty
+        # trailing group would otherwise produce floats.
+        yield np.concatenate([list(group) for group in groups]).astype(int)
+
+
+def _batch_generator(iterable, batch):
+    """Yield successive lists of up to `batch` items drawn from `iterable`.
+
+    The last list may be shorter; an empty list is never yielded. Because
+    this is a generator, the ``ValueError`` for a non-positive `batch` is
+    raised when iteration starts rather than at call time.
+    """
+    source = iter(iterable)
+    if batch <= 0:
+        raise ValueError("`batch` must be positive.")
+    while True:
+        # `zip` stops as soon as `range(batch)` is exhausted, so no item
+        # beyond the current chunk is consumed from `source`.
+        chunk = [element for _, element in zip(range(batch), source)]
+        if not chunk:
+            return  # exhausted: finish without yielding an empty list
+        yield chunk
+
+
+def _pairings_permutations_gen(n_permutations, n_samples, n_obs_sample, batch,
+                               rng):
+    """Return a generator of batches of random index permutations.
+
+    Each yielded array has shape ``(b, n_samples, n_obs_sample)``, where
+    ``b`` is at most ``min(batch, n_permutations)`` (the final batch may be
+    smaller). Every slice along the last axis is an independent permutation
+    of the indices ``0 .. n_obs_sample - 1``.
+    """
+    batch = min(batch, n_permutations)
+
+    def via_permuted():
+        # One full-size block of identity index rows, reused for each batch.
+        template = np.tile(np.arange(n_obs_sample), (batch, n_samples, 1))
+        for start in range(0, n_permutations, batch):
+            n_keep = min(batch, n_permutations - start)
+            # Don't permute in place, otherwise results depend on `batch`
+            shuffled = rng.permuted(template, axis=-1)
+            yield shuffled[:n_keep]
+
+    def via_argsort():
+        # Argsorting uniform random draws gives a random permutation per row.
+        for start in range(0, n_permutations, batch):
+            n_keep = min(batch, n_permutations - start)
+            draws = rng.random(size=(n_keep, n_samples, n_obs_sample))
+            yield np.argsort(draws, axis=-1)[:n_keep]
+
+    # RandomState and early Generators don't have `permuted`
+    if hasattr(rng, 'permuted'):
+        return via_permuted()
+    return via_argsort()
+
+
+def _calculate_null_both(data, statistic, n_permutations, batch,
+                         rng=None, *, xp):
+    """
+    Build the null distribution for independent sample tests.
+
+    The samples in `data` are pooled along their last axis, the pooled
+    observations are repeatedly re-assigned to samples of the original
+    sizes, and `statistic` is evaluated (with ``axis=-1``) on each batch of
+    re-assignments.
+
+    Returns ``(null_distribution, n_permutations, exact_test)``, where
+    `n_permutations` is capped at the number of distinct partitions and
+    `exact_test` tells whether every distinct partition was enumerated.
+    """
+    sizes = [sample.shape[-1] for sample in data]  # observations per sample
+    edges = list(accumulate(sizes, initial=0))  # cumulative split points
+    n_total = edges[-1]  # total number of observations
+    # Number of distinct partitions of the pooled data into these sizes.
+    n_max = math.prod([math.comb(upper, lower)
+                       for lower, upper in zip(edges[:-1], edges[1:])])
+
+    # `index_source` lazily produces one permutation of the indices
+    # 0 .. n_total - 1 at a time.
+    if n_permutations >= n_max:
+        exact_test = True
+        n_permutations = n_max
+        index_source = _all_partitions_concatenated(sizes)
+    else:
+        exact_test = False
+        # Neither RandomState.permutation nor Generator.permutation can
+        # permute axis-slices independently, so draw one permutation per
+        # resample; `_batch_generator` groups them afterwards.
+        index_source = (rng.permutation(n_total)
+                        for _ in range(n_permutations))
+
+    batch = batch or int(n_permutations)
+    chunks = []
+
+    pooled = xp.concat(data, axis=-1)
+    spans = list(zip(edges[:-1], edges[1:]))
+    for index_rows in _batch_generator(index_source, batch=batch):
+        # Creating a tensor from a list of numpy.ndarrays is extremely
+        # slow, so go through a single NumPy array first.
+        index_rows = xp.asarray(np.asarray(index_rows))
+
+        # `index_rows` is 2D (one permutation per row). Indexing the last
+        # axis of `pooled` with it leaves permutations on the second to
+        # last axis and observations on the last axis.
+        resampled = _get_from_last_axis(pooled, index_rows, xp=xp)
+
+        # Put the permutation axis first: batched statistic values are
+        # concatenated along axis 0 to form the null distribution.
+        resampled = xp.moveaxis(resampled, -2, 0)
+
+        # Split the permuted pool back into samples of the original sizes.
+        samples = [resampled[..., lower:upper] for lower, upper in spans]
+        chunks.append(statistic(*samples, axis=-1))
+    null_distribution = xp.concat(chunks, axis=0)
+
+    return null_distribution, n_permutations, exact_test
+
+
+def _calculate_null_pairings(data, statistic, n_permutations, batch,
+                             rng=None, *, xp):
+    """
+    Build the null distribution for association tests.
+
+    The observations of every sample in `data` are permuted independently
+    along the last axis, and `statistic` is evaluated (with ``axis=-1``) on
+    each batch of permuted samples.
+
+    Returns ``(null_distribution, n_permutations, exact_test)``, where
+    `n_permutations` is capped at ``factorial(n_obs)**n_samples`` and
+    `exact_test` tells whether every permutation was enumerated.
+    """
+    n_samples = len(data)
+    n_obs_sample = data[0].shape[-1]  # observations per sample; same for each
+    # factorial(n) orderings of each sample, chosen independently per sample
+    n_max = math.factorial(n_obs_sample)**n_samples
+
+    # `index_batches` yields, per batch, one index permutation per sample
+    # for each resample.
+    if n_permutations >= n_max:
+        exact_test, n_permutations = True, n_max
+        batch = batch or int(n_permutations)
+        # Cartesian product of the sets of all permutations of indices
+        per_sample = [permutations(range(n_obs_sample))
+                      for _ in range(n_samples)]
+        index_batches = _batch_generator(product(*per_sample), batch=batch)
+    else:
+        exact_test = False
+        batch = batch or int(n_permutations)
+        # Separate random permutations of indices for each sample.
+        index_batches = _pairings_permutations_gen(
+            n_permutations, n_samples, n_obs_sample, batch, rng)
+
+    chunks = []
+    for index_block in index_batches:
+        # `index_block` is 3D: (permutations, samples, observations). Swap
+        # the first two axes so axis 0 corresponds with samples, as it does
+        # for `data`.
+        index_block = xp_swapaxes(xp.asarray(index_block), 0, 1, xp=xp)
+
+        # One batch of permuted observations per sample, with permutations
+        # on axis 0 and observations on the last axis, ready to be passed
+        # to `statistic`.
+        permuted = []
+        for i, sample in enumerate(data):
+            shuffled = _get_from_last_axis(sample, index_block[i, ...], xp=xp)
+            permuted.append(xp.moveaxis(shuffled, -2, 0))
+
+        chunks.append(statistic(*permuted, axis=-1))
+    null_distribution = xp.concat(chunks, axis=0)
+
+    return null_distribution, n_permutations, exact_test
+
+
+def _calculate_null_samples(data, statistic, n_permutations, batch,
+                            rng=None, *, xp):
+    """
+    Build the null distribution for paired-sample tests.
+
+    The work is delegated to `_calculate_null_pairings` after swapping the
+    roles of samples and observations; its return value is passed through
+    unchanged.
+    """
+    n_samples = len(data)
+
+    # By convention, the "samples" permutation type applied to a single
+    # sample flips the sign of the observations. Model this with a second
+    # sample that is the negative of the first.
+    if n_samples == 1:
+        data = [data[0], -data[0]]
+
+    def swap_roles(arrays):
+        # Stack along a new leading axis, then exchange it with the last
+        # axis so that samples and observations trade places.
+        return xp_swapaxes(xp.stack(arrays, axis=0), 0, -1, xp=xp)
+
+    def unstack(array):
+        # Split an array into a list of slices along its leading axis.
+        return [array[i, ...] for i in range(array.shape[0])]
+
+    # The user's statistic doesn't know about the swap, so undo it before
+    # calling it.
+    def statistic_wrapped(*swapped, axis):
+        restored = swap_roles(swapped)
+        if n_samples == 1:
+            # Drop the artificial negated sample again.
+            restored = restored[0:1, ...]
+        return statistic(*unstack(restored), axis=axis)
+
+    # The "samples" strategy equals the "pairings" strategy with the roles
+    # of samples and observations flipped, so reuse that implementation.
+    swapped_data = unstack(swap_roles(data))
+    return _calculate_null_pairings(swapped_data, statistic_wrapped,
+                                    n_permutations, batch, rng, xp=xp)
+
+
+def _permutation_test_iv(data, statistic, permutation_type, vectorized,
+                         n_resamples, batch, alternative, axis, rng):
+    """Input validation for `permutation_test`.
+
+    Checks each argument, normalizes it, and returns the normalized values
+    together with the array namespace and the floating result dtype.
+
+    Returns
+    -------
+    tuple
+        ``(data, statistic, permutation_type, vectorized, n_resamples,
+        batch, alternative, axis, rng, float_dtype, xp)``. Each sample in
+        ``data`` is at least 1-D and has had `axis` moved to the last
+        position.
+
+    Raises
+    ------
+    ValueError
+        If `axis`, `n_resamples` or `batch` is not a valid integer, if
+        `permutation_type`, `vectorized` or `alternative` is not one of the
+        allowed values, if fewer than two samples are given for
+        ``permutation_type='independent'``, or if a sample has fewer than
+        two observations along `axis`.
+    TypeError
+        If `data` has no length, or if `statistic` is not vectorized while
+        the array library is not NumPy.
+    """
+    axis_int = int(axis)
+    if axis != axis_int:
+        raise ValueError("`axis` must be an integer.")
+
+    permutation_types = {'samples', 'pairings', 'independent'}
+    permutation_type = permutation_type.lower()
+    if permutation_type not in permutation_types:
+        raise ValueError(f"`permutation_type` must be in {permutation_types}.")
+
+    if vectorized not in {True, False, None}:
+        raise ValueError("`vectorized` must be `True`, `False`, or `None`.")
+
+    if vectorized is None:
+        # Treat the statistic as vectorized if it accepts an `axis` argument.
+        vectorized = 'axis' in inspect.signature(statistic).parameters
+
+    message = "`data` must be a tuple containing at least two samples"
+    try:
+        if len(data) < 2 and permutation_type == 'independent':
+            raise ValueError(message)
+    except TypeError as err:
+        # `data` has no `len`; keep the original error as the cause.
+        raise TypeError(message) from err
+
+    xp = array_namespace(*data)
+
+    if not vectorized:
+        if not is_numpy(xp):
+            message = (f"When using array library {xp.__name__}, `func` must be "
+                       "vectorized and accept argument `axis`.")
+            raise TypeError(message)
+
+        statistic = _vectorize_statistic(statistic)
+
+    # Use the validated integer axis from here on: an integer-valued float
+    # such as ``0.0`` passes the check above but cannot index a shape tuple.
+    data = _broadcast_arrays(data, axis_int, xp=xp)
+    data_iv = []
+    for sample in data:
+        sample = xpx.atleast_nd(sample, ndim=1)
+        if sample.shape[axis_int] <= 1:
+            raise ValueError("each sample in `data` must contain two or more "
+                             "observations along `axis`.")
+        sample = xp.moveaxis(sample, axis_int, -1)
+        data_iv.append(sample)
+
+    # An infinite `n_resamples` is allowed and is kept as infinity.
+    n_resamples_int = (int(n_resamples) if not math.isinf(n_resamples)
+                       else xp.inf)
+    if n_resamples != n_resamples_int or n_resamples_int <= 0:
+        raise ValueError("`n_resamples` must be a positive integer.")
+
+    if batch is None:
+        batch_iv = batch
+    else:
+        batch_iv = int(batch)
+        if batch != batch_iv or batch_iv <= 0:
+            raise ValueError("`batch` must be a positive integer or None.")
+
+    alternatives = {'two-sided', 'greater', 'less'}
+    alternative = alternative.lower()
+    if alternative not in alternatives:
+        raise ValueError(f"`alternative` must be in {alternatives}")
+
+    rng = check_random_state(rng)
+
+    float_dtype = xp_result_type(*data_iv, force_floating=True, xp=xp)
+
+    return (data_iv, statistic, permutation_type, vectorized, n_resamples_int,
+            batch_iv, alternative, axis_int, rng, float_dtype, xp)
+
+
+@xp_capabilities(skip_backends=[('dask.array', "lacks required indexing capabilities")])
+@_transition_to_rng('random_state')
+def permutation_test(data, statistic, *, permutation_type='independent',
+                     vectorized=None, n_resamples=9999, batch=None,
+                     alternative="two-sided", axis=0, rng=None):
+    r"""
+    Performs a permutation test of a given statistic on provided data.
+
+    For independent sample statistics, the null hypothesis is that the data are
+    randomly sampled from the same distribution.
+    For paired sample statistics, two null hypotheses can be tested:
+    that the data are paired at random or that the data are assigned to samples
+    at random.
+
+    Parameters
+    ----------
+    data : iterable of array-like
+        Contains the samples, each of which is an array of observations.
+        Dimensions of sample arrays must be compatible for broadcasting except
+        along `axis`.
+    statistic : callable
+        Statistic for which the p-value of the hypothesis test is to be
+        calculated. `statistic` must be a callable that accepts samples
+        as separate arguments (e.g. ``statistic(*data)``) and returns the
+        resulting statistic.
+        If `vectorized` is set ``True``, `statistic` must also accept a keyword
+        argument `axis` and be vectorized to compute the statistic along the
+        provided `axis` of the sample arrays.
+    permutation_type : {'independent', 'samples', 'pairings'}, optional
+        The type of permutations to be performed, in accordance with the
+        null hypothesis. The first two permutation types are for paired sample
+        statistics, in which all samples contain the same number of
+        observations and observations with corresponding indices along `axis`
+        are considered to be paired; the third is for independent sample
+        statistics.
+
+        - ``'samples'`` : observations are assigned to different samples
+          but remain paired with the same observations from other samples.
+          This permutation type is appropriate for paired sample hypothesis
+          tests such as the Wilcoxon signed-rank test and the paired t-test.
+        - ``'pairings'`` : observations are paired with different observations,
+          but they remain within the same sample. This permutation type is
+          appropriate for association/correlation tests with statistics such
+          as Spearman's :math:`\rho`, Kendall's :math:`\tau`, and Pearson's
+          :math:`r`.
+        - ``'independent'`` (default) : observations are assigned to different
+          samples. Samples may contain different numbers of observations. This
+          permutation type is appropriate for independent sample hypothesis
+          tests such as the Mann-Whitney :math:`U` test and the independent
+          sample t-test.
+
+          Please see the Notes section below for more detailed descriptions
+          of the permutation types.
+
+    vectorized : bool, optional
+        If `vectorized` is set ``False``, `statistic` will not be passed
+        keyword argument `axis` and is expected to calculate the statistic
+        only for 1D samples. If ``True``, `statistic` will be passed keyword
+        argument `axis` and is expected to calculate the statistic along `axis`
+        when passed an ND sample array. If ``None`` (default), `vectorized`
+        will be set ``True`` if ``axis`` is a parameter of `statistic`. Use
+        of a vectorized statistic typically reduces computation time.
+    n_resamples : int or np.inf, default: 9999
+        Number of random permutations (resamples) used to approximate the null
+        distribution. If greater than or equal to the number of distinct
+        permutations, the exact null distribution will be computed.
+        Note that the number of distinct permutations grows very rapidly with
+        the sizes of samples, so exact tests are feasible only for very small
+        data sets.
+    batch : int, optional
+        The number of permutations to process in each call to `statistic`.
+        Memory usage is O( `batch` * ``n`` ), where ``n`` is the total size
+        of all samples, regardless of the value of `vectorized`. Default is
+        ``None``, in which case ``batch`` is the number of permutations.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        The alternative hypothesis for which the p-value is calculated.
+        For each alternative, the p-value is defined for exact tests as
+        follows.
+
+        - ``'greater'`` : the percentage of the null distribution that is
+          greater than or equal to the observed value of the test statistic.
+        - ``'less'`` : the percentage of the null distribution that is
+          less than or equal to the observed value of the test statistic.
+        - ``'two-sided'`` (default) : twice the smaller of the p-values above.
+
+        Note that p-values for randomized tests are calculated according to the
+        conservative (over-estimated) approximation suggested in [2]_ and [3]_
+        rather than the unbiased estimator suggested in [4]_. That is, when
+        calculating the proportion of the randomized null distribution that is
+        as extreme as the observed value of the test statistic, the values in
+        the numerator and denominator are both increased by one. An
+        interpretation of this adjustment is that the observed value of the
+        test statistic is always included as an element of the randomized
+        null distribution.
+        The convention used for two-sided p-values is not universal;
+        the observed test statistic and null distribution are returned in
+        case a different definition is preferred.
+
+    axis : int, default: 0
+        The axis of the (broadcasted) samples over which to calculate the
+        statistic. If samples have a different number of dimensions,
+        singleton dimensions are prepended to samples with fewer dimensions
+        before `axis` is considered.
+    rng : `numpy.random.Generator`, optional
+        Pseudorandom number generator state. When `rng` is None, a new
+        `numpy.random.Generator` is created using entropy from the
+        operating system. Types other than `numpy.random.Generator` are
+        passed to `numpy.random.default_rng` to instantiate a ``Generator``.
+
+    Returns
+    -------
+    res : PermutationTestResult
+        An object with attributes:
+
+        statistic : float or ndarray
+            The observed test statistic of the data.
+        pvalue : float or ndarray
+            The p-value for the given alternative.
+        null_distribution : ndarray
+            The values of the test statistic generated under the null
+            hypothesis.
+
+    Notes
+    -----
+
+    The three types of permutation tests supported by this function are
+    described below.
+
+    **Unpaired statistics** (``permutation_type='independent'``):
+
+    The null hypothesis associated with this permutation type is that all
+    observations are sampled from the same underlying distribution and that
+    they have been assigned to one of the samples at random.
+
+    Suppose ``data`` contains two samples; e.g. ``a, b = data``.
+    When ``1 < n_resamples < binom(n, k)``, where
+
+    * ``k`` is the number of observations in ``a``,
+    * ``n`` is the total number of observations in ``a`` and ``b``, and
+    * ``binom(n, k)`` is the binomial coefficient (``n`` choose ``k``),
+
+    the data are pooled (concatenated), randomly assigned to either the first
+    or second sample, and the statistic is calculated. This process is
+    performed repeatedly, `n_resamples` times, generating a distribution of the
+    statistic under the null hypothesis. The statistic of the original
+    data is compared to this distribution to determine the p-value.
+
+    When ``n_resamples >= binom(n, k)``, an exact test is performed: the data
+    are *partitioned* between the samples in each distinct way exactly once,
+    and the exact null distribution is formed.
+    Note that for a given partitioning of the data between the samples,
+    only one ordering/permutation of the data *within* each sample is
+    considered. For statistics that do not depend on the order of the data
+    within samples, this dramatically reduces computational cost without
+    affecting the shape of the null distribution (because the frequency/count
+    of each value is affected by the same factor).
+
+    For ``a = [a1, a2, a3, a4]`` and ``b = [b1, b2, b3]``, an example of this
+    permutation type is ``x = [b3, a1, a2, b2]`` and ``y = [a4, b1, a3]``.
+    Because only one ordering/permutation of the data *within* each sample
+    is considered in an exact test, a resampling like ``x = [b3, a1, b2, a2]``
+    and ``y = [a4, a3, b1]`` would *not* be considered distinct from the
+    example above.
+
+    ``permutation_type='independent'`` does not support one-sample statistics,
+    but it can be applied to statistics with more than two samples. In this
+    case, if ``n`` is an array of the number of observations within each
+    sample, the number of distinct partitions is::
+
+        np.prod([binom(sum(n[i:]), sum(n[i+1:])) for i in range(len(n)-1)])
+
+    **Paired statistics, permute pairings** (``permutation_type='pairings'``):
+
+    The null hypothesis associated with this permutation type is that
+    observations within each sample are drawn from the same underlying
+    distribution and that pairings with elements of other samples are
+    assigned at random.
+
+    Suppose ``data`` contains only one sample; e.g. ``a, = data``, and we
+    wish to consider all possible pairings of elements of ``a`` with elements
+    of a second sample, ``b``. Let ``n`` be the number of observations in
+    ``a``, which must also equal the number of observations in ``b``.
+
+    When ``1 < n_resamples < factorial(n)``, the elements of ``a`` are
+    randomly permuted. The user-supplied statistic accepts one data argument,
+    say ``a_perm``, and calculates the statistic considering ``a_perm`` and
+    ``b``. This process is performed repeatedly, `n_resamples` times,
+    generating a distribution of the statistic under the null hypothesis.
+    The statistic of the original data is compared to this distribution to
+    determine the p-value.
+
+    When ``n_resamples >= factorial(n)``, an exact test is performed:
+    ``a`` is permuted in each distinct way exactly once. Therefore, the
+    `statistic` is computed for each unique pairing of samples between ``a``
+    and ``b`` exactly once.
+
+    For ``a = [a1, a2, a3]`` and ``b = [b1, b2, b3]``, an example of this
+    permutation type is ``a_perm = [a3, a1, a2]`` while ``b`` is left
+    in its original order.
+
+    ``permutation_type='pairings'`` supports ``data`` containing any number
+    of samples, each of which must contain the same number of observations.
+    All samples provided in ``data`` are permuted *independently*. Therefore,
+    if ``m`` is the number of samples and ``n`` is the number of observations
+    within each sample, then the number of permutations in an exact test is::
+
+        factorial(n)**m
+
+    Note that if a two-sample statistic, for example, does not inherently
+    depend on the order in which observations are provided - only on the
+    *pairings* of observations - then only one of the two samples should be
+    provided in ``data``. This dramatically reduces computational cost without
+    affecting the shape of the null distribution (because the frequency/count
+    of each value is affected by the same factor).
+
+    **Paired statistics, permute samples** (``permutation_type='samples'``):
+
+    The null hypothesis associated with this permutation type is that
+    observations within each pair are drawn from the same underlying
+    distribution and that the sample to which they are assigned is random.
+
+    Suppose ``data`` contains two samples; e.g. ``a, b = data``.
+    Let ``n`` be the number of observations in ``a``, which must also equal
+    the number of observations in ``b``.
+
+    When ``1 < n_resamples < 2**n``, the elements of ``a`` and ``b`` are
+    randomly swapped between samples (maintaining their pairings) and the
+    statistic is calculated. This process is performed repeatedly,
+    `n_resamples` times, generating a distribution of the statistic under the
+    null hypothesis. The statistic of the original data is compared to this
+    distribution to determine the p-value.
+
+    When ``n_resamples >= 2**n``, an exact test is performed: the observations
+    are assigned to the two samples in each distinct way (while maintaining
+    pairings) exactly once.
+
+    For ``a = [a1, a2, a3]`` and ``b = [b1, b2, b3]``, an example of this
+    permutation type is ``x = [b1, a2, b3]`` and ``y = [a1, b2, a3]``.
+
+    ``permutation_type='samples'`` supports ``data`` containing any number
+    of samples, each of which must contain the same number of observations.
+    If ``data`` contains more than one sample, paired observations within
+    ``data`` are exchanged between samples *independently*. Therefore, if ``m``
+    is the number of samples and ``n`` is the number of observations within
+    each sample, then the number of permutations in an exact test is::
+
+        factorial(m)**n
+
+    Several paired-sample statistical tests, such as the Wilcoxon signed rank
+    test and paired-sample t-test, can be performed considering only the
+    *difference* between two paired elements. Accordingly, if ``data`` contains
+    only one sample, then the null distribution is formed by independently
+    changing the *sign* of each observation.
+
+    .. warning::
+        The p-value is calculated by counting the elements of the null
+        distribution that are as extreme or more extreme than the observed
+        value of the statistic. Due to the use of finite precision arithmetic,
+        some statistic functions return numerically distinct values when the
+        theoretical values would be exactly equal. In some cases, this could
+        lead to a large error in the calculated p-value. `permutation_test`
+        guards against this by considering elements in the null distribution
+        that are "close" (within a relative tolerance of 100 times the
+        floating point epsilon of inexact dtypes) to the observed
+        value of the test statistic as equal to the observed value of the
+        test statistic. However, the user is advised to inspect the null
+        distribution to assess whether this method of comparison is
+        appropriate, and if not, calculate the p-value manually. See example
+        below.
+
+    References
+    ----------
+
+    .. [1] R. A. Fisher. The Design of Experiments, 6th Ed (1951).
+    .. [2] B. Phipson and G. K. Smyth. "Permutation P-values Should Never Be
+       Zero: Calculating Exact P-values When Permutations Are Randomly Drawn."
+       Statistical Applications in Genetics and Molecular Biology 9.1 (2010).
+    .. [3] M. D. Ernst. "Permutation Methods: A Basis for Exact Inference".
+       Statistical Science (2004).
+    .. [4] B. Efron and R. J. Tibshirani. An Introduction to the Bootstrap
+       (1993).
+
+    Examples
+    --------
+
+    Suppose we wish to test whether two samples are drawn from the same
+    distribution. Assume that the underlying distributions are unknown to us,
+    and that before observing the data, we hypothesized that the mean of the
+    first sample would be less than that of the second sample. We decide that
+    we will use the difference between the sample means as a test statistic,
+    and we will consider a p-value of 0.05 to be statistically significant.
+
+    For efficiency, we write the function defining the test statistic in a
+    vectorized fashion: the samples ``x`` and ``y`` can be ND arrays, and the
+    statistic will be calculated for each axis-slice along `axis`.
+
+    >>> import numpy as np
+    >>> def statistic(x, y, axis):
+    ...     return np.mean(x, axis=axis) - np.mean(y, axis=axis)
+
+    After collecting our data, we calculate the observed value of the test
+    statistic.
+
+    >>> from scipy.stats import norm
+    >>> rng = np.random.default_rng()
+    >>> x = norm.rvs(size=5, random_state=rng)
+    >>> y = norm.rvs(size=6, loc = 3, random_state=rng)
+    >>> statistic(x, y, 0)
+    -3.5411688580987266
+
+    Indeed, the test statistic is negative, suggesting that the true mean of
+    the distribution underlying ``x`` is less than that of the distribution
+    underlying ``y``. To determine the probability of this occurring by chance
+    if the two samples were drawn from the same distribution, we perform
+    a permutation test.
+
+    >>> from scipy.stats import permutation_test
+    >>> # because our statistic is vectorized, we pass `vectorized=True`
+    >>> # `n_resamples=np.inf` indicates that an exact test is to be performed
+    >>> res = permutation_test((x, y), statistic, vectorized=True,
+    ...                        n_resamples=np.inf, alternative='less')
+    >>> print(res.statistic)
+    -3.5411688580987266
+    >>> print(res.pvalue)
+    0.004329004329004329
+
+    The probability of obtaining a test statistic less than or equal to the
+    observed value under the null hypothesis is 0.4329%. This is less than our
+    chosen threshold of 5%, so we consider this to be significant evidence
+    against the null hypothesis in favor of the alternative.
+
+    Because the size of the samples above was small, `permutation_test` could
+    perform an exact test. For larger samples, we resort to a randomized
+    permutation test.
+
+    >>> x = norm.rvs(size=100, random_state=rng)
+    >>> y = norm.rvs(size=120, loc=0.2, random_state=rng)
+    >>> res = permutation_test((x, y), statistic, n_resamples=9999,
+    ...                        vectorized=True, alternative='less',
+    ...                        rng=rng)
+    >>> print(res.statistic)
+    -0.4230459671240913
+    >>> print(res.pvalue)
+    0.0015
+
+    The approximate probability of obtaining a test statistic less than or
+    equal to the observed value under the null hypothesis is 0.15%. This is
+    again less than our chosen threshold of 5%, so again we have significant
+    evidence to reject the null hypothesis in favor of the alternative.
+
+    For large samples and number of permutations, the result is comparable to
+    that of the corresponding asymptotic test, the independent sample t-test.
+
+    >>> from scipy.stats import ttest_ind
+    >>> res_asymptotic = ttest_ind(x, y, alternative='less')
+    >>> print(res_asymptotic.pvalue)
+    0.0014669545224902675
+
+    The permutation distribution of the test statistic is provided for
+    further investigation.
+
+    >>> import matplotlib.pyplot as plt
+    >>> plt.hist(res.null_distribution, bins=50)
+    >>> plt.title("Permutation distribution of test statistic")
+    >>> plt.xlabel("Value of Statistic")
+    >>> plt.ylabel("Frequency")
+    >>> plt.show()
+
+    Inspection of the null distribution is essential if the statistic suffers
+    from inaccuracy due to limited machine precision. Consider the following
+    case:
+
+    >>> from scipy.stats import pearsonr
+    >>> x = [1, 2, 4, 3]
+    >>> y = [2, 4, 6, 8]
+    >>> def statistic(x, y, axis=-1):
+    ...     return pearsonr(x, y, axis=axis).statistic
+    >>> res = permutation_test((x, y), statistic, vectorized=True,
+    ...                        permutation_type='pairings',
+    ...                        alternative='greater')
+    >>> r, pvalue, null = res.statistic, res.pvalue, res.null_distribution
+
+    In this case, some elements of the null distribution differ from the
+    observed value of the correlation coefficient ``r`` due to numerical noise.
+    We manually inspect the elements of the null distribution that are nearly
+    the same as the observed value of the test statistic.
+
+    >>> r
+    0.7999999999999999
+    >>> unique = np.unique(null)
+    >>> unique
+    array([-1. , -1. , -0.8, -0.8, -0.8, -0.6, -0.4, -0.4, -0.2, -0.2, -0.2,
+        0. ,  0.2,  0.2,  0.2,  0.4,  0.4,  0.6,  0.8,  0.8,  0.8,  1. ,
+        1. ])  # may vary
+    >>> unique[np.isclose(r, unique)].tolist()
+    [0.7999999999999998, 0.7999999999999999, 0.8]  # may vary
+
+    If `permutation_test` were to perform the comparison naively, the
+    elements of the null distribution with value ``0.7999999999999998`` would
+    not be considered as extreme as or more extreme than the observed value of
+    the statistic, so the calculated p-value would be too small.
+
+    >>> incorrect_pvalue = np.count_nonzero(null >= r) / len(null)
+    >>> incorrect_pvalue
+    0.14583333333333334  # may vary
+
+    Instead, `permutation_test` treats elements of the null distribution that
+    are within ``abs(r) * 100 * eps`` of the observed statistic ``r`` (``eps``
+    is the floating point epsilon of the dtype of ``r``) as equal to ``r``.
+
+    >>> correct_pvalue = np.count_nonzero(null >= r - 1e-14) / len(null)
+    >>> correct_pvalue
+    0.16666666666666666
+    >>> res.pvalue == correct_pvalue
+    True
+
+    This method of comparison is expected to be accurate in most practical
+    situations, but the user is advised to assess this by inspecting the
+    elements of the null distribution that are close to the observed value
+    of the statistic. Also, consider the use of statistics that can be
+    calculated using exact arithmetic (e.g. integer statistics).
+
+    """
+    args = _permutation_test_iv(data, statistic, permutation_type, vectorized,
+                                n_resamples, batch, alternative, axis,
+                                rng)
+    (data, statistic, permutation_type, vectorized, n_resamples, batch,
+     alternative, axis, rng, float_dtype, xp) = args
+
+    observed = statistic(*data, axis=-1)
+
+    null_calculators = {"pairings": _calculate_null_pairings,
+                        "samples": _calculate_null_samples,
+                        "independent": _calculate_null_both}
+    null_calculator_args = (data, statistic, n_resamples,
+                            batch, rng)
+    calculate_null = null_calculators[permutation_type]
+    null_distribution, n_resamples, exact_test = (
+        calculate_null(*null_calculator_args, xp=xp))
+
+    # See References [2] and [3]
+    adjustment = 0 if exact_test else 1
+
+    # relative tolerance for detecting numerically distinct but
+    # theoretically equal values in the null distribution
+    eps =  (0 if not xp.isdtype(observed.dtype, 'real floating')
+            else xp.finfo(observed.dtype).eps*100)
+    gamma = xp.abs(eps * observed)
+
+    def less(null_distribution, observed):
+        cmps = null_distribution <= observed + gamma
+        count = xp.count_nonzero(cmps, axis=0) + adjustment
+        pvalues = xp.astype(count, float_dtype) / (n_resamples + adjustment)
+        return pvalues
+
+    def greater(null_distribution, observed):
+        cmps = null_distribution >= observed - gamma
+        count = xp.count_nonzero(cmps, axis=0) + adjustment
+        pvalues = xp.astype(count, float_dtype) / (n_resamples + adjustment)
+        return pvalues
+
+    def two_sided(null_distribution, observed):
+        pvalues_less = less(null_distribution, observed)
+        pvalues_greater = greater(null_distribution, observed)
+        pvalues = xp.minimum(pvalues_less, pvalues_greater) * 2
+        return pvalues
+
+    compare = {"less": less,
+               "greater": greater,
+               "two-sided": two_sided}
+
+    pvalues = compare[alternative](null_distribution, observed)
+    pvalues = xp.clip(pvalues, 0., 1.)
+
+    return PermutationTestResult(observed, pvalues, null_distribution)
+
+
+@dataclass
+class ResamplingMethod:
+    """Configuration information for a statistical resampling method.
+
+    Instances of this class can be passed into the `method` parameter of some
+    hypothesis test functions to perform a resampling or Monte Carlo version
+    of the hypothesis test.
+
+    Attributes
+    ----------
+    n_resamples : int, optional
+        Number of resamples (or Monte Carlo samples) to draw. Default is 9999.
+    batch : int, optional
+        The number of resamples to process in each vectorized call to
+        the statistic. Batch sizes >>1 tend to be faster when the statistic
+        is vectorized, but memory usage scales linearly with the batch size.
+        Default is ``None``, which processes all resamples in a single batch.
+
+    """
+    n_resamples: int = 9999
+    batch: int = None  # type: ignore[assignment]
+
+
+@dataclass
+class MonteCarloMethod(ResamplingMethod):
+    """Settings for running a hypothesis test as a Monte Carlo test.
+
+    Pass an instance as the `method` argument of a hypothesis test function
+    that supports it to request the Monte Carlo version of that test in
+    place of the standard one.
+
+    Attributes
+    ----------
+    n_resamples : int, optional
+        How many Monte Carlo samples are drawn. Default is 9999.
+    batch : int, optional
+        How many Monte Carlo samples are handled per vectorized call to the
+        statistic. With a vectorized statistic, batch sizes >>1 are usually
+        faster, at the cost of memory usage that grows linearly with the
+        batch size. Default is ``None``: all samples form a single batch.
+    rvs : callable or tuple of callables, optional
+        A callable or sequence of callables that generates random variates
+        under the null hypothesis. Each element of `rvs` must be a callable
+        that accepts keyword argument ``size`` (e.g. ``rvs(size=(m, n))``) and
+        returns an N-d array sample of that shape. If `rvs` is a sequence, the
+        number of callables in `rvs` must match the number of samples passed
+        to the hypothesis test in which the `MonteCarloMethod` is used. Default
+        is ``None``, in which case the hypothesis test function chooses values
+        to match the standard version of the hypothesis test. For example,
+        the null hypothesis of `scipy.stats.pearsonr` is typically that the
+        samples are drawn from the standard normal distribution, so
+        ``rvs = (rng.normal, rng.normal)`` where
+        ``rng = np.random.default_rng()``.
+    rng : `numpy.random.Generator`, optional
+        Pseudorandom number generator state. When `rng` is None, a new
+        `numpy.random.Generator` is created using entropy from the
+        operating system. Types other than `numpy.random.Generator` are
+        passed to `numpy.random.default_rng` to instantiate a ``Generator``.
+
+    """
+    rvs: object = None
+    rng: object = None
+
+    def __init__(self, n_resamples=9999, batch=None, rvs=None, rng=None):
+        # at most one of `rvs` / `rng` may be supplied
+        if not (rvs is None or rng is None):
+            raise ValueError('Use of `rvs` and `rng` are mutually exclusive.')
+
+        self.n_resamples, self.batch = n_resamples, batch
+        self.rvs, self.rng = rvs, rng
+
+    def _asdict(self):
+        """Return the settings as a plain dict (values are not copied)."""
+        # Built by hand on purpose: `dataclasses.asdict` would deep-copy
+        # the attribute values, which is not wanted here.
+        keys = ('n_resamples', 'batch', 'rvs', 'rng')
+        return {key: getattr(self, key) for key in keys}
+
+
+_rs_deprecation = ("Use of attribute `random_state` is deprecated and replaced by "
+                   "`rng`. Support for `random_state` will be removed in SciPy 1.19.0. "
+                   "To silence this warning and ensure consistent behavior in SciPy "
+                   "1.19.0, control the RNG using attribute `rng`. Values set using "
+                   "attribute `rng` will be validated by `np.random.default_rng`, so "
+                   "the behavior corresponding with a given value may change compared "
+                   "to use of `random_state`. For example, 1) `None` will result in "
+                   "unpredictable random numbers, 2) an integer will result in a "
+                   "different stream of random numbers, (with the same distribution), "
+                   "and 3) `np.random` or `RandomState` instances will result in an "
+                   "error. See the documentation of `default_rng` for more "
+                   "information.")  # text for the pending `random_state` warnings
+
+
+@dataclass
+class PermutationMethod(ResamplingMethod):
+    """Configuration information for a permutation hypothesis test.
+
+    Instances of this class can be passed into the `method` parameter of some
+    hypothesis test functions to perform a permutation version of the
+    hypothesis tests.
+
+    Attributes
+    ----------
+    n_resamples : int, optional
+        The number of resamples to perform. Default is 9999.
+    batch : int, optional
+        The number of resamples to process in each vectorized call to
+        the statistic. Batch sizes >>1 tend to be faster when the statistic
+        is vectorized, but memory usage scales linearly with the batch size.
+        Default is ``None``, which processes all resamples in a single batch.
+    rng : `numpy.random.Generator`, optional
+        Pseudorandom number generator used to perform resampling.
+
+        If `rng` is passed by keyword to the initializer or the `rng` attribute is used
+        directly, types other than `numpy.random.Generator` are passed to
+        `numpy.random.default_rng` to instantiate a ``Generator`` before use.
+        If `rng` is already a ``Generator`` instance, then the provided instance is
+        used. Specify `rng` for repeatable behavior.
+
+        If this argument is passed by position, if `random_state` is passed by keyword
+        into the initializer, or if the `random_state` attribute is used directly,
+        legacy behavior for `random_state` applies:
+
+        - If `random_state` is None (or `numpy.random`), the `numpy.random.RandomState`
+          singleton is used.
+        - If `random_state` is an int, a new ``RandomState`` instance is used,
+          seeded with `random_state`.
+        - If `random_state` is already a ``Generator`` or ``RandomState`` instance then
+          that instance is used.
+
+        .. versionchanged:: 1.15.0
+
+            As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
+            transition from use of `numpy.random.RandomState` to
+            `numpy.random.Generator`, this attribute name was changed from
+            `random_state` to `rng`. For an interim period, both names will continue to
+            work, although only one may be specified at a time. After the interim
+            period, uses of `random_state` will emit warnings. The behavior of both
+            `random_state` and `rng` is outlined above, but only `rng` should be used
+            in new code.
+
+    """
+    rng: object  # type: ignore[misc]
+    _rng: object = field(init=False, repr=False, default=None)  # type: ignore[assignment]
+
+    @property
+    def random_state(self):
+        # Uncomment in SciPy 1.17.0
+        # warnings.warn(_rs_deprecation, DeprecationWarning, stacklevel=2)
+        return self._random_state
+
+    @random_state.setter
+    def random_state(self, val):
+        # Uncomment in SciPy 1.17.0
+        # warnings.warn(_rs_deprecation, DeprecationWarning, stacklevel=2)
+        self._random_state = val
+
+    @property  # type: ignore[no-redef]
+    def rng(self):  # noqa: F811
+        return self._rng
+
+    def __init__(self, n_resamples=9999, batch=None, random_state=None, *, rng=None):
+        # Uncomment in SciPy 1.17.0
+        # warnings.warn(_rs_deprecation.replace('attribute', 'argument'),
+        #               DeprecationWarning, stacklevel=2)
+        self._rng = rng  # don't validate with `default_rng`
+        self._random_state = random_state
+        super().__init__(n_resamples=n_resamples, batch=batch)
+
+    def _asdict(self):
+        # `dataclasses.asdict` deepcopies; we don't want that. None RNGs are omitted.
+        d = dict(n_resamples=self.n_resamples, batch=self.batch)
+        if self.rng is not None:
+            d['rng'] = self.rng
+        if self.random_state is not None:
+            d['random_state'] = self.random_state
+        return d
+
+
+@dataclass
+class BootstrapMethod(ResamplingMethod):
+    """Configuration information for a bootstrap confidence interval.
+
+    Instances of this class can be passed into the `method` parameter of some
+    confidence interval methods to generate a bootstrap confidence interval.
+
+    Attributes
+    ----------
+    n_resamples : int, optional
+        The number of resamples to perform. Default is 9999.
+    batch : int, optional
+        The number of resamples to process in each vectorized call to
+        the statistic. Batch sizes >>1 tend to be faster when the statistic
+        is vectorized, but memory usage scales linearly with the batch size.
+        Default is ``None``, which processes all resamples in a single batch.
+    rng : `numpy.random.Generator`, optional
+        Pseudorandom number generator used to perform resampling.
+
+        If `rng` is passed by keyword to the initializer or the `rng` attribute is used
+        directly, types other than `numpy.random.Generator` are passed to
+        `numpy.random.default_rng` to instantiate a ``Generator`` before use.
+        If `rng` is already a ``Generator`` instance, then the provided instance is
+        used. Specify `rng` for repeatable behavior.
+
+        If this argument is passed by position, if `random_state` is passed by keyword
+        into the initializer, or if the `random_state` attribute is used directly,
+        legacy behavior for `random_state` applies:
+
+        - If `random_state` is None (or `numpy.random`), the `numpy.random.RandomState`
+          singleton is used.
+        - If `random_state` is an int, a new ``RandomState`` instance is used,
+          seeded with `random_state`.
+        - If `random_state` is already a ``Generator`` or ``RandomState`` instance then
+          that instance is used.
+
+        .. versionchanged:: 1.15.0
+
+            As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
+            transition from use of `numpy.random.RandomState` to
+            `numpy.random.Generator`, this attribute name was changed from
+            `random_state` to `rng`. For an interim period, both names will continue to
+            work, although only one may be specified at a time. After the interim
+            period, uses of `random_state` will emit warnings. The behavior of both
+            `random_state` and `rng` is outlined above, but only `rng` should be used
+            in new code.
+
+    method : {'BCa', 'percentile', 'basic'}
+        Whether to use the 'percentile' bootstrap ('percentile'), the 'basic'
+        (AKA 'reverse') bootstrap ('basic'), or the bias-corrected and
+        accelerated bootstrap ('BCa', default).
+
+    """
+    rng: object  # type: ignore[misc]
+    _rng: object = field(init=False, repr=False, default=None)  # type: ignore[assignment]
+    method: str = 'BCa'
+
+    @property
+    def random_state(self):
+        # Uncomment in SciPy 1.17.0
+        # warnings.warn(_rs_deprecation, DeprecationWarning, stacklevel=2)
+        return self._random_state
+
+    @random_state.setter
+    def random_state(self, val):
+        # Uncomment in SciPy 1.17.0
+        # warnings.warn(_rs_deprecation, DeprecationWarning, stacklevel=2)
+        self._random_state = val
+
+    @property  # type: ignore[no-redef]
+    def rng(self):  # noqa: F811
+        return self._rng
+
+    def __init__(self, n_resamples=9999, batch=None, random_state=None,
+                 method='BCa', *, rng=None):
+        # Uncomment in SciPy 1.17.0
+        # warnings.warn(_rs_deprecation.replace('attribute', 'argument'),
+        #               DeprecationWarning, stacklevel=2)
+        self._rng = rng  # don't validate with `default_rng`
+        self._random_state = random_state
+        self.method = method
+        super().__init__(n_resamples=n_resamples, batch=batch)
+
+    def _asdict(self):
+        # `dataclasses.asdict` deepcopies; we don't want that. None RNGs are omitted.
+        d = dict(n_resamples=self.n_resamples, batch=self.batch,
+                 method=self.method)
+        if self.rng is not None:
+            d['rng'] = self.rng
+        if self.random_state is not None:
+            d['random_state'] = self.random_state
+        return d
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_result_classes.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_result_classes.py
new file mode 100644
index 0000000000000000000000000000000000000000..975af9310efb0c9a414439fd8d531fb95c988951
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_result_classes.py
@@ -0,0 +1,40 @@
+# This module exists only to allow Sphinx to generate docs
+# for the result objects returned by some functions in stats
+# _without_ adding them to the main stats documentation page.
+
+"""
+Result classes
+--------------
+
+.. currentmodule:: scipy.stats._result_classes
+
+.. autosummary::
+   :toctree: generated/
+
+   RelativeRiskResult
+   BinomTestResult
+   TukeyHSDResult
+   DunnettResult
+   PearsonRResult
+   FitResult
+   OddsRatioResult
+   TtestResult
+   ECDFResult
+   EmpiricalDistributionFunction
+
+"""
+
+__all__ = ['BinomTestResult', 'RelativeRiskResult', 'TukeyHSDResult',
+           'PearsonRResult', 'FitResult', 'OddsRatioResult',
+           'TtestResult', 'DunnettResult', 'ECDFResult',
+           'EmpiricalDistributionFunction']  # same names as the autosummary above
+
+
+from ._binomtest import BinomTestResult
+from ._odds_ratio import OddsRatioResult
+from ._relative_risk import RelativeRiskResult
+from ._hypotests import TukeyHSDResult
+from ._multicomp import DunnettResult
+from ._stats_py import PearsonRResult, TtestResult
+from ._fit import FitResult
+from ._survival import ECDFResult, EmpiricalDistributionFunction
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_sampling.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_sampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..e85078c97a876464855583f824dd92a852ae900f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_sampling.py
@@ -0,0 +1,1314 @@
+import math
+import numbers
+import numpy as np
+from scipy import stats
+from scipy import special as sc
+from ._qmc import (check_random_state as check_random_state_qmc,
+                   Halton, QMCEngine)
+from ._unuran.unuran_wrapper import NumericalInversePolynomial
+from scipy._lib._util import check_random_state
+
+
+__all__ = ['FastGeneratorInversion', 'RatioUniforms']
+
+
+# define pdfs and other helper functions to create the generators
+
+def argus_pdf(x, chi):
+    # Approach of Baumgarten/Hoermann, "Generating ARGUS random variates":
+    # when chi > 5, work with the related Gamma(1.5) density instead.
+    if not chi <= 5:
+        return math.sqrt(x) * math.exp(-x)
+    one_minus_x2 = 1 - x * x
+    return x * math.sqrt(one_minus_x2) * math.exp(-0.5 * chi**2 * one_minus_x2)
+
+
+def argus_gamma_trf(x, chi):
+    """Map a Gamma(1.5)-scale variate to the ARGUS scale (identity if chi <= 5)."""
+    small_chi = chi <= 5
+    return x if small_chi else np.sqrt(1.0 - 2 * x / chi**2)
+
+
+def argus_gamma_inv_trf(x, chi):
+    """Inverse of `argus_gamma_trf`: identity for chi <= 5, Gamma scale otherwise."""
+    small_chi = chi <= 5
+    return x if small_chi else 0.5 * chi**2 * (1 - x**2)
+
+
+def betaprime_pdf(x, a, b):
+    """Beta prime density at scalar `x`; any `x <= 0` gets the value at 0."""
+    if not x > 0:
+        # evaluated separately (no logarithms) to avoid runtime warnings
+        if a > 1:
+            return 0
+        if a < 1:
+            return np.inf
+        return 1 / sc.beta(a, b)
+    log_kernel = (a - 1) * math.log(x) - (a + b) * math.log1p(x)
+    log_density = log_kernel - sc.betaln(a, b)
+    return math.exp(log_density)
+
+
+def beta_valid_params(a, b):
+    return 0.1 <= min(a, b) and max(a, b) <= 700  # both shapes in [0.1, 700]
+
+
+def gamma_pdf(x, a):
+    if not x > 0:
+        return 0 if a >= 1 else np.inf  # value used at/below the left boundary
+    log_density = -math.lgamma(a) + (a - 1.0) * math.log(x) - x
+    return math.exp(log_density)
+
+
+def invgamma_pdf(x, a):
+    if not x > 0:
+        return 0 if a >= 1 else np.inf  # value used at/below the left boundary
+    log_density = -(a + 1.0) * math.log(x) - math.lgamma(a) - 1 / x
+    return math.exp(log_density)
+
+
+def burr_pdf(x, cc, dd):
+    # np.exp (not math.exp) is deliberate: math.exp can raise an overflow
+    # error during the setup, e.g. for parameters 1.89128135, 0.30195177
+    # (see test test_burr_overflow)
+    if not x > 0:
+        return 0
+    log_x = math.log(x)
+    log1p_term = math.log1p(np.exp(-cc * log_x))
+    return np.exp(-(cc + 1) * log_x - (dd + 1) * log1p_term)
+
+
+def burr12_pdf(x, cc, dd):
+    if not x > 0:
+        return 0
+    log_x = math.log(x)
+    log1p_xc = math.log1p(math.exp(cc * log_x))  # log(1 + x**cc)
+    log_density = (cc - 1) * log_x - (dd + 1) * log1p_xc + math.log(cc * dd)
+    return math.exp(log_density)
+
+
+def chi_pdf(x, a):
+    """Chi density at scalar `x` with `a` degrees of freedom."""
+    if not x > 0:
+        return 0 if a >= 1 else np.inf  # value used at/below the left boundary
+    # accumulate the log-density term by term, then exponentiate once
+    log_density = (a - 1) * math.log(x)
+    log_density -= 0.5 * (x * x)
+    log_density -= (a / 2 - 1) * math.log(2)
+    log_density -= math.lgamma(0.5 * a)
+    return math.exp(log_density)
+
+
+def chi2_pdf(x, df):
+    """Chi-square density at scalar `x` with `df` degrees of freedom."""
+    if not x > 0:
+        return 0 if df >= 1 else np.inf  # value used at/below the left boundary
+    # accumulate the log-density term by term, then exponentiate once
+    log_density = (df / 2 - 1) * math.log(x)
+    log_density -= 0.5 * x
+    log_density -= (df / 2) * math.log(2)
+    log_density -= math.lgamma(0.5 * df)
+    return math.exp(log_density)
+
+
+def alpha_pdf(x, a):
+    if not x > 0:
+        return 0.0
+    return math.exp(-2.0 * math.log(x) - 0.5 * (a - 1.0 / x) ** 2)
+
+
+def bradford_pdf(x, c):
+    """Unnormalized Bradford density ``1 / (1 + c*x)`` on [0, 1], else zero."""
+    in_support = 0 <= x <= 1
+    return 1.0 / (1.0 + c * x) if in_support else 0.0
+
+
+def crystalball_pdf(x, b, m):
+    if x > -b:  # Gaussian kernel here; the power-law tail is the final return
+        return math.exp(-0.5 * x * x)
+    return math.exp(m * math.log(m / b) - 0.5 * b * b - m * math.log(m / b - b - x))
+
+
+def weibull_min_pdf(x, c):
+    if not x > 0:
+        return 0.0
+    return c * math.exp((c - 1) * math.log(x) - x**c)
+
+
+def weibull_max_pdf(x, c):
+    if not x < 0:
+        return 0.0
+    return c * math.exp((c - 1) * math.log(-x) - ((-x) ** c))
+
+
+def invweibull_pdf(x, c):
+    if not x > 0:
+        return 0.0
+    return c * math.exp(-(c + 1) * math.log(x) - x ** (-c))
+
+
+def wald_pdf(x):
+    if not x > 0:
+        return 0.0
+    return math.exp(-((x - 1) ** 2) / (2 * x)) / math.sqrt(x**3)
+
+
+def geninvgauss_mode(p, b):
+    root, q = math.sqrt((1 - p) ** 2 + b**2), 1 - p
+    # two algebraically equivalent formulas; pick the numerically stabler one
+    return (root - q) / b if p > 1 else b / (root + q)
+
+
+def geninvgauss_pdf(x, p, b):
+    mode = geninvgauss_mode(p, b)
+    log_fm = (p - 1) * math.log(mode) - 0.5 * b * (mode + 1 / mode)
+    if not x > 0:
+        return 0.0
+    return math.exp((p - 1) * math.log(x) - 0.5 * b * (x + 1 / x) - log_fm)
+
+
+def invgauss_mode(mu):
+    return 1.0 / (1.5 + math.sqrt(2.25 + 1 / (mu * mu)))  # 2.25 == 1.5 * 1.5
+
+
+def invgauss_pdf(x, mu):
+    mode = invgauss_mode(mu)
+    log_fm = -1.5 * math.log(mode) - (mode - mu) ** 2 / (2 * mode * mu**2)
+    if not x > 0:
+        return 0.0
+    return math.exp(-1.5 * math.log(x) - (x - mu) ** 2 / (2 * x * mu**2) - log_fm)
+
+
+def powerlaw_pdf(x, a):
+    """Unnormalized power-law density ``x**(a - 1)``; zero unless ``x > 0``."""
+    positive = x > 0
+    return x ** (a - 1) if positive else 0.0
+
+
+# Define a dictionary: for a given distribution (keys), another dictionary
+# (values) specifies the parameters for NumericalInversePolynomial (PINV).
+# The keys of the latter dictionary are:
+# - pdf: the pdf of the distribution (callable). The signature of the pdf
+#   is float -> float (i.e., the function does not have to be vectorized).
+#   If possible, functions like log or exp from the module math should be
+#   preferred over functions from numpy since the PINV setup will be faster
+#   in that case.
+# - check_pinv_params: callable f that returns true if the shape parameters
+#   (args) are recommended parameters for PINV (i.e., the u-error does
+#   not exceed the default tolerance)
+# - center: scalar if the center does not depend on args, otherwise
+#   callable that returns the center as a function of the shape parameters
+# - rvs_transform: a callable that can be used to transform the rvs that
+#   are distributed according to the pdf to the target distribution
+#   (as an example, see the entry for the beta distribution)
+# - rvs_transform_inv: the inverse of rvs_transform (it is required
+#   for the transformed ppf)
+# - mirror_uniform: boolean or a callable that returns true or false
+#   depending on the shape parameters. If True, the ppf is applied
+#   to 1-u instead of u to generate rvs, where u is a uniform rv.
+#   While both u and 1-u are uniform, it can be required to use 1-u
+#   to compute the u-error correctly. This is only relevant for the argus
+#   distribution.
+# The only required keys are "pdf" and "check_pinv_params".
+# All other keys are optional.
+
+# Registry of the distributions supported by FastGeneratorInversion, keyed
+# by the name of the scipy.stats distribution. Each entry is looked up in
+# FastGeneratorInversion._process_config; the callables stored under
+# "check_pinv_params", "center" and "mirror_uniform" are called with the
+# shape parameters only, the "pdf" callables with x followed by the shape
+# parameters.
+PINV_CONFIG = {
+    "alpha": {
+        "pdf": alpha_pdf,
+        "check_pinv_params": lambda a: 1.0e-11 <= a < 2.1e5,
+        "center": lambda a: 0.25 * (math.sqrt(a * a + 8.0) - a),
+    },
+    "anglit": {
+        "pdf": lambda x: math.cos(2 * x) + 1.0e-13,
+        # +1.e-13 is necessary, otherwise PINV has strange problems as
+        # f(upper border) is very close to 0
+        "center": 0,
+    },
+    # argus is the only entry that defines "mirror_uniform"; it is sampled
+    # through the transformation pair argus_gamma_trf / argus_gamma_inv_trf
+    "argus": {
+        "pdf": argus_pdf,
+        "center": lambda chi: 0.7 if chi <= 5 else 0.5,
+        "check_pinv_params": lambda chi: 1e-20 < chi < 901,
+        "rvs_transform": argus_gamma_trf,
+        "rvs_transform_inv": argus_gamma_inv_trf,
+        "mirror_uniform": lambda chi: chi > 5,
+    },
+    # beta reuses the betaprime density: the variates x generated from
+    # betaprime_pdf are mapped to x / (1 + x), and the inverse map
+    # x / (1 - x) is used to transform the domain (1 is mapped to inf)
+    "beta": {
+        "pdf": betaprime_pdf,
+        "center": lambda a, b: max(0.1, (a - 1) / (b + 1)),
+        "check_pinv_params": beta_valid_params,
+        "rvs_transform": lambda x, *args: x / (1 + x),
+        "rvs_transform_inv": lambda x, *args: x / (1 - x) if x < 1 else np.inf,
+    },
+    "betaprime": {
+        "pdf": betaprime_pdf,
+        "center": lambda a, b: max(0.1, (a - 1) / (b + 1)),
+        "check_pinv_params": beta_valid_params,
+    },
+    "bradford": {
+        "pdf": bradford_pdf,
+        "check_pinv_params": lambda a: 1.0e-6 <= a <= 1e9,
+        "center": 0.5,
+    },
+    "burr": {
+        "pdf": burr_pdf,
+        "center": lambda a, b: (2 ** (1 / b) - 1) ** (-1 / a),
+        "check_pinv_params": lambda a, b: (min(a, b) >= 0.3) and (max(a, b) <= 50),
+    },
+    "burr12": {
+        "pdf": burr12_pdf,
+        "center": lambda a, b: (2 ** (1 / b) - 1) ** (1 / a),
+        "check_pinv_params": lambda a, b: (min(a, b) >= 0.2) and (max(a, b) <= 50),
+    },
+    "cauchy": {
+        "pdf": lambda x: 1 / (1 + (x * x)),
+        "center": 0,
+    },
+    "chi": {
+        "pdf": chi_pdf,
+        "check_pinv_params": lambda df: 0.05 <= df <= 1.0e6,
+        "center": lambda a: math.sqrt(a),
+    },
+    "chi2": {
+        "pdf": chi2_pdf,
+        "check_pinv_params": lambda df: 0.07 <= df <= 1e6,
+        "center": lambda a: a,
+    },
+    "cosine": {
+        "pdf": lambda x: 1 + math.cos(x),
+        "center": 0,
+    },
+    "crystalball": {
+        "pdf": crystalball_pdf,
+        "check_pinv_params": lambda b, m: (0.01 <= b <= 5.5)
+        and (1.1 <= m <= 75.1),
+        "center": 0.0,
+    },
+    "expon": {
+        "pdf": lambda x: math.exp(-x),
+        "center": 1.0,
+    },
+    "gamma": {
+        "pdf": gamma_pdf,
+        "check_pinv_params": lambda a: 0.04 <= a <= 1e6,
+        "center": lambda a: a,
+    },
+    "gennorm": {
+        "pdf": lambda x, b: math.exp(-abs(x) ** b),
+        "check_pinv_params": lambda b: 0.081 <= b <= 45.0,
+        "center": 0.0,
+    },
+    "geninvgauss": {
+        "pdf": geninvgauss_pdf,
+        "check_pinv_params": lambda p, b: (abs(p) <= 1200.0)
+        and (1.0e-10 <= b <= 1200.0),
+        "center": geninvgauss_mode,
+    },
+    "gumbel_l": {
+        "pdf": lambda x: math.exp(x - math.exp(x)),
+        "center": -0.6,
+    },
+    "gumbel_r": {
+        "pdf": lambda x: math.exp(-x - math.exp(-x)),
+        "center": 0.6,
+    },
+    "hypsecant": {
+        "pdf": lambda x: 1.0 / (math.exp(x) + math.exp(-x)),
+        "center": 0.0,
+    },
+    "invgamma": {
+        "pdf": invgamma_pdf,
+        "check_pinv_params": lambda a: 0.04 <= a <= 1e6,
+        "center": lambda a: 1 / a,
+    },
+    "invgauss": {
+        "pdf": invgauss_pdf,
+        "check_pinv_params": lambda mu: 1.0e-10 <= mu <= 1.0e9,
+        "center": invgauss_mode,
+    },
+    "invweibull": {
+        "pdf": invweibull_pdf,
+        "check_pinv_params": lambda a: 0.12 <= a <= 512,
+        "center": 1.0,
+    },
+    "laplace": {
+        "pdf": lambda x: math.exp(-abs(x)),
+        "center": 0.0,
+    },
+    "logistic": {
+        "pdf": lambda x: math.exp(-x) / (1 + math.exp(-x)) ** 2,
+        "center": 0.0,
+    },
+    "maxwell": {
+        "pdf": lambda x: x * x * math.exp(-0.5 * x * x),
+        "center": 1.41421,
+    },
+    "moyal": {
+        "pdf": lambda x: math.exp(-(x + math.exp(-x)) / 2),
+        "center": 1.2,
+    },
+    "norm": {
+        "pdf": lambda x: math.exp(-x * x / 2),
+        "center": 0.0,
+    },
+    "pareto": {
+        "pdf": lambda x, b: x ** -(b + 1),
+        "center": lambda b: b / (b - 1) if b > 2 else 1.5,
+        "check_pinv_params": lambda b: 0.08 <= b <= 400000,
+    },
+    "powerlaw": {
+        "pdf": powerlaw_pdf,
+        "center": 1.0,
+        "check_pinv_params": lambda a: 0.06 <= a <= 1.0e5,
+    },
+    "t": {
+        "pdf": lambda x, df: (1 + x * x / df) ** (-0.5 * (df + 1)),
+        "check_pinv_params": lambda a: 0.07 <= a <= 1e6,
+        "center": 0.0,
+    },
+    "rayleigh": {
+        "pdf": lambda x: x * math.exp(-0.5 * (x * x)),
+        "center": 1.0,
+    },
+    "semicircular": {
+        "pdf": lambda x: math.sqrt(1.0 - (x * x)),
+        "center": 0,
+    },
+    "wald": {
+        "pdf": wald_pdf,
+        "center": 1.0,
+    },
+    "weibull_max": {
+        "pdf": weibull_max_pdf,
+        "check_pinv_params": lambda a: 0.25 <= a <= 512,
+        "center": -1.0,
+    },
+    "weibull_min": {
+        "pdf": weibull_min_pdf,
+        "check_pinv_params": lambda a: 0.25 <= a <= 512,
+        "center": 1.0,
+    },
+}
+
+
+def _validate_qmc_input(qmc_engine, d, seed):
+    """Resolve the QMC engine and the dimension used for quasi-random draws.
+
+    Parameters
+    ----------
+    qmc_engine : QMCEngine or None
+        Engine supplied by the caller. If None, a `Halton` engine of
+        dimension `d` (1 if `d` is None) is created with ``seed=seed``.
+    d : int or None
+        Requested dimension. If an engine is supplied, `d` must either be
+        None or agree with the dimension of the engine.
+    seed : object
+        Forwarded as ``seed`` to `Halton` when no engine is supplied.
+
+    Returns
+    -------
+    qmc_engine, d
+        The engine to draw from and the resolved dimension.
+
+    Raises
+    ------
+    ValueError
+        If `qmc_engine` is neither None nor a `QMCEngine` instance, or if
+        `d` is inconsistent with the dimension of the supplied engine.
+    """
+    # Invalid values of `d` itself are left to QMCEngine to report.
+    if qmc_engine is None:
+        dim = 1 if d is None else d
+        return Halton(dim, seed=seed), dim
+
+    if not isinstance(qmc_engine, QMCEngine):
+        raise ValueError(
+            "`qmc_engine` must be an instance of "
+            "`scipy.stats.qmc.QMCEngine` or `None`."
+        )
+
+    if d is None:
+        return qmc_engine, qmc_engine.d
+    if qmc_engine.d != d:
+        raise ValueError(
+            "`d` must be consistent with dimension of `qmc_engine`."
+        )
+    return qmc_engine, d
+
+
+class CustomDistPINV:
+    """Adapter exposing ``pdf(x)`` for a density with fixed shape parameters.
+
+    Parameters
+    ----------
+    pdf : callable
+        Density with signature ``pdf(x, *args)``.
+    args : sequence
+        Shape parameters that are appended to every call of `pdf`.
+    """
+
+    def __init__(self, pdf, args):
+        def _pdf_with_shapes(x):
+            return pdf(x, *args)
+
+        self._pdf = _pdf_with_shapes
+
+    def pdf(self, x):
+        """Evaluate the wrapped density at `x`."""
+        return self._pdf(x)
+
+
+class FastGeneratorInversion:
+    """
+    Fast sampling by numerical inversion of the CDF for a large class of
+    continuous distributions in `scipy.stats`.
+
+    Parameters
+    ----------
+    dist : rv_frozen object
+        Frozen distribution object from `scipy.stats`. The list of supported
+        distributions can be found in the Notes section. The shape parameters,
+        `loc` and `scale` used to create the distributions must be scalars.
+        For example, for the Gamma distribution with shape parameter `p`,
+        `p` has to be a float, and for the beta distribution with shape
+        parameters (a, b), both a and b have to be floats.
+    domain : tuple of floats, optional
+        If one wishes to sample from a truncated/conditional distribution,
+        the domain has to be specified.
+        The default is None. In that case, the random variates are not
+        truncated, and the domain is inferred from the support of the
+        distribution.
+    ignore_shape_range : boolean, optional.
+        If False, shape parameters that are outside of the valid range
+        of values to ensure that the numerical accuracy (see Notes) is
+        high, raise a ValueError. If True, any shape parameters that are valid
+        for the distribution are accepted. This can be useful for testing.
+        The default is False.
+    random_state : {None, int, `numpy.random.Generator`,
+                        `numpy.random.RandomState`}, optional
+
+            A NumPy random number generator or seed for the underlying NumPy
+            random number generator used to generate the stream of uniform
+            random numbers.
+            If `random_state` is None, it uses ``self.random_state``.
+            If `random_state` is an int,
+            ``np.random.default_rng(random_state)`` is used.
+            If `random_state` is already a ``Generator`` or ``RandomState``
+            instance then that instance is used.
+
+    Attributes
+    ----------
+    loc : float
+        The location parameter.
+    random_state : {`numpy.random.Generator`, `numpy.random.RandomState`}
+        The random state used in relevant methods like `rvs` (unless
+        another `random_state` is passed as an argument to these methods).
+    scale : float
+        The scale parameter.
+
+    Methods
+    -------
+    cdf
+    evaluate_error
+    ppf
+    qrvs
+    rvs
+    support
+
+    Notes
+    -----
+    The class creates an object for continuous distributions specified
+    by `dist`. The method `rvs` uses a generator from
+    `scipy.stats.sampling` that is created when the object is instantiated.
+    In addition, the methods `qrvs` and `ppf` are added.
+    `qrvs` generates samples based on quasi-random numbers from
+    `scipy.stats.qmc`. `ppf` is the PPF based on the
+    numerical inversion method in [1]_ (`NumericalInversePolynomial`) that is
+    used to generate random variates.
+
+    Supported distributions (`distname`) are:
+    ``alpha``, ``anglit``, ``argus``, ``beta``, ``betaprime``, ``bradford``,
+    ``burr``, ``burr12``, ``cauchy``, ``chi``, ``chi2``, ``cosine``,
+    ``crystalball``, ``expon``, ``gamma``, ``gennorm``, ``geninvgauss``,
+    ``gumbel_l``, ``gumbel_r``, ``hypsecant``, ``invgamma``, ``invgauss``,
+    ``invweibull``, ``laplace``, ``logistic``, ``maxwell``, ``moyal``,
+    ``norm``, ``pareto``, ``powerlaw``, ``t``, ``rayleigh``, ``semicircular``,
+    ``wald``, ``weibull_max``, ``weibull_min``.
+
+    `rvs` relies on the accuracy of the numerical inversion. If very extreme
+    shape parameters are used, the numerical inversion might not work. However,
+    for all implemented distributions, the admissible shape parameters have
+    been tested, and an error will be raised if the user supplies values
+    outside of the allowed range. The u-error should not exceed 1e-10 for all
+    valid parameters. Note that warnings might be raised even if parameters
+    are within the valid range when the object is instantiated.
+    To check numerical accuracy, the method `evaluate_error` can be used.
+
+    Note that all implemented distributions are also part of `scipy.stats`, and
+    the object created by `FastGeneratorInversion` relies on methods like
+    `ppf`, `cdf` and `pdf` from `rv_frozen`. The main benefit of using this
+    class can be summarized as follows: Once the generator to sample random
+    variates is created in the setup step, sampling and evaluation of
+    the PPF using `ppf` are very fast,
+    and performance is essentially independent of the distribution. Therefore,
+    a substantial speed-up can be achieved for many distributions if large
+    numbers of random variates are required. It is important to know that this
+    fast sampling is achieved by inversion of the CDF. Thus, one uniform
+    random variate is transformed into a non-uniform variate, which is an
+    advantage for several simulation methods, e.g., when
+    the variance reduction methods of common random variates or
+    antithetic variates are used ([2]_).
+
+    In addition, inversion makes it possible
+    - to use a QMC generator from `scipy.stats.qmc` (method `qrvs`),
+    - to generate random variates truncated to an interval. For example, if
+    one aims to sample standard normal random variates from
+    the interval (2, 4), this can be easily achieved by using the parameter
+    `domain`.
+
+    The location and scale that are initially defined by `dist`
+    can be reset without having to rerun the setup
+    step to create the generator that is used for sampling. The relation
+    of the distribution `Y` with `loc` and `scale` to the standard
+    distribution `X` (i.e., ``loc=0`` and ``scale=1``) is given by
+    ``Y = loc + scale * X``.
+
+    References
+    ----------
+    .. [1] Derflinger, Gerhard, Wolfgang Hörmann, and Josef Leydold.
+           "Random variate  generation by numerical inversion when only the
+           density is known." ACM Transactions on Modeling and Computer
+           Simulation (TOMACS) 20.4 (2010): 1-25.
+    .. [2] Hörmann, Wolfgang, Josef Leydold and Gerhard Derflinger.
+           "Automatic nonuniform random number generation."
+           Springer, 2004.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> from scipy.stats.sampling import FastGeneratorInversion
+
+    Let's start with a simple example to illustrate the main features:
+
+    >>> gamma_frozen = stats.gamma(1.5)
+    >>> gamma_dist = FastGeneratorInversion(gamma_frozen)
+    >>> r = gamma_dist.rvs(size=1000)
+
+    The mean should be approximately equal to the shape parameter 1.5:
+
+    >>> r.mean()
+    1.52423591130436  # may vary
+
+    Similarly, we can draw a sample based on quasi-random numbers:
+
+    >>> r = gamma_dist.qrvs(size=1000)
+    >>> r.mean()
+    1.4996639255942914  # may vary
+
+    Compare the PPF against approximation `ppf`.
+
+    >>> q = [0.001, 0.2, 0.5, 0.8, 0.999]
+    >>> np.max(np.abs(gamma_frozen.ppf(q) - gamma_dist.ppf(q)))
+    4.313394796895409e-08
+
+    To confirm that the numerical inversion is accurate, we evaluate the
+    approximation error (u-error), which should be below 1e-10 (for more
+    details, refer to the documentation of `evaluate_error`):
+
+    >>> gamma_dist.evaluate_error()
+    (7.446320551265581e-11, nan)  # may vary
+
+    Note that the location and scale can be changed without instantiating a
+    new generator:
+
+    >>> gamma_dist.loc = 2
+    >>> gamma_dist.scale = 3
+    >>> r = gamma_dist.rvs(size=1000)
+
+    The mean should be approximately 2 + 3*1.5 = 6.5.
+
+    >>> r.mean()
+    6.399549295242894  # may vary
+
+    Let us also illustrate how truncation can be applied:
+
+    >>> trunc_norm = FastGeneratorInversion(stats.norm(), domain=(3, 4))
+    >>> r = trunc_norm.rvs(size=1000)
+    >>> 3 < r.min() < r.max() < 4
+    True
+
+    Check the mean:
+
+    >>> r.mean()
+    3.250433367078603  # may vary
+
+    >>> stats.norm.expect(lb=3, ub=4, conditional=True)
+    3.260454285589997
+
+    In this particular case, `scipy.stats.truncnorm` could also be used to
+    generate truncated normal random variates.
+
+    """
+
+    def __init__(
+        self,
+        dist,
+        *,
+        domain=None,
+        ignore_shape_range=False,
+        random_state=None,
+    ):
+        """Set up the numerical inversion generator for `dist`.
+
+        See the class docstring for a description of the parameters.
+
+        Raises
+        ------
+        ValueError
+            If `dist` is not a frozen distribution, if its name is not a key
+            of ``PINV_CONFIG``, if `loc`, `scale` or a shape parameter is not
+            a scalar, or if the shape parameters are rejected by the
+            configured range check (unless `ignore_shape_range` is True).
+        """
+        if not isinstance(dist, stats.distributions.rv_frozen):
+            raise ValueError("`dist` must be a frozen distribution object")
+
+        distname = dist.dist.name
+        if distname not in PINV_CONFIG:
+            raise ValueError(
+                f"Distribution '{distname}' is not supported. "
+                f"It must be one of {list(PINV_CONFIG.keys())}"
+            )
+
+        loc = dist.kwds.get("loc", 0)
+        scale = dist.kwds.get("scale", 1)
+        args = dist.args
+        if not np.isscalar(loc):
+            raise ValueError("loc must be scalar.")
+        if not np.isscalar(scale):
+            raise ValueError("scale must be scalar.")
+
+        self._frozendist = getattr(stats, distname)(
+            *args,
+            loc=loc,
+            scale=scale,
+        )
+        self._distname = distname
+
+        nargs = np.broadcast_arrays(args)[0].size
+        nargs_expected = self._frozendist.dist.numargs
+        if nargs != nargs_expected:
+            raise ValueError(
+                f"Each of the {nargs_expected} shape parameters must be a "
+                f"scalar, but {nargs} values are provided."
+            )
+
+        self.random_state = random_state
+
+        # `_domain` refers to the standard distribution (loc=0, scale=1):
+        # it is passed to NumericalInversePolynomial together with the
+        # standard density, and loc/scale are only applied afterwards (see
+        # `_set_domain_adj`, `rvs` and `ppf`). The support and the
+        # truncation probabilities must therefore be computed from the
+        # standard distribution and not from `_frozendist`, which carries
+        # loc and scale.
+        std_dist = getattr(stats, distname)(*args)
+        if domain is None:
+            self._domain = std_dist.support()
+            self._p_lower = 0.0
+            self._p_domain = 1.0
+        else:
+            self._domain = domain
+            self._p_lower = std_dist.cdf(self._domain[0])
+            self._p_domain = std_dist.cdf(self._domain[1]) - self._p_lower
+        self._set_domain_adj()
+        self._ignore_shape_range = ignore_shape_range
+
+        # the domain to be passed to NumericalInversePolynomial
+        # define a separate variable since in case of a transformation,
+        # domain_pinv will not be the same as self._domain
+        self._domain_pinv = self._domain
+
+        # get information about the distribution from the config to set up
+        # the generator
+        pinv_dist = self._process_config(distname, args)
+
+        if self._rvs_transform_inv is not None:
+            d0 = self._rvs_transform_inv(self._domain[0], *args)
+            d1 = self._rvs_transform_inv(self._domain[1], *args)
+            if d0 > d1:
+                # swap values if the transformation is decreasing
+                d0, d1 = d1, d0
+            # only update _domain_pinv and not _domain
+            # _domain refers to the original distribution, _domain_pinv
+            # to the transformed distribution
+            self._domain_pinv = d0, d1
+
+        # self._center has been set by the call self._process_config
+        # check if self._center is inside the transformed domain
+        # _domain_pinv, otherwise move it to the endpoint that is closer
+        if self._center is not None:
+            if self._center < self._domain_pinv[0]:
+                self._center = self._domain_pinv[0]
+            elif self._center > self._domain_pinv[1]:
+                self._center = self._domain_pinv[1]
+
+        self._rng = NumericalInversePolynomial(
+            pinv_dist,
+            random_state=self.random_state,
+            domain=self._domain_pinv,
+            center=self._center,
+        )
+
+    @property
+    def random_state(self):
+        """Random number generator used by `rvs` and passed as seed by `qrvs`."""
+        return self._random_state
+
+    @random_state.setter
+    def random_state(self, random_state):
+        # the value is converted by check_random_state_qmc before it is stored
+        self._random_state = check_random_state_qmc(random_state)
+
+    @property
+    def loc(self):
+        """Location parameter, read from the keywords of the frozen distribution."""
+        return self._frozendist.kwds.get("loc", 0)
+
+    @loc.setter
+    def loc(self, loc):
+        # raises ValueError if `loc` is not a scalar
+        if not np.isscalar(loc):
+            raise ValueError("loc must be scalar.")
+        # the value is stored in place in the keywords of the frozen
+        # distribution, so that its cdf/ppf use the new location
+        self._frozendist.kwds["loc"] = loc
+        # update the adjusted domain that depends on loc and scale
+        self._set_domain_adj()
+
+    @property
+    def scale(self):
+        """Scale parameter, read from the keywords of the frozen distribution.
+
+        Falls back to 1 (the neutral scale, as in ``__init__``) if no scale
+        is stored; a fallback of 0 would collapse all variates onto `loc`.
+        """
+        return self._frozendist.kwds.get("scale", 1)
+
+    @scale.setter
+    def scale(self, scale):
+        # raises ValueError if `scale` is not a scalar
+        if not np.isscalar(scale):
+            raise ValueError("scale must be scalar.")
+        self._frozendist.kwds["scale"] = scale
+        # update the adjusted domain that depends on loc and scale
+        self._set_domain_adj()
+
+    def _set_domain_adj(self):
+        """Recompute ``_domain_adj``, i.e. ``_domain`` mapped by loc and scale.
+
+        Both end points ``x`` of ``self._domain`` are mapped to
+        ``x * scale + loc`` and stored as a tuple.
+        """
+        shift = self.loc
+        stretch = self.scale
+        lower, upper = self._domain[0], self._domain[1]
+        self._domain_adj = (lower * stretch + shift, upper * stretch + shift)
+
+    def _process_config(self, distname, args):
+        """Read the ``PINV_CONFIG`` entry of `distname` and store its settings.
+
+        Sets ``_center``, ``_rvs_transform``, ``_rvs_transform_inv`` and
+        ``_mirror_uniform``. ``self._ignore_shape_range`` must be set before
+        this method is called.
+
+        Parameters
+        ----------
+        distname : str
+            Key of the distribution in ``PINV_CONFIG``.
+        args : tuple
+            Shape parameters of the distribution.
+
+        Returns
+        -------
+        CustomDistPINV
+            Object providing the configured density with `args` bound.
+
+        Raises
+        ------
+        ValueError
+            If the entry defines "check_pinv_params", the check rejects
+            `args`, and ``self._ignore_shape_range`` is False.
+        """
+        cfg = PINV_CONFIG[distname]
+
+        check_params = cfg.get("check_pinv_params")
+        if check_params is not None and not self._ignore_shape_range:
+            if not check_params(*args):
+                msg = ("No generator is defined for the shape parameters "
+                       f"{args}. Use ignore_shape_range to proceed "
+                       "with the selected values.")
+                raise ValueError(msg)
+
+        # "center" is either a fixed value or a function of the shape
+        # parameters; a missing key leaves the center unset (None)
+        center = cfg.get("center")
+        self._center = center(*args) if callable(center) else center
+
+        self._rvs_transform = cfg.get("rvs_transform", None)
+        self._rvs_transform_inv = cfg.get("rvs_transform_inv", None)
+
+        # "mirror_uniform" may be a plain boolean or a function of the
+        # shape parameters; only call it in the latter case
+        mirror = cfg.get("mirror_uniform")
+        if callable(mirror):
+            self._mirror_uniform = mirror(*args)
+        else:
+            self._mirror_uniform = bool(mirror)
+
+        return CustomDistPINV(cfg["pdf"], args)
+
+    def rvs(self, size=None):
+        """
+        Draw random variates by numerical inversion of the CDF.
+
+        Parameters
+        ----------
+        size : int or tuple, optional
+            Shape of the sample. With the default ``None``, a single
+            uniform random number is drawn and transformed.
+
+        Returns
+        -------
+        rvs : array_like
+            The random variates, i.e. ``loc + scale * r`` where ``r`` is the
+            (possibly transformed) output of the numerical inversion.
+
+        Notes
+        -----
+        Uniform random numbers drawn from ``self.random_state`` are mapped
+        by the approximate PPF of the `NumericalInversePolynomial` object
+        that was created when the class was instantiated. The stream of
+        random numbers is therefore different from the one generated by the
+        ``rvs`` method of the corresponding `scipy.stats` distribution,
+        even if the same seed is used.
+        """
+        # The ppf of the generator is applied explicitly instead of calling
+        # self._rng.rvs, because the uniform numbers may have to be
+        # mirrored first.
+        uniforms = self.random_state.uniform(size=size)
+        probs = 1 - uniforms if self._mirror_uniform else uniforms
+        variates = self._rng.ppf(probs)
+        transform = self._rvs_transform
+        if transform is not None:
+            variates = transform(variates, *self._frozendist.args)
+        return self.loc + self.scale * variates
+
+    def ppf(self, q):
+        """
+        Fast approximation of the PPF (inverse CDF) of the distribution.
+
+        Parameters
+        ----------
+        q : array_like
+            Array with probabilities.
+
+        Returns
+        -------
+        ppf : array_like
+            Quantiles corresponding to the values in `q`.
+
+        Notes
+        -----
+        The evaluation of the PPF is very fast but it may have a large
+        relative error in the far tails. The numerical precision of the PPF
+        is controlled by the u-error, that is,
+        ``max |u - CDF(PPF(u))|`` where the max is taken over points in
+        the interval [0,1], see `evaluate_error`.
+
+        Note that this PPF is designed to generate random samples.
+        """
+        probs = np.asarray(q)
+        if self._mirror_uniform:
+            probs = 1 - probs
+        quantiles = self._rng.ppf(probs)
+        transform = self._rvs_transform
+        if transform is not None:
+            quantiles = transform(quantiles, *self._frozendist.args)
+        return self.scale * quantiles + self.loc
+
+    def qrvs(self, size=None, d=None, qmc_engine=None):
+        """
+        Quasi-random variates of the given distribution.
+
+        The `qmc_engine` is used to draw uniform quasi-random variates, and
+        these are converted to quasi-random variates of the given distribution
+        using inverse transform sampling.
+
+        Parameters
+        ----------
+        size : int, tuple of ints, or None; optional
+            Defines shape of random variates array. Default is ``None``.
+        d : int or None, optional
+            Defines dimension of uniform quasi-random variates to be
+            transformed. Default is ``None``.
+        qmc_engine : scipy.stats.qmc.QMCEngine(d=1), optional
+            Defines the object to use for drawing
+            quasi-random variates. Default is ``None``, which uses
+            `scipy.stats.qmc.Halton(1)`.
+
+        Returns
+        -------
+        rvs : ndarray or scalar
+            Quasi-random variates. See Notes for shape information.
+
+        Raises
+        ------
+        ValueError
+            If `qmc_engine` is neither ``None`` nor a
+            `scipy.stats.qmc.QMCEngine`, or if `d` is inconsistent with the
+            dimension of `qmc_engine`.
+
+        Notes
+        -----
+        The shape of the output array depends on `size`, `d`, and `qmc_engine`.
+        The intent is for the interface to be natural, but the detailed rules
+        to achieve this are complicated.
+
+        - If `qmc_engine` is ``None``, a `scipy.stats.qmc.Halton` instance is
+          created with dimension `d`. If `d` is not provided, ``d=1``.
+        - If `qmc_engine` is not ``None`` and `d` is ``None``, `d` is
+          determined from the dimension of the `qmc_engine`.
+        - If `qmc_engine` is not ``None`` and `d` is not ``None`` but the
+          dimensions are inconsistent, a ``ValueError`` is raised.
+        - After `d` is determined according to the rules above, the output
+          shape is ``tuple_shape + d_shape``, where:
+
+              - ``tuple_shape = tuple()`` if `size` is ``None``,
+              - ``tuple_shape = (size,)`` if `size` is an ``int``,
+              - ``tuple_shape = size`` if `size` is a sequence,
+              - ``d_shape = tuple()`` if `d` is ``None`` or `d` is 1, and
+              - ``d_shape = (d,)`` if `d` is greater than 1.
+
+        The elements of the returned array are part of a low-discrepancy
+        sequence. If `d` is 1, this means that none of the samples are truly
+        independent. If `d` > 1, each slice ``rvs[..., i]`` will be of a
+        quasi-independent sequence; see `scipy.stats.qmc.QMCEngine` for
+        details. Note that when `d` > 1, the samples returned are still those
+        of the provided univariate distribution, not a multivariate
+        generalization of that distribution.
+
+        """
+        qmc_engine, d = _validate_qmc_input(qmc_engine, d, self.random_state)
+        # mainly copied from unuran_wrapper.pyx.templ
+        # `rvs` is flexible about whether `size` is an int or tuple, so this
+        # should be, too.
+        try:
+            if size is None:
+                tuple_size = (1,)
+            else:
+                tuple_size = tuple(size)
+        except TypeError:
+            tuple_size = (size,)
+        # we do not use rng.qrvs directly since we need to be
+        # able to apply the ppf to 1 - u
+        N = 1 if size is None else np.prod(size)
+        u = qmc_engine.random(N)
+        if self._mirror_uniform:
+            u = 1 - u
+        # Use the numerical inversion of the standard distribution, exactly
+        # as in `rvs` and `ppf`. `self._ppf` must not be used here: it is
+        # the exact PPF of the frozen distribution, which already includes
+        # loc and scale and already refers to the target distribution, so
+        # applying `_rvs_transform` and loc/scale below would distort the
+        # sample.
+        qrvs = np.asarray(self._rng.ppf(u))
+        if self._rvs_transform is not None:
+            qrvs = self._rvs_transform(qrvs, *self._frozendist.args)
+        if size is None:
+            qrvs = qrvs.squeeze()[()]
+        else:
+            if d == 1:
+                qrvs = qrvs.reshape(tuple_size)
+            else:
+                qrvs = qrvs.reshape(tuple_size + (d,))
+        return self.loc + self.scale * qrvs
+
+    def evaluate_error(self, size=100000, random_state=None, x_error=False):
+        """
+        Evaluate the numerical accuracy of the inversion (u- and x-error).
+
+        Parameters
+        ----------
+        size : int, optional
+            The number of random points over which the error is estimated.
+            Default is ``100000``.
+        random_state : {None, int, `numpy.random.Generator`,
+                        `numpy.random.RandomState`}, optional
+
+            A NumPy random number generator or seed for the underlying NumPy
+            random number generator used to generate the stream of uniform
+            random numbers over which the error is estimated.
+            The value is converted by ``check_random_state_qmc``; the
+            generator stored in ``self.random_state`` is not used, so that
+            evaluating the error does not interfere with sampling.
+        x_error : bool, optional
+            If True, the x-error is computed in addition to the u-error.
+            This requires the exact PPF of the distribution and can be slow.
+            Default is ``False``.
+
+        Returns
+        -------
+        u_error, x_error : tuple of floats
+            The estimated u-error and x-error. The x-error is ``np.nan``
+            unless the argument `x_error` is True.
+
+        Raises
+        ------
+        ValueError
+            If `size` is not an integer.
+
+        Notes
+        -----
+        The numerical precision of the inverse CDF `ppf` is controlled by
+        the u-error. It is computed as follows:
+        ``max |u - CDF(PPF(u))|`` where the max is taken over `size` random
+        points in the interval [0,1]. `random_state` determines the random
+        sample. Note that if `ppf` was exact, the u-error would be zero.
+
+        The x-error measures the direct distance between the exact PPF
+        and `ppf`. If ``x_error`` is set to ``True``, it is
+        computed as the maximum of the minimum of the relative and absolute
+        x-error:
+        ``max(min(x_error_abs[i], x_error_rel[i]))`` where
+        ``x_error_abs[i] = |PPF(u[i]) - PPF_fast(u[i])|``,
+        ``x_error_rel[i] = max |(PPF(u[i]) - PPF_fast(u[i])) / PPF(u[i])|``.
+        Note that it is important to consider the relative x-error in the case
+        that ``PPF(u)`` is close to zero or very large.
+
+        By default, only the u-error is evaluated and the x-error is set to
+        ``np.nan``. Note that the evaluation of the x-error will be very slow
+        if the implementation of the PPF is slow.
+
+        Further information about these error measures can be found in [1]_.
+
+        References
+        ----------
+        .. [1] Derflinger, Gerhard, Wolfgang Hörmann, and Josef Leydold.
+               "Random variate  generation by numerical inversion when only the
+               density is known." ACM Transactions on Modeling and Computer
+               Simulation (TOMACS) 20.4 (2010): 1-25.
+
+        Examples
+        --------
+
+        >>> import numpy as np
+        >>> from scipy import stats
+        >>> from scipy.stats.sampling import FastGeneratorInversion
+
+        Create an object for the normal distribution:
+
+        >>> d_norm_frozen = stats.norm()
+        >>> d_norm = FastGeneratorInversion(d_norm_frozen)
+
+        To confirm that the numerical inversion is accurate, we evaluate the
+        approximation error (u-error and x-error).
+
+        >>> u_error, x_error = d_norm.evaluate_error(x_error=True)
+
+        The u-error should be below 1e-10:
+
+        >>> u_error
+        8.785783212061915e-11  # may vary
+
+        Compare the PPF against approximation `ppf`:
+
+        >>> q = [0.001, 0.2, 0.4, 0.6, 0.8, 0.999]
+        >>> diff = np.abs(d_norm_frozen.ppf(q) - d_norm.ppf(q))
+        >>> x_error_abs = np.max(diff)
+        >>> x_error_abs
+        1.2937954707581412e-08
+
+        This is the absolute x-error evaluated at the points q. The relative
+        error is given by
+
+        >>> x_error_rel = np.max(diff / np.abs(d_norm_frozen.ppf(q)))
+        >>> x_error_rel
+        4.186725600453555e-09
+
+        The x_error computed above is derived in a very similar way over a
+        much larger set of random values q. At each value q[i], the minimum
+        of the relative and absolute error is taken. The final value is then
+        derived as the maximum of these values. In our example, we get the
+        following value:
+
+        >>> x_error
+        4.507068014335139e-07  # may vary
+
+        """
+        if not isinstance(size, numbers.Integral | np.integer):
+            raise ValueError("size must be an integer.")
+        # urng will be used to draw the samples for testing the error
+        # it must not interfere with self.random_state. therefore, do not
+        # call self.rvs, but draw uniform random numbers and apply
+        # self.ppf (note: like in rvs, consider self._mirror_uniform)
+        urng = check_random_state_qmc(random_state)
+        u = urng.uniform(size=size)
+        if self._mirror_uniform:
+            u = 1 - u
+        x = self.ppf(u)
+        uerr = np.max(np.abs(self._cdf(x) - u))
+        if not x_error:
+            return uerr, np.nan
+        # exact PPF of the distribution at the same points; `x` already
+        # holds the approximate values, so `self.ppf` is not evaluated again
+        ppf_u = self._ppf(u)
+        x_error_abs = np.abs(x - ppf_u)
+        x_error_rel = x_error_abs / np.abs(ppf_u)
+        x_error_combined = np.minimum(x_error_abs, x_error_rel)
+        return uerr, np.max(x_error_combined)
+
+    def support(self):
+        """Support of the distribution.
+
+        Returns
+        -------
+        a, b : float
+            end-points of the distribution's support, i.e. the end-points
+            of the domain after applying `scale` and `loc`.
+
+        Notes
+        -----
+
+        Note that the support of the distribution depends on `loc`,
+        `scale` and `domain`.
+
+        Examples
+        --------
+
+        >>> from scipy import stats
+        >>> from scipy.stats.sampling import FastGeneratorInversion
+
+        Define a truncated normal distribution:
+
+        >>> d_norm = FastGeneratorInversion(stats.norm(), domain=(0, 1))
+        >>> d_norm.support()
+        (0, 1)
+
+        Shift the distribution:
+
+        >>> d_norm.loc = 2.5
+        >>> d_norm.support()
+        (2.5, 3.5)
+
+        """
+        # _domain_adj is kept up to date by _set_domain_adj, which is called
+        # whenever loc or scale is set
+        return self._domain_adj
+
+    def _cdf(self, x):
+        """CDF of the (possibly truncated) distribution.
+
+        Parameters
+        ----------
+        x : array_like
+            Points at which the CDF is evaluated.
+
+        Returns
+        -------
+        y : ndarray
+            CDF of the frozen distribution at `x`. If the distribution is
+            truncated (``_p_domain != 1``), the values are rescaled by the
+            probability mass of the domain and clipped to [0, 1].
+
+        """
+        cdf_values = self._frozendist.cdf(x)
+        if self._p_domain != 1.0:
+            rescaled = (cdf_values - self._p_lower) / self._p_domain
+            cdf_values = np.clip(rescaled, 0, 1)
+        return cdf_values
+
+    def _ppf(self, q):
+        """Exact PPF of the (possibly truncated) frozen distribution.
+
+        Parameters
+        ----------
+        q : array_like
+            Lower tail probabilities.
+
+        Returns
+        -------
+        x : array_like
+            Quantiles of the frozen distribution corresponding to `q`. If
+            the distribution is truncated (``_p_domain != 1``), `q` is first
+            mapped to the probability range of the domain and the result is
+            clipped to ``_domain_adj``.
+
+        """
+        truncated = self._p_domain != 1.0
+        if not truncated:
+            return self._frozendist.ppf(q)
+        shifted_q = self._p_domain * np.array(q) + self._p_lower
+        lower, upper = self._domain_adj[0], self._domain_adj[1]
+        return np.clip(self._frozendist.ppf(shifted_q), lower, upper)
+
+
+class RatioUniforms:
+    """
+    Generate random samples from a probability density function using the
+    ratio-of-uniforms method.
+
+    Parameters
+    ----------
+    pdf : callable
+        A function with signature `pdf(x)` that is proportional to the
+        probability density function of the distribution.
+    umax : float
+        The upper bound of the bounding rectangle in the u-direction.
+    vmin : float
+        The lower bound of the bounding rectangle in the v-direction.
+    vmax : float
+        The upper bound of the bounding rectangle in the v-direction.
+    c : float, optional.
+        Shift parameter of ratio-of-uniforms method, see Notes. Default is 0.
+    random_state : {None, int, `numpy.random.Generator`,
+                    `numpy.random.RandomState`}, optional
+
+        If `random_state` is None (or `np.random`), the
+        `numpy.random.RandomState` singleton is used.
+        If `random_state` is an int, a new ``RandomState`` instance is used,
+        seeded with `random_state`.
+        If `random_state` is already a ``Generator`` or ``RandomState``
+        instance then that instance is used.
+
+    Methods
+    -------
+    rvs
+
+    Notes
+    -----
+    Given a univariate probability density function `pdf` and a constant `c`,
+    define the set ``A = {(u, v) : 0 < u <= sqrt(pdf(v/u + c))}``.
+    If ``(U, V)`` is a random vector uniformly distributed over ``A``,
+    then ``V/U + c`` follows a distribution according to `pdf`.
+
+    The above result (see [1]_, [2]_) can be used to sample random variables
+    using only the PDF, i.e. no inversion of the CDF is required. Typical
+    choices of `c` are zero or the mode of `pdf`. The set ``A`` is a subset of
+    the rectangle ``R = [0, umax] x [vmin, vmax]`` where
+
+    - ``umax = sup sqrt(pdf(x))``
+    - ``vmin = inf (x - c) sqrt(pdf(x))``
+    - ``vmax = sup (x - c) sqrt(pdf(x))``
+
+    In particular, these values are finite if `pdf` is bounded and
+    ``x**2 * pdf(x)`` is bounded (i.e. subquadratic tails).
+    One can generate ``(U, V)`` uniformly on ``R`` and return
+    ``V/U + c`` if ``(U, V)`` are also in ``A`` which can be directly
+    verified.
+
+    The algorithm is not changed if one replaces `pdf` by k * `pdf` for any
+    constant k > 0. Thus, it is often convenient to work with a function
+    that is proportional to the probability density function by dropping
+    unnecessary normalization factors.
+
+    Intuitively, the method works well if ``A`` fills up most of the
+    enclosing rectangle such that the probability is high that ``(U, V)``
+    lies in ``A`` whenever it lies in ``R`` as the number of required
+    iterations becomes too large otherwise. To be more precise, note that
+    the expected number of iterations to draw ``(U, V)`` uniformly
+    distributed on ``R`` such that ``(U, V)`` is also in ``A`` is given by
+    the ratio ``area(R) / area(A) = 2 * umax * (vmax - vmin) / area(pdf)``,
+    where `area(pdf)` is the integral of `pdf` (which is equal to one if the
+    probability density function is used but can take on other values if a
+    function proportional to the density is used). The equality holds since
+    the area of ``A`` is equal to ``0.5 * area(pdf)`` (Theorem 7.1 in [1]_).
+    If the sampling fails to generate a single random variate after 50000
+    iterations (i.e. not a single draw is in ``A``), an exception is raised.
+
+    If the bounding rectangle is not correctly specified (i.e. if it does not
+    contain ``A``), the algorithm samples from a distribution different from
+    the one given by `pdf`. It is therefore recommended to perform a
+    test such as `~scipy.stats.kstest` as a check.
+
+    References
+    ----------
+    .. [1] L. Devroye, "Non-Uniform Random Variate Generation",
+       Springer-Verlag, 1986.
+
+    .. [2] W. Hoermann and J. Leydold, "Generating generalized inverse Gaussian
+       random variates", Statistics and Computing, 24(4), p. 547--557, 2014.
+
+    .. [3] A.J. Kinderman and J.F. Monahan, "Computer Generation of Random
+       Variables Using the Ratio of Uniform Deviates",
+       ACM Transactions on Mathematical Software, 3(3), p. 257--260, 1977.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+
+    >>> from scipy.stats.sampling import RatioUniforms
+    >>> rng = np.random.default_rng()
+
+    Simulate normally distributed random variables. It is easy to compute the
+    bounding rectangle explicitly in that case. For simplicity, we drop the
+    normalization factor of the density.
+
+    >>> f = lambda x: np.exp(-x**2 / 2)
+    >>> v = np.sqrt(f(np.sqrt(2))) * np.sqrt(2)
+    >>> umax = np.sqrt(f(0))
+    >>> gen = RatioUniforms(f, umax=umax, vmin=-v, vmax=v, random_state=rng)
+    >>> r = gen.rvs(size=2500)
+
+    The K-S test confirms that the random variates are indeed normally
+    distributed (normality is not rejected at 5% significance level):
+
+    >>> stats.kstest(r, 'norm')[1]
+    0.250634764150542
+
+    The exponential distribution provides another example where the bounding
+    rectangle can be determined explicitly.
+
+    >>> gen = RatioUniforms(lambda x: np.exp(-x), umax=1, vmin=0,
+    ...                     vmax=2*np.exp(-1), random_state=rng)
+    >>> r = gen.rvs(1000)
+    >>> stats.kstest(r, 'expon')[1]
+    0.21121052054580314
+
+    """
+
+    def __init__(self, pdf, *, umax, vmin, vmax, c=0, random_state=None):
+        if vmax <= vmin:
+            raise ValueError("vmin must be smaller than vmax.")
+        if umax <= 0:
+            raise ValueError("umax must be positive.")
+
+        self._pdf, self._c = pdf, c
+        self._umax, self._vmin, self._vmax = umax, vmin, vmax
+        self._rng = check_random_state(random_state)
+
+    def rvs(self, size=1):
+        """Draw random variates with the ratio-of-uniforms method.
+
+        Parameters
+        ----------
+        size : int or tuple of ints, optional
+            Number of random variates to be generated (default is 1).
+
+        Returns
+        -------
+        rvs : ndarray
+            Random variates distributed according to the distribution
+            defined by the pdf, with shape ``tuple(np.atleast_1d(size))``.
+
+        Raises
+        ------
+        RuntimeError
+            If no variate at all has been accepted once at least 50000
+            candidate points have been drawn.
+        """
+        shape = tuple(np.atleast_1d(size))
+        total = np.prod(shape)  # variates needed; reshaped upon return
+
+        out = np.zeros(total)
+        filled = tries = 0
+
+        # Keep drawing until `total` variates were accepted; the expected
+        # runtime is finite. To avoid an infinite loop, give up when nothing
+        # has been accepted after 50000 tries: even with an expected 1000
+        # iterations per variate, that has probability (1-1/1000)**50000,
+        # i.e. of order 1e-22.
+        while filled < total:
+            missing = total - filled
+            tries += missing
+            # candidate points, uniform on [0, umax] x [vmin, vmax]
+            u = self._umax * self._rng.uniform(size=missing)
+            v = self._rng.uniform(self._vmin, self._vmax, size=missing)
+            candidates = v / u + self._c
+            # rejection step: keep the points that fall inside the set A
+            keep = (u**2 <= self._pdf(candidates))
+            n_keep = np.sum(keep)
+            if n_keep > 0:
+                out[filled:(filled + n_keep)] = candidates[keep]
+                filled += n_keep
+
+            if (filled == 0) and (tries >= 50000):
+                msg = (
+                    f"Not a single random variate could be generated in {tries} "
+                    "attempts. The ratio of uniforms method does not appear "
+                    "to work for the provided parameters. Please check the "
+                    "pdf and the bounds."
+                )
+                raise RuntimeError(msg)
+
+        return np.reshape(out, shape)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_sensitivity_analysis.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_sensitivity_analysis.py
new file mode 100644
index 0000000000000000000000000000000000000000..93850ffb586b96f351839897e20cfd70c558d452
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_sensitivity_analysis.py
@@ -0,0 +1,716 @@
+import inspect
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any
+from collections.abc import Callable
+
+import numpy as np
+
+from scipy.stats._common import ConfidenceInterval
+from scipy.stats._qmc import check_random_state
+from scipy.stats._resampling import BootstrapResult
+from scipy.stats import qmc, bootstrap
+from scipy._lib._array_api import xp_capabilities
+from scipy._lib._util import _transition_to_rng
+
+
+if TYPE_CHECKING:
+    import numpy.typing as npt
+    from scipy._lib._util import DecimalNumber, IntNumber
+
+
+__all__ = [
+    'sobol_indices'
+]
+
+
+def f_ishigami(x: "npt.ArrayLike") -> "npt.NDArray[np.inexact[Any]]":
+    r"""Evaluate the Ishigami test function.
+
+    .. math::
+
+        Y(\mathbf{x}) = \sin x_1 + 7 \sin^2 x_2 + 0.1 x_3^4 \sin x_1
+
+    with :math:`\mathbf{x} \in [-\pi, \pi]^3`.
+
+    Parameters
+    ----------
+    x : array_like ([x1, x2, x3], n)
+        Input points; rows 0, 1 and 2 hold :math:`x_1`, :math:`x_2` and
+        :math:`x_3` respectively.
+
+    Returns
+    -------
+    f : array_like (n,)
+        Function evaluation.
+
+    References
+    ----------
+    .. [1] Ishigami, T. and T. Homma. "An importance quantification technique
+       in uncertainty analysis for computer models." IEEE,
+       :doi:`10.1109/ISUMA.1990.151285`, 1990.
+    """
+    pts = np.atleast_2d(x)
+    sin_x1 = np.sin(pts[0])
+    additive = sin_x1 + 7 * np.sin(pts[1])**2
+    interaction = 0.1 * (pts[2]**4) * sin_x1
+    return additive + interaction
+
+
+def sample_A_B(n, dists, rng=None):
+    """Sample the two matrices A and B from one Sobol' sequence.
+
+    Uses a Sobol' sequence with 2`d` columns to have 2 uncorrelated matrices.
+    This is more efficient than using 2 random draw of Sobol'.
+    See sec. 5 from [1]_.
+
+    Row ``i`` of both matrices is then mapped through ``dists[i].ppf``.
+    A and B are returned stacked along the first axis, so the output has
+    shape (2, d, n) and can be unpacked as ``A, B = sample_A_B(...)``.
+
+    An ``AttributeError`` from ``dist.ppf`` is re-raised as ``ValueError``.
+
+    References
+    ----------
+    .. [1] Saltelli, A., P. Annoni, I. Azzini, F. Campolongo, M. Ratto, and
+       S. Tarantola. "Variance based sensitivity analysis of model
+       output. Design and estimator for the total sensitivity index."
+       Computer Physics Communications, 181(2):259-270,
+       :doi:`10.1016/j.cpc.2009.09.018`, 2010.
+    """
+    n_dists = len(dists)
+    engine = qmc.Sobol(d=2*n_dists, seed=rng, bits=64)
+    pages = engine.random(n).T.reshape(2, n_dists, -1)
+    try:
+        for col, marginal in enumerate(dists):
+            pages[:, col] = marginal.ppf(pages[:, col])
+    except AttributeError as exc:
+        message = "Each distribution in `dists` must have method `ppf`."
+        raise ValueError(message) from exc
+    return pages
+
+
+def sample_AB(A: np.ndarray, B: np.ndarray) -> np.ndarray:
+    """Build the AB matrices: A with one row at a time taken from B.
+
+    The output has shape (d, d, n) and is assembled as follows:
+    - tile A into d "pages";
+    - in the first page, replace the 1st row of A with the 1st row of B;
+    ...
+    - in the dth page, replace the dth row of A with the dth row of B;
+    - return the stack of pages.
+    """
+    n_dim, _ = A.shape
+    pages = np.tile(A, (n_dim, 1, 1))
+    diag = np.arange(n_dim)
+    pages[diag, diag] = B[diag]
+    return pages
+
+
+def saltelli_2010(
+    f_A: np.ndarray, f_B: np.ndarray, f_AB: np.ndarray
+) -> tuple[np.ndarray, np.ndarray]:
+    r"""Sobol' indices estimators of Saltelli et al. (2010).
+
+    .. math::
+
+        S_i = \frac{1}{N \, \mathbb{V}(Y)} \sum_{j=1}^N
+        f(\mathbf{B})_j (f(\mathbf{AB}^{(i)})_j - f(\mathbf{A})_j)
+
+    .. math::
+
+        S_{T_i} = \frac{1}{2N \, \mathbb{V}(Y)} \sum_{j=1}^N
+        (f(\mathbf{A})_j - f(\mathbf{AB}^{(i)})_j)^2
+
+    Parameters
+    ----------
+    f_A, f_B : array_like (s, n)
+        Function values at A and B, respectively
+    f_AB : array_like (d, s, n)
+        Function values at each of the AB pages
+
+    Returns
+    -------
+    s, st : array_like (s, d)
+        First order and total order Sobol' indices.
+
+    References
+    ----------
+    .. [1] Saltelli, A., P. Annoni, I. Azzini, F. Campolongo, M. Ratto, and
+       S. Tarantola. "Variance based sensitivity analysis of model
+       output. Design and estimator for the total sensitivity index."
+       Computer Physics Communications, 181(2):259-270,
+       :doi:`10.1016/j.cpc.2009.09.018`, 2010.
+    """
+    # The variance is estimated from the outputs at A and B only: these
+    # are independent, unlike the outputs at AB.
+    variance = np.var([f_A, f_B], axis=(0, -1))
+
+    # Dividing the estimators by the variance yields ratios of variance
+    # (eq. 2).
+    first = np.mean(f_B * (f_AB - f_A), axis=-1) / variance  # Table 2 (b)
+    total = 0.5 * np.mean((f_A - f_AB) ** 2, axis=-1) / variance  # Table 2 (f)
+
+    return first.T, total.T
+
+
+@dataclass
+class BootstrapSobolResult:
+    first_order: BootstrapResult  # bootstrap result of the first order indices
+    total_order: BootstrapResult  # bootstrap result of the total order indices
+
+
+@dataclass
+class SobolResult:
+    first_order: np.ndarray
+    total_order: np.ndarray
+    _indices_method: Callable
+    _f_A: np.ndarray
+    _f_B: np.ndarray
+    _f_AB: np.ndarray
+    _A: np.ndarray | None = None
+    _B: np.ndarray | None = None
+    _AB: np.ndarray | None = None
+    _bootstrap_result: BootstrapResult | None = None
+
+    def bootstrap(
+        self,
+        confidence_level: "DecimalNumber" = 0.95,
+        n_resamples: "IntNumber" = 999
+    ) -> BootstrapSobolResult:
+        """Compute bootstrap confidence intervals for the Sobol' indices.
+
+        Parameters
+        ----------
+        confidence_level : float, default: ``0.95``
+            The confidence level of the confidence intervals.
+        n_resamples : int, default: ``999``
+            The number of resamples performed to form the bootstrap
+            distribution of the indices.
+
+        Returns
+        -------
+        res : BootstrapSobolResult
+            Bootstrap result containing the confidence intervals and the
+            bootstrap distribution of the indices.
+
+            An object with attributes:
+
+            first_order : BootstrapResult
+                Bootstrap result of the first order indices.
+            total_order : BootstrapResult
+                Bootstrap result of the total order indices.
+            See `BootstrapResult` for more details.
+
+        """
+        n_samples = self._f_A.shape[1]
+
+        def statistic(idx):
+            # Recompute both families of indices on the resampled columns.
+            return self._indices_method(
+                self._f_A[:, idx], self._f_B[:, idx], self._f_AB[..., idx]
+            )
+
+        # The column positions are resampled rather than the data, so that
+        # one array of positions selects from f_A, f_B and f_AB together.
+        res = bootstrap(
+            [np.arange(n_samples)],
+            statistic=statistic,
+            method="BCa",
+            n_resamples=n_resamples,
+            confidence_level=confidence_level,
+            bootstrap_result=self._bootstrap_result,
+        )
+        # Stored and handed back as `bootstrap_result` on the next call.
+        self._bootstrap_result = res
+
+        def _extract(k):
+            # Entry ``k`` of the joint result: 0 holds the first order
+            # indices and 1 the total order indices.
+            ci = res.confidence_interval
+            return BootstrapResult(
+                confidence_interval=ConfidenceInterval(ci.low[k], ci.high[k]),
+                bootstrap_distribution=res.bootstrap_distribution[k],
+                standard_error=res.standard_error[k],
+            )
+
+        return BootstrapSobolResult(
+            first_order=_extract(0), total_order=_extract(1)
+        )
+
+
+@xp_capabilities(np_only=True)
+@_transition_to_rng('random_state', replace_doc=False)
+def sobol_indices(
+    *,
+    func,
+    n,
+    dists=None,
+    method='saltelli_2010',
+    rng=None
+):
+    r"""Global sensitivity indices of Sobol'.
+
+    Parameters
+    ----------
+    func : callable or dict(str, array_like)
+        If `func` is a callable, function to compute the Sobol' indices from.
+        Its signature must be::
+
+            func(x: ArrayLike) -> ArrayLike
+
+        with ``x`` of shape ``(d, n)`` and output of shape ``(s, n)`` where:
+
+        - ``d`` is the input dimensionality of `func`
+          (number of input variables),
+        - ``s`` is the output dimensionality of `func`
+          (number of output variables), and
+        - ``n`` is the number of samples (see `n` below).
+
+        Function evaluation values must be finite.
+
+        If `func` is a dictionary, contains the function evaluations from three
+        different arrays. Keys must be: ``f_A``, ``f_B`` and ``f_AB``.
+        ``f_A`` and ``f_B`` should have a shape ``(s, n)`` and ``f_AB``
+        should have a shape ``(d, s, n)``.
+        This is an advanced feature and misuse can lead to wrong analysis.
+    n : int
+        Number of samples used to generate the matrices ``A`` and ``B``.
+        Must be a power of 2. The total number of points at which `func` is
+        evaluated will be ``n*(d+2)``.
+    dists : list(distributions), optional
+        List of each parameter's distribution. The distribution of parameters
+        depends on the application and should be carefully chosen.
+        Parameters are assumed to be independently distributed, meaning there
+        is no constraint nor relationship between their values.
+
+        Distributions must be an instance of a class with a ``ppf``
+        method.
+
+        Must be specified if `func` is a callable, and ignored otherwise.
+    method : Callable or str, default: 'saltelli_2010'
+        Method used to compute the first and total Sobol' indices.
+
+        If a callable, its signature must be::
+
+            func(f_A: np.ndarray, f_B: np.ndarray, f_AB: np.ndarray)
+            -> Tuple[np.ndarray, np.ndarray]
+
+        with ``f_A, f_B`` of shape ``(s, n)`` and ``f_AB`` of shape
+        ``(d, s, n)``.
+        These arrays contain the function evaluations from three different sets
+        of samples.
+        The output is a tuple of the first and total indices with
+        shape ``(s, d)``.
+        This is an advanced feature and misuse can lead to wrong analysis.
+    rng : `numpy.random.Generator`, optional
+        Pseudorandom number generator state. When `rng` is None, a new
+        `numpy.random.Generator` is created using entropy from the
+        operating system. Types other than `numpy.random.Generator` are
+        passed to `numpy.random.default_rng` to instantiate a ``Generator``.
+
+        .. versionchanged:: 1.15.0
+
+            As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
+            transition from use of `numpy.random.RandomState` to
+            `numpy.random.Generator`, this keyword was changed from `random_state` to
+            `rng`. For an interim period, both keywords will continue to work, although
+            only one may be specified at a time. After the interim period, function
+            calls using the `random_state` keyword will emit warnings. Following a
+            deprecation period, the `random_state` keyword will be removed.
+
+    Returns
+    -------
+    res : SobolResult
+        An object with attributes:
+
+        first_order : ndarray of shape (s, d)
+            First order Sobol' indices.
+        total_order : ndarray of shape (s, d)
+            Total order Sobol' indices.
+
+        And method:
+
+        bootstrap(confidence_level: float, n_resamples: int)
+        -> BootstrapSobolResult
+
+            A method providing confidence intervals on the indices.
+            See `scipy.stats.bootstrap` for more details.
+
+            The bootstrapping is done on both first and total order indices,
+            and they are available in `BootstrapSobolResult` as attributes
+            ``first_order`` and ``total_order``.
+
+    Notes
+    -----
+    The Sobol' method [1]_, [2]_ is a variance-based Sensitivity Analysis which
+    obtains the contribution of each parameter to the variance of the
+    quantities of interest (QoIs; i.e., the outputs of `func`).
+    Respective contributions can be used to rank the parameters and
+    also gauge the complexity of the model by computing the
+    model's effective (or mean) dimension.
+
+    .. note::
+
+        Parameters are assumed to be independently distributed. Each
+        parameter can still follow any distribution. In fact, the distribution
+        is very important and should match the real distribution of the
+        parameters.
+
+    It uses a functional decomposition of the variance of the function to
+    explore
+
+    .. math::
+
+        \mathbb{V}(Y) = \sum_{i}^{d} \mathbb{V}_i (Y) + \sum_{i<j}^{d}
+        \mathbb{V}_{ij}(Y) + ... + \mathbb{V}_{1,2,...,d}(Y),
+
+    introducing conditional variances:
+
+    .. math::
+
+        \mathbb{V}_i(Y) = \mathbb{\mathbb{V}}[\mathbb{E}(Y|x_i)]
+        \qquad
+        \mathbb{V}_{ij}(Y) = \mathbb{\mathbb{V}}[\mathbb{E}(Y|x_i x_j)]
+        - \mathbb{V}_i(Y) - \mathbb{V}_j(Y),
+
+    Sobol' indices are expressed as
+
+    .. math::
+
+        S_i = \frac{\mathbb{V}_i(Y)}{\mathbb{V}[Y]}
+        \qquad
+        S_{ij} =\frac{\mathbb{V}_{ij}(Y)}{\mathbb{V}[Y]}.
+
+    :math:`S_{i}` corresponds to the first-order term which apprises the
+    contribution of the i-th parameter, while :math:`S_{ij}` corresponds to the
+    second-order term which informs about the contribution of interactions
+    between the i-th and the j-th parameters. These equations can be
+    generalized to compute higher order terms; however, they are expensive to
+    compute and their interpretation is complex.
+    This is why only first order indices are provided.
+
+    Total order indices represent the global contribution of the parameters
+    to the variance of the QoI and are defined as:
+
+    .. math::
+
+        S_{T_i} = S_i + \sum_j S_{ij} + \sum_{j,k} S_{ijk} + ...
+        = 1 - \frac{\mathbb{V}[\mathbb{E}(Y|x_{\sim i})]}{\mathbb{V}[Y]}.
+
+    First order indices sum to at most 1, while total order indices sum to at
+    least 1. If there are no interactions, then first and total order indices
+    are equal, and both first and total order indices sum to 1.
+
+    .. warning::
+
+        Negative Sobol' values are due to numerical errors. Increasing the
+        number of points `n` should help.
+
+        The number of samples required to have a good analysis increases with
+        the dimensionality of the problem. e.g. for a 3 dimension problem,
+        consider at minima ``n >= 2**12``. The more complex the model is,
+        the more samples will be needed.
+
+        Even for a purely additive model, the indices may not sum to 1 due
+        to numerical noise.
+
+    References
+    ----------
+    .. [1] Sobol, I. M.. "Sensitivity analysis for nonlinear mathematical
+       models." Mathematical Modeling and Computational Experiment, 1:407-414,
+       1993.
+    .. [2] Sobol, I. M. (2001). "Global sensitivity indices for nonlinear
+       mathematical models and their Monte Carlo estimates." Mathematics
+       and Computers in Simulation, 55(1-3):271-280,
+       :doi:`10.1016/S0378-4754(00)00270-6`, 2001.
+    .. [3] Saltelli, A. "Making best use of model evaluations to
+       compute sensitivity indices."  Computer Physics Communications,
+       145(2):280-297, :doi:`10.1016/S0010-4655(02)00280-1`, 2002.
+    .. [4] Saltelli, A., M. Ratto, T. Andres, F. Campolongo, J. Cariboni,
+       D. Gatelli, M. Saisana, and S. Tarantola. "Global Sensitivity Analysis.
+       The Primer." 2007.
+    .. [5] Saltelli, A., P. Annoni, I. Azzini, F. Campolongo, M. Ratto, and
+       S. Tarantola. "Variance based sensitivity analysis of model
+       output. Design and estimator for the total sensitivity index."
+       Computer Physics Communications, 181(2):259-270,
+       :doi:`10.1016/j.cpc.2009.09.018`, 2010.
+    .. [6] Ishigami, T. and T. Homma. "An importance quantification technique
+       in uncertainty analysis for computer models." IEEE,
+       :doi:`10.1109/ISUMA.1990.151285`, 1990.
+
+    Examples
+    --------
+    The following is an example with the Ishigami function [6]_
+
+    .. math::
+
+        Y(\mathbf{x}) = \sin x_1 + 7 \sin^2 x_2 + 0.1 x_3^4 \sin x_1,
+
+    with :math:`\mathbf{x} \in [-\pi, \pi]^3`. This function exhibits strong
+    non-linearity and non-monotonicity.
+
+    Remember, Sobol' indices assume that samples are independently
+    distributed. In this case we use a uniform distribution on each marginal.
+
+    >>> import numpy as np
+    >>> from scipy.stats import sobol_indices, uniform
+    >>> rng = np.random.default_rng()
+    >>> def f_ishigami(x):
+    ...     f_eval = (
+    ...         np.sin(x[0])
+    ...         + 7 * np.sin(x[1])**2
+    ...         + 0.1 * (x[2]**4) * np.sin(x[0])
+    ...     )
+    ...     return f_eval
+    >>> indices = sobol_indices(
+    ...     func=f_ishigami, n=1024,
+    ...     dists=[
+    ...         uniform(loc=-np.pi, scale=2*np.pi),
+    ...         uniform(loc=-np.pi, scale=2*np.pi),
+    ...         uniform(loc=-np.pi, scale=2*np.pi)
+    ...     ],
+    ...     rng=rng
+    ... )
+    >>> indices.first_order
+    array([0.31637954, 0.43781162, 0.00318825])
+    >>> indices.total_order
+    array([0.56122127, 0.44287857, 0.24229595])
+
+    Confidence interval can be obtained using bootstrapping.
+
+    >>> boot = indices.bootstrap()
+
+    Then, this information can be easily visualized.
+
+    >>> import matplotlib.pyplot as plt
+    >>> fig, axs = plt.subplots(1, 2, figsize=(9, 4))
+    >>> _ = axs[0].errorbar(
+    ...     [1, 2, 3], indices.first_order, fmt='o',
+    ...     yerr=[
+    ...         indices.first_order - boot.first_order.confidence_interval.low,
+    ...         boot.first_order.confidence_interval.high - indices.first_order
+    ...     ],
+    ... )
+    >>> axs[0].set_ylabel("First order Sobol' indices")
+    >>> axs[0].set_xlabel('Input parameters')
+    >>> axs[0].set_xticks([1, 2, 3])
+    >>> _ = axs[1].errorbar(
+    ...     [1, 2, 3], indices.total_order, fmt='o',
+    ...     yerr=[
+    ...         indices.total_order - boot.total_order.confidence_interval.low,
+    ...         boot.total_order.confidence_interval.high - indices.total_order
+    ...     ],
+    ... )
+    >>> axs[1].set_ylabel("Total order Sobol' indices")
+    >>> axs[1].set_xlabel('Input parameters')
+    >>> axs[1].set_xticks([1, 2, 3])
+    >>> plt.tight_layout()
+    >>> plt.show()
+
+    .. note::
+
+        By default, `scipy.stats.uniform` has support ``[0, 1]``.
+        Using the parameters ``loc`` and ``scale``, one obtains the uniform
+        distribution on ``[loc, loc + scale]``.
+
+    This result is particularly interesting because the first order index
+    :math:`S_{x_3} = 0` whereas its total order is :math:`S_{T_{x_3}} = 0.244`.
+    This means that higher order interactions with :math:`x_3` are responsible
+    for the difference. Almost 25% of the observed variance
+    on the QoI is due to the correlations between :math:`x_3` and :math:`x_1`,
+    although :math:`x_3` by itself has no impact on the QoI.
+
+    The following gives a visual explanation of Sobol' indices on this
+    function. Let's generate 1024 samples in :math:`[-\pi, \pi]^3` and
+    calculate the value of the output.
+
+    >>> from scipy.stats import qmc
+    >>> n_dim = 3
+    >>> p_labels = ['$x_1$', '$x_2$', '$x_3$']
+    >>> sample = qmc.Sobol(d=n_dim, seed=rng).random(1024)
+    >>> sample = qmc.scale(
+    ...     sample=sample,
+    ...     l_bounds=[-np.pi, -np.pi, -np.pi],
+    ...     u_bounds=[np.pi, np.pi, np.pi]
+    ... )
+    >>> output = f_ishigami(sample.T)
+
+    Now we can do scatter plots of the output with respect to each parameter.
+    This gives a visual way to understand how each parameter impacts the
+    output of the function.
+
+    >>> fig, ax = plt.subplots(1, n_dim, figsize=(12, 4))
+    >>> for i in range(n_dim):
+    ...     xi = sample[:, i]
+    ...     ax[i].scatter(xi, output, marker='+')
+    ...     ax[i].set_xlabel(p_labels[i])
+    >>> ax[0].set_ylabel('Y')
+    >>> plt.tight_layout()
+    >>> plt.show()
+
+    Now Sobol' goes a step further:
+    by conditioning the output value by given values of the parameter
+    (black lines), the conditional output mean is computed. It corresponds to
+    the term :math:`\mathbb{E}(Y|x_i)`. Taking the variance of this term gives
+    the numerator of the Sobol' indices.
+
+    >>> mini = np.min(output)
+    >>> maxi = np.max(output)
+    >>> n_bins = 10
+    >>> bins = np.linspace(-np.pi, np.pi, num=n_bins, endpoint=False)
+    >>> dx = bins[1] - bins[0]
+    >>> fig, ax = plt.subplots(1, n_dim, figsize=(12, 4))
+    >>> for i in range(n_dim):
+    ...     xi = sample[:, i]
+    ...     ax[i].scatter(xi, output, marker='+')
+    ...     ax[i].set_xlabel(p_labels[i])
+    ...     for bin_ in bins:
+    ...         idx = np.where((bin_ <= xi) & (xi <= bin_ + dx))
+    ...         xi_ = xi[idx]
+    ...         y_ = output[idx]
+    ...         ave_y_ = np.mean(y_)
+    ...         ax[i].plot([bin_ + dx/2] * 2, [mini, maxi], c='k')
+    ...         ax[i].scatter(bin_ + dx/2, ave_y_, c='r')
+    >>> ax[0].set_ylabel('Y')
+    >>> plt.tight_layout()
+    >>> plt.show()
+
+    Looking at :math:`x_3`, the variance
+    of the mean is zero leading to :math:`S_{x_3} = 0`. But we can further
+    observe that the variance of the output is not constant along the parameter
+    values of :math:`x_3`. This heteroscedasticity is explained by higher order
+    interactions. Moreover, an heteroscedasticity is also noticeable on
+    :math:`x_1` leading to an interaction between :math:`x_3` and :math:`x_1`.
+    On :math:`x_2`, the variance seems to be constant and thus null interaction
+    with this parameter can be supposed.
+
+    This case is fairly simple to analyse visually---although it is only a
+    qualitative analysis. Nevertheless, when the number of input parameters
+    increases such analysis becomes unrealistic as it would be difficult to
+    conclude on high-order terms. Hence the benefit of using Sobol' indices.
+
+    """
+    rng = check_random_state(rng)
+
+    n_ = int(n)
+    if n_ <= 0 or (n_ & (n_ - 1)) != 0 or n != n_:  # 0 is not a power of 2
+        raise ValueError(
+            "The balance properties of Sobol' points require 'n' "
+            "to be a power of 2."
+        )
+    n = n_
+
+    if not callable(method):
+        indices_methods = {
+            "saltelli_2010": saltelli_2010,
+        }
+        try:
+            method = method.lower()  # type: ignore[assignment]
+            indices_method_ = indices_methods[method]
+        except KeyError as exc:
+            message = (
+                f"{method!r} is not a valid 'method'. It must be one of"
+                f" {set(indices_methods)!r} or a callable."
+            )
+            raise ValueError(message) from exc
+    else:
+        indices_method_ = method
+        sig = inspect.signature(indices_method_)
+
+        if set(sig.parameters) != {'f_A', 'f_B', 'f_AB'}:
+            message = (
+                "If 'method' is a callable, it must have the following"
+                f" signature: {inspect.signature(saltelli_2010)}"
+            )
+            raise ValueError(message)
+
+    def indices_method(f_A, f_B, f_AB):
+        """Wrap indices method to ensure proper output dimension.
+
+        1D when single output, 2D otherwise.
+        """
+        return np.squeeze(indices_method_(f_A=f_A, f_B=f_B, f_AB=f_AB))
+
+    if callable(func):
+        if dists is None:
+            raise ValueError(
+                "'dists' must be defined when 'func' is a callable."
+            )
+
+        def wrapped_func(x):
+            return np.atleast_2d(func(x))
+
+        A, B = sample_A_B(n=n, dists=dists, rng=rng)
+        AB = sample_AB(A=A, B=B)
+
+        f_A = wrapped_func(A)
+
+        if f_A.shape[1] != n:
+            raise ValueError(
+                "'func' output should have a shape ``(s, -1)`` with ``s`` "
+                "the number of output."
+            )
+
+        def funcAB(AB):
+            d, d, n = AB.shape
+            AB = np.moveaxis(AB, 0, -1).reshape(d, n*d)
+            f_AB = wrapped_func(AB)
+            return np.moveaxis(f_AB.reshape((-1, n, d)), -1, 0)
+
+        f_B = wrapped_func(B)
+        f_AB = funcAB(AB)
+    else:
+        message = (
+            "When 'func' is a dictionary, it must contain the following "
+            "keys: 'f_A', 'f_B' and 'f_AB'."
+            "'f_A' and 'f_B' should have a shape ``(s, n)`` and 'f_AB' "
+            "should have a shape ``(d, s, n)``."
+        )
+        try:
+            f_A, f_B, f_AB = map(lambda arr: arr.copy(), np.atleast_2d(
+                func['f_A'], func['f_B'], func['f_AB']
+            ))
+        except KeyError as exc:
+            raise ValueError(message) from exc
+
+        if f_A.shape[1] != n or f_A.shape != f_B.shape or \
+                f_AB.shape == f_A.shape or f_AB.shape[-1] % n != 0:
+            raise ValueError(message)
+
+    # Normalization by mean: Sobol', I. and Levitan, Y. L. (1999). On the use
+    # of variance reducing multipliers in monte carlo computations of a global
+    # sensitivity index. Computer Physics Communications, 117(1):52-61.
+    # Cast copies first: integer outputs cannot be centered in place.
+    mean = np.mean([f_A, f_B], axis=(0, -1)).reshape(-1, 1)
+    f_A, f_B, f_AB = (a.astype(np.result_type(a, mean)) for a in (f_A, f_B, f_AB))
+    f_A -= mean
+    f_B -= mean
+    f_AB -= mean
+
+    # Compute indices; filter warnings for constant output as var = 0
+    with np.errstate(divide='ignore', invalid='ignore'):
+        first_order, total_order = indices_method(f_A=f_A, f_B=f_B, f_AB=f_AB)
+
+    # null variance means null indices
+    first_order[~np.isfinite(first_order)] = 0
+    total_order[~np.isfinite(total_order)] = 0
+
+    res = dict(
+        first_order=first_order,
+        total_order=total_order,
+        _indices_method=indices_method,
+        _f_A=f_A,
+        _f_B=f_B,
+        _f_AB=f_AB
+    )
+
+    if callable(func):
+        res.update(
+            dict(
+                _A=A,
+                _B=B,
+                _AB=AB,
+            )
+        )
+
+    return SobolResult(**res)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_sobol.pyi b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_sobol.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..7ca5e3a9c1a142b25ac26401e9ab1cb6726c877f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_sobol.pyi
@@ -0,0 +1,54 @@
+import numpy as np
+from scipy._lib._util import IntNumber
+from typing import Literal
+
+def _initialize_v(
+    v: np.ndarray,
+    dim: IntNumber,
+    bits: IntNumber,
+) -> None: ...
+
+def _cscramble(
+    dim: IntNumber,
+    bits: IntNumber,
+    ltm: np.ndarray,
+    sv: np.ndarray,
+) -> None: ...
+
+def _fill_p_cumulative(
+    p: np.ndarray,
+    p_cumulative: np.ndarray,
+) -> None: ...
+
+def _draw(
+    n: IntNumber,
+    num_gen: IntNumber,
+    dim: IntNumber,
+    scale: float,
+    sv: np.ndarray,
+    quasi: np.ndarray,
+    sample: np.ndarray,
+) -> None: ...
+
+def _fast_forward(
+    n: IntNumber,
+    num_gen: IntNumber,
+    dim: IntNumber,
+    sv: np.ndarray,
+    quasi: np.ndarray,
+) -> None: ...
+
+def _categorize(
+    draws: np.ndarray,
+    p_cumulative: np.ndarray,
+    result: np.ndarray,
+) -> None: ...
+
+_MAXDIM: Literal[21201]
+_MAXDEG: Literal[18]
+
+def _test_find_index(
+    p_cumulative: np.ndarray,
+    size: int,
+    value: float,
+) -> int: ...
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_stats.pxd b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_stats.pxd
new file mode 100644
index 0000000000000000000000000000000000000000..e01565f75fe232446e4b8b0b50fdf645c8506108
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_stats.pxd
@@ -0,0 +1,10 @@
+# destined to be used in a LowLevelCallable
+
+cdef double _geninvgauss_pdf(double x, void *user_data) noexcept nogil
+cdef double _studentized_range_cdf(int n, double[2] x, void *user_data) noexcept nogil
+cdef double _studentized_range_cdf_asymptotic(double z, void *user_data) noexcept nogil
+cdef double _studentized_range_pdf(int n, double[2] x, void *user_data) noexcept nogil
+cdef double _studentized_range_pdf_asymptotic(double z, void *user_data) noexcept nogil
+cdef double _studentized_range_moment(int n, double[3] x_arg, void *user_data) noexcept nogil
+cdef double _genhyperbolic_pdf(double x, void *user_data) noexcept nogil
+cdef double _genhyperbolic_logpdf(double x, void *user_data) noexcept nogil
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_stats_mstats_common.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_stats_mstats_common.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7fc77763604165d4f8fa0372f380e5a0ea49166
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_stats_mstats_common.py
@@ -0,0 +1,325 @@
+import warnings
+import numpy as np
+from . import distributions
+from .._lib._array_api import xp_capabilities
+from .._lib._bunch import _make_tuple_bunch
+from ._axis_nan_policy import _axis_nan_policy_factory
+from ._stats_pythran import siegelslopes as siegelslopes_pythran
+
+__all__ = ['_find_repeats', 'theilslopes', 'siegelslopes']
+
+# Deliberately not namedtuples, for backwards compatibility. See PR #12983
+TheilslopesResult = _make_tuple_bunch('TheilslopesResult',
+                                      ['slope', 'intercept',
+                                       'low_slope', 'high_slope'])
+SiegelslopesResult = _make_tuple_bunch('SiegelslopesResult',
+                                       ['slope', 'intercept'])
+
+
+def _n_samples_optional_x(kwargs):
+    return 1 if kwargs.get('x') is None else 2  # only y -> 1; x given -> 2
+
+
+@xp_capabilities(np_only=True)
+@_axis_nan_policy_factory(TheilslopesResult, default_axis=None, n_outputs=4,
+                          n_samples=_n_samples_optional_x,
+                          result_to_tuple=lambda x, _: tuple(x), paired=True,
+                          too_small=1)
+def theilslopes(y, x=None, alpha=0.95, method='separate'):
+    r"""
+    Computes the Theil-Sen estimator for a set of points (x, y).
+
+    `theilslopes` implements a method for robust linear regression.  It
+    computes the slope as the median of all slopes between paired values.
+
+    Parameters
+    ----------
+    y : array_like
+        Dependent variable.
+    x : array_like or None, optional
+        Independent variable. If None, use ``arange(len(y))`` instead.
+    alpha : float, optional
+        Confidence degree between 0 and 1. Default is 95% confidence.
+        Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
+        interpreted as "find the 90% confidence interval".
+    method : {'joint', 'separate'}, optional
+        Method to be used for computing estimate for intercept.
+        The following methods are supported:
+
+            * 'joint': Uses np.median(y - slope * x) as intercept.
+            * 'separate': Uses np.median(y) - slope * np.median(x)
+                          as intercept.
+
+        The default is 'separate'.
+
+        .. versionadded:: 1.8.0
+
+    Returns
+    -------
+    result : ``TheilslopesResult`` instance
+        The return value is an object with the following attributes:
+
+        slope : float
+            Theil slope.
+        intercept : float
+            Intercept of the Theil line.
+        low_slope : float
+            Lower bound of the confidence interval on `slope`.
+        high_slope : float
+            Upper bound of the confidence interval on `slope`.
+
+    See Also
+    --------
+    siegelslopes : a similar technique using repeated medians
+
+    Notes
+    -----
+    The implementation of `theilslopes` follows [1]_. The intercept is
+    not defined in [1]_, and here it is defined as ``median(y) -
+    slope*median(x)``, which is given in [3]_. Other definitions of
+    the intercept exist in the literature such as  ``median(y - slope*x)``
+    in [4]_. The approach to compute the intercept can be determined by the
+    parameter ``method``. A confidence interval for the intercept is not
+    given as this question is not addressed in [1]_.
+
+    For compatibility with older versions of SciPy, the return value acts
+    like a ``namedtuple`` of length 4, with fields ``slope``, ``intercept``,
+    ``low_slope``, and ``high_slope``, so one can continue to write::
+
+        slope, intercept, low_slope, high_slope = theilslopes(y, x)
+
+    References
+    ----------
+    .. [1] P.K. Sen, "Estimates of the regression coefficient based on
+           Kendall's tau", J. Am. Stat. Assoc., Vol. 63, pp. 1379-1389, 1968.
+    .. [2] H. Theil, "A rank-invariant method of linear and polynomial
+           regression analysis I, II and III",  Nederl. Akad. Wetensch., Proc.
+           53:, pp. 386-392, pp. 521-525, pp. 1397-1412, 1950.
+    .. [3] W.L. Conover, "Practical nonparametric statistics", 2nd ed.,
+           John Wiley and Sons, New York, pp. 493.
+    .. [4] https://en.wikipedia.org/wiki/Theil%E2%80%93Sen_estimator
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+
+    >>> x = np.linspace(-5, 5, num=150)
+    >>> y = x + np.random.normal(size=x.size)
+    >>> y[11:15] += 10  # add outliers
+    >>> y[-5:] -= 7
+
+    Compute the slope, intercept and 90% confidence interval.  For comparison,
+    also compute the least-squares fit with `linregress`:
+
+    >>> res = stats.theilslopes(y, x, 0.90, method='separate')
+    >>> lsq_res = stats.linregress(x, y)
+
+    Plot the results. The Theil-Sen regression line is shown in red, with the
+    dashed red lines illustrating the confidence interval of the slope (note
+    that the dashed red lines are not the confidence interval of the regression
+    as the confidence interval of the intercept is not included). The green
+    line shows the least-squares fit for comparison.
+
+    >>> fig = plt.figure()
+    >>> ax = fig.add_subplot(111)
+    >>> ax.plot(x, y, 'b.')
+    >>> ax.plot(x, res[1] + res[0] * x, 'r-')
+    >>> ax.plot(x, res[1] + res[2] * x, 'r--')
+    >>> ax.plot(x, res[1] + res[3] * x, 'r--')
+    >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
+    >>> plt.show()
+
+    """
+    if method not in ['joint', 'separate']:
+        raise ValueError("method must be either 'joint' or 'separate'. "
+                         f"'{method}' is invalid.")
+    # We copy both x and y so we can use _find_repeats.
+    y = np.array(y, dtype=float, copy=True).ravel()
+    if x is None:
+        x = np.arange(len(y), dtype=float)
+    else:
+        x = np.array(x, dtype=float, copy=True).ravel()
+        if len(x) != len(y):
+            raise ValueError("Array shapes are incompatible for broadcasting.")
+    if len(x) < 2:
+        raise ValueError("`x` and `y` must have length at least 2.")
+
+    # Compute sorted slopes only when deltax > 0 (mask evaluated once)
+    deltax = x[:, np.newaxis] - x
+    positive = deltax > 0
+    slopes = (y[:, np.newaxis] - y)[positive] / deltax[positive]
+    if not slopes.size:
+        msg = "All `x` coordinates are identical."
+        warnings.warn(msg, RuntimeWarning, stacklevel=2)
+    slopes.sort()
+    medslope = np.median(slopes)
+    if method == 'joint':
+        medinter = np.median(y - medslope * x)
+    else:
+        medinter = np.median(y) - medslope * np.median(x)
+    # Now compute confidence intervals
+    if alpha > 0.5:
+        alpha = 1. - alpha
+
+    z = distributions.norm.ppf(alpha / 2.)
+    # This implements (2.6) from Sen (1968)
+    _, nxreps = _find_repeats(x)
+    _, nyreps = _find_repeats(y)
+    nt = len(slopes)       # N in Sen (1968)
+    ny = len(y)            # n in Sen (1968)
+    # Equation 2.6 in Sen (1968):
+    sigsq = 1/18. * (ny * (ny-1) * (2*ny+5) -
+                     sum(k * (k-1) * (2*k + 5) for k in nxreps) -
+                     sum(k * (k-1) * (2*k + 5) for k in nyreps))
+    # Find the confidence interval indices in `slopes`
+    try:
+        sigma = np.sqrt(sigsq)
+        Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1)
+        Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0)
+        delta = slopes[[Rl, Ru]]
+    except (ValueError, IndexError):
+        delta = (np.nan, np.nan)
+
+    return TheilslopesResult(slope=medslope, intercept=medinter,
+                             low_slope=delta[0], high_slope=delta[1])
+
+
+def _find_repeats(arr):
+    """Return the values occurring more than once in `arr` and their counts.
+
+    May sort `arr` in place (when it is already a float64 array).
+    """
+    if len(arr) == 0:
+        return np.array(0, np.float64), np.array(0, np.intp)
+
+    # float64 cast kept from the former Fortran implementation.
+    values = np.asarray(arr, np.float64).ravel()
+    values.sort()
+    # Run-length encode the sorted values (after NumPy 1.9's np.unique).
+    is_start = np.concatenate(([True], values[1:] != values[:-1]))
+    starts = np.concatenate(np.nonzero(is_start) + ([values.size],))
+    counts = np.diff(starts)
+    repeated = counts > 1
+    return values[is_start][repeated], counts[repeated]
+
+
+@xp_capabilities(np_only=True)
+@_axis_nan_policy_factory(SiegelslopesResult, default_axis=None, n_outputs=2,
+                          n_samples=_n_samples_optional_x,
+                          result_to_tuple=lambda x, _: tuple(x), paired=True,
+                          too_small=1)
+def siegelslopes(y, x=None, method="hierarchical"):
+    r"""
+    Computes the Siegel estimator for a set of points (x, y).
+
+    `siegelslopes` implements a method for robust linear regression
+    using repeated medians (see [1]_) to fit a line to the points (x, y).
+    The method is robust to outliers with an asymptotic breakdown point
+    of 50%.
+
+    Parameters
+    ----------
+    y : array_like
+        Dependent variable.
+    x : array_like or None, optional
+        Independent variable. If None, use ``arange(len(y))`` instead.
+    method : {'hierarchical', 'separate'}, optional
+        If 'hierarchical', estimate the intercept using the estimated
+        slope ``slope`` (default option).
+        If 'separate', estimate the intercept independent of the estimated
+        slope. See Notes for details.
+
+    Returns
+    -------
+    result : ``SiegelslopesResult`` instance
+        The return value is an object with the following attributes:
+
+        slope : float
+            Estimate of the slope of the regression line.
+        intercept : float
+            Estimate of the intercept of the regression line.
+
+    See Also
+    --------
+    theilslopes : a similar technique without repeated medians
+
+    Notes
+    -----
+    With ``n = len(y)``, compute ``m_j`` as the median of
+    the slopes from the point ``(x[j], y[j])`` to all other `n-1` points.
+    ``slope`` is then the median of all slopes ``m_j``.
+    Two ways are given to estimate the intercept in [1]_ which can be chosen
+    via the parameter ``method``.
+    The hierarchical approach uses the estimated slope ``slope``
+    and computes ``intercept`` as the median of ``y - slope*x``.
+    The other approach estimates the intercept separately as follows: for
+    each point ``(x[j], y[j])``, compute the intercepts of all the `n-1`
+    lines through the remaining points and take the median ``i_j``.
+    ``intercept`` is the median of the ``i_j``.
+
+    The implementation computes `n` times the median of a vector of size `n`
+    which can be slow for large vectors. There are more efficient algorithms
+    (see [2]_) which are not implemented here.
+
+    For compatibility with older versions of SciPy, the return value acts
+    like a ``namedtuple`` of length 2, with fields ``slope`` and
+    ``intercept``, so one can continue to write::
+
+        slope, intercept = siegelslopes(y, x)
+
+    References
+    ----------
+    .. [1] A. Siegel, "Robust Regression Using Repeated Medians",
+           Biometrika, Vol. 69, pp. 242-244, 1982.
+
+    .. [2] A. Stein and M. Werman, "Finding the repeated median regression
+           line", Proceedings of the Third Annual ACM-SIAM Symposium on
+           Discrete Algorithms, pp. 409-413, 1992.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+
+    >>> x = np.linspace(-5, 5, num=150)
+    >>> y = x + np.random.normal(size=x.size)
+    >>> y[11:15] += 10  # add outliers
+    >>> y[-5:] -= 7
+
+    Compute the slope and intercept.  For comparison, also compute the
+    least-squares fit with `linregress`:
+
+    >>> res = stats.siegelslopes(y, x)
+    >>> lsq_res = stats.linregress(x, y)
+
+    Plot the results. The Siegel regression line is shown in red. The green
+    line shows the least-squares fit for comparison.
+
+    >>> fig = plt.figure()
+    >>> ax = fig.add_subplot(111)
+    >>> ax.plot(x, y, 'b.')
+    >>> ax.plot(x, res[1] + res[0] * x, 'r-')
+    >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
+    >>> plt.show()
+
+    """
+    if method not in ('hierarchical', 'separate'):
+        raise ValueError("method can only be 'hierarchical' or 'separate'")
+    y = np.asarray(y).ravel()
+    if x is None:
+        x = np.arange(len(y), dtype=float)
+    else:
+        x = np.asarray(x, dtype=float).ravel()
+        if len(x) != len(y):
+            raise ValueError("Array shapes are incompatible for broadcasting.")
+    if len(x) < 2:
+        raise ValueError("`x` and `y` must have length at least 2.")
+
+    work_dtype = np.result_type(x, y, np.float32)  # use at least float32
+    fitted = siegelslopes_pythran(y.astype(work_dtype), x.astype(work_dtype), method)
+    # Unwrap the 0-d results into scalars.
+    slope, intercept = (np.asarray(value)[()] for value in fitted)
+    return SiegelslopesResult(slope=slope, intercept=intercept)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_stats_py.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_stats_py.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0e4cc9cb467fca29d67e4adbc582f6e2a631bca
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_stats_py.py
@@ -0,0 +1,10840 @@
+# Copyright 2002 Gary Strangman.  All rights reserved
+# Copyright 2002-2016 The SciPy Developers
+#
+# The original code from Gary Strangman was heavily adapted for
+# use in SciPy by Travis Oliphant.  The original code came with the
+# following disclaimer:
+#
+# This software is provided "as-is".  There are no expressed or implied
+# warranties of any kind, including, but not limited to, the warranties
+# of merchantability and fitness for a given application.  In no event
+# shall Gary Strangman be liable for any direct, indirect, incidental,
+# special, exemplary or consequential damages (including, but not limited
+# to, loss of use, data or profits, or business interruption) however
+# caused and on any theory of liability, whether in contract, strict
+# liability or tort (including negligence or otherwise) arising in any way
+# out of the use of this software, even if advised of the possibility of
+# such damage.
+
+"""
+A collection of basic statistical functions for Python.
+
+References
+----------
+.. [CRCProbStat2000] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
+   Probability and Statistics Tables and Formulae. Chapman & Hall: New
+   York. 2000.
+
+"""
+import math
+import itertools
+import operator
+import warnings
+from collections import namedtuple
+from collections.abc import Sequence
+
+import numpy as np
+from numpy import array, asarray, ma
+
+from scipy import sparse
+from scipy.spatial import distance_matrix
+
+from scipy.optimize import milp, LinearConstraint
+from scipy._lib._util import _get_nan, _rename_parameter, _contains_nan, np_vecdot
+
+import scipy.special as special
+# Unused import; it must stay until the end of the deprecation period.
+# See https://github.com/scipy/scipy/issues/15765#issuecomment-1875564522
+from scipy import linalg  # noqa: F401
+from . import distributions
+from . import _mstats_basic as mstats_basic
+
+from ._stats_mstats_common import theilslopes, siegelslopes
+from ._stats import _kendall_dis, _toint64, _weightedrankedtau
+
+from dataclasses import dataclass, field
+from ._stats_pythran import _compute_outer_prob_inside_method
+from ._resampling import (MonteCarloMethod, PermutationMethod, BootstrapMethod,
+                          monte_carlo_test, permutation_test, bootstrap,)
+from ._axis_nan_policy import (_axis_nan_policy_factory, _broadcast_shapes,
+                               _broadcast_array_shapes_remove_axis, SmallSampleWarning,
+                               too_small_1d_not_omit, too_small_1d_omit,
+                               too_small_nd_not_omit, too_small_nd_omit)
+from ._binomtest import _binary_search_for_binom_tst as _binary_search
+from scipy._lib._bunch import _make_tuple_bunch
+from scipy import stats
+from scipy.optimize import root_scalar
+from scipy._lib._array_api import (
+    _asarray,
+    array_namespace,
+    is_lazy_array,
+    is_dask,
+    is_numpy,
+    is_cupy,
+    is_marray,
+    xp_size,
+    xp_vector_norm,
+    xp_promote,
+    xp_result_type,
+    xp_capabilities,
+    xp_ravel,
+    _length_nonmasked,
+    _share_masks,
+    xp_swapaxes,
+    xp_default_dtype,
+    xp_device,
+)
+import scipy._lib.array_api_extra as xpx
+
+
+
+# Functions/classes in other files should be added in `__init__.py`, not here
+__all__ = ['gmean', 'hmean', 'pmean', 'mode', 'tmean', 'tvar',
+           'tmin', 'tmax', 'tstd', 'tsem', 'moment',
+           'skew', 'kurtosis', 'describe', 'skewtest', 'kurtosistest',
+           'normaltest', 'jarque_bera',
+           'scoreatpercentile', 'percentileofscore',
+           'cumfreq', 'relfreq', 'obrientransform',
+           'sem', 'zmap', 'zscore', 'gzscore', 'iqr', 'gstd',
+           'median_abs_deviation',
+           'sigmaclip', 'trimboth', 'trim1', 'trim_mean',
+           'f_oneway', 'pearsonr', 'fisher_exact',
+           'spearmanr', 'pointbiserialr',
+           'kendalltau', 'weightedtau',
+           'linregress', 'siegelslopes', 'theilslopes', 'ttest_1samp',
+           'ttest_ind', 'ttest_ind_from_stats', 'ttest_rel',
+           'kstest', 'ks_1samp', 'ks_2samp',
+           'chisquare', 'power_divergence',
+           'tiecorrect', 'ranksums', 'kruskal', 'friedmanchisquare',
+           'rankdata', 'combine_pvalues', 'quantile_test',
+           'wasserstein_distance', 'wasserstein_distance_nd', 'energy_distance',
+           'brunnermunzel', 'alexandergovern',
+           'expectile', 'lmoment']
+
+
+def _chk_asarray(a, axis, *, xp=None):
+    """Return `a` as an array with at least one dimension, and the axis to use.
+
+    With ``axis=None`` the array is flattened and the returned axis is 0.
+    """
+    xp = array_namespace(a) if xp is None else xp
+    if axis is not None:
+        a, outaxis = xp.asarray(a), axis
+    else:
+        a, outaxis = xp.reshape(a, (-1,)), 0
+    # Promote 0-d input to a length-1 vector.
+    if a.ndim == 0:
+        a = xp.reshape(a, (-1,))
+
+    return a, outaxis
+
+
+SignificanceResult = _make_tuple_bunch('SignificanceResult',
+                                       ['statistic', 'pvalue'], [])
+# Let's call a SignificanceResult with a legacy "correlation" attribute a
+# "CorrelationResult". Don't add to `extra_field_names` - shouldn't be in repr.
+
+
+def _pack_CorrelationResult(statistic, pvalue, correlation):
+    result = SignificanceResult(statistic, pvalue)
+    result.correlation = correlation  # extra attribute set after construction
+    return result
+
+
+def _unpack_CorrelationResult(res, _):
+    return operator.attrgetter('statistic', 'pvalue', 'correlation')(res)
+
+
+# note that `weights` are paired with `x`
+@xp_capabilities()
+@_axis_nan_policy_factory(
+        lambda x: x, n_samples=1, n_outputs=1, too_small=0, paired=True,
+        result_to_tuple=lambda x, _: (x,), kwd_samples=['weights'])
+def gmean(a, axis=0, dtype=None, weights=None):
+    r"""Compute the weighted geometric mean along the specified axis.
+
+    The weighted geometric mean of the array :math:`a_i` associated to weights
+    :math:`w_i` is:
+
+    .. math::
+
+        \exp \left( \frac{ \sum_{i=1}^n w_i \ln a_i }{ \sum_{i=1}^n w_i }
+                   \right) \, ,
+
+    and, with equal weights, it gives:
+
+    .. math::
+
+        \sqrt[n]{ \prod_{i=1}^n a_i } \, .
+
+    Parameters
+    ----------
+    a : array_like
+        Input array or object that can be converted to an array.
+    axis : int or None, optional
+        Axis along which the geometric mean is computed. Default is 0.
+        If None, compute over the whole array `a`.
+    dtype : dtype, optional
+        Type to which the input arrays are cast before the calculation is
+        performed.
+    weights : array_like, optional
+        The `weights` array must be broadcastable to the same shape as `a`.
+        Default is None, which gives each value a weight of 1.0.
+
+    Returns
+    -------
+    gmean : ndarray
+        See `dtype` parameter above.
+
+    See Also
+    --------
+    numpy.mean : Arithmetic average
+    numpy.average : Weighted average
+    hmean : Harmonic mean
+
+    Notes
+    -----
+    The sample geometric mean is the exponential of the mean of the natural
+    logarithms of the observations.
+    Negative observations will produce NaNs in the output because the *natural*
+    logarithm (as opposed to the *complex* logarithm) is defined only for
+    non-negative reals.
+
+    References
+    ----------
+    .. [1] "Weighted Geometric Mean", *Wikipedia*,
+           https://en.wikipedia.org/wiki/Weighted_geometric_mean.
+    .. [2] Grossman, J., Grossman, M., Katz, R., "Averages: A New Approach",
+           Archimedes Foundation, 1983
+
+    Examples
+    --------
+    >>> from scipy.stats import gmean
+    >>> gmean([1, 4])
+    2.0
+    >>> gmean([1, 2, 3, 4, 5, 6, 7])
+    3.3800151591412964
+    >>> gmean([1, 4, 7], weights=[3, 1, 3])
+    2.80668351922014
+
+    """
+    xp = array_namespace(a, weights)
+    a = xp.asarray(a, dtype=dtype)
+    if weights is not None:
+        weights = xp.asarray(weights, dtype=dtype)
+
+    # Zeros make log(a) = -inf; silence the divide warning for that step only.
+    with np.errstate(divide='ignore'):
+        logs = xp.log(a)
+
+    return xp.exp(_xp_mean(logs, axis=axis, weights=weights))
+
+
+@xp_capabilities(jax_jit=False, allow_dask_compute=1)
+@_axis_nan_policy_factory(
+        lambda x: x, n_samples=1, n_outputs=1, too_small=0, paired=True,
+        result_to_tuple=lambda x, _: (x,), kwd_samples=['weights'])
+def hmean(a, axis=0, dtype=None, *, weights=None):
+    r"""Calculate the weighted harmonic mean along the specified axis.
+
+    The weighted harmonic mean of the array :math:`a_i` associated to weights
+    :math:`w_i` is:
+
+    .. math::
+
+        \frac{ \sum_{i=1}^n w_i }{ \sum_{i=1}^n \frac{w_i}{a_i} } \, ,
+
+    and, with equal weights, it gives:
+
+    .. math::
+
+        \frac{ n }{ \sum_{i=1}^n \frac{1}{a_i} } \, .
+
+    Parameters
+    ----------
+    a : array_like
+        Input array, masked array or object that can be converted to an array.
+    axis : int or None, optional
+        Axis along which the harmonic mean is computed. Default is 0.
+        If None, compute over the whole array `a`.
+    dtype : dtype, optional
+        Type of the returned array and of the accumulator in which the
+        elements are summed. If `dtype` is not specified, it defaults to the
+        dtype of `a`, unless `a` has an integer `dtype` with a precision less
+        than that of the default platform integer. In that case, the default
+        platform integer is used.
+    weights : array_like, optional
+        The weights array can either be 1-D (in which case its length must be
+        the size of `a` along the given `axis`) or of the same shape as `a`.
+        Default is None, which gives each value a weight of 1.0.
+
+        .. versionadded:: 1.9
+
+    Returns
+    -------
+    hmean : ndarray
+        See `dtype` parameter above.
+
+    See Also
+    --------
+    numpy.mean : Arithmetic average
+    numpy.average : Weighted average
+    gmean : Geometric mean
+
+    Notes
+    -----
+    The sample harmonic mean is the reciprocal of the mean of the reciprocals
+    of the observations.
+
+    The harmonic mean is computed over a single dimension of the input
+    array, axis=0 by default, or all values in the array if axis=None.
+    float64 intermediate and return values are used for integer inputs.
+
+    The harmonic mean is only defined if all observations are non-negative;
+    otherwise, the result is NaN.
+
+    References
+    ----------
+    .. [1] "Weighted Harmonic Mean", *Wikipedia*,
+           https://en.wikipedia.org/wiki/Harmonic_mean#Weighted_harmonic_mean
+    .. [2] Ferger, F., "The nature and use of the harmonic mean", Journal of
+           the American Statistical Association, vol. 26, pp. 36-40, 1931
+
+    Examples
+    --------
+    >>> from scipy.stats import hmean
+    >>> hmean([1, 4])
+    1.6000000000000001
+    >>> hmean([1, 2, 3, 4, 5, 6, 7])
+    2.6997245179063363
+    >>> hmean([1, 4, 7], weights=[3, 1, 3])
+    1.9029126213592233
+
+    """
+    xp = array_namespace(a, weights)
+    a = xp.asarray(a, dtype=dtype)
+    if weights is not None:
+        weights = xp.asarray(weights, dtype=dtype)
+
+    is_negative = a < 0
+    if xp.any(is_negative):
+        # `where` avoids having to be careful about dtypes and will work with
+        # JAX. This is the exceptional case, so it's OK to be a little slower.
+        # Won't work for array_api_strict for now, but see data-apis/array-api#807
+        a = xp.where(is_negative, xp.nan, a)
+        warnings.warn("The harmonic mean is only defined if all elements are "
+                      "non-negative; otherwise, the result is NaN.",
+                      RuntimeWarning, stacklevel=2)
+
+    with np.errstate(divide='ignore'):
+        reciprocal_mean = _xp_mean(1.0 / a, axis=axis, weights=weights)
+        return 1.0 / reciprocal_mean
+
+
+@xp_capabilities(jax_jit=False, allow_dask_compute=1)
+@_axis_nan_policy_factory(
+        lambda x: x, n_samples=1, n_outputs=1, too_small=0, paired=True,
+        result_to_tuple=lambda x, _: (x,), kwd_samples=['weights'])
+def pmean(a, p, *, axis=0, dtype=None, weights=None):
+    r"""Calculate the weighted power mean along the specified axis.
+
+    The weighted power mean of the array :math:`a_i` associated to weights
+    :math:`w_i` is:
+
+    .. math::
+
+        \left( \frac{ \sum_{i=1}^n w_i a_i^p }{ \sum_{i=1}^n w_i }
+              \right)^{ 1 / p } \, ,
+
+    and, with equal weights, it gives:
+
+    .. math::
+
+        \left( \frac{ 1 }{ n } \sum_{i=1}^n a_i^p \right)^{ 1 / p }  \, .
+
+    When ``p=0``, it returns the geometric mean.
+
+    This mean is also called generalized mean or Hölder mean, and must not be
+    confused with the Kolmogorov generalized mean, also called
+    quasi-arithmetic mean or generalized f-mean [3]_.
+
+    Parameters
+    ----------
+    a : array_like
+        Input array, masked array or object that can be converted to an array.
+    p : int or float
+        Exponent. Must be finite.
+    axis : int or None, optional
+        Axis along which the power mean is computed. Default is 0.
+        If None, compute over the whole array `a`.
+    dtype : dtype, optional
+        Type of the returned array and of the accumulator in which the
+        elements are summed. If `dtype` is not specified, it defaults to the
+        dtype of `a`, unless `a` has an integer `dtype` with a precision less
+        than that of the default platform integer. In that case, the default
+        platform integer is used.
+    weights : array_like, optional
+        The weights array can either be 1-D (in which case its length must be
+        the size of `a` along the given `axis`) or of the same shape as `a`.
+        Default is None, which gives each value a weight of 1.0.
+
+    Returns
+    -------
+    pmean : ndarray, see `dtype` parameter above.
+        Output array containing the power mean values.
+
+    See Also
+    --------
+    numpy.average : Weighted average
+    gmean : Geometric mean
+    hmean : Harmonic mean
+
+    Notes
+    -----
+    The power mean is computed over a single dimension of the input
+    array, ``axis=0`` by default, or all values in the array if ``axis=None``.
+    float64 intermediate and return values are used for integer inputs.
+
+    The power mean is only defined if all observations are non-negative;
+    otherwise, the result is NaN.
+
+    .. versionadded:: 1.9
+
+    References
+    ----------
+    .. [1] "Generalized Mean", *Wikipedia*,
+           https://en.wikipedia.org/wiki/Generalized_mean
+    .. [2] Norris, N., "Convexity properties of generalized mean value
+           functions", The Annals of Mathematical Statistics, vol. 8,
+           pp. 118-120, 1937
+    .. [3] Bullen, P.S., Handbook of Means and Their Inequalities, 2003
+
+    Examples
+    --------
+    >>> from scipy.stats import pmean, hmean, gmean
+    >>> pmean([1, 4], 1.3)
+    2.639372938300652
+    >>> pmean([1, 2, 3, 4, 5, 6, 7], 1.3)
+    4.157111214492084
+    >>> pmean([1, 4, 7], -2, weights=[3, 1, 3])
+    1.4969684896631954
+
+    For p=-1, power mean is equal to harmonic mean:
+
+    >>> pmean([1, 4, 7], -1, weights=[3, 1, 3])
+    1.9029126213592233
+    >>> hmean([1, 4, 7], weights=[3, 1, 3])
+    1.9029126213592233
+
+    For p=0, power mean is defined as the geometric mean:
+
+    >>> pmean([1, 4, 7], 0, weights=[3, 1, 3])
+    2.80668351922014
+    >>> gmean([1, 4, 7], weights=[3, 1, 3])
+    2.80668351922014
+
+    """
+    if not isinstance(p, int | float):
+        raise ValueError("Power mean only defined for exponent of type int or "
+                         "float.")
+    if p == 0:
+        return gmean(a, axis=axis, dtype=dtype, weights=weights)
+    elif math.isinf(p):
+        message = "Power mean only implemented for finite `p`"
+        raise NotImplementedError(message)
+
+    xp = array_namespace(a, weights)
+    a = xp.asarray(a, dtype=dtype)
+
+    if weights is not None:
+        weights = xp.asarray(weights, dtype=dtype)
+
+    negative_mask = a < 0
+    if xp.any(negative_mask):
+        # `where` avoids having to be careful about dtypes and will work with
+        # JAX. This is the exceptional case, so it's OK to be a little slower.
+        # Won't work for array_api_strict for now, but see data-apis/array-api#807
+        a = xp.where(negative_mask, xp.nan, a)
+        message = ("The power mean is only defined if all elements are "
+                   "non-negative; otherwise, the result is NaN.")
+        warnings.warn(message, RuntimeWarning, stacklevel=2)
+
+    with np.errstate(divide='ignore', invalid='ignore'):
+        return _xp_mean(a**float(p), axis=axis, weights=weights)**(1/p)
+
+
+ModeResult = namedtuple('ModeResult', ['mode', 'count'])
+
+
+def _mode_result(mode, count):
+    # `_axis_nan_policy` automatically produces NaN for both `mode` and
+    # `count` when a slice is empty. NaN is a reasonable mode of an empty
+    # slice, but its count should be zero, so NaN counts are replaced by 0.
+    xp = array_namespace(mode, count)
+    count_is_nan = xp.isnan(count)
+    if count_is_nan.shape != ():
+        count = xpx.at(count)[count_is_nan].set(0)
+    elif count_is_nan:
+        count = xp.asarray(0, dtype=count.dtype)[()]
+    return ModeResult(mode, count)
+
+
+@xp_capabilities(skip_backends=[('dask.array', "can't compute chunk size"),
+                                ('jax.numpy', "relies on _axis_nan_policy"),
+                                ('cupy', "data-apis/array-api-compat#312")])
+@_axis_nan_policy_factory(_mode_result, override={'nan_propagation': False})
+def mode(a, axis=0, nan_policy='propagate', keepdims=False):
+    r"""Return an array of the modal (most common) value in the passed array.
+
+    If there is more than one such value, only one is returned.
+    The bin-count for the modal bins is also returned.
+
+    Parameters
+    ----------
+    a : array_like
+        Numeric, n-dimensional array of which to find mode(s).
+    axis : int or None, optional
+        Axis along which to operate. Default is 0. If None, compute over
+        the whole array `a`.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan.
+        The following options are available (default is 'propagate'):
+
+          * 'propagate': treats nan as it would treat any other value
+          * 'raise': throws an error
+          * 'omit': performs the calculations ignoring nan values
+    keepdims : bool, optional
+        If set to ``False``, the `axis` over which the statistic is taken
+        is consumed (eliminated from the output array). If set to ``True``,
+        the `axis` is retained with size one, and the result will broadcast
+        correctly against the input array.
+
+    Returns
+    -------
+    mode : ndarray
+        Array of modal values.
+    count : ndarray
+        Array of counts for each mode.
+
+    Notes
+    -----
+    The mode is calculated using `numpy.unique`.
+    In NumPy versions 1.21 and after, all NaNs - even those with different
+    binary representations - are treated as equivalent and counted as separate
+    instances of the same value.
+
+    By convention, the mode of an empty array is NaN, and the associated count
+    is zero.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> a = np.array([[3, 0, 3, 7],
+    ...               [3, 2, 6, 2],
+    ...               [1, 7, 2, 8],
+    ...               [3, 0, 6, 1],
+    ...               [3, 2, 5, 5]])
+    >>> from scipy import stats
+    >>> stats.mode(a, keepdims=True)
+    ModeResult(mode=array([[3, 0, 6, 1]]), count=array([[4, 2, 2, 1]]))
+
+    To get mode of whole array, specify ``axis=None``:
+
+    >>> stats.mode(a, axis=None, keepdims=True)
+    ModeResult(mode=[[3]], count=[[5]])
+    >>> stats.mode(a, axis=None, keepdims=False)
+    ModeResult(mode=3, count=5)
+
+    """
+    xp = array_namespace(a)
+
+    # `axis`, `nan_policy`, and `keepdims` are handled by `_axis_nan_policy`
+    if not xp.isdtype(a.dtype, 'numeric'):
+        message = ("Argument `a` is not recognized as numeric. "
+                   "Support for input that cannot be coerced to a numeric "
+                   "array was deprecated in SciPy 1.9.0 and removed in SciPy "
+                   "1.11.0. Please consider `np.unique`.")
+        raise TypeError(message)
+
+    if xp_size(a) == 0:  # empty input: the mode is NaN and the count is 0
+        NaN = _get_nan(a, xp=xp)
+        return ModeResult(*xp.asarray([NaN, 0], dtype=NaN.dtype))
+
+    if a.ndim == 1:
+        vals, cnts = xp.unique_counts(a)
+        # in contrast with np.unique, `unique_counts` treats all NaNs as distinct,
+        # but we have always grouped them. Replace `cnts` corresponding with NaNs
+        # with the number of NaNs.
+        mask = xp.isnan(vals)
+        cnts = xpx.at(cnts)[mask].set(xp.count_nonzero(mask))
+        modes, counts = vals[xp.argmax(cnts)], xp.max(cnts)
+        default_int = xp.asarray(1).dtype  # default int dtype; a CI job failed without it
+        counts = xp.astype(counts, default_int, copy=False)
+        modes = modes[()] if modes.ndim == 0 else modes  # index 0-d result with ()
+        counts = counts[()] if counts.ndim == 0 else counts  # likewise for counts
+        return ModeResult(modes, counts)
+
+    # `axis` is always -1 after the `_axis_nan_policy` decorator
+    y = xp.sort(a, axis=-1)
+    # Boolean array marking the first element of each run of equal values; an
+    # element that follows a NaN never starts a new run, so NaNs stay grouped
+    i = xp.concat([xp.ones(y.shape[:-1] + (1,), dtype=xp.bool),
+                  (y[..., :-1] != y[..., 1:]) & ~xp.isnan(y[..., :-1])], axis=-1)
+    # Get linear integer indices of these elements in a raveled array
+    indices = xp.arange(xp_size(y), device=xp_device(y))[xp_ravel(i)]
+    # The difference between integer indices is the number of repeats
+    append = xp.full(indices.shape[:-1] + (1,), xp_size(y), dtype=indices.dtype)
+    counts = xp.diff(indices, append=append)
+    # Now we form an array of `counts` corresponding with each element of `y`...
+    counts = xp.reshape(xp.repeat(counts, counts), y.shape)
+    # ... so we can get the argmax of *each slice* separately.
+    k = xp.argmax(counts, axis=-1, keepdims=True)
+    # Extract the corresponding element/count, and eliminate the reduced dimension
+    modes = xp.take_along_axis(y, k, axis=-1)[..., 0]
+    counts = xp.take_along_axis(counts, k, axis=-1)[..., 0]
+    modes = modes[()] if modes.ndim == 0 else modes  # index 0-d result with ()
+    counts = counts[()] if counts.ndim == 0 else counts  # likewise for counts
+    return ModeResult(modes, counts)
+
+
+def _put_val_to_limits(a, limits, inclusive, val=np.nan, xp=None):
+    """Replace the elements of `a` that fall outside `limits` with `val`.
+
+    Returns the (possibly modified) array and the boolean mask of replaced
+    elements. Raises ValueError if every element of a non-lazy array is masked.
+
+    Parameters
+    ----------
+    a : array
+    limits : (float or None, float or None) or None
+        The (lower limit, upper limit) pair; None for either limit leaves
+        that side open, and ``limits=None`` disables trimming altogether.
+    inclusive : (bool, bool)
+        The (lower flag, upper flag) pair: whether elements exactly equal to
+        the corresponding limit are kept (True) or replaced (False).
+    val : float, default: NaN
+        The value written over out-of-limit elements.
+
+    """
+    xp = array_namespace(a) if xp is None else xp
+    outside = xp.zeros_like(a, dtype=xp.bool)
+    if limits is None:
+        return a, outside
+    (lo, hi), (keep_lo, keep_hi) = limits, inclusive
+    if lo is not None:
+        outside = outside | (a < lo if keep_lo else a <= lo)
+    if hi is not None:
+        outside = outside | (a > hi if keep_hi else a >= hi)
+    # Lazy arrays skip the data-dependent checks: no error, unconditional `where`
+    deferred = is_lazy_array(outside)
+    if not deferred and xp.all(outside):
+        raise ValueError("No array values within given limits")
+    if deferred or xp.any(outside):
+        a = xp.where(outside, val, a)
+    return a, outside
+
+
+@xp_capabilities()
+@_axis_nan_policy_factory(
+    lambda x: x, n_outputs=1, default_axis=None,
+    result_to_tuple=lambda x, _: (x,)
+)
+def tmean(a, limits=None, inclusive=(True, True), axis=None):
+    """Compute the trimmed mean.
+
+    This function finds the arithmetic mean of the values of `a` that lie
+    within the given `limits`; values outside the limits are ignored.
+
+    Parameters
+    ----------
+    a : array_like
+        Array of values.
+    limits : None or (lower limit, upper limit), optional
+        Values in the input array less than the lower limit or greater than the
+        upper limit will be ignored.  When limits is None (default), then all
+        values are used.  Either of the limit values in the tuple can also be
+        None representing a half-open interval.
+    inclusive : (bool, bool), optional
+        A tuple consisting of the (lower flag, upper flag).  These flags
+        determine whether values exactly equal to the lower or upper limits
+        are included.  The default value is (True, True).
+    axis : int or None, optional
+        Axis along which to compute the mean. Default is None.
+
+    Returns
+    -------
+    tmean : ndarray
+        Trimmed mean (NaN for slices with no value within `limits`).
+
+    See Also
+    --------
+    trim_mean : Returns mean after trimming a proportion from both tails.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> x = np.arange(20)
+    >>> stats.tmean(x)
+    9.5
+    >>> stats.tmean(x, (3,17))
+    10.0
+
+    """
+    xp = array_namespace(a)
+    a, mask = _put_val_to_limits(a, limits, inclusive, val=0., xp=xp)
+    # explicit dtype specification required due to data-apis/array-api-compat#152
+    total = xp.sum(a, axis=axis, dtype=a.dtype)
+    kept = xp.asarray(~mask, dtype=a.dtype, device=xp_device(a))
+    n = xp.sum(kept, axis=axis, dtype=a.dtype)
+    mean = xpx.apply_where(n != 0, (total, n), operator.truediv, fill_value=xp.nan)
+    return mean[()] if mean.ndim == 0 else mean
+
+
+@xp_capabilities()
+@_axis_nan_policy_factory(
+    lambda x: x, n_outputs=1, result_to_tuple=lambda x, _: (x,)
+)
+def tvar(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
+    """Compute the trimmed variance.
+
+    This function computes the sample variance of the values of `a` that
+    lie within the given `limits`; values outside the limits are ignored.
+
+    Parameters
+    ----------
+    a : array_like
+        Array of values.
+    limits : None or (lower limit, upper limit), optional
+        Values in the input array less than the lower limit or greater than the
+        upper limit will be ignored. When limits is None, then all values are
+        used. Either of the limit values in the tuple can also be None
+        representing a half-open interval.  The default value is None.
+    inclusive : (bool, bool), optional
+        A tuple consisting of the (lower flag, upper flag).  These flags
+        determine whether values exactly equal to the lower or upper limits
+        are included.  The default value is (True, True).
+    axis : int or None, optional
+        Axis along which to operate. Default is 0. If None, compute over the
+        whole array `a`.
+    ddof : int, optional
+        Delta degrees of freedom.  Default is 1.
+
+    Returns
+    -------
+    tvar : float
+        Trimmed variance.
+
+    Notes
+    -----
+    `tvar` computes the unbiased sample variance, i.e. it uses a correction
+    factor ``n / (n - 1)``.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> x = np.arange(20)
+    >>> stats.tvar(x)
+    35.0
+    >>> stats.tvar(x, (3,17))
+    20.0
+
+    """
+    xp = array_namespace(a)
+    trimmed = _put_val_to_limits(a, limits, inclusive, xp=xp)[0]
+    # Out-of-limit values are NaN now, so nan_policy='omit' performs the trim.
+    # Currently this behaves like nan_policy='omit' for alternative array
+    # backends too; 'propagate' is to be handled by the decorator shortly.
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", SmallSampleWarning)
+        return _xp_var(trimmed, correction=ddof, axis=axis, nan_policy='omit', xp=xp)
+
+
+@xp_capabilities()
+@_axis_nan_policy_factory(
+    lambda x: x, n_outputs=1, result_to_tuple=lambda x, _: (x,)
+)
+def tmin(a, lowerlimit=None, axis=0, inclusive=True, nan_policy='propagate'):
+    """Compute the trimmed minimum.
+
+    Find the minimum of `a` along the specified axis, considering only the
+    values at or above `lowerlimit` (strictly above it when `inclusive` is
+    False).
+
+    Parameters
+    ----------
+    a : array_like
+        Array of values.
+    lowerlimit : None or float, optional
+        Values in the input array less than the given limit will be ignored.
+        When lowerlimit is None, then all values are used. The default value
+        is None.
+    axis : int or None, optional
+        Axis along which to operate. Default is 0. If None, compute over the
+        whole array `a`.
+    inclusive : {True, False}, optional
+        This flag determines whether values exactly equal to the lower limit
+        are included.  The default value is True.
+
+    Returns
+    -------
+    tmin : float, int or ndarray
+        Trimmed minimum; NaN where every value was trimmed.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> x = np.arange(20)
+    >>> stats.tmin(x)
+    0
+
+    >>> stats.tmin(x, 13)
+    13
+
+    >>> stats.tmin(x, 13, inclusive=False)
+    14
+
+    """
+    xp = array_namespace(a)
+
+    # Trimmed elements are overwritten with the largest value of the dtype.
+    if xp.isdtype(a.dtype, 'integral'):
+        sentinel = xp.iinfo(a.dtype).max
+    else:
+        sentinel = xp.inf
+    a, dropped = _put_val_to_limits(a, (lowerlimit, None), (inclusive, None),
+                                    val=sentinel, xp=xp)
+    smallest = xp.min(a, axis=axis)
+    all_dropped = xp.all(dropped, axis=axis)  # every element is below the limit
+    # Output dtype is data-dependent on eager backends; ints may lose precision.
+    if is_lazy_array(all_dropped) or xp.any(all_dropped):
+        smallest = xp_promote(smallest, force_floating=True, xp=xp)
+        smallest = xp.where(all_dropped, xp.nan, smallest)
+    return smallest[()] if smallest.ndim == 0 else smallest
+
+
+@xp_capabilities()
+@_axis_nan_policy_factory(
+    lambda x: x, n_outputs=1, result_to_tuple=lambda x, _: (x,)
+)
+def tmax(a, upperlimit=None, axis=0, inclusive=True, nan_policy='propagate'):
+    """Compute the trimmed maximum.
+
+    Find the maximum of `a` along the given axis, considering only the values
+    at or below `upperlimit` (strictly below it when `inclusive` is False).
+
+    Parameters
+    ----------
+    a : array_like
+        Array of values.
+    upperlimit : None or float, optional
+        Values in the input array greater than the given limit will be ignored.
+        When upperlimit is None, then all values are used. The default value
+        is None.
+    axis : int or None, optional
+        Axis along which to operate. Default is 0. If None, compute over the
+        whole array `a`.
+    inclusive : {True, False}, optional
+        This flag determines whether values exactly equal to the upper limit
+        are included.  The default value is True.
+
+    Returns
+    -------
+    tmax : float, int or ndarray
+        Trimmed maximum; NaN where every value was trimmed.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> x = np.arange(20)
+    >>> stats.tmax(x)
+    19
+
+    >>> stats.tmax(x, 13)
+    13
+
+    >>> stats.tmax(x, 13, inclusive=False)
+    12
+
+    """
+    xp = array_namespace(a)
+
+    # Trimmed elements are overwritten with the smallest value of the dtype.
+    if xp.isdtype(a.dtype, 'integral'):
+        sentinel = xp.iinfo(a.dtype).min
+    else:
+        sentinel = -xp.inf
+    a, dropped = _put_val_to_limits(a, (None, upperlimit), (None, inclusive),
+                                    val=sentinel, xp=xp)
+    largest = xp.max(a, axis=axis)
+    all_dropped = xp.all(dropped, axis=axis)  # every element is above the limit
+    # Output dtype is data-dependent on eager backends; ints may lose precision.
+    if is_lazy_array(all_dropped) or xp.any(all_dropped):
+        largest = xp_promote(largest, force_floating=True, xp=xp)
+        largest = xp.where(all_dropped, xp.nan, largest)
+    return largest[()] if largest.ndim == 0 else largest
+
+
+@xp_capabilities()
+@_axis_nan_policy_factory(
+    lambda x: x, n_outputs=1, result_to_tuple=lambda x, _: (x,)
+)
+def tstd(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
+    """Compute the trimmed sample standard deviation.
+
+    This function finds the sample standard deviation of the values of `a`
+    that lie within the given `limits`; it is the square root of `tvar`.
+
+    Parameters
+    ----------
+    a : array_like
+        Array of values.
+    limits : None or (lower limit, upper limit), optional
+        Values in the input array less than the lower limit or greater than the
+        upper limit will be ignored. When limits is None, then all values are
+        used. Either of the limit values in the tuple can also be None
+        representing a half-open interval.  The default value is None.
+    inclusive : (bool, bool), optional
+        A tuple consisting of the (lower flag, upper flag).  These flags
+        determine whether values exactly equal to the lower or upper limits
+        are included.  The default value is (True, True).
+    axis : int or None, optional
+        Axis along which to operate. Default is 0. If None, compute over the
+        whole array `a`.
+    ddof : int, optional
+        Delta degrees of freedom.  Default is 1.
+
+    Returns
+    -------
+    tstd : float
+        Trimmed sample standard deviation.
+
+    Notes
+    -----
+    `tstd` computes the unbiased sample standard deviation, i.e. it uses a
+    correction factor ``n / (n - 1)``.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> x = np.arange(20)
+    >>> stats.tstd(x)
+    5.9160797830996161
+    >>> stats.tstd(x, (3,17))
+    4.4721359549995796
+    """
+    trimmed_var = tvar(a, limits, inclusive, axis, ddof, _no_deco=True)
+    return trimmed_var**0.5
+
+
+@xp_capabilities()
+@_axis_nan_policy_factory(
+    lambda x: x, n_outputs=1, result_to_tuple=lambda x, _: (x,)
+)
+def tsem(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
+    """Compute the trimmed standard error of the mean.
+
+    This function finds the standard error of the mean of the values of `a`
+    that lie within the given `limits`; values outside them are ignored.
+
+    Parameters
+    ----------
+    a : array_like
+        Array of values.
+    limits : None or (lower limit, upper limit), optional
+        Values in the input array less than the lower limit or greater than the
+        upper limit will be ignored. When limits is None, then all values are
+        used. Either of the limit values in the tuple can also be None
+        representing a half-open interval.  The default value is None.
+    inclusive : (bool, bool), optional
+        A tuple consisting of the (lower flag, upper flag).  These flags
+        determine whether values exactly equal to the lower or upper limits
+        are included.  The default value is (True, True).
+    axis : int or None, optional
+        Axis along which to operate. Default is 0. If None, compute over the
+        whole array `a`.
+    ddof : int, optional
+        Delta degrees of freedom.  Default is 1.
+
+    Returns
+    -------
+    tsem : float
+        Trimmed standard error of the mean.
+
+    Notes
+    -----
+    `tsem` uses unbiased sample standard deviation, i.e. it uses a
+    correction factor ``n / (n - 1)``.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> x = np.arange(20)
+    >>> stats.tsem(x)
+    1.3228756555322954
+    >>> stats.tsem(x, (3,17))
+    1.1547005383792515
+
+    """
+    xp = array_namespace(a)
+    a = _put_val_to_limits(a, limits, inclusive, xp=xp)[0]
+    # Out-of-limit values are NaN now, so nan_policy='omit' performs the trim.
+    # Currently this behaves like nan_policy='omit' for alternative array
+    # backends too; 'propagate' is to be handled by the decorator shortly.
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", SmallSampleWarning)
+        var = _xp_var(a, correction=ddof, axis=axis, nan_policy='omit', xp=xp)
+        std = var**0.5
+
+    is_observed = xp.astype(~xp.isnan(a), a.dtype)
+    n_obs = xp.sum(is_observed, axis=axis, dtype=std.dtype)
+    return std / n_obs**0.5
+
+
+#####################################
+#              MOMENTS              #
+#####################################
+
+
+def _moment_outputs(kwds, default_order=1):
+    # Number of outputs of `moment`: one per requested order.
+    order = np.atleast_1d(kwds.get('order', default_order))
+    if order.ndim > 1 or order.size == 0:
+        raise ValueError("`order` must be a scalar or a non-empty 1D array.")
+    return len(order)
+
+
+def _moment_result_object(*args):
+    # A single output is returned as-is; otherwise the outputs are stacked.
+    if len(args) != 1:
+        return array_namespace(*args).stack(args)
+    return args[0]
+
+
+# With several orders, `moment` returns one *array* whose zeroth axis indexes
+# the outputs; split it so each output has its reduced axes restored separately.
+def _moment_tuple(x, n_out):
+    if n_out <= 1:
+        return (x,)
+    return tuple(x[i, ...] for i in range(x.shape[0]))
+
+
+# `moment` fits into the `_axis_nan_policy` pattern, but it is a bit unusual
+# because the number of outputs is variable. Specifically,
+# `result_to_tuple=lambda x: (x,)` may be surprising for a function that
+# can produce more than one output, but it is intended here.
+# NOTE(review): the decorator now passes `_moment_tuple` instead - confirm.
+# When `moment` is called to produce the output:
+# - `result_to_tuple` packs the returned array into a single-element tuple,
+# - `_moment_result_object` extracts and returns that single element.
+# However, when the input array is empty, `moment` is never called. Instead,
+# - `_check_empty_inputs` is used to produce an empty array with the
+#   appropriate dimensions.
+# - A list comprehension creates the appropriate number of copies of this
+#   array, depending on `n_outputs`.
+# - This list - which may have multiple elements - is passed into
+#   `_moment_result_object`.
+# - If there is a single output, `_moment_result_object` extracts and returns
+#   the single output from the list.
+# - If there are multiple outputs, and therefore multiple elements in the list,
+#   `_moment_result_object` converts the list of arrays to a single array and
+#   returns it.
+# Currently, this leads to a slight inconsistency: for an empty input array,
+# `order=1` and `order=[1]` are indistinguishable; the latter *should* produce
+# the same as the former but with a singleton zeroth dimension.
+@xp_capabilities(jax_jit=False, allow_dask_compute=True)
+@_rename_parameter('moment', 'order')
+@_axis_nan_policy_factory(  # noqa: E302
+    _moment_result_object, n_samples=1, result_to_tuple=_moment_tuple,
+    n_outputs=_moment_outputs
+)
+def moment(a, order=1, axis=0, nan_policy='propagate', *, center=None):
+    r"""Calculate the nth moment about the mean for a sample.
+
+    A moment is a specific quantitative measure of the shape of a set of
+    points. It is often used to calculate coefficients of skewness and kurtosis
+    due to its close relationship with them.
+
+    Parameters
+    ----------
+    a : array_like
+       Input array.
+    order : int or 1-D array_like of ints, optional
+       Order of central moment that is returned. Default is 1.
+    axis : int or None, optional
+       Axis along which the central moment is computed. Default is 0.
+       If None, compute over the whole array `a`.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan.
+        The following options are available (default is 'propagate'):
+
+          * 'propagate': returns nan
+          * 'raise': throws an error
+          * 'omit': performs the calculations ignoring nan values
+
+    center : float or None, optional
+       The point about which moments are taken. This can be the sample mean,
+       the origin, or any other point. If `None` (default) compute the
+       center as the sample mean.
+
+    Returns
+    -------
+    n-th moment about the `center` : ndarray or float
+       The appropriate moment along the given axis or over all values if axis
+       is None. The denominator for the moment calculation is the number of
+       observations, no degrees of freedom correction is done.
+
+    See Also
+    --------
+    kurtosis, skew, describe
+
+    Notes
+    -----
+    The k-th moment of a data sample is:
+
+    .. math::
+
+        m_k = \frac{1}{n} \sum_{i = 1}^n (x_i - c)^k
+
+    Where `n` is the number of samples, and `c` is the center around which the
+    moment is calculated. This function uses exponentiation by squares [1]_ for
+    efficiency.
+
+    Note that, if `a` is an empty array (``a.size == 0``), array `order` with
+    one element (``order.size == 1``) is treated the same as scalar `order`
+    (``np.isscalar(order)``). This might produce arrays of unexpected shape.
+
+    References
+    ----------
+    .. [1] https://eli.thegreenplace.net/2009/03/21/efficient-integer-exponentiation-algorithms
+
+    Examples
+    --------
+    >>> from scipy.stats import moment
+    >>> moment([1, 2, 3, 4, 5], order=1)
+    0.0
+    >>> moment([1, 2, 3, 4, 5], order=2)
+    2.0
+
+    """
+    xp = array_namespace(a)
+    a, axis = _chk_asarray(a, axis, xp=xp)
+
+    a = xp_promote(a, force_floating=True, xp=xp)
+
+    order = xp.asarray(order, dtype=a.dtype, device=xp_device(a))
+    if xp_size(order) == 0:
+        # This is tested by `_moment_outputs`, which is run by the `_axis_nan_policy`
+        # decorator. Currently, the `_axis_nan_policy` decorator is skipped when `a`
+        # is a non-NumPy array, so we need to check again. When the decorator is
+        # updated for array API compatibility, we can remove this second check.
+        raise ValueError("`order` must be a scalar or a non-empty 1D array.")
+    if xp.any(order != xp.round(order)):
+        raise ValueError("All elements of `order` must be integral.")
+    order = order[()] if order.ndim == 0 else order
+
+    # for array_like order input, return a value for each.
+    if order.ndim > 0:
+        # Calculate the mean at most once, and only if it will be used
+        calculate_mean = center is None and xp.any(order > 1)
+        mean = xp.mean(a, axis=axis, keepdims=True) if calculate_mean else None
+        mmnt = []
+        for i in range(order.shape[0]):
+            order_i = order[i]
+            if center is None and order_i > 1:
+                mmnt.append(_moment(a, order_i, axis, mean=mean)[np.newaxis, ...])
+            else:
+                mmnt.append(_moment(a, order_i, axis, mean=center)[np.newaxis, ...])
+        return xp.concat(mmnt, axis=0)
+    else:
+        return _moment(a, order, axis, mean=center)
+
+
+def _demean(a, mean, axis, *, xp, precision_warning=True):
+    # Return `a - mean`, warning if there is catastrophic cancellation. `mean`
+    # must be the mean of `a` along axis with `keepdims=True`.
+    # Used in e.g. `_moment`, `_zscore`, `_xp_var`. See gh-15905.
+    a_zero_mean = a - mean
+
+    if (xp_size(a_zero_mean) == 0 or not precision_warning
+        or is_lazy_array(a_zero_mean)):
+        return a_zero_mean
+
+    eps = xp.finfo(mean.dtype).eps * 10
+    with np.errstate(divide='ignore', invalid='ignore'):
+        rel_diff = xp.max(xp.abs(a_zero_mean), axis=axis,
+                          keepdims=True) / xp.abs(mean)
+
+    n = _length_nonmasked(a, axis, xp=xp)
+    # Old NumPy doesn't accept `device` arg. Compare the major version as an
+    # integer: comparing version strings would misorder e.g. '10.0' and '2.0'.
+    old_numpy = xp is np and int(np.__version__.split('.')[0]) < 2
+    device = {} if old_numpy else {'device': xp_device(a)}
+    with np.errstate(invalid='ignore'):
+        precision_loss = xp.any(xp.asarray(rel_diff < eps, **device)
+                                & xp.asarray(n > 1, **device))
+
+    if precision_loss:
+        message = ("Precision loss occurred in moment calculation due to "
+                   "catastrophic cancellation. This occurs when the data "
+                   "are nearly identical. Results may be unreliable.")
+        warnings.warn(message, RuntimeWarning, stacklevel=5)
+    return a_zero_mean
+
+
+def _moment(a, order, axis, *, mean=None, xp=None):
+    """Vectorized calculation of the moment of `a` about a specified center.
+
+    When `mean` is None, the mean along `axis` is computed and used as the
+    center; otherwise, the provided value is used as the center.
+
+    """
+    xp = array_namespace(a) if xp is None else xp
+    a = xp_promote(a, force_floating=True, xp=xp)
+    dtype = a.dtype
+
+    # moment of empty array is the same regardless of order
+    if xp_size(a) == 0:
+        return xp.mean(a, axis=axis)
+
+    if order == 0 or (order == 1 and mean is None):
+        # By definition the zeroth moment is always 1, and the first *central*
+        # moment is 0.
+        shape = list(a.shape)
+        del shape[axis]
+        temp = (xp.ones(shape, dtype=dtype, device=xp_device(a)) if order == 0
+                else xp.zeros(shape, dtype=dtype, device=xp_device(a)))
+        return temp[()] if temp.ndim == 0 else temp
+
+    # Exponentiation by squares: form exponent sequence
+    n_list = [order]
+    current_n = order
+    while current_n > 2:
+        # Rebind rather than divide in place: `order` may be a mutable 0-d array
+        # (or a view of the caller's array) that is also stored in `n_list`.
+        if current_n % 2:
+            current_n = (current_n - 1) / 2
+        else:
+            current_n = current_n / 2
+        n_list.append(current_n)
+
+    # Starting point for exponentiation by squares
+    mean = (xp.mean(a, axis=axis, keepdims=True) if mean is None
+            else xp.asarray(mean, dtype=dtype))
+    mean = mean[()] if mean.ndim == 0 else mean
+    a_zero_mean = _demean(a, mean, axis, xp=xp)
+
+    if n_list[-1] == 1:
+        s = xp.asarray(a_zero_mean, copy=True)
+    else:
+        s = a_zero_mean**2
+
+    # Perform multiplications
+    for n in n_list[-2::-1]:
+        s = s**2
+        if n % 2:
+            s *= a_zero_mean
+    return xp.mean(s, axis=axis)
+
+
+def _var(x, axis=0, ddof=0, mean=None, xp=None):
+    # Variance of the sample; `_moment` warns if precision is lost.
+    xp = array_namespace(x) if xp is None else xp
+    var = _moment(x, 2, axis, mean=mean, xp=xp)
+    if ddof != 0:
+        n_obs = xp.asarray(_length_nonmasked(x, axis, xp=xp), dtype=x.dtype,
+                           device=xp_device(x))
+        var *= n_obs / (n_obs - ddof)  # to avoid error on division by zero
+    return var
+
+
+@xp_capabilities(jax_jit=False, allow_dask_compute=2)
+@_axis_nan_policy_factory(
+    lambda x: x, result_to_tuple=lambda x, _: (x,), n_outputs=1
+)
+# nan_policy handled by `_axis_nan_policy`, but needs to be left
+# in signature to preserve use as a positional argument
+def skew(a, axis=0, bias=True, nan_policy='propagate'):
+    r"""Compute the sample skewness of a data set.
+
+    For normally distributed data, the skewness should be about zero. For
+    unimodal continuous distributions, a skewness value greater than zero means
+    that there is more weight in the right tail of the distribution. The
+    function `skewtest` can be used to determine if the skewness value
+    is close enough to zero, statistically speaking.
+
+    Parameters
+    ----------
+    a : ndarray
+        Input array.
+    axis : int or None, optional
+        Axis along which skewness is calculated. Default is 0.
+        If None, compute over the whole array `a`.
+    bias : bool, optional
+        If False, then the calculations are corrected for statistical bias.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan.
+        The following options are available (default is 'propagate'):
+
+          * 'propagate': returns nan
+          * 'raise': throws an error
+          * 'omit': performs the calculations ignoring nan values
+
+    Returns
+    -------
+    skewness : ndarray
+        The skewness of values along an axis, returning NaN where all values
+        are equal.
+
+    Notes
+    -----
+    The sample skewness is computed as the Fisher-Pearson coefficient
+    of skewness, i.e.
+
+    .. math::
+
+        g_1=\frac{m_3}{m_2^{3/2}}
+
+    where
+
+    .. math::
+
+        m_i=\frac{1}{N}\sum_{n=1}^N(x[n]-\bar{x})^i
+
+    is the biased sample :math:`i\texttt{th}` central moment, and
+    :math:`\bar{x}` is
+    the sample mean.  If ``bias`` is False, the calculations are
+    corrected for bias and the value computed is the adjusted
+    Fisher-Pearson standardized moment coefficient, i.e.
+
+    .. math::
+
+        G_1=\frac{k_3}{k_2^{3/2}}=
+            \frac{\sqrt{N(N-1)}}{N-2}\frac{m_3}{m_2^{3/2}}.
+
+    References
+    ----------
+    .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
+       Probability and Statistics Tables and Formulae. Chapman & Hall: New
+       York. 2000.
+       Section 2.2.24.1
+
+    Examples
+    --------
+    >>> from scipy.stats import skew
+    >>> skew([1, 2, 3, 4, 5])
+    0.0
+    >>> skew([2, 8, 0, 4, 1, 9, 9, 0])
+    0.2650554122698573
+
+    """
+    xp = array_namespace(a)
+    a, axis = _chk_asarray(a, axis, xp=xp)
+    n = _length_nonmasked(a, axis, xp=xp)
+
+    mean = xp.mean(a, axis=axis, keepdims=True)
+    mean_reduced = xp.squeeze(mean, axis=axis)  # used in the zero-variance check
+    m2 = _moment(a, 2, axis, mean=mean, xp=xp)
+    m3 = _moment(a, 3, axis, mean=mean, xp=xp)
+    with np.errstate(all='ignore'):
+        eps = xp.finfo(m2.dtype).eps
+        zero = m2 <= (eps * mean_reduced)**2  # variance negligible vs. the mean
+        vals = xp.where(zero, xp.nan, m3 / m2**1.5)
+    if not bias:
+        can_correct = ~zero & (n > 2)  # the correction divides by (n - 2)
+        if is_lazy_array(can_correct) or xp.any(can_correct):
+            nval = ((n - 1.0) * n)**0.5 / (n - 2.0) * m3 / m2**1.5
+            vals = xp.where(can_correct, nval, vals)
+
+    return vals[()] if vals.ndim == 0 else vals
+
+
+@xp_capabilities(jax_jit=False, allow_dask_compute=2)
+@_axis_nan_policy_factory(
+    lambda x: x, result_to_tuple=lambda x, _: (x,), n_outputs=1
+)
+# nan_policy handled by `_axis_nan_policy`, but needs to be left
+# in signature to preserve use as a positional argument
+def kurtosis(a, axis=0, fisher=True, bias=True, nan_policy='propagate'):
+    """Compute the kurtosis (Fisher or Pearson) of a dataset.
+
+    Kurtosis is the fourth central moment divided by the square of the
+    variance. If Fisher's definition is used, then 3.0 is subtracted from
+    the result to give 0.0 for a normal distribution.
+
+    If bias is False then the kurtosis is calculated using k statistics to
+    eliminate bias coming from biased moment estimators
+
+    Use `kurtosistest` to see if result is close enough to normal.
+
+    Parameters
+    ----------
+    a : array
+        Data for which the kurtosis is calculated.
+    axis : int or None, optional
+        Axis along which the kurtosis is calculated. Default is 0.
+        If None, compute over the whole array `a`.
+    fisher : bool, optional
+        If True, Fisher's definition is used (normal ==> 0.0). If False,
+        Pearson's definition is used (normal ==> 3.0).
+    bias : bool, optional
+        If False, then the calculations are corrected for statistical bias.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan. 'propagate' returns nan,
+        'raise' throws an error, 'omit' performs the calculations ignoring nan
+        values. Default is 'propagate'.
+
+    Returns
+    -------
+    kurtosis : array
+        The kurtosis of values along an axis, returning NaN where all values
+        are equal.
+
+    References
+    ----------
+    .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
+       Probability and Statistics Tables and Formulae. Chapman & Hall: New
+       York. 2000.
+
+    Examples
+    --------
+    In Fisher's definition, the kurtosis of the normal distribution is zero.
+    In the following example, the kurtosis is close to zero, because it was
+    calculated from the dataset, not from the continuous distribution.
+
+    >>> import numpy as np
+    >>> from scipy.stats import norm, kurtosis
+    >>> data = norm.rvs(size=1000, random_state=3)
+    >>> kurtosis(data)
+    -0.06928694200380558
+
+    The distribution with a higher kurtosis has a heavier tail.
+    The zero valued kurtosis of the normal distribution in Fisher's definition
+    can serve as a reference point.
+
+    >>> import matplotlib.pyplot as plt
+    >>> import scipy.stats as stats
+    >>> from scipy.stats import kurtosis
+
+    >>> x = np.linspace(-5, 5, 100)
+    >>> ax = plt.subplot()
+    >>> distnames = ['laplace', 'norm', 'uniform']
+
+    >>> for distname in distnames:
+    ...     if distname == 'uniform':
+    ...         dist = getattr(stats, distname)(loc=-2, scale=4)
+    ...     else:
+    ...         dist = getattr(stats, distname)
+    ...     data = dist.rvs(size=1000)
+    ...     kur = kurtosis(data, fisher=True)
+    ...     y = dist.pdf(x)
+    ...     ax.plot(x, y, label="{}, {}".format(distname, round(kur, 3)))
+    ...     ax.legend()
+
+    The Laplace distribution has a heavier tail than the normal distribution.
+    The uniform distribution (which has negative kurtosis) has the thinnest
+    tail.
+
+    """
+    xp = array_namespace(a)
+    a, axis = _chk_asarray(a, axis, xp=xp)
+
+    n = _length_nonmasked(a, axis, xp=xp)
+    mean = xp.mean(a, axis=axis, keepdims=True)
+    mean_reduced = xp.squeeze(mean, axis=axis)  # used in the `zero` test below
+    m2 = _moment(a, 2, axis, mean=mean, xp=xp)
+    m4 = _moment(a, 4, axis, mean=mean, xp=xp)
+    with np.errstate(all='ignore'):  # silence NumPy floating-point warnings
+        zero = m2 <= (xp.finfo(m2.dtype).eps * mean_reduced)**2  # m2 ~ 0 vs. mean
+        vals = xp.where(zero, xp.nan, m4 / m2**2.0)  # Pearson scale; NaN if zero
+
+    if not bias:
+        can_correct = ~zero & (n > 3)  # correction divides by (n-2)*(n-3)
+        if is_lazy_array(can_correct) or xp.any(can_correct):  # lazy: no eager any()
+            nval = 1.0/(n-2)/(n-3) * ((n**2-1.0)*m4/m2**2.0 - 3*(n-1)**2.0)
+            vals = xp.where(can_correct, nval + 3.0, vals)  # +3: Pearson scale
+
+    vals = vals - 3 if fisher else vals  # Fisher: normal distribution -> 0.0
+    return vals[()] if vals.ndim == 0 else vals  # presumably unwraps 0-d to scalar
+
+
+DescribeResult = namedtuple('DescribeResult',
+                            ('nobs', 'minmax', 'mean', 'variance', 'skewness',
+                             'kurtosis'))
+
+
+@xp_capabilities(jax_jit=False, allow_dask_compute=True)
+def describe(a, axis=0, ddof=1, bias=True, nan_policy='propagate'):
+    """Compute several descriptive statistics of the passed array.
+
+    Parameters
+    ----------
+    a : array_like
+        Input data.
+    axis : int or None, optional
+        Axis along which statistics are calculated. Default is 0.
+        If None, compute over the whole array `a`.
+    ddof : int, optional
+        Delta degrees of freedom (only for variance).  Default is 1.
+    bias : bool, optional
+        If False, then the skewness and kurtosis calculations are corrected
+        for statistical bias.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan.
+        The following options are available (default is 'propagate'):
+
+        * 'propagate': returns nan
+        * 'raise': throws an error
+        * 'omit': performs the calculations ignoring nan values
+
+    Returns
+    -------
+    nobs : int or ndarray of ints
+        Number of observations (length of data along `axis`).
+        When 'omit' is chosen as nan_policy, the length along each axis
+        slice is counted separately.
+    minmax: tuple of ndarrays or floats
+        Minimum and maximum value of `a` along the given axis.
+    mean : ndarray or float
+        Arithmetic mean of `a` along the given axis.
+    variance : ndarray or float
+        Unbiased variance of `a` along the given axis; denominator is number
+        of observations minus one.
+    skewness : ndarray or float
+        Skewness of `a` along the given axis, based on moment calculations
+        with denominator equal to the number of observations, i.e. no degrees
+        of freedom correction.
+    kurtosis : ndarray or float
+        Kurtosis (Fisher) of `a` along the given axis.  The kurtosis is
+        normalized so that it is zero for the normal distribution.  No
+        degrees of freedom are used.
+
+    Raises
+    ------
+    ValueError
+        If size of `a` is 0.
+
+    See Also
+    --------
+    skew, kurtosis
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> a = np.arange(10)
+    >>> stats.describe(a)
+    DescribeResult(nobs=10, minmax=(0, 9), mean=4.5,
+                   variance=9.166666666666666, skewness=0.0,
+                   kurtosis=-1.2242424242424244)
+    >>> b = [[1, 2], [3, 4]]
+    >>> stats.describe(b)
+    DescribeResult(nobs=2, minmax=(array([1, 2]), array([3, 4])),
+                   mean=array([2., 3.]), variance=array([2., 2.]),
+                   skewness=array([0., 0.]), kurtosis=array([-2., -2.]))
+
+    """
+    xp = array_namespace(a)
+    a, axis = _chk_asarray(a, axis, xp=xp)
+
+    contains_nan = _contains_nan(a, nan_policy)
+
+    # Test nan_policy before the implicit call to bool(contains_nan)
+    # to avoid raising on lazy xps on the default nan_policy='propagate'
+    if nan_policy == 'omit' and contains_nan:
+        # only NumPy gets here; `_contains_nan` raises error for the rest
+        a = ma.masked_invalid(a)
+        return mstats_basic.describe(a, axis, ddof, bias)  # masked-array path
+
+    if xp_size(a) == 0:
+        raise ValueError("The input must not be empty.")
+
+    # use xp.astype when data-apis/array-api-compat#226 is resolved
+    n = xp.asarray(_length_nonmasked(a, axis, xp=xp), dtype=xp.int64,
+                   device=xp_device(a))
+    n = n[()] if n.ndim == 0 else n  # presumably unwraps 0-d to a scalar
+    mm = (xp.min(a, axis=axis), xp.max(a, axis=axis))
+    m = xp.mean(a, axis=axis)
+    v = _var(a, axis=axis, ddof=ddof, xp=xp)
+    sk = skew(a, axis, bias=bias)  # nan_policy not forwarded; default applies
+    kurt = kurtosis(a, axis, bias=bias)  # Fisher definition (fisher=True default)
+
+    return DescribeResult(n, mm, m, v, sk, kurt)
+
+#####################################
+#         NORMALITY TESTS           #
+#####################################
+
+
+def _get_pvalue(statistic, distribution, alternative, symmetric=True, xp=None):
+    """Get p-value given the statistic, (continuous) distribution, and alternative"""
+    xp = array_namespace(statistic) if xp is None else xp
+
+    if alternative == 'less':
+        pvalue = distribution.cdf(statistic)  # lower tail
+    elif alternative == 'greater':
+        pvalue = distribution.sf(statistic)  # upper tail
+    elif alternative == 'two-sided':
+        pvalue = 2 * (distribution.sf(xp.abs(statistic)) if symmetric  # 2*sf(|s|)
+                      else xp.minimum(distribution.cdf(statistic),  # 2*min tail
+                                      distribution.sf(statistic)))
+    else:
+        message = "`alternative` must be 'less', 'greater', or 'two-sided'."
+        raise ValueError(message)
+
+    return pvalue
+
+
+SkewtestResult = namedtuple('SkewtestResult', ('statistic', 'pvalue'))
+
+
+@xp_capabilities()
+@_axis_nan_policy_factory(SkewtestResult, n_samples=1, too_small=7)
+# nan_policy handled by `_axis_nan_policy`, but needs to be left
+# in signature to preserve use as a positional argument
+def skewtest(a, axis=0, nan_policy='propagate', alternative='two-sided'):
+    r"""Test whether the skew is different from the normal distribution.
+
+    This function tests the null hypothesis that the skewness of
+    the population that the sample was drawn from is the same
+    as that of a corresponding normal distribution.
+
+    Parameters
+    ----------
+    a : array
+        The data to be tested. Must contain at least eight observations.
+    axis : int or None, optional
+       Axis along which statistics are calculated. Default is 0.
+       If None, compute over the whole array `a`.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan.
+        The following options are available (default is 'propagate'):
+
+        * 'propagate': returns nan
+        * 'raise': throws an error
+        * 'omit': performs the calculations ignoring nan values
+
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis. Default is 'two-sided'.
+        The following options are available:
+
+        * 'two-sided': the skewness of the distribution underlying the sample
+          is different from that of the normal distribution (i.e. 0)
+        * 'less': the skewness of the distribution underlying the sample
+          is less than that of the normal distribution
+        * 'greater': the skewness of the distribution underlying the sample
+          is greater than that of the normal distribution
+
+        .. versionadded:: 1.7.0
+
+    Returns
+    -------
+    statistic : float
+        The computed z-score for this test.
+    pvalue : float
+        The p-value for the hypothesis test.
+
+    See Also
+    --------
+    :ref:`hypothesis_skewtest` : Extended example
+
+    Notes
+    -----
+    The sample size must be at least 8.
+
+    References
+    ----------
+    .. [1] R. B. D'Agostino, A. J. Belanger and R. B. D'Agostino Jr.,
+            "A suggestion for using powerful and informative tests of
+            normality", American Statistician 44, pp. 316-321, 1990.
+
+    Examples
+    --------
+
+    >>> from scipy.stats import skewtest
+    >>> skewtest([1, 2, 3, 4, 5, 6, 7, 8])
+    SkewtestResult(statistic=1.0108048609177787, pvalue=0.3121098361421897)
+    >>> skewtest([2, 8, 0, 4, 1, 9, 9, 0])
+    SkewtestResult(statistic=0.44626385374196975, pvalue=0.6554066631275459)
+    >>> skewtest([1, 2, 3, 4, 5, 6, 7, 8000])
+    SkewtestResult(statistic=3.571773510360407, pvalue=0.0003545719905823133)
+    >>> skewtest([100, 100, 100, 100, 100, 100, 100, 101])
+    SkewtestResult(statistic=3.5717766638478072, pvalue=0.000354567720281634)
+    >>> skewtest([1, 2, 3, 4, 5, 6, 7, 8], alternative='less')
+    SkewtestResult(statistic=1.0108048609177787, pvalue=0.8439450819289052)
+    >>> skewtest([1, 2, 3, 4, 5, 6, 7, 8], alternative='greater')
+    SkewtestResult(statistic=1.0108048609177787, pvalue=0.15605491807109484)
+
+    For a more detailed example, see :ref:`hypothesis_skewtest`.
+    """
+    xp = array_namespace(a)
+    a, axis = _chk_asarray(a, axis, xp=xp)
+
+    b2 = skew(a, axis, _no_deco=True)  # _no_deco: presumably skips the decorator
+
+    n = xp.asarray(_length_nonmasked(a, axis), dtype=b2.dtype, device=xp_device(a))
+    n = xpx.at(n, n < 8).set(xp.nan)  # n < 8 -> NaN, which propagates below
+
+    with np.errstate(divide='ignore', invalid='ignore'):  # no NumPy FP warnings
+        y = b2 * xp.sqrt(((n + 1) * (n + 3)) / (6.0 * (n - 2)))
+        beta2 = (3.0 * (n**2 + 27*n - 70) * (n+1) * (n+3) /
+                 ((n-2.0) * (n+5) * (n+7) * (n+9)))
+        W2 = -1 + xp.sqrt(2 * (beta2 - 1))
+        delta = 1 / xp.sqrt(0.5 * xp.log(W2))
+        alpha = xp.sqrt(2.0 / (W2 - 1))
+        y = xp.where(y == 0, 1., y)  # NOTE(review): why 0 -> 1? confirm rationale
+        Z = delta * xp.log(y / alpha + xp.sqrt((y / alpha)**2 + 1))
+        pvalue = _get_pvalue(Z, _SimpleNormal(), alternative, xp=xp)
+
+    Z = Z[()] if Z.ndim == 0 else Z  # presumably unwraps 0-d to a scalar
+    pvalue = pvalue[()] if pvalue.ndim == 0 else pvalue
+    return SkewtestResult(Z, pvalue)
+
+
+KurtosistestResult = namedtuple('KurtosistestResult', ('statistic', 'pvalue'))
+
+
+@xp_capabilities()
+@_axis_nan_policy_factory(KurtosistestResult, n_samples=1, too_small=4)
+def kurtosistest(a, axis=0, nan_policy='propagate', alternative='two-sided'):
+    r"""Test whether a dataset has normal kurtosis.
+
+    This function tests the null hypothesis that the kurtosis
+    of the population from which the sample was drawn is that
+    of the normal distribution.
+
+    Parameters
+    ----------
+    a : array
+        Array of the sample data. Must contain at least five observations.
+    axis : int or None, optional
+       Axis along which to compute test. Default is 0. If None,
+       compute over the whole array `a`.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan.
+        The following options are available (default is 'propagate'):
+
+        * 'propagate': returns nan
+        * 'raise': throws an error
+        * 'omit': performs the calculations ignoring nan values
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis.
+        The following options are available (default is 'two-sided'):
+
+        * 'two-sided': the kurtosis of the distribution underlying the sample
+          is different from that of the normal distribution
+        * 'less': the kurtosis of the distribution underlying the sample
+          is less than that of the normal distribution
+        * 'greater': the kurtosis of the distribution underlying the sample
+          is greater than that of the normal distribution
+
+        .. versionadded:: 1.7.0
+
+    Returns
+    -------
+    statistic : float
+        The computed z-score for this test.
+    pvalue : float
+        The p-value for the hypothesis test.
+
+    See Also
+    --------
+    :ref:`hypothesis_kurtosistest` : Extended example
+
+    Notes
+    -----
+    Valid only for n>20. This function uses the method described in [1]_.
+
+    References
+    ----------
+    .. [1] F. J. Anscombe, W. J. Glynn, "Distribution of the kurtosis
+       statistic b2 for normal samples", Biometrika, vol. 70, pp. 227-234, 1983.
+
+    Examples
+    --------
+
+    >>> import numpy as np
+    >>> from scipy.stats import kurtosistest
+    >>> kurtosistest(list(range(20)))
+    KurtosistestResult(statistic=-1.7058104152122062, pvalue=0.08804338332528348)
+    >>> kurtosistest(list(range(20)), alternative='less')
+    KurtosistestResult(statistic=-1.7058104152122062, pvalue=0.04402169166264174)
+    >>> kurtosistest(list(range(20)), alternative='greater')
+    KurtosistestResult(statistic=-1.7058104152122062, pvalue=0.9559783083373583)
+    >>> rng = np.random.default_rng()
+    >>> s = rng.normal(0, 1, 1000)
+    >>> kurtosistest(s)
+    KurtosistestResult(statistic=-1.475047944490622, pvalue=0.14019965402996987)
+
+    For a more detailed example, see :ref:`hypothesis_kurtosistest`.
+    """
+    xp = array_namespace(a)
+    a, axis = _chk_asarray(a, axis, xp=xp)
+
+    b2 = kurtosis(a, axis, fisher=False, _no_deco=True)  # Pearson definition
+
+    n = xp.asarray(_length_nonmasked(a, axis), dtype=b2.dtype, device=xp_device(a))
+    n = xpx.at(n, n < 5).set(xp.nan)  # n < 5 -> NaN, which propagates below
+
+    E = 3.0*(n-1) / (n+1)
+    varb2 = 24.0*n*(n-2)*(n-3) / ((n+1)*(n+1.)*(n+3)*(n+5))  # [1]_ Eq. 1
+    x = (b2-E) / varb2**0.5  # [1]_ Eq. 4
+    # [1]_ Eq. 2:
+    sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * ((6.0*(n+3)*(n+5))
+                                                 / (n*(n-2)*(n-3)))**0.5
+    # [1]_ Eq. 3:
+    A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + (1+4.0/(sqrtbeta1**2))**0.5)
+    term1 = 1 - 2/(9.0*A)
+    denom = 1 + x * (2/(A-4.0))**0.5
+    term2 = xp.sign(denom) * xp.where(denom == 0.0, xp.nan,  # NaN if denom == 0
+                                      ((1-2.0/A)/xp.abs(denom))**(1/3))
+    if not is_lazy_array(denom) and xp.any(denom == 0):  # lazy: never warns
+        msg = ("Test statistic not defined in some cases due to division by "
+               "zero. Return nan in that case...")
+        warnings.warn(msg, RuntimeWarning, stacklevel=2)
+
+    Z = (term1 - term2) / (2/(9.0*A))**0.5  # [1]_ Eq. 5
+    pvalue = _get_pvalue(Z, _SimpleNormal(), alternative, xp=xp)
+
+    Z = Z[()] if Z.ndim == 0 else Z  # presumably unwraps 0-d to a scalar
+    pvalue = pvalue[()] if pvalue.ndim == 0 else pvalue
+    return KurtosistestResult(Z, pvalue)
+
+
+NormaltestResult = namedtuple('NormaltestResult', ('statistic', 'pvalue'))
+
+
+@xp_capabilities()
+@_axis_nan_policy_factory(NormaltestResult, n_samples=1, too_small=7)
+def normaltest(a, axis=0, nan_policy='propagate'):
+    r"""Test whether a sample differs from a normal distribution.
+
+    This function tests the null hypothesis that a sample comes
+    from a normal distribution.  It is based on D'Agostino and
+    Pearson's [1]_, [2]_ test that combines skew and kurtosis to
+    produce an omnibus test of normality.
+
+    Parameters
+    ----------
+    a : array_like
+        The array containing the sample to be tested. Must contain
+        at least eight observations.
+    axis : int or None, optional
+        Axis along which to compute test. Default is 0. If None,
+        compute over the whole array `a`.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan.
+        The following options are available (default is 'propagate'):
+
+            * 'propagate': returns nan
+            * 'raise': throws an error
+            * 'omit': performs the calculations ignoring nan values
+
+    Returns
+    -------
+    statistic : float or array
+        ``s^2 + k^2``, where ``s`` is the z-score returned by `skewtest` and
+        ``k`` is the z-score returned by `kurtosistest`.
+    pvalue : float or array
+        A 2-sided chi squared probability for the hypothesis test.
+
+    See Also
+    --------
+    :ref:`hypothesis_normaltest` : Extended example
+
+    References
+    ----------
+    .. [1] D'Agostino, R. B. (1971), "An omnibus test of normality for
+            moderate and large sample size", Biometrika, 58, 341-348
+    .. [2] D'Agostino, R. and Pearson, E. S. (1973), "Tests for departure from
+            normality", Biometrika, 60, 613-622
+
+    Examples
+    --------
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng()
+    >>> pts = 1000
+    >>> a = rng.normal(0, 1, size=pts)
+    >>> b = rng.normal(2, 1, size=pts)
+    >>> x = np.concatenate((a, b))
+    >>> res = stats.normaltest(x)
+    >>> res.statistic
+    53.619...  # random
+    >>> res.pvalue
+    2.273917413209226e-12  # random
+
+    For a more detailed example, see :ref:`hypothesis_normaltest`.
+    """
+    xp = array_namespace(a)
+
+    s, _ = skewtest(a, axis, _no_deco=True)  # keep the z-score, drop the p-value
+    k, _ = kurtosistest(a, axis, _no_deco=True)  # keep the z-score only
+    statistic = s*s + k*k  # s^2 + k^2, as documented under Returns
+
+    chi2 = _SimpleChi2(xp.asarray(2., dtype=statistic.dtype, device=xp_device(a)))
+    pvalue = _get_pvalue(statistic, chi2, alternative='greater', symmetric=False, xp=xp)
+
+    statistic = statistic[()] if statistic.ndim == 0 else statistic
+    pvalue = pvalue[()] if pvalue.ndim == 0 else pvalue
+
+    return NormaltestResult(statistic, pvalue)
+
+
+@xp_capabilities(jax_jit=False, allow_dask_compute=True)
+@_axis_nan_policy_factory(SignificanceResult, default_axis=None)
+def jarque_bera(x, *, axis=None):
+    r"""Perform the Jarque-Bera goodness of fit test on sample data.
+
+    The Jarque-Bera test tests whether the sample data has the skewness and
+    kurtosis matching a normal distribution.
+
+    Note that this test only works for a large enough number of data samples
+    (>2000) as the test statistic asymptotically has a Chi-squared distribution
+    with 2 degrees of freedom.
+
+    Parameters
+    ----------
+    x : array_like
+        Observations of a random variable.
+    axis : int or None, default: None
+        If an int, the axis of the input along which to compute the statistic.
+        The statistic of each axis-slice (e.g. row) of the input will appear in
+        a corresponding element of the output.
+        If ``None``, the input will be raveled before computing the statistic.
+
+    Returns
+    -------
+    result : SignificanceResult
+        An object with the following attributes:
+
+        statistic : float
+            The test statistic.
+        pvalue : float
+            The p-value for the hypothesis test.
+
+    See Also
+    --------
+    :ref:`hypothesis_jarque_bera` : Extended example
+
+    References
+    ----------
+    .. [1] Jarque, C. and Bera, A. (1980) "Efficient tests for normality,
+           homoscedasticity and serial independence of regression residuals",
+           6 Econometric Letters 255-259.
+
+    Examples
+    --------
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng()
+    >>> x = rng.normal(0, 1, 100000)
+    >>> jarque_bera_test = stats.jarque_bera(x)
+    >>> jarque_bera_test
+    Jarque_beraResult(statistic=3.3415184718131554, pvalue=0.18810419594996775)
+    >>> jarque_bera_test.statistic
+    3.3415184718131554
+    >>> jarque_bera_test.pvalue
+    0.18810419594996775
+
+    For a more detailed example, see :ref:`hypothesis_jarque_bera`.
+    """
+    xp = array_namespace(x)
+    x, axis = _chk_asarray(x, axis, xp=xp)
+
+    mu = _xp_mean(x, axis=axis, keepdims=True)
+    diffx = x - mu  # center the data before taking moments
+    s = skew(diffx, axis=axis, _no_deco=True)
+    k = kurtosis(diffx, axis=axis, _no_deco=True)  # Fisher (fisher=True default)
+
+    n = xp.asarray(_length_nonmasked(x, axis), dtype=mu.dtype, device=xp_device(x))
+    statistic = n / 6 * (s**2 + k**2 / 4)
+
+    chi2 = _SimpleChi2(xp.asarray(2., dtype=mu.dtype, device=xp_device(x)))
+    pvalue = _get_pvalue(statistic, chi2, alternative='greater', symmetric=False, xp=xp)
+
+    statistic = statistic[()] if statistic.ndim == 0 else statistic
+    pvalue = pvalue[()] if pvalue.ndim == 0 else pvalue
+
+    return SignificanceResult(statistic, pvalue)
+
+
+#####################################
+#        FREQUENCY FUNCTIONS        #
+#####################################
+
+
+@xp_capabilities(np_only=True)
+def scoreatpercentile(a, per, limit=(), interpolation_method='fraction',
+                      axis=None):
+    """Calculate the score at a given percentile of the input sequence.
+
+    For example, the score at ``per=50`` is the median. If the desired quantile
+    lies between two data points, we interpolate between them, according to
+    the value of `interpolation_method`. If the parameter `limit` is provided, it
+    should be a tuple (lower, upper) of two values.
+
+    Parameters
+    ----------
+    a : array_like
+        A 1-D array of values from which to extract score.
+    per : array_like
+        Percentile(s) at which to extract score.  Values should be in range
+        [0,100].
+    limit : tuple, optional
+        Tuple of two scalars, the lower and upper limits within which to
+        compute the percentile. Values of `a` outside
+        this (closed) interval will be ignored.
+    interpolation_method : {'fraction', 'lower', 'higher'}, optional
+        Specifies the interpolation method to use,
+        when the desired quantile lies between two data points `i` and `j`
+        The following options are available (default is 'fraction'):
+
+          * 'fraction': ``i + (j - i) * fraction`` where ``fraction`` is the
+            fractional part of the index surrounded by ``i`` and ``j``
+          * 'lower': ``i``
+          * 'higher': ``j``
+
+    axis : int, optional
+        Axis along which the percentiles are computed. Default is None. If
+        None, compute over the whole array `a`.
+
+    Returns
+    -------
+    score : float or ndarray
+        Score at percentile(s).
+
+    See Also
+    --------
+    percentileofscore, numpy.percentile
+
+    Notes
+    -----
+    This function will become obsolete in the future.
+    For NumPy 1.9 and higher, `numpy.percentile` provides all the functionality
+    that `scoreatpercentile` provides.  And it's significantly faster.
+    Therefore it's recommended to use `numpy.percentile` for users that have
+    numpy >= 1.9.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> a = np.arange(100)
+    >>> stats.scoreatpercentile(a, 50)
+    49.5
+
+    """
+    # adapted from NumPy's percentile function.  When we require numpy >= 1.8,
+    # the implementation of this function can be replaced by np.percentile.
+    a = np.asarray(a)
+    if a.size == 0:
+        # empty array, return nan(s) with shape matching `per`
+        if np.isscalar(per):
+            return np.nan
+        else:
+            return np.full(np.asarray(per).shape, np.nan, dtype=np.float64)
+
+    if limit:
+        a = a[(limit[0] <= a) & (a <= limit[1])]  # keep values in [lower, upper]
+
+    sorted_ = np.sort(a, axis=axis)  # axis=None sorts the flattened array
+    if axis is None:
+        axis = 0  # the flattened result has a single axis
+
+    return _compute_qth_percentile(sorted_, per, interpolation_method, axis)
+
+
+# handle sequence of per's without calling sort multiple times
+def _compute_qth_percentile(sorted_, per, interpolation_method, axis):
+    if not np.isscalar(per):  # several percentiles: recurse on each one
+        score = [_compute_qth_percentile(sorted_, i,
+                                         interpolation_method, axis)
+                 for i in per]
+        return np.array(score)
+
+    if not (0 <= per <= 100):
+        raise ValueError("percentile must be in the range [0, 100]")
+
+    indexer = [slice(None)] * sorted_.ndim  # full slice on every axis to start
+    idx = per / 100. * (sorted_.shape[axis] - 1)  # fractional index along axis
+
+    if int(idx) != idx:
+        # round fractional indices according to interpolation method
+        if interpolation_method == 'lower':
+            idx = int(np.floor(idx))
+        elif interpolation_method == 'higher':
+            idx = int(np.ceil(idx))
+        elif interpolation_method == 'fraction':
+            pass  # keep idx as fraction and interpolate
+        else:
+            raise ValueError("interpolation_method can only be 'fraction', "
+                             "'lower' or 'higher'")
+
+    i = int(idx)
+    if i == idx:  # integral index: take that single element
+        indexer[axis] = slice(i, i + 1)
+        weights = array(1)
+        sumval = 1.0
+    else:  # fractional index: blend the two neighbours linearly
+        indexer[axis] = slice(i, i + 2)
+        j = i + 1
+        weights = array([(j - idx), (idx - i)], float)  # nearer point weighs more
+        wshape = [1] * sorted_.ndim
+        wshape[axis] = 2  # broadcast the two weights along `axis` only
+        weights = weights.reshape(wshape)
+        sumval = weights.sum()
+
+    # Use np.add.reduce (== np.sum but a little faster) to coerce data type
+    return np.add.reduce(sorted_[tuple(indexer)] * weights, axis=axis) / sumval
+
+
+@xp_capabilities(np_only=True)
+def percentileofscore(a, score, kind='rank', nan_policy='propagate'):
+    """Compute the percentile rank of a score relative to a list of scores.
+
+    A `percentileofscore` of, for example, 80% means that 80% of the
+    scores in `a` are below the given score. In the case of gaps or
+    ties, the exact definition depends on the optional keyword, `kind`.
+
+    Parameters
+    ----------
+    a : array_like
+        A 1-D array to which `score` is compared.
+    score : float or array_like
+        A float score or array of scores for which to compute the percentile(s).
+    kind : {'rank', 'weak', 'strict', 'mean'}, optional
+        Specifies the interpretation of the resulting score.
+        The following options are available (default is 'rank'):
+
+        * 'rank': Average percentage ranking of score.  In case of multiple
+          matches, average the percentage rankings of all matching scores.
+        * 'weak': This kind corresponds to the definition of a cumulative
+          distribution function.  A percentileofscore of 80% means that 80%
+          of values are less than or equal to the provided score.
+        * 'strict': Similar to "weak", except that only values that are
+          strictly less than the given score are counted.
+        * 'mean': The average of the "weak" and "strict" scores, often used
+          in testing.  See https://en.wikipedia.org/wiki/Percentile_rank
+
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Specifies how to treat `nan` values in `a`.
+        The following options are available (default is 'propagate'):
+
+        * 'propagate': returns nan (for each value in `score`).
+        * 'raise': throws an error
+        * 'omit': performs the calculations ignoring nan values
+
+    Returns
+    -------
+    pcos : float or array-like
+        Percentile-position(s) of `score` (0-100) relative to `a`.
+
+    See Also
+    --------
+    numpy.percentile
+    scipy.stats.scoreatpercentile, scipy.stats.rankdata
+
+    Examples
+    --------
+    Three-quarters of the given values lie below a given score:
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> stats.percentileofscore([1, 2, 3, 4], 3)
+    75.0
+
+    With multiple matches, note how the scores of the two matches, 0.6
+    and 0.8 respectively, are averaged:
+
+    >>> stats.percentileofscore([1, 2, 3, 3, 4], 3)
+    70.0
+
+    Only 2/5 values are strictly less than 3:
+
+    >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='strict')
+    40.0
+
+    But 4/5 values are less than or equal to 3:
+
+    >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='weak')
+    80.0
+
+    The average between the weak and the strict scores is:
+
+    >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='mean')
+    60.0
+
+    Score arrays (of any dimensionality) are supported:
+
+    >>> stats.percentileofscore([1, 2, 3, 3, 4], [2, 3])
+    array([40., 70.])
+
+    The inputs can be infinite:
+
+    >>> stats.percentileofscore([-np.inf, 0, 1, np.inf], [1, 2, np.inf])
+    array([75., 75., 100.])
+
+    If `a` is empty, then the resulting percentiles are all `nan`:
+
+    >>> stats.percentileofscore([], [1, 2])
+    array([nan, nan])
+    """
+
+    a = np.asarray(a)
+    score = np.asarray(score)
+
+    if a.ndim != 1:
+        raise ValueError("`a` must be 1-dimensional.")
+
+    n = len(a)
+
+    # Nan treatment
+    cna = _contains_nan(a, nan_policy)
+    cns = _contains_nan(score, nan_policy)
+
+    if cns:
+        # If a score is nan, then the output should be nan
+        # (also if nan_policy is "omit", because it only applies to `a`)
+        score = ma.masked_where(np.isnan(score), score)
+
+    if cna:
+        if nan_policy == "omit":
+            # Don't count nans
+            a = ma.masked_where(np.isnan(a), a)
+            n = a.count()
+
+        if nan_policy == "propagate":
+            # All outputs should be nans
+            n = 0
+
+    # Cannot compare to empty list ==> nan
+    if n == 0:
+        perct = np.full_like(score, np.nan, dtype=np.float64)
+
+    else:
+        # Prepare broadcasting
+        score = score[..., None]
+
+        def count(x):
+            return np.count_nonzero(x, -1)
+
+        # Main computations/logic
+        if kind == 'rank':
+            left = count(a < score)
+            right = count(a <= score)
+            plus1 = left < right
+            perct = (left + right + plus1) * (50.0 / n)
+        elif kind == 'strict':
+            perct = count(a < score) * (100.0 / n)
+        elif kind == 'weak':
+            perct = count(a <= score) * (100.0 / n)
+        elif kind == 'mean':
+            left = count(a < score)
+            right = count(a <= score)
+            perct = (left + right) * (50.0 / n)
+        else:
+            raise ValueError(
+                "kind can only be 'rank', 'strict', 'weak' or 'mean'")
+
+    # Re-insert nan values
+    perct = ma.filled(perct, np.nan)
+
+    if perct.ndim == 0:
+        return perct[()]
+    return perct
+
+
+HistogramResult = namedtuple('HistogramResult',
+                             ('count', 'lowerlimit', 'binsize', 'extrapoints'))
+
+
+def _histogram(a, numbins=10, defaultlimits=None, weights=None,
+               printextras=False):
+    """Create a histogram.
+
+    Separate the range into several bins and return the number of instances
+    in each bin.
+
+    Parameters
+    ----------
+    a : array_like
+        Array of scores which will be put into bins.
+    numbins : int, optional
+        The number of bins to use for the histogram. Default is 10.
+    defaultlimits : tuple (lower, upper), optional
+        The lower and upper values for the range of the histogram.
+        If no value is given, a range slightly larger than the range of the
+        values in a is used. Specifically ``(a.min() - s, a.max() + s)``,
+        where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``.
+    weights : array_like, optional
+        The weights for each value in `a`. Default is None, which gives each
+        value a weight of 1.0
+    printextras : bool, optional
+        If True, if there are extra points (i.e. the points that fall outside
+        the bin limits) a warning is raised saying how many of those points
+        there are.  Default is False.
+
+    Returns
+    -------
+    count : ndarray
+        Number of points (or sum of weights) in each bin.
+    lowerlimit : float
+        Lowest value of histogram, the lower limit of the first bin.
+    binsize : float
+        The size of the bins (all bins have the same size).
+    extrapoints : int
+        The number of points outside the range of the histogram.
+
+    See Also
+    --------
+    numpy.histogram
+
+    Notes
+    -----
+    This histogram is based on numpy's histogram but has a larger range by
+    default if default limits is not set.
+
+    """
+    a = np.ravel(a)
+    if defaultlimits is None:
+        if a.size == 0:
+            # handle empty arrays. Undetermined range, so use 0-1.
+            defaultlimits = (0, 1)
+        else:
+            # no range given, so use values in `a`
+            data_min = a.min()
+            data_max = a.max()
+            # Have bins extend past min and max values slightly
+            s = (data_max - data_min) / (2. * (numbins - 1.))
+            defaultlimits = (data_min - s, data_max + s)
+
+    # use numpy's histogram method to compute bins
+    hist, bin_edges = np.histogram(a, bins=numbins, range=defaultlimits,
+                                   weights=weights)
+    # hist are not always floats, convert to keep with old output
+    hist = np.array(hist, dtype=float)
+    # fixed width for bins is assumed, as numpy's histogram gives
+    # fixed width bins for int values for 'bins'
+    binsize = bin_edges[1] - bin_edges[0]
+    # calculate number of extra points
+    extrapoints = len([v for v in a
+                       if defaultlimits[0] > v or v > defaultlimits[1]])
+    if extrapoints > 0 and printextras:
+        warnings.warn(f"Points outside given histogram range = {extrapoints}",
+                      stacklevel=3,)
+
+    return HistogramResult(hist, defaultlimits[0], binsize, extrapoints)
+
+
+CumfreqResult = namedtuple('CumfreqResult',
+                           ('cumcount', 'lowerlimit', 'binsize',
+                            'extrapoints'))
+
+
+@xp_capabilities(np_only=True)
+def cumfreq(a, numbins=10, defaultreallimits=None, weights=None):
+    """Return a cumulative frequency histogram, using the histogram function.
+
+    A cumulative histogram is a mapping that counts the cumulative number of
+    observations in all of the bins up to the specified bin.
+
+    Parameters
+    ----------
+    a : array_like
+        Input array.
+    numbins : int, optional
+        The number of bins to use for the histogram. Default is 10.
+    defaultreallimits : tuple (lower, upper), optional
+        The lower and upper values for the range of the histogram.
+        If no value is given, a range slightly larger than the range of the
+        values in `a` is used. Specifically ``(a.min() - s, a.max() + s)``,
+        where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``.
+    weights : array_like, optional
+        The weights for each value in `a`. Default is None, which gives each
+        value a weight of 1.0
+
+    Returns
+    -------
+    cumcount : ndarray
+        Binned values of cumulative frequency.
+    lowerlimit : float
+        Lower real limit
+    binsize : float
+        Width of each bin.
+    extrapoints : int
+        Extra points.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng()
+    >>> x = [1, 4, 2, 1, 3, 1]
+    >>> res = stats.cumfreq(x, numbins=4, defaultreallimits=(1.5, 5))
+    >>> res.cumcount
+    array([ 1.,  2.,  3.,  3.])
+    >>> res.extrapoints
+    3
+
+    Create a normal distribution with 1000 random values
+
+    >>> samples = stats.norm.rvs(size=1000, random_state=rng)
+
+    Calculate cumulative frequencies
+
+    >>> res = stats.cumfreq(samples, numbins=25)
+
+    Calculate space of values for x
+
+    >>> x = res.lowerlimit + np.linspace(0, res.binsize*res.cumcount.size,
+    ...                                  res.cumcount.size + 1)
+
+    Plot histogram and cumulative histogram
+
+    >>> fig = plt.figure(figsize=(10, 4))
+    >>> ax1 = fig.add_subplot(1, 2, 1)
+    >>> ax2 = fig.add_subplot(1, 2, 2)
+    >>> ax1.hist(samples, bins=25)
+    >>> ax1.set_title('Histogram')
+    >>> ax2.bar(x[:-1], res.cumcount, width=res.binsize, align='edge')
+    >>> ax2.set_title('Cumulative histogram')
+    >>> ax2.set_xlim([x.min(), x.max()])
+
+    >>> plt.show()
+
+    """
+    h, l, b, e = _histogram(a, numbins, defaultreallimits, weights=weights)
+    cumhist = np.cumsum(h * 1, axis=0)
+    return CumfreqResult(cumhist, l, b, e)
+
+
+RelfreqResult = namedtuple('RelfreqResult',
+                           ('frequency', 'lowerlimit', 'binsize',
+                            'extrapoints'))
+
+
+@xp_capabilities(np_only=True)
+def relfreq(a, numbins=10, defaultreallimits=None, weights=None):
+    """Return a relative frequency histogram, using the histogram function.
+
+    A relative frequency  histogram is a mapping of the number of
+    observations in each of the bins relative to the total of observations.
+
+    Parameters
+    ----------
+    a : array_like
+        Input array.
+    numbins : int, optional
+        The number of bins to use for the histogram. Default is 10.
+    defaultreallimits : tuple (lower, upper), optional
+        The lower and upper values for the range of the histogram.
+        If no value is given, a range slightly larger than the range of the
+        values in a is used. Specifically ``(a.min() - s, a.max() + s)``,
+        where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``.
+    weights : array_like, optional
+        The weights for each value in `a`. Default is None, which gives each
+        value a weight of 1.0
+
+    Returns
+    -------
+    frequency : ndarray
+        Binned values of relative frequency.
+    lowerlimit : float
+        Lower real limit.
+    binsize : float
+        Width of each bin.
+    extrapoints : int
+        Extra points.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng()
+    >>> a = np.array([2, 4, 1, 2, 3, 2])
+    >>> res = stats.relfreq(a, numbins=4)
+    >>> res.frequency
+    array([ 0.16666667, 0.5       , 0.16666667,  0.16666667])
+    >>> np.sum(res.frequency)  # relative frequencies should add up to 1
+    1.0
+
+    Create a normal distribution with 1000 random values
+
+    >>> samples = stats.norm.rvs(size=1000, random_state=rng)
+
+    Calculate relative frequencies
+
+    >>> res = stats.relfreq(samples, numbins=25)
+
+    Calculate space of values for x
+
+    >>> x = res.lowerlimit + np.linspace(0, res.binsize*res.frequency.size,
+    ...                                  res.frequency.size)
+
+    Plot relative frequency histogram
+
+    >>> fig = plt.figure(figsize=(5, 4))
+    >>> ax = fig.add_subplot(1, 1, 1)
+    >>> ax.bar(x, res.frequency, width=res.binsize)
+    >>> ax.set_title('Relative frequency histogram')
+    >>> ax.set_xlim([x.min(), x.max()])
+
+    >>> plt.show()
+
+    """
+    a = np.asanyarray(a)
+    h, l, b, e = _histogram(a, numbins, defaultreallimits, weights=weights)
+    h = h / a.shape[0]
+
+    return RelfreqResult(h, l, b, e)
+
+
+#####################################
+#        VARIABILITY FUNCTIONS      #
+#####################################
+
+@xp_capabilities(np_only=True)
+def obrientransform(*samples):
+    """Compute the O'Brien transform on input data (any number of arrays).
+
+    Used to test for homogeneity of variance prior to running one-way stats.
+    Each array in ``*samples`` is one level of a factor.
+    If `f_oneway` is run on the transformed data and found significant,
+    the variances are unequal.  From Maxwell and Delaney [1]_, p.112.
+
+    Parameters
+    ----------
+    sample1, sample2, ... : array_like
+        Any number of arrays.
+
+    Returns
+    -------
+    obrientransform : ndarray
+        Transformed data for use in an ANOVA.  The first dimension
+        of the result corresponds to the sequence of transformed
+        arrays.  If the arrays given are all 1-D of the same length,
+        the return value is a 2-D array; otherwise it is a 1-D array
+        of type object, with each element being an ndarray.
+
+    Raises
+    ------
+    ValueError
+        If the mean of the transformed data is not equal to the original
+        variance, indicating a lack of convergence in the O'Brien transform.
+
+    References
+    ----------
+    .. [1] S. E. Maxwell and H. D. Delaney, "Designing Experiments and
+           Analyzing Data: A Model Comparison Perspective", Wadsworth, 1990.
+
+    Examples
+    --------
+    We'll test the following data sets for differences in their variance.
+
+    >>> x = [10, 11, 13, 9, 7, 12, 12, 9, 10]
+    >>> y = [13, 21, 5, 10, 8, 14, 10, 12, 7, 15]
+
+    Apply the O'Brien transform to the data.
+
+    >>> from scipy.stats import obrientransform
+    >>> tx, ty = obrientransform(x, y)
+
+    Use `scipy.stats.f_oneway` to apply a one-way ANOVA test to the
+    transformed data.
+
+    >>> from scipy.stats import f_oneway
+    >>> F, p = f_oneway(tx, ty)
+    >>> p
+    0.1314139477040335
+
+    If we require that ``p < 0.05`` for significance, we cannot conclude
+    that the variances are different.
+
+    """
+    TINY = np.sqrt(np.finfo(float).eps)
+
+    # `arrays` will hold the transformed arguments.
+    arrays = []
+    sLast = None
+
+    for sample in samples:
+        a = np.asarray(sample)
+        n = len(a)
+        mu = np.mean(a)
+        sq = (a - mu)**2
+        sumsq = sq.sum()
+
+        # The O'Brien transform.
+        t = ((n - 1.5) * n * sq - 0.5 * sumsq) / ((n - 1) * (n - 2))
+
+        # Check that the mean of the transformed data is equal to the
+        # original variance.
+        var = sumsq / (n - 1)
+        if abs(var - np.mean(t)) > TINY:
+            raise ValueError('Lack of convergence in obrientransform.')
+
+        arrays.append(t)
+        sLast = a.shape
+
+    if sLast:
+        for arr in arrays[:-1]:
+            if sLast != arr.shape:
+                return np.array(arrays, dtype=object)
+    return np.array(arrays)
+
+
+@xp_capabilities(jax_jit=False, allow_dask_compute=True)
+@_axis_nan_policy_factory(
+    lambda x: x, result_to_tuple=lambda x, _: (x,), n_outputs=1, too_small=1
+)
+def sem(a, axis=0, ddof=1, nan_policy='propagate'):
+    """Compute standard error of the mean.
+
+    Calculate the standard error of the mean (or standard error of
+    measurement) of the values in the input array.
+
+    Parameters
+    ----------
+    a : array_like
+        An array containing the values for which the standard error is
+        returned. Must contain at least two observations.
+    axis : int or None, optional
+        Axis along which to operate. Default is 0. If None, compute over
+        the whole array `a`.
+    ddof : int, optional
+        Delta degrees-of-freedom. How many degrees of freedom to adjust
+        for bias in limited samples relative to the population estimate
+        of variance. Defaults to 1.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan.
+        The following options are available (default is 'propagate'):
+
+          * 'propagate': returns nan
+          * 'raise': throws an error
+          * 'omit': performs the calculations ignoring nan values
+
+    Returns
+    -------
+    s : ndarray or float
+        The standard error of the mean in the sample(s), along the input axis.
+
+    Notes
+    -----
+    The default value for `ddof` is different to the default (0) used by other
+    ddof containing routines, such as np.std and np.nanstd.
+
+    Examples
+    --------
+    Find standard error along the first axis:
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> a = np.arange(20).reshape(5,4)
+    >>> stats.sem(a)
+    array([ 2.8284,  2.8284,  2.8284,  2.8284])
+
+    Find standard error across the whole array, using n degrees of freedom:
+
+    >>> stats.sem(a, axis=None, ddof=0)
+    1.2893796958227628
+
+    """
+    xp = array_namespace(a)
+    if axis is None:
+        a = xp.reshape(a, (-1,))
+        axis = 0
+    a = xpx.atleast_nd(xp.asarray(a), ndim=1, xp=xp)
+    n = _length_nonmasked(a, axis, xp=xp)
+    s = xp.std(a, axis=axis, correction=ddof) / n**0.5
+    return s
+
+
+def _isconst(x):
+    """
+    Check if all values in x are the same.  nans are ignored.
+
+    x must be a 1d array.
+
+    The return value is a 1d array with length 1, so it can be used
+    in np.apply_along_axis.
+    """
+    y = x[~np.isnan(x)]
+    if y.size == 0:
+        return np.array([True])
+    else:
+        return (y[0] == y).all(keepdims=True)
+
+
+@xp_capabilities()
+def zscore(a, axis=0, ddof=0, nan_policy='propagate'):
+    """
+    Compute the z score.
+
+    Compute the z score of each value in the sample, relative to the
+    sample mean and standard deviation.
+
+    Parameters
+    ----------
+    a : array_like
+        An array like object containing the sample data.
+    axis : int or None, optional
+        Axis along which to operate. Default is 0. If None, compute over
+        the whole array `a`.
+    ddof : int, optional
+        Degrees of freedom correction in the calculation of the
+        standard deviation. Default is 0.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan. 'propagate' returns nan,
+        'raise' throws an error, 'omit' performs the calculations ignoring nan
+        values. Default is 'propagate'.  Note that when the value is 'omit',
+        nans in the input also propagate to the output, but they do not affect
+        the z-scores computed for the non-nan values.
+
+    Returns
+    -------
+    zscore : array_like
+        The z-scores, standardized by mean and standard deviation of
+        input array `a`.
+
+    See Also
+    --------
+    numpy.mean : Arithmetic average
+    numpy.std : Arithmetic standard deviation
+    scipy.stats.gzscore : Geometric standard score
+    scipy.stats.zmap : Z-scores relative to a separate comparison sample
+
+    Notes
+    -----
+    This function is implemented as ``zmap(a, a, ...)``, i.e. `a` is used
+    both as the scores and as the comparison sample.
+
+    This function preserves ndarray subclasses, and works also with
+    matrices and masked arrays (it uses `asanyarray` instead of
+    `asarray` for parameters).
+
+    References
+    ----------
+    .. [1] "Standard score", *Wikipedia*,
+           https://en.wikipedia.org/wiki/Standard_score.
+    .. [2] Huck, S. W., Cross, T. L., Clark, S. B, "Overcoming misconceptions
+           about Z-scores", Teaching Statistics, vol. 8, pp. 38-40, 1986
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> a = np.array([ 0.7972,  0.0767,  0.4383,  0.7866,  0.8091,
+    ...                0.1954,  0.6307,  0.6599,  0.1065,  0.0508])
+    >>> from scipy import stats
+    >>> stats.zscore(a)
+    array([ 1.1273, -1.247 , -0.0552,  1.0923,  1.1664, -0.8559,  0.5786,
+            0.6748, -1.1488, -1.3324])
+
+    Computing along a specified axis, using n-1 degrees of freedom
+    (``ddof=1``) to calculate the standard deviation:
+
+    >>> b = np.array([[ 0.3148,  0.0478,  0.6243,  0.4608],
+    ...               [ 0.7149,  0.0775,  0.6072,  0.9656],
+    ...               [ 0.6341,  0.1403,  0.9759,  0.4064],
+    ...               [ 0.5918,  0.6948,  0.904 ,  0.3721],
+    ...               [ 0.0921,  0.2481,  0.1188,  0.1366]])
+    >>> stats.zscore(b, axis=1, ddof=1)
+    array([[-0.19264823, -1.28415119,  1.07259584,  0.40420358],
+           [ 0.33048416, -1.37380874,  0.04251374,  1.00081084],
+           [ 0.26796377, -1.12598418,  1.23283094, -0.37481053],
+           [-0.22095197,  0.24468594,  1.19042819, -1.21416216],
+           [-0.82780366,  1.4457416 , -0.43867764, -0.1792603 ]])
+
+    An example with ``nan_policy='omit'``:
+
+    >>> x = np.array([[25.11, 30.10, np.nan, 32.02, 43.15],
+    ...               [14.95, 16.06, 121.25, 94.35, 29.81]])
+    >>> stats.zscore(x, axis=1, nan_policy='omit')
+    array([[-1.13490897, -0.37830299,         nan, -0.08718406,  1.60039602],
+           [-0.91611681, -0.89090508,  1.4983032 ,  0.88731639, -0.5785977 ]])
+    """
+    # Pass the very same object twice: `a` is both the scores and the
+    # comparison sample.
+    return zmap(a, a, axis=axis, ddof=ddof, nan_policy=nan_policy)
+
+
+@xp_capabilities()
+def gzscore(a, *, axis=0, ddof=0, nan_policy='propagate'):
+    """
+    Compute the geometric standard score.
+
+    Compute the geometric z score of each strictly positive value in the
+    sample, relative to the geometric mean and standard deviation.
+    Mathematically the geometric z score can be evaluated as::
+
+        gzscore = log(a/gmu) / log(gsigma)
+
+    where ``gmu`` (resp. ``gsigma``) is the geometric mean (resp. standard
+    deviation).
+
+    Parameters
+    ----------
+    a : array_like
+        Sample data.
+    axis : int or None, optional
+        Axis along which to operate. Default is 0. If None, compute over
+        the whole array `a`.
+    ddof : int, optional
+        Degrees of freedom correction in the calculation of the
+        standard deviation. Default is 0.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan. 'propagate' returns nan,
+        'raise' throws an error, 'omit' performs the calculations ignoring nan
+        values. Default is 'propagate'.  Note that when the value is 'omit',
+        nans in the input also propagate to the output, but they do not affect
+        the geometric z scores computed for the non-nan values.
+
+    Returns
+    -------
+    gzscore : array_like
+        The geometric z scores, standardized by geometric mean and geometric
+        standard deviation of input array `a`.
+
+    See Also
+    --------
+    gmean : Geometric mean
+    gstd : Geometric standard deviation
+    zscore : Standard score
+
+    Notes
+    -----
+    This function preserves ndarray subclasses, and works also with
+    matrices and masked arrays (it uses ``asanyarray`` instead of
+    ``asarray`` for parameters).
+
+    .. versionadded:: 1.8
+
+    References
+    ----------
+    .. [1] "Geometric standard score", *Wikipedia*,
+           https://en.wikipedia.org/wiki/Geometric_standard_deviation#Geometric_standard_score.
+
+    Examples
+    --------
+    Draw samples from a log-normal distribution:
+
+    >>> import numpy as np
+    >>> from scipy.stats import zscore, gzscore
+    >>> import matplotlib.pyplot as plt
+
+    >>> rng = np.random.default_rng()
+    >>> mu, sigma = 3., 1.  # mean and standard deviation
+    >>> x = rng.lognormal(mu, sigma, size=500)
+
+    Display the histogram of the samples:
+
+    >>> fig, ax = plt.subplots()
+    >>> ax.hist(x, 50)
+    >>> plt.show()
+
+    Display the histogram of the samples standardized by the classical zscore.
+    Distribution is rescaled but its shape is unchanged.
+
+    >>> fig, ax = plt.subplots()
+    >>> ax.hist(zscore(x), 50)
+    >>> plt.show()
+
+    Demonstrate that the distribution of geometric zscores is rescaled and
+    quasinormal:
+
+    >>> fig, ax = plt.subplots()
+    >>> ax.hist(gzscore(x), 50)
+    >>> plt.show()
+
+    """
+    xp = array_namespace(a)
+    a = xp_promote(a, force_floating=True, xp=xp)
+    log = ma.log if isinstance(a, ma.MaskedArray) else xp.log
+    return zscore(log(a), axis=axis, ddof=ddof, nan_policy=nan_policy)
+
+
+@xp_capabilities()
+def zmap(scores, compare, axis=0, ddof=0, nan_policy='propagate'):
+    """
+    Calculate the relative z-scores.
+
+    Return an array of z-scores, i.e., scores that are standardized to
+    zero mean and unit variance, where mean and variance are calculated
+    from the comparison array.
+
+    Parameters
+    ----------
+    scores : array_like
+        The input for which z-scores are calculated.
+    compare : array_like
+        The input from which the mean and standard deviation of the
+        normalization are taken; assumed to have the same dimension as
+        `scores`.
+    axis : int or None, optional
+        Axis over which mean and variance of `compare` are calculated.
+        Default is 0. If None, compute over the whole array `compare`.
+    ddof : int, optional
+        Degrees of freedom correction in the calculation of the
+        standard deviation. Default is 0.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle the occurrence of nans in `compare`.
+        'propagate' returns nan, 'raise' raises an exception, 'omit'
+        performs the calculations ignoring nan values. Default is
+        'propagate'. Note that when the value is 'omit', nans in `scores`
+        also propagate to the output, but they do not affect the z-scores
+        computed for the non-nan values.
+
+    Returns
+    -------
+    zscore : array_like
+        Z-scores, in the same shape as `scores`.
+
+    Notes
+    -----
+    This function preserves ndarray subclasses, and works also with
+    matrices and masked arrays (it uses `asanyarray` instead of
+    `asarray` for parameters).
+
+    When `scores` and `compare` are the very same object, slices whose
+    standard deviation is negligible relative to their mean are reported
+    as NaN.
+
+    Examples
+    --------
+    >>> from scipy.stats import zmap
+    >>> a = [0.5, 2.0, 2.5, 3]
+    >>> b = [0, 1, 2, 3, 4]
+    >>> zmap(a, b)
+    array([-1.06066017,  0.        ,  0.35355339,  0.70710678])
+
+    """
+    # The docstring explicitly states that it preserves subclasses.
+    # Let's table deprecating that and just get the array API version
+    # working.
+
+    # Record the identity of the two inputs *before* promotion rebinds both
+    # names below; this is how a call coming from `zscore` is recognised.
+    like_zscore = (scores is compare)
+    xp = array_namespace(scores, compare)
+    scores, compare = xp_promote(scores, compare, force_floating=True, xp=xp)
+
+    with warnings.catch_warnings():
+        if like_zscore:  # zscore should not emit SmallSampleWarning
+            warnings.simplefilter('ignore', SmallSampleWarning)
+
+        # Location and scale come from `compare` only; keepdims=True keeps
+        # the reduced axis so the results line up with `scores`.
+        mn = _xp_mean(compare, axis=axis, keepdims=True, nan_policy=nan_policy)
+        std = _xp_var(compare, axis=axis, correction=ddof,
+                      keepdims=True, nan_policy=nan_policy)**0.5
+
+    # Silence NumPy floating-point warnings for the division (e.g. when
+    # `std` is zero).
+    with np.errstate(invalid='ignore', divide='ignore'):
+        z = _demean(scores, mn, axis, xp=xp, precision_warning=False) / std
+
+    # If we know that scores and compare are identical, we can infer that
+    # some slices should have NaNs.
+    if like_zscore:
+        # A slice counts as constant when its standard deviation does not
+        # exceed machine epsilon scaled by the magnitude of its mean.
+        eps = xp.finfo(z.dtype).eps
+        zero = std <= xp.abs(eps * mn)
+        zero = xp.broadcast_to(zero, z.shape)
+        z = xpx.at(z, zero).set(xp.nan)
+
+    return z
+
+
+@xp_capabilities()
+def gstd(a, axis=0, ddof=1, *, keepdims=False, nan_policy='propagate'):
+    r"""
+    Calculate the geometric standard deviation of an array.
+
+    The geometric standard deviation describes the spread of a set of numbers
+    where the geometric mean is preferred. It is a multiplicative factor, and
+    so a dimensionless quantity.
+
+    It is defined as the exponential of the standard deviation of the
+    natural logarithms of the observations.
+
+    Parameters
+    ----------
+    a : array_like
+        An array containing finite, strictly positive, real numbers.
+    axis : int, tuple or None, optional
+        Axis along which to operate. Default is 0. If None, compute over
+        the whole array `a`.
+    ddof : int, optional
+        Degree of freedom correction in the calculation of the
+        geometric standard deviation. Default is 1.
+    keepdims : boolean, optional
+        If this is set to ``True``, the axes which are reduced are left
+        in the result as dimensions with length one. With this option,
+        the result will broadcast correctly against the input array.
+    nan_policy : {'propagate', 'omit', 'raise'}, default: 'propagate'
+        Defines how to handle input NaNs.
+
+        - ``propagate``: if a NaN is present in the axis slice (e.g. row) along
+          which the statistic is computed, the corresponding entry of the output
+          will be NaN.
+        - ``omit``: NaNs will be omitted when performing the calculation.
+          If insufficient data remains in the axis slice along which the
+          statistic is computed, the corresponding entry of the output will be
+          NaN.
+        - ``raise``: if a NaN is present, a ``ValueError`` will be raised.
+
+    Returns
+    -------
+    gstd : ndarray or float
+        An array of the geometric standard deviation. If `axis` is None or `a`
+        is a 1d array a float is returned.
+
+    See Also
+    --------
+    gmean : Geometric mean
+    numpy.std : Standard deviation
+    gzscore : Geometric standard score
+
+    Notes
+    -----
+    Mathematically, the sample geometric standard deviation :math:`s_G` can be
+    defined in terms of the natural logarithms of the observations
+    :math:`y_i = \log(x_i)`:
+
+    .. math::
+
+        s_G = \exp(s), \quad s = \sqrt{\frac{1}{n - d} \sum_{i=1}^n (y_i - \bar y)^2}
+
+    where :math:`n` is the number of observations, :math:`d` is the adjustment `ddof`
+    to the degrees of freedom, and :math:`\bar y` denotes the mean of the natural
+    logarithms of the observations. Note that the default ``ddof=1`` is different from
+    the default value used by similar functions, such as `numpy.std` and `numpy.var`.
+
+    When an observation is infinite, the geometric standard deviation is
+    NaN (undefined). Non-positive observations will also produce NaNs in the
+    output because the *natural* logarithm (as opposed to the *complex*
+    logarithm) is defined and finite only for positive reals.
+    The geometric standard deviation is sometimes confused with the exponential
+    of the standard deviation, ``exp(std(a))``. Instead, the geometric standard
+    deviation is ``exp(std(log(a)))``.
+
+    References
+    ----------
+    .. [1] "Geometric standard deviation", *Wikipedia*,
+           https://en.wikipedia.org/wiki/Geometric_standard_deviation.
+    .. [2] Kirkwood, T. B., "Geometric means and measures of dispersion",
+           Biometrics, vol. 35, pp. 908-909, 1979
+
+    Examples
+    --------
+    Find the geometric standard deviation of a log-normally distributed sample.
+    Note that the standard deviation of the distribution is one; on a
+    log scale this evaluates to approximately ``exp(1)``.
+
+    >>> import numpy as np
+    >>> from scipy.stats import gstd
+    >>> rng = np.random.default_rng()
+    >>> sample = rng.lognormal(mean=0, sigma=1, size=1000)
+    >>> gstd(sample)
+    2.810010162475324
+
+    Compute the geometric standard deviation of a multidimensional array and
+    of a given axis.
+
+    >>> a = np.arange(1, 25).reshape(2, 3, 4)
+    >>> gstd(a, axis=None)
+    2.2944076136018947
+    >>> gstd(a, axis=2)
+    array([[1.82424757, 1.22436866, 1.13183117],
+           [1.09348306, 1.07244798, 1.05914985]])
+    >>> gstd(a, axis=(1,2))
+    array([2.12939215, 1.22120169])
+
+    """
+    xp = array_namespace(a)
+    a = xp_promote(a, force_floating=True, xp=xp)
+
+    kwargs = dict(axis=axis, correction=ddof, keepdims=keepdims, nan_policy=nan_policy)
+    with np.errstate(invalid='ignore', divide='ignore'):
+        res = xp.exp(_xp_var(xp.log(a), **kwargs)**0.5)
+
+    if not is_lazy_array(a) and xp.any(a <= 0):
+        message = ("The geometric standard deviation is only defined if all elements "
+                   "are greater than or equal to zero; otherwise, the result is NaN.")
+        warnings.warn(message, RuntimeWarning, stacklevel=2)
+
+    return res
+
+# Private dictionary initialized only once at module level
+# See https://en.wikipedia.org/wiki/Robust_measures_of_scale
+# Maps a scale name to its numeric factor.  'normal' is
+# 2 * sqrt(2) * erfinv(1/2), approximately 1.349.
+# NOTE(review): presumably looked up when a string `scale` is passed to
+# `iqr` -- confirm against the callers.
+_scale_conversions = {'normal': float(special.erfinv(0.5) * 2.0 * math.sqrt(2.0))}
+
+
+@xp_capabilities(skip_backends=[('dask.array', 'no quantile (take_along_axis)'),
+                                ('jax.numpy', 'lazy -> no _axis_nan_policy)')])
+@_axis_nan_policy_factory(
+    lambda x: x, result_to_tuple=lambda x, _: (x,), n_outputs=1,
+    default_axis=None, override={'nan_propagation': False}
+)
+def iqr(x, axis=None, rng=(25, 75), scale=1.0, nan_policy='propagate',
+        interpolation='linear', keepdims=False):
+    r"""
+    Compute the interquartile range of the data along the specified axis.
+
+    The interquartile range (IQR) is the difference between the 75th and
+    25th percentile of the data. It is a measure of the dispersion
+    similar to standard deviation or variance, but is much more robust
+    against outliers [2]_.
+
+    The ``rng`` parameter allows this function to compute other
+    percentile ranges than the actual IQR. For example, setting
+    ``rng=(0, 100)`` is equivalent to `numpy.ptp`.
+
+    The IQR of an empty array is `np.nan`.
+
+    .. versionadded:: 0.18.0
+
+    Parameters
+    ----------
+    x : array_like
+        Input array or object that can be converted to an array.
+    axis : int or sequence of int, optional
+        Axis along which the range is computed. The default is to
+        compute the IQR for the entire array.
+    rng : two-element sequence of floats in the range [0, 100], optional
+        Percentiles over which to compute the range. Each must be
+        between 0 and 100, inclusive. The default is the true IQR:
+        ``(25, 75)``. The order of the elements is not important.
+    scale : scalar or str or array_like of reals, optional
+        The numerical value of scale will be divided out of the final
+        result. The following string value is also recognized:
+
+          * 'normal' : Scale by
+            :math:`2 \sqrt{2} erf^{-1}(\frac{1}{2}) \approx 1.349`.
+
+        The default is 1.0.
+        Array-like `scale` of real dtype is also allowed, as long
+        as it broadcasts correctly to the output such that
+        ``out / scale`` is a valid operation. The output dimensions
+        depend on the input array, `x`, the `axis` argument, and the
+        `keepdims` flag.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan.
+        The following options are available (default is 'propagate'):
+
+          * 'propagate': returns nan
+          * 'raise': throws an error
+          * 'omit': performs the calculations ignoring nan values
+    interpolation : str, optional
+        Specifies the interpolation method to use when the percentile
+        boundaries lie between two data points ``i`` and ``j``.
+        The following options are available (default is 'linear'):
+
+          * 'linear': ``i + (j - i)*fraction``, where ``fraction`` is the
+            fractional part of the index surrounded by ``i`` and ``j``.
+          * 'lower': ``i``.
+          * 'higher': ``j``.
+          * 'nearest': ``i`` or ``j`` whichever is nearest.
+          * 'midpoint': ``(i + j)/2``.
+
+        For NumPy >= 1.22.0, the additional options provided by the ``method``
+        keyword of `numpy.percentile` are also valid.
+
+    keepdims : bool, optional
+        If this is set to True, the reduced axes are left in the
+        result as dimensions with size one. With this option, the result
+        will broadcast correctly against the original array `x`.
+
+    Returns
+    -------
+    iqr : scalar or ndarray
+        If ``axis=None``, a scalar is returned. If the input contains
+        integers or floats of smaller precision than ``np.float64``, then the
+        output data-type is ``np.float64``. Otherwise, the output data-type is
+        the same as that of the input.
+
+    See Also
+    --------
+    numpy.std, numpy.var
+
+    References
+    ----------
+    .. [1] "Interquartile range" https://en.wikipedia.org/wiki/Interquartile_range
+    .. [2] "Robust measures of scale" https://en.wikipedia.org/wiki/Robust_measures_of_scale
+    .. [3] "Quantile" https://en.wikipedia.org/wiki/Quantile
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import iqr
+    >>> x = np.array([[10, 7, 4], [3, 2, 1]])
+    >>> x
+    array([[10,  7,  4],
+           [ 3,  2,  1]])
+    >>> iqr(x)
+    4.0
+    >>> iqr(x, axis=0)
+    array([ 3.5,  2.5,  1.5])
+    >>> iqr(x, axis=1)
+    array([ 3.,  1.])
+    >>> iqr(x, axis=1, keepdims=True)
+    array([[ 3.],
+           [ 1.]])
+
+    """
+    # Array namespace matching the type of `x`; used for all array operations.
+    xp = array_namespace(x)
+
+    # An error may be raised here, so fail-fast, before doing lengthy
+    # computations, even though `scale` is not used until later
+    if isinstance(scale, str):
+        scale_key = scale.lower()
+        if scale_key not in _scale_conversions:
+            raise ValueError(f"{scale} not a valid scale for `iqr`")
+        scale = _scale_conversions[scale_key]
+
+    if len(rng) != 2:
+        raise TypeError("`rng` must be a two element sequence.")
+
+    if np.isnan(rng).any():  # OK to use NumPy; this shouldn't be an array
+        raise ValueError("`rng` must not contain NaNs.")
+
+    # Put the two percentiles in ascending order and convert them from
+    # percentages to fractions; the bounds check below is on the fractions.
+    rng = (rng[0]/100, rng[1]/100) if rng[0] < rng[1] else (rng[1]/100, rng[0]/100)
+
+    if rng[0] < 0 or rng[1] > 1:
+        raise ValueError("Elements of `rng` must be in the range [0, 100].")
+
+    # These four method names are forwarded to `stats.quantile` with a leading
+    # underscore; any other value is passed through unchanged.
+    if interpolation in {'lower', 'midpoint', 'higher', 'nearest'}:
+        interpolation = '_' + interpolation
+
+    # NOTE(review): the reduction is always taken along the last axis;
+    # presumably the `_axis_nan_policy_factory` decorator has already moved the
+    # requested `axis` there - confirm against the decorator.
+    rng = xp.asarray(rng, dtype=xp_result_type(x, force_floating=True, xp=xp))
+    pct = stats.quantile(x, rng, axis=-1, method=interpolation, keepdims=True)
+    # Upper quantile minus lower quantile.  Length-1 slices (rather than
+    # integer indices) keep the reduced axis so `keepdims` can be honored below.
+    out = pct[..., 1:2] - pct[..., 0:1]
+
+    # NOTE(review): for array-like `scale`, `scale != 1.0` is evaluated
+    # element-wise, and truth-testing a multi-element NumPy array raises
+    # ValueError - confirm that the array-like `scale` documented above works.
+    if scale != 1.0:
+        out /= scale
+
+    out = out if keepdims else xp.squeeze(out, axis=-1)
+    # A 0-d result is indexed with an empty tuple instead of being returned
+    # as a 0-d array.
+    return out[()] if out.ndim == 0 else out
+
+
+def _mad_1d(x, center, nan_policy):
+    # Median absolute deviation of a single 1-d NumPy array `x`.
+    #
+    # Helper for `median_abs_deviation`.  The caller is responsible for
+    # validation: `x` is a 1-d numpy array, `center` is callable, and
+    # `nan_policy` is either 'propagate' or (anything else) treated as
+    # 'omit' -- 'raise' is dealt with in `median_abs_deviation` itself.
+    # Empty or all-nan input yields nan silently (no warning is emitted).
+    nan_mask = np.isnan(x)
+    if nan_mask.any():
+        if nan_policy == 'propagate':
+            return np.nan
+        # 'omit': drop the nan entries and carry on with what is left.
+        x = x[~nan_mask]
+
+    # Nothing left to measure: the MAD of an empty array is nan.
+    if x.size == 0:
+        return np.nan
+
+    # Regular case: median of the absolute deviations from the center.
+    deviations = np.abs(x - center(x))
+    return np.median(deviations)
+
+
+@xp_capabilities(skip_backends=[('jax.numpy', 'not supported by `quantile`'),
+                                ('dask.array', 'not supported by `quantile`')])
+@_axis_nan_policy_factory(
+    lambda x: x, result_to_tuple=lambda x, _: (x,), n_outputs=1, default_axis=0
+)
+def median_abs_deviation(x, axis=0, center=None, scale=1.0,
+                         nan_policy='propagate'):
+    r"""
+    Compute the median absolute deviation of the data along the given axis.
+
+    The median absolute deviation (MAD, [1]_) computes the median over the
+    absolute deviations from the median. It is a measure of dispersion
+    similar to the standard deviation but more robust to outliers [2]_.
+
+    The MAD of an empty array is ``np.nan``.
+
+    .. versionadded:: 1.5.0
+
+    Parameters
+    ----------
+    x : array_like
+        Input array or object that can be converted to an array.
+    axis : int or None, optional
+        Axis along which the MAD is computed. Default is 0. If None, compute
+        the MAD over the entire array.
+    center : callable, optional
+        A function that will return the central value. The default is to use
+        the median (``np.median`` for NumPy input). Any user defined function
+        used will need to have the function signature ``func(arr, axis)``.
+    scale : scalar or str, optional
+        The numerical value of scale will be divided out of the final
+        result. The default is 1.0. The string "normal" is also accepted,
+        and results in `scale` being the inverse of the standard normal
+        quantile function at 0.75, which is approximately 0.67449.
+        Array-like scale is also allowed, as long as it broadcasts correctly
+        to the output such that ``out / scale`` is a valid operation. The
+        output dimensions depend on the input array, `x`, and the `axis`
+        argument.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan.
+        The following options are available (default is 'propagate'):
+
+        * 'propagate': returns nan
+        * 'raise': throws an error
+        * 'omit': performs the calculations ignoring nan values
+
+    Returns
+    -------
+    mad : scalar or ndarray
+        If ``axis=None``, a scalar is returned. If the input contains
+        integers or floats of smaller precision than ``np.float64``, then the
+        output data-type is ``np.float64``. Otherwise, the output data-type is
+        the same as that of the input.
+
+    See Also
+    --------
+    numpy.std, numpy.var, numpy.median, scipy.stats.iqr, scipy.stats.tmean,
+    scipy.stats.tstd, scipy.stats.tvar
+
+    Notes
+    -----
+    The `center` argument only affects the calculation of the central value
+    around which the MAD is calculated. That is, passing in ``center=np.mean``
+    will calculate the MAD around the mean - it will not calculate the *mean*
+    absolute deviation.
+
+    The input array may contain `inf`, but if `center` returns `inf`, the
+    corresponding MAD for that data will be `nan`.
+
+    References
+    ----------
+    .. [1] "Median absolute deviation",
+           https://en.wikipedia.org/wiki/Median_absolute_deviation
+    .. [2] "Robust measures of scale",
+           https://en.wikipedia.org/wiki/Robust_measures_of_scale
+
+    Examples
+    --------
+    When comparing the behavior of `median_abs_deviation` with ``np.std``,
+    the latter is affected when we change a single value of an array to have an
+    outlier value while the MAD hardly changes:
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> x = stats.norm.rvs(size=100, scale=1, random_state=123456)
+    >>> x.std()
+    0.9973906394005013
+    >>> stats.median_abs_deviation(x)
+    0.82832610097857
+    >>> x[0] = 345.6
+    >>> x.std()
+    34.42304872314415
+    >>> stats.median_abs_deviation(x)
+    0.8323442311590675
+
+    Axis handling example:
+
+    >>> x = np.array([[10, 7, 4], [3, 2, 1]])
+    >>> x
+    array([[10,  7,  4],
+           [ 3,  2,  1]])
+    >>> stats.median_abs_deviation(x)
+    array([3.5, 2.5, 1.5])
+    >>> stats.median_abs_deviation(x, axis=None)
+    2.0
+
+    Scale normal example:
+
+    >>> x = stats.norm.rvs(size=1000000, scale=2, random_state=123456)
+    >>> stats.median_abs_deviation(x)
+    1.3487398527041636
+    >>> stats.median_abs_deviation(x, scale='normal')
+    1.9996446978061115
+
+    """
+    # Array namespace matching the type of `x`.
+    xp = array_namespace(x)
+    # Median along an axis: `xp.median` for NumPy input, otherwise the 0.5
+    # quantile computed by `stats.quantile`.
+    xp_median = (xp.median if is_numpy(xp)
+                 else lambda x, axis: stats.quantile(x, 0.5, axis=axis))
+    # The median doubles as the default `center`; it is also always used for
+    # the outer reduction below, whatever `center` the caller supplies.
+    center = (xp_median if center is None else center)
+
+    if not callable(center):
+        raise TypeError("The argument 'center' must be callable. The given "
+                        f"value {repr(center)} is not callable.")
+
+    # An error may be raised here, so fail-fast, before doing lengthy
+    # computations, even though `scale` is not used until later
+    if isinstance(scale, str):
+        if scale.lower() == 'normal':
+            scale = 0.6744897501960817  # special.ndtri(0.75)
+        else:
+            raise ValueError(f"{scale} is not a valid scale value.")
+
+    # Wrap the call to center() in expand_dims() so it acts like
+    # keepdims=True was used.
+    med = xp.expand_dims(center(x, axis=axis), axis=axis)
+    # Median of the absolute deviations from the central value.
+    mad = xp_median(xp.abs(x - med), axis=axis)
+
+    return mad / scale
+
+
+#####################################
+#         TRIMMING FUNCTIONS        #
+#####################################
+
+
+# Result container returned by `sigmaclip`: the surviving elements followed by
+# the lower and upper clipping thresholds.
+SigmaclipResult = namedtuple('SigmaclipResult', ['clipped', 'lower', 'upper'])
+
+
+@xp_capabilities(skip_backends=[('dask.array', "doesn't know array size")],
+                 jax_jit=False)
+def sigmaclip(a, low=4., high=4.):
+    """Perform iterative sigma-clipping of array elements.
+
+    Starting from the full sample, all elements outside the critical range are
+    removed, i.e. all elements of the current sample `c` that satisfy either
+    of the following conditions::
+
+        c < mean(c) - std(c)*low
+        c > mean(c) + std(c)*high
+
+    The iteration continues with the updated sample until no
+    elements are outside the (updated) range.
+
+    Parameters
+    ----------
+    a : array_like
+        Data array, will be raveled if not 1-D.
+    low : float, optional
+        Lower bound factor of sigma clipping. Default is 4.
+    high : float, optional
+        Upper bound factor of sigma clipping. Default is 4.
+
+    Returns
+    -------
+    clipped : ndarray
+        Input array with clipped elements removed.
+    lower : float
+        Lower threshold value used for clipping.
+    upper : float
+        Upper threshold value used for clipping.
+
+    Notes
+    -----
+    This function iteratively *removes* observations. Once observations are
+    removed, they are not re-added in subsequent iterations. Consequently,
+    although it is often the case that ``clipped`` is identical to
+    ``a[(a >= lower) & (a <= upper)]``, this property is not guaranteed to be
+    satisfied; ``clipped`` may have fewer elements.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import sigmaclip
+    >>> a = np.concatenate((np.linspace(9.5, 10.5, 31),
+    ...                     np.linspace(0, 20, 5)))
+    >>> fact = 1.5
+    >>> c, low, upp = sigmaclip(a, fact, fact)
+    >>> c
+    array([  9.96666667,  10.        ,  10.03333333,  10.        ])
+    >>> c.var(), c.std()
+    (0.00055555555555555165, 0.023570226039551501)
+    >>> low, c.mean() - fact*c.std(), c.min()
+    (9.9646446609406727, 9.9646446609406727, 9.9666666666666668)
+    >>> upp, c.mean() + fact*c.std(), c.max()
+    (10.035355339059327, 10.035355339059327, 10.033333333333333)
+
+    >>> a = np.concatenate((np.linspace(9.5, 10.5, 11),
+    ...                     np.linspace(-100, -50, 3)))
+    >>> c, low, upp = sigmaclip(a, 1.8, 1.8)
+    >>> (c == np.linspace(9.5, 10.5, 11)).all()
+    True
+
+    """
+    xp = array_namespace(a)
+    sample = xp_ravel(xp.asarray(a))
+
+    # Clip repeatedly; stop after the first pass that removes nothing.  The
+    # thresholds of that last pass are the ones reported to the caller.
+    while True:
+        spread = xp.std(sample)
+        middle = xp.mean(sample)
+        n_before = xp_size(sample)
+        critlower = middle - spread * low
+        critupper = middle + spread * high
+        inside = (sample >= critlower) & (sample <= critupper)
+        sample = sample[inside]
+        n_removed = n_before - xp_size(sample)
+        if not n_removed:
+            break
+
+    return SigmaclipResult(sample, critlower, critupper)
+
+
+@xp_capabilities(np_only=True)
+def trimboth(a, proportiontocut, axis=0):
+    """Slice off a proportion of items from both ends of an array.
+
+    Slice off the passed proportion of items from both ends of the passed
+    array (i.e., with `proportiontocut` = 0.1, slices leftmost 10% **and**
+    rightmost 10% of scores). The trimmed values are the lowest and
+    highest ones.
+    Slice off less if proportion results in a non-integer slice index (i.e.
+    conservatively slices off `proportiontocut`).
+
+    Parameters
+    ----------
+    a : array_like
+        Data to trim.
+    proportiontocut : float
+        Proportion (in range 0-1) of total data set to trim off each end.
+    axis : int or None, optional
+        Axis along which to trim data. Default is 0. If None, compute over
+        the whole array `a`.
+
+    Returns
+    -------
+    out : ndarray
+        Trimmed version of array `a`. The order of the trimmed content
+        is undefined.
+
+    See Also
+    --------
+    trim_mean
+
+    Examples
+    --------
+    Create an array of 10 values and trim 10% of those values from each end:
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> a = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+    >>> stats.trimboth(a, 0.1)
+    array([1, 3, 2, 4, 5, 6, 7, 8])
+
+    Note that the elements of the input array are trimmed by value, but the
+    output array is not necessarily sorted.
+
+    The proportion to trim is rounded down to the nearest integer. For
+    instance, trimming 25% of the values from each end of an array of 10
+    values will return an array of 6 values:
+
+    >>> b = np.arange(10)
+    >>> stats.trimboth(b, 1/4).shape
+    (6,)
+
+    Multidimensional arrays can be trimmed along any axis or across the entire
+    array:
+
+    >>> c = [2, 4, 6, 8, 0, 1, 3, 5, 7, 9]
+    >>> d = np.array([a, b, c])
+    >>> stats.trimboth(d, 0.4, axis=0).shape
+    (1, 10)
+    >>> stats.trimboth(d, 0.4, axis=1).shape
+    (3, 2)
+    >>> stats.trimboth(d, 0.4, axis=None).shape
+    (6,)
+
+    """
+    data = np.asarray(a)
+
+    # Nothing to trim: hand back the (converted) empty array unchanged.
+    if data.size == 0:
+        return data
+
+    # `axis=None` means "treat the whole array as one flat sample".
+    if axis is None:
+        data, axis = data.ravel(), 0
+
+    n_obs = data.shape[axis]
+    n_cut = int(proportiontocut * n_obs)
+    first, stop = n_cut, n_obs - n_cut
+    if first >= stop:
+        raise ValueError("Proportion too big.")
+
+    # A partial sort is enough: partitioning at both boundaries puts the
+    # `n_cut` smallest values before `first` and the `n_cut` largest from
+    # `stop` onwards, without fully ordering what lies in between.
+    partitioned = np.partition(data, (first, stop - 1), axis)
+
+    # Keep everything along the other axes, and only [first, stop) along `axis`.
+    index = [slice(None)] * partitioned.ndim
+    index[axis] = slice(first, stop)
+    return partitioned[tuple(index)]
+
+
+@xp_capabilities(np_only=True)
+def trim1(a, proportiontocut, tail='right', axis=0):
+    """Slice off a proportion from ONE end of the passed array distribution.
+
+    If `proportiontocut` = 0.1, slices off 'leftmost' or 'rightmost'
+    10% of scores. The lowest or highest values are trimmed (depending on
+    the tail).
+    Slice off less if proportion results in a non-integer slice index
+    (i.e. conservatively slices off `proportiontocut` ).
+
+    Parameters
+    ----------
+    a : array_like
+        Input array.
+    proportiontocut : float
+        Fraction to cut off of 'left' or 'right' of distribution.
+    tail : {'left', 'right'}, optional
+        Which end to trim: 'left' removes the lowest values, 'right' the
+        highest. Matching is case-insensitive. Defaults to 'right'.
+    axis : int or None, optional
+        Axis along which to trim data. Default is 0. If None, compute over
+        the whole array `a`.
+
+    Returns
+    -------
+    trim1 : ndarray
+        Trimmed version of array `a`. The order of the trimmed content is
+        undefined. If `proportiontocut` is 1 or more, an empty list is
+        returned instead.
+
+    Raises
+    ------
+    ValueError
+        If `tail` is neither 'left' nor 'right'.
+
+    Examples
+    --------
+    Create an array of 10 values and trim 20% of its lowest values:
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> a = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+    >>> stats.trim1(a, 0.2, 'left')
+    array([2, 4, 3, 5, 6, 7, 8, 9])
+
+    Note that the elements of the input array are trimmed by value, but the
+    output array is not necessarily sorted.
+
+    The proportion to trim is rounded down to the nearest integer. For
+    instance, trimming 25% of the values from an array of 10 values will
+    return an array of 8 values:
+
+    >>> b = np.arange(10)
+    >>> stats.trim1(b, 1/4).shape
+    (8,)
+
+    Multidimensional arrays can be trimmed along any axis or across the entire
+    array:
+
+    >>> c = [2, 4, 6, 8, 0, 1, 3, 5, 7, 9]
+    >>> d = np.array([a, b, c])
+    >>> stats.trim1(d, 0.8, axis=0).shape
+    (1, 10)
+    >>> stats.trim1(d, 0.8, axis=1).shape
+    (3, 2)
+    >>> stats.trim1(d, 0.8, axis=None).shape
+    (6,)
+
+    """
+    a = np.asarray(a)
+    if axis is None:
+        # `axis=None` means "treat the whole array as one flat sample".
+        a = a.ravel()
+        axis = 0
+
+    nobs = a.shape[axis]
+
+    # avoid possible corner case: trimming everything (or more) leaves
+    # nothing, and the cut indices below would not be valid.
+    if proportiontocut >= 1:
+        return []
+
+    which = tail.lower()
+    if which == 'right':
+        lowercut = 0
+        uppercut = nobs - int(proportiontocut * nobs)
+    elif which == 'left':
+        lowercut = int(proportiontocut * nobs)
+        uppercut = nobs
+    else:
+        # Previously an unrecognized `tail` left the cut indices unbound and
+        # surfaced as an UnboundLocalError further down.
+        raise ValueError(f"`tail` must be 'left' or 'right'; got {tail!r}.")
+
+    # With no observations along `axis` there is nothing to trim, and
+    # `np.partition` would reject the (out-of-bounds) kth indices.
+    if nobs == 0:
+        return a
+
+    # A partial sort is enough: partitioning at both boundaries guarantees
+    # that positions [lowercut, uppercut) hold exactly the values to keep.
+    atmp = np.partition(a, (lowercut, uppercut - 1), axis)
+
+    # Keep everything along the other axes, and only the kept range on `axis`.
+    sl = [slice(None)] * atmp.ndim
+    sl[axis] = slice(lowercut, uppercut)
+    return atmp[tuple(sl)]
+
+
+@xp_capabilities()
+@_axis_nan_policy_factory(lambda x: x, result_to_tuple=lambda x, _: (x,), n_outputs=1)
+def trim_mean(a, proportiontocut, axis=0):
+    """Return mean of array after trimming a specified fraction of extreme values.
+
+    Removes the specified proportion of elements from *each* end of the
+    sorted array, then computes the mean of the remaining elements.
+
+    Parameters
+    ----------
+    a : array_like
+        Input array.
+    proportiontocut : float
+        Fraction of the most positive and most negative elements to remove.
+        When the specified proportion does not result in an integer number of
+        elements, the number of elements to trim is rounded down.
+    axis : int or None, default: 0
+        Axis along which the trimmed means are computed.
+        If None, compute over the raveled array.
+
+    Returns
+    -------
+    trim_mean : ndarray
+        Mean of trimmed array.
+
+    See Also
+    --------
+    trimboth : Remove a proportion of elements from each end of an array.
+    tmean : Compute the mean after trimming values outside specified limits.
+
+    Notes
+    -----
+    For 1-D array `a`, `trim_mean` is approximately equivalent to the following
+    calculation::
+
+        import numpy as np
+        a = np.sort(a)
+        m = int(proportiontocut * len(a))
+        np.mean(a[m: len(a) - m])
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> x = [1, 2, 3, 5]
+    >>> stats.trim_mean(x, 0.25)
+    2.5
+
+    When the specified proportion does not result in an integer number of
+    elements, the number of elements to trim is rounded down.
+
+    >>> stats.trim_mean(x, 0.24999) == np.mean(x)
+    True
+
+    Use `axis` to specify the axis along which the calculation is performed.
+
+    >>> x2 = [[1, 2, 3, 5],
+    ...       [10, 20, 30, 50]]
+    >>> stats.trim_mean(x2, 0.25)
+    array([ 5.5, 11. , 16.5, 27.5])
+    >>> stats.trim_mean(x2, 0.25, axis=1)
+    array([ 2.5, 25. ])
+
+    """
+    xp = array_namespace(a)
+    data = xp.asarray(a)
+
+    # The trimmed mean of an empty array is nan.
+    if xp_size(data) == 0:
+        return _get_nan(data, xp=xp)
+
+    # `axis=None` means "treat the whole array as one flat sample".
+    if axis is None:
+        data, axis = xp_ravel(data), 0
+
+    n_obs = data.shape[axis]
+    n_cut = int(proportiontocut * n_obs)
+    first, stop = n_cut, n_obs - n_cut
+    if first > stop:
+        raise ValueError("Proportion too big.")
+
+    # NumPy can get away with a partial sort around the two cut points; other
+    # backends fall back to a full sort along `axis`.
+    if is_numpy(xp):
+        ordered = np.partition(data, (first, stop - 1), axis)
+    else:
+        ordered = xp.sort(data, axis=axis)
+
+    # Keep everything along the other axes, and only [first, stop) along `axis`.
+    index = [slice(None)] * ordered.ndim
+    index[axis] = slice(first, stop)
+    kept = ordered[tuple(index)]
+
+    # Promote to floating point before averaging.
+    kept = xp_promote(kept, force_floating=True, xp=xp)
+    return xp.mean(kept, axis=axis)
+
+
+# Result container returned by `f_oneway`: the F statistic and its p-value.
+F_onewayResult = namedtuple('F_onewayResult', ['statistic', 'pvalue'])
+
+
+def _f_oneway_is_too_small(samples, kwargs=None, axis=-1):
+    # Decide whether `samples` are too small for `f_oneway` to produce a
+    # meaningful result.  Returns True when the caller should give up (any
+    # sample is empty along `axis`, or every sample has length 1 -- the latter
+    # also emits a SmallSampleWarning), False otherwise.  Fewer than two
+    # samples is a usage error and raises TypeError.  `kwargs` is accepted
+    # but not used here.
+    n_samples = len(samples)
+    message = f"At least two samples are required; got {n_samples}."
+    if n_samples < 2:
+        raise TypeError(message)
+
+    # Check this after forming alldata, so shape errors are detected
+    # and reported before checking for 0 length inputs.
+    has_empty_sample = any(sample.shape[axis] == 0 for sample in samples)
+    if has_empty_sample:
+        return True
+
+    # Must have at least one group with length greater than 1.
+    only_singletons = all(sample.shape[axis] == 1 for sample in samples)
+    if not only_singletons:
+        return False
+
+    msg = ('all input arrays have length 1.  f_oneway requires that at '
+           'least one input has length greater than 1.')
+    warnings.warn(SmallSampleWarning(msg), stacklevel=2)
+    return True
+
+
+@xp_capabilities(jax_jit=False, cpu_only=True, exceptions=['cupy'])
+@_axis_nan_policy_factory(
+    F_onewayResult, n_samples=None, too_small=_f_oneway_is_too_small)
+def f_oneway(*samples, axis=0, equal_var=True):
+    """Perform one-way ANOVA.
+
+    The one-way ANOVA tests the null hypothesis that two or more groups have
+    the same population mean.  The test is applied to samples from two or
+    more groups, possibly with differing sizes.
+
+    Parameters
+    ----------
+    sample1, sample2, ... : array_like
+        The sample measurements for each group.  There must be at least
+        two arguments.  If the arrays are multidimensional, then all the
+        dimensions of the array must be the same except for `axis`.
+    axis : int, optional
+        Axis of the input arrays along which the test is applied.
+        Default is 0.
+    equal_var: bool, optional
+        If True (default), perform a standard one-way ANOVA test that
+        assumes equal population variances [2]_.
+        If False, perform Welch's ANOVA test, which does not assume
+        equal population variances [4]_.
+
+        .. versionadded:: 1.16.0
+
+    Returns
+    -------
+    statistic : float
+        The computed F statistic of the test.
+    pvalue : float
+        The associated p-value from the F distribution.
+
+    Warns
+    -----
+    `~scipy.stats.ConstantInputWarning`
+        Emitted if all values within each of the input arrays are identical.
+        In this case the F statistic is either infinite or isn't defined,
+        so ``np.inf`` or ``np.nan`` is returned.
+
+    RuntimeWarning
+        Emitted if the length of any input array is 0, or if all the input
+        arrays have length 1.  ``np.nan`` is returned for the F statistic
+        and the p-value in these cases.
+
+    Notes
+    -----
+    The ANOVA test has important assumptions that must be satisfied in order
+    for the associated p-value to be valid.
+
+    1. The samples are independent.
+    2. Each sample is from a normally distributed population.
+    3. The population standard deviations of the groups are all equal.  This
+       property is known as homoscedasticity.
+
+    If these assumptions are not true for a given set of data, it may still
+    be possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`) or
+    the Alexander-Govern test (`scipy.stats.alexandergovern`) although with
+    some loss of power.
+
+    The length of each group must be at least one, and there must be at
+    least one group with length greater than one.  If these conditions
+    are not satisfied, a warning is generated and (``np.nan``, ``np.nan``)
+    is returned.
+
+    If all values in each group are identical, and there exist at least two
+    groups with different values, the function generates a warning and
+    returns (``np.inf``, 0).
+
+    If all values in all groups are the same, function generates a warning
+    and returns (``np.nan``, ``np.nan``).
+
+    The algorithm is from Heiman [2]_, pp.394-7.
+
+    References
+    ----------
+    .. [1] R. Lowry, "Concepts and Applications of Inferential Statistics",
+           Chapter 14, 2014, http://vassarstats.net/textbook/
+
+    .. [2] G.W. Heiman, "Understanding research methods and statistics: An
+           integrated introduction for psychology", Houghton, Mifflin and
+           Company, 2001.
+
+    .. [3] G.H. McDonald, "Handbook of Biological Statistics", One-way ANOVA.
+           http://www.biostathandbook.com/onewayanova.html
+
+    .. [4] B. L. Welch, "On the Comparison of Several Mean Values:
+           An Alternative Approach", Biometrika, vol. 38, no. 3/4,
+           pp. 330-336, 1951, doi: 10.2307/2332579.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import f_oneway
+
+    Here are some data [3]_ on a shell measurement (the length of the anterior
+    adductor muscle scar, standardized by dividing by length) in the mussel
+    Mytilus trossulus from five locations: Tillamook, Oregon; Newport, Oregon;
+    Petersburg, Alaska; Magadan, Russia; and Tvarminne, Finland, taken from a
+    much larger data set used in McDonald et al. (1991).
+
+    >>> tillamook = [0.0571, 0.0813, 0.0831, 0.0976, 0.0817, 0.0859, 0.0735,
+    ...              0.0659, 0.0923, 0.0836]
+    >>> newport = [0.0873, 0.0662, 0.0672, 0.0819, 0.0749, 0.0649, 0.0835,
+    ...            0.0725]
+    >>> petersburg = [0.0974, 0.1352, 0.0817, 0.1016, 0.0968, 0.1064, 0.105]
+    >>> magadan = [0.1033, 0.0915, 0.0781, 0.0685, 0.0677, 0.0697, 0.0764,
+    ...            0.0689]
+    >>> tvarminne = [0.0703, 0.1026, 0.0956, 0.0973, 0.1039, 0.1045]
+    >>> f_oneway(tillamook, newport, petersburg, magadan, tvarminne)
+    F_onewayResult(statistic=7.121019471642447, pvalue=0.0002812242314534544)
+
+    `f_oneway` accepts multidimensional input arrays.  When the inputs
+    are multidimensional and `axis` is not given, the test is performed
+    along the first axis of the input arrays.  For the following data, the
+    test is performed three times, once for each column.
+
+    >>> a = np.array([[9.87, 9.03, 6.81],
+    ...               [7.18, 8.35, 7.00],
+    ...               [8.39, 7.58, 7.68],
+    ...               [7.45, 6.33, 9.35],
+    ...               [6.41, 7.10, 9.33],
+    ...               [8.00, 8.24, 8.44]])
+    >>> b = np.array([[6.35, 7.30, 7.16],
+    ...               [6.65, 6.68, 7.63],
+    ...               [5.72, 7.73, 6.72],
+    ...               [7.01, 9.19, 7.41],
+    ...               [7.75, 7.87, 8.30],
+    ...               [6.90, 7.97, 6.97]])
+    >>> c = np.array([[3.31, 8.77, 1.01],
+    ...               [8.25, 3.24, 3.62],
+    ...               [6.32, 8.81, 5.19],
+    ...               [7.48, 8.83, 8.91],
+    ...               [8.59, 6.01, 6.07],
+    ...               [3.07, 9.72, 7.48]])
+    >>> F = f_oneway(a, b, c)
+    >>> F.statistic
+    array([1.75676344, 0.03701228, 3.76439349])
+    >>> F.pvalue
+    array([0.20630784, 0.96375203, 0.04733157])
+
+    Welch ANOVA will be performed if `equal_var` is False.
+
+    """
+    xp = array_namespace(*samples)
+    samples = xp_promote(*samples, force_floating=True, xp=xp)
+
+    if len(samples) < 2:
+        raise TypeError('at least two inputs are required;'
+                        f' got {len(samples)}.')
+
+    # ANOVA on N groups, each in its own array
+    num_groups = len(samples)
+
+    # axis is guaranteed to be -1 by the _axis_nan_policy decorator
+    alldata = xp.concat(samples, axis=-1)
+    bign = _length_nonmasked(alldata, axis=-1, xp=xp)
+
+    # Check if the inputs are too small (for testing _axis_nan_policy decorator)
+    if _f_oneway_is_too_small(samples):
+        NaN = _get_nan(*samples, xp=xp)
+        return F_onewayResult(NaN, NaN)
+
+    # Check if all values within each group are identical, and if the common
+    # value in at least one group is different from that in another group.
+    # Based on https://github.com/scipy/scipy/issues/11669
+
+    # If axis=0, say, and the groups have shape (n0, ...), (n1, ...), ...,
+    # then is_const is a boolean array with shape (num_groups, ...).
+    # It is True if the values within the groups along the axis slice are
+    # identical. In the typical case where each input array is 1-d, is_const is
+    # a 1-d array with length num_groups.
+    is_const = xp.concat([xp.all(xp.diff(sample, axis=-1) == 0, axis=-1, keepdims=True)
+                          for sample in samples], axis=-1)
+
+    # all_const is a boolean array with shape (...) (see previous comment).
+    # It is True if the values within each group along the axis slice are
+    # the same (e.g. [[3, 3, 3], [5, 5, 5, 5], [4, 4, 4]]).
+    all_const = xp.all(is_const, axis=-1)
+
+    # all_same_const is True if all the values in the groups along the axis=0
+    # slice are the same (e.g. [[3, 3, 3], [3, 3, 3, 3], [3, 3, 3]]).
+    all_same_const = xp.all(xp.diff(alldata, axis=-1) == 0, axis=-1)
+
+    if not isinstance(equal_var, bool):
+        raise TypeError("Expected a boolean value for 'equal_var'")
+
+    if equal_var:
+        # Determine the mean of the data, and subtract that from all inputs to a
+        # variance (via sum_of_sq / sq_of_sum) calculation.  Variance is invariant
+        # to a shift in location, and centering all data around zero vastly
+        # improves numerical stability.
+        offset = xp.mean(alldata, axis=-1, keepdims=True)
+        alldata = alldata - offset
+
+        normalized_ss = xp.sum(alldata, axis=-1)**2. / bign
+
+        sstot = xp.vecdot(alldata, alldata, axis=-1) - normalized_ss
+
+        ssbn = 0
+        for sample in samples:
+            smo_ss = xp.sum(sample - offset, axis=-1)**2.
+            ssbn = ssbn + smo_ss / _length_nonmasked(sample, axis=-1, xp=xp)
+
+        # Naming: variables ending in bn/b are for "between treatments", wn/w are
+        # for "within treatments"
+        ssbn = ssbn - normalized_ss
+        sswn = sstot - ssbn
+        dfbn = num_groups - 1
+        dfwn = bign - num_groups
+        msb = ssbn / dfbn
+        msw = sswn / dfwn
+        with np.errstate(divide='ignore', invalid='ignore'):
+            f = msb / msw
+        dfn, dfd = dfbn, dfwn
+
+    else:
+        # calculate basic statistics for each sample
+        # Beginning of second paragraph [4] page 1:
+        # "As a particular case $y_t$ may be the means ... of samples
+        y_t = xp.stack([xp.mean(sample, axis=-1) for sample in samples])
+        # "... of $n_t$ observations..."
+        if is_marray(xp):
+            n_t = xp.stack([_length_nonmasked(sample, axis=-1, xp=xp)
+                            for sample in samples])
+            n_t = xp.asarray(n_t, dtype=n_t.dtype)
+        else:
+            n_t = xp.asarray([sample.shape[-1] for sample in samples], dtype=y_t.dtype)
+            n_t = xp.reshape(n_t, (-1,) + (1,) * (y_t.ndim - 1))
+        # "... from $k$ different normal populations..."
+        k = len(samples)
+        # "The separate samples provide estimates $s_t^2$ of the $\sigma_t^2$."
+        s_t2 = xp.stack([xp.var(sample, axis=-1, correction=1) for sample in samples])
+
+        # calculate weight by number of data and variance
+        # "we have $\lambda_t = 1 / n_t$ ... where w_t = 1 / {\lambda_t s_t^2}$"
+        w_t = n_t / s_t2
+        # sum of w_t
+        s_w_t = xp.sum(w_t, axis=0)
+
+        # calculate adjusted grand mean
+        # "... and $\hat{y} = \sum w_t y_t / \sum w_t$. When all..."
+        axis_zero = -w_t.ndim
+        y_hat = xp.vecdot(w_t, y_t, axis=axis_zero) / xp.sum(w_t, axis=0)
+
+        # adjust f statistic
+        # ref.[4] p.334 eq.29
+        numerator =  xp.vecdot(w_t, (y_t - y_hat)**2, axis=axis_zero) / (k - 1)
+        denominator = (
+                1 + 2 * (k - 2) / (k**2 - 1) *
+                xp.vecdot(1 / (n_t - 1), (1 - w_t / s_w_t)**2, axis=axis_zero)
+        )
+        f = numerator / denominator
+
+        # degree of freedom 1
+        # ref.[4] p.334 eq.30
+        hat_f1 = k - 1
+
+        # adjusted degree of freedom 2
+        # ref.[4] p.334 eq.30
+        hat_f2 = (
+                (k**2 - 1) /
+                (3 * xp.vecdot(1 / (n_t - 1), (1 - w_t / s_w_t)**2, axis=axis_zero))
+        )
+
+        dfn, dfd = hat_f1, hat_f2
+
+    # Fix any f values that should be inf or nan because the corresponding
+    # inputs were constant.
+    f = xpx.at(f)[all_const].set(xp.inf)
+    f = xpx.at(f)[all_same_const].set(xp.nan)
+
+    # calculate p value
+    # ref.[4] p.334 eq.28
+    prob = special.fdtrc(dfn, dfd, f)
+    prob = xp.asarray(prob, dtype=f.dtype)
+
+    f, prob = (f[()], prob[()]) if f.ndim == 0 else (f, prob)
+    return F_onewayResult(f, prob)
+
+
+@dataclass
+class AlexanderGovernResult:
+    statistic: float  # A statistic computed by `alexandergovern`
+    pvalue: float  # p-value of the test (chi-squared reference distribution)
+
+
+@xp_capabilities()
+@_axis_nan_policy_factory(
+    AlexanderGovernResult, n_samples=None,
+    result_to_tuple=lambda x, _: (x.statistic, x.pvalue),
+    too_small=1
+)
+def alexandergovern(*samples, nan_policy='propagate', axis=0):
+    """Perform the Alexander-Govern test.
+
+    The Alexander-Govern approximation tests the equality of k independent
+    means in the face of heterogeneity of variance. The test is applied to
+    samples from two or more groups, possibly with differing sizes.
+
+    Parameters
+    ----------
+    sample1, sample2, ... : array_like
+        The sample measurements for each group.  There must be at least
+        two samples, and each sample must contain at least two observations.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle input that contains nan.
+        The following options are available (default is 'propagate'):
+
+        * 'propagate': returns nan
+        * 'raise': throws an error
+        * 'omit': performs the calculations ignoring nan values
+
+    Returns
+    -------
+    res : AlexanderGovernResult
+        An object with attributes:
+
+        statistic : float
+            The computed A statistic of the test.
+        pvalue : float
+            The associated p-value from the chi-squared distribution.
+
+    Warns
+    -----
+    `~scipy.stats.ConstantInputWarning`
+        Raised if an input is a constant array.  The statistic is not defined
+        in this case, so ``np.nan`` is returned.
+
+    See Also
+    --------
+    f_oneway : one-way ANOVA
+
+    Notes
+    -----
+    The use of this test relies on several assumptions.
+
+    1. The samples are independent.
+    2. Each sample is from a normally distributed population.
+    3. Unlike `f_oneway`, this test does not assume homoscedasticity,
+       instead relaxing the assumption of equal variances.
+
+    Input samples must be finite, one dimensional, and with size greater than
+    one.
+
+    References
+    ----------
+    .. [1] Alexander, Ralph A., and Diane M. Govern. "A New and Simpler
+           Approximation for ANOVA under Variance Heterogeneity." Journal
+           of Educational Statistics, vol. 19, no. 2, 1994, pp. 91-101.
+           JSTOR, www.jstor.org/stable/1165140. Accessed 12 Sept. 2020.
+
+    Examples
+    --------
+    >>> from scipy.stats import alexandergovern
+
+    Here are some data on annual percentage rate of interest charged on
+    new car loans at nine of the largest banks in four American cities
+    taken from the National Institute of Standards and Technology's
+    ANOVA dataset.
+
+    We use `alexandergovern` to test the null hypothesis that all cities
+    have the same mean APR against the alternative that the cities do not
+    all have the same mean APR. We decide that a significance level of 5%
+    is required to reject the null hypothesis in favor of the alternative.
+
+    >>> atlanta = [13.75, 13.75, 13.5, 13.5, 13.0, 13.0, 13.0, 12.75, 12.5]
+    >>> chicago = [14.25, 13.0, 12.75, 12.5, 12.5, 12.4, 12.3, 11.9, 11.9]
+    >>> houston = [14.0, 14.0, 13.51, 13.5, 13.5, 13.25, 13.0, 12.5, 12.5]
+    >>> memphis = [15.0, 14.0, 13.75, 13.59, 13.25, 12.97, 12.5, 12.25,
+    ...           11.89]
+    >>> alexandergovern(atlanta, chicago, houston, memphis)
+    AlexanderGovernResult(statistic=4.65087071883494,
+                          pvalue=0.19922132490385214)
+
+    The p-value is 0.1992, indicating a nearly 20% chance of observing
+    such an extreme value of the test statistic under the null hypothesis.
+    This exceeds 5%, so we do not reject the null hypothesis in favor of
+    the alternative.
+
+    """
+    xp = array_namespace(*samples)
+    samples = _alexandergovern_input_validation(samples, nan_policy, axis, xp=xp)
+
+    # The following formula numbers reference the equation described on
+    # page 92 by Alexander, Govern. Formulas 5, 6, and 7 describe other
+    # tests that serve as the basis for equation (8) but are not needed
+    # to perform the test.
+
+    # precalculate mean and length of each sample (observations lie on axis -1)
+    lengths = [sample.shape[-1] for sample in samples]
+    means = xp.stack([_xp_mean(sample, axis=-1) for sample in samples])
+
+    # (1) determine standard error of the mean for each sample
+    se2 = [(_xp_var(sample, correction=1, axis=-1) / length)
+           for sample, length in zip(samples, lengths)]
+    standard_errors_squared = xp.stack(se2)
+    standard_errors = standard_errors_squared**0.5
+
+    # Special case: statistic is NaN when variance is (numerically) zero
+    eps = xp.finfo(standard_errors.dtype).eps
+    zero = standard_errors <= xp.abs(eps * means)
+    NaN = xp.asarray(xp.nan, dtype=standard_errors.dtype)
+    standard_errors = xp.where(zero, NaN, standard_errors)
+
+    # (2) define a weight for each sample (normalized over the sample axis 0)
+    inv_sq_se = 1 / standard_errors_squared
+    weights = inv_sq_se / xp.sum(inv_sq_se, axis=0, keepdims=True)
+
+    # (3) determine variance-weighted estimate of the common mean
+    # Consider replacing with vecdot when data-apis/array-api#910 is resolved
+    var_w = xp.sum(weights * means, axis=0, keepdims=True)
+
+    # (4) determine one-sample t statistic for each group
+    t_stats = _demean(means, var_w, axis=0, xp=xp) / standard_errors
+
+    # calculate parameters to be used in transformation
+    v = xp.asarray(lengths, dtype=t_stats.dtype) - 1  # sample size minus one
+    # align along 0th axis, which corresponds with separate samples
+    v = xp.reshape(v, (-1,) + (1,)*(t_stats.ndim-1))
+    a = v - .5
+    b = 48 * a**2
+    c = (a * xp.log(1 + (t_stats ** 2)/v))**.5
+
+    # (8) perform a normalizing transformation on t statistic
+    z = (c + ((c**3 + 3*c)/b) -
+         ((4*c**7 + 33*c**5 + 240*c**3 + 855*c) /
+          (b**2*10 + 8*b*c**4 + 1000*b)))
+
+    # (9) calculate statistic: sum of squared z over the sample axis
+    A = xp.vecdot(z, z, axis=-z.ndim)
+    A = A[()] if A.ndim == 0 else A  # data-apis/array-api-compat#355
+
+    # "[the p value is determined from] central chi-square random deviates
+    # with k - 1 degrees of freedom". Alexander, Govern (94)
+    df = xp.asarray(len(samples) - 1, dtype=A.dtype)
+    chi2 = _SimpleChi2(df)
+    p = _get_pvalue(A, chi2, alternative='greater', symmetric=False, xp=xp)
+    return AlexanderGovernResult(A, p)
+
+
+def _alexandergovern_input_validation(samples, nan_policy, axis, xp):
+    """Check the number and size of `samples`; return them with `axis` last."""
+    # NOTE: `nan_policy` is part of the signature but is not read here.
+    if len(samples) < 2:
+        raise TypeError(f"2 or more inputs required, got {len(samples)}")
+
+    # Every sample needs at least two observations along `axis`.
+    if any(arr.shape[axis] <= 1 for arr in samples):
+        raise ValueError("Input sample size must be greater than one.")
+
+    return [xp.moveaxis(arr, axis, -1) for arr in samples]
+
+
+def _pearsonr_fisher_ci(r, n, confidence_level, alternative):
+    """
+    Confidence interval for Pearson's R from Fisher's transformation.
+
+    The limits are formed around ``atanh(r)`` with standard error
+    ``sqrt(1/(n - 3))`` and mapped back through ``tanh``
+    (https://en.wikipedia.org/wiki/Fisher_transformation).
+    Where ``n <= 3`` the interval is set to (-1, 1).
+    """
+    xp = array_namespace(r)
+    device = xp_device(r)
+    unit = xp.ones_like(r)
+    n = xp.asarray(n, dtype=r.dtype, device=device)
+    confidence_level = xp.asarray(confidence_level, dtype=r.dtype, device=device)
+
+    # n <= 3 gives a non-finite standard error; those entries are reset below.
+    with np.errstate(divide='ignore', invalid='ignore'):
+        z_mid = xp.atanh(r)
+        z_se = xp.sqrt(1 / (n - 3))
+
+    # A two-sided interval leaves (1 - confidence_level)/2 in each tail.
+    two_sided = alternative == "two-sided"
+    prob = 0.5 + confidence_level/2 if two_sided else confidence_level
+    reach = special.ndtri(prob) * z_se
+
+    # A one-sided alternative pins the opposite limit to the end of [-1, 1].
+    if alternative == "less":
+        lower = -unit
+    else:
+        lower = xp.tanh(z_mid - reach)
+    if two_sided or alternative == "less":
+        upper = xp.tanh(z_mid + reach)
+    else:
+        # alternative == "greater":
+        upper = unit
+
+    too_few = (n <= 3)
+    if too_few.ndim == 0:
+        # This is Array API legal, but Dask doesn't like it.
+        too_few = xp.broadcast_to(too_few, lower.shape)
+    lower = xpx.at(lower)[too_few].set(-1)
+    upper = xpx.at(upper)[too_few].set(1)
+
+    lower = lower[()] if lower.ndim == 0 else lower
+    upper = upper[()] if upper.ndim == 0 else upper
+    return ConfidenceInterval(low=lower, high=upper)
+
+
+def _pearsonr_bootstrap_ci(confidence_level, method, x, y, alternative, axis):
+    """
+    Bootstrap confidence interval for Pearson's R, clipped to [-1, 1].
+    """
+    def statistic(x, y, axis):
+        r, _ = pearsonr(x, y, axis=axis)  # the p-value is not needed here
+        return r
+
+    res = bootstrap((x, y), statistic, paired=True, axis=axis,
+                    alternative=alternative, confidence_level=confidence_level,
+                    **method._asdict())
+    # One-sided bootstrap intervals have an infinite limit on one side.
+    limits = np.clip(res.confidence_interval, -1, 1)
+    return ConfidenceInterval(*limits)
+
+
+ConfidenceInterval = namedtuple('ConfidenceInterval', ['low', 'high'])  # CI limits
+
+PearsonRResultBase = _make_tuple_bunch('PearsonRResultBase',  # tuple-like base
+                                       ['statistic', 'pvalue'], [])
+
+
+class PearsonRResult(PearsonRResultBase):
+    """
+    Result object returned by `scipy.stats.pearsonr`.
+
+    Attributes
+    ----------
+    statistic : float
+        Pearson product-moment correlation coefficient.
+    pvalue : float
+        The p-value associated with the chosen alternative.
+
+    Methods
+    -------
+    confidence_interval
+        Compute the confidence interval of the correlation coefficient
+        `statistic` for a given confidence level.
+
+    """
+    def __init__(self, statistic, pvalue, alternative, n, x, y, axis):
+        super().__init__(statistic, pvalue)
+        # `correlation` aliases `statistic`, as in other correlation functions
+        self.correlation = statistic
+
+        # state kept for `confidence_interval`
+        self._x = x
+        self._y = y
+        self._n = n
+        self._axis = axis
+        self._alternative = alternative
+
+    def confidence_interval(self, confidence_level=0.95, method=None):
+        """
+        Compute a confidence interval for the correlation coefficient.
+
+        If `method` is not provided, the confidence interval is computed using
+        the Fisher transformation F(r) = arctanh(r) [1]_.  When the sample
+        pairs are drawn from a bivariate normal distribution, F(r)
+        approximately follows a normal distribution with standard error
+        ``1/sqrt(n - 3)``, where ``n`` is the length of the original samples
+        along the calculation axis. When ``n <= 3``, this approximation does
+        not yield a finite, real standard error, so the confidence interval
+        is defined to be -1 to 1.
+
+        If `method` is an instance of `BootstrapMethod`, the confidence
+        interval is computed using `scipy.stats.bootstrap` with the provided
+        configuration options and other appropriate settings. In some cases,
+        confidence limits may be NaN due to a degenerate resample, and this is
+        typical for very small samples (~6 observations).
+
+        Parameters
+        ----------
+        confidence_level : float
+            Confidence level of the interval. Default is 0.95.
+        method : BootstrapMethod, optional
+            Defines the method used to compute the confidence interval. See
+            the description above for details.
+
+            .. versionadded:: 1.11.0
+
+        Returns
+        -------
+        ci : namedtuple
+            ``ConfidenceInterval`` with fields `low` and `high`.
+
+        Raises
+        ------
+        ValueError
+            If `method` is neither None nor a `BootstrapMethod`, or if it is
+            a `BootstrapMethod` but `pearsonr` was not given NumPy arrays.
+
+        References
+        ----------
+        .. [1] "Pearson correlation coefficient", Wikipedia,
+               https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
+        """
+        if method is None:
+            return _pearsonr_fisher_ci(self.statistic, self._n, confidence_level,
+                                       self._alternative)
+
+        if not isinstance(method, BootstrapMethod):
+            message = ('`method` must be an instance of `BootstrapMethod` '
+                       'or None.')
+            raise ValueError(message)
+
+        # Bootstrapping is only available when `pearsonr` received NumPy arrays.
+        if not is_numpy(array_namespace(self._x)):
+            message = ('`method` must be `None` if `pearsonr` '
+                       'arguments were not NumPy arrays.')
+            raise ValueError(message)
+
+        return _pearsonr_bootstrap_ci(confidence_level, method, self._x, self._y,
+                                      self._alternative, self._axis)
+
+
+# Missing special.betainc on torch
+@xp_capabilities(cpu_only=True, exceptions=['cupy', 'jax.numpy'])
+def pearsonr(x, y, *, alternative='two-sided', method=None, axis=0):
+    r"""
+    Pearson correlation coefficient and p-value for testing non-correlation.
+
+    The Pearson correlation coefficient [1]_ measures the linear relationship
+    between two datasets. Like other correlation
+    coefficients, this one varies between -1 and +1 with 0 implying no
+    correlation. Correlations of -1 or +1 imply an exact linear relationship.
+    Positive correlations imply that as x increases, so does y. Negative
+    correlations imply that as x increases, y decreases.
+
+    This function also performs a test of the null hypothesis that the
+    distributions underlying the samples are uncorrelated and normally
+    distributed. (See Kowalski [3]_
+    for a discussion of the effects of non-normality of the input on the
+    distribution of the correlation coefficient.)
+    The p-value roughly indicates the probability of an uncorrelated system
+    producing datasets that have a Pearson correlation at least as extreme
+    as the one computed from these datasets.
+
+    Parameters
+    ----------
+    x : array_like
+        Input array.
+    y : array_like
+        Input array.
+    axis : int or None, default: 0
+        Axis along which to perform the calculation. Default is 0.
+        If None, ravel both arrays before performing the calculation.
+
+        .. versionadded:: 1.14.0
+    alternative : {'two-sided', 'greater', 'less'}, optional
+        Defines the alternative hypothesis. Default is 'two-sided'.
+        The following options are available:
+
+        * 'two-sided': the correlation is nonzero
+        * 'less': the correlation is negative (less than zero)
+        * 'greater':  the correlation is positive (greater than zero)
+
+        .. versionadded:: 1.9.0
+    method : ResamplingMethod, optional
+        Defines the method used to compute the p-value. If `method` is an
+        instance of `PermutationMethod`/`MonteCarloMethod`, the p-value is
+        computed using
+        `scipy.stats.permutation_test`/`scipy.stats.monte_carlo_test` with the
+        provided configuration options and other appropriate settings.
+        Otherwise, the p-value is computed as documented in the notes.
+
+        .. versionadded:: 1.11.0
+
+    Returns
+    -------
+    result : `~scipy.stats._result_classes.PearsonRResult`
+        An object with the following attributes:
+
+        statistic : float
+            Pearson product-moment correlation coefficient.
+        pvalue : float
+            The p-value associated with the chosen alternative.
+
+        The object has the following method:
+
+        confidence_interval(confidence_level, method)
+            This computes the confidence interval of the correlation
+            coefficient `statistic` for the given confidence level.
+            The confidence interval is returned in a ``namedtuple`` with
+            fields `low` and `high`. If `method` is not provided, the
+            confidence interval is computed using the Fisher transformation
+            [1]_. If `method` is an instance of `BootstrapMethod`, the
+            confidence interval is computed using `scipy.stats.bootstrap` with
+            the provided configuration options and other appropriate settings.
+            In some cases, confidence limits may be NaN due to a degenerate
+            resample, and this is typical for very small samples (~6
+            observations).
+
+    Raises
+    ------
+    ValueError
+        If `x` and `y` do not have length at least 2.
+
+    Warns
+    -----
+    `~scipy.stats.ConstantInputWarning`
+        Raised if an input is a constant array.  The correlation coefficient
+        is not defined in this case, so ``np.nan`` is returned.
+
+    `~scipy.stats.NearConstantInputWarning`
+        Raised if an input is "nearly" constant.  The array ``x`` is considered
+        nearly constant if ``norm(x - mean(x)) < 1e-13 * abs(mean(x))``.
+        Numerical errors in the calculation ``x - mean(x)`` in this case might
+        result in an inaccurate calculation of r.
+
+    See Also
+    --------
+    spearmanr : Spearman rank-order correlation coefficient.
+    kendalltau : Kendall's tau, a correlation measure for ordinal data.
+    :ref:`hypothesis_pearsonr` : Extended example
+
+    Notes
+    -----
+    The correlation coefficient is calculated as follows:
+
+    .. math::
+
+        r = \frac{\sum (x - m_x) (y - m_y)}
+                 {\sqrt{\sum (x - m_x)^2 \sum (y - m_y)^2}}
+
+    where :math:`m_x` is the mean of the vector x and :math:`m_y` is
+    the mean of the vector y.
+
+    Under the assumption that x and y are drawn from
+    independent normal distributions (so the population correlation coefficient
+    is 0), the probability density function of the sample correlation
+    coefficient r is ([1]_, [2]_):
+
+    .. math::
+        f(r) = \frac{{(1-r^2)}^{n/2-2}}{\mathrm{B}(\frac{1}{2},\frac{n}{2}-1)}
+
+    where n is the number of samples, and B is the beta function.  This
+    is sometimes referred to as the exact distribution of r.  This is
+    the distribution that is used in `pearsonr` to compute the p-value when
+    the `method` parameter is left at its default value (None).
+    The distribution is a beta distribution on the interval [-1, 1],
+    with equal shape parameters a = b = n/2 - 1.  In terms of SciPy's
+    implementation of the beta distribution, the distribution of r is::
+
+        dist = scipy.stats.beta(n/2 - 1, n/2 - 1, loc=-1, scale=2)
+
+    The default p-value returned by `pearsonr` is a two-sided p-value. For a
+    given sample with correlation coefficient r, the p-value is
+    the probability that abs(r') of a random sample x' and y' drawn from
+    the population with zero correlation would be greater than or equal
+    to abs(r). In terms of the object ``dist`` shown above, the p-value
+    for a given r and length n can be computed as::
+
+        p = 2*dist.cdf(-abs(r))
+
+    When n is 2, the above continuous distribution is not well-defined.
+    One can interpret the limit of the beta distribution as the shape
+    parameters a and b approach a = b = 0 as a discrete distribution with
+    equal probability masses at r = 1 and r = -1.  More directly, one
+    can observe that, given the data x = [x1, x2] and y = [y1, y2], and
+    assuming x1 != x2 and y1 != y2, the only possible values for r are 1
+    and -1.  Because abs(r') for any sample x' and y' with length 2 will
+    be 1, the two-sided p-value for a sample of length 2 is always 1.
+
+    For backwards compatibility, the object that is returned also behaves
+    like a tuple of length two that holds the statistic and the p-value.
+
+    References
+    ----------
+    .. [1] "Pearson correlation coefficient", Wikipedia,
+           https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
+    .. [2] Student, "Probable error of a correlation coefficient",
+           Biometrika, Volume 6, Issue 2-3, 1 September 1908, pp. 302-310.
+    .. [3] C. J. Kowalski, "On the Effects of Non-Normality on the Distribution
+           of the Sample Product-Moment Correlation Coefficient"
+           Journal of the Royal Statistical Society. Series C (Applied
+           Statistics), Vol. 21, No. 1 (1972), pp. 1-12.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> x, y = [1, 2, 3, 4, 5, 6, 7], [10, 9, 2.5, 6, 4, 3, 2]
+    >>> res = stats.pearsonr(x, y)
+    >>> res
+    PearsonRResult(statistic=-0.828503883588428, pvalue=0.021280260007523286)
+
+    To perform an exact permutation version of the test:
+
+    >>> rng = np.random.default_rng(7796654889291491997)
+    >>> method = stats.PermutationMethod(n_resamples=np.inf, random_state=rng)
+    >>> stats.pearsonr(x, y, method=method)
+    PearsonRResult(statistic=-0.828503883588428, pvalue=0.028174603174603175)
+
+    To perform the test under the null hypothesis that the data were drawn from
+    *uniform* distributions:
+
+    >>> method = stats.MonteCarloMethod(rvs=(rng.uniform, rng.uniform))
+    >>> stats.pearsonr(x, y, method=method)
+    PearsonRResult(statistic=-0.828503883588428, pvalue=0.0188)
+
+    To produce an asymptotic 90% confidence interval:
+
+    >>> res.confidence_interval(confidence_level=0.9)
+    ConfidenceInterval(low=-0.9644331982722841, high=-0.3460237473272273)
+
+    And for a bootstrap confidence interval:
+
+    >>> method = stats.BootstrapMethod(method='BCa', rng=rng)
+    >>> res.confidence_interval(confidence_level=0.9, method=method)
+    ConfidenceInterval(low=-0.9983163756488651, high=-0.22771001702132443)  # may vary
+
+    If N-dimensional arrays are provided, multiple tests are performed in a
+    single call according to the same conventions as most `scipy.stats` functions:
+
+    >>> rng = np.random.default_rng(2348246935601934321)
+    >>> x = rng.standard_normal((8, 15))
+    >>> y = rng.standard_normal((8, 15))
+    >>> stats.pearsonr(x, y, axis=0).statistic.shape  # between corresponding columns
+    (15,)
+    >>> stats.pearsonr(x, y, axis=1).statistic.shape  # between corresponding rows
+    (8,)
+
+    To perform all pairwise comparisons between slices of the arrays,
+    use standard NumPy broadcasting techniques. For instance, to compute the
+    correlation between all pairs of rows:
+
+    >>> stats.pearsonr(x[:, np.newaxis, :], y, axis=-1).statistic.shape
+    (8, 8)
+
+    There is a linear dependence between x and y if y = a + b*x + e, where
+    a,b are constants and e is a random error term, assumed to be independent
+    of x. For simplicity, assume that x is standard normal, a=0, b=1 and let
+    e follow a normal distribution with mean zero and standard deviation s>0.
+
+    >>> rng = np.random.default_rng()
+    >>> s = 0.5
+    >>> x = stats.norm.rvs(size=500, random_state=rng)
+    >>> e = stats.norm.rvs(scale=s, size=500, random_state=rng)
+    >>> y = x + e
+    >>> stats.pearsonr(x, y).statistic
+    0.9001942438244763
+
+    This should be close to the exact value given by
+
+    >>> 1/np.sqrt(1 + s**2)
+    0.8944271909999159
+
+    For s=0.5, we observe a high level of correlation. In general, a large
+    variance of the noise reduces the correlation, while the correlation
+    approaches one as the variance of the error goes to zero.
+
+    It is important to keep in mind that no correlation does not imply
+    independence unless (x, y) is jointly normal. Correlation can even be zero
+    when there is a very simple dependence structure: if X follows a
+    standard normal distribution, let y = abs(x). Note that the correlation
+    between x and y is zero. Indeed, since the expectation of x is zero,
+    cov(x, y) = E[x*y]. By definition, this equals E[x*abs(x)] which is zero
+    by symmetry. The following lines of code illustrate this observation:
+
+    >>> y = np.abs(x)
+    >>> stats.pearsonr(x, y)
+    PearsonRResult(statistic=-0.05444919272687482, pvalue=0.22422294836207743)
+
+    A non-zero correlation coefficient can be misleading. For example, if X has
+    a standard normal distribution, define y = x if x < 0 and y = 0 otherwise.
+    A simple calculation shows that corr(x, y) = sqrt(2/Pi) = 0.797...,
+    implying a high level of correlation:
+
+    >>> y = np.where(x < 0, x, 0)
+    >>> stats.pearsonr(x, y)
+    PearsonRResult(statistic=0.861985781588, pvalue=4.813432002751103e-149)
+
+    This is unintuitive since there is no dependence of x and y if x is larger
+    than zero which happens in about half of the cases if we sample x and y.
+
+    For a more detailed example, see :ref:`hypothesis_pearsonr`.
+
+    """
+    xp = array_namespace(x, y)
+    x, y = xp_promote(x, y, force_floating=True, xp=xp)
+    dtype = x.dtype
+
+    if not is_numpy(xp) and method is not None:
+        method = 'invalid'
+
+    if axis is None:
+        x = xp.reshape(x, (-1,))
+        y = xp.reshape(y, (-1,))
+        axis = -1
+
+    axis_int = int(axis)
+    if axis_int != axis:
+        raise ValueError('`axis` must be an integer.')
+    axis = axis_int
+
+    try:
+        np.broadcast_shapes(x.shape, y.shape)
+        # For consistency with other `stats` functions, we need to
+        # match the dimensionalities before looking at `axis`.
+        # (Note: this is not the NEP 5 / gufunc order of operations;
+        #  see TestPearsonr::test_different_dimensionality for more information.)
+        ndim = max(x.ndim, y.ndim)
+        x = xp.reshape(x, (1,) * (ndim - x.ndim) + x.shape)
+        y = xp.reshape(y, (1,) * (ndim - y.ndim) + y.shape)
+
+    except (ValueError, RuntimeError) as e:
+        message = '`x` and `y` must be broadcastable.'
+        raise ValueError(message) from e
+
+    if x.shape[axis] != y.shape[axis]:
+        raise ValueError('`x` and `y` must have the same length along `axis`.')
+
+    if x.shape[axis] < 2:
+        raise ValueError('`x` and `y` must have length at least 2.')
+
+    x, y = _share_masks(x, y, xp=xp)
+    n = xp.asarray(_length_nonmasked(x, axis=axis), dtype=x.dtype)
+
+    x = xp.moveaxis(x, axis, -1)
+    y = xp.moveaxis(y, axis, -1)
+    axis = -1
+
+    if xp.isdtype(dtype, "complex floating"):
+        raise ValueError('This function does not support complex data')
+
+    x = xp.astype(x, dtype, copy=False)
+    y = xp.astype(y, dtype, copy=False)
+    threshold = xp.finfo(dtype).eps ** 0.75
+
+    # If an input is constant, the correlation coefficient is not defined.
+    const_x = xp.all(x == x[..., 0:1], axis=-1)
+    const_y = xp.all(y == y[..., 0:1], axis=-1)
+    const_xy = const_x | const_y
+
+    any_const_xy = xp.any(const_xy)
+    lazy = is_lazy_array(const_xy)
+    if not lazy and any_const_xy:
+        msg = ("An input array is constant; the correlation coefficient "
+               "is not defined.")
+        warnings.warn(stats.ConstantInputWarning(msg), stacklevel=2)
+    if lazy or any_const_xy:
+        x = xp.where(const_x[..., xp.newaxis], xp.nan, x)
+        y = xp.where(const_y[..., xp.newaxis], xp.nan, y)
+
+    if isinstance(method, PermutationMethod):
+        def statistic(y, axis):
+            statistic, _ = pearsonr(x, y, axis=axis, alternative=alternative)
+            return statistic
+
+        res = permutation_test((y,), statistic, permutation_type='pairings',
+                               axis=axis, alternative=alternative, **method._asdict())
+
+        return PearsonRResult(statistic=res.statistic, pvalue=res.pvalue, n=n,
+                              alternative=alternative, x=x, y=y, axis=axis)
+    elif isinstance(method, MonteCarloMethod):
+        def statistic(x, y, axis):
+            statistic, _ = pearsonr(x, y, axis=axis, alternative=alternative)
+            return statistic
+
+        # `monte_carlo_test` accepts an `rvs` tuple of callables, not an `rng`
+        # If the user specified an `rng`, replace it with the appropriate callables
+        method = method._asdict()
+        if (rng := method.pop('rng', None)) is not None:  # goo-goo g'joob
+            rng = np.random.default_rng(rng)
+            method['rvs'] = rng.normal, rng.normal
+
+        res = monte_carlo_test((x, y,), statistic=statistic, axis=axis,
+                               alternative=alternative, **method)
+
+        return PearsonRResult(statistic=res.statistic, pvalue=res.pvalue, n=n,
+                              alternative=alternative, x=x, y=y, axis=axis)
+    elif method == 'invalid':
+        message = '`method` must be `None` if arguments are not NumPy arrays.'
+        raise ValueError(message)
+    elif method is not None:
+        message = ('`method` must be an instance of `PermutationMethod`, '
+                   '`MonteCarloMethod`, or None.')
+        raise ValueError(message)
+
+    xmean = xp.mean(x, axis=axis, keepdims=True)
+    ymean = xp.mean(y, axis=axis, keepdims=True)
+    xm = x - xmean
+    ym = y - ymean
+
+    # scipy.linalg.norm(xm) avoids premature overflow when xm is e.g.
+    # [-5e210, 5e210, 3e200, -3e200]
+    # but not when `axis` is provided, so scale manually. scipy.linalg.norm
+    # also raises an error with NaN input rather than returning NaN, so
+    # use np.linalg.norm.
+    xmax = xp.max(xp.abs(xm), axis=axis, keepdims=True)
+    ymax = xp.max(xp.abs(ym), axis=axis, keepdims=True)
+    with np.errstate(invalid='ignore', divide='ignore'):
+        normxm = xmax * xp_vector_norm(xm/xmax, axis=axis, keepdims=True)
+        normym = ymax * xp_vector_norm(ym/ymax, axis=axis, keepdims=True)
+
+    if not lazy:
+        nconst_x = xp.any(normxm < threshold*xp.abs(xmean), axis=axis)
+        nconst_y = xp.any(normym < threshold*xp.abs(ymean), axis=axis)
+        nconst_xy = nconst_x | nconst_y
+        if xp.any(nconst_xy & (~const_xy)):
+            # If all the values in x (likewise y) are very close to the mean,
+            # the loss of precision that occurs in the subtraction xm = x - xmean
+            # might result in large errors in r.
+            msg = ("An input array is nearly constant; the computed "
+                "correlation coefficient may be inaccurate.")
+            warnings.warn(stats.NearConstantInputWarning(msg), stacklevel=2)
+
+    with np.errstate(invalid='ignore', divide='ignore'):
+        r = xp.vecdot(xm / normxm, ym / normym, axis=axis)
+
+    # Presumably, if abs(r) > 1, then it is only some small artifact of
+    # floating point arithmetic.
+    r = xp.clip(r, -1., 1.)
+    r = xpx.at(r, const_xy).set(xp.nan)
+
+    # As explained in the docstring, the distribution of `r` under the null
+    # hypothesis is the beta distribution on (-1, 1) with a = b = n/2 - 1.
+    ab = xp.asarray(n/2 - 1, dtype=dtype, device=xp_device(x))
+    dist = _SimpleBeta(ab, ab, loc=-1, scale=2)
+    pvalue = _get_pvalue(r, dist, alternative, xp=xp)
+
+    mask = (n == 2)   #  return exactly 1.0 or -1.0 values for n == 2 case as promised
+    # data-apis/array-api-extra#196
+    mxp = array_namespace(r._meta) if is_dask(xp) else xp
+    def special_case(r):
+        return mxp.where(mxp.isnan(r), mxp.nan, mxp.ones_like(r))
+    r = xpx.apply_where(mask, r, mxp.round, fill_value=r)
+    pvalue = xpx.apply_where(mask, (r,), special_case, fill_value=pvalue)
+
+    r = r[()] if r.ndim == 0 else r
+    pvalue = pvalue[()] if pvalue.ndim == 0 else pvalue
+    return PearsonRResult(statistic=r, pvalue=pvalue, n=n,
+                          alternative=alternative, x=x, y=y, axis=axis)
+
+
+@xp_capabilities(np_only=True)
+def fisher_exact(table, alternative=None, *, method=None):
+    """Perform a Fisher exact test on a contingency table.
+
+    For a 2x2 table,
+    the null hypothesis is that the true odds ratio of the populations
+    underlying the observations is one, and the observations were sampled
+    from these populations under a condition: the marginals of the
+    resulting table must equal those of the observed table.
+    The statistic is the unconditional maximum likelihood estimate of the odds
+    ratio, and the p-value is the probability under the null hypothesis of
+    obtaining a table at least as extreme as the one that was actually
+    observed.
+
+    For other table sizes, or if `method` is provided, the null hypothesis
+    is that the rows and columns of the tables have fixed sums and are
+    independent; i.e., the table was sampled from a `scipy.stats.random_table`
+    distribution with the observed marginals. The statistic is the
+    probability mass of this distribution evaluated at `table`, and the
+    p-value is the proportion of the population of tables with statistic at
+    least as extreme (small) as that of `table`. There is only one alternative
+    hypothesis available: the rows and columns are not independent.
+
+    There are other possible choices of statistic and two-sided
+    p-value definition associated with Fisher's exact test; please see the
+    Notes for more information.
+
+    Parameters
+    ----------
+    table : array_like of ints
+        A contingency table.  Elements must be non-negative integers.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis for 2x2 tables; unused for other
+        table sizes.
+        The following options are available (default is 'two-sided'):
+
+        * 'two-sided': the odds ratio of the underlying population is not one
+        * 'less': the odds ratio of the underlying population is less than one
+        * 'greater': the odds ratio of the underlying population is greater
+          than one
+
+        See the Notes for more details.
+
+    method : ResamplingMethod, optional
+        Defines the method used to compute the p-value.
+        If `method` is an instance of `PermutationMethod`/`MonteCarloMethod`,
+        the p-value is computed using
+        `scipy.stats.permutation_test`/`scipy.stats.monte_carlo_test` with the
+        provided configuration options and other appropriate settings.
+        Note that if `method` is an instance of `MonteCarloMethod`, the ``rvs``
+        attribute must be left unspecified; Monte Carlo samples are always drawn
+        using the ``rvs`` method of `scipy.stats.random_table`.
+        Otherwise, the p-value is computed as documented in the notes.
+
+        .. versionadded:: 1.15.0
+
+    Returns
+    -------
+    res : SignificanceResult
+        An object containing attributes:
+
+        statistic : float
+            For a 2x2 table with default `method`, this is the odds ratio - the
+            prior odds ratio not a posterior estimate. In all other cases, this
+            is the probability mass of obtaining the observed table under the
+            null hypothesis of independence with marginals fixed.
+        pvalue : float
+            The probability under the null hypothesis of obtaining a
+            table at least as extreme as the one that was actually observed.
+
+    Raises
+    ------
+    ValueError
+        If `table` is not two-dimensional or has negative entries.
+
+    See Also
+    --------
+    chi2_contingency : Chi-square test of independence of variables in a
+        contingency table.  This can be used as an alternative to
+        `fisher_exact` when the numbers in the table are large.
+    contingency.odds_ratio : Compute the odds ratio (sample or conditional
+        MLE) for a 2x2 contingency table.
+    barnard_exact : Barnard's exact test, which is a more powerful alternative
+        than Fisher's exact test for 2x2 contingency tables.
+    boschloo_exact : Boschloo's exact test, which is a more powerful
+        alternative than Fisher's exact test for 2x2 contingency tables.
+    :ref:`hypothesis_fisher_exact` : Extended example
+
+    Notes
+    -----
+    *Null hypothesis and p-values*
+
+    The null hypothesis is that the true odds ratio of the populations
+    underlying the observations is one, and the observations were sampled at
+    random from these populations under a condition: the marginals of the
+    resulting table must equal those of the observed table. Equivalently,
+    the null hypothesis is that the input table is from the hypergeometric
+    distribution with parameters (as used in `hypergeom`)
+    ``M = a + b + c + d``, ``n = a + b`` and ``N = a + c``, where the
+    input table is ``[[a, b], [c, d]]``.  This distribution has support
+    ``max(0, N + n - M) <= x <= min(N, n)``, or, in terms of the values
+    in the input table, ``max(0, a - d) <= x <= a + min(b, c)``.  ``x``
+    can be interpreted as the upper-left element of a 2x2 table, so the
+    tables in the distribution have form::
+
+        [  x           n - x     ]
+        [N - x    M - (n + N) + x]
+
+    For example, if::
+
+        table = [6  2]
+                [1  4]
+
+    then the support is ``2 <= x <= 7``, and the tables in the distribution
+    are::
+
+        [2 6]   [3 5]   [4 4]   [5 3]   [6 2]  [7 1]
+        [5 0]   [4 1]   [3 2]   [2 3]   [1 4]  [0 5]
+
+    The probability of each table is given by the hypergeometric distribution
+    ``hypergeom.pmf(x, M, n, N)``.  For this example, these are (rounded to
+    three significant digits)::
+
+        x       2      3      4      5       6        7
+        p  0.0163  0.163  0.408  0.326  0.0816  0.00466
+
+    These can be computed with::
+
+        >>> import numpy as np
+        >>> from scipy.stats import hypergeom
+        >>> table = np.array([[6, 2], [1, 4]])
+        >>> M = table.sum()
+        >>> n = table[0].sum()
+        >>> N = table[:, 0].sum()
+        >>> start, end = hypergeom.support(M, n, N)
+        >>> hypergeom.pmf(np.arange(start, end+1), M, n, N)
+        array([0.01631702, 0.16317016, 0.40792541, 0.32634033, 0.08158508,
+               0.004662  ])
+
+    The two-sided p-value is the probability that, under the null hypothesis,
+    a random table would have a probability equal to or less than the
+    probability of the input table.  For our example, the probability of
+    the input table (where ``x = 6``) is 0.0816.  The x values where the
+    probability does not exceed this are 2, 6 and 7, so the two-sided p-value
+    is ``0.0163 + 0.0816 + 0.00466 ~= 0.10256``::
+
+        >>> from scipy.stats import fisher_exact
+        >>> res = fisher_exact(table, alternative='two-sided')
+        >>> res.pvalue
+        0.10256410256410257
+
+    The one-sided p-value for ``alternative='greater'`` is the probability
+    that a random table has ``x >= a``, which in our example is ``x >= 6``,
+    or ``0.0816 + 0.00466 ~= 0.08626``::
+
+        >>> res = fisher_exact(table, alternative='greater')
+        >>> res.pvalue
+        0.08624708624708627
+
+    This is equivalent to computing the survival function of the
+    distribution at ``x = 5`` (one less than ``x`` from the input table,
+    because we want to include the probability of ``x = 6`` in the sum)::
+
+        >>> hypergeom.sf(5, M, n, N)
+        0.08624708624708627
+
+    For ``alternative='less'``, the one-sided p-value is the probability
+    that a random table has ``x <= a``, (i.e. ``x <= 6`` in our example),
+    or ``0.0163 + 0.163 + 0.408 + 0.326 + 0.0816 ~= 0.9949``::
+
+        >>> res = fisher_exact(table, alternative='less')
+        >>> res.pvalue
+        0.9953379953379957
+
+    This is equivalent to computing the cumulative distribution function
+    of the distribution at ``x = 6``::
+
+        >>> hypergeom.cdf(6, M, n, N)
+        0.9953379953379957
+
+    *Odds ratio*
+
+    The calculated odds ratio is different from the value computed by the
+    R function ``fisher.test``.  This implementation returns the "sample"
+    or "unconditional" maximum likelihood estimate, while ``fisher.test``
+    in R uses the conditional maximum likelihood estimate.  To compute the
+    conditional maximum likelihood estimate of the odds ratio, use
+    `scipy.stats.contingency.odds_ratio`.
+
+    References
+    ----------
+    .. [1] Fisher, Sir Ronald A, "The Design of Experiments:
+           Mathematics of a Lady Tasting Tea." ISBN 978-0-486-41151-4, 1935.
+    .. [2] "Fisher's exact test",
+           https://en.wikipedia.org/wiki/Fisher's_exact_test
+
+    Examples
+    --------
+
+    >>> from scipy.stats import fisher_exact
+    >>> res = fisher_exact([[8, 2], [1, 5]])
+    >>> res.statistic
+    20.0
+    >>> res.pvalue
+    0.034965034965034975
+
+    For tables with shape other than ``(2, 2)``, provide an instance of
+    `scipy.stats.MonteCarloMethod` or `scipy.stats.PermutationMethod` for the
+    `method` parameter:
+
+    >>> import numpy as np
+    >>> from scipy.stats import MonteCarloMethod
+    >>> rng = np.random.default_rng(4507195762371367)
+    >>> method = MonteCarloMethod(rng=rng)
+    >>> fisher_exact([[8, 2, 3], [1, 5, 4]], method=method)
+    SignificanceResult(statistic=np.float64(0.005782), pvalue=np.float64(0.0603))
+
+    For a more detailed example, see :ref:`hypothesis_fisher_exact`.
+    """
+    hypergeom = distributions.hypergeom
+    # int32 is not enough for the algorithm
+    c = np.asarray(table, dtype=np.int64)
+    if not c.ndim == 2:
+        raise ValueError("The input `table` must have two dimensions.")
+
+    if np.any(c < 0):
+        raise ValueError("All values in `table` must be nonnegative.")
+
+    if not c.shape == (2, 2) or method is not None:
+        return _fisher_exact_rxc(c, alternative, method)
+    alternative = 'two-sided' if alternative is None else alternative
+
+    if 0 in c.sum(axis=0) or 0 in c.sum(axis=1):
+        # If both values in a row or column are zero, the p-value is 1 and
+        # the odds ratio is NaN.
+        return SignificanceResult(np.nan, 1.0)
+
+    if c[1, 0] > 0 and c[0, 1] > 0:
+        oddsratio = c[0, 0] * c[1, 1] / (c[1, 0] * c[0, 1])
+    else:
+        oddsratio = np.inf
+
+    n1 = c[0, 0] + c[0, 1]
+    n2 = c[1, 0] + c[1, 1]
+    n = c[0, 0] + c[1, 0]
+
+    def pmf(x):
+        return hypergeom.pmf(x, n1 + n2, n1, n)
+
+    if alternative == 'less':
+        pvalue = hypergeom.cdf(c[0, 0], n1 + n2, n1, n)
+    elif alternative == 'greater':
+        # Same formula as the 'less' case, but with the second column.
+        pvalue = hypergeom.cdf(c[0, 1], n1 + n2, n1, c[0, 1] + c[1, 1])
+    elif alternative == 'two-sided':
+        mode = int((n + 1) * (n1 + 1) / (n1 + n2 + 2))
+        pexact = hypergeom.pmf(c[0, 0], n1 + n2, n1, n)
+        pmode = hypergeom.pmf(mode, n1 + n2, n1, n)
+
+        epsilon = 1e-14
+        gamma = 1 + epsilon
+
+        if np.abs(pexact - pmode) / np.maximum(pexact, pmode) <= epsilon:
+            return SignificanceResult(oddsratio, 1.)
+
+        elif c[0, 0] < mode:
+            plower = hypergeom.cdf(c[0, 0], n1 + n2, n1, n)
+            if hypergeom.pmf(n, n1 + n2, n1, n) > pexact * gamma:
+                return SignificanceResult(oddsratio, plower)
+
+            guess = _binary_search(lambda x: -pmf(x), -pexact * gamma, mode, n)
+            pvalue = plower + hypergeom.sf(guess, n1 + n2, n1, n)
+        else:
+            pupper = hypergeom.sf(c[0, 0] - 1, n1 + n2, n1, n)
+            if hypergeom.pmf(0, n1 + n2, n1, n) > pexact * gamma:
+                return SignificanceResult(oddsratio, pupper)
+
+            guess = _binary_search(pmf, pexact * gamma, 0, mode)
+            pvalue = pupper + hypergeom.cdf(guess, n1 + n2, n1, n)
+    else:
+        msg = "`alternative` should be one of {'two-sided', 'less', 'greater'}"
+        raise ValueError(msg)
+
+    pvalue = min(pvalue, 1.0)
+
+    return SignificanceResult(oddsratio, pvalue)
+
+
+def _fisher_exact_rxc(table, alternative, method):
+    # Fisher exact test via resampling: used for tables that are not 2x2 and
+    # for any table when the caller supplies `method`.
+    if alternative is not None:
+        raise ValueError('`alternative` must be the default (None) unless '
+                         '`table` has shape `(2, 2)` and `method is None`.')
+    if table.size == 0:
+        raise ValueError("`table` must have at least one row and one column.")
+
+    # With a single row or column, or with all-zero counts, only one table has
+    # the observed marginals, so the statistic and p-value are both 1.
+    single_line = table.shape[0] == 1 or table.shape[1] == 1
+    if single_line or not np.any(table):
+        return SignificanceResult(1.0, 1.0)
+
+    # No method given: fall back to a default-configured Monte Carlo test.
+    method = stats.MonteCarloMethod() if method is None else method
+    if isinstance(method, stats.PermutationMethod):
+        compute = _fisher_exact_permutation_method
+    elif isinstance(method, stats.MonteCarloMethod):
+        compute = _fisher_exact_monte_carlo_method
+    else:
+        raise ValueError(
+            f'`{method=}` not recognized; if provided, `method` must be an '
+            'instance of `PermutationMethod` or `MonteCarloMethod`.')
+    return SignificanceResult(np.clip((res := compute(table, method)).statistic,
+                                      None, 1.0), res.pvalue)
+
+
+def _fisher_exact_permutation_method(table, method):
+    # Permutation-test p-value: the statistic is the `random_table` pmf of the
+    # crosstab of the (re-paired) observations.
+    row_idx, col_idx = _untabulate(table)
+    null_dist = stats.random_table(np.sum(table, axis=1), np.sum(table, axis=0))
+
+    # `permutation_test` with `permutation_type='pairings'` permutes the order of
+    # the sample, pairing its observations with different entries of `col_idx`.
+    def statistic(resample):
+        # crosstab the resample against the fixed column labels
+        return null_dist.pmf(stats.contingency.crosstab(resample, col_idx)[1])
+
+    # tables with *smaller* probability mass are considered to be more extreme
+    options = method._asdict()
+    return stats.permutation_test((row_idx,), statistic, alternative='less',
+                                  permutation_type='pairings', **options)
+
+
+def _fisher_exact_monte_carlo_method(table, method):
+    # Monte Carlo p-value for `fisher_exact`: draw tables with the observed
+    # marginals from `random_table` and use their pmf as the test statistic.
+    options = method._asdict()
+    if options.pop('rvs', None) is not None:
+        message = ('If the `method` argument of `fisher_exact` is an '
+                   'instance of `MonteCarloMethod`, its `rvs` attribute '
+                   'must be unspecified. Use the `MonteCarloMethod` `rng` argument '
+                   'to control the random state.')
+        raise ValueError(message)
+    rng = np.random.default_rng(options.pop('rng', None))
+
+    # `random_table.rvs` produces random contingency tables with the given marginals
+    # under the null hypothesis of independence
+    shape = table.shape
+    colsums = np.sum(table, axis=0)
+    rowsums = np.sum(table, axis=1)
+    X = stats.random_table(rowsums, colsums, seed=rng)
+
+    def rvs(size):
+        return X.rvs(size=size[0]).reshape(size)
+
+    # axis signals to `monte_carlo_test` that statistic is vectorized. Tables
+    # arrive flattened along the last axis (one table for the observed data, a
+    # batch for resamples), so only that axis is reshaped. Do not infer batching
+    # from the element count: a small batch of a table with a large total count
+    # would be mistaken for a single table and fail to reshape.
+    def statistic(flat, axis):
+        return X.pmf(flat.reshape(flat.shape[:-1] + shape))
+
+    # tables with *smaller* probability mass are considered to be more extreme
+    return stats.monte_carlo_test(table.ravel(), rvs, statistic,
+                                  alternative='less', **options)
+
+
+def _untabulate(table):
+    # Expand a contingency table into paired samples: the count in cell (i, j)
+    # yields that many (i, j) pairs, with the row indices collected in the
+    # first output and the column indices in the second (row-major cell order).
+    n_rows, n_cols = table.shape
+    cells = [(i, j) for i in range(n_rows) for j in range(n_cols)]
+    # list * count repeats the index once per observation in the cell
+    row_idx = [[i] * table[i, j] for i, j in cells]
+    col_idx = [[j] * table[i, j] for i, j in cells]
+    return np.concatenate(row_idx), np.concatenate(col_idx)
+
+
+@xp_capabilities(np_only=True)
+def spearmanr(a, b=None, axis=0, nan_policy='propagate',
+              alternative='two-sided'):
+    r"""Calculate a Spearman correlation coefficient with associated p-value.
+
+    The Spearman rank-order correlation coefficient is a nonparametric measure
+    of the monotonicity of the relationship between two datasets.
+    Like other correlation coefficients,
+    this one varies between -1 and +1 with 0 implying no correlation.
+    Correlations of -1 or +1 imply an exact monotonic relationship. Positive
+    correlations imply that as x increases, so does y. Negative correlations
+    imply that as x increases, y decreases.
+
+    The p-value roughly indicates the probability of an uncorrelated system
+    producing datasets that have a Spearman correlation at least as extreme
+    as the one computed from these datasets. Although calculation of the
+    p-value does not make strong assumptions about the distributions underlying
+    the samples, it is only accurate for very large samples (>500
+    observations). For smaller sample sizes, consider a permutation test (see
+    Examples section below).
+
+    Parameters
+    ----------
+    a, b : 1D or 2D array_like, b is optional
+        One or two 1-D or 2-D arrays containing multiple variables and
+        observations. When these are 1-D, each represents a vector of
+        observations of a single variable. For the behavior in the 2-D case,
+        see under ``axis``, below.
+        Both arrays need to have the same length in the ``axis`` dimension.
+    axis : int or None, optional
+        If axis=0 (default), then each column represents a variable, with
+        observations in the rows. If axis=1, the relationship is transposed:
+        each row represents a variable, while the columns contain observations.
+        If axis=None, then both arrays will be raveled.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan.
+        The following options are available (default is 'propagate'):
+
+        * 'propagate': returns nan
+        * 'raise': throws an error
+        * 'omit': performs the calculations ignoring nan values
+
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis. Default is 'two-sided'.
+        The following options are available:
+
+        * 'two-sided': the correlation is nonzero
+        * 'less': the correlation is negative (less than zero)
+        * 'greater':  the correlation is positive (greater than zero)
+
+        .. versionadded:: 1.7.0
+
+    Returns
+    -------
+    res : SignificanceResult
+        An object containing attributes:
+
+        statistic : float or ndarray (2-D square)
+            Spearman correlation matrix or correlation coefficient (if only 2
+            variables are given as parameters). Correlation matrix is square
+            with length equal to total number of variables (columns or rows) in
+            ``a`` and ``b`` combined.
+        pvalue : float
+            The p-value for a hypothesis test whose null hypothesis
+            is that two samples have no ordinal correlation. See
+            `alternative` above for alternative hypotheses. `pvalue` has the
+            same shape as `statistic`.
+
+    Raises
+    ------
+    ValueError
+        If `axis` is not 0, 1 or None, or if the number of dimensions of `a`
+        is greater than 2, or if `b` is None and the number of dimensions of
+        `a` is less than 2.
+
+    Warns
+    -----
+    `~scipy.stats.ConstantInputWarning`
+        Raised if an input is a constant array.  The correlation coefficient
+        is not defined in this case, so ``np.nan`` is returned.
+
+    See Also
+    --------
+    :ref:`hypothesis_spearmanr` : Extended example
+
+    References
+    ----------
+    .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
+       Probability and Statistics Tables and Formulae. Chapman & Hall: New
+       York. 2000.
+       Section  14.7
+    .. [2] Kendall, M. G. and Stuart, A. (1973).
+       The Advanced Theory of Statistics, Volume 2: Inference and Relationship.
+       Griffin. 1973.
+       Section 31.18
+
+    Examples
+    --------
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> res = stats.spearmanr([1, 2, 3, 4, 5], [5, 6, 7, 8, 7])
+    >>> res.statistic
+    0.8207826816681233
+    >>> res.pvalue
+    0.08858700531354381
+
+    >>> rng = np.random.default_rng()
+    >>> x2n = rng.standard_normal((100, 2))
+    >>> y2n = rng.standard_normal((100, 2))
+    >>> res = stats.spearmanr(x2n)
+    >>> res.statistic, res.pvalue
+    (-0.07960396039603959, 0.4311168705769747)
+
+    >>> res = stats.spearmanr(x2n[:, 0], x2n[:, 1])
+    >>> res.statistic, res.pvalue
+    (-0.07960396039603959, 0.4311168705769747)
+
+    >>> res = stats.spearmanr(x2n, y2n)
+    >>> res.statistic
+    array([[ 1. , -0.07960396, -0.08314431, 0.09662166],
+           [-0.07960396, 1. , -0.14448245, 0.16738074],
+           [-0.08314431, -0.14448245, 1. , 0.03234323],
+           [ 0.09662166, 0.16738074, 0.03234323, 1. ]])
+    >>> res.pvalue
+    array([[0. , 0.43111687, 0.41084066, 0.33891628],
+           [0.43111687, 0. , 0.15151618, 0.09600687],
+           [0.41084066, 0.15151618, 0. , 0.74938561],
+           [0.33891628, 0.09600687, 0.74938561, 0. ]])
+
+    >>> res = stats.spearmanr(x2n.T, y2n.T, axis=1)
+    >>> res.statistic
+    array([[ 1. , -0.07960396, -0.08314431, 0.09662166],
+           [-0.07960396, 1. , -0.14448245, 0.16738074],
+           [-0.08314431, -0.14448245, 1. , 0.03234323],
+           [ 0.09662166, 0.16738074, 0.03234323, 1. ]])
+
+    >>> res = stats.spearmanr(x2n, y2n, axis=None)
+    >>> res.statistic, res.pvalue
+    (0.044981624540613524, 0.5270803651336189)
+
+    >>> res = stats.spearmanr(x2n.ravel(), y2n.ravel())
+    >>> res.statistic, res.pvalue
+    (0.044981624540613524, 0.5270803651336189)
+
+    >>> rng = np.random.default_rng()
+    >>> xint = rng.integers(10, size=(100, 2))
+    >>> res = stats.spearmanr(xint)
+    >>> res.statistic, res.pvalue
+    (0.09800224850707953, 0.3320271757932076)
+
+    For small samples, consider performing a permutation test instead of
+    relying on the asymptotic p-value. Note that to calculate the null
+    distribution of the statistic (for all possible pairings between
+    observations in sample ``x`` and ``y``), only one of the two inputs needs
+    to be permuted.
+
+    >>> x = [1.76405235, 0.40015721, 0.97873798,
+    ... 2.2408932, 1.86755799, -0.97727788]
+    >>> y = [2.71414076, 0.2488, 0.87551913,
+    ... 2.6514917, 2.01160156, 0.47699563]
+
+    >>> def statistic(x): # permute only `x`
+    ...     return stats.spearmanr(x, y).statistic
+    >>> res_exact = stats.permutation_test((x,), statistic,
+    ...     permutation_type='pairings')
+    >>> res_asymptotic = stats.spearmanr(x, y)
+    >>> res_exact.pvalue, res_asymptotic.pvalue # asymptotic pvalue is too low
+    (0.10277777777777777, 0.07239650145772594)
+
+    For a more detailed example, see :ref:`hypothesis_spearmanr`.
+    """
+    if axis is not None and axis not in (0, 1):
+        raise ValueError("spearmanr only handles 1-D or 2-D arrays, "
+                         f"supplied axis argument {axis}, please use only "
+                         "values 0, 1 or None for axis")
+
+    a, axisout = _chk_asarray(a, axis)
+    if a.ndim > 2:
+        raise ValueError("spearmanr only handles 1-D or 2-D arrays")
+
+    if b is None:
+        if a.ndim < 2:
+            raise ValueError("`spearmanr` needs at least 2 "
+                             "variables to compare")
+    else:
+        # Concatenate a and b, so that we now only have to handle the case
+        # of a 2-D `a`.
+        b, _ = _chk_asarray(b, axis)
+        if axisout == 0:
+            a = np.column_stack((a, b))
+        else:
+            a = np.vstack((a, b))
+
+    n_vars = a.shape[1 - axisout]
+    n_obs = a.shape[axisout]
+    if n_obs <= 1:
+        # Handle empty arrays or single observations.
+        res = SignificanceResult(np.nan, np.nan)
+        res.correlation = np.nan
+        return res
+
+    warn_msg = ("An input array is constant; the correlation coefficient "
+                "is not defined.")
+    if axisout == 0:
+        if (a[:, 0][0] == a[:, 0]).all() or (a[:, 1][0] == a[:, 1]).all():
+            # If an input is constant, the correlation coefficient
+            # is not defined.
+            warnings.warn(stats.ConstantInputWarning(warn_msg), stacklevel=2)
+            res = SignificanceResult(np.nan, np.nan)
+            res.correlation = np.nan
+            return res
+    else:  # case when axisout == 1 b/c a is 2 dim only
+        if (a[0, :][0] == a[0, :]).all() or (a[1, :][0] == a[1, :]).all():
+            # If an input is constant, the correlation coefficient
+            # is not defined.
+            warnings.warn(stats.ConstantInputWarning(warn_msg), stacklevel=2)
+            res = SignificanceResult(np.nan, np.nan)
+            res.correlation = np.nan
+            return res
+
+    a_contains_nan = _contains_nan(a, nan_policy)
+    variable_has_nan = np.zeros(n_vars, dtype=bool)
+    if a_contains_nan:
+        if nan_policy == 'omit':
+            return mstats_basic.spearmanr(a, axis=axis, nan_policy=nan_policy,
+                                          alternative=alternative)
+        elif nan_policy == 'propagate':
+            if a.ndim == 1 or n_vars <= 2:
+                res = SignificanceResult(np.nan, np.nan)
+                res.correlation = np.nan
+                return res
+            else:
+                # Keep track of variables with NaNs, set the outputs to NaN
+                # only for those variables
+                variable_has_nan = np.isnan(a).any(axis=axisout)
+
+    a_ranked = np.apply_along_axis(rankdata, axisout, a)
+    rs = np.corrcoef(a_ranked, rowvar=axisout)
+    dof = n_obs - 2  # degrees of freedom
+
+    # rs can have elements equal to 1, so avoid zero division warnings
+    with np.errstate(divide='ignore'):
+        # clip the small negative values possibly caused by rounding
+        # errors before taking the square root
+        t = rs * np.sqrt((dof/((rs+1.0)*(1.0-rs))).clip(0))
+
+    dist = _SimpleStudentT(dof)
+    prob = _get_pvalue(t, dist, alternative, xp=np)
+
+    # For backwards compatibility, return scalars when comparing 2 columns
+    if rs.shape == (2, 2):
+        res = SignificanceResult(rs[1, 0], prob[1, 0])
+        res.correlation = rs[1, 0]
+        return res
+    else:
+        rs[variable_has_nan, :] = np.nan
+        rs[:, variable_has_nan] = np.nan
+        res = SignificanceResult(rs[()], prob[()])
+        res.correlation = rs
+        return res
+
+
+@xp_capabilities(np_only=True)
+@_axis_nan_policy_factory(_pack_CorrelationResult, n_samples=2,
+                          result_to_tuple=_unpack_CorrelationResult, paired=True,
+                          too_small=1, n_outputs=3)
+def pointbiserialr(x, y):
+    r"""Calculate a point biserial correlation coefficient and its p-value.
+
+    The point biserial correlation is used to measure the relationship
+    between a binary variable, x, and a continuous variable, y. Like other
+    correlation coefficients, this one varies between -1 and +1 with 0
+    implying no correlation. Correlations of -1 or +1 imply a determinative
+    relationship.
+
+    This function may be computed using a shortcut formula but produces the
+    same result as `pearsonr`.
+
+    Parameters
+    ----------
+    x : array_like of bools
+        Input array.
+    y : array_like
+        Input array.
+
+    Returns
+    -------
+    res: SignificanceResult
+        An object containing attributes:
+
+        statistic : float
+            The R value.
+        pvalue : float
+            The two-sided p-value.
+
+    Notes
+    -----
+    `pointbiserialr` uses a t-test with ``n-2`` degrees of freedom.
+    It is equivalent to `pearsonr`.
+
+    The value of the point-biserial correlation can be calculated from:
+
+    .. math::
+
+        r_{pb} = \frac{\overline{Y_1} - \overline{Y_0}}
+                      {s_y}
+                 \sqrt{\frac{N_0 N_1}
+                            {N (N - 1)}}
+
+    Where :math:`\overline{Y_{0}}` and :math:`\overline{Y_{1}}` are means
+    of the metric observations coded 0 and 1 respectively; :math:`N_{0}` and
+    :math:`N_{1}` are number of observations coded 0 and 1 respectively;
+    :math:`N` is the total number of observations and :math:`s_{y}` is the
+    standard deviation of all the metric observations.
+
+    A value of :math:`r_{pb}` that is significantly different from zero is
+    completely equivalent to a significant difference in means between the two
+    groups. Thus, an independent groups t Test with :math:`N-2` degrees of
+    freedom may be used to test whether :math:`r_{pb}` is nonzero. The
+    relation between the t-statistic for comparing two independent groups and
+    :math:`r_{pb}` is given by:
+
+    .. math::
+
+        t = \sqrt{N - 2}\frac{r_{pb}}{\sqrt{1 - r^{2}_{pb}}}
+
+    References
+    ----------
+    .. [1] J. Lev, "The Point Biserial Coefficient of Correlation", Ann. Math.
+           Statist., Vol. 20, no.1, pp. 125-126, 1949.
+
+    .. [2] R.F. Tate, "Correlation Between a Discrete and a Continuous
+           Variable. Point-Biserial Correlation.", Ann. Math. Statist., Vol. 25,
+           no. 3, pp. 603-607, 1954.
+
+    .. [3] D. Kornbrot "Point Biserial Correlation", In Wiley StatsRef:
+           Statistics Reference Online (eds N. Balakrishnan, et al.), 2014.
+           :doi:`10.1002/9781118445112.stat06227`
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> a = np.array([0, 0, 0, 1, 1, 1, 1])
+    >>> b = np.arange(7)
+    >>> stats.pointbiserialr(a, b)
+    (0.8660254037844386, 0.011724811003954652)
+    >>> stats.pearsonr(a, b)
+    (0.86602540378443871, 0.011724811003954626)
+    >>> np.corrcoef(a, b)
+    array([[ 1.       ,  0.8660254],
+           [ 0.8660254,  1.       ]])
+
+    """
+    statistic, pvalue = pearsonr(x, y)
+    # `correlation` is kept as a backward-compatible alias of the statistic
+    result = SignificanceResult(statistic, pvalue)
+    result.correlation = statistic
+    return result
+
+
+@xp_capabilities(np_only=True)
+@_axis_nan_policy_factory(_pack_CorrelationResult, default_axis=None, n_samples=2,
+                          result_to_tuple=_unpack_CorrelationResult, paired=True,
+                          too_small=1, n_outputs=3)
+def kendalltau(x, y, *, nan_policy='propagate',
+               method='auto', variant='b', alternative='two-sided'):
+    r"""Calculate Kendall's tau, a correlation measure for ordinal data.
+
+    Kendall's tau is a measure of the correspondence between two rankings.
+    Values close to 1 indicate strong agreement, and values close to -1
+    indicate strong disagreement. This implements two variants of Kendall's
+    tau: tau-b (the default) and tau-c (also known as Stuart's tau-c). These
+    differ only in how they are normalized to lie within the range -1 to 1;
+    the hypothesis tests (their p-values) are identical. Kendall's original
+    tau-a is not implemented separately because both tau-b and tau-c reduce
+    to tau-a in the absence of ties.
+
+    Although a naive implementation has O(n^2) complexity, this implementation
+    uses a Fenwick tree to do the computation in O(n log(n)) complexity.
+
+    Parameters
+    ----------
+    x, y : array_like
+        Arrays of rankings, of the same shape. If arrays are not 1-D, they
+        will be flattened to 1-D.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan.
+        The following options are available (default is 'propagate'):
+
+        * 'propagate': returns nan
+        * 'raise': throws an error
+        * 'omit': performs the calculations ignoring nan values
+
+    method : {'auto', 'asymptotic', 'exact'}, optional
+        Defines which method is used to calculate the p-value [5]_.
+        The following options are available (default is 'auto'):
+
+        * 'auto': selects the appropriate method based on a trade-off
+          between speed and accuracy
+        * 'asymptotic': uses a normal approximation valid for large samples
+        * 'exact': computes the exact p-value, but can only be used if no ties
+          are present. As the sample size increases, the 'exact' computation
+          time may grow and the result may lose some precision.
+
+    variant : {'b', 'c'}, optional
+        Defines which variant of Kendall's tau is returned. Default is 'b'.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis. Default is 'two-sided'.
+        The following options are available:
+
+        * 'two-sided': the rank correlation is nonzero
+        * 'less': the rank correlation is negative (less than zero)
+        * 'greater': the rank correlation is positive (greater than zero)
+
+    Returns
+    -------
+    res : SignificanceResult
+        An object containing attributes:
+
+        statistic : float
+           The tau statistic.
+        pvalue : float
+           The p-value for a hypothesis test whose null hypothesis is
+           an absence of association, tau = 0.
+
+    Raises
+    ------
+    ValueError
+        If `nan_policy` is 'omit' and `variant` is not 'b' or
+        if `method` is 'exact' and there are ties between `x` and `y`.
+
+    See Also
+    --------
+    spearmanr : Calculates a Spearman rank-order correlation coefficient.
+    theilslopes : Computes the Theil-Sen estimator for a set of points (x, y).
+    weightedtau : Computes a weighted version of Kendall's tau.
+    :ref:`hypothesis_kendalltau` : Extended example
+
+    Notes
+    -----
+    The definition of Kendall's tau that is used is [2]_::
+
+      tau_b = (P - Q) / sqrt((P + Q + T) * (P + Q + U))
+
+      tau_c = 2 (P - Q) / (n**2 * (m - 1) / m)
+
+    where P is the number of concordant pairs, Q the number of discordant
+    pairs, T the number of tied pairs only in `x`, and U the number of tied pairs only
+    in `y`.  If a tie occurs for the same pair in both `x` and `y`, it is not
+    added to either T or U. n is the total number of samples, and m is the
+    number of unique values in either `x` or `y`, whichever is smaller.
+
+    References
+    ----------
+    .. [1] Maurice G. Kendall, "A New Measure of Rank Correlation", Biometrika
+           Vol. 30, No. 1/2, pp. 81-93, 1938.
+    .. [2] Maurice G. Kendall, "The treatment of ties in ranking problems",
+           Biometrika Vol. 33, No. 3, pp. 239-251. 1945.
+    .. [3] Gottfried E. Noether, "Elements of Nonparametric Statistics", John
+           Wiley & Sons, 1967.
+    .. [4] Peter M. Fenwick, "A new data structure for cumulative frequency
+           tables", Software: Practice and Experience, Vol. 24, No. 3,
+           pp. 327-336, 1994.
+    .. [5] Maurice G. Kendall, "Rank Correlation Methods" (4th Edition),
+           Charles Griffin & Co., 1970.
+
+    Examples
+    --------
+
+    >>> from scipy import stats
+    >>> x1 = [12, 2, 1, 12, 2]
+    >>> x2 = [1, 4, 7, 1, 0]
+    >>> res = stats.kendalltau(x1, x2)
+    >>> res.statistic
+    -0.47140452079103173
+    >>> res.pvalue
+    0.2827454599327748
+
+    For a more detailed example, see :ref:`hypothesis_kendalltau`.
+    """
+    x = np.asarray(x).ravel()
+    y = np.asarray(y).ravel()
+
+    if x.size != y.size:
+        raise ValueError("Array shapes are incompatible for broadcasting.")
+    elif not x.size or not y.size:
+        # Return NaN if arrays are empty
+        NaN = _get_nan(x, y)
+        res = SignificanceResult(NaN, NaN)
+        res.correlation = NaN
+        return res
+
+    def count_rank_tie(ranks):
+        # For each group of tied ranks of size c > 1, accumulate the three
+        # sums needed below: c(c-1)/2, c(c-1)(c-2) and c(c-1)(2c+5).
+        cnt = np.bincount(ranks).astype('int64', copy=False)
+        cnt = cnt[cnt > 1]
+        # Python ints to avoid overflow down the line
+        return (int((cnt * (cnt - 1) // 2).sum()),
+                int((cnt * (cnt - 1.) * (cnt - 2)).sum()),
+                int((cnt * (cnt - 1.) * (2*cnt + 5)).sum()))
+
+    size = x.size
+    perm = np.argsort(y)  # sort on y and convert y to dense ranks
+    x, y = x[perm], y[perm]
+    y = np.r_[True, y[1:] != y[:-1]].cumsum(dtype=np.intp)
+
+    # stable sort on x and convert x to dense ranks
+    perm = np.argsort(x, kind='mergesort')
+    x, y = x[perm], y[perm]
+    x = np.r_[True, x[1:] != x[:-1]].cumsum(dtype=np.intp)
+
+    dis = _kendall_dis(x, y)  # discordant pairs
+
+    # Run lengths of identical (x, y) pairs in the sorted order
+    obs = np.r_[True, (x[1:] != x[:-1]) | (y[1:] != y[:-1]), True]
+    cnt = np.diff(np.nonzero(obs)[0]).astype('int64', copy=False)
+
+    ntie = int((cnt * (cnt - 1) // 2).sum())  # joint ties
+    xtie, x0, x1 = count_rank_tie(x)     # ties in x, stats
+    ytie, y0, y1 = count_rank_tie(y)     # ties in y, stats
+
+    tot = (size * (size - 1)) // 2
+
+    # Every pair is tied in x (or in y): tau is undefined
+    if xtie == tot or ytie == tot:
+        NaN = _get_nan(x, y)
+        res = SignificanceResult(NaN, NaN)
+        res.correlation = NaN
+        return res
+
+    # Note that tot = con + dis + (xtie - ntie) + (ytie - ntie) + ntie
+    #               = con + dis + xtie + ytie - ntie
+    con_minus_dis = tot - xtie - ytie + ntie - 2 * dis
+    if variant == 'b':
+        tau = con_minus_dis / np.sqrt(tot - xtie) / np.sqrt(tot - ytie)
+    elif variant == 'c':
+        minclasses = min(len(set(x)), len(set(y)))
+        tau = 2*con_minus_dis / (size**2 * (minclasses-1)/minclasses)
+    else:
+        raise ValueError(f"Unknown variant of the method chosen: {variant}. "
+                         "variant must be 'b' or 'c'.")
+
+    # Limit range to fix computational errors
+    tau = np.minimum(1., max(-1., tau))
+
+    # The p-value calculation is the same for all variants since the p-value
+    # depends only on con_minus_dis.
+    if method == 'exact' and (xtie != 0 or ytie != 0):
+        raise ValueError("Ties found, exact method cannot be used.")
+
+    if method == 'auto':
+        if (xtie == 0 and ytie == 0) and (size <= 33 or
+                                          min(dis, tot-dis) <= 1):
+            method = 'exact'
+        else:
+            method = 'asymptotic'
+
+    if xtie == 0 and ytie == 0 and method == 'exact':
+        pvalue = mstats_basic._kendall_p_exact(size, tot-dis, alternative)
+    elif method == 'asymptotic':
+        # con_minus_dis is approx normally distributed with this variance [3]_
+        m = size * (size - 1.)
+        var = ((m * (2*size + 5) - x1 - y1) / 18 +
+               (2 * xtie * ytie) / m)
+        # The remaining term only involves groups of three or more tied
+        # values (x0, y0), which cannot occur when size == 2.  Skipping it
+        # in that case avoids evaluating 0 / 0.0, which raises
+        # ZeroDivisionError because x0, y0 and size are Python numbers.
+        if size > 2:
+            var += x0 * y0 / (9 * m * (size - 2))
+        z = con_minus_dis / np.sqrt(var)
+        pvalue = _get_pvalue(z, _SimpleNormal(), alternative, xp=np)
+    else:
+        raise ValueError(f"Unknown method {method} specified.  Use 'auto', "
+                         "'exact' or 'asymptotic'.")
+
+    # create result object with alias for backward compatibility
+    res = SignificanceResult(tau[()], pvalue[()])
+    res.correlation = tau[()]
+    return res
+
+
+def _weightedtau_n_samples(kwargs):
+    """Return the ``n_samples`` value for a `weightedtau` call.
+
+    The result is 2 when the ``rank`` entry of `kwargs` is a bool, is None,
+    or is missing; otherwise it is 3.
+    """
+    rank = kwargs.get('rank', False)
+    if rank is None or isinstance(rank, bool):
+        return 2
+    return 3
+
+
+@xp_capabilities(np_only=True)
+@_axis_nan_policy_factory(_pack_CorrelationResult, default_axis=None,
+                          n_samples=_weightedtau_n_samples,
+                          result_to_tuple=_unpack_CorrelationResult, paired=True,
+                          too_small=1, n_outputs=3, override={'nan_propagation': False})
+def weightedtau(x, y, rank=True, weigher=None, additive=True):
+    r"""Compute a weighted version of Kendall's :math:`\tau`.
+
+    The weighted :math:`\tau` is a weighted version of Kendall's
+    :math:`\tau` in which exchanges of high weight are more influential than
+    exchanges of low weight. The default parameters compute the additive
+    hyperbolic version of the index, :math:`\tau_\mathrm h`, which has
+    been shown to provide the best balance between important and
+    unimportant elements [1]_.
+
+    The weighting is defined by means of a rank array, which assigns a
+    nonnegative rank to each element (higher importance ranks being
+    associated with smaller values, e.g., 0 is the highest possible rank),
+    and a weigher function, which assigns a weight based on the rank to
+    each element. The weight of an exchange is then the sum or the product
+    of the weights of the ranks of the exchanged elements. The default
+    parameters compute :math:`\tau_\mathrm h`: an exchange between
+    elements with rank :math:`r` and :math:`s` (starting from zero) has
+    weight :math:`1/(r+1) + 1/(s+1)`.
+
+    Specifying a rank array is meaningful only if you have in mind an
+    external criterion of importance. If, as it usually happens, you do
+    not have in mind a specific rank, the weighted :math:`\tau` is
+    defined by averaging the values obtained using the decreasing
+    lexicographical rank by (`x`, `y`) and by (`y`, `x`). This is the
+    behavior with default parameters. Note that the convention used
+    here for ranking (lower values imply higher importance) is opposite
+    to that used by other SciPy statistical functions.
+
+    Parameters
+    ----------
+    x, y : array_like
+        Arrays of scores, of the same shape. If arrays are not 1-D, they will
+        be flattened to 1-D.
+    rank : array_like of ints or bool, optional
+        A nonnegative rank assigned to each element. If it is None, the
+        decreasing lexicographical rank by (`x`, `y`) will be used: elements of
+        higher rank will be those with larger `x`-values, using `y`-values to
+        break ties (in particular, swapping `x` and `y` will give a different
+        result). If it is False, the element indices will be used
+        directly as ranks. The default is True, in which case this
+        function returns the average of the values obtained using the
+        decreasing lexicographical rank by (`x`, `y`) and by (`y`, `x`).
+    weigher : callable, optional
+        The weigher function. Must map nonnegative integers (zero
+        representing the most important element) to a nonnegative weight.
+        The default, None, provides hyperbolic weighing, that is,
+        rank :math:`r` is mapped to weight :math:`1/(r+1)`.
+    additive : bool, optional
+        If True, the weight of an exchange is computed by adding the
+        weights of the ranks of the exchanged elements; otherwise, the weights
+        are multiplied. The default is True.
+
+    Returns
+    -------
+    res : SignificanceResult
+        An object containing attributes:
+
+        statistic : float
+           The weighted :math:`\tau` correlation index.
+        pvalue : float
+           Presently ``np.nan``, as the null distribution of the statistic is
+           unknown (even in the additive hyperbolic case).
+
+    See Also
+    --------
+    kendalltau : Calculates Kendall's tau.
+    spearmanr : Calculates a Spearman rank-order correlation coefficient.
+    theilslopes : Computes the Theil-Sen estimator for a set of points (x, y).
+
+    Notes
+    -----
+    This function uses an :math:`O(n \log n)`, mergesort-based algorithm
+    [1]_ that is a weighted extension of Knight's algorithm for Kendall's
+    :math:`\tau` [2]_. It can compute Shieh's weighted :math:`\tau` [3]_
+    between rankings without ties (i.e., permutations) by setting
+    `additive` and `rank` to False, as the definition given in [1]_ is a
+    generalization of Shieh's.
+
+    NaNs are considered the smallest possible score.
+
+    .. versionadded:: 0.19.0
+
+    References
+    ----------
+    .. [1] Sebastiano Vigna, "A weighted correlation index for rankings with
+           ties", Proceedings of the 24th international conference on World
+           Wide Web, pp. 1166-1176, ACM, 2015.
+    .. [2] W.R. Knight, "A Computer Method for Calculating Kendall's Tau with
+           Ungrouped Data", Journal of the American Statistical Association,
+           Vol. 61, No. 314, Part 1, pp. 436-439, 1966.
+    .. [3] Grace S. Shieh. "A weighted Kendall's tau statistic", Statistics &
+           Probability Letters, Vol. 39, No. 1, pp. 17-24, 1998.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> x = [12, 2, 1, 12, 2]
+    >>> y = [1, 4, 7, 1, 0]
+    >>> res = stats.weightedtau(x, y)
+    >>> res.statistic
+    -0.56694968153682723
+    >>> res.pvalue
+    nan
+    >>> res = stats.weightedtau(x, y, additive=False)
+    >>> res.statistic
+    -0.62205716951801038
+
+    NaNs are considered the smallest possible score:
+
+    >>> x = [12, 2, 1, 12, 2]
+    >>> y = [1, 4, 7, 1, np.nan]
+    >>> res = stats.weightedtau(x, y)
+    >>> res.statistic
+    -0.56694968153682723
+
+    This is exactly Kendall's tau:
+
+    >>> x = [12, 2, 1, 12, 2]
+    >>> y = [1, 4, 7, 1, 0]
+    >>> res = stats.weightedtau(x, y, weigher=lambda x: 1)
+    >>> res.statistic
+    -0.47140452079103173
+
+    >>> x = [12, 2, 1, 12, 2]
+    >>> y = [1, 4, 7, 1, 0]
+    >>> stats.weightedtau(x, y, rank=None)
+    SignificanceResult(statistic=-0.4157652301037516, pvalue=nan)
+    >>> stats.weightedtau(y, x, rank=None)
+    SignificanceResult(statistic=-0.7181341329699028, pvalue=nan)
+
+    """
+    # Flatten both inputs to 1-D
+    x = np.asarray(x).ravel()
+    y = np.asarray(y).ravel()
+    # NaN value reused for the p-value of every result and for the statistic
+    # of the empty-input result (presumably typed to match the inputs --
+    # see `_get_nan`)
+    NaN = _get_nan(x, y)
+
+    if x.size != y.size:
+        raise ValueError("Array shapes are incompatible for broadcasting.")
+    if not x.size:
+        # Return NaN if arrays are empty
+        res = SignificanceResult(NaN, NaN)
+        res.correlation = NaN
+        return res
+
+    # If there are NaNs we apply _toint64()
+    # (the sum is NaN exactly when at least one element is NaN)
+    if np.isnan(np.sum(x)):
+        x = _toint64(x)
+    if np.isnan(np.sum(y)):
+        y = _toint64(y)
+
+    # Reduce to ranks unsupported types
+    if x.dtype != y.dtype:
+        # Mixed dtypes: convert whichever side is not already int64
+        if x.dtype != np.int64:
+            x = _toint64(x)
+        if y.dtype != np.int64:
+            y = _toint64(y)
+    else:
+        # Same dtype on both sides: convert both unless it is one of the
+        # four dtypes listed here
+        if x.dtype not in (np.int32, np.int64, np.float32, np.float64):
+            x = _toint64(x)
+            y = _toint64(y)
+
+    if rank is True:
+        # Default: average the statistic computed with (x, y) and with (y, x),
+        # passing None as the rank in both calls
+        tau = np.asarray(
+            _weightedrankedtau(x, y, None, weigher, additive) +
+            _weightedrankedtau(y, x, None, weigher, additive)
+        )[()] / 2
+        res = SignificanceResult(tau, NaN)
+        res.correlation = tau
+        return res
+
+    if rank is False:
+        # Use the element indices directly as ranks
+        rank = np.arange(x.size, dtype=np.intp)
+    elif rank is not None:
+        # User-supplied rank array: flatten, convert with _toint64, cast to
+        # intp, and require the same number of elements as x
+        rank = np.asarray(rank).ravel()
+        rank = _toint64(rank).astype(np.intp)
+        if rank.size != x.size:
+            raise ValueError(
+                "All inputs to `weightedtau` must be of the same size, "
+                f"found x-size {x.size} and rank-size {rank.size}"
+            )
+
+    # rank is None here only if the caller passed rank=None explicitly
+    tau = np.asarray(_weightedrankedtau(x, y, rank, weigher, additive))[()]
+    res = SignificanceResult(tau, NaN)
+    res.correlation = tau
+    return res
+
+
+#####################################
+#       INFERENTIAL STATISTICS      #
+#####################################
+
+# Base type for `TtestResult`, built by `_make_tuple_bunch` from the name lists
+# ['statistic', 'pvalue'] and ['df'] (presumably the tuple fields and the
+# extra attributes, respectively -- confirm against `_make_tuple_bunch`).
+TtestResultBase = _make_tuple_bunch('TtestResultBase',
+                                    ['statistic', 'pvalue'], ['df'])
+
+
+class TtestResult(TtestResultBase):
+    """
+    Outcome of a t-test.
+
+    The documentation of the t-test function that produced the object
+    explains how the statistic is defined and what the confidence interval
+    means.
+
+    Attributes
+    ----------
+    statistic : float or array
+        The t-statistic of the sample.
+    pvalue : float or array
+        The p-value associated with the given alternative.
+    df : float or array
+        The number of degrees of freedom used in calculation of the
+        t-statistic; this is one less than the size of the sample
+        (``a.shape[axis]-1`` if there are no masked elements or omitted NaNs).
+
+    Methods
+    -------
+    confidence_interval
+        Computes a confidence interval around the population statistic
+        for the given confidence level.
+        The confidence interval is returned in a ``namedtuple`` with
+        fields `low` and `high`.
+
+    """
+
+    def __init__(self, statistic, pvalue, df,  # public
+                 alternative, standard_error, estimate,  # private
+                 statistic_np=None, xp=None):  # private
+        super().__init__(statistic, pvalue, df=df)
+        # Private state consumed by `confidence_interval`
+        self._alternative = alternative
+        self._standard_error = standard_error  # denominator of t-statistic
+        self._estimate = estimate  # point estimate of sample mean
+        # Fall back to the public statistic when no separate copy is supplied
+        if statistic_np is None:
+            statistic_np = statistic
+        self._statistic_np = statistic_np
+        self._dtype = statistic.dtype
+        # Infer the array namespace from the results unless one was passed in
+        if xp is None:
+            xp = array_namespace(statistic, pvalue)
+        self._xp = xp
+
+    def confidence_interval(self, confidence_level=0.95):
+        """
+        Compute a confidence interval for the given confidence level.
+
+        Parameters
+        ----------
+        confidence_level : float
+            The confidence level for the calculation of the population mean
+            confidence interval. Default is 0.95.
+
+        Returns
+        -------
+        ci : namedtuple
+            The confidence interval is returned in a ``namedtuple`` with
+            fields `low` and `high`.
+
+        """
+        lower, upper = _t_confidence_interval(
+            self.df, self._statistic_np, confidence_level,
+            self._alternative, self._dtype, self._xp)
+        # Scale the bounds by the standard error and shift them by the
+        # point estimate
+        scale, center = self._standard_error, self._estimate
+        return ConfidenceInterval(low=lower * scale + center,
+                                  high=upper * scale + center)
+
+
+def pack_TtestResult(statistic, pvalue, df, alternative, standard_error,
+                     estimate):
+    """Build a `TtestResult` from its six unpacked components.
+
+    `alternative` may arrive with any number of dimensions (including 0-d);
+    it is collapsed to its first finite entry, or to NaN when no entry is
+    finite.
+    """
+    xp = array_namespace(statistic, pvalue)
+    # there is at most one unique non-NaN value among the entries
+    codes = xpx.atleast_nd(xp.asarray(alternative), ndim=1, xp=xp)
+    finite_codes = codes[xp.isfinite(codes)]
+    if xp_size(finite_codes) != 0:
+        alternative = finite_codes[0]
+    else:
+        alternative = xp.nan
+    return TtestResult(statistic, pvalue, df=df, alternative=alternative,
+                       standard_error=standard_error, estimate=estimate)
+
+
+def unpack_TtestResult(res, _):
+    """Flatten a t-test result object into a 6-tuple.
+
+    The tuple holds the public ``statistic``, ``pvalue`` and ``df`` followed
+    by the private ``_alternative``, ``_standard_error`` and ``_estimate``
+    attributes of `res`. The second argument is ignored.
+    """
+    public = (res.statistic, res.pvalue, res.df)
+    private = (res._alternative, res._standard_error, res._estimate)
+    return public + private
+
+
+@xp_capabilities(cpu_only=True, exceptions=["cupy", "jax.numpy"],
+                 jax_jit=False, allow_dask_compute=True)
+@_axis_nan_policy_factory(pack_TtestResult, default_axis=0, n_samples=2,
+                          result_to_tuple=unpack_TtestResult, n_outputs=6)
+# nan_policy handled by `_axis_nan_policy`, but needs to be left
+# in signature to preserve use as a positional argument
+def ttest_1samp(a, popmean, axis=0, nan_policy="propagate", alternative="two-sided"):
+    """Calculate the T-test for the mean of ONE group of scores.
+
+    This is a test for the null hypothesis that the expected value
+    (mean) of a sample of independent observations `a` is equal to the given
+    population mean, `popmean`.
+
+    Parameters
+    ----------
+    a : array_like
+        Sample observations.
+    popmean : float or array_like
+        Expected value in null hypothesis. If array_like, then its length along
+        `axis` must equal 1, and it must otherwise be broadcastable with `a`.
+    axis : int or None, optional
+        Axis along which to compute test; default is 0. If None, compute over
+        the whole array `a`.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan.
+        The following options are available (default is 'propagate'):
+
+          * 'propagate': returns nan
+          * 'raise': throws an error
+          * 'omit': performs the calculations ignoring nan values
+
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis.
+        The following options are available (default is 'two-sided'):
+
+        * 'two-sided': the mean of the underlying distribution of the sample
+          is different than the given population mean (`popmean`)
+        * 'less': the mean of the underlying distribution of the sample is
+          less than the given population mean (`popmean`)
+        * 'greater': the mean of the underlying distribution of the sample is
+          greater than the given population mean (`popmean`)
+
+    Returns
+    -------
+    result : `~scipy.stats._result_classes.TtestResult`
+        An object with the following attributes:
+
+        statistic : float or array
+            The t-statistic.
+        pvalue : float or array
+            The p-value associated with the given alternative.
+        df : float or array
+            The number of degrees of freedom used in calculation of the
+            t-statistic; this is one less than the size of the sample
+            (``a.shape[axis]``).
+
+            .. versionadded:: 1.10.0
+
+        The object also has the following method:
+
+        confidence_interval(confidence_level=0.95)
+            Computes a confidence interval around the population
+            mean for the given confidence level.
+            The confidence interval is returned in a ``namedtuple`` with
+            fields `low` and `high`.
+
+            .. versionadded:: 1.10.0
+
+    Notes
+    -----
+    The statistic is calculated as ``(np.mean(a) - popmean)/se``, where
+    ``se`` is the standard error. Therefore, the statistic will be positive
+    when the sample mean is greater than the population mean and negative when
+    the sample mean is less than the population mean.
+
+    Examples
+    --------
+    Suppose we wish to test the null hypothesis that the mean of a population
+    is equal to 0.5. We choose a confidence level of 99%; that is, we will
+    reject the null hypothesis in favor of the alternative if the p-value is
+    less than 0.01.
+
+    When testing random variates from the standard uniform distribution, which
+    has a mean of 0.5, we expect the data to be consistent with the null
+    hypothesis most of the time.
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng()
+    >>> rvs = stats.uniform.rvs(size=50, random_state=rng)
+    >>> stats.ttest_1samp(rvs, popmean=0.5)
+    TtestResult(statistic=2.456308468440, pvalue=0.017628209047638, df=49)
+
+    As expected, the p-value of 0.017 is not below our threshold of 0.01, so
+    we cannot reject the null hypothesis.
+
+    When testing data from the standard *normal* distribution, which has a mean
+    of 0, we would expect the null hypothesis to be rejected.
+
+    >>> rvs = stats.norm.rvs(size=50, random_state=rng)
+    >>> stats.ttest_1samp(rvs, popmean=0.5)
+    TtestResult(statistic=-7.433605518875, pvalue=1.416760157221e-09, df=49)
+
+    Indeed, the p-value is lower than our threshold of 0.01, so we reject the
+    null hypothesis in favor of the default "two-sided" alternative: the mean
+    of the population is *not* equal to 0.5.
+
+    However, suppose we were to test the null hypothesis against the
+    one-sided alternative that the mean of the population is *greater* than
+    0.5. Since the mean of the standard normal is less than 0.5, we would not
+    expect the null hypothesis to be rejected.
+
+    >>> stats.ttest_1samp(rvs, popmean=0.5, alternative='greater')
+    TtestResult(statistic=-7.433605518875, pvalue=0.99999999929, df=49)
+
+    Unsurprisingly, with a p-value greater than our threshold, we would not
+    reject the null hypothesis.
+
+    Note that when working with a confidence level of 99%, a true null
+    hypothesis will be rejected approximately 1% of the time.
+
+    >>> rvs = stats.uniform.rvs(size=(100, 50), random_state=rng)
+    >>> res = stats.ttest_1samp(rvs, popmean=0.5, axis=1)
+    >>> np.sum(res.pvalue < 0.01)
+    1
+
+    Indeed, even though all 100 samples above were drawn from the standard
+    uniform distribution, which *does* have a population mean of 0.5, we would
+    mistakenly reject the null hypothesis for one of them.
+
+    `ttest_1samp` can also compute a confidence interval around the population
+    mean.
+
+    >>> rvs = stats.norm.rvs(size=50, random_state=rng)
+    >>> res = stats.ttest_1samp(rvs, popmean=0)
+    >>> ci = res.confidence_interval(confidence_level=0.95)
+    >>> ci
+    ConfidenceInterval(low=-0.3193887540880017, high=0.2898583388980972)
+
+    The bounds of the 95% confidence interval are the
+    minimum and maximum values of the parameter `popmean` for which the
+    p-value of the test would be 0.05.
+
+    >>> res = stats.ttest_1samp(rvs, popmean=ci.low)
+    >>> np.testing.assert_allclose(res.pvalue, 0.05)
+    >>> res = stats.ttest_1samp(rvs, popmean=ci.high)
+    >>> np.testing.assert_allclose(res.pvalue, 0.05)
+
+    Under certain assumptions about the population from which a sample
+    is drawn, the confidence interval with confidence level 95% is expected
+    to contain the true population mean in 95% of sample replications.
+
+    >>> rvs = stats.norm.rvs(size=(50, 1000), loc=1, random_state=rng)
+    >>> res = stats.ttest_1samp(rvs, popmean=0)
+    >>> ci = res.confidence_interval()
+    >>> contains_pop_mean = (ci.low < 1) & (ci.high > 1)
+    >>> contains_pop_mean.sum()
+    953
+
+    """
+    xp = array_namespace(a)
+    # Promote both inputs together (presumably to a common floating dtype,
+    # given `force_floating=True` -- confirm against `xp_promote`)
+    a, popmean = xp_promote(a, popmean, force_floating=True, xp=xp)
+    a, axis = _chk_asarray(a, axis, xp=xp)
+
+    # Sample size along `axis` (presumably excluding masked elements, per the
+    # helper's name) and the corresponding degrees of freedom
+    n = _length_nonmasked(a, axis)
+    df = n - 1
+
+    if a.shape[axis] == 0:
+        # This is really only needed for *testing* _axis_nan_policy decorator
+        # It won't happen when the decorator is used.
+        NaN = _get_nan(a)
+        return TtestResult(NaN, NaN, df=NaN, alternative=NaN,
+                           standard_error=NaN, estimate=NaN)
+
+    mean = xp.mean(a, axis=axis)
+    try:
+        # Drop the length-1 `axis` dimension of an array-valued `popmean` so
+        # that it lines up with `mean`; a ValueError from the squeeze is
+        # re-raised with a message naming the actual requirement
+        popmean = xp.asarray(popmean)
+        popmean = xp.squeeze(popmean, axis=axis) if popmean.ndim > 0 else popmean
+    except ValueError as e:
+        raise ValueError("`popmean.shape[axis]` must equal 1.") from e
+    d = mean - popmean  # numerator of the t-statistic
+    v = _var(a, axis=axis, ddof=1)
+    denom = xp.sqrt(v / n)  # passed on below as the standard error
+
+    # Division by a zero `denom` is allowed to produce inf/nan silently
+    with np.errstate(divide='ignore', invalid='ignore'):
+        t = xp.divide(d, denom)
+        t = t[()] if t.ndim == 0 else t  # index 0-d results with ()
+
+    dist = _SimpleStudentT(xp.asarray(df, dtype=t.dtype, device=xp_device(a)))
+    prob = _get_pvalue(t, dist, alternative, xp=xp)
+    prob = prob[()] if prob.ndim == 0 else prob
+
+    # when nan_policy='omit', `df` can be different for different axis-slices
+    df = xp.broadcast_to(xp.asarray(df, device=xp_device(a)), t.shape)
+    df = df[()] if df.ndim == 0 else df
+    # _axis_nan_policy decorator doesn't play well with strings
+    # (an unrecognized `alternative` raises KeyError here)
+    alternative_num = {"less": -1, "two-sided": 0, "greater": 1}[alternative]
+    return TtestResult(t, prob, df=df, alternative=alternative_num,
+                       standard_error=denom, estimate=mean,
+                       statistic_np=xp.asarray(t), xp=xp)
+
+
+def _t_confidence_interval(df, t, confidence_level, alternative, dtype=None, xp=None):
+    """Return the bounds of a t-distribution confidence interval.
+
+    Parameters
+    ----------
+    df : array_like
+        Degrees of freedom passed to ``special.stdtrit``.
+    t : array
+        Statistic array; supplies the default `dtype`, the default namespace,
+        the device of the constants created here, and the output shape when
+        `alternative` is NaN.
+    confidence_level : float
+        Must lie in the closed interval [0, 1].
+    alternative : number
+        Negative for 'less', positive for 'greater', zero for 'two-sided';
+        any other value (i.e. NaN) yields NaN bounds.
+    dtype, xp : optional
+        Output dtype and array namespace; inferred from `t` when None.
+
+    Returns
+    -------
+    low, high : array or scalar
+        Lower and upper bounds in units of the t-statistic; 0-d results are
+        indexed with ``[()]`` before being returned.
+
+    Raises
+    ------
+    ValueError
+        If `confidence_level` is less than 0 or greater than 1.
+    """
+    # Input validation on `alternative` is already done
+    # We just need IV on confidence_level
+    dtype = t.dtype if dtype is None else dtype
+    xp = array_namespace(t) if xp is None else xp
+
+    if confidence_level < 0 or confidence_level > 1:
+        message = "`confidence_level` must be a number between 0 and 1."
+        raise ValueError(message)
+
+    # Create every constant on the device of `t` so that the arrays combined
+    # below live on the same device
+    device = xp_device(t)
+    confidence_level = xp.asarray(confidence_level, dtype=dtype, device=device)
+    inf = xp.asarray(xp.inf, dtype=dtype, device=device)
+
+    if alternative < 0:  # 'less'
+        p = confidence_level
+        low, high = xp.broadcast_arrays(-inf, special.stdtrit(df, p))
+    elif alternative > 0:  # 'greater'
+        p = 1 - confidence_level
+        low, high = xp.broadcast_arrays(special.stdtrit(df, p), inf)
+    elif alternative == 0:  # 'two-sided'
+        tail_probability = (1 - confidence_level)/2
+        p = xp.stack((tail_probability, 1-tail_probability))
+        # axis of p must be the zeroth and orthogonal to all the rest
+        p = xp.reshape(p, tuple([2] + [1]*xp.asarray(df, device=device).ndim))
+        ci = special.stdtrit(df, p)
+        low, high = ci[0, ...], ci[1, ...]
+    else:  # alternative is NaN when input is empty (see _axis_nan_policy)
+        nan = xp.asarray(xp.nan, device=device)
+        # Only the broadcast NaN array is needed; it takes the shape of `t`
+        _, nans = xp.broadcast_arrays(t, nan)
+        low, high = nans, nans
+
+    low = xp.asarray(low, dtype=dtype)
+    low = low[()] if low.ndim == 0 else low
+    high = xp.asarray(high, dtype=dtype)
+    high = high[()] if high.ndim == 0 else high
+    return low, high
+
+
+def _ttest_ind_from_stats(mean1, mean2, denom, df, alternative, xp=None):
+    """Compute the t-statistic and p-value from two means and a denominator.
+
+    The statistic is ``(mean1 - mean2) / denom``; the p-value is obtained
+    from `_get_pvalue` using a `_SimpleStudentT` distribution built from `df`
+    and the requested `alternative`. The array namespace is inferred from
+    the inputs when `xp` is None. 0-d results are indexed with ``[()]``
+    before being returned.
+    """
+    xp = array_namespace(mean1, mean2, denom) if xp is None else xp
+
+    d = mean1 - mean2
+    # Division by a zero `denom` is allowed to produce inf/nan silently
+    with np.errstate(divide='ignore', invalid='ignore'):
+        t = xp.divide(d, denom)
+
+    dist = _SimpleStudentT(xp.asarray(df, dtype=t.dtype, device=xp_device(t)))
+    prob = _get_pvalue(t, dist, alternative, xp=xp)
+
+    # Unwrap 0-d results once each
+    t = t[()] if t.ndim == 0 else t
+    prob = prob[()] if prob.ndim == 0 else prob
+    return t, prob
+
+
+def _unequal_var_ttest_denom(v1, n1, v2, n2, xp=None):
+    """Return ``(df, denom)`` for a t-test with unequal variances.
+
+    With ``vn1 = v1 / n1`` and ``vn2 = v2 / n2``, `df` is
+    ``(vn1 + vn2)**2 / (vn1**2 / (n1 - 1) + vn2**2 / (n2 - 1))`` with NaN
+    entries replaced by 1, and `denom` is ``sqrt(vn1 + vn2)``.
+    """
+    if xp is None:
+        xp = array_namespace(v1, v2)
+
+    mean_var1 = v1 / n1
+    mean_var2 = v2 / n2
+    total = mean_var1 + mean_var2
+    with np.errstate(divide='ignore', invalid='ignore'):
+        spread = mean_var1**2 / (n1 - 1) + mean_var2**2 / (n2 - 1)
+        df = total**2 / spread
+
+    # If df is undefined, variances are zero (assumes n1 > 0 & n2 > 0).
+    # Hence it doesn't matter what df is as long as it's not NaN.
+    df = xp.where(xp.isnan(df), 1., df)
+    return df, xp.sqrt(total)
+
+
+def _equal_var_ttest_denom(v1, n1, v2, n2, xp=None):
+    """Degrees of freedom and denominator for the pooled-variance t-test.
+
+    `v1`, `v2` are the sample variances and `n1`, `n2` the sample sizes.
+    Returns ``(df, denom)`` with ``df = n1 + n2 - 2`` (cast to the dtype of
+    `denom`) and `denom` the square root of the pooled variance times
+    ``1/n1 + 1/n2``.
+    """
+    if xp is None:
+        xp = array_namespace(v1, v2)
+
+    # A sample with a single observation has an undefined variance, which
+    # would break the pooled-variance formula. The pooled variance itself is
+    # still defined: the (n-1) in the numerator cancels the (n-1) in the
+    # denominator, leaving that sample's sum of squared deviations -- zero.
+    is_single1 = xp.asarray(n1 == 1)
+    is_single2 = xp.asarray(n2 == 1)
+    v1 = xp.where(is_single1, 0., v1)
+    v2 = xp.where(is_single2, 0., v2)
+
+    dof = n1 + n2 - 2.0
+    pooled_var = ((n1 - 1) * v1 + (n2 - 1) * v2) / dof
+    denom = xp.sqrt(pooled_var * (1.0 / n1 + 1.0 / n2))
+    return xp.asarray(dof, dtype=denom.dtype), denom
+
+
+# Result container returned by `ttest_ind_from_stats`.
+Ttest_indResult = namedtuple('Ttest_indResult', 'statistic pvalue')
+
+
+@xp_capabilities(cpu_only=True, exceptions=["cupy", "jax.numpy"])
+def ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2,
+                         equal_var=True, alternative="two-sided"):
+    r"""
+    T-test for means of two independent samples from descriptive statistics.
+
+    This is a test for the null hypothesis that two independent
+    samples have identical average (expected) values.
+
+    Parameters
+    ----------
+    mean1 : array_like
+        The mean(s) of sample 1.
+    std1 : array_like
+        The corrected sample standard deviation of sample 1 (i.e. ``ddof=1``).
+    nobs1 : array_like
+        The number(s) of observations of sample 1.
+    mean2 : array_like
+        The mean(s) of sample 2.
+    std2 : array_like
+        The corrected sample standard deviation of sample 2 (i.e. ``ddof=1``).
+    nobs2 : array_like
+        The number(s) of observations of sample 2.
+    equal_var : bool, optional
+        If True (default), perform a standard independent 2 sample test
+        that assumes equal population variances [1]_.
+        If False, perform Welch's t-test, which does not assume equal
+        population variance [2]_.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis.
+        The following options are available (default is 'two-sided'):
+
+        * 'two-sided': the means of the distributions are unequal.
+        * 'less': the mean of the first distribution is less than the
+          mean of the second distribution.
+        * 'greater': the mean of the first distribution is greater than the
+          mean of the second distribution.
+
+        .. versionadded:: 1.6.0
+
+    Returns
+    -------
+    statistic : float or array
+        The calculated t-statistics.
+    pvalue : float or array
+        The p-value associated with the given alternative.
+
+    See Also
+    --------
+    scipy.stats.ttest_ind
+
+    Notes
+    -----
+    The statistic is calculated as ``(mean1 - mean2)/se``, where ``se`` is the
+    standard error. Therefore, the statistic will be positive when `mean1` is
+    greater than `mean2` and negative when `mean1` is less than `mean2`.
+
+    This method does not check whether any of the elements of `std1` or `std2`
+    are negative. If any elements of the `std1` or `std2` parameters are
+    negative in a call to this method, this method will return the same result
+    as if it were passed ``numpy.abs(std1)`` and ``numpy.abs(std2)``,
+    respectively, instead; no exceptions or warnings will be emitted.
+
+    References
+    ----------
+    .. [1] https://en.wikipedia.org/wiki/T-test#Independent_two-sample_t-test
+
+    .. [2] https://en.wikipedia.org/wiki/Welch%27s_t-test
+
+    Examples
+    --------
+    Suppose we have the summary data for two samples, as follows (with the
+    Sample Variance being the corrected sample variance)::
+
+                         Sample   Sample
+                   Size   Mean   Variance
+        Sample 1    13    15.0     87.5
+        Sample 2    11    12.0     39.0
+
+    Apply the t-test to this data (with the assumption that the population
+    variances are equal):
+
+    >>> import numpy as np
+    >>> from scipy.stats import ttest_ind_from_stats
+    >>> ttest_ind_from_stats(mean1=15.0, std1=np.sqrt(87.5), nobs1=13,
+    ...                      mean2=12.0, std2=np.sqrt(39.0), nobs2=11)
+    Ttest_indResult(statistic=0.9051358093310269, pvalue=0.3751996797581487)
+
+    For comparison, here is the data from which those summary statistics
+    were taken.  With this data, we can compute the same result using
+    `scipy.stats.ttest_ind`:
+
+    >>> a = np.array([1, 3, 4, 6, 11, 13, 15, 19, 22, 24, 25, 26, 26])
+    >>> b = np.array([2, 4, 6, 9, 11, 13, 14, 15, 18, 19, 21])
+    >>> from scipy.stats import ttest_ind
+    >>> ttest_ind(a, b)
+    TtestResult(statistic=0.905135809331027,
+                pvalue=0.3751996797581486,
+                df=22.0)
+
+    Suppose we instead have binary data and would like to apply a t-test to
+    compare the proportion of 1s in two independent groups::
+
+                          Number of    Sample     Sample
+                    Size    ones        Mean     Variance
+        Sample 1    150      30         0.2        0.161073
+        Sample 2    200      45         0.225      0.175251
+
+    The sample mean :math:`\hat{p}` is the proportion of ones in the sample
+    and the variance for a binary observation is estimated by
+    :math:`\hat{p}(1-\hat{p})`; the tabulated values additionally include the
+    :math:`n/(n-1)` correction so that they correspond to ``ddof=1``.
+
+    >>> ttest_ind_from_stats(mean1=0.2, std1=np.sqrt(0.161073), nobs1=150,
+    ...                      mean2=0.225, std2=np.sqrt(0.175251), nobs2=200)
+    Ttest_indResult(statistic=-0.5627187905196761, pvalue=0.5739887114209541)
+
+    For comparison, we could compute the t statistic and p-value using
+    arrays of 0s and 1s and `scipy.stats.ttest_ind`, as above.
+
+    >>> group1 = np.array([1]*30 + [0]*(150-30))
+    >>> group2 = np.array([1]*45 + [0]*(200-45))
+    >>> ttest_ind(group1, group2)
+    TtestResult(statistic=-0.5627179589855622,
+                pvalue=0.573989277115258,
+                df=348.0)
+
+    """
+    xp = array_namespace(mean1, std1, mean2, std2)
+
+    mean1 = xp.asarray(mean1)
+    std1 = xp.asarray(std1)
+    mean2 = xp.asarray(mean2)
+    std2 = xp.asarray(std2)
+
+    # Only the squared standard deviations are used, which is why the sign of
+    # `std1`/`std2` has no effect (see Notes).
+    var1 = std1**2
+    var2 = std2**2
+
+    if equal_var:
+        df, denom = _equal_var_ttest_denom(var1, nobs1, var2, nobs2, xp=xp)
+    else:
+        df, denom = _unequal_var_ttest_denom(var1, nobs1, var2, nobs2, xp=xp)
+
+    # Forward the namespace resolved above instead of inferring it again.
+    res = _ttest_ind_from_stats(mean1, mean2, denom, df, alternative, xp=xp)
+    return Ttest_indResult(*res)
+
+
+@xp_capabilities(cpu_only=True, exceptions=["cupy", "jax.numpy"])
+@_axis_nan_policy_factory(pack_TtestResult, default_axis=0, n_samples=2,
+                          result_to_tuple=unpack_TtestResult, n_outputs=6)
+def ttest_ind(a, b, *, axis=0, equal_var=True, nan_policy='propagate',
+              alternative="two-sided", trim=0, method=None):
+    """
+    Calculate the T-test for the means of *two independent* samples of scores.
+
+    This is a test for the null hypothesis that 2 independent samples
+    have identical average (expected) values. This test assumes that the
+    populations have identical variances by default.
+
+    Parameters
+    ----------
+    a, b : array_like
+        The arrays must have the same shape, except in the dimension
+        corresponding to `axis` (the first, by default).
+    axis : int or None, optional
+        Axis along which to compute test. If None, compute over the whole
+        arrays, `a`, and `b`.
+    equal_var : bool, optional
+        If True (default), perform a standard independent 2 sample test
+        that assumes equal population variances [1]_.
+        If False, perform Welch's t-test, which does not assume equal
+        population variance [2]_.
+
+        .. versionadded:: 0.11.0
+
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan.
+        The following options are available (default is 'propagate'):
+
+          * 'propagate': returns nan
+          * 'raise': throws an error
+          * 'omit': performs the calculations ignoring nan values
+
+        The 'omit' option is not currently available for one-sided asymptotic tests.
+
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis.
+        The following options are available (default is 'two-sided'):
+
+        * 'two-sided': the means of the distributions underlying the samples
+          are unequal.
+        * 'less': the mean of the distribution underlying the first sample
+          is less than the mean of the distribution underlying the second
+          sample.
+        * 'greater': the mean of the distribution underlying the first
+          sample is greater than the mean of the distribution underlying
+          the second sample.
+
+    trim : float, optional
+        If nonzero, performs a trimmed (Yuen's) t-test.
+        Defines the fraction of elements to be trimmed from each end of the
+        input samples. If 0 (default), no elements will be trimmed from either
+        side. The number of trimmed elements from each tail is the floor of the
+        trim times the number of elements. Valid range is [0, .5).
+    method : ResamplingMethod, optional
+        Defines the method used to compute the p-value. If `method` is an
+        instance of `PermutationMethod`/`MonteCarloMethod`, the p-value is
+        computed using
+        `scipy.stats.permutation_test`/`scipy.stats.monte_carlo_test` with the
+        provided configuration options and other appropriate settings.
+        Otherwise, the p-value is computed by comparing the test statistic
+        against a theoretical t-distribution.
+
+        .. versionadded:: 1.15.0
+
+    Returns
+    -------
+    result : `~scipy.stats._result_classes.TtestResult`
+        An object with the following attributes:
+
+        statistic : float or ndarray
+            The t-statistic.
+        pvalue : float or ndarray
+            The p-value associated with the given alternative.
+        df : float or ndarray
+            The number of degrees of freedom used in calculation of the
+            t-statistic.
+
+            .. versionadded:: 1.11.0
+
+        The object also has the following method:
+
+        confidence_interval(confidence_level=0.95)
+            Computes a confidence interval around the difference in
+            population means for the given confidence level.
+            The confidence interval is returned in a ``namedtuple`` with
+            fields ``low`` and ``high``.
+
+            .. versionadded:: 1.11.0
+
+    Notes
+    -----
+    Suppose we observe two independent samples, e.g. flower petal lengths, and
+    we are considering whether the two samples were drawn from the same
+    population (e.g. the same species of flower or two species with similar
+    petal characteristics) or two different populations.
+
+    The t-test quantifies the difference between the arithmetic means
+    of the two samples. The p-value quantifies the probability of observing
+    as or more extreme values assuming the null hypothesis, that the
+    samples are drawn from populations with the same population means, is true.
+    A p-value larger than a chosen threshold (e.g. 5% or 1%) indicates that
+    our observation is not so unlikely to have occurred by chance. Therefore,
+    we do not reject the null hypothesis of equal population means.
+    If the p-value is smaller than our threshold, then we have evidence
+    against the null hypothesis of equal population means.
+
+    By default, the p-value is determined by comparing the t-statistic of the
+    observed data against a theoretical t-distribution.
+
+    It is also possible to compute the p-value using a permutation test by
+    passing ``method=scipy.stats.PermutationMethod(n_resamples=permutations)``,
+    where ``permutations`` is the desired number of "permutations" to use in
+    forming the null distribution. When ``1 < permutations < binom(n, k)``, where
+
+    * ``k`` is the number of observations in `a`,
+    * ``n`` is the total number of observations in `a` and `b`, and
+    * ``binom(n, k)`` is the binomial coefficient (``n`` choose ``k``),
+
+    the data are pooled (concatenated), randomly assigned to either group `a`
+    or `b`, and the t-statistic is calculated. This process is performed
+    repeatedly (``permutations`` times), generating a distribution of the
+    t-statistic under the null hypothesis, and the t-statistic of the observed
+    data is compared to this distribution to determine the p-value.
+    Specifically, the p-value reported is the "achieved significance level"
+    (ASL) as defined in 4.4 of [3]_. Note that there are other ways of
+    estimating p-values using randomized permutation tests; for other
+    options, see the more general `permutation_test`.
+
+    When ``permutations >= binom(n, k)``, an exact test is performed: the data
+    are partitioned between the groups in each distinct way exactly once.
+
+    The permutation test can be computationally expensive and not necessarily
+    more accurate than the analytical test, but it does not make strong
+    assumptions about the shape of the underlying distribution.
+
+    Use of trimming is commonly referred to as the trimmed t-test. At times
+    called Yuen's t-test, this is an extension of Welch's t-test, with the
+    difference being the use of winsorized means in calculation of the variance
+    and the trimmed sample size in calculation of the statistic. Trimming is
+    recommended if the underlying distribution is long-tailed or contaminated
+    with outliers [4]_.
+
+    The statistic is calculated as ``(np.mean(a) - np.mean(b))/se``, where
+    ``se`` is the standard error. Therefore, the statistic will be positive
+    when the sample mean of `a` is greater than the sample mean of `b` and
+    negative when the sample mean of `a` is less than the sample mean of
+    `b`.
+
+    References
+    ----------
+    .. [1] https://en.wikipedia.org/wiki/T-test#Independent_two-sample_t-test
+
+    .. [2] https://en.wikipedia.org/wiki/Welch%27s_t-test
+
+    .. [3] B. Efron and T. Hastie. Computer Age Statistical Inference. (2016).
+
+    .. [4] Yuen, Karen K. "The Two-Sample Trimmed t for Unequal Population
+           Variances." Biometrika, vol. 61, no. 1, 1974, pp. 165-170. JSTOR,
+           www.jstor.org/stable/2334299. Accessed 30 Mar. 2021.
+
+    .. [5] Yuen, Karen K., and W. J. Dixon. "The Approximate Behaviour and
+           Performance of the Two-Sample Trimmed t." Biometrika, vol. 60,
+           no. 2, 1973, pp. 369-374. JSTOR, www.jstor.org/stable/2334550.
+           Accessed 30 Mar. 2021.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng()
+
+    Test with sample with identical means:
+
+    >>> rvs1 = stats.norm.rvs(loc=5, scale=10, size=500, random_state=rng)
+    >>> rvs2 = stats.norm.rvs(loc=5, scale=10, size=500, random_state=rng)
+    >>> stats.ttest_ind(rvs1, rvs2)
+    TtestResult(statistic=-0.4390847099199348,
+                pvalue=0.6606952038870015,
+                df=998.0)
+    >>> stats.ttest_ind(rvs1, rvs2, equal_var=False)
+    TtestResult(statistic=-0.4390847099199348,
+                pvalue=0.6606952553131064,
+                df=997.4602304121448)
+
+    `ttest_ind` underestimates p for unequal variances:
+
+    >>> rvs3 = stats.norm.rvs(loc=5, scale=20, size=500, random_state=rng)
+    >>> stats.ttest_ind(rvs1, rvs3)
+    TtestResult(statistic=-1.6370984482905417,
+                pvalue=0.1019251574705033,
+                df=998.0)
+    >>> stats.ttest_ind(rvs1, rvs3, equal_var=False)
+    TtestResult(statistic=-1.637098448290542,
+                pvalue=0.10202110497954867,
+                df=765.1098655246868)
+
+    When ``n1 != n2``, the equal variance t-statistic is no longer equal to the
+    unequal variance t-statistic:
+
+    >>> rvs4 = stats.norm.rvs(loc=5, scale=20, size=100, random_state=rng)
+    >>> stats.ttest_ind(rvs1, rvs4)
+    TtestResult(statistic=-1.9481646859513422,
+                pvalue=0.05186270935842703,
+                df=598.0)
+    >>> stats.ttest_ind(rvs1, rvs4, equal_var=False)
+    TtestResult(statistic=-1.3146566100751664,
+                pvalue=0.1913495266513811,
+                df=110.41349083985212)
+
+    T-test with different means, variance, and n:
+
+    >>> rvs5 = stats.norm.rvs(loc=8, scale=20, size=100, random_state=rng)
+    >>> stats.ttest_ind(rvs1, rvs5)
+    TtestResult(statistic=-2.8415950600298774,
+                pvalue=0.0046418707568707885,
+                df=598.0)
+    >>> stats.ttest_ind(rvs1, rvs5, equal_var=False)
+    TtestResult(statistic=-1.8686598649188084,
+                pvalue=0.06434714193919686,
+                df=109.32167496550137)
+
+    Take these two samples, one of which has an extreme tail.
+
+    >>> a = (56, 128.6, 12, 123.8, 64.34, 78, 763.3)
+    >>> b = (1.1, 2.9, 4.2)
+
+    Use the `trim` keyword to perform a trimmed (Yuen) t-test. For example,
+    using 20% trimming, ``trim=.2``, the test will reduce the impact of one
+    (``np.floor(trim*len(a))``) element from each tail of sample `a`. It will
+    have no effect on sample `b` because ``np.floor(trim*len(b))`` is 0.
+
+    >>> stats.ttest_ind(a, b, trim=.2)
+    TtestResult(statistic=3.4463884028073513,
+                pvalue=0.01369338726499547,
+                df=6.0)
+    """
+    xp = array_namespace(a, b)
+
+    a, b = xp_promote(a, b, force_floating=True, xp=xp)
+
+    if axis is None:
+        a, b, axis = xp_ravel(a), xp_ravel(b), 0
+
+    if not (0 <= trim < .5):
+        raise ValueError("Trimming percentage should be 0 <= `trim` < .5.")
+
+    if not isinstance(method, PermutationMethod | MonteCarloMethod | None):
+        message = ("`method` must be an instance of `PermutationMethod`, an instance "
+                   "of `MonteCarloMethod`, or None (default).")
+        raise ValueError(message)
+
+    if not is_numpy(xp) and method is not None:
+        message = "Use of resampling methods is compatible only with NumPy arrays."
+        raise NotImplementedError(message)
+
+    # Empty input: every field of the result is NaN of the broadcast shape.
+    result_shape = _broadcast_array_shapes_remove_axis((a, b), axis=axis)
+    NaN = _get_nan(a, b, shape=result_shape, xp=xp)
+    if xp_size(a) == 0 or xp_size(b) == 0:
+        return TtestResult(NaN, NaN, df=NaN, alternative=NaN,
+                           standard_error=NaN, estimate=NaN)
+
+    # numeric code stored on the result object for each alternative
+    alternative_nums = {"less": -1, "two-sided": 0, "greater": 1}
+
+    n1 = _length_nonmasked(a, axis)
+    n2 = _length_nonmasked(b, axis)
+
+    if trim == 0:
+        with np.errstate(divide='ignore', invalid='ignore'):
+            v1 = _var(a, axis, ddof=1, xp=xp)
+            v2 = _var(b, axis, ddof=1, xp=xp)
+
+        m1 = xp.mean(a, axis=axis)
+        m2 = xp.mean(b, axis=axis)
+    else:
+        if not is_numpy(xp):
+            message = "Use of `trim` is compatible only with NumPy arrays."
+            raise NotImplementedError(message)
+
+        # the trimmed sample sizes replace the raw lengths computed above
+        v1, m1, n1 = _ttest_trim_var_mean_len(a, trim, axis)
+        v2, m2, n2 = _ttest_trim_var_mean_len(b, trim, axis)
+
+    if equal_var:
+        df, denom = _equal_var_ttest_denom(v1, n1, v2, n2, xp=xp)
+    else:
+        df, denom = _unequal_var_ttest_denom(v1, n1, v2, n2, xp=xp)
+
+    if method is None:
+        # forward the namespace resolved above instead of inferring it again
+        t, prob = _ttest_ind_from_stats(m1, m2, denom, df, alternative, xp=xp)
+    else:
+        # nan_policy is taken care of by axis_nan_policy decorator
+        ttest_kwargs = dict(equal_var=equal_var, trim=trim)
+        t, prob = _ttest_resampling(a, b, axis, alternative, ttest_kwargs, method)
+
+    # when nan_policy='omit', `df` can be different for different axis-slices
+    df = xp.broadcast_to(df, t.shape)
+    df = df[()] if df.ndim == 0 else df
+    estimate = m1 - m2
+
+    return TtestResult(t, prob, df=df, alternative=alternative_nums[alternative],
+                       standard_error=denom, estimate=estimate)
+
+
+def _ttest_resampling(x, y, axis, alternative, ttest_kwargs, method):
+    """Run `ttest_ind` through a resampling test and return its outcome.
+
+    `permutation_test` is used when `method` is a `PermutationMethod`,
+    otherwise `monte_carlo_test`. The fields of `method` are passed to the
+    chosen test as keyword arguments. Returns ``(statistic, pvalue)`` taken
+    from the test's result object.
+    """
+    def statistic(x, y, axis):
+        return ttest_ind(x, y, axis=axis, **ttest_kwargs).statistic
+
+    options = method._asdict()
+
+    if isinstance(method, PermutationMethod):
+        resampler = permutation_test
+    else:
+        resampler = monte_carlo_test
+
+    if resampler is monte_carlo_test:
+        # `monte_carlo_test` takes an `rvs` tuple of callables rather than an
+        # `rng`; a user-supplied `rng` is replaced by the default callables.
+        seed = options.pop('rng', None)
+        if seed is not None:
+            generator = np.random.default_rng(seed)
+            options['rvs'] = generator.normal, generator.normal
+
+    outcome = resampler((x, y,), statistic=statistic, axis=axis,
+                        alternative=alternative, **options)
+
+    return outcome.statistic, outcome.pvalue
+
+
+def _ttest_trim_var_mean_len(a, trim, axis):
+    """Winsorized variance, trimmed mean and trimmed length along `axis`.
+
+    Helper for `ttest_ind` when trimming is requested. Returns ``(v, m, n)``.
+    """
+    # The calculations below assume sorted input; see [4] Section 1:
+    # "Let x_1, ..., x_n be n ordered observations..."
+    ordered = np.sort(a, axis=axis)
+
+    # `g`: how many elements are replaced on each tail, derived from the
+    # trimming fraction
+    n_full = ordered.shape[axis]
+    g = int(n_full * trim)
+
+    # Winsorized variance of the sample for this `g`
+    v = _calculate_winsorized_variance(ordered, g, axis)
+
+    # g-times trimmed mean, as defined in [4] (1-1)
+    m = trim_mean(ordered, trim, axis=axis)
+
+    # number of elements remaining in the trimmed sample
+    n_trimmed = n_full - 2 * g
+    return v, m, n_trimmed
+
+
+def _calculate_winsorized_variance(a, g, axis):
+    """Return the `g`-times winsorized variance of `a` along `axis`.
+
+    `a` is expected to be sorted along `axis` already. When ``g == 0`` the
+    ordinary ``ddof=1`` variance is returned. Otherwise the `g` values on
+    each tail are overwritten in place (`np.moveaxis` returns a view of `a`)
+    before the variance is computed with ``ddof=2*g + 1``.
+    """
+    if g == 0:
+        return _var(a, ddof=1, axis=axis)
+
+    # put the working axis last so that the tails can be addressed with `...`
+    a_win = np.moveaxis(a, axis, -1)
+
+    # record which slices contain NaN before the tails are overwritten
+    has_nan = np.any(np.isnan(a_win), axis=-1)
+
+    # [4] (1-3) performs winsorization and the variance calculation in one
+    # step; here the winsorization is done first by replacing both tails with
+    # the repeated boundary value. This can be seen in (1-3) of [4], where
+    # the leftmost and rightmost tails are replaced with
+    # `(g + 1) * x_{g + 1}` and `(g + 1) * x_{n - g}` respectively. With
+    # zero-based indexing, `g + 1` becomes `g` and `n - g` becomes `-g - 1`.
+    left_value = a_win[..., [g]]
+    a_win[..., :g] = left_value
+    right_value = a_win[..., [-g - 1]]
+    a_win[..., -g:] = right_value
+
+    # In [4] the degrees of freedom are `h - 1` with `h = n - 2g` (unnumbered
+    # equations in Section 1, end of page 369 to start of page 370). Expressed
+    # in the `n - ddof` form, that is `ddof = 2g + 1`. The result is wrapped
+    # in an array so that it can be indexed below.
+    ddof = 2 * g + 1
+    var_win = np.asarray(_var(a_win, ddof=ddof, axis=-1))
+
+    # With `nan_policy='propagate'`, NaNs sorted into the tails may have been
+    # overwritten entirely; restore NaN in the affected results.
+    var_win[has_nan] = np.nan
+    return var_win
+
+
+@xp_capabilities(cpu_only=True, exceptions=["cupy", "jax.numpy"],
+                 jax_jit=False, allow_dask_compute=True)
+@_axis_nan_policy_factory(pack_TtestResult, default_axis=0, n_samples=2,
+                          result_to_tuple=unpack_TtestResult, n_outputs=6,
+                          paired=True)
+def ttest_rel(a, b, axis=0, nan_policy='propagate', alternative="two-sided"):
+    """Calculate the t-test on TWO RELATED samples of scores, a and b.
+
+    This is a test for the null hypothesis that two related or
+    repeated samples have identical average (expected) values.
+
+    Parameters
+    ----------
+    a, b : array_like
+        The arrays must have the same shape.
+    axis : int or None, optional
+        Axis along which to compute test. If None, compute over the whole
+        arrays, `a`, and `b`.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan.
+        The following options are available (default is 'propagate'):
+
+          * 'propagate': returns nan
+          * 'raise': throws an error
+          * 'omit': performs the calculations ignoring nan values
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis.
+        The following options are available (default is 'two-sided'):
+
+        * 'two-sided': the means of the distributions underlying the samples
+          are unequal.
+        * 'less': the mean of the distribution underlying the first sample
+          is less than the mean of the distribution underlying the second
+          sample.
+        * 'greater': the mean of the distribution underlying the first
+          sample is greater than the mean of the distribution underlying
+          the second sample.
+
+        .. versionadded:: 1.6.0
+
+    Returns
+    -------
+    result : `~scipy.stats._result_classes.TtestResult`
+        An object with the following attributes:
+
+        statistic : float or array
+            The t-statistic.
+        pvalue : float or array
+            The p-value associated with the given alternative.
+        df : float or array
+            The number of degrees of freedom used in calculation of the
+            t-statistic; this is one less than the size of the sample
+            (``a.shape[axis]``).
+
+            .. versionadded:: 1.10.0
+
+        The object also has the following method:
+
+        confidence_interval(confidence_level=0.95)
+            Computes a confidence interval around the difference in
+            population means for the given confidence level.
+            The confidence interval is returned in a ``namedtuple`` with
+            fields `low` and `high`.
+
+            .. versionadded:: 1.10.0
+
+    Notes
+    -----
+    Examples for use are scores of the same set of students in
+    different exams, or repeated sampling from the same units. The
+    test measures whether the average score differs significantly
+    across samples (e.g. exams). If we observe a large p-value, for
+    example greater than 0.05 or 0.1 then we cannot reject the null
+    hypothesis of identical average scores. If the p-value is smaller
+    than the threshold, e.g. 1%, 5% or 10%, then we reject the null
+    hypothesis of equal averages. Small p-values are associated with
+    large t-statistics.
+
+    The t-statistic is calculated as ``np.mean(a - b)/se``, where ``se`` is the
+    standard error. Therefore, the t-statistic will be positive when the sample
+    mean of ``a - b`` is greater than zero and negative when the sample mean of
+    ``a - b`` is less than zero.
+
+    References
+    ----------
+    https://en.wikipedia.org/wiki/T-test#Dependent_t-test_for_paired_samples
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng()
+
+    >>> rvs1 = stats.norm.rvs(loc=5, scale=10, size=500, random_state=rng)
+    >>> rvs2 = (stats.norm.rvs(loc=5, scale=10, size=500, random_state=rng)
+    ...         + stats.norm.rvs(scale=0.2, size=500, random_state=rng))
+    >>> stats.ttest_rel(rvs1, rvs2)
+    TtestResult(statistic=-0.4549717054410304, pvalue=0.6493274702088672, df=499)
+    >>> rvs3 = (stats.norm.rvs(loc=8, scale=10, size=500, random_state=rng)
+    ...         + stats.norm.rvs(scale=0.2, size=500, random_state=rng))
+    >>> stats.ttest_rel(rvs1, rvs3)
+    TtestResult(statistic=-5.879467544540889, pvalue=7.540777129099917e-09, df=499)
+
+    """
+    # The paired test is a one-sample test of the element-wise differences
+    # against a population mean of zero.
+    differences = a - b
+    return ttest_1samp(differences, popmean=0., axis=axis,
+                       alternative=alternative, _no_deco=True)
+
+
+# String aliases accepted for `lambda_` in power_divergence(), each paired
+# with the numerical value it stands for.
+_power_div_lambda_names = dict((
+    ("pearson", 1),
+    ("log-likelihood", 0),
+    ("freeman-tukey", -0.5),
+    ("mod-log-likelihood", -1),
+    ("neyman", -2),
+    ("cressie-read", 2/3),
+))
+
+
+# Two-field result container: the test statistic and its p-value.
+Power_divergenceResult = namedtuple('Power_divergenceResult',
+                                    'statistic pvalue')
+
+
+def _pd_nsamples(kwargs):
+    """Return 2 when `kwargs` holds a non-None ``'f_exp'`` entry, else 1."""
+    if kwargs.get('f_exp') is None:
+        return 1
+    return 2
+
+
+@xp_capabilities(jax_jit=False, allow_dask_compute=True)
+@_axis_nan_policy_factory(Power_divergenceResult, paired=True, n_samples=_pd_nsamples,
+                          too_small=-1)
+def power_divergence(f_obs, f_exp=None, ddof=0, axis=0, lambda_=None):
+    """Cressie-Read power divergence statistic and goodness of fit test.
+
+    This function tests the null hypothesis that the categorical data
+    has the given frequencies, using the Cressie-Read power divergence
+    statistic.
+
+    Parameters
+    ----------
+    f_obs : array_like
+        Observed frequencies in each category.
+    f_exp : array_like, optional
+        Expected frequencies in each category.  By default the categories are
+        assumed to be equally likely.
+    ddof : int, optional
+        "Delta degrees of freedom": adjustment to the degrees of freedom
+        for the p-value.  The p-value is computed using a chi-squared
+        distribution with ``k - 1 - ddof`` degrees of freedom, where `k`
+        is the number of observed frequencies.  The default value of `ddof`
+        is 0.
+    axis : int or None, optional
+        The axis of the broadcast result of `f_obs` and `f_exp` along which to
+        apply the test.  If axis is None, all values in `f_obs` are treated
+        as a single data set.  Default is 0.
+    lambda_ : float or str, optional
+        The power in the Cressie-Read power divergence statistic.  The default
+        is 1.  For convenience, `lambda_` may be assigned one of the following
+        strings, in which case the corresponding numerical value is used:
+
+        * ``"pearson"`` (value 1)
+            Pearson's chi-squared statistic. In this case, the function is
+            equivalent to `chisquare`.
+        * ``"log-likelihood"`` (value 0)
+            Log-likelihood ratio. Also known as the G-test [3]_.
+        * ``"freeman-tukey"`` (value -1/2)
+            Freeman-Tukey statistic.
+        * ``"mod-log-likelihood"`` (value -1)
+            Modified log-likelihood ratio.
+        * ``"neyman"`` (value -2)
+            Neyman's statistic.
+        * ``"cressie-read"`` (value 2/3)
+            The power recommended in [5]_.
+
+    Returns
+    -------
+    res: Power_divergenceResult
+        An object containing attributes:
+
+        statistic : float or ndarray
+            The Cressie-Read power divergence test statistic.  The value is
+            a float if `axis` is None or if` `f_obs` and `f_exp` are 1-D.
+        pvalue : float or ndarray
+            The p-value of the test.  The value is a float if `ddof` and the
+            return value `stat` are scalars.
+
+    See Also
+    --------
+    chisquare
+
+    Notes
+    -----
+    This test is invalid when the observed or expected frequencies in each
+    category are too small.  A typical rule is that all of the observed
+    and expected frequencies should be at least 5.
+
+    Also, the sum of the observed and expected frequencies must be the same
+    for the test to be valid; `power_divergence` raises an error if the sums
+    do not agree within a relative tolerance of ``eps**0.5``, where ``eps``
+    is the precision of the input dtype.
+
+    When `lambda_` is less than zero, the formula for the statistic involves
+    dividing by `f_obs`, so a warning or error may be generated if any value
+    in `f_obs` is 0.
+
+    Similarly, a warning or error may be generated if any value in `f_exp` is
+    zero when `lambda_` >= 0.
+
+    The default degrees of freedom, k-1, are for the case when no parameters
+    of the distribution are estimated. If p parameters are estimated by
+    efficient maximum likelihood then the correct degrees of freedom are
+    k-1-p. If the parameters are estimated in a different way, then the
+    dof can be between k-1-p and k-1. However, it is also possible that
+    the asymptotic distribution is not a chisquare, in which case this
+    test is not appropriate.
+
+    References
+    ----------
+    .. [1] Lowry, Richard.  "Concepts and Applications of Inferential
+           Statistics". Chapter 8.
+           https://web.archive.org/web/20171015035606/http://faculty.vassar.edu/lowry/ch8pt1.html
+    .. [2] "Chi-squared test", https://en.wikipedia.org/wiki/Chi-squared_test
+    .. [3] "G-test", https://en.wikipedia.org/wiki/G-test
+    .. [4] Sokal, R. R. and Rohlf, F. J. "Biometry: the principles and
+           practice of statistics in biological research", New York: Freeman
+           (1981)
+    .. [5] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
+           Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
+           pp. 440-464.
+
+    Examples
+    --------
+    (See `chisquare` for more examples.)
+
+    When just `f_obs` is given, it is assumed that the expected frequencies
+    are uniform and given by the mean of the observed frequencies.  Here we
+    perform a G-test (i.e. use the log-likelihood ratio statistic):
+
+    >>> import numpy as np
+    >>> from scipy.stats import power_divergence
+    >>> power_divergence([16, 18, 16, 14, 12, 12], lambda_='log-likelihood')
+    (2.006573162632538, 0.84823476779463769)
+
+    The expected frequencies can be given with the `f_exp` argument:
+
+    >>> power_divergence([16, 18, 16, 14, 12, 12],
+    ...                  f_exp=[16, 16, 16, 16, 16, 8],
+    ...                  lambda_='log-likelihood')
+    (3.3281031458963746, 0.6495419288047497)
+
+    When `f_obs` is 2-D, by default the test is applied to each column.
+
+    >>> obs = np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T
+    >>> obs.shape
+    (6, 2)
+    >>> power_divergence(obs, lambda_="log-likelihood")
+    (array([ 2.00657316,  6.77634498]), array([ 0.84823477,  0.23781225]))
+
+    By setting ``axis=None``, the test is applied to all data in the array,
+    which is equivalent to applying the test to the flattened array.
+
+    >>> power_divergence(obs, axis=None)
+    (23.31034482758621, 0.015975692534127565)
+    >>> power_divergence(obs.ravel())
+    (23.31034482758621, 0.015975692534127565)
+
+    `ddof` is the change to make to the default degrees of freedom.
+
+    >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=1)
+    (2.0, 0.73575888234288467)
+
+    The calculation of the p-values is done by broadcasting the
+    test statistic with `ddof`.
+
+    >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=[0,1,2])
+    (2.0, array([ 0.84914504,  0.73575888,  0.5724067 ]))
+
+    `f_obs` and `f_exp` are also broadcast.  In the following, `f_obs` has
+    shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting
+    `f_obs` and `f_exp` has shape (2, 6).  To compute the desired chi-squared
+    statistics, we must use ``axis=1``:
+
+    >>> power_divergence([16, 18, 16, 14, 12, 12],
+    ...                  f_exp=[[16, 16, 16, 16, 16, 8],
+    ...                         [8, 20, 20, 16, 12, 12]],
+    ...                  axis=1)
+    (array([ 3.5 ,  9.25]), array([ 0.62338763,  0.09949846]))
+
+    """
+    return _power_divergence(f_obs, f_exp=f_exp, ddof=ddof, axis=axis, lambda_=lambda_)
+
+
+def _power_divergence(f_obs, f_exp, ddof, axis, lambda_, sum_check=True):
+    """Compute the Cressie-Read power divergence statistic and its p-value.
+
+    Shared implementation called by `power_divergence` and `chisquare`.
+
+    Parameters
+    ----------
+    f_obs
+        Observed frequencies in each category.
+    f_exp
+        Expected frequencies in each category.  If None, the mean of `f_obs`
+        along `axis` is used for every category.
+    ddof
+        Adjustment subtracted from the default degrees of freedom.
+    axis : int or None
+        Axis along which the terms of the statistic are summed.
+    lambda_ : float, str or None
+        Power of the divergence.  A string is looked up in
+        `_power_div_lambda_names`; None is treated as 1.
+    sum_check : bool, optional
+        If True (default) and `f_exp` is given, raise when the sums of
+        `f_obs` and `f_exp` along `axis` disagree beyond the tolerance.
+
+    Returns
+    -------
+    res : Power_divergenceResult
+        Built from the test statistic and the p-value, in that order.
+
+    Raises
+    ------
+    ValueError
+        If `lambda_` is a string that is not a key of
+        `_power_div_lambda_names`, or if the sum check fails.
+
+    """
+    xp = array_namespace(f_obs, f_exp, ddof)
+    f_obs, f_exp, ddof = xp_promote(f_obs, f_exp, ddof,
+                                    force_floating=True, xp=xp)
+
+    # Convert the input argument `lambda_` to a numerical value.
+    if isinstance(lambda_, str):
+        if lambda_ not in _power_div_lambda_names:
+            # Strip the surrounding brackets of the list repr for the message.
+            names = repr(list(_power_div_lambda_names.keys()))[1:-1]
+            raise ValueError(f"invalid string for lambda_: {lambda_!r}. "
+                             f"Valid strings are {names}")
+        lambda_ = _power_div_lambda_names[lambda_]
+    elif lambda_ is None:
+        lambda_ = 1
+
+    if f_exp is not None:
+        # not sure why we force to float64, but not going to touch it
+        # NOTE: `f_obs_float` is only used for the sum check below; the
+        # statistic itself is computed from the promoted `f_obs`.
+        f_obs_float = xp.asarray(f_obs, dtype=xp.float64)
+        bshape = _broadcast_shapes((f_obs_float.shape, f_exp.shape))
+        f_obs_float = xp.broadcast_to(f_obs_float, bshape)
+        f_exp = xp.broadcast_to(f_exp, bshape)
+        # presumably makes both arrays carry the same mask when masked
+        # arrays are involved -- TODO confirm against `_share_masks`
+        f_obs_float, f_exp = _share_masks(f_obs_float, f_exp, xp=xp)
+
+        if sum_check:
+            # Tolerance is based on the promoted input dtype, not on the
+            # float64 copy made above.
+            dtype_res = xp.result_type(f_obs.dtype, f_exp.dtype)
+            rtol = xp.finfo(dtype_res).eps**0.5  # to pass existing tests
+            # Silence NumPy invalid-operation warnings (such as 0/0) while
+            # forming the relative difference.
+            with np.errstate(invalid='ignore'):
+                f_obs_sum = xp.sum(f_obs_float, axis=axis)
+                f_exp_sum = xp.sum(f_exp, axis=axis)
+                # Difference relative to the smaller of the two sums.
+                relative_diff = (xp.abs(f_obs_sum - f_exp_sum) /
+                                 xp.minimum(f_obs_sum, f_exp_sum))
+                diff_gt_tol = xp.any(relative_diff > rtol, axis=None)
+            if diff_gt_tol:
+                # The values reported are relative differences (fractions),
+                # even though the message calls them "percent differences".
+                msg = (f"For each axis slice, the sum of the observed "
+                       f"frequencies must agree with the sum of the "
+                       f"expected frequencies to a relative tolerance "
+                       f"of {rtol}, but the percent differences are:\n"
+                       f"{relative_diff}")
+                raise ValueError(msg)
+
+    else:
+        # Avoid warnings with the edge case of a data set with length 0
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            # keepdims=True so the mean broadcasts against `f_obs` below.
+            f_exp = xp.mean(f_obs, axis=axis, keepdims=True)
+
+    # `terms` is the array of terms that are summed along `axis` to create
+    # the test statistic.  We use some specialized code for a few special
+    # cases of lambda_.
+    if lambda_ == 1:
+        # Pearson's chi-squared statistic
+        terms = (f_obs - f_exp)**2 / f_exp
+    elif lambda_ == 0:
+        # Log-likelihood ratio (i.e. G-test)
+        terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp)
+    elif lambda_ == -1:
+        # Modified log-likelihood ratio
+        terms = 2.0 * special.xlogy(f_exp, f_exp / f_obs)
+    else:
+        # General Cressie-Read power divergence.
+        terms = f_obs * ((f_obs / f_exp)**lambda_ - 1)
+        # In-place division by lambda_ * (lambda_ + 1) / 2.
+        terms /= 0.5 * lambda_ * (lambda_ + 1)
+
+    stat = xp.sum(terms, axis=axis)
+
+    # presumably the count of non-masked categories along `axis`
+    # -- TODO confirm against `_length_nonmasked`
+    num_obs = xp.asarray(_length_nonmasked(terms, axis), device=xp_device(terms),
+                         dtype=f_obs.dtype)
+
+    # Degrees of freedom: k - 1 - ddof; broadcasts with array-valued `ddof`.
+    df = num_obs - 1 - ddof
+    chi2 = _SimpleChi2(df)
+    pvalue = _get_pvalue(stat, chi2 , alternative='greater', symmetric=False, xp=xp)
+
+    # Index 0-d results with `()`; for NumPy arrays this yields a scalar.
+    stat = stat[()] if stat.ndim == 0 else stat
+    pvalue = pvalue[()] if pvalue.ndim == 0 else pvalue
+
+    return Power_divergenceResult(stat, pvalue)
+
+
+
+@xp_capabilities(jax_jit=False, allow_dask_compute=True)
+@_axis_nan_policy_factory(Power_divergenceResult, paired=True, n_samples=_pd_nsamples,
+                          too_small=-1)
+def chisquare(f_obs, f_exp=None, ddof=0, axis=0, *, sum_check=True):
+    """Perform Pearson's chi-squared test.
+
+    Pearson's chi-squared test [1]_ is a goodness-of-fit test for a multinomial
+    distribution with given probabilities; that is, it assesses the null hypothesis
+    that the observed frequencies (counts) are obtained by independent
+    sampling of *N* observations from a categorical distribution with given
+    expected frequencies.
+
+    Parameters
+    ----------
+    f_obs : array_like
+        Observed frequencies in each category.
+    f_exp : array_like, optional
+        Expected frequencies in each category. By default, the categories are
+        assumed to be equally likely.
+    ddof : int, optional
+        "Delta degrees of freedom": adjustment to the degrees of freedom
+        for the p-value.  The p-value is computed using a chi-squared
+        distribution with ``k - 1 - ddof`` degrees of freedom, where ``k``
+        is the number of categories.  The default value of `ddof` is 0.
+    axis : int or None, optional
+        The axis of the broadcast result of `f_obs` and `f_exp` along which to
+        apply the test.  If axis is None, all values in `f_obs` are treated
+        as a single data set.  Default is 0.
+    sum_check : bool, optional
+        Whether to check that ``sum(f_obs) - sum(f_exp) == 0``. If True
+        (default), raise an error when the relative difference exceeds the
+        square root of the precision of the data type. See Notes for rationale
+        and possible exceptions.
+
+    Returns
+    -------
+    res: Power_divergenceResult
+        An object containing attributes:
+
+        statistic : float or ndarray
+            The chi-squared test statistic.  The value is a float if `axis` is
+            None or `f_obs` and `f_exp` are 1-D.
+        pvalue : float or ndarray
+            The p-value of the test.  The value is a float if `ddof` and the
+            result attribute `statistic` are scalars.
+
+    See Also
+    --------
+    scipy.stats.power_divergence
+    scipy.stats.fisher_exact : Fisher exact test on a 2x2 contingency table.
+    scipy.stats.barnard_exact : An unconditional exact test. An alternative
+        to chi-squared test for small sample sizes.
+    :ref:`hypothesis_chisquare` : Extended example
+
+    Notes
+    -----
+    This test is invalid when the observed or expected frequencies in each
+    category are too small.  A typical rule is that all of the observed
+    and expected frequencies should be at least 5. According to [2]_, the
+    total number of observations is recommended to be greater than 13,
+    otherwise exact tests (such as Barnard's Exact test) should be used
+    because they do not overreject.
+
+    The default degrees of freedom, k-1, are for the case when no parameters
+    of the distribution are estimated. If p parameters are estimated by
+    efficient maximum likelihood then the correct degrees of freedom are
+    k-1-p. If the parameters are estimated in a different way, then the
+    dof can be between k-1-p and k-1. However, it is also possible that
+    the asymptotic distribution is not chi-square, in which case this test
+    is not appropriate.
+
+    For Pearson's chi-squared test, the total observed and expected counts must match
+    for the p-value to accurately reflect the probability of observing such an extreme
+    value of the statistic under the null hypothesis.
+    This function may be used to perform other statistical tests that do not require
+    the total counts to be equal. For instance, to test the null hypothesis that
+    ``f_obs[i]`` is Poisson-distributed with expectation ``f_exp[i]``, set ``ddof=-1``
+    and ``sum_check=False``. This test follows from the fact that a Poisson random
+    variable with mean and variance ``f_exp[i]`` is approximately normal with the
+    same mean and variance; the chi-squared statistic standardizes, squares, and sums
+    the observations; and the sum of ``n`` squared standard normal variables follows
+    the chi-squared distribution with ``n`` degrees of freedom.
+
+    References
+    ----------
+    .. [1] "Pearson's chi-squared test".
+           *Wikipedia*. https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
+    .. [2] Pearson, Karl. "On the criterion that a given system of deviations from the probable
+           in the case of a correlated system of variables is such that it can be reasonably
+           supposed to have arisen from random sampling", Philosophical Magazine. Series 5. 50
+           (1900), pp. 157-175.
+
+    Examples
+    --------
+    When only the mandatory `f_obs` argument is given, it is assumed that the
+    expected frequencies are uniform and given by the mean of the observed
+    frequencies:
+
+    >>> import numpy as np
+    >>> from scipy.stats import chisquare
+    >>> chisquare([16, 18, 16, 14, 12, 12])
+    Power_divergenceResult(statistic=2.0, pvalue=0.84914503608460956)
+
+    The optional `f_exp` argument gives the expected frequencies.
+
+    >>> chisquare([16, 18, 16, 14, 12, 12], f_exp=[16, 16, 16, 16, 16, 8])
+    Power_divergenceResult(statistic=3.5, pvalue=0.62338762774958223)
+
+    When `f_obs` is 2-D, by default the test is applied to each column.
+
+    >>> obs = np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T
+    >>> obs.shape
+    (6, 2)
+    >>> chisquare(obs)
+    Power_divergenceResult(statistic=array([2.        , 6.66666667]), pvalue=array([0.84914504, 0.24663415]))
+
+    By setting ``axis=None``, the test is applied to all data in the array,
+    which is equivalent to applying the test to the flattened array.
+
+    >>> chisquare(obs, axis=None)
+    Power_divergenceResult(statistic=23.31034482758621, pvalue=0.015975692534127565)
+    >>> chisquare(obs.ravel())
+    Power_divergenceResult(statistic=23.310344827586206, pvalue=0.01597569253412758)
+
+    `ddof` is the change to make to the default degrees of freedom.
+
+    >>> chisquare([16, 18, 16, 14, 12, 12], ddof=1)
+    Power_divergenceResult(statistic=2.0, pvalue=0.7357588823428847)
+
+    The calculation of the p-values is done by broadcasting the
+    chi-squared statistic with `ddof`.
+
+    >>> chisquare([16, 18, 16, 14, 12, 12], ddof=[0, 1, 2])
+    Power_divergenceResult(statistic=2.0, pvalue=array([0.84914504, 0.73575888, 0.5724067 ]))
+
+    `f_obs` and `f_exp` are also broadcast.  In the following, `f_obs` has
+    shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting
+    `f_obs` and `f_exp` has shape (2, 6).  To compute the desired chi-squared
+    statistics, we use ``axis=1``:
+
+    >>> chisquare([16, 18, 16, 14, 12, 12],
+    ...           f_exp=[[16, 16, 16, 16, 16, 8], [8, 20, 20, 16, 12, 12]],
+    ...           axis=1)
+    Power_divergenceResult(statistic=array([3.5 , 9.25]), pvalue=array([0.62338763, 0.09949846]))
+
+    For a more detailed example, see :ref:`hypothesis_chisquare`.
+    """  # noqa: E501
+    # Delegate to the shared power-divergence implementation with the
+    # "pearson" choice of `lambda_`; only this wrapper exposes `sum_check`.
+    return _power_divergence(f_obs, f_exp=f_exp, ddof=ddof, axis=axis,
+                             lambda_="pearson", sum_check=sum_check)
+
+
+# Result type shared by the Kolmogorov-Smirnov tests in this module.
+# Presumably `statistic` and `pvalue` are the tuple fields, while
+# `statistic_location` and `statistic_sign` are extra attributes that are
+# not part of tuple unpacking -- TODO confirm against `_make_tuple_bunch`.
+KstestResult = _make_tuple_bunch('KstestResult', ['statistic', 'pvalue'],
+                                 ['statistic_location', 'statistic_sign'])
+
+
+def _compute_d(cdfvals, x, sign, xp=None):
+    """Computes D+/D- as used in the Kolmogorov-Smirnov test.
+
+    Vectorized along the last axis.
+
+    Parameters
+    ----------
+    cdfvals : array_like
+        Sorted array of CDF values between 0 and 1
+    x: array_like
+        Sorted array of the stochastic variable itself
+    sign: int
+        Indicates whether to compute D+ (+1) or D- (-1).
+    xp : array namespace, optional
+        Namespace used for the array operations.  If None, it is obtained
+        from ``array_namespace(cdfvals, x)``.
+
+    Returns
+    -------
+    res: Pair with the following elements:
+        - The maximum distance of the CDF values below/above (D+/D-) Uniform(0, 1).
+        - The location at which the maximum is reached.
+
+    """
+    xp = array_namespace(cdfvals, x) if xp is None else xp
+    n = cdfvals.shape[-1]
+    # D+ compares each CDF value with i/n for i = 1..n;
+    # D- compares it with i/n for i = 0..n-1.  Any `sign` other than +1
+    # takes the D- branch.
+    D = (xp.arange(1.0, n + 1, dtype=x.dtype) / n - cdfvals if sign == +1
+         else (cdfvals - xp.arange(0.0, n, dtype=x.dtype)/n))
+    # keepdims=True so the index array can be fed to `take_along_axis`.
+    amax = xp.argmax(D, axis=-1, keepdims=True)
+    # Pick the value of `x` and of `D` at the argmax, then drop the last axis.
+    loc_max = xp.squeeze(xp.take_along_axis(x, amax, axis=-1), axis=-1)
+    D = xp.squeeze(xp.take_along_axis(D, amax, axis=-1), axis=-1)
+    # Index 0-d results with `()`; for NumPy arrays this yields a scalar.
+    return D[()] if D.ndim == 0 else D, loc_max[()] if loc_max.ndim == 0 else loc_max
+
+
+def _tuple_to_KstestResult(statistic, pvalue,
+                           statistic_location, statistic_sign):
+    """Build a `KstestResult` from its four components.
+
+    Passed to `_axis_nan_policy_factory` by the KS test functions in this
+    module; the last two values are forwarded as keyword arguments.
+    """
+    return KstestResult(statistic, pvalue,
+                        statistic_location=statistic_location,
+                        statistic_sign=statistic_sign)
+
+
+def _KstestResult_to_tuple(res, _):
+    """Flatten a `KstestResult` into a tuple.
+
+    The unpacked fields of `res` come first, followed by
+    `statistic_location` and `statistic_sign`.  The second argument is
+    ignored.  Passed as `result_to_tuple` to `_axis_nan_policy_factory`.
+    """
+    return *res, res.statistic_location, res.statistic_sign
+
+
+@xp_capabilities(cpu_only=True, jax_jit=False,
+                 skip_backends=[('dask.array', 'needs take_along_axis')])
+@_axis_nan_policy_factory(_tuple_to_KstestResult, n_samples=1, n_outputs=4,
+                          result_to_tuple=_KstestResult_to_tuple)
+@_rename_parameter("mode", "method")
+def ks_1samp(x, cdf, args=(), alternative='two-sided', method='auto', *, axis=0):
+    """
+    Performs the one-sample Kolmogorov-Smirnov test for goodness of fit.
+
+    This test compares the underlying distribution F(x) of a sample
+    against a given continuous distribution G(x). See Notes for a description
+    of the available null and alternative hypotheses.
+
+    Parameters
+    ----------
+    x : array_like
+        a 1-D array of observations of iid random variables.
+    cdf : callable
+        callable used to calculate the cdf.
+    args : tuple, sequence, optional
+        Distribution parameters, used with `cdf`.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the null and alternative hypotheses. Default is 'two-sided'.
+        Please see explanations in the Notes below.
+    method : {'auto', 'exact', 'approx', 'asymp'}, optional
+        Defines the distribution used for calculating the p-value.
+        The following options are available (default is 'auto'):
+
+          * 'auto' : selects one of the other options.
+          * 'exact' : uses the exact distribution of test statistic.
+          * 'approx' : approximates the two-sided probability with twice
+            the one-sided probability
+          * 'asymp': uses asymptotic distribution of test statistic
+    axis : int or tuple of ints, default: 0
+        If an int or tuple of ints, the axis or axes of the input along which
+        to compute the statistic. The statistic of each axis-slice (e.g. row)
+        of the input will appear in a corresponding element of the output.
+        If ``None``, the input will be raveled before computing the statistic.
+
+    Returns
+    -------
+    res: KstestResult
+        An object containing attributes:
+
+        statistic : float
+            KS test statistic, either D+, D-, or D (the maximum of the two)
+        pvalue : float
+            One-tailed or two-tailed p-value.
+        statistic_location : float
+            Value of `x` corresponding with the KS statistic; i.e., the
+            distance between the empirical distribution function and the
+            hypothesized cumulative distribution function is measured at this
+            observation.
+        statistic_sign : int
+            +1 if the KS statistic is the maximum positive difference between
+            the empirical distribution function and the hypothesized cumulative
+            distribution function (D+); -1 if the KS statistic is the maximum
+            negative difference (D-).
+
+
+    See Also
+    --------
+    ks_2samp, kstest
+
+    Notes
+    -----
+    There are three options for the null and corresponding alternative
+    hypothesis that can be selected using the `alternative` parameter.
+
+    - `two-sided`: The null hypothesis is that the two distributions are
+      identical, F(x)=G(x) for all x; the alternative is that they are not
+      identical.
+
+    - `less`: The null hypothesis is that F(x) >= G(x) for all x; the
+      alternative is that F(x) < G(x) for at least one x.
+
+    - `greater`: The null hypothesis is that F(x) <= G(x) for all x; the
+      alternative is that F(x) > G(x) for at least one x.
+
+    Note that the alternative hypotheses describe the *CDFs* of the
+    underlying distributions, not the observed values. For example,
+    suppose x1 ~ F and x2 ~ G. If F(x) > G(x) for all x, the values in
+    x1 tend to be less than those in x2.
+
+    Examples
+    --------
+    Suppose we wish to test the null hypothesis that a sample is distributed
+    according to the standard normal.
+    We choose a confidence level of 95%; that is, we will reject the null
+    hypothesis in favor of the alternative if the p-value is less than 0.05.
+
+    When testing uniformly distributed data, we would expect the
+    null hypothesis to be rejected.
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng()
+    >>> stats.ks_1samp(stats.uniform.rvs(size=100, random_state=rng),
+    ...                stats.norm.cdf)
+    KstestResult(statistic=0.5001899973268688,
+                 pvalue=1.1616392184763533e-23,
+                 statistic_location=0.00047625268963724654,
+                 statistic_sign=-1)
+
+    Indeed, the p-value is lower than our threshold of 0.05, so we reject the
+    null hypothesis in favor of the default "two-sided" alternative: the data
+    are *not* distributed according to the standard normal.
+
+    When testing random variates from the standard normal distribution, we
+    expect the data to be consistent with the null hypothesis most of the time.
+
+    >>> x = stats.norm.rvs(size=100, random_state=rng)
+    >>> stats.ks_1samp(x, stats.norm.cdf)
+    KstestResult(statistic=0.05345882212970396,
+                 pvalue=0.9227159037744717,
+                 statistic_location=-1.2451343873745018,
+                 statistic_sign=1)
+
+    As expected, the p-value of 0.92 is not below our threshold of 0.05, so
+    we cannot reject the null hypothesis.
+
+    Suppose, however, that the random variates are distributed according to
+    a normal distribution that is shifted toward greater values. In this case,
+    the cumulative distribution function (CDF) of the underlying distribution
+    tends to be *less* than the CDF of the standard normal. Therefore, we would
+    expect the null hypothesis to be rejected with ``alternative='less'``:
+
+    >>> x = stats.norm.rvs(size=100, loc=0.5, random_state=rng)
+    >>> stats.ks_1samp(x, stats.norm.cdf, alternative='less')
+    KstestResult(statistic=0.17482387821055168,
+                 pvalue=0.001913921057766743,
+                 statistic_location=0.3713830565352756,
+                 statistic_sign=-1)
+
+    and indeed, with p-value smaller than our threshold, we reject the null
+    hypothesis in favor of the alternative.
+
+    """
+    # `_axis_nan_policy` decorator ensures `axis=-1`
+    xp = array_namespace(x)
+    # `mode` is presumably the older name of `method` (see the
+    # `_rename_parameter` decorator above); the body still uses it.
+    mode = method
+
+    # Normalize by first letter, case-insensitively: any string starting with
+    # 't', 'g' or 'l' maps to the corresponding canonical name.
+    # NOTE(review): an empty string raises IndexError here rather than the
+    # ValueError below.
+    alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
+        alternative.lower()[0], alternative)
+    if alternative not in ['two-sided', 'greater', 'less']:
+        raise ValueError(f"Unexpected value {alternative=}")
+
+    N = x.shape[-1]
+    x = xp.sort(x, axis=-1)
+    x = xp_promote(x, force_floating=True, xp=xp)
+    cdfvals = cdf(x, *args)
+    # Template for `statistic_sign`: one int8 entry per axis-slice.
+    ones = xp.ones(x.shape[:-1], dtype=xp.int8)
+    ones = ones[()] if ones.ndim == 0 else ones
+
+    # The one-sided branches always use `ksone`; `method` is not consulted.
+    if alternative == 'greater':
+        Dplus, d_location = _compute_d(cdfvals, x, +1)
+        pvalue = xp.asarray(distributions.ksone.sf(Dplus, N), dtype=x.dtype)
+        pvalue = pvalue[()] if pvalue.ndim == 0 else pvalue
+        return KstestResult(Dplus, pvalue,
+                            statistic_location=d_location,
+                            statistic_sign=ones)
+
+    if alternative == 'less':
+        Dminus, d_location = _compute_d(cdfvals, x, -1)
+        pvalue = xp.asarray(distributions.ksone.sf(Dminus, N), dtype=x.dtype)
+        pvalue = pvalue[()] if pvalue.ndim == 0 else pvalue
+        return KstestResult(Dminus, pvalue,
+                            statistic_location=d_location,
+                            statistic_sign=-ones)
+
+    # alternative == 'two-sided':
+    Dplus, dplus_location = _compute_d(cdfvals, x, +1)
+    Dminus, dminus_location = _compute_d(cdfvals, x, -1)
+    # Strict comparison: on a tie the D- statistic, location and sign win.
+    i_plus = Dplus > Dminus
+    D = xp.where(i_plus, Dplus, Dminus)
+    d_location = xp.where(i_plus, dplus_location, dminus_location)
+    d_sign = xp.where(i_plus, ones, -ones)
+    if D.ndim == 0:
+        D, d_location, d_sign = D[()], d_location[()], d_sign[()]
+
+    if mode == 'auto':  # Always select exact
+        mode = 'exact'
+    if mode == 'exact':
+        prob = distributions.kstwo.sf(D, N)
+    elif mode == 'asymp':
+        prob = distributions.kstwobign.sf(D * math.sqrt(N))
+    else:
+        # mode == 'approx'
+        # NOTE(review): any other unrecognized `method` value also lands
+        # here; no validation of `method` is visible in this function.
+        prob = 2 * distributions.ksone.sf(D, N)
+    # Doubling the one-sided probability can exceed 1, so clamp to [0, 1].
+    prob = xp.clip(xp.asarray(prob, dtype=x.dtype), 0., 1.)
+    return KstestResult(D, prob,
+                        statistic_location=d_location,
+                        statistic_sign=d_sign)
+
+
+# Alias: the two-sample test shares the one-sample result type.
+Ks_2sampResult = KstestResult
+
+
+def _compute_prob_outside_square(n, h):
+    """
+    Compute the proportion of paths that pass outside the two diagonal lines.
+
+    Parameters
+    ----------
+    n : integer
+        n > 0
+    h : integer
+        0 <= h <= n
+
+    Returns
+    -------
+    p : float
+        The proportion of paths that pass outside the lines x-y = +/-h.
+
+    Notes
+    -----
+    ``h == 0`` is not handled here because ``n / h`` is evaluated; the
+    caller `_attempt_exact_2kssamp` returns early in that case.
+
+    """
+    # Compute Pr(D_{n,n} >= h/n)
+    # Prob = 2 * ( binom(2n, n-h) - binom(2n, n-2h) + binom(2n, n-3h) - ... )
+    # / binom(2n, n)
+    # This formulation exhibits subtractive cancellation.
+    # Instead divide each term by binom(2n, n), then factor common terms
+    # and use a Horner-like algorithm
+    # P = 2 * A0 * (1 - A1*(1 - A2*(1 - A3*(1 - A4*(...)))))
+
+    P = 0.0
+    # Start from the innermost factor and work outwards.  At this starting
+    # k, one of the numerators (n - k*h - j) below is zero, so the innermost
+    # term vanishes.
+    k = int(np.floor(n / h))
+    while k >= 0:
+        p1 = 1.0
+        # Each of the Ai terms has numerator and denominator with
+        # h simple terms.
+        for j in range(h):
+            p1 = (n - k * h - j) * p1 / (n + k * h + j + 1)
+        # One Horner step: P <- A_k * (1 - P).
+        P = p1 * (1.0 - P)
+        k -= 1
+    return 2 * P
+
+
+def _count_paths_outside_method(m, n, g, h):
+    """Count the number of paths that pass outside the specified diagonal.
+
+    Parameters
+    ----------
+    m : integer
+        m > 0
+    n : integer
+        n > 0
+    g : integer
+        g is greatest common divisor of m and n
+    h : integer
+        0 <= h <= lcm(m,n)
+
+    Returns
+    -------
+    p : float
+        The number of paths that go low.
+        The calculation may overflow - check for a finite answer.
+
+    Notes
+    -----
+    Count the integer lattice paths from (0, 0) to (m, n), which at some
+    point (x, y) along the path, satisfy:
+      m*y <= n*x - h*g
+    The paths make steps of size +1 in either positive x or positive y
+    directions.
+
+    We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk.
+    Hodges, J.L. Jr.,
+    "The Significance Probability of the Smirnov Two-Sample Test,"
+    Arkiv för Matematik, 3, No. 43 (1958), 469-86.
+
+    """
+    # Compute #paths which stay lower than x/m-y/n = h/lcm(m,n)
+    # B(x, y) = #{paths from (0,0) to (x,y) without
+    #             previously crossing the boundary}
+    #         = binom(x, y) - #{paths which already reached the boundary}
+    # Multiply by the number of path extensions going from (x, y) to (m, n)
+    # Sum.
+
+    # Probability is symmetrical in m, n.  Computation below assumes m >= n.
+    if m < n:
+        m, n = n, m
+    mg = m // g
+    ng = n // g
+
+    # Not every x needs to be considered.
+    # xj holds the list of x values to be checked.
+    # Wherever n*x/m + ng*h crosses an integer
+    lxj = n + (mg-h)//mg
+    # Ceiling division: xj[j] = ceil((h + mg*j) / ng).
+    xj = [(h + mg * j + ng-1)//ng for j in range(lxj)]
+    # B is an array just holding a few values of B(x,y), the ones needed.
+    # B[j] == B(x_j, j)
+    if lxj == 0:
+        # No x values to check: return the total number of lattice paths.
+        return special.binom(m + n, n)
+    B = np.zeros(lxj)
+    B[0] = 1
+    # Compute the B(x, y) terms
+    for j in range(1, lxj):
+        Bj = special.binom(xj[j] + j, j)
+        for i in range(j):
+            # NOTE: the local name `bin` shadows the builtin `bin` in this
+            # function.
+            bin = special.binom(xj[j] - xj[i] + j - i, j-i)
+            Bj -= bin * B[i]
+        B[j] = Bj
+    # Compute the number of path extensions...
+    num_paths = 0
+    for j in range(lxj):
+        # Number of ways to continue from (xj[j], j) to (m, n).
+        bin = special.binom((m-xj[j]) + (n - j), n-j)
+        term = B[j] * bin
+        num_paths += term
+    return num_paths
+
+
+def _attempt_exact_2kssamp(n1, n2, g, d, alternative):
+    """Attempts to compute the exact 2sample probability.
+
+    n1, n2 are the sample sizes
+    g is the gcd(n1, n2)
+    d is the computed max difference in ECDFs
+
+    Returns (success, d, probability)
+
+    The returned `d` is the input rounded to the nearest multiple of
+    1/lcm(n1, n2).  `success` is False when a floating point error or
+    overflow was detected (probability is NaN) or when the computed
+    probability is not within [0, 1].
+    """
+    lcm = (n1 // g) * n2
+    # Snap `d` onto the lattice of attainable values h / lcm.
+    h = int(np.round(d * lcm))
+    d = h * 1.0 / lcm
+    if h == 0:
+        return True, d, 1.0
+    saw_fp_error, prob = False, np.nan
+    try:
+        # Turn NumPy invalid/overflow conditions into FloatingPointError so
+        # they are caught by the handler below.
+        with np.errstate(invalid="raise", over="raise"):
+            if alternative == 'two-sided':
+                if n1 == n2:
+                    prob = _compute_prob_outside_square(n1, h)
+                else:
+                    # defined elsewhere in this module; not visible here
+                    prob = _compute_outer_prob_inside_method(n1, n2, g, h)
+            else:
+                if n1 == n2:
+                    # prob = binom(2n, n-h) / binom(2n, n)
+                    # Evaluating in that form incurs roundoff errors
+                    # from special.binom. Instead calculate directly
+                    jrange = np.arange(h)
+                    prob = np.prod((n1 - jrange) / (n1 + jrange + 1.0))
+                else:
+                    # This inner context repeats the `over` setting that is
+                    # already active from the enclosing `errstate`.
+                    with np.errstate(over='raise'):
+                        num_paths = _count_paths_outside_method(n1, n2, g, h)
+                    # NOTE: the local name `bin` shadows the builtin `bin`.
+                    bin = special.binom(n1 + n2, n1)
+                    # A count above the total, or an infinite total, means
+                    # the floating point computation cannot be trusted.
+                    if num_paths > bin or np.isinf(bin):
+                        saw_fp_error = True
+                    else:
+                        prob = num_paths / bin
+
+    except (FloatingPointError, OverflowError):
+        saw_fp_error = True
+
+    if saw_fp_error:
+        return False, d, np.nan
+    # This comparison is also False for NaN, so a NaN probability is
+    # reported as a failure.
+    if not (0 <= prob <= 1):
+        return False, d, prob
+    return True, d, prob
+
+
+@xp_capabilities(np_only=True)
+@_axis_nan_policy_factory(_tuple_to_KstestResult, n_samples=2, n_outputs=4,
+                          result_to_tuple=_KstestResult_to_tuple)
+@_rename_parameter("mode", "method")
+def ks_2samp(data1, data2, alternative='two-sided', method='auto'):
+    """
+    Performs the two-sample Kolmogorov-Smirnov test for goodness of fit.
+
+    This test compares the underlying continuous distributions F(x) and G(x)
+    of two independent samples.  See Notes for a description of the available
+    null and alternative hypotheses.
+
+    Parameters
+    ----------
+    data1, data2 : array_like, 1-Dimensional
+        Two arrays of sample observations assumed to be drawn from a continuous
+        distribution, sample sizes can be different.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the null and alternative hypotheses. Default is 'two-sided'.
+        Please see explanations in the Notes below.
+    method : {'auto', 'exact', 'asymp'}, optional
+        Defines the method used for calculating the p-value.
+        The following options are available (default is 'auto'):
+
+          * 'auto' : use 'exact' for small size arrays, 'asymp' for large
+          * 'exact' : use exact distribution of test statistic
+          * 'asymp' : use asymptotic distribution of test statistic
+
+    Returns
+    -------
+    res: KstestResult
+        An object containing attributes:
+
+        statistic : float
+            KS test statistic.
+        pvalue : float
+            One-tailed or two-tailed p-value.
+        statistic_location : float
+            Value from `data1` or `data2` corresponding with the KS statistic;
+            i.e., the distance between the empirical distribution functions is
+            measured at this observation.
+        statistic_sign : int
+            +1 if the empirical distribution function of `data1` exceeds
+            the empirical distribution function of `data2` at
+            `statistic_location`, otherwise -1.
+
+    Raises
+    ------
+    ValueError
+        If `method` or `alternative` is not one of the accepted values, or if
+        either sample is empty.
+
+    See Also
+    --------
+    kstest, ks_1samp, epps_singleton_2samp, anderson_ksamp
+
+    Notes
+    -----
+    There are three options for the null and corresponding alternative
+    hypothesis that can be selected using the `alternative` parameter.
+
+    - `less`: The null hypothesis is that F(x) >= G(x) for all x; the
+      alternative is that F(x) < G(x) for at least one x. The statistic
+      is the magnitude of the minimum (most negative) difference between the
+      empirical distribution functions of the samples.
+
+    - `greater`: The null hypothesis is that F(x) <= G(x) for all x; the
+      alternative is that F(x) > G(x) for at least one x. The statistic
+      is the maximum (most positive) difference between the empirical
+      distribution functions of the samples.
+
+    - `two-sided`: The null hypothesis is that the two distributions are
+      identical, F(x)=G(x) for all x; the alternative is that they are not
+      identical. The statistic is the maximum absolute difference between the
+      empirical distribution functions of the samples.
+
+    Note that the alternative hypotheses describe the *CDFs* of the
+    underlying distributions, not the observed values of the data. For example,
+    suppose x1 ~ F and x2 ~ G. If F(x) > G(x) for all x, the values in
+    x1 tend to be less than those in x2.
+
+    If the KS statistic is large, then the p-value will be small, and this may
+    be taken as evidence against the null hypothesis in favor of the
+    alternative.
+
+    If ``method='exact'``, `ks_2samp` attempts to compute an exact p-value,
+    that is, the probability under the null hypothesis of obtaining a test
+    statistic value as extreme as the value computed from the data.
+    If ``method='asymp'``, the asymptotic Kolmogorov-Smirnov distribution is
+    used to compute an approximate p-value.
+    If ``method='auto'``, an exact p-value computation is attempted if neither
+    sample size exceeds 10000; otherwise, the asymptotic method is used.
+    In any case, if an exact p-value calculation is attempted and fails, a
+    warning will be emitted, and the asymptotic p-value will be returned.
+
+    The 'two-sided' 'exact' computation computes the complementary probability
+    and then subtracts from 1.  As such, the minimum probability it can return
+    is about 1e-16.  While the algorithm itself is exact, numerical
+    errors may accumulate for large sample sizes.   It is most suited to
+    situations in which one of the sample sizes is only a few thousand.
+
+    We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk [1]_.
+
+    References
+    ----------
+    .. [1] Hodges, J.L. Jr.,  "The Significance Probability of the Smirnov
+           Two-Sample Test," Arkiv för Matematik, 3, No. 43 (1958), 469-486.
+
+    Examples
+    --------
+    Suppose we wish to test the null hypothesis that two samples were drawn
+    from the same distribution.
+    We choose a confidence level of 95%; that is, we will reject the null
+    hypothesis in favor of the alternative if the p-value is less than 0.05.
+
+    If the first sample were drawn from a uniform distribution and the second
+    were drawn from the standard normal, we would expect the null hypothesis
+    to be rejected.
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng()
+    >>> sample1 = stats.uniform.rvs(size=100, random_state=rng)
+    >>> sample2 = stats.norm.rvs(size=110, random_state=rng)
+    >>> stats.ks_2samp(sample1, sample2)
+    KstestResult(statistic=0.5454545454545454,
+                 pvalue=7.37417839555191e-15,
+                 statistic_location=-0.014071496412861274,
+                 statistic_sign=-1)
+
+
+    Indeed, the p-value is lower than our threshold of 0.05, so we reject the
+    null hypothesis in favor of the default "two-sided" alternative: the data
+    were *not* drawn from the same distribution.
+
+    When both samples are drawn from the same distribution, we expect the data
+    to be consistent with the null hypothesis most of the time.
+
+    >>> sample1 = stats.norm.rvs(size=105, random_state=rng)
+    >>> sample2 = stats.norm.rvs(size=95, random_state=rng)
+    >>> stats.ks_2samp(sample1, sample2)
+    KstestResult(statistic=0.10927318295739348,
+                 pvalue=0.5438289009927495,
+                 statistic_location=-0.1670157701848795,
+                 statistic_sign=-1)
+
+    As expected, the p-value of 0.54 is not below our threshold of 0.05, so
+    we cannot reject the null hypothesis.
+
+    Suppose, however, that the first sample were drawn from
+    a normal distribution shifted toward greater values. In this case,
+    the cumulative distribution function (CDF) of the underlying distribution
+    tends to be *less* than the CDF underlying the second sample. Therefore,
+    we would expect the null hypothesis to be rejected with
+    ``alternative='less'``:
+
+    >>> sample1 = stats.norm.rvs(size=105, loc=0.5, random_state=rng)
+    >>> stats.ks_2samp(sample1, sample2, alternative='less')
+    KstestResult(statistic=0.4055137844611529,
+                 pvalue=3.5474563068855554e-08,
+                 statistic_location=-0.13249370614972575,
+                 statistic_sign=-1)
+
+    and indeed, with p-value smaller than our threshold, we reject the null
+    hypothesis in favor of the alternative.
+
+    """
+    mode = method
+
+    if mode not in ['auto', 'exact', 'asymp']:
+        raise ValueError(f'Invalid value for mode: {mode}')
+    # Any spelling starting with 't', 'g' or 'l' (case-insensitive) is mapped
+    # to the canonical name.  Slicing with `[:1]` instead of indexing with
+    # `[0]` lets an empty string reach the `ValueError` below rather than
+    # failing with an `IndexError`.
+    alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
+        alternative.lower()[:1], alternative)
+    if alternative not in ['two-sided', 'less', 'greater']:
+        raise ValueError(f'Invalid value for alternative: {alternative}')
+    MAX_AUTO_N = 10000  # 'auto' will attempt to be exact if n1,n2 <= MAX_AUTO_N
+    # Masked entries are dropped before the samples are sorted.
+    if np.ma.is_masked(data1):
+        data1 = data1.compressed()
+    if np.ma.is_masked(data2):
+        data2 = data2.compressed()
+    data1 = np.sort(data1)
+    data2 = np.sort(data2)
+    n1 = data1.shape[0]
+    n2 = data2.shape[0]
+    if min(n1, n2) == 0:
+        raise ValueError('Data passed to ks_2samp must not be empty')
+
+    # Evaluate both empirical CDFs at every observation of the pooled sample.
+    data_all = np.concatenate([data1, data2])
+    # using searchsorted solves equal data problem
+    cdf1 = np.searchsorted(data1, data_all, side='right') / n1
+    cdf2 = np.searchsorted(data2, data_all, side='right') / n2
+    cddiffs = cdf1 - cdf2
+
+    # Identify the location of the statistic
+    argminS = np.argmin(cddiffs)
+    argmaxS = np.argmax(cddiffs)
+    loc_minS = data_all[argminS]
+    loc_maxS = data_all[argmaxS]
+
+    # Ensure sign of minS is not negative.
+    minS = np.clip(-cddiffs[argminS], 0, 1)
+    maxS = cddiffs[argmaxS]
+
+    # 'less' always reports the most negative difference, 'greater' the most
+    # positive one, and 'two-sided' whichever of the two is larger.
+    if alternative == 'less' or (alternative == 'two-sided' and minS > maxS):
+        d = minS
+        d_location = loc_minS
+        d_sign = -1
+    else:
+        d = maxS
+        d_location = loc_maxS
+        d_sign = 1
+    g = math.gcd(n1, n2)
+    n1g = n1 // g
+    n2g = n2 // g
+    # Placeholder; one of the branches below overwrites it, and the final
+    # value is clipped to [0, 1] before being returned.
+    prob = -np.inf
+    if mode == 'auto':
+        mode = 'exact' if max(n1, n2) <= MAX_AUTO_N else 'asymp'
+    elif mode == 'exact':
+        # If lcm(n1, n2) is too big, switch from exact to asymp
+        if n1g >= np.iinfo(np.int32).max / n2g:
+            mode = 'asymp'
+            warnings.warn(
+                f"Exact ks_2samp calculation not possible with samples sizes "
+                f"{n1} and {n2}. Switching to 'asymp'.", RuntimeWarning,
+                stacklevel=3)
+
+    if mode == 'exact':
+        success, d, prob = _attempt_exact_2kssamp(n1, n2, g, d, alternative)
+        if not success:
+            # Fall through to the asymptotic branch below.
+            mode = 'asymp'
+            warnings.warn(f"ks_2samp: Exact calculation unsuccessful. "
+                          f"Switching to method={mode}.", RuntimeWarning,
+                          stacklevel=3)
+
+    if mode == 'asymp':
+        # The product n1*n2 is large.  Use Smirnov's asymptotic formula.
+        # Ensure float to avoid overflow in multiplication
+        # sorted because the one-sided formula is not symmetric in n1, n2
+        m, n = sorted([float(n1), float(n2)], reverse=True)
+        en = m * n / (m + n)
+        if alternative == 'two-sided':
+            prob = distributions.kstwo.sf(d, np.round(en))
+        else:
+            z = np.sqrt(en) * d
+            # Use Hodges' suggested approximation Eqn 5.3
+            # Requires m to be the larger of (n1, n2)
+            expt = -2 * z**2 - 2 * z * (m + 2*n)/np.sqrt(m*n*(m+n))/3.0
+            prob = np.exp(expt)
+
+    prob = np.clip(prob, 0, 1)
+    # Currently, `d` is a Python float. We want it to be a NumPy type, so
+    # float64 is appropriate. An enhancement would be for `d` to respect the
+    # dtype of the input.
+    return KstestResult(np.float64(d), prob, statistic_location=d_location,
+                        statistic_sign=np.int8(d_sign))
+
+
+def _parse_kstest_args(data1, data2, args, N):
+    # Normalise the different argument combinations accepted by `kstest`:
+    #   (xvals, yvals, )                                   # two-sample
+    #   (xvals, cdf function, ...)
+    #   (xvals, name of distribution, ...)
+    #   (name of distribution, name of distribution, ...)
+    #
+    # Returns the triple (xvals, yvals, cdf) in which
+    #   * xvals is the sorted first sample (generated with `size=N` when
+    #     `data1` is a distribution name or a callable),
+    #   * yvals is the second sample, or None when `data2` described a cdf,
+    #   * cdf is a cdf function, or None when `data2` was data.
+    if isinstance(data1, str):
+        sampler = getattr(distributions, data1).rvs
+    elif callable(data1):
+        sampler = data1
+    else:
+        sampler = None
+
+    cdf = None
+    if isinstance(data2, str):
+        cdf, data2 = getattr(distributions, data2).cdf, None
+    elif callable(data2):
+        cdf, data2 = data2, None
+
+    # The namespace is resolved from the original `data1` and from `data2`
+    # after it has been reset to None for the one-sample case.
+    xp = array_namespace(data1, data2, *args)
+    xvals = xp.sort(sampler(*args, size=N) if sampler else data1)
+    return xvals, data2, cdf
+
+
+def _kstest_n_samples(kwargs):
+    # Number of data samples in a `kstest` call: one when `cdf` is a
+    # distribution name or a callable, two when it is anything else.
+    target = kwargs['cdf']
+    if isinstance(target, str) or callable(target):
+        return 1
+    return 2
+
+
+@xp_capabilities(out_of_scope=True)
+@_axis_nan_policy_factory(_tuple_to_KstestResult, n_samples=_kstest_n_samples,
+                          n_outputs=4, result_to_tuple=_KstestResult_to_tuple)
+@_rename_parameter("mode", "method")
+def kstest(rvs, cdf, args=(), N=20, alternative='two-sided', method='auto'):
+    """
+    Performs the (one-sample or two-sample) Kolmogorov-Smirnov test for
+    goodness of fit.
+
+    The one-sample test compares the underlying distribution F(x) of a sample
+    against a given distribution G(x). The two-sample test compares the
+    underlying distributions of two independent samples. Both tests are valid
+    only for continuous distributions.
+
+    Parameters
+    ----------
+    rvs : str, array_like, or callable
+        If an array, it should be a 1-D array of observations of random
+        variables.
+        If a callable, it should be a function to generate random variables;
+        it is required to have a keyword argument `size`.
+        If a string, it should be the name of a distribution in `scipy.stats`,
+        which will be used to generate random variables.
+    cdf : str, array_like or callable
+        If array_like, it should be a 1-D array of observations of random
+        variables, and the two-sample test is performed
+        (and rvs must be array_like).
+        If a callable, that callable is used to calculate the cdf.
+        If a string, it should be the name of a distribution in `scipy.stats`,
+        which will be used as the cdf function.
+    args : tuple, sequence, optional
+        Distribution parameters, used if `rvs` or `cdf` are strings or
+        callables.
+    N : int, optional
+        Sample size if `rvs` is string or callable.  Default is 20.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the null and alternative hypotheses. Default is 'two-sided'.
+        Please see explanations in the Notes below.
+    method : {'auto', 'exact', 'approx', 'asymp'}, optional
+        Defines the distribution used for calculating the p-value.
+        The following options are available (default is 'auto'):
+
+          * 'auto' : selects one of the other options.
+          * 'exact' : uses the exact distribution of test statistic.
+          * 'approx' : approximates the two-sided probability with twice the
+            one-sided probability
+          * 'asymp': uses asymptotic distribution of test statistic
+
+    Returns
+    -------
+    res: KstestResult
+        An object containing attributes:
+
+        statistic : float
+            KS test statistic, either D+, D-, or D (the maximum of the two)
+        pvalue : float
+            One-tailed or two-tailed p-value.
+        statistic_location : float
+            In a one-sample test, this is the value of `rvs`
+            corresponding with the KS statistic; i.e., the distance between
+            the empirical distribution function and the hypothesized cumulative
+            distribution function is measured at this observation.
+
+            In a two-sample test, this is the value from `rvs` or `cdf`
+            corresponding with the KS statistic; i.e., the distance between
+            the empirical distribution functions is measured at this
+            observation.
+        statistic_sign : int
+            In a one-sample test, this is +1 if the KS statistic is the
+            maximum positive difference between the empirical distribution
+            function and the hypothesized cumulative distribution function
+            (D+); it is -1 if the KS statistic is the maximum negative
+            difference (D-).
+
+            In a two-sample test, this is +1 if the empirical distribution
+            function of `rvs` exceeds the empirical distribution
+            function of `cdf` at `statistic_location`, otherwise -1.
+
+    See Also
+    --------
+    ks_1samp, ks_2samp
+
+    Notes
+    -----
+    There are three options for the null and corresponding alternative
+    hypothesis that can be selected using the `alternative` parameter.
+
+    - `two-sided`: The null hypothesis is that the two distributions are
+      identical, F(x)=G(x) for all x; the alternative is that they are not
+      identical.
+
+    - `less`: The null hypothesis is that F(x) >= G(x) for all x; the
+      alternative is that F(x) < G(x) for at least one x.
+
+    - `greater`: The null hypothesis is that F(x) <= G(x) for all x; the
+      alternative is that F(x) > G(x) for at least one x.
+
+    Note that the alternative hypotheses describe the *CDFs* of the
+    underlying distributions, not the observed values. For example,
+    suppose x1 ~ F and x2 ~ G. If F(x) > G(x) for all x, the values in
+    x1 tend to be less than those in x2.
+
+
+    Examples
+    --------
+    Suppose we wish to test the null hypothesis that a sample is distributed
+    according to the standard normal.
+    We choose a confidence level of 95%; that is, we will reject the null
+    hypothesis in favor of the alternative if the p-value is less than 0.05.
+
+    When testing uniformly distributed data, we would expect the
+    null hypothesis to be rejected.
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng()
+    >>> stats.kstest(stats.uniform.rvs(size=100, random_state=rng),
+    ...              stats.norm.cdf)
+    KstestResult(statistic=0.5001899973268688,
+                 pvalue=1.1616392184763533e-23,
+                 statistic_location=0.00047625268963724654,
+                 statistic_sign=-1)
+
+    Indeed, the p-value is lower than our threshold of 0.05, so we reject the
+    null hypothesis in favor of the default "two-sided" alternative: the data
+    are *not* distributed according to the standard normal.
+
+    When testing random variates from the standard normal distribution, we
+    expect the data to be consistent with the null hypothesis most of the time.
+
+    >>> x = stats.norm.rvs(size=100, random_state=rng)
+    >>> stats.kstest(x, stats.norm.cdf)
+    KstestResult(statistic=0.05345882212970396,
+                 pvalue=0.9227159037744717,
+                 statistic_location=-1.2451343873745018,
+                 statistic_sign=1)
+
+
+    As expected, the p-value of 0.92 is not below our threshold of 0.05, so
+    we cannot reject the null hypothesis.
+
+    Suppose, however, that the random variates are distributed according to
+    a normal distribution that is shifted toward greater values. In this case,
+    the cumulative distribution function (CDF) of the underlying distribution
+    tends to be *less* than the CDF of the standard normal. Therefore, we
+    would expect the null hypothesis to be rejected with
+    ``alternative='less'``:
+
+    >>> x = stats.norm.rvs(size=100, loc=0.5, random_state=rng)
+    >>> stats.kstest(x, stats.norm.cdf, alternative='less')
+    KstestResult(statistic=0.17482387821055168,
+                 pvalue=0.001913921057766743,
+                 statistic_location=0.3713830565352756,
+                 statistic_sign=-1)
+
+    and indeed, with p-value smaller than our threshold, we reject the null
+    hypothesis in favor of the alternative.
+
+    For convenience, the previous test can be performed using the name of the
+    distribution as the second argument.
+
+    >>> stats.kstest(x, "norm", alternative='less')
+    KstestResult(statistic=0.17482387821055168,
+                 pvalue=0.001913921057766743,
+                 statistic_location=0.3713830565352756,
+                 statistic_sign=-1)
+
+    The examples above have all been one-sample tests identical to those
+    performed by `ks_1samp`. Note that `kstest` can also perform two-sample
+    tests identical to those performed by `ks_2samp`. For example, when two
+    samples are drawn from the same distribution, we expect the data to be
+    consistent with the null hypothesis most of the time.
+
+    >>> sample1 = stats.laplace.rvs(size=105, random_state=rng)
+    >>> sample2 = stats.laplace.rvs(size=95, random_state=rng)
+    >>> stats.kstest(sample1, sample2)
+    KstestResult(statistic=0.11779448621553884,
+                 pvalue=0.4494256912629795,
+                 statistic_location=0.6138814275424155,
+                 statistic_sign=1)
+
+    As expected, the p-value of 0.45 is not below our threshold of 0.05, so
+    we cannot reject the null hypothesis.
+
+    """
+    # The underscore spelling is still accepted so that existing code keeps
+    # working.
+    if alternative == 'two_sided':
+        alternative = 'two-sided'
+    if alternative not in ('two-sided', 'greater', 'less'):
+        raise ValueError(f"Unexpected alternative: {alternative}")
+
+    sample, other, cdf_func = _parse_kstest_args(rvs, cdf, args, N)
+    if not cdf_func:
+        # `cdf` was neither a distribution name nor a callable, so it is
+        # treated as a second sample.
+        return ks_2samp(sample, other, alternative=alternative,
+                        method=method, _no_deco=True)
+    return ks_1samp(sample, cdf_func, args=args, alternative=alternative,
+                    method=method, _no_deco=True)
+
+
+@xp_capabilities(np_only=True)
+def tiecorrect(rankvals):
+    """Tie correction factor for Mann-Whitney U and Kruskal-Wallis H tests.
+
+    Parameters
+    ----------
+    rankvals : array_like
+        A 1-D sequence of ranks.  Typically this will be the array
+        returned by `~scipy.stats.rankdata`.
+
+    Returns
+    -------
+    factor : float
+        Correction factor for U or H. It is 1.0 when `rankvals` has fewer
+        than two elements.
+
+    See Also
+    --------
+    rankdata : Assign ranks to the data
+    mannwhitneyu : Mann-Whitney rank test
+    kruskal : Kruskal-Wallis H test
+
+    References
+    ----------
+    .. [1] Siegel, S. (1956) Nonparametric Statistics for the Behavioral
+           Sciences.  New York: McGraw-Hill.
+
+    Examples
+    --------
+    >>> from scipy.stats import tiecorrect, rankdata
+    >>> tiecorrect([1, 2.5, 2.5, 4])
+    0.9
+    >>> ranks = rankdata([1, 3, 2, 4, 5, 7, 2, 8, 4])
+    >>> ranks
+    array([ 1. ,  4. ,  2.5,  5.5,  7. ,  8. ,  2.5,  9. ,  5.5])
+    >>> tiecorrect(ranks)
+    0.9833333333333333
+
+    """
+    ranks = np.sort(rankvals)
+    # Flag the first element of every run of equal ranks and append one
+    # trailing flag, so that the gaps between consecutive flagged positions
+    # are the sizes of the tie groups.
+    run_flags = np.r_[True, ranks[1:] != ranks[:-1], True]
+    run_starts = np.nonzero(run_flags)[0]
+    group_sizes = np.diff(run_starts).astype(np.float64)
+
+    n_ranks = np.float64(ranks.size)
+    if n_ranks < 2:
+        # The denominator below would be zero; no correction is applied.
+        return 1.0
+    tie_term = (group_sizes**3 - group_sizes).sum()
+    return 1.0 - tie_term / (n_ranks**3 - n_ranks)
+
+
+RanksumsResult = namedtuple('RanksumsResult', ('statistic', 'pvalue'))
+
+
+@xp_capabilities(np_only=True)
+@_axis_nan_policy_factory(RanksumsResult, n_samples=2)
+def ranksums(x, y, alternative='two-sided'):
+    """Compute the Wilcoxon rank-sum statistic for two samples.
+
+    The Wilcoxon rank-sum test tests the null hypothesis that two sets
+    of measurements are drawn from the same distribution.  The alternative
+    hypothesis is that values in one sample are more likely to be
+    larger than the values in the other sample.
+
+    This test should be used to compare two samples from continuous
+    distributions.  It does not handle ties between measurements
+    in x and y.  For tie-handling and an optional continuity correction
+    see `scipy.stats.mannwhitneyu`.
+
+    Parameters
+    ----------
+    x,y : array_like
+        The data from the two samples.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis. Default is 'two-sided'.
+        The following options are available:
+
+        * 'two-sided': one of the distributions (underlying `x` or `y`) is
+          stochastically greater than the other.
+        * 'less': the distribution underlying `x` is stochastically less
+          than the distribution underlying `y`.
+        * 'greater': the distribution underlying `x` is stochastically greater
+          than the distribution underlying `y`.
+
+        .. versionadded:: 1.7.0
+
+    Returns
+    -------
+    statistic : float
+        The test statistic under the large-sample approximation that the
+        rank sum statistic is normally distributed.
+    pvalue : float
+        The p-value of the test.
+
+    References
+    ----------
+    .. [1] https://en.wikipedia.org/wiki/Wilcoxon_rank-sum_test
+
+    Examples
+    --------
+    We can test the hypothesis that two independent unequal-sized samples are
+    drawn from the same distribution by computing the Wilcoxon rank-sum
+    statistic.
+
+    >>> import numpy as np
+    >>> from scipy.stats import ranksums
+    >>> rng = np.random.default_rng()
+    >>> sample1 = rng.uniform(-1, 1, 200)
+    >>> sample2 = rng.uniform(-0.5, 1.5, 300) # a shifted distribution
+    >>> ranksums(sample1, sample2)
+    RanksumsResult(statistic=-7.887059,
+                   pvalue=3.09390448e-15) # may vary
+    >>> ranksums(sample1, sample2, alternative='less')
+    RanksumsResult(statistic=-7.750585297581713,
+                   pvalue=4.573497606342543e-15) # may vary
+    >>> ranksums(sample1, sample2, alternative='greater')
+    RanksumsResult(statistic=-7.750585297581713,
+                   pvalue=0.9999999999999954) # may vary
+
+    The p-value of less than ``0.05`` indicates that this test rejects the
+    hypothesis at the 5% significance level.
+
+    """
+    x, y = np.asarray(x), np.asarray(y)
+    n1, n2 = len(x), len(y)
+
+    # Rank the pooled data; the first `n1` ranks belong to `x`.
+    pooled_ranks = rankdata(np.concatenate((x, y)))
+    rank_sum_x = np.sum(pooled_ranks[:n1], axis=0)
+
+    # Standardise the rank sum of `x` with the mean and standard deviation
+    # used by the normal approximation.
+    mean_rank_sum = n1 * (n1+n2+1) / 2.0
+    std_rank_sum = np.sqrt(n1*n2*(n1+n2+1)/12.0)
+    z = (rank_sum_x - mean_rank_sum) / std_rank_sum
+    pvalue = _get_pvalue(z, _SimpleNormal(), alternative, xp=np)
+
+    # Indexing with `[()]` turns 0-d arrays into scalars.
+    return RanksumsResult(z[()], pvalue[()])
+
+
+KruskalResult = namedtuple('KruskalResult', ('statistic', 'pvalue'))
+
+
+@xp_capabilities(skip_backends=[('cupy', 'no rankdata'), ('dask.array', 'no rankdata')],
+                 jax_jit=False)
+@_axis_nan_policy_factory(KruskalResult, n_samples=None)
+def kruskal(*samples, nan_policy='propagate', axis=0):
+    """Compute the Kruskal-Wallis H-test for independent samples.
+
+    The Kruskal-Wallis H-test tests the null hypothesis that the population
+    medians of all of the groups are equal.  It is a non-parametric version of
+    ANOVA.  The test works on 2 or more independent samples, which may have
+    different sizes.  Note that rejecting the null hypothesis does not
+    indicate which of the groups differs.  Post hoc comparisons between
+    groups are required to determine which groups are different.
+
+    Parameters
+    ----------
+    sample1, sample2, ... : array_like
+       Two or more arrays with the sample measurements can be given as
+       arguments. Samples must be one-dimensional.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan.
+        The following options are available (default is 'propagate'):
+
+          * 'propagate': returns nan
+          * 'raise': throws an error
+          * 'omit': performs the calculations ignoring nan values
+    axis : int or tuple of ints, default: 0
+        If an int or tuple of ints, the axis or axes of the input along which
+        to compute the statistic. The statistic of each axis-slice (e.g. row)
+        of the input will appear in a corresponding element of the output.
+        If ``None``, the input will be raveled before computing the statistic.
+
+    Returns
+    -------
+    statistic : float
+       The Kruskal-Wallis H statistic, corrected for ties.
+    pvalue : float
+       The p-value for the test using the assumption that H has a chi
+       square distribution. The p-value returned is the survival function of
+       the chi square distribution evaluated at H.
+
+    See Also
+    --------
+    f_oneway : 1-way ANOVA.
+    mannwhitneyu : Mann-Whitney rank test on two samples.
+    friedmanchisquare : Friedman test for repeated measurements.
+
+    Notes
+    -----
+    Due to the assumption that H has a chi square distribution, the number
+    of samples in each group must not be too small.  A typical rule is
+    that each sample must have at least 5 measurements.
+
+    References
+    ----------
+    .. [1] W. H. Kruskal & W. W. Wallis, "Use of Ranks in
+       One-Criterion Variance Analysis", Journal of the American Statistical
+       Association, Vol. 47, Issue 260, pp. 583-621, 1952.
+    .. [2] https://en.wikipedia.org/wiki/Kruskal-Wallis_one-way_analysis_of_variance
+
+    Examples
+    --------
+    >>> from scipy import stats
+    >>> x = [1, 3, 5, 7, 9]
+    >>> y = [2, 4, 6, 8, 10]
+    >>> stats.kruskal(x, y)
+    KruskalResult(statistic=0.2727272727272734, pvalue=0.6015081344405895)
+
+    >>> x = [1, 1, 1]
+    >>> y = [2, 2, 2]
+    >>> z = [2, 2]
+    >>> stats.kruskal(x, y, z)
+    KruskalResult(statistic=7.0, pvalue=0.0301973834223185)
+
+    """
+    xp = array_namespace(*samples)
+    samples = xp_promote(*samples, force_floating=True, xp=xp)
+
+    num_groups = len(samples)
+    if num_groups < 2:
+        raise ValueError("Need at least two groups in stats.kruskal()")
+
+    n = [sample.shape[-1] for sample in samples]
+    totaln = sum(n)
+    # Reject the input as soon as *any* group is empty: each group's squared
+    # rank sum is divided by its size below, so a zero size is never valid.
+    if min(n) < 1:  # Only needed for `test_axis_nan_policy`
+        raise ValueError("Inputs must not be empty.")
+
+    alldata = xp.concat(samples, axis=-1)
+    ranked, t = _rankdata(alldata, method='average', return_ties=True)
+    # should adjust output dtype of _rankdata
+    ranked = xp.astype(ranked, alldata.dtype, copy=False)
+    t = xp.astype(t, alldata.dtype, copy=False)
+    ties = 1 - xp.sum(t**3 - t, axis=-1) / (totaln**3 - totaln)  # tiecorrect(ranked)
+
+    # Compute sum^2/n for each group and sum
+    # `j` holds the offsets at which each group starts within `ranked`.
+    j = list(itertools.accumulate(n, initial=0))
+    ssbn = sum(xp.sum(ranked[..., j[i]:j[i + 1]], axis=-1)**2 / n[i]
+               for i in range(num_groups))
+
+    h = 12.0 / (totaln * (totaln + 1)) * ssbn - 3 * (totaln + 1)
+    df = xp.asarray(num_groups - 1, dtype=h.dtype)
+    h /= ties
+
+    chi2 = _SimpleChi2(df)
+    # Pass the namespace of the inputs rather than hard-coding NumPy, in line
+    # with the rest of this function.
+    pvalue = _get_pvalue(h, chi2, alternative='greater', symmetric=False, xp=xp)
+    return KruskalResult(h, pvalue)
+
+
+FriedmanchisquareResult = namedtuple('FriedmanchisquareResult',
+                                     ('statistic', 'pvalue'))
+
+
+@xp_capabilities(skip_backends=[("cupy", "no rankdata"), ("dask.array", "no rankdata")],
+                 jax_jit=False)
+@_axis_nan_policy_factory(FriedmanchisquareResult, n_samples=None, paired=True)
+def friedmanchisquare(*samples, axis=0):
+    """Compute the Friedman test for repeated samples.
+
+    The Friedman test tests the null hypothesis that repeated samples of
+    the same individuals have the same distribution.  It is often used
+    to test for consistency among samples obtained in different ways.
+    For example, if two sampling techniques are used on the same set of
+    individuals, the Friedman test can be used to determine if the two
+    sampling techniques are consistent.
+
+    Parameters
+    ----------
+    sample1, sample2, sample3... : array_like
+        Arrays of observations.  All of the arrays must have the same number
+        of elements.  At least three samples must be given.
+    axis : int or tuple of ints, default: 0
+        If an int or tuple of ints, the axis or axes of the input along which
+        to compute the statistic. The statistic of each axis-slice (e.g. row)
+        of the input will appear in a corresponding element of the output.
+        If ``None``, the input will be raveled before computing the statistic.
+
+    Returns
+    -------
+    statistic : float
+        The test statistic, correcting for ties.
+    pvalue : float
+        The associated p-value assuming that the test statistic has a chi
+        squared distribution.
+
+    See Also
+    --------
+    :ref:`hypothesis_friedmanchisquare` : Extended example
+
+    Notes
+    -----
+    Due to the assumption that the test statistic has a chi squared
+    distribution, the p-value is only reliable for n > 10 and more than
+    6 repeated samples.
+
+    References
+    ----------
+    .. [1] https://en.wikipedia.org/wiki/Friedman_test
+    .. [2] Demsar, J. (2006). Statistical comparisons of classifiers over
+           multiple data sets. Journal of Machine Learning Research, 7, 1-30.
+
+    Examples
+    --------
+
+    >>> import numpy as np
+    >>> rng = np.random.default_rng(seed=18)
+    >>> x = rng.random((6, 10))
+    >>> from scipy.stats import friedmanchisquare
+    >>> res = friedmanchisquare(x[0], x[1], x[2], x[3], x[4], x[5])
+    >>> res.statistic, res.pvalue
+    (11.428571428571416, 0.043514520866727614)
+
+    The p-value is less than 0.05; however, as noted above, the results may not
+    be reliable since we have a small number of repeated samples.
+
+    For a more detailed example, see :ref:`hypothesis_friedmanchisquare`.
+    """
+    n_samples = len(samples)
+    if n_samples < 3:
+        raise ValueError('At least 3 samples must be given '
+                         f'for Friedman test, got {n_samples}.')
+
+    xp = array_namespace(*samples)
+    samples = xp_promote(*samples, force_floating=True, xp=xp)
+    first = samples[0]
+    dtype = first.dtype
+
+    n_obs = first.shape[-1]
+    if n_obs == 0:  # only for `test_axis_nan_policy`; user doesn't see this
+        raise ValueError("One or more sample arguments is too small.")
+
+    # The decorator aligns axis-slices with axis -1 and `stack` places the
+    # samples along axis 0. Swapping those two axes lets the ranking run
+    # across the samples along axis -1. The statistic is a reduction, so both
+    # axis 0 and axis -1 are consumed below.
+    stacked = xp_swapaxes(xp.stack(samples), 0, -1)
+    ranks, tie_counts = _rankdata(stacked, method='average', return_ties=True)
+    ranks = xp.asarray(ranks, dtype=dtype)
+    tie_counts = xp.asarray(tie_counts, dtype=dtype)
+
+    # Tie correction factor
+    tie_sum = xp.sum(tie_counts * (tie_counts*tie_counts - 1), axis=(0, -1))
+    correction = 1 - tie_sum / (n_samples*(n_samples*n_samples - 1)*n_obs)
+
+    # Sum over samples of the squared per-sample rank totals
+    squared_rank_totals = xp.sum(xp.sum(ranks, axis=0)**2, axis=-1)
+    uncorrected = (12.0 / (n_samples*n_obs*(n_samples+1)) * squared_rank_totals
+                   - 3*n_obs*(n_samples+1))
+    statistic = uncorrected / correction
+
+    chi2 = _SimpleChi2(xp.asarray(n_samples - 1, dtype=dtype))
+    pvalue = _get_pvalue(statistic, chi2, alternative='greater', symmetric=False, xp=xp)
+    return FriedmanchisquareResult(statistic, pvalue)
+
+
+BrunnerMunzelResult = namedtuple('BrunnerMunzelResult',
+                                 ('statistic', 'pvalue'))  # result of `brunnermunzel`
+
+
+@xp_capabilities(cpu_only=True, # torch GPU can't use `stdtr`
+                 skip_backends=[('dask.array', 'needs rankdata'),
+                                ('cupy', 'needs rankdata'),
+                                ('jax.numpy', 'needs _axis_nan_policy decorator')])
+@_axis_nan_policy_factory(BrunnerMunzelResult, n_samples=2)
+def brunnermunzel(x, y, alternative="two-sided", distribution="t",
+                  nan_policy='propagate', *, axis=0):
+    """Compute the Brunner-Munzel test on samples x and y.
+
+    The Brunner-Munzel test is a nonparametric test of the null hypothesis that
+    when values are taken one by one from each group, the probabilities of
+    getting large values in both groups are equal.
+    Unlike the Wilcoxon-Mann-Whitney U test, this does not require the two
+    groups to have equal variances, nor does it assume that the distributions
+    are the same. This test works on two independent samples,
+    which may have different sizes.
+
+    Parameters
+    ----------
+    x, y : array_like
+        Array of samples, should be one-dimensional.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis.
+        The following options are available (default is 'two-sided'):
+
+          * 'two-sided'
+          * 'less': one-sided
+          * 'greater': one-sided
+    distribution : {'t', 'normal'}, optional
+        Defines how to get the p-value.
+        The following options are available (default is 't'):
+
+          * 't': get the p-value by t-distribution
+          * 'normal': get the p-value by standard normal distribution.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains nan.
+        The following options are available (default is 'propagate'):
+
+          * 'propagate': returns nan
+          * 'raise': throws an error
+          * 'omit': performs the calculations ignoring nan values
+    axis : int or None, default=0
+        If an int, the axis of the input along which to compute the statistic.
+        The statistic of each axis-slice (e.g. row) of the input will appear
+        in a corresponding element of the output. If None, the input will be
+        raveled before computing the statistic.
+
+    Returns
+    -------
+    statistic : float
+        The Brunner-Munzel W statistic.
+    pvalue : float
+        p-value under the t or standard normal distribution (per `distribution`);
+        one-sided or two-sided depending on the choice of `alternative`.
+
+    See Also
+    --------
+    mannwhitneyu : Mann-Whitney rank test on two samples.
+
+    Notes
+    -----
+    Brunner and Munzel recommended estimating the p-value with the
+    t-distribution when the size of the data is 50 or less. If the size is lower
+    than 10, it would be better to use the permuted Brunner-Munzel test (see [2]_).
+
+    References
+    ----------
+    .. [1] Brunner, E. and Munzel, U. "The nonparametric Behrens-Fisher
+           problem: Asymptotic theory and a small-sample approximation".
+           Biometrical Journal. Vol. 42(2000): 17-25.
+    .. [2] Neubert, K. and Brunner, E. "A studentized permutation test for the
+           non-parametric Behrens-Fisher problem". Computational Statistics and
+           Data Analysis. Vol. 51(2007): 5192-5204.
+
+    Examples
+    --------
+    >>> from scipy import stats
+    >>> x1 = [1,2,1,1,1,1,1,1,1,1,2,4,1,1]
+    >>> x2 = [3,3,4,3,1,2,3,1,1,5,4]
+    >>> w, p_value = stats.brunnermunzel(x1, x2)
+    >>> w
+    3.1374674823029505
+    >>> p_value
+    0.0057862086661515377
+
+    """
+    xp = array_namespace(x, y)
+    nx = x.shape[-1]
+    ny = y.shape[-1]
+
+    # _axis_nan_policy decorator ensures we can work along the last axis
+    rankc = rankdata(xp.concat((x, y), axis=axis), axis=-1)
+    rankcx = rankc[..., 0:nx]
+    rankcy = rankc[..., nx:nx+ny]
+    rankcx_mean = xp.mean(rankcx, axis=-1, keepdims=True)
+    rankcy_mean = xp.mean(rankcy, axis=-1, keepdims=True)
+    rankx = rankdata(x, axis=-1)
+    ranky = rankdata(y, axis=-1)
+    rankx_mean = xp.mean(rankx, axis=-1, keepdims=True)
+    ranky_mean = xp.mean(ranky, axis=-1, keepdims=True)
+
+    temp_x = rankcx - rankx - rankcx_mean + rankx_mean
+    Sx = xp.vecdot(temp_x, temp_x, axis=-1)
+    Sx /= nx - 1
+    temp_y = rankcy - ranky - rankcy_mean + ranky_mean
+    Sy = xp.vecdot(temp_y, temp_y, axis=-1)
+    Sy /= ny - 1
+
+    rankcx_mean = xp.squeeze(rankcx_mean, axis=-1)
+    rankcy_mean = xp.squeeze(rankcy_mean, axis=-1)
+    wbfn = nx * ny * (rankcy_mean - rankcx_mean)
+    wbfn /= (nx + ny) * xp.sqrt(nx * Sx + ny * Sy)
+
+    if distribution == "t":
+        df_numer = xp.pow(nx * Sx + ny * Sy, 2.0)
+        df_denom = xp.pow(nx * Sx, 2.0) / (nx - 1)
+        df_denom += xp.pow(ny * Sy, 2.0) / (ny - 1)
+        df = df_numer / df_denom
+
+        if xp.any(df_numer == 0) and xp.any(df_denom == 0):
+            message = ("p-value cannot be estimated with `distribution='t' "
+                       "because degrees of freedom parameter is undefined "
+                       "(0/0). Try using `distribution='normal'")
+            warnings.warn(message, RuntimeWarning, stacklevel=2)
+
+        distribution = _SimpleStudentT(df)
+    elif distribution == "normal":
+        distribution = _SimpleNormal()
+    else:
+        raise ValueError(
+            "distribution should be 't' or 'normal'")
+
+    p = _get_pvalue(-wbfn, distribution, alternative, xp=xp)
+
+    return BrunnerMunzelResult(wbfn, p)
+
+
+@xp_capabilities(cpu_only=True, exceptions=['cupy', 'jax.numpy'],
+    reason='Delegation for `special.stdtr` only implemented for CuPy and JAX.',
+    jax_jit=False, allow_dask_compute=True)
+@_axis_nan_policy_factory(SignificanceResult, kwd_samples=['weights'], paired=True)
+def combine_pvalues(pvalues, method='fisher', weights=None, *, axis=0):
+    """
+    Combine p-values from independent tests that bear upon the same hypothesis.
+
+    These methods are intended only for combining p-values from hypothesis
+    tests based upon continuous distributions.
+
+    Each method assumes that under the null hypothesis, the p-values are
+    sampled independently and uniformly from the interval [0, 1]. A test
+    statistic (different for each method) is computed and a combined
+    p-value is calculated based upon the distribution of this test statistic
+    under the null hypothesis.
+
+    Parameters
+    ----------
+    pvalues : array_like
+        Array of p-values assumed to come from independent tests based on
+        continuous distributions.
+    method : {'fisher', 'pearson', 'tippett', 'stouffer', 'mudholkar_george'}
+
+        Name of method to use to combine p-values.
+
+        The available methods are (see Notes for details):
+
+        * 'fisher': Fisher's method (Fisher's combined probability test)
+        * 'pearson': Pearson's method
+        * 'mudholkar_george': Mudholkar's and George's method
+        * 'tippett': Tippett's method
+        * 'stouffer': Stouffer's Z-score method
+    weights : array_like, optional
+        Optional array of weights used only for Stouffer's Z-score method.
+        Ignored by other methods.
+
+    Returns
+    -------
+    res : SignificanceResult
+        An object containing attributes:
+
+        statistic : float
+            The statistic calculated by the specified method.
+        pvalue : float
+            The combined p-value.
+
+    Examples
+    --------
+    Suppose we wish to combine p-values from four independent tests
+    of the same null hypothesis using Fisher's method (default).
+
+    >>> from scipy.stats import combine_pvalues
+    >>> pvalues = [0.1, 0.05, 0.02, 0.3]
+    >>> combine_pvalues(pvalues)
+    SignificanceResult(statistic=20.828626352604235, pvalue=0.007616871850449092)
+
+    When the individual p-values carry different weights, consider Stouffer's
+    method.
+
+    >>> weights = [1, 2, 3, 4]
+    >>> res = combine_pvalues(pvalues, method='stouffer', weights=weights)
+    >>> res.pvalue
+    0.009578891494533616
+
+    Notes
+    -----
+    If this function is applied to tests with discrete statistics, such as
+    any rank test or contingency-table test, it will yield systematically
+    wrong results, e.g. Fisher's method will systematically overestimate the
+    p-value [1]_. This problem becomes less severe for large sample sizes
+    when the discrete distributions become approximately continuous.
+
+    The differences between the methods can be best illustrated by their
+    statistics and what aspects of a combination of p-values they emphasise
+    when considering significance [2]_. For example, methods emphasising large
+    p-values are more sensitive to strong false and true negatives; conversely
+    methods focussing on small p-values are sensitive to positives.
+
+    * The statistic of Fisher's method (also known as Fisher's combined
+      probability test) [3]_ is :math:`-2\\sum_i \\log(p_i)`, which is
+      equivalent (as a test statistic) to the product of individual p-values:
+      :math:`\\prod_i p_i`. Under the null hypothesis, this statistic follows
+      a :math:`\\chi^2` distribution. This method emphasises small p-values.
+    * Pearson's method uses :math:`-2\\sum_i\\log(1-p_i)`, which is equivalent
+      to :math:`\\prod_i \\frac{1}{1-p_i}` [2]_.
+      It thus emphasises large p-values.
+    * Mudholkar and George compromise between Fisher's and Pearson's method by
+      averaging their statistics [4]_. Their method emphasises extreme
+      p-values, both close to 1 and 0.
+    * Stouffer's method [5]_ uses Z-scores and the statistic:
+      :math:`\\sum_i \\Phi^{-1} (p_i)`, where :math:`\\Phi` is the CDF of the
+      standard normal distribution. The advantage of this method is that it is
+      straightforward to introduce weights, which can make Stouffer's method
+      more powerful than Fisher's method when the p-values are from studies
+      of different size [6]_ [7]_.
+    * Tippett's method uses the smallest p-value as a statistic.
+      (Mind that this minimum is not the combined p-value.)
+
+    Fisher's method may be extended to combine p-values from dependent tests
+    [8]_. Extensions such as Brown's method and Kost's method are not currently
+    implemented.
+
+    .. versionadded:: 0.15.0
+
+    References
+    ----------
+    .. [1] Kincaid, W. M., "The Combination of Tests Based on Discrete
+           Distributions." Journal of the American Statistical Association 57,
+           no. 297 (1962), 10-19.
+    .. [2] Heard, N. and Rubin-Delanchey, P. "Choosing between methods of
+           combining p-values."  Biometrika 105.1 (2018): 239-246.
+    .. [3] https://en.wikipedia.org/wiki/Fisher%27s_method
+    .. [4] George, E. O., and G. S. Mudholkar. "On the convolution of logistic
+           random variables." Metrika 30.1 (1983): 1-13.
+    .. [5] https://en.wikipedia.org/wiki/Fisher%27s_method#Relation_to_Stouffer.27s_Z-score_method
+    .. [6] Whitlock, M. C. "Combining probability from independent tests: the
+           weighted Z-method is superior to Fisher's approach." Journal of
+           Evolutionary Biology 18, no. 5 (2005): 1368-1373.
+    .. [7] Zaykin, Dmitri V. "Optimally weighted Z-test is a powerful method
+           for combining probabilities in meta-analysis." Journal of
+           Evolutionary Biology 24, no. 8 (2011): 1836-1841.
+    .. [8] https://en.wikipedia.org/wiki/Extensions_of_Fisher%27s_method
+
+    """
+    xp = array_namespace(pvalues, weights)
+    pvalues, weights = xp_promote(pvalues, weights, broadcast=True,
+                                  force_floating=True, xp=xp)
+
+    if xp_size(pvalues) == 0:
+        # This is really only needed for *testing* _axis_nan_policy decorator
+        # It won't happen when the decorator is used.
+        NaN = _get_nan(pvalues)
+        return SignificanceResult(NaN, NaN)
+
+    n = _length_nonmasked(pvalues, axis)
+    n = xp.asarray(n, dtype=pvalues.dtype, device=xp_device(pvalues))
+
+    if method == 'fisher':
+        statistic = -2 * xp.sum(xp.log(pvalues), axis=axis)
+        chi2 = _SimpleChi2(2*n)
+        pval = _get_pvalue(statistic, chi2, alternative='greater',
+                           symmetric=False, xp=xp)
+    elif method == 'pearson':
+        statistic = 2 * xp.sum(xp.log1p(-pvalues), axis=axis)
+        chi2 = _SimpleChi2(2*n)
+        pval = _get_pvalue(-statistic, chi2, alternative='less', symmetric=False, xp=xp)
+    elif method == 'mudholkar_george':
+        normalizing_factor = xp.sqrt(3/n)/xp.pi
+        statistic = (-xp.sum(xp.log(pvalues), axis=axis)
+                     + xp.sum(xp.log1p(-pvalues), axis=axis))
+        nu = 5*n + 4
+        approx_factor = xp.sqrt(nu / (nu - 2))
+        t = _SimpleStudentT(nu)
+        pval = _get_pvalue(statistic * normalizing_factor * approx_factor, t,
+                           alternative="greater", xp=xp)
+    elif method == 'tippett':
+        statistic = xp.min(pvalues, axis=axis)
+        beta = _SimpleBeta(xp.ones_like(n), n)
+        pval = _get_pvalue(statistic, beta, alternative='less', symmetric=False, xp=xp)
+    elif method == 'stouffer':
+        if weights is None:
+            weights = xp.ones_like(pvalues, dtype=pvalues.dtype)
+        pvalues, weights = _share_masks(pvalues, weights, xp=xp)
+
+        norm = _SimpleNormal()
+        Zi = norm.isf(pvalues)
+        # Consider `vecdot` when data-apis/array-api#910 is resolved
+        statistic = (xp.sum(weights * Zi, axis=axis)
+                     / xp_vector_norm(weights, axis=axis))
+        pval = _get_pvalue(statistic, norm, alternative="greater", xp=xp)
+
+    else:
+        raise ValueError(
+            f"Invalid method {method!r}. Valid methods are 'fisher', "
+            "'pearson', 'mudholkar_george', 'tippett', and 'stouffer'"
+        )
+
+    return SignificanceResult(statistic, pval)
+
+
+@dataclass
+class QuantileTestResult:
+    r"""
+    Result of `scipy.stats.quantile_test`.
+
+    Attributes
+    ----------
+    statistic : float
+        The statistic used to calculate the p-value; either ``T1``, the
+        number of observations less than or equal to the hypothesized quantile,
+        or ``T2``, the number of observations strictly less than the
+        hypothesized quantile. Two test statistics are required to handle the
+        possibility that the data was generated from a discrete or mixed
+        distribution.
+
+    statistic_type : int
+        ``1`` or ``2`` depending on which of ``T1`` or ``T2`` was used to
+        calculate the p-value respectively. ``T1`` corresponds to the
+        ``"greater"`` alternative hypothesis and ``T2`` to the ``"less"``.  For
+        the ``"two-sided"`` case, the statistic type that leads to the smallest
+        p-value is used.  For significant tests, ``statistic_type = 1`` means
+        there is evidence that the population quantile is significantly greater
+        than the hypothesized value and ``statistic_type = 2`` means there is
+        evidence that it is significantly less than the hypothesized value.
+
+    pvalue : float
+        The p-value of the hypothesis test.
+    """
+    statistic: float
+    statistic_type: int
+    pvalue: float
+    _alternative: list[str] = field(repr=False)  # compared with plain strings below
+    _x : np.ndarray = field(repr=False)  # sample; sorted in `confidence_interval`
+    _p : float = field(repr=False)  # success probability of the binomial used below
+
+    def confidence_interval(self, confidence_level=0.95):
+        """
+        Compute the confidence interval of the quantile.
+
+        Parameters
+        ----------
+        confidence_level : float, default: 0.95
+            Confidence level of the interval; must lie strictly between 0 and 1,
+            otherwise a ``ValueError`` is raised.
+
+        Returns
+        -------
+        ci : ``ConfidenceInterval`` object
+            The object has attributes ``low`` and ``high`` that hold the
+            lower and upper bounds of the confidence interval.
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> import scipy.stats as stats
+        >>> p = 0.75  # quantile of interest
+        >>> q = 0  # hypothesized value of the quantile
+        >>> x = np.exp(np.arange(0, 1.01, 0.01))
+        >>> res = stats.quantile_test(x, q=q, p=p, alternative='less')
+        >>> lb, ub = res.confidence_interval()
+        >>> lb, ub
+        (-inf, 2.293318740264183)
+        >>> res = stats.quantile_test(x, q=q, p=p, alternative='two-sided')
+        >>> lb, ub = res.confidence_interval(0.9)
+        >>> lb, ub
+        (1.9542373206359396, 2.293318740264183)
+        """
+        # Bounds are elements of the sorted sample, indexed via binom(n, p) quantiles.
+        alternative = self._alternative
+        p = self._p
+        x = np.sort(self._x)
+        n = len(x)
+        bd = stats.binom(n, p)  # uses self._p; `p` is rebound below
+
+        if confidence_level <= 0 or confidence_level >= 1:
+            message = "`confidence_level` must be a number between 0 and 1."
+            raise ValueError(message)
+
+        low_index = np.nan
+        high_index = np.nan
+
+        if alternative == 'less':
+            p = 1 - confidence_level
+            low = -np.inf
+            high_index = int(bd.isf(p))
+            high = x[high_index] if high_index < n else np.nan
+        elif alternative == 'greater':
+            p = 1 - confidence_level
+            low_index = int(bd.ppf(p)) - 1
+            low = x[low_index] if low_index >= 0 else np.nan
+            high = np.inf
+        elif alternative == 'two-sided':
+            p = (1 - confidence_level) / 2
+            low_index = int(bd.ppf(p)) - 1
+            low = x[low_index] if low_index >= 0 else np.nan
+            high_index = int(bd.isf(p))
+            high = x[high_index] if high_index < n else np.nan
+
+        return ConfidenceInterval(low, high)
+
+
+def quantile_test_iv(x, q, p, alternative):
+    # Input validation for `quantile_test`; raises ValueError on invalid input.
+    x = np.atleast_1d(x)
+    message = '`x` must be a one-dimensional array of numbers.'
+    if x.ndim != 1 or not np.issubdtype(x.dtype, np.number):
+        raise ValueError(message)
+
+    q = np.array(q)[()]  # 0-d array -> NumPy scalar; other shapes unchanged
+    message = "`q` must be a scalar."
+    if q.ndim != 0 or not np.issubdtype(q.dtype, np.number):
+        raise ValueError(message)
+
+    p = np.array(p)[()]
+    message = "`p` must be a float strictly between 0 and 1."
+    if p.ndim != 0 or p >= 1 or p <= 0:  # unlike `x`/`q`, dtype is not checked
+        raise ValueError(message)
+
+    alternatives = {'two-sided', 'less', 'greater'}
+    message = f"`alternative` must be one of {alternatives}"
+    if alternative not in alternatives:
+        raise ValueError(message)
+
+    return x, q, p, alternative
+
+
+@xp_capabilities(np_only=True)
+def quantile_test(x, *, q=0, p=0.5, alternative='two-sided'):
+    r"""
+    Perform a quantile test and compute a confidence interval of the quantile.
+
+    This function tests the null hypothesis that `q` is the value of the
+    quantile associated with probability `p` of the population underlying
+    sample `x`. For example, with default parameters, it tests that the
+    median of the population underlying `x` is zero. The function returns an
+    object including the test statistic, a p-value, and a method for computing
+    the confidence interval around the quantile.
+
+    Parameters
+    ----------
+    x : array_like
+        A one-dimensional sample.
+    q : float, default: 0
+        The hypothesized value of the quantile.
+    p : float, default: 0.5
+        The probability associated with the quantile; i.e. the proportion of
+        the population less than `q` is `p`. Must be strictly between 0 and
+        1.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis.
+        The following options are available (default is 'two-sided'):
+
+        * 'two-sided': the quantile associated with the probability `p`
+          is not `q`.
+        * 'less': the quantile associated with the probability `p` is less
+          than `q`.
+        * 'greater': the quantile associated with the probability `p` is
+          greater than `q`.
+
+    Returns
+    -------
+    result : QuantileTestResult
+        An object with the following attributes:
+
+        statistic : float
+            One of two test statistics that may be used in the quantile test.
+            The first test statistic, ``T1``, is the number of samples in
+            `x` that are less than or equal to the hypothesized quantile
+            `q`. The second test statistic, ``T2``, is the number of
+            samples in `x` that are strictly less than the hypothesized
+            quantile `q`.
+
+            When ``alternative = 'greater'``, ``T1`` is used to calculate the
+            p-value and ``statistic`` is set to ``T1``.
+
+            When ``alternative = 'less'``, ``T2`` is used to calculate the
+            p-value and ``statistic`` is set to ``T2``.
+
+            When ``alternative = 'two-sided'``, both ``T1`` and ``T2`` are
+            considered, and the one that leads to the smallest p-value is used.
+
+        statistic_type : int
+            Either `1` or `2` depending on which of ``T1`` or ``T2`` was
+            used to calculate the p-value.
+
+        pvalue : float
+            The p-value associated with the given alternative.
+
+        The object also has the following method:
+
+        confidence_interval(confidence_level=0.95)
+            Computes a confidence interval around the
+            population quantile associated with the probability `p`. The
+            confidence interval is returned in a ``namedtuple`` with
+            fields `low` and `high`.  Values are `nan` when there are
+            not enough observations to compute the confidence interval at
+            the desired confidence.
+
+    Notes
+    -----
+    This test and its method for computing confidence intervals are
+    non-parametric. They are valid if and only if the observations are i.i.d.
+
+    The implementation of the test follows Conover [1]_. Two test statistics
+    are considered.
+
+    ``T1``: The number of observations in `x` less than or equal to `q`.
+
+        ``T1 = (x <= q).sum()``
+
+    ``T2``: The number of observations in `x` strictly less than `q`.
+
+        ``T2 = (x < q).sum()``
+
+    The use of two test statistics is necessary to handle the possibility that
+    `x` was generated from a discrete or mixed distribution.
+
+    The null hypothesis for the test is:
+
+        H0: The :math:`p^{\mathrm{th}}` population quantile is `q`.
+
+    and the null distribution for each test statistic is
+    :math:`\mathrm{binom}\left(n, p\right)`. When ``alternative='less'``,
+    the alternative hypothesis is:
+
+        H1: The :math:`p^{\mathrm{th}}` population quantile is less than `q`.
+
+    and the p-value is the probability that the binomial random variable
+
+    .. math::
+        Y \sim \mathrm{binom}\left(n, p\right)
+
+    is greater than or equal to the observed value ``T2``.
+
+    When ``alternative='greater'``, the alternative hypothesis is:
+
+        H1: The :math:`p^{\mathrm{th}}` population quantile is greater than `q`
+
+    and the p-value is the probability that the binomial random variable Y
+    is less than or equal to the observed value ``T1``.
+
+    When ``alternative='two-sided'``, the alternative hypothesis is
+
+        H1: `q` is not the :math:`p^{\mathrm{th}}` population quantile.
+
+    and the p-value is twice the smaller of the p-values for the ``'less'``
+    and ``'greater'`` cases. Both of these p-values can exceed 0.5 for the same
+    data, so the value is clipped into the interval :math:`[0, 1]`.
+
+    The approach for confidence intervals is attributed to Thompson [2]_ and
+    later proven to be applicable to any set of i.i.d. samples [3]_. The
+    computation is based on the observation that the probability of a quantile
+    :math:`q` to be larger than any observations :math:`x_m (1\leq m \leq N)`
+    can be computed as
+
+    .. math::
+
+        \mathbb{P}(x_m \leq q) = 1 - \sum_{k=0}^{m-1} \binom{N}{k}
+        q^k(1-q)^{N-k}
+
+    By default, confidence intervals are computed for a 95% confidence level.
+    A common interpretation of a 95% confidence interval is that if i.i.d.
+    samples are drawn repeatedly from the same population and confidence
+    intervals are formed each time, the confidence interval will contain the
+    true value of the specified quantile in approximately 95% of trials.
+
+    A similar function is available in the QuantileNPCI R package [4]_. The
+    foundation is the same, but it computes the confidence interval bounds by
+    doing interpolations between the sample values, whereas this function uses
+    only sample values as bounds. Thus, ``quantile_test.confidence_interval``
+    returns more conservative intervals (i.e., larger).
+
+    The same computation of confidence intervals for quantiles is included in
+    the confintr package [5]_.
+
+    Two-sided confidence intervals are not guaranteed to be optimal; i.e.,
+    there may exist a tighter interval that may contain the quantile of
+    interest with probability larger than the confidence level.
+    Without further assumption on the samples (e.g., the nature of the
+    underlying distribution), the one-sided intervals are optimally tight.
+
+    References
+    ----------
+    .. [1] W. J. Conover. Practical Nonparametric Statistics, 3rd Ed. 1999.
+    .. [2] W. R. Thompson, "On Confidence Ranges for the Median and Other
+       Expectation Distributions for Populations of Unknown Distribution
+       Form," The Annals of Mathematical Statistics, vol. 7, no. 3,
+       pp. 122-128, 1936, Accessed: Sep. 18, 2019. [Online]. Available:
+       https://www.jstor.org/stable/2957563.
+    .. [3] H. A. David and H. N. Nagaraja, "Order Statistics in Nonparametric
+       Inference" in Order Statistics, John Wiley & Sons, Ltd, 2005, pp.
+       159-170. Available:
+       https://onlinelibrary.wiley.com/doi/10.1002/0471722162.ch7.
+    .. [4] N. Hutson, A. Hutson, L. Yan, "QuantileNPCI: Nonparametric
+       Confidence Intervals for Quantiles," R package,
+       https://cran.r-project.org/package=QuantileNPCI
+    .. [5] M. Mayer, "confintr: Confidence Intervals," R package,
+       https://cran.r-project.org/package=confintr
+
+
+    Examples
+    --------
+
+    Suppose we wish to test the null hypothesis that the median of a population
+    is equal to 0.5. We choose a confidence level of 99%; that is, we will
+    reject the null hypothesis in favor of the alternative if the p-value is
+    less than 0.01.
+
+    When testing random variates from the standard uniform distribution, which
+    has a median of 0.5, we expect the data to be consistent with the null
+    hypothesis most of the time.
+
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng(6981396440634228121)
+    >>> rvs = stats.uniform.rvs(size=100, random_state=rng)
+    >>> stats.quantile_test(rvs, q=0.5, p=0.5)
+    QuantileTestResult(statistic=45, statistic_type=1, pvalue=0.36820161732669576)
+
+    As expected, the p-value is not below our threshold of 0.01, so
+    we cannot reject the null hypothesis.
+
+    When testing data from the standard *normal* distribution, which has a
+    median of 0, we would expect the null hypothesis to be rejected.
+
+    >>> rvs = stats.norm.rvs(size=100, random_state=rng)
+    >>> stats.quantile_test(rvs, q=0.5, p=0.5)
+    QuantileTestResult(statistic=67, statistic_type=2, pvalue=0.0008737198369123724)
+
+    Indeed, the p-value is lower than our threshold of 0.01, so we reject the
+    null hypothesis in favor of the default "two-sided" alternative: the median
+    of the population is *not* equal to 0.5.
+
+    However, suppose we were to test the null hypothesis against the
+    one-sided alternative that the median of the population is *greater* than
+    0.5. Since the median of the standard normal is less than 0.5, we would not
+    expect the null hypothesis to be rejected.
+
+    >>> stats.quantile_test(rvs, q=0.5, p=0.5, alternative='greater')
+    QuantileTestResult(statistic=67, statistic_type=1, pvalue=0.9997956114162866)
+
+    Unsurprisingly, with a p-value greater than our threshold, we would not
+    reject the null hypothesis in favor of the chosen alternative.
+
+    The quantile test can be used for any quantile, not only the median. For
+    example, we can test whether the third quartile of the distribution
+    underlying the sample is greater than 0.6.
+
+    >>> rvs = stats.uniform.rvs(size=100, random_state=rng)
+    >>> stats.quantile_test(rvs, q=0.6, p=0.75, alternative='greater')
+    QuantileTestResult(statistic=64, statistic_type=1, pvalue=0.00940696592998271)
+
+    The p-value is lower than the threshold. We reject the null hypothesis in
+    favor of the alternative: the third quartile of the distribution underlying
+    our sample is greater than 0.6.
+
+    `quantile_test` can also compute confidence intervals for any quantile.
+
+    >>> rvs = stats.norm.rvs(size=100, random_state=rng)
+    >>> res = stats.quantile_test(rvs, q=0.6, p=0.75)
+    >>> ci = res.confidence_interval(confidence_level=0.95)
+    >>> ci
+    ConfidenceInterval(low=0.284491604437432, high=0.8912531024914844)
+
+    When testing a one-sided alternative, the confidence interval contains
+    all observations such that if passed as `q`, the p-value of the
+    test would be greater than 0.05, and therefore the null hypothesis
+    would not be rejected. For example:
+
+    >>> rvs.sort()
+    >>> q, p, alpha = 0.6, 0.75, 0.95
+    >>> res = stats.quantile_test(rvs, q=q, p=p, alternative='less')
+    >>> ci = res.confidence_interval(confidence_level=alpha)
+    >>> for x in rvs[rvs <= ci.high]:
+    ...     res = stats.quantile_test(rvs, q=x, p=p, alternative='less')
+    ...     assert res.pvalue > 1-alpha
+    >>> for x in rvs[rvs > ci.high]:
+    ...     res = stats.quantile_test(rvs, q=x, p=p, alternative='less')
+    ...     assert res.pvalue < 1-alpha
+
+    Also, if a 95% confidence interval is repeatedly generated for random
+    samples, the confidence interval will contain the true quantile value in
+    approximately 95% of replications.
+
+    >>> dist = stats.rayleigh() # our "unknown" distribution
+    >>> p = 0.2
+    >>> true_stat = dist.ppf(p) # the true value of the statistic
+    >>> n_trials = 1000
+    >>> quantile_ci_contains_true_stat = 0
+    >>> for i in range(n_trials):
+    ...     data = dist.rvs(size=100, random_state=rng)
+    ...     res = stats.quantile_test(data, p=p)
+    ...     ci = res.confidence_interval(0.95)
+    ...     if ci[0] < true_stat < ci[1]:
+    ...         quantile_ci_contains_true_stat += 1
+    >>> quantile_ci_contains_true_stat >= 950
+    True
+
+    This works with any distribution and any quantile, as long as the samples
+    are i.i.d.
+    """
+    # Implementation carefully follows [1] 3.2
+    # "H0: the p*th quantile of X is x*"
+    # To facilitate comparison with [1], we'll use variable names that
+    # best match Conover's notation
+    X, x_star, p_star, H1 = quantile_test_iv(x, q, p, alternative)
+
+    # "We will use two test statistics in this test. Let T1 equal "
+    # "the number of observations less than or equal to x*, and "
+    # "let T2 equal the number of observations less than x*."
+    T1 = np.count_nonzero(X <= x_star)
+    T2 = np.count_nonzero(X < x_star)
+
+    # "The null distribution of the test statistics T1 and T2 is "
+    # "the binomial distribution, with parameters n = sample size, and "
+    # "p = p* as given in the null hypothesis.... Y has the binomial "
+    # "distribution with parameters n and p*."
+    n = len(X)
+    Y = stats.binom(n=n, p=p_star)
+
+    # "H1: the p* population quantile is less than x*"
+    if H1 == 'less':
+        # "The p-value is the probability that a binomial random variable Y "
+        # "is greater than *or equal to* the observed value of T2...using p=p*"
+        pvalue = Y.sf(T2-1)  # Y.pmf(T2) + Y.sf(T2)
+        statistic = T2
+        statistic_type = 2
+    # "H1: the p* population quantile is greater than x*"
+    elif H1 == 'greater':
+        # "The p-value is the probability that a binomial random variable Y "
+        # "is less than or equal to the observed value of T1... using p = p*"
+        pvalue = Y.cdf(T1)
+        statistic = T1
+        statistic_type = 1
+    # "H1: x* is not the p*th population quantile"
+    elif H1 == 'two-sided':
+        # "The p-value is twice the smaller of the probabilities that a
+        # binomial random variable Y is less than or equal to the observed
+        # value of T1 or greater than or equal to the observed value of T2
+        # using p=p*."
+        # Note: both one-sided p-values can exceed 0.5 for the same data, so
+        # `clip`
+        pvalues = [Y.cdf(T1), Y.sf(T2 - 1)]  # [greater, less]
+        sorted_idx = np.argsort(pvalues)
+        pvalue = np.clip(2*pvalues[sorted_idx[0]], 0, 1)
+        if sorted_idx[0]:
+            statistic, statistic_type = T2, 2
+        else:
+            statistic, statistic_type = T1, 1
+
+    return QuantileTestResult(
+        statistic=statistic,
+        statistic_type=statistic_type,
+        pvalue=pvalue,
+        _alternative=H1,
+        _x=X,
+        _p=p_star
+    )
+
+
+#####################################
+#       STATISTICAL DISTANCES       #
+#####################################
+
+
+@xp_capabilities(np_only=True)
+def wasserstein_distance_nd(u_values, v_values, u_weights=None, v_weights=None):
+    r"""
+    Compute the Wasserstein-1 distance between two N-D discrete distributions.
+
+    The Wasserstein distance, also called the Earth mover's distance or the
+    optimal transport distance, is a similarity metric between two probability
+    distributions [1]_. In the discrete case, the Wasserstein distance can be
+    understood as the cost of an optimal transport plan to convert one
+    distribution into the other. The cost is calculated as the product of the
+    amount of probability mass being moved and the distance it is being moved.
+    A brief and intuitive introduction can be found at [2]_.
+
+    .. versionadded:: 1.13.0
+
+    Parameters
+    ----------
+    u_values : 2d array_like
+        A sample from a probability distribution or the support (set of all
+        possible values) of a probability distribution. Each element along
+        axis 0 is an observation or possible value, and axis 1 represents the
+        dimensionality of the distribution; i.e., each row is a vector
+        observation or possible value.
+
+    v_values : 2d array_like
+        A sample from or the support of a second distribution.
+
+    u_weights, v_weights : 1d array_like, optional
+        Weights or counts corresponding with the sample or probability masses
+        corresponding with the support values. Sum of elements must be positive
+        and finite. If unspecified, each value is assigned the same weight.
+
+    Returns
+    -------
+    distance : float
+        The computed distance between the distributions.
+
+    Notes
+    -----
+    Given two probability mass functions, :math:`u`
+    and :math:`v`, the first Wasserstein distance between the distributions
+    using the Euclidean norm is:
+
+    .. math::
+
+        l_1 (u, v) = \inf_{\pi \in \Gamma (u, v)} \int \| x-y \|_2 \mathrm{d} \pi (x, y)
+
+    where :math:`\Gamma (u, v)` is the set of (probability) distributions on
+    :math:`\mathbb{R}^n \times \mathbb{R}^n` whose marginals are :math:`u` and
+    :math:`v` on the first and second factors respectively. For a given value
+    :math:`x`, :math:`u(x)` gives the probability of :math:`u` at position
+    :math:`x`, and the same for :math:`v(x)`.
+
+    This is also called the optimal transport problem or the Monge problem.
+    Let the finite point sets :math:`\{x_i\}` and :math:`\{y_j\}` denote
+    the support set of probability mass function :math:`u` and :math:`v`
+    respectively. The Monge problem can be expressed as follows,
+
+    Let :math:`\Gamma` denote the transport plan, :math:`D` denote the
+    distance matrix and,
+
+    .. math::
+
+        x = \text{vec}(\Gamma)          \\
+        c = \text{vec}(D)               \\
+        b = \begin{bmatrix}
+                u\\
+                v\\
+            \end{bmatrix}
+
+    The :math:`\text{vec}()` function denotes the Vectorization function
+    that transforms a matrix into a column vector by vertically stacking
+    the columns of the matrix.
+    The transport plan :math:`\Gamma` is a matrix :math:`[\gamma_{ij}]` in
+    which :math:`\gamma_{ij}` is a positive value representing the amount of
+    probability mass transported from :math:`u(x_i)` to :math:`v(y_j)`.
+    Summing over the rows of :math:`\Gamma` should give the source distribution
+    :math:`u` : :math:`\sum_j \gamma_{ij} = u(x_i)` holds for all :math:`i`
+    and summing over the columns of :math:`\Gamma` should give the target
+    distribution :math:`v`: :math:`\sum_i \gamma_{ij} = v(y_j)` holds for all
+    :math:`j`.
+    The distance matrix :math:`D` is a matrix :math:`[d_{ij}]`, in which
+    :math:`d_{ij} = d(x_i, y_j)`.
+
+    Given :math:`\Gamma`, :math:`D`, :math:`b`, the Monge problem can be
+    transformed into a linear programming problem by
+    taking :math:`A x = b` as constraints and :math:`z = c^T x` as minimization
+    target (sum of costs), where matrix :math:`A` has the form
+
+    .. math::
+
+        \begin{array} {rrrr|rrrr|r|rrrr}
+            1 & 1 & \dots & 1 & 0 & 0 & \dots & 0 & \dots & 0 & 0 & \dots &
+                0 \cr
+            0 & 0 & \dots & 0 & 1 & 1 & \dots & 1 & \dots & 0 & 0 &\dots &
+                0 \cr
+            \vdots & \vdots & \ddots & \vdots & \vdots & \vdots & \ddots
+                & \vdots & \vdots & \vdots & \vdots & \ddots & \vdots  \cr
+            0 & 0 & \dots & 0 & 0 & 0 & \dots & 0 & \dots & 1 & 1 & \dots &
+                1 \cr \hline
+
+            1 & 0 & \dots & 0 & 1 & 0 & \dots & \dots & \dots & 1 & 0 & \dots &
+                0 \cr
+            0 & 1 & \dots & 0 & 0 & 1 & \dots & \dots & \dots & 0 & 1 & \dots &
+                0 \cr
+            \vdots & \vdots & \ddots & \vdots & \vdots & \vdots & \ddots &
+                \vdots & \vdots & \vdots & \vdots & \ddots & \vdots \cr
+            0 & 0 & \dots & 1 & 0 & 0 & \dots & 1 & \dots & 0 & 0 & \dots & 1
+        \end{array}
+
+    By solving the dual form of the above linear programming problem (with
+    solution :math:`y^*`), the Wasserstein distance :math:`l_1 (u, v)` can
+    be computed as :math:`b^T y^*`.
+
+    The above solution is inspired by Vincent Herrmann's blog [3]_ . For a
+    more thorough explanation, see [4]_ .
+
+    The input distributions can be empirical, therefore coming from samples
+    whose values are effectively inputs of the function, or they can be seen as
+    generalized functions, in which case they are weighted sums of Dirac delta
+    functions located at the specified values.
+
+    References
+    ----------
+    .. [1] "Wasserstein metric",
+           https://en.wikipedia.org/wiki/Wasserstein_metric
+    .. [2] Lili Weng, "What is Wasserstein distance?", Lil'log,
+           https://lilianweng.github.io/posts/2017-08-20-gan/#what-is-wasserstein-distance.
+    .. [3] Herrmann, Vincent. "Wasserstein GAN and the Kantorovich-Rubinstein
+           Duality". https://vincentherrmann.github.io/blog/wasserstein/.
+    .. [4] Peyré, Gabriel, and Marco Cuturi. "Computational optimal
+           transport." Center for Research in Economics and Statistics
+           Working Papers 2017-86 (2017).
+
+    See Also
+    --------
+    wasserstein_distance: Compute the Wasserstein-1 distance between two
+        1D discrete distributions.
+
+    Examples
+    --------
+    Compute the Wasserstein distance between two three-dimensional samples,
+    each with two observations.
+
+    >>> from scipy.stats import wasserstein_distance_nd
+    >>> wasserstein_distance_nd([[0, 2, 3], [1, 2, 5]], [[3, 2, 3], [4, 2, 5]])
+    3.0
+
+    Compute the Wasserstein distance between two two-dimensional distributions
+    with three and two weighted observations, respectively.
+
+    >>> wasserstein_distance_nd([[0, 2.75], [2, 209.3], [0, 0]],
+    ...                      [[0.2, 0.322], [4.5, 25.1808]],
+    ...                      [0.4, 5.2, 0.114], [0.8, 1.5])
+    174.15840245217169
+    """
+    m, n = len(u_values), len(v_values)
+    u_values = asarray(u_values)
+    v_values = asarray(v_values)
+
+    if u_values.ndim > 2 or v_values.ndim > 2:
+        raise ValueError('Invalid input values. The inputs must have either '
+                         'one or two dimensions.')
+    # if dimensions are not equal throw error
+    if u_values.ndim != v_values.ndim:
+        raise ValueError('Invalid input values. Dimensions of inputs must be '
+                         'equal.')
+    # if data is 1D then call the cdf_distance function
+    if u_values.ndim == 1 and v_values.ndim == 1:
+        return _cdf_distance(1, u_values, v_values, u_weights, v_weights)
+
+    u_values, u_weights = _validate_distribution(u_values, u_weights)
+    v_values, v_weights = _validate_distribution(v_values, v_weights)
+    # if number of columns is not equal throw error
+    if u_values.shape[1] != v_values.shape[1]:
+        raise ValueError('Invalid input values. If two-dimensional, '
+                         '`u_values` and `v_values` must have the same '
+                         'number of columns.')
+
+    # if data contains np.inf then return inf or nan
+    if np.any(np.isinf(u_values)) ^ np.any(np.isinf(v_values)):
+        return np.inf
+    elif np.any(np.isinf(u_values)) and np.any(np.isinf(v_values)):
+        return np.nan
+
+    # create constraints
+    A_upper_part = sparse.block_diag((np.ones((1, n)), ) * m)
+    A_lower_part = sparse.hstack((sparse.eye(n), ) * m)
+    # sparse constraint matrix of size (m + n)*(m * n)
+    A = sparse.vstack((A_upper_part, A_lower_part))
+    A = sparse.coo_array(A)
+
+    # get cost matrix
+    D = distance_matrix(u_values, v_values, p=2)
+    cost = D.ravel()
+
+    # stack both normalized marginals into b, the dual objective vector
+    p_u = np.full(m, 1/m) if u_weights is None else u_weights/np.sum(u_weights)
+    p_v = np.full(n, 1/n) if v_weights is None else v_weights/np.sum(v_weights)
+    b = np.concatenate((p_u, p_v), axis=0)
+
+    # solve the dual LP: maximize b^T y subject to A^T y <= cost
+    constraints = LinearConstraint(A=A.T, ub=cost)
+    opt_res = milp(c=-b, constraints=constraints, bounds=(-np.inf, np.inf))
+    return -opt_res.fun
+
+
+@xp_capabilities(np_only=True)
+def wasserstein_distance(u_values, v_values, u_weights=None, v_weights=None):
+    r"""
+    Compute the Wasserstein-1 distance between two 1D discrete distributions.
+
+    The Wasserstein distance, also called the Earth mover's distance or the
+    optimal transport distance, is a similarity metric between two probability
+    distributions [1]_. In the discrete case, the Wasserstein distance can be
+    understood as the cost of an optimal transport plan to convert one
+    distribution into the other. The cost is calculated as the product of the
+    amount of probability mass being moved and the distance it is being moved.
+    A brief and intuitive introduction can be found at [2]_.
+
+    .. versionadded:: 1.0.0
+
+    Parameters
+    ----------
+    u_values : 1d array_like
+        A sample from a probability distribution or the support (set of all
+        possible values) of a probability distribution. Each element is an
+        observation or possible value.
+
+    v_values : 1d array_like
+        A sample from or the support of a second distribution.
+
+    u_weights, v_weights : 1d array_like, optional
+        Weights or counts corresponding with the sample or probability masses
+        corresponding with the support values. Sum of elements must be positive
+        and finite. If unspecified, each value is assigned the same weight.
+
+    Returns
+    -------
+    distance : float
+        The computed distance between the distributions.
+
+    Notes
+    -----
+    Given two 1D probability mass functions, :math:`u` and :math:`v`, the first
+    Wasserstein distance between the distributions is:
+
+    .. math::
+
+        l_1 (u, v) = \inf_{\pi \in \Gamma (u, v)} \int_{\mathbb{R} \times
+        \mathbb{R}} |x-y| \mathrm{d} \pi (x, y)
+
+    where :math:`\Gamma (u, v)` is the set of (probability) distributions on
+    :math:`\mathbb{R} \times \mathbb{R}` whose marginals are :math:`u` and
+    :math:`v` on the first and second factors respectively. For a given value
+    :math:`x`, :math:`u(x)` gives the probability of :math:`u` at position
+    :math:`x`, and the same for :math:`v(x)`.
+
+    If :math:`U` and :math:`V` are the respective CDFs of :math:`u` and
+    :math:`v`, this distance also equals:
+
+    .. math::
+
+        l_1(u, v) = \int_{-\infty}^{+\infty} |U-V|
+
+    See [3]_ for a proof of the equivalence of both definitions.
+
+    The input distributions can be empirical, therefore coming from samples
+    whose values are effectively inputs of the function, or they can be seen as
+    generalized functions, in which case they are weighted sums of Dirac delta
+    functions located at the specified values.
+
+    References
+    ----------
+    .. [1] "Wasserstein metric", https://en.wikipedia.org/wiki/Wasserstein_metric
+    .. [2] Lili Weng, "What is Wasserstein distance?", Lil'log,
+           https://lilianweng.github.io/posts/2017-08-20-gan/#what-is-wasserstein-distance.
+    .. [3] Ramdas, Garcia, Cuturi "On Wasserstein Two Sample Testing and Related
+           Families of Nonparametric Tests" (2015). :arXiv:`1509.02237`.
+
+    See Also
+    --------
+    wasserstein_distance_nd: Compute the Wasserstein-1 distance between two N-D
+        discrete distributions.
+
+    Examples
+    --------
+    >>> from scipy.stats import wasserstein_distance
+    >>> wasserstein_distance([0, 1, 3], [5, 6, 8])
+    5.0
+    >>> wasserstein_distance([0, 1], [0, 1], [3, 1], [2, 2])
+    0.25
+    >>> wasserstein_distance([3.4, 3.9, 7.5, 7.8], [4.5, 1.4],
+    ...                      [1.4, 0.9, 3.1, 7.2], [3.2, 3.5])
+    4.0781331438047861
+
+    """
+    return _cdf_distance(1, u_values, v_values, u_weights, v_weights)
+
+
+@xp_capabilities(np_only=True)
+def energy_distance(u_values, v_values, u_weights=None, v_weights=None):
+    r"""Compute the energy distance between two 1D distributions.
+
+    .. versionadded:: 1.0.0
+
+    Parameters
+    ----------
+    u_values, v_values : array_like
+        Values observed in the (empirical) distribution.
+    u_weights, v_weights : array_like, optional
+        Weight for each value. If unspecified, each value is assigned the same
+        weight.
+        `u_weights` (resp. `v_weights`) must have the same length as
+        `u_values` (resp. `v_values`). If the weight sum differs from 1, it
+        must still be positive and finite so that the weights can be normalized
+        to sum to 1.
+
+    Returns
+    -------
+    distance : float
+        The computed distance between the distributions.
+
+    Notes
+    -----
+    The energy distance between two distributions :math:`u` and :math:`v`, whose
+    respective CDFs are :math:`U` and :math:`V`, equals:
+
+    .. math::
+
+        D(u, v) = \left( 2\mathbb E|X - Y| - \mathbb E|X - X'| -
+        \mathbb E|Y - Y'| \right)^{1/2}
+
+    where :math:`X` and :math:`X'` (resp. :math:`Y` and :math:`Y'`) are
+    independent random variables whose probability distribution is :math:`u`
+    (resp. :math:`v`).
+
+    Sometimes the square of this quantity is referred to as the "energy
+    distance" (e.g. in [2]_, [4]_), but as noted in [1]_ and [3]_, only the
+    definition above satisfies the axioms of a distance function (metric).
+
+    As shown in [2]_, for one-dimensional real-valued variables, the energy
+    distance is linked to the non-distribution-free version of the Cramér-von
+    Mises distance:
+
+    .. math::
+
+        D(u, v) = \sqrt{2} l_2(u, v) = \left( 2 \int_{-\infty}^{+\infty} (U-V)^2
+        \right)^{1/2}
+
+    Note that the common Cramér-von Mises criterion uses the distribution-free
+    version of the distance. See [2]_ (section 2), for more details about both
+    versions of the distance.
+
+    The input distributions can be empirical, therefore coming from samples
+    whose values are effectively inputs of the function, or they can be seen as
+    generalized functions, in which case they are weighted sums of Dirac delta
+    functions located at the specified values.
+
+    References
+    ----------
+    .. [1] Rizzo, Szekely "Energy distance." Wiley Interdisciplinary Reviews:
+           Computational Statistics, 8(1):27-38 (2015).
+    .. [2] Szekely "E-statistics: The energy of statistical samples." Bowling
+           Green State University, Department of Mathematics and Statistics,
+           Technical Report 02-16 (2002).
+    .. [3] "Energy distance", https://en.wikipedia.org/wiki/Energy_distance
+    .. [4] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer,
+           Munos "The Cramer Distance as a Solution to Biased Wasserstein
+           Gradients" (2017). :arXiv:`1705.10743`.
+
+    Examples
+    --------
+    >>> from scipy.stats import energy_distance
+    >>> energy_distance([0], [2])
+    2.0000000000000004
+    >>> energy_distance([0, 8], [0, 8], [3, 1], [2, 2])
+    1.0000000000000002
+    >>> energy_distance([0.7, 7.4, 2.4, 6.8], [1.4, 8. ],
+    ...                 [2.1, 4.2, 7.4, 8. ], [7.6, 8.8])
+    0.88003340976158217
+
+    """
+    return np.sqrt(2) * _cdf_distance(2, u_values, v_values,
+                                      u_weights, v_weights)
+
+
+def _cdf_distance(p, u_values, v_values, u_weights=None, v_weights=None):
+    r"""
+    Compute the :math:`l_p` distance between the CDFs of two distributions.
+
+    For two one-dimensional distributions :math:`u` and :math:`v`, whose
+    respective CDFs are :math:`U` and :math:`V`, the distance is defined as:
+
+    .. math::
+
+        l_p(u, v) = \left( \int_{-\infty}^{+\infty} |U-V|^p \right)^{1/p}
+
+    p is a positive parameter; p = 1 gives the Wasserstein distance, p = 2
+    gives the energy distance up to a constant factor.
+
+    Parameters
+    ----------
+    p : int or float
+        Order of the distance. Must be positive; 1 and 2 take dedicated
+        fast paths.
+    u_values, v_values : array_like
+        Values observed in the (empirical) distribution.
+    u_weights, v_weights : array_like, optional
+        Weight for each value. If unspecified, each value is assigned the same
+        weight.
+        `u_weights` (resp. `v_weights`) must have the same length as
+        `u_values` (resp. `v_values`). If the weight sum differs from 1, it
+        must still be positive and finite so that the weights can be normalized
+        to sum to 1.
+
+    Returns
+    -------
+    distance : float
+        The computed distance between the distributions.
+
+    Raises
+    ------
+    ValueError
+        If either distribution is empty or has invalid weights.
+
+    Notes
+    -----
+    The input distributions can be empirical, therefore coming from samples
+    whose values are effectively inputs of the function, or they can be seen as
+    generalized functions, in which case they are weighted sums of Dirac delta
+    functions located at the specified values.
+
+    References
+    ----------
+    .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer,
+           Munos "The Cramer Distance as a Solution to Biased Wasserstein
+           Gradients" (2017). :arXiv:`1705.10743`.
+
+    """
+    # Coerce inputs to float arrays and check the weights.
+    u_values, u_weights = _validate_distribution(u_values, u_weights)
+    v_values, v_weights = _validate_distribution(v_values, v_weights)
+
+    # Pool both samples; the CDFs only change at pooled values, so the
+    # integral reduces to a sum over the gaps between successive values.
+    pooled = np.concatenate((u_values, v_values))
+    pooled.sort(kind='mergesort')
+    widths = np.diff(pooled)
+    left_edges = pooled[:-1]
+
+    def step_cdf(values, weights):
+        # Empirical CDF at each left edge: the fraction (or weight share)
+        # of `values` less than or equal to that edge.
+        order = np.argsort(values)
+        counts = values[order].searchsorted(left_edges, 'right')
+        if weights is None:
+            return counts / values.size
+        cumulative = np.concatenate(([0], np.cumsum(weights[order])))
+        return cumulative[counts] / cumulative[-1]
+
+    # Difference U - V on each interval between pooled values.
+    gap = step_cdf(u_values, u_weights) - step_cdf(v_values, v_weights)
+
+    # If p = 1 or p = 2, avoid np.power, which introduces an overhead of
+    # about 15%.
+    if p == 1:
+        return np_vecdot(np.abs(gap), widths)
+    if p == 2:
+        return np.sqrt(np_vecdot(np.square(gap), widths))
+    return np.power(np_vecdot(np.power(np.abs(gap), p), widths), 1/p)
+
+
+def _validate_distribution(values, weights):
+    """
+    Validate one distribution's values and weights; return them as ndarrays.
+
+    Parameters
+    ----------
+    values : array_like
+        Values observed in the (empirical) distribution.
+    weights : array_like or None
+        Weight for each value; None means no weights were supplied.
+
+    Returns
+    -------
+    values : ndarray
+        Values as a float ndarray.
+    weights : ndarray or None
+        Weights as a float ndarray, or None if `weights` was None.
+
+    Raises
+    ------
+    ValueError
+        If `values` is empty, or if `weights` has a different length, has a
+        negative entry, or does not have a positive, finite sum.
+    """
+    values = np.asarray(values, dtype=float)
+    if len(values) == 0:
+        raise ValueError("Distribution can't be empty.")
+    if weights is None:
+        return values, None
+
+    weights = np.asarray(weights, dtype=float)
+    if len(weights) != len(values):
+        raise ValueError('Value and weight array-likes for the same '
+                         'empirical distribution must be of the same size.')
+    if np.any(weights < 0):
+        raise ValueError('All weights must be non-negative.')
+    if not (0 < np.sum(weights) < np.inf):
+        raise ValueError('Weight array-like sum must be positive and '
+                         'finite. Set as None for an equal distribution of '
+                         'weight.')
+    return values, weights
+
+
+@xp_capabilities(skip_backends=[("cupy", "`repeat` can't handle array second arg"),
+                                ("dask.array", "no `take_along_axis`")],
+                 jax_jit=False)
+def rankdata(a, method='average', *, axis=None, nan_policy='propagate'):
+    """Assign ranks to data, dealing with ties appropriately.
+
+    By default (``axis=None``), the data array is first flattened, and a flat
+    array of ranks is returned. Separately reshape the rank array to the
+    shape of the data array if desired (see Examples).
+
+    Ranks begin at 1.  The `method` argument controls how ranks are assigned
+    to equal values.  See [1]_ for further discussion of ranking methods.
+
+    Parameters
+    ----------
+    a : array_like
+        The array of values to be ranked.
+    method : {'average', 'min', 'max', 'dense', 'ordinal'}, optional
+        The method used to assign ranks to tied elements.
+        The following methods are available (default is 'average'):
+
+          * 'average': The average of the ranks that would have been assigned to
+            all the tied values is assigned to each value.
+          * 'min': The minimum of the ranks that would have been assigned to all
+            the tied values is assigned to each value.  (This is also
+            referred to as "competition" ranking.)
+          * 'max': The maximum of the ranks that would have been assigned to all
+            the tied values is assigned to each value.
+          * 'dense': Like 'min', but the rank of the next highest element is
+            assigned the rank immediately after those assigned to the tied
+            elements.
+          * 'ordinal': All values are given a distinct rank, corresponding to
+            the order that the values occur in `a`.
+    axis : {None, int}, optional
+        Axis along which to perform the ranking. If ``None``, the data array
+        is first flattened.
+    nan_policy : {'propagate', 'omit', 'raise'}, optional
+        Defines how to handle when input contains nan.
+        The following options are available (default is 'propagate'):
+
+          * 'propagate': propagates nans through the rank calculation
+          * 'omit': performs the calculations ignoring nan values
+          * 'raise': raises an error
+
+        .. note::
+
+            When `nan_policy` is 'propagate', the output is an array of *all*
+            nans because ranks relative to nans in the input are undefined.
+            When `nan_policy` is 'omit', nans in `a` are ignored when ranking
+            the other values, and the corresponding locations of the output
+            are nan.
+
+        .. versionadded:: 1.10
+
+    Returns
+    -------
+    ranks : ndarray
+         An array of size equal to the size of `a`, containing rank
+         scores.
+
+    References
+    ----------
+    .. [1] "Ranking", https://en.wikipedia.org/wiki/Ranking
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import rankdata
+    >>> rankdata([0, 2, 3, 2])
+    array([ 1. ,  2.5,  4. ,  2.5])
+    >>> rankdata([0, 2, 3, 2], method='min')
+    array([ 1,  2,  4,  2])
+    >>> rankdata([0, 2, 3, 2], method='max')
+    array([ 1,  3,  4,  3])
+    >>> rankdata([0, 2, 3, 2], method='dense')
+    array([ 1,  2,  3,  2])
+    >>> rankdata([0, 2, 3, 2], method='ordinal')
+    array([ 1,  2,  4,  3])
+    >>> rankdata([[0, 2], [3, 2]]).reshape(2,2)
+    array([[1. , 2.5],
+           [4. , 2.5]])
+    >>> rankdata([[0, 2, 2], [3, 2, 5]], axis=1)
+    array([[1. , 2.5, 2.5],
+           [2. , 1. , 3. ]])
+    >>> rankdata([0, 2, 3, np.nan, -2, np.nan], nan_policy="propagate")
+    array([nan, nan, nan, nan, nan, nan])
+    >>> rankdata([0, 2, 3, np.nan, -2, np.nan], nan_policy="omit")
+    array([ 2.,  3.,  4., nan,  1., nan])
+
+    """
+    methods = ('average', 'min', 'max', 'dense', 'ordinal')
+    if method not in methods:
+        raise ValueError(f'unknown method "{method}"')
+
+    xp = array_namespace(a)
+    x = xp.asarray(a)
+
+    if axis is None:
+        x = xp_ravel(x)
+        axis = -1
+
+    if xp_size(x) == 0:
+        dtype = xp.asarray(1.).dtype if method == 'average' else xp.asarray(1).dtype
+        return xp.empty(x.shape, dtype=dtype)
+
+    contains_nan = _contains_nan(x, nan_policy)
+
+    x = xp_swapaxes(x, axis, -1, xp=xp)
+    ranks = _rankdata(x, method, xp=xp)
+
+    if contains_nan:
+        default_float = xp_default_dtype(xp)
+        i_nan = (xp.isnan(x) if nan_policy == 'omit'
+                 else xp.any(xp.isnan(x), axis=-1))
+        ranks = xp.asarray(ranks, dtype=default_float)  # copy=False when implemented
+        ranks[i_nan] = xp.nan
+
+    ranks = xp_swapaxes(ranks, axis, -1, xp=xp)
+    return ranks
+
+
+def _order_ranks(ranks, j, *, xp):
+    # Scatter sorted-order `ranks` back to input order using sort indices `j`
+    if xp is None:
+        xp = array_namespace(ranks)
+    if not (is_numpy(xp) or is_cupy(xp)):
+        # `put_along_axis` not in array API (data-apis/array-api#177), so
+        # argsort the argsort indices and gather with `take_along_axis`.
+        inverse = xp.argsort(j, axis=-1, stable=True)
+        return xp.take_along_axis(ranks, inverse, axis=-1)
+    out = xp.empty(j.shape, dtype=ranks.dtype)
+    xp.put_along_axis(out, j, ranks, axis=-1)
+    return out
+
+
+def _rankdata(x, method, return_ties=False, xp=None):
+    # Rank data `x` by desired `method`; `return_ties` if desired
+    xp = array_namespace(x) if xp is None else xp
+    shape = x.shape
+    dtype = xp.asarray(1.).dtype if method == 'average' else xp.asarray(1).dtype
+
+    # Get sort order
+    j = xp.argsort(x, axis=-1, stable=True)
+    ordinal_ranks = xp.broadcast_to(xp.arange(1, shape[-1]+1, dtype=dtype), shape)
+
+    # Ordinal ranks is very easy because ties don't matter. We're done.
+    if method == 'ordinal':
+        return _order_ranks(ordinal_ranks, j, xp=xp)  # never return ties
+
+    # Sort array
+    y = xp.take_along_axis(x, j, axis=-1)
+    # Logical indices of unique elements
+    i = xp.concat([xp.ones(shape[:-1] + (1,), dtype=xp.bool),
+                   y[..., :-1] != y[..., 1:]], axis=-1)
+
+    # Integer indices of unique elements
+    indices = xp.arange(xp_size(y))[xp.reshape(i, (-1,))]  # i gets raveled
+    # Counts of unique elements
+    counts = xp.diff(indices, append=xp.asarray([xp_size(y)], dtype=indices.dtype))
+
+    # Compute `'min'`, `'max'`, and `'mid'` ranks of unique elements
+    if method == 'min':
+        ranks = ordinal_ranks[i]
+    elif method == 'max':
+        ranks = ordinal_ranks[i] + counts - 1
+    elif method == 'average':
+        # array API doesn't promote integers to floats
+        ranks = ordinal_ranks[i] + (xp.asarray(counts, dtype=dtype) - 1)/2
+    elif method == 'dense':
+        ranks = xp.cumulative_sum(xp.astype(i, dtype, copy=False), axis=-1)[i]
+
+    ranks = xp.reshape(xp.repeat(ranks, counts), shape)
+    ranks = _order_ranks(ranks, j, xp=xp)
+
+    if return_ties:
+        # Tie information is returned in a format that is useful to functions that
+        # rely on this (private) function. Example:
+        # >>> x = np.asarray([3, 2, 1, 2, 2, 2, 1])
+        # >>> _, t = _rankdata(x, 'average', return_ties=True)
+        # >>> t  # array([2., 0., 4., 0., 0., 0., 1.])  # two 1s, four 2s, and one 3
+        # Unlike ranks, tie counts are *not* reordered to correspond with the order of
+        # the input; e.g. the number of appearances of the lowest rank element comes
+        # first. This is a useful format because:
+        # - The shape of the result is the shape of the input. Different slices can
+        #   have different numbers of tied elements but not result in a ragged array.
+        # - Functions that use `t` usually don't need to know which element of the
+        #   original array is associated with each tie count; they perform a reduction
+        #   over the tie counts only. The tie counts are naturally computed in a
+        #   sorted order, so this does not unnecessarily reorder them.
+        # - One exception is `wilcoxon`, which needs the number of zeros. Zeros always
+        #   have the lowest rank, so it is easy to find them at the zeroth index.
+        t = xp.zeros(shape, dtype=xp.float64)
+        t = xpx.at(t)[i].set(xp.astype(counts, t.dtype, copy=False))
+        return ranks, t
+    return ranks
+
+
+@xp_capabilities(np_only=True)
+def expectile(a, alpha=0.5, *, weights=None):
+    r"""Compute the expectile at the specified level.
+
+    Expectiles are a generalization of the expectation in the same way as
+    quantiles are a generalization of the median. The expectile at level
+    `alpha = 0.5` is the mean (average). See Notes for more details.
+
+    Parameters
+    ----------
+    a : array_like
+        Array containing numbers whose expectile is desired.
+    alpha : float, default: 0.5
+        The level of the expectile; e.g., ``alpha=0.5`` gives the mean.
+    weights : array_like, optional
+        An array of weights associated with the values in `a`.
+        The `weights` must be broadcastable to the same shape as `a`.
+        Default is None, which gives each value a weight of 1.0.
+        An integer valued weight element acts like repeating the corresponding
+        observation in `a` that many times. See Notes for more details.
+
+    Returns
+    -------
+    expectile : ndarray
+        The empirical expectile at level `alpha`.
+
+    See Also
+    --------
+    numpy.mean : Arithmetic average
+    numpy.quantile : Quantile
+
+    Notes
+    -----
+    In general, the expectile at level :math:`\alpha` of a random variable
+    :math:`X` with cumulative distribution function (CDF) :math:`F` is given
+    by the unique solution :math:`t` of:
+
+    .. math::
+
+        \alpha E((X - t)_+) = (1 - \alpha) E((t - X)_+) \,.
+
+    Here, :math:`(x)_+ = \max(0, x)` is the positive part of :math:`x`.
+    This equation can be equivalently written as:
+
+    .. math::
+
+        \alpha \int_t^\infty (x - t)\mathrm{d}F(x)
+        = (1 - \alpha) \int_{-\infty}^t (t - x)\mathrm{d}F(x) \,.
+
+    The empirical expectile at level :math:`\alpha` (`alpha`) of a sample
+    :math:`a_i` (the array `a`) is defined by plugging in the empirical CDF of
+    `a`. Given sample or case weights :math:`w` (the array `weights`), it
+    reads :math:`F_a(x) = \frac{1}{\sum_i w_i} \sum_i w_i 1_{a_i \leq x}`
+    with indicator function :math:`1_{A}`. This leads to the definition of the
+    empirical expectile at level `alpha` as the unique solution :math:`t` of:
+
+    .. math::
+
+        \alpha \sum_{i=1}^n w_i (a_i - t)_+ =
+            (1 - \alpha) \sum_{i=1}^n w_i (t - a_i)_+ \,.
+
+    For :math:`\alpha=0.5`, this simplifies to the weighted average.
+    Furthermore, the larger :math:`\alpha`, the larger the value of the
+    expectile.
+
+    As a final remark, the expectile at level :math:`\alpha` can also be
+    written as a minimization problem. One often used choice is
+
+    .. math::
+
+        \operatorname{argmin}_t
+        E(\lvert 1_{t\geq X} - \alpha\rvert(t - X)^2) \,.
+
+    References
+    ----------
+    .. [1] W. K. Newey and J. L. Powell (1987), "Asymmetric Least Squares
+           Estimation and Testing," Econometrica, 55, 819-847.
+    .. [2] T. Gneiting (2009). "Making and Evaluating Point Forecasts,"
+           Journal of the American Statistical Association, 106, 746 - 762.
+           :doi:`10.48550/arXiv.0912.0902`
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import expectile
+    >>> a = [1, 4, 2, -1]
+    >>> expectile(a, alpha=0.5) == np.mean(a)
+    True
+    >>> expectile(a, alpha=0.2)
+    0.42857142857142855
+    >>> expectile(a, alpha=0.8)
+    2.5714285714285716
+    >>> weights = [1, 3, 1, 1]
+    >>> expectile(a, alpha=0.8, weights=weights)
+    3.3333333333333335
+    """
+    if alpha < 0 or alpha > 1:
+        raise ValueError(
+            "The expectile level alpha must be in the range [0, 1]."
+        )
+    a = np.asarray(a)
+
+    if weights is not None:
+        weights = np.broadcast_to(weights, a.shape)  # np.average needs equal shapes
+
+    # This is the empirical equivalent of Eq. (13) with identification
+    # function from Table 9 (omitting a factor of 2) in [2] (their y is our
+    # data a, their x is our t)
+    def first_order(t):
+        return np.average(np.abs((a <= t) - alpha) * (t - a), weights=weights)
+
+    if alpha >= 0.5:  # starting points: (weighted) mean and maximum of `a`
+        x0 = np.average(a, weights=weights)
+        x1 = np.amax(a)
+    else:  # starting points: minimum and (weighted) mean of `a`
+        x1 = np.average(a, weights=weights)
+        x0 = np.amin(a)
+
+    if x0 == x1:
+        # a has a single unique element
+        return x0
+
+    # Note that the expectile is the unique solution, so no worries about
+    # finding a wrong root.
+    res = root_scalar(first_order, x0=x0, x1=x1)  # no `method` given: solver default
+    return res.root
+
+
+def _lmoment_iv(sample, order, axis, sorted, standardize, xp):
+    # input validation/standardization for `lmoment`
+    sample = xp_promote(sample, force_floating=True, xp=xp)
+
+    message = "`sample` must be an array of real numbers."
+    if not xp.isdtype(sample.dtype, "real floating"):
+        raise ValueError(message)
+
+    message = "`order` must be a scalar or a non-empty array of positive integers."
+    order = xp.arange(1, 5) if order is None else xp.asarray(order)  # default 1..4
+    if (not xp.isdtype(order.dtype, "integral") or xp.any(order <= 0)
+            or order.size == 0 or order.ndim > 1):
+        raise ValueError(message)
+
+    # input validation of non-array types can still be performed with NumPy
+    axis = np.asarray(axis)[()]  # `[()]` unwraps a 0-d array into a NumPy scalar
+    message = "`axis` must be an integer."
+    if not np.issubdtype(axis.dtype, np.integer) or axis.ndim != 0:
+        raise ValueError(message)
+    axis = int(axis)
+
+    sorted = np.asarray(sorted)[()]
+    message = "`sorted` must be True or False."
+    if not np.issubdtype(sorted.dtype, np.bool_) or sorted.ndim != 0:
+        raise ValueError(message)
+    sorted = bool(sorted)
+
+    standardize = np.asarray(standardize)[()]
+    message = "`standardize` must be True or False."
+    if not np.issubdtype(standardize.dtype, np.bool_) or standardize.ndim != 0:
+        raise ValueError(message)
+    standardize = bool(standardize)
+
+    sample = xp.moveaxis(sample, axis, -1)  # downstream code works on the last axis
+    sample = xp.sort(sample, axis=-1) if not sorted else sample  # skip if pre-sorted
+
+    return sample, order, axis, sorted, standardize
+
+
+def _br(x, *, r=0, xp):  # NOTE(review): body reads r.shape, so r must be a 1-D array
+    n = x.shape[-1]
+    x = xp.expand_dims(x, axis=-2)
+    x = xp.broadcast_to(x, x.shape[:-2] + (r.shape[0], n))  # one row of x per r
+    x = xp.triu(x)  # zero the entries below the diagonal of the last two axes
+    j = xp.arange(n, dtype=x.dtype)
+    n = xp.asarray(n, dtype=x.dtype)[()]  # sample length in the dtype of x
+    return (xp.vecdot(special.binom(j, r[:, xp.newaxis]), x, axis=-1)
+            / special.binom(n-1, r) / n)
+
+
+def _prk(r, k):
+    # Written to match [1] Equation 27 closely to facilitate review.
+    # This does not protect against overflow, so improvements to
+    # robustness would be a welcome follow-up.
+    return (-1)**(r-k)*special.binom(r, k)*special.binom(r+k, k)
+
+
+@xp_capabilities(skip_backends=[('dask.array', "too many issues")],
+                 jax_jit=False, cpu_only=True,  # torch doesn't have `binom`
+                 exceptions=('cupy', 'jax.numpy'))
+@_axis_nan_policy_factory(  # noqa: E302
+    _moment_result_object, n_samples=1, result_to_tuple=_moment_tuple,
+    n_outputs=lambda kwds: _moment_outputs(kwds, [1, 2, 3, 4])
+)
+def lmoment(sample, order=None, *, axis=0, sorted=False, standardize=True):
+    r"""Compute L-moments of a sample from a continuous distribution
+
+    The L-moments of a probability distribution are summary statistics with
+    uses similar to those of conventional moments, but they are defined in
+    terms of the expected values of order statistics.
+    Sample L-moments are defined analogously to population L-moments, and
+    they can serve as estimators of population L-moments. They tend to be less
+    sensitive to extreme observations than conventional moments.
+
+    Parameters
+    ----------
+    sample : array_like
+        The real-valued sample whose L-moments are desired.
+    order : array_like, optional
+        The (positive integer) orders of the desired L-moments.
+        Must be a scalar or non-empty 1D array. Default is [1, 2, 3, 4].
+    axis : int or None, default=0
+        If an int, the axis of the input along which to compute the statistic.
+        The statistic of each axis-slice (e.g. row) of the input will appear
+        in a corresponding element of the output. If None, the input will be
+        raveled before computing the statistic.
+    sorted : bool, default=False
+        Whether `sample` is already sorted in increasing order along `axis`.
+        If False (default), `sample` will be sorted.
+    standardize : bool, default=True
+        Whether to return L-moment ratios for orders 3 and higher.
+        L-moment ratios are analogous to standardized conventional
+        moments: they are the non-standardized L-moments divided
+        by the L-moment of order 2.
+
+    Returns
+    -------
+    lmoments : ndarray
+        The sample L-moments of order `order`.
+
+    See Also
+    --------
+    moment
+
+    References
+    ----------
+    .. [1] D. Bilkova. "L-Moments and TL-Moments as an Alternative Tool of
+           Statistical Data Analysis". Journal of Applied Mathematics and
+           Physics. 2014. :doi:`10.4236/jamp.2014.210104`
+    .. [2] J. R. M. Hosking. "L-Moments: Analysis and Estimation of Distributions
+           Using Linear Combinations of Order Statistics". Journal of the Royal
+           Statistical Society. 1990. :doi:`10.1111/j.2517-6161.1990.tb01775.x`
+    .. [3] "L-moment". *Wikipedia*. https://en.wikipedia.org/wiki/L-moment.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng(328458568356392)
+    >>> sample = rng.exponential(size=100000)
+    >>> stats.lmoment(sample)
+    array([1.00124272, 0.50111437, 0.3340092 , 0.16755338])
+
+    Note that the first four standardized population L-moments of the standard
+    exponential distribution are 1, 1/2, 1/3, and 1/6; the sample L-moments
+    provide reasonable estimates.
+
+    """
+    xp = array_namespace(sample)
+    args = _lmoment_iv(sample, order, axis, sorted, standardize, xp=xp)
+    sample, order, axis, sorted, standardize = args  # sample: sorted along last axis
+
+    n_moments = int(xp.max(order))  # compute every order up to the largest requested
+    k = xp.arange(n_moments, dtype=sample.dtype)
+    prk = _prk(xpx.expand_dims(k, axis=tuple(range(1, sample.ndim+1))), k)
+    bk = _br(sample, r=k, xp=xp)
+
+    n = sample.shape[-1]
+    if n < bk.shape[-1]:
+        bk = xpx.at(bk)[..., n:].set(0)  # remove NaNs due to n_moments > n
+
+    lmoms = xp.vecdot(prk, bk, axis=-1)  # leading axis indexes order - 1
+    if standardize and n_moments > 2:
+        lmoms = xpx.at(lmoms)[2:, ...].divide(lmoms[1, ...])  # ratio to order 2
+
+    if n < lmoms.shape[0]:
+        lmoms = xpx.at(lmoms)[n:, ...].set(xp.nan)  # add NaNs where appropriate
+    # return lmoms[order-1]  # strict can't handle fancy indexing plus ellipses
+    return xp.take(lmoms, order - 1, axis=0) if order.ndim == 1 else lmoms[order - 1]
+
+
+LinregressResult = _make_tuple_bunch('LinregressResult',  # result type of linregress
+                                     ['slope', 'intercept', 'rvalue',
+                                      'pvalue', 'stderr'],
+                                     extra_field_names=['intercept_stderr'])
+
+
+def _pack_LinregressResult(slope, intercept, rvalue, pvalue, stderr, intercept_stderr):
+    return LinregressResult(slope, intercept, rvalue, pvalue, stderr,  # 5 positional
+                            intercept_stderr=intercept_stderr)  # extra field by name
+
+
+def _unpack_LinregressResult(res, _):  # second argument is ignored
+    return tuple(res) + (res.intercept_stderr,)  # append the extra field explicitly
+
+
+@xp_capabilities(np_only=True)
+@_axis_nan_policy_factory(_pack_LinregressResult, n_samples=2,
+                          result_to_tuple=_unpack_LinregressResult, paired=True,
+                          too_small=1, n_outputs=6)
+def linregress(x, y, alternative='two-sided'):
+    """
+    Calculate a linear least-squares regression for two sets of measurements.
+
+    Parameters
+    ----------
+    x, y : array_like
+        Two sets of measurements.  Both arrays should have the same length N.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis. Default is 'two-sided'.
+        The following options are available:
+
+        * 'two-sided': the slope of the regression line is nonzero
+        * 'less': the slope of the regression line is less than zero
+        * 'greater':  the slope of the regression line is greater than zero
+
+        .. versionadded:: 1.7.0
+
+    Returns
+    -------
+    result : ``LinregressResult`` instance
+        The return value is an object with the following attributes:
+
+        slope : float
+            Slope of the regression line.
+        intercept : float
+            Intercept of the regression line.
+        rvalue : float
+            The Pearson correlation coefficient. The square of ``rvalue``
+            is equal to the coefficient of determination.
+        pvalue : float
+            The p-value for a hypothesis test whose null hypothesis is
+            that the slope is zero, using Wald Test with t-distribution of
+            the test statistic. See `alternative` above for alternative
+            hypotheses.
+        stderr : float
+            Standard error of the estimated slope (gradient), under the
+            assumption of residual normality.
+        intercept_stderr : float
+            Standard error of the estimated intercept, under the assumption
+            of residual normality.
+
+    See Also
+    --------
+    scipy.optimize.curve_fit :
+        Use non-linear least squares to fit a function to data.
+    scipy.optimize.leastsq :
+        Minimize the sum of squares of a set of equations.
+
+    Notes
+    -----
+    For compatibility with older versions of SciPy, the return value acts
+    like a ``namedtuple`` of length 5, with fields ``slope``, ``intercept``,
+    ``rvalue``, ``pvalue`` and ``stderr``, so one can continue to write::
+
+        slope, intercept, r, p, se = linregress(x, y)
+
+    With that style, however, the standard error of the intercept is not
+    available.  To have access to all the computed values, including the
+    standard error of the intercept, use the return value as an object
+    with attributes, e.g.::
+
+        result = linregress(x, y)
+        print(result.intercept, result.intercept_stderr)
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from scipy import stats
+    >>> rng = np.random.default_rng()
+
+    Generate some data:
+
+    >>> x = rng.random(10)
+    >>> y = 1.6*x + rng.random(10)
+
+    Perform the linear regression:
+
+    >>> res = stats.linregress(x, y)
+
+    Coefficient of determination (R-squared):
+
+    >>> print(f"R-squared: {res.rvalue**2:.6f}")
+    R-squared: 0.717533
+
+    Plot the data along with the fitted line:
+
+    >>> plt.plot(x, y, 'o', label='original data')
+    >>> plt.plot(x, res.intercept + res.slope*x, 'r', label='fitted line')
+    >>> plt.legend()
+    >>> plt.show()
+
+    Calculate 95% confidence interval on slope and intercept:
+
+    >>> # Two-sided inverse Students t-distribution
+    >>> # p - probability, df - degrees of freedom
+    >>> from scipy.stats import t
+    >>> tinv = lambda p, df: abs(t.ppf(p/2, df))
+
+    >>> ts = tinv(0.05, len(x)-2)
+    >>> print(f"slope (95%): {res.slope:.6f} +/- {ts*res.stderr:.6f}")
+    slope (95%): 1.453392 +/- 0.743465
+    >>> print(f"intercept (95%): {res.intercept:.6f}"
+    ...       f" +/- {ts*res.intercept_stderr:.6f}")
+    intercept (95%): 0.616950 +/- 0.544475
+
+    """
+    TINY = 1.0e-20  # keeps the denominator of the t statistic nonzero when |r| == 1
+    x = np.asarray(x)
+    y = np.asarray(y)
+
+    if x.size == 0 or y.size == 0:
+        raise ValueError("Inputs must not be empty.")
+
+    if np.amax(x) == np.amin(x) and len(x) > 1:
+        raise ValueError("Cannot calculate a linear regression "
+                         "if all x values are identical")
+
+    n = len(x)
+    xmean = np.mean(x, None)
+    ymean = np.mean(y, None)
+
+    # Average sums of square differences from the mean
+    #   ssxm = mean( (x-mean(x))^2 )
+    #   ssxym = mean( (x-mean(x)) * (y-mean(y)) )
+    ssxm, ssxym, _, ssym = np.cov(x, y, bias=1).flat  # unpack the four entries
+
+    # R-value
+    #   r = ssxym / sqrt( ssxm * ssym )
+    if ssxm == 0.0 or ssym == 0.0:
+        # If the denominator was going to be 0
+        r = np.asarray(np.nan if ssxym == 0 else 0.0)[()]
+    else:
+        r = ssxym / np.sqrt(ssxm * ssym)
+        # Test for numerical error propagation (make sure -1 < r < 1)
+        if r > 1.0:
+            r = 1.0
+        elif r < -1.0:
+            r = -1.0
+
+    slope = ssxym / ssxm
+    intercept = ymean - slope*xmean
+    if n == 2:
+        # handle case when only two points are passed in
+        if y[0] == y[1]:
+            prob = 1.0
+        else:
+            prob = 0.0
+        slope_stderr = 0.0
+        intercept_stderr = 0.0
+    else:
+        df = n - 2  # Number of degrees of freedom
+        # n-2 degrees of freedom because 2 has been used up
+        # to estimate the mean and standard deviation
+        t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
+
+        dist = _SimpleStudentT(df)
+        prob = _get_pvalue(t, dist, alternative, xp=np)
+        prob = prob[()] if prob.ndim == 0 else prob  # 0-d array -> scalar
+
+        slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)
+
+        # Also calculate the standard error of the intercept
+        # The following relationship is used:
+        #   ssxm = mean( (x-mean(x))^2 )
+        #        = ssx - sx*sx
+        #        = mean( x^2 ) - mean(x)^2
+        intercept_stderr = slope_stderr * np.sqrt(ssxm + xmean**2)
+
+    return LinregressResult(slope=slope, intercept=intercept, rvalue=r,
+                            pvalue=prob, stderr=slope_stderr,
+                            intercept_stderr=intercept_stderr)
+
+
+def _xp_mean(x, /, *, axis=None, weights=None, keepdims=False, nan_policy='propagate',
+             dtype=None, xp=None):
+    r"""Compute the arithmetic mean along the specified axis.
+
+    Parameters
+    ----------
+    x : real array
+        Array containing real numbers whose mean is desired.
+    axis : int or tuple of ints, default: None
+        If an int or tuple of ints, the axis or axes of the input along which
+        to compute the statistic. The statistic of each axis-slice (e.g. row)
+        of the input will appear in a corresponding element of the output.
+        If ``None``, the input will be raveled before computing the statistic.
+    weights : real array, optional
+        If specified, an array of weights associated with the values in `x`;
+        otherwise ``1``. If `weights` and `x` do not have the same shape, the
+        arrays will be broadcasted before performing the calculation. See
+        Notes for details.
+    keepdims : boolean, optional
+        If this is set to ``True``, the axes which are reduced are left
+        in the result as dimensions with length one. With this option,
+        the result will broadcast correctly against the input array.
+    nan_policy : {'propagate', 'omit', 'raise'}, default: 'propagate'
+        Defines how to handle input NaNs.
+
+        - ``propagate``: if a NaN is present in the axis slice (e.g. row) along
+          which the statistic is computed, the corresponding entry of the output
+          will be NaN.
+        - ``omit``: NaNs will be omitted when performing the calculation.
+          If insufficient data remains in the axis slice along which the
+          statistic is computed, the corresponding entry of the output will be
+          NaN.
+        - ``raise``: if a NaN is present, a ``ValueError`` will be raised.
+
+    dtype : dtype, optional
+        Type to use in computing the mean. For integer inputs, the default is
+        the default float type of the array library; for floating point inputs,
+        the dtype is that of the input.
+
+    Returns
+    -------
+    out : array
+        The mean of each slice
+
+    Notes
+    -----
+    Let :math:`x_i` represent element :math:`i` of data `x` and let :math:`w_i`
+    represent the corresponding element of `weights` after broadcasting. Then the
+    (weighted) mean :math:`\bar{x}_w` is given by:
+
+    .. math::
+
+        \bar{x}_w = \frac{ \sum_{i=0}^{n-1} w_i x_i }
+                         { \sum_{i=0}^{n-1} w_i }
+
+    where :math:`n` is the number of elements along a slice. Note that this simplifies
+    to the familiar :math:`(\sum_i x_i) / n` when the weights are all ``1`` (default).
+
+    The behavior of this function with respect to weights is somewhat different
+    from that of `np.average`. For instance,
+    `np.average` raises an error when `axis` is not specified and the shapes of `x`
+    and the `weights` array are not the same; `xp_mean` simply broadcasts the two.
+    Also, `np.average` raises an error when weights sum to zero along a slice;
+    `xp_mean` computes the appropriate result. The intent is for this function's
+    interface to be consistent with the rest of `scipy.stats`.
+
+    Note that according to the formula, including NaNs with zero weights is not
+    the same as *omitting* NaNs with ``nan_policy='omit'``; in the former case,
+    the NaNs will continue to propagate through the calculation whereas in the
+    latter case, the NaNs are excluded entirely.
+
+    """
+    # ensure that `x` and `weights` are array-API compatible arrays of identical shape
+    xp = array_namespace(x) if xp is None else xp
+    x = _asarray(x, dtype=dtype, subok=True)
+    weights = xp.asarray(weights, dtype=dtype) if weights is not None else weights
+
+    # to ensure that this matches the behavior of decorated functions when one of the
+    # arguments has size zero, it's easiest to call a similar decorated function.
+    if is_numpy(xp) and (xp_size(x) == 0
+                         or (weights is not None and xp_size(weights) == 0)):
+        return gmean(x, weights=weights, axis=axis, keepdims=keepdims)
+
+    x, weights = xp_promote(x, weights, broadcast=True, force_floating=True, xp=xp)
+    if weights is not None:
+        x, weights = _share_masks(x, weights, xp=xp)
+
+    # handle the special case of zero-sized arrays
+    message = (too_small_1d_not_omit if (x.ndim == 1 or axis is None)
+               else too_small_nd_not_omit)
+    if xp_size(x) == 0:
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")  # silence the backend's own warning
+            res = xp.mean(x, axis=axis, keepdims=keepdims)
+        if xp_size(res) != 0:
+            warnings.warn(message, SmallSampleWarning, stacklevel=2)
+        return res
+
+    contains_nan = _contains_nan(x, nan_policy, xp_omit_okay=True, xp=xp)
+    if weights is not None:
+        contains_nan_w = _contains_nan(weights, nan_policy, xp_omit_okay=True, xp=xp)
+        contains_nan = contains_nan | contains_nan_w
+
+    # Handle `nan_policy='omit'` by giving zero weight to NaNs, whether they
+    # appear in `x` or `weights`. Emit warning if there is an all-NaN slice.
+    # Test nan_policy before the implicit call to bool(contains_nan)
+    # to avoid raising on lazy xps on the default nan_policy='propagate'
+    lazy = is_lazy_array(x)
+    if nan_policy == 'omit' and (lazy or contains_nan):
+        nan_mask = xp.isnan(x)
+        if weights is not None:
+            nan_mask |= xp.isnan(weights)
+        if not lazy and xp.any(xp.all(nan_mask, axis=axis)):
+            message = (too_small_1d_omit if (x.ndim == 1 or axis is None)
+                       else too_small_nd_omit)
+            warnings.warn(message, SmallSampleWarning, stacklevel=2)
+        weights = xp.ones_like(x) if weights is None else weights
+        x = xp.where(nan_mask, 0., x)
+        weights = xp.where(nan_mask, 0., weights)
+
+    # Perform the mean calculation itself
+    if weights is None:
+        return xp.mean(x, axis=axis, keepdims=keepdims)
+
+    # consider using `vecdot` if `axis` tuple support is added (data-apis/array-api#910)
+    norm = xp.sum(weights, axis=axis)  # denominator: sum of weights
+    wsum = xp.sum(x * weights, axis=axis)  # numerator: weighted sum of values
+    with np.errstate(divide='ignore', invalid='ignore'):
+        res = wsum/norm
+
+    # Respect `keepdims` and convert NumPy 0-D arrays to scalars
+    if keepdims:
+
+        if axis is None:
+            final_shape = (1,) * len(x.shape)
+        else:
+            # axis can be a scalar or sequence
+            axes = (axis,) if not isinstance(axis, Sequence) else axis
+            final_shape = list(x.shape)
+            for i in axes:
+                final_shape[i] = 1
+
+        res = xp.reshape(res, tuple(final_shape))  # reduced axes become length 1
+
+    return res[()] if res.ndim == 0 else res
+
+
+def _xp_var(x, /, *, axis=None, correction=0, keepdims=False, nan_policy='propagate',
+            dtype=None, xp=None):
+    # an array-api compatible function for variance with scipy.stats interface
+    # and features (e.g. `nan_policy`).
+    xp = array_namespace(x) if xp is None else xp
+    x = _asarray(x, subok=True)
+
+    # use `_xp_mean` instead of `xp.var` for desired warning behavior
+    # it would be nice to combine this with `_var`, which uses `_moment`
+    # and therefore warns when precision is lost, but that does not support
+    # `axis` tuples or keepdims. Eventually, `_axis_nan_policy` will simplify
+    # `axis` tuples and implement `keepdims` for non-NumPy arrays; then it will
+    # be easy.
+    kwargs = dict(axis=axis, nan_policy=nan_policy, dtype=dtype, xp=xp)
+    mean = _xp_mean(x, keepdims=True, **kwargs)  # always keepdims here, unlike `var`
+    x = _asarray(x, dtype=mean.dtype, subok=True)
+    x_mean = _demean(x, mean, axis, xp=xp)
+    x_mean_conj = (xp.conj(x_mean) if xp.isdtype(x_mean.dtype, 'complex floating')
+                   else x_mean)  # crossref data-apis/array-api#824
+    var = _xp_mean(x_mean * x_mean_conj, keepdims=keepdims, **kwargs)
+
+    if correction != 0:
+        n = _length_nonmasked(x, axis, xp=xp)
+        # Or two lines with ternaries : )
+        # axis = range(x.ndim) if axis is None else axis
+        # n = math.prod(x.shape[i] for i in axis) if iterable(axis) else x.shape[axis]
+
+        n = xp.asarray(n, dtype=var.dtype, device=xp_device(x))
+
+        if nan_policy == 'omit':
+            nan_mask = xp.astype(xp.isnan(x), var.dtype)
+            n = n - xp.sum(nan_mask, axis=axis, keepdims=keepdims)  # exclude NaNs
+
+        # Produce NaNs silently when n - correction <= 0
+        nc = n - correction
+        factor = xpx.apply_where(nc > 0, (n, nc), operator.truediv, fill_value=xp.nan)
+        var *= factor  # rescale by n / (n - correction)
+
+    return var[()] if var.ndim == 0 else var
+
+
+class _SimpleNormal:
+    # A very simple, array-API compatible normal distribution for use in
+    # hypothesis tests. May be replaced by new infrastructure Normal
+    # distribution in due time.
+
+    def cdf(self, x):
+        return special.ndtr(x)
+
+    def sf(self, x):
+        return special.ndtr(-x)  # survival function via symmetry: sf(x) = cdf(-x)
+
+    def isf(self, x):
+        return -special.ndtri(x)  # inverse sf via symmetry: negated inverse cdf
+
+
+class _SimpleChi2:
+    # A very simple, array-API compatible chi-squared distribution for use in
+    # hypothesis tests. May be replaced by new infrastructure chi-squared
+    # distribution in due time.
+    def __init__(self, df):
+        self.df = df  # degrees of freedom, forwarded to the special functions
+
+    def cdf(self, x):
+        return special.chdtr(self.df, x)
+
+    def sf(self, x):
+        return special.chdtrc(self.df, x)  # complement computed directly
+
+
+class _SimpleBeta:
+    # A very simple, array-API compatible beta distribution for use in
+    # hypothesis tests. May be replaced by new infrastructure beta
+    # distribution in due time.
+    def __init__(self, a, b, *, loc=None, scale=None):
+        self.a = a
+        self.b = b
+        self.loc = loc  # None means "no shift"; treated as 0 when scale is given
+        self.scale = scale  # None means "no scaling"; treated as 1 when loc is given
+
+    def cdf(self, x):
+        if self.loc is not None or self.scale is not None:
+            loc = 0 if self.loc is None else self.loc
+            scale = 1 if self.scale is None else self.scale
+            return special.betainc(self.a, self.b, (x - loc)/scale)  # standardized x
+        return special.betainc(self.a, self.b, x)
+
+    def sf(self, x):
+        if self.loc is not None or self.scale is not None:
+            loc = 0 if self.loc is None else self.loc
+            scale = 1 if self.scale is None else self.scale
+            return special.betaincc(self.a, self.b, (x - loc)/scale)  # standardized x
+        return special.betaincc(self.a, self.b, x)
+
+
+class _SimpleStudentT:
+    # A very simple, array-API compatible t distribution for use in
+    # hypothesis tests. May be replaced by new infrastructure t
+    # distribution in due time.
+    def __init__(self, df):
+        self.df = df  # degrees of freedom, forwarded to `special.stdtr`
+
+    def cdf(self, t):
+        return special.stdtr(self.df, t)
+
+    def sf(self, t):
+        return special.stdtr(self.df, -t)  # survival function via symmetry: cdf(-t)
+
+
+class _SimpleF:
+    # A very simple, array-API compatible F distribution for use in
+    # hypothesis tests.
+    def __init__(self, dfn, dfd):
+        self.dfn = dfn  # first degrees-of-freedom argument passed to fdtr/fdtrc
+        self.dfd = dfd  # second degrees-of-freedom argument passed to fdtr/fdtrc
+
+    def cdf(self, x):
+        return special.fdtr(self.dfn, self.dfd, x)
+
+    def sf(self, x):
+        return special.fdtrc(self.dfn, self.dfd, x)  # complement computed directly
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_survival.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_survival.py
new file mode 100644
index 0000000000000000000000000000000000000000..70a84806a95ae82b2968bb9cf5ee37f48808c59c
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_survival.py
@@ -0,0 +1,686 @@
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Literal
+import warnings
+
+import numpy as np
+from scipy import special, interpolate, stats
+from scipy._lib._array_api import xp_capabilities
+from scipy.stats._censored_data import CensoredData
+from scipy.stats._common import ConfidenceInterval
+
+if TYPE_CHECKING:
+    import numpy.typing as npt
+
+
+__all__ = ['ecdf', 'logrank']
+
+
+@dataclass
+class EmpiricalDistributionFunction:
+    """An empirical distribution function produced by `scipy.stats.ecdf`
+
+    Attributes
+    ----------
+    quantiles : ndarray
+        The unique values of the sample from which the
+        `EmpiricalDistributionFunction` was estimated.
+    probabilities : ndarray
+        The point estimates of the cumulative distribution function (CDF) or
+        its complement, the survival function (SF), corresponding with
+        `quantiles`.
+    """
+    quantiles: np.ndarray
+    probabilities: np.ndarray
+    # Exclude these from the dataclass-generated repr (``repr=False``)
+    _n: np.ndarray = field(repr=False)  # number "at risk"
+    _d: np.ndarray = field(repr=False)  # number of "deaths"
+    _sf: np.ndarray = field(repr=False)  # survival function for var estimate
+    _kind: str = field(repr=False)  # type of function: "cdf" or "sf"
+
+    def __init__(self, q, p, n, d, kind):
+        # q: quantiles; p: probabilities at those quantiles; n: number at
+        # risk; d: number of deaths; kind: "cdf" or "sf".
+        # `n` and `d` are passed as None for the bounds returned by
+        # `confidence_interval`.
+        self.probabilities = p
+        self.quantiles = q
+        self._n = n
+        self._d = d
+        # Always keep the survival function, whichever kind `p` represents;
+        # the variance estimates in the CI helpers are written in terms of it.
+        self._sf = p if kind == 'sf' else 1 - p
+        self._kind = kind
+
+        f0 = 1 if kind == 'sf' else 0  # leftmost function value
+        f1 = 1 - f0
+        # fill_value can't handle edge cases at infinity, so pad the data
+        # with explicit points at -inf (value f0) and +inf (value f1).
+        x = np.insert(q, [0, len(q)], [-np.inf, np.inf])
+        y = np.insert(p, [0, len(p)], [f0, f1])
+        # `kind='previous'` yields a step function: each argument maps to
+        # the value stored at the closest point of `x` not exceeding it.
+        self._f = interpolate.interp1d(x, y, kind='previous',
+                                       assume_sorted=True)
+
+    def evaluate(self, x):
+        """Evaluate the empirical CDF/SF function at the input.
+
+        Parameters
+        ----------
+        x : ndarray
+            Argument to the CDF/SF
+
+        Returns
+        -------
+        y : ndarray
+            The CDF/SF evaluated at the input
+        """
+        return self._f(x)
+
+    def plot(self, ax=None, **matplotlib_kwargs):
+        """Plot the empirical distribution function
+
+        Available only if ``matplotlib`` is installed.
+
+        Parameters
+        ----------
+        ax : matplotlib.axes.Axes, optional
+            Axes object to draw the plot onto, otherwise uses the current Axes.
+
+        **matplotlib_kwargs : dict, optional
+            Keyword arguments passed directly to `matplotlib.axes.Axes.step`.
+            Unless overridden, ``where='post'``.
+
+        Returns
+        -------
+        lines : list of `matplotlib.lines.Line2D`
+            Objects representing the plotted data
+
+        Raises
+        ------
+        ModuleNotFoundError
+            If ``matplotlib`` cannot be imported.
+        """
+        try:
+            import matplotlib  # noqa: F401
+        except ModuleNotFoundError as exc:
+            message = "matplotlib must be installed to use method `plot`."
+            raise ModuleNotFoundError(message) from exc
+
+        if ax is None:
+            import matplotlib.pyplot as plt
+            ax = plt.gca()
+
+        # Caller-supplied keywords take precedence over the default.
+        kwargs = {'where': 'post'}
+        kwargs.update(matplotlib_kwargs)
+
+        delta = np.ptp(self.quantiles)*0.05  # how far past sample edge to plot
+        q = self.quantiles
+        # Extend the plotted range by `delta` on both sides of the sample.
+        q = [q[0] - delta] + list(q) + [q[-1] + delta]
+
+        return ax.step(q, self.evaluate(q), **kwargs)
+
+    def confidence_interval(self, confidence_level=0.95, *, method='linear'):
+        """Compute a confidence interval around the CDF/SF point estimate
+
+        Parameters
+        ----------
+        confidence_level : float, default: 0.95
+            Confidence level for the computed confidence interval
+
+        method : str, {"linear", "log-log"}
+            Method used to compute the confidence interval. Options are
+            "linear" for the conventional Greenwood confidence interval
+            (default) and "log-log" for the "exponential Greenwood",
+            log-negative-log-transformed confidence interval.
+            The comparison is case-insensitive.
+
+        Returns
+        -------
+        ci : ``ConfidenceInterval``
+            An object with attributes ``low`` and ``high``, instances of
+            `~scipy.stats._result_classes.EmpiricalDistributionFunction` that
+            represent the lower and upper bounds (respectively) of the
+            confidence interval.
+
+        Raises
+        ------
+        NotImplementedError
+            If called on an object that is itself a confidence interval
+            bound.
+        ValueError
+            If `method` is not one of the supported options or if
+            `confidence_level` is not a scalar between 0 and 1.
+
+        Notes
+        -----
+        Confidence intervals are computed according to the Greenwood formula
+        (``method='linear'``) or the more recent "exponential Greenwood"
+        formula (``method='log-log'``) as described in [1]_. The conventional
+        Greenwood formula can result in lower confidence limits less than 0
+        and upper confidence limits greater than 1; these are clipped to the
+        unit interval. NaNs may be produced by either method; these are
+        features of the formulas. A ``RuntimeWarning`` is emitted when NaNs
+        are present in either bound.
+
+        References
+        ----------
+        .. [1] Sawyer, Stanley. "The Greenwood and Exponential Greenwood
+               Confidence Intervals in Survival Analysis."
+               https://www.math.wustl.edu/~sawyer/handouts/greenwood.pdf
+
+        """
+        message = ("Confidence interval bounds do not implement a "
+                   "`confidence_interval` method.")
+        # The bounds built at the end of this method carry `_n = None`,
+        # which is how they are recognized here.
+        if self._n is None:
+            raise NotImplementedError(message)
+
+        methods = {'linear': self._linear_ci,
+                   'log-log': self._loglog_ci}
+
+        message = f"`method` must be one of {set(methods)}."
+        if method.lower() not in methods:
+            raise ValueError(message)
+
+        message = "`confidence_level` must be a scalar between 0 and 1."
+        confidence_level = np.asarray(confidence_level)[()]
+        # A non-empty shape means the input was not a scalar.
+        if confidence_level.shape or not (0 <= confidence_level <= 1):
+            raise ValueError(message)
+
+        method_fun = methods[method.lower()]
+        low, high = method_fun(confidence_level)
+
+        message = ("The confidence interval is undefined at some observations."
+                   " This is a feature of the mathematical formula used, not"
+                   " an error in its implementation.")
+        if np.any(np.isnan(low) | np.isnan(high)):
+            warnings.warn(message, RuntimeWarning, stacklevel=2)
+
+        # Clip to the unit interval, then wrap each bound in its own
+        # distribution-function object (without at-risk/death counts).
+        low, high = np.clip(low, 0, 1), np.clip(high, 0, 1)
+        low = EmpiricalDistributionFunction(self.quantiles, low, None, None,
+                                            self._kind)
+        high = EmpiricalDistributionFunction(self.quantiles, high, None, None,
+                                             self._kind)
+        return ConfidenceInterval(low, high)
+
+    def _linear_ci(self, confidence_level):
+        # Conventional Greenwood interval: point estimate +/- z * se.
+        # Returns the unclipped `(low, high)` arrays.
+        sf, d, n = self._sf, self._d, self._n
+        # When n == d, Greenwood's formula divides by zero.
+        # When sf != 0, this can be ignored: var == inf, and CI is [0, 1]
+        # When sf == 0, this results in NaNs. Produce an informative warning
+        # (the warning itself is issued by `confidence_interval`).
+        with np.errstate(divide='ignore', invalid='ignore'):
+            var = sf ** 2 * np.cumsum(d / (n * (n - d)))
+
+        se = np.sqrt(var)
+        # Two-sided critical value from the inverse of the standard normal
+        # CDF.
+        z = special.ndtri(1 / 2 + confidence_level / 2)
+
+        z_se = z * se
+        low = self.probabilities - z_se
+        high = self.probabilities + z_se
+
+        return low, high
+
+    def _loglog_ci(self, confidence_level):
+        # "Exponential Greenwood" interval: the interval is formed on the
+        # log(-log(sf)) scale and transformed back.
+        # Returns the unclipped `(low, high)` arrays.
+        sf, d, n = self._sf, self._d, self._n
+
+        with np.errstate(divide='ignore', invalid='ignore'):
+            var = 1 / np.log(sf) ** 2 * np.cumsum(d / (n * (n - d)))
+
+        se = np.sqrt(var)
+        # Two-sided critical value from the inverse of the standard normal
+        # CDF.
+        z = special.ndtri(1 / 2 + confidence_level / 2)
+
+        with np.errstate(divide='ignore'):
+            lnl_points = np.log(-np.log(sf))
+
+        z_se = z * se
+        # The back-transformation is decreasing, so adding `z_se` gives the
+        # lower bound of the survival function and subtracting it the upper.
+        low = np.exp(-np.exp(lnl_points + z_se))
+        high = np.exp(-np.exp(lnl_points - z_se))
+        if self._kind == "cdf":
+            # Bounds were computed for the SF; complement and swap them to
+            # obtain bounds for the CDF.
+            low, high = 1-high, 1-low
+
+        return low, high
+
+
+@dataclass
+class ECDFResult:
+    """ Result object returned by `scipy.stats.ecdf`
+
+    Attributes
+    ----------
+    cdf : `~scipy.stats._result_classes.EmpiricalDistributionFunction`
+        An object representing the empirical cumulative distribution function.
+    sf : `~scipy.stats._result_classes.EmpiricalDistributionFunction`
+        An object representing the complement of the empirical cumulative
+        distribution function.
+    """
+    cdf: EmpiricalDistributionFunction
+    sf: EmpiricalDistributionFunction
+
+    def __init__(self, q, cdf, sf, n, d):
+        # q: quantiles; cdf / sf: probabilities of each function at `q`;
+        # n: number at risk; d: number of deaths. Both function objects are
+        # built from the same quantiles and counts.
+        self.cdf = EmpiricalDistributionFunction(q, cdf, n, d, "cdf")
+        self.sf = EmpiricalDistributionFunction(q, sf, n, d, "sf")
+
+
+def _iv_CensoredData(
+    sample: "npt.ArrayLike | CensoredData", param_name: str = "sample"
+) -> CensoredData:
+    """Attempt to convert `sample` to `CensoredData`.
+
+    Instances of `CensoredData` are returned unchanged. Anything else is
+    wrapped as uncensored data. If that raises a ``ValueError``, an exception
+    of the same type is re-raised with 'uncensored' replaced by `param_name`
+    in the message, so the error refers to the caller's argument name.
+    """
+    if not isinstance(sample, CensoredData):
+        try:  # takes care of input standardization/validation
+            sample = CensoredData(uncensored=sample)
+        except ValueError as e:
+            message = str(e).replace('uncensored', param_name)
+            raise type(e)(message) from e
+    return sample
+
+
+@xp_capabilities(np_only=True)
+def ecdf(sample: "npt.ArrayLike | CensoredData") -> ECDFResult:
+    """Empirical cumulative distribution function of a sample.
+
+    The empirical cumulative distribution function (ECDF) is a step function
+    estimate of the CDF of the distribution underlying a sample. This function
+    returns objects representing both the empirical distribution function and
+    its complement, the empirical survival function.
+
+    Parameters
+    ----------
+    sample : 1D array_like or `scipy.stats.CensoredData`
+        Besides array_like, instances of `scipy.stats.CensoredData` containing
+        uncensored and right-censored observations are supported. Currently,
+        other instances of `scipy.stats.CensoredData` will result in a
+        ``NotImplementedError``.
+
+    Returns
+    -------
+    res : `~scipy.stats._result_classes.ECDFResult`
+        An object with the following attributes.
+
+        cdf : `~scipy.stats._result_classes.EmpiricalDistributionFunction`
+            An object representing the empirical cumulative distribution
+            function.
+        sf : `~scipy.stats._result_classes.EmpiricalDistributionFunction`
+            An object representing the empirical survival function.
+
+        The `cdf` and `sf` attributes themselves have the following attributes.
+
+        quantiles : ndarray
+            The unique values in the sample that defines the empirical CDF/SF.
+        probabilities : ndarray
+            The point estimates of the probabilities corresponding with
+            `quantiles`.
+
+        And the following methods:
+
+        evaluate(x) :
+            Evaluate the CDF/SF at the argument.
+
+        plot(ax) :
+            Plot the CDF/SF on the provided axes.
+
+        confidence_interval(confidence_level=0.95, *, method='linear') :
+            Compute the confidence interval around the CDF/SF at the values in
+            `quantiles`.
+
+    Notes
+    -----
+    When each observation of the sample is a precise measurement, the ECDF
+    steps up by ``1/len(sample)`` at each of the observations [1]_.
+
+    When observations are lower bounds, upper bounds, or both upper and lower
+    bounds, the data is said to be "censored", and `sample` may be provided as
+    an instance of `scipy.stats.CensoredData`.
+
+    For right-censored data, the ECDF is given by the Kaplan-Meier estimator
+    [2]_; other forms of censoring are not supported at this time.
+
+    Confidence intervals are computed according to the Greenwood formula or the
+    more recent "Exponential Greenwood" formula as described in [4]_.
+
+    References
+    ----------
+    .. [1] Conover, William Jay. Practical nonparametric statistics. Vol. 350.
+           John Wiley & Sons, 1999.
+
+    .. [2] Kaplan, Edward L., and Paul Meier. "Nonparametric estimation from
+           incomplete observations." Journal of the American statistical
+           association 53.282 (1958): 457-481.
+
+    .. [3] Goel, Manish Kumar, Pardeep Khanna, and Jugal Kishore.
+           "Understanding survival analysis: Kaplan-Meier estimate."
+           International journal of Ayurveda research 1.4 (2010): 274.
+
+    .. [4] Sawyer, Stanley. "The Greenwood and Exponential Greenwood Confidence
+           Intervals in Survival Analysis."
+           https://www.math.wustl.edu/~sawyer/handouts/greenwood.pdf
+
+    Examples
+    --------
+    **Uncensored Data**
+
+    As in the example from [1]_ page 79, five boys were selected at random from
+    those in a single high school. Their one-mile run times were recorded as
+    follows.
+
+    >>> sample = [6.23, 5.58, 7.06, 6.42, 5.20]  # one-mile run times (minutes)
+
+    The empirical distribution function, which approximates the distribution
+    function of one-mile run times of the population from which the boys were
+    sampled, is calculated as follows.
+
+    >>> from scipy import stats
+    >>> res = stats.ecdf(sample)
+    >>> res.cdf.quantiles
+    array([5.2 , 5.58, 6.23, 6.42, 7.06])
+    >>> res.cdf.probabilities
+    array([0.2, 0.4, 0.6, 0.8, 1. ])
+
+    To plot the result as a step function:
+
+    >>> import matplotlib.pyplot as plt
+    >>> ax = plt.subplot()
+    >>> res.cdf.plot(ax)
+    >>> ax.set_xlabel('One-Mile Run Time (minutes)')
+    >>> ax.set_ylabel('Empirical CDF')
+    >>> plt.show()
+
+    **Right-censored Data**
+
+    As in the example from [1]_ page 91, the lives of ten car fanbelts were
+    tested. Five tests concluded because the fanbelt being tested broke, but
+    the remaining tests concluded for other reasons (e.g. the study ran out of
+    funding, but the fanbelt was still functional). The mileage driven
+    with the fanbelts were recorded as follows.
+
+    >>> broken = [77, 47, 81, 56, 80]  # in thousands of miles driven
+    >>> unbroken = [62, 60, 43, 71, 37]
+
+    Precise survival times of the fanbelts that were still functional at the
+    end of the tests are unknown, but they are known to exceed the values
+    recorded in ``unbroken``. Therefore, these observations are said to be
+    "right-censored", and the data is represented using
+    `scipy.stats.CensoredData`.
+
+    >>> sample = stats.CensoredData(uncensored=broken, right=unbroken)
+
+    The empirical survival function is calculated as follows.
+
+    >>> res = stats.ecdf(sample)
+    >>> res.sf.quantiles
+    array([37., 43., 47., 56., 60., 62., 71., 77., 80., 81.])
+    >>> res.sf.probabilities
+    array([1.   , 1.   , 0.875, 0.75 , 0.75 , 0.75 , 0.75 , 0.5  , 0.25 , 0.   ])
+
+    To plot the result as a step function:
+
+    >>> ax = plt.subplot()
+    >>> res.sf.plot(ax)
+    >>> ax.set_xlabel('Fanbelt Survival Time (thousands of miles)')
+    >>> ax.set_ylabel('Empirical SF')
+    >>> plt.show()
+
+    """
+    # Plain array_like input is wrapped as uncensored `CensoredData`.
+    sample = _iv_CensoredData(sample)
+
+    # Dispatch on the kind of censoring present in the sample.
+    if sample.num_censored() == 0:
+        res = _ecdf_uncensored(sample._uncensor())
+    elif sample.num_censored() == sample._right.size:
+        # Every censored observation is right-censored.
+        res = _ecdf_right_censored(sample)
+    else:
+        # Support additional censoring options in follow-up PRs
+        message = ("Currently, only uncensored and right-censored data is "
+                   "supported.")
+        raise NotImplementedError(message)
+
+    # Both helpers return (unique times, cdf, sf, number at risk, deaths).
+    t, cdf, sf, n, d = res
+    return ECDFResult(t, cdf, sf, n, d)
+
+
+def _ecdf_uncensored(sample):
+    # ECDF of a sample without censoring. Returns the tuple
+    # (unique values, cdf, sf, number at risk, counts), with one entry per
+    # unique value of the sample. Bracketed references are those listed in
+    # the `ecdf` docstring.
+    sample = np.sort(sample)
+    x, counts = np.unique(sample, return_counts=True)
+
+    # [1].81 "the fraction of [observations] that are less than or equal to x"
+    events = np.cumsum(counts)
+    n = sample.size
+    cdf = events / n
+
+    # [1].89 "the relative frequency of the sample that exceeds x in value"
+    sf = 1 - cdf
+
+    # Number of observations not yet counted just before each unique value:
+    # all `n` before the first one, then `n` minus the cumulative count.
+    at_risk = np.concatenate(([n], n - events[:-1]))
+    return x, cdf, sf, at_risk, counts
+
+
+def _ecdf_right_censored(sample):
+    # ECDF of a sample with right-censored observations. Returns the tuple
+    # (unique times, cdf, sf, number at risk, deaths), with one entry per
+    # unique time. Bracketed references are those listed in the `ecdf`
+    # docstring.
+    #
+    # It is conventional to discuss right-censored data in terms of
+    # "survival time", "death", and "loss" (e.g. [2]). We'll use that
+    # terminology here.
+    # This implementation was influenced by the references cited and also
+    # https://www.youtube.com/watch?v=lxoWsVco_iM
+    # https://en.wikipedia.org/wiki/Kaplan%E2%80%93Meier_estimator
+    # In retrospect it is probably most easily compared against [3].
+    # Ultimately, the data needs to be sorted, so this implementation is
+    # written to avoid a separate call to `unique` after sorting. In hope of
+    # better performance on large datasets, it also computes survival
+    # probabilities at unique times only rather than at each observation.
+    tod = sample._uncensored  # time of "death"
+    tol = sample._right  # time of "loss"
+    times = np.concatenate((tod, tol))
+    # 1 marks a death, 0 a loss, in the same order as `times`
+    died = np.asarray([1]*tod.size + [0]*tol.size)
+
+    # sort by times
+    i = np.argsort(times)
+    times = times[i]
+    died = died[i]
+    # counts down from the total number of observations to 1
+    at_risk = np.arange(times.size, 0, -1)
+
+    # logical indices of unique times
+    j = np.diff(times, prepend=-np.inf, append=np.inf) > 0
+    j_l = j[:-1]  # first instances of unique times
+    j_r = j[1:]  # last instances of unique times
+
+    # get number at risk and deaths at each unique time
+    t = times[j_l]  # unique times
+    n = at_risk[j_l]  # number at risk at each unique time
+    cd = np.cumsum(died)[j_r]  # cumulative deaths up to/including unique times
+    d = np.diff(cd, prepend=0)  # deaths at each unique time
+
+    # compute survival function as the running product of the fraction
+    # surviving each unique time
+    sf = np.cumprod((n - d) / n)
+    cdf = 1 - sf
+    return t, cdf, sf, n, d
+
+
+@dataclass
+class LogRankResult:
+    """Result object returned by `scipy.stats.logrank`.
+
+    Attributes
+    ----------
+    statistic : float ndarray
+        The computed statistic (defined in the Notes section of
+        `scipy.stats.logrank`). Its magnitude is the
+        square root of the magnitude returned by most other logrank test
+        implementations.
+    pvalue : float ndarray
+        The computed p-value of the test.
+    """
+    statistic: np.ndarray
+    pvalue: np.ndarray
+
+
+@xp_capabilities(np_only=True)
+def logrank(
+    x: "npt.ArrayLike | CensoredData",
+    y: "npt.ArrayLike | CensoredData",
+    alternative: Literal['two-sided', 'less', 'greater'] = "two-sided"
+) -> LogRankResult:
+    r"""Compare the survival distributions of two samples via the logrank test.
+
+    Parameters
+    ----------
+    x, y : array_like or CensoredData
+        Samples to compare based on their empirical survival functions.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the alternative hypothesis.
+
+        The null hypothesis is that the survival distributions of the two
+        groups, say *X* and *Y*, are identical.
+
+        The following alternative hypotheses [4]_ are available (default is
+        'two-sided'):
+
+        * 'two-sided': the survival distributions of the two groups are not
+          identical.
+        * 'less': survival of group *X* is favored: the group *X* failure rate
+          function is less than the group *Y* failure rate function at some
+          times.
+        * 'greater': survival of group *Y* is favored: the group *X* failure
+          rate function is greater than the group *Y* failure rate function at
+          some times.
+
+    Returns
+    -------
+    res : `~scipy.stats._result_classes.LogRankResult`
+        An object containing attributes:
+
+        statistic : float ndarray
+            The computed statistic (defined below). Its magnitude is the
+            square root of the magnitude returned by most other logrank test
+            implementations.
+        pvalue : float ndarray
+            The computed p-value of the test.
+
+    See Also
+    --------
+    scipy.stats.ecdf
+
+    Notes
+    -----
+    The logrank test [1]_ compares the observed number of events to
+    the expected number of events under the null hypothesis that the two
+    samples were drawn from the same distribution. The statistic is
+
+    .. math::
+
+        Z_i = \frac{\sum_{j=1}^J(O_{i,j}-E_{i,j})}{\sqrt{\sum_{j=1}^J V_{i,j}}}
+        \rightarrow \mathcal{N}(0,1)
+
+    where
+
+    .. math::
+
+        E_{i,j} = O_j \frac{N_{i,j}}{N_j},
+        \qquad
+        V_{i,j} = E_{i,j} \left(\frac{N_j-O_j}{N_j}\right)
+        \left(\frac{N_j-N_{i,j}}{N_j-1}\right),
+
+    :math:`i` denotes the group (i.e. it may assume values :math:`x` or
+    :math:`y`, or it may be omitted to refer to the combined sample)
+    :math:`j` denotes the time (at which an event occurred),
+    :math:`N` is the number of subjects at risk just before an event occurred,
+    and :math:`O` is the observed number of events at that time.
+
+    The ``statistic`` :math:`Z_x` returned by `logrank` is the (signed) square
+    root of the statistic returned by many other implementations. Under the
+    null hypothesis, :math:`Z_x^2` is asymptotically distributed according to
+    the chi-squared distribution with one degree of freedom. Consequently,
+    :math:`Z_x` is asymptotically distributed according to the standard normal
+    distribution. The advantage of using :math:`Z_x` is that the sign
+    information (i.e. whether the observed number of events tends to be less
+    than or greater than the number expected under the null hypothesis) is
+    preserved, allowing `scipy.stats.logrank` to offer one-sided alternative
+    hypotheses.
+
+    References
+    ----------
+    .. [1] Mantel N. "Evaluation of survival data and two new rank order
+           statistics arising in its consideration."
+           Cancer Chemotherapy Reports, 50(3):163-170, PMID: 5910392, 1966
+    .. [2] Bland, Altman, "The logrank test", BMJ, 328:1073,
+           :doi:`10.1136/bmj.328.7447.1073`, 2004
+    .. [3] "Logrank test", Wikipedia,
+           https://en.wikipedia.org/wiki/Logrank_test
+    .. [4] Brown, Mark. "On the choice of variance for the log rank test."
+           Biometrika 71.1 (1984): 65-74.
+    .. [5] Klein, John P., and Melvin L. Moeschberger. Survival analysis:
+           techniques for censored and truncated data. Vol. 1230. New York:
+           Springer, 2003.
+
+    Examples
+    --------
+    Reference [2]_ compared the survival times of patients with two different
+    types of recurrent malignant gliomas. The samples below record the time
+    (number of weeks) for which each patient participated in the study. The
+    `scipy.stats.CensoredData` class is used because the data is
+    right-censored: the uncensored observations correspond with observed deaths
+    whereas the censored observations correspond with the patient leaving the
+    study for another reason.
+
+    >>> from scipy import stats
+    >>> x = stats.CensoredData(
+    ...     uncensored=[6, 13, 21, 30, 37, 38, 49, 50,
+    ...                 63, 79, 86, 98, 202, 219],
+    ...     right=[31, 47, 80, 82, 82, 149]
+    ... )
+    >>> y = stats.CensoredData(
+    ...     uncensored=[10, 10, 12, 13, 14, 15, 16, 17, 18, 20, 24, 24,
+    ...                 25, 28,30, 33, 35, 37, 40, 40, 46, 48, 76, 81,
+    ...                 82, 91, 112, 181],
+    ...     right=[34, 40, 70]
+    ... )
+
+    We can calculate and visualize the empirical survival functions
+    of both groups as follows.
+
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> ax = plt.subplot()
+    >>> ecdf_x = stats.ecdf(x)
+    >>> ecdf_x.sf.plot(ax, label='Astrocytoma')
+    >>> ecdf_y = stats.ecdf(y)
+    >>> ecdf_y.sf.plot(ax, label='Glioblastoma')
+    >>> ax.set_xlabel('Time to death (weeks)')
+    >>> ax.set_ylabel('Empirical SF')
+    >>> plt.legend()
+    >>> plt.show()
+
+    Visual inspection of the empirical survival functions suggests that the
+    survival times tend to be different between the two groups. To formally
+    assess whether the difference is significant at the 1% level, we use the
+    logrank test.
+
+    >>> res = stats.logrank(x=x, y=y)
+    >>> res.statistic
+    -2.73799
+    >>> res.pvalue
+    0.00618
+
+    The p-value is less than 1%, so we can consider the data to be evidence
+    against the null hypothesis in favor of the alternative that there is a
+    difference between the two survival functions.
+
+    """
+    # Input validation. `alternative` IV handled in `_get_pvalue` below.
+    x = _iv_CensoredData(sample=x, param_name='x')
+    y = _iv_CensoredData(sample=y, param_name='y')
+
+    # Combined sample. (Under H0, the two groups are identical.)
+    xy = CensoredData(
+        uncensored=np.concatenate((x._uncensored, y._uncensored)),
+        right=np.concatenate((x._right, y._right))
+    )
+
+    # Extract data from the combined sample
+    res = ecdf(xy)
+    # Keep only the unique times at which at least one death was observed;
+    # times with losses only are dropped.
+    idx = res.sf._d.astype(bool)  # indices of observed events
+    times_xy = res.sf.quantiles[idx]  # unique times of observed events
+    at_risk_xy = res.sf._n[idx]  # combined number of subjects at risk
+    deaths_xy = res.sf._d[idx]  # combined number of events
+
+    # Get the number at risk within each sample.
+    # First compute the number at risk in group X at each of the `times_xy`.
+    # Could use `interpolate_1d`, but this is more compact.
+    res_x = ecdf(x)
+    # For each event time, the index of the first X time that is not smaller
+    # than it; an index past the end selects the appended 0.
+    i = np.searchsorted(res_x.sf.quantiles, times_xy)
+    at_risk_x = np.append(res_x.sf._n, 0)[i]  # 0 at risk after last time
+    # Subtract from the combined number at risk to get number at risk in Y
+    at_risk_y = at_risk_xy - at_risk_x
+
+    # Compute the variance.
+    num = at_risk_x * at_risk_y * deaths_xy * (at_risk_xy - deaths_xy)
+    den = at_risk_xy**2 * (at_risk_xy - 1)
+    # Note: when `at_risk_xy == 1`, we would have `at_risk_xy - 1 == 0` in the
+    # numerator and denominator. Simplifying the fraction symbolically, we
+    # would always find the overall quotient to be zero, so don't compute it.
+    i = at_risk_xy > 1
+    sum_var = np.sum(num[i]/den[i])
+
+    # Get the observed and expected number of deaths in group X
+    n_died_x = x._uncensored.size
+    sum_exp_deaths_x = np.sum(at_risk_x * (deaths_xy/at_risk_xy))
+
+    # Compute the statistic. This is the square root of that in references.
+    statistic = (n_died_x - sum_exp_deaths_x)/np.sqrt(sum_var)
+
+    # Equivalent to chi2(df=1).sf(statistic**2) when alternative='two-sided'
+    norm = stats._stats_py._SimpleNormal()
+    pvalue = stats._stats_py._get_pvalue(statistic, norm, alternative, xp=np)
+
+    # Indexing with `[()]` presumably unwraps 0-d results into scalars.
+    return LogRankResult(statistic=statistic[()], pvalue=pvalue[()])
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_tukeylambda_stats.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_tukeylambda_stats.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3cda54fa2801f7659419b3c9c68d15ed80fc13e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_tukeylambda_stats.py
@@ -0,0 +1,199 @@
+import numpy as np
+from numpy import poly1d
+from scipy.special import beta
+
+
+# The following code was used to generate the Pade coefficients for the
+# Tukey Lambda variance function.  Version 0.17 of mpmath was used.
+#---------------------------------------------------------------------------
+# import mpmath as mp
+#
+# mp.mp.dps = 60
+#
+# one   = mp.mpf(1)
+# two   = mp.mpf(2)
+# three = mp.mpf(3)
+#
+# def mpvar(lam):
+#     if lam == 0:
+#         v = mp.pi**2 / three
+#     else:
+#         v = (two / lam**2) * (one / (one + two*lam) -
+#                               mp.beta(lam + one, lam + one))
+#     return v
+#
+# t = mp.taylor(mpvar, 0, 8)
+# p, q = mp.pade(t, 4, 4)
+# print("p =", [mp.fp.mpf(c) for c in p])
+# print("q =", [mp.fp.mpf(c) for c in q])
+#---------------------------------------------------------------------------
+
+# Pade coefficients for the Tukey Lambda variance function, numerator (pc)
+# and denominator (qc). They are listed from the constant term upwards: the
+# ratio of the leading entries, 3.2898.../1.0, is pi**2/3, the variance at
+# lambda = 0.
+_tukeylambda_var_pc = [3.289868133696453, 0.7306125098871127,
+                       -0.5370742306855439, 0.17292046290190008,
+                       -0.02371146284628187]
+_tukeylambda_var_qc = [1.0, 3.683605511659861, 4.184152498888124,
+                       1.7660926747377275, 0.2643989311168465]
+
+# numpy.poly1d instances for the numerator and denominator of the
+# Pade approximation to the Tukey Lambda variance.
+# `poly1d` takes coefficients from the highest power down, hence `[::-1]`.
+_tukeylambda_var_p = poly1d(_tukeylambda_var_pc[::-1])
+_tukeylambda_var_q = poly1d(_tukeylambda_var_qc[::-1])
+
+
+def tukeylambda_variance(lam):
+    """Variance of the Tukey Lambda distribution.
+
+    Parameters
+    ----------
+    lam : array_like
+        The lambda values at which to compute the variance.
+
+    Returns
+    -------
+    v : ndarray
+        The variance, with the same shape as `lam`.  For lam < -0.5, the
+        variance is not defined, so np.nan is returned.  For lam = -0.5,
+        np.inf is returned.
+
+    Notes
+    -----
+    In an interval around lambda=0, this function uses the [4,4] Pade
+    approximation to compute the variance.  Otherwise it uses the standard
+    formula (https://en.wikipedia.org/wiki/Tukey_lambda_distribution).  The
+    Pade approximation is used because the standard formula has a removable
+    discontinuity at lambda = 0, and does not produce accurate numerical
+    results near lambda = 0.
+    """
+    lam = np.asarray(lam)
+    # Remember the input shape so it can be restored on the result.
+    shp = lam.shape
+    lam = np.atleast_1d(lam).astype(np.float64)
+
+    # For absolute values of lam less than threshold, use the Pade
+    # approximation.
+    threshold = 0.075
+
+    # Play games with masks to implement the conditional evaluation of
+    # the distribution.
+    # lambda < -0.5:  var = nan
+    low_mask = lam < -0.5
+    # lambda == -0.5: var = inf
+    neghalf_mask = lam == -0.5
+    # abs(lambda) < threshold:  use Pade approximation
+    small_mask = np.abs(lam) < threshold
+    # else the "regular" case:  use the explicit formula.
+    reg_mask = ~(low_mask | neghalf_mask | small_mask)
+
+    # Get the 'lam' values for the cases where they are needed.
+    small = lam[small_mask]
+    reg = lam[reg_mask]
+
+    # Compute the function for each case.
+    v = np.empty_like(lam)
+    v[low_mask] = np.nan
+    v[neghalf_mask] = np.inf
+    if small.size > 0:
+        # Use the Pade approximation near lambda = 0.
+        v[small_mask] = _tukeylambda_var_p(small) / _tukeylambda_var_q(small)
+    if reg.size > 0:
+        # Explicit formula away from lambda = 0.
+        v[reg_mask] = (2.0 / reg**2) * (1.0 / (1.0 + 2 * reg) -
+                                        beta(reg + 1, reg + 1))
+    # Restore the input shape, so a scalar `lam` yields a 0-d array.
+    v = v.reshape(shp)
+    return v
+
+
+# The following code was used to generate the Pade coefficients for the
+# Tukey Lambda kurtosis function.  Version 0.17 of mpmath was used.
+#---------------------------------------------------------------------------
+# import mpmath as mp
+#
+# mp.mp.dps = 60
+#
+# one   = mp.mpf(1)
+# two   = mp.mpf(2)
+# three = mp.mpf(3)
+# four  = mp.mpf(4)
+#
+# def mpkurt(lam):
+#     if lam == 0:
+#         k = mp.mpf(6)/5
+#     else:
+#         numer = (one/(four*lam+one) - four*mp.beta(three*lam+one, lam+one) +
+#                  three*mp.beta(two*lam+one, two*lam+one))
+#         denom = two*(one/(two*lam+one) - mp.beta(lam+one,lam+one))**2
+#         k = numer / denom - three
+#     return k
+#
+# # There is a bug in mpmath 0.17: when we use the 'method' keyword of the
+# # taylor function and we request a degree 9 Taylor polynomial, we actually
+# # get degree 8.
+# t = mp.taylor(mpkurt, 0, 9, method='quad', radius=0.01)
+# t = [mp.chop(c, tol=1e-15) for c in t]
+# p, q = mp.pade(t, 4, 4)
+# print("p =", [mp.fp.mpf(c) for c in p])
+# print("q =", [mp.fp.mpf(c) for c in q])
+#---------------------------------------------------------------------------
+
+# Pade coefficients for the Tukey Lambda kurtosis function, numerator (pc)
+# and denominator (qc). They are listed from the constant term upwards: the
+# ratio of the leading entries, 1.2/1.0, is 6/5, the value of `mpkurt` at
+# lambda = 0.
+_tukeylambda_kurt_pc = [1.2, -5.853465139719495, -22.653447381131077,
+                        0.20601184383406815, 4.59796302262789]
+_tukeylambda_kurt_qc = [1.0, 7.171149192233599, 12.96663094361842,
+                        0.43075235247853005, -2.789746758009912]
+
+# numpy.poly1d instances for the numerator and denominator of the
+# Pade approximation to the Tukey Lambda kurtosis.
+# `poly1d` takes coefficients from the highest power down, hence `[::-1]`.
+_tukeylambda_kurt_p = poly1d(_tukeylambda_kurt_pc[::-1])
+_tukeylambda_kurt_q = poly1d(_tukeylambda_kurt_qc[::-1])
+
+
+def tukeylambda_kurtosis(lam):
+    """Kurtosis of the Tukey Lambda distribution.
+
+    Parameters
+    ----------
+    lam : array_like
+        The lambda values at which to compute the kurtosis.
+
+    Returns
+    -------
+    k : ndarray
+        The kurtosis, with the same shape as `lam`.  The explicit formula
+        subtracts 3, i.e. this is the excess kurtosis.  For lam < -0.25, the
+        kurtosis is not defined, so np.nan is returned.  For lam = -0.25,
+        np.inf is returned.
+
+    Notes
+    -----
+    For abs(lam) below a small threshold, a [4,4] Pade approximation is
+    used; otherwise the explicit formula is evaluated.
+
+    """
+    lam = np.asarray(lam)
+    # Remember the input shape so it can be restored on the result.
+    shp = lam.shape
+    lam = np.atleast_1d(lam).astype(np.float64)
+
+    # For absolute values of lam less than threshold, use the Pade
+    # approximation.
+    threshold = 0.055
+
+    # Use masks to implement the conditional evaluation of the kurtosis.
+    # lambda < -0.25:  kurtosis = nan
+    low_mask = lam < -0.25
+    # lambda == -0.25: kurtosis = inf
+    negqrtr_mask = lam == -0.25
+    # lambda near 0:  use Pade approximation
+    small_mask = np.abs(lam) < threshold
+    # else the "regular" case:  use the explicit formula.
+    reg_mask = ~(low_mask | negqrtr_mask | small_mask)
+
+    # Get the 'lam' values for the cases where they are needed.
+    small = lam[small_mask]
+    reg = lam[reg_mask]
+
+    # Compute the function for each case.
+    k = np.empty_like(lam)
+    k[low_mask] = np.nan
+    k[negqrtr_mask] = np.inf
+    if small.size > 0:
+        # Use the Pade approximation near lambda = 0.
+        k[small_mask] = _tukeylambda_kurt_p(small) / _tukeylambda_kurt_q(small)
+    if reg.size > 0:
+        # Explicit formula away from lambda = 0.
+        numer = (1.0 / (4 * reg + 1) - 4 * beta(3 * reg + 1, reg + 1) +
+                 3 * beta(2 * reg + 1, 2 * reg + 1))
+        denom = 2 * (1.0/(2 * reg + 1) - beta(reg + 1, reg + 1))**2
+        k[reg_mask] = numer / denom - 3
+
+    # The return value will be a numpy array; resetting the shape ensures that
+    # if `lam` was a scalar, the return value is a 0-d array.
+    k = k.reshape(shp)
+    return k
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_variation.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_variation.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca9e903e1339763207929b88b66717252fef058e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_variation.py
@@ -0,0 +1,132 @@
+import warnings
+import numpy as np
+
+from scipy._lib._array_api import (
+    array_namespace,
+    xp_capabilities,
+    xp_device,
+    _length_nonmasked,
+)
+import scipy._lib.array_api_extra as xpx
+
+from ._axis_nan_policy import _axis_nan_policy_factory
+
+
+@xp_capabilities()
+@_axis_nan_policy_factory(
+    lambda x: x, n_outputs=1, result_to_tuple=lambda x, _: (x,)
+)
+def variation(a, axis=0, nan_policy='propagate', ddof=0, *, keepdims=False):
+    """
+    Compute the coefficient of variation.
+
+    The coefficient of variation is the standard deviation divided by the
+    mean.  This function is equivalent to::
+
+        np.std(x, axis=axis, ddof=ddof) / np.mean(x, axis=axis)
+
+    The default for ``ddof`` is 0, but many definitions of the coefficient
+    of variation use the square root of the unbiased sample variance
+    for the sample standard deviation, which corresponds to ``ddof=1``.
+
+    The function does not take the absolute value of the mean of the data,
+    so the return value is negative if the mean is negative.
+
+    Parameters
+    ----------
+    a : array_like
+        Input array.
+    axis : int or None, optional
+        Axis along which to calculate the coefficient of variation.
+        Default is 0. If None, compute over the whole array `a`.
+    nan_policy : {'propagate', 'raise', 'omit'}, optional
+        Defines how to handle when input contains ``nan``.
+        The following options are available:
+
+          * 'propagate': return ``nan``
+          * 'raise': raise an exception
+          * 'omit': perform the calculation with ``nan`` values omitted
+
+        The default is 'propagate'.
+    ddof : int, optional
+        Gives the "Delta Degrees Of Freedom" used when computing the
+        standard deviation.  The divisor used in the calculation of the
+        standard deviation is ``N - ddof``, where ``N`` is the number of
+        elements.  `ddof` must be less than ``N``; if it isn't, the result
+        will be ``nan`` or ``inf``, depending on ``N`` and the values in
+        the array.  By default `ddof` is zero for backwards compatibility,
+        but it is recommended to use ``ddof=1`` to ensure that the sample
+        standard deviation is computed as the square root of the unbiased
+        sample variance.
+
+    Returns
+    -------
+    variation : ndarray
+        The calculated variation along the requested axis.
+
+    Notes
+    -----
+    There are several edge cases that are handled without generating a
+    warning:
+
+    * If both the mean and the standard deviation are zero, ``nan``
+      is returned.
+    * If the mean is zero and the standard deviation is nonzero, ``inf``
+      is returned.
+    * If the input has length zero (either because the array has zero
+      length, or all the input values are ``nan`` and ``nan_policy`` is
+      ``'omit'``), ``nan`` is returned.
+    * If the input contains ``inf``, ``nan`` is returned.
+
+    References
+    ----------
+    .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
+       Probability and Statistics Tables and Formulae. Chapman & Hall: New
+       York. 2000.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats import variation
+    >>> variation([1, 2, 3, 4, 5], ddof=1)
+    0.5270462766947299
+
+    Compute the variation along a given dimension of an array that contains
+    a few ``nan`` values:
+
+    >>> x = np.array([[  10.0, np.nan, 11.0, 19.0, 23.0, 29.0, 98.0],
+    ...               [  29.0,   30.0, 32.0, 33.0, 35.0, 56.0, 57.0],
+    ...               [np.nan, np.nan, 12.0, 13.0, 16.0, 16.0, 17.0]])
+    >>> variation(x, axis=1, ddof=1, nan_policy='omit')
+    array([1.05109361, 0.31428986, 0.146483  ])
+
+    """
+    xp = array_namespace(a)
+    a = xp.asarray(a)
+
+    # `nan_policy` and `keepdims` are handled by `_axis_nan_policy_factory`
+    if axis is None:
+        a = xp.reshape(a, (-1,))
+        axis = 0
+
+    n = xp.asarray(_length_nonmasked(a, axis=axis), dtype=a.dtype, device=xp_device(a))
+
+    with (np.errstate(divide='ignore', invalid='ignore'), warnings.catch_warnings()):
+        warnings.simplefilter("ignore")
+        mean_a = xp.mean(a, axis=axis)
+        std_a = xp.std(a, axis=axis)
+        correction = (n / (n - ddof))**0.5  # we may need uncorrected std below
+        result = std_a * correction / mean_a
+
+    def special_case(std_a, mean_a):
+        # xref data-apis/array-api-extra#196
+        mxp = array_namespace(std_a, mean_a)
+        # `_xp_inf` is a workaround for torch.copysign not accepting a scalar yet,
+        # xref data-apis/array-api-compat#271
+        _xp_inf = mxp.asarray(mxp.inf, dtype=mean_a.dtype, device=xp_device(mean_a))
+        return mxp.where(std_a > 0, mxp.copysign(_xp_inf, mean_a), mxp.nan)
+
+    result = xpx.apply_where((ddof == n), (std_a, mean_a),
+                             special_case, fill_value=result)
+
+    return result[()] if result.ndim == 0 else result
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_warnings_errors.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_warnings_errors.py
new file mode 100644
index 0000000000000000000000000000000000000000..38385b862c9d642b41af8d74279f98c6a427208a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_warnings_errors.py
@@ -0,0 +1,38 @@
+# Warnings
+
+
+class DegenerateDataWarning(RuntimeWarning):
+    """Warns when data is degenerate and results may not be reliable."""
+    def __init__(self, msg=None):
+        if msg is None:
+            msg = ("Degenerate data encountered; results may not be reliable.")
+        self.args = (msg,)
+
+
+class ConstantInputWarning(DegenerateDataWarning):
+    """Warns when all values in data are exactly equal."""
+    def __init__(self, msg=None):
+        if msg is None:
+            msg = ("All values in data are exactly equal; "
+                   "results may not be reliable.")
+        self.args = (msg,)
+
+
+class NearConstantInputWarning(DegenerateDataWarning):
+    """Warns when all values in data are nearly equal."""
+    def __init__(self, msg=None):
+        if msg is None:
+            msg = ("All values in data are nearly equal; "
+                   "results may not be reliable.")
+        self.args = (msg,)
+
+
+# Errors
+
+
+class FitError(RuntimeError):
+    """Represents an error condition when fitting a distribution to data."""
+    def __init__(self, msg=None):
+        if msg is None:
+            msg = ("An error occurred when fitting a distribution to data.")
+        self.args = (msg,)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_wilcoxon.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_wilcoxon.py
new file mode 100644
index 0000000000000000000000000000000000000000..7869cf23bccece6694ff15d10ee5e7fa846e717f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/_wilcoxon.py
@@ -0,0 +1,272 @@
+import numpy as np
+
+from scipy import stats
+from ._stats_py import _get_pvalue, _rankdata, _SimpleNormal
+from . import _morestats
+from ._axis_nan_policy import _broadcast_arrays
+from ._hypotests import _get_wilcoxon_distr
+from scipy._lib._util import _get_nan
+from scipy._lib._array_api import array_namespace, xp_promote, xp_size
+import scipy._lib.array_api_extra as xpx
+
+
+class WilcoxonDistribution:
+
+    def __init__(self, n):
+        n = np.asarray(n).astype(int, copy=False)
+        self.n = n
+        self._dists = {ni: _get_wilcoxon_distr(ni) for ni in np.unique(n)}
+
+    def _cdf1(self, k, n):
+        pmfs = self._dists[n]
+        return pmfs[:k + 1].sum()
+
+    def _cdf(self, k, n):
+        return np.vectorize(self._cdf1, otypes=[float])(k, n)
+
+    def _sf1(self, k, n):
+        pmfs = self._dists[n]
+        return pmfs[k:].sum()
+
+    def _sf(self, k, n):
+        return np.vectorize(self._sf1, otypes=[float])(k, n)
+
+    def mean(self):
+        return self.n * (self.n + 1) / 4
+
+    def _prep(self, k):
+        k = np.asarray(k).astype(int, copy=False)
+        mn = self.mean()
+        out = np.empty(k.shape, dtype=np.float64)
+        return k, mn, out
+
+    def cdf(self, k):
+        k, mn, out = self._prep(k)
+        return xpx.apply_where(
+            k <= mn, (k, self.n),
+            self._cdf,
+            lambda k, n: 1 - self._sf(k+1, n))[()]
+
+    def sf(self, k):
+        k, mn, out = self._prep(k)
+        return xpx.apply_where(
+            k <= mn, (k, self.n),
+            self._sf,
+            lambda k, n: 1 - self._cdf(k-1, n))[()]
+
+
+def _wilcoxon_iv(x, y, zero_method, correction, alternative, method, axis):
+    xp = array_namespace(x, y)
+    x, y = xp_promote(x, y, force_floating=True, xp=xp)
+
+    axis = np.asarray(axis)[()]  # OK to use NumPy for input validation
+    message = "`axis` must be an integer."
+    if not np.issubdtype(axis.dtype, np.integer) or axis.ndim != 0:
+        raise ValueError(message)
+    axis = int(axis)
+
+    message = '`axis` must be compatible with the shape(s) of `x` (and `y`)'
+    AxisError = getattr(np, 'AxisError', None) or np.exceptions.AxisError
+    try:
+        if y is None:
+            d = x
+        else:
+            x, y = _broadcast_arrays((x, y), axis=axis, xp=xp)
+            d = x - y
+        d = xp.moveaxis(d, axis, -1)
+    except AxisError as e:
+        raise AxisError(message) from e
+
+    message = "`x` and `y` must have the same length along `axis`."
+    if y is not None and x.shape[axis] != y.shape[axis]:
+        raise ValueError(message)
+
+    message = "`x` (and `y`, if provided) must be an array of real numbers."
+    if not xp.isdtype(d.dtype, "real floating"):
+        raise ValueError(message)
+
+    zero_method = str(zero_method).lower()
+    zero_methods = {"wilcox", "pratt", "zsplit"}
+    message = f"`zero_method` must be one of {zero_methods}."
+    if zero_method not in zero_methods:
+        raise ValueError(message)
+
+    corrections = {True, False}
+    message = f"`correction` must be one of {corrections}."
+    if correction not in corrections:
+        raise ValueError(message)
+
+    alternative = str(alternative).lower()
+    alternatives = {"two-sided", "less", "greater"}
+    message = f"`alternative` must be one of {alternatives}."
+    if alternative not in alternatives:
+        raise ValueError(message)
+
+    if not isinstance(method, stats.PermutationMethod):
+        methods = {"auto", "asymptotic", "exact"}
+        message = (f"`method` must be one of {methods} or "
+                   "an instance of `stats.PermutationMethod`.")
+        if method not in methods:
+            raise ValueError(message)
+    output_z = True if method == 'asymptotic' else False
+
+    # For small samples, we decide later whether to perform an exact test or a
+    # permutation test. The reason is that the presence of ties is not
+    # known at the input validation stage.
+    n_zero = xp.count_nonzero(d == 0, axis=None)
+    if method == "auto" and d.shape[-1] > 50:
+        method = "asymptotic"
+
+    return d, zero_method, correction, alternative, method, axis, output_z, n_zero, xp
+
+
+def _wilcoxon_statistic(d, method, zero_method='wilcox', *, xp):
+    dtype = d.dtype
+    i_zeros = (d == 0)
+
+    if zero_method == 'wilcox':
+        # Wilcoxon's method for treating zeros was to remove them from
+        # the calculation. We do this by replacing 0s with NaNs, which
+        # are ignored anyway.
+        # Copy required for array-api-strict. See data-apis/array-api-extra#506.
+        d = xpx.at(d)[i_zeros].set(xp.nan, copy=True)
+
+    i_nan = xp.isnan(d)
+    n_nan = xp.count_nonzero(i_nan, axis=-1)
+    count = xp.astype(d.shape[-1] - n_nan, dtype)
+
+    r, t = _rankdata(xp.abs(d), 'average', return_ties=True, xp=xp)
+    r, t = xp.astype(r, dtype, copy=False), xp.astype(t, dtype, copy=False)
+
+    r_plus = xp.sum(xp.astype(d > 0, dtype) * r, axis=-1)
+    r_minus = xp.sum(xp.astype(d < 0, dtype) * r, axis=-1)
+
+    has_ties = xp.any(t == 0)
+
+    if zero_method == "zsplit":
+        # The "zero-split" method for treating zeros is to add half their contribution
+        # to r_plus and half to r_minus.
+        # See gh-2263 for the origin of this method.
+        r_zero_2 = xp.sum(xp.astype(i_zeros, dtype) * r, axis=-1) / 2
+        r_plus = xpx.at(r_plus)[...].add(r_zero_2)
+        r_minus = xpx.at(r_minus)[...].add(r_zero_2)
+
+    mn = count * (count + 1.) * 0.25
+    se = count * (count + 1.) * (2. * count + 1.)
+
+    if zero_method == "pratt":
+        # Pratt's method for treating zeros was just to modify the z-statistic.
+
+        # normal approximation needs to be adjusted, see Cureton (1967)
+        n_zero = xp.astype(xp.count_nonzero(i_zeros, axis=-1), dtype)
+        mn = xpx.at(mn)[...].subtract(n_zero * (n_zero + 1.) * 0.25)
+        se = xpx.at(se)[...].subtract(n_zero * (n_zero + 1.) * (2. * n_zero + 1.))
+
+        # zeros are not to be included in tie-correction.
+        # any tie counts corresponding with zeros are in the 0th column
+        # t[xp.any(i_zeros, axis=-1), 0] = 0
+        t_i_zeros = xp.zeros_like(i_zeros)
+        t_i_zeros = xpx.at(t_i_zeros)[..., 0].set(xp.any(i_zeros, axis=-1))
+        t = xpx.at(t)[t_i_zeros].set(0.)
+
+    tie_correct = xp.sum(t**3 - t, axis=-1)
+    se = xp.sqrt((se - tie_correct/2) / 24)
+
+    # se = 0 means that no non-zero values are left in d. We only need z
+    # if method is asymptotic. However, if method="auto", the switch to
+    # asymptotic might only happen after the statistic is calculated, so z
+    # needs to be computed. In all other cases, avoid a division-by-zero
+    # warning (z is not needed anyway).
+    if method in ["asymptotic", "auto"]:
+        z = (r_plus - mn) / se
+    else:
+        z = xp.nan
+
+    return r_plus, r_minus, se, z, count, has_ties
+
+
+def _correction_sign(z, alternative, xp):
+    if alternative == 'greater':
+        return 1
+    elif alternative == 'less':
+        return -1
+    else:
+        return xp.sign(z)
+
+
+def _wilcoxon_nd(x, y=None, zero_method='wilcox', correction=True,
+                 alternative='two-sided', method='auto', axis=0):
+
+    temp = _wilcoxon_iv(x, y, zero_method, correction, alternative, method, axis)
+    d, zero_method, correction, alternative, method, axis, output_z, n_zero, xp = temp
+
+    if xp_size(d) == 0:
+        NaN = _get_nan(d, xp=xp)
+        res = _morestats.WilcoxonResult(statistic=NaN, pvalue=NaN)
+        if method == 'asymptotic':
+            res.zstatistic = NaN
+        return res
+
+    r_plus, r_minus, se, z, count, has_ties = _wilcoxon_statistic(
+        d, method, zero_method, xp=xp
+    )
+
+    # we only know if there are ties after computing the statistic and not
+    # at the input validation stage. if the original method was auto and
+    # the decision was to use an exact test, we override this to
+    # a permutation test now (since method='exact' is not exact in the
+    # presence of ties)
+    if method == "auto":
+        if not (has_ties or n_zero > 0):
+            method = "exact"
+        elif d.shape[-1] <= 13:
+            # the possible outcomes to be simulated by the permutation test
+            # are 2**n, where n is the sample size.
+            # if n <= 13, the p-value is deterministic since 2**13 is less
+            # than 9999, the default number of n_resamples
+            method = stats.PermutationMethod()
+        else:
+            # if there are ties and the sample size is too large to
+            # run a deterministic permutation test, fall back to asymptotic
+            method = "asymptotic"
+
+    if method == 'asymptotic':
+        if correction:
+            sign = _correction_sign(z, alternative, xp=xp)
+            z = xpx.at(z)[...].subtract(sign * 0.5 / se)
+        p = _get_pvalue(z, _SimpleNormal(), alternative, xp=xp)
+    elif method == 'exact':
+        dist = WilcoxonDistribution(count)
+        # The null distribution in `dist` is exact only if there are no ties
+        # or zeros. If there are ties or zeros, the statistic can be non-
+        # integral, but the null distribution is only defined for integral
+        # values of the statistic. Therefore, we're conservative: round
+        # non-integral statistic up before computing CDF and down before
+        # computing SF. This preserves symmetry w.r.t. alternatives and
+        # order of the input arguments. See gh-19872.
+        r_plus_np = np.asarray(r_plus)
+        if alternative == 'less':
+            p = dist.cdf(np.ceil(r_plus_np))
+        elif alternative == 'greater':
+            p = dist.sf(np.floor(r_plus_np))
+        else:
+            p = 2 * np.minimum(dist.sf(np.floor(r_plus_np)),
+                               dist.cdf(np.ceil(r_plus_np)))
+            p = np.clip(p, 0, 1)
+        p = xp.asarray(p, dtype=d.dtype)
+    else:  # `PermutationMethod` instance (already validated)
+        p = stats.permutation_test(
+            (d,), lambda d: _wilcoxon_statistic(d, method, zero_method, xp=xp)[0],
+            permutation_type='samples', **method._asdict(),
+            alternative=alternative, axis=-1).pvalue
+
+    # for backward compatibility...
+    statistic = xp.minimum(r_plus, r_minus) if alternative=='two-sided' else r_plus
+    z = -xp.abs(z) if (alternative == 'two-sided' and method == 'asymptotic') else z
+
+    statistic = statistic[()] if statistic.ndim == 0 else statistic
+    p = p[()] if p.ndim == 0 else p
+    res = _morestats.WilcoxonResult(statistic=statistic, pvalue=p)
+    if output_z:
+        res.zstatistic = z[()] if z.ndim == 0 else z
+    return res
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/biasedurn.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/biasedurn.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b1c9f1cf2b3b39acdcaeda97a90e8c11c589d89
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/biasedurn.py
@@ -0,0 +1,16 @@
+# This file is not meant for public use and will be removed in SciPy v2.0.0.
+
+from scipy._lib.deprecation import _sub_module_deprecation
+
+
+__all__: list[str] = []
+
+
+def __dir__():
+    return __all__
+
+
+def __getattr__(name):
+    return _sub_module_deprecation(sub_package="stats", module="biasedurn",
+                                   private_modules=["_biasedurn"], all=__all__,
+                                   attribute=name)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/contingency.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/contingency.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f160e37340098f50bf8b7285e426c2c00672077
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/contingency.py
@@ -0,0 +1,526 @@
+"""
+Contingency table functions (:mod:`scipy.stats.contingency`)
+============================================================
+
+Functions for creating and analyzing contingency tables.
+
+.. currentmodule:: scipy.stats.contingency
+
+.. autosummary::
+   :toctree: generated/
+
+   chi2_contingency
+   relative_risk
+   odds_ratio
+   crosstab
+   association
+
+   expected_freq
+   margins
+
+"""
+
+
+from functools import reduce
+import math
+import numpy as np
+from ._stats_py import power_divergence, _untabulate
+from ._relative_risk import relative_risk
+from ._crosstab import crosstab
+from ._odds_ratio import odds_ratio
+from scipy._lib._array_api import xp_capabilities
+from scipy._lib._bunch import _make_tuple_bunch
+from scipy import stats
+
+
+__all__ = ['margins', 'expected_freq', 'chi2_contingency', 'crosstab',
+           'association', 'relative_risk', 'odds_ratio']
+
+
+@xp_capabilities(np_only=True)
+def margins(a):
+    """Return a list of the marginal sums of the array `a`.
+
+    Parameters
+    ----------
+    a : ndarray
+        The array for which to compute the marginal sums.
+
+    Returns
+    -------
+    margsums : list of ndarrays
+        A list of length `a.ndim`.  `margsums[k]` is the result
+        of summing `a` over all axes except `k`; it has the same
+        number of dimensions as `a`, but the length of each axis
+        except axis `k` will be 1.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats.contingency import margins
+
+    >>> a = np.arange(12).reshape(2, 6)
+    >>> a
+    array([[ 0,  1,  2,  3,  4,  5],
+           [ 6,  7,  8,  9, 10, 11]])
+    >>> m0, m1 = margins(a)
+    >>> m0
+    array([[15],
+           [51]])
+    >>> m1
+    array([[ 6,  8, 10, 12, 14, 16]])
+
+    >>> b = np.arange(24).reshape(2,3,4)
+    >>> m0, m1, m2 = margins(b)
+    >>> m0
+    array([[[ 66]],
+           [[210]]])
+    >>> m1
+    array([[[ 60],
+            [ 92],
+            [124]]])
+    >>> m2
+    array([[[60, 66, 72, 78]]])
+    """
+    margsums = []
+    ranged = list(range(a.ndim))
+    for k in ranged:
+        marg = np.apply_over_axes(np.sum, a, [j for j in ranged if j != k])
+        margsums.append(marg)
+    return margsums
+
+
+@xp_capabilities(np_only=True)
+def expected_freq(observed):
+    """
+    Compute the expected frequencies from a contingency table.
+
+    Given an n-dimensional contingency table of observed frequencies,
+    compute the expected frequencies for the table based on the marginal
+    sums under the assumption that the groups associated with each
+    dimension are independent.
+
+    Parameters
+    ----------
+    observed : array_like
+        The table of observed frequencies.  (While this function can handle
+        a 1-D array, that case is trivial.  Generally `observed` is at
+        least 2-D.)
+
+    Returns
+    -------
+    expected : ndarray of float64
+        The expected frequencies, based on the marginal sums of the table.
+        Same shape as `observed`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy.stats.contingency import expected_freq
+    >>> observed = np.array([[10, 10, 20],[20, 20, 20]])
+    >>> expected_freq(observed)
+    array([[ 12.,  12.,  16.],
+           [ 18.,  18.,  24.]])
+
+    """
+    # Typically `observed` is an integer array. If `observed` has a large
+    # number of dimensions or holds large values, some of the following
+    # computations may overflow, so we first switch to floating point.
+    observed = np.asarray(observed, dtype=np.float64)
+
+    # Create a list of the marginal sums.
+    margsums = margins(observed)
+
+    # Create the array of expected frequencies.  The shapes of the
+    # marginal sums returned by apply_over_axes() are just what we
+    # need for broadcasting in the following product.
+    d = observed.ndim
+    expected = reduce(np.multiply, margsums) / observed.sum() ** (d - 1)
+    return expected
+
+
+Chi2ContingencyResult = _make_tuple_bunch(
+    'Chi2ContingencyResult',
+    ['statistic', 'pvalue', 'dof', 'expected_freq'], []
+)
+
+
+@xp_capabilities(np_only=True)
+def chi2_contingency(observed, correction=True, lambda_=None, *, method=None):
+    """Chi-square test of independence of variables in a contingency table.
+
+    This function computes the chi-square statistic and p-value for the
+    hypothesis test of independence of the observed frequencies in the
+    contingency table [1]_ `observed`.  The expected frequencies are computed
+    based on the marginal sums under the assumption of independence; see
+    `scipy.stats.contingency.expected_freq`.  The number of degrees of
+    freedom is (expressed using numpy functions and attributes)::
+
+        dof = observed.size - sum(observed.shape) + observed.ndim - 1
+
+
+    Parameters
+    ----------
+    observed : array_like
+        The contingency table. The table contains the observed frequencies
+        (i.e. number of occurrences) in each category.  In the two-dimensional
+        case, the table is often described as an "R x C table".
+    correction : bool, optional
+        If True, *and* the degrees of freedom is 1, apply Yates' correction
+        for continuity.  The effect of the correction is to adjust each
+        observed value by 0.5 towards the corresponding expected value.
+    lambda_ : float or str, optional
+        By default, the statistic computed in this test is Pearson's
+        chi-squared statistic [2]_.  `lambda_` allows a statistic from the
+        Cressie-Read power divergence family [3]_ to be used instead.  See
+        `scipy.stats.power_divergence` for details.
+    method : ResamplingMethod, optional
+        Defines the method used to compute the p-value. Compatible only with
+        `correction=False`, the default `lambda_`, and two-way tables.
+        If `method` is an instance of `PermutationMethod`/`MonteCarloMethod`,
+        the p-value is computed using
+        `scipy.stats.permutation_test`/`scipy.stats.monte_carlo_test` with the
+        provided configuration options and other appropriate settings.
+        Otherwise, the p-value is computed as documented in the notes.
+        Note that if `method` is an instance of `MonteCarloMethod`, the ``rvs``
+        attribute must be left unspecified; Monte Carlo samples are always drawn
+        using the ``rvs`` method of `scipy.stats.random_table`.
+
+        .. versionadded:: 1.15.0
+
+
+    Returns
+    -------
+    res : Chi2ContingencyResult
+        An object containing attributes:
+
+        statistic : float
+            The test statistic.
+        pvalue : float
+            The p-value of the test.
+        dof : int
+            The degrees of freedom. NaN if `method` is not ``None``.
+        expected_freq : ndarray, same shape as `observed`
+            The expected frequencies, based on the marginal sums of the table.
+
+    See Also
+    --------
+    scipy.stats.contingency.expected_freq
+    scipy.stats.fisher_exact
+    scipy.stats.chisquare
+    scipy.stats.power_divergence
+    scipy.stats.barnard_exact
+    scipy.stats.boschloo_exact
+    :ref:`hypothesis_chi2_contingency` : Extended example
+
+    Notes
+    -----
+    An often quoted guideline for the validity of this calculation is that
+    the test should be used only if the observed and expected frequencies
+    in each cell are at least 5.
+
+    This is a test for the independence of different categories of a
+    population. The test is only meaningful when the dimension of
+    `observed` is two or more.  Applying the test to a one-dimensional
+    table will always result in `expected` equal to `observed` and a
+    chi-square statistic equal to 0.
+
+    This function does not handle masked arrays, because the calculation
+    does not make sense with missing values.
+
+    Like `scipy.stats.chisquare`, this function computes a chi-square
+    statistic; the convenience this function provides is to figure out the
+    expected frequencies and degrees of freedom from the given contingency
+    table. If these were already known, and if the Yates' correction was not
+    required, one could use `scipy.stats.chisquare`.  That is, if one calls::
+
+        res = chi2_contingency(obs, correction=False)
+
+    then the following is true::
+
+        (res.statistic, res.pvalue) == stats.chisquare(
+            obs.ravel(), f_exp=res.expected_freq.ravel(),
+            ddof=obs.size - 1 - res.dof)
+
+    The `lambda_` argument was added in version 0.13.0 of scipy.
+
+    References
+    ----------
+    .. [1] "Contingency table",
+           https://en.wikipedia.org/wiki/Contingency_table
+    .. [2] "Pearson's chi-squared test",
+           https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
+    .. [3] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
+           Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
+           pp. 440-464.
+
+    Examples
+    --------
+    A two-way example (2 x 3):
+
+    >>> import numpy as np
+    >>> from scipy.stats import chi2_contingency
+    >>> obs = np.array([[10, 10, 20], [20, 20, 20]])
+    >>> res = chi2_contingency(obs)
+    >>> res.statistic
+    2.7777777777777777
+    >>> res.pvalue
+    0.24935220877729619
+    >>> res.dof
+    2
+    >>> res.expected_freq
+    array([[ 12.,  12.,  16.],
+           [ 18.,  18.,  24.]])
+
+    Perform the test using the log-likelihood ratio (i.e. the "G-test")
+    instead of Pearson's chi-squared statistic.
+
+    >>> res = chi2_contingency(obs, lambda_="log-likelihood")
+    >>> res.statistic
+    2.7688587616781319
+    >>> res.pvalue
+    0.25046668010954165
+
+    A four-way example (2 x 2 x 2 x 2):
+
+    >>> obs = np.array(
+    ...     [[[[12, 17],
+    ...        [11, 16]],
+    ...       [[11, 12],
+    ...        [15, 16]]],
+    ...      [[[23, 15],
+    ...        [30, 22]],
+    ...       [[14, 17],
+    ...        [15, 16]]]])
+    >>> res = chi2_contingency(obs)
+    >>> res.statistic
+    8.7584514426741897
+    >>> res.pvalue
+    0.64417725029295503
+
+    When the sum of the elements in a two-way table is small, the p-value
+    produced by the default asymptotic approximation may be inaccurate.
+    Consider passing a `PermutationMethod` or `MonteCarloMethod` as the
+    `method` parameter with `correction=False`.
+
+    >>> from scipy.stats import PermutationMethod
+    >>> obs = np.asarray([[12, 3],
+    ...                   [17, 16]])
+    >>> res = chi2_contingency(obs, correction=False)
+    >>> ref = chi2_contingency(obs, correction=False, method=PermutationMethod())
+    >>> res.pvalue, ref.pvalue
+    (0.0614122539870913, 0.1074)  # may vary
+
+    For a more detailed example, see :ref:`hypothesis_chi2_contingency`.
+
+    """
+    observed = np.asarray(observed)
+    if np.any(observed < 0):
+        raise ValueError("All values in `observed` must be nonnegative.")
+    if observed.size == 0:
+        raise ValueError("No data; `observed` has size 0.")
+
+    expected = expected_freq(observed)
+    if np.any(expected == 0):
+        # Include one of the positions where expected is zero in
+        # the exception message.
+        zeropos = list(zip(*np.nonzero(expected == 0)))[0]
+        raise ValueError("The internally computed table of expected "
+                         f"frequencies has a zero element at {zeropos}.")
+
+    if method is not None:
+        return _chi2_resampling_methods(observed, expected, correction, lambda_, method)
+
+    # The degrees of freedom
+    dof = expected.size - sum(expected.shape) + expected.ndim - 1
+
+    if dof == 0:
+        # Degenerate case; this occurs when `observed` is 1D (or, more
+        # generally, when it has only one nontrivial dimension).  In this
+        # case, we also have observed == expected, so chi2 is 0.
+        chi2 = 0.0
+        p = 1.0
+    else:
+        if dof == 1 and correction:
+            # Adjust `observed` according to Yates' correction for continuity.
+            # Magnitude of correction no bigger than difference; see gh-13875
+            diff = expected - observed
+            direction = np.sign(diff)
+            magnitude = np.minimum(0.5, np.abs(diff))
+            observed = observed + magnitude * direction
+
+        chi2, p = power_divergence(observed, expected,
+                                   ddof=observed.size - 1 - dof, axis=None,
+                                   lambda_=lambda_)
+
+    return Chi2ContingencyResult(chi2, p, dof, expected)
+
+
+def _chi2_resampling_methods(observed, expected, correction, lambda_, method):
+    # Validate the options, then dispatch on the type of `method` to a resampling test.
+    if observed.ndim != 2:
+        message = 'Use of `method` is only compatible with two-way tables.'
+        raise ValueError(message)
+    # A truthy `correction` is rejected here rather than silently ignored.
+    if correction:
+        message = f'`{correction=}` is not compatible with `{method=}.`'
+        raise ValueError(message)
+    # Likewise for `lambda_`: only the default (None) is accepted on this path.
+    if lambda_ is not None:
+        message = f'`{lambda_=}` is not compatible with `{method=}.`'
+        raise ValueError(message)
+    # Anything other than the two supported configuration classes raises ValueError.
+    if isinstance(method, stats.PermutationMethod):
+        res = _chi2_permutation_method(observed, expected, method)
+    elif isinstance(method, stats.MonteCarloMethod):
+        res = _chi2_monte_carlo_method(observed, expected, method)
+    else:
+        message = (f'`{method=}` not recognized; if provided, `method` must be an '
+                   'instance of `PermutationMethod` or `MonteCarloMethod`.')
+        raise ValueError(message)
+    # The third field (degrees of freedom on the analytic path) is NaN for resampling.
+    return Chi2ContingencyResult(res.statistic, res.pvalue, np.nan, expected)
+
+
+def _chi2_permutation_method(observed, expected, method):
+    x, y = _untabulate(observed)  # NOTE(review): presumably per-observation labels
+    # `permutation_test` with `permutation_type='pairings'` permutes the order of `x`,
+    # which pairs observations in `x` with different observations in `y`.
+    def statistic(x):
+        # Cross-tabulate the resample, then sum (table - expected)**2 / expected.
+        table = crosstab(x, y)[1]
+        return np.sum((table - expected)**2/expected)
+    # Only `x` is passed as data; `y` is captured by the closure above and stays fixed.
+    return stats.permutation_test((x,), statistic, permutation_type='pairings',
+                                  alternative='greater', **method._asdict())
+
+
+def _chi2_monte_carlo_method(observed, expected, method):
+    method = method._asdict()  # work on a dict so that entries can be popped below
+    # `rvs` is built internally further down, so a user-supplied one is rejected.
+    if method.pop('rvs', None) is not None:
+        message = ('If the `method` argument of `chi2_contingency` is an '
+                   'instance of `MonteCarloMethod`, its `rvs` attribute '
+                   'must be unspecified. Use the `MonteCarloMethod` `rng` argument '
+                   'to control the random state.')
+        raise ValueError(message)
+    rng = np.random.default_rng(method.pop('rng', None))
+
+    # `random_table.rvs` produces random contingency tables with the given marginals
+    # under the null hypothesis of independence
+    rowsums, colsums = stats.contingency.margins(observed)
+    X = stats.random_table(rowsums.ravel(), colsums.ravel(), seed=rng)
+    def rvs(size):
+        n_resamples = size[0]  # the leading entry of `size` is the number of tables
+        return X.rvs(size=n_resamples).reshape(size)
+    # Flatten `expected` to line up with the raveled `observed` that is passed below.
+    expected = expected.ravel()
+    def statistic(table, axis):
+        return np.sum((table - expected)**2/expected, axis=axis)
+    # The remaining entries of `method` are forwarded unchanged as keyword arguments.
+    return stats.monte_carlo_test(observed.ravel(), rvs, statistic,
+                                  alternative='greater', **method)
+
+
+@xp_capabilities(np_only=True)
+def association(observed, method="cramer", correction=False, lambda_=None):
+    """Calculates degree of association between two nominal variables.
+
+    The function provides the option for computing one of three measures of
+    association between two nominal variables from the data given in a 2d
+    contingency table: Tschuprow's T, Pearson's Contingency Coefficient
+    and Cramer's V.
+
+    Parameters
+    ----------
+    observed : array-like
+        The 2d contingency table of observed counts; must have an integer dtype.
+    method : {"cramer", "tschuprow", "pearson"} (default = "cramer")
+        The association test statistic.
+    correction : bool, optional
+        Passed through to `scipy.stats.contingency.chi2_contingency()`
+    lambda_ : float or str, optional
+        Passed through to `scipy.stats.contingency.chi2_contingency()`
+
+    Returns
+    -------
+    statistic : float
+        Value of the test statistic
+
+    Notes
+    -----
+    Cramer's V, Tschuprow's T and Pearson's Contingency Coefficient, all
+    measure the degree to which two nominal or ordinal variables are related,
+    or the level of their association. This differs from correlation, although
+    many often mistakenly consider them equivalent. Correlation measures in
+    what way two variables are related, whereas, association measures how
+    related the variables are. As such, association does not subsume
+    independent variables, and is rather a test of independence. A value of
+    1.0 indicates perfect association, and 0.0 means the variables have no
+    association.
+
+    Both the Cramer's V and Tschuprow's T are extensions of the phi
+    coefficient.  Moreover, due to the close relationship between the
+    Cramer's V and Tschuprow's T the returned values can often be similar
+    or even equivalent.  They are likely to diverge more as the array shape
+    diverges from a 2x2.
+
+    References
+    ----------
+    .. [1] "Tschuprow's T",
+           https://en.wikipedia.org/wiki/Tschuprow's_T
+    .. [2] Tschuprow, A. A. (1939)
+           Principles of the Mathematical Theory of Correlation;
+           translated by M. Kantorowitsch. W. Hodge & Co.
+    .. [3] "Cramer's V", https://en.wikipedia.org/wiki/Cramer's_V
+    .. [4] "Nominal Association: Phi and Cramer's V",
+           http://www.people.vcu.edu/~pdattalo/702SuppRead/MeasAssoc/NominalAssoc.html
+    .. [5] Gingrich, Paul, "Association Between Variables",
+           http://uregina.ca/~gingrich/ch11a.pdf
+
+    Examples
+    --------
+    An example with a 4x2 contingency table:
+
+    >>> import numpy as np
+    >>> from scipy.stats.contingency import association
+    >>> obs4x2 = np.array([[100, 150], [203, 322], [420, 700], [320, 210]])
+
+    Pearson's contingency coefficient
+
+    >>> association(obs4x2, method="pearson")
+    0.18303298140595667
+
+    Cramer's V
+
+    >>> association(obs4x2, method="cramer")
+    0.18617813077483678
+
+    Tschuprow's T
+
+    >>> association(obs4x2, method="tschuprow")
+    0.14146478765062995
+    """
+    arr = np.asarray(observed)
+    if not np.issubdtype(arr.dtype, np.integer):
+        raise ValueError("`observed` must be an integer array.")
+
+    if len(arr.shape) != 2:
+        raise ValueError("method only accepts 2d arrays")
+
+    chi2_stat = chi2_contingency(arr, correction=correction,
+                                 lambda_=lambda_)
+    # phi2 is the statistic from `chi2_contingency` divided by the sum of all counts.
+    phi2 = chi2_stat.statistic / arr.sum()
+    n_rows, n_cols = arr.shape
+    if method == "cramer":
+        value = phi2 / min(n_cols - 1, n_rows - 1)
+    elif method == "tschuprow":
+        value = phi2 / math.sqrt((n_rows - 1) * (n_cols - 1))
+    elif method == 'pearson':
+        value = phi2 / (1 + phi2)
+    else:
+        raise ValueError("Invalid argument value: 'method' argument must "
+                         "be 'cramer', 'tschuprow', or 'pearson'")
+    # Each branch above yields the squared measure; the root is taken once here.
+    return math.sqrt(value)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/distributions.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/distributions.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac9c37aa98c9545b2616c8d32e8f676d8d49289e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/distributions.py
@@ -0,0 +1,24 @@
+#
+# Author:  Travis Oliphant  2002-2011 with contributions from
+#          SciPy Developers 2004-2011
+#
+# NOTE: To look at history using `git blame`, use `git blame -M -C -C`
+#       instead of `git blame -Lxxx,+x`.
+#
+from ._distn_infrastructure import (rv_discrete, rv_continuous, rv_frozen)  # noqa: F401
+
+from . import _continuous_distns
+from . import _discrete_distns
+
+from ._continuous_distns import *  # noqa: F403
+from ._levy_stable import levy_stable
+from ._discrete_distns import *  # noqa: F403
+from ._entropy import entropy
+
+# For backwards compatibility e.g. pymc expects distributions.__all__.
+__all__ = ['rv_discrete', 'rv_continuous', 'rv_histogram', 'entropy']  # noqa: F405
+
+# Add only the distribution names, not the *_gen names.
+__all__ += _continuous_distns._distn_names
+__all__ += ['levy_stable']
+__all__ += _discrete_distns._distn_names
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/kde.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/kde.py
new file mode 100644
index 0000000000000000000000000000000000000000..4401da5a30f4452ab394232d3928493d0e3b77ec
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/kde.py
@@ -0,0 +1,18 @@
+# This file is not meant for public use and will be removed in SciPy v2.0.0.
+# Use the `scipy.stats` namespace for importing the functions
+# included below.
+
+from scipy._lib.deprecation import _sub_module_deprecation
+
+
+__all__ = ["gaussian_kde"]  # noqa: F822
+
+
+def __dir__():  # dir() on this module reports only the names listed in __all__
+    return __all__
+
+
+def __getattr__(name):  # module-level hook: lookups are delegated to the shared helper
+    return _sub_module_deprecation(sub_package="stats", module="kde",
+                                   private_modules=["_kde"], all=__all__,
+                                   attribute=name)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/morestats.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/morestats.py
new file mode 100644
index 0000000000000000000000000000000000000000..76040ea0ca5251cabbdda7fb7936af4b281b3212
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/morestats.py
@@ -0,0 +1,27 @@
+# This file is not meant for public use and will be removed in SciPy v2.0.0.
+# Use the `scipy.stats` namespace for importing the functions
+# included below.
+
+from scipy._lib.deprecation import _sub_module_deprecation
+
+
+__all__ = [  # noqa: F822
+    'mvsdist',
+    'bayes_mvs', 'kstat', 'kstatvar', 'probplot', 'ppcc_max', 'ppcc_plot',
+    'boxcox_llf', 'boxcox', 'boxcox_normmax', 'boxcox_normplot',
+    'shapiro', 'anderson', 'ansari', 'bartlett', 'levene',
+    'fligner', 'mood', 'wilcoxon', 'median_test',
+    'circmean', 'circvar', 'circstd', 'anderson_ksamp',
+    'yeojohnson_llf', 'yeojohnson', 'yeojohnson_normmax',
+    'yeojohnson_normplot', 'chi2_contingency', 'distributions',
+]
+
+
+def __dir__():  # dir() on this module reports only the names listed in __all__
+    return __all__
+
+
+def __getattr__(name):  # module-level hook: lookups are delegated to the shared helper
+    return _sub_module_deprecation(sub_package="stats", module="morestats",
+                                   private_modules=["_morestats"], all=__all__,
+                                   attribute=name)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/mstats.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/mstats.py
new file mode 100644
index 0000000000000000000000000000000000000000..88016af71803dc5c4ebadba168f22cdcd8273dbb
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/mstats.py
@@ -0,0 +1,140 @@
+"""
+===================================================================
+Statistical functions for masked arrays (:mod:`scipy.stats.mstats`)
+===================================================================
+
+.. currentmodule:: scipy.stats.mstats
+
+This module contains a large number of statistical functions that can
+be used with masked arrays.
+
+Most of these functions are similar to those in `scipy.stats` but might
+have small differences in the API or in the algorithm used. Since this
+is a relatively new package, some API changes are still possible.
+
+Summary statistics
+==================
+
+.. autosummary::
+   :toctree: generated/
+
+   describe
+   gmean
+   hmean
+   kurtosis
+   mode
+   mquantiles
+   hdmedian
+   hdquantiles
+   hdquantiles_sd
+   idealfourths
+   plotting_positions
+   meppf
+   moment
+   skew
+   tmean
+   tvar
+   tmin
+   tmax
+   tsem
+   variation
+   find_repeats
+   sem
+   trimmed_mean
+   trimmed_mean_ci
+   trimmed_std
+   trimmed_var
+
+Frequency statistics
+====================
+
+.. autosummary::
+   :toctree: generated/
+
+   scoreatpercentile
+
+Correlation functions
+=====================
+
+.. autosummary::
+   :toctree: generated/
+
+   f_oneway
+   pearsonr
+   spearmanr
+   pointbiserialr
+   kendalltau
+   kendalltau_seasonal
+   linregress
+   siegelslopes
+   theilslopes
+   sen_seasonal_slopes
+
+Statistical tests
+=================
+
+.. autosummary::
+   :toctree: generated/
+
+   ttest_1samp
+   ttest_onesamp
+   ttest_ind
+   ttest_rel
+   chisquare
+   kstest
+   ks_2samp
+   ks_1samp
+   ks_twosamp
+   mannwhitneyu
+   rankdata
+   kruskal
+   kruskalwallis
+   friedmanchisquare
+   brunnermunzel
+   skewtest
+   kurtosistest
+   normaltest
+
+Transformations
+===============
+
+.. autosummary::
+   :toctree: generated/
+
+   obrientransform
+   trim
+   trima
+   trimmed_stde
+   trimr
+   trimtail
+   trimboth
+   winsorize
+   zmap
+   zscore
+
+Other
+=====
+
+.. autosummary::
+   :toctree: generated/
+
+   argstoarray
+   count_tied_groups
+   msign
+   compare_medians_ms
+   median_cihs
+   mjci
+   mquantiles_cimj
+   rsh
+
+"""
+from . import _mstats_basic
+from . import _mstats_extras
+from ._mstats_basic import *  # noqa: F403
+from ._mstats_extras import *  # noqa: F403
+# Functions that support masked array input in stats but need to be kept in the
+# mstats namespace for backwards compatibility:
+from scipy.stats import gmean, hmean, zmap, zscore, chisquare
+
+__all__ = _mstats_basic.__all__ + _mstats_extras.__all__
+__all__ += ['gmean', 'hmean', 'zmap', 'zscore', 'chisquare']
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/mstats_basic.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/mstats_basic.py
new file mode 100644
index 0000000000000000000000000000000000000000..19cc67a6acdfa054ffa2b29b6e774dd7aafda263
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/mstats_basic.py
@@ -0,0 +1,42 @@
+# This file is not meant for public use and will be removed in SciPy v2.0.0.
+# Use the `scipy.stats` namespace for importing the functions
+# included below.
+
+from scipy._lib.deprecation import _sub_module_deprecation
+
+
+__all__ = [  # noqa: F822
+    'argstoarray',
+    'count_tied_groups',
+    'describe',
+    'f_oneway', 'find_repeats','friedmanchisquare',
+    'kendalltau','kendalltau_seasonal','kruskal','kruskalwallis',
+    'ks_twosamp', 'ks_2samp', 'kurtosis', 'kurtosistest',
+    'ks_1samp', 'kstest',
+    'linregress',
+    'mannwhitneyu', 'meppf','mode','moment','mquantiles','msign',
+    'normaltest',
+    'obrientransform',
+    'pearsonr','plotting_positions','pointbiserialr',
+    'rankdata',
+    'scoreatpercentile','sem',
+    'sen_seasonal_slopes','skew','skewtest','spearmanr',
+    'siegelslopes', 'theilslopes',
+    'tmax','tmean','tmin','trim','trimboth',
+    'trimtail','trima','trimr','trimmed_mean','trimmed_std',
+    'trimmed_stde','trimmed_var','tsem','ttest_1samp','ttest_onesamp',
+    'ttest_ind','ttest_rel','tvar',
+    'variation',
+    'winsorize',
+    'brunnermunzel',
+]
+
+
+def __dir__():  # dir() on this module reports only the names listed in __all__
+    return __all__
+
+
+def __getattr__(name):  # module-level hook: lookups are delegated to the shared helper
+    return _sub_module_deprecation(sub_package="stats", module="mstats_basic",
+                                   private_modules=["_mstats_basic"], all=__all__,
+                                   attribute=name, correct_module="mstats")
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/mstats_extras.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/mstats_extras.py
new file mode 100644
index 0000000000000000000000000000000000000000..fec695329cf2c2d58a4918cc99e209c0650c3ea6
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/mstats_extras.py
@@ -0,0 +1,25 @@
+# This file is not meant for public use and will be removed in SciPy v2.0.0.
+# Use the `scipy.stats` namespace for importing the functions
+# included below.
+
+from scipy._lib.deprecation import _sub_module_deprecation
+
+
+__all__ = [  # noqa: F822
+    'compare_medians_ms',
+    'hdquantiles', 'hdmedian', 'hdquantiles_sd',
+    'idealfourths',
+    'median_cihs','mjci','mquantiles_cimj',
+    'rsh',
+    'trimmed_mean_ci',
+]
+
+
+def __dir__():  # dir() on this module reports only the names listed in __all__
+    return __all__
+
+
+def __getattr__(name):  # module-level hook: lookups are delegated to the shared helper
+    return _sub_module_deprecation(sub_package="stats", module="mstats_extras",
+                                   private_modules=["_mstats_extras"], all=__all__,
+                                   attribute=name, correct_module="mstats")
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/mvn.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/mvn.py
new file mode 100644
index 0000000000000000000000000000000000000000..65da9e20f6a4e6d24c1cb206c59821730fb6ab83
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/mvn.py
@@ -0,0 +1,17 @@
+# This file is not meant for public use and will be removed in SciPy v2.0.0.
+# Use the `scipy.stats` namespace for importing the functions
+# included below.
+
+from scipy._lib.deprecation import _sub_module_deprecation
+
+__all__: list[str] = []
+
+
+def __dir__():  # `__all__` is an empty list in this module, so dir() reports no names
+    return __all__
+
+
+def __getattr__(name):  # module-level hook: lookups are delegated to the shared helper
+    return _sub_module_deprecation(sub_package="stats", module="mvn",
+                                   private_modules=["_mvn"], all=__all__,
+                                   attribute=name)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/qmc.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/qmc.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8a08343cf4c759938b31c29e32aaa644bf6e0fd
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/qmc.py
@@ -0,0 +1,236 @@
+r"""
+====================================================
+Quasi-Monte Carlo submodule (:mod:`scipy.stats.qmc`)
+====================================================
+
+.. currentmodule:: scipy.stats.qmc
+
+This module provides Quasi-Monte Carlo generators and associated helper
+functions.
+
+
+Quasi-Monte Carlo
+=================
+
+Engines
+-------
+
+.. autosummary::
+   :toctree: generated/
+
+   QMCEngine
+   Sobol
+   Halton
+   LatinHypercube
+   PoissonDisk
+   MultinomialQMC
+   MultivariateNormalQMC
+
+Helpers
+-------
+
+.. autosummary::
+   :toctree: generated/
+
+   discrepancy
+   geometric_discrepancy
+   update_discrepancy
+   scale
+
+
+Introduction to Quasi-Monte Carlo
+=================================
+
+Quasi-Monte Carlo (QMC) methods [1]_, [2]_, [3]_ provide an
+:math:`n \times d` array of numbers in :math:`[0,1]`. They can be used in
+place of :math:`n` points from the :math:`U[0,1]^{d}` distribution. Compared to
+random points, QMC points are designed to have fewer gaps and clumps. This is
+quantified by discrepancy measures [4]_. From the Koksma-Hlawka
+inequality [5]_ we know that low discrepancy reduces a bound on
+integration error. Averaging a function :math:`f` over :math:`n` QMC points
+can achieve an integration error close to :math:`O(n^{-1})` for well
+behaved functions [2]_.
+
+Most QMC constructions are designed for special values of :math:`n`
+such as powers of 2 or large primes. Changing the sample
+size by even one can degrade their performance, even their
+rate of convergence [6]_. For instance :math:`n=100` points may give less
+accuracy than :math:`n=64` if the method was designed for :math:`n=2^m`.
+
+Some QMC constructions are extensible in :math:`n`: we can find
+another special sample size :math:`n' > n` and often an infinite
+sequence of increasing special sample sizes. Some QMC
+constructions are extensible in :math:`d`: we can increase the dimension,
+possibly to some upper bound, and typically without requiring
+special values of :math:`d`. Some QMC methods are extensible in
+both :math:`n` and :math:`d`.
+
+QMC points are deterministic. That makes it hard to estimate the accuracy of
+integrals estimated by averages over QMC points. Randomized QMC (RQMC) [7]_
+points are constructed so that each point is individually :math:`U[0,1]^{d}`
+while collectively the :math:`n` points retain their low discrepancy.
+One can make :math:`R` independent replications of RQMC points to
+see how stable a computation is. From :math:`R` independent values,
+a t-test (or bootstrap t-test [8]_) then gives approximate confidence
+intervals on the mean value. Some RQMC methods produce a
+root mean squared error that is actually :math:`o(1/n)` and smaller than
+the rate seen in unrandomized QMC. An intuitive explanation is
+that the error is a sum of many small ones and random errors
+cancel in a way that deterministic ones do not. RQMC also
+has advantages on integrands that are singular or, for other
+reasons, fail to be Riemann integrable.
+
+(R)QMC cannot beat Bakhvalov's curse of dimension (see [9]_). For
+any random or deterministic method, there are worst case functions
+that will give it poor performance in high dimensions. A worst
+case function for QMC might be 0 at all n points but very
+large elsewhere. Worst case analyses get very pessimistic
+in high dimensions. (R)QMC can bring a great improvement over
+MC when the functions on which it is used are not worst case.
+For instance (R)QMC can be especially effective on integrands
+that are well approximated by sums of functions of
+some small number of their input variables at a time [10]_, [11]_.
+That property is often a surprising finding about those functions.
+
+Also, to see an improvement over IID MC, (R)QMC requires a bit of smoothness of
+the integrand, roughly the mixed first order derivative in each direction,
+:math:`\partial^d f/\partial x_1 \cdots \partial x_d`, must be integrable.
+For instance, a function that is 1 inside the hypersphere and 0 outside of it
+has infinite variation in the sense of Hardy and Krause for any dimension
+:math:`d \geq 2`.
+
+Scrambled nets are a kind of RQMC that have some valuable robustness
+properties [12]_. If the integrand is square integrable, they give variance
+:math:`var_{SNET} = o(1/n)`. There is a finite upper bound on
+:math:`var_{SNET} / var_{MC}` that holds simultaneously for every square
+integrable integrand. Scrambled nets satisfy a strong law of large numbers
+for :math:`f` in :math:`L^p` when :math:`p>1`. In some
+special cases there is a central limit theorem [13]_. For smooth enough
+integrands they can achieve RMSE nearly :math:`O(n^{-3/2})`. See [12]_
+for references about these properties.
+
+The main kinds of QMC methods are lattice rules [14]_ and digital
+nets and sequences [2]_, [15]_. The theories meet up in polynomial
+lattice rules [16]_ which can produce digital nets. Lattice rules
+require some form of search for good constructions. For digital
+nets there are widely used default constructions.
+
+The most widely used QMC methods are Sobol' sequences [17]_.
+These are digital nets. They are extensible in both :math:`n` and :math:`d`.
+They can be scrambled. The special sample sizes are powers
+of 2. Halton sequences [18]_ are another popular method.
+The constructions resemble those of digital nets. The earlier
+dimensions have much better equidistribution properties than
+later ones. There are essentially no special sample sizes.
+They are not thought to be as accurate as Sobol' sequences.
+They can be scrambled. The nets of Faure [19]_ are also widely
+used. All dimensions are equally good, but the special sample
+sizes grow rapidly with dimension :math:`d`. They can be scrambled.
+The nets of Niederreiter and Xing [20]_ have the best asymptotic
+properties but have not shown good empirical performance [21]_.
+
+Higher order digital nets are formed by a digit interleaving process
+in the digits of the constructed points. They can achieve higher
+levels of asymptotic accuracy given higher smoothness conditions on :math:`f`
+and they can be scrambled [22]_. There is little or no empirical work
+showing the improved rate to be attained.
+
+Using QMC is like using the entire period of a small random
+number generator. The constructions are similar and so
+therefore are the computational costs [23]_.
+
+(R)QMC is sometimes improved by passing the points through
+a baker's transformation (tent function) prior to using them.
+That function has the form :math:`1-2|x-1/2|`. As :math:`x` goes from 0 to
+1, this function goes from 0 to 1 and then back. It is very
+useful to produce a periodic function for lattice rules [14]_,
+and sometimes it improves the convergence rate [24]_.
+
+It is not straightforward to apply QMC methods to Markov
+chain Monte Carlo (MCMC).  We can think of MCMC as using
+:math:`n=1` point in :math:`[0,1]^{d}` for very large :math:`d`, with
+ergodic results corresponding to :math:`d \to \infty`. One proposal is
+in [25]_ and under strong conditions an improved rate of convergence
+has been shown [26]_.
+
+Returning to Sobol' points: there are many versions depending
+on what are called direction numbers. Those are the result of
+searches and are tabulated. A very widely used set of direction
+numbers come from [27]_. It is extensible in dimension up to
+:math:`d=21201`.
+
+References
+----------
+.. [1] Owen, Art B. "Monte Carlo Book: the Quasi-Monte Carlo parts." 2019.
+.. [2] Niederreiter, Harald. "Random number generation and quasi-Monte Carlo
+   methods." Society for Industrial and Applied Mathematics, 1992.
+.. [3] Dick, Josef, Frances Y. Kuo, and Ian H. Sloan. "High-dimensional
+   integration: the quasi-Monte Carlo way." Acta Numerica no. 22: 133, 2013.
+.. [4] Aho, A. V., C. Aistleitner, T. Anderson, K. Appel, V. Arnol'd, N.
+   Aronszajn, D. Asotsky et al. "W. Chen et al.(eds.), "A Panorama of
+   Discrepancy Theory", Springer International Publishing,
+   Switzerland: 679, 2014.
+.. [5] Hickernell, Fred J. "Koksma-Hlawka Inequality." Wiley StatsRef:
+   Statistics Reference Online, 2014.
+.. [6] Owen, Art B. "On dropping the first Sobol' point." :arxiv:`2008.08051`,
+   2020.
+.. [7] L'Ecuyer, Pierre, and Christiane Lemieux. "Recent advances in randomized
+   quasi-Monte Carlo methods." In Modeling uncertainty, pp. 419-474. Springer,
+   New York, NY, 2002.
+.. [8] DiCiccio, Thomas J., and Bradley Efron. "Bootstrap confidence
+   intervals." Statistical science: 189-212, 1996.
+.. [9] Dimov, Ivan T. "Monte Carlo methods for applied scientists." World
+   Scientific, 2008.
+.. [10] Caflisch, Russel E., William J. Morokoff, and Art B. Owen. "Valuation
+   of mortgage backed securities using Brownian bridges to reduce effective
+   dimension." Journal of Computational Finance: no. 1 27-46, 1997.
+.. [11] Sloan, Ian H., and Henryk Wozniakowski. "When are quasi-Monte Carlo
+   algorithms efficient for high dimensional integrals?." Journal of Complexity
+   14, no. 1 (1998): 1-33.
+.. [12] Owen, Art B., and Daniel Rudolf, "A strong law of large numbers for
+   scrambled net integration." SIAM Review, to appear.
+.. [13] Loh, Wei-Liem. "On the asymptotic distribution of scrambled net
+   quadrature." The Annals of Statistics 31, no. 4: 1282-1324, 2003.
+.. [14] Sloan, Ian H. and S. Joe. "Lattice methods for multiple integration."
+   Oxford University Press, 1994.
+.. [15] Dick, Josef, and Friedrich Pillichshammer. "Digital nets and sequences:
+   discrepancy theory and quasi-Monte Carlo integration." Cambridge University
+   Press, 2010.
+.. [16] Dick, Josef, F. Kuo, Friedrich Pillichshammer, and I. Sloan.
+   "Construction algorithms for polynomial lattice rules for multivariate
+   integration." Mathematics of computation 74, no. 252: 1895-1921, 2005.
+.. [17] Sobol', Il'ya Meerovich. "On the distribution of points in a cube and
+   the approximate evaluation of integrals." Zhurnal Vychislitel'noi Matematiki
+   i Matematicheskoi Fiziki 7, no. 4: 784-802, 1967.
+.. [18] Halton, John H. "On the efficiency of certain quasi-random sequences of
+   points in evaluating multi-dimensional integrals." Numerische Mathematik 2,
+   no. 1: 84-90, 1960.
+.. [19] Faure, Henri. "Discrepance de suites associees a un systeme de
+   numeration (en dimension s)." Acta arithmetica 41, no. 4: 337-351, 1982.
+.. [20] Niederreiter, Harold, and Chaoping Xing. "Low-discrepancy sequences and
+   global function fields with many rational places." Finite Fields and their
+   applications 2, no. 3: 241-273, 1996.
+.. [21] Hong, Hee Sun, and Fred J. Hickernell. "Algorithm 823: Implementing
+   scrambled digital sequences." ACM Transactions on Mathematical Software
+   (TOMS) 29, no. 2: 95-109, 2003.
+.. [22] Dick, Josef. "Higher order scrambled digital nets achieve the optimal
+   rate of the root mean square error for smooth integrands." The Annals of
+   Statistics 39, no. 3: 1372-1398, 2011.
+.. [23] Niederreiter, Harald. "Multidimensional numerical integration using
+   pseudorandom numbers." In Stochastic Programming 84 Part I, pp. 17-38.
+   Springer, Berlin, Heidelberg, 1986.
+.. [24] Hickernell, Fred J. "Obtaining O (N-2+e) Convergence for Lattice
+   Quadrature Rules." In Monte Carlo and Quasi-Monte Carlo Methods 2000,
+   pp. 274-289. Springer, Berlin, Heidelberg, 2002.
+.. [25] Owen, Art B., and Seth D. Tribble. "A quasi-Monte Carlo Metropolis
+   algorithm." Proceedings of the National Academy of Sciences 102,
+   no. 25: 8844-8849, 2005.
+.. [26] Chen, Su. "Consistency and convergence rate of Markov chain quasi Monte
+   Carlo with examples." PhD diss., Stanford University, 2011.
+.. [27] Joe, Stephen, and Frances Y. Kuo. "Constructing Sobol sequences with
+   better two-dimensional projections." SIAM Journal on Scientific Computing
+   30, no. 5: 2635-2654, 2008.
+
+"""
+from ._qmc import *  # noqa: F403
+from ._qmc import __all__  # noqa: F401
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/sampling.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/sampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..12174d9dfb3cb93fa33811ed4b5d233817512e36
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/sampling.py
@@ -0,0 +1,73 @@
+"""
+======================================================
+Random Number Generators (:mod:`scipy.stats.sampling`)
+======================================================
+
+.. currentmodule:: scipy.stats.sampling
+
+This module contains a collection of random number generators to sample
+from univariate continuous and discrete distributions. It uses the
+implementation of a C library called "UNU.RAN". The only exception is
+RatioUniforms, which is a pure Python implementation of the
+Ratio-of-Uniforms method.
+
+Generators Wrapped
+==================
+
+For continuous distributions
+----------------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   NumericalInverseHermite
+   NumericalInversePolynomial
+   TransformedDensityRejection
+   SimpleRatioUniforms
+   RatioUniforms
+
+For discrete distributions
+--------------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   DiscreteAliasUrn
+   DiscreteGuideTable
+
+Warnings / Errors used in :mod:`scipy.stats.sampling`
+-----------------------------------------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   UNURANError
+
+
+Generators for pre-defined distributions
+========================================
+
+To easily apply the above methods for some of the continuous distributions
+in :mod:`scipy.stats`, the following functionality can be used:
+
+.. autosummary::
+   :toctree: generated/
+
+   FastGeneratorInversion
+
+"""
+from ._sampling import FastGeneratorInversion, RatioUniforms  # noqa: F401
+from ._unuran.unuran_wrapper import (  # noqa: F401
+   TransformedDensityRejection,
+   DiscreteAliasUrn,
+   DiscreteGuideTable,
+   NumericalInversePolynomial,
+   NumericalInverseHermite,
+   SimpleRatioUniforms,
+   UNURANError
+)
+
+__all__ = ["NumericalInverseHermite", "NumericalInversePolynomial",
+           "TransformedDensityRejection", "SimpleRatioUniforms",
+           "RatioUniforms", "DiscreteAliasUrn", "DiscreteGuideTable",
+           "UNURANError", "FastGeneratorInversion"]
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/stats.py b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/stats.py
new file mode 100644
index 0000000000000000000000000000000000000000..6879c9c07cb0708b645731c6a397c244c79dfecd
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/scipy/stats/stats.py
@@ -0,0 +1,41 @@
+# This file is not meant for public use and will be removed in SciPy v2.0.0.
+# Use the `scipy.stats` namespace for importing the functions
+# included below.
+
+from scipy._lib.deprecation import _sub_module_deprecation
+
+
+__all__ = [  # noqa: F822
+    'gmean', 'hmean', 'pmean', 'mode', 'tmean', 'tvar',
+    'tmin', 'tmax', 'tstd', 'tsem', 'moment',
+    'skew', 'kurtosis', 'describe', 'skewtest', 'kurtosistest',
+    'normaltest', 'jarque_bera',
+    'scoreatpercentile', 'percentileofscore',
+    'cumfreq', 'relfreq', 'obrientransform',
+    'sem', 'zmap', 'zscore', 'gzscore', 'iqr', 'gstd',
+    'median_abs_deviation',
+    'sigmaclip', 'trimboth', 'trim1', 'trim_mean',
+    'f_oneway',
+    'pearsonr', 'fisher_exact',
+    'spearmanr', 'pointbiserialr',
+    'kendalltau', 'weightedtau', 'multiscale_graphcorr',
+    'linregress', 'siegelslopes', 'theilslopes', 'ttest_1samp',
+    'ttest_ind', 'ttest_ind_from_stats', 'ttest_rel',
+    'kstest', 'ks_1samp', 'ks_2samp',
+    'chisquare', 'power_divergence',
+    'tiecorrect', 'ranksums', 'kruskal', 'friedmanchisquare',
+    'rankdata',
+    'combine_pvalues', 'wasserstein_distance', 'energy_distance',
+    'brunnermunzel', 'alexandergovern', 'distributions',
+    'mstats_basic',
+]  # none of these are defined in this file; lookups go through __getattr__
+
+
+def __dir__():  # PEP 562 hook: dir(module) reports only the names in __all__
+    return __all__
+
+
+def __getattr__(name):  # PEP 562 hook: runs when `name` is not found in this module
+    return _sub_module_deprecation(sub_package="stats", module="stats",
+                                   private_modules=["_stats_py", "_mgc"], all=__all__,
+                                   attribute=name)  # TODO confirm: warns, then forwards
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c08783910964d60db737e6eb005246334f8e6bf3
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/__pycache__/_internally_replaced_utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/__pycache__/_internally_replaced_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..81436c726dbfd57545dd1d9c3a5906e9ebf5cfb7
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/__pycache__/_internally_replaced_utils.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/__pycache__/_meta_registrations.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/__pycache__/_meta_registrations.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..228d386c3b75b0ebadf900f9fe7704f2fb4bcdf1
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/__pycache__/_meta_registrations.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/__pycache__/_utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/__pycache__/_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fc979b71c44c90bdc28bf932a45affe018d0af58
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/__pycache__/_utils.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/__pycache__/extension.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/__pycache__/extension.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..96c44c729dc62ef69b27e52c5453771e6c422b77
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/__pycache__/extension.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/__pycache__/utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..58630618ceed6e7d5b2c9d79b438da0a1467f9eb
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/__pycache__/utils.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/__pycache__/version.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/__pycache__/version.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..229b479f4ee77bff8d2f8eda074a9d7b4c040256
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/__pycache__/version.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66b87f954ad45bd2339c9a85be4ef8ee37e08dba
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/_optical_flow.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/_optical_flow.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e2fdcb104e975c307954003906ae0fa98395731f
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/_optical_flow.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/_stereo_matching.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/_stereo_matching.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c9b976ff64f72b7d773a58a9f70e7f2dca72e616
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/_stereo_matching.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/caltech.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/caltech.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1039d282537d598d65d97eeb4feef64a6f670234
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/caltech.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/celeba.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/celeba.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6f405e1ba2217d1ce0e80c99e0b83b5d877ab32f
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/celeba.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/cifar.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/cifar.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9527b55ae2384e45997cad11e2f14257fd9d815b
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/cifar.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/cityscapes.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/cityscapes.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1144c5fec1801702722431427d6946dd4905784e
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/cityscapes.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/clevr.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/clevr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cbed144d4f14d1bbd4f97443754ab218ec34d9ff
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/clevr.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/coco.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/coco.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b298d9b618dcfa1f6188cd7802480b6138a8829f
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/coco.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/country211.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/country211.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f6ed9a5c05dea566b70b519da3f12efc0021a487
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/country211.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/dtd.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/dtd.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..460faadd840b3f19cb288966360e8f8bb27bd5f0
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/dtd.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/eurosat.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/eurosat.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f22930c060cd275ecac880e5ed36a2b24e2a059
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/eurosat.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/fakedata.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/fakedata.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5832f3870648985315ebbc53f94af50d234bd880
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/fakedata.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/fer2013.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/fer2013.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b2dc64a2d16e4550599fb76c62d0b40899451cab
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/fer2013.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/fgvc_aircraft.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/fgvc_aircraft.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0100acf99baaec3fd5cb16c35d9147ffd4b8ded4
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/fgvc_aircraft.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/flickr.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/flickr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..042470d63de1db2fed98382e0a69d2f8e38a0592
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/flickr.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/flowers102.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/flowers102.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d14f444d3438aa78a48287c9e1f637326381d24f
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/flowers102.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/folder.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/folder.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c8e40e91a0e47d0069d04a7d5aca01b50fc87a46
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/folder.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/food101.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/food101.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..527b3483e429e05a690aad0cc237023ff7a66066
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/food101.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/gtsrb.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/gtsrb.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..50a2d4881775e86a37ff7acf7423289cb5c6db98
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/gtsrb.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/hmdb51.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/hmdb51.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..165d0467688a7761e1befb9f1e71fd0420863c46
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/hmdb51.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/imagenet.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/imagenet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2eb278d97c032513423490af83da5d349c51ddfd
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/imagenet.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/imagenette.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/imagenette.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2f0145d835e45ea0d81c1bf7c9d0813651f3220b
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/imagenette.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/inaturalist.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/inaturalist.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..40b61bd68d7e8323ece4a8b8f651e1e6cc2f6b47
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/inaturalist.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/kinetics.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/kinetics.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..44185757e65ab9bede73efac725d889b67b97338
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/kinetics.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/kitti.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/kitti.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4efc4d62e61269501f8fedba1359f114d3cca5eb
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/kitti.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/lfw.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/lfw.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..33d4305210d44097a1f7dbf725a66d75b281ee2f
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/lfw.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/lsun.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/lsun.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bdba5642f4b604d46c4c7f8193e6190e5b103be2
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/lsun.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/mnist.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/mnist.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb0a9ff20cdaf1d7202bd494fb9e63ad09cfabe6
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/mnist.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/moving_mnist.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/moving_mnist.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..728beed78afc248c1c4b212965b0166dedc02160
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/moving_mnist.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/omniglot.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/omniglot.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5566866281be62d1a74295c2697adb097163693a
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/omniglot.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/oxford_iiit_pet.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/oxford_iiit_pet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c0ea59fe6f8587f82b5eada002d7415280439523
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/oxford_iiit_pet.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/pcam.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/pcam.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9486fc23d490588a140cdb46ef61084dc5119561
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/pcam.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/phototour.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/phototour.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d18d665c1f8abd1b181adb2d22520a0a41ef8ca2
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/phototour.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/places365.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/places365.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0625e0ccce2fcebfdab5846a6b3a2f9cf73dbaee
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/places365.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/rendered_sst2.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/rendered_sst2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..63024be4d887ab05b39883598758408973172cab
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/rendered_sst2.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/sbd.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/sbd.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1144a6d2268035f71480850ff3d5690e98b09b24
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/sbd.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/sbu.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/sbu.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a610abb78777215799e0bc684f63553494685169
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/sbu.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/semeion.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/semeion.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa530ffe2e3b06c9d82bfc3a88c6c1ab89f3caa9
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/semeion.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/stanford_cars.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/stanford_cars.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c07cfea6c035eb24f28fe2d96b888155134b0446
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/stanford_cars.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/stl10.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/stl10.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d273296fcdf06f6a78e125165219be5f5b71f075
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/stl10.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/sun397.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/sun397.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..00aa7f18a2214e22a4d3876357b34023998c3a55
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/sun397.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/svhn.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/svhn.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..184ed9744960e3e5be0cdf6a1bc542ba23b498db
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/svhn.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/ucf101.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/ucf101.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..528724799774c66bb7255b4c0248ebff50e5aa10
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/ucf101.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/usps.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/usps.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1923feb61520329f5d8dc348672df7797a484f30
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/usps.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..acf44380b7e92c53f88322259d20547a49b91422
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/utils.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/video_utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/video_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..788842a78b4c175be412fc4d1e0d81d9553b49fe
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/video_utils.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/vision.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/vision.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b0879b53af136bdfdc0f948db0d79bff686a9fd2
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/vision.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/voc.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/voc.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab4db5b050d08d3efeac52a37894cdd2b5f7f3db
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/voc.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/widerface.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/widerface.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5f82c9f547c045d89ff94763149cbf6a06916a40
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/__pycache__/widerface.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/samplers/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/samplers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..58b2d2abd936d885221174d194a633a8e413935f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/samplers/__init__.py
@@ -0,0 +1,3 @@
+from .clip_sampler import DistributedSampler, RandomClipSampler, UniformClipSampler
+
+__all__ = ("DistributedSampler", "UniformClipSampler", "RandomClipSampler")  # re-exported from .clip_sampler
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/samplers/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/samplers/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b35f8d4ada7de35aa083bb27c77b3a7c10eb478
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/samplers/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/samplers/__pycache__/clip_sampler.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/samplers/__pycache__/clip_sampler.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a9ed9e93c70a3af56f9eaceb6675fb641560bc6b
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/samplers/__pycache__/clip_sampler.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/samplers/clip_sampler.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/samplers/clip_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..570bc85eee906686a63f114eff6db08480737a8a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/datasets/samplers/clip_sampler.py
@@ -0,0 +1,173 @@
+import math
+from collections.abc import Iterator, Sized
+from typing import cast, Optional, Union
+
+import torch
+import torch.distributed as dist
+from torch.utils.data import Sampler
+from torchvision.datasets.video_utils import VideoClips
+
+
+class DistributedSampler(Sampler):
+    """
+    Extension of DistributedSampler, as discussed in
+    https://github.com/pytorch/pytorch/issues/23430
+    Indices are dealt to the ranks in runs of ``group_size``; the tail is padded by wrapping around to the start.
+    Example:
+        dataset: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
+        num_replicas: 4
+        shuffle: False
+
+    when group_size = 1
+            RANK    |  shard_dataset
+            =========================
+            rank_0  |  [0, 4, 8, 12]
+            rank_1  |  [1, 5, 9, 13]
+            rank_2  |  [2, 6, 10, 0]
+            rank_3  |  [3, 7, 11, 1]
+
+    when group_size = 2
+    (every rank receives whole groups of two consecutive indices)
+            RANK    |  shard_dataset
+            =========================
+            rank_0  |  [0, 1, 8, 9]
+            rank_1  |  [2, 3, 10, 11]
+            rank_2  |  [4, 5, 12, 13]
+            rank_3  |  [6, 7, 0, 1]
+    ``num_replicas``/``rank`` default to the torch.distributed world size/rank; ``set_epoch`` reseeds the shuffle.
+    """
+
+    def __init__(
+        self,
+        dataset: Sized,
+        num_replicas: Optional[int] = None,
+        rank: Optional[int] = None,
+        shuffle: bool = False,
+        group_size: int = 1,
+    ) -> None:
+        if num_replicas is None:
+            if not dist.is_available():
+                raise RuntimeError("Requires distributed package to be available")
+            num_replicas = dist.get_world_size()
+        if rank is None:
+            if not dist.is_available():
+                raise RuntimeError("Requires distributed package to be available")
+            rank = dist.get_rank()
+        if len(dataset) % group_size != 0:
+            raise ValueError(
+                f"dataset length must be a multiplier of group size dataset length: {len(dataset)}, group size: {group_size}"
+            )
+        self.dataset = dataset
+        self.group_size = group_size
+        self.num_replicas = num_replicas
+        self.rank = rank
+        self.epoch = 0
+        dataset_group_length = len(dataset) // group_size
+        self.num_group_samples = int(math.ceil(dataset_group_length * 1.0 / self.num_replicas))
+        self.num_samples = self.num_group_samples * group_size
+        self.total_size = self.num_samples * self.num_replicas
+        self.shuffle = shuffle
+
+    def __iter__(self) -> Iterator[int]:
+        # deterministically shuffle based on epoch
+        g = torch.Generator()
+        g.manual_seed(self.epoch)
+        indices: Union[torch.Tensor, list[int]]
+        if self.shuffle:
+            indices = torch.randperm(len(self.dataset), generator=g).tolist()
+        else:
+            indices = list(range(len(self.dataset)))
+
+        # pad to total_size by cycling through the indices; a single slice falls short when padding > len(indices)
+        indices = [indices[i % len(indices)] for i in range(self.total_size)] if indices else indices
+        assert len(indices) == self.total_size
+
+        total_group_size = self.total_size // self.group_size
+        indices = torch.reshape(torch.LongTensor(indices), (total_group_size, self.group_size))
+
+        # subsample
+        indices = indices[self.rank : total_group_size : self.num_replicas, :]
+        indices = torch.reshape(indices, (-1,)).tolist()
+        assert len(indices) == self.num_samples
+
+        if isinstance(self.dataset, Sampler):
+            orig_indices = list(iter(self.dataset))
+            indices = [orig_indices[i] for i in indices]
+
+        return iter(indices)
+
+    def __len__(self) -> int:
+        return self.num_samples
+
+    def set_epoch(self, epoch: int) -> None:
+        self.epoch = epoch
+
+
+class UniformClipSampler(Sampler):
+    """
+    Sample `num_clips_per_video` clips for each video, equally spaced.
+    When number of unique clips in the video is fewer than `num_clips_per_video`,
+    repeat the clips until `num_clips_per_video` clips are collected
+
+    Args:
+        video_clips (VideoClips): video clips to sample from
+        num_clips_per_video (int): number of clips to be sampled per video
+    """
+
+    def __init__(self, video_clips: VideoClips, num_clips_per_video: int) -> None:
+        if not isinstance(video_clips, VideoClips):
+            raise TypeError(f"Expected video_clips to be an instance of VideoClips, got {type(video_clips)}")
+        self.video_clips = video_clips
+        self.num_clips_per_video = num_clips_per_video
+
+    def __iter__(self) -> Iterator[int]:
+        idxs = []
+        s = 0
+        # select num_clips_per_video for each video, uniformly spaced
+        for c in self.video_clips.clips:
+            length = len(c)
+            if length == 0:
+                # corner case where video decoding fails
+                continue
+            sampled = torch.linspace(s, s + length - 1, steps=self.num_clips_per_video).floor().to(torch.int64)
+            s += length
+            idxs.append(sampled)
+        # torch.cat rejects an empty list: yield nothing when no video has clips, matching __len__() == 0
+        return iter(cast(list[int], torch.cat(idxs).tolist()) if idxs else [])
+
+    def __len__(self) -> int:
+        return sum(self.num_clips_per_video for c in self.video_clips.clips if len(c) > 0)
+
+
+class RandomClipSampler(Sampler):
+    """
+    Samples at most `max_clips_per_video` clips for each video randomly
+
+    Args:
+        video_clips (VideoClips): video clips to sample from
+        max_clips_per_video (int): maximum number of clips to be sampled per video
+    """
+
+    def __init__(self, video_clips: VideoClips, max_clips_per_video: int) -> None:
+        if not isinstance(video_clips, VideoClips):
+            raise TypeError(f"Expected video_clips to be an instance of VideoClips, got {type(video_clips)}")
+        self.video_clips = video_clips
+        self.max_clips_per_video = max_clips_per_video
+
+    def __iter__(self) -> Iterator[int]:
+        idxs = []
+        s = 0
+        # select at most max_clips_per_video for each video, randomly
+        for c in self.video_clips.clips:
+            length = len(c)
+            size = min(length, self.max_clips_per_video)
+            sampled = torch.randperm(length)[:size] + s
+            s += length
+            idxs.append(sampled)
+        idxs_ = torch.cat(idxs) if idxs else torch.empty(0, dtype=torch.int64)  # cat rejects an empty list
+        # shuffle all clips randomly
+        perm = torch.randperm(len(idxs_))
+        return iter(idxs_[perm].tolist())
+
+    def __len__(self) -> int:
+        return sum(min(len(c), self.max_clips_per_video) for c in self.video_clips.clips)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03bd5d23cb2cf8e3acb67b7567e3ad9ef8061874
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__init__.py
@@ -0,0 +1,73 @@
+try:
+    from ._load_gpu_decoder import _HAS_GPU_VIDEO_DECODER
+except ModuleNotFoundError:
+    _HAS_GPU_VIDEO_DECODER = False
+
+from ._video_opt import (
+    _HAS_CPU_VIDEO_DECODER,
+    _HAS_VIDEO_OPT,
+    _probe_video_from_file,
+    _probe_video_from_memory,
+    _read_video_from_file,
+    _read_video_from_memory,
+    _read_video_timestamps_from_file,
+    _read_video_timestamps_from_memory,
+    Timebase,
+    VideoMetaData,
+)
+from .image import (
+    decode_avif,
+    decode_gif,
+    decode_heic,
+    decode_image,
+    decode_jpeg,
+    decode_png,
+    decode_webp,
+    encode_jpeg,
+    encode_png,
+    ImageReadMode,
+    read_file,
+    read_image,
+    write_file,
+    write_jpeg,
+    write_png,
+)
+from .video import read_video, read_video_timestamps, write_video
+from .video_reader import VideoReader
+
+
+__all__ = [
+    "write_video",
+    "read_video",
+    "read_video_timestamps",
+    "_read_video_from_file",
+    "_read_video_timestamps_from_file",
+    "_probe_video_from_file",
+    "_read_video_from_memory",
+    "_read_video_timestamps_from_memory",
+    "_probe_video_from_memory",
+    "_HAS_CPU_VIDEO_DECODER",
+    "_HAS_VIDEO_OPT",
+    "_HAS_GPU_VIDEO_DECODER",
+    # "_read_video_clip_from_memory" and "_read_video_meta_data" used to be listed here, but neither name is
+    # imported above, so `from torchvision.io import *` raised AttributeError; same for "Video" further down.
+    "VideoMetaData",
+    "Timebase",
+    "ImageReadMode",
+    "decode_image",
+    "decode_jpeg",
+    "decode_png",
+    "decode_avif",
+    "decode_heic",
+    "decode_webp",
+    "decode_gif",
+    "encode_jpeg",
+    "encode_png",
+    "read_file",
+    "read_image",
+    "write_file",
+    "write_jpeg",
+    "write_png",
+    # ("Video" removed: no such name is imported into this package)
+    "VideoReader",
+]
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..742b2c24955cee29ff11063020a94591f5b1a8f5
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__pycache__/_load_gpu_decoder.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__pycache__/_load_gpu_decoder.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2bcfec387b5931e4a72822e35dbc4917e070bd93
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__pycache__/_load_gpu_decoder.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__pycache__/_video_deprecation_warning.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__pycache__/_video_deprecation_warning.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4a5c5053312c64b797cbc1aedb96b3f3dfd29671
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__pycache__/_video_deprecation_warning.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__pycache__/_video_opt.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__pycache__/_video_opt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a9126fa495db81645aaa1513fb6501c5557cc82f
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__pycache__/_video_opt.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__pycache__/image.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__pycache__/image.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..121b46e8ca3ac7b35a738ff25f98ff99c347037b
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__pycache__/image.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__pycache__/video.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__pycache__/video.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a9268ffa9c97e31bf8217bffa7c20c8d62e24cc0
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__pycache__/video.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__pycache__/video_reader.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__pycache__/video_reader.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8f52e3e1fa5a582a1b46bb6414f0704db375589b
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/__pycache__/video_reader.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/_load_gpu_decoder.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/_load_gpu_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfd40c545d8201b67290e27bf74ce115774dace1
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/_load_gpu_decoder.py
@@ -0,0 +1,8 @@
+from ..extension import _load_library
+
+
+try:
+    _load_library("gpu_decoder")
+    _HAS_GPU_VIDEO_DECODER = True  # reached only when _load_library did not raise
+except (ImportError, OSError):
+    _HAS_GPU_VIDEO_DECODER = False  # loading failed; record the decoder as unavailable instead of raising
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/_video_deprecation_warning.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/_video_deprecation_warning.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e18dc0916d9012a1dc7c5968a4f75c41c0fbd31
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/_video_deprecation_warning.py
@@ -0,0 +1,16 @@
+import warnings
+
+import torch
+
+
+def _raise_video_deprecation_warning():
+    """Emit a UserWarning about the video API deprecation, except while TorchScript is scripting or tracing."""
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        warnings.warn(
+            "The video decoding and encoding capabilities of torchvision "
+            "are deprecated from version 0.22 and will be removed in version 0.24. "
+            "We recommend that you migrate to TorchCodec, where we'll consolidate "
+            "the future decoding/encoding capabilities of PyTorch: "
+            "https://github.com/pytorch/torchcodec",
+            UserWarning,
+        )
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/_video_opt.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/_video_opt.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dbf035886fc4465f3c8c634100d572d6c9f019d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/_video_opt.py
@@ -0,0 +1,521 @@
+import math
+import warnings
+from fractions import Fraction
+from typing import Optional, Union
+
+import torch
+
+from ..extension import _load_library
+from ._video_deprecation_warning import _raise_video_deprecation_warning
+
+
+try:
+    _load_library("video_reader")
+    _HAS_CPU_VIDEO_DECODER = True  # reached only when _load_library did not raise
+except (ImportError, OSError):
+    _HAS_CPU_VIDEO_DECODER = False  # extension could not be loaded; record it as unavailable
+
+_HAS_VIDEO_OPT = _HAS_CPU_VIDEO_DECODER  # For BC
+default_timebase = Fraction(0, 1)  # zero-valued default used when no stream timebase is supplied
+
+
+# Plain numerator/denominator pair used in place of fractions.Fraction for torch scripting:
+# the complex Fraction class from the fractions module is not scriptable.
+class Timebase:
+    __annotations__ = {"numerator": int, "denominator": int}
+    __slots__ = ["numerator", "denominator"]
+
+    def __init__(
+        self,
+        numerator: int,
+        denominator: int,
+    ) -> None:
+        self.numerator = numerator
+        self.denominator = denominator
+
+
+class VideoMetaData:
+    __annotations__ = {  # explicit attribute types for the fields listed in __slots__
+        "has_video": bool,
+        "video_timebase": Timebase,
+        "video_duration": float,
+        "video_fps": float,
+        "has_audio": bool,
+        "audio_timebase": Timebase,
+        "audio_duration": float,
+        "audio_sample_rate": float,
+    }
+    __slots__ = [  # instances may only carry these eight attributes
+        "has_video",
+        "video_timebase",
+        "video_duration",
+        "video_fps",
+        "has_audio",
+        "audio_timebase",
+        "audio_duration",
+        "audio_sample_rate",
+    ]
+
+    def __init__(self) -> None:
+        self.has_video = False  # defaults describe a file with neither stream; _fill_info fills in real values
+        self.video_timebase = Timebase(0, 1)
+        self.video_duration = 0.0
+        self.video_fps = 0.0
+        self.has_audio = False
+        self.audio_timebase = Timebase(0, 1)
+        self.audio_duration = 0.0
+        self.audio_sample_rate = 0.0
+
+
+def _validate_pts(pts_range: tuple[int, int]) -> None:
+    """Raise ValueError when the start pts exceeds a positive end pts (an end pts <= 0, e.g. -1, is not checked)."""
+    if pts_range[0] > pts_range[1] > 0:
+        raise ValueError(
+            f"Start pts should not be larger than end pts, got start pts: {pts_range[0]} and end pts: {pts_range[1]}"
+        )
+
+
+def _fill_info(
+    vtimebase: torch.Tensor,
+    vfps: torch.Tensor,
+    vduration: torch.Tensor,
+    atimebase: torch.Tensor,
+    asample_rate: torch.Tensor,
+    aduration: torch.Tensor,
+) -> VideoMetaData:
+    """
+    Build a VideoMetaData from the given tensors; a field whose source tensor is empty keeps its default
+    """
+    meta = VideoMetaData()
+    if vtimebase.numel() > 0:
+        meta.video_timebase = Timebase(int(vtimebase[0].item()), int(vtimebase[1].item()))
+        timebase = vtimebase[0].item() / float(vtimebase[1].item())  # numerator / denominator as a float
+        if vduration.numel() > 0:
+            meta.has_video = True  # set only when both timebase and duration are non-empty
+            meta.video_duration = float(vduration.item()) * timebase
+    if vfps.numel() > 0:
+        meta.video_fps = float(vfps.item())
+    if atimebase.numel() > 0:
+        meta.audio_timebase = Timebase(int(atimebase[0].item()), int(atimebase[1].item()))
+        timebase = atimebase[0].item() / float(atimebase[1].item())  # numerator / denominator as a float
+        if aduration.numel() > 0:
+            meta.has_audio = True  # set only when both timebase and duration are non-empty
+            meta.audio_duration = float(aduration.item()) * timebase
+    if asample_rate.numel() > 0:
+        meta.audio_sample_rate = float(asample_rate.item())
+
+    return meta
+
+
+def _align_audio_frames(
+    aframes: torch.Tensor, aframe_pts: torch.Tensor, audio_pts_range: tuple[int, int]
+) -> torch.Tensor:
+    """Drop audio rows whose estimated pts lie outside ``audio_pts_range``; an end pts of -1 means unbounded."""
+    start, end, num_samples = aframe_pts[0], aframe_pts[-1], aframes.size(0)
+    step_per_aframe = float(end - start + 1) / float(num_samples)
+    s_idx, e_idx = 0, num_samples
+    if start < audio_pts_range[0]:
+        s_idx = int((audio_pts_range[0] - start) / step_per_aframe)
+    if audio_pts_range[1] != -1 and end > audio_pts_range[1]:
+        # count back from the end explicitly: a bare negative offset that truncates to 0 would slice everything away
+        e_idx = max(num_samples + int((audio_pts_range[1] - end) / step_per_aframe), 0)
+    return aframes[s_idx:e_idx, :]
+
+
+def _read_video_from_file(
+    filename: str,
+    seek_frame_margin: float = 0.25,
+    read_video_stream: bool = True,
+    video_width: int = 0,
+    video_height: int = 0,
+    video_min_dimension: int = 0,
+    video_max_dimension: int = 0,
+    video_pts_range: tuple[int, int] = (0, -1),
+    video_timebase: Fraction = default_timebase,
+    read_audio_stream: bool = True,
+    audio_samples: int = 0,
+    audio_channels: int = 0,
+    audio_pts_range: tuple[int, int] = (0, -1),
+    audio_timebase: Fraction = default_timebase,
+) -> tuple[torch.Tensor, torch.Tensor, VideoMetaData]:
+    """
+    Reads a video from a file, returning the video frames, the audio frames and the stream metadata
+
+    Args:
+    filename (str): path to the video file
+    seek_frame_margin (float, optional): seeking frame in the stream is imprecise. Thus,
+        when video_start_pts is specified, we seek the pts earlier by seek_frame_margin seconds
+    read_video_stream (bool, optional): whether to read the video stream
+    video_width/video_height/video_min_dimension/video_max_dimension (int): together decide
+        the size of decoded frames:
+
+            - When video_width = 0, video_height = 0, video_min_dimension = 0,
+                and video_max_dimension = 0, keep the original frame resolution
+            - When video_width = 0, video_height = 0, video_min_dimension != 0,
+                and video_max_dimension = 0, keep the aspect ratio and resize the
+                frame so that shorter edge size is video_min_dimension
+            - When video_width = 0, video_height = 0, video_min_dimension = 0,
+                and video_max_dimension != 0, keep the aspect ratio and resize
+                the frame so that longer edge size is video_max_dimension
+            - When video_width = 0, video_height = 0, video_min_dimension != 0,
+                and video_max_dimension != 0, resize the frame so that shorter
+                edge size is video_min_dimension, and longer edge size is
+                video_max_dimension. The aspect ratio may not be preserved
+            - When video_width = 0, video_height != 0, video_min_dimension = 0,
+                and video_max_dimension = 0, keep the aspect ratio and resize
+                the frame so that frame video_height is $video_height
+            - When video_width != 0, video_height == 0, video_min_dimension = 0,
+                and video_max_dimension = 0, keep the aspect ratio and resize
+                the frame so that frame video_width is $video_width
+            - When video_width != 0, video_height != 0, video_min_dimension = 0,
+                and video_max_dimension = 0, resize the frame so that frame
+                video_width and  video_height are set to $video_width and
+                $video_height, respectively
+    video_pts_range (tuple(int, int), optional): the start and end presentation timestamp of video stream
+    video_timebase (Fraction, optional): a Fraction rational number which denotes timebase in video stream
+    read_audio_stream (bool, optional): whether to read the audio stream
+    audio_samples (int, optional): audio sampling rate
+    audio_channels (int optional): audio channels
+    audio_pts_range (tuple(int, int), optional): the start and end presentation timestamp of audio stream
+    audio_timebase (Fraction, optional): a Fraction rational number which denotes time base in audio stream
+
+    Returns:
+        vframes (Tensor[T, H, W, C]): the `T` video frames
+        aframes (Tensor[L, K]): the audio frames, where `L` is the number of points and
+            `K` is the number of audio_channels
+        info (VideoMetaData): metadata for the video and audio streams (timebase, duration, fps /
+            sample rate), built by ``_fill_info`` from the tensors the op returns
+    """
+    _raise_video_deprecation_warning()
+    _validate_pts(video_pts_range)
+    _validate_pts(audio_pts_range)
+
+    result = torch.ops.video_reader.read_video_from_file(
+        filename,
+        seek_frame_margin,
+        0,  # getPtsOnly
+        read_video_stream,
+        video_width,
+        video_height,
+        video_min_dimension,
+        video_max_dimension,
+        video_pts_range[0],
+        video_pts_range[1],
+        video_timebase.numerator,
+        video_timebase.denominator,
+        read_audio_stream,
+        audio_samples,
+        audio_channels,
+        audio_pts_range[0],
+        audio_pts_range[1],
+        audio_timebase.numerator,
+        audio_timebase.denominator,
+    )
+    vframes, _vframe_pts, vtimebase, vfps, vduration, aframes, aframe_pts, atimebase, asample_rate, aduration = result
+    info = _fill_info(vtimebase, vfps, vduration, atimebase, asample_rate, aduration)
+    if aframes.numel() > 0:
+        # when audio stream is found
+        aframes = _align_audio_frames(aframes, aframe_pts, audio_pts_range)
+    return vframes, aframes, info
+
+
+def _read_video_timestamps_from_file(filename: str) -> tuple[list[int], list[int], VideoMetaData]:
+    """
+    Decode all video- and audio frames in the video and return ``(video_pts, audio_pts, info)``.
+    Only pts (presentation timestamp) is returned. The actual frame pixel data is not
+    copied. Thus, it is much faster than read_video(...)
+    """
+    _raise_video_deprecation_warning()  # same deprecation notice as the other readers in this module
+    result = torch.ops.video_reader.read_video_from_file(
+        filename,
+        0,  # seek_frame_margin
+        1,  # getPtsOnly
+        1,  # read_video_stream
+        0,  # video_width
+        0,  # video_height
+        0,  # video_min_dimension
+        0,  # video_max_dimension
+        0,  # video_start_pts
+        -1,  # video_end_pts
+        0,  # video_timebase_num
+        1,  # video_timebase_den
+        1,  # read_audio_stream
+        0,  # audio_samples
+        0,  # audio_channels
+        0,  # audio_start_pts
+        -1,  # audio_end_pts
+        0,  # audio_timebase_num
+        1,  # audio_timebase_den
+    )
+    _vframes, vframe_pts, vtimebase, vfps, vduration, _aframes, aframe_pts, atimebase, asample_rate, aduration = result
+    info = _fill_info(vtimebase, vfps, vduration, atimebase, asample_rate, aduration)
+    vframe_pts = vframe_pts.tolist()  # Tensor.tolist() already yields Python ints; no NumPy round trip needed
+    aframe_pts = aframe_pts.tolist()
+    return vframe_pts, aframe_pts, info
+
+
+def _probe_video_from_file(filename: str) -> VideoMetaData:
+    """
+    Run the video_reader probe op on ``filename`` and pack its six outputs into a VideoMetaData
+    """
+    _raise_video_deprecation_warning()
+    probed = torch.ops.video_reader.probe_video_from_file(filename)
+    # output order: video timebase, fps, duration, then audio timebase, sample rate, duration
+    v_timebase, v_fps, v_duration, a_timebase, a_sample_rate, a_duration = probed
+    return _fill_info(v_timebase, v_fps, v_duration, a_timebase, a_sample_rate, a_duration)
+
+
+def _read_video_from_memory(
+    video_data: torch.Tensor,
+    seek_frame_margin: float = 0.25,
+    read_video_stream: int = 1,
+    video_width: int = 0,
+    video_height: int = 0,
+    video_min_dimension: int = 0,
+    video_max_dimension: int = 0,
+    video_pts_range: tuple[int, int] = (0, -1),
+    video_timebase_numerator: int = 0,
+    video_timebase_denominator: int = 1,
+    read_audio_stream: int = 1,
+    audio_samples: int = 0,
+    audio_channels: int = 0,
+    audio_pts_range: tuple[int, int] = (0, -1),
+    audio_timebase_numerator: int = 0,
+    audio_timebase_denominator: int = 1,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Reads a video from memory, returning both the video frames and the audio frames
+    This function is torchscriptable.
+
+    Args:
+    video_data (data type could be 1) torch.Tensor, dtype=torch.int8 or 2) python bytes):
+        compressed video content stored in either 1) torch.Tensor 2) python bytes
+    seek_frame_margin (float, optional): seeking frame in the stream is imprecise.
+        Thus, when video_start_pts is specified, we seek the pts earlier by seek_frame_margin seconds
+    read_video_stream (int, optional): whether read video stream. If yes, set to 1. Otherwise, 0
+    video_width/video_height/video_min_dimension/video_max_dimension (int): together decide
+        the size of decoded frames:
+
+            - When video_width = 0, video_height = 0, video_min_dimension = 0,
+                and video_max_dimension = 0, keep the original frame resolution
+            - When video_width = 0, video_height = 0, video_min_dimension != 0,
+                and video_max_dimension = 0, keep the aspect ratio and resize the
+                frame so that shorter edge size is video_min_dimension
+            - When video_width = 0, video_height = 0, video_min_dimension = 0,
+                and video_max_dimension != 0, keep the aspect ratio and resize
+                the frame so that longer edge size is video_max_dimension
+            - When video_width = 0, video_height = 0, video_min_dimension != 0,
+                and video_max_dimension != 0, resize the frame so that shorter
+                edge size is video_min_dimension, and longer edge size is
+                video_max_dimension. The aspect ratio may not be preserved
+            - When video_width = 0, video_height != 0, video_min_dimension = 0,
+                and video_max_dimension = 0, keep the aspect ratio and resize
+                the frame so that frame video_height is $video_height
+            - When video_width != 0, video_height == 0, video_min_dimension = 0,
+                and video_max_dimension = 0, keep the aspect ratio and resize
+                the frame so that frame video_width is $video_width
+            - When video_width != 0, video_height != 0, video_min_dimension = 0,
+                and video_max_dimension = 0, resize the frame so that frame
+                video_width and  video_height are set to $video_width and
+                $video_height, respectively
+    video_pts_range (tuple(int, int), optional): the start and end presentation timestamp of video stream
+    video_timebase_numerator / video_timebase_denominator (int, optional): a rational
+        number which denotes timebase in video stream
+    read_audio_stream (int, optional): whether read audio stream. If yes, set to 1. Otherwise, 0
+    audio_samples (int, optional): audio sampling rate
+    audio_channels (int optional): audio channels
+    audio_pts_range (tuple(int, int), optional): the start and end presentation timestamp of audio stream
+    audio_timebase_numerator / audio_timebase_denominator (int, optional):
+        a rational number which denotes time base in audio stream
+
+    Returns:
+        vframes (Tensor[T, H, W, C]): the `T` video frames
+        aframes (Tensor[L, K]): the audio frames, where `L` is the number of points and
+            `K` is the number of channels
+    """
+
+    _raise_video_deprecation_warning()
+    _validate_pts(video_pts_range)
+    _validate_pts(audio_pts_range)
+
+    if not isinstance(video_data, torch.Tensor):
+        with warnings.catch_warnings():
+            # Ignore the warning because we actually don't modify the buffer in this function
+            warnings.filterwarnings("ignore", message="The given buffer is not writable")
+            video_data = torch.frombuffer(video_data, dtype=torch.uint8)
+
+    result = torch.ops.video_reader.read_video_from_memory(
+        video_data,
+        seek_frame_margin,
+        0,  # getPtsOnly
+        read_video_stream,
+        video_width,
+        video_height,
+        video_min_dimension,
+        video_max_dimension,
+        video_pts_range[0],
+        video_pts_range[1],
+        video_timebase_numerator,
+        video_timebase_denominator,
+        read_audio_stream,
+        audio_samples,
+        audio_channels,
+        audio_pts_range[0],
+        audio_pts_range[1],
+        audio_timebase_numerator,
+        audio_timebase_denominator,
+    )
+
+    vframes, _vframe_pts, vtimebase, vfps, vduration, aframes, aframe_pts, atimebase, asample_rate, aduration = result
+
+    if aframes.numel() > 0:
+        # when audio stream is found
+        aframes = _align_audio_frames(aframes, aframe_pts, audio_pts_range)
+
+    return vframes, aframes
+
+
+def _read_video_timestamps_from_memory(
+    video_data: torch.Tensor,
+) -> tuple[list[int], list[int], VideoMetaData]:
+    """
+    Decode all frames in the video and return ``(video_pts, audio_pts, info)``; only pts
+    (presentation timestamp) is returned. The actual frame pixel data is not copied.
+    Thus, read_video_timestamps(...) is much faster than read_video(...)
+    """
+    _raise_video_deprecation_warning()  # warn before any work, as the other readers in this module do
+    if not isinstance(video_data, torch.Tensor):
+        with warnings.catch_warnings():
+            # Ignore the warning because we actually don't modify the buffer in this function
+            warnings.filterwarnings("ignore", message="The given buffer is not writable")
+            video_data = torch.frombuffer(video_data, dtype=torch.uint8)
+    result = torch.ops.video_reader.read_video_from_memory(
+        video_data,
+        0,  # seek_frame_margin
+        1,  # getPtsOnly
+        1,  # read_video_stream
+        0,  # video_width
+        0,  # video_height
+        0,  # video_min_dimension
+        0,  # video_max_dimension
+        0,  # video_start_pts
+        -1,  # video_end_pts
+        0,  # video_timebase_num
+        1,  # video_timebase_den
+        1,  # read_audio_stream
+        0,  # audio_samples
+        0,  # audio_channels
+        0,  # audio_start_pts
+        -1,  # audio_end_pts
+        0,  # audio_timebase_num
+        1,  # audio_timebase_den
+    )
+    _vframes, vframe_pts, vtimebase, vfps, vduration, _aframes, aframe_pts, atimebase, asample_rate, aduration = result
+    info = _fill_info(vtimebase, vfps, vduration, atimebase, asample_rate, aduration)
+
+    vframe_pts = vframe_pts.tolist()  # Tensor.tolist() already yields Python ints; no NumPy round trip needed
+    aframe_pts = aframe_pts.tolist()
+    return vframe_pts, aframe_pts, info
+
+
+def _probe_video_from_memory(
+    video_data: torch.Tensor,
+) -> VideoMetaData:
+    """
+    Probe a video held in memory (a Tensor, or a buffer that gets wrapped as uint8) and return its VideoMetaData
+    This function is torchscriptable
+    """
+    _raise_video_deprecation_warning()
+    if not isinstance(video_data, torch.Tensor):
+        with warnings.catch_warnings():
+            # Ignore the warning because we actually don't modify the buffer in this function
+            warnings.filterwarnings("ignore", message="The given buffer is not writable")
+            video_data = torch.frombuffer(video_data, dtype=torch.uint8)
+    result = torch.ops.video_reader.probe_video_from_memory(video_data)
+    vtimebase, vfps, vduration, atimebase, asample_rate, aduration = result
+    info = _fill_info(vtimebase, vfps, vduration, atimebase, asample_rate, aduration)
+    return info
+
+
+def _read_video(
+    filename: str,
+    start_pts: Union[float, Fraction] = 0,
+    end_pts: Optional[Union[float, Fraction]] = None,
+    pts_unit: str = "pts",
+) -> tuple[torch.Tensor, torch.Tensor, dict[str, float]]:
+    """
+    Read video and audio frames of ``filename`` between ``start_pts`` and ``end_pts``.
+
+    ``end_pts=None`` is forwarded as an end pts of -1. The returned dict holds "video_fps" and/or
+    "audio_fps" for the streams that the initial probe reported.
+    """
+    _raise_video_deprecation_warning()
+    unbounded = float("inf")
+    stop_pts = unbounded if end_pts is None else end_pts
+    if pts_unit == "pts":
+        warnings.warn(
+            "The pts_unit 'pts' gives wrong results and will be removed in a "
+            "follow-up version. Please use pts_unit 'sec'."
+        )
+
+    probed = _probe_video_from_file(filename)
+    has_video, has_audio = probed.has_video, probed.has_audio
+
+    def to_pts_range(time_base):
+        # "sec" inputs are divided by the timebase (start rounded down, end rounded up); infinity becomes -1
+        first, last = start_pts, stop_pts
+        if pts_unit == "sec":
+            ticks_per_unit = 1 / time_base
+            first = int(math.floor(start_pts * ticks_per_unit))
+            if last != unbounded:
+                last = int(math.ceil(stop_pts * ticks_per_unit))
+        return first, (-1 if last == unbounded else last)
+
+    video_timebase = audio_timebase = default_timebase
+    video_pts_range = audio_pts_range = (0, -1)
+    if has_video:
+        video_timebase = Fraction(probed.video_timebase.numerator, probed.video_timebase.denominator)
+        video_pts_range = to_pts_range(video_timebase)
+    if has_audio:
+        audio_timebase = Fraction(probed.audio_timebase.numerator, probed.audio_timebase.denominator)
+        audio_pts_range = to_pts_range(audio_timebase)
+
+    vframes, aframes, meta = _read_video_from_file(
+        filename,
+        read_video_stream=True,
+        video_pts_range=video_pts_range,
+        video_timebase=video_timebase,
+        read_audio_stream=True,
+        audio_pts_range=audio_pts_range,
+        audio_timebase=audio_timebase,
+    )
+
+    # report a rate only for the streams the probe found
+    rates = {}
+    if has_video:
+        rates["video_fps"] = meta.video_fps
+    if has_audio:
+        rates["audio_fps"] = meta.audio_sample_rate
+    return vframes, aframes, rates
+
+
+def _read_video_timestamps(
+    filename: str, pts_unit: str = "pts"
+) -> tuple[Union[list[int], list[Fraction]], Optional[float]]:
+    """Return ``(video_pts, video_fps)`` for ``filename``; pts become Fractions when ``pts_unit`` is "sec"."""
+    _raise_video_deprecation_warning()
+    if pts_unit == "pts":
+        warnings.warn(
+            "The pts_unit 'pts' gives wrong results and will be removed in a "
+            "follow-up version. Please use pts_unit 'sec'."
+        )
+    stamps: Union[list[int], list[Fraction]]
+    stamps, _audio_pts, meta = _read_video_timestamps_from_file(filename)
+    if pts_unit == "sec":
+        # scale every pts by the video stream timebase (numerator / denominator)
+        tick = Fraction(meta.video_timebase.numerator, meta.video_timebase.denominator)
+        stamps = [stamp * tick for stamp in stamps]
+    # no fps is reported unless the metadata says a video stream exists
+    if not meta.has_video:
+        return stamps, None
+    return stamps, meta.video_fps
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/image.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/image.py
new file mode 100644
index 0000000000000000000000000000000000000000..c88e58ca4cac5f39124ab257875ee3665858e720
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/image.py
@@ -0,0 +1,511 @@
+from enum import Enum
+from typing import Union
+from warnings import warn
+
+import torch
+
+from ..extension import _load_library
+from ..utils import _log_api_usage_once
+
+
+# Load the compiled ``image`` extension at import time. A failure is only
+# reported as a warning so that the rest of the package stays importable.
+try:
+    _load_library("image")
+except (ImportError, OSError) as e:
+    warn(
+        # The trailing ". " keeps the error text from running into the next sentence.
+        f"Failed to load image Python extension: '{e}'. "
+        f"If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. "
+        f"Otherwise, there might be something wrong with your environment. "
+        f"Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?"
+    )
+
+
+class ImageReadMode(Enum):
+    """Color conversion requested from a decoder (RGB, RGBA, grayscale, ...).
+
+    .. note::
+
+        Using this enum is optional: every ``mode`` parameter also accepts the
+        member name as a string, e.g. ``mode="RGB"``.
+
+    Available modes:
+
+    - UNCHANGED: loads the image as-is
+    - GRAY: converts to grayscale
+    - GRAY_ALPHA: converts to grayscale with transparency
+    - RGB: converts to RGB
+    - RGBA: converts to RGB with transparency (also aliased as RGB_ALPHA)
+
+    .. note::
+
+        Some decoders won't support all possible values, e.g. GRAY and
+        GRAY_ALPHA are only supported for PNG and JPEG images.
+    """
+
+    UNCHANGED = 0
+    GRAY = 1
+    GRAY_ALPHA = 2
+    RGB = 3
+    RGB_ALPHA = 4
+    RGBA = 4  # Same value as RGB_ALPHA, so Enum exposes it as an alias
+
+
+def read_file(path: str) -> torch.Tensor:
+    """
+    Read a file and return its raw bytes as a one dimensional uint8 Tensor.
+
+    Args:
+        path (str or ``pathlib.Path``): location of the file to read; it is
+            converted with ``str()`` before being passed to the native op.
+
+    Returns:
+        data (Tensor)
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(read_file)
+    return torch.ops.image.read_file(str(path))
+
+
+def write_file(filename: str, data: torch.Tensor) -> None:
+    """
+    Save the bytes held by a one dimensional uint8 tensor into a file.
+
+    Args:
+        filename (str or ``pathlib.Path``): destination path; it is converted
+            with ``str()`` before being passed to the native op
+        data (Tensor): the bytes to store in the output file
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(write_file)
+    destination = str(filename)
+    torch.ops.image.write_file(destination, data)
+
+
+def decode_png(
+    input: torch.Tensor,
+    mode: ImageReadMode = ImageReadMode.UNCHANGED,
+    apply_exif_orientation: bool = False,
+) -> torch.Tensor:
+    """
+    Decode the raw bytes of a PNG image into a 3 dimensional RGB or grayscale Tensor.
+
+    In most cases the output is a uint8 tensor with values in [0, 255]. For a
+    16-bit png the output is uint16 with values in [0, 65535] (supported from
+    torchvision ``0.21``). Since uint16 support is limited in pytorch, we
+    recommend calling
+    :func:`torchvision.transforms.v2.functional.to_dtype()` with ``scale=True``
+    on the result to obtain a uint8 or float tensor.
+
+    Args:
+        input (Tensor[1]): a one dimensional uint8 tensor containing
+            the raw bytes of the PNG image.
+        mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB".
+            A string is upper-cased and looked up by name in
+            :class:`~torchvision.io.ImageReadMode`. Default is "UNCHANGED".
+        apply_exif_orientation (bool): apply EXIF orientation transformation to the output tensor.
+            Default: False.
+
+    Returns:
+        output (Tensor[image_channels, image_height, image_width])
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(decode_png)
+    if isinstance(mode, str):
+        mode = ImageReadMode[mode.upper()]
+    return torch.ops.image.decode_png(input, mode.value, apply_exif_orientation)
+
+
+def encode_png(input: torch.Tensor, compression_level: int = 6) -> torch.Tensor:
+    """
+    Encode an image tensor in CHW layout and return the bytes of the
+    corresponding PNG file.
+
+    Args:
+        input (Tensor[channels, image_height, image_width]): uint8 image tensor of
+            ``c`` channels, where ``c`` must be 3 or 1.
+        compression_level (int): Compression factor for the resulting file, it must be a number
+            between 0 and 9. Default: 6
+
+    Returns:
+        Tensor[1]: A one dimensional uint8 tensor that contains the raw bytes of the
+            PNG file.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(encode_png)
+    return torch.ops.image.encode_png(input, compression_level)
+
+
+def write_png(input: torch.Tensor, filename: str, compression_level: int = 6):
+    """
+    Encode an image tensor in CHW layout (or HW in the case of grayscale
+    images) as PNG and store the result in a file.
+
+    Args:
+        input (Tensor[channels, image_height, image_width]): uint8 image tensor of
+            ``c`` channels, where ``c`` must be 1 or 3.
+        filename (str or ``pathlib.Path``): Path to save the image.
+        compression_level (int): Compression factor for the resulting file, it must be a number
+            between 0 and 9. Default: 6
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(write_png)
+    # Encoding and writing are delegated to the two public helpers.
+    write_file(filename, encode_png(input, compression_level))
+
+
+def decode_jpeg(
+    input: Union[torch.Tensor, list[torch.Tensor]],
+    mode: ImageReadMode = ImageReadMode.UNCHANGED,
+    device: Union[str, torch.device] = "cpu",
+    apply_exif_orientation: bool = False,
+) -> Union[torch.Tensor, list[torch.Tensor]]:
+    """Decode JPEG image(s) into 3D RGB or grayscale Tensor(s), on CPU or CUDA.
+
+    The values of the output tensor are uint8 between 0 and 255.
+
+    .. note::
+        When using a CUDA device, passing a list of tensors is more efficient than repeated individual calls to ``decode_jpeg``.
+        When using CPU the performance is equivalent.
+        The CUDA version of this function has explicitly been designed with thread-safety in mind.
+        This function does not return partial results in case of an error.
+
+    Args:
+        input (Tensor[1] or list[Tensor[1]]): a (list of) one dimensional uint8 tensor(s) containing
+            the raw bytes of the JPEG image. The tensor(s) must be on CPU,
+            regardless of the ``device`` parameter.
+        mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB".
+            Default is "UNCHANGED".  See :class:`~torchvision.io.ImageReadMode`
+            for available modes.
+        device (str or torch.device): The device on which the decoded image will
+            be stored. If a cuda device is specified, the image will be decoded
+            with `nvjpeg <https://developer.nvidia.com/nvjpeg>`_. This is only
+            supported for CUDA version >= 10.1
+
+            .. betastatus:: device parameter
+
+            .. warning::
+                There is a memory leak in the nvjpeg library for CUDA versions < 11.6.
+                Make sure to rely on CUDA 11.6 or above before using ``device="cuda"``.
+        apply_exif_orientation (bool): apply EXIF orientation transformation to the output tensor.
+            Default: False. Only implemented for JPEG format on CPU.
+
+    Returns:
+        output (Tensor[image_channels, image_height, image_width] or list[Tensor[image_channels, image_height, image_width]]):
+            The values of the output tensor(s) are uint8 between 0 and 255.
+            ``output.device`` will be set to the specified ``device``
+
+    Raises:
+        ValueError: if ``input`` is an empty list, a list containing
+            non-tensor elements, or if any input tensor is not on CPU.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(decode_jpeg)
+    # Normalize the string forms of ``device`` and ``mode`` before dispatching.
+    if isinstance(device, str):
+        device = torch.device(device)
+    if isinstance(mode, str):
+        mode = ImageReadMode[mode.upper()]
+
+    if isinstance(input, list):
+        # Batched input: the whole list is validated before anything is decoded.
+        if len(input) == 0:
+            raise ValueError("Input list must contain at least one element")
+        if not all(isinstance(t, torch.Tensor) for t in input):
+            raise ValueError("All elements of the input list must be tensors.")
+        if not all(t.device.type == "cpu" for t in input):
+            raise ValueError("Input list must contain tensors on CPU.")
+        if device.type == "cuda":
+            # NOTE: apply_exif_orientation is not forwarded on the CUDA path.
+            return torch.ops.image.decode_jpegs_cuda(input, mode.value, device)
+        else:
+            return [torch.ops.image.decode_jpeg(img, mode.value, apply_exif_orientation) for img in input]
+
+    else:  # input is tensor
+        if input.device.type != "cpu":
+            raise ValueError("Input tensor must be a CPU tensor")
+        if device.type == "cuda":
+            # The CUDA op is batched: wrap the single tensor and unwrap the result.
+            return torch.ops.image.decode_jpegs_cuda([input], mode.value, device)[0]
+        else:
+            return torch.ops.image.decode_jpeg(input, mode.value, apply_exif_orientation)
+
+
+def encode_jpeg(
+    input: Union[torch.Tensor, list[torch.Tensor]], quality: int = 75
+) -> Union[torch.Tensor, list[torch.Tensor]]:
+    """Encode RGB tensor(s) into raw encoded jpeg bytes, on CPU or CUDA.
+
+    .. note::
+        Passing a list of CUDA tensors is more efficient than repeated individual calls to ``encode_jpeg``.
+        For CPU tensors the performance is equivalent.
+
+    Args:
+        input (Tensor[channels, image_height, image_width] or List[Tensor[channels, image_height, image_width]]):
+            (list of) uint8 image tensor(s) of ``c`` channels, where ``c`` must be 1 or 3
+        quality (int): Quality of the resulting JPEG file(s). Must be a number between
+            1 and 100. Default: 75
+
+    Returns:
+        output (Tensor[1] or list[Tensor[1]]): A (list of) one dimensional uint8 tensor(s) that contain the raw bytes of the JPEG file.
+
+    Raises:
+        ValueError: if ``quality`` is outside [1, 100] or if ``input`` is an
+            empty list.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(encode_jpeg)
+    if quality < 1 or quality > 100:
+        raise ValueError("Image quality should be a positive number between 1 and 100")
+    if isinstance(input, list):
+        if not input:
+            raise ValueError("encode_jpeg requires at least one input tensor when a list is passed")
+        # The backend is picked from the device of the first tensor only; the
+        # remaining elements are not inspected here.
+        if input[0].device.type == "cuda":
+            return torch.ops.image.encode_jpegs_cuda(input, quality)
+        else:
+            return [torch.ops.image.encode_jpeg(image, quality) for image in input]
+    else:  # single input tensor
+        if input.device.type == "cuda":
+            # The CUDA op is batched: wrap the single tensor and unwrap the result.
+            return torch.ops.image.encode_jpegs_cuda([input], quality)[0]
+        else:
+            return torch.ops.image.encode_jpeg(input, quality)
+
+
+def write_jpeg(input: torch.Tensor, filename: str, quality: int = 75):
+    """
+    Encode an image tensor in CHW layout as JPEG and store the result in a file.
+
+    Args:
+        input (Tensor[channels, image_height, image_width]): uint8 image tensor of ``c``
+            channels, where ``c`` must be 1 or 3.
+        filename (str or ``pathlib.Path``): Path to save the image.
+        quality (int): Quality of the resulting JPEG file, it must be a number
+            between 1 and 100. Default: 75
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(write_jpeg)
+    encoded = encode_jpeg(input, quality)
+    # encode_jpeg() is annotated as returning a tensor or a list of tensors;
+    # the assert narrows the type to the single-tensor case.
+    assert isinstance(encoded, torch.Tensor)  # Needed for torchscript
+    write_file(filename, encoded)
+
+
+def decode_image(
+    input: Union[torch.Tensor, str],
+    mode: ImageReadMode = ImageReadMode.UNCHANGED,
+    apply_exif_orientation: bool = False,
+) -> torch.Tensor:
+    """Decode an image, given as a path or as raw encoded bytes, into a tensor.
+
+    Currently supported image formats are jpeg, png, gif and webp.
+
+    In most cases the output is a uint8 tensor with values in [0, 255].
+
+    For a 16-bit png the output is uint16 with values in [0, 65535]
+    (supported from torchvision ``0.21``). Since uint16 support is limited in
+    pytorch, we recommend calling
+    :func:`torchvision.transforms.v2.functional.to_dtype()` with ``scale=True``
+    on the result to obtain a uint8 or float tensor.
+
+    .. note::
+
+        ``decode_image()`` doesn't work yet on AVIF or HEIC images. For these
+        formats, directly call  :func:`~torchvision.io.decode_avif` or
+        :func:`~torchvision.io.decode_heic`.
+
+    Args:
+        input (Tensor or str or ``pathlib.Path``): The image to decode. A
+            tensor must be one dimensional and uint8, holding the raw bytes of
+            the image. Anything else is converted with ``str()`` and read as a
+            path through :func:`read_file`.
+        mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB".
+            Default is "UNCHANGED".  See :class:`~torchvision.io.ImageReadMode`
+            for available modes.
+        apply_exif_orientation (bool): apply EXIF orientation transformation to the output tensor.
+           Only applies to JPEG and PNG images. Default: False.
+
+    Returns:
+        output (Tensor[image_channels, image_height, image_width])
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(decode_image)
+    if not isinstance(input, torch.Tensor):
+        # A path was given: load the encoded bytes first.
+        input = read_file(str(input))
+    if isinstance(mode, str):
+        mode = ImageReadMode[mode.upper()]
+    return torch.ops.image.decode_image(input, mode.value, apply_exif_orientation)
+
+
+def read_image(
+    path: str,
+    mode: ImageReadMode = ImageReadMode.UNCHANGED,
+    apply_exif_orientation: bool = False,
+) -> torch.Tensor:
+    """[OBSOLETE] Use :func:`~torchvision.io.decode_image` instead."""
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(read_image)
+    # Read the encoded bytes from disk, then hand them to the generic decoder.
+    return decode_image(read_file(path), mode, apply_exif_orientation=apply_exif_orientation)
+
+
+def decode_gif(input: torch.Tensor) -> torch.Tensor:
+    """
+    Decode the raw bytes of a GIF image into a 3 or 4 dimensional RGB Tensor.
+
+    The output values are uint8 between 0 and 255. A GIF holding a single
+    image yields a tensor of shape ``(C, H, W)``; a GIF holding ``N`` images
+    yields ``(N, C, H, W)``.
+
+    Args:
+        input (Tensor[1]): a one dimensional contiguous uint8 tensor containing
+            the raw bytes of the GIF image.
+
+    Returns:
+        output (Tensor[image_channels, image_height, image_width] or Tensor[num_images, image_channels, image_height, image_width])
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(decode_gif)
+    decoded = torch.ops.image.decode_gif(input)
+    return decoded
+
+
+def decode_webp(
+    input: torch.Tensor,
+    mode: ImageReadMode = ImageReadMode.UNCHANGED,
+) -> torch.Tensor:
+    """
+    Decode the raw bytes of a WEBP image into a 3 dimensional RGB[A] Tensor.
+
+    The output values are uint8 between 0 and 255.
+
+    Args:
+        input (Tensor[1]): a one dimensional contiguous uint8 tensor containing
+            the raw bytes of the WEBP image.
+        mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB".
+            A string is upper-cased and looked up by name in
+            :class:`~torchvision.io.ImageReadMode`. Default is "UNCHANGED".
+
+    Returns:
+        Decoded image (Tensor[image_channels, image_height, image_width])
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(decode_webp)
+    if isinstance(mode, str):
+        mode = ImageReadMode[mode.upper()]
+    decoded = torch.ops.image.decode_webp(input, mode.value)
+    return decoded
+
+
+# TODO_AVIF_HEIC: Better support for torchscript. Scripting decode_avif or
+# decode_heic currently fails, mainly because of the logic in
+# _load_extra_decoders_once() (using global variables, try/except statements,
+# etc.).
+# The ops (torch.ops.extra_decoders_ns.decode_*) are otherwise torchscript-able,
+# and users who need torchscript can always just wrap those.
+
+# TODO_AVIF_HEIC: decode_image() should work for those. The key technical issue
+# we have here is that the format detection logic of decode_image() is
+# implemented in torchvision, and torchvision has zero knowledge of
+# torchvision-extra-decoders, so we cannot call the AVIF/HEIC C++ decoders
+# (those in torchvision-extra-decoders) from there.
+# A trivial check that could be done within torchvision would be to check the
+# file extension, if a path was passed. We could also just implement the
+# AVIF/HEIC detection logic in Python as a fallback, if the file detection
+# didn't find any format. In any case: properly determining whether a file is
+# HEIC is far from trivial, and relying on libmagic would probably be best
+
+
+# Set to True after the extra decoders have been exposed, so that the import
+# and registration below run at most once per process.
+_EXTRA_DECODERS_ALREADY_LOADED = False
+
+
+def _load_extra_decoders_once():
+    """Import ``torchvision_extra_decoders`` and expose its ops, once per process.
+
+    Raises:
+        RuntimeError: if the package cannot be imported or does not provide
+            ``expose_extra_decoders()``. The underlying ``ImportError`` is
+            attached as the cause.
+    """
+    global _EXTRA_DECODERS_ALREADY_LOADED
+    if _EXTRA_DECODERS_ALREADY_LOADED:
+        return
+
+    try:
+        import torchvision_extra_decoders
+
+        # torchvision-extra-decoders only supports linux for now. BUT, users on
+        # e.g. MacOS can still install it: they will get the pure-python
+        # 0.0.0.dev version:
+        # https://pypi.org/project/torchvision-extra-decoders/0.0.0.dev0, which
+        # is a dummy version that was created to reserve the namespace on PyPI.
+        # We have to check that expose_extra_decoders() exists for those users,
+        # so we can properly error on non-Linux archs.
+        # An explicit check is used instead of ``assert`` because asserts are
+        # stripped under ``python -O``, which would let the dummy package
+        # through and fail below with a bare AttributeError.
+        if not hasattr(torchvision_extra_decoders, "expose_extra_decoders"):
+            raise ImportError("torchvision_extra_decoders does not provide expose_extra_decoders()")
+    except ImportError as e:
+        raise RuntimeError(
+            "In order to enable the AVIF and HEIC decoding capabilities of "
+            "torchvision, you need to `pip install torchvision-extra-decoders`. "
+            "Just install the package, you don't need to update your code. "
+            "This is only supported on Linux, and this feature is still in BETA stage. "
+            "Please let us know of any issue: https://github.com/pytorch/vision/issues/new/choose. "
+            "Note that `torchvision-extra-decoders` is released under the LGPL license. "
+        ) from e
+
+    # This will expose torch.ops.extra_decoders_ns.decode_avif and torch.ops.extra_decoders_ns.decode_heic
+    torchvision_extra_decoders.expose_extra_decoders()
+
+    _EXTRA_DECODERS_ALREADY_LOADED = True
+
+
+def decode_avif(input: torch.Tensor, mode: ImageReadMode = ImageReadMode.UNCHANGED) -> torch.Tensor:
+    """Decode an AVIF image into a 3 dimensional RGB[A] Tensor.
+
+    .. warning::
+        In order to enable the AVIF decoding capabilities of torchvision, you
+        first need to run ``pip install torchvision-extra-decoders``. Just
+        install the package, you don't need to update your code. This is only
+        supported on Linux, and this feature is still in BETA stage. Please let
+        us know of any issue:
+        https://github.com/pytorch/vision/issues/new/choose. Note that
+        `torchvision-extra-decoders
+        <https://github.com/meta-pytorch/torchvision-extra-decoders/>`_ is
+        released under the LGPL license.
+
+    The values of the output tensor are in uint8 in [0, 255] for most images. If
+    the image has a bit-depth of more than 8, then the output tensor is uint16
+    in [0, 65535]. Since uint16 support is limited in pytorch, we recommend
+    calling :func:`torchvision.transforms.v2.functional.to_dtype()` with
+    ``scale=True`` after this function to convert the decoded image into a uint8
+    or float tensor.
+
+    Args:
+        input (Tensor[1]): a one dimensional contiguous uint8 tensor containing
+            the raw bytes of the AVIF image.
+        mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB".
+            Default is "UNCHANGED".  See :class:`~torchvision.io.ImageReadMode`
+            for available modes.
+
+    Returns:
+        Decoded image (Tensor[image_channels, image_height, image_width])
+
+    Raises:
+        RuntimeError: if ``torchvision-extra-decoders`` is not usable or if
+            ``input`` is not a uint8 tensor.
+        KeyError: if ``mode`` is a string that does not name an ``ImageReadMode``.
+    """
+    _load_extra_decoders_once()
+    if input.dtype != torch.uint8:
+        raise RuntimeError(f"Input tensor must have uint8 data type, got {input.dtype}")
+    # Accept the string form of ``mode`` like the other decoders in this
+    # module; without it ``mode="RGB"`` fails on ``mode.value``.
+    if isinstance(mode, str):
+        mode = ImageReadMode[mode.upper()]
+    return torch.ops.extra_decoders_ns.decode_avif(input, mode.value)
+
+
+def decode_heic(input: torch.Tensor, mode: ImageReadMode = ImageReadMode.UNCHANGED) -> torch.Tensor:
+    """Decode an HEIC image into a 3 dimensional RGB[A] Tensor.
+
+    .. warning::
+        In order to enable the HEIC decoding capabilities of torchvision, you
+        first need to run ``pip install torchvision-extra-decoders``. Just
+        install the package, you don't need to update your code. This is only
+        supported on Linux, and this feature is still in BETA stage. Please let
+        us know of any issue:
+        https://github.com/pytorch/vision/issues/new/choose. Note that
+        `torchvision-extra-decoders
+        <https://github.com/meta-pytorch/torchvision-extra-decoders/>`_ is
+        released under the LGPL license.
+
+    The values of the output tensor are in uint8 in [0, 255] for most images. If
+    the image has a bit-depth of more than 8, then the output tensor is uint16
+    in [0, 65535]. Since uint16 support is limited in pytorch, we recommend
+    calling :func:`torchvision.transforms.v2.functional.to_dtype()` with
+    ``scale=True`` after this function to convert the decoded image into a uint8
+    or float tensor.
+
+    Args:
+        input (Tensor[1]): a one dimensional contiguous uint8 tensor containing
+            the raw bytes of the HEIC image.
+        mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB".
+            Default is "UNCHANGED".  See :class:`~torchvision.io.ImageReadMode`
+            for available modes.
+
+    Returns:
+        Decoded image (Tensor[image_channels, image_height, image_width])
+
+    Raises:
+        RuntimeError: if ``torchvision-extra-decoders`` is not usable or if
+            ``input`` is not a uint8 tensor.
+        KeyError: if ``mode`` is a string that does not name an ``ImageReadMode``.
+    """
+    _load_extra_decoders_once()
+    if input.dtype != torch.uint8:
+        raise RuntimeError(f"Input tensor must have uint8 data type, got {input.dtype}")
+    # Accept the string form of ``mode`` like the other decoders in this
+    # module; without it ``mode="RGB"`` fails on ``mode.value``.
+    if isinstance(mode, str):
+        mode = ImageReadMode[mode.upper()]
+    return torch.ops.extra_decoders_ns.decode_heic(input, mode.value)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/video.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/video.py
new file mode 100644
index 0000000000000000000000000000000000000000..14edcf50aaaa5e7d242657ffdc2e3bebf105b8fc
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/video.py
@@ -0,0 +1,468 @@
+import gc
+import math
+import os
+import re
+import warnings
+from fractions import Fraction
+from typing import Any, Optional, Union
+
+import numpy as np
+import torch
+
+from ..utils import _log_api_usage_once
+from . import _video_opt
+from ._video_deprecation_warning import _raise_video_deprecation_warning
+
+# PyAV is an optional dependency. When it is missing or too old, ``av`` is
+# rebound to an ImportError instance, which _check_av_available() raises the
+# first time a video function is used.
+try:
+    import av
+
+    av.logging.set_level(av.logging.ERROR)
+    if hasattr(av.video.frame.VideoFrame, "pict_type"):
+        # Resolve the error class only on a usable PyAV. Once ``av`` has been
+        # replaced by an ImportError instance neither attribute exists, and
+        # the fallback lookup would abort the import of this module with an
+        # AttributeError that ``except ImportError`` below does not catch.
+        try:
+            FFmpegError = av.FFmpegError  # from av 14 https://github.com/PyAV-Org/PyAV/blob/main/CHANGELOG.rst
+        except AttributeError:
+            FFmpegError = av.AVError
+    else:
+        av = ImportError(
+            """\
+Your version of PyAV is too old for the necessary video operations in torchvision.
+If you are on Python 3.5, you will have to build from source (the conda-forge
+packages are not up-to-date).  See
+https://github.com/mikeboers/PyAV#installation for instructions on how to
+install PyAV on your system.
+"""
+        )
+except ImportError:
+    av = ImportError(
+        """\
+PyAV is not installed, and is necessary for the video operations in torchvision.
+See https://github.com/mikeboers/PyAV#installation for instructions on how to
+install PyAV on your system.
+"""
+    )
+
+
+def _check_av_available() -> None:
+    """Raise the stored import error when PyAV could not be loaded."""
+    if not isinstance(av, Exception):
+        return
+    raise av
+
+
+def _av_available() -> bool:
+    """Tell whether ``av`` is usable, i.e. was not replaced by an exception."""
+    if isinstance(av, Exception):
+        return False
+    return True
+
+
+# PyAV has some reference cycles, so _read_from_stream() counts its calls in
+# _CALLED_TIMES and runs gc.collect() once every _GC_COLLECTION_INTERVAL calls.
+_CALLED_TIMES = 0
+_GC_COLLECTION_INTERVAL = 10
+
+
+def write_video(
+    filename: str,
+    video_array: torch.Tensor,
+    fps: float,
+    video_codec: str = "libx264",
+    options: Optional[dict[str, Any]] = None,
+    audio_array: Optional[torch.Tensor] = None,
+    audio_fps: Optional[float] = None,
+    audio_codec: Optional[str] = None,
+    audio_options: Optional[dict[str, Any]] = None,
+) -> None:
+    """
+    [DEPRECATED] Writes a 4d tensor in [T, H, W, C] format in a video file.
+
+    .. warning::
+
+        DEPRECATED: All the video decoding and encoding capabilities of torchvision
+        are deprecated from version 0.22 and will be removed in version 0.24.  We
+        recommend that you migrate to
+        `TorchCodec <https://github.com/pytorch/torchcodec>`__, where we'll
+        consolidate the future decoding/encoding capabilities of PyTorch
+
+    This function relies on PyAV (therefore, ultimately FFmpeg) to encode
+    videos, you can get more fine-grained control by referring to the other
+    options at your disposal within `the FFMpeg wiki
+    <http://trac.ffmpeg.org/wiki#Encoding>`_.
+
+    Args:
+        filename (str): path where the video will be saved
+        video_array (Tensor[T, H, W, C]): tensor containing the individual frames,
+            as a uint8 tensor in [T, H, W, C] format
+        fps (Number): video frames per second. A ``float`` value is rounded
+            to the nearest integer before being passed to PyAV.
+        video_codec (str): the name of the video codec, e.g. "libx264", "h264", etc.
+        options (Dict): dictionary containing options to be passed into the PyAV video stream.
+            The list of options is codec-dependent and can all
+            be found from `the FFMpeg wiki <http://trac.ffmpeg.org/wiki#Encoding>`_.
+        audio_array (Tensor[C, N]): tensor containing the audio, where C is the number of channels
+            and N is the number of samples
+        audio_fps (Number): audio sample rate, typically 44100 or 48000
+        audio_codec (str): the name of the audio codec, e.g. "mp3", "aac", etc.
+        audio_options (Dict): dictionary containing options to be passed into the PyAV audio stream.
+            The list of options is codec-dependent and can all
+            be found from `the FFMpeg wiki <http://trac.ffmpeg.org/wiki#Encoding>`_.
+
+    Examples::
+        >>> # Creating libx264 video with CRF 17, for visually lossless footage:
+        >>>
+        >>> from torchvision.io import write_video
+        >>> # 1000 frames of 100x100, 3-channel image.
+        >>> vid = torch.randint(0, 256, (1000, 100, 100, 3), dtype=torch.uint8)
+        >>> write_video("video.mp4", vid, fps=30, options={"crf": "17"})
+
+    """
+    _raise_video_deprecation_warning()
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(write_video)
+    _check_av_available()
+    # Coerce the frames to a uint8 numpy array, which is what the per-frame
+    # conversion below consumes.
+    video_array = torch.as_tensor(video_array, dtype=torch.uint8).numpy(force=True)
+
+    # PyAV does not support floating point numbers with decimal point
+    # and will throw OverflowException in case this is not the case
+    if isinstance(fps, float):
+        fps = int(np.round(fps))
+
+    with av.open(filename, mode="w") as container:
+        stream = container.add_stream(video_codec, rate=fps)
+        # video_array is [T, H, W, C]: axis 2 is the width, axis 1 the height.
+        stream.width = video_array.shape[2]
+        stream.height = video_array.shape[1]
+        stream.pix_fmt = "yuv420p" if video_codec != "libx264rgb" else "rgb24"
+        stream.options = options or {}
+
+        if audio_array is not None:
+            # Sample-format name reported by the audio stream -> numpy dtype
+            # string used to convert the samples before building the frame.
+            audio_format_dtypes = {
+                "dbl": "<f8",
+                "dblp": "<f8",
+                "flt": "<f4",
+                "fltp": "<f4",
+                "s16": "<i2",
+                "s16p": "<i2",
+                "s32": "<i4",
+                "s32p": "<i4",
+                "u8": "u1",
+                "u8p": "u1",
+            }
+            a_stream = container.add_stream(audio_codec, rate=audio_fps)
+            a_stream.options = audio_options or {}
+
+            # audio_array is [C, N]; any channel count above one is labelled "stereo".
+            num_channels = audio_array.shape[0]
+            audio_layout = "stereo" if num_channels > 1 else "mono"
+            audio_sample_fmt = container.streams.audio[0].format.name
+
+            format_dtype = np.dtype(audio_format_dtypes[audio_sample_fmt])
+            audio_array = torch.as_tensor(audio_array).numpy(force=True).astype(format_dtype)
+
+            # The whole audio track is handed to the encoder as a single frame.
+            frame = av.AudioFrame.from_ndarray(audio_array, format=audio_sample_fmt, layout=audio_layout)
+
+            frame.sample_rate = audio_fps
+
+            for packet in a_stream.encode(frame):
+                container.mux(packet)
+
+            # Flush the audio encoder
+            for packet in a_stream.encode():
+                container.mux(packet)
+
+        for img in video_array:
+            frame = av.VideoFrame.from_ndarray(img, format="rgb24")
+            # Fall back to the enum member when the string form is rejected
+            # with a TypeError (presumably a PyAV API difference across
+            # versions -- TODO confirm).
+            try:
+                frame.pict_type = "NONE"
+            except TypeError:
+                from av.video.frame import PictureType  # noqa
+
+                frame.pict_type = PictureType.NONE
+
+            for packet in stream.encode(frame):
+                container.mux(packet)
+
+        # Flush stream
+        for packet in stream.encode():
+            container.mux(packet)
+
+
+def _read_from_stream(
+    container: "av.container.Container",
+    start_offset: float,
+    end_offset: float,
+    pts_unit: str,
+    stream: "av.stream.Stream",
+    stream_name: dict[str, Optional[Union[int, tuple[int, ...], list[int]]]],
+) -> list["av.frame.Frame"]:
+    """Decode the frames of ``stream`` whose pts falls in ``[start_offset, end_offset]``.
+
+    Args:
+        container: opened PyAV container to seek in and decode from.
+        start_offset: start of the requested range, in the unit given by ``pts_unit``.
+        end_offset: end of the requested range (inclusive); may be ``float("inf")``.
+        pts_unit: ``"sec"`` converts both offsets to stream ticks using
+            ``stream.time_base``; any other value leaves them untouched and
+            emits a warning.
+        stream: stream used for seeking and for the time-base conversion.
+        stream_name: keyword arguments forwarded to ``container.decode()`` to
+            select the stream(s) to decode.
+
+    Returns:
+        The selected frames sorted by pts. When ``start_offset`` is positive
+        and no decoded frame has exactly that pts, the closest earlier frame
+        is prepended. An empty list is returned when seeking fails.
+    """
+    global _CALLED_TIMES, _GC_COLLECTION_INTERVAL
+    _CALLED_TIMES += 1
+    # Run the garbage collector once every _GC_COLLECTION_INTERVAL calls.
+    if _CALLED_TIMES % _GC_COLLECTION_INTERVAL == _GC_COLLECTION_INTERVAL - 1:
+        gc.collect()
+
+    if pts_unit == "sec":
+        # TODO: we should change all of this from ground up to simply take
+        # sec and convert to MS in C++
+        start_offset = int(math.floor(start_offset * (1 / stream.time_base)))
+        if end_offset != float("inf"):
+            end_offset = int(math.ceil(end_offset * (1 / stream.time_base)))
+    else:
+        warnings.warn("The pts_unit 'pts' gives wrong results. Please use pts_unit 'sec'.")
+
+    # Decoded frames keyed by pts, so they can be sorted afterwards.
+    frames = {}
+    should_buffer = True
+    max_buffer_size = 5
+    if stream.type == "video":
+        # DivX-style packed B-frames can have out-of-order pts (2 frames in a single pkt)
+        # so need to buffer some extra frames to sort everything
+        # properly
+        extradata = stream.codec_context.extradata
+        # overly complicated way of finding if `divx_packed` is set, following
+        # https://github.com/FFmpeg/FFmpeg/commit/d5a21172283572af587b3d939eba0091484d3263
+        if extradata and b"DivX" in extradata:
+            # can't use regex directly because of some weird characters sometimes...
+            pos = extradata.find(b"DivX")
+            d = extradata[pos:]
+            o = re.search(rb"DivX(\d+)Build(\d+)(\w)", d)
+            if o is None:
+                o = re.search(rb"DivX(\d+)b(\d+)(\w)", d)
+            if o is not None:
+                # Buffer only when the character after the build number is "p".
+                should_buffer = o.group(3) == b"p"
+    seek_offset = start_offset
+    # some files don't seek to the right location, so better be safe here
+    seek_offset = max(seek_offset - 1, 0)
+    if should_buffer:
+        # FIXME this is kind of a hack, but we will jump to the previous keyframe
+        # so this will be safe
+        seek_offset = max(seek_offset - max_buffer_size, 0)
+    try:
+        # TODO check if stream needs to always be the video stream here or not
+        container.seek(seek_offset, any_frame=False, backward=True, stream=stream)
+    except FFmpegError:
+        # TODO add some warnings in this case
+        # print("Corrupted file?", container.name)
+        return []
+    buffer_count = 0
+    try:
+        for _idx, frame in enumerate(container.decode(**stream_name)):
+            frames[frame.pts] = frame
+            if frame.pts >= end_offset:
+                # When buffering, keep decoding up to max_buffer_size frames
+                # that reach end_offset before stopping, so that out-of-order
+                # frames are not missed.
+                if should_buffer and buffer_count < max_buffer_size:
+                    buffer_count += 1
+                    continue
+                break
+    except FFmpegError:
+        # TODO add a warning
+        # Decoding errors are swallowed: the frames collected so far are kept.
+        pass
+    # ensure that the results are sorted wrt the pts
+    result = [frames[i] for i in sorted(frames) if start_offset <= frames[i].pts <= end_offset]
+    if len(frames) > 0 and start_offset > 0 and start_offset not in frames:
+        # if there is no frame that exactly matches the pts of start_offset
+        # add the last frame smaller than start_offset, to guarantee that
+        # we will have all the necessary data. This is most useful for audio
+        preceding_frames = [i for i in frames if i < start_offset]
+        if len(preceding_frames) > 0:
+            first_frame_pts = max(preceding_frames)
+            result.insert(0, frames[first_frame_pts])
+    return result
+
+
+def _align_audio_frames(
+    aframes: torch.Tensor, audio_frames: list["av.frame.Frame"], ref_start: int, ref_end: float
+) -> torch.Tensor:
+    """Trim decoded audio samples to the requested ``[ref_start, ref_end]`` pts window.
+
+    The pts span covered by ``audio_frames`` is mapped linearly onto the second
+    dimension of ``aframes`` and the samples falling outside the window are
+    sliced off.
+
+    Args:
+        aframes: tensor whose second dimension is sliced; it is treated as the
+            sample axis.
+        audio_frames: the frames ``aframes`` was built from. Only the ``pts`` of
+            the first and last entries are read, so the list must be non-empty.
+        ref_start: first requested pts.
+        ref_end: last requested pts (may be ``float("inf")``).
+
+    Returns:
+        ``aframes`` sliced along its second dimension.
+    """
+    start, end = audio_frames[0].pts, audio_frames[-1].pts
+    total_aframes = aframes.shape[1]
+    if total_aframes == 0:
+        # nothing to trim, and the step computation below would divide by zero
+        return aframes
+    step_per_aframe = (end - start + 1) / total_aframes
+    s_idx = 0
+    e_idx = total_aframes
+    if start < ref_start:
+        s_idx = int((ref_start - start) / step_per_aframe)
+    if end > ref_end:
+        # ``int(...)`` is the (non-positive) number of trailing samples to drop.
+        # It is turned into an absolute index because using it directly as a
+        # negative slice bound breaks when it truncates to 0: ``[:, s_idx:0]``
+        # would discard every sample instead of none.
+        e_idx = max(total_aframes + int((ref_end - end) / step_per_aframe), 0)
+    return aframes[:, s_idx:e_idx]
+
+
+def read_video(
+    filename: str,
+    start_pts: Union[float, Fraction] = 0,
+    end_pts: Optional[Union[float, Fraction]] = None,
+    pts_unit: str = "pts",
+    output_format: str = "THWC",
+) -> tuple[torch.Tensor, torch.Tensor, dict[str, Any]]:
+    """[DEPRECATED] Reads a video from a file, returning both the video frames and the audio frames
+
+    .. warning::
+
+        DEPRECATED: All the video decoding and encoding capabilities of torchvision
+        are deprecated from version 0.22 and will be removed in version 0.24.  We
+        recommend that you migrate to
+        `TorchCodec <https://github.com/pytorch/torchcodec>`__, where we'll
+        consolidate the future decoding/encoding capabilities of PyTorch
+
+    Args:
+        filename (str): path to the video file. If using the pyav backend, this can be whatever ``av.open`` accepts.
+        start_pts (int if pts_unit = 'pts', float / Fraction if pts_unit = 'sec', optional):
+            The start presentation time of the video
+        end_pts (int if pts_unit = 'pts', float / Fraction if pts_unit = 'sec', optional):
+            The end presentation time
+        pts_unit (str, optional): unit in which start_pts and end_pts values will be interpreted,
+            either 'pts' or 'sec'. Defaults to 'pts'.
+        output_format (str, optional): The format of the output video tensors. Can be either "THWC" (default) or "TCHW".
+
+    Returns:
+        vframes (Tensor[T, H, W, C] or Tensor[T, C, H, W]): the `T` video frames
+        aframes (Tensor[K, L]): the audio frames, where `K` is the number of channels and `L` is the number of points
+        info (Dict): metadata for the video and audio. Can contain the fields video_fps (float) and audio_fps (int)
+
+    Raises:
+        ValueError: if ``output_format`` is not "THWC"/"TCHW", or (pyav backend)
+            if ``end_pts`` is smaller than ``start_pts``.
+        RuntimeError: if the backend is not pyav and ``filename`` does not exist.
+
+    With the pyav backend a decoding failure is not raised: a ``RuntimeWarning``
+    is emitted and whatever was decoded before the failure is returned.
+    """
+    _raise_video_deprecation_warning()
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(read_video)
+
+    output_format = output_format.upper()
+    if output_format not in ("THWC", "TCHW"):
+        raise ValueError(f"output_format should be either 'THWC' or 'TCHW', got {output_format}.")
+
+    from torchvision import get_video_backend
+
+    if get_video_backend() != "pyav":
+        if not os.path.exists(filename):
+            raise RuntimeError(f"File not found: {filename}")
+        vframes, aframes, info = _video_opt._read_video(filename, start_pts, end_pts, pts_unit)
+    else:
+        _check_av_available()
+
+        if end_pts is None:
+            end_pts = float("inf")
+
+        if end_pts < start_pts:
+            raise ValueError(
+                f"end_pts should be larger than start_pts, got start_pts={start_pts} and end_pts={end_pts}"
+            )
+
+        info = {}
+        video_frames = []
+        audio_frames = []
+        audio_timebase = _video_opt.default_timebase
+
+        try:
+            with av.open(filename, metadata_errors="ignore") as container:
+                if container.streams.audio:
+                    audio_timebase = container.streams.audio[0].time_base
+                if container.streams.video:
+                    video_frames = _read_from_stream(
+                        container,
+                        start_pts,
+                        end_pts,
+                        pts_unit,
+                        container.streams.video[0],
+                        {"video": 0},
+                    )
+                    video_fps = container.streams.video[0].average_rate
+                    # guard against potentially corrupted files
+                    if video_fps is not None:
+                        info["video_fps"] = float(video_fps)
+
+                if container.streams.audio:
+                    audio_frames = _read_from_stream(
+                        container,
+                        start_pts,
+                        end_pts,
+                        pts_unit,
+                        container.streams.audio[0],
+                        {"audio": 0},
+                    )
+                    info["audio_fps"] = container.streams.audio[0].rate
+
+        except FFmpegError as e:
+            # Best-effort read: keep whatever was decoded before the failure, but
+            # report it instead of silently returning partial/empty tensors.
+            warnings.warn(f"Failed to read {filename}; Caught error: {e}", RuntimeWarning)
+
+        vframes_list = [frame.to_rgb().to_ndarray() for frame in video_frames]
+        aframes_list = [frame.to_ndarray() for frame in audio_frames]
+
+        if vframes_list:
+            vframes = torch.as_tensor(np.stack(vframes_list))
+        else:
+            # placeholder with zero frames so the permute below still works
+            vframes = torch.empty((0, 1, 1, 3), dtype=torch.uint8)
+
+        if aframes_list:
+            aframes = np.concatenate(aframes_list, 1)
+            aframes = torch.as_tensor(aframes)
+            if pts_unit == "sec":
+                # convert the requested window from seconds to audio-stream pts
+                start_pts = int(math.floor(start_pts * (1 / audio_timebase)))
+                if end_pts != float("inf"):
+                    end_pts = int(math.ceil(end_pts * (1 / audio_timebase)))
+            aframes = _align_audio_frames(aframes, audio_frames, start_pts, end_pts)
+        else:
+            aframes = torch.empty((1, 0), dtype=torch.float32)
+
+    if output_format == "TCHW":
+        # [T,H,W,C] --> [T,C,H,W]
+        vframes = vframes.permute(0, 3, 1, 2)
+
+    return vframes, aframes, info
+
+
+def _can_read_timestamps_from_packets(container: "av.container.Container") -> bool:
+    """Tell whether timestamps may be taken straight from demuxed packets.
+
+    The answer is ``True`` only when the codec extradata of the container's
+    first stream is present and contains the bytes ``b"Lavc"``.
+    """
+    codec_extradata = container.streams[0].codec_context.extradata
+    return codec_extradata is not None and b"Lavc" in codec_extradata
+
+
+def _decode_video_timestamps(container: "av.container.Container") -> list[int]:
+    """Collect the non-``None`` pts of video stream 0, in iteration order.
+
+    Packets are demuxed (fast path) when ``_can_read_timestamps_from_packets``
+    allows it; otherwise the stream is decoded.
+    """
+    if _can_read_timestamps_from_packets(container):
+        # fast path
+        items = container.demux(video=0)
+    else:
+        items = container.decode(video=0)
+    return [item.pts for item in items if item.pts is not None]
+
+
+def read_video_timestamps(filename: str, pts_unit: str = "pts") -> tuple[list[int], Optional[float]]:
+    """[DEPRECATED] List the video frames timestamps.
+
+    .. warning::
+
+        DEPRECATED: All the video decoding and encoding capabilities of torchvision
+        are deprecated from version 0.22 and will be removed in version 0.24.  We
+        recommend that you migrate to
+        `TorchCodec <https://github.com/pytorch/torchcodec>`__, where we'll
+        consolidate the future decoding/encoding capabilities of PyTorch
+
+    Note that the function decodes the whole video frame-by-frame.
+
+    Args:
+        filename (str): path to the video file
+        pts_unit (str, optional): unit in which timestamp values will be returned
+            either 'pts' or 'sec'. Defaults to 'pts'.
+
+    Returns:
+        pts (List[int] if pts_unit = 'pts', List[Fraction] if pts_unit = 'sec'):
+            presentation timestamps for each one of the frames in the video.
+        video_fps (float, optional): the frame rate for the video, or ``None``
+            when it could not be determined
+
+    """
+    _raise_video_deprecation_warning()
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(read_video_timestamps)
+    from torchvision import get_video_backend
+
+    if get_video_backend() != "pyav":
+        return _video_opt._read_video_timestamps(filename, pts_unit)
+
+    _check_av_available()
+
+    video_fps = None
+    pts = []
+
+    try:
+        with av.open(filename, metadata_errors="ignore") as container:
+            if container.streams.video:
+                video_stream = container.streams.video[0]
+                video_time_base = video_stream.time_base
+                try:
+                    pts = _decode_video_timestamps(container)
+                except FFmpegError:
+                    warnings.warn(f"Failed decoding frames for file {filename}")
+                # guard against potentially corrupted files, as read_video does:
+                # float(None) would raise a TypeError
+                if video_stream.average_rate is not None:
+                    video_fps = float(video_stream.average_rate)
+    except FFmpegError as e:
+        msg = f"Failed to open container for {filename}; Caught error: {e}"
+        warnings.warn(msg, RuntimeWarning)
+
+    pts.sort()
+
+    if pts_unit == "sec":
+        # ``video_time_base`` is always bound when ``pts`` is non-empty, because
+        # ``pts`` is only filled after the time base has been read
+        pts = [x * video_time_base for x in pts]
+
+    return pts, video_fps
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/video_reader.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/video_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..efc58c4790557a7b15478d5fd9d8feacfbf489c9
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/io/video_reader.py
@@ -0,0 +1,296 @@
+import io
+import warnings
+from collections.abc import Iterator
+
+from typing import Any
+
+import torch
+
+from ..utils import _log_api_usage_once
+from ._video_deprecation_warning import _raise_video_deprecation_warning
+
+from ._video_opt import _HAS_CPU_VIDEO_DECODER
+
+# ``_has_video_opt`` is defined exactly once, at import time: which of the two
+# constant-returning variants gets bound depends on the truthiness of
+# ``_HAS_CPU_VIDEO_DECODER`` (imported from ``._video_opt``) at that moment.
+if _HAS_CPU_VIDEO_DECODER:
+
+    def _has_video_opt() -> bool:
+        # variant bound when ``_HAS_CPU_VIDEO_DECODER`` is truthy
+        return True
+
+else:
+
+    def _has_video_opt() -> bool:
+        # variant bound when ``_HAS_CPU_VIDEO_DECODER`` is falsy
+        return False
+
+
+# PyAV is imported defensively. When the import fails, or the installed PyAV
+# lacks ``VideoFrame.pict_type``, the name ``av`` is bound to an ImportError
+# *instance* (not raised), so importing this module never fails because of
+# PyAV. Code that uses ``av`` must therefore expect it to possibly be an
+# exception object rather than the module.
+try:
+    import av
+
+    # set PyAV's logging level to ERROR
+    av.logging.set_level(av.logging.ERROR)
+    # the presence of ``pict_type`` is used as the "recent enough PyAV" check
+    if not hasattr(av.video.frame.VideoFrame, "pict_type"):
+        av = ImportError(
+            """\
+Your version of PyAV is too old for the necessary video operations in torchvision.
+If you are on Python 3.5, you will have to build from source (the conda-forge
+packages are not up-to-date).  See
+https://github.com/mikeboers/PyAV#installation for instructions on how to
+install PyAV on your system.
+"""
+        )
+except ImportError:
+    av = ImportError(
+        """\
+PyAV is not installed, and is necessary for the video operations in torchvision.
+See https://github.com/mikeboers/PyAV#installation for instructions on how to
+install PyAV on your system.
+"""
+    )
+
+
+class VideoReader:
+    """[DEPRECATED] Fine-grained video-reading API.
+    Supports frame-by-frame reading of various streams from a single video
+    container. Much like previous video_reader API it supports the following
+    backends: video_reader, pyav, and cuda.
+    Backends can be set via `torchvision.set_video_backend` function.
+
+    .. warning::
+
+        DEPRECATED: All the video decoding and encoding capabilities of torchvision
+        are deprecated from version 0.22 and will be removed in version 0.24.  We
+        recommend that you migrate to
+        `TorchCodec <https://github.com/pytorch/torchcodec>`__, where we'll
+        consolidate the future decoding/encoding capabilities of PyTorch
+
+    .. betastatus:: VideoReader class
+
+    Example:
+        The following examples creates a :mod:`VideoReader` object, seeks into 2s
+        point, and returns a single frame::
+
+            import torchvision
+            video_path = "path_to_a_test_video"
+            reader = torchvision.io.VideoReader(video_path, "video")
+            reader.seek(2.0)
+            frame = next(reader)
+
+        :mod:`VideoReader` implements the iterable API, which makes it suitable to
+        using it in conjunction with :mod:`itertools` for more advanced reading.
+        As such, we can use a :mod:`VideoReader` instance inside for loops::
+
+            reader.seek(2)
+            for frame in reader:
+                frames.append(frame['data'])
+            # additionally, `seek` implements a fluent API, so we can do
+            for frame in reader.seek(2):
+                frames.append(frame['data'])
+
+        With :mod:`itertools`, we can read all frames between 2 and 5 seconds with the
+        following code::
+
+            for frame in itertools.takewhile(lambda x: x['pts'] <= 5, reader.seek(2)):
+                frames.append(frame['data'])
+
+        and similarly, reading 10 frames after the 2s timestamp can be achieved
+        as follows::
+
+            for frame in itertools.islice(reader.seek(2), 10):
+                frames.append(frame['data'])
+
+    .. note::
+
+        Each stream descriptor consists of two parts: stream type (e.g. 'video') and
+        a unique stream id (which are determined by the video encoding).
+        In this way, if the video container contains multiple
+        streams of the same type, users can access the one they want.
+        If only stream type is passed, the decoder auto-detects first stream of that type.
+
+    Args:
+        src (string, bytes object, or tensor): The media source.
+            If string-type, it must be a file path supported by FFMPEG.
+            If bytes, should be an in-memory representation of a file supported by FFMPEG.
+            If Tensor, it is interpreted internally as byte buffer.
+            It must be one-dimensional, of type ``torch.uint8``.
+
+        stream (string, optional): descriptor of the required stream, followed by the stream id,
+            in the format ``{stream_type}:{stream_id}``. Defaults to ``"video:0"``.
+            Currently available options include ``['video', 'audio']``
+
+        num_threads (int, optional): number of threads used by the codec to decode video.
+            Default value (0) enables multithreading with codec-dependent heuristic. The performance
+            will depend on the version of FFMPEG codecs supported.
+    """
+
+    def __init__(
+        self,
+        src: str,
+        stream: str = "video",
+        num_threads: int = 0,
+    ) -> None:
+        """Validate ``src`` for the active video backend and open the decoder.
+
+        Although annotated as ``str``, ``src`` may also be a ``bytes`` object or
+        a ``torch.Tensor``; which of these is accepted depends on the backend
+        returned by ``get_video_backend()``.
+
+        Raises:
+            ValueError: if ``src`` is an empty string or of an unsupported type.
+            RuntimeError: if ``src`` is ``bytes`` with the cuda backend, a Tensor
+                with the cuda or pyav backend, or if the backend is unknown.
+            ImportError: if the pyav backend is selected but PyAV is unavailable.
+        """
+        _raise_video_deprecation_warning()
+        _log_api_usage_once(self)
+        from .. import get_video_backend
+
+        self.backend = get_video_backend()
+        if isinstance(src, str):
+            if not src:
+                raise ValueError("src cannot be empty")
+        elif isinstance(src, bytes):
+            if self.backend == "cuda":
+                # only cuda rejects bytes; pyav reads them through a BytesIO below
+                raise RuntimeError("VideoReader cannot be initialized from bytes object when using cuda backend.")
+            elif self.backend == "pyav":
+                src = io.BytesIO(src)
+            else:
+                with warnings.catch_warnings():
+                    # Ignore the warning because we actually don't modify the buffer in this function
+                    warnings.filterwarnings("ignore", message="The given buffer is not writable")
+                    src = torch.frombuffer(src, dtype=torch.uint8)
+        elif isinstance(src, torch.Tensor):
+            if self.backend in ["cuda", "pyav"]:
+                raise RuntimeError(
+                    "VideoReader cannot be initialized from Tensor object when using cuda or pyav backend."
+                )
+        else:
+            raise ValueError(f"src must be either string, Tensor or bytes object. Got {type(src)}")
+
+        if self.backend == "cuda":
+            device = torch.device("cuda")
+            self._c = torch.classes.torchvision.GPUDecoder(src, device)
+
+        elif self.backend == "video_reader":
+            if isinstance(src, str):
+                self._c = torch.classes.torchvision.Video(src, stream, num_threads)
+            elif isinstance(src, torch.Tensor):
+                # bytes input was converted to a uint8 tensor above
+                self._c = torch.classes.torchvision.Video("", "", 0)
+                self._c.init_from_memory(src, stream, num_threads)
+
+        elif self.backend == "pyav":
+            # ``av`` is an ImportError instance when PyAV is missing or too old;
+            # raise it rather than failing with an opaque AttributeError
+            if isinstance(av, Exception):
+                raise av
+            self.container = av.open(src, metadata_errors="ignore")
+            # TODO: load metadata
+            # descriptor format is "{stream_type}" or "{stream_type}:{stream_id}"
+            stream_parts = stream.split(":")
+            stream_type = stream_parts[0]
+            stream_id = 0 if len(stream_parts) == 1 else int(stream_parts[1])
+            self.pyav_stream = {stream_type: stream_id}
+            self._c = self.container.decode(**self.pyav_stream)
+
+            # TODO: add extradata exception
+
+        else:
+            raise RuntimeError(f"Unknown video backend: {self.backend}")
+
+    def __next__(self) -> dict[str, Any]:
+        """Decodes and returns the next frame of the current stream.
+        Frames are encoded as a dict with mandatory
+        data and pts fields, where data is a tensor, and pts is a
+        presentation timestamp of the frame expressed in seconds
+        as a float (``None`` with the cuda backend).
+
+        Returns:
+            (dict): a dictionary and containing decoded frame (``data``)
+            and corresponding timestamp (``pts``) in seconds
+
+        Raises:
+            StopIteration: when the stream is exhausted, the decoder returns an
+                empty frame, or (pyav backend) the current stream is neither a
+                video nor an audio stream and so yields no tensor data.
+        """
+        if self.backend == "cuda":
+            frame = self._c.next()
+            if frame.numel() == 0:
+                raise StopIteration
+            return {"data": frame, "pts": None}
+        elif self.backend == "video_reader":
+            frame, pts = self._c.next()
+        else:
+            try:
+                frame = next(self._c)
+                pts = float(frame.pts * frame.time_base)
+                if "video" in self.pyav_stream:
+                    # (H, W, C) -> (C, H, W)
+                    frame = torch.as_tensor(frame.to_rgb().to_ndarray()).permute(2, 0, 1)
+                elif "audio" in self.pyav_stream:
+                    frame = torch.as_tensor(frame.to_ndarray()).permute(1, 0)
+                else:
+                    frame = None
+            except av.error.EOFError:
+                raise StopIteration
+
+        # ``frame`` is None for pyav streams that are neither video nor audio;
+        # check it first so ``numel()`` is never called on None
+        if frame is None or frame.numel() == 0:
+            raise StopIteration
+
+        return {"data": frame, "pts": pts}
+
+    def __iter__(self) -> Iterator[dict[str, Any]]:
+        """Return the reader itself; frames are produced by ``__next__``."""
+        return self
+
+    def seek(self, time_s: float, keyframes_only: bool = False) -> "VideoReader":
+        """Seek within current stream.
+
+        Args:
+            time_s (float): seek time in seconds
+            keyframes_only (bool): allow to seek only to keyframes
+
+        Returns:
+            The reader itself, so that calls can be chained.
+
+        .. note::
+            Current implementation is the so-called precise seek. This
+            means following seek, call to :mod:`next()` will return the
+            frame with the exact timestamp if it exists or
+            the first frame with timestamp larger than ``time_s``.
+            With the pyav backend an accurate seek is not implemented and a
+            warning is emitted unless ``keyframes_only`` is set.
+        """
+        if self.backend not in ("cuda", "video_reader"):
+            # pyav doesn't catch negative times, so clamp them here
+            time_s = max(time_s, 0)
+            target_stream = self.container.streams.get(**self.pyav_stream)[0]
+            # express the seek time in the stream's own time base
+            offset = int(round(time_s / target_stream.time_base))
+            if not keyframes_only:
+                warnings.warn("Accurate seek is not implemented for pyav backend")
+            self.container.seek(offset, backward=True, any_frame=False, stream=target_stream)
+            # the previous decode iterator is replaced after seeking
+            self._c = self.container.decode(**self.pyav_stream)
+            return self
+
+        self._c.seek(time_s, keyframes_only)
+        return self
+
+    def get_metadata(self) -> dict[str, Any]:
+        """Returns video metadata
+
+        Returns:
+            (dict): dictionary containing duration and frame rate for every stream.
+            With the pyav backend it maps each stream type to
+            ``{"fps" | "framerate": [...], "duration": [...]}`` with one list
+            entry per stream of that type (``"fps"`` is used for video streams,
+            ``"framerate"`` for all others). Other backends return whatever the
+            underlying decoder reports.
+        """
+        if self.backend == "pyav":
+            metadata: dict[str, Any] = {}
+            for stream in self.container.streams:
+                # The rate key depends on the *current* stream's type, so it is
+                # derived on every iteration. Deriving it only when a type is
+                # first seen left a stale key in place when stream types were
+                # interleaved (e.g. video, audio, video) and raised KeyError.
+                rate_n = "fps" if stream.type == "video" else "framerate"
+                if stream.type not in metadata:
+                    metadata[stream.type] = {rate_n: [], "duration": []}
+
+                # streams without a (truthy) average_rate fall back to sample_rate
+                rate = getattr(stream, "average_rate", None) or stream.sample_rate
+
+                # NOTE(review): assumes stream.duration is not None - confirm for
+                # containers that do not store a per-stream duration
+                metadata[stream.type]["duration"].append(float(stream.duration * stream.time_base))
+                metadata[stream.type][rate_n].append(float(rate))
+            return metadata
+        return self._c.get_metadata()
+
+    def set_current_stream(self, stream: str) -> bool:
+        """Select the stream that subsequent reads operate on.
+
+        Args:
+            stream (string): descriptor of the required stream, either
+                ``"{stream_type}"`` or ``"{stream_type}:{stream_id}"``.
+                Currently available stream types include ``['video', 'audio']``.
+                The stream id distinguishes several streams of the same type
+                inside one container; when it is omitted the pyav backend
+                uses id 0.
+
+        Returns:
+            (bool): True on success, False otherwise. The pyav backend always
+            returns True; other backends return the decoder's own result.
+        """
+        if self.backend == "cuda":
+            warnings.warn("GPU decoding only works with video stream.")
+        if self.backend != "pyav":
+            return self._c.set_current_stream(stream)
+
+        descriptor = stream.split(":")
+        stream_type = descriptor[0]
+        stream_id = int(descriptor[1]) if len(descriptor) != 1 else 0
+        self.pyav_stream = {stream_type: stream_id}
+        # start a fresh decode iterator on the newly selected stream
+        self._c = self.container.decode(**self.pyav_stream)
+        return True
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ea0a1f7178b6ca03776d58c17411a8ff483f8b2
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__init__.py
@@ -0,0 +1,23 @@
+from .alexnet import *
+from .convnext import *
+from .densenet import *
+from .efficientnet import *
+from .googlenet import *
+from .inception import *
+from .mnasnet import *
+from .mobilenet import *
+from .regnet import *
+from .resnet import *
+from .shufflenetv2 import *
+from .squeezenet import *
+from .vgg import *
+from .vision_transformer import *
+from .swin_transformer import *
+from .maxvit import *
+from . import detection, optical_flow, quantization, segmentation, video
+
+# The Weights and WeightsEnum are developer-facing utils that we make public for
+# downstream libs like torchgeo https://github.com/pytorch/vision/issues/7094
+# TODO: we could / should document them publicly, but it's not clear where, as
+# they're not intended for end users.
+from ._api import get_model, get_model_builder, get_model_weights, get_weight, list_models, Weights, WeightsEnum
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bdc2a9fe845cf9ec18e9d7d7511c14374739dd97
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/_api.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/_api.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1cc5e80554bb6bec7040fa70f41d74ffb5b26b33
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/_api.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/_meta.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/_meta.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..211d4c1540d090f63009fa9e141fe413388fb37e
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/_meta.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/_utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5e4fc9211342117057312b44d977c899501dcd31
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/_utils.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/alexnet.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/alexnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..31d8f5747cfff61e5b3f370a94901efc27bcd58a
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/alexnet.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/convnext.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/convnext.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a44821adca403d1bdf706fa33d0b93ece5234ec9
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/convnext.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/densenet.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/densenet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ce4a6a1d905d6d472b50cb69c6c0282b4a3c5405
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/densenet.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/efficientnet.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/efficientnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae7cdd112038809039a084d9b17f3752fb762856
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/efficientnet.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/feature_extraction.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/feature_extraction.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cdeb5a7421345b971604629bbd2c49955ece504e
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/feature_extraction.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/googlenet.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/googlenet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c4a54b1b0d8cf0850b3332b267f8ccf2e2489b28
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/googlenet.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/inception.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/inception.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3595d8d160b8c9d3701b34863eb19b97e89f7cf5
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/inception.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/maxvit.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/maxvit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c7e4be36a734660470d1d649deaa48ccc6f2d28f
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/maxvit.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/mnasnet.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/mnasnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b4b109623db7700ef1e838968e3965cab2f06d01
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/mnasnet.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/mobilenet.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/mobilenet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..028b37c1cb1376ba82ec591d720c7afacdb5f57d
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/mobilenet.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/mobilenetv2.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/mobilenetv2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e7a49a500c370e4c5a25a3061cccd9e23e5a00e8
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/mobilenetv2.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/mobilenetv3.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/mobilenetv3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1b383b7e59f31dcffab699004845d7525f4bea7f
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/mobilenetv3.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/regnet.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/regnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c0693324a9f6141fa987a913e7eabc548543a2e8
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/regnet.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/resnet.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/resnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b956ceca37b03c3a50ebb0d0118ab96fac957335
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/resnet.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/shufflenetv2.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/shufflenetv2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..58f5947fe947d0e1fab3a8a59161321faf510d55
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/shufflenetv2.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/squeezenet.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/squeezenet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dfbdd99728f58357485e8c65e4104721a37fc3ca
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/squeezenet.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/swin_transformer.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/swin_transformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4cf13d40b182ebdbdcb9acf99e87395b318e890d
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/swin_transformer.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/vgg.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/vgg.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c764b427556aa17719d5d2a33ae1923dcc576d5d
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/vgg.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/vision_transformer.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/vision_transformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e44f141ecf91ba18385a8a588e06b82176f435c7
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/__pycache__/vision_transformer.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/_api.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..358e6f431591c30edcfe4d5ebc6dee8a5a44b130
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/_api.py
@@ -0,0 +1,277 @@
+import fnmatch
+import importlib
+import inspect
+import sys
+from collections.abc import Iterable, Mapping
+from dataclasses import dataclass
+from enum import Enum
+from functools import partial
+from inspect import signature
+from types import ModuleType
+from typing import Any, Callable, get_args, Optional, TypeVar, Union
+
+from torch import nn
+
+from .._internally_replaced_utils import load_state_dict_from_url
+
+
+__all__ = ["WeightsEnum", "Weights", "get_model", "get_model_builder", "get_model_weights", "get_weight", "list_models"]
+
+
+@dataclass
+class Weights:
+    """
+    This class is used to group important attributes associated with the pre-trained weights.
+
+    Args:
+        url (str): The location where we find the weights.
+        transforms (Callable): A callable that constructs the preprocessing method (or validation preset transforms)
+            needed to use the model. The reason we attach a constructor method rather than an already constructed
+            object is because the specific object might have memory and thus we want to delay initialization until
+            needed.
+        meta (Dict[str, Any]): Stores meta-data related to the weights of the model and its configuration. These can be
+            informative attributes (for example the number of parameters/flops, recipe link/methods used in training
+            etc), configuration parameters (for example the `num_classes`) needed to construct the model or important
+            meta-data (for example the `classes` of a classification model) needed to use the model.
+    """
+
+    url: str
+    transforms: Callable
+    meta: dict[str, Any]
+
+    def __eq__(self, other: Any) -> bool:
+        # We need this custom implementation for correct deep-copy and deserialization behavior.
+        # TL;DR: After the definition of an enum, creating a new instance, i.e. by deep-copying or deserializing it,
+        # involves an equality check against the defined members. Unfortunately, the `transforms` attribute is often
+        # defined with `functools.partial` and `fn = partial(...); assert deepcopy(fn) != fn`. Without custom handling
+        # for it, the check against the defined members would fail and effectively prevent the weights from being
+        # deep-copied or deserialized.
+        # See https://github.com/pytorch/vision/pull/7107 for details.
+        if not isinstance(other, Weights):
+            return NotImplemented
+
+        if self.url != other.url:
+            return False
+
+        if self.meta != other.meta:
+            return False
+
+        if isinstance(self.transforms, partial) and isinstance(other.transforms, partial):
+            return (
+                self.transforms.func == other.transforms.func
+                and self.transforms.args == other.transforms.args
+                and self.transforms.keywords == other.transforms.keywords
+            )
+        else:
+            return self.transforms == other.transforms
+
+
+class WeightsEnum(Enum):
+    """
+    This class is the parent class of all model weights. Each model building method receives an optional `weights`
+    parameter with its associated pre-trained weights. It inherits from `Enum` and its values should be of type
+    `Weights`.
+
+    Args:
+        value (Weights): The data class entry with the weight information.
+    """
+
+    @classmethod
+    def verify(cls, obj: Any) -> Any:
+        if obj is not None:
+            if type(obj) is str:
+                obj = cls[obj.replace(cls.__name__ + ".", "")]
+            elif not isinstance(obj, cls):
+                raise TypeError(
+                    f"Invalid Weight class provided; expected {cls.__name__} but received {obj.__class__.__name__}."
+                )
+        return obj
+
+    def get_state_dict(self, *args: Any, **kwargs: Any) -> Mapping[str, Any]:
+        return load_state_dict_from_url(self.url, *args, **kwargs)
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}.{self._name_}"
+
+    @property
+    def url(self):
+        return self.value.url
+
+    @property
+    def transforms(self):
+        return self.value.transforms
+
+    @property
+    def meta(self):
+        return self.value.meta
+
+
+def get_weight(name: str) -> WeightsEnum:
+    """
+    Gets the weights enum value by its full name. Example: "ResNet50_Weights.IMAGENET1K_V1"
+
+    Args:
+        name (str): The name of the weight enum entry.
+
+    Returns:
+        WeightsEnum: The requested weight enum.
+    """
+    try:
+        enum_name, value_name = name.split(".")
+    except ValueError:
+        raise ValueError(f"Invalid weight name provided: '{name}'.")
+
+    base_module_name = ".".join(sys.modules[__name__].__name__.split(".")[:-1])
+    base_module = importlib.import_module(base_module_name)
+    model_modules = [base_module] + [
+        x[1]
+        for x in inspect.getmembers(base_module, inspect.ismodule)
+        if x[1].__file__.endswith("__init__.py")  # type: ignore[union-attr]
+    ]
+
+    weights_enum = None
+    for m in model_modules:
+        potential_class = m.__dict__.get(enum_name, None)
+        if potential_class is not None and issubclass(potential_class, WeightsEnum):
+            weights_enum = potential_class
+            break
+
+    if weights_enum is None:
+        raise ValueError(f"The weight enum '{enum_name}' for the specific method couldn't be retrieved.")
+
+    return weights_enum[value_name]
+
+
+def get_model_weights(name: Union[Callable, str]) -> type[WeightsEnum]:
+    """
+    Returns the weights enum class associated to the given model.
+
+    Args:
+        name (callable or str): The model builder function or the name under which it is registered.
+
+    Returns:
+        weights_enum (WeightsEnum): The weights enum class associated with the model.
+    """
+    model = get_model_builder(name) if isinstance(name, str) else name
+    return _get_enum_from_fn(model)
+
+
+def _get_enum_from_fn(fn: Callable) -> type[WeightsEnum]:
+    """
+    Internal method that gets the weight enum of a specific model builder method.
+
+    Args:
+        fn (Callable): The builder method used to create the model.
+    Returns:
+        WeightsEnum: The requested weight enum.
+    """
+    sig = signature(fn)
+    if "weights" not in sig.parameters:
+        raise ValueError("The method is missing the 'weights' argument.")
+
+    ann = sig.parameters["weights"].annotation
+    weights_enum = None
+    if isinstance(ann, type) and issubclass(ann, WeightsEnum):
+        weights_enum = ann
+    else:
+        # handle cases like Union[Optional, T]
+        for t in get_args(ann):  # type: ignore[union-attr]
+            if isinstance(t, type) and issubclass(t, WeightsEnum):
+                weights_enum = t
+                break
+
+    if weights_enum is None:
+        raise ValueError(
+            "The WeightsEnum class for the specific method couldn't be retrieved. Make sure the typing info is correct."
+        )
+
+    return weights_enum
+
+
+M = TypeVar("M", bound=nn.Module)
+
+BUILTIN_MODELS = {}
+
+
+def register_model(name: Optional[str] = None) -> Callable[[Callable[..., M]], Callable[..., M]]:
+    def wrapper(fn: Callable[..., M]) -> Callable[..., M]:
+        key = name if name is not None else fn.__name__
+        if key in BUILTIN_MODELS:
+            raise ValueError(f"An entry is already registered under the name '{key}'.")
+        BUILTIN_MODELS[key] = fn
+        return fn
+
+    return wrapper
+
+
+def list_models(
+    module: Optional[ModuleType] = None,
+    include: Union[Iterable[str], str, None] = None,
+    exclude: Union[Iterable[str], str, None] = None,
+) -> list[str]:
+    """
+    Returns a list with the names of registered models.
+
+    Args:
+        module (ModuleType, optional): The module from which we want to extract the available models.
+        include (str or Iterable[str], optional): Filter(s) for including the models from the set of all models.
+            Filters are passed to `fnmatch <https://docs.python.org/3/library/fnmatch.html>`__ to match Unix shell-style
+            wildcards. In case of many filters, the results is the union of individual filters.
+        exclude (str or Iterable[str], optional): Filter(s) applied after include_filters to remove models.
+            Filter are passed to `fnmatch <https://docs.python.org/3/library/fnmatch.html>`__ to match Unix shell-style
+            wildcards. In case of many filters, the results is removal of all the models that match any individual filter.
+
+    Returns:
+        models (list): A list with the names of available models.
+    """
+    all_models = {
+        k for k, v in BUILTIN_MODELS.items() if module is None or v.__module__.rsplit(".", 1)[0] == module.__name__
+    }
+    if include:
+        models: set[str] = set()
+        if isinstance(include, str):
+            include = [include]
+        for include_filter in include:
+            models = models | set(fnmatch.filter(all_models, include_filter))
+    else:
+        models = all_models
+
+    if exclude:
+        if isinstance(exclude, str):
+            exclude = [exclude]
+        for exclude_filter in exclude:
+            models = models - set(fnmatch.filter(all_models, exclude_filter))
+    return sorted(models)
+
+
+def get_model_builder(name: str) -> Callable[..., nn.Module]:
+    """
+    Gets the model name and returns the model builder method.
+
+    Args:
+        name (str): The name under which the model is registered.
+
+    Returns:
+        fn (Callable): The model builder method.
+    """
+    name = name.lower()
+    try:
+        fn = BUILTIN_MODELS[name]
+    except KeyError:
+        raise ValueError(f"Unknown model {name}")
+    return fn
+
+
+def get_model(name: str, **config: Any) -> nn.Module:
+    """
+    Gets the model name and configuration and returns an instantiated model.
+
+    Args:
+        name (str): The name under which the model is registered.
+        **config (Any): parameters passed to the model builder method.
+
+    Returns:
+        model (nn.Module): The initialized model.
+    """
+    fn = get_model_builder(name)
+    return fn(**config)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/_meta.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/_meta.py
new file mode 100644
index 0000000000000000000000000000000000000000..e66f411c287e0f456448315ba4fd0bfcce281d2b
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/_meta.py
@@ -0,0 +1,1554 @@
+"""
+This file is part of the private API. Please do not refer to any variables defined here directly as they will be
+removed on future versions without warning.
+"""
+
+# This will eventually be replaced with a call at torchvision.datasets.info("imagenet").categories
+_IMAGENET_CATEGORIES = [
+    "tench",
+    "goldfish",
+    "great white shark",
+    "tiger shark",
+    "hammerhead",
+    "electric ray",
+    "stingray",
+    "cock",
+    "hen",
+    "ostrich",
+    "brambling",
+    "goldfinch",
+    "house finch",
+    "junco",
+    "indigo bunting",
+    "robin",
+    "bulbul",
+    "jay",
+    "magpie",
+    "chickadee",
+    "water ouzel",
+    "kite",
+    "bald eagle",
+    "vulture",
+    "great grey owl",
+    "European fire salamander",
+    "common newt",
+    "eft",
+    "spotted salamander",
+    "axolotl",
+    "bullfrog",
+    "tree frog",
+    "tailed frog",
+    "loggerhead",
+    "leatherback turtle",
+    "mud turtle",
+    "terrapin",
+    "box turtle",
+    "banded gecko",
+    "common iguana",
+    "American chameleon",
+    "whiptail",
+    "agama",
+    "frilled lizard",
+    "alligator lizard",
+    "Gila monster",
+    "green lizard",
+    "African chameleon",
+    "Komodo dragon",
+    "African crocodile",
+    "American alligator",
+    "triceratops",
+    "thunder snake",
+    "ringneck snake",
+    "hognose snake",
+    "green snake",
+    "king snake",
+    "garter snake",
+    "water snake",
+    "vine snake",
+    "night snake",
+    "boa constrictor",
+    "rock python",
+    "Indian cobra",
+    "green mamba",
+    "sea snake",
+    "horned viper",
+    "diamondback",
+    "sidewinder",
+    "trilobite",
+    "harvestman",
+    "scorpion",
+    "black and gold garden spider",
+    "barn spider",
+    "garden spider",
+    "black widow",
+    "tarantula",
+    "wolf spider",
+    "tick",
+    "centipede",
+    "black grouse",
+    "ptarmigan",
+    "ruffed grouse",
+    "prairie chicken",
+    "peacock",
+    "quail",
+    "partridge",
+    "African grey",
+    "macaw",
+    "sulphur-crested cockatoo",
+    "lorikeet",
+    "coucal",
+    "bee eater",
+    "hornbill",
+    "hummingbird",
+    "jacamar",
+    "toucan",
+    "drake",
+    "red-breasted merganser",
+    "goose",
+    "black swan",
+    "tusker",
+    "echidna",
+    "platypus",
+    "wallaby",
+    "koala",
+    "wombat",
+    "jellyfish",
+    "sea anemone",
+    "brain coral",
+    "flatworm",
+    "nematode",
+    "conch",
+    "snail",
+    "slug",
+    "sea slug",
+    "chiton",
+    "chambered nautilus",
+    "Dungeness crab",
+    "rock crab",
+    "fiddler crab",
+    "king crab",
+    "American lobster",
+    "spiny lobster",
+    "crayfish",
+    "hermit crab",
+    "isopod",
+    "white stork",
+    "black stork",
+    "spoonbill",
+    "flamingo",
+    "little blue heron",
+    "American egret",
+    "bittern",
+    "crane bird",
+    "limpkin",
+    "European gallinule",
+    "American coot",
+    "bustard",
+    "ruddy turnstone",
+    "red-backed sandpiper",
+    "redshank",
+    "dowitcher",
+    "oystercatcher",
+    "pelican",
+    "king penguin",
+    "albatross",
+    "grey whale",
+    "killer whale",
+    "dugong",
+    "sea lion",
+    "Chihuahua",
+    "Japanese spaniel",
+    "Maltese dog",
+    "Pekinese",
+    "Shih-Tzu",
+    "Blenheim spaniel",
+    "papillon",
+    "toy terrier",
+    "Rhodesian ridgeback",
+    "Afghan hound",
+    "basset",
+    "beagle",
+    "bloodhound",
+    "bluetick",
+    "black-and-tan coonhound",
+    "Walker hound",
+    "English foxhound",
+    "redbone",
+    "borzoi",
+    "Irish wolfhound",
+    "Italian greyhound",
+    "whippet",
+    "Ibizan hound",
+    "Norwegian elkhound",
+    "otterhound",
+    "Saluki",
+    "Scottish deerhound",
+    "Weimaraner",
+    "Staffordshire bullterrier",
+    "American Staffordshire terrier",
+    "Bedlington terrier",
+    "Border terrier",
+    "Kerry blue terrier",
+    "Irish terrier",
+    "Norfolk terrier",
+    "Norwich terrier",
+    "Yorkshire terrier",
+    "wire-haired fox terrier",
+    "Lakeland terrier",
+    "Sealyham terrier",
+    "Airedale",
+    "cairn",
+    "Australian terrier",
+    "Dandie Dinmont",
+    "Boston bull",
+    "miniature schnauzer",
+    "giant schnauzer",
+    "standard schnauzer",
+    "Scotch terrier",
+    "Tibetan terrier",
+    "silky terrier",
+    "soft-coated wheaten terrier",
+    "West Highland white terrier",
+    "Lhasa",
+    "flat-coated retriever",
+    "curly-coated retriever",
+    "golden retriever",
+    "Labrador retriever",
+    "Chesapeake Bay retriever",
+    "German short-haired pointer",
+    "vizsla",
+    "English setter",
+    "Irish setter",
+    "Gordon setter",
+    "Brittany spaniel",
+    "clumber",
+    "English springer",
+    "Welsh springer spaniel",
+    "cocker spaniel",
+    "Sussex spaniel",
+    "Irish water spaniel",
+    "kuvasz",
+    "schipperke",
+    "groenendael",
+    "malinois",
+    "briard",
+    "kelpie",
+    "komondor",
+    "Old English sheepdog",
+    "Shetland sheepdog",
+    "collie",
+    "Border collie",
+    "Bouvier des Flandres",
+    "Rottweiler",
+    "German shepherd",
+    "Doberman",
+    "miniature pinscher",
+    "Greater Swiss Mountain dog",
+    "Bernese mountain dog",
+    "Appenzeller",
+    "EntleBucher",
+    "boxer",
+    "bull mastiff",
+    "Tibetan mastiff",
+    "French bulldog",
+    "Great Dane",
+    "Saint Bernard",
+    "Eskimo dog",
+    "malamute",
+    "Siberian husky",
+    "dalmatian",
+    "affenpinscher",
+    "basenji",
+    "pug",
+    "Leonberg",
+    "Newfoundland",
+    "Great Pyrenees",
+    "Samoyed",
+    "Pomeranian",
+    "chow",
+    "keeshond",
+    "Brabancon griffon",
+    "Pembroke",
+    "Cardigan",
+    "toy poodle",
+    "miniature poodle",
+    "standard poodle",
+    "Mexican hairless",
+    "timber wolf",
+    "white wolf",
+    "red wolf",
+    "coyote",
+    "dingo",
+    "dhole",
+    "African hunting dog",
+    "hyena",
+    "red fox",
+    "kit fox",
+    "Arctic fox",
+    "grey fox",
+    "tabby",
+    "tiger cat",
+    "Persian cat",
+    "Siamese cat",
+    "Egyptian cat",
+    "cougar",
+    "lynx",
+    "leopard",
+    "snow leopard",
+    "jaguar",
+    "lion",
+    "tiger",
+    "cheetah",
+    "brown bear",
+    "American black bear",
+    "ice bear",
+    "sloth bear",
+    "mongoose",
+    "meerkat",
+    "tiger beetle",
+    "ladybug",
+    "ground beetle",
+    "long-horned beetle",
+    "leaf beetle",
+    "dung beetle",
+    "rhinoceros beetle",
+    "weevil",
+    "fly",
+    "bee",
+    "ant",
+    "grasshopper",
+    "cricket",
+    "walking stick",
+    "cockroach",
+    "mantis",
+    "cicada",
+    "leafhopper",
+    "lacewing",
+    "dragonfly",
+    "damselfly",
+    "admiral",
+    "ringlet",
+    "monarch",
+    "cabbage butterfly",
+    "sulphur butterfly",
+    "lycaenid",
+    "starfish",
+    "sea urchin",
+    "sea cucumber",
+    "wood rabbit",
+    "hare",
+    "Angora",
+    "hamster",
+    "porcupine",
+    "fox squirrel",
+    "marmot",
+    "beaver",
+    "guinea pig",
+    "sorrel",
+    "zebra",
+    "hog",
+    "wild boar",
+    "warthog",
+    "hippopotamus",
+    "ox",
+    "water buffalo",
+    "bison",
+    "ram",
+    "bighorn",
+    "ibex",
+    "hartebeest",
+    "impala",
+    "gazelle",
+    "Arabian camel",
+    "llama",
+    "weasel",
+    "mink",
+    "polecat",
+    "black-footed ferret",
+    "otter",
+    "skunk",
+    "badger",
+    "armadillo",
+    "three-toed sloth",
+    "orangutan",
+    "gorilla",
+    "chimpanzee",
+    "gibbon",
+    "siamang",
+    "guenon",
+    "patas",
+    "baboon",
+    "macaque",
+    "langur",
+    "colobus",
+    "proboscis monkey",
+    "marmoset",
+    "capuchin",
+    "howler monkey",
+    "titi",
+    "spider monkey",
+    "squirrel monkey",
+    "Madagascar cat",
+    "indri",
+    "Indian elephant",
+    "African elephant",
+    "lesser panda",
+    "giant panda",
+    "barracouta",
+    "eel",
+    "coho",
+    "rock beauty",
+    "anemone fish",
+    "sturgeon",
+    "gar",
+    "lionfish",
+    "puffer",
+    "abacus",
+    "abaya",
+    "academic gown",
+    "accordion",
+    "acoustic guitar",
+    "aircraft carrier",
+    "airliner",
+    "airship",
+    "altar",
+    "ambulance",
+    "amphibian",
+    "analog clock",
+    "apiary",
+    "apron",
+    "ashcan",
+    "assault rifle",
+    "backpack",
+    "bakery",
+    "balance beam",
+    "balloon",
+    "ballpoint",
+    "Band Aid",
+    "banjo",
+    "bannister",
+    "barbell",
+    "barber chair",
+    "barbershop",
+    "barn",
+    "barometer",
+    "barrel",
+    "barrow",
+    "baseball",
+    "basketball",
+    "bassinet",
+    "bassoon",
+    "bathing cap",
+    "bath towel",
+    "bathtub",
+    "beach wagon",
+    "beacon",
+    "beaker",
+    "bearskin",
+    "beer bottle",
+    "beer glass",
+    "bell cote",
+    "bib",
+    "bicycle-built-for-two",
+    "bikini",
+    "binder",
+    "binoculars",
+    "birdhouse",
+    "boathouse",
+    "bobsled",
+    "bolo tie",
+    "bonnet",
+    "bookcase",
+    "bookshop",
+    "bottlecap",
+    "bow",
+    "bow tie",
+    "brass",
+    "brassiere",
+    "breakwater",
+    "breastplate",
+    "broom",
+    "bucket",
+    "buckle",
+    "bulletproof vest",
+    "bullet train",
+    "butcher shop",
+    "cab",
+    "caldron",
+    "candle",
+    "cannon",
+    "canoe",
+    "can opener",
+    "cardigan",
+    "car mirror",
+    "carousel",
+    "carpenter's kit",
+    "carton",
+    "car wheel",
+    "cash machine",
+    "cassette",
+    "cassette player",
+    "castle",
+    "catamaran",
+    "CD player",
+    "cello",
+    "cellular telephone",
+    "chain",
+    "chainlink fence",
+    "chain mail",
+    "chain saw",
+    "chest",
+    "chiffonier",
+    "chime",
+    "china cabinet",
+    "Christmas stocking",
+    "church",
+    "cinema",
+    "cleaver",
+    "cliff dwelling",
+    "cloak",
+    "clog",
+    "cocktail shaker",
+    "coffee mug",
+    "coffeepot",
+    "coil",
+    "combination lock",
+    "computer keyboard",
+    "confectionery",
+    "container ship",
+    "convertible",
+    "corkscrew",
+    "cornet",
+    "cowboy boot",
+    "cowboy hat",
+    "cradle",
+    "crane",
+    "crash helmet",
+    "crate",
+    "crib",
+    "Crock Pot",
+    "croquet ball",
+    "crutch",
+    "cuirass",
+    "dam",
+    "desk",
+    "desktop computer",
+    "dial telephone",
+    "diaper",
+    "digital clock",
+    "digital watch",
+    "dining table",
+    "dishrag",
+    "dishwasher",
+    "disk brake",
+    "dock",
+    "dogsled",
+    "dome",
+    "doormat",
+    "drilling platform",
+    "drum",
+    "drumstick",
+    "dumbbell",
+    "Dutch oven",
+    "electric fan",
+    "electric guitar",
+    "electric locomotive",
+    "entertainment center",
+    "envelope",
+    "espresso maker",
+    "face powder",
+    "feather boa",
+    "file",
+    "fireboat",
+    "fire engine",
+    "fire screen",
+    "flagpole",
+    "flute",
+    "folding chair",
+    "football helmet",
+    "forklift",
+    "fountain",
+    "fountain pen",
+    "four-poster",
+    "freight car",
+    "French horn",
+    "frying pan",
+    "fur coat",
+    "garbage truck",
+    "gasmask",
+    "gas pump",
+    "goblet",
+    "go-kart",
+    "golf ball",
+    "golfcart",
+    "gondola",
+    "gong",
+    "gown",
+    "grand piano",
+    "greenhouse",
+    "grille",
+    "grocery store",
+    "guillotine",
+    "hair slide",
+    "hair spray",
+    "half track",
+    "hammer",
+    "hamper",
+    "hand blower",
+    "hand-held computer",
+    "handkerchief",
+    "hard disc",
+    "harmonica",
+    "harp",
+    "harvester",
+    "hatchet",
+    "holster",
+    "home theater",
+    "honeycomb",
+    "hook",
+    "hoopskirt",
+    "horizontal bar",
+    "horse cart",
+    "hourglass",
+    "iPod",
+    "iron",
+    "jack-o'-lantern",
+    "jean",
+    "jeep",
+    "jersey",
+    "jigsaw puzzle",
+    "jinrikisha",
+    "joystick",
+    "kimono",
+    "knee pad",
+    "knot",
+    "lab coat",
+    "ladle",
+    "lampshade",
+    "laptop",
+    "lawn mower",
+    "lens cap",
+    "letter opener",
+    "library",
+    "lifeboat",
+    "lighter",
+    "limousine",
+    "liner",
+    "lipstick",
+    "Loafer",
+    "lotion",
+    "loudspeaker",
+    "loupe",
+    "lumbermill",
+    "magnetic compass",
+    "mailbag",
+    "mailbox",
+    "maillot",
+    "maillot tank suit",
+    "manhole cover",
+    "maraca",
+    "marimba",
+    "mask",
+    "matchstick",
+    "maypole",
+    "maze",
+    "measuring cup",
+    "medicine chest",
+    "megalith",
+    "microphone",
+    "microwave",
+    "military uniform",
+    "milk can",
+    "minibus",
+    "miniskirt",
+    "minivan",
+    "missile",
+    "mitten",
+    "mixing bowl",
+    "mobile home",
+    "Model T",
+    "modem",
+    "monastery",
+    "monitor",
+    "moped",
+    "mortar",
+    "mortarboard",
+    "mosque",
+    "mosquito net",
+    "motor scooter",
+    "mountain bike",
+    "mountain tent",
+    "mouse",
+    "mousetrap",
+    "moving van",
+    "muzzle",
+    "nail",
+    "neck brace",
+    "necklace",
+    "nipple",
+    "notebook",
+    "obelisk",
+    "oboe",
+    "ocarina",
+    "odometer",
+    "oil filter",
+    "organ",
+    "oscilloscope",
+    "overskirt",
+    "oxcart",
+    "oxygen mask",
+    "packet",
+    "paddle",
+    "paddlewheel",
+    "padlock",
+    "paintbrush",
+    "pajama",
+    "palace",
+    "panpipe",
+    "paper towel",
+    "parachute",
+    "parallel bars",
+    "park bench",
+    "parking meter",
+    "passenger car",
+    "patio",
+    "pay-phone",
+    "pedestal",
+    "pencil box",
+    "pencil sharpener",
+    "perfume",
+    "Petri dish",
+    "photocopier",
+    "pick",
+    "pickelhaube",
+    "picket fence",
+    "pickup",
+    "pier",
+    "piggy bank",
+    "pill bottle",
+    "pillow",
+    "ping-pong ball",
+    "pinwheel",
+    "pirate",
+    "pitcher",
+    "plane",
+    "planetarium",
+    "plastic bag",
+    "plate rack",
+    "plow",
+    "plunger",
+    "Polaroid camera",
+    "pole",
+    "police van",
+    "poncho",
+    "pool table",
+    "pop bottle",
+    "pot",
+    "potter's wheel",
+    "power drill",
+    "prayer rug",
+    "printer",
+    "prison",
+    "projectile",
+    "projector",
+    "puck",
+    "punching bag",
+    "purse",
+    "quill",
+    "quilt",
+    "racer",
+    "racket",
+    "radiator",
+    "radio",
+    "radio telescope",
+    "rain barrel",
+    "recreational vehicle",
+    "reel",
+    "reflex camera",
+    "refrigerator",
+    "remote control",
+    "restaurant",
+    "revolver",
+    "rifle",
+    "rocking chair",
+    "rotisserie",
+    "rubber eraser",
+    "rugby ball",
+    "rule",
+    "running shoe",
+    "safe",
+    "safety pin",
+    "saltshaker",
+    "sandal",
+    "sarong",
+    "sax",
+    "scabbard",
+    "scale",
+    "school bus",
+    "schooner",
+    "scoreboard",
+    "screen",
+    "screw",
+    "screwdriver",
+    "seat belt",
+    "sewing machine",
+    "shield",
+    "shoe shop",
+    "shoji",
+    "shopping basket",
+    "shopping cart",
+    "shovel",
+    "shower cap",
+    "shower curtain",
+    "ski",
+    "ski mask",
+    "sleeping bag",
+    "slide rule",
+    "sliding door",
+    "slot",
+    "snorkel",
+    "snowmobile",
+    "snowplow",
+    "soap dispenser",
+    "soccer ball",
+    "sock",
+    "solar dish",
+    "sombrero",
+    "soup bowl",
+    "space bar",
+    "space heater",
+    "space shuttle",
+    "spatula",
+    "speedboat",
+    "spider web",
+    "spindle",
+    "sports car",
+    "spotlight",
+    "stage",
+    "steam locomotive",
+    "steel arch bridge",
+    "steel drum",
+    "stethoscope",
+    "stole",
+    "stone wall",
+    "stopwatch",
+    "stove",
+    "strainer",
+    "streetcar",
+    "stretcher",
+    "studio couch",
+    "stupa",
+    "submarine",
+    "suit",
+    "sundial",
+    "sunglass",
+    "sunglasses",
+    "sunscreen",
+    "suspension bridge",
+    "swab",
+    "sweatshirt",
+    "swimming trunks",
+    "swing",
+    "switch",
+    "syringe",
+    "table lamp",
+    "tank",
+    "tape player",
+    "teapot",
+    "teddy",
+    "television",
+    "tennis ball",
+    "thatch",
+    "theater curtain",
+    "thimble",
+    "thresher",
+    "throne",
+    "tile roof",
+    "toaster",
+    "tobacco shop",
+    "toilet seat",
+    "torch",
+    "totem pole",
+    "tow truck",
+    "toyshop",
+    "tractor",
+    "trailer truck",
+    "tray",
+    "trench coat",
+    "tricycle",
+    "trimaran",
+    "tripod",
+    "triumphal arch",
+    "trolleybus",
+    "trombone",
+    "tub",
+    "turnstile",
+    "typewriter keyboard",
+    "umbrella",
+    "unicycle",
+    "upright",
+    "vacuum",
+    "vase",
+    "vault",
+    "velvet",
+    "vending machine",
+    "vestment",
+    "viaduct",
+    "violin",
+    "volleyball",
+    "waffle iron",
+    "wall clock",
+    "wallet",
+    "wardrobe",
+    "warplane",
+    "washbasin",
+    "washer",
+    "water bottle",
+    "water jug",
+    "water tower",
+    "whiskey jug",
+    "whistle",
+    "wig",
+    "window screen",
+    "window shade",
+    "Windsor tie",
+    "wine bottle",
+    "wing",
+    "wok",
+    "wooden spoon",
+    "wool",
+    "worm fence",
+    "wreck",
+    "yawl",
+    "yurt",
+    "web site",
+    "comic book",
+    "crossword puzzle",
+    "street sign",
+    "traffic light",
+    "book jacket",
+    "menu",
+    "plate",
+    "guacamole",
+    "consomme",
+    "hot pot",
+    "trifle",
+    "ice cream",
+    "ice lolly",
+    "French loaf",
+    "bagel",
+    "pretzel",
+    "cheeseburger",
+    "hotdog",
+    "mashed potato",
+    "head cabbage",
+    "broccoli",
+    "cauliflower",
+    "zucchini",
+    "spaghetti squash",
+    "acorn squash",
+    "butternut squash",
+    "cucumber",
+    "artichoke",
+    "bell pepper",
+    "cardoon",
+    "mushroom",
+    "Granny Smith",
+    "strawberry",
+    "orange",
+    "lemon",
+    "fig",
+    "pineapple",
+    "banana",
+    "jackfruit",
+    "custard apple",
+    "pomegranate",
+    "hay",
+    "carbonara",
+    "chocolate sauce",
+    "dough",
+    "meat loaf",
+    "pizza",
+    "potpie",
+    "burrito",
+    "red wine",
+    "espresso",
+    "cup",
+    "eggnog",
+    "alp",
+    "bubble",
+    "cliff",
+    "coral reef",
+    "geyser",
+    "lakeside",
+    "promontory",
+    "sandbar",
+    "seashore",
+    "valley",
+    "volcano",
+    "ballplayer",
+    "groom",
+    "scuba diver",
+    "rapeseed",
+    "daisy",
+    "yellow lady's slipper",
+    "corn",
+    "acorn",
+    "hip",
+    "buckeye",
+    "coral fungus",
+    "agaric",
+    "gyromitra",
+    "stinkhorn",
+    "earthstar",
+    "hen-of-the-woods",
+    "bolete",
+    "ear",
+    "toilet tissue",
+]
+
+# To be replaced with torchvision.datasets.info("coco").categories
+_COCO_CATEGORIES = [
+    "__background__",
+    "person",
+    "bicycle",
+    "car",
+    "motorcycle",
+    "airplane",
+    "bus",
+    "train",
+    "truck",
+    "boat",
+    "traffic light",
+    "fire hydrant",
+    "N/A",
+    "stop sign",
+    "parking meter",
+    "bench",
+    "bird",
+    "cat",
+    "dog",
+    "horse",
+    "sheep",
+    "cow",
+    "elephant",
+    "bear",
+    "zebra",
+    "giraffe",
+    "N/A",
+    "backpack",
+    "umbrella",
+    "N/A",
+    "N/A",
+    "handbag",
+    "tie",
+    "suitcase",
+    "frisbee",
+    "skis",
+    "snowboard",
+    "sports ball",
+    "kite",
+    "baseball bat",
+    "baseball glove",
+    "skateboard",
+    "surfboard",
+    "tennis racket",
+    "bottle",
+    "N/A",
+    "wine glass",
+    "cup",
+    "fork",
+    "knife",
+    "spoon",
+    "bowl",
+    "banana",
+    "apple",
+    "sandwich",
+    "orange",
+    "broccoli",
+    "carrot",
+    "hot dog",
+    "pizza",
+    "donut",
+    "cake",
+    "chair",
+    "couch",
+    "potted plant",
+    "bed",
+    "N/A",
+    "dining table",
+    "N/A",
+    "N/A",
+    "toilet",
+    "N/A",
+    "tv",
+    "laptop",
+    "mouse",
+    "remote",
+    "keyboard",
+    "cell phone",
+    "microwave",
+    "oven",
+    "toaster",
+    "sink",
+    "refrigerator",
+    "N/A",
+    "book",
+    "clock",
+    "vase",
+    "scissors",
+    "teddy bear",
+    "hair drier",
+    "toothbrush",
+]
+
+# To be replaced with torchvision.datasets.info("coco_kp")
+_COCO_PERSON_CATEGORIES = ["no person", "person"]
+_COCO_PERSON_KEYPOINT_NAMES = [
+    "nose",
+    "left_eye",
+    "right_eye",
+    "left_ear",
+    "right_ear",
+    "left_shoulder",
+    "right_shoulder",
+    "left_elbow",
+    "right_elbow",
+    "left_wrist",
+    "right_wrist",
+    "left_hip",
+    "right_hip",
+    "left_knee",
+    "right_knee",
+    "left_ankle",
+    "right_ankle",
+]
+
+# To be replaced with torchvision.datasets.info("voc").categories
+_VOC_CATEGORIES = [
+    "__background__",
+    "aeroplane",
+    "bicycle",
+    "bird",
+    "boat",
+    "bottle",
+    "bus",
+    "car",
+    "cat",
+    "chair",
+    "cow",
+    "diningtable",
+    "dog",
+    "horse",
+    "motorbike",
+    "person",
+    "pottedplant",
+    "sheep",
+    "sofa",
+    "train",
+    "tvmonitor",
+]
+
+# To be replaced with torchvision.datasets.info("kinetics400").categories
+_KINETICS400_CATEGORIES = [
+    "abseiling",
+    "air drumming",
+    "answering questions",
+    "applauding",
+    "applying cream",
+    "archery",
+    "arm wrestling",
+    "arranging flowers",
+    "assembling computer",
+    "auctioning",
+    "baby waking up",
+    "baking cookies",
+    "balloon blowing",
+    "bandaging",
+    "barbequing",
+    "bartending",
+    "beatboxing",
+    "bee keeping",
+    "belly dancing",
+    "bench pressing",
+    "bending back",
+    "bending metal",
+    "biking through snow",
+    "blasting sand",
+    "blowing glass",
+    "blowing leaves",
+    "blowing nose",
+    "blowing out candles",
+    "bobsledding",
+    "bookbinding",
+    "bouncing on trampoline",
+    "bowling",
+    "braiding hair",
+    "breading or breadcrumbing",
+    "breakdancing",
+    "brush painting",
+    "brushing hair",
+    "brushing teeth",
+    "building cabinet",
+    "building shed",
+    "bungee jumping",
+    "busking",
+    "canoeing or kayaking",
+    "capoeira",
+    "carrying baby",
+    "cartwheeling",
+    "carving pumpkin",
+    "catching fish",
+    "catching or throwing baseball",
+    "catching or throwing frisbee",
+    "catching or throwing softball",
+    "celebrating",
+    "changing oil",
+    "changing wheel",
+    "checking tires",
+    "cheerleading",
+    "chopping wood",
+    "clapping",
+    "clay pottery making",
+    "clean and jerk",
+    "cleaning floor",
+    "cleaning gutters",
+    "cleaning pool",
+    "cleaning shoes",
+    "cleaning toilet",
+    "cleaning windows",
+    "climbing a rope",
+    "climbing ladder",
+    "climbing tree",
+    "contact juggling",
+    "cooking chicken",
+    "cooking egg",
+    "cooking on campfire",
+    "cooking sausages",
+    "counting money",
+    "country line dancing",
+    "cracking neck",
+    "crawling baby",
+    "crossing river",
+    "crying",
+    "curling hair",
+    "cutting nails",
+    "cutting pineapple",
+    "cutting watermelon",
+    "dancing ballet",
+    "dancing charleston",
+    "dancing gangnam style",
+    "dancing macarena",
+    "deadlifting",
+    "decorating the christmas tree",
+    "digging",
+    "dining",
+    "disc golfing",
+    "diving cliff",
+    "dodgeball",
+    "doing aerobics",
+    "doing laundry",
+    "doing nails",
+    "drawing",
+    "dribbling basketball",
+    "drinking",
+    "drinking beer",
+    "drinking shots",
+    "driving car",
+    "driving tractor",
+    "drop kicking",
+    "drumming fingers",
+    "dunking basketball",
+    "dying hair",
+    "eating burger",
+    "eating cake",
+    "eating carrots",
+    "eating chips",
+    "eating doughnuts",
+    "eating hotdog",
+    "eating ice cream",
+    "eating spaghetti",
+    "eating watermelon",
+    "egg hunting",
+    "exercising arm",
+    "exercising with an exercise ball",
+    "extinguishing fire",
+    "faceplanting",
+    "feeding birds",
+    "feeding fish",
+    "feeding goats",
+    "filling eyebrows",
+    "finger snapping",
+    "fixing hair",
+    "flipping pancake",
+    "flying kite",
+    "folding clothes",
+    "folding napkins",
+    "folding paper",
+    "front raises",
+    "frying vegetables",
+    "garbage collecting",
+    "gargling",
+    "getting a haircut",
+    "getting a tattoo",
+    "giving or receiving award",
+    "golf chipping",
+    "golf driving",
+    "golf putting",
+    "grinding meat",
+    "grooming dog",
+    "grooming horse",
+    "gymnastics tumbling",
+    "hammer throw",
+    "headbanging",
+    "headbutting",
+    "high jump",
+    "high kick",
+    "hitting baseball",
+    "hockey stop",
+    "holding snake",
+    "hopscotch",
+    "hoverboarding",
+    "hugging",
+    "hula hooping",
+    "hurdling",
+    "hurling (sport)",
+    "ice climbing",
+    "ice fishing",
+    "ice skating",
+    "ironing",
+    "javelin throw",
+    "jetskiing",
+    "jogging",
+    "juggling balls",
+    "juggling fire",
+    "juggling soccer ball",
+    "jumping into pool",
+    "jumpstyle dancing",
+    "kicking field goal",
+    "kicking soccer ball",
+    "kissing",
+    "kitesurfing",
+    "knitting",
+    "krumping",
+    "laughing",
+    "laying bricks",
+    "long jump",
+    "lunge",
+    "making a cake",
+    "making a sandwich",
+    "making bed",
+    "making jewelry",
+    "making pizza",
+    "making snowman",
+    "making sushi",
+    "making tea",
+    "marching",
+    "massaging back",
+    "massaging feet",
+    "massaging legs",
+    "massaging person's head",
+    "milking cow",
+    "mopping floor",
+    "motorcycling",
+    "moving furniture",
+    "mowing lawn",
+    "news anchoring",
+    "opening bottle",
+    "opening present",
+    "paragliding",
+    "parasailing",
+    "parkour",
+    "passing American football (in game)",
+    "passing American football (not in game)",
+    "peeling apples",
+    "peeling potatoes",
+    "petting animal (not cat)",
+    "petting cat",
+    "picking fruit",
+    "planting trees",
+    "plastering",
+    "playing accordion",
+    "playing badminton",
+    "playing bagpipes",
+    "playing basketball",
+    "playing bass guitar",
+    "playing cards",
+    "playing cello",
+    "playing chess",
+    "playing clarinet",
+    "playing controller",
+    "playing cricket",
+    "playing cymbals",
+    "playing didgeridoo",
+    "playing drums",
+    "playing flute",
+    "playing guitar",
+    "playing harmonica",
+    "playing harp",
+    "playing ice hockey",
+    "playing keyboard",
+    "playing kickball",
+    "playing monopoly",
+    "playing organ",
+    "playing paintball",
+    "playing piano",
+    "playing poker",
+    "playing recorder",
+    "playing saxophone",
+    "playing squash or racquetball",
+    "playing tennis",
+    "playing trombone",
+    "playing trumpet",
+    "playing ukulele",
+    "playing violin",
+    "playing volleyball",
+    "playing xylophone",
+    "pole vault",
+    "presenting weather forecast",
+    "pull ups",
+    "pumping fist",
+    "pumping gas",
+    "punching bag",
+    "punching person (boxing)",
+    "push up",
+    "pushing car",
+    "pushing cart",
+    "pushing wheelchair",
+    "reading book",
+    "reading newspaper",
+    "recording music",
+    "riding a bike",
+    "riding camel",
+    "riding elephant",
+    "riding mechanical bull",
+    "riding mountain bike",
+    "riding mule",
+    "riding or walking with horse",
+    "riding scooter",
+    "riding unicycle",
+    "ripping paper",
+    "robot dancing",
+    "rock climbing",
+    "rock scissors paper",
+    "roller skating",
+    "running on treadmill",
+    "sailing",
+    "salsa dancing",
+    "sanding floor",
+    "scrambling eggs",
+    "scuba diving",
+    "setting table",
+    "shaking hands",
+    "shaking head",
+    "sharpening knives",
+    "sharpening pencil",
+    "shaving head",
+    "shaving legs",
+    "shearing sheep",
+    "shining shoes",
+    "shooting basketball",
+    "shooting goal (soccer)",
+    "shot put",
+    "shoveling snow",
+    "shredding paper",
+    "shuffling cards",
+    "side kick",
+    "sign language interpreting",
+    "singing",
+    "situp",
+    "skateboarding",
+    "ski jumping",
+    "skiing (not slalom or crosscountry)",
+    "skiing crosscountry",
+    "skiing slalom",
+    "skipping rope",
+    "skydiving",
+    "slacklining",
+    "slapping",
+    "sled dog racing",
+    "smoking",
+    "smoking hookah",
+    "snatch weight lifting",
+    "sneezing",
+    "sniffing",
+    "snorkeling",
+    "snowboarding",
+    "snowkiting",
+    "snowmobiling",
+    "somersaulting",
+    "spinning poi",
+    "spray painting",
+    "spraying",
+    "springboard diving",
+    "squat",
+    "sticking tongue out",
+    "stomping grapes",
+    "stretching arm",
+    "stretching leg",
+    "strumming guitar",
+    "surfing crowd",
+    "surfing water",
+    "sweeping floor",
+    "swimming backstroke",
+    "swimming breast stroke",
+    "swimming butterfly stroke",
+    "swing dancing",
+    "swinging legs",
+    "swinging on something",
+    "sword fighting",
+    "tai chi",
+    "taking a shower",
+    "tango dancing",
+    "tap dancing",
+    "tapping guitar",
+    "tapping pen",
+    "tasting beer",
+    "tasting food",
+    "testifying",
+    "texting",
+    "throwing axe",
+    "throwing ball",
+    "throwing discus",
+    "tickling",
+    "tobogganing",
+    "tossing coin",
+    "tossing salad",
+    "training dog",
+    "trapezing",
+    "trimming or shaving beard",
+    "trimming trees",
+    "triple jump",
+    "tying bow tie",
+    "tying knot (not on a tie)",
+    "tying tie",
+    "unboxing",
+    "unloading truck",
+    "using computer",
+    "using remote controller (not gaming)",
+    "using segway",
+    "vault",
+    "waiting in line",
+    "walking the dog",
+    "washing dishes",
+    "washing feet",
+    "washing hair",
+    "washing hands",
+    "water skiing",
+    "water sliding",
+    "watering plants",
+    "waxing back",
+    "waxing chest",
+    "waxing eyebrows",
+    "waxing legs",
+    "weaving basket",
+    "welding",
+    "whistling",
+    "windsurfing",
+    "wrapping present",
+    "wrestling",
+    "writing",
+    "yawning",
+    "yoga",
+    "zumba",
+]
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/_utils.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..61b9a069f98f1b2114c72a5e16d12ab88b9f2400
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/_utils.py
@@ -0,0 +1,256 @@
+import functools
+import inspect
+import warnings
+from collections import OrderedDict
+from typing import Any, Callable, Optional, TypeVar, Union
+
+from torch import nn
+
+from .._utils import sequence_to_str
+from ._api import WeightsEnum
+
+
+class IntermediateLayerGetter(nn.ModuleDict):
+    """
+    Module wrapper that returns intermediate layers from a model
+
+    It has a strong assumption that the modules have been registered
+    into the model in the same order as they are used.
+    This means that one should **not** reuse the same nn.Module
+    twice in the forward if you want this to work.
+
+    Additionally, it is only able to query submodules that are directly
+    assigned to the model. So if `model` is passed, `model.feature1` can
+    be returned, but not `model.feature1.layer2`.
+
+    Args:
+        model (nn.Module): model on which we will extract the features
+        return_layers (Dict[name, new_name]): a dict containing the names
+            of the modules for which the activations will be returned as
+            the key of the dict, and the value of the dict is the name
+            of the returned activation (which the user can specify).
+
+    Examples::
+
+        >>> m = torchvision.models.resnet18(weights=ResNet18_Weights.DEFAULT)
+        >>> # extract layer1 and layer3, giving as names `feat1` and `feat2`
+        >>> new_m = torchvision.models._utils.IntermediateLayerGetter(m,
+        >>>     {'layer1': 'feat1', 'layer3': 'feat2'})
+        >>> out = new_m(torch.rand(1, 3, 224, 224))
+        >>> print([(k, v.shape) for k, v in out.items()])
+        >>>     [('feat1', torch.Size([1, 64, 56, 56])),
+        >>>      ('feat2', torch.Size([1, 256, 14, 14]))]
+    """
+
+    _version = 2
+    __annotations__ = {
+        "return_layers": dict[str, str],
+    }
+
+    def __init__(self, model: nn.Module, return_layers: dict[str, str]) -> None:
+        if not set(return_layers).issubset([name for name, _ in model.named_children()]):
+            raise ValueError("return_layers are not present in model")
+        orig_return_layers = return_layers  # keep the caller's mapping; the copy below is consumed
+        return_layers = {str(k): str(v) for k, v in return_layers.items()}
+        layers = OrderedDict()
+        for name, module in model.named_children():
+            layers[name] = module
+            if name in return_layers:
+                del return_layers[name]  # tick off each requested layer as it is reached
+            if not return_layers:
+                break  # every requested layer is collected; later children are not registered
+
+        super().__init__(layers)
+        self.return_layers = orig_return_layers
+
+    def forward(self, x):
+        out = OrderedDict()
+        for name, module in self.items():
+            x = module(x)  # children are chained: each one receives the previous one's output
+            if name in self.return_layers:
+                out_name = self.return_layers[name]
+                out[out_name] = x
+        return out  # maps the user-chosen names to the captured activations
+
+
+def _make_divisible(v: float, divisor: int, min_value: Optional[int] = None) -> int:
+    """
+    This function is taken from the original tf repo.
+    It rounds ``v`` to the nearest multiple of ``divisor`` (8 in the original use), but never below ``min_value``.
+    It can be seen here:
+    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
+    """
+    if min_value is None:
+        min_value = divisor  # default floor is one full divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    # Make sure that round down does not go down by more than 10%.
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
+
+
+D = TypeVar("D")
+
+
+def kwonly_to_pos_or_kw(fn: Callable[..., D]) -> Callable[..., D]:
+    """Decorates a function that uses keyword only parameters to also allow them being passed as positionals.
+
+    For example, consider the use case of changing the signature of ``old_fn`` into the one from ``new_fn``:
+
+    .. code::
+
+        def old_fn(foo, bar, baz=None):
+            ...
+
+        def new_fn(foo, *, bar, baz=None):
+            ...
+
+    Calling ``old_fn("foo", "bar", "baz")`` was valid, but the same call is no longer valid with ``new_fn``. To keep BC
+    and at the same time warn the user of the deprecation, this decorator can be used:
+
+    .. code::
+
+        @kwonly_to_pos_or_kw
+        def new_fn(foo, *, bar, baz=None):
+            ...
+
+        new_fn("foo", "bar", "baz")
+    """
+    params = inspect.signature(fn).parameters
+
+    try:
+        keyword_only_start_idx = next(
+            idx for idx, param in enumerate(params.values()) if param.kind == param.KEYWORD_ONLY
+        )
+    except StopIteration:
+        raise TypeError(f"Found no keyword-only parameter on function '{fn.__name__}'") from None
+
+    keyword_only_params = tuple(inspect.signature(fn).parameters)[keyword_only_start_idx:]
+
+    @functools.wraps(fn)
+    def wrapper(*args: Any, **kwargs: Any) -> D:
+        args, keyword_only_args = args[:keyword_only_start_idx], args[keyword_only_start_idx:]
+        if keyword_only_args:  # positionals were passed beyond the first keyword-only slot
+            keyword_only_kwargs = dict(zip(keyword_only_params, keyword_only_args))  # zip drops surplus positionals
+            warnings.warn(
+                f"Using {sequence_to_str(tuple(keyword_only_kwargs.keys()), separate_last='and ')} as positional "
+                f"parameter(s) is deprecated since 0.13 and may be removed in the future. Please use keyword parameter(s) "
+                f"instead."
+            )
+            kwargs.update(keyword_only_kwargs)  # overrides a same-named keyword that was also passed
+
+        return fn(*args, **kwargs)
+
+    return wrapper
+
+
+W = TypeVar("W", bound=WeightsEnum)
+M = TypeVar("M", bound=nn.Module)
+V = TypeVar("V")
+
+
+def handle_legacy_interface(**weights: tuple[str, Union[Optional[W], Callable[[dict[str, Any]], Optional[W]]]]):
+    """Decorates a model builder with the new interface to make it compatible with the old.
+
+    In particular this handles two things:
+
+    1. Allows positional parameters again, but emits a deprecation warning in case they are used. See
+        :func:`torchvision.models._utils.kwonly_to_pos_or_kw` for details.
+    2. Handles the default value change from ``pretrained=False`` to ``weights=None`` and ``pretrained=True`` to
+        ``weights=Weights`` and emits a deprecation warning with instructions for the new interface.
+
+    Args:
+        **weights (Tuple[str, Union[Optional[W], Callable[[Dict[str, Any]], Optional[W]]]]): Deprecated parameter
+            name and default value for the legacy ``pretrained=True``. The default value can be a callable in which
+            case it will be called with a dictionary of the keyword arguments. The only key that is guaranteed to be in
+            the dictionary is the deprecated parameter name passed as first element in the tuple. All other parameters
+            should be accessed with :meth:`~dict.get`.
+    """
+
+    def outer_wrapper(builder: Callable[..., M]) -> Callable[..., M]:
+        @kwonly_to_pos_or_kw
+        @functools.wraps(builder)
+        def inner_wrapper(*args: Any, **kwargs: Any) -> M:
+            for weights_param, (pretrained_param, default) in weights.items():  # type: ignore[union-attr]
+                # If neither the weights nor the pretrained parameter was passed, or the weights argument already uses
+                # the new style arguments, there is nothing to do. Note that we cannot use `None` as sentinel for the
+                # weight argument, since it is a valid value.
+                sentinel = object()
+                weights_arg = kwargs.get(weights_param, sentinel)
+                if (
+                    (weights_param not in kwargs and pretrained_param not in kwargs)
+                    or isinstance(weights_arg, WeightsEnum)
+                    or (isinstance(weights_arg, str) and weights_arg != "legacy")
+                    or weights_arg is None
+                ):
+                    continue  # nothing to translate for this parameter; kwargs stay untouched
+
+                # If the pretrained parameter was passed as positional argument, it is now mapped to
+                # `kwargs[weights_param]`. This happens because the @kwonly_to_pos_or_kw decorator uses the current
+                # signature to infer the names of positionally passed arguments and thus has no knowledge that there
+                # used to be a pretrained parameter.
+                pretrained_positional = weights_arg is not sentinel
+                if pretrained_positional:
+                    # We put the pretrained argument under its legacy name in the keyword argument dictionary to have
+                    # unified access to the value if the default value is a callable.
+                    kwargs[pretrained_param] = pretrained_arg = kwargs.pop(weights_param)
+                else:
+                    pretrained_arg = kwargs[pretrained_param]  # legacy keyword was passed by name
+
+                if pretrained_arg:
+                    default_weights_arg = default(kwargs) if callable(default) else default
+                    if not isinstance(default_weights_arg, WeightsEnum):
+                        raise ValueError(f"No weights available for model {builder.__name__}")
+                else:
+                    default_weights_arg = None  # a falsy legacy value maps to "no weights"
+
+                if not pretrained_positional:
+                    warnings.warn(
+                        f"The parameter '{pretrained_param}' is deprecated since 0.13 and may be removed in the future, "
+                        f"please use '{weights_param}' instead."
+                    )
+
+                msg = (
+                    f"Arguments other than a weight enum or `None` for '{weights_param}' are deprecated since 0.13 and "
+                    f"may be removed in the future. "
+                    f"The current behavior is equivalent to passing `{weights_param}={default_weights_arg}`."
+                )
+                if pretrained_arg:
+                    msg = (
+                        f"{msg} You can also use `{weights_param}={type(default_weights_arg).__name__}.DEFAULT` "
+                        f"to get the most up-to-date weights."
+                    )
+                warnings.warn(msg)
+
+                del kwargs[pretrained_param]
+                kwargs[weights_param] = default_weights_arg  # legacy key removed above; only the new one is forwarded
+
+            return builder(*args, **kwargs)
+
+        return inner_wrapper
+
+    return outer_wrapper
+
+
+def _ovewrite_named_param(kwargs: dict[str, Any], param: str, new_value: V) -> None:
+    if param in kwargs:  # caller supplied the value: it must agree with new_value
+        if kwargs[param] != new_value:
+            raise ValueError(f"The parameter '{param}' expected value {new_value} but got {kwargs[param]} instead.")
+    else:
+        kwargs[param] = new_value  # not supplied: fill it in (mutates kwargs in place)
+
+
+def _ovewrite_value_param(param: str, actual: Optional[V], expected: V) -> V:
+    if actual is not None:  # None is treated as "not specified" and skips the check
+        if actual != expected:
+            raise ValueError(f"The parameter '{param}' expected value {expected} but got {actual} instead.")
+    return expected  # always the expected value, whether or not actual was given
+
+
+class _ModelURLs(dict):  # dict that emits a deprecation warning on every `[]` lookup
+    def __getitem__(self, item):
+        warnings.warn(
+            "Accessing the model URLs via the internal dictionary of the module is deprecated since 0.13 and may "
+            "be removed in the future. Please access them via the appropriate Weights Enum instead."
+        )
+        return super().__getitem__(item)  # the lookup itself is unchanged; a missing key still raises KeyError
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/alexnet.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/alexnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..f85acbeb2148d2aa8f289808e61aa61e2d68e2f9
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/alexnet.py
@@ -0,0 +1,119 @@
+from functools import partial
+from typing import Any, Optional
+
+import torch
+import torch.nn as nn
+
+from ..transforms._presets import ImageClassification
+from ..utils import _log_api_usage_once
+from ._api import register_model, Weights, WeightsEnum
+from ._meta import _IMAGENET_CATEGORIES
+from ._utils import _ovewrite_named_param, handle_legacy_interface
+
+
+__all__ = ["AlexNet", "AlexNet_Weights", "alexnet"]
+
+
+class AlexNet(nn.Module):  # conv feature extractor -> 6x6 adaptive average pool -> 3-layer MLP classifier
+    def __init__(self, num_classes: int = 1000, dropout: float = 0.5) -> None:
+        super().__init__()
+        _log_api_usage_once(self)
+        self.features = nn.Sequential(
+            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
+            nn.ReLU(inplace=True),
+            nn.MaxPool2d(kernel_size=3, stride=2),
+            nn.Conv2d(64, 192, kernel_size=5, padding=2),
+            nn.ReLU(inplace=True),
+            nn.MaxPool2d(kernel_size=3, stride=2),
+            nn.Conv2d(192, 384, kernel_size=3, padding=1),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(384, 256, kernel_size=3, padding=1),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(256, 256, kernel_size=3, padding=1),
+            nn.ReLU(inplace=True),
+            nn.MaxPool2d(kernel_size=3, stride=2),
+        )
+        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))  # fixes the spatial size at 6x6 for the classifier below
+        self.classifier = nn.Sequential(
+            nn.Dropout(p=dropout),
+            nn.Linear(256 * 6 * 6, 4096),  # 256 channels from the last conv times the 6x6 pooled grid
+            nn.ReLU(inplace=True),
+            nn.Dropout(p=dropout),
+            nn.Linear(4096, 4096),
+            nn.ReLU(inplace=True),
+            nn.Linear(4096, num_classes),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.features(x)
+        x = self.avgpool(x)
+        x = torch.flatten(x, 1)  # keep dim 0, flatten everything after it
+        x = self.classifier(x)
+        return x
+
+
+class AlexNet_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/alexnet-owt-7be5be79.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            "num_params": 61100840,
+            "min_size": (63, 63),
+            "categories": _IMAGENET_CATEGORIES,  # its length is what the builder uses for num_classes
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#alexnet-and-vgg",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 56.522,
+                    "acc@5": 79.066,
+                }
+            },
+            "_ops": 0.714,
+            "_file_size": 233.087,
+            "_docs": """
+                These weights reproduce closely the results of the paper using a simplified training recipe.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # DEFAULT points at the V1 weights
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", AlexNet_Weights.IMAGENET1K_V1))
+def alexnet(*, weights: Optional[AlexNet_Weights] = None, progress: bool = True, **kwargs: Any) -> AlexNet:
+    """AlexNet model architecture from `One weird trick for parallelizing convolutional neural networks <https://arxiv.org/abs/1404.5997>`__.
+
+    .. note::
+        AlexNet was originally introduced in the `ImageNet Classification with
+        Deep Convolutional Neural Networks
+        <https://papers.nips.cc/paper/2012/hash/c399862d3b9d6b76c8436e924a68c45b-Abstract.html>`__
+        paper. Our implementation is based instead on the "One weird trick"
+        paper above.
+
+    Args:
+        weights (:class:`~torchvision.models.AlexNet_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.AlexNet_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.alexnet.AlexNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/alexnet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.AlexNet_Weights
+        :members:
+    """
+
+    weights = AlexNet_Weights.verify(weights)
+
+    if weights is not None:  # num_classes is pinned to the number of categories in the weights' meta
+        _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+
+    model = AlexNet(**kwargs)
+
+    if weights is not None:  # load the checkpoint into the freshly built model
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+
+    return model
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/convnext.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/convnext.py
new file mode 100644
index 0000000000000000000000000000000000000000..3264cb1fd0ce43ca40cad4e8f0ca46e9cf1703db
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/convnext.py
@@ -0,0 +1,415 @@
+from collections.abc import Sequence
+from functools import partial
+from typing import Any, Callable, Optional
+
+import torch
+from torch import nn, Tensor
+from torch.nn import functional as F
+
+from ..ops.misc import Conv2dNormActivation, Permute
+from ..ops.stochastic_depth import StochasticDepth
+from ..transforms._presets import ImageClassification
+from ..utils import _log_api_usage_once
+from ._api import register_model, Weights, WeightsEnum
+from ._meta import _IMAGENET_CATEGORIES
+from ._utils import _ovewrite_named_param, handle_legacy_interface
+
+
+__all__ = [
+    "ConvNeXt",
+    "ConvNeXt_Tiny_Weights",
+    "ConvNeXt_Small_Weights",
+    "ConvNeXt_Base_Weights",
+    "ConvNeXt_Large_Weights",
+    "convnext_tiny",
+    "convnext_small",
+    "convnext_base",
+    "convnext_large",
+]
+
+
+class LayerNorm2d(nn.LayerNorm):  # layer norm applied with dim 1 moved last (channels, assuming NCHW input)
+    def forward(self, x: Tensor) -> Tensor:
+        x = x.permute(0, 2, 3, 1)  # move dim 1 to the end so it is the normalized axis
+        x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
+        x = x.permute(0, 3, 1, 2)  # restore the original dimension order
+        return x
+
+
+class CNBlock(nn.Module):  # residual block: depthwise 7x7 conv -> norm -> 4x-wide MLP, scaled and added to input
+    def __init__(
+        self,
+        dim,
+        layer_scale: float,
+        stochastic_depth_prob: float,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        super().__init__()
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=1e-6)  # used after the Permute, i.e. on the last dimension
+
+        self.block = nn.Sequential(
+            nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim, bias=True),  # groups=dim: depthwise
+            Permute([0, 2, 3, 1]),
+            norm_layer(dim),
+            nn.Linear(in_features=dim, out_features=4 * dim, bias=True),
+            nn.GELU(),
+            nn.Linear(in_features=4 * dim, out_features=dim, bias=True),
+            Permute([0, 3, 1, 2]),  # undo the permutation applied before the norm
+        )
+        self.layer_scale = nn.Parameter(torch.ones(dim, 1, 1) * layer_scale)  # learnable, shape (dim, 1, 1)
+        self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")
+
+    def forward(self, input: Tensor) -> Tensor:
+        result = self.layer_scale * self.block(input)
+        result = self.stochastic_depth(result)
+        result += input  # residual connection
+        return result
+
+
+class CNBlockConfig:
+    # Stores information listed at Section 3 of the ConvNeXt paper
+    def __init__(
+        self,
+        input_channels: int,
+        out_channels: Optional[int],  # None: ConvNeXt adds no downsampling layer after this stage
+        num_layers: int,  # number of blocks ConvNeXt builds for this stage
+    ) -> None:
+        self.input_channels = input_channels
+        self.out_channels = out_channels
+        self.num_layers = num_layers
+
+    def __repr__(self) -> str:
+        s = self.__class__.__name__ + "("
+        s += "input_channels={input_channels}"
+        s += ", out_channels={out_channels}"
+        s += ", num_layers={num_layers}"
+        s += ")"
+        return s.format(**self.__dict__)  # placeholders are filled from the instance attributes
+
+
+class ConvNeXt(nn.Module):
+    def __init__(
+        self,
+        block_setting: list[CNBlockConfig],
+        stochastic_depth_prob: float = 0.0,
+        layer_scale: float = 1e-6,
+        num_classes: int = 1000,
+        block: Optional[Callable[..., nn.Module]] = None,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__()
+        _log_api_usage_once(self)
+
+        if not block_setting:
+            raise ValueError("The block_setting should not be empty")
+        elif not (isinstance(block_setting, Sequence) and all([isinstance(s, CNBlockConfig) for s in block_setting])):
+            raise TypeError("The block_setting should be List[CNBlockConfig]")
+
+        if block is None:
+            block = CNBlock
+
+        if norm_layer is None:
+            norm_layer = partial(LayerNorm2d, eps=1e-6)
+
+        layers: list[nn.Module] = []
+
+        # Stem
+        firstconv_output_channels = block_setting[0].input_channels
+        layers.append(
+            Conv2dNormActivation(
+                3,
+                firstconv_output_channels,
+                kernel_size=4,
+                stride=4,
+                padding=0,
+                norm_layer=norm_layer,
+                activation_layer=None,
+                bias=True,
+            )
+        )
+
+        total_stage_blocks = sum(cnf.num_layers for cnf in block_setting)
+        stage_block_id = 0
+        for cnf in block_setting:
+            # Bottlenecks
+            stage: list[nn.Module] = []
+            for _ in range(cnf.num_layers):
+                # adjust stochastic depth probability based on the depth of the stage block
+                sd_prob = stochastic_depth_prob * stage_block_id / (total_stage_blocks - 1.0)
+                stage.append(block(cnf.input_channels, layer_scale, sd_prob))
+                stage_block_id += 1
+            layers.append(nn.Sequential(*stage))
+            if cnf.out_channels is not None:
+                # Downsampling
+                layers.append(
+                    nn.Sequential(
+                        norm_layer(cnf.input_channels),
+                        nn.Conv2d(cnf.input_channels, cnf.out_channels, kernel_size=2, stride=2),
+                    )
+                )
+
+        self.features = nn.Sequential(*layers)
+        self.avgpool = nn.AdaptiveAvgPool2d(1)
+
+        lastblock = block_setting[-1]
+        lastconv_output_channels = (
+            lastblock.out_channels if lastblock.out_channels is not None else lastblock.input_channels
+        )
+        self.classifier = nn.Sequential(
+            norm_layer(lastconv_output_channels), nn.Flatten(1), nn.Linear(lastconv_output_channels, num_classes)
+        )
+
+        for m in self.modules():
+            if isinstance(m, (nn.Conv2d, nn.Linear)):
+                nn.init.trunc_normal_(m.weight, std=0.02)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+
+    def _forward_impl(self, x: Tensor) -> Tensor:
+        x = self.features(x)
+        x = self.avgpool(x)
+        x = self.classifier(x)
+        return x
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self._forward_impl(x)  # thin wrapper: all computation lives in _forward_impl
+
+
+def _convnext(
+    block_setting: list[CNBlockConfig],
+    stochastic_depth_prob: float,
+    weights: Optional[WeightsEnum],
+    progress: bool,
+    **kwargs: Any,
+) -> ConvNeXt:
+    """Build a ConvNeXt and, when ``weights`` is given, load its state dict."""
+    if weights is None:
+        return ConvNeXt(block_setting, stochastic_depth_prob=stochastic_depth_prob, **kwargs)
+
+    # num_classes is set from the weights' category list via _ovewrite_named_param (defined elsewhere).
+    _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+    net = ConvNeXt(block_setting, stochastic_depth_prob=stochastic_depth_prob, **kwargs)
+    net.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+    return net
+
+
+_COMMON_META = {  # entries merged into every ConvNeXt weights ``meta`` below via ``**_COMMON_META``
+    "min_size": (32, 32),
+    "categories": _IMAGENET_CATEGORIES,  # _convnext() passes len() of this as the num_classes override
+    "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#convnext",
+    "_docs": """
+        These weights improve upon the results of the original paper by using a modified version of TorchVision's
+        `new training recipe
+        <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+    """,
+}
+
+
+class ConvNeXt_Tiny_Weights(WeightsEnum):  # weight choices validated by convnext_tiny() via .verify()
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/convnext_tiny-983f1562.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=236),
+        meta={
+            **_COMMON_META,  # shared min_size / categories / recipe / _docs
+            "num_params": 28589128,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 82.520,
+                    "acc@5": 96.146,
+                }
+            },
+            "_ops": 4.456,  # NOTE(review): unit not stated here (presumably GFLOPs) - confirm
+            "_file_size": 109.119,  # NOTE(review): unit not stated here (presumably MB) - confirm
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # alias of the single entry above
+
+
+class ConvNeXt_Small_Weights(WeightsEnum):  # weight choices validated by convnext_small() via .verify()
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/convnext_small-0c510722.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=230),
+        meta={
+            **_COMMON_META,  # shared min_size / categories / recipe / _docs
+            "num_params": 50223688,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 83.616,
+                    "acc@5": 96.650,
+                }
+            },
+            "_ops": 8.684,  # NOTE(review): unit not stated here (presumably GFLOPs) - confirm
+            "_file_size": 191.703,  # NOTE(review): unit not stated here (presumably MB) - confirm
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # alias of the single entry above
+
+
+class ConvNeXt_Base_Weights(WeightsEnum):  # weight choices validated by convnext_base() via .verify()
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/convnext_base-6075fbad.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,  # shared min_size / categories / recipe / _docs
+            "num_params": 88591464,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 84.062,
+                    "acc@5": 96.870,
+                }
+            },
+            "_ops": 15.355,  # NOTE(review): unit not stated here (presumably GFLOPs) - confirm
+            "_file_size": 338.064,  # NOTE(review): unit not stated here (presumably MB) - confirm
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # alias of the single entry above
+
+
+class ConvNeXt_Large_Weights(WeightsEnum):  # weight choices validated by convnext_large() via .verify()
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/convnext_large-ea097f82.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,  # shared min_size / categories / recipe / _docs
+            "num_params": 197767336,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 84.414,
+                    "acc@5": 96.976,
+                }
+            },
+            "_ops": 34.361,  # NOTE(review): unit not stated here (presumably GFLOPs) - confirm
+            "_file_size": 754.537,  # NOTE(review): unit not stated here (presumably MB) - confirm
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # alias of the single entry above
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", ConvNeXt_Tiny_Weights.IMAGENET1K_V1))
+def convnext_tiny(*, weights: Optional[ConvNeXt_Tiny_Weights] = None, progress: bool = True, **kwargs: Any) -> ConvNeXt:
+    """Build the ConvNeXt Tiny variant from the
+    `A ConvNet for the 2020s <https://arxiv.org/abs/2201.03545>`_ paper.
+
+    Args:
+        weights (:class:`~torchvision.models.convnext.ConvNeXt_Tiny_Weights`, optional): The pretrained
+            weights to load. See :class:`~torchvision.models.convnext.ConvNeXt_Tiny_Weights`
+            below for the possible values. By default, no pre-trained weights are used.
+        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
+        **kwargs: parameters forwarded to the ``torchvision.models.convnext.ConvNeXt``
+            constructor; ``stochastic_depth_prob`` is popped here first and defaults to 0.1.
+            Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/convnext.py>`_
+            for more details about this class.
+
+    Returns:
+        ConvNeXt: the constructed model, with ``weights`` loaded when they are given.
+
+    .. autoclass:: torchvision.models.ConvNeXt_Tiny_Weights
+        :members:
+    """
+    checked_weights = ConvNeXt_Tiny_Weights.verify(weights)
+    # Positional CNBlockConfig args per stage - presumably (input_channels, out_channels, num_layers); TODO confirm.
+    stage_specs = ((96, 192, 3), (192, 384, 3), (384, 768, 9), (768, None, 3))
+    stages = [CNBlockConfig(*spec) for spec in stage_specs]
+    sd_prob = kwargs.pop("stochastic_depth_prob", 0.1)
+    return _convnext(stages, sd_prob, checked_weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", ConvNeXt_Small_Weights.IMAGENET1K_V1))
+def convnext_small(
+    *, weights: Optional[ConvNeXt_Small_Weights] = None, progress: bool = True, **kwargs: Any
+) -> ConvNeXt:
+    """Build the ConvNeXt Small variant from the
+    `A ConvNet for the 2020s <https://arxiv.org/abs/2201.03545>`_ paper.
+
+    Args:
+        weights (:class:`~torchvision.models.convnext.ConvNeXt_Small_Weights`, optional): The pretrained
+            weights to load. See :class:`~torchvision.models.convnext.ConvNeXt_Small_Weights`
+            below for the possible values. By default, no pre-trained weights are used.
+        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
+        **kwargs: parameters forwarded to the ``torchvision.models.convnext.ConvNeXt``
+            constructor; ``stochastic_depth_prob`` is popped here first and defaults to 0.4.
+            Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/convnext.py>`_
+            for more details about this class.
+
+    Returns:
+        ConvNeXt: the constructed model, with ``weights`` loaded when they are given.
+
+    .. autoclass:: torchvision.models.ConvNeXt_Small_Weights
+        :members:
+    """
+    checked_weights = ConvNeXt_Small_Weights.verify(weights)
+    # Positional CNBlockConfig args per stage - presumably (input_channels, out_channels, num_layers); TODO confirm.
+    stage_specs = ((96, 192, 3), (192, 384, 3), (384, 768, 27), (768, None, 3))
+    stages = [CNBlockConfig(*spec) for spec in stage_specs]
+    sd_prob = kwargs.pop("stochastic_depth_prob", 0.4)
+    return _convnext(stages, sd_prob, checked_weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", ConvNeXt_Base_Weights.IMAGENET1K_V1))
+def convnext_base(*, weights: Optional[ConvNeXt_Base_Weights] = None, progress: bool = True, **kwargs: Any) -> ConvNeXt:
+    """Build the ConvNeXt Base variant from the
+    `A ConvNet for the 2020s <https://arxiv.org/abs/2201.03545>`_ paper.
+
+    Args:
+        weights (:class:`~torchvision.models.convnext.ConvNeXt_Base_Weights`, optional): The pretrained
+            weights to load. See :class:`~torchvision.models.convnext.ConvNeXt_Base_Weights`
+            below for the possible values. By default, no pre-trained weights are used.
+        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
+        **kwargs: parameters forwarded to the ``torchvision.models.convnext.ConvNeXt``
+            constructor; ``stochastic_depth_prob`` is popped here first and defaults to 0.5.
+            Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/convnext.py>`_
+            for more details about this class.
+
+    Returns:
+        ConvNeXt: the constructed model, with ``weights`` loaded when they are given.
+
+    .. autoclass:: torchvision.models.ConvNeXt_Base_Weights
+        :members:
+    """
+    checked_weights = ConvNeXt_Base_Weights.verify(weights)
+    # Positional CNBlockConfig args per stage - presumably (input_channels, out_channels, num_layers); TODO confirm.
+    stage_specs = ((128, 256, 3), (256, 512, 3), (512, 1024, 27), (1024, None, 3))
+    stages = [CNBlockConfig(*spec) for spec in stage_specs]
+    sd_prob = kwargs.pop("stochastic_depth_prob", 0.5)
+    return _convnext(stages, sd_prob, checked_weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", ConvNeXt_Large_Weights.IMAGENET1K_V1))
+def convnext_large(
+    *, weights: Optional[ConvNeXt_Large_Weights] = None, progress: bool = True, **kwargs: Any
+) -> ConvNeXt:
+    """Build the ConvNeXt Large variant from the
+    `A ConvNet for the 2020s <https://arxiv.org/abs/2201.03545>`_ paper.
+
+    Args:
+        weights (:class:`~torchvision.models.convnext.ConvNeXt_Large_Weights`, optional): The pretrained
+            weights to load. See :class:`~torchvision.models.convnext.ConvNeXt_Large_Weights`
+            below for the possible values. By default, no pre-trained weights are used.
+        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
+        **kwargs: parameters forwarded to the ``torchvision.models.convnext.ConvNeXt``
+            constructor; ``stochastic_depth_prob`` is popped here first and defaults to 0.5.
+            Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/convnext.py>`_
+            for more details about this class.
+
+    Returns:
+        ConvNeXt: the constructed model, with ``weights`` loaded when they are given.
+
+    .. autoclass:: torchvision.models.ConvNeXt_Large_Weights
+        :members:
+    """
+    checked_weights = ConvNeXt_Large_Weights.verify(weights)
+    # Positional CNBlockConfig args per stage - presumably (input_channels, out_channels, num_layers); TODO confirm.
+    stage_specs = ((192, 384, 3), (384, 768, 3), (768, 1536, 27), (1536, None, 3))
+    stages = [CNBlockConfig(*spec) for spec in stage_specs]
+    sd_prob = kwargs.pop("stochastic_depth_prob", 0.5)
+    return _convnext(stages, sd_prob, checked_weights, progress, **kwargs)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/densenet.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/densenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..06457f7b09e9d383327b0bc41304a412eb6b7839
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/densenet.py
@@ -0,0 +1,448 @@
+import re
+from collections import OrderedDict
+from functools import partial
+from typing import Any, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as cp
+from torch import Tensor
+
+from ..transforms._presets import ImageClassification
+from ..utils import _log_api_usage_once
+from ._api import register_model, Weights, WeightsEnum
+from ._meta import _IMAGENET_CATEGORIES
+from ._utils import _ovewrite_named_param, handle_legacy_interface
+
+__all__ = [  # public names of this module (what ``from ... import *`` exposes)
+    "DenseNet",
+    "DenseNet121_Weights",
+    "DenseNet161_Weights",
+    "DenseNet169_Weights",
+    "DenseNet201_Weights",
+    "densenet121",
+    "densenet161",
+    "densenet169",
+    "densenet201",
+]
+
+
+class _DenseLayer(nn.Module):  # one dense layer: BN-ReLU-1x1 conv bottleneck, then BN-ReLU-3x3 conv
+    def __init__(
+        self, num_input_features: int, growth_rate: int, bn_size: int, drop_rate: float, memory_efficient: bool = False
+    ) -> None:
+        super().__init__()
+        self.norm1 = nn.BatchNorm2d(num_input_features)
+        self.relu1 = nn.ReLU(inplace=True)
+        self.conv1 = nn.Conv2d(num_input_features, bn_size * growth_rate, kernel_size=1, stride=1, bias=False)
+
+        self.norm2 = nn.BatchNorm2d(bn_size * growth_rate)
+        self.relu2 = nn.ReLU(inplace=True)
+        self.conv2 = nn.Conv2d(bn_size * growth_rate, growth_rate, kernel_size=3, stride=1, padding=1, bias=False)
+
+        self.drop_rate = float(drop_rate)  # dropout is applied in forward() only when this is > 0
+        self.memory_efficient = memory_efficient  # if set, forward() may checkpoint the bottleneck
+
+    def bn_function(self, inputs: list[Tensor]) -> Tensor:
+        concated_features = torch.cat(inputs, 1)  # join all incoming feature maps along dim 1
+        bottleneck_output = self.conv1(self.relu1(self.norm1(concated_features)))  # noqa: T484
+        return bottleneck_output
+
+    # Explicit loop instead of any(); todo: rewrite when torchscript supports any
+    def any_requires_grad(self, input: list[Tensor]) -> bool:
+        for tensor in input:
+            if tensor.requires_grad:
+                return True
+        return False
+
+    @torch.jit.unused  # noqa: T484
+    def call_checkpoint_bottleneck(self, input: list[Tensor]) -> Tensor:
+        def closure(*inputs):  # re-packs the unpacked tensors into one sequence for bn_function
+            return self.bn_function(inputs)
+
+        return cp.checkpoint(closure, *input, use_reentrant=False)  # checkpointed call of bn_function
+
+    @torch.jit._overload_method  # noqa: F811
+    def forward(self, input: list[Tensor]) -> Tensor:  # noqa: F811
+        pass
+
+    @torch.jit._overload_method  # noqa: F811
+    def forward(self, input: Tensor) -> Tensor:  # noqa: F811
+        pass
+
+    # torchscript does not yet support *args, so we overload method
+    # allowing it to take either a List[Tensor] or single Tensor
+    def forward(self, input: Tensor) -> Tensor:  # noqa: F811
+        if isinstance(input, Tensor):
+            prev_features = [input]  # normalise a single tensor to a one-element list
+        else:
+            prev_features = input
+
+        if self.memory_efficient and self.any_requires_grad(prev_features):  # checkpoint only if grads are needed
+            if torch.jit.is_scripting():
+                raise Exception("Memory Efficient not supported in JIT")
+
+            bottleneck_output = self.call_checkpoint_bottleneck(prev_features)
+        else:
+            bottleneck_output = self.bn_function(prev_features)
+
+        new_features = self.conv2(self.relu2(self.norm2(bottleneck_output)))  # output has growth_rate channels
+        if self.drop_rate > 0:
+            new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
+        return new_features
+
+
+class _DenseBlock(nn.ModuleDict):  # num_layers _DenseLayer modules, each fed all earlier feature maps
+    _version = 2
+
+    def __init__(
+        self,
+        num_layers: int,
+        num_input_features: int,
+        bn_size: int,
+        growth_rate: int,
+        drop_rate: float,
+        memory_efficient: bool = False,
+    ) -> None:
+        super().__init__()
+        for i in range(num_layers):
+            layer = _DenseLayer(
+                num_input_features + i * growth_rate,  # each earlier layer contributed growth_rate channels
+                growth_rate=growth_rate,
+                bn_size=bn_size,
+                drop_rate=drop_rate,
+                memory_efficient=memory_efficient,
+            )
+            self.add_module("denselayer%d" % (i + 1), layer)  # 1-based names: denselayer1, denselayer2, ...
+
+    def forward(self, init_features: Tensor) -> Tensor:
+        features = [init_features]
+        for name, layer in self.items():
+            new_features = layer(features)  # every layer receives the full list collected so far
+            features.append(new_features)
+        return torch.cat(features, 1)  # block input plus all layer outputs, concatenated on dim 1
+
+
+class _Transition(nn.Sequential):  # norm -> relu -> 1x1 conv -> 2x2 average pool (stride 2)
+    def __init__(self, num_input_features: int, num_output_features: int) -> None:
+        super().__init__()
+        self.norm = nn.BatchNorm2d(num_input_features)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv = nn.Conv2d(num_input_features, num_output_features, kernel_size=1, stride=1, bias=False)
+        self.pool = nn.AvgPool2d(kernel_size=2, stride=2)
+
+
+class DenseNet(nn.Module):
+    r"""Densenet-BC model class, based on
+    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_.
+
+    Args:
+        growth_rate (int) - how many filters each dense layer adds (`k` in paper)
+        block_config (tuple of 4 ints) - how many layers in each dense block
+        num_init_features (int) - the number of filters to learn in the first convolution layer
+        bn_size (int) - multiplicative factor for the width of the bottleneck layers
+          (i.e. bn_size * k features in the bottleneck layer)
+        drop_rate (float) - dropout rate after each dense layer
+        num_classes (int) - number of classification classes
+        memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
+          but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_.
+    """
+
+    def __init__(
+        self,
+        growth_rate: int = 32,
+        block_config: tuple[int, int, int, int] = (6, 12, 24, 16),
+        num_init_features: int = 64,
+        bn_size: int = 4,
+        drop_rate: float = 0,
+        num_classes: int = 1000,
+        memory_efficient: bool = False,
+    ) -> None:
+
+        super().__init__()
+        _log_api_usage_once(self)
+
+        # First convolution (stem): 7x7 stride-2 conv, batch norm, ReLU, 3x3 stride-2 max pool
+        self.features = nn.Sequential(
+            OrderedDict(
+                [
+                    ("conv0", nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
+                    ("norm0", nn.BatchNorm2d(num_init_features)),
+                    ("relu0", nn.ReLU(inplace=True)),
+                    ("pool0", nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
+                ]
+            )
+        )
+
+        # Each denseblock, followed by a transition except after the last one
+        num_features = num_init_features  # running channel count fed to the next module
+        for i, num_layers in enumerate(block_config):
+            block = _DenseBlock(
+                num_layers=num_layers,
+                num_input_features=num_features,
+                bn_size=bn_size,
+                growth_rate=growth_rate,
+                drop_rate=drop_rate,
+                memory_efficient=memory_efficient,
+            )
+            self.features.add_module("denseblock%d" % (i + 1), block)
+            num_features = num_features + num_layers * growth_rate  # every layer appended growth_rate channels
+            if i != len(block_config) - 1:  # no transition after the final dense block
+                trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2)
+                self.features.add_module("transition%d" % (i + 1), trans)
+                num_features = num_features // 2  # the transition halves the channel count
+
+        # Final batch norm
+        self.features.add_module("norm5", nn.BatchNorm2d(num_features))
+
+        # Linear layer
+        self.classifier = nn.Linear(num_features, num_classes)
+
+        # Official init from torch repo.
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight)
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.Linear):
+                nn.init.constant_(m.bias, 0)  # only the bias is reset; the Linear weight is left as constructed
+
+    def forward(self, x: Tensor) -> Tensor:
+        features = self.features(x)
+        out = F.relu(features, inplace=True)  # ReLU after the final batch norm ("norm5")
+        out = F.adaptive_avg_pool2d(out, (1, 1))  # pool each channel down to a 1x1 map
+        out = torch.flatten(out, 1)  # flatten everything after dim 0 for the Linear classifier
+        out = self.classifier(out)
+        return out
+
+
+def _load_state_dict(model: nn.Module, weights: WeightsEnum, progress: bool) -> None:
+    """Load ``weights`` into ``model``, renaming legacy dotted ``_DenseLayer`` keys first."""
+    # Older checkpoints use keys such as "...denselayer1.norm.1.weight"; '.' is no longer allowed
+    # in module names, so those keys are rewritten to the form "...denselayer1.norm1.weight".
+    legacy_key = re.compile(
+        r"^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$"
+    )
+
+    state_dict = weights.get_state_dict(progress=progress, check_hash=True)
+    for old_key in tuple(state_dict):
+        match = legacy_key.match(old_key)
+        if match is None:
+            continue
+        prefix, suffix = match.groups()
+        # Dropping the '.' between the two groups yields the current key name.
+        state_dict[prefix + suffix] = state_dict.pop(old_key)
+    model.load_state_dict(state_dict)
+
+
+def _densenet(
+    growth_rate: int,
+    block_config: tuple[int, int, int, int],
+    num_init_features: int,
+    weights: Optional[WeightsEnum],
+    progress: bool,
+    **kwargs: Any,
+) -> DenseNet:
+    """Build a DenseNet and, when ``weights`` is given, load its state dict."""
+    if weights is None:
+        return DenseNet(growth_rate, block_config, num_init_features, **kwargs)
+
+    # num_classes is set from the weights' category list via _ovewrite_named_param (defined elsewhere).
+    _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+    net = DenseNet(growth_rate, block_config, num_init_features, **kwargs)
+    _load_state_dict(model=net, weights=weights, progress=progress)
+    return net
+
+
+_COMMON_META = {  # entries merged into every DenseNet weights ``meta`` below via ``**_COMMON_META``
+    "min_size": (29, 29),
+    "categories": _IMAGENET_CATEGORIES,  # _densenet() passes len() of this as the num_classes override
+    "recipe": "https://github.com/pytorch/vision/pull/116",
+    "_docs": """These weights are ported from LuaTorch.""",
+}
+
+
+class DenseNet121_Weights(WeightsEnum):  # weight choices validated by densenet121() via .verify()
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/densenet121-a639ec97.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,  # shared min_size / categories / recipe / _docs
+            "num_params": 7978856,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 74.434,
+                    "acc@5": 91.972,
+                }
+            },
+            "_ops": 2.834,  # NOTE(review): unit not stated here (presumably GFLOPs) - confirm
+            "_file_size": 30.845,  # NOTE(review): unit not stated here (presumably MB) - confirm
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # alias of the single entry above
+
+
+class DenseNet161_Weights(WeightsEnum):  # weight choices validated by densenet161() via .verify()
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/densenet161-8d451a50.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,  # shared min_size / categories / recipe / _docs
+            "num_params": 28681000,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 77.138,
+                    "acc@5": 93.560,
+                }
+            },
+            "_ops": 7.728,  # NOTE(review): unit not stated here (presumably GFLOPs) - confirm
+            "_file_size": 110.369,  # NOTE(review): unit not stated here (presumably MB) - confirm
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # alias of the single entry above
+
+
+class DenseNet169_Weights(WeightsEnum):  # weight choices validated by densenet169() via .verify()
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/densenet169-b2777c0a.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,  # shared min_size / categories / recipe / _docs
+            "num_params": 14149480,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 75.600,
+                    "acc@5": 92.806,
+                }
+            },
+            "_ops": 3.36,  # NOTE(review): unit not stated here (presumably GFLOPs) - confirm
+            "_file_size": 54.708,  # NOTE(review): unit not stated here (presumably MB) - confirm
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # alias of the single entry above
+
+
+class DenseNet201_Weights(WeightsEnum):  # weight choices validated by densenet201() via .verify()
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/densenet201-c1103571.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,  # shared min_size / categories / recipe / _docs
+            "num_params": 20013928,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 76.896,
+                    "acc@5": 93.370,
+                }
+            },
+            "_ops": 4.291,  # NOTE(review): unit not stated here (presumably GFLOPs) - confirm
+            "_file_size": 77.373,  # NOTE(review): unit not stated here (presumably MB) - confirm
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # alias of the single entry above
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", DenseNet121_Weights.IMAGENET1K_V1))
+def densenet121(*, weights: Optional[DenseNet121_Weights] = None, progress: bool = True, **kwargs: Any) -> DenseNet:
+    r"""Build the Densenet-121 model from
+    `Densely Connected Convolutional Networks <https://arxiv.org/abs/1608.06993>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.DenseNet121_Weights`, optional): The pretrained
+            weights to load. See :class:`~torchvision.models.DenseNet121_Weights` below for
+            the possible values. By default, no pre-trained weights are used.
+        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
+        **kwargs: parameters forwarded to the ``torchvision.models.densenet.DenseNet`` constructor.
+            See the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/densenet.py>`_.
+
+    Returns:
+        DenseNet: the constructed model, with ``weights`` loaded when they are given.
+
+    .. autoclass:: torchvision.models.DenseNet121_Weights
+        :members:
+    """
+    checked_weights = DenseNet121_Weights.verify(weights)
+    growth_rate, block_config, num_init_features = 32, (6, 12, 24, 16), 64
+    return _densenet(growth_rate, block_config, num_init_features, checked_weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", DenseNet161_Weights.IMAGENET1K_V1))
+def densenet161(*, weights: Optional[DenseNet161_Weights] = None, progress: bool = True, **kwargs: Any) -> DenseNet:
+    r"""Build the Densenet-161 model from
+    `Densely Connected Convolutional Networks <https://arxiv.org/abs/1608.06993>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.DenseNet161_Weights`, optional): The pretrained
+            weights to load. See :class:`~torchvision.models.DenseNet161_Weights` below for
+            the possible values. By default, no pre-trained weights are used.
+        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
+        **kwargs: parameters forwarded to the ``torchvision.models.densenet.DenseNet`` constructor.
+            See the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/densenet.py>`_.
+
+    Returns:
+        DenseNet: the constructed model, with ``weights`` loaded when they are given.
+
+    .. autoclass:: torchvision.models.DenseNet161_Weights
+        :members:
+    """
+    checked_weights = DenseNet161_Weights.verify(weights)
+    growth_rate, block_config, num_init_features = 48, (6, 12, 36, 24), 96
+    return _densenet(growth_rate, block_config, num_init_features, checked_weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", DenseNet169_Weights.IMAGENET1K_V1))
+def densenet169(*, weights: Optional[DenseNet169_Weights] = None, progress: bool = True, **kwargs: Any) -> DenseNet:
+    r"""Build the Densenet-169 model from
+    `Densely Connected Convolutional Networks <https://arxiv.org/abs/1608.06993>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.DenseNet169_Weights`, optional): The pretrained
+            weights to load. See :class:`~torchvision.models.DenseNet169_Weights` below for
+            the possible values. By default, no pre-trained weights are used.
+        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
+        **kwargs: parameters forwarded to the ``torchvision.models.densenet.DenseNet`` constructor.
+            See the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/densenet.py>`_.
+
+    Returns:
+        DenseNet: the constructed model, with ``weights`` loaded when they are given.
+
+    .. autoclass:: torchvision.models.DenseNet169_Weights
+        :members:
+    """
+    checked_weights = DenseNet169_Weights.verify(weights)
+    growth_rate, block_config, num_init_features = 32, (6, 12, 32, 32), 64
+    return _densenet(growth_rate, block_config, num_init_features, checked_weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", DenseNet201_Weights.IMAGENET1K_V1))
+def densenet201(*, weights: Optional[DenseNet201_Weights] = None, progress: bool = True, **kwargs: Any) -> DenseNet:
+    r"""Build the Densenet-201 model from
+    `Densely Connected Convolutional Networks <https://arxiv.org/abs/1608.06993>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.DenseNet201_Weights`, optional): The pretrained
+            weights to load. See :class:`~torchvision.models.DenseNet201_Weights` below for
+            the possible values. By default, no pre-trained weights are used.
+        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
+        **kwargs: parameters forwarded to the ``torchvision.models.densenet.DenseNet`` constructor.
+            See the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/densenet.py>`_.
+
+    Returns:
+        DenseNet: the constructed model, with ``weights`` loaded when they are given.
+
+    .. autoclass:: torchvision.models.DenseNet201_Weights
+        :members:
+    """
+    checked_weights = DenseNet201_Weights.verify(weights)
+    growth_rate, block_config, num_init_features = 32, (6, 12, 48, 32), 64
+    return _densenet(growth_rate, block_config, num_init_features, checked_weights, progress, **kwargs)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4146651c737971cc5a883b6750f2ded3051bc8ea
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__init__.py
@@ -0,0 +1,7 @@
+from .faster_rcnn import *
+from .fcos import *
+from .keypoint_rcnn import *
+from .mask_rcnn import *
+from .retinanet import *
+from .ssd import *
+from .ssdlite import *
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..574110ce0c77542e6b6b9bb148ec61202f7d6067
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/_utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..41c53253112132efbe568985ee6aa4ef92f73010
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/_utils.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/anchor_utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/anchor_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3951937135ed7884766de498b4b7ca52986f9223
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/anchor_utils.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/backbone_utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/backbone_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8dc9b667c4d80f62457b7bb61d9262c87e4f48e
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/backbone_utils.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/faster_rcnn.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/faster_rcnn.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..87b7ff170c9965104efc44dd9ff4a0623c8c571e
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/faster_rcnn.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/fcos.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/fcos.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b1ff02e0dc427a7c0bb566904c171753df38e45d
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/fcos.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/generalized_rcnn.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/generalized_rcnn.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..770c58fa91fe3bff8cc52362526942f4e910b6fa
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/generalized_rcnn.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/image_list.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/image_list.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f7b1850ee79cb37095887fc12027dc49dd6e50c0
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/image_list.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/keypoint_rcnn.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/keypoint_rcnn.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e58ae37dd359f24bb6de1381b931c154c35cf16
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/keypoint_rcnn.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/mask_rcnn.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/mask_rcnn.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8dc646b77ded30a18618b2c860cd8d7d36e1de63
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/mask_rcnn.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/retinanet.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/retinanet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..13b97b60eae52984ce3aa3ffc62357e145bbf04b
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/retinanet.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/roi_heads.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/roi_heads.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6a4ebb66753cb820203b9e7ad1abc654d49bbdd1
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/roi_heads.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/rpn.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/rpn.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..955ca9c0e786a8a2c9793c7bbdf7f145408cd686
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/rpn.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/ssd.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/ssd.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9a9036d7c7aa89efcebc2cea8808935478a37d01
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/ssd.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/ssdlite.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/ssdlite.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..718a3c1737c541033a2d3c96f9533b152ea0269d
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/ssdlite.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/transform.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/transform.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e7f6c24eb6fdf0c07f7f43c9cd7228e484d0e098
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/__pycache__/transform.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/_utils.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..805c05a92ffb074c123540fcd36751d00a454dde
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/_utils.py
@@ -0,0 +1,539 @@
+import math
+from collections import OrderedDict
+from typing import Optional
+
+import torch
+from torch import nn, Tensor
+from torch.nn import functional as F
+from torchvision.ops import complete_box_iou_loss, distance_box_iou_loss, FrozenBatchNorm2d, generalized_box_iou_loss
+
+
+class BalancedPositiveNegativeSampler:
+    """
+    This class samples batches, ensuring that they contain a fixed proportion of positives
+    """
+
+    def __init__(self, batch_size_per_image: int, positive_fraction: float) -> None:
+        """
+        Args:
+            batch_size_per_image (int): number of elements to be selected per image
+            positive_fraction (float): percentage of positive elements per batch
+        """
+        self.batch_size_per_image = batch_size_per_image
+        self.positive_fraction = positive_fraction
+
+    def __call__(self, matched_idxs: list[Tensor]) -> tuple[list[Tensor], list[Tensor]]:
+        """
+        Args:
+            matched_idxs: list of tensors containing -1, 0 or positive values.
+                Each tensor corresponds to a specific image.
+                -1 values are ignored, 0 are considered as negatives and > 0 as
+                positives.
+
+        Returns:
+            pos_idx (list[tensor])
+            neg_idx (list[tensor])
+
+        Returns two lists of binary masks for each image.
+        The first list contains the positive elements that were selected,
+        and the second list the negative example.
+        """
+        pos_idx = []
+        neg_idx = []
+        for matched_idxs_per_image in matched_idxs:
+            positive = torch.where(matched_idxs_per_image >= 1)[0]
+            negative = torch.where(matched_idxs_per_image == 0)[0]
+
+            num_pos = int(self.batch_size_per_image * self.positive_fraction)
+            # protect against not enough positive examples
+            num_pos = min(positive.numel(), num_pos)
+            num_neg = self.batch_size_per_image - num_pos
+            # protect against not enough negative examples
+            num_neg = min(negative.numel(), num_neg)
+
+            # randomly select positive and negative examples
+            perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos]
+            perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg]
+
+            pos_idx_per_image = positive[perm1]
+            neg_idx_per_image = negative[perm2]
+
+            # create binary mask from indices
+            pos_idx_per_image_mask = torch.zeros_like(matched_idxs_per_image, dtype=torch.uint8)
+            neg_idx_per_image_mask = torch.zeros_like(matched_idxs_per_image, dtype=torch.uint8)
+
+            pos_idx_per_image_mask[pos_idx_per_image] = 1
+            neg_idx_per_image_mask[neg_idx_per_image] = 1
+
+            pos_idx.append(pos_idx_per_image_mask)
+            neg_idx.append(neg_idx_per_image_mask)
+
+        return pos_idx, neg_idx
+
+
+@torch.jit._script_if_tracing
+def encode_boxes(reference_boxes: Tensor, proposals: Tensor, weights: Tensor) -> Tensor:
+    """
+    Encode a set of proposals with respect to some
+    reference boxes
+
+    Args:
+        reference_boxes (Tensor): reference boxes
+        proposals (Tensor): boxes to be encoded
+        weights (Tensor[4]): the weights for ``(x, y, w, h)``
+    """
+
+    # perform some unpacking to make it JIT-fusion friendly
+    wx = weights[0]
+    wy = weights[1]
+    ww = weights[2]
+    wh = weights[3]
+
+    proposals_x1 = proposals[:, 0].unsqueeze(1)
+    proposals_y1 = proposals[:, 1].unsqueeze(1)
+    proposals_x2 = proposals[:, 2].unsqueeze(1)
+    proposals_y2 = proposals[:, 3].unsqueeze(1)
+
+    reference_boxes_x1 = reference_boxes[:, 0].unsqueeze(1)
+    reference_boxes_y1 = reference_boxes[:, 1].unsqueeze(1)
+    reference_boxes_x2 = reference_boxes[:, 2].unsqueeze(1)
+    reference_boxes_y2 = reference_boxes[:, 3].unsqueeze(1)
+
+    # implementation starts here
+    ex_widths = proposals_x2 - proposals_x1
+    ex_heights = proposals_y2 - proposals_y1
+    ex_ctr_x = proposals_x1 + 0.5 * ex_widths
+    ex_ctr_y = proposals_y1 + 0.5 * ex_heights
+
+    gt_widths = reference_boxes_x2 - reference_boxes_x1
+    gt_heights = reference_boxes_y2 - reference_boxes_y1
+    gt_ctr_x = reference_boxes_x1 + 0.5 * gt_widths
+    gt_ctr_y = reference_boxes_y1 + 0.5 * gt_heights
+
+    targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths
+    targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights
+    targets_dw = ww * torch.log(gt_widths / ex_widths)
+    targets_dh = wh * torch.log(gt_heights / ex_heights)
+
+    targets = torch.cat((targets_dx, targets_dy, targets_dw, targets_dh), dim=1)
+    return targets
+
+
+class BoxCoder:
+    """
+    This class encodes and decodes a set of bounding boxes into
+    the representation used for training the regressors.
+    """
+
+    def __init__(
+        self, weights: tuple[float, float, float, float], bbox_xform_clip: float = math.log(1000.0 / 16)
+    ) -> None:
+        """
+        Args:
+            weights (4-element tuple)
+            bbox_xform_clip (float)
+        """
+        self.weights = weights
+        self.bbox_xform_clip = bbox_xform_clip
+
+    def encode(self, reference_boxes: list[Tensor], proposals: list[Tensor]) -> list[Tensor]:
+        boxes_per_image = [len(b) for b in reference_boxes]
+        reference_boxes = torch.cat(reference_boxes, dim=0)
+        proposals = torch.cat(proposals, dim=0)
+        targets = self.encode_single(reference_boxes, proposals)
+        return targets.split(boxes_per_image, 0)
+
+    def encode_single(self, reference_boxes: Tensor, proposals: Tensor) -> Tensor:
+        """
+        Encode a set of proposals with respect to some
+        reference boxes
+
+        Args:
+            reference_boxes (Tensor): reference boxes
+            proposals (Tensor): boxes to be encoded
+        """
+        dtype = reference_boxes.dtype
+        device = reference_boxes.device
+        weights = torch.as_tensor(self.weights, dtype=dtype, device=device)
+        targets = encode_boxes(reference_boxes, proposals, weights)
+
+        return targets
+
+    def decode(self, rel_codes: Tensor, boxes: list[Tensor]) -> Tensor:
+        torch._assert(
+            isinstance(boxes, (list, tuple)),
+            "This function expects boxes of type list or tuple.",
+        )
+        torch._assert(
+            isinstance(rel_codes, torch.Tensor),
+            "This function expects rel_codes of type torch.Tensor.",
+        )
+        boxes_per_image = [b.size(0) for b in boxes]
+        concat_boxes = torch.cat(boxes, dim=0)
+        box_sum = 0
+        for val in boxes_per_image:
+            box_sum += val
+        if box_sum > 0:
+            rel_codes = rel_codes.reshape(box_sum, -1)
+        pred_boxes = self.decode_single(rel_codes, concat_boxes)
+        if box_sum > 0:
+            pred_boxes = pred_boxes.reshape(box_sum, -1, 4)
+        return pred_boxes
+
+    def decode_single(self, rel_codes: Tensor, boxes: Tensor) -> Tensor:
+        """
+        From a set of original boxes and encoded relative box offsets,
+        get the decoded boxes.
+
+        Args:
+            rel_codes (Tensor): encoded boxes
+            boxes (Tensor): reference boxes.
+        """
+
+        boxes = boxes.to(rel_codes.dtype)
+
+        widths = boxes[:, 2] - boxes[:, 0]
+        heights = boxes[:, 3] - boxes[:, 1]
+        ctr_x = boxes[:, 0] + 0.5 * widths
+        ctr_y = boxes[:, 1] + 0.5 * heights
+
+        wx, wy, ww, wh = self.weights
+        dx = rel_codes[:, 0::4] / wx
+        dy = rel_codes[:, 1::4] / wy
+        dw = rel_codes[:, 2::4] / ww
+        dh = rel_codes[:, 3::4] / wh
+
+        # Prevent sending too large values into torch.exp()
+        dw = torch.clamp(dw, max=self.bbox_xform_clip)
+        dh = torch.clamp(dh, max=self.bbox_xform_clip)
+
+        pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
+        pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
+        pred_w = torch.exp(dw) * widths[:, None]
+        pred_h = torch.exp(dh) * heights[:, None]
+
+        # Distance from center to box's corner.
+        c_to_c_h = torch.tensor(0.5, dtype=pred_ctr_y.dtype, device=pred_h.device) * pred_h
+        c_to_c_w = torch.tensor(0.5, dtype=pred_ctr_x.dtype, device=pred_w.device) * pred_w
+
+        pred_boxes1 = pred_ctr_x - c_to_c_w
+        pred_boxes2 = pred_ctr_y - c_to_c_h
+        pred_boxes3 = pred_ctr_x + c_to_c_w
+        pred_boxes4 = pred_ctr_y + c_to_c_h
+        pred_boxes = torch.stack((pred_boxes1, pred_boxes2, pred_boxes3, pred_boxes4), dim=2).flatten(1)
+        return pred_boxes
+
+
+class BoxLinearCoder:
+    """
+    The linear box-to-box transform defined in FCOS. The transformation is parameterized
+    by the distance from the center of (square) src box to 4 edges of the target box.
+    """
+
+    def __init__(self, normalize_by_size: bool = True) -> None:
+        """
+        Args:
+            normalize_by_size (bool): normalize deltas by the size of src (anchor) boxes.
+        """
+        self.normalize_by_size = normalize_by_size
+
+    def encode(self, reference_boxes: Tensor, proposals: Tensor) -> Tensor:
+        """
+        Encode a set of proposals with respect to some reference boxes
+
+        Args:
+            reference_boxes (Tensor): reference boxes
+            proposals (Tensor): boxes to be encoded
+
+        Returns:
+            Tensor: the encoded relative box offsets that can be used to
+            decode the boxes.
+
+        """
+
+        # get the center of reference_boxes
+        reference_boxes_ctr_x = 0.5 * (reference_boxes[..., 0] + reference_boxes[..., 2])
+        reference_boxes_ctr_y = 0.5 * (reference_boxes[..., 1] + reference_boxes[..., 3])
+
+        # get box regression transformation deltas
+        target_l = reference_boxes_ctr_x - proposals[..., 0]
+        target_t = reference_boxes_ctr_y - proposals[..., 1]
+        target_r = proposals[..., 2] - reference_boxes_ctr_x
+        target_b = proposals[..., 3] - reference_boxes_ctr_y
+
+        targets = torch.stack((target_l, target_t, target_r, target_b), dim=-1)
+
+        if self.normalize_by_size:
+            reference_boxes_w = reference_boxes[..., 2] - reference_boxes[..., 0]
+            reference_boxes_h = reference_boxes[..., 3] - reference_boxes[..., 1]
+            reference_boxes_size = torch.stack(
+                (reference_boxes_w, reference_boxes_h, reference_boxes_w, reference_boxes_h), dim=-1
+            )
+            targets = targets / reference_boxes_size
+        return targets
+
+    def decode(self, rel_codes: Tensor, boxes: Tensor) -> Tensor:
+        """
+        From a set of original boxes and encoded relative box offsets,
+        get the decoded boxes.
+
+        Args:
+            rel_codes (Tensor): encoded boxes
+            boxes (Tensor): reference boxes.
+
+        Returns:
+            Tensor: the predicted boxes with the encoded relative box offsets.
+
+        .. note::
+            This method assumes that ``rel_codes`` and ``boxes`` have same size for 0th dimension. i.e. ``len(rel_codes) == len(boxes)``.
+
+        """
+
+        boxes = boxes.to(dtype=rel_codes.dtype)
+
+        ctr_x = 0.5 * (boxes[..., 0] + boxes[..., 2])
+        ctr_y = 0.5 * (boxes[..., 1] + boxes[..., 3])
+
+        if self.normalize_by_size:
+            boxes_w = boxes[..., 2] - boxes[..., 0]
+            boxes_h = boxes[..., 3] - boxes[..., 1]
+
+            list_box_size = torch.stack((boxes_w, boxes_h, boxes_w, boxes_h), dim=-1)
+            rel_codes = rel_codes * list_box_size
+
+        pred_boxes1 = ctr_x - rel_codes[..., 0]
+        pred_boxes2 = ctr_y - rel_codes[..., 1]
+        pred_boxes3 = ctr_x + rel_codes[..., 2]
+        pred_boxes4 = ctr_y + rel_codes[..., 3]
+
+        pred_boxes = torch.stack((pred_boxes1, pred_boxes2, pred_boxes3, pred_boxes4), dim=-1)
+        return pred_boxes
+
+
+class Matcher:
+    """
+    This class assigns to each predicted "element" (e.g., a box) a ground-truth
+    element. Each predicted element will have exactly zero or one matches; each
+    ground-truth element may be assigned to zero or more predicted elements.
+
+    Matching is based on the MxN match_quality_matrix, that characterizes how well
+    each (ground-truth, predicted)-pair match. For example, if the elements are
+    boxes, the matrix may contain box IoU overlap values.
+
+    The matcher returns a tensor of size N containing the index of the ground-truth
+    element m that matches to prediction n. If there is no match, a negative value
+    is returned.
+    """
+
+    BELOW_LOW_THRESHOLD = -1
+    BETWEEN_THRESHOLDS = -2
+
+    __annotations__ = {
+        "BELOW_LOW_THRESHOLD": int,
+        "BETWEEN_THRESHOLDS": int,
+    }
+
+    def __init__(self, high_threshold: float, low_threshold: float, allow_low_quality_matches: bool = False) -> None:
+        """
+        Args:
+            high_threshold (float): quality values greater than or equal to
+                this value are candidate matches.
+            low_threshold (float): a lower quality threshold used to stratify
+                matches into three levels:
+                1) matches >= high_threshold
+                2) BETWEEN_THRESHOLDS matches in [low_threshold, high_threshold)
+                3) BELOW_LOW_THRESHOLD matches in [0, low_threshold)
+            allow_low_quality_matches (bool): if True, produce additional matches
+                for predictions that have only low-quality match candidates. See
+                set_low_quality_matches_ for more details.
+        """
+        self.BELOW_LOW_THRESHOLD = -1
+        self.BETWEEN_THRESHOLDS = -2
+        torch._assert(low_threshold <= high_threshold, "low_threshold should be <= high_threshold")
+        self.high_threshold = high_threshold
+        self.low_threshold = low_threshold
+        self.allow_low_quality_matches = allow_low_quality_matches
+
+    def __call__(self, match_quality_matrix: Tensor) -> Tensor:
+        """
+        Args:
+            match_quality_matrix (Tensor[float]): an MxN tensor, containing the
+            pairwise quality between M ground-truth elements and N predicted elements.
+
+        Returns:
+            matches (Tensor[int64]): an N tensor where N[i] is a matched gt in
+            [0, M - 1] or a negative value indicating that prediction i could not
+            be matched.
+        """
+        if match_quality_matrix.numel() == 0:
+            # empty targets or proposals not supported during training
+            if match_quality_matrix.shape[0] == 0:
+                raise ValueError("No ground-truth boxes available for one of the images during training")
+            else:
+                raise ValueError("No proposal boxes available for one of the images during training")
+
+        # match_quality_matrix is M (gt) x N (predicted)
+        # Max over gt elements (dim 0) to find best gt candidate for each prediction
+        matched_vals, matches = match_quality_matrix.max(dim=0)
+        if self.allow_low_quality_matches:
+            all_matches = matches.clone()
+        else:
+            all_matches = None  # type: ignore[assignment]
+
+        # Assign candidate matches with low quality to negative (unassigned) values
+        below_low_threshold = matched_vals < self.low_threshold
+        between_thresholds = (matched_vals >= self.low_threshold) & (matched_vals < self.high_threshold)
+        matches[below_low_threshold] = self.BELOW_LOW_THRESHOLD
+        matches[between_thresholds] = self.BETWEEN_THRESHOLDS
+
+        if self.allow_low_quality_matches:
+            if all_matches is None:
+                torch._assert(False, "all_matches should not be None")
+            else:
+                self.set_low_quality_matches_(matches, all_matches, match_quality_matrix)
+
+        return matches
+
+    def set_low_quality_matches_(self, matches: Tensor, all_matches: Tensor, match_quality_matrix: Tensor) -> None:
+        """
+        Produce additional matches for predictions that have only low-quality matches.
+        Specifically, for each ground-truth find the set of predictions that have
+        maximum overlap with it (including ties); for each prediction in that set, if
+        it is unmatched, then match it to the ground-truth with which it has the highest
+        quality value.
+        """
+        # For each gt, find the prediction with which it has the highest quality
+        highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
+        # Find the highest quality match available, even if it is low, including ties
+        gt_pred_pairs_of_highest_quality = torch.where(match_quality_matrix == highest_quality_foreach_gt[:, None])
+        # Example gt_pred_pairs_of_highest_quality:
+        # (tensor([0, 1, 1, 2, 2, 3, 3, 4, 5, 5]),
+        #  tensor([39796, 32055, 32070, 39190, 40255, 40390, 41455, 45470, 45325, 46390]))
+        # Each element in the first tensor is a gt index, and each element in second tensor is a prediction index
+        # Note how gt items 1, 2, 3, and 5 each have two ties
+
+        pred_inds_to_update = gt_pred_pairs_of_highest_quality[1]
+        matches[pred_inds_to_update] = all_matches[pred_inds_to_update]
+
+
+class SSDMatcher(Matcher):
+    def __init__(self, threshold: float) -> None:
+        super().__init__(threshold, threshold, allow_low_quality_matches=False)
+
+    def __call__(self, match_quality_matrix: Tensor) -> Tensor:
+        matches = super().__call__(match_quality_matrix)
+
+        # For each gt, find the prediction with which it has the highest quality
+        _, highest_quality_pred_foreach_gt = match_quality_matrix.max(dim=1)
+        matches[highest_quality_pred_foreach_gt] = torch.arange(
+            highest_quality_pred_foreach_gt.size(0), dtype=torch.int64, device=highest_quality_pred_foreach_gt.device
+        )
+
+        return matches
+
+
+def overwrite_eps(model: nn.Module, eps: float) -> None:
+    """
+    This method overwrites the default eps values of all the
+    FrozenBatchNorm2d layers of the model with the provided value.
+    This is necessary to address the BC-breaking change introduced
+    by the bug-fix at pytorch/vision#2933. The overwrite is applied
+    only when the pretrained weights are loaded to maintain compatibility
+    with previous versions.
+
+    Args:
+        model (nn.Module): The model on which we perform the overwrite.
+        eps (float): The new value of eps.
+    """
+    for module in model.modules():
+        if isinstance(module, FrozenBatchNorm2d):
+            module.eps = eps
+
+
+def retrieve_out_channels(model: nn.Module, size: tuple[int, int]) -> list[int]:
+    """
+    This method retrieves the number of output channels of a specific model.
+
+    Args:
+        model (nn.Module): The model for which we estimate the out_channels.
+            It should return a single Tensor or an OrderedDict[Tensor].
+        size (Tuple[int, int]): The size (wxh) of the input.
+
+    Returns:
+        out_channels (List[int]): A list of the output channels of the model.
+    """
+    in_training = model.training
+    model.eval()
+
+    with torch.no_grad():
+        # Use dummy data to retrieve the feature map sizes to avoid hard-coding their values
+        device = next(model.parameters()).device
+        tmp_img = torch.zeros((1, 3, size[1], size[0]), device=device)
+        features = model(tmp_img)
+        if isinstance(features, torch.Tensor):
+            features = OrderedDict([("0", features)])
+        out_channels = [x.size(1) for x in features.values()]
+
+    if in_training:
+        model.train()
+
+    return out_channels
+
+
+@torch.jit.unused
+def _fake_cast_onnx(v: Tensor) -> int:
+    return v  # type: ignore[return-value]
+
+
+def _topk_min(input: Tensor, orig_kval: int, axis: int) -> int:
+    """
+    ONNX spec requires the k-value to be less than or equal to the number of inputs along
+    provided dim. Certain models use the number of elements along a particular axis instead of K
+    if K exceeds the number of elements along that axis. Previously, python's min() function was
+    used to determine whether to use the provided k-value or the specified dim axis value.
+
+    However, in cases where the model is being exported in tracing mode, python min() is
+    static causing the model to be traced incorrectly and eventually fail at the topk node.
+    In order to avoid this situation, in tracing mode, torch.min() is used instead.
+
+    Args:
+        input (Tensor): The original input tensor.
+        orig_kval (int): The provided k-value.
+        axis(int): Axis along which we retrieve the input size.
+
+    Returns:
+        min_kval (int): Appropriately selected k-value.
+    """
+    if not torch.jit.is_tracing():
+        return min(orig_kval, input.size(axis))
+    axis_dim_val = torch._shape_as_tensor(input)[axis].unsqueeze(0)
+    min_kval = torch.min(torch.cat((torch.tensor([orig_kval], dtype=axis_dim_val.dtype), axis_dim_val), 0))
+    return _fake_cast_onnx(min_kval)
+
+
+def _box_loss(
+    type: str,
+    box_coder: BoxCoder,
+    anchors_per_image: Tensor,
+    matched_gt_boxes_per_image: Tensor,
+    bbox_regression_per_image: Tensor,
+    cnf: Optional[dict[str, float]] = None,
+) -> Tensor:
+    torch._assert(type in ["l1", "smooth_l1", "ciou", "diou", "giou"], f"Unsupported loss: {type}")
+
+    if type == "l1":
+        target_regression = box_coder.encode_single(matched_gt_boxes_per_image, anchors_per_image)
+        return F.l1_loss(bbox_regression_per_image, target_regression, reduction="sum")
+    elif type == "smooth_l1":
+        target_regression = box_coder.encode_single(matched_gt_boxes_per_image, anchors_per_image)
+        beta = cnf["beta"] if cnf is not None and "beta" in cnf else 1.0
+        return F.smooth_l1_loss(bbox_regression_per_image, target_regression, reduction="sum", beta=beta)
+    else:
+        bbox_per_image = box_coder.decode_single(bbox_regression_per_image, anchors_per_image)
+        eps = cnf["eps"] if cnf is not None and "eps" in cnf else 1e-7
+        if type == "ciou":
+            return complete_box_iou_loss(bbox_per_image, matched_gt_boxes_per_image, reduction="sum", eps=eps)
+        if type == "diou":
+            return distance_box_iou_loss(bbox_per_image, matched_gt_boxes_per_image, reduction="sum", eps=eps)
+        # otherwise giou
+        return generalized_box_iou_loss(bbox_per_image, matched_gt_boxes_per_image, reduction="sum", eps=eps)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/anchor_utils.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/anchor_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..05aa7664beadfd60dc572831fa759eca10093fad
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/anchor_utils.py
@@ -0,0 +1,268 @@
+import math
+from typing import Optional
+
+import torch
+from torch import nn, Tensor
+
+from .image_list import ImageList
+
+
+class AnchorGenerator(nn.Module):
+    """
+    Module that generates anchors for a set of feature maps and
+    image sizes.
+
+    The module supports computing anchors at multiple sizes and aspect ratios
+    per feature map. This module assumes aspect ratio = height / width for
+    each anchor.
+
+    sizes and aspect_ratios should have the same number of elements, and it should
+    correspond to the number of feature maps.
+
+    sizes[i] and aspect_ratios[i] can have an arbitrary number of elements,
+    and AnchorGenerator will output a set of len(sizes[i]) * len(aspect_ratios[i]) anchors
+    per spatial location for feature map i.
+
+    Args:
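+        sizes (Tuple[Tuple[int]]): anchor sizes, one inner tuple per feature map.
+        aspect_ratios (Tuple[Tuple[float]]): anchor aspect ratios (height / width), one inner tuple per feature map.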
+    """
+
+    __annotations__ = {
+        "cell_anchors": list[torch.Tensor],
+    }
+
+    def __init__(
+        self,
+        sizes=((128, 256, 512),),
+        aspect_ratios=((0.5, 1.0, 2.0),),
+    ):
+        super().__init__()
+
+        if not isinstance(sizes[0], (list, tuple)):
+            # TODO change this
+            sizes = tuple((s,) for s in sizes)
+        if not isinstance(aspect_ratios[0], (list, tuple)):
+            aspect_ratios = (aspect_ratios,) * len(sizes)
+
+        self.sizes = sizes
+        self.aspect_ratios = aspect_ratios
+        self.cell_anchors = [
+            self.generate_anchors(size, aspect_ratio) for size, aspect_ratio in zip(sizes, aspect_ratios)
+        ]
+
+    # TODO: https://github.com/pytorch/pytorch/issues/26792
+    # For every (aspect_ratios, scales) combination, output a zero-centered anchor with those values.
+    # (scales, aspect_ratios) are usually an element of zip(self.sizes, self.aspect_ratios)
+    # This method assumes aspect ratio = height / width for an anchor.
+    def generate_anchors(
+        self,
+        scales: list[int],
+        aspect_ratios: list[float],
+        dtype: torch.dtype = torch.float32,
+        device: torch.device = torch.device("cpu"),
+    ) -> Tensor:
+        scales = torch.as_tensor(scales, dtype=dtype, device=device)
+        aspect_ratios = torch.as_tensor(aspect_ratios, dtype=dtype, device=device)
+        h_ratios = torch.sqrt(aspect_ratios)
+        w_ratios = 1 / h_ratios
+
+        ws = (w_ratios[:, None] * scales[None, :]).view(-1)
+        hs = (h_ratios[:, None] * scales[None, :]).view(-1)
+
+        base_anchors = torch.stack([-ws, -hs, ws, hs], dim=1) / 2
+        return base_anchors.round()
+
+    def set_cell_anchors(self, dtype: torch.dtype, device: torch.device):
+        self.cell_anchors = [cell_anchor.to(dtype=dtype, device=device) for cell_anchor in self.cell_anchors]
+
+    def num_anchors_per_location(self) -> list[int]:
+        return [len(s) * len(a) for s, a in zip(self.sizes, self.aspect_ratios)]
+
+    # For every combination of (a, (g, s), i) in (self.cell_anchors, zip(grid_sizes, strides), 0:2),
+    # output g[i] anchors that are s[i] distance apart in direction i, with the same dimensions as a.
+    def grid_anchors(self, grid_sizes: list[list[int]], strides: list[list[Tensor]]) -> list[Tensor]:
+        anchors = []
+        cell_anchors = self.cell_anchors
+        torch._assert(cell_anchors is not None, "cell_anchors should not be None")
+        torch._assert(
+            len(grid_sizes) == len(strides) == len(cell_anchors),
+            "Anchors should be Tuple[Tuple[int]] because each feature "
+            "map could potentially have different sizes and aspect ratios. "
+            "There needs to be a match between the number of "
+            "feature maps passed and the number of sizes / aspect ratios specified.",
+        )
+
+        for size, stride, base_anchors in zip(grid_sizes, strides, cell_anchors):
+            grid_height, grid_width = size
+            stride_height, stride_width = stride
+            device = base_anchors.device
+
+            # For output anchor, compute [x_center, y_center, x_center, y_center]
+            shifts_x = torch.arange(0, grid_width, dtype=torch.int32, device=device) * stride_width
+            shifts_y = torch.arange(0, grid_height, dtype=torch.int32, device=device) * stride_height
+            shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x, indexing="ij")
+            shift_x = shift_x.reshape(-1)
+            shift_y = shift_y.reshape(-1)
+            shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1)
+
+            # For every (base anchor, output anchor) pair,
+            # offset each zero-centered base anchor by the center of the output anchor.
+            anchors.append((shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4))
+
+        return anchors
+
+    def forward(self, image_list: ImageList, feature_maps: list[Tensor]) -> list[Tensor]:
+        grid_sizes = [feature_map.shape[-2:] for feature_map in feature_maps]
+        image_size = image_list.tensors.shape[-2:]
+        dtype, device = feature_maps[0].dtype, feature_maps[0].device
+        strides = [
+            [
+                torch.empty((), dtype=torch.int64, device=device).fill_(image_size[0] // g[0]),
+                torch.empty((), dtype=torch.int64, device=device).fill_(image_size[1] // g[1]),
+            ]
+            for g in grid_sizes
+        ]
+        self.set_cell_anchors(dtype, device)
+        anchors_over_all_feature_maps = self.grid_anchors(grid_sizes, strides)
+        anchors: list[list[torch.Tensor]] = []
+        for _ in range(len(image_list.image_sizes)):
+            anchors_in_image = [anchors_per_feature_map for anchors_per_feature_map in anchors_over_all_feature_maps]
+            anchors.append(anchors_in_image)
+        anchors = [torch.cat(anchors_per_image) for anchors_per_image in anchors]
+        return anchors
+
+
+class DefaultBoxGenerator(nn.Module):
+    """
+    This module generates the default boxes of SSD for a set of feature maps and image sizes.
+
+    Args:
+        aspect_ratios (List[List[int]]): A list with all the aspect ratios used in each feature map.
+        min_ratio (float): The minimum scale :math:`\text{s}_{\text{min}}` of the default boxes used in the estimation
+            of the scales of each feature map. It is used only if the ``scales`` parameter is not provided.
+        max_ratio (float): The maximum scale :math:`\text{s}_{\text{max}}`  of the default boxes used in the estimation
+            of the scales of each feature map. It is used only if the ``scales`` parameter is not provided.
+        scales (List[float], optional): The scales of the default boxes. If not provided it will be estimated using
+            the ``min_ratio`` and ``max_ratio`` parameters.
+        steps (List[int], optional): It's a hyper-parameter that affects the tiling of default boxes. If not provided
+            it will be estimated from the data.
+        clip (bool): Whether the standardized values of default boxes should be clipped between 0 and 1. The clipping
+            is applied while the boxes are encoded in format ``(cx, cy, w, h)``.
+    """
+
+    def __init__(
+        self,
+        aspect_ratios: list[list[int]],
+        min_ratio: float = 0.15,
+        max_ratio: float = 0.9,
+        scales: Optional[list[float]] = None,
+        steps: Optional[list[int]] = None,
+        clip: bool = True,
+    ):
+        super().__init__()
+        if steps is not None and len(aspect_ratios) != len(steps):
+            raise ValueError("aspect_ratios and steps should have the same length")
+        self.aspect_ratios = aspect_ratios
+        self.steps = steps
+        self.clip = clip
+        num_outputs = len(aspect_ratios)
+
+        # Estimation of default boxes scales
+        if scales is None:
+            if num_outputs > 1:
+                range_ratio = max_ratio - min_ratio
+                self.scales = [min_ratio + range_ratio * k / (num_outputs - 1.0) for k in range(num_outputs)]
+                self.scales.append(1.0)
+            else:
+                self.scales = [min_ratio, max_ratio]
+        else:
+            self.scales = scales
+
+        self._wh_pairs = self._generate_wh_pairs(num_outputs)
+
+    def _generate_wh_pairs(
+        self, num_outputs: int, dtype: torch.dtype = torch.float32, device: torch.device = torch.device("cpu")
+    ) -> list[Tensor]:
+        _wh_pairs: list[Tensor] = []
+        for k in range(num_outputs):
+            # Adding the 2 default width-height pairs for aspect ratio 1 and scale s'k
+            s_k = self.scales[k]
+            s_prime_k = math.sqrt(self.scales[k] * self.scales[k + 1])
+            wh_pairs = [[s_k, s_k], [s_prime_k, s_prime_k]]
+
+            # Adding 2 pairs for each aspect ratio of the feature map k
+            for ar in self.aspect_ratios[k]:
+                sq_ar = math.sqrt(ar)
+                w = self.scales[k] * sq_ar
+                h = self.scales[k] / sq_ar
+                wh_pairs.extend([[w, h], [h, w]])
+
+            _wh_pairs.append(torch.as_tensor(wh_pairs, dtype=dtype, device=device))
+        return _wh_pairs
+
+    def num_anchors_per_location(self) -> list[int]:
+        # Estimate num of anchors based on aspect ratios: 2 default boxes + 2 * ratios of feature map.
+        return [2 + 2 * len(r) for r in self.aspect_ratios]
+
+    # Default Boxes calculation based on page 6 of SSD paper
+    def _grid_default_boxes(
+        self, grid_sizes: list[list[int]], image_size: list[int], dtype: torch.dtype = torch.float32
+    ) -> Tensor:
+        default_boxes = []
+        for k, f_k in enumerate(grid_sizes):
+            # Now add the default boxes for each width-height pair
+            if self.steps is not None:
+                x_f_k = image_size[1] / self.steps[k]
+                y_f_k = image_size[0] / self.steps[k]
+            else:
+                y_f_k, x_f_k = f_k
+
+            shifts_x = ((torch.arange(0, f_k[1]) + 0.5) / x_f_k).to(dtype=dtype)
+            shifts_y = ((torch.arange(0, f_k[0]) + 0.5) / y_f_k).to(dtype=dtype)
+            shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x, indexing="ij")
+            shift_x = shift_x.reshape(-1)
+            shift_y = shift_y.reshape(-1)
+
+            shifts = torch.stack((shift_x, shift_y) * len(self._wh_pairs[k]), dim=-1).reshape(-1, 2)
+            # Clipping the default boxes while the boxes are encoded in format (cx, cy, w, h)
+            _wh_pair = self._wh_pairs[k].clamp(min=0, max=1) if self.clip else self._wh_pairs[k]
+            wh_pairs = _wh_pair.repeat((f_k[0] * f_k[1]), 1)
+
+            default_box = torch.cat((shifts, wh_pairs), dim=1)
+
+            default_boxes.append(default_box)
+
+        return torch.cat(default_boxes, dim=0)
+
+    def __repr__(self) -> str:
+        s = (
+            f"{self.__class__.__name__}("
+            f"aspect_ratios={self.aspect_ratios}"
+            f", clip={self.clip}"
+            f", scales={self.scales}"
+            f", steps={self.steps}"
+            ")"
+        )
+        return s
+
+    def forward(self, image_list: ImageList, feature_maps: list[Tensor]) -> list[Tensor]:
+        grid_sizes = [feature_map.shape[-2:] for feature_map in feature_maps]
+        image_size = image_list.tensors.shape[-2:]
+        dtype, device = feature_maps[0].dtype, feature_maps[0].device
+        default_boxes = self._grid_default_boxes(grid_sizes, image_size, dtype=dtype)
+        default_boxes = default_boxes.to(device)
+
+        dboxes = []
+        x_y_size = torch.tensor([image_size[1], image_size[0]], device=default_boxes.device)
+        for _ in image_list.image_sizes:
+            dboxes_in_image = default_boxes
+            dboxes_in_image = torch.cat(
+                [
+                    (dboxes_in_image[:, :2] - 0.5 * dboxes_in_image[:, 2:]) * x_y_size,
+                    (dboxes_in_image[:, :2] + 0.5 * dboxes_in_image[:, 2:]) * x_y_size,
+                ],
+                -1,
+            )
+            dboxes.append(dboxes_in_image)
+        return dboxes
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/backbone_utils.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/backbone_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f24c121d59a06186fc104cdfe5634bcd5615cf7e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/backbone_utils.py
@@ -0,0 +1,244 @@
+import warnings
+from typing import Callable, Optional, Union
+
+from torch import nn, Tensor
+from torchvision.ops import misc as misc_nn_ops
+from torchvision.ops.feature_pyramid_network import ExtraFPNBlock, FeaturePyramidNetwork, LastLevelMaxPool
+
+from .. import mobilenet, resnet
+from .._api import _get_enum_from_fn, WeightsEnum
+from .._utils import handle_legacy_interface, IntermediateLayerGetter
+
+
+class BackboneWithFPN(nn.Module):
+    """
+    Adds a FPN on top of a model.
+    Internally, it uses torchvision.models._utils.IntermediateLayerGetter to
+    extract a submodel that returns the feature maps specified in return_layers.
+    The same limitations of IntermediateLayerGetter apply here.
+    Args:
+        backbone (nn.Module)
+        return_layers (Dict[name, new_name]): a dict containing the names
+            of the modules for which the activations will be returned as
+            the key of the dict, and the value of the dict is the name
+            of the returned activation (which the user can specify).
+        in_channels_list (List[int]): number of channels for each feature map
+            that is returned, in the order they are present in the OrderedDict
+        out_channels (int): number of channels in the FPN.
+        norm_layer (callable, optional): Module specifying the normalization layer to use. Default: None
+    Attributes:
+        out_channels (int): the number of channels in the FPN
+    """
+
+    def __init__(
+        self,
+        backbone: nn.Module,
+        return_layers: dict[str, str],
+        in_channels_list: list[int],
+        out_channels: int,
+        extra_blocks: Optional[ExtraFPNBlock] = None,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        super().__init__()
+
+        if extra_blocks is None:
+            extra_blocks = LastLevelMaxPool()
+
+        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
+        self.fpn = FeaturePyramidNetwork(
+            in_channels_list=in_channels_list,
+            out_channels=out_channels,
+            extra_blocks=extra_blocks,
+            norm_layer=norm_layer,
+        )
+        self.out_channels = out_channels
+
+    def forward(self, x: Tensor) -> dict[str, Tensor]:
+        x = self.body(x)
+        x = self.fpn(x)
+        return x
+
+
+@handle_legacy_interface(
+    weights=(
+        "pretrained",
+        lambda kwargs: _get_enum_from_fn(resnet.__dict__[kwargs["backbone_name"]])["IMAGENET1K_V1"],
+    ),
+)
+def resnet_fpn_backbone(
+    *,
+    backbone_name: str,
+    weights: Optional[WeightsEnum],
+    norm_layer: Callable[..., nn.Module] = misc_nn_ops.FrozenBatchNorm2d,
+    trainable_layers: int = 3,
+    returned_layers: Optional[list[int]] = None,
+    extra_blocks: Optional[ExtraFPNBlock] = None,
+) -> BackboneWithFPN:
+    """
+    Constructs a specified ResNet backbone with FPN on top. Freezes the specified number of layers in the backbone.
+
+    Examples::
+
+        >>> import torch
+        >>> from torchvision.models import ResNet50_Weights
+        >>> from torchvision.models.detection.backbone_utils import resnet_fpn_backbone
+        >>> backbone = resnet_fpn_backbone(backbone_name='resnet50', weights=ResNet50_Weights.DEFAULT, trainable_layers=3)
+        >>> # get some dummy image
+        >>> x = torch.rand(1,3,64,64)
+        >>> # compute the output
+        >>> output = backbone(x)
+        >>> print([(k, v.shape) for k, v in output.items()])
+        >>> # returns
+        >>>   [('0', torch.Size([1, 256, 16, 16])),
+        >>>    ('1', torch.Size([1, 256, 8, 8])),
+        >>>    ('2', torch.Size([1, 256, 4, 4])),
+        >>>    ('3', torch.Size([1, 256, 2, 2])),
+        >>>    ('pool', torch.Size([1, 256, 1, 1]))]
+
+    Args:
+        backbone_name (string): resnet architecture. Possible values are 'resnet18', 'resnet34', 'resnet50',
+             'resnet101', 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d', 'wide_resnet50_2', 'wide_resnet101_2'
+        weights (WeightsEnum, optional): The pretrained weights for the model
+        norm_layer (callable): it is recommended to use the default value. For details visit:
+            (https://github.com/facebookresearch/maskrcnn-benchmark/issues/267)
+        trainable_layers (int): number of trainable (not frozen) layers starting from final block.
+            Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable.
+        returned_layers (list of int): The layers of the network to return. Each entry must be in ``[1, 4]``.
+            By default, all layers are returned.
+        extra_blocks (ExtraFPNBlock or None): if provided, extra operations will
+            be performed. It is expected to take the fpn features, the original
+            features and the names of the original features as input, and returns
+            a new list of feature maps and their corresponding names. By
+            default, a ``LastLevelMaxPool`` is used.
+    """
+    backbone = resnet.__dict__[backbone_name](weights=weights, norm_layer=norm_layer)
+    return _resnet_fpn_extractor(backbone, trainable_layers, returned_layers, extra_blocks)
+
+
+def _resnet_fpn_extractor(
+    backbone: resnet.ResNet,
+    trainable_layers: int,
+    returned_layers: Optional[list[int]] = None,
+    extra_blocks: Optional[ExtraFPNBlock] = None,
+    norm_layer: Optional[Callable[..., nn.Module]] = None,
+) -> BackboneWithFPN:
+
+    # select layers that won't be frozen
+    if trainable_layers < 0 or trainable_layers > 5:
+        raise ValueError(f"Trainable layers should be in the range [0,5], got {trainable_layers}")
+    layers_to_train = ["layer4", "layer3", "layer2", "layer1", "conv1"][:trainable_layers]
+    if trainable_layers == 5:
+        layers_to_train.append("bn1")
+    for name, parameter in backbone.named_parameters():
+        if all([not name.startswith(layer) for layer in layers_to_train]):
+            parameter.requires_grad_(False)
+
+    if extra_blocks is None:
+        extra_blocks = LastLevelMaxPool()
+
+    if returned_layers is None:
+        returned_layers = [1, 2, 3, 4]
+    if min(returned_layers) <= 0 or max(returned_layers) >= 5:
+        raise ValueError(f"Each returned layer should be in the range [1,4]. Got {returned_layers}")
+    return_layers = {f"layer{k}": str(v) for v, k in enumerate(returned_layers)}
+
+    in_channels_stage2 = backbone.inplanes // 8
+    in_channels_list = [in_channels_stage2 * 2 ** (i - 1) for i in returned_layers]
+    out_channels = 256
+    return BackboneWithFPN(
+        backbone, return_layers, in_channels_list, out_channels, extra_blocks=extra_blocks, norm_layer=norm_layer
+    )
+
+
+def _validate_trainable_layers(
+    is_trained: bool,
+    trainable_backbone_layers: Optional[int],
+    max_value: int,
+    default_value: int,
+) -> int:
+    # don't freeze any layers if pretrained model or backbone is not used
+    if not is_trained:
+        if trainable_backbone_layers is not None:
+            warnings.warn(
+                "Changing trainable_backbone_layers has no effect if "
+                "neither pretrained nor pretrained_backbone have been set to True, "
+                f"falling back to trainable_backbone_layers={max_value} so that all layers are trainable"
+            )
+        trainable_backbone_layers = max_value
+
+    # by default freeze first blocks
+    if trainable_backbone_layers is None:
+        trainable_backbone_layers = default_value
+    if trainable_backbone_layers < 0 or trainable_backbone_layers > max_value:
+        raise ValueError(
+            f"Trainable backbone layers should be in the range [0,{max_value}], got {trainable_backbone_layers} "
+        )
+    return trainable_backbone_layers
+
+
+@handle_legacy_interface(
+    weights=(
+        "pretrained",
+        lambda kwargs: _get_enum_from_fn(mobilenet.__dict__[kwargs["backbone_name"]])["IMAGENET1K_V1"],
+    ),
+)
+def mobilenet_backbone(
+    *,
+    backbone_name: str,
+    weights: Optional[WeightsEnum],
+    fpn: bool,
+    norm_layer: Callable[..., nn.Module] = misc_nn_ops.FrozenBatchNorm2d,
+    trainable_layers: int = 2,
+    returned_layers: Optional[list[int]] = None,
+    extra_blocks: Optional[ExtraFPNBlock] = None,
+) -> nn.Module:
+    backbone = mobilenet.__dict__[backbone_name](weights=weights, norm_layer=norm_layer)
+    return _mobilenet_extractor(backbone, fpn, trainable_layers, returned_layers, extra_blocks)
+
+
+def _mobilenet_extractor(
+    backbone: Union[mobilenet.MobileNetV2, mobilenet.MobileNetV3],
+    fpn: bool,
+    trainable_layers: int,
+    returned_layers: Optional[list[int]] = None,
+    extra_blocks: Optional[ExtraFPNBlock] = None,
+    norm_layer: Optional[Callable[..., nn.Module]] = None,
+) -> nn.Module:
+    backbone = backbone.features
+    # Gather the indices of blocks which are strided. These are the locations of C1, ..., Cn-1 blocks.
+    # The first and last blocks are always included because they are the C0 (conv1) and Cn.
+    stage_indices = [0] + [i for i, b in enumerate(backbone) if getattr(b, "_is_cn", False)] + [len(backbone) - 1]
+    num_stages = len(stage_indices)
+
+    # find the index of the layer from which we won't freeze
+    if trainable_layers < 0 or trainable_layers > num_stages:
+        raise ValueError(f"Trainable layers should be in the range [0,{num_stages}], got {trainable_layers} ")
+    freeze_before = len(backbone) if trainable_layers == 0 else stage_indices[num_stages - trainable_layers]
+
+    for b in backbone[:freeze_before]:
+        for parameter in b.parameters():
+            parameter.requires_grad_(False)
+
+    out_channels = 256
+    if fpn:
+        if extra_blocks is None:
+            extra_blocks = LastLevelMaxPool()
+
+        if returned_layers is None:
+            returned_layers = [num_stages - 2, num_stages - 1]
+        if min(returned_layers) < 0 or max(returned_layers) >= num_stages:
+            raise ValueError(f"Each returned layer should be in the range [0,{num_stages - 1}], got {returned_layers} ")
+        return_layers = {f"{stage_indices[k]}": str(v) for v, k in enumerate(returned_layers)}
+
+        in_channels_list = [backbone[stage_indices[i]].out_channels for i in returned_layers]
+        return BackboneWithFPN(
+            backbone, return_layers, in_channels_list, out_channels, extra_blocks=extra_blocks, norm_layer=norm_layer
+        )
+    else:
+        m = nn.Sequential(
+            backbone,
+            # depthwise linear combination of channels to reduce their size
+            nn.Conv2d(backbone[-1].out_channels, out_channels, 1),
+        )
+        m.out_channels = out_channels  # type: ignore[assignment]
+        return m
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/faster_rcnn.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/faster_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6f7063107b66af2ead318677e0c7b0001905eac
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/faster_rcnn.py
@@ -0,0 +1,846 @@
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torchvision.ops import MultiScaleRoIAlign
+
+from ...ops import misc as misc_nn_ops
+from ...transforms._presets import ObjectDetection
+from .._api import register_model, Weights, WeightsEnum
+from .._meta import _COCO_CATEGORIES
+from .._utils import _ovewrite_value_param, handle_legacy_interface
+from ..mobilenetv3 import mobilenet_v3_large, MobileNet_V3_Large_Weights
+from ..resnet import resnet50, ResNet50_Weights
+from ._utils import overwrite_eps
+from .anchor_utils import AnchorGenerator
+from .backbone_utils import _mobilenet_extractor, _resnet_fpn_extractor, _validate_trainable_layers
+from .generalized_rcnn import GeneralizedRCNN
+from .roi_heads import RoIHeads
+from .rpn import RegionProposalNetwork, RPNHead
+from .transform import GeneralizedRCNNTransform
+
+
+__all__ = [  # names exported by this module
+    "FasterRCNN",
+    "FasterRCNN_ResNet50_FPN_Weights",
+    "FasterRCNN_ResNet50_FPN_V2_Weights",
+    "FasterRCNN_MobileNet_V3_Large_FPN_Weights",
+    "FasterRCNN_MobileNet_V3_Large_320_FPN_Weights",
+    "fasterrcnn_resnet50_fpn",
+    "fasterrcnn_resnet50_fpn_v2",
+    "fasterrcnn_mobilenet_v3_large_fpn",
+    "fasterrcnn_mobilenet_v3_large_320_fpn",
+]
+
+
+def _default_anchorgen():  # default RPN anchor generator, used when the caller does not supply one
+    anchor_sizes = ((32,), (64,), (128,), (256,), (512,))  # five groups holding a single anchor size each
+    aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)  # the same three ratios, repeated once per size group
+    return AnchorGenerator(anchor_sizes, aspect_ratios)
+
+
+class FasterRCNN(GeneralizedRCNN):
+    """
+    Implements Faster R-CNN.
+
+    The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
+    image, and should be in 0-1 range. Different images can have different sizes.
+
+    The behavior of the model changes depending on if it is in training or evaluation mode.
+
+    During training, the model expects both the input tensors and targets (list of dictionary),
+    containing:
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (Int64Tensor[N]): the class label for each ground-truth box
+
+    The model returns a Dict[Tensor] during training, containing the classification and regression
+    losses for both the RPN and the R-CNN.
+
+    During inference, the model requires only the input tensors, and returns the post-processed
+    predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
+    follows:
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (Int64Tensor[N]): the predicted labels for each image
+        - scores (Tensor[N]): the scores of each prediction
+
+    Args:
+        backbone (nn.Module): the network used to compute the features for the model.
+            It should contain an out_channels attribute, which indicates the number of output
+            channels that each feature map has (and it should be the same for all feature maps).
+            The backbone should return a single Tensor or an OrderedDict[Tensor].
+        num_classes (int): number of output classes of the model (including the background).
+            If box_predictor is specified, num_classes should be None.
+        min_size (int): Images are rescaled before feeding them to the backbone:
+            we attempt to preserve the aspect ratio and scale the shorter edge
+            to ``min_size``. If the resulting longer edge exceeds ``max_size``,
+            then downscale so that the longer edge does not exceed ``max_size``.
+            This may result in the shorter edge being lower than ``min_size``.
+        max_size (int): See ``min_size``.
+        image_mean (Tuple[float, float, float]): mean values used for input normalization.
+            They are generally the mean values of the dataset on which the backbone has been trained
+            on
+        image_std (Tuple[float, float, float]): std values used for input normalization.
+            They are generally the std values of the dataset on which the backbone has been trained on
+        rpn_anchor_generator (AnchorGenerator): module that generates the anchors for a set of feature
+            maps.
+        rpn_head (nn.Module): module that computes the objectness and regression deltas from the RPN
+        rpn_pre_nms_top_n_train (int): number of proposals to keep before applying NMS during training
+        rpn_pre_nms_top_n_test (int): number of proposals to keep before applying NMS during testing
+        rpn_post_nms_top_n_train (int): number of proposals to keep after applying NMS during training
+        rpn_post_nms_top_n_test (int): number of proposals to keep after applying NMS during testing
+        rpn_nms_thresh (float): NMS threshold used for postprocessing the RPN proposals
+        rpn_fg_iou_thresh (float): minimum IoU between the anchor and the GT box so that they can be
+            considered as positive during training of the RPN.
+        rpn_bg_iou_thresh (float): maximum IoU between the anchor and the GT box so that they can be
+            considered as negative during training of the RPN.
+        rpn_batch_size_per_image (int): number of anchors that are sampled during training of the RPN
+            for computing the loss
+        rpn_positive_fraction (float): proportion of positive anchors in a mini-batch during training
+            of the RPN
+        rpn_score_thresh (float): only return proposals with an objectness score greater than rpn_score_thresh
+        box_roi_pool (MultiScaleRoIAlign): the module which crops and resizes the feature maps in
+            the locations indicated by the bounding boxes
+        box_head (nn.Module): module that takes the cropped feature maps as input
+        box_predictor (nn.Module): module that takes the output of box_head and returns the
+            classification logits and box regression deltas.
+        box_score_thresh (float): during inference, only return proposals with a classification score
+            greater than box_score_thresh
+        box_nms_thresh (float): NMS threshold for the prediction head. Used during inference
+        box_detections_per_img (int): maximum number of detections per image, for all classes.
+        box_fg_iou_thresh (float): minimum IoU between the proposals and the GT box so that they can be
+            considered as positive during training of the classification head
+        box_bg_iou_thresh (float): maximum IoU between the proposals and the GT box so that they can be
+            considered as negative during training of the classification head
+        box_batch_size_per_image (int): number of proposals that are sampled during training of the
+            classification head
+        box_positive_fraction (float): proportion of positive proposals in a mini-batch during training
+            of the classification head
+        bbox_reg_weights (Tuple[float, float, float, float]): weights for the encoding/decoding of the
+            bounding boxes
+
+    Example::
+
+        >>> import torch
+        >>> import torchvision
+        >>> from torchvision.models.detection import FasterRCNN
+        >>> from torchvision.models.detection.rpn import AnchorGenerator
+        >>> # load a pre-trained model for classification and return
+        >>> # only the features
+        >>> backbone = torchvision.models.mobilenet_v2(weights=torchvision.models.MobileNet_V2_Weights.DEFAULT).features
+        >>> # FasterRCNN needs to know the number of
+        >>> # output channels in a backbone. For mobilenet_v2, it's 1280,
+        >>> # so we need to add it here
+        >>> backbone.out_channels = 1280
+        >>>
+        >>> # let's make the RPN generate 5 x 3 anchors per spatial
+        >>> # location, with 5 different sizes and 3 different aspect
+        >>> # ratios. We have a Tuple[Tuple[int]] because each feature
+        >>> # map could potentially have different sizes and
+        >>> # aspect ratios
+        >>> anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
+        >>>                                    aspect_ratios=((0.5, 1.0, 2.0),))
+        >>>
+        >>> # let's define what are the feature maps that we will
+        >>> # use to perform the region of interest cropping, as well as
+        >>> # the size of the crop after rescaling.
+        >>> # if your backbone returns a Tensor, featmap_names is expected to
+        >>> # be ['0']. More generally, the backbone should return an
+        >>> # OrderedDict[Tensor], and in featmap_names you can choose which
+        >>> # feature maps to use.
+        >>> roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],
+        >>>                                                 output_size=7,
+        >>>                                                 sampling_ratio=2)
+        >>>
+        >>> # put the pieces together inside a FasterRCNN model
+        >>> model = FasterRCNN(backbone,
+        >>>                    num_classes=2,
+        >>>                    rpn_anchor_generator=anchor_generator,
+        >>>                    box_roi_pool=roi_pooler)
+        >>> model.eval()
+        >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
+        >>> predictions = model(x)
+    """
+
+    def __init__(
+        self,
+        backbone,
+        num_classes=None,
+        # transform parameters
+        min_size=800,
+        max_size=1333,
+        image_mean=None,
+        image_std=None,
+        # RPN parameters
+        rpn_anchor_generator=None,
+        rpn_head=None,
+        rpn_pre_nms_top_n_train=2000,
+        rpn_pre_nms_top_n_test=1000,
+        rpn_post_nms_top_n_train=2000,
+        rpn_post_nms_top_n_test=1000,
+        rpn_nms_thresh=0.7,
+        rpn_fg_iou_thresh=0.7,
+        rpn_bg_iou_thresh=0.3,
+        rpn_batch_size_per_image=256,
+        rpn_positive_fraction=0.5,
+        rpn_score_thresh=0.0,
+        # Box parameters
+        box_roi_pool=None,
+        box_head=None,
+        box_predictor=None,
+        box_score_thresh=0.05,
+        box_nms_thresh=0.5,
+        box_detections_per_img=100,
+        box_fg_iou_thresh=0.5,
+        box_bg_iou_thresh=0.5,
+        box_batch_size_per_image=512,
+        box_positive_fraction=0.25,
+        bbox_reg_weights=None,
+        **kwargs,  # forwarded unchanged to GeneralizedRCNNTransform
+    ):
+
+        if not hasattr(backbone, "out_channels"):
+            raise ValueError(
+                "backbone should contain an attribute out_channels "
+                "specifying the number of output channels (assumed to be the "
+                "same for all the levels)"
+            )
+
+        if not isinstance(rpn_anchor_generator, (AnchorGenerator, type(None))):
+            raise TypeError(
+                f"rpn_anchor_generator should be of type AnchorGenerator or None instead of {type(rpn_anchor_generator)}"
+            )
+        if not isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None))):
+            raise TypeError(
+                f"box_roi_pool should be of type MultiScaleRoIAlign or None instead of {type(box_roi_pool)}"
+            )
+
+        if num_classes is not None:  # exactly one of num_classes / box_predictor must be provided
+            if box_predictor is not None:
+                raise ValueError("num_classes should be None when box_predictor is specified")
+        else:
+            if box_predictor is None:
+                raise ValueError("num_classes should not be None when box_predictor is not specified")
+
+        out_channels = backbone.out_channels
+
+        if rpn_anchor_generator is None:
+            rpn_anchor_generator = _default_anchorgen()
+        if rpn_head is None:
+            rpn_head = RPNHead(out_channels, rpn_anchor_generator.num_anchors_per_location()[0])  # only the first entry is used
+
+        rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train, testing=rpn_pre_nms_top_n_test)  # keyed by mode name
+        rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train, testing=rpn_post_nms_top_n_test)
+
+        rpn = RegionProposalNetwork(
+            rpn_anchor_generator,
+            rpn_head,
+            rpn_fg_iou_thresh,
+            rpn_bg_iou_thresh,
+            rpn_batch_size_per_image,
+            rpn_positive_fraction,
+            rpn_pre_nms_top_n,
+            rpn_post_nms_top_n,
+            rpn_nms_thresh,
+            score_thresh=rpn_score_thresh,
+        )
+
+        if box_roi_pool is None:
+            box_roi_pool = MultiScaleRoIAlign(featmap_names=["0", "1", "2", "3"], output_size=7, sampling_ratio=2)
+
+        if box_head is None:
+            resolution = box_roi_pool.output_size[0]  # first element of the pooler's output size
+            representation_size = 1024
+            box_head = TwoMLPHead(out_channels * resolution**2, representation_size)
+
+        if box_predictor is None:
+            representation_size = 1024  # NOTE(review): fixed at 1024 even when a custom box_head is passed - confirm widths match
+            box_predictor = FastRCNNPredictor(representation_size, num_classes)
+
+        roi_heads = RoIHeads(
+            # Box
+            box_roi_pool,
+            box_head,
+            box_predictor,
+            box_fg_iou_thresh,
+            box_bg_iou_thresh,
+            box_batch_size_per_image,
+            box_positive_fraction,
+            bbox_reg_weights,
+            box_score_thresh,
+            box_nms_thresh,
+            box_detections_per_img,
+        )
+
+        if image_mean is None:
+            image_mean = [0.485, 0.456, 0.406]  # presumably ImageNet channel means - TODO confirm
+        if image_std is None:
+            image_std = [0.229, 0.224, 0.225]  # presumably ImageNet channel stds - TODO confirm
+        transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std, **kwargs)
+
+        super().__init__(backbone, rpn, roi_heads, transform)
+
+
+class TwoMLPHead(nn.Module):
+    """
+    Standard heads for FPN-based models: two fully connected layers, each followed by a ReLU.
+
+    Args:
+        in_channels (int): number of input features once the input is flattened from dim 1 onwards
+        representation_size (int): output size of both fully connected layers
+    """
+
+    def __init__(self, in_channels, representation_size):
+        super().__init__()
+
+        self.fc6 = nn.Linear(in_channels, representation_size)
+        self.fc7 = nn.Linear(representation_size, representation_size)
+
+    def forward(self, x):
+        x = x.flatten(start_dim=1)  # keep dim 0, collapse every later dim into one
+
+        x = F.relu(self.fc6(x))
+        x = F.relu(self.fc7(x))
+
+        return x
+
+
+class FastRCNNConvFCHead(nn.Sequential):  # conv blocks, then Flatten, then Linear + ReLU pairs
+    def __init__(
+        self,
+        input_size: tuple[int, int, int],
+        conv_layers: list[int],
+        fc_layers: list[int],
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ):
+        """
+        Args:
+            input_size (Tuple[int, int, int]): the input size in CHW format.
+            conv_layers (list): feature dimensions of each Convolution layer
+            fc_layers (list): feature dimensions of each fully connected (Linear) layer
+            norm_layer (callable, optional): Module specifying the normalization layer to use. Default: None
+        """
+        in_channels, in_height, in_width = input_size
+
+        blocks = []
+        previous_channels = in_channels
+        for current_channels in conv_layers:
+            blocks.append(misc_nn_ops.Conv2dNormActivation(previous_channels, current_channels, norm_layer=norm_layer))
+            previous_channels = current_channels
+        blocks.append(nn.Flatten())
+        previous_channels = previous_channels * in_height * in_width  # assumes the conv blocks keep the input height/width
+        for current_channels in fc_layers:
+            blocks.append(nn.Linear(previous_channels, current_channels))
+            blocks.append(nn.ReLU(inplace=True))
+            previous_channels = current_channels
+
+        super().__init__(*blocks)
+        for layer in self.modules():  # re-initialise every Conv2d found among the submodules
+            if isinstance(layer, nn.Conv2d):
+                nn.init.kaiming_normal_(layer.weight, mode="fan_out", nonlinearity="relu")
+                if layer.bias is not None:
+                    nn.init.zeros_(layer.bias)
+
+
+class FastRCNNPredictor(nn.Module):
+    """
+    Standard classification + bounding box regression layers
+    for Fast R-CNN.
+
+    Args:
+        in_channels (int): number of input channels
+        num_classes (int): number of output classes (including background)
+    """
+
+    def __init__(self, in_channels, num_classes):
+        super().__init__()
+        self.cls_score = nn.Linear(in_channels, num_classes)
+        self.bbox_pred = nn.Linear(in_channels, num_classes * 4)  # four regression outputs per class
+
+    def forward(self, x):
+        if x.dim() == 4:  # a 4-D input is accepted only when its last two dims are [1, 1]
+            torch._assert(
+                list(x.shape[2:]) == [1, 1],
+                f"x has the wrong shape, expecting the last two dimensions to be [1,1] instead of {list(x.shape[2:])}",
+            )
+        x = x.flatten(start_dim=1)
+        scores = self.cls_score(x)
+        bbox_deltas = self.bbox_pred(x)
+
+        return scores, bbox_deltas  # both heads read the same flattened input
+
+
+_COMMON_META = {  # spread into the ``meta`` dict of every Weights entry defined in this file
+    "categories": _COCO_CATEGORIES,
+    "min_size": (1, 1),
+}
+
+
+class FasterRCNN_ResNet50_FPN_Weights(WeightsEnum):  # weights accepted by fasterrcnn_resnet50_fpn(weights=...)
+    COCO_V1 = Weights(
+        url="https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth",
+        transforms=ObjectDetection,
+        meta={
+            **_COMMON_META,  # shared "categories" and "min_size" entries
+            "num_params": 41755286,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/detection#faster-r-cnn-resnet-50-fpn",
+            "_metrics": {
+                "COCO-val2017": {
+                    "box_map": 37.0,
+                }
+            },
+            "_ops": 134.38,
+            "_file_size": 159.743,
+            "_docs": """These weights were produced by following a similar training recipe as on the paper.""",
+        },
+    )
+    DEFAULT = COCO_V1  # DEFAULT aliases the only entry, COCO_V1
+
+
+class FasterRCNN_ResNet50_FPN_V2_Weights(WeightsEnum):  # weights accepted by fasterrcnn_resnet50_fpn_v2(weights=...)
+    COCO_V1 = Weights(
+        url="https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_v2_coco-dd69338a.pth",
+        transforms=ObjectDetection,
+        meta={
+            **_COMMON_META,  # shared "categories" and "min_size" entries
+            "num_params": 43712278,
+            "recipe": "https://github.com/pytorch/vision/pull/5763",
+            "_metrics": {
+                "COCO-val2017": {
+                    "box_map": 46.7,
+                }
+            },
+            "_ops": 280.371,
+            "_file_size": 167.104,
+            "_docs": """These weights were produced using an enhanced training recipe to boost the model accuracy.""",
+        },
+    )
+    DEFAULT = COCO_V1  # DEFAULT aliases the only entry, COCO_V1
+
+
+class FasterRCNN_MobileNet_V3_Large_FPN_Weights(WeightsEnum):  # one of the two enums _fasterrcnn_mobilenet_v3_large_fpn accepts
+    COCO_V1 = Weights(
+        url="https://download.pytorch.org/models/fasterrcnn_mobilenet_v3_large_fpn-fb6a3cc7.pth",
+        transforms=ObjectDetection,
+        meta={
+            **_COMMON_META,  # shared "categories" and "min_size" entries
+            "num_params": 19386354,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/detection#faster-r-cnn-mobilenetv3-large-fpn",
+            "_metrics": {
+                "COCO-val2017": {
+                    "box_map": 32.8,
+                }
+            },
+            "_ops": 4.494,
+            "_file_size": 74.239,
+            "_docs": """These weights were produced by following a similar training recipe as on the paper.""",
+        },
+    )
+    DEFAULT = COCO_V1  # DEFAULT aliases the only entry, COCO_V1
+
+
+class FasterRCNN_MobileNet_V3_Large_320_FPN_Weights(WeightsEnum):  # weights for fasterrcnn_mobilenet_v3_large_320_fpn
+    COCO_V1 = Weights(
+        url="https://download.pytorch.org/models/fasterrcnn_mobilenet_v3_large_320_fpn-907ea3f9.pth",
+        transforms=ObjectDetection,
+        meta={
+            **_COMMON_META,  # shared "categories" and "min_size" entries
+            "num_params": 19386354,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/detection#faster-r-cnn-mobilenetv3-large-320-fpn",
+            "_metrics": {
+                "COCO-val2017": {
+                    "box_map": 22.8,
+                }
+            },
+            "_ops": 0.719,
+            "_file_size": 74.239,
+            "_docs": """These weights were produced by following a similar training recipe as on the paper.""",
+        },
+    )
+    DEFAULT = COCO_V1  # DEFAULT aliases the only entry, COCO_V1
+
+
+@register_model()
+@handle_legacy_interface(
+    weights=("pretrained", FasterRCNN_ResNet50_FPN_Weights.COCO_V1),
+    weights_backbone=("pretrained_backbone", ResNet50_Weights.IMAGENET1K_V1),
+)
+def fasterrcnn_resnet50_fpn(
+    *,
+    weights: Optional[FasterRCNN_ResNet50_FPN_Weights] = None,
+    progress: bool = True,
+    num_classes: Optional[int] = None,
+    weights_backbone: Optional[ResNet50_Weights] = ResNet50_Weights.IMAGENET1K_V1,
+    trainable_backbone_layers: Optional[int] = None,
+    **kwargs: Any,
+) -> FasterRCNN:
+    """
+    Faster R-CNN model with a ResNet-50-FPN backbone from the `Faster R-CNN: Towards Real-Time Object
+    Detection with Region Proposal Networks <https://arxiv.org/abs/1506.01497>`__
+    paper.
+
+    .. betastatus:: detection module
+
+    The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
+    image, and should be in ``0-1`` range. Different images can have different sizes.
+
+    The behavior of the model changes depending on if it is in training or evaluation mode.
+
+    During training, the model expects both the input tensors and targets (list of dictionaries),
+    containing:
+
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (``Int64Tensor[N]``): the class label for each ground-truth box
+
+    The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
+    losses for both the RPN and the R-CNN.
+
+    During inference, the model requires only the input tensors, and returns the post-processed
+    predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
+    follows, where ``N`` is the number of detections:
+
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (``Int64Tensor[N]``): the predicted labels for each detection
+        - scores (``Tensor[N]``): the scores of each detection
+
+    For more details on the output, you may refer to :ref:`instance_seg_output`.
+
+    Faster R-CNN is exportable to ONNX for a fixed batch size with input images of fixed size.
+
+    Example::
+
+        >>> model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=FasterRCNN_ResNet50_FPN_Weights.DEFAULT)
+        >>> # For training
+        >>> images, boxes = torch.rand(4, 3, 600, 1200), torch.rand(4, 11, 4)
+        >>> boxes[:, :, 2:4] = boxes[:, :, 0:2] + boxes[:, :, 2:4]
+        >>> labels = torch.randint(1, 91, (4, 11))
+        >>> images = list(image for image in images)
+        >>> targets = []
+        >>> for i in range(len(images)):
+        >>>     d = {}
+        >>>     d['boxes'] = boxes[i]
+        >>>     d['labels'] = labels[i]
+        >>>     targets.append(d)
+        >>> output = model(images, targets)
+        >>> # For inference
+        >>> model.eval()
+        >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
+        >>> predictions = model(x)
+        >>>
+        >>> # optionally, if you want to export the model to ONNX:
+        >>> torch.onnx.export(model, x, "faster_rcnn.onnx", opset_version = 11)
+
+    Args:
+        weights (:class:`~torchvision.models.detection.FasterRCNN_ResNet50_FPN_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.detection.FasterRCNN_ResNet50_FPN_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        num_classes (int, optional): number of output classes of the model (including the background)
+        weights_backbone (:class:`~torchvision.models.ResNet50_Weights`, optional): The
+            pretrained weights for the backbone. Ignored when ``weights`` is given.
+        trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from
+            final block. Valid values are between 0 and 5, with 5 meaning all backbone layers are
+            trainable. If ``None`` is passed (the default) this value is set to 3.
+        **kwargs: parameters passed to the ``torchvision.models.detection.faster_rcnn.FasterRCNN``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/faster_rcnn.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.detection.FasterRCNN_ResNet50_FPN_Weights
+        :members:
+    """
+    weights = FasterRCNN_ResNet50_FPN_Weights.verify(weights)
+    weights_backbone = ResNet50_Weights.verify(weights_backbone)
+
+    if weights is not None:
+        weights_backbone = None  # full-model weights take precedence; backbone weights are dropped
+        num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
+    elif num_classes is None:
+        num_classes = 91  # fallback when neither weights nor num_classes is given
+
+    is_trained = weights is not None or weights_backbone is not None
+    trainable_backbone_layers = _validate_trainable_layers(is_trained, trainable_backbone_layers, 5, 3)  # presumably (max=5, default=3), as in the docstring
+    norm_layer = misc_nn_ops.FrozenBatchNorm2d if is_trained else nn.BatchNorm2d  # frozen batch norm whenever any pretrained weights are used
+
+    backbone = resnet50(weights=weights_backbone, progress=progress, norm_layer=norm_layer)
+    backbone = _resnet_fpn_extractor(backbone, trainable_backbone_layers)
+    model = FasterRCNN(backbone, num_classes=num_classes, **kwargs)
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+        if weights == FasterRCNN_ResNet50_FPN_Weights.COCO_V1:  # the eps override applies to this checkpoint only
+            overwrite_eps(model, 0.0)
+
+    return model
+
+
+@register_model()
+@handle_legacy_interface(
+    weights=("pretrained", FasterRCNN_ResNet50_FPN_V2_Weights.COCO_V1),
+    weights_backbone=("pretrained_backbone", ResNet50_Weights.IMAGENET1K_V1),
+)
+def fasterrcnn_resnet50_fpn_v2(
+    *,
+    weights: Optional[FasterRCNN_ResNet50_FPN_V2_Weights] = None,
+    progress: bool = True,
+    num_classes: Optional[int] = None,
+    weights_backbone: Optional[ResNet50_Weights] = None,
+    trainable_backbone_layers: Optional[int] = None,
+    **kwargs: Any,
+) -> FasterRCNN:
+    """
+    Constructs an improved Faster R-CNN model with a ResNet-50-FPN backbone from `Benchmarking Detection
+    Transfer Learning with Vision Transformers <https://arxiv.org/abs/2111.11429>`__ paper.
+
+    .. betastatus:: detection module
+
+    It works similarly to Faster R-CNN with ResNet-50 FPN backbone. See
+    :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` for more
+    details.
+
+    Args:
+        weights (:class:`~torchvision.models.detection.FasterRCNN_ResNet50_FPN_V2_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.detection.FasterRCNN_ResNet50_FPN_V2_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        num_classes (int, optional): number of output classes of the model (including the background)
+        weights_backbone (:class:`~torchvision.models.ResNet50_Weights`, optional): The
+            pretrained weights for the backbone. Ignored when ``weights`` is given.
+        trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from
+            final block. Valid values are between 0 and 5, with 5 meaning all backbone layers are
+            trainable. If ``None`` is passed (the default) this value is set to 3.
+        **kwargs: parameters passed to the ``torchvision.models.detection.faster_rcnn.FasterRCNN``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/faster_rcnn.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.detection.FasterRCNN_ResNet50_FPN_V2_Weights
+        :members:
+    """
+    weights = FasterRCNN_ResNet50_FPN_V2_Weights.verify(weights)
+    weights_backbone = ResNet50_Weights.verify(weights_backbone)
+
+    if weights is not None:
+        weights_backbone = None  # full-model weights take precedence; backbone weights are dropped
+        num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
+    elif num_classes is None:
+        num_classes = 91  # fallback when neither weights nor num_classes is given
+
+    is_trained = weights is not None or weights_backbone is not None
+    trainable_backbone_layers = _validate_trainable_layers(is_trained, trainable_backbone_layers, 5, 3)  # presumably (max=5, default=3), as in the docstring
+
+    backbone = resnet50(weights=weights_backbone, progress=progress)  # no norm_layer override here, unlike fasterrcnn_resnet50_fpn
+    backbone = _resnet_fpn_extractor(backbone, trainable_backbone_layers, norm_layer=nn.BatchNorm2d)
+    rpn_anchor_generator = _default_anchorgen()
+    rpn_head = RPNHead(backbone.out_channels, rpn_anchor_generator.num_anchors_per_location()[0], conv_depth=2)
+    box_head = FastRCNNConvFCHead(  # 7x7 input is hard-coded; FasterRCNN's default box_roi_pool uses output_size=7
+        (backbone.out_channels, 7, 7), [256, 256, 256, 256], [1024], norm_layer=nn.BatchNorm2d
+    )
+    model = FasterRCNN(
+        backbone,
+        num_classes=num_classes,
+        rpn_anchor_generator=rpn_anchor_generator,
+        rpn_head=rpn_head,
+        box_head=box_head,
+        **kwargs,
+    )
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+
+    return model
+
+
+def _fasterrcnn_mobilenet_v3_large_fpn(  # builds a FasterRCNN on a MobileNetV3-Large backbone
+    *,  # every argument is keyword-only and has no default
+    weights: Optional[Union[FasterRCNN_MobileNet_V3_Large_FPN_Weights, FasterRCNN_MobileNet_V3_Large_320_FPN_Weights]],
+    progress: bool,
+    num_classes: Optional[int],
+    weights_backbone: Optional[MobileNet_V3_Large_Weights],
+    trainable_backbone_layers: Optional[int],
+    **kwargs: Any,
+) -> FasterRCNN:
+    if weights is not None:
+        weights_backbone = None  # full-model weights take precedence; backbone weights are dropped
+        num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
+    elif num_classes is None:
+        num_classes = 91  # fallback when neither weights nor num_classes is given
+
+    is_trained = weights is not None or weights_backbone is not None
+    trainable_backbone_layers = _validate_trainable_layers(is_trained, trainable_backbone_layers, 6, 3)  # presumably (max=6, default=3) - confirm
+    norm_layer = misc_nn_ops.FrozenBatchNorm2d if is_trained else nn.BatchNorm2d  # frozen batch norm whenever any pretrained weights are used
+
+    backbone = mobilenet_v3_large(weights=weights_backbone, progress=progress, norm_layer=norm_layer)
+    backbone = _mobilenet_extractor(backbone, True, trainable_backbone_layers)  # second argument is presumably fpn=True - confirm
+    anchor_sizes = (
+        (
+            32,
+            64,
+            128,
+            256,
+            512,
+        ),
+    ) * 3  # the same five sizes, repeated for three groups
+    aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)  # the same three ratios, once per size group
+    model = FasterRCNN(
+        backbone, num_classes, rpn_anchor_generator=AnchorGenerator(anchor_sizes, aspect_ratios), **kwargs
+    )
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+
+    return model
+
+
+@register_model()
+@handle_legacy_interface(
+    weights=("pretrained", FasterRCNN_MobileNet_V3_Large_320_FPN_Weights.COCO_V1),
+    weights_backbone=("pretrained_backbone", MobileNet_V3_Large_Weights.IMAGENET1K_V1),
+)
+def fasterrcnn_mobilenet_v3_large_320_fpn(
+    *,
+    weights: Optional[FasterRCNN_MobileNet_V3_Large_320_FPN_Weights] = None,
+    progress: bool = True,
+    num_classes: Optional[int] = None,
+    weights_backbone: Optional[MobileNet_V3_Large_Weights] = MobileNet_V3_Large_Weights.IMAGENET1K_V1,
+    trainable_backbone_layers: Optional[int] = None,
+    **kwargs: Any,
+) -> FasterRCNN:
+    """
+    Low resolution Faster R-CNN model with a MobileNetV3-Large backbone tuned for mobile use cases.
+
+    .. betastatus:: detection module
+
+    It works similarly to Faster R-CNN with ResNet-50 FPN backbone. See
+    :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` for more
+    details.
+
+    Example::
+
+        >>> model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_320_fpn(weights=FasterRCNN_MobileNet_V3_Large_320_FPN_Weights.DEFAULT)
+        >>> model.eval()
+        >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
+        >>> predictions = model(x)
+
+    Args:
+        weights (:class:`~torchvision.models.detection.FasterRCNN_MobileNet_V3_Large_320_FPN_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.detection.FasterRCNN_MobileNet_V3_Large_320_FPN_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        num_classes (int, optional): number of output classes of the model (including the background)
+        weights_backbone (:class:`~torchvision.models.MobileNet_V3_Large_Weights`, optional): The
+            pretrained weights for the backbone.
+        trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from
+            final block. Valid values are between 0 and 6, with 6 meaning all backbone layers are
+            trainable. If ``None`` is passed (the default) this value is set to 3.
+        **kwargs: parameters passed to the ``torchvision.models.detection.faster_rcnn.FasterRCNN``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/faster_rcnn.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.detection.FasterRCNN_MobileNet_V3_Large_320_FPN_Weights
+        :members:
+    """
+    weights = FasterRCNN_MobileNet_V3_Large_320_FPN_Weights.verify(weights)  # NOTE(review): verify() presumably validates/normalizes the enum value - confirm
+    weights_backbone = MobileNet_V3_Large_Weights.verify(weights_backbone)
+
+    defaults = {  # settings specific to this low-resolution variant; used only when the caller does not pass them
+        "min_size": 320,
+        "max_size": 640,
+        "rpn_pre_nms_top_n_test": 150,
+        "rpn_post_nms_top_n_test": 150,
+        "rpn_score_thresh": 0.05,
+    }
+
+    kwargs = {**defaults, **kwargs}  # later entries win: caller-supplied kwargs override the defaults
+    return _fasterrcnn_mobilenet_v3_large_fpn(  # delegate model construction to the shared private builder
+        weights=weights,
+        progress=progress,
+        num_classes=num_classes,
+        weights_backbone=weights_backbone,
+        trainable_backbone_layers=trainable_backbone_layers,
+        **kwargs,
+    )
+
+
+@register_model()
+@handle_legacy_interface(
+    weights=("pretrained", FasterRCNN_MobileNet_V3_Large_FPN_Weights.COCO_V1),
+    weights_backbone=("pretrained_backbone", MobileNet_V3_Large_Weights.IMAGENET1K_V1),
+)
+def fasterrcnn_mobilenet_v3_large_fpn(
+    *,
+    weights: Optional[FasterRCNN_MobileNet_V3_Large_FPN_Weights] = None,
+    progress: bool = True,
+    num_classes: Optional[int] = None,
+    weights_backbone: Optional[MobileNet_V3_Large_Weights] = MobileNet_V3_Large_Weights.IMAGENET1K_V1,
+    trainable_backbone_layers: Optional[int] = None,
+    **kwargs: Any,
+) -> FasterRCNN:
+    """
+    Constructs a high resolution Faster R-CNN model with a MobileNetV3-Large FPN backbone.
+
+    .. betastatus:: detection module
+
+    It works similarly to Faster R-CNN with ResNet-50 FPN backbone. See
+    :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` for more
+    details.
+
+    Example::
+
+        >>> model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn(weights=FasterRCNN_MobileNet_V3_Large_FPN_Weights.DEFAULT)
+        >>> model.eval()
+        >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
+        >>> predictions = model(x)
+
+    Args:
+        weights (:class:`~torchvision.models.detection.FasterRCNN_MobileNet_V3_Large_FPN_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.detection.FasterRCNN_MobileNet_V3_Large_FPN_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        num_classes (int, optional): number of output classes of the model (including the background)
+        weights_backbone (:class:`~torchvision.models.MobileNet_V3_Large_Weights`, optional): The
+            pretrained weights for the backbone.
+        trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from
+            final block. Valid values are between 0 and 6, with 6 meaning all backbone layers are
+            trainable. If ``None`` is passed (the default) this value is set to 3.
+        **kwargs: parameters passed to the ``torchvision.models.detection.faster_rcnn.FasterRCNN``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/faster_rcnn.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.detection.FasterRCNN_MobileNet_V3_Large_FPN_Weights
+        :members:
+    """
+    weights = FasterRCNN_MobileNet_V3_Large_FPN_Weights.verify(weights)  # NOTE(review): verify() presumably validates/normalizes the enum value - confirm
+    weights_backbone = MobileNet_V3_Large_Weights.verify(weights_backbone)
+
+    defaults = {  # used only when the caller does not pass the same key in kwargs
+        "rpn_score_thresh": 0.05,
+    }
+
+    kwargs = {**defaults, **kwargs}  # later entries win: caller-supplied kwargs override the defaults
+    return _fasterrcnn_mobilenet_v3_large_fpn(  # delegate model construction to the shared private builder
+        weights=weights,
+        progress=progress,
+        num_classes=num_classes,
+        weights_backbone=weights_backbone,
+        trainable_backbone_layers=trainable_backbone_layers,
+        **kwargs,
+    )
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/fcos.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/fcos.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccbd2496517c33b74a1a1581e0cbf3b3f173bfed
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/fcos.py
@@ -0,0 +1,775 @@
+import math
+import warnings
+from collections import OrderedDict
+from functools import partial
+from typing import Any, Callable, Optional
+
+import torch
+from torch import nn, Tensor
+
+from ...ops import boxes as box_ops, generalized_box_iou_loss, misc as misc_nn_ops, sigmoid_focal_loss
+from ...ops.feature_pyramid_network import LastLevelP6P7
+from ...transforms._presets import ObjectDetection
+from ...utils import _log_api_usage_once
+from .._api import register_model, Weights, WeightsEnum
+from .._meta import _COCO_CATEGORIES
+from .._utils import _ovewrite_value_param, handle_legacy_interface
+from ..resnet import resnet50, ResNet50_Weights
+from . import _utils as det_utils
+from .anchor_utils import AnchorGenerator
+from .backbone_utils import _resnet_fpn_extractor, _validate_trainable_layers
+from .transform import GeneralizedRCNNTransform
+
+
+__all__ = [  # names exported by `from <this module> import *`
+    "FCOS",
+    "FCOS_ResNet50_FPN_Weights",
+    "fcos_resnet50_fpn",
+]
+
+
+class FCOSHead(nn.Module):
+    """
+    A regression and classification head for use in FCOS.
+
+    Args:
+        in_channels (int): number of channels of the input feature
+        num_anchors (int): number of anchors to be predicted
+        num_classes (int): number of classes to be predicted
+        num_convs (Optional[int]): number of conv layer of head. Default: 4.
+    """
+
+    __annotations__ = {  # NOTE(review): presumably declares the attribute type for TorchScript - confirm
+        "box_coder": det_utils.BoxLinearCoder,
+    }
+
+    def __init__(self, in_channels: int, num_anchors: int, num_classes: int, num_convs: Optional[int] = 4) -> None:
+        super().__init__()
+        self.box_coder = det_utils.BoxLinearCoder(normalize_by_size=True)
+        self.classification_head = FCOSClassificationHead(in_channels, num_anchors, num_classes, num_convs)
+        self.regression_head = FCOSRegressionHead(in_channels, num_anchors, num_convs)
+
+    def compute_loss(
+        self,
+        targets: list[dict[str, Tensor]],
+        head_outputs: dict[str, Tensor],
+        anchors: list[Tensor],
+        matched_idxs: list[Tensor],
+    ) -> dict[str, Tensor]:
+
+        cls_logits = head_outputs["cls_logits"]  # [N, HWA, C]
+        bbox_regression = head_outputs["bbox_regression"]  # [N, HWA, 4]
+        bbox_ctrness = head_outputs["bbox_ctrness"]  # [N, HWA, 1]
+
+        all_gt_classes_targets = []
+        all_gt_boxes_targets = []
+        for targets_per_image, matched_idxs_per_image in zip(targets, matched_idxs):
+            if len(targets_per_image["labels"]) == 0:  # image without ground truth: use all-zero placeholders
+                gt_classes_targets = targets_per_image["labels"].new_zeros((len(matched_idxs_per_image),))
+                gt_boxes_targets = targets_per_image["boxes"].new_zeros((len(matched_idxs_per_image), 4))
+            else:
+                # clip(min=0) keeps negative (unmatched) indices in range; those rows are marked background below
+                gt_classes_targets = targets_per_image["labels"][matched_idxs_per_image.clip(min=0)]
+                gt_boxes_targets = targets_per_image["boxes"][matched_idxs_per_image.clip(min=0)]
+            gt_classes_targets[matched_idxs_per_image < 0] = -1  # background
+            all_gt_classes_targets.append(gt_classes_targets)
+            all_gt_boxes_targets.append(gt_boxes_targets)
+
+        # Stack the per-image lists `all_gt_boxes_targets`, `all_gt_classes_targets` and `anchors` into batched tensors
+        all_gt_boxes_targets, all_gt_classes_targets, anchors = (
+            torch.stack(all_gt_boxes_targets),
+            torch.stack(all_gt_classes_targets),
+            torch.stack(anchors),
+        )
+
+        # compute foreground mask: entries whose class target is not the background marker (-1)
+        foregroud_mask = all_gt_classes_targets >= 0
+        num_foreground = foregroud_mask.sum().item()
+
+        # classification loss
+        gt_classes_targets = torch.zeros_like(cls_logits)
+        gt_classes_targets[foregroud_mask, all_gt_classes_targets[foregroud_mask]] = 1.0  # one-hot rows; background rows stay all-zero
+        loss_cls = sigmoid_focal_loss(cls_logits, gt_classes_targets, reduction="sum")
+
+        # NOTE(review): original note says that under AMP pred_boxes need converting to float; no cast happens here - confirm
+        pred_boxes = self.box_coder.decode(bbox_regression, anchors)
+
+        # regression loss: GIoU loss
+        loss_bbox_reg = generalized_box_iou_loss(
+            pred_boxes[foregroud_mask],
+            all_gt_boxes_targets[foregroud_mask],
+            reduction="sum",
+        )
+
+        # ctrness loss
+
+        bbox_reg_targets = self.box_coder.encode(anchors, all_gt_boxes_targets)
+
+        if len(bbox_reg_targets) == 0:  # empty batch: nothing to compute, keep an empty target tensor
+            gt_ctrness_targets = bbox_reg_targets.new_zeros(bbox_reg_targets.size()[:-1])
+        else:
+            left_right = bbox_reg_targets[:, :, [0, 2]]  # presumably (left, right) distances from BoxLinearCoder.encode - confirm
+            top_bottom = bbox_reg_targets[:, :, [1, 3]]  # presumably (top, bottom) distances - confirm
+            gt_ctrness_targets = torch.sqrt(  # sqrt(min/max of first pair * min/max of second pair)
+                (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0])
+                * (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
+            )
+        pred_centerness = bbox_ctrness.squeeze(dim=2)
+        loss_bbox_ctrness = nn.functional.binary_cross_entropy_with_logits(
+            pred_centerness[foregroud_mask], gt_ctrness_targets[foregroud_mask], reduction="sum"
+        )
+
+        return {  # each summed loss is divided by the foreground count, floored at 1 to avoid dividing by zero
+            "classification": loss_cls / max(1, num_foreground),
+            "bbox_regression": loss_bbox_reg / max(1, num_foreground),
+            "bbox_ctrness": loss_bbox_ctrness / max(1, num_foreground),
+        }
+
+    def forward(self, x: list[Tensor]) -> dict[str, Tensor]:
+        cls_logits = self.classification_head(x)
+        bbox_regression, bbox_ctrness = self.regression_head(x)  # regression head returns both box offsets and centerness
+        return {
+            "cls_logits": cls_logits,
+            "bbox_regression": bbox_regression,
+            "bbox_ctrness": bbox_ctrness,
+        }
+
+
+class FCOSClassificationHead(nn.Module):
+    """
+    A classification head for use in FCOS.
+
+    Args:
+        in_channels (int): number of channels of the input feature.
+        num_anchors (int): number of anchors to be predicted.
+        num_classes (int): number of classes to be predicted.
+        num_convs (Optional[int]): number of conv layer. Default: 4.
+        prior_probability (Optional[float]): probability of prior. Default: 0.01.
+        norm_layer: Module specifying the normalization layer to use.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        num_anchors: int,
+        num_classes: int,
+        num_convs: int = 4,
+        prior_probability: float = 0.01,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        super().__init__()
+
+        self.num_classes = num_classes
+        self.num_anchors = num_anchors
+
+        if norm_layer is None:
+            norm_layer = partial(nn.GroupNorm, 32)  # default: GroupNorm with 32 groups
+
+        conv = []
+        for _ in range(num_convs):  # tower of num_convs x (conv -> norm -> ReLU), channel count unchanged
+            conv.append(nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1))
+            conv.append(norm_layer(in_channels))
+            conv.append(nn.ReLU())
+        self.conv = nn.Sequential(*conv)
+
+        for layer in self.conv.children():
+            if isinstance(layer, nn.Conv2d):  # only conv layers are re-initialized; norm layers keep their defaults
+                torch.nn.init.normal_(layer.weight, std=0.01)
+                torch.nn.init.constant_(layer.bias, 0)
+
+        self.cls_logits = nn.Conv2d(in_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1)
+        torch.nn.init.normal_(self.cls_logits.weight, std=0.01)
+        torch.nn.init.constant_(self.cls_logits.bias, -math.log((1 - prior_probability) / prior_probability))  # sigmoid(bias) == prior_probability
+
+    def forward(self, x: list[Tensor]) -> Tensor:
+        all_cls_logits = []
+
+        for features in x:  # the same tower and predictor are applied to every feature map in the list
+            cls_logits = self.conv(features)
+            cls_logits = self.cls_logits(cls_logits)
+
+            # Permute classification output from (N, A * K, H, W) to (N, HWA, K).
+            N, _, H, W = cls_logits.shape
+            cls_logits = cls_logits.view(N, -1, self.num_classes, H, W)
+            cls_logits = cls_logits.permute(0, 3, 4, 1, 2)
+            cls_logits = cls_logits.reshape(N, -1, self.num_classes)  # Size=(N, HWA, K)
+
+            all_cls_logits.append(cls_logits)
+
+        return torch.cat(all_cls_logits, dim=1)  # concatenate all feature maps along the HWA dimension
+
+
+class FCOSRegressionHead(nn.Module):
+    """
+    A regression head for use in FCOS, which combines regression branch and center-ness branch.
+    This can obtain better performance.
+
+    Reference: `FCOS: A simple and strong anchor-free object detector <https://arxiv.org/abs/2006.09214>`_.
+
+    Args:
+        in_channels (int): number of channels of the input feature
+        num_anchors (int): number of anchors to be predicted
+        num_convs (Optional[int]): number of conv layer. Default: 4.
+        norm_layer: Module specifying the normalization layer to use.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        num_anchors: int,
+        num_convs: int = 4,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ):
+        super().__init__()
+
+        if norm_layer is None:
+            norm_layer = partial(nn.GroupNorm, 32)  # default: GroupNorm with 32 groups
+
+        conv = []
+        for _ in range(num_convs):  # tower of num_convs x (conv -> norm -> ReLU), channel count unchanged
+            conv.append(nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1))
+            conv.append(norm_layer(in_channels))
+            conv.append(nn.ReLU())
+        self.conv = nn.Sequential(*conv)
+
+        self.bbox_reg = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=3, stride=1, padding=1)  # 4 box values per anchor
+        self.bbox_ctrness = nn.Conv2d(in_channels, num_anchors * 1, kernel_size=3, stride=1, padding=1)  # 1 centerness logit per anchor
+        for layer in [self.bbox_reg, self.bbox_ctrness]:
+            torch.nn.init.normal_(layer.weight, std=0.01)
+            torch.nn.init.zeros_(layer.bias)
+
+        for layer in self.conv.children():
+            if isinstance(layer, nn.Conv2d):  # only conv layers are re-initialized; norm layers keep their defaults
+                torch.nn.init.normal_(layer.weight, std=0.01)
+                torch.nn.init.zeros_(layer.bias)
+
+    def forward(self, x: list[Tensor]) -> tuple[Tensor, Tensor]:
+        all_bbox_regression = []
+        all_bbox_ctrness = []
+
+        for features in x:  # both predictors share the same tower output for each feature map
+            bbox_feature = self.conv(features)
+            bbox_regression = nn.functional.relu(self.bbox_reg(bbox_feature))  # ReLU keeps the regressed values non-negative
+            bbox_ctrness = self.bbox_ctrness(bbox_feature)  # raw logits, no activation applied here
+
+            # permute bbox regression output from (N, 4 * A, H, W) to (N, HWA, 4).
+            N, _, H, W = bbox_regression.shape
+            bbox_regression = bbox_regression.view(N, -1, 4, H, W)
+            bbox_regression = bbox_regression.permute(0, 3, 4, 1, 2)
+            bbox_regression = bbox_regression.reshape(N, -1, 4)  # Size=(N, HWA, 4)
+            all_bbox_regression.append(bbox_regression)
+
+            # permute bbox ctrness output from (N, 1 * A, H, W) to (N, HWA, 1).
+            bbox_ctrness = bbox_ctrness.view(N, -1, 1, H, W)
+            bbox_ctrness = bbox_ctrness.permute(0, 3, 4, 1, 2)
+            bbox_ctrness = bbox_ctrness.reshape(N, -1, 1)
+            all_bbox_ctrness.append(bbox_ctrness)
+
+        return torch.cat(all_bbox_regression, dim=1), torch.cat(all_bbox_ctrness, dim=1)  # (regression, centerness), levels joined on dim 1
+
+
+class FCOS(nn.Module):
+    """
+    Implements FCOS.
+
+    The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
+    image, and should be in 0-1 range. Different images can have different sizes.
+
+    The behavior of the model changes depending on if it is in training or evaluation mode.
+
+    During training, the model expects both the input tensors and targets (list of dictionary),
+    containing:
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (Int64Tensor[N]): the class label for each ground-truth box
+
+    The model returns a Dict[Tensor] during training, containing the classification, regression
+    and centerness losses.
+
+    During inference, the model requires only the input tensors, and returns the post-processed
+    predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
+    follows:
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (Int64Tensor[N]): the predicted labels for each image
+        - scores (Tensor[N]): the scores for each prediction
+
+    Args:
+        backbone (nn.Module): the network used to compute the features for the model.
+            It should contain an out_channels attribute, which indicates the number of output
+            channels that each feature map has (and it should be the same for all feature maps).
+            The backbone should return a single Tensor or an OrderedDict[Tensor].
+        num_classes (int): number of output classes of the model (including the background).
+        min_size (int): Images are rescaled before feeding them to the backbone:
+            we attempt to preserve the aspect ratio and scale the shorter edge
+            to ``min_size``. If the resulting longer edge exceeds ``max_size``,
+            then downscale so that the longer edge does not exceed ``max_size``.
+            This may result in the shorter edge being lower than ``min_size``.
+        max_size (int): See ``min_size``.
+        image_mean (Tuple[float, float, float]): mean values used for input normalization.
+            They are generally the mean values of the dataset on which the backbone has been trained
+            on
+        image_std (Tuple[float, float, float]): std values used for input normalization.
+            They are generally the std values of the dataset on which the backbone has been trained on
+        anchor_generator (AnchorGenerator): module that generates the anchors for a set of feature
+            maps. For FCOS, only set one anchor for per position of each level, the width and height equal to
+            the stride of feature map, and set aspect ratio = 1.0, so the center of anchor is equivalent to the point
+            in FCOS paper.
+        head (nn.Module): Module run on top of the feature pyramid.
+            Defaults to a module containing a classification and regression module.
+        center_sampling_radius (int): radius of the "center" of a groundtruth box,
+            within which all anchor points are labeled positive.
+        score_thresh (float): Score threshold used for postprocessing the detections.
+        nms_thresh (float): NMS threshold used for postprocessing the detections.
+        detections_per_img (int): Number of best detections to keep after NMS.
+        topk_candidates (int): Number of best detections to keep before NMS.
+
+    Example:
+
+        >>> import torch
+        >>> import torchvision
+        >>> from torchvision.models.detection import FCOS
+        >>> from torchvision.models.detection.anchor_utils import AnchorGenerator
+        >>> # load a pre-trained model for classification and return
+        >>> # only the features
+        >>> backbone = torchvision.models.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).features
+        >>> # FCOS needs to know the number of
+        >>> # output channels in a backbone. For mobilenet_v2, it's 1280,
+        >>> # so we need to add it here
+        >>> backbone.out_channels = 1280
+        >>>
+        >>> # FCOS requires exactly one anchor per spatial location,
+        >>> # so every feature level gets a single size and the
+        >>> # aspect ratio is fixed to 1.0. We have a Tuple[Tuple[int]]
+        >>> # because each feature map could potentially have a
+        >>> # different anchor size
+        >>> anchor_generator = AnchorGenerator(
+        >>>     sizes=((8,), (16,), (32,), (64,), (128,)),
+        >>>     aspect_ratios=((1.0,),)
+        >>> )
+        >>>
+        >>> # put the pieces together inside a FCOS model
+        >>> model = FCOS(
+        >>>     backbone,
+        >>>     num_classes=80,
+        >>>     anchor_generator=anchor_generator,
+        >>> )
+        >>> model.eval()
+        >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
+        >>> predictions = model(x)
+    """
+
+    __annotations__ = {
+        "box_coder": det_utils.BoxLinearCoder,
+    }
+
+    def __init__(
+        self,
+        backbone: nn.Module,
+        num_classes: int,
+        # transform parameters
+        min_size: int = 800,
+        max_size: int = 1333,
+        image_mean: Optional[list[float]] = None,
+        image_std: Optional[list[float]] = None,
+        # Anchor parameters
+        anchor_generator: Optional[AnchorGenerator] = None,
+        head: Optional[nn.Module] = None,
+        center_sampling_radius: float = 1.5,
+        score_thresh: float = 0.2,
+        nms_thresh: float = 0.6,
+        detections_per_img: int = 100,
+        topk_candidates: int = 1000,
+        **kwargs,  # forwarded unchanged to GeneralizedRCNNTransform below
+    ):
+        super().__init__()
+        _log_api_usage_once(self)
+
+        if not hasattr(backbone, "out_channels"):  # the default head needs out_channels to size its convolutions
+            raise ValueError(
+                "backbone should contain an attribute out_channels "
+                "specifying the number of output channels (assumed to be the "
+                "same for all the levels)"
+            )
+        self.backbone = backbone
+
+        if not isinstance(anchor_generator, (AnchorGenerator, type(None))):
+            raise TypeError(
+                f"anchor_generator should be of type AnchorGenerator or None, instead  got {type(anchor_generator)}"
+            )
+
+        if anchor_generator is None:
+            anchor_sizes = ((8,), (16,), (32,), (64,), (128,))  # equal to strides of multi-level feature map
+            aspect_ratios = ((1.0,),) * len(anchor_sizes)  # set only one anchor
+            anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
+        self.anchor_generator = anchor_generator
+        if self.anchor_generator.num_anchors_per_location()[0] != 1:  # only the first entry is checked
+            raise ValueError(
+                f"anchor_generator.num_anchors_per_location()[0] should be 1 instead of {anchor_generator.num_anchors_per_location()[0]}"
+            )
+
+        if head is None:  # build the default head sized from the backbone and the (single) anchor count
+            head = FCOSHead(backbone.out_channels, anchor_generator.num_anchors_per_location()[0], num_classes)
+        self.head = head
+
+        self.box_coder = det_utils.BoxLinearCoder(normalize_by_size=True)
+
+        if image_mean is None:
+            image_mean = [0.485, 0.456, 0.406]  # NOTE(review): presumably the ImageNet per-channel means - confirm
+        if image_std is None:
+            image_std = [0.229, 0.224, 0.225]  # NOTE(review): presumably the ImageNet per-channel stds - confirm
+        self.transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std, **kwargs)
+
+        self.center_sampling_radius = center_sampling_radius
+        self.score_thresh = score_thresh
+        self.nms_thresh = nms_thresh
+        self.detections_per_img = detections_per_img
+        self.topk_candidates = topk_candidates
+
+        # used only on torchscript mode
+        self._has_warned = False
+
+    @torch.jit.unused
+    def eager_outputs(
+        self, losses: dict[str, Tensor], detections: list[dict[str, Tensor]]
+    ) -> tuple[dict[str, Tensor], list[dict[str, Tensor]]]:  # NOTE(review): annotated as a tuple, but only one of the two is returned
+        if self.training:  # training mode: return the loss dict only
+            return losses
+
+        return detections  # evaluation mode: return the per-image detections only
+
+    def compute_loss(
+        self,
+        targets: list[dict[str, Tensor]],
+        head_outputs: dict[str, Tensor],
+        anchors: list[Tensor],
+        num_anchors_per_level: list[int],
+    ) -> dict[str, Tensor]:
+        matched_idxs = []  # one tensor per image: index of the matched GT box per anchor, or -1
+        for anchors_per_image, targets_per_image in zip(anchors, targets):
+            if targets_per_image["boxes"].numel() == 0:  # no ground truth: every anchor is unmatched
+                matched_idxs.append(
+                    torch.full((anchors_per_image.size(0),), -1, dtype=torch.int64, device=anchors_per_image.device)
+                )
+                continue
+
+            gt_boxes = targets_per_image["boxes"]
+            gt_centers = (gt_boxes[:, :2] + gt_boxes[:, 2:]) / 2  # (M, 2), one center per GT box
+            anchor_centers = (anchors_per_image[:, :2] + anchors_per_image[:, 2:]) / 2  # (N, 2), one center per anchor
+            anchor_sizes = anchors_per_image[:, 2] - anchors_per_image[:, 0]  # anchor width (x2 - x1)
+            # center sampling: anchor point must be close enough to gt center.
+            pairwise_match = (anchor_centers[:, None, :] - gt_centers[None, :, :]).abs_().max(
+                dim=2
+            ).values < self.center_sampling_radius * anchor_sizes[:, None]
+            # compute pairwise distance between N points and M boxes
+            x, y = anchor_centers.unsqueeze(dim=2).unbind(dim=1)  # (N, 1)
+            x0, y0, x1, y1 = gt_boxes.unsqueeze(dim=0).unbind(dim=2)  # (1, M)
+            pairwise_dist = torch.stack([x - x0, y - y0, x1 - x, y1 - y], dim=2)  # (N, M, 4): left, top, right, bottom
+
+            # anchor point must be inside gt
+            pairwise_match &= pairwise_dist.min(dim=2).values > 0
+
+            # each anchor is only responsible for certain scale range.
+            lower_bound = anchor_sizes * 4
+            lower_bound[: num_anchors_per_level[0]] = 0  # presumably anchors are ordered level by level: first level has no lower bound
+            upper_bound = anchor_sizes * 8
+            upper_bound[-num_anchors_per_level[-1] :] = float("inf")  # last level has no upper bound
+            pairwise_dist = pairwise_dist.max(dim=2).values  # (N, M): largest of the four side distances
+            pairwise_match &= (pairwise_dist > lower_bound[:, None]) & (pairwise_dist < upper_bound[:, None])
+
+            # match the GT box with minimum area, if there are multiple GT matches
+            gt_areas = (gt_boxes[:, 2] - gt_boxes[:, 0]) * (gt_boxes[:, 3] - gt_boxes[:, 1])  # (M,)
+            pairwise_match = pairwise_match.to(torch.float32) * (1e8 - gt_areas[None, :])  # valid pairs score higher for smaller boxes
+            min_values, matched_idx = pairwise_match.max(dim=1)  # (N,), best score and GT index per anchor
+            matched_idx[min_values < 1e-5] = -1  # unmatched anchors are assigned -1
+
+            matched_idxs.append(matched_idx)
+
+        return self.head.compute_loss(targets, head_outputs, anchors, matched_idxs)  # the head turns the matches into losses
+
+    def postprocess_detections(
+        self, head_outputs: dict[str, list[Tensor]], anchors: list[list[Tensor]], image_shapes: list[tuple[int, int]]
+    ) -> list[dict[str, Tensor]]:
+        class_logits = head_outputs["cls_logits"]  # list with one tensor per feature level
+        box_regression = head_outputs["bbox_regression"]
+        box_ctrness = head_outputs["bbox_ctrness"]
+
+        num_images = len(image_shapes)
+
+        detections: list[dict[str, Tensor]] = []
+
+        for index in range(num_images):
+            box_regression_per_image = [br[index] for br in box_regression]  # pick this image's slice from every level
+            logits_per_image = [cl[index] for cl in class_logits]
+            box_ctrness_per_image = [bc[index] for bc in box_ctrness]
+            anchors_per_image, image_shape = anchors[index], image_shapes[index]
+
+            image_boxes = []
+            image_scores = []
+            image_labels = []
+
+            for box_regression_per_level, logits_per_level, box_ctrness_per_level, anchors_per_level in zip(
+                box_regression_per_image, logits_per_image, box_ctrness_per_image, anchors_per_image
+            ):
+                num_classes = logits_per_level.shape[-1]
+
+                # remove low scoring boxes
+                scores_per_level = torch.sqrt(  # score = sqrt(class probability * centerness probability)
+                    torch.sigmoid(logits_per_level) * torch.sigmoid(box_ctrness_per_level)
+                ).flatten()
+                keep_idxs = scores_per_level > self.score_thresh
+                scores_per_level = scores_per_level[keep_idxs]
+                topk_idxs = torch.where(keep_idxs)[0]  # positions in the flattened (anchor, class) score vector
+
+                # keep only topk scoring predictions
+                num_topk = det_utils._topk_min(topk_idxs, self.topk_candidates, 0)
+                scores_per_level, idxs = scores_per_level.topk(num_topk)
+                topk_idxs = topk_idxs[idxs]
+
+                anchor_idxs = torch.div(topk_idxs, num_classes, rounding_mode="floor")  # flat index -> anchor index
+                labels_per_level = topk_idxs % num_classes  # flat index -> class index
+
+                boxes_per_level = self.box_coder.decode(
+                    box_regression_per_level[anchor_idxs], anchors_per_level[anchor_idxs]
+                )
+                boxes_per_level = box_ops.clip_boxes_to_image(boxes_per_level, image_shape)
+
+                image_boxes.append(boxes_per_level)
+                image_scores.append(scores_per_level)
+                image_labels.append(labels_per_level)
+
+            image_boxes = torch.cat(image_boxes, dim=0)  # merge candidates from all levels
+            image_scores = torch.cat(image_scores, dim=0)
+            image_labels = torch.cat(image_labels, dim=0)
+
+            # non-maximum suppression
+            keep = box_ops.batched_nms(image_boxes, image_scores, image_labels, self.nms_thresh)
+            keep = keep[: self.detections_per_img]  # NOTE(review): batched_nms presumably returns indices by decreasing score - confirm
+
+            detections.append(
+                {
+                    "boxes": image_boxes[keep],
+                    "scores": image_scores[keep],
+                    "labels": image_labels[keep],
+                }
+            )
+
+        return detections
+
+    def forward(
+        self,
+        images: list[Tensor],
+        targets: Optional[list[dict[str, Tensor]]] = None,
+    ) -> tuple[dict[str, Tensor], list[dict[str, Tensor]]]:
+        """
+        Run the detector: validate inputs, transform, extract features, then compute losses or detections.
+
+        Args:
+            images (list[Tensor]): images to be processed; the last two dims of each are read as H and W.
+            targets (list[dict[str, Tensor]], optional): per-image ground truth; each dict must hold a
+                ``"boxes"`` tensor of shape [N, 4]. Required in training mode.
+
+        Returns:
+            ``(losses, detections)`` when scripted; otherwise the result of ``self.eager_outputs``.
+        """
+        if self.training:
+            # training needs ground truth: every target must carry an [N, 4] "boxes" tensor
+            if targets is None:
+                torch._assert(False, "targets should not be none when in training mode")
+            else:
+                for target in targets:
+                    boxes = target["boxes"]
+                    torch._assert(isinstance(boxes, torch.Tensor), "Expected target boxes to be of type Tensor.")
+                    torch._assert(
+                        len(boxes.shape) == 2 and boxes.shape[-1] == 4,
+                        f"Expected target boxes to be a tensor of shape [N, 4], got {boxes.shape}.",
+                    )
+
+        original_image_sizes: list[tuple[int, int]] = []
+        for img in images:
+            val = img.shape[-2:]
+            torch._assert(
+                len(val) == 2,
+                f"expecting the last two dimensions of the Tensor to be H and W instead got {img.shape[-2:]}",
+            )
+            original_image_sizes.append((val[0], val[1]))
+
+        # transform the input (targets are passed through it too); pre-transform sizes were saved above
+        images, targets = self.transform(images, targets)
+
+        # Check for degenerate boxes: columns 2: must be strictly greater than columns :2 in every row
+        if targets is not None:
+            for target_idx, target in enumerate(targets):
+                boxes = target["boxes"]
+                degenerate_boxes = boxes[:, 2:] <= boxes[:, :2]
+                if degenerate_boxes.any():
+                    # report the first degenerate box in the assertion message
+                    bb_idx = torch.where(degenerate_boxes.any(dim=1))[0][0]
+                    degen_bb: list[float] = boxes[bb_idx].tolist()
+                    torch._assert(
+                        False,
+                        f"All bounding boxes should have positive height and width. Found invalid box {degen_bb} for target at index {target_idx}.",
+                    )
+
+        # get the features from the backbone
+        features = self.backbone(images.tensors)
+        if isinstance(features, torch.Tensor):  # a bare Tensor is wrapped so both cases expose .values()
+            features = OrderedDict([("0", features)])
+
+        features = list(features.values())
+
+        # compute the fcos heads outputs using the features
+        head_outputs = self.head(features)
+
+        # create the set of anchors
+        anchors = self.anchor_generator(images, features)
+        # recover level sizes: size(2) * size(3) locations for each feature map
+        num_anchors_per_level = [x.size(2) * x.size(3) for x in features]
+
+        losses = {}
+        detections: list[dict[str, Tensor]] = []
+        if self.training:
+            if targets is None:
+                torch._assert(False, "targets should not be none when in training mode")
+            else:
+                # compute the losses
+                losses = self.compute_loss(targets, head_outputs, anchors, num_anchors_per_level)
+        else:
+            # split outputs per level (along dim 1 for head outputs, dim 0 for each image's anchors)
+            split_head_outputs: dict[str, list[Tensor]] = {}
+            for k in head_outputs:
+                split_head_outputs[k] = list(head_outputs[k].split(num_anchors_per_level, dim=1))
+            split_anchors = [list(a.split(num_anchors_per_level)) for a in anchors]
+
+            # compute the detections, then hand them to transform.postprocess with both size lists
+            detections = self.postprocess_detections(split_head_outputs, split_anchors, images.image_sizes)
+            detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes)
+
+        if torch.jit.is_scripting():  # scripted: always the (losses, detections) tuple, warning once
+            if not self._has_warned:
+                warnings.warn("FCOS always returns a (Losses, Detections) tuple in scripting")
+                self._has_warned = True
+            return losses, detections
+        return self.eager_outputs(losses, detections)
+
+
+class FCOS_ResNet50_FPN_Weights(WeightsEnum):  # weight entries accepted by ``fcos_resnet50_fpn``
+    COCO_V1 = Weights(
+        url="https://download.pytorch.org/models/fcos_resnet50_fpn_coco-99b0c9b7.pth",
+        transforms=ObjectDetection,
+        meta={
+            "num_params": 32269600,
+            "categories": _COCO_CATEGORIES,  # len() of this feeds ``num_classes`` in fcos_resnet50_fpn
+            "min_size": (1, 1),  # presumably the smallest supported (H, W) input — TODO confirm
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/detection#fcos-resnet-50-fpn",
+            "_metrics": {
+                "COCO-val2017": {
+                    "box_map": 39.2,
+                }
+            },
+            "_ops": 128.207,  # presumably GFLOPs — TODO confirm
+            "_file_size": 123.608,  # presumably megabytes — TODO confirm
+            "_docs": """These weights were produced by following a similar training recipe as on the paper.""",
+        },
+    )
+    DEFAULT = COCO_V1  # alias: DEFAULT resolves to the COCO_V1 entry
+
+
+@register_model()
+@handle_legacy_interface(
+    weights=("pretrained", FCOS_ResNet50_FPN_Weights.COCO_V1),
+    weights_backbone=("pretrained_backbone", ResNet50_Weights.IMAGENET1K_V1),
+)
+def fcos_resnet50_fpn(
+    *,
+    weights: Optional[FCOS_ResNet50_FPN_Weights] = None,
+    progress: bool = True,
+    num_classes: Optional[int] = None,
+    weights_backbone: Optional[ResNet50_Weights] = ResNet50_Weights.IMAGENET1K_V1,
+    trainable_backbone_layers: Optional[int] = None,
+    **kwargs: Any,
+) -> FCOS:
+    """
+    Constructs a FCOS model with a ResNet-50-FPN backbone.
+
+    .. betastatus:: detection module
+
+    Reference: `FCOS: Fully Convolutional One-Stage Object Detection <https://arxiv.org/abs/1904.01355>`_.
+               `FCOS: A simple and strong anchor-free object detector <https://arxiv.org/abs/2006.09214>`_.
+
+    The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
+    image, and should be in ``0-1`` range. Different images can have different sizes.
+
+    The behavior of the model changes depending on if it is in training or evaluation mode.
+
+    During training, the model expects both the input tensors and targets (list of dictionary),
+    containing:
+
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (``Int64Tensor[N]``): the class label for each ground-truth box
+
+    The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
+    losses.
+
+    During inference, the model requires only the input tensors, and returns the post-processed
+    predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
+    follows, where ``N`` is the number of detections:
+
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (``Int64Tensor[N]``): the predicted labels for each detection
+        - scores (``Tensor[N]``): the scores of each detection
+
+    For more details on the output, you may refer to :ref:`instance_seg_output`.
+
+    Example:
+
+        >>> model = torchvision.models.detection.fcos_resnet50_fpn(weights=FCOS_ResNet50_FPN_Weights.DEFAULT)
+        >>> model.eval()
+        >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
+        >>> predictions = model(x)
+
+    Args:
+        weights (:class:`~torchvision.models.detection.FCOS_ResNet50_FPN_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.detection.FCOS_ResNet50_FPN_Weights`
+            below for more details, and possible values. By default, no
+            pre-trained weights are used.
+        progress (bool): If True, displays a progress bar of the download to stderr
+        num_classes (int, optional): number of output classes of the model (including the background)
+        weights_backbone (:class:`~torchvision.models.ResNet50_Weights`, optional): The pretrained weights for
+            the backbone.
+        trainable_backbone_layers (int, optional): number of trainable (not frozen) resnet layers starting
+            from final block. Valid values are between 0 and 5, with 5 meaning all backbone layers are
+            trainable. If ``None`` is passed (the default) this value is set to 3. Default: None
+        **kwargs: parameters passed to the ``torchvision.models.detection.FCOS``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/fcos.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.detection.FCOS_ResNet50_FPN_Weights
+        :members:
+    """
+    weights = FCOS_ResNet50_FPN_Weights.verify(weights)
+    weights_backbone = ResNet50_Weights.verify(weights_backbone)
+
+    if weights is not None:
+        weights_backbone = None
+        num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
+    elif num_classes is None:
+        num_classes = 91
+
+    is_trained = weights is not None or weights_backbone is not None
+    trainable_backbone_layers = _validate_trainable_layers(is_trained, trainable_backbone_layers, 5, 3)
+    norm_layer = misc_nn_ops.FrozenBatchNorm2d if is_trained else nn.BatchNorm2d
+
+    backbone = resnet50(weights=weights_backbone, progress=progress, norm_layer=norm_layer)
+    backbone = _resnet_fpn_extractor(
+        backbone, trainable_backbone_layers, returned_layers=[2, 3, 4], extra_blocks=LastLevelP6P7(256, 256)
+    )
+    model = FCOS(backbone, num_classes, **kwargs)
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+
+    return model
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/generalized_rcnn.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/generalized_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..f07fa77aae95042f4997869c9164eb07122dd8de
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/generalized_rcnn.py
@@ -0,0 +1,133 @@
+"""
+Implements the Generalized R-CNN framework
+"""
+
+import warnings
+from collections import OrderedDict
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ...utils import _log_api_usage_once
+
+
+class GeneralizedRCNN(nn.Module):
+    """
+    Main class for Generalized R-CNN.
+
+    Args:
+        backbone (nn.Module): called on the batched image tensor; may return a single Tensor of features.
+        rpn (nn.Module): called with (images, features, targets); returns proposals and its losses.
+        roi_heads (nn.Module): takes the features + the proposals from the RPN and computes
+            detections / masks from it.
+        transform (nn.Module): performs the data transformation from the inputs to feed into
+            the model
+    """
+
+    def __init__(
+        self,
+        backbone: nn.Module,
+        rpn: nn.Module,
+        roi_heads: nn.Module,
+        transform: nn.Module,
+    ) -> None:
+        super().__init__()
+        _log_api_usage_once(self)
+        self.transform = transform
+        self.backbone = backbone
+        self.rpn = rpn
+        self.roi_heads = roi_heads
+        # flips to True after forward() has emitted its scripting-mode warning, so it fires once
+        self._has_warned = False
+
+    @torch.jit.unused
+    def eager_outputs(
+        self, losses: dict[str, torch.Tensor], detections: list[dict[str, torch.Tensor]]
+    ) -> Union[dict[str, torch.Tensor], list[dict[str, torch.Tensor]]]:
+        if self.training:  # eager mode: losses while training, detections otherwise
+            return losses
+
+        return detections
+
+    def forward(
+        self,
+        images: list[torch.Tensor],
+        targets: Optional[list[dict[str, torch.Tensor]]] = None,
+    ) -> tuple[dict[str, torch.Tensor], list[dict[str, torch.Tensor]]]:
+        """
+        Run transform -> backbone -> RPN -> RoI heads and return losses and/or detections.
+
+        Args:
+            images (list[Tensor]): images to be processed; the last two dims of each are read as H and W.
+            targets (list[dict[str, Tensor]], optional): per-image ground truth; each dict must hold a
+                ``"boxes"`` tensor of shape [N, 4]. Required in training mode.
+
+        Returns:
+            When scripted, the tuple ``(losses, detections)``. In eager mode, only ``losses``
+            (merged RoI-head and RPN losses) while training, otherwise only ``detections``.
+        """
+        if self.training:
+            if targets is None:
+                torch._assert(False, "targets should not be none when in training mode")
+            else:
+                for target in targets:
+                    boxes = target["boxes"]
+                    if isinstance(boxes, torch.Tensor):
+                        torch._assert(
+                            len(boxes.shape) == 2 and boxes.shape[-1] == 4,
+                            f"Expected target boxes to be a tensor of shape [N, 4], got {boxes.shape}.",
+                        )
+                    else:
+                        torch._assert(
+                            False,
+                            f"Expected target boxes to be of type Tensor, got {type(boxes)}.",
+                        )
+
+        original_image_sizes: list[tuple[int, int]] = []
+        for img in images:
+            val = img.shape[-2:]
+            torch._assert(
+                len(val) == 2,
+                f"expecting the last two dimensions of the Tensor to be H and W instead got {img.shape[-2:]}",
+            )
+            original_image_sizes.append((val[0], val[1]))
+
+        images, targets = self.transform(images, targets)
+
+        # Check for degenerate boxes: columns 2: must be strictly greater than columns :2 in every row
+        # TODO: Move this to a function
+        if targets is not None:
+            for target_idx, target in enumerate(targets):
+                boxes = target["boxes"]
+                degenerate_boxes = boxes[:, 2:] <= boxes[:, :2]
+                if degenerate_boxes.any():
+                    # report the first degenerate box in the assertion message
+                    bb_idx = torch.where(degenerate_boxes.any(dim=1))[0][0]
+                    degen_bb: list[float] = boxes[bb_idx].tolist()
+                    torch._assert(
+                        False,
+                        "All bounding boxes should have positive height and width."
+                        f" Found invalid box {degen_bb} for target at index {target_idx}.",
+                    )
+
+        features = self.backbone(images.tensors)
+        if isinstance(features, torch.Tensor):  # a bare Tensor is wrapped under the key "0"
+            features = OrderedDict([("0", features)])
+        proposals, proposal_losses = self.rpn(images, features, targets)
+        detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets)
+        detections = self.transform.postprocess(
+            detections, images.image_sizes, original_image_sizes
+        )  # type: ignore[operator]
+
+        losses = {}
+        losses.update(detector_losses)
+        losses.update(proposal_losses)
+
+        if torch.jit.is_scripting():  # scripted: always the (losses, detections) tuple, warning once
+            if not self._has_warned:
+                warnings.warn("RCNN always returns a (Losses, Detections) tuple in scripting")
+                self._has_warned = True
+            return losses, detections
+        else:
+            return self.eager_outputs(losses, detections)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/image_list.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/image_list.py
new file mode 100644
index 0000000000000000000000000000000000000000..08aabe3a486e2609b53352f2d50a3148c4428066
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/image_list.py
@@ -0,0 +1,23 @@
+import torch
+from torch import Tensor
+
+
+class ImageList:
+    """
+    Structure that holds a list of images (of possibly
+    varying sizes) as a single tensor.
+    This works by padding the images to the same size,
+    and storing in a field the original sizes of each image
+
+    Args:
+        tensors (tensor): Tensor containing images.
+        image_sizes (list[tuple[int, int]]): List of Tuples each containing size of images.
+    """
+
+    def __init__(self, tensors: Tensor, image_sizes: list[tuple[int, int]]) -> None:
+        self.tensors = tensors
+        self.image_sizes = image_sizes
+
+    def to(self, device: torch.device) -> "ImageList":
+        cast_tensor = self.tensors.to(device)
+        return ImageList(cast_tensor, self.image_sizes)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/keypoint_rcnn.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/keypoint_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..42b9d65562d81f9ce1be56180c433de44d5e9b4f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/keypoint_rcnn.py
@@ -0,0 +1,476 @@
+from typing import Any, Optional
+
+import torch
+from torch import nn
+from torchvision.ops import MultiScaleRoIAlign
+
+from ...ops import misc as misc_nn_ops
+from ...transforms._presets import ObjectDetection
+from .._api import register_model, Weights, WeightsEnum
+from .._meta import _COCO_PERSON_CATEGORIES, _COCO_PERSON_KEYPOINT_NAMES
+from .._utils import _ovewrite_value_param, handle_legacy_interface
+from ..resnet import resnet50, ResNet50_Weights
+from ._utils import overwrite_eps
+from .backbone_utils import _resnet_fpn_extractor, _validate_trainable_layers
+from .faster_rcnn import FasterRCNN
+
+
+__all__ = [  # names exported by ``from ... import *`` for this module
+    "KeypointRCNN",
+    "KeypointRCNN_ResNet50_FPN_Weights",
+    "keypointrcnn_resnet50_fpn",
+]
+
+
+class KeypointRCNN(FasterRCNN):
+    """
+    Implements Keypoint R-CNN.
+
+    The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
+    image, and should be in 0-1 range. Different images can have different sizes.
+
+    The behavior of the model changes depending on if it is in training or evaluation mode.
+
+    During training, the model expects both the input tensors and targets (list of dictionary),
+    containing:
+
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+            ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (Int64Tensor[N]): the class label for each ground-truth box
+        - keypoints (FloatTensor[N, K, 3]): the K keypoints location for each of the N instances, in the
+          format [x, y, visibility], where visibility=0 means that the keypoint is not visible.
+
+    The model returns a Dict[Tensor] during training, containing the classification and regression
+    losses for both the RPN and the R-CNN, and the keypoint loss.
+
+    During inference, the model requires only the input tensors, and returns the post-processed
+    predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
+    follows:
+
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+            ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (Int64Tensor[N]): the predicted labels for each image
+        - scores (Tensor[N]): the scores or each prediction
+        - keypoints (FloatTensor[N, K, 3]): the locations of the predicted keypoints, in [x, y, v] format.
+
+    Args:
+        backbone (nn.Module): the network used to compute the features for the model.
+            It should contain an out_channels attribute, which indicates the number of output
+            channels that each feature map has (and it should be the same for all feature maps).
+            The backbone should return a single Tensor or and OrderedDict[Tensor].
+        num_classes (int): number of output classes of the model (including the background).
+            If box_predictor is specified, num_classes should be None.
+        min_size (int): Images are rescaled before feeding them to the backbone:
+            we attempt to preserve the aspect ratio and scale the shorter edge
+            to ``min_size``. If the resulting longer edge exceeds ``max_size``,
+            then downscale so that the longer edge does not exceed ``max_size``.
+            This may result in the shorter edge beeing lower than ``min_size``.
+        max_size (int): See ``min_size``.
+        image_mean (Tuple[float, float, float]): mean values used for input normalization.
+            They are generally the mean values of the dataset on which the backbone has been trained
+            on
+        image_std (Tuple[float, float, float]): std values used for input normalization.
+            They are generally the std values of the dataset on which the backbone has been trained on
+        rpn_anchor_generator (AnchorGenerator): module that generates the anchors for a set of feature
+            maps.
+        rpn_head (nn.Module): module that computes the objectness and regression deltas from the RPN
+        rpn_pre_nms_top_n_train (int): number of proposals to keep before applying NMS during training
+        rpn_pre_nms_top_n_test (int): number of proposals to keep before applying NMS during testing
+        rpn_post_nms_top_n_train (int): number of proposals to keep after applying NMS during training
+        rpn_post_nms_top_n_test (int): number of proposals to keep after applying NMS during testing
+        rpn_nms_thresh (float): NMS threshold used for postprocessing the RPN proposals
+        rpn_fg_iou_thresh (float): minimum IoU between the anchor and the GT box so that they can be
+            considered as positive during training of the RPN.
+        rpn_bg_iou_thresh (float): maximum IoU between the anchor and the GT box so that they can be
+            considered as negative during training of the RPN.
+        rpn_batch_size_per_image (int): number of anchors that are sampled during training of the RPN
+            for computing the loss
+        rpn_positive_fraction (float): proportion of positive anchors in a mini-batch during training
+            of the RPN
+        rpn_score_thresh (float): only return proposals with an objectness score greater than rpn_score_thresh
+        box_roi_pool (MultiScaleRoIAlign): the module which crops and resizes the feature maps in
+            the locations indicated by the bounding boxes
+        box_head (nn.Module): module that takes the cropped feature maps as input
+        box_predictor (nn.Module): module that takes the output of box_head and returns the
+            classification logits and box regression deltas.
+        box_score_thresh (float): during inference, only return proposals with a classification score
+            greater than box_score_thresh
+        box_nms_thresh (float): NMS threshold for the prediction head. Used during inference
+        box_detections_per_img (int): maximum number of detections per image, for all classes.
+        box_fg_iou_thresh (float): minimum IoU between the proposals and the GT box so that they can be
+            considered as positive during training of the classification head
+        box_bg_iou_thresh (float): maximum IoU between the proposals and the GT box so that they can be
+            considered as negative during training of the classification head
+        box_batch_size_per_image (int): number of proposals that are sampled during training of the
+            classification head
+        box_positive_fraction (float): proportion of positive proposals in a mini-batch during training
+            of the classification head
+        bbox_reg_weights (Tuple[float, float, float, float]): weights for the encoding/decoding of the
+            bounding boxes
+        keypoint_roi_pool (MultiScaleRoIAlign): the module which crops and resizes the feature maps in
+             the locations indicated by the bounding boxes, which will be used for the keypoint head.
+        keypoint_head (nn.Module): module that takes the cropped feature maps as input
+        keypoint_predictor (nn.Module): module that takes the output of the keypoint_head and returns the
+            heatmap logits
+
+    Example::
+
+        >>> import torch
+        >>> import torchvision
+        >>> from torchvision.models.detection import KeypointRCNN
+        >>> from torchvision.models.detection.anchor_utils import AnchorGenerator
+        >>>
+        >>> # load a pre-trained model for classification and return
+        >>> # only the features
+        >>> backbone = torchvision.models.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).features
+        >>> # KeypointRCNN needs to know the number of
+        >>> # output channels in a backbone. For mobilenet_v2, it's 1280,
+        >>> # so we need to add it here
+        >>> backbone.out_channels = 1280
+        >>>
+        >>> # let's make the RPN generate 5 x 3 anchors per spatial
+        >>> # location, with 5 different sizes and 3 different aspect
+        >>> # ratios. We have a Tuple[Tuple[int]] because each feature
+        >>> # map could potentially have different sizes and
+        >>> # aspect ratios
+        >>> anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
+        >>>                                    aspect_ratios=((0.5, 1.0, 2.0),))
+        >>>
+        >>> # let's define what are the feature maps that we will
+        >>> # use to perform the region of interest cropping, as well as
+        >>> # the size of the crop after rescaling.
+        >>> # if your backbone returns a Tensor, featmap_names is expected to
+        >>> # be ['0']. More generally, the backbone should return an
+        >>> # OrderedDict[Tensor], and in featmap_names you can choose which
+        >>> # feature maps to use.
+        >>> roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],
+        >>>                                                 output_size=7,
+        >>>                                                 sampling_ratio=2)
+        >>>
+        >>> keypoint_roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],
+        >>>                                                          output_size=14,
+        >>>                                                          sampling_ratio=2)
+        >>> # put the pieces together inside a KeypointRCNN model
+        >>> model = KeypointRCNN(backbone,
+        >>>                      num_classes=2,
+        >>>                      rpn_anchor_generator=anchor_generator,
+        >>>                      box_roi_pool=roi_pooler,
+        >>>                      keypoint_roi_pool=keypoint_roi_pooler)
+        >>> model.eval()
+        >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
+        >>> predictions = model(x)
+    """
+
+    def __init__(
+        self,
+        backbone,
+        num_classes=None,
+        # transform parameters
+        min_size=None,
+        max_size=1333,
+        image_mean=None,
+        image_std=None,
+        # RPN parameters
+        rpn_anchor_generator=None,
+        rpn_head=None,
+        rpn_pre_nms_top_n_train=2000,
+        rpn_pre_nms_top_n_test=1000,
+        rpn_post_nms_top_n_train=2000,
+        rpn_post_nms_top_n_test=1000,
+        rpn_nms_thresh=0.7,
+        rpn_fg_iou_thresh=0.7,
+        rpn_bg_iou_thresh=0.3,
+        rpn_batch_size_per_image=256,
+        rpn_positive_fraction=0.5,
+        rpn_score_thresh=0.0,
+        # Box parameters
+        box_roi_pool=None,
+        box_head=None,
+        box_predictor=None,
+        box_score_thresh=0.05,
+        box_nms_thresh=0.5,
+        box_detections_per_img=100,
+        box_fg_iou_thresh=0.5,
+        box_bg_iou_thresh=0.5,
+        box_batch_size_per_image=512,
+        box_positive_fraction=0.25,
+        bbox_reg_weights=None,
+        # keypoint parameters
+        keypoint_roi_pool=None,
+        keypoint_head=None,
+        keypoint_predictor=None,
+        num_keypoints=None,
+        **kwargs,
+    ):
+
+        if not isinstance(keypoint_roi_pool, (MultiScaleRoIAlign, type(None))):
+            raise TypeError(
+                "keypoint_roi_pool should be of type MultiScaleRoIAlign or None instead of {type(keypoint_roi_pool)}"
+            )
+        if min_size is None:
+            min_size = (640, 672, 704, 736, 768, 800)
+
+        if num_keypoints is not None:
+            if keypoint_predictor is not None:
+                raise ValueError("num_keypoints should be None when keypoint_predictor is specified")
+        else:
+            num_keypoints = 17
+
+        out_channels = backbone.out_channels
+
+        if keypoint_roi_pool is None:
+            keypoint_roi_pool = MultiScaleRoIAlign(featmap_names=["0", "1", "2", "3"], output_size=14, sampling_ratio=2)
+
+        if keypoint_head is None:
+            keypoint_layers = tuple(512 for _ in range(8))
+            keypoint_head = KeypointRCNNHeads(out_channels, keypoint_layers)
+
+        if keypoint_predictor is None:
+            keypoint_dim_reduced = 512  # == keypoint_layers[-1]
+            keypoint_predictor = KeypointRCNNPredictor(keypoint_dim_reduced, num_keypoints)
+
+        super().__init__(
+            backbone,
+            num_classes,
+            # transform parameters
+            min_size,
+            max_size,
+            image_mean,
+            image_std,
+            # RPN-specific parameters
+            rpn_anchor_generator,
+            rpn_head,
+            rpn_pre_nms_top_n_train,
+            rpn_pre_nms_top_n_test,
+            rpn_post_nms_top_n_train,
+            rpn_post_nms_top_n_test,
+            rpn_nms_thresh,
+            rpn_fg_iou_thresh,
+            rpn_bg_iou_thresh,
+            rpn_batch_size_per_image,
+            rpn_positive_fraction,
+            rpn_score_thresh,
+            # Box parameters
+            box_roi_pool,
+            box_head,
+            box_predictor,
+            box_score_thresh,
+            box_nms_thresh,
+            box_detections_per_img,
+            box_fg_iou_thresh,
+            box_bg_iou_thresh,
+            box_batch_size_per_image,
+            box_positive_fraction,
+            bbox_reg_weights,
+            **kwargs,
+        )
+
+        self.roi_heads.keypoint_roi_pool = keypoint_roi_pool
+        self.roi_heads.keypoint_head = keypoint_head
+        self.roi_heads.keypoint_predictor = keypoint_predictor
+
+
+class KeypointRCNNHeads(nn.Sequential):
+    def __init__(self, in_channels, layers):
+        d = []
+        next_feature = in_channels
+        for out_channels in layers:
+            d.append(nn.Conv2d(next_feature, out_channels, 3, stride=1, padding=1))
+            d.append(nn.ReLU(inplace=True))
+            next_feature = out_channels
+        super().__init__(*d)
+        for m in self.children():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
+                nn.init.constant_(m.bias, 0)
+
+
+class KeypointRCNNPredictor(nn.Module):
+    def __init__(self, in_channels, num_keypoints):
+        super().__init__()
+        input_features = in_channels
+        deconv_kernel = 4
+        self.kps_score_lowres = nn.ConvTranspose2d(
+            input_features,
+            num_keypoints,
+            deconv_kernel,
+            stride=2,
+            padding=deconv_kernel // 2 - 1,
+        )
+        nn.init.kaiming_normal_(self.kps_score_lowres.weight, mode="fan_out", nonlinearity="relu")
+        nn.init.constant_(self.kps_score_lowres.bias, 0)
+        self.up_scale = 2
+        self.out_channels = num_keypoints
+
+    def forward(self, x):
+        x = self.kps_score_lowres(x)
+        return torch.nn.functional.interpolate(
+            x, scale_factor=float(self.up_scale), mode="bilinear", align_corners=False, recompute_scale_factor=False
+        )
+
+
+_COMMON_META = {  # metadata spread into both KeypointRCNN_ResNet50_FPN_Weights entries via ``**_COMMON_META``
+    "categories": _COCO_PERSON_CATEGORIES,
+    "keypoint_names": _COCO_PERSON_KEYPOINT_NAMES,
+    "min_size": (1, 1),  # presumably the smallest supported (H, W) input — TODO confirm
+}
+
+
+class KeypointRCNN_ResNet50_FPN_Weights(WeightsEnum):  # weight entries accepted by ``keypointrcnn_resnet50_fpn``
+    COCO_LEGACY = Weights(  # chosen by the legacy interface when ``pretrained == "legacy"``
+        url="https://download.pytorch.org/models/keypointrcnn_resnet50_fpn_coco-9f466800.pth",
+        transforms=ObjectDetection,
+        meta={
+            **_COMMON_META,
+            "num_params": 59137258,
+            "recipe": "https://github.com/pytorch/vision/issues/1606",
+            "_metrics": {
+                "COCO-val2017": {
+                    "box_map": 50.6,
+                    "kp_map": 61.1,
+                }
+            },
+            "_ops": 133.924,  # presumably GFLOPs — TODO confirm
+            "_file_size": 226.054,  # presumably megabytes — TODO confirm
+            "_docs": """
+                These weights were produced by following a similar training recipe as on the paper but use a checkpoint
+                from an early epoch.
+            """,
+        },
+    )
+    COCO_V1 = Weights(  # chosen by the legacy interface for any other ``pretrained`` value
+        url="https://download.pytorch.org/models/keypointrcnn_resnet50_fpn_coco-fc266e95.pth",
+        transforms=ObjectDetection,
+        meta={
+            **_COMMON_META,
+            "num_params": 59137258,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/detection#keypoint-r-cnn",
+            "_metrics": {
+                "COCO-val2017": {
+                    "box_map": 54.6,
+                    "kp_map": 65.0,
+                }
+            },
+            "_ops": 137.42,  # presumably GFLOPs — TODO confirm
+            "_file_size": 226.054,  # presumably megabytes — TODO confirm
+            "_docs": """These weights were produced by following a similar training recipe as on the paper.""",
+        },
+    )
+    DEFAULT = COCO_V1  # alias: DEFAULT resolves to the COCO_V1 entry
+
+
+@register_model()
+@handle_legacy_interface(
+    weights=(
+        "pretrained",
+        lambda kwargs: (
+            KeypointRCNN_ResNet50_FPN_Weights.COCO_LEGACY
+            if kwargs["pretrained"] == "legacy"
+            else KeypointRCNN_ResNet50_FPN_Weights.COCO_V1
+        ),
+    ),
+    weights_backbone=("pretrained_backbone", ResNet50_Weights.IMAGENET1K_V1),
+)
+def keypointrcnn_resnet50_fpn(
+    *,
+    weights: Optional[KeypointRCNN_ResNet50_FPN_Weights] = None,
+    progress: bool = True,
+    num_classes: Optional[int] = None,
+    num_keypoints: Optional[int] = None,
+    weights_backbone: Optional[ResNet50_Weights] = ResNet50_Weights.IMAGENET1K_V1,
+    trainable_backbone_layers: Optional[int] = None,
+    **kwargs: Any,
+) -> KeypointRCNN:
+    """
+    Constructs a Keypoint R-CNN model with a ResNet-50-FPN backbone.
+
+    .. betastatus:: detection module
+
+    Reference: `Mask R-CNN <https://arxiv.org/abs/1703.06870>`__.
+
+    The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
+    image, and should be in ``0-1`` range. Different images can have different sizes.
+
+    The behavior of the model changes depending on if it is in training or evaluation mode.
+
+    During training, the model expects both the input tensors and targets (list of dictionary),
+    containing:
+
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (``Int64Tensor[N]``): the class label for each ground-truth box
+        - keypoints (``FloatTensor[N, K, 3]``): the ``K`` keypoints location for each of the ``N`` instances, in the
+          format ``[x, y, visibility]``, where ``visibility=0`` means that the keypoint is not visible.
+
+    The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
+    losses for both the RPN and the R-CNN, and the keypoint loss.
+
+    During inference, the model requires only the input tensors, and returns the post-processed
+    predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
+    follows, where ``N`` is the number of detected instances:
+
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (``Int64Tensor[N]``): the predicted labels for each instance
+        - scores (``Tensor[N]``): the scores of each instance
+        - keypoints (``FloatTensor[N, K, 3]``): the locations of the predicted keypoints, in ``[x, y, v]`` format.
+
+    For more details on the output, you may refer to :ref:`instance_seg_output`.
+
+    Keypoint R-CNN is exportable to ONNX for a fixed batch size with input images of fixed size.
+
+    Example::
+
+        >>> model = torchvision.models.detection.keypointrcnn_resnet50_fpn(weights=KeypointRCNN_ResNet50_FPN_Weights.DEFAULT)
+        >>> model.eval()
+        >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
+        >>> predictions = model(x)
+        >>>
+        >>> # optionally, if you want to export the model to ONNX:
+        >>> torch.onnx.export(model, x, "keypoint_rcnn.onnx", opset_version = 11)
+
+    Args:
+        weights (:class:`~torchvision.models.detection.KeypointRCNN_ResNet50_FPN_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.detection.KeypointRCNN_ResNet50_FPN_Weights`
+            below for more details, and possible values. By default, no
+            pre-trained weights are used.
+        progress (bool): If True, displays a progress bar of the download to stderr
+        num_classes (int, optional): number of output classes of the model (including the background)
+        num_keypoints (int, optional): number of keypoints
+        weights_backbone (:class:`~torchvision.models.ResNet50_Weights`, optional): The
+            pretrained weights for the backbone.
+        trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from final block.
+            Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable. If ``None`` is
+            passed (the default) this value is set to 3.
+
+    .. autoclass:: torchvision.models.detection.KeypointRCNN_ResNet50_FPN_Weights
+        :members:
+    """
+    weights = KeypointRCNN_ResNet50_FPN_Weights.verify(weights)
+    weights_backbone = ResNet50_Weights.verify(weights_backbone)
+
+    if weights is not None:
+        weights_backbone = None  # backbone weights are not loaded separately when a full checkpoint is requested
+        num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
+        num_keypoints = _ovewrite_value_param("num_keypoints", num_keypoints, len(weights.meta["keypoint_names"]))
+    else:
+        if num_classes is None:
+            num_classes = 2  # fallback when neither weights nor num_classes is given
+        if num_keypoints is None:
+            num_keypoints = 17  # fallback when neither weights nor num_keypoints is given
+
+    is_trained = weights is not None or weights_backbone is not None  # True when any pretrained weights are in use
+    trainable_backbone_layers = _validate_trainable_layers(is_trained, trainable_backbone_layers, 5, 3)
+    norm_layer = misc_nn_ops.FrozenBatchNorm2d if is_trained else nn.BatchNorm2d
+
+    backbone = resnet50(weights=weights_backbone, progress=progress, norm_layer=norm_layer)
+    backbone = _resnet_fpn_extractor(backbone, trainable_backbone_layers)
+    model = KeypointRCNN(backbone, num_classes, num_keypoints=num_keypoints, **kwargs)
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+        if weights == KeypointRCNN_ResNet50_FPN_Weights.COCO_V1:  # COCO_LEGACY skips this step
+            overwrite_eps(model, 0.0)
+
+    return model
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/mask_rcnn.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/mask_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1668ab423e52fee248d696d9d2f3ad1fcac90b5
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/mask_rcnn.py
@@ -0,0 +1,590 @@
+from collections import OrderedDict
+from typing import Any, Callable, Optional
+
+from torch import nn
+from torchvision.ops import MultiScaleRoIAlign
+
+from ...ops import misc as misc_nn_ops
+from ...transforms._presets import ObjectDetection
+from .._api import register_model, Weights, WeightsEnum
+from .._meta import _COCO_CATEGORIES
+from .._utils import _ovewrite_value_param, handle_legacy_interface
+from ..resnet import resnet50, ResNet50_Weights
+from ._utils import overwrite_eps
+from .backbone_utils import _resnet_fpn_extractor, _validate_trainable_layers
+from .faster_rcnn import _default_anchorgen, FasterRCNN, FastRCNNConvFCHead, RPNHead
+
+
+__all__ = [  # names exported by a star-import of this module
+    "MaskRCNN",
+    "MaskRCNN_ResNet50_FPN_Weights",
+    "MaskRCNN_ResNet50_FPN_V2_Weights",
+    "maskrcnn_resnet50_fpn",
+    "maskrcnn_resnet50_fpn_v2",
+]
+
+
+class MaskRCNN(FasterRCNN):
+    """
+    Implements Mask R-CNN.
+
+    The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
+    image, and should be in 0-1 range. Different images can have different sizes.
+
+    The behavior of the model changes depending on if it is in training or evaluation mode.
+
+    During training, the model expects both the input tensors and targets (list of dictionary),
+    containing:
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (Int64Tensor[N]): the class label for each ground-truth box
+        - masks (UInt8Tensor[N, H, W]): the segmentation binary masks for each instance
+
+    The model returns a Dict[Tensor] during training, containing the classification and regression
+    losses for both the RPN and the R-CNN, and the mask loss.
+
+    During inference, the model requires only the input tensors, and returns the post-processed
+    predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
+    follows:
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (Int64Tensor[N]): the predicted labels for each instance
+        - scores (Tensor[N]): the scores of each prediction
+        - masks (FloatTensor[N, 1, H, W]): the predicted masks for each instance, in 0-1 range. In order to
+          obtain the final segmentation masks, the soft masks can be thresholded, generally
+          with a value of 0.5 (mask >= 0.5)
+
+    Args:
+        backbone (nn.Module): the network used to compute the features for the model.
+            It should contain an out_channels attribute, which indicates the number of output
+            channels that each feature map has (and it should be the same for all feature maps).
+            The backbone should return a single Tensor or an OrderedDict[Tensor].
+        num_classes (int): number of output classes of the model (including the background).
+            If box_predictor is specified, num_classes should be None.
+        min_size (int): Images are rescaled before feeding them to the backbone:
+            we attempt to preserve the aspect ratio and scale the shorter edge
+            to ``min_size``. If the resulting longer edge exceeds ``max_size``,
+            then downscale so that the longer edge does not exceed ``max_size``.
+            This may result in the shorter edge being lower than ``min_size``.
+        max_size (int): See ``min_size``.
+        image_mean (Tuple[float, float, float]): mean values used for input normalization.
+            They are generally the mean values of the dataset on which the backbone has been trained
+            on
+        image_std (Tuple[float, float, float]): std values used for input normalization.
+            They are generally the std values of the dataset on which the backbone has been trained on
+        rpn_anchor_generator (AnchorGenerator): module that generates the anchors for a set of feature
+            maps.
+        rpn_head (nn.Module): module that computes the objectness and regression deltas from the RPN
+        rpn_pre_nms_top_n_train (int): number of proposals to keep before applying NMS during training
+        rpn_pre_nms_top_n_test (int): number of proposals to keep before applying NMS during testing
+        rpn_post_nms_top_n_train (int): number of proposals to keep after applying NMS during training
+        rpn_post_nms_top_n_test (int): number of proposals to keep after applying NMS during testing
+        rpn_nms_thresh (float): NMS threshold used for postprocessing the RPN proposals
+        rpn_fg_iou_thresh (float): minimum IoU between the anchor and the GT box so that they can be
+            considered as positive during training of the RPN.
+        rpn_bg_iou_thresh (float): maximum IoU between the anchor and the GT box so that they can be
+            considered as negative during training of the RPN.
+        rpn_batch_size_per_image (int): number of anchors that are sampled during training of the RPN
+            for computing the loss
+        rpn_positive_fraction (float): proportion of positive anchors in a mini-batch during training
+            of the RPN
+        rpn_score_thresh (float): only return proposals with an objectness score greater than rpn_score_thresh
+        box_roi_pool (MultiScaleRoIAlign): the module which crops and resizes the feature maps in
+            the locations indicated by the bounding boxes
+        box_head (nn.Module): module that takes the cropped feature maps as input
+        box_predictor (nn.Module): module that takes the output of box_head and returns the
+            classification logits and box regression deltas.
+        box_score_thresh (float): during inference, only return proposals with a classification score
+            greater than box_score_thresh
+        box_nms_thresh (float): NMS threshold for the prediction head. Used during inference
+        box_detections_per_img (int): maximum number of detections per image, for all classes.
+        box_fg_iou_thresh (float): minimum IoU between the proposals and the GT box so that they can be
+            considered as positive during training of the classification head
+        box_bg_iou_thresh (float): maximum IoU between the proposals and the GT box so that they can be
+            considered as negative during training of the classification head
+        box_batch_size_per_image (int): number of proposals that are sampled during training of the
+            classification head
+        box_positive_fraction (float): proportion of positive proposals in a mini-batch during training
+            of the classification head
+        bbox_reg_weights (Tuple[float, float, float, float]): weights for the encoding/decoding of the
+            bounding boxes
+        mask_roi_pool (MultiScaleRoIAlign): the module which crops and resizes the feature maps in
+             the locations indicated by the bounding boxes, which will be used for the mask head.
+        mask_head (nn.Module): module that takes the cropped feature maps as input
+        mask_predictor (nn.Module): module that takes the output of the mask_head and returns the
+            segmentation mask logits
+
+    Example::
+
+        >>> import torch
+        >>> import torchvision
+        >>> from torchvision.models.detection import MaskRCNN
+        >>> from torchvision.models.detection.anchor_utils import AnchorGenerator
+        >>>
+        >>> # load a pre-trained model for classification and return
+        >>> # only the features
+        >>> backbone = torchvision.models.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).features
+        >>> # MaskRCNN needs to know the number of
+        >>> # output channels in a backbone. For mobilenet_v2, it's 1280
+        >>> # so we need to add it here,
+        >>> backbone.out_channels = 1280
+        >>>
+        >>> # let's make the RPN generate 5 x 3 anchors per spatial
+        >>> # location, with 5 different sizes and 3 different aspect
+        >>> # ratios. We have a Tuple[Tuple[int]] because each feature
+        >>> # map could potentially have different sizes and
+        >>> # aspect ratios
+        >>> anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
+        >>>                                    aspect_ratios=((0.5, 1.0, 2.0),))
+        >>>
+        >>> # let's define what are the feature maps that we will
+        >>> # use to perform the region of interest cropping, as well as
+        >>> # the size of the crop after rescaling.
+        >>> # if your backbone returns a Tensor, featmap_names is expected to
+        >>> # be ['0']. More generally, the backbone should return an
+        >>> # OrderedDict[Tensor], and in featmap_names you can choose which
+        >>> # feature maps to use.
+        >>> roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],
+        >>>                                                 output_size=7,
+        >>>                                                 sampling_ratio=2)
+        >>>
+        >>> mask_roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],
+        >>>                                                      output_size=14,
+        >>>                                                      sampling_ratio=2)
+        >>> # put the pieces together inside a MaskRCNN model
+        >>> model = MaskRCNN(backbone,
+        >>>                  num_classes=2,
+        >>>                  rpn_anchor_generator=anchor_generator,
+        >>>                  box_roi_pool=roi_pooler,
+        >>>                  mask_roi_pool=mask_roi_pooler)
+        >>> model.eval()
+        >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
+        >>> predictions = model(x)
+    """
+
+    def __init__(
+        self,
+        backbone,
+        num_classes=None,
+        # transform parameters
+        min_size=800,
+        max_size=1333,
+        image_mean=None,
+        image_std=None,
+        # RPN parameters
+        rpn_anchor_generator=None,
+        rpn_head=None,
+        rpn_pre_nms_top_n_train=2000,
+        rpn_pre_nms_top_n_test=1000,
+        rpn_post_nms_top_n_train=2000,
+        rpn_post_nms_top_n_test=1000,
+        rpn_nms_thresh=0.7,
+        rpn_fg_iou_thresh=0.7,
+        rpn_bg_iou_thresh=0.3,
+        rpn_batch_size_per_image=256,
+        rpn_positive_fraction=0.5,
+        rpn_score_thresh=0.0,
+        # Box parameters
+        box_roi_pool=None,
+        box_head=None,
+        box_predictor=None,
+        box_score_thresh=0.05,
+        box_nms_thresh=0.5,
+        box_detections_per_img=100,
+        box_fg_iou_thresh=0.5,
+        box_bg_iou_thresh=0.5,
+        box_batch_size_per_image=512,
+        box_positive_fraction=0.25,
+        bbox_reg_weights=None,
+        # Mask parameters
+        mask_roi_pool=None,
+        mask_head=None,
+        mask_predictor=None,
+        **kwargs,
+    ):
+
+        if not isinstance(mask_roi_pool, (MultiScaleRoIAlign, type(None))):
+            raise TypeError(
+                f"mask_roi_pool should be of type MultiScaleRoIAlign or None instead of {type(mask_roi_pool)}"
+            )
+
+        if num_classes is not None:  # num_classes and a caller-supplied mask_predictor are mutually exclusive
+            if mask_predictor is not None:
+                raise ValueError("num_classes should be None when mask_predictor is specified")
+
+        out_channels = backbone.out_channels  # input channel count for the default mask head
+
+        if mask_roi_pool is None:
+            mask_roi_pool = MultiScaleRoIAlign(featmap_names=["0", "1", "2", "3"], output_size=14, sampling_ratio=2)
+
+        if mask_head is None:
+            mask_layers = (256, 256, 256, 256)
+            mask_dilation = 1
+            mask_head = MaskRCNNHeads(out_channels, mask_layers, mask_dilation)
+
+        if mask_predictor is None:
+            mask_predictor_in_channels = 256  # == mask_layers[-1]
+            mask_dim_reduced = 256
+            mask_predictor = MaskRCNNPredictor(mask_predictor_in_channels, mask_dim_reduced, num_classes)
+
+        super().__init__(  # arguments are forwarded positionally, so their order here is significant
+            backbone,
+            num_classes,
+            # transform parameters
+            min_size,
+            max_size,
+            image_mean,
+            image_std,
+            # RPN-specific parameters
+            rpn_anchor_generator,
+            rpn_head,
+            rpn_pre_nms_top_n_train,
+            rpn_pre_nms_top_n_test,
+            rpn_post_nms_top_n_train,
+            rpn_post_nms_top_n_test,
+            rpn_nms_thresh,
+            rpn_fg_iou_thresh,
+            rpn_bg_iou_thresh,
+            rpn_batch_size_per_image,
+            rpn_positive_fraction,
+            rpn_score_thresh,
+            # Box parameters
+            box_roi_pool,
+            box_head,
+            box_predictor,
+            box_score_thresh,
+            box_nms_thresh,
+            box_detections_per_img,
+            box_fg_iou_thresh,
+            box_bg_iou_thresh,
+            box_batch_size_per_image,
+            box_positive_fraction,
+            bbox_reg_weights,
+            **kwargs,
+        )
+
+        self.roi_heads.mask_roi_pool = mask_roi_pool  # attach the three mask-branch modules to self.roi_heads
+        self.roi_heads.mask_head = mask_head
+        self.roi_heads.mask_predictor = mask_predictor
+
+
+class MaskRCNNHeads(nn.Sequential):
+    """
+    Feature extractor of the mask branch: a chain of 3x3 ``Conv2dNormActivation`` blocks.
+
+    Checkpoints whose metadata carries no version, or a version below 2, may still use ``mask_fcn{k}``
+    parameter names; ``_load_from_state_dict`` renames those keys while loading.
+    """
+
+    _version = 2
+
+    def __init__(self, in_channels, layers, dilation, norm_layer: Optional[Callable[..., nn.Module]] = None):
+        """
+        Args:
+            in_channels (int): channel count of the incoming feature map
+            layers (list): output channel count of each successive block
+            dilation (int): dilation of every 3x3 kernel; the same value is used as padding
+            norm_layer (callable, optional): normalization-layer factory handed to each block. Default: None
+        """
+        # channel sizes along the chain: the input first, then the output of each block
+        widths = [in_channels, *layers]
+        convs = [
+            misc_nn_ops.Conv2dNormActivation(
+                c_in,
+                c_out,
+                kernel_size=3,
+                stride=1,
+                padding=dilation,
+                dilation=dilation,
+                norm_layer=norm_layer,
+            )
+            # consecutive pairs: (in_channels, layers[0]), (layers[0], layers[1]), ...
+            for c_in, c_out in zip(widths, widths[1:])
+        ]
+        super().__init__(*convs)
+
+        # Re-initialise every nn.Conv2d reachable through self.modules(); biases are zeroed where present.
+        for conv in (m for m in self.modules() if isinstance(m, nn.Conv2d)):
+            nn.init.kaiming_normal_(conv.weight, mode="fan_out", nonlinearity="relu")
+            if conv.bias is not None:
+                nn.init.zeros_(conv.bias)
+
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        """
+        Load this module's parameters, accepting checkpoints saved before version 2.
+
+        When ``local_metadata`` has no ``"version"`` or one below 2, every ``{prefix}mask_fcn{i+1}.weight`` /
+        ``.bias`` entry present in ``state_dict`` is moved, in place, to ``{prefix}{i}.0.weight`` / ``.bias``
+        before the regular loading logic of the parent class runs.
+        """
+        saved_version = local_metadata.get("version", None)
+        is_legacy = saved_version is None or saved_version < 2
+        if is_legacy:
+            for block_idx in range(len(self)):
+                for leaf in ("weight", "bias"):
+                    legacy_key = f"{prefix}mask_fcn{block_idx+1}.{leaf}"
+                    current_key = f"{prefix}{block_idx}.0.{leaf}"
+                    if legacy_key in state_dict:
+                        state_dict[current_key] = state_dict.pop(legacy_key)
+
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+        )
+
+
+class MaskRCNNPredictor(nn.Sequential):
+    """
+    Mask head output: 2x transposed-conv upsample, ReLU, then a 1x1 conv producing per-class mask logits.
+    """
+
+    def __init__(self, in_channels, dim_reduced, num_classes):
+        upsample = nn.ConvTranspose2d(in_channels, dim_reduced, 2, 2, 0)
+        logits = nn.Conv2d(dim_reduced, num_classes, 1, 1, 0)
+        stages = OrderedDict()
+        stages["conv5_mask"] = upsample
+        stages["relu"] = nn.ReLU(inplace=True)
+        stages["mask_fcn_logits"] = logits
+        super().__init__(stages)
+
+        # Only the weights are re-initialised (in registration order); biases keep their default init.
+        for layer in (upsample, logits):
+            nn.init.kaiming_normal_(layer.weight, mode="fan_out", nonlinearity="relu")
+
+
+_COMMON_META = {  # merged into the ``meta`` dict of each Mask R-CNN weight entry below via ``**_COMMON_META``
+    "categories": _COCO_CATEGORIES,
+    "min_size": (1, 1),
+}
+
+
+class MaskRCNN_ResNet50_FPN_Weights(WeightsEnum):  # checkpoints accepted by maskrcnn_resnet50_fpn(weights=...)
+    COCO_V1 = Weights(
+        url="https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth",
+        transforms=ObjectDetection,
+        meta={
+            **_COMMON_META,
+            "num_params": 44401393,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/detection#mask-r-cnn",
+            "_metrics": {
+                "COCO-val2017": {
+                    "box_map": 37.9,
+                    "mask_map": 34.6,
+                }
+            },
+            "_ops": 134.38,  # NOTE(review): unit is not stated here, presumably GFLOPs; confirm
+            "_file_size": 169.84,  # NOTE(review): unit is not stated here, presumably MB; confirm
+            "_docs": """These weights were produced by following a similar training recipe as on the paper.""",
+        },
+    )
+    DEFAULT = COCO_V1  # DEFAULT is an alias of COCO_V1
+
+
+class MaskRCNN_ResNet50_FPN_V2_Weights(WeightsEnum):  # checkpoints accepted by maskrcnn_resnet50_fpn_v2(weights=...)
+    COCO_V1 = Weights(
+        url="https://download.pytorch.org/models/maskrcnn_resnet50_fpn_v2_coco-73cbd019.pth",
+        transforms=ObjectDetection,
+        meta={
+            **_COMMON_META,
+            "num_params": 46359409,
+            "recipe": "https://github.com/pytorch/vision/pull/5773",
+            "_metrics": {
+                "COCO-val2017": {
+                    "box_map": 47.4,
+                    "mask_map": 41.8,
+                }
+            },
+            "_ops": 333.577,  # NOTE(review): unit is not stated here, presumably GFLOPs; confirm
+            "_file_size": 177.219,  # NOTE(review): unit is not stated here, presumably MB; confirm
+            "_docs": """These weights were produced using an enhanced training recipe to boost the model accuracy.""",
+        },
+    )
+    DEFAULT = COCO_V1  # DEFAULT is an alias of COCO_V1
+
+
+@register_model()
+@handle_legacy_interface(
+    weights=("pretrained", MaskRCNN_ResNet50_FPN_Weights.COCO_V1),
+    weights_backbone=("pretrained_backbone", ResNet50_Weights.IMAGENET1K_V1),
+)
+def maskrcnn_resnet50_fpn(
+    *,
+    weights: Optional[MaskRCNN_ResNet50_FPN_Weights] = None,
+    progress: bool = True,
+    num_classes: Optional[int] = None,
+    weights_backbone: Optional[ResNet50_Weights] = ResNet50_Weights.IMAGENET1K_V1,
+    trainable_backbone_layers: Optional[int] = None,
+    **kwargs: Any,
+) -> MaskRCNN:
+    """Mask R-CNN model with a ResNet-50-FPN backbone from the `Mask R-CNN
+    <https://arxiv.org/abs/1703.06870>`_ paper.
+
+    .. betastatus:: detection module
+
+    The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
+    image, and should be in ``0-1`` range. Different images can have different sizes.
+
+    The behavior of the model changes depending on if it is in training or evaluation mode.
+
+    During training, the model expects both the input tensors and targets (list of dictionary),
+    containing:
+
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (``Int64Tensor[N]``): the class label for each ground-truth box
+        - masks (``UInt8Tensor[N, H, W]``): the segmentation binary masks for each instance
+
+    The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
+    losses for both the RPN and the R-CNN, and the mask loss.
+
+    During inference, the model requires only the input tensors, and returns the post-processed
+    predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
+    follows, where ``N`` is the number of detected instances:
+
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (``Int64Tensor[N]``): the predicted labels for each instance
+        - scores (``Tensor[N]``): the scores of each instance
+        - masks (``FloatTensor[N, 1, H, W]``): the predicted masks for each instance, in ``0-1`` range. In order to
+          obtain the final segmentation masks, the soft masks can be thresholded, generally
+          with a value of 0.5 (``mask >= 0.5``)
+
+    For more details on the output and on how to plot the masks, you may refer to :ref:`instance_seg_output`.
+
+    Mask R-CNN is exportable to ONNX for a fixed batch size with input images of fixed size.
+
+    Example::
+
+        >>> model = torchvision.models.detection.maskrcnn_resnet50_fpn(weights=MaskRCNN_ResNet50_FPN_Weights.DEFAULT)
+        >>> model.eval()
+        >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
+        >>> predictions = model(x)
+        >>>
+        >>> # optionally, if you want to export the model to ONNX:
+        >>> torch.onnx.export(model, x, "mask_rcnn.onnx", opset_version = 11)
+
+    Args:
+        weights (:class:`~torchvision.models.detection.MaskRCNN_ResNet50_FPN_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.detection.MaskRCNN_ResNet50_FPN_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        num_classes (int, optional): number of output classes of the model (including the background)
+        weights_backbone (:class:`~torchvision.models.ResNet50_Weights`, optional): The
+            pretrained weights for the backbone.
+        trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from
+            final block. Valid values are between 0 and 5, with 5 meaning all backbone layers are
+            trainable. If ``None`` is passed (the default) this value is set to 3.
+        **kwargs: parameters passed to the ``torchvision.models.detection.mask_rcnn.MaskRCNN``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/mask_rcnn.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.detection.MaskRCNN_ResNet50_FPN_Weights
+        :members:
+    """
+    weights = MaskRCNN_ResNet50_FPN_Weights.verify(weights)
+    weights_backbone = ResNet50_Weights.verify(weights_backbone)
+
+    if weights is not None:
+        weights_backbone = None  # backbone weights are not loaded separately when a full checkpoint is requested
+        num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
+    elif num_classes is None:
+        num_classes = 91  # fallback when neither weights nor num_classes is given
+
+    is_trained = weights is not None or weights_backbone is not None  # True when any pretrained weights are in use
+    trainable_backbone_layers = _validate_trainable_layers(is_trained, trainable_backbone_layers, 5, 3)
+    norm_layer = misc_nn_ops.FrozenBatchNorm2d if is_trained else nn.BatchNorm2d
+
+    backbone = resnet50(weights=weights_backbone, progress=progress, norm_layer=norm_layer)
+    backbone = _resnet_fpn_extractor(backbone, trainable_backbone_layers)
+    model = MaskRCNN(backbone, num_classes=num_classes, **kwargs)
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+        if weights == MaskRCNN_ResNet50_FPN_Weights.COCO_V1:  # applied only for this specific checkpoint
+            overwrite_eps(model, 0.0)
+
+    return model
+
+
+@register_model()
+@handle_legacy_interface(
+    weights=("pretrained", MaskRCNN_ResNet50_FPN_V2_Weights.COCO_V1),
+    weights_backbone=("pretrained_backbone", ResNet50_Weights.IMAGENET1K_V1),
+)
+def maskrcnn_resnet50_fpn_v2(
+    *,
+    weights: Optional[MaskRCNN_ResNet50_FPN_V2_Weights] = None,
+    progress: bool = True,
+    num_classes: Optional[int] = None,
+    weights_backbone: Optional[ResNet50_Weights] = None,
+    trainable_backbone_layers: Optional[int] = None,
+    **kwargs: Any,
+) -> MaskRCNN:
+    """Improved Mask R-CNN model with a ResNet-50-FPN backbone from the `Benchmarking Detection Transfer
+    Learning with Vision Transformers <https://arxiv.org/abs/2111.11429>`_ paper.
+
+    .. betastatus:: detection module
+
+    See :func:`~torchvision.models.detection.maskrcnn_resnet50_fpn` for more details.
+
+    Args:
+        weights (:class:`~torchvision.models.detection.MaskRCNN_ResNet50_FPN_V2_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.detection.MaskRCNN_ResNet50_FPN_V2_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        num_classes (int, optional): number of output classes of the model (including the background)
+        weights_backbone (:class:`~torchvision.models.ResNet50_Weights`, optional): The
+            pretrained weights for the backbone.
+        trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from
+            final block. Valid values are between 0 and 5, with 5 meaning all backbone layers are
+            trainable. If ``None`` is passed (the default) this value is set to 3.
+        **kwargs: parameters passed to the ``torchvision.models.detection.mask_rcnn.MaskRCNN``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/mask_rcnn.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.detection.MaskRCNN_ResNet50_FPN_V2_Weights
+        :members:
+    """
+    weights = MaskRCNN_ResNet50_FPN_V2_Weights.verify(weights)
+    weights_backbone = ResNet50_Weights.verify(weights_backbone)
+
+    if weights is not None:
+        weights_backbone = None  # backbone weights are not loaded separately when a full checkpoint is requested
+        num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
+    elif num_classes is None:
+        num_classes = 91  # fallback when neither weights nor num_classes is given
+
+    is_trained = weights is not None or weights_backbone is not None  # True when any pretrained weights are in use
+    trainable_backbone_layers = _validate_trainable_layers(is_trained, trainable_backbone_layers, 5, 3)
+
+    backbone = resnet50(weights=weights_backbone, progress=progress)
+    backbone = _resnet_fpn_extractor(backbone, trainable_backbone_layers, norm_layer=nn.BatchNorm2d)
+    rpn_anchor_generator = _default_anchorgen()
+    rpn_head = RPNHead(backbone.out_channels, rpn_anchor_generator.num_anchors_per_location()[0], conv_depth=2)
+    box_head = FastRCNNConvFCHead(
+        (backbone.out_channels, 7, 7), [256, 256, 256, 256], [1024], norm_layer=nn.BatchNorm2d
+    )
+    mask_head = MaskRCNNHeads(backbone.out_channels, [256, 256, 256, 256], 1, norm_layer=nn.BatchNorm2d)
+    model = MaskRCNN(  # the RPN, box and mask heads built above replace MaskRCNN's defaults
+        backbone,
+        num_classes=num_classes,
+        rpn_anchor_generator=rpn_anchor_generator,
+        rpn_head=rpn_head,
+        box_head=box_head,
+        mask_head=mask_head,
+        **kwargs,
+    )
+
+    if weights is not None:  # no overwrite_eps call follows the load in this variant
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+
+    return model
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/retinanet.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/retinanet.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd77749d2c13778d4fec1d845247c1ab6297c33c
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/retinanet.py
@@ -0,0 +1,903 @@
+import math
+import warnings
+from collections import OrderedDict
+from functools import partial
+from typing import Any, Callable, Optional
+
+import torch
+from torch import nn, Tensor
+
+from ...ops import boxes as box_ops, misc as misc_nn_ops, sigmoid_focal_loss
+from ...ops.feature_pyramid_network import LastLevelP6P7
+from ...transforms._presets import ObjectDetection
+from ...utils import _log_api_usage_once
+from .._api import register_model, Weights, WeightsEnum
+from .._meta import _COCO_CATEGORIES
+from .._utils import _ovewrite_value_param, handle_legacy_interface
+from ..resnet import resnet50, ResNet50_Weights
+from . import _utils as det_utils
+from ._utils import _box_loss, overwrite_eps
+from .anchor_utils import AnchorGenerator
+from .backbone_utils import _resnet_fpn_extractor, _validate_trainable_layers
+from .transform import GeneralizedRCNNTransform
+
+
+__all__ = [  # names exported by a star-import of this module
+    "RetinaNet",
+    "RetinaNet_ResNet50_FPN_Weights",
+    "RetinaNet_ResNet50_FPN_V2_Weights",
+    "retinanet_resnet50_fpn",
+    "retinanet_resnet50_fpn_v2",
+]
+
+
+def _sum(x: list[Tensor]) -> Tensor:  # add up the tensors in ``x``; ``x`` must be non-empty because ``x[0]`` seeds the sum
+    res = x[0]
+    for i in x[1:]:
+        res = res + i  # out-of-place add, so the tensors in ``x`` are not modified
+    return res
+
+
+def _v1_to_v2_weights(state_dict, prefix):  # rename ``{prefix}conv.{2*i}.*`` keys to ``{prefix}conv.{i}.0.*``, mutating ``state_dict`` in place
+    for i in range(4):  # four conv entries; the old layout used the even indices 0, 2, 4, 6
+        for type in ["weight", "bias"]:
+            old_key = f"{prefix}conv.{2*i}.{type}"
+            new_key = f"{prefix}conv.{i}.0.{type}"
+            if old_key in state_dict:  # keys that are absent are skipped silently
+                state_dict[new_key] = state_dict.pop(old_key)
+
+
+def _default_anchorgen():  # build the AnchorGenerator that RetinaNet falls back to when none is supplied
+    anchor_sizes = tuple((x, int(x * 2 ** (1.0 / 3)), int(x * 2 ** (2.0 / 3))) for x in [32, 64, 128, 256, 512])  # per base size x: (x, x*2^(1/3), x*2^(2/3)), truncated to int
+    aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)  # the same three aspect ratios for each of the five size tuples
+    anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
+    return anchor_generator
+
+
+class RetinaNetHead(nn.Module):
+    """
+    Combined head for RetinaNet: wraps a RetinaNetClassificationHead and a RetinaNetRegressionHead.
+
+    Args:
+        in_channels (int): number of channels of the input feature
+        num_anchors (int): number of anchors to be predicted
+        num_classes (int): number of classes to be predicted
+        norm_layer (callable, optional): Module specifying the normalization layer to use. Default: None
+    """
+
+    def __init__(self, in_channels, num_anchors, num_classes, norm_layer: Optional[Callable[..., nn.Module]] = None):
+        super().__init__()
+        self.classification_head = RetinaNetClassificationHead(
+            in_channels, num_anchors, num_classes, norm_layer=norm_layer
+        )
+        self.regression_head = RetinaNetRegressionHead(in_channels, num_anchors, norm_layer=norm_layer)
+
+    def compute_loss(self, targets, head_outputs, anchors, matched_idxs):
+        # type: (list[dict[str, Tensor]], dict[str, Tensor], list[Tensor], list[Tensor]) -> dict[str, Tensor]
+        return {
+            "classification": self.classification_head.compute_loss(targets, head_outputs, matched_idxs),  # the classification loss does not take the anchors
+            "bbox_regression": self.regression_head.compute_loss(targets, head_outputs, anchors, matched_idxs),
+        }
+
+    def forward(self, x):
+        # type: (list[Tensor]) -> dict[str, Tensor]
+        return {"cls_logits": self.classification_head(x), "bbox_regression": self.regression_head(x)}  # both sub-heads consume the same feature list
+
+
+class RetinaNetClassificationHead(nn.Module):
+    """
+    A classification head for use in RetinaNet.
+
+    Args:
+        in_channels (int): number of channels of the input feature
+        num_anchors (int): number of anchors to be predicted
+        num_classes (int): number of classes to be predicted
+        norm_layer (callable, optional): Module specifying the normalization layer to use. Default: None
+    """
+
+    _version = 2  # state dicts with no version, or a version below 2, get their conv keys renamed on load
+
+    def __init__(
+        self,
+        in_channels,
+        num_anchors,
+        num_classes,
+        prior_probability=0.01,  # sets the initial ``cls_logits`` bias to -log((1 - p) / p)
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ):
+        super().__init__()
+
+        conv = []
+        for _ in range(4):  # four Conv2dNormActivation blocks, each mapping in_channels -> in_channels
+            conv.append(misc_nn_ops.Conv2dNormActivation(in_channels, in_channels, norm_layer=norm_layer))
+        self.conv = nn.Sequential(*conv)
+
+        for layer in self.conv.modules():
+            if isinstance(layer, nn.Conv2d):
+                torch.nn.init.normal_(layer.weight, std=0.01)
+                if layer.bias is not None:
+                    torch.nn.init.constant_(layer.bias, 0)
+
+        self.cls_logits = nn.Conv2d(in_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1)  # one logit per (anchor, class) pair at each location
+        torch.nn.init.normal_(self.cls_logits.weight, std=0.01)
+        torch.nn.init.constant_(self.cls_logits.bias, -math.log((1 - prior_probability) / prior_probability))  # i.e. sigmoid(bias) == prior_probability
+
+        self.num_classes = num_classes
+        self.num_anchors = num_anchors
+
+        # This is to fix using det_utils.Matcher.BETWEEN_THRESHOLDS in TorchScript.
+        # TorchScript doesn't support class attributes.
+        # https://github.com/pytorch/vision/pull/1697#issuecomment-630255584
+        self.BETWEEN_THRESHOLDS = det_utils.Matcher.BETWEEN_THRESHOLDS
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        version = local_metadata.get("version", None)
+
+        if version is None or version < 2:  # older layout: rename the conv keys before the regular loading runs
+            _v1_to_v2_weights(state_dict, prefix)
+
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            strict,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    def compute_loss(self, targets, head_outputs, matched_idxs):
+        # type: (list[dict[str, Tensor]], dict[str, Tensor], list[Tensor]) -> Tensor
+        losses = []
+
+        cls_logits = head_outputs["cls_logits"]
+
+        for targets_per_image, cls_logits_per_image, matched_idxs_per_image in zip(targets, cls_logits, matched_idxs):
+            # determine only the foreground
+            foreground_idxs_per_image = matched_idxs_per_image >= 0
+            num_foreground = foreground_idxs_per_image.sum()
+
+            # create the target classification
+            gt_classes_target = torch.zeros_like(cls_logits_per_image)
+            gt_classes_target[
+                foreground_idxs_per_image,
+                targets_per_image["labels"][matched_idxs_per_image[foreground_idxs_per_image]],
+            ] = 1.0  # one-hot: set the matched ground-truth label's column for each foreground anchor
+
+            # find indices for which anchors should be ignored
+            valid_idxs_per_image = matched_idxs_per_image != self.BETWEEN_THRESHOLDS
+
+            # compute the classification loss
+            losses.append(
+                sigmoid_focal_loss(
+                    cls_logits_per_image[valid_idxs_per_image],
+                    gt_classes_target[valid_idxs_per_image],
+                    reduction="sum",
+                )
+                / max(1, num_foreground)  # normalise by the foreground count; max() avoids dividing by zero
+            )
+
+        return _sum(losses) / len(targets)  # mean of the per-image losses
+
+    def forward(self, x):
+        # type: (list[Tensor]) -> Tensor
+        all_cls_logits = []
+
+        for features in x:
+            cls_logits = self.conv(features)
+            cls_logits = self.cls_logits(cls_logits)
+
+            # Permute classification output from (N, A * K, H, W) to (N, HWA, K).
+            N, _, H, W = cls_logits.shape
+            cls_logits = cls_logits.view(N, -1, self.num_classes, H, W)
+            cls_logits = cls_logits.permute(0, 3, 4, 1, 2)
+            cls_logits = cls_logits.reshape(N, -1, self.num_classes)  # Size=(N, HWA, K)
+
+            all_cls_logits.append(cls_logits)
+
+        return torch.cat(all_cls_logits, dim=1)  # concatenate the feature levels along the anchor dimension
+
+
+class RetinaNetRegressionHead(nn.Module):
+    """
+    A regression head for use in RetinaNet.
+
+    Args:
+        in_channels (int): number of channels of the input feature
+        num_anchors (int): number of anchors to be predicted
+        norm_layer (callable, optional): Module specifying the normalization layer to use. Default: None
+    """
+
+    _version = 2  # state dicts with no version, or a version below 2, get their conv keys renamed on load
+
+    __annotations__ = {
+        "box_coder": det_utils.BoxCoder,
+    }
+
+    def __init__(self, in_channels, num_anchors, norm_layer: Optional[Callable[..., nn.Module]] = None):
+        super().__init__()
+
+        conv = []
+        for _ in range(4):  # four Conv2dNormActivation blocks, each mapping in_channels -> in_channels
+            conv.append(misc_nn_ops.Conv2dNormActivation(in_channels, in_channels, norm_layer=norm_layer))
+        self.conv = nn.Sequential(*conv)
+
+        self.bbox_reg = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=3, stride=1, padding=1)  # four regression values per anchor at each location
+        torch.nn.init.normal_(self.bbox_reg.weight, std=0.01)
+        torch.nn.init.zeros_(self.bbox_reg.bias)
+
+        for layer in self.conv.modules():
+            if isinstance(layer, nn.Conv2d):
+                torch.nn.init.normal_(layer.weight, std=0.01)
+                if layer.bias is not None:
+                    torch.nn.init.zeros_(layer.bias)
+
+        self.box_coder = det_utils.BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
+        self._loss_type = "l1"  # loss kind handed to _box_loss in compute_loss
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        version = local_metadata.get("version", None)
+
+        if version is None or version < 2:  # older layout: rename the conv keys before the regular loading runs
+            _v1_to_v2_weights(state_dict, prefix)
+
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            strict,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    def compute_loss(self, targets, head_outputs, anchors, matched_idxs):
+        # type: (list[dict[str, Tensor]], dict[str, Tensor], list[Tensor], list[Tensor]) -> Tensor
+        losses = []
+
+        bbox_regression = head_outputs["bbox_regression"]
+
+        for targets_per_image, bbox_regression_per_image, anchors_per_image, matched_idxs_per_image in zip(
+            targets, bbox_regression, anchors, matched_idxs
+        ):
+            # determine only the foreground indices, ignore the rest
+            foreground_idxs_per_image = torch.where(matched_idxs_per_image >= 0)[0]
+            num_foreground = foreground_idxs_per_image.numel()
+
+            # select only the foreground boxes
+            matched_gt_boxes_per_image = targets_per_image["boxes"][matched_idxs_per_image[foreground_idxs_per_image]]
+            bbox_regression_per_image = bbox_regression_per_image[foreground_idxs_per_image, :]
+            anchors_per_image = anchors_per_image[foreground_idxs_per_image, :]
+
+            # compute the loss
+            losses.append(
+                _box_loss(
+                    self._loss_type,
+                    self.box_coder,
+                    anchors_per_image,
+                    matched_gt_boxes_per_image,
+                    bbox_regression_per_image,
+                )
+                / max(1, num_foreground)  # normalise by the foreground count; max() avoids dividing by zero
+            )
+
+        return _sum(losses) / max(1, len(targets))  # mean of the per-image losses
+
+    def forward(self, x):
+        # type: (list[Tensor]) -> Tensor
+        all_bbox_regression = []
+
+        for features in x:
+            bbox_regression = self.conv(features)
+            bbox_regression = self.bbox_reg(bbox_regression)
+
+            # Permute bbox regression output from (N, 4 * A, H, W) to (N, HWA, 4).
+            N, _, H, W = bbox_regression.shape
+            bbox_regression = bbox_regression.view(N, -1, 4, H, W)
+            bbox_regression = bbox_regression.permute(0, 3, 4, 1, 2)
+            bbox_regression = bbox_regression.reshape(N, -1, 4)  # Size=(N, HWA, 4)
+
+            all_bbox_regression.append(bbox_regression)
+
+        return torch.cat(all_bbox_regression, dim=1)  # concatenate the feature levels along the anchor dimension
+
+
+class RetinaNet(nn.Module):
+    """
+    Implements RetinaNet.
+
+    The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
+    image, and should be in 0-1 range. Different images can have different sizes.
+
+    The behavior of the model changes depending on if it is in training or evaluation mode.
+
+    During training, the model expects both the input tensors and targets (list of dictionary),
+    containing:
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (Int64Tensor[N]): the class label for each ground-truth box
+
+    The model returns a Dict[Tensor] during training, containing the classification and regression
+    losses.
+
+    During inference, the model requires only the input tensors, and returns the post-processed
+    predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
+    follows:
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (Int64Tensor[N]): the predicted labels for each image
+        - scores (Tensor[N]): the scores for each prediction
+
+    Args:
+        backbone (nn.Module): the network used to compute the features for the model.
+            It should contain an out_channels attribute, which indicates the number of output
+            channels that each feature map has (and it should be the same for all feature maps).
+            The backbone should return a single Tensor or an OrderedDict[Tensor].
+        num_classes (int): number of output classes of the model (including the background).
+        min_size (int): Images are rescaled before feeding them to the backbone:
+            we attempt to preserve the aspect ratio and scale the shorter edge
+            to ``min_size``. If the resulting longer edge exceeds ``max_size``,
+            then downscale so that the longer edge does not exceed ``max_size``.
+            This may result in the shorter edge being lower than ``min_size``.
+        max_size (int): See ``min_size``.
+        image_mean (Tuple[float, float, float]): mean values used for input normalization.
+            They are generally the mean values of the dataset on which the backbone has been trained
+            on
+        image_std (Tuple[float, float, float]): std values used for input normalization.
+            They are generally the std values of the dataset on which the backbone has been trained on
+        anchor_generator (AnchorGenerator): module that generates the anchors for a set of feature
+            maps.
+        head (nn.Module): Module run on top of the feature pyramid.
+            Defaults to a module containing a classification and regression module.
+        score_thresh (float): Score threshold used for postprocessing the detections.
+        nms_thresh (float): NMS threshold used for postprocessing the detections.
+        detections_per_img (int): Number of best detections to keep after NMS.
+        fg_iou_thresh (float): minimum IoU between the anchor and the GT box so that they can be
+            considered as positive during training.
+        bg_iou_thresh (float): maximum IoU between the anchor and the GT box so that they can be
+            considered as negative during training.
+        topk_candidates (int): Number of best detections to keep before NMS.
+
+    Example:
+
+        >>> import torch
+        >>> import torchvision
+        >>> from torchvision.models.detection import RetinaNet
+        >>> from torchvision.models.detection.anchor_utils import AnchorGenerator
+        >>> # load a pre-trained model for classification and return
+        >>> # only the features
+        >>> backbone = torchvision.models.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).features
+        >>> # RetinaNet needs to know the number of
+        >>> # output channels in a backbone. For mobilenet_v2, it's 1280,
+        >>> # so we need to add it here
+        >>> backbone.out_channels = 1280
+        >>>
+        >>> # let's make the network generate 5 x 3 anchors per spatial
+        >>> # location, with 5 different sizes and 3 different aspect
+        >>> # ratios. We have a Tuple[Tuple[int]] because each feature
+        >>> # map could potentially have different sizes and
+        >>> # aspect ratios
+        >>> anchor_generator = AnchorGenerator(
+        >>>     sizes=((32, 64, 128, 256, 512),),
+        >>>     aspect_ratios=((0.5, 1.0, 2.0),)
+        >>> )
+        >>>
+        >>> # put the pieces together inside a RetinaNet model
+        >>> model = RetinaNet(backbone,
+        >>>                   num_classes=2,
+        >>>                   anchor_generator=anchor_generator)
+        >>> model.eval()
+        >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
+        >>> predictions = model(x)
+    """
+
+    __annotations__ = {
+        "box_coder": det_utils.BoxCoder,
+        "proposal_matcher": det_utils.Matcher,
+    }
+
+    def __init__(
+        self,
+        backbone,
+        num_classes,
+        # transform parameters
+        min_size=800,
+        max_size=1333,
+        image_mean=None,
+        image_std=None,
+        # Anchor parameters
+        anchor_generator=None,
+        head=None,
+        proposal_matcher=None,  # None -> det_utils.Matcher(fg_iou_thresh, bg_iou_thresh, allow_low_quality_matches=True)
+        score_thresh=0.05,
+        nms_thresh=0.5,
+        detections_per_img=300,
+        fg_iou_thresh=0.5,
+        bg_iou_thresh=0.4,
+        topk_candidates=1000,
+        **kwargs,  # forwarded unchanged to GeneralizedRCNNTransform
+    ):
+        super().__init__()
+        _log_api_usage_once(self)
+
+        if not hasattr(backbone, "out_channels"):
+            raise ValueError(
+                "backbone should contain an attribute out_channels "
+                "specifying the number of output channels (assumed to be the "
+                "same for all the levels)"
+            )
+        self.backbone = backbone
+
+        if not isinstance(anchor_generator, (AnchorGenerator, type(None))):
+            raise TypeError(
+                f"anchor_generator should be of type AnchorGenerator or None instead of {type(anchor_generator)}"
+            )
+
+        if anchor_generator is None:
+            anchor_generator = _default_anchorgen()
+        self.anchor_generator = anchor_generator
+
+        if head is None:
+            head = RetinaNetHead(backbone.out_channels, anchor_generator.num_anchors_per_location()[0], num_classes)  # only the first entry of num_anchors_per_location() is used
+        self.head = head
+
+        if proposal_matcher is None:
+            proposal_matcher = det_utils.Matcher(
+                fg_iou_thresh,
+                bg_iou_thresh,
+                allow_low_quality_matches=True,
+            )
+        self.proposal_matcher = proposal_matcher
+
+        self.box_coder = det_utils.BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
+
+        if image_mean is None:
+            image_mean = [0.485, 0.456, 0.406]
+        if image_std is None:
+            image_std = [0.229, 0.224, 0.225]
+        self.transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std, **kwargs)
+
+        self.score_thresh = score_thresh
+        self.nms_thresh = nms_thresh
+        self.detections_per_img = detections_per_img
+        self.topk_candidates = topk_candidates
+
+        # used only on torchscript mode
+        self._has_warned = False
+
+    @torch.jit.unused
+    def eager_outputs(self, losses, detections):
+        # type: (dict[str, Tensor], list[dict[str, Tensor]]) -> tuple[dict[str, Tensor], list[dict[str, Tensor]]]
+        if self.training:
+            return losses  # training mode: only the loss dict is returned
+
+        return detections  # eval mode: only the detections are returned
+
+    def compute_loss(self, targets, head_outputs, anchors):
+        # type: (list[dict[str, Tensor]], dict[str, Tensor], list[Tensor]) -> dict[str, Tensor]
+        matched_idxs = []
+        for anchors_per_image, targets_per_image in zip(anchors, targets):
+            if targets_per_image["boxes"].numel() == 0:  # no ground-truth boxes: mark every anchor as unmatched (-1)
+                matched_idxs.append(
+                    torch.full((anchors_per_image.size(0),), -1, dtype=torch.int64, device=anchors_per_image.device)
+                )
+                continue
+
+            match_quality_matrix = box_ops.box_iou(targets_per_image["boxes"], anchors_per_image)  # IoU of every ground-truth box against every anchor
+            matched_idxs.append(self.proposal_matcher(match_quality_matrix))
+
+        return self.head.compute_loss(targets, head_outputs, anchors, matched_idxs)
+
+    def postprocess_detections(self, head_outputs, anchors, image_shapes):
+        # type: (dict[str, list[Tensor]], list[list[Tensor]], list[tuple[int, int]]) -> list[dict[str, Tensor]]
+        class_logits = head_outputs["cls_logits"]
+        box_regression = head_outputs["bbox_regression"]
+
+        num_images = len(image_shapes)
+
+        detections: list[dict[str, Tensor]] = []
+
+        for index in range(num_images):
+            box_regression_per_image = [br[index] for br in box_regression]
+            logits_per_image = [cl[index] for cl in class_logits]
+            anchors_per_image, image_shape = anchors[index], image_shapes[index]
+
+            image_boxes = []
+            image_scores = []
+            image_labels = []
+
+            for box_regression_per_level, logits_per_level, anchors_per_level in zip(
+                box_regression_per_image, logits_per_image, anchors_per_image
+            ):
+                num_classes = logits_per_level.shape[-1]
+
+                # remove low scoring boxes
+                scores_per_level = torch.sigmoid(logits_per_level).flatten()
+                keep_idxs = scores_per_level > self.score_thresh
+                scores_per_level = scores_per_level[keep_idxs]
+                topk_idxs = torch.where(keep_idxs)[0]  # positions in the flattened (anchor, class) score vector
+
+                # keep only topk scoring predictions
+                num_topk = det_utils._topk_min(topk_idxs, self.topk_candidates, 0)
+                scores_per_level, idxs = scores_per_level.topk(num_topk)
+                topk_idxs = topk_idxs[idxs]
+
+                anchor_idxs = torch.div(topk_idxs, num_classes, rounding_mode="floor")  # flat index -> anchor index
+                labels_per_level = topk_idxs % num_classes  # flat index -> class index
+
+                boxes_per_level = self.box_coder.decode_single(
+                    box_regression_per_level[anchor_idxs], anchors_per_level[anchor_idxs]
+                )
+                boxes_per_level = box_ops.clip_boxes_to_image(boxes_per_level, image_shape)
+
+                image_boxes.append(boxes_per_level)
+                image_scores.append(scores_per_level)
+                image_labels.append(labels_per_level)
+
+            image_boxes = torch.cat(image_boxes, dim=0)
+            image_scores = torch.cat(image_scores, dim=0)
+            image_labels = torch.cat(image_labels, dim=0)
+
+            # non-maximum suppression
+            keep = box_ops.batched_nms(image_boxes, image_scores, image_labels, self.nms_thresh)
+            keep = keep[: self.detections_per_img]
+
+            detections.append(
+                {
+                    "boxes": image_boxes[keep],
+                    "scores": image_scores[keep],
+                    "labels": image_labels[keep],
+                }
+            )
+
+        return detections
+
+    def forward(self, images, targets=None):
+        # type: (list[Tensor], Optional[list[dict[str, Tensor]]]) -> tuple[dict[str, Tensor], list[dict[str, Tensor]]]
+        """
+        Args:
+            images (list[Tensor]): images to be processed
+            targets (list[Dict[Tensor]]): ground-truth boxes present in the image (optional)
+
+        Returns:
+            result (dict[str, Tensor] or list[dict[str, Tensor]]): the output from the model.
+                During training, it returns a dict[str, Tensor] which contains the losses.
+                During testing, it returns one dict per image with the fields
+                `boxes`, `scores` and `labels`. When scripted, both are returned as a tuple.
+
+        """
+        if self.training:
+            if targets is None:
+                torch._assert(False, "targets should not be none when in training mode")
+            else:
+                for target in targets:
+                    boxes = target["boxes"]
+                    torch._assert(isinstance(boxes, torch.Tensor), "Expected target boxes to be of type Tensor.")
+                    torch._assert(
+                        len(boxes.shape) == 2 and boxes.shape[-1] == 4,
+                        "Expected target boxes to be a tensor of shape [N, 4].",
+                    )
+
+        # get the original image sizes
+        original_image_sizes: list[tuple[int, int]] = []
+        for img in images:
+            val = img.shape[-2:]
+            torch._assert(
+                len(val) == 2,
+                f"expecting the last two dimensions of the Tensor to be H and W instead got {img.shape[-2:]}",
+            )
+            original_image_sizes.append((val[0], val[1]))
+
+        # transform the input
+        images, targets = self.transform(images, targets)
+
+        # Check for degenerate boxes
+        # TODO: Move this to a function
+        if targets is not None:
+            for target_idx, target in enumerate(targets):
+                boxes = target["boxes"]
+                degenerate_boxes = boxes[:, 2:] <= boxes[:, :2]  # True where x2 <= x1 or y2 <= y1
+                if degenerate_boxes.any():
+                    # print the first degenerate box
+                    bb_idx = torch.where(degenerate_boxes.any(dim=1))[0][0]
+                    degen_bb: list[float] = boxes[bb_idx].tolist()
+                    torch._assert(
+                        False,
+                        "All bounding boxes should have positive height and width."
+                        f" Found invalid box {degen_bb} for target at index {target_idx}.",
+                    )
+
+        # get the features from the backbone
+        features = self.backbone(images.tensors)
+        if isinstance(features, torch.Tensor):
+            features = OrderedDict([("0", features)])
+
+        # TODO: Do we want a list or a dict?
+        features = list(features.values())
+
+        # compute the retinanet heads outputs using the features
+        head_outputs = self.head(features)
+
+        # create the set of anchors
+        anchors = self.anchor_generator(images, features)
+
+        losses = {}
+        detections: list[dict[str, Tensor]] = []
+        if self.training:
+            if targets is None:  # NOTE(review): repeats the check at the top of forward, presumably for TorchScript Optional refinement; confirm
+                torch._assert(False, "targets should not be none when in training mode")
+            else:
+                # compute the losses
+                losses = self.compute_loss(targets, head_outputs, anchors)
+        else:
+            # recover level sizes
+            num_anchors_per_level = [x.size(2) * x.size(3) for x in features]
+            HW = 0
+            for v in num_anchors_per_level:
+                HW += v
+            HWA = head_outputs["cls_logits"].size(1)
+            A = HWA // HW  # anchors per location; this division assumes the same A on every level
+            num_anchors_per_level = [hw * A for hw in num_anchors_per_level]
+
+            # split outputs per level
+            split_head_outputs: dict[str, list[Tensor]] = {}
+            for k in head_outputs:
+                split_head_outputs[k] = list(head_outputs[k].split(num_anchors_per_level, dim=1))
+            split_anchors = [list(a.split(num_anchors_per_level)) for a in anchors]
+
+            # compute the detections
+            detections = self.postprocess_detections(split_head_outputs, split_anchors, images.image_sizes)
+            detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes)
+
+        if torch.jit.is_scripting():
+            if not self._has_warned:
+                warnings.warn("RetinaNet always returns a (Losses, Detections) tuple in scripting")
+                self._has_warned = True
+            return losses, detections
+        return self.eager_outputs(losses, detections)
+
+
+_COMMON_META = {  # metadata entries spread into the ``meta`` dict of both RetinaNet weight enums
+    "categories": _COCO_CATEGORIES,
+    "min_size": (1, 1),
+}
+
+
+class RetinaNet_ResNet50_FPN_Weights(WeightsEnum):  # weight enum accepted by ``retinanet_resnet50_fpn(weights=...)``
+    COCO_V1 = Weights(
+        url="https://download.pytorch.org/models/retinanet_resnet50_fpn_coco-eeacb38b.pth",
+        transforms=ObjectDetection,
+        meta={
+            **_COMMON_META,
+            "num_params": 34014999,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/detection#retinanet",
+            "_metrics": {
+                "COCO-val2017": {
+                    "box_map": 36.4,
+                }
+            },
+            "_ops": 151.54,
+            "_file_size": 130.267,
+            "_docs": """These weights were produced by following a similar training recipe as on the paper.""",
+        },
+    )
+    DEFAULT = COCO_V1  # alias: DEFAULT refers to the same entry as COCO_V1
+
+
+class RetinaNet_ResNet50_FPN_V2_Weights(WeightsEnum):  # weight enum whose URL points at the ``retinanet_resnet50_fpn_v2`` COCO checkpoint
+    COCO_V1 = Weights(
+        url="https://download.pytorch.org/models/retinanet_resnet50_fpn_v2_coco-5905b1c5.pth",
+        transforms=ObjectDetection,
+        meta={
+            **_COMMON_META,
+            "num_params": 38198935,
+            "recipe": "https://github.com/pytorch/vision/pull/5756",
+            "_metrics": {
+                "COCO-val2017": {
+                    "box_map": 41.5,
+                }
+            },
+            "_ops": 152.238,
+            "_file_size": 146.037,
+            "_docs": """These weights were produced using an enhanced training recipe to boost the model accuracy.""",
+        },
+    )
+    DEFAULT = COCO_V1  # alias: DEFAULT refers to the same entry as COCO_V1
+
+
+@register_model()
+@handle_legacy_interface(
+    weights=("pretrained", RetinaNet_ResNet50_FPN_Weights.COCO_V1),
+    weights_backbone=("pretrained_backbone", ResNet50_Weights.IMAGENET1K_V1),
+)
+def retinanet_resnet50_fpn(
+    *,
+    weights: Optional[RetinaNet_ResNet50_FPN_Weights] = None,
+    progress: bool = True,
+    num_classes: Optional[int] = None,
+    weights_backbone: Optional[ResNet50_Weights] = ResNet50_Weights.IMAGENET1K_V1,
+    trainable_backbone_layers: Optional[int] = None,
+    **kwargs: Any,
+) -> RetinaNet:
+    """
+    Constructs a RetinaNet model with a ResNet-50-FPN backbone.
+
+    .. betastatus:: detection module
+
+    Reference: `Focal Loss for Dense Object Detection <https://arxiv.org/abs/1708.02002>`_.
+
+    The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
+    image, and should be in ``0-1`` range. Different images can have different sizes.
+
+    The behavior of the model changes depending on if it is in training or evaluation mode.
+
+    During training, the model expects both the input tensors and targets (list of dictionary),
+    containing:
+
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (``Int64Tensor[N]``): the class label for each ground-truth box
+
+    The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
+    losses.
+
+    During inference, the model requires only the input tensors, and returns the post-processed
+    predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
+    follows, where ``N`` is the number of detections:
+
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (``Int64Tensor[N]``): the predicted labels for each detection
+        - scores (``Tensor[N]``): the scores of each detection
+
+    For more details on the output, you may refer to :ref:`instance_seg_output`.
+
+    Example::
+
+        >>> model = torchvision.models.detection.retinanet_resnet50_fpn(weights=RetinaNet_ResNet50_FPN_Weights.DEFAULT)
+        >>> model.eval()
+        >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
+        >>> predictions = model(x)
+
+    Args:
+        weights (:class:`~torchvision.models.detection.RetinaNet_ResNet50_FPN_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.detection.RetinaNet_ResNet50_FPN_Weights`
+            below for more details, and possible values. By default, no
+            pre-trained weights are used.
+        progress (bool): If True, displays a progress bar of the download to stderr. Default is True.
+        num_classes (int, optional): number of output classes of the model (including the background)
+        weights_backbone (:class:`~torchvision.models.ResNet50_Weights`, optional): The pretrained weights for
+            the backbone.
+        trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from final block.
+            Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable. If ``None`` is
+            passed (the default) this value is set to 3.
+        **kwargs: parameters passed to the ``torchvision.models.detection.RetinaNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/retinanet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.detection.RetinaNet_ResNet50_FPN_Weights
+        :members:
+    """
+    weights = RetinaNet_ResNet50_FPN_Weights.verify(weights)
+    weights_backbone = ResNet50_Weights.verify(weights_backbone)
+
+    if weights is not None:
+        weights_backbone = None
+        num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
+    elif num_classes is None:
+        num_classes = 91
+
+    is_trained = weights is not None or weights_backbone is not None
+    trainable_backbone_layers = _validate_trainable_layers(is_trained, trainable_backbone_layers, 5, 3)
+    norm_layer = misc_nn_ops.FrozenBatchNorm2d if is_trained else nn.BatchNorm2d
+
+    backbone = resnet50(weights=weights_backbone, progress=progress, norm_layer=norm_layer)
+    # skip P2 because it generates too many anchors (according to their paper)
+    backbone = _resnet_fpn_extractor(
+        backbone, trainable_backbone_layers, returned_layers=[2, 3, 4], extra_blocks=LastLevelP6P7(256, 256)
+    )
+    model = RetinaNet(backbone, num_classes, **kwargs)
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+        if weights == RetinaNet_ResNet50_FPN_Weights.COCO_V1:
+            overwrite_eps(model, 0.0)
+
+    return model
+
+
+@register_model()
+@handle_legacy_interface(
+    weights=("pretrained", RetinaNet_ResNet50_FPN_V2_Weights.COCO_V1),
+    weights_backbone=("pretrained_backbone", ResNet50_Weights.IMAGENET1K_V1),
+)
+def retinanet_resnet50_fpn_v2(
+    *,
+    weights: Optional[RetinaNet_ResNet50_FPN_V2_Weights] = None,
+    progress: bool = True,
+    num_classes: Optional[int] = None,
+    weights_backbone: Optional[ResNet50_Weights] = None,
+    trainable_backbone_layers: Optional[int] = None,
+    **kwargs: Any,
+) -> RetinaNet:
+    """
+    Constructs an improved RetinaNet model with a ResNet-50-FPN backbone.
+
+    .. betastatus:: detection module
+
+    Reference: `Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive Training Sample Selection
+    <https://arxiv.org/abs/1912.02424>`_.
+
+    See :func:`~torchvision.models.detection.retinanet_resnet50_fpn` for more details.
+
+    Args:
+        weights (:class:`~torchvision.models.detection.RetinaNet_ResNet50_FPN_V2_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.detection.RetinaNet_ResNet50_FPN_V2_Weights`
+            below for more details, and possible values. By default, no
+            pre-trained weights are used.
+        progress (bool): If True, displays a progress bar of the download to stderr. Default is True.
+        num_classes (int, optional): number of output classes of the model (including the background).
+            If ``None`` and no ``weights`` are given, 91 is used.
+        weights_backbone (:class:`~torchvision.models.ResNet50_Weights`, optional): The pretrained weights for
+            the backbone. Ignored when ``weights`` is given.
+        trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from final block.
+            Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable. If ``None`` is
+            passed (the default) this value is set to 3.
+        **kwargs: parameters passed to the ``torchvision.models.detection.RetinaNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/retinanet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.detection.RetinaNet_ResNet50_FPN_V2_Weights
+        :members:
+    """
+    # Run both weight arguments through their enum's ``verify`` before use.
+    weights = RetinaNet_ResNet50_FPN_V2_Weights.verify(weights)
+    weights_backbone = ResNet50_Weights.verify(weights_backbone)
+
+    if weights is not None:
+        # Detector weights take precedence: separate backbone weights are dropped
+        # (presumably because the detector checkpoint already covers the backbone).
+        weights_backbone = None
+        # Reconcile num_classes with the category count stored in the weights metadata.
+        # NOTE(review): helper is defined elsewhere; its handling of a mismatch is not visible here.
+        num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
+    elif num_classes is None:
+        # Fallback when neither weights nor an explicit class count were supplied.
+        num_classes = 91
+
+    is_trained = weights is not None or weights_backbone is not None
+    trainable_backbone_layers = _validate_trainable_layers(is_trained, trainable_backbone_layers, 5, 3)
+
+    backbone = resnet50(weights=weights_backbone, progress=progress)
+    # FPN over ResNet layers 2-4; the extra P6/P7 block is built with 2048 input and 256 output channels.
+    backbone = _resnet_fpn_extractor(
+        backbone, trainable_backbone_layers, returned_layers=[2, 3, 4], extra_blocks=LastLevelP6P7(2048, 256)
+    )
+    anchor_generator = _default_anchorgen()
+    # Custom head: GroupNorm with 32 groups as the normalization layer.
+    head = RetinaNetHead(
+        backbone.out_channels,
+        anchor_generator.num_anchors_per_location()[0],
+        num_classes,
+        norm_layer=partial(nn.GroupNorm, 32),
+    )
+    # Switch the regression head's loss type to "giou".
+    head.regression_head._loss_type = "giou"
+    model = RetinaNet(backbone, num_classes, anchor_generator=anchor_generator, head=head, **kwargs)
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+
+    return model
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/roi_heads.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/roi_heads.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e7216745370f87e2dd5fdb2ce926dcb0a188e6d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/roi_heads.py
@@ -0,0 +1,878 @@
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+import torchvision
+from torch import nn
+from torchvision.ops import boxes as box_ops, roi_align
+
+from . import _utils as det_utils
+
+
+def fastrcnn_loss(
+    class_logits: torch.Tensor,
+    box_regression: torch.Tensor,
+    labels: list[torch.Tensor],
+    regression_targets: list[torch.Tensor],
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Computes the loss for Faster R-CNN.
+
+    Args:
+        class_logits (Tensor): classification logits, unpacked below as a 2-D ``[N, num_classes]`` tensor
+        box_regression (Tensor): box regression outputs, reshaped below to ``[N, -1, 4]``
+        labels (list[Tensor]): per-image label tensors; concatenated along dim 0
+        regression_targets (list[Tensor]): per-image regression targets; concatenated along dim 0
+
+    Returns:
+        classification_loss (Tensor): cross-entropy between ``class_logits`` and the concatenated labels
+        box_loss (Tensor): smooth-L1 loss summed over rows with label > 0, divided by the total
+            number of labels
+    """
+
+    labels = torch.cat(labels, dim=0)
+    regression_targets = torch.cat(regression_targets, dim=0)
+
+    classification_loss = F.cross_entropy(class_logits, labels)
+
+    # get indices that correspond to the regression targets for
+    # the corresponding ground truth labels, to be used with
+    # advanced indexing
+    sampled_pos_inds_subset = torch.where(labels > 0)[0]
+    labels_pos = labels[sampled_pos_inds_subset]
+    # num_classes is unpacked but not used; the unpacking requires class_logits to be 2-D
+    N, num_classes = class_logits.shape
+    box_regression = box_regression.reshape(N, box_regression.size(-1) // 4, 4)
+
+    # for each positive row, pick the 4 regression values belonging to its own label
+    box_loss = F.smooth_l1_loss(
+        box_regression[sampled_pos_inds_subset, labels_pos],
+        regression_targets[sampled_pos_inds_subset],
+        beta=1 / 9,
+        reduction="sum",
+    )
+    # normalized by the count of all labels, not only the positive ones
+    box_loss = box_loss / labels.numel()
+
+    return classification_loss, box_loss
+
+
+def maskrcnn_inference(x: torch.Tensor, labels: list[torch.Tensor]) -> list[torch.Tensor]:
+    """
+    Post-processes the mask logits: applies a sigmoid and, for every mask,
+    keeps only the channel selected by the corresponding label.
+
+    Args:
+        x (Tensor): the mask logits; dim 0 indexes masks and dim 1 is indexed by label
+        labels (list[Tensor]): label tensors, one for each image; their lengths
+            determine how the result is split back per image
+
+    Returns:
+        results: per-image chunks of mask probabilities (the result of
+            ``Tensor.split``), each with a singleton dim inserted at position 1
+    """
+    mask_prob = x.sigmoid()
+
+    # select masks corresponding to the predicted classes
+    num_masks = x.shape[0]
+    boxes_per_image = [label.shape[0] for label in labels]
+    labels = torch.cat(labels)
+    index = torch.arange(num_masks, device=labels.device)
+    # advanced indexing picks mask_prob[i, labels[i]]; [:, None] re-adds a channel dim
+    mask_prob = mask_prob[index, labels][:, None]
+    mask_prob = mask_prob.split(boxes_per_image, dim=0)
+
+    return mask_prob
+
+
+def project_masks_on_boxes(gt_masks, boxes, matched_idxs, M):
+    # type: (Tensor, Tensor, Tensor, int) -> Tensor
+    """
+    Given segmentation masks and the bounding boxes corresponding
+    to the location of the masks in the image, this function
+    crops and resizes the masks in the position defined by the
+    boxes. This prepares the masks for them to be fed to the
+    loss computation as the targets.
+
+    ``matched_idxs`` selects, for each box, which entry of ``gt_masks``
+    is cropped; the result has one ``M x M`` crop per box.
+    """
+    # cast the indices to the dtype/device of boxes so they can be concatenated with them
+    matched_idxs = matched_idxs.to(boxes)
+    # each roi row is [index into gt_masks, box coordinates]
+    rois = torch.cat([matched_idxs[:, None], boxes], dim=1)
+    # add a channel dim and match the dtype/device of rois
+    gt_masks = gt_masks[:, None].to(rois)
+    # output size (M, M), spatial scale 1.0; [:, 0] drops the channel dim again
+    return roi_align(gt_masks, rois, (M, M), 1.0)[:, 0]
+
+
+def maskrcnn_loss(mask_logits, proposals, gt_masks, gt_labels, mask_matched_idxs):
+    # type: (Tensor, list[Tensor], list[Tensor], list[Tensor], list[Tensor]) -> Tensor
+    """
+    Computes the mask loss as a binary cross-entropy between the logits of the
+    matched class and the ground-truth masks projected onto the proposals.
+
+    Args:
+        mask_logits (Tensor): mask logits; dim 1 is indexed by label and the last
+            dim gives the mask resolution
+        proposals (list[Tensor]): per-image boxes onto which the masks are projected
+        gt_masks (list[Tensor]): per-image ground-truth masks
+        gt_labels (list[Tensor]): per-image ground-truth labels
+        mask_matched_idxs (list[Tensor]): per-image index of the ground-truth
+            entry matched to each proposal
+
+    Return:
+        mask_loss (Tensor): scalar tensor containing the loss
+    """
+
+    discretization_size = mask_logits.shape[-1]
+    # label of the ground-truth entry matched to each proposal
+    labels = [gt_label[idxs] for gt_label, idxs in zip(gt_labels, mask_matched_idxs)]
+    mask_targets = [
+        project_masks_on_boxes(m, p, i, discretization_size) for m, p, i in zip(gt_masks, proposals, mask_matched_idxs)
+    ]
+
+    labels = torch.cat(labels, dim=0)
+    mask_targets = torch.cat(mask_targets, dim=0)
+
+    # torch.mean (in binary_cross_entropy_with_logits) doesn't
+    # accept empty tensors, so handle it separately
+    if mask_targets.numel() == 0:
+        # zero-valued result that is still computed from mask_logits
+        return mask_logits.sum() * 0
+
+    # only the logit channel of each proposal's matched label is compared with its target
+    mask_loss = F.binary_cross_entropy_with_logits(
+        mask_logits[torch.arange(labels.shape[0], device=labels.device), labels], mask_targets
+    )
+    return mask_loss
+
+
+def keypoints_to_heatmap(keypoints, rois, heatmap_size):
+    # type: (Tensor, Tensor, int) -> tuple[Tensor, Tensor]
+    """
+    Converts keypoint coordinates into flattened heatmap bin indices relative to their RoIs.
+
+    Args:
+        keypoints (Tensor): the last dim is read as (x, y, visibility);
+            presumably shaped [#rois, #keypoints, 3] - confirm with the caller
+        rois (Tensor): boxes read as columns (x1, y1, x2, y2), one row per keypoint set
+        heatmap_size (int): side length of the square heatmap grid
+
+    Returns:
+        heatmaps (Tensor): ``y * heatmap_size + x`` bin index per keypoint, zeroed where invalid
+        valid (Tensor): int64 mask, 1 where the bin lies inside the grid and visibility > 0
+    """
+    offset_x = rois[:, 0]
+    offset_y = rois[:, 1]
+    scale_x = heatmap_size / (rois[:, 2] - rois[:, 0])
+    scale_y = heatmap_size / (rois[:, 3] - rois[:, 1])
+
+    # add a trailing dim so the per-roi values broadcast over the keypoints
+    offset_x = offset_x[:, None]
+    offset_y = offset_y[:, None]
+    scale_x = scale_x[:, None]
+    scale_y = scale_y[:, None]
+
+    x = keypoints[..., 0]
+    y = keypoints[..., 1]
+
+    # keypoints lying exactly on the right / bottom roi edge
+    x_boundary_inds = x == rois[:, 2][:, None]
+    y_boundary_inds = y == rois[:, 3][:, None]
+
+    x = (x - offset_x) * scale_x
+    x = x.floor().long()
+    y = (y - offset_y) * scale_y
+    y = y.floor().long()
+
+    # edge keypoints would land on bin == heatmap_size; put them in the last bin instead
+    x[x_boundary_inds] = heatmap_size - 1
+    y[y_boundary_inds] = heatmap_size - 1
+
+    valid_loc = (x >= 0) & (y >= 0) & (x < heatmap_size) & (y < heatmap_size)
+    vis = keypoints[..., 2] > 0
+    valid = (valid_loc & vis).long()
+
+    # row-major linear index into the heatmap; multiplied by valid so invalid entries become 0
+    lin_ind = y * heatmap_size + x
+    heatmaps = lin_ind * valid
+
+    return heatmaps, valid
+
+
+def _onnx_heatmaps_to_keypoints(
+    maps, maps_i, roi_map_width, roi_map_height, widths_i, heights_i, offset_x_i, offset_y_i
+):
+    """
+    Per-RoI keypoint extraction used by ``_onnx_heatmaps_to_keypoints_loop``.
+
+    Resizes the heatmaps of one RoI (``maps_i``) to ``roi_map_height x roi_map_width``,
+    takes the argmax location of every keypoint channel and converts it to
+    coordinates using the width/height corrections and the x/y offsets.
+    ``maps`` is only used to read the number of keypoints (``maps.size(1)``).
+
+    Returns:
+        xy_preds_i (Tensor): float32, stacked along dim 0 as (x, y, ones)
+        end_scores_i (Tensor): value of the resized heatmap at each keypoint's argmax location
+    """
+    num_keypoints = torch.scalar_tensor(maps.size(1), dtype=torch.int64)
+
+    width_correction = widths_i / roi_map_width
+    height_correction = heights_i / roi_map_height
+
+    # treat the keypoint channels as a batch of single-channel images for the resize
+    roi_map = F.interpolate(
+        maps_i[:, None], size=(int(roi_map_height), int(roi_map_width)), mode="bicubic", align_corners=False
+    )[:, 0]
+
+    w = torch.scalar_tensor(roi_map.size(2), dtype=torch.int64)
+    pos = roi_map.reshape(num_keypoints, -1).argmax(dim=1)
+
+    # decompose the flat argmax position into column (x) and row (y)
+    x_int = pos % w
+    y_int = (pos - x_int) // w
+
+    # + 0.5 moves from the discrete bin index to the bin centre
+    x = (torch.tensor(0.5, dtype=torch.float32) + x_int.to(dtype=torch.float32)) * width_correction.to(
+        dtype=torch.float32
+    )
+    y = (torch.tensor(0.5, dtype=torch.float32) + y_int.to(dtype=torch.float32)) * height_correction.to(
+        dtype=torch.float32
+    )
+
+    xy_preds_i_0 = x + offset_x_i.to(dtype=torch.float32)
+    xy_preds_i_1 = y + offset_y_i.to(dtype=torch.float32)
+    xy_preds_i_2 = torch.ones(xy_preds_i_1.shape, dtype=torch.float32)
+    xy_preds_i = torch.stack(
+        [
+            xy_preds_i_0.to(dtype=torch.float32),
+            xy_preds_i_1.to(dtype=torch.float32),
+            xy_preds_i_2.to(dtype=torch.float32),
+        ],
+        0,
+    )
+
+    # TODO: simplify when indexing without rank will be supported by ONNX
+    # The two index_select calls yield a [K, K, K] tensor (K = num_keypoints); its flattened
+    # element k * (K*K + K + 1) is entry [k, k, k], i.e. roi_map[k, y_int[k], x_int[k]].
+    base = num_keypoints * num_keypoints + num_keypoints + 1
+    ind = torch.arange(num_keypoints)
+    ind = ind.to(dtype=torch.int64) * base
+    end_scores_i = (
+        roi_map.index_select(1, y_int.to(dtype=torch.int64))
+        .index_select(2, x_int.to(dtype=torch.int64))
+        .view(-1)
+        .index_select(0, ind.to(dtype=torch.int64))
+    )
+
+    return xy_preds_i, end_scores_i
+
+
+@torch.jit._script_if_tracing
+def _onnx_heatmaps_to_keypoints_loop(
+    maps, rois, widths_ceil, heights_ceil, widths, heights, offset_x, offset_y, num_keypoints
+):
+    """
+    Runs ``_onnx_heatmaps_to_keypoints`` for every RoI and concatenates the results.
+
+    Returns:
+        xy_preds (Tensor): float32, shape (#rois, 3, num_keypoints)
+        end_scores (Tensor): float32, shape (#rois, num_keypoints)
+    """
+    # start from empty tensors and grow them by one RoI per iteration
+    xy_preds = torch.zeros((0, 3, int(num_keypoints)), dtype=torch.float32, device=maps.device)
+    end_scores = torch.zeros((0, int(num_keypoints)), dtype=torch.float32, device=maps.device)
+
+    for i in range(int(rois.size(0))):
+        xy_preds_i, end_scores_i = _onnx_heatmaps_to_keypoints(
+            maps, maps[i], widths_ceil[i], heights_ceil[i], widths[i], heights[i], offset_x[i], offset_y[i]
+        )
+        xy_preds = torch.cat((xy_preds.to(dtype=torch.float32), xy_preds_i.unsqueeze(0).to(dtype=torch.float32)), 0)
+        end_scores = torch.cat(
+            (end_scores.to(dtype=torch.float32), end_scores_i.to(dtype=torch.float32).unsqueeze(0)), 0
+        )
+    return xy_preds, end_scores
+
+
+def heatmaps_to_keypoints(maps, rois):
+    """Extract predicted keypoint locations from heatmaps.
+
+    Args:
+        maps (Tensor): heatmaps indexed as ``maps[roi][keypoint]``; ``maps.shape[1]``
+            is the number of keypoints
+        rois (Tensor): boxes read as columns (x1, y1, x2, y2), one row per entry of ``maps``
+
+    Returns:
+        xy_preds (Tensor): shape (#rois, #keypoints, 3) holding (x, y, 1) for each keypoint
+        end_scores (Tensor): shape (#rois, #keypoints), the value of the resized heatmap
+            at each chosen location
+    """
+    # This function converts a discrete image coordinate in a HEATMAP_SIZE x
+    # HEATMAP_SIZE image to a continuous keypoint coordinate. We maintain
+    # consistency with keypoints_to_heatmap_labels by using the conversion from
+    # Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a
+    # continuous coordinate.
+    offset_x = rois[:, 0]
+    offset_y = rois[:, 1]
+
+    widths = rois[:, 2] - rois[:, 0]
+    heights = rois[:, 3] - rois[:, 1]
+    # clamp to at least 1 so the resize target below is never empty
+    widths = widths.clamp(min=1)
+    heights = heights.clamp(min=1)
+    widths_ceil = widths.ceil()
+    heights_ceil = heights.ceil()
+
+    num_keypoints = maps.shape[1]
+
+    # when tracing, delegate to the tensor-only loop variant
+    if torchvision._is_tracing():
+        xy_preds, end_scores = _onnx_heatmaps_to_keypoints_loop(
+            maps,
+            rois,
+            widths_ceil,
+            heights_ceil,
+            widths,
+            heights,
+            offset_x,
+            offset_y,
+            torch.scalar_tensor(num_keypoints, dtype=torch.int64),
+        )
+        return xy_preds.permute(0, 2, 1), end_scores
+
+    xy_preds = torch.zeros((len(rois), 3, num_keypoints), dtype=torch.float32, device=maps.device)
+    end_scores = torch.zeros((len(rois), num_keypoints), dtype=torch.float32, device=maps.device)
+    for i in range(len(rois)):
+        roi_map_width = int(widths_ceil[i].item())
+        roi_map_height = int(heights_ceil[i].item())
+        # ratio between the real roi extent and the integer-sized resized map
+        width_correction = widths[i] / roi_map_width
+        height_correction = heights[i] / roi_map_height
+        # resize this roi's heatmaps to the (ceiled) roi size, one channel per keypoint
+        roi_map = F.interpolate(
+            maps[i][:, None], size=(roi_map_height, roi_map_width), mode="bicubic", align_corners=False
+        )[:, 0]
+        # roi_map_probs = scores_to_probs(roi_map.copy())
+        w = roi_map.shape[2]
+        pos = roi_map.reshape(num_keypoints, -1).argmax(dim=1)
+
+        # decompose the flat argmax position into column (x) and row (y)
+        x_int = pos % w
+        y_int = torch.div(pos - x_int, w, rounding_mode="floor")
+        # assert (roi_map_probs[k, y_int, x_int] ==
+        #         roi_map_probs[k, :, :].max())
+        x = (x_int.float() + 0.5) * width_correction
+        y = (y_int.float() + 0.5) * height_correction
+        xy_preds[i, 0, :] = x + offset_x[i]
+        xy_preds[i, 1, :] = y + offset_y[i]
+        xy_preds[i, 2, :] = 1
+        end_scores[i, :] = roi_map[torch.arange(num_keypoints, device=roi_map.device), y_int, x_int]
+
+    # move the (x, y, 1) axis last: (#rois, 3, K) -> (#rois, K, 3)
+    return xy_preds.permute(0, 2, 1), end_scores
+
+
+def keypointrcnn_loss(keypoint_logits, proposals, gt_keypoints, keypoint_matched_idxs):
+    # type: (Tensor, list[Tensor], list[Tensor], list[Tensor]) -> Tensor
+    """
+    Computes the keypoint loss as a cross-entropy over the flattened heatmap
+    positions, restricted to the keypoints marked valid by ``keypoints_to_heatmap``.
+
+    Args:
+        keypoint_logits (Tensor): logits of shape [N, K, H, W] with H == W
+        proposals (list[Tensor]): per-image boxes used as RoIs for the heatmap targets
+        gt_keypoints (list[Tensor]): per-image ground-truth keypoints
+        keypoint_matched_idxs (list[Tensor]): per-image index of the ground-truth
+            entry matched to each proposal
+
+    Returns:
+        keypoint_loss (Tensor): the loss, or a zero computed from ``keypoint_logits``
+            when there is nothing valid to supervise
+
+    Raises:
+        ValueError: if the last two dims of ``keypoint_logits`` differ
+    """
+    N, K, H, W = keypoint_logits.shape
+    if H != W:
+        raise ValueError(
+            f"keypoint_logits height and width (last two elements of shape) should be equal. Instead got H = {H} and W = {W}"
+        )
+    discretization_size = H
+    heatmaps = []
+    valid = []
+    for proposals_per_image, gt_kp_in_image, midx in zip(proposals, gt_keypoints, keypoint_matched_idxs):
+        # ground-truth keypoints of the entry matched to each proposal
+        kp = gt_kp_in_image[midx]
+        heatmaps_per_image, valid_per_image = keypoints_to_heatmap(kp, proposals_per_image, discretization_size)
+        heatmaps.append(heatmaps_per_image.view(-1))
+        valid.append(valid_per_image.view(-1))
+
+    keypoint_targets = torch.cat(heatmaps, dim=0)
+    valid = torch.cat(valid, dim=0).to(dtype=torch.uint8)
+    # from here on, valid holds the indices of the valid keypoints
+    valid = torch.where(valid)[0]
+
+    # the loss below would be fed empty tensors when there are no targets
+    # or no valid keypoints, so handle it separately
+    if keypoint_targets.numel() == 0 or len(valid) == 0:
+        return keypoint_logits.sum() * 0
+
+    # one row per (proposal, keypoint) pair, one class per heatmap position
+    keypoint_logits = keypoint_logits.view(N * K, H * W)
+
+    keypoint_loss = F.cross_entropy(keypoint_logits[valid], keypoint_targets[valid])
+    return keypoint_loss
+
+
+def keypointrcnn_inference(x, boxes):
+    # type: (Tensor, list[Tensor]) -> tuple[list[Tensor], list[Tensor]]
+    """
+    Splits the keypoint heatmaps ``x`` per image (by the number of boxes in each
+    image) and converts each chunk to keypoints with ``heatmaps_to_keypoints``.
+
+    Returns:
+        kp_probs (list[Tensor]): per-image first output of ``heatmaps_to_keypoints``
+        kp_scores (list[Tensor]): per-image second output of ``heatmaps_to_keypoints``
+    """
+    kp_probs = []
+    kp_scores = []
+
+    boxes_per_image = [box.size(0) for box in boxes]
+    x2 = x.split(boxes_per_image, dim=0)
+
+    for xx, bb in zip(x2, boxes):
+        kp_prob, scores = heatmaps_to_keypoints(xx, bb)
+        kp_probs.append(kp_prob)
+        kp_scores.append(scores)
+
+    return kp_probs, kp_scores
+
+
+def _onnx_expand_boxes(boxes, scale):
+    # type: (Tensor, float) -> Tensor
+    """
+    Variant of ``expand_boxes`` used when tracing: scales every box about its
+    centre by ``scale`` and builds the result with ``torch.stack`` rather than
+    writing into a preallocated tensor.
+    """
+    # half extents and centres of the (x1, y1, x2, y2) boxes
+    w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5
+    h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5
+    x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
+    y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5
+
+    w_half = w_half.to(dtype=torch.float32) * scale
+    h_half = h_half.to(dtype=torch.float32) * scale
+
+    boxes_exp0 = x_c - w_half
+    boxes_exp1 = y_c - h_half
+    boxes_exp2 = x_c + w_half
+    boxes_exp3 = y_c + h_half
+    boxes_exp = torch.stack((boxes_exp0, boxes_exp1, boxes_exp2, boxes_exp3), 1)
+    return boxes_exp
+
+
+# the next two functions should be merged inside Masker
+# but are kept here for the moment while we need them
+# temporarily for paste_mask_in_image
+def expand_boxes(boxes, scale):
+    # type: (Tensor, float) -> Tensor
+    """
+    Scales every (x1, y1, x2, y2) box about its centre by ``scale`` and returns
+    the result as a new tensor shaped like ``boxes``. When tracing, the work is
+    delegated to ``_onnx_expand_boxes``.
+    """
+    if torchvision._is_tracing():
+        return _onnx_expand_boxes(boxes, scale)
+    # half extents and centres of the boxes
+    w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5
+    h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5
+    x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
+    y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5
+
+    # in-place on the freshly computed temporaries; ``boxes`` itself is not modified
+    w_half *= scale
+    h_half *= scale
+
+    boxes_exp = torch.zeros_like(boxes)
+    boxes_exp[:, 0] = x_c - w_half
+    boxes_exp[:, 2] = x_c + w_half
+    boxes_exp[:, 1] = y_c - h_half
+    boxes_exp[:, 3] = y_c + h_half
+    return boxes_exp
+
+
+@torch.jit.unused
+def expand_masks_tracing_scale(M, padding):
+    # type: (int, int) -> float
+    # Computes (M + 2 * padding) / M with float32 tensors; used by expand_masks while tracing.
+    # NOTE(review): the value returned is a tensor although the type comment declares float.
+    return torch.tensor(M + 2 * padding).to(torch.float32) / torch.tensor(M).to(torch.float32)
+
+
+def expand_masks(mask, padding):
+    # type: (Tensor, int) -> tuple[Tensor, float]
+    """
+    Pads the last two dims of ``mask`` by ``padding`` on every side and returns the
+    padded mask together with the ratio of padded size to original size,
+    ``(M + 2 * padding) / M``, where ``M`` is the size of the last dim.
+    """
+    M = mask.shape[-1]
+    if torch._C._get_tracing_state():  # could not import is_tracing(), not sure why
+        scale = expand_masks_tracing_scale(M, padding)
+    else:
+        scale = float(M + 2 * padding) / M
+    # (padding,) * 4 -> the same pad on left, right, top and bottom
+    padded_mask = F.pad(mask, (padding,) * 4)
+    return padded_mask, scale
+
+
+def paste_mask_in_image(mask, box, im_h, im_w):
+    # type: (Tensor, Tensor, int, int) -> Tensor
+    """
+    Resizes ``mask`` to the size of ``box`` and pastes it into an otherwise
+    zero image of shape ``(im_h, im_w)``; the part of the box that falls outside
+    the image is cut off.
+
+    Args:
+        mask (Tensor): 2-D mask to paste
+        box (Tensor): (x1, y1, x2, y2); used for slicing, so presumably integer-valued
+            (``paste_masks_in_image`` casts to int64 before calling)
+        im_h (int): image height
+        im_w (int): image width
+
+    Returns:
+        im_mask (Tensor): shape ``(im_h, im_w)``, same dtype and device as the resized mask
+    """
+    # the box is treated as inclusive of its end coordinate, hence the + 1
+    TO_REMOVE = 1
+    w = int(box[2] - box[0] + TO_REMOVE)
+    h = int(box[3] - box[1] + TO_REMOVE)
+    # never resize to an empty target
+    w = max(w, 1)
+    h = max(h, 1)
+
+    # Set shape to [batchxCxHxW]
+    mask = mask.expand((1, 1, -1, -1))
+
+    # Resize mask
+    mask = F.interpolate(mask, size=(h, w), mode="bilinear", align_corners=False)
+    mask = mask[0][0]
+
+    im_mask = torch.zeros((im_h, im_w), dtype=mask.dtype, device=mask.device)
+    # clip the paste region to the image bounds
+    x_0 = max(box[0], 0)
+    x_1 = min(box[2] + 1, im_w)
+    y_0 = max(box[1], 0)
+    y_1 = min(box[3] + 1, im_h)
+
+    # copy the matching sub-region of the resized mask (indices shifted by the box origin)
+    im_mask[y_0:y_1, x_0:x_1] = mask[(y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0])]
+    return im_mask
+
+
+def _onnx_paste_mask_in_image(mask, box, im_h, im_w):
+    """
+    Tensor-only counterpart of ``paste_mask_in_image``: resizes ``mask`` to the
+    size of ``box``, crops it to the image bounds and zero-pads it by concatenation
+    up to ``(im_h, im_w)``. ``im_h`` and ``im_w`` are expected to be tensors
+    (``unsqueeze`` is called on them).
+    """
+    one = torch.ones(1, dtype=torch.int64)
+    zero = torch.zeros(1, dtype=torch.int64)
+
+    w = box[2] - box[0] + one
+    h = box[3] - box[1] + one
+    # max(w, 1) / max(h, 1) expressed with tensor ops
+    w = torch.max(torch.cat((w, one)))
+    h = torch.max(torch.cat((h, one)))
+
+    # Set shape to [batchxCxHxW]
+    mask = mask.expand((1, 1, mask.size(0), mask.size(1)))
+
+    # Resize mask
+    mask = F.interpolate(mask, size=(int(h), int(w)), mode="bilinear", align_corners=False)
+    mask = mask[0][0]
+
+    # paste region clipped to the image bounds (max/min expressed with tensor ops)
+    x_0 = torch.max(torch.cat((box[0].unsqueeze(0), zero)))
+    x_1 = torch.min(torch.cat((box[2].unsqueeze(0) + one, im_w.unsqueeze(0))))
+    y_0 = torch.max(torch.cat((box[1].unsqueeze(0), zero)))
+    y_1 = torch.min(torch.cat((box[3].unsqueeze(0) + one, im_h.unsqueeze(0))))
+
+    unpaded_im_mask = mask[(y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0])]
+
+    # TODO : replace below with a dynamic padding when support is added in ONNX
+
+    # pad y
+    zeros_y0 = torch.zeros(y_0, unpaded_im_mask.size(1))
+    zeros_y1 = torch.zeros(im_h - y_1, unpaded_im_mask.size(1))
+    concat_0 = torch.cat((zeros_y0, unpaded_im_mask.to(dtype=torch.float32), zeros_y1), 0)[0:im_h, :]
+    # pad x
+    zeros_x0 = torch.zeros(concat_0.size(0), x_0)
+    zeros_x1 = torch.zeros(concat_0.size(0), im_w - x_1)
+    im_mask = torch.cat((zeros_x0, concat_0, zeros_x1), 1)[:, :im_w]
+    return im_mask
+
+
+@torch.jit._script_if_tracing
+def _onnx_paste_masks_in_image_loop(masks, boxes, im_h, im_w):
+    # Pastes every mask with _onnx_paste_mask_in_image and concatenates the results
+    # along dim 0, starting from an empty (0, im_h, im_w) tensor.
+    res_append = torch.zeros(0, im_h, im_w)
+    for i in range(masks.size(0)):
+        # masks[i][0]: first channel of the i-th mask
+        mask_res = _onnx_paste_mask_in_image(masks[i][0], boxes[i], im_h, im_w)
+        mask_res = mask_res.unsqueeze(0)
+        res_append = torch.cat((res_append, mask_res))
+    return res_append
+
+
+def paste_masks_in_image(masks, boxes, img_shape, padding=1):
+    # type: (Tensor, Tensor, tuple[int, int], int) -> Tensor
+    """
+    Pastes each mask into a full-size image at the location of its box.
+
+    The masks are first padded by ``padding`` and the boxes are enlarged by the
+    matching scale (then cast to int64) before pasting.
+
+    Args:
+        masks (Tensor): masks whose first channel (``m[0]``) is pasted
+        boxes (Tensor): one (x1, y1, x2, y2) box per mask
+        img_shape (tuple[int, int]): ``(im_h, im_w)`` of the output images
+        padding (int): number of pixels added on every side of each mask
+
+    Returns:
+        Tensor of shape ``(#masks, 1, im_h, im_w)``
+    """
+    masks, scale = expand_masks(masks, padding=padding)
+    boxes = expand_boxes(boxes, scale).to(dtype=torch.int64)
+    im_h, im_w = img_shape
+
+    # when tracing, use the tensor-only loop variant
+    if torchvision._is_tracing():
+        return _onnx_paste_masks_in_image_loop(
+            masks, boxes, torch.scalar_tensor(im_h, dtype=torch.int64), torch.scalar_tensor(im_w, dtype=torch.int64)
+        )[:, None]
+    res = [paste_mask_in_image(m[0], b, im_h, im_w) for m, b in zip(masks, boxes)]
+    if len(res) > 0:
+        ret = torch.stack(res, dim=0)[:, None]
+    else:
+        # torch.stack cannot take an empty list; return an empty result of the right shape
+        ret = masks.new_empty((0, 1, im_h, im_w))
+    return ret
+
+
+class RoIHeads(nn.Module):
+    __annotations__ = {
+        "box_coder": det_utils.BoxCoder,
+        "proposal_matcher": det_utils.Matcher,
+        "fg_bg_sampler": det_utils.BalancedPositiveNegativeSampler,
+    }
+
+    def __init__(
+        self,
+        box_roi_pool,
+        box_head,
+        box_predictor,
+        # Faster R-CNN training
+        fg_iou_thresh,
+        bg_iou_thresh,
+        batch_size_per_image,
+        positive_fraction,
+        bbox_reg_weights,
+        # Faster R-CNN inference
+        score_thresh,
+        nms_thresh,
+        detections_per_img,
+        # Mask
+        mask_roi_pool=None,
+        mask_head=None,
+        mask_predictor=None,
+        keypoint_roi_pool=None,
+        keypoint_head=None,
+        keypoint_predictor=None,
+    ):
+        """
+        Stores the box / mask / keypoint sub-modules and builds the helpers used for
+        training: a proposal matcher from the two IoU thresholds, a positive/negative
+        sampler from ``batch_size_per_image`` and ``positive_fraction``, and a box
+        coder from ``bbox_reg_weights`` (``(10.0, 10.0, 5.0, 5.0)`` when ``None``).
+        The mask and keypoint components are optional and default to ``None``.
+        """
+        super().__init__()
+
+        self.box_similarity = box_ops.box_iou
+        # assign ground-truth boxes for each proposal
+        self.proposal_matcher = det_utils.Matcher(fg_iou_thresh, bg_iou_thresh, allow_low_quality_matches=False)
+
+        self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(batch_size_per_image, positive_fraction)
+
+        # default box-coder weights when the caller does not provide any
+        if bbox_reg_weights is None:
+            bbox_reg_weights = (10.0, 10.0, 5.0, 5.0)
+        self.box_coder = det_utils.BoxCoder(bbox_reg_weights)
+
+        self.box_roi_pool = box_roi_pool
+        self.box_head = box_head
+        self.box_predictor = box_predictor
+
+        # inference-time settings, read in postprocess_detections
+        self.score_thresh = score_thresh
+        self.nms_thresh = nms_thresh
+        self.detections_per_img = detections_per_img
+
+        self.mask_roi_pool = mask_roi_pool
+        self.mask_head = mask_head
+        self.mask_predictor = mask_predictor
+
+        self.keypoint_roi_pool = keypoint_roi_pool
+        self.keypoint_head = keypoint_head
+        self.keypoint_predictor = keypoint_predictor
+
+    def has_mask(self):
+        """Returns True only when mask_roi_pool, mask_head and mask_predictor are all set (not None)."""
+        if self.mask_roi_pool is None:
+            return False
+        if self.mask_head is None:
+            return False
+        if self.mask_predictor is None:
+            return False
+        return True
+
+    def has_keypoint(self):
+        """Returns True only when keypoint_roi_pool, keypoint_head and keypoint_predictor are all set (not None)."""
+        if self.keypoint_roi_pool is None:
+            return False
+        if self.keypoint_head is None:
+            return False
+        if self.keypoint_predictor is None:
+            return False
+        return True
+
+    def assign_targets_to_proposals(self, proposals, gt_boxes, gt_labels):
+        # type: (list[Tensor], list[Tensor], list[Tensor]) -> tuple[list[Tensor], list[Tensor]]
+        """
+        Matches every proposal to a ground-truth box, image by image.
+
+        Returns:
+            matched_idxs (list[Tensor]): per image, the index of the matched ground-truth
+                box for each proposal, clamped to be >= 0 (all zeros for images without
+                ground-truth boxes)
+            labels (list[Tensor]): per image, the int64 label of each proposal: the matched
+                ground-truth label, 0 for background and -1 for proposals to be ignored
+        """
+        matched_idxs = []
+        labels = []
+        for proposals_in_image, gt_boxes_in_image, gt_labels_in_image in zip(proposals, gt_boxes, gt_labels):
+
+            if gt_boxes_in_image.numel() == 0:
+                # Background image
+                device = proposals_in_image.device
+                clamped_matched_idxs_in_image = torch.zeros(
+                    (proposals_in_image.shape[0],), dtype=torch.int64, device=device
+                )
+                labels_in_image = torch.zeros((proposals_in_image.shape[0],), dtype=torch.int64, device=device)
+            else:
+                #  set to self.box_similarity when https://github.com/pytorch/pytorch/issues/27495 lands
+                match_quality_matrix = box_ops.box_iou(gt_boxes_in_image, proposals_in_image)
+                matched_idxs_in_image = self.proposal_matcher(match_quality_matrix)
+
+                # the matcher marks unmatched proposals with negative codes; clamp them so they
+                # can be used as indices (their labels are overwritten below)
+                clamped_matched_idxs_in_image = matched_idxs_in_image.clamp(min=0)
+
+                labels_in_image = gt_labels_in_image[clamped_matched_idxs_in_image]
+                labels_in_image = labels_in_image.to(dtype=torch.int64)
+
+                # Label background (below the low threshold)
+                bg_inds = matched_idxs_in_image == self.proposal_matcher.BELOW_LOW_THRESHOLD
+                labels_in_image[bg_inds] = 0
+
+                # Label ignore proposals (between low and high thresholds)
+                ignore_inds = matched_idxs_in_image == self.proposal_matcher.BETWEEN_THRESHOLDS
+                labels_in_image[ignore_inds] = -1  # -1 is ignored by sampler
+
+            matched_idxs.append(clamped_matched_idxs_in_image)
+            labels.append(labels_in_image)
+        return matched_idxs, labels
+
+    def subsample(self, labels):
+        # type: (list[Tensor]) -> list[Tensor]
+        """
+        Runs the positive/negative sampler on the per-image labels and returns, for
+        every image, the indices of the proposals selected as positive or negative.
+        """
+        sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
+        sampled_inds = []
+        # img_idx is not used inside the loop
+        for img_idx, (pos_inds_img, neg_inds_img) in enumerate(zip(sampled_pos_inds, sampled_neg_inds)):
+            # union of the two masks, converted to indices
+            img_sampled_inds = torch.where(pos_inds_img | neg_inds_img)[0]
+            sampled_inds.append(img_sampled_inds)
+        return sampled_inds
+
+    def add_gt_proposals(self, proposals, gt_boxes):
+        # type: (list[Tensor], list[Tensor]) -> list[Tensor]
+        """Returns a new list in which each image's ground-truth boxes are appended after its proposals."""
+        proposals = [torch.cat((proposal, gt_box)) for proposal, gt_box in zip(proposals, gt_boxes)]
+
+        return proposals
+
+    def check_targets(self, targets):
+        # type: (Optional[list[dict[str, Tensor]]]) -> None
+        """
+        Validates the training targets.
+
+        Raises:
+            ValueError: if ``targets`` is None, if any target lacks a "boxes" or
+                "labels" key, or - when the mask components are configured - if any
+                target lacks a "masks" key.
+
+        No check for a keypoints key is performed here.
+        """
+        if targets is None:
+            raise ValueError("targets should not be None")
+        # NOTE(review): all(...) is given a list comprehension rather than a generator,
+        # presumably for TorchScript compatibility - confirm before changing.
+        if not all(["boxes" in t for t in targets]):
+            raise ValueError("Every element of targets should have a boxes key")
+        if not all(["labels" in t for t in targets]):
+            raise ValueError("Every element of targets should have a labels key")
+        if self.has_mask():
+            if not all(["masks" in t for t in targets]):
+                raise ValueError("Every element of targets should have a masks key")
+
+    def select_training_samples(
+        self,
+        proposals,  # type: list[Tensor]
+        targets,  # type: Optional[list[dict[str, Tensor]]]
+    ):
+        # type: (...) -> tuple[list[Tensor], list[Tensor], list[Tensor], list[Tensor]]
+        """
+        Prepares the proposals used for training: appends the ground-truth boxes to
+        the proposals, matches them to ground truth, subsamples positives and
+        negatives, and encodes the regression targets.
+
+        Returns:
+            proposals (list[Tensor]): per-image sampled proposals
+            matched_idxs (list[Tensor]): per-image matched ground-truth index of each sampled proposal
+            labels (list[Tensor]): per-image label of each sampled proposal
+            regression_targets: output of ``self.box_coder.encode`` for the matched
+                ground-truth boxes and the sampled proposals
+
+        Raises:
+            ValueError: if ``targets`` is None or fails ``check_targets``
+        """
+        self.check_targets(targets)
+        # check_targets already raises on None; NOTE(review): this repeated check is
+        # presumably kept so TorchScript can refine the Optional type - confirm.
+        if targets is None:
+            raise ValueError("targets should not be None")
+        dtype = proposals[0].dtype
+        device = proposals[0].device
+
+        gt_boxes = [t["boxes"].to(dtype) for t in targets]
+        gt_labels = [t["labels"] for t in targets]
+
+        # append ground-truth bboxes to proposals
+        proposals = self.add_gt_proposals(proposals, gt_boxes)
+
+        # get matching gt indices for each proposal
+        matched_idxs, labels = self.assign_targets_to_proposals(proposals, gt_boxes, gt_labels)
+        # sample a fixed proportion of positive-negative proposals
+        sampled_inds = self.subsample(labels)
+        matched_gt_boxes = []
+        num_images = len(proposals)
+        for img_id in range(num_images):
+            # keep only the sampled proposals and their labels / matches
+            img_sampled_inds = sampled_inds[img_id]
+            proposals[img_id] = proposals[img_id][img_sampled_inds]
+            labels[img_id] = labels[img_id][img_sampled_inds]
+            matched_idxs[img_id] = matched_idxs[img_id][img_sampled_inds]
+
+            gt_boxes_in_image = gt_boxes[img_id]
+            if gt_boxes_in_image.numel() == 0:
+                # image without ground truth: use one all-zero box so the indexing below
+                # (all matched indices are 0 in that case) still works
+                gt_boxes_in_image = torch.zeros((1, 4), dtype=dtype, device=device)
+            matched_gt_boxes.append(gt_boxes_in_image[matched_idxs[img_id]])
+
+        regression_targets = self.box_coder.encode(matched_gt_boxes, proposals)
+        return proposals, matched_idxs, labels, regression_targets
+
+    def postprocess_detections(
+        self,
+        class_logits,  # type: Tensor
+        box_regression,  # type: Tensor
+        proposals,  # type: list[Tensor]
+        image_shapes,  # type: list[tuple[int, int]]
+    ):
+        # type: (...) -> tuple[list[Tensor], list[Tensor], list[Tensor]]
+        """
+        Convert the raw box-head outputs into final per-image detections.
+
+        The regression output is decoded against the proposals and the class logits
+        are turned into softmax scores. Then, image by image, boxes are clipped to
+        the image, the entries for class index 0 (treated as background) are dropped,
+        detections whose score is not above ``self.score_thresh`` are removed, boxes
+        rejected by ``box_ops.remove_small_boxes(min_size=1e-2)`` are removed,
+        ``box_ops.batched_nms`` is applied with the class label as the group id, and
+        at most ``self.detections_per_img`` detections are kept.
+
+        Args:
+            class_logits (Tensor): classification logits; the last dimension is the
+                number of classes and dim 0 is split by the per-image proposal counts.
+            box_regression (Tensor): regression output handed to ``self.box_coder.decode``.
+            proposals (list[Tensor]): per-image proposal boxes; ``shape[0]`` of each
+                entry is used as the number of boxes of that image.
+            image_shapes (list[tuple[int, int]]): per-image sizes handed to
+                ``box_ops.clip_boxes_to_image``.
+
+        Returns:
+            tuple[list[Tensor], list[Tensor], list[Tensor]]: boxes, scores and labels,
+            each list holding one tensor per image.
+        """
+        device = class_logits.device
+        num_classes = class_logits.shape[-1]
+
+        # number of proposals per image, used to split the batched predictions back per image
+        boxes_per_image = [boxes_in_image.shape[0] for boxes_in_image in proposals]
+        pred_boxes = self.box_coder.decode(box_regression, proposals)
+
+        pred_scores = F.softmax(class_logits, -1)
+
+        pred_boxes_list = pred_boxes.split(boxes_per_image, 0)
+        pred_scores_list = pred_scores.split(boxes_per_image, 0)
+
+        all_boxes = []
+        all_scores = []
+        all_labels = []
+        for boxes, scores, image_shape in zip(pred_boxes_list, pred_scores_list, image_shapes):
+            boxes = box_ops.clip_boxes_to_image(boxes, image_shape)
+
+            # create labels for each prediction
+            labels = torch.arange(num_classes, device=device)
+            labels = labels.view(1, -1).expand_as(scores)
+
+            # remove predictions with the background label
+            # (index 0 along dim 1; presumably boxes is (num_boxes, num_classes, 4) here - TODO confirm)
+            boxes = boxes[:, 1:]
+            scores = scores[:, 1:]
+            labels = labels[:, 1:]
+
+            # batch everything, by making every class prediction be a separate instance
+            boxes = boxes.reshape(-1, 4)
+            scores = scores.reshape(-1)
+            labels = labels.reshape(-1)
+
+            # remove low scoring boxes
+            inds = torch.where(scores > self.score_thresh)[0]
+            boxes, scores, labels = boxes[inds], scores[inds], labels[inds]
+
+            # remove empty boxes
+            keep = box_ops.remove_small_boxes(boxes, min_size=1e-2)
+            boxes, scores, labels = boxes[keep], scores[keep], labels[keep]
+
+            # non-maximum suppression, independently done per class
+            keep = box_ops.batched_nms(boxes, scores, labels, self.nms_thresh)
+            # keep only topk scoring predictions
+            # (the slice relies on batched_nms returning indices ordered by score)
+            keep = keep[: self.detections_per_img]
+            boxes, scores, labels = boxes[keep], scores[keep], labels[keep]
+
+            all_boxes.append(boxes)
+            all_scores.append(scores)
+            all_labels.append(labels)
+
+        return all_boxes, all_scores, all_labels
+
+    def forward(
+        self,
+        features: dict[str, torch.Tensor],
+        proposals: list[torch.Tensor],
+        image_shapes: list[tuple[int, int]],
+        targets: Optional[list[dict[str, torch.Tensor]]] = None,
+    ) -> tuple[list[dict[str, torch.Tensor]], dict[str, torch.Tensor]]:
+        """
+        Run the box head and, when they are configured, the mask and keypoint heads.
+
+        Args:
+            features (Dict[str, Tensor]): feature maps handed to the RoI pooling modules
+            proposals (List[Tensor[N, 4]]): per-image proposal boxes
+            image_shapes (List[Tuple[H, W]]): per-image sizes
+            targets (List[Dict], optional): per-image ground truth. When given, "boxes"
+                must be a floating point tensor, "labels" must be int64 and, if the
+                keypoint head is present, "keypoints" must be float32. Required when
+                the module is in training mode.
+
+        Returns:
+            result (List[Dict[str, Tensor]]): empty in training mode. In evaluation
+                mode, one dict per image with "boxes", "labels" and "scores", plus
+                "masks" when the mask head is present and "keypoints" /
+                "keypoints_scores" when the keypoint head is present.
+            losses (Dict[str, Tensor]): empty in evaluation mode. In training mode it
+                holds "loss_classifier" and "loss_box_reg", plus "loss_mask" and
+                "loss_keypoint" for the optional heads.
+
+        Raises:
+            TypeError: if a target tensor has an unexpected dtype.
+            ValueError: if a value required in training mode is None.
+        """
+        # validate the dtypes of the ground-truth tensors before doing any work
+        if targets is not None:
+            for t in targets:
+                # TODO: https://github.com/pytorch/pytorch/issues/26731
+                floating_point_types = (torch.float, torch.double, torch.half)
+                if t["boxes"].dtype not in floating_point_types:
+                    raise TypeError(f"target boxes must of float type, instead got {t['boxes'].dtype}")
+                if not t["labels"].dtype == torch.int64:
+                    raise TypeError(f"target labels must of int64 type, instead got {t['labels'].dtype}")
+                if self.has_keypoint():
+                    if not t["keypoints"].dtype == torch.float32:
+                        raise TypeError(f"target keypoints must of float type, instead got {t['keypoints'].dtype}")
+
+        if self.training:
+            # replaces `proposals` with the sampled subset and returns their matched targets
+            proposals, matched_idxs, labels, regression_targets = self.select_training_samples(proposals, targets)
+        else:
+            labels = None
+            regression_targets = None
+            matched_idxs = None
+
+        # box branch: RoI pooling -> box head -> class logits and box regression
+        box_features = self.box_roi_pool(features, proposals, image_shapes)
+        box_features = self.box_head(box_features)
+        class_logits, box_regression = self.box_predictor(box_features)
+
+        result: list[dict[str, torch.Tensor]] = []
+        losses = {}
+        if self.training:
+            # explicit None checks narrow the Optional values set above
+            if labels is None:
+                raise ValueError("labels cannot be None")
+            if regression_targets is None:
+                raise ValueError("regression_targets cannot be None")
+            loss_classifier, loss_box_reg = fastrcnn_loss(class_logits, box_regression, labels, regression_targets)
+            losses = {"loss_classifier": loss_classifier, "loss_box_reg": loss_box_reg}
+        else:
+            boxes, scores, labels = self.postprocess_detections(class_logits, box_regression, proposals, image_shapes)
+            num_images = len(boxes)
+            for i in range(num_images):
+                result.append(
+                    {
+                        "boxes": boxes[i],
+                        "labels": labels[i],
+                        "scores": scores[i],
+                    }
+                )
+
+        if self.has_mask():
+            # in evaluation mode the mask head runs on the detected boxes;
+            # in training mode `result` is empty and the list is rebuilt below
+            mask_proposals = [p["boxes"] for p in result]
+            if self.training:
+                if matched_idxs is None:
+                    raise ValueError("if in training, matched_idxs should not be None")
+
+                # during training, only focus on positive boxes
+                num_images = len(proposals)
+                mask_proposals = []
+                pos_matched_idxs = []
+                for img_id in range(num_images):
+                    pos = torch.where(labels[img_id] > 0)[0]
+                    mask_proposals.append(proposals[img_id][pos])
+                    pos_matched_idxs.append(matched_idxs[img_id][pos])
+            else:
+                pos_matched_idxs = None
+
+            if self.mask_roi_pool is not None:
+                mask_features = self.mask_roi_pool(features, mask_proposals, image_shapes)
+                mask_features = self.mask_head(mask_features)
+                mask_logits = self.mask_predictor(mask_features)
+            else:
+                raise Exception("Expected mask_roi_pool to be not None")
+
+            loss_mask = {}
+            if self.training:
+                if targets is None or pos_matched_idxs is None or mask_logits is None:
+                    raise ValueError("targets, pos_matched_idxs, mask_logits cannot be None when training")
+
+                gt_masks = [t["masks"] for t in targets]
+                gt_labels = [t["labels"] for t in targets]
+                rcnn_loss_mask = maskrcnn_loss(mask_logits, mask_proposals, gt_masks, gt_labels, pos_matched_idxs)
+                loss_mask = {"loss_mask": rcnn_loss_mask}
+            else:
+                # attach the predicted masks to the per-image result dicts in place
+                labels = [r["labels"] for r in result]
+                masks_probs = maskrcnn_inference(mask_logits, labels)
+                for mask_prob, r in zip(masks_probs, result):
+                    r["masks"] = mask_prob
+
+            losses.update(loss_mask)
+
+        # keep none checks in if conditional so torchscript will conditionally
+        # compile each branch
+        if (
+            self.keypoint_roi_pool is not None
+            and self.keypoint_head is not None
+            and self.keypoint_predictor is not None
+        ):
+            # same scheme as the mask branch: detected boxes in evaluation mode,
+            # positive sampled proposals in training mode
+            keypoint_proposals = [p["boxes"] for p in result]
+            if self.training:
+                # during training, only focus on positive boxes
+                num_images = len(proposals)
+                keypoint_proposals = []
+                pos_matched_idxs = []
+                if matched_idxs is None:
+                    raise ValueError("if in trainning, matched_idxs should not be None")
+
+                for img_id in range(num_images):
+                    pos = torch.where(labels[img_id] > 0)[0]
+                    keypoint_proposals.append(proposals[img_id][pos])
+                    pos_matched_idxs.append(matched_idxs[img_id][pos])
+            else:
+                pos_matched_idxs = None
+
+            keypoint_features = self.keypoint_roi_pool(features, keypoint_proposals, image_shapes)
+            keypoint_features = self.keypoint_head(keypoint_features)
+            keypoint_logits = self.keypoint_predictor(keypoint_features)
+
+            loss_keypoint = {}
+            if self.training:
+                if targets is None or pos_matched_idxs is None:
+                    raise ValueError("both targets and pos_matched_idxs should not be None when in training mode")
+
+                gt_keypoints = [t["keypoints"] for t in targets]
+                rcnn_loss_keypoint = keypointrcnn_loss(
+                    keypoint_logits, keypoint_proposals, gt_keypoints, pos_matched_idxs
+                )
+                loss_keypoint = {"loss_keypoint": rcnn_loss_keypoint}
+            else:
+                if keypoint_logits is None or keypoint_proposals is None:
+                    raise ValueError(
+                        "both keypoint_logits and keypoint_proposals should not be None when not in training mode"
+                    )
+
+                # attach the predicted keypoints and their scores to the result dicts in place
+                keypoints_probs, kp_scores = keypointrcnn_inference(keypoint_logits, keypoint_proposals)
+                for keypoint_prob, kps, r in zip(keypoints_probs, kp_scores, result):
+                    r["keypoints"] = keypoint_prob
+                    r["keypoints_scores"] = kps
+            losses.update(loss_keypoint)
+
+        return result, losses
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/rpn.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/rpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef5718922cb2cb001a5e47f48731b733ffd808eb
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/rpn.py
@@ -0,0 +1,387 @@
+from typing import Optional
+
+import torch
+from torch import nn, Tensor
+from torch.nn import functional as F
+from torchvision.ops import boxes as box_ops, Conv2dNormActivation
+
+from . import _utils as det_utils
+
+# Import AnchorGenerator to keep compatibility.
+from .anchor_utils import AnchorGenerator  # noqa: 401
+from .image_list import ImageList
+
+
+class RPNHead(nn.Module):
+    """
+    Adds a simple RPN Head with classification and regression heads.
+
+    The head is a stack of ``conv_depth`` 3x3 ``Conv2dNormActivation`` blocks (without
+    a norm layer) followed by two 1x1 convolutions: one producing ``num_anchors``
+    channels (``cls_logits``) and one producing ``num_anchors * 4`` channels
+    (``bbox_pred``).
+
+    Args:
+        in_channels (int): number of channels of the input feature
+        num_anchors (int): number of anchors to be predicted
+        conv_depth (int, optional): number of convolutions
+    """
+
+    # state-dict format version, read back as local_metadata["version"] in _load_from_state_dict
+    _version = 2
+
+    def __init__(self, in_channels: int, num_anchors: int, conv_depth=1) -> None:
+        super().__init__()
+        convs = []
+        for _ in range(conv_depth):
+            convs.append(Conv2dNormActivation(in_channels, in_channels, kernel_size=3, norm_layer=None))
+        self.conv = nn.Sequential(*convs)
+        self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
+        self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1, stride=1)
+
+        # initialise every Conv2d in the head: N(0, 0.01) weights, zero bias
+        for layer in self.modules():
+            if isinstance(layer, nn.Conv2d):
+                torch.nn.init.normal_(layer.weight, std=0.01)  # type: ignore[arg-type]
+                if layer.bias is not None:
+                    torch.nn.init.constant_(layer.bias, 0)  # type: ignore[arg-type]
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        """
+        Load parameters, upgrading state dicts saved without a version or with version < 2.
+
+        For such state dicts the keys ``{prefix}conv.weight`` / ``{prefix}conv.bias`` are
+        renamed in place to ``{prefix}conv.0.0.weight`` / ``{prefix}conv.0.0.bias``
+        before delegating to the parent implementation. ``state_dict`` is mutated.
+        """
+        version = local_metadata.get("version", None)
+
+        if version is None or version < 2:
+            for type in ["weight", "bias"]:
+                old_key = f"{prefix}conv.{type}"
+                new_key = f"{prefix}conv.0.0.{type}"
+                if old_key in state_dict:
+                    state_dict[new_key] = state_dict.pop(old_key)
+
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            strict,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    def forward(self, x: list[Tensor]) -> tuple[list[Tensor], list[Tensor]]:
+        """
+        Apply the head to every feature map in ``x``.
+
+        Returns:
+            tuple[list[Tensor], list[Tensor]]: the ``cls_logits`` outputs and the
+            ``bbox_pred`` outputs, each with one entry per input feature map, in
+            input order.
+        """
+        logits = []
+        bbox_reg = []
+        for feature in x:
+            # the conv stack is shared by both prediction branches
+            t = self.conv(feature)
+            logits.append(self.cls_logits(t))
+            bbox_reg.append(self.bbox_pred(t))
+        return logits, bbox_reg
+
+
+def permute_and_flatten(layer: Tensor, N: int, A: int, C: int, H: int, W: int) -> Tensor:
+    """
+    Rearrange a per-level prediction map into a ``(N, -1, C)`` tensor.
+
+    ``layer`` is viewed as ``(N, -1, C, H, W)``, permuted to ``(N, H, W, -1, C)`` and
+    reshaped to ``(N, -1, C)``, so the last dimension holds the ``C`` values of one
+    prediction. ``A`` is accepted but not used by the body; the corresponding
+    dimension is inferred with ``-1``.
+    """
+    layer = layer.view(N, -1, C, H, W)
+    layer = layer.permute(0, 3, 4, 1, 2)
+    layer = layer.reshape(N, -1, C)
+    return layer
+
+
+def concat_box_prediction_layers(box_cls: list[Tensor], box_regression: list[Tensor]) -> tuple[Tensor, Tensor]:
+    """
+    Flatten and concatenate the per-level classification and regression outputs.
+
+    Args:
+        box_cls (list[Tensor]): one 4-D tensor per feature level, unpacked as
+            ``(N, AxC, H, W)``.
+        box_regression (list[Tensor]): one tensor per feature level whose dim 1 has
+            size ``A * 4``.
+
+    Returns:
+        tuple[Tensor, Tensor]: the classification outputs with all leading dimensions
+        flattened into one (``flatten(0, -2)``), and the regression outputs reshaped
+        to ``(-1, 4)``. Levels are concatenated along dim 1 before flattening.
+    """
+    box_cls_flattened = []
+    box_regression_flattened = []
+    # for each feature level, permute the outputs to make them be in the
+    # same format as the labels. Note that the labels are computed for
+    # all feature levels concatenated, so we keep the same representation
+    # for the objectness and the box_regression
+    for box_cls_per_level, box_regression_per_level in zip(box_cls, box_regression):
+        N, AxC, H, W = box_cls_per_level.shape
+        Ax4 = box_regression_per_level.shape[1]
+        # recover the anchor count A from the regression channels, then the class count C
+        A = Ax4 // 4
+        C = AxC // A
+        box_cls_per_level = permute_and_flatten(box_cls_per_level, N, A, C, H, W)
+        box_cls_flattened.append(box_cls_per_level)
+
+        box_regression_per_level = permute_and_flatten(box_regression_per_level, N, A, 4, H, W)
+        box_regression_flattened.append(box_regression_per_level)
+    # concatenate on the first dimension (representing the feature levels), to
+    # take into account the way the labels were generated (with all feature maps
+    # being concatenated as well)
+    box_cls = torch.cat(box_cls_flattened, dim=1).flatten(0, -2)
+    box_regression = torch.cat(box_regression_flattened, dim=1).reshape(-1, 4)
+    return box_cls, box_regression
+
+
+class RegionProposalNetwork(torch.nn.Module):
+    """
+    Implements Region Proposal Network (RPN).
+
+    Args:
+        anchor_generator (AnchorGenerator): module that generates the anchors for a set of feature
+            maps.
+        head (nn.Module): module that computes the objectness and regression deltas
+        fg_iou_thresh (float): minimum IoU between the anchor and the GT box so that they can be
+            considered as positive during training of the RPN.
+        bg_iou_thresh (float): maximum IoU between the anchor and the GT box so that they can be
+            considered as negative during training of the RPN.
+        batch_size_per_image (int): number of anchors that are sampled during training of the RPN
+            for computing the loss
+        positive_fraction (float): proportion of positive anchors in a mini-batch during training
+            of the RPN
+        pre_nms_top_n (Dict[str, int]): number of proposals to keep before applying NMS. It should
+            contain two fields: training and testing, to allow for different values depending
+            on training or evaluation
+        post_nms_top_n (Dict[str, int]): number of proposals to keep after applying NMS. It should
+            contain two fields: training and testing, to allow for different values depending
+            on training or evaluation
+        nms_thresh (float): NMS threshold used for postprocessing the RPN proposals
+        score_thresh (float): only return proposals with an objectness score greater than or
+            equal to score_thresh (the comparison in ``filter_proposals`` is ``>=``)
+
+    """
+
+    # explicit attribute types for the helper objects created in __init__
+    # (presumably consumed by TorchScript - TODO confirm)
+    __annotations__ = {
+        "box_coder": det_utils.BoxCoder,
+        "proposal_matcher": det_utils.Matcher,
+        "fg_bg_sampler": det_utils.BalancedPositiveNegativeSampler,
+    }
+
+    def __init__(
+        self,
+        anchor_generator: AnchorGenerator,
+        head: nn.Module,
+        # Faster-RCNN Training
+        fg_iou_thresh: float,
+        bg_iou_thresh: float,
+        batch_size_per_image: int,
+        positive_fraction: float,
+        # Faster-RCNN Inference
+        pre_nms_top_n: dict[str, int],
+        post_nms_top_n: dict[str, int],
+        nms_thresh: float,
+        score_thresh: float = 0.0,
+    ) -> None:
+        super().__init__()
+        self.anchor_generator = anchor_generator
+        self.head = head
+        self.box_coder = det_utils.BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
+
+        # used during training
+        self.box_similarity = box_ops.box_iou
+
+        self.proposal_matcher = det_utils.Matcher(
+            fg_iou_thresh,
+            bg_iou_thresh,
+            allow_low_quality_matches=True,
+        )
+
+        self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(batch_size_per_image, positive_fraction)
+        # used during testing
+        self._pre_nms_top_n = pre_nms_top_n
+        self._post_nms_top_n = post_nms_top_n
+        self.nms_thresh = nms_thresh
+        self.score_thresh = score_thresh
+        # minimum box size handed to remove_small_boxes in filter_proposals
+        self.min_size = 1e-3
+
+    def pre_nms_top_n(self) -> int:
+        """Return the pre-NMS proposal budget for the current mode ("training" or "testing" key)."""
+        if self.training:
+            return self._pre_nms_top_n["training"]
+        return self._pre_nms_top_n["testing"]
+
+    def post_nms_top_n(self) -> int:
+        """Return the post-NMS proposal budget for the current mode ("training" or "testing" key)."""
+        if self.training:
+            return self._post_nms_top_n["training"]
+        return self._post_nms_top_n["testing"]
+
+    def assign_targets_to_anchors(
+        self, anchors: list[Tensor], targets: list[dict[str, Tensor]]
+    ) -> tuple[list[Tensor], list[Tensor]]:
+        """
+        Label every anchor of every image and pick its matched ground-truth box.
+
+        Args:
+            anchors (list[Tensor]): per-image anchor boxes.
+            targets (list[dict[str, Tensor]]): per-image ground truth; only the
+                "boxes" entry is read.
+
+        Returns:
+            tuple[list[Tensor], list[Tensor]]: per-image float32 labels
+            (1.0 = matched to a ground-truth box, 0.0 = background, -1.0 = between
+            the matcher thresholds, to be ignored) and per-image matched ground-truth
+            boxes. For an image without ground-truth boxes both are all zeros.
+        """
+
+        labels = []
+        matched_gt_boxes = []
+        for anchors_per_image, targets_per_image in zip(anchors, targets):
+            gt_boxes = targets_per_image["boxes"]
+
+            if gt_boxes.numel() == 0:
+                # Background image (negative example)
+                device = anchors_per_image.device
+                matched_gt_boxes_per_image = torch.zeros(anchors_per_image.shape, dtype=torch.float32, device=device)
+                labels_per_image = torch.zeros((anchors_per_image.shape[0],), dtype=torch.float32, device=device)
+            else:
+                match_quality_matrix = self.box_similarity(gt_boxes, anchors_per_image)
+                matched_idxs = self.proposal_matcher(match_quality_matrix)
+                # get the targets corresponding GT for each proposal
+                # NB: need to clamp the indices because we can have a single
+                # GT in the image, and matched_idxs can be -2, which goes
+                # out of bounds
+                matched_gt_boxes_per_image = gt_boxes[matched_idxs.clamp(min=0)]
+
+                labels_per_image = matched_idxs >= 0
+                labels_per_image = labels_per_image.to(dtype=torch.float32)
+
+                # Background (negative examples)
+                bg_indices = matched_idxs == self.proposal_matcher.BELOW_LOW_THRESHOLD
+                labels_per_image[bg_indices] = 0.0
+
+                # discard indices that are between thresholds
+                inds_to_discard = matched_idxs == self.proposal_matcher.BETWEEN_THRESHOLDS
+                labels_per_image[inds_to_discard] = -1.0
+
+            labels.append(labels_per_image)
+            matched_gt_boxes.append(matched_gt_boxes_per_image)
+        return labels, matched_gt_boxes
+
+    def _get_top_n_idx(self, objectness: Tensor, num_anchors_per_level: list[int]) -> Tensor:
+        """
+        Select the highest-scoring anchors of each feature level.
+
+        ``objectness`` is split along dim 1 by ``num_anchors_per_level``; for every
+        level the top-k indices along dim 1 are taken and shifted by the number of
+        anchors in the preceding levels, so the returned indices address the
+        concatenated (all-level) anchor dimension.
+        """
+        r = []
+        offset = 0
+        for ob in objectness.split(num_anchors_per_level, 1):
+            num_anchors = ob.shape[1]
+            # presumably caps k at the number of anchors in this level - TODO confirm in det_utils._topk_min
+            pre_nms_top_n = det_utils._topk_min(ob, self.pre_nms_top_n(), 1)
+            _, top_n_idx = ob.topk(pre_nms_top_n, dim=1)
+            r.append(top_n_idx + offset)
+            offset += num_anchors
+        return torch.cat(r, dim=1)
+
+    def filter_proposals(
+        self,
+        proposals: Tensor,
+        objectness: Tensor,
+        image_shapes: list[tuple[int, int]],
+        num_anchors_per_level: list[int],
+    ) -> tuple[list[Tensor], list[Tensor]]:
+        """
+        Reduce the decoded proposals to the final per-image RPN outputs.
+
+        Steps: per-level top-k selection on the (detached) objectness, sigmoid,
+        clipping to the image, removal of boxes rejected by ``remove_small_boxes``
+        with ``self.min_size``, removal of scores below ``self.score_thresh``,
+        per-level NMS, and truncation to ``self.post_nms_top_n()``.
+
+        Args:
+            proposals (Tensor): decoded boxes; dim 0 is the image dimension.
+            objectness (Tensor): objectness logits, reshaped here to ``(num_images, -1)``.
+            image_shapes (list[tuple[int, int]]): per-image sizes used for clipping.
+            num_anchors_per_level (list[int]): anchor count of each feature level.
+
+        Returns:
+            tuple[list[Tensor], list[Tensor]]: per-image boxes and per-image scores
+            (sigmoid of the objectness).
+        """
+
+        num_images = proposals.shape[0]
+        device = proposals.device
+        # do not backprop through objectness
+        objectness = objectness.detach()
+        objectness = objectness.reshape(num_images, -1)
+
+        # feature-level index of every anchor, later used as the NMS group id
+        levels = [
+            torch.full((n,), idx, dtype=torch.int64, device=device) for idx, n in enumerate(num_anchors_per_level)
+        ]
+        levels = torch.cat(levels, 0)
+        levels = levels.reshape(1, -1).expand_as(objectness)
+
+        # select top_n boxes independently per level before applying nms
+        top_n_idx = self._get_top_n_idx(objectness, num_anchors_per_level)
+
+        # column vector of image indices so the advanced indexing below gathers per image
+        image_range = torch.arange(num_images, device=device)
+        batch_idx = image_range[:, None]
+
+        objectness = objectness[batch_idx, top_n_idx]
+        levels = levels[batch_idx, top_n_idx]
+        proposals = proposals[batch_idx, top_n_idx]
+
+        objectness_prob = torch.sigmoid(objectness)
+
+        final_boxes = []
+        final_scores = []
+        for boxes, scores, lvl, img_shape in zip(proposals, objectness_prob, levels, image_shapes):
+            boxes = box_ops.clip_boxes_to_image(boxes, img_shape)
+
+            # remove small boxes
+            keep = box_ops.remove_small_boxes(boxes, self.min_size)
+            boxes, scores, lvl = boxes[keep], scores[keep], lvl[keep]
+
+            # remove low scoring boxes
+            # use >= for Backwards compatibility
+            keep = torch.where(scores >= self.score_thresh)[0]
+            boxes, scores, lvl = boxes[keep], scores[keep], lvl[keep]
+
+            # non-maximum suppression, independently done per level
+            keep = box_ops.batched_nms(boxes, scores, lvl, self.nms_thresh)
+
+            # keep only topk scoring predictions
+            keep = keep[: self.post_nms_top_n()]
+            boxes, scores = boxes[keep], scores[keep]
+
+            final_boxes.append(boxes)
+            final_scores.append(scores)
+        return final_boxes, final_scores
+
+    def compute_loss(
+        self, objectness: Tensor, pred_bbox_deltas: Tensor, labels: list[Tensor], regression_targets: list[Tensor]
+    ) -> tuple[Tensor, Tensor]:
+        """
+        Compute the RPN objectness and box-regression losses on a sampled subset of anchors.
+
+        Args:
+            objectness (Tensor): objectness logits; flattened here before indexing
+            pred_bbox_deltas (Tensor): predicted box deltas, indexed by anchor along dim 0
+            labels (List[Tensor]): per-image anchor labels, concatenated here
+            regression_targets (List[Tensor]): per-image regression targets, concatenated here
+
+        Returns:
+            objectness_loss (Tensor): binary cross-entropy with logits over all sampled anchors
+            box_loss (Tensor): smooth L1 loss (``beta=1/9``) summed over the sampled positive
+                anchors and divided by the total number of sampled anchors
+        """
+
+        # the sampler output is concatenated over images and converted to flat indices;
+        # presumably one mask per image - TODO confirm in BalancedPositiveNegativeSampler
+        sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
+        sampled_pos_inds = torch.where(torch.cat(sampled_pos_inds, dim=0))[0]
+        sampled_neg_inds = torch.where(torch.cat(sampled_neg_inds, dim=0))[0]
+
+        sampled_inds = torch.cat([sampled_pos_inds, sampled_neg_inds], dim=0)
+
+        objectness = objectness.flatten()
+
+        labels = torch.cat(labels, dim=0)
+        regression_targets = torch.cat(regression_targets, dim=0)
+
+        # regression is only supervised on positives, but normalised by all sampled anchors
+        box_loss = F.smooth_l1_loss(
+            pred_bbox_deltas[sampled_pos_inds],
+            regression_targets[sampled_pos_inds],
+            beta=1 / 9,
+            reduction="sum",
+        ) / (sampled_inds.numel())
+
+        objectness_loss = F.binary_cross_entropy_with_logits(objectness[sampled_inds], labels[sampled_inds])
+
+        return objectness_loss, box_loss
+
+    def forward(
+        self,
+        images: ImageList,
+        features: dict[str, Tensor],
+        targets: Optional[list[dict[str, Tensor]]] = None,
+    ) -> tuple[list[Tensor], dict[str, Tensor]]:
+        """
+        Args:
+            images (ImageList): images for which we want to compute the predictions
+            features (Dict[str, Tensor]): features computed from the images that are
+                used for computing the predictions. Each tensor in the dict
+                corresponds to a different feature level
+            targets (List[Dict[str, Tensor]]): ground-truth boxes present in the image (optional).
+                If provided, each element in the dict should contain a field `boxes`,
+                with the locations of the ground-truth boxes.
+
+        Returns:
+            boxes (List[Tensor]): the predicted boxes from the RPN, one Tensor per
+                image.
+            losses (Dict[str, Tensor]): the losses for the model during training
+                ("loss_objectness" and "loss_rpn_box_reg"). During
+                testing, it is an empty dict.
+
+        Raises:
+            ValueError: if the module is in training mode and ``targets`` is None.
+        """
+        # RPN uses all feature maps that are available
+        features = list(features.values())
+        objectness, pred_bbox_deltas = self.head(features)
+        anchors = self.anchor_generator(images, features)
+
+        num_images = len(anchors)
+        # o[0] is the objectness map of the first image at one level; the product of its
+        # three dimensions is the number of anchors at that level
+        num_anchors_per_level_shape_tensors = [o[0].shape for o in objectness]
+        num_anchors_per_level = [s[0] * s[1] * s[2] for s in num_anchors_per_level_shape_tensors]
+        objectness, pred_bbox_deltas = concat_box_prediction_layers(objectness, pred_bbox_deltas)
+        # apply pred_bbox_deltas to anchors to obtain the decoded proposals
+        # note that we detach the deltas because Faster R-CNN do not backprop through
+        # the proposals
+        proposals = self.box_coder.decode(pred_bbox_deltas.detach(), anchors)
+        proposals = proposals.view(num_images, -1, 4)
+        # the filtered scores are not used past this point; only the boxes are returned
+        boxes, scores = self.filter_proposals(proposals, objectness, images.image_sizes, num_anchors_per_level)
+
+        losses = {}
+        if self.training:
+            if targets is None:
+                raise ValueError("targets should not be None")
+            labels, matched_gt_boxes = self.assign_targets_to_anchors(anchors, targets)
+            regression_targets = self.box_coder.encode(matched_gt_boxes, anchors)
+            loss_objectness, loss_rpn_box_reg = self.compute_loss(
+                objectness, pred_bbox_deltas, labels, regression_targets
+            )
+            losses = {
+                "loss_objectness": loss_objectness,
+                "loss_rpn_box_reg": loss_rpn_box_reg,
+            }
+        return boxes, losses
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/ssd.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/ssd.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cd43d04c7520965a8e9eed11d7d184e9991f805
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/ssd.py
@@ -0,0 +1,682 @@
+import warnings
+from collections import OrderedDict
+from typing import Any, Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn, Tensor
+
+from ...ops import boxes as box_ops
+from ...transforms._presets import ObjectDetection
+from ...utils import _log_api_usage_once
+from .._api import register_model, Weights, WeightsEnum
+from .._meta import _COCO_CATEGORIES
+from .._utils import _ovewrite_value_param, handle_legacy_interface
+from ..vgg import VGG, vgg16, VGG16_Weights
+from . import _utils as det_utils
+from .anchor_utils import DefaultBoxGenerator
+from .backbone_utils import _validate_trainable_layers
+from .transform import GeneralizedRCNNTransform
+
+
+__all__ = [
+    "SSD300_VGG16_Weights",
+    "ssd300_vgg16",
+]
+
+
+class SSD300_VGG16_Weights(WeightsEnum):  # pretrained-weight registry for ssd300_vgg16; COCO_V1 is its only entry
+    COCO_V1 = Weights(
+        url="https://download.pytorch.org/models/ssd300_vgg16_coco-b556d3b4.pth",
+        transforms=ObjectDetection,  # preprocessing preset paired with these weights
+        meta={
+            "num_params": 35641826,
+            "categories": _COCO_CATEGORIES,  # ssd300_vgg16 takes num_classes from the length of this list
+            "min_size": (1, 1),
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/detection#ssd300-vgg16",
+            "_metrics": {
+                "COCO-val2017": {
+                    "box_map": 25.1,
+                }
+            },
+            "_ops": 34.858,  # unit not stated here (presumably GFLOPs -- TODO confirm)
+            "_file_size": 135.988,  # unit not stated here (presumably MB -- TODO confirm)
+            "_docs": """These weights were produced by following a similar training recipe as on the paper.""",
+        },
+    )
+    DEFAULT = COCO_V1  # alias of the COCO_V1 entry
+
+
+def _xavier_init(conv: nn.Module):
+    """Xavier-uniform the weight and zero the bias of every ``nn.Conv2d`` reachable from ``conv``."""
+    for conv_layer in (m for m in conv.modules() if isinstance(m, nn.Conv2d)):
+        torch.nn.init.xavier_uniform_(conv_layer.weight)
+        if conv_layer.bias is not None:
+            torch.nn.init.constant_(conv_layer.bias, 0.0)
+
+
+class SSDHead(nn.Module):
+    """Bundle the per-level SSD classification and box-regression heads behind one module."""
+    def __init__(self, in_channels: list[int], num_anchors: list[int], num_classes: int):
+        super().__init__()
+        self.classification_head = SSDClassificationHead(in_channels, num_anchors, num_classes)
+        self.regression_head = SSDRegressionHead(in_channels, num_anchors)
+
+    def forward(self, x: list[Tensor]) -> dict[str, Tensor]:
+        # Regression runs first, then classification; the dict keys keep that order.
+        box_deltas = self.regression_head(x)
+        return {"bbox_regression": box_deltas, "cls_logits": self.classification_head(x)}
+
+
+class SSDScoringHead(nn.Module):
+    def __init__(self, module_list: nn.ModuleList, num_columns: int):
+        super().__init__()
+        self.module_list = module_list
+        self.num_columns = num_columns
+
+    def _get_result_from_module_list(self, x: Tensor, idx: int) -> Tensor:
+        """
+        Apply the ``idx``-th entry of ``self.module_list`` to ``x``; a plain
+        ``self.module_list[idx](x)`` is avoided for the sake of torchscript.
+        """
+        if idx < 0:
+            # negative positions count from the end of the module list
+            idx += len(self.module_list)
+        result = x  # returned untouched when no position matches ``idx``
+        for position, layer in enumerate(self.module_list):
+            if position == idx:
+                result = layer(x)
+        return result
+
+    def forward(self, x: list[Tensor]) -> Tensor:
+        per_level = []
+
+        for level, feature_map in enumerate(x):
+            scores = self._get_result_from_module_list(feature_map, level)
+
+            # (N, A * K, H, W) -> (N, A, K, H, W) -> (N, H, W, A, K) -> (N, HWA, K)
+            batch, _, height, width = scores.shape
+            scores = scores.view(batch, -1, self.num_columns, height, width)
+            scores = scores.permute(0, 3, 4, 1, 2)
+            scores = scores.reshape(batch, -1, self.num_columns)
+
+            per_level.append(scores)
+
+        return torch.cat(per_level, dim=1)
+
+
+class SSDClassificationHead(SSDScoringHead):
+    def __init__(self, in_channels: list[int], num_anchors: list[int], num_classes: int):
+        # one 3x3 conv per feature level, emitting ``num_classes`` logits for each anchor
+        convs = [nn.Conv2d(c, num_classes * a, kernel_size=3, padding=1) for c, a in zip(in_channels, num_anchors)]
+        cls_logits = nn.ModuleList(convs)
+        _xavier_init(cls_logits)
+        super().__init__(cls_logits, num_classes)
+
+
+class SSDRegressionHead(SSDScoringHead):
+    def __init__(self, in_channels: list[int], num_anchors: list[int]):
+        # one 3x3 conv per feature level, emitting 4 box-regression values for each anchor
+        convs = [nn.Conv2d(c, 4 * a, kernel_size=3, padding=1) for c, a in zip(in_channels, num_anchors)]
+        bbox_reg = nn.ModuleList(convs)
+        _xavier_init(bbox_reg)
+        super().__init__(bbox_reg, 4)
+
+
+class SSD(nn.Module):
+    """
+    Implements SSD architecture from `"SSD: Single Shot MultiBox Detector" <https://arxiv.org/abs/1512.02325>`_.
+
+    The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
+    image, and should be in 0-1 range. Different images can have different sizes, but they will be resized
+    to a fixed size before passing them to the backbone.
+
+    The behavior of the model changes depending on whether it is in training or evaluation mode.
+
+    During training, the model expects both the input tensors and targets (list of dictionary),
+    containing:
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (Int64Tensor[N]): the class label for each ground-truth box
+
+    The model returns a Dict[Tensor] during training, containing the classification and regression
+    losses.
+
+    During inference, the model requires only the input tensors, and returns the post-processed
+    predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
+    follows, where ``N`` is the number of detections:
+
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (Int64Tensor[N]): the predicted labels for each detection
+        - scores (Tensor[N]): the scores for each detection
+
+    Args:
+        backbone (nn.Module): the network used to compute the features for the model.
+            It should contain an out_channels attribute with the list of the output channels of
+            each feature map. The backbone should return a single Tensor or an OrderedDict[Tensor].
+        anchor_generator (DefaultBoxGenerator): module that generates the default boxes for a
+            set of feature maps.
+        size (Tuple[int, int]): the width and height to which images will be rescaled before feeding them
+            to the backbone.
+        num_classes (int): number of output classes of the model (including the background).
+        image_mean (Tuple[float, float, float]): mean values used for input normalization.
+            They are generally the mean values of the dataset on which the backbone has been trained
+            on
+        image_std (Tuple[float, float, float]): std values used for input normalization.
+            They are generally the std values of the dataset on which the backbone has been trained on
+        head (nn.Module, optional): Module run on top of the backbone features. Defaults to a module containing
+            a classification and regression module.
+        score_thresh (float): Score threshold used for postprocessing the detections.
+        nms_thresh (float): NMS threshold used for postprocessing the detections.
+        detections_per_img (int): Number of best detections to keep after NMS.
+        iou_thresh (float): minimum IoU between the anchor and the GT box so that they can be
+            considered as positive during training.
+        topk_candidates (int): Number of best detections to keep before NMS.
+        positive_fraction (float): a number between 0 and 1 which indicates the proportion of positive proposals used
+            during the training of the classification head. It is used to estimate the negative to positive ratio.
+        **kwargs: extra keyword arguments forwarded to ``GeneralizedRCNNTransform``.
+    """
+
+    __annotations__ = {  # explicit attribute types (presumably needed for torchscript -- TODO confirm)
+        "box_coder": det_utils.BoxCoder,
+        "proposal_matcher": det_utils.Matcher,
+    }
+
+    def __init__(
+        self,
+        backbone: nn.Module,
+        anchor_generator: DefaultBoxGenerator,
+        size: tuple[int, int],
+        num_classes: int,
+        image_mean: Optional[list[float]] = None,
+        image_std: Optional[list[float]] = None,
+        head: Optional[nn.Module] = None,
+        score_thresh: float = 0.01,
+        nms_thresh: float = 0.45,
+        detections_per_img: int = 200,
+        iou_thresh: float = 0.5,
+        topk_candidates: int = 400,
+        positive_fraction: float = 0.25,
+        **kwargs: Any,
+    ):
+        super().__init__()
+        _log_api_usage_once(self)
+
+        self.backbone = backbone
+
+        self.anchor_generator = anchor_generator
+
+        self.box_coder = det_utils.BoxCoder(weights=(10.0, 10.0, 5.0, 5.0))
+
+        if head is None:  # no custom head given: build the default SSDHead
+            if hasattr(backbone, "out_channels"):
+                out_channels = backbone.out_channels
+            else:
+                out_channels = det_utils.retrieve_out_channels(backbone, size)
+
+            if len(out_channels) != len(anchor_generator.aspect_ratios):
+                raise ValueError(
+                    f"The length of the output channels from the backbone ({len(out_channels)}) do not match the length of the anchor generator aspect ratios ({len(anchor_generator.aspect_ratios)})"
+                )
+
+            num_anchors = self.anchor_generator.num_anchors_per_location()
+            head = SSDHead(out_channels, num_anchors, num_classes)
+        self.head = head
+
+        self.proposal_matcher = det_utils.SSDMatcher(iou_thresh)
+
+        if image_mean is None:
+            image_mean = [0.485, 0.456, 0.406]
+        if image_std is None:
+            image_std = [0.229, 0.224, 0.225]
+        self.transform = GeneralizedRCNNTransform(
+            min(size), max(size), image_mean, image_std, size_divisible=1, fixed_size=size, **kwargs
+        )
+
+        self.score_thresh = score_thresh
+        self.nms_thresh = nms_thresh
+        self.detections_per_img = detections_per_img
+        self.topk_candidates = topk_candidates
+        self.neg_to_pos_ratio = (1.0 - positive_fraction) / positive_fraction  # e.g. 0.25 -> 3 negatives per positive
+
+        # used only on torchscript mode
+        self._has_warned = False
+
+    @torch.jit.unused
+    def eager_outputs(
+        self, losses: dict[str, Tensor], detections: list[dict[str, Tensor]]
+    ) -> tuple[dict[str, Tensor], list[dict[str, Tensor]]]:
+        if self.training:  # NOTE: despite the tuple annotation, only one of the two values is returned
+            return losses
+
+        return detections
+
+    def compute_loss(
+        self,
+        targets: list[dict[str, Tensor]],
+        head_outputs: dict[str, Tensor],
+        anchors: list[Tensor],
+        matched_idxs: list[Tensor],
+    ) -> dict[str, Tensor]:
+        bbox_regression = head_outputs["bbox_regression"]
+        cls_logits = head_outputs["cls_logits"]
+
+        # Match original targets with default boxes, one image at a time
+        num_foreground = 0
+        bbox_loss = []
+        cls_targets = []
+        for (
+            targets_per_image,
+            bbox_regression_per_image,
+            cls_logits_per_image,
+            anchors_per_image,
+            matched_idxs_per_image,
+        ) in zip(targets, bbox_regression, cls_logits, anchors, matched_idxs):
+            # anchors with a non-negative match index are foreground; the index selects their ground-truth box
+            foreground_idxs_per_image = torch.where(matched_idxs_per_image >= 0)[0]
+            foreground_matched_idxs_per_image = matched_idxs_per_image[foreground_idxs_per_image]
+            num_foreground += foreground_matched_idxs_per_image.numel()
+
+            # Calculate regression loss (foreground anchors only, summed per image)
+            matched_gt_boxes_per_image = targets_per_image["boxes"][foreground_matched_idxs_per_image]
+            bbox_regression_per_image = bbox_regression_per_image[foreground_idxs_per_image, :]
+            anchors_per_image = anchors_per_image[foreground_idxs_per_image, :]
+            target_regression = self.box_coder.encode_single(matched_gt_boxes_per_image, anchors_per_image)
+            bbox_loss.append(
+                torch.nn.functional.smooth_l1_loss(bbox_regression_per_image, target_regression, reduction="sum")
+            )
+
+            # Estimate ground truth for class targets: zero everywhere except at foreground anchors
+            gt_classes_target = torch.zeros(
+                (cls_logits_per_image.size(0),),
+                dtype=targets_per_image["labels"].dtype,
+                device=targets_per_image["labels"].device,
+            )
+            gt_classes_target[foreground_idxs_per_image] = targets_per_image["labels"][
+                foreground_matched_idxs_per_image
+            ]
+            cls_targets.append(gt_classes_target)
+
+        bbox_loss = torch.stack(bbox_loss)
+        cls_targets = torch.stack(cls_targets)
+
+        # Calculate classification loss (kept per anchor so hard negatives can be ranked below)
+        num_classes = cls_logits.size(-1)
+        cls_loss = F.cross_entropy(cls_logits.view(-1, num_classes), cls_targets.view(-1), reduction="none").view(
+            cls_targets.size()
+        )
+
+        # Hard Negative Sampling
+        foreground_idxs = cls_targets > 0  # target 0 is what unmatched anchors were initialised to above
+        num_negative = self.neg_to_pos_ratio * foreground_idxs.sum(1, keepdim=True)
+        # num_negative[num_negative < self.neg_to_pos_ratio] = self.neg_to_pos_ratio
+        negative_loss = cls_loss.clone()
+        negative_loss[foreground_idxs] = -float("inf")  # -inf sends foreground anchors to the end of the sort below
+        values, idx = negative_loss.sort(1, descending=True)
+        # background_idxs = torch.logical_and(idx.sort(1)[1] < num_negative, torch.isfinite(values))
+        background_idxs = idx.sort(1)[1] < num_negative  # idx.sort(1)[1] is each anchor's rank in that ordering
+
+        N = max(1, num_foreground)  # avoids dividing by zero when no anchor was matched
+        return {
+            "bbox_regression": bbox_loss.sum() / N,
+            "classification": (cls_loss[foreground_idxs].sum() + cls_loss[background_idxs].sum()) / N,
+        }
+
+    def forward(
+        self, images: list[Tensor], targets: Optional[list[dict[str, Tensor]]] = None
+    ) -> tuple[dict[str, Tensor], list[dict[str, Tensor]]]:
+        if self.training:  # validate the targets before doing any work
+            if targets is None:
+                torch._assert(False, "targets should not be none when in training mode")
+            else:
+                for target in targets:
+                    boxes = target["boxes"]
+                    if isinstance(boxes, torch.Tensor):
+                        torch._assert(
+                            len(boxes.shape) == 2 and boxes.shape[-1] == 4,
+                            f"Expected target boxes to be a tensor of shape [N, 4], got {boxes.shape}.",
+                        )
+                    else:
+                        torch._assert(False, f"Expected target boxes to be of type Tensor, got {type(boxes)}.")
+
+        # get the original image sizes (used at the end to map detections back)
+        original_image_sizes: list[tuple[int, int]] = []
+        for img in images:
+            val = img.shape[-2:]
+            torch._assert(
+                len(val) == 2,
+                f"expecting the last two dimensions of the Tensor to be H and W instead got {img.shape[-2:]}",
+            )
+            original_image_sizes.append((val[0], val[1]))
+
+        # transform the input
+        images, targets = self.transform(images, targets)
+
+        # Check for degenerate boxes (x2 <= x1 or y2 <= y1) after the transform
+        if targets is not None:
+            for target_idx, target in enumerate(targets):
+                boxes = target["boxes"]
+                degenerate_boxes = boxes[:, 2:] <= boxes[:, :2]
+                if degenerate_boxes.any():
+                    bb_idx = torch.where(degenerate_boxes.any(dim=1))[0][0]  # first offending box only
+                    degen_bb: list[float] = boxes[bb_idx].tolist()
+                    torch._assert(
+                        False,
+                        "All bounding boxes should have positive height and width."
+                        f" Found invalid box {degen_bb} for target at index {target_idx}.",
+                    )
+
+        # get the features from the backbone
+        features = self.backbone(images.tensors)
+        if isinstance(features, torch.Tensor):  # single-tensor backbones are wrapped so both cases look alike
+            features = OrderedDict([("0", features)])
+
+        features = list(features.values())
+
+        # compute the ssd heads outputs using the features
+        head_outputs = self.head(features)
+
+        # create the set of anchors
+        anchors = self.anchor_generator(images, features)
+
+        losses = {}
+        detections: list[dict[str, Tensor]] = []
+        if self.training:
+            matched_idxs = []
+            if targets is None:
+                torch._assert(False, "targets should not be none when in training mode")
+            else:
+                for anchors_per_image, targets_per_image in zip(anchors, targets):
+                    if targets_per_image["boxes"].numel() == 0:  # no ground truth: every anchor is unmatched (-1)
+                        matched_idxs.append(
+                            torch.full(
+                                (anchors_per_image.size(0),), -1, dtype=torch.int64, device=anchors_per_image.device
+                            )
+                        )
+                        continue
+
+                    match_quality_matrix = box_ops.box_iou(targets_per_image["boxes"], anchors_per_image)
+                    matched_idxs.append(self.proposal_matcher(match_quality_matrix))
+
+                losses = self.compute_loss(targets, head_outputs, anchors, matched_idxs)
+        else:
+            detections = self.postprocess_detections(head_outputs, anchors, images.image_sizes)
+            detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes)
+
+        if torch.jit.is_scripting():  # scripted code always returns both values; warn once per instance
+            if not self._has_warned:
+                warnings.warn("SSD always returns a (Losses, Detections) tuple in scripting")
+                self._has_warned = True
+            return losses, detections
+        return self.eager_outputs(losses, detections)
+
+    def postprocess_detections(
+        self, head_outputs: dict[str, Tensor], image_anchors: list[Tensor], image_shapes: list[tuple[int, int]]
+    ) -> list[dict[str, Tensor]]:
+        bbox_regression = head_outputs["bbox_regression"]
+        pred_scores = F.softmax(head_outputs["cls_logits"], dim=-1)
+
+        num_classes = pred_scores.size(-1)
+        device = pred_scores.device
+
+        detections: list[dict[str, Tensor]] = []
+
+        for boxes, scores, anchors, image_shape in zip(bbox_regression, pred_scores, image_anchors, image_shapes):
+            boxes = self.box_coder.decode_single(boxes, anchors)
+            boxes = box_ops.clip_boxes_to_image(boxes, image_shape)
+
+            image_boxes = []
+            image_scores = []
+            image_labels = []
+            for label in range(1, num_classes):  # column 0 is skipped (presumably the background class)
+                score = scores[:, label]
+
+                keep_idxs = score > self.score_thresh
+                score = score[keep_idxs]
+                box = boxes[keep_idxs]
+
+                # keep only topk scoring predictions (per class)
+                num_topk = det_utils._topk_min(score, self.topk_candidates, 0)  # presumably min(len, k) -- verify
+                score, idxs = score.topk(num_topk)
+                box = box[idxs]
+
+                image_boxes.append(box)
+                image_scores.append(score)
+                image_labels.append(torch.full_like(score, fill_value=label, dtype=torch.int64, device=device))
+
+            image_boxes = torch.cat(image_boxes, dim=0)
+            image_scores = torch.cat(image_scores, dim=0)
+            image_labels = torch.cat(image_labels, dim=0)
+
+            # non-maximum suppression, batched by label
+            keep = box_ops.batched_nms(image_boxes, image_scores, image_labels, self.nms_thresh)
+            keep = keep[: self.detections_per_img]
+
+            detections.append(
+                {
+                    "boxes": image_boxes[keep],
+                    "scores": image_scores[keep],
+                    "labels": image_labels[keep],
+                }
+            )
+        return detections
+
+
+class SSDFeatureExtractorVGG(nn.Module):
+    def __init__(self, backbone: nn.Module, highres: bool):
+        super().__init__()
+
+        _, _, maxpool3_pos, maxpool4_pos, _ = (i for i, layer in enumerate(backbone) if isinstance(layer, nn.MaxPool2d))
+
+        # Patch ceil_mode for maxpool3 to get the same WxH output sizes as the paper (mutates the given backbone)
+        backbone[maxpool3_pos].ceil_mode = True
+
+        # learnable per-channel scale (512 values, initialised to 20) applied after normalization in forward()
+        self.scale_weight = nn.Parameter(torch.ones(512) * 20)
+
+        # Multiple Feature maps - page 4, Fig 2 of SSD paper
+        self.features = nn.Sequential(*backbone[:maxpool4_pos])  # until conv4_3
+
+        # SSD300 case - page 4, Fig 2 of SSD paper
+        extra = nn.ModuleList(
+            [
+                nn.Sequential(
+                    nn.Conv2d(1024, 256, kernel_size=1),
+                    nn.ReLU(inplace=True),
+                    nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2),  # conv8_2
+                    nn.ReLU(inplace=True),
+                ),
+                nn.Sequential(
+                    nn.Conv2d(512, 128, kernel_size=1),
+                    nn.ReLU(inplace=True),
+                    nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2),  # conv9_2
+                    nn.ReLU(inplace=True),
+                ),
+                nn.Sequential(
+                    nn.Conv2d(256, 128, kernel_size=1),
+                    nn.ReLU(inplace=True),
+                    nn.Conv2d(128, 256, kernel_size=3),  # conv10_2
+                    nn.ReLU(inplace=True),
+                ),
+                nn.Sequential(
+                    nn.Conv2d(256, 128, kernel_size=1),
+                    nn.ReLU(inplace=True),
+                    nn.Conv2d(128, 256, kernel_size=3),  # conv11_2
+                    nn.ReLU(inplace=True),
+                ),
+            ]
+        )
+        if highres:
+            # Additional layers for the SSD512 case. See page 11, footnote 5.
+            extra.append(
+                nn.Sequential(
+                    nn.Conv2d(256, 128, kernel_size=1),
+                    nn.ReLU(inplace=True),
+                    nn.Conv2d(128, 256, kernel_size=4),  # conv12_2
+                    nn.ReLU(inplace=True),
+                )
+            )
+        _xavier_init(extra)
+
+        fc = nn.Sequential(
+            nn.MaxPool2d(kernel_size=3, stride=1, padding=1, ceil_mode=False),  # add modified maxpool5
+            nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, padding=6, dilation=6),  # FC6 with atrous
+            nn.ReLU(inplace=True),
+            nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=1),  # FC7
+            nn.ReLU(inplace=True),
+        )
+        _xavier_init(fc)
+        extra.insert(  # prepended after the init call above, so the reused backbone layers are not re-initialised
+            0,
+            nn.Sequential(
+                *backbone[maxpool4_pos:-1],  # until conv5_3, skip maxpool5
+                fc,
+            ),
+        )
+        self.extra = extra
+
+    def forward(self, x: Tensor) -> dict[str, Tensor]:
+        # Normalize (F.normalize) and rescale the 1st block's feature map; the raw x still feeds the extra blocks
+        x = self.features(x)
+        rescaled = self.scale_weight.view(1, -1, 1, 1) * F.normalize(x)
+        output = [rescaled]
+
+        # Calculating Feature maps for the rest blocks (each block consumes the previous block's output)
+        for block in self.extra:
+            x = block(x)
+            output.append(x)
+
+        return OrderedDict([(str(i), v) for i, v in enumerate(output)])  # keys are "0", "1", ... in block order
+
+
+def _vgg_extractor(backbone: VGG, highres: bool, trainable_layers: int):
+    """Freeze the leading VGG feature layers and wrap the layer stack in ``SSDFeatureExtractorVGG``."""
+    layers = backbone.features
+    # Stage boundaries: layer 0 plus every max-pool position except the last one.
+    pool_positions = [pos for pos, layer in enumerate(layers) if isinstance(layer, nn.MaxPool2d)]
+    stage_indices = [0, *pool_positions[:-1]]
+    num_stages = len(stage_indices)
+    torch._assert(
+        0 <= trainable_layers <= num_stages,
+        f"trainable_layers should be in the range [0, {num_stages}]. Instead got {trainable_layers}",
+    )
+    # Layers before ``freeze_before`` stop receiving gradients (the whole stack when nothing is trainable).
+    freeze_before = stage_indices[num_stages - trainable_layers] if trainable_layers != 0 else len(layers)
+    for frozen_layer in layers[:freeze_before]:
+        for parameter in frozen_layer.parameters():
+            parameter.requires_grad_(False)
+
+    return SSDFeatureExtractorVGG(layers, highres)
+
+
+@register_model()
+@handle_legacy_interface(
+    weights=("pretrained", SSD300_VGG16_Weights.COCO_V1),
+    weights_backbone=("pretrained_backbone", VGG16_Weights.IMAGENET1K_FEATURES),
+)
+def ssd300_vgg16(
+    *,
+    weights: Optional[SSD300_VGG16_Weights] = None,
+    progress: bool = True,
+    num_classes: Optional[int] = None,
+    weights_backbone: Optional[VGG16_Weights] = VGG16_Weights.IMAGENET1K_FEATURES,
+    trainable_backbone_layers: Optional[int] = None,
+    **kwargs: Any,
+) -> SSD:
+    """The SSD300 model is based on the `SSD: Single Shot MultiBox Detector
+    <https://arxiv.org/abs/1512.02325>`_ paper.
+
+    .. betastatus:: detection module
+
+    The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
+    image, and should be in 0-1 range. Different images can have different sizes, but they will be resized
+    to a fixed size before passing it to the backbone.
+
+    The behavior of the model changes depending on if it is in training or evaluation mode.
+
+    During training, the model expects both the input tensors and targets (list of dictionary),
+    containing:
+
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (Int64Tensor[N]): the class label for each ground-truth box
+
+    The model returns a Dict[Tensor] during training, containing the classification and regression
+    losses.
+
+    During inference, the model requires only the input tensors, and returns the post-processed
+    predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
+    follows, where ``N`` is the number of detections:
+
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (Int64Tensor[N]): the predicted labels for each detection
+        - scores (Tensor[N]): the scores for each detection
+
+    Example:
+
+        >>> model = torchvision.models.detection.ssd300_vgg16(weights=SSD300_VGG16_Weights.DEFAULT)
+        >>> model.eval()
+        >>> x = [torch.rand(3, 300, 300), torch.rand(3, 500, 400)]
+        >>> predictions = model(x)
+
+    Args:
+        weights (:class:`~torchvision.models.detection.SSD300_VGG16_Weights`, optional): The pretrained
+                weights to use. See
+                :class:`~torchvision.models.detection.SSD300_VGG16_Weights`
+                below for more details, and possible values. By default, no
+                pre-trained weights are used.
+        progress (bool, optional): If True, displays a progress bar of the download to stderr
+            Default is True.
+        num_classes (int, optional): number of output classes of the model (including the background)
+        weights_backbone (:class:`~torchvision.models.VGG16_Weights`, optional): The pretrained weights for the
+            backbone
+        trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from final block.
+            Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable. If ``None`` is
+            passed (the default) this value is set to 4.
+        **kwargs: parameters passed to the ``torchvision.models.detection.SSD``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/ssd.py>`_
+            for more details about this class. A ``size`` entry is ignored (with a warning): the size is fixed.
+
+    .. autoclass:: torchvision.models.detection.SSD300_VGG16_Weights
+        :members:
+    """
+    weights = SSD300_VGG16_Weights.verify(weights)
+    weights_backbone = VGG16_Weights.verify(weights_backbone)
+
+    if "size" in kwargs:
+        warnings.warn("The size of the model is already fixed; ignoring the parameter.")
+
+    if weights is not None:
+        # full-model weights already contain the backbone, so no separate backbone download is needed
+        weights_backbone = None
+        num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
+    elif num_classes is None:
+        num_classes = 91
+
+    trainable_backbone_layers = _validate_trainable_layers(
+        weights is not None or weights_backbone is not None, trainable_backbone_layers, 5, 4
+    )
+
+    # Use custom backbones more appropriate for SSD
+    backbone = vgg16(weights=weights_backbone, progress=progress)
+    backbone = _vgg_extractor(backbone, False, trainable_backbone_layers)
+    anchor_generator = DefaultBoxGenerator(
+        [[2], [2, 3], [2, 3], [2, 3], [2], [2]],
+        scales=[0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05],
+        steps=[8, 16, 32, 64, 100, 300],
+    )
+
+    defaults = {
+        # Rescale the input in a way compatible to the backbone
+        "image_mean": [0.48235, 0.45882, 0.40784],
+        "image_std": [1.0 / 255.0, 1.0 / 255.0, 1.0 / 255.0],  # undo the 0-1 scaling of toTensor
+    }
+    kwargs: Any = {**defaults, **kwargs, "size": (300, 300)}  # fixed size overrides any caller-supplied "size"
+    model = SSD(backbone, anchor_generator, num_classes=num_classes, **kwargs)
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+
+    return model
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/ssdlite.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/ssdlite.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b05aae0c0fc38d25388550ce27df35bfc45c3a7
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/ssdlite.py
@@ -0,0 +1,331 @@
+import warnings
+from collections import OrderedDict
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+from torch import nn, Tensor
+
+from ...ops.misc import Conv2dNormActivation
+from ...transforms._presets import ObjectDetection
+from ...utils import _log_api_usage_once
+from .. import mobilenet
+from .._api import register_model, Weights, WeightsEnum
+from .._meta import _COCO_CATEGORIES
+from .._utils import _ovewrite_value_param, handle_legacy_interface
+from ..mobilenetv3 import mobilenet_v3_large, MobileNet_V3_Large_Weights
+from . import _utils as det_utils
+from .anchor_utils import DefaultBoxGenerator
+from .backbone_utils import _validate_trainable_layers
+from .ssd import SSD, SSDScoringHead
+
+
+# Names exported by ``from ... import *`` for this module: the weights enum and the model builder.
+__all__ = [
+    "SSDLite320_MobileNet_V3_Large_Weights",
+    "ssdlite320_mobilenet_v3_large",
+]
+
+
+# Building blocks of SSDlite as described in section 6.2 of MobileNetV2 paper
+def _prediction_block(
+    in_channels: int, out_channels: int, kernel_size: int, norm_layer: Callable[..., nn.Module]
+) -> nn.Sequential:
+    """Build one SSDlite prediction block.
+
+    The block is a depthwise ``Conv2dNormActivation`` (``groups=in_channels``, ReLU6 activation)
+    followed by a plain 1x1 ``nn.Conv2d`` that maps ``in_channels`` to ``out_channels``.
+    """
+    # Depthwise stage: one group per input channel, channel count unchanged
+    depthwise = Conv2dNormActivation(
+        in_channels,
+        in_channels,
+        kernel_size=kernel_size,
+        groups=in_channels,
+        norm_layer=norm_layer,
+        activation_layer=nn.ReLU6,
+    )
+    # Pointwise stage: 1x1 projection to the output channels
+    pointwise = nn.Conv2d(in_channels, out_channels, 1)
+    return nn.Sequential(depthwise, pointwise)
+
+
+def _extra_block(in_channels: int, out_channels: int, norm_layer: Callable[..., nn.Module]) -> nn.Sequential:
+    """Build one extra feature block: 1x1 reduce, 3x3 stride-2 depthwise, 1x1 expand.
+
+    All three stages are ``Conv2dNormActivation`` layers using ``norm_layer`` and ReLU6.
+    """
+    half_channels = out_channels // 2
+    # Every stage shares the same normalization and activation settings
+    make_layer = partial(Conv2dNormActivation, norm_layer=norm_layer, activation_layer=nn.ReLU6)
+    stages = [
+        # 1x1 projection to half of the output channels
+        make_layer(in_channels, half_channels, kernel_size=1),
+        # 3x3 depthwise convolution with stride 2
+        make_layer(half_channels, half_channels, kernel_size=3, stride=2, groups=half_channels),
+        # 1x1 projection to the output channels
+        make_layer(half_channels, out_channels, kernel_size=1),
+    ]
+    return nn.Sequential(*stages)
+
+
+def _normal_init(conv: nn.Module):
+    """Re-initialise every ``nn.Conv2d`` inside ``conv``: normal weights (mean 0, std 0.03), zero biases."""
+    conv_layers = (module for module in conv.modules() if isinstance(module, nn.Conv2d))
+    for conv_layer in conv_layers:
+        torch.nn.init.normal_(conv_layer.weight, mean=0.0, std=0.03)
+        if conv_layer.bias is None:
+            # Convolutions created without a bias have nothing more to reset
+            continue
+        torch.nn.init.constant_(conv_layer.bias, 0.0)
+
+
+class SSDLiteHead(nn.Module):
+    """Pair of SSDlite heads producing box-regression and classification outputs."""
+
+    def __init__(
+        self, in_channels: list[int], num_anchors: list[int], num_classes: int, norm_layer: Callable[..., nn.Module]
+    ):
+        super().__init__()
+        # The classification head is built before the regression head
+        self.classification_head = SSDLiteClassificationHead(in_channels, num_anchors, num_classes, norm_layer)
+        self.regression_head = SSDLiteRegressionHead(in_channels, num_anchors, norm_layer)
+
+    def forward(self, x: list[Tensor]) -> dict[str, Tensor]:
+        bbox_regression = self.regression_head(x)
+        cls_logits = self.classification_head(x)
+        return {"bbox_regression": bbox_regression, "cls_logits": cls_logits}
+
+
+class SSDLiteClassificationHead(SSDScoringHead):
+    """Scoring head with one prediction block per ``(in_channels, num_anchors)`` pair."""
+
+    def __init__(
+        self, in_channels: list[int], num_anchors: list[int], num_classes: int, norm_layer: Callable[..., nn.Module]
+    ):
+        # Each block outputs ``num_classes`` values for every anchor of its input
+        blocks = [
+            _prediction_block(n_channels, num_classes * n_anchors, 3, norm_layer)
+            for n_channels, n_anchors in zip(in_channels, num_anchors)
+        ]
+        cls_logits = nn.ModuleList(blocks)
+        _normal_init(cls_logits)
+        super().__init__(cls_logits, num_classes)
+
+
+class SSDLiteRegressionHead(SSDScoringHead):
+    """Scoring head with one 4-values-per-anchor prediction block per ``(in_channels, num_anchors)`` pair."""
+
+    def __init__(self, in_channels: list[int], num_anchors: list[int], norm_layer: Callable[..., nn.Module]):
+        # Each block outputs 4 values for every anchor of its input
+        blocks = [
+            _prediction_block(n_channels, 4 * n_anchors, 3, norm_layer)
+            for n_channels, n_anchors in zip(in_channels, num_anchors)
+        ]
+        bbox_reg = nn.ModuleList(blocks)
+        _normal_init(bbox_reg)
+        super().__init__(bbox_reg, 4)
+
+
+class SSDLiteFeatureExtractorMobileNet(nn.Module):
+    """Feature extractor that splits a MobileNet feature stack in two and appends four extra blocks.
+
+    ``forward`` returns an ordered mapping keyed by the string index of each collected feature
+    map: the outputs of the two backbone segments first, then the output of every extra block.
+
+    Args:
+        backbone: indexable and sliceable sequence of backbone blocks (the caller in this file
+            passes ``backbone.features``).
+        c4_pos: index of the block at which the backbone is split; that block must have
+            ``use_res_connect`` set to False.
+        norm_layer: normalization layer factory handed to the extra blocks.
+        width_mult: multiplier applied to the nominal channel counts of the extra blocks.
+        min_depth: lower bound for the channel counts computed for the extra blocks.
+
+    Raises:
+        ValueError: if ``backbone[c4_pos].use_res_connect`` is truthy.
+    """
+
+    def __init__(
+        self,
+        backbone: nn.Module,
+        c4_pos: int,
+        norm_layer: Callable[..., nn.Module],
+        width_mult: float = 1.0,
+        min_depth: int = 16,
+    ):
+        super().__init__()
+        _log_api_usage_once(self)
+
+        if backbone[c4_pos].use_res_connect:
+            raise ValueError("backbone[c4_pos].use_res_connect should be False")
+
+        # The split happens inside block ``c4_pos``: its first sub-layer ends the first segment,
+        # its remaining sub-layers start the second one.
+        self.features = nn.Sequential(
+            # As described in section 6.3 of MobileNetV3 paper
+            nn.Sequential(*backbone[:c4_pos], backbone[c4_pos].block[0]),  # from start until C4 expansion layer
+            nn.Sequential(backbone[c4_pos].block[1:], *backbone[c4_pos + 1 :]),  # from C4 depthwise until end
+        )
+
+        # Scale a nominal channel count by ``width_mult`` but never go below ``min_depth``
+        get_depth = lambda d: max(min_depth, int(d * width_mult))  # noqa: E731
+        extra = nn.ModuleList(
+            [
+                _extra_block(backbone[-1].out_channels, get_depth(512), norm_layer),
+                _extra_block(get_depth(512), get_depth(256), norm_layer),
+                _extra_block(get_depth(256), get_depth(256), norm_layer),
+                _extra_block(get_depth(256), get_depth(128), norm_layer),
+            ]
+        )
+        _normal_init(extra)
+
+        self.extra = extra
+
+    def forward(self, x: Tensor) -> dict[str, Tensor]:
+        """Run the backbone segments and the extra blocks in sequence, collecting every output."""
+        # Get feature maps from backbone and extra. Can't be refactored due to JIT limitations.
+        output = []
+        for block in self.features:
+            x = block(x)
+            output.append(x)
+
+        for block in self.extra:
+            x = block(x)
+            output.append(x)
+
+        # Keys are "0", "1", ... in collection order
+        return OrderedDict([(str(i), v) for i, v in enumerate(output)])
+
+
+def _mobilenet_extractor(
+    backbone: Union[mobilenet.MobileNetV2, mobilenet.MobileNetV3],
+    trainable_layers: int,
+    norm_layer: Callable[..., nn.Module],
+):
+    """Freeze the leading part of a MobileNet backbone and wrap it as an SSDlite feature extractor.
+
+    Args:
+        backbone: MobileNet model; only its ``features`` attribute is used.
+        trainable_layers: number of stages, counted from the end of the backbone, that stay
+            trainable. ``0`` freezes every block.
+        norm_layer: normalization layer factory forwarded to the feature extractor.
+
+    Returns:
+        A ``SSDLiteFeatureExtractorMobileNet`` built on the (partially frozen) feature blocks.
+
+    Raises:
+        ValueError: if ``trainable_layers`` is outside ``[0, num_stages]``.
+    """
+    backbone = backbone.features
+    # Gather the indices of blocks which are strided. These are the locations of C1, ..., Cn-1 blocks.
+    # The first and last blocks are always included because they are the C0 (conv1) and Cn.
+    stage_indices = [0] + [i for i, b in enumerate(backbone) if getattr(b, "_is_cn", False)] + [len(backbone) - 1]
+    num_stages = len(stage_indices)
+
+    # find the index of the layer from which we won't freeze
+    if not 0 <= trainable_layers <= num_stages:
+        # The message must be an f-string, otherwise the placeholders are printed literally
+        raise ValueError(
+            f"trainable_layers should be in the range [0, {num_stages}], instead got {trainable_layers}"
+        )
+    freeze_before = len(backbone) if trainable_layers == 0 else stage_indices[num_stages - trainable_layers]
+
+    for b in backbone[:freeze_before]:
+        for parameter in b.parameters():
+            parameter.requires_grad_(False)
+
+    # The second-to-last stage index is used as the split position of the extractor
+    return SSDLiteFeatureExtractorMobileNet(backbone, stage_indices[-2], norm_layer)
+
+
+class SSDLite320_MobileNet_V3_Large_Weights(WeightsEnum):
+    # Pretrained checkpoint entry: download URL, preprocessing transform and descriptive metadata.
+    # Keys prefixed with "_" ("_metrics", "_ops", "_file_size", "_docs") are informational values
+    # stored alongside the public ones; their units are not stated here.
+    COCO_V1 = Weights(
+        url="https://download.pytorch.org/models/ssdlite320_mobilenet_v3_large_coco-a79551df.pth",
+        transforms=ObjectDetection,
+        meta={
+            "num_params": 3440060,
+            "categories": _COCO_CATEGORIES,
+            "min_size": (1, 1),
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/detection#ssdlite320-mobilenetv3-large",
+            "_metrics": {
+                "COCO-val2017": {
+                    "box_map": 21.3,
+                }
+            },
+            "_ops": 0.583,
+            "_file_size": 13.418,
+            "_docs": """These weights were produced by following a similar training recipe as on the paper.""",
+        },
+    )
+    # Alias used when callers ask for the default weights
+    DEFAULT = COCO_V1
+
+
+@register_model()
+@handle_legacy_interface(
+    weights=("pretrained", SSDLite320_MobileNet_V3_Large_Weights.COCO_V1),
+    weights_backbone=("pretrained_backbone", MobileNet_V3_Large_Weights.IMAGENET1K_V1),
+)
+def ssdlite320_mobilenet_v3_large(
+    *,
+    weights: Optional[SSDLite320_MobileNet_V3_Large_Weights] = None,
+    progress: bool = True,
+    num_classes: Optional[int] = None,
+    weights_backbone: Optional[MobileNet_V3_Large_Weights] = MobileNet_V3_Large_Weights.IMAGENET1K_V1,
+    trainable_backbone_layers: Optional[int] = None,
+    norm_layer: Optional[Callable[..., nn.Module]] = None,
+    **kwargs: Any,
+) -> SSD:
+    """SSDlite model architecture with input size 320x320 and a MobileNetV3 Large backbone, as
+    described at `Searching for MobileNetV3 <https://arxiv.org/abs/1905.02244>`__ and
+    `MobileNetV2: Inverted Residuals and Linear Bottlenecks <https://arxiv.org/abs/1801.04381>`__.
+
+    .. betastatus:: detection module
+
+    See :func:`~torchvision.models.detection.ssd300_vgg16` for more details.
+
+    Example:
+
+        >>> model = torchvision.models.detection.ssdlite320_mobilenet_v3_large(weights=SSDLite320_MobileNet_V3_Large_Weights.DEFAULT)
+        >>> model.eval()
+        >>> x = [torch.rand(3, 320, 320), torch.rand(3, 500, 400)]
+        >>> predictions = model(x)
+
+    Args:
+        weights (:class:`~torchvision.models.detection.SSDLite320_MobileNet_V3_Large_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.detection.SSDLite320_MobileNet_V3_Large_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        num_classes (int, optional): number of output classes of the model
+            (including the background). When neither ``weights`` nor ``num_classes`` is given,
+            91 is used.
+        weights_backbone (:class:`~torchvision.models.MobileNet_V3_Large_Weights`, optional): The pretrained
+            weights for the backbone. Ignored (reset to ``None``) when ``weights`` is given.
+        trainable_backbone_layers (int, optional): number of trainable (not frozen) layers
+            starting from final block. Valid values are between 0 and 6, with 6 meaning all
+            backbone layers are trainable. If ``None`` is passed (the default) this value is
+            set to 6.
+        norm_layer (callable, optional): Module specifying the normalization layer to use.
+            Defaults to ``nn.BatchNorm2d`` with ``eps=0.001`` and ``momentum=0.03``.
+        **kwargs: parameters passed to the ``torchvision.models.detection.ssd.SSD``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/ssdlite.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.detection.SSDLite320_MobileNet_V3_Large_Weights
+        :members:
+    """
+
+    weights = SSDLite320_MobileNet_V3_Large_Weights.verify(weights)
+    weights_backbone = MobileNet_V3_Large_Weights.verify(weights_backbone)
+
+    # NOTE(review): this only warns; the "size" key is left in kwargs and is still forwarded
+    # below -- confirm the callees tolerate it.
+    if "size" in kwargs:
+        warnings.warn("The size of the model is already fixed; ignoring the parameter.")
+
+    if weights is not None:
+        # Full detection weights take precedence over separate backbone weights, and
+        # num_classes is reconciled with the number of categories in the weights metadata.
+        weights_backbone = None
+        num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
+    elif num_classes is None:
+        num_classes = 91
+
+    trainable_backbone_layers = _validate_trainable_layers(
+        weights is not None or weights_backbone is not None, trainable_backbone_layers, 6, 6
+    )
+
+    # Enable reduced tail if no pretrained backbone is selected. See Table 6 of MobileNetV3 paper.
+    reduce_tail = weights_backbone is None
+
+    if norm_layer is None:
+        norm_layer = partial(nn.BatchNorm2d, eps=0.001, momentum=0.03)
+
+    # NOTE(review): the same **kwargs are passed here and to SSD further down -- confirm this
+    # double forwarding is intended.
+    backbone = mobilenet_v3_large(
+        weights=weights_backbone, progress=progress, norm_layer=norm_layer, reduced_tail=reduce_tail, **kwargs
+    )
+    if weights_backbone is None:
+        # Change the default initialization scheme if not pretrained
+        _normal_init(backbone)
+    backbone = _mobilenet_extractor(
+        backbone,
+        trainable_backbone_layers,
+        norm_layer,
+    )
+
+    size = (320, 320)
+    anchor_generator = DefaultBoxGenerator([[2, 3] for _ in range(6)], min_ratio=0.2, max_ratio=0.95)
+    out_channels = det_utils.retrieve_out_channels(backbone, size)
+    num_anchors = anchor_generator.num_anchors_per_location()
+    # The head needs exactly one aspect-ratio entry per backbone output
+    if len(out_channels) != len(anchor_generator.aspect_ratios):
+        raise ValueError(
+            f"The length of the output channels from the backbone {len(out_channels)} do not match the length of the anchor generator aspect ratios {len(anchor_generator.aspect_ratios)}"
+        )
+
+    defaults = {
+        "score_thresh": 0.001,
+        "nms_thresh": 0.55,
+        "detections_per_img": 300,
+        "topk_candidates": 300,
+        # Rescale the input in a way compatible to the backbone:
+        # The following mean/std rescale the data from [0, 1] to [-1, 1]
+        "image_mean": [0.5, 0.5, 0.5],
+        "image_std": [0.5, 0.5, 0.5],
+    }
+    # Caller-supplied kwargs override the defaults above
+    kwargs: Any = {**defaults, **kwargs}
+    model = SSD(
+        backbone,
+        anchor_generator,
+        size,
+        num_classes,
+        head=SSDLiteHead(out_channels, num_anchors, num_classes, norm_layer),
+        **kwargs,
+    )
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+
+    return model
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/transform.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac54873dee8b798761db2b00f407d24bdafec33f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/detection/transform.py
@@ -0,0 +1,319 @@
+import math
+from typing import Any, Optional
+
+import torch
+import torchvision
+from torch import nn, Tensor
+
+from .image_list import ImageList
+from .roi_heads import paste_masks_in_image
+
+
+@torch.jit.unused
+def _get_shape_onnx(image: Tensor) -> Tensor:
+    """Return the last two entries of ``image``'s shape as a tensor, via ``torch.onnx.operators``."""
+    # Imported inside the function body rather than at module level
+    from torch.onnx import operators
+
+    return operators.shape_as_tensor(image)[-2:]
+
+
+@torch.jit.unused
+def _fake_cast_onnx(v: Tensor) -> float:
+    """Return ``v`` unchanged; only the annotated return type differs from the parameter type."""
+    # ONNX requires a tensor but here we fake its type for JIT.
+    return v
+
+
+def _resize_image_and_masks(
+    image: Tensor,
+    self_min_size: int,
+    self_max_size: int,
+    target: Optional[dict[str, Tensor]] = None,
+    fixed_size: Optional[tuple[int, int]] = None,
+) -> tuple[Tensor, Optional[dict[str, Tensor]]]:
+    """Resize an image, and the ``"masks"`` entry of ``target`` if present.
+
+    When ``fixed_size`` is given the interpolation size is ``[fixed_size[1], fixed_size[0]]``.
+    Otherwise the image is rescaled by the smaller of ``self_min_size / min(H, W)`` and
+    ``self_max_size / max(H, W)``, where ``H, W`` are the last two dimensions of ``image``.
+
+    Args:
+        image: tensor whose last two dimensions are resized.
+        self_min_size: target for the shorter side (ignored when ``fixed_size`` is set).
+        self_max_size: upper bound for the longer side (ignored when ``fixed_size`` is set).
+        target: optional annotation dict; its ``"masks"`` entry, if any, is replaced in place.
+        fixed_size: optional explicit output size.
+
+    Returns:
+        The resized image and the (possibly updated) ``target``.
+    """
+    # The shape is obtained differently for ONNX tracing, TorchScript and eager mode
+    if torchvision._is_tracing():
+        im_shape = _get_shape_onnx(image)
+    elif torch.jit.is_scripting():
+        im_shape = torch.tensor(image.shape[-2:])
+    else:
+        im_shape = image.shape[-2:]
+
+    # Exactly one of ``size`` / ``scale_factor`` ends up set and is handed to ``interpolate``
+    size: Optional[list[int]] = None
+    scale_factor: Optional[float] = None
+    recompute_scale_factor: Optional[bool] = None
+    if fixed_size is not None:
+        # Note the swapped order of the two entries
+        size = [fixed_size[1], fixed_size[0]]
+    else:
+        if torch.jit.is_scripting() or torchvision._is_tracing():
+            min_size = torch.min(im_shape).to(dtype=torch.float32)
+            max_size = torch.max(im_shape).to(dtype=torch.float32)
+            self_min_size_f = float(self_min_size)
+            self_max_size_f = float(self_max_size)
+            scale = torch.min(self_min_size_f / min_size, self_max_size_f / max_size)
+
+            if torchvision._is_tracing():
+                scale_factor = _fake_cast_onnx(scale)
+            else:
+                scale_factor = scale.item()
+
+        else:
+            # Do it the normal way
+            min_size = min(im_shape)
+            max_size = max(im_shape)
+            scale_factor = min(self_min_size / min_size, self_max_size / max_size)
+
+        recompute_scale_factor = True
+
+    # ``image[None]`` adds a leading batch dimension for ``interpolate``; ``[0]`` removes it again
+    image = torch.nn.functional.interpolate(
+        image[None],
+        size=size,
+        scale_factor=scale_factor,
+        mode="bilinear",
+        recompute_scale_factor=recompute_scale_factor,
+        align_corners=False,
+    )[0]
+
+    if target is None:
+        return image, target
+
+    if "masks" in target:
+        mask = target["masks"]
+        # Masks are interpolated as floats with the default mode, then converted to uint8
+        mask = torch.nn.functional.interpolate(
+            mask[:, None].float(), size=size, scale_factor=scale_factor, recompute_scale_factor=recompute_scale_factor
+        )[:, 0].byte()
+        target["masks"] = mask
+    return image, target
+
+
+class GeneralizedRCNNTransform(nn.Module):
+    """
+    Performs input / target transformation before feeding the data to a GeneralizedRCNN
+    model.
+
+    The transformations it performs are:
+        - input normalization (mean subtraction and std division)
+        - input / target resizing to match min_size / max_size
+
+    It returns an ImageList for the inputs, and a List[Dict[Tensor]] for the targets
+    """
+
+    def __init__(
+        self,
+        min_size: int,
+        max_size: int,
+        image_mean: list[float],
+        image_std: list[float],
+        size_divisible: int = 32,
+        fixed_size: Optional[tuple[int, int]] = None,
+        **kwargs: Any,
+    ):
+        """Store the transform settings.
+
+        Args:
+            min_size: a single value or a list/tuple of candidate values; a single value is
+                wrapped into a one-element tuple.
+            max_size: upper bound passed to the resize step.
+            image_mean: per-channel mean used by ``normalize``.
+            image_std: per-channel standard deviation used by ``normalize``.
+            size_divisible: the padded batch height and width are rounded up to a multiple of this.
+            fixed_size: optional explicit output size forwarded to the resize step.
+            **kwargs: only ``_skip_resize`` (default False) is read.
+        """
+        super().__init__()
+        if not isinstance(min_size, (list, tuple)):
+            min_size = (min_size,)
+        self.min_size = min_size
+        self.max_size = max_size
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.size_divisible = size_divisible
+        self.fixed_size = fixed_size
+        self._skip_resize = kwargs.pop("_skip_resize", False)
+
+    def forward(
+        self, images: list[Tensor], targets: Optional[list[dict[str, Tensor]]] = None
+    ) -> tuple[ImageList, Optional[list[dict[str, Tensor]]]]:
+        """Normalize, resize and batch ``images``; resize ``targets`` alongside.
+
+        Raises:
+            ValueError: if an image is not a 3-dimensional tensor.
+        """
+        # Shallow copy so the caller's list is not modified
+        images = [img for img in images]
+        if targets is not None:
+            # make a copy of targets to avoid modifying it in-place
+            # once torchscript supports dict comprehension
+            # this can be simplified as follows
+            # targets = [{k: v for k,v in t.items()} for t in targets]
+            targets_copy: list[dict[str, Tensor]] = []
+            for t in targets:
+                data: dict[str, Tensor] = {}
+                for k, v in t.items():
+                    data[k] = v
+                targets_copy.append(data)
+            targets = targets_copy
+        for i in range(len(images)):
+            image = images[i]
+            target_index = targets[i] if targets is not None else None
+
+            if image.dim() != 3:
+                raise ValueError(f"images is expected to be a list of 3d tensors of shape [C, H, W], got {image.shape}")
+            image = self.normalize(image)
+            image, target_index = self.resize(image, target_index)
+            images[i] = image
+            if targets is not None and target_index is not None:
+                targets[i] = target_index
+
+        # Record the per-image sizes before padding to a common batch shape
+        image_sizes = [img.shape[-2:] for img in images]
+        images = self.batch_images(images, size_divisible=self.size_divisible)
+        image_sizes_list: list[tuple[int, int]] = []
+        for image_size in image_sizes:
+            torch._assert(
+                len(image_size) == 2,
+                f"Input tensors expected to have in the last two elements H and W, instead got {image_size}",
+            )
+            image_sizes_list.append((image_size[0], image_size[1]))
+
+        image_list = ImageList(images, image_sizes_list)
+        return image_list, targets
+
+    def normalize(self, image: Tensor) -> Tensor:
+        """Subtract ``image_mean`` and divide by ``image_std`` along the first dimension.
+
+        Raises:
+            TypeError: if ``image`` is not a floating point tensor.
+        """
+        if not image.is_floating_point():
+            raise TypeError(
+                f"Expected input images to be of floating type (in range [0, 1]), "
+                f"but found type {image.dtype} instead"
+            )
+        dtype, device = image.dtype, image.device
+        mean = torch.as_tensor(self.image_mean, dtype=dtype, device=device)
+        std = torch.as_tensor(self.image_std, dtype=dtype, device=device)
+        # ``[:, None, None]`` lets the per-channel vectors broadcast over the two trailing dimensions
+        return (image - mean[:, None, None]) / std[:, None, None]
+
+    def torch_choice(self, k: list[int]) -> int:
+        """
+        Implements `random.choice` via torch ops, so it can be compiled with
+        TorchScript and we use PyTorch's RNG (not native RNG)
+        """
+        index = int(torch.empty(1).uniform_(0.0, float(len(k))).item())
+        return k[index]
+
+    def resize(
+        self,
+        image: Tensor,
+        target: Optional[dict[str, Tensor]] = None,
+    ) -> tuple[Tensor, Optional[dict[str, Tensor]]]:
+        """Resize ``image`` and rescale the ``"boxes"`` / ``"keypoints"`` of ``target`` to match.
+
+        In training mode the size is drawn from ``self.min_size`` (or the input is returned
+        untouched when ``_skip_resize`` is set); otherwise the last entry of ``self.min_size``
+        is used.
+        """
+        # Remember the size before resizing so the annotations can be rescaled afterwards
+        h, w = image.shape[-2:]
+        if self.training:
+            if self._skip_resize:
+                return image, target
+            size = self.torch_choice(self.min_size)
+        else:
+            size = self.min_size[-1]
+        image, target = _resize_image_and_masks(image, size, self.max_size, target, self.fixed_size)
+
+        if target is None:
+            return image, target
+
+        bbox = target["boxes"]
+        bbox = resize_boxes(bbox, (h, w), image.shape[-2:])
+        target["boxes"] = bbox
+
+        if "keypoints" in target:
+            keypoints = target["keypoints"]
+            keypoints = resize_keypoints(keypoints, (h, w), image.shape[-2:])
+            target["keypoints"] = keypoints
+        return image, target
+
+    # _onnx_batch_images() is an implementation of
+    # batch_images() that is supported by ONNX tracing.
+    @torch.jit.unused
+    def _onnx_batch_images(self, images: list[Tensor], size_divisible: int = 32) -> Tensor:
+        """Pad every image to a common shape with tensor ops only and stack them."""
+        max_size = []
+        for i in range(images[0].dim()):
+            max_size_i = torch.max(torch.stack([img.shape[i] for img in images]).to(torch.float32)).to(torch.int64)
+            max_size.append(max_size_i)
+        stride = size_divisible
+        # Round dimensions 1 and 2 up to the next multiple of ``stride``
+        max_size[1] = (torch.ceil((max_size[1].to(torch.float32)) / stride) * stride).to(torch.int64)
+        max_size[2] = (torch.ceil((max_size[2].to(torch.float32)) / stride) * stride).to(torch.int64)
+        max_size = tuple(max_size)
+
+        # work around for
+        # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
+        # which is not yet supported in onnx
+        padded_imgs = []
+        for img in images:
+            padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
+            padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
+            padded_imgs.append(padded_img)
+
+        return torch.stack(padded_imgs)
+
+    def max_by_axis(self, the_list: list[list[int]]) -> list[int]:
+        """Return the element-wise maximum over the sub-lists.
+
+        The first sub-list is updated in place and returned.
+        """
+        maxes = the_list[0]
+        for sublist in the_list[1:]:
+            for index, item in enumerate(sublist):
+                maxes[index] = max(maxes[index], item)
+        return maxes
+
+    def batch_images(self, images: list[Tensor], size_divisible: int = 32) -> Tensor:
+        """Copy the images into one zero-filled tensor.
+
+        Dimensions 1 and 2 of the common shape are rounded up to a multiple of ``size_divisible``.
+        """
+        if torchvision._is_tracing():
+            # batch_images() does not export well to ONNX
+            # call _onnx_batch_images() instead
+            return self._onnx_batch_images(images, size_divisible)
+
+        max_size = self.max_by_axis([list(img.shape) for img in images])
+        stride = float(size_divisible)
+        max_size = list(max_size)
+        max_size[1] = int(math.ceil(float(max_size[1]) / stride) * stride)
+        max_size[2] = int(math.ceil(float(max_size[2]) / stride) * stride)
+
+        batch_shape = [len(images)] + max_size
+        batched_imgs = images[0].new_full(batch_shape, 0)
+        for i in range(batched_imgs.shape[0]):
+            img = images[i]
+            # Each image fills the leading corner of its slot; the rest stays zero
+            batched_imgs[i, : img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
+
+        return batched_imgs
+
+    def postprocess(
+        self,
+        result: list[dict[str, Tensor]],
+        image_shapes: list[tuple[int, int]],
+        original_image_sizes: list[tuple[int, int]],
+    ) -> list[dict[str, Tensor]]:
+        """Map predictions from the resized image sizes back to the original image sizes.
+
+        In training mode ``result`` is returned unchanged. Otherwise ``"boxes"`` and, when
+        present, ``"masks"`` and ``"keypoints"`` are replaced inside ``result``.
+        """
+        if self.training:
+            return result
+        for i, (pred, im_s, o_im_s) in enumerate(zip(result, image_shapes, original_image_sizes)):
+            boxes = pred["boxes"]
+            boxes = resize_boxes(boxes, im_s, o_im_s)
+            result[i]["boxes"] = boxes
+            if "masks" in pred:
+                masks = pred["masks"]
+                masks = paste_masks_in_image(masks, boxes, o_im_s)
+                result[i]["masks"] = masks
+            if "keypoints" in pred:
+                keypoints = pred["keypoints"]
+                keypoints = resize_keypoints(keypoints, im_s, o_im_s)
+                result[i]["keypoints"] = keypoints
+        return result
+
+    def __repr__(self) -> str:
+        """Return a multi-line summary of the normalization and resize settings."""
+        format_string = f"{self.__class__.__name__}("
+        _indent = "\n    "
+        format_string += f"{_indent}Normalize(mean={self.image_mean}, std={self.image_std})"
+        format_string += f"{_indent}Resize(min_size={self.min_size}, max_size={self.max_size}, mode='bilinear')"
+        format_string += "\n)"
+        return format_string
+
+
+def resize_keypoints(keypoints: Tensor, original_size: list[int], new_size: list[int]) -> Tensor:
+    """Rescale keypoint coordinates from ``original_size`` to ``new_size``.
+
+    Both sizes are two-element ``(height, width)`` sequences. Entry 0 of the last dimension of
+    ``keypoints`` is multiplied by the width ratio and entry 1 by the height ratio; the input
+    tensor itself is not modified.
+    """
+    # One ratio per dimension, computed as new / original on the keypoints' device
+    ratios = [
+        torch.tensor(s, dtype=torch.float32, device=keypoints.device)
+        / torch.tensor(s_orig, dtype=torch.float32, device=keypoints.device)
+        for s, s_orig in zip(new_size, original_size)
+    ]
+    ratio_h, ratio_w = ratios
+    resized_data = keypoints.clone()
+    if torch._C._get_tracing_state():
+        # While tracing, build the result with out-of-place ops instead of in-place updates
+        resized_data_0 = resized_data[:, :, 0] * ratio_w
+        resized_data_1 = resized_data[:, :, 1] * ratio_h
+        resized_data = torch.stack((resized_data_0, resized_data_1, resized_data[:, :, 2]), dim=2)
+    else:
+        resized_data[..., 0] *= ratio_w
+        resized_data[..., 1] *= ratio_h
+    return resized_data
+
+
+def resize_boxes(boxes: Tensor, original_size: list[int], new_size: list[int]) -> Tensor:
+    """Rescale ``(xmin, ymin, xmax, ymax)`` boxes from ``original_size`` to ``new_size``.
+
+    Both sizes are two-element ``(height, width)`` sequences. ``boxes`` is unbound along
+    dimension 1 into its four coordinates, and a new tensor is returned.
+    """
+    # One ratio per dimension, computed as new / original on the boxes' device
+    ratios = [
+        torch.tensor(s, dtype=torch.float32, device=boxes.device)
+        / torch.tensor(s_orig, dtype=torch.float32, device=boxes.device)
+        for s, s_orig in zip(new_size, original_size)
+    ]
+    ratio_height, ratio_width = ratios
+    xmin, ymin, xmax, ymax = boxes.unbind(1)
+
+    # x coordinates scale with the width ratio, y coordinates with the height ratio
+    xmin = xmin * ratio_width
+    xmax = xmax * ratio_width
+    ymin = ymin * ratio_height
+    ymax = ymax * ratio_height
+    return torch.stack((xmin, ymin, xmax, ymax), dim=1)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/efficientnet.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/efficientnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b755a3e20751631993eedade35ec549a5b917c4
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/efficientnet.py
@@ -0,0 +1,1132 @@
+import copy
+import math
+from collections.abc import Sequence
+from dataclasses import dataclass
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+from torch import nn, Tensor
+from torchvision.ops import StochasticDepth
+
+from ..ops.misc import Conv2dNormActivation, SqueezeExcitation
+from ..transforms._presets import ImageClassification, InterpolationMode
+from ..utils import _log_api_usage_once
+from ._api import register_model, Weights, WeightsEnum
+from ._meta import _IMAGENET_CATEGORIES
+from ._utils import _make_divisible, _ovewrite_named_param, handle_legacy_interface
+
+
+# Names exported by ``from ... import *``: the model class, the weight enums and the builder functions.
+__all__ = [
+    "EfficientNet",
+    "EfficientNet_B0_Weights",
+    "EfficientNet_B1_Weights",
+    "EfficientNet_B2_Weights",
+    "EfficientNet_B3_Weights",
+    "EfficientNet_B4_Weights",
+    "EfficientNet_B5_Weights",
+    "EfficientNet_B6_Weights",
+    "EfficientNet_B7_Weights",
+    "EfficientNet_V2_S_Weights",
+    "EfficientNet_V2_M_Weights",
+    "EfficientNet_V2_L_Weights",
+    "efficientnet_b0",
+    "efficientnet_b1",
+    "efficientnet_b2",
+    "efficientnet_b3",
+    "efficientnet_b4",
+    "efficientnet_b5",
+    "efficientnet_b6",
+    "efficientnet_b7",
+    "efficientnet_v2_s",
+    "efficientnet_v2_m",
+    "efficientnet_v2_l",
+]
+
+
+@dataclass
+class _MBConvConfig:
+    """Shared configuration record for the MBConv-style block classes in this module."""
+
+    # Multiplier applied to ``input_channels`` to obtain the expanded channel count
+    expand_ratio: float
+    # Kernel size and stride of the block's main convolution
+    kernel: int
+    stride: int
+    input_channels: int
+    out_channels: int
+    num_layers: int
+    # Callable that builds the block module for this configuration
+    block: Callable[..., nn.Module]
+
+    @staticmethod
+    def adjust_channels(channels: int, width_mult: float, min_value: Optional[int] = None) -> int:
+        """Scale ``channels`` by ``width_mult`` and pass the result to ``_make_divisible`` with 8 and ``min_value``."""
+        return _make_divisible(channels * width_mult, 8, min_value)
+
+
+class MBConvConfig(_MBConvConfig):
+    """Configuration of an MBConv stage.
+
+    Stores information listed at Table 1 of the EfficientNet paper & Table 4 of the EfficientNetV2 paper.
+    Channel counts are scaled with ``width_mult`` and the layer count with ``depth_mult`` before
+    being stored; ``block`` defaults to ``MBConv``.
+    """
+
+    def __init__(
+        self,
+        expand_ratio: float,
+        kernel: int,
+        stride: int,
+        input_channels: int,
+        out_channels: int,
+        num_layers: int,
+        width_mult: float = 1.0,
+        depth_mult: float = 1.0,
+        block: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        scaled_input = self.adjust_channels(input_channels, width_mult)
+        scaled_output = self.adjust_channels(out_channels, width_mult)
+        scaled_layers = self.adjust_depth(num_layers, depth_mult)
+        block_cls = MBConv if block is None else block
+        super().__init__(expand_ratio, kernel, stride, scaled_input, scaled_output, scaled_layers, block_cls)
+
+    @staticmethod
+    def adjust_depth(num_layers: int, depth_mult: float):
+        """Return ``num_layers * depth_mult`` rounded up to an integer."""
+        scaled = num_layers * depth_mult
+        return int(math.ceil(scaled))
+
+
+class FusedMBConvConfig(_MBConvConfig):
+    """Configuration of a FusedMBConv stage.
+
+    Stores information listed at Table 4 of the EfficientNetV2 paper. Values are stored as
+    given; ``block`` defaults to ``FusedMBConv``.
+    """
+
+    def __init__(
+        self,
+        expand_ratio: float,
+        kernel: int,
+        stride: int,
+        input_channels: int,
+        out_channels: int,
+        num_layers: int,
+        block: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        block_cls = FusedMBConv if block is None else block
+        super().__init__(expand_ratio, kernel, stride, input_channels, out_channels, num_layers, block_cls)
+
+
+class MBConv(nn.Module):
+    """Block made of an optional 1x1 expansion, a depthwise convolution, squeeze-and-excitation and a 1x1 projection.
+
+    A residual connection with stochastic depth is used when the stride is 1 and the input and
+    output channel counts are equal.
+
+    Args:
+        cnf: block configuration (channels, kernel, stride, expand ratio).
+        stochastic_depth_prob: probability passed to ``StochasticDepth`` (``"row"`` mode).
+        norm_layer: normalization layer factory used by every convolution stage.
+        se_layer: factory for the squeeze-and-excitation layer.
+
+    Raises:
+        ValueError: if ``cnf.stride`` is not 1 or 2.
+    """
+
+    def __init__(
+        self,
+        cnf: MBConvConfig,
+        stochastic_depth_prob: float,
+        norm_layer: Callable[..., nn.Module],
+        se_layer: Callable[..., nn.Module] = SqueezeExcitation,
+    ) -> None:
+        super().__init__()
+
+        if not (1 <= cnf.stride <= 2):
+            raise ValueError("illegal stride value")
+
+        self.use_res_connect = cnf.stride == 1 and cnf.input_channels == cnf.out_channels
+
+        layers: list[nn.Module] = []
+        activation_layer = nn.SiLU
+
+        # expand
+        # (skipped when the expanded channel count equals the input channel count)
+        expanded_channels = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio)
+        if expanded_channels != cnf.input_channels:
+            layers.append(
+                Conv2dNormActivation(
+                    cnf.input_channels,
+                    expanded_channels,
+                    kernel_size=1,
+                    norm_layer=norm_layer,
+                    activation_layer=activation_layer,
+                )
+            )
+
+        # depthwise
+        layers.append(
+            Conv2dNormActivation(
+                expanded_channels,
+                expanded_channels,
+                kernel_size=cnf.kernel,
+                stride=cnf.stride,
+                groups=expanded_channels,
+                norm_layer=norm_layer,
+                activation_layer=activation_layer,
+            )
+        )
+
+        # squeeze and excitation
+        # (the squeeze width is derived from the block's input channels, not the expanded ones)
+        squeeze_channels = max(1, cnf.input_channels // 4)
+        layers.append(se_layer(expanded_channels, squeeze_channels, activation=partial(nn.SiLU, inplace=True)))
+
+        # project
+        # (no activation after the projection)
+        layers.append(
+            Conv2dNormActivation(
+                expanded_channels, cnf.out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=None
+            )
+        )
+
+        self.block = nn.Sequential(*layers)
+        self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")
+        self.out_channels = cnf.out_channels
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply the block; add the input back (after stochastic depth) when the residual path is enabled."""
+        result = self.block(input)
+        if self.use_res_connect:
+            result = self.stochastic_depth(result)
+            result += input
+        return result
+
+
+class FusedMBConv(nn.Module):
+    def __init__(
+        self,
+        cnf: FusedMBConvConfig,
+        stochastic_depth_prob: float,
+        norm_layer: Callable[..., nn.Module],
+    ) -> None:
+        super().__init__()
+
+        if not (1 <= cnf.stride <= 2):
+            raise ValueError("illegal stride value")
+
+        self.use_res_connect = cnf.stride == 1 and cnf.input_channels == cnf.out_channels
+
+        layers: list[nn.Module] = []
+        activation_layer = nn.SiLU
+
+        expanded_channels = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio)
+        if expanded_channels != cnf.input_channels:
+            # fused expand
+            layers.append(
+                Conv2dNormActivation(
+                    cnf.input_channels,
+                    expanded_channels,
+                    kernel_size=cnf.kernel,
+                    stride=cnf.stride,
+                    norm_layer=norm_layer,
+                    activation_layer=activation_layer,
+                )
+            )
+
+            # project
+            layers.append(
+                Conv2dNormActivation(
+                    expanded_channels, cnf.out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=None
+                )
+            )
+        else:
+            layers.append(
+                Conv2dNormActivation(
+                    cnf.input_channels,
+                    cnf.out_channels,
+                    kernel_size=cnf.kernel,
+                    stride=cnf.stride,
+                    norm_layer=norm_layer,
+                    activation_layer=activation_layer,
+                )
+            )
+
+        self.block = nn.Sequential(*layers)
+        self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")
+        self.out_channels = cnf.out_channels
+
+    def forward(self, input: Tensor) -> Tensor:
+        result = self.block(input)
+        if self.use_res_connect:
+            result = self.stochastic_depth(result)
+            result += input
+        return result
+
+
+class EfficientNet(nn.Module):
+    def __init__(
+        self,
+        inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]],
+        dropout: float,
+        stochastic_depth_prob: float = 0.2,
+        num_classes: int = 1000,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+        last_channel: Optional[int] = None,
+    ) -> None:
+        """
+        EfficientNet V1 and V2 main class
+
+        Args:
+            inverted_residual_setting (Sequence[Union[MBConvConfig, FusedMBConvConfig]]): Network structure
+            dropout (float): The droupout probability
+            stochastic_depth_prob (float): The stochastic depth probability
+            num_classes (int): Number of classes
+            norm_layer (Optional[Callable[..., nn.Module]]): Module specifying the normalization layer to use
+            last_channel (int): The number of channels on the penultimate layer
+        """
+        super().__init__()
+        _log_api_usage_once(self)
+
+        if not inverted_residual_setting:
+            raise ValueError("The inverted_residual_setting should not be empty")
+        elif not (
+            isinstance(inverted_residual_setting, Sequence)
+            and all([isinstance(s, _MBConvConfig) for s in inverted_residual_setting])
+        ):
+            raise TypeError("The inverted_residual_setting should be List[MBConvConfig]")
+
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+
+        layers: list[nn.Module] = []
+
+        # building first layer
+        firstconv_output_channels = inverted_residual_setting[0].input_channels
+        layers.append(
+            Conv2dNormActivation(
+                3, firstconv_output_channels, kernel_size=3, stride=2, norm_layer=norm_layer, activation_layer=nn.SiLU
+            )
+        )
+
+        # building inverted residual blocks
+        total_stage_blocks = sum(cnf.num_layers for cnf in inverted_residual_setting)
+        stage_block_id = 0
+        for cnf in inverted_residual_setting:
+            stage: list[nn.Module] = []
+            for _ in range(cnf.num_layers):
+                # copy to avoid modifications. shallow copy is enough
+                block_cnf = copy.copy(cnf)
+
+                # overwrite info if not the first conv in the stage
+                if stage:
+                    block_cnf.input_channels = block_cnf.out_channels
+                    block_cnf.stride = 1
+
+                # adjust stochastic depth probability based on the depth of the stage block
+                sd_prob = stochastic_depth_prob * float(stage_block_id) / total_stage_blocks
+
+                stage.append(block_cnf.block(block_cnf, sd_prob, norm_layer))
+                stage_block_id += 1
+
+            layers.append(nn.Sequential(*stage))
+
+        # building last several layers
+        lastconv_input_channels = inverted_residual_setting[-1].out_channels
+        lastconv_output_channels = last_channel if last_channel is not None else 4 * lastconv_input_channels
+        layers.append(
+            Conv2dNormActivation(
+                lastconv_input_channels,
+                lastconv_output_channels,
+                kernel_size=1,
+                norm_layer=norm_layer,
+                activation_layer=nn.SiLU,
+            )
+        )
+
+        self.features = nn.Sequential(*layers)
+        self.avgpool = nn.AdaptiveAvgPool2d(1)
+        self.classifier = nn.Sequential(
+            nn.Dropout(p=dropout, inplace=True),
+            nn.Linear(lastconv_output_channels, num_classes),
+        )
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode="fan_out")
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.Linear):
+                init_range = 1.0 / math.sqrt(m.out_features)
+                nn.init.uniform_(m.weight, -init_range, init_range)
+                nn.init.zeros_(m.bias)
+
+    def _forward_impl(self, x: Tensor) -> Tensor:
+        x = self.features(x)
+
+        x = self.avgpool(x)
+        x = torch.flatten(x, 1)
+
+        x = self.classifier(x)
+
+        return x
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self._forward_impl(x)
+
+
+def _efficientnet(
+    inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]],
+    dropout: float,
+    last_channel: Optional[int],
+    weights: Optional[WeightsEnum],
+    progress: bool,
+    **kwargs: Any,
+) -> EfficientNet:
+    if weights is not None:
+        _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+
+    model = EfficientNet(inverted_residual_setting, dropout, last_channel=last_channel, **kwargs)
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+
+    return model
+
+
+def _efficientnet_conf(
+    arch: str,
+    **kwargs: Any,
+) -> tuple[Sequence[Union[MBConvConfig, FusedMBConvConfig]], Optional[int]]:
+    inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]]
+    if arch.startswith("efficientnet_b"):
+        bneck_conf = partial(MBConvConfig, width_mult=kwargs.pop("width_mult"), depth_mult=kwargs.pop("depth_mult"))
+        inverted_residual_setting = [
+            bneck_conf(1, 3, 1, 32, 16, 1),
+            bneck_conf(6, 3, 2, 16, 24, 2),
+            bneck_conf(6, 5, 2, 24, 40, 2),
+            bneck_conf(6, 3, 2, 40, 80, 3),
+            bneck_conf(6, 5, 1, 80, 112, 3),
+            bneck_conf(6, 5, 2, 112, 192, 4),
+            bneck_conf(6, 3, 1, 192, 320, 1),
+        ]
+        last_channel = None
+    elif arch.startswith("efficientnet_v2_s"):
+        inverted_residual_setting = [
+            FusedMBConvConfig(1, 3, 1, 24, 24, 2),
+            FusedMBConvConfig(4, 3, 2, 24, 48, 4),
+            FusedMBConvConfig(4, 3, 2, 48, 64, 4),
+            MBConvConfig(4, 3, 2, 64, 128, 6),
+            MBConvConfig(6, 3, 1, 128, 160, 9),
+            MBConvConfig(6, 3, 2, 160, 256, 15),
+        ]
+        last_channel = 1280
+    elif arch.startswith("efficientnet_v2_m"):
+        inverted_residual_setting = [
+            FusedMBConvConfig(1, 3, 1, 24, 24, 3),
+            FusedMBConvConfig(4, 3, 2, 24, 48, 5),
+            FusedMBConvConfig(4, 3, 2, 48, 80, 5),
+            MBConvConfig(4, 3, 2, 80, 160, 7),
+            MBConvConfig(6, 3, 1, 160, 176, 14),
+            MBConvConfig(6, 3, 2, 176, 304, 18),
+            MBConvConfig(6, 3, 1, 304, 512, 5),
+        ]
+        last_channel = 1280
+    elif arch.startswith("efficientnet_v2_l"):
+        inverted_residual_setting = [
+            FusedMBConvConfig(1, 3, 1, 32, 32, 4),
+            FusedMBConvConfig(4, 3, 2, 32, 64, 7),
+            FusedMBConvConfig(4, 3, 2, 64, 96, 7),
+            MBConvConfig(4, 3, 2, 96, 192, 10),
+            MBConvConfig(6, 3, 1, 192, 224, 19),
+            MBConvConfig(6, 3, 2, 224, 384, 25),
+            MBConvConfig(6, 3, 1, 384, 640, 7),
+        ]
+        last_channel = 1280
+    else:
+        raise ValueError(f"Unsupported model type {arch}")
+
+    return inverted_residual_setting, last_channel
+
+
+_COMMON_META: dict[str, Any] = {
+    "categories": _IMAGENET_CATEGORIES,
+}
+
+
+_COMMON_META_V1 = {
+    **_COMMON_META,
+    "min_size": (1, 1),
+    "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#efficientnet-v1",
+}
+
+
+_COMMON_META_V2 = {
+    **_COMMON_META,
+    "min_size": (33, 33),
+    "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#efficientnet-v2",
+}
+
+
+class EfficientNet_B0_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        # Weights ported from https://github.com/rwightman/pytorch-image-models/
+        url="https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth",
+        transforms=partial(
+            ImageClassification, crop_size=224, resize_size=256, interpolation=InterpolationMode.BICUBIC
+        ),
+        meta={
+            **_COMMON_META_V1,
+            "num_params": 5288548,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 77.692,
+                    "acc@5": 93.532,
+                }
+            },
+            "_ops": 0.386,
+            "_file_size": 20.451,
+            "_docs": """These weights are ported from the original paper.""",
+        },
+    )
+    DEFAULT = IMAGENET1K_V1
+
+
+class EfficientNet_B1_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        # Weights ported from https://github.com/rwightman/pytorch-image-models/
+        url="https://download.pytorch.org/models/efficientnet_b1_rwightman-bac287d4.pth",
+        transforms=partial(
+            ImageClassification, crop_size=240, resize_size=256, interpolation=InterpolationMode.BICUBIC
+        ),
+        meta={
+            **_COMMON_META_V1,
+            "num_params": 7794184,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 78.642,
+                    "acc@5": 94.186,
+                }
+            },
+            "_ops": 0.687,
+            "_file_size": 30.134,
+            "_docs": """These weights are ported from the original paper.""",
+        },
+    )
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/efficientnet_b1-c27df63c.pth",
+        transforms=partial(
+            ImageClassification, crop_size=240, resize_size=255, interpolation=InterpolationMode.BILINEAR
+        ),
+        meta={
+            **_COMMON_META_V1,
+            "num_params": 7794184,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe-with-lr-wd-crop-tuning",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 79.838,
+                    "acc@5": 94.934,
+                }
+            },
+            "_ops": 0.687,
+            "_file_size": 30.136,
+            "_docs": """
+                These weights improve upon the results of the original paper by using a modified version of TorchVision's
+                `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V2
+
+
+class EfficientNet_B2_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        # Weights ported from https://github.com/rwightman/pytorch-image-models/
+        url="https://download.pytorch.org/models/efficientnet_b2_rwightman-c35c1473.pth",
+        transforms=partial(
+            ImageClassification, crop_size=288, resize_size=288, interpolation=InterpolationMode.BICUBIC
+        ),
+        meta={
+            **_COMMON_META_V1,
+            "num_params": 9109994,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 80.608,
+                    "acc@5": 95.310,
+                }
+            },
+            "_ops": 1.088,
+            "_file_size": 35.174,
+            "_docs": """These weights are ported from the original paper.""",
+        },
+    )
+    DEFAULT = IMAGENET1K_V1
+
+
+class EfficientNet_B3_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        # Weights ported from https://github.com/rwightman/pytorch-image-models/
+        url="https://download.pytorch.org/models/efficientnet_b3_rwightman-b3899882.pth",
+        transforms=partial(
+            ImageClassification, crop_size=300, resize_size=320, interpolation=InterpolationMode.BICUBIC
+        ),
+        meta={
+            **_COMMON_META_V1,
+            "num_params": 12233232,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 82.008,
+                    "acc@5": 96.054,
+                }
+            },
+            "_ops": 1.827,
+            "_file_size": 47.184,
+            "_docs": """These weights are ported from the original paper.""",
+        },
+    )
+    DEFAULT = IMAGENET1K_V1
+
+
+class EfficientNet_B4_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        # Weights ported from https://github.com/rwightman/pytorch-image-models/
+        url="https://download.pytorch.org/models/efficientnet_b4_rwightman-23ab8bcd.pth",
+        transforms=partial(
+            ImageClassification, crop_size=380, resize_size=384, interpolation=InterpolationMode.BICUBIC
+        ),
+        meta={
+            **_COMMON_META_V1,
+            "num_params": 19341616,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 83.384,
+                    "acc@5": 96.594,
+                }
+            },
+            "_ops": 4.394,
+            "_file_size": 74.489,
+            "_docs": """These weights are ported from the original paper.""",
+        },
+    )
+    DEFAULT = IMAGENET1K_V1
+
+
+class EfficientNet_B5_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        # Weights ported from https://github.com/lukemelas/EfficientNet-PyTorch/
+        url="https://download.pytorch.org/models/efficientnet_b5_lukemelas-1a07897c.pth",
+        transforms=partial(
+            ImageClassification, crop_size=456, resize_size=456, interpolation=InterpolationMode.BICUBIC
+        ),
+        meta={
+            **_COMMON_META_V1,
+            "num_params": 30389784,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 83.444,
+                    "acc@5": 96.628,
+                }
+            },
+            "_ops": 10.266,
+            "_file_size": 116.864,
+            "_docs": """These weights are ported from the original paper.""",
+        },
+    )
+    DEFAULT = IMAGENET1K_V1
+
+
+class EfficientNet_B6_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        # Weights ported from https://github.com/lukemelas/EfficientNet-PyTorch/
+        url="https://download.pytorch.org/models/efficientnet_b6_lukemelas-24a108a5.pth",
+        transforms=partial(
+            ImageClassification, crop_size=528, resize_size=528, interpolation=InterpolationMode.BICUBIC
+        ),
+        meta={
+            **_COMMON_META_V1,
+            "num_params": 43040704,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 84.008,
+                    "acc@5": 96.916,
+                }
+            },
+            "_ops": 19.068,
+            "_file_size": 165.362,
+            "_docs": """These weights are ported from the original paper.""",
+        },
+    )
+    DEFAULT = IMAGENET1K_V1
+
+
+class EfficientNet_B7_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        # Weights ported from https://github.com/lukemelas/EfficientNet-PyTorch/
+        url="https://download.pytorch.org/models/efficientnet_b7_lukemelas-c5b4e57e.pth",
+        transforms=partial(
+            ImageClassification, crop_size=600, resize_size=600, interpolation=InterpolationMode.BICUBIC
+        ),
+        meta={
+            **_COMMON_META_V1,
+            "num_params": 66347960,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 84.122,
+                    "acc@5": 96.908,
+                }
+            },
+            "_ops": 37.746,
+            "_file_size": 254.675,
+            "_docs": """These weights are ported from the original paper.""",
+        },
+    )
+    DEFAULT = IMAGENET1K_V1
+
+
+class EfficientNet_V2_S_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/efficientnet_v2_s-dd5fe13b.pth",
+        transforms=partial(
+            ImageClassification,
+            crop_size=384,
+            resize_size=384,
+            interpolation=InterpolationMode.BILINEAR,
+        ),
+        meta={
+            **_COMMON_META_V2,
+            "num_params": 21458488,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 84.228,
+                    "acc@5": 96.878,
+                }
+            },
+            "_ops": 8.366,
+            "_file_size": 82.704,
+            "_docs": """
+                These weights improve upon the results of the original paper by using a modified version of TorchVision's
+                `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V1
+
+
+class EfficientNet_V2_M_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/efficientnet_v2_m-dc08266a.pth",
+        transforms=partial(
+            ImageClassification,
+            crop_size=480,
+            resize_size=480,
+            interpolation=InterpolationMode.BILINEAR,
+        ),
+        meta={
+            **_COMMON_META_V2,
+            "num_params": 54139356,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 85.112,
+                    "acc@5": 97.156,
+                }
+            },
+            "_ops": 24.582,
+            "_file_size": 208.01,
+            "_docs": """
+                These weights improve upon the results of the original paper by using a modified version of TorchVision's
+                `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V1
+
+
+class EfficientNet_V2_L_Weights(WeightsEnum):
+    # Weights ported from https://github.com/google/automl/tree/master/efficientnetv2
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/efficientnet_v2_l-59c71312.pth",
+        transforms=partial(
+            ImageClassification,
+            crop_size=480,
+            resize_size=480,
+            interpolation=InterpolationMode.BICUBIC,
+            mean=(0.5, 0.5, 0.5),
+            std=(0.5, 0.5, 0.5),
+        ),
+        meta={
+            **_COMMON_META_V2,
+            "num_params": 118515272,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 85.808,
+                    "acc@5": 97.788,
+                }
+            },
+            "_ops": 56.08,
+            "_file_size": 454.573,
+            "_docs": """These weights are ported from the original paper.""",
+        },
+    )
+    DEFAULT = IMAGENET1K_V1
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", EfficientNet_B0_Weights.IMAGENET1K_V1))
+def efficientnet_b0(
+    *, weights: Optional[EfficientNet_B0_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+    """EfficientNet B0 model architecture from the `EfficientNet: Rethinking Model Scaling for Convolutional
+    Neural Networks <https://arxiv.org/abs/1905.11946>`_ paper.
+
+    Args:
+        weights (:class:`~torchvision.models.EfficientNet_B0_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.EfficientNet_B0_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.efficientnet.EfficientNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/efficientnet.py>`_
+            for more details about this class.
+    .. autoclass:: torchvision.models.EfficientNet_B0_Weights
+        :members:
+    """
+    weights = EfficientNet_B0_Weights.verify(weights)
+
+    inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b0", width_mult=1.0, depth_mult=1.0)
+    return _efficientnet(
+        inverted_residual_setting, kwargs.pop("dropout", 0.2), last_channel, weights, progress, **kwargs
+    )
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", EfficientNet_B1_Weights.IMAGENET1K_V1))
+def efficientnet_b1(
+    *, weights: Optional[EfficientNet_B1_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+    """EfficientNet B1 model architecture from the `EfficientNet: Rethinking Model Scaling for Convolutional
+    Neural Networks <https://arxiv.org/abs/1905.11946>`_ paper.
+
+    Args:
+        weights (:class:`~torchvision.models.EfficientNet_B1_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.EfficientNet_B1_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.efficientnet.EfficientNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/efficientnet.py>`_
+            for more details about this class.
+    .. autoclass:: torchvision.models.EfficientNet_B1_Weights
+        :members:
+    """
+    weights = EfficientNet_B1_Weights.verify(weights)
+
+    inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b1", width_mult=1.0, depth_mult=1.1)
+    return _efficientnet(
+        inverted_residual_setting, kwargs.pop("dropout", 0.2), last_channel, weights, progress, **kwargs
+    )
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", EfficientNet_B2_Weights.IMAGENET1K_V1))
+def efficientnet_b2(
+    *, weights: Optional[EfficientNet_B2_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+    """EfficientNet B2 model architecture from the `EfficientNet: Rethinking Model Scaling for Convolutional
+    Neural Networks <https://arxiv.org/abs/1905.11946>`_ paper.
+
+    Args:
+        weights (:class:`~torchvision.models.EfficientNet_B2_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.EfficientNet_B2_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.efficientnet.EfficientNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/efficientnet.py>`_
+            for more details about this class.
+    .. autoclass:: torchvision.models.EfficientNet_B2_Weights
+        :members:
+    """
+    weights = EfficientNet_B2_Weights.verify(weights)
+
+    inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b2", width_mult=1.1, depth_mult=1.2)
+    return _efficientnet(
+        inverted_residual_setting, kwargs.pop("dropout", 0.3), last_channel, weights, progress, **kwargs
+    )
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", EfficientNet_B3_Weights.IMAGENET1K_V1))
+def efficientnet_b3(
+    *, weights: Optional[EfficientNet_B3_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+    """EfficientNet B3 model architecture from the `EfficientNet: Rethinking Model Scaling for Convolutional
+    Neural Networks <https://arxiv.org/abs/1905.11946>`_ paper.
+
+    Args:
+        weights (:class:`~torchvision.models.EfficientNet_B3_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.EfficientNet_B3_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.efficientnet.EfficientNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/efficientnet.py>`_
+            for more details about this class.
+    .. autoclass:: torchvision.models.EfficientNet_B3_Weights
+        :members:
+    """
+    weights = EfficientNet_B3_Weights.verify(weights)
+
+    inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b3", width_mult=1.2, depth_mult=1.4)
+    return _efficientnet(
+        inverted_residual_setting,
+        kwargs.pop("dropout", 0.3),
+        last_channel,
+        weights,
+        progress,
+        **kwargs,
+    )
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", EfficientNet_B4_Weights.IMAGENET1K_V1))
+def efficientnet_b4(
+    *, weights: Optional[EfficientNet_B4_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+    """EfficientNet B4 model architecture from the `EfficientNet: Rethinking Model Scaling for Convolutional
+    Neural Networks <https://arxiv.org/abs/1905.11946>`_ paper.
+
+    Args:
+        weights (:class:`~torchvision.models.EfficientNet_B4_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.EfficientNet_B4_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.efficientnet.EfficientNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/efficientnet.py>`_
+            for more details about this class.
+    .. autoclass:: torchvision.models.EfficientNet_B4_Weights
+        :members:
+    """
+    weights = EfficientNet_B4_Weights.verify(weights)
+
+    inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b4", width_mult=1.4, depth_mult=1.8)
+    return _efficientnet(
+        inverted_residual_setting,
+        kwargs.pop("dropout", 0.4),
+        last_channel,
+        weights,
+        progress,
+        **kwargs,
+    )
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", EfficientNet_B5_Weights.IMAGENET1K_V1))
+def efficientnet_b5(
+    *, weights: Optional[EfficientNet_B5_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+    """EfficientNet B5 model architecture from the `EfficientNet: Rethinking Model Scaling for Convolutional
+    Neural Networks <https://arxiv.org/abs/1905.11946>`_ paper.
+
+    Args:
+        weights (:class:`~torchvision.models.EfficientNet_B5_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.EfficientNet_B5_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.efficientnet.EfficientNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/efficientnet.py>`_
+            for more details about this class.
+    .. autoclass:: torchvision.models.EfficientNet_B5_Weights
+        :members:
+    """
+    weights = EfficientNet_B5_Weights.verify(weights)
+
+    inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b5", width_mult=1.6, depth_mult=2.2)
+    return _efficientnet(
+        inverted_residual_setting,
+        kwargs.pop("dropout", 0.4),
+        last_channel,
+        weights,
+        progress,
+        norm_layer=partial(nn.BatchNorm2d, eps=0.001, momentum=0.01),
+        **kwargs,
+    )
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", EfficientNet_B6_Weights.IMAGENET1K_V1))
+def efficientnet_b6(
+    *, weights: Optional[EfficientNet_B6_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+    """EfficientNet B6 model architecture from the `EfficientNet: Rethinking Model Scaling for Convolutional
+    Neural Networks <https://arxiv.org/abs/1905.11946>`_ paper.
+
+    Args:
+        weights (:class:`~torchvision.models.EfficientNet_B6_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.EfficientNet_B6_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.efficientnet.EfficientNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/efficientnet.py>`_
+            for more details about this class.
+    .. autoclass:: torchvision.models.EfficientNet_B6_Weights
+        :members:
+    """
+    weights = EfficientNet_B6_Weights.verify(weights)
+
+    inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b6", width_mult=1.8, depth_mult=2.6)
+    return _efficientnet(
+        inverted_residual_setting,
+        kwargs.pop("dropout", 0.5),
+        last_channel,
+        weights,
+        progress,
+        norm_layer=partial(nn.BatchNorm2d, eps=0.001, momentum=0.01),
+        **kwargs,
+    )
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", EfficientNet_B7_Weights.IMAGENET1K_V1))
+def efficientnet_b7(
+    *, weights: Optional[EfficientNet_B7_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+    """EfficientNet B7 model architecture from the `EfficientNet: Rethinking Model Scaling for Convolutional
+    Neural Networks <https://arxiv.org/abs/1905.11946>`_ paper.
+
+    Args:
+        weights (:class:`~torchvision.models.EfficientNet_B7_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.EfficientNet_B7_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.efficientnet.EfficientNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/efficientnet.py>`_
+            for more details about this class.
+    .. autoclass:: torchvision.models.EfficientNet_B7_Weights
+        :members:
+    """
+    weights = EfficientNet_B7_Weights.verify(weights)
+
+    inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b7", width_mult=2.0, depth_mult=3.1)
+    return _efficientnet(
+        inverted_residual_setting,
+        kwargs.pop("dropout", 0.5),
+        last_channel,
+        weights,
+        progress,
+        norm_layer=partial(nn.BatchNorm2d, eps=0.001, momentum=0.01),
+        **kwargs,
+    )
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", EfficientNet_V2_S_Weights.IMAGENET1K_V1))
+def efficientnet_v2_s(
+    *, weights: Optional[EfficientNet_V2_S_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+    """
+    Constructs an EfficientNetV2-S architecture from
+    `EfficientNetV2: Smaller Models and Faster Training <https://arxiv.org/abs/2104.00298>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.EfficientNet_V2_S_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.EfficientNet_V2_S_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.efficientnet.EfficientNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/efficientnet.py>`_
+            for more details about this class.
+    .. autoclass:: torchvision.models.EfficientNet_V2_S_Weights
+        :members:
+    """
+    weights = EfficientNet_V2_S_Weights.verify(weights)
+
+    inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_v2_s")
+    return _efficientnet(
+        inverted_residual_setting,
+        kwargs.pop("dropout", 0.2),
+        last_channel,
+        weights,
+        progress,
+        norm_layer=partial(nn.BatchNorm2d, eps=1e-03),
+        **kwargs,
+    )
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", EfficientNet_V2_M_Weights.IMAGENET1K_V1))
+def efficientnet_v2_m(
+    *, weights: Optional[EfficientNet_V2_M_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+    """
+    Constructs an EfficientNetV2-M architecture from
+    `EfficientNetV2: Smaller Models and Faster Training <https://arxiv.org/abs/2104.00298>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.EfficientNet_V2_M_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.EfficientNet_V2_M_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.efficientnet.EfficientNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/efficientnet.py>`_
+            for more details about this class.
+    .. autoclass:: torchvision.models.EfficientNet_V2_M_Weights
+        :members:
+    """
+    weights = EfficientNet_V2_M_Weights.verify(weights)
+
+    inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_v2_m")
+    return _efficientnet(
+        inverted_residual_setting,
+        kwargs.pop("dropout", 0.3),
+        last_channel,
+        weights,
+        progress,
+        norm_layer=partial(nn.BatchNorm2d, eps=1e-03),
+        **kwargs,
+    )
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", EfficientNet_V2_L_Weights.IMAGENET1K_V1))
+def efficientnet_v2_l(
+    *, weights: Optional[EfficientNet_V2_L_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+    """
+    Constructs an EfficientNetV2-L architecture from
+    `EfficientNetV2: Smaller Models and Faster Training <https://arxiv.org/abs/2104.00298>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.EfficientNet_V2_L_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.EfficientNet_V2_L_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.efficientnet.EfficientNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/efficientnet.py>`_
+            for more details about this class.
+    .. autoclass:: torchvision.models.EfficientNet_V2_L_Weights
+        :members:
+    """
+    weights = EfficientNet_V2_L_Weights.verify(weights)
+
+    inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_v2_l")
+    return _efficientnet(
+        inverted_residual_setting,
+        kwargs.pop("dropout", 0.4),
+        last_channel,
+        weights,
+        progress,
+        norm_layer=partial(nn.BatchNorm2d, eps=1e-03),
+        **kwargs,
+    )
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/feature_extraction.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/feature_extraction.py
new file mode 100644
index 0000000000000000000000000000000000000000..320b1936d6f8897d6f324b6c4938dbe289fd466e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/feature_extraction.py
@@ -0,0 +1,607 @@
+import copy
+import inspect
+import math
+import re
+import warnings
+from collections import OrderedDict
+from copy import deepcopy
+from itertools import chain
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torchvision
+from torch import fx, nn
+from torch.fx.graph_module import _CodeOnlyModule, _copy_attr, _USER_PRESERVED_ATTRIBUTES_KEY
+
+
+__all__ = ["create_feature_extractor", "get_graph_node_names"]
+
+
+class LeafModuleAwareTracer(fx.Tracer):
+    """
+    An fx.Tracer that allows the user to specify a set of leaf modules, i.e.
+    modules that are not to be traced through. The resulting graph ends up
+    having single nodes referencing calls to the leaf modules' forward methods.
+    """
+
+    def __init__(self, *args, **kwargs):
+        self.leaf_modules = {}
+        if "leaf_modules" in kwargs:
+            leaf_modules = kwargs.pop("leaf_modules")
+            self.leaf_modules = leaf_modules
+        super().__init__(*args, **kwargs)
+
+    def is_leaf_module(self, m: nn.Module, module_qualname: str) -> bool:
+        if isinstance(m, tuple(self.leaf_modules)):
+            return True
+        return super().is_leaf_module(m, module_qualname)
+
+
+class NodePathTracer(LeafModuleAwareTracer):
+    """
+    NodePathTracer is an FX tracer that, for each operation, also records the
+    name of the Node from which the operation originated. A node name here is
+    a `.` separated path walking the hierarchy from top level module down to
+    leaf operation or leaf module. The name of the top level module is not
+    included as part of the node name. For example, if we trace a module whose
+    forward method applies a ReLU module, the name for that node will simply
+    be 'relu'.
+
+    Some notes on the specifics:
+        - Nodes are recorded to `self.node_to_qualname` which is a dictionary
+          mapping a given Node object to its node name.
+        - Nodes are recorded in the order which they are executed during
+          tracing.
+        - When a duplicate node name is encountered, a suffix of the form
+          _{int} is added. The counter starts from 1.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Track the qualified name of the Node being traced
+        self.current_module_qualname = ""
+        # A map from FX Node to the qualified name\#
+        # NOTE: This is loosely like the "qualified name" mentioned in the
+        # torch.fx docs https://pytorch.org/docs/stable/fx.html but adapted
+        # for the purposes of the torchvision feature extractor
+        self.node_to_qualname = OrderedDict()
+
+    def call_module(self, m: torch.nn.Module, forward: Callable, args, kwargs):
+        """
+        Override of `fx.Tracer.call_module`
+        This override:
+        1) Stores away the qualified name of the caller for restoration later
+        2) Adds the qualified name of the caller to
+           `current_module_qualname` for retrieval by `create_proxy`
+        3) Once a leaf module is reached, calls `create_proxy`
+        4) Restores the caller's qualified name into current_module_qualname
+        """
+        old_qualname = self.current_module_qualname
+        try:
+            module_qualname = self.path_of_module(m)
+            self.current_module_qualname = module_qualname
+            if not self.is_leaf_module(m, module_qualname):
+                out = forward(*args, **kwargs)
+                return out
+            return self.create_proxy("call_module", module_qualname, args, kwargs)
+        finally:
+            self.current_module_qualname = old_qualname
+
+    def create_proxy(
+        self, kind: str, target: fx.node.Target, args, kwargs, name=None, type_expr=None, *_
+    ) -> fx.proxy.Proxy:
+        """
+        Override of `Tracer.create_proxy`. This override intercepts the recording
+        of every operation and stores away the current traced module's qualified
+        name in `node_to_qualname`
+        """
+        proxy = super().create_proxy(kind, target, args, kwargs, name, type_expr)
+        self.node_to_qualname[proxy.node] = self._get_node_qualname(self.current_module_qualname, proxy.node)
+        return proxy
+
+    def _get_node_qualname(self, module_qualname: str, node: fx.node.Node) -> str:
+        node_qualname = module_qualname
+
+        if node.op != "call_module":
+            # In this case module_qualname from torch.fx doesn't go all the
+            # way to the leaf function/op, so we need to append it
+            if len(node_qualname) > 0:
+                # Only append '.' if we are deeper than the top level module
+                node_qualname += "."
+            node_qualname += str(node)
+
+        # Now we need to add an _{index} postfix on any repeated node names
+        # For modules we do this from scratch
+        # But for anything else, torch.fx already has a globally scoped
+        # _{index} postfix. But we want it locally (relative to direct parent)
+        # scoped. So first we need to undo the torch.fx postfix
+        if re.match(r".+_[0-9]+$", node_qualname) is not None:
+            node_qualname = node_qualname.rsplit("_", 1)[0]
+
+        # ... and now we add on our own postfix
+        for existing_qualname in reversed(self.node_to_qualname.values()):
+            # Check to see if existing_qualname is of the form
+            # {node_qualname} or {node_qualname}_{int}
+            if re.match(rf"{node_qualname}(_[0-9]+)?$", existing_qualname) is not None:
+                postfix = existing_qualname.replace(node_qualname, "")
+                if len(postfix):
+                    # existing_qualname is of the form {node_qualname}_{int}
+                    next_index = int(postfix[1:]) + 1
+                else:
+                    # existing_qualname is of the form {node_qualname}
+                    next_index = 1
+                node_qualname += f"_{next_index}"
+                break
+
+        return node_qualname
+
+
+def _is_subseq(x, y):
+    """Check if y is a subsequence of x
+    https://stackoverflow.com/a/24017747/4391249
+    """
+    iter_x = iter(x)
+    return all(any(x_item == y_item for x_item in iter_x) for y_item in y)
+
+
+def _warn_graph_differences(train_tracer: NodePathTracer, eval_tracer: NodePathTracer):
+    """
+    Utility function for warning the user if there are differences between
+    the train graph nodes and the eval graph nodes.
+    """
+    train_nodes = list(train_tracer.node_to_qualname.values())
+    eval_nodes = list(eval_tracer.node_to_qualname.values())
+
+    if len(train_nodes) == len(eval_nodes) and all(t == e for t, e in zip(train_nodes, eval_nodes)):
+        return
+
+    suggestion_msg = (
+        "When choosing nodes for feature extraction, you may need to specify "
+        "output nodes for train and eval mode separately."
+    )
+
+    if _is_subseq(train_nodes, eval_nodes):
+        msg = (
+            "NOTE: The nodes obtained by tracing the model in eval mode "
+            "are a subsequence of those obtained in train mode. "
+        )
+    elif _is_subseq(eval_nodes, train_nodes):
+        msg = (
+            "NOTE: The nodes obtained by tracing the model in train mode "
+            "are a subsequence of those obtained in eval mode. "
+        )
+    else:
+        msg = "The nodes obtained by tracing the model in train mode are different to those obtained in eval mode. "
+    warnings.warn(msg + suggestion_msg)
+
+
+def _get_leaf_modules_for_ops() -> list[type]:
+    members = inspect.getmembers(torchvision.ops)
+    result = []
+    for _, obj in members:
+        if inspect.isclass(obj) and issubclass(obj, torch.nn.Module):
+            result.append(obj)
+    return result
+
+
+def _set_default_tracer_kwargs(original_tr_kwargs: Optional[dict[str, Any]]) -> dict[str, Any]:
+    default_autowrap_modules = (math, torchvision.ops)
+    default_leaf_modules = _get_leaf_modules_for_ops()
+    result_tracer_kwargs = {} if original_tr_kwargs is None else original_tr_kwargs
+    result_tracer_kwargs["autowrap_modules"] = (
+        tuple(set(result_tracer_kwargs["autowrap_modules"] + default_autowrap_modules))
+        if "autowrap_modules" in result_tracer_kwargs
+        else default_autowrap_modules
+    )
+    result_tracer_kwargs["leaf_modules"] = (
+        list(set(result_tracer_kwargs["leaf_modules"] + default_leaf_modules))
+        if "leaf_modules" in result_tracer_kwargs
+        else default_leaf_modules
+    )
+    return result_tracer_kwargs
+
+
+def get_graph_node_names(
+    model: nn.Module,
+    tracer_kwargs: Optional[dict[str, Any]] = None,
+    suppress_diff_warning: bool = False,
+    concrete_args: Optional[dict[str, Any]] = None,
+) -> tuple[list[str], list[str]]:
+    """
+    Dev utility to return node names in order of execution. See note on node
+    names under :func:`create_feature_extractor`. Useful for seeing which node
+    names are available for feature extraction. There are two reasons that
+    node names can't easily be read directly from the code for a model:
+
+        1. Not all submodules are traced through. Modules from ``torch.nn`` all
+           fall within this category.
+        2. Nodes representing the repeated application of the same operation
+           or leaf module get a ``_{counter}`` postfix.
+
+    The model is traced twice: once in train mode, and once in eval mode. Both
+    sets of node names are returned.
+
+    For more details on the node naming conventions used here, please see the
+    :ref:`relevant subheading <about-node-names>` in the
+    `documentation <https://pytorch.org/vision/stable/feature_extraction.html>`_.
+
+    Args:
+        model (nn.Module): model for which we'd like to print node names
+        tracer_kwargs (dict, optional): a dictionary of keyword arguments for
+            ``NodePathTracer`` (they are eventually passed onto
+            `torch.fx.Tracer <https://pytorch.org/docs/stable/fx.html#torch.fx.Tracer>`_).
+            By default, it will be set to wrap and make leaf nodes all torchvision ops:
+            {"autowrap_modules": (math, torchvision.ops,),"leaf_modules": _get_leaf_modules_for_ops(),}
+            WARNING: In case the user provides tracer_kwargs, above default arguments will be appended to the user
+            provided dictionary.
+        suppress_diff_warning (bool, optional): whether to suppress a warning
+            when there are discrepancies between the train and eval version of
+            the graph. Defaults to False.
+        concrete_args (Optional[Dict[str, any]]): Concrete arguments that should
+            not be treated as Proxies. According to the `Pytorch docs
+            <https://pytorch.org/docs/stable/fx.html#torch.fx.Tracer.trace>`_,
+            this parameter's API may not be guaranteed.
+
+    Returns:
+        tuple(list, list): a list of node names from tracing the model in
+        train mode, and another from tracing the model in eval mode.
+
+    Examples::
+
+        >>> model = torchvision.models.resnet18()
+        >>> train_nodes, eval_nodes = get_graph_node_names(model)
+    """
+    tracer_kwargs = _set_default_tracer_kwargs(tracer_kwargs)
+    is_training = model.training
+    train_tracer = NodePathTracer(**tracer_kwargs)
+    train_tracer.trace(model.train(), concrete_args=concrete_args)
+    eval_tracer = NodePathTracer(**tracer_kwargs)
+    eval_tracer.trace(model.eval(), concrete_args=concrete_args)
+    train_nodes = list(train_tracer.node_to_qualname.values())
+    eval_nodes = list(eval_tracer.node_to_qualname.values())
+    if not suppress_diff_warning:
+        _warn_graph_differences(train_tracer, eval_tracer)
+    # Restore training state
+    model.train(is_training)
+    return train_nodes, eval_nodes
+
+
+class DualGraphModule(fx.GraphModule):
+    """
+    A derivative of `fx.GraphModule` that holds two graphs. Differs in the following ways:
+    - Requires a train and eval version of the underlying graph
+    - Copies submodules according to the nodes of both train and eval graphs.
+    - Calling train(mode) switches `self.graph` between train graph and eval graph.
+    """
+
+    def __init__(
+        self, root: torch.nn.Module, train_graph: fx.Graph, eval_graph: fx.Graph, class_name: str = "GraphModule"
+    ):
+        """
+        Args:
+            root (nn.Module): module from which the copied module hierarchy is built
+            train_graph (fx.Graph): the graph that should be used in train mode
+            eval_graph (fx.Graph): the graph that should be used in eval mode
+            class_name (str): value assigned to ``self.__class__.__name__``
+        """
+        super(fx.GraphModule, self).__init__()  # deliberately skips fx.GraphModule.__init__ in the MRO
+
+        self.__class__.__name__ = class_name  # NOTE: this renames the class object, not only this instance
+
+        self.train_graph = train_graph
+        self.eval_graph = eval_graph
+
+        # Copy all get_attr and call_module ops (indicated by BOTH train and
+        # eval graphs) from `root` onto this module
+        for node in chain(iter(train_graph.nodes), iter(eval_graph.nodes)):
+            if node.op in ["get_attr", "call_module"]:
+                if not isinstance(node.target, str):
+                    raise TypeError(f"node.target should be of type str instead of {type(node.target)}")
+                _copy_attr(root, self, node.target)
+
+        # train mode by default; the graph is then assigned explicitly on the next line
+        self.train()
+        self.graph = train_graph
+
+        # (borrowed from fx.GraphModule):
+        # Store the Tracer class responsible for creating a Graph separately as part of the
+        # GraphModule state, except when the Tracer is defined in a local namespace.
+        # Locally defined Tracers are not pickleable. This is needed because torch.package will
+        # serialize a GraphModule without retaining the Graph, and needs to use the correct Tracer
+        # to re-create the Graph during deserialization.
+        if self.eval_graph._tracer_cls != self.train_graph._tracer_cls:
+            raise TypeError(
+                f"Train mode and eval mode should use the same tracer class. Instead got {self.eval_graph._tracer_cls} for eval vs {self.train_graph._tracer_cls} for train"
+            )
+        self._tracer_cls = None
+        if self.graph._tracer_cls and "<locals>" not in self.graph._tracer_cls.__qualname__:
+            self._tracer_cls = self.graph._tracer_cls
+
+    def train(self, mode=True):
+        """
+        Swap out the graph depending on the selected training mode, then delegate
+        to the parent ``train``. NOTE this should be safe when calling model.eval()
+        because that just calls this with mode == False.
+        """
+        # NOTE: Only set self.graph if the current graph is not the desired
+        # one. This saves us from recompiling the graph where not necessary.
+        if mode and not self.training:
+            self.graph = self.train_graph
+        elif not mode and self.training:
+            self.graph = self.eval_graph
+        return super().train(mode=mode)
+
+    def _deepcopy_init(self):
+        # Returns the unbound initializer that __deepcopy__ below calls on the fresh copy
+        return DualGraphModule.__init__
+
+    def __deepcopy__(self, memo):
+        # Same as the base class' __deepcopy__ from pytorch, with minor
+        # modification to account for train_graph and eval_graph
+        # https://github.com/pytorch/pytorch/blob/f684dbd0026f98f8fa291cab74dbc4d61ba30580/torch/fx/graph_module.py#L875
+        #
+        # This is using a bunch of private stuff from torch, so if that breaks,
+        # we'll likely have to remove this, along with the associated
+        # non-regression test.
+        res = type(self).__new__(type(self))
+        memo[id(self)] = res  # registered before copying __dict__ so references back to self resolve to res
+        fake_mod = _CodeOnlyModule(copy.deepcopy(self.__dict__, memo))
+        self._deepcopy_init()(res, fake_mod, fake_mod.__dict__["train_graph"], fake_mod.__dict__["eval_graph"])
+
+        extra_preserved_attrs = [
+            "_state_dict_hooks",
+            "_load_state_dict_pre_hooks",
+            "_load_state_dict_post_hooks",
+            "_replace_hook",
+            "_create_node_hooks",
+            "_erase_node_hooks",
+        ]
+        for attr in extra_preserved_attrs:
+            if attr in self.__dict__:
+                setattr(res, attr, copy.deepcopy(self.__dict__[attr], memo))
+        res.meta = copy.deepcopy(getattr(self, "meta", {}), memo)
+        if _USER_PRESERVED_ATTRIBUTES_KEY in res.meta:
+            for attr_name, attr in res.meta[_USER_PRESERVED_ATTRIBUTES_KEY].items():
+                setattr(res, attr_name, attr)
+        return res
+
+
+def create_feature_extractor(
+    model: nn.Module,
+    return_nodes: Optional[Union[list[str], dict[str, str]]] = None,
+    train_return_nodes: Optional[Union[list[str], dict[str, str]]] = None,
+    eval_return_nodes: Optional[Union[list[str], dict[str, str]]] = None,
+    tracer_kwargs: Optional[dict[str, Any]] = None,
+    suppress_diff_warning: bool = False,
+    concrete_args: Optional[dict[str, Any]] = None,
+) -> fx.GraphModule:
+    """
+    Creates a new graph module that returns intermediate nodes from a given
+    model as dictionary with user specified keys as strings, and the requested
+    outputs as values. This is achieved by re-writing the computation graph of
+    the model via FX to return the desired nodes as outputs. All unused nodes
+    are removed, together with their corresponding parameters.
+
+    Desired output nodes must be specified as a ``.`` separated
+    path walking the module hierarchy from top level module down to leaf
+    operation or leaf module. For more details on the node naming conventions
+    used here, please see the :ref:`relevant subheading <about-node-names>`
+    in the `documentation <https://pytorch.org/vision/stable/feature_extraction.html>`_.
+
+    Not all models will be FX traceable, although with some massaging they can
+    be made to cooperate. Here's a (not exhaustive) list of tips:
+
+        - If you don't need to trace through a particular, problematic
+          sub-module, turn it into a "leaf module" by passing a list of
+          ``leaf_modules`` as one of the ``tracer_kwargs`` (see example below).
+          It will not be traced through, but rather, the resulting graph will
+          hold a reference to that module's forward method.
+        - Likewise, you may turn functions into leaf functions by passing a
+          list of ``autowrap_functions`` as one of the ``tracer_kwargs`` (see
+          example below).
+        - Some inbuilt Python functions can be problematic. For instance,
+          ``int`` will raise an error during tracing. You may wrap them in your
+          own function and then pass that in ``autowrap_functions`` as one of
+          the ``tracer_kwargs``.
+
+    For further information on FX see the
+    `torch.fx documentation <https://pytorch.org/docs/stable/fx.html>`_.
+
+    Args:
+        model (nn.Module): model on which we will extract the features
+        return_nodes (list or dict, optional): either a ``List`` or a ``Dict``
+            containing the names (or partial names - see note above)
+            of the nodes for which the activations will be returned. If it is
+            a ``Dict``, the keys are the node names, and the values
+            are the user-specified keys for the graph module's returned
+            dictionary. If it is a ``List``, it is treated as a ``Dict`` mapping
+            node specification strings directly to output names. In the case
+            that ``train_return_nodes`` and ``eval_return_nodes`` are specified,
+            this should not be specified.
+        train_return_nodes (list or dict, optional): similar to
+            ``return_nodes``. This can be used if the return nodes
+            for train mode are different than those from eval mode.
+            If this is specified, ``eval_return_nodes`` must also be specified,
+            and ``return_nodes`` should not be specified.
+        eval_return_nodes (list or dict, optional): similar to
+            ``return_nodes``. This can be used if the return nodes
+            for train mode are different than those from eval mode.
+            If this is specified, ``train_return_nodes`` must also be specified,
+            and `return_nodes` should not be specified.
+        tracer_kwargs (dict, optional): a dictionary of keyword arguments for
+            ``NodePathTracer`` (which passes them onto its parent class
+            `torch.fx.Tracer <https://pytorch.org/docs/stable/fx.html#torch.fx.Tracer>`_).
+            By default, it will be set to wrap and make leaf nodes all torchvision ops:
+            {"autowrap_modules": (math, torchvision.ops,),"leaf_modules": _get_leaf_modules_for_ops(),}
+            WARNING: In case the user provides tracer_kwargs, above default arguments will be appended to the user
+            provided dictionary.
+        suppress_diff_warning (bool, optional): whether to suppress a warning
+            when there are discrepancies between the train and eval version of
+            the graph. Defaults to False.
+        concrete_args (Optional[Dict[str, any]]): Concrete arguments that should
+            not be treated as Proxies. According to the `Pytorch docs
+            <https://pytorch.org/docs/stable/fx.html#torch.fx.Tracer.trace>`_,
+            this parameter's API may not be guaranteed.
+
+    Examples::
+
+        >>> # Feature extraction with resnet
+        >>> model = torchvision.models.resnet18()
+        >>> # extract layer1 and layer3, giving as names `feat1` and `feat2`
+        >>> model = create_feature_extractor(
+        >>>     model, {'layer1': 'feat1', 'layer3': 'feat2'})
+        >>> out = model(torch.rand(1, 3, 224, 224))
+        >>> print([(k, v.shape) for k, v in out.items()])
+        >>>     [('feat1', torch.Size([1, 64, 56, 56])),
+        >>>      ('feat2', torch.Size([1, 256, 14, 14]))]
+
+        >>> # Specifying leaf modules and leaf functions
+        >>> def leaf_function(x):
+        >>>     # This would raise a TypeError if traced through
+        >>>     return int(x)
+        >>>
+        >>> class LeafModule(torch.nn.Module):
+        >>>     def forward(self, x):
+        >>>         # This would raise a TypeError if traced through
+        >>>         int(x.shape[0])
+        >>>         return torch.nn.functional.relu(x + 4)
+        >>>
+        >>> class MyModule(torch.nn.Module):
+        >>>     def __init__(self):
+        >>>         super().__init__()
+        >>>         self.conv = torch.nn.Conv2d(3, 1, 3)
+        >>>         self.leaf_module = LeafModule()
+        >>>
+        >>>     def forward(self, x):
+        >>>         leaf_function(x.shape[0])
+        >>>         x = self.conv(x)
+        >>>         return self.leaf_module(x)
+        >>>
+        >>> model = create_feature_extractor(
+        >>>     MyModule(), return_nodes=['leaf_module'],
+        >>>     tracer_kwargs={'leaf_modules': [LeafModule],
+        >>>                    'autowrap_functions': [leaf_function]})
+
+    """
+    tracer_kwargs = _set_default_tracer_kwargs(tracer_kwargs)
+    is_training = model.training
+
+    if all(arg is None for arg in [return_nodes, train_return_nodes, eval_return_nodes]):
+
+        raise ValueError(
+            "Either `return_nodes` or `train_return_nodes` and `eval_return_nodes` together, should be specified"
+        )
+
+    if (train_return_nodes is None) ^ (eval_return_nodes is None):
+        raise ValueError(
+            "If any of `train_return_nodes` and `eval_return_nodes` are specified, then both should be specified"
+        )
+
+    if not ((return_nodes is None) ^ (train_return_nodes is None)):
+        raise ValueError("If `train_return_nodes` and `eval_return_nodes` are specified, then both should be specified")
+
+    # Put *_return_nodes into Dict[str, str] format
+    def to_strdict(n) -> dict[str, str]:
+        if isinstance(n, list):
+            return {str(i): str(i) for i in n}
+        return {str(k): str(v) for k, v in n.items()}
+
+    if train_return_nodes is None:
+        return_nodes = to_strdict(return_nodes)
+        train_return_nodes = deepcopy(return_nodes)
+        eval_return_nodes = deepcopy(return_nodes)
+    else:
+        train_return_nodes = to_strdict(train_return_nodes)
+        eval_return_nodes = to_strdict(eval_return_nodes)
+
+    # Repeat the tracing and graph rewriting for train and eval mode
+    tracers = {}
+    graphs = {}
+    mode_return_nodes: dict[str, dict[str, str]] = {"train": train_return_nodes, "eval": eval_return_nodes}
+    for mode in ["train", "eval"]:
+        if mode == "train":
+            model.train()
+        elif mode == "eval":
+            model.eval()
+
+        # Instantiate our NodePathTracer and use that to trace the model
+        tracer = NodePathTracer(**tracer_kwargs)
+        graph = tracer.trace(model, concrete_args=concrete_args)
+
+        name = model.__class__.__name__ if isinstance(model, nn.Module) else model.__name__
+        graph_module = fx.GraphModule(tracer.root, graph, name)
+
+        available_nodes = list(tracer.node_to_qualname.values())
+        # FIXME We don't know if we should expect this to happen
+        if len(set(available_nodes)) != len(available_nodes):
+            raise ValueError(
+                "There are duplicate nodes! Please raise an issue https://github.com/pytorch/vision/issues"
+            )
+        # Check that all outputs in return_nodes are present in the model
+        for query in mode_return_nodes[mode].keys():
+            # To check if a query is available we need to check that at least
+            # one of the available names starts with it up to a .
+            if not any([re.match(rf"^{query}(\.|$)", n) is not None for n in available_nodes]):
+                raise ValueError(
+                    f"node: '{query}' is not present in model. Hint: use "
+                    "`get_graph_node_names` to make sure the "
+                    "`return_nodes` you specified are present. It may even "
+                    "be that you need to specify `train_return_nodes` and "
+                    "`eval_return_nodes` separately."
+                )
+
+        # Remove existing output nodes (train mode)
+        orig_output_nodes = []
+        for n in reversed(graph_module.graph.nodes):
+            if n.op == "output":
+                orig_output_nodes.append(n)
+        if not orig_output_nodes:
+            raise ValueError("No output nodes found in graph_module.graph.nodes")
+
+        for n in orig_output_nodes:
+            graph_module.graph.erase_node(n)
+
+        # Find nodes corresponding to return_nodes and make them into output_nodes
+        nodes = [n for n in graph_module.graph.nodes]
+        output_nodes = OrderedDict()
+        for n in reversed(nodes):
+            module_qualname = tracer.node_to_qualname.get(n)
+            if module_qualname is None:
+                # NOTE - Know cases where this happens:
+                # - Node representing creation of a tensor constant - probably
+                #   not interesting as a return node
+                # - When packing outputs into a named tuple like in InceptionV3
+                continue
+            for query in mode_return_nodes[mode]:
+                depth = query.count(".")
+                if ".".join(module_qualname.split(".")[: depth + 1]) == query:
+                    output_nodes[mode_return_nodes[mode][query]] = n
+                    mode_return_nodes[mode].pop(query)
+                    break
+        output_nodes = OrderedDict(reversed(list(output_nodes.items())))
+
+        # And add them in the end of the graph
+        with graph_module.graph.inserting_after(nodes[-1]):
+            graph_module.graph.output(output_nodes)
+
+        # Remove unused modules / parameters
+        graph_module.graph.eliminate_dead_code()
+        graph_module.recompile()
+
+        # Keep track of the tracer and graph, so we can choose the main one
+        tracers[mode] = tracer
+        graphs[mode] = graph
+
+    # Warn user if there are any discrepancies between the graphs of the
+    # train and eval modes
+    if not suppress_diff_warning:
+        _warn_graph_differences(tracers["train"], tracers["eval"])
+
+    # Build the final graph module
+    graph_module = DualGraphModule(model, graphs["train"], graphs["eval"], class_name=name)
+
+    # Restore original training mode
+    model.train(is_training)
+    graph_module.train(is_training)
+
+    return graph_module
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/googlenet.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/googlenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..bfb29764951531b2fbfa91ea91e367ba240f05b0
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/googlenet.py
@@ -0,0 +1,345 @@
+import warnings
+from collections import namedtuple
+from functools import partial
+from typing import Any, Callable, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+
+from ..transforms._presets import ImageClassification
+from ..utils import _log_api_usage_once
+from ._api import register_model, Weights, WeightsEnum
+from ._meta import _IMAGENET_CATEGORIES
+from ._utils import _ovewrite_named_param, handle_legacy_interface
+
+
+__all__ = ["GoogLeNet", "GoogLeNetOutputs", "_GoogLeNetOutputs", "GoogLeNet_Weights", "googlenet"]
+
+# Named result of GoogLeNet: main logits first, then the aux2 and aux1 logits (both annotated as optional).
+GoogLeNetOutputs = namedtuple("GoogLeNetOutputs", ["logits", "aux_logits2", "aux_logits1"])
+GoogLeNetOutputs.__annotations__ = {"logits": Tensor, "aux_logits2": Optional[Tensor], "aux_logits1": Optional[Tensor]}
+
+# Script annotations failed with `_GoogleNetOutputs = namedtuple ...`.
+# `_GoogLeNetOutputs` is set here as an alias of `GoogLeNetOutputs` for backwards compat.
+_GoogLeNetOutputs = GoogLeNetOutputs
+
+
+class GoogLeNet(nn.Module):
+    __constants__ = ["aux_logits", "transform_input"]
+    # GoogLeNet classifier: conv/max-pool stem, nine inception blocks, two auxiliary heads that run only in training mode.
+    def __init__(
+        self,
+        num_classes: int = 1000,
+        aux_logits: bool = True,
+        transform_input: bool = False,
+        init_weights: Optional[bool] = None,
+        blocks: Optional[list[Callable[..., nn.Module]]] = None,
+        dropout: float = 0.2,
+        dropout_aux: float = 0.7,
+    ) -> None:
+        super().__init__()
+        _log_api_usage_once(self)
+        if blocks is None:
+            blocks = [BasicConv2d, Inception, InceptionAux]
+        if init_weights is None:
+            warnings.warn(
+                "The default weight initialization of GoogleNet will be changed in future releases of "
+                "torchvision. If you wish to keep the old behavior (which leads to long initialization times"
+                " due to scipy/scipy#11299), please set init_weights=True.",
+                FutureWarning,
+            )
+            init_weights = True
+        if len(blocks) != 3:
+            raise ValueError(f"blocks length should be 3 instead of {len(blocks)}")
+        conv_block = blocks[0]
+        inception_block = blocks[1]
+        inception_aux_block = blocks[2]
+
+        self.aux_logits = aux_logits
+        self.transform_input = transform_input
+
+        self.conv1 = conv_block(3, 64, kernel_size=7, stride=2, padding=3)
+        self.maxpool1 = nn.MaxPool2d(3, stride=2, ceil_mode=True)
+        self.conv2 = conv_block(64, 64, kernel_size=1)
+        self.conv3 = conv_block(64, 192, kernel_size=3, padding=1)
+        self.maxpool2 = nn.MaxPool2d(3, stride=2, ceil_mode=True)
+
+        self.inception3a = inception_block(192, 64, 96, 128, 16, 32, 32)
+        self.inception3b = inception_block(256, 128, 128, 192, 32, 96, 64)
+        self.maxpool3 = nn.MaxPool2d(3, stride=2, ceil_mode=True)
+
+        self.inception4a = inception_block(480, 192, 96, 208, 16, 48, 64)
+        self.inception4b = inception_block(512, 160, 112, 224, 24, 64, 64)
+        self.inception4c = inception_block(512, 128, 128, 256, 24, 64, 64)
+        self.inception4d = inception_block(512, 112, 144, 288, 32, 64, 64)
+        self.inception4e = inception_block(528, 256, 160, 320, 32, 128, 128)
+        self.maxpool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.inception5a = inception_block(832, 256, 160, 320, 32, 128, 128)
+        self.inception5b = inception_block(832, 384, 192, 384, 48, 128, 128)
+
+        if aux_logits:
+            self.aux1 = inception_aux_block(512, num_classes, dropout=dropout_aux)
+            self.aux2 = inception_aux_block(528, num_classes, dropout=dropout_aux)
+        else:
+            self.aux1 = None  # type: ignore[assignment]
+            self.aux2 = None  # type: ignore[assignment]
+
+        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+        self.dropout = nn.Dropout(p=dropout)
+        self.fc = nn.Linear(1024, num_classes)
+        # NOTE(review): a/b look like absolute cut-offs in trunc_normal_, so [-2, 2] hardly truncates at std=0.01 - confirm.
+        if init_weights:
+            for m in self.modules():
+                if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
+                    torch.nn.init.trunc_normal_(m.weight, mean=0.0, std=0.01, a=-2, b=2)
+                elif isinstance(m, nn.BatchNorm2d):
+                    nn.init.constant_(m.weight, 1)
+                    nn.init.constant_(m.bias, 0)
+    # Optional input re-normalisation: channels 0-2 become x * (s / 0.5) + (m - 0.5) / 0.5 with per-channel constants s, m.
+    def _transform_input(self, x: Tensor) -> Tensor:
+        if self.transform_input:
+            x_ch0 = torch.unsqueeze(x[:, 0], 1) * (0.229 / 0.5) + (0.485 - 0.5) / 0.5
+            x_ch1 = torch.unsqueeze(x[:, 1], 1) * (0.224 / 0.5) + (0.456 - 0.5) / 0.5
+            x_ch2 = torch.unsqueeze(x[:, 2], 1) * (0.225 / 0.5) + (0.406 - 0.5) / 0.5
+            x = torch.cat((x_ch0, x_ch1, x_ch2), 1)
+        return x
+    # Backbone pass returning (logits, aux2, aux1); an aux entry stays None unless its head exists and self.training is set.
+    def _forward(self, x: Tensor) -> tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
+        # N x 3 x 224 x 224
+        x = self.conv1(x)
+        # N x 64 x 112 x 112
+        x = self.maxpool1(x)
+        # N x 64 x 56 x 56
+        x = self.conv2(x)
+        # N x 64 x 56 x 56
+        x = self.conv3(x)
+        # N x 192 x 56 x 56
+        x = self.maxpool2(x)
+
+        # N x 192 x 28 x 28
+        x = self.inception3a(x)
+        # N x 256 x 28 x 28
+        x = self.inception3b(x)
+        # N x 480 x 28 x 28
+        x = self.maxpool3(x)
+        # N x 480 x 14 x 14
+        x = self.inception4a(x)
+        # N x 512 x 14 x 14
+        aux1: Optional[Tensor] = None
+        if self.aux1 is not None:
+            if self.training:
+                aux1 = self.aux1(x)
+
+        x = self.inception4b(x)
+        # N x 512 x 14 x 14
+        x = self.inception4c(x)
+        # N x 512 x 14 x 14
+        x = self.inception4d(x)
+        # N x 528 x 14 x 14
+        aux2: Optional[Tensor] = None
+        if self.aux2 is not None:
+            if self.training:
+                aux2 = self.aux2(x)
+
+        x = self.inception4e(x)
+        # N x 832 x 14 x 14
+        x = self.maxpool4(x)
+        # N x 832 x 7 x 7
+        x = self.inception5a(x)
+        # N x 832 x 7 x 7
+        x = self.inception5b(x)
+        # N x 1024 x 7 x 7
+
+        x = self.avgpool(x)
+        # N x 1024 x 1 x 1
+        x = torch.flatten(x, 1)
+        # N x 1024
+        x = self.dropout(x)
+        x = self.fc(x)
+        # N x 1000 (num_classes)
+        return x, aux2, aux1
+    # Eager result: named tuple if training with aux_logits, else logits only. NOTE(review): aux2 may be None despite `Tensor`.
+    @torch.jit.unused
+    def eager_outputs(self, x: Tensor, aux2: Tensor, aux1: Optional[Tensor]) -> GoogLeNetOutputs:
+        if self.training and self.aux_logits:
+            return _GoogLeNetOutputs(x, aux2, aux1)
+        else:
+            return x  # type: ignore[return-value]
+    # Entry point: scripted code always returns the named tuple; eager code defers to eager_outputs.
+    def forward(self, x: Tensor) -> GoogLeNetOutputs:
+        x = self._transform_input(x)
+        x, aux2, aux1 = self._forward(x)
+        aux_defined = self.training and self.aux_logits
+        if torch.jit.is_scripting():
+            if not aux_defined:
+                warnings.warn("Scripted GoogleNet always returns GoogleNetOutputs Tuple")
+            return GoogLeNetOutputs(x, aux2, aux1)
+        else:
+            return self.eager_outputs(x, aux2, aux1)
+
+
+class Inception(nn.Module):
+    """Inception block: four parallel branches over one input, concatenated along dim 1.
+
+    With the default ``conv_block`` (``BasicConv2d``), ``ch1x1``, ``ch3x3``, ``ch5x5`` and
+    ``pool_proj`` are the output widths of ``branch1`` .. ``branch4``, while ``ch3x3red`` and
+    ``ch5x5red`` are the widths of the 1x1 reductions inside ``branch2`` and ``branch3``.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        ch1x1: int,
+        ch3x3red: int,
+        ch3x3: int,
+        ch5x5red: int,
+        ch5x5: int,
+        pool_proj: int,
+        conv_block: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        super().__init__()
+        make_conv = BasicConv2d if conv_block is None else conv_block
+
+        # branch1: a single 1x1 unit.
+        self.branch1 = make_conv(in_channels, ch1x1, kernel_size=1)
+
+        # branch2: 1x1 reduction followed by a padded 3x3 unit.
+        reduce_3x3 = make_conv(in_channels, ch3x3red, kernel_size=1)
+        self.branch2 = nn.Sequential(reduce_3x3, make_conv(ch3x3red, ch3x3, kernel_size=3, padding=1))
+
+        # branch3: 1x1 reduction followed by a padded 3x3 unit.
+        # Here, kernel_size=3 instead of kernel_size=5 is a known bug.
+        # Please see https://github.com/pytorch/vision/issues/906 for details.
+        reduce_5x5 = make_conv(in_channels, ch5x5red, kernel_size=1)
+        self.branch3 = nn.Sequential(reduce_5x5, make_conv(ch5x5red, ch5x5, kernel_size=3, padding=1))
+        # branch4: stride-1 3x3 max-pool, then a 1x1 projection.
+        pool = nn.MaxPool2d(kernel_size=3, stride=1, padding=1, ceil_mode=True)
+        self.branch4 = nn.Sequential(pool, make_conv(in_channels, pool_proj, kernel_size=1))
+
+    def _forward(self, x: Tensor) -> list[Tensor]:
+        """Return the outputs of ``branch1`` .. ``branch4``, in that order."""
+        return [self.branch1(x), self.branch2(x), self.branch3(x), self.branch4(x)]
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Concatenate the four branch outputs along dimension 1."""
+        return torch.cat(self._forward(x), 1)
+
+
+class InceptionAux(nn.Module):
+    """Auxiliary classifier head: 4x4 adaptive average pool, 1x1 conv unit to 128, two linear layers."""
+
+    def __init__(
+        self,
+        in_channels: int,
+        num_classes: int,
+        conv_block: Optional[Callable[..., nn.Module]] = None,
+        dropout: float = 0.7,
+    ) -> None:
+        super().__init__()
+        make_conv = BasicConv2d if conv_block is None else conv_block
+        self.conv = make_conv(in_channels, 128, kernel_size=1)
+
+        # fc1 takes 2048 inputs = 128 x 4 x 4, the flattened pooled and 1x1-convolved map.
+        self.fc1 = nn.Linear(2048, 1024)
+        self.fc2 = nn.Linear(1024, num_classes)
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Map a feature map to ``num_classes`` logits."""
+        # aux1: N x 512 x 14 x 14, aux2: N x 528 x 14 x 14
+        pooled = F.adaptive_avg_pool2d(x, (4, 4))
+        # aux1: N x 512 x 4 x 4, aux2: N x 528 x 4 x 4
+        reduced = self.conv(pooled)
+        # N x 128 x 4 x 4
+        flat = torch.flatten(reduced, 1)
+        # N x 2048
+        hidden = F.relu(self.fc1(flat), inplace=True)
+        # N x 1024
+        hidden = self.dropout(hidden)
+        # N x 1024 -> N x 1000 (num_classes)
+        return self.fc2(hidden)
+
+
+class BasicConv2d(nn.Module):
+    """Bias-free ``nn.Conv2d`` followed by ``nn.BatchNorm2d`` (eps=0.001) and an in-place ReLU."""
+
+    def __init__(self, in_channels: int, out_channels: int, **kwargs: Any) -> None:
+        super().__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
+        self.bn = nn.BatchNorm2d(out_channels, eps=0.001)
+
+    def forward(self, x: Tensor) -> Tensor:
+        return F.relu(self.bn(self.conv(x)), inplace=True)
+
+
+class GoogLeNet_Weights(WeightsEnum):  # Pretrained-weight entries selectable through googlenet(weights=...).
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/googlenet-1378be20.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            "num_params": 6624904,
+            "min_size": (15, 15),
+            "categories": _IMAGENET_CATEGORIES,  # googlenet() sets num_classes to the length of this value
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#googlenet",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 69.778,
+                    "acc@5": 89.530,
+                }
+            },
+            "_ops": 1.498,  # presumably GFLOPs - TODO confirm
+            "_file_size": 49.731,  # presumably MB - TODO confirm
+            "_docs": """These weights are ported from the original paper.""",
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # DEFAULT is bound to the same entry as IMAGENET1K_V1
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", GoogLeNet_Weights.IMAGENET1K_V1))
+def googlenet(*, weights: Optional[GoogLeNet_Weights] = None, progress: bool = True, **kwargs: Any) -> GoogLeNet:
+    """Build a GoogLeNet (Inception v1) network as described in
+    `Going Deeper with Convolutions <http://arxiv.org/abs/1409.4842>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.GoogLeNet_Weights`, optional): Checkpoint
+            to load into the model; see :class:`~torchvision.models.GoogLeNet_Weights`
+            below for the possible values. ``None`` (the default) loads nothing.
+        progress (bool, optional): Forwarded to ``weights.get_state_dict``; if True,
+            a progress bar of the download is displayed on stderr. Default is True.
+        **kwargs: Forwarded to the ``torchvision.models.GoogLeNet`` constructor. See the
+            `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/googlenet.py>`_
+            for the accepted parameters.
+    .. autoclass:: torchvision.models.GoogLeNet_Weights
+        :members:
+    """
+    weights = GoogLeNet_Weights.verify(weights)
+    if weights is None:
+        # Nothing to load: build the network exactly as requested.
+        return GoogLeNet(**kwargs)
+
+    # Read the caller's aux_logits choice before ``_ovewrite_named_param`` is applied to kwargs.
+    keep_aux_heads = kwargs.get("aux_logits", False)
+    if "transform_input" not in kwargs:
+        _ovewrite_named_param(kwargs, "transform_input", True)
+    _ovewrite_named_param(kwargs, "aux_logits", True)
+    _ovewrite_named_param(kwargs, "init_weights", False)
+    _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+
+    model = GoogLeNet(**kwargs)
+    model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+    if keep_aux_heads:
+        warnings.warn(
+            "auxiliary heads in the pretrained googlenet model are NOT pretrained, so make sure to train them"
+        )
+        return model
+
+    # The caller did not ask for the auxiliary heads: drop them after loading.
+    model.aux_logits = False
+    model.aux1 = None  # type: ignore[assignment]
+    model.aux2 = None  # type: ignore[assignment]
+    return model
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/inception.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/inception.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c36ec2a0ad721c0ccfc588fe389eb9c7e810fb5
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/inception.py
@@ -0,0 +1,478 @@
+import warnings
+from collections import namedtuple
+from functools import partial
+from typing import Any, Callable, Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn, Tensor
+
+from ..transforms._presets import ImageClassification
+from ..utils import _log_api_usage_once
+from ._api import register_model, Weights, WeightsEnum
+from ._meta import _IMAGENET_CATEGORIES
+from ._utils import _ovewrite_named_param, handle_legacy_interface
+
+
+__all__ = ["Inception3", "InceptionOutputs", "_InceptionOutputs", "Inception_V3_Weights", "inception_v3"]
+
+# Named result of Inception3: main logits plus the auxiliary logits (annotated as optional).
+InceptionOutputs = namedtuple("InceptionOutputs", ["logits", "aux_logits"])
+InceptionOutputs.__annotations__ = {"logits": Tensor, "aux_logits": Optional[Tensor]}
+
+# Script annotations failed with `_InceptionOutputs = namedtuple ...` (this note used to say `_GoogleNetOutputs`,
+# presumably copied from googlenet.py). `_InceptionOutputs` is set here as an alias for backwards compat.
+_InceptionOutputs = InceptionOutputs
+
+
+class Inception3(nn.Module):
+    """Inception v3 style classifier with an optional auxiliary head (``AuxLogits``) used only in training mode.
+
+    ``inception_blocks`` supplies seven constructors: the conv unit, blocks A to E and the auxiliary head, in order.
+    """
+
+    def __init__(
+        self,
+        num_classes: int = 1000,
+        aux_logits: bool = True,
+        transform_input: bool = False,
+        inception_blocks: Optional[list[Callable[..., nn.Module]]] = None,
+        init_weights: Optional[bool] = None,
+        dropout: float = 0.5,
+    ) -> None:
+        super().__init__()
+        _log_api_usage_once(self)
+        if inception_blocks is None:
+            inception_blocks = [BasicConv2d, InceptionA, InceptionB, InceptionC, InceptionD, InceptionE, InceptionAux]
+        if init_weights is None:
+            warnings.warn(
+                "The default weight initialization of inception_v3 will be changed in future releases of "
+                "torchvision. If you wish to keep the old behavior (which leads to long initialization times"
+                " due to scipy/scipy#11299), please set init_weights=True.",
+                FutureWarning,
+            )
+            init_weights = True
+        if len(inception_blocks) != 7:
+            raise ValueError(f"length of inception_blocks should be 7 instead of {len(inception_blocks)}")
+        conv_block, inception_a, inception_b, inception_c, inception_d, inception_e, inception_aux = inception_blocks
+
+        self.aux_logits = aux_logits
+        self.transform_input = transform_input
+        self.Conv2d_1a_3x3 = conv_block(3, 32, kernel_size=3, stride=2)
+        self.Conv2d_2a_3x3 = conv_block(32, 32, kernel_size=3)
+        self.Conv2d_2b_3x3 = conv_block(32, 64, kernel_size=3, padding=1)
+        self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride=2)
+        self.Conv2d_3b_1x1 = conv_block(64, 80, kernel_size=1)
+        self.Conv2d_4a_3x3 = conv_block(80, 192, kernel_size=3)
+        self.maxpool2 = nn.MaxPool2d(kernel_size=3, stride=2)
+        self.Mixed_5b = inception_a(192, pool_features=32)
+        self.Mixed_5c = inception_a(256, pool_features=64)
+        self.Mixed_5d = inception_a(288, pool_features=64)
+        self.Mixed_6a = inception_b(288)
+        self.Mixed_6b = inception_c(768, channels_7x7=128)
+        self.Mixed_6c = inception_c(768, channels_7x7=160)
+        self.Mixed_6d = inception_c(768, channels_7x7=160)
+        self.Mixed_6e = inception_c(768, channels_7x7=192)
+        self.AuxLogits: Optional[nn.Module] = None
+        if aux_logits:
+            self.AuxLogits = inception_aux(768, num_classes)
+        self.Mixed_7a = inception_d(768)
+        self.Mixed_7b = inception_e(1280)
+        self.Mixed_7c = inception_e(2048)
+        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+        self.dropout = nn.Dropout(p=dropout)
+        self.fc = nn.Linear(2048, num_classes)
+        # NOTE(review): ``stddev`` is read from Conv2d/Linear instances only; one set on a wrapper block is not seen.
+        if init_weights:
+            for m in self.modules():
+                if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
+                    stddev = float(m.stddev) if hasattr(m, "stddev") else 0.1  # type: ignore
+                    torch.nn.init.trunc_normal_(m.weight, mean=0.0, std=stddev, a=-2, b=2)
+                elif isinstance(m, nn.BatchNorm2d):
+                    nn.init.constant_(m.weight, 1)
+                    nn.init.constant_(m.bias, 0)
+    # Optional input re-normalisation: channels 0-2 become x * (s / 0.5) + (m - 0.5) / 0.5 with per-channel constants s, m.
+    def _transform_input(self, x: Tensor) -> Tensor:
+        if self.transform_input:
+            x_ch0 = torch.unsqueeze(x[:, 0], 1) * (0.229 / 0.5) + (0.485 - 0.5) / 0.5
+            x_ch1 = torch.unsqueeze(x[:, 1], 1) * (0.224 / 0.5) + (0.456 - 0.5) / 0.5
+            x_ch2 = torch.unsqueeze(x[:, 2], 1) * (0.225 / 0.5) + (0.406 - 0.5) / 0.5
+            x = torch.cat((x_ch0, x_ch1, x_ch2), 1)
+        return x
+    # Backbone pass returning (logits, aux); aux stays None unless AuxLogits exists and the module is in training mode.
+    def _forward(self, x: Tensor) -> tuple[Tensor, Optional[Tensor]]:
+        # N x 3 x 299 x 299
+        x = self.Conv2d_1a_3x3(x)
+        # N x 32 x 149 x 149
+        x = self.Conv2d_2a_3x3(x)
+        # N x 32 x 147 x 147
+        x = self.Conv2d_2b_3x3(x)
+        # N x 64 x 147 x 147
+        x = self.maxpool1(x)
+        # N x 64 x 73 x 73
+        x = self.Conv2d_3b_1x1(x)
+        # N x 80 x 73 x 73
+        x = self.Conv2d_4a_3x3(x)
+        # N x 192 x 71 x 71
+        x = self.maxpool2(x)
+        # N x 192 x 35 x 35
+        x = self.Mixed_5b(x)
+        # N x 256 x 35 x 35
+        x = self.Mixed_5c(x)
+        # N x 288 x 35 x 35
+        x = self.Mixed_5d(x)
+        # N x 288 x 35 x 35
+        x = self.Mixed_6a(x)
+        # N x 768 x 17 x 17
+        x = self.Mixed_6b(x)
+        # N x 768 x 17 x 17
+        x = self.Mixed_6c(x)
+        # N x 768 x 17 x 17
+        x = self.Mixed_6d(x)
+        # N x 768 x 17 x 17
+        x = self.Mixed_6e(x)
+        # N x 768 x 17 x 17
+        aux: Optional[Tensor] = None
+        if self.AuxLogits is not None:
+            if self.training:
+                aux = self.AuxLogits(x)
+        # N x 768 x 17 x 17
+        x = self.Mixed_7a(x)
+        # N x 1280 x 8 x 8
+        x = self.Mixed_7b(x)
+        # N x 2048 x 8 x 8
+        x = self.Mixed_7c(x)
+        # N x 2048 x 8 x 8
+        # Adaptive average pooling
+        x = self.avgpool(x)
+        # N x 2048 x 1 x 1
+        x = self.dropout(x)
+        # N x 2048 x 1 x 1
+        x = torch.flatten(x, 1)
+        # N x 2048
+        x = self.fc(x)
+        # N x 1000 (num_classes)
+        return x, aux
+    # Eager-mode result: InceptionOutputs when training with aux_logits, otherwise only the logits tensor.
+    @torch.jit.unused
+    def eager_outputs(self, x: Tensor, aux: Optional[Tensor]) -> InceptionOutputs:
+        if self.training and self.aux_logits:
+            return InceptionOutputs(x, aux)
+        else:
+            return x  # type: ignore[return-value]
+    # Entry point: scripted code always returns InceptionOutputs; eager code defers to eager_outputs.
+    def forward(self, x: Tensor) -> InceptionOutputs:
+        x = self._transform_input(x)
+        x, aux = self._forward(x)
+        aux_defined = self.training and self.aux_logits
+        if torch.jit.is_scripting():
+            if not aux_defined:
+                warnings.warn("Scripted Inception3 always returns InceptionOutputs Tuple")
+            return InceptionOutputs(x, aux)
+        else:
+            return self.eager_outputs(x, aux)
+
+
+class InceptionA(nn.Module):
+    """Inception block with 1x1, 5x5, double-3x3 and average-pool branches, concatenated along dim 1.
+
+    The conv branches are built with fixed widths (64, 64 and 96 outputs); the pool branch ends in
+    ``pool_features``. ``conv_block`` builds every unit and defaults to ``BasicConv2d``.
+    """
+
+    def __init__(
+        self, in_channels: int, pool_features: int, conv_block: Optional[Callable[..., nn.Module]] = None
+    ) -> None:
+        super().__init__()
+        make_conv = BasicConv2d if conv_block is None else conv_block
+
+        # Attribute names and creation order are left untouched: they determine the state_dict keys.
+        self.branch1x1 = make_conv(in_channels, 64, kernel_size=1)
+
+        self.branch5x5_1 = make_conv(in_channels, 48, kernel_size=1)
+        self.branch5x5_2 = make_conv(48, 64, kernel_size=5, padding=2)
+
+        self.branch3x3dbl_1 = make_conv(in_channels, 64, kernel_size=1)
+        self.branch3x3dbl_2 = make_conv(64, 96, kernel_size=3, padding=1)
+        self.branch3x3dbl_3 = make_conv(96, 96, kernel_size=3, padding=1)
+
+        self.branch_pool = make_conv(in_channels, pool_features, kernel_size=1)
+
+    def _forward(self, x: Tensor) -> list[Tensor]:
+        """Return the four branch outputs in concatenation order."""
+        out_1x1 = self.branch1x1(x)
+        out_5x5 = self.branch5x5_2(self.branch5x5_1(x))
+        out_3x3dbl = self.branch3x3dbl_3(self.branch3x3dbl_2(self.branch3x3dbl_1(x)))
+        # Pool branch: stride-1 3x3 average pool with padding 1, then the projection unit.
+        out_pool = self.branch_pool(F.avg_pool2d(x, kernel_size=3, stride=1, padding=1))
+        return [out_1x1, out_5x5, out_3x3dbl, out_pool]
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Concatenate the branch outputs along dimension 1."""
+        return torch.cat(self._forward(x), 1)
+
+
+class InceptionB(nn.Module):
+    """Stride-2 block with a 3x3 conv branch, a double-3x3 conv branch and a max-pool branch.
+
+    ``forward`` concatenates the three results along dim 1; the pool branch has no parameters.
+    """
+
+    def __init__(self, in_channels: int, conv_block: Optional[Callable[..., nn.Module]] = None) -> None:
+        super().__init__()
+        make_conv = BasicConv2d if conv_block is None else conv_block
+        self.branch3x3 = make_conv(in_channels, 384, kernel_size=3, stride=2)
+
+        self.branch3x3dbl_1 = make_conv(in_channels, 64, kernel_size=1)
+        self.branch3x3dbl_2 = make_conv(64, 96, kernel_size=3, padding=1)
+        self.branch3x3dbl_3 = make_conv(96, 96, kernel_size=3, stride=2)
+
+    def _forward(self, x: Tensor) -> list[Tensor]:
+        """Return the conv, double-conv and pooled outputs, in that order."""
+        out_3x3 = self.branch3x3(x)
+        out_3x3dbl = self.branch3x3dbl_3(self.branch3x3dbl_2(self.branch3x3dbl_1(x)))
+        # Un-padded 3x3 pooling with stride 2, the same kernel and stride as the last conv unit of each branch.
+        out_pool = F.max_pool2d(x, kernel_size=3, stride=2)
+        return [out_3x3, out_3x3dbl, out_pool]
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Concatenate the branch outputs along dimension 1."""
+        return torch.cat(self._forward(x), 1)
+
+
+class InceptionC(nn.Module):
+    """Inception block whose 7x7 branches are built from 1x7 and 7x1 units.
+
+    Branches: 1x1, a three-unit 7x7 chain, a five-unit 7x7 chain and an average-pool projection.
+    With the default ``conv_block`` each branch ends in 192 outputs; ``channels_7x7`` is the chains' inner width.
+    """
+
+    def __init__(
+        self, in_channels: int, channels_7x7: int, conv_block: Optional[Callable[..., nn.Module]] = None
+    ) -> None:
+        super().__init__()
+        make_conv = BasicConv2d if conv_block is None else conv_block
+        mid = channels_7x7
+        # Keyword sets for the 1x7 and 7x1 units (7 taps with padding 3 along the long axis).
+        k1x7 = {"kernel_size": (1, 7), "padding": (0, 3)}
+        k7x1 = {"kernel_size": (7, 1), "padding": (3, 0)}
+
+        self.branch1x1 = make_conv(in_channels, 192, kernel_size=1)
+
+        self.branch7x7_1 = make_conv(in_channels, mid, kernel_size=1)
+        self.branch7x7_2 = make_conv(mid, mid, **k1x7)
+        self.branch7x7_3 = make_conv(mid, 192, **k7x1)
+
+        self.branch7x7dbl_1 = make_conv(in_channels, mid, kernel_size=1)
+        self.branch7x7dbl_2 = make_conv(mid, mid, **k7x1)
+        self.branch7x7dbl_3 = make_conv(mid, mid, **k1x7)
+        self.branch7x7dbl_4 = make_conv(mid, mid, **k7x1)
+        self.branch7x7dbl_5 = make_conv(mid, 192, **k1x7)
+
+        self.branch_pool = make_conv(in_channels, 192, kernel_size=1)
+
+    def _forward(self, x: Tensor) -> list[Tensor]:
+        """Return the four branch outputs in concatenation order."""
+        out_1x1 = self.branch1x1(x)
+        out_7x7 = self.branch7x7_3(self.branch7x7_2(self.branch7x7_1(x)))
+        chain = self.branch7x7dbl_2(self.branch7x7dbl_1(x))
+        chain = self.branch7x7dbl_4(self.branch7x7dbl_3(chain))
+        out_7x7dbl = self.branch7x7dbl_5(chain)
+        out_pool = self.branch_pool(F.avg_pool2d(x, kernel_size=3, stride=1, padding=1))
+        return [out_1x1, out_7x7, out_7x7dbl, out_pool]
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Concatenate the branch outputs along dimension 1."""
+        return torch.cat(self._forward(x), 1)
+
+
+class InceptionD(nn.Module):
+    """Stride-2 block: 1x1 -> 3x3 branch, 1x1 -> 1x7 -> 7x1 -> 3x3 branch and a max-pool branch."""
+
+    def __init__(self, in_channels: int, conv_block: Optional[Callable[..., nn.Module]] = None) -> None:
+        super().__init__()
+        make_conv = BasicConv2d if conv_block is None else conv_block
+        # Both conv branches finish with an un-padded 3x3 unit of stride 2.
+        self.branch3x3_1 = make_conv(in_channels, 192, kernel_size=1)
+        self.branch3x3_2 = make_conv(192, 320, kernel_size=3, stride=2)
+
+        self.branch7x7x3_1 = make_conv(in_channels, 192, kernel_size=1)
+        self.branch7x7x3_2 = make_conv(192, 192, kernel_size=(1, 7), padding=(0, 3))
+        self.branch7x7x3_3 = make_conv(192, 192, kernel_size=(7, 1), padding=(3, 0))
+        self.branch7x7x3_4 = make_conv(192, 192, kernel_size=3, stride=2)
+
+    def _forward(self, x: Tensor) -> list[Tensor]:
+        """Return the 3x3, 7x7x3 and pooled outputs, in that order."""
+        out_3x3 = self.branch3x3_2(self.branch3x3_1(x))
+
+        chain = self.branch7x7x3_2(self.branch7x7x3_1(x))
+        out_7x7x3 = self.branch7x7x3_4(self.branch7x7x3_3(chain))
+
+        # Parameter-free branch: un-padded 3x3 max-pool with stride 2.
+        out_pool = F.max_pool2d(x, kernel_size=3, stride=2)
+        return [out_3x3, out_7x7x3, out_pool]
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Concatenate the branch outputs along dimension 1."""
+        return torch.cat(self._forward(x), 1)
+
+
+class InceptionE(nn.Module):
+    """Inception block whose two 3x3 branches each fan out into parallel 1x3 and 3x1 units.
+
+    ``_forward`` yields four tensors: the 1x1 branch, the concatenated 1x3/3x1 pair of the 3x3
+    branch, the same pair for the double-3x3 branch, and the average-pool projection.
+    """
+
+    def __init__(self, in_channels: int, conv_block: Optional[Callable[..., nn.Module]] = None) -> None:
+        super().__init__()
+        make_conv = BasicConv2d if conv_block is None else conv_block
+        # Keyword sets shared by the four 3-tap units.
+        k1x3 = {"kernel_size": (1, 3), "padding": (0, 1)}
+        k3x1 = {"kernel_size": (3, 1), "padding": (1, 0)}
+
+        self.branch1x1 = make_conv(in_channels, 320, kernel_size=1)
+
+        self.branch3x3_1 = make_conv(in_channels, 384, kernel_size=1)
+        self.branch3x3_2a = make_conv(384, 384, **k1x3)
+        self.branch3x3_2b = make_conv(384, 384, **k3x1)
+
+        self.branch3x3dbl_1 = make_conv(in_channels, 448, kernel_size=1)
+        self.branch3x3dbl_2 = make_conv(448, 384, kernel_size=3, padding=1)
+        self.branch3x3dbl_3a = make_conv(384, 384, **k1x3)
+        self.branch3x3dbl_3b = make_conv(384, 384, **k3x1)
+
+        self.branch_pool = make_conv(in_channels, 192, kernel_size=1)
+
+    def _forward(self, x: Tensor) -> list[Tensor]:
+        """Return the four branch outputs in concatenation order."""
+        out_1x1 = self.branch1x1(x)
+
+        # 3x3 branch: one stem feeding the 1x3 and 3x1 units side by side.
+        stem = self.branch3x3_1(x)
+        out_3x3 = torch.cat([self.branch3x3_2a(stem), self.branch3x3_2b(stem)], 1)
+
+        # Double-3x3 branch: two-unit stem, then the same side-by-side split.
+        stem_dbl = self.branch3x3dbl_2(self.branch3x3dbl_1(x))
+        out_3x3dbl = torch.cat([self.branch3x3dbl_3a(stem_dbl), self.branch3x3dbl_3b(stem_dbl)], 1)
+
+        out_pool = self.branch_pool(F.avg_pool2d(x, kernel_size=3, stride=1, padding=1))
+        return [out_1x1, out_3x3, out_3x3dbl, out_pool]
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Concatenate the branch outputs along dimension 1."""
+        return torch.cat(self._forward(x), 1)
+
+
+class InceptionAux(nn.Module):  # auxiliary head: avg-pool -> 1x1 conv -> 5x5 conv -> global pool -> linear
+    def __init__(
+        self, in_channels: int, num_classes: int, conv_block: Optional[Callable[..., nn.Module]] = None
+    ) -> None:
+        super().__init__()
+        if conv_block is None:
+            conv_block = BasicConv2d  # default building block; called as conv_block(in, out, **conv_kwargs)
+        self.conv0 = conv_block(in_channels, 128, kernel_size=1)
+        self.conv1 = conv_block(128, 768, kernel_size=5)
+        self.conv1.stddev = 0.01  # type: ignore[assignment]  # extra attribute; presumably read by weight init elsewhere
+        self.fc = nn.Linear(768, num_classes)
+        self.fc.stddev = 0.001  # type: ignore[assignment]  # extra attribute; presumably read by weight init elsewhere
+
+    def forward(self, x: Tensor) -> Tensor:
+        # N x 768 x 17 x 17 (the shape comments below assume in_channels=768 and a 17 x 17 input)
+        x = F.avg_pool2d(x, kernel_size=5, stride=3)
+        # N x 768 x 5 x 5
+        x = self.conv0(x)
+        # N x 128 x 5 x 5
+        x = self.conv1(x)
+        # N x 768 x 1 x 1
+        # Adaptive average pooling: forces a 1 x 1 spatial size whatever conv1 produced
+        x = F.adaptive_avg_pool2d(x, (1, 1))
+        # N x 768 x 1 x 1
+        x = torch.flatten(x, 1)
+        # N x 768
+        x = self.fc(x)
+        # N x num_classes
+        return x
+
+
+class BasicConv2d(nn.Module):  # Conv2d (no bias) -> BatchNorm2d -> ReLU
+    def __init__(self, in_channels: int, out_channels: int, **kwargs: Any) -> None:
+        super().__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)  # kwargs go to nn.Conv2d unchanged
+        self.bn = nn.BatchNorm2d(out_channels, eps=0.001)
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = self.conv(x)
+        x = self.bn(x)
+        return F.relu(x, inplace=True)  # in-place on the batch-norm output
+
+
+class Inception_V3_Weights(WeightsEnum):  # enum of the checkpoints accepted by inception_v3(weights=...)
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth",
+        transforms=partial(ImageClassification, crop_size=299, resize_size=342),
+        meta={
+            "num_params": 27161264,
+            "min_size": (75, 75),
+            "categories": _IMAGENET_CATEGORIES,  # inception_v3 uses len() of this as num_classes
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#inception-v3",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 77.294,
+                    "acc@5": 93.450,
+                }
+            },
+            "_ops": 5.713,
+            "_file_size": 103.903,
+            "_docs": """These weights are ported from the original paper.""",
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # alias for the checkpoint above
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", Inception_V3_Weights.IMAGENET1K_V1))
+def inception_v3(*, weights: Optional[Inception_V3_Weights] = None, progress: bool = True, **kwargs: Any) -> Inception3:
+    """
+    Inception v3 model architecture from
+    `Rethinking the Inception Architecture for Computer Vision <http://arxiv.org/abs/1512.00567>`_.
+
+    .. note::
+        **Important**: In contrast to the other models the inception_v3 expects tensors with a size of
+        N x 3 x 299 x 299, so ensure your images are sized accordingly.
+
+    Args:
+        weights (:class:`~torchvision.models.Inception_V3_Weights`, optional): The
+            pretrained weights for the model. See
+            :class:`~torchvision.models.Inception_V3_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.Inception3``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/inception.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.Inception_V3_Weights
+        :members:
+    """
+    weights = Inception_V3_Weights.verify(weights)
+
+    original_aux_logits = kwargs.get("aux_logits", True)  # remember the caller's choice before it is overwritten
+    if weights is not None:
+        if "transform_input" not in kwargs:  # only default transform_input when the caller did not set it
+            _ovewrite_named_param(kwargs, "transform_input", True)
+        _ovewrite_named_param(kwargs, "aux_logits", True)  # presumably so the checkpoint's aux-head weights load
+        _ovewrite_named_param(kwargs, "init_weights", False)  # weights are about to be replaced by the checkpoint
+        _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+
+    model = Inception3(**kwargs)
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+        if not original_aux_logits:  # caller asked for no aux head: drop it again after loading
+            model.aux_logits = False
+            model.AuxLogits = None
+
+    return model
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/maxvit.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/maxvit.py
new file mode 100644
index 0000000000000000000000000000000000000000..53cc53e5ed94019e56e97bfa74d5c32312dfe389
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/maxvit.py
@@ -0,0 +1,834 @@
+import math
+from collections import OrderedDict
+from collections.abc import Sequence
+from functools import partial
+from typing import Any, Callable, Optional
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn, Tensor
+from torchvision.models._api import register_model, Weights, WeightsEnum
+from torchvision.models._meta import _IMAGENET_CATEGORIES
+from torchvision.models._utils import _ovewrite_named_param, handle_legacy_interface
+from torchvision.ops.misc import Conv2dNormActivation, SqueezeExcitation
+from torchvision.ops.stochastic_depth import StochasticDepth
+from torchvision.transforms._presets import ImageClassification, InterpolationMode
+from torchvision.utils import _log_api_usage_once
+
+__all__ = [
+    "MaxVit",
+    "MaxVit_T_Weights",
+    "maxvit_t",
+]
+
+
+def _get_conv_output_shape(input_size: tuple[int, int], kernel_size: int, stride: int, padding: int) -> tuple[int, int]:
+    return (  # per dimension: floor((size - kernel_size + 2 * padding) / stride) + 1
+        (input_size[0] - kernel_size + 2 * padding) // stride + 1,
+        (input_size[1] - kernel_size + 2 * padding) // stride + 1,
+    )
+
+
+def _make_block_input_shapes(input_size: tuple[int, int], n_blocks: int) -> list[tuple[int, int]]:
+    """Return the feature-map size after each of ``n_blocks`` stride-2 reductions; used to validate MaxVit input sizes."""
+    shapes = []
+    block_input_shape = _get_conv_output_shape(input_size, 3, 2, 1)  # initial stride-2 reduction, not recorded
+    for _ in range(n_blocks):
+        block_input_shape = _get_conv_output_shape(block_input_shape, 3, 2, 1)  # one more stride-2 reduction
+        shapes.append(block_input_shape)
+    return shapes
+
+
+def _get_relative_position_index(height: int, width: int) -> torch.Tensor:  # returns an [H*W, H*W] index tensor
+    coords = torch.stack(torch.meshgrid([torch.arange(height), torch.arange(width)], indexing="ij"))  # [2, H, W]
+    coords_flat = torch.flatten(coords, 1)  # [2, H*W]
+    relative_coords = coords_flat[:, :, None] - coords_flat[:, None, :]  # pairwise (row, col) offsets: [2, H*W, H*W]
+    relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # [H*W, H*W, 2]
+    relative_coords[:, :, 0] += height - 1  # shift row offsets into [0, 2H - 2]
+    relative_coords[:, :, 1] += width - 1  # shift column offsets into [0, 2W - 2]
+    relative_coords[:, :, 0] *= 2 * width - 1  # row-major stride, so row + col below is a unique flat index
+    return relative_coords.sum(-1)
+
+
+class MBConv(nn.Module):
+    """MBConv: Mobile Inverted Residual Bottleneck.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        expansion_ratio (float): Expansion ratio in the bottleneck; hidden width is ``int(out_channels * expansion_ratio)``.
+        squeeze_ratio (float): Squeeze ratio in the SE Layer; SE width is ``int(out_channels * squeeze_ratio)``.
+        stride (int): Stride of the depthwise convolution. The shortcut only adds pooling when ``stride == 2``.
+        activation_layer (Callable[..., nn.Module]): Activation of the two conv stages (the SE layer always uses ``nn.SiLU``).
+        norm_layer (Callable[..., nn.Module]): Normalization function.
+        p_stochastic_dropout (float): Probability of stochastic depth; a falsy value disables it (``nn.Identity``).
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        expansion_ratio: float,
+        squeeze_ratio: float,
+        stride: int,
+        activation_layer: Callable[..., nn.Module],
+        norm_layer: Callable[..., nn.Module],
+        p_stochastic_dropout: float = 0.0,
+    ) -> None:
+        super().__init__()
+
+        proj: Sequence[nn.Module]
+        self.proj: nn.Module
+
+        should_proj = stride != 1 or in_channels != out_channels  # shortcut must change shape to match the main path
+        if should_proj:
+            proj = [nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=True)]
+            if stride == 2:  # average pooling is prepended so the shortcut is downsampled before the 1x1 conv
+                proj = [nn.AvgPool2d(kernel_size=3, stride=stride, padding=1)] + proj  # type: ignore
+            self.proj = nn.Sequential(*proj)
+        else:
+            self.proj = nn.Identity()  # type: ignore
+
+        mid_channels = int(out_channels * expansion_ratio)  # both widths are derived from out_channels
+        sqz_channels = int(out_channels * squeeze_ratio)
+
+        if p_stochastic_dropout:
+            self.stochastic_depth = StochasticDepth(p_stochastic_dropout, mode="row")  # type: ignore
+        else:
+            self.stochastic_depth = nn.Identity()  # type: ignore
+
+        _layers = OrderedDict()  # keys become the submodule names inside self.layers
+        _layers["pre_norm"] = norm_layer(in_channels)
+        _layers["conv_a"] = Conv2dNormActivation(  # 1x1 expansion to mid_channels
+            in_channels,
+            mid_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            activation_layer=activation_layer,
+            norm_layer=norm_layer,
+            inplace=None,
+        )
+        _layers["conv_b"] = Conv2dNormActivation(  # 3x3 depthwise conv (groups == channels); carries the stride
+            mid_channels,
+            mid_channels,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            activation_layer=activation_layer,
+            norm_layer=norm_layer,
+            groups=mid_channels,
+            inplace=None,
+        )
+        _layers["squeeze_excitation"] = SqueezeExcitation(mid_channels, sqz_channels, activation=nn.SiLU)
+        _layers["conv_c"] = nn.Conv2d(in_channels=mid_channels, out_channels=out_channels, kernel_size=1, bias=True)
+
+        self.layers = nn.Sequential(_layers)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x (Tensor): Input tensor with expected layout of [B, in_channels, H, W].
+        Returns:
+            Tensor: Output tensor with expected layout of [B, out_channels, H / stride, W / stride].
+        """
+        res = self.proj(x)  # shortcut branch
+        x = self.stochastic_depth(self.layers(x))  # main branch, passed through stochastic depth
+        return res + x
+
+
+class RelativePositionalMultiHeadAttention(nn.Module):
+    """Relative Positional Multi-Head Attention.
+
+    Args:
+        feat_dim (int): Number of input features. Must be divisible by ``head_dim`` (otherwise ``ValueError``).
+        head_dim (int): Number of features per head.
+        max_seq_len (int): Tokens per attention group; treated as a ``size x size`` window, ``size = int(sqrt(max_seq_len))``.
+    """
+
+    def __init__(
+        self,
+        feat_dim: int,
+        head_dim: int,
+        max_seq_len: int,
+    ) -> None:
+        super().__init__()
+
+        if feat_dim % head_dim != 0:
+            raise ValueError(f"feat_dim: {feat_dim} must be divisible by head_dim: {head_dim}")
+
+        self.n_heads = feat_dim // head_dim
+        self.head_dim = head_dim
+        self.size = int(math.sqrt(max_seq_len))  # NOTE: truncates; max_seq_len is not checked to be a perfect square
+        self.max_seq_len = max_seq_len
+
+        self.to_qkv = nn.Linear(feat_dim, self.n_heads * self.head_dim * 3)  # one projection producing q, k and v
+        self.scale_factor = feat_dim**-0.5  # NOTE: scaled by feat_dim, not by head_dim
+
+        self.merge = nn.Linear(self.head_dim * self.n_heads, feat_dim)
+        self.relative_position_bias_table = nn.parameter.Parameter(  # one learnable bias per (relative offset, head)
+            torch.empty(((2 * self.size - 1) * (2 * self.size - 1), self.n_heads), dtype=torch.float32),
+        )
+
+        self.register_buffer("relative_position_index", _get_relative_position_index(self.size, self.size))
+        # initialize the bias table with a truncated normal distribution
+        torch.nn.init.trunc_normal_(self.relative_position_bias_table, std=0.02)
+
+    def get_relative_positional_bias(self) -> torch.Tensor:  # returns [1, n_heads, max_seq_len, max_seq_len]
+        bias_index = self.relative_position_index.view(-1)  # type: ignore
+        relative_bias = self.relative_position_bias_table[bias_index].view(self.max_seq_len, self.max_seq_len, -1)  # type: ignore
+        relative_bias = relative_bias.permute(2, 0, 1).contiguous()  # heads first: [n_heads, L, L]
+        return relative_bias.unsqueeze(0)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x (Tensor): Input tensor with expected layout of [B, G, P, D]; attention runs over the P axis.
+        Returns:
+            Tensor: Output tensor with expected layout of [B, G, P, D].
+        """
+        B, G, P, D = x.shape
+        H, DH = self.n_heads, self.head_dim
+
+        qkv = self.to_qkv(x)
+        q, k, v = torch.chunk(qkv, 3, dim=-1)  # three equal slices of the last dimension
+
+        q = q.reshape(B, G, P, H, DH).permute(0, 1, 3, 2, 4)  # [B, G, H, P, DH]
+        k = k.reshape(B, G, P, H, DH).permute(0, 1, 3, 2, 4)  # [B, G, H, P, DH]
+        v = v.reshape(B, G, P, H, DH).permute(0, 1, 3, 2, 4)  # [B, G, H, P, DH]
+
+        k = k * self.scale_factor  # scaling is applied to the keys before the dot product
+        dot_prod = torch.einsum("B G H I D, B G H J D -> B G H I J", q, k)  # attention logits: [B, G, H, P, P]
+        pos_bias = self.get_relative_positional_bias()  # [1, H, L, L]; added by broadcasting, so P must match L
+
+        dot_prod = F.softmax(dot_prod + pos_bias, dim=-1)
+
+        out = torch.einsum("B G H I J, B G H J D -> B G H I D", dot_prod, v)
+        out = out.permute(0, 1, 3, 2, 4).reshape(B, G, P, D)  # merge heads back into the feature axis
+
+        out = self.merge(out)
+        return out
+
+
+class SwapAxes(nn.Module):
+    """Swap two axes (``a`` and ``b``) of a tensor using ``torch.swapaxes``."""
+
+    def __init__(self, a: int, b: int) -> None:
+        super().__init__()
+        self.a = a  # first axis index
+        self.b = b  # second axis index
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        res = torch.swapaxes(x, self.a, self.b)
+        return res
+
+
+class WindowPartition(nn.Module):
+    """
+    Partition the input tensor into non-overlapping windows.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x: Tensor, p: int) -> Tensor:
+        """
+        Args:
+            x (Tensor): Input tensor with expected layout of [B, C, H, W].
+            p (int): Side length P of each square window; H and W are split into H/P and W/P windows.
+        Returns:
+            Tensor: Output tensor with expected layout of [B, (H/P * W/P), P*P, C].
+        """
+        B, C, H, W = x.shape
+        P = p
+        # chunk up H and W dimensions
+        x = x.reshape(B, C, H // P, P, W // P, P)
+        x = x.permute(0, 2, 4, 3, 5, 1)  # -> [B, H/P, W/P, P, P, C]
+        # collapse the window grid into one axis and each P x P window into one axis
+        x = x.reshape(B, (H // P) * (W // P), P * P, C)
+        return x
+
+
+class WindowDepartition(nn.Module):
+    """
+    Departition the input tensor of non-overlapping windows into a feature volume of layout [B, C, H, W].
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x: Tensor, p: int, h_partitions: int, w_partitions: int) -> Tensor:
+        """
+        Args:
+            x (Tensor): Input tensor with expected layout of [B, (H/P * W/P), P*P, C].
+            p (int): Side length P of each square window.
+            h_partitions (int): Number of vertical partitions.
+            w_partitions (int): Number of horizontal partitions.
+        Returns:
+            Tensor: Output tensor with expected layout of [B, C, H, W].
+        """
+        B, G, PP, C = x.shape
+        P = p
+        HP, WP = h_partitions, w_partitions
+        # split the window-grid axis into (HP, WP) and the P * P axis into two P tile dimensions
+        x = x.reshape(B, HP, WP, P, P, C)
+        # permute into B, C, HP, P, WP, P
+        x = x.permute(0, 5, 1, 3, 2, 4)
+        # reshape into B, C, H, W
+        x = x.reshape(B, C, HP * P, WP * P)
+        return x
+
+
+class PartitionAttentionLayer(nn.Module):
+    """
+    Layer for partitioning the input tensor into non-overlapping windows and applying attention to each window.
+
+    Args:
+        in_channels (int): Number of input channels.
+        head_dim (int): Dimension of each attention head.
+        partition_size (int): Size of the partitions.
+        partition_type (str): Type of partitioning to use. Can be either "grid" or "window" (otherwise ``ValueError``).
+        grid_size (Tuple[int, int]): Spatial size of the incoming feature map; needed at construction time.
+        mlp_ratio (int): Ratio of the feature size expansion in the MLP layer.
+        activation_layer (Callable[..., nn.Module]): Activation function to use.
+        norm_layer (Callable[..., nn.Module]): Normalization for the attention branch (the MLP always uses ``nn.LayerNorm``).
+        attention_dropout (float): Dropout probability for the attention layer.
+        mlp_dropout (float): Dropout probability for the MLP layer.
+        p_stochastic_dropout (float): Probability given to ``StochasticDepth(mode="row")`` on both residual branches.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        head_dim: int,
+        # partitioning parameters
+        partition_size: int,
+        partition_type: str,
+        # grid size needs to be known at initialization time
+        # because we need to know how many relative offsets there are in the grid
+        grid_size: tuple[int, int],
+        mlp_ratio: int,
+        activation_layer: Callable[..., nn.Module],
+        norm_layer: Callable[..., nn.Module],
+        attention_dropout: float,
+        mlp_dropout: float,
+        p_stochastic_dropout: float,
+    ) -> None:
+        super().__init__()
+
+        self.n_heads = in_channels // head_dim
+        self.head_dim = head_dim
+        self.n_partitions = grid_size[0] // partition_size  # NOTE: derived from the first grid dimension only
+        self.partition_type = partition_type
+        self.grid_size = grid_size
+
+        if partition_type not in ["grid", "window"]:
+            raise ValueError("partition_type must be either 'grid' or 'window'")
+
+        if partition_type == "window":
+            self.p, self.g = partition_size, self.n_partitions  # windows of side partition_size
+        else:
+            self.p, self.g = self.n_partitions, partition_size  # grid mode: the two roles are exchanged
+
+        self.partition_op = WindowPartition()
+        self.departition_op = WindowDepartition()
+        self.partition_swap = SwapAxes(-2, -3) if partition_type == "grid" else nn.Identity()
+        self.departition_swap = SwapAxes(-2, -3) if partition_type == "grid" else nn.Identity()  # undoes the swap
+
+        self.attn_layer = nn.Sequential(
+            norm_layer(in_channels),
+            # it's always going to be partition_size ** 2 because
+            # of the axis swap in the case of grid partitioning
+            RelativePositionalMultiHeadAttention(in_channels, head_dim, partition_size**2),
+            nn.Dropout(attention_dropout),
+        )
+
+        # pre-normalization similar to transformer layers
+        self.mlp_layer = nn.Sequential(
+            nn.LayerNorm(in_channels),
+            nn.Linear(in_channels, in_channels * mlp_ratio),
+            activation_layer(),
+            nn.Linear(in_channels * mlp_ratio, in_channels),
+            nn.Dropout(mlp_dropout),
+        )
+
+        # stochastic depth, shared by the attention and MLP residual branches in forward
+        self.stochastic_dropout = StochasticDepth(p_stochastic_dropout, mode="row")
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x (Tensor): Input tensor with expected layout of [B, C, H, W].
+        Returns:
+            Tensor: Output tensor with expected layout of [B, C, H, W].
+        """
+
+        # Undefined behavior if H or W are not divisible by p
+        # https://github.com/google-research/maxvit/blob/da76cf0d8a6ec668cc31b399c4126186da7da944/maxvit/models/maxvit.py#L766
+        gh, gw = self.grid_size[0] // self.p, self.grid_size[1] // self.p  # partitions along H and along W
+        torch._assert(
+            self.grid_size[0] % self.p == 0 and self.grid_size[1] % self.p == 0,
+            "Grid size must be divisible by partition size. Got grid size of {} and partition size of {}".format(
+                self.grid_size, self.p
+            ),
+        )
+
+        x = self.partition_op(x, self.p)  # [B, gh * gw, p * p, C]
+        x = self.partition_swap(x)  # grid mode only: exchange the two middle axes
+        x = x + self.stochastic_dropout(self.attn_layer(x))  # residual attention branch
+        x = x + self.stochastic_dropout(self.mlp_layer(x))  # residual MLP branch
+        x = self.departition_swap(x)
+        x = self.departition_op(x, self.p, gh, gw)  # back to [B, C, H, W]
+
+        return x
+
+
+class MaxVitLayer(nn.Module):
+    """
+    MaxVit layer consisting of a MBConv layer followed by a PartitionAttentionLayer with `window` and a PartitionAttentionLayer with `grid`.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        expansion_ratio (float): Expansion ratio in the bottleneck.
+        squeeze_ratio (float): Squeeze ratio in the SE Layer.
+        stride (int): Stride of the depthwise convolution.
+        activation_layer (Callable[..., nn.Module]): Activation function.
+        norm_layer (Callable[..., nn.Module]): Normalization for the MBConv only (attention layers use ``nn.LayerNorm``).
+        head_dim (int): Dimension of the attention heads.
+        mlp_ratio (int): Ratio of the MLP layer.
+        mlp_dropout (float): Dropout probability for the MLP layer.
+        attention_dropout (float): Dropout probability for the attention layer.
+        p_stochastic_dropout (float): Probability of stochastic depth, passed to all three sub-layers.
+        partition_size (int): Size of the partitions.
+        grid_size (Tuple[int, int]): Size of the feature grid seen by the attention layers (after the MBConv).
+    """
+
+    def __init__(
+        self,
+        # conv parameters
+        in_channels: int,
+        out_channels: int,
+        squeeze_ratio: float,
+        expansion_ratio: float,
+        stride: int,
+        # conv + transformer parameters
+        norm_layer: Callable[..., nn.Module],
+        activation_layer: Callable[..., nn.Module],
+        # transformer parameters
+        head_dim: int,
+        mlp_ratio: int,
+        mlp_dropout: float,
+        attention_dropout: float,
+        p_stochastic_dropout: float,
+        # partitioning parameters
+        partition_size: int,
+        grid_size: tuple[int, int],
+    ) -> None:
+        super().__init__()
+
+        layers: OrderedDict = OrderedDict()  # keys become the submodule names inside self.layers
+
+        # convolutional layer
+        layers["MBconv"] = MBConv(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            expansion_ratio=expansion_ratio,
+            squeeze_ratio=squeeze_ratio,
+            stride=stride,
+            activation_layer=activation_layer,
+            norm_layer=norm_layer,
+            p_stochastic_dropout=p_stochastic_dropout,
+        )
+        # attention layers, block -> grid
+        layers["window_attention"] = PartitionAttentionLayer(
+            in_channels=out_channels,  # attention runs on the MBConv output
+            head_dim=head_dim,
+            partition_size=partition_size,
+            partition_type="window",
+            grid_size=grid_size,
+            mlp_ratio=mlp_ratio,
+            activation_layer=activation_layer,
+            norm_layer=nn.LayerNorm,  # fixed; the norm_layer argument is not used here
+            attention_dropout=attention_dropout,
+            mlp_dropout=mlp_dropout,
+            p_stochastic_dropout=p_stochastic_dropout,
+        )
+        layers["grid_attention"] = PartitionAttentionLayer(
+            in_channels=out_channels,
+            head_dim=head_dim,
+            partition_size=partition_size,
+            partition_type="grid",
+            grid_size=grid_size,
+            mlp_ratio=mlp_ratio,
+            activation_layer=activation_layer,
+            norm_layer=nn.LayerNorm,  # fixed; the norm_layer argument is not used here
+            attention_dropout=attention_dropout,
+            mlp_dropout=mlp_dropout,
+            p_stochastic_dropout=p_stochastic_dropout,
+        )
+        self.layers = nn.Sequential(layers)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x (Tensor): Input tensor of shape (B, C, H, W).
+        Returns:
+            Tensor: Output tensor of shape (B, C, H, W).
+        """
+        x = self.layers(x)  # MBconv -> window_attention -> grid_attention
+        return x
+
+
+class MaxVitBlock(nn.Module):
+    """
+    A MaxVit block consisting of `n_layers` MaxVit layers.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        expansion_ratio (float): Expansion ratio in the bottleneck.
+        squeeze_ratio (float): Squeeze ratio in the SE Layer.
+        activation_layer (Callable[..., nn.Module]): Activation function.
+        norm_layer (Callable[..., nn.Module]): Normalization function.
+        head_dim (int): Dimension of the attention heads.
+        mlp_ratio (int): Ratio of the MLP layer.
+        mlp_dropout (float): Dropout probability for the MLP layer.
+        attention_dropout (float): Dropout probability for the attention layer.
+        partition_size (int): Size of the partitions.
+        input_grid_size (Tuple[int, int]): Size of the input feature grid.
+        n_layers (int): Number of layers in the block.
+        p_stochastic (List[float]): List of probabilities for stochastic depth for each layer.
+            Must have length ``n_layers`` (otherwise ``ValueError``).
+    """
+
+    def __init__(
+        self,
+        # conv parameters
+        in_channels: int,
+        out_channels: int,
+        squeeze_ratio: float,
+        expansion_ratio: float,
+        # conv + transformer parameters
+        norm_layer: Callable[..., nn.Module],
+        activation_layer: Callable[..., nn.Module],
+        # transformer parameters
+        head_dim: int,
+        mlp_ratio: int,
+        mlp_dropout: float,
+        attention_dropout: float,
+        # partitioning parameters
+        partition_size: int,
+        input_grid_size: tuple[int, int],
+        # number of layers
+        n_layers: int,
+        p_stochastic: list[float],
+    ) -> None:
+        super().__init__()
+        if not len(p_stochastic) == n_layers:
+            raise ValueError(f"p_stochastic must have length n_layers={n_layers}, got p_stochastic={p_stochastic}.")
+
+        self.layers = nn.ModuleList()
+        # account for the stride of the first layer; this is the grid size every layer's attention sees
+        self.grid_size = _get_conv_output_shape(input_grid_size, kernel_size=3, stride=2, padding=1)
+
+        for idx, p in enumerate(p_stochastic):
+            stride = 2 if idx == 0 else 1  # only the first layer downsamples
+            self.layers += [
+                MaxVitLayer(
+                    in_channels=in_channels if idx == 0 else out_channels,  # later layers keep the channel count
+                    out_channels=out_channels,
+                    squeeze_ratio=squeeze_ratio,
+                    expansion_ratio=expansion_ratio,
+                    stride=stride,
+                    norm_layer=norm_layer,
+                    activation_layer=activation_layer,
+                    head_dim=head_dim,
+                    mlp_ratio=mlp_ratio,
+                    mlp_dropout=mlp_dropout,
+                    attention_dropout=attention_dropout,
+                    partition_size=partition_size,
+                    grid_size=self.grid_size,
+                    p_stochastic_dropout=p,
+                ),
+            ]
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x (Tensor): Input tensor of shape (B, C, H, W).
+        Returns:
+            Tensor: Output tensor of shape (B, C, H, W).
+        """
+        for layer in self.layers:  # layers are applied in the order they were built
+            x = layer(x)
+        return x
+
+
+class MaxVit(nn.Module):
+    """
+    Implements MaxVit Transformer from the `MaxViT: Multi-Axis Vision Transformer <https://arxiv.org/abs/2204.01697>`_ paper.
+    Args:
+        input_size (Tuple[int, int]): Size of the input image.
+        stem_channels (int): Number of channels in the stem.
+        partition_size (int): Size of the partitions.
+        block_channels (List[int]): Number of channels in each block.
+        block_layers (List[int]): Number of layers in each block.
+        stochastic_depth_prob (float): Probability of stochastic depth. Expands to a list of probabilities for each layer that scales linearly to the specified value.
+        squeeze_ratio (float): Squeeze ratio in the SE Layer. Default: 0.25.
+        expansion_ratio (float): Expansion ratio in the MBConv bottleneck. Default: 4.
+        norm_layer (Callable[..., nn.Module]): Normalization function. Default: None (setting to None will produce a `BatchNorm2d(eps=1e-3, momentum=0.01)`).
+        activation_layer (Callable[..., nn.Module]): Activation function Default: nn.GELU.
+        head_dim (int): Dimension of the attention heads.
+        mlp_ratio (int): Expansion ratio of the MLP layer. Default: 4.
+        mlp_dropout (float): Dropout probability for the MLP layer. Default: 0.0.
+        attention_dropout (float): Dropout probability for the attention layer. Default: 0.0.
+        num_classes (int): Number of classes. Default: 1000.
+    """
+
+    def __init__(
+        self,
+        # input size parameters
+        input_size: tuple[int, int],
+        # stem and task parameters
+        stem_channels: int,
+        # partitioning parameters
+        partition_size: int,
+        # block parameters
+        block_channels: list[int],
+        block_layers: list[int],
+        # attention head dimensions
+        head_dim: int,
+        stochastic_depth_prob: float,
+        # conv + transformer parameters
+        # norm_layer is applied only to the conv layers
+        # activation_layer is applied both to conv and transformer layers
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+        activation_layer: Callable[..., nn.Module] = nn.GELU,
+        # conv parameters
+        squeeze_ratio: float = 0.25,
+        expansion_ratio: float = 4,
+        # transformer parameters
+        mlp_ratio: int = 4,
+        mlp_dropout: float = 0.0,
+        attention_dropout: float = 0.0,
+        # task parameters
+        num_classes: int = 1000,
+    ) -> None:
+        super().__init__()
+        _log_api_usage_once(self)
+
+        input_channels = 3  # the stem's input channel count is fixed here, not configurable
+
+        # https://github.com/google-research/maxvit/blob/da76cf0d8a6ec668cc31b399c4126186da7da944/maxvit/models/maxvit.py#L1029-L1030
+        # for the exact parameters used in batchnorm
+        if norm_layer is None:
+            norm_layer = partial(nn.BatchNorm2d, eps=1e-3, momentum=0.01)
+
+        # Make sure input size will be divisible by the partition size in all blocks
+        # Undefined behavior if H or W are not divisible by p
+        # https://github.com/google-research/maxvit/blob/da76cf0d8a6ec668cc31b399c4126186da7da944/maxvit/models/maxvit.py#L766
+        block_input_sizes = _make_block_input_shapes(input_size, len(block_channels))
+        for idx, block_input_size in enumerate(block_input_sizes):
+            if block_input_size[0] % partition_size != 0 or block_input_size[1] % partition_size != 0:
+                raise ValueError(
+                    f"Input size {block_input_size} of block {idx} is not divisible by partition size {partition_size}. "
+                    f"Consider changing the partition size or the input size.\n"
+                    f"Current configuration yields the following block input sizes: {block_input_sizes}."
+                )
+
+        # stem
+        self.stem = nn.Sequential(
+            Conv2dNormActivation(  # first stem conv: stride 2, with norm and activation, no bias
+                input_channels,
+                stem_channels,
+                3,
+                stride=2,
+                norm_layer=norm_layer,
+                activation_layer=activation_layer,
+                bias=False,
+                inplace=None,
+            ),
+            Conv2dNormActivation(  # second stem conv: stride 1, no norm, no activation, with bias
+                stem_channels, stem_channels, 3, stride=1, norm_layer=None, activation_layer=None, bias=True
+            ),
+        )
+
+        # account for stem stride
+        input_size = _get_conv_output_shape(input_size, kernel_size=3, stride=2, padding=1)
+        self.partition_size = partition_size
+
+        # blocks
+        self.blocks = nn.ModuleList()
+        in_channels = [stem_channels] + block_channels[:-1]  # each block consumes the previous block's output width
+        out_channels = block_channels
+
+        # precompute the stochastic depth probabilities from 0 to stochastic_depth_prob
+        # one probability per layer (sum(block_layers) in total), evenly spaced
+        # over the range [0, stochastic_depth_prob]
+        p_stochastic = np.linspace(0, stochastic_depth_prob, sum(block_layers)).tolist()
+
+        p_idx = 0  # offset of the current block's first layer within p_stochastic
+        for in_channel, out_channel, num_layers in zip(in_channels, out_channels, block_layers):
+            self.blocks.append(
+                MaxVitBlock(
+                    in_channels=in_channel,
+                    out_channels=out_channel,
+                    squeeze_ratio=squeeze_ratio,
+                    expansion_ratio=expansion_ratio,
+                    norm_layer=norm_layer,
+                    activation_layer=activation_layer,
+                    head_dim=head_dim,
+                    mlp_ratio=mlp_ratio,
+                    mlp_dropout=mlp_dropout,
+                    attention_dropout=attention_dropout,
+                    partition_size=partition_size,
+                    input_grid_size=input_size,
+                    n_layers=num_layers,
+                    p_stochastic=p_stochastic[p_idx : p_idx + num_layers],
+                ),
+            )
+            input_size = self.blocks[-1].grid_size  # type: ignore[assignment]  # next block starts from this grid
+            p_idx += num_layers
+
+        # see https://github.com/google-research/maxvit/blob/da76cf0d8a6ec668cc31b399c4126186da7da944/maxvit/models/maxvit.py#L1137-L1158
+        # for why there is Linear -> Tanh -> Linear
+        self.classifier = nn.Sequential(
+            nn.AdaptiveAvgPool2d(1),
+            nn.Flatten(),
+            nn.LayerNorm(block_channels[-1]),
+            nn.Linear(block_channels[-1], block_channels[-1]),
+            nn.Tanh(),
+            nn.Linear(block_channels[-1], num_classes, bias=False),
+        )
+
+        self._init_weights()
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Apply the stem, each module in ``self.blocks`` in order, then the classifier."""
+        features = self.stem(x)
+        for stage in self.blocks:
+            features = stage(features)
+        return self.classifier(features)
+
+    def _init_weights(self):
+        """Draw conv/linear weights from a normal with std 0.02 and zero their biases; reset BatchNorm2d."""
+        for module in self.modules():
+            if isinstance(module, (nn.Conv2d, nn.Linear)):
+                # Convolutions and linear layers share one scheme; the bias may be absent.
+                nn.init.normal_(module.weight, std=0.02)
+                if module.bias is not None:
+                    nn.init.zeros_(module.bias)
+                continue
+            if isinstance(module, nn.BatchNorm2d):
+                # weight 1, bias 0
+                nn.init.constant_(module.weight, 1)
+                nn.init.constant_(module.bias, 0)
+
+
+def _maxvit(
+    stem_channels: int,
+    block_channels: list[int],
+    block_layers: list[int],
+    stochastic_depth_prob: float,
+    partition_size: int,
+    head_dim: int,
+    weights: Optional[WeightsEnum] = None,
+    progress: bool = False,
+    **kwargs: Any,
+) -> MaxVit:
+    """Instantiate a ``MaxVit`` and, when ``weights`` is given, load that checkpoint into it.
+
+    The stem, block, partitioning and transformer arguments are forwarded to ``MaxVit`` unchanged.
+    With ``weights``, ``num_classes`` and ``input_size`` in ``kwargs`` are set from ``weights.meta``
+    through ``_ovewrite_named_param`` (a square ``min_size`` is asserted), and ``progress`` is passed
+    to ``weights.get_state_dict``. ``input_size`` falls back to ``(224, 224)`` when not supplied;
+    any remaining ``kwargs`` go to ``MaxVit``.
+    """
+    if weights is not None:
+        meta = weights.meta
+        _ovewrite_named_param(kwargs, "num_classes", len(meta["categories"]))
+        min_height, min_width = meta["min_size"][0], meta["min_size"][1]
+        assert min_height == min_width
+        _ovewrite_named_param(kwargs, "input_size", meta["min_size"])
+    # Remove input_size from kwargs so it is passed to MaxVit exactly once.
+    input_size = kwargs.pop("input_size", (224, 224))
+    model = MaxVit(
+        stem_channels=stem_channels,
+        block_channels=block_channels,
+        block_layers=block_layers,
+        stochastic_depth_prob=stochastic_depth_prob,
+        head_dim=head_dim,
+        partition_size=partition_size,
+        input_size=input_size,
+        **kwargs,
+    )
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+    return model
+
+
+class MaxVit_T_Weights(WeightsEnum):
+    # Pretrained weight entries for ``maxvit_t``; ``_maxvit`` reads "categories" and "min_size" from ``meta``.
+    # ``DEFAULT`` is an alias of ``IMAGENET1K_V1``.
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/maxvit_t-bc5ab103.pth",
+        # Preprocessing preset: resize 224, crop 224, bicubic interpolation.
+        transforms=partial(
+            ImageClassification,
+            crop_size=224,
+            resize_size=224,
+            interpolation=InterpolationMode.BICUBIC,
+        ),
+        meta={
+            "categories": _IMAGENET_CATEGORIES,
+            "num_params": 30919624,
+            "min_size": (224, 224),
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#maxvit",
+            "_metrics": {"ImageNet-1K": {"acc@1": 83.700, "acc@5": 96.722}},
+            "_ops": 5.558,
+            "_file_size": 118.769,
+            "_docs": """These weights reproduce closely the results of the paper using a similar training recipe.
+            They were trained with a BatchNorm2D momentum of 0.99 instead of the more correct 0.01.""",
+        },
+    )
+    DEFAULT = IMAGENET1K_V1
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", MaxVit_T_Weights.IMAGENET1K_V1))
+def maxvit_t(*, weights: Optional[MaxVit_T_Weights] = None, progress: bool = True, **kwargs: Any) -> MaxVit:
+    """
+    Build the maxvit_t architecture introduced in
+    `MaxViT: Multi-Axis Vision Transformer <https://arxiv.org/abs/2204.01697>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.MaxVit_T_Weights`, optional): Pretrained
+            weights to load, checked with ``MaxVit_T_Weights.verify``. See
+            :class:`~torchvision.models.MaxVit_T_Weights` below for
+            more details, and possible values. ``None`` (the default) loads
+            no pre-trained weights.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: Extra parameters passed to the ``torchvision.models.maxvit.MaxVit``
+            constructor (via ``_maxvit``). Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/maxvit.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.MaxVit_T_Weights
+        :members:
+    """
+    # Fixed configuration of this variant; only weights, progress and kwargs come from the caller.
+    variant_config = dict(
+        stem_channels=64,
+        block_channels=[64, 128, 256, 512],
+        block_layers=[2, 2, 5, 2],
+        head_dim=32,
+        stochastic_depth_prob=0.2,
+        partition_size=7,
+    )
+
+    checked_weights = MaxVit_T_Weights.verify(weights)
+
+    return _maxvit(**variant_config, weights=checked_weights, progress=progress, **kwargs)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/mnasnet.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/mnasnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..0471b19a6d59618385df3e1ab0e9ecf65bb21dcf
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/mnasnet.py
@@ -0,0 +1,434 @@
+import warnings
+from functools import partial
+from typing import Any, Optional
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from ..transforms._presets import ImageClassification
+from ..utils import _log_api_usage_once
+from ._api import register_model, Weights, WeightsEnum
+from ._meta import _IMAGENET_CATEGORIES
+from ._utils import _ovewrite_named_param, handle_legacy_interface
+
+
+__all__ = [
+    "MNASNet",
+    "MNASNet0_5_Weights",
+    "MNASNet0_75_Weights",
+    "MNASNet1_0_Weights",
+    "MNASNet1_3_Weights",
+    "mnasnet0_5",
+    "mnasnet0_75",
+    "mnasnet1_0",
+    "mnasnet1_3",
+]
+
+
+# The paper's momentum of 0.9997 follows the TensorFlow convention; the equivalent
+# PyTorch BatchNorm momentum is 1.0 minus the TensorFlow value.
+_BN_MOMENTUM = 1 - 0.9997
+
+
+class _InvertedResidual(nn.Module):
+    """Pointwise expansion -> depthwise conv -> linear pointwise projection, each followed by BatchNorm.
+
+    The input is added back to the output only when ``in_ch == out_ch`` and ``stride == 1``.
+    Raises ValueError for a ``stride`` other than 1 or 2, or a ``kernel_size`` other than 3 or 5.
+    """
+
+    def __init__(
+        self, in_ch: int, out_ch: int, kernel_size: int, stride: int, expansion_factor: int, bn_momentum: float = 0.1
+    ) -> None:
+        super().__init__()
+        if stride not in (1, 2):
+            raise ValueError(f"stride should be 1 or 2 instead of {stride}")
+        if kernel_size not in (3, 5):
+            raise ValueError(f"kernel_size should be 3 or 5 instead of {kernel_size}")
+        hidden = in_ch * expansion_factor
+        pad = kernel_size // 2
+        self.apply_residual = stride == 1 and in_ch == out_ch
+        expand = [nn.Conv2d(in_ch, hidden, 1, bias=False), nn.BatchNorm2d(hidden, momentum=bn_momentum)]
+        depthwise = [
+            nn.Conv2d(hidden, hidden, kernel_size, padding=pad, stride=stride, groups=hidden, bias=False),
+            nn.BatchNorm2d(hidden, momentum=bn_momentum),
+        ]
+        # The projection is linear: no activation follows its BatchNorm.
+        project = [nn.Conv2d(hidden, out_ch, 1, bias=False), nn.BatchNorm2d(out_ch, momentum=bn_momentum)]
+        self.layers = nn.Sequential(*expand, nn.ReLU(inplace=True), *depthwise, nn.ReLU(inplace=True), *project)
+
+    def forward(self, input: Tensor) -> Tensor:
+        out = self.layers(input)
+        return out + input if self.apply_residual else out
+
+
+def _stack(
+    in_ch: int, out_ch: int, kernel_size: int, stride: int, exp_factor: int, repeats: int, bn_momentum: float
+) -> nn.Sequential:
+    """Chain ``repeats`` ``_InvertedResidual`` blocks into one ``nn.Sequential``.
+
+    Raises ValueError when ``repeats`` is smaller than 1."""
+    if repeats < 1:
+        raise ValueError(f"repeats should be >= 1, instead got {repeats}")
+    # Only the leading block maps in_ch -> out_ch with the requested stride; the rest keep out_ch at stride 1.
+    head = _InvertedResidual(in_ch, out_ch, kernel_size, stride, exp_factor, bn_momentum=bn_momentum)
+    follow_up = partial(_InvertedResidual, out_ch, out_ch, kernel_size, 1, exp_factor, bn_momentum=bn_momentum)
+    return nn.Sequential(head, *(follow_up() for _ in range(1, repeats)))
+
+
+def _round_to_multiple_of(val: float, divisor: int, round_up_bias: float = 0.9) -> int:
+    """Snap ``val`` to a multiple of ``divisor`` (never below ``divisor``) by adding half a divisor and
+    flooring; if that lands under ``round_up_bias * val``, go one multiple higher: (83, 8) -> 80, (84, 8) -> 88."""
+    if not (0.0 < round_up_bias < 1.0):
+        raise ValueError(f"round_up_bias should be greater than 0.0 and smaller than 1.0 instead of {round_up_bias}")
+    quotient = int(val + divisor / 2) // divisor
+    snapped = max(divisor, quotient * divisor)
+    return snapped if snapped >= round_up_bias * val else snapped + divisor
+
+
+def _get_depths(alpha: float) -> list[int]:
+    """Return the eight base channel counts multiplied by ``alpha``, each passed through
+    ``_round_to_multiple_of`` with a divisor of 8."""
+    base_widths = (32, 16, 24, 40, 80, 96, 192, 320)
+    return [_round_to_multiple_of(width * alpha, 8) for width in base_widths]
+
+
+class MNASNet(torch.nn.Module):
+    """MNASNet (B1 variant), as described in https://arxiv.org/abs/1807.11626. ``alpha`` (> 0, else
+    ValueError) scales the layer widths; ``dropout`` is the probability used by the classifier.
+    >>> model = MNASNet(1.0, num_classes=1000)
+    >>> x = torch.rand(1, 3, 224, 224)
+    >>> y = model(x)
+    >>> y.dim()
+    2
+    >>> y.nelement()
+    1000
+    """
+
+    # Checkpoint version checked in _load_from_state_dict. Version 2 adds depth scaling in the initial stages.
+    _version = 2
+
+    def __init__(self, alpha: float, num_classes: int = 1000, dropout: float = 0.2) -> None:
+        super().__init__()
+        _log_api_usage_once(self)
+        if alpha <= 0.0:
+            raise ValueError(f"alpha should be greater than 0.0 instead of {alpha}")
+        self.alpha = alpha
+        self.num_classes = num_classes
+        depths = _get_depths(alpha)
+        layers = [
+            # First layer: regular conv with stride 2.
+            nn.Conv2d(3, depths[0], 3, padding=1, stride=2, bias=False),
+            nn.BatchNorm2d(depths[0], momentum=_BN_MOMENTUM),
+            nn.ReLU(inplace=True),
+            # Depthwise separable, no skip.
+            nn.Conv2d(depths[0], depths[0], 3, padding=1, stride=1, groups=depths[0], bias=False),
+            nn.BatchNorm2d(depths[0], momentum=_BN_MOMENTUM),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(depths[0], depths[1], 1, padding=0, stride=1, bias=False),
+            nn.BatchNorm2d(depths[1], momentum=_BN_MOMENTUM),
+            # MNASNet blocks: stacks of inverted residuals (args: in, out, kernel, stride, expansion, repeats).
+            _stack(depths[1], depths[2], 3, 2, 3, 3, _BN_MOMENTUM),
+            _stack(depths[2], depths[3], 5, 2, 3, 3, _BN_MOMENTUM),
+            _stack(depths[3], depths[4], 5, 2, 6, 3, _BN_MOMENTUM),
+            _stack(depths[4], depths[5], 3, 1, 6, 2, _BN_MOMENTUM),
+            _stack(depths[5], depths[6], 5, 2, 6, 4, _BN_MOMENTUM),
+            _stack(depths[6], depths[7], 3, 1, 6, 1, _BN_MOMENTUM),
+            # Final mapping to the 1280-channel classifier input.
+            nn.Conv2d(depths[7], 1280, 1, padding=0, stride=1, bias=False),
+            nn.BatchNorm2d(1280, momentum=_BN_MOMENTUM),
+            nn.ReLU(inplace=True),
+        ]
+        self.layers = nn.Sequential(*layers)
+        self.classifier = nn.Sequential(nn.Dropout(p=dropout, inplace=True), nn.Linear(1280, num_classes))
+        # Weight init: Kaiming-normal convs, BatchNorm weight 1 / bias 0, Kaiming-uniform linear with zero bias.
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.Linear):
+                nn.init.kaiming_uniform_(m.weight, mode="fan_out", nonlinearity="sigmoid")
+                nn.init.zeros_(m.bias)
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = self.layers(x)
+        # Averaging over dims 2 and 3 is equivalent to a global avgpool followed by dropping H and W.
+        x = x.mean([2, 3])
+        return self.classifier(x)
+
+    def _load_from_state_dict(
+        self,
+        state_dict: dict,
+        prefix: str,
+        local_metadata: dict,
+        strict: bool,
+        missing_keys: list[str],
+        unexpected_keys: list[str],
+        error_msgs: list[str],
+    ) -> None:
+        version = local_metadata.get("version", None)
+        if version not in [1, 2]:
+            raise ValueError(f"version should be set to 1 or 2 instead of {version}")
+        # A missing "version" entry yields None and is rejected by the check above.
+        if version == 1 and not self.alpha == 1.0:
+            # In the initial version of the model (v1), the stem had a fixed size
+            # (32 -> 32 -> 16 channels) regardless of alpha. All other layer
+            # configurations were the same. Replace the first nine layers so this
+            # model is identical to v1. A model with alpha 1.0 is unaffected.
+            depths = _get_depths(self.alpha)
+            v1_stem = [
+                nn.Conv2d(3, 32, 3, padding=1, stride=2, bias=False),
+                nn.BatchNorm2d(32, momentum=_BN_MOMENTUM),
+                nn.ReLU(inplace=True),
+                nn.Conv2d(32, 32, 3, padding=1, stride=1, groups=32, bias=False),
+                nn.BatchNorm2d(32, momentum=_BN_MOMENTUM),
+                nn.ReLU(inplace=True),
+                nn.Conv2d(32, 16, 1, padding=0, stride=1, bias=False),
+                nn.BatchNorm2d(16, momentum=_BN_MOMENTUM),
+                _stack(16, depths[2], 3, 2, 3, 3, _BN_MOMENTUM),
+            ]
+            for idx, layer in enumerate(v1_stem):
+                self.layers[idx] = layer
+
+            # The model is now identical to v1, and must be saved as such, so the instance version is lowered.
+            self._version = 1
+            warnings.warn(
+                "A new version of MNASNet model has been implemented. "
+                "Your checkpoint was saved using the previous version. "
+                "This checkpoint will load and work as before, but "
+                "you may want to upgrade by training a newer model or "
+                "transfer learning from an updated ImageNet checkpoint.",
+                UserWarning,
+            )
+
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+        )
+
+
+# Metadata shared by the MNASNet weight entries; keys an entry lists after the
+# ``**_COMMON_META`` expansion (e.g. "recipe") replace the shared value.
+_COMMON_META = dict(
+    min_size=(1, 1), categories=_IMAGENET_CATEGORIES, recipe="https://github.com/1e100/mnasnet_trainer"
+)
+
+
+class MNASNet0_5_Weights(WeightsEnum):
+    # Pretrained weight entries accepted by ``mnasnet0_5``.
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/mnasnet0.5_top1_67.823-3ffadce67e.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            # Shared MNASNet metadata first, then the values specific to this checkpoint.
+            **_COMMON_META,
+            "num_params": 2218512,
+            "_metrics": {
+                "ImageNet-1K": {"acc@1": 67.734, "acc@5": 87.490},
+            },
+            "_ops": 0.104,
+            "_file_size": 8.591,
+            "_docs": """These weights reproduce closely the results of the paper.""",
+        },
+    )
+    # The only entry is also the default.
+    DEFAULT = IMAGENET1K_V1
+
+
+class MNASNet0_75_Weights(WeightsEnum):
+    # Pretrained weight entries accepted by ``mnasnet0_75``.
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/mnasnet0_75-7090bc5f.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,
+            # Listed after the expansion, so this replaces the shared "recipe".
+            "recipe": "https://github.com/pytorch/vision/pull/6019",
+            "num_params": 3170208,
+            "_metrics": {
+                "ImageNet-1K": {"acc@1": 71.180, "acc@5": 90.496},
+            },
+            "_ops": 0.215,
+            "_file_size": 12.303,
+            "_docs": """
+                These weights were trained from scratch by using TorchVision's `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    # The only entry is also the default.
+    DEFAULT = IMAGENET1K_V1
+
+
+class MNASNet1_0_Weights(WeightsEnum):
+    # Pretrained weight entries accepted by ``mnasnet1_0``.
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/mnasnet1.0_top1_73.512-f206786ef8.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            # Shared MNASNet metadata first, then the values specific to this checkpoint.
+            **_COMMON_META,
+            "num_params": 4383312,
+            "_metrics": {
+                "ImageNet-1K": {"acc@1": 73.456, "acc@5": 91.510},
+            },
+            "_ops": 0.314,
+            "_file_size": 16.915,
+            "_docs": """These weights reproduce closely the results of the paper.""",
+        },
+    )
+    # The only entry is also the default.
+    DEFAULT = IMAGENET1K_V1
+
+
+class MNASNet1_3_Weights(WeightsEnum):
+    # Pretrained weight entries accepted by ``mnasnet1_3``.
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/mnasnet1_3-a4c69d6f.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,
+            # Listed after the expansion, so this replaces the shared "recipe".
+            "recipe": "https://github.com/pytorch/vision/pull/6019",
+            "num_params": 6282256,
+            "_metrics": {
+                "ImageNet-1K": {"acc@1": 76.506, "acc@5": 93.522},
+            },
+            "_ops": 0.526,
+            "_file_size": 24.246,
+            "_docs": """
+                These weights were trained from scratch by using TorchVision's `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    # The only entry is also the default.
+    DEFAULT = IMAGENET1K_V1
+
+
+def _mnasnet(alpha: float, weights: Optional[WeightsEnum], progress: bool, **kwargs: Any) -> MNASNet:
+    """Build ``MNASNet(alpha, **kwargs)`` and, when ``weights`` is not None, load its state dict (``progress``
+    goes to ``get_state_dict``); ``num_classes`` is then taken from the checkpoint's category list."""
+    if weights is not None:
+        _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+    model = MNASNet(alpha, **kwargs)
+    # Explicit None test, matching the guard above, instead of relying on enum truthiness.
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+    return model
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", MNASNet0_5_Weights.IMAGENET1K_V1))
+def mnasnet0_5(*, weights: Optional[MNASNet0_5_Weights] = None, progress: bool = True, **kwargs: Any) -> MNASNet:
+    """Build the MNASNet variant with depth multiplier 0.5, introduced in the
+    `MnasNet: Platform-Aware Neural Architecture Search for Mobile
+    <https://arxiv.org/abs/1807.11626>`_ paper.
+
+    Args:
+        weights (:class:`~torchvision.models.MNASNet0_5_Weights`, optional): Pretrained
+            weights to load, checked with ``MNASNet0_5_Weights.verify``. See
+            :class:`~torchvision.models.MNASNet0_5_Weights` below for
+            more details, and possible values. ``None`` (the default) loads
+            no pre-trained weights.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: Extra parameters passed to the ``torchvision.models.mnasnet.MNASNet``
+            constructor (via ``_mnasnet``). Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/mnasnet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.MNASNet0_5_Weights
+        :members:
+    """
+    depth_multiplier = 0.5
+    checked_weights = MNASNet0_5_Weights.verify(weights)
+    return _mnasnet(depth_multiplier, checked_weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", MNASNet0_75_Weights.IMAGENET1K_V1))
+def mnasnet0_75(*, weights: Optional[MNASNet0_75_Weights] = None, progress: bool = True, **kwargs: Any) -> MNASNet:
+    """Build the MNASNet variant with depth multiplier 0.75, introduced in the
+    `MnasNet: Platform-Aware Neural Architecture Search for Mobile
+    <https://arxiv.org/abs/1807.11626>`_ paper.
+
+    Args:
+        weights (:class:`~torchvision.models.MNASNet0_75_Weights`, optional): Pretrained
+            weights to load, checked with ``MNASNet0_75_Weights.verify``. See
+            :class:`~torchvision.models.MNASNet0_75_Weights` below for
+            more details, and possible values. ``None`` (the default) loads
+            no pre-trained weights.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: Extra parameters passed to the ``torchvision.models.mnasnet.MNASNet``
+            constructor (via ``_mnasnet``). Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/mnasnet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.MNASNet0_75_Weights
+        :members:
+    """
+    depth_multiplier = 0.75
+    checked_weights = MNASNet0_75_Weights.verify(weights)
+    return _mnasnet(depth_multiplier, checked_weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", MNASNet1_0_Weights.IMAGENET1K_V1))
+def mnasnet1_0(*, weights: Optional[MNASNet1_0_Weights] = None, progress: bool = True, **kwargs: Any) -> MNASNet:
+    """Build the MNASNet variant with depth multiplier 1.0, introduced in the
+    `MnasNet: Platform-Aware Neural Architecture Search for Mobile
+    <https://arxiv.org/abs/1807.11626>`_ paper.
+
+    Args:
+        weights (:class:`~torchvision.models.MNASNet1_0_Weights`, optional): Pretrained
+            weights to load, checked with ``MNASNet1_0_Weights.verify``. See
+            :class:`~torchvision.models.MNASNet1_0_Weights` below for
+            more details, and possible values. ``None`` (the default) loads
+            no pre-trained weights.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: Extra parameters passed to the ``torchvision.models.mnasnet.MNASNet``
+            constructor (via ``_mnasnet``). Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/mnasnet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.MNASNet1_0_Weights
+        :members:
+    """
+    depth_multiplier = 1.0
+    checked_weights = MNASNet1_0_Weights.verify(weights)
+    return _mnasnet(depth_multiplier, checked_weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", MNASNet1_3_Weights.IMAGENET1K_V1))
+def mnasnet1_3(*, weights: Optional[MNASNet1_3_Weights] = None, progress: bool = True, **kwargs: Any) -> MNASNet:
+    """Build the MNASNet variant with depth multiplier 1.3, introduced in the
+    `MnasNet: Platform-Aware Neural Architecture Search for Mobile
+    <https://arxiv.org/abs/1807.11626>`_ paper.
+
+    Args:
+        weights (:class:`~torchvision.models.MNASNet1_3_Weights`, optional): Pretrained
+            weights to load, checked with ``MNASNet1_3_Weights.verify``. See
+            :class:`~torchvision.models.MNASNet1_3_Weights` below for
+            more details, and possible values. ``None`` (the default) loads
+            no pre-trained weights.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: Extra parameters passed to the ``torchvision.models.mnasnet.MNASNet``
+            constructor (via ``_mnasnet``). Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/mnasnet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.MNASNet1_3_Weights
+        :members:
+    """
+    depth_multiplier = 1.3
+    checked_weights = MNASNet1_3_Weights.verify(weights)
+    return _mnasnet(depth_multiplier, checked_weights, progress, **kwargs)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/mobilenet.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/mobilenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a270d14d3a4ad9eda62b68c2c01e9fdb710ef38
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/mobilenet.py
@@ -0,0 +1,6 @@
+from .mobilenetv2 import *  # noqa: F401, F403
+from .mobilenetv3 import *  # noqa: F401, F403
+from .mobilenetv2 import __all__ as mv2_all
+from .mobilenetv3 import __all__ as mv3_all
+
+__all__ = mv2_all + mv3_all
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/mobilenetv2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/mobilenetv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f62e398a3207f59b33ad8609590888364148af
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/mobilenetv2.py
@@ -0,0 +1,260 @@
+from functools import partial
+from typing import Any, Callable, Optional
+
+import torch
+from torch import nn, Tensor
+
+from ..ops.misc import Conv2dNormActivation
+from ..transforms._presets import ImageClassification
+from ..utils import _log_api_usage_once
+from ._api import register_model, Weights, WeightsEnum
+from ._meta import _IMAGENET_CATEGORIES
+from ._utils import _make_divisible, _ovewrite_named_param, handle_legacy_interface
+
+
+__all__ = ["MobileNetV2", "MobileNet_V2_Weights", "mobilenet_v2"]
+
+
+# necessary for backwards compatibility
+class InvertedResidual(nn.Module):
+    """MobileNetV2 block: optional 1x1 expansion, depthwise convolution, linear 1x1 projection.
+
+    Args:
+        inp (int): Number of input channels
+        oup (int): Number of output channels
+        stride (int): Stride of the depthwise convolution, 1 or 2
+        expand_ratio (int): Hidden width is ``int(round(inp * expand_ratio))``; a ratio of 1
+            skips the expansion layer
+        norm_layer: Callable building the normalization layers, ``nn.BatchNorm2d`` when None
+
+    Raises:
+        ValueError: If ``stride`` is not 1 or 2
+    """
+
+    def __init__(
+        self, inp: int, oup: int, stride: int, expand_ratio: int, norm_layer: Optional[Callable[..., nn.Module]] = None
+    ) -> None:
+        super().__init__()
+        self.stride = stride
+        if stride not in (1, 2):
+            raise ValueError(f"stride should be 1 or 2 instead of {stride}")
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        width = int(round(inp * expand_ratio))
+        # Residual add only when the channel counts match and there is no striding.
+        self.use_res_connect = inp == oup and self.stride == 1
+        conv_norm_act = partial(Conv2dNormActivation, norm_layer=norm_layer, activation_layer=nn.ReLU6)
+
+        stages: list[nn.Module] = []
+        if expand_ratio != 1:
+            # pointwise expansion
+            stages.append(conv_norm_act(inp, width, kernel_size=1))
+        # depthwise: one group per hidden channel
+        stages.append(conv_norm_act(width, width, stride=stride, groups=width))
+        # pointwise projection (kernel 1, stride 1, padding 0) followed by normalization only
+        stages.append(nn.Conv2d(width, oup, 1, 1, 0, bias=False))
+        stages.append(norm_layer(oup))
+        self.conv = nn.Sequential(*stages)
+        self.out_channels = oup
+        self._is_cn = stride > 1
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Return ``x + conv(x)`` when the residual connection is enabled, else ``conv(x)``."""
+        out = self.conv(x)
+        return x + out if self.use_res_connect else out
+
+
+class MobileNetV2(nn.Module):
+    def __init__(
+        self,
+        num_classes: int = 1000,
+        width_mult: float = 1.0,
+        inverted_residual_setting: Optional[list[list[int]]] = None,
+        round_nearest: int = 8,
+        block: Optional[Callable[..., nn.Module]] = None,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+        dropout: float = 0.2,
+    ) -> None:
+        """
+        MobileNet V2 main class
+
+        Args:
+            num_classes (int): Number of classes
+            width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
+            inverted_residual_setting: Network structure
+            round_nearest (int): Round the number of channels in each layer to be a multiple of this number
+            Set to 1 to turn off rounding
+            block: Module specifying inverted residual building block for mobilenet
+            norm_layer: Module specifying the normalization layer to use
+            dropout (float): The droupout probability
+
+        """
+        super().__init__()
+        _log_api_usage_once(self)
+
+        if block is None:
+            block = InvertedResidual
+
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+
+        input_channel = 32
+        last_channel = 1280
+
+        if inverted_residual_setting is None:
+            inverted_residual_setting = [
+                # t, c, n, s
+                [1, 16, 1, 1],
+                [6, 24, 2, 2],
+                [6, 32, 3, 2],
+                [6, 64, 4, 2],
+                [6, 96, 3, 1],
+                [6, 160, 3, 2],
+                [6, 320, 1, 1],
+            ]
+
+        # only check the first element, assuming user knows t,c,n,s are required
+        if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
+            raise ValueError(
+                f"inverted_residual_setting should be non-empty or a 4-element list, got {inverted_residual_setting}"
+            )
+
+        # building first layer
+        input_channel = _make_divisible(input_channel * width_mult, round_nearest)
+        self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
+        features: list[nn.Module] = [
+            Conv2dNormActivation(3, input_channel, stride=2, norm_layer=norm_layer, activation_layer=nn.ReLU6)
+        ]
+        # building inverted residual blocks
+        for t, c, n, s in inverted_residual_setting:
+            output_channel = _make_divisible(c * width_mult, round_nearest)
+            for i in range(n):
+                stride = s if i == 0 else 1
+                features.append(block(input_channel, output_channel, stride, expand_ratio=t, norm_layer=norm_layer))
+                input_channel = output_channel
+        # building last several layers
+        features.append(
+            Conv2dNormActivation(
+                input_channel, self.last_channel, kernel_size=1, norm_layer=norm_layer, activation_layer=nn.ReLU6
+            )
+        )
+        # make it nn.Sequential
+        self.features = nn.Sequential(*features)
+
+        # building classifier
+        self.classifier = nn.Sequential(
+            nn.Dropout(p=dropout),
+            nn.Linear(self.last_channel, num_classes),
+        )
+
+        # weight initialization
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode="fan_out")
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.Linear):
+                nn.init.normal_(m.weight, 0, 0.01)
+                nn.init.zeros_(m.bias)
+
+    def _forward_impl(self, x: Tensor) -> Tensor:
+        """Run ``x`` through ``self.features``, a global average pool and ``self.classifier``.
+
+        This lives under a name other than ``forward`` because TorchScript doesn't support
+        inheritance: a subclass needs a differently named superclass method it can call.
+        """
+        feature_map = self.features(x)
+        pooled = nn.functional.adaptive_avg_pool2d(feature_map, (1, 1))
+        # Flatten from dim 1 instead of calling "squeeze": the batch size can be 1 and
+        # squeeze would drop that dimension as well.
+        flat = torch.flatten(pooled, 1)
+        return self.classifier(flat)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Module entry point; returns the result of ``self._forward_impl(x)`` unchanged."""
+        return self._forward_impl(x)
+
+
+# Metadata entries shared by every MobileNet_V2_Weights member; each member spreads this
+# dict into its own ``meta`` with ``**_COMMON_META``.
+_COMMON_META = {
+    "num_params": 3504872,
+    "min_size": (1, 1),
+    "categories": _IMAGENET_CATEGORIES,
+}
+
+
+class MobileNet_V2_Weights(WeightsEnum):
+    # Pretrained checkpoints for MobileNetV2. Each member bundles the checkpoint URL, a
+    # partially-applied ImageClassification preprocessing preset, and a ``meta`` dict that
+    # extends _COMMON_META with the recipe link and reported metrics.
+
+    # V1: preprocessing preset built with crop_size=224 only.
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/mobilenet_v2-b0353104.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#mobilenetv2",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 71.878,
+                    "acc@5": 90.286,
+                }
+            },
+            "_ops": 0.301,
+            "_file_size": 13.555,
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    # V2: same crop size as V1, but the preset additionally receives resize_size=232.
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/mobilenet_v2-7ebf99e0.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe-with-reg-tuning",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 72.154,
+                    "acc@5": 90.822,
+                }
+            },
+            "_ops": 0.301,
+            "_file_size": 13.598,
+            "_docs": """
+                These weights improve upon the results of the original paper by using a modified version of TorchVision's
+                `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    # DEFAULT is an alias of the IMAGENET1K_V2 member.
+    DEFAULT = IMAGENET1K_V2
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", MobileNet_V2_Weights.IMAGENET1K_V1))
+def mobilenet_v2(
+    *, weights: Optional[MobileNet_V2_Weights] = None, progress: bool = True, **kwargs: Any
+) -> MobileNetV2:
+    """Build the MobileNetV2 architecture described in the `MobileNetV2: Inverted Residuals
+    and Linear Bottlenecks <https://arxiv.org/abs/1801.04381>`_ paper.
+
+    Args:
+        weights (:class:`~torchvision.models.MobileNet_V2_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.MobileNet_V2_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.mobilenetv2.MobileNetV2``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv2.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.MobileNet_V2_Weights
+        :members:
+    """
+    weights = MobileNet_V2_Weights.verify(weights)
+
+    if weights is None:
+        # No checkpoint requested: build the network straight from the caller's kwargs.
+        return MobileNetV2(**kwargs)
+
+    # With a checkpoint, "num_classes" is taken from the checkpoint's category list.
+    _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+    model = MobileNetV2(**kwargs)
+    model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+    return model
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/mobilenetv3.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/mobilenetv3.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6239d095ba2a0bb4d85a929540de95be4667d67
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/mobilenetv3.py
@@ -0,0 +1,424 @@
+from collections.abc import Sequence
+from functools import partial
+from typing import Any, Callable, Optional
+
+import torch
+from torch import nn, Tensor
+
+from ..ops.misc import Conv2dNormActivation, SqueezeExcitation as SElayer
+from ..transforms._presets import ImageClassification
+from ..utils import _log_api_usage_once
+from ._api import register_model, Weights, WeightsEnum
+from ._meta import _IMAGENET_CATEGORIES
+from ._utils import _make_divisible, _ovewrite_named_param, handle_legacy_interface
+
+
+# Names exported by ``from <this module> import *``: the model class, both weight enums
+# and the two builder functions.
+__all__ = [
+    "MobileNetV3",
+    "MobileNet_V3_Large_Weights",
+    "MobileNet_V3_Small_Weights",
+    "mobilenet_v3_large",
+    "mobilenet_v3_small",
+]
+
+
+class InvertedResidualConfig:
+    """Settings for one inverted residual block (Tables 1 and 2 of the MobileNetV3 paper).
+
+    The three channel counts are rescaled by ``width_mult`` through
+    :meth:`adjust_channels`; ``activation`` is reduced to the boolean ``use_hs``,
+    which is True only for the string ``"HS"``.
+    """
+
+    def __init__(
+        self,
+        input_channels: int,
+        kernel: int,
+        expanded_channels: int,
+        out_channels: int,
+        use_se: bool,
+        activation: str,
+        stride: int,
+        dilation: int,
+        width_mult: float,
+    ):
+        # Rescale the channel counts first, then store everything on the instance.
+        scaled_in = self.adjust_channels(input_channels, width_mult)
+        scaled_expanded = self.adjust_channels(expanded_channels, width_mult)
+        scaled_out = self.adjust_channels(out_channels, width_mult)
+
+        self.input_channels = scaled_in
+        self.kernel = kernel
+        self.expanded_channels = scaled_expanded
+        self.out_channels = scaled_out
+        self.use_se = use_se
+        self.use_hs = activation == "HS"
+        self.stride = stride
+        self.dilation = dilation
+
+    @staticmethod
+    def adjust_channels(channels: int, width_mult: float):
+        """Multiply ``channels`` by ``width_mult`` and hand the product to ``_make_divisible`` with divisor 8."""
+        scaled = channels * width_mult
+        return _make_divisible(scaled, 8)
+
+
+class InvertedResidual(nn.Module):
+    """Inverted residual block: optional 1x1 expansion, depthwise conv, optional SE, 1x1 projection.
+
+    Args:
+        cnf (InvertedResidualConfig): channel counts, kernel, stride, dilation and flags for this block.
+        norm_layer (Callable[..., nn.Module]): normalization layer factory passed to every
+            ``Conv2dNormActivation`` in the block.
+        se_layer (Callable[..., nn.Module]): squeeze-excitation factory, called as
+            ``se_layer(expanded_channels, squeeze_channels)`` when ``cnf.use_se`` is set.
+
+    Raises:
+        ValueError: if ``cnf.stride`` is not 1 or 2.
+    """
+
+    # Implemented as described at section 5 of MobileNetV3 paper
+    def __init__(
+        self,
+        cnf: InvertedResidualConfig,
+        norm_layer: Callable[..., nn.Module],
+        se_layer: Callable[..., nn.Module] = partial(SElayer, scale_activation=nn.Hardsigmoid),
+    ):
+        super().__init__()
+        if not (1 <= cnf.stride <= 2):
+            raise ValueError("illegal stride value")
+
+        # The skip connection is only used when input and output tensors have the same
+        # configured stride-1 geometry and channel count.
+        self.use_res_connect = cnf.stride == 1 and cnf.input_channels == cnf.out_channels
+
+        layers: list[nn.Module] = []
+        activation_layer = nn.Hardswish if cnf.use_hs else nn.ReLU
+
+        # expand (skipped when the expanded width equals the input width)
+        if cnf.expanded_channels != cnf.input_channels:
+            layers.append(
+                Conv2dNormActivation(
+                    cnf.input_channels,
+                    cnf.expanded_channels,
+                    kernel_size=1,
+                    norm_layer=norm_layer,
+                    activation_layer=activation_layer,
+                )
+            )
+
+        # depthwise (groups == channels); a dilated block always runs with stride 1
+        stride = 1 if cnf.dilation > 1 else cnf.stride
+        layers.append(
+            Conv2dNormActivation(
+                cnf.expanded_channels,
+                cnf.expanded_channels,
+                kernel_size=cnf.kernel,
+                stride=stride,
+                dilation=cnf.dilation,
+                groups=cnf.expanded_channels,
+                norm_layer=norm_layer,
+                activation_layer=activation_layer,
+            )
+        )
+        if cnf.use_se:
+            # squeeze width: a quarter of the expanded width, passed through _make_divisible(…, 8)
+            squeeze_channels = _make_divisible(cnf.expanded_channels // 4, 8)
+            layers.append(se_layer(cnf.expanded_channels, squeeze_channels))
+
+        # project (1x1 conv + norm, no activation)
+        layers.append(
+            Conv2dNormActivation(
+                cnf.expanded_channels, cnf.out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=None
+            )
+        )
+
+        self.block = nn.Sequential(*layers)
+        self.out_channels = cnf.out_channels
+        # True when the configured stride is 2. Not read anywhere in this class;
+        # NOTE(review): presumably consumed by code outside this module - confirm.
+        self._is_cn = cnf.stride > 1
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply the block; add ``input`` to the result in place when the skip connection is enabled."""
+        result = self.block(input)
+        if self.use_res_connect:
+            result += input
+        return result
+
+
+class MobileNetV3(nn.Module):
+    def __init__(
+        self,
+        inverted_residual_setting: list[InvertedResidualConfig],
+        last_channel: int,
+        num_classes: int = 1000,
+        block: Optional[Callable[..., nn.Module]] = None,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+        dropout: float = 0.2,
+        **kwargs: Any,
+    ) -> None:
+        """
+        MobileNet V3 main class
+
+        Args:
+            inverted_residual_setting (List[InvertedResidualConfig]): Network structure
+            last_channel (int): The number of channels on the penultimate layer
+            num_classes (int): Number of classes
+            block (Optional[Callable[..., nn.Module]]): Module specifying inverted residual building block for mobilenet
+            norm_layer (Optional[Callable[..., nn.Module]]): Module specifying the normalization layer to use
+            dropout (float): The dropout probability
+            **kwargs: accepted but not used by this constructor
+
+        Raises:
+            ValueError: if ``inverted_residual_setting`` is empty
+            TypeError: if ``inverted_residual_setting`` is not a sequence of ``InvertedResidualConfig``
+        """
+        super().__init__()
+        _log_api_usage_once(self)
+
+        if not inverted_residual_setting:
+            raise ValueError("The inverted_residual_setting should not be empty")
+        elif not (
+            isinstance(inverted_residual_setting, Sequence)
+            and all([isinstance(s, InvertedResidualConfig) for s in inverted_residual_setting])
+        ):
+            raise TypeError("The inverted_residual_setting should be List[InvertedResidualConfig]")
+
+        if block is None:
+            block = InvertedResidual
+
+        if norm_layer is None:
+            norm_layer = partial(nn.BatchNorm2d, eps=0.001, momentum=0.01)
+
+        layers: list[nn.Module] = []
+
+        # building first layer (3 input channels -> input width of the first block, stride 2)
+        firstconv_output_channels = inverted_residual_setting[0].input_channels
+        layers.append(
+            Conv2dNormActivation(
+                3,
+                firstconv_output_channels,
+                kernel_size=3,
+                stride=2,
+                norm_layer=norm_layer,
+                activation_layer=nn.Hardswish,
+            )
+        )
+
+        # building inverted residual blocks
+        for cnf in inverted_residual_setting:
+            layers.append(block(cnf, norm_layer))
+
+        # building last several layers (1x1 conv widening the last block's output 6x)
+        lastconv_input_channels = inverted_residual_setting[-1].out_channels
+        lastconv_output_channels = 6 * lastconv_input_channels
+        layers.append(
+            Conv2dNormActivation(
+                lastconv_input_channels,
+                lastconv_output_channels,
+                kernel_size=1,
+                norm_layer=norm_layer,
+                activation_layer=nn.Hardswish,
+            )
+        )
+
+        self.features = nn.Sequential(*layers)
+        self.avgpool = nn.AdaptiveAvgPool2d(1)
+        self.classifier = nn.Sequential(
+            nn.Linear(lastconv_output_channels, last_channel),
+            nn.Hardswish(inplace=True),
+            nn.Dropout(p=dropout, inplace=True),
+            nn.Linear(last_channel, num_classes),
+        )
+
+        # weight initialization: Kaiming-normal (fan_out) convs, unit-scale/zero-shift norms,
+        # N(0, 0.01) linear weights; every bias is zeroed
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode="fan_out")
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.Linear):
+                nn.init.normal_(m.weight, 0, 0.01)
+                nn.init.zeros_(m.bias)
+
+    def _forward_impl(self, x: Tensor) -> Tensor:
+        """Apply ``features``, the adaptive average pool, a flatten from dim 1, then ``classifier``."""
+        x = self.features(x)
+
+        x = self.avgpool(x)
+        x = torch.flatten(x, 1)
+
+        x = self.classifier(x)
+
+        return x
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Module entry point; returns the result of ``self._forward_impl(x)`` unchanged."""
+        return self._forward_impl(x)
+
+
+def _mobilenet_v3_conf(
+    arch: str, width_mult: float = 1.0, reduced_tail: bool = False, dilated: bool = False, **kwargs: Any
+):
+    """Build the block table and ``last_channel`` value for one MobileNetV3 variant.
+
+    Each ``bneck_conf(...)`` row passes, in order: input_channels, kernel, expanded_channels,
+    out_channels, use_se, activation, stride, dilation (the positional parameters of
+    ``InvertedResidualConfig``); ``width_mult`` is pre-bound through ``partial``.
+
+    Args:
+        arch (str): ``"mobilenet_v3_large"`` or ``"mobilenet_v3_small"``.
+        width_mult (float): channel multiplier forwarded to ``InvertedResidualConfig`` and
+            to ``InvertedResidualConfig.adjust_channels`` for ``last_channel``.
+        reduced_tail (bool): if True, every channel count written with ``// reduce_divider``
+            below (rows from the one marked C4 onward, and ``last_channel``) is integer-divided by 2.
+        dilated (bool): if True, the rows from C4 onward use dilation 2 instead of 1.
+        **kwargs: accepted and ignored by this function.
+
+    Returns:
+        A tuple ``(inverted_residual_setting, last_channel)``.
+
+    Raises:
+        ValueError: if ``arch`` is neither of the two supported names.
+    """
+    reduce_divider = 2 if reduced_tail else 1
+    dilation = 2 if dilated else 1
+
+    bneck_conf = partial(InvertedResidualConfig, width_mult=width_mult)
+    adjust_channels = partial(InvertedResidualConfig.adjust_channels, width_mult=width_mult)
+
+    if arch == "mobilenet_v3_large":
+        inverted_residual_setting = [
+            bneck_conf(16, 3, 16, 16, False, "RE", 1, 1),
+            bneck_conf(16, 3, 64, 24, False, "RE", 2, 1),  # C1
+            bneck_conf(24, 3, 72, 24, False, "RE", 1, 1),
+            bneck_conf(24, 5, 72, 40, True, "RE", 2, 1),  # C2
+            bneck_conf(40, 5, 120, 40, True, "RE", 1, 1),
+            bneck_conf(40, 5, 120, 40, True, "RE", 1, 1),
+            bneck_conf(40, 3, 240, 80, False, "HS", 2, 1),  # C3
+            bneck_conf(80, 3, 200, 80, False, "HS", 1, 1),
+            bneck_conf(80, 3, 184, 80, False, "HS", 1, 1),
+            bneck_conf(80, 3, 184, 80, False, "HS", 1, 1),
+            bneck_conf(80, 3, 480, 112, True, "HS", 1, 1),
+            bneck_conf(112, 3, 672, 112, True, "HS", 1, 1),
+            bneck_conf(112, 5, 672, 160 // reduce_divider, True, "HS", 2, dilation),  # C4
+            bneck_conf(160 // reduce_divider, 5, 960 // reduce_divider, 160 // reduce_divider, True, "HS", 1, dilation),
+            bneck_conf(160 // reduce_divider, 5, 960 // reduce_divider, 160 // reduce_divider, True, "HS", 1, dilation),
+        ]
+        last_channel = adjust_channels(1280 // reduce_divider)  # C5
+    elif arch == "mobilenet_v3_small":
+        inverted_residual_setting = [
+            bneck_conf(16, 3, 16, 16, True, "RE", 2, 1),  # C1
+            bneck_conf(16, 3, 72, 24, False, "RE", 2, 1),  # C2
+            bneck_conf(24, 3, 88, 24, False, "RE", 1, 1),
+            bneck_conf(24, 5, 96, 40, True, "HS", 2, 1),  # C3
+            bneck_conf(40, 5, 240, 40, True, "HS", 1, 1),
+            bneck_conf(40, 5, 240, 40, True, "HS", 1, 1),
+            bneck_conf(40, 5, 120, 48, True, "HS", 1, 1),
+            bneck_conf(48, 5, 144, 48, True, "HS", 1, 1),
+            bneck_conf(48, 5, 288, 96 // reduce_divider, True, "HS", 2, dilation),  # C4
+            bneck_conf(96 // reduce_divider, 5, 576 // reduce_divider, 96 // reduce_divider, True, "HS", 1, dilation),
+            bneck_conf(96 // reduce_divider, 5, 576 // reduce_divider, 96 // reduce_divider, True, "HS", 1, dilation),
+        ]
+        last_channel = adjust_channels(1024 // reduce_divider)  # C5
+    else:
+        raise ValueError(f"Unsupported model type {arch}")
+
+    return inverted_residual_setting, last_channel
+
+
+def _mobilenet_v3(
+    inverted_residual_setting: list[InvertedResidualConfig],
+    last_channel: int,
+    weights: Optional[WeightsEnum],
+    progress: bool,
+    **kwargs: Any,
+) -> MobileNetV3:
+    """Instantiate ``MobileNetV3`` and, when ``weights`` is given, load its state dict.
+
+    With ``weights`` set, ``num_classes`` in ``kwargs`` is first set from the length of
+    ``weights.meta["categories"]`` via ``_ovewrite_named_param``.
+    """
+    if weights is None:
+        # No checkpoint requested: return the freshly initialized network.
+        return MobileNetV3(inverted_residual_setting, last_channel, **kwargs)
+
+    _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+    model = MobileNetV3(inverted_residual_setting, last_channel, **kwargs)
+    model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+    return model
+
+
+# Metadata entries shared by the MobileNetV3 weight enums below; each member spreads this
+# dict into its own ``meta`` with ``**_COMMON_META`` and adds its own "num_params".
+_COMMON_META = {
+    "min_size": (1, 1),
+    "categories": _IMAGENET_CATEGORIES,
+}
+
+
+class MobileNet_V3_Large_Weights(WeightsEnum):
+    # Pretrained checkpoints for the large MobileNetV3 variant. Each member bundles the
+    # checkpoint URL, a partially-applied ImageClassification preprocessing preset, and a
+    # ``meta`` dict that extends _COMMON_META.
+
+    # V1: preprocessing preset built with crop_size=224 only.
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/mobilenet_v3_large-8738ca79.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 5483032,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#mobilenetv3-large--small",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 74.042,
+                    "acc@5": 91.340,
+                }
+            },
+            "_ops": 0.217,
+            "_file_size": 21.114,
+            "_docs": """These weights were trained from scratch by using a simple training recipe.""",
+        },
+    )
+    # V2: same crop size as V1, but the preset additionally receives resize_size=232.
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/mobilenet_v3_large-5c1a4163.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,
+            "num_params": 5483032,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe-with-reg-tuning",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 75.274,
+                    "acc@5": 92.566,
+                }
+            },
+            "_ops": 0.217,
+            "_file_size": 21.107,
+            "_docs": """
+                These weights improve marginally upon the results of the original paper by using a modified version of
+                TorchVision's `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    # DEFAULT is an alias of the IMAGENET1K_V2 member.
+    DEFAULT = IMAGENET1K_V2
+
+
+class MobileNet_V3_Small_Weights(WeightsEnum):
+    # Pretrained checkpoint for the small MobileNetV3 variant: URL, a partially-applied
+    # ImageClassification preset (crop_size=224), and a ``meta`` dict extending _COMMON_META.
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 2542856,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#mobilenetv3-large--small",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 67.668,
+                    "acc@5": 87.402,
+                }
+            },
+            "_ops": 0.057,
+            "_file_size": 9.829,
+            "_docs": """
+                These weights improve upon the results of the original paper by using a simple training recipe.
+            """,
+        },
+    )
+    # DEFAULT is an alias of the only member, IMAGENET1K_V1.
+    DEFAULT = IMAGENET1K_V1
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", MobileNet_V3_Large_Weights.IMAGENET1K_V1))
+def mobilenet_v3_large(
+    *, weights: Optional[MobileNet_V3_Large_Weights] = None, progress: bool = True, **kwargs: Any
+) -> MobileNetV3:
+    """
+    Builds the large MobileNetV3 architecture from
+    `Searching for MobileNetV3 <https://arxiv.org/abs/1905.02244>`__.
+
+    Args:
+        weights (:class:`~torchvision.models.MobileNet_V3_Large_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.MobileNet_V3_Large_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.mobilenet.MobileNetV3``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.MobileNet_V3_Large_Weights
+        :members:
+    """
+    weights = MobileNet_V3_Large_Weights.verify(weights)
+
+    # The same kwargs go to both calls: first to build the block table, then on to the
+    # model builder.
+    block_settings, head_channels = _mobilenet_v3_conf("mobilenet_v3_large", **kwargs)
+    return _mobilenet_v3(block_settings, head_channels, weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", MobileNet_V3_Small_Weights.IMAGENET1K_V1))
+def mobilenet_v3_small(
+    *, weights: Optional[MobileNet_V3_Small_Weights] = None, progress: bool = True, **kwargs: Any
+) -> MobileNetV3:
+    """
+    Builds the small MobileNetV3 architecture from
+    `Searching for MobileNetV3 <https://arxiv.org/abs/1905.02244>`__.
+
+    Args:
+        weights (:class:`~torchvision.models.MobileNet_V3_Small_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.MobileNet_V3_Small_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.mobilenet.MobileNetV3``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.MobileNet_V3_Small_Weights
+        :members:
+    """
+    weights = MobileNet_V3_Small_Weights.verify(weights)
+
+    # The same kwargs go to both calls: first to build the block table, then on to the
+    # model builder.
+    block_settings, head_channels = _mobilenet_v3_conf("mobilenet_v3_small", **kwargs)
+    return _mobilenet_v3(block_settings, head_channels, weights, progress, **kwargs)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/optical_flow/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/optical_flow/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..89d2302f825ff0dbe25d02f6dc7c84d3c0790ad0
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/optical_flow/__init__.py
@@ -0,0 +1 @@
+from .raft import *
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/optical_flow/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/optical_flow/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49b70599d3e512de3f096f1939dc1dd90e21aa05
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/optical_flow/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/optical_flow/__pycache__/_utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/optical_flow/__pycache__/_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eb2b6bc31bb45cd29242d8af519bf1e355fb10e9
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/optical_flow/__pycache__/_utils.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/optical_flow/__pycache__/raft.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/optical_flow/__pycache__/raft.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..537f20d3bfe4c99b1155b72f89d3f5d97ecead52
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/optical_flow/__pycache__/raft.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/optical_flow/_utils.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/optical_flow/_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa2454a27315d6e560dccb6ea2ce6083da03e256
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/optical_flow/_utils.py
@@ -0,0 +1,48 @@
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+
+
+def grid_sample(img: Tensor, absolute_grid: Tensor, mode: str = "bilinear", align_corners: Optional[bool] = None):
+    """Same as torch's grid_sample, with absolute pixel coordinates instead of normalized coordinates.
+
+    Args:
+        img (Tensor): input whose last two dimensions are read as ``(h, w)``.
+        absolute_grid (Tensor): sampling locations; its last dimension must have size 2 and
+            is split into the x coordinate followed by the y coordinate.
+        mode (str): forwarded to ``torch.nn.functional.grid_sample``.
+        align_corners (bool, optional): forwarded to ``torch.nn.functional.grid_sample``.
+
+    Returns:
+        The output of ``torch.nn.functional.grid_sample`` on the normalized grid.
+
+    Coordinates along an axis are mapped from ``[0, size - 1]`` to ``[-1, 1]``. An axis of
+    size 1 is left untouched, because the mapping would divide by zero there.
+    """
+    h, w = img.shape[-2:]
+
+    xgrid, ygrid = absolute_grid.split([1, 1], dim=-1)
+    # Guard the width like the height below: with w == 1 the division by (w - 1) would
+    # fill the grid with NaN/inf instead of sampling the single column.
+    if w > 1:
+        xgrid = 2 * xgrid / (w - 1) - 1
+    # Adding condition if h > 1 to enable this function be reused in raft-stereo
+    if h > 1:
+        ygrid = 2 * ygrid / (h - 1) - 1
+    normalized_grid = torch.cat([xgrid, ygrid], dim=-1)
+
+    return F.grid_sample(img, normalized_grid, mode=mode, align_corners=align_corners)
+
+
+def make_coords_grid(batch_size: int, h: int, w: int, device: str = "cpu"):
+    """Return a float tensor of shape ``(batch_size, 2, h, w)`` holding pixel coordinates.
+
+    Channel 0 contains the column (x) index of each position and channel 1 the row (y)
+    index; the same grid is repeated for every batch element.
+    """
+    target = torch.device(device)
+    row_idx = torch.arange(h, device=target)
+    col_idx = torch.arange(w, device=target)
+    yy, xx = torch.meshgrid(row_idx, col_idx, indexing="ij")
+    # x first, then y
+    grid = torch.stack((xx, yy), dim=0).float()
+    return grid.unsqueeze(0).repeat(batch_size, 1, 1, 1)
+
+
+def upsample_flow(flow, up_mask: Optional[Tensor] = None, factor: int = 8):
+    """Upsample ``flow`` spatially by ``factor`` (default 8), scaling its values by ``factor`` too.
+
+    Without ``up_mask`` the flow is bilinearly interpolated. With ``up_mask``, every output
+    pixel is a convex combination of the 3x3 neighbourhood of its coarse pixel, using the
+    softmax of the mask as weights. See paper page 8 and appendix B.
+    Note that in appendix B the picture assumes a downsample factor of 4 instead of 8.
+    """
+    n, c, h, w = flow.shape
+    out_h = h * factor
+    out_w = w * factor
+
+    if up_mask is None:
+        return factor * F.interpolate(flow, size=(out_h, out_w), mode="bilinear", align_corners=True)
+
+    # "convex" == the 9 weights of each output pixel sum to 1
+    weights = torch.softmax(up_mask.view(n, 1, 9, factor, factor, h, w), dim=2)
+
+    # 3x3 neighbourhoods of the scaled flow, one per coarse pixel
+    patches = F.unfold(factor * flow, kernel_size=3, padding=1).view(n, c, 9, 1, 1, h, w)
+    blended = (weights * patches).sum(dim=2)
+
+    # (n, c, factor, factor, h, w) -> (n, c, h, factor, w, factor) -> (n, c, out_h, out_w)
+    return blended.permute(0, 1, 4, 2, 5, 3).reshape(n, c, out_h, out_w)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/optical_flow/raft.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/optical_flow/raft.py
new file mode 100644
index 0000000000000000000000000000000000000000..644adc2dc5c67c3517b697e2c0a3f0e273ea7277
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/optical_flow/raft.py
@@ -0,0 +1,947 @@
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from torch.nn.modules.batchnorm import BatchNorm2d
+from torch.nn.modules.instancenorm import InstanceNorm2d
+from torchvision.ops import Conv2dNormActivation
+
+from ...transforms._presets import OpticalFlow
+from ...utils import _log_api_usage_once
+from .._api import register_model, Weights, WeightsEnum
+from .._utils import handle_legacy_interface
+from ._utils import grid_sample, make_coords_grid, upsample_flow
+
+
+# Names exported by ``from .raft import *``.
+__all__ = (
+    "RAFT",
+    "raft_large",
+    "raft_small",
+    "Raft_Large_Weights",
+    "Raft_Small_Weights",
+)
+
+
+class ResidualBlock(nn.Module):
+    """Slightly modified Residual block with extra relu and biases.
+
+    Two 3x3 conv-norm-activation layers form the main branch; the shortcut is the identity
+    when ``stride == 1`` and ``always_project`` is False, otherwise a 1x1 conv + norm
+    without activation. The two branches are summed and passed through a ReLU.
+
+    Args:
+        in_channels: channels of the input.
+        out_channels: channels produced by both branches.
+        norm_layer: normalization layer passed to every ``Conv2dNormActivation``.
+        stride: stride of the first 3x3 conv and of the projection shortcut.
+        always_project (bool): use the projection shortcut even when ``stride == 1``.
+    """
+
+    def __init__(self, in_channels, out_channels, *, norm_layer, stride=1, always_project: bool = False):
+        super().__init__()
+
+        # Note regarding bias=True:
+        # Usually we can pass bias=False in conv layers followed by a norm layer.
+        # But in the RAFT training reference, the BatchNorm2d layers are only activated for the first dataset,
+        # and frozen for the rest of the training process (i.e. set as eval()). The bias term is thus still useful
+        # for the rest of the datasets. Technically, we could remove the bias for other norm layers like Instance norm
+        # because these aren't frozen, but we don't bother (also, we wouldn't be able to load the original weights).
+        self.convnormrelu1 = Conv2dNormActivation(
+            in_channels, out_channels, norm_layer=norm_layer, kernel_size=3, stride=stride, bias=True
+        )
+        self.convnormrelu2 = Conv2dNormActivation(
+            out_channels, out_channels, norm_layer=norm_layer, kernel_size=3, bias=True
+        )
+
+        # make mypy happy
+        self.downsample: nn.Module
+
+        if stride == 1 and not always_project:
+            self.downsample = nn.Identity()
+        else:
+            self.downsample = Conv2dNormActivation(
+                in_channels,
+                out_channels,
+                norm_layer=norm_layer,
+                kernel_size=1,
+                stride=stride,
+                bias=True,
+                activation_layer=None,
+            )
+
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        """Return ``relu(downsample(x) + convnormrelu2(convnormrelu1(x)))``."""
+        y = x
+        y = self.convnormrelu1(y)
+        y = self.convnormrelu2(y)
+
+        # shortcut branch: identity or 1x1 projection, chosen in __init__
+        x = self.downsample(x)
+
+        return self.relu(x + y)
+
+
+class BottleneckBlock(nn.Module):
+    """Slightly modified BottleNeck block (extra relu and biases)
+
+    The main branch is 1x1 -> 3x3 -> 1x1 conv-norm-activation, with the two inner layers
+    running at ``out_channels // 4`` channels. The shortcut is the identity when
+    ``stride == 1``, otherwise a 1x1 conv + norm without activation.
+
+    Args:
+        in_channels: channels of the input.
+        out_channels: channels produced by the block.
+        norm_layer: normalization layer passed to every ``Conv2dNormActivation``.
+        stride: stride of the 3x3 conv and of the projection shortcut.
+    """
+
+    def __init__(self, in_channels, out_channels, *, norm_layer, stride=1):
+        super().__init__()
+
+        # See note in ResidualBlock for the reason behind bias=True
+        self.convnormrelu1 = Conv2dNormActivation(
+            in_channels, out_channels // 4, norm_layer=norm_layer, kernel_size=1, bias=True
+        )
+        self.convnormrelu2 = Conv2dNormActivation(
+            out_channels // 4, out_channels // 4, norm_layer=norm_layer, kernel_size=3, stride=stride, bias=True
+        )
+        self.convnormrelu3 = Conv2dNormActivation(
+            out_channels // 4, out_channels, norm_layer=norm_layer, kernel_size=1, bias=True
+        )
+        self.relu = nn.ReLU(inplace=True)
+
+        # NOTE(review): with stride == 1 the shortcut is the identity, so the sum in
+        # forward() appears to rely on in_channels == out_channels - confirm against callers.
+        if stride == 1:
+            self.downsample = nn.Identity()
+        else:
+            self.downsample = Conv2dNormActivation(
+                in_channels,
+                out_channels,
+                norm_layer=norm_layer,
+                kernel_size=1,
+                stride=stride,
+                bias=True,
+                activation_layer=None,
+            )
+
+    def forward(self, x):
+        """Return ``relu(downsample(x) + main_branch(x))``."""
+        y = x
+        y = self.convnormrelu1(y)
+        y = self.convnormrelu2(y)
+        y = self.convnormrelu3(y)
+
+        # shortcut branch: identity or 1x1 projection, chosen in __init__
+        x = self.downsample(x)
+
+        return self.relu(x + y)
+
+
+class FeatureEncoder(nn.Module):
+    """The feature encoder, used both as the actual feature encoder, and as the context encoder.
+
+    It must downsample its input by 8.
+
+    Structure: a 7x7 conv-norm-activation stem on 3 input channels, three stages of two
+    ``block`` instances each, and a final plain 1x1 conv.
+
+    Args:
+        block: block class used for the three stages; called as
+            ``block(in_channels, out_channels, norm_layer=..., stride=...)``.
+        layers: five channel counts: stem output, the three stage outputs, final conv output.
+        strides: four strides: stem, then the first block of each of the three stages.
+        norm_layer: normalization layer used by the stem and passed to every block.
+
+    Raises:
+        ValueError: if ``layers`` does not have exactly 5 entries.
+    """
+
+    def __init__(
+        self, *, block=ResidualBlock, layers=(64, 64, 96, 128, 256), strides=(2, 1, 2, 2), norm_layer=nn.BatchNorm2d
+    ):
+        super().__init__()
+
+        if len(layers) != 5:
+            raise ValueError(f"The expected number of layers is 5, instead got {len(layers)}")
+
+        # See note in ResidualBlock for the reason behind bias=True
+        self.convnormrelu = Conv2dNormActivation(
+            3, layers[0], norm_layer=norm_layer, kernel_size=7, stride=strides[0], bias=True
+        )
+
+        self.layer1 = self._make_2_blocks(block, layers[0], layers[1], norm_layer=norm_layer, first_stride=strides[1])
+        self.layer2 = self._make_2_blocks(block, layers[1], layers[2], norm_layer=norm_layer, first_stride=strides[2])
+        self.layer3 = self._make_2_blocks(block, layers[2], layers[3], norm_layer=norm_layer, first_stride=strides[3])
+
+        self.conv = nn.Conv2d(layers[3], layers[4], kernel_size=1)
+
+        # Kaiming-normal init for every conv weight; norm layers get weight 1 / bias 0
+        # when those parameters exist (they are None for non-affine norms).
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
+            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d)):
+                if m.weight is not None:
+                    nn.init.constant_(m.weight, 1)
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+
+        # Only strides equal to 2 are counted: the default (2, 1, 2, 2) gives 2**3 == 8.
+        num_downsamples = len(list(filter(lambda s: s == 2, strides)))
+        self.output_dim = layers[-1]
+        self.downsample_factor = 2**num_downsamples
+
+    def _make_2_blocks(self, block, in_channels, out_channels, norm_layer, first_stride):
+        """Build one stage: a block with ``first_stride`` followed by a stride-1 block."""
+        block1 = block(in_channels, out_channels, norm_layer=norm_layer, stride=first_stride)
+        block2 = block(out_channels, out_channels, norm_layer=norm_layer, stride=1)
+        return nn.Sequential(block1, block2)
+
+    def forward(self, x):
+        """Apply the stem, the three stages and the final 1x1 conv in order."""
+        x = self.convnormrelu(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+
+        x = self.conv(x)
+
+        return x
+
+
+class MotionEncoder(nn.Module):
+    """The motion encoder, part of the update block.
+
+    Takes the current predicted flow and the correlation features as input and returns an encoded version of these.
+    """
+
+    def __init__(self, *, in_channels_corr, corr_layers=(256, 192), flow_layers=(128, 64), out_channels=128):
+        super().__init__()
+
+        if len(flow_layers) != 2:
+            raise ValueError(f"The expected number of flow_layers is 2, instead got {len(flow_layers)}")
+        if len(corr_layers) not in (1, 2):
+            raise ValueError(f"The number of corr_layers should be 1 or 2, instead got {len(corr_layers)}")
+
+        self.convcorr1 = Conv2dNormActivation(in_channels_corr, corr_layers[0], norm_layer=None, kernel_size=1)
+        if len(corr_layers) == 2:
+            self.convcorr2 = Conv2dNormActivation(corr_layers[0], corr_layers[1], norm_layer=None, kernel_size=3)
+        else:
+            self.convcorr2 = nn.Identity()
+
+        self.convflow1 = Conv2dNormActivation(2, flow_layers[0], norm_layer=None, kernel_size=7)
+        self.convflow2 = Conv2dNormActivation(flow_layers[0], flow_layers[1], norm_layer=None, kernel_size=3)
+
+        # out_channels - 2 because we cat the flow (2 channels) at the end
+        self.conv = Conv2dNormActivation(
+            corr_layers[-1] + flow_layers[-1], out_channels - 2, norm_layer=None, kernel_size=3
+        )
+
+        self.out_channels = out_channels
+
+    def forward(self, flow, corr_features):
+        corr = self.convcorr1(corr_features)
+        corr = self.convcorr2(corr)
+
+        flow_orig = flow
+        flow = self.convflow1(flow)
+        flow = self.convflow2(flow)
+
+        corr_flow = torch.cat([corr, flow], dim=1)
+        corr_flow = self.conv(corr_flow)
+        return torch.cat([corr_flow, flow_orig], dim=1)
+
+
+class ConvGRU(nn.Module):
+    """Convolutional Gru unit."""
+
+    def __init__(self, *, input_size, hidden_size, kernel_size, padding):
+        super().__init__()
+        self.convz = nn.Conv2d(hidden_size + input_size, hidden_size, kernel_size=kernel_size, padding=padding)
+        self.convr = nn.Conv2d(hidden_size + input_size, hidden_size, kernel_size=kernel_size, padding=padding)
+        self.convq = nn.Conv2d(hidden_size + input_size, hidden_size, kernel_size=kernel_size, padding=padding)
+
+    def forward(self, h, x):
+        hx = torch.cat([h, x], dim=1)
+        z = torch.sigmoid(self.convz(hx))
+        r = torch.sigmoid(self.convr(hx))
+        q = torch.tanh(self.convq(torch.cat([r * h, x], dim=1)))
+        h = (1 - z) * h + z * q
+        return h
+
+
+def _pass_through_h(h, _):
+    # Declared here for torchscript
+    return h
+
+
+class RecurrentBlock(nn.Module):
+    """Recurrent block, part of the update block.
+
+    Takes the current hidden state and the concatenation of (motion encoder output, context) as input.
+    Returns an updated hidden state.
+    """
+
+    def __init__(self, *, input_size, hidden_size, kernel_size=((1, 5), (5, 1)), padding=((0, 2), (2, 0))):
+        super().__init__()
+
+        if len(kernel_size) != len(padding):
+            raise ValueError(
+                f"kernel_size should have the same length as padding, instead got len(kernel_size) = {len(kernel_size)} and len(padding) = {len(padding)}"
+            )
+        if len(kernel_size) not in (1, 2):
+            raise ValueError(f"kernel_size should either 1 or 2, instead got {len(kernel_size)}")
+
+        self.convgru1 = ConvGRU(
+            input_size=input_size, hidden_size=hidden_size, kernel_size=kernel_size[0], padding=padding[0]
+        )
+        if len(kernel_size) == 2:
+            self.convgru2 = ConvGRU(
+                input_size=input_size, hidden_size=hidden_size, kernel_size=kernel_size[1], padding=padding[1]
+            )
+        else:
+            self.convgru2 = _pass_through_h
+
+        self.hidden_size = hidden_size
+
+    def forward(self, h, x):
+        h = self.convgru1(h, x)
+        h = self.convgru2(h, x)
+        return h
+
+
+class FlowHead(nn.Module):
+    """Flow head, part of the update block.
+
+    Takes the hidden state of the recurrent unit as input, and outputs the predicted "delta flow".
+    """
+
+    def __init__(self, *, in_channels, hidden_size):
+        super().__init__()
+        self.conv1 = nn.Conv2d(in_channels, hidden_size, 3, padding=1)
+        self.conv2 = nn.Conv2d(hidden_size, 2, 3, padding=1)
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        return self.conv2(self.relu(self.conv1(x)))
+
+
+class UpdateBlock(nn.Module):
+    """The update block which contains the motion encoder, the recurrent block, and the flow head.
+
+    It must expose a ``hidden_state_size`` attribute which is the hidden state size of its recurrent block.
+    """
+
+    def __init__(self, *, motion_encoder, recurrent_block, flow_head):
+        super().__init__()
+        self.motion_encoder = motion_encoder
+        self.recurrent_block = recurrent_block
+        self.flow_head = flow_head
+
+        self.hidden_state_size = recurrent_block.hidden_size
+
+    def forward(self, hidden_state, context, corr_features, flow):
+        motion_features = self.motion_encoder(flow, corr_features)
+        x = torch.cat([context, motion_features], dim=1)
+
+        hidden_state = self.recurrent_block(hidden_state, x)
+        delta_flow = self.flow_head(hidden_state)
+        return hidden_state, delta_flow
+
+
+class MaskPredictor(nn.Module):
+    """Mask predictor to be used when upsampling the predicted flow.
+
+    It takes the hidden state of the recurrent unit as input and outputs the mask.
+    This is not used in the raft-small model.
+    """
+
+    def __init__(self, *, in_channels, hidden_size, multiplier=0.25):
+        super().__init__()
+        self.convrelu = Conv2dNormActivation(in_channels, hidden_size, norm_layer=None, kernel_size=3)
+        # 8 * 8 * 9 because the predicted flow is downsampled by 8, from the downsampling of the initial FeatureEncoder,
+        # and we interpolate with all 9 surrounding neighbors. See paper and appendix B.
+        self.conv = nn.Conv2d(hidden_size, 8 * 8 * 9, 1, padding=0)
+
+        # In the original code, they use a factor of 0.25 to "downweight the gradients" of that branch.
+        # See e.g. https://github.com/princeton-vl/RAFT/issues/119#issuecomment-953950419
+        # or https://github.com/princeton-vl/RAFT/issues/24.
+        # It doesn't seem to affect epe significantly and can likely be set to 1.
+        self.multiplier = multiplier
+
+    def forward(self, x):
+        x = self.convrelu(x)
+        x = self.conv(x)
+        return self.multiplier * x
+
+
+class CorrBlock(nn.Module):
+    """The correlation block.
+
+    Creates a correlation pyramid with ``num_levels`` levels from the outputs of the feature encoder,
+    and then indexes from this pyramid to create correlation features.
+    The "indexing" of a given centroid pixel x' is done by concatenating its surrounding neighbors that
+    are within a ``radius``, according to the infinity norm (see paper section 3.2).
+    Note: typo in the paper, it should be infinity norm, not 1-norm.
+    """
+
+    def __init__(self, *, num_levels: int = 4, radius: int = 4):
+        super().__init__()
+        self.num_levels = num_levels
+        self.radius = radius
+
+        self.corr_pyramid: list[Tensor] = [torch.tensor(0)]  # useless, but torchscript is otherwise confused :')
+
+        # The neighborhood of a centroid pixel x' is {x' + delta, ||delta||_inf <= radius}
+        # so it's a square surrounding x', and its sides have a length of 2 * radius + 1
+        # The paper claims that it's ||.||_1 instead of ||.||_inf but it's a typo:
+        # https://github.com/princeton-vl/RAFT/issues/122
+        self.out_channels = num_levels * (2 * radius + 1) ** 2
+
+    def build_pyramid(self, fmap1, fmap2):
+        """Build the correlation pyramid from two feature maps.
+
+        The correlation volume is first computed as the dot product of each pair (pixel_in_fmap1, pixel_in_fmap2)
+        The last 2 dimensions of the correlation volume are then pooled num_levels times at different resolutions
+        to build the correlation pyramid.
+        """
+
+        if fmap1.shape != fmap2.shape:
+            raise ValueError(
+                f"Input feature maps should have the same shape, instead got {fmap1.shape} (fmap1.shape) != {fmap2.shape} (fmap2.shape)"
+            )
+
+        # Explaining min_fmap_size below: the fmaps are down-sampled (num_levels - 1) times by a factor of 2.
+        # The last corr_volume most have at least 2 values (hence the 2* factor), otherwise grid_sample() would
+        # produce nans in its output.
+        min_fmap_size = 2 * (2 ** (self.num_levels - 1))
+        if any(fmap_size < min_fmap_size for fmap_size in fmap1.shape[-2:]):
+            raise ValueError(
+                "Feature maps are too small to be down-sampled by the correlation pyramid. "
+                f"H and W of feature maps should be at least {min_fmap_size}; got: {fmap1.shape[-2:]}. "
+                "Remember that input images to the model are downsampled by 8, so that means their "
+                f"dimensions should be at least 8 * {min_fmap_size} = {8 * min_fmap_size}."
+            )
+
+        corr_volume = self._compute_corr_volume(fmap1, fmap2)
+
+        batch_size, h, w, num_channels, _, _ = corr_volume.shape  # _, _ = h, w
+        corr_volume = corr_volume.reshape(batch_size * h * w, num_channels, h, w)
+        self.corr_pyramid = [corr_volume]
+        for _ in range(self.num_levels - 1):
+            corr_volume = F.avg_pool2d(corr_volume, kernel_size=2, stride=2)
+            self.corr_pyramid.append(corr_volume)
+
+    def index_pyramid(self, centroids_coords):
+        """Return correlation features by indexing from the pyramid."""
+        neighborhood_side_len = 2 * self.radius + 1  # see note in __init__ about out_channels
+        di = torch.linspace(-self.radius, self.radius, neighborhood_side_len)
+        dj = torch.linspace(-self.radius, self.radius, neighborhood_side_len)
+        delta = torch.stack(torch.meshgrid(di, dj, indexing="ij"), dim=-1).to(centroids_coords.device)
+        delta = delta.view(1, neighborhood_side_len, neighborhood_side_len, 2)
+
+        batch_size, _, h, w = centroids_coords.shape  # _ = 2
+        centroids_coords = centroids_coords.permute(0, 2, 3, 1).reshape(batch_size * h * w, 1, 1, 2)
+
+        indexed_pyramid = []
+        for corr_volume in self.corr_pyramid:
+            sampling_coords = centroids_coords + delta  # end shape is (batch_size * h * w, side_len, side_len, 2)
+            indexed_corr_volume = grid_sample(corr_volume, sampling_coords, align_corners=True, mode="bilinear").view(
+                batch_size, h, w, -1
+            )
+            indexed_pyramid.append(indexed_corr_volume)
+            centroids_coords = centroids_coords / 2
+
+        corr_features = torch.cat(indexed_pyramid, dim=-1).permute(0, 3, 1, 2).contiguous()
+
+        expected_output_shape = (batch_size, self.out_channels, h, w)
+        if corr_features.shape != expected_output_shape:
+            raise ValueError(
+                f"Output shape of index pyramid is incorrect. Should be {expected_output_shape}, got {corr_features.shape}"
+            )
+
+        return corr_features
+
+    def _compute_corr_volume(self, fmap1, fmap2):
+        batch_size, num_channels, h, w = fmap1.shape
+        fmap1 = fmap1.view(batch_size, num_channels, h * w)
+        fmap2 = fmap2.view(batch_size, num_channels, h * w)
+
+        corr = torch.matmul(fmap1.transpose(1, 2), fmap2)
+        corr = corr.view(batch_size, h, w, 1, h, w)
+        return corr / torch.sqrt(torch.tensor(num_channels))
+
+
+class RAFT(nn.Module):
+    def __init__(self, *, feature_encoder, context_encoder, corr_block, update_block, mask_predictor=None):
+        """RAFT model from
+        `RAFT: Recurrent All Pairs Field Transforms for Optical Flow <https://arxiv.org/abs/2003.12039>`_.
+
+        args:
+            feature_encoder (nn.Module): The feature encoder. It must downsample the input by 8.
+                Its input is the concatenation of ``image1`` and ``image2``.
+            context_encoder (nn.Module): The context encoder. It must downsample the input by 8.
+                Its input is ``image1``. As in the original implementation, its output will be split into 2 parts:
+
+                - one part will be used as the actual "context", passed to the recurrent unit of the ``update_block``
+                - one part will be used to initialize the hidden state of the recurrent unit of
+                  the ``update_block``
+
+                These 2 parts are split according to the ``hidden_state_size`` of the ``update_block``, so the output
+                of the ``context_encoder`` must be strictly greater than ``hidden_state_size``.
+
+            corr_block (nn.Module): The correlation block, which creates a correlation pyramid from the output of the
+                ``feature_encoder``, and then indexes from this pyramid to create correlation features. It must expose
+                2 methods:
+
+                - a ``build_pyramid`` method that takes ``feature_map_1`` and ``feature_map_2`` as input (these are the
+                  output of the ``feature_encoder``).
+                - a ``index_pyramid`` method that takes the coordinates of the centroid pixels as input, and returns
+                  the correlation features. See paper section 3.2.
+
+                It must expose an ``out_channels`` attribute.
+
+            update_block (nn.Module): The update block, which contains the motion encoder, the recurrent unit, and the
+                flow head. It takes as input the hidden state of its recurrent unit, the context, the correlation
+                features, and the current predicted flow. It outputs an updated hidden state, and the ``delta_flow``
+                prediction (see paper appendix A). It must expose a ``hidden_state_size`` attribute.
+            mask_predictor (nn.Module, optional): Predicts the mask that will be used to upsample the predicted flow.
+                The output channel must be 8 * 8 * 9 - see paper section 3.3, and Appendix B.
+                If ``None`` (default), the flow is upsampled using interpolation.
+        """
+        super().__init__()
+        _log_api_usage_once(self)
+
+        self.feature_encoder = feature_encoder
+        self.context_encoder = context_encoder
+        self.corr_block = corr_block
+        self.update_block = update_block
+
+        self.mask_predictor = mask_predictor
+
+        if not hasattr(self.update_block, "hidden_state_size"):
+            raise ValueError("The update_block parameter should expose a 'hidden_state_size' attribute.")
+
+    def forward(self, image1, image2, num_flow_updates: int = 12):
+
+        batch_size, _, h, w = image1.shape
+        if (h, w) != image2.shape[-2:]:
+            raise ValueError(f"input images should have the same shape, instead got ({h}, {w}) != {image2.shape[-2:]}")
+        if not ((h % 8 == 0) and (w % 8 == 0)):
+            raise ValueError(f"input image H and W should be divisible by 8, instead got {h} (h) and {w} (w)")
+
+        fmaps = self.feature_encoder(torch.cat([image1, image2], dim=0))
+        fmap1, fmap2 = torch.chunk(fmaps, chunks=2, dim=0)
+        if fmap1.shape[-2:] != (h // 8, w // 8):
+            raise ValueError("The feature encoder should downsample H and W by 8")
+
+        self.corr_block.build_pyramid(fmap1, fmap2)
+
+        context_out = self.context_encoder(image1)
+        if context_out.shape[-2:] != (h // 8, w // 8):
+            raise ValueError("The context encoder should downsample H and W by 8")
+
+        # As in the original paper, the actual output of the context encoder is split in 2 parts:
+        # - one part is used to initialize the hidden state of the recurent units of the update block
+        # - the rest is the "actual" context.
+        hidden_state_size = self.update_block.hidden_state_size
+        out_channels_context = context_out.shape[1] - hidden_state_size
+        if out_channels_context <= 0:
+            raise ValueError(
+                f"The context encoder outputs {context_out.shape[1]} channels, but it should have at strictly more than hidden_state={hidden_state_size} channels"
+            )
+        hidden_state, context = torch.split(context_out, [hidden_state_size, out_channels_context], dim=1)
+        hidden_state = torch.tanh(hidden_state)
+        context = F.relu(context)
+
+        coords0 = make_coords_grid(batch_size, h // 8, w // 8).to(fmap1.device)
+        coords1 = make_coords_grid(batch_size, h // 8, w // 8).to(fmap1.device)
+
+        flow_predictions = []
+        for _ in range(num_flow_updates):
+            coords1 = coords1.detach()  # Don't backpropagate gradients through this branch, see paper
+            corr_features = self.corr_block.index_pyramid(centroids_coords=coords1)
+
+            flow = coords1 - coords0
+            hidden_state, delta_flow = self.update_block(hidden_state, context, corr_features, flow)
+
+            coords1 = coords1 + delta_flow
+
+            up_mask = None if self.mask_predictor is None else self.mask_predictor(hidden_state)
+            upsampled_flow = upsample_flow(flow=(coords1 - coords0), up_mask=up_mask)
+            flow_predictions.append(upsampled_flow)
+
+        return flow_predictions
+
+
+# Metadata merged into the ``meta`` dict of every RAFT weights entry declared below.
+# NOTE(review): 128 equals the smallest input accepted by a 4-level CorrBlock, which both
+# builders use (8 * 2 * 2 ** (4 - 1), see CorrBlock.build_pyramid) -- presumably the origin of
+# this value; confirm before changing it.
+_COMMON_META = {
+    "min_size": (128, 128),
+}
+
+
+class Raft_Large_Weights(WeightsEnum):
+    """The metrics reported here are as follows.
+
+    ``epe`` is the "end-point-error" and indicates how far (in pixels) the
+    predicted flow is from its true value. This is averaged over all pixels
+    of all images. ``per_image_epe`` is similar, but the average is different:
+    the epe is first computed on each image independently, and then averaged
+    over all images. This corresponds to "Fl-epe" (sometimes written "F1-epe")
+    in the original paper, and it's only used on Kitti. ``fl-all`` is also a
+    Kitti-specific metric, defined by the author of the dataset and used for the
+    Kitti leaderboard. It corresponds to the average of pixels whose epe is
+    either <3px, or <5% of flow's 2-norm.
+    """
+
+    # Naming of the entries, as described by their ``_docs`` and ``recipe`` fields:
+    # - C_T: trained on FlyingChairs + FlyingThings3D.
+    # - SKHT: additionally fine-tuned on the Sintel mix (Sintel, KittiFlow, HD1K, FlyingThings3D clean pass).
+    # - K: additionally fine-tuned on KittiFlow.
+    # - V1: weights ported from https://github.com/princeton-vl/RAFT.
+    # - V2: weights trained from scratch with the torchvision reference recipe.
+
+    C_T_V1 = Weights(
+        # Weights ported from https://github.com/princeton-vl/RAFT
+        url="https://download.pytorch.org/models/raft_large_C_T_V1-22a6c225.pth",
+        transforms=OpticalFlow,
+        meta={
+            **_COMMON_META,
+            "num_params": 5257536,
+            "recipe": "https://github.com/princeton-vl/RAFT",
+            "_metrics": {
+                "Sintel-Train-Cleanpass": {"epe": 1.4411},
+                "Sintel-Train-Finalpass": {"epe": 2.7894},
+                "Kitti-Train": {"per_image_epe": 5.0172, "fl_all": 17.4506},
+            },
+            "_ops": 211.007,
+            "_file_size": 20.129,
+            "_docs": """These weights were ported from the original paper. They
+            are trained on :class:`~torchvision.datasets.FlyingChairs` +
+            :class:`~torchvision.datasets.FlyingThings3D`.""",
+        },
+    )
+
+    # Same training data as C_T_V1, but trained from scratch.
+    C_T_V2 = Weights(
+        url="https://download.pytorch.org/models/raft_large_C_T_V2-1bb1363a.pth",
+        transforms=OpticalFlow,
+        meta={
+            **_COMMON_META,
+            "num_params": 5257536,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/optical_flow",
+            "_metrics": {
+                "Sintel-Train-Cleanpass": {"epe": 1.3822},
+                "Sintel-Train-Finalpass": {"epe": 2.7161},
+                "Kitti-Train": {"per_image_epe": 4.5118, "fl_all": 16.0679},
+            },
+            "_ops": 211.007,
+            "_file_size": 20.129,
+            "_docs": """These weights were trained from scratch on
+            :class:`~torchvision.datasets.FlyingChairs` +
+            :class:`~torchvision.datasets.FlyingThings3D`.""",
+        },
+    )
+
+    # Sintel fine-tuned weights: metrics are reported on the Sintel test passes.
+    C_T_SKHT_V1 = Weights(
+        # Weights ported from https://github.com/princeton-vl/RAFT
+        url="https://download.pytorch.org/models/raft_large_C_T_SKHT_V1-0b8c9e55.pth",
+        transforms=OpticalFlow,
+        meta={
+            **_COMMON_META,
+            "num_params": 5257536,
+            "recipe": "https://github.com/princeton-vl/RAFT",
+            "_metrics": {
+                "Sintel-Test-Cleanpass": {"epe": 1.94},
+                "Sintel-Test-Finalpass": {"epe": 3.18},
+            },
+            "_ops": 211.007,
+            "_file_size": 20.129,
+            "_docs": """
+                These weights were ported from the original paper. They are
+                trained on :class:`~torchvision.datasets.FlyingChairs` +
+                :class:`~torchvision.datasets.FlyingThings3D` and fine-tuned on
+                Sintel. The Sintel fine-tuning step is a combination of
+                :class:`~torchvision.datasets.Sintel`,
+                :class:`~torchvision.datasets.KittiFlow`,
+                :class:`~torchvision.datasets.HD1K`, and
+                :class:`~torchvision.datasets.FlyingThings3D` (clean pass).
+            """,
+        },
+    )
+
+    C_T_SKHT_V2 = Weights(
+        url="https://download.pytorch.org/models/raft_large_C_T_SKHT_V2-ff5fadd5.pth",
+        transforms=OpticalFlow,
+        meta={
+            **_COMMON_META,
+            "num_params": 5257536,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/optical_flow",
+            "_metrics": {
+                "Sintel-Test-Cleanpass": {"epe": 1.819},
+                "Sintel-Test-Finalpass": {"epe": 3.067},
+            },
+            "_ops": 211.007,
+            "_file_size": 20.129,
+            "_docs": """
+                These weights were trained from scratch. They are
+                pre-trained on :class:`~torchvision.datasets.FlyingChairs` +
+                :class:`~torchvision.datasets.FlyingThings3D` and then
+                fine-tuned on Sintel. The Sintel fine-tuning step is a
+                combination of :class:`~torchvision.datasets.Sintel`,
+                :class:`~torchvision.datasets.KittiFlow`,
+                :class:`~torchvision.datasets.HD1K`, and
+                :class:`~torchvision.datasets.FlyingThings3D` (clean pass).
+            """,
+        },
+    )
+
+    # Kitti fine-tuned weights: the only reported metric is fl_all on the Kitti test set.
+    C_T_SKHT_K_V1 = Weights(
+        # Weights ported from https://github.com/princeton-vl/RAFT
+        url="https://download.pytorch.org/models/raft_large_C_T_SKHT_K_V1-4a6a5039.pth",
+        transforms=OpticalFlow,
+        meta={
+            **_COMMON_META,
+            "num_params": 5257536,
+            "recipe": "https://github.com/princeton-vl/RAFT",
+            "_metrics": {
+                "Kitti-Test": {"fl_all": 5.10},
+            },
+            "_ops": 211.007,
+            "_file_size": 20.129,
+            "_docs": """
+                These weights were ported from the original paper. They are
+                pre-trained on :class:`~torchvision.datasets.FlyingChairs` +
+                :class:`~torchvision.datasets.FlyingThings3D`,
+                fine-tuned on Sintel, and then fine-tuned on
+                :class:`~torchvision.datasets.KittiFlow`. The Sintel fine-tuning
+                step was described above.
+            """,
+        },
+    )
+
+    C_T_SKHT_K_V2 = Weights(
+        url="https://download.pytorch.org/models/raft_large_C_T_SKHT_K_V2-b5c70766.pth",
+        transforms=OpticalFlow,
+        meta={
+            **_COMMON_META,
+            "num_params": 5257536,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/optical_flow",
+            "_metrics": {
+                "Kitti-Test": {"fl_all": 5.19},
+            },
+            "_ops": 211.007,
+            "_file_size": 20.129,
+            "_docs": """
+                These weights were trained from scratch. They are
+                pre-trained on :class:`~torchvision.datasets.FlyingChairs` +
+                :class:`~torchvision.datasets.FlyingThings3D`,
+                fine-tuned on Sintel, and then fine-tuned on
+                :class:`~torchvision.datasets.KittiFlow`. The Sintel fine-tuning
+                step was described above.
+            """,
+        },
+    )
+
+    # Alias for the Sintel fine-tuned weights trained from scratch.
+    DEFAULT = C_T_SKHT_V2
+
+
+class Raft_Small_Weights(WeightsEnum):
+    """The metrics reported here are as follows.
+
+    ``epe`` is the "end-point-error" and indicates how far (in pixels) the
+    predicted flow is from its true value. This is averaged over all pixels
+    of all images. ``per_image_epe`` is similar, but the average is different:
+    the epe is first computed on each image independently, and then averaged
+    over all images. This corresponds to "Fl-epe" (sometimes written "F1-epe")
+    in the original paper, and it's only used on Kitti. ``fl-all`` is also a
+    Kitti-specific metric, defined by the author of the dataset and used for the
+    Kitti leaderboard. It corresponds to the average of pixels whose epe is
+    either <3px, or <5% of flow's 2-norm.
+    """
+
+    # Naming of the entries, as described by their ``_docs`` and ``recipe`` fields:
+    # - C_T: trained on FlyingChairs + FlyingThings3D.
+    # - V1: weights ported from https://github.com/princeton-vl/RAFT.
+    # - V2: weights trained from scratch with the torchvision reference recipe.
+
+    C_T_V1 = Weights(
+        # Weights ported from https://github.com/princeton-vl/RAFT
+        url="https://download.pytorch.org/models/raft_small_C_T_V1-ad48884c.pth",
+        transforms=OpticalFlow,
+        meta={
+            **_COMMON_META,
+            "num_params": 990162,
+            "recipe": "https://github.com/princeton-vl/RAFT",
+            "_metrics": {
+                "Sintel-Train-Cleanpass": {"epe": 2.1231},
+                "Sintel-Train-Finalpass": {"epe": 3.2790},
+                "Kitti-Train": {"per_image_epe": 7.6557, "fl_all": 25.2801},
+            },
+            "_ops": 47.655,
+            "_file_size": 3.821,
+            "_docs": """These weights were ported from the original paper. They
+            are trained on :class:`~torchvision.datasets.FlyingChairs` +
+            :class:`~torchvision.datasets.FlyingThings3D`.""",
+        },
+    )
+    # Same training data as C_T_V1, but trained from scratch.
+    C_T_V2 = Weights(
+        url="https://download.pytorch.org/models/raft_small_C_T_V2-01064c6d.pth",
+        transforms=OpticalFlow,
+        meta={
+            **_COMMON_META,
+            "num_params": 990162,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/optical_flow",
+            "_metrics": {
+                "Sintel-Train-Cleanpass": {"epe": 1.9901},
+                "Sintel-Train-Finalpass": {"epe": 3.2831},
+                "Kitti-Train": {"per_image_epe": 7.5978, "fl_all": 25.2369},
+            },
+            "_ops": 47.655,
+            "_file_size": 3.821,
+            "_docs": """These weights were trained from scratch on
+            :class:`~torchvision.datasets.FlyingChairs` +
+            :class:`~torchvision.datasets.FlyingThings3D`.""",
+        },
+    )
+
+    # Alias for the weights trained from scratch.
+    DEFAULT = C_T_V2
+
+
+def _raft(
+    *,
+    weights=None,
+    progress=False,
+    # Feature encoder
+    feature_encoder_layers,
+    feature_encoder_block,
+    feature_encoder_norm_layer,
+    # Context encoder
+    context_encoder_layers,
+    context_encoder_block,
+    context_encoder_norm_layer,
+    # Correlation block
+    corr_block_num_levels,
+    corr_block_radius,
+    # Motion encoder
+    motion_encoder_corr_layers,
+    motion_encoder_flow_layers,
+    motion_encoder_out_channels,
+    # Recurrent block
+    recurrent_block_hidden_state_size,
+    recurrent_block_kernel_size,
+    recurrent_block_padding,
+    # Flow Head
+    flow_head_hidden_size,
+    # Mask predictor
+    use_mask_predictor,
+    **kwargs,
+):
+    feature_encoder = kwargs.pop("feature_encoder", None) or FeatureEncoder(
+        block=feature_encoder_block, layers=feature_encoder_layers, norm_layer=feature_encoder_norm_layer
+    )
+    context_encoder = kwargs.pop("context_encoder", None) or FeatureEncoder(
+        block=context_encoder_block, layers=context_encoder_layers, norm_layer=context_encoder_norm_layer
+    )
+
+    corr_block = kwargs.pop("corr_block", None) or CorrBlock(num_levels=corr_block_num_levels, radius=corr_block_radius)
+
+    update_block = kwargs.pop("update_block", None)
+    if update_block is None:
+        motion_encoder = MotionEncoder(
+            in_channels_corr=corr_block.out_channels,
+            corr_layers=motion_encoder_corr_layers,
+            flow_layers=motion_encoder_flow_layers,
+            out_channels=motion_encoder_out_channels,
+        )
+
+        # See comments in forward pass of RAFT class about why we split the output of the context encoder
+        out_channels_context = context_encoder_layers[-1] - recurrent_block_hidden_state_size
+        recurrent_block = RecurrentBlock(
+            input_size=motion_encoder.out_channels + out_channels_context,
+            hidden_size=recurrent_block_hidden_state_size,
+            kernel_size=recurrent_block_kernel_size,
+            padding=recurrent_block_padding,
+        )
+
+        flow_head = FlowHead(in_channels=recurrent_block_hidden_state_size, hidden_size=flow_head_hidden_size)
+
+        update_block = UpdateBlock(motion_encoder=motion_encoder, recurrent_block=recurrent_block, flow_head=flow_head)
+
+    mask_predictor = kwargs.pop("mask_predictor", None)
+    if mask_predictor is None and use_mask_predictor:
+        mask_predictor = MaskPredictor(
+            in_channels=recurrent_block_hidden_state_size,
+            hidden_size=256,
+            multiplier=0.25,  # See comment in MaskPredictor about this
+        )
+
+    model = RAFT(
+        feature_encoder=feature_encoder,
+        context_encoder=context_encoder,
+        corr_block=corr_block,
+        update_block=update_block,
+        mask_predictor=mask_predictor,
+        **kwargs,  # not really needed, all params should be consumed by now
+    )
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+
+    return model
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", Raft_Large_Weights.C_T_SKHT_V2))
+def raft_large(*, weights: Optional[Raft_Large_Weights] = None, progress=True, **kwargs) -> RAFT:
+    """RAFT model from
+    `RAFT: Recurrent All Pairs Field Transforms for Optical Flow <https://arxiv.org/abs/2003.12039>`_.
+
+    Please see the example below for a tutorial on how to use this model.
+
+    Args:
+        weights(:class:`~torchvision.models.optical_flow.Raft_Large_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.optical_flow.Raft_Large_Weights`
+            below for more details, and possible values. By default, no
+            pre-trained weights are used.
+        progress (bool): If True, displays a progress bar of the download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.optical_flow.RAFT``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/optical_flow/raft.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.optical_flow.Raft_Large_Weights
+        :members:
+    """
+
+    weights = Raft_Large_Weights.verify(weights)
+
+    return _raft(
+        weights=weights,
+        progress=progress,
+        # Feature encoder
+        feature_encoder_layers=(64, 64, 96, 128, 256),
+        feature_encoder_block=ResidualBlock,
+        feature_encoder_norm_layer=InstanceNorm2d,
+        # Context encoder
+        context_encoder_layers=(64, 64, 96, 128, 256),
+        context_encoder_block=ResidualBlock,
+        context_encoder_norm_layer=BatchNorm2d,
+        # Correlation block
+        corr_block_num_levels=4,
+        corr_block_radius=4,
+        # Motion encoder
+        motion_encoder_corr_layers=(256, 192),
+        motion_encoder_flow_layers=(128, 64),
+        motion_encoder_out_channels=128,
+        # Recurrent block
+        recurrent_block_hidden_state_size=128,
+        recurrent_block_kernel_size=((1, 5), (5, 1)),
+        recurrent_block_padding=((0, 2), (2, 0)),
+        # Flow head
+        flow_head_hidden_size=256,
+        # Mask predictor
+        use_mask_predictor=True,
+        **kwargs,
+    )
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", Raft_Small_Weights.C_T_V2))
+def raft_small(*, weights: Optional[Raft_Small_Weights] = None, progress=True, **kwargs) -> RAFT:
+    """RAFT "small" model from
+    `RAFT: Recurrent All Pairs Field Transforms for Optical Flow <https://arxiv.org/abs/2003.12039>`__.
+
+    Please see the example below for a tutorial on how to use this model.
+
+    Args:
+        weights(:class:`~torchvision.models.optical_flow.Raft_Small_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.optical_flow.Raft_Small_Weights`
+            below for more details, and possible values. By default, no
+            pre-trained weights are used.
+        progress (bool): If True, displays a progress bar of the download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.optical_flow.RAFT``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/optical_flow/raft.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.optical_flow.Raft_Small_Weights
+        :members:
+    """
+    weights = Raft_Small_Weights.verify(weights)
+
+    return _raft(
+        weights=weights,
+        progress=progress,
+        # Feature encoder
+        feature_encoder_layers=(32, 32, 64, 96, 128),
+        feature_encoder_block=BottleneckBlock,
+        feature_encoder_norm_layer=InstanceNorm2d,
+        # Context encoder
+        context_encoder_layers=(32, 32, 64, 96, 160),
+        context_encoder_block=BottleneckBlock,
+        context_encoder_norm_layer=None,
+        # Correlation block
+        corr_block_num_levels=4,
+        corr_block_radius=3,
+        # Motion encoder
+        motion_encoder_corr_layers=(96,),
+        motion_encoder_flow_layers=(64, 32),
+        motion_encoder_out_channels=82,
+        # Recurrent block
+        recurrent_block_hidden_state_size=96,
+        recurrent_block_kernel_size=(3,),
+        recurrent_block_padding=(1,),
+        # Flow head
+        flow_head_hidden_size=128,
+        # Mask predictor
+        use_mask_predictor=False,
+        **kwargs,
+    )
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..da8bbba3567b0b9110429354d89b65ec679a2fd5
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__init__.py
@@ -0,0 +1,5 @@
+from .googlenet import *
+from .inception import *
+from .mobilenet import *
+from .resnet import *
+from .shufflenetv2 import *
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d74a515e76499ed3f8e713f13027483fb9c77a28
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/googlenet.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/googlenet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b35d7db17d2b8365ea571d05966feff02bdb6ae5
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/googlenet.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/inception.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/inception.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..444d75a0adb78bd04d19ef2a5185d315f269d1ca
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/inception.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/mobilenet.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/mobilenet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c36724bffd35f0f327535d0e6b4a79dea0b1f785
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/mobilenet.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/mobilenetv2.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/mobilenetv2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fe449a0abebe77a8a4f5ac12e01d9ed4c5f5fa43
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/mobilenetv2.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/mobilenetv3.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/mobilenetv3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..274d40f5be76d7592dc12f49fc83d5aa27fcd1a7
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/mobilenetv3.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/resnet.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/resnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9fd4ea182cbcfd9634b9113072335b4ace6e25ed
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/resnet.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/shufflenetv2.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/shufflenetv2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..142d6780ef998b6922fcdcfe420b9816b519ec7e
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/shufflenetv2.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..00c3b4d15570d92619d6615dfd0cb0fa12bf4324
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/__pycache__/utils.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/googlenet.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/googlenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..49ec1a340dd70cf0a03f101e6b4efa6229c5431b
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/googlenet.py
@@ -0,0 +1,212 @@
+import warnings
+from functools import partial
+from typing import Any, Optional, Union
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+from torch.nn import functional as F
+
+from ...transforms._presets import ImageClassification
+from .._api import register_model, Weights, WeightsEnum
+from .._meta import _IMAGENET_CATEGORIES
+from .._utils import _ovewrite_named_param, handle_legacy_interface
+from ..googlenet import BasicConv2d, GoogLeNet, GoogLeNet_Weights, GoogLeNetOutputs, Inception, InceptionAux
+from .utils import _fuse_modules, _replace_relu, quantize_model
+
+
+__all__ = [
+    "QuantizableGoogLeNet",
+    "GoogLeNet_QuantizedWeights",
+    "googlenet",
+]
+
+
+class QuantizableBasicConv2d(BasicConv2d):
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+        self.relu = nn.ReLU()
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = self.conv(x)
+        x = self.bn(x)
+        x = self.relu(x)
+        return x
+
+    def fuse_model(self, is_qat: Optional[bool] = None) -> None:
+        _fuse_modules(self, ["conv", "bn", "relu"], is_qat, inplace=True)
+
+
+class QuantizableInception(Inception):
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, conv_block=QuantizableBasicConv2d, **kwargs)  # type: ignore[misc]
+        self.cat = nn.quantized.FloatFunctional()
+
+    def forward(self, x: Tensor) -> Tensor:
+        outputs = self._forward(x)
+        return self.cat.cat(outputs, 1)
+
+
+class QuantizableInceptionAux(InceptionAux):
+    # TODO https://github.com/pytorch/vision/pull/4232#pullrequestreview-730461659
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, conv_block=QuantizableBasicConv2d, **kwargs)  # type: ignore[misc]
+        self.relu = nn.ReLU()
+
+    def forward(self, x: Tensor) -> Tensor:
+        # aux1: N x 512 x 14 x 14, aux2: N x 528 x 14 x 14
+        x = F.adaptive_avg_pool2d(x, (4, 4))
+        # aux1: N x 512 x 4 x 4, aux2: N x 528 x 4 x 4
+        x = self.conv(x)
+        # N x 128 x 4 x 4
+        x = torch.flatten(x, 1)
+        # N x 2048
+        x = self.relu(self.fc1(x))
+        # N x 1024
+        x = self.dropout(x)
+        # N x 1024
+        x = self.fc2(x)
+        # N x 1000 (num_classes)
+
+        return x
+
+
+class QuantizableGoogLeNet(GoogLeNet):
+    # TODO https://github.com/pytorch/vision/pull/4232#pullrequestreview-730461659
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(  # type: ignore[misc]
+            *args, blocks=[QuantizableBasicConv2d, QuantizableInception, QuantizableInceptionAux], **kwargs
+        )
+        self.quant = torch.ao.quantization.QuantStub()
+        self.dequant = torch.ao.quantization.DeQuantStub()
+
+    def forward(self, x: Tensor) -> GoogLeNetOutputs:
+        x = self._transform_input(x)
+        x = self.quant(x)  # QuantStub is applied after _transform_input and before the inherited _forward
+        x, aux1, aux2 = self._forward(x)
+        x = self.dequant(x)  # only the main output goes through DeQuantStub; aux1/aux2 are passed on as returned
+        aux_defined = self.training and self.aux_logits
+        if torch.jit.is_scripting():
+            if not aux_defined:
+                warnings.warn("Scripted QuantizableGoogleNet always returns GoogleNetOutputs Tuple")
+            return GoogLeNetOutputs(x, aux2, aux1)  # note the argument order: aux2 before aux1
+        else:
+            return self.eager_outputs(x, aux2, aux1)  # same argument order as the scripted branch
+
+    def fuse_model(self, is_qat: Optional[bool] = None) -> None:
+        r"""Fuse conv/bn/relu modules in googlenet model
+
+        Fuse conv+bn+relu/ conv+relu/conv+bn modules to prepare for quantization.
+        Model is modified in place.  Note that this operation does not change numerics
+        and the model after modification is in floating point
+        """
+
+        for m in self.modules():
+            if type(m) is QuantizableBasicConv2d:  # exact type match: subclasses are not selected here
+                m.fuse_model(is_qat)
+
+
+class GoogLeNet_QuantizedWeights(WeightsEnum):
+    IMAGENET1K_FBGEMM_V1 = Weights(
+        url="https://download.pytorch.org/models/quantized/googlenet_fbgemm-c81f6644.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            "num_params": 6624904,
+            "min_size": (15, 15),
+            "categories": _IMAGENET_CATEGORIES,  # googlenet() passes len() of this value as num_classes
+            "backend": "fbgemm",  # googlenet() forwards this to _ovewrite_named_param(kwargs, "backend", ...)
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#post-training-quantized-models",
+            "unquantized": GoogLeNet_Weights.IMAGENET1K_V1,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 69.826,
+                    "acc@5": 89.404,
+                }
+            },
+            "_ops": 1.498,
+            "_file_size": 12.618,
+            "_docs": """
+                These weights were produced by doing Post Training Quantization (eager mode) on top of the unquantized
+                weights listed below.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_FBGEMM_V1  # alias of the single entry defined above
+
+
+@register_model(name="quantized_googlenet")
+@handle_legacy_interface(
+    weights=(
+        "pretrained",
+        lambda kwargs: (
+            GoogLeNet_QuantizedWeights.IMAGENET1K_FBGEMM_V1
+            if kwargs.get("quantize", False)
+            else GoogLeNet_Weights.IMAGENET1K_V1
+        ),
+    )
+)
+def googlenet(
+    *,
+    weights: Optional[Union[GoogLeNet_QuantizedWeights, GoogLeNet_Weights]] = None,
+    progress: bool = True,
+    quantize: bool = False,
+    **kwargs: Any,
+) -> QuantizableGoogLeNet:
+    """GoogLeNet (Inception v1) model architecture from `Going Deeper with Convolutions <http://arxiv.org/abs/1409.4842>`__.
+
+    .. note::
+        Note that ``quantize = True`` returns a quantized model with 8 bit
+        weights. Quantized models only support inference and run on CPUs.
+        GPU inference is not yet supported.
+
+    Args:
+        weights (:class:`~torchvision.models.quantization.GoogLeNet_QuantizedWeights` or :class:`~torchvision.models.GoogLeNet_Weights`, optional): The
+            pretrained weights for the model. See
+            :class:`~torchvision.models.quantization.GoogLeNet_QuantizedWeights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        quantize (bool, optional): If True, return a quantized version of the model. Default is False.
+        **kwargs: parameters passed to the ``torchvision.models.quantization.QuantizableGoogLeNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/quantization/googlenet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.quantization.GoogLeNet_QuantizedWeights
+        :members:
+
+    .. autoclass:: torchvision.models.GoogLeNet_Weights
+        :members:
+        :noindex:
+    """
+    weights = (GoogLeNet_QuantizedWeights if quantize else GoogLeNet_Weights).verify(weights)  # enum picked by quantize
+
+    original_aux_logits = kwargs.get("aux_logits", False)  # caller's request, read before the helper calls below
+    if weights is not None:
+        if "transform_input" not in kwargs:  # only applied when the caller did not pass transform_input
+            _ovewrite_named_param(kwargs, "transform_input", True)
+        _ovewrite_named_param(kwargs, "aux_logits", True)
+        _ovewrite_named_param(kwargs, "init_weights", False)
+        _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+        if "backend" in weights.meta:
+            _ovewrite_named_param(kwargs, "backend", weights.meta["backend"])
+    backend = kwargs.pop("backend", "fbgemm")  # popped so it is not passed on to QuantizableGoogLeNet(**kwargs)
+
+    model = QuantizableGoogLeNet(**kwargs)
+    _replace_relu(model)
+    if quantize:  # runs before load_state_dict below
+        quantize_model(model, backend)
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+        if not original_aux_logits:  # aux heads were not requested: switch them off and drop the modules
+            model.aux_logits = False
+            model.aux1 = None  # type: ignore[assignment]
+            model.aux2 = None  # type: ignore[assignment]
+        else:
+            warnings.warn(
+                "auxiliary heads in the pretrained googlenet model are NOT pretrained, so make sure to train them"
+            )
+
+    return model
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/inception.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/inception.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6eb9370d0d6a178f63f76b2acf4266a6ac4556d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/inception.py
@@ -0,0 +1,275 @@
+import warnings
+from functools import partial
+from typing import Any, Optional, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from torchvision.models import inception as inception_module
+from torchvision.models.inception import Inception_V3_Weights, InceptionOutputs
+
+from ...transforms._presets import ImageClassification
+from .._api import register_model, Weights, WeightsEnum
+from .._meta import _IMAGENET_CATEGORIES
+from .._utils import _ovewrite_named_param, handle_legacy_interface
+from .utils import _fuse_modules, _replace_relu, quantize_model
+
+
+__all__ = [
+    "QuantizableInception3",
+    "Inception_V3_QuantizedWeights",
+    "inception_v3",
+]
+
+
+class QuantizableBasicConv2d(inception_module.BasicConv2d):
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+        self.relu = nn.ReLU()
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = self.conv(x)
+        x = self.bn(x)
+        x = self.relu(x)
+        return x
+
+    def fuse_model(self, is_qat: Optional[bool] = None) -> None:
+        _fuse_modules(self, ["conv", "bn", "relu"], is_qat, inplace=True)
+
+
+class QuantizableInceptionA(inception_module.InceptionA):
+    # TODO https://github.com/pytorch/vision/pull/4232#pullrequestreview-730461659
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, conv_block=QuantizableBasicConv2d, **kwargs)  # type: ignore[misc]
+        self.myop = nn.quantized.FloatFunctional()
+
+    def forward(self, x: Tensor) -> Tensor:
+        outputs = self._forward(x)
+        return self.myop.cat(outputs, 1)
+
+
+class QuantizableInceptionB(inception_module.InceptionB):
+    # TODO https://github.com/pytorch/vision/pull/4232#pullrequestreview-730461659
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, conv_block=QuantizableBasicConv2d, **kwargs)  # type: ignore[misc]
+        self.myop = nn.quantized.FloatFunctional()
+
+    def forward(self, x: Tensor) -> Tensor:
+        outputs = self._forward(x)
+        return self.myop.cat(outputs, 1)
+
+
+class QuantizableInceptionC(inception_module.InceptionC):
+    # TODO https://github.com/pytorch/vision/pull/4232#pullrequestreview-730461659
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, conv_block=QuantizableBasicConv2d, **kwargs)  # type: ignore[misc]
+        self.myop = nn.quantized.FloatFunctional()
+
+    def forward(self, x: Tensor) -> Tensor:
+        outputs = self._forward(x)
+        return self.myop.cat(outputs, 1)
+
+
+class QuantizableInceptionD(inception_module.InceptionD):
+    # TODO https://github.com/pytorch/vision/pull/4232#pullrequestreview-730461659
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, conv_block=QuantizableBasicConv2d, **kwargs)  # type: ignore[misc]
+        self.myop = nn.quantized.FloatFunctional()
+
+    def forward(self, x: Tensor) -> Tensor:
+        outputs = self._forward(x)
+        return self.myop.cat(outputs, 1)
+
+
+class QuantizableInceptionE(inception_module.InceptionE):
+    # TODO https://github.com/pytorch/vision/pull/4232#pullrequestreview-730461659
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, conv_block=QuantizableBasicConv2d, **kwargs)  # type: ignore[misc]
+        self.myop1 = nn.quantized.FloatFunctional()
+        self.myop2 = nn.quantized.FloatFunctional()
+        self.myop3 = nn.quantized.FloatFunctional()
+
+    def _forward(self, x: Tensor) -> list[Tensor]:
+        branch1x1 = self.branch1x1(x)
+
+        branch3x3 = self.branch3x3_1(x)
+        branch3x3 = [self.branch3x3_2a(branch3x3), self.branch3x3_2b(branch3x3)]
+        branch3x3 = self.myop1.cat(branch3x3, 1)
+
+        branch3x3dbl = self.branch3x3dbl_1(x)
+        branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
+        branch3x3dbl = [
+            self.branch3x3dbl_3a(branch3x3dbl),
+            self.branch3x3dbl_3b(branch3x3dbl),
+        ]
+        branch3x3dbl = self.myop2.cat(branch3x3dbl, 1)
+
+        branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1)
+        branch_pool = self.branch_pool(branch_pool)
+
+        outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
+        return outputs
+
+    def forward(self, x: Tensor) -> Tensor:
+        outputs = self._forward(x)
+        return self.myop3.cat(outputs, 1)
+
+
+class QuantizableInceptionAux(inception_module.InceptionAux):
+    # TODO https://github.com/pytorch/vision/pull/4232#pullrequestreview-730461659
+    def __init__(self, *args: Any, **kwargs: Any) -> None:  # only override: conv_block is fixed to QuantizableBasicConv2d
+        super().__init__(*args, conv_block=QuantizableBasicConv2d, **kwargs)  # type: ignore[misc]
+
+
+class QuantizableInception3(inception_module.Inception3):
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(  # type: ignore[misc]
+            *args,
+            inception_blocks=[  # quantizable block classes defined earlier in this module
+                QuantizableBasicConv2d,
+                QuantizableInceptionA,
+                QuantizableInceptionB,
+                QuantizableInceptionC,
+                QuantizableInceptionD,
+                QuantizableInceptionE,
+                QuantizableInceptionAux,
+            ],
+            **kwargs,
+        )
+        self.quant = torch.ao.quantization.QuantStub()
+        self.dequant = torch.ao.quantization.DeQuantStub()
+
+    def forward(self, x: Tensor) -> InceptionOutputs:
+        x = self._transform_input(x)
+        x = self.quant(x)  # QuantStub is applied after _transform_input and before the inherited _forward
+        x, aux = self._forward(x)
+        x = self.dequant(x)  # only the main output goes through DeQuantStub; aux is passed on as returned
+        aux_defined = self.training and self.aux_logits
+        if torch.jit.is_scripting():
+            if not aux_defined:
+                warnings.warn("Scripted QuantizableInception3 always returns QuantizableInception3 Tuple")
+            return InceptionOutputs(x, aux)  # the value actually returned when scripted is an InceptionOutputs
+        else:
+            return self.eager_outputs(x, aux)
+
+    def fuse_model(self, is_qat: Optional[bool] = None) -> None:
+        r"""Fuse conv/bn/relu modules in inception model
+
+        Fuse conv+bn+relu/ conv+relu/conv+bn modules to prepare for quantization.
+        Model is modified in place.  Note that this operation does not change numerics
+        and the model after modification is in floating point
+        """
+
+        for m in self.modules():
+            if type(m) is QuantizableBasicConv2d:  # exact type match: subclasses are not selected here
+                m.fuse_model(is_qat)
+
+
+class Inception_V3_QuantizedWeights(WeightsEnum):
+    IMAGENET1K_FBGEMM_V1 = Weights(
+        url="https://download.pytorch.org/models/quantized/inception_v3_google_fbgemm-a2837893.pth",
+        transforms=partial(ImageClassification, crop_size=299, resize_size=342),
+        meta={
+            "num_params": 27161264,
+            "min_size": (75, 75),
+            "categories": _IMAGENET_CATEGORIES,  # inception_v3() passes len() of this value as num_classes
+            "backend": "fbgemm",  # inception_v3() forwards this to _ovewrite_named_param(kwargs, "backend", ...)
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#post-training-quantized-models",
+            "unquantized": Inception_V3_Weights.IMAGENET1K_V1,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 77.176,
+                    "acc@5": 93.354,
+                }
+            },
+            "_ops": 5.713,
+            "_file_size": 23.146,
+            "_docs": """
+                These weights were produced by doing Post Training Quantization (eager mode) on top of the unquantized
+                weights listed below.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_FBGEMM_V1  # alias of the single entry defined above
+
+
+@register_model(name="quantized_inception_v3")
+@handle_legacy_interface(
+    weights=(
+        "pretrained",
+        lambda kwargs: (
+            Inception_V3_QuantizedWeights.IMAGENET1K_FBGEMM_V1
+            if kwargs.get("quantize", False)
+            else Inception_V3_Weights.IMAGENET1K_V1
+        ),
+    )
+)
+def inception_v3(
+    *,
+    weights: Optional[Union[Inception_V3_QuantizedWeights, Inception_V3_Weights]] = None,
+    progress: bool = True,
+    quantize: bool = False,
+    **kwargs: Any,
+) -> QuantizableInception3:
+    r"""Inception v3 model architecture from
+    `Rethinking the Inception Architecture for Computer Vision <http://arxiv.org/abs/1512.00567>`__.
+
+    .. note::
+        **Important**: In contrast to the other models the inception_v3 expects tensors with a size of
+        N x 3 x 299 x 299, so ensure your images are sized accordingly.
+
+    .. note::
+        Note that ``quantize = True`` returns a quantized model with 8 bit
+        weights. Quantized models only support inference and run on CPUs.
+        GPU inference is not yet supported.
+
+    Args:
+        weights (:class:`~torchvision.models.quantization.Inception_V3_QuantizedWeights` or :class:`~torchvision.models.Inception_V3_Weights`, optional): The pretrained
+            weights for the model. See
+            :class:`~torchvision.models.quantization.Inception_V3_QuantizedWeights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the download to stderr.
+            Default is True.
+        quantize (bool, optional): If True, return a quantized version of the model.
+            Default is False.
+        **kwargs: parameters passed to the ``torchvision.models.quantization.QuantizableInception3``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/quantization/inception.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.quantization.Inception_V3_QuantizedWeights
+        :members:
+
+    .. autoclass:: torchvision.models.Inception_V3_Weights
+        :members:
+        :noindex:
+    """
+    weights = (Inception_V3_QuantizedWeights if quantize else Inception_V3_Weights).verify(weights)  # enum picked by quantize
+
+    original_aux_logits = kwargs.get("aux_logits", False)  # caller's request, read before the helper calls below
+    if weights is not None:
+        if "transform_input" not in kwargs:  # only applied when the caller did not pass transform_input
+            _ovewrite_named_param(kwargs, "transform_input", True)
+        _ovewrite_named_param(kwargs, "aux_logits", True)
+        _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+        if "backend" in weights.meta:
+            _ovewrite_named_param(kwargs, "backend", weights.meta["backend"])
+    backend = kwargs.pop("backend", "fbgemm")  # popped so it is not passed on to QuantizableInception3(**kwargs)
+
+    model = QuantizableInception3(**kwargs)
+    _replace_relu(model)
+    if quantize:  # runs before load_state_dict below
+        quantize_model(model, backend)
+
+    if weights is not None:
+        if quantize and not original_aux_logits:  # quantized path: AuxLogits is removed before load_state_dict
+            model.aux_logits = False
+            model.AuxLogits = None
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+        if not quantize and not original_aux_logits:  # float path: AuxLogits is removed only after load_state_dict
+            model.aux_logits = False
+            model.AuxLogits = None
+
+    return model
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/mobilenet.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/mobilenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a270d14d3a4ad9eda62b68c2c01e9fdb710ef38
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/mobilenet.py
@@ -0,0 +1,6 @@
+from .mobilenetv2 import *  # noqa: F401, F403
+from .mobilenetv3 import *  # noqa: F401, F403
+from .mobilenetv2 import __all__ as mv2_all
+from .mobilenetv3 import __all__ as mv3_all
+
+__all__ = mv2_all + mv3_all
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/mobilenetv2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/mobilenetv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1cef2d94136eecae20cd33d9b1de7f34eece1bc
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/mobilenetv2.py
@@ -0,0 +1,156 @@
+from functools import partial
+from typing import Any, Optional, Union
+
+from torch import nn, Tensor
+from torch.ao.quantization import DeQuantStub, QuantStub
+from torchvision.models.mobilenetv2 import InvertedResidual, MobileNet_V2_Weights, MobileNetV2
+
+from ...ops.misc import Conv2dNormActivation
+from ...transforms._presets import ImageClassification
+from .._api import register_model, Weights, WeightsEnum
+from .._meta import _IMAGENET_CATEGORIES
+from .._utils import _ovewrite_named_param, handle_legacy_interface
+from .utils import _fuse_modules, _replace_relu, quantize_model
+
+
+__all__ = [
+    "QuantizableMobileNetV2",
+    "MobileNet_V2_QuantizedWeights",
+    "mobilenet_v2",
+]
+
+
+class QuantizableInvertedResidual(InvertedResidual):
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+        self.skip_add = nn.quantized.FloatFunctional()
+
+    def forward(self, x: Tensor) -> Tensor:
+        if self.use_res_connect:
+            return self.skip_add.add(x, self.conv(x))
+        else:
+            return self.conv(x)
+
+    def fuse_model(self, is_qat: Optional[bool] = None) -> None:
+        for idx in range(len(self.conv)):
+            if type(self.conv[idx]) is nn.Conv2d:
+                _fuse_modules(self.conv, [str(idx), str(idx + 1)], is_qat, inplace=True)
+
+
+class QuantizableMobileNetV2(MobileNetV2):
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        """
+        MobileNet V2 main class
+
+        Args:
+           Inherits args from floating point MobileNetV2
+        """
+        super().__init__(*args, **kwargs)
+        self.quant = QuantStub()
+        self.dequant = DeQuantStub()
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = self.quant(x)
+        x = self._forward_impl(x)
+        x = self.dequant(x)
+        return x
+
+    def fuse_model(self, is_qat: Optional[bool] = None) -> None:
+        for m in self.modules():
+            if type(m) is Conv2dNormActivation:
+                _fuse_modules(m, ["0", "1", "2"], is_qat, inplace=True)
+            if type(m) is QuantizableInvertedResidual:
+                m.fuse_model(is_qat)
+
+
+class MobileNet_V2_QuantizedWeights(WeightsEnum):
+    IMAGENET1K_QNNPACK_V1 = Weights(
+        url="https://download.pytorch.org/models/quantized/mobilenet_v2_qnnpack_37f702c5.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            "num_params": 3504872,
+            "min_size": (1, 1),
+            "categories": _IMAGENET_CATEGORIES,  # mobilenet_v2() passes len() of this value as num_classes
+            "backend": "qnnpack",  # mobilenet_v2() forwards this to _ovewrite_named_param(kwargs, "backend", ...)
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#qat-mobilenetv2",
+            "unquantized": MobileNet_V2_Weights.IMAGENET1K_V1,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 71.658,
+                    "acc@5": 90.150,
+                }
+            },
+            "_ops": 0.301,
+            "_file_size": 3.423,
+            "_docs": """
+                These weights were produced by doing Quantization Aware Training (eager mode) on top of the unquantized
+                weights listed below.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_QNNPACK_V1  # alias of the single entry defined above
+
+
+@register_model(name="quantized_mobilenet_v2")
+@handle_legacy_interface(
+    weights=(
+        "pretrained",
+        lambda kwargs: (
+            MobileNet_V2_QuantizedWeights.IMAGENET1K_QNNPACK_V1
+            if kwargs.get("quantize", False)
+            else MobileNet_V2_Weights.IMAGENET1K_V1
+        ),
+    )
+)
+def mobilenet_v2(
+    *,
+    weights: Optional[Union[MobileNet_V2_QuantizedWeights, MobileNet_V2_Weights]] = None,
+    progress: bool = True,
+    quantize: bool = False,
+    **kwargs: Any,
+) -> QuantizableMobileNetV2:
+    """
+    Constructs a MobileNetV2 architecture from
+    `MobileNetV2: Inverted Residuals and Linear Bottlenecks
+    <https://arxiv.org/abs/1801.04381>`_.
+
+    .. note::
+        Note that ``quantize = True`` returns a quantized model with 8 bit
+        weights. Quantized models only support inference and run on CPUs.
+        GPU inference is not yet supported.
+
+    Args:
+        weights (:class:`~torchvision.models.quantization.MobileNet_V2_QuantizedWeights` or :class:`~torchvision.models.MobileNet_V2_Weights`, optional): The
+            pretrained weights for the model. See
+            :class:`~torchvision.models.quantization.MobileNet_V2_QuantizedWeights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
+        quantize (bool, optional): If True, returns a quantized version of the model. Default is False.
+        **kwargs: parameters passed to the ``torchvision.models.quantization.QuantizableMobileNetV2``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/quantization/mobilenetv2.py>`_
+            for more details about this class.
+    .. autoclass:: torchvision.models.quantization.MobileNet_V2_QuantizedWeights
+        :members:
+    .. autoclass:: torchvision.models.MobileNet_V2_Weights
+        :members:
+        :noindex:
+    """
+    weights = (MobileNet_V2_QuantizedWeights if quantize else MobileNet_V2_Weights).verify(weights)  # enum picked by quantize
+
+    if weights is not None:
+        _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+        if "backend" in weights.meta:
+            _ovewrite_named_param(kwargs, "backend", weights.meta["backend"])
+    backend = kwargs.pop("backend", "qnnpack")  # popped so it is not passed on to QuantizableMobileNetV2(**kwargs)
+
+    model = QuantizableMobileNetV2(block=QuantizableInvertedResidual, **kwargs)  # block class is always the quantizable one
+    _replace_relu(model)
+    if quantize:  # runs before load_state_dict below
+        quantize_model(model, backend)
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+
+    return model
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/mobilenetv3.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/mobilenetv3.py
new file mode 100644
index 0000000000000000000000000000000000000000..7431b07df85d930c411979691c865b160316bd7b
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/mobilenetv3.py
@@ -0,0 +1,239 @@
+from functools import partial
+from typing import Any, Optional, Union
+
+import torch
+from torch import nn, Tensor
+from torch.ao.quantization import DeQuantStub, QuantStub
+
+from ...ops.misc import Conv2dNormActivation, SqueezeExcitation
+from ...transforms._presets import ImageClassification
+from .._api import register_model, Weights, WeightsEnum
+from .._meta import _IMAGENET_CATEGORIES
+from .._utils import _ovewrite_named_param, handle_legacy_interface
+from ..mobilenetv3 import (
+    _mobilenet_v3_conf,
+    InvertedResidual,
+    InvertedResidualConfig,
+    MobileNet_V3_Large_Weights,
+    MobileNetV3,
+)
+from .utils import _fuse_modules, _replace_relu
+
+
+__all__ = [
+    "QuantizableMobileNetV3",
+    "MobileNet_V3_Large_QuantizedWeights",
+    "mobilenet_v3_large",
+]
+
+
+class QuantizableSqueezeExcitation(SqueezeExcitation):
+    """
+    Squeeze-and-excitation block adapted for eager-mode quantization.
+
+    The scale activation is always ``nn.Hardsigmoid`` and the final multiply is
+    performed by an ``nn.quantized.FloatFunctional`` module (``skip_mul``).
+    """
+
+    _version = 2  # state dicts whose metadata version is missing or < 2 are patched on load
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        # Hardsigmoid is forced here, replacing any caller-supplied scale activation.
+        kwargs["scale_activation"] = nn.Hardsigmoid
+        super().__init__(*args, **kwargs)
+        self.skip_mul = nn.quantized.FloatFunctional()
+
+    def forward(self, input: Tensor) -> Tensor:
+        scale = self._scale(input)
+        return self.skip_mul.mul(scale, input)
+
+    def fuse_model(self, is_qat: Optional[bool] = None) -> None:
+        """Fuse the ``fc1`` and ``activation`` submodules in place."""
+        _fuse_modules(self, ["fc1", "activation"], is_qat, inplace=True)
+
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        """
+        Insert default ``scale_activation`` observer entries before delegating to the parent loader.
+        Applies only when a ``qconfig`` attribute is present and the saved version is absent or below 2.
+        """
+        version = local_metadata.get("version")
+
+        if hasattr(self, "qconfig") and (version is None or version < 2):
+            # Defaults are added only for keys the incoming state dict does not already hold.
+            fallback_entries = {
+                "scale_activation.activation_post_process.scale": torch.tensor([1.0]),
+                "scale_activation.activation_post_process.activation_post_process.scale": torch.tensor([1.0]),
+                "scale_activation.activation_post_process.zero_point": torch.tensor([0], dtype=torch.int32),
+                "scale_activation.activation_post_process.activation_post_process.zero_point": torch.tensor(
+                    [0], dtype=torch.int32
+                ),
+                "scale_activation.activation_post_process.fake_quant_enabled": torch.tensor([1]),
+                "scale_activation.activation_post_process.observer_enabled": torch.tensor([1]),
+            }
+            for suffix, default in fallback_entries.items():
+                state_dict.setdefault(prefix + suffix, default)
+
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+        )
+
+
+class QuantizableInvertedResidual(InvertedResidual):
+    # TODO https://github.com/pytorch/vision/pull/4232#pullrequestreview-730461659
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, se_layer=QuantizableSqueezeExcitation, **kwargs)  # type: ignore[misc]
+        self.skip_add = nn.quantized.FloatFunctional()  # performs the residual addition in forward()
+
+    def forward(self, x: Tensor) -> Tensor:
+        out = self.block(x)
+        if self.use_res_connect:
+            out = self.skip_add.add(x, out)
+        return out
+
+
+class QuantizableMobileNetV3(MobileNetV3):
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        """
+        MobileNetV3 with a QuantStub applied to the input and a DeQuantStub applied to the output.
+
+        Args:
+           All arguments are forwarded unchanged to the floating point MobileNetV3
+        """
+        super().__init__(*args, **kwargs)
+        self.quant = QuantStub()
+        self.dequant = DeQuantStub()
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self.dequant(self._forward_impl(self.quant(x)))
+
+    def fuse_model(self, is_qat: Optional[bool] = None) -> None:
+        """
+        Fuse submodules "0" and "1" (plus "2" when it is an ``nn.ReLU``) of each block, and each SE block.
+        """
+        for module in self.modules():
+            kind = type(module)
+            if kind is QuantizableSqueezeExcitation:
+                module.fuse_model(is_qat)
+            elif kind is Conv2dNormActivation:
+                has_relu = len(module) == 3 and type(module[2]) is nn.ReLU
+                names = ["0", "1", "2"] if has_relu else ["0", "1"]
+                _fuse_modules(module, names, is_qat, inplace=True)
+
+
+def _mobilenet_v3_model(
+    inverted_residual_setting: list[InvertedResidualConfig],
+    last_channel: int,
+    weights: Optional[WeightsEnum],
+    progress: bool,
+    quantize: bool,
+    **kwargs: Any,
+) -> QuantizableMobileNetV3:
+    """Build a quantizable MobileNetV3; with ``quantize`` it is QAT-prepared, loaded, then converted."""
+    if weights is not None:
+        meta = weights.meta
+        _ovewrite_named_param(kwargs, "num_classes", len(meta["categories"]))
+        if "backend" in meta:
+            _ovewrite_named_param(kwargs, "backend", meta["backend"])
+    backend = kwargs.pop("backend", "qnnpack")
+
+    model = QuantizableMobileNetV3(inverted_residual_setting, last_channel, block=QuantizableInvertedResidual, **kwargs)
+    _replace_relu(model)
+
+    if quantize:
+        # Rather than quantizing the model first and loading quantized weights afterwards, the model is
+        # prepared for QAT, the QAT weights from training are loaded, and only then is it converted.
+        # This avoids the extremely low accuracies observed on this specific model and works around an
+        # unresolved bug on the eager quantization API detailed at: https://github.com/pytorch/vision/issues/5890
+        model.fuse_model(is_qat=True)
+        model.qconfig = torch.ao.quantization.get_default_qat_qconfig(backend)
+        torch.ao.quantization.prepare_qat(model, inplace=True)
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+    if quantize:
+        torch.ao.quantization.convert(model, inplace=True)
+        model.eval()
+
+    return model
+
+
+class MobileNet_V3_Large_QuantizedWeights(WeightsEnum):
+    IMAGENET1K_QNNPACK_V1 = Weights(
+        url="https://download.pytorch.org/models/quantized/mobilenet_v3_large_qnnpack-5bcacf28.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            "num_params": 5483032,
+            "min_size": (1, 1),
+            "categories": _IMAGENET_CATEGORIES,  # its length becomes num_classes in _mobilenet_v3_model
+            "backend": "qnnpack",  # copied into the builder kwargs by _mobilenet_v3_model
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#qat-mobilenetv3",
+            "unquantized": MobileNet_V3_Large_Weights.IMAGENET1K_V1,  # float weights referred to by _docs
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 73.004,
+                    "acc@5": 90.858,
+                }
+            },
+            "_ops": 0.217,
+            "_file_size": 21.554,
+            "_docs": """
+                These weights were produced by doing Quantization Aware Training (eager mode) on top of the unquantized
+                weights listed below.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_QNNPACK_V1  # the only entry, so it is also the default
+
+
+@register_model(name="quantized_mobilenet_v3_large")
+@handle_legacy_interface(
+    weights=(
+        "pretrained",
+        lambda kwargs: (
+            MobileNet_V3_Large_QuantizedWeights.IMAGENET1K_QNNPACK_V1
+            if kwargs.get("quantize", False)
+            else MobileNet_V3_Large_Weights.IMAGENET1K_V1
+        ),
+    )
+)
+def mobilenet_v3_large(
+    *,
+    weights: Optional[Union[MobileNet_V3_Large_QuantizedWeights, MobileNet_V3_Large_Weights]] = None,
+    progress: bool = True,
+    quantize: bool = False,
+    **kwargs: Any,
+) -> QuantizableMobileNetV3:
+    """
+    MobileNetV3 (Large) model from
+    `Searching for MobileNetV3 <https://arxiv.org/abs/1905.02244>`_.
+
+    .. note::
+        Note that ``quantize = True`` returns a quantized model with 8 bit
+        weights. Quantized models only support inference and run on CPUs.
+        GPU inference is not yet supported.
+
+    Args:
+        weights (:class:`~torchvision.models.quantization.MobileNet_V3_Large_QuantizedWeights` or :class:`~torchvision.models.MobileNet_V3_Large_Weights`, optional): The
+            pretrained weights for the model. See
+            :class:`~torchvision.models.quantization.MobileNet_V3_Large_QuantizedWeights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        quantize (bool): If True, return a quantized version of the model. Default is False.
+        **kwargs: parameters passed to the ``torchvision.models.quantization.QuantizableMobileNetV3``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/quantization/mobilenetv3.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.quantization.MobileNet_V3_Large_QuantizedWeights
+        :members:
+    .. autoclass:: torchvision.models.MobileNet_V3_Large_Weights
+        :members:
+        :noindex:
+    """
+    weights = (MobileNet_V3_Large_QuantizedWeights if quantize else MobileNet_V3_Large_Weights).verify(weights)
+
+    inverted_residual_setting, last_channel = _mobilenet_v3_conf("mobilenet_v3_large", **kwargs)
+    return _mobilenet_v3_model(inverted_residual_setting, last_channel, weights, progress, quantize, **kwargs)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/resnet.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc4ba003b6a3cba1e3f2efd9c3f070d439b1f7dd
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/resnet.py
@@ -0,0 +1,492 @@
+from functools import partial
+from typing import Any, Optional, Union
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+from torchvision.models.resnet import (
+    BasicBlock,
+    Bottleneck,
+    ResNet,
+    ResNet18_Weights,
+    ResNet50_Weights,
+    ResNeXt101_32X8D_Weights,
+    ResNeXt101_64X4D_Weights,
+)
+
+from ...transforms._presets import ImageClassification
+from .._api import register_model, Weights, WeightsEnum
+from .._meta import _IMAGENET_CATEGORIES
+from .._utils import _ovewrite_named_param, handle_legacy_interface
+from .utils import _fuse_modules, _replace_relu, quantize_model
+
+
+__all__ = [
+    "QuantizableResNet",
+    "ResNet18_QuantizedWeights",
+    "ResNet50_QuantizedWeights",
+    "ResNeXt101_32X8D_QuantizedWeights",
+    "ResNeXt101_64X4D_QuantizedWeights",
+    "resnet18",
+    "resnet50",
+    "resnext101_32x8d",
+    "resnext101_64x4d",
+]
+
+
+class QuantizableBasicBlock(BasicBlock):
+    """
+    ``BasicBlock`` variant for quantization: the residual addition and the final ReLU
+    are performed together by ``FloatFunctional.add_relu``.
+    """
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+        self.add_relu = torch.nn.quantized.FloatFunctional()
+
+    def forward(self, x: Tensor) -> Tensor:
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.bn2(self.conv2(out))
+
+        # The shortcut is the raw input unless a downsample module is configured.
+        shortcut = x
+        if self.downsample is not None:
+            shortcut = self.downsample(x)
+        return self.add_relu.add_relu(out, shortcut)
+
+    def fuse_model(self, is_qat: Optional[bool] = None) -> None:
+        """Fuse conv1+bn1+relu and conv2+bn2 in place, plus downsample modules "0" and "1" if any."""
+        main_path = [["conv1", "bn1", "relu"], ["conv2", "bn2"]]
+        _fuse_modules(self, main_path, is_qat, inplace=True)
+        if self.downsample:
+            _fuse_modules(self.downsample, ["0", "1"], is_qat, inplace=True)
+
+
+class QuantizableBottleneck(Bottleneck):
+    """
+    ``Bottleneck`` variant for quantization: the main path uses two dedicated ReLU modules
+    (``relu1``, ``relu2``) and the residual add + ReLU goes through ``FloatFunctional.add_relu``.
+    """
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+        self.skip_add_relu = nn.quantized.FloatFunctional()
+        # One ReLU module per conv/bn pair; fuse_model() fuses each with its own pair.
+        self.relu1 = nn.ReLU(inplace=False)
+        self.relu2 = nn.ReLU(inplace=False)
+
+    def forward(self, x: Tensor) -> Tensor:
+        out = self.relu1(self.bn1(self.conv1(x)))
+        out = self.relu2(self.bn2(self.conv2(out)))
+        out = self.bn3(self.conv3(out))
+
+        shortcut = x
+        if self.downsample is not None:
+            shortcut = self.downsample(x)
+        return self.skip_add_relu.add_relu(out, shortcut)
+
+    def fuse_model(self, is_qat: Optional[bool] = None) -> None:
+        """
+        Fuse the three conv/bn(/relu) groups in place, plus downsample modules "0" and "1" if any.
+        """
+        main_path = [["conv1", "bn1", "relu1"], ["conv2", "bn2", "relu2"], ["conv3", "bn3"]]
+        _fuse_modules(self, main_path, is_qat, inplace=True)
+        if self.downsample:
+            _fuse_modules(self.downsample, ["0", "1"], is_qat, inplace=True)
+
+
+class QuantizableResNet(ResNet):
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+        # Applied to the input and to the output of forward(), respectively.
+        self.quant = torch.ao.quantization.QuantStub()
+        self.dequant = torch.ao.quantization.DeQuantStub()
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = self.quant(x)
+        # Ensure scriptability: _forward_impl is called directly because
+        # super(QuantizableResNet,self).forward(x) is not scriptable
+        x = self._forward_impl(x)
+        return self.dequant(x)
+
+    def fuse_model(self, is_qat: Optional[bool] = None) -> None:
+        r"""Fuse the conv/bn/relu modules of the network in place.
+
+        The top-level conv1 + bn1 + relu are fused first, then every quantizable
+        basic/bottleneck block fuses its own layers. This prepares the model for
+        quantization. Note that this operation does not change numerics and the
+        model after modification is still in floating point.
+        """
+        _fuse_modules(self, ["conv1", "bn1", "relu"], is_qat, inplace=True)
+        block_types = (QuantizableBottleneck, QuantizableBasicBlock)
+        for module in self.modules():
+            if type(module) in block_types:
+                module.fuse_model(is_qat)
+
+
+def _resnet(
+    block: type[Union[QuantizableBasicBlock, QuantizableBottleneck]],
+    layers: list[int],
+    weights: Optional[WeightsEnum],
+    progress: bool,
+    quantize: bool,
+    **kwargs: Any,
+) -> QuantizableResNet:
+    """Build a quantizable ResNet, quantize it when requested, then load ``weights`` if given."""
+    if weights is not None:
+        meta = weights.meta
+        _ovewrite_named_param(kwargs, "num_classes", len(meta["categories"]))
+        if "backend" in meta:
+            _ovewrite_named_param(kwargs, "backend", meta["backend"])
+    backend = kwargs.pop("backend", "fbgemm")
+
+    model = QuantizableResNet(block, layers, **kwargs)
+    _replace_relu(model)
+    if quantize:
+        quantize_model(model, backend)
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+    return model
+
+
+_COMMON_META = {  # spread into the meta of every quantized ResNet/ResNeXt weight entry in this module
+    "min_size": (1, 1),
+    "categories": _IMAGENET_CATEGORIES,  # its length becomes num_classes in _resnet
+    "backend": "fbgemm",  # copied into the builder kwargs by _resnet
+    "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#post-training-quantized-models",
+    "_docs": """
+        These weights were produced by doing Post Training Quantization (eager mode) on top of the unquantized
+        weights listed below.
+    """,
+}
+
+
+class ResNet18_QuantizedWeights(WeightsEnum):
+    IMAGENET1K_FBGEMM_V1 = Weights(
+        url="https://download.pytorch.org/models/quantized/resnet18_fbgemm_16fa66dd.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,  # shared keys: min_size, categories, backend, recipe, _docs
+            "num_params": 11689512,
+            "unquantized": ResNet18_Weights.IMAGENET1K_V1,  # float weights referred to by _docs
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 69.494,
+                    "acc@5": 88.882,
+                }
+            },
+            "_ops": 1.814,
+            "_file_size": 11.238,
+        },
+    )
+    DEFAULT = IMAGENET1K_FBGEMM_V1  # the only entry, so it is also the default
+
+
+class ResNet50_QuantizedWeights(WeightsEnum):
+    IMAGENET1K_FBGEMM_V1 = Weights(
+        url="https://download.pytorch.org/models/quantized/resnet50_fbgemm_bf931d71.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,  # shared keys: min_size, categories, backend, recipe, _docs
+            "num_params": 25557032,
+            "unquantized": ResNet50_Weights.IMAGENET1K_V1,  # float weights referred to by _docs
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 75.920,
+                    "acc@5": 92.814,
+                }
+            },
+            "_ops": 4.089,
+            "_file_size": 24.759,
+        },
+    )
+    IMAGENET1K_FBGEMM_V2 = Weights(
+        url="https://download.pytorch.org/models/quantized/resnet50_fbgemm-23753f79.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),  # V1 passes no resize_size
+        meta={
+            **_COMMON_META,  # shared keys: min_size, categories, backend, recipe, _docs
+            "num_params": 25557032,
+            "unquantized": ResNet50_Weights.IMAGENET1K_V2,  # float weights referred to by _docs
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 80.282,
+                    "acc@5": 94.976,
+                }
+            },
+            "_ops": 4.089,
+            "_file_size": 24.953,
+        },
+    )
+    DEFAULT = IMAGENET1K_FBGEMM_V2  # V2 is the default; the legacy ``pretrained`` path in resnet50 maps to V1
+
+
+class ResNeXt101_32X8D_QuantizedWeights(WeightsEnum):
+    IMAGENET1K_FBGEMM_V1 = Weights(
+        url="https://download.pytorch.org/models/quantized/resnext101_32x8_fbgemm_09835ccf.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,  # shared keys: min_size, categories, backend, recipe, _docs
+            "num_params": 88791336,
+            "unquantized": ResNeXt101_32X8D_Weights.IMAGENET1K_V1,  # float weights referred to by _docs
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 78.986,
+                    "acc@5": 94.480,
+                }
+            },
+            "_ops": 16.414,
+            "_file_size": 86.034,
+        },
+    )
+    IMAGENET1K_FBGEMM_V2 = Weights(
+        url="https://download.pytorch.org/models/quantized/resnext101_32x8_fbgemm-ee16d00c.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),  # V1 passes no resize_size
+        meta={
+            **_COMMON_META,  # shared keys: min_size, categories, backend, recipe, _docs
+            "num_params": 88791336,
+            "unquantized": ResNeXt101_32X8D_Weights.IMAGENET1K_V2,  # float weights referred to by _docs
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 82.574,
+                    "acc@5": 96.132,
+                }
+            },
+            "_ops": 16.414,
+            "_file_size": 86.645,
+        },
+    )
+    DEFAULT = IMAGENET1K_FBGEMM_V2  # V2 is the default; the legacy ``pretrained`` path in resnext101_32x8d maps to V1
+
+
+class ResNeXt101_64X4D_QuantizedWeights(WeightsEnum):
+    IMAGENET1K_FBGEMM_V1 = Weights(
+        url="https://download.pytorch.org/models/quantized/resnext101_64x4d_fbgemm-605a1cb3.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,  # shared keys: min_size, categories, backend, recipe, _docs
+            "num_params": 83455272,
+            "recipe": "https://github.com/pytorch/vision/pull/5935",  # overrides the recipe from _COMMON_META
+            "unquantized": ResNeXt101_64X4D_Weights.IMAGENET1K_V1,  # float weights referred to by _docs
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 82.898,
+                    "acc@5": 96.326,
+                }
+            },
+            "_ops": 15.46,
+            "_file_size": 81.556,
+        },
+    )
+    DEFAULT = IMAGENET1K_FBGEMM_V1  # the only entry, so it is also the default
+
+
+@register_model(name="quantized_resnet18")
+@handle_legacy_interface(
+    weights=(
+        "pretrained",
+        lambda kwargs: (
+            ResNet18_QuantizedWeights.IMAGENET1K_FBGEMM_V1
+            if kwargs.get("quantize", False)
+            else ResNet18_Weights.IMAGENET1K_V1
+        ),
+    )
+)
+def resnet18(
+    *,
+    weights: Optional[Union[ResNet18_QuantizedWeights, ResNet18_Weights]] = None,
+    progress: bool = True,
+    quantize: bool = False,
+    **kwargs: Any,
+) -> QuantizableResNet:
+    """Quantizable ResNet-18, as introduced in
+    `Deep Residual Learning for Image Recognition <https://arxiv.org/abs/1512.03385>`_
+
+    .. note::
+        With ``quantize = True`` the returned model is quantized and carries
+        8 bit weights. Quantized models only support inference and run on CPUs.
+        GPU inference is not yet supported.
+
+    Args:
+        weights (:class:`~torchvision.models.quantization.ResNet18_QuantizedWeights` or :class:`~torchvision.models.ResNet18_Weights`, optional): The
+            pretrained weights to load. See
+            :class:`~torchvision.models.quantization.ResNet18_QuantizedWeights` below for
+            more details and the possible values. No pre-trained weights
+            are used by default.
+        progress (bool, optional): Whether to display a progress bar of the
+            download on stderr. Default is True.
+        quantize (bool, optional): Whether to return a quantized version of the model. Default is False.
+        **kwargs: extra parameters forwarded to the ``torchvision.models.quantization.QuantizableResNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/quantization/resnet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.quantization.ResNet18_QuantizedWeights
+        :members:
+
+    .. autoclass:: torchvision.models.ResNet18_Weights
+        :members:
+        :noindex:
+    """
+    weights_enum = ResNet18_QuantizedWeights if quantize else ResNet18_Weights
+    weights = weights_enum.verify(weights)
+    return _resnet(QuantizableBasicBlock, [2, 2, 2, 2], weights, progress, quantize, **kwargs)
+
+
+@register_model(name="quantized_resnet50")
+@handle_legacy_interface(
+    weights=(
+        "pretrained",
+        lambda kwargs: (
+            ResNet50_QuantizedWeights.IMAGENET1K_FBGEMM_V1
+            if kwargs.get("quantize", False)
+            else ResNet50_Weights.IMAGENET1K_V1
+        ),
+    )
+)
+def resnet50(
+    *,
+    weights: Optional[Union[ResNet50_QuantizedWeights, ResNet50_Weights]] = None,
+    progress: bool = True,
+    quantize: bool = False,
+    **kwargs: Any,
+) -> QuantizableResNet:
+    """Quantizable ResNet-50, as introduced in
+    `Deep Residual Learning for Image Recognition <https://arxiv.org/abs/1512.03385>`_
+
+    .. note::
+        With ``quantize = True`` the returned model is quantized and carries
+        8 bit weights. Quantized models only support inference and run on CPUs.
+        GPU inference is not yet supported.
+
+    Args:
+        weights (:class:`~torchvision.models.quantization.ResNet50_QuantizedWeights` or :class:`~torchvision.models.ResNet50_Weights`, optional): The
+            pretrained weights to load. See
+            :class:`~torchvision.models.quantization.ResNet50_QuantizedWeights` below for
+            more details and the possible values. No pre-trained weights
+            are used by default.
+        progress (bool, optional): Whether to display a progress bar of the
+            download on stderr. Default is True.
+        quantize (bool, optional): Whether to return a quantized version of the model. Default is False.
+        **kwargs: extra parameters forwarded to the ``torchvision.models.quantization.QuantizableResNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/quantization/resnet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.quantization.ResNet50_QuantizedWeights
+        :members:
+
+    .. autoclass:: torchvision.models.ResNet50_Weights
+        :members:
+        :noindex:
+    """
+    weights_enum = ResNet50_QuantizedWeights if quantize else ResNet50_Weights
+    weights = weights_enum.verify(weights)
+    return _resnet(QuantizableBottleneck, [3, 4, 6, 3], weights, progress, quantize, **kwargs)
+
+
+@register_model(name="quantized_resnext101_32x8d")
+@handle_legacy_interface(
+    weights=(
+        "pretrained",
+        lambda kwargs: (
+            ResNeXt101_32X8D_QuantizedWeights.IMAGENET1K_FBGEMM_V1
+            if kwargs.get("quantize", False)
+            else ResNeXt101_32X8D_Weights.IMAGENET1K_V1
+        ),
+    )
+)
+def resnext101_32x8d(
+    *,
+    weights: Optional[Union[ResNeXt101_32X8D_QuantizedWeights, ResNeXt101_32X8D_Weights]] = None,
+    progress: bool = True,
+    quantize: bool = False,
+    **kwargs: Any,
+) -> QuantizableResNet:
+    """ResNeXt-101 32x8d model from
+    `Aggregated Residual Transformation for Deep Neural Networks <https://arxiv.org/abs/1611.05431>`_
+
+    .. note::
+        Note that ``quantize = True`` returns a quantized model with 8 bit
+        weights. Quantized models only support inference and run on CPUs.
+        GPU inference is not yet supported.
+
+    Args:
+        weights (:class:`~torchvision.models.quantization.ResNeXt101_32X8D_QuantizedWeights` or :class:`~torchvision.models.ResNeXt101_32X8D_Weights`, optional): The
+            pretrained weights for the model. See
+            :class:`~torchvision.models.quantization.ResNeXt101_32X8D_QuantizedWeights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        quantize (bool, optional): If True, return a quantized version of the model. Default is False.
+        **kwargs: parameters passed to the ``torchvision.models.quantization.QuantizableResNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/quantization/resnet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.quantization.ResNeXt101_32X8D_QuantizedWeights
+        :members:
+
+    .. autoclass:: torchvision.models.ResNeXt101_32X8D_Weights
+        :members:
+        :noindex:
+    """
+    weights = (ResNeXt101_32X8D_QuantizedWeights if quantize else ResNeXt101_32X8D_Weights).verify(weights)
+
+    _ovewrite_named_param(kwargs, "groups", 32)
+    _ovewrite_named_param(kwargs, "width_per_group", 8)
+    return _resnet(QuantizableBottleneck, [3, 4, 23, 3], weights, progress, quantize, **kwargs)
+
+
+@register_model(name="quantized_resnext101_64x4d")
+@handle_legacy_interface(
+    weights=(
+        "pretrained",
+        lambda kwargs: (
+            ResNeXt101_64X4D_QuantizedWeights.IMAGENET1K_FBGEMM_V1
+            if kwargs.get("quantize", False)
+            else ResNeXt101_64X4D_Weights.IMAGENET1K_V1
+        ),
+    )
+)
+def resnext101_64x4d(
+    *,
+    weights: Optional[Union[ResNeXt101_64X4D_QuantizedWeights, ResNeXt101_64X4D_Weights]] = None,
+    progress: bool = True,
+    quantize: bool = False,
+    **kwargs: Any,
+) -> QuantizableResNet:
+    """ResNeXt-101 64x4d model from
+    `Aggregated Residual Transformation for Deep Neural Networks <https://arxiv.org/abs/1611.05431>`_
+
+    .. note::
+        Note that ``quantize = True`` returns a quantized model with 8 bit
+        weights. Quantized models only support inference and run on CPUs.
+        GPU inference is not yet supported.
+
+    Args:
+        weights (:class:`~torchvision.models.quantization.ResNeXt101_64X4D_QuantizedWeights` or :class:`~torchvision.models.ResNeXt101_64X4D_Weights`, optional): The
+            pretrained weights for the model. See
+            :class:`~torchvision.models.quantization.ResNeXt101_64X4D_QuantizedWeights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        quantize (bool, optional): If True, return a quantized version of the model. Default is False.
+        **kwargs: parameters passed to the ``torchvision.models.quantization.QuantizableResNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/quantization/resnet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.quantization.ResNeXt101_64X4D_QuantizedWeights
+        :members:
+
+    .. autoclass:: torchvision.models.ResNeXt101_64X4D_Weights
+        :members:
+        :noindex:
+    """
+    weights = (ResNeXt101_64X4D_QuantizedWeights if quantize else ResNeXt101_64X4D_Weights).verify(weights)
+
+    _ovewrite_named_param(kwargs, "groups", 64)
+    _ovewrite_named_param(kwargs, "width_per_group", 4)
+    return _resnet(QuantizableBottleneck, [3, 4, 23, 3], weights, progress, quantize, **kwargs)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/shufflenetv2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/shufflenetv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a2eb8eb1b5ecfcac78729c53f2501688c17a01
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/shufflenetv2.py
@@ -0,0 +1,435 @@
+from functools import partial
+from typing import Any, Optional, Union
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+from torchvision.models import shufflenetv2
+
+from ...transforms._presets import ImageClassification
+from .._api import register_model, Weights, WeightsEnum
+from .._meta import _IMAGENET_CATEGORIES
+from .._utils import _ovewrite_named_param, handle_legacy_interface
+from ..shufflenetv2 import (
+    ShuffleNet_V2_X0_5_Weights,
+    ShuffleNet_V2_X1_0_Weights,
+    ShuffleNet_V2_X1_5_Weights,
+    ShuffleNet_V2_X2_0_Weights,
+)
+from .utils import _fuse_modules, _replace_relu, quantize_model
+
+
+__all__ = [
+    "QuantizableShuffleNetV2",
+    "ShuffleNet_V2_X0_5_QuantizedWeights",
+    "ShuffleNet_V2_X1_0_QuantizedWeights",
+    "ShuffleNet_V2_X1_5_QuantizedWeights",
+    "ShuffleNet_V2_X2_0_QuantizedWeights",
+    "shufflenet_v2_x0_5",
+    "shufflenet_v2_x1_0",
+    "shufflenet_v2_x1_5",
+    "shufflenet_v2_x2_0",
+]
+
+
+class QuantizableInvertedResidual(shufflenetv2.InvertedResidual):
+    """ShuffleNetV2 inverted-residual unit whose branch concatenation goes through a ``FloatFunctional`` module."""
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+        # Concatenation is routed through a module instead of a bare ``torch.cat`` call;
+        # presumably so eager-mode quantization can observe/replace the op
+        # (NOTE(review): confirm against the ``FloatFunctional`` documentation).
+        self.cat = nn.quantized.FloatFunctional()
+
+    def forward(self, x: Tensor) -> Tensor:
+        if self.stride == 1:
+            # Split the input in two along dim 1: the first chunk is passed through
+            # untouched, the second chunk goes through ``branch2``.
+            x1, x2 = x.chunk(2, dim=1)
+            out = self.cat.cat([x1, self.branch2(x2)], dim=1)
+        else:
+            # Both branches consume the full input.
+            out = self.cat.cat([self.branch1(x), self.branch2(x)], dim=1)
+
+        # Shuffle the concatenated result using 2 groups.
+        out = shufflenetv2.channel_shuffle(out, 2)
+
+        return out
+
+
+class QuantizableShuffleNetV2(shufflenetv2.ShuffleNetV2):
+    """ShuffleNetV2 built from ``QuantizableInvertedResidual`` units and wrapped in quant/dequant stubs."""
+
+    # TODO https://github.com/pytorch/vision/pull/4232#pullrequestreview-730461659
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        # Force the quantizable residual unit; all other arguments go to the parent unchanged.
+        super().__init__(*args, inverted_residual=QuantizableInvertedResidual, **kwargs)  # type: ignore[misc]
+        self.quant = torch.ao.quantization.QuantStub()
+        self.dequant = torch.ao.quantization.DeQuantStub()
+
+    def forward(self, x: Tensor) -> Tensor:
+        # The parent's ``_forward_impl`` is bracketed by the quant and dequant stubs.
+        x = self.quant(x)
+        x = self._forward_impl(x)
+        x = self.dequant(x)
+        return x
+
+    def fuse_model(self, is_qat: Optional[bool] = None) -> None:
+        r"""Fuse conv/bn/relu modules in shufflenetv2 model
+
+        Fuse conv+bn+relu/ conv+relu/conv+bn modules to prepare for quantization.
+        Model is modified in place.
+
+        Args:
+            is_qat (bool, optional): Forwarded unchanged to ``_fuse_modules`` for every
+                fusion performed here. Default is None.
+
+        .. note::
+            Note that this operation does not change numerics
+            and the model after modification is in floating point
+        """
+        # Direct children named "conv1"/"conv5": fuse their sub-modules "0", "1", "2".
+        for name, m in self._modules.items():
+            if name in ["conv1", "conv5"] and m is not None:
+                _fuse_modules(m, [["0", "1", "2"]], is_qat, inplace=True)
+        for m in self.modules():
+            # Exact type match: subclasses of QuantizableInvertedResidual are not fused here.
+            if type(m) is QuantizableInvertedResidual:
+                # ``branch1`` is only fused when it actually contains sub-modules.
+                if len(m.branch1._modules.items()) > 0:
+                    _fuse_modules(m.branch1, [["0", "1"], ["2", "3", "4"]], is_qat, inplace=True)
+                _fuse_modules(
+                    m.branch2,
+                    [["0", "1", "2"], ["3", "4"], ["5", "6", "7"]],
+                    is_qat,
+                    inplace=True,
+                )
+
+
+def _shufflenetv2(
+    stages_repeats: list[int],
+    stages_out_channels: list[int],
+    *,
+    weights: Optional[WeightsEnum],
+    progress: bool,
+    quantize: bool,
+    **kwargs: Any,
+) -> QuantizableShuffleNetV2:
+    """Build a ``QuantizableShuffleNetV2``, optionally quantize it, and optionally load weights.
+
+    Args:
+        stages_repeats: Passed positionally to ``QuantizableShuffleNetV2``.
+        stages_out_channels: Passed positionally to ``QuantizableShuffleNetV2``.
+        weights: Weight entry to load, or None to leave the model as constructed.
+        progress: Forwarded to ``weights.get_state_dict``.
+        quantize: If True, ``quantize_model`` is applied before the weights are loaded.
+        **kwargs: Remaining keyword arguments for ``QuantizableShuffleNetV2``; a
+            ``backend`` entry is consumed here and not forwarded.
+
+    Returns:
+        The constructed model.
+    """
+    if weights is not None:
+        # The weight metadata dictates the classifier size and, when present, the backend.
+        _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+        if "backend" in weights.meta:
+            _ovewrite_named_param(kwargs, "backend", weights.meta["backend"])
+    # ``backend`` is removed from kwargs so it does not reach the model constructor.
+    backend = kwargs.pop("backend", "fbgemm")
+
+    model = QuantizableShuffleNetV2(stages_repeats, stages_out_channels, **kwargs)
+    _replace_relu(model)
+    if quantize:
+        quantize_model(model, backend)
+
+    # Loading happens after the optional quantization step, so the state dict is
+    # loaded into the model in whichever form it has at this point.
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+
+    return model
+
+
+# Metadata shared by the quantized ShuffleNetV2 weight entries in this module; each
+# entry spreads it into its own ``meta`` dict and may override individual keys
+# (e.g. ``recipe``).
+_COMMON_META = {
+    "min_size": (1, 1),
+    "categories": _IMAGENET_CATEGORIES,
+    "backend": "fbgemm",
+    "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#post-training-quantized-models",
+    "_docs": """
+        These weights were produced by doing Post Training Quantization (eager mode) on top of the unquantized
+        weights listed below.
+    """,
+}
+
+
+class ShuffleNet_V2_X0_5_QuantizedWeights(WeightsEnum):
+    # Quantized weight entries for the 0.5x ShuffleNetV2; ``meta["unquantized"]`` points
+    # at the float entry these are paired with.
+    IMAGENET1K_FBGEMM_V1 = Weights(
+        url="https://download.pytorch.org/models/quantized/shufflenetv2_x0.5_fbgemm-00845098.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 1366792,
+            "unquantized": ShuffleNet_V2_X0_5_Weights.IMAGENET1K_V1,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 57.972,
+                    "acc@5": 79.780,
+                }
+            },
+            "_ops": 0.04,
+            "_file_size": 1.501,
+        },
+    )
+    # Alias for the entry used when the caller asks for the default weights.
+    DEFAULT = IMAGENET1K_FBGEMM_V1
+
+
+class ShuffleNet_V2_X1_0_QuantizedWeights(WeightsEnum):
+    # Quantized weight entries for the 1.0x ShuffleNetV2; ``meta["unquantized"]`` points
+    # at the float entry these are paired with.
+    IMAGENET1K_FBGEMM_V1 = Weights(
+        url="https://download.pytorch.org/models/quantized/shufflenetv2_x1_fbgemm-1e62bb32.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 2278604,
+            "unquantized": ShuffleNet_V2_X1_0_Weights.IMAGENET1K_V1,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 68.360,
+                    "acc@5": 87.582,
+                }
+            },
+            "_ops": 0.145,
+            "_file_size": 2.334,
+        },
+    )
+    # Alias for the entry used when the caller asks for the default weights.
+    DEFAULT = IMAGENET1K_FBGEMM_V1
+
+
+class ShuffleNet_V2_X1_5_QuantizedWeights(WeightsEnum):
+    # Quantized weight entries for the 1.5x ShuffleNetV2; ``meta["unquantized"]`` points
+    # at the float entry these are paired with.
+    IMAGENET1K_FBGEMM_V1 = Weights(
+        url="https://download.pytorch.org/models/quantized/shufflenetv2_x1_5_fbgemm-d7401f05.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,
+            # Overrides the shared ``recipe`` value from ``_COMMON_META``.
+            "recipe": "https://github.com/pytorch/vision/pull/5906",
+            "num_params": 3503624,
+            "unquantized": ShuffleNet_V2_X1_5_Weights.IMAGENET1K_V1,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 72.052,
+                    "acc@5": 90.700,
+                }
+            },
+            "_ops": 0.296,
+            "_file_size": 3.672,
+        },
+    )
+    # Alias for the entry used when the caller asks for the default weights.
+    DEFAULT = IMAGENET1K_FBGEMM_V1
+
+
+class ShuffleNet_V2_X2_0_QuantizedWeights(WeightsEnum):
+    # Quantized weight entries for the 2.0x ShuffleNetV2; ``meta["unquantized"]`` points
+    # at the float entry these are paired with.
+    IMAGENET1K_FBGEMM_V1 = Weights(
+        url="https://download.pytorch.org/models/quantized/shufflenetv2_x2_0_fbgemm-5cac526c.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,
+            # Overrides the shared ``recipe`` value from ``_COMMON_META``.
+            "recipe": "https://github.com/pytorch/vision/pull/5906",
+            "num_params": 7393996,
+            "unquantized": ShuffleNet_V2_X2_0_Weights.IMAGENET1K_V1,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 75.354,
+                    "acc@5": 92.488,
+                }
+            },
+            "_ops": 0.583,
+            "_file_size": 7.467,
+        },
+    )
+    # Alias for the entry used when the caller asks for the default weights.
+    DEFAULT = IMAGENET1K_FBGEMM_V1
+
+
+@register_model(name="quantized_shufflenet_v2_x0_5")
+@handle_legacy_interface(
+    weights=(
+        "pretrained",
+        # Weights chosen for the legacy ``pretrained`` parameter: the quantized entry
+        # when ``quantize`` is truthy in the call's kwargs, the float entry otherwise.
+        lambda kwargs: (
+            ShuffleNet_V2_X0_5_QuantizedWeights.IMAGENET1K_FBGEMM_V1
+            if kwargs.get("quantize", False)
+            else ShuffleNet_V2_X0_5_Weights.IMAGENET1K_V1
+        ),
+    )
+)
+def shufflenet_v2_x0_5(
+    *,
+    weights: Optional[Union[ShuffleNet_V2_X0_5_QuantizedWeights, ShuffleNet_V2_X0_5_Weights]] = None,
+    progress: bool = True,
+    quantize: bool = False,
+    **kwargs: Any,
+) -> QuantizableShuffleNetV2:
+    """
+    Constructs a ShuffleNetV2 with 0.5x output channels, as described in
+    `ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design
+    <https://arxiv.org/abs/1807.11164>`__.
+
+    .. note::
+        Note that ``quantize = True`` returns a quantized model with 8 bit
+        weights. Quantized models only support inference and run on CPUs.
+        GPU inference is not yet supported.
+
+    Args:
+        weights (:class:`~torchvision.models.quantization.ShuffleNet_V2_X0_5_QuantizedWeights` or :class:`~torchvision.models.ShuffleNet_V2_X0_5_Weights`, optional): The
+            pretrained weights for the model. See
+            :class:`~torchvision.models.quantization.ShuffleNet_V2_X0_5_QuantizedWeights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the download to stderr.
+            Default is True.
+        quantize (bool, optional): If True, return a quantized version of the model.
+            Default is False.
+        **kwargs: parameters passed to the ``torchvision.models.quantization.QuantizableShuffleNetV2``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/quantization/shufflenetv2.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.quantization.ShuffleNet_V2_X0_5_QuantizedWeights
+        :members:
+
+    .. autoclass:: torchvision.models.ShuffleNet_V2_X0_5_Weights
+        :members:
+        :noindex:
+    """
+    # ``weights`` is checked against the enum that matches the requested mode.
+    weights = (ShuffleNet_V2_X0_5_QuantizedWeights if quantize else ShuffleNet_V2_X0_5_Weights).verify(weights)
+    # Positional lists are ``stages_repeats`` and ``stages_out_channels``.
+    return _shufflenetv2(
+        [4, 8, 4], [24, 48, 96, 192, 1024], weights=weights, progress=progress, quantize=quantize, **kwargs
+    )
+
+
+@register_model(name="quantized_shufflenet_v2_x1_0")
+@handle_legacy_interface(
+    weights=(
+        "pretrained",
+        # Weights chosen for the legacy ``pretrained`` parameter: the quantized entry
+        # when ``quantize`` is truthy in the call's kwargs, the float entry otherwise.
+        lambda kwargs: (
+            ShuffleNet_V2_X1_0_QuantizedWeights.IMAGENET1K_FBGEMM_V1
+            if kwargs.get("quantize", False)
+            else ShuffleNet_V2_X1_0_Weights.IMAGENET1K_V1
+        ),
+    )
+)
+def shufflenet_v2_x1_0(
+    *,
+    weights: Optional[Union[ShuffleNet_V2_X1_0_QuantizedWeights, ShuffleNet_V2_X1_0_Weights]] = None,
+    progress: bool = True,
+    quantize: bool = False,
+    **kwargs: Any,
+) -> QuantizableShuffleNetV2:
+    """
+    Constructs a ShuffleNetV2 with 1.0x output channels, as described in
+    `ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design
+    <https://arxiv.org/abs/1807.11164>`__.
+
+    .. note::
+        Note that ``quantize = True`` returns a quantized model with 8 bit
+        weights. Quantized models only support inference and run on CPUs.
+        GPU inference is not yet supported.
+
+    Args:
+        weights (:class:`~torchvision.models.quantization.ShuffleNet_V2_X1_0_QuantizedWeights` or :class:`~torchvision.models.ShuffleNet_V2_X1_0_Weights`, optional): The
+            pretrained weights for the model. See
+            :class:`~torchvision.models.quantization.ShuffleNet_V2_X1_0_QuantizedWeights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the download to stderr.
+            Default is True.
+        quantize (bool, optional): If True, return a quantized version of the model.
+            Default is False.
+        **kwargs: parameters passed to the ``torchvision.models.quantization.QuantizableShuffleNetV2``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/quantization/shufflenetv2.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.quantization.ShuffleNet_V2_X1_0_QuantizedWeights
+        :members:
+
+    .. autoclass:: torchvision.models.ShuffleNet_V2_X1_0_Weights
+        :members:
+        :noindex:
+    """
+    # ``weights`` is checked against the enum that matches the requested mode.
+    weights = (ShuffleNet_V2_X1_0_QuantizedWeights if quantize else ShuffleNet_V2_X1_0_Weights).verify(weights)
+    # Positional lists are ``stages_repeats`` and ``stages_out_channels``.
+    return _shufflenetv2(
+        [4, 8, 4], [24, 116, 232, 464, 1024], weights=weights, progress=progress, quantize=quantize, **kwargs
+    )
+
+
+@register_model(name="quantized_shufflenet_v2_x1_5")
+@handle_legacy_interface(
+    weights=(
+        "pretrained",
+        # Weights chosen for the legacy ``pretrained`` parameter: the quantized entry
+        # when ``quantize`` is truthy in the call's kwargs, the float entry otherwise.
+        lambda kwargs: (
+            ShuffleNet_V2_X1_5_QuantizedWeights.IMAGENET1K_FBGEMM_V1
+            if kwargs.get("quantize", False)
+            else ShuffleNet_V2_X1_5_Weights.IMAGENET1K_V1
+        ),
+    )
+)
+def shufflenet_v2_x1_5(
+    *,
+    weights: Optional[Union[ShuffleNet_V2_X1_5_QuantizedWeights, ShuffleNet_V2_X1_5_Weights]] = None,
+    progress: bool = True,
+    quantize: bool = False,
+    **kwargs: Any,
+) -> QuantizableShuffleNetV2:
+    """
+    Constructs a ShuffleNetV2 with 1.5x output channels, as described in
+    `ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design
+    <https://arxiv.org/abs/1807.11164>`__.
+
+    .. note::
+        Note that ``quantize = True`` returns a quantized model with 8 bit
+        weights. Quantized models only support inference and run on CPUs.
+        GPU inference is not yet supported.
+
+    Args:
+        weights (:class:`~torchvision.models.quantization.ShuffleNet_V2_X1_5_QuantizedWeights` or :class:`~torchvision.models.ShuffleNet_V2_X1_5_Weights`, optional): The
+            pretrained weights for the model. See
+            :class:`~torchvision.models.quantization.ShuffleNet_V2_X1_5_QuantizedWeights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the download to stderr.
+            Default is True.
+        quantize (bool, optional): If True, return a quantized version of the model.
+            Default is False.
+        **kwargs: parameters passed to the ``torchvision.models.quantization.QuantizableShuffleNetV2``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/quantization/shufflenetv2.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.quantization.ShuffleNet_V2_X1_5_QuantizedWeights
+        :members:
+
+    .. autoclass:: torchvision.models.ShuffleNet_V2_X1_5_Weights
+        :members:
+        :noindex:
+    """
+    # ``weights`` is checked against the enum that matches the requested mode.
+    weights = (ShuffleNet_V2_X1_5_QuantizedWeights if quantize else ShuffleNet_V2_X1_5_Weights).verify(weights)
+    # Positional lists are ``stages_repeats`` and ``stages_out_channels``.
+    return _shufflenetv2(
+        [4, 8, 4], [24, 176, 352, 704, 1024], weights=weights, progress=progress, quantize=quantize, **kwargs
+    )
+
+
+@register_model(name="quantized_shufflenet_v2_x2_0")
+@handle_legacy_interface(
+    weights=(
+        "pretrained",
+        # Weights chosen for the legacy ``pretrained`` parameter: the quantized entry
+        # when ``quantize`` is truthy in the call's kwargs, the float entry otherwise.
+        lambda kwargs: (
+            ShuffleNet_V2_X2_0_QuantizedWeights.IMAGENET1K_FBGEMM_V1
+            if kwargs.get("quantize", False)
+            else ShuffleNet_V2_X2_0_Weights.IMAGENET1K_V1
+        ),
+    )
+)
+def shufflenet_v2_x2_0(
+    *,
+    weights: Optional[Union[ShuffleNet_V2_X2_0_QuantizedWeights, ShuffleNet_V2_X2_0_Weights]] = None,
+    progress: bool = True,
+    quantize: bool = False,
+    **kwargs: Any,
+) -> QuantizableShuffleNetV2:
+    """
+    Constructs a ShuffleNetV2 with 2.0x output channels, as described in
+    `ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design
+    <https://arxiv.org/abs/1807.11164>`__.
+
+    .. note::
+        Note that ``quantize = True`` returns a quantized model with 8 bit
+        weights. Quantized models only support inference and run on CPUs.
+        GPU inference is not yet supported.
+
+    Args:
+        weights (:class:`~torchvision.models.quantization.ShuffleNet_V2_X2_0_QuantizedWeights` or :class:`~torchvision.models.ShuffleNet_V2_X2_0_Weights`, optional): The
+            pretrained weights for the model. See
+            :class:`~torchvision.models.quantization.ShuffleNet_V2_X2_0_QuantizedWeights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the download to stderr.
+            Default is True.
+        quantize (bool, optional): If True, return a quantized version of the model.
+            Default is False.
+        **kwargs: parameters passed to the ``torchvision.models.quantization.QuantizableShuffleNetV2``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/quantization/shufflenetv2.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.quantization.ShuffleNet_V2_X2_0_QuantizedWeights
+        :members:
+
+    .. autoclass:: torchvision.models.ShuffleNet_V2_X2_0_Weights
+        :members:
+        :noindex:
+    """
+    # ``weights`` is checked against the enum that matches the requested mode.
+    weights = (ShuffleNet_V2_X2_0_QuantizedWeights if quantize else ShuffleNet_V2_X2_0_Weights).verify(weights)
+    # Positional lists are ``stages_repeats`` and ``stages_out_channels``.
+    return _shufflenetv2(
+        [4, 8, 4], [24, 244, 488, 976, 2048], weights=weights, progress=progress, quantize=quantize, **kwargs
+    )
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/utils.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d50bb9b480afef6467a5ae18ed92b167861f99
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/quantization/utils.py
@@ -0,0 +1,51 @@
+from typing import Any, Optional, Union
+
+import torch
+from torch import nn
+
+
+def _replace_relu(module: nn.Module) -> None:
+    """Recursively swap ``nn.ReLU``/``nn.ReLU6`` children of ``module`` for ``nn.ReLU(inplace=False)``.
+
+    The module tree is modified in place; nothing is returned.
+    """
+    # Replacements are collected first and applied after the loop over the children.
+    reassign = {}
+    for name, mod in module.named_children():
+        _replace_relu(mod)
+        # Checking for explicit type instead of instance
+        # as we only want to replace modules of the exact type
+        # not inherited classes
+        # NOTE(review): ``nn.ReLU6`` is replaced by a plain ``nn.ReLU`` as well,
+        # not by another ``nn.ReLU6``.
+        if type(mod) is nn.ReLU or type(mod) is nn.ReLU6:
+            reassign[name] = nn.ReLU(inplace=False)
+
+    for key, value in reassign.items():
+        module._modules[key] = value
+
+
+def quantize_model(model: nn.Module, backend: str) -> None:
+    """Quantize ``model`` in place with the eager-mode prepare/convert flow.
+
+    Args:
+        model: Module to quantize. It must provide a ``fuse_model()`` method, which is
+            called here without arguments.
+        backend: Name of the quantized engine. It is also assigned to the process-wide
+            ``torch.backends.quantized.engine`` setting.
+
+    Raises:
+        RuntimeError: If ``backend`` is not in ``torch.backends.quantized.supported_engines``.
+    """
+    # Random input of shape (1, 3, 299, 299) that is fed through the prepared model below.
+    _dummy_input_data = torch.rand(1, 3, 299, 299)
+    if backend not in torch.backends.quantized.supported_engines:
+        raise RuntimeError("Quantized backend not supported ")
+    torch.backends.quantized.engine = backend
+    model.eval()
+    # Make sure that weight qconfig matches that of the serialized models
+    # NOTE(review): no qconfig is assigned here for a supported backend other than
+    # "fbgemm" or "qnnpack".
+    if backend == "fbgemm":
+        model.qconfig = torch.ao.quantization.QConfig(  # type: ignore[assignment]
+            activation=torch.ao.quantization.default_observer,
+            weight=torch.ao.quantization.default_per_channel_weight_observer,
+        )
+    elif backend == "qnnpack":
+        model.qconfig = torch.ao.quantization.QConfig(  # type: ignore[assignment]
+            activation=torch.ao.quantization.default_observer, weight=torch.ao.quantization.default_weight_observer
+        )
+
+    # TODO https://github.com/pytorch/vision/pull/4232#pullrequestreview-730461659
+    model.fuse_model()  # type: ignore[operator]
+    torch.ao.quantization.prepare(model, inplace=True)
+    # One forward pass between prepare and convert; presumably this lets the inserted
+    # observers record statistics before conversion (confirm against the PyTorch docs).
+    model(_dummy_input_data)
+    torch.ao.quantization.convert(model, inplace=True)
+
+
+def _fuse_modules(
+    model: nn.Module, modules_to_fuse: Union[list[str], list[list[str]]], is_qat: Optional[bool], **kwargs: Any
+):
+    """Dispatch to ``fuse_modules_qat`` or ``fuse_modules`` from ``torch.ao.quantization``.
+
+    Args:
+        model: Module containing the sub-modules to fuse.
+        modules_to_fuse: Sub-module names (or lists of names) passed to the fusion function.
+        is_qat: Selects the QAT variant when truthy. ``None`` means "use ``model.training``".
+        **kwargs: Forwarded to the selected fusion function.
+
+    Returns:
+        Whatever the selected fusion function returns.
+    """
+    if is_qat is None:
+        is_qat = model.training
+    method = torch.ao.quantization.fuse_modules_qat if is_qat else torch.ao.quantization.fuse_modules
+    return method(model, modules_to_fuse, **kwargs)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/regnet.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/regnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..915ef22bf33f1b00d5544b7d04c31a24006ac9df
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/regnet.py
@@ -0,0 +1,1571 @@
+import math
+from collections import OrderedDict
+from functools import partial
+from typing import Any, Callable, Optional
+
+import torch
+from torch import nn, Tensor
+
+from ..ops.misc import Conv2dNormActivation, SqueezeExcitation
+from ..transforms._presets import ImageClassification, InterpolationMode
+from ..utils import _log_api_usage_once
+from ._api import register_model, Weights, WeightsEnum
+from ._meta import _IMAGENET_CATEGORIES
+from ._utils import _make_divisible, _ovewrite_named_param, handle_legacy_interface
+
+
+__all__ = [
+    "RegNet",
+    "RegNet_Y_400MF_Weights",
+    "RegNet_Y_800MF_Weights",
+    "RegNet_Y_1_6GF_Weights",
+    "RegNet_Y_3_2GF_Weights",
+    "RegNet_Y_8GF_Weights",
+    "RegNet_Y_16GF_Weights",
+    "RegNet_Y_32GF_Weights",
+    "RegNet_Y_128GF_Weights",
+    "RegNet_X_400MF_Weights",
+    "RegNet_X_800MF_Weights",
+    "RegNet_X_1_6GF_Weights",
+    "RegNet_X_3_2GF_Weights",
+    "RegNet_X_8GF_Weights",
+    "RegNet_X_16GF_Weights",
+    "RegNet_X_32GF_Weights",
+    "regnet_y_400mf",
+    "regnet_y_800mf",
+    "regnet_y_1_6gf",
+    "regnet_y_3_2gf",
+    "regnet_y_8gf",
+    "regnet_y_16gf",
+    "regnet_y_32gf",
+    "regnet_y_128gf",
+    "regnet_x_400mf",
+    "regnet_x_800mf",
+    "regnet_x_1_6gf",
+    "regnet_x_3_2gf",
+    "regnet_x_8gf",
+    "regnet_x_16gf",
+    "regnet_x_32gf",
+]
+
+
+class SimpleStemIN(Conv2dNormActivation):
+    """Simple stem for ImageNet: 3x3, BN, ReLU.
+
+    A ``Conv2dNormActivation`` with ``kernel_size=3`` and ``stride=2``; the normalization
+    and activation layers are the ones supplied by the caller.
+    """
+
+    def __init__(
+        self,
+        width_in: int,
+        width_out: int,
+        norm_layer: Callable[..., nn.Module],
+        activation_layer: Callable[..., nn.Module],
+    ) -> None:
+        super().__init__(
+            width_in, width_out, kernel_size=3, stride=2, norm_layer=norm_layer, activation_layer=activation_layer
+        )
+
+
+class BottleneckTransform(nn.Sequential):
+    """Bottleneck transformation: 1x1, 3x3 [+SE], 1x1.
+
+    The sub-modules are registered under the names ``"a"``, ``"b"``, optionally ``"se"``,
+    and ``"c"``, in that order.
+    """
+
+    def __init__(
+        self,
+        width_in: int,
+        width_out: int,
+        stride: int,
+        norm_layer: Callable[..., nn.Module],
+        activation_layer: Callable[..., nn.Module],
+        group_width: int,
+        bottleneck_multiplier: float,
+        se_ratio: Optional[float],
+    ) -> None:
+        layers: OrderedDict[str, nn.Module] = OrderedDict()
+        # Bottleneck width, and the number of groups used by the 3x3 convolution.
+        w_b = int(round(width_out * bottleneck_multiplier))
+        g = w_b // group_width
+
+        layers["a"] = Conv2dNormActivation(
+            width_in, w_b, kernel_size=1, stride=1, norm_layer=norm_layer, activation_layer=activation_layer
+        )
+        # The block's stride is applied in the grouped 3x3 convolution.
+        layers["b"] = Conv2dNormActivation(
+            w_b, w_b, kernel_size=3, stride=stride, groups=g, norm_layer=norm_layer, activation_layer=activation_layer
+        )
+
+        # A falsy ``se_ratio`` (None or 0) skips the squeeze-excitation layer.
+        if se_ratio:
+            # The SE reduction ratio is defined with respect to the
+            # beginning of the block
+            width_se_out = int(round(se_ratio * width_in))
+            layers["se"] = SqueezeExcitation(
+                input_channels=w_b,
+                squeeze_channels=width_se_out,
+                activation=activation_layer,
+            )
+
+        # The final 1x1 projection has no activation.
+        layers["c"] = Conv2dNormActivation(
+            w_b, width_out, kernel_size=1, stride=1, norm_layer=norm_layer, activation_layer=None
+        )
+        super().__init__(layers)
+
+
+class ResBottleneckBlock(nn.Module):
+    """Residual bottleneck block: x + F(x), F = bottleneck transform.
+
+    When the width or the stride changes, the skip path is a 1x1
+    ``Conv2dNormActivation`` projection (without activation) instead of the identity.
+    """
+
+    def __init__(
+        self,
+        width_in: int,
+        width_out: int,
+        stride: int,
+        norm_layer: Callable[..., nn.Module],
+        activation_layer: Callable[..., nn.Module],
+        group_width: int = 1,
+        bottleneck_multiplier: float = 1.0,
+        se_ratio: Optional[float] = None,
+    ) -> None:
+        super().__init__()
+
+        # Use skip connection with projection if shape changes
+        self.proj = None
+        should_proj = (width_in != width_out) or (stride != 1)
+        if should_proj:
+            self.proj = Conv2dNormActivation(
+                width_in, width_out, kernel_size=1, stride=stride, norm_layer=norm_layer, activation_layer=None
+            )
+        self.f = BottleneckTransform(
+            width_in,
+            width_out,
+            stride,
+            norm_layer,
+            activation_layer,
+            group_width,
+            bottleneck_multiplier,
+            se_ratio,
+        )
+        # ``activation_layer`` is instantiated with ``inplace=True``, so it must accept that keyword.
+        self.activation = activation_layer(inplace=True)
+
+    def forward(self, x: Tensor) -> Tensor:
+        # Sum the skip path (projected or identity) with the transform, then activate.
+        if self.proj is not None:
+            x = self.proj(x) + self.f(x)
+        else:
+            x = x + self.f(x)
+        return self.activation(x)
+
+
+class AnyStage(nn.Sequential):
+    """AnyNet stage (sequence of blocks w/ the same output shape).
+
+    ``depth`` blocks are created with ``block_constructor`` and registered under the
+    names ``block{stage_index}-{i}``.
+    """
+
+    def __init__(
+        self,
+        width_in: int,
+        width_out: int,
+        stride: int,
+        depth: int,
+        block_constructor: Callable[..., nn.Module],
+        norm_layer: Callable[..., nn.Module],
+        activation_layer: Callable[..., nn.Module],
+        group_width: int,
+        bottleneck_multiplier: float,
+        se_ratio: Optional[float] = None,
+        stage_index: int = 0,
+    ) -> None:
+        super().__init__()
+
+        for i in range(depth):
+            # Only the first block of the stage sees ``width_in`` and applies ``stride``;
+            # the remaining blocks map ``width_out`` to ``width_out`` with stride 1.
+            block = block_constructor(
+                width_in if i == 0 else width_out,
+                width_out,
+                stride if i == 0 else 1,
+                norm_layer,
+                activation_layer,
+                group_width,
+                bottleneck_multiplier,
+                se_ratio,
+            )
+
+            self.add_module(f"block{stage_index}-{i}", block)
+
+
+class BlockParams:
+    """Per-stage RegNet settings: depths, widths, group widths, bottleneck multipliers and strides.
+
+    ``se_ratio`` is a single value shared by all stages.
+    """
+
+    def __init__(
+        self,
+        depths: list[int],
+        widths: list[int],
+        group_widths: list[int],
+        bottleneck_multipliers: list[float],
+        strides: list[int],
+        se_ratio: Optional[float] = None,
+    ) -> None:
+        self.depths = depths
+        self.widths = widths
+        self.group_widths = group_widths
+        self.bottleneck_multipliers = bottleneck_multipliers
+        self.strides = strides
+        self.se_ratio = se_ratio
+
+    @classmethod
+    def from_init_params(
+        cls,
+        depth: int,
+        w_0: int,
+        w_a: float,
+        w_m: float,
+        group_width: int,
+        bottleneck_multiplier: float = 1.0,
+        se_ratio: Optional[float] = None,
+        **kwargs: Any,
+    ) -> "BlockParams":
+        """
+        Programmatically compute all the per-block settings,
+        given the RegNet parameters.
+
+        The first step is to compute the quantized linear block parameters,
+        in log space. Key parameters are:
+        - `w_a` is the width progression slope
+        - `w_0` is the initial width
+        - `w_m` is the width stepping in the log space
+
+        In other terms
+        `log(block_width) = log(w_0) + log(w_m) * block_capacity`,
+        with `block_capacity` ramping up following the w_0 and w_a params.
+        This block width is finally quantized to multiples of 8.
+
+        The second step is to compute the parameters per stage,
+        taking into account the skip connection and the final 1x1 convolutions.
+        We use the fact that the output width is constant within a stage.
+
+        Extra keyword arguments (``**kwargs``) are accepted but not used.
+
+        Raises:
+            ValueError: If ``w_a < 0``, ``w_0 <= 0``, ``w_m <= 1`` or ``w_0`` is not
+                a multiple of 8.
+        """
+
+        QUANT = 8
+        STRIDE = 2
+
+        if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0:
+            raise ValueError("Invalid RegNet settings")
+        # Compute the block widths. Each stage has one unique block width
+        widths_cont = torch.arange(depth) * w_a + w_0
+        block_capacity = torch.round(torch.log(widths_cont / w_0) / math.log(w_m))
+        block_widths = (torch.round(torch.divide(w_0 * torch.pow(w_m, block_capacity), QUANT)) * QUANT).int().tolist()
+        num_stages = len(set(block_widths))
+
+        # Convert to per stage parameters
+        # Each block width is paired with its predecessor (padded with 0 at both ends).
+        # The same two sequences are passed twice, so the ``r != rp`` test below is
+        # equivalent to ``w != wp``.
+        split_helper = zip(
+            block_widths + [0],
+            [0] + block_widths,
+            block_widths + [0],
+            [0] + block_widths,
+        )
+        # True at every index where the width differs from the previous one (stage boundary).
+        splits = [w != wp or r != rp for w, wp, r, rp in split_helper]
+
+        stage_widths = [w for w, t in zip(block_widths, splits[:-1]) if t]
+        # Stage depth = distance between consecutive boundary indices.
+        stage_depths = torch.diff(torch.tensor([d for d, t in enumerate(splits) if t])).int().tolist()
+
+        # Every stage uses the same stride, bottleneck multiplier and requested group width.
+        strides = [STRIDE] * num_stages
+        bottleneck_multipliers = [bottleneck_multiplier] * num_stages
+        group_widths = [group_width] * num_stages
+
+        # Adjust the compatibility of stage widths and group widths
+        stage_widths, group_widths = cls._adjust_widths_groups_compatibilty(
+            stage_widths, bottleneck_multipliers, group_widths
+        )
+
+        return cls(
+            depths=stage_depths,
+            widths=stage_widths,
+            group_widths=group_widths,
+            bottleneck_multipliers=bottleneck_multipliers,
+            strides=strides,
+            se_ratio=se_ratio,
+        )
+
+    def _get_expanded_params(self):
+        """Return a ``zip`` yielding ``(width, stride, depth, group_width, bottleneck_multiplier)`` per stage."""
+        return zip(self.widths, self.strides, self.depths, self.group_widths, self.bottleneck_multipliers)
+
+    @staticmethod
+    def _adjust_widths_groups_compatibilty(
+        stage_widths: list[int], bottleneck_ratios: list[float], group_widths: list[int]
+    ) -> tuple[list[int], list[int]]:
+        """
+        Adjusts the compatibility of widths and groups,
+        depending on the bottleneck ratio.
+
+        Returns the adjusted stage widths and the group widths capped at each
+        stage's bottleneck width.
+        """
+        # Compute all widths for the current settings
+        widths = [int(w * b) for w, b in zip(stage_widths, bottleneck_ratios)]
+        # A group can never be wider than the bottleneck it splits.
+        group_widths_min = [min(g, w_bot) for g, w_bot in zip(group_widths, widths)]
+
+        # Compute the adjusted widths so that stage and group widths fit
+        # (``_make_divisible`` presumably rounds ``w_bot`` to a multiple of ``g`` -- confirm in ``_utils``).
+        ws_bot = [_make_divisible(w_bot, g) for w_bot, g in zip(widths, group_widths_min)]
+        # Map the adjusted bottleneck widths back to stage widths.
+        stage_widths = [int(w_bot / b) for w_bot, b in zip(ws_bot, bottleneck_ratios)]
+        return stage_widths, group_widths_min
+
+
+class RegNet(nn.Module):
+    """RegNet classifier: stem, a trunk of ``AnyStage`` stages, global average pooling and a linear head.
+
+    Args:
+        block_params: Per-stage settings; one ``AnyStage`` is built for each stage it describes.
+        num_classes: Output size of the final linear layer.
+        stem_width: Output width of the stem.
+        stem_type: Stem constructor. Defaults to ``SimpleStemIN``.
+        block_type: Block constructor used inside each stage. Defaults to ``ResBottleneckBlock``.
+        norm_layer: Normalization layer constructor. Defaults to ``nn.BatchNorm2d``.
+        activation: Activation layer constructor. Defaults to ``nn.ReLU``.
+    """
+
+    def __init__(
+        self,
+        block_params: BlockParams,
+        num_classes: int = 1000,
+        stem_width: int = 32,
+        stem_type: Optional[Callable[..., nn.Module]] = None,
+        block_type: Optional[Callable[..., nn.Module]] = None,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+        activation: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        super().__init__()
+        _log_api_usage_once(self)
+
+        if stem_type is None:
+            stem_type = SimpleStemIN
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        if block_type is None:
+            block_type = ResBottleneckBlock
+        if activation is None:
+            activation = nn.ReLU
+
+        # Ad hoc stem
+        self.stem = stem_type(
+            3,  # width_in
+            stem_width,
+            norm_layer,
+            activation,
+        )
+
+        # Input width of the next stage; starts at the stem's output width.
+        current_width = stem_width
+
+        blocks = []
+        for i, (
+            width_out,
+            stride,
+            depth,
+            group_width,
+            bottleneck_multiplier,
+        ) in enumerate(block_params._get_expanded_params()):
+            # Stages are named "block1", "block2", ... (1-based).
+            blocks.append(
+                (
+                    f"block{i+1}",
+                    AnyStage(
+                        current_width,
+                        width_out,
+                        stride,
+                        depth,
+                        block_type,
+                        norm_layer,
+                        activation,
+                        group_width,
+                        bottleneck_multiplier,
+                        block_params.se_ratio,
+                        stage_index=i + 1,
+                    ),
+                )
+            )
+
+            current_width = width_out
+
+        self.trunk_output = nn.Sequential(OrderedDict(blocks))
+
+        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+        # The head consumes the width of the last stage.
+        self.fc = nn.Linear(in_features=current_width, out_features=num_classes)
+
+        # Performs ResNet-style weight initialization
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                # Note that there is no bias due to BN
+                fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                nn.init.normal_(m.weight, mean=0.0, std=math.sqrt(2.0 / fan_out))
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.Linear):
+                nn.init.normal_(m.weight, mean=0.0, std=0.01)
+                nn.init.zeros_(m.bias)
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = self.stem(x)
+        x = self.trunk_output(x)
+
+        # Pool to 1x1, flatten everything after the first dimension, then classify.
+        x = self.avgpool(x)
+        x = x.flatten(start_dim=1)
+        x = self.fc(x)
+
+        return x
+
+
+def _regnet(
+    block_params: BlockParams,
+    weights: Optional[WeightsEnum],
+    progress: bool,
+    **kwargs: Any,
+) -> RegNet:
+    if weights is not None:
+        _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+
+    norm_layer = kwargs.pop("norm_layer", partial(nn.BatchNorm2d, eps=1e-05, momentum=0.1))
+    model = RegNet(block_params, norm_layer=norm_layer, **kwargs)
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+
+    return model
+
+
+_COMMON_META: dict[str, Any] = {
+    "min_size": (1, 1),
+    "categories": _IMAGENET_CATEGORIES,
+}
+
+_COMMON_SWAG_META = {
+    **_COMMON_META,
+    "recipe": "https://github.com/facebookresearch/SWAG",
+    "license": "https://github.com/facebookresearch/SWAG/blob/main/LICENSE",
+}
+
+
+class RegNet_Y_400MF_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/regnet_y_400mf-c65dace8.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 4344144,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#small-models",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 74.046,
+                    "acc@5": 91.716,
+                }
+            },
+            "_ops": 0.402,
+            "_file_size": 16.806,
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/regnet_y_400mf-e6988f5f.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,
+            "num_params": 4344144,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 75.804,
+                    "acc@5": 92.742,
+                }
+            },
+            "_ops": 0.402,
+            "_file_size": 16.806,
+            "_docs": """
+                These weights improve upon the results of the original paper by using a modified version of TorchVision's
+                `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V2
+
+
+class RegNet_Y_800MF_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/regnet_y_800mf-1b27b58c.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 6432512,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#small-models",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 76.420,
+                    "acc@5": 93.136,
+                }
+            },
+            "_ops": 0.834,
+            "_file_size": 24.774,
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/regnet_y_800mf-58fc7688.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,
+            "num_params": 6432512,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 78.828,
+                    "acc@5": 94.502,
+                }
+            },
+            "_ops": 0.834,
+            "_file_size": 24.774,
+            "_docs": """
+                These weights improve upon the results of the original paper by using a modified version of TorchVision's
+                `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V2
+
+
+class RegNet_Y_1_6GF_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/regnet_y_1_6gf-b11a554e.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 11202430,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#small-models",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 77.950,
+                    "acc@5": 93.966,
+                }
+            },
+            "_ops": 1.612,
+            "_file_size": 43.152,
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/regnet_y_1_6gf-0d7bc02a.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,
+            "num_params": 11202430,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 80.876,
+                    "acc@5": 95.444,
+                }
+            },
+            "_ops": 1.612,
+            "_file_size": 43.152,
+            "_docs": """
+                These weights improve upon the results of the original paper by using a modified version of TorchVision's
+                `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V2
+
+
+class RegNet_Y_3_2GF_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/regnet_y_3_2gf-b5a9779c.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 19436338,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#medium-models",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 78.948,
+                    "acc@5": 94.576,
+                }
+            },
+            "_ops": 3.176,
+            "_file_size": 74.567,
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/regnet_y_3_2gf-9180c971.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,
+            "num_params": 19436338,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 81.982,
+                    "acc@5": 95.972,
+                }
+            },
+            "_ops": 3.176,
+            "_file_size": 74.567,
+            "_docs": """
+                These weights improve upon the results of the original paper by using a modified version of TorchVision's
+                `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V2
+
+
+class RegNet_Y_8GF_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/regnet_y_8gf-d0d0e4a8.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 39381472,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#medium-models",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 80.032,
+                    "acc@5": 95.048,
+                }
+            },
+            "_ops": 8.473,
+            "_file_size": 150.701,
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/regnet_y_8gf-dc2b1b54.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,
+            "num_params": 39381472,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 82.828,
+                    "acc@5": 96.330,
+                }
+            },
+            "_ops": 8.473,
+            "_file_size": 150.701,
+            "_docs": """
+                These weights improve upon the results of the original paper by using a modified version of TorchVision's
+                `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V2
+
+
+class RegNet_Y_16GF_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/regnet_y_16gf-9e6ed7dd.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 83590140,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#large-models",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 80.424,
+                    "acc@5": 95.240,
+                }
+            },
+            "_ops": 15.912,
+            "_file_size": 319.49,
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/regnet_y_16gf-3e4a00f9.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,
+            "num_params": 83590140,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 82.886,
+                    "acc@5": 96.328,
+                }
+            },
+            "_ops": 15.912,
+            "_file_size": 319.49,
+            "_docs": """
+                These weights improve upon the results of the original paper by using a modified version of TorchVision's
+                `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    IMAGENET1K_SWAG_E2E_V1 = Weights(
+        url="https://download.pytorch.org/models/regnet_y_16gf_swag-43afe44d.pth",
+        transforms=partial(
+            ImageClassification, crop_size=384, resize_size=384, interpolation=InterpolationMode.BICUBIC
+        ),
+        meta={
+            **_COMMON_SWAG_META,
+            "num_params": 83590140,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 86.012,
+                    "acc@5": 98.054,
+                }
+            },
+            "_ops": 46.735,
+            "_file_size": 319.49,
+            "_docs": """
+                These weights are learnt via transfer learning by end-to-end fine-tuning the original
+                `SWAG <https://arxiv.org/abs/2201.08371>`_ weights on ImageNet-1K data.
+            """,
+        },
+    )
+    IMAGENET1K_SWAG_LINEAR_V1 = Weights(
+        url="https://download.pytorch.org/models/regnet_y_16gf_lc_swag-f3ec0043.pth",
+        transforms=partial(
+            ImageClassification, crop_size=224, resize_size=224, interpolation=InterpolationMode.BICUBIC
+        ),
+        meta={
+            **_COMMON_SWAG_META,
+            "recipe": "https://github.com/pytorch/vision/pull/5793",
+            "num_params": 83590140,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 83.976,
+                    "acc@5": 97.244,
+                }
+            },
+            "_ops": 15.912,
+            "_file_size": 319.49,
+            "_docs": """
+                These weights are composed of the original frozen `SWAG <https://arxiv.org/abs/2201.08371>`_ trunk
+                weights and a linear classifier learnt on top of them trained on ImageNet-1K data.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V2
+
+
+class RegNet_Y_32GF_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/regnet_y_32gf-4dee3f7a.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 145046770,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#large-models",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 80.878,
+                    "acc@5": 95.340,
+                }
+            },
+            "_ops": 32.28,
+            "_file_size": 554.076,
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/regnet_y_32gf-8db6d4b5.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,
+            "num_params": 145046770,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 83.368,
+                    "acc@5": 96.498,
+                }
+            },
+            "_ops": 32.28,
+            "_file_size": 554.076,
+            "_docs": """
+                These weights improve upon the results of the original paper by using a modified version of TorchVision's
+                `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    IMAGENET1K_SWAG_E2E_V1 = Weights(
+        url="https://download.pytorch.org/models/regnet_y_32gf_swag-04fdfa75.pth",
+        transforms=partial(
+            ImageClassification, crop_size=384, resize_size=384, interpolation=InterpolationMode.BICUBIC
+        ),
+        meta={
+            **_COMMON_SWAG_META,
+            "num_params": 145046770,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 86.838,
+                    "acc@5": 98.362,
+                }
+            },
+            "_ops": 94.826,
+            "_file_size": 554.076,
+            "_docs": """
+                These weights are learnt via transfer learning by end-to-end fine-tuning the original
+                `SWAG <https://arxiv.org/abs/2201.08371>`_ weights on ImageNet-1K data.
+            """,
+        },
+    )
+    IMAGENET1K_SWAG_LINEAR_V1 = Weights(
+        url="https://download.pytorch.org/models/regnet_y_32gf_lc_swag-e1583746.pth",
+        transforms=partial(
+            ImageClassification, crop_size=224, resize_size=224, interpolation=InterpolationMode.BICUBIC
+        ),
+        meta={
+            **_COMMON_SWAG_META,
+            "recipe": "https://github.com/pytorch/vision/pull/5793",
+            "num_params": 145046770,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 84.622,
+                    "acc@5": 97.480,
+                }
+            },
+            "_ops": 32.28,
+            "_file_size": 554.076,
+            "_docs": """
+                These weights are composed of the original frozen `SWAG <https://arxiv.org/abs/2201.08371>`_ trunk
+                weights and a linear classifier learnt on top of them trained on ImageNet-1K data.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V2
+
+
+class RegNet_Y_128GF_Weights(WeightsEnum):
+    IMAGENET1K_SWAG_E2E_V1 = Weights(
+        url="https://download.pytorch.org/models/regnet_y_128gf_swag-c8ce3e52.pth",
+        transforms=partial(
+            ImageClassification, crop_size=384, resize_size=384, interpolation=InterpolationMode.BICUBIC
+        ),
+        meta={
+            **_COMMON_SWAG_META,
+            "num_params": 644812894,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 88.228,
+                    "acc@5": 98.682,
+                }
+            },
+            "_ops": 374.57,
+            "_file_size": 2461.564,
+            "_docs": """
+                These weights are learnt via transfer learning by end-to-end fine-tuning the original
+                `SWAG <https://arxiv.org/abs/2201.08371>`_ weights on ImageNet-1K data.
+            """,
+        },
+    )
+    IMAGENET1K_SWAG_LINEAR_V1 = Weights(
+        url="https://download.pytorch.org/models/regnet_y_128gf_lc_swag-cbe8ce12.pth",
+        transforms=partial(
+            ImageClassification, crop_size=224, resize_size=224, interpolation=InterpolationMode.BICUBIC
+        ),
+        meta={
+            **_COMMON_SWAG_META,
+            "recipe": "https://github.com/pytorch/vision/pull/5793",
+            "num_params": 644812894,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 86.068,
+                    "acc@5": 97.844,
+                }
+            },
+            "_ops": 127.518,
+            "_file_size": 2461.564,
+            "_docs": """
+                These weights are composed of the original frozen `SWAG <https://arxiv.org/abs/2201.08371>`_ trunk
+                weights and a linear classifier learnt on top of them trained on ImageNet-1K data.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_SWAG_E2E_V1
+
+
+class RegNet_X_400MF_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/regnet_x_400mf-adf1edd5.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 5495976,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#small-models",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 72.834,
+                    "acc@5": 90.950,
+                }
+            },
+            "_ops": 0.414,
+            "_file_size": 21.258,
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/regnet_x_400mf-62229a5f.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,
+            "num_params": 5495976,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe-with-fixres",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 74.864,
+                    "acc@5": 92.322,
+                }
+            },
+            "_ops": 0.414,
+            "_file_size": 21.257,
+            "_docs": """
+                These weights improve upon the results of the original paper by using a modified version of TorchVision's
+                `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V2
+
+
+class RegNet_X_800MF_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/regnet_x_800mf-ad17e45c.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 7259656,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#small-models",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 75.212,
+                    "acc@5": 92.348,
+                }
+            },
+            "_ops": 0.8,
+            "_file_size": 27.945,
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/regnet_x_800mf-94a99ebd.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,
+            "num_params": 7259656,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe-with-fixres",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 77.522,
+                    "acc@5": 93.826,
+                }
+            },
+            "_ops": 0.8,
+            "_file_size": 27.945,
+            "_docs": """
+                These weights improve upon the results of the original paper by using a modified version of TorchVision's
+                `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V2
+
+
+class RegNet_X_1_6GF_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/regnet_x_1_6gf-e3633e7f.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 9190136,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#small-models",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 77.040,
+                    "acc@5": 93.440,
+                }
+            },
+            "_ops": 1.603,
+            "_file_size": 35.339,
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/regnet_x_1_6gf-a12f2b72.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,
+            "num_params": 9190136,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe-with-fixres",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 79.668,
+                    "acc@5": 94.922,
+                }
+            },
+            "_ops": 1.603,
+            "_file_size": 35.339,
+            "_docs": """
+                These weights improve upon the results of the original paper by using a modified version of TorchVision's
+                `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V2
+
+
+class RegNet_X_3_2GF_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/regnet_x_3_2gf-f342aeae.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 15296552,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#medium-models",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 78.364,
+                    "acc@5": 93.992,
+                }
+            },
+            "_ops": 3.177,
+            "_file_size": 58.756,
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/regnet_x_3_2gf-7071aa85.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,
+            "num_params": 15296552,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 81.196,
+                    "acc@5": 95.430,
+                }
+            },
+            "_ops": 3.177,
+            "_file_size": 58.756,
+            "_docs": """
+                These weights improve upon the results of the original paper by using a modified version of TorchVision's
+                `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V2
+
+
+class RegNet_X_8GF_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/regnet_x_8gf-03ceed89.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 39572648,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#medium-models",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 79.344,
+                    "acc@5": 94.686,
+                }
+            },
+            "_ops": 7.995,
+            "_file_size": 151.456,
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/regnet_x_8gf-2b70d774.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,
+            "num_params": 39572648,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 81.682,
+                    "acc@5": 95.678,
+                }
+            },
+            "_ops": 7.995,
+            "_file_size": 151.456,
+            "_docs": """
+                These weights improve upon the results of the original paper by using a modified version of TorchVision's
+                `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V2
+
+
+class RegNet_X_16GF_Weights(WeightsEnum):
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/regnet_x_16gf-2007eb11.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 54278536,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#medium-models",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 80.058,
+                    "acc@5": 94.944,
+                }
+            },
+            "_ops": 15.941,
+            "_file_size": 207.627,
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/regnet_x_16gf-ba3796d7.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,
+            "num_params": 54278536,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 82.716,
+                    "acc@5": 96.196,
+                }
+            },
+            "_ops": 15.941,
+            "_file_size": 207.627,
+            "_docs": """
+                These weights improve upon the results of the original paper by using a modified version of TorchVision's
+                `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V2
+
+
+class RegNet_X_32GF_Weights(WeightsEnum):  # pretrained-weight entries accepted by the regnet_x_32gf builder
+    IMAGENET1K_V1 = Weights(  # first checkpoint; its "recipe" link points at the reference "large-models" section
+        url="https://download.pytorch.org/models/regnet_x_32gf-9d47f8d0.pth",
+        transforms=partial(ImageClassification, crop_size=224),  # preset with crop_size=224, other options untouched
+        meta={
+            **_COMMON_META,  # shared metadata defined elsewhere in this module
+            "num_params": 107811560,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#large-models",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 80.622,
+                    "acc@5": 95.248,
+                }
+            },
+            "_ops": 31.736,  # presumably GFLOPs - TODO confirm unit
+            "_file_size": 412.039,  # presumably MB - TODO confirm unit
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    IMAGENET1K_V2 = Weights(  # second checkpoint; same num_params/_ops/_file_size, higher reported accuracy
+        url="https://download.pytorch.org/models/regnet_x_32gf-6eb8fdc6.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),  # V2 additionally sets resize_size=232
+        meta={
+            **_COMMON_META,  # shared metadata defined elsewhere in this module
+            "num_params": 107811560,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 83.014,
+                    "acc@5": 96.288,
+                }
+            },
+            "_ops": 31.736,  # presumably GFLOPs - TODO confirm unit
+            "_file_size": 412.039,  # presumably MB - TODO confirm unit
+            "_docs": """
+                These weights improve upon the results of the original paper by using a modified version of TorchVision's
+                `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V2  # DEFAULT refers to the V2 entry
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", RegNet_Y_400MF_Weights.IMAGENET1K_V1))
+def regnet_y_400mf(*, weights: Optional[RegNet_Y_400MF_Weights] = None, progress: bool = True, **kwargs: Any) -> RegNet:
+    """Build the RegNetY_400MF model described in
+    `Designing Network Design Spaces <https://arxiv.org/abs/2003.13678>`_.
+
+    The ``weights`` argument is validated with ``RegNet_Y_400MF_Weights.verify`` before the model is built.
+
+    Args:
+        weights (:class:`~torchvision.models.RegNet_Y_400MF_Weights`, optional): Pretrained weights to
+            load. Possible values are listed under :class:`~torchvision.models.RegNet_Y_400MF_Weights`
+            below. Defaults to ``None``, i.e. no pretrained weights are used.
+        progress (bool, optional): Whether to show a download progress bar on stderr. Default is True.
+        **kwargs: Extra keyword arguments, passed to both ``BlockParams.from_init_params`` and ``_regnet``;
+            see the `source code <https://github.com/pytorch/vision/blob/main/torchvision/models/regnet.py>`_
+            of ``torchvision.models.regnet.RegNet`` and ``torchvision.models.regnet.BlockParams``.
+
+    .. autoclass:: torchvision.models.RegNet_Y_400MF_Weights
+        :members:
+    """
+    weights = RegNet_Y_400MF_Weights.verify(weights)
+    block_config = {"depth": 16, "w_0": 48, "w_a": 27.89, "w_m": 2.09, "group_width": 8, "se_ratio": 0.25}
+    block_params = BlockParams.from_init_params(**block_config, **kwargs)
+    return _regnet(block_params, weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", RegNet_Y_800MF_Weights.IMAGENET1K_V1))
+def regnet_y_800mf(*, weights: Optional[RegNet_Y_800MF_Weights] = None, progress: bool = True, **kwargs: Any) -> RegNet:
+    """Build the RegNetY_800MF model described in
+    `Designing Network Design Spaces <https://arxiv.org/abs/2003.13678>`_.
+
+    The ``weights`` argument is validated with ``RegNet_Y_800MF_Weights.verify`` before the model is built.
+
+    Args:
+        weights (:class:`~torchvision.models.RegNet_Y_800MF_Weights`, optional): Pretrained weights to
+            load. Possible values are listed under :class:`~torchvision.models.RegNet_Y_800MF_Weights`
+            below. Defaults to ``None``, i.e. no pretrained weights are used.
+        progress (bool, optional): Whether to show a download progress bar on stderr. Default is True.
+        **kwargs: Extra keyword arguments, passed to both ``BlockParams.from_init_params`` and ``_regnet``;
+            see the `source code <https://github.com/pytorch/vision/blob/main/torchvision/models/regnet.py>`_
+            of ``torchvision.models.regnet.RegNet`` and ``torchvision.models.regnet.BlockParams``.
+
+    .. autoclass:: torchvision.models.RegNet_Y_800MF_Weights
+        :members:
+    """
+    weights = RegNet_Y_800MF_Weights.verify(weights)
+    block_config = {"depth": 14, "w_0": 56, "w_a": 38.84, "w_m": 2.4, "group_width": 16, "se_ratio": 0.25}
+    block_params = BlockParams.from_init_params(**block_config, **kwargs)
+    return _regnet(block_params, weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", RegNet_Y_1_6GF_Weights.IMAGENET1K_V1))
+def regnet_y_1_6gf(*, weights: Optional[RegNet_Y_1_6GF_Weights] = None, progress: bool = True, **kwargs: Any) -> RegNet:
+    """Build the RegNetY_1.6GF model described in
+    `Designing Network Design Spaces <https://arxiv.org/abs/2003.13678>`_.
+
+    The ``weights`` argument is validated with ``RegNet_Y_1_6GF_Weights.verify`` before the model is built.
+
+    Args:
+        weights (:class:`~torchvision.models.RegNet_Y_1_6GF_Weights`, optional): Pretrained weights to
+            load. Possible values are listed under :class:`~torchvision.models.RegNet_Y_1_6GF_Weights`
+            below. Defaults to ``None``, i.e. no pretrained weights are used.
+        progress (bool, optional): Whether to show a download progress bar on stderr. Default is True.
+        **kwargs: Extra keyword arguments, passed to both ``BlockParams.from_init_params`` and ``_regnet``;
+            see the `source code <https://github.com/pytorch/vision/blob/main/torchvision/models/regnet.py>`_
+            of ``torchvision.models.regnet.RegNet`` and ``torchvision.models.regnet.BlockParams``.
+
+    .. autoclass:: torchvision.models.RegNet_Y_1_6GF_Weights
+        :members:
+    """
+    weights = RegNet_Y_1_6GF_Weights.verify(weights)
+
+    # Fixed parameters of this variant; caller-supplied kwargs are passed alongside them.
+    block_config = {"depth": 27, "w_0": 48, "w_a": 20.71, "w_m": 2.65, "group_width": 24, "se_ratio": 0.25}
+    block_params = BlockParams.from_init_params(**block_config, **kwargs)
+    return _regnet(block_params, weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", RegNet_Y_3_2GF_Weights.IMAGENET1K_V1))
+def regnet_y_3_2gf(*, weights: Optional[RegNet_Y_3_2GF_Weights] = None, progress: bool = True, **kwargs: Any) -> RegNet:
+    """Build the RegNetY_3.2GF model described in
+    `Designing Network Design Spaces <https://arxiv.org/abs/2003.13678>`_.
+
+    The ``weights`` argument is validated with ``RegNet_Y_3_2GF_Weights.verify`` before the model is built.
+
+    Args:
+        weights (:class:`~torchvision.models.RegNet_Y_3_2GF_Weights`, optional): Pretrained weights to
+            load. Possible values are listed under :class:`~torchvision.models.RegNet_Y_3_2GF_Weights`
+            below. Defaults to ``None``, i.e. no pretrained weights are used.
+        progress (bool, optional): Whether to show a download progress bar on stderr. Default is True.
+        **kwargs: Extra keyword arguments, passed to both ``BlockParams.from_init_params`` and ``_regnet``;
+            see the `source code <https://github.com/pytorch/vision/blob/main/torchvision/models/regnet.py>`_
+            of ``torchvision.models.regnet.RegNet`` and ``torchvision.models.regnet.BlockParams``.
+
+    .. autoclass:: torchvision.models.RegNet_Y_3_2GF_Weights
+        :members:
+    """
+    weights = RegNet_Y_3_2GF_Weights.verify(weights)
+
+    # Fixed parameters of this variant; caller-supplied kwargs are passed alongside them.
+    block_config = {"depth": 21, "w_0": 80, "w_a": 42.63, "w_m": 2.66, "group_width": 24, "se_ratio": 0.25}
+    block_params = BlockParams.from_init_params(**block_config, **kwargs)
+    return _regnet(block_params, weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", RegNet_Y_8GF_Weights.IMAGENET1K_V1))
+def regnet_y_8gf(*, weights: Optional[RegNet_Y_8GF_Weights] = None, progress: bool = True, **kwargs: Any) -> RegNet:
+    """Build the RegNetY_8GF model described in
+    `Designing Network Design Spaces <https://arxiv.org/abs/2003.13678>`_.
+
+    The ``weights`` argument is validated with ``RegNet_Y_8GF_Weights.verify`` before the model is built.
+
+    Args:
+        weights (:class:`~torchvision.models.RegNet_Y_8GF_Weights`, optional): Pretrained weights to
+            load. Possible values are listed under :class:`~torchvision.models.RegNet_Y_8GF_Weights`
+            below. Defaults to ``None``, i.e. no pretrained weights are used.
+        progress (bool, optional): Whether to show a download progress bar on stderr. Default is True.
+        **kwargs: Extra keyword arguments, passed to both ``BlockParams.from_init_params`` and ``_regnet``;
+            see the `source code <https://github.com/pytorch/vision/blob/main/torchvision/models/regnet.py>`_
+            of ``torchvision.models.regnet.RegNet`` and ``torchvision.models.regnet.BlockParams``.
+
+    .. autoclass:: torchvision.models.RegNet_Y_8GF_Weights
+        :members:
+    """
+    weights = RegNet_Y_8GF_Weights.verify(weights)
+
+    # Fixed parameters of this variant; caller-supplied kwargs are passed alongside them.
+    block_config = {"depth": 17, "w_0": 192, "w_a": 76.82, "w_m": 2.19, "group_width": 56, "se_ratio": 0.25}
+    block_params = BlockParams.from_init_params(**block_config, **kwargs)
+    return _regnet(block_params, weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", RegNet_Y_16GF_Weights.IMAGENET1K_V1))
+def regnet_y_16gf(*, weights: Optional[RegNet_Y_16GF_Weights] = None, progress: bool = True, **kwargs: Any) -> RegNet:
+    """Build the RegNetY_16GF model described in
+    `Designing Network Design Spaces <https://arxiv.org/abs/2003.13678>`_.
+
+    The ``weights`` argument is validated with ``RegNet_Y_16GF_Weights.verify`` before the model is built.
+
+    Args:
+        weights (:class:`~torchvision.models.RegNet_Y_16GF_Weights`, optional): Pretrained weights to
+            load. Possible values are listed under :class:`~torchvision.models.RegNet_Y_16GF_Weights`
+            below. Defaults to ``None``, i.e. no pretrained weights are used.
+        progress (bool, optional): Whether to show a download progress bar on stderr. Default is True.
+        **kwargs: Extra keyword arguments, passed to both ``BlockParams.from_init_params`` and ``_regnet``;
+            see the `source code <https://github.com/pytorch/vision/blob/main/torchvision/models/regnet.py>`_
+            of ``torchvision.models.regnet.RegNet`` and ``torchvision.models.regnet.BlockParams``.
+
+    .. autoclass:: torchvision.models.RegNet_Y_16GF_Weights
+        :members:
+    """
+    weights = RegNet_Y_16GF_Weights.verify(weights)
+
+    # Fixed parameters of this variant; caller-supplied kwargs are passed alongside them.
+    block_config = {"depth": 18, "w_0": 200, "w_a": 106.23, "w_m": 2.48, "group_width": 112, "se_ratio": 0.25}
+    block_params = BlockParams.from_init_params(**block_config, **kwargs)
+    return _regnet(block_params, weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", RegNet_Y_32GF_Weights.IMAGENET1K_V1))
+def regnet_y_32gf(*, weights: Optional[RegNet_Y_32GF_Weights] = None, progress: bool = True, **kwargs: Any) -> RegNet:
+    """Build the RegNetY_32GF model described in
+    `Designing Network Design Spaces <https://arxiv.org/abs/2003.13678>`_.
+
+    The ``weights`` argument is validated with ``RegNet_Y_32GF_Weights.verify`` before the model is built.
+
+    Args:
+        weights (:class:`~torchvision.models.RegNet_Y_32GF_Weights`, optional): Pretrained weights to
+            load. Possible values are listed under :class:`~torchvision.models.RegNet_Y_32GF_Weights`
+            below. Defaults to ``None``, i.e. no pretrained weights are used.
+        progress (bool, optional): Whether to show a download progress bar on stderr. Default is True.
+        **kwargs: Extra keyword arguments, passed to both ``BlockParams.from_init_params`` and ``_regnet``;
+            see the `source code <https://github.com/pytorch/vision/blob/main/torchvision/models/regnet.py>`_
+            of ``torchvision.models.regnet.RegNet`` and ``torchvision.models.regnet.BlockParams``.
+
+    .. autoclass:: torchvision.models.RegNet_Y_32GF_Weights
+        :members:
+    """
+    weights = RegNet_Y_32GF_Weights.verify(weights)
+
+    # Fixed parameters of this variant; caller-supplied kwargs are passed alongside them.
+    block_config = {"depth": 20, "w_0": 232, "w_a": 115.89, "w_m": 2.53, "group_width": 232, "se_ratio": 0.25}
+    block_params = BlockParams.from_init_params(**block_config, **kwargs)
+    return _regnet(block_params, weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", None))
+def regnet_y_128gf(*, weights: Optional[RegNet_Y_128GF_Weights] = None, progress: bool = True, **kwargs: Any) -> RegNet:
+    """Build the RegNetY_128GF model described in
+    `Designing Network Design Spaces <https://arxiv.org/abs/2003.13678>`_.
+
+    The ``weights`` argument is validated with ``RegNet_Y_128GF_Weights.verify`` before the model is built.
+
+    Args:
+        weights (:class:`~torchvision.models.RegNet_Y_128GF_Weights`, optional): Pretrained weights to
+            load. Possible values are listed under :class:`~torchvision.models.RegNet_Y_128GF_Weights`
+            below. Defaults to ``None``, i.e. no pretrained weights are used.
+        progress (bool, optional): Whether to show a download progress bar on stderr. Default is True.
+        **kwargs: Extra keyword arguments, passed to both ``BlockParams.from_init_params`` and ``_regnet``;
+            see the `source code <https://github.com/pytorch/vision/blob/main/torchvision/models/regnet.py>`_
+            of ``torchvision.models.regnet.RegNet`` and ``torchvision.models.regnet.BlockParams``.
+
+    .. autoclass:: torchvision.models.RegNet_Y_128GF_Weights
+        :members:
+    """
+    weights = RegNet_Y_128GF_Weights.verify(weights)
+
+    # Fixed parameters of this variant; caller-supplied kwargs are passed alongside them.
+    block_config = {"depth": 27, "w_0": 456, "w_a": 160.83, "w_m": 2.52, "group_width": 264, "se_ratio": 0.25}
+    block_params = BlockParams.from_init_params(**block_config, **kwargs)
+    return _regnet(block_params, weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", RegNet_X_400MF_Weights.IMAGENET1K_V1))
+def regnet_x_400mf(*, weights: Optional[RegNet_X_400MF_Weights] = None, progress: bool = True, **kwargs: Any) -> RegNet:
+    """Build the RegNetX_400MF model described in
+    `Designing Network Design Spaces <https://arxiv.org/abs/2003.13678>`_.
+
+    The ``weights`` argument is validated with ``RegNet_X_400MF_Weights.verify`` before the model is built.
+
+    Args:
+        weights (:class:`~torchvision.models.RegNet_X_400MF_Weights`, optional): Pretrained weights to
+            load. Possible values are listed under :class:`~torchvision.models.RegNet_X_400MF_Weights`
+            below. Defaults to ``None``, i.e. no pretrained weights are used.
+        progress (bool, optional): Whether to show a download progress bar on stderr. Default is True.
+        **kwargs: Extra keyword arguments, passed to both ``BlockParams.from_init_params`` and ``_regnet``;
+            see the `source code <https://github.com/pytorch/vision/blob/main/torchvision/models/regnet.py>`_
+            of ``torchvision.models.regnet.RegNet`` and ``torchvision.models.regnet.BlockParams``.
+
+    .. autoclass:: torchvision.models.RegNet_X_400MF_Weights
+        :members:
+    """
+    weights = RegNet_X_400MF_Weights.verify(weights)
+    block_config = {"depth": 22, "w_0": 24, "w_a": 24.48, "w_m": 2.54, "group_width": 16}
+    block_params = BlockParams.from_init_params(**block_config, **kwargs)
+    return _regnet(block_params, weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", RegNet_X_800MF_Weights.IMAGENET1K_V1))
+def regnet_x_800mf(*, weights: Optional[RegNet_X_800MF_Weights] = None, progress: bool = True, **kwargs: Any) -> RegNet:
+    """Build the RegNetX_800MF model described in
+    `Designing Network Design Spaces <https://arxiv.org/abs/2003.13678>`_.
+
+    The ``weights`` argument is validated with ``RegNet_X_800MF_Weights.verify`` before the model is built.
+
+    Args:
+        weights (:class:`~torchvision.models.RegNet_X_800MF_Weights`, optional): Pretrained weights to
+            load. Possible values are listed under :class:`~torchvision.models.RegNet_X_800MF_Weights`
+            below. Defaults to ``None``, i.e. no pretrained weights are used.
+        progress (bool, optional): Whether to show a download progress bar on stderr. Default is True.
+        **kwargs: Extra keyword arguments, passed to both ``BlockParams.from_init_params`` and ``_regnet``;
+            see the `source code <https://github.com/pytorch/vision/blob/main/torchvision/models/regnet.py>`_
+            of ``torchvision.models.regnet.RegNet`` and ``torchvision.models.regnet.BlockParams``.
+
+    .. autoclass:: torchvision.models.RegNet_X_800MF_Weights
+        :members:
+    """
+    weights = RegNet_X_800MF_Weights.verify(weights)
+    block_config = {"depth": 16, "w_0": 56, "w_a": 35.73, "w_m": 2.28, "group_width": 16}
+    block_params = BlockParams.from_init_params(**block_config, **kwargs)
+    return _regnet(block_params, weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", RegNet_X_1_6GF_Weights.IMAGENET1K_V1))
+def regnet_x_1_6gf(*, weights: Optional[RegNet_X_1_6GF_Weights] = None, progress: bool = True, **kwargs: Any) -> RegNet:
+    """Build the RegNetX_1.6GF model described in
+    `Designing Network Design Spaces <https://arxiv.org/abs/2003.13678>`_.
+
+    The ``weights`` argument is validated with ``RegNet_X_1_6GF_Weights.verify`` before the model is built.
+
+    Args:
+        weights (:class:`~torchvision.models.RegNet_X_1_6GF_Weights`, optional): Pretrained weights to
+            load. Possible values are listed under :class:`~torchvision.models.RegNet_X_1_6GF_Weights`
+            below. Defaults to ``None``, i.e. no pretrained weights are used.
+        progress (bool, optional): Whether to show a download progress bar on stderr. Default is True.
+        **kwargs: Extra keyword arguments, passed to both ``BlockParams.from_init_params`` and ``_regnet``;
+            see the `source code <https://github.com/pytorch/vision/blob/main/torchvision/models/regnet.py>`_
+            of ``torchvision.models.regnet.RegNet`` and ``torchvision.models.regnet.BlockParams``.
+
+    .. autoclass:: torchvision.models.RegNet_X_1_6GF_Weights
+        :members:
+    """
+    weights = RegNet_X_1_6GF_Weights.verify(weights)
+    block_config = {"depth": 18, "w_0": 80, "w_a": 34.01, "w_m": 2.25, "group_width": 24}
+    block_params = BlockParams.from_init_params(**block_config, **kwargs)
+    return _regnet(block_params, weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", RegNet_X_3_2GF_Weights.IMAGENET1K_V1))
+def regnet_x_3_2gf(*, weights: Optional[RegNet_X_3_2GF_Weights] = None, progress: bool = True, **kwargs: Any) -> RegNet:
+    """Build the RegNetX_3.2GF model described in
+    `Designing Network Design Spaces <https://arxiv.org/abs/2003.13678>`_.
+
+    The ``weights`` argument is validated with ``RegNet_X_3_2GF_Weights.verify`` before the model is built.
+
+    Args:
+        weights (:class:`~torchvision.models.RegNet_X_3_2GF_Weights`, optional): Pretrained weights to
+            load. Possible values are listed under :class:`~torchvision.models.RegNet_X_3_2GF_Weights`
+            below. Defaults to ``None``, i.e. no pretrained weights are used.
+        progress (bool, optional): Whether to show a download progress bar on stderr. Default is True.
+        **kwargs: Extra keyword arguments, passed to both ``BlockParams.from_init_params`` and ``_regnet``;
+            see the `source code <https://github.com/pytorch/vision/blob/main/torchvision/models/regnet.py>`_
+            of ``torchvision.models.regnet.RegNet`` and ``torchvision.models.regnet.BlockParams``.
+
+    .. autoclass:: torchvision.models.RegNet_X_3_2GF_Weights
+        :members:
+    """
+    weights = RegNet_X_3_2GF_Weights.verify(weights)
+    block_config = {"depth": 25, "w_0": 88, "w_a": 26.31, "w_m": 2.25, "group_width": 48}
+    block_params = BlockParams.from_init_params(**block_config, **kwargs)
+    return _regnet(block_params, weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", RegNet_X_8GF_Weights.IMAGENET1K_V1))
+def regnet_x_8gf(*, weights: Optional[RegNet_X_8GF_Weights] = None, progress: bool = True, **kwargs: Any) -> RegNet:
+    """Build the RegNetX_8GF model described in
+    `Designing Network Design Spaces <https://arxiv.org/abs/2003.13678>`_.
+
+    The ``weights`` argument is validated with ``RegNet_X_8GF_Weights.verify`` before the model is built.
+
+    Args:
+        weights (:class:`~torchvision.models.RegNet_X_8GF_Weights`, optional): Pretrained weights to
+            load. Possible values are listed under :class:`~torchvision.models.RegNet_X_8GF_Weights`
+            below. Defaults to ``None``, i.e. no pretrained weights are used.
+        progress (bool, optional): Whether to show a download progress bar on stderr. Default is True.
+        **kwargs: Extra keyword arguments, passed to both ``BlockParams.from_init_params`` and ``_regnet``;
+            see the `source code <https://github.com/pytorch/vision/blob/main/torchvision/models/regnet.py>`_
+            of ``torchvision.models.regnet.RegNet`` and ``torchvision.models.regnet.BlockParams``.
+
+    .. autoclass:: torchvision.models.RegNet_X_8GF_Weights
+        :members:
+    """
+    weights = RegNet_X_8GF_Weights.verify(weights)
+    block_config = {"depth": 23, "w_0": 80, "w_a": 49.56, "w_m": 2.88, "group_width": 120}
+    block_params = BlockParams.from_init_params(**block_config, **kwargs)
+    return _regnet(block_params, weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", RegNet_X_16GF_Weights.IMAGENET1K_V1))
+def regnet_x_16gf(*, weights: Optional[RegNet_X_16GF_Weights] = None, progress: bool = True, **kwargs: Any) -> RegNet:
+    """Build the RegNetX_16GF model described in
+    `Designing Network Design Spaces <https://arxiv.org/abs/2003.13678>`_.
+
+    The ``weights`` argument is validated with ``RegNet_X_16GF_Weights.verify`` before the model is built.
+
+    Args:
+        weights (:class:`~torchvision.models.RegNet_X_16GF_Weights`, optional): Pretrained weights to
+            load. Possible values are listed under :class:`~torchvision.models.RegNet_X_16GF_Weights`
+            below. Defaults to ``None``, i.e. no pretrained weights are used.
+        progress (bool, optional): Whether to show a download progress bar on stderr. Default is True.
+        **kwargs: Extra keyword arguments, passed to both ``BlockParams.from_init_params`` and ``_regnet``;
+            see the `source code <https://github.com/pytorch/vision/blob/main/torchvision/models/regnet.py>`_
+            of ``torchvision.models.regnet.RegNet`` and ``torchvision.models.regnet.BlockParams``.
+
+    .. autoclass:: torchvision.models.RegNet_X_16GF_Weights
+        :members:
+    """
+    weights = RegNet_X_16GF_Weights.verify(weights)
+    block_config = {"depth": 22, "w_0": 216, "w_a": 55.59, "w_m": 2.1, "group_width": 128}
+    block_params = BlockParams.from_init_params(**block_config, **kwargs)
+    return _regnet(block_params, weights, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", RegNet_X_32GF_Weights.IMAGENET1K_V1))
+def regnet_x_32gf(*, weights: Optional[RegNet_X_32GF_Weights] = None, progress: bool = True, **kwargs: Any) -> RegNet:
+    """Build the RegNetX_32GF model described in
+    `Designing Network Design Spaces <https://arxiv.org/abs/2003.13678>`_.
+
+    The ``weights`` argument is validated with ``RegNet_X_32GF_Weights.verify`` before the model is built.
+
+    Args:
+        weights (:class:`~torchvision.models.RegNet_X_32GF_Weights`, optional): Pretrained weights to
+            load. Possible values are listed under :class:`~torchvision.models.RegNet_X_32GF_Weights`
+            below. Defaults to ``None``, i.e. no pretrained weights are used.
+        progress (bool, optional): Whether to show a download progress bar on stderr. Default is True.
+        **kwargs: Extra keyword arguments, passed to both ``BlockParams.from_init_params`` and ``_regnet``;
+            see the `source code <https://github.com/pytorch/vision/blob/main/torchvision/models/regnet.py>`_
+            of ``torchvision.models.regnet.RegNet`` and ``torchvision.models.regnet.BlockParams``.
+
+    .. autoclass:: torchvision.models.RegNet_X_32GF_Weights
+        :members:
+    """
+    weights = RegNet_X_32GF_Weights.verify(weights)
+    block_config = {"depth": 23, "w_0": 320, "w_a": 69.86, "w_m": 2.0, "group_width": 168}
+    block_params = BlockParams.from_init_params(**block_config, **kwargs)
+    return _regnet(block_params, weights, progress, **kwargs)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/resnet.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..47067ec83175a97cc6f6a8721b342128a434d440
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/resnet.py
@@ -0,0 +1,985 @@
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from ..transforms._presets import ImageClassification
+from ..utils import _log_api_usage_once
+from ._api import register_model, Weights, WeightsEnum
+from ._meta import _IMAGENET_CATEGORIES
+from ._utils import _ovewrite_named_param, handle_legacy_interface
+
+
+__all__ = [  # names exported by a star-import of this module
+    "ResNet",  # the model class defined in this file
+    "ResNet18_Weights",  # ``*_Weights`` names: the pretrained-weight enums
+    "ResNet34_Weights",
+    "ResNet50_Weights",
+    "ResNet101_Weights",
+    "ResNet152_Weights",
+    "ResNeXt50_32X4D_Weights",
+    "ResNeXt101_32X8D_Weights",
+    "ResNeXt101_64X4D_Weights",
+    "Wide_ResNet50_2_Weights",
+    "Wide_ResNet101_2_Weights",
+    "resnet18",  # lower-case names: presumably the model builder functions defined later in the file
+    "resnet34",
+    "resnet50",
+    "resnet101",
+    "resnet152",
+    "resnext50_32x4d",
+    "resnext101_32x8d",
+    "resnext101_64x4d",
+    "wide_resnet50_2",
+    "wide_resnet101_2",
+]
+
+
+def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d:
+    """Create a 3x3 convolution layer without a bias term.
+
+    Args:
+        in_planes: number of input channels.
+        out_planes: number of output channels.
+        stride: convolution stride.
+        groups: number of channel groups.
+        dilation: kernel dilation; the same value is used as the padding.
+    """
+    spatial = {"kernel_size": 3, "stride": stride, "padding": dilation, "dilation": dilation}
+    return nn.Conv2d(in_planes, out_planes, groups=groups, bias=False, **spatial)
+
+
+def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d:
+    """Create a 1x1 convolution layer without a bias term."""
+    return nn.Conv2d(in_channels=in_planes, out_channels=out_planes, kernel_size=1, stride=stride, bias=False)
+
+
+class BasicBlock(nn.Module):
+    """Residual block made of two 3x3 convolutions, each followed by a normalization layer."""
+
+    expansion: int = 1
+
+    def __init__(
+        self,
+        inplanes: int,
+        planes: int,
+        stride: int = 1,
+        downsample: Optional[nn.Module] = None,
+        groups: int = 1,
+        base_width: int = 64,
+        dilation: int = 1,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        """Create the layers; ``norm_layer`` falls back to ``nn.BatchNorm2d`` when it is ``None``.
+
+        Raises:
+            ValueError: if ``groups`` is not 1 or ``base_width`` is not 64.
+            NotImplementedError: if ``dilation`` is greater than 1.
+        """
+        super().__init__()
+        make_norm = nn.BatchNorm2d if norm_layer is None else norm_layer
+        if not (groups == 1 and base_width == 64):
+            raise ValueError("BasicBlock only supports groups=1 and base_width=64")
+        if dilation > 1:
+            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
+        # With stride != 1 the spatial reduction happens in self.conv1 (and in self.downsample).
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = make_norm(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(planes, planes)
+        self.bn2 = make_norm(planes)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Return ``relu(main_path(x) + shortcut)``; the shortcut is ``x`` or ``self.downsample(x)``."""
+        shortcut = x
+        y = self.relu(self.bn1(self.conv1(x)))
+        y = self.bn2(self.conv2(y))
+        if self.downsample is not None:
+            shortcut = self.downsample(x)
+
+        y += shortcut
+        return self.relu(y)
+
+
+class Bottleneck(nn.Module):
+    """Residual block with a 1x1 -> 3x3 -> 1x1 convolution stack and ``expansion`` = 4.
+
+    torchvision applies the downsampling stride in the 3x3 convolution (``self.conv2``), whereas the original
+    implementation from "Deep residual learning for image recognition" (https://arxiv.org/abs/1512.03385) applies
+    it in the first 1x1 convolution (``self.conv1``). This variant is also known as ResNet V1.5 and improves
+    accuracy according to https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
+    """
+
+    expansion: int = 4
+
+    def __init__(
+        self,
+        inplanes: int,
+        planes: int,
+        stride: int = 1,
+        downsample: Optional[nn.Module] = None,
+        groups: int = 1,
+        base_width: int = 64,
+        dilation: int = 1,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        """Create the layers.
+
+        ``norm_layer`` falls back to ``nn.BatchNorm2d`` when it is ``None``; ``groups`` and ``dilation`` are
+        passed to the 3x3 convolution only.
+        """
+        super().__init__()
+        make_norm = nn.BatchNorm2d if norm_layer is None else norm_layer
+        # Channel count of the 3x3 stage, scaled by base_width / 64 and multiplied by groups.
+        mid_channels = int(planes * (base_width / 64.0)) * groups
+        out_channels = planes * self.expansion
+        # With stride != 1 the spatial reduction happens in self.conv2 (and in self.downsample).
+        self.conv1 = conv1x1(inplanes, mid_channels)
+        self.bn1 = make_norm(mid_channels)
+        self.conv2 = conv3x3(mid_channels, mid_channels, stride, groups, dilation)
+        self.bn2 = make_norm(mid_channels)
+        self.conv3 = conv1x1(mid_channels, out_channels)
+        self.bn3 = make_norm(out_channels)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Return ``relu(main_path(x) + shortcut)``; the shortcut is ``x`` or ``self.downsample(x)``."""
+        shortcut = x
+
+        y = self.relu(self.bn1(self.conv1(x)))
+        y = self.relu(self.bn2(self.conv2(y)))
+        y = self.bn3(self.conv3(y))
+
+        if self.downsample is not None:
+            shortcut = self.downsample(x)
+
+        y += shortcut
+        return self.relu(y)
+
+
+class ResNet(nn.Module):
+    """ResNet classifier: 7x7 stem, max-pool, four residual stages, global average pool, linear head."""
+
+    def __init__(
+        self,
+        block: type[Union[BasicBlock, Bottleneck]],
+        layers: list[int],
+        num_classes: int = 1000,
+        zero_init_residual: bool = False,
+        groups: int = 1,
+        width_per_group: int = 64,
+        replace_stride_with_dilation: Optional[list[bool]] = None,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        """Build all layers, then initialize convolution and normalization weights.
+
+        Args:
+            layers: number of ``block`` instances in each of the four stages.
+            num_classes: output size of the final linear layer.
+            zero_init_residual: zero the last norm weight of each residual block.
+            norm_layer: factory for normalization layers; ``nn.BatchNorm2d`` when ``None``.
+
+        Raises:
+            ValueError: if ``replace_stride_with_dilation`` is given but does not have exactly 3 elements.
+        """
+        super().__init__()
+        _log_api_usage_once(self)
+        make_norm = nn.BatchNorm2d if norm_layer is None else norm_layer
+        self._norm_layer = make_norm
+
+        self.inplanes = 64
+        self.dilation = 1
+        if replace_stride_with_dilation is None:
+            # One flag per strided stage (layer2..layer4): whether to swap its stride of 2 for dilation.
+            replace_stride_with_dilation = [False, False, False]
+        if len(replace_stride_with_dilation) != 3:
+            raise ValueError(
+                "replace_stride_with_dilation should be None "
+                f"or a 3-element tuple, got {replace_stride_with_dilation}"
+            )
+        self.groups = groups
+        self.base_width = width_per_group
+        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
+        self.bn1 = make_norm(self.inplanes)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(block, 64, layers[0])
+        self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0])
+        self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1])
+        self.layer4 = self._make_layer(block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2])
+        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+        self.fc = nn.Linear(512 * block.expansion, num_classes)
+
+        for module in self.modules():
+            if isinstance(module, nn.Conv2d):
+                nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
+            elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
+                nn.init.constant_(module.weight, 1)
+                nn.init.constant_(module.bias, 0)
+
+        # Optionally zero the last norm layer's weight of every residual branch so that the branch
+        # starts at zero and each residual block initially behaves like an identity.
+        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
+        if zero_init_residual:
+            for module in self.modules():
+                if isinstance(module, Bottleneck) and module.bn3.weight is not None:
+                    nn.init.constant_(module.bn3.weight, 0)  # type: ignore[arg-type]
+                elif isinstance(module, BasicBlock) and module.bn2.weight is not None:
+                    nn.init.constant_(module.bn2.weight, 0)  # type: ignore[arg-type]
+
+    def _make_layer(
+        self,
+        block: type[Union[BasicBlock, Bottleneck]],
+        planes: int,
+        blocks: int,
+        stride: int = 1,
+        dilate: bool = False,
+    ) -> nn.Sequential:
+        """Assemble one stage of ``blocks`` residual blocks, updating ``self.inplanes`` and ``self.dilation``."""
+        make_norm = self._norm_layer
+        first_dilation = self.dilation
+        out_channels = planes * block.expansion
+        if dilate:
+            # Swap the stride for dilation; only blocks after the first use the enlarged value.
+            self.dilation *= stride
+            stride = 1
+
+        shortcut = None
+        if stride != 1 or self.inplanes != out_channels:
+            # Projection so the shortcut matches the main path's stride and channel count.
+            shortcut = nn.Sequential(conv1x1(self.inplanes, out_channels, stride), make_norm(out_channels))
+
+        stage = [
+            block(self.inplanes, planes, stride, shortcut, self.groups, self.base_width, first_dilation, make_norm)
+        ]
+        self.inplanes = out_channels
+        stage.extend(
+            block(
+                self.inplanes,
+                planes,
+                groups=self.groups,
+                base_width=self.base_width,
+                dilation=self.dilation,
+                norm_layer=make_norm,
+            )
+            for _ in range(1, blocks)
+        )
+        return nn.Sequential(*stage)
+
+    def _forward_impl(self, x: Tensor) -> Tensor:
+        """Run the stem, the four stages and the classification head on ``x``."""
+        # See note [TorchScript super()]
+        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))
+        x = self.layer4(self.layer3(self.layer2(self.layer1(x))))
+        x = torch.flatten(self.avgpool(x), 1)
+        return self.fc(x)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Delegate to ``_forward_impl``."""
+        return self._forward_impl(x)
+
+
+def _resnet(
+    block: type[Union[BasicBlock, Bottleneck]],
+    layers: list[int],
+    weights: Optional[WeightsEnum],
+    progress: bool,
+    **kwargs: Any,
+) -> ResNet:
+    """Build a ``ResNet`` from ``block``/``layers`` and, when ``weights`` is given, load its state dict."""
+    if weights is None:
+        return ResNet(block, layers, **kwargs)
+
+    # Hand the checkpoint's category count to _ovewrite_named_param for ``num_classes`` before building.
+    _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+    net = ResNet(block, layers, **kwargs)
+    net.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+    return net
+
+
+_COMMON_META = {  # entries spread into the `meta` dict of every Weights definition in this module via **_COMMON_META
+    "min_size": (1, 1),
+    "categories": _IMAGENET_CATEGORIES,  # _resnet() sizes num_classes from len(meta["categories"])
+}
+
+
+class ResNet18_Weights(WeightsEnum):  # weight entries accepted by resnet18(weights=...)
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/resnet18-f37072fd.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,  # shared "min_size" / "categories" entries
+            "num_params": 11689512,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnet",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 69.758,
+                    "acc@5": 89.078,
+                }
+            },
+            "_ops": 1.814,
+            "_file_size": 44.661,
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # DEFAULT aliases the only entry
+
+
+class ResNet34_Weights(WeightsEnum):  # weight entries accepted by resnet34(weights=...)
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/resnet34-b627a593.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,  # shared "min_size" / "categories" entries
+            "num_params": 21797672,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnet",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 73.314,
+                    "acc@5": 91.420,
+                }
+            },
+            "_ops": 3.664,
+            "_file_size": 83.275,
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # DEFAULT aliases the only entry
+
+
+class ResNet50_Weights(WeightsEnum):  # weight entries accepted by resnet50(weights=...)
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/resnet50-0676ba61.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,  # shared "min_size" / "categories" entries
+            "num_params": 25557032,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnet",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 76.130,
+                    "acc@5": 92.862,
+                }
+            },
+            "_ops": 4.089,
+            "_file_size": 97.781,
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/resnet50-11ad3fa6.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),  # V2 preset adds resize_size=232 to V1's crop_size=224
+        meta={
+            **_COMMON_META,  # shared "min_size" / "categories" entries
+            "num_params": 25557032,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#issuecomment-1013906621",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 80.858,
+                    "acc@5": 95.434,
+                }
+            },
+            "_ops": 4.089,
+            "_file_size": 97.79,
+            "_docs": """
+                These weights improve upon the results of the original paper by using TorchVision's `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V2  # DEFAULT aliases the V2 entry, not V1
+
+
+class ResNet101_Weights(WeightsEnum):  # weight entries accepted by resnet101(weights=...)
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/resnet101-63fe2227.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,  # shared "min_size" / "categories" entries
+            "num_params": 44549160,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnet",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 77.374,
+                    "acc@5": 93.546,
+                }
+            },
+            "_ops": 7.801,
+            "_file_size": 170.511,
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/resnet101-cd907fc2.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),  # V2 preset adds resize_size=232 to V1's crop_size=224
+        meta={
+            **_COMMON_META,  # shared "min_size" / "categories" entries
+            "num_params": 44549160,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 81.886,
+                    "acc@5": 95.780,
+                }
+            },
+            "_ops": 7.801,
+            "_file_size": 170.53,
+            "_docs": """
+                These weights improve upon the results of the original paper by using TorchVision's `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V2  # DEFAULT aliases the V2 entry, not V1
+
+
+class ResNet152_Weights(WeightsEnum):  # weight entries accepted by resnet152(weights=...)
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/resnet152-394f9c45.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,  # shared "min_size" / "categories" entries
+            "num_params": 60192808,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnet",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 78.312,
+                    "acc@5": 94.046,
+                }
+            },
+            "_ops": 11.514,
+            "_file_size": 230.434,
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/resnet152-f82ba261.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),  # V2 preset adds resize_size=232 to V1's crop_size=224
+        meta={
+            **_COMMON_META,  # shared "min_size" / "categories" entries
+            "num_params": 60192808,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 82.284,
+                    "acc@5": 96.002,
+                }
+            },
+            "_ops": 11.514,
+            "_file_size": 230.474,
+            "_docs": """
+                These weights improve upon the results of the original paper by using TorchVision's `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V2  # DEFAULT aliases the V2 entry, not V1
+
+
+class ResNeXt50_32X4D_Weights(WeightsEnum):  # weight entries accepted by resnext50_32x4d(weights=...)
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,  # shared "min_size" / "categories" entries
+            "num_params": 25028904,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnext",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 77.618,
+                    "acc@5": 93.698,
+                }
+            },
+            "_ops": 4.23,
+            "_file_size": 95.789,
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/resnext50_32x4d-1a0047aa.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),  # V2 preset adds resize_size=232 to V1's crop_size=224
+        meta={
+            **_COMMON_META,  # shared "min_size" / "categories" entries
+            "num_params": 25028904,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 81.198,
+                    "acc@5": 95.340,
+                }
+            },
+            "_ops": 4.23,
+            "_file_size": 95.833,
+            "_docs": """
+                These weights improve upon the results of the original paper by using TorchVision's `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V2  # DEFAULT aliases the V2 entry, not V1
+
+
+class ResNeXt101_32X8D_Weights(WeightsEnum):  # weight entries accepted by resnext101_32x8d(weights=...)
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,  # shared "min_size" / "categories" entries
+            "num_params": 88791336,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnext",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 79.312,
+                    "acc@5": 94.526,
+                }
+            },
+            "_ops": 16.414,
+            "_file_size": 339.586,
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/resnext101_32x8d-110c445d.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),  # V2 preset adds resize_size=232 to V1's crop_size=224
+        meta={
+            **_COMMON_META,  # shared "min_size" / "categories" entries
+            "num_params": 88791336,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe-with-fixres",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 82.834,
+                    "acc@5": 96.228,
+                }
+            },
+            "_ops": 16.414,
+            "_file_size": 339.673,
+            "_docs": """
+                These weights improve upon the results of the original paper by using TorchVision's `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V2  # DEFAULT aliases the V2 entry, not V1
+
+
+class ResNeXt101_64X4D_Weights(WeightsEnum):  # weight entries accepted by resnext101_64x4d(weights=...)
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/resnext101_64x4d-173b62eb.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),  # unlike the other V1 entries in this module, this one sets resize_size=232
+        meta={
+            **_COMMON_META,  # shared "min_size" / "categories" entries
+            "num_params": 83455272,
+            "recipe": "https://github.com/pytorch/vision/pull/5935",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 83.246,
+                    "acc@5": 96.454,
+                }
+            },
+            "_ops": 15.46,
+            "_file_size": 319.318,
+            "_docs": """
+                These weights were trained from scratch by using TorchVision's `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # DEFAULT aliases the only entry
+
+
+class Wide_ResNet50_2_Weights(WeightsEnum):  # weight entries accepted by wide_resnet50_2(weights=...)
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,  # shared "min_size" / "categories" entries
+            "num_params": 68883240,
+            "recipe": "https://github.com/pytorch/vision/pull/912#issue-445437439",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 78.468,
+                    "acc@5": 94.086,
+                }
+            },
+            "_ops": 11.398,
+            "_file_size": 131.82,  # NOTE(review): about half of V2's 263.124 despite equal num_params; presumably a lower-precision checkpoint, confirm
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/wide_resnet50_2-9ba9bcbe.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),  # V2 preset adds resize_size=232 to V1's crop_size=224
+        meta={
+            **_COMMON_META,  # shared "min_size" / "categories" entries
+            "num_params": 68883240,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe-with-fixres",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 81.602,
+                    "acc@5": 95.758,
+                }
+            },
+            "_ops": 11.398,
+            "_file_size": 263.124,
+            "_docs": """
+                These weights improve upon the results of the original paper by using TorchVision's `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V2  # DEFAULT aliases the V2 entry, not V1
+
+
+class Wide_ResNet101_2_Weights(WeightsEnum):  # weight entries accepted by wide_resnet101_2(weights=...)
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,  # shared "min_size" / "categories" entries
+            "num_params": 126886696,
+            "recipe": "https://github.com/pytorch/vision/pull/912#issue-445437439",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 78.848,
+                    "acc@5": 94.284,
+                }
+            },
+            "_ops": 22.753,
+            "_file_size": 242.896,  # NOTE(review): about half of V2's 484.747 despite equal num_params; presumably a lower-precision checkpoint, confirm
+            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+        },
+    )
+    IMAGENET1K_V2 = Weights(
+        url="https://download.pytorch.org/models/wide_resnet101_2-d733dc28.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),  # V2 preset adds resize_size=232 to V1's crop_size=224
+        meta={
+            **_COMMON_META,  # shared "min_size" / "categories" entries
+            "num_params": 126886696,
+            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 82.510,
+                    "acc@5": 96.020,
+                }
+            },
+            "_ops": 22.753,
+            "_file_size": 484.747,
+            "_docs": """
+                These weights improve upon the results of the original paper by using TorchVision's `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V2  # DEFAULT aliases the V2 entry, not V1
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", ResNet18_Weights.IMAGENET1K_V1))  # ties the legacy `pretrained` argument to IMAGENET1K_V1 (handling lives in the decorator, outside this view)
+def resnet18(*, weights: Optional[ResNet18_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet:
+    """ResNet-18 from `Deep Residual Learning for Image Recognition <https://arxiv.org/abs/1512.03385>`__.
+
+    Args:
+        weights (:class:`~torchvision.models.ResNet18_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.ResNet18_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.ResNet18_Weights
+        :members:
+    """
+    weights = ResNet18_Weights.verify(weights)  # verify() is defined outside this view; presumably validates/normalizes `weights` (confirm)
+
+    return _resnet(BasicBlock, [2, 2, 2, 2], weights, progress, **kwargs)  # layers=[2, 2, 2, 2]; presumably one block count per stage (ResNet.__init__ is outside this view)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", ResNet34_Weights.IMAGENET1K_V1))  # ties the legacy `pretrained` argument to IMAGENET1K_V1 (handling lives in the decorator, outside this view)
+def resnet34(*, weights: Optional[ResNet34_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet:
+    """ResNet-34 from `Deep Residual Learning for Image Recognition <https://arxiv.org/abs/1512.03385>`__.
+
+    Args:
+        weights (:class:`~torchvision.models.ResNet34_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.ResNet34_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.ResNet34_Weights
+        :members:
+    """
+    weights = ResNet34_Weights.verify(weights)  # verify() is defined outside this view; presumably validates/normalizes `weights` (confirm)
+
+    return _resnet(BasicBlock, [3, 4, 6, 3], weights, progress, **kwargs)  # same layers list as resnet50, but built from BasicBlock instead of Bottleneck
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", ResNet50_Weights.IMAGENET1K_V1))  # legacy `pretrained` is tied to V1 even though ResNet50_Weights.DEFAULT is V2
+def resnet50(*, weights: Optional[ResNet50_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet:
+    """ResNet-50 from `Deep Residual Learning for Image Recognition <https://arxiv.org/abs/1512.03385>`__.
+
+    .. note::
+       The bottleneck of TorchVision places the stride for downsampling to the second 3x3
+       convolution while the original paper places it to the first 1x1 convolution.
+       This variant improves the accuracy and is known as `ResNet V1.5
+       <https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.ResNet50_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.ResNet50_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.ResNet50_Weights
+        :members:
+    """
+    weights = ResNet50_Weights.verify(weights)  # verify() is defined outside this view; presumably validates/normalizes `weights` (confirm)
+
+    return _resnet(Bottleneck, [3, 4, 6, 3], weights, progress, **kwargs)  # layers=[3, 4, 6, 3]; presumably one block count per stage (ResNet.__init__ is outside this view)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", ResNet101_Weights.IMAGENET1K_V1))  # legacy `pretrained` is tied to V1 even though ResNet101_Weights.DEFAULT is V2
+def resnet101(*, weights: Optional[ResNet101_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet:
+    """ResNet-101 from `Deep Residual Learning for Image Recognition <https://arxiv.org/abs/1512.03385>`__.
+
+    .. note::
+       The bottleneck of TorchVision places the stride for downsampling to the second 3x3
+       convolution while the original paper places it to the first 1x1 convolution.
+       This variant improves the accuracy and is known as `ResNet V1.5
+       <https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.ResNet101_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.ResNet101_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.ResNet101_Weights
+        :members:
+    """
+    weights = ResNet101_Weights.verify(weights)  # verify() is defined outside this view; presumably validates/normalizes `weights` (confirm)
+
+    return _resnet(Bottleneck, [3, 4, 23, 3], weights, progress, **kwargs)  # differs from resnet50 only in the third layers entry (23 instead of 6)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", ResNet152_Weights.IMAGENET1K_V1))  # legacy `pretrained` is tied to V1 even though ResNet152_Weights.DEFAULT is V2
+def resnet152(*, weights: Optional[ResNet152_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet:
+    """ResNet-152 from `Deep Residual Learning for Image Recognition <https://arxiv.org/abs/1512.03385>`__.
+
+    .. note::
+       The bottleneck of TorchVision places the stride for downsampling to the second 3x3
+       convolution while the original paper places it to the first 1x1 convolution.
+       This variant improves the accuracy and is known as `ResNet V1.5
+       <https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.ResNet152_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.ResNet152_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.ResNet152_Weights
+        :members:
+    """
+    weights = ResNet152_Weights.verify(weights)  # verify() is defined outside this view; presumably validates/normalizes `weights` (confirm)
+
+    return _resnet(Bottleneck, [3, 8, 36, 3], weights, progress, **kwargs)  # layers=[3, 8, 36, 3]; presumably one block count per stage (ResNet.__init__ is outside this view)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", ResNeXt50_32X4D_Weights.IMAGENET1K_V1))  # legacy `pretrained` is tied to V1 even though the enum's DEFAULT is V2
+def resnext50_32x4d(
+    *, weights: Optional[ResNeXt50_32X4D_Weights] = None, progress: bool = True, **kwargs: Any
+) -> ResNet:
+    """ResNeXt-50 32x4d model from
+    `Aggregated Residual Transformations for Deep Neural Networks <https://arxiv.org/abs/1611.05431>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.ResNeXt50_32X4D_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.ResNeXt50_32X4D_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.ResNeXt50_32X4D_Weights
+        :members:
+    """
+    weights = ResNeXt50_32X4D_Weights.verify(weights)  # verify() is defined outside this view; presumably validates/normalizes `weights` (confirm)
+
+    _ovewrite_named_param(kwargs, "groups", 32)  # groups=32 and width_per_group=4 are the "32x4d" in the model name
+    _ovewrite_named_param(kwargs, "width_per_group", 4)
+    return _resnet(Bottleneck, [3, 4, 6, 3], weights, progress, **kwargs)  # same block type and layers list as resnet50
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", ResNeXt101_32X8D_Weights.IMAGENET1K_V1))  # legacy `pretrained` is tied to V1 even though the enum's DEFAULT is V2
+def resnext101_32x8d(
+    *, weights: Optional[ResNeXt101_32X8D_Weights] = None, progress: bool = True, **kwargs: Any
+) -> ResNet:
+    """ResNeXt-101 32x8d model from
+    `Aggregated Residual Transformations for Deep Neural Networks <https://arxiv.org/abs/1611.05431>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.ResNeXt101_32X8D_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.ResNeXt101_32X8D_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.ResNeXt101_32X8D_Weights
+        :members:
+    """
+    weights = ResNeXt101_32X8D_Weights.verify(weights)  # verify() is defined outside this view; presumably validates/normalizes `weights` (confirm)
+
+    _ovewrite_named_param(kwargs, "groups", 32)  # groups=32 and width_per_group=8 are the "32x8d" in the model name
+    _ovewrite_named_param(kwargs, "width_per_group", 8)
+    return _resnet(Bottleneck, [3, 4, 23, 3], weights, progress, **kwargs)  # same block type and layers list as resnet101
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", ResNeXt101_64X4D_Weights.IMAGENET1K_V1))  # ties the legacy `pretrained` argument to IMAGENET1K_V1 (handling lives in the decorator, outside this view)
+def resnext101_64x4d(
+    *, weights: Optional[ResNeXt101_64X4D_Weights] = None, progress: bool = True, **kwargs: Any
+) -> ResNet:
+    """ResNeXt-101 64x4d model from
+    `Aggregated Residual Transformations for Deep Neural Networks <https://arxiv.org/abs/1611.05431>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.ResNeXt101_64X4D_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.ResNeXt101_64X4D_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.ResNeXt101_64X4D_Weights
+        :members:
+    """
+    weights = ResNeXt101_64X4D_Weights.verify(weights)  # verify() is defined outside this view; presumably validates/normalizes `weights` (confirm)
+
+    _ovewrite_named_param(kwargs, "groups", 64)  # groups=64 and width_per_group=4 are the "64x4d" in the model name
+    _ovewrite_named_param(kwargs, "width_per_group", 4)
+    return _resnet(Bottleneck, [3, 4, 23, 3], weights, progress, **kwargs)  # same block type and layers list as resnet101
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", Wide_ResNet50_2_Weights.IMAGENET1K_V1))  # legacy `pretrained` is tied to V1 even though the enum's DEFAULT is V2
+def wide_resnet50_2(
+    *, weights: Optional[Wide_ResNet50_2_Weights] = None, progress: bool = True, **kwargs: Any
+) -> ResNet:
+    """Wide ResNet-50-2 model from
+    `Wide Residual Networks <https://arxiv.org/abs/1605.07146>`_.
+
+    The model is the same as ResNet except for the bottleneck number of channels
+    which is twice larger in every block. The number of channels in outer 1x1
+    convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
+    channels, and in Wide ResNet-50-2 has 2048-1024-2048.
+
+    Args:
+        weights (:class:`~torchvision.models.Wide_ResNet50_2_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.Wide_ResNet50_2_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.Wide_ResNet50_2_Weights
+        :members:
+    """
+    weights = Wide_ResNet50_2_Weights.verify(weights)  # verify() is defined outside this view; presumably validates/normalizes `weights` (confirm)
+
+    _ovewrite_named_param(kwargs, "width_per_group", 64 * 2)  # 64 * 2: the doubled bottleneck width from the docstring (64 presumably being ResNet's default, confirm)
+    return _resnet(Bottleneck, [3, 4, 6, 3], weights, progress, **kwargs)  # same block type and layers list as resnet50
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", Wide_ResNet101_2_Weights.IMAGENET1K_V1))  # legacy `pretrained` is tied to V1 even though the enum's DEFAULT is V2
+def wide_resnet101_2(
+    *, weights: Optional[Wide_ResNet101_2_Weights] = None, progress: bool = True, **kwargs: Any
+) -> ResNet:
+    """Wide ResNet-101-2 model from
+    `Wide Residual Networks <https://arxiv.org/abs/1605.07146>`_.
+
+    The model is the same as ResNet except for the bottleneck number of channels
+    which is twice larger in every block. The number of channels in outer 1x1
+    convolutions is the same, e.g. last block in ResNet-101 has 2048-512-2048
+    channels, and in Wide ResNet-101-2 has 2048-1024-2048.
+
+    Args:
+        weights (:class:`~torchvision.models.Wide_ResNet101_2_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.Wide_ResNet101_2_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.Wide_ResNet101_2_Weights
+        :members:
+    """
+    weights = Wide_ResNet101_2_Weights.verify(weights)  # verify() is defined outside this view; presumably validates/normalizes `weights` (confirm)
+
+    _ovewrite_named_param(kwargs, "width_per_group", 64 * 2)  # 64 * 2: the doubled bottleneck width from the docstring (64 presumably being ResNet's default, confirm)
+    return _resnet(Bottleneck, [3, 4, 23, 3], weights, progress, **kwargs)  # same block type and layers list as resnet101
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d6f37f958a131b76ce80306718b77d78bc3f045
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/__init__.py
@@ -0,0 +1,3 @@
+from .deeplabv3 import *
+from .fcn import *
+from .lraspp import *
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4aa06a5e003b43f7191f197cf6ecbb742b60827a
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/__pycache__/_utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/__pycache__/_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a226ad620fd21b9ecd2b7bbceb43a23418805c6f
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/__pycache__/_utils.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/__pycache__/deeplabv3.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/__pycache__/deeplabv3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..479a26bbf6ac24fae6bd3d4251f48c5045297d22
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/__pycache__/deeplabv3.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/__pycache__/fcn.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/__pycache__/fcn.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6a6a1b697b5ded71941dbd83f4580b186e5aa694
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/__pycache__/fcn.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/__pycache__/lraspp.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/__pycache__/lraspp.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b93ee729a3a7a54fd3f392d2f762295c25e3b3f7
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/__pycache__/lraspp.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/_utils.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..45bc2e7c43563ad5603f4c53cfee3064cce5e4c7
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/_utils.py
@@ -0,0 +1,37 @@
+from collections import OrderedDict
+from typing import Optional
+
+from torch import nn, Tensor
+from torch.nn import functional as F
+
+from ...utils import _log_api_usage_once
+
+
+class _SimpleSegmentationModel(nn.Module):
+    __constants__ = ["aux_classifier"]
+
+    def __init__(self, backbone: nn.Module, classifier: nn.Module, aux_classifier: Optional[nn.Module] = None) -> None:
+        super().__init__()
+        _log_api_usage_once(self)
+        self.backbone = backbone
+        self.classifier = classifier
+        self.aux_classifier = aux_classifier
+
+    def forward(self, x: Tensor) -> dict[str, Tensor]:
+        input_shape = x.shape[-2:]
+        # contract: features is a dict of tensors
+        features = self.backbone(x)
+
+        result = OrderedDict()
+        x = features["out"]
+        x = self.classifier(x)
+        x = F.interpolate(x, size=input_shape, mode="bilinear", align_corners=False)
+        result["out"] = x
+
+        if self.aux_classifier is not None:
+            x = features["aux"]
+            x = self.aux_classifier(x)
+            x = F.interpolate(x, size=input_shape, mode="bilinear", align_corners=False)
+            result["aux"] = x
+
+        return result
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/deeplabv3.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/deeplabv3.py
new file mode 100644
index 0000000000000000000000000000000000000000..62790ecb4ddcb05753cf4e7d2004154ad1159e94
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/deeplabv3.py
@@ -0,0 +1,391 @@
+from collections.abc import Sequence
+from functools import partial
+from typing import Any, Optional
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from ...transforms._presets import SemanticSegmentation
+from .._api import register_model, Weights, WeightsEnum
+from .._meta import _VOC_CATEGORIES
+from .._utils import _ovewrite_value_param, handle_legacy_interface, IntermediateLayerGetter
+from ..mobilenetv3 import mobilenet_v3_large, MobileNet_V3_Large_Weights, MobileNetV3
+from ..resnet import ResNet, resnet101, ResNet101_Weights, resnet50, ResNet50_Weights
+from ._utils import _SimpleSegmentationModel
+from .fcn import FCNHead
+
+
+__all__ = [
+    "DeepLabV3",
+    "DeepLabV3_ResNet50_Weights",
+    "DeepLabV3_ResNet101_Weights",
+    "DeepLabV3_MobileNet_V3_Large_Weights",
+    "deeplabv3_mobilenet_v3_large",
+    "deeplabv3_resnet50",
+    "deeplabv3_resnet101",
+]
+
+
+class DeepLabV3(_SimpleSegmentationModel):
+    """
+    Implements DeepLabV3 model from
+    `"Rethinking Atrous Convolution for Semantic Image Segmentation"
+    <https://arxiv.org/abs/1706.05587>`_.
+
+    Args:
+        backbone (nn.Module): the network used to compute the features for the model.
+            The backbone should return an OrderedDict[Tensor], with the key being
+            "out" for the last feature map used, and "aux" if an auxiliary classifier
+            is used.
+        classifier (nn.Module): module that takes the "out" element returned from
+            the backbone and returns a dense prediction.
+        aux_classifier (nn.Module, optional): auxiliary classifier used during training
+    """
+
+    pass
+
+
+class DeepLabHead(nn.Sequential):
+    def __init__(self, in_channels: int, num_classes: int, atrous_rates: Sequence[int] = (12, 24, 36)) -> None:
+        super().__init__(
+            ASPP(in_channels, atrous_rates),
+            nn.Conv2d(256, 256, 3, padding=1, bias=False),
+            nn.BatchNorm2d(256),
+            nn.ReLU(),
+            nn.Conv2d(256, num_classes, 1),
+        )
+
+
+class ASPPConv(nn.Sequential):
+    def __init__(self, in_channels: int, out_channels: int, dilation: int) -> None:
+        modules = [
+            nn.Conv2d(in_channels, out_channels, 3, padding=dilation, dilation=dilation, bias=False),
+            nn.BatchNorm2d(out_channels),
+            nn.ReLU(),
+        ]
+        super().__init__(*modules)
+
+
+class ASPPPooling(nn.Sequential):
+    def __init__(self, in_channels: int, out_channels: int) -> None:
+        super().__init__(
+            nn.AdaptiveAvgPool2d(1),
+            nn.Conv2d(in_channels, out_channels, 1, bias=False),
+            nn.BatchNorm2d(out_channels),
+            nn.ReLU(),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        size = x.shape[-2:]
+        for mod in self:
+            x = mod(x)
+        return F.interpolate(x, size=size, mode="bilinear", align_corners=False)
+
+
+class ASPP(nn.Module):
+    def __init__(self, in_channels: int, atrous_rates: Sequence[int], out_channels: int = 256) -> None:
+        super().__init__()
+        modules = []
+        modules.append(
+            nn.Sequential(nn.Conv2d(in_channels, out_channels, 1, bias=False), nn.BatchNorm2d(out_channels), nn.ReLU())
+        )
+
+        rates = tuple(atrous_rates)
+        for rate in rates:
+            modules.append(ASPPConv(in_channels, out_channels, rate))
+
+        modules.append(ASPPPooling(in_channels, out_channels))
+
+        self.convs = nn.ModuleList(modules)
+
+        self.project = nn.Sequential(
+            nn.Conv2d(len(self.convs) * out_channels, out_channels, 1, bias=False),
+            nn.BatchNorm2d(out_channels),
+            nn.ReLU(),
+            nn.Dropout(0.5),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        _res = []
+        for conv in self.convs:
+            _res.append(conv(x))
+        res = torch.cat(_res, dim=1)
+        return self.project(res)
+
+
+def _deeplabv3_resnet(
+    backbone: ResNet,
+    num_classes: int,
+    aux: Optional[bool],
+) -> DeepLabV3:
+    return_layers = {"layer4": "out"}
+    if aux:
+        return_layers["layer3"] = "aux"
+    backbone = IntermediateLayerGetter(backbone, return_layers=return_layers)
+
+    aux_classifier = FCNHead(1024, num_classes) if aux else None
+    classifier = DeepLabHead(2048, num_classes)
+    return DeepLabV3(backbone, classifier, aux_classifier)
+
+
+_COMMON_META = {
+    "categories": _VOC_CATEGORIES,
+    "min_size": (1, 1),
+    "_docs": """
+        These weights were trained on a subset of COCO, using only the 20 categories that are present in the Pascal VOC
+        dataset.
+    """,
+}
+
+
+class DeepLabV3_ResNet50_Weights(WeightsEnum):
+    COCO_WITH_VOC_LABELS_V1 = Weights(
+        url="https://download.pytorch.org/models/deeplabv3_resnet50_coco-cd0a2569.pth",
+        transforms=partial(SemanticSegmentation, resize_size=520),
+        meta={
+            **_COMMON_META,
+            "num_params": 42004074,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/segmentation#deeplabv3_resnet50",
+            "_metrics": {
+                "COCO-val2017-VOC-labels": {
+                    "miou": 66.4,
+                    "pixel_acc": 92.4,
+                }
+            },
+            "_ops": 178.722,
+            "_file_size": 160.515,
+        },
+    )
+    DEFAULT = COCO_WITH_VOC_LABELS_V1
+
+
+class DeepLabV3_ResNet101_Weights(WeightsEnum):
+    COCO_WITH_VOC_LABELS_V1 = Weights(
+        url="https://download.pytorch.org/models/deeplabv3_resnet101_coco-586e9e4e.pth",
+        transforms=partial(SemanticSegmentation, resize_size=520),
+        meta={
+            **_COMMON_META,
+            "num_params": 60996202,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/segmentation#fcn_resnet101",
+            "_metrics": {
+                "COCO-val2017-VOC-labels": {
+                    "miou": 67.4,
+                    "pixel_acc": 92.4,
+                }
+            },
+            "_ops": 258.743,
+            "_file_size": 233.217,
+        },
+    )
+    DEFAULT = COCO_WITH_VOC_LABELS_V1
+
+
+class DeepLabV3_MobileNet_V3_Large_Weights(WeightsEnum):
+    COCO_WITH_VOC_LABELS_V1 = Weights(
+        url="https://download.pytorch.org/models/deeplabv3_mobilenet_v3_large-fc3c493d.pth",
+        transforms=partial(SemanticSegmentation, resize_size=520),
+        meta={
+            **_COMMON_META,
+            "num_params": 11029328,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/segmentation#deeplabv3_mobilenet_v3_large",
+            "_metrics": {
+                "COCO-val2017-VOC-labels": {
+                    "miou": 60.3,
+                    "pixel_acc": 91.2,
+                }
+            },
+            "_ops": 10.452,
+            "_file_size": 42.301,
+        },
+    )
+    DEFAULT = COCO_WITH_VOC_LABELS_V1
+
+
+def _deeplabv3_mobilenetv3(
+    backbone: MobileNetV3,
+    num_classes: int,
+    aux: Optional[bool],
+) -> DeepLabV3:
+    backbone = backbone.features
+    # Gather the indices of blocks which are strided. These are the locations of C1, ..., Cn-1 blocks.
+    # The first and last blocks are always included because they are the C0 (conv1) and Cn.
+    stage_indices = [0] + [i for i, b in enumerate(backbone) if getattr(b, "_is_cn", False)] + [len(backbone) - 1]
+    out_pos = stage_indices[-1]  # use C5 which has output_stride = 16
+    out_inplanes = backbone[out_pos].out_channels
+    aux_pos = stage_indices[-4]  # use C2 here which has output_stride = 8
+    aux_inplanes = backbone[aux_pos].out_channels
+    return_layers = {str(out_pos): "out"}
+    if aux:
+        return_layers[str(aux_pos)] = "aux"
+    backbone = IntermediateLayerGetter(backbone, return_layers=return_layers)
+
+    aux_classifier = FCNHead(aux_inplanes, num_classes) if aux else None
+    classifier = DeepLabHead(out_inplanes, num_classes)
+    return DeepLabV3(backbone, classifier, aux_classifier)
+
+
+@register_model()
+@handle_legacy_interface(
+    weights=("pretrained", DeepLabV3_ResNet50_Weights.COCO_WITH_VOC_LABELS_V1),
+    weights_backbone=("pretrained_backbone", ResNet50_Weights.IMAGENET1K_V1),
+)
+def deeplabv3_resnet50(
+    *,
+    weights: Optional[DeepLabV3_ResNet50_Weights] = None,
+    progress: bool = True,
+    num_classes: Optional[int] = None,
+    aux_loss: Optional[bool] = None,
+    weights_backbone: Optional[ResNet50_Weights] = ResNet50_Weights.IMAGENET1K_V1,
+    **kwargs: Any,
+) -> DeepLabV3:
+    """Constructs a DeepLabV3 model with a ResNet-50 backbone.
+
+    .. betastatus:: segmentation module
+
+    Reference: `Rethinking Atrous Convolution for Semantic Image Segmentation <https://arxiv.org/abs/1706.05587>`__.
+
+    Args:
+        weights (:class:`~torchvision.models.segmentation.DeepLabV3_ResNet50_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.segmentation.DeepLabV3_ResNet50_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        num_classes (int, optional): number of output classes of the model (including the background)
+        aux_loss (bool, optional): If True, it uses an auxiliary loss
+        weights_backbone (:class:`~torchvision.models.ResNet50_Weights`, optional): The pretrained weights for the
+            backbone
+        **kwargs: unused
+
+    .. autoclass:: torchvision.models.segmentation.DeepLabV3_ResNet50_Weights
+        :members:
+    """
+    weights = DeepLabV3_ResNet50_Weights.verify(weights)
+    weights_backbone = ResNet50_Weights.verify(weights_backbone)
+
+    if weights is not None:
+        weights_backbone = None
+        num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
+        aux_loss = _ovewrite_value_param("aux_loss", aux_loss, True)
+    elif num_classes is None:
+        num_classes = 21
+
+    backbone = resnet50(weights=weights_backbone, replace_stride_with_dilation=[False, True, True])
+    model = _deeplabv3_resnet(backbone, num_classes, aux_loss)
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+
+    return model
+
+
+@register_model()
+@handle_legacy_interface(
+    weights=("pretrained", DeepLabV3_ResNet101_Weights.COCO_WITH_VOC_LABELS_V1),
+    weights_backbone=("pretrained_backbone", ResNet101_Weights.IMAGENET1K_V1),
+)
+def deeplabv3_resnet101(
+    *,
+    weights: Optional[DeepLabV3_ResNet101_Weights] = None,
+    progress: bool = True,
+    num_classes: Optional[int] = None,
+    aux_loss: Optional[bool] = None,
+    weights_backbone: Optional[ResNet101_Weights] = ResNet101_Weights.IMAGENET1K_V1,
+    **kwargs: Any,
+) -> DeepLabV3:
+    """Constructs a DeepLabV3 model with a ResNet-101 backbone.
+
+    .. betastatus:: segmentation module
+
+    Reference: `Rethinking Atrous Convolution for Semantic Image Segmentation <https://arxiv.org/abs/1706.05587>`__.
+
+    Args:
+        weights (:class:`~torchvision.models.segmentation.DeepLabV3_ResNet101_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.segmentation.DeepLabV3_ResNet101_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        num_classes (int, optional): number of output classes of the model (including the background)
+        aux_loss (bool, optional): If True, it uses an auxiliary loss
+        weights_backbone (:class:`~torchvision.models.ResNet101_Weights`, optional): The pretrained weights for the
+            backbone
+        **kwargs: unused
+
+    .. autoclass:: torchvision.models.segmentation.DeepLabV3_ResNet101_Weights
+        :members:
+    """
+    weights = DeepLabV3_ResNet101_Weights.verify(weights)
+    weights_backbone = ResNet101_Weights.verify(weights_backbone)
+
+    if weights is not None:
+        weights_backbone = None
+        num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
+        aux_loss = _ovewrite_value_param("aux_loss", aux_loss, True)
+    elif num_classes is None:
+        num_classes = 21
+
+    backbone = resnet101(weights=weights_backbone, replace_stride_with_dilation=[False, True, True])
+    model = _deeplabv3_resnet(backbone, num_classes, aux_loss)
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+
+    return model
+
+
+@register_model()
+@handle_legacy_interface(
+    weights=("pretrained", DeepLabV3_MobileNet_V3_Large_Weights.COCO_WITH_VOC_LABELS_V1),
+    weights_backbone=("pretrained_backbone", MobileNet_V3_Large_Weights.IMAGENET1K_V1),
+)
+def deeplabv3_mobilenet_v3_large(
+    *,
+    weights: Optional[DeepLabV3_MobileNet_V3_Large_Weights] = None,
+    progress: bool = True,
+    num_classes: Optional[int] = None,
+    aux_loss: Optional[bool] = None,
+    weights_backbone: Optional[MobileNet_V3_Large_Weights] = MobileNet_V3_Large_Weights.IMAGENET1K_V1,
+    **kwargs: Any,
+) -> DeepLabV3:
+    """Constructs a DeepLabV3 model with a MobileNetV3-Large backbone.
+
+    Reference: `Rethinking Atrous Convolution for Semantic Image Segmentation <https://arxiv.org/abs/1706.05587>`__.
+
+    Args:
+        weights (:class:`~torchvision.models.segmentation.DeepLabV3_MobileNet_V3_Large_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.segmentation.DeepLabV3_MobileNet_V3_Large_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        num_classes (int, optional): number of output classes of the model (including the background)
+        aux_loss (bool, optional): If True, it uses an auxiliary loss
+        weights_backbone (:class:`~torchvision.models.MobileNet_V3_Large_Weights`, optional): The pretrained weights
+            for the backbone
+        **kwargs: unused
+
+    .. autoclass:: torchvision.models.segmentation.DeepLabV3_MobileNet_V3_Large_Weights
+        :members:
+    """
+    weights = DeepLabV3_MobileNet_V3_Large_Weights.verify(weights)
+    weights_backbone = MobileNet_V3_Large_Weights.verify(weights_backbone)
+
+    if weights is not None:
+        weights_backbone = None
+        num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
+        aux_loss = _ovewrite_value_param("aux_loss", aux_loss, True)
+    elif num_classes is None:
+        num_classes = 21
+
+    backbone = mobilenet_v3_large(weights=weights_backbone, dilated=True)
+    model = _deeplabv3_mobilenetv3(backbone, num_classes, aux_loss)
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+
+    return model
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/fcn.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/fcn.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb2e242adac0e7430bab6155ae0347770e29fee9
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/fcn.py
@@ -0,0 +1,232 @@
+from functools import partial
+from typing import Any, Optional
+
+from torch import nn
+
+from ...transforms._presets import SemanticSegmentation
+from .._api import register_model, Weights, WeightsEnum
+from .._meta import _VOC_CATEGORIES
+from .._utils import _ovewrite_value_param, handle_legacy_interface, IntermediateLayerGetter
+from ..resnet import ResNet, resnet101, ResNet101_Weights, resnet50, ResNet50_Weights
+from ._utils import _SimpleSegmentationModel
+
+
+__all__ = ["FCN", "FCN_ResNet50_Weights", "FCN_ResNet101_Weights", "fcn_resnet50", "fcn_resnet101"]
+
+
+class FCN(_SimpleSegmentationModel):
+    """
+    Implements FCN model from
+    `"Fully Convolutional Networks for Semantic Segmentation"
+    <https://arxiv.org/abs/1411.4038>`_.
+
+    Args:
+        backbone (nn.Module): the network used to compute the features for the model.
+            The backbone should return an OrderedDict[Tensor], with the key being
+            "out" for the last feature map used, and "aux" if an auxiliary classifier
+            is used.
+        classifier (nn.Module): module that takes the "out" element returned from
+            the backbone and returns a dense prediction.
+        aux_classifier (nn.Module, optional): auxiliary classifier used during training
+    """
+
+    pass
+
+
+class FCNHead(nn.Sequential):
+    def __init__(self, in_channels: int, channels: int) -> None:
+        inter_channels = in_channels // 4
+        layers = [
+            nn.Conv2d(in_channels, inter_channels, 3, padding=1, bias=False),
+            nn.BatchNorm2d(inter_channels),
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            nn.Conv2d(inter_channels, channels, 1),
+        ]
+
+        super().__init__(*layers)
+
+
+_COMMON_META = {
+    "categories": _VOC_CATEGORIES,
+    "min_size": (1, 1),
+    "_docs": """
+        These weights were trained on a subset of COCO, using only the 20 categories that are present in the Pascal VOC
+        dataset.
+    """,
+}
+
+
+class FCN_ResNet50_Weights(WeightsEnum):
+    COCO_WITH_VOC_LABELS_V1 = Weights(
+        url="https://download.pytorch.org/models/fcn_resnet50_coco-1167a1af.pth",
+        transforms=partial(SemanticSegmentation, resize_size=520),
+        meta={
+            **_COMMON_META,
+            "num_params": 35322218,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/segmentation#fcn_resnet50",
+            "_metrics": {
+                "COCO-val2017-VOC-labels": {
+                    "miou": 60.5,
+                    "pixel_acc": 91.4,
+                }
+            },
+            "_ops": 152.717,
+            "_file_size": 135.009,
+        },
+    )
+    DEFAULT = COCO_WITH_VOC_LABELS_V1
+
+
+class FCN_ResNet101_Weights(WeightsEnum):
+    COCO_WITH_VOC_LABELS_V1 = Weights(
+        url="https://download.pytorch.org/models/fcn_resnet101_coco-7ecb50ca.pth",
+        transforms=partial(SemanticSegmentation, resize_size=520),
+        meta={
+            **_COMMON_META,
+            "num_params": 54314346,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/segmentation#deeplabv3_resnet101",
+            "_metrics": {
+                "COCO-val2017-VOC-labels": {
+                    "miou": 63.7,
+                    "pixel_acc": 91.9,
+                }
+            },
+            "_ops": 232.738,
+            "_file_size": 207.711,
+        },
+    )
+    DEFAULT = COCO_WITH_VOC_LABELS_V1
+
+
+def _fcn_resnet(
+    backbone: ResNet,
+    num_classes: int,
+    aux: Optional[bool],
+) -> FCN:
+    return_layers = {"layer4": "out"}
+    if aux:
+        return_layers["layer3"] = "aux"
+    backbone = IntermediateLayerGetter(backbone, return_layers=return_layers)
+
+    aux_classifier = FCNHead(1024, num_classes) if aux else None
+    classifier = FCNHead(2048, num_classes)
+    return FCN(backbone, classifier, aux_classifier)
+
+
+@register_model()
+@handle_legacy_interface(
+    weights=("pretrained", FCN_ResNet50_Weights.COCO_WITH_VOC_LABELS_V1),
+    weights_backbone=("pretrained_backbone", ResNet50_Weights.IMAGENET1K_V1),
+)
+def fcn_resnet50(
+    *,
+    weights: Optional[FCN_ResNet50_Weights] = None,
+    progress: bool = True,
+    num_classes: Optional[int] = None,
+    aux_loss: Optional[bool] = None,
+    weights_backbone: Optional[ResNet50_Weights] = ResNet50_Weights.IMAGENET1K_V1,
+    **kwargs: Any,
+) -> FCN:
+    """Fully-Convolutional Network model with a ResNet-50 backbone from the `Fully Convolutional
+    Networks for Semantic Segmentation <https://arxiv.org/abs/1411.4038>`_ paper.
+
+    .. betastatus:: segmentation module
+
+    Args:
+        weights (:class:`~torchvision.models.segmentation.FCN_ResNet50_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.segmentation.FCN_ResNet50_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        num_classes (int, optional): number of output classes of the model (including the background).
+        aux_loss (bool, optional): If True, it uses an auxiliary loss.
+        weights_backbone (:class:`~torchvision.models.ResNet50_Weights`, optional): The pretrained
+            weights for the backbone.
+        **kwargs: parameters passed to the ``torchvision.models.segmentation.fcn.FCN``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/segmentation/fcn.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.segmentation.FCN_ResNet50_Weights
+        :members:
+    """
+
+    weights = FCN_ResNet50_Weights.verify(weights)
+    weights_backbone = ResNet50_Weights.verify(weights_backbone)
+
+    if weights is not None:
+        weights_backbone = None
+        num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
+        aux_loss = _ovewrite_value_param("aux_loss", aux_loss, True)
+    elif num_classes is None:
+        num_classes = 21
+
+    backbone = resnet50(weights=weights_backbone, replace_stride_with_dilation=[False, True, True])
+    model = _fcn_resnet(backbone, num_classes, aux_loss)
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+
+    return model
+
+
+@register_model()
+@handle_legacy_interface(
+    weights=("pretrained", FCN_ResNet101_Weights.COCO_WITH_VOC_LABELS_V1),
+    weights_backbone=("pretrained_backbone", ResNet101_Weights.IMAGENET1K_V1),
+)
+def fcn_resnet101(
+    *,
+    weights: Optional[FCN_ResNet101_Weights] = None,
+    progress: bool = True,
+    num_classes: Optional[int] = None,
+    aux_loss: Optional[bool] = None,
+    weights_backbone: Optional[ResNet101_Weights] = ResNet101_Weights.IMAGENET1K_V1,
+    **kwargs: Any,
+) -> FCN:
+    """Fully-Convolutional Network model with a ResNet-101 backbone from the `Fully Convolutional
+    Networks for Semantic Segmentation <https://arxiv.org/abs/1411.4038>`_ paper.
+
+    .. betastatus:: segmentation module
+
+    Args:
+        weights (:class:`~torchvision.models.segmentation.FCN_ResNet101_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.segmentation.FCN_ResNet101_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        num_classes (int, optional): number of output classes of the model (including the background).
+        aux_loss (bool, optional): If True, it uses an auxiliary loss.
+        weights_backbone (:class:`~torchvision.models.ResNet101_Weights`, optional): The pretrained
+            weights for the backbone.
+        **kwargs: parameters passed to the ``torchvision.models.segmentation.fcn.FCN``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/segmentation/fcn.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.segmentation.FCN_ResNet101_Weights
+        :members:
+    """
+
+    weights = FCN_ResNet101_Weights.verify(weights)
+    weights_backbone = ResNet101_Weights.verify(weights_backbone)
+
+    if weights is not None:
+        weights_backbone = None
+        num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
+        aux_loss = _ovewrite_value_param("aux_loss", aux_loss, True)
+    elif num_classes is None:
+        num_classes = 21
+
+    backbone = resnet101(weights=weights_backbone, replace_stride_with_dilation=[False, True, True])
+    model = _fcn_resnet(backbone, num_classes, aux_loss)
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+
+    return model
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/lraspp.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/lraspp.py
new file mode 100644
index 0000000000000000000000000000000000000000..e49b06d5b9facef807acc8fc9516a53d56ef01c4
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/segmentation/lraspp.py
@@ -0,0 +1,178 @@
+from collections import OrderedDict
+from functools import partial
+from typing import Any, Optional
+
+from torch import nn, Tensor
+from torch.nn import functional as F
+
+from ...transforms._presets import SemanticSegmentation
+from ...utils import _log_api_usage_once
+from .._api import register_model, Weights, WeightsEnum
+from .._meta import _VOC_CATEGORIES
+from .._utils import _ovewrite_value_param, handle_legacy_interface, IntermediateLayerGetter
+from ..mobilenetv3 import mobilenet_v3_large, MobileNet_V3_Large_Weights, MobileNetV3
+
+
+__all__ = ["LRASPP", "LRASPP_MobileNet_V3_Large_Weights", "lraspp_mobilenet_v3_large"]
+
+
+class LRASPP(nn.Module):
+    """
+    Implements a Lite R-ASPP Network for semantic segmentation from
+    `"Searching for MobileNetV3"
+    <https://arxiv.org/abs/1905.02244>`_.
+
+    Args:
+        backbone (nn.Module): the network used to compute the features for the model.
+            The backbone should return an OrderedDict[Tensor], with the key being
+            "high" for the high level feature map and "low" for the low level feature map.
+        low_channels (int): the number of channels of the low level features.
+        high_channels (int): the number of channels of the high level features.
+        num_classes (int, optional): number of output classes of the model (including the background).
+        inter_channels (int, optional): the number of channels for intermediate computations.
+    """
+
+    def __init__(
+        self, backbone: nn.Module, low_channels: int, high_channels: int, num_classes: int, inter_channels: int = 128
+    ) -> None:
+        super().__init__()
+        _log_api_usage_once(self)
+        self.backbone = backbone
+        self.classifier = LRASPPHead(low_channels, high_channels, num_classes, inter_channels)
+
+    def forward(self, input: Tensor) -> dict[str, Tensor]:
+        features = self.backbone(input)
+        out = self.classifier(features)
+        out = F.interpolate(out, size=input.shape[-2:], mode="bilinear", align_corners=False)
+
+        result = OrderedDict()
+        result["out"] = out
+
+        return result
+
+
+class LRASPPHead(nn.Module):
+    def __init__(self, low_channels: int, high_channels: int, num_classes: int, inter_channels: int) -> None:
+        super().__init__()
+        self.cbr = nn.Sequential(
+            nn.Conv2d(high_channels, inter_channels, 1, bias=False),
+            nn.BatchNorm2d(inter_channels),
+            nn.ReLU(inplace=True),
+        )
+        self.scale = nn.Sequential(
+            nn.AdaptiveAvgPool2d(1),
+            nn.Conv2d(high_channels, inter_channels, 1, bias=False),
+            nn.Sigmoid(),
+        )
+        self.low_classifier = nn.Conv2d(low_channels, num_classes, 1)
+        self.high_classifier = nn.Conv2d(inter_channels, num_classes, 1)
+
+    def forward(self, input: dict[str, Tensor]) -> Tensor:
+        low = input["low"]
+        high = input["high"]
+
+        x = self.cbr(high)
+        s = self.scale(high)
+        x = x * s
+        x = F.interpolate(x, size=low.shape[-2:], mode="bilinear", align_corners=False)
+
+        return self.low_classifier(low) + self.high_classifier(x)
+
+
+def _lraspp_mobilenetv3(backbone: MobileNetV3, num_classes: int) -> LRASPP:
+    backbone = backbone.features
+    # Gather the indices of blocks which are strided. These are the locations of C1, ..., Cn-1 blocks.
+    # The first and last blocks are always included because they are the C0 (conv1) and Cn.
+    stage_indices = [0] + [i for i, b in enumerate(backbone) if getattr(b, "_is_cn", False)] + [len(backbone) - 1]
+    low_pos = stage_indices[-4]  # use C2 here which has output_stride = 8
+    high_pos = stage_indices[-1]  # use C5 which has output_stride = 16
+    low_channels = backbone[low_pos].out_channels
+    high_channels = backbone[high_pos].out_channels
+    backbone = IntermediateLayerGetter(backbone, return_layers={str(low_pos): "low", str(high_pos): "high"})
+
+    return LRASPP(backbone, low_channels, high_channels, num_classes)
+
+
+class LRASPP_MobileNet_V3_Large_Weights(WeightsEnum):
+    COCO_WITH_VOC_LABELS_V1 = Weights(
+        url="https://download.pytorch.org/models/lraspp_mobilenet_v3_large-d234d4ea.pth",
+        transforms=partial(SemanticSegmentation, resize_size=520),
+        meta={
+            "num_params": 3221538,
+            "categories": _VOC_CATEGORIES,
+            "min_size": (1, 1),
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/segmentation#lraspp_mobilenet_v3_large",
+            "_metrics": {
+                "COCO-val2017-VOC-labels": {
+                    "miou": 57.9,
+                    "pixel_acc": 91.2,
+                }
+            },
+            "_ops": 2.086,
+            "_file_size": 12.49,
+            "_docs": """
+                These weights were trained on a subset of COCO, using only the 20 categories that are present in the
+                Pascal VOC dataset.
+            """,
+        },
+    )
+    DEFAULT = COCO_WITH_VOC_LABELS_V1
+
+
+@register_model()
+@handle_legacy_interface(
+    weights=("pretrained", LRASPP_MobileNet_V3_Large_Weights.COCO_WITH_VOC_LABELS_V1),
+    weights_backbone=("pretrained_backbone", MobileNet_V3_Large_Weights.IMAGENET1K_V1),
+)
+def lraspp_mobilenet_v3_large(
+    *,
+    weights: Optional[LRASPP_MobileNet_V3_Large_Weights] = None,
+    progress: bool = True,
+    num_classes: Optional[int] = None,
+    weights_backbone: Optional[MobileNet_V3_Large_Weights] = MobileNet_V3_Large_Weights.IMAGENET1K_V1,
+    **kwargs: Any,
+) -> LRASPP:
+    """Constructs a Lite R-ASPP Network model with a MobileNetV3-Large backbone from
+    `Searching for MobileNetV3 <https://arxiv.org/abs/1905.02244>`_ paper.
+
+    .. betastatus:: segmentation module
+
+    Args:
+        weights (:class:`~torchvision.models.segmentation.LRASPP_MobileNet_V3_Large_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.segmentation.LRASPP_MobileNet_V3_Large_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        num_classes (int, optional): number of output classes of the model (including the background).
+        aux_loss (bool, optional): If True, it uses an auxiliary loss.
+        weights_backbone (:class:`~torchvision.models.MobileNet_V3_Large_Weights`, optional): The pretrained
+            weights for the backbone.
+        **kwargs: parameters passed to the ``torchvision.models.segmentation.LRASPP``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/segmentation/lraspp.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.segmentation.LRASPP_MobileNet_V3_Large_Weights
+        :members:
+    """
+    if kwargs.pop("aux_loss", False):
+        raise NotImplementedError("This model does not use auxiliary loss")
+
+    weights = LRASPP_MobileNet_V3_Large_Weights.verify(weights)
+    weights_backbone = MobileNet_V3_Large_Weights.verify(weights_backbone)
+
+    if weights is not None:
+        weights_backbone = None
+        num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
+    elif num_classes is None:
+        num_classes = 21
+
+    backbone = mobilenet_v3_large(weights=weights_backbone, dilated=True)
+    model = _lraspp_mobilenetv3(backbone, num_classes)
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+
+    return model
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/shufflenetv2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/shufflenetv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..96736f6a7ac289102ef7a57cb4cbb960c02c625e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/shufflenetv2.py
@@ -0,0 +1,408 @@
+from functools import partial
+from typing import Any, Callable, Optional
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from ..transforms._presets import ImageClassification
+from ..utils import _log_api_usage_once
+from ._api import register_model, Weights, WeightsEnum
+from ._meta import _IMAGENET_CATEGORIES
+from ._utils import _ovewrite_named_param, handle_legacy_interface
+
+
+__all__ = [  # names this module exports: the model class, its weight enums and the builder functions
+    "ShuffleNetV2",
+    "ShuffleNet_V2_X0_5_Weights",
+    "ShuffleNet_V2_X1_0_Weights",
+    "ShuffleNet_V2_X1_5_Weights",
+    "ShuffleNet_V2_X2_0_Weights",
+    "shufflenet_v2_x0_5",
+    "shufflenet_v2_x1_0",
+    "shufflenet_v2_x1_5",
+    "shufflenet_v2_x2_0",
+]
+
+
+def channel_shuffle(x: Tensor, groups: int) -> Tensor:
+    """Interleave the channels of a 4-D tensor across ``groups`` groups.
+
+    The channel axis is viewed as ``(groups, channels // groups)``, the two
+    sub-axes are swapped, and the result is flattened back to the input shape.
+    """
+    n, c, h, w = x.size()
+    per_group = c // groups
+    # (n, groups, per_group, h, w) -> (n, per_group, groups, h, w)
+    grouped = x.view(n, groups, per_group, h, w)
+    swapped = torch.transpose(grouped, 1, 2).contiguous()
+    # collapse the two channel sub-axes back into a single channel axis
+    return swapped.view(n, c, h, w)
+
+
+class InvertedResidual(nn.Module):
+    """ShuffleNetV2 unit: two branches joined on the channel axis, then channel-shuffled."""
+
+    def __init__(self, inp: int, oup: int, stride: int) -> None:
+        super().__init__()
+        if not 1 <= stride <= 3:
+            raise ValueError("illegal stride value")
+        self.stride = stride
+
+        half = oup // 2  # each branch emits half of the output channels
+        if stride == 1 and inp != half << 1:
+            raise ValueError(
+                f"Invalid combination of stride {stride}, inp {inp} and oup {oup} values. If stride == 1 then inp should be equal to oup // 2 << 1."
+            )
+
+        # branch1 holds layers only for strided units; with stride 1 it is an empty Sequential
+        if stride > 1:
+            shortcut: list[nn.Module] = [
+                self.depthwise_conv(inp, inp, kernel_size=3, stride=stride, padding=1),
+                nn.BatchNorm2d(inp),
+                nn.Conv2d(inp, half, kernel_size=1, stride=1, padding=0, bias=False),
+                nn.BatchNorm2d(half),
+                nn.ReLU(inplace=True),
+            ]
+        else:
+            shortcut = []
+        self.branch1 = nn.Sequential(*shortcut)
+
+        # strided units feed the whole input to branch2, stride-1 units only one half of it
+        branch2_in = inp if stride > 1 else half
+        self.branch2 = nn.Sequential(
+            nn.Conv2d(branch2_in, half, kernel_size=1, stride=1, padding=0, bias=False),
+            nn.BatchNorm2d(half),
+            nn.ReLU(inplace=True),
+            self.depthwise_conv(half, half, kernel_size=3, stride=stride, padding=1),
+            nn.BatchNorm2d(half),
+            nn.Conv2d(half, half, kernel_size=1, stride=1, padding=0, bias=False),
+            nn.BatchNorm2d(half),
+            nn.ReLU(inplace=True),
+        )
+
+    @staticmethod
+    def depthwise_conv(
+        i: int, o: int, kernel_size: int, stride: int = 1, padding: int = 0, bias: bool = False
+    ) -> nn.Conv2d:
+        """Return a ``Conv2d`` with ``groups=i``, i.e. one group per input channel."""
+        return nn.Conv2d(i, o, kernel_size, stride, padding, bias=bias, groups=i)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Run the branches, concatenate them along dim 1 and shuffle the channels."""
+        if self.stride == 1:
+            # first half of the channels bypasses the convolutions entirely
+            passthrough, active = x.chunk(2, dim=1)
+            merged = torch.cat((passthrough, self.branch2(active)), dim=1)
+        else:
+            merged = torch.cat((self.branch1(x), self.branch2(x)), dim=1)
+
+        # interleave the channels coming from the two branches
+        return channel_shuffle(merged, 2)
+
+
+class ShuffleNetV2(nn.Module):
+    """ShuffleNetV2 image classifier.
+
+    Args:
+        stages_repeats: number of blocks in each of the three stages.
+        stages_out_channels: five widths - stem conv, the three stages, final 1x1 conv.
+        num_classes: output size of the final linear layer.
+        inverted_residual: block factory, called as ``(in_channels, out_channels, stride)``.
+    """
+
+    def __init__(
+        self,
+        stages_repeats: list[int],
+        stages_out_channels: list[int],
+        num_classes: int = 1000,
+        inverted_residual: Callable[..., nn.Module] = InvertedResidual,
+    ) -> None:
+        super().__init__()
+        _log_api_usage_once(self)
+
+        if len(stages_repeats) != 3:
+            raise ValueError("expected stages_repeats as list of 3 positive ints")
+        if len(stages_out_channels) != 5:
+            raise ValueError("expected stages_out_channels as list of 5 positive ints")
+        self._stage_out_channels = stages_out_channels
+
+        stem_width = self._stage_out_channels[0]
+        self.conv1 = nn.Sequential(
+            nn.Conv2d(3, stem_width, 3, 2, 1, bias=False),
+            nn.BatchNorm2d(stem_width),
+            nn.ReLU(inplace=True),
+        )
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+        # Static annotations for mypy
+        self.stage2: nn.Sequential
+        self.stage3: nn.Sequential
+        self.stage4: nn.Sequential
+        prev_width = stem_width
+        for stage_no, (repeats, width) in enumerate(zip(stages_repeats, self._stage_out_channels[1:]), start=2):
+            # a stride-2 block opens the stage, then repeats - 1 stride-1 blocks follow
+            blocks = [inverted_residual(prev_width, width, 2)]
+            blocks.extend(inverted_residual(width, width, 1) for _ in range(repeats - 1))
+            setattr(self, f"stage{stage_no}", nn.Sequential(*blocks))
+            prev_width = width
+
+        head_width = self._stage_out_channels[-1]
+        self.conv5 = nn.Sequential(
+            nn.Conv2d(prev_width, head_width, 1, 1, 0, bias=False),
+            nn.BatchNorm2d(head_width),
+            nn.ReLU(inplace=True),
+        )
+        self.fc = nn.Linear(head_width, num_classes)
+
+    def _forward_impl(self, x: Tensor) -> Tensor:
+        # See note [TorchScript super()]
+        stem = self.maxpool(self.conv1(x))
+        feats = self.conv5(self.stage4(self.stage3(self.stage2(stem))))
+        pooled = feats.mean([2, 3])  # globalpool
+        return self.fc(pooled)
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self._forward_impl(x)
+
+
+def _shufflenetv2(
+    weights: Optional[WeightsEnum],
+    progress: bool,
+    *args: Any,
+    **kwargs: Any,
+) -> ShuffleNetV2:
+    """Build a ShuffleNetV2 from ``args``/``kwargs``, optionally loading ``weights`` into it."""
+    if weights is None:
+        return ShuffleNetV2(*args, **kwargs)
+
+    # hand the category count stored with the weights to _ovewrite_named_param as num_classes
+    _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+    net = ShuffleNetV2(*args, **kwargs)
+    net.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+    return net
+
+
+_COMMON_META = {  # entries spread into the ``meta`` dict of every ShuffleNetV2 weights enum in this module
+    "min_size": (1, 1),
+    "categories": _IMAGENET_CATEGORIES,
+    "recipe": "https://github.com/ericsun99/Shufflenet-v2-Pytorch",  # default; the x1_5/x2_0 enums override it
+}
+
+
+class ShuffleNet_V2_X0_5_Weights(WeightsEnum):  # pretrained checkpoints accepted by shufflenet_v2_x0_5
+    IMAGENET1K_V1 = Weights(
+        # Weights ported from https://github.com/ericsun99/Shufflenet-v2-Pytorch
+        url="https://download.pytorch.org/models/shufflenetv2_x0.5-f707e7126e.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,  # shared min_size / categories / recipe entries
+            "num_params": 1366792,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 60.552,
+                    "acc@5": 81.746,
+                }
+            },
+            "_ops": 0.04,
+            "_file_size": 5.282,
+            "_docs": """These weights were trained from scratch to reproduce closely the results of the paper.""",
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # alias of the only checkpoint defined in this enum
+
+
+class ShuffleNet_V2_X1_0_Weights(WeightsEnum):  # pretrained checkpoints accepted by shufflenet_v2_x1_0
+    IMAGENET1K_V1 = Weights(
+        # Weights ported from https://github.com/ericsun99/Shufflenet-v2-Pytorch
+        url="https://download.pytorch.org/models/shufflenetv2_x1-5666bf0f80.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,  # shared min_size / categories / recipe entries
+            "num_params": 2278604,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 69.362,
+                    "acc@5": 88.316,
+                }
+            },
+            "_ops": 0.145,
+            "_file_size": 8.791,
+            "_docs": """These weights were trained from scratch to reproduce closely the results of the paper.""",
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # alias of the only checkpoint defined in this enum
+
+
+class ShuffleNet_V2_X1_5_Weights(WeightsEnum):  # pretrained checkpoints accepted by shufflenet_v2_x1_5
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/shufflenetv2_x1_5-3c479a10.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,  # shared min_size / categories entries (recipe is replaced just below)
+            "recipe": "https://github.com/pytorch/vision/pull/5906",  # later key wins over _COMMON_META["recipe"]
+            "num_params": 3503624,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 72.996,
+                    "acc@5": 91.086,
+                }
+            },
+            "_ops": 0.296,
+            "_file_size": 13.557,
+            "_docs": """
+                These weights were trained from scratch by using TorchVision's `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # alias of the only checkpoint defined in this enum
+
+
+class ShuffleNet_V2_X2_0_Weights(WeightsEnum):  # pretrained checkpoints accepted by shufflenet_v2_x2_0
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/shufflenetv2_x2_0-8be3c8ee.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+        meta={
+            **_COMMON_META,  # shared min_size / categories entries (recipe is replaced just below)
+            "recipe": "https://github.com/pytorch/vision/pull/5906",  # later key wins over _COMMON_META["recipe"]
+            "num_params": 7393996,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 76.230,
+                    "acc@5": 93.006,
+                }
+            },
+            "_ops": 0.583,
+            "_file_size": 28.433,
+            "_docs": """
+                These weights were trained from scratch by using TorchVision's `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # alias of the only checkpoint defined in this enum
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", ShuffleNet_V2_X0_5_Weights.IMAGENET1K_V1))
+def shufflenet_v2_x0_5(
+    *, weights: Optional[ShuffleNet_V2_X0_5_Weights] = None, progress: bool = True, **kwargs: Any
+) -> ShuffleNetV2:
+    """
+    Constructs a ShuffleNetV2 architecture with 0.5x output channels, as described in
+    `ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design
+    <https://arxiv.org/abs/1807.11164>`__.
+
+    Args:
+        weights (:class:`~torchvision.models.ShuffleNet_V2_X0_5_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.ShuffleNet_V2_X0_5_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.shufflenetv2.ShuffleNetV2``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/shufflenetv2.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.ShuffleNet_V2_X0_5_Weights
+        :members:
+    """
+    weights = ShuffleNet_V2_X0_5_Weights.verify(weights)
+    # The two lists are ShuffleNetV2's stages_repeats and stages_out_channels for the 0.5x variant.
+    return _shufflenetv2(weights, progress, [4, 8, 4], [24, 48, 96, 192, 1024], **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", ShuffleNet_V2_X1_0_Weights.IMAGENET1K_V1))
+def shufflenet_v2_x1_0(
+    *, weights: Optional[ShuffleNet_V2_X1_0_Weights] = None, progress: bool = True, **kwargs: Any
+) -> ShuffleNetV2:
+    """
+    Constructs a ShuffleNetV2 architecture with 1.0x output channels, as described in
+    `ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design
+    <https://arxiv.org/abs/1807.11164>`__.
+
+    Args:
+        weights (:class:`~torchvision.models.ShuffleNet_V2_X1_0_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.ShuffleNet_V2_X1_0_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.shufflenetv2.ShuffleNetV2``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/shufflenetv2.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.ShuffleNet_V2_X1_0_Weights
+        :members:
+    """
+    weights = ShuffleNet_V2_X1_0_Weights.verify(weights)
+    # The two lists are ShuffleNetV2's stages_repeats and stages_out_channels for the 1.0x variant.
+    return _shufflenetv2(weights, progress, [4, 8, 4], [24, 116, 232, 464, 1024], **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", ShuffleNet_V2_X1_5_Weights.IMAGENET1K_V1))
+def shufflenet_v2_x1_5(
+    *, weights: Optional[ShuffleNet_V2_X1_5_Weights] = None, progress: bool = True, **kwargs: Any
+) -> ShuffleNetV2:
+    """
+    Constructs a ShuffleNetV2 architecture with 1.5x output channels, as described in
+    `ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design
+    <https://arxiv.org/abs/1807.11164>`__.
+
+    Args:
+        weights (:class:`~torchvision.models.ShuffleNet_V2_X1_5_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.ShuffleNet_V2_X1_5_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.shufflenetv2.ShuffleNetV2``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/shufflenetv2.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.ShuffleNet_V2_X1_5_Weights
+        :members:
+    """
+    weights = ShuffleNet_V2_X1_5_Weights.verify(weights)
+    # The two lists are ShuffleNetV2's stages_repeats and stages_out_channels for the 1.5x variant.
+    return _shufflenetv2(weights, progress, [4, 8, 4], [24, 176, 352, 704, 1024], **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", ShuffleNet_V2_X2_0_Weights.IMAGENET1K_V1))
+def shufflenet_v2_x2_0(
+    *, weights: Optional[ShuffleNet_V2_X2_0_Weights] = None, progress: bool = True, **kwargs: Any
+) -> ShuffleNetV2:
+    """
+    Constructs a ShuffleNetV2 architecture with 2.0x output channels, as described in
+    `ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design
+    <https://arxiv.org/abs/1807.11164>`__.
+
+    Args:
+        weights (:class:`~torchvision.models.ShuffleNet_V2_X2_0_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.ShuffleNet_V2_X2_0_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.shufflenetv2.ShuffleNetV2``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/shufflenetv2.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.ShuffleNet_V2_X2_0_Weights
+        :members:
+    """
+    weights = ShuffleNet_V2_X2_0_Weights.verify(weights)
+    # The two lists are ShuffleNetV2's stages_repeats and stages_out_channels for the 2.0x variant.
+    return _shufflenetv2(weights, progress, [4, 8, 4], [24, 244, 488, 976, 2048], **kwargs)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/squeezenet.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/squeezenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..982b32107b09c280b4c7caa61e6b80be0cbf041e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/squeezenet.py
@@ -0,0 +1,223 @@
+from functools import partial
+from typing import Any, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.init as init
+
+from ..transforms._presets import ImageClassification
+from ..utils import _log_api_usage_once
+from ._api import register_model, Weights, WeightsEnum
+from ._meta import _IMAGENET_CATEGORIES
+from ._utils import _ovewrite_named_param, handle_legacy_interface
+
+
+__all__ = ["SqueezeNet", "SqueezeNet1_0_Weights", "SqueezeNet1_1_Weights", "squeezenet1_0", "squeezenet1_1"]  # public API
+
+
+class Fire(nn.Module):
+    """Squeeze with a 1x1 conv, then expand through parallel 1x1 and 3x3 convs joined on dim 1."""
+    def __init__(self, inplanes: int, squeeze_planes: int, expand1x1_planes: int, expand3x3_planes: int) -> None:
+        super().__init__()
+        self.inplanes = inplanes
+        self.squeeze = nn.Conv2d(inplanes, squeeze_planes, kernel_size=1)
+        self.squeeze_activation = nn.ReLU(inplace=True)
+        self.expand1x1 = nn.Conv2d(squeeze_planes, expand1x1_planes, kernel_size=1)
+        self.expand1x1_activation = nn.ReLU(inplace=True)
+        self.expand3x3 = nn.Conv2d(squeeze_planes, expand3x3_planes, kernel_size=3, padding=1)
+        self.expand3x3_activation = nn.ReLU(inplace=True)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        sq = self.squeeze_activation(self.squeeze(x))
+        branches = [self.expand1x1_activation(self.expand1x1(sq)), self.expand3x3_activation(self.expand3x3(sq))]
+        return torch.cat(branches, 1)
+
+
+class SqueezeNet(nn.Module):  # ``features`` stack chosen by ``version`` followed by a 1x1-conv ``classifier`` head
+    def __init__(self, version: str = "1_0", num_classes: int = 1000, dropout: float = 0.5) -> None:
+        super().__init__()
+        _log_api_usage_once(self)
+        self.num_classes = num_classes  # also used as out_channels of the final 1x1 conv below
+        if version == "1_0":  # 7x7 stem with 96 channels; max-pools after the stem, the 3rd and the 7th Fire
+            self.features = nn.Sequential(
+                nn.Conv2d(3, 96, kernel_size=7, stride=2),
+                nn.ReLU(inplace=True),
+                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
+                Fire(96, 16, 64, 64),
+                Fire(128, 16, 64, 64),
+                Fire(128, 32, 128, 128),
+                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
+                Fire(256, 32, 128, 128),
+                Fire(256, 48, 192, 192),
+                Fire(384, 48, 192, 192),
+                Fire(384, 64, 256, 256),
+                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
+                Fire(512, 64, 256, 256),
+            )
+        elif version == "1_1":  # 3x3 stem with 64 channels; max-pools after the stem, the 2nd and the 4th Fire
+            self.features = nn.Sequential(
+                nn.Conv2d(3, 64, kernel_size=3, stride=2),
+                nn.ReLU(inplace=True),
+                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
+                Fire(64, 16, 64, 64),
+                Fire(128, 16, 64, 64),
+                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
+                Fire(128, 32, 128, 128),
+                Fire(256, 32, 128, 128),
+                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
+                Fire(256, 48, 192, 192),
+                Fire(384, 48, 192, 192),
+                Fire(384, 64, 256, 256),
+                Fire(512, 64, 256, 256),
+            )
+        else:
+            # FIXME: Is this needed? SqueezeNet should only be called from the
+            # FIXME: squeezenet1_x() functions
+            # FIXME: This checking is not done for the other models
+            raise ValueError(f"Unsupported SqueezeNet version {version}: 1_0 or 1_1 expected")
+
+        # Final convolution is initialized differently from the rest
+        final_conv = nn.Conv2d(512, self.num_classes, kernel_size=1)  # 512 = channels out of the last Fire module
+        self.classifier = nn.Sequential(
+            nn.Dropout(p=dropout), final_conv, nn.ReLU(inplace=True), nn.AdaptiveAvgPool2d((1, 1))
+        )
+
+        for m in self.modules():  # walks features and classifier; only Conv2d layers are re-initialised
+            if isinstance(m, nn.Conv2d):
+                if m is final_conv:
+                    init.normal_(m.weight, mean=0.0, std=0.01)
+                else:
+                    init.kaiming_uniform_(m.weight)
+                if m.bias is not None:
+                    init.constant_(m.bias, 0)  # every conv bias starts at zero
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.features(x)
+        x = self.classifier(x)  # ends in AdaptiveAvgPool2d((1, 1)), so the spatial dims are 1x1 here
+        return torch.flatten(x, 1)  # flatten everything after dim 0
+
+
+def _squeezenet(
+    version: str,
+    weights: Optional[WeightsEnum],
+    progress: bool,
+    **kwargs: Any,
+) -> SqueezeNet:
+    """Build a SqueezeNet of the given ``version``, optionally loading ``weights`` into it."""
+    if weights is None:
+        return SqueezeNet(version, **kwargs)
+
+    # hand the category count stored with the weights to _ovewrite_named_param as num_classes
+    _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+    net = SqueezeNet(version, **kwargs)
+    net.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+    return net
+
+
+_COMMON_META = {  # entries spread into the ``meta`` dict of both SqueezeNet weights enums in this module
+    "categories": _IMAGENET_CATEGORIES,
+    "recipe": "https://github.com/pytorch/vision/pull/49#issuecomment-277560717",
+    "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+}
+
+
+class SqueezeNet1_0_Weights(WeightsEnum):  # pretrained checkpoints accepted by squeezenet1_0
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/squeezenet1_0-b66bff10.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,  # shared categories / recipe / _docs entries
+            "min_size": (21, 21),
+            "num_params": 1248424,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 58.092,
+                    "acc@5": 80.420,
+                }
+            },
+            "_ops": 0.819,
+            "_file_size": 4.778,
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # alias of the only checkpoint defined in this enum
+
+
+class SqueezeNet1_1_Weights(WeightsEnum):  # pretrained checkpoints accepted by squeezenet1_1
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/squeezenet1_1-b8a52dc0.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,  # shared categories / recipe / _docs entries
+            "min_size": (17, 17),
+            "num_params": 1235496,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 58.178,
+                    "acc@5": 80.624,
+                }
+            },
+            "_ops": 0.349,
+            "_file_size": 4.729,
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # alias of the only checkpoint defined in this enum
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", SqueezeNet1_0_Weights.IMAGENET1K_V1))
+def squeezenet1_0(
+    *, weights: Optional[SqueezeNet1_0_Weights] = None, progress: bool = True, **kwargs: Any
+) -> SqueezeNet:
+    """SqueezeNet model architecture from the `SqueezeNet: AlexNet-level
+    accuracy with 50x fewer parameters and <0.5MB model size
+    <https://arxiv.org/abs/1602.07360>`_ paper.
+
+    Args:
+        weights (:class:`~torchvision.models.SqueezeNet1_0_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.SqueezeNet1_0_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.squeezenet.SqueezeNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/squeezenet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.SqueezeNet1_0_Weights
+        :members:
+    """
+    weights = SqueezeNet1_0_Weights.verify(weights)
+    return _squeezenet("1_0", weights, progress, **kwargs)  # "1_0" selects the 7x7-stem layout in SqueezeNet.__init__
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", SqueezeNet1_1_Weights.IMAGENET1K_V1))
+def squeezenet1_1(
+    *, weights: Optional[SqueezeNet1_1_Weights] = None, progress: bool = True, **kwargs: Any
+) -> SqueezeNet:
+    """SqueezeNet 1.1 model from the `official SqueezeNet repo
+    <https://github.com/DeepScale/SqueezeNet/tree/master/SqueezeNet_v1.1>`_.
+
+    SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters
+    than SqueezeNet 1.0, without sacrificing accuracy.
+
+    Args:
+        weights (:class:`~torchvision.models.SqueezeNet1_1_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.SqueezeNet1_1_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.squeezenet.SqueezeNet``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/squeezenet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.SqueezeNet1_1_Weights
+        :members:
+    """
+    weights = SqueezeNet1_1_Weights.verify(weights)
+    return _squeezenet("1_1", weights, progress, **kwargs)  # "1_1" selects the 3x3-stem layout in SqueezeNet.__init__
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/swin_transformer.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/swin_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..80850b4a389e6488a2d5ae76a4159b5ad26a6faa
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/swin_transformer.py
@@ -0,0 +1,1033 @@
+import math
+from functools import partial
+from typing import Any, Callable, Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn, Tensor
+
+from ..ops.misc import MLP, Permute
+from ..ops.stochastic_depth import StochasticDepth
+from ..transforms._presets import ImageClassification, InterpolationMode
+from ..utils import _log_api_usage_once
+from ._api import register_model, Weights, WeightsEnum
+from ._meta import _IMAGENET_CATEGORIES
+from ._utils import _ovewrite_named_param, handle_legacy_interface
+
+
+__all__ = [  # names this module exports: the model class, the V1/V2 weight enums and the builder functions
+    "SwinTransformer",
+    "Swin_T_Weights",
+    "Swin_S_Weights",
+    "Swin_B_Weights",
+    "Swin_V2_T_Weights",
+    "Swin_V2_S_Weights",
+    "Swin_V2_B_Weights",
+    "swin_t",
+    "swin_s",
+    "swin_b",
+    "swin_v2_t",
+    "swin_v2_s",
+    "swin_v2_b",
+]
+
+
+def _patch_merging_pad(x: torch.Tensor) -> torch.Tensor:
+    """Pad H and W of a ``[..., H, W, C]`` tensor to even sizes and stack every 2x2 patch on the channel axis."""
+    height, width, _ = x.shape[-3:]
+    padded = F.pad(x, (0, 0, 0, width % 2, 0, height % 2))
+    even_rows = padded[..., 0::2, :, :]
+    odd_rows = padded[..., 1::2, :, :]
+    # channel order: (even row, even col), (odd row, even col), (even row, odd col), (odd row, odd col)
+    quarters = [even_rows[..., 0::2, :], odd_rows[..., 0::2, :], even_rows[..., 1::2, :], odd_rows[..., 1::2, :]]
+    return torch.cat(quarters, -1)  # ... H/2 W/2 4*C
+
+
+torch.fx.wrap("_patch_merging_pad")
+
+
+def _get_relative_position_bias(
+    relative_position_bias_table: torch.Tensor, relative_position_index: torch.Tensor, window_size: list[int]
+) -> torch.Tensor:
+    """Gather table rows by index and return them laid out as ``[1, -1, N, N]`` with ``N = Wh * Ww``."""
+    tokens = window_size[0] * window_size[1]
+    gathered = relative_position_bias_table[relative_position_index]  # type: ignore[index]
+    per_pair = gathered.view(tokens, tokens, -1)
+    return per_pair.permute(2, 0, 1).contiguous().unsqueeze(0)
+
+
+torch.fx.wrap("_get_relative_position_bias")
+
+
+class PatchMerging(nn.Module):
+    """Patch Merging Layer: merges 2x2 patches, normalizes, then applies a bias-free linear reduction.
+    Args:
+        dim (int): Number of input channels.
+        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
+    """
+
+    def __init__(self, dim: int, norm_layer: Callable[..., nn.Module] = nn.LayerNorm):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.dim = dim
+        merged_dim = 4 * dim  # channel width after _patch_merging_pad stacks four patches
+        self.reduction = nn.Linear(merged_dim, 2 * dim, bias=False)
+        self.norm = norm_layer(merged_dim)
+
+    def forward(self, x: Tensor):
+        """
+        Args:
+            x (Tensor): input tensor with expected layout of [..., H, W, C]
+        Returns:
+            Tensor with layout of [..., H/2, W/2, 2*C]
+        """
+        merged = _patch_merging_pad(x)  # ... H/2 W/2 4*C
+        normed = self.norm(merged)
+        return self.reduction(normed)  # ... H/2 W/2 2*C
+
+
+class PatchMergingV2(nn.Module):
+    """Patch Merging Layer for Swin Transformer V2: merges 2x2 patches, reduces, then normalizes.
+    Args:
+        dim (int): Number of input channels.
+        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
+    """
+
+    def __init__(self, dim: int, norm_layer: Callable[..., nn.Module] = nn.LayerNorm):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.dim = dim
+        reduced_dim = 2 * dim
+        self.reduction = nn.Linear(4 * dim, reduced_dim, bias=False)
+        self.norm = norm_layer(reduced_dim)  # difference: the norm acts on the reduced width
+
+    def forward(self, x: Tensor):
+        """
+        Args:
+            x (Tensor): input tensor with expected layout of [..., H, W, C]
+        Returns:
+            Tensor with layout of [..., H/2, W/2, 2*C]
+        """
+        merged = _patch_merging_pad(x)  # ... H/2 W/2 4*C
+        reduced = self.reduction(merged)  # ... H/2 W/2 2*C
+        return self.norm(reduced)
+
+
+def shifted_window_attention(
+    input: Tensor,
+    qkv_weight: Tensor,
+    proj_weight: Tensor,
+    relative_position_bias: Tensor,
+    window_size: list[int],
+    num_heads: int,
+    shift_size: list[int],
+    attention_dropout: float = 0.0,
+    dropout: float = 0.0,
+    qkv_bias: Optional[Tensor] = None,
+    proj_bias: Optional[Tensor] = None,
+    logit_scale: Optional[torch.Tensor] = None,
+    training: bool = True,
+) -> Tensor:
+    """
+    Window based multi-head self attention (W-MSA) module with relative position bias.
+    It supports both of shifted and non-shifted window.
+    Args:
+        input (Tensor[N, H, W, C]): The input tensor or 4-dimensions.
+        qkv_weight (Tensor[in_dim, out_dim]): The weight tensor of query, key, value.
+        proj_weight (Tensor[out_dim, out_dim]): The weight tensor of projection.
+        relative_position_bias (Tensor): The learned relative position bias added to attention.
+        window_size (List[int]): Window size.
+        num_heads (int): Number of attention heads.
+        shift_size (List[int]): Shift size for shifted window attention.
+        attention_dropout (float): Dropout ratio of attention weight. Default: 0.0.
+        dropout (float): Dropout ratio of output. Default: 0.0.
+        qkv_bias (Tensor[out_dim], optional): The bias tensor of query, key, value. Default: None.
+        proj_bias (Tensor[out_dim], optional): The bias tensor of projection. Default: None.
+        logit_scale (Tensor[out_dim], optional): Logit scale of cosine attention for Swin Transformer V2. Default: None.
+        training (bool, optional): Training flag used by the dropout parameters. Default: True.
+    Returns:
+        Tensor[N, H, W, C]: The output tensor after shifted window attention.
+    """
+    B, H, W, C = input.shape
+    # pad feature maps to multiples of window size
+    pad_r = (window_size[1] - W % window_size[1]) % window_size[1]
+    pad_b = (window_size[0] - H % window_size[0]) % window_size[0]
+    x = F.pad(input, (0, 0, 0, pad_r, 0, pad_b))
+    _, pad_H, pad_W, _ = x.shape
+
+    shift_size = shift_size.copy()
+    # If window size is larger than feature size, there is no need to shift window
+    if window_size[0] >= pad_H:
+        shift_size[0] = 0
+    if window_size[1] >= pad_W:
+        shift_size[1] = 0
+
+    # cyclic shift
+    if sum(shift_size) > 0:
+        x = torch.roll(x, shifts=(-shift_size[0], -shift_size[1]), dims=(1, 2))
+
+    # partition windows
+    num_windows = (pad_H // window_size[0]) * (pad_W // window_size[1])
+    x = x.view(B, pad_H // window_size[0], window_size[0], pad_W // window_size[1], window_size[1], C)
+    x = x.permute(0, 1, 3, 2, 4, 5).reshape(B * num_windows, window_size[0] * window_size[1], C)  # B*nW, Ws*Ws, C
+
+    # multi-head attention
+    if logit_scale is not None and qkv_bias is not None:
+        qkv_bias = qkv_bias.clone()
+        length = qkv_bias.numel() // 3
+        qkv_bias[length : 2 * length].zero_()
+    qkv = F.linear(x, qkv_weight, qkv_bias)
+    qkv = qkv.reshape(x.size(0), x.size(1), 3, num_heads, C // num_heads).permute(2, 0, 3, 1, 4)
+    q, k, v = qkv[0], qkv[1], qkv[2]
+    if logit_scale is not None:
+        # cosine attention
+        attn = F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1)
+        logit_scale = torch.clamp(logit_scale, max=math.log(100.0)).exp()
+        attn = attn * logit_scale
+    else:
+        q = q * (C // num_heads) ** -0.5
+        attn = q.matmul(k.transpose(-2, -1))
+    # add relative position bias
+    attn = attn + relative_position_bias
+
+    if sum(shift_size) > 0:
+        # generate attention mask
+        attn_mask = x.new_zeros((pad_H, pad_W))
+        h_slices = ((0, -window_size[0]), (-window_size[0], -shift_size[0]), (-shift_size[0], None))
+        w_slices = ((0, -window_size[1]), (-window_size[1], -shift_size[1]), (-shift_size[1], None))
+        count = 0
+        for h in h_slices:
+            for w in w_slices:
+                attn_mask[h[0] : h[1], w[0] : w[1]] = count
+                count += 1
+        attn_mask = attn_mask.view(pad_H // window_size[0], window_size[0], pad_W // window_size[1], window_size[1])
+        attn_mask = attn_mask.permute(0, 2, 1, 3).reshape(num_windows, window_size[0] * window_size[1])
+        attn_mask = attn_mask.unsqueeze(1) - attn_mask.unsqueeze(2)
+        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
+        attn = attn.view(x.size(0) // num_windows, num_windows, num_heads, x.size(1), x.size(1))
+        attn = attn + attn_mask.unsqueeze(1).unsqueeze(0)
+        attn = attn.view(-1, num_heads, x.size(1), x.size(1))
+
+    attn = F.softmax(attn, dim=-1)
+    attn = F.dropout(attn, p=attention_dropout, training=training)
+
+    x = attn.matmul(v).transpose(1, 2).reshape(x.size(0), x.size(1), C)
+    x = F.linear(x, proj_weight, proj_bias)
+    x = F.dropout(x, p=dropout, training=training)
+
+    # reverse windows
+    x = x.view(B, pad_H // window_size[0], pad_W // window_size[1], window_size[0], window_size[1], C)
+    x = x.permute(0, 1, 3, 2, 4, 5).reshape(B, pad_H, pad_W, C)
+
+    # reverse cyclic shift
+    if sum(shift_size) > 0:
+        x = torch.roll(x, shifts=(shift_size[0], shift_size[1]), dims=(1, 2))
+
+    # unpad features
+    x = x[:, :H, :W, :].contiguous()
+    return x
+
+
+torch.fx.wrap("shifted_window_attention")  # register the function by name as an FX leaf: symbolic tracing records one call instead of tracing its body
+
+
+class ShiftedWindowAttention(nn.Module):
+    """
+    Module wrapper around :func:`shifted_window_attention` that owns the qkv/proj linear layers and the relative position bias table.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        window_size: list[int],
+        shift_size: list[int],
+        num_heads: int,
+        qkv_bias: bool = True,
+        proj_bias: bool = True,
+        attention_dropout: float = 0.0,
+        dropout: float = 0.0,
+    ):
+        super().__init__()
+        if len(window_size) != 2 or len(shift_size) != 2:
+            raise ValueError("window_size and shift_size must be of length 2")
+        self.window_size = window_size  # stored by reference; no copy is made here
+        self.shift_size = shift_size  # stored by reference; no copy is made here
+        self.num_heads = num_heads
+        self.attention_dropout = attention_dropout
+        self.dropout = dropout
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)  # one fused projection producing query, key and value
+        self.proj = nn.Linear(dim, dim, bias=proj_bias)  # output projection applied after attention
+
+        self.define_relative_position_bias_table()  # hook: subclasses may override how the table is built
+        self.define_relative_position_index()
+
+    def define_relative_position_bias_table(self):
+        # define a learnable table of relative position bias, one row per (dh, dw) offset and one column per head
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), self.num_heads)
+        )  # 2*Wh-1 * 2*Ww-1, nH
+        nn.init.trunc_normal_(self.relative_position_bias_table, std=0.02)
+
+    def define_relative_position_index(self):
+        # get pair-wise relative position index for each token inside the window (stored as a non-trainable buffer)
+        coords_h = torch.arange(self.window_size[0])
+        coords_w = torch.arange(self.window_size[1])
+        coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing="ij"))  # 2, Wh, Ww
+        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1  # scale the row offset so that (dh, dw) sums to a unique flat index
+        relative_position_index = relative_coords.sum(-1).flatten()  # Wh*Ww*Wh*Ww
+        self.register_buffer("relative_position_index", relative_position_index)
+
+    def get_relative_position_bias(self) -> torch.Tensor:
+        return _get_relative_position_bias(  # helper defined elsewhere in this file; presumably gathers table rows by index - confirm there
+            self.relative_position_bias_table, self.relative_position_index, self.window_size  # type: ignore[arg-type]
+        )
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x (Tensor): Tensor with layout of [B, H, W, C]
+        Returns:
+            Tensor with same layout as input, i.e. [B, H, W, C]
+        """
+        relative_position_bias = self.get_relative_position_bias()  # recomputed on every call from the current parameters
+        return shifted_window_attention(
+            x,
+            self.qkv.weight,
+            self.proj.weight,
+            relative_position_bias,
+            self.window_size,
+            self.num_heads,
+            shift_size=self.shift_size,
+            attention_dropout=self.attention_dropout,
+            dropout=self.dropout,
+            qkv_bias=self.qkv.bias,
+            proj_bias=self.proj.bias,
+            training=self.training,  # dropout inside the functional op follows the module's train/eval mode
+        )
+
+
+class ShiftedWindowAttentionV2(ShiftedWindowAttention):
+    """
+    Swin Transformer V2 attention: calls :func:`shifted_window_attention` with a ``logit_scale`` (cosine attention) and an MLP-generated relative position bias.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        window_size: list[int],
+        shift_size: list[int],
+        num_heads: int,
+        qkv_bias: bool = True,
+        proj_bias: bool = True,
+        attention_dropout: float = 0.0,
+        dropout: float = 0.0,
+    ):
+        super().__init__(
+            dim,
+            window_size,
+            shift_size,
+            num_heads,
+            qkv_bias=qkv_bias,
+            proj_bias=proj_bias,
+            attention_dropout=attention_dropout,
+            dropout=dropout,
+        )
+
+        self.logit_scale = nn.Parameter(torch.log(10 * torch.ones((num_heads, 1, 1))))  # learnable per-head scale, initialised to log(10)
+        # mlp to generate continuous relative position bias
+        self.cpb_mlp = nn.Sequential(
+            nn.Linear(2, 512, bias=True), nn.ReLU(inplace=True), nn.Linear(512, num_heads, bias=False)
+        )
+        if qkv_bias:  # zero the middle third of the fused qkv bias (the key part), as shifted_window_attention also does when logit_scale is set
+            length = self.qkv.bias.numel() // 3
+            self.qkv.bias[length : 2 * length].data.zero_()
+
+    def define_relative_position_bias_table(self):
+        # build the table of relative (dh, dw) offsets that cpb_mlp maps to per-head biases; registered as a buffer, not a parameter
+        relative_coords_h = torch.arange(-(self.window_size[0] - 1), self.window_size[0], dtype=torch.float32)
+        relative_coords_w = torch.arange(-(self.window_size[1] - 1), self.window_size[1], dtype=torch.float32)
+        relative_coords_table = torch.stack(torch.meshgrid([relative_coords_h, relative_coords_w], indexing="ij"))
+        relative_coords_table = relative_coords_table.permute(1, 2, 0).contiguous().unsqueeze(0)  # 1, 2*Wh-1, 2*Ww-1, 2
+
+        relative_coords_table[:, :, :, 0] /= max(self.window_size[0] - 1, 1)  # max() avoids 0/0 = NaN when the window height is 1
+        relative_coords_table[:, :, :, 1] /= max(self.window_size[1] - 1, 1)  # max() avoids 0/0 = NaN when the window width is 1
+
+        relative_coords_table *= 8  # normalize to -8, 8
+        relative_coords_table = (
+            torch.sign(relative_coords_table) * torch.log2(torch.abs(relative_coords_table) + 1.0) / 3.0
+        )
+        self.register_buffer("relative_coords_table", relative_coords_table)
+
+    def get_relative_position_bias(self) -> torch.Tensor:
+        relative_position_bias = _get_relative_position_bias(
+            self.cpb_mlp(self.relative_coords_table).view(-1, self.num_heads),
+            self.relative_position_index,  # type: ignore[arg-type]
+            self.window_size,
+        )
+        relative_position_bias = 16 * torch.sigmoid(relative_position_bias)  # squash the bias into the open range (0, 16)
+        return relative_position_bias
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x (Tensor): Tensor with layout of [B, H, W, C]
+        Returns:
+            Tensor with same layout as input, i.e. [B, H, W, C]
+        """
+        relative_position_bias = self.get_relative_position_bias()
+        return shifted_window_attention(
+            x,
+            self.qkv.weight,
+            self.proj.weight,
+            relative_position_bias,
+            self.window_size,
+            self.num_heads,
+            shift_size=self.shift_size,
+            attention_dropout=self.attention_dropout,
+            dropout=self.dropout,
+            qkv_bias=self.qkv.bias,
+            proj_bias=self.proj.bias,
+            logit_scale=self.logit_scale,  # a non-None logit_scale selects the cosine-attention branch
+            training=self.training,
+        )
+
+
+class SwinTransformerBlock(nn.Module):
+    """
+    Swin Transformer Block.
+    Args:
+        dim (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        window_size (List[int]): Window size.
+        shift_size (List[int]): Shift size for shifted window attention.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
+        dropout (float): Dropout rate. Default: 0.0.
+        attention_dropout (float): Attention dropout rate. Default: 0.0.
+        stochastic_depth_prob (float): Stochastic depth rate. Default: 0.0.
+        norm_layer (nn.Module): Normalization layer.  Default: nn.LayerNorm.
+        attn_layer (nn.Module): Attention layer. Default: ShiftedWindowAttention
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        window_size: list[int],
+        shift_size: list[int],
+        mlp_ratio: float = 4.0,
+        dropout: float = 0.0,
+        attention_dropout: float = 0.0,
+        stochastic_depth_prob: float = 0.0,
+        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
+        attn_layer: Callable[..., nn.Module] = ShiftedWindowAttention,
+    ):
+        super().__init__()
+        _log_api_usage_once(self)
+
+        self.norm1 = norm_layer(dim)
+        self.attn = attn_layer(  # note: attn_layer is called with positional order (dim, window_size, shift_size, num_heads)
+            dim,
+            window_size,
+            shift_size,
+            num_heads,
+            attention_dropout=attention_dropout,
+            dropout=dropout,
+        )
+        self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")  # one instance shared by both residual branches
+        self.norm2 = norm_layer(dim)
+        self.mlp = MLP(dim, [int(dim * mlp_ratio), dim], activation_layer=nn.GELU, inplace=None, dropout=dropout)
+
+        for m in self.mlp.modules():  # re-initialise only the MLP's linear layers
+            if isinstance(m, nn.Linear):
+                nn.init.xavier_uniform_(m.weight)
+                if m.bias is not None:
+                    nn.init.normal_(m.bias, std=1e-6)
+
+    def forward(self, x: Tensor):
+        x = x + self.stochastic_depth(self.attn(self.norm1(x)))  # pre-norm residual: normalize, attend, then add to the input
+        x = x + self.stochastic_depth(self.mlp(self.norm2(x)))  # pre-norm residual around the MLP
+        return x
+
+
+class SwinTransformerBlockV2(SwinTransformerBlock):
+    """
+    Swin Transformer V2 Block.
+    Args:
+        dim (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        window_size (List[int]): Window size.
+        shift_size (List[int]): Shift size for shifted window attention.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
+        dropout (float): Dropout rate. Default: 0.0.
+        attention_dropout (float): Attention dropout rate. Default: 0.0.
+        stochastic_depth_prob (float): Stochastic depth rate. Default: 0.0.
+        norm_layer (nn.Module): Normalization layer.  Default: nn.LayerNorm.
+        attn_layer (nn.Module): Attention layer. Default: ShiftedWindowAttentionV2.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        window_size: list[int],
+        shift_size: list[int],
+        mlp_ratio: float = 4.0,
+        dropout: float = 0.0,
+        attention_dropout: float = 0.0,
+        stochastic_depth_prob: float = 0.0,
+        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
+        attn_layer: Callable[..., nn.Module] = ShiftedWindowAttentionV2,
+    ):
+        super().__init__(  # construction is identical to the parent; only the attn_layer default and forward() differ
+            dim,
+            num_heads,
+            window_size,
+            shift_size,
+            mlp_ratio=mlp_ratio,
+            dropout=dropout,
+            attention_dropout=attention_dropout,
+            stochastic_depth_prob=stochastic_depth_prob,
+            norm_layer=norm_layer,
+            attn_layer=attn_layer,
+        )
+
+    def forward(self, x: Tensor):
+        # Here is the difference, we apply norm after the attention in V2.
+        # In V1 we applied norm before the attention.
+        x = x + self.stochastic_depth(self.norm1(self.attn(x)))  # post-norm residual around attention
+        x = x + self.stochastic_depth(self.norm2(self.mlp(x)))  # post-norm residual around the MLP
+        return x
+
+
+class SwinTransformer(nn.Module):
+    """
+    Implements Swin Transformer from the `"Swin Transformer: Hierarchical Vision Transformer using
+    Shifted Windows" <https://arxiv.org/abs/2103.14030>`_ paper.
+    Args:
+        patch_size (List[int]): Patch size.
+        embed_dim (int): Patch embedding dimension.
+        depths (List[int]): Depth of each Swin Transformer layer.
+        num_heads (List[int]): Number of attention heads in different layers.
+        window_size (List[int]): Window size.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
+        dropout (float): Dropout rate. Default: 0.0.
+        attention_dropout (float): Attention dropout rate. Default: 0.0.
+        stochastic_depth_prob (float): Stochastic depth rate. Default: 0.1.
+        num_classes (int): Number of classes for classification head. Default: 1000.
+        block (nn.Module, optional): SwinTransformer Block. Default: None.
+        norm_layer (nn.Module, optional): Normalization layer. Default: None.
+        downsample_layer (nn.Module): Downsample layer (patch merging). Default: PatchMerging.
+    """
+
+    def __init__(
+        self,
+        patch_size: list[int],
+        embed_dim: int,
+        depths: list[int],
+        num_heads: list[int],
+        window_size: list[int],
+        mlp_ratio: float = 4.0,
+        dropout: float = 0.0,
+        attention_dropout: float = 0.0,
+        stochastic_depth_prob: float = 0.1,
+        num_classes: int = 1000,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+        block: Optional[Callable[..., nn.Module]] = None,
+        downsample_layer: Callable[..., nn.Module] = PatchMerging,
+    ):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.num_classes = num_classes
+
+        if block is None:
+            block = SwinTransformerBlock
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=1e-5)
+
+        layers: list[nn.Module] = []
+        # split image into non-overlapping patches (the stem convolution is fixed to 3 input channels)
+        layers.append(
+            nn.Sequential(
+                nn.Conv2d(
+                    3, embed_dim, kernel_size=(patch_size[0], patch_size[1]), stride=(patch_size[0], patch_size[1])
+                ),
+                Permute([0, 2, 3, 1]),
+                norm_layer(embed_dim),
+            )
+        )
+
+        total_stage_blocks = sum(depths)
+        stage_block_id = 0
+        # build SwinTransformer blocks
+        for i_stage in range(len(depths)):
+            stage: list[nn.Module] = []
+            dim = embed_dim * 2**i_stage  # channel width doubles at every stage
+            for i_layer in range(depths[i_stage]):
+                # ramp stochastic depth linearly from 0 to stochastic_depth_prob over all blocks; max() avoids dividing by zero for a single block
+                sd_prob = stochastic_depth_prob * float(stage_block_id) / max(total_stage_blocks - 1, 1)
+                stage.append(
+                    block(
+                        dim,
+                        num_heads[i_stage],
+                        window_size=window_size,
+                        shift_size=[0 if i_layer % 2 == 0 else w // 2 for w in window_size],  # odd-indexed blocks shift by half a window
+                        mlp_ratio=mlp_ratio,
+                        dropout=dropout,
+                        attention_dropout=attention_dropout,
+                        stochastic_depth_prob=sd_prob,
+                        norm_layer=norm_layer,
+                    )
+                )
+                stage_block_id += 1
+            layers.append(nn.Sequential(*stage))
+            # add patch merging layer (between stages only, not after the last one)
+            if i_stage < (len(depths) - 1):
+                layers.append(downsample_layer(dim, norm_layer))
+        self.features = nn.Sequential(*layers)
+
+        num_features = embed_dim * 2 ** (len(depths) - 1)  # channel width of the final stage
+        self.norm = norm_layer(num_features)
+        self.permute = Permute([0, 3, 1, 2])  # B H W C -> B C H W
+        self.avgpool = nn.AdaptiveAvgPool2d(1)
+        self.flatten = nn.Flatten(1)
+        self.head = nn.Linear(num_features, num_classes)
+
+        for m in self.modules():  # runs last, so it re-initialises every nn.Linear in the model, including those inside the blocks
+            if isinstance(m, nn.Linear):
+                nn.init.trunc_normal_(m.weight, std=0.02)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+
+    def forward(self, x):
+        x = self.features(x)  # patch embedding, stages and patch merging
+        x = self.norm(x)
+        x = self.permute(x)
+        x = self.avgpool(x)  # global average pool down to 1x1 spatial size
+        x = self.flatten(x)
+        x = self.head(x)  # classification logits of size num_classes
+        return x
+
+
+def _swin_transformer(  # shared factory behind the public swin_* builders: construct the model, then optionally load weights
+    patch_size: list[int],
+    embed_dim: int,
+    depths: list[int],
+    num_heads: list[int],
+    window_size: list[int],
+    stochastic_depth_prob: float,
+    weights: Optional[WeightsEnum],
+    progress: bool,
+    **kwargs: Any,
+) -> SwinTransformer:
+    if weights is not None:
+        _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))  # helper not shown here; presumably pins num_classes to the checkpoint's label count - confirm
+
+    model = SwinTransformer(
+        patch_size=patch_size,
+        embed_dim=embed_dim,
+        depths=depths,
+        num_heads=num_heads,
+        window_size=window_size,
+        stochastic_depth_prob=stochastic_depth_prob,
+        **kwargs,  # remaining SwinTransformer options, e.g. block / downsample_layer / num_classes
+    )
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))  # default strict loading: keys must match the model exactly
+
+    return model
+
+
+_COMMON_META = {  # metadata entries spread into the meta dict of every Swin weights definition in this module
+    "categories": _IMAGENET_CATEGORIES,
+}
+
+
+class Swin_T_Weights(WeightsEnum):  # pretrained weight entries accepted by swin_t
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/swin_t-704ceda3.pth",
+        transforms=partial(  # preprocessing preset with these crop/resize/interpolation settings pre-bound
+            ImageClassification, crop_size=224, resize_size=232, interpolation=InterpolationMode.BICUBIC
+        ),
+        meta={
+            **_COMMON_META,  # shared entries (ImageNet categories)
+            "num_params": 28288354,
+            "min_size": (224, 224),
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#swintransformer",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 81.474,
+                    "acc@5": 95.776,
+                }
+            },
+            "_ops": 4.491,
+            "_file_size": 108.19,
+            "_docs": """These weights reproduce closely the results of the paper using a similar training recipe.""",
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # alias: the default entry is the IMAGENET1K_V1 checkpoint
+
+
+class Swin_S_Weights(WeightsEnum):  # pretrained weight entries accepted by swin_s
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/swin_s-5e29d889.pth",
+        transforms=partial(  # preprocessing preset with these crop/resize/interpolation settings pre-bound
+            ImageClassification, crop_size=224, resize_size=246, interpolation=InterpolationMode.BICUBIC
+        ),
+        meta={
+            **_COMMON_META,  # shared entries (ImageNet categories)
+            "num_params": 49606258,
+            "min_size": (224, 224),
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#swintransformer",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 83.196,
+                    "acc@5": 96.360,
+                }
+            },
+            "_ops": 8.741,
+            "_file_size": 189.786,
+            "_docs": """These weights reproduce closely the results of the paper using a similar training recipe.""",
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # alias: the default entry is the IMAGENET1K_V1 checkpoint
+
+
+class Swin_B_Weights(WeightsEnum):  # pretrained weight entries accepted by swin_b
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/swin_b-68c6b09e.pth",
+        transforms=partial(  # preprocessing preset with these crop/resize/interpolation settings pre-bound
+            ImageClassification, crop_size=224, resize_size=238, interpolation=InterpolationMode.BICUBIC
+        ),
+        meta={
+            **_COMMON_META,  # shared entries (ImageNet categories)
+            "num_params": 87768224,
+            "min_size": (224, 224),
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#swintransformer",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 83.582,
+                    "acc@5": 96.640,
+                }
+            },
+            "_ops": 15.431,
+            "_file_size": 335.364,
+            "_docs": """These weights reproduce closely the results of the paper using a similar training recipe.""",
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # alias: the default entry is the IMAGENET1K_V1 checkpoint
+
+
+class Swin_V2_T_Weights(WeightsEnum):  # pretrained weight entries accepted by swin_v2_t
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/swin_v2_t-b137f0e2.pth",
+        transforms=partial(  # preprocessing preset with these crop/resize/interpolation settings pre-bound
+            ImageClassification, crop_size=256, resize_size=260, interpolation=InterpolationMode.BICUBIC
+        ),
+        meta={
+            **_COMMON_META,  # shared entries (ImageNet categories)
+            "num_params": 28351570,
+            "min_size": (256, 256),
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#swintransformer-v2",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 82.072,
+                    "acc@5": 96.132,
+                }
+            },
+            "_ops": 5.94,
+            "_file_size": 108.626,
+            "_docs": """These weights reproduce closely the results of the paper using a similar training recipe.""",
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # alias: the default entry is the IMAGENET1K_V1 checkpoint
+
+
+class Swin_V2_S_Weights(WeightsEnum):  # pretrained weight entries accepted by swin_v2_s
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/swin_v2_s-637d8ceb.pth",
+        transforms=partial(  # preprocessing preset with these crop/resize/interpolation settings pre-bound
+            ImageClassification, crop_size=256, resize_size=260, interpolation=InterpolationMode.BICUBIC
+        ),
+        meta={
+            **_COMMON_META,  # shared entries (ImageNet categories)
+            "num_params": 49737442,
+            "min_size": (256, 256),
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#swintransformer-v2",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 83.712,
+                    "acc@5": 96.816,
+                }
+            },
+            "_ops": 11.546,
+            "_file_size": 190.675,
+            "_docs": """These weights reproduce closely the results of the paper using a similar training recipe.""",
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # alias: the default entry is the IMAGENET1K_V1 checkpoint
+
+
+class Swin_V2_B_Weights(WeightsEnum):  # pretrained weight entries accepted by swin_v2_b
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/swin_v2_b-781e5279.pth",
+        transforms=partial(  # preprocessing preset with these crop/resize/interpolation settings pre-bound
+            ImageClassification, crop_size=256, resize_size=272, interpolation=InterpolationMode.BICUBIC
+        ),
+        meta={
+            **_COMMON_META,  # shared entries (ImageNet categories)
+            "num_params": 87930848,
+            "min_size": (256, 256),
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#swintransformer-v2",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 84.112,
+                    "acc@5": 96.864,
+                }
+            },
+            "_ops": 20.325,
+            "_file_size": 336.372,
+            "_docs": """These weights reproduce closely the results of the paper using a similar training recipe.""",
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # alias: the default entry is the IMAGENET1K_V1 checkpoint
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", Swin_T_Weights.IMAGENET1K_V1))
+def swin_t(*, weights: Optional[Swin_T_Weights] = None, progress: bool = True, **kwargs: Any) -> SwinTransformer:
+    """
+    Constructs a swin_tiny architecture from
+    `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows <https://arxiv.org/abs/2103.14030>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.Swin_T_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.Swin_T_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.swin_transformer.SwinTransformer``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/swin_transformer.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.Swin_T_Weights
+        :members:
+    """
+    weights = Swin_T_Weights.verify(weights)  # verify() is defined on WeightsEnum (not shown); presumably validates the argument - confirm
+
+    return _swin_transformer(
+        patch_size=[4, 4],  # kernel size and stride of the patch-embedding convolution
+        embed_dim=96,  # channel width of the first stage; doubled at each later stage
+        depths=[2, 2, 6, 2],  # number of blocks in each stage
+        num_heads=[3, 6, 12, 24],  # attention heads per stage
+        window_size=[7, 7],  # attention window size passed to every block
+        stochastic_depth_prob=0.2,  # maximum drop rate; ramped up linearly across the blocks
+        weights=weights,
+        progress=progress,
+        **kwargs,
+    )
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", Swin_S_Weights.IMAGENET1K_V1))
+def swin_s(*, weights: Optional[Swin_S_Weights] = None, progress: bool = True, **kwargs: Any) -> SwinTransformer:
+    """
+    Constructs a swin_small architecture from
+    `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows <https://arxiv.org/abs/2103.14030>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.Swin_S_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.Swin_S_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.swin_transformer.SwinTransformer``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/swin_transformer.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.Swin_S_Weights
+        :members:
+    """
+    weights = Swin_S_Weights.verify(weights)  # verify() is defined on WeightsEnum (not shown); presumably validates the argument - confirm
+
+    return _swin_transformer(
+        patch_size=[4, 4],  # kernel size and stride of the patch-embedding convolution
+        embed_dim=96,  # channel width of the first stage; doubled at each later stage
+        depths=[2, 2, 18, 2],  # number of blocks in each stage
+        num_heads=[3, 6, 12, 24],  # attention heads per stage
+        window_size=[7, 7],  # attention window size passed to every block
+        stochastic_depth_prob=0.3,  # maximum drop rate; ramped up linearly across the blocks
+        weights=weights,
+        progress=progress,
+        **kwargs,
+    )
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", Swin_B_Weights.IMAGENET1K_V1))
+def swin_b(*, weights: Optional[Swin_B_Weights] = None, progress: bool = True, **kwargs: Any) -> SwinTransformer:
+    """
+    Constructs a swin_base architecture from
+    `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows <https://arxiv.org/abs/2103.14030>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.Swin_B_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.Swin_B_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.swin_transformer.SwinTransformer``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/swin_transformer.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.Swin_B_Weights
+        :members:
+    """
+    weights = Swin_B_Weights.verify(weights)  # verify() is defined on WeightsEnum (not shown); presumably validates the argument - confirm
+
+    return _swin_transformer(
+        patch_size=[4, 4],  # kernel size and stride of the patch-embedding convolution
+        embed_dim=128,  # channel width of the first stage; doubled at each later stage
+        depths=[2, 2, 18, 2],  # number of blocks in each stage
+        num_heads=[4, 8, 16, 32],  # attention heads per stage
+        window_size=[7, 7],  # attention window size passed to every block
+        stochastic_depth_prob=0.5,  # maximum drop rate; ramped up linearly across the blocks
+        weights=weights,
+        progress=progress,
+        **kwargs,
+    )
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", Swin_V2_T_Weights.IMAGENET1K_V1))
+def swin_v2_t(*, weights: Optional[Swin_V2_T_Weights] = None, progress: bool = True, **kwargs: Any) -> SwinTransformer:
+    """
+    Constructs a swin_v2_tiny architecture from
+    `Swin Transformer V2: Scaling Up Capacity and Resolution <https://arxiv.org/abs/2111.09883>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.Swin_V2_T_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.Swin_V2_T_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.swin_transformer.SwinTransformer``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/swin_transformer.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.Swin_V2_T_Weights
+        :members:
+    """
+    weights = Swin_V2_T_Weights.verify(weights)  # verify() is defined on WeightsEnum (not shown); presumably validates the argument - confirm
+
+    return _swin_transformer(
+        patch_size=[4, 4],  # kernel size and stride of the patch-embedding convolution
+        embed_dim=96,  # channel width of the first stage; doubled at each later stage
+        depths=[2, 2, 6, 2],  # number of blocks in each stage
+        num_heads=[3, 6, 12, 24],  # attention heads per stage
+        window_size=[8, 8],  # attention window size passed to every block
+        stochastic_depth_prob=0.2,  # maximum drop rate; ramped up linearly across the blocks
+        weights=weights,
+        progress=progress,
+        block=SwinTransformerBlockV2,  # V2 block: norm is applied after attention / MLP
+        downsample_layer=PatchMergingV2,  # V2 patch-merging layer (defined elsewhere in this file)
+        **kwargs,
+    )
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", Swin_V2_S_Weights.IMAGENET1K_V1))
+def swin_v2_s(*, weights: Optional[Swin_V2_S_Weights] = None, progress: bool = True, **kwargs: Any) -> SwinTransformer:
+    """
+    Constructs a swin_v2_small architecture from
+    `Swin Transformer V2: Scaling Up Capacity and Resolution <https://arxiv.org/abs/2111.09883>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.Swin_V2_S_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.Swin_V2_S_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.swin_transformer.SwinTransformer``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/swin_transformer.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.Swin_V2_S_Weights
+        :members:
+    """
+    weights = Swin_V2_S_Weights.verify(weights)  # verify() is defined on WeightsEnum (not shown); presumably validates the argument - confirm
+
+    return _swin_transformer(
+        patch_size=[4, 4],  # kernel size and stride of the patch-embedding convolution
+        embed_dim=96,  # channel width of the first stage; doubled at each later stage
+        depths=[2, 2, 18, 2],  # number of blocks in each stage
+        num_heads=[3, 6, 12, 24],  # attention heads per stage
+        window_size=[8, 8],  # attention window size passed to every block
+        stochastic_depth_prob=0.3,  # maximum drop rate; ramped up linearly across the blocks
+        weights=weights,
+        progress=progress,
+        block=SwinTransformerBlockV2,  # V2 block: norm is applied after attention / MLP
+        downsample_layer=PatchMergingV2,  # V2 patch-merging layer (defined elsewhere in this file)
+        **kwargs,
+    )
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", Swin_V2_B_Weights.IMAGENET1K_V1))
+def swin_v2_b(*, weights: Optional[Swin_V2_B_Weights] = None, progress: bool = True, **kwargs: Any) -> SwinTransformer:
+    """
+    Constructs a swin_v2_base architecture from
+    `Swin Transformer V2: Scaling Up Capacity and Resolution <https://arxiv.org/abs/2111.09883>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.Swin_V2_B_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.Swin_V2_B_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.swin_transformer.SwinTransformer``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/swin_transformer.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.Swin_V2_B_Weights
+        :members:
+    """
+    weights = Swin_V2_B_Weights.verify(weights)  # verify() is defined on WeightsEnum (not shown); presumably validates the argument - confirm
+
+    return _swin_transformer(
+        patch_size=[4, 4],  # kernel size and stride of the patch-embedding convolution
+        embed_dim=128,  # channel width of the first stage; doubled at each later stage
+        depths=[2, 2, 18, 2],  # number of blocks in each stage
+        num_heads=[4, 8, 16, 32],  # attention heads per stage
+        window_size=[8, 8],  # attention window size passed to every block
+        stochastic_depth_prob=0.5,  # maximum drop rate; ramped up linearly across the blocks
+        weights=weights,
+        progress=progress,
+        block=SwinTransformerBlockV2,  # V2 block: norm is applied after attention / MLP
+        downsample_layer=PatchMergingV2,  # V2 patch-merging layer (defined elsewhere in this file)
+        **kwargs,
+    )
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/vgg.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..feed0ce8d77ed53cfdca222b11d5c694dae4b104
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/vgg.py
@@ -0,0 +1,511 @@
+from functools import partial
+from typing import Any, cast, Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ..transforms._presets import ImageClassification
+from ..utils import _log_api_usage_once
+from ._api import register_model, Weights, WeightsEnum
+from ._meta import _IMAGENET_CATEGORIES
+from ._utils import _ovewrite_named_param, handle_legacy_interface
+
+
+__all__ = [
+    "VGG",
+    "VGG11_Weights",
+    "VGG11_BN_Weights",
+    "VGG13_Weights",
+    "VGG13_BN_Weights",
+    "VGG16_Weights",
+    "VGG16_BN_Weights",
+    "VGG19_Weights",
+    "VGG19_BN_Weights",
+    "vgg11",
+    "vgg11_bn",
+    "vgg13",
+    "vgg13_bn",
+    "vgg16",
+    "vgg16_bn",
+    "vgg19",
+    "vgg19_bn",
+]
+
+
+class VGG(nn.Module):
+    """VGG network: a caller-supplied ``features`` stack followed by a fixed three-layer linear classifier.
+
+    The output of ``features`` is adaptively average-pooled to 7x7, flattened and fed to the classifier.
+    """
+
+    def __init__(
+        self, features: nn.Module, num_classes: int = 1000, init_weights: bool = True, dropout: float = 0.5
+    ) -> None:
+        super().__init__()
+        _log_api_usage_once(self)
+        self.features = features
+        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
+        head: list[nn.Module] = [nn.Linear(512 * 7 * 7, 4096), nn.ReLU(True), nn.Dropout(p=dropout)]
+        head += [nn.Linear(4096, 4096), nn.ReLU(True), nn.Dropout(p=dropout)]
+        head.append(nn.Linear(4096, num_classes))
+        self.classifier = nn.Sequential(*head)
+        if not init_weights:
+            return
+        # Re-initialise every conv / batch-norm / linear submodule, in ``self.modules()`` order.
+        for layer in self.modules():
+            if isinstance(layer, nn.Conv2d):
+                nn.init.kaiming_normal_(layer.weight, mode="fan_out", nonlinearity="relu")
+                if layer.bias is not None:
+                    nn.init.constant_(layer.bias, 0)
+            elif isinstance(layer, nn.BatchNorm2d):
+                nn.init.constant_(layer.weight, 1)
+                nn.init.constant_(layer.bias, 0)
+            elif isinstance(layer, nn.Linear):
+                nn.init.normal_(layer.weight, 0, 0.01)
+                nn.init.constant_(layer.bias, 0)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Run ``x`` through features, pooling, flattening (from dim 1) and the classifier."""
+        pooled = self.avgpool(self.features(x))
+        return self.classifier(torch.flatten(pooled, 1))
+
+
+def make_layers(cfg: list[Union[str, int]], batch_norm: bool = False) -> nn.Sequential:
+    """Turn a layer spec into an ``nn.Sequential``: ints are 3x3 conv widths, ``"M"`` is a 2x2 max-pool."""
+    modules: list[nn.Module] = []
+    prev_channels = 3  # the first convolution is built with 3 input channels
+    for entry in cfg:
+        if entry == "M":
+            modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
+            continue
+        width = cast(int, entry)
+        modules.append(nn.Conv2d(prev_channels, width, kernel_size=3, padding=1))
+        if batch_norm:
+            modules.append(nn.BatchNorm2d(width))
+        modules.append(nn.ReLU(inplace=True))  # each conv (+ optional batch norm) ends with an in-place ReLU
+        prev_channels = width
+    return nn.Sequential(*modules)
+
+
+cfgs: dict[str, list[Union[str, int]]] = {  # layer specs for make_layers; A/B/D/E back vgg11/13/16/19 (+ _bn)
+    "A": [64, "M", 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"],
+    "B": [64, 64, "M", 128, 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"],
+    "D": [64, 64, "M", 128, 128, "M", 256, 256, 256, "M", 512, 512, 512, "M", 512, 512, 512, "M"],
+    "E": [64, 64, "M", 128, 128, "M", 256, 256, 256, 256, "M", 512, 512, 512, 512, "M", 512, 512, 512, 512, "M"],
+}
+
+
+def _vgg(cfg: str, batch_norm: bool, weights: Optional[WeightsEnum], progress: bool, **kwargs: Any) -> VGG:
+    if weights is None:  # no checkpoint requested: build with the caller's kwargs as given
+        return VGG(make_layers(cfgs[cfg], batch_norm=batch_norm), **kwargs)
+    kwargs["init_weights"] = False  # skip the constructor's init pass; the state dict loaded below replaces it
+    if (categories := weights.meta["categories"]) is not None:
+        _ovewrite_named_param(kwargs, "num_classes", len(categories))
+    model = VGG(make_layers(cfgs[cfg], batch_norm=batch_norm), **kwargs)
+    model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+    return model
+
+
+_COMMON_META = {  # entries shared by the VGG weights below, each of which spreads this via **_COMMON_META
+    "min_size": (32, 32),
+    "categories": _IMAGENET_CATEGORIES,
+    "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#alexnet-and-vgg",
+    "_docs": """These weights were trained from scratch by using a simplified training recipe.""",
+}
+
+
+class VGG11_Weights(WeightsEnum):  # checkpoints accepted by vgg11() below
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/vgg11-8a719046.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 132863336,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 69.020,
+                    "acc@5": 88.628,
+                }
+            },
+            "_ops": 7.609,
+            "_file_size": 506.84,
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # second name bound to the same entry as IMAGENET1K_V1
+
+
+class VGG11_BN_Weights(WeightsEnum):  # checkpoints accepted by vgg11_bn() below
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/vgg11_bn-6002323d.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 132868840,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 70.370,
+                    "acc@5": 89.810,
+                }
+            },
+            "_ops": 7.609,
+            "_file_size": 506.881,
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # second name bound to the same entry as IMAGENET1K_V1
+
+
+class VGG13_Weights(WeightsEnum):  # checkpoints accepted by vgg13() below
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/vgg13-19584684.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 133047848,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 69.928,
+                    "acc@5": 89.246,
+                }
+            },
+            "_ops": 11.308,
+            "_file_size": 507.545,
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # second name bound to the same entry as IMAGENET1K_V1
+
+
+class VGG13_BN_Weights(WeightsEnum):  # checkpoints accepted by vgg13_bn() below
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/vgg13_bn-abd245e5.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 133053736,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 71.586,
+                    "acc@5": 90.374,
+                }
+            },
+            "_ops": 11.308,
+            "_file_size": 507.59,
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # second name bound to the same entry as IMAGENET1K_V1
+
+
+class VGG16_Weights(WeightsEnum):  # checkpoints accepted by vgg16() below
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/vgg16-397923af.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 138357544,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 71.592,
+                    "acc@5": 90.382,
+                }
+            },
+            "_ops": 15.47,
+            "_file_size": 527.796,
+        },
+    )
+    IMAGENET1K_FEATURES = Weights(
+        # Weights ported from https://github.com/amdegroot/ssd.pytorch/
+        url="https://download.pytorch.org/models/vgg16_features-amdegroot-88682ab5.pth",
+        transforms=partial(
+            ImageClassification,
+            crop_size=224,
+            mean=(0.48235, 0.45882, 0.40784),
+            std=(1.0 / 255.0, 1.0 / 255.0, 1.0 / 255.0),
+        ),
+        meta={
+            **_COMMON_META,
+            "num_params": 138357544,
+            "categories": None,  # overrides _COMMON_META; _vgg() then leaves the caller's num_classes untouched
+            "recipe": "https://github.com/amdegroot/ssd.pytorch#training-ssd",  # overrides the shared recipe link
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": float("nan"),
+                    "acc@5": float("nan"),
+                }
+            },
+            "_ops": 15.47,
+            "_file_size": 527.802,
+            "_docs": """
+                These weights can't be used for classification because they are missing values in the `classifier`
+                module. Only the `features` module has valid values and can be used for feature extraction. The weights
+                were trained using the original input standardization method as described in the paper.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # second name bound to the same entry as IMAGENET1K_V1
+
+
+class VGG16_BN_Weights(WeightsEnum):  # checkpoints accepted by vgg16_bn() below
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/vgg16_bn-6c64b313.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 138365992,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 73.360,
+                    "acc@5": 91.516,
+                }
+            },
+            "_ops": 15.47,
+            "_file_size": 527.866,
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # second name bound to the same entry as IMAGENET1K_V1
+
+
+class VGG19_Weights(WeightsEnum):  # checkpoints accepted by vgg19() below
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/vgg19-dcbb9e9d.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 143667240,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 72.376,
+                    "acc@5": 90.876,
+                }
+            },
+            "_ops": 19.632,
+            "_file_size": 548.051,
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # second name bound to the same entry as IMAGENET1K_V1
+
+
+class VGG19_BN_Weights(WeightsEnum):  # checkpoints accepted by vgg19_bn() below
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/vgg19_bn-c79401a0.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 143678248,
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 74.218,
+                    "acc@5": 91.842,
+                }
+            },
+            "_ops": 19.632,
+            "_file_size": 548.143,
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # second name bound to the same entry as IMAGENET1K_V1
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", VGG11_Weights.IMAGENET1K_V1))
+def vgg11(*, weights: Optional[VGG11_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG:
+    """Build VGG-11 (`Very Deep Convolutional Networks for Large-Scale Image Recognition <https://arxiv.org/abs/1409.1556>`__).
+
+    Args:
+        weights (:class:`~torchvision.models.VGG11_Weights`, optional): Which
+            pretrained checkpoint to load, if any. The available values are
+            listed under :class:`~torchvision.models.VGG11_Weights`
+            below. When omitted (the default) no pre-trained weights are
+            loaded.
+        progress (bool, optional): Whether a download progress bar is
+            written to stderr. Defaults to True.
+        **kwargs: extra arguments forwarded to the ``torchvision.models.vgg.VGG``
+            base class; see the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/vgg.py>`_
+            for what it accepts.
+
+    .. autoclass:: torchvision.models.VGG11_Weights
+        :members:
+    """
+    # Layer spec "A" from ``cfgs``, built without batch-norm layers.
+    checked = VGG11_Weights.verify(weights)
+    return _vgg("A", False, checked, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", VGG11_BN_Weights.IMAGENET1K_V1))
+def vgg11_bn(*, weights: Optional[VGG11_BN_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG:
+    """Build VGG-11-BN (`Very Deep Convolutional Networks for Large-Scale Image Recognition <https://arxiv.org/abs/1409.1556>`__).
+
+    Args:
+        weights (:class:`~torchvision.models.VGG11_BN_Weights`, optional): Which
+            pretrained checkpoint to load, if any. The available values are
+            listed under :class:`~torchvision.models.VGG11_BN_Weights`
+            below. When omitted (the default) no pre-trained weights are
+            loaded.
+        progress (bool, optional): Whether a download progress bar is
+            written to stderr. Defaults to True.
+        **kwargs: extra arguments forwarded to the ``torchvision.models.vgg.VGG``
+            base class; see the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/vgg.py>`_
+            for what it accepts.
+
+    .. autoclass:: torchvision.models.VGG11_BN_Weights
+        :members:
+    """
+    # Layer spec "A" from ``cfgs``, built with batch-norm layers.
+    checked = VGG11_BN_Weights.verify(weights)
+    return _vgg("A", True, checked, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", VGG13_Weights.IMAGENET1K_V1))
+def vgg13(*, weights: Optional[VGG13_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG:
+    """Build VGG-13 (`Very Deep Convolutional Networks for Large-Scale Image Recognition <https://arxiv.org/abs/1409.1556>`__).
+
+    Args:
+        weights (:class:`~torchvision.models.VGG13_Weights`, optional): Which
+            pretrained checkpoint to load, if any. The available values are
+            listed under :class:`~torchvision.models.VGG13_Weights`
+            below. When omitted (the default) no pre-trained weights are
+            loaded.
+        progress (bool, optional): Whether a download progress bar is
+            written to stderr. Defaults to True.
+        **kwargs: extra arguments forwarded to the ``torchvision.models.vgg.VGG``
+            base class; see the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/vgg.py>`_
+            for what it accepts.
+
+    .. autoclass:: torchvision.models.VGG13_Weights
+        :members:
+    """
+    # Layer spec "B" from ``cfgs``, built without batch-norm layers.
+    checked = VGG13_Weights.verify(weights)
+    return _vgg("B", False, checked, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", VGG13_BN_Weights.IMAGENET1K_V1))
+def vgg13_bn(*, weights: Optional[VGG13_BN_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG:
+    """Build VGG-13-BN (`Very Deep Convolutional Networks for Large-Scale Image Recognition <https://arxiv.org/abs/1409.1556>`__).
+
+    Args:
+        weights (:class:`~torchvision.models.VGG13_BN_Weights`, optional): Which
+            pretrained checkpoint to load, if any. The available values are
+            listed under :class:`~torchvision.models.VGG13_BN_Weights`
+            below. When omitted (the default) no pre-trained weights are
+            loaded.
+        progress (bool, optional): Whether a download progress bar is
+            written to stderr. Defaults to True.
+        **kwargs: extra arguments forwarded to the ``torchvision.models.vgg.VGG``
+            base class; see the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/vgg.py>`_
+            for what it accepts.
+
+    .. autoclass:: torchvision.models.VGG13_BN_Weights
+        :members:
+    """
+    # Layer spec "B" from ``cfgs``, built with batch-norm layers.
+    checked = VGG13_BN_Weights.verify(weights)
+    return _vgg("B", True, checked, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", VGG16_Weights.IMAGENET1K_V1))
+def vgg16(*, weights: Optional[VGG16_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG:
+    """Build VGG-16 (`Very Deep Convolutional Networks for Large-Scale Image Recognition <https://arxiv.org/abs/1409.1556>`__).
+
+    Args:
+        weights (:class:`~torchvision.models.VGG16_Weights`, optional): Which
+            pretrained checkpoint to load, if any. The available values are
+            listed under :class:`~torchvision.models.VGG16_Weights`
+            below. When omitted (the default) no pre-trained weights are
+            loaded.
+        progress (bool, optional): Whether a download progress bar is
+            written to stderr. Defaults to True.
+        **kwargs: extra arguments forwarded to the ``torchvision.models.vgg.VGG``
+            base class; see the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/vgg.py>`_
+            for what it accepts.
+
+    .. autoclass:: torchvision.models.VGG16_Weights
+        :members:
+    """
+    # Layer spec "D" from ``cfgs``, built without batch-norm layers.
+    checked = VGG16_Weights.verify(weights)
+    return _vgg("D", False, checked, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", VGG16_BN_Weights.IMAGENET1K_V1))
+def vgg16_bn(*, weights: Optional[VGG16_BN_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG:
+    """Build VGG-16-BN (`Very Deep Convolutional Networks for Large-Scale Image Recognition <https://arxiv.org/abs/1409.1556>`__).
+
+    Args:
+        weights (:class:`~torchvision.models.VGG16_BN_Weights`, optional): Which
+            pretrained checkpoint to load, if any. The available values are
+            listed under :class:`~torchvision.models.VGG16_BN_Weights`
+            below. When omitted (the default) no pre-trained weights are
+            loaded.
+        progress (bool, optional): Whether a download progress bar is
+            written to stderr. Defaults to True.
+        **kwargs: extra arguments forwarded to the ``torchvision.models.vgg.VGG``
+            base class; see the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/vgg.py>`_
+            for what it accepts.
+
+    .. autoclass:: torchvision.models.VGG16_BN_Weights
+        :members:
+    """
+    # Layer spec "D" from ``cfgs``, built with batch-norm layers.
+    checked = VGG16_BN_Weights.verify(weights)
+    return _vgg("D", True, checked, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", VGG19_Weights.IMAGENET1K_V1))
+def vgg19(*, weights: Optional[VGG19_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG:
+    """Build VGG-19 (`Very Deep Convolutional Networks for Large-Scale Image Recognition <https://arxiv.org/abs/1409.1556>`__).
+
+    Args:
+        weights (:class:`~torchvision.models.VGG19_Weights`, optional): Which
+            pretrained checkpoint to load, if any. The available values are
+            listed under :class:`~torchvision.models.VGG19_Weights`
+            below. When omitted (the default) no pre-trained weights are
+            loaded.
+        progress (bool, optional): Whether a download progress bar is
+            written to stderr. Defaults to True.
+        **kwargs: extra arguments forwarded to the ``torchvision.models.vgg.VGG``
+            base class; see the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/vgg.py>`_
+            for what it accepts.
+
+    .. autoclass:: torchvision.models.VGG19_Weights
+        :members:
+    """
+    # Layer spec "E" from ``cfgs``, built without batch-norm layers.
+    checked = VGG19_Weights.verify(weights)
+    return _vgg("E", False, checked, progress, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", VGG19_BN_Weights.IMAGENET1K_V1))
+def vgg19_bn(*, weights: Optional[VGG19_BN_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG:
+    """Build VGG-19-BN (`Very Deep Convolutional Networks for Large-Scale Image Recognition <https://arxiv.org/abs/1409.1556>`__).
+
+    Args:
+        weights (:class:`~torchvision.models.VGG19_BN_Weights`, optional): Which
+            pretrained checkpoint to load, if any. The available values are
+            listed under :class:`~torchvision.models.VGG19_BN_Weights`
+            below. When omitted (the default) no pre-trained weights are
+            loaded.
+        progress (bool, optional): Whether a download progress bar is
+            written to stderr. Defaults to True.
+        **kwargs: extra arguments forwarded to the ``torchvision.models.vgg.VGG``
+            base class; see the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/vgg.py>`_
+            for what it accepts.
+
+    .. autoclass:: torchvision.models.VGG19_BN_Weights
+        :members:
+    """
+    # Layer spec "E" from ``cfgs``, built with batch-norm layers.
+    checked = VGG19_BN_Weights.verify(weights)
+    return _vgg("E", True, checked, progress, **kwargs)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1eedd3116001af22ec202d2ccec6eefad8090ae
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/__init__.py
@@ -0,0 +1,4 @@
+from .mvit import *
+from .resnet import *
+from .s3d import *
+from .swin_transformer import *
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8d64a7fd27533cae4a99f95d7c771629c8d5edfd
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/__pycache__/mvit.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/__pycache__/mvit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee8022c7a0d7fdb73933729c502cf3067d09caf1
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/__pycache__/mvit.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/__pycache__/resnet.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/__pycache__/resnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..78ce58ea635c74c3d0abc2455522613f30c6a3d8
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/__pycache__/resnet.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/__pycache__/s3d.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/__pycache__/s3d.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..15ac34e7435262b9dddbab828bd29730d2331e14
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/__pycache__/s3d.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/__pycache__/swin_transformer.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/__pycache__/swin_transformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c85091e5c2b76c209b953be5ceecd1f79914ae80
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/__pycache__/swin_transformer.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/mvit.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/mvit.py
new file mode 100644
index 0000000000000000000000000000000000000000..64d6d171b75d4f6a316e0ff4b80aa94efeb49294
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/mvit.py
@@ -0,0 +1,898 @@
+import math
+from collections.abc import Sequence
+from dataclasses import dataclass
+from functools import partial
+from typing import Any, Callable, Optional
+
+import torch
+import torch.fx
+import torch.nn as nn
+
+from ...ops import MLP, StochasticDepth
+from ...transforms._presets import VideoClassification
+from ...utils import _log_api_usage_once
+from .._api import register_model, Weights, WeightsEnum
+from .._meta import _KINETICS400_CATEGORIES
+from .._utils import _ovewrite_named_param, handle_legacy_interface
+
+
+__all__ = [
+    "MViT",
+    "MViT_V1_B_Weights",
+    "mvit_v1_b",
+    "MViT_V2_S_Weights",
+    "mvit_v2_s",
+]
+
+
+@dataclass
+class MSBlockConfig:  # per-block settings record; NOTE(review): its consumers lie outside this excerpt
+    num_heads: int
+    input_channels: int
+    output_channels: int
+    kernel_q: list[int]  # the kernel_*/stride_* names match MultiscaleAttention's constructor arguments
+    kernel_kv: list[int]
+    stride_q: list[int]
+    stride_kv: list[int]
+
+
+def _prod(s: Sequence[int]) -> int:
+    """Return the product of all integers in ``s`` (1 when ``s`` is empty)."""
+    # ``math.prod`` starts from 1 and multiplies left to right, as the hand-written loop did;
+    # ``math`` is already imported at the top of this module.
+    return math.prod(s)
+
+
+def _unsqueeze(x: torch.Tensor, target_dim: int, expand_dim: int) -> tuple[torch.Tensor, int]:
+    """Bring ``x`` to ``target_dim`` dims (adding axis ``expand_dim`` if one is missing); also return its input rank."""
+    rank = x.dim()
+    needs_axis = rank == target_dim - 1
+    if not needs_axis and rank != target_dim:
+        raise ValueError(f"Unsupported input dimension {x.shape}")
+    return (x.unsqueeze(expand_dim) if needs_axis else x), rank
+
+
+def _squeeze(x: torch.Tensor, target_dim: int, expand_dim: int, tensor_dim: int) -> torch.Tensor:
+    """Squeeze axis ``expand_dim`` of ``x`` only when ``tensor_dim`` equals ``target_dim - 1``; else return ``x``."""
+    was_lifted = tensor_dim == target_dim - 1
+    return x.squeeze(expand_dim) if was_lifted else x
+
+
+torch.fx.wrap("_unsqueeze")  # register by name so torch.fx symbolic tracing records a call instead of tracing inside
+torch.fx.wrap("_squeeze")  # same registration for the inverse helper
+
+
+class Pool(nn.Module):  # applies ``pool`` (plus optional norm/activation) while keeping index 0 of dim 2 out of the pooling
+    def __init__(
+        self,
+        pool: nn.Module,
+        norm: Optional[nn.Module],
+        activation: Optional[nn.Module] = None,
+        norm_before_pool: bool = False,
+    ) -> None:
+        super().__init__()
+        self.pool = pool
+        layers = []
+        if norm is not None:
+            layers.append(norm)
+        if activation is not None:
+            layers.append(activation)
+        self.norm_act = nn.Sequential(*layers) if layers else None  # None when neither norm nor activation was given
+        self.norm_before_pool = norm_before_pool  # picks whether norm_act runs before or after ``pool`` in forward
+
+    def forward(self, x: torch.Tensor, thw: tuple[int, int, int]) -> tuple[torch.Tensor, tuple[int, int, int]]:
+        x, tensor_dim = _unsqueeze(x, 4, 1)  # accepts 3-D or 4-D input; everything below works on the 4-D form
+
+        # Separate the class token (index 0 along dim 2) and reshape the rest into a (B * N, C, *thw) volume
+        class_token, x = torch.tensor_split(x, indices=(1,), dim=2)
+        x = x.transpose(2, 3)
+        B, N, C = x.shape[:3]
+        x = x.reshape((B * N, C) + thw).contiguous()  # only valid when the remaining token count equals prod(thw)
+
+        # normalizing prior to pooling is useful when we use BN which can be absorbed to speed up inference
+        if self.norm_before_pool and self.norm_act is not None:
+            x = self.norm_act(x)
+
+        # apply the pool on the input and add back the token
+        x = self.pool(x)
+        T, H, W = x.shape[2:]  # sizes after pooling; returned to the caller as the new thw
+        x = x.reshape(B, N, C, -1).transpose(2, 3)
+        x = torch.cat((class_token, x), dim=2)
+
+        if not self.norm_before_pool and self.norm_act is not None:
+            x = self.norm_act(x)  # in this branch the class token is normalized together with the pooled tokens
+
+        x = _squeeze(x, 4, 1, tensor_dim)  # drop the added axis again if the input arrived as 3-D
+        return x, (T, H, W)
+
+
+def _interpolate(embedding: torch.Tensor, d: int) -> torch.Tensor:
+    """Resample a 2-D ``embedding`` along dim 0 to length ``d`` using linear interpolation.
+
+    The input is returned as-is (same object) when its first dimension already equals ``d``.
+    """
+    if embedding.shape[0] == d:
+        return embedding
+
+    # ``interpolate`` in "linear" mode resizes the last axis of a 3-D input, so swap the two
+    # axes, add a leading axis of size 1, resize, and then undo both steps.
+    channels_first = embedding.permute(1, 0).unsqueeze(0)
+    resized = nn.functional.interpolate(channels_first, size=d, mode="linear")
+    return resized.squeeze(0).permute(1, 0)
+
+
+def _add_rel_pos(
+    attn: torch.Tensor,  # modified in place (see the final ``+=``) and returned
+    q: torch.Tensor,  # unpacked below as (B, n_head, tokens, dim); token 0 is skipped
+    q_thw: tuple[int, int, int],
+    k_thw: tuple[int, int, int],
+    rel_pos_h: torch.Tensor,  # the three rel_pos_* tables are 2-D and are indexed along dim 0 by the distances below
+    rel_pos_w: torch.Tensor,
+    rel_pos_t: torch.Tensor,
+) -> torch.Tensor:
+    # Modified code from: https://github.com/facebookresearch/SlowFast/commit/1aebd71a2efad823d52b827a3deaf15a56cf4932
+    q_t, q_h, q_w = q_thw
+    k_t, k_h, k_w = k_thw
+    dh = int(2 * max(q_h, k_h) - 1)  # dh/dw/dt: table lengths the rel_pos_* inputs are resampled to below
+    dw = int(2 * max(q_w, k_w) - 1)
+    dt = int(2 * max(q_t, k_t) - 1)
+
+    # Scale up rel pos if shapes for q and k are different.
+    q_h_ratio = max(k_h / q_h, 1.0)
+    k_h_ratio = max(q_h / k_h, 1.0)
+    dist_h = torch.arange(q_h)[:, None] * q_h_ratio - (torch.arange(k_h)[None, :] + (1.0 - k_h)) * k_h_ratio
+    q_w_ratio = max(k_w / q_w, 1.0)
+    k_w_ratio = max(q_w / k_w, 1.0)
+    dist_w = torch.arange(q_w)[:, None] * q_w_ratio - (torch.arange(k_w)[None, :] + (1.0 - k_w)) * k_w_ratio
+    q_t_ratio = max(k_t / q_t, 1.0)
+    k_t_ratio = max(q_t / k_t, 1.0)
+    dist_t = torch.arange(q_t)[:, None] * q_t_ratio - (torch.arange(k_t)[None, :] + (1.0 - k_t)) * k_t_ratio
+
+    # Interpolate rel pos if needed.
+    rel_pos_h = _interpolate(rel_pos_h, dh)
+    rel_pos_w = _interpolate(rel_pos_w, dw)
+    rel_pos_t = _interpolate(rel_pos_t, dt)
+    Rh = rel_pos_h[dist_h.long()]  # (q_h, k_h, dim): one table row per query/key position pair
+    Rw = rel_pos_w[dist_w.long()]  # (q_w, k_w, dim)
+    Rt = rel_pos_t[dist_t.long()]  # (q_t, k_t, dim)
+
+    B, n_head, _, dim = q.shape
+
+    r_q = q[:, :, 1:].reshape(B, n_head, q_t, q_h, q_w, dim)  # drop token 0, then unfold the tokens into (t, h, w)
+    rel_h_q = torch.einsum("bythwc,hkc->bythwk", r_q, Rh)  # [B, H, q_t, qh, qw, k_h]
+    rel_w_q = torch.einsum("bythwc,wkc->bythwk", r_q, Rw)  # [B, H, q_t, qh, qw, k_w]
+    # [B, H, q_t, q_h, q_w, dim] -> [q_t, B, H, q_h, q_w, dim] -> [q_t, B*H*q_h*q_w, dim]
+    r_q = r_q.permute(2, 0, 1, 3, 4, 5).reshape(q_t, B * n_head * q_h * q_w, dim)
+    # [q_t, B*H*q_h*q_w, dim] * [q_t, dim, k_t] = [q_t, B*H*q_h*q_w, k_t] -> [B*H*q_h*q_w, q_t, k_t]
+    rel_q_t = torch.matmul(r_q, Rt.transpose(1, 2)).transpose(0, 1)
+    # [B*H*q_h*q_w, q_t, k_t] -> [B, H, q_t, q_h, q_w, k_t]
+    rel_q_t = rel_q_t.view(B, n_head, q_h, q_w, q_t, k_t).permute(0, 1, 4, 2, 3, 5)
+
+    # Combine rel pos: broadcast the h, w and t terms over a trailing (k_t, k_h, k_w) grid, then flatten.
+    rel_pos = (
+        rel_h_q[:, :, :, :, :, None, :, None]
+        + rel_w_q[:, :, :, :, :, None, None, :]
+        + rel_q_t[:, :, :, :, :, :, None, None]
+    ).reshape(B, n_head, q_t * q_h * q_w, k_t * k_h * k_w)
+
+    # Add it to attention (in place); row 0 and column 0 of the last two dims are left untouched.
+    attn[:, :, 1:, 1:] += rel_pos
+
+    return attn
+
+
+def _add_shortcut(x: torch.Tensor, shortcut: torch.Tensor, residual_with_cls_embed: bool):
+    """Add ``shortcut`` to ``x`` in place and return ``x``; without the flag, index 0 of dim 2 is left untouched."""
+    # Pick either the full tensors or views that skip index 0 of dim 2; ``add_`` on a view writes through to ``x``.
+    target, addend = (x, shortcut) if residual_with_cls_embed else (x[:, :, 1:, :], shortcut[:, :, 1:, :])
+    target.add_(addend)
+    return x
+
+
+torch.fx.wrap("_add_rel_pos")  # register by name so torch.fx symbolic tracing records a call instead of tracing inside
+torch.fx.wrap("_add_shortcut")  # same registration for the in-place residual helper
+
+
+class MultiscaleAttention(nn.Module):
+    def __init__(
+        self,
+        input_size: list[int],
+        embed_dim: int,
+        output_dim: int,
+        num_heads: int,
+        kernel_q: list[int],
+        kernel_kv: list[int],
+        stride_q: list[int],
+        stride_kv: list[int],
+        residual_pool: bool,
+        residual_with_cls_embed: bool,
+        rel_pos_embed: bool,
+        dropout: float = 0.0,
+        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
+    ) -> None:
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.output_dim = output_dim
+        self.num_heads = num_heads
+        self.head_dim = output_dim // num_heads
+        self.scaler = 1.0 / math.sqrt(self.head_dim)
+        self.residual_pool = residual_pool
+        self.residual_with_cls_embed = residual_with_cls_embed
+
+        self.qkv = nn.Linear(embed_dim, 3 * output_dim)
+        layers: list[nn.Module] = [nn.Linear(output_dim, output_dim)]
+        if dropout > 0.0:
+            layers.append(nn.Dropout(dropout, inplace=True))
+        self.project = nn.Sequential(*layers)
+
+        self.pool_q: Optional[nn.Module] = None
+        if _prod(kernel_q) > 1 or _prod(stride_q) > 1:
+            padding_q = [int(q // 2) for q in kernel_q]
+            self.pool_q = Pool(
+                nn.Conv3d(
+                    self.head_dim,
+                    self.head_dim,
+                    kernel_q,  # type: ignore[arg-type]
+                    stride=stride_q,  # type: ignore[arg-type]
+                    padding=padding_q,  # type: ignore[arg-type]
+                    groups=self.head_dim,
+                    bias=False,
+                ),
+                norm_layer(self.head_dim),
+            )
+
+        self.pool_k: Optional[nn.Module] = None
+        self.pool_v: Optional[nn.Module] = None
+        if _prod(kernel_kv) > 1 or _prod(stride_kv) > 1:
+            padding_kv = [int(kv // 2) for kv in kernel_kv]
+            self.pool_k = Pool(
+                nn.Conv3d(
+                    self.head_dim,
+                    self.head_dim,
+                    kernel_kv,  # type: ignore[arg-type]
+                    stride=stride_kv,  # type: ignore[arg-type]
+                    padding=padding_kv,  # type: ignore[arg-type]
+                    groups=self.head_dim,
+                    bias=False,
+                ),
+                norm_layer(self.head_dim),
+            )
+            self.pool_v = Pool(
+                nn.Conv3d(
+                    self.head_dim,
+                    self.head_dim,
+                    kernel_kv,  # type: ignore[arg-type]
+                    stride=stride_kv,  # type: ignore[arg-type]
+                    padding=padding_kv,  # type: ignore[arg-type]
+                    groups=self.head_dim,
+                    bias=False,
+                ),
+                norm_layer(self.head_dim),
+            )
+
+        self.rel_pos_h: Optional[nn.Parameter] = None
+        self.rel_pos_w: Optional[nn.Parameter] = None
+        self.rel_pos_t: Optional[nn.Parameter] = None
+        if rel_pos_embed:
+            size = max(input_size[1:])
+            q_size = size // stride_q[1] if len(stride_q) > 0 else size
+            kv_size = size // stride_kv[1] if len(stride_kv) > 0 else size
+            spatial_dim = 2 * max(q_size, kv_size) - 1
+            temporal_dim = 2 * input_size[0] - 1
+            self.rel_pos_h = nn.Parameter(torch.zeros(spatial_dim, self.head_dim))
+            self.rel_pos_w = nn.Parameter(torch.zeros(spatial_dim, self.head_dim))
+            self.rel_pos_t = nn.Parameter(torch.zeros(temporal_dim, self.head_dim))
+            nn.init.trunc_normal_(self.rel_pos_h, std=0.02)
+            nn.init.trunc_normal_(self.rel_pos_w, std=0.02)
+            nn.init.trunc_normal_(self.rel_pos_t, std=0.02)
+
+    def forward(self, x: torch.Tensor, thw: tuple[int, int, int]) -> tuple[torch.Tensor, tuple[int, int, int]]:
+        B, N, C = x.shape  # x must be rank 3; C is unpacked but not used below
+        q, k, v = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).transpose(1, 3).unbind(dim=2)
+        # q, k, v: each (B, num_heads, N, head_dim) after the transpose/unbind above
+        if self.pool_k is not None:
+            k, k_thw = self.pool_k(k, thw)
+        else:
+            k_thw = thw  # no key pooling: the key size is the incoming thw
+        if self.pool_v is not None:
+            v = self.pool_v(v, thw)[0]  # keep only the first returned item; the second is dropped
+        if self.pool_q is not None:
+            q, thw = self.pool_q(q, thw)  # thw is rebound here and is what forward() returns
+
+        attn = torch.matmul(self.scaler * q, k.transpose(2, 3))  # scaler is 1 / sqrt(head_dim)
+        if self.rel_pos_h is not None and self.rel_pos_w is not None and self.rel_pos_t is not None:
+            attn = _add_rel_pos(
+                attn,
+                q,
+                thw,
+                k_thw,
+                self.rel_pos_h,
+                self.rel_pos_w,
+                self.rel_pos_t,
+            )
+        attn = attn.softmax(dim=-1)
+
+        x = torch.matmul(attn, v)
+        if self.residual_pool:
+            _add_shortcut(x, q, self.residual_with_cls_embed)  # NOTE(review): result discarded; presumably mutates x in place - confirm
+        x = x.transpose(1, 2).reshape(B, -1, self.output_dim)
+        x = self.project(x)
+
+        return x, thw
+
+
+class MultiscaleBlock(nn.Module):
+    def __init__(
+        self,
+        input_size: list[int],
+        cnf: MSBlockConfig,
+        residual_pool: bool,
+        residual_with_cls_embed: bool,
+        rel_pos_embed: bool,
+        proj_after_attn: bool,
+        dropout: float = 0.0,
+        stochastic_depth_prob: float = 0.0,
+        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
+    ) -> None:
+        super().__init__()
+        self.proj_after_attn = proj_after_attn
+
+        self.pool_skip: Optional[nn.Module] = None
+        if _prod(cnf.stride_q) > 1:
+            kernel_skip = [s + 1 if s > 1 else s for s in cnf.stride_q]  # stride + 1 on strided axes, unchanged otherwise
+            padding_skip = [int(k // 2) for k in kernel_skip]
+            self.pool_skip = Pool(
+                nn.MaxPool3d(kernel_skip, stride=cnf.stride_q, padding=padding_skip), None  # type: ignore[arg-type]
+            )
+
+        attn_dim = cnf.output_channels if proj_after_attn else cnf.input_channels  # width produced by self.attn
+
+        self.norm1 = norm_layer(cnf.input_channels)
+        self.norm2 = norm_layer(attn_dim)
+        self.needs_transposal = isinstance(self.norm1, nn.BatchNorm1d)  # when True, forward() swaps dims 1 and 2 around each norm
+
+        self.attn = MultiscaleAttention(
+            input_size,
+            cnf.input_channels,
+            attn_dim,
+            cnf.num_heads,
+            kernel_q=cnf.kernel_q,
+            kernel_kv=cnf.kernel_kv,
+            stride_q=cnf.stride_q,
+            stride_kv=cnf.stride_kv,
+            rel_pos_embed=rel_pos_embed,
+            residual_pool=residual_pool,
+            residual_with_cls_embed=residual_with_cls_embed,
+            dropout=dropout,
+            norm_layer=norm_layer,
+        )
+        self.mlp = MLP(
+            attn_dim,
+            [4 * attn_dim, cnf.output_channels],
+            activation_layer=nn.GELU,
+            dropout=dropout,
+            inplace=None,
+        )
+
+        self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")
+
+        self.project: Optional[nn.Module] = None
+        if cnf.input_channels != cnf.output_channels:  # a linear projection exists only when the channel count changes
+            self.project = nn.Linear(cnf.input_channels, cnf.output_channels)
+
+    def forward(self, x: torch.Tensor, thw: tuple[int, int, int]) -> tuple[torch.Tensor, tuple[int, int, int]]:
+        x_norm1 = self.norm1(x.transpose(1, 2)).transpose(1, 2) if self.needs_transposal else self.norm1(x)
+        x_attn, thw_new = self.attn(x_norm1, thw)
+        x = x if self.project is None or not self.proj_after_attn else self.project(x_norm1)
+        x_skip = x if self.pool_skip is None else self.pool_skip(x, thw)[0]  # pool the skip path with the pre-attention thw
+        x = x_skip + self.stochastic_depth(x_attn)
+        # Second half: norm2 -> MLP; the channel projection is applied here only when proj_after_attn is False.
+        x_norm2 = self.norm2(x.transpose(1, 2)).transpose(1, 2) if self.needs_transposal else self.norm2(x)
+        x_proj = x if self.project is None or self.proj_after_attn else self.project(x_norm2)
+
+        return x_proj + self.stochastic_depth(self.mlp(x_norm2)), thw_new
+
+
+class PositionalEncoding(nn.Module):
+    def __init__(self, embed_size: int, spatial_size: tuple[int, int], temporal_size: int, rel_pos_embed: bool) -> None:
+        super().__init__()
+        self.spatial_size = spatial_size
+        self.temporal_size = temporal_size
+
+        self.class_token = nn.Parameter(torch.zeros(embed_size))
+        self.spatial_pos: Optional[nn.Parameter] = None
+        self.temporal_pos: Optional[nn.Parameter] = None
+        self.class_pos: Optional[nn.Parameter] = None
+        if not rel_pos_embed:  # the three position tables are created only when rel_pos_embed is False
+            self.spatial_pos = nn.Parameter(torch.zeros(self.spatial_size[0] * self.spatial_size[1], embed_size))
+            self.temporal_pos = nn.Parameter(torch.zeros(self.temporal_size, embed_size))
+            self.class_pos = nn.Parameter(torch.zeros(embed_size))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        class_token = self.class_token.expand(x.size(0), -1).unsqueeze(1)  # (embed,) -> (B, 1, embed)
+        x = torch.cat((class_token, x), dim=1)  # class token becomes index 0 along dim 1
+        # Position table: repeated temporal rows + tiled spatial rows, with class_pos prepended as row 0.
+        if self.spatial_pos is not None and self.temporal_pos is not None and self.class_pos is not None:
+            hw_size, embed_size = self.spatial_pos.shape
+            pos_embedding = torch.repeat_interleave(self.temporal_pos, hw_size, dim=0)  # each temporal row repeated hw_size times
+            pos_embedding.add_(self.spatial_pos.unsqueeze(0).expand(self.temporal_size, -1, -1).reshape(-1, embed_size))
+            pos_embedding = torch.cat((self.class_pos.unsqueeze(0), pos_embedding), dim=0).unsqueeze(0)
+            x.add_(pos_embedding)  # in-place add on the tensor produced by torch.cat above
+
+        return x
+
+
+class MViT(nn.Module):
+    def __init__(
+        self,
+        spatial_size: tuple[int, int],
+        temporal_size: int,
+        block_setting: Sequence[MSBlockConfig],
+        residual_pool: bool,
+        residual_with_cls_embed: bool,
+        rel_pos_embed: bool,
+        proj_after_attn: bool,
+        dropout: float = 0.5,
+        attention_dropout: float = 0.0,
+        stochastic_depth_prob: float = 0.0,
+        num_classes: int = 400,
+        block: Optional[Callable[..., nn.Module]] = None,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+        patch_embed_kernel: tuple[int, int, int] = (3, 7, 7),
+        patch_embed_stride: tuple[int, int, int] = (2, 4, 4),
+        patch_embed_padding: tuple[int, int, int] = (1, 3, 3),
+    ) -> None:
+        """
+        MViT main class.
+
+        Args:
+            spatial_size (tuple of ints): The spatial size of the input as ``(H, W)``.
+            temporal_size (int): The temporal size ``T`` of the input.
+            block_setting (sequence of MSBlockConfig): The Network structure.
+            residual_pool (bool): If True, use MViTv2 pooling residual connection.
+            residual_with_cls_embed (bool): If True, the addition on the residual connection will include
+                the class embedding.
+            rel_pos_embed (bool): If True, use MViTv2's relative positional embeddings.
+            proj_after_attn (bool): If True, apply the projection after the attention.
+            dropout (float): Dropout rate. Default: 0.5.
+            attention_dropout (float): Attention dropout rate. Default: 0.0.
+            stochastic_depth_prob: (float): Stochastic depth rate. Default: 0.0.
+            num_classes (int): The number of classes.
+            block (callable, optional): Module specifying the layer which consists of the attention and mlp.
+            norm_layer (callable, optional): Module specifying the normalization layer to use.
+            patch_embed_kernel (tuple of ints): The kernel of the convolution that patchifies the input.
+            patch_embed_stride (tuple of ints): The stride of the convolution that patchifies the input.
+            patch_embed_padding (tuple of ints): The padding of the convolution that patchifies the input.
+        """
+        super().__init__()
+        # This implementation employs a different parameterization scheme than the one used at PyTorch Video:
+        # https://github.com/facebookresearch/pytorchvideo/blob/718d0a4/pytorchvideo/models/vision_transformers.py
+        # We remove any experimental configuration that didn't make it to the final variants of the models. To represent
+        # the configuration of the architecture we use the simplified form suggested at Table 1 of the paper.
+        _log_api_usage_once(self)
+        total_stage_blocks = len(block_setting)
+        if total_stage_blocks == 0:
+            raise ValueError("The configuration parameter can't be empty.")
+
+        if block is None:
+            block = MultiscaleBlock
+
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=1e-6)
+
+        # Patch Embedding module
+        self.conv_proj = nn.Conv3d(
+            in_channels=3,
+            out_channels=block_setting[0].input_channels,
+            kernel_size=patch_embed_kernel,
+            stride=patch_embed_stride,
+            padding=patch_embed_padding,
+        )
+        # Unpacking (instead of tuple concatenation) also accepts a list-typed spatial_size.
+        input_size = [size // stride for size, stride in zip((temporal_size, *spatial_size), self.conv_proj.stride)]
+
+        # Spatio-Temporal Class Positional Encoding
+        self.pos_encoding = PositionalEncoding(
+            embed_size=block_setting[0].input_channels,
+            spatial_size=(input_size[1], input_size[2]),
+            temporal_size=input_size[0],
+            rel_pos_embed=rel_pos_embed,
+        )
+
+        # Encoder module
+        self.blocks = nn.ModuleList()
+        for stage_block_id, cnf in enumerate(block_setting):
+            # scale stochastic depth linearly with block depth; max() avoids 0/0 for a single-block config
+            sd_prob = stochastic_depth_prob * stage_block_id / max(total_stage_blocks - 1.0, 1.0)
+
+            self.blocks.append(
+                block(
+                    input_size=input_size,
+                    cnf=cnf,
+                    residual_pool=residual_pool,
+                    residual_with_cls_embed=residual_with_cls_embed,
+                    rel_pos_embed=rel_pos_embed,
+                    proj_after_attn=proj_after_attn,
+                    dropout=attention_dropout,
+                    stochastic_depth_prob=sd_prob,
+                    norm_layer=norm_layer,
+                )
+            )
+
+            if len(cnf.stride_q) > 0:
+                input_size = [size // stride for size, stride in zip(input_size, cnf.stride_q)]
+        self.norm = norm_layer(block_setting[-1].output_channels)
+
+        # Classifier module
+        self.head = nn.Sequential(
+            nn.Dropout(dropout, inplace=True),
+            nn.Linear(block_setting[-1].output_channels, num_classes),
+        )
+
+        for m in self.modules():
+            if isinstance(m, nn.Linear):
+                nn.init.trunc_normal_(m.weight, std=0.02)
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0.0)
+            elif isinstance(m, nn.LayerNorm):
+                if m.weight is not None:
+                    nn.init.constant_(m.weight, 1.0)
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0.0)
+            elif isinstance(m, PositionalEncoding):
+                for weights in m.parameters():
+                    nn.init.trunc_normal_(weights, std=0.02)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Convert if necessary (B, C, H, W) -> (B, C, 1, H, W)
+        x = _unsqueeze(x, 5, 2)[0]
+        # patchify and reshape: (B, C, T, H, W) -> (B, embed_channels[0], T', H', W') -> (B, THW', embed_channels[0])
+        x = self.conv_proj(x)
+        x = x.flatten(2).transpose(1, 2)
+
+        # add positional encoding
+        x = self.pos_encoding(x)
+
+        # pass patches through the encoder
+        thw = (self.pos_encoding.temporal_size,) + self.pos_encoding.spatial_size
+        for block in self.blocks:
+            x, thw = block(x, thw)
+        x = self.norm(x)
+
+        # classifier "token" as used by standard language architectures
+        x = x[:, 0]
+        x = self.head(x)
+
+        return x
+
+
+def _mvit(
+    block_setting: list[MSBlockConfig],
+    stochastic_depth_prob: float,
+    weights: Optional[WeightsEnum],
+    progress: bool,
+    **kwargs: Any,
+) -> MViT:
+    if weights is not None:  # NOTE(review): _ovewrite_named_param presumably pins kwargs to the checkpoint meta - confirm
+        _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+        assert weights.meta["min_size"][0] == weights.meta["min_size"][1]
+        _ovewrite_named_param(kwargs, "spatial_size", weights.meta["min_size"])
+        _ovewrite_named_param(kwargs, "temporal_size", weights.meta["min_temporal_size"])
+    spatial_size = kwargs.pop("spatial_size", (224, 224))  # popped so **kwargs below does not pass it a second time
+    temporal_size = kwargs.pop("temporal_size", 16)
+    # Options read via kwargs.pop() fall back to the defaults shown; whatever is left in kwargs is forwarded to MViT.
+    model = MViT(
+        spatial_size=spatial_size,
+        temporal_size=temporal_size,
+        block_setting=block_setting,
+        residual_pool=kwargs.pop("residual_pool", False),
+        residual_with_cls_embed=kwargs.pop("residual_with_cls_embed", True),
+        rel_pos_embed=kwargs.pop("rel_pos_embed", False),
+        proj_after_attn=kwargs.pop("proj_after_attn", False),
+        stochastic_depth_prob=stochastic_depth_prob,
+        **kwargs,
+    )
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+
+    return model
+
+
+class MViT_V1_B_Weights(WeightsEnum):
+    KINETICS400_V1 = Weights(
+        url="https://download.pytorch.org/models/mvit_v1_b-dbeb1030.pth",
+        transforms=partial(
+            VideoClassification,
+            crop_size=(224, 224),
+            resize_size=(256,),
+            mean=(0.45, 0.45, 0.45),
+            std=(0.225, 0.225, 0.225),
+        ),
+        meta={
+            "min_size": (224, 224),
+            "min_temporal_size": 16,
+            "categories": _KINETICS400_CATEGORIES,
+            "recipe": "https://github.com/facebookresearch/pytorchvideo/blob/main/docs/source/model_zoo.md",
+            "_docs": (
+                "The weights were ported from the paper. The accuracies are estimated on video-level "
+                "with parameters `frame_rate=7.5`, `clips_per_video=5`, and `clip_len=16`"
+            ),
+            "num_params": 36610672,
+            "_metrics": {
+                "Kinetics-400": {
+                    "acc@1": 78.477,
+                    "acc@5": 93.582,
+                }
+            },
+            "_ops": 70.599,
+            "_file_size": 139.764,
+        },
+    )
+    DEFAULT = KINETICS400_V1
+
+
+class MViT_V2_S_Weights(WeightsEnum):
+    KINETICS400_V1 = Weights(
+        url="https://download.pytorch.org/models/mvit_v2_s-ae3be167.pth",
+        transforms=partial(
+            VideoClassification,
+            crop_size=(224, 224),
+            resize_size=(256,),
+            mean=(0.45, 0.45, 0.45),
+            std=(0.225, 0.225, 0.225),
+        ),
+        meta={
+            "min_size": (224, 224),
+            "min_temporal_size": 16,
+            "categories": _KINETICS400_CATEGORIES,
+            "recipe": "https://github.com/facebookresearch/SlowFast/blob/main/MODEL_ZOO.md",
+            "_docs": (
+                "The weights were ported from the paper. The accuracies are estimated on video-level "
+                "with parameters `frame_rate=7.5`, `clips_per_video=5`, and `clip_len=16`"
+            ),
+            "num_params": 34537744,
+            "_metrics": {
+                "Kinetics-400": {
+                    "acc@1": 80.757,
+                    "acc@5": 94.665,
+                }
+            },
+            "_ops": 64.224,
+            "_file_size": 131.884,
+        },
+    )
+    DEFAULT = KINETICS400_V1
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", MViT_V1_B_Weights.KINETICS400_V1))
+def mvit_v1_b(*, weights: Optional[MViT_V1_B_Weights] = None, progress: bool = True, **kwargs: Any) -> MViT:
+    """
+    Constructs a base MViTV1 architecture from
+    `Multiscale Vision Transformers <https://arxiv.org/abs/2104.11227>`__.
+
+    .. betastatus:: video module
+
+    Args:
+        weights (:class:`~torchvision.models.video.MViT_V1_B_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.video.MViT_V1_B_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.video.MViT``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/mvit.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.video.MViT_V1_B_Weights
+        :members:
+    """
+    weights = MViT_V1_B_Weights.verify(weights)
+
+    config: dict[str, list] = {
+        "num_heads": [1, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8],
+        "input_channels": [96, 192, 192, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 768, 768],
+        "output_channels": [192, 192, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 768, 768, 768],
+        "kernel_q": [[], [3, 3, 3], [], [3, 3, 3], [], [], [], [], [], [], [], [], [], [], [3, 3, 3], []],
+        "kernel_kv": [
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+        ],
+        "stride_q": [[], [1, 2, 2], [], [1, 2, 2], [], [], [], [], [], [], [], [], [], [], [1, 2, 2], []],
+        "stride_kv": [
+            [1, 8, 8],
+            [1, 4, 4],
+            [1, 4, 4],
+            [1, 2, 2],
+            [1, 2, 2],
+            [1, 2, 2],
+            [1, 2, 2],
+            [1, 2, 2],
+            [1, 2, 2],
+            [1, 2, 2],
+            [1, 2, 2],
+            [1, 2, 2],
+            [1, 2, 2],
+            [1, 2, 2],
+            [1, 1, 1],
+            [1, 1, 1],
+        ],
+    }
+
+    block_setting = []
+    for i in range(len(config["num_heads"])):
+        block_setting.append(
+            MSBlockConfig(
+                num_heads=config["num_heads"][i],
+                input_channels=config["input_channels"][i],
+                output_channels=config["output_channels"][i],
+                kernel_q=config["kernel_q"][i],
+                kernel_kv=config["kernel_kv"][i],
+                stride_q=config["stride_q"][i],
+                stride_kv=config["stride_kv"][i],
+            )
+        )
+
+    return _mvit(
+        spatial_size=(224, 224),
+        temporal_size=16,
+        block_setting=block_setting,
+        residual_pool=False,
+        residual_with_cls_embed=False,
+        stochastic_depth_prob=kwargs.pop("stochastic_depth_prob", 0.2),
+        weights=weights,
+        progress=progress,
+        **kwargs,
+    )
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", MViT_V2_S_Weights.KINETICS400_V1))
+def mvit_v2_s(*, weights: Optional[MViT_V2_S_Weights] = None, progress: bool = True, **kwargs: Any) -> MViT:
+    """Constructs a small MViTV2 architecture from
+    `Multiscale Vision Transformers <https://arxiv.org/abs/2104.11227>`__ and
+    `MViTv2: Improved Multiscale Vision Transformers for Classification
+    and Detection <https://arxiv.org/abs/2112.01526>`__.
+
+    .. betastatus:: video module
+
+    Args:
+        weights (:class:`~torchvision.models.video.MViT_V2_S_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.video.MViT_V2_S_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.video.MViT``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/mvit.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.video.MViT_V2_S_Weights
+            :members:
+    """
+    weights = MViT_V2_S_Weights.verify(weights)
+
+    config: dict[str, list] = {
+        "num_heads": [1, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8],
+        "input_channels": [96, 96, 192, 192, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 768],
+        "output_channels": [96, 192, 192, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 768, 768],
+        "kernel_q": [
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+        ],
+        "kernel_kv": [
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+            [3, 3, 3],
+        ],
+        "stride_q": [
+            [1, 1, 1],
+            [1, 2, 2],
+            [1, 1, 1],
+            [1, 2, 2],
+            [1, 1, 1],
+            [1, 1, 1],
+            [1, 1, 1],
+            [1, 1, 1],
+            [1, 1, 1],
+            [1, 1, 1],
+            [1, 1, 1],
+            [1, 1, 1],
+            [1, 1, 1],
+            [1, 1, 1],
+            [1, 2, 2],
+            [1, 1, 1],
+        ],
+        "stride_kv": [
+            [1, 8, 8],
+            [1, 4, 4],
+            [1, 4, 4],
+            [1, 2, 2],
+            [1, 2, 2],
+            [1, 2, 2],
+            [1, 2, 2],
+            [1, 2, 2],
+            [1, 2, 2],
+            [1, 2, 2],
+            [1, 2, 2],
+            [1, 2, 2],
+            [1, 2, 2],
+            [1, 2, 2],
+            [1, 1, 1],
+            [1, 1, 1],
+        ],
+    }
+
+    block_setting = []
+    for i in range(len(config["num_heads"])):
+        block_setting.append(
+            MSBlockConfig(
+                num_heads=config["num_heads"][i],
+                input_channels=config["input_channels"][i],
+                output_channels=config["output_channels"][i],
+                kernel_q=config["kernel_q"][i],
+                kernel_kv=config["kernel_kv"][i],
+                stride_q=config["stride_q"][i],
+                stride_kv=config["stride_kv"][i],
+            )
+        )
+
+    return _mvit(
+        spatial_size=(224, 224),
+        temporal_size=16,
+        block_setting=block_setting,
+        residual_pool=True,
+        residual_with_cls_embed=False,
+        rel_pos_embed=True,
+        proj_after_attn=True,
+        stochastic_depth_prob=kwargs.pop("stochastic_depth_prob", 0.2),
+        weights=weights,
+        progress=progress,
+        **kwargs,
+    )
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/resnet.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..43b0df48ffe35c055e63362031088d18c24a2dbe
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/resnet.py
@@ -0,0 +1,504 @@
+from collections.abc import Sequence
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch.nn as nn
+from torch import Tensor
+
+from ...transforms._presets import VideoClassification
+from ...utils import _log_api_usage_once
+from .._api import register_model, Weights, WeightsEnum
+from .._meta import _KINETICS400_CATEGORIES
+from .._utils import _ovewrite_named_param, handle_legacy_interface
+
+
+__all__ = [
+    "VideoResNet",
+    "R3D_18_Weights",
+    "MC3_18_Weights",
+    "R2Plus1D_18_Weights",
+    "r3d_18",
+    "mc3_18",
+    "r2plus1d_18",
+]
+
+
+class Conv3DSimple(nn.Conv3d):
+    def __init__(
+        self, in_planes: int, out_planes: int, midplanes: Optional[int] = None, stride: int = 1, padding: int = 1
+    ) -> None:
+        # midplanes is accepted but never read in this class.
+        super().__init__(
+            in_channels=in_planes,
+            out_channels=out_planes,
+            kernel_size=(3, 3, 3),
+            stride=stride,
+            padding=padding,
+            bias=False,
+        )
+
+    @staticmethod
+    def get_downsample_stride(stride: int) -> tuple[int, int, int]:
+        return stride, stride, stride  # same stride on all three dims
+
+
+class Conv2Plus1D(nn.Sequential):
+    def __init__(self, in_planes: int, out_planes: int, midplanes: int, stride: int = 1, padding: int = 1) -> None:
+        super().__init__(
+            nn.Conv3d(  # 1x3x3 kernel, strided on the last two dims: in_planes -> midplanes
+                in_planes,
+                midplanes,
+                kernel_size=(1, 3, 3),
+                stride=(1, stride, stride),
+                padding=(0, padding, padding),
+                bias=False,
+            ),
+            nn.BatchNorm3d(midplanes),
+            nn.ReLU(inplace=True),
+            nn.Conv3d(  # 3x1x1 kernel, strided on the first kernel dim: midplanes -> out_planes
+                midplanes, out_planes, kernel_size=(3, 1, 1), stride=(stride, 1, 1), padding=(padding, 0, 0), bias=False
+            ),
+        )
+
+    @staticmethod
+    def get_downsample_stride(stride: int) -> tuple[int, int, int]:
+        return stride, stride, stride  # same stride on all three dims
+
+
+class Conv3DNoTemporal(nn.Conv3d):
+    def __init__(
+        self, in_planes: int, out_planes: int, midplanes: Optional[int] = None, stride: int = 1, padding: int = 1
+    ) -> None:
+        # midplanes is accepted but never read in this class.
+        super().__init__(
+            in_channels=in_planes,
+            out_channels=out_planes,
+            kernel_size=(1, 3, 3),
+            stride=(1, stride, stride),
+            padding=(0, padding, padding),
+            bias=False,
+        )
+
+    @staticmethod
+    def get_downsample_stride(stride: int) -> tuple[int, int, int]:
+        return 1, stride, stride  # first dim keeps stride 1, matching the conv's own stride tuple
+
+
+class BasicBlock(nn.Module):
+    # Two conv_builder stages followed by a residual add and ReLU.
+    expansion = 1
+
+    def __init__(
+        self,
+        inplanes: int,
+        planes: int,
+        conv_builder: Callable[..., nn.Module],
+        stride: int = 1,
+        downsample: Optional[nn.Module] = None,
+    ) -> None:
+        midplanes = (inplanes * planes * 3 * 3 * 3) // (inplanes * 3 * 3 + 3 * planes)
+        # midplanes is handed to conv_builder as its third positional argument.
+        super().__init__()
+        self.conv1 = nn.Sequential(
+            conv_builder(inplanes, planes, midplanes, stride), nn.BatchNorm3d(planes), nn.ReLU(inplace=True)
+        )
+        self.conv2 = nn.Sequential(conv_builder(planes, planes, midplanes), nn.BatchNorm3d(planes))
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x: Tensor) -> Tensor:
+        residual = x
+
+        out = self.conv1(x)
+        out = self.conv2(out)
+        if self.downsample is not None:
+            residual = self.downsample(x)  # identity path goes through the supplied downsample module
+
+        out += residual  # in-place add onto the conv2 output
+        out = self.relu(out)
+
+        return out
+
+
+class Bottleneck(nn.Module):
+    expansion = 4  # conv3 outputs planes * expansion channels
+
+    def __init__(
+        self,
+        inplanes: int,
+        planes: int,
+        conv_builder: Callable[..., nn.Module],
+        stride: int = 1,
+        downsample: Optional[nn.Module] = None,
+    ) -> None:
+
+        super().__init__()
+        midplanes = (inplanes * planes * 3 * 3 * 3) // (inplanes * 3 * 3 + 3 * planes)  # passed to conv_builder below
+
+        # 1x1x1
+        self.conv1 = nn.Sequential(
+            nn.Conv3d(inplanes, planes, kernel_size=1, bias=False), nn.BatchNorm3d(planes), nn.ReLU(inplace=True)
+        )
+        # Second kernel
+        self.conv2 = nn.Sequential(
+            conv_builder(planes, planes, midplanes, stride), nn.BatchNorm3d(planes), nn.ReLU(inplace=True)
+        )
+
+        # 1x1x1
+        self.conv3 = nn.Sequential(
+            nn.Conv3d(planes, planes * self.expansion, kernel_size=1, bias=False),
+            nn.BatchNorm3d(planes * self.expansion),
+        )
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x: Tensor) -> Tensor:
+        residual = x
+
+        out = self.conv1(x)
+        out = self.conv2(out)
+        out = self.conv3(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)  # identity path goes through the supplied downsample module
+
+        out += residual  # in-place add onto the conv3 output
+        out = self.relu(out)
+
+        return out
+
+
+class BasicStem(nn.Sequential):
+    """The default conv-batchnorm-relu stem"""
+
+    def __init__(self) -> None:
+        super().__init__(
+            nn.Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False),
+            nn.BatchNorm3d(64),
+            nn.ReLU(inplace=True),
+        )
+
+
+class R2Plus1dStem(nn.Sequential):
+    """R(2+1)D stem is different than the default one as it uses separated 3D convolution"""
+
+    def __init__(self) -> None:
+        super().__init__(
+            nn.Conv3d(3, 45, kernel_size=(1, 7, 7), stride=(1, 2, 2), padding=(0, 3, 3), bias=False),
+            nn.BatchNorm3d(45),
+            nn.ReLU(inplace=True),
+            nn.Conv3d(45, 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False),
+            nn.BatchNorm3d(64),
+            nn.ReLU(inplace=True),
+        )
+
+
+class VideoResNet(nn.Module):
+    def __init__(
+        self,
+        block: type[Union[BasicBlock, Bottleneck]],
+        conv_makers: Sequence[type[Union[Conv3DSimple, Conv3DNoTemporal, Conv2Plus1D]]],
+        layers: list[int],
+        stem: Callable[..., nn.Module],
+        num_classes: int = 400,
+        zero_init_residual: bool = False,
+    ) -> None:
+        """Generic resnet video generator.
+
+        Args:
+            block (Type[Union[BasicBlock, Bottleneck]]): resnet building block
+            conv_makers (List[Type[Union[Conv3DSimple, Conv3DNoTemporal, Conv2Plus1D]]]): generator
+                function for each layer
+            layers (List[int]): number of blocks per layer
+            stem (Callable[..., nn.Module]): module specifying the ResNet stem.
+            num_classes (int, optional): Dimension of the final FC layer. Defaults to 400.
+            zero_init_residual (bool, optional): Zero init bottleneck residual BN. Defaults to False.
+        """
+        super().__init__()
+        _log_api_usage_once(self)
+        self.inplanes = 64
+
+        self.stem = stem()
+
+        self.layer1 = self._make_layer(block, conv_makers[0], 64, layers[0], stride=1)
+        self.layer2 = self._make_layer(block, conv_makers[1], 128, layers[1], stride=2)
+        self.layer3 = self._make_layer(block, conv_makers[2], 256, layers[2], stride=2)
+        self.layer4 = self._make_layer(block, conv_makers[3], 512, layers[3], stride=2)
+
+        self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
+        self.fc = nn.Linear(512 * block.expansion, num_classes)
+
+        # init weights
+        for m in self.modules():
+            if isinstance(m, nn.Conv3d):
+                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.BatchNorm3d):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.Linear):
+                nn.init.normal_(m.weight, 0, 0.01)
+                nn.init.constant_(m.bias, 0)
+        # Bottleneck keeps its last BatchNorm3d at conv3[1]; it has no bn3 attribute, so index into conv3.
+        if zero_init_residual:
+            for m in self.modules():
+                if isinstance(m, Bottleneck):
+                    nn.init.constant_(m.conv3[1].weight, 0)  # type: ignore[union-attr, arg-type]
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = self.stem(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+
+        x = self.avgpool(x)
+        # Flatten the layer to fc
+        x = x.flatten(1)
+        x = self.fc(x)
+
+        return x
+
+    def _make_layer(
+        self,
+        block: type[Union[BasicBlock, Bottleneck]],
+        conv_builder: type[Union[Conv3DSimple, Conv3DNoTemporal, Conv2Plus1D]],
+        planes: int,
+        blocks: int,
+        stride: int = 1,
+    ) -> nn.Sequential:
+        downsample = None
+
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            ds_stride = conv_builder.get_downsample_stride(stride)
+            downsample = nn.Sequential(
+                nn.Conv3d(self.inplanes, planes * block.expansion, kernel_size=1, stride=ds_stride, bias=False),
+                nn.BatchNorm3d(planes * block.expansion),
+            )
+        layers = []
+        layers.append(block(self.inplanes, planes, conv_builder, stride, downsample))
+
+        self.inplanes = planes * block.expansion
+        for i in range(1, blocks):
+            layers.append(block(self.inplanes, planes, conv_builder))
+
+        return nn.Sequential(*layers)
+
+
+def _video_resnet(
+    block: type[Union[BasicBlock, Bottleneck]],
+    conv_makers: Sequence[type[Union[Conv3DSimple, Conv3DNoTemporal, Conv2Plus1D]]],
+    layers: list[int],
+    stem: Callable[..., nn.Module],
+    weights: Optional[WeightsEnum],
+    progress: bool,
+    **kwargs: Any,
+) -> VideoResNet:
+    """Build a VideoResNet and, when ``weights`` is given, load that checkpoint's state dict into it."""
+    if weights is None:
+        return VideoResNet(block, conv_makers, layers, stem, **kwargs)
+    # presumably pins num_classes to the checkpoint's category count -- confirm in _ovewrite_named_param
+    _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+    model = VideoResNet(block, conv_makers, layers, stem, **kwargs)
+    state_dict = weights.get_state_dict(progress=progress, check_hash=True)
+    model.load_state_dict(state_dict)
+    return model
+
+
+_COMMON_META = {  # entries merged (via ``**``) into the ``meta`` dict of the weights entries below
+    "min_size": (1, 1),
+    "categories": _KINETICS400_CATEGORIES,
+    "recipe": "https://github.com/pytorch/vision/tree/main/references/video_classification",
+    "_docs": (
+        "The weights reproduce closely the accuracy of the paper. The accuracies are estimated on video-level "
+        "with parameters `frame_rate=15`, `clips_per_video=5`, and `clip_len=16`."
+    ),
+}
+
+
+class R3D_18_Weights(WeightsEnum):  # weights accepted by the ``r3d_18`` builder
+    KINETICS400_V1 = Weights(
+        url="https://download.pytorch.org/models/r3d_18-b3b3357e.pth",
+        transforms=partial(VideoClassification, crop_size=(112, 112), resize_size=(128, 171)),
+        meta={
+            **_COMMON_META,  # shared entries; the keys below are specific to this checkpoint
+            "num_params": 33371472,
+            "_metrics": {
+                "Kinetics-400": {
+                    "acc@1": 63.200,
+                    "acc@5": 83.479,
+                }
+            },
+            "_ops": 40.697,
+            "_file_size": 127.359,
+        },
+    )
+    DEFAULT = KINETICS400_V1  # alias of the single entry defined above
+
+
+class MC3_18_Weights(WeightsEnum):  # weights accepted by the ``mc3_18`` builder
+    KINETICS400_V1 = Weights(
+        url="https://download.pytorch.org/models/mc3_18-a90a0ba3.pth",
+        transforms=partial(VideoClassification, crop_size=(112, 112), resize_size=(128, 171)),
+        meta={
+            **_COMMON_META,  # shared entries; the keys below are specific to this checkpoint
+            "num_params": 11695440,
+            "_metrics": {
+                "Kinetics-400": {
+                    "acc@1": 63.960,
+                    "acc@5": 84.130,
+                }
+            },
+            "_ops": 43.343,
+            "_file_size": 44.672,
+        },
+    )
+    DEFAULT = KINETICS400_V1  # alias of the single entry defined above
+
+
+class R2Plus1D_18_Weights(WeightsEnum):  # weights accepted by the ``r2plus1d_18`` builder
+    KINETICS400_V1 = Weights(
+        url="https://download.pytorch.org/models/r2plus1d_18-91a641e6.pth",
+        transforms=partial(VideoClassification, crop_size=(112, 112), resize_size=(128, 171)),
+        meta={
+            **_COMMON_META,  # shared entries; the keys below are specific to this checkpoint
+            "num_params": 31505325,
+            "_metrics": {
+                "Kinetics-400": {
+                    "acc@1": 67.463,
+                    "acc@5": 86.175,
+                }
+            },
+            "_ops": 40.519,
+            "_file_size": 120.318,
+        },
+    )
+    DEFAULT = KINETICS400_V1  # alias of the single entry defined above
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", R3D_18_Weights.KINETICS400_V1))
+def r3d_18(*, weights: Optional[R3D_18_Weights] = None, progress: bool = True, **kwargs: Any) -> VideoResNet:
+    """Construct the 18-layer ResNet3D model.
+
+    .. betastatus:: video module
+
+    Reference: `A Closer Look at Spatiotemporal Convolutions for Action Recognition <https://arxiv.org/abs/1711.11248>`__.
+
+    Args:
+        weights (:class:`~torchvision.models.video.R3D_18_Weights`, optional): The
+            pretrained weights to load. See
+            :class:`~torchvision.models.video.R3D_18_Weights`
+            below for the available values. When ``None`` (the default), no
+            pre-trained weights are loaded.
+        progress (bool): If True, displays a progress bar of the download to stderr. Default is True.
+        **kwargs: extra parameters forwarded to the ``torchvision.models.video.resnet.VideoResNet``
+            constructor. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/resnet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.video.R3D_18_Weights
+        :members:
+    """
+    verified = R3D_18_Weights.verify(weights)
+    # All four stages use the Conv3DSimple builder, with two BasicBlocks per stage.
+    return _video_resnet(
+        block=BasicBlock,
+        conv_makers=[Conv3DSimple] * 4,
+        layers=[2, 2, 2, 2],
+        stem=BasicStem,
+        weights=verified,
+        progress=progress,
+        **kwargs,
+    )
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", MC3_18_Weights.KINETICS400_V1))
+def mc3_18(*, weights: Optional[MC3_18_Weights] = None, progress: bool = True, **kwargs: Any) -> VideoResNet:
+    """Construct the 18-layer Mixed Convolution (MC3) network.
+
+    .. betastatus:: video module
+
+    Reference: `A Closer Look at Spatiotemporal Convolutions for Action Recognition <https://arxiv.org/abs/1711.11248>`__.
+
+    Args:
+        weights (:class:`~torchvision.models.video.MC3_18_Weights`, optional): The
+            pretrained weights to load. See
+            :class:`~torchvision.models.video.MC3_18_Weights`
+            below for the available values. When ``None`` (the default), no
+            pre-trained weights are loaded.
+        progress (bool): If True, displays a progress bar of the download to stderr. Default is True.
+        **kwargs: extra parameters forwarded to the ``torchvision.models.video.resnet.VideoResNet``
+            constructor. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/resnet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.video.MC3_18_Weights
+        :members:
+    """
+    verified = MC3_18_Weights.verify(weights)
+    # Stage 1 uses the Conv3DSimple builder; the remaining three stages use Conv3DNoTemporal.
+    return _video_resnet(
+        block=BasicBlock,
+        conv_makers=[Conv3DSimple] + [Conv3DNoTemporal] * 3,  # type: ignore[list-item]
+        layers=[2, 2, 2, 2],
+        stem=BasicStem,
+        weights=verified,
+        progress=progress,
+        **kwargs,
+    )
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", R2Plus1D_18_Weights.KINETICS400_V1))
+def r2plus1d_18(*, weights: Optional[R2Plus1D_18_Weights] = None, progress: bool = True, **kwargs: Any) -> VideoResNet:
+    """Construct the 18-layer deep R(2+1)D network.
+
+    .. betastatus:: video module
+
+    Reference: `A Closer Look at Spatiotemporal Convolutions for Action Recognition <https://arxiv.org/abs/1711.11248>`__.
+
+    Args:
+        weights (:class:`~torchvision.models.video.R2Plus1D_18_Weights`, optional): The
+            pretrained weights to load. See
+            :class:`~torchvision.models.video.R2Plus1D_18_Weights`
+            below for the available values. When ``None`` (the default), no
+            pre-trained weights are loaded.
+        progress (bool): If True, displays a progress bar of the download to stderr. Default is True.
+        **kwargs: extra parameters forwarded to the ``torchvision.models.video.resnet.VideoResNet``
+            constructor. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/resnet.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.video.R2Plus1D_18_Weights
+        :members:
+    """
+    verified = R2Plus1D_18_Weights.verify(weights)
+    # All four stages use the Conv2Plus1D builder, paired with the R2Plus1dStem stem.
+    return _video_resnet(
+        block=BasicBlock,
+        conv_makers=[Conv2Plus1D] * 4,
+        layers=[2, 2, 2, 2],
+        stem=R2Plus1dStem,
+        weights=verified,
+        progress=progress,
+        **kwargs,
+    )
+
+
+# The dictionary below is an internal implementation detail and will be removed in v0.15
+from .._utils import _ModelURLs
+
+
+model_urls = _ModelURLs(
+    {
+        "r3d_18": R3D_18_Weights.KINETICS400_V1.url,  # builder name -> URL of its KINETICS400_V1 checkpoint
+        "mc3_18": MC3_18_Weights.KINETICS400_V1.url,
+        "r2plus1d_18": R2Plus1D_18_Weights.KINETICS400_V1.url,
+    }
+)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/s3d.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/s3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b202829b24fb1dc314452d38a521dfe6c8e446f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/s3d.py
@@ -0,0 +1,219 @@
+from functools import partial
+from typing import Any, Callable, Optional
+
+import torch
+from torch import nn
+from torchvision.ops.misc import Conv3dNormActivation
+
+from ...transforms._presets import VideoClassification
+from ...utils import _log_api_usage_once
+from .._api import register_model, Weights, WeightsEnum
+from .._meta import _KINETICS400_CATEGORIES
+from .._utils import _ovewrite_named_param, handle_legacy_interface
+
+
+__all__ = [
+    "S3D",
+    "S3D_Weights",
+    "s3d",
+]
+
+
+class TemporalSeparableConv(nn.Sequential):
+    """Separable 3D convolution: a spatial (1, k, k) conv block followed by a temporal (k, 1, 1) one."""
+
+    def __init__(
+        self,
+        in_planes: int,
+        out_planes: int,
+        kernel_size: int,
+        stride: int,
+        padding: int,
+        norm_layer: Callable[..., nn.Module],
+    ):
+        shared = {"bias": False, "norm_layer": norm_layer}
+        spatial = Conv3dNormActivation(
+            in_planes,
+            out_planes,
+            kernel_size=(1, kernel_size, kernel_size),
+            stride=(1, stride, stride),
+            padding=(0, padding, padding),
+            **shared,
+        )
+        temporal = Conv3dNormActivation(
+            out_planes,
+            out_planes,
+            kernel_size=(kernel_size, 1, 1),
+            stride=(stride, 1, 1),
+            padding=(padding, 0, 0),
+            **shared,
+        )
+        super().__init__(spatial, temporal)
+
+
+class SepInceptionBlock3D(nn.Module):
+    """Inception-style block: four parallel branches whose outputs are concatenated along dim 1."""
+
+    def __init__(
+        self,
+        in_planes: int,
+        b0_out: int,
+        b1_mid: int,
+        b1_out: int,
+        b2_mid: int,
+        b2_out: int,
+        b3_out: int,
+        norm_layer: Callable[..., nn.Module],
+    ):
+        super().__init__()
+        # Shorthands: every 1x1x1 conv and every separable 3x3x3 conv in the block shares these settings.
+        pointwise = partial(Conv3dNormActivation, kernel_size=1, stride=1, norm_layer=norm_layer)
+        separable = partial(TemporalSeparableConv, kernel_size=3, stride=1, padding=1, norm_layer=norm_layer)
+
+        # branch0 .. branch3 are built in index order.
+        self.branch0 = pointwise(in_planes, b0_out)
+        self.branch1 = nn.Sequential(pointwise(in_planes, b1_mid), separable(b1_mid, b1_out))
+        self.branch2 = nn.Sequential(pointwise(in_planes, b2_mid), separable(b2_mid, b2_out))
+        # The last branch pools first (stride 1, padding 1) and then projects with a 1x1x1 conv.
+        pool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=1, padding=1)
+        self.branch3 = nn.Sequential(pool, pointwise(in_planes, b3_out))
+
+    def forward(self, x):
+        """Run the four branches on ``x`` and concatenate their outputs along dim 1."""
+        branch_outputs = (
+            self.branch0(x),
+            self.branch1(x),
+            self.branch2(x),
+            self.branch3(x),
+        )
+        return torch.cat(branch_outputs, 1)
+
+
+class S3D(nn.Module):
+    """S3D main class.
+
+    Args:
+        num_classes (int): number of classes for the classification task.
+        dropout (float): dropout probability.
+        norm_layer (Optional[Callable]): Module specifying the normalization layer to use.
+
+    Inputs:
+        x (Tensor): batch of videos with dimensions (batch, channel, time, height, width)
+    """
+
+    def __init__(
+        self,
+        num_classes: int = 400,
+        dropout: float = 0.2,
+        norm_layer: Optional[Callable[..., torch.nn.Module]] = None,
+    ) -> None:
+        super().__init__()
+        _log_api_usage_once(self)
+
+        if norm_layer is None:  # default normalization: BatchNorm3d with eps=0.001 and momentum=0.001
+            norm_layer = partial(nn.BatchNorm3d, eps=0.001, momentum=0.001)
+
+        self.features = nn.Sequential(  # each block's in_planes equals the channel count produced just before it
+            TemporalSeparableConv(3, 64, 7, 2, 3, norm_layer),
+            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)),
+            Conv3dNormActivation(
+                64,
+                64,
+                kernel_size=1,
+                stride=1,
+                norm_layer=norm_layer,
+            ),
+            TemporalSeparableConv(64, 192, 3, 1, 1, norm_layer),
+            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)),
+            SepInceptionBlock3D(192, 64, 96, 128, 16, 32, 32, norm_layer),
+            SepInceptionBlock3D(256, 128, 128, 192, 32, 96, 64, norm_layer),
+            nn.MaxPool3d(kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1)),
+            SepInceptionBlock3D(480, 192, 96, 208, 16, 48, 64, norm_layer),
+            SepInceptionBlock3D(512, 160, 112, 224, 24, 64, 64, norm_layer),
+            SepInceptionBlock3D(512, 128, 128, 256, 24, 64, 64, norm_layer),
+            SepInceptionBlock3D(512, 112, 144, 288, 32, 64, 64, norm_layer),
+            SepInceptionBlock3D(528, 256, 160, 320, 32, 128, 128, norm_layer),
+            nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 0, 0)),
+            SepInceptionBlock3D(832, 256, 160, 320, 32, 128, 128, norm_layer),
+            SepInceptionBlock3D(832, 384, 192, 384, 48, 128, 128, norm_layer),
+        )
+        self.avgpool = nn.AvgPool3d(kernel_size=(2, 7, 7), stride=1)  # fixed window, not adaptive
+        self.classifier = nn.Sequential(
+            nn.Dropout(p=dropout),
+            nn.Conv3d(1024, num_classes, kernel_size=1, stride=1, bias=True),  # 1x1x1 conv as the classifier
+        )
+
+    def forward(self, x):
+        x = self.features(x)
+        x = self.avgpool(x)
+        x = self.classifier(x)
+        x = torch.mean(x, dim=(2, 3, 4))  # average over the remaining time/height/width positions
+        return x
+
+
+class S3D_Weights(WeightsEnum):  # weights accepted by the ``s3d`` builder
+    KINETICS400_V1 = Weights(
+        url="https://download.pytorch.org/models/s3d-d76dad2f.pth",
+        transforms=partial(
+            VideoClassification,
+            crop_size=(224, 224),
+            resize_size=(256, 256),
+        ),
+        meta={
+            "min_size": (224, 224),
+            "min_temporal_size": 14,
+            "categories": _KINETICS400_CATEGORIES,
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/video_classification#s3d",
+            "_docs": (
+                "The weights aim to approximate the accuracy of the paper. The accuracies are estimated on clip-level "
+                "with parameters `frame_rate=15`, `clips_per_video=1`, and `clip_len=128`."
+            ),
+            "num_params": 8320048,
+            "_metrics": {
+                "Kinetics-400": {
+                    "acc@1": 68.368,
+                    "acc@5": 88.050,
+                }
+            },
+            "_ops": 17.979,
+            "_file_size": 31.972,
+        },
+    )
+    DEFAULT = KINETICS400_V1  # alias of the single entry defined above
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", S3D_Weights.KINETICS400_V1))
+def s3d(*, weights: Optional[S3D_Weights] = None, progress: bool = True, **kwargs: Any) -> S3D:
+    """Construct the Separable 3D CNN (S3D) model.
+
+    Reference: `Rethinking Spatiotemporal Feature Learning <https://arxiv.org/abs/1712.04851>`__.
+
+    .. betastatus:: video module
+
+    Args:
+        weights (:class:`~torchvision.models.video.S3D_Weights`, optional): The
+            pretrained weights to load. See
+            :class:`~torchvision.models.video.S3D_Weights`
+            below for the available values. When ``None`` (the default), no
+            pre-trained weights are loaded.
+        progress (bool): If True, displays a progress bar of the download to stderr. Default is True.
+        **kwargs: extra parameters forwarded to the ``torchvision.models.video.S3D`` constructor.
+            Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/s3d.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.video.S3D_Weights
+        :members:
+    """
+    verified = S3D_Weights.verify(weights)
+    if verified is None:
+        # No checkpoint requested: build the model from the caller's kwargs alone.
+        return S3D(**kwargs)
+
+    # presumably pins num_classes to the checkpoint's category count -- confirm in _ovewrite_named_param
+    _ovewrite_named_param(kwargs, "num_classes", len(verified.meta["categories"]))
+    model = S3D(**kwargs)
+    state_dict = verified.get_state_dict(progress=progress, check_hash=True)
+    model.load_state_dict(state_dict)
+    return model
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/swin_transformer.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/swin_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a198142874224a6766f321d9e0dfc97a01ecb43
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/video/swin_transformer.py
@@ -0,0 +1,743 @@
+# Modified from 2d Swin Transformers in torchvision:
+# https://github.com/pytorch/vision/blob/main/torchvision/models/swin_transformer.py
+
+from functools import partial
+from typing import Any, Callable, Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn, Tensor
+
+from ...transforms._presets import VideoClassification
+
+from ...utils import _log_api_usage_once
+
+from .._api import register_model, Weights, WeightsEnum
+
+from .._meta import _KINETICS400_CATEGORIES
+from .._utils import _ovewrite_named_param, handle_legacy_interface
+from ..swin_transformer import PatchMerging, SwinTransformerBlock
+
+__all__ = [
+    "SwinTransformer3d",
+    "Swin3D_T_Weights",
+    "Swin3D_S_Weights",
+    "Swin3D_B_Weights",
+    "swin3d_t",
+    "swin3d_s",
+    "swin3d_b",
+]
+
+
+def _get_window_and_shift_size(
+    shift_size: list[int], size_dhw: list[int], window_size: list[int]
+) -> tuple[list[int], list[int]]:
+    """Clamp the window to the input (and zero the shift) on each axis the window would cover; edits in place."""
+    for axis in range(3):
+        if size_dhw[axis] > window_size[axis]:
+            continue
+        window_size[axis] = size_dhw[axis]
+        shift_size[axis] = 0
+    return window_size, shift_size
+
+
+torch.fx.wrap("_get_window_and_shift_size")
+
+
+def _get_relative_position_bias(
+    relative_position_bias_table: torch.Tensor, relative_position_index: torch.Tensor, window_size: list[int]
+) -> Tensor:
+    """Look up the bias of every token pair in a window; the result has shape (1, table.shape[1], vol, vol)."""
+    vol = window_size[0] * window_size[1] * window_size[2]
+    # The index is 2-D in the 3d case, so crop it to the window volume before flattening it.
+    flat_index = relative_position_index[:vol, :vol].flatten()
+    bias = relative_position_bias_table[flat_index]  # type: ignore[index]
+    bias = bias.view(vol, vol, -1)
+    bias = bias.permute(2, 0, 1)
+    return bias.contiguous().unsqueeze(0)
+
+
+torch.fx.wrap("_get_relative_position_bias")
+
+
+def _compute_pad_size_3d(size_dhw: tuple[int, int, int], patch_size: tuple[int, int, int]) -> tuple[int, int, int]:
+    pad_size = [(patch_size[i] - size_dhw[i] % patch_size[i]) % patch_size[i] for i in range(3)]  # per axis: gap to the next multiple of patch_size, 0 if already aligned
+    return pad_size[0], pad_size[1], pad_size[2]
+
+
+torch.fx.wrap("_compute_pad_size_3d")  # keeps this helper as a single call when the model is FX-traced
+
+
+def _compute_attention_mask_3d(
+    x: Tensor,
+    size_dhw: tuple[int, int, int],
+    window_size: tuple[int, int, int],
+    shift_size: tuple[int, int, int],
+) -> Tensor:
+    # Build a region-id volume of shape size_dhw; x is only used as the source for new_zeros
+    attn_mask = x.new_zeros(*size_dhw)
+    num_windows = (size_dhw[0] // window_size[0]) * (size_dhw[1] // window_size[1]) * (size_dhw[2] // window_size[2])
+    slices = [  # per axis, three bands: [0, -window), [-window, -shift), [-shift, end)
+        (
+            (0, -window_size[i]),
+            (-window_size[i], -shift_size[i]),
+            (-shift_size[i], None),
+        )
+        for i in range(3)
+    ]
+    count = 0
+    for d in slices[0]:
+        for h in slices[1]:
+            for w in slices[2]:
+                attn_mask[d[0] : d[1], h[0] : h[1], w[0] : w[1]] = count  # one id per band combination
+                count += 1
+
+    # Partition the region-id volume into windows, the same way the features are partitioned
+    attn_mask = attn_mask.view(
+        size_dhw[0] // window_size[0],
+        window_size[0],
+        size_dhw[1] // window_size[1],
+        window_size[1],
+        size_dhw[2] // window_size[2],
+        window_size[2],
+    )
+    attn_mask = attn_mask.permute(0, 2, 4, 1, 3, 5).reshape(
+        num_windows, window_size[0] * window_size[1] * window_size[2]
+    )
+    attn_mask = attn_mask.unsqueeze(1) - attn_mask.unsqueeze(2)  # non-zero where two tokens have different ids
+    attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
+    return attn_mask
+
+
+torch.fx.wrap("_compute_attention_mask_3d")
+
+
+def shifted_window_attention_3d(
+    input: Tensor,
+    qkv_weight: Tensor,
+    proj_weight: Tensor,
+    relative_position_bias: Tensor,
+    window_size: list[int],
+    num_heads: int,
+    shift_size: list[int],
+    attention_dropout: float = 0.0,
+    dropout: float = 0.0,
+    qkv_bias: Optional[Tensor] = None,
+    proj_bias: Optional[Tensor] = None,
+    training: bool = True,
+) -> Tensor:
+    """
+    Window based multi-head self attention (W-MSA) module with relative position bias.
+    It supports both of shifted and non-shifted window.
+    Args:
+        input (Tensor[B, T, H, W, C]): The input tensor, 5-dimensions.
+        qkv_weight (Tensor[in_dim, out_dim]): The weight tensor of query, key, value.
+        proj_weight (Tensor[out_dim, out_dim]): The weight tensor of projection.
+        relative_position_bias (Tensor): The learned relative position bias added to attention.
+        window_size (List[int]): 3-dimensions window size, T, H, W .
+        num_heads (int): Number of attention heads.
+        shift_size (List[int]): Shift size for shifted window attention (T, H, W).
+        attention_dropout (float): Dropout ratio of attention weight. Default: 0.0.
+        dropout (float): Dropout ratio of output. Default: 0.0.
+        qkv_bias (Tensor[out_dim], optional): The bias tensor of query, key, value. Default: None.
+        proj_bias (Tensor[out_dim], optional): The bias tensor of projection. Default: None.
+        training (bool, optional): Training flag used by the dropout parameters. Default: True.
+    Returns:
+        Tensor[B, T, H, W, C]: The output tensor after shifted window attention.
+    """
+    b, t, h, w, c = input.shape
+    # pad feature maps to multiples of window size
+    pad_size = _compute_pad_size_3d((t, h, w), (window_size[0], window_size[1], window_size[2]))
+    x = F.pad(input, (0, 0, 0, pad_size[2], 0, pad_size[1], 0, pad_size[0]))  # trailing side of W, H, T only
+    _, tp, hp, wp, _ = x.shape
+    padded_size = (tp, hp, wp)
+
+    # cyclic shift
+    if sum(shift_size) > 0:
+        x = torch.roll(x, shifts=(-shift_size[0], -shift_size[1], -shift_size[2]), dims=(1, 2, 3))
+
+    # partition windows
+    num_windows = (
+        (padded_size[0] // window_size[0]) * (padded_size[1] // window_size[1]) * (padded_size[2] // window_size[2])
+    )
+    x = x.view(
+        b,
+        padded_size[0] // window_size[0],
+        window_size[0],
+        padded_size[1] // window_size[1],
+        window_size[1],
+        padded_size[2] // window_size[2],
+        window_size[2],
+        c,
+    )
+    x = x.permute(0, 1, 3, 5, 2, 4, 6, 7).reshape(
+        b * num_windows, window_size[0] * window_size[1] * window_size[2], c
+    )  # B*nW, Wd*Wh*Ww, C
+
+    # multi-head attention
+    qkv = F.linear(x, qkv_weight, qkv_bias)
+    qkv = qkv.reshape(x.size(0), x.size(1), 3, num_heads, c // num_heads).permute(2, 0, 3, 1, 4)
+    q, k, v = qkv[0], qkv[1], qkv[2]
+    q = q * (c // num_heads) ** -0.5  # scale queries by 1/sqrt(per-head dim)
+    attn = q.matmul(k.transpose(-2, -1))
+    # add relative position bias
+    attn = attn + relative_position_bias
+
+    if sum(shift_size) > 0:
+        # generate attention mask to handle shifted windows with varying size
+        attn_mask = _compute_attention_mask_3d(
+            x,
+            (padded_size[0], padded_size[1], padded_size[2]),
+            (window_size[0], window_size[1], window_size[2]),
+            (shift_size[0], shift_size[1], shift_size[2]),
+        )
+        attn = attn.view(x.size(0) // num_windows, num_windows, num_heads, x.size(1), x.size(1))
+        attn = attn + attn_mask.unsqueeze(1).unsqueeze(0)  # broadcast the mask over batch and heads
+        attn = attn.view(-1, num_heads, x.size(1), x.size(1))
+
+    attn = F.softmax(attn, dim=-1)
+    attn = F.dropout(attn, p=attention_dropout, training=training)
+
+    x = attn.matmul(v).transpose(1, 2).reshape(x.size(0), x.size(1), c)
+    x = F.linear(x, proj_weight, proj_bias)
+    x = F.dropout(x, p=dropout, training=training)
+
+    # reverse windows
+    x = x.view(
+        b,
+        padded_size[0] // window_size[0],
+        padded_size[1] // window_size[1],
+        padded_size[2] // window_size[2],
+        window_size[0],
+        window_size[1],
+        window_size[2],
+        c,
+    )
+    x = x.permute(0, 1, 4, 2, 5, 3, 6, 7).reshape(b, tp, hp, wp, c)
+
+    # reverse cyclic shift
+    if sum(shift_size) > 0:
+        x = torch.roll(x, shifts=(shift_size[0], shift_size[1], shift_size[2]), dims=(1, 2, 3))
+
+    # unpad features
+    x = x[:, :t, :h, :w, :].contiguous()
+    return x
+
+
+torch.fx.wrap("shifted_window_attention_3d")
+
+
+class ShiftedWindowAttention3d(nn.Module):
+    """
+    Module holding the weights and relative position tables for :func:`shifted_window_attention_3d`.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        window_size: list[int],
+        shift_size: list[int],
+        num_heads: int,
+        qkv_bias: bool = True,
+        proj_bias: bool = True,
+        attention_dropout: float = 0.0,
+        dropout: float = 0.0,
+    ) -> None:
+        super().__init__()
+        if len(window_size) != 3 or len(shift_size) != 3:
+            raise ValueError("window_size and shift_size must be of length 3")
+
+        self.window_size = window_size  # Wd, Wh, Ww
+        self.shift_size = shift_size
+        self.num_heads = num_heads
+        self.attention_dropout = attention_dropout
+        self.dropout = dropout
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)  # query, key and value projected in one layer
+        self.proj = nn.Linear(dim, dim, bias=proj_bias)
+
+        self.define_relative_position_bias_table()
+        self.define_relative_position_index()
+
+    def define_relative_position_bias_table(self) -> None:
+        # learnable table with one row per possible relative offset and one column per head
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros(
+                (2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1) * (2 * self.window_size[2] - 1),
+                self.num_heads,
+            )
+        )  # 2*Wd-1 * 2*Wh-1 * 2*Ww-1, nH
+        nn.init.trunc_normal_(self.relative_position_bias_table, std=0.02)
+
+    def define_relative_position_index(self) -> None:
+        # get pair-wise relative position index for each token inside the window
+        coords_dhw = [torch.arange(self.window_size[i]) for i in range(3)]
+        coords = torch.stack(
+            torch.meshgrid(coords_dhw[0], coords_dhw[1], coords_dhw[2], indexing="ij")
+        )  # 3, Wd, Wh, Ww
+        coords_flatten = torch.flatten(coords, 1)  # 3, Wd*Wh*Ww
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 3, Wd*Wh*Ww, Wd*Wh*Ww
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wd*Wh*Ww, Wd*Wh*Ww, 3
+        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        relative_coords[:, :, 2] += self.window_size[2] - 1
+
+        relative_coords[:, :, 0] *= (2 * self.window_size[1] - 1) * (2 * self.window_size[2] - 1)
+        relative_coords[:, :, 1] *= 2 * self.window_size[2] - 1
+        # We don't flatten the relative_position_index here in 3d case.
+        relative_position_index = relative_coords.sum(-1)  # Wd*Wh*Ww, Wd*Wh*Ww
+        self.register_buffer("relative_position_index", relative_position_index)
+
+    def get_relative_position_bias(self, window_size: list[int]) -> torch.Tensor:
+        return _get_relative_position_bias(self.relative_position_bias_table, self.relative_position_index, window_size)  # type: ignore
+
+    def forward(self, x: Tensor) -> Tensor:
+        _, t, h, w, _ = x.shape
+        size_dhw = [t, h, w]
+        window_size, shift_size = self.window_size.copy(), self.shift_size.copy()  # copies: the helper edits in place
+        # Handle case where window_size is larger than the input tensor
+        window_size, shift_size = _get_window_and_shift_size(shift_size, size_dhw, window_size)
+
+        relative_position_bias = self.get_relative_position_bias(window_size)
+
+        return shifted_window_attention_3d(
+            x,
+            self.qkv.weight,
+            self.proj.weight,
+            relative_position_bias,
+            window_size,
+            self.num_heads,
+            shift_size=shift_size,
+            attention_dropout=self.attention_dropout,
+            dropout=self.dropout,
+            qkv_bias=self.qkv.bias,
+            proj_bias=self.proj.bias,
+            training=self.training,
+        )
+
+
+# Modified from:
+# https://github.com/SwinTransformer/Video-Swin-Transformer/blob/master/mmaction/models/backbones/swin_transformer.py
+class PatchEmbed3d(nn.Module):
+    """Embed a video into patch tokens with a 3D convolution whose kernel and stride equal the patch size.
+
+    Args:
+        patch_size (List[int]): Patch token size; entries 0 to 2 are read.
+        in_channels (int): Number of input channels. Default: 3
+        embed_dim (int): Number of output channels of the projection. Default: 96.
+        norm_layer (nn.Module, optional): Called with ``embed_dim`` to build the norm. Default: None
+    """
+
+    def __init__(
+        self,
+        patch_size: list[int],
+        in_channels: int = 3,
+        embed_dim: int = 96,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        super().__init__()
+        _log_api_usage_once(self)
+        # Stored as a tuple: it is the conv kernel, the conv stride and the padding granularity in forward.
+        self.tuple_patch_size = (patch_size[0], patch_size[1], patch_size[2])
+
+        self.proj = nn.Conv3d(
+            in_channels, embed_dim, kernel_size=self.tuple_patch_size, stride=self.tuple_patch_size
+        )
+        # Without a norm_layer an identity module takes its place.
+        self.norm = nn.Identity() if norm_layer is None else norm_layer(embed_dim)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Pad ``x`` to whole patches, project it and return the tokens with channels last.
+
+        The five dimensions of ``x`` are read as B, C, T, H, W; the result is permuted to B, T, Wh, Ww, C.
+        """
+        _, _, frames, height, width = x.size()
+        pad_t, pad_h, pad_w = _compute_pad_size_3d((frames, height, width), self.tuple_patch_size)
+        # F.pad lists the last dimension first; only the trailing side of W, H and T is padded.
+        padded = F.pad(x, (0, pad_w, 0, pad_h, 0, pad_t))
+        tokens = self.proj(padded)  # B C T Wh Ww
+        tokens = tokens.permute(0, 2, 3, 4, 1)  # B T Wh Ww C
+        if self.norm is not None:
+            tokens = self.norm(tokens)
+        return tokens
+
+
+class SwinTransformer3d(nn.Module):
+    """
+    Implements 3D Swin Transformer from the `"Video Swin Transformer" <https://arxiv.org/abs/2106.13230>`_ paper.
+    Args:
+        patch_size (List[int]): Patch size.
+        embed_dim (int): Patch embedding dimension.
+        depths (List(int)): Depth of each Swin Transformer layer.
+        num_heads (List(int)): Number of attention heads in different layers.
+        window_size (List[int]): Window size.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
+        dropout (float): Dropout rate. Default: 0.0.
+        attention_dropout (float): Attention dropout rate. Default: 0.0.
+        stochastic_depth_prob (float): Stochastic depth rate. Default: 0.1.
+        num_classes (int): Number of classes for classification head. Default: 400.
+        norm_layer (nn.Module, optional): Normalization layer. Default: None.
+        block (nn.Module, optional): SwinTransformer Block. Default: None.
+        downsample_layer (nn.Module): Downsample layer (patch merging). Default: PatchMerging.
+        patch_embed (nn.Module, optional): Patch Embedding layer. Default: None.
+    """
+
+    def __init__(
+        self,
+        patch_size: list[int],
+        embed_dim: int,
+        depths: list[int],
+        num_heads: list[int],
+        window_size: list[int],
+        mlp_ratio: float = 4.0,
+        dropout: float = 0.0,
+        attention_dropout: float = 0.0,
+        stochastic_depth_prob: float = 0.1,
+        num_classes: int = 400,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+        block: Optional[Callable[..., nn.Module]] = None,
+        downsample_layer: Callable[..., nn.Module] = PatchMerging,
+        patch_embed: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        super().__init__()
+        _log_api_usage_once(self)
+        self.num_classes = num_classes
+
+        if block is None:
+            block = partial(SwinTransformerBlock, attn_layer=ShiftedWindowAttention3d)
+
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=1e-5)
+
+        if patch_embed is None:
+            patch_embed = PatchEmbed3d
+
+        # split image into non-overlapping patches
+        self.patch_embed = patch_embed(patch_size=patch_size, embed_dim=embed_dim, norm_layer=norm_layer)
+        self.pos_drop = nn.Dropout(p=dropout)
+
+        layers: list[nn.Module] = []
+        total_stage_blocks = sum(depths)
+        stage_block_id = 0
+        # build SwinTransformer blocks
+        for i_stage in range(len(depths)):
+            stage: list[nn.Module] = []
+            dim = embed_dim * 2**i_stage
+            for i_layer in range(depths[i_stage]):
+                # adjust stochastic depth probability based on the depth of the stage block
+                sd_prob = stochastic_depth_prob * float(stage_block_id) / (total_stage_blocks - 1)
+                stage.append(
+                    block(
+                        dim,
+                        num_heads[i_stage],
+                        window_size=window_size,
+                        shift_size=[0 if i_layer % 2 == 0 else w // 2 for w in window_size],
+                        mlp_ratio=mlp_ratio,
+                        dropout=dropout,
+                        attention_dropout=attention_dropout,
+                        stochastic_depth_prob=sd_prob,
+                        norm_layer=norm_layer,
+                        attn_layer=ShiftedWindowAttention3d,
+                    )
+                )
+                stage_block_id += 1
+            layers.append(nn.Sequential(*stage))
+            # add patch merging layer
+            if i_stage < (len(depths) - 1):
+                layers.append(downsample_layer(dim, norm_layer))
+        self.features = nn.Sequential(*layers)
+
+        self.num_features = embed_dim * 2 ** (len(depths) - 1)
+        self.norm = norm_layer(self.num_features)
+        self.avgpool = nn.AdaptiveAvgPool3d(1)
+        self.head = nn.Linear(self.num_features, num_classes)
+
+        for m in self.modules():
+            if isinstance(m, nn.Linear):
+                nn.init.trunc_normal_(m.weight, std=0.02)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+
+    def forward(self, x: Tensor) -> Tensor:
+        # x: B C T H W
+        x = self.patch_embed(x)  # B _T _H _W C
+        x = self.pos_drop(x)
+        x = self.features(x)  # B _T _H _W C
+        x = self.norm(x)
+        x = x.permute(0, 4, 1, 2, 3)  # B, C, _T, _H, _W
+        x = self.avgpool(x)
+        x = torch.flatten(x, 1)
+        x = self.head(x)
+        return x
+
+
+def _swin_transformer3d(
+    patch_size: list[int],
+    embed_dim: int,
+    depths: list[int],
+    num_heads: list[int],
+    window_size: list[int],
+    stochastic_depth_prob: float,
+    weights: Optional[WeightsEnum],
+    progress: bool,
+    **kwargs: Any,
+) -> SwinTransformer3d:
+    if weights is not None:
+        _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+
+    model = SwinTransformer3d(
+        patch_size=patch_size,
+        embed_dim=embed_dim,
+        depths=depths,
+        num_heads=num_heads,
+        window_size=window_size,
+        stochastic_depth_prob=stochastic_depth_prob,
+        **kwargs,
+    )
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+
+    return model
+
+
+# Metadata entries shared by every Swin3D weights definition below; each ``meta``
+# dict starts from these values via ``**_COMMON_META``.
+_COMMON_META = {
+    "categories": _KINETICS400_CATEGORIES,
+    "min_size": (1, 1),
+    "min_temporal_size": 1,
+}
+
+
+class Swin3D_T_Weights(WeightsEnum):
+    """Pretrained weight definitions accepted by :func:`swin3d_t`."""
+
+    # Kinetics-400 checkpoint; preprocessing crops to 224x224.
+    KINETICS400_V1 = Weights(
+        url="https://download.pytorch.org/models/swin3d_t-7615ae03.pth",
+        transforms=partial(
+            VideoClassification,
+            crop_size=(224, 224),
+            resize_size=(256,),
+            mean=(0.4850, 0.4560, 0.4060),
+            std=(0.2290, 0.2240, 0.2250),
+        ),
+        meta={
+            **_COMMON_META,
+            "recipe": "https://github.com/SwinTransformer/Video-Swin-Transformer#kinetics-400",
+            "_docs": (
+                "The weights were ported from the paper. The accuracies are estimated on video-level "
+                "with parameters `frame_rate=15`, `clips_per_video=12`, and `clip_len=32`"
+            ),
+            "num_params": 28158070,
+            "_metrics": {
+                "Kinetics-400": {
+                    "acc@1": 77.715,
+                    "acc@5": 93.519,
+                }
+            },
+            "_ops": 43.882,
+            "_file_size": 121.543,
+        },
+    )
+    # Alias for the checkpoint used when the caller asks for the default weights.
+    DEFAULT = KINETICS400_V1
+
+
+class Swin3D_S_Weights(WeightsEnum):
+    """Pretrained weight definitions accepted by :func:`swin3d_s`."""
+
+    # Kinetics-400 checkpoint; preprocessing crops to 224x224.
+    KINETICS400_V1 = Weights(
+        url="https://download.pytorch.org/models/swin3d_s-da41c237.pth",
+        transforms=partial(
+            VideoClassification,
+            crop_size=(224, 224),
+            resize_size=(256,),
+            mean=(0.4850, 0.4560, 0.4060),
+            std=(0.2290, 0.2240, 0.2250),
+        ),
+        meta={
+            **_COMMON_META,
+            "recipe": "https://github.com/SwinTransformer/Video-Swin-Transformer#kinetics-400",
+            "_docs": (
+                "The weights were ported from the paper. The accuracies are estimated on video-level "
+                "with parameters `frame_rate=15`, `clips_per_video=12`, and `clip_len=32`"
+            ),
+            "num_params": 49816678,
+            "_metrics": {
+                "Kinetics-400": {
+                    "acc@1": 79.521,
+                    "acc@5": 94.158,
+                }
+            },
+            "_ops": 82.841,
+            "_file_size": 218.288,
+        },
+    )
+    # Alias for the checkpoint used when the caller asks for the default weights.
+    DEFAULT = KINETICS400_V1
+
+
+class Swin3D_B_Weights(WeightsEnum):
+    """Pretrained weight definitions accepted by :func:`swin3d_b`.
+
+    Two checkpoints are defined; both report the same parameter count and differ
+    in download URL and reported Kinetics-400 accuracy.
+    """
+
+    # Checkpoint stored as ``swin3d_b_1k``.
+    KINETICS400_V1 = Weights(
+        url="https://download.pytorch.org/models/swin3d_b_1k-24f7c7c6.pth",
+        transforms=partial(
+            VideoClassification,
+            crop_size=(224, 224),
+            resize_size=(256,),
+            mean=(0.4850, 0.4560, 0.4060),
+            std=(0.2290, 0.2240, 0.2250),
+        ),
+        meta={
+            **_COMMON_META,
+            "recipe": "https://github.com/SwinTransformer/Video-Swin-Transformer#kinetics-400",
+            "_docs": (
+                "The weights were ported from the paper. The accuracies are estimated on video-level "
+                "with parameters `frame_rate=15`, `clips_per_video=12`, and `clip_len=32`"
+            ),
+            "num_params": 88048984,
+            "_metrics": {
+                "Kinetics-400": {
+                    "acc@1": 79.427,
+                    "acc@5": 94.386,
+                }
+            },
+            "_ops": 140.667,
+            "_file_size": 364.134,
+        },
+    )
+    # Checkpoint stored as ``swin3d_b_22k``; it reports the higher accuracy of the two.
+    KINETICS400_IMAGENET22K_V1 = Weights(
+        url="https://download.pytorch.org/models/swin3d_b_22k-7c6ae6fa.pth",
+        transforms=partial(
+            VideoClassification,
+            crop_size=(224, 224),
+            resize_size=(256,),
+            mean=(0.4850, 0.4560, 0.4060),
+            std=(0.2290, 0.2240, 0.2250),
+        ),
+        meta={
+            **_COMMON_META,
+            "recipe": "https://github.com/SwinTransformer/Video-Swin-Transformer#kinetics-400",
+            "_docs": (
+                "The weights were ported from the paper. The accuracies are estimated on video-level "
+                "with parameters `frame_rate=15`, `clips_per_video=12`, and `clip_len=32`"
+            ),
+            "num_params": 88048984,
+            "_metrics": {
+                "Kinetics-400": {
+                    "acc@1": 81.643,
+                    "acc@5": 95.574,
+                }
+            },
+            "_ops": 140.667,
+            "_file_size": 364.134,
+        },
+    )
+    # The default stays on KINETICS400_V1, not the higher-accuracy 22k variant.
+    DEFAULT = KINETICS400_V1
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", Swin3D_T_Weights.KINETICS400_V1))
+def swin3d_t(*, weights: Optional[Swin3D_T_Weights] = None, progress: bool = True, **kwargs: Any) -> SwinTransformer3d:
+    """
+    Constructs a swin_tiny architecture from
+    `Video Swin Transformer <https://arxiv.org/abs/2106.13230>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.video.Swin3D_T_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.video.Swin3D_T_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.video.swin_transformer.SwinTransformer``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/swin_transformer.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.video.Swin3D_T_Weights
+        :members:
+    """
+    weights = Swin3D_T_Weights.verify(weights)
+
+    return _swin_transformer3d(
+        patch_size=[2, 4, 4],
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=[8, 7, 7],
+        stochastic_depth_prob=0.1,
+        weights=weights,
+        progress=progress,
+        **kwargs,
+    )
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", Swin3D_S_Weights.KINETICS400_V1))
+def swin3d_s(*, weights: Optional[Swin3D_S_Weights] = None, progress: bool = True, **kwargs: Any) -> SwinTransformer3d:
+    """
+    Constructs a swin_small architecture from
+    `Video Swin Transformer <https://arxiv.org/abs/2106.13230>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.video.Swin3D_S_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.video.Swin3D_S_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.video.swin_transformer.SwinTransformer``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/swin_transformer.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.video.Swin3D_S_Weights
+        :members:
+    """
+    weights = Swin3D_S_Weights.verify(weights)
+
+    return _swin_transformer3d(
+        patch_size=[2, 4, 4],
+        embed_dim=96,
+        depths=[2, 2, 18, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=[8, 7, 7],
+        stochastic_depth_prob=0.1,
+        weights=weights,
+        progress=progress,
+        **kwargs,
+    )
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", Swin3D_B_Weights.KINETICS400_V1))
+def swin3d_b(*, weights: Optional[Swin3D_B_Weights] = None, progress: bool = True, **kwargs: Any) -> SwinTransformer3d:
+    """
+    Constructs a swin_base architecture from
+    `Video Swin Transformer <https://arxiv.org/abs/2106.13230>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.video.Swin3D_B_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.video.Swin3D_B_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        **kwargs: parameters passed to the ``torchvision.models.video.swin_transformer.SwinTransformer``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/swin_transformer.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.video.Swin3D_B_Weights
+        :members:
+    """
+    weights = Swin3D_B_Weights.verify(weights)
+
+    return _swin_transformer3d(
+        patch_size=[2, 4, 4],
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=[8, 7, 7],
+        stochastic_depth_prob=0.1,
+        weights=weights,
+        progress=progress,
+        **kwargs,
+    )
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/vision_transformer.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/vision_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ec3a5c59f0a4112f1eec0ec7d5c0ccba5289946
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/models/vision_transformer.py
@@ -0,0 +1,864 @@
+import math
+from collections import OrderedDict
+from functools import partial
+from typing import Any, Callable, NamedTuple, Optional
+
+import torch
+import torch.nn as nn
+
+from ..ops.misc import Conv2dNormActivation, MLP
+from ..transforms._presets import ImageClassification, InterpolationMode
+from ..utils import _log_api_usage_once
+from ._api import register_model, Weights, WeightsEnum
+from ._meta import _IMAGENET_CATEGORIES
+from ._utils import _ovewrite_named_param, handle_legacy_interface
+
+
+__all__ = [
+    "VisionTransformer",
+    "ViT_B_16_Weights",
+    "ViT_B_32_Weights",
+    "ViT_L_16_Weights",
+    "ViT_L_32_Weights",
+    "ViT_H_14_Weights",
+    "vit_b_16",
+    "vit_b_32",
+    "vit_l_16",
+    "vit_l_32",
+    "vit_h_14",
+]
+
+
+class ConvStemConfig(NamedTuple):
+    """Configuration of one ``Conv2dNormActivation`` layer of a convolutional stem.
+
+    ``VisionTransformer`` reads each field and forwards it to ``Conv2dNormActivation``
+    when ``conv_stem_configs`` is provided.
+    """
+
+    # Output channels of this layer; also the input channels of the next one.
+    out_channels: int
+    # Kernel size of the convolution.
+    kernel_size: int
+    # Stride of the convolution.
+    stride: int
+    # Factory for the normalization layer.
+    norm_layer: Callable[..., nn.Module] = nn.BatchNorm2d
+    # Factory for the activation layer.
+    activation_layer: Callable[..., nn.Module] = nn.ReLU
+
+
+class MLPBlock(MLP):
+    """Transformer MLP block."""
+
+    _version = 2
+
+    def __init__(self, in_dim: int, mlp_dim: int, dropout: float):
+        super().__init__(in_dim, [mlp_dim, in_dim], activation_layer=nn.GELU, inplace=None, dropout=dropout)
+
+        for m in self.modules():
+            if isinstance(m, nn.Linear):
+                nn.init.xavier_uniform_(m.weight)
+                if m.bias is not None:
+                    nn.init.normal_(m.bias, std=1e-6)
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        version = local_metadata.get("version", None)
+
+        if version is None or version < 2:
+            # Replacing legacy MLPBlock with MLP. See https://github.com/pytorch/vision/pull/6053
+            for i in range(2):
+                for type in ["weight", "bias"]:
+                    old_key = f"{prefix}linear_{i+1}.{type}"
+                    new_key = f"{prefix}{3*i}.{type}"
+                    if old_key in state_dict:
+                        state_dict[new_key] = state_dict.pop(old_key)
+
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            strict,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+
+class EncoderBlock(nn.Module):
+    """Transformer encoder block."""
+
+    def __init__(
+        self,
+        num_heads: int,
+        hidden_dim: int,
+        mlp_dim: int,
+        dropout: float,
+        attention_dropout: float,
+        norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6),
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+
+        # Attention block
+        self.ln_1 = norm_layer(hidden_dim)
+        self.self_attention = nn.MultiheadAttention(hidden_dim, num_heads, dropout=attention_dropout, batch_first=True)
+        self.dropout = nn.Dropout(dropout)
+
+        # MLP block
+        self.ln_2 = norm_layer(hidden_dim)
+        self.mlp = MLPBlock(hidden_dim, mlp_dim, dropout)
+
+    def forward(self, input: torch.Tensor):
+        torch._assert(input.dim() == 3, f"Expected (batch_size, seq_length, hidden_dim) got {input.shape}")
+        x = self.ln_1(input)
+        x, _ = self.self_attention(x, x, x, need_weights=False)
+        x = self.dropout(x)
+        x = x + input
+
+        y = self.ln_2(x)
+        y = self.mlp(y)
+        return x + y
+
+
+class Encoder(nn.Module):
+    """Transformer Model Encoder for sequence to sequence translation."""
+
+    def __init__(
+        self,
+        seq_length: int,
+        num_layers: int,
+        num_heads: int,
+        hidden_dim: int,
+        mlp_dim: int,
+        dropout: float,
+        attention_dropout: float,
+        norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6),
+    ):
+        super().__init__()
+        # Note that batch_size is on the first dim because
+        # we have batch_first=True in nn.MultiAttention() by default
+        self.pos_embedding = nn.Parameter(torch.empty(1, seq_length, hidden_dim).normal_(std=0.02))  # from BERT
+        self.dropout = nn.Dropout(dropout)
+        layers: OrderedDict[str, nn.Module] = OrderedDict()
+        for i in range(num_layers):
+            layers[f"encoder_layer_{i}"] = EncoderBlock(
+                num_heads,
+                hidden_dim,
+                mlp_dim,
+                dropout,
+                attention_dropout,
+                norm_layer,
+            )
+        self.layers = nn.Sequential(layers)
+        self.ln = norm_layer(hidden_dim)
+
+    def forward(self, input: torch.Tensor):
+        torch._assert(input.dim() == 3, f"Expected (batch_size, seq_length, hidden_dim) got {input.shape}")
+        input = input + self.pos_embedding
+        return self.ln(self.layers(self.dropout(input)))
+
+
+class VisionTransformer(nn.Module):
+    """Vision Transformer as per https://arxiv.org/abs/2010.11929.
+
+    Args:
+        image_size (int): Expected height and width of the input; must be divisible
+            by ``patch_size``.
+        patch_size (int): Side length of each square patch.
+        num_layers (int): Number of encoder blocks.
+        num_heads (int): Number of attention heads per encoder block.
+        hidden_dim (int): Embedding dimension of the tokens.
+        mlp_dim (int): Hidden dimension of the MLP inside each encoder block.
+        dropout (float): Dropout rate forwarded to the encoder. Default: 0.0.
+        attention_dropout (float): Attention dropout rate forwarded to the encoder. Default: 0.0.
+        num_classes (int): Output size of the classification head. Default: 1000.
+        representation_size (int, optional): If set, a ``pre_logits`` linear layer of this
+            size followed by ``Tanh`` is inserted before the head. Default: None.
+        norm_layer (Callable[..., nn.Module]): Normalization layer factory.
+            Default: ``LayerNorm`` with ``eps=1e-6``.
+        conv_stem_configs (list[ConvStemConfig], optional): If set, the patchify
+            convolution is replaced by a convolutional stem built from these configs.
+            Default: None.
+    """
+
+    def __init__(
+        self,
+        image_size: int,
+        patch_size: int,
+        num_layers: int,
+        num_heads: int,
+        hidden_dim: int,
+        mlp_dim: int,
+        dropout: float = 0.0,
+        attention_dropout: float = 0.0,
+        num_classes: int = 1000,
+        representation_size: Optional[int] = None,
+        norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6),
+        conv_stem_configs: Optional[list[ConvStemConfig]] = None,
+    ):
+        super().__init__()
+        _log_api_usage_once(self)
+        torch._assert(image_size % patch_size == 0, "Input shape indivisible by patch size!")
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.hidden_dim = hidden_dim
+        self.mlp_dim = mlp_dim
+        self.attention_dropout = attention_dropout
+        self.dropout = dropout
+        self.num_classes = num_classes
+        self.representation_size = representation_size
+        self.norm_layer = norm_layer
+
+        if conv_stem_configs is not None:
+            # As per https://arxiv.org/abs/2106.14881
+            seq_proj = nn.Sequential()
+            # The stem starts from 3 input channels; each layer feeds the next.
+            prev_channels = 3
+            for i, conv_stem_layer_config in enumerate(conv_stem_configs):
+                seq_proj.add_module(
+                    f"conv_bn_relu_{i}",
+                    Conv2dNormActivation(
+                        in_channels=prev_channels,
+                        out_channels=conv_stem_layer_config.out_channels,
+                        kernel_size=conv_stem_layer_config.kernel_size,
+                        stride=conv_stem_layer_config.stride,
+                        norm_layer=conv_stem_layer_config.norm_layer,
+                        activation_layer=conv_stem_layer_config.activation_layer,
+                    ),
+                )
+                prev_channels = conv_stem_layer_config.out_channels
+            # A final 1x1 convolution maps the stem output to hidden_dim channels.
+            seq_proj.add_module(
+                "conv_last", nn.Conv2d(in_channels=prev_channels, out_channels=hidden_dim, kernel_size=1)
+            )
+            self.conv_proj: nn.Module = seq_proj
+        else:
+            # Default patchify stem: one convolution whose kernel and stride equal patch_size.
+            self.conv_proj = nn.Conv2d(
+                in_channels=3, out_channels=hidden_dim, kernel_size=patch_size, stride=patch_size
+            )
+
+        # One token per patch.
+        seq_length = (image_size // patch_size) ** 2
+
+        # Add a class token
+        self.class_token = nn.Parameter(torch.zeros(1, 1, hidden_dim))
+        seq_length += 1
+
+        self.encoder = Encoder(
+            seq_length,
+            num_layers,
+            num_heads,
+            hidden_dim,
+            mlp_dim,
+            dropout,
+            attention_dropout,
+            norm_layer,
+        )
+        self.seq_length = seq_length
+
+        # Classification head, optionally preceded by a pre_logits projection and Tanh.
+        heads_layers: OrderedDict[str, nn.Module] = OrderedDict()
+        if representation_size is None:
+            heads_layers["head"] = nn.Linear(hidden_dim, num_classes)
+        else:
+            heads_layers["pre_logits"] = nn.Linear(hidden_dim, representation_size)
+            heads_layers["act"] = nn.Tanh()
+            heads_layers["head"] = nn.Linear(representation_size, num_classes)
+
+        self.heads = nn.Sequential(heads_layers)
+
+        if isinstance(self.conv_proj, nn.Conv2d):
+            # Init the patchify stem
+            fan_in = self.conv_proj.in_channels * self.conv_proj.kernel_size[0] * self.conv_proj.kernel_size[1]
+            nn.init.trunc_normal_(self.conv_proj.weight, std=math.sqrt(1 / fan_in))
+            if self.conv_proj.bias is not None:
+                nn.init.zeros_(self.conv_proj.bias)
+        elif self.conv_proj.conv_last is not None and isinstance(self.conv_proj.conv_last, nn.Conv2d):
+            # Init the last 1x1 conv of the conv stem
+            nn.init.normal_(
+                self.conv_proj.conv_last.weight, mean=0.0, std=math.sqrt(2.0 / self.conv_proj.conv_last.out_channels)
+            )
+            if self.conv_proj.conv_last.bias is not None:
+                nn.init.zeros_(self.conv_proj.conv_last.bias)
+
+        # pre_logits only exists when representation_size was given.
+        if hasattr(self.heads, "pre_logits") and isinstance(self.heads.pre_logits, nn.Linear):
+            fan_in = self.heads.pre_logits.in_features
+            nn.init.trunc_normal_(self.heads.pre_logits.weight, std=math.sqrt(1 / fan_in))
+            nn.init.zeros_(self.heads.pre_logits.bias)
+
+        # The final classifier starts with all-zero weights and biases.
+        if isinstance(self.heads.head, nn.Linear):
+            nn.init.zeros_(self.heads.head.weight)
+            nn.init.zeros_(self.heads.head.bias)
+
+    def _process_input(self, x: torch.Tensor) -> torch.Tensor:
+        """Turn an ``(n, c, h, w)`` image batch into ``(n, n_h * n_w, hidden_dim)`` patch tokens.
+
+        Asserts that both ``h`` and ``w`` equal ``self.image_size``.
+        """
+        n, c, h, w = x.shape
+        p = self.patch_size
+        torch._assert(h == self.image_size, f"Wrong image height! Expected {self.image_size} but got {h}!")
+        torch._assert(w == self.image_size, f"Wrong image width! Expected {self.image_size} but got {w}!")
+        # Number of patches along each spatial axis.
+        n_h = h // p
+        n_w = w // p
+
+        # (n, c, h, w) -> (n, hidden_dim, n_h, n_w)
+        x = self.conv_proj(x)
+        # (n, hidden_dim, n_h, n_w) -> (n, hidden_dim, (n_h * n_w))
+        x = x.reshape(n, self.hidden_dim, n_h * n_w)
+
+        # (n, hidden_dim, (n_h * n_w)) -> (n, (n_h * n_w), hidden_dim)
+        # The self attention layer expects inputs in the format (N, S, E)
+        # where S is the source sequence length, N is the batch size, E is the
+        # embedding dimension
+        x = x.permute(0, 2, 1)
+
+        return x
+
+    def forward(self, x: torch.Tensor):
+        """Patchify ``x``, prepend the class token, encode, and classify from the class token."""
+        # Reshape and permute the input tensor
+        x = self._process_input(x)
+        n = x.shape[0]
+
+        # Expand the class token to the full batch
+        batch_class_token = self.class_token.expand(n, -1, -1)
+        x = torch.cat([batch_class_token, x], dim=1)
+
+        x = self.encoder(x)
+
+        # Classifier "token" as used by standard language architectures
+        x = x[:, 0]
+
+        x = self.heads(x)
+
+        return x
+
+
+def _vision_transformer(
+    patch_size: int,
+    num_layers: int,
+    num_heads: int,
+    hidden_dim: int,
+    mlp_dim: int,
+    weights: Optional[WeightsEnum],
+    progress: bool,
+    **kwargs: Any,
+) -> VisionTransformer:
+    if weights is not None:
+        _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+        assert weights.meta["min_size"][0] == weights.meta["min_size"][1]
+        _ovewrite_named_param(kwargs, "image_size", weights.meta["min_size"][0])
+    image_size = kwargs.pop("image_size", 224)
+
+    model = VisionTransformer(
+        image_size=image_size,
+        patch_size=patch_size,
+        num_layers=num_layers,
+        num_heads=num_heads,
+        hidden_dim=hidden_dim,
+        mlp_dim=mlp_dim,
+        **kwargs,
+    )
+
+    if weights:
+        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
+
+    return model
+
+
+# Metadata shared by the ViT weights definitions below.
+_COMMON_META: dict[str, Any] = {
+    "categories": _IMAGENET_CATEGORIES,
+}
+
+# Shared metadata extended with the recipe and license links used by the SWAG weights.
+_COMMON_SWAG_META = {
+    **_COMMON_META,
+    "recipe": "https://github.com/facebookresearch/SWAG",
+    "license": "https://github.com/facebookresearch/SWAG/blob/main/LICENSE",
+}
+
+
+class ViT_B_16_Weights(WeightsEnum):
+    """Pretrained weight definitions listed in ``__all__`` alongside ``vit_b_16``.
+
+    The ``min_size`` metadata of each entry is what ``_vision_transformer`` uses
+    as the model's ``image_size``.
+    """
+
+    # Trained from scratch; 224x224 inputs.
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/vit_b_16-c867db91.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 86567656,
+            "min_size": (224, 224),
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_b_16",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 81.072,
+                    "acc@5": 95.318,
+                }
+            },
+            "_ops": 17.564,
+            "_file_size": 330.285,
+            "_docs": """
+                These weights were trained from scratch by using a modified version of `DeIT
+                <https://arxiv.org/abs/2012.12877>`_'s training recipe.
+            """,
+        },
+    )
+    # SWAG weights fine-tuned end-to-end; 384x384 inputs with bicubic resizing.
+    IMAGENET1K_SWAG_E2E_V1 = Weights(
+        url="https://download.pytorch.org/models/vit_b_16_swag-9ac1b537.pth",
+        transforms=partial(
+            ImageClassification,
+            crop_size=384,
+            resize_size=384,
+            interpolation=InterpolationMode.BICUBIC,
+        ),
+        meta={
+            **_COMMON_SWAG_META,
+            "num_params": 86859496,
+            "min_size": (384, 384),
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 85.304,
+                    "acc@5": 97.650,
+                }
+            },
+            "_ops": 55.484,
+            "_file_size": 331.398,
+            "_docs": """
+                These weights are learnt via transfer learning by end-to-end fine-tuning the original
+                `SWAG <https://arxiv.org/abs/2201.08371>`_ weights on ImageNet-1K data.
+            """,
+        },
+    )
+    # Frozen SWAG trunk with a linear classifier; 224x224 inputs with bicubic resizing.
+    IMAGENET1K_SWAG_LINEAR_V1 = Weights(
+        url="https://download.pytorch.org/models/vit_b_16_lc_swag-4e70ced5.pth",
+        transforms=partial(
+            ImageClassification,
+            crop_size=224,
+            resize_size=224,
+            interpolation=InterpolationMode.BICUBIC,
+        ),
+        meta={
+            **_COMMON_SWAG_META,
+            # Overrides the recipe link inherited from _COMMON_SWAG_META.
+            "recipe": "https://github.com/pytorch/vision/pull/5793",
+            "num_params": 86567656,
+            "min_size": (224, 224),
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 81.886,
+                    "acc@5": 96.180,
+                }
+            },
+            "_ops": 17.564,
+            "_file_size": 330.285,
+            "_docs": """
+                These weights are composed of the original frozen `SWAG <https://arxiv.org/abs/2201.08371>`_ trunk
+                weights and a linear classifier learnt on top of them trained on ImageNet-1K data.
+            """,
+        },
+    )
+    # Alias for the checkpoint used when the caller asks for the default weights.
+    DEFAULT = IMAGENET1K_V1
+
+
+class ViT_B_32_Weights(WeightsEnum):
+    """Pretrained weight definitions listed in ``__all__`` alongside ``vit_b_32``."""
+
+    # Trained from scratch; 224x224 inputs.
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/vit_b_32-d86f8d99.pth",
+        transforms=partial(ImageClassification, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "num_params": 88224232,
+            "min_size": (224, 224),
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_b_32",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 75.912,
+                    "acc@5": 92.466,
+                }
+            },
+            "_ops": 4.409,
+            "_file_size": 336.604,
+            "_docs": """
+                These weights were trained from scratch by using a modified version of `DeIT
+                <https://arxiv.org/abs/2012.12877>`_'s training recipe.
+            """,
+        },
+    )
+    # Alias for the checkpoint used when the caller asks for the default weights.
+    DEFAULT = IMAGENET1K_V1
+
+
+class ViT_L_16_Weights(WeightsEnum):  # Weight entries accepted by the vit_l_16 builder in this file.
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/vit_l_16-852ce7e3.pth",
+        transforms=partial(ImageClassification, crop_size=224, resize_size=242),  # partial defers building the preset
+        meta={
+            **_COMMON_META,  # shared keys first; the keys written below win on any clash
+            "num_params": 304326632,
+            "min_size": (224, 224),  # same value as the crop_size bound in transforms above
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_l_16",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 79.662,
+                    "acc@5": 94.638,
+                }
+            },
+            "_ops": 61.555,  # NOTE(review): unit is not stated in this chunk - presumably GFLOPs; confirm
+            "_file_size": 1161.023,  # NOTE(review): unit is not stated in this chunk - presumably MB; confirm
+            "_docs": """
+                These weights were trained from scratch by using a modified version of TorchVision's
+                `new training recipe
+                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
+            """,
+        },
+    )
+    IMAGENET1K_SWAG_E2E_V1 = Weights(
+        url="https://download.pytorch.org/models/vit_l_16_swag-4f3808c9.pth",
+        transforms=partial(
+            ImageClassification,
+            crop_size=512,
+            resize_size=512,
+            interpolation=InterpolationMode.BICUBIC,
+        ),
+        meta={
+            **_COMMON_SWAG_META,  # no "recipe" is set in this entry; any value must come from _COMMON_SWAG_META
+            "num_params": 305174504,  # differs from the 224x224 entries of this class (304326632)
+            "min_size": (512, 512),  # same value as the crop_size bound in transforms above
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 88.064,
+                    "acc@5": 98.512,
+                }
+            },
+            "_ops": 361.986,  # NOTE(review): unit is not stated in this chunk - presumably GFLOPs; confirm
+            "_file_size": 1164.258,  # NOTE(review): unit is not stated in this chunk - presumably MB; confirm
+            "_docs": """
+                These weights are learnt via transfer learning by end-to-end fine-tuning the original
+                `SWAG <https://arxiv.org/abs/2201.08371>`_ weights on ImageNet-1K data.
+            """,
+        },
+    )
+    IMAGENET1K_SWAG_LINEAR_V1 = Weights(
+        url="https://download.pytorch.org/models/vit_l_16_lc_swag-4d563306.pth",
+        transforms=partial(
+            ImageClassification,
+            crop_size=224,
+            resize_size=224,
+            interpolation=InterpolationMode.BICUBIC,
+        ),
+        meta={
+            **_COMMON_SWAG_META,  # shared keys first; the keys written below win on any clash
+            "recipe": "https://github.com/pytorch/vision/pull/5793",
+            "num_params": 304326632,  # same count as the IMAGENET1K_V1 entry of this class
+            "min_size": (224, 224),  # same value as the crop_size bound in transforms above
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 85.146,
+                    "acc@5": 97.422,
+                }
+            },
+            "_ops": 61.555,  # NOTE(review): unit is not stated in this chunk - presumably GFLOPs; confirm
+            "_file_size": 1161.023,  # NOTE(review): unit is not stated in this chunk - presumably MB; confirm
+            "_docs": """
+                These weights are composed of the original frozen `SWAG <https://arxiv.org/abs/2201.08371>`_ trunk
+                weights and a linear classifier learnt on top of them trained on ImageNet-1K data.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # DEFAULT is an alias of the IMAGENET1K_V1 entry, not of either SWAG entry
+
+
+class ViT_L_32_Weights(WeightsEnum):  # Weight entries accepted by the vit_l_32 builder in this file.
+    IMAGENET1K_V1 = Weights(
+        url="https://download.pytorch.org/models/vit_l_32-c7638314.pth",
+        transforms=partial(ImageClassification, crop_size=224),  # partial defers building the preset; only crop_size is bound
+        meta={
+            **_COMMON_META,  # shared keys first; the keys written below win on any clash
+            "num_params": 306535400,
+            "min_size": (224, 224),  # same value as the crop_size bound in transforms above
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_l_32",
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 76.972,
+                    "acc@5": 93.07,
+                }
+            },
+            "_ops": 15.378,  # NOTE(review): unit is not stated in this chunk - presumably GFLOPs; confirm
+            "_file_size": 1169.449,  # NOTE(review): unit is not stated in this chunk - presumably MB; confirm
+            "_docs": """
+                These weights were trained from scratch by using a modified version of `DeIT
+                <https://arxiv.org/abs/2012.12877>`_'s training recipe.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_V1  # DEFAULT is an alias of the IMAGENET1K_V1 entry above
+
+
+class ViT_H_14_Weights(WeightsEnum):  # Weight entries accepted by the vit_h_14 builder; only SWAG entries are defined.
+    IMAGENET1K_SWAG_E2E_V1 = Weights(
+        url="https://download.pytorch.org/models/vit_h_14_swag-80465313.pth",
+        transforms=partial(
+            ImageClassification,
+            crop_size=518,
+            resize_size=518,
+            interpolation=InterpolationMode.BICUBIC,
+        ),
+        meta={
+            **_COMMON_SWAG_META,  # no "recipe" is set in this entry; any value must come from _COMMON_SWAG_META
+            "num_params": 633470440,  # differs from the 224x224 LINEAR entry of this class (632045800)
+            "min_size": (518, 518),  # same value as the crop_size bound in transforms above
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 88.552,
+                    "acc@5": 98.694,
+                }
+            },
+            "_ops": 1016.717,  # NOTE(review): unit is not stated in this chunk - presumably GFLOPs; confirm
+            "_file_size": 2416.643,  # NOTE(review): unit is not stated in this chunk - presumably MB; confirm
+            "_docs": """
+                These weights are learnt via transfer learning by end-to-end fine-tuning the original
+                `SWAG <https://arxiv.org/abs/2201.08371>`_ weights on ImageNet-1K data.
+            """,
+        },
+    )
+    IMAGENET1K_SWAG_LINEAR_V1 = Weights(
+        url="https://download.pytorch.org/models/vit_h_14_lc_swag-c1eb923e.pth",
+        transforms=partial(
+            ImageClassification,
+            crop_size=224,
+            resize_size=224,
+            interpolation=InterpolationMode.BICUBIC,
+        ),
+        meta={
+            **_COMMON_SWAG_META,  # shared keys first; the keys written below win on any clash
+            "recipe": "https://github.com/pytorch/vision/pull/5793",
+            "num_params": 632045800,
+            "min_size": (224, 224),  # same value as the crop_size bound in transforms above
+            "_metrics": {
+                "ImageNet-1K": {
+                    "acc@1": 85.708,
+                    "acc@5": 97.730,
+                }
+            },
+            "_ops": 167.295,  # NOTE(review): unit is not stated in this chunk - presumably GFLOPs; confirm
+            "_file_size": 2411.209,  # NOTE(review): unit is not stated in this chunk - presumably MB; confirm
+            "_docs": """
+                These weights are composed of the original frozen `SWAG <https://arxiv.org/abs/2201.08371>`_ trunk
+                weights and a linear classifier learnt on top of them trained on ImageNet-1K data.
+            """,
+        },
+    )
+    DEFAULT = IMAGENET1K_SWAG_E2E_V1  # this class has no IMAGENET1K_V1; DEFAULT aliases the end-to-end SWAG entry
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", ViT_B_16_Weights.IMAGENET1K_V1))
+def vit_b_16(*, weights: Optional[ViT_B_16_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
+    """
+    Build the vit_b_16 model described in
+    `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.ViT_B_16_Weights`, optional): Pretrained weights to
+            load. The possible values are listed under :class:`~torchvision.models.ViT_B_16_Weights`
+            below. When omitted, no pre-trained weights are used.
+        progress (bool, optional): Whether to show a download progress bar on stderr. Default is True.
+        **kwargs: Extra arguments forwarded to the ``torchvision.models.vision_transformer.VisionTransformer``
+            base class; see the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/vision_transformer.py>`_
+            for details.
+
+    .. autoclass:: torchvision.models.ViT_B_16_Weights
+        :members:
+    """
+    checked_weights = ViT_B_16_Weights.verify(weights)
+
+    # Hyper-parameters fixed by this variant; a clashing key in ``kwargs`` still raises TypeError.
+    arch_config = {
+        "patch_size": 16,
+        "num_layers": 12,
+        "num_heads": 12,
+        "hidden_dim": 768,
+        "mlp_dim": 3072,
+    }
+
+    return _vision_transformer(weights=checked_weights, progress=progress, **arch_config, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", ViT_B_32_Weights.IMAGENET1K_V1))
+def vit_b_32(*, weights: Optional[ViT_B_32_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
+    """
+    Build the vit_b_32 model described in
+    `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.ViT_B_32_Weights`, optional): Pretrained weights to
+            load. The possible values are listed under :class:`~torchvision.models.ViT_B_32_Weights`
+            below. When omitted, no pre-trained weights are used.
+        progress (bool, optional): Whether to show a download progress bar on stderr. Default is True.
+        **kwargs: Extra arguments forwarded to the ``torchvision.models.vision_transformer.VisionTransformer``
+            base class; see the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/vision_transformer.py>`_
+            for details.
+
+    .. autoclass:: torchvision.models.ViT_B_32_Weights
+        :members:
+    """
+    checked_weights = ViT_B_32_Weights.verify(weights)
+
+    # Hyper-parameters fixed by this variant; a clashing key in ``kwargs`` still raises TypeError.
+    arch_config = {
+        "patch_size": 32,
+        "num_layers": 12,
+        "num_heads": 12,
+        "hidden_dim": 768,
+        "mlp_dim": 3072,
+    }
+
+    return _vision_transformer(weights=checked_weights, progress=progress, **arch_config, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", ViT_L_16_Weights.IMAGENET1K_V1))
+def vit_l_16(*, weights: Optional[ViT_L_16_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
+    """
+    Build the vit_l_16 model described in
+    `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.ViT_L_16_Weights`, optional): Pretrained weights to
+            load. The possible values are listed under :class:`~torchvision.models.ViT_L_16_Weights`
+            below. When omitted, no pre-trained weights are used.
+        progress (bool, optional): Whether to show a download progress bar on stderr. Default is True.
+        **kwargs: Extra arguments forwarded to the ``torchvision.models.vision_transformer.VisionTransformer``
+            base class; see the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/vision_transformer.py>`_
+            for details.
+
+    .. autoclass:: torchvision.models.ViT_L_16_Weights
+        :members:
+    """
+    checked_weights = ViT_L_16_Weights.verify(weights)
+
+    # Hyper-parameters fixed by this variant; a clashing key in ``kwargs`` still raises TypeError.
+    arch_config = {
+        "patch_size": 16,
+        "num_layers": 24,
+        "num_heads": 16,
+        "hidden_dim": 1024,
+        "mlp_dim": 4096,
+    }
+
+    return _vision_transformer(weights=checked_weights, progress=progress, **arch_config, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", ViT_L_32_Weights.IMAGENET1K_V1))
+def vit_l_32(*, weights: Optional[ViT_L_32_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
+    """
+    Build the vit_l_32 model described in
+    `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.ViT_L_32_Weights`, optional): Pretrained weights to
+            load. The possible values are listed under :class:`~torchvision.models.ViT_L_32_Weights`
+            below. When omitted, no pre-trained weights are used.
+        progress (bool, optional): Whether to show a download progress bar on stderr. Default is True.
+        **kwargs: Extra arguments forwarded to the ``torchvision.models.vision_transformer.VisionTransformer``
+            base class; see the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/vision_transformer.py>`_
+            for details.
+
+    .. autoclass:: torchvision.models.ViT_L_32_Weights
+        :members:
+    """
+    checked_weights = ViT_L_32_Weights.verify(weights)
+
+    # Hyper-parameters fixed by this variant; a clashing key in ``kwargs`` still raises TypeError.
+    arch_config = {
+        "patch_size": 32,
+        "num_layers": 24,
+        "num_heads": 16,
+        "hidden_dim": 1024,
+        "mlp_dim": 4096,
+    }
+
+    return _vision_transformer(weights=checked_weights, progress=progress, **arch_config, **kwargs)
+
+
+@register_model()
+@handle_legacy_interface(weights=("pretrained", None))
+def vit_h_14(*, weights: Optional[ViT_H_14_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
+    """
+    Build the vit_h_14 model described in
+    `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`_.
+
+    Args:
+        weights (:class:`~torchvision.models.ViT_H_14_Weights`, optional): Pretrained weights to
+            load. The possible values are listed under :class:`~torchvision.models.ViT_H_14_Weights`
+            below. When omitted, no pre-trained weights are used.
+        progress (bool, optional): Whether to show a download progress bar on stderr. Default is True.
+        **kwargs: Extra arguments forwarded to the ``torchvision.models.vision_transformer.VisionTransformer``
+            base class; see the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/vision_transformer.py>`_
+            for details.
+
+    .. autoclass:: torchvision.models.ViT_H_14_Weights
+        :members:
+    """
+    checked_weights = ViT_H_14_Weights.verify(weights)
+
+    # Hyper-parameters fixed by this variant; a clashing key in ``kwargs`` still raises TypeError.
+    arch_config = {
+        "patch_size": 14,
+        "num_layers": 32,
+        "num_heads": 16,
+        "hidden_dim": 1280,
+        "mlp_dim": 5120,
+    }
+
+    return _vision_transformer(weights=checked_weights, progress=progress, **arch_config, **kwargs)
+
+
+def interpolate_embeddings(
+    image_size: int,
+    patch_size: int,
+    model_state: "OrderedDict[str, torch.Tensor]",
+    interpolation_mode: str = "bicubic",
+    reset_heads: bool = False,
+) -> "OrderedDict[str, torch.Tensor]":
+    """Resize the positional embedding of a checkpoint so it fits a model with a different input resolution.
+
+    Args:
+        image_size (int): Image size of the new model.
+        patch_size (int): Patch size of the new model.
+        model_state (OrderedDict[str, torch.Tensor]): State dict of the pre-trained model. When interpolation
+            is needed, its ``"encoder.pos_embedding"`` entry is overwritten in place.
+        interpolation_mode (str): The algorithm used for upsampling. Default: bicubic.
+        reset_heads (bool): If true, entries whose key starts with ``"heads"`` are left out of the result.
+            Only applied when the sequence length changes (i.e. when interpolation happens). Default: False.
+
+    Returns:
+        OrderedDict[str, torch.Tensor]: A state dict which can be loaded into the new model.
+
+    Raises:
+        ValueError: If the embedding's leading dim is not 1 or its image part is not a perfect square.
+    """
+    # Shape of pos_embedding is (1, seq_length, hidden_dim)
+    pos_embedding = model_state["encoder.pos_embedding"]
+    n, seq_length, hidden_dim = pos_embedding.shape
+    if n != 1:
+        raise ValueError(f"Unexpected position embedding shape: {pos_embedding.shape}")
+
+    new_seq_length = (image_size // patch_size) ** 2 + 1
+
+    # Need to interpolate the weights for the position embedding.
+    # We do this by reshaping the positions embeddings to a 2d grid, performing
+    # an interpolation in the (h, w) space and then reshaping back to a 1d grid.
+    if new_seq_length != seq_length:
+        # The class token embedding shouldn't be interpolated, so we split it up.
+        seq_length -= 1
+        new_seq_length -= 1
+        pos_embedding_token = pos_embedding[:, :1, :]
+        pos_embedding_img = pos_embedding[:, 1:, :]
+
+        # (1, seq_length, hidden_dim) -> (1, hidden_dim, seq_length)
+        pos_embedding_img = pos_embedding_img.permute(0, 2, 1)
+        seq_length_1d = int(math.sqrt(seq_length))
+        if seq_length_1d * seq_length_1d != seq_length:
+            raise ValueError(
+                f"seq_length is not a perfect square! Instead got seq_length_1d * seq_length_1d = {seq_length_1d * seq_length_1d } and seq_length = {seq_length}"
+            )
+
+        # (1, hidden_dim, seq_length) -> (1, hidden_dim, seq_l_1d, seq_l_1d)
+        pos_embedding_img = pos_embedding_img.reshape(1, hidden_dim, seq_length_1d, seq_length_1d)
+        new_seq_length_1d = image_size // patch_size
+
+        # align_corners is only valid for the linear-family modes; e.g. "nearest"/"area" raise if it is set.
+        align_corners = True if interpolation_mode in ("linear", "bilinear", "bicubic", "trilinear") else None
+        # (1, hidden_dim, seq_l_1d, seq_l_1d) -> (1, hidden_dim, new_seq_l_1d, new_seq_l_1d)
+        new_pos_embedding_img = nn.functional.interpolate(
+            pos_embedding_img, size=new_seq_length_1d, mode=interpolation_mode, align_corners=align_corners
+        )
+
+        # (1, hidden_dim, new_seq_l_1d, new_seq_l_1d) -> (1, hidden_dim, new_seq_length)
+        new_pos_embedding_img = new_pos_embedding_img.reshape(1, hidden_dim, new_seq_length)
+        # (1, hidden_dim, new_seq_length) -> (1, new_seq_length, hidden_dim)
+        new_pos_embedding_img = new_pos_embedding_img.permute(0, 2, 1)
+        new_pos_embedding = torch.cat([pos_embedding_token, new_pos_embedding_img], dim=1)
+        model_state["encoder.pos_embedding"] = new_pos_embedding
+
+        if reset_heads:  # NOTE(review): nested here, so heads survive when no interpolation happens - intended?
+            model_state_copy: "OrderedDict[str, torch.Tensor]" = OrderedDict()
+            for k, v in model_state.items():
+                if not k.startswith("heads"):
+                    model_state_copy[k] = v
+            model_state = model_state_copy
+
+    return model_state
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..827505b842d4f1ad0e16dfe54ef28658364cc9ac
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__init__.py
@@ -0,0 +1,73 @@
+from ._register_onnx_ops import _register_custom_op
+from .boxes import (
+    batched_nms,
+    box_area,
+    box_convert,
+    box_iou,
+    clip_boxes_to_image,
+    complete_box_iou,
+    distance_box_iou,
+    generalized_box_iou,
+    masks_to_boxes,
+    nms,
+    remove_small_boxes,
+)
+from .ciou_loss import complete_box_iou_loss
+from .deform_conv import deform_conv2d, DeformConv2d
+from .diou_loss import distance_box_iou_loss
+from .drop_block import drop_block2d, drop_block3d, DropBlock2d, DropBlock3d
+from .feature_pyramid_network import FeaturePyramidNetwork
+from .focal_loss import sigmoid_focal_loss
+from .giou_loss import generalized_box_iou_loss
+from .misc import Conv2dNormActivation, Conv3dNormActivation, FrozenBatchNorm2d, MLP, Permute, SqueezeExcitation
+from .poolers import MultiScaleRoIAlign
+from .ps_roi_align import ps_roi_align, PSRoIAlign
+from .ps_roi_pool import ps_roi_pool, PSRoIPool
+from .roi_align import roi_align, RoIAlign
+from .roi_pool import roi_pool, RoIPool
+from .stochastic_depth import stochastic_depth, StochasticDepth
+
+_register_custom_op()  # import-time side effect: calls the helper imported from ._register_onnx_ops above
+
+
+__all__ = [  # public names of this package; every entry is one of the names imported at the top of this module
+    "masks_to_boxes",
+    "deform_conv2d",
+    "DeformConv2d",
+    "nms",
+    "batched_nms",
+    "remove_small_boxes",
+    "clip_boxes_to_image",
+    "box_convert",
+    "box_area",
+    "box_iou",
+    "generalized_box_iou",
+    "distance_box_iou",
+    "complete_box_iou",
+    "roi_align",
+    "RoIAlign",
+    "roi_pool",
+    "RoIPool",
+    "ps_roi_align",
+    "PSRoIAlign",
+    "ps_roi_pool",
+    "PSRoIPool",
+    "MultiScaleRoIAlign",
+    "FeaturePyramidNetwork",
+    "sigmoid_focal_loss",
+    "stochastic_depth",
+    "StochasticDepth",
+    "FrozenBatchNorm2d",
+    "Conv2dNormActivation",
+    "Conv3dNormActivation",
+    "SqueezeExcitation",
+    "MLP",
+    "Permute",
+    "generalized_box_iou_loss",
+    "distance_box_iou_loss",
+    "complete_box_iou_loss",
+    "drop_block2d",
+    "DropBlock2d",
+    "drop_block3d",
+    "DropBlock3d",
+]
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa5e7cab85dd94849a19f211543edc910f8ab3c3
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/_box_convert.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/_box_convert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7a249ba0b1ebb6b3b4452543a2f33f9f420d2cce
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/_box_convert.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/_register_onnx_ops.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/_register_onnx_ops.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..35727212d99e475ac85b62ee4d2d9a1d9d923c65
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/_register_onnx_ops.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/_utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a802f37094663c58c5ad692d007da749eaf04f9c
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/_utils.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/boxes.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/boxes.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d569b5dcff203db377fd06bd9fd636d5bea800d0
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/boxes.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/ciou_loss.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/ciou_loss.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d2efbb53668b4d4091da9d6ece27ecea31dd3329
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/ciou_loss.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/deform_conv.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/deform_conv.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c1efb819283a2e4340d7e9bbfee2edb1b4b69c52
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/deform_conv.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/diou_loss.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/diou_loss.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..78d68bcdf288e09f0725756f6710ff8fc88f3a5a
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/diou_loss.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/drop_block.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/drop_block.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b2aab397c4ec5436e829c245ceca528295ac7ed4
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/drop_block.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/feature_pyramid_network.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/feature_pyramid_network.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..38e442fa6a24969914a2f87b5039fa579eaf691e
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/feature_pyramid_network.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/focal_loss.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/focal_loss.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..668078550ea9350089edcc0dda4400bfed05efd9
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/focal_loss.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/giou_loss.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/giou_loss.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ba58c3d25142ba5c4581410db57db9c60b643b6
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/giou_loss.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/misc.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/misc.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..52cee7cf08deaadda57379f0ab698985aa5f53d9
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/misc.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/poolers.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/poolers.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae4c19f432eda6cf53af787671ad2feb5eb2b3ed
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/poolers.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/ps_roi_align.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/ps_roi_align.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b44433e0be65e97418dfd5bdd95b75aaf9accb88
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/ps_roi_align.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/ps_roi_pool.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/ps_roi_pool.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..10d9cd5cd6f19c639545fc78cf359227aa62265a
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/ps_roi_pool.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/roi_align.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/roi_align.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e68cfff44eef83ddafcd9f4a06db03d16188ef1a
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/roi_align.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/roi_pool.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/roi_pool.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1cab7f46864fa743cc7fe52281bf836d79b66b29
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/roi_pool.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/stochastic_depth.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/stochastic_depth.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3ccab95dc0c7173a4cf1f88c33f34dc36dbc4a11
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/__pycache__/stochastic_depth.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/_box_convert.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/_box_convert.py
new file mode 100644
index 0000000000000000000000000000000000000000..81406248020b5e284b8d4a6ae8bd6528bb12c58a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/_box_convert.py
@@ -0,0 +1,207 @@
+import torch
+from torch import Tensor
+
+
+def _box_cxcywh_to_xyxy(boxes: Tensor) -> Tensor:
+    """
+    Converts bounding boxes from (cx, cy, w, h) format to (x1, y1, x2, y2) format.
+    (cx, cy) refers to center of bounding box
+    (w, h) are width and height of bounding box
+    Args:
+        boxes (Tensor[N, 4]): boxes in (cx, cy, w, h) format which will be converted.
+
+    Returns:
+        boxes (Tensor(N, 4)): boxes in (x1, y1, x2, y2) format.
+    """
+    # We need to change all 4 of them so some temporary variable is needed.
+    cx, cy, w, h = boxes.unbind(-1)
+    x1 = cx - 0.5 * w
+    y1 = cy - 0.5 * h
+    x2 = cx + 0.5 * w
+    y2 = cy + 0.5 * h
+
+    boxes = torch.stack((x1, y1, x2, y2), dim=-1)
+
+    return boxes
+
+
+def _box_xyxy_to_cxcywh(boxes: Tensor) -> Tensor:
+    """
+    Converts bounding boxes from (x1, y1, x2, y2) format to (cx, cy, w, h) format.
+    (x1, y1) refer to top left of bounding box
+    (x2, y2) refer to bottom right of bounding box
+    Args:
+        boxes (Tensor[N, 4]): boxes in (x1, y1, x2, y2) format which will be converted.
+
+    Returns:
+        boxes (Tensor(N, 4)): boxes in (cx, cy, w, h) format.
+    """
+    x1, y1, x2, y2 = boxes.unbind(-1)
+    cx = (x1 + x2) / 2
+    cy = (y1 + y2) / 2
+    w = x2 - x1
+    h = y2 - y1
+
+    boxes = torch.stack((cx, cy, w, h), dim=-1)
+
+    return boxes
+
+
+def _box_xywh_to_xyxy(boxes: Tensor) -> Tensor:
+    """
+    Converts bounding boxes from (x, y, w, h) format to (x1, y1, x2, y2) format.
+    (x, y) refers to top left of bounding box.
+    (w, h) refers to width and height of box.
+    Args:
+        boxes (Tensor[N, 4]): boxes in (x, y, w, h) which will be converted.
+
+    Returns:
+        boxes (Tensor[N, 4]): boxes in (x1, y1, x2, y2) format.
+    """
+    x, y, w, h = boxes.unbind(-1)
+    boxes = torch.stack([x, y, x + w, y + h], dim=-1)
+    return boxes
+
+
+def _box_xyxy_to_xywh(boxes: Tensor) -> Tensor:
+    """
+    Converts bounding boxes from (x1, y1, x2, y2) format to (x, y, w, h) format.
+    (x1, y1) refer to top left of bounding box
+    (x2, y2) refer to bottom right of bounding box
+    Args:
+        boxes (Tensor[N, 4]): boxes in (x1, y1, x2, y2) which will be converted.
+
+    Returns:
+        boxes (Tensor[N, 4]): boxes in (x, y, w, h) format.
+    """
+    x1, y1, x2, y2 = boxes.unbind(-1)
+    w = x2 - x1  # x2 - x1
+    h = y2 - y1  # y2 - y1
+    boxes = torch.stack((x1, y1, w, h), dim=-1)
+    return boxes
+
+
+def _box_cxcywhr_to_xywhr(boxes: Tensor) -> Tensor:
+    """
+    Converts rotated bounding boxes from (cx, cy, w, h, r) format to (x1, y1, w, h, r) format.
+    (cx, cy) refers to center of bounding box
+    (w, h) refers to width and height of rotated bounding box
+    (x1, y1) refers to top left of rotated bounding box
+    r is rotation angle w.r.t. the box center by :math:`|r|` degrees counter-clockwise in the image plane
+    Args:
+        boxes (Tensor[N, 5]): boxes in (cx, cy, w, h, r) format which will be converted.
+
+    Returns:
+        boxes (Tensor(N, 5)): rotated boxes in (x1, y1, w, h, r) format.
+    """
+    dtype = boxes.dtype
+    need_cast = not boxes.is_floating_point()
+    cx, cy, w, h, r = boxes.unbind(-1)
+    r_rad = r * torch.pi / 180.0
+    cos, sin = torch.cos(r_rad), torch.sin(r_rad)
+
+    x1 = cx - w / 2 * cos - h / 2 * sin
+    y1 = cy - h / 2 * cos + w / 2 * sin
+    boxes = torch.stack((x1, y1, w, h, r), dim=-1)
+
+    if need_cast:
+        boxes.round_()
+        boxes = boxes.to(dtype)
+    return boxes
+
+
+def _box_xywhr_to_cxcywhr(boxes: Tensor) -> Tensor:
+    """
+    Converts rotated bounding boxes from (x1, y1, w, h, r) format to (cx, cy, w, h, r) format.
+    (x1, y1) refers to top left of rotated bounding box
+    (w, h) refers to width and height of rotated bounding box
+    r is rotation angle w.r.t. the box center by :math:`|r|` degrees counter-clockwise in the image plane
+    Args:
+        boxes (Tensor[N, 5]): rotated boxes in (x1, y1, w, h, r) format which will be converted.
+
+    Returns:
+        boxes (Tensor[N, 5]): rotated boxes in (cx, cy, w, h, r) format.
+    """
+    dtype = boxes.dtype
+    need_cast = not boxes.is_floating_point()
+    x1, y1, w, h, r = boxes.unbind(-1)
+    r_rad = r * torch.pi / 180.0
+    cos, sin = torch.cos(r_rad), torch.sin(r_rad)
+
+    cx = x1 + w / 2 * cos + h / 2 * sin
+    cy = y1 - w / 2 * sin + h / 2 * cos
+
+    boxes = torch.stack([cx, cy, w, h, r], dim=-1)
+    if need_cast:
+        boxes.round_()
+        boxes = boxes.to(dtype)
+    return boxes
+
+
+def _box_xywhr_to_xyxyxyxy(boxes: Tensor) -> Tensor:
+    """
+    Converts rotated bounding boxes from (x1, y1, w, h, r) format to (x1, y1, x2, y2, x3, y3, x4, y4) format.
+    (x1, y1) refer to top left of bounding box
+    (w, h) are width and height of the rotated bounding box
+    r is rotation angle w.r.t. the box center by :math:`|r|` degrees counter-clockwise in the image plane
+
+    (x1, y1) refer to top left of rotated bounding box
+    (x2, y2) refer to top right of rotated bounding box
+    (x3, y3) refer to bottom right of rotated bounding box
+    (x4, y4) refer to bottom left of rotated bounding box
+    Args:
+        boxes (Tensor[N, 5]): rotated boxes in (x1, y1, w, h, r) format which will be converted.
+
+    Returns:
+        boxes (Tensor(N, 8)): rotated boxes in (x1, y1, x2, y2, x3, y3, x4, y4) format.
+    """
+    dtype = boxes.dtype
+    need_cast = not boxes.is_floating_point()
+    x1, y1, w, h, r = boxes.unbind(-1)
+    r_rad = r * torch.pi / 180.0
+    cos, sin = torch.cos(r_rad), torch.sin(r_rad)
+
+    x2 = x1 + w * cos
+    y2 = y1 - w * sin
+    x3 = x2 + h * sin
+    y3 = y2 + h * cos
+    x4 = x1 + h * sin
+    y4 = y1 + h * cos
+
+    boxes = torch.stack((x1, y1, x2, y2, x3, y3, x4, y4), dim=-1)
+    if need_cast:
+        boxes.round_()
+        boxes = boxes.to(dtype)
+    return boxes
+
+
+def _box_xyxyxyxy_to_xywhr(boxes: Tensor) -> Tensor:
+    """
+    Converts rotated bounding boxes from (x1, y1, x2, y2, x3, y3, x4, y4) format to (x1, y1, w, h, r) format.
+    (x1, y1) refer to top left of the rotated bounding box
+    (x2, y2) refer to top right of the rotated bounding box
+    (x3, y3) refer to bottom right of the rotated bounding box
+    (x4, y4) refer to bottom left of the rotated bounding box
+    (w, h) refers to width and height of rotated bounding box
+    r is rotation angle w.r.t. the box center by :math:`|r|` degrees counter-clockwise in the image plane
+
+    Args:
+        boxes (Tensor(N, 8)): rotated boxes in (x1, y1, x2, y2, x3, y3, x4, y4) format.
+
+    Returns:
+        boxes (Tensor[N, 5]): rotated boxes in (x1, y1, w, h, r) format.
+    """
+    dtype = boxes.dtype
+    need_cast = not boxes.is_floating_point()
+    x1, y1, x2, y2, x3, y3, x4, y4 = boxes.unbind(-1)
+    r_rad = torch.atan2(y1 - y2, x2 - x1)
+    r = r_rad * 180 / torch.pi
+
+    w = ((x2 - x1) ** 2 + (y1 - y2) ** 2).sqrt()
+    h = ((x3 - x2) ** 2 + (y3 - y2) ** 2).sqrt()
+
+    boxes = torch.stack((x1, y1, w, h, r), dim=-1)
+    if need_cast:
+        boxes.round_()
+        boxes = boxes.to(dtype)
+    return boxes
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/_register_onnx_ops.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/_register_onnx_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dd263a5d8ef497becc4aa39252a93c913b84880
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/_register_onnx_ops.py
@@ -0,0 +1,107 @@
+import sys
+import warnings
+
+import torch
+from torch.onnx import symbolic_opset11 as opset11
+from torch.onnx.symbolic_helper import parse_args
+
+_ONNX_OPSET_VERSION_11 = 11
+_ONNX_OPSET_VERSION_16 = 16
+BASE_ONNX_OPSET_VERSION = _ONNX_OPSET_VERSION_11
+
+
+@parse_args("v", "v", "f")
+def symbolic_multi_label_nms(g, boxes, scores, iou_threshold):
+    boxes = opset11.unsqueeze(g, boxes, 0)
+    scores = opset11.unsqueeze(g, opset11.unsqueeze(g, scores, 0), 0)
+    max_output_per_class = g.op("Constant", value_t=torch.tensor([sys.maxsize], dtype=torch.long))
+    iou_threshold = g.op("Constant", value_t=torch.tensor([iou_threshold], dtype=torch.float))
+
+    # Cast boxes and scores to float32 in case they are float64 inputs
+    nms_out = g.op(
+        "NonMaxSuppression",
+        g.op("Cast", boxes, to_i=torch.onnx.TensorProtoDataType.FLOAT),
+        g.op("Cast", scores, to_i=torch.onnx.TensorProtoDataType.FLOAT),
+        max_output_per_class,
+        iou_threshold,
+    )
+    return opset11.squeeze(
+        g, opset11.select(g, nms_out, 1, g.op("Constant", value_t=torch.tensor([2], dtype=torch.long))), 1
+    )
+
+
+def _process_batch_indices_for_roi_align(g, rois):
+    indices = opset11.squeeze(
+        g, opset11.select(g, rois, 1, g.op("Constant", value_t=torch.tensor([0], dtype=torch.long))), 1
+    )
+    return g.op("Cast", indices, to_i=torch.onnx.TensorProtoDataType.INT64)
+
+
+def _process_rois_for_roi_align(g, rois):
+    return opset11.select(g, rois, 1, g.op("Constant", value_t=torch.tensor([1, 2, 3, 4], dtype=torch.long)))
+
+
+def _process_sampling_ratio_for_roi_align(g, sampling_ratio: int):
+    if sampling_ratio < 0:
+        warnings.warn(
+            "ONNX export for RoIAlign with a non-zero sampling_ratio is not supported. "
+            "The model will be exported with a sampling_ratio of 0."
+        )
+        sampling_ratio = 0
+    return sampling_ratio
+
+
+@parse_args("v", "v", "f", "i", "i", "i", "i")
+def roi_align_opset11(g, input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned):
+    batch_indices = _process_batch_indices_for_roi_align(g, rois)
+    rois = _process_rois_for_roi_align(g, rois)
+    if aligned:
+        warnings.warn(
+            "ROIAlign with aligned=True is only supported in opset >= 16. "
+            "Please export with opset 16 or higher, or use aligned=False."
+        )
+    sampling_ratio = _process_sampling_ratio_for_roi_align(g, sampling_ratio)
+    return g.op(
+        "RoiAlign",
+        input,
+        rois,
+        batch_indices,
+        spatial_scale_f=spatial_scale,
+        output_height_i=pooled_height,
+        output_width_i=pooled_width,
+        sampling_ratio_i=sampling_ratio,
+    )
+
+
+@parse_args("v", "v", "f", "i", "i", "i", "i")
+def roi_align_opset16(g, input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned):
+    batch_indices = _process_batch_indices_for_roi_align(g, rois)
+    rois = _process_rois_for_roi_align(g, rois)
+    coordinate_transformation_mode = "half_pixel" if aligned else "output_half_pixel"
+    sampling_ratio = _process_sampling_ratio_for_roi_align(g, sampling_ratio)
+    return g.op(
+        "RoiAlign",
+        input,
+        rois,
+        batch_indices,
+        coordinate_transformation_mode_s=coordinate_transformation_mode,
+        spatial_scale_f=spatial_scale,
+        output_height_i=pooled_height,
+        output_width_i=pooled_width,
+        sampling_ratio_i=sampling_ratio,
+    )
+
+
+@parse_args("v", "v", "f", "i", "i")
+def roi_pool(g, input, rois, spatial_scale, pooled_height, pooled_width):
+    roi_pool = g.op(
+        "MaxRoiPool", input, rois, pooled_shape_i=(pooled_height, pooled_width), spatial_scale_f=spatial_scale
+    )
+    return roi_pool, None
+
+
+def _register_custom_op():
+    torch.onnx.register_custom_op_symbolic("torchvision::nms", symbolic_multi_label_nms, _ONNX_OPSET_VERSION_11)
+    torch.onnx.register_custom_op_symbolic("torchvision::roi_align", roi_align_opset11, _ONNX_OPSET_VERSION_11)
+    torch.onnx.register_custom_op_symbolic("torchvision::roi_align", roi_align_opset16, _ONNX_OPSET_VERSION_16)
+    torch.onnx.register_custom_op_symbolic("torchvision::roi_pool", roi_pool, _ONNX_OPSET_VERSION_11)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/_utils.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..40bae605d028d3f522531711a1e28298b63ffbfc
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/_utils.py
@@ -0,0 +1,106 @@
+from typing import Optional, Union
+
+import torch
+from torch import nn, Tensor
+
+
+def _cat(tensors: list[Tensor], dim: int = 0) -> Tensor:
+    """
+    Efficient version of torch.cat that avoids a copy if there is only a single element in a list
+    """
+    # TODO add back the assert
+    # assert isinstance(tensors, (list, tuple))
+    if len(tensors) == 1:
+        return tensors[0]
+    return torch.cat(tensors, dim)
+
+
+def convert_boxes_to_roi_format(boxes: list[Tensor]) -> Tensor:
+    concat_boxes = _cat([b for b in boxes], dim=0)
+    temp = []
+    for i, b in enumerate(boxes):
+        temp.append(torch.full_like(b[:, :1], i))
+    ids = _cat(temp, dim=0)
+    rois = torch.cat([ids, concat_boxes], dim=1)
+    return rois
+
+
+def check_roi_boxes_shape(boxes: Union[Tensor, list[Tensor]]):
+    if isinstance(boxes, (list, tuple)):
+        for _tensor in boxes:
+            torch._assert(
+                _tensor.size(1) == 4, "The shape of the tensor in the boxes list is not correct as List[Tensor[L, 4]]"
+            )
+    elif isinstance(boxes, torch.Tensor):
+        torch._assert(boxes.size(1) == 5, "The boxes tensor shape is not correct as Tensor[K, 5]")
+    else:
+        torch._assert(False, "boxes is expected to be a Tensor[L, 5] or a List[Tensor[K, 4]]")
+    return
+
+
+def split_normalization_params(
+    model: nn.Module, norm_classes: Optional[list[type]] = None
+) -> tuple[list[Tensor], list[Tensor]]:
+    # Adapted from https://github.com/facebookresearch/ClassyVision/blob/659d7f78/classy_vision/generic/util.py#L501
+    if not norm_classes:
+        norm_classes = [
+            nn.modules.batchnorm._BatchNorm,
+            nn.LayerNorm,
+            nn.GroupNorm,
+            nn.modules.instancenorm._InstanceNorm,
+            nn.LocalResponseNorm,
+        ]
+
+    for t in norm_classes:
+        if not issubclass(t, nn.Module):
+            raise ValueError(f"Class {t} is not a subclass of nn.Module.")
+
+    classes = tuple(norm_classes)
+
+    norm_params = []
+    other_params = []
+    for module in model.modules():
+        if next(module.children(), None):
+            other_params.extend(p for p in module.parameters(recurse=False) if p.requires_grad)
+        elif isinstance(module, classes):
+            norm_params.extend(p for p in module.parameters() if p.requires_grad)
+        else:
+            other_params.extend(p for p in module.parameters() if p.requires_grad)
+    return norm_params, other_params
+
+
+def _upcast(t: Tensor) -> Tensor:
+    # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
+    if t.is_floating_point():
+        return t if t.dtype in (torch.float32, torch.float64) else t.float()
+    else:
+        return t if t.dtype in (torch.int32, torch.int64) else t.int()
+
+
+def _upcast_non_float(t: Tensor) -> Tensor:
+    # Protects from numerical overflows in multiplications by casting any non-float32/float64 tensor (including integer tensors) to float
+    if t.dtype not in (torch.float32, torch.float64):
+        return t.float()
+    return t
+
+
+def _loss_inter_union(
+    boxes1: torch.Tensor,
+    boxes2: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+
+    x1, y1, x2, y2 = boxes1.unbind(dim=-1)
+    x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1)
+
+    # Intersection keypoints
+    xkis1 = torch.max(x1, x1g)
+    ykis1 = torch.max(y1, y1g)
+    xkis2 = torch.min(x2, x2g)
+    ykis2 = torch.min(y2, y2g)
+
+    intsctk = torch.zeros_like(x1)
+    mask = (ykis2 > ykis1) & (xkis2 > xkis1)
+    intsctk[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask])
+    unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsctk
+
+    return intsctk, unionk
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/boxes.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/boxes.py
new file mode 100644
index 0000000000000000000000000000000000000000..54f8d6b86e9720ca4656c965b565b623204b2064
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/boxes.py
@@ -0,0 +1,520 @@
+import torch
+import torchvision
+from torch import Tensor
+from torchvision.extension import _assert_has_ops
+
+from ..utils import _log_api_usage_once
+from ._box_convert import (
+    _box_cxcywh_to_xyxy,
+    _box_cxcywhr_to_xywhr,
+    _box_xywh_to_xyxy,
+    _box_xywhr_to_cxcywhr,
+    _box_xywhr_to_xyxyxyxy,
+    _box_xyxy_to_cxcywh,
+    _box_xyxy_to_xywh,
+    _box_xyxyxyxy_to_xywhr,
+)
+from ._utils import _upcast
+
+
+def nms(boxes: Tensor, scores: Tensor, iou_threshold: float) -> Tensor:
+    """
+    Performs non-maximum suppression (NMS) on the boxes according
+    to their intersection-over-union (IoU).
+
+    NMS iteratively removes lower scoring boxes which have an
+    IoU greater than ``iou_threshold`` with another (higher scoring)
+    box.
+
+    If multiple boxes have the exact same score and satisfy the IoU
+    criterion with respect to a reference box, the selected box is
+    not guaranteed to be the same between CPU and GPU. This is similar
+    to the behavior of argsort in PyTorch when repeated values are present.
+
+    Args:
+        boxes (Tensor[N, 4]): boxes to perform NMS on. They
+            are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and
+            ``0 <= y1 < y2``.
+        scores (Tensor[N]): scores for each one of the boxes
+        iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold
+
+    Returns:
+        Tensor: int64 tensor with the indices of the elements that have been kept
+        by NMS, sorted in decreasing order of scores
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(nms)
+    _assert_has_ops()
+    return torch.ops.torchvision.nms(boxes, scores, iou_threshold)
+
+
+def batched_nms(
+    boxes: Tensor,
+    scores: Tensor,
+    idxs: Tensor,
+    iou_threshold: float,
+) -> Tensor:
+    """
+    Performs non-maximum suppression in a batched fashion.
+
+    Each index value corresponds to a category, and NMS
+    will not be applied between elements of different categories.
+
+    Args:
+        boxes (Tensor[N, 4]): boxes where NMS will be performed. They
+            are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and
+            ``0 <= y1 < y2``.
+        scores (Tensor[N]): scores for each one of the boxes
+        idxs (Tensor[N]): indices of the categories for each one of the boxes.
+        iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold
+
+    Returns:
+        Tensor: int64 tensor with the indices of the elements that have been kept by NMS, sorted
+        in decreasing order of scores
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(batched_nms)
+    # Benchmarks that drove the following thresholds are at
+    # https://github.com/pytorch/vision/issues/1311#issuecomment-781329339
+    # and https://github.com/pytorch/vision/pull/8925
+    if boxes.numel() > (4000 if boxes.device.type == "cpu" else 100_000) and not torchvision._is_tracing():
+        return _batched_nms_vanilla(boxes, scores, idxs, iou_threshold)
+    else:
+        return _batched_nms_coordinate_trick(boxes, scores, idxs, iou_threshold)
+
+
+@torch.jit._script_if_tracing
+def _batched_nms_coordinate_trick(
+    boxes: Tensor,
+    scores: Tensor,
+    idxs: Tensor,
+    iou_threshold: float,
+) -> Tensor:
+    # strategy: in order to perform NMS independently per class,
+    # we add an offset to all the boxes. The offset is dependent
+    # only on the class idx, and is large enough so that boxes
+    # from different classes do not overlap
+    if boxes.numel() == 0:
+        return torch.empty((0,), dtype=torch.int64, device=boxes.device)
+    max_coordinate = boxes.max()
+    offsets = idxs.to(boxes) * (max_coordinate + torch.tensor(1).to(boxes))
+    boxes_for_nms = boxes + offsets[:, None]
+    keep = nms(boxes_for_nms, scores, iou_threshold)
+    return keep
+
+
+@torch.jit._script_if_tracing
+def _batched_nms_vanilla(
+    boxes: Tensor,
+    scores: Tensor,
+    idxs: Tensor,
+    iou_threshold: float,
+) -> Tensor:
+    # Based on Detectron2 implementation, just manually call nms() on each class independently
+    keep_mask = torch.zeros_like(scores, dtype=torch.bool)
+    for class_id in torch.unique(idxs):
+        curr_indices = torch.where(idxs == class_id)[0]
+        curr_keep_indices = nms(boxes[curr_indices], scores[curr_indices], iou_threshold)
+        keep_mask[curr_indices[curr_keep_indices]] = True
+    keep_indices = torch.where(keep_mask)[0]
+    return keep_indices[scores[keep_indices].sort(descending=True)[1]]
+
+
+def remove_small_boxes(boxes: Tensor, min_size: float) -> Tensor:
+    """
+    Remove every box from ``boxes`` which contains at least one side length
+    that is smaller than ``min_size``.
+
+    .. note::
+        For sanitizing a :class:`~torchvision.tv_tensors.BoundingBoxes` object, consider using
+        the transform :func:`~torchvision.transforms.v2.SanitizeBoundingBoxes` instead.
+
+    Args:
+        boxes (Tensor[..., 4]): boxes in ``(x1, y1, x2, y2)`` format
+            with ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+        min_size (float): minimum size
+
+    Returns:
+        Tensor[K]: indices of the boxes that have both sides
+        larger than or equal to ``min_size``
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(remove_small_boxes)
+    ws, hs = boxes[..., 2] - boxes[..., 0], boxes[..., 3] - boxes[..., 1]
+    keep = (ws >= min_size) & (hs >= min_size)
+    keep = torch.where(keep)[0]
+    return keep
+
+
+def clip_boxes_to_image(boxes: Tensor, size: tuple[int, int]) -> Tensor:
+    """
+    Clip boxes so that they lie inside an image of size ``size``.
+
+    .. note::
+        For clipping a :class:`~torchvision.tv_tensors.BoundingBoxes` object, consider using
+        the transform :func:`~torchvision.transforms.v2.ClampBoundingBoxes` instead.
+
+    Args:
+        boxes (Tensor[..., 4]): boxes in ``(x1, y1, x2, y2)`` format
+            with ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+        size (Tuple[height, width]): size of the image
+
+    Returns:
+        Tensor[..., 4]: clipped boxes
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(clip_boxes_to_image)
+    dim = boxes.dim()
+    boxes_x = boxes[..., 0::2]
+    boxes_y = boxes[..., 1::2]
+    height, width = size
+
+    if torchvision._is_tracing():
+        boxes_x = torch.max(boxes_x, torch.tensor(0, dtype=boxes.dtype, device=boxes.device))
+        boxes_x = torch.min(boxes_x, torch.tensor(width, dtype=boxes.dtype, device=boxes.device))
+        boxes_y = torch.max(boxes_y, torch.tensor(0, dtype=boxes.dtype, device=boxes.device))
+        boxes_y = torch.min(boxes_y, torch.tensor(height, dtype=boxes.dtype, device=boxes.device))
+    else:
+        boxes_x = boxes_x.clamp(min=0, max=width)
+        boxes_y = boxes_y.clamp(min=0, max=height)
+
+    clipped_boxes = torch.stack((boxes_x, boxes_y), dim=dim)
+    return clipped_boxes.reshape(boxes.shape)
+
+
+def box_convert(boxes: Tensor, in_fmt: str, out_fmt: str) -> Tensor:
+    """
+    Converts :class:`torch.Tensor` boxes from a given ``in_fmt`` to ``out_fmt``.
+
+    .. note::
+        For converting a :class:`torch.Tensor` or a :class:`~torchvision.tv_tensors.BoundingBoxes` object
+        between different formats,
+        consider using :func:`~torchvision.transforms.v2.functional.convert_bounding_box_format` instead.
+        Or see the corresponding transform :func:`~torchvision.transforms.v2.ConvertBoundingBoxFormat`.
+
+    Supported ``in_fmt`` and ``out_fmt`` strings are:
+
+    ``'xyxy'``: boxes are represented via corners, x1, y1 being top left and x2, y2 being bottom right.
+    This is the format that torchvision utilities expect.
+
+    ``'xywh'``: boxes are represented via corner, width and height, x1, y1 being top left, w, h being width and height.
+
+    ``'cxcywh'``: boxes are represented via centre, width and height, cx, cy being center of box, w, h
+    being width and height.
+
+    ``'xywhr'``: boxes are represented via corner, width and height, x1, y1 being top left, w, h being width and height.
+    r is rotation angle w.r.t. the box center by :math:`|r|` degrees counter-clockwise in the image plane
+
+    ``'cxcywhr'``: boxes are represented via centre, width and height, cx, cy being center of box, w, h
+    being width and height.
+    r is rotation angle w.r.t. the box center by :math:`|r|` degrees counter-clockwise in the image plane
+
+    ``'xyxyxyxy'``: boxes are represented via corners, x1, y1 being top left, x2, y2 top right,
+    x3, y3 bottom right, and x4, y4 bottom left.
+
+    Args:
+        boxes (Tensor[N, K]): boxes which will be converted. K is the number of coordinates (4 for unrotated bounding boxes, 5 or 8 for rotated bounding boxes)
+        in_fmt (str): Input format of given boxes. Supported formats are ['xyxy', 'xywh', 'cxcywh', 'xywhr', 'cxcywhr', 'xyxyxyxy'].
+        out_fmt (str): Output format of given boxes. Supported formats are ['xyxy', 'xywh', 'cxcywh', 'xywhr', 'cxcywhr', 'xyxyxyxy']
+
+    Returns:
+        Tensor[N, K]: Boxes into converted format.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(box_convert)
+    allowed_fmts = (
+        "xyxy",
+        "xywh",
+        "cxcywh",
+        "xywhr",
+        "cxcywhr",
+        "xyxyxyxy",
+    )
+    if in_fmt not in allowed_fmts or out_fmt not in allowed_fmts:
+        raise ValueError(f"Unsupported Bounding Box Conversions for given in_fmt {in_fmt} and out_fmt {out_fmt}")
+
+    if in_fmt == out_fmt:
+        return boxes.clone()
+    e = (in_fmt, out_fmt)
+    if e == ("xywh", "xyxy"):
+        boxes = _box_xywh_to_xyxy(boxes)
+    elif e == ("cxcywh", "xyxy"):
+        boxes = _box_cxcywh_to_xyxy(boxes)
+    elif e == ("xyxy", "xywh"):
+        boxes = _box_xyxy_to_xywh(boxes)
+    elif e == ("xyxy", "cxcywh"):
+        boxes = _box_xyxy_to_cxcywh(boxes)
+    elif e == ("xywh", "cxcywh"):
+        boxes = _box_xywh_to_xyxy(boxes)
+        boxes = _box_xyxy_to_cxcywh(boxes)
+    elif e == ("cxcywh", "xywh"):
+        boxes = _box_cxcywh_to_xyxy(boxes)
+        boxes = _box_xyxy_to_xywh(boxes)
+    elif e == ("cxcywhr", "xywhr"):
+        boxes = _box_cxcywhr_to_xywhr(boxes)
+    elif e == ("xywhr", "cxcywhr"):
+        boxes = _box_xywhr_to_cxcywhr(boxes)
+    elif e == ("cxcywhr", "xyxyxyxy"):
+        boxes = _box_cxcywhr_to_xywhr(boxes).to(boxes.dtype)
+        boxes = _box_xywhr_to_xyxyxyxy(boxes)
+    elif e == ("xyxyxyxy", "cxcywhr"):
+        boxes = _box_xyxyxyxy_to_xywhr(boxes).to(boxes.dtype)
+        boxes = _box_xywhr_to_cxcywhr(boxes)
+    elif e == ("xywhr", "xyxyxyxy"):
+        boxes = _box_xywhr_to_xyxyxyxy(boxes)
+    elif e == ("xyxyxyxy", "xywhr"):
+        boxes = _box_xyxyxyxy_to_xywhr(boxes)
+    else:
+        raise NotImplementedError(f"Unsupported Bounding Box Conversions for given in_fmt {e[0]} and out_fmt {e[1]}")
+
+    return boxes
+
+
+def box_area(boxes: Tensor, fmt: str = "xyxy") -> Tensor:
+    """
+    Computes the area of a set of bounding boxes from a given format.
+
+    Args:
+        boxes (Tensor[..., 4]): boxes for which the area will be computed.
+        fmt (str): Format of the input boxes.
+            Default is "xyxy" to preserve backward compatibility.
+            Supported formats are "xyxy", "xywh", and "cxcywh".
+
+    Returns:
+        Tensor[N]: Tensor containing the area for each box.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(box_area)
+    allowed_fmts = (
+        "xyxy",
+        "xywh",
+        "cxcywh",
+    )
+    if fmt not in allowed_fmts:
+        raise ValueError(f"Unsupported Bounding Box area for given format {fmt}")
+    boxes = _upcast(boxes)
+    if fmt == "xyxy":
+        area = (boxes[..., 2] - boxes[..., 0]) * (boxes[..., 3] - boxes[..., 1])
+    else:
+        # For formats with width and height, area = width * height
+        # Supported: cxcywh, xywh
+        area = boxes[..., 2] * boxes[..., 3]
+
+    return area
+
+
+# implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py
+# with slight modifications
+def _box_inter_union(boxes1: Tensor, boxes2: Tensor, fmt: str = "xyxy") -> tuple[Tensor, Tensor]:
+    area1 = box_area(boxes1, fmt=fmt)
+    area2 = box_area(boxes2, fmt=fmt)
+
+    allowed_fmts = (
+        "xyxy",
+        "xywh",
+        "cxcywh",
+    )
+    if fmt not in allowed_fmts:
+        raise ValueError(f"Unsupported Box IoU Calculation for given fmt {fmt}.")
+
+    if fmt == "xyxy":
+        lt = torch.max(boxes1[..., None, :2], boxes2[..., None, :, :2])  # [...,N,M,2]
+        rb = torch.min(boxes1[..., None, 2:], boxes2[..., None, :, 2:])  # [...,N,M,2]
+    elif fmt == "xywh":
+        lt = torch.max(boxes1[..., None, :2], boxes2[..., None, :, :2])  # [...,N,M,2]
+        rb = torch.min(
+            boxes1[..., None, :2] + boxes1[..., None, 2:], boxes2[..., None, :, :2] + boxes2[..., None, :, 2:]
+        )  # [...,N,M,2]
+    else:  # fmt == "cxcywh":
+        lt = torch.max(
+            boxes1[..., None, :2] - boxes1[..., None, 2:] / 2, boxes2[..., None, :, :2] - boxes2[..., None, :, 2:] / 2
+        )  # [N,M,2]
+        rb = torch.min(
+            boxes1[..., None, :2] + boxes1[..., None, 2:] / 2, boxes2[..., None, :, :2] + boxes2[..., None, :, 2:] / 2
+        )  # [N,M,2]
+
+    wh = _upcast(rb - lt).clamp(min=0)  # [N,M,2]
+    inter = wh[..., 0] * wh[..., 1]  # [N,M]
+
+    union = area1[..., None] + area2[..., None, :] - inter
+
+    return inter, union
+
+
+def box_iou(boxes1: Tensor, boxes2: Tensor, fmt: str = "xyxy") -> Tensor:
+    """
+    Return intersection-over-union (Jaccard index) between two sets of boxes from a given format.
+
+    Args:
+        boxes1 (Tensor[..., N, 4]): first set of boxes
+        boxes2 (Tensor[..., M, 4]): second set of boxes
+        fmt (str): Format of the input boxes.
+            Default is "xyxy" to preserve backward compatibility.
+            Supported formats are "xyxy", "xywh", and "cxcywh".
+
+    Returns:
+        Tensor[..., N, M]: the NxM matrix containing the pairwise IoU values for every element
+        in boxes1 and boxes2
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(box_iou)
+    allowed_fmts = (
+        "xyxy",
+        "xywh",
+        "cxcywh",
+    )
+    if fmt not in allowed_fmts:
+        raise ValueError(f"Unsupported Box IoU Calculation for given format {fmt}.")
+    inter, union = _box_inter_union(boxes1, boxes2, fmt=fmt)
+    iou = inter / union
+    return iou
+
+
+# Implementation adapted from https://github.com/facebookresearch/detr/blob/master/util/box_ops.py
+def generalized_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
+    """
+    Compute the pairwise generalized intersection-over-union (GIoU) of two sets of boxes.
+
+    Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
+    ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+
+    Args:
+        boxes1 (Tensor[..., N, 4]): first set of boxes
+        boxes2 (Tensor[..., M, 4]): second set of boxes
+
+    Returns:
+        Tensor[..., N, M]: the NxM matrix containing the pairwise generalized IoU values
+        for every element in boxes1 and boxes2
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(generalized_box_iou)
+
+    intersection, total = _box_inter_union(boxes1, boxes2)
+
+    # Corners of the smallest axis-aligned box enclosing each (boxes1[i], boxes2[j]) pair.
+    top_left = torch.min(boxes1[..., None, :2], boxes2[..., None, :, :2])
+    bottom_right = torch.max(boxes1[..., None, 2:], boxes2[..., None, :, 2:])
+    extent = _upcast(bottom_right - top_left).clamp(min=0)  # [N,M,2]
+    enclosing_area = extent[..., 0] * extent[..., 1]
+
+    # GIoU subtracts the share of the enclosing box not covered by the union.
+    return intersection / total - (enclosing_area - total) / enclosing_area
+
+
+def complete_box_iou(boxes1: Tensor, boxes2: Tensor, eps: float = 1e-7) -> Tensor:
+    """
+    Compute the pairwise complete intersection-over-union (CIoU) of two sets of boxes.
+
+    Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
+    ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+
+    Args:
+        boxes1 (Tensor[..., N, 4]): first set of boxes
+        boxes2 (Tensor[..., M, 4]): second set of boxes
+        eps (float, optional): small number to prevent division by zero. Default: 1e-7
+
+    Returns:
+        Tensor[..., N, M]: the pairwise complete IoU of every box in boxes1 and boxes2
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(complete_box_iou)
+
+    boxes1 = _upcast(boxes1)
+    boxes2 = _upcast(boxes2)
+    diou, iou = _box_diou_iou(boxes1, boxes2, eps)
+    # Widths and heights, indexed so that boxes1 and boxes2 broadcast pairwise.
+    w1 = boxes1[..., None, 2] - boxes1[..., None, 0]
+    h1 = boxes1[..., None, 3] - boxes1[..., None, 1]
+    w2 = boxes2[..., None, :, 2] - boxes2[..., None, :, 0]
+    h2 = boxes2[..., None, :, 3] - boxes2[..., None, :, 1]
+    aspect_gap = torch.atan(w1 / h1) - torch.atan(w2 / h2)
+    v = (4 / (torch.pi**2)) * torch.pow(aspect_gap, 2)
+    with torch.no_grad():  # alpha is used as a constant weight, without gradient
+        alpha = v / (1 - iou + v + eps)
+    return diou - alpha * v
+
+
+def distance_box_iou(boxes1: Tensor, boxes2: Tensor, eps: float = 1e-7) -> Tensor:
+    """
+    Compute the pairwise distance intersection-over-union (DIoU) of two sets of boxes.
+
+    Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
+    ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+
+    Args:
+        boxes1 (Tensor[..., N, 4]): first set of boxes
+        boxes2 (Tensor[..., M, 4]): second set of boxes
+        eps (float, optional): small number to prevent division by zero. Default: 1e-7
+
+    Returns:
+        Tensor[..., N, M]: the NxM matrix containing the pairwise distance IoU values
+        for every element in boxes1 and boxes2
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(distance_box_iou)
+
+    # The helper returns (DIoU, IoU) on the upcast boxes; only the DIoU half
+    # is part of this function's result.
+    pairwise_diou, _ = _box_diou_iou(_upcast(boxes1), _upcast(boxes2), eps=eps)
+    return pairwise_diou
+
+
+def _box_diou_iou(boxes1: Tensor, boxes2: Tensor, eps: float = 1e-7) -> tuple[Tensor, Tensor]:
+    """Return the pairwise ``(distance IoU, IoU)`` of two sets of ``(x1, y1, x2, y2)`` boxes."""
+    iou = box_iou(boxes1, boxes2)
+    # Squared diagonal of the smallest box enclosing each pair; eps is added to it.
+    enclose_lt = torch.min(boxes1[..., None, :2], boxes2[..., None, :, :2])
+    enclose_rb = torch.max(boxes1[..., None, 2:], boxes2[..., None, :, 2:])
+    enclose_wh = _upcast(enclose_rb - enclose_lt).clamp(min=0)  # [N,M,2]
+    diag_sq = (enclose_wh[..., 0] ** 2) + (enclose_wh[..., 1] ** 2) + eps
+    # centers of boxes
+    cx1 = (boxes1[..., 0] + boxes1[..., 2]) / 2
+    cy1 = (boxes1[..., 1] + boxes1[..., 3]) / 2
+    cx2 = (boxes2[..., 0] + boxes2[..., 2]) / 2
+    cy2 = (boxes2[..., 1] + boxes2[..., 3]) / 2
+    # Squared distance between the centers of every (boxes1[i], boxes2[j]) pair.
+    dx = _upcast(cx1[..., None] - cx2[..., None, :])
+    dy = _upcast(cy1[..., None] - cy2[..., None, :])
+    center_sq = (dx**2) + (dy**2)
+    # The distance IoU is the IoU penalized by the normalized squared center distance.
+    return iou - (center_sq / diag_sq), iou
+
+
+def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
+    """
+    Compute the tight bounding box of the non-zero region of each mask.
+
+    The result is a [N, 4] float tensor of boxes in ``(x1, y1, x2, y2)`` format with
+    ``0 <= x1 <= x2`` and ``0 <= y1 <= y2``.
+
+    .. warning::
+
+        In most cases the output will guarantee ``x1 < x2`` and ``y1 < y2``. But
+        if the input is degenerate, e.g. if a mask is a single row or a single
+        column, then the output may have x1 = x2 or y1 = y2.
+
+    Args:
+        masks (Tensor[N, H, W]): masks to transform where N is the number of masks
+            and (H, W) are the spatial dimensions.
+
+    Returns:
+        Tensor[N, 4]: bounding boxes
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(masks_to_boxes)
+
+    # An input without any element yields an empty set of boxes.
+    if masks.numel() == 0:
+        return torch.zeros((0, 4), device=masks.device, dtype=torch.float)
+
+    boxes = torch.zeros((masks.shape[0], 4), device=masks.device, dtype=torch.float)
+
+    for i, mask in enumerate(masks):
+        # torch.where yields (row, column) indices, i.e. (y, x) coordinates.
+        rows, cols = torch.where(mask != 0)
+        boxes[i, 0] = cols.min()
+        boxes[i, 1] = rows.min()
+        boxes[i, 2] = cols.max()
+        boxes[i, 3] = rows.max()
+
+    return boxes
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/ciou_loss.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/ciou_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..d825e79dff0953389897195785b34cbf905f01e5
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/ciou_loss.py
@@ -0,0 +1,77 @@
+import torch
+
+from ..utils import _log_api_usage_once
+from ._utils import _upcast_non_float
+from .diou_loss import _diou_iou_loss
+
+
+def complete_box_iou_loss(
+    boxes1: torch.Tensor,
+    boxes2: torch.Tensor,
+    reduction: str = "none",
+    eps: float = 1e-7,
+) -> torch.Tensor:
+    """
+    Gradient-friendly IoU loss with an additional penalty that is non-zero when the
+    boxes do not overlap. This loss function considers important geometrical
+    factors such as overlap area, normalized central point distance and aspect ratio.
+    This loss is symmetric, so the boxes1 and boxes2 arguments are interchangeable.
+
+    Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
+    ``0 <= x1 < x2`` and ``0 <= y1 < y2``, and the two sets of boxes should have
+    the same dimensions.
+
+    Args:
+        boxes1 : (Tensor[N, 4] or Tensor[4]) first set of boxes
+        boxes2 : (Tensor[N, 4] or Tensor[4]) second set of boxes
+        reduction : (string, optional) Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: No reduction will be
+            applied to the output. ``'mean'``: The output will be averaged.
+            ``'sum'``: The output will be summed. Default: ``'none'``
+        eps : (float): small number to prevent division by zero. Default: 1e-7
+
+    Returns:
+        Tensor: Loss tensor with the reduction option applied.
+
+    Raises:
+        ValueError: if ``reduction`` is not one of the supported modes
+
+    Reference:
+        Zhaohui Zheng et al.: Complete Intersection over Union Loss:
+        https://arxiv.org/abs/1911.08287
+    """
+    # Original Implementation from https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/losses.py
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(complete_box_iou_loss)
+
+    boxes1 = _upcast_non_float(boxes1)
+    boxes2 = _upcast_non_float(boxes2)
+    # Forward eps so the DIoU/IoU terms honour the caller's value, not the helper's default.
+    diou_loss, iou = _diou_iou_loss(boxes1, boxes2, eps)
+
+    x1, y1, x2, y2 = boxes1.unbind(dim=-1)
+    x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1)
+
+    # width and height of boxes
+    w_pred = x2 - x1
+    h_pred = y2 - y1
+    w_gt = x2g - x1g
+    h_gt = y2g - y1g
+    v = (4 / (torch.pi**2)) * torch.pow((torch.atan(w_gt / h_gt) - torch.atan(w_pred / h_pred)), 2)
+    with torch.no_grad():
+        alpha = v / (1 - iou + v + eps)
+
+    loss = diou_loss + alpha * v
+
+    # Check reduction option and return loss accordingly
+    if reduction == "none":
+        pass
+    elif reduction == "mean":
+        loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
+    elif reduction == "sum":
+        loss = loss.sum()
+    else:
+        raise ValueError(
+            f"Invalid Value for arg 'reduction': '{reduction} \n Supported reduction modes: 'none', 'mean', 'sum'"
+        )
+    return loss
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/deform_conv.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/deform_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..da13ee6da9a69770ca321f1dc74a19382e4b7c20
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/deform_conv.py
@@ -0,0 +1,195 @@
+import math
+from typing import Optional
+
+import torch
+from torch import nn, Tensor
+from torch.nn import init
+from torch.nn.modules.utils import _pair
+from torch.nn.parameter import Parameter
+from torchvision.extension import _assert_has_ops
+
+from ..utils import _log_api_usage_once
+
+
+def deform_conv2d(
+    input: Tensor,
+    offset: Tensor,
+    weight: Tensor,
+    bias: Optional[Tensor] = None,
+    stride: tuple[int, int] = (1, 1),
+    padding: tuple[int, int] = (0, 0),
+    dilation: tuple[int, int] = (1, 1),
+    mask: Optional[Tensor] = None,
+) -> Tensor:
+    r"""
+    Apply a deformable convolution to ``input``.
+
+    With a :attr:`mask` this is Deformable Convolution v2 (`Deformable ConvNets v2: More Deformable,
+    Better Results <https://arxiv.org/abs/1811.11168>`__); with :attr:`mask` set to ``None`` it is the
+    original Deformable Convolution (`Deformable Convolutional Networks <https://arxiv.org/abs/1703.06211>`__).
+
+    Args:
+        input (Tensor[batch_size, in_channels, in_height, in_width]): input tensor
+        offset (Tensor[batch_size, 2 * offset_groups * kernel_height * kernel_width, out_height, out_width]):
+            offsets to be applied for each position in the convolution kernel.
+        weight (Tensor[out_channels, in_channels // groups, kernel_height, kernel_width]): convolution weights,
+            split into groups of size (in_channels // groups)
+        bias (Tensor[out_channels]): optional bias of shape (out_channels,). Default: None
+        stride (int or Tuple[int, int]): distance between convolution centers. Default: 1
+        padding (int or Tuple[int, int]): height/width of padding of zeroes around
+            each image. Default: 0
+        dilation (int or Tuple[int, int]): the spacing between kernel elements. Default: 1
+        mask (Tensor[batch_size, offset_groups * kernel_height * kernel_width, out_height, out_width]):
+            masks to be applied for each position in the convolution kernel. Default: None
+
+    Returns:
+        Tensor[batch_sz, out_channels, out_h, out_w]: result of convolution
+
+    Raises:
+        RuntimeError: if ``offset.shape[1]`` is smaller than ``2 * kernel_height * kernel_width``
+
+    Examples::
+        >>> input = torch.rand(4, 3, 10, 10)
+        >>> kh, kw = 3, 3
+        >>> weight = torch.rand(5, 3, kh, kw)
+        >>> # offset and mask share the spatial size of the convolution output: for an input
+        >>> # of 10, stride of 1 and kernel size of 3, without padding, that size is 8
+        >>> offset = torch.rand(4, 2 * kh * kw, 8, 8)
+        >>> mask = torch.rand(4, kh * kw, 8, 8)
+        >>> out = deform_conv2d(input, offset, weight, mask=mask)
+        >>> print(out.shape)
+        torch.Size([4, 5, 8, 8])
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(deform_conv2d)
+    _assert_has_ops()
+    out_channels = weight.shape[0]
+
+    # Tensors are always passed for mask and bias: absent ones are replaced by zero
+    # placeholders, and use_mask records whether a real mask was supplied.
+    use_mask = mask is not None
+    if mask is None:
+        mask = torch.zeros((input.shape[0], 1), device=input.device, dtype=input.dtype)
+    if bias is None:
+        bias = torch.zeros(out_channels, device=input.device, dtype=input.dtype)
+
+    stride_h, stride_w = _pair(stride)
+    pad_h, pad_w = _pair(padding)
+    dil_h, dil_w = _pair(dilation)
+    kernel_h, kernel_w = weight.shape[-2:]
+    _, in_channels, _, _ = input.shape
+
+    offsets_per_group = 2 * kernel_h * kernel_w
+    n_offset_grps = offset.shape[1] // offsets_per_group
+    n_weight_grps = in_channels // weight.shape[1]
+    if n_offset_grps == 0:
+        raise RuntimeError(
+            "the shape of the offset tensor at dimension 1 is not valid. It should "
+            "be a multiple of 2 * weight.size[2] * weight.size[3].\n"
+            f"Got offset.shape[1]={offset.shape[1]}, while 2 * weight.size[2] * weight.size[3]={offsets_per_group}"
+        )
+
+    return torch.ops.torchvision.deform_conv2d(
+        input,
+        weight,
+        offset,
+        mask,
+        bias,
+        stride_h,
+        stride_w,
+        pad_h,
+        pad_w,
+        dil_h,
+        dil_w,
+        n_weight_grps,
+        n_offset_grps,
+        use_mask,
+    )
+
+
+class DeformConv2d(nn.Module):
+    """
+    Module wrapper around :func:`deform_conv2d` that owns the weight and bias.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = True,
+    ):
+        super().__init__()
+        _log_api_usage_once(self)
+
+        if in_channels % groups != 0:
+            raise ValueError("in_channels must be divisible by groups")
+        if out_channels % groups != 0:
+            raise ValueError("out_channels must be divisible by groups")
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = _pair(kernel_size)
+        self.stride = _pair(stride)
+        self.padding = _pair(padding)
+        self.dilation = _pair(dilation)
+        self.groups = groups
+
+        weight_shape = (out_channels, in_channels // groups, self.kernel_size[0], self.kernel_size[1])
+        self.weight = Parameter(torch.empty(weight_shape))
+        if bias:
+            self.bias = Parameter(torch.empty(out_channels))
+        else:
+            self.register_parameter("bias", None)
+
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        """Re-initialize the weight and, when present, the bias."""
+        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+        if self.bias is None:
+            return
+        fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
+        bound = 1 / math.sqrt(fan_in)
+        init.uniform_(self.bias, -bound, bound)
+
+    def forward(self, input: Tensor, offset: Tensor, mask: Optional[Tensor] = None) -> Tensor:
+        """
+        Args:
+            input (Tensor[batch_size, in_channels, in_height, in_width]): input tensor
+            offset (Tensor[batch_size, 2 * offset_groups * kernel_height * kernel_width, out_height, out_width]):
+                offsets to be applied for each position in the convolution kernel.
+            mask (Tensor[batch_size, offset_groups * kernel_height * kernel_width, out_height, out_width]):
+                masks to be applied for each position in the convolution kernel.
+        """
+        return deform_conv2d(
+            input,
+            offset,
+            self.weight,
+            self.bias,
+            stride=self.stride,
+            padding=self.padding,
+            dilation=self.dilation,
+            mask=mask,
+        )
+
+    def __repr__(self) -> str:
+        parts = [
+            f"{self.in_channels}",
+            f"{self.out_channels}",
+            f"kernel_size={self.kernel_size}",
+            f"stride={self.stride}",
+        ]
+        if self.padding != (0, 0):
+            parts.append(f"padding={self.padding}")
+        if self.dilation != (1, 1):
+            parts.append(f"dilation={self.dilation}")
+        if self.groups != 1:
+            parts.append(f"groups={self.groups}")
+        if self.bias is None:
+            parts.append("bias=False")
+        return f"{self.__class__.__name__}({', '.join(parts)})"
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/diou_loss.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/diou_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..9381878ce1d81e853b370d8cc92681cfd3a5b5c6
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/diou_loss.py
@@ -0,0 +1,91 @@
+import torch
+
+from ..utils import _log_api_usage_once
+from ._utils import _loss_inter_union, _upcast_non_float
+
+
+def distance_box_iou_loss(
+    boxes1: torch.Tensor,
+    boxes2: torch.Tensor,
+    reduction: str = "none",
+    eps: float = 1e-7,
+) -> torch.Tensor:
+    """
+    Gradient-friendly IoU loss with an additional penalty that grows with the
+    distance between the centers of the boxes. For two boxes whose centers
+    coincide the penalty vanishes and the loss reduces to the plain IoU loss.
+    This loss is symmetric, so the boxes1 and boxes2 arguments are interchangeable.
+
+    Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
+    ``0 <= x1 < x2`` and ``0 <= y1 < y2``, and the two sets of boxes should have
+    the same dimensions.
+
+    Args:
+        boxes1 (Tensor[N, 4]): first set of boxes
+        boxes2 (Tensor[N, 4]): second set of boxes
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: No reduction will be
+            applied to the output. ``'mean'``: The output will be averaged.
+            ``'sum'``: The output will be summed. Default: ``'none'``
+        eps (float, optional): small number to prevent division by zero. Default: 1e-7
+
+    Returns:
+        Tensor: Loss tensor with the reduction option applied.
+
+    Raises:
+        ValueError: if ``reduction`` is not one of the supported modes
+
+    Reference:
+        Zhaohui Zheng et al.: Distance Intersection over Union Loss:
+        https://arxiv.org/abs/1911.08287
+    """
+    # Original Implementation from https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/losses.py
+
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(distance_box_iou_loss)
+
+    upcast1 = _upcast_non_float(boxes1)
+    upcast2 = _upcast_non_float(boxes2)
+    loss, _ = _diou_iou_loss(upcast1, upcast2, eps)
+
+    # Apply the requested reduction and return the loss accordingly.
+    if reduction == "mean":
+        # An empty loss tensor reduces to a zero that is still computed from ``loss``.
+        return loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
+    if reduction == "sum":
+        return loss.sum()
+    if reduction != "none":
+        raise ValueError(
+            f"Invalid Value for arg 'reduction': '{reduction} \n Supported reduction modes: 'none', 'mean', 'sum'"
+        )
+    return loss
+
+
+def _diou_iou_loss(
+    boxes1: torch.Tensor,
+    boxes2: torch.Tensor,
+    eps: float = 1e-7,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Return ``(DIoU loss, IoU)`` for paired boxes given in ``(x1, y1, x2, y2)`` format.
+
+    ``eps`` is added to the IoU denominator and to the squared enclosing diagonal.
+    """
+    inter, union = _loss_inter_union(boxes1, boxes2)
+    iou = inter / (union + eps)
+
+    ax1, ay1, ax2, ay2 = boxes1.unbind(dim=-1)
+    bx1, by1, bx2, by2 = boxes2.unbind(dim=-1)
+
+    # Squared diagonal of the smallest box enclosing both boxes.
+    enclose_w = torch.max(ax2, bx2) - torch.min(ax1, bx1)
+    enclose_h = torch.max(ay2, by2) - torch.min(ay1, by1)
+    diag_sq = (enclose_w**2) + (enclose_h**2) + eps
+
+    # Squared distance between the two box centers.
+    center_dx = (ax2 + ax1) / 2 - (bx1 + bx2) / 2
+    center_dy = (ay2 + ay1) / 2 - (by1 + by2) / 2
+    center_sq = (center_dx**2) + (center_dy**2)
+
+    # The DIoU loss is the IoU loss plus the center distance normalised by the diagonal.
+    return 1 - iou + (center_sq / diag_sq), iou
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/drop_block.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/drop_block.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb80921e3afaf9d4163e4cbfe857e1218dd02337
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/drop_block.py
@@ -0,0 +1,161 @@
+import torch
+import torch.fx
+import torch.nn.functional as F
+from torch import nn, Tensor
+
+from ..utils import _log_api_usage_once
+
+
+def drop_block2d(
+    input: Tensor, p: float, block_size: int, inplace: bool = False, eps: float = 1e-06, training: bool = True
+) -> Tensor:
+    """
+    Apply DropBlock2d from `"DropBlock: A regularization method for convolutional networks"
+    <https://arxiv.org/abs/1810.12890>`.
+
+    Args:
+        input (Tensor[N, C, H, W]): The input tensor of 4 dimensions, the first one
+                    being its batch i.e. a batch with ``N`` rows.
+        p (float): Probability of an element to be dropped.
+        block_size (int): Size of the block to drop.
+        inplace (bool): If set to ``True``, will do this operation in-place. Default: ``False``.
+        eps (float): A value added to the denominator for numerical stability. Default: 1e-6.
+        training (bool): apply dropblock if is ``True``. Default: ``True``.
+
+    Returns:
+        Tensor[N, C, H, W]: The randomly zeroed tensor after dropblock.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(drop_block2d)
+    if p < 0.0 or p > 1.0:
+        raise ValueError(f"drop probability has to be between 0 and 1, but got {p}.")
+    if input.ndim != 4:
+        raise ValueError(f"input should be 4 dimensional. Got {input.ndim} dimensions.")
+    if not training or p == 0.0:
+        return input
+
+    n, c, h, w = input.size()
+    block_size = min(block_size, w, h)
+    if block_size % 2 == 0:
+        raise ValueError(f"block size should be odd. Got {block_size} which is even.")
+    half = block_size // 2
+    seed_h, seed_w = h - block_size + 1, w - block_size + 1
+    # Rate of the Bernoulli distribution used to sample the block seeds.
+    gamma = (p * h * w) / ((block_size**2) * (seed_h * seed_w))
+    seeds = torch.empty((n, c, seed_h, seed_w), dtype=input.dtype, device=input.device)
+    seeds.bernoulli_(gamma)
+    # Grow every sampled seed into a block_size x block_size square of dropped positions.
+    seeds = F.pad(seeds, [half] * 4, value=0)
+    dropped = F.max_pool2d(seeds, stride=(1, 1), kernel_size=(block_size, block_size), padding=half)
+    keep = 1 - dropped
+    scale = keep.numel() / (eps + keep.sum())
+    if inplace:
+        input.mul_(keep).mul_(scale)
+        return input
+    return input * keep * scale
+
+
+def drop_block3d(
+    input: Tensor, p: float, block_size: int, inplace: bool = False, eps: float = 1e-06, training: bool = True
+) -> Tensor:
+    """
+    Apply DropBlock3d from `"DropBlock: A regularization method for convolutional networks"
+    <https://arxiv.org/abs/1810.12890>`.
+
+    Args:
+        input (Tensor[N, C, D, H, W]): The input tensor of 5 dimensions, the first one
+                    being its batch i.e. a batch with ``N`` rows.
+        p (float): Probability of an element to be dropped.
+        block_size (int): Size of the block to drop.
+        inplace (bool): If set to ``True``, will do this operation in-place. Default: ``False``.
+        eps (float): A value added to the denominator for numerical stability. Default: 1e-6.
+        training (bool): apply dropblock if is ``True``. Default: ``True``.
+
+    Returns:
+        Tensor[N, C, D, H, W]: The randomly zeroed tensor after dropblock.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(drop_block3d)
+    if p < 0.0 or p > 1.0:
+        raise ValueError(f"drop probability has to be between 0 and 1, but got {p}.")
+    if input.ndim != 5:
+        raise ValueError(f"input should be 5 dimensional. Got {input.ndim} dimensions.")
+    if not training or p == 0.0:
+        return input
+
+    n, c, d, h, w = input.size()
+    block_size = min(block_size, d, h, w)
+    if block_size % 2 == 0:
+        raise ValueError(f"block size should be odd. Got {block_size} which is even.")
+    half = block_size // 2
+    seed_d = d - block_size + 1
+    seed_h = h - block_size + 1
+    seed_w = w - block_size + 1
+
+    # Rate of the Bernoulli distribution used to sample the block seeds.
+    gamma = (p * d * h * w) / ((block_size**3) * (seed_d * seed_h * seed_w))
+    seeds = torch.empty((n, c, seed_d, seed_h, seed_w), dtype=input.dtype, device=input.device)
+    seeds.bernoulli_(gamma)
+    # Grow every sampled seed into a cube of dropped positions with side block_size.
+    seeds = F.pad(seeds, [half] * 6, value=0)
+    kernel = (block_size, block_size, block_size)
+    dropped = F.max_pool3d(seeds, stride=(1, 1, 1), kernel_size=kernel, padding=half)
+    keep = 1 - dropped
+    scale = keep.numel() / (eps + keep.sum())
+    if inplace:
+        input.mul_(keep).mul_(scale)
+        return input
+    return input * keep * scale
+
+
+torch.fx.wrap("drop_block2d")  # register as an FX leaf function: tracing records the call instead of tracing into it
+
+
+class DropBlock2d(nn.Module):
+    """
+    Module form of :func:`drop_block2d`; see that function for the meaning of the arguments.
+    """
+
+    def __init__(self, p: float, block_size: int, inplace: bool = False, eps: float = 1e-06) -> None:
+        super().__init__()
+
+        self.p = p
+        self.block_size = block_size
+        self.inplace = inplace
+        self.eps = eps
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Args:
+            input (Tensor): Input feature map on which some areas will be randomly
+                dropped; the module's ``training`` flag is forwarded to the function.
+        Returns:
+            Tensor: The tensor after DropBlock layer.
+        """
+        return drop_block2d(input, self.p, self.block_size, self.inplace, self.eps, self.training)
+
+    def __repr__(self) -> str:
+        fields = f"p={self.p}, block_size={self.block_size}, inplace={self.inplace}"
+        return f"{self.__class__.__name__}({fields})"
+
+
+torch.fx.wrap("drop_block3d")  # register as an FX leaf function: tracing records the call instead of tracing into it
+
+
+class DropBlock3d(DropBlock2d):
+    """
+    Module form of :func:`drop_block3d`, reusing the configuration of :class:`DropBlock2d`.
+    """
+
+    def __init__(self, p: float, block_size: int, inplace: bool = False, eps: float = 1e-06) -> None:
+        super().__init__(p=p, block_size=block_size, inplace=inplace, eps=eps)
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Args:
+            input (Tensor): Input feature map on which some areas will be randomly
+                dropped; the module's ``training`` flag is forwarded to the function.
+        Returns:
+            Tensor: The tensor after DropBlock layer.
+        """
+        return drop_block3d(input, self.p, self.block_size, self.inplace, self.eps, self.training)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/feature_pyramid_network.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/feature_pyramid_network.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c85e19a6996e38b5b1a5a5690708d2c9ff99dff
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/feature_pyramid_network.py
@@ -0,0 +1,250 @@
+from collections import OrderedDict
+from typing import Callable, Optional
+
+import torch.nn.functional as F
+from torch import nn, Tensor
+
+from ..ops.misc import Conv2dNormActivation
+from ..utils import _log_api_usage_once
+
+
+class ExtraFPNBlock(nn.Module):
+    """
+    Base class for the extra block in the FPN.
+
+    The ``forward`` defined here has an empty body and therefore returns ``None``.
+    """
+
+    def forward(
+        self,
+        results: list[Tensor],
+        x: list[Tensor],
+        names: list[str],
+    ) -> tuple[list[Tensor], list[str]]:
+        """
+        Args:
+            results (List[Tensor]): the result of the FPN
+            x (List[Tensor]): the original feature maps
+            names (List[str]): the names for each one of the original feature maps
+
+        Returns:
+            the extended set of results of the FPN (List[Tensor]) and their names (List[str])
+        """
+        pass
+
+
+class FeaturePyramidNetwork(nn.Module):
+    """
+    Module that adds a FPN from on top of a set of feature maps. This is based on
+    `"Feature Pyramid Network for Object Detection" <https://arxiv.org/abs/1612.03144>`_.
+
+    The feature maps are currently supposed to be in increasing depth
+    order.
+
+    The input to the model is expected to be an OrderedDict[Tensor], containing
+    the feature maps on top of which the FPN will be added.
+
+    Args:
+        in_channels_list (list[int]): number of channels for each feature map that
+            is passed to the module
+        out_channels (int): number of channels of the FPN representation
+        extra_blocks (ExtraFPNBlock or None): if provided, extra operations will
+            be performed. It is expected to take the fpn features, the original
+            features and the names of the original features as input, and returns
+            a new list of feature maps and their corresponding names
+        norm_layer (callable, optional): Module specifying the normalization layer to use. Default: None
+
+    Examples::
+
+        >>> m = torchvision.ops.FeaturePyramidNetwork([10, 20, 30], 5)
+        >>> # get some dummy data
+        >>> x = OrderedDict()
+        >>> x['feat0'] = torch.rand(1, 10, 64, 64)
+        >>> x['feat2'] = torch.rand(1, 20, 16, 16)
+        >>> x['feat3'] = torch.rand(1, 30, 8, 8)
+        >>> # compute the FPN on top of x
+        >>> output = m(x)
+        >>> print([(k, v.shape) for k, v in output.items()])
+        >>> # returns
+        >>>   [('feat0', torch.Size([1, 5, 64, 64])),
+        >>>    ('feat2', torch.Size([1, 5, 16, 16])),
+        >>>    ('feat3', torch.Size([1, 5, 8, 8]))]
+
+    """
+
+    _version = 2
+
+    def __init__(
+        self,
+        in_channels_list: list[int],
+        out_channels: int,
+        extra_blocks: Optional[ExtraFPNBlock] = None,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.inner_blocks = nn.ModuleList()
+        self.layer_blocks = nn.ModuleList()
+        for in_channels in in_channels_list:
+            if in_channels == 0:
+                raise ValueError("in_channels=0 is currently not supported")
+            inner_block_module = Conv2dNormActivation(
+                in_channels, out_channels, kernel_size=1, padding=0, norm_layer=norm_layer, activation_layer=None
+            )
+            layer_block_module = Conv2dNormActivation(
+                out_channels, out_channels, kernel_size=3, norm_layer=norm_layer, activation_layer=None
+            )
+            self.inner_blocks.append(inner_block_module)
+            self.layer_blocks.append(layer_block_module)
+
+        # initialize parameters now to avoid modifying the initialization of top_blocks
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_uniform_(m.weight, a=1)
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+
+        if extra_blocks is not None:
+            if not isinstance(extra_blocks, ExtraFPNBlock):
+                raise TypeError(f"extra_blocks should be of type ExtraFPNBlock not {type(extra_blocks)}")
+        self.extra_blocks = extra_blocks
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        version = local_metadata.get("version", None)
+
+        if version is None or version < 2:
+            num_blocks = len(self.inner_blocks)
+            for block in ["inner_blocks", "layer_blocks"]:
+                for i in range(num_blocks):
+                    for type in ["weight", "bias"]:
+                        old_key = f"{prefix}{block}.{i}.{type}"
+                        new_key = f"{prefix}{block}.{i}.0.{type}"
+                        if old_key in state_dict:
+                            state_dict[new_key] = state_dict.pop(old_key)
+
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            strict,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    def get_result_from_inner_blocks(self, x: Tensor, idx: int) -> Tensor:
+        """
+        Apply the ``idx``-th module of ``self.inner_blocks`` to ``x``.
+
+        This is equivalent to self.inner_blocks[idx](x), written as a loop
+        because torchscript doesn't support that indexing yet. A negative
+        ``idx`` counts from the end; if no position matches, ``x`` is
+        returned unchanged.
+        """
+        block_count = len(self.inner_blocks)
+        target = idx + block_count if idx < 0 else idx
+        result = x
+        for position, block in enumerate(self.inner_blocks):
+            if position == target:
+                result = block(x)
+        return result
+
+    def get_result_from_layer_blocks(self, x: Tensor, idx: int) -> Tensor:
+        """
+        Apply the ``idx``-th module of ``self.layer_blocks`` to ``x``.
+
+        This is equivalent to self.layer_blocks[idx](x), written as a loop
+        because torchscript doesn't support that indexing yet. A negative
+        ``idx`` counts from the end; if no position matches, ``x`` is
+        returned unchanged.
+        """
+        block_count = len(self.layer_blocks)
+        target = idx + block_count if idx < 0 else idx
+        result = x
+        for position, block in enumerate(self.layer_blocks):
+            if position == target:
+                result = block(x)
+        return result
+
+    def forward(self, x: dict[str, Tensor]) -> dict[str, Tensor]:
+        """
+        Computes the FPN for a set of feature maps.
+
+        The last input is processed first; each earlier input is then merged with
+        an upsampled copy of the previously merged map (top-down pathway).
+
+        Args:
+            x (OrderedDict[Tensor]): feature maps for each feature level.
+
+        Returns:
+            results (OrderedDict[Tensor]): feature maps after FPN layers.
+                They are ordered from the highest resolution first.
+        """
+        # unpack OrderedDict into two lists for easier handling
+        names = list(x.keys())
+        x = list(x.values())
+
+        # seed the top-down pathway with the last input: inner block, then layer block
+        last_inner = self.get_result_from_inner_blocks(x[-1], -1)
+        results = []
+        results.append(self.get_result_from_layer_blocks(last_inner, -1))
+
+        # walk the remaining inputs from second-to-last down to the first
+        for idx in range(len(x) - 2, -1, -1):
+            inner_lateral = self.get_result_from_inner_blocks(x[idx], idx)
+            # resize the running top-down map to the lateral map's last two dims
+            feat_shape = inner_lateral.shape[-2:]
+            inner_top_down = F.interpolate(last_inner, size=feat_shape, mode="nearest")
+            last_inner = inner_lateral + inner_top_down
+            # prepend so that results stay in the same order as the inputs
+            results.insert(0, self.get_result_from_layer_blocks(last_inner, idx))
+
+        # extra blocks receive the FPN outputs, the original inputs and the names,
+        # and return the (possibly extended) outputs and names
+        if self.extra_blocks is not None:
+            results, names = self.extra_blocks(results, x, names)
+
+        # make it back an OrderedDict
+        out = OrderedDict([(k, v) for k, v in zip(names, results)])
+
+        return out
+
+
+class LastLevelMaxPool(ExtraFPNBlock):
+    """
+    Extra FPN block that appends one more level, obtained by subsampling the last
+    feature map (a max_pool2d with kernel size 1 and stride 2, i.e. not an actual pooling).
+    """
+
+    def forward(
+        self,
+        x: list[Tensor],
+        y: list[Tensor],
+        names: list[str],
+    ) -> tuple[list[Tensor], list[str]]:
+        """
+        Append the subsampled last map to ``x`` and the name ``"pool"`` to ``names``.
+
+        Both lists are extended in place and returned; ``y`` is not used.
+        """
+        # Use max pooling to simulate stride 2 subsampling
+        subsampled = F.max_pool2d(x[-1], kernel_size=1, stride=2, padding=0)
+        names.append("pool")
+        x.append(subsampled)
+        return x, names
+
+
+class LastLevelP6P7(ExtraFPNBlock):
+    """
+    Extra FPN block used in RetinaNet to generate two additional levels, P6 and P7.
+
+    Args:
+        in_channels (int): number of channels of the tensor fed to the P6 convolution
+        out_channels (int): number of channels produced by both convolutions
+    """
+
+    def __init__(self, in_channels: int, out_channels: int):
+        super().__init__()
+        # both convolutions use a 3x3 kernel with stride 2 and padding 1
+        self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1)
+        self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1)
+        for conv in (self.p6, self.p7):
+            nn.init.kaiming_uniform_(conv.weight, a=1)
+            nn.init.constant_(conv.bias, 0)
+        # the last FPN output is used as the P6 input only when channel counts match
+        self.use_P5 = in_channels == out_channels
+
+    def forward(
+        self,
+        p: list[Tensor],
+        c: list[Tensor],
+        names: list[str],
+    ) -> tuple[list[Tensor], list[str]]:
+        """
+        Append P6 and P7 to ``p`` and the names ``"p6"``/``"p7"`` to ``names``.
+
+        P6 is computed from the last entry of ``p`` when ``use_P5`` is set,
+        otherwise from the last entry of ``c``. P7 is computed from ``relu(P6)``.
+        Both lists are extended in place and returned.
+        """
+        last_p = p[-1]
+        last_c = c[-1]
+        source = last_p if self.use_P5 else last_c
+        p6 = self.p6(source)
+        p7 = self.p7(F.relu(p6))
+        p.extend([p6, p7])
+        names.extend(["p6", "p7"])
+        return p, names
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/focal_loss.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/focal_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..5cd781eaab54b3eec755c341f2678a58068e84eb
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/focal_loss.py
@@ -0,0 +1,61 @@
+import torch
+import torch.nn.functional as F
+
+from ..utils import _log_api_usage_once
+
+
+def sigmoid_focal_loss(
+    inputs: torch.Tensor,
+    targets: torch.Tensor,
+    alpha: float = 0.25,
+    gamma: float = 2,
+    reduction: str = "none",
+) -> torch.Tensor:
+    """
+    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
+
+    Args:
+        inputs (Tensor): A float tensor of arbitrary shape.
+                The predictions for each example.
+        targets (Tensor): A float tensor with the same shape as inputs. Stores the binary
+                classification label for each element in inputs
+                (0 for the negative class and 1 for the positive class).
+        alpha (float): Weighting factor in range [0, 1] to balance
+                positive vs negative examples or -1 for ignore. Default: ``0.25``.
+        gamma (float): Exponent of the modulating factor (1 - p_t) to
+                balance easy vs hard examples. Default: ``2``.
+        reduction (string): ``'none'`` | ``'mean'`` | ``'sum'``
+                ``'none'``: No reduction will be applied to the output.
+                ``'mean'``: The output will be averaged.
+                ``'sum'``: The output will be summed. Default: ``'none'``.
+    Returns:
+        Loss tensor with the reduction option applied.
+
+    Raises:
+        ValueError: If ``alpha`` is neither in ``[0, 1]`` nor equal to ``-1``, or if
+            ``reduction`` is not one of the supported modes.
+    """
+    # Original implementation from https://github.com/facebookresearch/fvcore/blob/master/fvcore/nn/focal_loss.py
+
+    if not (0 <= alpha <= 1) and alpha != -1:
+        raise ValueError(f"Invalid alpha value: {alpha}. alpha must be in the range [0,1] or -1 for ignore.")
+
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(sigmoid_focal_loss)
+    p = torch.sigmoid(inputs)
+    ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
+    # p_t is the predicted probability of the labelled class for each element
+    p_t = p * targets + (1 - p) * (1 - targets)
+    loss = ce_loss * ((1 - p_t) ** gamma)
+
+    # alpha == -1 skips the class-balancing weight entirely
+    if alpha >= 0:
+        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
+        loss = alpha_t * loss
+
+    # Check reduction option and return loss accordingly
+    if reduction == "none":
+        pass
+    elif reduction == "mean":
+        loss = loss.mean()
+    elif reduction == "sum":
+        loss = loss.sum()
+    else:
+        raise ValueError(
+            f"Invalid Value for arg 'reduction': '{reduction}' \n Supported reduction modes: 'none', 'mean', 'sum'"
+        )
+    return loss
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/giou_loss.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/giou_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..e56dcc16c7d84fe6ba0f59c0b60e30a84110fbb0
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/giou_loss.py
@@ -0,0 +1,75 @@
+import torch
+
+from ..utils import _log_api_usage_once
+from ._utils import _loss_inter_union, _upcast_non_float
+
+
+def generalized_box_iou_loss(
+    boxes1: torch.Tensor,
+    boxes2: torch.Tensor,
+    reduction: str = "none",
+    eps: float = 1e-7,
+) -> torch.Tensor:
+    """
+    Gradient-friendly IoU loss with an additional penalty that is non-zero when the
+    boxes do not overlap and scales with the size of their smallest enclosing box.
+    This loss is symmetric, so the boxes1 and boxes2 arguments are interchangeable.
+
+    Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
+    ``0 <= x1 < x2`` and ``0 <= y1 < y2``, and the two boxes should have the
+    same dimensions.
+
+    Args:
+        boxes1 (Tensor[N, 4] or Tensor[4]): first set of boxes
+        boxes2 (Tensor[N, 4] or Tensor[4]): second set of boxes
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: No reduction will be
+            applied to the output. ``'mean'``: The output will be averaged.
+            ``'sum'``: The output will be summed. Default: ``'none'``
+        eps (float): small number to prevent division by zero. Default: 1e-7
+
+    Returns:
+        Tensor: Loss tensor with the reduction option applied.
+
+    Raises:
+        ValueError: If ``reduction`` is not one of the supported modes.
+
+    Reference:
+        Hamid Rezatofighi et al.: Generalized Intersection over Union:
+        A Metric and A Loss for Bounding Box Regression:
+        https://arxiv.org/abs/1902.09630
+    """
+
+    # Original implementation from https://github.com/facebookresearch/fvcore/blob/bfff2ef/fvcore/nn/giou_loss.py
+
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(generalized_box_iou_loss)
+
+    boxes1 = _upcast_non_float(boxes1)
+    boxes2 = _upcast_non_float(boxes2)
+    intsctk, unionk = _loss_inter_union(boxes1, boxes2)
+    iouk = intsctk / (unionk + eps)
+
+    x1, y1, x2, y2 = boxes1.unbind(dim=-1)
+    x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1)
+
+    # smallest enclosing box
+    xc1 = torch.min(x1, x1g)
+    yc1 = torch.min(y1, y1g)
+    xc2 = torch.max(x2, x2g)
+    yc2 = torch.max(y2, y2g)
+
+    area_c = (xc2 - xc1) * (yc2 - yc1)
+    # penalise by the share of the enclosing box not covered by the union
+    miouk = iouk - ((area_c - unionk) / (area_c + eps))
+
+    loss = 1 - miouk
+
+    # Check reduction option and return loss accordingly
+    if reduction == "none":
+        pass
+    elif reduction == "mean":
+        # an empty loss tensor yields 0 (via 0.0 * sum) rather than the mean of nothing
+        loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
+    elif reduction == "sum":
+        loss = loss.sum()
+    else:
+        raise ValueError(
+            f"Invalid Value for arg 'reduction': '{reduction}' \n Supported reduction modes: 'none', 'mean', 'sum'"
+        )
+    return loss
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/misc.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfa1e23a5ee62a81784949157fb485b7529a37e8
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/misc.py
@@ -0,0 +1,321 @@
+import warnings
+from collections.abc import Sequence
+from typing import Callable, Optional, Union
+
+import torch
+from torch import Tensor
+
+from ..utils import _log_api_usage_once, _make_ntuple
+
+
+interpolate = torch.nn.functional.interpolate
+
+
+class FrozenBatchNorm2d(torch.nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed
+    (all four are stored as buffers rather than parameters).
+
+    Args:
+        num_features (int): Number of features ``C`` from an expected input of size ``(N, C, H, W)``
+        eps (float): a value added to the denominator for numerical stability. Default: 1e-5
+    """
+
+    def __init__(
+        self,
+        num_features: int,
+        eps: float = 1e-5,
+    ):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.eps = eps
+        # identity affine transform and unit statistics by default
+        buffer_inits = (
+            ("weight", torch.ones),
+            ("bias", torch.zeros),
+            ("running_mean", torch.zeros),
+            ("running_var", torch.ones),
+        )
+        for buffer_name, make_tensor in buffer_inits:
+            self.register_buffer(buffer_name, make_tensor(num_features))
+
+    def _load_from_state_dict(
+        self,
+        state_dict: dict,
+        prefix: str,
+        local_metadata: dict,
+        strict: bool,
+        missing_keys: list[str],
+        unexpected_keys: list[str],
+        error_msgs: list[str],
+    ):
+        """Drop the ``num_batches_tracked`` entry (if any) before delegating to the parent loader."""
+        state_dict.pop(prefix + "num_batches_tracked", None)
+
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+        )
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Return ``x * scale + bias`` using the stored statistics and affine buffers."""
+        # move reshapes to the beginning
+        # to make it fuser-friendly
+        weight = self.weight.reshape(1, -1, 1, 1)
+        offset = self.bias.reshape(1, -1, 1, 1)
+        var = self.running_var.reshape(1, -1, 1, 1)
+        mean = self.running_mean.reshape(1, -1, 1, 1)
+        scale = weight * (var + self.eps).rsqrt()
+        shift = offset - mean * scale
+        return x * scale + shift
+
+    def __repr__(self) -> str:
+        cls_name = self.__class__.__name__
+        return f"{cls_name}({self.weight.shape[0]}, eps={self.eps})"
+
+
+class ConvNormActivation(torch.nn.Sequential):
+    """
+    Sequential block made of a convolution, an optional norm layer and an optional activation.
+
+    Instantiating this class itself (rather than a subclass) emits a warning asking
+    to use ``Conv2dNormActivation`` or ``Conv3dNormActivation`` instead.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: Union[int, tuple[int, ...]] = 3,
+        stride: Union[int, tuple[int, ...]] = 1,
+        padding: Optional[Union[int, tuple[int, ...], str]] = None,
+        groups: int = 1,
+        norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm2d,
+        activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
+        dilation: Union[int, tuple[int, ...]] = 1,
+        inplace: Optional[bool] = True,
+        bias: Optional[bool] = None,
+        conv_layer: Callable[..., torch.nn.Module] = torch.nn.Conv2d,
+    ) -> None:
+
+        if padding is None:
+            both_scalar = isinstance(kernel_size, int) and isinstance(dilation, int)
+            if both_scalar:
+                padding = (kernel_size - 1) // 2 * dilation
+            else:
+                # the number of dims comes from whichever argument is a sequence
+                if isinstance(kernel_size, Sequence):
+                    num_dims = len(kernel_size)
+                else:
+                    num_dims = len(dilation)
+                kernel_size = _make_ntuple(kernel_size, num_dims)
+                dilation = _make_ntuple(dilation, num_dims)
+                padding = tuple((kernel_size[axis] - 1) // 2 * dilation[axis] for axis in range(num_dims))
+        if bias is None:
+            # by default the convolution has a bias only when no norm layer follows it
+            bias = norm_layer is None
+
+        conv = conv_layer(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+        modules = [conv]
+
+        if norm_layer is not None:
+            modules.append(norm_layer(out_channels))
+
+        if activation_layer is not None:
+            # ``inplace=None`` means: rely on the activation's own default
+            activation_kwargs = {"inplace": inplace} if inplace is not None else {}
+            modules.append(activation_layer(**activation_kwargs))
+        super().__init__(*modules)
+        _log_api_usage_once(self)
+        self.out_channels = out_channels
+
+        if self.__class__ == ConvNormActivation:
+            warnings.warn(
+                "Don't use ConvNormActivation directly, please use Conv2dNormActivation and Conv3dNormActivation instead."
+            )
+
+
+class Conv2dNormActivation(ConvNormActivation):
+    """
+    Configurable Convolution2d-Normalization-Activation block.
+
+    Args:
+        in_channels (int): Number of channels in the input image
+        out_channels (int): Number of channels produced by the Convolution-Normalization-Activation block
+        kernel_size (int, optional): Size of the convolving kernel. Default: 3
+        stride (int, optional): Stride of the convolution. Default: 1
+        padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None, in which case it will be calculated as ``padding = (kernel_size - 1) // 2 * dilation``
+        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
+        norm_layer (Callable[..., torch.nn.Module], optional): Norm layer stacked on top of the convolution layer. If ``None`` this layer won't be used. Default: ``torch.nn.BatchNorm2d``
+        activation_layer (Callable[..., torch.nn.Module], optional): Activation function stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer won't be used. Default: ``torch.nn.ReLU``
+        dilation (int): Spacing between kernel elements. Default: 1
+        inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True``
+        bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``.
+
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: Union[int, tuple[int, int]] = 3,
+        stride: Union[int, tuple[int, int]] = 1,
+        padding: Optional[Union[int, tuple[int, int], str]] = None,
+        groups: int = 1,
+        norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm2d,
+        activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
+        dilation: Union[int, tuple[int, int]] = 1,
+        inplace: Optional[bool] = True,
+        bias: Optional[bool] = None,
+    ) -> None:
+
+        # forward everything unchanged, fixing the convolution type to Conv2d
+        super().__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            norm_layer=norm_layer,
+            activation_layer=activation_layer,
+            dilation=dilation,
+            inplace=inplace,
+            bias=bias,
+            conv_layer=torch.nn.Conv2d,
+        )
+
+
+class Conv3dNormActivation(ConvNormActivation):
+    """
+    Configurable Convolution3d-Normalization-Activation block.
+
+    Args:
+        in_channels (int): Number of channels in the input video.
+        out_channels (int): Number of channels produced by the Convolution-Normalization-Activation block
+        kernel_size (int, optional): Size of the convolving kernel. Default: 3
+        stride (int, optional): Stride of the convolution. Default: 1
+        padding (int, tuple or str, optional): Padding added to the input. Default: None, in which case it will be calculated as ``padding = (kernel_size - 1) // 2 * dilation``
+        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
+        norm_layer (Callable[..., torch.nn.Module], optional): Norm layer stacked on top of the convolution layer. If ``None`` this layer won't be used. Default: ``torch.nn.BatchNorm3d``
+        activation_layer (Callable[..., torch.nn.Module], optional): Activation function stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer won't be used. Default: ``torch.nn.ReLU``
+        dilation (int): Spacing between kernel elements. Default: 1
+        inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True``
+        bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: Union[int, tuple[int, int, int]] = 3,
+        stride: Union[int, tuple[int, int, int]] = 1,
+        padding: Optional[Union[int, tuple[int, int, int], str]] = None,
+        groups: int = 1,
+        norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm3d,
+        activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
+        dilation: Union[int, tuple[int, int, int]] = 1,
+        inplace: Optional[bool] = True,
+        bias: Optional[bool] = None,
+    ) -> None:
+
+        # forward everything unchanged, fixing the convolution type to Conv3d
+        super().__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            norm_layer=norm_layer,
+            activation_layer=activation_layer,
+            dilation=dilation,
+            inplace=inplace,
+            bias=bias,
+            conv_layer=torch.nn.Conv3d,
+        )
+
+
+class SqueezeExcitation(torch.nn.Module):
+    """
+    Squeeze-and-Excitation block from https://arxiv.org/abs/1709.01507 (see Fig. 1).
+    Parameters ``activation``, and ``scale_activation`` correspond to ``delta`` and ``sigma`` in eq. 3.
+
+    Args:
+        input_channels (int): Number of channels in the input image
+        squeeze_channels (int): Number of squeeze channels
+        activation (Callable[..., torch.nn.Module], optional): ``delta`` activation. Default: ``torch.nn.ReLU``
+        scale_activation (Callable[..., torch.nn.Module]): ``sigma`` activation. Default: ``torch.nn.Sigmoid``
+    """
+
+    def __init__(
+        self,
+        input_channels: int,
+        squeeze_channels: int,
+        activation: Callable[..., torch.nn.Module] = torch.nn.ReLU,
+        scale_activation: Callable[..., torch.nn.Module] = torch.nn.Sigmoid,
+    ) -> None:
+        super().__init__()
+        _log_api_usage_once(self)
+        # global average pooling down to a single spatial position
+        self.avgpool = torch.nn.AdaptiveAvgPool2d(1)
+        # 1x1 convolutions: squeeze to ``squeeze_channels`` and expand back
+        self.fc1 = torch.nn.Conv2d(input_channels, squeeze_channels, kernel_size=1)
+        self.fc2 = torch.nn.Conv2d(squeeze_channels, input_channels, kernel_size=1)
+        self.activation = activation()
+        self.scale_activation = scale_activation()
+
+    def _scale(self, input: Tensor) -> Tensor:
+        """Compute the scaling tensor: pool -> fc1 -> activation -> fc2 -> scale_activation."""
+        pooled = self.avgpool(input)
+        squeezed = self.activation(self.fc1(pooled))
+        expanded = self.fc2(squeezed)
+        return self.scale_activation(expanded)
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Return ``input`` multiplied by its computed scale."""
+        return self._scale(input) * input
+
+
+class MLP(torch.nn.Sequential):
+    """This block implements the multi-layer perceptron (MLP) module.
+
+    Every entry of ``hidden_channels`` except the last produces
+    ``Linear -> [norm] -> [activation] -> Dropout``; the last entry produces
+    ``Linear -> Dropout`` only.
+
+    Args:
+        in_channels (int): Number of channels of the input
+        hidden_channels (List[int]): List of the hidden channel dimensions
+        norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the linear layer. If ``None`` this layer won't be used. Default: ``None``
+        activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the linear layer. If ``None`` this layer won't be used. Default: ``torch.nn.ReLU``
+        inplace (bool, optional): Parameter for the activation layer, which can optionally do the operation in-place.
+            Default is ``None``, which uses the respective default values of the ``activation_layer`` and Dropout layer.
+        bias (bool): Whether to use bias in the linear layer. Default ``True``
+        dropout (float): The probability for the dropout layer. Default: 0.0
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        hidden_channels: list[int],
+        norm_layer: Optional[Callable[..., torch.nn.Module]] = None,
+        activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
+        inplace: Optional[bool] = None,
+        bias: bool = True,
+        dropout: float = 0.0,
+    ):
+        # The addition of `norm_layer` is inspired from the implementation of TorchMultimodal:
+        # https://github.com/facebookresearch/multimodal/blob/5dec8a/torchmultimodal/modules/layers/mlp.py
+        # ``inplace=None`` means: rely on each layer's own default
+        params = {} if inplace is None else {"inplace": inplace}
+
+        layers = []
+        in_dim = in_channels
+        for hidden_dim in hidden_channels[:-1]:
+            layers.append(torch.nn.Linear(in_dim, hidden_dim, bias=bias))
+            if norm_layer is not None:
+                layers.append(norm_layer(hidden_dim))
+            # honour the documented contract: ``None`` means no activation layer
+            # (calling ``None`` here used to raise a TypeError)
+            if activation_layer is not None:
+                layers.append(activation_layer(**params))
+            layers.append(torch.nn.Dropout(dropout, **params))
+            in_dim = hidden_dim
+
+        # final projection: no norm and no activation, only dropout
+        layers.append(torch.nn.Linear(in_dim, hidden_channels[-1], bias=bias))
+        layers.append(torch.nn.Dropout(dropout, **params))
+
+        super().__init__(*layers)
+        _log_api_usage_once(self)
+
+
+class Permute(torch.nn.Module):
+    """Module wrapper around ``torch.permute``: returns a view of the input tensor
+    with its dimensions reordered.
+
+    Args:
+        dims (List[int]): The desired ordering of dimensions
+    """
+
+    def __init__(self, dims: list[int]):
+        super().__init__()
+        # stored as given and passed to ``torch.permute`` on every call
+        self.dims = dims
+
+    def forward(self, x: Tensor) -> Tensor:
+        permuted = torch.permute(x, self.dims)
+        return permuted
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/poolers.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/poolers.py
new file mode 100644
index 0000000000000000000000000000000000000000..f887f6aee332e8785f2a6596fe0c11f66264cb88
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/poolers.py
@@ -0,0 +1,327 @@
+from typing import Optional, Union
+
+import torch
+import torch.fx
+import torchvision
+from torch import nn, Tensor
+from torchvision.ops.boxes import box_area
+
+from ..utils import _log_api_usage_once
+from .roi_align import roi_align
+
+
+# copying result_idx_in_level to a specific index in result[]
+# is not supported by ONNX tracing yet.
+# _onnx_merge_levels() is an implementation supported by ONNX
+# that merges the levels to the right indices
+@torch.jit.unused
+def _onnx_merge_levels(levels: Tensor, unmerged_results: list[Tensor]) -> Tensor:
+    """
+    Merge per-level results into one tensor, placing each row at its RoI's position.
+
+    Args:
+        levels (Tensor): for each RoI, the index of the level it was assigned to.
+        unmerged_results (list[Tensor]): one 4-D tensor per level; entry ``level`` holds
+            the results of the RoIs for which ``levels == level``, in that order.
+
+    Returns:
+        Tensor: tensor with ``levels.size(0)`` rows and the trailing three sizes,
+        dtype and device of ``unmerged_results[0]``.
+    """
+    first_result = unmerged_results[0]
+    dtype, device = first_result.dtype, first_result.device
+    # zero-initialised output; every matched row is overwritten by the scatters below
+    res = torch.zeros(
+        (levels.size(0), first_result.size(1), first_result.size(2), first_result.size(3)), dtype=dtype, device=device
+    )
+    for level in range(len(unmerged_results)):
+        # row positions (along dim 0 of ``res``) of the RoIs assigned to this level
+        index = torch.where(levels == level)[0].view(-1, 1, 1, 1)
+        # scatter needs an index with the same shape as the source, so broadcast it
+        index = index.expand(
+            index.size(0),
+            unmerged_results[level].size(1),
+            unmerged_results[level].size(2),
+            unmerged_results[level].size(3),
+        )
+        res = res.scatter(0, index, unmerged_results[level])
+    return res
+
+
+# TODO: (eellison) T54974082 https://github.com/pytorch/pytorch/issues/26744
+def initLevelMapper(
+    k_min: int,
+    k_max: int,
+    canonical_scale: int = 224,
+    canonical_level: int = 4,
+    eps: float = 1e-6,
+):
+    """Build and return a ``LevelMapper`` from the given arguments, passed through unchanged."""
+    mapper = LevelMapper(
+        k_min,
+        k_max,
+        canonical_scale,
+        canonical_level,
+        eps,
+    )
+    return mapper
+
+
+class LevelMapper:
+    """Determine which FPN level each RoI in a set of RoIs should map to based
+    on the heuristic in the FPN paper.
+
+    Args:
+        k_min (int): lower clamp bound for the computed level
+        k_max (int): upper clamp bound for the computed level
+        canonical_scale (int): stored as ``s0``, the divisor of the box scale
+        canonical_level (int): stored as ``lvl0``, the additive base level
+        eps (float): small offset added before flooring
+    """
+
+    def __init__(
+        self,
+        k_min: int,
+        k_max: int,
+        canonical_scale: int = 224,
+        canonical_level: int = 4,
+        eps: float = 1e-6,
+    ):
+        self.k_min, self.k_max = k_min, k_max
+        self.s0, self.lvl0 = canonical_scale, canonical_level
+        self.eps = eps
+
+    def __call__(self, boxlists: list[Tensor]) -> Tensor:
+        """
+        Args:
+            boxlists (list[BoxList])
+
+        Returns:
+            Tensor: int64 level index for every box of every list (concatenated),
+            expressed relative to ``k_min``.
+        """
+        # Compute level ids
+        areas = torch.cat([box_area(boxes) for boxes in boxlists])
+        side = torch.sqrt(areas)
+
+        # Eqn.(1) in FPN paper
+        raw_levels = self.lvl0 + torch.log2(side / self.s0) + torch.tensor(self.eps, dtype=side.dtype)
+        clamped = torch.clamp(torch.floor(raw_levels), min=self.k_min, max=self.k_max)
+        return (clamped.to(torch.int64) - self.k_min).to(torch.int64)
+
+
+def _convert_to_roi_format(boxes: list[Tensor]) -> Tensor:
+    """
+    Concatenate per-list boxes into one tensor whose first column is the index
+    (position in ``boxes``) of the list each row came from.
+    """
+    stacked = torch.cat(boxes, dim=0)
+    dtype = stacked.dtype
+    device = stacked.device
+
+    # one index column per list, filled with that list's position
+    index_columns = []
+    for list_idx, list_boxes in enumerate(boxes):
+        column = torch.full_like(list_boxes[:, :1], list_idx, dtype=dtype, layout=torch.strided, device=device)
+        index_columns.append(column)
+    ids = torch.cat(index_columns, dim=0)
+
+    return torch.cat([ids, stacked], dim=1)
+
+
+def _infer_scale(feature: Tensor, original_size: list[int]) -> float:
+    """
+    Infer the scale of ``feature`` relative to ``original_size``.
+
+    For each of the last two dims of ``feature`` paired with ``original_size``, the
+    size ratio is rounded to the nearest power of two; the value for the first
+    pair is returned.
+    """
+    # assumption: the scale is of the form 2 ** (-k), with k integer
+    feature_size = feature.shape[-2:]
+    candidates: list[float] = [
+        2 ** float(torch.tensor(float(feat_dim) / float(orig_dim)).log2().round())
+        for feat_dim, orig_dim in zip(feature_size, original_size)
+    ]
+    return candidates[0]
+
+
+@torch.fx.wrap
+def _setup_scales(
+    features: list[Tensor], image_shapes: list[tuple[int, int]], canonical_scale: int, canonical_level: int
+) -> tuple[list[float], LevelMapper]:
+    """
+    Infer one scale per feature map and build the matching ``LevelMapper``.
+
+    Scales are inferred against the element-wise maximum of ``image_shapes``.
+    The mapper's level bounds come from the first and last inferred scales.
+
+    Raises:
+        ValueError: if ``image_shapes`` is empty.
+    """
+    if not image_shapes:
+        raise ValueError("images list should not be empty")
+
+    # element-wise maximum over the first two entries of every shape
+    largest_dim0 = 0
+    largest_dim1 = 0
+    for image_shape in image_shapes:
+        largest_dim0 = max(image_shape[0], largest_dim0)
+        largest_dim1 = max(image_shape[1], largest_dim1)
+    max_shape = (largest_dim0, largest_dim1)
+
+    scales = [_infer_scale(feature, max_shape) for feature in features]
+
+    # get the levels in the feature map by leveraging the fact that the network always
+    # downsamples by a factor of 2 at each level.
+    first_scale = torch.tensor(scales[0], dtype=torch.float32)
+    last_scale = torch.tensor(scales[-1], dtype=torch.float32)
+    k_min = int(-torch.log2(first_scale).item())
+    k_max = int(-torch.log2(last_scale).item())
+
+    mapper = initLevelMapper(
+        k_min,
+        k_max,
+        canonical_scale=canonical_scale,
+        canonical_level=canonical_level,
+    )
+    return scales, mapper
+
+
+@torch.fx.wrap
+def _filter_input(x: dict[str, Tensor], featmap_names: list[str]) -> list[Tensor]:
+    """Return the values of ``x`` whose key is in ``featmap_names``, in ``x``'s iteration order."""
+    return [feature for name, feature in x.items() if name in featmap_names]
+
+
+@torch.fx.wrap
+def _multiscale_roi_align(
+    x_filtered: list[Tensor],
+    boxes: list[Tensor],
+    output_size: list[int],
+    sampling_ratio: int,
+    scales: Optional[list[float]],
+    mapper: Optional[LevelMapper],
+) -> Tensor:
+    """
+    Pool every box from the feature level that ``mapper`` assigns it to.
+
+    Args:
+        x_filtered (List[Tensor]): List of input tensors.
+        boxes (List[Tensor[N, 4]]): boxes to be used to perform the pooling operation, in
+            (x1, y1, x2, y2) format and in the image reference size, not the feature map
+            reference. The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+        output_size (Union[List[Tuple[int, int]], List[int]]): size of the output
+        sampling_ratio (int): sampling ratio for ROIAlign
+        scales (Optional[List[float]]): spatial scale of each entry of ``x_filtered``.
+            Must not be None here; a ValueError is raised otherwise.
+        mapper (Optional[LevelMapper]): maps each box to a level index.
+            Must not be None here; a ValueError is raised otherwise.
+    Returns:
+        result (Tensor)
+
+    Raises:
+        ValueError: if ``scales`` or ``mapper`` is None.
+    """
+    if scales is None or mapper is None:
+        raise ValueError("scales and mapper should not be None")
+
+    num_levels = len(x_filtered)
+    rois = _convert_to_roi_format(boxes)
+
+    # single feature map: no level assignment needed
+    if num_levels == 1:
+        return roi_align(
+            x_filtered[0],
+            rois,
+            output_size=output_size,
+            spatial_scale=scales[0],
+            sampling_ratio=sampling_ratio,
+        )
+
+    levels = mapper(boxes)
+
+    num_rois = len(rois)
+    num_channels = x_filtered[0].shape[1]
+
+    # the output takes its dtype and device from the first feature map
+    dtype, device = x_filtered[0].dtype, x_filtered[0].device
+    result = torch.zeros(
+        (
+            num_rois,
+            num_channels,
+        )
+        + output_size,
+        dtype=dtype,
+        device=device,
+    )
+
+    # per-level results, collected only while tracing and merged after the loop
+    tracing_results = []
+    for level, (per_level_feature, scale) in enumerate(zip(x_filtered, scales)):
+        # positions of the RoIs assigned to this level
+        idx_in_level = torch.where(levels == level)[0]
+        rois_per_level = rois[idx_in_level]
+
+        result_idx_in_level = roi_align(
+            per_level_feature,
+            rois_per_level,
+            output_size=output_size,
+            spatial_scale=scale,
+            sampling_ratio=sampling_ratio,
+        )
+
+        if torchvision._is_tracing():
+            tracing_results.append(result_idx_in_level.to(dtype))
+        else:
+            # result and result_idx_in_level's dtypes are based on dtypes of different
+            # elements in x_filtered.  x_filtered contains tensors output by different
+            # layers.  When autocast is active, it may choose different dtypes for
+            # different layers' outputs.  Therefore, we defensively match result's dtype
+            # before copying elements from result_idx_in_level in the following op.
+            # We need to cast manually (can't rely on autocast to cast for us) because
+            # the op acts on result in-place, and autocast only affects out-of-place ops.
+            result[idx_in_level] = result_idx_in_level.to(result.dtype)
+
+    # while tracing, the indexed assignment above is skipped; merge via scatter instead
+    if torchvision._is_tracing():
+        result = _onnx_merge_levels(levels, tracing_results)
+
+    return result
+
+
+class MultiScaleRoIAlign(nn.Module):
+    """
+    Multi-scale RoIAlign pooling, which is useful for detection with or without FPN.
+
+    It infers the scale of the pooling via the heuristics specified in eq. 1
+    of the `Feature Pyramid Network paper <https://arxiv.org/abs/1612.03144>`_.
+    The keyword-only parameters ``canonical_scale`` and ``canonical_level``
+    correspond respectively to ``224`` and ``k0=4`` in eq. 1, and
+    have the following meaning: ``canonical_level`` is the target level of the pyramid from
+    which to pool a region of interest with ``w x h = canonical_scale x canonical_scale``.
+
+    Args:
+        featmap_names (List[str]): the names of the feature maps that will be used
+            for the pooling.
+        output_size (int, Tuple[int, int] or List[int]): output size for the pooled region
+        sampling_ratio (int): sampling ratio for ROIAlign
+        canonical_scale (int, optional): canonical_scale for LevelMapper
+        canonical_level (int, optional): canonical_level for LevelMapper
+
+    Examples::
+
+        >>> m = torchvision.ops.MultiScaleRoIAlign(['feat1', 'feat3'], 3, 2)
+        >>> i = OrderedDict()
+        >>> i['feat1'] = torch.rand(1, 5, 64, 64)
+        >>> i['feat2'] = torch.rand(1, 5, 32, 32)  # this feature won't be used in the pooling
+        >>> i['feat3'] = torch.rand(1, 5, 16, 16)
+        >>> # create some random bounding boxes
+        >>> boxes = torch.rand(6, 4) * 256; boxes[:, 2:] += boxes[:, :2]
+        >>> # original image size, before computing the feature maps
+        >>> image_sizes = [(512, 512)]
+        >>> output = m(i, [boxes], image_sizes)
+        >>> print(output.shape)
+        torch.Size([6, 5, 3, 3])
+
+    """
+
+    __annotations__ = {"scales": Optional[list[float]], "map_levels": Optional[LevelMapper]}  # both start as None
+
+    def __init__(
+        self,
+        featmap_names: list[str],
+        output_size: Union[int, tuple[int], list[int]],
+        sampling_ratio: int,
+        *,
+        canonical_scale: int = 224,
+        canonical_level: int = 4,
+    ):
+        super().__init__()
+        _log_api_usage_once(self)
+        if isinstance(output_size, int):  # a single int means a square output
+            output_size = (output_size, output_size)
+        self.featmap_names = featmap_names
+        self.sampling_ratio = sampling_ratio
+        self.output_size = tuple(output_size)
+        self.scales = None  # filled in on the first forward() call
+        self.map_levels = None  # filled in on the first forward() call
+        self.canonical_scale = canonical_scale
+        self.canonical_level = canonical_level
+
+    def forward(
+        self,
+        x: dict[str, Tensor],
+        boxes: list[Tensor],
+        image_shapes: list[tuple[int, int]],
+    ) -> Tensor:
+        """
+        Args:
+            x (OrderedDict[Tensor]): feature maps for each level. They are assumed to have
+                all the same number of channels, but they can have different sizes.
+            boxes (List[Tensor[N, 4]]): boxes to be used to perform the pooling operation, in
+                (x1, y1, x2, y2) format and in the image reference size, not the feature map
+                reference. The coordinates must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+            image_shapes (List[Tuple[height, width]]): the sizes of each image before they
+                have been fed to a CNN to obtain feature maps. This allows us to infer the
+                scale factor for each one of the levels to be pooled.
+        Returns:
+            result (Tensor)
+        """
+        x_filtered = _filter_input(x, self.featmap_names)
+        if self.scales is None or self.map_levels is None:  # computed once, then reused on later calls
+            self.scales, self.map_levels = _setup_scales(
+                x_filtered, image_shapes, self.canonical_scale, self.canonical_level
+            )
+
+        return _multiscale_roi_align(
+            x_filtered,
+            boxes,
+            self.output_size,
+            self.sampling_ratio,
+            self.scales,
+            self.map_levels,
+        )
+
+    def __repr__(self) -> str:
+        return (
+            f"{self.__class__.__name__}(featmap_names={self.featmap_names}, "
+            f"output_size={self.output_size}, sampling_ratio={self.sampling_ratio})"
+        )
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/ps_roi_align.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/ps_roi_align.py
new file mode 100644
index 0000000000000000000000000000000000000000..82809b8f8885667b28eccd22aca60d1dca02f3bf
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/ps_roi_align.py
@@ -0,0 +1,90 @@
+import torch
+import torch.fx
+from torch import nn, Tensor
+from torch.nn.modules.utils import _pair
+from torchvision.extension import _assert_has_ops
+
+from ..utils import _log_api_usage_once
+from ._utils import check_roi_boxes_shape, convert_boxes_to_roi_format
+
+
+@torch.fx.wrap
+def ps_roi_align(
+    input: Tensor,
+    boxes: Tensor,
+    output_size: int,
+    spatial_scale: float = 1.0,
+    sampling_ratio: int = -1,
+) -> Tensor:
+    """
+    Performs Position-Sensitive Region of Interest (RoI) Align operator
+    mentioned in Light-Head R-CNN.
+
+    Args:
+        input (Tensor[N, C, H, W]): The input tensor, i.e. a batch with ``N`` elements. Each element
+            contains ``C`` feature maps of dimensions ``H x W``.
+        boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
+            format where the regions will be taken from.
+            The coordinates must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+            If a single Tensor is passed, then the first column should
+            contain the index of the corresponding element in the batch, i.e. a number in ``[0, N - 1]``.
+            If a list of Tensors is passed, then each Tensor will correspond to the boxes for an element i
+            in the batch.
+        output_size (int or Tuple[int, int]): the size of the output (in bins or pixels) after the pooling
+            is performed, as (height, width).
+        spatial_scale (float): a scaling factor that maps the box coordinates to
+            the input coordinates. For example, if your boxes are defined on the scale
+            of a 224x224 image and your input is a 112x112 feature map (resulting from a 0.5x scaling of
+            the original image), you'll want to set this to 0.5. Default: 1.0
+        sampling_ratio (int): number of sampling points in the interpolation grid
+            used to compute the output value of each pooled output bin. If > 0,
+            then exactly ``sampling_ratio x sampling_ratio`` sampling points per bin are used. If
+            <= 0, then an adaptive number of grid points are used (computed as
+            ``ceil(roi_width / output_width)``, and likewise for height). Default: -1
+
+    Returns:
+        Tensor[K, C / (output_size[0] * output_size[1]), output_size[0], output_size[1]]: The pooled RoIs.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():  # usage logging is skipped under TorchScript
+        _log_api_usage_once(ps_roi_align)
+    _assert_has_ops()
+    check_roi_boxes_shape(boxes)
+    rois = boxes
+    output_size = _pair(output_size)  # normalized so that [0] is the height and [1] the width
+    if not isinstance(rois, torch.Tensor):  # a list of per-image boxes is converted to one roi tensor
+        rois = convert_boxes_to_roi_format(rois)
+    output, _ = torch.ops.torchvision.ps_roi_align(  # the op's second return value is not used here
+        input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio
+    )
+    return output
+
+
+class PSRoIAlign(nn.Module):
+    """
+    Module form of :func:`ps_roi_align` that stores its pooling arguments; see it for details.
+    """
+
+    def __init__(
+        self,
+        output_size: int,
+        spatial_scale: float,
+        sampling_ratio: int,
+    ):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.output_size = output_size
+        self.spatial_scale = spatial_scale
+        self.sampling_ratio = sampling_ratio
+
+    def forward(self, input: Tensor, rois: Tensor) -> Tensor:  # delegates to the functional op
+        return ps_roi_align(input, rois, self.output_size, self.spatial_scale, self.sampling_ratio)
+
+    def __repr__(self) -> str:
+        s = (
+            f"{self.__class__.__name__}("
+            f"output_size={self.output_size}"
+            f", spatial_scale={self.spatial_scale}"
+            f", sampling_ratio={self.sampling_ratio}"
+            f")"
+        )
+        return s
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/ps_roi_pool.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/ps_roi_pool.py
new file mode 100644
index 0000000000000000000000000000000000000000..15292dcad97490aaa740cdec2d0aedb31e5662eb
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/ps_roi_pool.py
@@ -0,0 +1,70 @@
+import torch
+import torch.fx
+from torch import nn, Tensor
+from torch.nn.modules.utils import _pair
+from torchvision.extension import _assert_has_ops
+
+from ..utils import _log_api_usage_once
+from ._utils import check_roi_boxes_shape, convert_boxes_to_roi_format
+
+
+@torch.fx.wrap
+def ps_roi_pool(
+    input: Tensor,
+    boxes: Tensor,
+    output_size: int,
+    spatial_scale: float = 1.0,
+) -> Tensor:
+    """
+    Performs Position-Sensitive Region of Interest (RoI) Pool operator
+    described in R-FCN.
+
+    Args:
+        input (Tensor[N, C, H, W]): The input tensor, i.e. a batch with ``N`` elements. Each element
+            contains ``C`` feature maps of dimensions ``H x W``.
+        boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
+            format where the regions will be taken from.
+            The coordinates must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+            If a single Tensor is passed, then the first column should
+            contain the index of the corresponding element in the batch, i.e. a number in ``[0, N - 1]``.
+            If a list of Tensors is passed, then each Tensor will correspond to the boxes for an element i
+            in the batch.
+        output_size (int or Tuple[int, int]): the size of the output (in bins or pixels) after the pooling
+            is performed, as (height, width).
+        spatial_scale (float): a scaling factor that maps the box coordinates to
+            the input coordinates. For example, if your boxes are defined on the scale
+            of a 224x224 image and your input is a 112x112 feature map (resulting from a 0.5x scaling of
+            the original image), you'll want to set this to 0.5. Default: 1.0
+
+    Returns:
+        Tensor[K, C / (output_size[0] * output_size[1]), output_size[0], output_size[1]]: The pooled RoIs.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():  # usage logging is skipped under TorchScript
+        _log_api_usage_once(ps_roi_pool)
+    _assert_has_ops()
+    check_roi_boxes_shape(boxes)
+    rois = boxes
+    output_size = _pair(output_size)  # normalized so that [0] is the height and [1] the width
+    if not isinstance(rois, torch.Tensor):  # a list of per-image boxes is converted to one roi tensor
+        rois = convert_boxes_to_roi_format(rois)
+    output, _ = torch.ops.torchvision.ps_roi_pool(input, rois, spatial_scale, output_size[0], output_size[1])
+    return output
+
+
+class PSRoIPool(nn.Module):
+    """
+    Module form of :func:`ps_roi_pool` that stores its pooling arguments; see it for details.
+    """
+
+    def __init__(self, output_size: int, spatial_scale: float):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.output_size = output_size
+        self.spatial_scale = spatial_scale
+
+    def forward(self, input: Tensor, rois: Tensor) -> Tensor:  # delegates to the functional op
+        return ps_roi_pool(input, rois, self.output_size, self.spatial_scale)
+
+    def __repr__(self) -> str:
+        s = f"{self.__class__.__name__}(output_size={self.output_size}, spatial_scale={self.spatial_scale})"
+        return s
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/roi_align.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/roi_align.py
new file mode 100644
index 0000000000000000000000000000000000000000..25214d6b13038149d5333c1bab16dc3fb6946396
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/roi_align.py
@@ -0,0 +1,294 @@
+import functools
+from typing import Union
+
+import torch
+import torch.fx
+from torch import nn, Tensor
+from torch._dynamo.utils import is_compile_supported
+from torch.jit.annotations import BroadcastingList2
+from torch.nn.modules.utils import _pair
+from torchvision.extension import _assert_has_ops, _has_ops
+
+from ..utils import _log_api_usage_once
+from ._utils import check_roi_boxes_shape, convert_boxes_to_roi_format
+
+
+def lazy_compile(**compile_kwargs):
+    """Decorator factory: wrap a function with torch.compile lazily, on its first call.
+
+    This avoids eagerly importing dynamo; ``compile_kwargs`` are forwarded to ``torch.compile``.
+    """
+
+    def decorate_fn(fn):
+        @functools.wraps(fn)
+        def compile_hook(*args, **kwargs):
+            compiled_fn = torch.compile(fn, **compile_kwargs)  # compilation is requested only when first called
+            globals()[fn.__name__] = functools.wraps(fn)(compiled_fn)  # rebind the module-level name to the compiled fn
+            return compiled_fn(*args, **kwargs)
+
+        return compile_hook
+
+    return decorate_fn
+
+
+# NB: all inputs are tensors (ymask and xmask may also be None)
+def _bilinear_interpolate(
+    input,  # [N, C, H, W]
+    roi_batch_ind,  # [K]
+    y,  # [K, PH, IY]
+    x,  # [K, PW, IX]
+    ymask,  # [K, IY] or None
+    xmask,  # [K, IX] or None
+):
+    _, channels, height, width = input.size()
+
+    # clamp negative sample coordinates to 0 (points outside the feature map boundary)
+    y = y.clamp(min=0)
+    x = x.clamp(min=0)
+    y_low = y.int()
+    x_low = x.int()
+    y_high = torch.where(y_low >= height - 1, height - 1, y_low + 1)  # upper neighbour, capped at the last row
+    y_low = torch.where(y_low >= height - 1, height - 1, y_low)  # lower neighbour, capped at the last row
+    y = torch.where(y_low >= height - 1, y.to(input.dtype), y)
+
+    x_high = torch.where(x_low >= width - 1, width - 1, x_low + 1)  # right neighbour, capped at the last column
+    x_low = torch.where(x_low >= width - 1, width - 1, x_low)  # left neighbour, capped at the last column
+    x = torch.where(x_low >= width - 1, x.to(input.dtype), x)
+
+    ly = y - y_low
+    lx = x - x_low
+    hy = 1.0 - ly
+    hx = 1.0 - lx
+
+    # do bilinear interpolation, but respect the masking!
+    # TODO: It's possible the masking here is unnecessary if y and
+    # x were clamped appropriately; hard to tell
+    def masked_index(
+        y,  # [K, PH, IY]
+        x,  # [K, PW, IX]
+    ):
+        if ymask is not None:
+            assert xmask is not None
+            y = torch.where(ymask[:, None, :], y, 0)  # masked-out samples read index 0 instead
+            x = torch.where(xmask[:, None, :], x, 0)  # masked-out samples read index 0 instead
+        return input[
+            roi_batch_ind[:, None, None, None, None, None],
+            torch.arange(channels, device=input.device)[None, :, None, None, None, None],
+            y[:, None, :, None, :, None],  # prev [K, PH, IY]
+            x[:, None, None, :, None, :],  # prev [K, PW, IX]
+        ]  # [K, C, PH, PW, IY, IX]
+
+    v1 = masked_index(y_low, x_low)
+    v2 = masked_index(y_low, x_high)
+    v3 = masked_index(y_high, x_low)
+    v4 = masked_index(y_high, x_high)
+
+    # all ws preemptively [K, C, PH, PW, IY, IX]
+    def outer_prod(y, x):
+        return y[:, None, :, None, :, None] * x[:, None, None, :, None, :]
+
+    w1 = outer_prod(hy, hx)
+    w2 = outer_prod(hy, lx)
+    w3 = outer_prod(ly, hx)
+    w4 = outer_prod(ly, lx)
+
+    val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4  # weighted sum of the four neighbouring values
+    return val
+
+
+# TODO: this doesn't actually cache
+# TODO: main library should make this easier to do
+def maybe_cast(tensor):
+    if torch.is_autocast_enabled() and tensor.is_cuda and tensor.dtype != torch.double:  # double is left untouched
+        return tensor.float()  # CUDA tensors under autocast are converted to float32
+    else:
+        return tensor  # everything else is returned unchanged
+
+
+# This is a pure Python and differentiable implementation of roi_align.  When
+# run in eager mode, it uses a lot of memory, but when compiled it has
+# acceptable memory usage.  The main point of this implementation is that
+# its backward pass is deterministic.
+# It is transcribed directly off of the roi_align CUDA kernel, see
+# https://dev-discuss.pytorch.org/t/a-pure-python-implementation-of-roi-align-that-looks-just-like-its-cuda-kernel/1266
+@lazy_compile(dynamic=True)
+def _roi_align(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned):
+    orig_dtype = input.dtype  # remembered so the output can be cast back at the end
+
+    input = maybe_cast(input)
+    rois = maybe_cast(rois)
+
+    _, _, height, width = input.size()
+
+    ph = torch.arange(pooled_height, device=input.device)  # [PH]
+    pw = torch.arange(pooled_width, device=input.device)  # [PW]
+
+    # input: [N, C, H, W]
+    # rois: [K, 5]
+
+    roi_batch_ind = rois[:, 0].int()  # [K]
+    offset = 0.5 if aligned else 0.0  # half-pixel shift applied only when aligned is True
+    roi_start_w = rois[:, 1] * spatial_scale - offset  # [K]
+    roi_start_h = rois[:, 2] * spatial_scale - offset  # [K]
+    roi_end_w = rois[:, 3] * spatial_scale - offset  # [K]
+    roi_end_h = rois[:, 4] * spatial_scale - offset  # [K]
+
+    roi_width = roi_end_w - roi_start_w  # [K]
+    roi_height = roi_end_h - roi_start_h  # [K]
+    if not aligned:  # non-aligned mode forces RoIs to be at least 1x1
+        roi_width = torch.clamp(roi_width, min=1.0)  # [K]
+        roi_height = torch.clamp(roi_height, min=1.0)  # [K]
+
+    bin_size_h = roi_height / pooled_height  # [K]
+    bin_size_w = roi_width / pooled_width  # [K]
+
+    exact_sampling = sampling_ratio > 0  # otherwise the sampling grid size is derived per RoI
+
+    roi_bin_grid_h = sampling_ratio if exact_sampling else torch.ceil(roi_height / pooled_height)  # scalar or [K]
+    roi_bin_grid_w = sampling_ratio if exact_sampling else torch.ceil(roi_width / pooled_width)  # scalar or [K]
+
+    """
+    iy, ix = dims(2)
+    """
+
+    if exact_sampling:
+        count = max(roi_bin_grid_h * roi_bin_grid_w, 1)  # scalar
+        iy = torch.arange(roi_bin_grid_h, device=input.device)  # [IY]
+        ix = torch.arange(roi_bin_grid_w, device=input.device)  # [IX]
+        ymask = None
+        xmask = None
+    else:
+        count = torch.clamp(roi_bin_grid_h * roi_bin_grid_w, min=1)  # [K]
+        # When doing adaptive sampling, the number of samples we need to do
+        # is data-dependent based on how big the ROIs are.  This is a bit
+        # awkward because first-class dims can't actually handle this.
+        # So instead, we inefficiently suppose that we needed to sample ALL
+        # the points and mask out things that turned out to be unnecessary
+        iy = torch.arange(height, device=input.device)  # [IY]
+        ix = torch.arange(width, device=input.device)  # [IX]
+        ymask = iy[None, :] < roi_bin_grid_h[:, None]  # [K, IY]
+        xmask = ix[None, :] < roi_bin_grid_w[:, None]  # [K, IX]
+
+    def from_K(t):
+        return t[:, None, None]
+
+    y = (
+        from_K(roi_start_h)
+        + ph[None, :, None] * from_K(bin_size_h)
+        + (iy[None, None, :] + 0.5).to(input.dtype) * from_K(bin_size_h / roi_bin_grid_h)
+    )  # [K, PH, IY]
+    x = (
+        from_K(roi_start_w)
+        + pw[None, :, None] * from_K(bin_size_w)
+        + (ix[None, None, :] + 0.5).to(input.dtype) * from_K(bin_size_w / roi_bin_grid_w)
+    )  # [K, PW, IX]
+    val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask)  # [K, C, PH, PW, IY, IX]
+
+    # Mask out samples that weren't actually adaptively needed
+    if not exact_sampling:
+        val = torch.where(ymask[:, None, None, None, :, None], val, 0)
+        val = torch.where(xmask[:, None, None, None, None, :], val, 0)
+
+    output = val.sum((-1, -2))  # remove IY, IX ~> [K, C, PH, PW]
+    if isinstance(count, torch.Tensor):  # adaptive sampling: one sample count per RoI
+        output /= count[:, None, None, None]
+    else:
+        output /= count
+
+    output = output.to(orig_dtype)
+
+    return output
+
+
+@torch.fx.wrap
+def roi_align(
+    input: Tensor,
+    boxes: Union[Tensor, list[Tensor]],
+    output_size: BroadcastingList2[int],
+    spatial_scale: float = 1.0,
+    sampling_ratio: int = -1,
+    aligned: bool = False,
+) -> Tensor:
+    """
+    Performs Region of Interest (RoI) Align operator with average pooling, as described in Mask R-CNN.
+
+    Args:
+        input (Tensor[N, C, H, W]): The input tensor, i.e. a batch with ``N`` elements. Each element
+            contains ``C`` feature maps of dimensions ``H x W``.
+            If the tensor is quantized, we expect a batch size of ``N == 1``.
+        boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
+            format where the regions will be taken from.
+            The coordinates must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+            If a single Tensor is passed, then the first column should
+            contain the index of the corresponding element in the batch, i.e. a number in ``[0, N - 1]``.
+            If a list of Tensors is passed, then each Tensor will correspond to the boxes for an element i
+            in the batch.
+        output_size (int or Tuple[int, int]): the size of the output (in bins or pixels) after the pooling
+            is performed, as (height, width).
+        spatial_scale (float): a scaling factor that maps the box coordinates to
+            the input coordinates. For example, if your boxes are defined on the scale
+            of a 224x224 image and your input is a 112x112 feature map (resulting from a 0.5x scaling of
+            the original image), you'll want to set this to 0.5. Default: 1.0
+        sampling_ratio (int): number of sampling points in the interpolation grid
+            used to compute the output value of each pooled output bin. If > 0,
+            then exactly ``sampling_ratio x sampling_ratio`` sampling points per bin are used. If
+            <= 0, then an adaptive number of grid points are used (computed as
+            ``ceil(roi_width / output_width)``, and likewise for height). Default: -1
+        aligned (bool): If False, use the legacy implementation.
+            If True, shift the box coordinates by -0.5 pixel for a better alignment with the two
+            neighboring pixel indices. This version is used in Detectron2.
+
+    Returns:
+        Tensor[K, C, output_size[0], output_size[1]]: The pooled RoIs.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():  # usage logging is skipped under TorchScript
+        _log_api_usage_once(roi_align)
+    check_roi_boxes_shape(boxes)
+    rois = boxes
+    output_size = _pair(output_size)  # normalized so that [0] is the height and [1] the width
+    if not isinstance(rois, torch.Tensor):  # a list of per-image boxes is converted to one roi tensor
+        rois = convert_boxes_to_roi_format(rois)
+    if not torch.jit.is_scripting():  # the Python implementation is only considered outside scripting
+        if (
+            not _has_ops()
+            or (torch.are_deterministic_algorithms_enabled() and (input.is_cuda or input.is_mps or input.is_xpu))
+        ) and is_compile_supported(input.device.type):
+            return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned)
+    _assert_has_ops()  # reached whenever the Python implementation was not taken
+    return torch.ops.torchvision.roi_align(
+        input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned
+    )
+
+
+class RoIAlign(nn.Module):
+    """
+    Module form of :func:`roi_align` that stores its pooling arguments; see it for details.
+    """
+
+    def __init__(
+        self,
+        output_size: BroadcastingList2[int],
+        spatial_scale: float,
+        sampling_ratio: int,
+        aligned: bool = False,
+    ):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.output_size = output_size
+        self.spatial_scale = spatial_scale
+        self.sampling_ratio = sampling_ratio
+        self.aligned = aligned
+
+    def forward(self, input: Tensor, rois: Union[Tensor, list[Tensor]]) -> Tensor:  # delegates to the functional op
+        return roi_align(input, rois, self.output_size, self.spatial_scale, self.sampling_ratio, self.aligned)
+
+    def __repr__(self) -> str:
+        s = (
+            f"{self.__class__.__name__}("
+            f"output_size={self.output_size}"
+            f", spatial_scale={self.spatial_scale}"
+            f", sampling_ratio={self.sampling_ratio}"
+            f", aligned={self.aligned}"
+            f")"
+        )
+        return s
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/roi_pool.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/roi_pool.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f4bb95c0f3e49da94ef46d9f85f9f449531b632
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/roi_pool.py
@@ -0,0 +1,72 @@
+from typing import Union
+
+import torch
+import torch.fx
+from torch import nn, Tensor
+from torch.jit.annotations import BroadcastingList2
+from torch.nn.modules.utils import _pair
+from torchvision.extension import _assert_has_ops
+
+from ..utils import _log_api_usage_once
+from ._utils import check_roi_boxes_shape, convert_boxes_to_roi_format
+
+
+@torch.fx.wrap
+def roi_pool(
+    input: Tensor,
+    boxes: Union[Tensor, list[Tensor]],
+    output_size: BroadcastingList2[int],
+    spatial_scale: float = 1.0,
+) -> Tensor:
+    """
+    Performs Region of Interest (RoI) Pool operator described in Fast R-CNN.
+
+    Args:
+        input (Tensor[N, C, H, W]): The input tensor, i.e. a batch with ``N`` elements. Each element
+            contains ``C`` feature maps of dimensions ``H x W``.
+        boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
+            format where the regions will be taken from.
+            The coordinates must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+            If a single Tensor is passed, then the first column should
+            contain the index of the corresponding element in the batch, i.e. a number in ``[0, N - 1]``.
+            If a list of Tensors is passed, then each Tensor will correspond to the boxes for an element i
+            in the batch.
+        output_size (int or Tuple[int, int]): the size of the output after the cropping
+            is performed, as (height, width).
+        spatial_scale (float): a scaling factor that maps the box coordinates to
+            the input coordinates. For example, if your boxes are defined on the scale
+            of a 224x224 image and your input is a 112x112 feature map (resulting from a 0.5x scaling of
+            the original image), you'll want to set this to 0.5. Default: 1.0
+
+    Returns:
+        Tensor[K, C, output_size[0], output_size[1]]: The pooled RoIs.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():  # usage logging is skipped under TorchScript
+        _log_api_usage_once(roi_pool)
+    _assert_has_ops()
+    check_roi_boxes_shape(boxes)
+    rois = boxes
+    output_size = _pair(output_size)  # normalized so that [0] is the height and [1] the width
+    if not isinstance(rois, torch.Tensor):  # a list of per-image boxes is converted to one roi tensor
+        rois = convert_boxes_to_roi_format(rois)
+    output, _ = torch.ops.torchvision.roi_pool(input, rois, spatial_scale, output_size[0], output_size[1])
+    return output
+
+
+class RoIPool(nn.Module):
+    """
+    Module form of :func:`roi_pool` that stores its pooling arguments; see it for details.
+    """
+
+    def __init__(self, output_size: BroadcastingList2[int], spatial_scale: float):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.output_size = output_size
+        self.spatial_scale = spatial_scale
+
+    def forward(self, input: Tensor, rois: Union[Tensor, list[Tensor]]) -> Tensor:  # delegates to the functional op
+        return roi_pool(input, rois, self.output_size, self.spatial_scale)
+
+    def __repr__(self) -> str:
+        s = f"{self.__class__.__name__}(output_size={self.output_size}, spatial_scale={self.spatial_scale})"
+        return s
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/stochastic_depth.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/stochastic_depth.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff8167b2315e941f7e31a0626eeec270d350a710
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/ops/stochastic_depth.py
@@ -0,0 +1,66 @@
+import torch
+import torch.fx
+from torch import nn, Tensor
+
+from ..utils import _log_api_usage_once
+
+
+def stochastic_depth(input: Tensor, p: float, mode: str, training: bool = True) -> Tensor:
+    """
+    Implements the Stochastic Depth from `"Deep Networks with Stochastic Depth"
+    <https://arxiv.org/abs/1603.09382>`_ used for randomly dropping residual
+    branches of residual architectures.
+
+    Args:
+        input (Tensor[N, ...]): The input tensor of arbitrary dimensions with the first one
+                    being its batch i.e. a batch with ``N`` rows.
+        p (float): probability of the input to be zeroed.
+        mode (str): ``"batch"`` or ``"row"``.
+                    ``"batch"`` randomly zeroes the entire input, ``"row"`` zeroes
+                    randomly selected rows from the batch.
+        training (bool): apply stochastic depth if it is ``True``. Default: ``True``
+
+    Returns:
+        Tensor[N, ...]: The randomly zeroed tensor.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():  # usage logging is skipped under TorchScript
+        _log_api_usage_once(stochastic_depth)
+    if p < 0.0 or p > 1.0:
+        raise ValueError(f"drop probability has to be between 0 and 1, but got {p}")
+    if mode not in ["batch", "row"]:
+        raise ValueError(f"mode has to be either 'batch' or 'row', but got {mode}")
+    if not training or p == 0.0:  # nothing to drop: the input is returned as is
+        return input
+
+    survival_rate = 1.0 - p
+    if mode == "row":
+        size = [input.shape[0]] + [1] * (input.ndim - 1)  # one noise value per row, broadcast over the rest
+    else:
+        size = [1] * input.ndim  # a single noise value broadcast over the whole input
+    noise = torch.empty(size, dtype=input.dtype, device=input.device)
+    noise = noise.bernoulli_(survival_rate)
+    if survival_rate > 0.0:  # skip the rescale when p == 1 to avoid dividing by zero
+        noise.div_(survival_rate)
+    return input * noise
+
+
+torch.fx.wrap("stochastic_depth")  # registered by name so torch.fx treats it as a leaf call when tracing
+
+
+class StochasticDepth(nn.Module):
+    """
+    Module form of :func:`stochastic_depth` that stores ``p`` and ``mode``; see it for details.
+    """
+
+    def __init__(self, p: float, mode: str) -> None:
+        super().__init__()
+        _log_api_usage_once(self)
+        self.p = p
+        self.mode = mode
+
+    def forward(self, input: Tensor) -> Tensor:  # the module's own training flag is passed through
+        return stochastic_depth(input, self.p, self.mode, self.training)
+
+    def __repr__(self) -> str:
+        s = f"{self.__class__.__name__}(p={self.p}, mode={self.mode})"
+        return s
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..77680a14f0d0599f4004a2ce5c299c0f5e13a0d5
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__init__.py
@@ -0,0 +1,2 @@
+from .transforms import *
+from .autoaugment import *
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0cb3267e7ad720cdebde04045cb3196321dbb841
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/_functional_pil.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/_functional_pil.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..67f95fd7f39c2be4cdf4298bef9a3abae11cc181
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/_functional_pil.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/_functional_tensor.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/_functional_tensor.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2ee4331aae9a3c6fa57d03950576a4728ffefa9e
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/_functional_tensor.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/_functional_video.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/_functional_video.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9b47eb33c3fdc2eed8fb8f15c53a53c701c5145f
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/_functional_video.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/_presets.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/_presets.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e817942756e57bf60e09eef27c58e4dd3cb78d9
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/_presets.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/_transforms_video.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/_transforms_video.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..290a8f502b37e80625a1c86ac58ea50ba9e66d5c
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/_transforms_video.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/autoaugment.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/autoaugment.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2febc729d3097aa36e6e0f99123b1e14f3f5cbf6
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/autoaugment.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/functional.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/functional.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6bf5ec7ed4465904b28c79fafc945c2120a62e4c
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/__pycache__/functional.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/_functional_pil.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/_functional_pil.py
new file mode 100644
index 0000000000000000000000000000000000000000..56b806cf6edfe7657cce7a67562b53cf494ba814
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/_functional_pil.py
@@ -0,0 +1,396 @@
+import numbers
+from collections.abc import Sequence
+from typing import Any, Literal, Optional, Union
+
+import numpy as np
+import torch
+from PIL import Image, ImageEnhance, ImageOps
+
+from ..utils import _Image_fromarray
+
+try:
+    import accimage
+except ImportError:
+    accimage = None
+
+
+@torch.jit.unused
+def _is_pil_image(img: Any) -> bool:
+    """Tell whether ``img`` is a PIL image (or an accimage image when accimage is importable)."""
+    # accimage is optional; its Image type is only accepted when the import succeeded.
+    accepted = Image.Image if accimage is None else (Image.Image, accimage.Image)
+    return isinstance(img, accepted)
+
+
+@torch.jit.unused
+def get_dimensions(img: Any) -> list[int]:
+    """Return ``[channels, height, width]`` of a PIL/accimage image; other types raise ``TypeError``."""
+    if not _is_pil_image(img):
+        raise TypeError(f"Unexpected type {type(img)}")
+    # Objects without ``getbands`` expose the channel count as ``.channels`` instead.
+    n_channels = len(img.getbands()) if hasattr(img, "getbands") else img.channels
+    # ``img.size`` is unpacked as (width, height).
+    w, h = img.size
+    return [n_channels, h, w]
+
+
+@torch.jit.unused
+def get_image_size(img: Any) -> list[int]:  # ``img.size`` as a list; TypeError for non-PIL input
+    if not _is_pil_image(img):
+        raise TypeError(f"Unexpected type {type(img)}")
+    return list(img.size)
+
+
+@torch.jit.unused
+def get_image_num_channels(img: Any) -> int:
+    """Return the number of channels of a PIL/accimage image; other types raise ``TypeError``."""
+    if not _is_pil_image(img):
+        raise TypeError(f"Unexpected type {type(img)}")
+    # Prefer the band tuple; objects without ``getbands`` provide ``.channels`` instead.
+    has_bands = hasattr(img, "getbands")
+    return len(img.getbands()) if has_bands else img.channels
+
+
+@torch.jit.unused
+def hflip(img: Image.Image) -> Image.Image:
+    """Transpose a PIL image with ``Image.FLIP_LEFT_RIGHT``; non-PIL input raises ``TypeError``."""
+    if _is_pil_image(img):
+        return img.transpose(Image.FLIP_LEFT_RIGHT)
+    raise TypeError(f"img should be PIL Image. Got {type(img)}")
+
+
+@torch.jit.unused
+def vflip(img: Image.Image) -> Image.Image:
+    """Transpose a PIL image with ``Image.FLIP_TOP_BOTTOM``; non-PIL input raises ``TypeError``."""
+    if _is_pil_image(img):
+        return img.transpose(Image.FLIP_TOP_BOTTOM)
+    raise TypeError(f"img should be PIL Image. Got {type(img)}")
+
+
+@torch.jit.unused
+def adjust_brightness(img: Image.Image, brightness_factor: float) -> Image.Image:
+    """Apply ``ImageEnhance.Brightness`` with ``brightness_factor`` to a PIL image.
+
+    Raises ``TypeError`` when ``img`` is not a PIL image."""
+    if _is_pil_image(img):
+        return ImageEnhance.Brightness(img).enhance(brightness_factor)
+    raise TypeError(f"img should be PIL Image. Got {type(img)}")
+
+
+@torch.jit.unused
+def adjust_contrast(img: Image.Image, contrast_factor: float) -> Image.Image:
+    """Apply ``ImageEnhance.Contrast`` with ``contrast_factor`` to a PIL image.
+
+    Raises ``TypeError`` when ``img`` is not a PIL image."""
+    if _is_pil_image(img):
+        return ImageEnhance.Contrast(img).enhance(contrast_factor)
+    raise TypeError(f"img should be PIL Image. Got {type(img)}")
+
+
+@torch.jit.unused
+def adjust_saturation(img: Image.Image, saturation_factor: float) -> Image.Image:
+    """Apply ``ImageEnhance.Color`` with ``saturation_factor`` to a PIL image.
+
+    Raises ``TypeError`` when ``img`` is not a PIL image."""
+    if _is_pil_image(img):
+        return ImageEnhance.Color(img).enhance(saturation_factor)
+    raise TypeError(f"img should be PIL Image. Got {type(img)}")
+
+
+@torch.jit.unused
+def adjust_hue(img: Image.Image, hue_factor: float) -> Image.Image:
+    """Shift the hue channel of a PIL image by ``hue_factor`` (must lie in [-0.5, 0.5]).
+
+    Modes "L", "1", "I" and "F" are returned unchanged; any other mode is converted to HSV,
+    shifted and converted back. Raises ``ValueError`` for an out-of-range factor and
+    ``TypeError`` for non-PIL input (checked in that order).
+    """
+    if not (-0.5 <= hue_factor <= 0.5):
+        raise ValueError(f"hue_factor ({hue_factor}) is not in [-0.5, 0.5].")
+    if not _is_pil_image(img):
+        raise TypeError(f"img should be PIL Image. Got {type(img)}")
+
+    original_mode = img.mode
+    if original_mode in {"L", "1", "I", "F"}:
+        return img
+    hue, sat, val = img.convert("HSV").split()
+    hue_arr = np.array(hue, dtype=np.uint8)
+    # The uint8 addition over/underflows (wraps around), which is the intended behaviour.
+    hue_arr += np.int32(hue_factor * 255).astype(np.uint8)
+    shifted = Image.merge("HSV", (_Image_fromarray(hue_arr, "L"), sat, val))
+    return shifted.convert(original_mode)
+
+
+@torch.jit.unused
+def adjust_gamma(
+    img: Image.Image,
+    gamma: float,
+    gain: float = 1.0,
+) -> Image.Image:
+    """Gamma-correct a PIL image through a 256-entry lookup table applied in RGB mode.
+
+    The image is converted to RGB, mapped with ``img.point`` and converted back to its
+    original mode. Raises ``TypeError`` for non-PIL input and ``ValueError`` if ``gamma < 0``.
+    """
+    if not _is_pil_image(img):
+        raise TypeError(f"img should be PIL Image. Got {type(img)}")
+    if gamma < 0:
+        raise ValueError("Gamma should be a non-negative real number")
+
+    original_mode = img.mode
+    # The 256-entry curve is repeated three times, once per RGB band.
+    curve = [int((255 + 1 - 1e-3) * gain * pow(level / 255.0, gamma)) for level in range(256)]
+    return img.convert("RGB").point(curve * 3).convert(original_mode)
+
+
+@torch.jit.unused
+def pad(
+    img: Image.Image,
+    padding: Union[int, list[int], tuple[int, ...]],
+    fill: Optional[Union[float, list[float], tuple[float, ...]]] = 0,
+    padding_mode: Literal["constant", "edge", "reflect", "symmetric"] = "constant",
+) -> Image.Image:
+    """Pad a PIL image on its borders (negative non-constant padding crops instead).
+
+    ``padding`` is one number for all sides, ``(left/right, top/bottom)`` or
+    ``(left, top, right, bottom)``; a 1-element sequence counts as a single number.
+    ``fill`` is only used by ``"constant"`` mode; ``"edge"``, ``"reflect"`` and
+    ``"symmetric"`` are delegated to ``np.pad``.
+
+    Raises ``TypeError`` for a non-PIL ``img`` or wrongly typed arguments and
+    ``ValueError`` for a sequence of length other than 1, 2 or 4 or an unknown mode.
+    """
+    if not _is_pil_image(img):
+        raise TypeError(f"img should be PIL Image. Got {type(img)}")
+    if not isinstance(padding, (numbers.Number, tuple, list)):
+        raise TypeError("Got inappropriate padding arg")
+    if fill is not None and not isinstance(fill, (numbers.Number, tuple, list)):
+        raise TypeError("Got inappropriate fill arg")
+    if not isinstance(padding_mode, str):
+        raise TypeError("Got inappropriate padding_mode arg")
+
+    if isinstance(padding, list):
+        padding = tuple(padding)
+    if isinstance(padding, tuple) and len(padding) not in [1, 2, 4]:
+        raise ValueError(f"Padding must be an int or a 1, 2, or 4 element tuple, not a {len(padding)} element tuple")
+    if isinstance(padding, tuple) and len(padding) == 1:
+        # Compatibility with `functional_tensor.pad`
+        padding = padding[0]
+    if padding_mode not in ["constant", "edge", "reflect", "symmetric"]:
+        raise ValueError("Padding mode should be either constant, edge, reflect or symmetric")
+
+    if padding_mode == "constant":
+        opts = _parse_fill(fill, img, name="fill")
+        if img.mode == "P":
+            palette = img.getpalette()
+            image = ImageOps.expand(img, border=padding, **opts)
+            image.putpalette(palette)
+            return image
+        return ImageOps.expand(img, border=padding, **opts)
+    else:
+        # Accept every scalar that passed the ``numbers.Number`` check above (e.g. numpy integers),
+        # not just builtin ``int``; otherwise the four pad values would be left unbound.
+        if isinstance(padding, numbers.Number):
+            pad_left = pad_right = pad_top = pad_bottom = padding
+        elif isinstance(padding, tuple) and len(padding) == 2:
+            pad_left = pad_right = padding[0]
+            pad_top = pad_bottom = padding[1]
+        elif isinstance(padding, tuple) and len(padding) == 4:
+            pad_left, pad_top, pad_right, pad_bottom = padding
+
+        p = [pad_left, pad_top, pad_right, pad_bottom]
+        # Negative entries request cropping on that side; split them off before padding.
+        cropping = -np.minimum(p, 0)
+        if cropping.any():
+            crop_left, crop_top, crop_right, crop_bottom = cropping
+            img = img.crop((crop_left, crop_top, img.width - crop_right, img.height - crop_bottom))
+        pad_left, pad_top, pad_right, pad_bottom = np.maximum(p, 0)
+
+        if img.mode == "P":
+            palette = img.getpalette()
+            img = np.asarray(img)
+            img = np.pad(img, ((pad_top, pad_bottom), (pad_left, pad_right)), mode=padding_mode)
+            img = Image.fromarray(img)
+            img.putpalette(palette)
+            return img
+
+        img = np.asarray(img)
+        # 3-D arrays (e.g. RGB) keep their last axis unpadded; 2-D arrays (grayscale) have none.
+        if len(img.shape) == 3:
+            img = np.pad(img, ((pad_top, pad_bottom), (pad_left, pad_right), (0, 0)), padding_mode)
+        if len(img.shape) == 2:
+            img = np.pad(img, ((pad_top, pad_bottom), (pad_left, pad_right)), padding_mode)
+        return Image.fromarray(img)
+
+
+@torch.jit.unused
+def crop(
+    img: Image.Image,
+    top: int,
+    left: int,
+    height: int,
+    width: int,
+) -> Image.Image:
+    """Return the ``height`` x ``width`` region of a PIL image whose top-left corner is (``left``, ``top``)."""
+    if not _is_pil_image(img):
+        raise TypeError(f"img should be PIL Image. Got {type(img)}")
+    right, bottom = left + width, top + height  # the box handed to ``Image.crop`` is (left, top, right, bottom)
+    return img.crop((left, top, right, bottom))
+
+
+@torch.jit.unused
+def resize(
+    img: Image.Image,
+    size: Union[list[int], int],
+    interpolation: int = Image.BILINEAR,
+) -> Image.Image:
+    """Resize a PIL image; ``size`` must be a 2-element list and is reversed before ``Image.resize``."""
+    if not _is_pil_image(img):
+        raise TypeError(f"img should be PIL Image. Got {type(img)}")
+    if not isinstance(size, list) or len(size) != 2:
+        raise TypeError(f"Got inappropriate size arg: {size}")
+    reversed_size = tuple(reversed(size))  # presumably [h, w] -> PIL's (w, h); confirm against callers
+    return img.resize(reversed_size, interpolation)
+
+
+@torch.jit.unused
+def _parse_fill(
+    fill: Optional[Union[float, list[float], tuple[float, ...]]],
+    img: Image.Image,
+    name: str = "fillcolor",
+) -> dict[str, Optional[Union[float, list[float], tuple[float, ...]]]]:
+    """Normalise ``fill`` for ``img`` and wrap it as ``{name: fill}`` for keyword expansion.
+
+    ``None`` becomes 0, scalars are repeated per channel on multi-channel images and
+    1-element sequences are repeated ``num_channels`` times. Values are cast to ``int``
+    unless the image mode is "F". Raises ``ValueError`` on a channel-count mismatch.
+    """
+    num_channels = get_image_num_channels(img)
+    if fill is None:
+        fill = 0
+    if num_channels > 1 and isinstance(fill, (int, float)):
+        fill = (fill,) * num_channels
+    if isinstance(fill, (list, tuple)):
+        if len(fill) == 1:
+            fill = fill * num_channels
+        elif len(fill) != num_channels:
+            msg = "The number of elements in 'fill' does not match the number of channels of the image ({} != {})"
+            raise ValueError(msg.format(len(fill), num_channels))
+        fill = tuple(fill)  # type: ignore[arg-type]
+    if img.mode != "F":
+        # Every mode except "F" gets its fill value(s) converted with ``int``.
+        is_sequence = isinstance(fill, (list, tuple))
+        fill = tuple(int(x) for x in fill) if is_sequence else int(fill)
+    return {name: fill}
+
+
+@torch.jit.unused
+def affine(
+    img: Image.Image,
+    matrix: list[float],
+    interpolation: int = Image.NEAREST,
+    fill: Optional[Union[int, float, Sequence[int], Sequence[float]]] = None,
+) -> Image.Image:
+    """Apply the affine ``matrix`` to a PIL image via ``Image.transform``, keeping the image size."""
+    if not _is_pil_image(img):
+        raise TypeError(f"img should be PIL Image. Got {type(img)}")
+    # ``_parse_fill`` returns the fill wrapped as a keyword dict (key "fillcolor" by default).
+    fill_kwargs = _parse_fill(fill, img)
+    size = img.size
+    return img.transform(size, Image.AFFINE, matrix, interpolation, **fill_kwargs)
+
+
+@torch.jit.unused
+def rotate(
+    img: Image.Image,
+    angle: float,
+    interpolation: int = Image.NEAREST,
+    expand: bool = False,
+    center: Optional[tuple[int, int]] = None,
+    fill: Optional[Union[int, float, Sequence[int], Sequence[float]]] = None,
+) -> Image.Image:
+    """Rotate a PIL image by ``angle`` via ``Image.rotate``; ``fill`` is normalised by ``_parse_fill``."""
+    if _is_pil_image(img):
+        fill_kwargs = _parse_fill(fill, img)
+        return img.rotate(angle, interpolation, expand, center, **fill_kwargs)
+    # Anything that is not a PIL image is rejected.
+    raise TypeError(f"img should be PIL Image. Got {type(img)}")
+
+
+@torch.jit.unused
+def perspective(
+    img: Image.Image,
+    perspective_coeffs: list[float],
+    interpolation: int = Image.BICUBIC,
+    fill: Optional[Union[int, float, Sequence[int], Sequence[float]]] = None,
+) -> Image.Image:
+    """Apply a perspective transform with ``perspective_coeffs`` to a PIL image, keeping its size."""
+    if not _is_pil_image(img):
+        raise TypeError(f"img should be PIL Image. Got {type(img)}")
+    # ``_parse_fill`` returns the fill wrapped as a keyword dict (key "fillcolor" by default).
+    fill_kwargs = _parse_fill(fill, img)
+    method = Image.PERSPECTIVE
+    return img.transform(img.size, method, perspective_coeffs, interpolation, **fill_kwargs)
+
+
+@torch.jit.unused
+def to_grayscale(img: Image.Image, num_output_channels: int) -> Image.Image:
+    """Convert a PIL image to grayscale with 1 ("L") or 3 (replicated "RGB") output channels.
+
+    Raises ``TypeError`` for non-PIL input and ``ValueError`` for any other channel count."""
+    if not _is_pil_image(img):
+        raise TypeError(f"img should be PIL Image. Got {type(img)}")
+    if num_output_channels not in (1, 3):
+        raise ValueError("num_output_channels should be either 1 or 3")
+    gray = img.convert("L")
+    if num_output_channels == 1:
+        return gray
+    # Stack the single "L" plane three times to obtain an RGB image.
+    plane = np.array(gray, dtype=np.uint8)
+    stacked = np.dstack([plane, plane, plane])
+    return _Image_fromarray(stacked, "RGB")
+
+
+@torch.jit.unused
+def invert(img: Image.Image) -> Image.Image:  # thin wrapper over ``ImageOps.invert``; TypeError for non-PIL input
+    if _is_pil_image(img):
+        return ImageOps.invert(img)
+    raise TypeError(f"img should be PIL Image. Got {type(img)}")
+
+
+@torch.jit.unused
+def posterize(img: Image.Image, bits: int) -> Image.Image:  # thin wrapper over ``ImageOps.posterize``
+    if _is_pil_image(img):
+        return ImageOps.posterize(img, bits)
+    raise TypeError(f"img should be PIL Image. Got {type(img)}")
+
+
+@torch.jit.unused
+def solarize(img: Image.Image, threshold: int) -> Image.Image:  # thin wrapper over ``ImageOps.solarize``
+    if _is_pil_image(img):
+        return ImageOps.solarize(img, threshold)
+    raise TypeError(f"img should be PIL Image. Got {type(img)}")
+
+
+@torch.jit.unused
+def adjust_sharpness(img: Image.Image, sharpness_factor: float) -> Image.Image:
+    """Apply ``ImageEnhance.Sharpness`` with ``sharpness_factor`` to a PIL image.
+
+    Raises ``TypeError`` when ``img`` is not a PIL image."""
+    if _is_pil_image(img):
+        return ImageEnhance.Sharpness(img).enhance(sharpness_factor)
+    raise TypeError(f"img should be PIL Image. Got {type(img)}")
+
+
+@torch.jit.unused
+def autocontrast(img: Image.Image) -> Image.Image:  # thin wrapper over ``ImageOps.autocontrast``
+    if _is_pil_image(img):
+        return ImageOps.autocontrast(img)
+    raise TypeError(f"img should be PIL Image. Got {type(img)}")
+
+
+@torch.jit.unused
+def equalize(img: Image.Image) -> Image.Image:  # thin wrapper over ``ImageOps.equalize``
+    if _is_pil_image(img):
+        return ImageOps.equalize(img)
+    raise TypeError(f"img should be PIL Image. Got {type(img)}")
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/_functional_tensor.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/_functional_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..71409c40af31fa76debcced5211284437d2f1bdd
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/_functional_tensor.py
@@ -0,0 +1,962 @@
+import warnings
+from typing import Optional, Union
+
+import torch
+from torch import Tensor
+from torch.nn.functional import conv2d, grid_sample, interpolate, pad as torch_pad
+
+
+def _is_tensor_a_torch_image(x: Tensor) -> bool:
+    return x.ndim >= 2  # a tensor counts as an image as soon as it has at least two dimensions
+
+
+def _assert_image_tensor(img: Tensor) -> None:  # raise TypeError unless _is_tensor_a_torch_image(img) holds
+    if not _is_tensor_a_torch_image(img):
+        raise TypeError("Tensor is not a torch image.")
+
+
+def get_dimensions(img: Tensor) -> list[int]:  # -> [channels, height, width]
+    _assert_image_tensor(img)
+    channels = 1 if img.ndim == 2 else img.shape[-3]  # a 2-D tensor has no channel axis, so report 1
+    height, width = img.shape[-2:]  # the two trailing dims are taken as (height, width)
+    return [channels, height, width]
+
+
+def get_image_size(img: Tensor) -> list[int]:
+    # Returns [width, height] of the tensor image: the last dim is the width, the one before it the height.
+    _assert_image_tensor(img)
+    return [img.shape[-1], img.shape[-2]]
+
+
+def get_image_num_channels(img: Tensor) -> int:  # channel count: 1 for 2-D input, else the size of dim -3
+    _assert_image_tensor(img)
+    if img.ndim == 2:
+        return 1
+    elif img.ndim > 2:
+        return img.shape[-3]
+    # Only reached when ndim < 2, which the assertion above already rejects with its own TypeError.
+    raise TypeError(f"Input ndim should be 2 or more. Got {img.ndim}")
+
+
+def _max_value(dtype: torch.dtype) -> int:
+    """Return the largest value of the listed integer dtypes, and 1 for every other dtype."""
+    if dtype == torch.uint8:
+        return 255
+    if dtype == torch.int8:
+        return 127
+    if dtype == torch.int16:
+        return 32767
+    if dtype == torch.uint16:
+        return 65535
+    if dtype == torch.int32:
+        return 2147483647
+    if dtype == torch.int64:
+        return 9223372036854775807
+    # This is only here for completeness. This value is implicitly assumed in a lot of places so
+    # changing it is not easy.
+    return 1
+
+
+def _assert_channels(img: Tensor, permitted: list[int]) -> None:  # TypeError unless the channel count is in ``permitted``
+    c = get_dimensions(img)[0]  # first entry of [channels, height, width]
+    if c not in permitted:
+        raise TypeError(f"Input image tensor permitted channel values are {permitted}, but found {c}")
+
+
+def convert_image_dtype(image: torch.Tensor, dtype: torch.dtype = torch.float) -> torch.Tensor:
+    """Convert ``image`` to ``dtype``, rescaling values between integer and floating-point ranges.
+
+    float -> float is a plain cast, float -> int multiplies by ``max + 1 - eps``, int -> float divides by
+    the input dtype maximum and int -> int scales by an integer factor. float32 -> int32/int64 and
+    float64 -> int64 raise ``RuntimeError``."""
+    if image.dtype == dtype:
+        return image
+
+    if image.is_floating_point():
+        # TODO: replace with dtype.is_floating_point when torchscript supports it
+        if torch.tensor(0, dtype=dtype).is_floating_point():
+            return image.to(dtype)
+        # float to int
+        if (image.dtype == torch.float32 and dtype in (torch.int32, torch.int64)) or (
+            image.dtype == torch.float64 and dtype == torch.int64
+        ):
+            msg = f"The cast from {image.dtype} to {dtype} cannot be performed safely."
+            raise RuntimeError(msg)
+
+        # https://github.com/pytorch/vision/pull/2078#issuecomment-612045321
+        # For data in the range 0-1, (float * 255).to(uint) is only 255
+        # when float is exactly 1.0.
+        # `max + 1 - epsilon` provides more evenly distributed mapping of
+        # ranges of floats to ints.
+        eps = 1e-3
+        max_val = float(_max_value(dtype))
+        result = image.mul(max_val + 1.0 - eps)
+        return result.to(dtype)
+    else:
+        input_max = float(_max_value(image.dtype))
+        # int to float
+        # TODO: replace with dtype.is_floating_point when torchscript supports it
+        if torch.tensor(0, dtype=dtype).is_floating_point():
+            image = image.to(dtype)
+            return image / input_max
+        output_max = float(_max_value(dtype))
+        # int to int
+        if input_max > output_max:
+            # factor should be forced to int for torch jit script
+            # otherwise factor is a float and image // factor can produce different results
+            factor = int((input_max + 1) // (output_max + 1))
+            image = torch.div(image, factor, rounding_mode="floor")
+            return image.to(dtype)
+        else:
+            # factor should be forced to int for torch jit script
+            # otherwise factor is a float and image * factor can produce different results
+            factor = int((output_max + 1) // (input_max + 1))
+            image = image.to(dtype)
+            return image * factor
+
+
+def vflip(img: Tensor) -> Tensor:
+    """Flip the image tensor along dim -2 (second-to-last dimension)."""
+    _assert_image_tensor(img)
+    return img.flip(-2)
+
+
+def hflip(img: Tensor) -> Tensor:
+    """Flip the image tensor along dim -1 (last dimension)."""
+    _assert_image_tensor(img)
+    return img.flip(-1)
+
+
+def crop(img: Tensor, top: int, left: int, height: int, width: int) -> Tensor:
+    """Return the ``height`` x ``width`` window starting at row ``top``, column ``left`` of the last two dims."""
+    _assert_image_tensor(img)
+    _, h, w = get_dimensions(img)
+    right = left + width
+    bottom = top + height
+    # A window crossing a border is sliced to its in-bounds part, then padded with fill=0 by the amounts below.
+    if left < 0 or top < 0 or right > w or bottom > h:
+        padding_ltrb = [
+            max(-left + min(0, right), 0),
+            max(-top + min(0, bottom), 0),
+            max(right - max(w, left), 0),
+            max(bottom - max(h, top), 0),
+        ]
+        return pad(img[..., max(top, 0) : bottom, max(left, 0) : right], padding_ltrb, fill=0)
+    return img[..., top:bottom, left:right]
+
+
+def rgb_to_grayscale(img: Tensor, num_output_channels: int = 1) -> Tensor:
+    """Grayscale ``img``: a weighted R/G/B sum for 3-channel input, a clone for 1-channel input.
+
+    ``num_output_channels == 3`` expands the result to ``img.shape``; bad input raises TypeError/ValueError."""
+    if img.ndim < 3:
+        raise TypeError(f"Input image tensor should have at least 3 dimensions, but found {img.ndim}")
+    _assert_channels(img, [1, 3])
+    if num_output_channels not in (1, 3):
+        raise ValueError("num_output_channels should be either 1 or 3")
+    if img.shape[-3] == 3:
+        r, g, b = img.unbind(dim=-3)
+        # This implementation closely follows the TF one:
+        # https://github.com/tensorflow/tensorflow/blob/v2.3.0/tensorflow/python/ops/image_ops_impl.py#L2105-L2138
+        l_img = (0.2989 * r + 0.587 * g + 0.114 * b).to(img.dtype)
+        l_img = l_img.unsqueeze(dim=-3)
+    else:
+        l_img = img.clone()
+    if num_output_channels == 3:
+        return l_img.expand(img.shape)
+
+    return l_img
+
+
+def adjust_brightness(img: Tensor, brightness_factor: float) -> Tensor:
+    """Scale the brightness of a 1- or 3-channel image tensor; a negative factor raises ``ValueError``."""
+    if brightness_factor < 0:
+        raise ValueError(f"brightness_factor ({brightness_factor}) is not non-negative.")
+    _assert_image_tensor(img)
+
+    _assert_channels(img, [1, 3])
+    # Blending with an all-zero image multiplies every pixel by the factor; ``_blend`` clamps the result.
+    return _blend(img, torch.zeros_like(img), brightness_factor)
+
+
+def adjust_contrast(img: Tensor, contrast_factor: float) -> Tensor:
+    """Blend a 1- or 3-channel image tensor with its mean gray level; a negative factor raises ``ValueError``."""
+    if contrast_factor < 0:
+        raise ValueError(f"contrast_factor ({contrast_factor}) is not non-negative.")
+    _assert_image_tensor(img)
+
+    _assert_channels(img, [3, 1])
+    c = get_dimensions(img)[0]
+    dtype = img.dtype if torch.is_floating_point(img) else torch.float32  # integer images are averaged in float32
+    if c == 3:
+        mean = torch.mean(rgb_to_grayscale(img).to(dtype), dim=(-3, -2, -1), keepdim=True)
+    else:
+        mean = torch.mean(img.to(dtype), dim=(-3, -2, -1), keepdim=True)
+
+    return _blend(img, mean, contrast_factor)
+
+
+def adjust_hue(img: Tensor, hue_factor: float) -> Tensor:
+    """Shift the hue of a 3-channel image tensor by ``hue_factor`` in [-0.5, 0.5]; 1-channel input is returned as is."""
+    if not (-0.5 <= hue_factor <= 0.5):
+        raise ValueError(f"hue_factor ({hue_factor}) is not in [-0.5, 0.5].")
+    if not (isinstance(img, torch.Tensor)):
+        raise TypeError("Input img should be Tensor image")
+
+    _assert_image_tensor(img)
+
+    _assert_channels(img, [1, 3])
+    if get_dimensions(img)[0] == 1:  # Match PIL behaviour
+        return img
+
+    orig_dtype = img.dtype
+    img = convert_image_dtype(img, torch.float32)
+    # Work in HSV space: add the factor to the hue channel and wrap it with ``% 1.0``.
+    img = _rgb2hsv(img)
+    h, s, v = img.unbind(dim=-3)
+    h = (h + hue_factor) % 1.0
+    img = torch.stack((h, s, v), dim=-3)
+    img_hue_adj = _hsv2rgb(img)
+
+    return convert_image_dtype(img_hue_adj, orig_dtype)
+
+
+def adjust_saturation(img: Tensor, saturation_factor: float) -> Tensor:
+    """Adjust the saturation of a 3-channel image tensor; 1-channel input is returned unchanged."""
+    if saturation_factor < 0:
+        raise ValueError(f"saturation_factor ({saturation_factor}) is not non-negative.")
+    _assert_image_tensor(img)
+
+    _assert_channels(img, [1, 3])
+
+    if get_dimensions(img)[0] == 1:  # Match PIL behaviour
+        return img
+    # Blend the image with its grayscale version, weighted by ``saturation_factor``.
+    return _blend(img, rgb_to_grayscale(img), saturation_factor)
+
+
+def adjust_gamma(img: Tensor, gamma: float, gain: float = 1) -> Tensor:
+    """Apply ``gain * img ** gamma`` to a 1- or 3-channel image tensor and return it in the input dtype."""
+    if not isinstance(img, torch.Tensor):
+        raise TypeError("Input img should be a Tensor.")
+    _assert_channels(img, [1, 3])
+
+    if gamma < 0:
+        raise ValueError("Gamma should be a non-negative real number")
+
+    result = img
+    dtype = img.dtype
+    if not torch.is_floating_point(img):
+        result = convert_image_dtype(result, torch.float32)
+    # The power law is evaluated on floating-point values and clamped to [0, 1].
+    result = (gain * result**gamma).clamp(0, 1)
+
+    result = convert_image_dtype(result, dtype)
+    return result
+
+
+def _blend(img1: Tensor, img2: Tensor, ratio: float) -> Tensor:  # ratio * img1 + (1 - ratio) * img2, clamped, cast to img1.dtype
+    ratio = float(ratio)
+    bound = _max_value(img1.dtype)  # upper clamp limit for img1's dtype
+    return (ratio * img1 + (1.0 - ratio) * img2).clamp(0, bound).to(img1.dtype)
+
+
+def _rgb2hsv(img: Tensor) -> Tensor:
+    """Convert an RGB tensor (channels on dim -3) to HSV, stacked as (h, s, v) on dim -3 with v = max channel."""
+    r, g, b = img.unbind(dim=-3)
+    # Implementation is based on https://github.com/python-pillow/Pillow/blob/4174d4267616897df3746d315d5a2d0f82c656ee/
+    # src/libImaging/Convert.c#L330
+    maxc = torch.max(img, dim=-3).values
+    minc = torch.min(img, dim=-3).values
+
+    # The algorithm erases S and H channel where `maxc = minc`. This avoids NaN
+    # from happening in the results, because
+    #   + S channel has division by `maxc`, which is zero only if `maxc = minc`
+    #   + H channel has division by `(maxc - minc)`.
+    #
+    # Instead of overwriting NaN afterwards, we just prevent it from occurring, so
+    # we don't need to deal with it in case we save the NaN in a buffer in
+    # backprop, if it is ever supported, but it doesn't hurt to do so.
+    eqc = maxc == minc
+
+    cr = maxc - minc
+    # Since `eqc => cr = 0`, replacing denominator with 1 when `eqc` is fine.
+    ones = torch.ones_like(maxc)
+    s = cr / torch.where(eqc, ones, maxc)
+    # Note that `eqc => maxc = minc = r = g = b`. So the following calculation
+    # of `h` would reduce to `bc - gc + 2 + rc - bc + 4 + rc - bc = 6` so it
+    # would not matter what values `rc`, `gc`, and `bc` have here, and thus
+    # replacing denominator with 1 when `eqc` is fine.
+    cr_divisor = torch.where(eqc, ones, cr)
+    rc = (maxc - r) / cr_divisor
+    gc = (maxc - g) / cr_divisor
+    bc = (maxc - b) / cr_divisor
+    # Exactly one of the three masks is set per pixel: r is the max, else g is the max, else b.
+    hr = (maxc == r) * (bc - gc)
+    hg = ((maxc == g) & (maxc != r)) * (2.0 + rc - bc)
+    hb = ((maxc != g) & (maxc != r)) * (4.0 + gc - rc)
+    h = hr + hg + hb
+    h = torch.fmod((h / 6.0 + 1.0), 1.0)
+    return torch.stack((h, s, maxc), dim=-3)
+
+
+def _hsv2rgb(img: Tensor) -> Tensor:
+    """Convert an HSV tensor (h, s, v on dim -3) back to RGB on dim -3."""
+    h, s, v = img.unbind(dim=-3)
+    i = torch.floor(h * 6.0)
+    f = (h * 6.0) - i
+    i = i.to(dtype=torch.int32)
+    p = torch.clamp((v * (1.0 - s)), 0.0, 1.0)
+    q = torch.clamp((v * (1.0 - s * f)), 0.0, 1.0)
+    t = torch.clamp((v * (1.0 - s * (1.0 - f))), 0.0, 1.0)
+    i = i % 6
+    # One-hot mask over the six hue sectors (sector index i in 0..5).
+    mask = i.unsqueeze(dim=-3) == torch.arange(6, device=i.device).view(-1, 1, 1)
+    # a1/a2/a3 hold the candidate R/G/B values for each of the six sectors.
+    a1 = torch.stack((v, q, p, p, t, v), dim=-3)
+    a2 = torch.stack((t, v, v, q, p, p), dim=-3)
+    a3 = torch.stack((p, p, t, v, v, q), dim=-3)
+    a4 = torch.stack((a1, a2, a3), dim=-4)
+    # The einsum sums over the sector axis, so the mask selects one candidate per pixel and channel.
+    return torch.einsum("...ijk, ...xijk -> ...xjk", mask.to(dtype=img.dtype), a4)
+
+
+def _pad_symmetric(img: Tensor, padding: list[int]) -> Tensor:
+    """Symmetric (edge-mirroring) padding of a 3-D or 4-D image tensor via index gathering.
+
+    ``padding`` is left, right, top, bottom; negative entries crop first. Other ranks raise ``RuntimeError``."""
+    # crop if needed
+    if padding[0] < 0 or padding[1] < 0 or padding[2] < 0 or padding[3] < 0:
+        neg_min_padding = [-min(x, 0) for x in padding]
+        crop_left, crop_right, crop_top, crop_bottom = neg_min_padding
+        img = img[..., crop_top : img.shape[-2] - crop_bottom, crop_left : img.shape[-1] - crop_right]
+        padding = [max(x, 0) for x in padding]
+
+    in_sizes = img.size()
+    _x_indices = [i for i in range(in_sizes[-1])]  # [0, 1, 2, 3, ...]
+    left_indices = [i for i in range(padding[0] - 1, -1, -1)]  # e.g. [3, 2, 1, 0]
+    right_indices = [-(i + 1) for i in range(padding[1])]  # e.g. [-1, -2, -3]
+    x_indices = torch.tensor(left_indices + _x_indices + right_indices, device=img.device)
+
+    _y_indices = [i for i in range(in_sizes[-2])]
+    top_indices = [i for i in range(padding[2] - 1, -1, -1)]
+    bottom_indices = [-(i + 1) for i in range(padding[3])]
+    y_indices = torch.tensor(top_indices + _y_indices + bottom_indices, device=img.device)
+
+    ndim = img.ndim
+    if ndim == 3:
+        return img[:, y_indices[:, None], x_indices[None, :]]
+    elif ndim == 4:
+        return img[:, :, y_indices[:, None], x_indices[None, :]]
+    else:
+        raise RuntimeError("Symmetric padding of N-D tensors are not supported yet")
+
+
+def _parse_pad_padding(padding: Union[int, list[int]]) -> list[int]:
+    """Expand ``padding`` (an int, or a list read as [all], [l/r, t/b] or [l, t, r, b]) to [left, right, top, bottom]."""
+    if isinstance(padding, int):
+        if torch.jit.is_scripting():
+            # This maybe unreachable
+            raise ValueError("padding can't be an int while torchscripting, set it as a list [value, ]")
+        pad_left = pad_right = pad_top = pad_bottom = padding
+    elif len(padding) == 1:
+        pad_left = pad_right = pad_top = pad_bottom = padding[0]
+    elif len(padding) == 2:
+        pad_left = pad_right = padding[0]
+        pad_top = pad_bottom = padding[1]
+    else:
+        pad_left = padding[0]
+        pad_top = padding[1]
+        pad_right = padding[2]
+        pad_bottom = padding[3]
+    return [pad_left, pad_right, pad_top, pad_bottom]  # note the reordering relative to the 4-element input
+
+
+def pad(
+    img: Tensor, padding: Union[int, list[int]], fill: Optional[Union[int, float]] = 0, padding_mode: str = "constant"
+) -> Tensor:
+    """Pad an image tensor on its last two dims; ``padding`` is an int or a sequence of 1, 2 or 4 ints.
+
+    "constant" pads with ``fill``, "edge" maps to torch's "replicate", "reflect" is passed through and
+    "symmetric" is handled by ``_pad_symmetric``. TypeError/ValueError are raised for malformed arguments."""
+    _assert_image_tensor(img)
+    if fill is None:
+        fill = 0
+    if not isinstance(padding, (int, tuple, list)):
+        raise TypeError("Got inappropriate padding arg")
+    if not isinstance(fill, (int, float)):
+        raise TypeError("Got inappropriate fill arg")
+    if not isinstance(padding_mode, str):
+        raise TypeError("Got inappropriate padding_mode arg")
+    if isinstance(padding, tuple):
+        padding = list(padding)
+    if isinstance(padding, list):
+        # TODO: Jit is failing on loading this op when scripted and saved
+        # https://github.com/pytorch/pytorch/issues/81100
+        if len(padding) not in [1, 2, 4]:
+            raise ValueError(
+                f"Padding must be an int or a 1, 2, or 4 element tuple, not a {len(padding)} element tuple"
+            )
+
+    if padding_mode not in ["constant", "edge", "reflect", "symmetric"]:
+        raise ValueError("Padding mode should be either constant, edge, reflect or symmetric")
+
+    p = _parse_pad_padding(padding)
+
+    if padding_mode == "edge":
+        # remap padding_mode str
+        padding_mode = "replicate"
+    elif padding_mode == "symmetric":
+        # route to another implementation
+        return _pad_symmetric(img, p)
+    # Add a leading dim for inputs with fewer than 4 dims; it is removed again after padding.
+    need_squeeze = False
+    if img.ndim < 4:
+        img = img.unsqueeze(dim=0)
+        need_squeeze = True
+
+    out_dtype = img.dtype
+    need_cast = False
+    if (padding_mode != "constant") and img.dtype not in (torch.float32, torch.float64):
+        # Here we temporarily cast input tensor to float
+        # until pytorch issue is resolved :
+        # https://github.com/pytorch/pytorch/issues/40763
+        need_cast = True
+        img = img.to(torch.float32)
+
+    if padding_mode in ("reflect", "replicate"):
+        img = torch_pad(img, p, mode=padding_mode)
+    else:
+        img = torch_pad(img, p, mode=padding_mode, value=float(fill))
+
+    if need_squeeze:
+        img = img.squeeze(dim=0)
+
+    if need_cast:
+        img = img.to(out_dtype)
+
+    return img
+
+
+def resize(
+    img: Tensor,
+    size: list[int],
+    interpolation: str = "bilinear",
+    antialias: Optional[bool] = True,
+) -> Tensor:
+    """Resize an image tensor to ``size`` by calling ``interpolate`` with mode ``interpolation``.
+
+    ``antialias`` is treated as ``False`` when it is ``None`` or when ``interpolation`` is neither "bilinear"
+    nor "bicubic". Inputs that are not float32/float64 are processed in float32 and cast back afterwards.
+    """
+    _assert_image_tensor(img)
+    if isinstance(size, tuple):
+        size = list(size)
+
+    if antialias is None:
+        antialias = False
+
+    if antialias and interpolation not in ["bilinear", "bicubic"]:
+        # We manually set it to False to avoid an error downstream in interpolate()
+        # This behaviour is documented: the parameter is irrelevant for modes
+        # that are not bilinear or bicubic. We used to raise an error here, but
+        # now we don't as True is the default.
+        antialias = False
+
+    img, need_cast, need_squeeze, out_dtype = _cast_squeeze_in(img, [torch.float32, torch.float64])
+
+    # Define align_corners to avoid warnings
+    align_corners = False if interpolation in ["bilinear", "bicubic"] else None
+    img = interpolate(img, size=size, mode=interpolation, align_corners=align_corners, antialias=antialias)
+    if interpolation == "bicubic" and out_dtype == torch.uint8:
+        img = img.clamp(min=0, max=255)
+    img = _cast_squeeze_out(img, need_cast=need_cast, need_squeeze=need_squeeze, out_dtype=out_dtype)
+    return img
+
+
+def _assert_grid_transform_inputs(
+    img: Tensor,
+    matrix: Optional[list[float]],
+    interpolation: str,
+    fill: Optional[Union[int, float, list[float]]],
+    supported_interpolation_modes: list[str],
+    coeffs: Optional[list[float]] = None,
+) -> None:
+    """Validate the arguments shared by ``affine``, ``rotate`` and ``perspective``.
+
+    Raises ``TypeError`` or ``ValueError`` for a bad ``img``, ``matrix`` (6 values), ``coeffs`` (8 values),
+    ``fill`` length or ``interpolation``; an unexpected ``fill`` type only produces a warning.
+    """
+    if not (isinstance(img, torch.Tensor)):
+        raise TypeError("Input img should be Tensor")
+    _assert_image_tensor(img)
+    if matrix is not None and not isinstance(matrix, list):
+        raise TypeError("Argument matrix should be a list")
+    if matrix is not None and len(matrix) != 6:
+        raise ValueError("Argument matrix should have 6 float values")
+    if coeffs is not None and len(coeffs) != 8:
+        raise ValueError("Argument coeffs should have 8 float values")
+
+    if fill is not None and not isinstance(fill, (int, float, tuple, list)):
+        warnings.warn("Argument fill should be either int, float, tuple or list")
+
+    # Check fill
+    num_channels = get_dimensions(img)[0]
+    if fill is not None and isinstance(fill, (tuple, list)) and len(fill) > 1 and len(fill) != num_channels:
+        msg = (
+            "The number of elements in 'fill' cannot broadcast to match the number of "
+            "channels of the image ({} != {})"
+        )
+        raise ValueError(msg.format(len(fill), num_channels))
+
+    if interpolation not in supported_interpolation_modes:
+        raise ValueError(f"Interpolation mode '{interpolation}' is unsupported with Tensor input")
+
+
+def _cast_squeeze_in(img: Tensor, req_dtypes: list[torch.dtype]) -> tuple[Tensor, bool, bool, torch.dtype]:
+    """Add a leading dim to inputs with fewer than 4 dims and cast to ``req_dtypes[0]`` if needed.
+
+    Returns ``(img, need_cast, need_squeeze, out_dtype)`` so ``_cast_squeeze_out`` can undo both steps.
+    """
+    out_dtype = img.dtype
+    need_squeeze = img.ndim < 4
+    if need_squeeze:
+        img = img.unsqueeze(dim=0)  # make image NCHW
+
+    need_cast = out_dtype not in req_dtypes
+    if need_cast:
+        img = img.to(req_dtypes[0])
+    return img, need_cast, need_squeeze, out_dtype
+
+
+def _cast_squeeze_out(img: Tensor, need_cast: bool, need_squeeze: bool, out_dtype: torch.dtype) -> Tensor:
+    """Undo ``_cast_squeeze_in``: drop the leading dim and cast back to ``out_dtype`` as requested."""
+    if need_squeeze:
+        img = img.squeeze(dim=0)
+    if not need_cast:
+        return img
+
+    if out_dtype in (torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64):
+        # integer targets are rounded first instead of being cast straight from float
+        img = torch.round(img)
+    return img.to(out_dtype)
+
+
+def _apply_grid_transform(
+    img: Tensor, grid: Tensor, mode: str, fill: Optional[Union[int, float, list[float]]]
+) -> Tensor:
+    """Sample ``img`` at the locations in ``grid`` via ``grid_sample``, optionally filling with ``fill``.
+
+    When ``fill`` is given, an all-ones mask channel is sampled with the image and decides where ``fill`` goes.
+    """
+    img, need_cast, need_squeeze, out_dtype = _cast_squeeze_in(img, [grid.dtype])
+    if img.shape[0] > 1:
+        # Apply same grid to a batch of images
+        grid = grid.expand(img.shape[0], grid.shape[1], grid.shape[2], grid.shape[3])
+
+    # Append a dummy mask for customized fill colors, should be faster than grid_sample() twice
+    if fill is not None:
+        mask = torch.ones((img.shape[0], 1, img.shape[2], img.shape[3]), dtype=img.dtype, device=img.device)
+        img = torch.cat((img, mask), dim=1)
+
+    img = grid_sample(img, grid, mode=mode, padding_mode="zeros", align_corners=False)
+    # Fill with required color
+    if fill is not None:
+        mask = img[:, -1:, :, :]  # N * 1 * H * W
+        img = img[:, :-1, :, :]  # N * C * H * W
+        mask = mask.expand_as(img)
+        fill_list, len_fill = (fill, len(fill)) if isinstance(fill, (tuple, list)) else ([float(fill)], 1)
+        fill_img = torch.tensor(fill_list, dtype=img.dtype, device=img.device).view(1, len_fill, 1, 1).expand_as(img)
+        if mode == "nearest":
+            mask = mask < 0.5
+            img[mask] = fill_img[mask]
+        else:  # 'bilinear'
+            img = img * mask + (1.0 - mask) * fill_img
+    img = _cast_squeeze_out(img, need_cast, need_squeeze, out_dtype)
+    return img
+
+
+def _gen_affine_grid(
+    theta: Tensor,
+    w: int,
+    h: int,
+    ow: int,
+    oh: int,
+) -> Tensor:
+    """Build a ``(1, oh, ow, 2)`` sampling grid by applying ``theta`` to centred output pixel coordinates."""
+    # https://github.com/pytorch/pytorch/blob/74b65c32be68b15dc7c9e8bb62459efbfbde33d8/aten/src/ATen/native/
+    # AffineGridGenerator.cpp#L18
+    # Difference with AffineGridGenerator is that:
+    # 1) we normalize grid values after applying theta
+    # 2) we can normalize by other image size, such that it covers "extend" option like in PIL.Image.rotate
+    d = 0.5
+    base_grid = torch.empty(1, oh, ow, 3, dtype=theta.dtype, device=theta.device)
+    x_grid = torch.linspace(-ow * 0.5 + d, ow * 0.5 + d - 1, steps=ow, device=theta.device)
+    base_grid[..., 0].copy_(x_grid)
+    y_grid = torch.linspace(-oh * 0.5 + d, oh * 0.5 + d - 1, steps=oh, device=theta.device).unsqueeze_(-1)
+    base_grid[..., 1].copy_(y_grid)
+    base_grid[..., 2].fill_(1)  # constant 1 so the bmm below also applies the third column of theta
+
+    rescaled_theta = theta.transpose(1, 2) / torch.tensor([0.5 * w, 0.5 * h], dtype=theta.dtype, device=theta.device)
+    output_grid = base_grid.view(1, oh * ow, 3).bmm(rescaled_theta)
+    return output_grid.view(1, oh, ow, 2)
+
+
+def affine(
+    img: Tensor,
+    matrix: list[float],
+    interpolation: str = "nearest",
+    fill: Optional[Union[int, float, list[float]]] = None,
+) -> Tensor:
+    """Warp ``img`` with the 6-value affine ``matrix``; the sampling grid has the input's height and width."""
+    _assert_grid_transform_inputs(img, matrix, interpolation, fill, ["nearest", "bilinear"])
+    dtype = img.dtype if torch.is_floating_point(img) else torch.float32
+    theta = torch.tensor(matrix, dtype=dtype, device=img.device).reshape(1, 2, 3)
+    shape = img.shape
+    # grid will be generated on the same device as theta and img
+    grid = _gen_affine_grid(theta, w=shape[-1], h=shape[-2], ow=shape[-1], oh=shape[-2])
+    return _apply_grid_transform(img, grid, interpolation, fill=fill)
+
+
+def _compute_affine_output_size(matrix: list[float], w: int, h: int) -> tuple[int, int]:
+    """Return the ``(width, height)`` of the box bounding the image corners after applying ``matrix``."""
+    # Inspired by the PIL implementation:
+    # https://github.com/python-pillow/Pillow/blob/11de3318867e4398057373ee9f12dcb33db7335c/src/PIL/Image.py#L2054
+
+    # pts are the four image corners, listed as (-w/2, -h/2), (-w/2, h/2), (w/2, h/2), (w/2, -h/2).
+    # Points are shifted due to affine matrix torch convention about
+    # the center point. Center is (0, 0) for image center pivot point (w * 0.5, h * 0.5)
+    pts = torch.tensor(
+        [
+            [-0.5 * w, -0.5 * h, 1.0],
+            [-0.5 * w, 0.5 * h, 1.0],
+            [0.5 * w, 0.5 * h, 1.0],
+            [0.5 * w, -0.5 * h, 1.0],
+        ]
+    )
+    theta = torch.tensor(matrix, dtype=torch.float).view(2, 3)
+    new_pts = torch.matmul(pts, theta.T)
+    min_vals, _ = new_pts.min(dim=0)
+    max_vals, _ = new_pts.max(dim=0)
+
+    # shift points to [0, w] and [0, h] interval to match PIL results
+    min_vals += torch.tensor((w * 0.5, h * 0.5))
+    max_vals += torch.tensor((w * 0.5, h * 0.5))
+
+    # Truncate precision to 1e-4 to avoid ceil of Xe-15 to 1.0
+    tol = 1e-4
+    cmax = torch.ceil((max_vals / tol).trunc_() * tol)
+    cmin = torch.floor((min_vals / tol).trunc_() * tol)
+    size = cmax - cmin
+    return int(size[0]), int(size[1])  # w, h
+
+
+def rotate(
+    img: Tensor,
+    matrix: list[float],
+    interpolation: str = "nearest",
+    expand: bool = False,
+    fill: Optional[Union[int, float, list[float]]] = None,
+) -> Tensor:
+    """Warp ``img`` with the 6-value ``matrix``; ``expand`` sizes the output via ``_compute_affine_output_size``."""
+    _assert_grid_transform_inputs(img, matrix, interpolation, fill, ["nearest", "bilinear"])
+    w, h = img.shape[-1], img.shape[-2]
+    ow, oh = _compute_affine_output_size(matrix, w, h) if expand else (w, h)
+    dtype = img.dtype if torch.is_floating_point(img) else torch.float32
+    theta = torch.tensor(matrix, dtype=dtype, device=img.device).reshape(1, 2, 3)
+    # grid will be generated on the same device as theta and img
+    grid = _gen_affine_grid(theta, w=w, h=h, ow=ow, oh=oh)
+    return _apply_grid_transform(img, grid, interpolation, fill=fill)
+
+
+def _perspective_grid(coeffs: list[float], ow: int, oh: int, dtype: torch.dtype, device: torch.device) -> Tensor:
+    """Build the ``(1, oh, ow, 2)`` sampling grid for the 8 perspective coefficients in ``coeffs``."""
+    # https://github.com/python-pillow/Pillow/blob/4634eafe3c695a014267eefdce830b4a825beed7/
+    # src/libImaging/Geometry.c#L394
+    #
+    # x_out = (coeffs[0] * x + coeffs[1] * y + coeffs[2]) / (coeffs[6] * x + coeffs[7] * y + 1)
+    # y_out = (coeffs[3] * x + coeffs[4] * y + coeffs[5]) / (coeffs[6] * x + coeffs[7] * y + 1)
+    #
+    theta1 = torch.tensor(
+        [[[coeffs[0], coeffs[1], coeffs[2]], [coeffs[3], coeffs[4], coeffs[5]]]], dtype=dtype, device=device
+    )
+    theta2 = torch.tensor([[[coeffs[6], coeffs[7], 1.0], [coeffs[6], coeffs[7], 1.0]]], dtype=dtype, device=device)
+
+    d = 0.5
+    base_grid = torch.empty(1, oh, ow, 3, dtype=dtype, device=device)
+    x_grid = torch.linspace(d, ow * 1.0 + d - 1.0, steps=ow, device=device)
+    base_grid[..., 0].copy_(x_grid)
+    y_grid = torch.linspace(d, oh * 1.0 + d - 1.0, steps=oh, device=device).unsqueeze_(-1)
+    base_grid[..., 1].copy_(y_grid)
+    base_grid[..., 2].fill_(1)
+
+    rescaled_theta1 = theta1.transpose(1, 2) / torch.tensor([0.5 * ow, 0.5 * oh], dtype=dtype, device=device)
+    output_grid1 = base_grid.view(1, oh * ow, 3).bmm(rescaled_theta1)
+    output_grid2 = base_grid.view(1, oh * ow, 3).bmm(theta2.transpose(1, 2))
+
+    output_grid = output_grid1 / output_grid2 - 1.0
+    return output_grid.view(1, oh, ow, 2)
+
+
+def perspective(
+    img: Tensor,
+    perspective_coeffs: list[float],
+    interpolation: str = "bilinear",
+    fill: Optional[Union[int, float, list[float]]] = None,
+) -> Tensor:
+    """Warp ``img`` with the 8 ``perspective_coeffs``; the sampling grid has the input's height and width."""
+    if not (isinstance(img, torch.Tensor)):
+        raise TypeError("Input img should be Tensor.")
+    _assert_image_tensor(img)
+
+    _assert_grid_transform_inputs(
+        img,
+        matrix=None,
+        interpolation=interpolation,
+        fill=fill,
+        supported_interpolation_modes=["nearest", "bilinear"],
+        coeffs=perspective_coeffs,
+    )
+
+    ow, oh = img.shape[-1], img.shape[-2]
+    dtype = img.dtype if torch.is_floating_point(img) else torch.float32
+    grid = _perspective_grid(perspective_coeffs, ow=ow, oh=oh, dtype=dtype, device=img.device)
+    return _apply_grid_transform(img, grid, interpolation, fill=fill)
+
+
+def _get_gaussian_kernel1d(kernel_size: int, sigma: float, dtype: torch.dtype, device: torch.device) -> Tensor:
+    """Return a 1D Gaussian kernel with ``kernel_size`` taps, centred on zero and scaled to sum to 1."""
+    half_width = (kernel_size - 1) * 0.5
+    offsets = torch.linspace(-half_width, half_width, steps=kernel_size, dtype=dtype, device=device)
+
+    # unnormalized weights exp(-x^2 / (2 * sigma^2))
+    weights = torch.exp(-0.5 * (offsets / sigma).pow(2))
+    return weights / weights.sum()
+
+
+def _get_gaussian_kernel2d(
+    kernel_size: list[int], sigma: list[float], dtype: torch.dtype, device: torch.device
+) -> Tensor:
+    """Return the 2D kernel formed as the outer product of the y (rows) and x (columns) 1D kernels."""
+    row_weights = _get_gaussian_kernel1d(kernel_size[1], sigma[1], dtype, device)[:, None]
+    col_weights = _get_gaussian_kernel1d(kernel_size[0], sigma[0], dtype, device)[None, :]
+    return torch.mm(row_weights, col_weights)
+
+
+def gaussian_blur(img: Tensor, kernel_size: list[int], sigma: list[float]) -> Tensor:
+    """Blur ``img`` with the 2D Gaussian kernel built from ``kernel_size`` and ``sigma``.
+
+    The image is reflect-padded by half the kernel size and every channel is filtered with the same kernel.
+    """
+    if not (isinstance(img, torch.Tensor)):
+        raise TypeError(f"img should be Tensor. Got {type(img)}")
+    _assert_image_tensor(img)
+    dtype = img.dtype if torch.is_floating_point(img) else torch.float32
+    kernel = _get_gaussian_kernel2d(kernel_size, sigma, dtype=dtype, device=img.device)
+    kernel = kernel.expand(img.shape[-3], 1, kernel.shape[0], kernel.shape[1])
+
+    img, need_cast, need_squeeze, out_dtype = _cast_squeeze_in(img, [kernel.dtype])
+    # padding = (left, right, top, bottom)
+    padding = [kernel_size[0] // 2, kernel_size[0] // 2, kernel_size[1] // 2, kernel_size[1] // 2]
+    img = torch_pad(img, padding, mode="reflect")
+    img = conv2d(img, kernel, groups=img.shape[-3])
+    img = _cast_squeeze_out(img, need_cast, need_squeeze, out_dtype)
+    return img
+
+
+def invert(img: Tensor) -> Tensor:
+    """Return ``_max_value(img.dtype) - img`` once the tensor, rank (>= 3 dims) and channel checks pass."""
+    _assert_image_tensor(img)
+
+    if img.ndim < 3:
+        raise TypeError(f"Input image tensor should have at least 3 dimensions, but found {img.ndim}")
+
+    _assert_channels(img, [1, 3])
+
+    return _max_value(img.dtype) - img
+
+
+def posterize(img: Tensor, bits: int) -> Tensor:
+    """Keep the ``bits`` highest bits of every value of a uint8 image tensor and zero the remaining low bits."""
+    _assert_image_tensor(img)
+
+    if img.ndim < 3:
+        raise TypeError(f"Input image tensor should have at least 3 dimensions, but found {img.ndim}")
+    if img.dtype != torch.uint8:
+        raise TypeError(f"Only torch.uint8 image tensors are supported, but found {img.dtype}")
+
+    _assert_channels(img, [1, 3])
+    mask = -int(2 ** (8 - bits))  # JIT-friendly for: ~(2 ** (8 - bits) - 1)
+    return img & mask
+
+
+def solarize(img: Tensor, threshold: float) -> Tensor:
+    """Invert the values of ``img`` that are greater than or equal to ``threshold``; other values are kept."""
+    _assert_image_tensor(img)
+
+    if img.ndim < 3:
+        raise TypeError(f"Input image tensor should have at least 3 dimensions, but found {img.ndim}")
+
+    _assert_channels(img, [1, 3])
+
+    if threshold > _max_value(img.dtype):
+        raise TypeError("Threshold should be less than bound of img.")
+
+    inverted_img = invert(img)
+    return torch.where(img >= threshold, inverted_img, img)
+
+
+def _blurred_degenerate_image(img: Tensor) -> Tensor:
+    """Return a copy of ``img`` whose interior is filtered with a normalized 3x3 kernel; the border is kept."""
+    dtype = img.dtype if torch.is_floating_point(img) else torch.float32
+    kernel = torch.ones((3, 3), dtype=dtype, device=img.device)
+    kernel[1, 1] = 5.0
+    kernel /= kernel.sum()
+    kernel = kernel.expand(img.shape[-3], 1, kernel.shape[0], kernel.shape[1])
+
+    result_tmp, need_cast, need_squeeze, out_dtype = _cast_squeeze_in(img, [kernel.dtype])
+    result_tmp = conv2d(result_tmp, kernel, groups=result_tmp.shape[-3])
+    result_tmp = _cast_squeeze_out(result_tmp, need_cast, need_squeeze, out_dtype)
+
+    result = img.clone()
+    result[..., 1:-1, 1:-1] = result_tmp
+
+    return result
+
+
+def adjust_sharpness(img: Tensor, sharpness_factor: float) -> Tensor:
+    """Blend ``img`` with its blurred copy through ``_blend``; images with a side <= 2 are returned as is."""
+    if sharpness_factor < 0:
+        raise ValueError(f"sharpness_factor ({sharpness_factor}) is not non-negative.")
+    _assert_image_tensor(img)
+
+    _assert_channels(img, [1, 3])
+
+    if img.size(-1) <= 2 or img.size(-2) <= 2:
+        return img
+
+    return _blend(img, _blurred_degenerate_image(img), sharpness_factor)
+
+
+def autocontrast(img: Tensor) -> Tensor:
+    """Rescale each channel plane so its minimum maps to 0 and its maximum to ``_max_value(img.dtype)``."""
+    _assert_image_tensor(img)
+
+    if img.ndim < 3:
+        raise TypeError(f"Input image tensor should have at least 3 dimensions, but found {img.ndim}")
+
+    _assert_channels(img, [1, 3])
+
+    bound = _max_value(img.dtype)
+    dtype = img.dtype if torch.is_floating_point(img) else torch.float32
+
+    minimum = img.amin(dim=(-2, -1), keepdim=True).to(dtype)
+    maximum = img.amax(dim=(-2, -1), keepdim=True).to(dtype)
+    scale = bound / (maximum - minimum)
+    eq_idxs = torch.isfinite(scale).logical_not()  # non-finite scale, e.g. a constant plane (max == min)
+    minimum[eq_idxs] = 0  # with a scale of 1 this leaves such planes unchanged
+    scale[eq_idxs] = 1
+
+    return ((img - minimum) * scale).clamp(0, bound).to(img.dtype)
+
+
+def _scale_channel(img_chan: Tensor) -> Tensor:
+    """Equalize one channel with a lookup table built from its 256-bin histogram (unchanged if the step is 0)."""
+    # TODO: we should expect bincount to always be faster than histc, but this
+    # isn't always the case. Once
+    # https://github.com/pytorch/pytorch/issues/53194 is fixed, remove the if
+    # block and only use bincount.
+    if img_chan.is_cuda:
+        hist = torch.histc(img_chan.to(torch.float32), bins=256, min=0, max=255)
+    else:
+        hist = torch.bincount(img_chan.reshape(-1), minlength=256)
+    nonzero_hist = hist[hist != 0]
+    step = torch.div(nonzero_hist[:-1].sum(), 255, rounding_mode="floor")
+    if step == 0:
+        return img_chan
+
+    lut = torch.div(torch.cumsum(hist, 0) + torch.div(step, 2, rounding_mode="floor"), step, rounding_mode="floor")
+    lut = torch.nn.functional.pad(lut, [1, 0])[:-1].clamp(0, 255)
+
+    return lut[img_chan.to(torch.int64)].to(torch.uint8)
+
+
+def _equalize_single_image(img: Tensor) -> Tensor:
+    return torch.stack([_scale_channel(img[c]) for c in range(img.size(0))])  # equalize each img[c], then restack
+
+
+def equalize(img: Tensor) -> Tensor:
+    """Histogram-equalize a uint8 image tensor (3 dims) or batch of images (4 dims), one channel at a time."""
+    _assert_image_tensor(img)
+
+    if not (3 <= img.ndim <= 4):
+        raise TypeError(f"Input image tensor should have 3 or 4 dimensions, but found {img.ndim}")
+    if img.dtype != torch.uint8:
+        raise TypeError(f"Only torch.uint8 image tensors are supported, but found {img.dtype}")
+
+    _assert_channels(img, [1, 3])
+
+    if img.ndim == 3:
+        return _equalize_single_image(img)
+
+    return torch.stack([_equalize_single_image(x) for x in img])
+
+
+def normalize(tensor: Tensor, mean: list[float], std: list[float], inplace: bool = False) -> Tensor:
+    """Return ``(tensor - mean) / std`` for a float image tensor; works on a clone unless ``inplace`` is true."""
+    _assert_image_tensor(tensor)
+    if not tensor.is_floating_point():
+        raise TypeError(f"Input tensor should be a float tensor. Got {tensor.dtype}.")
+
+    if tensor.ndim < 3:
+        raise ValueError(
+            f"Expected tensor to be a tensor image of size (..., C, H, W). Got tensor.size() = {tensor.size()}"
+        )
+
+    if not inplace:
+        tensor = tensor.clone()
+
+    dtype = tensor.dtype
+    mean = torch.as_tensor(mean, dtype=dtype, device=tensor.device)
+    std = torch.as_tensor(std, dtype=dtype, device=tensor.device)
+    if (std == 0).any():
+        raise ValueError(f"std evaluated to zero after conversion to {dtype}, leading to division by zero.")
+    if mean.ndim == 1:
+        mean = mean.view(-1, 1, 1)
+    if std.ndim == 1:
+        std = std.view(-1, 1, 1)
+    return tensor.sub_(mean).div_(std)
+
+
+def erase(img: Tensor, i: int, j: int, h: int, w: int, v: Tensor, inplace: bool = False) -> Tensor:
+    """Write ``v`` into rows ``i:i+h`` and columns ``j:j+w`` of ``img``; a clone is used unless ``inplace``."""
+    _assert_image_tensor(img)
+    target = img if inplace else img.clone()
+
+    # the window is taken over the two trailing dimensions
+    target[..., i : i + h, j : j + w] = v
+    return target
+
+
+def _create_identity_grid(size: list[int]) -> Tensor:
+    """Return a ``1 x H x W x 2`` grid of (x, y) coordinates running from ``(1 - n) / n`` to ``(n - 1) / n``."""
+    rows, cols = torch.meshgrid([torch.linspace((1 - n) / n, (n - 1) / n, n) for n in size], indexing="ij")
+    return torch.stack([cols, rows], dim=-1).unsqueeze(0)  # 1 x H x W x 2
+
+
+def elastic_transform(
+    img: Tensor,
+    displacement: Tensor,
+    interpolation: str = "bilinear",
+    fill: Optional[Union[int, float, list[float]]] = None,
+) -> Tensor:
+    """Sample ``img`` on the identity grid offset by ``displacement`` (moved to the device of ``img`` first)."""
+    if not (isinstance(img, torch.Tensor)):
+        raise TypeError(f"img should be Tensor. Got {type(img)}")
+
+    size = list(img.shape[-2:])
+    displacement = displacement.to(img.device)
+
+    identity_grid = _create_identity_grid(size)
+    grid = identity_grid.to(img.device) + displacement
+    return _apply_grid_transform(img, grid, interpolation, fill)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/_presets.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/_presets.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7eba6721c789625da9b5a4e8ed7372c6efbcd4d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/_presets.py
@@ -0,0 +1,217 @@
+"""
+This file is part of the private API. Please do not use these classes directly, as they will be modified in
+future versions without warning. The classes should be accessed only via the transforms argument of Weights.
+"""
+
+from typing import Optional, Union
+
+import torch
+from torch import nn, Tensor
+
+from . import functional as F, InterpolationMode
+
+
+__all__ = [  # the preset classes defined in this module
+    "ObjectDetection",
+    "ImageClassification",
+    "VideoClassification",
+    "SemanticSegmentation",
+    "OpticalFlow",
+]
+
+
+class ObjectDetection(nn.Module):
+    """Preset that converts a PIL image or an image tensor to ``torch.float`` with ``F.convert_image_dtype``."""
+
+    def forward(self, img: Tensor) -> Tensor:
+        if not isinstance(img, Tensor):
+            img = F.pil_to_tensor(img)
+        return F.convert_image_dtype(img, torch.float)
+
+    def __repr__(self) -> str:
+        return f"{type(self).__name__}()"
+
+    def describe(self) -> str:
+        accepted = "Accepts ``PIL.Image``, batched ``(B, C, H, W)`` and single ``(C, H, W)`` image ``torch.Tensor`` objects. "
+        return accepted + "The images are rescaled to ``[0.0, 1.0]``."
+
+
+class ImageClassification(nn.Module):
+    """Preset pipeline: resize, center crop, conversion to ``torch.float`` and normalization.
+
+    ``crop_size`` and ``resize_size`` are wrapped in one-element lists; ``mean`` and ``std`` are stored as lists.
+    """
+
+    def __init__(
+        self,
+        *,
+        crop_size: int,
+        resize_size: int = 256,
+        mean: tuple[float, ...] = (0.485, 0.456, 0.406),
+        std: tuple[float, ...] = (0.229, 0.224, 0.225),
+        interpolation: InterpolationMode = InterpolationMode.BILINEAR,
+        antialias: Optional[bool] = True,
+    ) -> None:
+        super().__init__()
+        self.crop_size = [crop_size]
+        self.resize_size = [resize_size]
+        self.mean = list(mean)
+        self.std = list(std)
+        self.interpolation = interpolation
+        self.antialias = antialias
+
+    def forward(self, img: Tensor) -> Tensor:
+        img = F.resize(img, self.resize_size, interpolation=self.interpolation, antialias=self.antialias)
+        img = F.center_crop(img, self.crop_size)
+        if not isinstance(img, Tensor):
+            img = F.pil_to_tensor(img)
+        img = F.convert_image_dtype(img, torch.float)
+        return F.normalize(img, mean=self.mean, std=self.std)
+
+    def __repr__(self) -> str:
+        # NOTE: ``antialias`` is not part of the repr
+        shown = ("crop_size", "resize_size", "mean", "std", "interpolation")
+        lines = "".join(f"\n    {name}={getattr(self, name)}" for name in shown)
+        return f"{self.__class__.__name__}({lines}\n)"
+
+    def describe(self) -> str:
+        return (
+            "Accepts ``PIL.Image``, batched ``(B, C, H, W)`` and single ``(C, H, W)`` image ``torch.Tensor`` objects. "
+            f"The images are resized to ``resize_size={self.resize_size}`` using ``interpolation={self.interpolation}``, "
+            f"followed by a central crop of ``crop_size={self.crop_size}``. Finally the values are first rescaled to "
+            f"``[0.0, 1.0]`` and then normalized using ``mean={self.mean}`` and ``std={self.std}``."
+        )
+
+
+class VideoClassification(nn.Module):
+    """Preset pipeline for video clips: per-frame resize, center crop, float conversion and normalization.
+
+    ``forward`` takes ``(T, C, H, W)`` or ``(B, T, C, H, W)`` tensors and returns them as ``(..., C, T, H, W)``.
+
+    Args:
+        crop_size / resize_size: sizes handed to ``F.center_crop`` / ``F.resize``.
+        mean, std: statistics handed to ``F.normalize``; interpolation: mode handed to ``F.resize``.
+    """
+
+    def __init__(
+        self,
+        *,
+        crop_size: tuple[int, int],
+        resize_size: Union[tuple[int], tuple[int, int]],
+        mean: tuple[float, ...] = (0.43216, 0.394666, 0.37645),
+        std: tuple[float, ...] = (0.22803, 0.22145, 0.216989),
+        interpolation: InterpolationMode = InterpolationMode.BILINEAR,
+    ) -> None:
+        super().__init__()
+        self.crop_size = list(crop_size)
+        self.resize_size = list(resize_size)
+        self.mean = list(mean)
+        self.std = list(std)
+        self.interpolation = interpolation
+
+    def forward(self, vid: Tensor) -> Tensor:
+        need_squeeze = vid.ndim < 5
+        if need_squeeze:
+            vid = vid.unsqueeze(dim=0)  # add the batch dimension
+
+        N, T, C, H, W = vid.shape
+        vid = vid.view(-1, C, H, W)  # fold time into the batch dimension
+        # We hard-code antialias=False to preserve results after we changed
+        # its default from None to True (see
+        # https://github.com/pytorch/vision/pull/7160)
+        # TODO: we could re-train the video models with antialias=True?
+        vid = F.resize(vid, self.resize_size, interpolation=self.interpolation, antialias=False)
+        vid = F.center_crop(vid, self.crop_size)
+        vid = F.convert_image_dtype(vid, torch.float)
+        vid = F.normalize(vid, mean=self.mean, std=self.std)
+        H, W = self.crop_size
+        vid = vid.view(N, T, C, H, W).permute(0, 2, 1, 3, 4)  # (N, T, C, H, W) => (N, C, T, H, W)
+
+        return vid.squeeze(dim=0) if need_squeeze else vid
+
+    def __repr__(self) -> str:
+        shown = ("crop_size", "resize_size", "mean", "std", "interpolation")
+        lines = "".join(f"\n    {name}={getattr(self, name)}" for name in shown)
+        return f"{self.__class__.__name__}({lines}\n)"
+
+    def describe(self) -> str:
+        return (
+            "Accepts batched ``(B, T, C, H, W)`` and single ``(T, C, H, W)`` video frame ``torch.Tensor`` objects. "
+            f"The frames are resized to ``resize_size={self.resize_size}`` using ``interpolation={self.interpolation}``, "
+            f"followed by a central crop of ``crop_size={self.crop_size}``. Finally the values are first rescaled to "
+            f"``[0.0, 1.0]`` and then normalized using ``mean={self.mean}`` and ``std={self.std}``. Finally the output "
+            "dimensions are permuted to ``(..., C, T, H, W)`` tensors."
+        )
+
+
+class SemanticSegmentation(nn.Module):
+    """Preset pipeline: optional resize, conversion to ``torch.float`` and normalization.
+
+    The resize step is skipped when ``resize_size`` is ``None``.
+    """
+
+    def __init__(
+        self,
+        *,
+        resize_size: Optional[int],
+        mean: tuple[float, ...] = (0.485, 0.456, 0.406),
+        std: tuple[float, ...] = (0.229, 0.224, 0.225),
+        interpolation: InterpolationMode = InterpolationMode.BILINEAR,
+        antialias: Optional[bool] = True,
+    ) -> None:
+        super().__init__()
+        self.resize_size = None if resize_size is None else [resize_size]
+        self.mean = list(mean)
+        self.std = list(std)
+        self.interpolation = interpolation
+        self.antialias = antialias
+
+    def forward(self, img: Tensor) -> Tensor:
+        if isinstance(self.resize_size, list):
+            img = F.resize(img, self.resize_size, interpolation=self.interpolation, antialias=self.antialias)
+        if not isinstance(img, Tensor):
+            img = F.pil_to_tensor(img)
+        img = F.convert_image_dtype(img, torch.float)
+        return F.normalize(img, mean=self.mean, std=self.std)
+
+    def __repr__(self) -> str:
+        shown = ("resize_size", "mean", "std", "interpolation")
+        lines = "".join(f"\n    {name}={getattr(self, name)}" for name in shown)
+        return f"{self.__class__.__name__}({lines}\n)"
+
+    def describe(self) -> str:
+        return (
+            "Accepts ``PIL.Image``, batched ``(B, C, H, W)`` and single ``(C, H, W)`` image ``torch.Tensor`` objects. "
+            f"The images are resized to ``resize_size={self.resize_size}`` using ``interpolation={self.interpolation}``. "
+            f"Finally the values are first rescaled to ``[0.0, 1.0]`` and then normalized using ``mean={self.mean}`` and "
+            f"``std={self.std}``."
+        )
+
+
+class OpticalFlow(nn.Module):
+    """Preset that converts an image pair to float tensors normalized with mean 0.5 and std 0.5."""
+
+    def forward(self, img1: Tensor, img2: Tensor) -> tuple[Tensor, Tensor]:
+        if not isinstance(img1, Tensor):
+            img1 = F.pil_to_tensor(img1)
+        if not isinstance(img2, Tensor):
+            img2 = F.pil_to_tensor(img2)
+
+        img1 = F.convert_image_dtype(img1, torch.float)
+        img2 = F.convert_image_dtype(img2, torch.float)
+
+        # map [0, 1] into [-1, 1]
+        half = [0.5, 0.5, 0.5]
+        img1 = F.normalize(img1, mean=half, std=half)
+        img2 = F.normalize(img2, mean=half, std=half)
+
+        return img1.contiguous(), img2.contiguous()
+
+    def __repr__(self) -> str:
+        return f"{type(self).__name__}()"
+
+    def describe(self) -> str:
+        return (
+            "Accepts ``PIL.Image``, batched ``(B, C, H, W)`` and single ``(C, H, W)`` image ``torch.Tensor`` objects. "
+            "The images are rescaled to ``[-1.0, 1.0]``."
+        )
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/_transforms_video.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/_transforms_video.py
new file mode 100644
index 0000000000000000000000000000000000000000..a04da4f74849805641e4c470f6b6b8d5f7000e3a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/_transforms_video.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python3
+
+import numbers
+import random
+import warnings
+
+from torchvision.transforms import RandomCrop, RandomResizedCrop
+
+from . import _functional_video as F
+
+
+__all__ = [  # names exported by ``from <this module> import *``
+    "RandomCropVideo",
+    "RandomResizedCropVideo",
+    "CenterCropVideo",
+    "NormalizeVideo",
+    "ToTensorVideo",
+    "RandomHorizontalFlipVideo",
+]
+
+
+warnings.warn(  # module-level call, so it runs on import; no category is passed, so warnings.warn's default applies
+    "The 'torchvision.transforms._transforms_video' module is deprecated since 0.12 and will be removed in the future. "
+    "Please use the 'torchvision.transforms' module instead."
+)
+
+
+class RandomCropVideo(RandomCrop):  # Video-clip variant of RandomCrop: crop box from the base class, crop via F.crop.
+    def __init__(self, size):  # NOTE: RandomCrop.__init__ is not invoked here; only ``size`` is initialised.
+        if isinstance(size, numbers.Number):
+            self.size = (int(size), int(size))  # a single number is expanded to an (int, int) pair
+        else:
+            self.size = size  # stored unchanged and unvalidated
+
+    def __call__(self, clip):
+        """
+        Args:
+            clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)
+        Returns:
+            torch.tensor: randomly cropped/resized video clip.
+                size is (C, T, OH, OW)
+        """
+        i, j, h, w = self.get_params(clip, self.size)  # get_params is not defined in this class; it resolves on the base
+        return F.crop(clip, i, j, h, w)
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(size={self.size})"
+
+
+class RandomResizedCropVideo(RandomResizedCrop):  # Crop box comes from the base class; crop + resize from F.resized_crop.
+    def __init__(  # NOTE: RandomResizedCrop.__init__ is not invoked; the attributes assigned below are all that is set.
+        self,
+        size,
+        scale=(0.08, 1.0),
+        ratio=(3.0 / 4.0, 4.0 / 3.0),
+        interpolation_mode="bilinear",
+    ):
+        if isinstance(size, tuple):
+            if len(size) != 2:
+                raise ValueError(f"size should be tuple (height, width), instead got {size}")
+            self.size = size
+        else:
+            self.size = (size, size)  # any non-tuple value (a list included) is duplicated into a pair
+
+        self.interpolation_mode = interpolation_mode
+        self.scale = scale
+        self.ratio = ratio
+
+    def __call__(self, clip):
+        """
+        Args:
+            clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)
+        Returns:
+            torch.tensor: randomly cropped/resized video clip.
+                size is (C, T, H, W)
+        """
+        i, j, h, w = self.get_params(clip, self.scale, self.ratio)  # not defined in this class; resolves on the base
+        return F.resized_crop(clip, i, j, h, w, self.size, self.interpolation_mode)
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}, scale={self.scale}, ratio={self.ratio})"
+
+
+class CenterCropVideo:
+    def __init__(self, crop_size):
+        if isinstance(crop_size, numbers.Number):
+            self.crop_size = (int(crop_size), int(crop_size))
+        else:
+            self.crop_size = crop_size
+
+    def __call__(self, clip):
+        """
+        Args:
+            clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)
+        Returns:
+            torch.tensor: central cropping of video clip. Size is
+            (C, T, crop_size, crop_size)
+        """
+        return F.center_crop(clip, self.crop_size)
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(crop_size={self.crop_size})"
+
+
+class NormalizeVideo:
+    """
+    Normalize the video clip by mean subtraction and division by standard deviation
+    Args:
+        mean (3-tuple): pixel RGB mean
+        std (3-tuple): pixel RGB standard deviation
+        inplace (boolean): whether do in-place normalization
+    """
+
+    def __init__(self, mean, std, inplace=False):
+        self.mean = mean
+        self.std = std
+        self.inplace = inplace
+
+    def __call__(self, clip):
+        """
+        Args:
+            clip (torch.tensor): video clip to be normalized. Size is (C, T, H, W)
+        """
+        return F.normalize(clip, self.mean, self.std, self.inplace)
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(mean={self.mean}, std={self.std}, inplace={self.inplace})"
+
+
+class ToTensorVideo:
+    """
+    Convert tensor data type from uint8 to float, divide value by 255.0 and
+    permute the dimensions of clip tensor
+    """
+
+    def __init__(self):
+        pass
+
+    def __call__(self, clip):
+        """
+        Args:
+            clip (torch.tensor, dtype=torch.uint8): Size is (T, H, W, C)
+        Return:
+            clip (torch.tensor, dtype=torch.float): Size is (C, T, H, W)
+        """
+        return F.to_tensor(clip)
+
+    def __repr__(self) -> str:
+        return self.__class__.__name__
+
+
+class RandomHorizontalFlipVideo:
+    """
+    Flip the video clip along the horizontal direction with a given probability
+    Args:
+        p (float): probability of the clip being flipped. Default value is 0.5
+    """
+
+    def __init__(self, p=0.5):
+        self.p = p
+
+    def __call__(self, clip):
+        """
+        Args:
+            clip (torch.tensor): Size is (C, T, H, W)
+        Return:
+            clip (torch.tensor): Size is (C, T, H, W)
+        """
+        if random.random() < self.p:
+            clip = F.hflip(clip)
+        return clip
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(p={self.p})"
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/autoaugment.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/autoaugment.py
new file mode 100644
index 0000000000000000000000000000000000000000..20291d09b9432b99a94f2241d2c2af76f4fde526
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/autoaugment.py
@@ -0,0 +1,615 @@
+import math
+from enum import Enum
+from typing import Optional
+
+import torch
+from torch import Tensor
+
+from . import functional as F, InterpolationMode
+
+__all__ = ["AutoAugmentPolicy", "AutoAugment", "RandAugment", "TrivialAugmentWide", "AugMix"]  # public names of this module
+
+
+def _apply_op(  # Apply the single operation named by ``op_name`` to ``img`` and return the resulting image.
+    img: Tensor, op_name: str, magnitude: float, interpolation: InterpolationMode, fill: Optional[list[float]]
+):
+    if op_name == "ShearX":
+        # magnitude should be arctan(magnitude)
+        # official autoaug: (1, level, 0, 0, 1, 0)
+        # https://github.com/tensorflow/models/blob/dd02069717128186b88afa8d857ce57d17957f03/research/autoaugment/augmentation_transforms.py#L290
+        # compared to
+        # torchvision:      (1, tan(level), 0, 0, 1, 0)
+        # https://github.com/pytorch/vision/blob/0c2373d0bba3499e95776e7936e207d8a1676e65/torchvision/transforms/functional.py#L976
+        img = F.affine(
+            img,
+            angle=0.0,
+            translate=[0, 0],
+            scale=1.0,
+            shear=[math.degrees(math.atan(magnitude)), 0.0],
+            interpolation=interpolation,
+            fill=fill,
+            center=[0, 0],  # an explicit center is passed only by the two shear branches
+        )
+    elif op_name == "ShearY":
+        # magnitude should be arctan(magnitude)
+        # See above
+        img = F.affine(
+            img,
+            angle=0.0,
+            translate=[0, 0],
+            scale=1.0,
+            shear=[0.0, math.degrees(math.atan(magnitude))],
+            interpolation=interpolation,
+            fill=fill,
+            center=[0, 0],
+        )
+    elif op_name == "TranslateX":
+        img = F.affine(
+            img,
+            angle=0.0,
+            translate=[int(magnitude), 0],  # int() truncates the float magnitude toward zero
+            scale=1.0,
+            interpolation=interpolation,
+            shear=[0.0, 0.0],
+            fill=fill,
+        )
+    elif op_name == "TranslateY":
+        img = F.affine(
+            img,
+            angle=0.0,
+            translate=[0, int(magnitude)],  # same truncation as TranslateX
+            scale=1.0,
+            interpolation=interpolation,
+            shear=[0.0, 0.0],
+            fill=fill,
+        )
+    elif op_name == "Rotate":
+        img = F.rotate(img, magnitude, interpolation=interpolation, fill=fill)
+    elif op_name == "Brightness":
+        img = F.adjust_brightness(img, 1.0 + magnitude)  # the four colour ops receive 1.0 + magnitude as their factor
+    elif op_name == "Color":
+        img = F.adjust_saturation(img, 1.0 + magnitude)  # "Color" is implemented with adjust_saturation
+    elif op_name == "Contrast":
+        img = F.adjust_contrast(img, 1.0 + magnitude)
+    elif op_name == "Sharpness":
+        img = F.adjust_sharpness(img, 1.0 + magnitude)
+    elif op_name == "Posterize":
+        img = F.posterize(img, int(magnitude))  # magnitude arrives as a float; posterize is given an int
+    elif op_name == "Solarize":
+        img = F.solarize(img, magnitude)
+    elif op_name == "AutoContrast":
+        img = F.autocontrast(img)  # magnitude, interpolation and fill are unused from this branch on
+    elif op_name == "Equalize":
+        img = F.equalize(img)
+    elif op_name == "Invert":
+        img = F.invert(img)
+    elif op_name == "Identity":
+        pass  # img is returned unchanged
+    else:
+        raise ValueError(f"The provided operator {op_name} is not recognized.")
+    return img
+
+
+class AutoAugmentPolicy(Enum):
+    """AutoAugment policies learned on different datasets.
+    Available policies are IMAGENET, CIFAR10 and SVHN.
+    """
+
+    IMAGENET = "imagenet"  # default policy of AutoAugment.__init__
+    CIFAR10 = "cifar10"
+    SVHN = "svhn"  # each member selects one table in AutoAugment._get_policies
+
+
+# FIXME: Eliminate copy-pasted code for fill standardization and _augmentation_space() by moving stuff on a base class
+class AutoAugment(torch.nn.Module):
+    r"""AutoAugment data augmentation method based on
+    `"AutoAugment: Learning Augmentation Strategies from Data" <https://arxiv.org/pdf/1805.09501.pdf>`_.
+    If the image is torch Tensor, it should be of type torch.uint8, and it is expected
+    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
+    If img is PIL Image, it is expected to be in mode "L" or "RGB".
+
+    Args:
+        policy (AutoAugmentPolicy): Desired policy enum defined by
+            :class:`torchvision.transforms.autoaugment.AutoAugmentPolicy`. Default is ``AutoAugmentPolicy.IMAGENET``.
+        interpolation (InterpolationMode): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
+        fill (sequence or number, optional): Pixel fill value for the area outside the transformed
+            image. If given a number, the value is used for all bands respectively.
+    """
+
+    def __init__(
+        self,
+        policy: AutoAugmentPolicy = AutoAugmentPolicy.IMAGENET,
+        interpolation: InterpolationMode = InterpolationMode.NEAREST,
+        fill: Optional[list[float]] = None,
+    ) -> None:
+        super().__init__()
+        self.policy = policy
+        self.interpolation = interpolation
+        self.fill = fill
+        self.policies = self._get_policies(policy)  # resolved once here; an unrecognized policy raises ValueError
+
+    def _get_policies(  # Each returned pair is one sub-policy: two (op_name, probability, magnitude bin or None) triples.
+        self, policy: AutoAugmentPolicy
+    ) -> list[tuple[tuple[str, float, Optional[int]], tuple[str, float, Optional[int]]]]:
+        if policy == AutoAugmentPolicy.IMAGENET:
+            return [
+                (("Posterize", 0.4, 8), ("Rotate", 0.6, 9)),
+                (("Solarize", 0.6, 5), ("AutoContrast", 0.6, None)),
+                (("Equalize", 0.8, None), ("Equalize", 0.6, None)),
+                (("Posterize", 0.6, 7), ("Posterize", 0.6, 6)),
+                (("Equalize", 0.4, None), ("Solarize", 0.2, 4)),
+                (("Equalize", 0.4, None), ("Rotate", 0.8, 8)),
+                (("Solarize", 0.6, 3), ("Equalize", 0.6, None)),
+                (("Posterize", 0.8, 5), ("Equalize", 1.0, None)),
+                (("Rotate", 0.2, 3), ("Solarize", 0.6, 8)),
+                (("Equalize", 0.6, None), ("Posterize", 0.4, 6)),
+                (("Rotate", 0.8, 8), ("Color", 0.4, 0)),
+                (("Rotate", 0.4, 9), ("Equalize", 0.6, None)),
+                (("Equalize", 0.0, None), ("Equalize", 0.8, None)),
+                (("Invert", 0.6, None), ("Equalize", 1.0, None)),
+                (("Color", 0.6, 4), ("Contrast", 1.0, 8)),
+                (("Rotate", 0.8, 8), ("Color", 1.0, 2)),
+                (("Color", 0.8, 8), ("Solarize", 0.8, 7)),
+                (("Sharpness", 0.4, 7), ("Invert", 0.6, None)),
+                (("ShearX", 0.6, 5), ("Equalize", 1.0, None)),
+                (("Color", 0.4, 0), ("Equalize", 0.6, None)),
+                (("Equalize", 0.4, None), ("Solarize", 0.2, 4)),
+                (("Solarize", 0.6, 5), ("AutoContrast", 0.6, None)),
+                (("Invert", 0.6, None), ("Equalize", 1.0, None)),
+                (("Color", 0.6, 4), ("Contrast", 1.0, 8)),
+                (("Equalize", 0.8, None), ("Equalize", 0.6, None)),
+            ]
+        elif policy == AutoAugmentPolicy.CIFAR10:
+            return [
+                (("Invert", 0.1, None), ("Contrast", 0.2, 6)),
+                (("Rotate", 0.7, 2), ("TranslateX", 0.3, 9)),
+                (("Sharpness", 0.8, 1), ("Sharpness", 0.9, 3)),
+                (("ShearY", 0.5, 8), ("TranslateY", 0.7, 9)),
+                (("AutoContrast", 0.5, None), ("Equalize", 0.9, None)),
+                (("ShearY", 0.2, 7), ("Posterize", 0.3, 7)),
+                (("Color", 0.4, 3), ("Brightness", 0.6, 7)),
+                (("Sharpness", 0.3, 9), ("Brightness", 0.7, 9)),
+                (("Equalize", 0.6, None), ("Equalize", 0.5, None)),
+                (("Contrast", 0.6, 7), ("Sharpness", 0.6, 5)),
+                (("Color", 0.7, 7), ("TranslateX", 0.5, 8)),
+                (("Equalize", 0.3, None), ("AutoContrast", 0.4, None)),
+                (("TranslateY", 0.4, 3), ("Sharpness", 0.2, 6)),
+                (("Brightness", 0.9, 6), ("Color", 0.2, 8)),
+                (("Solarize", 0.5, 2), ("Invert", 0.0, None)),
+                (("Equalize", 0.2, None), ("AutoContrast", 0.6, None)),
+                (("Equalize", 0.2, None), ("Equalize", 0.6, None)),
+                (("Color", 0.9, 9), ("Equalize", 0.6, None)),
+                (("AutoContrast", 0.8, None), ("Solarize", 0.2, 8)),
+                (("Brightness", 0.1, 3), ("Color", 0.7, 0)),
+                (("Solarize", 0.4, 5), ("AutoContrast", 0.9, None)),
+                (("TranslateY", 0.9, 9), ("TranslateY", 0.7, 9)),
+                (("AutoContrast", 0.9, None), ("Solarize", 0.8, 3)),
+                (("Equalize", 0.8, None), ("Invert", 0.1, None)),
+                (("TranslateY", 0.7, 9), ("AutoContrast", 0.9, None)),
+            ]
+        elif policy == AutoAugmentPolicy.SVHN:
+            return [
+                (("ShearX", 0.9, 4), ("Invert", 0.2, None)),
+                (("ShearY", 0.9, 8), ("Invert", 0.7, None)),
+                (("Equalize", 0.6, None), ("Solarize", 0.6, 6)),
+                (("Invert", 0.9, None), ("Equalize", 0.6, None)),
+                (("Equalize", 0.6, None), ("Rotate", 0.9, 3)),
+                (("ShearX", 0.9, 4), ("AutoContrast", 0.8, None)),
+                (("ShearY", 0.9, 8), ("Invert", 0.4, None)),
+                (("ShearY", 0.9, 5), ("Solarize", 0.2, 6)),
+                (("Invert", 0.9, None), ("AutoContrast", 0.8, None)),
+                (("Equalize", 0.6, None), ("Rotate", 0.9, 3)),
+                (("ShearX", 0.9, 4), ("Solarize", 0.3, 3)),
+                (("ShearY", 0.8, 8), ("Invert", 0.7, None)),
+                (("Equalize", 0.9, None), ("TranslateY", 0.6, 6)),
+                (("Invert", 0.9, None), ("Equalize", 0.6, None)),
+                (("Contrast", 0.3, 3), ("Rotate", 0.8, 4)),
+                (("Invert", 0.8, None), ("TranslateY", 0.0, 2)),
+                (("ShearY", 0.7, 6), ("Solarize", 0.4, 8)),
+                (("Invert", 0.6, None), ("Rotate", 0.8, 4)),
+                (("ShearY", 0.3, 7), ("TranslateX", 0.9, 3)),
+                (("ShearX", 0.1, 6), ("Invert", 0.6, None)),
+                (("Solarize", 0.7, 2), ("TranslateY", 0.6, 7)),
+                (("ShearY", 0.8, 4), ("Invert", 0.8, None)),
+                (("ShearX", 0.7, 9), ("TranslateY", 0.8, 3)),
+                (("ShearY", 0.8, 5), ("AutoContrast", 0.7, None)),
+                (("ShearX", 0.7, 2), ("Invert", 0.1, None)),
+            ]
+        else:
+            raise ValueError(f"The provided policy {policy} is not recognized.")
+
+    def _augmentation_space(self, num_bins: int, image_size: tuple[int, int]) -> dict[str, tuple[Tensor, bool]]:
+        return {  # image_size is (height, width), as passed by forward(); the table is rebuilt on every call
+            # op_name: (magnitudes, signed)
+            "ShearX": (torch.linspace(0.0, 0.3, num_bins), True),
+            "ShearY": (torch.linspace(0.0, 0.3, num_bins), True),
+            "TranslateX": (torch.linspace(0.0, 150.0 / 331.0 * image_size[1], num_bins), True),
+            "TranslateY": (torch.linspace(0.0, 150.0 / 331.0 * image_size[0], num_bins), True),
+            "Rotate": (torch.linspace(0.0, 30.0, num_bins), True),
+            "Brightness": (torch.linspace(0.0, 0.9, num_bins), True),
+            "Color": (torch.linspace(0.0, 0.9, num_bins), True),
+            "Contrast": (torch.linspace(0.0, 0.9, num_bins), True),
+            "Sharpness": (torch.linspace(0.0, 0.9, num_bins), True),
+            "Posterize": (8 - (torch.arange(num_bins) / ((num_bins - 1) / 4)).round().int(), False),
+            "Solarize": (torch.linspace(255.0, 0.0, num_bins), False),
+            "AutoContrast": (torch.tensor(0.0), False),
+            "Equalize": (torch.tensor(0.0), False),
+            "Invert": (torch.tensor(0.0), False),
+        }
+
+    @staticmethod
+    def get_params(transform_num: int) -> tuple[int, Tensor, Tensor]:
+        """Get parameters for autoaugment transformation
+
+        Returns:
+            params required by the autoaugment transformation
+        """
+        policy_id = int(torch.randint(transform_num, (1,)).item())  # index of the sub-policy to use
+        probs = torch.rand((2,))  # one uniform draw per op of the sub-policy
+        signs = torch.randint(2, (2,))  # one 0/1 draw per op; forward() negates the magnitude on 0
+
+        return policy_id, probs, signs
+
+    def forward(self, img: Tensor) -> Tensor:
+        """
+            img (PIL Image or Tensor): Image to be transformed.
+
+        Returns:
+            PIL Image or Tensor: AutoAugmented image.
+        """
+        fill = self.fill
+        channels, height, width = F.get_dimensions(img)
+        if isinstance(img, Tensor):  # fill is converted to a list of floats for Tensor input only
+            if isinstance(fill, (int, float)):
+                fill = [float(fill)] * channels
+            elif fill is not None:
+                fill = [float(f) for f in fill]
+
+        transform_id, probs, signs = self.get_params(len(self.policies))
+
+        op_meta = self._augmentation_space(10, (height, width))  # 10 bins, matching the 0..9 indices in the policy tables
+        for i, (op_name, p, magnitude_id) in enumerate(self.policies[transform_id]):
+            if probs[i] <= p:  # each op of the pair is applied independently, gated by its own draw
+                magnitudes, signed = op_meta[op_name]
+                magnitude = float(magnitudes[magnitude_id].item()) if magnitude_id is not None else 0.0
+                if signed and signs[i] == 0:  # only ops marked as signed can have their magnitude negated
+                    magnitude *= -1.0
+                img = _apply_op(img, op_name, magnitude, interpolation=self.interpolation, fill=fill)
+
+        return img
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(policy={self.policy}, fill={self.fill})"  # NOTE: interpolation is not shown
+
+
+class RandAugment(torch.nn.Module):
+    r"""RandAugment data augmentation method based on
+    `"RandAugment: Practical automated data augmentation with a reduced search space"
+    <https://arxiv.org/abs/1909.13719>`_.
+    If the image is torch Tensor, it should be of type torch.uint8, and it is expected
+    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
+    If img is PIL Image, it is expected to be in mode "L" or "RGB".
+
+    Args:
+        num_ops (int): Number of augmentation transformations to apply sequentially.
+        magnitude (int): Magnitude for all the transformations.
+        num_magnitude_bins (int): The number of different magnitude values.
+        interpolation (InterpolationMode): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
+        fill (sequence or number, optional): Pixel fill value for the area outside the transformed
+            image. If given a number, the value is used for all bands respectively.
+    """
+
+    def __init__(
+        self,
+        num_ops: int = 2,
+        magnitude: int = 9,
+        num_magnitude_bins: int = 31,
+        interpolation: InterpolationMode = InterpolationMode.NEAREST,
+        fill: Optional[list[float]] = None,
+    ) -> None:
+        super().__init__()
+        self.num_ops = num_ops
+        self.magnitude = magnitude  # stored without validation against num_magnitude_bins
+        self.num_magnitude_bins = num_magnitude_bins
+        self.interpolation = interpolation
+        self.fill = fill
+
+    def _augmentation_space(self, num_bins: int, image_size: tuple[int, int]) -> dict[str, tuple[Tensor, bool]]:
+        return {  # image_size is (height, width), as passed by forward()
+            # op_name: (magnitudes, signed)
+            "Identity": (torch.tensor(0.0), False),
+            "ShearX": (torch.linspace(0.0, 0.3, num_bins), True),
+            "ShearY": (torch.linspace(0.0, 0.3, num_bins), True),
+            "TranslateX": (torch.linspace(0.0, 150.0 / 331.0 * image_size[1], num_bins), True),
+            "TranslateY": (torch.linspace(0.0, 150.0 / 331.0 * image_size[0], num_bins), True),
+            "Rotate": (torch.linspace(0.0, 30.0, num_bins), True),
+            "Brightness": (torch.linspace(0.0, 0.9, num_bins), True),
+            "Color": (torch.linspace(0.0, 0.9, num_bins), True),
+            "Contrast": (torch.linspace(0.0, 0.9, num_bins), True),
+            "Sharpness": (torch.linspace(0.0, 0.9, num_bins), True),
+            "Posterize": (8 - (torch.arange(num_bins) / ((num_bins - 1) / 4)).round().int(), False),
+            "Solarize": (torch.linspace(255.0, 0.0, num_bins), False),
+            "AutoContrast": (torch.tensor(0.0), False),
+            "Equalize": (torch.tensor(0.0), False),
+        }
+
+    def forward(self, img: Tensor) -> Tensor:
+        """
+            img (PIL Image or Tensor): Image to be transformed.
+
+        Returns:
+            PIL Image or Tensor: Transformed image.
+        """
+        fill = self.fill
+        channels, height, width = F.get_dimensions(img)
+        if isinstance(img, Tensor):  # fill is converted to a list of floats for Tensor input only
+            if isinstance(fill, (int, float)):
+                fill = [float(fill)] * channels
+            elif fill is not None:
+                fill = [float(f) for f in fill]
+        # NOTE: self.magnitude is used below as an index into magnitude tables that hold num_magnitude_bins entries.
+        op_meta = self._augmentation_space(self.num_magnitude_bins, (height, width))
+        for _ in range(self.num_ops):
+            op_index = int(torch.randint(len(op_meta), (1,)).item())  # uniform draw over the op table
+            op_name = list(op_meta.keys())[op_index]
+            magnitudes, signed = op_meta[op_name]
+            magnitude = float(magnitudes[self.magnitude].item()) if magnitudes.ndim > 0 else 0.0
+            if signed and torch.randint(2, (1,)):  # for signed ops, a second 0/1 draw decides whether to negate
+                magnitude *= -1.0
+            img = _apply_op(img, op_name, magnitude, interpolation=self.interpolation, fill=fill)
+
+        return img
+
+    def __repr__(self) -> str:
+        s = (
+            f"{self.__class__.__name__}("
+            f"num_ops={self.num_ops}"
+            f", magnitude={self.magnitude}"
+            f", num_magnitude_bins={self.num_magnitude_bins}"
+            f", interpolation={self.interpolation}"
+            f", fill={self.fill}"
+            f")"
+        )
+        return s
+
+
+class TrivialAugmentWide(torch.nn.Module):
+    r"""Dataset-independent data-augmentation with TrivialAugment Wide, as described in
+    `"TrivialAugment: Tuning-free Yet State-of-the-Art Data Augmentation" <https://arxiv.org/abs/2103.10158>`_.
+    If the image is torch Tensor, it should be of type torch.uint8, and it is expected
+    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
+    If img is PIL Image, it is expected to be in mode "L" or "RGB".
+
+    Args:
+        num_magnitude_bins (int): The number of different magnitude values.
+        interpolation (InterpolationMode): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
+        fill (sequence or number, optional): Pixel fill value for the area outside the transformed
+            image. If given a number, the value is used for all bands respectively.
+    """
+
+    def __init__(
+        self,
+        num_magnitude_bins: int = 31,
+        interpolation: InterpolationMode = InterpolationMode.NEAREST,
+        fill: Optional[list[float]] = None,
+    ) -> None:
+        super().__init__()
+        self.num_magnitude_bins = num_magnitude_bins
+        self.interpolation = interpolation
+        self.fill = fill
+
+    def _augmentation_space(self, num_bins: int) -> dict[str, tuple[Tensor, bool]]:
+        return {  # no image_size parameter here: the translate range is a fixed 0..32
+            # op_name: (magnitudes, signed)
+            "Identity": (torch.tensor(0.0), False),
+            "ShearX": (torch.linspace(0.0, 0.99, num_bins), True),
+            "ShearY": (torch.linspace(0.0, 0.99, num_bins), True),
+            "TranslateX": (torch.linspace(0.0, 32.0, num_bins), True),
+            "TranslateY": (torch.linspace(0.0, 32.0, num_bins), True),
+            "Rotate": (torch.linspace(0.0, 135.0, num_bins), True),
+            "Brightness": (torch.linspace(0.0, 0.99, num_bins), True),
+            "Color": (torch.linspace(0.0, 0.99, num_bins), True),
+            "Contrast": (torch.linspace(0.0, 0.99, num_bins), True),
+            "Sharpness": (torch.linspace(0.0, 0.99, num_bins), True),
+            "Posterize": (8 - (torch.arange(num_bins) / ((num_bins - 1) / 6)).round().int(), False),
+            "Solarize": (torch.linspace(255.0, 0.0, num_bins), False),
+            "AutoContrast": (torch.tensor(0.0), False),
+            "Equalize": (torch.tensor(0.0), False),
+        }
+
+    def forward(self, img: Tensor) -> Tensor:
+        """
+            img (PIL Image or Tensor): Image to be transformed.
+
+        Returns:
+            PIL Image or Tensor: Transformed image.
+        """
+        fill = self.fill
+        channels, height, width = F.get_dimensions(img)  # height and width are not used further in this method
+        if isinstance(img, Tensor):  # fill is converted to a list of floats for Tensor input only
+            if isinstance(fill, (int, float)):
+                fill = [float(fill)] * channels
+            elif fill is not None:
+                fill = [float(f) for f in fill]
+        # Exactly one op is drawn per call; its magnitude bin (if it has magnitudes) is drawn separately.
+        op_meta = self._augmentation_space(self.num_magnitude_bins)
+        op_index = int(torch.randint(len(op_meta), (1,)).item())
+        op_name = list(op_meta.keys())[op_index]
+        magnitudes, signed = op_meta[op_name]
+        magnitude = (
+            float(magnitudes[torch.randint(len(magnitudes), (1,), dtype=torch.long)].item())
+            if magnitudes.ndim > 0
+            else 0.0
+        )
+        if signed and torch.randint(2, (1,)):  # for signed ops, a 0/1 draw decides whether to negate
+            magnitude *= -1.0
+
+        return _apply_op(img, op_name, magnitude, interpolation=self.interpolation, fill=fill)
+
+    def __repr__(self) -> str:
+        s = (
+            f"{self.__class__.__name__}("
+            f"num_magnitude_bins={self.num_magnitude_bins}"
+            f", interpolation={self.interpolation}"
+            f", fill={self.fill}"
+            f")"
+        )
+        return s
+
+
+class AugMix(torch.nn.Module):
+    r"""AugMix data augmentation method based on
+    `"AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty" <https://arxiv.org/abs/1912.02781>`_.
+    If the image is torch Tensor, it should be of type torch.uint8, and it is expected
+    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
+    If img is PIL Image, it is expected to be in mode "L" or "RGB".
+
+    Args:
+        severity (int): The severity of base augmentation operators. Default is ``3``.
+        mixture_width (int): The number of augmentation chains. Default is ``3``.
+        chain_depth (int): The depth of augmentation chains. A negative value denotes stochastic depth sampled from the interval [1, 3].
+            Default is ``-1``.
+        alpha (float): The hyperparameter for the probability distributions. Default is ``1.0``.
+        all_ops (bool): Use all operations (including brightness, contrast, color and sharpness). Default is ``True``.
+        interpolation (InterpolationMode): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
+        fill (sequence or number, optional): Pixel fill value for the area outside the transformed
+            image. If given a number, the value is used for all bands respectively.
+    """
+
+    def __init__(
+        self,
+        severity: int = 3,
+        mixture_width: int = 3,
+        chain_depth: int = -1,
+        alpha: float = 1.0,
+        all_ops: bool = True,
+        interpolation: InterpolationMode = InterpolationMode.BILINEAR,  # NOTE: the class docstring names NEAREST as default
+        fill: Optional[list[float]] = None,
+    ) -> None:
+        super().__init__()
+        self._PARAMETER_MAX = 10  # upper bound for severity; forward() also passes it as the number of magnitude bins
+        if not (1 <= severity <= self._PARAMETER_MAX):
+            raise ValueError(f"The severity must be between [1, {self._PARAMETER_MAX}]. Got {severity} instead.")
+        self.severity = severity
+        self.mixture_width = mixture_width
+        self.chain_depth = chain_depth
+        self.alpha = alpha
+        self.all_ops = all_ops
+        self.interpolation = interpolation
+        self.fill = fill
+
+    def _augmentation_space(self, num_bins: int, image_size: tuple[int, int]) -> dict[str, tuple[Tensor, bool]]:
+        s = {  # image_size is (height, width), as passed by forward()
+            # op_name: (magnitudes, signed)
+            "ShearX": (torch.linspace(0.0, 0.3, num_bins), True),
+            "ShearY": (torch.linspace(0.0, 0.3, num_bins), True),
+            "TranslateX": (torch.linspace(0.0, image_size[1] / 3.0, num_bins), True),
+            "TranslateY": (torch.linspace(0.0, image_size[0] / 3.0, num_bins), True),
+            "Rotate": (torch.linspace(0.0, 30.0, num_bins), True),
+            "Posterize": (4 - (torch.arange(num_bins) / ((num_bins - 1) / 4)).round().int(), False),
+            "Solarize": (torch.linspace(255.0, 0.0, num_bins), False),
+            "AutoContrast": (torch.tensor(0.0), False),
+            "Equalize": (torch.tensor(0.0), False),
+        }
+        if self.all_ops:  # the four colour ops are appended after the base ops, so they take the last indices
+            s.update(
+                {
+                    "Brightness": (torch.linspace(0.0, 0.9, num_bins), True),
+                    "Color": (torch.linspace(0.0, 0.9, num_bins), True),
+                    "Contrast": (torch.linspace(0.0, 0.9, num_bins), True),
+                    "Sharpness": (torch.linspace(0.0, 0.9, num_bins), True),
+                }
+            )
+        return s
+
+    @torch.jit.unused
+    def _pil_to_tensor(self, img) -> Tensor:
+        return F.pil_to_tensor(img)
+
+    @torch.jit.unused
+    def _tensor_to_pil(self, img: Tensor):
+        return F.to_pil_image(img)
+
+    def _sample_dirichlet(self, params: Tensor) -> Tensor:
+        # Must be on a separate method so that we can overwrite it in tests.
+        return torch._sample_dirichlet(params)
+
+    def forward(self, orig_img: Tensor) -> Tensor:
+        """
+            img (PIL Image or Tensor): Image to be transformed.
+
+        Returns:
+            PIL Image or Tensor: Transformed image.
+        """
+        fill = self.fill
+        channels, height, width = F.get_dimensions(orig_img)
+        if isinstance(orig_img, Tensor):
+            img = orig_img
+            if isinstance(fill, (int, float)):
+                fill = [float(fill)] * channels
+            elif fill is not None:
+                fill = [float(f) for f in fill]
+        else:  # non-Tensor input: the mixing runs on a tensor conversion and fill is left as given
+            img = self._pil_to_tensor(orig_img)
+
+        op_meta = self._augmentation_space(self._PARAMETER_MAX, (height, width))
+
+        orig_dims = list(img.shape)
+        batch = img.view([1] * max(4 - img.ndim, 0) + orig_dims)  # prepend singleton dims until the view is at least 4-D
+        batch_dims = [batch.size(0)] + [1] * (batch.ndim - 1)  # shape used to broadcast one weight per first-dim entry
+
+        # Sample the beta weights for combining the original and augmented image. To get Beta, we use a Dirichlet
+        # with 2 parameters. The 1st column stores the weights of the original and the 2nd the ones of augmented image.
+        m = self._sample_dirichlet(
+            torch.tensor([self.alpha, self.alpha], device=batch.device).expand(batch_dims[0], -1)
+        )
+
+        # Sample the mixing weights and combine them with the ones sampled from Beta for the augmented images.
+        combined_weights = self._sample_dirichlet(
+            torch.tensor([self.alpha] * self.mixture_width, device=batch.device).expand(batch_dims[0], -1)
+        ) * m[:, 1].view([batch_dims[0], -1])
+        # NOTE: in the loop below any chain_depth <= 0 (zero included) selects a random depth drawn from {1, 2, 3}.
+        mix = m[:, 0].view(batch_dims) * batch
+        for i in range(self.mixture_width):
+            aug = batch  # every chain starts again from the un-augmented batch
+            depth = self.chain_depth if self.chain_depth > 0 else int(torch.randint(low=1, high=4, size=(1,)).item())
+            for _ in range(depth):
+                op_index = int(torch.randint(len(op_meta), (1,)).item())
+                op_name = list(op_meta.keys())[op_index]
+                magnitudes, signed = op_meta[op_name]
+                magnitude = (
+                    float(magnitudes[torch.randint(self.severity, (1,), dtype=torch.long)].item())  # bin drawn from [0, severity)
+                    if magnitudes.ndim > 0
+                    else 0.0
+                )
+                if signed and torch.randint(2, (1,)):
+                    magnitude *= -1.0
+                aug = _apply_op(aug, op_name, magnitude, interpolation=self.interpolation, fill=fill)
+            mix.add_(combined_weights[:, i].view(batch_dims) * aug)  # accumulated in place into ``mix``
+        mix = mix.view(orig_dims).to(dtype=img.dtype)  # back to the input tensor's shape and dtype
+
+        if not isinstance(orig_img, Tensor):
+            return self._tensor_to_pil(mix)
+        return mix
+
+    def __repr__(self) -> str:
+        s = (
+            f"{self.__class__.__name__}("
+            f"severity={self.severity}"
+            f", mixture_width={self.mixture_width}"
+            f", chain_depth={self.chain_depth}"
+            f", alpha={self.alpha}"
+            f", all_ops={self.all_ops}"
+            f", interpolation={self.interpolation}"
+            f", fill={self.fill}"
+            f")"
+        )
+        return s
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/functional.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b950b0c45b53f82949d3fe14850a3d1c17f24d1
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/functional.py
@@ -0,0 +1,1586 @@
+import math
+import numbers
+import sys
+import warnings
+from enum import Enum
+from typing import Any, Optional, Union
+
+import numpy as np
+import torch
+from PIL import Image
+from PIL.Image import Image as PILImage
+from torch import Tensor
+
+try:
+    import accimage
+except ImportError:
+    accimage = None
+
+from ..utils import _Image_fromarray, _log_api_usage_once
+from . import _functional_pil as F_pil, _functional_tensor as F_t
+
+
+class InterpolationMode(Enum):
+    """Interpolation modes.
+
+    Available interpolation methods are ``nearest``, ``nearest-exact``, ``bilinear``, ``bicubic``, ``box``, ``hamming``,
+    and ``lanczos``. The value of each member is the lowercase string name of the method.
+    """
+
+    NEAREST = "nearest"
+    NEAREST_EXACT = "nearest-exact"
+    BILINEAR = "bilinear"
+    BICUBIC = "bicubic"
+    # For PIL compatibility
+    BOX = "box"
+    HAMMING = "hamming"
+    LANCZOS = "lanczos"
+
+
+# TODO: Once torchscript supports Enums with staticmethod
+# this can be put into InterpolationMode as staticmethod
+def _interpolation_modes_from_int(i: int) -> InterpolationMode:
+    """Translate an integer interpolation code into an ``InterpolationMode``.
+
+    Args:
+        i (int): Integer code; the supported codes are 0 through 5.
+
+    Returns:
+        InterpolationMode: The member registered for ``i``.
+
+    Raises:
+        KeyError: If ``i`` is not one of the supported codes.
+    """
+    mode_by_code = {
+        0: InterpolationMode.NEAREST,
+        1: InterpolationMode.LANCZOS,
+        2: InterpolationMode.BILINEAR,
+        3: InterpolationMode.BICUBIC,
+        4: InterpolationMode.BOX,
+        5: InterpolationMode.HAMMING,
+    }
+    return mode_by_code[i]
+
+
+# Maps each InterpolationMode to the integer code handed to the PIL backend.
+# NEAREST_EXACT shares code 0 with NEAREST, so this mapping is not invertible.
+pil_modes_mapping = {
+    InterpolationMode.NEAREST: 0,
+    InterpolationMode.BILINEAR: 2,
+    InterpolationMode.BICUBIC: 3,
+    InterpolationMode.NEAREST_EXACT: 0,
+    InterpolationMode.BOX: 4,
+    InterpolationMode.HAMMING: 5,
+    InterpolationMode.LANCZOS: 1,
+}
+
+# Module-level alias of the PIL backend's image type check.
+_is_pil_image = F_pil._is_pil_image
+
+
+def get_dimensions(img: Tensor) -> list[int]:
+    """Return the dimensions of an image in ``[channels, height, width]`` order.
+
+    Args:
+        img (PIL Image or Tensor): Image to inspect.
+
+    Returns:
+        List[int]: The image dimensions.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(get_dimensions)
+    # Anything that is not a tensor is handed to the PIL backend.
+    if not isinstance(img, torch.Tensor):
+        return F_pil.get_dimensions(img)
+
+    return F_t.get_dimensions(img)
+
+
+def get_image_size(img: Tensor) -> list[int]:
+    """Return the size of an image in ``[width, height]`` order.
+
+    Args:
+        img (PIL Image or Tensor): Image to inspect.
+
+    Returns:
+        List[int]: The image size.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(get_image_size)
+    # Anything that is not a tensor is handed to the PIL backend.
+    if not isinstance(img, torch.Tensor):
+        return F_pil.get_image_size(img)
+
+    return F_t.get_image_size(img)
+
+
+def get_image_num_channels(img: Tensor) -> int:
+    """Return how many channels an image has.
+
+    Args:
+        img (PIL Image or Tensor): Image to inspect.
+
+    Returns:
+        int: The number of channels.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(get_image_num_channels)
+    # Anything that is not a tensor is handed to the PIL backend.
+    if not isinstance(img, torch.Tensor):
+        return F_pil.get_image_num_channels(img)
+
+    return F_t.get_image_num_channels(img)
+
+
+@torch.jit.unused
+def _is_numpy(img: Any) -> bool:
+    """Return ``True`` when ``img`` is a ``numpy.ndarray`` instance."""
+    return isinstance(img, np.ndarray)
+
+
+@torch.jit.unused
+def _is_numpy_image(img: Any) -> bool:
+    """Report whether ``img`` has exactly two or three dimensions.
+
+    Only ``img.ndim`` is inspected; the type of ``img`` is not checked here.
+    """
+    ndim = img.ndim
+    return ndim == 2 or ndim == 3
+
+
+def to_tensor(pic: Union[PILImage, np.ndarray]) -> Tensor:
+    """Convert a ``PIL Image`` or ``numpy.ndarray`` to a channels-first tensor.
+    This function does not support torchscript.
+
+    See :class:`~torchvision.transforms.ToTensor` for more details.
+
+    Byte (uint8) data is converted to the default float dtype and divided by 255;
+    data of any other dtype is returned without rescaling.
+
+    Args:
+        pic (PIL Image or numpy.ndarray): Image to be converted to tensor.
+
+    Returns:
+        Tensor: Converted image.
+
+    Raises:
+        TypeError: If ``pic`` is neither a PIL Image nor an ndarray.
+        ValueError: If ``pic`` is an ndarray that is not 2- or 3-dimensional.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(to_tensor)
+    if not F_pil._is_pil_image(pic) and not _is_numpy(pic):
+        raise TypeError(f"pic should be PIL Image or ndarray. Got {type(pic)}")
+
+    if _is_numpy(pic) and not _is_numpy_image(pic):
+        raise ValueError(f"pic should be 2/3 dimensional. Got {pic.ndim} dimensions.")
+
+    float_dtype = torch.get_default_dtype()
+
+    if isinstance(pic, np.ndarray):
+        # Promote HW arrays to HWC, then reorder the axes to CHW.
+        hwc = pic[:, :, None] if pic.ndim == 2 else pic
+        chw = torch.from_numpy(hwc.transpose((2, 0, 1))).contiguous()
+        # backward compatibility: only byte tensors are rescaled
+        if not isinstance(chw, torch.ByteTensor):
+            return chw
+        return chw.to(dtype=float_dtype).div(255)
+
+    if accimage is not None and isinstance(pic, accimage.Image):
+        buffer = np.zeros([pic.channels, pic.height, pic.width], dtype=np.float32)
+        pic.copyto(buffer)
+        return torch.from_numpy(buffer).to(dtype=float_dtype)
+
+    # Remaining case: a PIL Image. Choose the numpy dtype from the image mode,
+    # defaulting to uint8; the 16-bit mode name depends on host byte order.
+    if sys.byteorder == "little":
+        int16_mode = "I;16"
+    else:
+        int16_mode = "I;16B"
+    np_dtype = {"I": np.int32, int16_mode: np.int16, "F": np.float32}.get(pic.mode, np.uint8)
+    img = torch.from_numpy(np.array(pic, np_dtype, copy=True))
+
+    if pic.mode == "1":
+        img = 255 * img
+    num_channels = F_pil.get_image_num_channels(pic)
+    img = img.view(pic.size[1], pic.size[0], num_channels)
+    # reorder from HWC to CHW
+    img = img.permute((2, 0, 1)).contiguous()
+    if not isinstance(img, torch.ByteTensor):
+        return img
+    return img.to(dtype=float_dtype).div(255)
+
+
+def pil_to_tensor(pic: Any) -> Tensor:
+    """Convert a ``PIL Image`` to a channels-first tensor of the same type.
+    This function does not support torchscript.
+
+    See :class:`~torchvision.transforms.PILToTensor` for more details.
+
+    .. note::
+
+        A deep copy of the underlying array is performed.
+
+    Args:
+        pic (PIL Image): Image to be converted to tensor.
+
+    Returns:
+        Tensor: Converted image.
+
+    Raises:
+        TypeError: If ``pic`` is not a PIL Image.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(pil_to_tensor)
+    if not F_pil._is_pil_image(pic):
+        raise TypeError(f"pic should be PIL Image. Got {type(pic)}")
+
+    if accimage is not None and isinstance(pic, accimage.Image):
+        # accimage format is always uint8 internally, so always return uint8 here
+        buffer = np.zeros([pic.channels, pic.height, pic.width], dtype=np.uint8)
+        pic.copyto(buffer)
+        return torch.as_tensor(buffer)
+
+    # Regular PIL Image: copy the pixels, view them as HWC, then reorder to CHW.
+    pixels = torch.as_tensor(np.array(pic, copy=True))
+    num_channels = F_pil.get_image_num_channels(pic)
+    hwc = pixels.view(pic.size[1], pic.size[0], num_channels)
+    return hwc.permute((2, 0, 1))
+
+
+def convert_image_dtype(image: torch.Tensor, dtype: torch.dtype = torch.float) -> torch.Tensor:
+    """Cast a tensor image to ``dtype``, rescaling its values to match.
+    This function does not support PIL Image.
+
+    Args:
+        image (torch.Tensor): Image to be converted
+        dtype (torch.dtype): Desired data type of the output
+
+    Returns:
+        Tensor: Converted image
+
+    .. note::
+
+        When converting from a smaller to a larger integer ``dtype`` the maximum values are **not** mapped exactly.
+        If converted back and forth, this mismatch has no effect.
+
+    Raises:
+        TypeError: When ``image`` is not a tensor.
+        RuntimeError: When trying to cast :class:`torch.float32` to :class:`torch.int32` or :class:`torch.int64` as
+            well as for trying to cast :class:`torch.float64` to :class:`torch.int64`. These conversions might lead to
+            overflow errors since the floating point ``dtype`` cannot store consecutive integers over the whole range
+            of the integer ``dtype``.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(convert_image_dtype)
+    if not isinstance(image, torch.Tensor):
+        raise TypeError("Input img should be Tensor Image")
+
+    # The conversion itself is implemented by the tensor backend.
+    converted = F_t.convert_image_dtype(image, dtype)
+    return converted
+
+
+def to_pil_image(pic, mode=None):
+    """Convert a tensor or an ndarray to PIL Image. This function does not support torchscript.
+
+    See :class:`~torchvision.transforms.ToPILImage` for more details.
+
+    Args:
+        pic (Tensor or numpy.ndarray): Image to be converted to PIL Image.
+        mode (`PIL.Image mode`_): color space and pixel depth of input data (optional).
+
+    .. _PIL.Image mode: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#concept-modes
+
+    Returns:
+        PIL Image: Image converted to PIL Image.
+
+    Raises:
+        TypeError: If ``pic`` is neither a Tensor nor an ndarray, or if no mode can be
+            determined for the input dtype.
+        ValueError: If ``pic`` is not 2- or 3-dimensional, has more than 4 channels, or
+            ``mode`` is not compatible with the number of channels / dtype of the input.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(to_pil_image)
+
+    if isinstance(pic, torch.Tensor):
+        if pic.ndim == 3:
+            # CHW tensor -> HWC layout
+            pic = pic.permute((1, 2, 0))
+        pic = pic.numpy(force=True)
+    elif not isinstance(pic, np.ndarray):
+        raise TypeError(f"pic should be Tensor or ndarray. Got {type(pic)}.")
+
+    if pic.ndim == 2:
+        # if 2D image, add channel dimension (HWC)
+        pic = np.expand_dims(pic, 2)
+    if pic.ndim != 3:
+        raise ValueError(f"pic should be 2/3 dimensional. Got {pic.ndim} dimensions.")
+
+    if pic.shape[-1] > 4:
+        raise ValueError(f"pic should not have > 4 channels. Got {pic.shape[-1]} channels.")
+
+    npimg = pic
+
+    # Floating point data is scaled by 255 and cast to uint8 unless mode "F" was requested.
+    if np.issubdtype(npimg.dtype, np.floating) and mode != "F":
+        npimg = (npimg * 255).astype(np.uint8)
+
+    if npimg.shape[2] == 1:
+        # Single channel: the mode is dictated by the dtype.
+        expected_mode = None
+        npimg = npimg[:, :, 0]
+        if npimg.dtype == np.uint8:
+            expected_mode = "L"
+        elif npimg.dtype == np.int16:
+            expected_mode = "I;16" if sys.byteorder == "little" else "I;16B"
+        elif npimg.dtype == np.int32:
+            expected_mode = "I"
+        elif npimg.dtype == np.float32:
+            expected_mode = "F"
+        if mode is not None and mode != expected_mode:
+            # Report the dtype of the actual input rather than the ``np.dtype`` class object.
+            raise ValueError(
+                f"Incorrect mode ({mode}) supplied for input type {npimg.dtype}. Should be {expected_mode}"
+            )
+        mode = expected_mode
+
+    elif npimg.shape[2] == 2:
+        permitted_2_channel_modes = ["LA"]
+        if mode is not None and mode not in permitted_2_channel_modes:
+            raise ValueError(f"Only modes {permitted_2_channel_modes} are supported for 2D inputs")
+
+        if mode is None and npimg.dtype == np.uint8:
+            mode = "LA"
+
+    elif npimg.shape[2] == 4:
+        permitted_4_channel_modes = ["RGBA", "CMYK", "RGBX"]
+        if mode is not None and mode not in permitted_4_channel_modes:
+            raise ValueError(f"Only modes {permitted_4_channel_modes} are supported for 4D inputs")
+
+        if mode is None and npimg.dtype == np.uint8:
+            mode = "RGBA"
+    else:
+        permitted_3_channel_modes = ["RGB", "YCbCr", "HSV"]
+        if mode is not None and mode not in permitted_3_channel_modes:
+            raise ValueError(f"Only modes {permitted_3_channel_modes} are supported for 3D inputs")
+        if mode is None and npimg.dtype == np.uint8:
+            mode = "RGB"
+
+    # No mode could be inferred for this dtype / channel-count combination.
+    if mode is None:
+        raise TypeError(f"Input type {npimg.dtype} is not supported")
+
+    return _Image_fromarray(npimg, mode=mode)
+
+
+def normalize(tensor: Tensor, mean: list[float], std: list[float], inplace: bool = False) -> Tensor:
+    """Normalize a float tensor image with a per-channel mean and standard deviation.
+    This transform does not support PIL Image.
+
+    .. note::
+        This transform acts out of place by default, i.e., it does not mutate the input tensor.
+
+    See :class:`~torchvision.transforms.Normalize` for more details.
+
+    Args:
+        tensor (Tensor): Float tensor image of size (C, H, W) or (B, C, H, W) to be normalized.
+        mean (sequence): Sequence of means for each channel.
+        std (sequence): Sequence of standard deviations for each channel.
+        inplace(bool,optional): Bool to make this operation inplace.
+
+    Returns:
+        Tensor: Normalized Tensor image.
+
+    Raises:
+        TypeError: If ``tensor`` is not a ``torch.Tensor``.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(normalize)
+    if not isinstance(tensor, torch.Tensor):
+        raise TypeError(f"img should be Tensor Image. Got {type(tensor)}")
+
+    # The arithmetic is implemented by the tensor backend.
+    normalized = F_t.normalize(tensor, mean=mean, std=std, inplace=inplace)
+    return normalized
+
+
+def _compute_resized_output_size(
+    image_size: tuple[int, int],
+    size: Optional[list[int]],
+    max_size: Optional[int] = None,
+    allow_size_none: bool = False,  # only True in v2
+) -> list[int]:
+    """Compute the ``[height, width]`` an image should be resized to.
+
+    Args:
+        image_size: Current ``(height, width)`` of the image.
+        size: Requested size. A one-element sequence sets the shorter edge and keeps the
+            aspect ratio; a longer sequence is read as ``(height, width)``. ``None`` is
+            accepted only when ``allow_size_none`` is true, in which case the longer edge
+            becomes ``max_size``.
+        max_size: Upper bound for the longer edge in the one-element case, or the target
+            length of the longer edge when ``size`` is ``None``.
+        allow_size_none: Whether ``size=None`` is permitted.
+
+    Returns:
+        The output size as ``[new_height, new_width]``.
+
+    Raises:
+        ValueError: If ``size`` is ``None`` but not allowed, if ``max_size`` is not an int
+            while ``size`` is ``None``, or if ``max_size`` is not strictly greater than the
+            requested shorter edge.
+    """
+    h, w = image_size
+    width_is_short = w <= h
+    if width_is_short:
+        short, long = w, h
+    else:
+        short, long = h, w
+
+    if size is None:
+        if not allow_size_none:
+            raise ValueError("This should never happen!!")
+        if not isinstance(max_size, int):
+            raise ValueError(f"max_size must be an integer when size is None, but got {max_size} instead.")
+        new_long = max_size
+        new_short = int(max_size * short / long)
+    elif len(size) == 1:  # specified size only for the smallest edge
+        requested_new_short = size if isinstance(size, int) else size[0]
+        new_short = requested_new_short
+        new_long = int(requested_new_short * long / short)
+
+        if max_size is not None:
+            if max_size <= requested_new_short:
+                raise ValueError(
+                    f"max_size = {max_size} must be strictly greater than the requested "
+                    f"size for the smaller edge size = {size}"
+                )
+            if new_long > max_size:
+                # Clamp the longer edge and shrink the shorter one proportionally.
+                new_short = int(max_size * new_short / new_long)
+                new_long = max_size
+    else:  # specified both h and w
+        return [size[0], size[1]]
+
+    # Map (short, long) back onto (height, width) using the original orientation.
+    if width_is_short:
+        return [new_long, new_short]
+    return [new_short, new_long]
+
+
+def resize(
+    img: Tensor,
+    size: list[int],
+    interpolation: InterpolationMode = InterpolationMode.BILINEAR,
+    max_size: Optional[int] = None,
+    antialias: Optional[bool] = True,
+) -> Tensor:
+    r"""Resize the input image to the given size.
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions
+
+    Args:
+        img (PIL Image or Tensor): Image to be resized.
+        size (sequence or int): Desired output size. If size is a sequence like
+            (h, w), the output size will be matched to this. If size is an int,
+            the smaller edge of the image will be matched to this number maintaining
+            the aspect ratio. i.e, if height > width, then image will be rescaled to
+            :math:`\left(\text{size} \times \frac{\text{height}}{\text{width}}, \text{size}\right)`.
+
+            .. note::
+                In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``.
+        interpolation (InterpolationMode): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`.
+            Default is ``InterpolationMode.BILINEAR``. If input is Tensor, only ``InterpolationMode.NEAREST``,
+            ``InterpolationMode.NEAREST_EXACT``, ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are
+            supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        max_size (int, optional): The maximum allowed for the longer edge of
+            the resized image. If the longer edge of the image is greater
+            than ``max_size`` after being resized according to ``size``,
+            ``size`` will be overruled so that the longer edge is equal to
+            ``max_size``.
+            As a result, the smaller edge may be shorter than ``size``. This
+            is only supported if ``size`` is an int (or a sequence of length
+            1 in torchscript mode).
+        antialias (bool, optional): Whether to apply antialiasing.
+            It only affects **tensors** with bilinear or bicubic modes and it is
+            ignored otherwise: on PIL images, antialiasing is always applied on
+            bilinear or bicubic modes; on other modes (for PIL images and
+            tensors), antialiasing makes no sense and this parameter is ignored.
+            Possible values are:
+
+            - ``True`` (default): will apply antialiasing for bilinear or bicubic modes.
+              Other mode aren't affected. This is probably what you want to use.
+            - ``False``: will not apply antialiasing for tensors on any mode. PIL
+              images are still antialiased on bilinear or bicubic modes, because
+              PIL doesn't support no antialias.
+            - ``None``: equivalent to ``False`` for tensors and ``True`` for
+              PIL images. This value exists for legacy reasons and you probably
+              don't want to use it unless you really know what you are doing.
+
+            The default value changed from ``None`` to ``True`` in
+            v0.17, for the PIL and Tensor backends to be consistent.
+
+    Returns:
+        PIL Image or Tensor: Resized image.
+
+    Raises:
+        TypeError: If ``interpolation`` is neither an ``InterpolationMode`` nor an int.
+        ValueError: If ``size`` is a list/tuple whose length is not 1 or 2, or if
+            ``max_size`` is given together with a two-element ``size``.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(resize)
+
+    # Integer codes are translated to the enum; anything else that is not an enum is rejected.
+    if isinstance(interpolation, int):
+        interpolation = _interpolation_modes_from_int(interpolation)
+    elif not isinstance(interpolation, InterpolationMode):
+        raise TypeError(
+            "Argument interpolation should be a InterpolationMode or a corresponding Pillow integer constant"
+        )
+
+    # Validate sequence-style sizes before looking at the image itself.
+    if isinstance(size, (list, tuple)):
+        if len(size) not in [1, 2]:
+            raise ValueError(
+                f"Size must be an int or a 1 or 2 element tuple/list, not a {len(size)} element tuple/list"
+            )
+        if max_size is not None and len(size) != 1:
+            raise ValueError(
+                "max_size should only be passed if size specifies the length of the smaller edge, "
+                "i.e. size should be an int or a sequence of length 1 in torchscript mode."
+            )
+
+    _, image_height, image_width = get_dimensions(img)
+    # A bare int is wrapped so the size computation only deals with sequences.
+    if isinstance(size, int):
+        size = [size]
+    output_size = _compute_resized_output_size((image_height, image_width), size, max_size)
+
+    # Nothing to do when the image already has the target size; the input object is returned as-is.
+    if [image_height, image_width] == output_size:
+        return img
+
+    if not isinstance(img, torch.Tensor):
+        # Only an explicit ``False`` triggers the warning; ``None`` and ``True`` pass silently.
+        if antialias is False:
+            warnings.warn("Anti-alias option is always applied for PIL Image input. Argument antialias is ignored.")
+        pil_interpolation = pil_modes_mapping[interpolation]
+        return F_pil.resize(img, size=output_size, interpolation=pil_interpolation)
+
+    # The tensor backend receives the interpolation mode as its string value.
+    return F_t.resize(img, size=output_size, interpolation=interpolation.value, antialias=antialias)
+
+
+def pad(img: Tensor, padding: list[int], fill: Union[int, float] = 0, padding_mode: str = "constant") -> Tensor:
+    """Pad every side of the given image.
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means at most 2 leading dimensions for mode reflect and symmetric,
+    at most 3 leading dimensions for mode edge,
+    and an arbitrary number of leading dimensions for mode constant
+
+    Args:
+        img (PIL Image or Tensor): Image to be padded.
+        padding (int or sequence): Padding on each border. A single int pads all borders
+            by that amount. A sequence of length 2 gives the left/right and top/bottom
+            padding respectively. A sequence of length 4 gives the left, top, right and
+            bottom padding respectively.
+
+            .. note::
+                In torchscript mode padding as single int is not supported, use a sequence of
+                length 1: ``[padding, ]``.
+        fill (number or tuple): Pixel fill value for constant fill. Default is 0.
+            If a tuple of length 3, it is used to fill R, G, B channels respectively.
+            This value is only used when the padding_mode is constant.
+            Only number is supported for torch Tensor.
+            Only int or tuple value is supported for PIL Image.
+        padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric.
+            Default is constant.
+
+            - constant: pads with a constant value, this value is specified with fill
+
+            - edge: pads with the last value at the edge of the image.
+              If input a 5D torch Tensor, the last 3 dimensions will be padded instead of the last 2
+
+            - reflect: pads with reflection of image without repeating the last value on the edge.
+              For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode
+              will result in [3, 2, 1, 2, 3, 4, 3, 2]
+
+            - symmetric: pads with reflection of image repeating the last value on the edge.
+              For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode
+              will result in [2, 1, 1, 2, 3, 4, 4, 3]
+
+    Returns:
+        PIL Image or Tensor: Padded image.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(pad)
+    # Tensors go to the tensor backend, everything else to the PIL backend.
+    if isinstance(img, torch.Tensor):
+        return F_t.pad(img, padding=padding, fill=fill, padding_mode=padding_mode)
+
+    return F_pil.pad(img, padding=padding, fill=fill, padding_mode=padding_mode)
+
+
+def crop(img: Tensor, top: int, left: int, height: int, width: int) -> Tensor:
+    """Cut a box of the requested size out of the image at the given location.
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
+    If image size is smaller than output size along any edge, image is padded with 0 and then cropped.
+
+    Args:
+        img (PIL Image or Tensor): Image to be cropped. (0,0) denotes the top left corner of the image.
+        top (int): Vertical component of the top left corner of the crop box.
+        left (int): Horizontal component of the top left corner of the crop box.
+        height (int): Height of the crop box.
+        width (int): Width of the crop box.
+
+    Returns:
+        PIL Image or Tensor: Cropped image.
+    """
+
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(crop)
+    # Tensors go to the tensor backend, everything else to the PIL backend.
+    if isinstance(img, torch.Tensor):
+        return F_t.crop(img, top, left, height, width)
+
+    return F_pil.crop(img, top, left, height, width)
+
+
+def center_crop(img: Tensor, output_size: list[int]) -> Tensor:
+    """Take a crop of the requested size from the center of the image.
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
+    If image size is smaller than output size along any edge, image is padded with 0 and then center cropped.
+
+    Args:
+        img (PIL Image or Tensor): Image to be cropped.
+        output_size (sequence or int): (height, width) of the crop box. If int or sequence with single int,
+            it is used for both directions.
+
+    Returns:
+        PIL Image or Tensor: Cropped image.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(center_crop)
+    # Normalise a scalar or one-element size into a (height, width) pair.
+    if isinstance(output_size, numbers.Number):
+        output_size = (int(output_size), int(output_size))
+    elif isinstance(output_size, (tuple, list)) and len(output_size) == 1:
+        output_size = (output_size[0], output_size[0])
+
+    _, image_height, image_width = get_dimensions(img)
+    crop_height, crop_width = output_size
+
+    if crop_width > image_width or crop_height > image_height:
+        # Split the missing pixels between the two sides; an odd remainder goes right/bottom.
+        pad_left, pad_right = 0, 0
+        if crop_width > image_width:
+            missing_w = crop_width - image_width
+            pad_left, pad_right = missing_w // 2, (missing_w + 1) // 2
+        pad_top, pad_bottom = 0, 0
+        if crop_height > image_height:
+            missing_h = crop_height - image_height
+            pad_top, pad_bottom = missing_h // 2, (missing_h + 1) // 2
+        padding_ltrb = [pad_left, pad_top, pad_right, pad_bottom]
+        img = pad(img, padding_ltrb, fill=0)  # PIL uses fill value 0
+        _, image_height, image_width = get_dimensions(img)
+        # After padding the image may already match the crop exactly.
+        if crop_width == image_width and crop_height == image_height:
+            return img
+
+    crop_top = int(round((image_height - crop_height) / 2.0))
+    crop_left = int(round((image_width - crop_width) / 2.0))
+    return crop(img, crop_top, crop_left, crop_height, crop_width)
+
+
+def resized_crop(
+    img: Tensor,
+    top: int,
+    left: int,
+    height: int,
+    width: int,
+    size: list[int],
+    interpolation: InterpolationMode = InterpolationMode.BILINEAR,
+    antialias: Optional[bool] = True,
+) -> Tensor:
+    """Crop a box out of the image, then resize the crop to ``size``.
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions
+
+    Notably used in :class:`~torchvision.transforms.RandomResizedCrop`.
+
+    Args:
+        img (PIL Image or Tensor): Image to be cropped. (0,0) denotes the top left corner of the image.
+        top (int): Vertical component of the top left corner of the crop box.
+        left (int): Horizontal component of the top left corner of the crop box.
+        height (int): Height of the crop box.
+        width (int): Width of the crop box.
+        size (sequence or int): Desired output size. Same semantics as ``resize``.
+        interpolation (InterpolationMode): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`.
+            Default is ``InterpolationMode.BILINEAR``. If input is Tensor, only ``InterpolationMode.NEAREST``,
+            ``InterpolationMode.NEAREST_EXACT``, ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are
+            supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        antialias (bool, optional): Whether to apply antialiasing.
+            It only affects **tensors** with bilinear or bicubic modes and it is
+            ignored otherwise: on PIL images, antialiasing is always applied on
+            bilinear or bicubic modes; on other modes (for PIL images and
+            tensors), antialiasing makes no sense and this parameter is ignored.
+            Possible values are:
+
+            - ``True`` (default): will apply antialiasing for bilinear or bicubic modes.
+              Other mode aren't affected. This is probably what you want to use.
+            - ``False``: will not apply antialiasing for tensors on any mode. PIL
+              images are still antialiased on bilinear or bicubic modes, because
+              PIL doesn't support no antialias.
+            - ``None``: equivalent to ``False`` for tensors and ``True`` for
+              PIL images. This value exists for legacy reasons and you probably
+              don't want to use it unless you really know what you are doing.
+
+            The default value changed from ``None`` to ``True`` in
+            v0.17, for the PIL and Tensor backends to be consistent.
+    Returns:
+        PIL Image or Tensor: Cropped image.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(resized_crop)
+    cropped = crop(img, top, left, height, width)
+    return resize(cropped, size, interpolation, antialias=antialias)
+
+
+def hflip(img: Tensor) -> Tensor:
+    """Flip the given image horizontally.
+
+    Args:
+        img (PIL Image or Tensor): Image to be flipped. If img
+            is a Tensor, it is expected to be in [..., H, W] format,
+            where ... means it can have an arbitrary number of leading
+            dimensions.
+
+    Returns:
+        PIL Image or Tensor:  Horizontally flipped image.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(hflip)
+    # Tensors go to the tensor backend, everything else to the PIL backend.
+    if isinstance(img, torch.Tensor):
+        return F_t.hflip(img)
+
+    return F_pil.hflip(img)
+
+
+def _get_perspective_coeffs(startpoints: list[list[int]], endpoints: list[list[int]]) -> list[float]:
+    """Solve for the coefficients (a, b, c, d, e, f, g, h) of a perspective transform.
+
+    In Perspective Transform each pixel (x, y) in the original image gets transformed as,
+     (x, y) -> ( (ax + by + c) / (gx + hy + 1), (dx + ey + f) / (gx + hy + 1) )
+
+    Args:
+        startpoints (list of list of ints): List containing four lists of two integers corresponding to four corners
+            ``[top-left, top-right, bottom-right, bottom-left]`` of the original image.
+        endpoints (list of list of ints): List containing four lists of two integers corresponding to four corners
+            ``[top-left, top-right, bottom-right, bottom-left]`` of the transformed image.
+
+    Returns:
+        octuple (a, b, c, d, e, f, g, h) for transforming each pixel.
+
+    Raises:
+        ValueError: If either argument does not contain exactly four points.
+    """
+    if len(startpoints) != 4 or len(endpoints) != 4:
+        raise ValueError(
+            f"Please provide exactly four corners, got {len(startpoints)} startpoints and {len(endpoints)} endpoints."
+        )
+    num_points = len(startpoints)
+    lhs = torch.zeros(2 * num_points, 8, dtype=torch.float64)
+
+    # Each point pair contributes two rows: one for its x equation, one for its y equation.
+    for idx in range(num_points):
+        dst = endpoints[idx]
+        src = startpoints[idx]
+        lhs[2 * idx, :] = torch.tensor([dst[0], dst[1], 1, 0, 0, 0, -src[0] * dst[0], -src[0] * dst[1]])
+        lhs[2 * idx + 1, :] = torch.tensor([0, 0, 0, dst[0], dst[1], 1, -src[1] * dst[0], -src[1] * dst[1]])
+
+    rhs = torch.tensor(startpoints, dtype=torch.float64).view(8)
+    # do least squares in double precision to prevent numerical issues
+    solution = torch.linalg.lstsq(lhs, rhs, driver="gels").solution.to(torch.float32)
+
+    output: list[float] = solution.tolist()
+    return output
+
+
+def perspective(
+    img: Tensor,
+    startpoints: list[list[int]],
+    endpoints: list[list[int]],
+    interpolation: InterpolationMode = InterpolationMode.BILINEAR,
+    fill: Optional[list[float]] = None,
+) -> Tensor:
+    """Apply a perspective transform to the given image.
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
+
+    Args:
+        img (PIL Image or Tensor): Image to be transformed.
+        startpoints (list of list of ints): List containing four lists of two integers corresponding to four corners
+            ``[top-left, top-right, bottom-right, bottom-left]`` of the original image.
+        endpoints (list of list of ints): List containing four lists of two integers corresponding to four corners
+            ``[top-left, top-right, bottom-right, bottom-left]`` of the transformed image.
+        interpolation (InterpolationMode): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        fill (sequence or number, optional): Pixel fill value for the area outside the transformed
+            image. If given a number, the value is used for all bands respectively.
+
+            .. note::
+                In torchscript mode single int/float value is not supported, please use a sequence
+                of length 1: ``[value, ]``.
+
+    Returns:
+        PIL Image or Tensor: transformed Image.
+
+    Raises:
+        TypeError: If ``interpolation`` is neither an ``InterpolationMode`` nor an int.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(perspective)
+
+    # The coefficients are solved first, so point errors surface before interpolation errors.
+    transform_coeffs = _get_perspective_coeffs(startpoints, endpoints)
+
+    if isinstance(interpolation, int):
+        interpolation = _interpolation_modes_from_int(interpolation)
+    elif not isinstance(interpolation, InterpolationMode):
+        raise TypeError(
+            "Argument interpolation should be a InterpolationMode or a corresponding Pillow integer constant"
+        )
+
+    # Tensors take the mode's string value; the PIL backend takes the integer code.
+    if isinstance(img, torch.Tensor):
+        return F_t.perspective(img, transform_coeffs, interpolation=interpolation.value, fill=fill)
+
+    pil_interpolation = pil_modes_mapping[interpolation]
+    return F_pil.perspective(img, transform_coeffs, interpolation=pil_interpolation, fill=fill)
+
+
+def vflip(img: Tensor) -> Tensor:
+    """Vertically flip the given image.
+
+    Args:
+        img (PIL Image or Tensor): Image to be flipped. If img
+            is a Tensor, it is expected to be in [..., H, W] format,
+            where ... means it can have an arbitrary number of leading
+            dimensions.
+
+    Returns:
+        PIL Image or Tensor: Vertically flipped image.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():  # usage hook runs only when neither scripting nor tracing
+        _log_api_usage_once(vflip)
+    if not isinstance(img, torch.Tensor):  # non-Tensor input goes to the F_pil implementation
+        return F_pil.vflip(img)
+
+    return F_t.vflip(img)  # Tensor input goes to the F_t implementation
+
+
+def five_crop(img: Tensor, size: list[int]) -> tuple[Tensor, Tensor, Tensor, Tensor, Tensor]:
+    """Crop the given image into four corners and the central crop.
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
+
+    .. Note::
+        This transform returns a tuple of images and there may be a
+        mismatch in the number of inputs and targets your ``Dataset`` returns.
+
+    Args:
+        img (PIL Image or Tensor): Image to be cropped.
+        size (sequence or int): Desired output size of the crop. If size is an
+            int instead of sequence like (h, w), a square crop (size, size) is
+            made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
+
+    Returns:
+        tuple: tuple (tl, tr, bl, br, center)
+        Corresponding top left, top right, bottom left, bottom right and center crop.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():  # usage hook runs only when neither scripting nor tracing
+        _log_api_usage_once(five_crop)
+    if isinstance(size, numbers.Number):  # a single number means a square crop
+        size = (int(size), int(size))
+    elif isinstance(size, (tuple, list)) and len(size) == 1:  # a length-1 sequence also means a square crop
+        size = (size[0], size[0])
+
+    if len(size) != 2:
+        raise ValueError("Please provide only two dimensions (h, w) for size.")
+
+    _, image_height, image_width = get_dimensions(img)
+    crop_height, crop_width = size  # size is ordered (h, w)
+    if crop_width > image_width or crop_height > image_height:  # the crop must fit inside the image
+        msg = "Requested crop size {} is bigger than input size {}"
+        raise ValueError(msg.format(size, (image_height, image_width)))
+
+    tl = crop(img, 0, 0, crop_height, crop_width)  # top left
+    tr = crop(img, 0, image_width - crop_width, crop_height, crop_width)  # top right
+    bl = crop(img, image_height - crop_height, 0, crop_height, crop_width)  # bottom left
+    br = crop(img, image_height - crop_height, image_width - crop_width, crop_height, crop_width)  # bottom right
+
+    center = center_crop(img, [crop_height, crop_width])
+
+    return tl, tr, bl, br, center
+
+
+def ten_crop(
+    img: Tensor, size: list[int], vertical_flip: bool = False
+) -> tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]:
+    """Generate ten cropped images from the given image.
+    Crop the given image into four corners and the central crop plus the
+    flipped version of these (horizontal flipping is used by default).
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
+
+    .. Note::
+        This transform returns a tuple of images and there may be a
+        mismatch in the number of inputs and targets your ``Dataset`` returns.
+
+    Args:
+        img (PIL Image or Tensor): Image to be cropped.
+        size (sequence or int): Desired output size of the crop. If size is an
+            int instead of sequence like (h, w), a square crop (size, size) is
+            made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
+        vertical_flip (bool): Use vertical flipping instead of horizontal. Default is False.
+
+    Returns:
+        tuple: tuple (tl, tr, bl, br, center, tl_flip, tr_flip, bl_flip, br_flip, center_flip)
+        Corresponding top left, top right, bottom left, bottom right and
+        center crop and same for the flipped image.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():  # usage hook runs only when neither scripting nor tracing
+        _log_api_usage_once(ten_crop)
+    if isinstance(size, numbers.Number):  # a single number means a square crop
+        size = (int(size), int(size))
+    elif isinstance(size, (tuple, list)) and len(size) == 1:  # a length-1 sequence also means a square crop
+        size = (size[0], size[0])
+
+    if len(size) != 2:
+        raise ValueError("Please provide only two dimensions (h, w) for size.")
+
+    first_five = five_crop(img, size)  # crops of the unflipped image
+
+    if vertical_flip:
+        img = vflip(img)
+    else:
+        img = hflip(img)
+
+    second_five = five_crop(img, size)  # crops of the flipped image
+    return first_five + second_five  # tuple concatenation: unflipped crops first, then flipped ones
+
+
+def adjust_brightness(img: Tensor, brightness_factor: float) -> Tensor:
+    """Adjust brightness of an image.
+
+    Args:
+        img (PIL Image or Tensor): Image to be adjusted.
+            If img is torch Tensor, it is expected to be in [..., 1 or 3, H, W] format,
+            where ... means it can have an arbitrary number of leading dimensions.
+        brightness_factor (float): How much to adjust the brightness. Can be
+            any non-negative number. 0 gives a black image, 1 gives the
+            original image while 2 increases the brightness by a factor of 2.
+
+    Returns:
+        PIL Image or Tensor: Brightness adjusted image.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():  # usage hook runs only when neither scripting nor tracing
+        _log_api_usage_once(adjust_brightness)
+    if not isinstance(img, torch.Tensor):  # non-Tensor input goes to the F_pil implementation
+        return F_pil.adjust_brightness(img, brightness_factor)
+
+    return F_t.adjust_brightness(img, brightness_factor)  # Tensor input goes to the F_t implementation
+
+
+def adjust_contrast(img: Tensor, contrast_factor: float) -> Tensor:
+    """Adjust contrast of an image.
+
+    Args:
+        img (PIL Image or Tensor): Image to be adjusted.
+            If img is torch Tensor, it is expected to be in [..., 1 or 3, H, W] format,
+            where ... means it can have an arbitrary number of leading dimensions.
+        contrast_factor (float): How much to adjust the contrast. Can be any
+            non-negative number. 0 gives a solid gray image, 1 gives the
+            original image while 2 increases the contrast by a factor of 2.
+
+    Returns:
+        PIL Image or Tensor: Contrast adjusted image.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():  # usage hook runs only when neither scripting nor tracing
+        _log_api_usage_once(adjust_contrast)
+    if not isinstance(img, torch.Tensor):  # non-Tensor input goes to the F_pil implementation
+        return F_pil.adjust_contrast(img, contrast_factor)
+
+    return F_t.adjust_contrast(img, contrast_factor)  # Tensor input goes to the F_t implementation
+
+
+def adjust_saturation(img: Tensor, saturation_factor: float) -> Tensor:
+    """Adjust color saturation of an image.
+
+    Args:
+        img (PIL Image or Tensor): Image to be adjusted.
+            If img is torch Tensor, it is expected to be in [..., 1 or 3, H, W] format,
+            where ... means it can have an arbitrary number of leading dimensions.
+        saturation_factor (float): How much to adjust the saturation. 0 will
+            give a black and white image, 1 will give the original image while
+            2 will enhance the saturation by a factor of 2.
+
+    Returns:
+        PIL Image or Tensor: Saturation adjusted image.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():  # usage hook runs only when neither scripting nor tracing
+        _log_api_usage_once(adjust_saturation)
+    if not isinstance(img, torch.Tensor):  # non-Tensor input goes to the F_pil implementation
+        return F_pil.adjust_saturation(img, saturation_factor)
+
+    return F_t.adjust_saturation(img, saturation_factor)  # Tensor input goes to the F_t implementation
+
+
+def adjust_hue(img: Tensor, hue_factor: float) -> Tensor:
+    """Adjust hue of an image.
+
+    The image hue is adjusted by converting the image to HSV and
+    cyclically shifting the intensities in the hue channel (H).
+    The image is then converted back to original image mode.
+
+    `hue_factor` is the amount of shift in H channel and must be in the
+    interval `[-0.5, 0.5]`.
+
+    See `Hue`_ for more details.
+
+    .. _Hue: https://en.wikipedia.org/wiki/Hue
+
+    Args:
+        img (PIL Image or Tensor): Image to be adjusted.
+            If img is torch Tensor, it is expected to be in [..., 1 or 3, H, W] format,
+            where ... means it can have an arbitrary number of leading dimensions.
+            If img is PIL Image mode "1", "I", "F" and modes with transparency (alpha channel) are not supported.
+            Note: the pixel values of the input image have to be non-negative for conversion to HSV space;
+            thus it does not work if you normalize your image to an interval with negative values,
+            or use an interpolation that generates negative values before using this function.
+        hue_factor (float): How much to shift the hue channel. Should be in
+            [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in
+            HSV space in positive and negative direction respectively.
+            0 means no shift. Therefore, both -0.5 and 0.5 will give an image
+            with complementary colors while 0 gives the original image.
+
+    Returns:
+        PIL Image or Tensor: Hue adjusted image.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():  # usage hook runs only when neither scripting nor tracing
+        _log_api_usage_once(adjust_hue)
+    if not isinstance(img, torch.Tensor):  # non-Tensor input goes to the F_pil implementation
+        return F_pil.adjust_hue(img, hue_factor)
+
+    return F_t.adjust_hue(img, hue_factor)  # Tensor input goes to the F_t implementation
+
+
+def adjust_gamma(img: Tensor, gamma: float, gain: float = 1) -> Tensor:
+    r"""Perform gamma correction on an image.
+
+    Also known as Power Law Transform. Intensities in RGB mode are adjusted
+    based on the following equation:
+
+    .. math::
+        I_{\text{out}} = 255 \times \text{gain} \times \left(\frac{I_{\text{in}}}{255}\right)^{\gamma}
+
+    See `Gamma Correction`_ for more details.
+
+    .. _Gamma Correction: https://en.wikipedia.org/wiki/Gamma_correction
+
+    Args:
+        img (PIL Image or Tensor): Image to be adjusted.
+            If img is torch Tensor, it is expected to be in [..., 1 or 3, H, W] format,
+            where ... means it can have an arbitrary number of leading dimensions.
+            If img is PIL Image, modes with transparency (alpha channel) are not supported.
+        gamma (float): Non negative real number, same as :math:`\gamma` in the equation.
+            gamma larger than 1 makes the shadows darker,
+            while gamma smaller than 1 makes dark regions lighter.
+        gain (float): The constant multiplier. Default is 1.
+    Returns:
+        PIL Image or Tensor: Gamma correction adjusted image.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():  # usage hook runs only when neither scripting nor tracing
+        _log_api_usage_once(adjust_gamma)
+    if not isinstance(img, torch.Tensor):  # non-Tensor input goes to the F_pil implementation
+        return F_pil.adjust_gamma(img, gamma, gain)
+
+    return F_t.adjust_gamma(img, gamma, gain)  # Tensor input goes to the F_t implementation
+
+
+def _get_inverse_affine_matrix(
+    center: list[float], angle: float, translate: list[float], scale: float, shear: list[float], inverted: bool = True
+) -> list[float]:
+    """Return the top two rows (row-major, 6 floats) of the affine matrix M, or of M^-1 when ``inverted`` is True."""
+
+    # Pillow requires inverse affine transformation matrix:
+    # Affine matrix is : M = T * C * RotateScaleShear * C^-1
+    #
+    # where T is translation matrix: [1, 0, tx | 0, 1, ty | 0, 0, 1]
+    #       C is translation matrix to keep center: [1, 0, cx | 0, 1, cy | 0, 0, 1]
+    #       RotateScaleShear is rotation with scale and shear matrix
+    #
+    #       RotateScaleShear(a, s, (sx, sy)) =
+    #       = R(a) * S(s) * SHy(sy) * SHx(sx)
+    #       = [ s*cos(a - sy)/cos(sy), s*(-cos(a - sy)*tan(sx)/cos(sy) - sin(a)), 0 ]
+    #         [ s*sin(a - sy)/cos(sy), s*(-sin(a - sy)*tan(sx)/cos(sy) + cos(a)), 0 ]
+    #         [ 0                    , 0                                      , 1 ]
+    # where R is a rotation matrix, S is a scaling matrix, and SHx and SHy are the shears:
+    # SHx(s) = [1, -tan(s)] and SHy(s) = [1      , 0]
+    #          [0, 1      ]              [-tan(s), 1]
+    #
+    # Thus, the inverse is M^-1 = C * RotateScaleShear^-1 * C^-1 * T^-1
+
+    rot = math.radians(angle)  # angle and both shear values are given in degrees
+    sx = math.radians(shear[0])
+    sy = math.radians(shear[1])
+
+    cx, cy = center
+    tx, ty = translate
+
+    # RSS without scaling
+    a = math.cos(rot - sy) / math.cos(sy)
+    b = -math.cos(rot - sy) * math.tan(sx) / math.cos(sy) - math.sin(rot)
+    c = math.sin(rot - sy) / math.cos(sy)
+    d = -math.sin(rot - sy) * math.tan(sx) / math.cos(sy) + math.cos(rot)
+
+    if inverted:
+        # Inverted rotation matrix with scale and shear
+        # det([[a, b], [c, d]]) == 1, since det(rotation) = 1 and det(shear) = 1
+        matrix = [d, -b, 0.0, -c, a, 0.0]  # adjugate of [[a, b], [c, d]]; entries 2 and 5 hold the translation
+        matrix = [x / scale for x in matrix]  # inverse of the uniform scale
+        # Apply inverse of translation and of center translation: RSS^-1 * C^-1 * T^-1
+        matrix[2] += matrix[0] * (-cx - tx) + matrix[1] * (-cy - ty)
+        matrix[5] += matrix[3] * (-cx - tx) + matrix[4] * (-cy - ty)
+        # Apply center translation: C * RSS^-1 * C^-1 * T^-1
+        matrix[2] += cx
+        matrix[5] += cy
+    else:
+        matrix = [a, b, 0.0, c, d, 0.0]  # forward matrix; entries 2 and 5 hold the translation
+        matrix = [x * scale for x in matrix]  # apply the uniform scale
+        # Apply inverse of center translation: RSS * C^-1
+        matrix[2] += matrix[0] * (-cx) + matrix[1] * (-cy)
+        matrix[5] += matrix[3] * (-cx) + matrix[4] * (-cy)
+        # Apply translation and center : T * C * RSS * C^-1
+        matrix[2] += cx + tx
+        matrix[5] += cy + ty
+
+    return matrix
+
+
+def rotate(
+    img: Tensor,
+    angle: float,
+    interpolation: InterpolationMode = InterpolationMode.NEAREST,
+    expand: bool = False,
+    center: Optional[list[int]] = None,
+    fill: Optional[list[float]] = None,
+) -> Tensor:
+    """Rotate the image by angle.
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
+
+    Args:
+        img (PIL Image or Tensor): image to be rotated.
+        angle (number): rotation angle value in degrees, counter-clockwise. Must be an int or a float.
+        interpolation (InterpolationMode): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        expand (bool, optional): Optional expansion flag.
+            If true, expands the output image to make it large enough to hold the entire rotated image.
+            If false or omitted, make the output image the same size as the input image.
+            Note that the expand flag assumes rotation around the center and no translation.
+        center (sequence, optional): Optional center of rotation. Origin is the upper left corner.
+            Default is the center of the image. Must be a list or tuple when given.
+        fill (sequence or number, optional): Pixel fill value for the area outside the transformed
+            image. If given a number, the value is used for all bands respectively.
+
+            .. note::
+                In torchscript mode single int/float value is not supported, please use a sequence
+                of length 1: ``[value, ]``.
+    Returns:
+        PIL Image or Tensor: Rotated image.
+
+    .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters
+
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():  # usage hook runs only when neither scripting nor tracing
+        _log_api_usage_once(rotate)
+
+    if isinstance(interpolation, int):  # Pillow integer constant: map it to an InterpolationMode
+        interpolation = _interpolation_modes_from_int(interpolation)
+    elif not isinstance(interpolation, InterpolationMode):
+        raise TypeError(
+            "Argument interpolation should be a InterpolationMode or a corresponding Pillow integer constant"
+        )
+
+    if not isinstance(angle, (int, float)):
+        raise TypeError("Argument angle should be int or float")
+
+    if center is not None and not isinstance(center, (list, tuple)):
+        raise TypeError("Argument center should be a sequence")
+
+    if not isinstance(img, torch.Tensor):  # non-Tensor input goes to the F_pil implementation
+        pil_interpolation = pil_modes_mapping[interpolation]
+        return F_pil.rotate(img, angle=angle, interpolation=pil_interpolation, expand=expand, center=center, fill=fill)
+
+    center_f = [0.0, 0.0]  # (0, 0) is the image center in the shifted frame used below
+    if center is not None:
+        _, height, width = get_dimensions(img)
+        # Center values should be in pixel coordinates but translated such that (0, 0) corresponds to image center.
+        center_f = [1.0 * (c - s * 0.5) for c, s in zip(center, [width, height])]
+
+    # Due to the current inconsistency of rotation direction between the affine and rotate implementations
+    # we need to pass -angle; rotate uses no translation, unit scale and no shear.
+    matrix = _get_inverse_affine_matrix(center_f, -angle, [0.0, 0.0], 1.0, [0.0, 0.0])
+    return F_t.rotate(img, matrix=matrix, interpolation=interpolation.value, expand=expand, fill=fill)
+
+
+def affine(
+    img: Tensor,
+    angle: float,
+    translate: list[int],
+    scale: float,
+    shear: list[float],
+    interpolation: InterpolationMode = InterpolationMode.NEAREST,
+    fill: Optional[list[float]] = None,
+    center: Optional[list[int]] = None,
+) -> Tensor:
+    """Apply affine transformation on the image keeping image center invariant.
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
+
+    Args:
+        img (PIL Image or Tensor): image to transform.
+        angle (number): rotation angle in degrees between -180 and 180, clockwise direction.
+        translate (sequence of integers): horizontal and vertical translations (post-rotation translation)
+        scale (float): overall scale
+        shear (float or sequence): shear angle value in degrees between -180 to 180, clockwise direction.
+            If a sequence is specified, the first value corresponds to a shear parallel to the x-axis, while
+            the second value corresponds to a shear parallel to the y-axis.
+        interpolation (InterpolationMode): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        fill (sequence or number, optional): Pixel fill value for the area outside the transformed
+            image. If given a number, the value is used for all bands respectively.
+
+            .. note::
+                In torchscript mode single int/float value is not supported, please use a sequence
+                of length 1: ``[value, ]``.
+        center (sequence, optional): Optional center of rotation. Origin is the upper left corner.
+            Default is the center of the image.
+
+    Returns:
+        PIL Image or Tensor: Transformed image.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(affine)
+
+    if isinstance(interpolation, int):
+        interpolation = _interpolation_modes_from_int(interpolation)
+    elif not isinstance(interpolation, InterpolationMode):
+        raise TypeError(
+            "Argument interpolation should be a InterpolationMode or a corresponding Pillow integer constant"
+        )
+
+    if not isinstance(angle, (int, float)):
+        raise TypeError("Argument angle should be int or float")
+
+    if not isinstance(translate, (list, tuple)):
+        raise TypeError("Argument translate should be a sequence")
+
+    if len(translate) != 2:
+        raise ValueError("Argument translate should be a sequence of length 2")
+
+    if scale <= 0.0:
+        raise ValueError("Argument scale should be positive")
+
+    if not isinstance(shear, (numbers.Number, (list, tuple))):
+        raise TypeError("Shear should be either a single value or a sequence of two values")
+
+    if isinstance(angle, int):
+        angle = float(angle)
+
+    if isinstance(translate, tuple):
+        translate = list(translate)
+
+    if isinstance(shear, numbers.Number):
+        shear = [shear, 0.0]
+
+    if isinstance(shear, tuple):
+        shear = list(shear)
+
+    if len(shear) == 1:
+        shear = [shear[0], shear[0]]
+
+    if len(shear) != 2:
+        raise ValueError(f"Shear should be a sequence containing two values. Got {shear}")
+
+    if center is not None and not isinstance(center, (list, tuple)):
+        raise TypeError("Argument center should be a sequence")
+
+    _, height, width = get_dimensions(img)
+    if not isinstance(img, torch.Tensor):
+        # center = (width * 0.5 + 0.5, height * 0.5 + 0.5)
+        # it is visually better to estimate the center without 0.5 offset
+        # otherwise image rotated by 90 degrees is shifted vs output image of torch.rot90 or F_t.affine
+        if center is None:
+            center = [width * 0.5, height * 0.5]
+        matrix = _get_inverse_affine_matrix(center, angle, translate, scale, shear)
+        pil_interpolation = pil_modes_mapping[interpolation]
+        return F_pil.affine(img, matrix=matrix, interpolation=pil_interpolation, fill=fill)
+
+    center_f = [0.0, 0.0]
+    if center is not None:
+        _, height, width = get_dimensions(img)
+        # Center values should be in pixel coordinates but translated such that (0, 0) corresponds to image center.
+        center_f = [1.0 * (c - s * 0.5) for c, s in zip(center, [width, height])]
+
+    translate_f = [1.0 * t for t in translate]
+    matrix = _get_inverse_affine_matrix(center_f, angle, translate_f, scale, shear)
+    return F_t.affine(img, matrix=matrix, interpolation=interpolation.value, fill=fill)
+
+
+# to_grayscale() appears to be a stand-alone functional that is never called
+# from the transform classes. It is presumably kept for backward compatibility
+# (BC); this has not been verified.
+@torch.jit.unused
+def to_grayscale(img, num_output_channels=1):
+    """Convert PIL image of any mode (RGB, HSV, LAB, etc) to grayscale version of image.
+    This transform does not support torch Tensor.
+
+    Args:
+        img (PIL Image): PIL Image to be converted to grayscale.
+        num_output_channels (int): number of channels of the output image. Value can be 1 or 3. Default is 1.
+
+    Returns:
+        PIL Image: Grayscale version of the image.
+
+        - if num_output_channels = 1 : returned image is single channel
+        - if num_output_channels = 3 : returned image is 3 channel with r = g = b
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(to_grayscale)
+    if isinstance(img, Image.Image):
+        return F_pil.to_grayscale(img, num_output_channels)
+
+    raise TypeError("Input should be PIL Image")
+
+
+def rgb_to_grayscale(img: Tensor, num_output_channels: int = 1) -> Tensor:
+    """Convert RGB image to grayscale version of image.
+    If the image is torch Tensor, it is expected
+    to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
+
+    Note:
+        Please, note that this method supports only RGB images as input. For inputs in other color spaces,
+        please, consider using :meth:`~torchvision.transforms.functional.to_grayscale` with PIL Image.
+
+    Args:
+        img (PIL Image or Tensor): RGB Image to be converted to grayscale.
+        num_output_channels (int): number of channels of the output image. Value can be 1 or 3. Default is 1.
+
+    Returns:
+        PIL Image or Tensor: Grayscale version of the image.
+
+        - if num_output_channels = 1 : returned image is single channel
+        - if num_output_channels = 3 : returned image is 3 channel with r = g = b
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():  # usage hook runs only when neither scripting nor tracing
+        _log_api_usage_once(rgb_to_grayscale)
+    if not isinstance(img, torch.Tensor):  # non-Tensor input goes to F_pil.to_grayscale
+        return F_pil.to_grayscale(img, num_output_channels)
+
+    return F_t.rgb_to_grayscale(img, num_output_channels)  # Tensor input goes to the F_t implementation
+
+
+def erase(img: Tensor, i: int, j: int, h: int, w: int, v: Tensor, inplace: bool = False) -> Tensor:
+    """Erase the input Tensor Image with given value.
+    This transform does not support PIL Image.
+
+    Args:
+        img (Tensor Image): Tensor image of size (C, H, W) to be erased
+        i (int): i in (i,j) i.e coordinates of the upper left corner.
+        j (int): j in (i,j) i.e coordinates of the upper left corner.
+        h (int): Height of the erased region.
+        w (int): Width of the erased region.
+        v (Tensor): Erasing value.
+        inplace (bool, optional): For in-place operations. By default, is set False.
+
+    Returns:
+        Tensor Image: Erased image.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():  # usage hook runs only when neither scripting nor tracing
+        _log_api_usage_once(erase)
+    if not isinstance(img, torch.Tensor):  # only Tensor input is accepted; anything else raises TypeError
+        raise TypeError(f"img should be Tensor Image. Got {type(img)}")
+
+    return F_t.erase(img, i, j, h, w, v, inplace=inplace)
+
+
+def gaussian_blur(img: Tensor, kernel_size: list[int], sigma: Optional[list[float]] = None) -> Tensor:
+    """Performs Gaussian blurring on the image by given kernel
+
+    The convolution will be using reflection padding corresponding to the kernel size, to maintain the input shape.
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means at most one leading dimension.
+
+    Args:
+        img (PIL Image or Tensor): Image to be blurred
+        kernel_size (sequence of ints or int): Gaussian kernel size. Can be a sequence of integers
+            like ``(kx, ky)`` or a single integer for square kernels.
+
+            .. note::
+                In torchscript mode kernel_size as single int is not supported, use a sequence of
+                length 1: ``[ksize, ]``.
+        sigma (sequence of floats or float, optional): Gaussian kernel standard deviation. Can be a
+            sequence of floats like ``(sigma_x, sigma_y)`` or a single float to define the
+            same sigma in both X/Y directions. If None, then it is computed using
+            ``kernel_size`` as ``sigma = 0.3 * ((kernel_size - 1) * 0.5 - 1) + 0.8``.
+            Default, None.
+
+            .. note::
+                In torchscript mode sigma as single float is
+                not supported, use a sequence of length 1: ``[sigma, ]``.
+
+    Returns:
+        PIL Image or Tensor: Gaussian Blurred version of the image.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():  # usage hook runs only when neither scripting nor tracing
+        _log_api_usage_once(gaussian_blur)
+    if not isinstance(kernel_size, (int, list, tuple)):
+        raise TypeError(f"kernel_size should be int or a sequence of integers. Got {type(kernel_size)}")
+    if isinstance(kernel_size, int):  # a single int means a square kernel
+        kernel_size = [kernel_size, kernel_size]
+    if len(kernel_size) != 2:
+        raise ValueError(f"If kernel_size is a sequence its length should be 2. Got {len(kernel_size)}")
+    for ksize in kernel_size:
+        if ksize % 2 == 0 or ksize < 0:  # rejects even sizes (0 included) and negative sizes
+            raise ValueError(f"kernel_size should have odd and positive integers. Got {kernel_size}")
+
+    if sigma is None:
+        sigma = [ksize * 0.15 + 0.35 for ksize in kernel_size]  # same value as 0.3 * ((ksize - 1) * 0.5 - 1) + 0.8
+
+    if sigma is not None and not isinstance(sigma, (int, float, list, tuple)):
+        raise TypeError(f"sigma should be either float or sequence of floats. Got {type(sigma)}")
+    if isinstance(sigma, (int, float)):  # a single number is used for both directions
+        sigma = [float(sigma), float(sigma)]
+    if isinstance(sigma, (list, tuple)) and len(sigma) == 1:  # a length-1 sequence is used for both directions
+        sigma = [sigma[0], sigma[0]]
+    if len(sigma) != 2:
+        raise ValueError(f"If sigma is a sequence, its length should be 2. Got {len(sigma)}")
+    for s in sigma:
+        if s <= 0.0:
+            raise ValueError(f"sigma should have positive values. Got {sigma}")
+
+    t_img = img
+    if not isinstance(img, torch.Tensor):
+        if not F_pil._is_pil_image(img):
+            raise TypeError(f"img should be PIL Image or Tensor. Got {type(img)}")
+
+        t_img = pil_to_tensor(img)  # PIL input is blurred through the tensor implementation
+
+    output = F_t.gaussian_blur(t_img, kernel_size, sigma)
+
+    if not isinstance(img, torch.Tensor):
+        output = to_pil_image(output, mode=img.mode)  # convert back, keeping the input's PIL mode
+    return output
+
+
+def invert(img: Tensor) -> Tensor:
+    """Invert the colors of an RGB/grayscale image.
+
+    Args:
+        img (PIL Image or Tensor): Image to have its colors inverted.
+            If img is torch Tensor, it is expected to be in [..., 1 or 3, H, W] format,
+            where ... means it can have an arbitrary number of leading dimensions.
+            If img is PIL Image, it is expected to be in mode "L" or "RGB".
+
+    Returns:
+        PIL Image or Tensor: Color inverted image.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():  # usage hook runs only when neither scripting nor tracing
+        _log_api_usage_once(invert)
+    if not isinstance(img, torch.Tensor):  # non-Tensor input goes to the F_pil implementation
+        return F_pil.invert(img)
+
+    return F_t.invert(img)  # Tensor input goes to the F_t implementation
+
+
+def posterize(img: Tensor, bits: int) -> Tensor:
+    """Posterize an image by reducing the number of bits for each color channel.
+
+    Args:
+        img (PIL Image or Tensor): Image to have its colors posterized.
+            If img is torch Tensor, it should be of type torch.uint8, and
+            it is expected to be in [..., 1 or 3, H, W] format, where ... means
+            it can have an arbitrary number of leading dimensions.
+            If img is PIL Image, it is expected to be in mode "L" or "RGB".
+        bits (int): The number of bits to keep for each channel (0-8).
+    Returns:
+        PIL Image or Tensor: Posterized image.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(posterize)
+    if not (0 <= bits <= 8):
+        raise ValueError(f"The number if bits should be between 0 and 8. Got {bits}")
+
+    if not isinstance(img, torch.Tensor):
+        return F_pil.posterize(img, bits)
+
+    return F_t.posterize(img, bits)
+
+
+def solarize(img: Tensor, threshold: float) -> Tensor:
+    """Solarize an RGB/grayscale image by inverting all pixel values above a threshold.
+
+    Args:
+        img (PIL Image or Tensor): Image to have its colors inverted.
+            If img is torch Tensor, it is expected to be in [..., 1 or 3, H, W] format,
+            where ... means it can have an arbitrary number of leading dimensions.
+            If img is PIL Image, it is expected to be in mode "L" or "RGB".
+        threshold (float): All pixels equal or above this value are inverted.
+    Returns:
+        PIL Image or Tensor: Solarized image.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():  # usage hook runs only when neither scripting nor tracing
+        _log_api_usage_once(solarize)
+    if not isinstance(img, torch.Tensor):  # non-Tensor input goes to the F_pil implementation
+        return F_pil.solarize(img, threshold)
+
+    return F_t.solarize(img, threshold)  # Tensor input goes to the F_t implementation
+
+
+def adjust_sharpness(img: Tensor, sharpness_factor: float) -> Tensor:
+    """Adjust the sharpness of an image.
+
+    Args:
+        img (PIL Image or Tensor): Image to be adjusted.
+            If img is torch Tensor, it is expected to be in [..., 1 or 3, H, W] format,
+            where ... means it can have an arbitrary number of leading dimensions.
+        sharpness_factor (float): How much to adjust the sharpness. Can be
+            any non-negative number. 0 gives a blurred image, 1 gives the
+            original image while 2 increases the sharpness by a factor of 2.
+
+    Returns:
+        PIL Image or Tensor: Sharpness adjusted image.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():  # usage hook runs only when neither scripting nor tracing
+        _log_api_usage_once(adjust_sharpness)
+    if not isinstance(img, torch.Tensor):  # non-Tensor input goes to the F_pil implementation
+        return F_pil.adjust_sharpness(img, sharpness_factor)
+
+    return F_t.adjust_sharpness(img, sharpness_factor)  # Tensor input goes to the F_t implementation
+
+
+def autocontrast(img: Tensor) -> Tensor:
+    """Maximize contrast of an image by remapping its
+    pixels per channel so that the lowest becomes black and the lightest
+    becomes white.
+
+    Args:
+        img (PIL Image or Tensor): Image to autocontrast. Tensors use ``F_t``, any other input uses ``F_pil``.
+            If img is torch Tensor, it is expected to be in [..., 1 or 3, H, W] format,
+            where ... means it can have an arbitrary number of leading dimensions.
+            If img is PIL Image, it is expected to be in mode "L" or "RGB".
+
+    Returns:
+        PIL Image or Tensor: An image that was autocontrasted.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():  # no usage logging under script/trace
+        _log_api_usage_once(autocontrast)
+    if not isinstance(img, torch.Tensor):
+        return F_pil.autocontrast(img)
+
+    return F_t.autocontrast(img)
+
+
+def equalize(img: Tensor) -> Tensor:
+    """Equalize the histogram of an image by applying
+    a non-linear mapping to the input in order to create a uniform
+    distribution of grayscale values in the output.
+
+    Args:
+        img (PIL Image or Tensor): Image to equalize. Tensors use ``F_t``, any other input uses ``F_pil``.
+            If img is torch Tensor, it is expected to be in [..., 1 or 3, H, W] format,
+            where ... means it can have an arbitrary number of leading dimensions.
+            The tensor dtype must be ``torch.uint8`` and values are expected to be in ``[0, 255]``.
+            If img is PIL Image, it is expected to be in mode "P", "L" or "RGB".
+
+    Returns:
+        PIL Image or Tensor: An image that was equalized.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():  # no usage logging under script/trace
+        _log_api_usage_once(equalize)
+    if not isinstance(img, torch.Tensor):
+        return F_pil.equalize(img)
+
+    return F_t.equalize(img)
+
+
+def elastic_transform(
+    img: Tensor,
+    displacement: Tensor,
+    interpolation: InterpolationMode = InterpolationMode.BILINEAR,
+    fill: Optional[list[float]] = None,
+) -> Tensor:
+    """Transform an image with a caller-supplied elastic displacement field.
+
+    This function does not generate the displacement itself: ``displacement`` is validated
+    and passed to the tensor backend, which is expected to add it to an identity grid and
+    grid_sample from the image with the resulting grid.
+
+    Applications:
+        Randomly transforms the morphology of objects in images and produces a
+        see-through-water-like effect.
+
+    Args:
+        img (PIL Image or Tensor): Image on which elastic_transform is applied.
+            If img is torch Tensor, it is expected to be in [..., 1 or 3, H, W] format,
+            where ... means it can have an arbitrary number of leading dimensions.
+            If img is PIL Image, it is converted to a tensor first; expected modes are "P", "L" or "RGB".
+        displacement (Tensor): The displacement field. Must have shape [1, H, W, 2], otherwise
+            ``ValueError`` is raised; a non-Tensor value raises ``TypeError``.
+        interpolation (InterpolationMode): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
+            Integer constants are still accepted, with a warning, and are mapped to the enum.
+        fill (list of float, optional): Fill value, passed unchanged to the tensor backend. Default is ``None``.
+
+    Returns:
+        PIL Image or Tensor: Transformed image; PIL inputs are returned as PIL images of the same mode.
+    """
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(elastic_transform)
+    # Backward compatibility: integer constants are mapped to InterpolationMode
+    if isinstance(interpolation, int):
+        warnings.warn(
+            "Argument interpolation should be of type InterpolationMode instead of int. "
+            "Please, use InterpolationMode enum."
+        )
+        interpolation = _interpolation_modes_from_int(interpolation)
+
+    if not isinstance(displacement, torch.Tensor):
+        raise TypeError("Argument displacement should be a Tensor")
+
+    t_img = img  # tensor view of the input; ``img`` itself is kept to restore the PIL mode later
+    if not isinstance(img, torch.Tensor):
+        if not F_pil._is_pil_image(img):
+            raise TypeError(f"img should be PIL Image or Tensor. Got {type(img)}")
+        t_img = pil_to_tensor(img)
+
+    shape = t_img.shape
+    shape = (1,) + shape[-2:] + (2,)  # required displacement shape: (1, H, W, 2)
+    if shape != displacement.shape:
+        raise ValueError(f"Argument displacement shape should be {shape}, but given {displacement.shape}")
+
+    # TODO: if image shape is [N1, N2, ..., C, H, W] and
+    # displacement is [1, H, W, 2] we need to reshape the input image
+    # so that grid_sampler takes the internal code path for 4D input
+
+    output = F_t.elastic_transform(
+        t_img,
+        displacement,
+        interpolation=interpolation.value,
+        fill=fill,
+    )
+
+    if not isinstance(img, torch.Tensor):  # the input was a PIL image: convert the result back
+        output = to_pil_image(output, mode=img.mode)
+    return output
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/transforms.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6595a3402ee970e8751e1ecb2068db5b91805c6
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/transforms.py
@@ -0,0 +1,2161 @@
+import math
+import numbers
+import random
+import warnings
+from collections.abc import Sequence
+from typing import Optional, Union
+
+import torch
+from torch import Tensor
+
+try:
+    import accimage
+except ImportError:
+    accimage = None
+
+from ..utils import _log_api_usage_once
+from . import functional as F
+from .functional import _interpolation_modes_from_int, InterpolationMode
+
+__all__ = [
+    "Compose",
+    "ToTensor",
+    "PILToTensor",
+    "ConvertImageDtype",
+    "ToPILImage",
+    "Normalize",
+    "Resize",
+    "CenterCrop",
+    "Pad",
+    "Lambda",
+    "RandomApply",
+    "RandomChoice",
+    "RandomOrder",
+    "RandomCrop",
+    "RandomHorizontalFlip",
+    "RandomVerticalFlip",
+    "RandomResizedCrop",
+    "FiveCrop",
+    "TenCrop",
+    "LinearTransformation",
+    "ColorJitter",
+    "RandomRotation",
+    "RandomAffine",
+    "Grayscale",
+    "RandomGrayscale",
+    "RandomPerspective",
+    "RandomErasing",
+    "GaussianBlur",
+    "InterpolationMode",
+    "RandomInvert",
+    "RandomPosterize",
+    "RandomSolarize",
+    "RandomAdjustSharpness",
+    "RandomAutocontrast",
+    "RandomEqualize",
+    "ElasticTransform",
+]
+
+
+class Compose:
+    """Chains several transforms into a single callable. This transform does not support torchscript.
+    Please, see the note below.
+
+    Args:
+        transforms (list of ``Transform`` objects): transforms to apply, in the given order.
+
+    Example:
+        >>> transforms.Compose([
+        >>>     transforms.CenterCrop(10),
+        >>>     transforms.PILToTensor(),
+        >>>     transforms.ConvertImageDtype(torch.float),
+        >>> ])
+
+    .. note::
+        To script a pipeline of transformations, build it with ``torch.nn.Sequential`` as below.
+
+        >>> transforms = torch.nn.Sequential(
+        >>>     transforms.CenterCrop(10),
+        >>>     transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+        >>> )
+        >>> scripted_transforms = torch.jit.script(transforms)
+
+        Make sure to use only scriptable transformations, i.e. ones that work with ``torch.Tensor`` and
+        do not require `lambda` functions or ``PIL.Image``.
+
+    """
+
+    def __init__(self, transforms):
+        jit_active = torch.jit.is_scripting() or torch.jit.is_tracing()
+        if not jit_active:
+            _log_api_usage_once(self)
+        self.transforms = transforms
+
+    def __call__(self, img):
+        # Feed the output of each transform into the next one.
+        for step in self.transforms:
+            img = step(img)
+        return img
+
+    def __repr__(self) -> str:
+        # Layout: "ClassName(", one four-space-indented line per transform,
+        # then the closing ")" on a line of its own.
+        body = "".join(f"\n    {step}" for step in self.transforms)
+        return f"{self.__class__.__name__}({body}\n)"
+
+
+class ToTensor:
+    """Turn a PIL Image or ndarray into a tensor, rescaling the values where applicable.
+
+    This transform does not support torchscript.
+
+    A PIL Image or numpy.ndarray (H x W x C) with values in the range [0, 255] becomes
+    a torch.FloatTensor of shape (C x H x W) with values in the range [0.0, 1.0], provided
+    that the PIL Image belongs to one of the modes (L, LA, P, I, F, RGB, YCbCr, RGBA, CMYK, 1)
+    or that the numpy.ndarray has dtype = np.uint8
+
+    In the other cases, tensors are returned without scaling.
+
+    .. note::
+        Because the input image is scaled to [0.0, 1.0], this transformation should not be used when
+        transforming target image masks. See the `references`_ for implementing the transforms for image masks.
+
+    .. _references: https://github.com/pytorch/vision/tree/main/references/segmentation
+    """
+
+    def __init__(self) -> None:
+        _log_api_usage_once(self)
+
+    def __call__(self, pic):
+        """
+        Args:
+            pic (PIL Image or numpy.ndarray): Image to convert.
+
+        Returns:
+            Tensor: The result of ``F.to_tensor(pic)``.
+        """
+        return F.to_tensor(pic)
+
+    def __repr__(self) -> str:
+        return type(self).__name__ + "()"
+
+
+class PILToTensor:
+    """Turn a PIL Image into a tensor of the same type - values are not rescaled.
+
+    This transform does not support torchscript.
+
+    A PIL Image with H height, W width, and C channels becomes a Tensor of shape (C x H x W).
+
+    Example:
+        >>> from PIL import Image
+        >>> import torchvision.transforms as T
+        >>> img = Image.new("RGB", (320, 240))  # size (W=320, H=240)
+        >>> tensor = T.PILToTensor()(img)
+        >>> print(tensor.shape)
+        torch.Size([3, 240, 320])
+    """
+
+    def __init__(self) -> None:
+        _log_api_usage_once(self)
+
+    def __call__(self, pic):
+        """
+        .. note::
+
+            The underlying array is deep-copied.
+
+        Args:
+            pic (PIL Image): Image to convert.
+
+        Returns:
+            Tensor: The result of ``F.pil_to_tensor(pic)``.
+        """
+        return F.pil_to_tensor(pic)
+
+    def __repr__(self) -> str:
+        return type(self).__name__ + "()"
+
+
+class ConvertImageDtype(torch.nn.Module):
+    """Convert a tensor image to the given ``dtype`` and scale the values accordingly.
+
+    This transform does not support PIL Image.
+
+    Args:
+        dtype (torch.dtype): Desired data type of the output; stored and passed to ``F.convert_image_dtype``.
+
+    .. note::
+
+        When converting from a smaller to a larger integer ``dtype`` the maximum values are **not** mapped exactly.
+        If converted back and forth, this mismatch has no effect.
+
+    Raises:
+        RuntimeError: When trying to cast :class:`torch.float32` to :class:`torch.int32` or :class:`torch.int64` as
+            well as for trying to cast :class:`torch.float64` to :class:`torch.int64`. These conversions might lead to
+            overflow errors since the floating point ``dtype`` cannot store consecutive integers over the whole range
+            of the integer ``dtype``.
+    """
+
+    def __init__(self, dtype: torch.dtype) -> None:
+        super().__init__()
+        _log_api_usage_once(self)
+        self.dtype = dtype
+
+    def forward(self, image):
+        return F.convert_image_dtype(image, self.dtype)  # conversion and scaling happen in the functional API
+
+
+class ToPILImage:
+    """Turn a tensor or an ndarray into a PIL Image
+
+    This transform does not support torchscript.
+
+    A torch.*Tensor of shape C x H x W or a numpy ndarray of shape H x W x C is
+    converted to a PIL Image, with the value range adjusted depending on the ``mode``.
+
+    Args:
+        mode (`PIL.Image mode`_): color space and pixel depth of input data (optional).
+            When ``mode`` is ``None`` (default), the following is assumed about the input data:
+
+            - If the input has 4 channels, the ``mode`` is assumed to be ``RGBA``.
+            - If the input has 3 channels, the ``mode`` is assumed to be ``RGB``.
+            - If the input has 2 channels, the ``mode`` is assumed to be ``LA``.
+            - If the input has 1 channel, the ``mode`` is determined by the data type (i.e ``int``, ``float``, ``short``).
+
+    .. _PIL.Image mode: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#concept-modes
+    """
+
+    def __init__(self, mode=None):
+        _log_api_usage_once(self)
+        self.mode = mode
+
+    def __call__(self, pic):
+        """
+        Args:
+            pic (Tensor or numpy.ndarray): Image to convert.
+
+        Returns:
+            PIL Image: The result of ``F.to_pil_image(pic, self.mode)``.
+
+        """
+        return F.to_pil_image(pic, self.mode)
+
+    def __repr__(self) -> str:
+        # The mode only appears when one was explicitly configured.
+        if self.mode is None:
+            return f"{self.__class__.__name__}()"
+        mode_part = f"mode={self.mode}"
+        return f"{self.__class__.__name__}({mode_part})"
+
+
+class Normalize(torch.nn.Module):
+    """Normalize a tensor image with mean and standard deviation.
+    This transform does not support PIL Image.
+    Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n``
+    channels, this transform will normalize each channel of the input
+    ``torch.*Tensor`` i.e.,
+    ``output[channel] = (input[channel] - mean[channel]) / std[channel]``
+
+    .. note::
+        By default (``inplace=False``) this transform acts out of place, i.e., it does not mutate the input tensor.
+
+    Args:
+        mean (sequence): Sequence of means for each channel.
+        std (sequence): Sequence of standard deviations for each channel.
+        inplace (bool, optional): Bool to make this operation in-place. Default is ``False``.
+
+    """
+
+    def __init__(self, mean, std, inplace=False):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.mean = mean
+        self.std = std
+        self.inplace = inplace
+
+    def forward(self, tensor: Tensor) -> Tensor:
+        """
+        Args:
+            tensor (Tensor): Tensor image to be normalized.
+
+        Returns:
+            Tensor: Normalized Tensor image.
+        """
+        return F.normalize(tensor, self.mean, self.std, self.inplace)
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(mean={self.mean}, std={self.std})"  # ``inplace`` is not shown
+
+
+class Resize(torch.nn.Module):
+    """Resize the input image to the given size.
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means a maximum of two leading dimensions
+
+    Args:
+        size (sequence or int): Desired output size. If size is a sequence like
+            (h, w), output size will be matched to this. If size is an int,
+            smaller edge of the image will be matched to this number.
+            i.e, if height > width, then image will be rescaled to
+            (size * height / width, size).
+
+            .. note::
+                In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``.
+        interpolation (InterpolationMode): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
+            ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        max_size (int, optional): The maximum allowed for the longer edge of
+            the resized image. If the longer edge of the image is greater
+            than ``max_size`` after being resized according to ``size``,
+            ``size`` will be overruled so that the longer edge is equal to
+            ``max_size``.
+            As a result, the smaller edge may be shorter than ``size``. This
+            is only supported if ``size`` is an int (or a sequence of length
+            1 in torchscript mode).
+        antialias (bool, optional): Whether to apply antialiasing.
+            It only affects **tensors** with bilinear or bicubic modes and it is
+            ignored otherwise: on PIL images, antialiasing is always applied on
+            bilinear or bicubic modes; on other modes (for PIL images and
+            tensors), antialiasing makes no sense and this parameter is ignored.
+            Possible values are:
+
+            - ``True`` (default): will apply antialiasing for bilinear or bicubic modes.
+              Other modes aren't affected. This is probably what you want to use.
+            - ``False``: will not apply antialiasing for tensors on any mode. PIL
+              images are still antialiased on bilinear or bicubic modes, because
+              PIL doesn't support no antialias.
+            - ``None``: equivalent to ``False`` for tensors and ``True`` for
+              PIL images. This value exists for legacy reasons and you probably
+              don't want to use it unless you really know what you are doing.
+
+            The default value changed from ``None`` to ``True`` in
+            v0.17, for the PIL and Tensor backends to be consistent.
+    """
+
+    def __init__(self, size, interpolation=InterpolationMode.BILINEAR, max_size=None, antialias=True):
+        super().__init__()
+        _log_api_usage_once(self)
+        if not isinstance(size, (int, Sequence)):
+            raise TypeError(f"Size should be int or sequence. Got {type(size)}")
+        if isinstance(size, Sequence) and len(size) not in (1, 2):
+            raise ValueError("If size is a sequence, it should have 1 or 2 values")
+        self.size = size
+        self.max_size = max_size
+
+        if isinstance(interpolation, int):  # integer constant: map it to the InterpolationMode enum
+            interpolation = _interpolation_modes_from_int(interpolation)
+
+        self.interpolation = interpolation
+        self.antialias = antialias
+
+    def forward(self, img):
+        """
+        Args:
+            img (PIL Image or Tensor): Image to be scaled.
+
+        Returns:
+            PIL Image or Tensor: Rescaled image.
+        """
+        return F.resize(img, self.size, self.interpolation, self.max_size, self.antialias)
+
+    def __repr__(self) -> str:
+        detail = f"(size={self.size}, interpolation={self.interpolation.value}, max_size={self.max_size}, antialias={self.antialias})"
+        return f"{self.__class__.__name__}{detail}"
+
+
+class CenterCrop(torch.nn.Module):
+    """Crops the given image at the center.
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
+    If image size is smaller than output size along any edge, image is padded with 0 and then center cropped.
+
+    Args:
+        size (sequence or int): Desired output size of the crop. If size is an
+            int instead of sequence like (h, w), a square crop (size, size) is
+            made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
+    """
+
+    def __init__(self, size):
+        super().__init__()
+        _log_api_usage_once(self)
+        # ``size`` is passed through ``_setup_size`` before being stored; the message is used for invalid sizes.
+        self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
+
+    def forward(self, img):
+        """
+        Args:
+            img (PIL Image or Tensor): Image to be cropped.
+
+        Returns:
+            PIL Image or Tensor: Cropped image.
+        """
+        return F.center_crop(img, self.size)
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(size={self.size})"
+
+
+class Pad(torch.nn.Module):
+    """Pad the given image on all sides with the given "pad" value.
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means at most 2 leading dimensions for mode reflect and symmetric,
+    at most 3 leading dimensions for mode edge,
+    and an arbitrary number of leading dimensions for mode constant
+
+    Args:
+        padding (int or sequence): Padding on each border. If a single int is provided this
+            is used to pad all borders. If sequence of length 2 is provided this is the padding
+            on left/right and top/bottom respectively. If a sequence of length 4 is provided
+            this is the padding for the left, top, right and bottom borders respectively.
+
+            .. note::
+                In torchscript mode padding as single int is not supported, use a sequence of
+                length 1: ``[padding, ]``.
+        fill (number or tuple): Pixel fill value for constant fill. Default is 0. If a tuple of
+            length 3, it is used to fill R, G, B channels respectively.
+            This value is only used when the padding_mode is constant.
+            Only number is supported for torch Tensor.
+            Only int or tuple value is supported for PIL Image.
+        padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric.
+            Default is constant.
+
+            - constant: pads with a constant value, this value is specified with fill
+
+            - edge: pads with the last value at the edge of the image.
+              If the input is a 5D torch Tensor, the last 3 dimensions will be padded instead of the last 2
+
+            - reflect: pads with reflection of image without repeating the last value on the edge.
+              For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode
+              will result in [3, 2, 1, 2, 3, 4, 3, 2]
+
+            - symmetric: pads with reflection of image repeating the last value on the edge.
+              For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode
+              will result in [2, 1, 1, 2, 3, 4, 4, 3]
+    """
+
+    def __init__(self, padding, fill=0, padding_mode="constant"):
+        super().__init__()
+        _log_api_usage_once(self)
+        if not isinstance(padding, (numbers.Number, tuple, list)):
+            raise TypeError("Got inappropriate padding arg")
+
+        if not isinstance(fill, (numbers.Number, tuple, list)):
+            raise TypeError("Got inappropriate fill arg")
+
+        if padding_mode not in ["constant", "edge", "reflect", "symmetric"]:
+            raise ValueError("Padding mode should be either constant, edge, reflect or symmetric")
+
+        if isinstance(padding, Sequence) and len(padding) not in [1, 2, 4]:  # only 1, 2 or 4 entries are valid
+            raise ValueError(
+                f"Padding must be an int or a 1, 2, or 4 element tuple, not a {len(padding)} element tuple"
+            )
+
+        self.padding = padding
+        self.fill = fill
+        self.padding_mode = padding_mode
+
+    def forward(self, img):
+        """
+        Args:
+            img (PIL Image or Tensor): Image to be padded.
+
+        Returns:
+            PIL Image or Tensor: Padded image.
+        """
+        return F.pad(img, self.padding, self.fill, self.padding_mode)
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(padding={self.padding}, fill={self.fill}, padding_mode={self.padding_mode})"
+
+
+class Lambda:
+    """Wrap a user-defined callable so it acts as a transform. This transform does not support torchscript.
+
+    Args:
+        lambd (function): Lambda/function that is called on every input.
+    """
+
+    def __init__(self, lambd):
+        _log_api_usage_once(self)
+        if not callable(lambd):
+            raise TypeError(f"Argument lambd should be callable, got {type(lambd).__name__!r}")
+        self.lambd = lambd
+
+    def __call__(self, img):
+        return self.lambd(img)
+
+    def __repr__(self) -> str:
+        return type(self).__name__ + "()"
+
+
+class RandomTransforms:
+    """Common base for transforms that apply a list of transformations with randomness
+
+    Args:
+        transforms (sequence): list of transformations
+    """
+
+    def __init__(self, transforms):
+        _log_api_usage_once(self)
+        if isinstance(transforms, Sequence):
+            self.transforms = transforms
+        else:
+            raise TypeError("Argument transforms should be a sequence")
+
+    def __call__(self, *args, **kwargs):
+        # The base class defines no application strategy of its own.
+        raise NotImplementedError()
+
+    def __repr__(self) -> str:
+        # Layout: "ClassName(", one four-space-indented line per transform,
+        # then the closing ")" on a line of its own.
+        body = "".join(f"\n    {step}" for step in self.transforms)
+        return f"{self.__class__.__name__}({body}\n)"
+
+
+class RandomApply(torch.nn.Module):
+    """Apply randomly a list of transformations with a given probability.
+
+    .. note::
+        In order to script the transformation, please use ``torch.nn.ModuleList`` as input instead of list/tuple of
+        transforms as shown below:
+
+        >>> transforms = transforms.RandomApply(torch.nn.ModuleList([
+        >>>     transforms.ColorJitter(),
+        >>> ]), p=0.3)
+        >>> scripted_transforms = torch.jit.script(transforms)
+
+        Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor``, does not require
+        `lambda` functions or ``PIL.Image``.
+
+    Args:
+        transforms (sequence or torch.nn.Module): list of transformations, applied in order or not at all
+        p (float): probability of applying the whole list. Default value is 0.5
+    """
+
+    def __init__(self, transforms, p=0.5):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.transforms = transforms
+        self.p = p
+
+    def forward(self, img):
+        if self.p < torch.rand(1):  # the random draw exceeded p: return the input untouched
+            return img
+        for t in self.transforms:
+            img = t(img)
+        return img
+
+    def __repr__(self) -> str:
+        format_string = self.__class__.__name__ + "("
+        format_string += f"\n    p={self.p}"
+        for t in self.transforms:
+            format_string += "\n"
+            format_string += f"    {t}"
+        format_string += "\n)"
+        return format_string
+
+
+class RandomOrder(RandomTransforms):
+    """Apply every transformation of the list once, in a shuffled order. This transform does not support torchscript."""
+
+    def __call__(self, img):
+        indices = [*range(len(self.transforms))]
+        random.shuffle(indices)
+        for idx in indices:
+            img = self.transforms[idx](img)
+        return img
+
+
+class RandomChoice(RandomTransforms):
+    """Pick one transformation at random from a list and apply it. This transform does not support torchscript."""
+
+    def __init__(self, transforms, p=None):
+        super().__init__(transforms)
+        if not (p is None or isinstance(p, Sequence)):
+            raise TypeError("Argument p should be a sequence")
+        self.p = p
+
+    def __call__(self, *args):
+        (chosen,) = random.choices(self.transforms, weights=self.p)
+        return chosen(*args)
+
+    def __repr__(self) -> str:
+        return super().__repr__() + f"(p={self.p})"
+
+
+class RandomCrop(torch.nn.Module):
+    """Crop the given image at a random location.
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions,
+    but if non-constant padding is used, the input is expected to have at most 2 leading dimensions
+
+    Args:
+        size (sequence or int): Desired output size of the crop. If size is an
+            int instead of sequence like (h, w), a square crop (size, size) is
+            made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
+        padding (int or sequence, optional): Optional padding on each border
+            of the image, applied before cropping. Default is None. If a single int is provided this
+            is used to pad all borders. If sequence of length 2 is provided this is the padding
+            on left/right and top/bottom respectively. If a sequence of length 4 is provided
+            this is the padding for the left, top, right and bottom borders respectively.
+
+            .. note::
+                In torchscript mode padding as single int is not supported, use a sequence of
+                length 1: ``[padding, ]``.
+        pad_if_needed (boolean): It will pad the image if smaller than the
+            desired size to avoid raising an exception. Since cropping is done
+            after padding, the padding seems to be done at a random offset.
+        fill (number or tuple): Pixel fill value for constant fill. Default is 0. If a tuple of
+            length 3, it is used to fill R, G, B channels respectively.
+            This value is only used when the padding_mode is constant.
+            Only number is supported for torch Tensor.
+            Only int or tuple value is supported for PIL Image.
+        padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric.
+            Default is constant.
+
+            - constant: pads with a constant value, this value is specified with fill
+
+            - edge: pads with the last value at the edge of the image.
+              If the input is a 5D torch Tensor, the last 3 dimensions will be padded instead of the last 2
+
+            - reflect: pads with reflection of image without repeating the last value on the edge.
+              For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode
+              will result in [3, 2, 1, 2, 3, 4, 3, 2]
+
+            - symmetric: pads with reflection of image repeating the last value on the edge.
+              For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode
+              will result in [2, 1, 1, 2, 3, 4, 4, 3]
+    """
+
+    @staticmethod
+    def get_params(img: Tensor, output_size: tuple[int, int]) -> tuple[int, int, int, int]:
+        """Get parameters for ``crop`` for a random crop.
+
+        Args:
+            img (PIL Image or Tensor): Image to be cropped.
+            output_size (tuple): Expected output size of the crop, as (height, width).
+
+        Returns:
+            tuple: params (i, j, h, w) to be passed to ``crop`` for random crop.
+        """
+        _, h, w = F.get_dimensions(img)
+        th, tw = output_size
+
+        if h < th or w < tw:
+            raise ValueError(f"Required crop size {(th, tw)} is larger than input image size {(h, w)}")
+
+        if w == tw and h == th:  # exact fit: the whole image is the crop, no random draw is made
+            return 0, 0, h, w
+
+        i = torch.randint(0, h - th + 1, size=(1,)).item()
+        j = torch.randint(0, w - tw + 1, size=(1,)).item()
+        return i, j, th, tw
+
+    def __init__(self, size, padding=None, pad_if_needed=False, fill=0, padding_mode="constant"):
+        super().__init__()
+        _log_api_usage_once(self)
+
+        self.size = tuple(_setup_size(size, error_msg="Please provide only two dimensions (h, w) for size."))
+
+        self.padding = padding
+        self.pad_if_needed = pad_if_needed
+        self.fill = fill
+        self.padding_mode = padding_mode
+
+    def forward(self, img):
+        """
+        Args:
+            img (PIL Image or Tensor): Image to be cropped.
+
+        Returns:
+            PIL Image or Tensor: Cropped image.
+        """
+        if self.padding is not None:
+            img = F.pad(img, self.padding, self.fill, self.padding_mode)
+
+        _, height, width = F.get_dimensions(img)
+        # pad the width if needed
+        if self.pad_if_needed and width < self.size[1]:
+            padding = [self.size[1] - width, 0]  # [left/right, top/bottom]
+            img = F.pad(img, padding, self.fill, self.padding_mode)
+        # pad the height if needed
+        if self.pad_if_needed and height < self.size[0]:
+            padding = [0, self.size[0] - height]  # [left/right, top/bottom]
+            img = F.pad(img, padding, self.fill, self.padding_mode)
+
+        i, j, h, w = self.get_params(img, self.size)
+
+        return F.crop(img, i, j, h, w)
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(size={self.size}, padding={self.padding})"
+
+
+class RandomHorizontalFlip(torch.nn.Module):
+    """Horizontally flip the given image randomly with a given probability.
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means an arbitrary number of leading
+    dimensions
+
+    Args:
+        p (float): probability of the image being flipped. Default value is 0.5
+    """
+
+    def __init__(self, p=0.5):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.p = p
+
+    def forward(self, img):
+        """
+        Args:
+            img (PIL Image or Tensor): Image to be flipped.
+
+        Returns:
+            PIL Image or Tensor: Randomly flipped image.
+        """
+        if torch.rand(1) < self.p:
+            return F.hflip(img)
+        return img
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(p={self.p})"
+
+
+class RandomVerticalFlip(torch.nn.Module):
+    """Vertically flip the given image randomly with a given probability.
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means an arbitrary number of leading
+    dimensions
+
+    Args:
+        p (float): probability of the image being flipped. Default value is 0.5
+    """
+
+    def __init__(self, p=0.5):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.p = p
+
+    def forward(self, img):
+        """
+        Args:
+            img (PIL Image or Tensor): Image to be flipped.
+
+        Returns:
+            PIL Image or Tensor: Randomly flipped image.
+        """
+        if torch.rand(1) < self.p:
+            return F.vflip(img)
+        return img
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(p={self.p})"
+
+
+class RandomPerspective(torch.nn.Module):
+    """Performs a random perspective transformation of the given image with a given probability.
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
+
+    Args:
+        distortion_scale (float): argument to control the degree of distortion and ranges from 0 to 1.
+            Default is 0.5.
+        p (float): probability of the image being transformed. Default is 0.5.
+        interpolation (InterpolationMode): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        fill (sequence or number): Pixel fill value for the area outside the transformed
+            image. Default is ``0``. If given a number, the value is used for all bands respectively.
+    """
+
+    def __init__(self, distortion_scale=0.5, p=0.5, interpolation=InterpolationMode.BILINEAR, fill=0):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.p = p
+
+        if isinstance(interpolation, int):
+            interpolation = _interpolation_modes_from_int(interpolation)
+
+        self.interpolation = interpolation
+        self.distortion_scale = distortion_scale
+
+        if fill is None:
+            fill = 0
+        elif not isinstance(fill, (Sequence, numbers.Number)):
+            raise TypeError("Fill should be either a sequence or a number.")
+
+        self.fill = fill
+
+    def forward(self, img):
+        """
+        Args:
+            img (PIL Image or Tensor): Image to be Perspectively transformed.
+
+        Returns:
+            PIL Image or Tensor: Randomly transformed image.
+        """
+
+        fill = self.fill
+        channels, height, width = F.get_dimensions(img)
+        if isinstance(img, Tensor):
+            if isinstance(fill, (int, float)):
+                fill = [float(fill)] * channels
+            else:
+                fill = [float(f) for f in fill]
+
+        if torch.rand(1) < self.p:
+            startpoints, endpoints = self.get_params(width, height, self.distortion_scale)
+            return F.perspective(img, startpoints, endpoints, self.interpolation, fill)
+        return img
+
+    @staticmethod
+    def get_params(width: int, height: int, distortion_scale: float) -> tuple[list[list[int]], list[list[int]]]:
+        """Get parameters for ``perspective`` for a random perspective transform.
+
+        Args:
+            width (int): width of the image.
+            height (int): height of the image.
+            distortion_scale (float): argument to control the degree of distortion and ranges from 0 to 1.
+
+        Returns:
+            List containing [top-left, top-right, bottom-right, bottom-left] of the original image,
+            List containing [top-left, top-right, bottom-right, bottom-left] of the transformed image.
+        """
+        half_height = height // 2
+        half_width = width // 2
+        topleft = [
+            int(torch.randint(0, int(distortion_scale * half_width) + 1, size=(1,)).item()),
+            int(torch.randint(0, int(distortion_scale * half_height) + 1, size=(1,)).item()),
+        ]
+        topright = [
+            int(torch.randint(width - int(distortion_scale * half_width) - 1, width, size=(1,)).item()),
+            int(torch.randint(0, int(distortion_scale * half_height) + 1, size=(1,)).item()),
+        ]
+        botright = [
+            int(torch.randint(width - int(distortion_scale * half_width) - 1, width, size=(1,)).item()),
+            int(torch.randint(height - int(distortion_scale * half_height) - 1, height, size=(1,)).item()),
+        ]
+        botleft = [
+            int(torch.randint(0, int(distortion_scale * half_width) + 1, size=(1,)).item()),
+            int(torch.randint(height - int(distortion_scale * half_height) - 1, height, size=(1,)).item()),
+        ]
+        startpoints = [[0, 0], [width - 1, 0], [width - 1, height - 1], [0, height - 1]]
+        endpoints = [topleft, topright, botright, botleft]
+        return startpoints, endpoints
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(p={self.p})"
+
+
+class RandomResizedCrop(torch.nn.Module):
+    """Crop a random portion of image and resize it to a given size.
+
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions
+
+    A crop of the original image is made: the crop has a random area (H * W)
+    and a random aspect ratio. This crop is finally resized to the given
+    size. This is popularly used to train the Inception networks.
+
+    Args:
+        size (int or sequence): expected output size of the crop, for each edge. If size is an
+            int instead of sequence like (h, w), a square output size ``(size, size)`` is
+            made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
+
+            .. note::
+                In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``.
+        scale (tuple of float): Specifies the lower and upper bounds for the random area of the crop,
+            before resizing. The scale is defined with respect to the area of the original image.
+        ratio (tuple of float): lower and upper bounds for the random aspect ratio of the crop, before
+            resizing.
+        interpolation (InterpolationMode): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
+            ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        antialias (bool, optional): Whether to apply antialiasing.
+            It only affects **tensors** with bilinear or bicubic modes and it is
+            ignored otherwise: on PIL images, antialiasing is always applied on
+            bilinear or bicubic modes; on other modes (for PIL images and
+            tensors), antialiasing makes no sense and this parameter is ignored.
+            Possible values are:
+
+            - ``True`` (default): will apply antialiasing for bilinear or bicubic modes.
+              Other mode aren't affected. This is probably what you want to use.
+            - ``False``: will not apply antialiasing for tensors on any mode. PIL
+              images are still antialiased on bilinear or bicubic modes, because
+              PIL doesn't support no antialias.
+            - ``None``: equivalent to ``False`` for tensors and ``True`` for
+              PIL images. This value exists for legacy reasons and you probably
+              don't want to use it unless you really know what you are doing.
+
+            The default value changed from ``None`` to ``True`` in
+            v0.17, for the PIL and Tensor backends to be consistent.
+    """
+
+    def __init__(
+        self,
+        size,
+        scale=(0.08, 1.0),
+        ratio=(3.0 / 4.0, 4.0 / 3.0),
+        interpolation=InterpolationMode.BILINEAR,
+        antialias: Optional[bool] = True,
+    ):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
+
+        if not isinstance(scale, Sequence):
+            raise TypeError("Scale should be a sequence")
+        if not isinstance(ratio, Sequence):
+            raise TypeError("Ratio should be a sequence")
+        if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):
+            warnings.warn("Scale and ratio should be of kind (min, max)")
+
+        if isinstance(interpolation, int):
+            interpolation = _interpolation_modes_from_int(interpolation)
+
+        self.interpolation = interpolation
+        self.antialias = antialias
+        self.scale = scale
+        self.ratio = ratio
+
+    @staticmethod
+    def get_params(img: Tensor, scale: list[float], ratio: list[float]) -> tuple[int, int, int, int]:
+        """Get parameters for ``crop`` for a random sized crop.
+
+        Args:
+            img (PIL Image or Tensor): Input image.
+            scale (list): range of scale of the origin size cropped
+            ratio (list): range of aspect ratio of the origin aspect ratio cropped
+
+        Returns:
+            tuple: params (i, j, h, w) to be passed to ``crop`` for a random
+            sized crop.
+        """
+        _, height, width = F.get_dimensions(img)
+        area = height * width
+
+        log_ratio = torch.log(torch.tensor(ratio))
+        for _ in range(10):
+            target_area = area * torch.empty(1).uniform_(scale[0], scale[1]).item()
+            aspect_ratio = torch.exp(torch.empty(1).uniform_(log_ratio[0], log_ratio[1])).item()
+
+            w = int(round(math.sqrt(target_area * aspect_ratio)))
+            h = int(round(math.sqrt(target_area / aspect_ratio)))
+
+            if 0 < w <= width and 0 < h <= height:
+                i = torch.randint(0, height - h + 1, size=(1,)).item()
+                j = torch.randint(0, width - w + 1, size=(1,)).item()
+                return i, j, h, w
+
+        # Fallback to central crop
+        in_ratio = float(width) / float(height)
+        if in_ratio < min(ratio):
+            w = width
+            h = int(round(w / min(ratio)))
+        elif in_ratio > max(ratio):
+            h = height
+            w = int(round(h * max(ratio)))
+        else:  # whole image
+            w = width
+            h = height
+        i = (height - h) // 2
+        j = (width - w) // 2
+        return i, j, h, w
+
+    def forward(self, img):
+        """
+        Args:
+            img (PIL Image or Tensor): Image to be cropped and resized.
+
+        Returns:
+            PIL Image or Tensor: Randomly cropped and resized image.
+        """
+        i, j, h, w = self.get_params(img, self.scale, self.ratio)
+        return F.resized_crop(img, i, j, h, w, self.size, self.interpolation, antialias=self.antialias)
+
+    def __repr__(self) -> str:
+        interpolate_str = self.interpolation.value
+        format_string = self.__class__.__name__ + f"(size={self.size}"
+        format_string += f", scale={tuple(round(s, 4) for s in self.scale)}"
+        format_string += f", ratio={tuple(round(r, 4) for r in self.ratio)}"
+        format_string += f", interpolation={interpolate_str}"
+        format_string += f", antialias={self.antialias})"
+        return format_string
+
+
+class FiveCrop(torch.nn.Module):
+    """Crop the given image into four corners and the central crop.
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means an arbitrary number of leading
+    dimensions
+
+    .. Note::
+         This transform returns a tuple of images and there may be a mismatch in the number of
+         inputs and targets your Dataset returns. See below for an example of how to deal with
+         this.
+
+    Args:
+         size (sequence or int): Desired output size of the crop. If size is an ``int``
+            instead of sequence like (h, w), a square crop of size (size, size) is made.
+            If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
+
+    Example:
+         >>> transform = Compose([
+         >>>    FiveCrop(size), # this is a list of PIL Images
+         >>>    Lambda(lambda crops: torch.stack([PILToTensor()(crop) for crop in crops])) # returns a 4D tensor
+         >>> ])
+         >>> #In your test loop you can do the following:
+         >>> input, target = batch # input is a 5d tensor, target is 2d
+         >>> bs, ncrops, c, h, w = input.size()
+         >>> result = model(input.view(-1, c, h, w)) # fuse batch size and ncrops
+         >>> result_avg = result.view(bs, ncrops, -1).mean(1) # avg over crops
+    """
+
+    def __init__(self, size):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
+
+    def forward(self, img):
+        """
+        Args:
+            img (PIL Image or Tensor): Image to be cropped.
+
+        Returns:
+            tuple of 5 images. Image can be PIL Image or Tensor
+        """
+        return F.five_crop(img, self.size)
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(size={self.size})"
+
+
+class TenCrop(torch.nn.Module):
+    """Crop the given image into four corners and the central crop plus the flipped version of
+    these (horizontal flipping is used by default).
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means an arbitrary number of leading
+    dimensions
+
+    .. Note::
+         This transform returns a tuple of images and there may be a mismatch in the number of
+         inputs and targets your Dataset returns. See below for an example of how to deal with
+         this.
+
+    Args:
+        size (sequence or int): Desired output size of the crop. If size is an
+            int instead of sequence like (h, w), a square crop (size, size) is
+            made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
+        vertical_flip (bool): Use vertical flipping instead of horizontal
+
+    Example:
+         >>> transform = Compose([
+         >>>    TenCrop(size), # this is a tuple of PIL Images
+         >>>    Lambda(lambda crops: torch.stack([PILToTensor()(crop) for crop in crops])) # returns a 4D tensor
+         >>> ])
+         >>> #In your test loop you can do the following:
+         >>> input, target = batch # input is a 5d tensor, target is 2d
+         >>> bs, ncrops, c, h, w = input.size()
+         >>> result = model(input.view(-1, c, h, w)) # fuse batch size and ncrops
+         >>> result_avg = result.view(bs, ncrops, -1).mean(1) # avg over crops
+    """
+
+    def __init__(self, size, vertical_flip=False):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
+        self.vertical_flip = vertical_flip
+
+    def forward(self, img):
+        """
+        Args:
+            img (PIL Image or Tensor): Image to be cropped.
+
+        Returns:
+            tuple of 10 images. Image can be PIL Image or Tensor
+        """
+        return F.ten_crop(img, self.size, self.vertical_flip)
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(size={self.size}, vertical_flip={self.vertical_flip})"
+
+
+class LinearTransformation(torch.nn.Module):
+    """Transform a tensor image with a square transformation matrix and a mean_vector computed
+    offline.
+    This transform does not support PIL Image.
+    Given transformation_matrix and mean_vector, will flatten the torch.*Tensor and
+    subtract mean_vector from it which is then followed by computing the dot
+    product with the transformation matrix and then reshaping the tensor to its
+    original shape.
+
+    Applications:
+        whitening transformation: Suppose X is a column vector zero-centered data.
+        Then compute the data covariance matrix [D x D] with torch.mm(X.t(), X),
+        perform SVD on this matrix and pass it as transformation_matrix.
+
+    Args:
+        transformation_matrix (Tensor): tensor [D x D], D = C x H x W
+        mean_vector (Tensor): tensor [D], D = C x H x W
+    """
+
+    def __init__(self, transformation_matrix, mean_vector):
+        super().__init__()
+        _log_api_usage_once(self)
+        if transformation_matrix.size(0) != transformation_matrix.size(1):
+            raise ValueError(
+                "transformation_matrix should be square. Got "
+                f"{tuple(transformation_matrix.size())} rectangular matrix."
+            )
+
+        if mean_vector.size(0) != transformation_matrix.size(0):
+            raise ValueError(
+                f"mean_vector should have the same length {mean_vector.size(0)}"
+                f" as any one of the dimensions of the transformation_matrix [{tuple(transformation_matrix.size())}]"
+            )
+
+        if transformation_matrix.device != mean_vector.device:
+            raise ValueError(
+                f"Input tensors should be on the same device. Got {transformation_matrix.device} and {mean_vector.device}"
+            )
+
+        if transformation_matrix.dtype != mean_vector.dtype:
+            raise ValueError(
+                f"Input tensors should have the same dtype. Got {transformation_matrix.dtype} and {mean_vector.dtype}"
+            )
+
+        self.transformation_matrix = transformation_matrix
+        self.mean_vector = mean_vector
+
+    def forward(self, tensor: Tensor) -> Tensor:
+        """
+        Args:
+            tensor (Tensor): Tensor image to be whitened.
+
+        Returns:
+            Tensor: Transformed image.
+        """
+        shape = tensor.shape
+        n = shape[-3] * shape[-2] * shape[-1]
+        if n != self.transformation_matrix.shape[0]:
+            raise ValueError(
+                "Input tensor and transformation matrix have incompatible shape."
+                + f"[{shape[-3]} x {shape[-2]} x {shape[-1]}] != "
+                + f"{self.transformation_matrix.shape[0]}"
+            )
+
+        if tensor.device.type != self.mean_vector.device.type:
+            raise ValueError(
+                "Input tensor should be on the same device as transformation matrix and mean vector. "
+                f"Got {tensor.device} vs {self.mean_vector.device}"
+            )
+
+        flat_tensor = tensor.view(-1, n) - self.mean_vector
+        transformation_matrix = self.transformation_matrix.to(flat_tensor.dtype)
+        transformed_tensor = torch.mm(flat_tensor, transformation_matrix)
+        tensor = transformed_tensor.view(shape)
+        return tensor
+
+    def __repr__(self) -> str:
+        s = (
+            f"{self.__class__.__name__}(transformation_matrix="
+            f"{self.transformation_matrix.tolist()}"
+            f", mean_vector={self.mean_vector.tolist()})"
+        )
+        return s
+
+
+class ColorJitter(torch.nn.Module):
+    """Randomly change the brightness, contrast, saturation and hue of an image.
+    If the image is torch Tensor, it is expected
+    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
+    If img is PIL Image, mode "1", "I", "F" and modes with transparency (alpha channel) are not supported.
+
+    Args:
+        brightness (float or tuple of float (min, max)): How much to jitter brightness.
+            brightness_factor is chosen uniformly from [max(0, 1 - brightness), 1 + brightness]
+            or the given [min, max]. Should be non negative numbers.
+        contrast (float or tuple of float (min, max)): How much to jitter contrast.
+            contrast_factor is chosen uniformly from [max(0, 1 - contrast), 1 + contrast]
+            or the given [min, max]. Should be non-negative numbers.
+        saturation (float or tuple of float (min, max)): How much to jitter saturation.
+            saturation_factor is chosen uniformly from [max(0, 1 - saturation), 1 + saturation]
+            or the given [min, max]. Should be non negative numbers.
+        hue (float or tuple of float (min, max)): How much to jitter hue.
+            hue_factor is chosen uniformly from [-hue, hue] or the given [min, max].
+            Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5.
+            To jitter hue, the pixel values of the input image has to be non-negative for conversion to HSV space;
+            thus it does not work if you normalize your image to an interval with negative values,
+            or use an interpolation that generates negative values before using this function.
+    """
+
+    def __init__(
+        self,
+        brightness: Union[float, tuple[float, float]] = 0,
+        contrast: Union[float, tuple[float, float]] = 0,
+        saturation: Union[float, tuple[float, float]] = 0,
+        hue: Union[float, tuple[float, float]] = 0,
+    ) -> None:
+        super().__init__()
+        _log_api_usage_once(self)
+        self.brightness = self._check_input(brightness, "brightness")
+        self.contrast = self._check_input(contrast, "contrast")
+        self.saturation = self._check_input(saturation, "saturation")
+        self.hue = self._check_input(hue, "hue", center=0, bound=(-0.5, 0.5), clip_first_on_zero=False)
+
+    @torch.jit.unused
+    def _check_input(self, value, name, center=1, bound=(0, float("inf")), clip_first_on_zero=True):
+        if isinstance(value, numbers.Number):
+            if value < 0:
+                raise ValueError(f"If {name} is a single number, it must be non negative.")
+            value = [center - float(value), center + float(value)]
+            if clip_first_on_zero:
+                value[0] = max(value[0], 0.0)
+        elif isinstance(value, (tuple, list)) and len(value) == 2:
+            value = [float(value[0]), float(value[1])]
+        else:
+            raise TypeError(f"{name} should be a single number or a list/tuple with length 2.")
+
+        if not bound[0] <= value[0] <= value[1] <= bound[1]:
+            raise ValueError(f"{name} values should be between {bound}, but got {value}.")
+
+        # if value is 0 or (1., 1.) for brightness/contrast/saturation
+        # or (0., 0.) for hue, do nothing
+        if value[0] == value[1] == center:
+            return None
+        else:
+            return tuple(value)
+
+    @staticmethod
+    def get_params(
+        brightness: Optional[list[float]],
+        contrast: Optional[list[float]],
+        saturation: Optional[list[float]],
+        hue: Optional[list[float]],
+    ) -> tuple[Tensor, Optional[float], Optional[float], Optional[float], Optional[float]]:
+        """Get the parameters for the randomized transform to be applied on image.
+
+        Args:
+            brightness (tuple of float (min, max), optional): The range from which the brightness_factor is chosen
+                uniformly. Pass None to turn off the transformation.
+            contrast (tuple of float (min, max), optional): The range from which the contrast_factor is chosen
+                uniformly. Pass None to turn off the transformation.
+            saturation (tuple of float (min, max), optional): The range from which the saturation_factor is chosen
+                uniformly. Pass None to turn off the transformation.
+            hue (tuple of float (min, max), optional): The range from which the hue_factor is chosen uniformly.
+                Pass None to turn off the transformation.
+
+        Returns:
+            tuple: The parameters used to apply the randomized transform
+            along with their random order.
+        """
+        fn_idx = torch.randperm(4)
+
+        b = None if brightness is None else float(torch.empty(1).uniform_(brightness[0], brightness[1]))
+        c = None if contrast is None else float(torch.empty(1).uniform_(contrast[0], contrast[1]))
+        s = None if saturation is None else float(torch.empty(1).uniform_(saturation[0], saturation[1]))
+        h = None if hue is None else float(torch.empty(1).uniform_(hue[0], hue[1]))
+
+        return fn_idx, b, c, s, h
+
+    def forward(self, img):
+        """
+        Args:
+            img (PIL Image or Tensor): Input image.
+
+        Returns:
+            PIL Image or Tensor: Color jittered image.
+        """
+        fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor = self.get_params(
+            self.brightness, self.contrast, self.saturation, self.hue
+        )
+
+        for fn_id in fn_idx:
+            if fn_id == 0 and brightness_factor is not None:
+                img = F.adjust_brightness(img, brightness_factor)
+            elif fn_id == 1 and contrast_factor is not None:
+                img = F.adjust_contrast(img, contrast_factor)
+            elif fn_id == 2 and saturation_factor is not None:
+                img = F.adjust_saturation(img, saturation_factor)
+            elif fn_id == 3 and hue_factor is not None:
+                img = F.adjust_hue(img, hue_factor)
+
+        return img
+
+    def __repr__(self) -> str:
+        s = (
+            f"{self.__class__.__name__}("
+            f"brightness={self.brightness}"
+            f", contrast={self.contrast}"
+            f", saturation={self.saturation}"
+            f", hue={self.hue})"
+        )
+        return s
+
+
+class RandomRotation(torch.nn.Module):
+    """Rotate the image by angle.
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
+
+    Args:
+        degrees (sequence or number): Range of degrees to select from.
+            If degrees is a number instead of sequence like (min, max), the range of degrees
+            will be (-degrees, +degrees).
+        interpolation (InterpolationMode): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        expand (bool, optional): Optional expansion flag.
+            If true, expands the output to make it large enough to hold the entire rotated image.
+            If false or omitted, make the output image the same size as the input image.
+            Note that the expand flag assumes rotation around the center and no translation.
+        center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner.
+            Default is the center of the image.
+        fill (sequence or number): Pixel fill value for the area outside the rotated
+            image. Default is ``0``. If given a number, the value is used for all bands respectively.
+
+    .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters
+
+    """
+
+    def __init__(self, degrees, interpolation=InterpolationMode.NEAREST, expand=False, center=None, fill=0):
+        super().__init__()
+        _log_api_usage_once(self)
+
+        if isinstance(interpolation, int):
+            interpolation = _interpolation_modes_from_int(interpolation)
+
+        self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2,))
+
+        if center is not None:
+            _check_sequence_input(center, "center", req_sizes=(2,))
+
+        self.center = center
+
+        self.interpolation = interpolation
+        self.expand = expand
+
+        if fill is None:
+            fill = 0
+        elif not isinstance(fill, (Sequence, numbers.Number)):
+            raise TypeError("Fill should be either a sequence or a number.")
+
+        self.fill = fill
+
+    @staticmethod
+    def get_params(degrees: list[float]) -> float:
+        """Get parameters for ``rotate`` for a random rotation.
+
+        Returns:
+            float: angle parameter to be passed to ``rotate`` for random rotation.
+        """
+        angle = float(torch.empty(1).uniform_(float(degrees[0]), float(degrees[1])).item())
+        return angle
+
+    def forward(self, img):
+        """
+        Args:
+            img (PIL Image or Tensor): Image to be rotated.
+
+        Returns:
+            PIL Image or Tensor: Rotated image.
+        """
+        fill = self.fill
+        channels, _, _ = F.get_dimensions(img)
+        if isinstance(img, Tensor):
+            if isinstance(fill, (int, float)):
+                fill = [float(fill)] * channels
+            else:
+                fill = [float(f) for f in fill]
+        angle = self.get_params(self.degrees)
+
+        return F.rotate(img, angle, self.interpolation, self.expand, self.center, fill)
+
+    def __repr__(self) -> str:
+        interpolate_str = self.interpolation.value
+        format_string = self.__class__.__name__ + f"(degrees={self.degrees}"
+        format_string += f", interpolation={interpolate_str}"
+        format_string += f", expand={self.expand}"
+        if self.center is not None:
+            format_string += f", center={self.center}"
+        if self.fill is not None:
+            format_string += f", fill={self.fill}"
+        format_string += ")"
+        return format_string
+
+
+class RandomAffine(torch.nn.Module):
+    """Random affine transformation of the image keeping center invariant.
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
+
+    Args:
+        degrees (sequence or number): Range of degrees to select from.
+            If degrees is a number instead of sequence like (min, max), the range of degrees
+            will be (-degrees, +degrees). Set to 0 to deactivate rotations.
+        translate (tuple, optional): tuple of maximum absolute fraction for horizontal
+            and vertical translations. For example translate=(a, b), then horizontal shift
+            is randomly sampled in the range -img_width * a < dx < img_width * a and vertical shift is
+            randomly sampled in the range -img_height * b < dy < img_height * b. Will not translate by default.
+        scale (tuple, optional): scaling factor interval, e.g. (a, b), then scale is
+            randomly sampled from the range a <= scale <= b. Will keep original scale by default.
+        shear (sequence or number, optional): Range of degrees to select from.
+            If shear is a number, a shear parallel to the x-axis in the range (-shear, +shear)
+            will be applied. Else if shear is a sequence of 2 values a shear parallel to the x-axis in the
+            range (shear[0], shear[1]) will be applied. Else if shear is a sequence of 4 values,
+            an x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied.
+            Will not apply shear by default.
+        interpolation (InterpolationMode): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        fill (sequence or number): Pixel fill value for the area outside the transformed
+            image. Default is ``0``. If given a number, the value is used for all bands respectively.
+        center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner.
+            Default is the center of the image.
+
+    .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters
+
+    """
+
+    def __init__(
+        self,
+        degrees,
+        translate=None,
+        scale=None,
+        shear=None,
+        interpolation=InterpolationMode.NEAREST,
+        fill=0,
+        center=None,
+    ):
+        """Validate the arguments and store them on the module.
+
+        See the class docstring for the meaning of each argument.
+
+        Raises:
+            ValueError: If a ``translate`` value lies outside [0, 1] or a
+                ``scale`` value is not strictly positive.
+            TypeError: If ``fill`` is neither a sequence nor a number.
+        """
+        super().__init__()
+        _log_api_usage_once(self)
+
+        # Integer interpolation codes are converted before being stored.
+        if isinstance(interpolation, int):
+            interpolation = _interpolation_modes_from_int(interpolation)
+
+        self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2,))
+
+        if translate is not None:
+            _check_sequence_input(translate, "translate", req_sizes=(2,))
+            for t in translate:
+                if not (0.0 <= t <= 1.0):
+                    raise ValueError("translation values should be between 0 and 1")
+        self.translate = translate
+
+        if scale is not None:
+            _check_sequence_input(scale, "scale", req_sizes=(2,))
+            for s in scale:
+                if s <= 0:
+                    raise ValueError("scale values should be positive")
+        self.scale = scale
+
+        # Shear accepts a number, a 2-sequence (x only) or a 4-sequence (x and y).
+        if shear is not None:
+            self.shear = _setup_angle(shear, name="shear", req_sizes=(2, 4))
+        else:
+            self.shear = shear
+
+        self.interpolation = interpolation
+
+        # ``None`` is treated as the default fill value 0.
+        if fill is None:
+            fill = 0
+        elif not isinstance(fill, (Sequence, numbers.Number)):
+            raise TypeError("Fill should be either a sequence or a number.")
+
+        self.fill = fill
+
+        if center is not None:
+            _check_sequence_input(center, "center", req_sizes=(2,))
+
+        self.center = center
+
+    @staticmethod
+    def get_params(
+        degrees: list[float],
+        translate: Optional[list[float]],
+        scale_ranges: Optional[list[float]],
+        shears: Optional[list[float]],
+        img_size: list[int],
+    ) -> tuple[float, tuple[int, int], float, tuple[float, float]]:
+        """Get parameters for affine transformation
+
+        Args:
+            degrees (list of float): ``[min, max]`` range the rotation angle is drawn from.
+            translate (list of float, optional): maximum shift as a fraction of
+                ``img_size[0]`` and ``img_size[1]`` respectively. ``None`` gives ``(0, 0)``.
+            scale_ranges (list of float, optional): ``[min, max]`` range the scale is
+                drawn from. ``None`` gives ``1.0``.
+            shears (list of float, optional): ``[min, max]`` range for the x shear,
+                optionally followed by ``[min, max]`` for the y shear. ``None`` gives
+                ``(0.0, 0.0)``.
+            img_size (list of int): the two extents the translation fractions are
+                multiplied with; ``forward`` passes ``[width, height]``.
+
+        Returns:
+            params to be passed to the affine transformation, as the tuple
+            ``(angle, translations, scale, shear)``.
+        """
+        angle = float(torch.empty(1).uniform_(float(degrees[0]), float(degrees[1])).item())
+        if translate is not None:
+            max_dx = float(translate[0] * img_size[0])
+            max_dy = float(translate[1] * img_size[1])
+            # Shifts are sampled symmetrically around zero and rounded to whole pixels.
+            tx = int(round(torch.empty(1).uniform_(-max_dx, max_dx).item()))
+            ty = int(round(torch.empty(1).uniform_(-max_dy, max_dy).item()))
+            translations = (tx, ty)
+        else:
+            translations = (0, 0)
+
+        if scale_ranges is not None:
+            scale = float(torch.empty(1).uniform_(scale_ranges[0], scale_ranges[1]).item())
+        else:
+            scale = 1.0
+
+        shear_x = shear_y = 0.0
+        if shears is not None:
+            shear_x = float(torch.empty(1).uniform_(shears[0], shears[1]).item())
+            # A y shear is only sampled when four values were supplied.
+            if len(shears) == 4:
+                shear_y = float(torch.empty(1).uniform_(shears[2], shears[3]).item())
+
+        shear = (shear_x, shear_y)
+
+        return angle, translations, scale, shear
+
+    def forward(self, img):
+        """Apply a randomly parameterised affine transformation to ``img``.
+
+        Args:
+            img (PIL Image or Tensor): Image to be transformed.
+
+        Returns:
+            PIL Image or Tensor: Affine transformed image.
+        """
+        fill = self.fill
+        channels, height, width = F.get_dimensions(img)
+        # Tensor inputs receive ``fill`` as a list of floats: a scalar is
+        # repeated once per channel, anything else is converted element-wise.
+        if isinstance(img, Tensor):
+            if isinstance(fill, (int, float)):
+                fill = [float(fill)] * channels
+            else:
+                fill = [float(f) for f in fill]
+
+        img_size = [width, height]  # flip for keeping BC on get_params call
+
+        ret = self.get_params(self.degrees, self.translate, self.scale, self.shear, img_size)
+
+        return F.affine(img, *ret, interpolation=self.interpolation, fill=fill, center=self.center)
+
+    def __repr__(self) -> str:
+        """Return a constructor-like summary of this transform.
+
+        ``degrees`` is always shown. The remaining fields are skipped when they
+        are ``None`` (translate, scale, shear, center), equal to
+        ``InterpolationMode.NEAREST`` (interpolation) or equal to ``0`` (fill).
+        """
+        s = f"{self.__class__.__name__}(degrees={self.degrees}"
+        s += f", translate={self.translate}" if self.translate is not None else ""
+        s += f", scale={self.scale}" if self.scale is not None else ""
+        s += f", shear={self.shear}" if self.shear is not None else ""
+        s += f", interpolation={self.interpolation.value}" if self.interpolation != InterpolationMode.NEAREST else ""
+        s += f", fill={self.fill}" if self.fill != 0 else ""
+        s += f", center={self.center}" if self.center is not None else ""
+        s += ")"
+
+        return s
+
+
+class Grayscale(torch.nn.Module):
+    """Transform that converts an image to grayscale.
+
+    Tensor inputs are expected to have [..., 3, H, W] shape, where ... stands
+    for an arbitrary number of leading dimensions.
+
+    Args:
+        num_output_channels (int): (1 or 3) number of channels desired for output image
+
+    Returns:
+        PIL Image: Grayscale version of the input.
+
+        - If ``num_output_channels == 1`` : returned image is single channel
+        - If ``num_output_channels == 3`` : returned image is 3 channel with r == g == b
+
+    """
+
+    def __init__(self, num_output_channels=1):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.num_output_channels = num_output_channels
+
+    def forward(self, img):
+        """Convert ``img`` to grayscale.
+
+        Args:
+            img (PIL Image or Tensor): Image to be converted to grayscale.
+
+        Returns:
+            PIL Image or Tensor: Grayscaled image.
+        """
+        wanted_channels = self.num_output_channels
+        return F.rgb_to_grayscale(img, num_output_channels=wanted_channels)
+
+    def __repr__(self) -> str:
+        """Return ``ClassName(num_output_channels=...)``."""
+        cls_name = type(self).__name__
+        return f"{cls_name}(num_output_channels={self.num_output_channels})"
+
+
+class RandomGrayscale(torch.nn.Module):
+    """Convert the image to grayscale with probability ``p`` (default 0.1).
+
+    Tensor inputs are expected to have [..., 3, H, W] shape, where ... stands
+    for an arbitrary number of leading dimensions.
+
+    Args:
+        p (float): probability that image should be converted to grayscale.
+
+    Returns:
+        PIL Image or Tensor: Grayscale version of the input image with probability p and unchanged
+        with probability (1-p).
+        - If input image is 1 channel: grayscale version is 1 channel
+        - If input image is 3 channel: grayscale version is 3 channel with r == g == b
+
+    """
+
+    def __init__(self, p=0.1):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.p = p
+
+    def forward(self, img):
+        """Randomly convert ``img`` to grayscale.
+
+        Args:
+            img (PIL Image or Tensor): Image to be converted to grayscale.
+
+        Returns:
+            PIL Image or Tensor: Randomly grayscaled image.
+        """
+        # The output keeps as many channels as the input has.
+        channels, _, _ = F.get_dimensions(img)
+        if torch.rand(1) < self.p:
+            result = F.rgb_to_grayscale(img, num_output_channels=channels)
+        else:
+            result = img
+        return result
+
+    def __repr__(self) -> str:
+        """Return ``ClassName(p=...)``."""
+        return f"{type(self).__name__}(p={self.p})"
+
+
+class RandomErasing(torch.nn.Module):
+    """Randomly selects a rectangle region in a torch.Tensor image and erases its pixels.
+    This transform does not support PIL Image.
+    'Random Erasing Data Augmentation' by Zhong et al. See https://arxiv.org/abs/1708.04896
+
+    Args:
+         p: probability that the random erasing operation will be performed.
+         scale: range of proportion of erased area against input image.
+         ratio: range of aspect ratio of erased area.
+         value: erasing value. Default is 0. If a single int, it is used to
+            erase all pixels. If a tuple of length 3, it is used to erase
+            R, G, B channels respectively.
+            If a str of 'random', erasing each pixel with random values.
+         inplace: boolean to make this transform inplace. Default set to False.
+
+    Returns:
+        Erased Image.
+
+    Example:
+        >>> transform = transforms.Compose([
+        >>>   transforms.RandomHorizontalFlip(),
+        >>>   transforms.PILToTensor(),
+        >>>   transforms.ConvertImageDtype(torch.float),
+        >>>   transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+        >>>   transforms.RandomErasing(),
+        >>> ])
+    """
+
+    def __init__(self, p=0.5, scale=(0.02, 0.33), ratio=(0.3, 3.3), value=0, inplace=False):
+        """Validate the arguments and store them on the module.
+
+        See the class docstring for the meaning of each argument.
+
+        Raises:
+            TypeError: If ``value`` is not a number, str, tuple or list, or if
+                ``scale`` or ``ratio`` is not a sequence.
+            ValueError: If ``value`` is a str other than ``"random"``, if
+                ``scale`` is not within [0, 1], or if ``p`` is not within [0, 1].
+        """
+        super().__init__()
+        _log_api_usage_once(self)
+        if not isinstance(value, (numbers.Number, str, tuple, list)):
+            raise TypeError("Argument value should be either a number or str or a sequence")
+        if isinstance(value, str) and value != "random":
+            raise ValueError("If value is str, it should be 'random'")
+        if not isinstance(scale, Sequence):
+            raise TypeError("Scale should be a sequence")
+        if not isinstance(ratio, Sequence):
+            raise TypeError("Ratio should be a sequence")
+        # A reversed (max, min) pair only triggers a warning, not an error.
+        if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):
+            warnings.warn("Scale and ratio should be of kind (min, max)")
+        if scale[0] < 0 or scale[1] > 1:
+            raise ValueError("Scale should be between 0 and 1")
+        if p < 0 or p > 1:
+            raise ValueError("Random erasing probability should be between 0 and 1")
+
+        self.p = p
+        self.scale = scale
+        self.ratio = ratio
+        self.value = value
+        self.inplace = inplace
+
+    @staticmethod
+    def get_params(
+        img: Tensor, scale: tuple[float, float], ratio: tuple[float, float], value: Optional[list[float]] = None
+    ) -> tuple[int, int, int, int, Tensor]:
+        """Get parameters for ``erase`` for a random erasing.
+
+        Args:
+            img (Tensor): Tensor image to be erased.
+            scale (sequence): range of proportion of erased area against input image.
+            ratio (sequence): range of aspect ratio of erased area.
+            value (list, optional): erasing value. If None, it is interpreted as "random"
+                (erasing each pixel with random values). If ``len(value)`` is 1, it is interpreted as a number,
+                i.e. ``value[0]``.
+
+        Returns:
+            tuple: params (i, j, h, w, v) to be passed to ``erase`` for random erasing.
+        """
+        img_c, img_h, img_w = img.shape[-3], img.shape[-2], img.shape[-1]
+        area = img_h * img_w
+
+        # The aspect ratio is sampled uniformly in log space.
+        log_ratio = torch.log(torch.tensor(ratio))
+        # Up to 10 candidate rectangles are tried before giving up.
+        for _ in range(10):
+            erase_area = area * torch.empty(1).uniform_(scale[0], scale[1]).item()
+            aspect_ratio = torch.exp(torch.empty(1).uniform_(log_ratio[0], log_ratio[1])).item()
+
+            h = int(round(math.sqrt(erase_area * aspect_ratio)))
+            w = int(round(math.sqrt(erase_area / aspect_ratio)))
+            # Reject candidates that are not strictly smaller than the image
+            # in both dimensions.
+            if not (h < img_h and w < img_w):
+                continue
+
+            if value is None:
+                # "random" mode: fill the rectangle with standard normal noise.
+                v = torch.empty([img_c, h, w], dtype=torch.float32).normal_()
+            else:
+                # Indexed to shape (len(value), 1, 1).
+                v = torch.tensor(value)[:, None, None]
+
+            i = torch.randint(0, img_h - h + 1, size=(1,)).item()
+            j = torch.randint(0, img_w - w + 1, size=(1,)).item()
+            return i, j, h, w, v
+
+        # Return original image
+        return 0, 0, img_h, img_w, img
+
+    def forward(self, img):
+        """Erase a random rectangle of ``img`` with probability ``self.p``.
+
+        Args:
+            img (Tensor): Tensor image to be erased.
+
+        Returns:
+            img (Tensor): Erased Tensor image.
+
+        Raises:
+            ValueError: If the erasing value is a sequence whose length is
+                neither 1 nor the number of input channels.
+        """
+        if torch.rand(1) < self.p:
+
+            # cast self.value to script acceptable type
+            if isinstance(self.value, (int, float)):
+                value = [float(self.value)]
+            elif isinstance(self.value, str):
+                # ``None`` is how ``get_params`` is told to use random values.
+                value = None
+            elif isinstance(self.value, (list, tuple)):
+                value = [float(v) for v in self.value]
+            else:
+                value = self.value
+
+            if value is not None and len(value) not in (1, img.shape[-3]):
+                raise ValueError(
+                    "If value is a sequence, it should have either a single value or "
+                    f"{img.shape[-3]} (number of input channels)"
+                )
+
+            x, y, h, w, v = self.get_params(img, scale=self.scale, ratio=self.ratio, value=value)
+            return F.erase(img, x, y, h, w, v, self.inplace)
+        return img
+
+    def __repr__(self) -> str:
+        """Return a constructor-like summary listing every stored argument."""
+        s = (
+            f"{self.__class__.__name__}"
+            f"(p={self.p}, "
+            f"scale={self.scale}, "
+            f"ratio={self.ratio}, "
+            f"value={self.value}, "
+            f"inplace={self.inplace})"
+        )
+        return s
+
+
+class GaussianBlur(torch.nn.Module):
+    """Blur the image with a Gaussian kernel whose sigma is chosen at random.
+
+    Tensor inputs are expected to have [..., C, H, W] shape, where ... means
+    at most one leading dimension.
+
+    Args:
+        kernel_size (int or sequence): Size of the Gaussian kernel.
+        sigma (float or tuple of float (min, max)): Standard deviation used to
+            build the blurring kernel. A float fixes sigma; a (min, max) tuple
+            makes sigma be drawn uniformly at random from that range.
+
+    Returns:
+        PIL Image or Tensor: Gaussian blurred version of the input image.
+
+    """
+
+    def __init__(self, kernel_size, sigma=(0.1, 2.0)):
+        super().__init__()
+        _log_api_usage_once(self)
+
+        kernel = _setup_size(kernel_size, "Kernel size should be a tuple/list of two integers")
+        self.kernel_size = kernel
+        if any(side <= 0 or side % 2 == 0 for side in kernel):
+            raise ValueError("Kernel size value should be an odd and positive number.")
+
+        if isinstance(sigma, numbers.Number):
+            if sigma <= 0:
+                raise ValueError("If sigma is a single number, it must be positive.")
+            # A fixed sigma is stored as a degenerate (min, max) range.
+            sigma = (sigma, sigma)
+        elif not (isinstance(sigma, Sequence) and len(sigma) == 2):
+            raise ValueError("sigma should be a single number or a list/tuple with length 2.")
+        elif not 0.0 < sigma[0] <= sigma[1]:
+            raise ValueError("sigma values should be positive and of the form (min, max).")
+
+        self.sigma = sigma
+
+    @staticmethod
+    def get_params(sigma_min: float, sigma_max: float) -> float:
+        """Choose sigma for random gaussian blurring.
+
+        Args:
+            sigma_min (float): Minimum standard deviation that can be chosen for blurring kernel.
+            sigma_max (float): Maximum standard deviation that can be chosen for blurring kernel.
+
+        Returns:
+            float: Standard deviation to be passed to calculate kernel for gaussian blurring.
+        """
+        drawn = torch.empty(1).uniform_(sigma_min, sigma_max)
+        return drawn.item()
+
+    def forward(self, img: Tensor) -> Tensor:
+        """Blur ``img`` using a freshly sampled sigma for both kernel axes.
+
+        Args:
+            img (PIL Image or Tensor): image to be blurred.
+
+        Returns:
+            PIL Image or Tensor: Gaussian blurred image
+        """
+        chosen = self.get_params(self.sigma[0], self.sigma[1])
+        return F.gaussian_blur(img, self.kernel_size, [chosen, chosen])
+
+    def __repr__(self) -> str:
+        """Return ``ClassName(kernel_size=..., sigma=...)``."""
+        return f"{type(self).__name__}(kernel_size={self.kernel_size}, sigma={self.sigma})"
+
+
+def _setup_size(size, error_msg):
+    """Normalise ``size`` into a pair.
+
+    A number is converted with ``int`` and duplicated, a one-element sequence
+    has its element duplicated, and a two-element input is returned unchanged.
+
+    Raises:
+        ValueError: with ``error_msg`` if ``size`` has any other length.
+    """
+    if isinstance(size, numbers.Number):
+        side = int(size)
+        return side, side
+
+    if isinstance(size, Sequence) and len(size) == 1:
+        only = size[0]
+        return only, only
+
+    if len(size) == 2:
+        return size
+
+    raise ValueError(error_msg)
+
+
+def _check_sequence_input(x, name, req_sizes):
+    """Check that ``x`` is a sequence whose length is one of ``req_sizes``.
+
+    Raises:
+        TypeError: if ``x`` is not a sequence.
+        ValueError: if the length of ``x`` is not listed in ``req_sizes``.
+    """
+    if len(req_sizes) < 2:
+        allowed = req_sizes[0]
+    else:
+        allowed = " or ".join(map(str, req_sizes))
+    complaint = f"{name} should be a sequence of length {allowed}."
+
+    if not isinstance(x, Sequence):
+        raise TypeError(complaint)
+    if len(x) not in req_sizes:
+        raise ValueError(complaint)
+
+
+def _setup_angle(x, name, req_sizes=(2,)):
+    """Turn an angle specification into a list of floats.
+
+    A single number ``x`` becomes the symmetric range ``[-x, x]``; negative
+    numbers are rejected. A sequence is length-checked against ``req_sizes``
+    and converted element by element.
+
+    Raises:
+        ValueError: if ``x`` is a negative number.
+    """
+    if not isinstance(x, numbers.Number):
+        _check_sequence_input(x, name, req_sizes)
+        return [float(d) for d in x]
+
+    if x < 0:
+        raise ValueError(f"If {name} is a single number, it must be positive.")
+    return [float(-x), float(x)]
+
+
+class RandomInvert(torch.nn.Module):
+    """Invert the colors of the image with probability ``p``.
+
+    Tensor inputs are expected in [..., 1 or 3, H, W] format, where ... stands
+    for an arbitrary number of leading dimensions.
+    PIL Image inputs are expected to be in mode "L" or "RGB".
+
+    Args:
+        p (float): probability of the image being color inverted. Default value is 0.5
+    """
+
+    def __init__(self, p=0.5):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.p = p
+
+    def forward(self, img):
+        """Randomly invert ``img``.
+
+        Args:
+            img (PIL Image or Tensor): Image to be inverted.
+
+        Returns:
+            PIL Image or Tensor: Randomly color inverted image.
+        """
+        # A single random draw decides whether the operation is applied.
+        if torch.rand(1).item() < self.p:
+            result = F.invert(img)
+        else:
+            result = img
+        return result
+
+    def __repr__(self) -> str:
+        """Return ``ClassName(p=...)``."""
+        return f"{type(self).__name__}(p={self.p})"
+
+
+class RandomPosterize(torch.nn.Module):
+    """Posterize the image with probability ``p`` by reducing the number of
+    bits kept for each color channel.
+
+    Tensor inputs should be of type torch.uint8 and are expected to have
+    [..., 1 or 3, H, W] shape, where ... stands for an arbitrary number of
+    leading dimensions. PIL Image inputs are expected to be in mode "L" or "RGB".
+
+    Args:
+        bits (int): number of bits to keep for each channel (0-8)
+        p (float): probability of the image being posterized. Default value is 0.5
+    """
+
+    def __init__(self, bits, p=0.5):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.bits = bits
+        self.p = p
+
+    def forward(self, img):
+        """Randomly posterize ``img``.
+
+        Args:
+            img (PIL Image or Tensor): Image to be posterized.
+
+        Returns:
+            PIL Image or Tensor: Randomly posterized image.
+        """
+        # A single random draw decides whether the operation is applied.
+        if torch.rand(1).item() < self.p:
+            result = F.posterize(img, self.bits)
+        else:
+            result = img
+        return result
+
+    def __repr__(self) -> str:
+        """Return ``ClassName(bits=...,p=...)``."""
+        return f"{type(self).__name__}(bits={self.bits},p={self.p})"
+
+
+class RandomSolarize(torch.nn.Module):
+    """Solarize the image with probability ``p`` by inverting all pixel values
+    above a threshold.
+
+    Tensor inputs are expected in [..., 1 or 3, H, W] format, where ... stands
+    for an arbitrary number of leading dimensions.
+    PIL Image inputs are expected to be in mode "L" or "RGB".
+
+    Args:
+        threshold (float): all pixels equal or above this value are inverted.
+        p (float): probability of the image being solarized. Default value is 0.5
+    """
+
+    def __init__(self, threshold, p=0.5):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.threshold = threshold
+        self.p = p
+
+    def forward(self, img):
+        """Randomly solarize ``img``.
+
+        Args:
+            img (PIL Image or Tensor): Image to be solarized.
+
+        Returns:
+            PIL Image or Tensor: Randomly solarized image.
+        """
+        # A single random draw decides whether the operation is applied.
+        if torch.rand(1).item() < self.p:
+            result = F.solarize(img, self.threshold)
+        else:
+            result = img
+        return result
+
+    def __repr__(self) -> str:
+        """Return ``ClassName(threshold=...,p=...)``."""
+        return f"{type(self).__name__}(threshold={self.threshold},p={self.p})"
+
+
+class RandomAdjustSharpness(torch.nn.Module):
+    """Adjust the sharpness of the image with probability ``p``.
+
+    Tensor inputs are expected to have [..., 1 or 3, H, W] shape, where ...
+    stands for an arbitrary number of leading dimensions.
+
+    Args:
+        sharpness_factor (float):  How much to adjust the sharpness. Can be
+            any non-negative number. 0 gives a blurred image, 1 gives the
+            original image while 2 increases the sharpness by a factor of 2.
+        p (float): probability of the image being sharpened. Default value is 0.5
+    """
+
+    def __init__(self, sharpness_factor, p=0.5):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.sharpness_factor = sharpness_factor
+        self.p = p
+
+    def forward(self, img):
+        """Randomly adjust the sharpness of ``img``.
+
+        Args:
+            img (PIL Image or Tensor): Image to be sharpened.
+
+        Returns:
+            PIL Image or Tensor: Randomly sharpened image.
+        """
+        # A single random draw decides whether the operation is applied.
+        if torch.rand(1).item() < self.p:
+            result = F.adjust_sharpness(img, self.sharpness_factor)
+        else:
+            result = img
+        return result
+
+    def __repr__(self) -> str:
+        """Return ``ClassName(sharpness_factor=...,p=...)``."""
+        return f"{type(self).__name__}(sharpness_factor={self.sharpness_factor},p={self.p})"
+
+
+class RandomAutocontrast(torch.nn.Module):
+    """Autocontrast the pixels of the image with probability ``p``.
+
+    Tensor inputs are expected to have [..., 1 or 3, H, W] shape, where ...
+    stands for an arbitrary number of leading dimensions.
+    PIL Image inputs are expected to be in mode "L" or "RGB".
+
+    Args:
+        p (float): probability of the image being autocontrasted. Default value is 0.5
+    """
+
+    def __init__(self, p=0.5):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.p = p
+
+    def forward(self, img):
+        """Randomly autocontrast ``img``.
+
+        Args:
+            img (PIL Image or Tensor): Image to be autocontrasted.
+
+        Returns:
+            PIL Image or Tensor: Randomly autocontrasted image.
+        """
+        # A single random draw decides whether the operation is applied.
+        if torch.rand(1).item() < self.p:
+            result = F.autocontrast(img)
+        else:
+            result = img
+        return result
+
+    def __repr__(self) -> str:
+        """Return ``ClassName(p=...)``."""
+        return f"{type(self).__name__}(p={self.p})"
+
+
+class RandomEqualize(torch.nn.Module):
+    """Equalize the histogram of the image with probability ``p``.
+
+    Tensor inputs are expected to have [..., 1 or 3, H, W] shape, where ...
+    stands for an arbitrary number of leading dimensions.
+    PIL Image inputs are expected to be in mode "P", "L" or "RGB".
+
+    Args:
+        p (float): probability of the image being equalized. Default value is 0.5
+    """
+
+    def __init__(self, p=0.5):
+        super().__init__()
+        _log_api_usage_once(self)
+        self.p = p
+
+    def forward(self, img):
+        """Randomly equalize ``img``.
+
+        Args:
+            img (PIL Image or Tensor): Image to be equalized.
+
+        Returns:
+            PIL Image or Tensor: Randomly equalized image.
+        """
+        # A single random draw decides whether the operation is applied.
+        if torch.rand(1).item() < self.p:
+            result = F.equalize(img)
+        else:
+            result = img
+        return result
+
+    def __repr__(self) -> str:
+        """Return ``ClassName(p=...)``."""
+        return f"{type(self).__name__}(p={self.p})"
+
+
+class ElasticTransform(torch.nn.Module):
+    """Transform a tensor image with elastic transformations.
+    Given alpha and sigma, it will generate displacement
+    vectors for all pixels based on random offsets. Alpha controls the strength
+    and sigma controls the smoothness of the displacements.
+    The displacements are added to an identity grid and the resulting grid is
+    used to grid_sample from the image.
+
+    Applications:
+        Randomly transforms the morphology of objects in images and produces a
+        see-through-water-like effect.
+
+    Args:
+        alpha (float or sequence of floats): Magnitude of displacements. Default is 50.0.
+        sigma (float or sequence of floats): Smoothness of displacements. Default is 5.0.
+        interpolation (InterpolationMode): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        fill (sequence or number): Pixel fill value for the area outside the transformed
+            image. Default is ``0``. If given a number, the value is used for all bands respectively.
+
+    """
+
+    def __init__(self, alpha=50.0, sigma=5.0, interpolation=InterpolationMode.BILINEAR, fill=0):
+        """Validate the arguments and store them on the module.
+
+        ``alpha`` and ``sigma`` are each stored as a two-element list when a
+        single float is given; ``fill`` is always stored as a list of floats.
+
+        Raises:
+            TypeError: If ``alpha`` or ``sigma`` is neither a float nor a
+                sequence of floats, or if ``fill`` is not an int, float, list
+                or tuple.
+            ValueError: If ``alpha`` or ``sigma`` is a sequence whose length
+                is not 2.
+        """
+        super().__init__()
+        _log_api_usage_once(self)
+        # Only ``float`` is accepted as a scalar here, so a plain ``int`` is rejected.
+        if not isinstance(alpha, (float, Sequence)):
+            raise TypeError(f"alpha should be float or a sequence of floats. Got {type(alpha)}")
+        if isinstance(alpha, Sequence) and len(alpha) != 2:
+            raise ValueError(f"If alpha is a sequence its length should be 2. Got {len(alpha)}")
+        if isinstance(alpha, Sequence):
+            for element in alpha:
+                if not isinstance(element, float):
+                    raise TypeError(f"alpha should be a sequence of floats. Got {type(element)}")
+
+        if isinstance(alpha, float):
+            alpha = [float(alpha), float(alpha)]
+        # NOTE(review): sequences whose length is not 2 were rejected above,
+        # so this length-1 branch looks unreachable - confirm before removing.
+        if isinstance(alpha, (list, tuple)) and len(alpha) == 1:
+            alpha = [alpha[0], alpha[0]]
+
+        self.alpha = alpha
+
+        # ``sigma`` goes through the same checks as ``alpha``.
+        if not isinstance(sigma, (float, Sequence)):
+            raise TypeError(f"sigma should be float or a sequence of floats. Got {type(sigma)}")
+        if isinstance(sigma, Sequence) and len(sigma) != 2:
+            raise ValueError(f"If sigma is a sequence its length should be 2. Got {len(sigma)}")
+        if isinstance(sigma, Sequence):
+            for element in sigma:
+                if not isinstance(element, float):
+                    raise TypeError(f"sigma should be a sequence of floats. Got {type(element)}")
+
+        if isinstance(sigma, float):
+            sigma = [float(sigma), float(sigma)]
+        # NOTE(review): same apparently unreachable length-1 branch as for alpha.
+        if isinstance(sigma, (list, tuple)) and len(sigma) == 1:
+            sigma = [sigma[0], sigma[0]]
+
+        self.sigma = sigma
+
+        # Integer interpolation codes are converted before being stored.
+        if isinstance(interpolation, int):
+            interpolation = _interpolation_modes_from_int(interpolation)
+        self.interpolation = interpolation
+
+        if isinstance(fill, (int, float)):
+            fill = [float(fill)]
+        elif isinstance(fill, (list, tuple)):
+            fill = [float(f) for f in fill]
+        else:
+            raise TypeError(f"fill should be int or float or a list or tuple of them. Got {type(fill)}")
+        self.fill = fill
+
+    @staticmethod
+    def get_params(alpha: list[float], sigma: list[float], size: list[int]) -> Tensor:
+        """Generate a random displacement field.
+
+        Args:
+            alpha (list of float): two displacement magnitudes, one per field component.
+            sigma (list of float): two smoothing values; a component is only
+                blurred when its own sigma is greater than 0.
+            size (list of int): the two spatial extents of the field;
+                ``forward`` passes ``[height, width]``.
+
+        Returns:
+            Tensor: displacement of shape ``1 x H x W x 2``.
+        """
+        # Uniform noise in [-1, 1) with shape [1, 1] + size.
+        dx = torch.rand([1, 1] + size) * 2 - 1
+        if sigma[0] > 0.0:
+            kx = int(8 * sigma[0] + 1)
+            # if kernel size is even we have to make it odd
+            if kx % 2 == 0:
+                kx += 1
+            dx = F.gaussian_blur(dx, [kx, kx], sigma)
+        # NOTE(review): with size=[height, width] as passed by ``forward``,
+        # dx is divided by the height and dy (below) by the width - confirm
+        # this pairing is intended.
+        dx = dx * alpha[0] / size[0]
+
+        dy = torch.rand([1, 1] + size) * 2 - 1
+        if sigma[1] > 0.0:
+            ky = int(8 * sigma[1] + 1)
+            # if kernel size is even we have to make it odd
+            if ky % 2 == 0:
+                ky += 1
+            dy = F.gaussian_blur(dy, [ky, ky], sigma)
+        dy = dy * alpha[1] / size[1]
+        return torch.concat([dx, dy], 1).permute([0, 2, 3, 1])  # 1 x H x W x 2
+
+    def forward(self, tensor: Tensor) -> Tensor:
+        """Apply a freshly sampled elastic displacement to ``tensor``.
+
+        Args:
+            tensor (PIL Image or Tensor): Image to be transformed.
+
+        Returns:
+            PIL Image or Tensor: Transformed image.
+        """
+        _, height, width = F.get_dimensions(tensor)
+        displacement = self.get_params(self.alpha, self.sigma, [height, width])
+        return F.elastic_transform(tensor, displacement, self.interpolation, self.fill)
+
+    def __repr__(self):
+        """Return a constructor-like summary listing every stored argument."""
+        format_string = self.__class__.__name__
+        format_string += f"(alpha={self.alpha}"
+        format_string += f", sigma={self.sigma}"
+        format_string += f", interpolation={self.interpolation}"
+        format_string += f", fill={self.fill})"
+        return format_string
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..895bf6e2f711bb928934c45025f483e0fecb56d6
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__init__.py
@@ -0,0 +1,61 @@
+from torchvision.transforms import AutoAugmentPolicy, InterpolationMode  # usort: skip
+
+from . import functional  # usort: skip
+
+from ._transform import Transform  # usort: skip
+
+from ._augment import CutMix, JPEG, MixUp, RandomErasing
+from ._auto_augment import AugMix, AutoAugment, RandAugment, TrivialAugmentWide
+from ._color import (
+    ColorJitter,
+    Grayscale,
+    RandomAdjustSharpness,
+    RandomAutocontrast,
+    RandomChannelPermutation,
+    RandomEqualize,
+    RandomGrayscale,
+    RandomInvert,
+    RandomPhotometricDistort,
+    RandomPosterize,
+    RandomSolarize,
+    RGB,
+)
+from ._container import Compose, RandomApply, RandomChoice, RandomOrder
+from ._geometry import (
+    CenterCrop,
+    ElasticTransform,
+    FiveCrop,
+    Pad,
+    RandomAffine,
+    RandomCrop,
+    RandomHorizontalFlip,
+    RandomIoUCrop,
+    RandomPerspective,
+    RandomResize,
+    RandomResizedCrop,
+    RandomRotation,
+    RandomShortestSize,
+    RandomVerticalFlip,
+    RandomZoomOut,
+    Resize,
+    ScaleJitter,
+    TenCrop,
+)
+from ._meta import ClampBoundingBoxes, ClampKeyPoints, ConvertBoundingBoxFormat, SetClampingMode
+from ._misc import (
+    ConvertImageDtype,
+    GaussianBlur,
+    GaussianNoise,
+    Identity,
+    Lambda,
+    LinearTransformation,
+    Normalize,
+    SanitizeBoundingBoxes,
+    SanitizeKeyPoints,
+    ToDtype,
+)
+from ._temporal import UniformTemporalSubsample
+from ._type_conversion import PILToTensor, ToImage, ToPILImage, ToPureTensor
+from ._utils import check_type, get_bounding_boxes, get_keypoints, has_all, has_any, query_chw, query_size
+
+from ._deprecated import ToTensor  # usort: skip
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d9abb6cc949af1b7d03c7e3930b42806805ea198
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_augment.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_augment.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..75078bed76306ef653e2cd7c3dd392d8c7e976ca
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_augment.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_auto_augment.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_auto_augment.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3083dcc36d0ba8acd03f52a3e651fe06b677d14f
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_auto_augment.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_color.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_color.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..098ae17a8112709f5ee5a0a9ca57b58245a1aaa2
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_color.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_container.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_container.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a4a2d577b37cef93becebc3e9a17d1a2cd2c6087
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_container.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_deprecated.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_deprecated.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..27b95bd1efba2045ffbe4289d8bd2814948dd823
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_deprecated.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_geometry.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_geometry.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ed2715881d5d3ffe813fbf9add52c14fc859fdfd
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_geometry.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_meta.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_meta.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..76fb1a88883e842a8bee3b0aaec832fd5884917b
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_meta.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_misc.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_misc.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..65a2b65ecc94f52ebc785fc760c0a552ce6a6bd4
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_misc.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_temporal.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_temporal.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be8d885105024c583cf270c41cbe84d3b488ff16
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_temporal.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_transform.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_transform.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e812a8b393713650ba014d9c65eb3a6b35be9b8d
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_transform.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_type_conversion.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_type_conversion.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d838ef8763e9605fb92f942a1afe42ac87e46bbf
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_type_conversion.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2f1e039237665c0c7cf45ce1266c015f43a0b365
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/__pycache__/_utils.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_augment.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_augment.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6da9aba98bece914509ce5a2651ea1af495a72b
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_augment.py
@@ -0,0 +1,374 @@
+import math
+import numbers
+import warnings
+from collections.abc import Sequence
+from typing import Any, Callable, Optional, Union
+
+import PIL.Image
+import torch
+from torch.nn.functional import one_hot
+from torch.utils._pytree import tree_flatten, tree_unflatten
+from torchvision import transforms as _transforms, tv_tensors
+from torchvision.transforms.v2 import functional as F
+
+from ._transform import _RandomApplyTransform, Transform
+from ._utils import _check_sequence_input, _parse_labels_getter, has_any, is_pure_tensor, query_chw, query_size
+
+
+class RandomErasing(_RandomApplyTransform):
+    """Randomly select a rectangle region in the input image or video and erase its pixels.
+
+    This transform does not support PIL Image.
+    'Random Erasing Data Augmentation' by Zhong et al. See https://arxiv.org/abs/1708.04896
+
+    Args:
+        p (float, optional): probability that the random erasing operation will be performed.
+        scale (tuple of float, optional): range of proportion of erased area against input image.
+        ratio (tuple of float, optional): range of aspect ratio of erased area.
+        value (number or tuple of numbers): erasing value. Default is 0. If a single int, it is used to
+            erase all pixels. If a tuple of length 3, it is used to erase
+            R, G, B channels respectively.
+            If a str of 'random', erasing each pixel with random values.
+        inplace (bool, optional): boolean to make this transform inplace. Default set to False.
+
+    Returns:
+        Erased input.
+
+    Example:
+        >>> from torchvision.transforms import v2 as transforms
+        >>>
+        >>> transform = transforms.Compose([
+        >>>   transforms.RandomHorizontalFlip(),
+        >>>   transforms.PILToTensor(),
+        >>>   transforms.ConvertImageDtype(torch.float),
+        >>>   transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+        >>>   transforms.RandomErasing(),
+        >>> ])
+    """
+
+    _v1_transform_cls = _transforms.RandomErasing  # v1 (torchvision.transforms) class paired with this transform
+
+    def _extract_params_for_v1_transform(self) -> dict[str, Any]:
+        return dict(
+            super()._extract_params_for_v1_transform(),
+            value="random" if self.value is None else self.value,  # __init__ stores "random" as None; undo that here
+        )
+
+    def __init__(
+        self,
+        p: float = 0.5,
+        scale: Sequence[float] = (0.02, 0.33),
+        ratio: Sequence[float] = (0.3, 3.3),
+        value: float = 0.0,
+        inplace: bool = False,
+    ):
+        super().__init__(p=p)
+        if not isinstance(value, (numbers.Number, str, tuple, list)):
+            raise TypeError("Argument value should be either a number or str or a sequence")
+        if isinstance(value, str) and value != "random":
+            raise ValueError("If value is str, it should be 'random'")
+        if not isinstance(scale, Sequence):
+            raise TypeError("Scale should be a sequence")
+        if not isinstance(ratio, Sequence):
+            raise TypeError("Ratio should be a sequence")
+        if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):  # inverted ranges only warn; they are not rejected
+            warnings.warn("Scale and ratio should be of kind (min, max)")
+        if scale[0] < 0 or scale[1] > 1:  # only scale is range-checked; ratio has no bounds check here
+            raise ValueError("Scale should be between 0 and 1")
+        self.scale = scale
+        self.ratio = ratio
+        if isinstance(value, (int, float)):
+            self.value = [float(value)]  # scalar -> one-element list
+        elif isinstance(value, str):
+            self.value = None  # None is the internal marker for value="random"
+        elif isinstance(value, (list, tuple)):
+            self.value = [float(v) for v in value]
+        else:
+            self.value = value  # only reached by numbers.Number values that are neither int nor float
+        self.inplace = inplace
+
+        self._log_ratio = torch.log(torch.tensor(self.ratio))  # make_params samples the aspect ratio in log space
+
+    def _call_kernel(self, functional: Callable, inpt: Any, *args: Any, **kwargs: Any) -> Any:
+        if isinstance(inpt, (tv_tensors.BoundingBoxes, tv_tensors.KeyPoints, tv_tensors.Mask)):
+            warnings.warn(
+                f"{type(self).__name__}() is currently passing through inputs of type "
+                f"tv_tensors.{type(inpt).__name__}. This will likely change in the future."
+            )
+        return super()._call_kernel(functional, inpt, *args, **kwargs)  # warning or not, defer to the parent
+
+    def make_params(self, flat_inputs: list[Any]) -> dict[str, Any]:
+        img_c, img_h, img_w = query_chw(flat_inputs)
+
+        if self.value is not None and len(self.value) not in (1, img_c):  # fixed fill needs 1 entry or one per channel
+            raise ValueError(
+                f"If value is a sequence, it should have either a single value or {img_c} (number of inpt channels)"
+            )
+
+        area = img_h * img_w
+
+        log_ratio = self._log_ratio
+        for _ in range(10):  # at most 10 attempts to draw a box that fits
+            erase_area = area * torch.empty(1).uniform_(self.scale[0], self.scale[1]).item()
+            aspect_ratio = torch.exp(
+                torch.empty(1).uniform_(
+                    log_ratio[0],  # type: ignore[arg-type]
+                    log_ratio[1],  # type: ignore[arg-type]
+                )
+            ).item()
+
+            h = int(round(math.sqrt(erase_area * aspect_ratio)))
+            w = int(round(math.sqrt(erase_area / aspect_ratio)))
+            if not (h < img_h and w < img_w):  # strict "<": a box as tall or as wide as the image is rejected
+                continue
+
+            if self.value is None:
+                v = torch.empty([img_c, h, w], dtype=torch.float32).normal_()  # "random" fill: normal noise per pixel
+            else:
+                v = torch.tensor(self.value)[:, None, None]  # add two trailing axes of size 1
+
+            i = torch.randint(0, img_h - h + 1, size=(1,)).item()  # top row; randint's upper bound is exclusive
+            j = torch.randint(0, img_w - w + 1, size=(1,)).item()  # left column
+            break
+        else:  # for/else: runs only if no attempt reached "break"
+            i, j, h, w, v = 0, 0, img_h, img_w, None  # v=None makes transform() return the input unchanged
+
+        return dict(i=i, j=j, h=h, w=w, v=v)
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        if params["v"] is not None:  # None means make_params found no fitting box
+            inpt = self._call_kernel(F.erase, inpt, **params, inplace=self.inplace)
+
+        return inpt
+
+
+class _BaseMixUpCutMix(Transform):
+    def __init__(self, *, alpha: float = 1.0, num_classes: Optional[int] = None, labels_getter="default") -> None:
+        super().__init__()
+        self.alpha = float(alpha)  # float, so an integer alpha still yields floating-point Beta parameters below
+        self._dist = torch.distributions.Beta(torch.tensor([self.alpha]), torch.tensor([self.alpha]))
+
+        self.num_classes = num_classes
+
+        self._labels_getter = _parse_labels_getter(labels_getter)
+
+    def forward(self, *inputs):
+        inputs = inputs if len(inputs) > 1 else inputs[0]  # a single positional argument is unwrapped
+        flat_inputs, spec = tree_flatten(inputs)
+        needs_transform_list = self._needs_transform_list(flat_inputs)
+
+        if has_any(flat_inputs, PIL.Image.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask, tv_tensors.KeyPoints):
+            raise ValueError(
+                f"{type(self).__name__}() does not support PIL images, bounding boxes, keypoints and masks."
+            )
+
+        labels = self._labels_getter(inputs)  # the getter receives the un-flattened inputs
+        if not isinstance(labels, torch.Tensor):
+            raise ValueError(f"The labels must be a tensor, but got {type(labels)} instead.")
+        if labels.ndim not in (1, 2):
+            raise ValueError(
+                f"labels should be index based with shape (batch_size,) "
+                f"or probability based with shape (batch_size, num_classes), "
+                f"but got a tensor of shape {labels.shape} instead."
+            )
+        if labels.ndim == 2 and self.num_classes is not None and labels.shape[-1] != self.num_classes:
+            raise ValueError(
+                f"When passing 2D labels, "
+                f"the number of elements in last dimension must match num_classes: "
+                f"{labels.shape[-1]} != {self.num_classes}. "
+                f"You can Leave num_classes to None."
+            )
+        if labels.ndim == 1 and self.num_classes is None:
+            raise ValueError("num_classes must be passed if the labels are index-based (1D)")
+
+        params = {
+            "labels": labels,
+            "batch_size": labels.shape[0],
+            **self.make_params(  # subclass hook; its entries are merged into the shared params
+                [inpt for (inpt, needs_transform) in zip(flat_inputs, needs_transform_list) if needs_transform]
+            ),
+        }
+
+        # By default, the labels will be False inside needs_transform_list, since they are a torch.Tensor coming
+        # after an image or video. However, we need to handle them in _transform, so we make sure to set them to True
+        needs_transform_list[next(idx for idx, inpt in enumerate(flat_inputs) if inpt is labels)] = True  # identity match
+        flat_outputs = [
+            self.transform(inpt, params) if needs_transform else inpt
+            for (inpt, needs_transform) in zip(flat_inputs, needs_transform_list)
+        ]
+
+        return tree_unflatten(flat_outputs, spec)
+
+    def _check_image_or_video(self, inpt: torch.Tensor, *, batch_size: int):
+        expected_num_dims = 5 if isinstance(inpt, tv_tensors.Video) else 4  # 5 for tv_tensors.Video, 4 otherwise
+        if inpt.ndim != expected_num_dims:
+            raise ValueError(
+                f"Expected a batched input with {expected_num_dims} dims, but got {inpt.ndim} dimensions instead."
+            )
+        if inpt.shape[0] != batch_size:  # dim 0 must agree with labels.shape[0]
+            raise ValueError(
+                f"The batch size of the image or video does not match the batch size of the labels: "
+                f"{inpt.shape[0]} != {batch_size}."
+            )
+
+    def _mixup_label(self, label: torch.Tensor, *, lam: float) -> torch.Tensor:
+        if label.ndim == 1:  # 1D index labels are one-hot encoded first
+            label = one_hot(label, num_classes=self.num_classes)  # type: ignore[arg-type]
+        if not label.dtype.is_floating_point:
+            label = label.float()
+        return label.roll(1, 0).mul_(1.0 - lam).add_(label.mul(lam))  # in-place ops act on the rolled copy only
+
+
+class MixUp(_BaseMixUpCutMix):
+    """Apply MixUp to the provided batch of images and labels.
+
+    Paper: `mixup: Beyond Empirical Risk Minimization <https://arxiv.org/abs/1710.09412>`_.
+
+    .. note::
+        This transform is meant to be used on **batches** of samples, not
+        individual images. See
+        :ref:`sphx_glr_auto_examples_transforms_plot_cutmix_mixup.py` for detailed usage
+        examples.
+        The sample pairing is deterministic and done by matching consecutive
+        samples in the batch, so the batch needs to be shuffled (this is an
+        implementation detail, not a guaranteed convention.)
+
+    In the input, the labels are expected to be a tensor of shape ``(batch_size,)``. They will be transformed
+    into a tensor of shape ``(batch_size, num_classes)``.
+
+    Args:
+        alpha (float, optional): hyperparameter of the Beta distribution used for mixup. Default is 1.
+        num_classes (int, optional): number of classes in the batch. Used for one-hot-encoding.
+            Can be None only if the labels are already one-hot-encoded.
+        labels_getter (callable or "default", optional): indicates how to identify the labels in the input.
+            By default, this will pick the second parameter as the labels if it's a tensor. This covers the most
+            common scenario where this transform is called as ``MixUp()(imgs_batch, labels_batch)``.
+            It can also be a callable that takes the same input as the transform, and returns the labels.
+    """
+
+    def make_params(self, flat_inputs: list[Any]) -> dict[str, Any]:
+        return dict(lam=float(self._dist.sample(())))  # type: ignore[arg-type]
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        lam = params["lam"]  # one mixing coefficient per call, drawn in make_params
+
+        if inpt is params["labels"]:  # identity check against the labels object found in forward()
+            return self._mixup_label(inpt, lam=lam)
+        elif isinstance(inpt, (tv_tensors.Image, tv_tensors.Video)) or is_pure_tensor(inpt):
+            self._check_image_or_video(inpt, batch_size=params["batch_size"])
+
+            output = inpt.roll(1, 0).mul_(1.0 - lam).add_(inpt.mul(lam))  # lam * x + (1 - lam) * (x rolled by 1 on dim 0)
+
+            if isinstance(inpt, (tv_tensors.Image, tv_tensors.Video)):
+                output = tv_tensors.wrap(output, like=inpt)  # re-wrap the result like the input
+
+            return output
+        else:
+            return inpt  # every other input type is returned unchanged
+
+
+class CutMix(_BaseMixUpCutMix):
+    """Apply CutMix to the provided batch of images and labels.
+
+    Paper: `CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features
+    <https://arxiv.org/abs/1905.04899>`_.
+
+    .. note::
+        This transform is meant to be used on **batches** of samples, not
+        individual images. See
+        :ref:`sphx_glr_auto_examples_transforms_plot_cutmix_mixup.py` for detailed usage
+        examples.
+        The sample pairing is deterministic and done by matching consecutive
+        samples in the batch, so the batch needs to be shuffled (this is an
+        implementation detail, not a guaranteed convention.)
+
+    In the input, the labels are expected to be a tensor of shape ``(batch_size,)``. They will be transformed
+    into a tensor of shape ``(batch_size, num_classes)``.
+
+    Args:
+        alpha (float, optional): hyperparameter of the Beta distribution used for mixup. Default is 1.
+        num_classes (int, optional): number of classes in the batch. Used for one-hot-encoding.
+            Can be None only if the labels are already one-hot-encoded.
+        labels_getter (callable or "default", optional): indicates how to identify the labels in the input.
+            By default, this will pick the second parameter as the labels if it's a tensor. This covers the most
+            common scenario where this transform is called as ``CutMix()(imgs_batch, labels_batch)``.
+            It can also be a callable that takes the same input as the transform, and returns the labels.
+    """
+
+    def make_params(self, flat_inputs: list[Any]) -> dict[str, Any]:
+        lam = float(self._dist.sample(()))  # type: ignore[arg-type]
+
+        H, W = query_size(flat_inputs)
+
+        r_x = torch.randint(W, size=(1,))  # box centre x, drawn from [0, W)
+        r_y = torch.randint(H, size=(1,))  # box centre y, drawn from [0, H)
+
+        r = 0.5 * math.sqrt(1.0 - lam)  # half-extent fraction: an unclipped box covers (1 - lam) of the area
+        r_w_half = int(r * W)
+        r_h_half = int(r * H)
+
+        x1 = int(torch.clamp(r_x - r_w_half, min=0))  # clip the box to the image bounds
+        y1 = int(torch.clamp(r_y - r_h_half, min=0))
+        x2 = int(torch.clamp(r_x + r_w_half, max=W))
+        y2 = int(torch.clamp(r_y + r_h_half, max=H))
+        box = (x1, y1, x2, y2)
+
+        lam_adjusted = float(1.0 - (x2 - x1) * (y2 - y1) / (W * H))  # recomputed from the final (clipped) box area
+
+        return dict(box=box, lam_adjusted=lam_adjusted)  # the sampled lam itself is not returned
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        if inpt is params["labels"]:  # identity check against the labels object found in forward()
+            return self._mixup_label(inpt, lam=params["lam_adjusted"])
+        elif isinstance(inpt, (tv_tensors.Image, tv_tensors.Video)) or is_pure_tensor(inpt):
+            self._check_image_or_video(inpt, batch_size=params["batch_size"])
+
+            x1, y1, x2, y2 = params["box"]
+            rolled = inpt.roll(1, 0)  # sample k is paired with sample k-1 (wrapping around)
+            output = inpt.clone()  # work on a copy so the input tensor is not modified
+            output[..., y1:y2, x1:x2] = rolled[..., y1:y2, x1:x2]  # same box is pasted for every sample
+
+            if isinstance(inpt, (tv_tensors.Image, tv_tensors.Video)):
+                output = tv_tensors.wrap(output, like=inpt)  # re-wrap the result like the input
+
+            return output
+        else:
+            return inpt  # every other input type is returned unchanged
+
+
+class JPEG(Transform):
+    """Apply JPEG compression and decompression to the given images.
+
+    If the input is a :class:`torch.Tensor`, it is expected
+    to be of dtype uint8, on CPU, and have [..., 3 or 1, H, W] shape,
+    where ... means an arbitrary number of leading dimensions.
+
+    Args:
+        quality (sequence or number): JPEG quality, from 1 to 100. Lower means more compression.
+            If quality is a sequence like (min, max), it specifies the range of JPEG quality to
+            randomly select from (inclusive of both ends).
+
+    Returns:
+        image with JPEG compression.
+    """
+
+    def __init__(self, quality: Union[int, Sequence[int]]):
+        super().__init__()
+        if isinstance(quality, int):
+            if isinstance(quality, bool):  # bool is a subclass of int, so it is rejected explicitly
+                raise TypeError("quality can't be bool")
+            quality = [quality, quality]  # a single int becomes the degenerate range [q, q]
+        else:
+            _check_sequence_input(quality, "quality", req_sizes=(2,))  # NOTE(review): bools inside a sequence are not rejected here
+
+        if not (1 <= quality[0] <= quality[1] <= 100 and isinstance(quality[0], int) and isinstance(quality[1], int)):
+            raise ValueError(f"quality must be an integer from 1 to 100, got {quality =}")
+
+        self.quality = quality
+
+    def make_params(self, flat_inputs: list[Any]) -> dict[str, Any]:
+        quality = torch.randint(self.quality[0], self.quality[1] + 1, ()).item()  # + 1: randint's upper bound is exclusive
+        return dict(quality=quality)
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(F.jpeg, inpt, quality=params["quality"])  # quality was drawn once in make_params
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_auto_augment.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_auto_augment.py
new file mode 100644
index 0000000000000000000000000000000000000000..52707af1f2e8be5d3fbfe4570455beeb13560f6f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_auto_augment.py
@@ -0,0 +1,631 @@
+import math
+from typing import Any, Callable, cast, Optional, Union
+
+import PIL.Image
+import torch
+
+from torch.utils._pytree import tree_flatten, tree_unflatten, TreeSpec
+from torchvision import transforms as _transforms, tv_tensors
+from torchvision.transforms import _functional_tensor as _FT
+from torchvision.transforms.v2 import AutoAugmentPolicy, functional as F, InterpolationMode, Transform
+from torchvision.transforms.v2.functional._geometry import _check_interpolation
+from torchvision.transforms.v2.functional._meta import get_size
+from torchvision.transforms.v2.functional._utils import _FillType, _FillTypeJIT
+
+from ._utils import _get_fill, _setup_fill_arg, check_type, is_pure_tensor
+
+
+ImageOrVideo = Union[torch.Tensor, PIL.Image.Image, tv_tensors.Image, tv_tensors.Video]
+
+
+class _AutoAugmentBase(Transform):
+    def __init__(
+        self,
+        *,
+        interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST,
+        fill: Union[_FillType, dict[Union[type, str], _FillType]] = None,
+    ) -> None:
+        super().__init__()
+        self.interpolation = _check_interpolation(interpolation)
+        self.fill = fill  # raw argument, kept as given
+        self._fill = _setup_fill_arg(fill)  # processed form; semantics are defined by _setup_fill_arg in ._utils
+
+    def _extract_params_for_v1_transform(self) -> dict[str, Any]:
+        params = super()._extract_params_for_v1_transform()
+
+        if isinstance(params["fill"], dict):  # a dict fill has no v1 equivalent here, so it is refused
+            raise ValueError(f"{type(self).__name__}() can not be scripted for when `fill` is a dictionary.")
+
+        return params
+
+    def _get_random_item(self, dct: dict[str, tuple[Callable, bool]]) -> tuple[str, tuple[Callable, bool]]:
+        keys = tuple(dct.keys())
+        key = keys[int(torch.randint(len(keys), ()))]  # uniform random index into the keys
+        return key, dct[key]
+
+    def _flatten_and_extract_image_or_video(
+        self,
+        inputs: Any,
+        unsupported_types: tuple[type, ...] = (tv_tensors.BoundingBoxes, tv_tensors.Mask, tv_tensors.KeyPoints),
+    ) -> tuple[tuple[list[Any], TreeSpec, int], ImageOrVideo]:
+        flat_inputs, spec = tree_flatten(inputs if len(inputs) > 1 else inputs[0])  # a single argument is unwrapped
+        needs_transform_list = self._needs_transform_list(flat_inputs)
+
+        image_or_videos = []  # (flat index, input) pairs of the image/video candidates
+        for idx, (inpt, needs_transform) in enumerate(zip(flat_inputs, needs_transform_list)):
+            if needs_transform and check_type(
+                inpt,
+                (
+                    tv_tensors.Image,
+                    PIL.Image.Image,
+                    is_pure_tensor,
+                    tv_tensors.Video,
+                ),
+            ):
+                image_or_videos.append((idx, inpt))
+            elif isinstance(inpt, unsupported_types):  # checked only for inputs not collected above
+                raise TypeError(f"Inputs of type {type(inpt).__name__} are not supported by {type(self).__name__}()")
+
+        if not image_or_videos:
+            raise TypeError("Found no image in the sample.")
+        if len(image_or_videos) > 1:  # exactly one image or video is allowed per sample
+            raise TypeError(
+                f"Auto augment transformations are only properly defined for a single image or video, "
+                f"but found {len(image_or_videos)}."
+            )
+
+        idx, image_or_video = image_or_videos[0]
+        return (flat_inputs, spec, idx), image_or_video  # the triple is what _unflatten_and_insert_image_or_video expects
+
+    def _unflatten_and_insert_image_or_video(
+        self,
+        flat_inputs_with_spec: tuple[list[Any], TreeSpec, int],
+        image_or_video: ImageOrVideo,
+    ) -> Any:
+        flat_inputs, spec, idx = flat_inputs_with_spec
+        flat_inputs[idx] = image_or_video  # overwrites the slot in the passed-in list (mutates it in place)
+        return tree_unflatten(flat_inputs, spec)
+
+    def _apply_image_or_video_transform(
+        self,
+        image: ImageOrVideo,
+        transform_id: str,
+        magnitude: float,
+        interpolation: Union[InterpolationMode, int],
+        fill: dict[Union[type, str], _FillTypeJIT],
+    ) -> ImageOrVideo:
+        # Note: this cast is wrong and is only here to make mypy happy (it disagrees with torchscript)
+        image = cast(torch.Tensor, image)
+        fill_ = _get_fill(fill, type(image))  # fill is looked up by the input's type
+
+        if transform_id == "Identity":
+            return image
+        elif transform_id == "ShearX":
+            # magnitude should be arctan(magnitude)
+            # official autoaug: (1, level, 0, 0, 1, 0)
+            # https://github.com/tensorflow/models/blob/dd02069717128186b88afa8d857ce57d17957f03/research/autoaugment/augmentation_transforms.py#L290
+            # compared to
+            # torchvision:      (1, tan(level), 0, 0, 1, 0)
+            # https://github.com/pytorch/vision/blob/0c2373d0bba3499e95776e7936e207d8a1676e65/torchvision/transforms/functional.py#L976
+            return F.affine(
+                image,
+                angle=0.0,
+                translate=[0, 0],
+                scale=1.0,
+                shear=[math.degrees(math.atan(magnitude)), 0.0],
+                interpolation=interpolation,
+                fill=fill_,
+                center=[0, 0],
+            )
+        elif transform_id == "ShearY":
+            # magnitude should be arctan(magnitude)
+            # See above
+            return F.affine(
+                image,
+                angle=0.0,
+                translate=[0, 0],
+                scale=1.0,
+                shear=[0.0, math.degrees(math.atan(magnitude))],
+                interpolation=interpolation,
+                fill=fill_,
+                center=[0, 0],
+            )
+        elif transform_id == "TranslateX":
+            return F.affine(
+                image,
+                angle=0.0,
+                translate=[int(magnitude), 0],  # magnitude is truncated to an int here
+                scale=1.0,
+                interpolation=interpolation,
+                shear=[0.0, 0.0],
+                fill=fill_,
+            )
+        elif transform_id == "TranslateY":
+            return F.affine(
+                image,
+                angle=0.0,
+                translate=[0, int(magnitude)],  # magnitude is truncated to an int here
+                scale=1.0,
+                interpolation=interpolation,
+                shear=[0.0, 0.0],
+                fill=fill_,
+            )
+        elif transform_id == "Rotate":
+            return F.rotate(image, angle=magnitude, interpolation=interpolation, fill=fill_)
+        elif transform_id == "Brightness":
+            return F.adjust_brightness(image, brightness_factor=1.0 + magnitude)  # magnitude is an offset from factor 1.0
+        elif transform_id == "Color":
+            return F.adjust_saturation(image, saturation_factor=1.0 + magnitude)  # "Color" maps to saturation
+        elif transform_id == "Contrast":
+            return F.adjust_contrast(image, contrast_factor=1.0 + magnitude)
+        elif transform_id == "Sharpness":
+            return F.adjust_sharpness(image, sharpness_factor=1.0 + magnitude)
+        elif transform_id == "Posterize":
+            return F.posterize(image, bits=int(magnitude))
+        elif transform_id == "Solarize":
+            bound = _FT._max_value(image.dtype) if isinstance(image, torch.Tensor) else 255.0  # 255.0 for non-tensor inputs
+            return F.solarize(image, threshold=bound * magnitude)  # magnitude scales that bound into a threshold
+        elif transform_id == "AutoContrast":
+            return F.autocontrast(image)  # this op and the two below ignore magnitude
+        elif transform_id == "Equalize":
+            return F.equalize(image)
+        elif transform_id == "Invert":
+            return F.invert(image)
+        else:
+            raise ValueError(f"No transform available for {transform_id}")
+
+
+class AutoAugment(_AutoAugmentBase):
+    r"""AutoAugment data augmentation method based on
+    `"AutoAugment: Learning Augmentation Strategies from Data" <https://arxiv.org/pdf/1805.09501.pdf>`_.
+
+    This transformation works on images and videos only.
+
+    If the input is :class:`torch.Tensor`, it should be of type ``torch.uint8``, and it is expected
+    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
+    If img is PIL Image, it is expected to be in mode "L" or "RGB".
+
+    Args:
+        policy (AutoAugmentPolicy, optional): Desired policy enum defined by
+            :class:`torchvision.transforms.autoaugment.AutoAugmentPolicy`. Default is ``AutoAugmentPolicy.IMAGENET``.
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
+        fill (sequence or number, optional): Pixel fill value for the area outside the transformed
+            image. If given a number, the value is used for all bands respectively.
+    """
+
+    _v1_transform_cls = _transforms.AutoAugment
+
+    _AUGMENTATION_SPACE = {  # op name -> (fn(num_bins, height, width) -> magnitude table or None, signed flag)
+        "ShearX": (lambda num_bins, height, width: torch.linspace(0.0, 0.3, num_bins), True),
+        "ShearY": (lambda num_bins, height, width: torch.linspace(0.0, 0.3, num_bins), True),
+        "TranslateX": (  # range scales with the image width
+            lambda num_bins, height, width: torch.linspace(0.0, 150.0 / 331.0 * width, num_bins),
+            True,
+        ),
+        "TranslateY": (  # range scales with the image height
+            lambda num_bins, height, width: torch.linspace(0.0, 150.0 / 331.0 * height, num_bins),
+            True,
+        ),
+        "Rotate": (lambda num_bins, height, width: torch.linspace(0.0, 30.0, num_bins), True),
+        "Brightness": (lambda num_bins, height, width: torch.linspace(0.0, 0.9, num_bins), True),
+        "Color": (lambda num_bins, height, width: torch.linspace(0.0, 0.9, num_bins), True),
+        "Contrast": (lambda num_bins, height, width: torch.linspace(0.0, 0.9, num_bins), True),
+        "Sharpness": (lambda num_bins, height, width: torch.linspace(0.0, 0.9, num_bins), True),
+        "Posterize": (  # integer values falling from 8 (first bin) to 4 (last bin)
+            lambda num_bins, height, width: (8 - (torch.arange(num_bins) / ((num_bins - 1) / 4))).round().int(),
+            False,
+        ),
+        "Solarize": (lambda num_bins, height, width: torch.linspace(1.0, 0.0, num_bins), False),  # decreasing from 1.0 to 0.0
+        "AutoContrast": (lambda num_bins, height, width: None, False),  # None: no magnitude table for this op
+        "Equalize": (lambda num_bins, height, width: None, False),
+        "Invert": (lambda num_bins, height, width: None, False),
+    }
+
+    def __init__(
+        self,
+        policy: AutoAugmentPolicy = AutoAugmentPolicy.IMAGENET,
+        interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST,
+        fill: Union[_FillType, dict[Union[type, str], _FillType]] = None,
+    ) -> None:
+        super().__init__(interpolation=interpolation, fill=fill)
+        self.policy = policy
+        self._policies = self._get_policies(policy)  # resolve the enum to its sub-policy list once, at construction
+
+    def _get_policies(
+        self, policy: AutoAugmentPolicy
+    ) -> list[tuple[tuple[str, float, Optional[int]], tuple[str, float, Optional[int]]]]:  # pairs of (op name, probability, magnitude bin index or None)
+        if policy == AutoAugmentPolicy.IMAGENET:
+            return [
+                (("Posterize", 0.4, 8), ("Rotate", 0.6, 9)),
+                (("Solarize", 0.6, 5), ("AutoContrast", 0.6, None)),
+                (("Equalize", 0.8, None), ("Equalize", 0.6, None)),
+                (("Posterize", 0.6, 7), ("Posterize", 0.6, 6)),
+                (("Equalize", 0.4, None), ("Solarize", 0.2, 4)),
+                (("Equalize", 0.4, None), ("Rotate", 0.8, 8)),
+                (("Solarize", 0.6, 3), ("Equalize", 0.6, None)),
+                (("Posterize", 0.8, 5), ("Equalize", 1.0, None)),
+                (("Rotate", 0.2, 3), ("Solarize", 0.6, 8)),
+                (("Equalize", 0.6, None), ("Posterize", 0.4, 6)),
+                (("Rotate", 0.8, 8), ("Color", 0.4, 0)),
+                (("Rotate", 0.4, 9), ("Equalize", 0.6, None)),
+                (("Equalize", 0.0, None), ("Equalize", 0.8, None)),
+                (("Invert", 0.6, None), ("Equalize", 1.0, None)),
+                (("Color", 0.6, 4), ("Contrast", 1.0, 8)),
+                (("Rotate", 0.8, 8), ("Color", 1.0, 2)),
+                (("Color", 0.8, 8), ("Solarize", 0.8, 7)),
+                (("Sharpness", 0.4, 7), ("Invert", 0.6, None)),
+                (("ShearX", 0.6, 5), ("Equalize", 1.0, None)),
+                (("Color", 0.4, 0), ("Equalize", 0.6, None)),
+                (("Equalize", 0.4, None), ("Solarize", 0.2, 4)),
+                (("Solarize", 0.6, 5), ("AutoContrast", 0.6, None)),
+                (("Invert", 0.6, None), ("Equalize", 1.0, None)),
+                (("Color", 0.6, 4), ("Contrast", 1.0, 8)),
+                (("Equalize", 0.8, None), ("Equalize", 0.6, None)),
+            ]
+        elif policy == AutoAugmentPolicy.CIFAR10:
+            return [
+                (("Invert", 0.1, None), ("Contrast", 0.2, 6)),
+                (("Rotate", 0.7, 2), ("TranslateX", 0.3, 9)),
+                (("Sharpness", 0.8, 1), ("Sharpness", 0.9, 3)),
+                (("ShearY", 0.5, 8), ("TranslateY", 0.7, 9)),
+                (("AutoContrast", 0.5, None), ("Equalize", 0.9, None)),
+                (("ShearY", 0.2, 7), ("Posterize", 0.3, 7)),
+                (("Color", 0.4, 3), ("Brightness", 0.6, 7)),
+                (("Sharpness", 0.3, 9), ("Brightness", 0.7, 9)),
+                (("Equalize", 0.6, None), ("Equalize", 0.5, None)),
+                (("Contrast", 0.6, 7), ("Sharpness", 0.6, 5)),
+                (("Color", 0.7, 7), ("TranslateX", 0.5, 8)),
+                (("Equalize", 0.3, None), ("AutoContrast", 0.4, None)),
+                (("TranslateY", 0.4, 3), ("Sharpness", 0.2, 6)),
+                (("Brightness", 0.9, 6), ("Color", 0.2, 8)),
+                (("Solarize", 0.5, 2), ("Invert", 0.0, None)),
+                (("Equalize", 0.2, None), ("AutoContrast", 0.6, None)),
+                (("Equalize", 0.2, None), ("Equalize", 0.6, None)),
+                (("Color", 0.9, 9), ("Equalize", 0.6, None)),
+                (("AutoContrast", 0.8, None), ("Solarize", 0.2, 8)),
+                (("Brightness", 0.1, 3), ("Color", 0.7, 0)),
+                (("Solarize", 0.4, 5), ("AutoContrast", 0.9, None)),
+                (("TranslateY", 0.9, 9), ("TranslateY", 0.7, 9)),
+                (("AutoContrast", 0.9, None), ("Solarize", 0.8, 3)),
+                (("Equalize", 0.8, None), ("Invert", 0.1, None)),
+                (("TranslateY", 0.7, 9), ("AutoContrast", 0.9, None)),
+            ]
+        elif policy == AutoAugmentPolicy.SVHN:
+            return [
+                (("ShearX", 0.9, 4), ("Invert", 0.2, None)),
+                (("ShearY", 0.9, 8), ("Invert", 0.7, None)),
+                (("Equalize", 0.6, None), ("Solarize", 0.6, 6)),
+                (("Invert", 0.9, None), ("Equalize", 0.6, None)),
+                (("Equalize", 0.6, None), ("Rotate", 0.9, 3)),
+                (("ShearX", 0.9, 4), ("AutoContrast", 0.8, None)),
+                (("ShearY", 0.9, 8), ("Invert", 0.4, None)),
+                (("ShearY", 0.9, 5), ("Solarize", 0.2, 6)),
+                (("Invert", 0.9, None), ("AutoContrast", 0.8, None)),
+                (("Equalize", 0.6, None), ("Rotate", 0.9, 3)),
+                (("ShearX", 0.9, 4), ("Solarize", 0.3, 3)),
+                (("ShearY", 0.8, 8), ("Invert", 0.7, None)),
+                (("Equalize", 0.9, None), ("TranslateY", 0.6, 6)),
+                (("Invert", 0.9, None), ("Equalize", 0.6, None)),
+                (("Contrast", 0.3, 3), ("Rotate", 0.8, 4)),
+                (("Invert", 0.8, None), ("TranslateY", 0.0, 2)),
+                (("ShearY", 0.7, 6), ("Solarize", 0.4, 8)),
+                (("Invert", 0.6, None), ("Rotate", 0.8, 4)),
+                (("ShearY", 0.3, 7), ("TranslateX", 0.9, 3)),
+                (("ShearX", 0.1, 6), ("Invert", 0.6, None)),
+                (("Solarize", 0.7, 2), ("TranslateY", 0.6, 7)),
+                (("ShearY", 0.8, 4), ("Invert", 0.8, None)),
+                (("ShearX", 0.7, 9), ("TranslateY", 0.8, 3)),
+                (("ShearY", 0.8, 5), ("AutoContrast", 0.7, None)),
+                (("ShearX", 0.7, 2), ("Invert", 0.1, None)),
+            ]
+        else:
+            raise ValueError(f"The provided policy {policy} is not recognized.")
+
+    def forward(self, *inputs: Any) -> Any:
+        flat_inputs_with_spec, image_or_video = self._flatten_and_extract_image_or_video(inputs)
+        height, width = get_size(image_or_video)  # type: ignore[arg-type]
+
+        policy = self._policies[int(torch.randint(len(self._policies), ()))]  # one sub-policy, drawn at random per call
+
+        for transform_id, probability, magnitude_idx in policy:
+            if not torch.rand(()) <= probability:  # each op of the sub-policy is applied only with its listed probability
+                continue
+
+            magnitudes_fn, signed = self._AUGMENTATION_SPACE[transform_id]
+
+            magnitudes = magnitudes_fn(10, height, width)  # AutoAugment always builds 10 magnitude bins
+            if magnitudes is not None:
+                magnitude = float(magnitudes[magnitude_idx])
+                if signed and torch.rand(()) <= 0.5:  # signed ops get a randomly negated magnitude
+                    magnitude *= -1
+            else:
+                magnitude = 0.0  # placeholder passed for ops that have no magnitude table
+
+            image_or_video = self._apply_image_or_video_transform(
+                image_or_video, transform_id, magnitude, interpolation=self.interpolation, fill=self._fill
+            )
+
+        return self._unflatten_and_insert_image_or_video(flat_inputs_with_spec, image_or_video)
+
+
+class RandAugment(_AutoAugmentBase):
+    r"""RandAugment data augmentation method based on
+    `"RandAugment: Practical automated data augmentation with a reduced search space"
+    <https://arxiv.org/abs/1909.13719>`_.
+
+    This transformation works on images and videos only.
+
+    If the input is :class:`torch.Tensor`, it should be of type ``torch.uint8``, and it is expected
+    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
+    If img is PIL Image, it is expected to be in mode "L" or "RGB".
+
+    Args:
+        num_ops (int, optional): Number of augmentation transformations to apply sequentially,
+            must be non-negative integer. Default: 2.
+        magnitude (int, optional): Magnitude for all the transformations, used as an index into the magnitude bins. Default: 9.
+        num_magnitude_bins (int, optional): The number of different magnitude values. Default: 31.
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
+        fill (sequence or number, optional): Pixel fill value for the area outside the transformed
+            image. If given a number, the value is used for all bands respectively.
+    """
+
+    _v1_transform_cls = _transforms.RandAugment
+    _AUGMENTATION_SPACE = {  # op name -> (fn(num_bins, height, width) -> magnitude table or None, signed flag)
+        "Identity": (lambda num_bins, height, width: None, False),  # None: no magnitude table for this op
+        "ShearX": (lambda num_bins, height, width: torch.linspace(0.0, 0.3, num_bins), True),
+        "ShearY": (lambda num_bins, height, width: torch.linspace(0.0, 0.3, num_bins), True),
+        "TranslateX": (  # range scales with the image width
+            lambda num_bins, height, width: torch.linspace(0.0, 150.0 / 331.0 * width, num_bins),
+            True,
+        ),
+        "TranslateY": (  # range scales with the image height
+            lambda num_bins, height, width: torch.linspace(0.0, 150.0 / 331.0 * height, num_bins),
+            True,
+        ),
+        "Rotate": (lambda num_bins, height, width: torch.linspace(0.0, 30.0, num_bins), True),
+        "Brightness": (lambda num_bins, height, width: torch.linspace(0.0, 0.9, num_bins), True),
+        "Color": (lambda num_bins, height, width: torch.linspace(0.0, 0.9, num_bins), True),
+        "Contrast": (lambda num_bins, height, width: torch.linspace(0.0, 0.9, num_bins), True),
+        "Sharpness": (lambda num_bins, height, width: torch.linspace(0.0, 0.9, num_bins), True),
+        "Posterize": (  # integer values falling from 8 (first bin) to 4 (last bin)
+            lambda num_bins, height, width: (8 - (torch.arange(num_bins) / ((num_bins - 1) / 4))).round().int(),
+            False,
+        ),
+        "Solarize": (lambda num_bins, height, width: torch.linspace(1.0, 0.0, num_bins), False),  # decreasing from 1.0 to 0.0
+        "AutoContrast": (lambda num_bins, height, width: None, False),
+        "Equalize": (lambda num_bins, height, width: None, False),
+    }
+
+    def __init__(
+        self,
+        num_ops: int = 2,
+        magnitude: int = 9,
+        num_magnitude_bins: int = 31,
+        interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST,
+        fill: Union[_FillType, dict[Union[type, str], _FillType]] = None,
+    ) -> None:
+        super().__init__(interpolation=interpolation, fill=fill)
+        if not isinstance(num_ops, int) or (num_ops < 0):
+            raise ValueError(f"num_ops should be a non-negative integer, but got {num_ops} instead.")
+        self.num_ops = num_ops
+        self.magnitude = magnitude  # not range-checked here; forward() uses it to index the magnitude table
+        self.num_magnitude_bins = num_magnitude_bins
+
+    def forward(self, *inputs: Any) -> Any:
+        flat_inputs_with_spec, image_or_video = self._flatten_and_extract_image_or_video(inputs)
+        height, width = get_size(image_or_video)  # type: ignore[arg-type]
+
+        for _ in range(self.num_ops):  # ops are applied one after another on the running result
+            transform_id, (magnitudes_fn, signed) = self._get_random_item(self._AUGMENTATION_SPACE)  # helper is not defined in this class
+            magnitudes = magnitudes_fn(self.num_magnitude_bins, height, width)
+            if magnitudes is not None:
+                magnitude = float(magnitudes[self.magnitude])  # the same bin index is used for every sampled op
+                if signed and torch.rand(()) <= 0.5:  # signed ops get a randomly negated magnitude
+                    magnitude *= -1
+            else:
+                magnitude = 0.0  # placeholder passed for ops that have no magnitude table
+            image_or_video = self._apply_image_or_video_transform(
+                image_or_video, transform_id, magnitude, interpolation=self.interpolation, fill=self._fill
+            )
+
+        return self._unflatten_and_insert_image_or_video(flat_inputs_with_spec, image_or_video)
+
+
+class TrivialAugmentWide(_AutoAugmentBase):
+    r"""Dataset-independent data-augmentation with TrivialAugment Wide, as described in
+    `"TrivialAugment: Tuning-free Yet State-of-the-Art Data Augmentation" <https://arxiv.org/abs/2103.10158>`_.
+
+    This transformation works on images and videos only.
+
+    If the input is :class:`torch.Tensor`, it should be of type ``torch.uint8``, and it is expected
+    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
+    If img is PIL Image, it is expected to be in mode "L" or "RGB".
+
+    Args:
+        num_magnitude_bins (int, optional): The number of different magnitude values. Default: 31.
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
+        fill (sequence or number, optional): Pixel fill value for the area outside the transformed
+            image. If given a number, the value is used for all bands respectively.
+    """
+
+    _v1_transform_cls = _transforms.TrivialAugmentWide
+    _AUGMENTATION_SPACE = {  # op name -> (fn(num_bins, height, width) -> magnitude table or None, signed flag)
+        "Identity": (lambda num_bins, height, width: None, False),  # None: no magnitude table for this op
+        "ShearX": (lambda num_bins, height, width: torch.linspace(0.0, 0.99, num_bins), True),
+        "ShearY": (lambda num_bins, height, width: torch.linspace(0.0, 0.99, num_bins), True),
+        "TranslateX": (lambda num_bins, height, width: torch.linspace(0.0, 32.0, num_bins), True),  # fixed range, ignores width
+        "TranslateY": (lambda num_bins, height, width: torch.linspace(0.0, 32.0, num_bins), True),  # fixed range, ignores height
+        "Rotate": (lambda num_bins, height, width: torch.linspace(0.0, 135.0, num_bins), True),
+        "Brightness": (lambda num_bins, height, width: torch.linspace(0.0, 0.99, num_bins), True),
+        "Color": (lambda num_bins, height, width: torch.linspace(0.0, 0.99, num_bins), True),
+        "Contrast": (lambda num_bins, height, width: torch.linspace(0.0, 0.99, num_bins), True),
+        "Sharpness": (lambda num_bins, height, width: torch.linspace(0.0, 0.99, num_bins), True),
+        "Posterize": (  # integer values falling from 8 (first bin) to 2 (last bin)
+            lambda num_bins, height, width: (8 - (torch.arange(num_bins) / ((num_bins - 1) / 6))).round().int(),
+            False,
+        ),
+        "Solarize": (lambda num_bins, height, width: torch.linspace(1.0, 0.0, num_bins), False),  # decreasing from 1.0 to 0.0
+        "AutoContrast": (lambda num_bins, height, width: None, False),
+        "Equalize": (lambda num_bins, height, width: None, False),
+    }
+
+    def __init__(
+        self,
+        num_magnitude_bins: int = 31,
+        interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST,
+        fill: Union[_FillType, dict[Union[type, str], _FillType]] = None,
+    ):
+        super().__init__(interpolation=interpolation, fill=fill)
+        self.num_magnitude_bins = num_magnitude_bins
+
+    def forward(self, *inputs: Any) -> Any:
+        flat_inputs_with_spec, image_or_video = self._flatten_and_extract_image_or_video(inputs)
+        height, width = get_size(image_or_video)  # type: ignore[arg-type]
+
+        transform_id, (magnitudes_fn, signed) = self._get_random_item(self._AUGMENTATION_SPACE)  # exactly one op per call
+
+        magnitudes = magnitudes_fn(self.num_magnitude_bins, height, width)
+        if magnitudes is not None:
+            magnitude = float(magnitudes[int(torch.randint(self.num_magnitude_bins, ()))])  # bin index drawn at random per call
+            if signed and torch.rand(()) <= 0.5:  # signed ops get a randomly negated magnitude
+                magnitude *= -1
+        else:
+            magnitude = 0.0  # placeholder passed for ops that have no magnitude table
+
+        image_or_video = self._apply_image_or_video_transform(
+            image_or_video, transform_id, magnitude, interpolation=self.interpolation, fill=self._fill
+        )
+        return self._unflatten_and_insert_image_or_video(flat_inputs_with_spec, image_or_video)
+
+
+class AugMix(_AutoAugmentBase):
+    r"""AugMix data augmentation method based on
+    `"AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty" <https://arxiv.org/abs/1912.02781>`_.
+
+    This transformation works on images and videos only.
+
+    If the input is :class:`torch.Tensor`, it should be of type ``torch.uint8``, and it is expected
+    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
+    If img is PIL Image, it is expected to be in mode "L" or "RGB".
+
+    Args:
+        severity (int, optional): The severity of base augmentation operators, must be in [1, 10]. Default is ``3``.
+        mixture_width (int, optional): The number of augmentation chains. Default is ``3``.
+        chain_depth (int, optional): The depth of augmentation chains. A non-positive value denotes stochastic depth sampled from {1, 2, 3}.
+            Default is ``-1``.
+        alpha (float, optional): The hyperparameter for the probability distributions. Default is ``1.0``.
+        all_ops (bool, optional): Use all operations (including brightness, contrast, color and sharpness). Default is ``True``.
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
+        fill (sequence or number, optional): Pixel fill value for the area outside the transformed
+            image. If given a number, the value is used for all bands respectively.
+    """
+
+    _v1_transform_cls = _transforms.AugMix
+
+    _PARTIAL_AUGMENTATION_SPACE = {  # ops used when all_ops is False; op name -> (magnitude table fn, signed flag)
+        "ShearX": (lambda num_bins, height, width: torch.linspace(0.0, 0.3, num_bins), True),
+        "ShearY": (lambda num_bins, height, width: torch.linspace(0.0, 0.3, num_bins), True),
+        "TranslateX": (lambda num_bins, height, width: torch.linspace(0.0, width / 3.0, num_bins), True),
+        "TranslateY": (lambda num_bins, height, width: torch.linspace(0.0, height / 3.0, num_bins), True),
+        "Rotate": (lambda num_bins, height, width: torch.linspace(0.0, 30.0, num_bins), True),
+        "Posterize": (  # integer values falling from 4 (first bin) to 0 (last bin)
+            lambda num_bins, height, width: (4 - (torch.arange(num_bins) / ((num_bins - 1) / 4))).round().int(),
+            False,
+        ),
+        "Solarize": (lambda num_bins, height, width: torch.linspace(1.0, 0.0, num_bins), False),  # decreasing from 1.0 to 0.0
+        "AutoContrast": (lambda num_bins, height, width: None, False),  # None: no magnitude table for this op
+        "Equalize": (lambda num_bins, height, width: None, False),
+    }
+    _AUGMENTATION_SPACE: dict[str, tuple[Callable[[int, int, int], Optional[torch.Tensor]], bool]] = {  # partial space plus the four ops below
+        **_PARTIAL_AUGMENTATION_SPACE,
+        "Brightness": (lambda num_bins, height, width: torch.linspace(0.0, 0.9, num_bins), True),
+        "Color": (lambda num_bins, height, width: torch.linspace(0.0, 0.9, num_bins), True),
+        "Contrast": (lambda num_bins, height, width: torch.linspace(0.0, 0.9, num_bins), True),
+        "Sharpness": (lambda num_bins, height, width: torch.linspace(0.0, 0.9, num_bins), True),
+    }
+
+    def __init__(
+        self,
+        severity: int = 3,
+        mixture_width: int = 3,
+        chain_depth: int = -1,
+        alpha: float = 1.0,
+        all_ops: bool = True,
+        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+        fill: Union[_FillType, dict[Union[type, str], _FillType]] = None,
+    ) -> None:
+        super().__init__(interpolation=interpolation, fill=fill)
+        self._PARAMETER_MAX = 10  # upper bound for severity and the number of magnitude bins built in forward()
+        if not (1 <= severity <= self._PARAMETER_MAX):
+            raise ValueError(f"The severity must be between [1, {self._PARAMETER_MAX}]. Got {severity} instead.")
+        self.severity = severity
+        self.mixture_width = mixture_width
+        self.chain_depth = chain_depth
+        self.alpha = alpha
+        self.all_ops = all_ops
+
+    def _sample_dirichlet(self, params: torch.Tensor) -> torch.Tensor:
+        # Must be on a separate method so that we can overwrite it in tests.
+        return torch._sample_dirichlet(params)
+
+    def forward(self, *inputs: Any) -> Any:
+        flat_inputs_with_spec, orig_image_or_video = self._flatten_and_extract_image_or_video(inputs)
+        height, width = get_size(orig_image_or_video)  # type: ignore[arg-type]
+
+        if isinstance(orig_image_or_video, torch.Tensor):
+            image_or_video = orig_image_or_video
+        else:  # isinstance(inpt, PIL.Image.Image):
+            image_or_video = F.pil_to_tensor(orig_image_or_video)  # mixing below is done on tensors
+
+        augmentation_space = self._AUGMENTATION_SPACE if self.all_ops else self._PARTIAL_AUGMENTATION_SPACE
+
+        orig_dims = list(image_or_video.shape)
+        expected_ndim = 5 if isinstance(orig_image_or_video, tv_tensors.Video) else 4  # videos are handled as 5-D, everything else as 4-D
+        batch = image_or_video.reshape([1] * max(expected_ndim - image_or_video.ndim, 0) + orig_dims)  # left-pad with size-1 dims up to expected_ndim
+        batch_dims = [batch.size(0)] + [1] * (batch.ndim - 1)  # shape that broadcasts one weight per leading-dim entry
+
+        # Sample the beta weights for combining the original and augmented image or video. To get Beta, we use a
+        # Dirichlet with 2 parameters. The 1st column stores the weights of the original and the 2nd the ones of
+        # augmented image or video.
+        m = self._sample_dirichlet(
+            torch.tensor([self.alpha, self.alpha], device=batch.device).expand(batch_dims[0], -1)
+        )
+
+        # Sample the mixing weights and combine them with the ones sampled from Beta for the augmented images or videos.
+        combined_weights = self._sample_dirichlet(
+            torch.tensor([self.alpha] * self.mixture_width, device=batch.device).expand(batch_dims[0], -1)
+        ) * m[:, 1].reshape([batch_dims[0], -1])
+
+        mix = m[:, 0].reshape(batch_dims) * batch  # start from the weighted original
+        for i in range(self.mixture_width):
+            aug = batch  # every chain starts again from the unaugmented batch
+            depth = self.chain_depth if self.chain_depth > 0 else int(torch.randint(low=1, high=4, size=(1,)).item())  # non-positive -> random depth in {1, 2, 3}
+            for _ in range(depth):
+                transform_id, (magnitudes_fn, signed) = self._get_random_item(augmentation_space)
+
+                magnitudes = magnitudes_fn(self._PARAMETER_MAX, height, width)
+                if magnitudes is not None:
+                    magnitude = float(magnitudes[int(torch.randint(self.severity, ()))])  # random bin index in [0, severity)
+                    if signed and torch.rand(()) <= 0.5:  # signed ops get a randomly negated magnitude
+                        magnitude *= -1
+                else:
+                    magnitude = 0.0  # placeholder passed for ops that have no magnitude table
+
+                aug = self._apply_image_or_video_transform(aug, transform_id, magnitude, interpolation=self.interpolation, fill=self._fill)  # type: ignore[assignment]
+            mix.add_(combined_weights[:, i].reshape(batch_dims) * aug)  # accumulate this chain's weighted result in place
+        mix = mix.reshape(orig_dims).to(dtype=image_or_video.dtype)  # restore the input's shape and dtype
+
+        if isinstance(orig_image_or_video, (tv_tensors.Image, tv_tensors.Video)):  # hand back the same kind of object that came in
+            mix = tv_tensors.wrap(mix, like=orig_image_or_video)
+        elif isinstance(orig_image_or_video, PIL.Image.Image):
+            mix = F.to_pil_image(mix)
+
+        return self._unflatten_and_insert_image_or_video(flat_inputs_with_spec, mix)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_color.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_color.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf4ae55d23222073704fc3473e589f07c7254c43
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_color.py
@@ -0,0 +1,377 @@
+import collections.abc
+from collections.abc import Sequence
+from typing import Any, Optional, Union
+
+import torch
+from torchvision import transforms as _transforms
+from torchvision.transforms.v2 import functional as F, Transform
+
+from ._transform import _RandomApplyTransform
+from ._utils import query_chw
+
+
+class Grayscale(Transform):
+    """Convert images or videos to grayscale.
+
+    If the input is a :class:`torch.Tensor`, it is expected
+    to have [..., 3 or 1, H, W] shape, where ... means an arbitrary number of leading dimensions.
+
+    Args:
+        num_output_channels (int): number of channels desired for the output image (1 or 3). Default is 1.
+    """
+
+    _v1_transform_cls = _transforms.Grayscale
+
+    def __init__(self, num_output_channels: int = 1):
+        super().__init__()
+        self.num_output_channels = num_output_channels
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(F.rgb_to_grayscale, inpt, num_output_channels=self.num_output_channels)
+
+
+class RandomGrayscale(_RandomApplyTransform):
+    """Randomly convert images or videos to grayscale with a probability of p (default 0.1).
+
+    If the input is a :class:`torch.Tensor`, it is expected to have [..., 3 or 1, H, W] shape,
+    where ... means an arbitrary number of leading dimensions.
+
+    The output has the same number of channels as the input.
+
+    Args:
+        p (float): probability that the image should be converted to grayscale. Default value is 0.1
+    """
+
+    _v1_transform_cls = _transforms.RandomGrayscale
+
+    def __init__(self, p: float = 0.1) -> None:
+        super().__init__(p=p)
+
+    def make_params(self, flat_inputs: list[Any]) -> dict[str, Any]:
+        num_input_channels, *_ = query_chw(flat_inputs)
+        return dict(num_input_channels=num_input_channels)
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(F.rgb_to_grayscale, inpt, num_output_channels=params["num_input_channels"])
+
+
+class RGB(Transform):
+    """Convert images or videos to RGB (if they are not already RGB).
+
+    If the input is a :class:`torch.Tensor`, it is expected
+    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(F.grayscale_to_rgb, inpt)
+
+
+class ColorJitter(Transform):
+    """Randomly change the brightness, contrast, saturation and hue of an image or video.
+
+    If the input is a :class:`torch.Tensor`, it is expected
+    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
+    If the input is a PIL Image, mode "1", "I", "F" and modes with transparency (alpha channel) are not supported.
+
+    Args:
+        brightness (float or tuple of float (min, max)): How much to jitter brightness.
+            brightness_factor is chosen uniformly from [max(0, 1 - brightness), 1 + brightness]
+            or the given [min, max]. Should be non-negative numbers.
+        contrast (float or tuple of float (min, max)): How much to jitter contrast.
+            contrast_factor is chosen uniformly from [max(0, 1 - contrast), 1 + contrast]
+            or the given [min, max]. Should be non-negative numbers.
+        saturation (float or tuple of float (min, max)): How much to jitter saturation.
+            saturation_factor is chosen uniformly from [max(0, 1 - saturation), 1 + saturation]
+            or the given [min, max]. Should be non-negative numbers.
+        hue (float or tuple of float (min, max)): How much to jitter hue.
+            hue_factor is chosen uniformly from [-hue, hue] or the given [min, max].
+            Should have 0 <= hue <= 0.5 or -0.5 <= min <= max <= 0.5.
+            To jitter hue, the pixel values of the input image have to be non-negative for conversion to HSV space;
+            thus it does not work if you normalize your image to an interval with negative values,
+            or use an interpolation that generates negative values before using this function.
+    """
+
+    _v1_transform_cls = _transforms.ColorJitter
+
+    def _extract_params_for_v1_transform(self) -> dict[str, Any]:
+        return {key: setting or 0 for key, setting in super()._extract_params_for_v1_transform().items()}
+
+    def __init__(
+        self,
+        brightness: Optional[Union[float, Sequence[float]]] = None,
+        contrast: Optional[Union[float, Sequence[float]]] = None,
+        saturation: Optional[Union[float, Sequence[float]]] = None,
+        hue: Optional[Union[float, Sequence[float]]] = None,
+    ) -> None:
+        super().__init__()
+        self.brightness = self._check_input(brightness, "brightness")
+        self.contrast = self._check_input(contrast, "contrast")
+        self.saturation = self._check_input(saturation, "saturation")
+        self.hue = self._check_input(hue, "hue", center=0, bound=(-0.5, 0.5), clip_first_on_zero=False)
+
+    def _check_input(
+        self,
+        value: Optional[Union[float, Sequence[float]]],
+        name: str,
+        center: float = 1.0,
+        bound: tuple[float, float] = (0, float("inf")),
+        clip_first_on_zero: bool = True,
+    ) -> Optional[tuple[float, float]]:
+        """Validate ``value`` and return it as a ``(min, max)`` tuple, or ``None`` if unset or equal to ``center``."""
+        if value is None:
+            return None
+
+        if isinstance(value, (int, float)):
+            if value < 0:
+                raise ValueError(f"If {name} is a single number, it must be non negative.")
+            low, high = center - value, center + value
+            if clip_first_on_zero:
+                low = max(low, 0.0)
+        elif isinstance(value, collections.abc.Sequence) and len(value) == 2:
+            low, high = (float(v) for v in value)
+        else:
+            raise TypeError(f"{name}={value} should be a single number or a sequence with length 2.")
+        if not bound[0] <= low <= high <= bound[1]:
+            raise ValueError(f"{name} values should be between {bound} and increasing, but got {[low, high]}.")
+        # An interval collapsed onto ``center`` is stored as ``None``; ``transform`` skips ``None`` factors.
+        return None if low == high == center else (float(low), float(high))
+
+    @staticmethod
+    def _generate_value(left: float, right: float) -> float:
+        return torch.empty(1).uniform_(left, right).item()
+
+    def make_params(self, flat_inputs: list[Any]) -> dict[str, Any]:
+        # The draw order matters for reproducibility under a fixed seed: the permutation
+        # comes first, followed by brightness, contrast, saturation and hue.
+        params: dict[str, Any] = {"fn_idx": torch.randperm(4)}
+        for name in ("brightness", "contrast", "saturation", "hue"):
+            bounds = getattr(self, name)
+            params[f"{name}_factor"] = None if bounds is None else self._generate_value(bounds[0], bounds[1])
+
+        return params
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        # Kernel, keyword name and sampled factor for each id that can appear in ``fn_idx``.
+        steps = {
+            0: (F.adjust_brightness, "brightness_factor", params["brightness_factor"]),
+            1: (F.adjust_contrast, "contrast_factor", params["contrast_factor"]),
+            2: (F.adjust_saturation, "saturation_factor", params["saturation_factor"]),
+            3: (F.adjust_hue, "hue_factor", params["hue_factor"]),
+        }
+        result = inpt
+        for fn_id in params["fn_idx"]:
+            step = steps.get(int(fn_id))
+            if step is None or step[2] is None:
+                continue
+            kernel, kwarg_name, factor = step
+            result = self._call_kernel(kernel, result, **{kwarg_name: factor})
+        return result
+
+
+class RandomChannelPermutation(Transform):
+    """Randomly permute the channels of an image or video."""
+
+    def make_params(self, flat_inputs: list[Any]) -> dict[str, Any]:
+        channel_count = query_chw(flat_inputs)[0]
+        return {"permutation": torch.randperm(channel_count)}
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(F.permute_channels, inpt, params["permutation"])
+
+
+class RandomPhotometricDistort(Transform):
+    """Randomly distorts the image or video as used in `SSD: Single Shot
+    MultiBox Detector <https://arxiv.org/abs/1512.02325>`_.
+
+    This transform relies on :class:`~torchvision.transforms.v2.ColorJitter`
+    under the hood to adjust the contrast, saturation, hue, brightness, and also
+    randomly permutes channels.
+
+    Args:
+        brightness (tuple of float (min, max), optional): How much to jitter brightness.
+            brightness_factor is chosen uniformly from [min, max]. Should be non-negative numbers.
+        contrast (tuple of float (min, max), optional): How much to jitter contrast.
+            contrast_factor is chosen uniformly from [min, max]. Should be non-negative numbers.
+        saturation (tuple of float (min, max), optional): How much to jitter saturation.
+            saturation_factor is chosen uniformly from [min, max]. Should be non-negative numbers.
+        hue (tuple of float (min, max), optional): How much to jitter hue.
+            hue_factor is chosen uniformly from [min, max]. Should have -0.5 <= min <= max <= 0.5.
+            To jitter hue, the pixel values of the input image have to be non-negative for conversion to HSV space;
+            thus it does not work if you normalize your image to an interval with negative values,
+            or use an interpolation that generates negative values before using this function.
+        p (float, optional): probability of each distortion operation (contrast, saturation, ...) being applied.
+            Default is 0.5.
+    """
+
+    def __init__(
+        self,
+        brightness: tuple[float, float] = (0.875, 1.125),
+        contrast: tuple[float, float] = (0.5, 1.5),
+        saturation: tuple[float, float] = (0.5, 1.5),
+        hue: tuple[float, float] = (-0.05, 0.05),
+        p: float = 0.5,
+    ):
+        super().__init__()
+        self.brightness = brightness
+        self.contrast = contrast
+        self.hue = hue
+        self.saturation = saturation
+        self.p = p
+
+    def make_params(self, flat_inputs: list[Any]) -> dict[str, Any]:
+        num_channels, *_ = query_chw(flat_inputs)
+        params: dict[str, Any] = {
+            key: ColorJitter._generate_value(bounds[0], bounds[1]) if torch.rand(1) < self.p else None
+            for key, bounds in [
+                ("brightness_factor", self.brightness),
+                ("contrast_factor", self.contrast),
+                ("saturation_factor", self.saturation),
+                ("hue_factor", self.hue),
+            ]
+        }
+        params["contrast_before"] = bool(torch.rand(()) < 0.5)  # contrast goes before saturation/hue when True
+        params["channel_permutation"] = torch.randperm(num_channels) if torch.rand(1) < self.p else None
+        return params
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        if params["brightness_factor"] is not None:
+            inpt = self._call_kernel(F.adjust_brightness, inpt, brightness_factor=params["brightness_factor"])
+        if params["contrast_factor"] is not None and params["contrast_before"]:
+            inpt = self._call_kernel(F.adjust_contrast, inpt, contrast_factor=params["contrast_factor"])
+        if params["saturation_factor"] is not None:
+            inpt = self._call_kernel(F.adjust_saturation, inpt, saturation_factor=params["saturation_factor"])
+        if params["hue_factor"] is not None:
+            inpt = self._call_kernel(F.adjust_hue, inpt, hue_factor=params["hue_factor"])
+        if params["contrast_factor"] is not None and not params["contrast_before"]:
+            inpt = self._call_kernel(F.adjust_contrast, inpt, contrast_factor=params["contrast_factor"])
+        if params["channel_permutation"] is not None:
+            inpt = self._call_kernel(F.permute_channels, inpt, permutation=params["channel_permutation"])
+        return inpt
+
+
+class RandomEqualize(_RandomApplyTransform):
+    """Equalize the histogram of the given image or video with a given probability.
+
+    If the input is a :class:`torch.Tensor`, it is expected
+    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
+    If the input is a PIL Image, it is expected to be in mode "P", "L" or "RGB".
+
+    Args:
+        p (float): probability of the image being equalized. Default value is 0.5
+    """
+
+    _v1_transform_cls = _transforms.RandomEqualize
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(F.equalize, inpt)
+
+
+class RandomInvert(_RandomApplyTransform):
+    """Invert the colors of the given image or video with a given probability.
+
+    If the input is a :class:`torch.Tensor`, it is expected to be in [..., 1 or 3, H, W] format,
+    where ... means it can have an arbitrary number of leading dimensions.
+    If the input is a PIL Image, it is expected to be in mode "L" or "RGB".
+
+    Args:
+        p (float): probability of the image being color inverted. Default value is 0.5
+    """
+
+    _v1_transform_cls = _transforms.RandomInvert
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(F.invert, inpt)
+
+
+class RandomPosterize(_RandomApplyTransform):
+    """Posterize the image or video with a given probability by reducing the
+    number of bits for each color channel.
+
+    If the input is a :class:`torch.Tensor`, it should be of type torch.uint8,
+    and it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
+    If the input is a PIL Image, it is expected to be in mode "L" or "RGB".
+
+    Args:
+        bits (int): number of bits to keep for each channel (0-8)
+        p (float): probability of the image being posterized. Default value is 0.5
+    """
+
+    _v1_transform_cls = _transforms.RandomPosterize
+
+    def __init__(self, bits: int, p: float = 0.5) -> None:
+        super().__init__(p=p)
+        self.bits = bits
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(F.posterize, inpt, bits=self.bits)
+
+
+class RandomSolarize(_RandomApplyTransform):
+    """Solarize the image or video with a given probability by inverting all pixel
+    values above a threshold.
+
+    If the input is a :class:`torch.Tensor`, it is expected to be in [..., 1 or 3, H, W] format,
+    where ... means it can have an arbitrary number of leading dimensions.
+    If the input is a PIL Image, it is expected to be in mode "L" or "RGB".
+
+    Args:
+        threshold (float): all pixels equal to or above this value are inverted.
+        p (float): probability of the image being solarized. Default value is 0.5
+    """
+
+    _v1_transform_cls = _transforms.RandomSolarize
+
+    def _extract_params_for_v1_transform(self) -> dict[str, Any]:
+        params = super()._extract_params_for_v1_transform()
+        params["threshold"] = float(params["threshold"])
+        return params
+
+    def __init__(self, threshold: float, p: float = 0.5) -> None:
+        super().__init__(p=p)
+        self.threshold = threshold
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(F.solarize, inpt, threshold=self.threshold)
+
+
+class RandomAutocontrast(_RandomApplyTransform):
+    """Autocontrast the pixels of the given image or video with a given probability.
+
+    If the input is a :class:`torch.Tensor`, it is expected
+    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
+    If the input is a PIL Image, it is expected to be in mode "L" or "RGB".
+
+    Args:
+        p (float): probability of the image being autocontrasted. Default value is 0.5
+    """
+
+    _v1_transform_cls = _transforms.RandomAutocontrast
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(F.autocontrast, inpt)
+
+
+class RandomAdjustSharpness(_RandomApplyTransform):
+    """Adjust the sharpness of the image or video with a given probability.
+
+    If the input is a :class:`torch.Tensor`,
+    it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
+
+    Args:
+        sharpness_factor (float): How much to adjust the sharpness. Can be
+            any non-negative number. 0 gives a blurred image, 1 gives the
+            original image while 2 increases the sharpness by a factor of 2.
+        p (float): probability of the image being sharpened. Default value is 0.5
+    """
+
+    _v1_transform_cls = _transforms.RandomAdjustSharpness
+
+    def __init__(self, sharpness_factor: float, p: float = 0.5) -> None:
+        super().__init__(p=p)
+        self.sharpness_factor = sharpness_factor
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(F.adjust_sharpness, inpt, sharpness_factor=self.sharpness_factor)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_container.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_container.py
new file mode 100644
index 0000000000000000000000000000000000000000..95ec25a22f84501ac6ca7520f70db63b5a0f5084
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_container.py
@@ -0,0 +1,180 @@
+from collections.abc import Sequence
+from typing import Any, Callable, Optional, Union
+
+import torch
+
+from torch import nn
+from torchvision import transforms as _transforms
+from torchvision.transforms.v2 import Transform
+
+
+class Compose(Transform):
+    """Chain several transforms, feeding the output of each one into the next.
+
+    This transform does not support torchscript.
+    Please, see the note below.
+
+    Args:
+        transforms (list of ``Transform`` objects): list of transforms to compose.
+
+    Example:
+        >>> transforms.Compose([
+        >>>     transforms.CenterCrop(10),
+        >>>     transforms.PILToTensor(),
+        >>>     transforms.ConvertImageDtype(torch.float),
+        >>> ])
+
+    .. note::
+        In order to script the transformations, please use ``torch.nn.Sequential`` as below.
+
+        >>> transforms = torch.nn.Sequential(
+        >>>     transforms.CenterCrop(10),
+        >>>     transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+        >>> )
+        >>> scripted_transforms = torch.jit.script(transforms)
+
+        Make sure to use only scriptable transformations, i.e. ones that work with ``torch.Tensor`` and do not
+        require `lambda` functions or ``PIL.Image``.
+
+    """
+
+    def __init__(self, transforms: Sequence[Callable]) -> None:
+        super().__init__()
+        if not isinstance(transforms, Sequence):
+            raise TypeError("Argument transforms should be a sequence of callables")
+        if not transforms:
+            raise ValueError("Pass at least one transform")
+        self.transforms = transforms
+
+    def forward(self, *inputs: Any) -> Any:
+        multiple_inputs = len(inputs) > 1
+        for step in self.transforms:
+            outputs = step(*inputs)
+            # A single input is re-wrapped in a tuple so the next call can star-unpack it.
+            inputs = outputs if multiple_inputs else (outputs,)
+        return outputs
+
+    def extra_repr(self) -> str:
+        # One line per wrapped transform, each indented by four spaces.
+        lines = [f"    {t}" for t in self.transforms]
+        return "\n".join(lines)
+
+
+class RandomApply(Transform):
+    """Apply a list of transformations, as a whole, with a given probability.
+
+    .. note::
+        In order to script the transformation, please use ``torch.nn.ModuleList`` as input instead of list/tuple of
+        transforms as shown below:
+
+        >>> transforms = transforms.RandomApply(torch.nn.ModuleList([
+        >>>     transforms.ColorJitter(),
+        >>> ]), p=0.3)
+        >>> scripted_transforms = torch.jit.script(transforms)
+
+        Make sure to use only scriptable transformations, i.e. ones that work with ``torch.Tensor`` and do not
+        require `lambda` functions or ``PIL.Image``.
+
+    Args:
+        transforms (sequence or torch.nn.ModuleList): list of transformations
+        p (float): probability of applying the list of transforms. Must lie in [0.0, 1.0]. Default is 0.5
+    """
+
+    _v1_transform_cls = _transforms.RandomApply
+
+    def __init__(self, transforms: Union[Sequence[Callable], nn.ModuleList], p: float = 0.5) -> None:
+        super().__init__()
+
+        if not isinstance(transforms, (Sequence, nn.ModuleList)):
+            raise TypeError("Argument transforms should be a sequence of callables or a `nn.ModuleList`")
+        if not transforms:
+            raise ValueError("Pass at least one transform")
+        self.transforms = transforms
+
+        if not (0.0 <= p <= 1.0):
+            raise ValueError("`p` should be a floating point value in the interval [0.0, 1.0].")
+        self.p = p
+
+    def _extract_params_for_v1_transform(self) -> dict[str, Any]:
+        return dict(transforms=self.transforms, p=self.p)
+
+    def forward(self, *inputs: Any) -> Any:
+        multiple_inputs = len(inputs) > 1
+
+        skip = torch.rand(1) >= self.p
+        if skip:
+            return inputs if multiple_inputs else inputs[0]
+        for step in self.transforms:
+            outputs = step(*inputs)
+            inputs = outputs if multiple_inputs else (outputs,)
+        return outputs
+
+    def extra_repr(self) -> str:
+        # One line per wrapped transform,
+        # each indented by four spaces.
+        lines = [f"    {t}" for t in self.transforms]
+        return "\n".join(lines)
+
+
+class RandomChoice(Transform):
+    """Apply a single transformation randomly picked from a sequence.
+
+    This transform does not support torchscript.
+
+    Args:
+        transforms (sequence): non-empty sequence of callables to pick from
+        p (list of floats or None, optional): probability of each transform being picked.
+            If ``p`` doesn't sum to 1, it is automatically normalized. If ``None``
+            (default), all transforms have the same probability.
+    """
+
+    def __init__(
+        self,
+        transforms: Sequence[Callable],
+        p: Optional[list[float]] = None,
+    ) -> None:
+        if not isinstance(transforms, Sequence):
+            raise TypeError("Argument transforms should be a sequence of callables")
+        if not transforms:
+            raise ValueError("Pass at least one transform")
+        if p is None:
+            p = [1] * len(transforms)
+        elif len(p) != len(transforms):
+            raise ValueError(f"Length of p doesn't match the number of transforms: {len(p)} != {len(transforms)}")
+
+        super().__init__()
+
+        self.transforms = transforms
+        weight_sum = sum(p)
+        self.p = [weight / weight_sum for weight in p]
+
+    def forward(self, *inputs: Any) -> Any:
+        probabilities = torch.tensor(self.p)
+        picked = int(torch.multinomial(probabilities, 1))
+        return self.transforms[picked](*inputs)
+
+
+class RandomOrder(Transform):
+    """Apply every transformation of a sequence once, in a random order.
+
+    This transform does not support torchscript.
+
+    Args:
+        transforms (sequence): non-empty sequence of callables to apply
+    """
+
+    def __init__(self, transforms: Sequence[Callable]) -> None:
+        if not isinstance(transforms, Sequence):
+            raise TypeError("Argument transforms should be a sequence of callables")
+        if not transforms:
+            raise ValueError("Pass at least one transform")
+        super().__init__()
+        self.transforms = transforms
+
+    def forward(self, *inputs: Any) -> Any:
+        multiple_inputs = len(inputs) > 1
+        order = torch.randperm(len(self.transforms))
+        for idx in order:
+            outputs = self.transforms[idx](*inputs)
+            inputs = outputs if multiple_inputs else (outputs,)
+        return outputs
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_deprecated.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_deprecated.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e7d6170d4f4c48678027e75cbb246cf250b587a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_deprecated.py
@@ -0,0 +1,50 @@
+import warnings
+from typing import Any, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from torchvision.transforms import functional as _F
+
+from torchvision.transforms.v2 import Transform
+
+
+class ToTensor(Transform):
+    """[DEPRECATED] Use ``v2.Compose([v2.ToImage(), v2.ToDtype(torch.float32, scale=True)])`` instead.
+
+    Convert a PIL Image or ndarray to tensor and scale the values accordingly.
+
+    .. warning::
+        :class:`v2.ToTensor` is deprecated and will be removed in a future release.
+        Please use instead ``v2.Compose([v2.ToImage(), v2.ToDtype(torch.float32, scale=True)])``.
+        Output is equivalent up to float precision.
+
+    This transform does not support torchscript.
+
+
+    Converts a PIL Image or numpy.ndarray (H x W x C) in the range
+    [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0]
+    if the PIL Image belongs to one of the modes (L, LA, P, I, F, RGB, YCbCr, RGBA, CMYK, 1)
+    or if the numpy.ndarray has dtype = np.uint8
+
+    In the other cases, tensors are returned without scaling.
+
+    .. note::
+        Because the input image is scaled to [0.0, 1.0], this transformation should not be used when
+        transforming target image masks. See the `references`_ for implementing the transforms for image masks.
+
+    .. _references: https://github.com/pytorch/vision/tree/main/references/segmentation
+    """
+
+    _transformed_types = (PIL.Image.Image, np.ndarray)
+
+    def __init__(self) -> None:
+        warnings.warn(
+            "The transform `ToTensor()` is deprecated and will be removed in a future release. "
+            "Instead, please use `v2.Compose([v2.ToImage(), v2.ToDtype(torch.float32, scale=True)])`. "
+            "Output is equivalent up to float precision."
+        )
+        super().__init__()
+
+    def transform(self, inpt: Union[PIL.Image.Image, np.ndarray], params: dict[str, Any]) -> torch.Tensor:
+        return _F.to_tensor(inpt)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_geometry.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_geometry.py
new file mode 100644
index 0000000000000000000000000000000000000000..1418a6b4953fc27745e68b17d9ede29ba17c57f6
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_geometry.py
@@ -0,0 +1,1417 @@
+import math
+import numbers
+import warnings
+from collections.abc import Sequence
+from typing import Any, Callable, Literal, Optional, Union
+
+import PIL.Image
+import torch
+
+from torchvision import transforms as _transforms, tv_tensors
+from torchvision.ops.boxes import box_iou
+from torchvision.transforms.functional import _get_perspective_coeffs
+from torchvision.transforms.v2 import functional as F, InterpolationMode, Transform
+from torchvision.transforms.v2.functional._utils import _FillType
+
+from ._transform import _RandomApplyTransform
+from ._utils import (
+    _check_padding_arg,
+    _check_padding_mode_arg,
+    _check_sequence_input,
+    _get_fill,
+    _setup_angle,
+    _setup_fill_arg,
+    _setup_number_or_seq,
+    _setup_size,
+    get_bounding_boxes,
+    has_all,
+    has_any,
+    is_pure_tensor,
+    query_size,
+)
+
+
+class RandomHorizontalFlip(_RandomApplyTransform):
+    """Horizontally flip the input with a given probability.
+
+    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
+    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    Args:
+        p (float, optional): probability of the input being flipped. Default value is 0.5
+    """
+
+    _v1_transform_cls = _transforms.RandomHorizontalFlip
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(F.horizontal_flip, inpt)
+
+
+class RandomVerticalFlip(_RandomApplyTransform):
+    """Vertically flip the input with a given probability.
+
+    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
+    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    Args:
+        p (float, optional): probability of the input being flipped. Default value is 0.5
+    """
+
+    _v1_transform_cls = _transforms.RandomVerticalFlip
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(F.vertical_flip, inpt)
+
+
+class Resize(Transform):
+    """Resize the input to the given size.
+
+    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
+    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    Args:
+        size (sequence, int, or None): Desired
+            output size.
+
+            - If size is a sequence like (h, w), output size will be matched to this.
+            - If size is an int, smaller edge of the image will be matched to this
+              number.  i.e, if height > width, then image will be rescaled to
+              (size * height / width, size).
+            - If size is None, the output shape is determined by the ``max_size``
+              parameter.
+
+            .. note::
+                In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``.
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
+            ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        max_size (int, optional): The maximum allowed for the longer edge of
+            the resized image.
+
+            - If ``size`` is an int: if the longer edge of the image is greater
+              than ``max_size`` after being resized according to ``size``,
+              ``size`` will be overruled so that the longer edge is equal to
+              ``max_size``. As a result, the smaller edge may be shorter than
+              ``size``. This is only supported if ``size`` is an int (or a
+              sequence of length 1 in torchscript mode).
+            - If ``size`` is None: the longer edge of the image will be matched
+              to max_size.  i.e, if height > width, then image will be rescaled
+              to (max_size, max_size * width / height).
+
+            This should be left to ``None`` (default) when ``size`` is a
+            sequence.
+
+        antialias (bool, optional): Whether to apply antialiasing.
+            It only affects **tensors** with bilinear or bicubic modes and it is
+            ignored otherwise: on PIL images, antialiasing is always applied on
+            bilinear or bicubic modes; on other modes (for PIL images and
+            tensors), antialiasing makes no sense and this parameter is ignored.
+            Possible values are:
+
+            - ``True`` (default): will apply antialiasing for bilinear or bicubic modes.
+              Other mode aren't affected. This is probably what you want to use.
+            - ``False``: will not apply antialiasing for tensors on any mode. PIL
+              images are still antialiased on bilinear or bicubic modes, because
+              PIL doesn't support no antialias.
+            - ``None``: equivalent to ``False`` for tensors and ``True`` for
+              PIL images. This value exists for legacy reasons and you probably
+              don't want to use it unless you really know what you are doing.
+
+            The default value changed from ``None`` to ``True`` in
+            v0.17, for the PIL and Tensor backends to be consistent.
+    """
+
+    _v1_transform_cls = _transforms.Resize
+
+    def __init__(
+        self,
+        size: Union[int, Sequence[int], None],
+        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+        max_size: Optional[int] = None,
+        antialias: Optional[bool] = True,
+    ) -> None:
+        super().__init__()
+
+        if isinstance(size, int):
+            size = [size]
+        elif isinstance(size, Sequence) and len(size) in {1, 2}:
+            size = list(size)
+        elif size is None:
+            if not isinstance(max_size, int):
+                raise ValueError(f"max_size must be an integer when size is None, but got {max_size} instead.")
+        else:
+            raise ValueError(
+                f"size can be an integer, a sequence of one or two integers, or None, but got {size} instead."
+            )
+        self.size = size
+
+        self.interpolation = interpolation
+        self.max_size = max_size
+        self.antialias = antialias
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(
+            F.resize,
+            inpt,
+            self.size,
+            interpolation=self.interpolation,
+            max_size=self.max_size,
+            antialias=self.antialias,
+        )
+
+
+class CenterCrop(Transform):
+    """Crop the input at the center.
+
+    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
+    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    If image size is smaller than output size along any edge, image is padded with 0 and then center cropped.
+
+    Args:
+        size (sequence or int): Desired output size of the crop. If size is an
+            int instead of sequence like (h, w), a square crop (size, size) is
+            made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
+    """
+
+    _v1_transform_cls = _transforms.CenterCrop
+
+    def __init__(self, size: Union[int, Sequence[int]]):
+        super().__init__()
+        self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(F.center_crop, inpt, output_size=self.size)
+
+
+class RandomResizedCrop(Transform):
+    """Crop a random portion of the input and resize it to a given size.
+
+    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
+    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    A crop of the original input is made: the crop has a random area (H * W)
+    and a random aspect ratio. This crop is finally resized to the given
+    size. This is popularly used to train the Inception networks.
+
+    Args:
+        size (int or sequence): expected output size of the crop, for each edge. If size is an
+            int instead of sequence like (h, w), a square output size ``(size, size)`` is
+            made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
+
+            .. note::
+                In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``.
+        scale (tuple of float, optional): Specifies the lower and upper bounds for the random area of the crop,
+            before resizing. The scale is defined with respect to the area of the original image.
+        ratio (tuple of float, optional): lower and upper bounds for the random aspect ratio of the crop, before
+            resizing.
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
+            ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        antialias (bool, optional): Whether to apply antialiasing.
+            It only affects **tensors** with bilinear or bicubic modes and it is
+            ignored otherwise: on PIL images, antialiasing is always applied on
+            bilinear or bicubic modes; on other modes (for PIL images and
+            tensors), antialiasing makes no sense and this parameter is ignored.
+            Possible values are:
+
+            - ``True`` (default): will apply antialiasing for bilinear or bicubic modes.
+              Other mode aren't affected. This is probably what you want to use.
+            - ``False``: will not apply antialiasing for tensors on any mode. PIL
+              images are still antialiased on bilinear or bicubic modes, because
+              PIL doesn't support no antialias.
+            - ``None``: equivalent to ``False`` for tensors and ``True`` for
+              PIL images. This value exists for legacy reasons and you probably
+              don't want to use it unless you really know what you are doing.
+
+            The default value changed from ``None`` to ``True`` in
+            v0.17, for the PIL and Tensor backends to be consistent.
+    """
+
+    _v1_transform_cls = _transforms.RandomResizedCrop
+
+    def __init__(
+        self,
+        size: Union[int, Sequence[int]],
+        scale: tuple[float, float] = (0.08, 1.0),
+        ratio: tuple[float, float] = (3.0 / 4.0, 4.0 / 3.0),
+        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+        antialias: Optional[bool] = True,
+    ) -> None:
+        super().__init__()
+        self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
+
+        if not isinstance(scale, Sequence) or len(scale) != 2:
+            raise TypeError("Scale should be a sequence of two floats.")
+        if not isinstance(ratio, Sequence) or len(ratio) != 2:
+            raise TypeError("Ratio should be a sequence of two floats.")
+        if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):
+            warnings.warn("Scale and ratio should be of kind (min, max)")
+
+        self.scale = scale
+        self.ratio = ratio
+        self.interpolation = interpolation
+        self.antialias = antialias
+
+        self._log_ratio = torch.log(torch.tensor(self.ratio))
+
+    def make_params(self, flat_inputs: list[Any]) -> dict[str, Any]:
+        height, width = query_size(flat_inputs)
+        area = height * width
+
+        log_ratio = self._log_ratio
+        for _ in range(10):
+            target_area = area * torch.empty(1).uniform_(self.scale[0], self.scale[1]).item()
+            aspect_ratio = torch.exp(
+                torch.empty(1).uniform_(
+                    log_ratio[0],  # type: ignore[arg-type]
+                    log_ratio[1],  # type: ignore[arg-type]
+                )
+            ).item()
+
+            w = int(round(math.sqrt(target_area * aspect_ratio)))
+            h = int(round(math.sqrt(target_area / aspect_ratio)))
+
+            if 0 < w <= width and 0 < h <= height:
+                i = torch.randint(0, height - h + 1, size=(1,)).item()
+                j = torch.randint(0, width - w + 1, size=(1,)).item()
+                break
+        else:
+            # Fallback to central crop
+            in_ratio = float(width) / float(height)
+            if in_ratio < min(self.ratio):
+                w = width
+                h = int(round(w / min(self.ratio)))
+            elif in_ratio > max(self.ratio):
+                h = height
+                w = int(round(h * max(self.ratio)))
+            else:  # whole image
+                w = width
+                h = height
+            i = (height - h) // 2
+            j = (width - w) // 2
+
+        return dict(top=i, left=j, height=h, width=w)
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(
+            F.resized_crop, inpt, **params, size=self.size, interpolation=self.interpolation, antialias=self.antialias
+        )
+
+
+class FiveCrop(Transform):
+    """Crop the image or video into four corners and the central crop.
+
+    If the input is a :class:`torch.Tensor` or a :class:`~torchvision.tv_tensors.Image` or a
+    :class:`~torchvision.tv_tensors.Video` it can have arbitrary number of leading batch dimensions.
+    For example, the image can have ``[..., C, H, W]`` shape.
+
+    .. Note::
+         This transform returns a tuple of images and there may be a mismatch in the number of
+         inputs and targets your Dataset returns. See below for an example of how to deal with
+         this.
+
+    Args:
+         size (sequence or int): Desired output size of the crop. If size is an ``int``
+            instead of sequence like (h, w), a square crop of size (size, size) is made.
+            If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
+
+    Example:
+        >>> class BatchMultiCrop(transforms.Transform):
+        ...     def forward(self, sample: Tuple[Tuple[Union[tv_tensors.Image, tv_tensors.Video], ...], int]):
+        ...         images_or_videos, labels = sample
+        ...         batch_size = len(images_or_videos)
+        ...         image_or_video = images_or_videos[0]
+        ...         images_or_videos = tv_tensors.wrap(torch.stack(images_or_videos), like=image_or_video)
+        ...         labels = torch.full((batch_size,), label, device=images_or_videos.device)
+        ...         return images_or_videos, labels
+        ...
+        >>> image = tv_tensors.Image(torch.rand(3, 256, 256))
+        >>> label = 3
+        >>> transform = transforms.Compose([transforms.FiveCrop(224), BatchMultiCrop()])
+        >>> images, labels = transform(image, label)
+        >>> images.shape
+        torch.Size([5, 3, 224, 224])
+        >>> labels
+        tensor([3, 3, 3, 3, 3])
+    """
+
+    _v1_transform_cls = _transforms.FiveCrop
+
+    def __init__(self, size: Union[int, Sequence[int]]) -> None:
+        super().__init__()
+        self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
+
+    def _call_kernel(self, functional: Callable, inpt: Any, *args: Any, **kwargs: Any) -> Any:
+        if isinstance(inpt, (tv_tensors.BoundingBoxes, tv_tensors.KeyPoints, tv_tensors.Mask)):
+            warnings.warn(
+                f"{type(self).__name__}() is currently passing through inputs of type "
+                f"tv_tensors.{type(inpt).__name__}. This will likely change in the future."
+            )
+        return super()._call_kernel(functional, inpt, *args, **kwargs)
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(F.five_crop, inpt, self.size)
+
+    def check_inputs(self, flat_inputs: list[Any]) -> None:
+        if has_any(flat_inputs, tv_tensors.BoundingBoxes, tv_tensors.Mask):
+            raise TypeError(f"BoundingBoxes'es and Mask's are not supported by {type(self).__name__}()")
+
+
+class TenCrop(Transform):
+    """Crop the image or video into four corners and the central crop plus the flipped version of
+    these (horizontal flipping is used by default).
+
+    If the input is a :class:`torch.Tensor` or a :class:`~torchvision.tv_tensors.Image` or a
+    :class:`~torchvision.tv_tensors.Video` it can have arbitrary number of leading batch dimensions.
+    For example, the image can have ``[..., C, H, W]`` shape.
+
+    See :class:`~torchvision.transforms.v2.FiveCrop` for an example.
+
+    .. Note::
+         This transform returns a tuple of images and there may be a mismatch in the number of
+         inputs and targets your Dataset returns. See below for an example of how to deal with
+         this.
+
+    Args:
+        size (sequence or int): Desired output size of the crop. If size is an
+            int instead of sequence like (h, w), a square crop (size, size) is
+            made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
+        vertical_flip (bool, optional): Use vertical flipping instead of horizontal
+    """
+
+    _v1_transform_cls = _transforms.TenCrop
+
+    def __init__(self, size: Union[int, Sequence[int]], vertical_flip: bool = False) -> None:
+        super().__init__()
+        self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
+        self.vertical_flip = vertical_flip
+
+    def _call_kernel(self, functional: Callable, inpt: Any, *args: Any, **kwargs: Any) -> Any:
+        if isinstance(inpt, (tv_tensors.BoundingBoxes, tv_tensors.KeyPoints, tv_tensors.Mask)):
+            warnings.warn(
+                f"{type(self).__name__}() is currently passing through inputs of type "
+                f"tv_tensors.{type(inpt).__name__}. This will likely change in the future."
+            )
+        return super()._call_kernel(functional, inpt, *args, **kwargs)
+
+    def check_inputs(self, flat_inputs: list[Any]) -> None:
+        if has_any(flat_inputs, tv_tensors.BoundingBoxes, tv_tensors.Mask):
+            raise TypeError(f"BoundingBoxes'es and Mask's are not supported by {type(self).__name__}()")
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(F.ten_crop, inpt, self.size, vertical_flip=self.vertical_flip)
+
+
+class Pad(Transform):
+    """Pad the input on all sides with the given "pad" value.
+
+    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
+    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    Args:
+        padding (int or sequence): Padding on each border. If a single int is provided this
+            is used to pad all borders. If sequence of length 2 is provided this is the padding
+            on left/right and top/bottom respectively. If a sequence of length 4 is provided
+            this is the padding for the left, top, right and bottom borders respectively.
+
+            .. note::
+                In torchscript mode padding as single int is not supported, use a sequence of
+                length 1: ``[padding, ]``.
+        fill (number or tuple or dict, optional): Pixel fill value used when the  ``padding_mode`` is constant.
+            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
+            Fill value can be also a dictionary mapping data type to the fill value, e.g.
+            ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
+            ``Mask`` will be filled with 0.
+        padding_mode (str, optional): Type of padding. Should be: constant, edge, reflect or symmetric.
+            Default is "constant".
+
+            - constant: pads with a constant value, this value is specified with fill
+
+            - edge: pads with the last value at the edge of the image.
+
+            - reflect: pads with reflection of image without repeating the last value on the edge.
+              For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode
+              will result in [3, 2, 1, 2, 3, 4, 3, 2]
+
+            - symmetric: pads with reflection of image repeating the last value on the edge.
+              For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode
+              will result in [2, 1, 1, 2, 3, 4, 4, 3]
+    """
+
+    _v1_transform_cls = _transforms.Pad
+
+    def _extract_params_for_v1_transform(self) -> dict[str, Any]:
+        params = super()._extract_params_for_v1_transform()
+
+        if not (params["fill"] is None or isinstance(params["fill"], (int, float))):
+            raise ValueError(f"{type(self).__name__}() can only be scripted for a scalar `fill`, but got {self.fill}.")
+
+        return params
+
+    def __init__(
+        self,
+        padding: Union[int, Sequence[int]],
+        fill: Union[_FillType, dict[Union[type, str], _FillType]] = 0,
+        padding_mode: Literal["constant", "edge", "reflect", "symmetric"] = "constant",
+    ) -> None:
+        super().__init__()
+
+        _check_padding_arg(padding)
+        _check_padding_mode_arg(padding_mode)
+
+        # This cast does Sequence[int] -> List[int] and is required to make mypy happy
+        if not isinstance(padding, int):
+            padding = list(padding)
+        self.padding = padding
+        self.fill = fill
+        self._fill = _setup_fill_arg(fill)
+        self.padding_mode = padding_mode
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        fill = _get_fill(self._fill, type(inpt))
+        return self._call_kernel(F.pad, inpt, padding=self.padding, fill=fill, padding_mode=self.padding_mode)  # type: ignore[arg-type]
+
+
+class RandomZoomOut(_RandomApplyTransform):
+    """ "Zoom out" transformation from
+    `"SSD: Single Shot MultiBox Detector" <https://arxiv.org/abs/1512.02325>`_.
+
+    This transformation randomly pads images, videos, bounding boxes and masks creating a zoom out effect.
+    Output spatial size is randomly sampled from original size up to a maximum size configured
+    with ``side_range`` parameter:
+
+    .. code-block:: python
+
+        r = uniform_sample(side_range[0], side_range[1])
+        output_width = input_width * r
+        output_height = input_height * r
+
+    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
+    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    Args:
+        fill (number or tuple or dict, optional): Pixel fill value used when the  ``padding_mode`` is constant.
+            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
+            Fill value can be also a dictionary mapping data type to the fill value, e.g.
+            ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
+            ``Mask`` will be filled with 0.
+        side_range (sequence of floats, optional): tuple of two floats defines minimum and maximum factors to
+            scale the input size.
+        p (float, optional): probability that the zoom operation will be performed.
+    """
+
+    def __init__(
+        self,
+        fill: Union[_FillType, dict[Union[type, str], _FillType]] = 0,
+        side_range: Sequence[float] = (1.0, 4.0),
+        p: float = 0.5,
+    ) -> None:
+        super().__init__(p=p)
+
+        self.fill = fill
+        self._fill = _setup_fill_arg(fill)
+
+        _check_sequence_input(side_range, "side_range", req_sizes=(2,))
+
+        self.side_range = side_range
+        if side_range[0] < 1.0 or side_range[0] > side_range[1]:
+            raise ValueError(f"Invalid side range provided {side_range}.")
+
+    def make_params(self, flat_inputs: list[Any]) -> dict[str, Any]:
+        orig_h, orig_w = query_size(flat_inputs)
+
+        r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0])
+        canvas_width = int(orig_w * r)
+        canvas_height = int(orig_h * r)
+
+        r = torch.rand(2)
+        left = int((canvas_width - orig_w) * r[0])
+        top = int((canvas_height - orig_h) * r[1])
+        right = canvas_width - (left + orig_w)
+        bottom = canvas_height - (top + orig_h)
+        padding = [left, top, right, bottom]
+
+        return dict(padding=padding)
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        fill = _get_fill(self._fill, type(inpt))
+        return self._call_kernel(F.pad, inpt, **params, fill=fill)
+
+
+class RandomRotation(Transform):
+    """Rotate the input by angle.
+
+    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
+    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    Args:
+        degrees (sequence or number): Range of degrees to select from.
+            If degrees is a number instead of sequence like (min, max), the range of degrees
+            will be [-degrees, +degrees].
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        expand (bool, optional): Optional expansion flag.
+            If true, expands the output to make it large enough to hold the entire rotated image.
+            If false or omitted, make the output image the same size as the input image.
+            Note that the expand flag assumes rotation around the center (see note below) and no translation.
+        center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner.
+            Default is the center of the image.
+
+            .. note::
+
+                In theory, setting ``center`` has no effect if ``expand=True``, since the image center will become the
+                center of rotation. In practice however, due to numerical precision, this can lead to off-by-one
+                differences of the resulting image size compared to using the image center in the first place. Thus, when
+                setting ``expand=True``, it's best to leave ``center=None`` (default).
+        fill (number or tuple or dict, optional): Pixel fill value used when the  ``padding_mode`` is constant.
+            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
+            Fill value can be also a dictionary mapping data type to the fill value, e.g.
+            ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
+            ``Mask`` will be filled with 0.
+
+    .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters
+
+    """
+
+    _v1_transform_cls = _transforms.RandomRotation
+
+    def __init__(
+        self,
+        degrees: Union[numbers.Number, Sequence],
+        interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST,
+        expand: bool = False,
+        center: Optional[list[float]] = None,
+        fill: Union[_FillType, dict[Union[type, str], _FillType]] = 0,
+    ) -> None:
+        super().__init__()
+        self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2,))
+        self.interpolation = interpolation
+        self.expand = expand
+
+        self.fill = fill
+        self._fill = _setup_fill_arg(fill)
+
+        if center is not None:
+            _check_sequence_input(center, "center", req_sizes=(2,))
+
+        self.center = center
+
+    def make_params(self, flat_inputs: list[Any]) -> dict[str, Any]:
+        angle = torch.empty(1).uniform_(self.degrees[0], self.degrees[1]).item()
+        return dict(angle=angle)
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        fill = _get_fill(self._fill, type(inpt))
+        return self._call_kernel(
+            F.rotate,
+            inpt,
+            **params,
+            interpolation=self.interpolation,
+            expand=self.expand,
+            center=self.center,
+            fill=fill,
+        )
+
+
+class RandomAffine(Transform):
+    """Random affine transformation of the input keeping center invariant.
+
+    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
+    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    Args:
+        degrees (sequence or number): Range of degrees to select from.
+            If degrees is a number instead of sequence like (min, max), the range of degrees
+            will be (-degrees, +degrees). Set to 0 to deactivate rotations.
+        translate (tuple, optional): tuple of maximum absolute fraction for horizontal
+            and vertical translations. For example translate=(a, b), then horizontal shift
+            is randomly sampled in the range -img_width * a < dx < img_width * a and vertical shift is
+            randomly sampled in the range -img_height * b < dy < img_height * b. Will not translate by default.
+        scale (tuple, optional): scaling factor interval, e.g. (a, b), then scale is
+            randomly sampled from the range a <= scale <= b. Will keep original scale by default.
+        shear (sequence or number, optional): Range of degrees to select from.
+            If shear is a number, a shear parallel to the x-axis in the range (-shear, +shear)
+            will be applied. Else if shear is a sequence of 2 values a shear parallel to the x-axis in the
+            range (shear[0], shear[1]) will be applied. Else if shear is a sequence of 4 values,
+            an x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied.
+            Will not apply shear by default.
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        fill (number or tuple or dict, optional): Pixel fill value used when the  ``padding_mode`` is constant.
+            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
+            Fill value can be also a dictionary mapping data type to the fill value, e.g.
+            ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
+            ``Mask`` will be filled with 0.
+        center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner.
+            Default is the center of the image.
+
+    .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters
+
+    """
+
+    _v1_transform_cls = _transforms.RandomAffine
+
+    def __init__(
+        self,
+        degrees: Union[numbers.Number, Sequence],
+        translate: Optional[Sequence[float]] = None,
+        scale: Optional[Sequence[float]] = None,
+        shear: Optional[Union[int, float, Sequence[float]]] = None,
+        interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST,
+        fill: Union[_FillType, dict[Union[type, str], _FillType]] = 0,
+        center: Optional[list[float]] = None,
+    ) -> None:
+        super().__init__()
+        self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2,))  # indexed as a (min, max) pair in make_params
+        if translate is not None:
+            _check_sequence_input(translate, "translate", req_sizes=(2,))
+            for t in translate:
+                if not (0.0 <= t <= 1.0):  # fractions of the image size, so each must lie in [0, 1]
+                    raise ValueError("translation values should be between 0 and 1")
+        self.translate = translate
+        if scale is not None:
+            _check_sequence_input(scale, "scale", req_sizes=(2,))
+            for s in scale:
+                if s <= 0:
+                    raise ValueError("scale values should be positive")
+        self.scale = scale
+
+        if shear is not None:
+            self.shear = _setup_angle(shear, name="shear", req_sizes=(2, 4))  # make_params reads 2 or 4 bounds from this
+        else:
+            self.shear = shear  # i.e. None: make_params then leaves both shears at 0.0
+
+        self.interpolation = interpolation
+        self.fill = fill  # raw user value; the processed form used by transform() is self._fill
+        self._fill = _setup_fill_arg(fill)
+
+        if center is not None:
+            _check_sequence_input(center, "center", req_sizes=(2,))
+
+        self.center = center
+
+    def make_params(self, flat_inputs: list[Any]) -> dict[str, Any]:
+        height, width = query_size(flat_inputs)  # used below only to turn the translate fractions into pixel bounds
+
+        angle = torch.empty(1).uniform_(self.degrees[0], self.degrees[1]).item()
+        if self.translate is not None:
+            max_dx = float(self.translate[0] * width)
+            max_dy = float(self.translate[1] * height)
+            tx = int(round(torch.empty(1).uniform_(-max_dx, max_dx).item()))  # sampled symmetrically, then rounded to whole pixels
+            ty = int(round(torch.empty(1).uniform_(-max_dy, max_dy).item()))
+            translate = (tx, ty)
+        else:
+            translate = (0, 0)
+
+        if self.scale is not None:
+            scale = torch.empty(1).uniform_(self.scale[0], self.scale[1]).item()
+        else:
+            scale = 1.0
+
+        shear_x = shear_y = 0.0
+        if self.shear is not None:
+            shear_x = torch.empty(1).uniform_(self.shear[0], self.shear[1]).item()
+            if len(self.shear) == 4:  # a y-axis shear is sampled only when four bounds were configured
+                shear_y = torch.empty(1).uniform_(self.shear[2], self.shear[3]).item()
+
+        shear = (shear_x, shear_y)
+        return dict(angle=angle, translate=translate, scale=scale, shear=shear)
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        fill = _get_fill(self._fill, type(inpt))  # fill is looked up by the concrete type of this input
+        return self._call_kernel(
+            F.affine,
+            inpt,
+            **params,  # presumably the angle/translate/scale/shear dict built by make_params
+            interpolation=self.interpolation,
+            fill=fill,
+            center=self.center,
+        )
+
+
+class RandomCrop(Transform):
+    """Crop the input at a random location.
+
+    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
+    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    Args:
+        size (sequence or int): Desired output size of the crop. If size is an
+            int instead of sequence like (h, w), a square crop (size, size) is
+            made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
+        padding (int or sequence, optional): Optional padding on each border
+            of the image, applied before cropping. Default is None. If a single int is provided this
+            is used to pad all borders. If sequence of length 2 is provided this is the padding
+            on left/right and top/bottom respectively. If a sequence of length 4 is provided
+            this is the padding for the left, top, right and bottom borders respectively.
+
+            .. note::
+                In torchscript mode padding as single int is not supported, use a sequence of
+                length 1: ``[padding, ]``.
+        pad_if_needed (boolean, optional): It will pad the image if smaller than the
+            desired size to avoid raising an exception. Since cropping is done
+            after padding, the padding seems to be done at a random offset.
+        fill (number or tuple or dict, optional): Pixel fill value used when the  ``padding_mode`` is constant.
+            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
+            Fill value can be also a dictionary mapping data type to the fill value, e.g.
+            ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
+            ``Mask`` will be filled with 0.
+        padding_mode (str, optional): Type of padding. Should be: constant, edge, reflect or symmetric.
+            Default is constant.
+
+            - constant: pads with a constant value, this value is specified with fill
+
+            - edge: pads with the last value at the edge of the image.
+
+            - reflect: pads with reflection of image without repeating the last value on the edge.
+              For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode
+              will result in [3, 2, 1, 2, 3, 4, 3, 2]
+
+            - symmetric: pads with reflection of image repeating the last value on the edge.
+              For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode
+              will result in [2, 1, 1, 2, 3, 4, 4, 3]
+    """
+
+    _v1_transform_cls = _transforms.RandomCrop
+
+    def _extract_params_for_v1_transform(self) -> dict[str, Any]:
+        params = super()._extract_params_for_v1_transform()
+
+        if not (params["fill"] is None or isinstance(params["fill"], (int, float))):  # only None or a scalar fill is accepted here
+            raise ValueError(f"{type(self).__name__}() can only be scripted for a scalar `fill`, but got {self.fill}.")
+
+        padding = self.padding
+        if padding is not None:
+            pad_left, pad_right, pad_top, pad_bottom = padding  # self.padding is stored as (left, right, top, bottom)
+            padding = [pad_left, pad_top, pad_right, pad_bottom]  # reordered to the documented (left, top, right, bottom)
+        params["padding"] = padding
+
+        return params
+
+    def __init__(
+        self,
+        size: Union[int, Sequence[int]],
+        padding: Optional[Union[int, Sequence[int]]] = None,
+        pad_if_needed: bool = False,
+        fill: Union[_FillType, dict[Union[type, str], _FillType]] = 0,
+        padding_mode: Literal["constant", "edge", "reflect", "symmetric"] = "constant",
+    ) -> None:
+        super().__init__()
+
+        self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
+
+        if pad_if_needed or padding is not None:  # padding arguments are validated only when padding can actually happen
+            if padding is not None:
+                _check_padding_arg(padding)
+            _check_padding_mode_arg(padding_mode)
+
+        self.padding = F._geometry._parse_pad_padding(padding) if padding else None  # type: ignore[arg-type]  # falsy padding (None, 0, empty) is stored as None
+        self.pad_if_needed = pad_if_needed
+        self.fill = fill  # raw user value; the processed form used by transform() is self._fill
+        self._fill = _setup_fill_arg(fill)
+        self.padding_mode = padding_mode
+
+    def make_params(self, flat_inputs: list[Any]) -> dict[str, Any]:
+        padded_height, padded_width = query_size(flat_inputs)  # start from the input size; explicit padding is added below
+
+        if self.padding is not None:
+            pad_left, pad_right, pad_top, pad_bottom = self.padding
+            padded_height += pad_top + pad_bottom
+            padded_width += pad_left + pad_right
+        else:
+            pad_left = pad_right = pad_top = pad_bottom = 0
+
+        cropped_height, cropped_width = self.size
+
+        if self.pad_if_needed:
+            if padded_height < cropped_height:
+                diff = cropped_height - padded_height
+
+                pad_top += diff  # the whole deficit is added on BOTH sides, not split between them
+                pad_bottom += diff
+                padded_height += 2 * diff  # now exceeds the target by `diff`; the random crop offset below absorbs the excess
+
+            if padded_width < cropped_width:
+                diff = cropped_width - padded_width
+
+                pad_left += diff  # same scheme as for the height above
+                pad_right += diff
+                padded_width += 2 * diff
+
+        if padded_height < cropped_height or padded_width < cropped_width:
+            raise ValueError(
+                f"Required crop size {(cropped_height, cropped_width)} is larger than "
+                f"{'padded ' if self.padding is not None else ''}input image size {(padded_height, padded_width)}."
+            )
+
+        # We need a different order here than we have in self.padding since this padding will be parsed again in `F.pad`
+        padding = [pad_left, pad_top, pad_right, pad_bottom]
+        needs_pad = any(padding)  # True as soon as one side has a non-zero amount
+
+        needs_vert_crop, top = (
+            (True, int(torch.randint(0, padded_height - cropped_height + 1, size=())))  # +1: randint's upper bound is exclusive
+            if padded_height > cropped_height
+            else (False, 0)
+        )
+        needs_horz_crop, left = (
+            (True, int(torch.randint(0, padded_width - cropped_width + 1, size=())))  # +1: randint's upper bound is exclusive
+            if padded_width > cropped_width
+            else (False, 0)
+        )
+
+        return dict(
+            needs_crop=needs_vert_crop or needs_horz_crop,
+            top=top,
+            left=left,
+            height=cropped_height,
+            width=cropped_width,
+            needs_pad=needs_pad,
+            padding=padding,
+        )
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        if params["needs_pad"]:  # padding runs first, so the crop offsets refer to the padded input
+            fill = _get_fill(self._fill, type(inpt))
+            inpt = self._call_kernel(F.pad, inpt, padding=params["padding"], fill=fill, padding_mode=self.padding_mode)
+
+        if params["needs_crop"]:  # skipped when the (padded) input already has exactly the target size
+            inpt = self._call_kernel(
+                F.crop, inpt, top=params["top"], left=params["left"], height=params["height"], width=params["width"]
+            )
+
+        return inpt
+
+
+class RandomPerspective(_RandomApplyTransform):
+    """Perform a random perspective transformation of the input with a given probability.
+
+    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
+    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    Args:
+        distortion_scale (float, optional): argument to control the degree of distortion and ranges from 0 to 1.
+            Default is 0.5.
+        p (float, optional): probability of the input being transformed. Default is 0.5.
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        fill (number or tuple or dict, optional): Pixel fill value used when the  ``padding_mode`` is constant.
+            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
+            Fill value can be also a dictionary mapping data type to the fill value, e.g.
+            ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
+            ``Mask`` will be filled with 0.
+    """
+
+    _v1_transform_cls = _transforms.RandomPerspective
+
+    def __init__(
+        self,
+        distortion_scale: float = 0.5,
+        p: float = 0.5,
+        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+        fill: Union[_FillType, dict[Union[type, str], _FillType]] = 0,
+    ) -> None:
+        super().__init__(p=p)
+
+        if not (0 <= distortion_scale <= 1):
+            raise ValueError("Argument distortion_scale value should be between 0 and 1")
+
+        self.distortion_scale = distortion_scale
+        self.interpolation = interpolation
+        self.fill = fill  # raw user value; the processed form used by transform() is self._fill
+        self._fill = _setup_fill_arg(fill)
+
+    def make_params(self, flat_inputs: list[Any]) -> dict[str, Any]:
+        height, width = query_size(flat_inputs)
+
+        distortion_scale = self.distortion_scale
+
+        half_height = height // 2
+        half_width = width // 2
+        bound_height = int(distortion_scale * half_height) + 1  # +1 keeps the randint ranges below non-empty (upper bound is exclusive)
+        bound_width = int(distortion_scale * half_width) + 1
+        topleft = [  # each corner is an [x, y] pair drawn from a band next to that corner
+            int(torch.randint(0, bound_width, size=(1,))),
+            int(torch.randint(0, bound_height, size=(1,))),
+        ]
+        topright = [
+            int(torch.randint(width - bound_width, width, size=(1,))),
+            int(torch.randint(0, bound_height, size=(1,))),
+        ]
+        botright = [
+            int(torch.randint(width - bound_width, width, size=(1,))),
+            int(torch.randint(height - bound_height, height, size=(1,))),
+        ]
+        botleft = [
+            int(torch.randint(0, bound_width, size=(1,))),
+            int(torch.randint(height - bound_height, height, size=(1,))),
+        ]
+        startpoints = [[0, 0], [width - 1, 0], [width - 1, height - 1], [0, height - 1]]  # image corners, same order as endpoints
+        endpoints = [topleft, topright, botright, botleft]
+        perspective_coeffs = _get_perspective_coeffs(startpoints, endpoints)
+        return dict(coefficients=perspective_coeffs)
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        fill = _get_fill(self._fill, type(inpt))  # fill is looked up by the concrete type of this input
+        return self._call_kernel(
+            F.perspective,
+            inpt,
+            startpoints=None,  # no point pairs are passed; presumably ``coefficients`` from make_params arrives via **params
+            endpoints=None,
+            fill=fill,
+            interpolation=self.interpolation,
+            **params,
+        )
+
+
+class ElasticTransform(Transform):
+    """Transform the input with elastic transformations.
+
+    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
+    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    Given alpha and sigma, it will generate displacement
+    vectors for all pixels based on random offsets. Alpha controls the strength
+    and sigma controls the smoothness of the displacements.
+    The displacements are added to an identity grid and the resulting grid is
+    used to transform the input.
+
+    .. note::
+        Implementation to transform bounding boxes is approximate (not exact).
+        We construct an approximation of the inverse grid as ``inverse_grid = identity - displacement``.
+        This is not an exact inverse of the grid used to transform images, i.e. ``grid = identity + displacement``.
+        Our assumption is that ``displacement * displacement`` is small and can be ignored.
+        Large displacements would lead to large errors in the approximation.
+
+    Applications:
+        Randomly transforms the morphology of objects in images and produces a
+        see-through-water-like effect.
+
+    Args:
+        alpha (float or sequence of floats, optional): Magnitude of displacements. Default is 50.0.
+        sigma (float or sequence of floats, optional): Smoothness of displacements. Default is 5.0.
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        fill (number or tuple or dict, optional): Pixel fill value used when the  ``padding_mode`` is constant.
+            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
+            Fill value can be also a dictionary mapping data type to the fill value, e.g.
+            ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
+            ``Mask`` will be filled with 0.
+    """
+
+    _v1_transform_cls = _transforms.ElasticTransform
+
+    def __init__(
+        self,
+        alpha: Union[float, Sequence[float]] = 50.0,
+        sigma: Union[float, Sequence[float]] = 5.0,
+        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+        fill: Union[_FillType, dict[Union[type, str], _FillType]] = 0,
+    ) -> None:
+        super().__init__()
+        self.alpha = _setup_number_or_seq(alpha, "alpha")  # read as alpha[0] / alpha[1] in make_params; presumably normalised to a pair
+        self.sigma = _setup_number_or_seq(sigma, "sigma")  # read as sigma[0] / sigma[1] in make_params; presumably normalised to a pair
+
+        self.interpolation = interpolation
+        self.fill = fill  # raw user value; the processed form used by transform() is self._fill
+        self._fill = _setup_fill_arg(fill)
+
+    def make_params(self, flat_inputs: list[Any]) -> dict[str, Any]:
+        size = list(query_size(flat_inputs))  # a list so it can be concatenated with [1, 1] below
+
+        dx = torch.rand([1, 1] + size) * 2 - 1  # uniform noise rescaled from [0, 1) to [-1, 1)
+        if self.sigma[0] > 0.0:
+            kx = int(8 * self.sigma[0] + 1)
+            # if kernel size is even we have to make it odd
+            if kx % 2 == 0:
+                kx += 1
+            dx = self._call_kernel(F.gaussian_blur, dx, [kx, kx], list(self.sigma))  # NOTE(review): only the kernel size uses sigma[0]; the full sigma list is passed
+        dx = dx * self.alpha[0] / size[0]  # NOTE(review): dx is divided by size[0], the first value from query_size — confirm axis
+
+        dy = torch.rand([1, 1] + size) * 2 - 1  # drawn independently of dx
+        if self.sigma[1] > 0.0:
+            ky = int(8 * self.sigma[1] + 1)
+            # if kernel size is even we have to make it odd
+            if ky % 2 == 0:
+                ky += 1
+            dy = self._call_kernel(F.gaussian_blur, dy, [ky, ky], list(self.sigma))  # NOTE(review): only the kernel size uses sigma[1]; the full sigma list is passed
+        dy = dy * self.alpha[1] / size[1]  # NOTE(review): dy is divided by size[1], the second value from query_size — confirm axis
+        displacement = torch.concat([dx, dy], 1).permute([0, 2, 3, 1])  # 1 x H x W x 2
+        return dict(displacement=displacement)
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        fill = _get_fill(self._fill, type(inpt))  # fill is looked up by the concrete type of this input
+        return self._call_kernel(
+            F.elastic,
+            inpt,
+            **params,  # presumably the ``displacement`` dict built by make_params
+            fill=fill,
+            interpolation=self.interpolation,
+        )
+
+
+class RandomIoUCrop(Transform):
+    """Random IoU crop transformation from
+    `"SSD: Single Shot MultiBox Detector" <https://arxiv.org/abs/1512.02325>`_.
+
+    This transformation requires an image or video data and ``tv_tensors.BoundingBoxes`` in the input.
+
+    .. warning::
+        In order to properly remove the bounding boxes below the IoU threshold, `RandomIoUCrop`
+        must be followed by :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes`, either immediately
+        after or later in the transforms pipeline.
+
+    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
+    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    Args:
+        min_scale (float, optional): Minimum factors to scale the input size.
+        max_scale (float, optional): Maximum factors to scale the input size.
+        min_aspect_ratio (float, optional): Minimum aspect ratio for the cropped image or video.
+        max_aspect_ratio (float, optional): Maximum aspect ratio for the cropped image or video.
+        sampler_options (list of float, optional): List of minimal IoU (Jaccard) overlap between all the boxes and
+            a cropped image or video. Default, ``None`` which corresponds to ``[0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]``
+        trials (int, optional): Number of trials to find a crop for a given value of minimal IoU (Jaccard) overlap.
+            Default, 40.
+    """
+
+    def __init__(
+        self,
+        min_scale: float = 0.3,
+        max_scale: float = 1.0,
+        min_aspect_ratio: float = 0.5,
+        max_aspect_ratio: float = 2.0,
+        sampler_options: Optional[list[float]] = None,
+        trials: int = 40,
+    ):
+        super().__init__()
+        # Configuration similar to https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_coco.py#L89-L174
+        self.min_scale = min_scale
+        self.max_scale = max_scale
+        self.min_aspect_ratio = min_aspect_ratio
+        self.max_aspect_ratio = max_aspect_ratio
+        if sampler_options is None:
+            sampler_options = [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
+        self.options = sampler_options  # note: stored under a different name than the constructor argument
+        self.trials = trials
+
+    def check_inputs(self, flat_inputs: list[Any]) -> None:
+        if not (
+            has_all(flat_inputs, tv_tensors.BoundingBoxes)
+            and has_any(flat_inputs, PIL.Image.Image, tv_tensors.Image, is_pure_tensor)
+        ):
+            raise TypeError(
+                f"{type(self).__name__}() requires input sample to contain tensor or PIL images "
+                "and bounding boxes. Sample can also contain masks."
+            )
+
+    def make_params(self, flat_inputs: list[Any]) -> dict[str, Any]:
+        orig_h, orig_w = query_size(flat_inputs)
+        bboxes = get_bounding_boxes(flat_inputs)
+
+        while True:  # NOTE(review): exits only through the two returns below; an option is re-drawn after every `trials` failures
+            # sample an option
+            idx = int(torch.randint(low=0, high=len(self.options), size=(1,)))
+            min_jaccard_overlap = self.options[idx]
+            if min_jaccard_overlap >= 1.0:  # a value of 1 or larger encodes the leave as-is option
+                return dict()
+
+            for _ in range(self.trials):
+                # check the aspect ratio limitations
+                r = self.min_scale + (self.max_scale - self.min_scale) * torch.rand(2)  # r[0] scales the width, r[1] the height
+                new_w = int(orig_w * r[0])
+                new_h = int(orig_h * r[1])
+                aspect_ratio = new_w / new_h  # NOTE(review): ZeroDivisionError if new_h truncated to 0; the zero-area check only runs later
+                if not (self.min_aspect_ratio <= aspect_ratio <= self.max_aspect_ratio):
+                    continue
+
+                # check for 0 area crops
+                r = torch.rand(2)  # fresh draw: now used as the relative position of the crop inside the input
+                left = int((orig_w - new_w) * r[0])
+                top = int((orig_h - new_h) * r[1])
+                right = left + new_w
+                bottom = top + new_h
+                if left == right or top == bottom:
+                    continue
+
+                # check for any valid boxes with centers within the crop area
+                xyxy_bboxes = F.convert_bounding_box_format(
+                    bboxes.as_subclass(torch.Tensor),
+                    bboxes.format,
+                    tv_tensors.BoundingBoxFormat.XYXY,
+                )
+                cx = 0.5 * (xyxy_bboxes[..., 0] + xyxy_bboxes[..., 2])
+                cy = 0.5 * (xyxy_bboxes[..., 1] + xyxy_bboxes[..., 3])
+                is_within_crop_area = (left < cx) & (cx < right) & (top < cy) & (cy < bottom)  # centers must lie strictly inside
+                if not is_within_crop_area.any():
+                    continue
+
+                # check at least 1 box with jaccard limitations
+                xyxy_bboxes = xyxy_bboxes[is_within_crop_area]  # IoU is evaluated only for boxes whose center is inside the crop
+                ious = box_iou(
+                    xyxy_bboxes,
+                    torch.tensor([[left, top, right, bottom]], dtype=xyxy_bboxes.dtype, device=xyxy_bboxes.device),
+                )
+                if ious.max() < min_jaccard_overlap:
+                    continue
+
+                return dict(top=top, left=left, height=new_h, width=new_w, is_within_crop_area=is_within_crop_area)
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+
+        if len(params) < 1:  # empty params is the "leave as-is" result of make_params
+            return inpt
+
+        output = self._call_kernel(
+            F.crop, inpt, top=params["top"], left=params["left"], height=params["height"], width=params["width"]
+        )
+
+        if isinstance(output, tv_tensors.BoundingBoxes):
+            # We "mark" the invalid boxes as degenerate, and they can be
+            # removed by a later call to SanitizeBoundingBoxes()
+            output[~params["is_within_crop_area"]] = 0  # boxes whose center fell outside the crop are zeroed, not dropped
+
+        return output
+
+
+class ScaleJitter(Transform):
+    """Perform Large Scale Jitter on the input according to
+    `"Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation" <https://arxiv.org/abs/2012.07177>`_.
+
+    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
+    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    Args:
+        target_size (tuple of int): Target size. This parameter defines base scale for jittering,
+            e.g. ``min(target_size[0] / width, target_size[1] / height)``.
+        scale_range (tuple of float, optional): Minimum and maximum of the scale range. Default, ``(0.1, 2.0)``.
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
+            ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        antialias (bool, optional): Whether to apply antialiasing.
+            It only affects **tensors** with bilinear or bicubic modes and it is
+            ignored otherwise: on PIL images, antialiasing is always applied on
+            bilinear or bicubic modes; on other modes (for PIL images and
+            tensors), antialiasing makes no sense and this parameter is ignored.
+            Possible values are:
+
+            - ``True`` (default): will apply antialiasing for bilinear or bicubic modes.
+              Other modes aren't affected. This is probably what you want to use.
+            - ``False``: will not apply antialiasing for tensors on any mode. PIL
+              images are still antialiased on bilinear or bicubic modes, because
+              PIL doesn't support no antialias.
+            - ``None``: equivalent to ``False`` for tensors and ``True`` for
+              PIL images. This value exists for legacy reasons and you probably
+              don't want to use it unless you really know what you are doing.
+
+            The default value changed from ``None`` to ``True`` in
+            v0.17, for the PIL and Tensor backends to be consistent.
+    """
+
+    def __init__(
+        self,
+        target_size: tuple[int, int],
+        scale_range: tuple[float, float] = (0.1, 2.0),
+        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+        antialias: Optional[bool] = True,
+    ):
+        super().__init__()
+        self.target_size = target_size  # stored as given; no validation happens here
+        self.scale_range = scale_range
+        self.interpolation = interpolation
+        self.antialias = antialias
+
+    def make_params(self, flat_inputs: list[Any]) -> dict[str, Any]:
+        orig_height, orig_width = query_size(flat_inputs)
+
+        scale = self.scale_range[0] + torch.rand(1) * (self.scale_range[1] - self.scale_range[0])  # 1-element tensor between the two bounds
+        r = min(self.target_size[1] / orig_height, self.target_size[0] / orig_width) * scale  # target_size[0] pairs with width, [1] with height
+        new_width = int(orig_width * r)  # int() truncates the 1-element tensor to a Python int
+        new_height = int(orig_height * r)
+
+        return dict(size=(new_height, new_width))
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(
+            F.resize, inpt, size=params["size"], interpolation=self.interpolation, antialias=self.antialias
+        )
+
+
+class RandomShortestSize(Transform):
+    """Randomly resize the input.
+
+    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
+    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    Args:
+        min_size (int or sequence of int): Minimum spatial size. Single integer value or a sequence of integer values.
+        max_size (int, optional): Maximum spatial size. Default, None.
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
+            ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        antialias (bool, optional): Whether to apply antialiasing.
+            It only affects **tensors** with bilinear or bicubic modes and it is
+            ignored otherwise: on PIL images, antialiasing is always applied on
+            bilinear or bicubic modes; on other modes (for PIL images and
+            tensors), antialiasing makes no sense and this parameter is ignored.
+            Possible values are:
+
+            - ``True`` (default): will apply antialiasing for bilinear or bicubic modes.
+              Other modes aren't affected. This is probably what you want to use.
+            - ``False``: will not apply antialiasing for tensors on any mode. PIL
+              images are still antialiased on bilinear or bicubic modes, because
+              PIL doesn't support no antialias.
+            - ``None``: equivalent to ``False`` for tensors and ``True`` for
+              PIL images. This value exists for legacy reasons and you probably
+              don't want to use it unless you really know what you are doing.
+
+            The default value changed from ``None`` to ``True`` in
+            v0.17, for the PIL and Tensor backends to be consistent.
+    """
+
+    def __init__(
+        self,
+        min_size: Union[list[int], tuple[int], int],
+        max_size: Optional[int] = None,
+        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+        antialias: Optional[bool] = True,
+    ):
+        super().__init__()
+        self.min_size = [min_size] if isinstance(min_size, int) else list(min_size)  # always a list so make_params can index it
+        self.max_size = max_size
+        self.interpolation = interpolation
+        self.antialias = antialias
+
+    def make_params(self, flat_inputs: list[Any]) -> dict[str, Any]:
+        orig_height, orig_width = query_size(flat_inputs)
+
+        min_size = self.min_size[int(torch.randint(len(self.min_size), ()))]  # pick one configured candidate at random
+        r = min_size / min(orig_height, orig_width)  # factor that maps the shorter side onto min_size
+        if self.max_size is not None:
+            r = min(r, self.max_size / max(orig_height, orig_width))  # shrink further if the longer side would exceed max_size
+
+        new_width = int(orig_width * r)  # int() truncates toward zero
+        new_height = int(orig_height * r)
+
+        return dict(size=(new_height, new_width))
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(
+            F.resize, inpt, size=params["size"], interpolation=self.interpolation, antialias=self.antialias
+        )
+
+
+class RandomResize(Transform):
+    """Randomly resize the input.
+
+    This transformation can be used together with ``RandomCrop`` as data augmentations to train
+    models on image segmentation task.
+
+    Output spatial size is randomly sampled from the half-open interval ``[min_size, max_size)``:
+
+    .. code-block:: python
+
+        size = uniform_sample(min_size, max_size)
+        output_width = size
+        output_height = size
+
+    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
+    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    Args:
+        min_size (int): Minimum output size for random sampling (inclusive)
+        max_size (int): Upper bound of the output size for random sampling (exclusive)
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
+            ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        antialias (bool, optional): Whether to apply antialiasing.
+            It only affects **tensors** with bilinear or bicubic modes and it is
+            ignored otherwise: on PIL images, antialiasing is always applied on
+            bilinear or bicubic modes; on other modes (for PIL images and
+            tensors), antialiasing makes no sense and this parameter is ignored.
+            Possible values are:
+
+            - ``True`` (default): will apply antialiasing for bilinear or bicubic modes.
+              Other modes aren't affected. This is probably what you want to use.
+            - ``False``: will not apply antialiasing for tensors on any mode. PIL
+              images are still antialiased on bilinear or bicubic modes, because
+              PIL doesn't support no antialias.
+            - ``None``: equivalent to ``False`` for tensors and ``True`` for
+              PIL images. This value exists for legacy reasons and you probably
+              don't want to use it unless you really know what you are doing.
+
+            The default value changed from ``None`` to ``True`` in
+            v0.17, for the PIL and Tensor backends to be consistent.
+    """
+
+    def __init__(
+        self,
+        min_size: int,
+        max_size: int,
+        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+        antialias: Optional[bool] = True,
+    ) -> None:
+        super().__init__()
+        self.min_size = min_size
+        self.max_size = max_size
+        self.interpolation = interpolation
+        self.antialias = antialias
+
+    def make_params(self, flat_inputs: list[Any]) -> dict[str, Any]:
+        size = int(torch.randint(self.min_size, self.max_size, ()))  # torch.randint excludes the upper bound
+        return dict(size=[size])  # NOTE(review): one-element size list; confirm how F.resize interprets it
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(
+            F.resize, inpt, params["size"], interpolation=self.interpolation, antialias=self.antialias
+        )
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_meta.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_meta.py
new file mode 100644
index 0000000000000000000000000000000000000000..39f223f0398c836b9d109faf817526376fece7d2
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_meta.py
@@ -0,0 +1,81 @@
+from typing import Any, Union
+
+from torchvision import tv_tensors
+from torchvision.transforms.v2 import functional as F, Transform
+from torchvision.tv_tensors._bounding_boxes import CLAMPING_MODE_TYPE
+
+
+class ConvertBoundingBoxFormat(Transform):
+    """Convert bounding box coordinates to the given ``format``, e.g. from "CXCYWH" to "XYXY".
+
+    Args:
+        format (str or tv_tensors.BoundingBoxFormat): output bounding box format.
+            Possible values are defined by :class:`~torchvision.tv_tensors.BoundingBoxFormat` and
+            string values match the enums, e.g. "XYXY" or "XYWH" etc.
+    """
+
+    _transformed_types = (tv_tensors.BoundingBoxes,)  # only bounding boxes are transformed
+
+    def __init__(self, format: Union[str, tv_tensors.BoundingBoxFormat]) -> None:
+        super().__init__()
+        self.format = format
+
+    def transform(self, inpt: tv_tensors.BoundingBoxes, params: dict[str, Any]) -> tv_tensors.BoundingBoxes:
+        return F.convert_bounding_box_format(inpt, new_format=self.format)  # type: ignore[return-value, arg-type]
+
+
+class ClampBoundingBoxes(Transform):
+    """Clamp bounding boxes to their corresponding image dimensions.
+
+    Args:
+        clamping_mode: Default is "auto", which relies on the input boxes'
+            ``clamping_mode`` attribute. See :ref:`clamping_mode_tuto`
+            for more details on how to use this transform.
+    """
+
+    def __init__(self, clamping_mode: Union[CLAMPING_MODE_TYPE, str] = "auto") -> None:
+        super().__init__()
+        self.clamping_mode = clamping_mode
+
+    _transformed_types = (tv_tensors.BoundingBoxes,)  # only bounding boxes are transformed
+
+    def transform(self, inpt: tv_tensors.BoundingBoxes, params: dict[str, Any]) -> tv_tensors.BoundingBoxes:
+        return F.clamp_bounding_boxes(inpt, clamping_mode=self.clamping_mode)  # type: ignore[return-value]
+
+
+class ClampKeyPoints(Transform):
+    """Clamp keypoints to their corresponding image dimensions.
+
+    The clamping is done according to the keypoints' ``canvas_size`` meta-data.
+    """
+
+    _transformed_types = (tv_tensors.KeyPoints,)  # only keypoints are transformed
+
+    def transform(self, inpt: tv_tensors.KeyPoints, params: dict[str, Any]) -> tv_tensors.KeyPoints:
+        return F.clamp_keypoints(inpt)  # type: ignore[return-value]
+
+
+class SetClampingMode(Transform):
+    """Sets the ``clamping_mode`` attribute of the bounding boxes for future transforms.
+
+    The attribute is set on a clone of the input boxes, not on the input object itself.
+
+    Args:
+        clamping_mode: The clamping mode to set. Possible values are: "soft",
+            "hard", or ``None``. Read more in :ref:`clamping_mode_tuto` for more
+            details on how to use this transform.
+    """
+
+    def __init__(self, clamping_mode: CLAMPING_MODE_TYPE) -> None:
+        super().__init__()
+        self.clamping_mode = clamping_mode
+
+        if self.clamping_mode not in (None, "soft", "hard"):  # "auto" is not accepted here
+            raise ValueError(f"clamping_mode must be soft, hard or None, got {clamping_mode}")
+
+    _transformed_types = (tv_tensors.BoundingBoxes,)  # only bounding boxes are transformed
+
+    def transform(self, inpt: tv_tensors.BoundingBoxes, params: dict[str, Any]) -> tv_tensors.BoundingBoxes:
+        out: tv_tensors.BoundingBoxes = inpt.clone()  # type: ignore[assignment]
+        out.clamping_mode = self.clamping_mode
+        return out
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_misc.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..305149c87b115a7e6789979c224c71c53645d596
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_misc.py
@@ -0,0 +1,570 @@
+import warnings
+from collections.abc import Sequence
+from typing import Any, Callable, Optional, Union
+
+import PIL.Image
+
+import torch
+from torch.utils._pytree import tree_flatten, tree_unflatten
+
+from torchvision import transforms as _transforms, tv_tensors
+from torchvision.transforms.v2 import functional as F, Transform
+
+from ._utils import (
+    _parse_labels_getter,
+    _setup_number_or_seq,
+    _setup_size,
+    get_bounding_boxes,
+    get_keypoints,
+    has_any,
+    is_pure_tensor,
+)
+
+
+# TODO: do we want/need to expose this?
+class Identity(Transform):
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return inpt  # no-op: the input object is returned as is
+
+
+class Lambda(Transform):
+    """Apply a user-defined function as a transform.
+
+    This transform does not support torchscript.
+
+    Args:
+        lambd (function): Lambda/function to be used for transform.
+    """
+
+    _transformed_types = (object,)
+
+    def __init__(self, lambd: Callable[[Any], Any], *types: type):
+        super().__init__()
+        self.lambd = lambd
+        self.types = types or self._transformed_types
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        if isinstance(inpt, self.types):
+            return self.lambd(inpt)
+        else:
+            return inpt
+
+    def extra_repr(self) -> str:
+        extras = []
+        name = getattr(self.lambd, "__name__", None)
+        if name:
+            extras.append(name)
+        extras.append(f"types={[type.__name__ for type in self.types]}")
+        return ", ".join(extras)
+
+
+class LinearTransformation(Transform):
+    """Transform a tensor image or video with a square transformation matrix and a mean_vector computed offline.
+
+    This transform does not support PIL Image.
+    Given transformation_matrix and mean_vector, will flatten the torch.*Tensor and
+    subtract mean_vector from it which is then followed by computing the dot
+    product with the transformation matrix and then reshaping the tensor to its
+    original shape.
+
+    Applications:
+        whitening transformation: Suppose X is a column vector zero-centered data.
+        Then compute the data covariance matrix [D x D] with torch.mm(X.t(), X),
+        perform SVD on this matrix and pass it as transformation_matrix.
+
+    Args:
+        transformation_matrix (Tensor): tensor [D x D], D = C x H x W
+        mean_vector (Tensor): tensor [D], D = C x H x W
+    """
+
+    _v1_transform_cls = _transforms.LinearTransformation
+
+    _transformed_types = (is_pure_tensor, tv_tensors.Image, tv_tensors.Video)
+
+    def __init__(self, transformation_matrix: torch.Tensor, mean_vector: torch.Tensor):
+        super().__init__()
+        if transformation_matrix.size(0) != transformation_matrix.size(1):
+            raise ValueError(
+                "transformation_matrix should be square. Got "
+                f"{tuple(transformation_matrix.size())} rectangular matrix."
+            )
+
+        if mean_vector.size(0) != transformation_matrix.size(0):
+            raise ValueError(
+                f"mean_vector should have the same length {mean_vector.size(0)}"
+                f" as any one of the dimensions of the transformation_matrix [{tuple(transformation_matrix.size())}]"
+            )
+
+        if transformation_matrix.device != mean_vector.device:
+            raise ValueError(
+                f"Input tensors should be on the same device. Got {transformation_matrix.device} and {mean_vector.device}"
+            )
+
+        if transformation_matrix.dtype != mean_vector.dtype:
+            raise ValueError(
+                f"Input tensors should have the same dtype. Got {transformation_matrix.dtype} and {mean_vector.dtype}"
+            )
+
+        self.transformation_matrix = transformation_matrix
+        self.mean_vector = mean_vector
+
+    def check_inputs(self, sample: Any) -> Any:
+        if has_any(sample, PIL.Image.Image):
+            raise TypeError(f"{type(self).__name__}() does not support PIL images.")
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        shape = inpt.shape
+        n = shape[-3] * shape[-2] * shape[-1]  # product of the last three dimensions
+        if n != self.transformation_matrix.shape[0]:
+            raise ValueError(
+                "Input tensor and transformation matrix have incompatible shape."
+                + f"[{shape[-3]} x {shape[-2]} x {shape[-1]}] != "
+                + f"{self.transformation_matrix.shape[0]}"
+            )
+
+        if inpt.device.type != self.mean_vector.device.type:  # only the device *type* is compared
+            raise ValueError(
+                "Input tensor should be on the same device as transformation matrix and mean vector. "
+                f"Got {inpt.device} vs {self.mean_vector.device}"
+            )
+
+        flat_inpt = inpt.reshape(-1, n) - self.mean_vector  # one row of length n per leading index
+
+        transformation_matrix = self.transformation_matrix.to(flat_inpt.dtype)  # match dtypes for torch.mm
+        output = torch.mm(flat_inpt, transformation_matrix)
+        output = output.reshape(shape)  # restore the original input shape
+
+        if isinstance(inpt, (tv_tensors.Image, tv_tensors.Video)):
+            output = tv_tensors.wrap(output, like=inpt)
+        return output
+
+
+class Normalize(Transform):
+    """Normalize a tensor image or video with mean and standard deviation.
+
+    This transform does not support PIL Image.
+    Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n``
+    channels, this transform will normalize each channel of the input
+    ``torch.*Tensor`` i.e.,
+    ``output[channel] = (input[channel] - mean[channel]) / std[channel]``
+
+    .. note::
+        By default (``inplace=False``) this transform acts out of place, i.e., it does not mutate the input tensor.
+
+    Args:
+        mean (sequence): Sequence of means for each channel.
+        std (sequence): Sequence of standard deviations for each channel.
+        inplace (bool, optional): Bool to make this operation in-place. Default is ``False``.
+
+    """
+
+    _v1_transform_cls = _transforms.Normalize
+
+    def __init__(self, mean: Sequence[float], std: Sequence[float], inplace: bool = False):
+        super().__init__()
+        self.mean = list(mean)  # stored as a new list built from the given sequence
+        self.std = list(std)
+        self.inplace = inplace
+
+    def check_inputs(self, sample: Any) -> Any:
+        if has_any(sample, PIL.Image.Image):
+            raise TypeError(f"{type(self).__name__}() does not support PIL images.")
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(F.normalize, inpt, mean=self.mean, std=self.std, inplace=self.inplace)
+
+
+class GaussianBlur(Transform):
+    """Blurs image with randomly chosen Gaussian blur kernel.
+
+    The convolution will be using reflection padding corresponding to the kernel size, to maintain the input shape.
+
+    If the input is a Tensor, it is expected
+    to have [..., C, H, W] shape, where ... means an arbitrary number of leading dimensions.
+
+    Args:
+        kernel_size (int or sequence): Size of the Gaussian kernel. Each value must be odd and positive.
+        sigma (float or tuple of float (min, max)): Standard deviation to be used for
+            creating kernel to perform blurring. If float, sigma is fixed. If it is tuple
+            of float (min, max), sigma is chosen uniformly at random to lie in the
+            given range. Default is ``(0.1, 2.0)``.
+    """
+
+    _v1_transform_cls = _transforms.GaussianBlur
+
+    def __init__(
+        self, kernel_size: Union[int, Sequence[int]], sigma: Union[int, float, Sequence[float]] = (0.1, 2.0)
+    ) -> None:
+        super().__init__()
+        self.kernel_size = _setup_size(kernel_size, "Kernel size should be a tuple/list of two integers")
+        for ks in self.kernel_size:
+            if ks <= 0 or ks % 2 == 0:
+                raise ValueError("Kernel size value should be an odd and positive number.")
+
+        self.sigma = _setup_number_or_seq(sigma, "sigma")
+
+        if not 0.0 < self.sigma[0] <= self.sigma[1]:  # requires 0 < min <= max
+            raise ValueError(f"sigma values should be positive and of the form (min, max). Got {self.sigma}")
+
+    def make_params(self, flat_inputs: list[Any]) -> dict[str, Any]:
+        sigma = torch.empty(1).uniform_(self.sigma[0], self.sigma[1]).item()
+        return dict(sigma=[sigma, sigma])  # the single sampled value is used for both entries
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(F.gaussian_blur, inpt, self.kernel_size, **params)
+
+class GaussianNoise(Transform):
+    """Add gaussian noise to images or videos.
+
+    The input tensor is expected to be in [..., 1 or 3, H, W] format,
+    where ... means it can have an arbitrary number of leading dimensions.
+    Each image or frame in a batch will be transformed independently i.e. the
+    noise added to each image will be different.
+
+    The input tensor is also expected to be of float dtype in ``[0, 1]``,
+    or of ``uint8`` dtype in ``[0, 255]``. This transform does not support PIL
+    images.
+
+    Regardless of the dtype used, the parameters of the function use the same
+    scale, so a ``mean`` parameter of 0.5 will result in an average value
+    increase of 0.5 units for float images, and an average increase of 127.5
+    units for ``uint8`` images.
+
+    Args:
+        mean (float): Mean of the sampled normal distribution. Default is 0.
+        sigma (float): Standard deviation of the sampled normal distribution. Default is 0.1.
+        clip (bool, optional): Whether to clip the values after adding noise, be it to
+            ``[0, 1]`` for floats or to ``[0, 255]`` for ``uint8``. Setting this parameter to
+            ``False`` may cause unsigned integer overflows with uint8 inputs.
+            Default is True.
+    """
+
+    def __init__(self, mean: float = 0.0, sigma: float = 0.1, clip: bool = True) -> None:
+        super().__init__()
+        self.mean = mean
+        self.sigma = sigma
+        self.clip = clip
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(F.gaussian_noise, inpt, mean=self.mean, sigma=self.sigma, clip=self.clip)
+
+
+class ToDtype(Transform):
+    """Converts the input to a specific dtype, optionally scaling the values for images or videos.
+
+    .. note::
+        ``ToDtype(dtype, scale=True)`` is the recommended replacement for ``ConvertImageDtype(dtype)``.
+
+    Args:
+        dtype (``torch.dtype`` or dict of ``TVTensor`` -> ``torch.dtype``): The dtype to convert to.
+            If a ``torch.dtype`` is passed, e.g. ``torch.float32``, only images and videos will be converted
+            to that dtype: this is for compatibility with :class:`~torchvision.transforms.v2.ConvertImageDtype`.
+            A dict can be passed to specify per-tv_tensor conversions, e.g.
+            ``dtype={tv_tensors.Image: torch.float32, tv_tensors.Mask: torch.int64, "others":None}``. The "others"
+            key can be used as a catch-all for any other tv_tensor type, and ``None`` means no conversion.
+        scale (bool, optional): Whether to scale the values for images or videos. See :ref:`range_and_dtype`.
+            Default: ``False``.
+    """
+
+    _transformed_types = (torch.Tensor,)
+
+    def __init__(
+        self, dtype: Union[torch.dtype, dict[Union[type, str], Optional[torch.dtype]]], scale: bool = False
+    ) -> None:
+        super().__init__()
+
+        if not isinstance(dtype, (dict, torch.dtype)):
+            raise ValueError(f"dtype must be a dict or a torch.dtype, got {type(dtype)} instead")
+
+        if (
+            isinstance(dtype, dict)
+            and torch.Tensor in dtype
+            and any(cls in dtype for cls in [tv_tensors.Image, tv_tensors.Video])
+        ):
+            warnings.warn(
+                "Got `dtype` values for `torch.Tensor` and either `tv_tensors.Image` or `tv_tensors.Video`. "
+                "Note that a plain `torch.Tensor` will *not* be transformed by this (or any other transformation) "
+                "in case a `tv_tensors.Image` or `tv_tensors.Video` is present in the input."
+            )
+        self.dtype = dtype
+        self.scale = scale
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        if isinstance(self.dtype, torch.dtype):
+            # For consistency / BC with ConvertImageDtype, we only care about images or videos when dtype
+            # is a simple torch.dtype
+            if not is_pure_tensor(inpt) and not isinstance(inpt, (tv_tensors.Image, tv_tensors.Video)):
+                return inpt
+
+            dtype: Optional[torch.dtype] = self.dtype
+        elif type(inpt) in self.dtype:  # exact-type lookup: a subclass does not match its parent's key
+            dtype = self.dtype[type(inpt)]
+        elif "others" in self.dtype:  # catch-all entry for types without their own key
+            dtype = self.dtype["others"]
+        else:
+            raise ValueError(
+                f"No dtype was specified for type {type(inpt)}. "
+                "If you only need to convert the dtype of images or videos, you can just pass e.g. dtype=torch.float32. "
+                "If you're passing a dict as dtype, "
+                'you can use "others" as a catch-all key '
+                'e.g. dtype={tv_tensors.Mask: torch.int64, "others": None} to pass-through the rest of the inputs.'
+            )
+
+        supports_scaling = is_pure_tensor(inpt) or isinstance(inpt, (tv_tensors.Image, tv_tensors.Video))
+        if dtype is None:  # None means "no conversion": the input is returned unchanged
+            if self.scale and supports_scaling:
+                warnings.warn(
+                    "scale was set to True but no dtype was specified for images or videos: no scaling will be done."
+                )
+            return inpt
+
+        return self._call_kernel(F.to_dtype, inpt, dtype=dtype, scale=self.scale)
+
+
+class ConvertImageDtype(Transform):
+    """[DEPRECATED] Use ``v2.ToDtype(dtype, scale=True)`` instead.
+
+    Convert input image to the given ``dtype`` and scale the values accordingly.
+
+    .. warning::
+        Consider using ``ToDtype(dtype, scale=True)`` instead. See :class:`~torchvision.transforms.v2.ToDtype`.
+
+    This transform does not support PIL Image.
+
+    Args:
+        dtype (torch.dtype): Desired data type of the output. Default is ``torch.float32``.
+
+    .. note::
+
+        When converting from a smaller to a larger integer ``dtype`` the maximum values are **not** mapped exactly.
+        If converted back and forth, this mismatch has no effect.
+
+    Raises:
+        RuntimeError: When trying to cast :class:`torch.float32` to :class:`torch.int32` or :class:`torch.int64` as
+            well as for trying to cast :class:`torch.float64` to :class:`torch.int64`. These conversions might lead to
+            overflow errors since the floating point ``dtype`` cannot store consecutive integers over the whole range
+            of the integer ``dtype``.
+    """
+
+    _v1_transform_cls = _transforms.ConvertImageDtype
+
+    def __init__(self, dtype: torch.dtype = torch.float32) -> None:
+        super().__init__()
+        self.dtype = dtype
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(F.to_dtype, inpt, dtype=self.dtype, scale=True)  # scale=True is hard-coded
+
+
+class SanitizeBoundingBoxes(Transform):
+    """Remove degenerate/invalid bounding boxes and their corresponding labels and masks.
+
+    This transform removes bounding boxes and their associated labels/masks that:
+
+    - are below a given ``min_size`` or ``min_area``: by default this also removes degenerate boxes that have e.g. X2 <= X1.
+    - have any coordinate outside of their corresponding image. You may want to
+      call :class:`~torchvision.transforms.v2.ClampBoundingBoxes` first to avoid undesired removals.
+
+    It can also sanitize other tensors like the "iscrowd" or "area" properties from COCO
+    (see ``labels_getter`` parameter).
+
+    .. note::
+        **Mask handling**: This transform automatically detects and sanitizes per-instance masks
+        (shape ``[N, H, W]`` where N matches the number of bounding boxes). Semantic segmentation masks
+        (shape ``[H, W]``) or masks with mismatched dimensions are passed through unchanged.
+        You do not need to add masks to ``labels_getter`` for them to be sanitized.
+
+    It is recommended to call it at the end of a pipeline, before passing the
+    input to the models. It is critical to call this transform if
+    :class:`~torchvision.transforms.v2.RandomIoUCrop` was called.
+    If you want to be extra careful, you may call it after all transforms that
+    may modify bounding boxes but once at the end should be enough in most
+    cases.
+
+    Args:
+        min_size (float, optional): The size below which bounding boxes are removed. Default is 1.
+        min_area (float, optional): The area below which bounding boxes are removed. Default is 1.
+        labels_getter (callable or str or None, optional): indicates how to identify the labels in the input
+            (or anything else that needs to be sanitized along with the bounding boxes).
+            By default, this will try to find a "labels" key in the input (case-insensitive), if
+            the input is a dict or it is a tuple whose second element is a dict.
+            This heuristic should work well with a lot of datasets, including the built-in torchvision datasets.
+
+            It can also be a callable that takes the same input as the transform, and returns either:
+
+            - A single tensor (the labels)
+            - A tuple/list of tensors, each of which will be subject to the same sanitization as the bounding boxes.
+              This is useful to sanitize multiple tensors like the labels, and the "iscrowd" or "area" properties
+              from COCO.
+
+            If ``labels_getter`` is None then only bounding boxes are sanitized.
+    """
+
+    def __init__(
+        self,
+        min_size: float = 1.0,
+        min_area: float = 1.0,
+        labels_getter: Union[Callable[[Any], Any], str, None] = "default",
+    ) -> None:
+        super().__init__()
+
+        if min_size < 1:
+            raise ValueError(f"min_size must be >= 1, got {min_size}.")
+        self.min_size = min_size
+
+        if min_area < 1:
+            raise ValueError(f"min_area must be >= 1, got {min_area}.")
+        self.min_area = min_area
+
+        self.labels_getter = labels_getter
+        self._labels_getter = _parse_labels_getter(labels_getter)
+
+    def forward(self, *inputs: Any) -> Any:
+        inputs = inputs if len(inputs) > 1 else inputs[0]
+
+        labels = self._labels_getter(inputs)
+        if labels is not None:
+            msg = "The labels in the input to forward() must be a tensor or None, got {type} instead."
+            if isinstance(labels, torch.Tensor):
+                labels = (labels,)
+            elif isinstance(labels, (tuple, list)):
+                for entry in labels:
+                    if not isinstance(entry, torch.Tensor):
+                        # TODO: we don't need to enforce tensors, just that entries are indexable as t[bool_mask]
+                        raise ValueError(msg.format(type=type(entry)))
+            else:
+                raise ValueError(msg.format(type=type(labels)))
+
+        flat_inputs, spec = tree_flatten(inputs)
+        boxes = get_bounding_boxes(flat_inputs)
+
+        if labels is not None:
+            for label in labels:
+                if boxes.shape[0] != label.shape[0]:
+                    raise ValueError(
+                        f"Number of boxes (shape={boxes.shape}) and must match the number of labels."
+                        f"Found labels with shape={label.shape})."
+                    )
+
+        valid = F._misc._get_sanitize_bounding_boxes_mask(
+            boxes,
+            format=boxes.format,
+            canvas_size=boxes.canvas_size,
+            min_size=self.min_size,
+            min_area=self.min_area,
+        )
+
+        params = dict(valid=valid, labels=labels)
+        flat_outputs = [self.transform(inpt, params) for inpt in flat_inputs]
+
+        return tree_unflatten(flat_outputs, spec)
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        is_label = params["labels"] is not None and any(inpt is label for label in params["labels"])
+        is_bounding_boxes = isinstance(inpt, tv_tensors.BoundingBoxes)
+        is_mask = isinstance(inpt, tv_tensors.Mask)
+
+        if not (is_label or is_bounding_boxes or is_mask):
+            return inpt
+
+        try:
+            output = inpt[params["valid"]]
+        except (IndexError):
+            # If indexing fails (e.g., shape mismatch), pass through unchanged
+            return inpt
+
+        if is_label:
+            return output
+        else:
+            return tv_tensors.wrap(output, like=inpt)
+
+
+class SanitizeKeyPoints(Transform):
+    """Remove keypoints outside of the image area and their corresponding labels (if any).
+
+    This transform removes keypoints or groups of keypoints and their associated labels that
+    have coordinates outside of their corresponding image.
+    If you would instead like to clamp such keypoints to the image edges, use
+    :class:`~torchvision.transforms.v2.ClampKeyPoints`.
+
+    It is recommended to call it at the end of a pipeline, before passing the
+    input to the models.
+
+    Keypoints can be passed as a set of individual keypoints or as a set of objects
+    (e.g., polygons or polygonal chains) consisting of a fixed number of keypoints of shape ``[..., 2]``.
+    When groups of keypoints are passed (i.e., an at least 3-dimensional tensor), this transform
+    will only remove entire groups, not individual keypoints within a group.
+
+    Args:
+        labels_getter (callable or str or None, optional): indicates how to identify the labels in the input
+            (or anything else that needs to be sanitized along with the keypoints).
+            If set to the string ``"default"``, this will try to find a "labels" key in the input (case-insensitive), if
+            the input is a dict or it is a tuple whose second element is a dict.
+
+            It can also be a callable that takes the same input as the transform, and returns either:
+
+            - A single tensor (the labels)
+            - A tuple/list of tensors, each of which will be subject to the same sanitization as the keypoints.
+
+            If ``labels_getter`` is None (the default), then only keypoints are sanitized.
+    """
+
+    def __init__(
+        self,
+        labels_getter: Union[Callable[[Any], Any], str, None] = None,
+    ) -> None:
+        super().__init__()
+        self.labels_getter = labels_getter
+        self._labels_getter = _parse_labels_getter(labels_getter)
+
+    def forward(self, *inputs: Any) -> Any:
+        inputs = inputs if len(inputs) > 1 else inputs[0]
+
+        labels = self._labels_getter(inputs)
+        if labels is not None:
+            msg = "The labels in the input to forward() must be a tensor or None, got {type} instead."
+            if isinstance(labels, torch.Tensor):
+                labels = (labels,)
+            elif isinstance(labels, (tuple, list)):
+                for entry in labels:
+                    if not isinstance(entry, torch.Tensor):
+                        # TODO: we don't need to enforce tensors, just that entries are indexable as t[bool_mask]
+                        raise ValueError(msg.format(type=type(entry)))
+            else:
+                raise ValueError(msg.format(type=type(labels)))
+
+        flat_inputs, spec = tree_flatten(inputs)
+        points = get_keypoints(flat_inputs)
+
+        if labels is not None:
+            for label in labels:
+                if points.shape[0] != label.shape[0]:
+                    raise ValueError(
+                        f"Number of kepyoints (shape={points.shape}) must match the number of labels."
+                        f"Found labels with shape={label.shape})."
+                    )
+
+        valid = F._misc._get_sanitize_keypoints_mask(
+            points,
+            canvas_size=points.canvas_size,
+        )
+
+        params = dict(valid=valid, labels=labels)
+        flat_outputs = [self.transform(inpt, params) for inpt in flat_inputs]
+
+        return tree_unflatten(flat_outputs, spec)
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        is_label = params["labels"] is not None and any(inpt is label for label in params["labels"])
+        is_keypoints = isinstance(inpt, tv_tensors.KeyPoints)
+
+        if not (is_label or is_keypoints):
+            return inpt
+
+        output = inpt[params["valid"]]
+
+        if is_label:
+            return output
+        else:
+            return tv_tensors.wrap(output, like=inpt)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_temporal.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_temporal.py
new file mode 100644
index 0000000000000000000000000000000000000000..0642a741e35ae8bb2a3f2b825b7b921fd9548dad
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_temporal.py
@@ -0,0 +1,26 @@
+from typing import Any
+
+import torch
+from torchvision.transforms.v2 import functional as F, Transform
+
+
+class UniformTemporalSubsample(Transform):
+    """Uniformly subsample ``num_samples`` indices from the temporal dimension of the video.
+
+    Videos are expected to be of shape ``[..., T, C, H, W]`` where ``T`` denotes the temporal dimension.
+
+    When ``num_samples`` is larger than the size of temporal dimension of the video, it
+    will sample frames based on nearest neighbor interpolation.
+
+    Args:
+        num_samples (int): The number of equispaced samples to be selected
+    """
+
+    _transformed_types = (torch.Tensor,)
+
+    def __init__(self, num_samples: int):
+        super().__init__()
+        self.num_samples = num_samples
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        return self._call_kernel(F.uniform_temporal_subsample, inpt, self.num_samples)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_transform.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac84fcb6c826d4d6473fd8441965089cf80ca920
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_transform.py
@@ -0,0 +1,194 @@
+from __future__ import annotations
+
+import enum
+from typing import Any, Callable
+
+import PIL.Image
+import torch
+from torch import nn
+from torch.utils._pytree import tree_flatten, tree_unflatten
+from torchvision import tv_tensors
+from torchvision.transforms.v2._utils import check_type, has_any, is_pure_tensor
+from torchvision.utils import _log_api_usage_once
+
+from .functional._utils import _get_kernel
+
+
+class Transform(nn.Module):
+    """Base class to implement your own v2 transforms.
+
+    See  :ref:`sphx_glr_auto_examples_transforms_plot_custom_transforms.py` for
+    more details.
+    """
+
+    # Class attribute defining transformed types. Other types are passed-through without any transformation
+    # We support both Types and callables that are able to do further checks on the type of the input.
+    _transformed_types: tuple[type | Callable[[Any], bool], ...] = (torch.Tensor, PIL.Image.Image)
+
+    def __init__(self) -> None:
+        super().__init__()
+        _log_api_usage_once(self)
+
+    def check_inputs(self, flat_inputs: list[Any]) -> None:
+        pass
+
+    # When v2 was introduced, this method was private and called
+    # `_get_params()`. Now it's publicly exposed as `make_params()`. It cannot
+    # be exposed as `get_params()` because there is already a `get_params()`
+    # method for v2 transforms: it's v1's `get_params()`, which we have to
+    # keep in order to guarantee 100% BC with v1. (It's defined in
+    # `__init_subclass__` below.)
+    def make_params(self, flat_inputs: list[Any]) -> dict[str, Any]:
+        """Method to override for custom transforms.
+
+        See :ref:`sphx_glr_auto_examples_transforms_plot_custom_transforms.py`"""
+        return dict()
+
+    def _call_kernel(self, functional: Callable, inpt: Any, *args: Any, **kwargs: Any) -> Any:
+        kernel = _get_kernel(functional, type(inpt), allow_passthrough=True)
+        return kernel(inpt, *args, **kwargs)
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        """Method to override for custom transforms.
+
+        See :ref:`sphx_glr_auto_examples_transforms_plot_custom_transforms.py`"""
+        raise NotImplementedError
+
+    def forward(self, *inputs: Any) -> Any:
+        """Do not override this! Use ``transform()`` instead."""
+        flat_inputs, spec = tree_flatten(inputs if len(inputs) > 1 else inputs[0])
+
+        self.check_inputs(flat_inputs)
+
+        needs_transform_list = self._needs_transform_list(flat_inputs)
+        params = self.make_params(
+            [inpt for (inpt, needs_transform) in zip(flat_inputs, needs_transform_list) if needs_transform]
+        )
+
+        flat_outputs = [
+            self.transform(inpt, params) if needs_transform else inpt
+            for (inpt, needs_transform) in zip(flat_inputs, needs_transform_list)
+        ]
+
+        return tree_unflatten(flat_outputs, spec)
+
+    def _needs_transform_list(self, flat_inputs: list[Any]) -> list[bool]:
+        # Below is a heuristic on how to deal with pure tensor inputs:
+        # 1. Pure tensors, i.e. tensors that are not a tv_tensor, are passed through if there is an explicit image
+        #    (`tv_tensors.Image` or `PIL.Image.Image`) or video (`tv_tensors.Video`) in the sample.
+        # 2. If there is no explicit image or video in the sample, only the first encountered pure tensor is
+        #    transformed as image, while the rest is passed through. The order is defined by the returned `flat_inputs`
+        #    of `tree_flatten`, which recurses depth-first through the input.
+        #
+        # This heuristic stems from two requirements:
+        # 1. We need to keep BC for single input pure tensors and treat them as images.
+        # 2. We don't want to treat all pure tensors as images, because some datasets like `CelebA` or `Widerface`
+        #    return supplemental numerical data as tensors that cannot be transformed as images.
+        #
+        # The heuristic should work well for most people in practice. The only case where it doesn't is if someone
+        # tries to transform multiple pure tensors at the same time, expecting them all to be treated as images.
+        # However, this case wasn't supported by transforms v1 either, so there is no BC concern.
+
+        needs_transform_list = []
+        transform_pure_tensor = not has_any(flat_inputs, tv_tensors.Image, tv_tensors.Video, PIL.Image.Image)
+        for inpt in flat_inputs:
+            needs_transform = True
+
+            if not check_type(inpt, self._transformed_types):
+                needs_transform = False
+            elif is_pure_tensor(inpt):
+                if transform_pure_tensor:
+                    transform_pure_tensor = False
+                else:
+                    needs_transform = False
+            needs_transform_list.append(needs_transform)
+        return needs_transform_list
+
+    def extra_repr(self) -> str:
+        extra = []
+        for name, value in self.__dict__.items():
+            if name.startswith("_") or name == "training":
+                continue
+
+            if not isinstance(value, (bool, int, float, str, tuple, list, enum.Enum)):
+                continue
+
+            extra.append(f"{name}={value}")
+
+        return ", ".join(extra)
+
+    # This attribute should be set on all transforms that have a v1 equivalent. Doing so enables two things:
+    # 1. In case the v1 transform has a static `get_params` method, it will also be available under the same name on
+    #    the v2 transform. See `__init_subclass__` for details.
+    # 2. The v2 transform will be JIT scriptable. See `_extract_params_for_v1_transform` and `__prepare_scriptable__`
+    #    for details.
+    _v1_transform_cls: type[nn.Module] | None = None
+
+    def __init_subclass__(cls) -> None:
+        # Since `get_params` is a `@staticmethod`, we have to bind it to the class itself rather than to an instance.
+        # This method is called after subclassing has happened, i.e. `cls` is the subclass, e.g. `Resize`.
+        if cls._v1_transform_cls is not None and hasattr(cls._v1_transform_cls, "get_params"):
+            cls.get_params = staticmethod(cls._v1_transform_cls.get_params)  # type: ignore[attr-defined]
+
+    def _extract_params_for_v1_transform(self) -> dict[str, Any]:
+        # This method is called by `__prepare_scriptable__` to instantiate the equivalent v1 transform from the current
+        # v2 transform instance. It extracts all available public attributes that are specific to that transform and
+        # not `nn.Module` in general.
+        # Override this method on the v2 transform class if the above is not sufficient. For example, this might happen
+        # if the v2 transform introduced new parameters that are not supported by the v1 transform.
+        common_attrs = nn.Module().__dict__.keys()
+        return {
+            attr: value
+            for attr, value in self.__dict__.items()
+            if not attr.startswith("_") and attr not in common_attrs
+        }
+
+    def __prepare_scriptable__(self) -> nn.Module:
+        # This method is called early on when `torch.jit.script`'ing an `nn.Module` instance. If it succeeds, the return
+        # value is used for scripting over the original object that should have been scripted. Since the v1 transforms
+        # are JIT scriptable, and we made sure that for single image inputs v1 and v2 are equivalent, we just return the
+        # equivalent v1 transform here. This of course only makes transforms v2 JIT scriptable as long as transforms v1
+        # is around.
+        if self._v1_transform_cls is None:
+            raise RuntimeError(
+                f"Transform {type(self).__name__} cannot be JIT scripted. "
+                "torchscript is only supported for backward compatibility with transforms "
+                "which are already in torchvision.transforms. "
+                "For torchscript support (on tensors only), you can use the functional API instead."
+            )
+
+        return self._v1_transform_cls(**self._extract_params_for_v1_transform())
+
+
+class _RandomApplyTransform(Transform):
+    def __init__(self, p: float = 0.5) -> None:
+        if not (0.0 <= p <= 1.0):
+            raise ValueError("`p` should be a floating point value in the interval [0.0, 1.0].")
+
+        super().__init__()
+        self.p = p
+
+    def forward(self, *inputs: Any) -> Any:
+        # We need to almost duplicate `Transform.forward()` here since we always want to check the inputs, but return
+        # early afterwards in case the random check triggers. The same result could be achieved by calling
+        # `super().forward()` after the random check, but that would call `self.check_inputs` twice.
+
+        inputs = inputs if len(inputs) > 1 else inputs[0]
+        flat_inputs, spec = tree_flatten(inputs)
+
+        self.check_inputs(flat_inputs)
+
+        if torch.rand(1) >= self.p:
+            return inputs
+
+        needs_transform_list = self._needs_transform_list(flat_inputs)
+        params = self.make_params(
+            [inpt for (inpt, needs_transform) in zip(flat_inputs, needs_transform_list) if needs_transform]
+        )
+
+        flat_outputs = [
+            self.transform(inpt, params) if needs_transform else inpt
+            for (inpt, needs_transform) in zip(flat_inputs, needs_transform_list)
+        ]
+
+        return tree_unflatten(flat_outputs, spec)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_type_conversion.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_type_conversion.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cac62868b9b5331e4760e56dde284fa40929d14
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_type_conversion.py
@@ -0,0 +1,92 @@
+from typing import Any, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+
+from torchvision import tv_tensors
+from torchvision.transforms.v2 import functional as F, Transform
+
+from torchvision.transforms.v2._utils import is_pure_tensor
+
+
+class PILToTensor(Transform):
+    """Convert a PIL Image to a tensor of the same type - this does not scale values.
+
+    This transform does not support torchscript.
+
+    Convert a PIL Image with H height, W width, and C channels to a Tensor of shape (C x H x W).
+
+    Example:
+        >>> from PIL import Image
+        >>> from torchvision.transforms import v2
+        >>> img = Image.new("RGB", (320, 240))  # size (W=320, H=240)
+        >>> tensor = v2.PILToTensor()(img)
+        >>> print(tensor.shape)
+        torch.Size([3, 240, 320])
+    """
+
+    _transformed_types = (PIL.Image.Image,)
+
+    def transform(self, inpt: PIL.Image.Image, params: dict[str, Any]) -> torch.Tensor:
+        return F.pil_to_tensor(inpt)
+
+
+class ToImage(Transform):
+    """Convert a tensor, ndarray, or PIL Image to :class:`~torchvision.tv_tensors.Image`
+    ; this does not scale values.
+
+    This transform does not support torchscript.
+    """
+
+    _transformed_types = (is_pure_tensor, PIL.Image.Image, np.ndarray)
+
+    def transform(
+        self, inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray], params: dict[str, Any]
+    ) -> tv_tensors.Image:
+        return F.to_image(inpt)
+
+
+class ToPILImage(Transform):
+    """Convert a tensor or an ndarray to PIL Image
+
+    This transform does not support torchscript.
+
+    Converts a torch.*Tensor of shape C x H x W or a numpy ndarray of shape
+    H x W x C to a PIL Image while adjusting the value range depending on the ``mode``.
+
+    Args:
+        mode (`PIL.Image mode`_): color space and pixel depth of input data (optional).
+            If ``mode`` is ``None`` (default) there are some assumptions made about the input data:
+
+            - If the input has 4 channels, the ``mode`` is assumed to be ``RGBA``.
+            - If the input has 3 channels, the ``mode`` is assumed to be ``RGB``.
+            - If the input has 2 channels, the ``mode`` is assumed to be ``LA``.
+            - If the input has 1 channel, the ``mode`` is determined by the data type (i.e ``int``, ``float``,
+              ``short``).
+
+    .. _PIL.Image mode: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#concept-modes
+    """
+
+    _transformed_types = (is_pure_tensor, tv_tensors.Image, np.ndarray)
+
+    def __init__(self, mode: Optional[str] = None) -> None:
+        super().__init__()
+        self.mode = mode
+
+    def transform(
+        self, inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray], params: dict[str, Any]
+    ) -> PIL.Image.Image:
+        return F.to_pil_image(inpt, mode=self.mode)
+
+
+class ToPureTensor(Transform):
+    """Convert all TVTensors to pure tensors, removing associated metadata (if any).
+
+    This doesn't scale or change the values, only the type.
+    """
+
+    _transformed_types = (tv_tensors.TVTensor,)
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> torch.Tensor:
+        return inpt.as_subclass(torch.Tensor)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_utils.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb6051b4e61c02c004c01e6610f8a5f584046e87
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/_utils.py
@@ -0,0 +1,242 @@
+from __future__ import annotations
+
+import collections.abc
+import numbers
+from collections.abc import Sequence
+from contextlib import suppress
+
+from typing import Any, Callable, Literal
+
+import PIL.Image
+import torch
+
+from torchvision import tv_tensors
+
+from torchvision._utils import sequence_to_str
+
+from torchvision.transforms.transforms import _check_sequence_input, _setup_angle, _setup_size  # noqa: F401
+from torchvision.transforms.v2.functional import get_dimensions, get_size, is_pure_tensor
+from torchvision.transforms.v2.functional._utils import _FillType, _FillTypeJIT
+
+
+def _setup_number_or_seq(arg: int | float | Sequence[int | float], name: str) -> Sequence[float]:
+    # Normalize a number, or a sequence of one or two numbers, into a two-element list of floats.
+    # ``name`` is only used to label the argument in error messages.
+    if isinstance(arg, (int, float)):
+        # A scalar is duplicated into both slots.
+        return [float(arg), float(arg)]
+
+    if not isinstance(arg, Sequence):
+        raise TypeError(f"{name} should be a number or a sequence of numbers. Got {type(arg)}")
+    if len(arg) not in (1, 2):
+        raise ValueError(f"If {name} is a sequence its length should be 1 or 2. Got {len(arg)}")
+
+    for item in arg:
+        if not isinstance(item, (int, float)):
+            raise ValueError(f"{name} should be a sequence of numbers. Got {type(item)}")
+
+    # A single-element sequence repeats its only entry; a pair is kept in order.
+    return [float(arg[0]), float(arg[0] if len(arg) == 1 else arg[1])]
+
+
+def _check_fill_arg(fill: _FillType | dict[type | str, _FillType]) -> None:
+    if isinstance(fill, dict):
+        for value in fill.values():
+            _check_fill_arg(value)
+    else:
+        if fill is not None and not isinstance(fill, (numbers.Number, tuple, list)):
+            raise TypeError("Got inappropriate fill arg, only Numbers, tuples, lists and dicts are allowed.")
+
+
+def _convert_fill_arg(fill: _FillType) -> _FillTypeJIT:
+    # Fill = 0 is not equivalent to None, https://github.com/pytorch/vision/issues/6517
+    # So, we can't reassign fill to 0
+    # if fill is None:
+    #     fill = 0
+    if fill is None:
+        return fill
+
+    if not isinstance(fill, (int, float)):
+        fill = [float(v) for v in list(fill)]
+    return fill  # type: ignore[return-value]
+
+
+def _setup_fill_arg(fill: _FillType | dict[type | str, _FillType]) -> dict[type | str, _FillTypeJIT]:
+    # Validate ``fill`` and normalize it into a dict that maps a type (or "others") to a converted fill value.
+    _check_fill_arg(fill)
+
+    if not isinstance(fill, dict):
+        # A single fill value is stored under the catch-all "others" key.
+        return {"others": _convert_fill_arg(fill)}
+    # Build a new dict rather than writing the converted values back into the caller's mapping.
+    return {key: _convert_fill_arg(value) for key, value in fill.items()}
+
+
+def _get_fill(fill_dict, inpt_type):
+    # Look up the fill for ``inpt_type``; fall back to the "others" entry, and to None when neither exists.
+    if inpt_type in fill_dict:
+        return fill_dict[inpt_type]
+    # A user-supplied dict has no guaranteed "others" key (see ``_setup_fill_arg``), so a miss is reachable.
+    # The previous code built a RuntimeError here without raising it; the effective result was always None.
+    return fill_dict.get("others")
+
+
+def _check_padding_arg(padding: int | Sequence[int]) -> None:
+    # Raise ValueError unless ``padding`` is an int or a tuple/list of 1, 2 or 4 ints.
+    err_msg = f"Padding must be an int or a 1, 2, or 4 element of tuple or list, got {padding}."
+    if isinstance(padding, (tuple, list)):
+        if len(padding) not in [1, 2, 4] or not all(isinstance(p, int) for p in padding):
+            raise ValueError(err_msg)
+    elif not isinstance(padding, int):
+        raise ValueError(err_msg)
+
+
+# TODO: let's use torchvision._utils.StrEnum to have the best of both worlds (strings and enums)
+# https://github.com/pytorch/vision/issues/6250
+def _check_padding_mode_arg(padding_mode: Literal["constant", "edge", "reflect", "symmetric"]) -> None:
+    if padding_mode not in ["constant", "edge", "reflect", "symmetric"]:
+        raise ValueError("Padding mode should be either constant, edge, reflect or symmetric")
+
+
+def _find_labels_default_heuristic(inputs: Any) -> torch.Tensor:
+    """
+    This heuristic covers three cases:
+
+    1. The input is a tuple or list whose second item is a labels tensor. This happens for already batched
+       classification inputs for MixUp and CutMix (typically after the DataLoader).
+    2. The input is a tuple or list whose second item is a dictionary that contains the labels tensor
+       under a label-like (see below) key. This happens for the inputs of detection models.
+    3. The input is a dictionary that is structured as the one from 2.
+
+    What is a "label-like" key? We first search for a case-insensitive match of 'labels' inside the keys of the
+    dictionary. This is the name our detection models expect. If we can't find that, we look for a case-insensitive
+    match of the term 'label' anywhere inside the key, e.g. 'FooLaBeLBar'. If we can't find that either, the dictionary
+    contains no "label-like" key.
+    """
+
+    if isinstance(inputs, (tuple, list)):
+        inputs = inputs[1]
+
+    # MixUp, CutMix
+    if is_pure_tensor(inputs):
+        return inputs
+
+    if not isinstance(inputs, collections.abc.Mapping):
+        raise ValueError(
+            f"When using the default labels_getter, the input passed to forward must be a dictionary or a two-tuple "
+            f"whose second item is a dictionary or a tensor, but got {inputs} instead."
+        )
+
+    candidate_key = None
+    with suppress(StopIteration):
+        candidate_key = next(key for key in inputs.keys() if key.lower() == "labels")
+    if candidate_key is None:
+        with suppress(StopIteration):
+            candidate_key = next(key for key in inputs.keys() if "label" in key.lower())
+    if candidate_key is None:
+        raise ValueError(
+            "Could not infer where the labels are in the sample. Try passing a callable as the labels_getter parameter? "
+            "If there are no labels in the sample by design, pass labels_getter=None."
+        )
+
+    return inputs[candidate_key]
+
+
+def _parse_labels_getter(labels_getter: str | Callable[[Any], Any] | None) -> Callable[[Any], Any]:
+    if labels_getter == "default":
+        return _find_labels_default_heuristic
+    elif callable(labels_getter):
+        return labels_getter
+    elif labels_getter is None:
+        return lambda _: None
+    else:
+        raise ValueError(f"labels_getter should either be 'default', a callable, or None, but got {labels_getter}.")
+
+
+def get_bounding_boxes(flat_inputs: list[Any]) -> tv_tensors.BoundingBoxes:
+    """Return the Bounding Boxes in the input.
+
+    Assumes only one ``BoundingBoxes`` object is present.
+    """
+    # This assumes there is only one bbox per sample as per the general convention
+    try:
+        return next(inpt for inpt in flat_inputs if isinstance(inpt, tv_tensors.BoundingBoxes))
+    except StopIteration:
+        raise ValueError("No bounding boxes were found in the sample")
+
+
+def get_keypoints(flat_inputs: list[Any]) -> tv_tensors.KeyPoints:
+    """Return the keypoints in the input.
+
+    Assumes only one ``KeyPoints`` object is present.
+    """
+    # This assumes there is only one keypoint per sample as per the general convention
+    try:
+        return next(inpt for inpt in flat_inputs if isinstance(inpt, tv_tensors.KeyPoints))
+    except StopIteration:
+        raise ValueError("No keypoints were found in the sample")
+
+
+def query_chw(flat_inputs: list[Any]) -> tuple[int, int, int]:
+    """Return Channel, Height, and Width."""
+    chws = {
+        tuple(get_dimensions(inpt))
+        for inpt in flat_inputs
+        if check_type(inpt, (is_pure_tensor, tv_tensors.Image, PIL.Image.Image, tv_tensors.Video))
+    }
+    if not chws:
+        raise TypeError("No image or video was found in the sample")
+    elif len(chws) > 1:
+        raise ValueError(f"Found multiple CxHxW dimensions in the sample: {sequence_to_str(sorted(chws))}")
+    c, h, w = chws.pop()
+    return c, h, w
+
+
+def query_size(flat_inputs: list[Any]) -> tuple[int, int]:
+    """Return Height and Width."""
+    sizes = {
+        tuple(get_size(inpt))
+        for inpt in flat_inputs
+        if check_type(
+            inpt,
+            (
+                is_pure_tensor,
+                tv_tensors.Image,
+                PIL.Image.Image,
+                tv_tensors.Video,
+                tv_tensors.Mask,
+                tv_tensors.BoundingBoxes,
+                tv_tensors.KeyPoints,
+            ),
+        )
+    }
+    if not sizes:
+        raise TypeError("No image, video, mask, bounding box or keypoint was found in the sample")
+    elif len(sizes) > 1:
+        raise ValueError(f"Found multiple HxW dimensions in the sample: {sequence_to_str(sorted(sizes))}")
+    h, w = sizes.pop()
+    return h, w
+
+
+def check_type(obj: Any, types_or_checks: tuple[type | Callable[[Any], bool], ...]) -> bool:
+    # True if ``obj`` is an instance of any listed type or makes any listed predicate return a truthy value.
+    return any(
+        isinstance(obj, check) if isinstance(check, type) else check(obj) for check in types_or_checks
+    )
+
+
+def has_any(flat_inputs: list[Any], *types_or_checks: type | Callable[[Any], bool]) -> bool:
+    # True as soon as one entry of ``flat_inputs`` satisfies ``check_type`` for the given types/predicates.
+    # An empty ``flat_inputs`` yields False.
+    found = any(check_type(inpt, types_or_checks) for inpt in flat_inputs)
+    return found
+
+
+def has_all(flat_inputs: list[Any], *types_or_checks: type | Callable[[Any], bool]) -> bool:
+    # True when every given type/predicate is matched by at least one entry of ``flat_inputs``.
+    # With no types or checks given the loop body never runs and the result is True.
+    for type_or_check in types_or_checks:
+        is_type = isinstance(type_or_check, type)
+        if not any(isinstance(inpt, type_or_check) if is_type else type_or_check(inpt) for inpt in flat_inputs):
+            return False
+    return True
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..13fbaa588fea9bf99857a5409136efeb486d19cb
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__init__.py
@@ -0,0 +1,167 @@
+from torchvision.transforms import InterpolationMode  # usort: skip
+
+from ._utils import is_pure_tensor, register_kernel  # usort: skip
+
+from ._meta import (
+    clamp_bounding_boxes,
+    clamp_keypoints,
+    convert_bounding_box_format,
+    get_dimensions_image,
+    get_dimensions_video,
+    get_dimensions,
+    get_num_frames_video,
+    get_num_frames,
+    get_image_num_channels,
+    get_num_channels_image,
+    get_num_channels_video,
+    get_num_channels,
+    get_size_bounding_boxes,
+    get_size_keypoints,
+    get_size_image,
+    get_size_mask,
+    get_size_video,
+    get_size,
+)  # usort: skip
+
+from ._augment import erase, erase_image, erase_video, jpeg, jpeg_image, jpeg_video
+from ._color import (
+    adjust_brightness,
+    adjust_brightness_image,
+    adjust_brightness_video,
+    adjust_contrast,
+    adjust_contrast_image,
+    adjust_contrast_video,
+    adjust_gamma,
+    adjust_gamma_image,
+    adjust_gamma_video,
+    adjust_hue,
+    adjust_hue_image,
+    adjust_hue_video,
+    adjust_saturation,
+    adjust_saturation_image,
+    adjust_saturation_video,
+    adjust_sharpness,
+    adjust_sharpness_image,
+    adjust_sharpness_video,
+    autocontrast,
+    autocontrast_image,
+    autocontrast_video,
+    equalize,
+    equalize_image,
+    equalize_video,
+    grayscale_to_rgb,
+    grayscale_to_rgb_image,
+    invert,
+    invert_image,
+    invert_video,
+    permute_channels,
+    permute_channels_image,
+    permute_channels_video,
+    posterize,
+    posterize_image,
+    posterize_video,
+    rgb_to_grayscale,
+    rgb_to_grayscale_image,
+    solarize,
+    solarize_image,
+    solarize_video,
+    to_grayscale,
+)
+from ._geometry import (
+    affine,
+    affine_bounding_boxes,
+    affine_image,
+    affine_keypoints,
+    affine_mask,
+    affine_video,
+    center_crop,
+    center_crop_bounding_boxes,
+    center_crop_image,
+    center_crop_keypoints,
+    center_crop_mask,
+    center_crop_video,
+    crop,
+    crop_bounding_boxes,
+    crop_image,
+    crop_keypoints,
+    crop_mask,
+    crop_video,
+    elastic,
+    elastic_bounding_boxes,
+    elastic_image,
+    elastic_keypoints,
+    elastic_mask,
+    elastic_transform,
+    elastic_video,
+    five_crop,
+    five_crop_image,
+    five_crop_video,
+    hflip,  # TODO: Consider moving all pure alias definitions at the bottom of the file
+    horizontal_flip,
+    horizontal_flip_bounding_boxes,
+    horizontal_flip_image,
+    horizontal_flip_keypoints,
+    horizontal_flip_mask,
+    horizontal_flip_video,
+    pad,
+    pad_bounding_boxes,
+    pad_image,
+    pad_keypoints,
+    pad_mask,
+    pad_video,
+    perspective,
+    perspective_bounding_boxes,
+    perspective_image,
+    perspective_keypoints,
+    perspective_mask,
+    perspective_video,
+    resize,
+    resize_bounding_boxes,
+    resize_image,
+    resize_keypoints,
+    resize_mask,
+    resize_video,
+    resized_crop,
+    resized_crop_bounding_boxes,
+    resized_crop_image,
+    resized_crop_keypoints,
+    resized_crop_mask,
+    resized_crop_video,
+    rotate,
+    rotate_bounding_boxes,
+    rotate_image,
+    rotate_keypoints,
+    rotate_mask,
+    rotate_video,
+    ten_crop,
+    ten_crop_image,
+    ten_crop_video,
+    vertical_flip,
+    vertical_flip_bounding_boxes,
+    vertical_flip_image,
+    vertical_flip_keypoints,
+    vertical_flip_mask,
+    vertical_flip_video,
+    vflip,
+)
+from ._misc import (
+    convert_image_dtype,
+    gaussian_blur,
+    gaussian_blur_image,
+    gaussian_blur_video,
+    gaussian_noise,
+    gaussian_noise_image,
+    gaussian_noise_video,
+    normalize,
+    normalize_image,
+    normalize_video,
+    sanitize_bounding_boxes,
+    sanitize_keypoints,
+    to_dtype,
+    to_dtype_image,
+    to_dtype_video,
+)
+from ._temporal import uniform_temporal_subsample, uniform_temporal_subsample_video
+from ._type_conversion import pil_to_tensor, to_image, to_pil_image
+
+from ._deprecated import get_image_size, to_tensor  # usort: skip
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e49f1250445f4dc7bb8f949c65549e83f1258065
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_augment.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_augment.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd59b53392ff00df8cfd526efbeb4c16e38d76d9
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_augment.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_color.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_color.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..395ca880b88df0c62bf213390d8afc0b08b0fa67
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_color.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_deprecated.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_deprecated.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b4f0d54bb1349a4a56cd0cb4173133420b129166
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_deprecated.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_meta.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_meta.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..546dbe1bfc088700f39516a4b9a50374c65c9659
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_meta.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_misc.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_misc.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9fae759bd79c63523f6b6f80661d16d64ae5abc0
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_misc.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_temporal.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_temporal.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..db614e2d60fcac9f1719c56c0fa920a831a85295
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_temporal.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_type_conversion.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_type_conversion.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b55705dc766e0936faaedeb6eb45cd7d87b86d01
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_type_conversion.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b3f2298c338d9c99bc09605c24b9271144d75618
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/__pycache__/_utils.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_augment.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_augment.py
new file mode 100644
index 0000000000000000000000000000000000000000..a904d8d7cbdfeb78588abbf43c8bca37b3431735
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_augment.py
@@ -0,0 +1,106 @@
+import io
+
+import PIL.Image
+
+import torch
+from torchvision import tv_tensors
+from torchvision.io import decode_jpeg, encode_jpeg
+from torchvision.transforms.functional import pil_to_tensor, to_pil_image
+from torchvision.utils import _log_api_usage_once
+
+from ._utils import _get_kernel, _register_kernel_internal
+
+
+def erase(
+    inpt: torch.Tensor,
+    i: int,
+    j: int,
+    h: int,
+    w: int,
+    v: torch.Tensor,
+    inplace: bool = False,
+) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.RandomErasing` for details."""
+    if torch.jit.is_scripting():
+        return erase_image(inpt, i=i, j=j, h=h, w=w, v=v, inplace=inplace)
+
+    _log_api_usage_once(erase)
+
+    kernel = _get_kernel(erase, type(inpt))
+    return kernel(inpt, i=i, j=j, h=h, w=w, v=v, inplace=inplace)
+
+
+@_register_kernel_internal(erase, torch.Tensor)
+@_register_kernel_internal(erase, tv_tensors.Image)
+def erase_image(
+    image: torch.Tensor, i: int, j: int, h: int, w: int, v: torch.Tensor, inplace: bool = False
+) -> torch.Tensor:
+    if not inplace:
+        image = image.clone()
+
+    image[..., i : i + h, j : j + w] = v
+    return image
+
+
+@_register_kernel_internal(erase, PIL.Image.Image)
+def _erase_image_pil(
+    image: PIL.Image.Image, i: int, j: int, h: int, w: int, v: torch.Tensor, inplace: bool = False
+) -> PIL.Image.Image:
+    t_img = pil_to_tensor(image)
+    output = erase_image(t_img, i=i, j=j, h=h, w=w, v=v, inplace=inplace)
+    return to_pil_image(output, mode=image.mode)
+
+
+@_register_kernel_internal(erase, tv_tensors.Video)
+def erase_video(
+    video: torch.Tensor, i: int, j: int, h: int, w: int, v: torch.Tensor, inplace: bool = False
+) -> torch.Tensor:
+    return erase_image(video, i=i, j=j, h=h, w=w, v=v, inplace=inplace)
+
+
+def jpeg(image: torch.Tensor, quality: int) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.JPEG` for details."""
+    if torch.jit.is_scripting():
+        return jpeg_image(image, quality=quality)
+
+    _log_api_usage_once(jpeg)
+
+    kernel = _get_kernel(jpeg, type(image))
+    return kernel(image, quality=quality)
+
+
+@_register_kernel_internal(jpeg, torch.Tensor)
+@_register_kernel_internal(jpeg, tv_tensors.Image)
+def jpeg_image(image: torch.Tensor, quality: int) -> torch.Tensor:
+    original_shape = image.shape
+    image = image.view((-1,) + image.shape[-3:])
+
+    if image.shape[0] == 0:  # degenerate
+        return image.reshape(original_shape).clone()
+
+    images = []
+    for i in range(image.shape[0]):
+        # isinstance checks are needed for torchscript.
+        encoded_image = encode_jpeg(image[i], quality=quality)
+        assert isinstance(encoded_image, torch.Tensor)
+        decoded_image = decode_jpeg(encoded_image)
+        assert isinstance(decoded_image, torch.Tensor)
+        images.append(decoded_image)
+
+    images = torch.stack(images, dim=0).view(original_shape)
+    return images
+
+
+@_register_kernel_internal(jpeg, tv_tensors.Video)
+def jpeg_video(video: torch.Tensor, quality: int) -> torch.Tensor:
+    return jpeg_image(video, quality=quality)
+
+
+@_register_kernel_internal(jpeg, PIL.Image.Image)
+def _jpeg_image_pil(image: PIL.Image.Image, quality: int) -> PIL.Image.Image:
+    raw_jpeg = io.BytesIO()
+    image.save(raw_jpeg, format="JPEG", quality=quality)
+
+    # we need to copy since PIL.Image.open() will return PIL.JpegImagePlugin.JpegImageFile
+    # which is a sub-class of PIL.Image.Image. this will fail check_transform() test.
+    return PIL.Image.open(raw_jpeg).copy()
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_color.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_color.py
new file mode 100644
index 0000000000000000000000000000000000000000..be254c0d63a0dd6d67c3d3a042a24265a3bd2034
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_color.py
@@ -0,0 +1,740 @@
+import PIL.Image
+import torch
+from torch.nn.functional import conv2d
+from torchvision import tv_tensors
+from torchvision.transforms import _functional_pil as _FP
+from torchvision.transforms._functional_tensor import _max_value
+
+from torchvision.utils import _log_api_usage_once
+
+from ._misc import _num_value_bits, to_dtype_image
+from ._type_conversion import pil_to_tensor, to_pil_image
+from ._utils import _get_kernel, _register_kernel_internal
+
+
+def rgb_to_grayscale(inpt: torch.Tensor, num_output_channels: int = 1) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.Grayscale` for details."""
+    if torch.jit.is_scripting():
+        return rgb_to_grayscale_image(inpt, num_output_channels=num_output_channels)
+
+    _log_api_usage_once(rgb_to_grayscale)
+
+    kernel = _get_kernel(rgb_to_grayscale, type(inpt))
+    return kernel(inpt, num_output_channels=num_output_channels)
+
+
+# `to_grayscale` actually predates `rgb_to_grayscale` in v1, but only handles PIL images. Since `rgb_to_grayscale` is a
+# superset in terms of functionality and has the same signature, we alias here to avoid disruption.
+to_grayscale = rgb_to_grayscale
+
+
+def _rgb_to_grayscale_image(
+    image: torch.Tensor, num_output_channels: int = 1, preserve_dtype: bool = True
+) -> torch.Tensor:
+    # TODO: Maybe move the validation that num_output_channels is 1 or 3 to this function instead of callers.
+    if image.shape[-3] == 1 and num_output_channels == 1:
+        return image.clone()
+    if image.shape[-3] == 1 and num_output_channels == 3:
+        s = [1] * len(image.shape)
+        s[-3] = 3
+        return image.repeat(s)
+    r, g, b = image.unbind(dim=-3)
+    l_img = r.mul(0.2989).add_(g, alpha=0.587).add_(b, alpha=0.114)
+    l_img = l_img.unsqueeze(dim=-3)
+    if preserve_dtype:
+        l_img = l_img.to(image.dtype)
+    if num_output_channels == 3:
+        l_img = l_img.expand(image.shape)
+    return l_img
+
+
+@_register_kernel_internal(rgb_to_grayscale, torch.Tensor)
+@_register_kernel_internal(rgb_to_grayscale, tv_tensors.Image)
+def rgb_to_grayscale_image(image: torch.Tensor, num_output_channels: int = 1) -> torch.Tensor:
+    if num_output_channels not in (1, 3):
+        raise ValueError(f"num_output_channels must be 1 or 3, got {num_output_channels}.")
+    return _rgb_to_grayscale_image(image, num_output_channels=num_output_channels, preserve_dtype=True)
+
+
+@_register_kernel_internal(rgb_to_grayscale, PIL.Image.Image)
+def _rgb_to_grayscale_image_pil(image: PIL.Image.Image, num_output_channels: int = 1) -> PIL.Image.Image:
+    if num_output_channels not in (1, 3):
+        raise ValueError(f"num_output_channels must be 1 or 3, got {num_output_channels}.")
+    return _FP.to_grayscale(image, num_output_channels=num_output_channels)
+
+
+def grayscale_to_rgb(inpt: torch.Tensor) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.RGB` for details."""
+    if torch.jit.is_scripting():
+        return grayscale_to_rgb_image(inpt)
+
+    _log_api_usage_once(grayscale_to_rgb)
+
+    kernel = _get_kernel(grayscale_to_rgb, type(inpt))
+    return kernel(inpt)
+
+
+@_register_kernel_internal(grayscale_to_rgb, torch.Tensor)
+@_register_kernel_internal(grayscale_to_rgb, tv_tensors.Image)
+def grayscale_to_rgb_image(image: torch.Tensor) -> torch.Tensor:
+    if image.shape[-3] >= 3:
+        # Image already has RGB channels. We don't need to do anything.
+        return image
+    # rgb_to_grayscale can be used to add channels so we reuse that function.
+    return _rgb_to_grayscale_image(image, num_output_channels=3, preserve_dtype=True)
+
+
+@_register_kernel_internal(grayscale_to_rgb, PIL.Image.Image)
+def grayscale_to_rgb_image_pil(image: PIL.Image.Image) -> PIL.Image.Image:
+    return image.convert(mode="RGB")
+
+
+def _blend(image1: torch.Tensor, image2: torch.Tensor, ratio: float) -> torch.Tensor:
+    ratio = float(ratio)
+    fp = image1.is_floating_point()
+    bound = _max_value(image1.dtype)
+    output = image1.mul(ratio).add_(image2, alpha=(1.0 - ratio)).clamp_(0, bound)
+    return output if fp else output.to(image1.dtype)
+
+
+def adjust_brightness(inpt: torch.Tensor, brightness_factor: float) -> torch.Tensor:
+    """Adjust brightness."""
+
+    if torch.jit.is_scripting():
+        return adjust_brightness_image(inpt, brightness_factor=brightness_factor)
+
+    _log_api_usage_once(adjust_brightness)
+
+    kernel = _get_kernel(adjust_brightness, type(inpt))
+    return kernel(inpt, brightness_factor=brightness_factor)
+
+
+@_register_kernel_internal(adjust_brightness, torch.Tensor)
+@_register_kernel_internal(adjust_brightness, tv_tensors.Image)
+def adjust_brightness_image(image: torch.Tensor, brightness_factor: float) -> torch.Tensor:
+    if brightness_factor < 0:
+        raise ValueError(f"brightness_factor ({brightness_factor}) is not non-negative.")
+
+    c = image.shape[-3]
+    if c not in [1, 3]:
+        raise TypeError(f"Input image tensor permitted channel values are 1 or 3, but found {c}")
+
+    fp = image.is_floating_point()
+    bound = _max_value(image.dtype)
+    output = image.mul(brightness_factor).clamp_(0, bound)
+    return output if fp else output.to(image.dtype)
+
+
+@_register_kernel_internal(adjust_brightness, PIL.Image.Image)
+def _adjust_brightness_image_pil(image: PIL.Image.Image, brightness_factor: float) -> PIL.Image.Image:
+    return _FP.adjust_brightness(image, brightness_factor=brightness_factor)
+
+
+@_register_kernel_internal(adjust_brightness, tv_tensors.Video)
+def adjust_brightness_video(video: torch.Tensor, brightness_factor: float) -> torch.Tensor:
+    return adjust_brightness_image(video, brightness_factor=brightness_factor)
+
+
+def adjust_saturation(inpt: torch.Tensor, saturation_factor: float) -> torch.Tensor:
+    """Adjust saturation."""
+    if torch.jit.is_scripting():
+        return adjust_saturation_image(inpt, saturation_factor=saturation_factor)
+
+    _log_api_usage_once(adjust_saturation)
+
+    kernel = _get_kernel(adjust_saturation, type(inpt))
+    return kernel(inpt, saturation_factor=saturation_factor)
+
+
+@_register_kernel_internal(adjust_saturation, torch.Tensor)
+@_register_kernel_internal(adjust_saturation, tv_tensors.Image)
+def adjust_saturation_image(image: torch.Tensor, saturation_factor: float) -> torch.Tensor:
+    if saturation_factor < 0:
+        raise ValueError(f"saturation_factor ({saturation_factor}) is not non-negative.")
+
+    c = image.shape[-3]
+    if c not in [1, 3]:
+        raise TypeError(f"Input image tensor permitted channel values are 1 or 3, but found {c}")
+
+    if c == 1:  # Match PIL behaviour
+        return image
+
+    grayscale_image = _rgb_to_grayscale_image(image, num_output_channels=1, preserve_dtype=False)
+    if not image.is_floating_point():
+        grayscale_image = grayscale_image.floor_()
+
+    return _blend(image, grayscale_image, saturation_factor)
+
+
+_adjust_saturation_image_pil = _register_kernel_internal(adjust_saturation, PIL.Image.Image)(_FP.adjust_saturation)
+
+
+@_register_kernel_internal(adjust_saturation, tv_tensors.Video)
+def adjust_saturation_video(video: torch.Tensor, saturation_factor: float) -> torch.Tensor:
+    return adjust_saturation_image(video, saturation_factor=saturation_factor)
+
+
+def adjust_contrast(inpt: torch.Tensor, contrast_factor: float) -> torch.Tensor:
+    """Adjust contrast."""
+    if torch.jit.is_scripting():
+        return adjust_contrast_image(inpt, contrast_factor=contrast_factor)
+
+    _log_api_usage_once(adjust_contrast)
+
+    kernel = _get_kernel(adjust_contrast, type(inpt))
+    return kernel(inpt, contrast_factor=contrast_factor)
+
+
+@_register_kernel_internal(adjust_contrast, torch.Tensor)
+@_register_kernel_internal(adjust_contrast, tv_tensors.Image)
+def adjust_contrast_image(image: torch.Tensor, contrast_factor: float) -> torch.Tensor:
+    if contrast_factor < 0:
+        raise ValueError(f"contrast_factor ({contrast_factor}) is not non-negative.")
+
+    c = image.shape[-3]
+    if c not in [1, 3]:
+        raise TypeError(f"Input image tensor permitted channel values are 1 or 3, but found {c}")
+    fp = image.is_floating_point()
+    if c == 3:
+        grayscale_image = _rgb_to_grayscale_image(image, num_output_channels=1, preserve_dtype=False)
+        if not fp:
+            grayscale_image = grayscale_image.floor_()
+    else:
+        grayscale_image = image if fp else image.to(torch.float32)
+    mean = torch.mean(grayscale_image, dim=(-3, -2, -1), keepdim=True)
+    return _blend(image, mean, contrast_factor)
+
+
+_adjust_contrast_image_pil = _register_kernel_internal(adjust_contrast, PIL.Image.Image)(_FP.adjust_contrast)
+
+
+@_register_kernel_internal(adjust_contrast, tv_tensors.Video)
+def adjust_contrast_video(video: torch.Tensor, contrast_factor: float) -> torch.Tensor:
+    return adjust_contrast_image(video, contrast_factor=contrast_factor)
+
+
+def adjust_sharpness(inpt: torch.Tensor, sharpness_factor: float) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.RandomAdjustSharpness` for details."""
+    if torch.jit.is_scripting():
+        return adjust_sharpness_image(inpt, sharpness_factor=sharpness_factor)
+
+    _log_api_usage_once(adjust_sharpness)
+
+    kernel = _get_kernel(adjust_sharpness, type(inpt))
+    return kernel(inpt, sharpness_factor=sharpness_factor)
+
+
+@_register_kernel_internal(adjust_sharpness, torch.Tensor)
+@_register_kernel_internal(adjust_sharpness, tv_tensors.Image)
+def adjust_sharpness_image(image: torch.Tensor, sharpness_factor: float) -> torch.Tensor:
+    num_channels, height, width = image.shape[-3:]
+    if num_channels not in (1, 3):
+        raise TypeError(f"Input image tensor can have 1 or 3 channels, but found {num_channels}")
+
+    if sharpness_factor < 0:
+        raise ValueError(f"sharpness_factor ({sharpness_factor}) is not non-negative.")
+
+    if image.numel() == 0 or height <= 2 or width <= 2:
+        return image
+
+    bound = _max_value(image.dtype)
+    fp = image.is_floating_point()
+    shape = image.shape
+
+    if image.ndim > 4:
+        image = image.reshape(-1, num_channels, height, width)
+        needs_unsquash = True
+    else:
+        needs_unsquash = False
+
+    # The following is a normalized 3x3 kernel with 1s in the edges and a 5 in the middle.
+    kernel_dtype = image.dtype if fp else torch.float32
+    a, b = 1.0 / 13.0, 5.0 / 13.0
+    kernel = torch.tensor([[a, a, a], [a, b, a], [a, a, a]], dtype=kernel_dtype, device=image.device)
+    kernel = kernel.expand(num_channels, 1, 3, 3)
+
+    # We copy and cast at the same time to avoid modifications on the original data
+    output = image.to(dtype=kernel_dtype, copy=True)
+    blurred_degenerate = conv2d(output, kernel, groups=num_channels)
+    if not fp:
+        # it is better to round before cast
+        blurred_degenerate = blurred_degenerate.round_()
+
+    # Create a view on the underlying output while pointing at the same data. We do this to avoid indexing twice.
+    view = output[..., 1:-1, 1:-1]
+
+    # We speed up blending by minimizing flops and doing in-place. The 2 blend options are mathematically equivalent:
+    # x+(1-r)*(y-x) = x + (1-r)*y - (1-r)*x = x*r + y*(1-r)
+    view.add_(blurred_degenerate.sub_(view), alpha=(1.0 - sharpness_factor))
+
+    # The actual data of output have been modified by the above. We only need to clamp and cast now.
+    output = output.clamp_(0, bound)
+    if not fp:
+        output = output.to(image.dtype)
+
+    if needs_unsquash:
+        output = output.reshape(shape)
+
+    return output
+
+
+_adjust_sharpness_image_pil = _register_kernel_internal(adjust_sharpness, PIL.Image.Image)(_FP.adjust_sharpness)
+
+
+@_register_kernel_internal(adjust_sharpness, tv_tensors.Video)
+def adjust_sharpness_video(video: torch.Tensor, sharpness_factor: float) -> torch.Tensor:
+    return adjust_sharpness_image(video, sharpness_factor=sharpness_factor)
+
+
+def adjust_hue(inpt: torch.Tensor, hue_factor: float) -> torch.Tensor:
+    """Adjust hue"""
+    if torch.jit.is_scripting():
+        return adjust_hue_image(inpt, hue_factor=hue_factor)
+
+    _log_api_usage_once(adjust_hue)
+
+    kernel = _get_kernel(adjust_hue, type(inpt))
+    return kernel(inpt, hue_factor=hue_factor)
+
+
+def _rgb_to_hsv(image: torch.Tensor) -> torch.Tensor:
+    r, g, _ = image.unbind(dim=-3)
+
+    # Implementation is based on
+    # https://github.com/python-pillow/Pillow/blob/4174d4267616897df3746d315d5a2d0f82c656ee/src/libImaging/Convert.c#L330
+    minc, maxc = torch.aminmax(image, dim=-3)
+
+    # The algorithm erases S and H channel where `maxc = minc`. This avoids NaN
+    # from happening in the results, because
+    #   + S channel has division by `maxc`, which is zero only if `maxc = minc`
+    #   + H channel has division by `(maxc - minc)`.
+    #
+    # Instead of overwriting NaN afterwards, we just prevent it from occurring so
+    # we don't need to deal with it in case we save the NaN in a buffer in
+    # backprop, if it is ever supported, but it doesn't hurt to do so.
+    eqc = maxc == minc
+
+    channels_range = maxc - minc
+    # Since `eqc => channels_range = 0`, replacing denominator with 1 when `eqc` is fine.
+    ones = torch.ones_like(maxc)
+    s = channels_range / torch.where(eqc, ones, maxc)
+    # Note that `eqc => maxc = minc = r = g = b`. So the following calculation
+    # of `h` would reduce to `bc - gc + 2 + rc - bc + 4 + rc - bc = 6` so it
+    # would not matter what values `rc`, `gc`, and `bc` have here, and thus
+    # replacing denominator with 1 when `eqc` is fine.
+    channels_range_divisor = torch.where(eqc, ones, channels_range).unsqueeze_(dim=-3)
+    rc, gc, bc = ((maxc.unsqueeze(dim=-3) - image) / channels_range_divisor).unbind(dim=-3)
+
+    mask_maxc_neq_r = maxc != r
+    mask_maxc_eq_g = maxc == g
+
+    hg = rc.add(2.0).sub_(bc).mul_(mask_maxc_eq_g & mask_maxc_neq_r)
+    hr = bc.sub_(gc).mul_(~mask_maxc_neq_r)
+    hb = gc.add_(4.0).sub_(rc).mul_(mask_maxc_neq_r.logical_and_(mask_maxc_eq_g.logical_not_()))
+
+    h = hr.add_(hg).add_(hb)
+    h = h.mul_(1.0 / 6.0).add_(1.0).fmod_(1.0)
+    return torch.stack((h, s, maxc), dim=-3)
+
+
+def _hsv_to_rgb(img: torch.Tensor) -> torch.Tensor:
+    h, s, v = img.unbind(dim=-3)
+    h6 = h.mul(6)
+    i = torch.floor(h6)
+    f = h6.sub_(i)
+    i = i.to(dtype=torch.int32)
+
+    sxf = s * f
+    one_minus_s = 1.0 - s
+    q = (1.0 - sxf).mul_(v).clamp_(0.0, 1.0)
+    t = sxf.add_(one_minus_s).mul_(v).clamp_(0.0, 1.0)
+    p = one_minus_s.mul_(v).clamp_(0.0, 1.0)
+    i.remainder_(6)
+
+    vpqt = torch.stack((v, p, q, t), dim=-3)
+
+    # vpqt -> rgb mapping based on i
+    select = torch.tensor([[0, 2, 1, 1, 3, 0], [3, 0, 0, 2, 1, 1], [1, 1, 3, 0, 0, 2]], dtype=torch.long)
+    select = select.to(device=img.device, non_blocking=True)
+
+    select = select[:, i]
+    if select.ndim > 3:
+        # if input.shape is (B, ..., C, H, W) then
+        # select.shape is (C, B, ...,  H, W)
+        # thus we move C axis to get (B, ..., C, H, W)
+        select = select.moveaxis(0, -3)
+
+    return vpqt.gather(-3, select)
+
+
+@_register_kernel_internal(adjust_hue, torch.Tensor)
+@_register_kernel_internal(adjust_hue, tv_tensors.Image)
+def adjust_hue_image(image: torch.Tensor, hue_factor: float) -> torch.Tensor:
+    if not (-0.5 <= hue_factor <= 0.5):
+        raise ValueError(f"hue_factor ({hue_factor}) is not in [-0.5, 0.5].")
+
+    c = image.shape[-3]
+    if c not in [1, 3]:
+        raise TypeError(f"Input image tensor permitted channel values are 1 or 3, but found {c}")
+
+    if c == 1:  # Match PIL behaviour
+        return image
+
+    if image.numel() == 0:
+        # exit earlier on empty images
+        return image
+
+    orig_dtype = image.dtype
+    image = to_dtype_image(image, torch.float32, scale=True)
+
+    image = _rgb_to_hsv(image)
+    h, s, v = image.unbind(dim=-3)
+    h.add_(hue_factor).remainder_(1.0)
+    image = torch.stack((h, s, v), dim=-3)
+    image_hue_adj = _hsv_to_rgb(image)
+
+    return to_dtype_image(image_hue_adj, orig_dtype, scale=True)
+
+
+_adjust_hue_image_pil = _register_kernel_internal(adjust_hue, PIL.Image.Image)(_FP.adjust_hue)
+
+
+@_register_kernel_internal(adjust_hue, tv_tensors.Video)
+def adjust_hue_video(video: torch.Tensor, hue_factor: float) -> torch.Tensor:
+    return adjust_hue_image(video, hue_factor=hue_factor)
+
+
+def adjust_gamma(inpt: torch.Tensor, gamma: float, gain: float = 1) -> torch.Tensor:
+    """Adjust gamma."""
+    if torch.jit.is_scripting():
+        return adjust_gamma_image(inpt, gamma=gamma, gain=gain)
+
+    _log_api_usage_once(adjust_gamma)
+
+    kernel = _get_kernel(adjust_gamma, type(inpt))
+    return kernel(inpt, gamma=gamma, gain=gain)
+
+
+@_register_kernel_internal(adjust_gamma, torch.Tensor)
+@_register_kernel_internal(adjust_gamma, tv_tensors.Image)
+def adjust_gamma_image(image: torch.Tensor, gamma: float, gain: float = 1.0) -> torch.Tensor:
+    if gamma < 0:
+        raise ValueError("Gamma should be a non-negative real number")
+
+    # The input image is either assumed to be at [0, 1] scale (if float) or is converted to that scale (if integer).
+    # Since the gamma is non-negative, the output remains at [0, 1] scale.
+    if not torch.is_floating_point(image):
+        output = to_dtype_image(image, torch.float32, scale=True).pow_(gamma)
+    else:
+        output = image.pow(gamma)
+
+    if gain != 1.0:
+        # The clamp operation is needed only if multiplication is performed. It's only when gain != 1, that the scale
+        # of the output can go beyond [0, 1].
+        output = output.mul_(gain).clamp_(0.0, 1.0)
+
+    return to_dtype_image(output, image.dtype, scale=True)
+
+
+_adjust_gamma_image_pil = _register_kernel_internal(adjust_gamma, PIL.Image.Image)(_FP.adjust_gamma)
+
+
+@_register_kernel_internal(adjust_gamma, tv_tensors.Video)
+def adjust_gamma_video(video: torch.Tensor, gamma: float, gain: float = 1) -> torch.Tensor:
+    return adjust_gamma_image(video, gamma=gamma, gain=gain)
+
+
+def posterize(inpt: torch.Tensor, bits: int) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.RandomPosterize` for details."""
+    if torch.jit.is_scripting():
+        return posterize_image(inpt, bits=bits)
+
+    _log_api_usage_once(posterize)
+
+    kernel = _get_kernel(posterize, type(inpt))
+    return kernel(inpt, bits=bits)
+
+
+@_register_kernel_internal(posterize, torch.Tensor)
+@_register_kernel_internal(posterize, tv_tensors.Image)
+def posterize_image(image: torch.Tensor, bits: int) -> torch.Tensor:
+    if not isinstance(bits, int) or not 0 <= bits <= 8:
+        raise TypeError(f"bits must be a positive integer in the range [0, 8], got {bits} instead.")
+
+    if image.is_floating_point():
+        levels = 1 << bits
+        return image.mul(levels).floor_().clamp_(0, levels - 1).mul_(1.0 / levels)
+    else:
+        num_value_bits = _num_value_bits(image.dtype)
+        if bits >= num_value_bits:
+            return image
+
+        mask = ((1 << bits) - 1) << (num_value_bits - bits)
+        return image & mask
+
+
+_posterize_image_pil = _register_kernel_internal(posterize, PIL.Image.Image)(_FP.posterize)
+
+
+@_register_kernel_internal(posterize, tv_tensors.Video)
+def posterize_video(video: torch.Tensor, bits: int) -> torch.Tensor:
+    return posterize_image(video, bits=bits)
+
+
+def solarize(inpt: torch.Tensor, threshold: float) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.RandomSolarize` for details."""
+    if torch.jit.is_scripting():
+        return solarize_image(inpt, threshold=threshold)
+
+    _log_api_usage_once(solarize)
+
+    kernel = _get_kernel(solarize, type(inpt))
+    return kernel(inpt, threshold=threshold)
+
+
+@_register_kernel_internal(solarize, torch.Tensor)
+@_register_kernel_internal(solarize, tv_tensors.Image)
+def solarize_image(image: torch.Tensor, threshold: float) -> torch.Tensor:
+    if threshold > _max_value(image.dtype):
+        raise TypeError(f"Threshold should be less or equal the maximum value of the dtype, but got {threshold}")
+
+    return torch.where(image >= threshold, invert_image(image), image)
+
+
+_solarize_image_pil = _register_kernel_internal(solarize, PIL.Image.Image)(_FP.solarize)
+
+
+@_register_kernel_internal(solarize, tv_tensors.Video)
+def solarize_video(video: torch.Tensor, threshold: float) -> torch.Tensor:
+    return solarize_image(video, threshold=threshold)
+
+
+def autocontrast(inpt: torch.Tensor) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.RandomAutocontrast` for details."""
+    if torch.jit.is_scripting():
+        return autocontrast_image(inpt)
+
+    _log_api_usage_once(autocontrast)
+
+    kernel = _get_kernel(autocontrast, type(inpt))
+    return kernel(inpt)
+
+
+@_register_kernel_internal(autocontrast, torch.Tensor)
+@_register_kernel_internal(autocontrast, tv_tensors.Image)
+def autocontrast_image(image: torch.Tensor) -> torch.Tensor:
+    c = image.shape[-3]
+    if c not in [1, 3]:
+        raise TypeError(f"Input image tensor permitted channel values are 1 or 3, but found {c}")
+
+    if image.numel() == 0:
+        # exit earlier on empty images
+        return image
+
+    bound = _max_value(image.dtype)
+    fp = image.is_floating_point()
+    float_image = image if fp else image.to(torch.float32)
+
+    minimum = float_image.amin(dim=(-2, -1), keepdim=True)
+    maximum = float_image.amax(dim=(-2, -1), keepdim=True)
+
+    eq_idxs = maximum == minimum
+    inv_scale = maximum.sub_(minimum).mul_(1.0 / bound)
+    minimum[eq_idxs] = 0.0
+    inv_scale[eq_idxs] = 1.0
+
+    if fp:
+        diff = float_image.sub(minimum)
+    else:
+        diff = float_image.sub_(minimum)
+
+    return diff.div_(inv_scale).clamp_(0, bound).to(image.dtype)
+
+
+_autocontrast_image_pil = _register_kernel_internal(autocontrast, PIL.Image.Image)(_FP.autocontrast)
+
+
+@_register_kernel_internal(autocontrast, tv_tensors.Video)
+def autocontrast_video(video: torch.Tensor) -> torch.Tensor:
+    return autocontrast_image(video)
+
+
+def equalize(inpt: torch.Tensor) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.RandomEqualize` for details."""
+    if torch.jit.is_scripting():
+        return equalize_image(inpt)
+
+    _log_api_usage_once(equalize)
+
+    kernel = _get_kernel(equalize, type(inpt))
+    return kernel(inpt)
+
+
+@_register_kernel_internal(equalize, torch.Tensor)
+@_register_kernel_internal(equalize, tv_tensors.Image)
+def equalize_image(image: torch.Tensor) -> torch.Tensor:
+    if image.numel() == 0:
+        return image
+
+    # 1. The algorithm below can easily be extended to support arbitrary integer dtypes. However, the histogram that
+    #    would need to be computed will have at least `torch.iinfo(dtype).max + 1` values. That is perfectly fine for
+    #    `torch.int8`, `torch.uint8`, and `torch.int16`, at least questionable for `torch.int32` and completely
+    #    unfeasible for `torch.int64`.
+    # 2. Floating point inputs need to be binned for this algorithm. Apart from converting them to an integer dtype, we
+    #    could also use PyTorch's builtin histogram functionality. However, that has its own set of issues: in addition
+    #    to being slow in general, PyTorch's implementation also doesn't support batches. In total, that makes it slower
+    #    and more complicated to implement than a simple conversion and a fast histogram implementation for integers.
+    # Since we need to convert in most cases anyway and out of the acceptable dtypes mentioned in 1. `torch.uint8` is
+    # by far the most common, we choose it as base.
+    output_dtype = image.dtype
+    image = to_dtype_image(image, torch.uint8, scale=True)
+
+    # The histogram is computed by using the flattened image as index. For example, a pixel value of 127 in the image
+    # corresponds to adding 1 to index 127 in the histogram.
+    batch_shape = image.shape[:-2]
+    flat_image = image.flatten(start_dim=-2).to(torch.long)
+    hist = flat_image.new_zeros(batch_shape + (256,), dtype=torch.int32)
+    hist.scatter_add_(dim=-1, index=flat_image, src=hist.new_ones(1).expand_as(flat_image))
+    cum_hist = hist.cumsum(dim=-1)
+
+    # The simplest form of lookup-table (LUT) that also achieves histogram equalization is
+    # `lut = cum_hist / flat_image.shape[-1] * 255`
+    # However, PIL uses a more elaborate scheme:
+    # https://github.com/python-pillow/Pillow/blob/eb59cb61d5239ee69cbbf12709a0c6fd7314e6d7/src/PIL/ImageOps.py#L368-L385
+    # `lut = ((cum_hist + num_non_max_pixels // (2 * 255)) // num_non_max_pixels) * 255`
+
+    # The last non-zero element in the histogram is the first element in the cumulative histogram with the maximum
+    # value. Thus, the "max" in `num_non_max_pixels` does not refer to 255 as the maximum value of uint8 images, but
+    # rather the maximum value in the image, which might be or not be 255.
+    index = cum_hist.argmax(dim=-1)
+    num_non_max_pixels = flat_image.shape[-1] - hist.gather(dim=-1, index=index.unsqueeze_(-1))
+
+    # This is a performance optimization that saves us one multiplication later. With this, the LUT computation simplifies
+    # to `lut = (cum_hist + step // 2) // step` and thus saving the final multiplication by 255 while keeping the
+    # division count the same. PIL uses the variable name `step` for this, so we keep that for easier comparison.
+    step = num_non_max_pixels.div_(255, rounding_mode="floor")
+
+    # Although it looks like we could return early if we find `step == 0` like PIL does, that is unfortunately not as
+    # easy due to our support for batched images. We can only return early if `(step == 0).all()` holds. If it doesn't,
+    # we have to go through the computation below anyway. Since `step == 0` is an edge case anyway, it makes no sense to
+    # pay the runtime cost for checking it every time.
+    valid_equalization = step.ne(0).unsqueeze_(-1)
+
+    # `lut[k]` is computed with `cum_hist[k-1]` with `lut[0] == (step // 2) // step == 0`. Thus, we perform the
+    # computation only for `lut[1:]` with `cum_hist[:-1]` and add `lut[0] == 0` afterwards.
+    cum_hist = cum_hist[..., :-1]
+    (
+        cum_hist.add_(step // 2)
+        # We need the `clamp_`(min=1) call here to avoid zero division since they fail for integer dtypes. This has no
+        # effect on the returned result of this kernel since images inside the batch with `step == 0` are returned as is
+        # instead of equalized version.
+        .div_(step.clamp_(min=1), rounding_mode="floor")
+        # We need the `clamp_` call here since PILs LUT computation scheme can produce values outside the valid value
+        # range of uint8 images
+        .clamp_(0, 255)
+    )
+    lut = cum_hist.to(torch.uint8)
+    lut = torch.cat([lut.new_zeros(1).expand(batch_shape + (1,)), lut], dim=-1)
+    equalized_image = lut.gather(dim=-1, index=flat_image).view_as(image)
+
+    output = torch.where(valid_equalization, equalized_image, image)
+    return to_dtype_image(output, output_dtype, scale=True)
+
+
+_equalize_image_pil = _register_kernel_internal(equalize, PIL.Image.Image)(_FP.equalize)
+
+
+@_register_kernel_internal(equalize, tv_tensors.Video)
+def equalize_video(video: torch.Tensor) -> torch.Tensor:
+    return equalize_image(video)
+
+
+def invert(inpt: torch.Tensor) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.RandomInvert` for details."""
+    if torch.jit.is_scripting():
+        return invert_image(inpt)
+
+    _log_api_usage_once(invert)
+
+    kernel = _get_kernel(invert, type(inpt))
+    return kernel(inpt)
+
+
+@_register_kernel_internal(invert, torch.Tensor)
+@_register_kernel_internal(invert, tv_tensors.Image)
+def invert_image(image: torch.Tensor) -> torch.Tensor:
+    if image.is_floating_point():
+        return 1.0 - image
+    elif image.dtype == torch.uint8:
+        return image.bitwise_not()
+    else:  # signed integer dtypes
+        # We can't use `Tensor.bitwise_not` here, since we want to retain the leading zero bit that encodes the sign
+        return image.bitwise_xor((1 << _num_value_bits(image.dtype)) - 1)
+
+
+_invert_image_pil = _register_kernel_internal(invert, PIL.Image.Image)(_FP.invert)
+
+
+@_register_kernel_internal(invert, tv_tensors.Video)
+def invert_video(video: torch.Tensor) -> torch.Tensor:
+    return invert_image(video)
+
+
+def permute_channels(inpt: torch.Tensor, permutation: list[int]) -> torch.Tensor:
+    """Permute the channels of the input according to the given permutation.
+
+    This function supports plain :class:`~torch.Tensor`'s, :class:`PIL.Image.Image`'s, and
+    :class:`torchvision.tv_tensors.Image` and :class:`torchvision.tv_tensors.Video`.
+
+    Example:
+        >>> rgb_image = torch.rand(3, 256, 256)
+        >>> bgr_image = F.permute_channels(rgb_image, permutation=[2, 1, 0])
+
+    Args:
+        permutation (List[int]): Valid permutation of the input channel indices. The index of the element determines the
+            channel index in the input and the value determines the channel index in the output. For example,
+            ``permutation=[2, 0, 1]``
+
+            - takes ``inpt[..., 0, :, :]`` and puts it at ``output[..., 2, :, :]``,
+            - takes ``inpt[..., 1, :, :]`` and puts it at ``output[..., 0, :, :]``, and
+            - takes ``inpt[..., 2, :, :]`` and puts it at ``output[..., 1, :, :]``.
+
+    Raises:
+        ValueError: If ``len(permutation)`` doesn't match the number of channels in the input.
+    """
+    if torch.jit.is_scripting():
+        return permute_channels_image(inpt, permutation=permutation)
+
+    _log_api_usage_once(permute_channels)
+
+    kernel = _get_kernel(permute_channels, type(inpt))
+    return kernel(inpt, permutation=permutation)
+
+
+@_register_kernel_internal(permute_channels, torch.Tensor)
+@_register_kernel_internal(permute_channels, tv_tensors.Image)
+def permute_channels_image(image: torch.Tensor, permutation: list[int]) -> torch.Tensor:
+    shape = image.shape
+    num_channels, height, width = shape[-3:]
+
+    if len(permutation) != num_channels:
+        raise ValueError(
+            f"Length of permutation does not match number of channels: " f"{len(permutation)} != {num_channels}"
+        )
+
+    if image.numel() == 0:
+        return image
+
+    image = image.reshape(-1, num_channels, height, width)
+    image = image[:, permutation, :, :]
+    return image.reshape(shape)
+
+
+@_register_kernel_internal(permute_channels, PIL.Image.Image)
+def _permute_channels_image_pil(image: PIL.Image.Image, permutation: list[int]) -> PIL.Image.Image:
+    return to_pil_image(permute_channels_image(pil_to_tensor(image), permutation=permutation))
+
+
+@_register_kernel_internal(permute_channels, tv_tensors.Video)
+def permute_channels_video(video: torch.Tensor, permutation: list[int]) -> torch.Tensor:
+    return permute_channels_image(video, permutation=permutation)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_deprecated.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_deprecated.py
new file mode 100644
index 0000000000000000000000000000000000000000..3131b5e8c495ec763ccc822a43e19133eb5fd3ba
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_deprecated.py
@@ -0,0 +1,24 @@
+import warnings
+from typing import Any
+
+import torch
+
+from torchvision.transforms import functional as _F
+
+
+@torch.jit.unused
+def to_tensor(inpt: Any) -> torch.Tensor:
+    """[DEPRECATED] Use to_image() and to_dtype() instead."""
+    warnings.warn(
+        "The function `to_tensor(...)` is deprecated and will be removed in a future release. "
+        "Instead, please use `to_image(...)` followed by `to_dtype(..., dtype=torch.float32, scale=True)`."
+    )
+    return _F.to_tensor(inpt)  # delegates to torchvision.transforms.functional.to_tensor
+
+
+def get_image_size(inpt: torch.Tensor) -> list[int]:
+    warnings.warn(
+        "The function `get_image_size(...)` is deprecated and will be removed in a future release. "
+        "Instead, please use `get_size(...)` which returns `[h, w]` instead of `[w, h]`."
+    )
+    return _F.get_image_size(inpt)  # delegates to torchvision.transforms.functional.get_image_size
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_geometry.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_geometry.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fcb7fabe0df05a8ac5d33da2bbe41a7c2aac3e2
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_geometry.py
@@ -0,0 +1,3003 @@
+import math
+import numbers
+import warnings
+from collections.abc import Sequence
+from typing import Any, Optional, Union
+
+import PIL.Image
+import torch
+from torch.nn.functional import grid_sample, interpolate, pad as torch_pad
+
+from torchvision import tv_tensors
+from torchvision.transforms import _functional_pil as _FP
+from torchvision.transforms._functional_tensor import _pad_symmetric
+from torchvision.transforms.functional import (
+    _compute_resized_output_size as __compute_resized_output_size,
+    _get_perspective_coeffs,
+    _interpolation_modes_from_int,
+    InterpolationMode,
+    pil_modes_mapping,
+    pil_to_tensor,
+    to_pil_image,
+)
+from torchvision.tv_tensors._bounding_boxes import CLAMPING_MODE_TYPE
+
+from torchvision.utils import _log_api_usage_once
+
+from ._meta import _get_size_image_pil, clamp_bounding_boxes, convert_bounding_box_format
+
+from ._utils import _FillTypeJIT, _get_kernel, _register_five_ten_crop_kernel_internal, _register_kernel_internal
+
+
+def _check_interpolation(interpolation: Union[InterpolationMode, int]) -> InterpolationMode:
+    if isinstance(interpolation, int):  # integer constants are mapped to an InterpolationMode
+        interpolation = _interpolation_modes_from_int(interpolation)
+    elif not isinstance(interpolation, InterpolationMode):  # anything else that is not already a mode is rejected
+        raise ValueError(
+            f"Argument interpolation should be an `InterpolationMode` or a corresponding Pillow integer constant, "
+            f"but got {interpolation}."
+        )
+    return interpolation
+
+
+def horizontal_flip(inpt: torch.Tensor) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.RandomHorizontalFlip` for details."""
+    if torch.jit.is_scripting():  # when scripting, bypass the dispatch below and call the tensor kernel directly
+        return horizontal_flip_image(inpt)
+
+    _log_api_usage_once(horizontal_flip)
+
+    kernel = _get_kernel(horizontal_flip, type(inpt))  # select the kernel by the input's type
+    return kernel(inpt)
+
+
+@_register_kernel_internal(horizontal_flip, torch.Tensor)
+@_register_kernel_internal(horizontal_flip, tv_tensors.Image)
+def horizontal_flip_image(image: torch.Tensor) -> torch.Tensor:
+    return image.flip(-1)  # reverse the last dimension
+
+
+@_register_kernel_internal(horizontal_flip, PIL.Image.Image)
+def _horizontal_flip_image_pil(image: PIL.Image.Image) -> PIL.Image.Image:
+    return _FP.hflip(image)  # delegates to torchvision.transforms._functional_pil.hflip
+
+
+@_register_kernel_internal(horizontal_flip, tv_tensors.Mask)
+def horizontal_flip_mask(mask: torch.Tensor) -> torch.Tensor:
+    return horizontal_flip_image(mask)  # masks reuse the image kernel unchanged
+
+
+def horizontal_flip_keypoints(keypoints: torch.Tensor, canvas_size: tuple[int, int]):
+    shape = keypoints.shape  # remembered so the result can be reshaped back
+    keypoints = keypoints.clone().reshape(-1, 2)  # work on a copy, flattened to rows of two coordinates
+    keypoints[..., 0] = keypoints[..., 0].sub_(canvas_size[1] - 1).neg_()  # column 0 becomes (canvas_size[1] - 1) - value
+    return keypoints.reshape(shape)
+
+
+@_register_kernel_internal(horizontal_flip, tv_tensors.KeyPoints, tv_tensor_wrapper=False)
+def _horizontal_flip_keypoints_dispatch(keypoints: tv_tensors.KeyPoints):
+    out = horizontal_flip_keypoints(keypoints.as_subclass(torch.Tensor), canvas_size=keypoints.canvas_size)  # kernel runs on a plain tensor view
+    return tv_tensors.wrap(out, like=keypoints)  # wrap the result using the input as the template
+
+
+def horizontal_flip_bounding_boxes(
+    bounding_boxes: torch.Tensor, format: tv_tensors.BoundingBoxFormat, canvas_size: tuple[int, int]
+) -> torch.Tensor:
+    shape = bounding_boxes.shape  # remembered so the result can be reshaped back
+
+    if tv_tensors.is_rotated_bounding_format(format):
+        bounding_boxes = (
+            bounding_boxes.clone().reshape(-1, 5)  # rotated formats other than XYXYXYXY carry 5 values per box
+            if format != tv_tensors.BoundingBoxFormat.XYXYXYXY
+            else bounding_boxes.clone().reshape(-1, 8)  # XYXYXYXY carries 8 values per box
+        )
+    else:
+        bounding_boxes = bounding_boxes.clone().reshape(-1, 4)  # cloned, so the in-place ops below leave the input untouched
+
+    if format == tv_tensors.BoundingBoxFormat.XYXY:
+        bounding_boxes[:, [2, 0]] = bounding_boxes[:, [0, 2]].sub_(canvas_size[1]).neg_()  # new col 2 = W - old col 0, new col 0 = W - old col 2
+    elif format == tv_tensors.BoundingBoxFormat.XYWH:
+        bounding_boxes[:, 0].add_(bounding_boxes[:, 2]).sub_(canvas_size[1]).neg_()  # x = W - (x + w)
+    elif format == tv_tensors.BoundingBoxFormat.CXCYWH:
+        bounding_boxes[:, 0].sub_(canvas_size[1]).neg_()  # cx = W - cx
+    elif format == tv_tensors.BoundingBoxFormat.XYXYXYXY:
+        bounding_boxes[:, 0::2].sub_(canvas_size[1]).neg_()  # every even column (the x values) becomes W - x
+        bounding_boxes = bounding_boxes[:, [2, 3, 0, 1, 6, 7, 4, 5]]  # swap point 1 with point 2 and point 3 with point 4
+    elif format == tv_tensors.BoundingBoxFormat.XYWHR:
+        angle_rad = bounding_boxes[:, 4].mul(torch.pi).div(180)  # column 4 converted from degrees to radians
+        bounding_boxes[:, 0].add_(bounding_boxes[:, 2].mul(angle_rad.cos())).sub_(canvas_size[1]).neg_()  # x = W - (x + w * cos(r))
+        bounding_boxes[:, 1].sub_(bounding_boxes[:, 2].mul(angle_rad.sin()))  # y = y - w * sin(r)
+        bounding_boxes[:, 4].neg_()  # angle = -angle
+    else:  # format == tv_tensors.BoundingBoxFormat.CXCYWHR:
+        bounding_boxes[:, 0].sub_(canvas_size[1]).neg_()  # cx = W - cx
+        bounding_boxes[:, 4].neg_()  # angle = -angle
+
+    return bounding_boxes.reshape(shape)
+
+
+@_register_kernel_internal(horizontal_flip, tv_tensors.BoundingBoxes, tv_tensor_wrapper=False)
+def _horizontal_flip_bounding_boxes_dispatch(inpt: tv_tensors.BoundingBoxes) -> tv_tensors.BoundingBoxes:
+    output = horizontal_flip_bounding_boxes(
+        inpt.as_subclass(torch.Tensor), format=inpt.format, canvas_size=inpt.canvas_size  # format and canvas size come from the input's metadata
+    )
+    return tv_tensors.wrap(output, like=inpt)  # wrap the result using the input as the template
+
+
+@_register_kernel_internal(horizontal_flip, tv_tensors.Video)
+def horizontal_flip_video(video: torch.Tensor) -> torch.Tensor:
+    return horizontal_flip_image(video)  # videos reuse the image kernel unchanged
+
+
+def vertical_flip(inpt: torch.Tensor) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.RandomVerticalFlip` for details."""
+    if torch.jit.is_scripting():  # when scripting, bypass the dispatch below and call the tensor kernel directly
+        return vertical_flip_image(inpt)
+
+    _log_api_usage_once(vertical_flip)
+
+    kernel = _get_kernel(vertical_flip, type(inpt))  # select the kernel by the input's type
+    return kernel(inpt)
+
+
+@_register_kernel_internal(vertical_flip, torch.Tensor)
+@_register_kernel_internal(vertical_flip, tv_tensors.Image)
+def vertical_flip_image(image: torch.Tensor) -> torch.Tensor:
+    return image.flip(-2)  # reverse the second-to-last dimension
+
+
+@_register_kernel_internal(vertical_flip, PIL.Image.Image)
+def _vertical_flip_image_pil(image: PIL.Image.Image) -> PIL.Image.Image:
+    return _FP.vflip(image)  # delegates to torchvision.transforms._functional_pil.vflip
+
+
+@_register_kernel_internal(vertical_flip, tv_tensors.Mask)
+def vertical_flip_mask(mask: torch.Tensor) -> torch.Tensor:
+    return vertical_flip_image(mask)  # masks reuse the image kernel unchanged
+
+
+def vertical_flip_keypoints(keypoints: torch.Tensor, canvas_size: tuple[int, int]) -> torch.Tensor:
+    shape = keypoints.shape  # remembered so the result can be reshaped back
+    keypoints = keypoints.clone().reshape(-1, 2)  # work on a copy, flattened to rows of two coordinates
+    keypoints[..., 1] = keypoints[..., 1].sub_(canvas_size[0] - 1).neg_()  # column 1 becomes (canvas_size[0] - 1) - value
+    return keypoints.reshape(shape)
+
+
+def vertical_flip_bounding_boxes(
+    bounding_boxes: torch.Tensor, format: tv_tensors.BoundingBoxFormat, canvas_size: tuple[int, int]
+) -> torch.Tensor:
+    shape = bounding_boxes.shape  # remembered so the result can be reshaped back
+
+    if tv_tensors.is_rotated_bounding_format(format):
+        bounding_boxes = (
+            bounding_boxes.clone().reshape(-1, 5)  # rotated formats other than XYXYXYXY carry 5 values per box
+            if format != tv_tensors.BoundingBoxFormat.XYXYXYXY
+            else bounding_boxes.clone().reshape(-1, 8)  # XYXYXYXY carries 8 values per box
+        )
+    else:
+        bounding_boxes = bounding_boxes.clone().reshape(-1, 4)  # cloned, so the in-place ops below leave the input untouched
+
+    if format == tv_tensors.BoundingBoxFormat.XYXY:
+        bounding_boxes[:, [1, 3]] = bounding_boxes[:, [3, 1]].sub_(canvas_size[0]).neg_()  # new col 1 = H - old col 3, new col 3 = H - old col 1
+    elif format == tv_tensors.BoundingBoxFormat.XYWH:
+        bounding_boxes[:, 1].add_(bounding_boxes[:, 3]).sub_(canvas_size[0]).neg_()  # y = H - (y + h)
+    elif format == tv_tensors.BoundingBoxFormat.CXCYWH:
+        bounding_boxes[:, 1].sub_(canvas_size[0]).neg_()  # cy = H - cy
+    elif format == tv_tensors.BoundingBoxFormat.XYXYXYXY:
+        bounding_boxes[:, 1::2].sub_(canvas_size[0]).neg_()  # every odd column (the y values) becomes H - y
+        bounding_boxes = bounding_boxes[:, [2, 3, 0, 1, 6, 7, 4, 5]]  # swap point 1 with point 2 and point 3 with point 4
+    elif format == tv_tensors.BoundingBoxFormat.XYWHR:
+        angle_rad = bounding_boxes[:, 4].mul(torch.pi).div(180)  # column 4 converted from degrees to radians
+        bounding_boxes[:, 1].sub_(bounding_boxes[:, 2].mul(angle_rad.sin())).sub_(canvas_size[0]).neg_()  # y = H - (y - w * sin(r))
+        bounding_boxes[:, 0].add_(bounding_boxes[:, 2].mul(angle_rad.cos()))  # x = x + w * cos(r)
+        bounding_boxes[:, 4].neg_().add_(180)  # angle = 180 - angle
+    else:  # format == tv_tensors.BoundingBoxFormat.CXCYWHR:
+        bounding_boxes[:, 1].sub_(canvas_size[0]).neg_()  # cy = H - cy
+        bounding_boxes[:, 4].neg_().add_(180)  # angle = 180 - angle
+
+    return bounding_boxes.reshape(shape)
+
+
+@_register_kernel_internal(vertical_flip, tv_tensors.KeyPoints, tv_tensor_wrapper=False)
+def _vertical_flip_keypoints_dispatch(inpt: tv_tensors.KeyPoints) -> tv_tensors.KeyPoints:
+    output = vertical_flip_keypoints(inpt.as_subclass(torch.Tensor), canvas_size=inpt.canvas_size)  # kernel runs on a plain tensor view
+    return tv_tensors.wrap(output, like=inpt)  # wrap the result using the input as the template
+
+
+@_register_kernel_internal(vertical_flip, tv_tensors.BoundingBoxes, tv_tensor_wrapper=False)
+def _vertical_flip_bounding_boxes_dispatch(inpt: tv_tensors.BoundingBoxes) -> tv_tensors.BoundingBoxes:
+    output = vertical_flip_bounding_boxes(
+        inpt.as_subclass(torch.Tensor), format=inpt.format, canvas_size=inpt.canvas_size  # format and canvas size come from the input's metadata
+    )
+    return tv_tensors.wrap(output, like=inpt)  # wrap the result using the input as the template
+
+
+@_register_kernel_internal(vertical_flip, tv_tensors.Video)
+def vertical_flip_video(video: torch.Tensor) -> torch.Tensor:
+    return vertical_flip_image(video)  # videos reuse the image kernel unchanged
+
+
+# We changed the names to align them with the transforms, i.e. `RandomHorizontalFlip`. Still, `hflip` and `vflip` are
+# prevalent and well understood. Thus, we just alias them without deprecating the old names.
+hflip = horizontal_flip
+vflip = vertical_flip
+
+
+def _compute_resized_output_size(
+    canvas_size: tuple[int, int], size: Optional[list[int]], max_size: Optional[int] = None
+) -> list[int]:
+    if isinstance(size, int):  # a bare int is normalized to a one-element list
+        size = [size]
+    elif max_size is not None and size is not None and len(size) != 1:  # max_size is only accepted with size=None or a length-1 size
+        raise ValueError(
+            "max_size should only be passed if size is None or specifies the length of the smaller edge, "
+            "i.e. size should be an int or a sequence of length 1 in torchscript mode."
+        )
+    return __compute_resized_output_size(canvas_size, size=size, max_size=max_size, allow_size_none=True)  # the helper imported from torchvision.transforms.functional
+
+
+def resize(
+    inpt: torch.Tensor,
+    size: Optional[list[int]],
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+    max_size: Optional[int] = None,
+    antialias: Optional[bool] = True,
+) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.Resize` for details."""
+    if torch.jit.is_scripting():  # when scripting, bypass the dispatch below and call the tensor kernel directly
+        return resize_image(inpt, size=size, interpolation=interpolation, max_size=max_size, antialias=antialias)
+
+    _log_api_usage_once(resize)
+
+    kernel = _get_kernel(resize, type(inpt))  # select the kernel by the input's type
+    return kernel(inpt, size=size, interpolation=interpolation, max_size=max_size, antialias=antialias)  # every kernel receives the full keyword set
+
+
+# This is an internal helper method for resize_image. We should put it here instead of keeping it
+# inside resize_image due to torchscript.
+# uint8 dtype support for bilinear and bicubic is limited to cpu and
+# according to our benchmarks on eager, non-AVX CPUs should still prefer u8->f32->interpolate->u8 path for bilinear
+def _do_native_uint8_resize_on_cpu(interpolation: InterpolationMode) -> bool:
+    if interpolation == InterpolationMode.BILINEAR:
+        if torch.compiler.is_compiling():  # while compiling, bilinear always takes the native uint8 path
+            return True
+        else:
+            return torch.backends.cpu.get_cpu_capability() in ("AVX2", "AVX512")  # in eager mode only when the CPU reports AVX2 or AVX512
+
+    return interpolation == InterpolationMode.BICUBIC  # bicubic: always True; every other mode: False
+
+
+@_register_kernel_internal(resize, torch.Tensor)
+@_register_kernel_internal(resize, tv_tensors.Image)
+def resize_image(
+    image: torch.Tensor,
+    size: Optional[list[int]],
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+    max_size: Optional[int] = None,
+    antialias: Optional[bool] = True,
+) -> torch.Tensor:
+    interpolation = _check_interpolation(interpolation)
+    antialias = False if antialias is None else antialias  # None is treated the same as False
+    align_corners: Optional[bool] = None
+    if interpolation == InterpolationMode.BILINEAR or interpolation == InterpolationMode.BICUBIC:
+        align_corners = False  # only set for bilinear/bicubic; stays None for every other mode
+    else:
+        # The default of antialias is True from 0.17, so we don't warn or
+        # error if other interpolation modes are used. This is documented.
+        antialias = False
+
+    shape = image.shape
+    numel = image.numel()
+    num_channels, old_height, old_width = shape[-3:]  # the last three dims are read as (channels, height, width)
+    new_height, new_width = _compute_resized_output_size((old_height, old_width), size=size, max_size=max_size)
+
+    if (new_height, new_width) == (old_height, old_width):  # size unchanged: the input itself is returned
+        return image
+    elif numel > 0:  # empty tensors skip interpolation and are only reshaped at the end
+        dtype = image.dtype
+        acceptable_dtypes = [torch.float32, torch.float64]  # dtypes passed to interpolate() without a cast
+        if interpolation == InterpolationMode.NEAREST or interpolation == InterpolationMode.NEAREST_EXACT:
+            # uint8 dtype can be included for cpu and cuda input if nearest mode
+            acceptable_dtypes.append(torch.uint8)
+        elif image.device.type == "cpu":
+            if _do_native_uint8_resize_on_cpu(interpolation):
+                acceptable_dtypes.append(torch.uint8)
+
+        image = image.reshape(-1, num_channels, old_height, old_width)  # collapse all leading dims into a single batch dim
+        strides = image.stride()
+        if image.is_contiguous(memory_format=torch.channels_last) and image.shape[0] == 1 and numel != strides[0]:
+            # There is a weird behaviour in torch core where the output tensor of `interpolate()` can be allocated as
+            # contiguous even though the input is un-ambiguously channels_last (https://github.com/pytorch/pytorch/issues/68430).
+            # In particular this happens for the typical torchvision use-case of single CHW images where we fake the batch dim
+            # to become 1CHW. Below, we restride those tensors to trick torch core into properly allocating the output as
+            # channels_last, thus preserving the memory format of the input. This is not just for format consistency:
+            # for uint8 bilinear images, this also avoids an extra copy (re-packing) of the output and saves time.
+            # TODO: when https://github.com/pytorch/pytorch/issues/68430 is fixed (possibly by https://github.com/pytorch/pytorch/pull/100373),
+            # we should be able to remove this hack.
+            new_strides = list(strides)
+            new_strides[0] = numel  # only the batch stride is changed
+            image = image.as_strided((1, num_channels, old_height, old_width), new_strides)
+
+        need_cast = dtype not in acceptable_dtypes
+        if need_cast:
+            image = image.to(dtype=torch.float32)  # interpolate in float32, then cast back below
+
+        image = interpolate(
+            image,
+            size=[new_height, new_width],
+            mode=interpolation.value,
+            align_corners=align_corners,
+            antialias=antialias,
+        )
+
+        if need_cast:
+            if interpolation == InterpolationMode.BICUBIC and dtype == torch.uint8:
+                # This path is hit on non-AVX archs, or on GPU.
+                image = image.clamp_(min=0, max=255)  # keep values inside the uint8 range before the cast back
+            if dtype in (torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64):
+                image = image.round_()  # round before casting back to an integer dtype
+            image = image.to(dtype=dtype)
+
+    return image.reshape(shape[:-3] + (num_channels, new_height, new_width))  # restore the original leading dims with the new height and width
+
+
+def _resize_image_pil(
+    image: PIL.Image.Image,
+    size: Union[Sequence[int], int],
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+    max_size: Optional[int] = None,
+) -> PIL.Image.Image:
+    old_height, old_width = image.height, image.width
+    new_height, new_width = _compute_resized_output_size(
+        (old_height, old_width),
+        size=size,  # type: ignore[arg-type]
+        max_size=max_size,
+    )
+
+    interpolation = _check_interpolation(interpolation)  # validated before the early return, so a bad value raises even for a no-op resize
+
+    if (new_height, new_width) == (old_height, old_width):  # size unchanged: the input itself is returned
+        return image
+
+    return image.resize((new_width, new_height), resample=pil_modes_mapping[interpolation])  # note the (width, height) order passed to PIL
+
+
+@_register_kernel_internal(resize, PIL.Image.Image)
+def __resize_image_pil_dispatch(
+    image: PIL.Image.Image,
+    size: Union[Sequence[int], int],
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+    max_size: Optional[int] = None,
+    antialias: Optional[bool] = True,
+) -> PIL.Image.Image:
+    if antialias is False:  # only an explicit False warns; None and True pass silently
+        warnings.warn("Anti-alias option is always applied for PIL Image input. Argument antialias is ignored.")
+    return _resize_image_pil(image, size=size, interpolation=interpolation, max_size=max_size)  # antialias is not forwarded
+
+
+def resize_mask(mask: torch.Tensor, size: Optional[list[int]], max_size: Optional[int] = None) -> torch.Tensor:
+    if mask.ndim < 3:  # resize_image unpacks the last three dims, so add a leading dim when there are fewer
+        mask = mask.unsqueeze(0)
+        needs_squeeze = True
+    else:
+        needs_squeeze = False
+
+    output = resize_image(mask, size=size, interpolation=InterpolationMode.NEAREST, max_size=max_size)  # masks are always resized with NEAREST
+
+    if needs_squeeze:
+        output = output.squeeze(0)  # drop the temporary leading dim again
+
+    return output
+
+
+@_register_kernel_internal(resize, tv_tensors.Mask, tv_tensor_wrapper=False)
+def _resize_mask_dispatch(
+    inpt: tv_tensors.Mask, size: list[int], max_size: Optional[int] = None, **kwargs: Any  # extra keywords (e.g. interpolation, antialias) are accepted and unused
+) -> tv_tensors.Mask:
+    output = resize_mask(inpt.as_subclass(torch.Tensor), size, max_size=max_size)  # kernel runs on a plain tensor view
+    return tv_tensors.wrap(output, like=inpt)  # wrap the result using the input as the template
+
+
+def resize_keypoints(
+    keypoints: torch.Tensor,
+    size: Optional[list[int]],
+    canvas_size: tuple[int, int],
+    max_size: Optional[int] = None,
+):
+    old_height, old_width = canvas_size  # canvas_size is unpacked as (height, width)
+    new_height, new_width = _compute_resized_output_size(canvas_size, size=size, max_size=max_size)
+
+    if (new_height, new_width) == (old_height, old_width):  # size unchanged: inputs are returned as-is
+        return keypoints, canvas_size
+
+    w_ratio = new_width / old_width
+    h_ratio = new_height / old_height
+    ratios = torch.tensor([w_ratio, h_ratio], device=keypoints.device)  # width ratio first, matching the coordinate order of the last dim
+    keypoints = keypoints.mul(ratios).to(keypoints.dtype)  # scale, then cast back to the input dtype
+
+    return keypoints, (new_height, new_width)  # returns the scaled points together with the new canvas size
+
+
+@_register_kernel_internal(resize, tv_tensors.KeyPoints, tv_tensor_wrapper=False)
+def _resize_keypoints_dispatch(
+    keypoints: tv_tensors.KeyPoints,
+    size: Optional[list[int]],
+    max_size: Optional[int] = None,
+    **kwargs: Any,  # extra keywords (e.g. interpolation, antialias) are accepted and unused
+) -> tv_tensors.KeyPoints:
+    out, canvas_size = resize_keypoints(
+        keypoints.as_subclass(torch.Tensor),  # kernel runs on a plain tensor view
+        size,
+        canvas_size=keypoints.canvas_size,
+        max_size=max_size,
+    )
+    return tv_tensors.wrap(out, like=keypoints, canvas_size=canvas_size)  # the new canvas size is passed to wrap explicitly
+
+
+def _parallelogram_to_bounding_boxes(parallelogram: torch.Tensor) -> torch.Tensor:
+    """
+    Convert a parallelogram to a rectangle while keeping two points unchanged.
+    This function transforms a parallelogram represented by 8 coordinates (4 points) into a rectangle.
+    The two diagonally opposed points of the parallelogram forming the longest diagonal remain fixed.
+    The other points are adjusted to form a proper rectangle.
+
+    Note:
+        This function is not applied in-place and will return a copy of the input tensor.
+
+    Args:
+        parallelogram (torch.Tensor): Tensor of shape (..., 8) containing coordinates of parallelograms.
+                                     Format is [x1, y1, x2, y2, x3, y3, x4, y4].
+
+    Returns:
+        torch.Tensor: Tensor of same shape as input containing the rectangle coordinates.
+                     The output maintains the same dtype as the input.
+    """
+    original_shape = parallelogram.shape
+    dtype = parallelogram.dtype
+    acceptable_dtypes = [torch.float32, torch.float64]
+    need_cast = dtype not in acceptable_dtypes
+    if need_cast:
+        # Upcast to avoid overflow for square operations
+        parallelogram = parallelogram.to(torch.float32)
+
+    x1, y1, x2, y2, x3, y3, x4, y4 = parallelogram.unbind(-1)
+    cx = (x1 + x3) / 2  # center taken as the midpoint of points 1 and 3
+    cy = (y1 + y3) / 2
+
+    # Calculate width, height, and rotation angle of the parallelogram
+    wp = torch.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)  # length of side 1 -> 2
+    hp = torch.sqrt((x4 - x1) ** 2 + (y4 - y1) ** 2)  # length of side 1 -> 4
+    r12 = torch.atan2(y1 - y2, x2 - x1)  # angle of side 1 -> 2 (y difference is negated)
+    r14 = torch.atan2(y1 - y4, x4 - x1)  # angle of side 1 -> 4 (y difference is negated)
+    r_rad = r12 - r14  # angle between the two sides
+    sign = torch.where(r_rad > torch.pi / 2, -1, 1)
+    cos, sin = r_rad.cos(), r_rad.sin()
+
+    # Calculate width, height, and rotation angle of the rectangle
+    w = torch.where(wp < hp, wp * sin, wp + hp * cos * sign)
+    h = torch.where(wp > hp, hp * sin, hp + wp * cos * sign)
+    r_rad = torch.where(hp > wp, r14 + torch.pi / 2, r12)  # r_rad, cos and sin are reused from here on for the rectangle's rotation
+    cos, sin = r_rad.cos(), r_rad.sin()
+
+    x1 = cx - w / 2 * cos - h / 2 * sin
+    y1 = cy - h / 2 * cos + w / 2 * sin
+    x2 = cx + w / 2 * cos - h / 2 * sin
+    y2 = cy - h / 2 * cos - w / 2 * sin
+    x3 = cx + w / 2 * cos + h / 2 * sin
+    y3 = cy + h / 2 * cos - w / 2 * sin
+    x4 = cx - w / 2 * cos + h / 2 * sin
+    y4 = cy + h / 2 * cos + w / 2 * sin
+    out_boxes = torch.stack((x1, y1, x2, y2, x3, y3, x4, y4), dim=-1).reshape(original_shape)
+
+    if need_cast:
+        out_boxes = out_boxes.to(dtype)  # cast back to the input dtype
+    return out_boxes
+
+
+def resize_bounding_boxes(
+    bounding_boxes: torch.Tensor,
+    canvas_size: tuple[int, int],
+    size: Optional[list[int]],
+    max_size: Optional[int] = None,
+    format: tv_tensors.BoundingBoxFormat = tv_tensors.BoundingBoxFormat.XYXY,
+    clamping_mode: CLAMPING_MODE_TYPE = "soft",
+) -> tuple[torch.Tensor, tuple[int, int]]:
+    # We set the default format as `tv_tensors.BoundingBoxFormat.XYXY`
+    # to ensure backward compatibility.
+    # Indeed, before the introduction of the rotated bounding box formats,
+    # this function did not receive a `format` parameter as input.
+    old_height, old_width = canvas_size  # canvas_size is unpacked as (height, width)
+    new_height, new_width = _compute_resized_output_size(canvas_size, size=size, max_size=max_size)
+
+    if (new_height, new_width) == (old_height, old_width):  # size unchanged: inputs are returned as-is
+        return bounding_boxes, canvas_size
+
+    w_ratio = new_width / old_width
+    h_ratio = new_height / old_height
+    if tv_tensors.is_rotated_bounding_format(format):
+        original_shape = bounding_boxes.shape
+        xyxyxyxy_boxes = convert_bounding_box_format(
+            bounding_boxes, old_format=format, new_format=tv_tensors.BoundingBoxFormat.XYXYXYXY, inplace=False
+        ).reshape(-1, 8)  # rotated boxes are scaled as four corner points
+
+        ratios = torch.tensor(
+            [w_ratio, h_ratio, w_ratio, h_ratio, w_ratio, h_ratio, w_ratio, h_ratio], device=bounding_boxes.device
+        )
+        transformed_points = xyxyxyxy_boxes.mul(ratios)
+        out_bboxes = _parallelogram_to_bounding_boxes(transformed_points)  # turn the scaled corner points back into a rectangle
+        out_bboxes = clamp_bounding_boxes(
+            out_bboxes,
+            format=tv_tensors.BoundingBoxFormat.XYXYXYXY,
+            canvas_size=(new_height, new_width),
+            clamping_mode=clamping_mode,
+        )
+        return (
+            convert_bounding_box_format(
+                out_bboxes,
+                old_format=tv_tensors.BoundingBoxFormat.XYXYXYXY,
+                new_format=format,
+                inplace=False,
+            )
+            .to(bounding_boxes.dtype)
+            .reshape(original_shape),
+            (new_height, new_width),
+        )
+    else:
+        ratios = torch.tensor([w_ratio, h_ratio, w_ratio, h_ratio], device=bounding_boxes.device)  # the same per-column scaling is used for every non-rotated format
+        return (
+            bounding_boxes.mul(ratios).to(bounding_boxes.dtype),  # scale, then cast back to the input dtype; clamping_mode is not used on this path
+            (new_height, new_width),
+        )
+
+
+@_register_kernel_internal(resize, tv_tensors.BoundingBoxes, tv_tensor_wrapper=False)
+def _resize_bounding_boxes_dispatch(
+    inpt: tv_tensors.BoundingBoxes, size: Optional[list[int]], max_size: Optional[int] = None, **kwargs: Any  # extra keywords (e.g. interpolation, antialias) are accepted and unused
+) -> tv_tensors.BoundingBoxes:
+    output, canvas_size = resize_bounding_boxes(
+        inpt.as_subclass(torch.Tensor),  # kernel runs on a plain tensor view
+        format=inpt.format,
+        canvas_size=inpt.canvas_size,
+        size=size,
+        max_size=max_size,
+        clamping_mode=inpt.clamping_mode,
+    )
+    return tv_tensors.wrap(output, like=inpt, canvas_size=canvas_size)  # the new canvas size is passed to wrap explicitly
+
+
+@_register_kernel_internal(resize, tv_tensors.Video)
+def resize_video(
+    video: torch.Tensor,
+    size: Optional[list[int]],
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+    max_size: Optional[int] = None,
+    antialias: Optional[bool] = True,
+) -> torch.Tensor:
+    return resize_image(video, size=size, interpolation=interpolation, max_size=max_size, antialias=antialias)  # videos reuse the image kernel unchanged
+
+
+def affine(
+    inpt: torch.Tensor,
+    angle: Union[int, float],
+    translate: list[float],
+    scale: float,
+    shear: list[float],
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST,
+    fill: _FillTypeJIT = None,
+    center: Optional[list[float]] = None,
+) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.RandomAffine` for details."""
+    if torch.jit.is_scripting():  # when scripting, bypass the dispatch below and call the tensor kernel directly
+        return affine_image(
+            inpt,
+            angle=angle,
+            translate=translate,
+            scale=scale,
+            shear=shear,
+            interpolation=interpolation,
+            fill=fill,
+            center=center,
+        )
+
+    _log_api_usage_once(affine)
+
+    kernel = _get_kernel(affine, type(inpt))  # select the kernel by the input's type
+    return kernel(
+        inpt,
+        angle=angle,
+        translate=translate,
+        scale=scale,
+        shear=shear,
+        interpolation=interpolation,
+        fill=fill,
+        center=center,
+    )
+
+
+def _affine_parse_args(
+    angle: Union[int, float],
+    translate: list[float],
+    scale: float,
+    shear: list[float],
+    interpolation: InterpolationMode = InterpolationMode.NEAREST,
+    center: Optional[list[float]] = None,
+) -> tuple[float, list[float], list[float], Optional[list[float]]]:
+    if not isinstance(angle, (int, float)):
+        raise TypeError("Argument angle should be int or float")
+
+    if not isinstance(translate, (list, tuple)):
+        raise TypeError("Argument translate should be a sequence")
+
+    if len(translate) != 2:
+        raise ValueError("Argument translate should be a sequence of length 2")
+
+    if scale <= 0.0:
+        raise ValueError("Argument scale should be positive")
+
+    if not isinstance(shear, (numbers.Number, (list, tuple))):
+        raise TypeError("Shear should be either a single value or a sequence of two values")
+
+    if not isinstance(interpolation, InterpolationMode):  # interpolation is only validated here; it is not part of the return value
+        raise TypeError("Argument interpolation should be a InterpolationMode")
+
+    if isinstance(angle, int):
+        angle = float(angle)
+
+    if isinstance(translate, tuple):
+        translate = list(translate)
+
+    if isinstance(shear, numbers.Number):  # a single number becomes [value, 0.0]
+        shear = [shear, 0.0]
+
+    if isinstance(shear, tuple):
+        shear = list(shear)
+
+    if len(shear) == 1:  # a one-element sequence is duplicated into both entries
+        shear = [shear[0], shear[0]]
+
+    if len(shear) != 2:
+        raise ValueError(f"Shear should be a sequence containing two values. Got {shear}")
+
+    if center is not None:
+        if not isinstance(center, (list, tuple)):
+            raise TypeError("Argument center should be a sequence")
+        else:
+            center = [float(c) for c in center]  # normalized to a list of floats
+
+    return angle, translate, shear, center  # scale is validated above but returned to the caller unchanged by omission
+
+
+def _get_inverse_affine_matrix(
+    center: list[float], angle: float, translate: list[float], scale: float, shear: list[float], inverted: bool = True
+) -> list[float]:
+    # Helper method to compute inverse matrix for affine transformation
+
+    # Pillow requires inverse affine transformation matrix:
+    # Affine matrix is : M = T * C * RotateScaleShear * C^-1
+    #
+    # where T is translation matrix: [1, 0, tx | 0, 1, ty | 0, 0, 1]
+    #       C is translation matrix to keep center: [1, 0, cx | 0, 1, cy | 0, 0, 1]
+    #       RotateScaleShear is rotation with scale and shear matrix
+    #
+    #       RotateScaleShear(a, s, (sx, sy)) =
+    #       = R(a) * S(s) * SHy(sy) * SHx(sx)
+    #       = [ s*cos(a - sy)/cos(sy), s*(-cos(a - sy)*tan(sx)/cos(sy) - sin(a)), 0 ]
+    #         [ s*sin(a - sy)/cos(sy), s*(-sin(a - sy)*tan(sx)/cos(sy) + cos(a)), 0 ]
+    #         [ 0                    , 0                                      , 1 ]
+    # where R is a rotation matrix, S is a scaling matrix, and SHx and SHy are the shears:
+    # SHx(s) = [1, -tan(s)] and SHy(s) = [1      , 0]
+    #          [0, 1      ]              [-tan(s), 1]
+    #
+    # Thus, the inverse is M^-1 = C * RotateScaleShear^-1 * C^-1 * T^-1
+
+    rot = math.radians(angle)  # angle and both shear values are converted from degrees to radians
+    sx = math.radians(shear[0])
+    sy = math.radians(shear[1])
+
+    cx, cy = center
+    tx, ty = translate
+
+    # Cached results
+    cos_sy = math.cos(sy)
+    tan_sx = math.tan(sx)
+    rot_minus_sy = rot - sy
+    cx_plus_tx = cx + tx
+    cy_plus_ty = cy + ty
+
+    # Rotate Scale Shear (RSS) without scaling
+    a = math.cos(rot_minus_sy) / cos_sy
+    b = -(a * tan_sx + math.sin(rot))
+    c = math.sin(rot_minus_sy) / cos_sy
+    d = math.cos(rot) - c * tan_sx
+
+    if inverted:
+        # Inverted rotation matrix with scale and shear
+        # det([[a, b], [c, d]]) == 1, since det(rotation) = 1 and det(shear) = 1
+        matrix = [d / scale, -b / scale, 0.0, -c / scale, a / scale, 0.0]  # first two rows of the 3x3 matrix, flattened row-major
+        # Apply inverse of translation and of center translation: RSS^-1 * C^-1 * T^-1
+        # and then apply center translation: C * RSS^-1 * C^-1 * T^-1
+        matrix[2] += cx - matrix[0] * cx_plus_tx - matrix[1] * cy_plus_ty
+        matrix[5] += cy - matrix[3] * cx_plus_tx - matrix[4] * cy_plus_ty
+    else:
+        matrix = [a * scale, b * scale, 0.0, c * scale, d * scale, 0.0]  # forward (non-inverted) matrix, same flattened layout
+        # Apply inverse of center translation: RSS * C^-1
+        # and then apply translation and center : T * C * RSS * C^-1
+        matrix[2] += cx_plus_tx - matrix[0] * cx - matrix[1] * cy
+        matrix[5] += cy_plus_ty - matrix[3] * cx - matrix[4] * cy
+
+    return matrix
+
+
+def _compute_affine_output_size(matrix: list[float], w: int, h: int) -> tuple[int, int]:
+    if torch.compiler.is_compiling() and not torch.jit.is_scripting():  # the pure-Python variant is used only while compiling and not scripting
+        return _compute_affine_output_size_python(matrix, w, h)
+    else:
+        return _compute_affine_output_size_tensor(matrix, w, h)  # eager and scripted execution use the tensor variant
+
+
+def _compute_affine_output_size_tensor(matrix: list[float], w: int, h: int) -> tuple[int, int]:
+    # Inspired by the PIL implementation:
+    # https://github.com/python-pillow/Pillow/blob/11de3318867e4398057373ee9f12dcb33db7335c/src/PIL/Image.py#L2054
+
+    # pts are the four corner points of the image (only their min/max is used below).
+    # Points are shifted due to affine matrix torch convention about
+    # the center point. Center is (0, 0) for image center pivot point (w * 0.5, h * 0.5)
+    half_w = 0.5 * w
+    half_h = 0.5 * h
+    pts = torch.tensor(
+        [
+            [-half_w, -half_h, 1.0],
+            [-half_w, half_h, 1.0],
+            [half_w, half_h, 1.0],
+            [half_w, -half_h, 1.0],
+        ]
+    )
+    theta = torch.tensor(matrix, dtype=torch.float).view(2, 3)  # the six coefficients as a 2x3 matrix
+    new_pts = torch.matmul(pts, theta.T)  # transformed corners, one (x, y) row per corner
+    min_vals, max_vals = new_pts.aminmax(dim=0)  # per-axis extent over the four corners
+
+    # shift points to [0, w] and [0, h] interval to match PIL results
+    halfs = torch.tensor((half_w, half_h))
+    min_vals.add_(halfs)
+    max_vals.add_(halfs)
+
+    # Truncate precision to 1e-4 to avoid ceil of Xe-15 to 1.0
+    tol = 1e-4
+    inv_tol = 1.0 / tol
+    cmax = max_vals.mul_(inv_tol).trunc_().mul_(tol).ceil_()
+    cmin = min_vals.mul_(inv_tol).trunc_().mul_(tol).floor_()
+    size = cmax.sub_(cmin)
+    return int(size[0]), int(size[1])  # w, h
+
+
+def _compute_affine_output_size_python(matrix: list[float], w: int, h: int) -> tuple[int, int]:
+    """Pure-Python computation of the canvas size enclosing a transformed ``w`` x ``h`` image.
+
+    Args:
+        matrix: Six affine coefficients ``[a, b, c, d, e, f]`` mapping ``(x, y)`` to
+            ``(a * x + b * y + c, d * x + e * y + f)``.
+        w: Width of the input image.
+        h: Height of the input image.
+
+    Returns:
+        ``(width, height)`` of the axis-aligned box around the four transformed image corners.
+    """
+    # Mostly copied from PIL implementation:
+    # The only difference is with transformed points as input matrix has zero translation part here and
+    # PIL has a centered translation part.
+    # https://github.com/python-pillow/Pillow/blob/11de3318867e4398057373ee9f12dcb33db7335c/src/PIL/Image.py#L2054
+
+    a, b, c, d, e, f = matrix
+
+    half_w = 0.5 * w
+    half_h = 0.5 * h
+    # Image corners expressed relative to the image center
+    corners = ((-half_w, -half_h), (half_w, -half_h), (half_w, half_h), (-half_w, half_h))
+
+    # Transform each corner, then shift back so that the original image spans [0, w] x [0, h]
+    shifted_x = [a * x + b * y + c + half_w for x, y in corners]
+    shifted_y = [d * x + e * y + f + half_h for x, y in corners]
+
+    out_w = math.ceil(max(shifted_x)) - math.floor(min(shifted_x))
+    out_h = math.ceil(max(shifted_y)) - math.floor(min(shifted_y))
+    return int(out_w), int(out_h)  # w, h
+
+
+def _apply_grid_transform(img: torch.Tensor, grid: torch.Tensor, mode: str, fill: _FillTypeJIT) -> torch.Tensor:
+    """Resample ``img`` at the locations given by ``grid`` and optionally paint uncovered pixels with ``fill``.
+
+    Args:
+        img: Image tensor whose last three dimensions are read as (channels, height, width).
+        grid: Sampling grid; its dimensions 1 and 2 give the output height and width.
+        mode: Interpolation mode forwarded to ``grid_sample``.
+        fill: ``None`` for no fill, otherwise a scalar or a sequence of fill values.
+
+    Returns:
+        Tensor with the leading dimensions of ``img`` and the spatial size of ``grid``, in the dtype of ``img``.
+    """
+    input_shape = img.shape
+    output_height, output_width = grid.shape[1], grid.shape[2]
+    num_channels, input_height, input_width = input_shape[-3:]
+    output_shape = input_shape[:-3] + (num_channels, output_height, output_width)
+
+    # Nothing to sample from an empty tensor
+    if img.numel() == 0:
+        return img.reshape(output_shape)
+
+    # Collapse all leading dimensions into a single batch dimension
+    img = img.reshape(-1, num_channels, input_height, input_width)
+    squashed_batch_size = img.shape[0]
+
+    # We are using context knowledge that grid should have float dtype
+    fp = img.dtype == grid.dtype
+    float_img = img if fp else img.to(grid.dtype)
+
+    if squashed_batch_size > 1:
+        # Apply same grid to a batch of images
+        grid = grid.expand(squashed_batch_size, -1, -1, -1)
+
+    # Append a dummy mask for customized fill colors, should be faster than grid_sample() twice
+    if fill is not None:
+        mask = torch.ones(
+            (squashed_batch_size, 1, input_height, input_width), dtype=float_img.dtype, device=float_img.device
+        )
+        float_img = torch.cat((float_img, mask), dim=1)
+
+    # With zero padding, the sampled mask channel drops below 1 wherever the grid points outside the input
+    float_img = grid_sample(float_img, grid, mode=mode, padding_mode="zeros", align_corners=False)
+
+    # Fill with required color
+    if fill is not None:
+        # Split the extra mask channel back off the image channels
+        float_img, mask = torch.tensor_split(float_img, indices=(-1,), dim=-3)
+        mask = mask.expand_as(float_img)
+        fill_list = fill if isinstance(fill, (tuple, list)) else [float(fill)]  # type: ignore[arg-type]
+        fill_img = torch.tensor(fill_list, dtype=float_img.dtype, device=float_img.device).view(1, -1, 1, 1)
+        if mode == "nearest":
+            float_img = torch.where(mask < 0.5, fill_img.expand_as(float_img), float_img)
+        else:  # 'bilinear'
+            # The following is mathematically equivalent to:
+            # img * mask + (1.0 - mask) * fill = img * mask - fill * mask + fill = mask * (img - fill) + fill
+            float_img = float_img.sub_(fill_img).mul_(mask).add_(fill_img)
+
+    # Round before casting back when the input dtype differed from the grid dtype
+    img = float_img.round_().to(img.dtype) if not fp else float_img
+
+    return img.reshape(output_shape)
+
+
+def _assert_grid_transform_inputs(
+    image: torch.Tensor,
+    matrix: Optional[list[float]],
+    interpolation: str,
+    fill: _FillTypeJIT,
+    supported_interpolation_modes: list[str],
+    coeffs: Optional[list[float]] = None,
+) -> None:
+    """Validate the arguments of a grid-based transform.
+
+    Args:
+        image: Image tensor; ``image.shape[-3]`` is used as the number of channels.
+        matrix: Optional affine coefficients; must be a list of length 6 when given.
+        interpolation: Interpolation mode name; must be in ``supported_interpolation_modes``.
+        fill: ``None``, a number, or a tuple/list of fill values.
+        supported_interpolation_modes: Accepted values for ``interpolation``.
+        coeffs: Optional coefficients; must have length 8 when given.
+
+    Raises:
+        TypeError: If ``matrix`` is given but is not a list.
+        ValueError: If ``matrix`` or ``coeffs`` has the wrong length, ``fill`` has an unsupported type or a
+            length that matches neither 1 nor the number of channels, or ``interpolation`` is unsupported.
+    """
+    if matrix is not None:
+        if not isinstance(matrix, list):
+            raise TypeError("Argument matrix should be a list")
+        elif len(matrix) != 6:
+            raise ValueError("Argument matrix should have 6 float values")
+
+    if coeffs is not None and len(coeffs) != 8:
+        raise ValueError("Argument coeffs should have 8 float values")
+
+    if fill is not None:
+        if isinstance(fill, (tuple, list)):
+            length = len(fill)
+            num_channels = image.shape[-3]
+            # A fill sequence of length 0 or 1 is accepted for any number of channels
+            if length > 1 and length != num_channels:
+                raise ValueError(
+                    "The number of elements in 'fill' cannot broadcast to match the number of "
+                    f"channels of the image ({length} != {num_channels})"
+                )
+        elif not isinstance(fill, (int, float)):
+            raise ValueError("Argument fill should be either int, float, tuple or list")
+
+    if interpolation not in supported_interpolation_modes:
+        raise ValueError(f"Interpolation mode '{interpolation}' is unsupported with Tensor input")
+
+
+def _affine_grid(
+    theta: torch.Tensor,
+    w: int,
+    h: int,
+    ow: int,
+    oh: int,
+) -> torch.Tensor:
+    """Build a sampling grid of shape ``(1, oh, ow, 2)`` from the affine matrix ``theta``.
+
+    Args:
+        theta: Affine matrix; it is transposed over dims 1 and 2 and batch-multiplied with a
+            ``(1, oh * ow, 3)`` base grid. Callers in this file pass shape ``(1, 2, 3)``.
+        w: Input image width, used to normalize the x coordinates.
+        h: Input image height, used to normalize the y coordinates.
+        ow: Output width (number of grid columns).
+        oh: Output height (number of grid rows).
+
+    Note:
+        ``theta`` is modified in place: the division below is applied to a transposed view of it.
+    """
+    # https://github.com/pytorch/pytorch/blob/74b65c32be68b15dc7c9e8bb62459efbfbde33d8/aten/src/ATen/native/
+    # AffineGridGenerator.cpp#L18
+    # Difference with AffineGridGenerator is that:
+    # 1) we normalize grid values after applying theta
+    # 2) we can normalize by other image size, such that it covers "extend" option like in PIL.Image.rotate
+    dtype = theta.dtype
+    device = theta.device
+
+    # Homogeneous pixel-center coordinates (x, y, 1) of the output canvas, centered on zero
+    base_grid = torch.empty(1, oh, ow, 3, dtype=dtype, device=device)
+    x_grid = torch.linspace((1.0 - ow) * 0.5, (ow - 1.0) * 0.5, steps=ow, device=device)
+    base_grid[..., 0].copy_(x_grid)
+    y_grid = torch.linspace((1.0 - oh) * 0.5, (oh - 1.0) * 0.5, steps=oh, device=device).unsqueeze_(-1)
+    base_grid[..., 1].copy_(y_grid)
+    base_grid[..., 2].fill_(1)
+
+    # Dividing by half the input size folds the coordinate normalization into the matrix
+    rescaled_theta = theta.transpose(1, 2).div_(torch.tensor([0.5 * w, 0.5 * h], dtype=dtype, device=device))
+    output_grid = base_grid.view(1, oh * ow, 3).bmm(rescaled_theta)
+    return output_grid.view(1, oh, ow, 2)
+
+
+@_register_kernel_internal(affine, torch.Tensor)
+@_register_kernel_internal(affine, tv_tensors.Image)
+def affine_image(
+    image: torch.Tensor,
+    angle: Union[int, float],
+    translate: list[float],
+    scale: float,
+    shear: list[float],
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST,
+    fill: _FillTypeJIT = None,
+    center: Optional[list[float]] = None,
+) -> torch.Tensor:
+    """Apply an affine transform to an image tensor, keeping its spatial size.
+
+    The arguments are normalized with ``_affine_parse_args``, turned into an inverse affine matrix and
+    applied by sampling the input on the resulting grid.
+    """
+    interpolation = _check_interpolation(interpolation)
+    angle, translate, shear, center = _affine_parse_args(angle, translate, scale, shear, interpolation, center)
+
+    img_h, img_w = image.shape[-2:]
+
+    if center is None:
+        pivot = [0.0, 0.0]
+    else:
+        # Center values should be in pixel coordinates but translated such that (0, 0) corresponds to image center.
+        pivot = [(c - s * 0.5) for c, s in zip(center, [img_w, img_h])]
+
+    shift = [float(t) for t in translate]
+    inverse_matrix = _get_inverse_affine_matrix(pivot, angle, shift, scale, shear)
+
+    mode = interpolation.value
+    _assert_grid_transform_inputs(image, inverse_matrix, mode, fill, ["nearest", "bilinear"])
+
+    # The sampling grid needs a floating point dtype; integer images fall back to float32
+    if torch.is_floating_point(image):
+        grid_dtype = image.dtype
+    else:
+        grid_dtype = torch.float32
+    theta = torch.tensor(inverse_matrix, dtype=grid_dtype, device=image.device).reshape(1, 2, 3)
+    grid = _affine_grid(theta, w=img_w, h=img_h, ow=img_w, oh=img_h)
+    return _apply_grid_transform(image, grid, mode, fill=fill)
+
+
+@_register_kernel_internal(affine, PIL.Image.Image)
+def _affine_image_pil(
+    image: PIL.Image.Image,
+    angle: Union[int, float],
+    translate: list[float],
+    scale: float,
+    shear: list[float],
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST,
+    fill: _FillTypeJIT = None,
+    center: Optional[list[float]] = None,
+) -> PIL.Image.Image:
+    """Apply an affine transform to a PIL image through ``_FP.affine``."""
+    interpolation = _check_interpolation(interpolation)
+    angle, translate, shear, center = _affine_parse_args(angle, translate, scale, shear, interpolation, center)
+
+    if center is None:
+        # center = (img_size[0] * 0.5 + 0.5, img_size[1] * 0.5 + 0.5)
+        # it is visually better to estimate the center without 0.5 offset
+        # otherwise image rotated by 90 degrees is shifted vs output image of torch.rot90 or F_t.affine
+        img_h, img_w = _get_size_image_pil(image)
+        center = [img_w * 0.5, img_h * 0.5]
+
+    inverse_matrix = _get_inverse_affine_matrix(center, angle, translate, scale, shear)
+    pil_mode = pil_modes_mapping[interpolation]
+    return _FP.affine(image, inverse_matrix, interpolation=pil_mode, fill=fill)
+
+
+# TODO: Consider merging/unifying this with the bbox implementation
+def _affine_keypoints_with_expand(
+    keypoints: torch.Tensor,
+    canvas_size: tuple[int, int],
+    angle: Union[int, float],
+    translate: list[float],
+    scale: float,
+    shear: list[float],
+    center: Optional[list[float]] = None,
+    expand: bool = False,
+) -> tuple[torch.Tensor, tuple[int, int]]:
+    """Apply an affine transform to keypoints and return them with the resulting canvas size.
+
+    Args:
+        keypoints: Tensor of points; it is flattened to ``(-1, 2)`` for the transform.
+        canvas_size: ``(height, width)`` of the canvas the keypoints live on.
+        angle, translate, scale, shear: Affine parameters, normalized by ``_affine_parse_args``.
+        center: Optional pivot; defaults to the middle of the canvas.
+        expand: If true, shift the points into the expanded canvas and return its size.
+
+    Returns:
+        The transformed keypoints in the original shape and dtype, and the (possibly updated) canvas size.
+    """
+    # Nothing to transform: return the input untouched
+    if keypoints.numel() == 0:
+        return keypoints, canvas_size
+
+    original_dtype = keypoints.dtype
+    original_shape = keypoints.shape
+    # Work on a floating point copy so the caller's tensor is never modified in place
+    keypoints = keypoints.clone() if keypoints.is_floating_point() else keypoints.float()
+    dtype = keypoints.dtype
+    device = keypoints.device
+
+    angle, translate, shear, center = _affine_parse_args(
+        angle, translate, scale, shear, InterpolationMode.NEAREST, center
+    )
+
+    if center is None:
+        height, width = canvas_size
+        center = [width * 0.5, height * 0.5]
+
+    # Points are mapped with the forward (non-inverted) matrix
+    affine_vector = _get_inverse_affine_matrix(center, angle, translate, scale, shear, inverted=False)
+    transposed_affine_matrix = (
+        torch.tensor(
+            affine_vector,
+            dtype=dtype,
+            device=device,
+        )
+        .reshape(2, 3)
+        .T
+    )
+
+    # 1) We transform points into a tensor of points with shape (N, 3), where N is the number of points.
+    points = keypoints.reshape(-1, 2)
+    points = torch.cat([points, torch.ones(points.shape[0], 1, device=device, dtype=dtype)], dim=-1)
+    # 2) Now let's transform the points using affine matrix
+    transformed_points = torch.matmul(points, transposed_affine_matrix)
+
+    if expand:
+        # Compute minimum point for transformed image frame:
+        # Points are Top-Left, Bottom-Left, Bottom-Right, Top-Right points.
+        height, width = canvas_size
+        points = torch.tensor(
+            [
+                [0.0, 0.0, 1.0],
+                [0.0, float(height), 1.0],
+                [float(width), float(height), 1.0],
+                [float(width), 0.0, 1.0],
+            ],
+            dtype=dtype,
+            device=device,
+        )
+        new_points = torch.matmul(points, transposed_affine_matrix)
+        tr = torch.amin(new_points, dim=0, keepdim=True)
+        # Translate keypoints
+        transformed_points.sub_(tr)
+        # Estimate meta-data for image with inverted=True
+        affine_vector = _get_inverse_affine_matrix(center, angle, translate, scale, shear)
+        new_width, new_height = _compute_affine_output_size(affine_vector, width, height)
+        canvas_size = (new_height, new_width)
+
+    out_keypoints = transformed_points.reshape(original_shape)
+    out_keypoints = out_keypoints.to(original_dtype)
+
+    return out_keypoints, canvas_size
+
+
+def affine_keypoints(
+    keypoints: torch.Tensor,
+    canvas_size: tuple[int, int],
+    angle: Union[int, float],
+    translate: list[float],
+    scale: float,
+    shear: list[float],
+    center: Optional[list[float]] = None,
+):
+    """Apply an affine transform to keypoints without expanding the canvas.
+
+    Returns whatever ``_affine_keypoints_with_expand`` returns for ``expand=False``: the transformed
+    keypoints together with the canvas size.
+    """
+    result = _affine_keypoints_with_expand(
+        keypoints, canvas_size, angle, translate, scale, shear, center=center, expand=False
+    )
+    return result
+
+
+@_register_kernel_internal(affine, tv_tensors.KeyPoints, tv_tensor_wrapper=False)
+def _affine_keypoints_dispatch(
+    inpt: tv_tensors.KeyPoints,
+    angle: Union[int, float],
+    translate: list[float],
+    scale: float,
+    shear: list[float],
+    center: Optional[list[float]] = None,
+    **kwargs,
+) -> tv_tensors.KeyPoints:
+    """``affine`` dispatcher for ``KeyPoints``: unwrap, run the kernel, re-wrap with the new canvas size."""
+    plain_points = inpt.as_subclass(torch.Tensor)
+    output, canvas_size = affine_keypoints(
+        plain_points, inpt.canvas_size, angle, translate, scale, shear, center=center
+    )
+    return tv_tensors.wrap(output, like=inpt, canvas_size=canvas_size)
+
+
+def _affine_bounding_boxes_with_expand(
+    bounding_boxes: torch.Tensor,
+    format: tv_tensors.BoundingBoxFormat,
+    canvas_size: tuple[int, int],
+    angle: Union[int, float],
+    translate: list[float],
+    scale: float,
+    shear: list[float],
+    center: Optional[list[float]] = None,
+    expand: bool = False,
+    clamping_mode: CLAMPING_MODE_TYPE = "soft",
+) -> tuple[torch.Tensor, tuple[int, int]]:
+    """Apply an affine transform to bounding boxes and return them with the resulting canvas size.
+
+    Args:
+        bounding_boxes: Boxes in ``format``.
+        format: Format of the input boxes; the output uses the same format.
+        canvas_size: ``(height, width)`` of the canvas the boxes live on.
+        angle, translate, scale, shear: Affine parameters, normalized by ``_affine_parse_args``.
+        center: Optional pivot; defaults to the middle of the canvas.
+        expand: If true, shift the boxes into the expanded canvas and return its size.
+        clamping_mode: Forwarded to ``clamp_bounding_boxes``.
+
+    Returns:
+        The transformed boxes in the original shape and dtype, and the (possibly updated) canvas size.
+    """
+    # Nothing to transform: return the input untouched
+    if bounding_boxes.numel() == 0:
+        return bounding_boxes, canvas_size
+
+    original_shape = bounding_boxes.shape
+    dtype = bounding_boxes.dtype
+    need_cast = not bounding_boxes.is_floating_point()
+    # Work on a floating point copy, so the in-place format conversions below never touch the caller's tensor
+    bounding_boxes = bounding_boxes.float() if need_cast else bounding_boxes.clone()
+    device = bounding_boxes.device
+    is_rotated = tv_tensors.is_rotated_bounding_format(format)
+    intermediate_format = tv_tensors.BoundingBoxFormat.XYXYXYXY if is_rotated else tv_tensors.BoundingBoxFormat.XYXY
+    intermediate_shape = 8 if is_rotated else 4
+    bounding_boxes = (
+        convert_bounding_box_format(bounding_boxes, old_format=format, new_format=intermediate_format, inplace=True)
+    ).reshape(-1, intermediate_shape)
+
+    angle, translate, shear, center = _affine_parse_args(
+        angle, translate, scale, shear, InterpolationMode.NEAREST, center
+    )
+
+    if center is None:
+        height, width = canvas_size
+        center = [width * 0.5, height * 0.5]
+
+    # Box corners are mapped with the forward (non-inverted) matrix
+    affine_vector = _get_inverse_affine_matrix(center, angle, translate, scale, shear, inverted=False)
+    transposed_affine_matrix = (
+        torch.tensor(
+            affine_vector,
+            dtype=bounding_boxes.dtype,
+            device=device,
+        )
+        .reshape(2, 3)
+        .T
+    )
+    # 1) Let's transform bboxes into a tensor of 4 points (top-left, top-right, bottom-right, bottom-left corners).
+    # Tensor of points has shape (N * 4, 3), where N is the number of bboxes
+    # Single point structure is similar to
+    # [(xmin, ymin, 1), (xmax, ymin, 1), (xmax, ymax, 1), (xmin, ymax, 1)]
+    if is_rotated:
+        points = bounding_boxes.reshape(-1, 2)
+    else:
+        points = bounding_boxes[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2)
+    points = torch.cat([points, torch.ones(points.shape[0], 1, device=device, dtype=bounding_boxes.dtype)], dim=-1)
+    # 2) Now let's transform the points using affine matrix
+    transformed_points = torch.matmul(points, transposed_affine_matrix)
+    # 3) Reshape transformed points to [N boxes, 4 points, x/y coords]
+    # and compute bounding box from 4 transformed points:
+    if is_rotated:
+        transformed_points = transformed_points.reshape(-1, 8)
+        out_bboxes = _parallelogram_to_bounding_boxes(transformed_points)
+    else:
+        transformed_points = transformed_points.reshape(-1, 4, 2)
+        out_bbox_mins, out_bbox_maxs = torch.aminmax(transformed_points, dim=1)
+        out_bboxes = torch.cat([out_bbox_mins, out_bbox_maxs], dim=1)
+
+    if expand:
+        # Compute minimum point for transformed image frame:
+        # Points are Top-Left, Bottom-Left, Bottom-Right, Top-Right points.
+        height, width = canvas_size
+        points = torch.tensor(
+            [
+                [0.0, 0.0, 1.0],
+                [0.0, float(height), 1.0],
+                [float(width), float(height), 1.0],
+                [float(width), 0.0, 1.0],
+            ],
+            dtype=bounding_boxes.dtype,
+            device=device,
+        )
+        new_points = torch.matmul(points, transposed_affine_matrix)
+        tr = torch.amin(new_points, dim=0, keepdim=True)
+        # Translate bounding boxes
+        # tr holds one (x, y) offset; repeat it to cover every coordinate pair of the intermediate format
+        out_bboxes.sub_(tr.repeat((1, 4 if is_rotated else 2)))
+        # Estimate meta-data for image with inverted=True
+        affine_vector = _get_inverse_affine_matrix(center, angle, translate, scale, shear)
+        new_width, new_height = _compute_affine_output_size(affine_vector, width, height)
+        canvas_size = (new_height, new_width)
+
+    out_bboxes = clamp_bounding_boxes(
+        out_bboxes, format=intermediate_format, canvas_size=canvas_size, clamping_mode=clamping_mode
+    )
+    out_bboxes = convert_bounding_box_format(
+        out_bboxes, old_format=intermediate_format, new_format=format, inplace=True
+    ).reshape(original_shape)
+
+    # Restore the input dtype if the boxes were promoted to float above
+    if need_cast:
+        out_bboxes = out_bboxes.to(dtype)
+    return out_bboxes, canvas_size
+
+
+def affine_bounding_boxes(
+    bounding_boxes: torch.Tensor,
+    format: tv_tensors.BoundingBoxFormat,
+    canvas_size: tuple[int, int],
+    angle: Union[int, float],
+    translate: list[float],
+    scale: float,
+    shear: list[float],
+    center: Optional[list[float]] = None,
+    clamping_mode: CLAMPING_MODE_TYPE = "soft",
+) -> torch.Tensor:
+    """Apply an affine transform to bounding boxes without expanding the canvas.
+
+    Only the transformed boxes are returned; the canvas size reported by the shared helper is dropped.
+    """
+    boxes_and_canvas = _affine_bounding_boxes_with_expand(
+        bounding_boxes,
+        format,
+        canvas_size,
+        angle,
+        translate,
+        scale,
+        shear,
+        center=center,
+        expand=False,
+        clamping_mode=clamping_mode,
+    )
+    return boxes_and_canvas[0]
+
+
+@_register_kernel_internal(affine, tv_tensors.BoundingBoxes, tv_tensor_wrapper=False)
+def _affine_bounding_boxes_dispatch(
+    inpt: tv_tensors.BoundingBoxes,
+    angle: Union[int, float],
+    translate: list[float],
+    scale: float,
+    shear: list[float],
+    center: Optional[list[float]] = None,
+    **kwargs,
+) -> tv_tensors.BoundingBoxes:
+    """``affine`` dispatcher for ``BoundingBoxes``: unwrap, run the kernel with the box metadata, re-wrap."""
+    plain_boxes = inpt.as_subclass(torch.Tensor)
+    output = affine_bounding_boxes(
+        plain_boxes,
+        inpt.format,
+        inpt.canvas_size,
+        angle,
+        translate,
+        scale,
+        shear,
+        center=center,
+        clamping_mode=inpt.clamping_mode,
+    )
+    return tv_tensors.wrap(output, like=inpt)
+
+
+def affine_mask(
+    mask: torch.Tensor,
+    angle: Union[int, float],
+    translate: list[float],
+    scale: float,
+    shear: list[float],
+    fill: _FillTypeJIT = None,
+    center: Optional[list[float]] = None,
+) -> torch.Tensor:
+    """Apply an affine transform to a mask using nearest-neighbour interpolation.
+
+    Masks with fewer than three dimensions get a temporary leading dimension for the image kernel.
+    """
+    needs_squeeze = mask.ndim < 3
+    if needs_squeeze:
+        mask = mask.unsqueeze(0)
+
+    output = affine_image(
+        mask,
+        angle,
+        translate,
+        scale,
+        shear,
+        interpolation=InterpolationMode.NEAREST,
+        fill=fill,
+        center=center,
+    )
+
+    return output.squeeze(0) if needs_squeeze else output
+
+
+@_register_kernel_internal(affine, tv_tensors.Mask, tv_tensor_wrapper=False)
+def _affine_mask_dispatch(
+    inpt: tv_tensors.Mask,
+    angle: Union[int, float],
+    translate: list[float],
+    scale: float,
+    shear: list[float],
+    fill: _FillTypeJIT = None,
+    center: Optional[list[float]] = None,
+    **kwargs,
+) -> tv_tensors.Mask:
+    """``affine`` dispatcher for ``Mask``: unwrap, run the mask kernel, re-wrap like the input."""
+    plain_mask = inpt.as_subclass(torch.Tensor)
+    output = affine_mask(plain_mask, angle, translate, scale, shear, fill=fill, center=center)
+    return tv_tensors.wrap(output, like=inpt)
+
+
+@_register_kernel_internal(affine, tv_tensors.Video)
+def affine_video(
+    video: torch.Tensor,
+    angle: Union[int, float],
+    translate: list[float],
+    scale: float,
+    shear: list[float],
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST,
+    fill: _FillTypeJIT = None,
+    center: Optional[list[float]] = None,
+) -> torch.Tensor:
+    """Apply an affine transform to a video by forwarding every argument to ``affine_image``."""
+    return affine_image(
+        video, angle, translate, scale, shear, interpolation=interpolation, fill=fill, center=center
+    )
+
+
+def rotate(
+    inpt: torch.Tensor,
+    angle: float,
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST,
+    expand: bool = False,
+    center: Optional[list[float]] = None,
+    fill: _FillTypeJIT = None,
+) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.RandomRotation` for details."""
+    # Under TorchScript the type-based kernel lookup below is skipped and the image kernel is called directly
+    if torch.jit.is_scripting():
+        return rotate_image(inpt, angle=angle, interpolation=interpolation, expand=expand, fill=fill, center=center)
+
+    _log_api_usage_once(rotate)
+
+    # Pick the kernel registered for the concrete input type
+    kernel = _get_kernel(rotate, type(inpt))
+    return kernel(inpt, angle=angle, interpolation=interpolation, expand=expand, fill=fill, center=center)
+
+
+@_register_kernel_internal(rotate, torch.Tensor)
+@_register_kernel_internal(rotate, tv_tensors.Image)
+def rotate_image(
+    image: torch.Tensor,
+    angle: float,
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST,
+    expand: bool = False,
+    center: Optional[list[float]] = None,
+    fill: _FillTypeJIT = None,
+) -> torch.Tensor:
+    """Rotate an image tensor by ``angle`` degrees.
+
+    Args:
+        image: Image tensor; its last two dimensions are read as (height, width).
+        angle: Rotation angle in degrees; reduced modulo 360 before use.
+        interpolation: Interpolation mode; ``"nearest"`` and ``"bilinear"`` are accepted.
+        expand: If true, the output canvas is sized with ``_compute_affine_output_size``;
+            otherwise the input size is kept.
+        center: Optional rotation center in pixel coordinates; defaults to the image center.
+        fill: Fill value for pixels that sample outside the input.
+
+    Returns:
+        The rotated image tensor.
+    """
+    angle = angle % 360  # shift angle to [0, 360) range
+
+    # fast path: transpose without affine transform
+    # NOTE: the fast paths return before ``interpolation`` is validated and never use ``fill``
+    if center is None:
+        if angle == 0:
+            return image.clone()
+        if angle == 180:
+            return torch.rot90(image, k=2, dims=(-2, -1))
+
+        # rot90 by a quarter turn swaps height and width, so it is only used when the canvas may
+        # change shape (expand) or the image is square
+        if expand or image.shape[-1] == image.shape[-2]:
+            if angle == 90:
+                return torch.rot90(image, k=1, dims=(-2, -1))
+            if angle == 270:
+                return torch.rot90(image, k=3, dims=(-2, -1))
+
+    interpolation = _check_interpolation(interpolation)
+
+    input_height, input_width = image.shape[-2:]
+
+    center_f = [0.0, 0.0]
+    if center is not None:
+        # Center values should be in pixel coordinates but translated such that (0, 0) corresponds to image center.
+        center_f = [(c - s * 0.5) for c, s in zip(center, [input_width, input_height])]
+
+    # due to current incoherence of rotation angle direction between affine and rotate implementations
+    # we need to set -angle.
+    matrix = _get_inverse_affine_matrix(center_f, -angle, [0.0, 0.0], 1.0, [0.0, 0.0])
+
+    _assert_grid_transform_inputs(image, matrix, interpolation.value, fill, ["nearest", "bilinear"])
+
+    output_width, output_height = (
+        _compute_affine_output_size(matrix, input_width, input_height) if expand else (input_width, input_height)
+    )
+    # The sampling grid needs a floating point dtype; integer images fall back to float32
+    dtype = image.dtype if torch.is_floating_point(image) else torch.float32
+    theta = torch.tensor(matrix, dtype=dtype, device=image.device).reshape(1, 2, 3)
+    grid = _affine_grid(theta, w=input_width, h=input_height, ow=output_width, oh=output_height)
+    return _apply_grid_transform(image, grid, interpolation.value, fill=fill)
+
+
+@_register_kernel_internal(rotate, PIL.Image.Image)
+def _rotate_image_pil(
+    image: PIL.Image.Image,
+    angle: float,
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST,
+    expand: bool = False,
+    center: Optional[list[float]] = None,
+    fill: _FillTypeJIT = None,
+) -> PIL.Image.Image:
+    """Rotate a PIL image through ``_FP.rotate`` after mapping the interpolation mode to its PIL value."""
+    interpolation = _check_interpolation(interpolation)
+    pil_mode = pil_modes_mapping[interpolation]
+
+    return _FP.rotate(image, angle, interpolation=pil_mode, expand=expand, fill=fill, center=center)  # type: ignore[arg-type]
+
+
+def rotate_keypoints(
+    keypoints: torch.Tensor,
+    canvas_size: tuple[int, int],
+    angle: float,
+    expand: bool = False,
+    center: Optional[list[float]] = None,
+) -> tuple[torch.Tensor, tuple[int, int]]:
+    """Rotate keypoints by ``angle`` degrees and return them with the resulting canvas size.
+
+    Implemented as an affine transform with the negated angle, no translation, unit scale and no shear.
+    """
+    no_translation = [0.0, 0.0]
+    no_shear = [0.0, 0.0]
+    return _affine_keypoints_with_expand(
+        keypoints, canvas_size, -angle, no_translation, 1.0, no_shear, center=center, expand=expand
+    )
+
+
+@_register_kernel_internal(rotate, tv_tensors.KeyPoints, tv_tensor_wrapper=False)
+def _rotate_keypoints_dispatch(
+    inpt: tv_tensors.KeyPoints, angle: float, expand: bool = False, center: Optional[list[float]] = None, **kwargs
+) -> tv_tensors.KeyPoints:
+    """``rotate`` dispatcher for ``KeyPoints``: unwrap, run the kernel, re-wrap with the new canvas size.
+
+    Extra keyword arguments (for example ``interpolation`` or ``fill``) are accepted and ignored.
+    """
+    # Hand the kernel a plain tensor, as the other ``tv_tensor_wrapper=False`` dispatchers in this module do;
+    # wrapping is done once, explicitly, on the way out.
+    output, canvas_size = rotate_keypoints(
+        inpt.as_subclass(torch.Tensor),
+        canvas_size=inpt.canvas_size,
+        angle=angle,
+        center=center,
+        expand=expand,
+    )
+    return tv_tensors.wrap(output, like=inpt, canvas_size=canvas_size)
+
+
+def rotate_bounding_boxes(
+    bounding_boxes: torch.Tensor,
+    format: tv_tensors.BoundingBoxFormat,
+    canvas_size: tuple[int, int],
+    angle: float,
+    expand: bool = False,
+    center: Optional[list[float]] = None,
+    clamping_mode: CLAMPING_MODE_TYPE = "soft",
+) -> tuple[torch.Tensor, tuple[int, int]]:
+    """Rotate bounding boxes by ``angle`` degrees and return them with the resulting canvas size.
+
+    Implemented as an affine transform with the negated angle, no translation, unit scale and no shear.
+    """
+    no_translation = [0.0, 0.0]
+    no_shear = [0.0, 0.0]
+    return _affine_bounding_boxes_with_expand(
+        bounding_boxes,
+        format,
+        canvas_size,
+        -angle,
+        no_translation,
+        1.0,
+        no_shear,
+        center=center,
+        expand=expand,
+        clamping_mode=clamping_mode,
+    )
+
+
+@_register_kernel_internal(rotate, tv_tensors.BoundingBoxes, tv_tensor_wrapper=False)
+def _rotate_bounding_boxes_dispatch(
+    inpt: tv_tensors.BoundingBoxes, angle: float, expand: bool = False, center: Optional[list[float]] = None, **kwargs
+) -> tv_tensors.BoundingBoxes:
+    """``rotate`` dispatcher for ``BoundingBoxes``: unwrap, run the kernel, re-wrap with the new canvas size."""
+    plain_boxes = inpt.as_subclass(torch.Tensor)
+    output, canvas_size = rotate_bounding_boxes(
+        plain_boxes,
+        inpt.format,
+        inpt.canvas_size,
+        angle,
+        expand=expand,
+        center=center,
+        clamping_mode=inpt.clamping_mode,
+    )
+    return tv_tensors.wrap(output, like=inpt, canvas_size=canvas_size)
+
+
+def rotate_mask(
+    mask: torch.Tensor,
+    angle: float,
+    expand: bool = False,
+    center: Optional[list[float]] = None,
+    fill: _FillTypeJIT = None,
+) -> torch.Tensor:
+    """Rotate a mask using nearest-neighbour interpolation.
+
+    Masks with fewer than three dimensions get a temporary leading dimension for the image kernel.
+    """
+    needs_squeeze = mask.ndim < 3
+    if needs_squeeze:
+        mask = mask.unsqueeze(0)
+
+    output = rotate_image(
+        mask,
+        angle,
+        interpolation=InterpolationMode.NEAREST,
+        expand=expand,
+        center=center,
+        fill=fill,
+    )
+
+    return output.squeeze(0) if needs_squeeze else output
+
+
+@_register_kernel_internal(rotate, tv_tensors.Mask, tv_tensor_wrapper=False)
+def _rotate_mask_dispatch(
+    inpt: tv_tensors.Mask,
+    angle: float,
+    expand: bool = False,
+    center: Optional[list[float]] = None,
+    fill: _FillTypeJIT = None,
+    **kwargs,
+) -> tv_tensors.Mask:
+    """``rotate`` dispatcher for ``Mask``: unwrap, run the mask kernel, re-wrap like the input."""
+    plain_mask = inpt.as_subclass(torch.Tensor)
+    rotated = rotate_mask(plain_mask, angle, expand=expand, center=center, fill=fill)
+    return tv_tensors.wrap(rotated, like=inpt)
+
+
+@_register_kernel_internal(rotate, tv_tensors.Video)
+def rotate_video(
+    video: torch.Tensor,
+    angle: float,
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST,
+    expand: bool = False,
+    center: Optional[list[float]] = None,
+    fill: _FillTypeJIT = None,
+) -> torch.Tensor:
+    """Rotate a video by forwarding every argument to ``rotate_image``."""
+    return rotate_image(
+        video,
+        angle=angle,
+        interpolation=interpolation,
+        expand=expand,
+        center=center,
+        fill=fill,
+    )
+
+
+def pad(
+    inpt: torch.Tensor,
+    padding: list[int],
+    fill: Optional[Union[int, float, list[float]]] = None,
+    padding_mode: str = "constant",
+) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.Pad` for details."""
+    # Under TorchScript the type-based kernel lookup below is skipped and the image kernel is called directly
+    if torch.jit.is_scripting():
+        return pad_image(inpt, padding=padding, fill=fill, padding_mode=padding_mode)
+
+    _log_api_usage_once(pad)
+
+    # Pick the kernel registered for the concrete input type
+    kernel = _get_kernel(pad, type(inpt))
+    return kernel(inpt, padding=padding, fill=fill, padding_mode=padding_mode)
+
+
+def _parse_pad_padding(padding: Union[int, list[int]]) -> list[int]:
+    """Normalize ``padding`` to ``[left, right, top, bottom]``.
+
+    Accepted inputs are an int (same padding on every side), a 1-element sequence (same on every side),
+    a 2-element sequence ``(left/right, top/bottom)`` or a 4-element sequence ``(left, top, right, bottom)``.
+
+    Raises:
+        ValueError: If a sequence has a length other than 1, 2 or 4.
+        TypeError: If ``padding`` is neither an int nor a tuple/list.
+    """
+    if isinstance(padding, int):
+        return [padding, padding, padding, padding]
+    elif isinstance(padding, (tuple, list)):
+        num_values = len(padding)
+        if num_values == 1:
+            every_side = padding[0]
+            return [every_side, every_side, every_side, every_side]
+        if num_values == 2:
+            horizontal = padding[0]
+            vertical = padding[1]
+            return [horizontal, horizontal, vertical, vertical]
+        if num_values == 4:
+            # Input order is (left, top, right, bottom); output order is (left, right, top, bottom)
+            return [padding[0], padding[2], padding[1], padding[3]]
+        raise ValueError(
+            f"Padding must be an int or a 1, 2, or 4 element tuple, not a {len(padding)} element tuple"
+        )
+    else:
+        raise TypeError(f"`padding` should be an integer or tuple or list of integers, but got {padding}")
+
+
+@_register_kernel_internal(pad, torch.Tensor)
+@_register_kernel_internal(pad, tv_tensors.Image)
+def pad_image(
+    image: torch.Tensor,
+    padding: list[int],
+    fill: Optional[Union[int, float, list[float]]] = None,
+    padding_mode: str = "constant",
+) -> torch.Tensor:
+    """Pad an image tensor on its last two dimensions.
+
+    Args:
+        image: Image tensor to pad.
+        padding: Padding specification accepted by ``_parse_pad_padding``.
+        fill: Fill value; ``None`` is treated as 0. A sequence of length 1 is treated as a scalar, any
+            other sequence is handled as a non-scalar fill.
+        padding_mode: One of ``"constant"``, ``"edge"``, ``"reflect"`` or ``"symmetric"``.
+
+    Raises:
+        ValueError: If ``padding_mode`` is not one of the supported values.
+    """
+    # Be aware that while `padding` has order `[left, top, right, bottom]`, `torch_padding` uses
+    # `[left, right, top, bottom]`. This stems from the fact that we align our API with PIL, but need to use `torch_pad`
+    # internally.
+    torch_padding = _parse_pad_padding(padding)
+
+    if padding_mode not in ("constant", "edge", "reflect", "symmetric"):
+        raise ValueError(
+            f"`padding_mode` should be either `'constant'`, `'edge'`, `'reflect'` or `'symmetric'`, "
+            f"but got `'{padding_mode}'`."
+        )
+
+    if fill is None:
+        fill = 0
+
+    # Scalars and single-element sequences share the scalar path; anything else is a non-scalar fill
+    if isinstance(fill, (int, float)):
+        return _pad_with_scalar_fill(image, torch_padding, fill=fill, padding_mode=padding_mode)
+    elif len(fill) == 1:
+        return _pad_with_scalar_fill(image, torch_padding, fill=fill[0], padding_mode=padding_mode)
+    else:
+        return _pad_with_vector_fill(image, torch_padding, fill=fill, padding_mode=padding_mode)
+
+
+def _pad_with_scalar_fill(
+    image: torch.Tensor,
+    torch_padding: list[int],
+    fill: Union[int, float],
+    padding_mode: str,
+) -> torch.Tensor:
+    """Pad ``image`` with a single fill value or with a border-derived padding mode.
+
+    ``torch_padding`` is forwarded as-is to ``torch_pad`` / ``_pad_symmetric``. The leading dimensions
+    of ``image`` are flattened for the padding call and restored on the result.
+    """
+    orig_shape = image.shape
+    num_channels, height, width = orig_shape[-3:]
+
+    # Flatten every leading dimension into one batch dimension
+    flat_batch = 1
+    for dim in orig_shape[:-3]:
+        flat_batch *= dim
+    image = image.reshape(flat_batch, num_channels, height, width)
+
+    if padding_mode == "edge":
+        # Similar to the padding order, `torch_pad`'s PIL's padding modes don't have the same names. Thus, we map
+        # the PIL name for the padding mode, which we are also using for our API, to the corresponding `torch_pad`
+        # name.
+        padding_mode = "replicate"
+
+    if padding_mode == "constant":
+        image = torch_pad(image, torch_padding, mode=padding_mode, value=float(fill))
+    elif padding_mode in ("reflect", "replicate"):
+        # `torch_pad` only supports `"reflect"` or `"replicate"` padding for floating point inputs.
+        # TODO: See https://github.com/pytorch/pytorch/issues/40763
+        source_dtype = image.dtype
+        if image.is_floating_point():
+            image = torch_pad(image, torch_padding, mode=padding_mode)
+        else:
+            # Round-trip through float32 and restore the original dtype afterwards
+            image = torch_pad(image.to(torch.float32), torch_padding, mode=padding_mode).to(source_dtype)
+    else:  # padding_mode == "symmetric"
+        image = _pad_symmetric(image, torch_padding)
+
+    padded_height, padded_width = image.shape[-2:]
+    return image.reshape(orig_shape[:-3] + (num_channels, padded_height, padded_width))
+
+
+# TODO: This should be removed once torch_pad supports non-scalar padding values
+def _pad_with_vector_fill(
+    image: torch.Tensor,
+    torch_padding: list[int],
+    fill: list[float],
+    padding_mode: str,
+) -> torch.Tensor:
+    """Pad ``image`` with a sequence of fill values; only ``"constant"`` mode is supported.
+
+    The image is first zero-padded, then each padded border strip is overwritten with the fill values.
+
+    Raises:
+        ValueError: If ``padding_mode`` is not ``"constant"``.
+    """
+    if padding_mode != "constant":
+        raise ValueError(f"Padding mode '{padding_mode}' is not supported if fill is not scalar")
+
+    padded = _pad_with_scalar_fill(image, torch_padding, fill=0, padding_mode="constant")
+    left, right, top, bottom = torch_padding
+
+    # We are creating the tensor in the autodetected dtype first and convert to the right one after to avoid an implicit
+    # float -> int conversion. That happens for example for the valid input of a uint8 image with floating point fill
+    # value.
+    fill_values = torch.tensor(fill, device=image.device).to(dtype=image.dtype).reshape(-1, 1, 1)
+
+    # Overwrite each border strip that actually exists
+    if top > 0:
+        padded[..., :top, :] = fill_values
+    if left > 0:
+        padded[..., :, :left] = fill_values
+    if bottom > 0:
+        padded[..., -bottom:, :] = fill_values
+    if right > 0:
+        padded[..., :, -right:] = fill_values
+    return padded
+
+
+# Register `_FP.pad` as the `pad` kernel for `PIL.Image.Image` inputs; `_pad_image_pil` is bound to
+# whatever the registration decorator returns.
+_pad_image_pil = _register_kernel_internal(pad, PIL.Image.Image)(_FP.pad)
+
+
+@_register_kernel_internal(pad, tv_tensors.Mask)
+def pad_mask(
+    mask: torch.Tensor,
+    padding: list[int],
+    fill: Optional[Union[int, float, list[float]]] = None,
+    padding_mode: str = "constant",
+) -> torch.Tensor:
+    """Pad a mask by delegating to ``pad_image``.
+
+    Masks with fewer than three dimensions get a temporary leading dimension for the call.
+
+    Args:
+        mask: Mask tensor.
+        padding: Padding specification forwarded to ``pad_image``.
+        fill: Scalar fill value; ``None`` is treated as ``0``.
+        padding_mode: Padding mode forwarded to ``pad_image``.
+
+    Raises:
+        ValueError: If ``fill`` is a tuple or list.
+    """
+    if fill is None:
+        fill = 0
+
+    if isinstance(fill, (tuple, list)):
+        raise ValueError("Non-scalar fill value is not supported")
+
+    needs_squeeze = mask.ndim < 3
+    if needs_squeeze:
+        mask = mask.unsqueeze(0)
+
+    padded = pad_image(mask, padding=padding, fill=fill, padding_mode=padding_mode)
+
+    return padded.squeeze(0) if needs_squeeze else padded
+
+
+def pad_keypoints(
+    keypoints: torch.Tensor, canvas_size: tuple[int, int], padding: list[int], padding_mode: str = "constant"
+):
+    """Shift keypoints by the left/top padding and enlarge the canvas size.
+
+    Args:
+        keypoints: Keypoint tensor; ``[left, top]`` is added to it by broadcasting.
+        canvas_size: Canvas size as ``(height, width)``.
+        padding: Padding specification parsed by ``_parse_pad_padding``.
+        padding_mode: Only ``"constant"`` is supported.
+
+    Returns:
+        A tuple of the shifted keypoints and the padded canvas size.
+
+    Raises:
+        ValueError: If ``padding_mode`` is not supported.
+    """
+    SUPPORTED_MODES = ["constant"]
+    if padding_mode not in SUPPORTED_MODES:
+        # TODO: add support of other padding modes
+        raise ValueError(
+            f"Padding mode '{padding_mode}' is not supported with KeyPoints"
+            f" (supported modes are {', '.join(SUPPORTED_MODES)})"
+        )
+
+    pad_left, pad_right, pad_top, pad_bottom = _parse_pad_padding(padding)
+
+    # Only the left/top padding moves the points; right/bottom only grow the canvas.
+    shift = torch.tensor([pad_left, pad_top], dtype=keypoints.dtype, device=keypoints.device)
+    padded_canvas = (canvas_size[0] + pad_top + pad_bottom, canvas_size[1] + pad_left + pad_right)
+    return keypoints + shift, padded_canvas
+
+
+@_register_kernel_internal(pad, tv_tensors.KeyPoints, tv_tensor_wrapper=False)
+def _pad_keypoints_dispatch(
+    keypoints: tv_tensors.KeyPoints, padding: list[int], padding_mode: str = "constant", **kwargs
+) -> tv_tensors.KeyPoints:
+    """Unwrap the keypoints, pad them with ``pad_keypoints`` and re-wrap with the new canvas size.
+
+    Extra keyword arguments are accepted and ignored.
+    """
+    plain_keypoints = keypoints.as_subclass(torch.Tensor)
+    padded, new_canvas_size = pad_keypoints(
+        plain_keypoints,
+        canvas_size=keypoints.canvas_size,
+        padding=padding,
+        padding_mode=padding_mode,
+    )
+    return tv_tensors.wrap(padded, like=keypoints, canvas_size=new_canvas_size)
+
+
+def pad_bounding_boxes(
+    bounding_boxes: torch.Tensor,
+    format: tv_tensors.BoundingBoxFormat,
+    canvas_size: tuple[int, int],
+    padding: list[int],
+    padding_mode: str = "constant",
+    clamping_mode: CLAMPING_MODE_TYPE = "soft",
+) -> tuple[torch.Tensor, tuple[int, int]]:
+    """Shift bounding boxes by the left/top padding and enlarge the canvas size.
+
+    Args:
+        bounding_boxes: Box tensor in ``format``.
+        format: Box format; decides which coordinates receive the offset.
+        canvas_size: Canvas size as ``(height, width)``.
+        padding: Padding specification parsed by ``_parse_pad_padding``.
+        padding_mode: Only ``"constant"`` is supported.
+        clamping_mode: Forwarded to ``clamp_bounding_boxes``.
+
+    Returns:
+        A tuple of the clamped, shifted boxes and the padded canvas size.
+
+    Raises:
+        ValueError: If ``padding_mode`` is not ``"constant"``.
+    """
+    if padding_mode != "constant":
+        # TODO: add support of other padding modes
+        raise ValueError(f"Padding mode '{padding_mode}' is not supported with bounding boxes")
+
+    left, right, top, bottom = _parse_pad_padding(padding)
+
+    # Per-coordinate offsets: position entries are shifted, size/angle entries are left untouched.
+    box_format = tv_tensors.BoundingBoxFormat
+    if format == box_format.XYXYXYXY:
+        offsets = [left, top] * 4
+    elif format in (box_format.XYWHR, box_format.CXCYWHR):
+        offsets = [left, top, 0, 0, 0]
+    elif format == box_format.XYXY:
+        offsets = [left, top] * 2
+    else:
+        offsets = [left, top, 0, 0]
+    offset_tensor = torch.tensor(offsets, dtype=bounding_boxes.dtype, device=bounding_boxes.device)
+    shifted = bounding_boxes + offset_tensor
+
+    old_height, old_width = canvas_size
+    padded_canvas = (old_height + top + bottom, old_width + left + right)
+
+    clamped = clamp_bounding_boxes(
+        shifted, format=format, canvas_size=padded_canvas, clamping_mode=clamping_mode
+    )
+    return clamped, padded_canvas
+
+
+@_register_kernel_internal(pad, tv_tensors.BoundingBoxes, tv_tensor_wrapper=False)
+def _pad_bounding_boxes_dispatch(
+    inpt: tv_tensors.BoundingBoxes, padding: list[int], padding_mode: str = "constant", **kwargs
+) -> tv_tensors.BoundingBoxes:
+    """Unwrap the boxes, pad them with ``pad_bounding_boxes`` and re-wrap with the new canvas size.
+
+    Format, canvas size and clamping mode are read from ``inpt``; extra keyword arguments are
+    accepted and ignored.
+    """
+    plain_boxes = inpt.as_subclass(torch.Tensor)
+    padded, new_canvas_size = pad_bounding_boxes(
+        plain_boxes,
+        format=inpt.format,
+        canvas_size=inpt.canvas_size,
+        padding=padding,
+        padding_mode=padding_mode,
+        clamping_mode=inpt.clamping_mode,
+    )
+    return tv_tensors.wrap(padded, like=inpt, canvas_size=new_canvas_size)
+
+
+# Kernel registered for `tv_tensors.Video` inputs of `pad`.
+@_register_kernel_internal(pad, tv_tensors.Video)
+def pad_video(
+    video: torch.Tensor,
+    padding: list[int],
+    fill: Optional[Union[int, float, list[float]]] = None,
+    padding_mode: str = "constant",
+) -> torch.Tensor:
+    # Videos take the image code path: every argument is forwarded to `pad_image` unchanged.
+    return pad_image(video, padding, fill=fill, padding_mode=padding_mode)
+
+
+def crop(inpt: torch.Tensor, top: int, left: int, height: int, width: int) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.RandomCrop` for details."""
+    # Under TorchScript, skip usage logging and kernel lookup and call the tensor kernel directly.
+    if torch.jit.is_scripting():
+        return crop_image(inpt, top=top, left=left, height=height, width=width)
+
+    _log_api_usage_once(crop)
+
+    # Look up the kernel registered for the concrete input type and delegate to it.
+    kernel = _get_kernel(crop, type(inpt))
+    return kernel(inpt, top=top, left=left, height=height, width=width)
+
+
+@_register_kernel_internal(crop, torch.Tensor)
+@_register_kernel_internal(crop, tv_tensors.Image)
+def crop_image(image: torch.Tensor, top: int, left: int, height: int, width: int) -> torch.Tensor:
+    """Crop the last two dimensions of ``image`` to the requested window.
+
+    Parts of the window that fall outside the image are filled with zeros.
+
+    Args:
+        image: Tensor whose last two dimensions are height and width.
+        top: Row of the window's upper edge; may be negative.
+        left: Column of the window's left edge; may be negative.
+        height: Window height.
+        width: Window width.
+
+    Returns:
+        The cropped (and, when the window leaves the image, zero-padded) tensor.
+    """
+    h, w = image.shape[-2:]
+
+    right = left + width
+    bottom = top + height
+
+    if left < 0 or top < 0 or right > w or bottom > h:
+        # Keep only the part of the window that overlaps the image. Both the start and the stop of
+        # each slice are clamped at 0: a negative stop would otherwise be interpreted as an offset
+        # from the end and select unrelated rows/columns when the window lies entirely above or to
+        # the left of the image.
+        image = image[..., max(top, 0) : max(bottom, 0), max(left, 0) : max(right, 0)]
+        # Amount of the window missing on each side, ordered left, right, top, bottom.
+        torch_padding = [
+            max(min(right, 0) - left, 0),
+            max(right - max(w, left), 0),
+            max(min(bottom, 0) - top, 0),
+            max(bottom - max(h, top), 0),
+        ]
+        return _pad_with_scalar_fill(image, torch_padding, fill=0, padding_mode="constant")
+    return image[..., top:bottom, left:right]
+
+
+# Alias `_FP.crop` as the PIL crop kernel and register it for `PIL.Image.Image` inputs of `crop`.
+_crop_image_pil = _FP.crop
+_register_kernel_internal(crop, PIL.Image.Image)(_crop_image_pil)
+
+
+def crop_keypoints(
+    keypoints: torch.Tensor,
+    top: int,
+    left: int,
+    height: int,
+    width: int,
+) -> tuple[torch.Tensor, tuple[int, int]]:
+    """Translate keypoints into the coordinate frame of a crop window.
+
+    Args:
+        keypoints: Keypoint tensor; ``[left, top]`` is subtracted from it by broadcasting.
+        top: Row of the window's upper edge.
+        left: Column of the window's left edge.
+        height: Window height; becomes the new canvas height.
+        width: Window width; becomes the new canvas width.
+
+    Returns:
+        A tuple of the translated keypoints and the canvas size ``(height, width)``.
+    """
+    origin = torch.tensor([left, top], dtype=keypoints.dtype, device=keypoints.device)
+    return keypoints - origin, (height, width)
+
+
+@_register_kernel_internal(crop, tv_tensors.KeyPoints, tv_tensor_wrapper=False)
+def _crop_keypoints_dispatch(
+    inpt: tv_tensors.KeyPoints, top: int, left: int, height: int, width: int
+) -> tv_tensors.KeyPoints:
+    """Unwrap the keypoints, crop them with ``crop_keypoints`` and re-wrap with the new canvas size."""
+    plain_keypoints = inpt.as_subclass(torch.Tensor)
+    cropped, new_canvas_size = crop_keypoints(plain_keypoints, top=top, left=left, height=height, width=width)
+    return tv_tensors.wrap(cropped, like=inpt, canvas_size=new_canvas_size)
+
+
+def crop_bounding_boxes(
+    bounding_boxes: torch.Tensor,
+    format: tv_tensors.BoundingBoxFormat,
+    top: int,
+    left: int,
+    height: int,
+    width: int,
+    clamping_mode: CLAMPING_MODE_TYPE = "soft",
+) -> tuple[torch.Tensor, tuple[int, int]]:
+    """Translate bounding boxes into the coordinate frame of a crop window.
+
+    Args:
+        bounding_boxes: Box tensor in ``format``.
+        format: Box format; decides which coordinates receive the offset.
+        top: Row of the window's upper edge.
+        left: Column of the window's left edge.
+        height: Window height; becomes the new canvas height.
+        width: Window width; becomes the new canvas width.
+        clamping_mode: Forwarded to ``clamp_bounding_boxes``.
+
+    Returns:
+        A tuple of the clamped, translated boxes and the canvas size ``(height, width)``.
+    """
+    # Crop or implicit pad if left and/or top have negative values:
+    box_format = tv_tensors.BoundingBoxFormat
+    if format == box_format.XYXYXYXY:
+        offsets = [left, top] * 4
+    elif format in (box_format.XYWHR, box_format.CXCYWHR):
+        offsets = [left, top, 0, 0, 0]
+    elif format == box_format.XYXY:
+        offsets = [left, top] * 2
+    else:
+        offsets = [left, top, 0, 0]
+
+    offset_tensor = torch.tensor(offsets, dtype=bounding_boxes.dtype, device=bounding_boxes.device)
+    translated = bounding_boxes - offset_tensor
+    cropped_canvas = (height, width)
+
+    if format == box_format.XYXYXYXY:
+        translated = _parallelogram_to_bounding_boxes(translated)
+
+    clamped = clamp_bounding_boxes(
+        translated, format=format, canvas_size=cropped_canvas, clamping_mode=clamping_mode
+    )
+    return clamped, cropped_canvas
+
+
+@_register_kernel_internal(crop, tv_tensors.BoundingBoxes, tv_tensor_wrapper=False)
+def _crop_bounding_boxes_dispatch(
+    inpt: tv_tensors.BoundingBoxes, top: int, left: int, height: int, width: int
+) -> tv_tensors.BoundingBoxes:
+    """Unwrap the boxes, crop them with ``crop_bounding_boxes`` and re-wrap with the new canvas size.
+
+    Format and clamping mode are read from ``inpt``.
+    """
+    plain_boxes = inpt.as_subclass(torch.Tensor)
+    cropped, new_canvas_size = crop_bounding_boxes(
+        plain_boxes,
+        format=inpt.format,
+        top=top,
+        left=left,
+        height=height,
+        width=width,
+        clamping_mode=inpt.clamping_mode,
+    )
+    return tv_tensors.wrap(cropped, like=inpt, canvas_size=new_canvas_size)
+
+
+@_register_kernel_internal(crop, tv_tensors.Mask)
+def crop_mask(mask: torch.Tensor, top: int, left: int, height: int, width: int) -> torch.Tensor:
+    """Crop a mask by delegating to ``crop_image``.
+
+    Masks with fewer than three dimensions get a temporary leading dimension for the call.
+    """
+    needs_squeeze = mask.ndim < 3
+    if needs_squeeze:
+        mask = mask.unsqueeze(0)
+
+    cropped = crop_image(mask, top, left, height, width)
+
+    return cropped.squeeze(0) if needs_squeeze else cropped
+
+
+# Kernel registered for `tv_tensors.Video` inputs of `crop`.
+@_register_kernel_internal(crop, tv_tensors.Video)
+def crop_video(video: torch.Tensor, top: int, left: int, height: int, width: int) -> torch.Tensor:
+    # Videos take the image code path: every argument is forwarded to `crop_image` unchanged.
+    return crop_image(video, top, left, height, width)
+
+
+def perspective(
+    inpt: torch.Tensor,
+    startpoints: Optional[list[list[int]]],
+    endpoints: Optional[list[list[int]]],
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+    fill: _FillTypeJIT = None,
+    coefficients: Optional[list[float]] = None,
+) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.RandomPerspective` for details."""
+    # Under TorchScript, skip usage logging and kernel lookup and call the tensor kernel directly.
+    if torch.jit.is_scripting():
+        return perspective_image(
+            inpt,
+            startpoints=startpoints,
+            endpoints=endpoints,
+            interpolation=interpolation,
+            fill=fill,
+            coefficients=coefficients,
+        )
+
+    _log_api_usage_once(perspective)
+
+    # Look up the kernel registered for the concrete input type and delegate to it.
+    kernel = _get_kernel(perspective, type(inpt))
+    return kernel(
+        inpt,
+        startpoints=startpoints,
+        endpoints=endpoints,
+        interpolation=interpolation,
+        fill=fill,
+        coefficients=coefficients,
+    )
+
+
+def _perspective_grid(coeffs: list[float], ow: int, oh: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
+    # Builds a sampling grid of shape (1, oh, ow, 2) from the eight perspective coefficients.
+    # `coeffs` must provide indices 0..7; `ow`/`oh` are the output width/height.
+    #
+    # https://github.com/python-pillow/Pillow/blob/4634eafe3c695a014267eefdce830b4a825beed7/
+    # src/libImaging/Geometry.c#L394
+
+    #
+    # x_out = (coeffs[0] * x + coeffs[1] * y + coeffs[2]) / (coeffs[6] * x + coeffs[7] * y + 1)
+    # y_out = (coeffs[3] * x + coeffs[4] * y + coeffs[5]) / (coeffs[6] * x + coeffs[7] * y + 1)
+    #
+    # theta1 holds the two numerator rows, theta2 the (duplicated) denominator row.
+    theta1 = torch.tensor(
+        [[[coeffs[0], coeffs[1], coeffs[2]], [coeffs[3], coeffs[4], coeffs[5]]]], dtype=dtype, device=device
+    )
+    theta2 = torch.tensor([[[coeffs[6], coeffs[7], 1.0], [coeffs[6], coeffs[7], 1.0]]], dtype=dtype, device=device)
+
+    # Homogeneous (x, y, 1) coordinates for every output position, offset by d = 0.5.
+    d = 0.5
+    base_grid = torch.empty(1, oh, ow, 3, dtype=dtype, device=device)
+    x_grid = torch.linspace(d, ow + d - 1.0, steps=ow, device=device, dtype=dtype)
+    base_grid[..., 0].copy_(x_grid)
+    y_grid = torch.linspace(d, oh + d - 1.0, steps=oh, device=device, dtype=dtype).unsqueeze_(-1)
+    base_grid[..., 1].copy_(y_grid)
+    base_grid[..., 2].fill_(1)
+
+    # Dividing the numerator by half the output size and subtracting 1 after the projective division
+    # rescales the result; presumably this yields the normalized [-1, 1] coordinates expected by
+    # `_apply_grid_transform` — confirm against that helper.
+    rescaled_theta1 = theta1.transpose(1, 2).div_(torch.tensor([0.5 * ow, 0.5 * oh], dtype=dtype, device=device))
+    shape = (1, oh * ow, 3)
+    output_grid1 = base_grid.view(shape).bmm(rescaled_theta1)
+    output_grid2 = base_grid.view(shape).bmm(theta2.transpose(1, 2))
+
+    output_grid = output_grid1.div_(output_grid2).sub_(1.0)
+    return output_grid.view(1, oh, ow, 2)
+
+
+def _perspective_coefficients(
+    startpoints: Optional[list[list[int]]],
+    endpoints: Optional[list[list[int]]],
+    coefficients: Optional[list[float]],
+) -> list[float]:
+    """Resolve the perspective coefficients from either explicit values or point pairs.
+
+    Args:
+        startpoints: Start points, or ``None``.
+        endpoints: End points, or ``None``.
+        coefficients: Explicit coefficients (must have length 8), or ``None``.
+
+    Returns:
+        ``coefficients`` itself when given, otherwise the result of
+        ``_get_perspective_coeffs(startpoints, endpoints)``.
+
+    Raises:
+        ValueError: If both inputs are given, if neither is fully given, or if
+            ``coefficients`` does not have 8 values.
+    """
+    if coefficients is not None:
+        if startpoints is not None and endpoints is not None:
+            raise ValueError("The startpoints/endpoints and the coefficients shouldn't be defined concurrently.")
+        if len(coefficients) != 8:
+            raise ValueError("Argument coefficients should have 8 float values")
+        return coefficients
+
+    if startpoints is not None and endpoints is not None:
+        return _get_perspective_coeffs(startpoints, endpoints)
+
+    raise ValueError("Either the startpoints/endpoints or the coefficients must have non `None` values.")
+
+
+# Kernel registered for plain tensors and `tv_tensors.Image` inputs of `perspective`.
+@_register_kernel_internal(perspective, torch.Tensor)
+@_register_kernel_internal(perspective, tv_tensors.Image)
+def perspective_image(
+    image: torch.Tensor,
+    startpoints: Optional[list[list[int]]],
+    endpoints: Optional[list[list[int]]],
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+    fill: _FillTypeJIT = None,
+    coefficients: Optional[list[float]] = None,
+) -> torch.Tensor:
+    # Resolve the coefficients first (raises ValueError on inconsistent arguments), then normalize
+    # the interpolation argument.
+    perspective_coeffs = _perspective_coefficients(startpoints, endpoints, coefficients)
+    interpolation = _check_interpolation(interpolation)
+
+    # Only "nearest" and "bilinear" are listed as supported interpolation modes here.
+    _assert_grid_transform_inputs(
+        image,
+        matrix=None,
+        interpolation=interpolation.value,
+        fill=fill,
+        supported_interpolation_modes=["nearest", "bilinear"],
+        coeffs=perspective_coeffs,
+    )
+
+    # The output has the same spatial size as the input; the grid is built in the image dtype for
+    # floating point images and in float32 otherwise.
+    oh, ow = image.shape[-2:]
+    dtype = image.dtype if torch.is_floating_point(image) else torch.float32
+    grid = _perspective_grid(perspective_coeffs, ow=ow, oh=oh, dtype=dtype, device=image.device)
+    return _apply_grid_transform(image, grid, interpolation.value, fill=fill)
+
+
+@_register_kernel_internal(perspective, PIL.Image.Image)
+def _perspective_image_pil(
+    image: PIL.Image.Image,
+    startpoints: Optional[list[list[int]]],
+    endpoints: Optional[list[list[int]]],
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+    fill: _FillTypeJIT = None,
+    coefficients: Optional[list[float]] = None,
+) -> PIL.Image.Image:
+    """Apply a perspective transform to a PIL image via ``_FP.perspective``.
+
+    The coefficients are resolved with ``_perspective_coefficients`` and the interpolation is
+    translated through ``pil_modes_mapping``.
+    """
+    coeffs = _perspective_coefficients(startpoints, endpoints, coefficients)
+    pil_interpolation = pil_modes_mapping[_check_interpolation(interpolation)]
+    return _FP.perspective(image, coeffs, interpolation=pil_interpolation, fill=fill)
+
+
+def perspective_keypoints(
+    keypoints: torch.Tensor,
+    canvas_size: tuple[int, int],
+    startpoints: Optional[list[list[int]]],
+    endpoints: Optional[list[list[int]]],
+    coefficients: Optional[list[float]] = None,
+) -> torch.Tensor:
+    """Apply a perspective transform to keypoints.
+
+    Args:
+        keypoints: Keypoint tensor; it is flattened to ``(-1, 2)`` for the computation.
+        canvas_size: Not read by this function.
+        startpoints: Start points, or ``None`` when ``coefficients`` is given.
+        endpoints: End points, or ``None`` when ``coefficients`` is given.
+        coefficients: Explicit perspective coefficients, or ``None``.
+
+    Returns:
+        The transformed keypoints with the input's shape and dtype. An empty input is
+        returned unchanged.
+
+    Raises:
+        RuntimeError: If the coefficients cannot be inverted (zero denominator).
+        ValueError: Propagated from ``_perspective_coefficients`` for inconsistent arguments.
+    """
+    if keypoints.numel() == 0:
+        return keypoints
+
+    original_dtype = keypoints.dtype
+    original_shape = keypoints.shape
+    # Compute in the keypoint dtype when it is floating point, in float32 otherwise.
+    dtype = original_dtype if torch.is_floating_point(keypoints) else torch.float32
+    device = keypoints.device
+
+    perspective_coeffs = _perspective_coefficients(startpoints, endpoints, coefficients)
+
+    denom = perspective_coeffs[0] * perspective_coeffs[4] - perspective_coeffs[1] * perspective_coeffs[3]
+    if denom == 0:
+        raise RuntimeError(
+            f"Provided perspective_coeffs {perspective_coeffs} can not be inverted to transform keypoints. "
+            f"Denominator is zero, denom={denom}"
+        )
+
+    theta1, theta2 = _compute_perspective_thetas(perspective_coeffs, dtype, device, denom)
+
+    # Homogeneous coordinates (x, y, 1). Both the points and the ones column are created in the
+    # computation dtype so they match the thetas; a default-dtype ones column would promote
+    # half-precision points to float32 and make the matmul below fail on mismatched dtypes.
+    points = keypoints.reshape(-1, 2).to(dtype)
+    ones = torch.ones(points.shape[0], 1, dtype=dtype, device=device)
+    points = torch.cat([points, ones], dim=-1)
+
+    numer_points = torch.matmul(points, theta1.T)
+    denom_points = torch.matmul(points, theta2.T)
+    transformed_points = numer_points.div_(denom_points)
+    return transformed_points.to(original_dtype).reshape(original_shape)
+
+
+@_register_kernel_internal(perspective, tv_tensors.KeyPoints, tv_tensor_wrapper=False)
+def _perspective_keypoints_dispatch(
+    inpt: tv_tensors.KeyPoints,
+    startpoints: Optional[list[list[int]]],
+    endpoints: Optional[list[list[int]]],
+    coefficients: Optional[list[float]] = None,
+    **kwargs,
+) -> tv_tensors.KeyPoints:
+    """Unwrap the keypoints, transform them with ``perspective_keypoints`` and re-wrap the result.
+
+    Extra keyword arguments are accepted and ignored.
+    """
+    plain_keypoints = inpt.as_subclass(torch.Tensor)
+    transformed = perspective_keypoints(
+        plain_keypoints,
+        canvas_size=inpt.canvas_size,
+        startpoints=startpoints,
+        endpoints=endpoints,
+        coefficients=coefficients,
+    )
+    return tv_tensors.wrap(transformed, like=inpt)
+
+
+def perspective_bounding_boxes(
+    bounding_boxes: torch.Tensor,
+    format: tv_tensors.BoundingBoxFormat,
+    canvas_size: tuple[int, int],
+    startpoints: Optional[list[list[int]]],
+    endpoints: Optional[list[list[int]]],
+    coefficients: Optional[list[float]] = None,
+    clamping_mode: CLAMPING_MODE_TYPE = "soft",
+) -> torch.Tensor:
+    """Apply a perspective transform to bounding boxes.
+
+    Boxes are converted to an intermediate corner format (``XYXYXYXY`` for rotated formats,
+    ``XYXY`` otherwise), their corners are transformed, and a new box is derived from the
+    transformed corners before clamping and converting back to ``format``.
+
+    Args:
+        bounding_boxes: Box tensor in ``format``.
+        format: Format of ``bounding_boxes``.
+        canvas_size: Canvas size forwarded to ``clamp_bounding_boxes``.
+        startpoints: Start points, or ``None`` when ``coefficients`` is given.
+        endpoints: End points, or ``None`` when ``coefficients`` is given.
+        coefficients: Explicit perspective coefficients, or ``None``.
+        clamping_mode: Forwarded to ``clamp_bounding_boxes``.
+
+    Returns:
+        The transformed boxes with the input's shape and dtype. An empty input is returned
+        unchanged.
+
+    Raises:
+        RuntimeError: If the coefficients cannot be inverted (zero denominator).
+        ValueError: Propagated from ``_perspective_coefficients`` for inconsistent arguments.
+    """
+    if bounding_boxes.numel() == 0:
+        return bounding_boxes
+
+    perspective_coeffs = _perspective_coefficients(startpoints, endpoints, coefficients)
+
+    original_shape = bounding_boxes.shape
+    original_dtype = bounding_boxes.dtype
+    is_rotated = tv_tensors.is_rotated_bounding_format(format)
+    intermediate_format = tv_tensors.BoundingBoxFormat.XYXYXYXY if is_rotated else tv_tensors.BoundingBoxFormat.XYXY
+    # TODO: first cast to float if bbox is int64 before convert_bounding_box_format
+    bounding_boxes = (
+        convert_bounding_box_format(bounding_boxes, old_format=format, new_format=intermediate_format)
+    ).reshape(-1, 8 if is_rotated else 4)
+
+    # Compute in the box dtype when it is floating point, in float32 otherwise.
+    dtype = bounding_boxes.dtype if torch.is_floating_point(bounding_boxes) else torch.float32
+    device = bounding_boxes.device
+
+    # perspective_coeffs are computed as endpoint -> start point
+    # We have to invert perspective_coeffs for bboxes:
+    # (x, y) - end point and (x_out, y_out) - start point
+    #   x_out = (coeffs[0] * x + coeffs[1] * y + coeffs[2]) / (coeffs[6] * x + coeffs[7] * y + 1)
+    #   y_out = (coeffs[3] * x + coeffs[4] * y + coeffs[5]) / (coeffs[6] * x + coeffs[7] * y + 1)
+    # and we would like to get:
+    # x = (inv_coeffs[0] * x_out + inv_coeffs[1] * y_out + inv_coeffs[2])
+    #       / (inv_coeffs[6] * x_out + inv_coeffs[7] * y_out + 1)
+    # y = (inv_coeffs[3] * x_out + inv_coeffs[4] * y_out + inv_coeffs[5])
+    #       / (inv_coeffs[6] * x_out + inv_coeffs[7] * y_out + 1)
+    # and compute inv_coeffs in terms of coeffs
+
+    denom = perspective_coeffs[0] * perspective_coeffs[4] - perspective_coeffs[1] * perspective_coeffs[3]
+    if denom == 0:
+        raise RuntimeError(
+            f"Provided perspective_coeffs {perspective_coeffs} can not be inverted to transform bounding boxes. "
+            f"Denominator is zero, denom={denom}"
+        )
+
+    theta1, theta2 = _compute_perspective_thetas(perspective_coeffs, dtype, device, denom)
+
+    # 1) Let's transform bboxes into a tensor of 4 points (top-left, top-right, bottom-left, bottom-right corners).
+    # Tensor of points has shape (N * 4, 3), where N is the number of bboxes
+    # Single point structure is similar to
+    # [(xmin, ymin, 1), (xmax, ymin, 1), (xmax, ymax, 1), (xmin, ymax, 1)]
+    points = bounding_boxes if is_rotated else bounding_boxes[:, [[0, 1], [2, 1], [2, 3], [0, 3]]]
+    # Both the points and the ones column are created in the computation dtype so they match the
+    # thetas; a default-dtype ones column would promote half-precision points to float32 and make
+    # the matmul below fail on mismatched dtypes.
+    points = points.reshape(-1, 2).to(dtype)
+    points = torch.cat([points, torch.ones(points.shape[0], 1, dtype=dtype, device=device)], dim=-1)
+    # 2) Now let's transform the points using perspective matrices
+    #   x_out = (coeffs[0] * x + coeffs[1] * y + coeffs[2]) / (coeffs[6] * x + coeffs[7] * y + 1)
+    #   y_out = (coeffs[3] * x + coeffs[4] * y + coeffs[5]) / (coeffs[6] * x + coeffs[7] * y + 1)
+
+    numer_points = torch.matmul(points, theta1.T)
+    denom_points = torch.matmul(points, theta2.T)
+    transformed_points = numer_points.div_(denom_points)
+
+    # 3) Reshape transformed points to [N boxes, 4 points, x/y coords]
+    # and compute bounding box from 4 transformed points:
+    if is_rotated:
+        transformed_points = transformed_points.reshape(-1, 8)
+        out_bboxes = _parallelogram_to_bounding_boxes(transformed_points)
+    else:
+        transformed_points = transformed_points.reshape(-1, 4, 2)
+        out_bbox_mins, out_bbox_maxs = torch.aminmax(transformed_points, dim=1)
+        out_bboxes = torch.cat([out_bbox_mins, out_bbox_maxs], dim=1)
+
+    out_bboxes = clamp_bounding_boxes(
+        out_bboxes, format=intermediate_format, canvas_size=canvas_size, clamping_mode=clamping_mode
+    )
+
+    out_bboxes = convert_bounding_box_format(
+        out_bboxes, old_format=intermediate_format, new_format=format, inplace=True
+    ).reshape(original_shape)
+
+    out_bboxes = out_bboxes.to(original_dtype)
+    return out_bboxes
+
+
+def _compute_perspective_thetas(
+    perspective_coeffs: list[float],
+    dtype: torch.dtype,
+    device: torch.device,
+    denom: float,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Build the numerator and denominator matrices of the inverted perspective transform.
+
+    Args:
+        perspective_coeffs: Perspective coefficients; indices 0..7 are read.
+        dtype: Dtype of the returned tensors.
+        device: Device of the returned tensors.
+        denom: Value every inverted coefficient is divided by.
+
+    Returns:
+        ``(theta1, theta2)``, both of shape ``(2, 3)``: ``theta1`` holds the numerator rows and
+        ``theta2`` the denominator row repeated twice.
+    """
+    c = perspective_coeffs
+
+    # Inverted coefficients, in the same layout as the forward ones.
+    inv_0 = (c[4] - c[5] * c[7]) / denom
+    inv_1 = (-c[1] + c[2] * c[7]) / denom
+    inv_2 = (c[1] * c[5] - c[2] * c[4]) / denom
+    inv_3 = (-c[3] + c[5] * c[6]) / denom
+    inv_4 = (c[0] - c[2] * c[6]) / denom
+    inv_5 = (-c[0] * c[5] + c[2] * c[3]) / denom
+    inv_6 = (-c[4] * c[6] + c[3] * c[7]) / denom
+    inv_7 = (-c[0] * c[7] + c[1] * c[6]) / denom
+
+    numerator_rows = [[inv_0, inv_1, inv_2], [inv_3, inv_4, inv_5]]
+    denominator_row = [inv_6, inv_7, 1.0]
+
+    theta1 = torch.tensor(numerator_rows, dtype=dtype, device=device)
+    theta2 = torch.tensor([denominator_row, denominator_row], dtype=dtype, device=device)
+    return theta1, theta2
+
+
+@_register_kernel_internal(perspective, tv_tensors.BoundingBoxes, tv_tensor_wrapper=False)
+def _perspective_bounding_boxes_dispatch(
+    inpt: tv_tensors.BoundingBoxes,
+    startpoints: Optional[list[list[int]]],
+    endpoints: Optional[list[list[int]]],
+    coefficients: Optional[list[float]] = None,
+    **kwargs,
+) -> tv_tensors.BoundingBoxes:
+    """Unwrap the boxes, transform them with ``perspective_bounding_boxes`` and re-wrap the result.
+
+    Format, canvas size and clamping mode are read from ``inpt``; extra keyword arguments are
+    accepted and ignored.
+    """
+    plain_boxes = inpt.as_subclass(torch.Tensor)
+    transformed = perspective_bounding_boxes(
+        plain_boxes,
+        format=inpt.format,
+        canvas_size=inpt.canvas_size,
+        startpoints=startpoints,
+        endpoints=endpoints,
+        coefficients=coefficients,
+        clamping_mode=inpt.clamping_mode,
+    )
+    return tv_tensors.wrap(transformed, like=inpt)
+
+
+def perspective_mask(
+    mask: torch.Tensor,
+    startpoints: Optional[list[list[int]]],
+    endpoints: Optional[list[list[int]]],
+    fill: _FillTypeJIT = None,
+    coefficients: Optional[list[float]] = None,
+) -> torch.Tensor:
+    """Apply a perspective transform to a mask using nearest-neighbour interpolation.
+
+    Masks with fewer than three dimensions get a temporary leading dimension for the call to
+    ``perspective_image``.
+    """
+    needs_squeeze = mask.ndim < 3
+    if needs_squeeze:
+        mask = mask.unsqueeze(0)
+
+    warped = perspective_image(
+        mask, startpoints, endpoints, interpolation=InterpolationMode.NEAREST, fill=fill, coefficients=coefficients
+    )
+
+    return warped.squeeze(0) if needs_squeeze else warped
+
+
+@_register_kernel_internal(perspective, tv_tensors.Mask, tv_tensor_wrapper=False)
+def _perspective_mask_dispatch(
+    inpt: tv_tensors.Mask,
+    startpoints: Optional[list[list[int]]],
+    endpoints: Optional[list[list[int]]],
+    fill: _FillTypeJIT = None,
+    coefficients: Optional[list[float]] = None,
+    **kwargs,
+) -> tv_tensors.Mask:
+    """Unwrap the mask, transform it with ``perspective_mask`` and re-wrap the result.
+
+    Extra keyword arguments are accepted and ignored.
+    """
+    plain_mask = inpt.as_subclass(torch.Tensor)
+    warped = perspective_mask(
+        plain_mask,
+        startpoints=startpoints,
+        endpoints=endpoints,
+        fill=fill,
+        coefficients=coefficients,
+    )
+    return tv_tensors.wrap(warped, like=inpt)
+
+
+# Kernel registered for `tv_tensors.Video` inputs of `perspective`.
+@_register_kernel_internal(perspective, tv_tensors.Video)
+def perspective_video(
+    video: torch.Tensor,
+    startpoints: Optional[list[list[int]]],
+    endpoints: Optional[list[list[int]]],
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+    fill: _FillTypeJIT = None,
+    coefficients: Optional[list[float]] = None,
+) -> torch.Tensor:
+    # Videos take the image code path: every argument is forwarded to `perspective_image` unchanged.
+    return perspective_image(
+        video, startpoints, endpoints, interpolation=interpolation, fill=fill, coefficients=coefficients
+    )
+
+
+def elastic(
+    inpt: torch.Tensor,
+    displacement: torch.Tensor,
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+    fill: _FillTypeJIT = None,
+) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.ElasticTransform` for details."""
+    # Under TorchScript, skip usage logging and kernel lookup and call the tensor kernel directly.
+    if torch.jit.is_scripting():
+        return elastic_image(inpt, displacement=displacement, interpolation=interpolation, fill=fill)
+
+    _log_api_usage_once(elastic)
+
+    # Look up the kernel registered for the concrete input type and delegate to it.
+    kernel = _get_kernel(elastic, type(inpt))
+    return kernel(inpt, displacement=displacement, interpolation=interpolation, fill=fill)
+
+
+# Second name for the same function object as `elastic`.
+elastic_transform = elastic
+
+
+# Kernel registered for plain tensors and `tv_tensors.Image` inputs of `elastic`.
+@_register_kernel_internal(elastic, torch.Tensor)
+@_register_kernel_internal(elastic, tv_tensors.Image)
+def elastic_image(
+    image: torch.Tensor,
+    displacement: torch.Tensor,
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+    fill: _FillTypeJIT = None,
+) -> torch.Tensor:
+    # Raises TypeError when `displacement` is not a tensor and ValueError when its shape is not
+    # (1, height, width, 2) for the image's last two dimensions.
+    if not isinstance(displacement, torch.Tensor):
+        raise TypeError("Argument displacement should be a Tensor")
+
+    interpolation = _check_interpolation(interpolation)
+
+    height, width = image.shape[-2:]
+    device = image.device
+    # The grid is built in the image dtype for floating point images and in float32 otherwise.
+    dtype = image.dtype if torch.is_floating_point(image) else torch.float32
+
+    # Patch: elastic transform should support (cpu,f16) input
+    # float16 images on CPU are processed in float32 and converted back at the end.
+    is_cpu_half = device.type == "cpu" and dtype == torch.float16
+    if is_cpu_half:
+        image = image.to(torch.float32)
+        dtype = torch.float32
+
+    # We are aware that if input image dtype is uint8 and displacement is float64 then
+    # displacement will be cast to float32 and all computations will be done with float32
+    # We can fix this later if needed
+
+    expected_shape = (1, height, width, 2)
+    if expected_shape != displacement.shape:
+        raise ValueError(f"Argument displacement shape should be {expected_shape}, but given {displacement.shape}")
+
+    # Sampling grid = identity grid + displacement (added in place to the fresh identity grid).
+    grid = _create_identity_grid((height, width), device=device, dtype=dtype).add_(
+        displacement.to(dtype=dtype, device=device)
+    )
+    output = _apply_grid_transform(image, grid, interpolation.value, fill=fill)
+
+    if is_cpu_half:
+        output = output.to(torch.float16)
+
+    return output
+
+
+@_register_kernel_internal(elastic, PIL.Image.Image)
+def _elastic_image_pil(
+    image: PIL.Image.Image,
+    displacement: torch.Tensor,
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+    fill: _FillTypeJIT = None,
+) -> PIL.Image.Image:
+    """Apply the elastic transform to a PIL image by round-tripping through a tensor.
+
+    The result is converted back to a PIL image with the input's ``mode``.
+    """
+    warped = elastic_image(pil_to_tensor(image), displacement, interpolation=interpolation, fill=fill)
+    return to_pil_image(warped, mode=image.mode)
+
+
+def _create_identity_grid(size: tuple[int, int], device: torch.device, dtype: torch.dtype) -> torch.Tensor:
+    # Returns a grid of shape (1, sy, sx, 2) where `size` is unpacked as (sy, sx). Channel 0 holds
+    # the x coordinate of each column and channel 1 the y coordinate of each row.
+    sy, sx = size
+    base_grid = torch.empty(1, sy, sx, 2, device=device, dtype=dtype)
+    # sx evenly spaced values from (-sx + 1) / sx to (sx - 1) / sx, broadcast over all rows.
+    x_grid = torch.linspace((-sx + 1) / sx, (sx - 1) / sx, sx, device=device, dtype=dtype)
+    base_grid[..., 0].copy_(x_grid)
+
+    # Same for y; `unsqueeze_(-1)` turns it into a column so it broadcasts over all columns.
+    y_grid = torch.linspace((-sy + 1) / sy, (sy - 1) / sy, sy, device=device, dtype=dtype).unsqueeze_(-1)
+    base_grid[..., 1].copy_(y_grid)
+
+    return base_grid
+
+
+def elastic_keypoints(
+    keypoints: torch.Tensor, canvas_size: tuple[int, int], displacement: torch.Tensor
+) -> torch.Tensor:
+    """Apply an elastic displacement field to keypoints.
+
+    Each keypoint is converted to an integer pixel index (clamped to the grid), the approximate
+    inverse grid ``identity - displacement`` is looked up at that pixel, and the result is mapped
+    back to pixel coordinates.
+
+    Args:
+        keypoints: Keypoint tensor; it is flattened to ``(-1, 2)`` for the computation.
+        canvas_size: Canvas size as ``(height, width)``.
+        displacement: Tensor of shape ``(1, height, width, 2)``.
+
+    Returns:
+        The transformed keypoints with the input's shape and dtype. An empty input is
+        returned unchanged.
+
+    Raises:
+        TypeError: If ``displacement`` is not a tensor.
+        ValueError: If ``displacement`` does not have the expected shape.
+    """
+    expected_shape = (1, canvas_size[0], canvas_size[1], 2)
+    if not isinstance(displacement, torch.Tensor):
+        raise TypeError("Argument displacement should be a Tensor")
+    if displacement.shape != expected_shape:
+        raise ValueError(f"Argument displacement shape should be {expected_shape}, but given {displacement.shape}")
+
+    if keypoints.numel() == 0:
+        return keypoints
+
+    device = keypoints.device
+    output_dtype = keypoints.dtype
+    work_dtype = output_dtype if torch.is_floating_point(keypoints) else torch.float32
+
+    if displacement.dtype != work_dtype or displacement.device != device:
+        displacement = displacement.to(dtype=work_dtype, device=device)
+
+    original_shape = keypoints.shape
+    flat_keypoints = keypoints.clone().reshape(-1, 2)
+
+    # Approximate inverse grid, built in place on a fresh identity grid.
+    inv_grid = _create_identity_grid(canvas_size, device=device, dtype=work_dtype).sub_(displacement)
+
+    pixel_index = flat_keypoints.to(dtype=torch.long)
+    # Unlike bounding boxes, this may not work well.
+    col_index = pixel_index[:, 0].clamp_(0, inv_grid.shape[2] - 1)
+    row_index = pixel_index[:, 1].clamp_(0, inv_grid.shape[1] - 1)
+
+    # Map the looked-up grid values back to pixel coordinates.
+    canvas_wh = torch.tensor(canvas_size[::-1], device=displacement.device, dtype=displacement.dtype)
+    sampled = inv_grid[0, row_index, col_index, :]
+    transformed = sampled.add_(1).mul_(0.5 * canvas_wh).sub_(0.5)
+
+    return transformed.to(output_dtype).reshape(original_shape)
+
+
+@_register_kernel_internal(elastic, tv_tensors.KeyPoints, tv_tensor_wrapper=False)
+def _elastic_keypoints_dispatch(inpt: tv_tensors.KeyPoints, displacement: torch.Tensor, **kwargs):
+    """Unwrap the keypoints, transform them with ``elastic_keypoints`` and re-wrap the result.
+
+    Extra keyword arguments are accepted and ignored.
+    """
+    plain_keypoints = inpt.as_subclass(torch.Tensor)
+    transformed = elastic_keypoints(plain_keypoints, canvas_size=inpt.canvas_size, displacement=displacement)
+    return tv_tensors.wrap(transformed, like=inpt)
+
+
+def elastic_bounding_boxes(
+    bounding_boxes: torch.Tensor,
+    format: tv_tensors.BoundingBoxFormat,
+    canvas_size: tuple[int, int],
+    displacement: torch.Tensor,
+    clamping_mode: CLAMPING_MODE_TYPE = "soft",
+) -> torch.Tensor:
+    """Apply an elastic displacement field to bounding boxes.
+
+    Boxes are converted to an intermediate format (``CXCYWHR`` for rotated formats, ``XYXY``
+    otherwise). The box points (the center for rotated boxes, the four corners otherwise) are
+    rounded up to pixel indices, the approximate inverse grid ``identity - displacement`` is
+    looked up there, and the new boxes are derived from the transformed points.
+
+    Args:
+        bounding_boxes: Box tensor in ``format``.
+        format: Format of ``bounding_boxes``.
+        canvas_size: Canvas size as ``(height, width)``.
+        displacement: Tensor of shape ``(1, height, width, 2)``.
+        clamping_mode: Forwarded to ``clamp_bounding_boxes``.
+
+    Returns:
+        The transformed boxes with the input's shape. An empty input is returned unchanged.
+
+    Raises:
+        TypeError: If ``displacement`` is not a tensor.
+        ValueError: If ``displacement`` does not have the expected shape.
+    """
+    expected_shape = (1, canvas_size[0], canvas_size[1], 2)
+    if not isinstance(displacement, torch.Tensor):
+        raise TypeError("Argument displacement should be a Tensor")
+    elif displacement.shape != expected_shape:
+        raise ValueError(f"Argument displacement shape should be {expected_shape}, but given {displacement.shape}")
+
+    if bounding_boxes.numel() == 0:
+        return bounding_boxes
+
+    # TODO: add in docstring about approximation we are doing for grid inversion
+    device = bounding_boxes.device
+    dtype = bounding_boxes.dtype if torch.is_floating_point(bounding_boxes) else torch.float32
+    is_rotated = tv_tensors.is_rotated_bounding_format(format)
+
+    if displacement.dtype != dtype or displacement.device != device:
+        displacement = displacement.to(dtype=dtype, device=device)
+
+    original_shape = bounding_boxes.shape
+    # TODO: first cast to float if bbox is int64 before convert_bounding_box_format
+    intermediate_format = tv_tensors.BoundingBoxFormat.CXCYWHR if is_rotated else tv_tensors.BoundingBoxFormat.XYXY
+
+    bounding_boxes = (
+        convert_bounding_box_format(bounding_boxes.clone(), old_format=format, new_format=intermediate_format)
+    ).reshape(-1, 5 if is_rotated else 4)
+
+    id_grid = _create_identity_grid(canvas_size, device=device, dtype=dtype)
+    # We construct an approximation of inverse grid as inv_grid = id_grid - displacement
+    # This is not an exact inverse of the grid
+    inv_grid = id_grid.sub_(displacement)
+
+    # Get points from bboxes
+    points = bounding_boxes[:, :2] if is_rotated else bounding_boxes[:, [[0, 1], [2, 1], [2, 3], [0, 3]]]
+    points = points.reshape(-1, 2)
+    if points.is_floating_point():
+        points = points.ceil_()
+    index_xy = points.to(dtype=torch.long)
+    # Keep the lookup inside the grid, as `elastic_keypoints` does: a point on the right/bottom
+    # border (x == width or y == height) would otherwise index out of range, and a negative
+    # coordinate would silently wrap around to the opposite side. The out-of-place `clamp` leaves
+    # `points` untouched even when `index_xy` shares its memory.
+    index_x = index_xy[:, 0].clamp(0, inv_grid.shape[2] - 1)
+    index_y = index_xy[:, 1].clamp(0, inv_grid.shape[1] - 1)
+
+    # Transform points:
+    t_size = torch.tensor(canvas_size[::-1], device=displacement.device, dtype=displacement.dtype)
+    transformed_points = inv_grid[0, index_y, index_x, :].add_(1).mul_(0.5 * t_size).sub_(0.5)
+
+    if is_rotated:
+        # Only the center moves; width, height and angle are carried over unchanged.
+        transformed_points = transformed_points.reshape(-1, 2)
+        out_bboxes = torch.cat([transformed_points, bounding_boxes[:, 2:]], dim=1).to(bounding_boxes.dtype)
+    else:
+        # Axis-aligned box enclosing the four transformed corners.
+        transformed_points = transformed_points.reshape(-1, 4, 2)
+        out_bbox_mins, out_bbox_maxs = torch.aminmax(transformed_points, dim=1)
+        out_bboxes = torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_boxes.dtype)
+
+    out_bboxes = clamp_bounding_boxes(
+        out_bboxes, format=intermediate_format, canvas_size=canvas_size, clamping_mode=clamping_mode
+    )
+
+    return convert_bounding_box_format(
+        out_bboxes, old_format=intermediate_format, new_format=format, inplace=False
+    ).reshape(original_shape)
+
+
+@_register_kernel_internal(elastic, tv_tensors.BoundingBoxes, tv_tensor_wrapper=False)
+def _elastic_bounding_boxes_dispatch(
+    inpt: tv_tensors.BoundingBoxes, displacement: torch.Tensor, **kwargs  # **kwargs is accepted but never read
+) -> tv_tensors.BoundingBoxes:
+    output = elastic_bounding_boxes(
+        inpt.as_subclass(torch.Tensor),  # the kernel takes a plain tensor; metadata is passed as keyword arguments
+        format=inpt.format,
+        canvas_size=inpt.canvas_size,
+        displacement=displacement,
+        clamping_mode=inpt.clamping_mode,
+    )
+    return tv_tensors.wrap(output, like=inpt)  # re-wrap the plain result, using `inpt` as the template
+
+
+def elastic_mask(
+    mask: torch.Tensor,
+    displacement: torch.Tensor,
+    fill: _FillTypeJIT = None,
+) -> torch.Tensor:
+    if mask.ndim < 3:
+        mask = mask.unsqueeze(0)  # add a leading dim before calling the image kernel; removed again below
+        needs_squeeze = True
+    else:
+        needs_squeeze = False
+    # Masks always go through the image kernel with NEAREST interpolation; there is no interpolation parameter here.
+    output = elastic_image(mask, displacement=displacement, interpolation=InterpolationMode.NEAREST, fill=fill)
+
+    if needs_squeeze:
+        output = output.squeeze(0)
+
+    return output
+
+
+@_register_kernel_internal(elastic, tv_tensors.Mask, tv_tensor_wrapper=False)
+def _elastic_mask_dispatch(
+    inpt: tv_tensors.Mask, displacement: torch.Tensor, fill: _FillTypeJIT = None, **kwargs
+) -> tv_tensors.Mask:  # whatever arrives in **kwargs is dropped, not forwarded to the kernel
+    output = elastic_mask(inpt.as_subclass(torch.Tensor), displacement=displacement, fill=fill)
+    return tv_tensors.wrap(output, like=inpt)  # re-wrap the plain result, using `inpt` as the template
+
+
+@_register_kernel_internal(elastic, tv_tensors.Video)
+def elastic_video(
+    video: torch.Tensor,
+    displacement: torch.Tensor,
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+    fill: _FillTypeJIT = None,
+) -> torch.Tensor:
+    return elastic_image(video, displacement, interpolation=interpolation, fill=fill)  # videos reuse the image kernel
+
+
+def center_crop(inpt: torch.Tensor, output_size: list[int]) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.RandomCrop` for details."""
+    if torch.jit.is_scripting():
+        return center_crop_image(inpt, output_size=output_size)
+
+    _log_api_usage_once(center_crop)
+
+    kernel = _get_kernel(center_crop, type(inpt))
+    return kernel(inpt, output_size=output_size)
+
+
+def _center_crop_parse_output_size(output_size: list[int]) -> list[int]:
+    if isinstance(output_size, numbers.Number):  # a bare number is accepted despite the list[int] annotation
+        s = int(output_size)
+        return [s, s]  # scalar -> square crop
+    elif isinstance(output_size, (tuple, list)) and len(output_size) == 1:
+        return [output_size[0], output_size[0]]  # one-element sequence -> square crop
+    else:
+        return list(output_size)  # taken as-is, length unchecked; callers unpack it as (height, width)
+
+
+def _center_crop_compute_padding(crop_height: int, crop_width: int, image_height: int, image_width: int) -> list[int]:
+    return [  # [left, top, right, bottom]; an entry is 0 when the image is already large enough on that axis
+        (crop_width - image_width) // 2 if crop_width > image_width else 0,  # left
+        (crop_height - image_height) // 2 if crop_height > image_height else 0,  # top
+        (crop_width - image_width + 1) // 2 if crop_width > image_width else 0,  # right: takes the odd extra pixel
+        (crop_height - image_height + 1) // 2 if crop_height > image_height else 0,  # bottom: takes the odd extra pixel
+    ]
+
+
+def _center_crop_compute_crop_anchor(
+    crop_height: int, crop_width: int, image_height: int, image_width: int
+) -> tuple[int, int]:
+    crop_top = int(round((image_height - crop_height) / 2.0))  # built-in round(): exact .5 ties go to the even integer
+    crop_left = int(round((image_width - crop_width) / 2.0))  # negative when the crop is wider than the image
+    return crop_top, crop_left
+
+
+@_register_kernel_internal(center_crop, torch.Tensor)
+@_register_kernel_internal(center_crop, tv_tensors.Image)
+def center_crop_image(image: torch.Tensor, output_size: list[int]) -> torch.Tensor:
+    crop_height, crop_width = _center_crop_parse_output_size(output_size)
+    shape = image.shape
+    if image.numel() == 0:
+        return image.reshape(shape[:-2] + (crop_height, crop_width))  # NOTE(review): assumes a 0-sized leading dim
+    image_height, image_width = shape[-2:]
+    # Pad with zeros first if the requested crop is larger than the image along either axis.
+    if crop_height > image_height or crop_width > image_width:
+        padding_ltrb = _center_crop_compute_padding(crop_height, crop_width, image_height, image_width)
+        image = torch_pad(image, _parse_pad_padding(padding_ltrb), value=0.0)
+
+        image_height, image_width = image.shape[-2:]
+        if crop_width == image_width and crop_height == image_height:
+            return image
+
+    crop_top, crop_left = _center_crop_compute_crop_anchor(crop_height, crop_width, image_height, image_width)
+    return image[..., crop_top : (crop_top + crop_height), crop_left : (crop_left + crop_width)]
+
+
+@_register_kernel_internal(center_crop, PIL.Image.Image)
+def _center_crop_image_pil(image: PIL.Image.Image, output_size: list[int]) -> PIL.Image.Image:
+    crop_height, crop_width = _center_crop_parse_output_size(output_size)
+    image_height, image_width = _get_size_image_pil(image)
+    # Pad with fill=0 first if the requested crop is larger than the image along either axis.
+    if crop_height > image_height or crop_width > image_width:
+        padding_ltrb = _center_crop_compute_padding(crop_height, crop_width, image_height, image_width)
+        image = _pad_image_pil(image, padding_ltrb, fill=0)
+
+        image_height, image_width = _get_size_image_pil(image)
+        if crop_width == image_width and crop_height == image_height:
+            return image
+
+    crop_top, crop_left = _center_crop_compute_crop_anchor(crop_height, crop_width, image_height, image_width)
+    return _crop_image_pil(image, crop_top, crop_left, crop_height, crop_width)
+
+
+def center_crop_keypoints(inpt: torch.Tensor, canvas_size: tuple[int, int], output_size: list[int]):
+    crop_height, crop_width = _center_crop_parse_output_size(output_size)  # returns crop_keypoints' 2-tuple
+    crop_top, crop_left = _center_crop_compute_crop_anchor(crop_height, crop_width, *canvas_size)
+    return crop_keypoints(inpt, top=crop_top, left=crop_left, height=crop_height, width=crop_width)
+
+
+@_register_kernel_internal(center_crop, tv_tensors.KeyPoints, tv_tensor_wrapper=False)
+def _center_crop_keypoints_dispatch(inpt: tv_tensors.KeyPoints, output_size: list[int]) -> tv_tensors.KeyPoints:
+    output, canvas_size = center_crop_keypoints(
+        inpt.as_subclass(torch.Tensor), canvas_size=inpt.canvas_size, output_size=output_size
+    )
+    return tv_tensors.wrap(output, like=inpt, canvas_size=canvas_size)  # pass on the canvas size the kernel returned
+
+
+def center_crop_bounding_boxes(
+    bounding_boxes: torch.Tensor,
+    format: tv_tensors.BoundingBoxFormat,
+    canvas_size: tuple[int, int],  # unpacked below as (image_height, image_width)
+    output_size: list[int],
+    clamping_mode: CLAMPING_MODE_TYPE = "soft",
+) -> tuple[torch.Tensor, tuple[int, int]]:
+    crop_height, crop_width = _center_crop_parse_output_size(output_size)
+    crop_top, crop_left = _center_crop_compute_crop_anchor(crop_height, crop_width, *canvas_size)
+    return crop_bounding_boxes(  # the centred crop window is handed to the generic crop kernel
+        bounding_boxes,
+        format,
+        top=crop_top,
+        left=crop_left,
+        height=crop_height,
+        width=crop_width,
+        clamping_mode=clamping_mode,
+    )
+
+
+@_register_kernel_internal(center_crop, tv_tensors.BoundingBoxes, tv_tensor_wrapper=False)
+def _center_crop_bounding_boxes_dispatch(
+    inpt: tv_tensors.BoundingBoxes, output_size: list[int]
+) -> tv_tensors.BoundingBoxes:
+    output, canvas_size = center_crop_bounding_boxes(
+        inpt.as_subclass(torch.Tensor),  # the kernel takes a plain tensor; metadata is passed as keyword arguments
+        format=inpt.format,
+        canvas_size=inpt.canvas_size,
+        output_size=output_size,
+        clamping_mode=inpt.clamping_mode,
+    )
+    return tv_tensors.wrap(output, like=inpt, canvas_size=canvas_size)  # pass on the canvas size the kernel returned
+
+
+@_register_kernel_internal(center_crop, tv_tensors.Mask)
+def center_crop_mask(mask: torch.Tensor, output_size: list[int]) -> torch.Tensor:
+    if mask.ndim < 3:
+        mask = mask.unsqueeze(0)  # add a leading dim before calling the image kernel; removed again below
+        needs_squeeze = True
+    else:
+        needs_squeeze = False
+    # The image kernel does the work, including its zero padding when the crop is larger than the mask.
+    output = center_crop_image(image=mask, output_size=output_size)
+
+    if needs_squeeze:
+        output = output.squeeze(0)
+
+    return output
+
+
+@_register_kernel_internal(center_crop, tv_tensors.Video)
+def center_crop_video(video: torch.Tensor, output_size: list[int]) -> torch.Tensor:
+    return center_crop_image(video, output_size)  # videos reuse the image kernel unchanged
+
+
+def resized_crop(
+    inpt: torch.Tensor,
+    top: int,
+    left: int,
+    height: int,
+    width: int,
+    size: list[int],
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+    antialias: Optional[bool] = True,
+) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.RandomResizedCrop` for details."""
+    if torch.jit.is_scripting():
+        return resized_crop_image(
+            inpt,
+            top=top,
+            left=left,
+            height=height,
+            width=width,
+            size=size,
+            interpolation=interpolation,
+            antialias=antialias,
+        )
+
+    _log_api_usage_once(resized_crop)
+    # Outside TorchScript, look the kernel up by the input's concrete type and forward every argument unchanged.
+    kernel = _get_kernel(resized_crop, type(inpt))
+    return kernel(
+        inpt,
+        top=top,
+        left=left,
+        height=height,
+        width=width,
+        size=size,
+        interpolation=interpolation,
+        antialias=antialias,
+    )
+
+
+@_register_kernel_internal(resized_crop, torch.Tensor)
+@_register_kernel_internal(resized_crop, tv_tensors.Image)
+def resized_crop_image(
+    image: torch.Tensor,
+    top: int,
+    left: int,
+    height: int,
+    width: int,
+    size: list[int],
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+    antialias: Optional[bool] = True,
+) -> torch.Tensor:
+    image = crop_image(image, top, left, height, width)  # crop first ...
+    return resize_image(image, size, interpolation=interpolation, antialias=antialias)  # ... then resize the crop
+
+
+def _resized_crop_image_pil(
+    image: PIL.Image.Image,
+    top: int,
+    left: int,
+    height: int,
+    width: int,
+    size: list[int],
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+) -> PIL.Image.Image:  # no antialias parameter, unlike the tensor kernel
+    image = _crop_image_pil(image, top, left, height, width)  # crop first ...
+    return _resize_image_pil(image, size, interpolation=interpolation)  # ... then resize the crop
+
+
+@_register_kernel_internal(resized_crop, PIL.Image.Image)
+def _resized_crop_image_pil_dispatch(
+    image: PIL.Image.Image,
+    top: int,
+    left: int,
+    height: int,
+    width: int,
+    size: list[int],
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+    antialias: Optional[bool] = True,
+) -> PIL.Image.Image:
+    if antialias is False:  # identity check: only an explicit False warns; True and None pass silently
+        warnings.warn("Anti-alias option is always applied for PIL Image input. Argument antialias is ignored.")
+    return _resized_crop_image_pil(  # antialias is not forwarded; the PIL kernel has no such parameter
+        image,
+        top=top,
+        left=left,
+        height=height,
+        width=width,
+        size=size,
+        interpolation=interpolation,
+    )
+
+
+def resized_crop_keypoints(
+    keypoints: torch.Tensor,
+    top: int,
+    left: int,
+    height: int,
+    width: int,
+    size: list[int],
+) -> tuple[torch.Tensor, tuple[int, int]]:
+    keypoints, canvas_size = crop_keypoints(keypoints, top, left, height, width)  # crop also yields the new canvas size
+    return resize_keypoints(keypoints, size=size, canvas_size=canvas_size)  # resize from the cropped canvas to `size`
+
+
+@_register_kernel_internal(resized_crop, tv_tensors.KeyPoints, tv_tensor_wrapper=False)
+def _resized_crop_keypoints_dispatch(
+    inpt: tv_tensors.BoundingBoxes, top: int, left: int, height: int, width: int, size: list[int], **kwargs
+):
+    output, canvas_size = resized_crop_keypoints(
+        inpt.as_subclass(torch.Tensor), top=top, left=left, height=height, width=width, size=size
+    )
+    return tv_tensors.wrap(output, like=inpt, canvas_size=canvas_size)
+
+
+def resized_crop_bounding_boxes(
+    bounding_boxes: torch.Tensor,
+    format: tv_tensors.BoundingBoxFormat,
+    top: int,
+    left: int,
+    height: int,
+    width: int,
+    size: list[int],
+    clamping_mode: CLAMPING_MODE_TYPE = "soft",
+) -> tuple[torch.Tensor, tuple[int, int]]:
+    bounding_boxes, canvas_size = crop_bounding_boxes(  # crop first; the crop also yields the new canvas size
+        bounding_boxes, format, top, left, height, width, clamping_mode=clamping_mode
+    )
+    return resize_bounding_boxes(  # then resize from the cropped canvas to `size`, with the same clamping mode
+        bounding_boxes, format=format, canvas_size=canvas_size, size=size, clamping_mode=clamping_mode
+    )
+
+
+@_register_kernel_internal(resized_crop, tv_tensors.BoundingBoxes, tv_tensor_wrapper=False)
+def _resized_crop_bounding_boxes_dispatch(
+    inpt: tv_tensors.BoundingBoxes, top: int, left: int, height: int, width: int, size: list[int], **kwargs
+) -> tv_tensors.BoundingBoxes:  # **kwargs is accepted but never read
+    output, canvas_size = resized_crop_bounding_boxes(
+        inpt.as_subclass(torch.Tensor),  # the kernel takes a plain tensor; metadata is passed as keyword arguments
+        format=inpt.format,
+        top=top,
+        left=left,
+        height=height,
+        width=width,
+        size=size,
+        clamping_mode=inpt.clamping_mode,
+    )
+    return tv_tensors.wrap(output, like=inpt, canvas_size=canvas_size)  # pass on the canvas size the kernel returned
+
+
+def resized_crop_mask(
+    mask: torch.Tensor,
+    top: int,
+    left: int,
+    height: int,
+    width: int,
+    size: list[int],
+) -> torch.Tensor:
+    mask = crop_mask(mask, top, left, height, width)  # crop first ...
+    return resize_mask(mask, size)  # ... then resize the crop; no interpolation argument is exposed here
+
+
+@_register_kernel_internal(resized_crop, tv_tensors.Mask, tv_tensor_wrapper=False)
+def _resized_crop_mask_dispatch(
+    inpt: tv_tensors.Mask, top: int, left: int, height: int, width: int, size: list[int], **kwargs
+) -> tv_tensors.Mask:  # **kwargs is accepted but never read
+    output = resized_crop_mask(
+        inpt.as_subclass(torch.Tensor), top=top, left=left, height=height, width=width, size=size
+    )
+    return tv_tensors.wrap(output, like=inpt)  # re-wrap the plain result, using `inpt` as the template
+
+
+@_register_kernel_internal(resized_crop, tv_tensors.Video)
+def resized_crop_video(
+    video: torch.Tensor,
+    top: int,
+    left: int,
+    height: int,
+    width: int,
+    size: list[int],
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
+    antialias: Optional[bool] = True,
+) -> torch.Tensor:
+    return resized_crop_image(  # videos reuse the image kernel unchanged
+        video, top, left, height, width, antialias=antialias, size=size, interpolation=interpolation
+    )
+
+
+def five_crop(
+    inpt: torch.Tensor, size: list[int]
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """See :class:`~torchvision.transforms.v2.FiveCrop` for details."""
+    if torch.jit.is_scripting():
+        return five_crop_image(inpt, size=size)
+
+    _log_api_usage_once(five_crop)
+    # Outside TorchScript, look the kernel up by the input's concrete type and delegate to it.
+    kernel = _get_kernel(five_crop, type(inpt))
+    return kernel(inpt, size=size)
+
+
+def _parse_five_crop_size(size: list[int]) -> list[int]:
+    if isinstance(size, numbers.Number):  # a bare number is accepted despite the list[int] annotation
+        s = int(size)
+        size = [s, s]
+    elif isinstance(size, (tuple, list)) and len(size) == 1:  # one-element sequence -> square crop
+        s = size[0]
+        size = [s, s]
+
+    if len(size) != 2:
+        raise ValueError("Please provide only two dimensions (h, w) for size.")
+
+    return size  # a two-element input is returned as the same object, not copied
+
+
+@_register_five_ten_crop_kernel_internal(five_crop, torch.Tensor)
+@_register_five_ten_crop_kernel_internal(five_crop, tv_tensors.Image)
+def five_crop_image(
+    image: torch.Tensor, size: list[int]
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    crop_height, crop_width = _parse_five_crop_size(size)
+    image_height, image_width = image.shape[-2:]
+    # Unlike center_crop_image, an oversized crop is rejected here instead of being padded.
+    if crop_width > image_width or crop_height > image_height:
+        raise ValueError(f"Requested crop size {size} is bigger than input size {(image_height, image_width)}")
+
+    tl = crop_image(image, 0, 0, crop_height, crop_width)  # top-left
+    tr = crop_image(image, 0, image_width - crop_width, crop_height, crop_width)  # top-right
+    bl = crop_image(image, image_height - crop_height, 0, crop_height, crop_width)  # bottom-left
+    br = crop_image(image, image_height - crop_height, image_width - crop_width, crop_height, crop_width)
+    center = center_crop_image(image, [crop_height, crop_width])
+
+    return tl, tr, bl, br, center
+
+
+@_register_five_ten_crop_kernel_internal(five_crop, PIL.Image.Image)
+def _five_crop_image_pil(
+    image: PIL.Image.Image, size: list[int]
+) -> tuple[PIL.Image.Image, PIL.Image.Image, PIL.Image.Image, PIL.Image.Image, PIL.Image.Image]:
+    crop_height, crop_width = _parse_five_crop_size(size)
+    image_height, image_width = _get_size_image_pil(image)
+    # An oversized crop is rejected here instead of being padded.
+    if crop_width > image_width or crop_height > image_height:
+        raise ValueError(f"Requested crop size {size} is bigger than input size {(image_height, image_width)}")
+
+    tl = _crop_image_pil(image, 0, 0, crop_height, crop_width)  # top-left
+    tr = _crop_image_pil(image, 0, image_width - crop_width, crop_height, crop_width)  # top-right
+    bl = _crop_image_pil(image, image_height - crop_height, 0, crop_height, crop_width)  # bottom-left
+    br = _crop_image_pil(image, image_height - crop_height, image_width - crop_width, crop_height, crop_width)
+    center = _center_crop_image_pil(image, [crop_height, crop_width])
+
+    return tl, tr, bl, br, center
+
+
+@_register_five_ten_crop_kernel_internal(five_crop, tv_tensors.Video)
+def five_crop_video(
+    video: torch.Tensor, size: list[int]
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    return five_crop_image(video, size)  # videos reuse the image kernel unchanged
+
+
+def ten_crop(
+    inpt: torch.Tensor, size: list[int], vertical_flip: bool = False
+) -> tuple[
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+]:
+    """See :class:`~torchvision.transforms.v2.TenCrop` for details."""
+    if torch.jit.is_scripting():
+        return ten_crop_image(inpt, size=size, vertical_flip=vertical_flip)
+
+    _log_api_usage_once(ten_crop)
+    # Outside TorchScript, look the kernel up by the input's concrete type and delegate to it.
+    kernel = _get_kernel(ten_crop, type(inpt))
+    return kernel(inpt, size=size, vertical_flip=vertical_flip)
+
+
+@_register_five_ten_crop_kernel_internal(ten_crop, torch.Tensor)
+@_register_five_ten_crop_kernel_internal(ten_crop, tv_tensors.Image)
+def ten_crop_image(
+    image: torch.Tensor, size: list[int], vertical_flip: bool = False
+) -> tuple[
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+]:
+    non_flipped = five_crop_image(image, size)
+    # Exactly one flip is applied to the whole image: vertical if requested, otherwise horizontal.
+    if vertical_flip:
+        image = vertical_flip_image(image)
+    else:
+        image = horizontal_flip_image(image)
+
+    flipped = five_crop_image(image, size)
+
+    return non_flipped + flipped  # tuple concatenation: the five plain crops, then the five crops of the flipped image
+
+
+@_register_five_ten_crop_kernel_internal(ten_crop, PIL.Image.Image)
+def _ten_crop_image_pil(
+    image: PIL.Image.Image, size: list[int], vertical_flip: bool = False
+) -> tuple[
+    PIL.Image.Image,
+    PIL.Image.Image,
+    PIL.Image.Image,
+    PIL.Image.Image,
+    PIL.Image.Image,
+    PIL.Image.Image,
+    PIL.Image.Image,
+    PIL.Image.Image,
+    PIL.Image.Image,
+    PIL.Image.Image,
+]:
+    non_flipped = _five_crop_image_pil(image, size)
+    # Exactly one flip is applied to the whole image: vertical if requested, otherwise horizontal.
+    if vertical_flip:
+        image = _vertical_flip_image_pil(image)
+    else:
+        image = _horizontal_flip_image_pil(image)
+
+    flipped = _five_crop_image_pil(image, size)
+
+    return non_flipped + flipped  # tuple concatenation: the five plain crops, then the five crops of the flipped image
+
+
+@_register_five_ten_crop_kernel_internal(ten_crop, tv_tensors.Video)
+def ten_crop_video(
+    video: torch.Tensor, size: list[int], vertical_flip: bool = False
+) -> tuple[
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+]:
+    return ten_crop_image(video, size, vertical_flip=vertical_flip)  # videos reuse the image kernel unchanged
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_meta.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_meta.py
new file mode 100644
index 0000000000000000000000000000000000000000..4568b39ab5991afd41f456944ee96e273e229e0b
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_meta.py
@@ -0,0 +1,685 @@
+from typing import Optional, Union
+
+import PIL.Image
+import torch
+from torchvision import tv_tensors
+from torchvision.transforms import _functional_pil as _FP
+from torchvision.tv_tensors import BoundingBoxFormat
+from torchvision.tv_tensors._bounding_boxes import CLAMPING_MODE_TYPE
+
+from torchvision.utils import _log_api_usage_once
+
+from ._utils import _get_kernel, _register_kernel_internal, is_pure_tensor
+
+
+def get_dimensions(inpt: torch.Tensor) -> list[int]:
+    if torch.jit.is_scripting():
+        return get_dimensions_image(inpt)
+
+    _log_api_usage_once(get_dimensions)
+    # Outside TorchScript, look the kernel up by the input's concrete type and delegate to it.
+    kernel = _get_kernel(get_dimensions, type(inpt))
+    return kernel(inpt)
+
+
+@_register_kernel_internal(get_dimensions, torch.Tensor)
+@_register_kernel_internal(get_dimensions, tv_tensors.Image, tv_tensor_wrapper=False)
+def get_dimensions_image(image: torch.Tensor) -> list[int]:
+    chw = list(image.shape[-3:])  # at most the last three dims; any leading dims are ignored
+    ndims = len(chw)
+    if ndims == 3:
+        return chw
+    elif ndims == 2:
+        chw.insert(0, 1)  # 2-D input: report a channel count of 1
+        return chw
+    else:
+        raise TypeError(f"Input tensor should have at least two dimensions, but got {ndims}")
+
+
+_get_dimensions_image_pil = _register_kernel_internal(get_dimensions, PIL.Image.Image)(_FP.get_dimensions)
+
+
+@_register_kernel_internal(get_dimensions, tv_tensors.Video, tv_tensor_wrapper=False)
+def get_dimensions_video(video: torch.Tensor) -> list[int]:
+    return get_dimensions_image(video)  # last three dims only, so the frame dim is not part of the result
+
+
+def get_num_channels(inpt: torch.Tensor) -> int:
+    if torch.jit.is_scripting():
+        return get_num_channels_image(inpt)
+
+    _log_api_usage_once(get_num_channels)
+    # Outside TorchScript, look the kernel up by the input's concrete type and delegate to it.
+    kernel = _get_kernel(get_num_channels, type(inpt))
+    return kernel(inpt)
+
+
+@_register_kernel_internal(get_num_channels, torch.Tensor)
+@_register_kernel_internal(get_num_channels, tv_tensors.Image, tv_tensor_wrapper=False)
+def get_num_channels_image(image: torch.Tensor) -> int:
+    chw = image.shape[-3:]  # at most the last three dims; any leading dims are ignored
+    ndims = len(chw)
+    if ndims == 3:
+        return chw[0]  # third dim from the end
+    elif ndims == 2:
+        return 1  # 2-D input: report a channel count of 1
+    else:
+        raise TypeError(f"Input tensor should have at least two dimensions, but got {ndims}")
+
+
+_get_num_channels_image_pil = _register_kernel_internal(get_num_channels, PIL.Image.Image)(_FP.get_image_num_channels)
+
+
+@_register_kernel_internal(get_num_channels, tv_tensors.Video, tv_tensor_wrapper=False)
+def get_num_channels_video(video: torch.Tensor) -> int:
+    return get_num_channels_image(video)  # reads the third dim from the end, exactly as for images
+
+
+# The function was renamed to `get_num_channels` so that the name fits videos as well as images. The old name is kept
+# as a plain alias and is not deprecated.
+get_image_num_channels = get_num_channels
+
+
+def get_size(inpt: torch.Tensor) -> list[int]:
+    if torch.jit.is_scripting():
+        return get_size_image(inpt)
+
+    _log_api_usage_once(get_size)
+    # Outside TorchScript, look the kernel up by the input's concrete type and delegate to it.
+    kernel = _get_kernel(get_size, type(inpt))
+    return kernel(inpt)
+
+
+@_register_kernel_internal(get_size, torch.Tensor)
+@_register_kernel_internal(get_size, tv_tensors.Image, tv_tensor_wrapper=False)
+def get_size_image(image: torch.Tensor) -> list[int]:
+    hw = list(image.shape[-2:])  # the last two dims, as a plain list
+    ndims = len(hw)
+    if ndims == 2:
+        return hw
+    else:  # fewer than two dims available
+        raise TypeError(f"Input tensor should have at least two dimensions, but got {ndims}")
+
+
+@_register_kernel_internal(get_size, PIL.Image.Image)
+def _get_size_image_pil(image: PIL.Image.Image) -> list[int]:
+    width, height = _FP.get_image_size(image)  # the helper's result is unpacked as (width, height)
+    return [height, width]  # swapped, so callers can unpack it as (height, width)
+
+
+@_register_kernel_internal(get_size, tv_tensors.Video, tv_tensor_wrapper=False)
+def get_size_video(video: torch.Tensor) -> list[int]:
+    return get_size_image(video)  # last two dims, exactly as for images
+
+
+@_register_kernel_internal(get_size, tv_tensors.Mask, tv_tensor_wrapper=False)
+def get_size_mask(mask: torch.Tensor) -> list[int]:
+    return get_size_image(mask)  # last two dims, exactly as for images
+
+
+@_register_kernel_internal(get_size, tv_tensors.BoundingBoxes, tv_tensor_wrapper=False)
+def get_size_bounding_boxes(bounding_box: tv_tensors.BoundingBoxes) -> list[int]:
+    return list(bounding_box.canvas_size)  # comes from the canvas_size metadata, not from the tensor's shape
+
+
+@_register_kernel_internal(get_size, tv_tensors.KeyPoints, tv_tensor_wrapper=False)
+def get_size_keypoints(keypoints: tv_tensors.KeyPoints) -> list[int]:
+    return list(keypoints.canvas_size)  # comes from the canvas_size metadata, not from the tensor's shape
+
+
+def get_num_frames(inpt: torch.Tensor) -> int:
+    if torch.jit.is_scripting():
+        return get_num_frames_video(inpt)
+
+    _log_api_usage_once(get_num_frames)
+    # Outside TorchScript, look the kernel up by the input's concrete type and delegate to it.
+    kernel = _get_kernel(get_num_frames, type(inpt))
+    return kernel(inpt)
+
+
+@_register_kernel_internal(get_num_frames, torch.Tensor)
+@_register_kernel_internal(get_num_frames, tv_tensors.Video, tv_tensor_wrapper=False)
+def get_num_frames_video(video: torch.Tensor) -> int:
+    return video.shape[-4]  # fourth dim from the end; raises IndexError if the input has fewer than four dims
+
+
+def _xywh_to_xyxy(xywh: torch.Tensor, inplace: bool) -> torch.Tensor:
+    xyxy = xywh if inplace else xywh.clone()  # inplace=True writes into (and returns) the caller's tensor
+    xyxy[..., 2:] += xyxy[..., :2]  # (w, h) + (x1, y1) -> (x2, y2)
+    return xyxy
+
+
+def _xyxy_to_xywh(xyxy: torch.Tensor, inplace: bool) -> torch.Tensor:
+    xywh = xyxy if inplace else xyxy.clone()  # inplace=True writes into (and returns) the caller's tensor
+    xywh[..., 2:] -= xywh[..., :2]  # (x2, y2) - (x1, y1) -> (w, h)
+    return xywh
+
+
+def _cxcywh_to_xyxy(cxcywh: torch.Tensor, inplace: bool) -> torch.Tensor:
+    if not inplace:
+        cxcywh = cxcywh.clone()
+    # half_wh below: for integer dtypes floor(w / -2) == -ceil(w / 2), and abs_() then drops the sign.
+    # Trick to do fast division by 2 and ceil, without casting. It produces the same result as
+    # `torchvision.ops._box_convert._box_cxcywh_to_xyxy`.
+    half_wh = cxcywh[..., 2:].div(-2, rounding_mode=None if cxcywh.is_floating_point() else "floor").abs_()
+    # (cx - width / 2) = x1, same for y1
+    cxcywh[..., :2].sub_(half_wh)
+    # (x1 + width) = x2, same for y2
+    cxcywh[..., 2:].add_(cxcywh[..., :2])
+
+    return cxcywh
+
+
+def _xyxy_to_cxcywh(xyxy: torch.Tensor, inplace: bool) -> torch.Tensor:
+    if not inplace:
+        xyxy = xyxy.clone()
+
+    # (x2 - x1) = width and (y2 - y1) = height, written over the last two columns
+    xyxy[..., 2:].sub_(xyxy[..., :2])
+    # (x1 * 2 + width) / 2 = x1 + width / 2 = x1 + (x2-x1)/2 = (x1 + x2)/2 = cx, same for cy (floored for integer dtypes)
+    xyxy[..., :2].mul_(2).add_(xyxy[..., 2:]).div_(2, rounding_mode=None if xyxy.is_floating_point() else "floor")
+
+    return xyxy
+
+
+def _xyxy_to_keypoints(bounding_boxes: torch.Tensor) -> torch.Tensor:
+    return bounding_boxes[:, [[0, 1], [2, 1], [2, 3], [0, 3]]]  # (N, 4) -> (N, 4, 2): (x1,y1), (x2,y1), (x2,y2), (x1,y2)
+
+
+def _xyxyxyxy_to_keypoints(bounding_boxes: torch.Tensor) -> torch.Tensor:
+    return bounding_boxes[:, [[0, 1], [2, 3], [4, 5], [6, 7]]]  # (N, 8) -> (N, 4, 2): consecutive (x, y) column pairs
+
+
+def _cxcywhr_to_xywhr(cxcywhr: torch.Tensor, inplace: bool) -> torch.Tensor:
+    if not inplace:
+        cxcywhr = cxcywhr.clone()
+    # half_wh = (w / 2, h / 2), rounded up for integer dtypes; column 4 is converted from degrees to radians.
+    half_wh = cxcywhr[..., 2:-1].div(-2, rounding_mode=None if cxcywhr.is_floating_point() else "floor").abs_()
+    r_rad = cxcywhr[..., 4].mul(torch.pi).div(180.0)
+    cos, sin = r_rad.cos(), r_rad.sin()
+    # (cx - width / 2 * cos - height / 2 * sin) = x1
+    cxcywhr[..., 0].sub_(half_wh[..., 0].mul(cos)).sub_(half_wh[..., 1].mul(sin))
+    # (cy + width / 2 * sin - height / 2 * cos) = y1
+    cxcywhr[..., 1].add_(half_wh[..., 0].mul(sin)).sub_(half_wh[..., 1].mul(cos))
+
+    return cxcywhr
+
+
+def _xywhr_to_cxcywhr(xywhr: torch.Tensor, inplace: bool) -> torch.Tensor:
+    if not inplace:
+        xywhr = xywhr.clone()
+    # half_wh = (w / 2, h / 2), rounded up for integer dtypes; column 4 is converted from degrees to radians.
+    half_wh = xywhr[..., 2:-1].div(-2, rounding_mode=None if xywhr.is_floating_point() else "floor").abs_()
+    r_rad = xywhr[..., 4].mul(torch.pi).div(180.0)
+    cos, sin = r_rad.cos(), r_rad.sin()
+    # (x1 + width / 2 * cos + height / 2 * sin) = cx
+    xywhr[..., 0].add_(half_wh[..., 0].mul(cos)).add_(half_wh[..., 1].mul(sin))
+    # (y1 - width / 2 * sin + height / 2 * cos) = cy
+    xywhr[..., 1].sub_(half_wh[..., 0].mul(sin)).add_(half_wh[..., 1].mul(cos))
+
+    return xywhr
+
+
+def _xywhr_to_xyxyxyxy(xywhr: torch.Tensor, inplace: bool) -> torch.Tensor:
+    # NOTE: This function cannot modify the input tensor inplace as it requires a dimension change.
+    if not inplace:
+        xywhr = xywhr.clone()
+
+    wh = xywhr[..., 2:-1]  # (w, h) columns; only read below
+    r_rad = xywhr[..., 4].mul(torch.pi).div(180.0)  # column 4 converted from degrees to radians
+    cos, sin = r_rad.cos(), r_rad.sin()
+    xywhr = xywhr[..., :2].tile((1, 4))  # (x1, y1) repeated four times; the in-place ops below write to this new tensor
+    # x1 + w * cos = x2
+    xywhr[..., 2].add_(wh[..., 0].mul(cos))
+    # y1 - w * sin = y2
+    xywhr[..., 3].sub_(wh[..., 0].mul(sin))
+    # x1 + w * cos + h * sin = x3
+    xywhr[..., 4].add_(wh[..., 0].mul(cos).add(wh[..., 1].mul(sin)))
+    # y1 - w * sin + h * cos = y3
+    xywhr[..., 5].sub_(wh[..., 0].mul(sin).sub(wh[..., 1].mul(cos)))
+    # x1 + h * sin = x4
+    xywhr[..., 6].add_(wh[..., 1].mul(sin))
+    # y1 + h * cos = y4
+    xywhr[..., 7].add_(wh[..., 1].mul(cos))
+
+    return xywhr
+
+
+def _xyxyxyxy_to_xywhr(xyxyxyxy: torch.Tensor, inplace: bool) -> torch.Tensor:
+    # NOTE: the output has 5 columns, not 8; with inplace=True (and no dtype cast) the input is used as scratch space.
+    if not inplace:
+        xyxyxyxy = xyxyxyxy.clone()
+
+    dtype = xyxyxyxy.dtype
+    acceptable_dtypes = [torch.float32, torch.float64]  # Ensure consistency between CPU and GPU.
+    need_cast = dtype not in acceptable_dtypes
+    if need_cast:
+        # Up-cast to avoid overflow for square operations
+        xyxyxyxy = xyxyxyxy.to(torch.float32)
+
+    r_rad = torch.atan2(xyxyxyxy[..., 1].sub(xyxyxyxy[..., 3]), xyxyxyxy[..., 2].sub(xyxyxyxy[..., 0]))
+    # Column layout after the two subtractions: x1, y1, (x2 - x1), (y2 - y1), (x3 - x2), (y3 - y2), x4, y4
+    xyxyxyxy[..., 4:6].sub_(xyxyxyxy[..., 2:4])
+    xyxyxyxy[..., 2:4].sub_(xyxyxyxy[..., :2])
+    # sqrt((x2 - x1) ** 2 + (y1 - y2) ** 2) = w
+    xyxyxyxy[..., 2] = xyxyxyxy[..., 2].pow(2).add(xyxyxyxy[..., 3].pow(2)).sqrt()
+    # sqrt((x2 - x3) ** 2 + (y2 - y3) ** 2) = h
+    xyxyxyxy[..., 3] = xyxyxyxy[..., 4].pow(2).add(xyxyxyxy[..., 5].pow(2)).sqrt()
+    xyxyxyxy[..., 4] = r_rad.div_(torch.pi).mul_(180.0)  # radians -> degrees, stored in column 4
+
+    if need_cast:
+        xyxyxyxy = xyxyxyxy.to(dtype)
+
+    return xyxyxyxy[..., :5]
+
+
+def _convert_bounding_box_format(
+    bounding_boxes: torch.Tensor, old_format: BoundingBoxFormat, new_format: BoundingBoxFormat, inplace: bool = False
+) -> torch.Tensor:
+    # Two-step conversion through a pivot format: XYXY for axis-aligned boxes, XYWHR for rotated ones.
+    if new_format == old_format:
+        return bounding_boxes  # the same object is returned, not a copy, even when inplace=False
+
+    if tv_tensors.is_rotated_bounding_format(old_format) ^ tv_tensors.is_rotated_bounding_format(new_format):
+        raise ValueError("Cannot convert between rotated and unrotated bounding boxes.")
+
+    # TODO: Add _xywh_to_cxcywh and _cxcywh_to_xywh to improve performance
+    if old_format == BoundingBoxFormat.XYWH:  # step 1: old_format -> pivot (no-op if already in the pivot format)
+        bounding_boxes = _xywh_to_xyxy(bounding_boxes, inplace)
+    elif old_format == BoundingBoxFormat.CXCYWH:
+        bounding_boxes = _cxcywh_to_xyxy(bounding_boxes, inplace)
+    elif old_format == BoundingBoxFormat.CXCYWHR:
+        bounding_boxes = _cxcywhr_to_xywhr(bounding_boxes, inplace)
+    elif old_format == BoundingBoxFormat.XYXYXYXY:
+        bounding_boxes = _xyxyxyxy_to_xywhr(bounding_boxes, inplace)
+
+    if new_format == BoundingBoxFormat.XYWH:  # step 2: pivot -> new_format (no-op if the pivot is the target)
+        bounding_boxes = _xyxy_to_xywh(bounding_boxes, inplace)
+    elif new_format == BoundingBoxFormat.CXCYWH:
+        bounding_boxes = _xyxy_to_cxcywh(bounding_boxes, inplace)
+    elif new_format == BoundingBoxFormat.CXCYWHR:
+        bounding_boxes = _xywhr_to_cxcywhr(bounding_boxes, inplace)
+    elif new_format == BoundingBoxFormat.XYXYXYXY:
+        bounding_boxes = _xywhr_to_xyxyxyxy(bounding_boxes, inplace)
+
+    return bounding_boxes
+
+
+def convert_bounding_box_format(
+    inpt: torch.Tensor,
+    old_format: Optional[BoundingBoxFormat] = None,
+    new_format: Optional[BoundingBoxFormat] = None,
+    inplace: bool = False,
+) -> torch.Tensor:
+    """See :func:`~torchvision.transforms.v2.ConvertBoundingBoxFormat` for details."""
+    # This being a kernel / functional hybrid, we need an option to pass `old_format` explicitly for pure tensor
+    # inputs as well as extract it from `tv_tensors.BoundingBoxes` inputs. However, putting a default value on
+    # `old_format` means we also need to put one on `new_format` to have syntactically correct Python. Here we mimic the
+    # default error that would be thrown if `new_format` had no default value.
+    if new_format is None:
+        raise TypeError("convert_bounding_box_format() missing 1 required argument: 'new_format'")
+
+    if not torch.jit.is_scripting():
+        _log_api_usage_once(convert_bounding_box_format)
+    # Format names given as strings are upper-cased and looked up by name in the BoundingBoxFormat enum.
+    if isinstance(old_format, str):
+        old_format = BoundingBoxFormat[old_format.upper()]
+    if isinstance(new_format, str):
+        new_format = BoundingBoxFormat[new_format.upper()]
+    # Plain tensors need an explicit `old_format`; BoundingBoxes inputs supply it through `inpt.format`.
+    if torch.jit.is_scripting() or is_pure_tensor(inpt):
+        if old_format is None:
+            raise ValueError("For pure tensor inputs, `old_format` has to be passed.")
+        return _convert_bounding_box_format(inpt, old_format=old_format, new_format=new_format, inplace=inplace)
+    elif isinstance(inpt, tv_tensors.BoundingBoxes):
+        if old_format is not None:
+            raise ValueError("For bounding box tv_tensor inputs, `old_format` must not be passed.")
+        output = _convert_bounding_box_format(
+            inpt.as_subclass(torch.Tensor), old_format=inpt.format, new_format=new_format, inplace=inplace
+        )
+        return tv_tensors.wrap(output, like=inpt, format=new_format)  # re-wrap, recording the new format
+    else:
+        raise TypeError(
+            f"Input can either be a plain tensor or a bounding box tv_tensor, but got {type(inpt)} instead."
+        )
+
+
+def _clamp_bounding_boxes(
+    bounding_boxes: torch.Tensor,
+    format: BoundingBoxFormat,
+    canvas_size: tuple[int, int],
+    clamping_mode: CLAMPING_MODE_TYPE,
+) -> torch.Tensor:
+    if clamping_mode is None:
+        return bounding_boxes.clone()  # clamping disabled: hand back an unmodified copy
+    # TODO: Investigate if it makes sense from a performance perspective to have an implementation for every
+    #  BoundingBoxFormat instead of converting back and forth
+    in_dtype = bounding_boxes.dtype  # remembered so the result can be cast back at the end
+    bounding_boxes = bounding_boxes.clone() if bounding_boxes.is_floating_point() else bounding_boxes.float()
+    xyxy_boxes = convert_bounding_box_format(
+        bounding_boxes, old_format=format, new_format=tv_tensors.BoundingBoxFormat.XYXY, inplace=True
+    )
+    # hard and soft modes are equivalent for non-rotated boxes
+    xyxy_boxes[..., 0::2].clamp_(min=0, max=canvas_size[1])  # even columns limited to [0, canvas_size[1]]
+    xyxy_boxes[..., 1::2].clamp_(min=0, max=canvas_size[0])  # odd columns limited to [0, canvas_size[0]]
+    out_boxes = convert_bounding_box_format(
+        xyxy_boxes, old_format=BoundingBoxFormat.XYXY, new_format=format, inplace=True
+    )
+    return out_boxes.to(in_dtype)
+
+
+def _order_bounding_boxes_points(
+    bounding_boxes: torch.Tensor, indices: Optional[torch.Tensor] = None
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Re-order points in bounding boxes based on specific criteria or provided indices.
+
+    This function reorders the points of bounding boxes either according to provided indices or
+    by a default ordering strategy. In the default strategy, (x1, y1) becomes the vertex that minimises
+    ``100 * x / max|x| + y / max|y|``: the normalised x value dominates the score and the normalised
+    y value mostly acts as a tie-breaker. The remaining vertices keep their cyclic order.
+
+    Args:
+        bounding_boxes (torch.Tensor): A tensor containing bounding box coordinates in format [x1, y1, x2, y2, x3, y3, x4, y4].
+        indices (torch.Tensor | None): Optional tensor containing indices for reordering. If None, default ordering is applied.
+
+    Returns:
+        tuple[torch.Tensor, torch.Tensor]: A tuple containing:
+            - indices: The indices used for reordering
+            - reordered_boxes: The bounding boxes with reordered points
+    """
+    if indices is None:
+        output_xyxyxyxy = bounding_boxes.reshape(-1, 8)
+        x, y = output_xyxyxyxy[..., 0::2], output_xyxyxyxy[..., 1::2]
+        y_max = torch.max(y.abs(), dim=1, keepdim=True)[0]  # per-box normalisers; NOTE(review): a zero maximum
+        x_max = torch.max(x.abs(), dim=1, keepdim=True)[0]  # would make the score below NaN/inf - confirm inputs
+        _, x1 = (y / y_max + (x / x_max) * 100).min(dim=1)  # argmin of the x-dominated score = start vertex
+        indices = torch.ones_like(output_xyxyxyxy)
+        indices[..., 0] = x1.mul(2)  # column holding the start vertex's x coordinate
+        indices.cumsum_(1).remainder_(8)  # start, start + 1, ... wrapping around after column 7
+    return indices, bounding_boxes.gather(1, indices.to(torch.int64))
+
+
+def _get_slope_and_intercept(box: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Calculate the slope and y-intercept of the lines defined by consecutive vertices in a bounding box.
+    This function computes the slope (a) and y-intercept (b) of ``y = a * x + b`` for each line segment,
+    where each line is defined by two consecutive vertices and the last vertex is paired with the first.
+    """
+    x, y = box[..., ::2], box[..., 1::2]  # even columns are x coordinates, odd columns are y coordinates
+    a = y.diff(append=y[..., 0:1]) / x.diff(append=x[..., 0:1])  # dy / dx; equal x values divide by zero
+    b = y - a * x  # intercept taken at each segment's starting vertex
+    return a, b
+
+
+def _get_intersection_point(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
+    """
+    Calculate the intersection point of two lines defined by their slopes and y-intercepts.
+    This function intersects every line with its predecessor (the first line with the fourth), where each
+    line is ``y = ax + b``. The result is laid out as ``batch_size`` rows of interleaved x, y values.
+    """
+    batch_size = a.shape[0]
+    x = b.diff(prepend=b[..., 3:4]).neg() / a.diff(prepend=a[..., 3:4])  # x = -(b_i - b_prev) / (a_i - a_prev)
+    y = a * x + b  # evaluate line i at the intersection abscissa
+    return torch.cat((x.unsqueeze(-1), y.unsqueeze(-1)), dim=-1).view(batch_size, 8)
+
+
+def _clamp_y_intercept(
+    bounding_boxes: torch.Tensor,
+    original_bounding_boxes: torch.Tensor,
+    canvas_size: tuple[int, int],
+    clamping_mode: CLAMPING_MODE_TYPE,
+) -> torch.Tensor:
+    """
+    Apply clamping to bounding box y-intercepts. This function handles two clamping strategies:
+    - Hard clamping: Ensures all box vertices stay within canvas boundaries, finding the largest
+      angle-preserving box enclosed within the original box and the image canvas.
+    - Soft clamping: Allows some vertices to extend beyond the canvas, finding the smallest
+      angle-preserving box that encloses the intersection of the original box and the image canvas.
+
+    The function first calculates the slopes and y-intercepts of the lines forming the bounding box,
+    then applies various constraints to ensure the clamping conditions are respected.
+    Returns the four adjusted y-intercepts (b1, b2, b3, b4) stacked along the last dimension.
+    """
+    # Calculate slopes and y-intercepts for bounding boxes
+    a, b = _get_slope_and_intercept(bounding_boxes)
+    a1, a2, a3, a4 = a.unbind(-1)
+    b1, b2, b3, b4 = b.unbind(-1)
+
+    # Get y-intercepts from original bounding boxes
+    _, bm = _get_slope_and_intercept(original_bounding_boxes)
+    b1m, b2m, b3m, b4m = bm.unbind(-1)
+
+    # Soft clamping: Clamp y-intercepts within canvas boundaries
+    b1 = b2.clamp(b1, b3).clamp(0, canvas_size[0])
+    b4 = b3.clamp(b2, b4).clamp(0, canvas_size[0])
+
+    if clamping_mode is not None and clamping_mode == "hard":
+        # Hard clamping: Average b1 and b4, and adjust b2 and b3 for maximum area
+        b1 = b4 = (b1 + b4) / 2
+
+        # Calculate candidate values for b2 based on geometric constraints
+        b2_candidates = torch.stack(
+            [
+                b1 * a2 / a1,  # Constraint at y=0
+                b3 * a2 / a3,  # Constraint at y=0
+                (a1 - a2) * canvas_size[1] + b1,  # Constraint at x=canvas_width
+                (a3 - a2) * canvas_size[1] + b3,  # Constraint at x=canvas_width
+            ],
+            dim=1,
+        )
+        # Take maximum value that doesn't exceed original b2
+        b2 = torch.max(b2_candidates, dim=1)[0].clamp(max=b2)
+
+        # Calculate candidate values for b3 based on geometric constraints
+        b3_candidates = torch.stack(
+            [
+                canvas_size[0] * (1 - a3 / a4) + b4 * a3 / a4,  # Constraint at y=canvas_height
+                canvas_size[0] * (1 - a3 / a2) + b2 * a3 / a2,  # Constraint at y=canvas_height
+                (a2 - a3) * canvas_size[1] + b2,  # Constraint at x=canvas_width
+                (a4 - a3) * canvas_size[1] + b4,  # Constraint at x=canvas_width
+            ],
+            dim=1,
+        )
+        # Take minimum value that doesn't go below original b3
+        b3 = torch.min(b3_candidates, dim=1)[0].clamp(min=b3)
+
+    # Final clamping to ensure y-intercepts are within original box bounds
+    b1.clamp_(b1m, b3m)  # b1 and b3 are confined to the original [b1m, b3m] interval
+    b3.clamp_(b1m, b3m)
+    b2.clamp_(b2m, b4m)  # b2 and b4 are confined to the original [b2m, b4m] interval
+    b4.clamp_(b2m, b4m)
+
+    return torch.stack([b1, b2, b3, b4], dim=-1)
+
+
+def _clamp_along_y_axis(
+    bounding_boxes: torch.Tensor,
+    original_bounding_boxes: torch.Tensor,
+    canvas_size: tuple[int, int],
+    clamping_mode: CLAMPING_MODE_TYPE,
+) -> torch.Tensor:
+    """
+    Adjusts bounding boxes along the y-axis based on specific conditions.
+
+    This function builds three candidate results (boxes rebuilt from clamped y-intercepts, boxes with
+    x1 and x4 clamped to 0, and all-zero boxes) and selects one per box from the conditions below;
+    boxes matching no condition are returned unchanged.
+
+    Args:
+        bounding_boxes (torch.Tensor): A tensor containing bounding box coordinates.
+        original_bounding_boxes (torch.Tensor): The original bounding boxes before any clamping is applied.
+        canvas_size (tuple[int, int]): The size of the canvas as (height, width).
+        clamping_mode (str, optional): The clamping strategy to use.
+
+    Returns:
+        torch.Tensor: The adjusted bounding boxes.
+    """
+    original_shape = bounding_boxes.shape
+    bounding_boxes = bounding_boxes.reshape(-1, 8)
+    original_bounding_boxes = original_bounding_boxes.reshape(-1, 8)
+
+    # Calculate slopes (a) for all lines; the intercepts (b) are replaced by their clamped version just below
+    a, b = _get_slope_and_intercept(bounding_boxes)
+    x1, y1, x2, y2, x3, y3, x4, y4 = bounding_boxes.unbind(-1)
+    b = _clamp_y_intercept(bounding_boxes, original_bounding_boxes, canvas_size, clamping_mode)
+
+    case_a = _get_intersection_point(a, b)  # vertices rebuilt from the original slopes and clamped intercepts
+    case_b = bounding_boxes.clone()
+    case_b[..., 0].clamp_(0)  # Clamp x1 to 0
+    case_b[..., 6].clamp_(0)  # Clamp x4 to 0
+    case_c = torch.zeros_like(case_b)  # box collapsed to all zeros
+
+    cond_a = (x1 < 0) & ~case_a.isnan().any(-1)  # First point is left of x=0 and case_a contains no NaN
+    cond_b = y1.isclose(y2) | y3.isclose(y4)  # Edge 1-2 or edge 3-4 has near-equal y at both ends
+    cond_c = (x1 <= 0) & (x2 <= 0) & (x3 <= 0) & (x4 <= 0)  # No point lies strictly right of x=0
+    cond_c = cond_c | y1.isclose(y4) | y2.isclose(y3) | (cond_b & x1.isclose(x2))  # or y1~y4, y2~y3, cond_b & x1~x2
+    # Conditions are applied in order, so a later match (b over a, c over b) overrides an earlier one.
+    for (cond, case) in zip(
+        [cond_a, cond_b, cond_c],
+        [case_a, case_b, case_c],
+    ):
+        bounding_boxes = torch.where(cond.unsqueeze(1).repeat(1, 8), case.reshape(-1, 8), bounding_boxes)
+
+    return bounding_boxes.reshape(original_shape)
+
+
+def _clamp_rotated_bounding_boxes(
+    bounding_boxes: torch.Tensor,
+    format: BoundingBoxFormat,
+    canvas_size: tuple[int, int],
+    clamping_mode: CLAMPING_MODE_TYPE,
+) -> torch.Tensor:
+    """
+    Clamp rotated bounding boxes to ensure they stay within the canvas boundaries.
+
+    This function handles rotated bounding boxes by:
+    1. Converting them to XYXYXYXY format (8 coordinates representing 4 corners).
+    2. Re-ordering the points so that (x1, y1) corresponds to the point with the lowest x value.
+    3. Translating (x1, y1), (x2, y2) and (x3, y3) so the box does not go beyond the left boundary of the canvas.
+    4. Rotating the bounding box four times and applying the same transformation to each vertex to ensure
+        the box does not go beyond the top, right, and bottom boundaries.
+    5. Converting back to the original format and re-ordering the points as in the original input.
+
+    Args:
+        bounding_boxes (torch.Tensor): Tensor containing rotated bounding box coordinates
+        format (BoundingBoxFormat): The format of the input bounding boxes
+        canvas_size (tuple[int, int]): The size of the canvas as (height, width)
+        clamping_mode (CLAMPING_MODE_TYPE): Clamping strategy; ``None`` returns an unmodified copy of the input
+
+    Returns:
+        torch.Tensor: Clamped bounding boxes in the original format and shape
+    """
+    if clamping_mode is None:
+        return bounding_boxes.clone()
+    original_shape = bounding_boxes.shape
+    bounding_boxes = bounding_boxes.clone()  # work on a copy so the in-place conversion leaves the input alone
+    out_boxes = (
+        convert_bounding_box_format(
+            bounding_boxes, old_format=format, new_format=tv_tensors.BoundingBoxFormat.XYXYXYXY, inplace=True
+        )
+    ).reshape(-1, 8)
+
+    original_boxes = out_boxes.clone()  # unclamped reference, rotated in lockstep with out_boxes
+    for _ in range(4):  # One pass per 90-degree rotation: clamp in the current frame, then rotate.
+        indices, out_boxes = _order_bounding_boxes_points(out_boxes)
+        _, original_boxes = _order_bounding_boxes_points(original_boxes, indices)
+        out_boxes = _clamp_along_y_axis(out_boxes, original_boxes, canvas_size, clamping_mode)
+        _, out_boxes = _order_bounding_boxes_points(out_boxes, indices)
+        _, original_boxes = _order_bounding_boxes_points(original_boxes, indices)
+        # rotate 90 degrees counter-clockwise: (x, y) -> (y, canvas_size[1] - x)
+        out_boxes[:, ::2], out_boxes[:, 1::2] = (
+            out_boxes[:, 1::2].clone(),
+            canvas_size[1] - out_boxes[:, ::2].clone(),
+        )
+        original_boxes[:, ::2], original_boxes[:, 1::2] = (
+            original_boxes[:, 1::2].clone(),
+            canvas_size[1] - original_boxes[:, ::2].clone(),
+        )
+        canvas_size = (canvas_size[1], canvas_size[0])  # height and width swap with every quarter turn
+
+    out_boxes = convert_bounding_box_format(
+        out_boxes, old_format=tv_tensors.BoundingBoxFormat.XYXYXYXY, new_format=format, inplace=True
+    ).reshape(original_shape)
+
+    return out_boxes
+
+
+def clamp_bounding_boxes(
+    inpt: torch.Tensor,
+    format: Optional[BoundingBoxFormat] = None,
+    canvas_size: Optional[tuple[int, int]] = None,
+    clamping_mode: Union[CLAMPING_MODE_TYPE, str] = "auto",
+) -> torch.Tensor:
+    """See :func:`~torchvision.transforms.v2.ClampBoundingBoxes` for details."""
+    if not torch.jit.is_scripting():
+        _log_api_usage_once(clamp_bounding_boxes)
+
+    if clamping_mode is not None and clamping_mode not in ("soft", "hard", "auto"):
+        raise ValueError(f"clamping_mode must be soft, hard, auto or None, got {clamping_mode}")
+
+    if torch.jit.is_scripting() or is_pure_tensor(inpt):
+        # Plain tensors carry no metadata: format, canvas size and a concrete (non-"auto") mode are all required.
+        if format is None or canvas_size is None or (clamping_mode is not None and clamping_mode == "auto"):
+            raise ValueError("For pure tensor inputs, `format`, `canvas_size` and `clamping_mode` have to be passed.")
+        if tv_tensors.is_rotated_bounding_format(format):
+            return _clamp_rotated_bounding_boxes(
+                inpt, format=format, canvas_size=canvas_size, clamping_mode=clamping_mode
+            )
+        else:
+            return _clamp_bounding_boxes(inpt, format=format, canvas_size=canvas_size, clamping_mode=clamping_mode)
+    elif isinstance(inpt, tv_tensors.BoundingBoxes):
+        if format is not None or canvas_size is not None:
+            raise ValueError("For bounding box tv_tensor inputs, `format` and `canvas_size` must not be passed.")
+        if clamping_mode is not None and clamping_mode == "auto":
+            clamping_mode = inpt.clamping_mode  # "auto" defers to the mode stored on the BoundingBoxes object
+        if tv_tensors.is_rotated_bounding_format(inpt.format):
+            output = _clamp_rotated_bounding_boxes(
+                inpt.as_subclass(torch.Tensor),
+                format=inpt.format,
+                canvas_size=inpt.canvas_size,
+                clamping_mode=clamping_mode,
+            )
+        else:
+            output = _clamp_bounding_boxes(
+                inpt.as_subclass(torch.Tensor),
+                format=inpt.format,
+                canvas_size=inpt.canvas_size,
+                clamping_mode=clamping_mode,
+            )
+        return tv_tensors.wrap(output, like=inpt)  # re-wrap the plain result like the input
+    else:
+        raise TypeError(
+            f"Input can either be a plain tensor or a bounding box tv_tensor, but got {type(inpt)} instead."
+        )
+
+
+def _clamp_keypoints(keypoints: torch.Tensor, canvas_size: tuple[int, int]) -> torch.Tensor:
+    dtype = keypoints.dtype  # restored on return
+    keypoints = keypoints.clone() if keypoints.is_floating_point() else keypoints.float()
+    # Note that the max is canvas_size[i] - 1 and not canvas_size[i] as it is for
+    # bounding boxes.
+    keypoints[..., 0].clamp_(min=0, max=canvas_size[1] - 1)  # first coordinate against canvas_size[1]
+    keypoints[..., 1].clamp_(min=0, max=canvas_size[0] - 1)  # second coordinate against canvas_size[0]
+    return keypoints.to(dtype=dtype)
+
+
+def clamp_keypoints(
+    inpt: torch.Tensor,
+    canvas_size: Optional[tuple[int, int]] = None,
+) -> torch.Tensor:
+    """See :func:`~torchvision.transforms.v2.ClampKeyPoints` for details."""
+    if not torch.jit.is_scripting():
+        _log_api_usage_once(clamp_keypoints)
+
+    if torch.jit.is_scripting() or is_pure_tensor(inpt):
+        # Plain tensors carry no canvas metadata, so the caller has to provide it.
+        if canvas_size is None:
+            raise ValueError("For pure tensor inputs, `canvas_size` has to be passed.")
+        return _clamp_keypoints(inpt, canvas_size=canvas_size)
+    elif isinstance(inpt, tv_tensors.KeyPoints):
+        if canvas_size is not None:
+            raise ValueError("For keypoints tv_tensor inputs, `canvas_size` must not be passed.")
+        output = _clamp_keypoints(inpt.as_subclass(torch.Tensor), canvas_size=inpt.canvas_size)
+        return tv_tensors.wrap(output, like=inpt)  # re-wrap the plain result like the input
+    else:
+        raise TypeError(f"Input can either be a plain tensor or a keypoints tv_tensor, but got {type(inpt)} instead.")
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_misc.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..daf263df046f2767a65ef0a7ee70ea2b62d813f9
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_misc.py
@@ -0,0 +1,517 @@
+import math
+from typing import Optional
+
+import PIL.Image
+import torch
+from torch.nn.functional import conv2d, pad as torch_pad
+
+from torchvision import tv_tensors
+from torchvision.transforms._functional_tensor import _max_value
+from torchvision.transforms.functional import pil_to_tensor, to_pil_image
+
+from torchvision.utils import _log_api_usage_once
+
+from ._meta import _convert_bounding_box_format
+
+from ._utils import _get_kernel, _register_kernel_internal, is_pure_tensor
+
+
+def normalize(
+    inpt: torch.Tensor,
+    mean: list[float],
+    std: list[float],
+    inplace: bool = False,
+) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.Normalize` for details."""
+    if torch.jit.is_scripting():
+        return normalize_image(inpt, mean=mean, std=std, inplace=inplace)  # scripted code calls the image kernel
+
+    _log_api_usage_once(normalize)
+
+    kernel = _get_kernel(normalize, type(inpt))  # kernel registered for this input type
+    return kernel(inpt, mean=mean, std=std, inplace=inplace)
+
+
+@_register_kernel_internal(normalize, torch.Tensor)
+@_register_kernel_internal(normalize, tv_tensors.Image)
+def normalize_image(image: torch.Tensor, mean: list[float], std: list[float], inplace: bool = False) -> torch.Tensor:
+    if not image.is_floating_point():
+        raise TypeError(f"Input tensor should be a float tensor. Got {image.dtype}.")
+
+    if image.ndim < 3:
+        raise ValueError(f"Expected tensor to be a tensor image of size (..., C, H, W). Got {image.shape}.")
+    # Reject a zero std up front (any zero entry of a sequence, or a zero scalar) instead of dividing by it.
+    if isinstance(std, (tuple, list)):
+        divzero = not all(std)
+    elif isinstance(std, (int, float)):
+        divzero = std == 0
+    else:
+        divzero = False
+    if divzero:
+        raise ValueError("std evaluated to zero, leading to division by zero.")
+
+    dtype = image.dtype
+    device = image.device
+    mean = torch.as_tensor(mean, dtype=dtype, device=device)
+    std = torch.as_tensor(std, dtype=dtype, device=device)
+    if mean.ndim == 1:
+        mean = mean.view(-1, 1, 1)  # per-channel values broadcast over the last two dimensions
+    if std.ndim == 1:
+        std = std.view(-1, 1, 1)
+
+    if inplace:
+        image = image.sub_(mean)  # mutates the caller's tensor
+    else:
+        image = image.sub(mean)  # fresh tensor, so the in-place division below leaves the input untouched
+
+    return image.div_(std)
+
+
+@_register_kernel_internal(normalize, tv_tensors.Video)
+def normalize_video(video: torch.Tensor, mean: list[float], std: list[float], inplace: bool = False) -> torch.Tensor:
+    return normalize_image(video, mean, std, inplace=inplace)  # videos reuse the image kernel unchanged
+
+
+def gaussian_blur(inpt: torch.Tensor, kernel_size: list[int], sigma: Optional[list[float]] = None) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.GaussianBlur` for details."""
+    if torch.jit.is_scripting():
+        return gaussian_blur_image(inpt, kernel_size=kernel_size, sigma=sigma)  # scripted code uses the image kernel
+
+    _log_api_usage_once(gaussian_blur)
+
+    kernel = _get_kernel(gaussian_blur, type(inpt))  # kernel registered for this input type
+    return kernel(inpt, kernel_size=kernel_size, sigma=sigma)
+
+
+def _get_gaussian_kernel1d(kernel_size: int, sigma: float, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
+    lim = (kernel_size - 1) / (2.0 * math.sqrt(2.0))  # half-width pre-divided by sqrt(2): x**2 == t**2 / 2
+    x = torch.linspace(-lim, lim, steps=kernel_size, dtype=dtype, device=device)
+    kernel1d = torch.softmax(x.div(sigma).pow(2).neg(), dim=0)  # softmax(-t**2 / (2 * sigma**2)): taps sum to 1
+    return kernel1d
+
+
+def _get_gaussian_kernel2d(
+    kernel_size: list[int], sigma: list[float], dtype: torch.dtype, device: torch.device
+) -> torch.Tensor:
+    kernel1d_x = _get_gaussian_kernel1d(kernel_size[0], sigma[0], dtype, device)
+    kernel1d_y = _get_gaussian_kernel1d(kernel_size[1], sigma[1], dtype, device)
+    kernel2d = kernel1d_y.unsqueeze(-1) * kernel1d_x  # outer product: rows follow y, columns follow x
+    return kernel2d
+
+
+@_register_kernel_internal(gaussian_blur, torch.Tensor)
+@_register_kernel_internal(gaussian_blur, tv_tensors.Image)
+def gaussian_blur_image(
+    image: torch.Tensor, kernel_size: list[int], sigma: Optional[list[float]] = None
+) -> torch.Tensor:
+    # TODO: consider deprecating integers from sigma in the future
+    if isinstance(kernel_size, int):
+        kernel_size = [kernel_size, kernel_size]
+    elif len(kernel_size) != 2:
+        raise ValueError(f"If kernel_size is a sequence its length should be 2. Got {len(kernel_size)}")
+    for ksize in kernel_size:
+        if ksize % 2 == 0 or ksize < 0:
+            raise ValueError(f"kernel_size should have odd and positive integers. Got {kernel_size}")
+
+    if sigma is None:
+        sigma = [ksize * 0.15 + 0.35 for ksize in kernel_size]  # default sigma derived from each kernel size
+    else:
+        if isinstance(sigma, (list, tuple)):
+            length = len(sigma)
+            if length == 1:
+                s = sigma[0]
+                sigma = [s, s]
+            elif length != 2:
+                raise ValueError(f"If sigma is a sequence, its length should be 2. Got {length}")
+        elif isinstance(sigma, (int, float)):
+            s = float(sigma)
+            sigma = [s, s]
+        else:
+            raise TypeError(f"sigma should be either float or sequence of floats. Got {type(sigma)}")
+    for s in sigma:
+        if s <= 0.0:
+            raise ValueError(f"sigma should have positive values. Got {sigma}")
+
+    if image.numel() == 0:
+        return image  # empty input: returned as-is, after the argument validation above
+
+    dtype = image.dtype
+    shape = image.shape
+    ndim = image.ndim
+    if ndim == 3:
+        image = image.unsqueeze(dim=0)  # add the batch dimension conv2d expects
+    elif ndim > 4:
+        image = image.reshape((-1,) + shape[-3:])  # fold all leading dimensions into one batch dimension
+
+    fp = torch.is_floating_point(image)
+    kernel = _get_gaussian_kernel2d(kernel_size, sigma, dtype=dtype if fp else torch.float32, device=image.device)
+    kernel = kernel.expand(shape[-3], 1, kernel.shape[0], kernel.shape[1])  # one copy per channel (see groups=)
+
+    output = image if fp else image.to(dtype=torch.float32)  # non-float inputs are blurred in float32
+
+    # padding = (left, right, top, bottom)
+    padding = [kernel_size[0] // 2, kernel_size[0] // 2, kernel_size[1] // 2, kernel_size[1] // 2]
+    output = torch_pad(output, padding, mode="reflect")
+    output = conv2d(output, kernel, groups=shape[-3])  # groups == channels: each channel is filtered separately
+
+    if ndim == 3:
+        output = output.squeeze(dim=0)
+    elif ndim > 4:
+        output = output.reshape(shape)
+
+    if not fp:
+        output = output.round_().to(dtype=dtype)  # round before casting back to the original dtype
+
+    return output
+
+
+@_register_kernel_internal(gaussian_blur, PIL.Image.Image)
+def _gaussian_blur_image_pil(
+    image: PIL.Image.Image, kernel_size: list[int], sigma: Optional[list[float]] = None
+) -> PIL.Image.Image:
+    t_img = pil_to_tensor(image)  # PIL -> tensor so the tensor kernel can do the work
+    output = gaussian_blur_image(t_img, kernel_size=kernel_size, sigma=sigma)
+    return to_pil_image(output, mode=image.mode)  # back to PIL, keeping the source image mode
+
+
+@_register_kernel_internal(gaussian_blur, tv_tensors.Video)
+def gaussian_blur_video(
+    video: torch.Tensor, kernel_size: list[int], sigma: Optional[list[float]] = None
+) -> torch.Tensor:
+    return gaussian_blur_image(video, kernel_size, sigma)  # videos reuse the image kernel unchanged
+
+
+def gaussian_noise(inpt: torch.Tensor, mean: float = 0.0, sigma: float = 0.1, clip: bool = True) -> torch.Tensor:
+    """Add Gaussian noise to ``inpt``; see :class:`~torchvision.transforms.v2.GaussianNoise` for details."""
+    if torch.jit.is_scripting():
+        return gaussian_noise_image(inpt, mean=mean, sigma=sigma, clip=clip)  # forward clip so clip=False is honored
+
+    _log_api_usage_once(gaussian_noise)
+
+    kernel = _get_kernel(gaussian_noise, type(inpt))  # kernel registered for this input type
+    return kernel(inpt, mean=mean, sigma=sigma, clip=clip)
+
+
+@_register_kernel_internal(gaussian_noise, torch.Tensor)
+@_register_kernel_internal(gaussian_noise, tv_tensors.Image)
+def gaussian_noise_image(image: torch.Tensor, mean: float = 0.0, sigma: float = 0.1, clip: bool = True) -> torch.Tensor:
+    if sigma < 0:
+        raise ValueError(f"sigma shouldn't be negative. Got {sigma}")
+
+    if image.is_floating_point():
+        noise = mean + torch.randn_like(image) * sigma
+        out = image + noise
+        if clip:
+            out = torch.clamp(out, 0, 1)  # float results are clipped to [0, 1]
+        return out
+
+    elif image.dtype == torch.uint8:
+        # Convert to intermediate dtype int16 to add to input more efficiently
+        # See https://github.com/pytorch/vision/pull/9169 for alternative implementations and benchmark
+        noise = ((mean * 255) + torch.randn_like(image, dtype=torch.float32) * (sigma * 255)).to(torch.int16)
+        out = image + noise  # mean and sigma were scaled by 255 above to match the uint8 value range
+
+        if clip:
+            out = torch.clamp(out, 0, 255)  # uint8 results are clipped to [0, 255] before the cast
+        return out.to(torch.uint8)
+
+    else:
+        raise ValueError(f"Input tensor is expected to be in uint8 or float dtype, got dtype={image.dtype}")
+
+
+@_register_kernel_internal(gaussian_noise, tv_tensors.Video)
+def gaussian_noise_video(video: torch.Tensor, mean: float = 0.0, sigma: float = 0.1, clip: bool = True) -> torch.Tensor:
+    return gaussian_noise_image(video, mean=mean, sigma=sigma, clip=clip)  # videos reuse the image kernel
+
+
+@_register_kernel_internal(gaussian_noise, PIL.Image.Image)
+def _gaussian_noise_pil(
+    video: torch.Tensor, mean: float = 0.0, sigma: float = 0.1, clip: bool = True
+) -> PIL.Image.Image:
+    raise ValueError("Gaussian Noise is not implemented for PIL images.")  # registered for PIL only to fail loudly
+
+
+def to_dtype(inpt: torch.Tensor, dtype: torch.dtype = torch.float, scale: bool = False) -> torch.Tensor:
+    """See :func:`~torchvision.transforms.v2.ToDtype` for details."""
+    if torch.jit.is_scripting():
+        return to_dtype_image(inpt, dtype=dtype, scale=scale)  # scripted code calls the image kernel directly
+
+    _log_api_usage_once(to_dtype)
+
+    kernel = _get_kernel(to_dtype, type(inpt))  # kernel registered for this input type
+    return kernel(inpt, dtype=dtype, scale=scale)
+
+
+def _num_value_bits(dtype: torch.dtype) -> int:
+    if dtype == torch.uint8:
+        return 8
+    elif dtype == torch.int8:
+        return 7  # signed dtypes report one bit less than their width
+    elif dtype == torch.int16:
+        return 15
+    elif dtype == torch.uint16:
+        return 16
+    elif dtype == torch.int32:
+        return 31
+    elif dtype == torch.int64:
+        return 63
+    else:
+        raise TypeError(f"Number of value bits is only defined for integer dtypes, but got {dtype}.")
+
+
+@_register_kernel_internal(to_dtype, torch.Tensor)
+@_register_kernel_internal(to_dtype, tv_tensors.Image)
+def to_dtype_image(image: torch.Tensor, dtype: torch.dtype = torch.float, scale: bool = False) -> torch.Tensor:
+    """Cast ``image`` to ``dtype``; with ``scale=True`` the values are also rescaled to the target dtype's range."""
+    if image.dtype == dtype:
+        return image  # already the requested dtype: the input itself is returned
+    elif not scale:
+        return image.to(dtype)  # plain cast, values are not rescaled
+
+    float_input = image.is_floating_point()
+    if torch.jit.is_scripting():
+        # TODO: remove this branch as soon as `dtype.is_floating_point` is supported by JIT
+        float_output = torch.tensor(0, dtype=dtype).is_floating_point()
+    else:
+        float_output = dtype.is_floating_point
+
+    if float_input:
+        # float to float
+        if float_output:
+            return image.to(dtype)
+
+        # float to int
+        if (image.dtype == torch.float32 and dtype in (torch.int32, torch.int64)) or (
+            image.dtype == torch.float64 and dtype == torch.int64
+        ):
+            raise RuntimeError(f"The conversion from {image.dtype} to {dtype} cannot be performed safely.")
+
+        # For data in the range `[0.0, 1.0]`, just multiplying by the maximum value of the integer range and converting
+        # to the integer dtype  is not sufficient. For example, `torch.rand(...).mul(255).to(torch.uint8)` will only
+        # be `255` if the input is exactly `1.0`. See https://github.com/pytorch/vision/pull/2078#issuecomment-612045321
+        # for a detailed analysis.
+        # To mitigate this, we could round before we convert to the integer dtype, but this is an extra operation.
+        # Instead, we can also multiply by the maximum value plus something close to `1`. See
+        # https://github.com/pytorch/vision/pull/2078#issuecomment-613524965 for details.
+        eps = 1e-3
+        max_value = float(_max_value(dtype))
+        # We need to scale first since the conversion would otherwise turn the input range `[0.0, 1.0]` into the
+        # discrete set `{0, 1}`.
+        return image.mul(max_value + 1.0 - eps).to(dtype)
+    else:
+        # int to float
+        if float_output:
+            return image.to(dtype).mul_(1.0 / _max_value(image.dtype))  # cast first, then divide by the input max
+
+        # int to int
+        num_value_bits_input = _num_value_bits(image.dtype)
+        num_value_bits_output = _num_value_bits(dtype)
+
+        # TODO: Remove if/else inner blocks once uint16 dtype supports bitwise shift operations.
+        shift_by = abs(num_value_bits_input - num_value_bits_output)
+        if num_value_bits_input > num_value_bits_output:
+            if image.dtype == torch.uint16:
+                return (image / 2 ** (shift_by)).to(dtype)  # division stands in for the right shift on uint16
+            else:
+                return image.bitwise_right_shift(shift_by).to(dtype)  # narrowing: shift first, then cast
+        else:
+            if dtype == torch.uint16:
+                return image.to(dtype) * 2 ** (shift_by)  # multiplication stands in for the left shift on uint16
+            else:
+                return image.to(dtype).bitwise_left_shift_(shift_by)  # widening: cast first, then shift in place
+
+
+# We encourage users to use to_dtype() instead but we keep this for BC
+def convert_image_dtype(image: torch.Tensor, dtype: torch.dtype = torch.float32) -> torch.Tensor:
+    """[DEPRECATED] Use to_dtype() instead. Equivalent to ``to_dtype_image(image, dtype=dtype, scale=True)``."""
+    return to_dtype_image(image, dtype=dtype, scale=True)
+
+
+@_register_kernel_internal(to_dtype, tv_tensors.Video)
+def to_dtype_video(video: torch.Tensor, dtype: torch.dtype = torch.float, scale: bool = False) -> torch.Tensor:
+    return to_dtype_image(video, dtype, scale=scale)  # videos reuse the image kernel unchanged
+
+
+@_register_kernel_internal(to_dtype, tv_tensors.KeyPoints, tv_tensor_wrapper=False)
+@_register_kernel_internal(to_dtype, tv_tensors.BoundingBoxes, tv_tensor_wrapper=False)
+@_register_kernel_internal(to_dtype, tv_tensors.Mask, tv_tensor_wrapper=False)
+def _to_dtype_tensor_dispatch(inpt: torch.Tensor, dtype: torch.dtype, scale: bool = False) -> torch.Tensor:
+    # We don't need to unwrap and rewrap here, since TVTensor.to() preserves the type. `scale` is ignored here.
+    return inpt.to(dtype)
+
+
+def sanitize_bounding_boxes(
+    bounding_boxes: torch.Tensor,
+    format: Optional[tv_tensors.BoundingBoxFormat] = None,
+    canvas_size: Optional[tuple[int, int]] = None,
+    min_size: float = 1.0,
+    min_area: float = 1.0,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Remove degenerate/invalid bounding boxes and return the corresponding indexing mask.
+
+    This removes bounding boxes that:
+
+    - are below a given ``min_size`` or ``min_area``: by default this also removes degenerate boxes that have e.g. X2 <= X1.
+    - have any coordinate outside of their corresponding image. You may want to
+      call :func:`~torchvision.transforms.v2.functional.clamp_bounding_boxes` first to avoid undesired removals.
+
+    It is recommended to call it at the end of a pipeline, before passing the
+    input to the models. It is critical to call this transform if
+    :class:`~torchvision.transforms.v2.RandomIoUCrop` was called.
+    If you want to be extra careful, you may call it after all transforms that
+    may modify bounding boxes but once at the end should be enough in most
+    cases.
+
+    Args:
+        bounding_boxes (Tensor or :class:`~torchvision.tv_tensors.BoundingBoxes`): The bounding boxes to be sanitized.
+        format (str or :class:`~torchvision.tv_tensors.BoundingBoxFormat`, optional): The format of the bounding boxes.
+            Must be left as ``None`` if ``bounding_boxes`` is a :class:`~torchvision.tv_tensors.BoundingBoxes` object.
+        canvas_size (tuple of int, optional): The canvas_size of the bounding boxes
+            (size of the corresponding image/video).
+            Must be left as ``None`` if ``bounding_boxes`` is a :class:`~torchvision.tv_tensors.BoundingBoxes` object.
+        min_size (float, optional): The size below which bounding boxes are removed. Default is 1.
+        min_area (float, optional): The area below which bounding boxes are removed. Default is 1.
+
+    Returns:
+        out (tuple of Tensors): The subset of valid bounding boxes, and the corresponding indexing mask.
+        The mask can then be used to subset other tensors (e.g. labels) that are associated with the bounding boxes.
+    """
+    if torch.jit.is_scripting() or is_pure_tensor(bounding_boxes):
+        if format is None or canvas_size is None:
+            raise ValueError(
+                "format and canvas_size cannot be None if bounding_boxes is a pure tensor. "
+                f"Got format={format} and canvas_size={canvas_size}. "
+                "Set those to appropriate values or pass bounding_boxes as a tv_tensors.BoundingBoxes object."
+            )
+        if isinstance(format, str):
+            format = tv_tensors.BoundingBoxFormat[format.upper()]
+        valid = _get_sanitize_bounding_boxes_mask(
+            bounding_boxes, format=format, canvas_size=canvas_size, min_size=min_size, min_area=min_area
+        )
+        bounding_boxes = bounding_boxes[valid]
+    else:
+        if not isinstance(bounding_boxes, tv_tensors.BoundingBoxes):
+            raise ValueError("bounding_boxes must be a tv_tensors.BoundingBoxes instance or a pure tensor.")
+        if format is not None or canvas_size is not None:
+            raise ValueError(
+                "format and canvas_size must be None when bounding_boxes is a tv_tensors.BoundingBoxes instance. "
+                f"Got format={format} and canvas_size={canvas_size}. "
+                "Leave those to None or pass bounding_boxes as a pure tensor."
+            )
+        valid = _get_sanitize_bounding_boxes_mask(
+            bounding_boxes,
+            format=bounding_boxes.format,
+            canvas_size=bounding_boxes.canvas_size,
+            min_size=min_size,
+            min_area=min_area,
+        )
+        bounding_boxes = tv_tensors.wrap(bounding_boxes[valid], like=bounding_boxes)
+
+    return bounding_boxes, valid
+
+
+def _get_sanitize_bounding_boxes_mask(
+    bounding_boxes: torch.Tensor,
+    format: tv_tensors.BoundingBoxFormat,
+    canvas_size: tuple[int, int],
+    min_size: float = 1.0,
+    min_area: float = 1.0,
+) -> torch.Tensor:
+    """Return a boolean mask that is True for boxes that pass the size, area and in-canvas checks."""
+    is_rotated = tv_tensors.is_rotated_bounding_format(format)
+    intermediate_format = tv_tensors.BoundingBoxFormat.XYXYXYXY if is_rotated else tv_tensors.BoundingBoxFormat.XYXY
+    bounding_boxes = _convert_bounding_box_format(bounding_boxes, new_format=intermediate_format, old_format=format)
+
+    image_h, image_w = canvas_size
+    if is_rotated:
+        # XYXYXYXY is (x1, y1, x2, y2, x3, y3, x4, y4): edge 1->2 gives the width, edge 2->3 the height.
+        dx12 = bounding_boxes[..., 0] - bounding_boxes[..., 2]
+        dy12 = bounding_boxes[..., 1] - bounding_boxes[..., 3]
+        dx23 = bounding_boxes[..., 2] - bounding_boxes[..., 4]
+        dy23 = bounding_boxes[..., 3] - bounding_boxes[..., 5]
+        ws = torch.sqrt(dx12**2 + dy12**2)
+        hs = torch.sqrt(dx23**2 + dy23**2)
+    else:
+        ws, hs = bounding_boxes[:, 2] - bounding_boxes[:, 0], bounding_boxes[:, 3] - bounding_boxes[:, 1]
+    valid = (ws >= min_size) & (hs >= min_size) & (bounding_boxes >= 0).all(dim=-1) & (ws * hs >= min_area)
+    # TODO: Do we really need to check for out of bounds here? All
+    # transforms should be clamping anyway, so this should never happen?
+    valid &= (bounding_boxes[:, 0] <= image_w) & (bounding_boxes[:, 2] <= image_w)
+    valid &= (bounding_boxes[:, 1] <= image_h) & (bounding_boxes[:, 3] <= image_h)
+    if is_rotated:
+        valid &= (bounding_boxes[..., 4] <= image_w) & (bounding_boxes[..., 5] <= image_h)
+        valid &= (bounding_boxes[..., 6] <= image_w) & (bounding_boxes[..., 7] <= image_h)
+    return valid
+
+
+def sanitize_keypoints(
+    key_points: torch.Tensor,
+    canvas_size: Optional[tuple[int, int]] = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Remove keypoints outside of the image area and return the corresponding indexing mask.
+
+    This transform removes keypoints or groups of keypoints that
+    have coordinates outside of their corresponding image.
+    If you would instead like to clamp such keypoints to the image edges, use
+    :class:`~torchvision.transforms.v2.ClampKeyPoints`.
+
+    It is recommended to call it at the end of a pipeline, before passing the
+    input to the models.
+
+    Keypoints can be passed as a set of individual keypoints or as a set of objects
+    (e.g., polygons or polygonal chains) consisting of a fixed number of keypoints of shape ``[..., 2]``.
+    When groups of keypoints are passed (i.e., an at least 3-dimensional tensor),
+    this transform will only remove entire groups, not individual keypoints within a group.
+
+    Args:
+        key_points (Tensor or :class:`~torchvision.tv_tensors.KeyPoints`): The keypoints to be sanitized.
+        canvas_size (tuple of int, optional): The canvas_size of the keypoints
+            (size of the corresponding image/video).
+            Must be left as ``None`` if ``key_points`` is a :class:`~torchvision.tv_tensors.KeyPoints` object.
+
+    Returns:
+        out (tuple of Tensors): The subset of valid keypoints, and the corresponding indexing mask.
+        The mask can then be used to subset other tensors (e.g. labels) that are associated with the keypoints.
+    """
+    if torch.jit.is_scripting() or is_pure_tensor(key_points):
+        if canvas_size is None:
+            raise ValueError(
+                "canvas_size cannot be None if key_points is a pure tensor. "
+                "Set it to an appropriate value or pass key_points as a tv_tensors.KeyPoints object."
+            )
+        valid = _get_sanitize_keypoints_mask(
+            key_points,
+            canvas_size=canvas_size,
+        )
+        key_points = key_points[valid]
+    else:
+        if not isinstance(key_points, tv_tensors.KeyPoints):
+            raise ValueError("key_points must be a tv_tensors.KeyPoints instance or a pure tensor.")
+        if canvas_size is not None:
+            raise ValueError(
+                "canvas_size must be None when key_points is a tv_tensors.KeyPoints instance. "
+                f"Got canvas_size={canvas_size}. "
+                "Leave it to None or pass key_points as a pure tensor."
+            )
+        valid = _get_sanitize_keypoints_mask(
+            key_points,
+            canvas_size=key_points.canvas_size,
+        )
+        key_points = tv_tensors.wrap(key_points[valid], like=key_points)
+
+    return key_points, valid
+
+
+def _get_sanitize_keypoints_mask(
+    key_points: torch.Tensor,
+    canvas_size: tuple[int, int],
+) -> torch.Tensor:
+    """Return a boolean mask of the keypoints (or keypoint groups) lying fully inside the canvas."""
+    height, width = canvas_size
+    xs = key_points[..., 0]
+    ys = key_points[..., 1]
+    inside = (xs >= 0) & (xs < width) & (ys >= 0) & (ys < height)
+    if inside.ndim > 1:
+        # Grouped keypoints: a group is kept only when every one of its points is inside.
+        inside = inside.flatten(start_dim=1).all(dim=1)
+    return inside
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_temporal.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_temporal.py
new file mode 100644
index 0000000000000000000000000000000000000000..f932b06a295fd10316fba3e796ec4649053e92db
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_temporal.py
@@ -0,0 +1,27 @@
+import torch
+
+from torchvision import tv_tensors
+
+from torchvision.utils import _log_api_usage_once
+
+from ._utils import _get_kernel, _register_kernel_internal
+
+
+def uniform_temporal_subsample(inpt: torch.Tensor, num_samples: int) -> torch.Tensor:
+    """See :class:`~torchvision.transforms.v2.UniformTemporalSubsample` for details."""
+    # Under TorchScript the video kernel is called directly instead of going through the registry lookup.
+    if torch.jit.is_scripting():
+        return uniform_temporal_subsample_video(inpt, num_samples=num_samples)
+    _log_api_usage_once(uniform_temporal_subsample)
+    # Eager mode: pick the kernel registered for the type of the input.
+    dispatch = _get_kernel(uniform_temporal_subsample, type(inpt))
+    return dispatch(inpt, num_samples=num_samples)
+
+
+@_register_kernel_internal(uniform_temporal_subsample, torch.Tensor)
+@_register_kernel_internal(uniform_temporal_subsample, tv_tensors.Video)
+def uniform_temporal_subsample_video(video: torch.Tensor, num_samples: int) -> torch.Tensor:
+    # Reference: https://github.com/facebookresearch/pytorchvideo/blob/a0a131e/pytorchvideo/transforms/functional.py#L19
+    last_index = video.shape[-4] - 1  # frames are selected along dim -4
+    frame_idx = torch.linspace(0, last_index, num_samples, device=video.device).long()
+    return video.index_select(-4, frame_idx)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_type_conversion.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_type_conversion.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5a731fe143c365400d5905db8370c538097583a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_type_conversion.py
@@ -0,0 +1,27 @@
+from typing import Union
+
+import numpy as np
+import PIL.Image
+import torch
+from torchvision import tv_tensors
+from torchvision.transforms import functional as _F
+
+
+@torch.jit.unused
+def to_image(inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray]) -> tv_tensors.Image:
+    """See :class:`~torchvision.transforms.v2.ToImage` for details."""
+    if isinstance(inpt, torch.Tensor):
+        return tv_tensors.Image(inpt)
+    if isinstance(inpt, PIL.Image.Image):
+        return tv_tensors.Image(pil_to_tensor(inpt))
+    if isinstance(inpt, np.ndarray):
+        # Promote to at least 3 dims, then move axis 2 to the front (presumably HWC -> CHW).
+        reordered = torch.from_numpy(np.atleast_3d(inpt)).permute((2, 0, 1)).contiguous()
+        return tv_tensors.Image(reordered)
+    raise TypeError(
+        f"Input can either be a pure Tensor, a numpy array, or a PIL image, but got {type(inpt)} instead."
+    )
+
+
+to_pil_image = _F.to_pil_image  # re-exported unchanged from torchvision.transforms.functional
+pil_to_tensor = _F.pil_to_tensor  # same; to_image() looks this name up at call time, so defining it last is fine
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_utils.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b857285c891c8502ff95ed7c3ac998953aa04170
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/transforms/v2/functional/_utils.py
@@ -0,0 +1,142 @@
+import functools
+from collections.abc import Sequence
+from typing import Any, Callable, Optional, Union
+
+import torch
+from torchvision import tv_tensors
+
+_FillType = Union[int, float, Sequence[int], Sequence[float], None]
+_FillTypeJIT = Optional[list[float]]
+
+
+def is_pure_tensor(inpt: Any) -> bool:
+    return not isinstance(inpt, tv_tensors.TVTensor) and isinstance(inpt, torch.Tensor)
+
+
+# {functional: {input_type: type_specific_kernel}}
+_KERNEL_REGISTRY: dict[Callable, dict[type, Callable]] = {}
+
+
+def _kernel_tv_tensor_wrapper(kernel):
+    @functools.wraps(kernel)
+    def wrapper(inpt, *args, **kwargs):
+        # If you're wondering whether we could / should get rid of this wrapper,
+        # the answer is no: we want to pass pure Tensors to avoid the overhead
+        # of the __torch_function__ machinery. Note that this is always valid,
+        # regardless of whether we override __torch_function__ in our base class
+        # or not.
+        # Also, even if we didn't call `as_subclass` here, we would still need
+        # this wrapper to call wrap(), because the TVTensor type would be
+        # lost after the first operation due to our own __torch_function__
+        # logic.
+        output = kernel(inpt.as_subclass(torch.Tensor), *args, **kwargs)  # the kernel sees a plain torch.Tensor
+        return tv_tensors.wrap(output, like=inpt)  # restore the type (and metadata) of the original input
+
+    return wrapper
+
+
+def _register_kernel_internal(functional, input_type, *, tv_tensor_wrapper=True):
+    """Build a decorator that records a kernel for ``(functional, input_type)`` in ``_KERNEL_REGISTRY``."""
+    registry = _KERNEL_REGISTRY.setdefault(functional, {})
+    if input_type in registry:
+        raise ValueError(f"Functional {functional} already has a kernel registered for type {input_type}.")
+
+    def decorator(kernel):
+        needs_wrapper = issubclass(input_type, tv_tensors.TVTensor) and tv_tensor_wrapper
+        # The registry stores the (possibly wrapped) kernel, while the decorated
+        # name keeps pointing at the original, unwrapped function.
+        registry[input_type] = _kernel_tv_tensor_wrapper(kernel) if needs_wrapper else kernel
+        return kernel
+
+    return decorator
+
+
+def _name_to_functional(name):
+    """Look up ``name`` in torchvision.transforms.v2.functional; raise ValueError if it is missing."""
+    import torchvision.transforms.v2.functional as v2_functional  # noqa
+
+    try:
+        return getattr(v2_functional, name)
+    except AttributeError:
+        message = f"Could not find functional with name '{name}' in torchvision.transforms.v2.functional."
+        raise ValueError(message) from None
+
+
+_BUILTIN_DATAPOINT_TYPES = {  # TVTensor subclasses found in the tv_tensors namespace (incl. TVTensor itself)
+    obj for obj in tv_tensors.__dict__.values() if isinstance(obj, type) and issubclass(obj, tv_tensors.TVTensor)
+}
+
+
+def register_kernel(functional, tv_tensor_cls):
+    """Decorate a kernel to register it for a functional and a (custom) tv_tensor type.
+
+    See :ref:`sphx_glr_auto_examples_transforms_plot_custom_tv_tensors.py` for usage
+    details.
+    """
+    namespace = "torchvision.transforms.v2.functional"
+    if isinstance(functional, str):
+        functional = _name_to_functional(name=functional)
+    elif not callable(functional) or not getattr(functional, "__module__", "").startswith(namespace):
+        raise ValueError(
+            f"Kernels can only be registered on functionals from the torchvision.transforms.v2.functional namespace, "
+            f"but got {functional}."
+        )
+
+    # Custom kernels are only accepted for user-defined TVTensor subclasses.
+    is_tv_tensor_cls = isinstance(tv_tensor_cls, type) and issubclass(tv_tensor_cls, tv_tensors.TVTensor)
+    if not is_tv_tensor_cls:
+        raise ValueError(
+            f"Kernels can only be registered for subclasses of torchvision.tv_tensors.TVTensor, "
+            f"but got {tv_tensor_cls}."
+        )
+    if tv_tensor_cls in _BUILTIN_DATAPOINT_TYPES:
+        raise ValueError(f"Kernels cannot be registered for the builtin tv_tensor classes, but got {tv_tensor_cls}")
+
+    # tv_tensor_wrapper=False: the kernel is registered as-is, without the unwrap/re-wrap shim.
+    return _register_kernel_internal(functional, tv_tensor_cls, tv_tensor_wrapper=False)
+
+
+def _get_kernel(functional, input_type, *, allow_passthrough=False):
+    """Return the kernel registered on ``functional`` for ``input_type`` or its nearest registered base class."""
+    registry = _KERNEL_REGISTRY.get(functional)
+    if not registry:
+        raise ValueError(f"No kernel registered for functional {functional.__name__}.")
+
+    for klass in input_type.__mro__:
+        if klass in registry:
+            return registry[klass]
+        if klass is tv_tensors.TVTensor:
+            # We don't want user-defined tv_tensors to dispatch to the pure Tensor kernels, so we explicitly stop
+            # the MRO traversal before hitting torch.Tensor. We can even stop at tv_tensors.TVTensor, since we
+            # don't allow kernels to be registered for tv_tensors.TVTensor anyway.
+            break
+
+    if not allow_passthrough:
+        raise TypeError(
+            f"Functional F.{functional.__name__} supports inputs of type {registry.keys()}, "
+            f"but got {input_type} instead."
+        )
+    return lambda inpt, *args, **kwargs: inpt
+
+
+# This basically replicates _register_kernel_internal, but with a specialized wrapper for five_crop / ten_crop.
+# We could get rid of this by letting _register_kernel_internal take an arbitrary wrapper rather than tv_tensor_wrapper: bool
+def _register_five_ten_crop_kernel_internal(functional, input_type):
+    registry = _KERNEL_REGISTRY.setdefault(functional, {})
+    if input_type in registry:
+        raise TypeError(f"Functional '{functional}' already has a kernel registered for type '{input_type}'.")
+
+    def wrap(kernel):
+        @functools.wraps(kernel)
+        def wrapper(inpt, *args, **kwargs):
+            output = kernel(inpt, *args, **kwargs)  # NOTE: unlike _kernel_tv_tensor_wrapper, inpt is not unwrapped
+            container_type = type(output)  # rebuild the same container type the kernel returned
+            return container_type(tv_tensors.wrap(o, like=inpt) for o in output)  # re-wrap each element
+
+        return wrapper
+
+    def decorator(kernel):
+        registry[input_type] = wrap(kernel) if issubclass(input_type, tv_tensors.TVTensor) else kernel
+        return kernel
+
+    return decorator
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..744e52411355ed4d20de1fb653da3123854fa16d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__init__.py
@@ -0,0 +1,39 @@
+import torch
+
+from ._bounding_boxes import BoundingBoxes, BoundingBoxFormat, is_rotated_bounding_format
+from ._image import Image
+from ._keypoints import KeyPoints
+from ._mask import Mask
+from ._torch_function_helpers import set_return_type
+from ._tv_tensor import TVTensor
+from ._video import Video
+
+
+# TODO: Fix this. We skip this method as it leads to
+# RecursionError: maximum recursion depth exceeded while calling a Python object
+# Until `disable` is removed, there will be graph breaks after all calls to functional transforms
+@torch.compiler.disable
+def wrap(wrappee, *, like, **kwargs):
+    """Convert a :class:`torch.Tensor` (``wrappee``) into the same :class:`~torchvision.tv_tensors.TVTensor` subclass as ``like``.
+
+    If ``like`` is a :class:`~torchvision.tv_tensors.BoundingBoxes`, its ``format``, ``canvas_size`` and ``clamping_mode``
+    are assigned to ``wrappee``, unless passed as ``kwargs``. For :class:`~torchvision.tv_tensors.KeyPoints`, the same holds for ``canvas_size``.
+
+    Args:
+        wrappee (Tensor): The tensor to convert.
+        like (:class:`~torchvision.tv_tensors.TVTensor`): The reference.
+            ``wrappee`` will be converted into the same subclass as ``like``.
+        kwargs: Can contain "format", "canvas_size" and "clamping_mode" if ``like`` is a :class:`~torchvision.tv_tensors.BoundingBoxes`,
+            or "canvas_size" if ``like`` is a :class:`~torchvision.tv_tensors.KeyPoints`. Ignored otherwise.
+    """
+    if isinstance(like, BoundingBoxes):
+        return BoundingBoxes._wrap(
+            wrappee,
+            format=kwargs.get("format", like.format),
+            canvas_size=kwargs.get("canvas_size", like.canvas_size),
+            clamping_mode=kwargs.get("clamping_mode", like.clamping_mode),
+        )
+    elif isinstance(like, KeyPoints):
+        return KeyPoints._wrap(wrappee, canvas_size=kwargs.get("canvas_size", like.canvas_size))
+    else:
+        return wrappee.as_subclass(type(like))
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6ab0a9b0319aaa0ce89af0db91c43ca42cfd57c4
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_bounding_boxes.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_bounding_boxes.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6db6b0fccf1d5ac4c0c4fb0c81de195d0a2e6737
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_bounding_boxes.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_dataset_wrapper.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_dataset_wrapper.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2f9ab6eee591b7436a88fcd3353565eb451bfd99
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_dataset_wrapper.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_image.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_image.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a5427eadb9688cbbcbe2b445eedc28c3c5d7559d
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_image.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_keypoints.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_keypoints.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cacc93498abc894e25c42fb2d2a670b34eb587a8
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_keypoints.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_mask.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_mask.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dc7f2bd6eff5e19056564b038aa17d3d3038159f
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_mask.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_torch_function_helpers.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_torch_function_helpers.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3295fcc9105dc5567c66a9ce42fc6e697a7c70e7
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_torch_function_helpers.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_tv_tensor.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_tv_tensor.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..17dfc45ad81ee0eb5930eae4b7397b8e27626d9c
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_tv_tensor.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_video.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_video.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..485758f11ddfb1bf73c8fa57b5ac0d63dea7b453
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/__pycache__/_video.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_bounding_boxes.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_bounding_boxes.py
new file mode 100644
index 0000000000000000000000000000000000000000..7aa3e50458d7677b0c986dcff2361f2f0ff72448
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_bounding_boxes.py
@@ -0,0 +1,170 @@
+from __future__ import annotations
+
+from collections.abc import Mapping, Sequence
+
+from enum import Enum
+from typing import Any, Optional
+
+import torch
+from torch.utils._pytree import tree_flatten
+
+from ._tv_tensor import TVTensor
+
+
+class BoundingBoxFormat(Enum):
+    """Coordinate format of a bounding box.
+
+    Available formats are:
+
+    * ``XYXY``: bounding box represented via corners; x1, y1 being top left;
+      x2, y2 being bottom right.
+    * ``XYWH``: bounding box represented via corner, width and height; x1, y1
+      being top left; w, h being width and height.
+    * ``CXCYWH``: bounding box represented via center, width and height; cx,
+      cy being center of box; w, h being width and height.
+    * ``XYWHR``: rotated boxes represented via corner, width and height; x1, y1
+      being top left; w, h being width and height. r is rotation angle in
+      degrees.
+    * ``CXCYWHR``: rotated boxes represented via center, width and height; cx,
+      cy being center of box; w, h being width and height. r is rotation angle
+      in degrees.
+    * ``XYXYXYXY``: rotated boxes represented via corners; x1, y1 being top
+      left; x2, y2 being top right; x3, y3 being bottom right; x4, y4 being
+      bottom left.
+    """
+
+    XYXY = "XYXY"
+    XYWH = "XYWH"
+    CXCYWH = "CXCYWH"
+    XYWHR = "XYWHR"
+    CXCYWHR = "CXCYWHR"
+    XYXYXYXY = "XYXYXYXY"
+
+
+# TODO: Once torchscript supports Enums with staticmethod
+# this can be put into BoundingBoxFormat as staticmethod
+def is_rotated_bounding_format(format: BoundingBoxFormat | str) -> bool:
+    """Tell whether ``format`` is a rotated format; strings are matched case-insensitively (as _wrap does)."""
+    if isinstance(format, BoundingBoxFormat):
+        return (
+            format == BoundingBoxFormat.XYWHR
+            or format == BoundingBoxFormat.CXCYWHR
+            or format == BoundingBoxFormat.XYXYXYXY
+        )
+    elif isinstance(format, str):
+        return format.upper() in ("XYWHR", "CXCYWHR", "XYXYXYXY")
+    raise ValueError(f"format should be str or BoundingBoxFormat, got {type(format)}")
+
+
+# This should ideally be a Literal, but torchscript fails.
+CLAMPING_MODE_TYPE = Optional[str]
+
+
+class BoundingBoxes(TVTensor):
+    """:class:`torch.Tensor` subclass for bounding boxes with shape ``[N, K]``.
+
+    .. note::
+        Support for rotated bounding boxes was released in TorchVision 0.23 and
+        is currently a BETA feature. We don't expect the API to change, but
+        there may be some rare edge-cases. If you find any issues, please report
+        them on our bug tracker:
+        https://github.com/pytorch/vision/issues?q=is:open+is:issue
+
+    Where ``N`` is the number of bounding boxes
+    and ``K`` is 4 for unrotated boxes, and 5 or 8 for rotated boxes.
+
+    .. note::
+        There should be only one :class:`~torchvision.tv_tensors.BoundingBoxes`
+        instance per sample e.g. ``{"img": img, "bbox": BoundingBoxes(...)}``,
+        although one :class:`~torchvision.tv_tensors.BoundingBoxes` object can
+        contain multiple bounding boxes.
+
+    Args:
+        data: Any data that can be turned into a tensor with :func:`torch.as_tensor`.
+        format (BoundingBoxFormat, str): Format of the bounding box.
+        canvas_size (two-tuple of ints): Height and width of the corresponding image or video.
+        clamping_mode (str, optional): The clamping mode to use when applying transforms that may result in bounding
+            boxes partially outside of the image. Possible values are: "soft" (default), "hard", or ``None``. Read more in :ref:`clamping_mode_tuto`.
+        dtype (torch.dtype, optional): Desired data type of the bounding box. If omitted, will be inferred from
+            ``data``.
+        device (torch.device, optional): Desired device of the bounding box. If omitted and ``data`` is a
+            :class:`torch.Tensor`, the device is taken from it. Otherwise, the bounding box is constructed on the CPU.
+        requires_grad (bool, optional): Whether autograd should record operations on the bounding box. If omitted and
+            ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``.
+    """
+
+    format: BoundingBoxFormat
+    canvas_size: tuple[int, int]
+    clamping_mode: CLAMPING_MODE_TYPE
+
+    @classmethod
+    def _wrap(cls, tensor: torch.Tensor, *, format: BoundingBoxFormat | str, canvas_size: tuple[int, int], clamping_mode: CLAMPING_MODE_TYPE = "soft", check_dims: bool = True) -> BoundingBoxes:  # type: ignore[override]
+        if check_dims:
+            if tensor.ndim == 1:
+                tensor = tensor.unsqueeze(0)  # a 1D input is treated as a single box: shape [K] -> [1, K]
+            elif tensor.ndim != 2:
+                raise ValueError(f"Expected a 1D or 2D tensor, got {tensor.ndim}D")
+        if clamping_mode is not None and clamping_mode not in ("hard", "soft"):
+            raise ValueError(f"clamping_mode must be None, hard or soft, got {clamping_mode}.")
+
+        if isinstance(format, str):
+            format = BoundingBoxFormat[format.upper()]  # string names are looked up case-insensitively
+
+        bounding_boxes = tensor.as_subclass(cls)
+        bounding_boxes.format = format
+        bounding_boxes.canvas_size = canvas_size
+        bounding_boxes.clamping_mode = clamping_mode
+        return bounding_boxes
+
+    def __new__(
+        cls,
+        data: Any,
+        *,
+        format: BoundingBoxFormat | str,
+        canvas_size: tuple[int, int],
+        clamping_mode: CLAMPING_MODE_TYPE = "soft",
+        dtype: torch.dtype | None = None,
+        device: torch.device | str | int | None = None,
+        requires_grad: bool | None = None,
+    ) -> BoundingBoxes:
+        tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad)
+        if not torch.is_floating_point(tensor) and is_rotated_bounding_format(format):
+            raise ValueError(f"Rotated bounding boxes should be floating point tensors, got {tensor.dtype}.")
+        return cls._wrap(tensor, format=format, canvas_size=canvas_size, clamping_mode=clamping_mode)
+
+    @classmethod
+    def _wrap_output(
+        cls,
+        output: torch.Tensor,
+        args: Sequence[Any] = (),
+        kwargs: Mapping[str, Any] | None = None,
+    ) -> BoundingBoxes:
+        # If there are BoundingBoxes instances in the output, their metadata got lost when we called
+        # super().__torch_function__. We need to restore the metadata somehow, so we choose to take
+        # the metadata from the first bbox in the parameters.
+        # This should be what we want in most cases. When it's not, it's probably a misuse anyway, e.g.
+        # something like some_xyxy_bbox + some_xywh_bbox; we don't guard against those cases.
+        flat_params, _ = tree_flatten(args + (tuple(kwargs.values()) if kwargs else ()))  # type: ignore[operator]
+        first_bbox_from_args = next(x for x in flat_params if isinstance(x, BoundingBoxes))  # StopIteration if none
+        format, canvas_size, clamping_mode = (
+            first_bbox_from_args.format,
+            first_bbox_from_args.canvas_size,
+            first_bbox_from_args.clamping_mode,
+        )
+
+        if isinstance(output, torch.Tensor) and not isinstance(output, BoundingBoxes):
+            output = BoundingBoxes._wrap(
+                output, format=format, canvas_size=canvas_size, clamping_mode=clamping_mode, check_dims=False
+            )
+        elif isinstance(output, (tuple, list)):
+            # This branch exists for chunk() and unbind()
+            output = type(output)(
+                BoundingBoxes._wrap(
+                    part, format=format, canvas_size=canvas_size, clamping_mode=clamping_mode, check_dims=False
+                )
+                for part in output
+            )
+        return output
+
+    def __repr__(self, *, tensor_contents: Any = None) -> str:  # type: ignore[override]
+        return self._make_repr(format=self.format, canvas_size=self.canvas_size, clamping_mode=self.clamping_mode)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_dataset_wrapper.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_dataset_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..23683221f6005a9ce6a55e785e59409a649d7928
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_dataset_wrapper.py
@@ -0,0 +1,666 @@
+# type: ignore
+
+from __future__ import annotations
+
+import collections.abc
+
+import contextlib
+from collections import defaultdict
+from copy import copy
+
+import torch
+
+from torchvision import datasets, tv_tensors
+from torchvision.transforms.v2 import functional as F
+
+__all__ = ["wrap_dataset_for_transforms_v2"]
+
+
+def wrap_dataset_for_transforms_v2(dataset, target_keys=None):
+    """Wrap a ``torchvision.dataset`` for usage with :mod:`torchvision.transforms.v2`.
+
+    Example:
+        >>> dataset = torchvision.datasets.CocoDetection(...)
+        >>> dataset = wrap_dataset_for_transforms_v2(dataset)
+
+    .. note::
+
+       For now, only the most popular datasets are supported. Furthermore, the wrapper only supports dataset
+       configurations that are fully supported by ``torchvision.transforms.v2``. If you encounter an error prompting you
+       to raise an issue to ``torchvision`` for a dataset or configuration that you need, please do so.
+
+    The dataset samples are wrapped according to the description below.
+
+    Special cases:
+
+        * :class:`~torchvision.datasets.CocoDetection`: Instead of returning the target as list of dicts, the wrapper
+          returns a dict of lists. In addition, the key-value-pairs ``"boxes"`` (in ``XYXY`` coordinate format),
+          ``"masks"`` and ``"labels"`` are added and wrap the data in the corresponding ``torchvision.tv_tensors``.
+          The original keys are preserved. If ``target_keys`` is omitted, returns only the values for the
+          ``"image_id"``, ``"boxes"``, and ``"labels"``.
+        * :class:`~torchvision.datasets.VOCDetection`: The key-value-pairs ``"boxes"`` and ``"labels"`` are added to
+          the target and wrap the data in the corresponding ``torchvision.tv_tensors``. The original keys are
+          preserved. If ``target_keys`` is omitted, returns only the values for the ``"boxes"`` and ``"labels"``.
+        * :class:`~torchvision.datasets.CelebA`: The target for ``target_type="bbox"`` is converted to the ``XYXY``
+          coordinate format and wrapped into a :class:`~torchvision.tv_tensors.BoundingBoxes` tv_tensor.
+        * :class:`~torchvision.datasets.Kitti`: Instead of returning the target as list of dicts, the wrapper returns a
+          dict of lists. In addition, the key-value-pairs ``"boxes"`` and ``"labels"`` are added and wrap the data
+          in the corresponding ``torchvision.tv_tensors``. The original keys are preserved. If ``target_keys`` is
+          omitted, returns only the values for the ``"boxes"`` and ``"labels"``.
+        * :class:`~torchvision.datasets.OxfordIIITPet`: The target for ``target_type="segmentation"`` is wrapped into a
+          :class:`~torchvision.tv_tensors.Mask` tv_tensor.
+        * :class:`~torchvision.datasets.Cityscapes`: The target for ``target_type="semantic"`` is wrapped into a
+          :class:`~torchvision.tv_tensors.Mask` tv_tensor. The target for ``target_type="instance"`` is *replaced* by
+          a dictionary with the key-value-pairs ``"masks"`` (as :class:`~torchvision.tv_tensors.Mask` tv_tensor) and
+          ``"labels"``.
+        * :class:`~torchvision.datasets.WIDERFace`: The value for key ``"bbox"`` in the target is converted to ``XYXY``
+          coordinate format and wrapped into a :class:`~torchvision.tv_tensors.BoundingBoxes` tv_tensor.
+
+    Image classification datasets
+
+        This wrapper is a no-op for image classification datasets, since they were already fully supported by
+        :mod:`torchvision.transforms` and thus no change is needed for :mod:`torchvision.transforms.v2`.
+
+    Segmentation datasets
+
+        Segmentation datasets, e.g. :class:`~torchvision.datasets.VOCSegmentation`, return a two-tuple of
+        :class:`PIL.Image.Image`'s. This wrapper leaves the image as is (first item), while wrapping the
+        segmentation mask into a :class:`~torchvision.tv_tensors.Mask` (second item).
+
+    Video classification datasets
+
+        Video classification datasets, e.g. :class:`~torchvision.datasets.Kinetics`, return a three-tuple containing a
+        :class:`torch.Tensor` for the video and audio and a :class:`int` as label. This wrapper wraps the video into a
+        :class:`~torchvision.tv_tensors.Video` while leaving the other items as is.
+
+        .. note::
+
+            Only datasets constructed with ``output_format="TCHW"`` are supported, since the alternative
+            ``output_format="THWC"`` is not supported by :mod:`torchvision.transforms.v2`.
+
+    Args:
+        dataset: the dataset instance to wrap for compatibility with transforms v2.
+        target_keys: Target keys to return in case the target is a dictionary. If ``None`` (default), selected keys are
+            specific to the dataset. If ``"all"``, returns the full target. Can also be a collection of strings for
+            fine grained access. Currently only supported for :class:`~torchvision.datasets.CocoDetection`,
+            :class:`~torchvision.datasets.VOCDetection`, :class:`~torchvision.datasets.Kitti`, and
+            :class:`~torchvision.datasets.WIDERFace`. See above for details.
+    """
+    if not (
+        target_keys is None
+        or target_keys == "all"
+        or (isinstance(target_keys, collections.abc.Collection) and all(isinstance(key, str) for key in target_keys))
+    ):
+        raise ValueError(
+            f"`target_keys` can be None, 'all', or a collection of strings denoting the keys to be returned, "
+            f"but got {target_keys}"
+        )
+
+    # Imagine we have isinstance(dataset, datasets.ImageNet). This will create a new class with the name
+    # "WrappedImageNet" at runtime that doubly inherits from VisionDatasetTVTensorWrapper (see below) as well as the
+    # original ImageNet class. This allows the user to do regular isinstance(wrapped_dataset, datasets.ImageNet) checks,
+    # while we can still inject everything that we need.
+    wrapped_dataset_cls = type(f"Wrapped{type(dataset).__name__}", (VisionDatasetTVTensorWrapper, type(dataset)), {})
+    # Since VisionDatasetTVTensorWrapper comes before ImageNet in the MRO, calling the class hits
+    # VisionDatasetTVTensorWrapper.__init__ first. Since we are never doing super().__init__(...), the constructor of
+    # ImageNet is never hit. That is by design, since we don't want to create the dataset instance again, but rather
+    # have the existing instance as attribute on the new object.
+    return wrapped_dataset_cls(dataset, target_keys)
+
+
+class WrapperFactories(dict):
+    def register(self, dataset_cls):
+        def decorator(wrapper_factory):
+            self[dataset_cls] = wrapper_factory
+            return wrapper_factory
+
+        return decorator
+
+
+# We need this two-stage design, i.e. a wrapper factory producing the actual wrapper, because some wrappers depend on
+# the dataset instance rather than just the class: they require the user-defined instance attributes. Thus, we can
+# provide a mapping from the dataset class to the factory here, but can only instantiate the wrapper at runtime when
+# we have access to the dataset instance.
+WRAPPER_FACTORIES = WrapperFactories()
+
+
+class VisionDatasetTVTensorWrapper:
+    def __init__(self, dataset, target_keys):
+        dataset_cls = type(dataset)
+
+        if not isinstance(dataset, datasets.VisionDataset):
+            raise TypeError(
+                f"This wrapper is meant for subclasses of `torchvision.datasets.VisionDataset`, "
+                f"but got a '{dataset_cls.__name__}' instead.\n"
+                f"For an example of how to perform the wrapping for custom datasets, see\n\n"
+                "https://pytorch.org/vision/main/auto_examples/plot_tv_tensors.html#do-i-have-to-wrap-the-output-of-the-datasets-myself"
+            )
+
+        for cls in dataset_cls.mro():
+            if cls in WRAPPER_FACTORIES:
+                wrapper_factory = WRAPPER_FACTORIES[cls]
+                if target_keys is not None and cls not in {
+                    datasets.CocoDetection,
+                    datasets.VOCDetection,
+                    datasets.Kitti,
+                    datasets.WIDERFace,
+                }:
+                    raise ValueError(
+                        f"`target_keys` is currently only supported for `CocoDetection`, `VOCDetection`, `Kitti`, "
+                        f"and `WIDERFace`, but got {cls.__name__}."
+                    )
+                break
+            elif cls is datasets.VisionDataset:
+                # TODO: If we have documentation on how to do that, put a link in the error message.
+                msg = f"No wrapper exists for dataset class {dataset_cls.__name__}. Please wrap the output yourself."
+                if dataset_cls in datasets.__dict__.values():
+                    msg = (
+                        f"{msg} If an automated wrapper for this dataset would be useful for you, "
+                        f"please open an issue at https://github.com/pytorch/vision/issues."
+                    )
+                raise TypeError(msg)
+
+        self._dataset = dataset
+        self._target_keys = target_keys
+        self._wrapper = wrapper_factory(dataset, target_keys)
+
+        # We need to disable the transforms on the dataset here to be able to inject the wrapping before we apply them.
+        # Although internally, `datasets.VisionDataset` merges `transform` and `target_transform` into the joint
+        # `transforms`
+        # https://github.com/pytorch/vision/blob/135a0f9ea9841b6324b4fe8974e2543cbb95709a/torchvision/datasets/vision.py#L52-L54
+        # some (if not most) datasets still use `transform` and `target_transform` individually. Thus, we need to
+        # disable all three here to be able to extract the untransformed sample to wrap.
+        self.transform, dataset.transform = dataset.transform, None
+        self.target_transform, dataset.target_transform = dataset.target_transform, None
+        self.transforms, dataset.transforms = dataset.transforms, None
+
+    def __getattr__(self, item):
+        with contextlib.suppress(AttributeError):
+            return object.__getattribute__(self, item)
+
+        return getattr(self._dataset, item)
+
+    def __getitem__(self, idx):
+        # This gets us the raw sample since we disabled the transforms for the underlying dataset in the constructor
+        # of this class
+        sample = self._dataset[idx]
+
+        sample = self._wrapper(idx, sample)
+
+        # Regardless of whether the user has supplied the transforms individually (`transform` and `target_transform`)
+        # or joint (`transforms`), we can access the full functionality through `transforms`
+        if self.transforms is not None:
+            sample = self.transforms(*sample)
+
+        return sample
+
+    def __len__(self):
+        return len(self._dataset)
+
+    # TODO: maybe we should use __getstate__ and __setstate__ instead of __reduce__, as recommended in the docs.
+    def __reduce__(self):
+        # __reduce__ gets called when we try to pickle the dataset.
+        # In a DataLoader with spawn context, this gets called `num_workers` times from the main process.
+
+        # We have to reset the [target_]transform[s] attributes of the dataset
+        # to their original values, because we previously set them to None in __init__().
+        dataset = copy(self._dataset)
+        dataset.transform = self.transform
+        dataset.transforms = self.transforms
+        dataset.target_transform = self.target_transform
+
+        return wrap_dataset_for_transforms_v2, (dataset, self._target_keys)
+
+
+def raise_not_supported(description):
+    raise RuntimeError(
+        f"{description} is currently not supported by this wrapper. "
+        f"If this would be helpful for you, please open an issue at https://github.com/pytorch/vision/issues."
+    )
+
+
+def identity(item):
+    return item
+
+
+def identity_wrapper_factory(dataset, target_keys):
+    def wrapper(idx, sample):
+        return sample
+
+    return wrapper
+
+
+def pil_image_to_mask(pil_image):
+    return tv_tensors.Mask(pil_image)
+
+
+def parse_target_keys(target_keys, *, available, default):
+    """Resolve ``target_keys`` (``None``, ``"all"``, one key, or a collection of keys) against ``available``."""
+    if target_keys is None:
+        target_keys = default
+    if target_keys == "all":
+        return available
+    # A bare string is a single key; set("boxes") would wrongly split it into its characters.
+    target_keys = {target_keys} if isinstance(target_keys, str) else set(target_keys)
+    extra = target_keys - available
+    if extra:
+        raise ValueError(f"Target keys {sorted(extra)} are not available")
+    return target_keys
+
+
+def list_of_dicts_to_dict_of_lists(list_of_dicts):
+    dict_of_lists = defaultdict(list)
+    for dct in list_of_dicts:
+        for key, value in dct.items():
+            dict_of_lists[key].append(value)
+    return dict(dict_of_lists)
+
+
+def wrap_target_by_type(target, *, target_types, type_wrappers):
+    if not isinstance(target, (tuple, list)):
+        target = [target]
+
+    wrapped_target = tuple(
+        type_wrappers.get(target_type, identity)(item) for target_type, item in zip(target_types, target)
+    )
+
+    if len(wrapped_target) == 1:
+        wrapped_target = wrapped_target[0]
+
+    return wrapped_target
+
+
+def classification_wrapper_factory(dataset, target_keys):
+    return identity_wrapper_factory(dataset, target_keys)
+
+
+for dataset_cls in [
+    datasets.Caltech256,
+    datasets.CIFAR10,
+    datasets.CIFAR100,
+    datasets.ImageNet,
+    datasets.MNIST,
+    datasets.FashionMNIST,
+    datasets.GTSRB,
+    datasets.DatasetFolder,
+    datasets.ImageFolder,
+    datasets.Imagenette,
+]:
+    WRAPPER_FACTORIES.register(dataset_cls)(classification_wrapper_factory)
+
+
+def segmentation_wrapper_factory(dataset, target_keys):
+    def wrapper(idx, sample):
+        image, mask = sample
+        return image, pil_image_to_mask(mask)
+
+    return wrapper
+
+
+for dataset_cls in [
+    datasets.VOCSegmentation,
+]:
+    WRAPPER_FACTORIES.register(dataset_cls)(segmentation_wrapper_factory)
+
+
+def video_classification_wrapper_factory(dataset, target_keys):
+    if dataset.video_clips.output_format == "THWC":
+        raise RuntimeError(
+            f"{type(dataset).__name__} with `output_format='THWC'` is not supported by this wrapper, "
+            f"since it is not compatible with the transformations. Please use `output_format='TCHW'` instead."
+        )
+
+    def wrapper(idx, sample):
+        video, audio, label = sample
+
+        video = tv_tensors.Video(video)
+
+        return video, audio, label
+
+    return wrapper
+
+
+for dataset_cls in [
+    datasets.HMDB51,
+    datasets.Kinetics,
+    datasets.UCF101,
+]:
+    WRAPPER_FACTORIES.register(dataset_cls)(video_classification_wrapper_factory)
+
+
+@WRAPPER_FACTORIES.register(datasets.Caltech101)
+def caltech101_wrapper_factory(dataset, target_keys):
+    if "annotation" in dataset.target_type:
+        raise_not_supported("Caltech101 dataset with `target_type=['annotation', ...]`")
+
+    return classification_wrapper_factory(dataset, target_keys)
+
+
+@WRAPPER_FACTORIES.register(datasets.CocoDetection)
+def coco_dectection_wrapper_factory(dataset, target_keys):
+    target_keys = parse_target_keys(
+        target_keys,
+        available={
+            # native
+            "segmentation",
+            "area",
+            "iscrowd",
+            "image_id",
+            "bbox",
+            "category_id",
+            # added by the wrapper
+            "boxes",
+            "masks",
+            "labels",
+        },
+        default={"image_id", "boxes", "labels"},
+    )
+
+    def segmentation_to_mask(segmentation, *, canvas_size):
+        from pycocotools import mask
+
+        if isinstance(segmentation, dict):
+            # if counts is a string, it is already an encoded RLE mask
+            if not isinstance(segmentation["counts"], str):
+                segmentation = mask.frPyObjects(segmentation, *canvas_size)
+        elif isinstance(segmentation, list):
+            segmentation = mask.merge(mask.frPyObjects(segmentation, *canvas_size))
+        else:
+            raise ValueError(f"COCO segmentation expected to be a dict or a list, got {type(segmentation)}")
+        return torch.from_numpy(mask.decode(segmentation))
+
+    def wrapper(idx, sample):
+        image_id = dataset.ids[idx]
+
+        image, target = sample
+
+        if not target:
+            return image, dict(image_id=image_id)
+
+        canvas_size = tuple(F.get_size(image))
+
+        batched_target = list_of_dicts_to_dict_of_lists(target)
+        target = {}
+
+        if "image_id" in target_keys:
+            target["image_id"] = image_id
+
+        if "boxes" in target_keys:
+            target["boxes"] = F.convert_bounding_box_format(
+                tv_tensors.BoundingBoxes(
+                    batched_target["bbox"],
+                    format=tv_tensors.BoundingBoxFormat.XYWH,
+                    canvas_size=canvas_size,
+                ),
+                new_format=tv_tensors.BoundingBoxFormat.XYXY,
+            )
+
+        if "masks" in target_keys:
+            target["masks"] = tv_tensors.Mask(
+                torch.stack(
+                    [
+                        segmentation_to_mask(segmentation, canvas_size=canvas_size)
+                        for segmentation in batched_target["segmentation"]
+                    ]
+                ),
+            )
+
+        if "labels" in target_keys:
+            target["labels"] = torch.tensor(batched_target["category_id"])
+
+        for target_key in target_keys - {"image_id", "boxes", "masks", "labels"}:
+            target[target_key] = batched_target[target_key]
+
+        return image, target
+
+    return wrapper
+
+
+WRAPPER_FACTORIES.register(datasets.CocoCaptions)(identity_wrapper_factory)
+
+
+VOC_DETECTION_CATEGORIES = [
+    "__background__",
+    "aeroplane",
+    "bicycle",
+    "bird",
+    "boat",
+    "bottle",
+    "bus",
+    "car",
+    "cat",
+    "chair",
+    "cow",
+    "diningtable",
+    "dog",
+    "horse",
+    "motorbike",
+    "person",
+    "pottedplant",
+    "sheep",
+    "sofa",
+    "train",
+    "tvmonitor",
+]
+VOC_DETECTION_CATEGORY_TO_IDX = dict(zip(VOC_DETECTION_CATEGORIES, range(len(VOC_DETECTION_CATEGORIES))))
+
+
+@WRAPPER_FACTORIES.register(datasets.VOCDetection)
+def voc_detection_wrapper_factory(dataset, target_keys):
+    target_keys = parse_target_keys(
+        target_keys,
+        available={
+            # native
+            "annotation",
+            # added by the wrapper
+            "boxes",
+            "labels",
+        },
+        default={"boxes", "labels"},
+    )
+
+    def wrapper(idx, sample):
+        image, target = sample
+
+        batched_instances = list_of_dicts_to_dict_of_lists(target["annotation"]["object"])
+
+        if "annotation" not in target_keys:
+            target = {}
+
+        if "boxes" in target_keys:
+            target["boxes"] = tv_tensors.BoundingBoxes(
+                [
+                    [int(bndbox[part]) for part in ("xmin", "ymin", "xmax", "ymax")]
+                    for bndbox in batched_instances["bndbox"]
+                ],
+                format=tv_tensors.BoundingBoxFormat.XYXY,
+                canvas_size=(image.height, image.width),
+            )
+
+        if "labels" in target_keys:
+            target["labels"] = torch.tensor(
+                [VOC_DETECTION_CATEGORY_TO_IDX[category] for category in batched_instances["name"]]
+            )
+
+        return image, target
+
+    return wrapper
+
+
+@WRAPPER_FACTORIES.register(datasets.SBDataset)
+def sbd_wrapper(dataset, target_keys):
+    if dataset.mode == "boundaries":
+        raise_not_supported("SBDataset with mode='boundaries'")
+
+    return segmentation_wrapper_factory(dataset, target_keys)
+
+
+@WRAPPER_FACTORIES.register(datasets.CelebA)
+def celeba_wrapper_factory(dataset, target_keys):
+    if any(target_type in dataset.target_type for target_type in ["attr", "landmarks"]):
+        raise_not_supported("`CelebA` dataset with `target_type=['attr', 'landmarks', ...]`")
+
+    def wrapper(idx, sample):
+        image, target = sample
+
+        target = wrap_target_by_type(
+            target,
+            target_types=dataset.target_type,
+            type_wrappers={
+                "bbox": lambda item: F.convert_bounding_box_format(
+                    tv_tensors.BoundingBoxes(
+                        item,
+                        format=tv_tensors.BoundingBoxFormat.XYWH,
+                        canvas_size=(image.height, image.width),
+                    ),
+                    new_format=tv_tensors.BoundingBoxFormat.XYXY,
+                ),
+            },
+        )
+
+        return image, target
+
+    return wrapper
+
+
+KITTI_CATEGORIES = ["Car", "Van", "Truck", "Pedestrian", "Person_sitting", "Cyclist", "Tram", "Misc", "DontCare"]
+KITTI_CATEGORY_TO_IDX = dict(zip(KITTI_CATEGORIES, range(len(KITTI_CATEGORIES))))
+
+
+@WRAPPER_FACTORIES.register(datasets.Kitti)
+def kitti_wrapper_factory(dataset, target_keys):
+    target_keys = parse_target_keys(
+        target_keys,
+        available={
+            # native
+            "type",
+            "truncated",
+            "occluded",
+            "alpha",
+            "bbox",
+            "dimensions",
+            "location",
+            "rotation_y",
+            # added by the wrapper
+            "boxes",
+            "labels",
+        },
+        default={"boxes", "labels"},
+    )
+
+    def wrapper(idx, sample):
+        image, target = sample
+
+        if target is None:
+            return image, target
+
+        batched_target = list_of_dicts_to_dict_of_lists(target)
+        target = {}
+
+        if "boxes" in target_keys:
+            target["boxes"] = tv_tensors.BoundingBoxes(
+                batched_target["bbox"],
+                format=tv_tensors.BoundingBoxFormat.XYXY,
+                canvas_size=(image.height, image.width),
+            )
+
+        if "labels" in target_keys:
+            target["labels"] = torch.tensor([KITTI_CATEGORY_TO_IDX[category] for category in batched_target["type"]])
+
+        for target_key in target_keys - {"boxes", "labels"}:
+            target[target_key] = batched_target[target_key]
+
+        return image, target
+
+    return wrapper
+
+
+@WRAPPER_FACTORIES.register(datasets.OxfordIIITPet)
+def oxford_iiit_pet_wrapper_factor(dataset, target_keys):
+    def wrapper(idx, sample):
+        image, target = sample
+
+        if target is not None:
+            target = wrap_target_by_type(
+                target,
+                target_types=dataset._target_types,
+                type_wrappers={
+                    "segmentation": pil_image_to_mask,
+                },
+            )
+
+        return image, target
+
+    return wrapper
+
+
+@WRAPPER_FACTORIES.register(datasets.Cityscapes)
+def cityscapes_wrapper_factory(dataset, target_keys):
+    if any(target_type in dataset.target_type for target_type in ["polygon", "color"]):
+        raise_not_supported("`Cityscapes` dataset with `target_type=['polygon', 'color', ...]`")
+
+    def instance_segmentation_wrapper(mask):
+        # See https://github.com/mcordts/cityscapesScripts/blob/8da5dd00c9069058ccc134654116aac52d4f6fa2/cityscapesscripts/preparation/json2instanceImg.py#L7-L21
+        data = pil_image_to_mask(mask)
+        masks = []
+        labels = []
+        for id in data.unique():
+            masks.append(data == id)
+            label = id
+            if label >= 1_000:
+                label //= 1_000
+            labels.append(label)
+        return dict(masks=tv_tensors.Mask(torch.stack(masks)), labels=torch.stack(labels))
+
+    def wrapper(idx, sample):
+        image, target = sample
+
+        target = wrap_target_by_type(
+            target,
+            target_types=dataset.target_type,
+            type_wrappers={
+                "instance": instance_segmentation_wrapper,
+                "semantic": pil_image_to_mask,
+            },
+        )
+
+        return image, target
+
+    return wrapper
+
+
+@WRAPPER_FACTORIES.register(datasets.WIDERFace)
+def widerface_wrapper(dataset, target_keys):
+    target_keys = parse_target_keys(
+        target_keys,
+        available={
+            "bbox",
+            "blur",
+            "expression",
+            "illumination",
+            "occlusion",
+            "pose",
+            "invalid",
+        },
+        default="all",
+    )
+
+    def wrapper(idx, sample):
+        image, target = sample
+
+        if target is None:
+            return image, target
+
+        target = {key: target[key] for key in target_keys}
+
+        if "bbox" in target_keys:
+            target["bbox"] = F.convert_bounding_box_format(
+                tv_tensors.BoundingBoxes(
+                    target["bbox"], format=tv_tensors.BoundingBoxFormat.XYWH, canvas_size=(image.height, image.width)
+                ),
+                new_format=tv_tensors.BoundingBoxFormat.XYXY,
+            )
+
+        return image, target
+
+    return wrapper
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_image.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..19fe468ac8103035ebb9dd87faa4f454f286de92
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_image.py
@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+from typing import Any
+
+import PIL.Image
+import torch
+
+from ._tv_tensor import TVTensor
+
+
+class Image(TVTensor):
+    """:class:`torch.Tensor` subclass for images with shape ``[..., C, H, W]``.
+
+    .. note::
+
+        In the :ref:`transforms <transforms>`, ``Image`` instances are largely
+        interchangeable with pure :class:`torch.Tensor`. See
+        :ref:`this note <passthrough_heuristic>` for more details.
+
+    Args:
+        data (tensor-like, PIL.Image.Image): Any data that can be turned into a tensor with :func:`torch.as_tensor` as
+            well as PIL images.
+        dtype (torch.dtype, optional): Desired data type. If omitted, will be inferred from
+            ``data``.
+        device (torch.device, optional): Desired device. If omitted and ``data`` is a
+            :class:`torch.Tensor`, the device is taken from it. Otherwise, the image is constructed on the CPU.
+        requires_grad (bool, optional): Whether autograd should record operations. If omitted and
+            ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``.
+    """
+
+    def __new__(
+        cls,
+        data: Any,
+        *,
+        dtype: torch.dtype | None = None,
+        device: torch.device | str | int | None = None,
+        requires_grad: bool | None = None,
+    ) -> Image:
+        if isinstance(data, PIL.Image.Image):
+            from torchvision.transforms.v2 import functional as F
+
+            data = F.pil_to_tensor(data)
+
+        tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad)
+        if tensor.ndim < 2:
+            raise ValueError(f"Tensor must be 2D or higher, got {tensor.ndim}D tensor.")
+        elif tensor.ndim == 2:
+            tensor = tensor.unsqueeze(0)
+
+        return tensor.as_subclass(cls)
+
+    def __repr__(self, *, tensor_contents: Any = None) -> str:  # type: ignore[override]
+        return self._make_repr()
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_keypoints.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_keypoints.py
new file mode 100644
index 0000000000000000000000000000000000000000..aede31ad7db74b6aa4358fac8b9a1697c70ef88a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_keypoints.py
@@ -0,0 +1,102 @@
+from __future__ import annotations
+
+from typing import Any, Mapping, Sequence
+
+import torch
+from torch.utils._pytree import tree_flatten
+
+from ._tv_tensor import TVTensor
+
+
+class KeyPoints(TVTensor):
+    """:class:`torch.Tensor` subclass for tensors with shape ``[..., 2]`` that represent points in an image.
+
+    .. note::
+        Support for keypoints was released in TorchVision 0.23 and is
+        currently a BETA feature.
+        We don't expect the API to change, but there may be some rare
+        edge-cases. If you find any issues, please report them on our
+        bug tracker:
+        https://github.com/pytorch/vision/issues?q=is:open+is:issue
+
+    Each point is represented by its X and Y coordinates along the width and height dimensions, respectively.
+
+    KeyPoints may represent any object that can be represented by sequences of 2D points:
+
+    - `Polygonal chains <https://en.wikipedia.org/wiki/Polygonal_chain>`_,
+      including polylines, Bézier curves, etc., which can be of shape
+      ``[N_chains, N_points, 2]``.
+    - Polygons, which can be of shape ``[N_polygons, N_points, 2]``.
+    - Skeletons, which can be of shape ``[N_skeletons, N_bones, 2, 2]`` for
+      pose-estimation models.
+
+    .. note::
+        Like for :class:`torchvision.tv_tensors.BoundingBoxes`, there should
+        only be a single instance of the
+        :class:`torchvision.tv_tensors.KeyPoints` class per sample, e.g.
+        ``{"img": img, "points_of_interest": KeyPoints(...)}``, although one
+        :class:`torchvision.tv_tensors.KeyPoints` object can contain multiple
+        key points.
+
+    Args:
+        data: Any data that can be turned into a tensor with
+            :func:`torch.as_tensor`.
+        canvas_size (two-tuple of ints): Height and width of the corresponding
+            image or video.
+        dtype (torch.dtype, optional): Desired data type of the keypoints. If
+            omitted, will be inferred from ``data``.
+        device (torch.device, optional): Desired device of the keypoints. If
+            omitted and ``data`` is a :class:`torch.Tensor`, the device is taken
+            from it. Otherwise, the keypoints are constructed on the CPU.
+        requires_grad (bool, optional): Whether autograd should record
+            operations on the keypoints. If omitted and ``data`` is a
+            :class:`torch.Tensor`, the value is taken from it. Otherwise,
+            defaults to ``False``.
+    """
+
+    canvas_size: tuple[int, int]
+
+    @classmethod
+    def _wrap(cls, tensor: torch.Tensor, *, canvas_size: tuple[int, int], check_dims: bool = True) -> KeyPoints:  # type: ignore[override]
+        if check_dims:
+            if tensor.ndim == 0 or tensor.shape[-1] != 2:  # validate first so 0-d and 1-D inputs are checked too
+                raise ValueError(f"Expected a tensor of shape (..., 2), not {tensor.shape}")
+            if tensor.ndim == 1:
+                tensor = tensor.unsqueeze(0)  # promote a single (x, y) point to shape [1, 2]
+        points = tensor.as_subclass(cls)
+        points.canvas_size = canvas_size
+        return points
+
+    def __new__(
+        cls,
+        data: Any,
+        *,
+        canvas_size: tuple[int, int],
+        dtype: torch.dtype | None = None,
+        device: torch.device | str | int | None = None,
+        requires_grad: bool | None = None,
+    ) -> KeyPoints:
+        tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad)
+        return cls._wrap(tensor, canvas_size=canvas_size)
+
+    @classmethod
+    def _wrap_output(
+        cls,
+        output: torch.Tensor,
+        args: Sequence[Any] = (),
+        kwargs: Mapping[str, Any] | None = None,
+    ) -> KeyPoints:
+        # Similar to BoundingBoxes._wrap_output(), see comment there.
+        flat_params, _ = tree_flatten(args + (tuple(kwargs.values()) if kwargs else ()))  # type: ignore[operator]
+        first_keypoints_from_args = next(x for x in flat_params if isinstance(x, KeyPoints))
+        canvas_size = first_keypoints_from_args.canvas_size
+
+        if isinstance(output, torch.Tensor) and not isinstance(output, KeyPoints):
+            output = KeyPoints._wrap(output, canvas_size=canvas_size, check_dims=False)
+        elif isinstance(output, (tuple, list)):
+            # This branch exists for chunk() and unbind()
+            output = type(output)(KeyPoints._wrap(part, canvas_size=canvas_size, check_dims=False) for part in output)
+        return output
+
+    def __repr__(self, *, tensor_contents: Any = None) -> str:  # type: ignore[override]
+        return self._make_repr(canvas_size=self.canvas_size)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_mask.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_mask.py
new file mode 100644
index 0000000000000000000000000000000000000000..f43a5c7e2fd477fd129ef84df2117e1cd28b53e8
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_mask.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from typing import Any
+
+import PIL.Image
+import torch
+
+from ._tv_tensor import TVTensor
+
+
+class Mask(TVTensor):
+    """:class:`torch.Tensor` subclass for segmentation and detection masks with shape ``[..., H, W]``.
+
+    Args:
+        data (tensor-like, PIL.Image.Image): Any data that can be turned into a tensor with :func:`torch.as_tensor` as
+            well as PIL images.
+        dtype (torch.dtype, optional): Desired data type. If omitted, will be inferred from
+            ``data``.
+        device (torch.device, optional): Desired device. If omitted and ``data`` is a
+            :class:`torch.Tensor`, the device is taken from it. Otherwise, the mask is constructed on the CPU.
+        requires_grad (bool, optional): Whether autograd should record operations. If omitted and
+            ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``.
+    """
+
+    def __new__(
+        cls,
+        data: Any,
+        *,
+        dtype: torch.dtype | None = None,
+        device: torch.device | str | int | None = None,
+        requires_grad: bool | None = None,
+    ) -> Mask:
+        if isinstance(data, PIL.Image.Image):
+            from torchvision.transforms.v2 import functional as F
+
+            data = F.pil_to_tensor(data)
+
+        tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad)
+        return tensor.as_subclass(cls)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_torch_function_helpers.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_torch_function_helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..66812fb5ca641fc4dabd10aad281ee6614229168
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_torch_function_helpers.py
@@ -0,0 +1,78 @@
+import torch
+
+_TORCHFUNCTION_SUBCLASS = False
+
+
+class _ReturnTypeCM:  # context manager that puts the module-level return-type flag back on exit
+    def __init__(self, to_restore):
+        self.to_restore = to_restore  # value assigned back to _TORCHFUNCTION_SUBCLASS in __exit__
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *args):
+        global _TORCHFUNCTION_SUBCLASS
+        _TORCHFUNCTION_SUBCLASS = self.to_restore  # returns None, so exceptions from the with-body propagate
+
+
+def set_return_type(return_type: str):
+    """Set the return type of torch operations on :class:`~torchvision.tv_tensors.TVTensor`.
+
+    This only affects the behaviour of torch operations. It has no effect on
+    ``torchvision`` transforms or functionals, which will always return as
+    output the same type that was passed as input.
+
+    .. warning::
+
+        We recommend using :class:`~torchvision.transforms.v2.ToPureTensor` at
+        the end of your transform pipelines if you use
+        ``set_return_type("TVTensor")``. This will avoid the
+        ``__torch_function__`` overhead in the models ``forward()``.
+
+    Can be used as a global flag for the entire program:
+
+    .. code:: python
+
+        img = tv_tensors.Image(torch.rand(3, 5, 5))
+        img + 2  # This is a pure Tensor (default behaviour)
+
+        set_return_type("TVTensor")
+        img + 2  # This is an Image
+
+    or as a context manager to restrict the scope:
+
+    .. code:: python
+
+        img = tv_tensors.Image(torch.rand(3, 5, 5))
+        img + 2  # This is a pure Tensor
+        with set_return_type("TVTensor"):
+            img + 2  # This is an Image
+        img + 2  # This is a pure Tensor
+
+    Args:
+        return_type (str): Can be "TVTensor" or "Tensor" (case-insensitive).
+            Default is "Tensor" (i.e. pure :class:`torch.Tensor`).
+    """
+    global _TORCHFUNCTION_SUBCLASS
+    to_restore = _TORCHFUNCTION_SUBCLASS
+
+    try:
+        _TORCHFUNCTION_SUBCLASS = {"tensor": False, "tvtensor": True}[return_type.lower()]
+    except KeyError:
+        raise ValueError(f"return_type must be 'TVTensor' or 'Tensor', got {return_type}") from None
+
+    return _ReturnTypeCM(to_restore)
+
+
+def _must_return_subclass():
+    return _TORCHFUNCTION_SUBCLASS
+
+
+# For those ops we always want to preserve the original subclass instead of returning a pure Tensor
+_FORCE_TORCHFUNCTION_SUBCLASS = {
+    torch.Tensor.clone,
+    torch.Tensor.to,
+    torch.Tensor.detach,
+    torch.Tensor.requires_grad_,
+    torch.Tensor.pin_memory,
+}
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_tv_tensor.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_tv_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f07fc8f2267613e4f3b72eae7084f91b8e85344
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_tv_tensor.py
@@ -0,0 +1,134 @@
+from __future__ import annotations
+
+from collections.abc import Mapping, Sequence
+
+from typing import Any, Callable, TypeVar
+
+import torch
+from torch._C import DisableTorchFunctionSubclass
+from torch.types import _device, _dtype, _size
+
+from torchvision.tv_tensors._torch_function_helpers import _FORCE_TORCHFUNCTION_SUBCLASS, _must_return_subclass
+
+
+D = TypeVar("D", bound="TVTensor")
+
+
+class TVTensor(torch.Tensor):
+    """Base class for all TVTensors.
+
+    You probably don't want to use this class unless you're defining your own
+    custom TVTensors. See
+    :ref:`sphx_glr_auto_examples_transforms_plot_custom_tv_tensors.py` for details.
+    """
+
+    @staticmethod
+    def _to_tensor(
+        data: Any,
+        dtype: torch.dtype | None = None,
+        device: torch.device | str | int | None = None,
+        requires_grad: bool | None = None,
+    ) -> torch.Tensor:
+        if requires_grad is None:
+            requires_grad = data.requires_grad if isinstance(data, torch.Tensor) else False
+        return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
+
+    @classmethod
+    def _wrap_output(
+        cls,
+        output: torch.Tensor,
+        args: Sequence[Any] = (),
+        kwargs: Mapping[str, Any] | None = None,
+    ) -> torch.Tensor:
+        # Same as torch._tensor._convert
+        if isinstance(output, torch.Tensor) and not isinstance(output, cls):
+            output = output.as_subclass(cls)
+
+        if isinstance(output, (tuple, list)):
+            # Also handles things like namedtuples
+            output = type(output)(cls._wrap_output(part, args, kwargs) for part in output)
+        return output
+
+    @classmethod
+    def __torch_function__(
+        cls,
+        func: Callable[..., torch.Tensor],
+        types: tuple[type[torch.Tensor], ...],
+        args: Sequence[Any] = (),
+        kwargs: Mapping[str, Any] | None = None,
+    ) -> torch.Tensor:
+        """For general information about how the __torch_function__ protocol works,
+        see https://pytorch.org/docs/stable/notes/extending.html#extending-torch
+
+        TL;DR: Every time a PyTorch operator is called, it goes through the inputs and looks for the
+        ``__torch_function__`` method. If one is found, it is invoked with the operator as ``func`` as well as the
+        ``args`` and ``kwargs`` of the original call.
+
+        Why do we override this? Because the base implementation in torch.Tensor would preserve the TVTensor type
+        of the output. In our case, we want to return pure tensors instead (with a few exceptions). Refer to the
+        "TVTensors FAQ" gallery example for a rationale of this behaviour (TL;DR: perf + no silver bullet).
+
+        Our implementation below is very similar to the base implementation in ``torch.Tensor`` - go check it out.
+        """
+        if not all(issubclass(cls, t) for t in types):
+            return NotImplemented
+
+        # Like in the base Tensor.__torch_function__ implementation, it's easier to always use
+        # DisableTorchFunctionSubclass and then manually re-wrap the output if necessary
+        with DisableTorchFunctionSubclass():
+            output = func(*args, **kwargs or dict())
+
+        must_return_subclass = _must_return_subclass()
+        if must_return_subclass or (func in _FORCE_TORCHFUNCTION_SUBCLASS and isinstance(args[0], cls)):
+            # If you're wondering why we need the `isinstance(args[0], cls)` check, remove it and see what fails
+            # in test_to_tv_tensor_reference().
+            # The __torch_function__ protocol will invoke the __torch_function__ method on *all* types involved in
+            # the computation by walking the MRO upwards. For example,
+            # `out = a_pure_tensor.to(an_image)` will invoke `Image.__torch_function__` with
+            # `args = (a_pure_tensor, an_image)` first. Without this guard, `out` would
+            # be wrapped into an `Image`.
+            return cls._wrap_output(output, args, kwargs)
+
+        if not must_return_subclass and isinstance(output, cls):
+            # DisableTorchFunctionSubclass is ignored by inplace ops like `.add_(...)`,
+            # so for those, the output is still a TVTensor. Thus, we need to manually unwrap.
+            return output.as_subclass(torch.Tensor)
+
+        return output
+
+    def _make_repr(self, **kwargs: Any) -> str:
+        # This is a poor man's implementation of the proposal in https://github.com/pytorch/pytorch/issues/76532.
+        # If that ever gets implemented, remove this in favor of the solution on the `torch.Tensor` class.
+        extra_repr = ", ".join(f"{key}={value}" for key, value in kwargs.items())
+        return f"{super().__repr__()[:-1]}, {extra_repr})"
+
+    # Add properties for common attributes like shape, dtype, device, ndim etc
+    # this way we return the result without passing into __torch_function__
+    @property
+    def shape(self) -> _size:  # type: ignore[override]
+        with DisableTorchFunctionSubclass():
+            return super().shape
+
+    @property
+    def ndim(self) -> int:  # type: ignore[override]
+        with DisableTorchFunctionSubclass():
+            return super().ndim
+
+    @property
+    def device(self, *args: Any, **kwargs: Any) -> _device:  # type: ignore[override]
+        with DisableTorchFunctionSubclass():
+            return super().device
+
+    @property
+    def dtype(self) -> _dtype:  # type: ignore[override]
+        with DisableTorchFunctionSubclass():
+            return super().dtype
+
+    def __deepcopy__(self: D, memo: dict[int, Any]) -> D:
+        # We need to detach first, since a plain `Tensor.clone` will be part of the computation graph, which does
+        # *not* happen for `deepcopy(Tensor)`. A side-effect from detaching is that the `Tensor.requires_grad`
+        # attribute is cleared, so we need to refill it before we return.
+        # Note: We don't explicitly handle deep-copying of the metadata here. The metadata we currently have
+        # (`BoundingBoxes.format`, `BoundingBoxes.canvas_size`, `KeyPoints.canvas_size`) is immutable and thus
+        # implicitly deep-copied when the subclass' `clone()` output is re-wrapped.
+        return self.detach().clone().requires_grad_(self.requires_grad)  # type: ignore[return-value]
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_video.py b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_video.py
new file mode 100644
index 0000000000000000000000000000000000000000..2dd9dafadde9fc11d58a98cc8c66480e50ed9ec2
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torchvision/tv_tensors/_video.py
@@ -0,0 +1,37 @@
+from __future__ import annotations
+
+from typing import Any
+
+import torch
+
+from ._tv_tensor import TVTensor
+
+
+class Video(TVTensor):
+    """:class:`torch.Tensor` subclass for videos with shape ``[..., T, C, H, W]``.
+
+    Args:
+        data (tensor-like): Any data that can be turned into a tensor with :func:`torch.as_tensor`.
+            It must have at least 4 dimensions, otherwise a ``ValueError`` is raised.
+        dtype (torch.dtype, optional): Desired data type. If omitted, will be inferred from ``data``.
+        device (torch.device, optional): Desired device. If omitted and ``data`` is a
+            :class:`torch.Tensor`, the device is taken from it. Otherwise, the video is constructed on the CPU.
+        requires_grad (bool, optional): Whether autograd should record operations. If omitted and
+            ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``.
+    """
+
+    def __new__(
+        cls,
+        data: Any,
+        *,
+        dtype: torch.dtype | None = None,
+        device: torch.device | str | int | None = None,
+        requires_grad: bool | None = None,
+    ) -> Video:
+        tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad)
+        if tensor.ndim < 4:  # check the converted tensor: `data` may be a nested list without `.ndim`
+            raise ValueError(f"Expected a video with at least 4 dimensions ([..., T, C, H, W]), got {tensor.ndim}")
+        return tensor.as_subclass(cls)
+
+    def __repr__(self, *, tensor_contents: Any = None) -> str:  # type: ignore[override]
+        return self._make_repr()
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/_C/libtriton/linear_layout.pyi b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/_C/libtriton/linear_layout.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..e1b4599dd024fb96de12cb14b3eecd16bf1dbf64
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/_C/libtriton/linear_layout.pyi
@@ -0,0 +1,80 @@
+from __future__ import annotations
+
+from typing import List, Optional, Sequence, Tuple
+
+
+class LinearLayout:
+    def __init__(self) -> None: ...
+
+    @staticmethod
+    def identity_1d(size: int, inDim: str, outDim: str) -> LinearLayout: ...
+
+    @staticmethod
+    def strided_1d(
+        size: int, stride: int, inDim: str, outDim: str
+    ) -> LinearLayout: ...
+
+    @staticmethod
+    def zeros_1d(
+        size: int, inDim: str, outDim: str, outDimSize: int
+    ) -> LinearLayout: ...
+
+    @staticmethod
+    def from_bases(
+        bases: Sequence[Tuple[str, Sequence[Sequence[int]]]],
+        out_dim_names: Sequence[str],
+        out_dim_sizes: Optional[Sequence[int]] = ...,
+        require_surjective: bool = ...,
+    ) -> LinearLayout: ...
+
+    def compose(self, other: LinearLayout) -> LinearLayout: ...
+
+    def invert_and_compose(self, other: LinearLayout) -> LinearLayout: ...
+
+    def invert(self) -> LinearLayout: ...
+
+    def pseudoinvert(self) -> LinearLayout: ...
+
+    def is_surjective(self) -> bool: ...
+
+    def is_injective(self) -> bool: ...
+
+    def is_invertible(self) -> bool: ...
+
+    def get_in_dim_names(self) -> List[str]: ...
+
+    def get_out_dim_names(self) -> List[str]: ...
+
+    @property
+    def bases(self) -> List[Tuple[str, List[List[int]]]]: ...
+
+    @property
+    def out_dims(self) -> List[Tuple[str, int]]: ...
+
+    @property
+    def num_in_dims(self) -> int: ...
+
+    @property
+    def num_out_dims(self) -> int: ...
+
+    def __mul__(self, other: LinearLayout) -> LinearLayout: ...
+
+    def __imul__(self, other: LinearLayout) -> LinearLayout: ...
+
+    def get_shared_view(self, useHWPointOfView: bool) -> str: ...
+
+    def get_distributed_view(self, useHWPointOfView: bool) -> str: ...
+
+    def get_matrix_view(self) -> List[List[int]]: ...
+
+    def apply(
+        self, inputs: Sequence[Tuple[str, int]]
+    ) -> List[Tuple[str, int]]: ...
+
+    def __eq__(self, other: object) -> bool: ...
+
+    def __ne__(self, other: object) -> bool: ...
+
+    def __repr__(self) -> str: ...
+
+    def __str__(self) -> str: ...
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9070c185be0588bdf929fcdc13a78ac2da04678f
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/__pycache__/_filecheck.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/__pycache__/_filecheck.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..076ff3ee4cd54696a4865d135f182ad4fc6532b7
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/__pycache__/_filecheck.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/__pycache__/_internal_testing.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/__pycache__/_internal_testing.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c067cf97ef2596bd05e3f627feb1d15d389f60d4
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/__pycache__/_internal_testing.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/__pycache__/_utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/__pycache__/_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..99c070362329036f2bfd072c68c5e401ce1df0d0
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/__pycache__/_utils.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/__pycache__/errors.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/__pycache__/errors.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1d106d07a5f56b19b4f4ee44ed14a664f040ecb2
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/__pycache__/errors.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/__pycache__/knobs.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/__pycache__/knobs.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..33f34d4de03d8d61cf8c0331f39c3f0efb2d59a6
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/__pycache__/knobs.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/__pycache__/testing.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/__pycache__/testing.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..73daaa5dbc226f399e769b1527ff464a6a8f180b
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/__pycache__/testing.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..97092379e02dfd25611fc049744ecfc493d76c3d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/__init__.py
@@ -0,0 +1,66 @@
+import importlib
+import os
+import inspect
+import sys
+from dataclasses import dataclass
+from typing import Type, TypeVar, Union
+from types import ModuleType
+from .driver import DriverBase
+from .compiler import BaseBackend
+
+if sys.version_info >= (3, 10):
+    from importlib.metadata import entry_points
+else:
+    from importlib_metadata import entry_points
+
+T = TypeVar("T", bound=Union[BaseBackend, DriverBase])
+
+
+def _find_concrete_subclasses(module: ModuleType, base_class: Type[T]) -> Type[T]:
+    # Exactly one non-abstract subclass of `base_class` must be reachable as an attribute of `module`.
+    found: list[Type[T]] = [
+        obj for obj in (getattr(module, attr_name) for attr_name in dir(module))
+        if isinstance(obj, type) and issubclass(obj, base_class) and not inspect.isabstract(obj)
+    ]
+    if not found:
+        raise RuntimeError(f"Found 0 concrete subclasses of {base_class} in {module}: {found}")
+    if len(found) > 1:
+        raise RuntimeError(f"Found >1 concrete subclasses of {base_class} in {module}: {found}")
+    return found[0]
+
+
+@dataclass(frozen=True)
+class Backend:
+    compiler: Type[BaseBackend]
+    driver: Type[DriverBase]
+
+
+def _discover_backends() -> dict[str, Backend]:
+    discovered: dict[str, Backend] = {}
+    # Fast path: when TRITON_BACKENDS_IN_TREE=1, bypass entry point discovery (which can be slow)
+    # and register only the in-tree backends living under the `triton.backends` namespace.
+    in_tree_only = os.environ.get("TRITON_BACKENDS_IN_TREE", "") == "1"
+
+    if in_tree_only:
+        pkg_dir = os.path.dirname(__file__)
+        for entry in os.listdir(pkg_dir):
+            # Only sub-directories are candidates; names starting with '__' are ignored.
+            if entry.startswith('__') or not os.path.isdir(os.path.join(pkg_dir, entry)):
+                continue
+            compiler_mod = importlib.import_module(f"triton.backends.{entry}.compiler")
+            driver_mod = importlib.import_module(f"triton.backends.{entry}.driver")
+            discovered[entry] = Backend(_find_concrete_subclasses(compiler_mod, BaseBackend),
+                                        _find_concrete_subclasses(driver_mod, DriverBase))
+        return discovered
+
+    # Default path: discover via entry points for out-of-tree/downstream plugins.
+    for ep in entry_points().select(group="triton.backends"):
+        module_prefix = ep.value
+        compiler_mod = importlib.import_module(f"{module_prefix}.compiler")
+        driver_mod = importlib.import_module(f"{module_prefix}.driver")
+        discovered[ep.name] = Backend(_find_concrete_subclasses(compiler_mod, BaseBackend),  # type: ignore
+                                      _find_concrete_subclasses(driver_mod, DriverBase))  # type: ignore
+    return discovered
+
+
+backends: dict[str, Backend] = _discover_backends()
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..70cff61a211949a07bd04260fa088daa1a4f153f
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/__pycache__/compiler.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/__pycache__/compiler.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4031cabc0eae2e2d7a7dda9186f4a5c53754e0cb
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/__pycache__/compiler.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/__pycache__/driver.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/__pycache__/driver.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e3209f4186e8191b62b327c0be49836d4281fd5c
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/__pycache__/driver.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..050346f3e4f4cf47dff34cca49d94b593996c1cc
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/__pycache__/compiler.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/__pycache__/compiler.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3ebac3103a8a77af9dc267b7daca37cc1f17849c
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/__pycache__/compiler.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/__pycache__/driver.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/__pycache__/driver.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..239347aebbd836b86cf6cc386a2aa766ad05d732
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/__pycache__/driver.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/compiler.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/compiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..887802333d8d385c2ba42e5a2753e61bcb6e9b59
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/compiler.py
@@ -0,0 +1,495 @@
+from triton.backends.compiler import BaseBackend, GPUTarget, Language
+from triton._C.libtriton import ir, passes, llvm, amd
+from triton import knobs
+from dataclasses import dataclass
+from typing import Any, Dict, Tuple
+from types import ModuleType
+import hashlib
+import tempfile
+import re
+import functools
+import warnings
+from pathlib import Path
+
+
+def get_min_dot_size(target: GPUTarget):
+    # We fall back to FMA and cast the arguments if a certain configuration is
+    # not supported natively by matrix core units.
+    return lambda lhs_type, rhs_type: (1, 1, 1)
+
+
+def is_pingpong_schedule_enabled(arch, use_async_copy):
+    return (arch == "gfx942" or (arch == "gfx950" and use_async_copy is True)
+            ) if knobs.amd.use_block_pingpong is None else knobs.amd.use_block_pingpong
+
+
+def is_in_thread_transpose_enabled(arch):
+    return (arch == "gfx942") if knobs.amd.use_in_thread_transpose is None else knobs.amd.use_in_thread_transpose
+
+
+@dataclass(frozen=True)
+class HIPOptions:
+    num_warps: int = 4
+    waves_per_eu: int = 0
+    num_stages: int = 2
+    num_ctas: int = 1
+    extern_libs: dict = None
+    debug: bool = False
+    sanitize_overflow: bool = True
+    arch: str = None
+    # We have native support for OCP fp8 variants since CDNA4/RDNA4. For earlier generations,
+    # we software emulate the support for them.
+    # UZ fp8 variants (fp8e4b8 and fp8e5b16) are natively supported for CDNA3. For other
+    # architectures they are software emulated.
+    supported_fp8_dtypes: Tuple[str] = ("fp8e4nv", "fp8e5", "fp8e5b16", "fp8e4b8")
+    deprecated_fp8_dot_operand_dtypes: Tuple[str] = ()
+    default_dot_input_precision: str = "ieee"
+    allowed_dot_input_precisions: Tuple[str] = ("ieee", 'bf16x3', 'bf16x6')
+    enable_fp_fusion: bool = True
+    launch_cooperative_grid: bool = False
+    matrix_instr_nonkdim: int = 0
+    kpack: int = 1
+    allow_flush_denorm: bool = False
+    max_num_imprecise_acc_default: int = 0
+    backend_name: str = 'hip'
+    instrumentation_mode: str = ""
+
+    # The following option provides hints to the AMDGPU backend regarding instruction scheduling
+    # for all `tt.dot` operations in a kernel. The "none" variant preserves the default
+    # instruction scheduling of the AMDGPU backend which aims at maximizing occupancy.
+    # The option is experimental and may change at any time regarding its semantics and/or may
+    # be gone entirely anytime.
+    #
+    # Current experimental scheduling variants:
+    #
+    # attention: enables a bunch of optimizations for attention kernels, including:
+    #            - iglp 2 and sched.barrier around it
+    #            - sink-insts-to-avoid-spills flag to avoid register spills
+    # memory-bound-attention: enables custom scheduling strategy in llvm backend,
+    #            This option targets special FA variant, which is memory bound and
+    #            has a lot of elementwise operations from fused operand dequantizations.
+    #            Note that this option is highly experimental,
+    #            and will be removed as soon as the default scheduler algorithm is fixed.
+    #
+    # Option allows to set multiple variants divided by commas:
+    # schedule_hint="attention,memory-bound-attention"
+    schedule_hint: str = 'none'
+
+    def __post_init__(self):
+        gfx_major = int(self.arch[3:-2])  # Drop "gfx" prefix and minor/patch number
+        warp_size = 32 if gfx_major >= 10 else 64
+        object.__setattr__(self, 'warp_size', warp_size)
+        assert self.num_warps > 0 and (self.num_warps & (self.num_warps - 1)) == 0, \
+            "num_warps must be a power of 2"
+
+        if (self.arch == 'gfx950') and (self.kpack != 1):
+            warnings.warn(
+                f"kpack is deprecated starting from gfx950 and will be removed in later releases. So for now kpack = {self.kpack} will be overwritten to 1 to make transitioning easier."
+            )
+            object.__setattr__(self, 'kpack', 1)
+
+        default_libdir = Path(__file__).parent / 'lib'
+        extern_libs = {} if self.extern_libs is None else dict(self.extern_libs)
+        for lib in ["ocml", "ockl"]:
+            extern_libs[lib] = str(default_libdir / f'{lib}.bc')
+        object.__setattr__(self, 'extern_libs', tuple(extern_libs.items()))
+
+    def hash(self):
+        key = '_'.join([f'{name}-{val}' for name, val in self.__dict__.items()])
+        return hashlib.sha256(key.encode("utf-8")).hexdigest()
+
+
+class HIPBackend(BaseBackend):
+    instrumentation = None
+    supports_native_tensor_specialization = False
+
+    @staticmethod
+    def supports_target(target: GPUTarget):
+        return target.backend == 'hip'
+
+    def __init__(self, target: GPUTarget) -> None:
+        super().__init__(target)
+        assert isinstance(target.arch, str)
+        self.binary_ext = "hsaco"
+
+    def get_target_name(self, options) -> str:
+        return f"hip:{options.arch}"
+
+    def parse_options(self, opts) -> Any:
+        args = {'arch': knobs.runtime.override_arch or self.target.arch}
+
+        if opts.get("num_ctas", 1) > 1 and not amd.supports_multi_cta_launch(self.target.arch):
+            raise ValueError(f"num_ctas > 1 not supported on {self.target.arch}")
+
+        # Enable XF32 (TF32) for CDNA3 GPUs
+        if self.target.arch == 'gfx942':
+            allowed_dot_input_precisions = set(HIPOptions.allowed_dot_input_precisions)
+            allowed_dot_input_precisions.update({'tf32'})
+            args["allowed_dot_input_precisions"] = tuple(sorted(allowed_dot_input_precisions))
+
+        if "supported_fp8_dtypes" not in opts:
+            args["supported_fp8_dtypes"] = tuple(sorted(HIPOptions.supported_fp8_dtypes))
+
+        if self.target.arch == 'gfx950':
+            deprecated_fp8_dot_operand_dtypes = set(HIPOptions.deprecated_fp8_dot_operand_dtypes)
+            deprecated_fp8_dot_operand_dtypes.update({"fp8e5b16", "fp8e4b8"})
+            args["deprecated_fp8_dot_operand_dtypes"] = tuple(sorted(deprecated_fp8_dot_operand_dtypes))
+
+        if "enable_fp_fusion" not in opts:
+            args["enable_fp_fusion"] = knobs.language.default_fp_fusion
+        args.update({k: opts[k] for k in HIPOptions.__dataclass_fields__.keys() if k in opts and opts[k] is not None})
+        return HIPOptions(**args)
+
+    def pack_metadata(self, metadata):
+        return (
+            metadata.num_warps,
+            metadata.num_ctas,
+            metadata.shared,
+        )
+
+    def get_codegen_implementation(self, options):
+        return {"min_dot_size": get_min_dot_size(self.target)}
+
+    def get_module_map(self) -> Dict[str, ModuleType]:
+        from triton.language.extra.hip import libdevice
+
+        return {"triton.language.extra.libdevice": libdevice}
+
+    def load_dialects(self, ctx):
+        amd.load_dialects(ctx)
+        if HIPBackend.instrumentation:
+            HIPBackend.instrumentation.load_dialects(ctx)
+
+    @staticmethod
+    def is_within_2gb(arg):
+        import torch
+
+        MAX_INT_32 = 2**31 - 1
+        if hasattr(arg, "ptr_range"):
+            return arg.ptr_range() <= MAX_INT_32
+        if isinstance(arg, torch.Tensor) and hasattr(arg, "untyped_storage"):
+            return arg.untyped_storage().size() <= MAX_INT_32
+        return False
+
+    @staticmethod
+    def parse_attr(desc):
+        ret = BaseBackend.parse_attr(desc)
+        if "S" in desc:
+            ret += [["tt.pointer_range", 32]]
+        return ret
+
+    @staticmethod
+    def get_tensor_specialization(arg, **kwargs):
+        ret = BaseBackend.get_tensor_specialization(arg, **kwargs)
+        if knobs.amd.use_buffer_ops and HIPBackend.is_within_2gb(arg):
+            ret += "S"
+        return ret
+
+    @staticmethod
+    def make_ttir(mod, metadata, options):
+        """Run the TTIR pass pipeline on ``mod`` and return the same ``mod``.
+
+        All passes are registered on one pass manager, in the order written
+        below, and executed under the name ``'make_ttir'``. ``metadata`` and
+        ``options`` are accepted but not read by this stage.
+        """
+        pm = ir.pass_manager(mod.context)
+        pm.enable_debug()
+        passes.common.add_inliner(pm)
+        passes.ttir.add_rewrite_tensor_pointer(pm)
+        passes.ttir.add_rewrite_tensor_descriptor_to_pointer(pm)
+        passes.common.add_canonicalizer(pm)
+        passes.ttir.add_combine(pm)
+        passes.ttir.add_reorder_broadcast(pm)
+        passes.common.add_cse(pm)
+        passes.ttir.add_triton_licm(pm)
+        passes.common.add_symbol_dce(pm)
+        passes.ttir.add_loop_unroll(pm)
+        pm.run(mod, 'make_ttir')
+        return mod
+
+    @staticmethod
+    def make_ttgir(mod, metadata, options):
+        """Convert ``mod`` to TritonGPU IR, optimize it, and return the same ``mod``.
+
+        Two pass managers are run back to back: the first only performs the
+        conversion (run name ``'make_ttgir_early'``), the second carries the
+        optimization pipeline (run name ``'make_ttgir'``). Several passes are
+        gated on ``knobs.amd`` settings and on ``options``. Afterwards
+        ``metadata["tensordesc_meta"]`` is set from
+        ``mod.get_tensordesc_metadata()``.
+        """
+        # Stage 1: conversion to TritonGPU IR, run on its own pass manager.
+        pm = ir.pass_manager(mod.context)
+        pm.enable_debug()
+        passes.ttir.add_convert_to_ttgpuir(pm, f"hip:{options.arch}", options.num_warps, options.warp_size,
+                                           options.num_ctas)
+        pm.run(mod, 'make_ttgir_early')
+        # Stage 2: a fresh pass manager for the optimization pipeline.
+        pm = ir.pass_manager(mod.context)
+        pm.enable_debug()
+        # Passed as the second argument of add_f32_dot_tc below; always False here.
+        emuTF32 = False
+        passes.ttgpuir.add_coalesce(pm)
+        passes.ttgpuir.add_f32_dot_tc(pm, emuTF32)
+        passes.ttgpuir.add_remove_layout_conversions(pm)
+        passes.ttgpuir.add_optimize_thread_locality(pm)
+        amd.passes.ttgpuir.add_accelerate_matmul(pm, options.arch, options.matrix_instr_nonkdim, options.kpack)
+        passes.ttgpuir.add_remove_layout_conversions(pm)
+        amd.passes.ttgpuir.add_optimize_epilogue(pm)
+        amd.passes.ttgpuir.add_optimize_dot_operands(pm, options.arch)
+        amd.passes.ttgpuir.add_hoist_layout_conversions(pm)
+
+        passes.ttgpuir.add_fuse_nested_loops(pm)
+        passes.common.add_canonicalizer(pm)
+        passes.ttir.add_triton_licm(pm)
+        passes.common.add_canonicalizer(pm)
+
+        # Both flags feed the pipeline pass; ping-pong is also re-checked further down.
+        use_async_copy = knobs.amd.use_async_copy
+        use_block_pingpong = is_pingpong_schedule_enabled(options.arch, use_async_copy)
+
+        amd.passes.ttgpuir.add_schedule_loops(pm, options.num_stages)
+        amd.passes.ttgpuir.add_pipeline(pm, use_async_copy, use_block_pingpong)
+        if use_async_copy:
+            amd.passes.ttgpuir.add_coalesce_async_copy(pm, options.arch)
+        passes.common.add_canonicalizer(pm)
+        # schedule_hint is a comma-separated list; the literal "none" (any case) disables it.
+        if options.schedule_hint.lower() != "none":
+            for hint in options.schedule_hint.split(","):
+                amd.passes.ttgpuir.insert_instruction_sched_hints(pm, hint)
+        passes.ttgpuir.add_remove_layout_conversions(pm)
+        passes.ttgpuir.add_reduce_data_duplication(pm)
+        if is_in_thread_transpose_enabled(options.arch):
+            amd.passes.ttgpuir.add_in_thread_transpose(pm)
+            passes.ttgpuir.add_remove_layout_conversions(pm)
+        amd.passes.ttgpuir.add_reorder_instructions(pm)
+        # The ping-pong pass is only added when more than one stage is requested.
+        if use_block_pingpong and options.num_stages > 1:
+            amd.passes.ttgpuir.add_block_pingpong(pm, options.num_stages)
+
+        if knobs.amd.use_buffer_ops:
+            amd.passes.ttgpuir.add_canonicalize_pointers(pm)
+            passes.common.add_canonicalizer(pm)
+            amd.passes.ttgpuir.add_convert_to_buffer_ops(
+                pm,
+                options.arch,
+                knobs.amd.use_buffer_atomics,
+                knobs.amd.buffer_ops_analyze_small_tensor_range,
+            )
+
+        amd.passes.ttgpuir.add_fold_true_cmpi(pm)
+        passes.common.add_canonicalizer(pm)
+        passes.common.add_cse(pm)
+        passes.common.add_symbol_dce(pm)
+        pm.run(mod, 'make_ttgir')
+        metadata["tensordesc_meta"] = mod.get_tensordesc_metadata()
+        return mod
+
+    @staticmethod
+    def gluon_to_ttgir(src, metadata, options):
+        """Run the Gluon pass pipeline on ``src`` and return the same module.
+
+        The passes are executed under the run name ``'gluon_to_ttgir'``.
+        Afterwards ``metadata["tensordesc_meta"]`` is set from
+        ``mod.get_tensordesc_metadata()``. ``options`` is not read here.
+        """
+        mod = src
+        pm = ir.pass_manager(mod.context)
+        pm.enable_debug()
+
+        passes.gluon.add_inliner(pm)
+        passes.gluon.add_resolve_auto_encodings(pm)
+        passes.common.add_sccp(pm)
+        passes.ttir.add_loop_aware_cse(pm)
+        passes.gluon.add_canonicalizer(pm)
+        passes.ttgpuir.add_combine_tensor_select_and_if(pm)
+
+        pm.run(mod, 'gluon_to_ttgir')
+        metadata["tensordesc_meta"] = mod.get_tensordesc_metadata()
+        return mod
+
+    @staticmethod
+    def make_llir(src, metadata, options):
+        """Lower a TritonGPU module to LLVM IR and return it as a string.
+
+        Steps, in order:
+          1. Run the MLIR lowering pipeline (run name ``'make_llir'``), plus
+             optional separate debug-info pipelines.
+          2. Translate to an LLVM module and attach target triple, data
+             layout and the control constants set below.
+          3. Set kernel attributes on the first defined function.
+          4. Link external bitcode libraries when required, then optimize
+             the module at O3.
+          5. Record ``"shared"``, ``"profile_scratch_size"`` and
+             ``"profile_scratch_align"`` in ``metadata``.
+
+        Returns ``str(llvm_mod)``.
+        """
+        mod = src
+        # TritonGPU -> LLVM-IR (MLIR)
+        pm = ir.pass_manager(mod.context)
+        pm.enable_debug()
+        amd.passes.ttgpuir.add_update_async_wait_count(pm, options.arch)
+        # custom_lds_size is an experimental parameter that defines amount of LDS available
+        # for one thread block. Measured in bytes.
+        #
+        # If custom_lds_size = 0, pass will consider all LDS is available for one threads block,
+        # LDS size is determined by provided arch name.
+        custom_lds_size = 0
+        amd.passes.ttgpuir.add_optimize_lds_usage(pm, options.arch, custom_lds_size)
+        passes.convert.add_scf_to_cf(pm)
+        passes.gluon.add_inliner(pm)
+        passes.convert.add_index_to_llvmir(pm)
+
+        amd.passes.ttgpuir.add_allocate_shared_memory(pm)
+        # instrumentation point here so we can override IRs above (e.g., ttir and ttgir)
+        if HIPBackend.instrumentation:
+            HIPBackend.instrumentation.patch("ttgpuir_to_llvmir", pm, mod.context)
+        ## __HIP_FTZ is used to control the denorm flushing behavior of exp2 op as follows:
+        ## 1. If __HIP_FTZ = 1, exp2 flushes denorms in input and output regardless
+        ##    of the value of kernel arg `allow_flush_denorm`.
+        ## 2. If __HIP_FTZ = 0, whether exp2 flushes denorms in input and output
+        ##    depends on the value of kernel arg `allow_flush_denorm`.
+        ## 3. __HIP_FTZ is default to 1 and not exposed as a kernel argument.
+        ##    For now it is used as a controller for developers only.
+        __HIP_FTZ = True
+        amd.passes.ttgpuir.add_to_llvmir(pm, options.arch, __HIP_FTZ)
+        passes.common.add_canonicalizer(pm)
+        passes.common.add_cse(pm)
+
+        passes.convert.add_cf_to_llvmir(pm)
+        passes.convert.add_arith_to_llvmir(pm)
+        passes.common.add_canonicalizer(pm)
+        passes.common.add_cse(pm)
+        passes.common.add_symbol_dce(pm)
+
+        # The literal "none" (any case) disables schedule-hint lowering.
+        if options.schedule_hint.lower() != "none":
+            amd.passes.ttgpuir.lower_instruction_sched_hints(pm, options.arch, options.num_stages)
+
+        # This can not be moved below the di_scope pass
+        if HIPBackend.instrumentation:
+            HIPBackend.instrumentation.patch("llvmir_to_llvm", pm, mod.context)
+
+        # When local-variable extraction is requested, di_scope is run in a
+        # separate pass manager after this pipeline instead (see below).
+        if not knobs.compilation.disable_line_info and not knobs.compilation.dump_ir_extract_di_local_variables:
+            passes.llvmir.add_di_scope(pm)
+
+        amd.passes.ttgpuir.add_builtin_func_to_llvmir(pm, __HIP_FTZ)
+        pm.run(mod, 'make_llir')
+
+        if knobs.compilation.dump_ir_extract_di_local_variables:
+            # comments below on why separate it
+            if not knobs.compilation.disable_line_info:
+                pm = ir.pass_manager(mod.context)
+                pm.enable_debug()
+                passes.llvmir.add_di_scope(pm)
+                pm.run(mod, 'make_llir.disable_line_info')
+
+            # Insert debug intrinsics carrying several DI attributes, including
+            # the source variable name and type info. Note: for reasons not yet
+            # understood, this pass and add_di_scope have to be run separately;
+            # putting them into the previous pipeline triggers a segmentation
+            # fault without any error message. It could be due to a bug in
+            # MLIR or pybind11.
+            pm = ir.pass_manager(mod.context)
+            pm.enable_debug()
+            passes.llvmir.add_di_local_variable(pm)
+            pm.run(mod, 'make_llir.dump_ir_extract_di_local_variables')
+
+        # LLVM-IR (MLIR) -> LLVM-IR (LLVM)
+        llvm.init_targets()
+        context = llvm.context()
+        llvm_mod = llvm.to_module(mod, context)
+        amd.attach_target_triple(llvm_mod)
+        # '+xnack' is requested only when address-sanitizer support is enabled.
+        target_features = ''
+        if knobs.compilation.enable_asan:
+            target_features = '+xnack'
+        llvm.attach_datalayout(llvm_mod, amd.TARGET_TRIPLE, options.arch, target_features)
+
+        # Set various control constants on the LLVM module so that device
+        # libraries can resolve references to them.
+        amd.set_isa_version(llvm_mod, options.arch)
+        amd.set_abi_version(llvm_mod, 500)
+        amd.set_bool_control_constant(llvm_mod, "__oclc_finite_only_opt", False)
+        amd.set_bool_control_constant(llvm_mod, "__oclc_correctly_rounded_sqrt32", True)
+        amd.set_bool_control_constant(llvm_mod, "__oclc_unsafe_math_opt", False)
+        amd.set_bool_control_constant(llvm_mod, "__oclc_wavefrontsize64", options.warp_size == 64)
+
+        # Set kernel attributes first given this may affect later optimizations.
+        fns = [fn for fn in llvm_mod.get_functions() if not fn.is_declaration()]
+        # The public kernel should be kernel 0.
+        fns[0].set_calling_conv(amd.CALLING_CONV_AMDGPU_KERNEL)
+        fns[0].add_fn_attr("amdgpu-flat-work-group-size", f"1,{options.num_warps*options.warp_size}")
+        if "memory-bound-attention" in options.schedule_hint.split(','):
+            fns[0].add_fn_attr("amdgpu-sched-strategy", "iterative-ilp")
+        fns[0].add_fn_attr("uniform-work-group-size", "true")
+        # LLVM AMDGPU backend supports the attribute "amdgpu-waves-per-eu"="<min>[, <max>]".
+        # This attribute may be attached to a kernel function definition and is an optimization hint.
+        # <min> parameter specifies the requested minimum number of waves per EU, and optional <max> parameter
+        # specifies the requested maximum number of waves per EU (must be >= <min> if specified).
+        # If <max> is omitted, then there is no restriction on the maximum number of waves per EU other than
+        # the one dictated by the hardware for which the kernel is compiled. Passing 0, 0 as <min>, <max>
+        # implies the default behavior (no limits).
+        # Specifying N, N forces LLVM to focus on a single register count, simplifies some heuristics
+        # and may improve scheduling.
+        fns[0].add_fn_attr("amdgpu-waves-per-eu", f"{options.waves_per_eu}, {options.waves_per_eu}")
+        denormal_mode = "preserve-sign" if options.allow_flush_denorm else "ieee"
+        fns[0].add_fn_attr("denormal-fp-math-f32", denormal_mode)
+        if knobs.compilation.enable_asan:
+            fns[0].add_fn_target_feature("+xnack")
+            fns[0].add_fn_asan_attr()
+
+        # Hint the compiler that we'd like the firmware to set the kernel arguments
+        # to user SGPRs so that the kernel does not need to s_load its arguments
+        # from memory.
+        amd.set_all_fn_arg_inreg(fns[0])
+
+        # With ASAN enabled, a fixed set of bitcode files from the 'lib'
+        # directory next to this file is linked; otherwise only those
+        # user-provided extern libs that amd.need_extern_lib() reports as needed.
+        if knobs.compilation.enable_asan:
+            default_libdir = Path(__file__).parent / 'lib'
+            paths = [
+                str(default_libdir / 'asanrtl.bc'),
+                str(default_libdir / "ocml.bc"),
+                str(default_libdir / "ockl.bc")
+            ]
+            llvm.link_extern_libs(llvm_mod, paths)
+        elif options.extern_libs:
+            paths = [path for (name, path) in options.extern_libs if amd.need_extern_lib(llvm_mod, name)]
+            if len(paths) > 0:
+                llvm.link_extern_libs(llvm_mod, paths)
+
+        llvm.optimize_module(llvm_mod, llvm.OPTIMIZE_O3, options.arch, '', [], options.enable_fp_fusion)
+
+        # Architectures with architected SGPRs store the workgroup id in ttmp9 (X) and ttmp7 (Y[15:0], Z[31:16]).
+        # These attributes are used to determine if Z should be masked out when loading Y. They are inferred during
+        # optimize_module from calls to @llvm.amdgcn.workgroup.id.x/y/z(). We cannot rely on this because a
+        # dispatch dimensions might be used even if there is no program_id() call for it.
+        if amd.has_architected_sgprs(options.arch):
+            fns[0].remove_fn_attr("amdgpu-no-workgroup-id-x")
+            fns[0].remove_fn_attr("amdgpu-no-workgroup-id-y")
+            fns[0].remove_fn_attr("amdgpu-no-workgroup-id-z")
+
+        if knobs.amd.scalarize_packed_fops:
+            amd.add_scalarize_packed_fops_llvm_pass(fns[0])
+
+        # Get some metadata
+        # The profile-scratch attributes fall back to 0 / 1 when the lookup is falsy.
+        metadata["shared"] = src.get_int_attr("ttg.shared")
+        metadata["profile_scratch_size"] = src.get_int_attr("ttg.profile_scratch_memory_size") or 0
+        metadata["profile_scratch_align"] = src.get_int_attr("ttg.profile_scratch_memory_alignment") or 1
+
+        amd.cleanup_bitcode_metadata(llvm_mod)
+        # Disable inlining of print related functions,
+        # because inlining of these function could slow down compilation significantly
+        amd.disable_print_inline(llvm_mod)
+        return str(llvm_mod)
+
+    @staticmethod
+    def make_amdgcn(src, metadata, options):
+        # Find kernel names (there should only be one)
+        # We get the name at the last possible step to accommodate `triton.compile`
+        # on user-provided LLVM
+        names = re.findall(r"define amdgpu_kernel void @([a-zA-Z_][a-zA-Z0-9_]*)", src)
+        assert len(names) == 1
+        metadata["name"] = names[0]
+        # llvm -> hsaco
+        flags = []
+        features = '-real-true16' if 'gfx11' in options.arch else ''
+        ir_hash = hashlib.sha256(src.encode("utf-8")).hexdigest()
+        dump_file_id = names[0] + '_' + ir_hash
+        _ = llvm.translate_to_mir(src, amd.TARGET_TRIPLE, options.arch, features, flags, options.enable_fp_fusion,
+                                  dump_file_id)
+        llvm.dump_sched_dag(src, amd.TARGET_TRIPLE, options.arch, features, flags, options.enable_fp_fusion,
+                            dump_file_id)
+        amdgcn = llvm.translate_to_asm(src, amd.TARGET_TRIPLE, options.arch, features, flags, options.enable_fp_fusion,
+                                       False)
+        if knobs.amd.dump_amdgcn:
+            print("// -----// AMDGCN Dump //----- //")
+            print(amdgcn)
+        return amdgcn
+
+    @staticmethod
+    def make_hsaco(src, metadata, options):
+        target_features = ''
+        if knobs.compilation.enable_asan:
+            target_features = '+xnack'
+        hsaco = amd.assemble_amdgcn(src, options.arch, target_features)
+        with tempfile.NamedTemporaryFile() as tmp_out:
+            with tempfile.NamedTemporaryFile() as tmp_in:
+                with open(tmp_in.name, "wb") as fd_in:
+                    fd_in.write(hsaco)
+                amd.link_hsaco(tmp_in.name, tmp_out.name)
+            with open(tmp_out.name, "rb") as fd_out:
+                ret = fd_out.read()
+        return ret
+
+    def add_stages(self, stages, options, language):
+        if language == Language.TRITON:
+            stages["ttir"] = lambda src, metadata: self.make_ttir(src, metadata, options)
+            stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options)
+        elif language == Language.GLUON:
+            stages["ttgir"] = lambda src, metadata: self.gluon_to_ttgir(src, metadata, options)
+        stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options)
+        stages["amdgcn"] = lambda src, metadata: self.make_amdgcn(src, metadata, options)
+        stages["hsaco"] = lambda src, metadata: self.make_hsaco(src, metadata, options)
+        if knobs.runtime.add_stages_inspection_hook is not None:
+            knobs.runtime.add_stages_inspection_hook(self, stages, options, language, None)
+
+    @functools.lru_cache()
+    def hash(self):
+        return f'{self.target}'
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/driver.c b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/driver.c
new file mode 100644
index 0000000000000000000000000000000000000000..24178b54c319bb278bf014570128887315d35827
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/driver.c
@@ -0,0 +1,504 @@
+#define __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <dlfcn.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// Host-side TDM descriptor: twelve 32-bit words, split into "group0"
+// (4 dwords) and "group1" (8 dwords). The bit layout of the words that are
+// actually filled in is documented in encodeTDMDescriptor().
+typedef struct {
+  uint32_t group0_0;
+  uint32_t group0_1;
+  uint32_t group0_2;
+  uint32_t group0_3;
+  uint32_t group1_0;
+  uint32_t group1_1;
+  uint32_t group1_2;
+  uint32_t group1_3;
+  uint32_t group1_4;
+  uint32_t group1_5;
+  uint32_t group1_6;
+  uint32_t group1_7;
+} TDMDescriptor;
+
+// Python object that stores a TDMDescriptor by value.
+typedef struct {
+  // PyObject_HEAD expands to a complete member declaration, so it must not be
+  // followed by a semicolon (a stray one leaves an empty declaration in the
+  // struct, which some compilers reject).
+  PyObject_HEAD
+  TDMDescriptor desc;
+} PyTDMDescriptorObject;
+
+// tp_new slot for PyTDMDescriptor: allocates the object through the type's
+// tp_alloc and zero-fills the embedded descriptor. `args` and `kw` are
+// ignored. Returns NULL when the allocation fails.
+static PyObject *PyTDMDescriptor_new(PyTypeObject *type, PyObject *args,
+                                     PyObject *kw) {
+  PyObject *obj = type->tp_alloc(type, 0);
+  if (obj == NULL) {
+    return NULL;
+  }
+
+  TDMDescriptor *desc = &((PyTDMDescriptorObject *)obj)->desc;
+  memset(desc, 0, sizeof(*desc));
+  return obj;
+}
+
+// tp_dealloc slot: hands the object back to the tp_free of its type.
+static void PyTDMDescriptor_dealloc(PyTDMDescriptorObject *self) {
+  freefunc release = Py_TYPE(self)->tp_free;
+  release((PyObject *)self);
+}
+
+// Type object for PyTDMDescriptorObject. Only allocation (tp_new) and
+// deallocation (tp_dealloc) are customized; no methods or members are
+// exposed to Python here.
+static PyTypeObject PyTDMDescriptorType = {
+    PyVarObject_HEAD_INIT(NULL, 0).tp_name =
+        "triton.backends.amd.PyTDMDescriptor",
+    .tp_basicsize = sizeof(PyTDMDescriptorObject),
+    .tp_itemsize = 0,
+    .tp_flags = Py_TPFLAGS_DEFAULT,
+    .tp_doc = "PyObject for TDMDescriptor",
+    .tp_new = PyTDMDescriptor_new,
+    .tp_dealloc = (destructor)PyTDMDescriptor_dealloc,
+};
+
+// TODO: Both host-side and device-side TDM descriptor follow the same encoding
+// format. Consider to add a common utility to remove duplicate code.
+//
+// Encodes a 2D TDM descriptor into `desc`.
+//
+//   desc            - output; only the words listed in the layouts below are
+//                     written, all other words are left as they were
+//   elementBitWidth - element width in bits; must cover at least one byte
+//   blockSize       - block shape, `rank` entries
+//   numWarps        - number of warps the block is distributed over
+//   padInterval     - padding interval in elements (0 disables padding)
+//   padAmount       - padding amount in elements (0 disables padding)
+//   shape, strides  - tensor shape and strides, `rank` entries each
+//   globalAddress   - global address of the tensor
+//   rank            - number of dimensions; only 2 is supported
+//
+// Returns true on success. Returns false, without writing to `desc`, when
+// the rank is not 2, the element is narrower than one byte, no warp can be
+// assigned to dim 0, or blockSize[1] is not divisible by the number of warps
+// assigned to dim 1.
+static bool encodeTDMDescriptor(TDMDescriptor *desc, int elementBitWidth,
+                                uint32_t *blockSize, int numWarps,
+                                int padInterval, int padAmount, uint32_t *shape,
+                                uint32_t *strides, uint64_t globalAddress,
+                                int rank) {
+  // NYI: TDM > 2D cases
+  if (rank != 2)
+    return false;
+
+  // The data-size field stores log2(element size in bytes); log2(0) is not
+  // representable, so sub-byte elements are rejected up front.
+  int elementSizeInBytes = elementBitWidth / 8;
+  if (elementSizeInBytes <= 0)
+    return false;
+
+  // Get warp distribution
+  uint32_t numWarpsDim0 = numWarps;
+  for (; numWarpsDim0 > blockSize[0]; numWarpsDim0 /= 2)
+    ;
+  // numWarpsDim0 reaches 0 when blockSize[0] or numWarps is 0; this must be
+  // checked before it is used as a divisor.
+  if (numWarpsDim0 == 0)
+    return false;
+  uint32_t numWarpsDim1 = numWarps / numWarpsDim0;
+  if (blockSize[1] % numWarpsDim1 != 0)
+    return false;
+
+  uint32_t blockSize0 = (blockSize[0] + numWarpsDim0 - 1) / numWarpsDim0;
+  uint32_t blockSize1 = (blockSize[1] + numWarpsDim1 - 1) / numWarpsDim1;
+
+  // group0 (128 bits / 4 dwords) effective bit encoding:
+  // [120:64]:  global address
+  // [127:126]: type - currently always set to 0x2
+  desc->group0_2 = (uint32_t)(globalAddress & 0xFFFFFFFF);
+  desc->group0_3 = (uint32_t)((globalAddress >> 32) & 0x01FFFFFF);
+  // Unsigned literal: shifting a signed 1 into bit 31 is undefined behavior.
+  desc->group0_3 |= (0x1u << 31);
+
+  // group1 (256 bits / 8 dwords) effective bit encoding:
+  // [17:16]:   data size - log2(element size in bytes)
+  // [20]:      enable padding
+  // [24:22]:   pad interval - log2(pad interval in dwords) - 1
+  // [31:25]:   pad amount - pad amount in dwords - 1
+  // [79:48]:   tensor shape dim inner
+  // [111:80]:  tensor shape dim outer
+  // [127:112]: block shape dim inner
+  // [143:128]: block shape dim outer
+  // [207:160]: tensor stride dim outer (we only use 32 bits)
+  int dataSize = log2(elementSizeInBytes);
+  desc->group1_0 = ((uint32_t)dataSize << 16);
+  int dwordSize = 32;
+  int padIntervalInDwords = padInterval * elementBitWidth / dwordSize;
+  int padAmountInDwords = padAmount * elementBitWidth / dwordSize;
+  if (padIntervalInDwords > 0 && padAmountInDwords > 0) {
+    int log2PadInterval = log2(padIntervalInDwords);
+    desc->group1_0 |= (1u << 20);
+    // Shift as unsigned so that a (log2 - 1) of -1 or a large pad amount
+    // cannot trigger signed-shift undefined behavior.
+    desc->group1_0 |= ((uint32_t)(log2PadInterval - 1) << 22);
+    desc->group1_0 |= ((uint32_t)(padAmountInDwords - 1) << 25);
+  }
+  // shape[1] is the inner dimension and straddles group1_1/group1_2;
+  // shape[0] is the outer dimension and straddles group1_2/group1_3.
+  desc->group1_1 = (shape[1] << 16);
+  desc->group1_2 = (shape[1] >> 16);
+  desc->group1_2 |= (shape[0] << 16);
+  desc->group1_3 = (shape[0] >> 16);
+  desc->group1_3 |= (blockSize1 << 16);
+  desc->group1_4 = (blockSize0 & 0xFFFF);
+  desc->group1_5 = strides[0];
+
+  return true;
+}
+
+// The list of paths to search for the HIP runtime library. The caller Python
+// code should substitute the search path placeholder.
+// NOTE(review): the placeholder below is presumably matched verbatim by that
+// substitution, so keep its text unchanged - confirm against the Python side.
+static const char *hipLibSearchPaths[] = {"/*py_libhip_search_path*/"};
+
+// The list of HIP dynamic library symbols and their signature we are interested
+// in this file.
+// |FOR_EACH_ERR_FN| is a macro to process APIs that return hipError_t;
+// |FOR_EACH_STR_FN| is a macro to process APIs that return const char *.
+//
+// The list is expanded twice in this file: once to declare the function
+// pointer fields of struct HIPSymbolTable, and once in initSymbolTable() to
+// resolve every symbol. Adding an entry here covers both places.
+#define HIP_SYMBOL_LIST(FOR_EACH_ERR_FN, FOR_EACH_STR_FN)                      \
+  FOR_EACH_STR_FN(hipGetErrorString, hipError_t hipError)                      \
+  FOR_EACH_ERR_FN(hipGetDeviceProperties, hipDeviceProp_t *prop, int deviceId) \
+  FOR_EACH_ERR_FN(hipModuleLoadDataEx, hipModule_t *module, const void *image, \
+                  unsigned int numOptions, hipJitOption *options,              \
+                  void **optionValues)                                         \
+  FOR_EACH_ERR_FN(hipModuleGetFunction, hipFunction_t *function,               \
+                  hipModule_t module, const char *kname)                       \
+  FOR_EACH_ERR_FN(hipFuncGetAttribute, int *, hipFunction_attribute attr,      \
+                  hipFunction_t function)
+
+// HIP driver version format: HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR *
+// 100000 + HIP_VERSION_PATCH.
+#define TRITON_HIP_DRIVER_EXTRACT_MAJOR_VERSION(version) ((version) / 10000000)
+#define TRITON_HIP_DRIVER_EXTRACT_MINOR_VERSION(version)                       \
+  (((version) % 10000000) / 100000)
+#define TRITON_HIP_DRIVER_EXTRACT_PATCH_VERSION(version) ((version) % 100000)
+// Minimum major version accepted by checkDriverVersion().
+#define TRITON_HIP_DRIVER_REQ_MAJOR_VERSION (6)
+
+// Uncomment the define below to print the detected libamdhip64 version.
+// TRITON_HIP_DRIVER_LOG_VERSION expects |msgBuff| to be a char array (it
+// applies sizeof to it); without the debug define it only marks its
+// arguments as used.
+// NOTE(review): both variants already end in ';' after `while (0)`, so a
+// call site that adds its own ';' produces an extra empty statement.
+// #define TRITON_HIP_DRIVER_DBG_VERSION
+#ifdef TRITON_HIP_DRIVER_DBG_VERSION
+#define TRITON_HIP_DRIVER_LOG_VERSION(version, msgBuff)                        \
+  do {                                                                         \
+    snprintf(msgBuff, sizeof(msgBuff), "libamdhip64 version is: %d.%d.%d",     \
+             TRITON_HIP_DRIVER_EXTRACT_MAJOR_VERSION(version),                 \
+             TRITON_HIP_DRIVER_EXTRACT_MINOR_VERSION(version),                 \
+             TRITON_HIP_DRIVER_EXTRACT_PATCH_VERSION(version));                \
+    printf("%s\n", msgBuff);                                                   \
+  } while (0);
+#else
+#define TRITON_HIP_DRIVER_LOG_VERSION(version, msgBuff)                        \
+  do {                                                                         \
+    (void)msgBuff;                                                             \
+    (void)(version);                                                           \
+  } while (0);
+#endif
+
+// Size in bytes of the stack buffers used to format error/log messages.
+#define TRITON_HIP_MSG_BUFF_SIZE (1024U)
+
+// The HIP symbol table for holding resolved dynamic library symbols.
+// It has one function pointer field per HIP_SYMBOL_LIST entry, named after
+// the HIP function: entries returning hipError_t are declared through
+// DEFINE_EACH_ERR_FIELD, entries returning const char * through
+// DEFINE_EACH_STR_FIELD.
+struct HIPSymbolTable {
+#define DEFINE_EACH_ERR_FIELD(hipSymbolName, ...)                              \
+  hipError_t (*hipSymbolName)(__VA_ARGS__);
+#define DEFINE_EACH_STR_FIELD(hipSymbolName, ...)                              \
+  const char *(*hipSymbolName)(__VA_ARGS__);
+
+  HIP_SYMBOL_LIST(DEFINE_EACH_ERR_FIELD, DEFINE_EACH_STR_FIELD)
+};
+
+// The single table instance; its fields are filled in by initSymbolTable().
+static struct HIPSymbolTable hipSymbolTable;
+
+// Looks up hipDriverGetVersion in `lib`, queries the driver version and
+// checks that its major component is at least
+// TRITON_HIP_DRIVER_REQ_MAJOR_VERSION.
+//
+// Returns the raw version number on success. On failure sets a Python
+// RuntimeError, closes `lib` and returns -1.
+static int checkDriverVersion(void *lib) {
+  typedef hipError_t (*hipDriverGetVersion_fn)(int *driverVersion);
+
+  dlerror(); // Clear existing errors
+  hipDriverGetVersion_fn getVersion =
+      (hipDriverGetVersion_fn)dlsym(lib, "hipDriverGetVersion");
+  if (dlerror() != NULL) {
+    PyErr_SetString(PyExc_RuntimeError,
+                    "cannot query 'hipDriverGetVersion' from libamdhip64.so");
+    dlclose(lib);
+    return -1;
+  }
+
+  // The status of the call is ignored; hipVersion keeps its initial -1 if
+  // the callee does not write it.
+  int hipVersion = -1;
+  (void)getVersion(&hipVersion);
+
+  char msgBuff[TRITON_HIP_MSG_BUFF_SIZE] = {0};
+  const int majorVersion = TRITON_HIP_DRIVER_EXTRACT_MAJOR_VERSION(hipVersion);
+  if (majorVersion >= TRITON_HIP_DRIVER_REQ_MAJOR_VERSION) {
+    TRITON_HIP_DRIVER_LOG_VERSION(hipVersion, msgBuff);
+    return hipVersion;
+  }
+
+  // Unsupported driver: report the full version that was found.
+  const int minorVersion = TRITON_HIP_DRIVER_EXTRACT_MINOR_VERSION(hipVersion);
+  const int patchVersion = TRITON_HIP_DRIVER_EXTRACT_PATCH_VERSION(hipVersion);
+  snprintf(msgBuff, sizeof(msgBuff),
+           "libamdhip64 version %d.%d.%d is not supported! Required major "
+           "version is >=%d.",
+           majorVersion, minorVersion, patchVersion,
+           TRITON_HIP_DRIVER_REQ_MAJOR_VERSION);
+  PyErr_SetString(PyExc_RuntimeError, msgBuff);
+  dlclose(lib);
+  return -1;
+}
+
+// Opens the HIP runtime library, validates its version and resolves every
+// HIP_SYMBOL_LIST entry into hipSymbolTable.
+//
+// Returns true on success. On failure sets a Python RuntimeError and returns
+// false; a library handle that was opened is closed again.
+bool initSymbolTable() {
+  // Must start as NULL: it is tested below when no search path can be opened.
+  void *lib = NULL;
+
+  // Go through the list of search paths to dlopen the first HIP driver library.
+  int n = sizeof(hipLibSearchPaths) / sizeof(hipLibSearchPaths[0]);
+  for (int i = 0; i < n; ++i) {
+    void *handle = dlopen(hipLibSearchPaths[i], RTLD_LAZY | RTLD_LOCAL);
+    if (handle) {
+      lib = handle;
+      // printf("[triton] chosen %s\n", hipLibSearchPaths[i]);
+      // Stop at the first hit so that no further handles are opened and
+      // then left unreferenced.
+      break;
+    }
+  }
+
+  if (!lib) {
+    PyErr_SetString(PyExc_RuntimeError, "cannot open libamdhip64.so");
+    return false;
+  }
+
+  // checkDriverVersion() closes lib and sets the Python error itself.
+  int hipVersion = checkDriverVersion(lib);
+  if (hipVersion == -1)
+    return false;
+
+  const char *error = NULL;
+  typedef hipError_t (*hipGetProcAddress_fn)(
+      const char *symbol, void **pfn, int hipVersion, uint64_t hipFlags,
+      hipDriverProcAddressQueryResult *symbolStatus);
+  hipGetProcAddress_fn hipGetProcAddress;
+  dlerror(); // Clear existing errors
+
+  *(void **)&hipGetProcAddress = dlsym(lib, "hipGetProcAddress");
+  error = dlerror();
+  if (error) {
+    PyErr_SetString(PyExc_RuntimeError,
+                    "cannot query 'hipGetProcAddress' from libamdhip64.so");
+    dlclose(lib);
+    return false;
+  }
+
+  // Resolve all symbols we are interested in.
+  uint64_t hipFlags = 0;
+  hipDriverProcAddressQueryResult symbolStatus;
+  hipError_t status = hipSuccess;
+#define QUERY_EACH_FN(hipSymbolName, ...)                                      \
+  status = hipGetProcAddress(#hipSymbolName,                                   \
+                             (void **)&hipSymbolTable.hipSymbolName,           \
+                             hipVersion, hipFlags, &symbolStatus);             \
+  if (status != hipSuccess) {                                                  \
+    PyErr_SetString(PyExc_RuntimeError,                                        \
+                    "cannot get address for '" #hipSymbolName                  \
+                    "' from libamdhip64.so");                                  \
+    dlclose(lib);                                                              \
+    return false;                                                              \
+  }
+
+  HIP_SYMBOL_LIST(QUERY_EACH_FN, QUERY_EACH_FN)
+
+  return true;
+}
+
+// Converts a failing HIP status into a pending Python RuntimeError whose
+// message carries the numeric code and the HIP error string. Does nothing
+// for HIP_SUCCESS. The GIL is acquired around setting the exception.
+// `file` and `line` are accepted but not used.
+static inline void gpuAssert(hipError_t code, const char *file, int line) {
+  if (code == HIP_SUCCESS)
+    return;
+
+  const char *prefix = "Triton Error [HIP]: ";
+  const char *str = hipSymbolTable.hipGetErrorString(code);
+  char err[TRITON_HIP_MSG_BUFF_SIZE] = {0};
+  snprintf(err, sizeof(err), "%s Code: %d, Messsage: %s", prefix, code, str);
+
+  PyGILState_STATE gil_state = PyGILState_Ensure();
+  PyErr_SetString(PyExc_RuntimeError, err);
+  PyGILState_Release(gil_state);
+}
+
+// Evaluates the HIP call `ans`, and makes the enclosing function return NULL
+// if a Python exception is pending afterwards. It can therefore only be used
+// in functions that return a pointer. The macro expands to a brace block, so
+// a trailing ';' at the call site is optional.
+#define HIP_CHECK(ans)                                                         \
+  {                                                                            \
+    gpuAssert((ans), __FILE__, __LINE__);                                      \
+    if (PyErr_Occurred())                                                      \
+      return NULL;                                                             \
+  }
+
+// Python entry point `get_device_properties(device_id)`.
+//
+// Returns a dict with the keys max_shared_mem, max_num_regs,
+// multiprocessor_count, sm_clock_rate, mem_clock_rate, mem_bus_width, arch,
+// warpSize and max_threads_per_sm, taken from hipGetDeviceProperties.
+// Returns NULL with a Python exception set if argument parsing or the HIP
+// call fails.
+static PyObject *getDeviceProperties(PyObject *self, PyObject *args) {
+  int device_id;
+  if (!PyArg_ParseTuple(args, "i", &device_id))
+    return NULL;
+
+  hipDeviceProp_t props;
+  HIP_CHECK(hipSymbolTable.hipGetDeviceProperties(&props, device_id));
+
+  // Each "i" unit reads a C int from the variadic arguments.
+  // sharedMemPerBlock is a size_t in hipDeviceProp_t, so it is converted
+  // explicitly instead of being passed with a mismatching width.
+  const int maxSharedMem = (int)props.sharedMemPerBlock;
+
+  // create a struct to hold device properties
+  return Py_BuildValue(
+      "{s:i, s:i, s:i, s:i, s:i, s:i, s:s, s:i, s:i}", "max_shared_mem",
+      maxSharedMem, "max_num_regs", props.regsPerBlock,
+      "multiprocessor_count", props.multiProcessorCount, "sm_clock_rate",
+      props.clockRate, "mem_clock_rate", props.memoryClockRate, "mem_bus_width",
+      props.memoryBusWidth, "arch", props.gcnArchName, "warpSize",
+      props.warpSize, "max_threads_per_sm", props.maxThreadsPerMultiProcessor);
+}
+
+// Python entry point `load_binary(name, data, shared, device)`.
+//
+// Loads the code object `data` as a HIP module, looks up the kernel `name`
+// in it and queries three function attributes. Returns the tuple
+// (module handle, function handle, n_regs, n_spills, n_max_threads), where
+// n_spills is the local size in bytes divided by 4. `shared` and `device`
+// are parsed but not used. Returns NULL with a Python exception set on
+// failure.
+static PyObject *loadBinary(PyObject *self, PyObject *args) {
+  const char *name;
+  const char *data;
+  Py_ssize_t data_size;
+  int shared;
+  int device;
+  if (!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared,
+                        &device)) {
+    return NULL;
+  }
+
+  // JIT options and their values; the two arrays are index-aligned.
+  const unsigned int errbufsize = 8192;
+  const unsigned int logbufsize = 8192;
+  char errorLog[errbufsize];
+  char infoLog[logbufsize];
+  hipJitOption jitOptions[] = {hipJitOptionErrorLogBufferSizeBytes,
+                               hipJitOptionErrorLogBuffer,
+                               hipJitOptionInfoLogBufferSizeBytes,
+                               hipJitOptionInfoLogBuffer,
+                               hipJitOptionLogVerbose};
+  void *jitOptionValues[] = {(void *)(uintptr_t)errbufsize, (void *)errorLog,
+                             (void *)(uintptr_t)logbufsize, (void *)infoLog,
+                             (void *)1};
+
+  // Load the module and fetch the kernel from it.
+  hipModule_t module;
+  hipFunction_t function;
+  HIP_CHECK(hipSymbolTable.hipModuleLoadDataEx(&module, data, 5, jitOptions,
+                                               jitOptionValues));
+  HIP_CHECK(hipSymbolTable.hipModuleGetFunction(&function, module, name));
+
+  // get allocated registers and spilled registers from the function
+  int n_regs = 0;
+  int n_spills = 0;
+  int32_t n_max_threads = 0;
+  hipSymbolTable.hipFuncGetAttribute(&n_regs, HIP_FUNC_ATTRIBUTE_NUM_REGS,
+                                     function);
+  hipSymbolTable.hipFuncGetAttribute(
+      &n_spills, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, function);
+  hipSymbolTable.hipFuncGetAttribute(
+      &n_max_threads, HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
+  n_spills = n_spills / 4;
+
+  if (PyErr_Occurred()) {
+    return NULL;
+  }
+  return Py_BuildValue("(KKiii)", (uint64_t)module, (uint64_t)function, n_regs,
+                       n_spills, n_max_threads);
+}
+
+// Copies a Python sequence of exactly `rank` ints into `out`.
+//
+//   notSeqMsg - TypeError text used when `seq` is not a sequence
+//   rankMsg   - RuntimeError text used when the length is not `rank`
+//   itemMsg   - TypeError text used when an element is not an int
+//
+// Returns true on success. Returns false with a Python exception set
+// otherwise, including when an element does not fit into a C long.
+static bool parseUint32Sequence(PyObject *seq, int rank, const char *notSeqMsg,
+                                const char *rankMsg, const char *itemMsg,
+                                uint32_t *out) {
+  PyObject *fast = PySequence_Fast(seq, notSeqMsg);
+  if (!fast)
+    return false;
+
+  bool ok = false;
+  if (PySequence_Fast_GET_SIZE(fast) != rank) {
+    PyErr_SetString(PyExc_RuntimeError, rankMsg);
+    goto done;
+  }
+
+  for (int i = 0; i < rank; ++i) {
+    // Borrowed reference; stays valid while `fast` is alive.
+    PyObject *item = PySequence_Fast_GET_ITEM(fast, i);
+    if (!PyLong_Check(item)) {
+      PyErr_SetString(PyExc_TypeError, itemMsg);
+      goto done;
+    }
+    long value = PyLong_AsLong(item);
+    // -1 is ambiguous: it signals an error only if an exception is pending.
+    if (value == -1 && PyErr_Occurred())
+      goto done;
+    out[i] = (uint32_t)value;
+  }
+  ok = true;
+
+done:
+  Py_DECREF(fast);
+  return ok;
+}
+
+// Python entry point `create_tdm_descriptor(elementBitWidth, blockSize,
+// numWarps, padInterval, padAmount, shape, strides, globalAddress)`.
+//
+// blockSize, shape and strides must be sequences of two ints each. Returns a
+// new PyTDMDescriptor holding the encoded descriptor, or NULL with a Python
+// exception set when an argument is invalid or the encoding fails.
+static PyObject *createTDMDescriptor(PyObject *self, PyObject *args) {
+  int elementBitWidth;
+  PyObject *blockSize;
+  int numWarps;
+  int padInterval;
+  int padAmount;
+  PyObject *shape;
+  PyObject *strides;
+  unsigned long long globalAddress;
+
+  if (!PyArg_ParseTuple(args, "iOiiiOOK", &elementBitWidth, &blockSize,
+                        &numWarps, &padInterval, &padAmount, &shape, &strides,
+                        &globalAddress)) {
+    return NULL;
+  }
+
+  // Only 2D descriptors are supported; blockSize fixes the rank and the
+  // other two sequences have to match it.
+  const int rank = 2;
+  uint32_t blockSizeInt[2];
+  uint32_t shapeInt[2];
+  uint32_t stridesInt[2];
+
+  if (!parseUint32Sequence(blockSize, rank, "blockSize must be a sequence",
+                           "rank must be 2", "block size must be an int",
+                           blockSizeInt))
+    return NULL;
+  if (!parseUint32Sequence(shape, rank, "shape must be a sequence",
+                           "rank mismatch", "shape must be an int", shapeInt))
+    return NULL;
+  if (!parseUint32Sequence(strides, rank, "strides must be a sequence",
+                           "rank mismatch", "strides must be an int",
+                           stridesInt))
+    return NULL;
+
+  // The result object is created only after all inputs are validated, so the
+  // error paths above have nothing to release.
+  PyTDMDescriptorObject *descObj = (PyTDMDescriptorObject *)PyObject_CallObject(
+      (PyObject *)&PyTDMDescriptorType, NULL);
+  if (!descObj)
+    return NULL;
+
+  bool success = encodeTDMDescriptor(
+      &descObj->desc, elementBitWidth, blockSizeInt, numWarps, padInterval,
+      padAmount, shapeInt, stridesInt, globalAddress, rank);
+  if (!success) {
+    PyErr_SetString(PyExc_RuntimeError, "Failed to encode TDM descriptor");
+    Py_DECREF(descObj);
+    return NULL;
+  }
+
+  return (PyObject *)descObj;
+}
+
+// Method table of the hip_utils module: Python-visible name, C
+// implementation, calling convention and docstring for each function.
+static PyMethodDef ModuleMethods[] = {
+    {"load_binary", loadBinary, METH_VARARGS,
+     "Load provided hsaco into HIP driver"},
+    {"get_device_properties", getDeviceProperties, METH_VARARGS,
+     "Get the properties for a given device"},
+    {"create_tdm_descriptor", createTDMDescriptor, METH_VARARGS,
+     "create a host-side TDM descriptor"},
+    {NULL, NULL, 0, NULL} // sentinel
+};
+
+// Module definition for "hip_utils"; it references ModuleMethods as the
+// module's method table.
+static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "hip_utils",
+                                       NULL, // documentation
+                                       -1,   // size
+                                       ModuleMethods};
+
+// Module initialization for hip_utils: resolves the HIP symbols, creates
+// the module and exposes the PyTDMDescriptor type on it.
+//
+// Returns the new module, or NULL with a Python exception set on failure.
+// Every failure path releases the references acquired up to that point.
+PyMODINIT_FUNC PyInit_hip_utils(void) {
+  if (!initSymbolTable()) {
+    return NULL;
+  }
+
+  PyObject *m = PyModule_Create(&ModuleDef);
+  if (m == NULL) {
+    return NULL;
+  }
+  if (PyModule_AddFunctions(m, ModuleMethods) < 0) {
+    Py_DECREF(m);
+    return NULL;
+  }
+
+  if (PyType_Ready(&PyTDMDescriptorType) < 0) {
+    Py_DECREF(m);
+    return NULL;
+  }
+  // PyModule_AddObject steals the reference only when it succeeds, so the
+  // extra reference has to be dropped by hand when it fails.
+  Py_INCREF(&PyTDMDescriptorType);
+  if (PyModule_AddObject(m, "PyTDMDescriptor",
+                         (PyObject *)&PyTDMDescriptorType) < 0) {
+    Py_DECREF(&PyTDMDescriptorType);
+    Py_DECREF(m);
+    return NULL;
+  }
+
+  return m;
+}
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/driver.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/driver.py
new file mode 100644
index 0000000000000000000000000000000000000000..24a0d84e8ab646c37f86fb5d3d462bd41c5a75ba
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/driver.py
@@ -0,0 +1,877 @@
+import functools
+import os
+import subprocess
+import re
+import triton
+from pathlib import Path
+from triton import knobs
+from triton.backends.compiler import GPUTarget
+from triton.backends.driver import GPUDriver
+from triton.runtime import _allocation
+from triton.runtime.build import compile_module_from_src
+
+# Directory that contains this module (symlinks resolved); driver.c is read from here.
+dirname = os.path.dirname(os.path.realpath(__file__))
+# Include directories passed to compile_module_from_src for the C sources built by this module.
+include_dirs = [os.path.join(dirname, "include")]
+# Placeholder; HIPUtils.__init__ rebinds it to the PyTDMDescriptor type of the compiled hip_utils module.
+PyTDMDescriptor = None
+
+
+def _find_already_mmapped_dylib_on_linux(lib_name):
+    """Return the path of a shared object already loaded in this process.
+
+    The loaded objects are enumerated with dl_iterate_phdr (see
+    https://www.man7.org/linux/man-pages/man3/dl_iterate_phdr.3.html); the
+    first one whose file name contains `lib_name` wins.
+
+    Returns None when not running on Linux, when dl_iterate_phdr cannot be
+    obtained from libc.so.6, or when no loaded object matches.
+    """
+    import platform
+    if platform.system() != 'Linux':
+        return None
+
+    import ctypes
+    from ctypes import c_char, c_int, c_size_t, c_void_p, c_char_p, POINTER
+
+    class _PhdrInfoHead(ctypes.Structure):
+        # Only the two leading fields of the C struct are declared; the
+        # remaining ones are never read here.
+        _fields_ = [('dlpi_addr', c_void_p), ('dlpi_name', c_char_p)]
+
+    # The callback's data parameter is POINTER(c_char) to avoid copying.
+    visitor_t = ctypes.CFUNCTYPE(c_int, POINTER(_PhdrInfoHead), POINTER(c_size_t), POINTER(c_char))
+
+    try:
+        iterate_phdr = ctypes.CDLL('libc.so.6').dl_iterate_phdr
+    except Exception:
+        return None
+    # The function's data parameter is c_char_p so it accepts create_string_buffer.
+    iterate_phdr.argtypes = [visitor_t, c_char_p]
+    iterate_phdr.restype = c_int
+
+    path_capacity = 4096
+    out_buf = ctypes.create_string_buffer(path_capacity + 1)
+
+    def visit(info, size, data):
+        raw_name = info.contents.dlpi_name
+        if lib_name not in Path(os.fsdecode(raw_name)).name:
+            return 0
+        # Match: copy the path (capped at path_capacity bytes) into the
+        # caller's buffer and return non-zero.
+        ctypes.memmove(data, raw_name, min(path_capacity, len(raw_name)))
+        return 1
+
+    found = iterate_phdr(visitor_t(visit), out_buf)
+    return os.fsdecode(ctypes.string_at(out_buf)) if found else None
+
+
+@functools.lru_cache()
+def _get_path_to_hip_runtime_dylib():
+    """Return the path of the HIP runtime library (libamdhip64.so) to use.
+
+    Candidates are tried in this order and the first existing one is returned:
+    the explicit `knobs.amd.libhip_path` override, a copy already mapped into
+    this process, the backend's own `lib` directory, `torch/lib` inside the
+    site-packages directories, LD_LIBRARY_PATH, HIP_PATH, `hipconfig --path`,
+    ROCM_PATH, the `ldconfig -p` cache, and finally /opt/rocm/lib.
+    The result is memoized by `functools.lru_cache`.
+
+    Raises:
+        RuntimeError: if the explicit override or the already-mapped path is
+            not valid, or if no candidate exists (the message lists every
+            attempted path).
+    """
+    lib_name = "libamdhip64.so"
+
+    # If we are told explicitly what HIP runtime dynamic library to use, obey that.
+    if env_libhip_path := knobs.amd.libhip_path:
+        if env_libhip_path.endswith(lib_name) and os.path.exists(env_libhip_path):
+            return env_libhip_path
+        raise RuntimeError(f"TRITON_LIBHIP_PATH '{env_libhip_path}' does not point to a valid {lib_name}")
+
+    # If the shared object is already mmapped to address space, use it.
+    mmapped_path = _find_already_mmapped_dylib_on_linux(lib_name)
+    if mmapped_path:
+        if os.path.exists(mmapped_path):
+            return mmapped_path
+        raise RuntimeError(f"memory mapped '{mmapped_path}' in process does not point to a valid {lib_name}")
+
+    # Every candidate that was probed and found missing, for the final error message.
+    paths = []
+
+    # Check backend
+    local_lib = os.path.join(os.path.dirname(__file__), "lib", lib_name)
+    if os.path.exists(local_lib):
+        return local_lib
+    paths.append(local_lib)
+
+    import site
+    # First search the HIP runtime dynamic library packaged with PyTorch. It's very likely
+    # that we run Triton together with PyTorch. This makes sure we use the same dynamic
+    # library to avoid version mismatch.
+    site_packages = site.getsitepackages()
+    user_site = site.getusersitepackages()
+    if site.ENABLE_USER_SITE:  # ENABLE_USER_SITE is initialized in getusersitepackages()
+        site_packages = [user_site] + site_packages
+    for path in site_packages:
+        path = os.path.join(path, "torch", "lib", lib_name)
+        if os.path.exists(path):
+            return path
+        paths.append(path)
+
+    # Then try to see if developer provides a HIP runtime dynamic library using LD_LIBRARY_PATH.
+    env_ld_library_path = os.getenv("LD_LIBRARY_PATH")
+    if env_ld_library_path:
+        for d in env_ld_library_path.split(":"):
+            f = os.path.join(d, lib_name)
+            if os.path.exists(f):
+                return f
+            paths.append(f)
+
+    # HIP_PATH should point to HIP SDK root if set
+    env_hip_path = os.getenv("HIP_PATH")
+    if env_hip_path:
+        hip_lib_path = os.path.join(env_hip_path, "lib", lib_name)
+        if os.path.exists(hip_lib_path):
+            return hip_lib_path
+        paths.append(hip_lib_path)
+
+    # if available, `hipconfig --path` prints the HIP SDK root
+    try:
+        hip_root = subprocess.check_output(["hipconfig", "--path"]).decode().strip()
+        if hip_root:
+            hip_lib_path = os.path.join(hip_root, "lib", lib_name)
+            if os.path.exists(hip_lib_path):
+                return hip_lib_path
+            paths.append(hip_lib_path)
+    except (subprocess.CalledProcessError, OSError):
+        # hipconfig may not be available (missing, not executable, or failing)
+        pass
+
+    # ROCm lib dir based on env var
+    env_rocm_path = os.getenv("ROCM_PATH")
+    if env_rocm_path:
+        rocm_lib_path = os.path.join(env_rocm_path, "lib", lib_name)
+        if os.path.exists(rocm_lib_path):
+            return rocm_lib_path
+        paths.append(rocm_lib_path)
+
+    # Afterwards try to search the loader dynamic library resolution paths.
+    # ldconfig is best-effort: when it is missing or fails we must still reach
+    # the remaining fallback and the descriptive error below instead of
+    # crashing with an unrelated exception.
+    try:
+        libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode(errors="ignore")
+    except (subprocess.CalledProcessError, OSError):
+        libs = ""
+    # each line looks like the following:
+    # libamdhip64.so.6 (libc6,x86-64) => /opt/rocm-6.0.2/lib/libamdhip64.so.6
+    # libamdhip64.so (libc6,x86-64) => /opt/rocm-6.0.2/lib/libamdhip64.so
+    locs = [line.split()[-1] for line in libs.splitlines() if line.strip().endswith(lib_name)]
+    for loc in locs:
+        if os.path.exists(loc):
+            return loc
+        paths.append(loc)
+
+    # As a last resort, guess if we have it in some common installation path.
+    common_install_path = os.path.join('/opt/rocm/lib/', lib_name)
+    if os.path.exists(common_install_path):
+        return common_install_path
+    paths.append(common_install_path)
+
+    raise RuntimeError(f"cannot locate {lib_name} after attempted paths {paths}")
+
+
+class HIPUtils(object):
+    """Access point for the helpers compiled from driver.c.
+
+    `__new__` caches one instance on the class and hands it out on every
+    construction; `__init__` still runs on each `HIPUtils()` call.
+    """
+
+    def __new__(cls):
+        if hasattr(cls, "instance"):
+            return cls.instance
+        cls.instance = super(HIPUtils, cls).__new__(cls)
+        return cls.instance
+
+    def __init__(self):
+        global PyTDMDescriptor
+        hip_dylib = _get_path_to_hip_runtime_dylib()
+        c_source = Path(os.path.join(dirname, "driver.c")).read_text()
+        # A plain, single search-and-replace of the placeholder is used rather
+        # than templates or format strings, so the curly brackets of the C code
+        # need no escaping and exactly one occurrence is substituted.
+        c_source = c_source.replace('/*py_libhip_search_path*/', hip_dylib, 1)
+        compiled = compile_module_from_src(src=c_source, name="hip_utils", include_dirs=include_dirs)
+        # Re-export the compiled entry points as attributes of this object.
+        self.load_binary = compiled.load_binary
+        self.get_device_properties = compiled.get_device_properties
+        self.create_tdm_descriptor = compiled.create_tdm_descriptor
+        # Publish the descriptor type at module level.
+        PyTDMDescriptor = compiled.PyTDMDescriptor
+
+
+# -------------------- Launcher ----------------------------
+def ty_to_cpp(ty):
+    """Map a Triton type string to the C type name used in generated launcher code.
+
+    Pointer types (leading '*') map to "hipDeviceptr_t", "tensordesc" maps to
+    "TDMDescriptor", integer types map to the matching fixed-width C integer
+    (1-bit values use the 8-bit type) and every floating-point type maps to
+    "double".
+
+    Raises:
+        KeyError: if `ty` is none of the known type strings.
+    """
+    if ty.startswith('*'):
+        return "hipDeviceptr_t"
+    if ty == "tensordesc":
+        return "TDMDescriptor"
+
+    cpp_names = dict.fromkeys(("fp16", "bf16", "fp32", "f32", "fp64"), "double")
+    for bits in (8, 16, 32, 64):
+        cpp_names[f"i{bits}"] = f"int{bits}_t"
+        cpp_names[f"u{bits}"] = f"uint{bits}_t"
+    # 1-bit values are carried in the 8-bit integer types.
+    cpp_names["i1"] = "int8_t"
+    cpp_names["u1"] = "uint8_t"
+    return cpp_names[ty]
+
+
+# C integer type used to hold the bits of each floating-point kernel argument.
+# make_launcher declares the `_launch` parameters and the `_arg{i}_storage`
+# locals of float arguments with these types.
+FLOAT_STORAGE_TYPE = {
+    "fp16": "uint16_t",
+    "bf16": "uint16_t",
+    "fp32": "uint32_t",
+    "f32": "uint32_t",
+    "fp64": "uint64_t",
+}
+# Name of the C helper (defined in the launcher source generated by
+# make_launcher) that converts the parsed double into the storage type above.
+FLOAT_PACK_FUNCTION = {
+    "fp16": "pack_fp16",
+    "bf16": "pack_bf16",
+    "fp32": "pack_fp32",
+    "f32": "pack_fp32",
+    "fp64": "pack_fp64",
+}
+
+# PyArg_ParseTuple format of the fixed leading launch arguments, in order:
+# launch_cooperative_grid (p), gridX/gridY/gridZ (iii), stream and function (KK),
+# then profile_scratch, kernel_metadata, launch_metadata, launch_enter_hook and
+# launch_exit_hook (OOOOO). Its length is also used by wrap_handle_tensordesc to
+# split these leading arguments from the kernel arguments.
+_BASE_ARGS_FORMAT = "piiiKKOOOOO"
+
+
+def make_launcher(constants, signature, warp_size, tensordesc_meta):
+    """Generate the C source of the extension module that launches one kernel signature.
+
+    Args:
+        constants: accepted for interface compatibility; not used by this function.
+        signature: mapping whose values are the kernel argument type strings
+            (or tuples of them), in argument order.
+        warp_size: number of threads per warp; the generated code multiplies it
+            by `num_warps` to obtain the block size.
+        tensordesc_meta: per-tensor-descriptor metadata, in the order the
+            "tensordesc<...>" arguments appear, or a falsy value. A descriptor
+            without metadata is decomposed into base pointer, shape, strides
+            and a padding flag.
+
+    Returns:
+        str: C source of a `__triton_launcher` module exposing `launch`.
+    """
+
+    def _expand_signature(signature):
+        # Replace every "tensordesc<dtype[shape]>" entry by the flat list of
+        # arguments that the launcher actually receives for it.
+        output = []
+        tensordesc_idx = 0
+        for sig in signature:
+            if isinstance(sig, str) and sig.startswith("tensordesc"):
+                meta = tensordesc_meta[tensordesc_idx] if tensordesc_meta else None
+                tensordesc_idx += 1
+
+                match = re.match("tensordesc<([^[>]*)\\[([^]]*)\\]", sig)
+                dtype = match.group(1)
+                shape = match.group(2)
+                ndim = shape.count(",") + 1
+
+                # If there is no descriptor's metadata, the descriptor has been decomposed to base pointer, shape and strides
+                if meta is None:
+                    output.append("*" + dtype)
+                    for _ in range(2 * ndim):
+                        output.append("i64")
+                    output.append("i1")
+                else:
+                    output.append("tensordesc")
+
+                # In both cases the shape (i32) and strides (i64) follow.
+                for _ in range(ndim):
+                    output.append("i32")
+                for _ in range(ndim):
+                    output.append("i64")
+            else:
+                output.append(sig)
+
+        return output
+
+    def _serialize_signature(sig):
+        # Flatten (possibly nested) tuples of types into a comma-separated string.
+        if isinstance(sig, tuple):
+            return ','.join(map(_serialize_signature, sig))
+        return sig
+
+    def _extracted_type(ty):
+        # C type of the local variable that PyArg_ParseTuple fills for `ty`.
+        if isinstance(ty, tuple):
+            val = ','.join(map(_extracted_type, ty))
+            return f"[{val}]"
+        if ty.startswith("*") or ty.startswith("tensordesc"):
+            return "PyObject*"
+        if ty == "constexpr":
+            return "PyObject*"
+        return ty_to_cpp(ty)
+
+    def format_of(ty):
+        # PyArg_ParseTuple format unit(s) for `ty`.
+        if isinstance(ty, tuple):
+            val = ''.join(map(format_of, ty))
+            return f"({val})"
+        if ty.startswith("*") or ty.startswith("tensordesc"):
+            return "O"
+        if ty == "constexpr":
+            return "O"
+        return {
+            "double": "d",
+            "long": "l",
+            "int8_t": "b",
+            "int16_t": "h",
+            "int32_t": "i",
+            "int64_t": "L",
+            "uint8_t": "B",
+            "uint16_t": "H",
+            "uint32_t": "I",
+            "uint64_t": "K",
+        }[ty_to_cpp(ty)]
+
+    signature = {idx: s for idx, s in enumerate(_expand_signature(signature.values()))}
+
+    args_format = ''.join([format_of(ty) for ty in signature.values()])
+    format = _BASE_ARGS_FORMAT + args_format
+    # Flatten tuples so that every remaining entry is a single scalar type string.
+    signature = ','.join(map(_serialize_signature, signature.values()))
+    signature = list(filter(bool, signature.split(',')))
+    signature = {i: s for i, s in enumerate(signature)}
+    args_list = ', ' + ', '.join(f"&_arg{i}" for i, ty in signature.items()) if len(signature) > 0 else ''
+    # Parameter declarations of the generated `_launch`; constexpr arguments are
+    # not forwarded to the kernel, floats are passed as their integer storage type.
+    arg_decl_list = []
+    for i, ty in signature.items():
+        if ty == "constexpr":
+            continue
+        if ty in FLOAT_STORAGE_TYPE:
+            arg_decl_list.append(f"{FLOAT_STORAGE_TYPE[ty]} arg{i}")
+        else:
+            arg_decl_list.append(f"{ty_to_cpp(ty)} arg{i}")
+    arg_decls = ', '.join(arg_decl_list)
+    # Expressions passed from the generated `launch` to `_launch`.
+    internal_args_list = []
+    for i, ty in signature.items():
+        if ty.startswith("*"):
+            internal_args_list.append(f"ptr_info{i}.dev_ptr")
+        elif ty.startswith("tensordesc"):
+            internal_args_list.append(f"*desc{i}")
+        elif ty in FLOAT_STORAGE_TYPE:
+            internal_args_list.append(f"_arg{i}_storage")
+        elif ty != "constexpr":
+            internal_args_list.append(f"_arg{i}")
+
+    newline = '\n  '
+    ptr_decls = [
+        f"DevicePtrInfo ptr_info{i} = getPointer(_arg{i}, {i}); if (!ptr_info{i}.valid) return NULL;"
+        for i, ty in signature.items()
+        if ty.startswith("*")
+    ]
+    # getTDMDescriptor returns NULL (with a TypeError set) for a wrong argument
+    # type; bail out before the descriptor is dereferenced for the launch.
+    tensor_desc_decls = [
+        f"TDMDescriptor* desc{i} = getTDMDescriptor(_arg{i}, {i}); if (!desc{i}) return NULL;"
+        for i, ty in signature.items()
+        if ty.startswith("tensordesc")
+    ]
+    float_storage_decls = [
+        f"{FLOAT_STORAGE_TYPE[ty]} _arg{i}_storage = {FLOAT_PACK_FUNCTION[ty]}(_arg{i});"
+        for i, ty in signature.items()
+        if ty in FLOAT_STORAGE_TYPE
+    ]
+
+    libhip_path = _get_path_to_hip_runtime_dylib()
+
+    # generate glue code
+    params = [f"&arg{i}" for i, ty in signature.items() if ty != "constexpr"]
+    params.append("&global_scratch")
+    params.append("&profile_scratch")
+    src = f"""
+#define __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#include <Python.h>
+#include <dlfcn.h>
+#include <stdbool.h>
+#include <dlfcn.h>
+
+typedef struct {{
+  uint32_t group0_0;
+  uint32_t group0_1;
+  uint32_t group0_2;
+  uint32_t group0_3;
+  uint32_t group1_0;
+  uint32_t group1_1;
+  uint32_t group1_2;
+  uint32_t group1_3;
+  uint32_t group1_4;
+  uint32_t group1_5;
+  uint32_t group1_6;
+  uint32_t group1_7;
+}} TDMDescriptor;
+
+typedef struct {{
+  PyObject_HEAD;
+  TDMDescriptor desc;
+}} PyTDMDescriptorObject;
+
+// The list of paths to search for the HIP runtime library. The caller Python
+// code should substitute the search path placeholder.
+static const char *hipLibSearchPaths[] = {{"{libhip_path}"}};
+
+// The list of HIP dynamic library symbols and their signature we are interested
+// in this file.
+#define HIP_SYMBOL_LIST(FOR_EACH_ERR_FN, FOR_EACH_STR_FN)                     \\
+  FOR_EACH_STR_FN(hipGetLastError, true)                                      \\
+  FOR_EACH_STR_FN(hipGetErrorString, true, hipError_t hipError)               \\
+  FOR_EACH_ERR_FN(hipDrvLaunchKernelEx, false,                                \\
+                  const HIP_LAUNCH_CONFIG *config,                            \\
+                  hipFunction_t f,                                            \\
+                  void **kernelParams,                                        \\
+                  void **extra)                                               \\
+  FOR_EACH_ERR_FN(hipModuleLaunchKernel, true, hipFunction_t f,               \\
+                  unsigned int gridDimX, unsigned int gridDimY,               \\
+                  unsigned int gridDimZ, unsigned int blockDimX,              \\
+                  unsigned int blockDimY, unsigned int blockDimZ,             \\
+                  unsigned int sharedMemBytes, hipStream_t stream,            \\
+                  void **kernelParams, void **extra)                          \\
+  FOR_EACH_ERR_FN(hipModuleLaunchCooperativeKernel, true, hipFunction_t f,    \\
+                  unsigned int gridDimX, unsigned int gridDimY,               \\
+                  unsigned int gridDimZ, unsigned int blockDimX,              \\
+                  unsigned int blockDimY, unsigned int blockDimZ,             \\
+                  unsigned int sharedMemBytes, hipStream_t stream,            \\
+                  void **kernelParams, void **extra)                          \\
+  FOR_EACH_ERR_FN(hipPointerGetAttribute, true, void *data,                   \\
+                  hipPointer_attribute attribute, hipDeviceptr_t ptr)
+
+// The HIP symbol table for holding resolved dynamic library symbols.
+struct HIPSymbolTable {{
+#define DEFINE_EACH_ERR_FIELD(hipSymbolName, required, ...)                   \\
+  hipError_t (*hipSymbolName)(__VA_ARGS__);
+#define DEFINE_EACH_STR_FIELD(hipSymbolName, required, ...)                   \\
+  const char *(*hipSymbolName)(__VA_ARGS__);
+
+  HIP_SYMBOL_LIST(DEFINE_EACH_ERR_FIELD, DEFINE_EACH_STR_FIELD)
+}};
+
+static struct HIPSymbolTable hipSymbolTable;
+
+bool initSymbolTable() {{
+  // Use the HIP runtime library loaded into the existing process if it exits.
+  void *lib = dlopen("libamdhip64.so", RTLD_NOLOAD);
+
+  // Otherwise, go through the list of search paths to dlopen the first HIP
+  // driver library.
+  if (!lib) {{
+    int n = sizeof(hipLibSearchPaths) / sizeof(hipLibSearchPaths[0]);
+    for (int i = 0; i < n; ++i) {{
+      void *handle = dlopen(hipLibSearchPaths[i], RTLD_LAZY | RTLD_LOCAL);
+      if (handle) {{
+        lib = handle;
+      }}
+    }}
+  }}
+  if (!lib) {{
+    PyErr_SetString(PyExc_RuntimeError, "cannot open libamdhip64.so");
+    return false;
+  }}
+
+  typedef hipError_t (*hipGetProcAddress_fn)(
+      const char *symbol, void **pfn, int hipVersion, uint64_t hipFlags,
+      hipDriverProcAddressQueryResult *symbolStatus);
+  hipGetProcAddress_fn hipGetProcAddress;
+  dlerror(); // Clear existing errors
+  const char *error = NULL;
+  *(void **)&hipGetProcAddress = dlsym(lib, "hipGetProcAddress");
+  error = dlerror();
+  if (error) {{
+    PyErr_SetString(PyExc_RuntimeError,
+                    "cannot query 'hipGetProcAddress' from libamdhip64.so");
+    dlclose(lib);
+    return false;
+  }}
+
+  // Resolve all symbols we are interested in.
+  int hipVersion = HIP_VERSION;
+  uint64_t hipFlags = 0;
+  hipDriverProcAddressQueryResult symbolStatus;
+  hipError_t status = hipSuccess;
+#define QUERY_EACH_FN(hipSymbolName, required, ...)                            \
+  status = hipGetProcAddress(#hipSymbolName,                                   \
+                             (void **)&hipSymbolTable.hipSymbolName,           \
+                             hipVersion, hipFlags, &symbolStatus);             \
+  if (required && status != hipSuccess) {{                                     \
+    PyErr_SetString(PyExc_RuntimeError,                                        \
+                    "cannot get address for '" #hipSymbolName                  \
+                    "' from libamdhip64.so");                                  \
+    dlclose(lib);                                                              \
+    return false;                                                              \
+  }}
+
+  HIP_SYMBOL_LIST(QUERY_EACH_FN, QUERY_EACH_FN)
+
+  return true;
+}}
+
+static inline void gpuAssert(hipError_t code, const char *file, int line)
+{{
+   if (code != HIP_SUCCESS)
+   {{
+      const char* prefix = "Triton Error [HIP]: ";
+      const char* str = hipSymbolTable.hipGetErrorString(code);
+      char err[1024] = {{0}};
+      snprintf(err, 1024, "%s Code: %d, Messsage: %s", prefix, code, str );
+      PyErr_SetString(PyExc_RuntimeError, err);
+   }}
+}}
+
+#define HIP_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }}
+
+static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int launch_cooperative_grid, int shared_memory, hipStream_t stream, hipFunction_t function, hipDeviceptr_t profile_scratch{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{
+  if (gridX * gridY * gridZ == 0)
+    return;
+  hipDeviceptr_t global_scratch = 0;
+  void *params[] = {{ {', '.join(params)} }};
+  if(num_ctas > 1) {{
+    if (!hipSymbolTable.hipDrvLaunchKernelEx) {{
+        PyErr_SetString(PyExc_RuntimeError, "missing hipDrvLaunchKernelEx symbol; please update HIP runtime");
+        return;
+    }}
+
+    hipLaunchAttribute attributes[2];
+    // Attribute0: Cluster dimensions
+    attributes[0].id = 4;
+    int *cluster_dims = (int*)attributes[0].val.pad;
+    cluster_dims[0] = num_ctas;
+    cluster_dims[1] = 1;
+    cluster_dims[2] = 1;
+    // Attribute1: Cooperative launch
+    attributes[1].id = hipLaunchAttributeCooperative;
+    attributes[1].val.cooperative = launch_cooperative_grid;
+
+    HIP_LAUNCH_CONFIG config = {{
+        gridX * num_ctas, gridY, gridZ, // Grid size
+        {warp_size} * num_warps, 1, 1, // Block size
+        shared_memory, stream,
+        attributes, 2 // Number of attributes
+    }};
+    HIP_CHECK(hipSymbolTable.hipDrvLaunchKernelEx(&config, function, params, 0));
+    return;
+  }}
+  else if (launch_cooperative_grid) {{
+    HIP_CHECK(hipSymbolTable.hipModuleLaunchCooperativeKernel(function, gridX, gridY, gridZ, {warp_size}*num_warps, 1, 1, shared_memory, stream, params, 0));
+    return;
+  }}
+  else {{
+    HIP_CHECK(hipSymbolTable.hipModuleLaunchKernel(function, gridX, gridY, gridZ, {warp_size}*num_warps, 1, 1, shared_memory, stream, params, 0));
+  }}
+}}
+
+typedef struct _DevicePtrInfo {{
+    hipDeviceptr_t dev_ptr;
+    bool valid;
+}} DevicePtrInfo;
+
+static PyObject* data_ptr_str = NULL;
+static PyObject* py_tdm_descriptor_type = NULL;
+
+static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {{
+  DevicePtrInfo ptr_info;
+  hipError_t status = hipSuccess;
+  ptr_info.dev_ptr = 0;
+  ptr_info.valid = true;
+  if (PyLong_Check(obj)) {{
+    ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(obj);
+    return ptr_info;
+  }}
+  if (obj == Py_None) {{
+    // valid nullptr
+    return ptr_info;
+  }}
+  PyObject *ret = PyObject_CallMethodNoArgs(obj, data_ptr_str);
+  if (!ret) {{
+    PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
+    ptr_info.valid = false;
+    goto cleanup;
+  }}
+  if (!PyLong_Check(ret)) {{
+    PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
+    ptr_info.valid = false;
+    goto cleanup;
+  }}
+  ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(ret);
+  if (!ptr_info.dev_ptr)
+    goto cleanup;
+  uint64_t dev_ptr;
+  status = hipSymbolTable.hipPointerGetAttribute(&dev_ptr, HIP_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
+  if (status == hipErrorInvalidValue) {{
+      PyErr_Format(PyExc_ValueError,
+                   "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
+      ptr_info.valid = false;
+      // Clear and ignore HIP error
+      (void)hipSymbolTable.hipGetLastError();
+  }}
+  ptr_info.dev_ptr = (hipDeviceptr_t)dev_ptr;
+cleanup:
+  Py_XDECREF(ret);
+  return ptr_info;
+}}
+
+static inline TDMDescriptor* getTDMDescriptor(PyObject* obj, int idx) {{
+  if (Py_TYPE(obj) != (PyTypeObject*)py_tdm_descriptor_type) {{
+    PyErr_Format(PyExc_TypeError, "object must be of type PyTDMDescriptor, got %s", Py_TYPE(obj)->tp_name);
+    return NULL;
+  }}
+
+  TDMDescriptor* desc = &((PyTDMDescriptorObject*)obj)->desc;
+  return desc;
+}}
+
+static uint16_t pack_fp16(double f) {{
+    uint16_t result;
+    // from https://github.com/python/pythoncapi-compat/blob/5e317108f872c904eb726cb8d560dcadbdf88a72/pythoncapi_compat.h#L482-L492
+#if 0x030600B1 <= PY_VERSION_HEX && PY_VERSION_HEX <= 0x030B00A1 && !defined(PYPY_VERSION)
+    _PyFloat_Pack2(f, (unsigned char*)&result, 1);
+#else
+    PyFloat_Pack2(f, (char*)&result, 1);
+#endif
+    return result;
+}}
+
+static uint16_t pack_bf16(double f) {{
+    float f32 = (float)f;
+    uint32_t u32 = *(uint32_t*)&f32;
+    return (uint16_t)(u32 >> 16);
+}}
+
+static uint32_t pack_fp32(double f) {{
+    float f32 = (float)f;
+    return *(uint32_t*)&f32;
+}}
+
+static uint64_t pack_fp64(double f) {{
+    return *(uint64_t*)&f;
+}}
+
+static PyObject* launch(PyObject* self, PyObject* args) {{
+  int gridX, gridY, gridZ;
+  uint64_t _stream;
+  uint64_t _function;
+  int launch_cooperative_grid;
+  PyObject *profile_scratch_obj = NULL;
+  PyObject *launch_enter_hook = NULL;
+  PyObject *launch_exit_hook = NULL;
+  PyObject *kernel_metadata = NULL;
+  PyObject *launch_metadata = NULL;
+  {' '.join([f"{_extracted_type(ty)} _arg{i}; " for i, ty in signature.items()])}
+  if(!PyArg_ParseTuple(args, \"{format}\", &launch_cooperative_grid,
+                                           &gridX, &gridY, &gridZ, &_stream, &_function, &profile_scratch_obj,
+                                           &kernel_metadata, &launch_metadata,
+                                           &launch_enter_hook, &launch_exit_hook {args_list})) {{
+    return NULL;
+  }}
+
+  // extract kernel metadata
+  int num_warps, num_ctas, shared_memory;
+  if (!PyArg_ParseTuple(kernel_metadata, \"iii\", &num_warps, &num_ctas, &shared_memory)) {{
+    return NULL;
+  }}
+  // extract launch metadata
+  if (launch_enter_hook != Py_None){{
+    PyObject* ret = PyObject_CallOneArg(launch_enter_hook, launch_metadata);
+    if (!ret)
+      return NULL;
+    Py_DECREF(ret);
+  }}
+
+  hipDeviceptr_t profile_scratch = 0;
+  if (profile_scratch_obj != Py_None) {{
+    DevicePtrInfo profile_scratch_info = getPointer(profile_scratch_obj, -1);
+    if (!profile_scratch_info.valid) {{
+      return NULL;
+    }}
+    profile_scratch = profile_scratch_info.dev_ptr;
+  }}
+
+  // raise exception asap
+  {newline.join(tensor_desc_decls)}
+  {newline.join(ptr_decls)}
+  {newline.join(float_storage_decls)}
+  _launch(gridX, gridY, gridZ, num_warps, num_ctas, launch_cooperative_grid, shared_memory, (hipStream_t)_stream, (hipFunction_t)_function, (hipDeviceptr_t)profile_scratch{', ' + ', '.join(internal_args_list) if len(internal_args_list) > 0 else ''});
+
+  if(launch_exit_hook != Py_None){{
+    PyObject* ret = PyObject_CallOneArg(launch_exit_hook, launch_metadata);
+    if (!ret)
+      return NULL;
+    Py_DECREF(ret);
+  }}
+
+  if(PyErr_Occurred()) {{
+    return NULL;
+  }}
+  Py_RETURN_NONE;
+}}
+
+static PyMethodDef ModuleMethods[] = {{
+  {{"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}},
+  {{NULL, NULL, 0, NULL}} // sentinel
+}};
+
+static struct PyModuleDef ModuleDef = {{
+  PyModuleDef_HEAD_INIT,
+  \"__triton_launcher\",
+  NULL, //documentation
+  -1, //size
+  ModuleMethods
+}};
+
+PyMODINIT_FUNC PyInit___triton_launcher(void) {{
+  if (!initSymbolTable()) {{
+    return NULL;
+  }}
+  PyObject *m = PyModule_Create(&ModuleDef);
+  if(m == NULL) {{
+    return NULL;
+  }}
+  data_ptr_str = PyUnicode_InternFromString("data_ptr");
+  if(data_ptr_str == NULL) {{
+    return NULL;
+  }}
+  PyObject* driver_mod = PyImport_ImportModule("triton.backends.amd.driver");
+  if (driver_mod == NULL) {{
+    return NULL;
+  }}
+  py_tdm_descriptor_type = PyObject_GetAttrString(driver_mod, "PyTDMDescriptor");
+  if (py_tdm_descriptor_type == NULL) {{
+    return NULL;
+  }}
+
+  PyModule_AddFunctions(m, ModuleMethods);
+  return m;
+}}
+"""
+    return src
+
+
+def make_tensordesc_arg(arg, kernel_metadata, tensordesc_metadata):
+    """
+    Expand one tensor descriptor argument into the kernel arguments it stands for.
+
+    With `tensordesc_metadata` set to None the descriptor is decomposed into
+    base pointer, shape, strides and a padding flag (`arg.padding == "nan"`).
+    Otherwise a TDMDescriptor object is created through the active HIP driver.
+    In both cases the shape and strides are appended once more at the end to
+    match the expected kernel signature.
+    """
+
+    if tensordesc_metadata is None:
+        # The frontend decomposes host-side tensor descriptors into tensor
+        # desc, shape and strides, but those shape and strides cannot be used
+        # when processing the descriptor itself, hence this separate
+        # decomposition. The price is passing shape and strides twice.
+        dims_then_strides = [*arg.shape, *arg.strides]
+        return [arg.base, *dims_then_strides, arg.padding == "nan", *dims_then_strides]
+
+    shape, strides = arg.shape, arg.strides
+    base = arg.base.data_ptr()
+
+    assert "elem_bits" in tensordesc_metadata and "block_size" in tensordesc_metadata
+    elem_bits = tensordesc_metadata["elem_bits"]
+    block_size = tensordesc_metadata["block_size"]
+
+    # At most one (interval, amount) padding pair is accepted; none means no padding.
+    padding_pairs = tensordesc_metadata.get("interval_padding_pairs", [])
+    if padding_pairs:
+        assert len(padding_pairs) == 1 and len(padding_pairs[0]) == 2
+        pad_interval, pad_amount = padding_pairs[0]
+    else:
+        pad_interval, pad_amount = 0, 0
+    num_warps = kernel_metadata[0]
+
+    driver = triton.runtime.driver.active
+    assert isinstance(driver, HIPDriver)
+
+    desc = driver.utils.create_tdm_descriptor(elem_bits, block_size, num_warps, pad_interval, pad_amount, shape,
+                                              strides, base)
+
+    return [desc, *shape, *strides]
+
+
+def wrap_handle_tensordesc(launcher, signature, tensordesc_metadata):
+    """
+    Return a launcher that expands tensor descriptor arguments before launching.
+
+    If the signature has no tensor descriptor argument, `launcher` itself is
+    returned. Otherwise the returned wrapper replaces every tensor descriptor
+    argument by the list produced by `make_tensordesc_arg` and forwards
+    everything else unchanged.
+
+    Args:
+        launcher (callable): The original kernel launcher function.
+        signature (Dict[int, str]): The kernel signature mapping argument indices to types.
+        tensordesc_metadata (List[Dict] or None): Metadata of the tensor descriptor arguments, in the order
+                                                  they appear. If falsy, every descriptor is decomposed.
+    Returns:
+        launcher (callable): The wrapped kernel launcher function.
+    """
+
+    # Positions (within the kernel arguments) that hold a tensor descriptor.
+    tensordesc_indices = {
+        pos
+        for pos, sig in enumerate(signature.values())
+        if isinstance(sig, str) and sig.startswith("tensordesc")
+    }
+    if not tensordesc_indices:
+        return launcher
+
+    assert not tensordesc_metadata or len(tensordesc_metadata) == len(tensordesc_indices)
+    if not tensordesc_metadata:
+        tensordesc_metadata = [None] * len(tensordesc_indices)
+
+    def inner(*args):
+        # The leading arguments are the fixed launch arguments, one per format character.
+        split_at = len(_BASE_ARGS_FORMAT)
+        meta_args, raw_kernel_args = args[:split_at], args[split_at:]
+        remaining_meta = iter(tensordesc_metadata)
+        final_args = []
+        for pos, arg in enumerate(raw_kernel_args):
+            if pos not in tensordesc_indices:
+                final_args.append(arg)
+                continue
+            # meta_args[7] is kernel_metadata
+            final_args.extend(make_tensordesc_arg(arg, meta_args[7], next(remaining_meta)))
+        return launcher(*meta_args, *final_args)
+
+    return inner
+
+
+class HIPLauncher(object):
+    """Callable that launches one compiled kernel through a generated C launcher module."""
+
+    def __init__(self, src, metadata):
+        """Generate and compile the launcher for `src` and keep the launch settings from `metadata`."""
+
+        def arg_idx(key):
+            # Constants keyed by argument name are re-keyed by a 1-tuple holding the argument's position.
+            return (src.fn.arg_names.index(key), ) if isinstance(key, str) else key
+
+        raw_constants = src.constants if hasattr(src, "constants") else dict()
+        constants = {arg_idx(key): value for key, value in raw_constants.items()}
+        signature = dict(src.signature.items())
+        tensordesc_meta = getattr(metadata, "tensordesc_meta", None)
+        launcher_src = make_launcher(constants, signature, metadata.warp_size, tensordesc_meta)
+        mod = compile_module_from_src(src=launcher_src, name="__triton_launcher", include_dirs=include_dirs)
+        self.launch = wrap_handle_tensordesc(mod.launch, signature, tensordesc_meta)
+        self.launch_cooperative_grid = metadata.launch_cooperative_grid
+        self.profile_scratch_size = metadata.profile_scratch_size
+        self.profile_scratch_align = metadata.profile_scratch_align
+
+    def __call__(self, gridX, gridY, gridZ, stream, function, *args):
+        """Allocate the profile scratch buffer if one is required, then launch the kernel."""
+        size = self.profile_scratch_size
+        align = self.profile_scratch_align
+        allocator = _allocation._profile_allocator
+
+        profile_scratch = None
+        if size > 0:
+            # `size` is scaled by the total number of grid entries.
+            grid_size = gridX * gridY * gridZ
+            alloc_size = grid_size * size
+            alloc_fn = allocator.get()
+            profile_scratch = alloc_fn(alloc_size, align, stream)
+
+        self.launch(self.launch_cooperative_grid, gridX, gridY, gridZ, stream, function, profile_scratch, *args)
+
+
+class HIPDriver(GPUDriver):
+    """GPU driver for the HIP backend."""
+
+    def __init__(self):
+        super().__init__()
+        self.utils = HIPUtils()
+        self.launcher_cls = HIPLauncher
+
+    def get_device_interface(self):
+        """Return the torch module used to talk to the device (`torch.cuda`)."""
+        import torch
+        return torch.cuda
+
+    @staticmethod
+    def is_active():
+        """Return whether torch is importable, reports an available device and is a HIP build."""
+        try:
+            import torch
+            return torch.cuda.is_available() and (torch.version.hip is not None)
+        except ImportError:
+            return False
+
+    def map_python_to_cpp_type(self, ty: str) -> str:
+        """Return the C type name for the Triton type string `ty` (see `ty_to_cpp`)."""
+        return ty_to_cpp(ty)
+
+    def get_current_target(self):
+        """Build the GPUTarget of the current device, honoring `knobs.runtime.override_arch`."""
+        props = self.utils.get_device_properties(self.get_current_device())
+        arch = knobs.runtime.override_arch or props['arch']
+        warp_size = props['warpSize']
+        # Only the part of the arch string before the first ':' is kept.
+        arch_name = arch.split(':')[0]
+        return GPUTarget("hip", arch_name, warp_size)
+
+    def get_active_torch_device(self):
+        """Return the torch.device of the current device."""
+        import torch
+        # when using hip devices, the device string in pytorch is "cuda"
+        return torch.device("cuda", self.get_current_device())
+
+    def get_benchmarker(self):
+        """Return the benchmarking function (`triton.testing.do_bench`)."""
+        from triton.testing import do_bench
+        return do_bench
+
+    def get_empty_cache_for_benchmark(self):
+        """Return an uninitialized 256 MiB int tensor on the device."""
+        import torch
+
+        # It's the same as the Nvidia backend.
+        cache_size = 256 * 1024 * 1024
+        num_elements = int(cache_size // 4)
+        return torch.empty(num_elements, dtype=torch.int, device='cuda')
+
+    def clear_cache(self, cache):
+        """Zero `cache` in place."""
+        cache.zero_()
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h
new file mode 100644
index 0000000000000000000000000000000000000000..26d03abfb44ee564d22da3df1d94f4a4df57782d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h
@@ -0,0 +1,311 @@
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_CHANNEL_DESCRIPTOR_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_CHANNEL_DESCRIPTOR_H
+
+#if !defined(__HIPCC_RTC__)
+#include <hip/hip_common.h>
+#include <hip/driver_types.h>
+#include <hip/amd_detail/amd_hip_vector_types.h>
+#endif
+
+#ifdef __cplusplus
+
+extern "C" HIP_PUBLIC_API hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w,
+                                                                    hipChannelFormatKind f);
+
+static inline hipChannelFormatDesc hipCreateChannelDescHalf() {
+  const int half_bits = static_cast<int>(sizeof(unsigned short)) * 8;  // bit width of one channel
+  return hipCreateChannelDesc(half_bits, 0, 0, 0, hipChannelFormatKindFloat);  // x channel only
+}
+
+static inline hipChannelFormatDesc hipCreateChannelDescHalf1() {
+  const int half_bits = static_cast<int>(sizeof(unsigned short)) * 8;  // bit width of one channel
+  return hipCreateChannelDesc(half_bits, 0, 0, 0, hipChannelFormatKindFloat);  // x channel only
+}
+
+static inline hipChannelFormatDesc hipCreateChannelDescHalf2() {
+  const int half_bits = static_cast<int>(sizeof(unsigned short)) * 8;  // bit width of one channel
+  return hipCreateChannelDesc(half_bits, half_bits, 0, 0, hipChannelFormatKindFloat);  // x and y
+}
+
+static inline hipChannelFormatDesc hipCreateChannelDescHalf4() {
+  const int b = static_cast<int>(sizeof(unsigned short)) * 8;  // bit width of each of the 4 channels
+  return hipCreateChannelDesc(b, b, b, b, hipChannelFormatKindFloat);
+}
+
+template <typename T> static inline hipChannelFormatDesc hipCreateChannelDesc() {
+  return hipCreateChannelDesc(0, 0, 0, 0, hipChannelFormatKindNone);  // primary template: zero widths, kind None
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<char>() {
+  int e = (int)sizeof(char) * 8;
+  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<signed char>() {
+  int e = (int)sizeof(signed char) * 8;
+  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<unsigned char>() {
+  int e = (int)sizeof(unsigned char) * 8;
+  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<uchar1>() {
+  int e = (int)sizeof(unsigned char) * 8;
+  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<char1>() {
+  int e = (int)sizeof(signed char) * 8;
+  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<uchar2>() {
+  int e = (int)sizeof(unsigned char) * 8;
+  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<char2>() {
+  int e = (int)sizeof(signed char) * 8;
+  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
+}
+
+#ifndef __GNUC__  // vector3 is the same as vector4
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<uchar3>() {
+  int e = (int)sizeof(unsigned char) * 8;
+  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<char3>() {
+  int e = (int)sizeof(signed char) * 8;
+  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
+}
+#endif
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<uchar4>() {
+  int e = (int)sizeof(unsigned char) * 8;
+  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<char4>() {
+  int e = (int)sizeof(signed char) * 8;
+  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<unsigned short>() {
+  int e = (int)sizeof(unsigned short) * 8;
+  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<signed short>() {
+  int e = (int)sizeof(signed short) * 8;
+  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<ushort1>() {
+  int e = (int)sizeof(unsigned short) * 8;
+  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<short1>() {
+  int e = (int)sizeof(signed short) * 8;
+  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<ushort2>() {
+  int e = (int)sizeof(unsigned short) * 8;
+  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<short2>() {
+  int e = (int)sizeof(signed short) * 8;
+  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
+}
+
+#ifndef __GNUC__
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<ushort3>() {
+  int e = (int)sizeof(unsigned short) * 8;
+  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<short3>() {
+  int e = (int)sizeof(signed short) * 8;
+  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
+}
+#endif
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<ushort4>() {
+  int e = (int)sizeof(unsigned short) * 8;
+  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<short4>() {
+  int e = (int)sizeof(signed short) * 8;
+  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<unsigned int>() {
+  int e = (int)sizeof(unsigned int) * 8;
+  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<signed int>() {
+  int e = (int)sizeof(signed int) * 8;
+  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<uint1>() {
+  int e = (int)sizeof(unsigned int) * 8;
+  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<int1>() {
+  int e = (int)sizeof(signed int) * 8;
+  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<uint2>() {
+  int e = (int)sizeof(unsigned int) * 8;
+  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<int2>() {
+  int e = (int)sizeof(signed int) * 8;
+  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
+}
+
+#ifndef __GNUC__
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<uint3>() {
+  int e = (int)sizeof(unsigned int) * 8;
+  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<int3>() {
+  int e = (int)sizeof(signed int) * 8;
+  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
+}
+#endif
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<uint4>() {
+  int e = (int)sizeof(unsigned int) * 8;
+  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<int4>() {
+  int e = (int)sizeof(signed int) * 8;
+  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<float>() {
+  int e = (int)sizeof(float) * 8;
+  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<float1>() {
+  int e = (int)sizeof(float) * 8;
+  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<float2>() {
+  int e = (int)sizeof(float) * 8;
+  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindFloat);
+}
+
+#ifndef __GNUC__
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<float3>() {
+  int e = (int)sizeof(float) * 8;
+  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindFloat);
+}
+#endif
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<float4>() {
+  int e = (int)sizeof(float) * 8;
+  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindFloat);
+}
+
+#if !defined(__LP64__)
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<unsigned long>() {
+  int e = (int)sizeof(unsigned long) * 8;
+  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<signed long>() {
+  int e = (int)sizeof(signed long) * 8;
+  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<ulong1>() {
+  int e = (int)sizeof(unsigned long) * 8;
+  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<long1>() {
+  int e = (int)sizeof(signed long) * 8;
+  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<ulong2>() {
+  int e = (int)sizeof(unsigned long) * 8;
+  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<long2>() {
+  int e = (int)sizeof(signed long) * 8;
+  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
+}
+
+#ifndef __GNUC__
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<ulong3>() {
+  int e = (int)sizeof(unsigned long) * 8;
+  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<long3>() {
+  int e = (int)sizeof(signed long) * 8;
+  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
+}
+#endif
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<ulong4>() {
+  int e = (int)sizeof(unsigned long) * 8;
+  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
+}
+
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<long4>() {
+  int e = (int)sizeof(signed long) * 8;
+  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
+}
+#endif /* !__LP64__ */
+
+#else
+
+struct hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w,
+                                                 enum hipChannelFormatKind f);
+
+#endif /* __cplusplus */
+
+#endif /* !HIP_INCLUDE_HIP_AMD_DETAIL_CHANNEL_DESCRIPTOR_H */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_device_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_device_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..cf331225707b77a9dabf4331b4b41693478ca8b9
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_device_functions.h
@@ -0,0 +1,925 @@
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
+
+#if !defined(__HIPCC_RTC__)
+#include <hip/amd_detail/amd_hip_common.h>
+#include <hip/amd_detail/device_library_decls.h>
+#include <hip/amd_detail/hip_assert.h>
+#include "host_defines.h"
+#include "math_fwd.h"
+#include <hip/hip_runtime_api.h>
+#include <stddef.h>
+#include <hip/hip_vector_types.h>
+#endif  // !defined(__HIPCC_RTC__)
+
+#if defined(__clang__) && defined(__HIP__)
+extern "C" __device__ int printf(const char* fmt, ...);
+#else
+template <typename... All> static inline __device__ void printf(const char* format, All... all) {}
+#endif
+
+extern "C" __device__ unsigned long long __ockl_steadyctr_u64();
+
+/*
+Integer Intrinsics
+*/
+
+// Integer intrinsic functions: __popc, __clz, __ffs, __brev
+__device__ static inline unsigned int __popc(unsigned int input) {
+  return __builtin_popcount(input);
+}
+__device__ static inline unsigned int __popcll(unsigned long long int input) {
+  return __builtin_popcountll(input);
+}
+
+__device__ static inline int __clz(int input) { return __ockl_clz_u32((uint)input); }
+
+__device__ static inline int __clzll(long long int input) {
+  return __ockl_clz_u64((__hip_uint64_t)input);
+}
+
+__device__ static inline int __ffs(unsigned int input) {
+  return input != 0 ? __builtin_ctz(input) + 1 : 0;  // 1-based index of lowest set bit, 0 if none
+}
+
+__device__ static inline int __ffsll(unsigned long long int input) {
+  return input != 0 ? __builtin_ctzll(input) + 1 : 0;  // 1-based index of lowest set bit, 0 if none
+}
+
+__device__ static inline int __ffs(int input) {
+  return input != 0 ? __builtin_ctz(input) + 1 : 0;  // 1-based index of lowest set bit, 0 if none
+}
+
+__device__ static inline int __ffsll(long long int input) {
+  return input != 0 ? __builtin_ctzll(input) + 1 : 0;  // 1-based index of lowest set bit, 0 if none
+}
+
+// Given a 32/64-bit value exec mask and an integer value base (between 0 and WAVEFRONT_SIZE),
+// find the n-th (given by offset) set bit in the exec mask from the base bit, and return the bit
+// position. If not found, return -1.
+__device__ static __hip_int32_t __fns64(__hip_uint64_t mask, __hip_uint32_t base,
+                                        __hip_int32_t offset) {
+  __hip_uint64_t temp_mask = mask;
+  __hip_int32_t temp_offset = offset;
+  // Returns the position of the |offset|-th set bit of mask counted from bit `base`, or -1.
+  if (offset == 0) {
+    temp_mask &= (1ULL << base);  // 64-bit literal: an int `1 << base` is wrong for base >= 31
+    temp_offset = 1;
+  } else if (offset < 0) {
+    temp_mask = __builtin_bitreverse64(mask);  // search downward by mirroring mask and base
+    base = 63 - base;
+    temp_offset = -offset;
+  }
+  // From here on the search always runs upward from `base` for the temp_offset-th set bit.
+  temp_mask = temp_mask & ((~0ULL) << base);  // drop every bit below the search start
+  if (__builtin_popcountll(temp_mask) < temp_offset) return -1;  // fewer set bits than requested
+  __hip_int32_t total = 0;
+  for (int i = 0x20; i > 0; i >>= 1) {  // window sizes 32, 16, ..., 1
+    __hip_uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
+    __hip_int32_t pcnt = __builtin_popcountll(temp_mask_lo);
+    if (pcnt < temp_offset) {
+      temp_mask = temp_mask >> i;  // target lies above the low window: skip it
+      temp_offset -= pcnt;
+      total += i;
+    } else {
+      temp_mask = temp_mask_lo;  // target lies inside the low window
+    }
+  }
+  if (offset < 0)
+    return 63 - total;  // undo the mirroring applied for negative offsets
+  else
+    return total;
+}
+
+__device__ static __hip_int32_t __fns32(__hip_uint64_t mask, __hip_uint32_t base,
+                                        __hip_int32_t offset) {
+  __hip_uint32_t temp_mask = mask;  // only the low 32 bits of mask take part in the search
+  __hip_int32_t temp_offset = offset;
+  if (offset == 0) {
+    temp_mask &= (1U << base);  // unsigned literal: base == 31 must not shift into int's sign bit
+    temp_offset = 1;
+  } else if (offset < 0) {
+    temp_mask = __builtin_bitreverse32(mask);  // search downward by mirroring mask and base
+    base = 31 - base;
+    temp_offset = -offset;
+  }
+  temp_mask = temp_mask & ((~0U) << base);  // drop every bit below the search start
+  if (__builtin_popcount(temp_mask) < temp_offset) return -1;  // fewer set bits than requested
+  __hip_int32_t total = 0;
+  for (int i = 0x10; i > 0; i >>= 1) {  // window sizes 16, 8, ..., 1
+    __hip_uint32_t temp_mask_lo = temp_mask & ((1U << i) - 1);
+    __hip_int32_t pcnt = __builtin_popcount(temp_mask_lo);
+    if (pcnt < temp_offset) {
+      temp_mask = temp_mask >> i;  // target lies above the low window: skip it
+      temp_offset -= pcnt;
+      total += i;
+    } else {
+      temp_mask = temp_mask_lo;  // target lies inside the low window
+    }
+  }
+  if (offset < 0)
+    return 31 - total;  // undo the mirroring applied for negative offsets
+  else
+    return total;
+}
+
+// Wrapper around __fns32() to make porting from CUDA easier
+__device__ static __hip_int32_t __fns(unsigned int mask, unsigned int base, int offset) {
+  return __fns32(mask, base, offset);
+}
+
+__device__ static inline unsigned int __brev(unsigned int input) {
+  return __builtin_bitreverse32(input);
+}
+
+__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
+  return __builtin_bitreverse64(input);
+}
+
+__device__ static inline unsigned int __lastbit_u32_u64(__hip_uint64_t input) {
+  return input == 0 ? -1 : __builtin_ctzll(input);  // ctzll: `long` may be 32-bit; -1 becomes UINT_MAX
+}
+
+__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1,
+                                                       unsigned int src2) {
+  __hip_uint32_t offset = src1 & 31;  // start bit, taken modulo 32
+  __hip_uint32_t width = src2 & 31;   // field width, taken modulo 32; a width of 0 yields 0
+  return width == 0 ? 0 : (src0 >> offset) & ((1U << width) - 1);  // defined even if offset + width > 32
+}
+
+__device__ static inline __hip_uint64_t __bitextract_u64(__hip_uint64_t src0, unsigned int src1,
+                                                         unsigned int src2) {
+  __hip_uint64_t offset = src1 & 63;  // start bit, taken modulo 64
+  __hip_uint64_t width = src2 & 63;   // field width, taken modulo 64; a width of 0 yields 0
+  return width == 0 ? 0 : (src0 >> offset) & ((1ULL << width) - 1);  // defined even if offset + width > 64
+}
+
+__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1,
+                                                      unsigned int src2, unsigned int src3) {
+  __hip_uint32_t offset = src2 & 31;  // destination start bit, taken modulo 32
+  __hip_uint32_t width = src3 & 31;   // field width, taken modulo 32
+  __hip_uint32_t mask = (1U << width) - 1;  // unsigned literal: `(1 << 31) - 1` overflows int
+  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));  // clear the field, then insert src1's low bits
+}
+
+__device__ static inline __hip_uint64_t __bitinsert_u64(__hip_uint64_t src0, __hip_uint64_t src1,
+                                                        unsigned int src2, unsigned int src3) {
+  __hip_uint64_t offset = src2 & 63;
+  __hip_uint64_t width = src3 & 63;
+  __hip_uint64_t mask = (1ULL << width) - 1;
+  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
+}
+
+__device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi,
+                                               unsigned int shift) {
+  __hip_uint32_t mask_shift = shift & 31;
+  return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
+}
+
+__device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi,
+                                                unsigned int shift) {
+  __hip_uint32_t min_shift = shift >= 32 ? 32 : shift;
+  return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
+}
+
+__device__ inline unsigned int __funnelshift_r(unsigned int lo, unsigned int hi,
+                                               unsigned int shift) {
+  return __builtin_amdgcn_alignbit(hi, lo, shift);
+}
+
+__device__ inline unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi,
+                                                unsigned int shift) {
+  return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift);
+}
+
+__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
+__device__ static int __hadd(int x, int y);
+__device__ static int __mul24(int x, int y);
+__device__ static long long int __mul64hi(long long int x, long long int y);
+__device__ static int __mulhi(int x, int y);
+__device__ static int __rhadd(int x, int y);
+__device__ static unsigned int __sad(int x, int y, unsigned int z);
+__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
+__device__ static int __umul24(unsigned int x, unsigned int y);
+__device__ static unsigned long long int __umul64hi(unsigned long long int x,
+                                                    unsigned long long int y);
+__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
+__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
+__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
+
+struct ucharHolder {
+  union {
+    unsigned char c[4];
+    unsigned int ui;
+  };
+} __attribute__((aligned(4)));
+
+struct uchar2Holder {
+  union {
+    unsigned int ui[2];
+    unsigned char c[8];
+  };
+} __attribute__((aligned(8)));
+
+__device__ static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
+  struct uchar2Holder cHoldVal;
+  struct ucharHolder cHoldKey;
+  cHoldKey.ui = s;
+  cHoldVal.ui[0] = x;
+  cHoldVal.ui[1] = y;
+  unsigned int result;
+  result = cHoldVal.c[cHoldKey.c[0] & 0x07];
+  result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);
+  result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);
+  result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);
+  return result;
+}
+
+__device__ static inline int __hadd(int x, int y) { return ((long long)x + (long long)y) >> 1; }
+
+__device__ static inline int __mul24(int x, int y) { return __ockl_mul24_i32(x, y); }
+
+__device__ static inline long long __mul64hi(long long int x, long long int y) {
+  unsigned long long x0 = (unsigned long long)x & 0xffffffffUL;
+  long long x1 = x >> 32;
+  unsigned long long y0 = (unsigned long long)y & 0xffffffffUL;
+  long long y1 = y >> 32;
+  unsigned long long z0 = x0 * y0;
+  long long t = x1 * y0 + (z0 >> 32);
+  long long z1 = t & 0xffffffffL;
+  long long z2 = t >> 32;
+  z1 = x0 * y1 + z1;
+  return x1 * y1 + z2 + (z1 >> 32);
+}
+
+__device__ static inline int __mulhi(int x, int y) { return __ockl_mul_hi_i32(x, y); }
+
+__device__ static inline int __rhadd(int x, int y) {
+  return ((long long)x + (long long)y + 1) >> 1;
+}
+
+__device__ static inline unsigned int __sad(int x, int y, unsigned int z) {  // |x - y| + z
+  return x > y ? (unsigned)x - (unsigned)y + z : (unsigned)y - (unsigned)x + z;  // no int overflow
+}
+
+__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
+  return ((unsigned long long)x + (unsigned long long)y) >> 1;
+}
+
+__device__ static inline int __umul24(unsigned int x, unsigned int y) {
+  return __ockl_mul24_u32(x, y);
+}
+
+__device__ static inline unsigned long long __umul64hi(unsigned long long int x,
+                                                       unsigned long long int y) {
+  unsigned long long x0 = x & 0xffffffffUL;
+  unsigned long long x1 = x >> 32;
+  unsigned long long y0 = y & 0xffffffffUL;
+  unsigned long long y1 = y >> 32;
+  unsigned long long z0 = x0 * y0;
+  unsigned long long t = x1 * y0 + (z0 >> 32);
+  unsigned long long z1 = t & 0xffffffffUL;
+  unsigned long long z2 = t >> 32;
+  z1 = x0 * y1 + z1;
+  return x1 * y1 + z2 + (z1 >> 32);
+}
+
+__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
+  return __ockl_mul_hi_u32(x, y);
+}
+
+__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
+  return ((unsigned long long)x + (unsigned long long)y + 1) >> 1;
+}
+
+__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
+  return __ockl_sadd_u32(x, y, z);
+}
+
+__device__ static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {
+  return __builtin_amdgcn_mbcnt_lo(x, y);  // thin wrapper over the compiler builtin
+}
+
+__device__ static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {
+  return __builtin_amdgcn_mbcnt_hi(x, y);  // thin wrapper over the compiler builtin
+}
+
+/*
+HIP specific device functions
+*/
+
+#if !defined(__HIPCC_RTC__)
+#include "amd_warp_functions.h"
+#include "amd_warp_sync_functions.h"
+#endif
+
+#define MASK1 0x00ff00ff
+#define MASK2 0xff00ff00
+
+__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
+  char4 out;  // NOTE(review): only out.w is assigned below; x/y/z are never written here - confirm intended
+  unsigned one1 = in1.w & MASK1;  // bytes 0 and 2 (mask 0x00ff00ff) of in1.w
+  unsigned one2 = in2.w & MASK1;
+  out.w = (one1 + one2) & MASK1;  // add, then drop carries that spilled into the masked-out bytes
+  one1 = in1.w & MASK2;  // bytes 1 and 3 (mask 0xff00ff00)
+  one2 = in2.w & MASK2;
+  out.w = out.w | ((one1 + one2) & MASK2);  // merge both halves; presumably .w is one char - verify
+  return out;
+}
+
+__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
+  char4 out;  // NOTE(review): only out.w is assigned below; x/y/z are never written here - confirm intended
+  unsigned one1 = in1.w & MASK1;  // bytes 0 and 2 (mask 0x00ff00ff) of in1.w
+  unsigned one2 = in2.w & MASK1;
+  out.w = (one1 - one2) & MASK1;  // subtract, then drop borrows that spilled into the masked-out bytes
+  one1 = in1.w & MASK2;  // bytes 1 and 3 (mask 0xff00ff00)
+  one2 = in2.w & MASK2;
+  out.w = out.w | ((one1 - one2) & MASK2);  // merge both halves; presumably .w is one char - verify
+  return out;
+}
+
+__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
+  char4 out;  // NOTE(review): only out.w is assigned below; x/y/z are never written here - confirm intended
+  unsigned one1 = in1.w & MASK1;  // bytes 0 and 2 (mask 0x00ff00ff) of in1.w
+  unsigned one2 = in2.w & MASK1;
+  out.w = (one1 * one2) & MASK1;  // multiply the masked words, keep only bytes 0 and 2 of the product
+  one1 = in1.w & MASK2;  // bytes 1 and 3 (mask 0xff00ff00)
+  one2 = in2.w & MASK2;
+  out.w = out.w | ((one1 * one2) & MASK2);  // merge both halves; presumably .w is one char - verify
+  return out;
+}
+
+__device__ static inline float __double2float_rd(double x) { return __ocml_cvtrtn_f32_f64(x); }
+__device__ static inline float __double2float_rn(double x) { return x; }
+__device__ static inline float __double2float_ru(double x) { return __ocml_cvtrtp_f32_f64(x); }
+__device__ static inline float __double2float_rz(double x) { return __ocml_cvtrtz_f32_f64(x); }
+
+__device__ static inline int __double2hiint(double x) {
+  static_assert(sizeof(double) == 2 * sizeof(int), "");
+
+  int tmp[2];
+  __builtin_memcpy(tmp, &x, sizeof(tmp));
+
+  return tmp[1];
+}
+__device__ static inline int __double2loint(double x) {
+  static_assert(sizeof(double) == 2 * sizeof(int), "");
+
+  int tmp[2];
+  __builtin_memcpy(tmp, &x, sizeof(tmp));
+
+  return tmp[0];
+}
+
+__device__ static inline int __double2int_rd(double x) {
+  return (int)__builtin_elementwise_floor(x);
+}
+__device__ static inline int __double2int_rn(double x) {
+  return (int)__builtin_elementwise_rint(x);
+}
+__device__ static inline int __double2int_ru(double x) {
+  return (int)__builtin_elementwise_ceil(x);
+}
+__device__ static inline int __double2int_rz(double x) { return (int)x; }
+
+__device__ static inline long long int __double2ll_rd(double x) {
+  return (long long)__builtin_elementwise_floor(x);
+}
+__device__ static inline long long int __double2ll_rn(double x) {
+  return (long long)__builtin_elementwise_rint(x);
+}
+__device__ static inline long long int __double2ll_ru(double x) {
+  return (long long)__builtin_elementwise_ceil(x);
+}
+__device__ static inline long long int __double2ll_rz(double x) { return (long long)x; }
+
+__device__ static inline unsigned int __double2uint_rd(double x) {
+  return (unsigned int)__builtin_elementwise_floor(x);
+}
+__device__ static inline unsigned int __double2uint_rn(double x) {
+  return (unsigned int)__builtin_elementwise_rint(x);
+}
+__device__ static inline unsigned int __double2uint_ru(double x) {
+  return (unsigned int)__builtin_elementwise_ceil(x);
+}
+__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
+
+__device__ static inline unsigned long long int __double2ull_rd(double x) {
+  return (unsigned long long int)__builtin_elementwise_floor(x);
+}
+__device__ static inline unsigned long long int __double2ull_rn(double x) {
+  return (unsigned long long int)__builtin_elementwise_rint(x);
+}
+__device__ static inline unsigned long long int __double2ull_ru(double x) {
+  return (unsigned long long int)__builtin_elementwise_ceil(x);
+}
+__device__ static inline unsigned long long int __double2ull_rz(double x) {
+  return (unsigned long long int)x;
+}
+__device__ static inline long long int __double_as_longlong(double x) {
+  static_assert(sizeof(long long) == sizeof(double), "");
+
+  long long tmp;
+  __builtin_memcpy(&tmp, &x, sizeof(tmp));
+
+  return tmp;
+}
+
+/*
+__device__ unsigned short __float2half_rn(float x);
+__device__ float __half2float(unsigned short);
+
+The above device functions are not valid.
+Use
+__device__ __half __float2half_rn(float x);
+__device__ float __half2float(__half);
+from hip_fp16.h
+
+CUDA implements half as unsigned short, whereas HIP doesn't.
+
+*/
+
+__device__ static inline int __float2int_rd(float x) { return (int)__builtin_elementwise_floor(x); }
+__device__ static inline int __float2int_rn(float x) { return (int)__builtin_elementwise_rint(x); }
+__device__ static inline int __float2int_ru(float x) { return (int)__builtin_elementwise_ceil(x); }
+__device__ static inline int __float2int_rz(float x) { return (int)__builtin_elementwise_trunc(x); }
+
+__device__ static inline long long int __float2ll_rd(float x) {
+  return (long long int)__builtin_elementwise_floor(x);
+}
+__device__ static inline long long int __float2ll_rn(float x) {
+  return (long long int)__builtin_elementwise_rint(x);
+}
+__device__ static inline long long int __float2ll_ru(float x) {
+  return (long long int)__builtin_elementwise_ceil(x);
+}
+__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }
+
+__device__ static inline unsigned int __float2uint_rd(float x) {
+  return (unsigned int)__builtin_elementwise_floor(x);
+}
+__device__ static inline unsigned int __float2uint_rn(float x) {
+  return (unsigned int)__builtin_elementwise_rint(x);
+}
+__device__ static inline unsigned int __float2uint_ru(float x) {
+  return (unsigned int)__builtin_elementwise_ceil(x);
+}
+__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
+
+__device__ static inline unsigned long long int __float2ull_rd(float x) {
+  return (unsigned long long int)__builtin_elementwise_floor(x);
+}
+__device__ static inline unsigned long long int __float2ull_rn(float x) {
+  return (unsigned long long int)__builtin_elementwise_rint(x);
+}
+__device__ static inline unsigned long long int __float2ull_ru(float x) {
+  return (unsigned long long int)__builtin_elementwise_ceil(x);
+}
+__device__ static inline unsigned long long int __float2ull_rz(float x) {
+  return (unsigned long long int)x;
+}
+
+__device__ static inline int __float_as_int(float x) {
+  static_assert(sizeof(int) == sizeof(float), "");
+
+  int tmp;
+  __builtin_memcpy(&tmp, &x, sizeof(tmp));
+
+  return tmp;
+}
+
+__device__ static inline unsigned int __float_as_uint(float x) {
+  static_assert(sizeof(unsigned int) == sizeof(float), "");
+
+  unsigned int tmp;
+  __builtin_memcpy(&tmp, &x, sizeof(tmp));
+
+  return tmp;
+}
+
+__device__ static inline double __hiloint2double(int hi, int lo) {
+  static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");
+
+  __hip_uint64_t tmp0 =
+      (static_cast<__hip_uint64_t>(hi) << 32ull) | static_cast<__hip_uint32_t>(lo);
+  double tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+
+  return tmp1;
+}
+
+__device__ static inline double __int2double_rn(int x) { return (double)x; }
+
+__device__ static inline float __int2float_rd(int x) { return __ocml_cvtrtn_f32_s32(x); }
+__device__ static inline float __int2float_rn(int x) { return (float)x; }
+__device__ static inline float __int2float_ru(int x) { return __ocml_cvtrtp_f32_s32(x); }
+__device__ static inline float __int2float_rz(int x) { return __ocml_cvtrtz_f32_s32(x); }
+
+__device__ static inline float __int_as_float(int x) {
+  static_assert(sizeof(int) == sizeof(float), "");
+  // Copy the object representation of x into a float; the integer value is not converted.
+  float value;
+  __builtin_memcpy(&value, &x, sizeof(value));
+  // value now holds exactly the bytes that made up x.
+  return value;
+}
+
+__device__ static inline double __ll2double_rd(long long int x) { return __ocml_cvtrtn_f64_s64(x); }
+__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
+__device__ static inline double __ll2double_ru(long long int x) { return __ocml_cvtrtp_f64_s64(x); }
+__device__ static inline double __ll2double_rz(long long int x) { return __ocml_cvtrtz_f64_s64(x); }
+// long long -> double above, -> float below: _rn casts, the rest call __ocml_cvtrt{n,p,z}_* helpers.
+__device__ static inline float __ll2float_rd(long long int x) { return __ocml_cvtrtn_f32_s64(x); }
+__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
+__device__ static inline float __ll2float_ru(long long int x) { return __ocml_cvtrtp_f32_s64(x); }
+__device__ static inline float __ll2float_rz(long long int x) { return __ocml_cvtrtz_f32_s64(x); }
+
+__device__ static inline double __longlong_as_double(long long int x) {
+  static_assert(sizeof(long long) == sizeof(double), "");
+  // Copy the object representation of x into a double; the integer value is not converted.
+  double value;
+  __builtin_memcpy(&value, &x, sizeof(value));
+  // value now holds exactly the bytes that made up x.
+  return value;
+}
+
+__device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }  // plain cast
+// unsigned -> float: _rn is a plain cast; _rd/_ru/_rz forward to __ocml_cvtrtn/rtp/rtz helpers.
+__device__ static inline float __uint2float_rd(unsigned int x) { return __ocml_cvtrtn_f32_u32(x); }
+__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
+__device__ static inline float __uint2float_ru(unsigned int x) { return __ocml_cvtrtp_f32_u32(x); }
+__device__ static inline float __uint2float_rz(unsigned int x) { return __ocml_cvtrtz_f32_u32(x); }
+
+__device__ static inline float __uint_as_float(unsigned int x) {
+  static_assert(sizeof(unsigned int) == sizeof(float), "");
+  // Copy the object representation of x into a float; the integer value is not converted.
+  float value;
+  __builtin_memcpy(&value, &x, sizeof(value));
+  // value now holds exactly the bytes that made up x.
+  return value;
+}
+
+__device__ static inline double __ull2double_rd(unsigned long long int x) {
+  return __ocml_cvtrtn_f64_u64(x);  // rounding behaviour is defined by the external __ocml routine
+}
+__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
+__device__ static inline double __ull2double_ru(unsigned long long int x) {
+  return __ocml_cvtrtp_f64_u64(x);
+}
+__device__ static inline double __ull2double_rz(unsigned long long int x) {
+  return __ocml_cvtrtz_f64_u64(x);
+}
+// unsigned long long -> float: same pattern, _rn casts and the others call __ocml_cvtrt*_f32_u64.
+__device__ static inline float __ull2float_rd(unsigned long long int x) {
+  return __ocml_cvtrtn_f32_u64(x);
+}
+__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
+__device__ static inline float __ull2float_ru(unsigned long long int x) {
+  return __ocml_cvtrtp_f32_u64(x);
+}
+__device__ static inline float __ull2float_rz(unsigned long long int x) {
+  return __ocml_cvtrtz_f32_u64(x);
+}
+
+#if defined(__clang__) && defined(__HIP__)
+
+// Clock functions: declared here; inline definitions follow under __HIP_DEVICE_COMPILE__.
+__device__ long long int __clock64();
+__device__ long long int __clock();
+__device__ long long int clock64();
+__device__ long long int clock();
+__device__ long long int wall_clock64();
+// hip.amdgcn.bc - named sync (declaration; defined under __HIP_DEVICE_COMPILE__ as well)
+__device__ void __named_sync();
+
+#ifdef __HIP_DEVICE_COMPILE__
+
+// Clock function to return GPU core cycle count.
+// GPU can change its core clock frequency at runtime. The maximum frequency can be queried
+// through hipDeviceAttributeClockRate attribute.
+__device__ inline __attribute__((always_inline)) long long int __clock64() {
+  return (long long int)__builtin_readcyclecounter();
+}
+// __clock is an alias that forwards to __clock64 (attribute spelling now matches the siblings).
+__device__ inline __attribute__((always_inline)) long long int __clock() { return __clock64(); }
+
+// Clock function to return wall clock count at a constant frequency that can be queried
+// through hipDeviceAttributeWallClockRate attribute.
+__device__ inline __attribute__((always_inline)) long long int wall_clock64() {
+  return (long long int)__ockl_steadyctr_u64();  // unsigned 64-bit counter reinterpreted as signed
+}
+// clock64 forwards to __clock64, i.e. the cycle counter rather than the wall clock.
+__device__ inline __attribute__((always_inline)) long long int clock64() { return __clock64(); }
+// clock forwards to __clock, which itself returns __clock64().
+__device__ inline __attribute__((always_inline)) long long int clock() { return __clock(); }
+
+// hip.amdgcn.bc - named sync: takes no arguments and simply issues an s_barrier.
+__device__ inline void __named_sync() { __builtin_amdgcn_s_barrier(); }
+
+#endif  // __HIP_DEVICE_COMPILE__
+
+// hip.amdgcn.bc - lanemask
+__device__ inline __hip_uint64_t __lanemask_gt() {  // ballot bits strictly above the caller's lane
+  __hip_uint32_t lane = __ockl_lane_u32();
+  if (lane == 63) return 0;  // nothing above bit 63; also keeps the shift below under 64
+  __hip_uint64_t ballot = __ballot64(1);  // presumably one bit per active lane - defined elsewhere
+  __hip_uint64_t mask = (~((__hip_uint64_t)0)) << (lane + 1);  // bits lane+1 .. 63 set
+  return mask & ballot;
+}
+
+__device__ inline __hip_uint64_t __lanemask_lt() {  // ballot bits strictly below the caller's lane
+  __hip_uint32_t lane = __ockl_lane_u32();
+  __hip_uint64_t ballot = __ballot64(1);  // kept unsigned: it is a bit set, not a number
+  __hip_uint64_t mask = ((__hip_uint64_t)1 << lane) - (__hip_uint64_t)1;  // bits 0 .. lane-1 set
+  return mask & ballot;
+}
+
+__device__ inline __hip_uint64_t __lanemask_eq() {  // single bit at the caller's lane index
+  __hip_uint32_t lane = __ockl_lane_u32();
+  __hip_uint64_t mask = ((__hip_uint64_t)1 << lane);  // unsigned so bit 63 never enters a signed type
+  return mask;
+}
+
+
+__device__ inline void* __local_to_generic(void* p) { return p; }  // identity: returns p unchanged
+// __get_dynamicgroupbaseptr is defined inline for the device pass and only declared otherwise.
+#ifdef __HIP_DEVICE_COMPILE__
+__device__ inline void* __get_dynamicgroupbaseptr() {
+  // Get group segment base pointer: static group size fed to __to_local (defined elsewhere).
+  return (char*)__local_to_generic((void*)__to_local(__builtin_amdgcn_groupstaticsize()));
+}
+#else
+__device__ void* __get_dynamicgroupbaseptr();
+#endif  // __HIP_DEVICE_COMPILE__
+// Alias that forwards to __get_dynamicgroupbaseptr().
+__device__ inline void* __amdgcn_get_dynamicgroupbaseptr() { return __get_dynamicgroupbaseptr(); }
+
+// Memory Fence Functions: each issues a sequentially consistent fence at a different sync scope.
+__device__ inline static void __threadfence() { __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent"); }
+// Same fence restricted to the "workgroup" scope.
+__device__ inline static void __threadfence_block() {
+  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
+}
+// Same fence with an empty scope string (presumably the widest, system scope - confirm in LLVM docs).
+__device__ inline static void __threadfence_system() {
+  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
+}
+__device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) {  // fence+barrier
+  if (flags == (__CLK_GLOBAL_MEM_FENCE | __CLK_LOCAL_MEM_FENCE)) {  // exactly both flags
+    __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");  // release before the barrier
+    __builtin_amdgcn_s_barrier();
+    __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");  // acquire after the barrier
+  } else if (flags & (__CLK_GLOBAL_MEM_FENCE)) {  // global bit set: fences name "global"
+    __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup", "global");
+    __builtin_amdgcn_s_barrier();
+    __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup", "global");
+  } else if (flags & (__CLK_LOCAL_MEM_FENCE)) {  // local bit set: fences name "local"
+    __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup", "local");
+    __builtin_amdgcn_s_barrier();
+    __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup", "local");
+  } else {  // neither flag: barrier only, no fence
+    __builtin_amdgcn_s_barrier();
+  }
+}
+
+__device__ inline static void __barrier(int n) { __work_group_barrier((__cl_mem_fence_flags)n); }
+// __syncthreads: work-group barrier requested with both the global and the local fence flag.
+__device__
+inline
+__attribute__((convergent))
+void __syncthreads()
+{
+  __barrier(__CLK_GLOBAL_MEM_FENCE | __CLK_LOCAL_MEM_FENCE);
+}
+
+__device__ inline __attribute__((convergent)) int __syncthreads_count(int predicate) {
+  return __ockl_wgred_add_i32(!!predicate);  // !! normalises predicate to 0/1 before the reduction
+}
+// "and" reduction of the normalised predicate; the reduction itself is defined outside this file.
+__device__ inline __attribute__((convergent)) int __syncthreads_and(int predicate) {
+  return __ockl_wgred_and_i32(!!predicate);
+}
+// "or" reduction of the normalised predicate (presumably across the work-group - confirm in OCKL).
+__device__ inline __attribute__((convergent)) int __syncthreads_or(int predicate) {
+  return __ockl_wgred_or_i32(!!predicate);
+}
+
+// hip.amdgcn.bc - device routine
+/*
+  HW_ID Register bit structure for RDNA2 & RDNA3
+  WAVE_ID     4:0     Wave id within the SIMD.
+  SIMD_ID     9:8     SIMD_ID within the WGP: [0] = row, [1] = column.
+  WGP_ID      13:10   Physical WGP ID.
+  SA_ID       16      Shader Array ID
+  SE_ID       20:18   Shader Engine the wave is assigned to for gfx11
+  SE_ID       19:18   Shader Engine the wave is assigned to for gfx10
+  DP_RATE     31:29   Number of double-precision float units per SIMD
+
+  HW_ID Register bit structure for GCN and CDNA
+  WAVE_ID     3:0     Wave buffer slot number. 0-9.
+  SIMD_ID     5:4     SIMD which the wave is assigned to within the CU.
+  PIPE_ID     7:6     Pipeline from which the wave was dispatched.
+  CU_ID       11:8    Compute Unit the wave is assigned to.
+  SH_ID       12      Shader Array (within an SE) the wave is assigned to.
+  SE_ID       15:13   Shader Engine the wave is assigned to for gfx908, gfx90a
+              14:13   Shader Engine the wave is assigned to for 942
+  TG_ID       19:16   Thread-group ID
+  VM_ID       23:20   Virtual Memory ID
+  QUEUE_ID    26:24   Queue from which this wave was dispatched.
+  STATE_ID    29:27   State ID (graphics only, not compute).
+  ME_ID       31:30   Micro-engine ID.
+
+  XCC_ID Register bit structure for 942/950
+  XCC_ID      3:0     XCC the wave is assigned to.
+ */
+
+#if (defined(__GFX10__) || defined(__GFX11__))
+#define HW_ID 23  // register id used with s_getreg on GFX10/GFX11
+#else
+#define HW_ID 4  // register id used with s_getreg on every other target
+#endif
+// Field widths (SIZE) and bit positions (OFFSET) inside HW_ID; see the layout tables above.
+#if (defined(__GFX10__) || defined(__GFX11__))
+#define HW_ID_WGP_ID_SIZE 4
+#define HW_ID_WGP_ID_OFFSET 10
+#if (defined(__AMDGCN_CUMODE__))
+#define HW_ID_CU_ID_SIZE 1  // only defined in CU mode on GFX10/GFX11
+#define HW_ID_CU_ID_OFFSET 8
+#endif
+#else
+#define HW_ID_CU_ID_SIZE 4
+#define HW_ID_CU_ID_OFFSET 8
+#endif
+// SE_ID is 3 bits wide on gfx908, gfx90a and GFX11, 2 bits wide on everything else.
+#if (defined(__gfx908__) || defined(__gfx90a__) || defined(__GFX11__))
+#define HW_ID_SE_ID_SIZE 3
+#else  // 4 SEs/XCC for 942
+#define HW_ID_SE_ID_SIZE 2
+#endif
+#if (defined(__GFX10__) || defined(__GFX11__))
+#define HW_ID_SE_ID_OFFSET 18
+#define HW_ID_SA_ID_OFFSET 16
+#define HW_ID_SA_ID_SIZE 1
+#else
+#define HW_ID_SE_ID_OFFSET 13
+#endif
+// gfx942/gfx950 additionally expose an XCC_ID register; __gfx94plus_clr__ marks those targets.
+#if (defined(__gfx942__) || defined(__gfx950__))
+#define __gfx94plus_clr__
+#define XCC_ID 20
+#define XCC_ID_XCC_ID_SIZE 4
+#define XCC_ID_XCC_ID_OFFSET 0
+#endif
+// On those targets image support is switched off unless the macro was already defined.
+#if !defined(__HIP_NO_IMAGE_SUPPORT) && defined(__gfx94plus_clr__)
+#define __HIP_NO_IMAGE_SUPPORT 1
+#endif
+// Layout of the immediate operand built by GETREG_IMMED below.
+/*
+   Encoding of parameter bitmask
+   HW_ID        5:0     HW_ID
+   OFFSET       10:6    Range: 0..31
+   SIZE         15:11   Range: 1..32
+ */
+// SZ is passed as (width - 1) by the callers in this file; arguments are parenthesised.
+#define GETREG_IMMED(SZ, OFF, REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
+
+/*
+  __smid returns an identifier packed from the wave's hardware-ID fields (results vary over time).
+  Default GCN/CDNA: CU_ID in bits 3:0, SE_ID above it; gfx942/gfx950 also place XCC_ID above SE_ID.
+  GFX10/GFX11: SE_ID, SA_ID, WGP_ID (plus CU_ID in CU mode) packed from the high bits downwards.
+  SZ minus 1 since SIZE is 1-based.
+*/
+__device__ inline unsigned __smid(void) {
+  unsigned se_id =
+      __builtin_amdgcn_s_getreg(GETREG_IMMED(HW_ID_SE_ID_SIZE - 1, HW_ID_SE_ID_OFFSET, HW_ID));
+#if (defined(__GFX10__) || defined(__GFX11__))
+  unsigned wgp_id =
+      __builtin_amdgcn_s_getreg(GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
+  unsigned sa_id =
+      __builtin_amdgcn_s_getreg(GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
+#if (defined(__AMDGCN_CUMODE__))
+  unsigned cu_id =
+      __builtin_amdgcn_s_getreg(GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
+#endif
+#else
+#if defined(__gfx94plus_clr__)
+  unsigned xcc_id =  // read from the separate XCC_ID register, not from HW_ID
+      __builtin_amdgcn_s_getreg(GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
+#endif
+  unsigned cu_id =
+      __builtin_amdgcn_s_getreg(GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
+#endif
+#if (defined(__GFX10__) || defined(__GFX11__))
+  unsigned temp = se_id;  // pack: each shift makes room for the next, narrower-scoped field
+  temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
+  temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
+#if (defined(__AMDGCN_CUMODE__))
+  temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
+#endif
+  return temp;
+  // TODO : CU Mode impl
+#elif defined(__gfx94plus_clr__)
+  unsigned temp = xcc_id;  // pack: XCC_ID, then SE_ID, then CU_ID in the low bits
+  temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
+  temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
+  return temp;
+#else
+  return (se_id << HW_ID_CU_ID_SIZE) + cu_id;  // SE_ID above the 4-bit CU_ID
+#endif
+}
+
+/**
+ * Map HIP_DYNAMIC_SHARED to "extern __shared__" for compatibility with old HIP applications.
+ * To be removed in a future release.
+ */
+#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];  // expansion ends with ';'
+#define HIP_DYNAMIC_SHARED_ATTRIBUTE  // expands to nothing
+
+#endif  // defined(__clang__) && defined(__HIP__)
+
+
+// loop unrolling
+static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
+  // Byte-wise copy of size bytes from src to dst; returns dst.
+  // No overlap handling is performed.
+  auto out = static_cast<unsigned char*>(dst);
+  auto in = static_cast<const unsigned char*>(src);
+
+  // Main loop: four bytes per iteration while at least four remain.
+  for (; size >= 4u; size -= 4u, in += 4u, out += 4u) {
+    out[0] = in[0];
+    out[1] = in[1];
+    out[2] = in[2];
+    out[3] = in[3];
+  }
+  // Tail: 0 to 3 bytes remain, copied from the highest remaining index down.
+  if (size == 3) {
+    out[2] = in[2];
+  }
+  if (size >= 2) {
+    out[1] = in[1];
+  }
+  if (size != 0) {
+    out[0] = in[0];
+  }
+  return dst;
+}
+
+static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
+  // Stores val into each of the size bytes starting at dst; returns dst.
+  auto out = static_cast<unsigned char*>(dst);
+
+  // Main loop: four bytes per iteration while at least four remain.
+  for (; size >= 4u; size -= 4u, out += 4u) {
+    out[0] = val;
+    out[1] = val;
+    out[2] = val;
+    out[3] = val;
+  }
+  // Tail: 0 to 3 bytes remain, written from the highest remaining index down.
+  if (size == 3) {
+    out[2] = val;
+  }
+  if (size >= 2) {
+    out[1] = val;
+  }
+  if (size != 0) {
+    out[0] = val;
+  }
+  return dst;
+}
+#ifndef __OPENMP_AMDGCN__  // these overloads are left out when __OPENMP_AMDGCN__ is defined
+static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
+  return __hip_hc_memcpy(dst, src, size);  // forwards to the byte-wise device copy
+}
+// Device memset: only the low byte of val is used, as the cast to unsigned char shows.
+static inline __device__ void* memset(void* ptr, int val, size_t size) {
+  unsigned char val8 = static_cast<unsigned char>(val);
+  return __hip_hc_memset(ptr, val8, size);
+}
+#endif  // !__OPENMP_AMDGCN__
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h
new file mode 100644
index 0000000000000000000000000000000000000000..d9b79faeb9dc18192b23e451bc0870e0d613df1f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h
@@ -0,0 +1,746 @@
+/*
+Copyright (c) 2015 - Present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#if !defined(__HIPCC_RTC__)
+#include "amd_device_functions.h"
+#endif
+
+#if !defined(__HIP_ATOMIC_BACKWARD_COMPAT)
+#define __HIP_ATOMIC_BACKWARD_COMPAT 1  // default on; define it to 0 beforehand to opt out
+#endif
+// When enabled and the compiler offers clang_atomic_attributes, the macro below is a statement attribute.
+#if defined(__has_extension) && __has_extension(clang_atomic_attributes) && __HIP_ATOMIC_BACKWARD_COMPAT
+#define __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY [[clang::atomic(fine_grained_memory, remote_memory)]]
+#else
+#define __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY  // expands to nothing: the braces become a plain block
+#endif
+
+template <bool B, typename T, typename F> struct Cond_t;  // compile-time type selector
+// Cond_t<B, T, F>::type is T when B is true and F when B is false (primary template is undefined).
+template <typename T, typename F> struct Cond_t<true, T, F> {
+  using type = T;
+};
+template <typename T, typename F> struct Cond_t<false, T, F> {
+  using type = F;
+};
+
+#if !__HIP_DEVICE_COMPILE__  // only for passes where __HIP_DEVICE_COMPILE__ is unset or zero
+// TODO: Remove this after compiler pre-defines the following Macros.
+#define __HIP_MEMORY_SCOPE_SINGLETHREAD 1
+#define __HIP_MEMORY_SCOPE_WAVEFRONT 2
+#define __HIP_MEMORY_SCOPE_WORKGROUP 3
+#define __HIP_MEMORY_SCOPE_AGENT 4
+#define __HIP_MEMORY_SCOPE_SYSTEM 5
+#endif
+
+#if !defined(__HIPCC_RTC__)
+#include "amd_hip_unsafe_atomics.h"
+#endif
+
+// Atomic expanders
+template <int mem_order = __ATOMIC_SEQ_CST, int mem_scope = __HIP_MEMORY_SCOPE_SYSTEM, typename T,
+          typename Op, typename F>
+inline __attribute__((always_inline, device)) T hip_cas_expander(T* p, T x, Op op, F f) noexcept {
+  using FP = __attribute__((address_space(0))) const void*;
+  // Binds the llvm.amdgcn.is.shared intrinsic through an asm label on a local declaration.
+  __device__ extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");
+  // Pointers the intrinsic reports as shared are handled by the caller-supplied fallback f().
+  if (is_shared_workaround((FP)p)) return f();
+  // U is the unsigned integer the CAS operates on: unsigned int for 4-byte T, else 64-bit.
+  using U =
+      typename Cond_t<sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;
+  // The target is accessed through U; T's bits are reinterpreted, never value-converted.
+  auto q = reinterpret_cast<U*>(p);
+  // CAS loop: tmp0 is the last observed value, tmp1 the candidate produced by op(tmp1, x).
+  U tmp0{__hip_atomic_load(q, mem_order, mem_scope)};
+  U tmp1;
+  do {
+    tmp1 = tmp0;
+    // op modifies its first argument in place, viewing the bits as T.
+    op(reinterpret_cast<T&>(tmp1), x);
+  } while (!__hip_atomic_compare_exchange_strong(q, &tmp0, tmp1, mem_order, mem_order, mem_scope));
+  // On success tmp0 still holds the value that was replaced; it is returned viewed as T.
+  return reinterpret_cast<const T&>(tmp0);
+}
+
+template <int mem_order = __ATOMIC_SEQ_CST, int mem_scope = __HIP_MEMORY_SCOPE_SYSTEM, typename T,
+          typename Cmp, typename F>
+inline __attribute__((always_inline, device)) T hip_cas_extrema_expander(T* p, T x, Cmp cmp,
+                                                                         F f) noexcept {
+  // Stores x at *p while cmp(x, current) holds; returns the last value observed at *p.
+  using FP = __attribute__((address_space(0))) const void*;
+  __device__ extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");
+  if (is_shared_workaround((FP)p)) return f();  // shared-memory pointers use the fallback f()
+  using U =
+      typename Cond_t<sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;
+  static_assert(sizeof(T) == sizeof(U), "hip_cas_extrema_expander needs a 4- or 8-byte T");
+  auto q = reinterpret_cast<U*>(p);
+  // Store x's bit pattern: passing x directly would value-convert a floating T to an integer.
+  U desired;
+  __builtin_memcpy(&desired, &x, sizeof(desired));
+  U tmp{__hip_atomic_load(q, mem_order, mem_scope)};
+  while (cmp(x, reinterpret_cast<const T&>(tmp)) &&
+         !__hip_atomic_compare_exchange_strong(q, &tmp, desired, mem_order, mem_order, mem_scope));
+  // tmp is the value seen by the final load or compare-exchange, viewed as T.
+  return reinterpret_cast<const T&>(tmp);
+}
+
+__device__ inline unsigned short int atomicCAS(unsigned short int* address,
+                                               unsigned short int compare, unsigned short int val) {
+  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                       __HIP_MEMORY_SCOPE_AGENT);  // relaxed CAS, agent scope
+  return compare;  // value observed at *address (the builtin rewrites compare on failure)
+}
+// System-scope variant; the exchange runs inside the __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY block.
+__device__ inline unsigned short int atomicCAS_system(unsigned short int* address,
+                                                      unsigned short int compare,
+                                                      unsigned short int val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                       __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+  return compare;
+}
+
+__device__ inline int atomicCAS(int* address, int compare, int val) {
+  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                       __HIP_MEMORY_SCOPE_AGENT);  // relaxed CAS, agent scope
+  return compare;  // value observed at *address (the builtin rewrites compare on failure)
+}
+// System-scope variant; the exchange runs inside the __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY block.
+__device__ inline int atomicCAS_system(int* address, int compare, int val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                       __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+  return compare;
+}
+
+__device__ inline unsigned int atomicCAS(unsigned int* address, unsigned int compare,
+                                         unsigned int val) {
+  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                       __HIP_MEMORY_SCOPE_AGENT);  // relaxed CAS, agent scope
+  return compare;  // value observed at *address (the builtin rewrites compare on failure)
+}
+// System-scope variant; the exchange runs inside the __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY block.
+__device__ inline unsigned int atomicCAS_system(unsigned int* address, unsigned int compare,
+                                                unsigned int val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                       __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+  return compare;
+}
+
+__device__ inline unsigned long atomicCAS(unsigned long* address, unsigned long compare,
+                                          unsigned long val) {
+  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                       __HIP_MEMORY_SCOPE_AGENT);  // relaxed CAS, agent scope
+  return compare;  // value observed at *address (the builtin rewrites compare on failure)
+}
+// System-scope variant; the exchange runs inside the __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY block.
+__device__ inline unsigned long atomicCAS_system(unsigned long* address, unsigned long compare,
+                                                 unsigned long val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                       __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+  return compare;
+}
+
+__device__ inline unsigned long long atomicCAS(unsigned long long* address,
+                                               unsigned long long compare, unsigned long long val) {
+  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                       __HIP_MEMORY_SCOPE_AGENT);  // relaxed CAS, agent scope
+  return compare;  // value observed at *address (the builtin rewrites compare on failure)
+}
+// System-scope variant; the exchange runs inside the __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY block.
+__device__ inline unsigned long long atomicCAS_system(unsigned long long* address,
+                                                      unsigned long long compare,
+                                                      unsigned long long val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                       __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+  return compare;
+}
+
+__device__ inline float atomicCAS(float* address, float compare, float val) {
+  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                       __HIP_MEMORY_SCOPE_AGENT);  // relaxed CAS, agent scope
+  return compare;  // value observed at *address (the builtin rewrites compare on failure)
+}
+// System-scope variant; the exchange runs inside the __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY block.
+__device__ inline float atomicCAS_system(float* address, float compare, float val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                       __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+  return compare;
+}
+
+__device__ inline double atomicCAS(double* address, double compare, double val) {
+  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                       __HIP_MEMORY_SCOPE_AGENT);  // relaxed CAS, agent scope
+  return compare;  // value observed at *address (the builtin rewrites compare on failure)
+}
+// System-scope variant; the exchange runs inside the __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY block.
+__device__ inline double atomicCAS_system(double* address, double compare, double val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                       __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+  return compare;
+}
+
+__device__ inline int atomicAdd(int* address, int val) {  // relaxed fetch-add, agent scope
+  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+// Same fetch-add at system scope, inside the __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY block.
+__device__ inline int atomicAdd_system(int* address, int val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned int atomicAdd(unsigned int* address, unsigned int val) {  // agent scope
+  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+// Same fetch-add at system scope, inside the __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY block.
+__device__ inline unsigned int atomicAdd_system(unsigned int* address, unsigned int val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned long atomicAdd(unsigned long* address, unsigned long val) {  // agent scope
+  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+// Same fetch-add at system scope, inside the __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY block.
+__device__ inline unsigned long atomicAdd_system(unsigned long* address, unsigned long val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned long long atomicAdd(unsigned long long* address,
+                                               unsigned long long val) {  // relaxed, agent scope
+  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+// Same fetch-add at system scope, inside the __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY block.
+__device__ inline unsigned long long atomicAdd_system(unsigned long long* address,
+                                                      unsigned long long val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline float atomicAdd(float* address, float val) {
+#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)  // opt-in: route through unsafeAtomicAdd
+  return unsafeAtomicAdd(address, val);
+#else  // default: relaxed fetch-add at agent scope under the backward-compat attribute
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+#endif
+}
+// System-scope variant; always uses the builtin, regardless of __AMDGCN_UNSAFE_FP_ATOMICS__.
+__device__ inline float atomicAdd_system(float* address, float val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+#if !defined(__HIPCC_RTC__)  // the deprecation marker is omitted when __HIPCC_RTC__ is defined
+HIP_DEPRECATED("use atomicAdd instead")
+#endif  // !defined(__HIPCC_RTC__)
+__device__ inline void atomicAddNoRet(float* address, float val) { unsafeAtomicAdd(address, val); }
+
+__device__ inline double atomicAdd(double* address, double val) {
+#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)  // opt-in: route through unsafeAtomicAdd
+  return unsafeAtomicAdd(address, val);
+#else  // default: relaxed fetch-add at agent scope under the backward-compat attribute
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+#endif
+}
+// System-scope variant; always uses the builtin, regardless of __AMDGCN_UNSAFE_FP_ATOMICS__.
+__device__ inline double atomicAdd_system(double* address, double val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline int atomicSub(int* address, int val) {  // subtraction done as fetch-add of -val
+  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+// Same operation at system scope, inside the __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY block.
+__device__ inline int atomicSub_system(int* address, int val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned int atomicSub(unsigned int* address, unsigned int val) {
+  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);  // -val wraps
+}
+// Same operation at system scope; unsigned negation wraps, so adding -val subtracts val.
+__device__ inline unsigned int atomicSub_system(unsigned int* address, unsigned int val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned long atomicSub(unsigned long* address, unsigned long val) {
+  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);  // -val wraps
+}
+// Same operation at system scope; unsigned negation wraps, so adding -val subtracts val.
+__device__ inline unsigned long atomicSub_system(unsigned long* address, unsigned long val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned long long atomicSub(unsigned long long* address,
+                                               unsigned long long val) {  // relaxed, agent scope
+  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+// Same operation at system scope; unsigned negation wraps, so adding -val subtracts val.
+__device__ inline unsigned long long atomicSub_system(unsigned long long* address,
+                                                      unsigned long long val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline float atomicSub(float* address, float val) {
+#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)  // opt-in: unsafeAtomicAdd with the negated operand
+  return unsafeAtomicAdd(address, -val);
+#else  // default: relaxed fetch-add of -val at agent scope
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+#endif
+}
+// System-scope variant; always uses the builtin, regardless of __AMDGCN_UNSAFE_FP_ATOMICS__.
+__device__ inline float atomicSub_system(float* address, float val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline double atomicSub(double* address, double val) {
+#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)  // opt-in: unsafeAtomicAdd with the negated operand
+  return unsafeAtomicAdd(address, -val);
+#else  // default: relaxed fetch-add of -val at agent scope
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+#endif
+}
+// System-scope variant; always uses the builtin, regardless of __AMDGCN_UNSAFE_FP_ATOMICS__.
+__device__ inline double atomicSub_system(double* address, double val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline int atomicExch(int* address, int val) {  // relaxed exchange, agent scope
+  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+// Same exchange at system scope, inside the __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY block.
+__device__ inline int atomicExch_system(int* address, int val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned int atomicExch(unsigned int* address, unsigned int val) {  // agent scope
+  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+// Same exchange at system scope, inside the __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY block.
+__device__ inline unsigned int atomicExch_system(unsigned int* address, unsigned int val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned long atomicExch(unsigned long* address, unsigned long val) {  // agent scope
+  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+// Same exchange at system scope, inside the __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY block.
+__device__ inline unsigned long atomicExch_system(unsigned long* address, unsigned long val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned long long atomicExch(unsigned long long* address,
+                                                unsigned long long val) {  // relaxed, agent scope
+  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+// Same exchange at system scope, inside the __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY block.
+__device__ inline unsigned long long atomicExch_system(unsigned long long* address,
+                                                       unsigned long long val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline float atomicExch(float* address, float val) {  // relaxed exchange, agent scope
+  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+// Same exchange at system scope, inside the __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY block.
+__device__ inline float atomicExch_system(float* address, float val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline double atomicExch(double* address, double val) {  // relaxed exchange, agent scope
+  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+// Same exchange at system scope, inside the __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY block.
+__device__ inline double atomicExch_system(double* address, double val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline int atomicMin(int* address, int val) {  // relaxed fetch-min, agent scope
+  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+// Same fetch-min at system scope, inside the __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY block.
+__device__ inline int atomicMin_system(int* address, int val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned int atomicMin(unsigned int* address, unsigned int val) {  // agent scope
+  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+// Same fetch-min at system scope, inside the __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY block.
+__device__ inline unsigned int atomicMin_system(unsigned int* address, unsigned int val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned long atomicMin(unsigned long* address, unsigned long val) {  // agent scope
+  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+// Same fetch-min at system scope, inside the __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY block.
+__device__ inline unsigned long atomicMin_system(unsigned long* address, unsigned long val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned long long atomicMin(unsigned long long* address,
+                                               unsigned long long val) {
+  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__ inline unsigned long long atomicMin_system(unsigned long long* address,
+                                                      unsigned long long val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline long long atomicMin(long long* address, long long val) {
+  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__ inline long long atomicMin_system(long long* address, long long val) {
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline float atomicMin(float* addr, float val) {
+#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
+  return unsafeAtomicMin(addr, val);
+#else
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_min(addr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+#endif
+}
+
+__device__ inline float atomicMin_system(float* addr, float val) {
+#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
+  return unsafeAtomicMin(addr, val);
+#else
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_min(addr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+#endif
+}
+
+__device__ inline double atomicMin(double* addr, double val) {
+#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
+  return unsafeAtomicMin(addr, val);
+#else
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_min(addr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+#endif
+}
+
+__device__ inline double atomicMin_system(double* addr, double val) {
+#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
+  return unsafeAtomicMin(addr, val);
+#else
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_min(addr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+#endif
+}
+
+__device__ inline int atomicMax(int* address, int val) {  // int: relaxed fetch-max, agent scope
+  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__ inline int atomicMax_system(int* address, int val) {  // int: relaxed fetch-max, system scope
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {  // macro defined outside this excerpt; it prefixes the braced statement
+    return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned int atomicMax(unsigned int* address, unsigned int val) {  // unsigned int: agent scope
+  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__ inline unsigned int atomicMax_system(unsigned int* address, unsigned int val) {  // unsigned int: system scope
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned long atomicMax(unsigned long* address, unsigned long val) {  // unsigned long: agent scope
+  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__ inline unsigned long atomicMax_system(unsigned long* address, unsigned long val) {  // unsigned long: system scope
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned long long atomicMax(unsigned long long* address,
+                                               unsigned long long val) {  // unsigned long long: agent scope
+  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__ inline unsigned long long atomicMax_system(unsigned long long* address,
+                                                      unsigned long long val) {  // unsigned long long: system scope
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+__device__ inline long long atomicMax(long long* address, long long val) {  // long long: agent scope
+  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__ inline long long atomicMax_system(long long* address, long long val) {  // long long: system scope
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline float atomicMax(float* addr, float val) {  // float: agent scope
+#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)  // when defined, defer to unsafeAtomicMax (declared outside this excerpt)
+  return unsafeAtomicMax(addr, val);
+#else
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_max(addr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+#endif
+}
+
+__device__ inline float atomicMax_system(float* addr, float val) {  // float: system scope on the #else path
+#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)  // NOTE(review): same unsafeAtomicMax call as the non-_system overload; confirm its scope
+  return unsafeAtomicMax(addr, val);
+#else
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_max(addr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+#endif
+}
+
+__device__ inline double atomicMax(double* addr, double val) {  // double: agent scope
+#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
+  return unsafeAtomicMax(addr, val);
+#else
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_max(addr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+#endif
+}
+
+__device__ inline double atomicMax_system(double* addr, double val) {  // double: system scope on the #else path
+#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)  // NOTE(review): same unsafeAtomicMax call as the non-_system overload; confirm its scope
+  return unsafeAtomicMax(addr, val);
+#else
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_max(addr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+#endif
+}
+
+__device__ inline unsigned int atomicInc(unsigned int* address, unsigned int val) {  // forwards to the AMDGCN 32-bit atomic-inc builtin
+  return __builtin_amdgcn_atomic_inc32(address, val, __ATOMIC_RELAXED, "agent");  // relaxed order; scope passed as the string "agent"
+}
+
+__device__ inline unsigned int atomicDec(unsigned int* address, unsigned int val) {  // forwards to the AMDGCN 32-bit atomic-dec builtin
+  return __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent");  // presumably val is the wrap bound, as for atomicInc; confirm
+}
+
+__device__ inline int atomicAnd(int* address, int val) {  // int: relaxed fetch-and, agent scope
+  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__ inline int atomicAnd_system(int* address, int val) {  // int: relaxed fetch-and, system scope
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {  // macro defined outside this excerpt; it prefixes the braced statement
+    return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned int atomicAnd(unsigned int* address, unsigned int val) {  // unsigned int: agent scope
+  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__ inline unsigned int atomicAnd_system(unsigned int* address, unsigned int val) {  // unsigned int: system scope
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned long atomicAnd(unsigned long* address, unsigned long val) {  // unsigned long: agent scope
+  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__ inline unsigned long atomicAnd_system(unsigned long* address, unsigned long val) {  // unsigned long: system scope
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned long long atomicAnd(unsigned long long* address,
+                                               unsigned long long val) {  // unsigned long long: agent scope
+  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__ inline unsigned long long atomicAnd_system(unsigned long long* address,
+                                                      unsigned long long val) {  // unsigned long long: system scope
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline int atomicOr(int* address, int val) {  // int: relaxed fetch-or, agent scope
+  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__ inline int atomicOr_system(int* address, int val) {  // int: relaxed fetch-or, system scope
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {  // macro defined outside this excerpt; it prefixes the braced statement
+    return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned int atomicOr(unsigned int* address, unsigned int val) {  // unsigned int: agent scope
+  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__ inline unsigned int atomicOr_system(unsigned int* address, unsigned int val) {  // unsigned int: system scope
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned long atomicOr(unsigned long* address, unsigned long val) {  // unsigned long: agent scope
+  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__ inline unsigned long atomicOr_system(unsigned long* address, unsigned long val) {  // unsigned long: system scope
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned long long atomicOr(unsigned long long* address, unsigned long long val) {  // unsigned long long: agent scope
+  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__ inline unsigned long long atomicOr_system(unsigned long long* address,
+                                                     unsigned long long val) {  // unsigned long long: system scope
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline int atomicXor(int* address, int val) {  // int: relaxed fetch-xor, agent scope
+  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__ inline int atomicXor_system(int* address, int val) {  // int: relaxed fetch-xor, system scope
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {  // macro defined outside this excerpt; it prefixes the braced statement
+    return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned int atomicXor(unsigned int* address, unsigned int val) {  // unsigned int: agent scope
+  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__ inline unsigned int atomicXor_system(unsigned int* address, unsigned int val) {  // unsigned int: system scope
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned long atomicXor(unsigned long* address, unsigned long val) {  // unsigned long: agent scope
+  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__ inline unsigned long atomicXor_system(unsigned long* address, unsigned long val) {  // unsigned long: system scope
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
+
+__device__ inline unsigned long long atomicXor(unsigned long long* address,
+                                               unsigned long long val) {  // unsigned long long: agent scope
+  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__ inline unsigned long long atomicXor_system(unsigned long long* address,
+                                                      unsigned long long val) {  // unsigned long long: system scope
+  __HIP_ATOMIC_BACKWARD_COMPAT_MEMORY {
+    return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+}
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_hip_common.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_hip_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..4fb7f6034a1f56408bdee8a168fa4e05c91aef64
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_hip_common.h
@@ -0,0 +1,32 @@
+/*
+Copyright (c) 2019 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H
+
+#if defined(__clang__) && defined(__HIP__)  // both macros must be defined for the "clang only" sections
+#define __HIP_CLANG_ONLY__ 1
+#else
+#define __HIP_CLANG_ONLY__ 0  // always defined, so it can be tested with plain #if
+#endif
+
+#endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h
new file mode 100644
index 0000000000000000000000000000000000000000..72a8f63f61ab5b13bb693b147e943f37477034c8
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h
@@ -0,0 +1,110 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_AMD_HIP_GL_INTEROP_H
+#define HIP_INCLUDE_AMD_HIP_GL_INTEROP_H
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/**
+ *
+ * @addtogroup GlobalDefs
+ * @{
+ *
+ */
+
+/**
+ * HIP Devices used by current OpenGL Context.
+ */
+typedef enum hipGLDeviceList {
+  hipGLDeviceListAll = 1,           ///< All hip devices used by current OpenGL context.
+  hipGLDeviceListCurrentFrame = 2,  ///< Hip devices used by current OpenGL context in current
+                                    ///< frame
+  hipGLDeviceListNextFrame = 3      ///< Hip devices used by current OpenGL context in next
+                                    ///< frame.
+} hipGLDeviceList;
+
+
+/** GLuint as uint.*/
+typedef unsigned int GLuint;
+/** GLenum as uint.*/
+typedef unsigned int GLenum;
+/**
+ * @}
+ */
+
+/**
+ * @defgroup GL OpenGL Interoperability
+ * @ingroup API
+ * @{
+ * This section describes OpenGL interoperability functions of HIP runtime API.
+ */
+
+/**
+ * @brief Queries devices associated with the current OpenGL context.
+ *
+ * @param [out] pHipDeviceCount - Pointer to the number of devices on the current GL context.
+ * @param [out] pHipDevices - Pointer to the devices on the current OpenGL context.
+ * @param [in] hipDeviceCount - Size of device.
+ * @param [in] deviceList - The setting of devices. It could be either hipGLDeviceListCurrentFrame
+ * for the devices used to render the current frame, or hipGLDeviceListAll for all devices.
+ * The default setting is Invalid deviceList value.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ */
+hipError_t hipGLGetDevices(unsigned int* pHipDeviceCount, int* pHipDevices,
+                           unsigned int hipDeviceCount, hipGLDeviceList deviceList);
+/**
+ * @brief Registers a GL Buffer for interop and returns corresponding graphics resource.
+ *
+ * @param [out] resource - Returns a pointer to the graphics resource.
+ * @param [in] buffer - Buffer to be registered.
+ * @param [in] flags - Register flags.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorUnknown, #hipErrorInvalidResourceHandle
+ *
+ */
+hipError_t hipGraphicsGLRegisterBuffer(hipGraphicsResource** resource, GLuint buffer,
+                                       unsigned int flags);
+/**
+ * @brief Registers a GL Image for interop and returns the corresponding graphics resource.
+ *
+ * @param [out] resource - Returns a pointer to the graphics resource.
+ * @param [in] image - Image to be registered.
+ * @param [in] target - Valid target value Id.
+ * @param [in] flags - Register flags.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorUnknown, #hipErrorInvalidResourceHandle
+ *
+ */
+hipError_t hipGraphicsGLRegisterImage(hipGraphicsResource** resource, GLuint image, GLenum target,
+                                      unsigned int flags);
+/**
+ * @}
+ */
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+#endif /* HIP_INCLUDE_AMD_HIP_GL_INTEROP_H */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h
new file mode 100644
index 0000000000000000000000000000000000000000..779eb621d8d9771d407aa48bbb73c8a9e86b8ed8
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h
@@ -0,0 +1,390 @@
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ *  @file  amd_detail/amd_hip_runtime.h
+ *  @brief Contains definitions of APIs for HIP runtime.
+ */
+
+// #pragma once
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
+
+#include <hip/amd_detail/amd_hip_common.h>
+
+#if !defined(__HIPCC_RTC__)
+#ifdef __cplusplus
+#include <cstddef>
+#else
+#include <stddef.h>
+#endif  // __cplusplus
+#endif  // !defined(__HIPCC_RTC__)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief Query the installed library build name.
+ *
+ * This function can be used even when the library is not initialized.
+ *
+ * @returns Returns a string describing the build version of the library.  The
+ * string is owned by the library.
+ */
+const char* amd_dbgapi_get_build_name();
+
+/**
+ * @brief Query the installed library git hash.
+ *
+ * This function can be used even when the library is not initialized.
+ *
+ * @returns Returns git hash of the library.
+ */
+const char* amd_dbgapi_get_git_hash();
+
+/**
+ * @brief Query the installed library build ID.
+ *
+ * This function can be used even when the library is not initialized.
+ *
+ * @returns Returns build ID of the library.
+ */
+size_t amd_dbgapi_get_build_id();
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+//---
+// Top part of file can be compiled with any compiler
+
+#if !defined(__HIPCC_RTC__)
+#ifdef __cplusplus
+#include <cmath>
+#include <cstdint>
+#include <tuple>
+#else
+#include <math.h>
+#include <stdint.h>
+#endif  // __cplusplus
+#endif  // !defined(__HIPCC_RTC__)
+
+#if __HIP_CLANG_ONLY__
+
+#if !defined(__align__)
+#define __align__(x) __attribute__((aligned(x)))
+#endif
+
+#define CUDA_SUCCESS hipSuccess
+
+#if !defined(__HIPCC_RTC__)
+#include <hip/hip_runtime_api.h>
+#include <hip/amd_detail/amd_hip_atomic.h>
+#include <hip/amd_detail/amd_device_functions.h>
+#include <hip/amd_detail/amd_surface_functions.h>
+#include <hip/amd_detail/texture_fetch_functions.h>
+#include <hip/amd_detail/texture_indirect_functions.h>
+extern int HIP_TRACE_API;
+#endif  // !defined(__HIPCC_RTC__)
+
+#ifdef __cplusplus
+#include <hip/amd_detail/hip_ldg.h>
+#endif
+
+#include <hip/amd_detail/host_defines.h>
+
+// TODO-HCC remove old definitions ; ~1602 hcc supports __HCC_ACCELERATOR__ define.
+#if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__)
+#define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
+#endif
+
+// Feature tests:
+#if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__
+// Device compile and not host compile:
+
+// 32-bit Atomics:
+#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
+#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
+#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
+#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
+#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1)
+
+// 64-bit Atomics:
+#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
+#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (1)
+
+// Doubles
+#define __HIP_ARCH_HAS_DOUBLES__ (1)
+
+// warp cross-lane operations:
+#define __HIP_ARCH_HAS_WARP_VOTE__ (1)
+#define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
+#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
+#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
+
+// sync
+#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1)
+#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
+
+// misc
+#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
+#define __HIP_ARCH_HAS_3DGRID__ (1)
+#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
+
+#endif /* Device feature flags */
+
+
+#define launch_bounds_impl0(requiredMaxThreadsPerBlock)                                            \
+  __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))  // one-argument form: work-group size range 1..max only
+#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)                \
+  __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock),                       \
+                 amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))  // two-argument form: second value is passed to amdgpu_waves_per_eu
+#define select_impl_(_1, _2, impl_, ...) impl_  // expands to its third argument
+#define __launch_bounds__(...)                                                                     \
+  select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0, )(__VA_ARGS__)  // one argument selects impl0, two select impl1
+
+#if !defined(__HIPCC_RTC__)
+__host__ inline void* __get_dynamicgroupbaseptr() { return nullptr; }  // host-side definition; always yields a null pointer
+#endif  // !defined(__HIPCC_RTC__)
+
+// End doxygen API:
+/**
+ *   @}
+ */
+
+//
+// hip-clang functions
+//
+#if !defined(__HIPCC_RTC__)
+#define HIP_KERNEL_NAME(...) __VA_ARGS__
+#define HIP_SYMBOL(X) X
+
+typedef int hipLaunchParm;
+
+template <std::size_t n, typename... Ts,
+          typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
+void pArgs(const std::tuple<Ts...>&, void*) {}  // terminating overload: n has reached the tuple size, nothing to record
+
+template <std::size_t n, typename... Ts,
+          typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
+void pArgs(const std::tuple<Ts...>& formals, void** _vargs) {  // stores the address of element n in _vargs[n], then handles n + 1
+  using T = typename std::tuple_element<n, std::tuple<Ts...>>::type;
+
+  static_assert(!std::is_reference<T>{},
+                "A __global__ function cannot have a reference as one of its "
+                "arguments.");
+#if defined(HIP_STRICT)  // the trivially-copyable check is enforced only when HIP_STRICT is defined
+  static_assert(std::is_trivially_copyable<T>{},
+                "Only TriviallyCopyable types can be arguments to a __global__ "
+                "function");
+#endif
+  _vargs[n] = const_cast<void*>(reinterpret_cast<const void*>(&std::get<n>(formals)));  // points into formals; no copy is made
+  return pArgs<n + 1>(formals, _vargs);
+}
+
+template <typename... Formals, typename... Actuals>
+std::tuple<Formals...> validateArgsCountType(void (*kernel)(Formals...),
+                                             std::tuple<Actuals...>(actuals)) {  // kernel is used only to deduce Formals
+  static_assert(sizeof...(Formals) == sizeof...(Actuals), "Argument Count Mismatch");  // counts must match at compile time
+  std::tuple<Formals...> to_formals{std::move(actuals)};  // converts each actual to the kernel's declared parameter type
+  return to_formals;
+}
+
+#if defined(HIP_TEMPLATE_KERNEL_LAUNCH)  // opt-in function-template launcher; the #else branch is the macro form
+template <typename... Args, typename F = void (*)(Args...)>
+void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
+                        std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
+  constexpr size_t count = sizeof...(Args);
+  auto tup_ = std::tuple<Args...>{args...};
+  auto tup = validateArgsCountType(kernel, tup_);  // static count check plus conversion to the kernel's formal types
+  void* _Args[count];  // NOTE(review): zero-length when Args is empty; confirm the compiler accepts this
+  pArgs<0>(tup, _Args);  // fills _Args with the address of each element of tup
+
+  auto k = reinterpret_cast<void*>(kernel);
+  hipLaunchKernel(k, numBlocks, dimBlocks, _Args, sharedMemBytes, stream);  // NOTE(review): any result of this call is not inspected
+}
+#else
+#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...)  \
+  do {                                                                                             \
+    kernelName<<<(numBlocks), (numThreads), (memPerBlock), (streamId)>>>(__VA_ARGS__);             \
+  } while (0)  // triple-chevron launch wrapped in do/while(0) so the macro acts as one statement
+
+#define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)  // parenthesizes kernelName and forwards the rest
+#endif
+
+#include <hip/hip_runtime_api.h>
+#endif  // !defined(__HIPCC_RTC__)
+
+#if defined(__HIPCC_RTC__)  // this dim3 definition is compiled only when __HIPCC_RTC__ is defined
+typedef struct dim3 {
+  __hip_uint32_t x;  ///< x
+  __hip_uint32_t y;  ///< y
+  __hip_uint32_t z;  ///< z
+#ifdef __cplusplus
+  constexpr __device__ dim3(__hip_uint32_t _x = 1, __hip_uint32_t _y = 1, __hip_uint32_t _z = 1)
+      : x(_x), y(_y), z(_z) {};  // every component defaults to 1
+#endif
+} dim3;
+#endif  // defined(__HIPCC_RTC__)
+
+#pragma push_macro("__DEVICE__")  // saves any existing __DEVICE__ definition before redefining it
+#define __DEVICE__ static __device__ __forceinline__
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(unsigned int);  // declared here, defined outside this header
+__DEVICE__ unsigned int __hip_get_thread_idx_x() { return __ockl_get_local_id(0); }  // size_t result narrows to unsigned int
+__DEVICE__ unsigned int __hip_get_thread_idx_y() { return __ockl_get_local_id(1); }
+__DEVICE__ unsigned int __hip_get_thread_idx_z() { return __ockl_get_local_id(2); }
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(unsigned int);
+__DEVICE__ unsigned int __hip_get_block_idx_x() { return __ockl_get_group_id(0); }  // dimension index 0/1/2 selects x/y/z
+__DEVICE__ unsigned int __hip_get_block_idx_y() { return __ockl_get_group_id(1); }
+__DEVICE__ unsigned int __hip_get_block_idx_z() { return __ockl_get_group_id(2); }
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(unsigned int);
+__DEVICE__ unsigned int __hip_get_block_dim_x() { return __ockl_get_local_size(0); }
+__DEVICE__ unsigned int __hip_get_block_dim_y() { return __ockl_get_local_size(1); }
+__DEVICE__ unsigned int __hip_get_block_dim_z() { return __ockl_get_local_size(2); }
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_num_groups(unsigned int);
+__DEVICE__ unsigned int __hip_get_grid_dim_x() { return __ockl_get_num_groups(0); }
+__DEVICE__ unsigned int __hip_get_grid_dim_y() { return __ockl_get_num_groups(1); }
+__DEVICE__ unsigned int __hip_get_grid_dim_z() { return __ockl_get_num_groups(2); }
+
+#define __HIP_DEVICE_BUILTIN(DIMENSION, FUNCTION)                                                  \
+  __declspec(property(get = __get_##DIMENSION)) unsigned int DIMENSION;                            \
+  __DEVICE__ unsigned int __get_##DIMENSION(void) { return FUNCTION; }  // property DIMENSION reads through getter __get_DIMENSION
+
+struct __hip_builtin_threadIdx_t {  // x/y/z are properties backed by __hip_get_thread_idx_*
+  __HIP_DEVICE_BUILTIN(x, __hip_get_thread_idx_x());
+  __HIP_DEVICE_BUILTIN(y, __hip_get_thread_idx_y());
+  __HIP_DEVICE_BUILTIN(z, __hip_get_thread_idx_z());
+#ifdef __cplusplus
+  __device__ operator dim3() const { return dim3(x, y, z); }  // implicit conversion to dim3 from the three properties
+#endif
+};
+
+struct __hip_builtin_blockIdx_t {  // x/y/z are properties backed by __hip_get_block_idx_*
+  __HIP_DEVICE_BUILTIN(x, __hip_get_block_idx_x());
+  __HIP_DEVICE_BUILTIN(y, __hip_get_block_idx_y());
+  __HIP_DEVICE_BUILTIN(z, __hip_get_block_idx_z());
+#ifdef __cplusplus
+  __device__ operator dim3() const { return dim3(x, y, z); }
+#endif
+};
+
+struct __hip_builtin_blockDim_t {  // x/y/z are properties backed by __hip_get_block_dim_*
+  __HIP_DEVICE_BUILTIN(x, __hip_get_block_dim_x());
+  __HIP_DEVICE_BUILTIN(y, __hip_get_block_dim_y());
+  __HIP_DEVICE_BUILTIN(z, __hip_get_block_dim_z());
+#ifdef __cplusplus
+  __device__ operator dim3() const { return dim3(x, y, z); }
+#endif
+};
+
+struct __hip_builtin_gridDim_t {  // x/y/z are properties backed by __hip_get_grid_dim_*
+  __HIP_DEVICE_BUILTIN(x, __hip_get_grid_dim_x());
+  __HIP_DEVICE_BUILTIN(y, __hip_get_grid_dim_y());
+  __HIP_DEVICE_BUILTIN(z, __hip_get_grid_dim_z());
+#ifdef __cplusplus
+  __device__ operator dim3() const { return dim3(x, y, z); }
+#endif
+};
+
+#undef __HIP_DEVICE_BUILTIN  // helper macro is local to the four structs above
+#pragma pop_macro("__DEVICE__")  // restores the __DEVICE__ definition saved by the matching push_macro
+
+extern const __device__ __attribute__((weak)) __hip_builtin_threadIdx_t threadIdx;  // weak, const device-side declarations of the builtin variables
+extern const __device__ __attribute__((weak)) __hip_builtin_blockIdx_t blockIdx;
+extern const __device__ __attribute__((weak)) __hip_builtin_blockDim_t blockDim;
+extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;
+
+#define hipThreadIdx_x threadIdx.x
+#define hipThreadIdx_y threadIdx.y
+#define hipThreadIdx_z threadIdx.z
+
+#define hipBlockIdx_x blockIdx.x
+#define hipBlockIdx_y blockIdx.y
+#define hipBlockIdx_z blockIdx.z
+
+#define hipBlockDim_x blockDim.x
+#define hipBlockDim_y blockDim.y
+#define hipBlockDim_z blockDim.z
+
+#define hipGridDim_x gridDim.x
+#define hipGridDim_y gridDim.y
+#define hipGridDim_z gridDim.z
+
+#if !defined(__HIPCC_RTC__)
+#include <hip/amd_detail/amd_math_functions.h>
+#endif
+
+#if __HIP_HCC_COMPAT_MODE__
+// Define HCC work item functions in terms of HIP builtin variables.
+#pragma push_macro("__DEFINE_HCC_FUNC")
+#define __DEFINE_HCC_FUNC(hc_fun, hip_var)                                                         \
+  inline __device__ __attribute__((always_inline)) unsigned int hc_get_##hc_fun(unsigned int i) {  \
+    if (i == 0)                                                                                    \
+      return hip_var.x;                                                                            \
+    else if (i == 1)                                                                               \
+      return hip_var.y;                                                                            \
+    else                                                                                           \
+      return hip_var.z;                                                                            \
+  }  // index 0 -> .x, 1 -> .y, any other value -> .z
+
+__DEFINE_HCC_FUNC(workitem_id, threadIdx)  // defines hc_get_workitem_id(i)
+__DEFINE_HCC_FUNC(group_id, blockIdx)  // defines hc_get_group_id(i)
+__DEFINE_HCC_FUNC(group_size, blockDim)  // defines hc_get_group_size(i)
+__DEFINE_HCC_FUNC(num_groups, gridDim)  // defines hc_get_num_groups(i)
+#pragma pop_macro("__DEFINE_HCC_FUNC")
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_id(unsigned int);
+inline __device__ __attribute__((always_inline)) unsigned int hc_get_workitem_absolute_id(int dim) {
+  return (unsigned int)__ockl_get_global_id(dim);  // int dim converts to the unsigned parameter; size_t result is cast to unsigned int
+}
+
+#endif  // __HIP_HCC_COMPAT_MODE__
+
+#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+#if !defined(__HIPCC_RTC__)
+// Support std::complex.
+#if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
+#pragma push_macro("__CUDA__")
+#define __CUDA__
+#include <__clang_cuda_math_forward_declares.h>
+#include <__clang_cuda_complex_builtins.h>
+// Workaround for using libc++ with HIP-Clang.
+// The following headers requires clang include path before standard C++ include path.
+// However libc++ include path requires to be before clang include path.
+// To workaround this, we pass -isystem with the parent directory of clang include
+// path instead of the clang include path itself.
+#include <include/cuda_wrappers/algorithm>
+#include <include/cuda_wrappers/complex>
+#include <include/cuda_wrappers/new>
+#undef __CUDA__
+#pragma pop_macro("__CUDA__")
+#endif  // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
+#endif  // !defined(__HIPCC_RTC__)
+#endif  // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+#endif  // __HIP_CLANG_ONLY__
+
+#endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..e477ad8b67f630f17a9814448749ffa79b087a81
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h
@@ -0,0 +1,207 @@
+/*
+Copyright (c) 2022 - Present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#ifndef HIP_INCLUDE_HIP_HIP_RUNTIME_PT_API_H
+#define HIP_INCLUDE_HIP_HIP_RUNTIME_PT_API_H
+
+#if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
+
+/// hipStreamPerThread implementation: __HIP_API_SPT(api) maps api to api##_spt when enabled.
+#if defined(HIP_API_PER_THREAD_DEFAULT_STREAM)
+#define __HIP_STREAM_PER_THREAD
+#define __HIP_API_SPT(api) api##_spt
+#else  // HIP_API_PER_THREAD_DEFAULT_STREAM not defined: __HIP_API_SPT(api) is just api
+#define __HIP_API_SPT(api) api
+#endif  // defined(HIP_API_PER_THREAD_DEFAULT_STREAM)
+
+#if defined(__HIP_STREAM_PER_THREAD)  // redirect the public API names to their _spt variants
+// Memory APIs
+#define hipMemcpy __HIP_API_SPT(hipMemcpy)
+#define hipMemcpyToSymbol __HIP_API_SPT(hipMemcpyToSymbol)
+#define hipMemcpyFromSymbol __HIP_API_SPT(hipMemcpyFromSymbol)
+#define hipMemcpy2D __HIP_API_SPT(hipMemcpy2D)
+#define hipMemcpy2DFromArray __HIP_API_SPT(hipMemcpy2DFromArray)
+#define hipMemcpy3D __HIP_API_SPT(hipMemcpy3D)
+#define hipMemset __HIP_API_SPT(hipMemset)
+#define hipMemset2D __HIP_API_SPT(hipMemset2D)
+#define hipMemset3D __HIP_API_SPT(hipMemset3D)
+#define hipMemcpyAsync __HIP_API_SPT(hipMemcpyAsync)
+#define hipMemset3DAsync __HIP_API_SPT(hipMemset3DAsync)
+#define hipMemset2DAsync __HIP_API_SPT(hipMemset2DAsync)
+#define hipMemsetAsync __HIP_API_SPT(hipMemsetAsync)
+#define hipMemcpy3DAsync __HIP_API_SPT(hipMemcpy3DAsync)
+#define hipMemcpy2DAsync __HIP_API_SPT(hipMemcpy2DAsync)
+#define hipMemcpyFromSymbolAsync __HIP_API_SPT(hipMemcpyFromSymbolAsync)
+#define hipMemcpyToSymbolAsync __HIP_API_SPT(hipMemcpyToSymbolAsync)
+#define hipMemcpyFromArray __HIP_API_SPT(hipMemcpyFromArray)
+#define hipMemcpy2DToArray __HIP_API_SPT(hipMemcpy2DToArray)
+#define hipMemcpy2DFromArrayAsync __HIP_API_SPT(hipMemcpy2DFromArrayAsync)
+#define hipMemcpy2DToArrayAsync __HIP_API_SPT(hipMemcpy2DToArrayAsync)
+
+// Stream APIs
+#define hipStreamSynchronize __HIP_API_SPT(hipStreamSynchronize)
+#define hipStreamQuery __HIP_API_SPT(hipStreamQuery)
+#define hipStreamGetFlags __HIP_API_SPT(hipStreamGetFlags)
+#define hipStreamGetPriority __HIP_API_SPT(hipStreamGetPriority)
+#define hipStreamWaitEvent __HIP_API_SPT(hipStreamWaitEvent)
+#define hipStreamAddCallback __HIP_API_SPT(hipStreamAddCallback)
+#define hipLaunchHostFunc __HIP_API_SPT(hipLaunchHostFunc)
+
+// Event APIs
+#define hipEventRecord __HIP_API_SPT(hipEventRecord)
+
+// Launch APIs
+#define hipLaunchKernel __HIP_API_SPT(hipLaunchKernel)
+#define hipLaunchCooperativeKernel __HIP_API_SPT(hipLaunchCooperativeKernel)
+
+// Graph APIs
+#define hipGraphLaunch __HIP_API_SPT(hipGraphLaunch)
+#define hipStreamBeginCapture __HIP_API_SPT(hipStreamBeginCapture)
+#define hipStreamEndCapture __HIP_API_SPT(hipStreamEndCapture)
+#define hipStreamIsCapturing __HIP_API_SPT(hipStreamIsCapturing)
+#define hipStreamGetCaptureInfo __HIP_API_SPT(hipStreamGetCaptureInfo)
+#define hipStreamGetCaptureInfo_v2 __HIP_API_SPT(hipStreamGetCaptureInfo_v2)
+
+// Driver Entry Point API
+#define hipGetDriverEntryPoint __HIP_API_SPT(hipGetDriverEntryPoint)
+#endif  // defined(__HIP_STREAM_PER_THREAD)
+
+#ifdef __cplusplus
+extern "C" {  // the _spt entry points below are declared with C linkage
+#endif  // __cplusplus
+
+hipError_t hipMemcpy_spt(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind);
+
+hipError_t hipMemcpyToSymbol_spt(const void* symbol, const void* src, size_t sizeBytes,
+                                 size_t offset __dparm(0),  // __dparm is defined outside this header
+                                 hipMemcpyKind kind __dparm(hipMemcpyHostToDevice));
+
+hipError_t hipMemcpyFromSymbol_spt(void* dst, const void* symbol, size_t sizeBytes,
+                                   size_t offset __dparm(0),
+                                   hipMemcpyKind kind __dparm(hipMemcpyDeviceToHost));
+
+hipError_t hipMemcpy2D_spt(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
+                           size_t height, hipMemcpyKind kind);
+
+hipError_t hipMemcpy2DFromArray_spt(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffset,
+                                    size_t hOffset, size_t width, size_t height,
+                                    hipMemcpyKind kind);
+
+hipError_t hipMemcpy3D_spt(const struct hipMemcpy3DParms* p);
+
+hipError_t hipMemset_spt(void* dst, int value, size_t sizeBytes);
+
+hipError_t hipMemsetAsync_spt(void* dst, int value, size_t sizeBytes,
+                              hipStream_t stream __dparm(hipStreamPerThread));
+
+hipError_t hipMemset2D_spt(void* dst, size_t pitch, int value, size_t width, size_t height);
+
+hipError_t hipMemset2DAsync_spt(void* dst, size_t pitch, int value, size_t width, size_t height,
+                                hipStream_t stream __dparm(hipStreamPerThread));
+
+hipError_t hipMemset3DAsync_spt(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent,
+                                hipStream_t stream __dparm(hipStreamPerThread));
+
+hipError_t hipMemset3D_spt(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent);
+
+hipError_t hipMemcpyAsync_spt(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
+                              hipStream_t stream __dparm(hipStreamPerThread));
+
+hipError_t hipMemcpy3DAsync_spt(const hipMemcpy3DParms* p,
+                                hipStream_t stream __dparm(hipStreamPerThread));
+
+hipError_t hipMemcpy2DAsync_spt(void* dst, size_t dpitch, const void* src, size_t spitch,
+                                size_t width, size_t height, hipMemcpyKind kind,
+                                hipStream_t stream __dparm(hipStreamPerThread));
+
+hipError_t hipMemcpyFromSymbolAsync_spt(void* dst, const void* symbol, size_t sizeBytes,
+                                        size_t offset, hipMemcpyKind kind,
+                                        hipStream_t stream __dparm(hipStreamPerThread));
+
+hipError_t hipMemcpyToSymbolAsync_spt(const void* symbol, const void* src, size_t sizeBytes,
+                                      size_t offset, hipMemcpyKind kind,
+                                      hipStream_t stream __dparm(hipStreamPerThread));
+
+hipError_t hipMemcpyFromArray_spt(void* dst, hipArray_const_t src, size_t wOffsetSrc,
+                                  size_t hOffset, size_t count, hipMemcpyKind kind);
+
+hipError_t hipMemcpy2DToArray_spt(hipArray_t dst, size_t wOffset, size_t hOffset, const void* src,
+                                  size_t spitch, size_t width, size_t height, hipMemcpyKind kind);
+
+hipError_t hipMemcpy2DFromArrayAsync_spt(void* dst, size_t dpitch, hipArray_const_t src,
+                                         size_t wOffsetSrc, size_t hOffsetSrc, size_t width,
+                                         size_t height, hipMemcpyKind kind,
+                                         hipStream_t stream __dparm(hipStreamPerThread));
+
+hipError_t hipMemcpy2DToArrayAsync_spt(hipArray_t dst, size_t wOffset, size_t hOffset,
+                                       const void* src, size_t spitch, size_t width, size_t height,
+                                       hipMemcpyKind kind,
+                                       hipStream_t stream __dparm(hipStreamPerThread));
+
+hipError_t hipStreamQuery_spt(hipStream_t stream);
+
+hipError_t hipStreamSynchronize_spt(hipStream_t stream);
+
+hipError_t hipStreamGetPriority_spt(hipStream_t stream, int* priority);
+
+hipError_t hipStreamWaitEvent_spt(hipStream_t stream, hipEvent_t event,
+                                  unsigned int flags __dparm(0));
+
+hipError_t hipStreamGetFlags_spt(hipStream_t stream, unsigned int* flags);
+
+hipError_t hipStreamAddCallback_spt(hipStream_t stream, hipStreamCallback_t callback,
+                                    void* userData, unsigned int flags);
+
+hipError_t hipEventRecord_spt(hipEvent_t event, hipStream_t stream __dparm(hipStreamPerThread));
+
+hipError_t hipLaunchCooperativeKernel_spt(const void* f, dim3 gridDim, dim3 blockDim,
+                                          void** kernelParams, uint32_t sharedMemBytes,
+                                          hipStream_t hStream __dparm(hipStreamPerThread));
+
+hipError_t hipLaunchKernel_spt(const void* function_address, dim3 numBlocks, dim3 dimBlocks,
+                               void** args, size_t sharedMemBytes,
+                               hipStream_t stream __dparm(hipStreamPerThread));
+
+hipError_t hipGraphLaunch_spt(hipGraphExec_t graphExec, hipStream_t stream);
+hipError_t hipStreamBeginCapture_spt(hipStream_t stream, hipStreamCaptureMode mode);
+hipError_t hipStreamEndCapture_spt(hipStream_t stream, hipGraph_t* pGraph);
+hipError_t hipStreamIsCapturing_spt(hipStream_t stream, hipStreamCaptureStatus* pCaptureStatus);
+hipError_t hipStreamGetCaptureInfo_spt(hipStream_t stream, hipStreamCaptureStatus* pCaptureStatus,
+                                       unsigned long long* pId);
+hipError_t hipStreamGetCaptureInfo_v2_spt(hipStream_t stream,
+                                          hipStreamCaptureStatus* captureStatus_out,
+                                          unsigned long long* id_out, hipGraph_t* graph_out,
+                                          const hipGraphNode_t** dependencies_out,
+                                          size_t* numDependencies_out);
+hipError_t hipLaunchHostFunc_spt(hipStream_t stream, hipHostFn_t fn, void* userData);
+hipError_t hipGetDriverEntryPoint_spt(const char* symbol, void** funcPtr, unsigned long long flags,
+                                      hipDriverEntryPointQueryResult* status);
+
+
+#ifdef __cplusplus
+}
+#endif  // __cplusplus (closes the extern "C" block)
+
+#endif  // defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
+#endif  // HIP_INCLUDE_HIP_HIP_RUNTIME_PT_API_H
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h
new file mode 100644
index 0000000000000000000000000000000000000000..0366f278ede7667299edd63175258974dba021fd
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h
@@ -0,0 +1,595 @@
+/*
+Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#ifdef __cplusplus
+
+#pragma push_macro("__HIP_ATOMICS_IGNORE_DENORMAL_MODE")
+#if defined(__has_extension) && __has_extension(clang_atomic_attributes)
+#define __HIP_ATOMICS_IGNORE_DENORMAL_MODE [[clang::atomic(ignore_denormal_mode)]]
+#else  // clang_atomic_attributes extension not reported: the macro expands to nothing
+#define __HIP_ATOMICS_IGNORE_DENORMAL_MODE
+#endif  // clang_atomic_attributes
+
+/**
+ * @brief Unsafe floating point rmw atomic add.
+ *
+ * Performs a relaxed read-modify-write floating point atomic add with
+ * device memory scope. The original value at \p addr is returned and
+ * the value at \p addr is updated to the original value plus \p value.
+ *
+ * @note This operation currently only takes a different code path for
+ * the gfx90a target. Other devices continue to use safe atomics.
+ *
+ * It can be used to generate code that uses fast hardware floating point atomic
+ * operations which may handle rounding and subnormal values differently than
+ * non-atomic floating point operations.
+ *
+ * The operation is not always safe and can have undefined behavior unless
+ * the following conditions are met:
+ *
+ * - \p addr is at least 4-byte aligned
+ * - If \p addr is a global segment address, it is in a coarse grain allocation.
+ * Passing in global segment addresses in fine grain allocations will result in
+ * undefined behavior and is not supported.
+ *
+ * @param [in,out] addr Pointer to the value to be incremented by \p value.
+ * @param [in] value Amount by which the value at \p addr is incremented.
+ * @return Original value contained in \p addr.
+ */
+__device__ inline float unsafeAtomicAdd(float* addr, float value) {
+#if defined(__gfx90a__) && __has_builtin(__builtin_amdgcn_is_shared) &&                            \
+    __has_builtin(__builtin_amdgcn_is_private) &&                                                  \
+    __has_builtin(__builtin_amdgcn_ds_atomic_fadd_f32) &&                                          \
+    __has_builtin(__builtin_amdgcn_global_atomic_fadd_f32)
+  if (__builtin_amdgcn_is_shared((const __attribute__((address_space(0))) void*)addr))
+    return __builtin_amdgcn_ds_atomic_fadd_f32(addr, value);
+  else if (__builtin_amdgcn_is_private((const __attribute__((address_space(0))) void*)addr)) {
+    float temp = *addr;  // is_private branch: plain (non-atomic) load, add and store
+    *addr = temp + value;
+    return temp;
+  } else
+    return __builtin_amdgcn_global_atomic_fadd_f32(addr, value);
+#elif __has_builtin(__hip_atomic_fetch_add)
+  __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
+    return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+#else  // no __hip_atomic_fetch_add builtin: fall back to __atomic_fetch_add
+  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
+#endif
+}
+
+/**
+ * @brief Unsafe floating point rmw atomic max.
+ *
+ * Performs a relaxed read-modify-write floating point atomic max with
+ * device memory scope. The original value at \p addr is returned and
+ * the value at \p addr is replaced by \p val if \p val is greater.
+ *
+ * @note This operation is currently identical to that performed by
+ * atomicMax and is included for completeness.
+ *
+ * @param [in,out] addr Pointer to the value to be updated.
+ * @param [in] val Value used to update the value at \p addr.
+ * @return Original value contained in \p addr.
+ */
+__device__ inline float unsafeAtomicMax(float* addr, float val) {
+#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
+  __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
+    float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    bool done = false;
+    while (!done && value < val) {  // a failed exchange refreshes value, so the test is redone
+      done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
+                                                  __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    }
+    return value;
+  }
+#else  // no __hip_atomic_* builtins: compare-exchange on the unsigned int bit pattern
+  unsigned int* uaddr = (unsigned int*)addr;
+  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+  bool done = false;
+  while (!done && __uint_as_float(value) < val) {
+    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false, __ATOMIC_RELAXED,
+                                       __ATOMIC_RELAXED);
+  }
+  return __uint_as_float(value);
+#endif
+}
+
+/**
+ * @brief Unsafe floating point rmw atomic min.
+ *
+ * Performs a relaxed read-modify-write floating point atomic min with
+ * device memory scope. The original value at \p addr is returned and
+ * the value at \p addr is replaced by \p val if \p val is lesser.
+ *
+ * @note This operation is currently identical to that performed by
+ * atomicMin and is included for completeness.
+ *
+ * @param [in,out] addr Pointer to the value to be updated.
+ * @param [in] val Value used to update the value at \p addr.
+ * @return Original value contained in \p addr.
+ */
+__device__ inline float unsafeAtomicMin(float* addr, float val) {
+#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
+  __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
+    float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    bool done = false;
+    while (!done && value > val) {  // a failed exchange refreshes value, so the test is redone
+      done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
+                                                  __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    }
+    return value;
+  }
+#else  // no __hip_atomic_* builtins: compare-exchange on the unsigned int bit pattern
+  unsigned int* uaddr = (unsigned int*)addr;
+  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+  bool done = false;
+  while (!done && __uint_as_float(value) > val) {
+    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false, __ATOMIC_RELAXED,
+                                       __ATOMIC_RELAXED);
+  }
+  return __uint_as_float(value);
+#endif
+}
+
+/**
+ * @brief Unsafe double precision rmw atomic add.
+ *
+ * Performs a relaxed read-modify-write double precision atomic add with
+ * device memory scope. The original value at \p addr is returned and
+ * the value at \p addr is updated to the original value plus \p value.
+ *
+ * @note This operation currently only takes a different code path for
+ * the gfx90a target. Other devices continue to use safe atomics.
+ *
+ * It can be used to generate code that uses fast hardware floating point atomic
+ * operations which may handle rounding and subnormal values differently than
+ * non-atomic floating point operations.
+ *
+ * The operation is not always safe and can have undefined behavior unless
+ * the following conditions are met:
+ *
+ * - \p addr is at least 8-byte aligned
+ * - If \p addr is a global segment address, it is in a coarse grain allocation.
+ * Passing in global segment addresses in fine grain allocations will result in
+ * undefined behavior and is not supported.
+ *
+ * @param [in,out] addr Pointer to the value to be updated.
+ * @param [in] value Amount by which the value at \p addr is incremented.
+ * @return Original value contained in \p addr.
+ */
+__device__ inline double unsafeAtomicAdd(double* addr, double value) {
+#if defined(__gfx90a__) && __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f64)
+  return __builtin_amdgcn_flat_atomic_fadd_f64(addr, value);
+#elif __has_builtin(__hip_atomic_fetch_add)
+  __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
+    return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+#else  // no __hip_atomic_fetch_add builtin: fall back to __atomic_fetch_add
+  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
+#endif
+}
+
+/**
+ * @brief Unsafe double precision rmw atomic max.
+ *
+ * Performs a relaxed read-modify-write double precision atomic max with
+ * device memory scope. The original value at \p addr is returned and
+ * the value at \p addr is updated with \p val if \p val is greater.
+ *
+ * @note The flat_atomic_fmax_f64 builtin is used only when __gfx90a__ or
+ * __gfx94plus_clr__ is defined; all other builds use a compare-exchange loop.
+ *
+ * It can be used to generate code that uses fast hardware floating point atomic
+ * operations which may handle rounding and subnormal values differently than
+ * non-atomic floating point operations.
+ *
+ * The operation is not always safe and can have undefined behavior unless
+ * the following conditions are met:
+ *
+ * - \p addr is at least 8-byte aligned
+ * - If \p addr is a global segment address, it is in a coarse grain allocation.
+ * Passing in global segment addresses in fine grain allocations will result in
+ * undefined behavior and is not supported.
+ *
+ * @param [in,out] addr Pointer to the value to be updated.
+ * @param [in] val Value used to update the contents at \p addr.
+ * @return Original value contained at \p addr.
+ */
+__device__ inline double unsafeAtomicMax(double* addr, double val) {
+#if (defined(__gfx90a__) || defined(__gfx94plus_clr__)) &&                                         \
+    __has_builtin(__builtin_amdgcn_flat_atomic_fmax_f64)
+  return __builtin_amdgcn_flat_atomic_fmax_f64(addr, val);
+#else  // builtin path not selected: emulate with a compare-exchange loop
+#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
+  __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
+    double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    bool done = false;
+    while (!done && value < val) {  // a failed exchange refreshes value, so the test is redone
+      done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
+                                                  __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    }
+    return value;
+  }
+#else  // no __hip_atomic_* builtins: compare-exchange on the unsigned long long bit pattern
+  unsigned long long* uaddr = (unsigned long long*)addr;
+  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+  bool done = false;
+  while (!done && __longlong_as_double(value) < val) {
+    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
+                                       __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+  }
+  return __longlong_as_double(value);
+#endif  // __hip_atomic_load && __hip_atomic_compare_exchange_strong
+#endif  // flat_atomic_fmax_f64 builtin path
+}
+
+/**
+ * @brief Unsafe double precision rmw atomic min.
+ *
+ * Performs a relaxed read-modify-write double precision atomic min with
+ * device memory scope. The original value at \p addr is returned and
+ * the value at \p addr is updated with \p val if \p val is lesser.
+ *
+ * @note The flat_atomic_fmin_f64 builtin is used only when __gfx90a__ or
+ * __gfx94plus_clr__ is defined; all other builds use a compare-exchange loop.
+ *
+ * It can be used to generate code that uses fast hardware floating point atomic
+ * operations which may handle rounding and subnormal values differently than
+ * non-atomic floating point operations.
+ *
+ * The operation is not always safe and can have undefined behavior unless
+ * the following conditions are met:
+ *
+ * - \p addr is at least 8-byte aligned
+ * - If \p addr is a global segment address, it is in a coarse grain allocation.
+ * Passing in global segment addresses in fine grain allocations will result in
+ * undefined behavior and is not supported.
+ *
+ * @param [in,out] addr Pointer to the value to be updated.
+ * @param [in] val Value used to update the contents at \p addr.
+ * @return Original value contained at \p addr.
+ */
+__device__ inline double unsafeAtomicMin(double* addr, double val) {
+#if (defined(__gfx90a__) || defined(__gfx94plus_clr__)) &&                                         \
+    __has_builtin(__builtin_amdgcn_flat_atomic_fmin_f64)
+  return __builtin_amdgcn_flat_atomic_fmin_f64(addr, val);
+#else  // builtin path not selected: emulate with a compare-exchange loop
+#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
+  __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
+    double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    bool done = false;
+    while (!done && value > val) {  // a failed exchange refreshes value, so the test is redone
+      done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
+                                                  __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    }
+    return value;
+  }
+#else  // no __hip_atomic_* builtins: compare-exchange on the unsigned long long bit pattern
+  unsigned long long* uaddr = (unsigned long long*)addr;
+  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+  bool done = false;
+  while (!done && __longlong_as_double(value) > val) {
+    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
+                                       __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+  }
+  return __longlong_as_double(value);
+#endif  // __hip_atomic_load && __hip_atomic_compare_exchange_strong
+#endif  // flat_atomic_fmin_f64 builtin path
+}
+
+/**
+ * @brief Safe floating point rmw atomic add.
+ *
+ * Performs a relaxed read-modify-write floating point atomic add with
+ * device memory scope. The original value at \p addr is returned and
+ * the value at \p addr is updated to the original value plus \p value.
+ *
+ * @note This operation ensures that, on all targets, we produce safe atomics.
+ * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
+ *
+ * @param [in,out] addr Pointer to the value to be incremented by \p value.
+ * @param [in] value Amount by which the value at \p addr is incremented.
+ * @return Original value contained in \p addr.
+ */
+__device__ inline float safeAtomicAdd(float* addr, float value) {
+#if defined(__gfx908__) || ((defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && \
+                            !__has_builtin(__hip_atomic_fetch_add))
+  // On gfx908, we can generate unsafe FP32 atomic add that does not follow all
+  // IEEE rules when -munsafe-fp-atomics is passed. Do a CAS loop emulation instead.
+  // On gfx90a, gfx942 and gfx950, if we do not have the __hip_atomic_fetch_add builtin, we
+  // need to force a CAS loop here.
+  float old_val;
+#if __has_builtin(__hip_atomic_load)
+  __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
+    old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+#else   // !__has_builtin(__hip_atomic_load)
+  old_val =
+      __uint_as_float(__atomic_load_n(reinterpret_cast<unsigned int*>(addr), __ATOMIC_RELAXED));
+#endif  // __has_builtin(__hip_atomic_load)
+  float expected, temp;
+  do {
+    temp = expected = old_val;  // temp keeps the value this attempt was based on
+#if __has_builtin(__hip_atomic_compare_exchange_strong)
+    __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
+      __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
+                                           __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    }
+#else   // !__has_builtin(__hip_atomic_compare_exchange_strong)
+    __atomic_compare_exchange_n(addr, &expected, old_val + value, false, __ATOMIC_RELAXED,
+                                __ATOMIC_RELAXED);
+#endif  // __has_builtin(__hip_atomic_compare_exchange_strong)
+    old_val = expected;
+  } while (__float_as_uint(temp) != __float_as_uint(old_val));  // ends once expected is unchanged
+  return old_val;
+#elif defined(__gfx90a__)
+  // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
+  // atomics will produce safe CAS loops, but are otherwise not different than
+  // agent-scope atomics. This logic is only applicable for gfx90a, and should
+  // not be assumed on other architectures.
+  __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
+    return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+#elif __has_builtin(__hip_atomic_fetch_add)
+  __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
+    return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+#else  // no __hip_atomic_fetch_add builtin: fall back to __atomic_fetch_add
+  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
+#endif
+}
+
+/**
+ * @brief Safe floating point rmw atomic max.
+ *
+ * Performs a relaxed read-modify-write floating point atomic max with
+ * device memory scope. The original value at \p addr is returned and
+ * the value at \p addr is replaced by \p val if \p val is greater.
+ *
+ * @note This operation ensures that, on all targets, we produce safe atomics.
+ * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
+ *
+ * @param [in,out] addr Pointer to the value to be updated.
+ * @param [in] val Value used to update the value at \p addr.
+ * @return Original value contained in \p addr.
+ */
+__device__ inline float safeAtomicMax(float* addr, float val) {
+#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
+  __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
+    float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    bool done = false;
+    while (!done && value < val) {  // a failed exchange refreshes value, so the test is redone
+      done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
+                                                  __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    }
+    return value;
+  }
+#else  // no __hip_atomic_* builtins: compare-exchange on the unsigned int bit pattern
+  unsigned int* uaddr = (unsigned int*)addr;
+  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+  bool done = false;
+  while (!done && __uint_as_float(value) < val) {
+    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false, __ATOMIC_RELAXED,
+                                       __ATOMIC_RELAXED);
+  }
+  return __uint_as_float(value);
+#endif
+}
+
+/**
+ * @brief Safe floating point rmw atomic min.
+ *
+ * Performs a relaxed read-modify-write floating point atomic min with
+ * device memory scope. The original value at \p addr is returned and
+ * the value at \p addr is replaced by \p val if \p val is lesser.
+ *
+ * @note This operation ensures that, on all targets, we produce safe atomics.
+ * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
+ *
+ * @param [in,out] addr Pointer to the value to be updated.
+ * @param [in] val Value used to update the value at \p addr.
+ * @return Original value contained in \p addr.
+ */
+__device__ inline float safeAtomicMin(float* addr, float val) {
+#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
+  __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
+    float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    bool done = false;
+    while (!done && value > val) {  // a failed exchange refreshes value, so the test is redone
+      done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
+                                                  __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    }
+    return value;
+  }
+#else  // no __hip_atomic_* builtins: compare-exchange on the unsigned int bit pattern
+  unsigned int* uaddr = (unsigned int*)addr;
+  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+  bool done = false;
+  while (!done && __uint_as_float(value) > val) {
+    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false, __ATOMIC_RELAXED,
+                                       __ATOMIC_RELAXED);
+  }
+  return __uint_as_float(value);
+#endif
+}
+
+/**
+ * @brief Safe double precision rmw atomic add.
+ *
+ * Performs a relaxed read-modify-write double precision atomic add with
+ * device memory scope. The original value at \p addr is returned and
+ * the value at \p addr is updated to the original value plus \p value.
+ *
+ * @note This operation ensures that, on all targets, we produce safe atomics.
+ * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
+ *
+ * @param [in,out] addr Pointer to the value to be incremented by \p value.
+ * @param [in] value Amount by which the value at \p addr is incremented.
+ * @return Original value contained in \p addr.
+ */
+__device__ inline double safeAtomicAdd(double* addr, double value) {
+#if defined(__gfx90a__) && __has_builtin(__hip_atomic_fetch_add)
+  // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
+  // atomics will produce safe CAS loops, but are otherwise not different than
+  // agent-scope atomics. This logic is only applicable for gfx90a, and should
+  // not be assumed on other architectures.
+  __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
+    return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  }
+#elif defined(__gfx90a__)
+  // On gfx90a, if we do not have the __hip_atomic_fetch_add builtin, we need to
+  // force a CAS loop here.
+  double old_val;
+#if __has_builtin(__hip_atomic_load)
+  __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
+    old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+#else   // !__has_builtin(__hip_atomic_load)
+  old_val = __longlong_as_double(
+      __atomic_load_n(reinterpret_cast<unsigned long long*>(addr), __ATOMIC_RELAXED));
+#endif  // __has_builtin(__hip_atomic_load)
+  double expected, temp;
+  do {
+    temp = expected = old_val;  // temp keeps the value this attempt was based on
+#if __has_builtin(__hip_atomic_compare_exchange_strong)
+    __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
+      __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
+                                           __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    }
+#else   // !__has_builtin(__hip_atomic_compare_exchange_strong)
+    __atomic_compare_exchange_n(addr, &expected, old_val + value, false, __ATOMIC_RELAXED,
+                                __ATOMIC_RELAXED);
+#endif  // __has_builtin(__hip_atomic_compare_exchange_strong)
+    old_val = expected;
+  } while (__double_as_longlong(temp) != __double_as_longlong(old_val));  // until unchanged
+  return old_val;
+#else   // !defined(__gfx90a__)
+#if __has_builtin(__hip_atomic_fetch_add)
+  __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
+    return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+#else   // !__has_builtin(__hip_atomic_fetch_add)
+  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
+#endif  // __has_builtin(__hip_atomic_fetch_add)
+#endif  // defined(__gfx90a__)
+}
+
+/**
+ * @brief Safe double precision rmw atomic max.
+ *
+ * Performs a relaxed read-modify-write double precision atomic max with
+ * device memory scope. The original value at \p addr is returned and
+ * the value at \p addr is replaced by \p val if \p val is greater.
+ *
+ * @note This operation ensures that, on all targets, we produce safe atomics.
+ * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
+ *
+ * @param [in,out] addr Pointer to value to be updated.
+ * @param [in] val Value used to update the contents at \p addr.
+ * @return Original value contained at \p addr.
+ */
+__device__ inline double safeAtomicMax(double* addr, double val) {
+#if __has_builtin(__builtin_amdgcn_is_private)
+  if (__builtin_amdgcn_is_private((const __attribute__((address_space(0))) void*)addr)) {
+    double old = *addr;  // plain (non-atomic) read/modify/write when the builtin reports a private address
+    *addr = __builtin_fmax(old, val);
+    return old;
+  } else {
+#endif  // __has_builtin(__builtin_amdgcn_is_private)
+#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
+    __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
+      double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+      bool done = false;
+      while (!done && value < val) {  // stop once the exchange succeeds or the stored value is no longer smaller
+        done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
+                                                    __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+      }
+      return value;
+    }
+#else   // fall back to the generic __atomic builtins on the 64-bit integer view of the value
+  unsigned long long* uaddr = (unsigned long long*)addr;
+  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+  bool done = false;
+  while (!done && __longlong_as_double(value) < val) {
+    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
+                                       __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+  }
+  return __longlong_as_double(value);
+#endif
+#if __has_builtin(__builtin_amdgcn_is_private)
+  }  // closes the else opened under the same guard above
+#endif
+}
+
+/**
+ * @brief Safe double precision rmw atomic min.
+ *
+ * Performs a relaxed read-modify-write double precision atomic min with
+ * device memory scope. The original value at \p addr is returned and
+ * the value at \p addr is replaced by \p val if \p val is lesser.
+ *
+ * @note This operation ensures that, on all targets, we produce safe atomics.
+ * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
+ *
+ * @param [in,out] addr Pointer to value to be updated.
+ * @param [in] val Value used to update the contents at \p addr.
+ * @return Original value contained at \p addr.
+ */
+__device__ inline double safeAtomicMin(double* addr, double val) {
+#if __has_builtin(__builtin_amdgcn_is_private)
+  if (__builtin_amdgcn_is_private((const __attribute__((address_space(0))) void*)addr)) {
+    double old = *addr;  // plain (non-atomic) read/modify/write when the builtin reports a private address
+    *addr = __builtin_fmin(old, val);
+    return old;
+  } else {
+#endif  // __has_builtin(__builtin_amdgcn_is_private)
+#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
+    __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
+      double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+      bool done = false;
+      while (!done && value > val) {  // stop once the exchange succeeds or the stored value is no longer larger
+        done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
+                                                    __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+      }
+      return value;
+    }
+#else   // fall back to the generic __atomic builtins on the 64-bit integer view of the value
+  unsigned long long* uaddr = (unsigned long long*)addr;
+  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+  bool done = false;
+  while (!done && __longlong_as_double(value) > val) {
+    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
+                                       __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+  }
+  return __longlong_as_double(value);
+#endif
+#if __has_builtin(__builtin_amdgcn_is_private)
+  }  // closes the else opened under the same guard above
+#endif
+}
+
+#pragma pop_macro("__HIP_ATOMICS_IGNORE_DENORMAL_MODE")
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..b96bcc1e16449b2ad6284d46389df652dcd4f30d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h
@@ -0,0 +1,1606 @@
+/*
+Copyright (c) 2015 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ *  @file  amd_detail/amd_hip_vector_types.h
+ *  @brief Defines the different vector types for the HIP runtime.
+ */
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_VECTOR_TYPES_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_VECTOR_TYPES_H
+
+#include "hip/amd_detail/host_defines.h"
+
+#if defined(__HIPCC_RTC__)
+#define __HOST_DEVICE__ __device__
+#else
+#define __HOST_DEVICE__ __host__ __device__
+#endif
+
+#if defined(__has_attribute)
+#if __has_attribute(ext_vector_type)
+#define __HIP_USE_NATIVE_VECTOR__ 1
+#define __NATIVE_VECTOR__(n, T) T __attribute__((ext_vector_type(n)))
+#else
+#define __NATIVE_VECTOR__(n, T) alignas(n * sizeof(T)) T[n]
+#endif
+
+#if defined(__cplusplus)
+#if !defined(__HIPCC_RTC__)
+#include <array>
+#include <iosfwd>
+#include <type_traits>
+#endif  // !defined(__HIPCC_RTC__)
+
+template <typename T, unsigned int n> struct HIP_vector_base;
+template <typename T, unsigned int rank> struct HIP_vector_type;
+
+namespace hip_impl {  // helpers that reinterpret a HIP_vector_base as its Native_vec_ representation
+template <typename T, unsigned int n> __attribute__((always_inline)) __HOST_DEVICE__
+    typename HIP_vector_base<T, n>::Native_vec_*
+    get_native_pointer(HIP_vector_base<T, n>& base_vec) {  // mutable overload
+  static_assert(sizeof(base_vec) == sizeof(typename HIP_vector_base<T, n>::Native_vec_));  // sizes must match for the cast
+  static_assert(
+      (__hip_internal::alignment_of<HIP_vector_base<T, n>>::value %
+       __hip_internal::alignment_of<typename HIP_vector_base<T, n>::Native_vec_>::value) == 0);  // base alignment is a multiple of the native one
+  return reinterpret_cast<typename HIP_vector_base<T, n>::Native_vec_*>(&base_vec);
+}
+
+template <typename T, unsigned int n>
+__attribute__((always_inline)) __HOST_DEVICE__ const typename HIP_vector_base<T, n>::Native_vec_*
+get_native_pointer(const HIP_vector_base<T, n>& base_vec) {  // const overload, same checks as above
+  static_assert(sizeof(base_vec) == sizeof(typename HIP_vector_base<T, n>::Native_vec_));
+  static_assert(
+      (__hip_internal::alignment_of<HIP_vector_base<T, n>>::value %
+       __hip_internal::alignment_of<typename HIP_vector_base<T, n>::Native_vec_>::value) == 0);
+  return reinterpret_cast<const typename HIP_vector_base<T, n>::Native_vec_*>(&base_vec);
+}
+}  // namespace hip_impl
+
+template <typename T, unsigned int n> __attribute__((always_inline)) __HOST_DEVICE__
+    typename HIP_vector_base<T, n>::Native_vec_&
+    get_native_vector(HIP_vector_base<T, n>& base_vec) {  // mutable reference to the Native_vec_ view of base_vec
+  return *hip_impl::get_native_pointer(base_vec);
+}
+
+template <typename T, unsigned int n>
+__attribute__((always_inline)) __HOST_DEVICE__ const typename HIP_vector_base<T, n>::Native_vec_&
+get_native_vector(const HIP_vector_base<T, n>& base_vec) {  // const reference to the Native_vec_ view of base_vec
+  return *hip_impl::get_native_pointer(base_vec);
+}
+
+template <typename T> struct HIP_vector_base<T, 1> {  // storage for a one-component vector
+  using Native_vec_ = __NATIVE_VECTOR__(1, T);  // type used by get_native_vector() for arithmetic
+
+  T x;  // the only component
+
+  using value_type = T;
+
+  __HOST_DEVICE__
+  HIP_vector_base() = default;
+  __HOST_DEVICE__
+  constexpr HIP_vector_base(const HIP_vector_base&) = default;
+  __HOST_DEVICE__
+  explicit constexpr HIP_vector_base(T x_) : x(x_) {}  // explicit: a scalar does not implicitly become a vector
+  __HOST_DEVICE__
+  constexpr HIP_vector_base(HIP_vector_base&&) = default;
+  __HOST_DEVICE__
+  ~HIP_vector_base() = default;
+  __HOST_DEVICE__
+  HIP_vector_base& operator=(const HIP_vector_base&) = default;
+};
+
+template <typename T> struct alignas(2 * sizeof(T)) HIP_vector_base<T, 2> {  // two components, aligned to the whole vector size
+  using Native_vec_ = __NATIVE_VECTOR__(2, T);  // type used by get_native_vector() for arithmetic
+
+  T x, y;  // components in index order 0, 1
+
+  using value_type = T;
+
+  __HOST_DEVICE__
+  HIP_vector_base() = default;
+  __HOST_DEVICE__
+  constexpr HIP_vector_base(const HIP_vector_base&) = default;
+  __HOST_DEVICE__
+  constexpr HIP_vector_base(T x_, T y_ = T()) : x(x_), y(y_) {}  // an omitted y is value-initialized (T())
+  __HOST_DEVICE__
+  constexpr HIP_vector_base(HIP_vector_base&&) = default;
+  __HOST_DEVICE__
+  ~HIP_vector_base() = default;
+  __HOST_DEVICE__
+  HIP_vector_base& operator=(const HIP_vector_base&) = default;
+};
+
+template <typename T> struct HIP_vector_base<T, 3> {  // three components; no alignas, unlike the rank 2 and 4 specializations
+  struct Native_vec_ {  // hand-written 3-element wrapper used instead of __NATIVE_VECTOR__
+    T d[3];  // component storage, indexed 0..2
+
+    __HOST_DEVICE__
+    Native_vec_() = default;
+
+    __HOST_DEVICE__
+    explicit constexpr Native_vec_(T x_) noexcept : d{x_, x_, x_} {}  // broadcast one value to all three lanes
+    __HOST_DEVICE__
+    constexpr Native_vec_(T x_, T y_, T z_) noexcept : d{x_, y_, z_} {}
+    __HOST_DEVICE__
+    constexpr Native_vec_(const Native_vec_&) = default;
+    __HOST_DEVICE__
+    constexpr Native_vec_(Native_vec_&&) = default;
+    __HOST_DEVICE__
+    ~Native_vec_() = default;
+
+    __HOST_DEVICE__
+    Native_vec_& operator=(const Native_vec_&) = default;
+    __HOST_DEVICE__
+    Native_vec_& operator=(Native_vec_&&) = default;
+
+    __HOST_DEVICE__
+    T& operator[](unsigned int idx) noexcept { return d[idx]; }  // no bounds check on idx
+    __HOST_DEVICE__
+    T operator[](unsigned int idx) const noexcept { return d[idx]; }  // const access returns by value
+
+    __HOST_DEVICE__
+    Native_vec_& operator+=(const Native_vec_& x_) noexcept {  // the compound assignments below all act lane by lane
+      for (auto i = 0u; i != 3u; ++i) d[i] += x_.d[i];
+      return *this;
+    }
+    __HOST_DEVICE__
+    Native_vec_& operator-=(const Native_vec_& x_) noexcept {
+      for (auto i = 0u; i != 3u; ++i) d[i] -= x_.d[i];
+      return *this;
+    }
+
+    __HOST_DEVICE__
+    Native_vec_& operator*=(const Native_vec_& x_) noexcept {
+      for (auto i = 0u; i != 3u; ++i) d[i] *= x_.d[i];
+      return *this;
+    }
+    __HOST_DEVICE__
+    Native_vec_& operator/=(const Native_vec_& x_) noexcept {
+      for (auto i = 0u; i != 3u; ++i) d[i] /= x_.d[i];
+      return *this;
+    }
+
+    template <typename U = T,
+              typename __hip_internal::enable_if<__hip_internal::is_signed<U>{}>::type* = nullptr>  // signed element types only
+    __HOST_DEVICE__ Native_vec_ operator-() const noexcept {
+      auto r{*this};
+      for (auto&& x : r.d) x = -x;
+      return r;
+    }
+
+    template <typename U = T,
+              typename __hip_internal::enable_if<__hip_internal::is_integral<U>{}>::type* = nullptr>  // integral element types only, as are all operators below
+    __HOST_DEVICE__ Native_vec_ operator~() const noexcept {
+      auto r{*this};
+      for (auto&& x : r.d) x = ~x;
+      return r;
+    }
+    template <typename U = T,
+              typename __hip_internal::enable_if<__hip_internal::is_integral<U>{}>::type* = nullptr>
+    __HOST_DEVICE__ Native_vec_& operator%=(const Native_vec_& x_) noexcept {
+      for (auto i = 0u; i != 3u; ++i) d[i] %= x_.d[i];
+      return *this;
+    }
+    template <typename U = T,
+              typename __hip_internal::enable_if<__hip_internal::is_integral<U>{}>::type* = nullptr>
+    __HOST_DEVICE__ Native_vec_& operator^=(const Native_vec_& x_) noexcept {
+      for (auto i = 0u; i != 3u; ++i) d[i] ^= x_.d[i];
+      return *this;
+    }
+    template <typename U = T,
+              typename __hip_internal::enable_if<__hip_internal::is_integral<U>{}>::type* = nullptr>
+    __HOST_DEVICE__ Native_vec_& operator|=(const Native_vec_& x_) noexcept {
+      for (auto i = 0u; i != 3u; ++i) d[i] |= x_.d[i];
+      return *this;
+    }
+    template <typename U = T,
+              typename __hip_internal::enable_if<__hip_internal::is_integral<U>{}>::type* = nullptr>
+    __HOST_DEVICE__ Native_vec_& operator&=(const Native_vec_& x_) noexcept {
+      for (auto i = 0u; i != 3u; ++i) d[i] &= x_.d[i];
+      return *this;
+    }
+    template <typename U = T,
+              typename __hip_internal::enable_if<__hip_internal::is_integral<U>{}>::type* = nullptr>
+    __HOST_DEVICE__ Native_vec_& operator>>=(const Native_vec_& x_) noexcept {
+      for (auto i = 0u; i != 3u; ++i) d[i] >>= x_.d[i];
+      return *this;
+    }
+    template <typename U = T,
+              typename __hip_internal::enable_if<__hip_internal::is_integral<U>{}>::type* = nullptr>
+    __HOST_DEVICE__ Native_vec_& operator<<=(const Native_vec_& x_) noexcept {
+      for (auto i = 0u; i != 3u; ++i) d[i] <<= x_.d[i];
+      return *this;
+    }
+#if defined(__INTEL_COMPILER)
+    typedef struct {
+      int values[4];
+    } _Vec3_cmp;
+    using Vec3_cmp = _Vec3_cmp;
+#else
+    using Vec3_cmp = int __attribute__((vector_size(4 * sizeof(int))));  // four ints wide although only three lanes are compared
+#endif  // INTEL
+    __HOST_DEVICE__
+    Vec3_cmp operator==(const Native_vec_& x_) const noexcept {  // per-lane results (0 or 1); only three lanes are written
+      return Vec3_cmp{d[0] == x_.d[0], d[1] == x_.d[1], d[2] == x_.d[2]};
+    }
+  };
+
+  T x, y, z;  // components in index order 0, 1, 2
+
+  using value_type = T;
+
+  __HOST_DEVICE__
+  HIP_vector_base() = default;
+  __HOST_DEVICE__
+  constexpr HIP_vector_base(const HIP_vector_base&) = default;
+  __HOST_DEVICE__
+  constexpr HIP_vector_base(T x_, T y_ = T(), T z_ = T()) : x(x_), y(y_), z(z_) {};  // omitted components are value-initialized (T())
+  __HOST_DEVICE__
+  constexpr HIP_vector_base(HIP_vector_base&&) = default;
+  __HOST_DEVICE__
+  ~HIP_vector_base() = default;
+
+  __HOST_DEVICE__
+  HIP_vector_base& operator=(const HIP_vector_base&) = default;
+  __HOST_DEVICE__
+  HIP_vector_base& operator=(HIP_vector_base&&) = default;
+};
+
+template <typename T> struct alignas(4 * sizeof(T)) HIP_vector_base<T, 4> {  // four components, aligned to the whole vector size
+  using Native_vec_ = __NATIVE_VECTOR__(4, T);  // type used by get_native_vector() for arithmetic
+
+  T x, y, z, w;  // components in index order 0..3
+
+  using value_type = T;
+
+  __HOST_DEVICE__
+  HIP_vector_base() = default;
+  __HOST_DEVICE__
+  constexpr HIP_vector_base(const HIP_vector_base&) = default;
+  __HOST_DEVICE__
+  constexpr HIP_vector_base(T x_, T y_ = T(), T z_ = T(), T w_ = T())  // omitted components are value-initialized (T())
+      : x(x_), y(y_), z(z_), w(w_) {};
+  __HOST_DEVICE__
+  constexpr HIP_vector_base(HIP_vector_base&&) = default;
+  __HOST_DEVICE__
+  ~HIP_vector_base() = default;
+  __HOST_DEVICE__
+  HIP_vector_base& operator=(const HIP_vector_base&) = default;
+};
+
+template <typename T, size_t rank, size_t... indices>  // the indices pack only determines how many initializers are generated
+constexpr inline __HOST_DEVICE__ HIP_vector_type<T, rank> make_vector_type_impl(
+    T val, __hip_internal::index_sequence<indices...>) noexcept {
+  // Expands to one copy of val per index; the index values themselves are discarded.
+  return HIP_vector_type<T, rank>{((void)indices, val)...};  // comma operator: evaluate and drop the index, yield val
+}
+
+template <typename T, unsigned int rank>  // builds a rank-component vector with every component equal to val
+constexpr inline __HOST_DEVICE__ HIP_vector_type<T, rank> make_vector_type(T val) {
+  return make_vector_type_impl<T, rank>(
+      val, __hip_internal::make_index_sequence_value(__hip_internal::make_index_sequence<rank>{}));  // index sequence sized by rank
+}
+
+template <typename T, unsigned int rank> struct HIP_vector_type : public HIP_vector_base<T, rank> {  // adds constructors and operators on top of the per-rank storage
+  using typename HIP_vector_base<T, rank>::Native_vec_;
+
+  __HOST_DEVICE__
+  HIP_vector_type() = default;
+  template <typename U, typename __hip_internal::enable_if<
+                            __hip_internal::is_convertible<U, T>::value>::type* = nullptr>  // any scalar convertible to T
+  __HOST_DEVICE__ explicit constexpr HIP_vector_type(U x_) noexcept
+      : HIP_vector_base<T, rank>{static_cast<T>(x_)} {}  // forwards the single value to the base constructor
+  template <  // TODO: constrain based on type as well.
+      typename... Us,
+      typename __hip_internal::enable_if<(rank > 1) && sizeof...(Us) == rank>::type* = nullptr>  // exactly rank arguments, rank > 1
+  __HOST_DEVICE__ constexpr HIP_vector_type(Us... xs) noexcept
+      : HIP_vector_base<T, rank>{static_cast<T>(xs)...} {}  // each argument is cast to T
+  __HOST_DEVICE__
+  constexpr HIP_vector_type(const HIP_vector_type&) = default;
+  __HOST_DEVICE__
+  constexpr HIP_vector_type(HIP_vector_type&&) = default;
+  __HOST_DEVICE__
+  ~HIP_vector_type() = default;
+
+  __HOST_DEVICE__
+  HIP_vector_type& operator=(const HIP_vector_type&) = default;
+  __HOST_DEVICE__
+  HIP_vector_type& operator=(HIP_vector_type&&) = default;
+
+  // Operators
+  __HOST_DEVICE__
+  T& operator[](size_t idx) noexcept { return reinterpret_cast<T*>(this)[idx]; }  // treats the object as an array of T; no bounds check
+  __HOST_DEVICE__
+  const T& operator[](size_t idx) const noexcept { return reinterpret_cast<const T*>(this)[idx]; }
+
+  __HOST_DEVICE__
+  HIP_vector_type& operator++() noexcept {  // pre-increment: adds 1 to every component
+    HIP_vector_type unity = make_vector_type<T, rank>(1);
+    return *this += unity;
+  }
+  __HOST_DEVICE__
+  HIP_vector_type operator++(int) noexcept {  // post-increment: returns the value held before the increment
+    auto tmp(*this);
+    ++*this;
+    return tmp;
+  }
+
+  __HOST_DEVICE__
+  HIP_vector_type& operator--() noexcept {  // pre-decrement: subtracts 1 from every component
+    HIP_vector_type unity = make_vector_type<T, rank>(1);
+    return *this -= unity;
+  }
+  __HOST_DEVICE__
+  HIP_vector_type operator--(int) noexcept {  // post-decrement: returns the value held before the decrement
+    auto tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  __HOST_DEVICE__ HIP_vector_type& operator+=(const HIP_vector_type& x) noexcept {
+#if __HIP_USE_NATIVE_VECTOR__  // whole-vector operation on Native_vec_ when the macro is set
+    get_native_vector(*this) += get_native_vector(x);
+#else  // otherwise apply the operation component by component
+    for (auto i = 0u; i != rank; ++i) get_native_vector(*this)[i] += get_native_vector(x)[i];
+#endif
+    return *this;
+  }
+  template <typename U, typename __hip_internal::enable_if<
+                            __hip_internal::is_convertible<U, T>{}>::type* = nullptr>
+  __HOST_DEVICE__ HIP_vector_type& operator+=(U x) noexcept {  // scalar form: x is broadcast to a vector first
+    return *this += make_vector_type<T, rank>(x);
+  }
+
+  __HOST_DEVICE__ HIP_vector_type& operator-=(const HIP_vector_type& x) noexcept {
+#if __HIP_USE_NATIVE_VECTOR__
+    get_native_vector(*this) -= get_native_vector(x);
+#else
+    for (auto i = 0u; i != rank; ++i) get_native_vector(*this)[i] -= get_native_vector(x)[i];
+#endif
+    return *this;
+  }
+  template <typename U, typename __hip_internal::enable_if<
+                            __hip_internal::is_convertible<U, T>{}>::type* = nullptr>
+  __HOST_DEVICE__ HIP_vector_type& operator-=(U x) noexcept {  // scalar form: x is broadcast to a vector first
+    return *this -= make_vector_type<T, rank>(x);
+  }
+
+  __HOST_DEVICE__ HIP_vector_type& operator*=(const HIP_vector_type& x) noexcept {
+#if __HIP_USE_NATIVE_VECTOR__
+    get_native_vector(*this) *= get_native_vector(x);
+#else
+    for (auto i = 0u; i != rank; ++i) get_native_vector(*this)[i] *= get_native_vector(x)[i];
+#endif
+    return *this;
+  }
+
+  friend __HOST_DEVICE__ inline constexpr HIP_vector_type operator*(
+      HIP_vector_type x, const HIP_vector_type& y) noexcept {  // vector * vector, defined here as a friend
+    return HIP_vector_type{x} *= y;
+  }
+
+  template <typename U, typename __hip_internal::enable_if<
+                            __hip_internal::is_convertible<U, T>{}>::type* = nullptr>
+  __HOST_DEVICE__ HIP_vector_type& operator*=(U x) noexcept {  // scalar form: x is broadcast to a vector first
+    return *this *= make_vector_type<T, rank>(x);
+  }
+
+  friend __HOST_DEVICE__ inline constexpr HIP_vector_type operator/(
+      HIP_vector_type x, const HIP_vector_type& y) noexcept {  // vector / vector, defined here as a friend
+    return HIP_vector_type{x} /= y;
+  }
+
+  __HOST_DEVICE__ HIP_vector_type& operator/=(const HIP_vector_type& x) noexcept {
+#if __HIP_USE_NATIVE_VECTOR__
+    get_native_vector(*this) /= get_native_vector(x);
+#else
+    for (auto i = 0u; i != rank; ++i) get_native_vector(*this)[i] /= get_native_vector(x)[i];
+#endif
+    return *this;
+  }
+  template <typename U, typename __hip_internal::enable_if<
+                            __hip_internal::is_convertible<U, T>{}>::type* = nullptr>
+  __HOST_DEVICE__ HIP_vector_type& operator/=(U x) noexcept {  // scalar form: x is broadcast to a vector first
+    return *this /= make_vector_type<T, rank>(x);
+  }
+
+  template <typename U = T,
+            typename __hip_internal::enable_if<__hip_internal::is_signed<U>{}>::type* = nullptr>  // signed element types only
+  __HOST_DEVICE__ HIP_vector_type operator-() const noexcept {
+    auto tmp(*this);
+#if __HIP_USE_NATIVE_VECTOR__
+    get_native_vector(tmp) = -get_native_vector(tmp);
+#else
+    for (auto i = 0u; i != rank; ++i) get_native_vector(tmp)[i] = -get_native_vector(tmp)[i];
+#endif
+    return tmp;
+  }
+
+  template <typename U = T,
+            typename __hip_internal::enable_if<__hip_internal::is_integral<U>{}>::type* = nullptr>  // integral element types only, as are all operators below
+  __HOST_DEVICE__ HIP_vector_type operator~() const noexcept {
+    HIP_vector_type r{*this};
+#if __HIP_USE_NATIVE_VECTOR__
+    get_native_vector(r) = ~get_native_vector(r);
+#else
+    for (auto i = 0u; i != rank; ++i) get_native_vector(r)[i] = ~get_native_vector(r)[i];
+#endif
+    return r;
+  }
+
+  template <typename U = T,
+            typename __hip_internal::enable_if<__hip_internal::is_integral<U>{}>::type* = nullptr>
+  __HOST_DEVICE__ HIP_vector_type& operator%=(const HIP_vector_type& x) noexcept {
+#if __HIP_USE_NATIVE_VECTOR__
+    get_native_vector(*this) %= get_native_vector(x);
+#else
+    for (auto i = 0u; i != rank; ++i) get_native_vector(*this)[i] %= get_native_vector(x)[i];
+#endif
+    return *this;
+  }
+
+  template <typename U = T,
+            typename __hip_internal::enable_if<__hip_internal::is_integral<U>{}>::type* = nullptr>
+  __HOST_DEVICE__ HIP_vector_type& operator^=(const HIP_vector_type& x) noexcept {
+#if __HIP_USE_NATIVE_VECTOR__
+    get_native_vector(*this) ^= get_native_vector(x);
+#else
+    for (auto i = 0u; i != rank; ++i) get_native_vector(*this)[i] ^= get_native_vector(x)[i];
+#endif
+    return *this;
+  }
+
+  template <typename U = T,
+            typename __hip_internal::enable_if<__hip_internal::is_integral<U>{}>::type* = nullptr>
+  __HOST_DEVICE__ HIP_vector_type& operator|=(const HIP_vector_type& x) noexcept {
+#if __HIP_USE_NATIVE_VECTOR__
+    get_native_vector(*this) |= get_native_vector(x);
+#else
+    for (auto i = 0u; i != rank; ++i) get_native_vector(*this)[i] |= get_native_vector(x)[i];
+#endif
+    return *this;
+  }
+
+  template <typename U = T,
+            typename __hip_internal::enable_if<__hip_internal::is_integral<U>{}>::type* = nullptr>
+  __HOST_DEVICE__ HIP_vector_type& operator&=(const HIP_vector_type& x) noexcept {
+#if __HIP_USE_NATIVE_VECTOR__
+    get_native_vector(*this) &= get_native_vector(x);
+#else
+    for (auto i = 0u; i != rank; ++i) get_native_vector(*this)[i] &= get_native_vector(x)[i];
+#endif
+    return *this;
+  }
+
+  template <typename U = T,
+            typename __hip_internal::enable_if<__hip_internal::is_integral<U>{}>::type* = nullptr>
+  __HOST_DEVICE__ HIP_vector_type& operator>>=(const HIP_vector_type& x) noexcept {
+#if __HIP_USE_NATIVE_VECTOR__
+    get_native_vector(*this) >>= get_native_vector(x);
+#else
+    for (auto i = 0u; i != rank; ++i) get_native_vector(*this)[i] >>= get_native_vector(x)[i];
+#endif
+    return *this;
+  }
+
+  template <typename U = T,
+            typename __hip_internal::enable_if<__hip_internal::is_integral<U>{}>::type* = nullptr>
+  __HOST_DEVICE__ HIP_vector_type& operator<<=(const HIP_vector_type& x) noexcept {
+#if __HIP_USE_NATIVE_VECTOR__
+    get_native_vector(*this) <<= get_native_vector(x);
+#else
+    for (auto i = 0u; i != rank; ++i) get_native_vector(*this)[i] <<= get_native_vector(x)[i];
+#endif
+    return *this;
+  }
+};
+
+template <typename T, unsigned int n>  // vector + vector
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator+(
+    const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept {
+  return HIP_vector_type<T, n>{x} += y;  // works on a copy of x, so neither operand is modified
+}
+template <typename T, unsigned int n, typename U>  // vector + scalar
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator+(const HIP_vector_type<T, n>& x,
+                                                                 U y) noexcept {
+  return HIP_vector_type<T, n>{x} += make_vector_type<T, n>(y);  // y is broadcast to all n components
+}
+template <typename T, unsigned int n, typename U>  // scalar + vector
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator+(
+    U x, const HIP_vector_type<T, n>& y) noexcept {
+  return make_vector_type<T, n>(x) += y;  // x is broadcast to all n components
+}
+
+template <typename T, unsigned int n>  // vector - vector
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator-(
+    const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept {
+  return HIP_vector_type<T, n>{x} -= y;  // works on a copy of x, so neither operand is modified
+}
+template <typename T, unsigned int n, typename U>  // vector - scalar
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator-(const HIP_vector_type<T, n>& x,
+                                                                 U y) noexcept {
+  return HIP_vector_type<T, n>{x} -= make_vector_type<T, n>(y);  // y is broadcast to all n components
+}
+template <typename T, unsigned int n, typename U>  // scalar - vector
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator-(
+    U x, const HIP_vector_type<T, n>& y) noexcept {
+  return make_vector_type<T, n>(x) -= y;  // x is broadcast, then y is subtracted component-wise
+}
+
+template <typename T, unsigned int n, typename U>  // vector * scalar
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator*(const HIP_vector_type<T, n>& x,
+                                                                 U y) noexcept {
+  return HIP_vector_type<T, n>{x} *= make_vector_type<T, n>(y);  // y is broadcast to all n components
+}
+template <typename T, unsigned int n, typename U>  // scalar * vector
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator*(
+    U x, const HIP_vector_type<T, n>& y) noexcept {
+  return make_vector_type<T, n>(x) *= y;  // x is broadcast to all n components
+}
+
+template <typename T, unsigned int n, typename U>  // vector / scalar
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator/(const HIP_vector_type<T, n>& x,
+                                                                 U y) noexcept {
+  return HIP_vector_type<T, n>{x} /= make_vector_type<T, n>(y);  // y is broadcast to all n components
+}
+template <typename T, unsigned int n, typename U>  // scalar / vector
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator/(
+    U x, const HIP_vector_type<T, n>& y) noexcept {
+  return make_vector_type<T, n>(x) /= y;  // x is broadcast, then divided component-wise by y
+}
+
+template <typename T, unsigned int n> __HOST_DEVICE__ inline
+#if __cplusplus >= 201402L && !defined(__HIPCC_RTC__)  // constexpr only for C++14 or later and outside HIPRTC builds
+    constexpr
+#endif
+    bool
+    operator==(const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept {  // true only if every component matches
+  bool isTrue = true;
+  const auto& native_x = get_native_vector(x);
+  const auto& native_y = get_native_vector(y);
+  for (unsigned int i = 0; i < n; ++i) {  // no early exit: all n components are visited
+    isTrue = (isTrue && (native_x[i] == native_y[i]));
+  }
+  return isTrue;
+}
+
+template <typename T, unsigned int n, typename U>  // vector == scalar
+__HOST_DEVICE__ inline constexpr bool operator==(const HIP_vector_type<T, n>& x, U y) noexcept {
+  return x == make_vector_type<T, n>(y);  // y is broadcast, so all components must equal it
+}
+template <typename T, unsigned int n, typename U>  // scalar == vector
+__HOST_DEVICE__ inline constexpr bool operator==(U x, const HIP_vector_type<T, n>& y) noexcept {
+  return make_vector_type<T, n>(x) == y;  // x is broadcast, so all components must equal it
+}
+
+template <typename T, unsigned int n>  // vector != vector
+__HOST_DEVICE__ inline constexpr bool operator!=(const HIP_vector_type<T, n>& x,
+                                                 const HIP_vector_type<T, n>& y) noexcept {
+  return !(x == y);  // negation of operator==: true if any component differs
+}
+template <typename T, unsigned int n, typename U>  // vector != scalar
+__HOST_DEVICE__ inline constexpr bool operator!=(const HIP_vector_type<T, n>& x, U y) noexcept {
+  return !(x == y);
+}
+template <typename T, unsigned int n, typename U>  // scalar != vector
+__HOST_DEVICE__ inline constexpr bool operator!=(U x, const HIP_vector_type<T, n>& y) noexcept {
+  return !(x == y);
+}
+
+template <typename T, unsigned int n,  // vector % vector; ::type makes the constraint drop non-integral T via SFINAE
+          typename __hip_internal::enable_if<__hip_internal::is_integral<T>{}>::type* = nullptr>
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator%(
+    const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept {
+  return HIP_vector_type<T, n>{x} %= y;  // works on a copy of x, so neither operand is modified
+}
+template <typename T, unsigned int n, typename U,  // vector % scalar
+          typename __hip_internal::enable_if<__hip_internal::is_integral<T>{}>::type* = nullptr>
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator%(const HIP_vector_type<T, n>& x,
+                                                                 U y) noexcept {
+  return HIP_vector_type<T, n>{x} %= make_vector_type<T, n>(y);  // y is broadcast to all n components
+}
+template <typename T, unsigned int n, typename U,  // scalar % vector
+          typename __hip_internal::enable_if<__hip_internal::is_integral<T>{}>::type* = nullptr>
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator%(
+    U x, const HIP_vector_type<T, n>& y) noexcept {
+  return make_vector_type<T, n>(x) %= y;  // x is broadcast to all n components
+}
+
+template <typename T, unsigned int n,  // vector ^ vector; ::type makes the constraint drop non-integral T via SFINAE
+          typename __hip_internal::enable_if<__hip_internal::is_integral<T>{}>::type* = nullptr>
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator^(
+    const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept {
+  return HIP_vector_type<T, n>{x} ^= y;  // works on a copy of x, so neither operand is modified
+}
+template <typename T, unsigned int n, typename U,  // vector ^ scalar
+          typename __hip_internal::enable_if<__hip_internal::is_integral<T>{}>::type* = nullptr>
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator^(const HIP_vector_type<T, n>& x,
+                                                                 U y) noexcept {
+  return HIP_vector_type<T, n>{x} ^= make_vector_type<T, n>(y);  // y is broadcast to all n components
+}
+template <typename T, unsigned int n, typename U,  // scalar ^ vector
+          typename __hip_internal::enable_if<__hip_internal::is_integral<T>{}>::type* = nullptr>
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator^(
+    U x, const HIP_vector_type<T, n>& y) noexcept {
+  return make_vector_type<T, n>(x) ^= y;  // x is broadcast to all n components
+}
+
+template <typename T, unsigned int n,  // vector | vector; ::type makes the constraint drop non-integral T via SFINAE
+          typename __hip_internal::enable_if<__hip_internal::is_integral<T>{}>::type* = nullptr>
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator|(
+    const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept {
+  return HIP_vector_type<T, n>{x} |= y;  // works on a copy of x, so neither operand is modified
+}
+template <typename T, unsigned int n, typename U,  // vector | scalar
+          typename __hip_internal::enable_if<__hip_internal::is_integral<T>{}>::type* = nullptr>
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator|(const HIP_vector_type<T, n>& x,
+                                                                 U y) noexcept {
+  return HIP_vector_type<T, n>{x} |= make_vector_type<T, n>(y);  // y is broadcast to all n components
+}
+template <typename T, unsigned int n, typename U,  // scalar | vector
+          typename __hip_internal::enable_if<__hip_internal::is_integral<T>{}>::type* = nullptr>
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator|(
+    U x, const HIP_vector_type<T, n>& y) noexcept {
+  return make_vector_type<T, n>(x) |= y;  // x is broadcast to all n components
+}
+
+template <typename T, unsigned int n,  // vector & vector; ::type makes the constraint drop non-integral T via SFINAE
+          typename __hip_internal::enable_if<__hip_internal::is_integral<T>{}>::type* = nullptr>
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator&(
+    const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept {
+  return HIP_vector_type<T, n>{x} &= y;  // works on a copy of x, so neither operand is modified
+}
+template <typename T, unsigned int n, typename U,  // vector & scalar
+          typename __hip_internal::enable_if<__hip_internal::is_integral<T>{}>::type* = nullptr>
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator&(const HIP_vector_type<T, n>& x,
+                                                                 U y) noexcept {
+  return HIP_vector_type<T, n>{x} &= make_vector_type<T, n>(y);  // y is broadcast to all n components
+}
+template <typename T, unsigned int n, typename U,  // scalar & vector
+          typename __hip_internal::enable_if<__hip_internal::is_integral<T>{}>::type* = nullptr>
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator&(
+    U x, const HIP_vector_type<T, n>& y) noexcept {
+  return make_vector_type<T, n>(x) &= y;  // x is broadcast to all n components
+}
+
+template <typename T, unsigned int n,  // vector >> vector overload
+          typename __hip_internal::enable_if<__hip_internal::is_integral<T>{}>* = nullptr>  // NOTE(review): no ::type, so this is a pointer to the enable_if struct itself - confirm it really rejects non-integral T
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator>>(
+    const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept {
+  return HIP_vector_type<T, n>{x} >>= y;  // >>= runs on a temporary built from x, so neither argument is modified
+}
+template <typename T, unsigned int n, typename U,  // vector >> U overload (U is unconstrained; presumably a scalar shift count)
+          typename __hip_internal::enable_if<__hip_internal::is_integral<T>{}>* = nullptr>
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator>>(const HIP_vector_type<T, n>& x,
+                                                                  U y) noexcept {
+  return HIP_vector_type<T, n>{x} >>= make_vector_type<T, n>(y);  // y first goes through make_vector_type<T, n>
+}
+template <typename T, unsigned int n, typename U,  // U >> vector overload; U is not restricted here
+          typename __hip_internal::enable_if<__hip_internal::is_integral<T>{}>* = nullptr>
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator>>(
+    U x, const HIP_vector_type<T, n>& y) noexcept {
+  return make_vector_type<T, n>(x) >>= y;  // x goes through make_vector_type<T, n>; >>= y is applied to that temporary
+}
+
+template <typename T, unsigned int n,  // vector << vector overload
+          typename __hip_internal::enable_if<__hip_internal::is_integral<T>{}>* = nullptr>  // NOTE(review): no ::type, so this is a pointer to the enable_if struct itself - confirm it really rejects non-integral T
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator<<(
+    const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept {
+  return HIP_vector_type<T, n>{x} <<= y;  // <<= runs on a temporary built from x, so neither argument is modified
+}
+template <typename T, unsigned int n, typename U,  // vector << U overload (U is unconstrained; presumably a scalar shift count)
+          typename __hip_internal::enable_if<__hip_internal::is_integral<T>{}>* = nullptr>
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator<<(const HIP_vector_type<T, n>& x,
+                                                                  U y) noexcept {
+  return HIP_vector_type<T, n>{x} <<= make_vector_type<T, n>(y);  // y first goes through make_vector_type<T, n>
+}
+template <typename T, unsigned int n, typename U,  // U << vector overload, offered only when U is an arithmetic type
+          typename __hip_internal::enable_if<__hip_internal::is_arithmetic<U>::value>::type* = nullptr,  // defaulted pointer form: a bare `...::type` parameter had no default and could not be deduced, so this overload was never viable
+          typename __hip_internal::enable_if<__hip_internal::is_integral<T>{}>* = nullptr>
+__HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator<<(
+    U x, const HIP_vector_type<T, n>& y) noexcept {
+  return make_vector_type<T, n>(x) <<= y;  // x goes through make_vector_type<T, n>; <<= y is applied to that temporary
+}
+
+/*
+ * Map HIP_vector_type<U, rankU> to HIP_vector_type<T, rankT>
+ */
+template <typename T, unsigned int rankT, typename U, unsigned int rankU>
+__forceinline__ __HOST_DEVICE__ typename __hip_internal::enable_if<
+    (rankT == 1 && rankU >= 1), const HIP_vector_type<T, rankT>>::type
+__hipMapVector(const HIP_vector_type<U, rankU>& u) {  // any rank -> 1 component: only u.x is carried over
+  using Mapped = HIP_vector_type<T, rankT>;
+  return Mapped(static_cast<T>(u.x));
+}
+
+template <typename T, unsigned int rankT, typename U, unsigned int rankU>
+__forceinline__ __HOST_DEVICE__ typename __hip_internal::enable_if<
+    (rankT == 2 && rankU == 1), const HIP_vector_type<T, rankT>>::type
+__hipMapVector(const HIP_vector_type<U, rankU>& u) {  // 1 -> 2 components: x is converted, y is zero-filled
+  using Mapped = HIP_vector_type<T, rankT>;
+  return Mapped(static_cast<T>(u.x), static_cast<T>(0));
+}
+
+template <typename T, unsigned int rankT, typename U, unsigned int rankU>
+__forceinline__ __HOST_DEVICE__ typename __hip_internal::enable_if<
+    (rankT == 2 && rankU >= 2), const HIP_vector_type<T, rankT>>::type
+__hipMapVector(const HIP_vector_type<U, rankU>& u) {  // 2 or more -> 2 components: x and y are converted, the rest dropped
+  using Mapped = HIP_vector_type<T, rankT>;
+  return Mapped(static_cast<T>(u.x), static_cast<T>(u.y));
+}
+
+template <typename T, unsigned int rankT, typename U, unsigned int rankU>
+__forceinline__ __HOST_DEVICE__ typename __hip_internal::enable_if<
+    (rankT == 4 && rankU == 1), const HIP_vector_type<T, rankT>>::type
+__hipMapVector(const HIP_vector_type<U, rankU>& u) {  // 1 -> 4 components: x is converted; y, z, w are zero-filled
+  using Mapped = HIP_vector_type<T, rankT>;
+  return Mapped(static_cast<T>(u.x), static_cast<T>(0), static_cast<T>(0),
+                static_cast<T>(0));
+}
+
+template <typename T, unsigned int rankT, typename U, unsigned int rankU>
+__forceinline__ __HOST_DEVICE__ typename __hip_internal::enable_if<
+    (rankT == 4 && rankU == 2), const HIP_vector_type<T, rankT>>::type
+__hipMapVector(const HIP_vector_type<U, rankU>& u) {  // 2 -> 4 components: x, y are converted; z, w are zero-filled
+  using Mapped = HIP_vector_type<T, rankT>;
+  return Mapped(static_cast<T>(u.x), static_cast<T>(u.y), static_cast<T>(0),
+                static_cast<T>(0));
+}
+
+template <typename T, unsigned int rankT, typename U, unsigned int rankU>
+__forceinline__ __HOST_DEVICE__ typename __hip_internal::enable_if<
+    (rankT == 4 && rankU == 4), const HIP_vector_type<T, rankT>>::type
+__hipMapVector(const HIP_vector_type<U, rankU>& u) {  // 4 -> 4 components: every lane is converted to T
+  using Mapped = HIP_vector_type<T, rankT>;
+  return Mapped(static_cast<T>(u.x), static_cast<T>(u.y), static_cast<T>(u.z),
+                static_cast<T>(u.w));
+}
+
+#define __MAKE_VECTOR_TYPE__(CUDA_name, T) /* alias form: CUDA_name1..CUDA_name4 name HIP_vector_type<T, 1..4> */ \
+  using CUDA_name##1 = HIP_vector_type<T, 1>;                                                      \
+  using CUDA_name##2 = HIP_vector_type<T, 2>;                                                      \
+  using CUDA_name##3 = HIP_vector_type<T, 3>;                                                      \
+  using CUDA_name##4 = HIP_vector_type<T, 4>;
+#else
+#define __MAKE_VECTOR_TYPE__(CUDA_name, T) /* struct form: CUDA_name1..CUDA_name4 are plain structs of 1..4 T fields */ \
+  typedef struct {                                                                                 \
+    T x;                                                                                           \
+  } CUDA_name##1;                                                                                  \
+  typedef struct {                                                                                 \
+    T x;                                                                                           \
+    T y;                                                                                           \
+  } CUDA_name##2;                                                                                  \
+  typedef struct {                                                                                 \
+    T x;                                                                                           \
+    T y;                                                                                           \
+    T z;                                                                                           \
+  } CUDA_name##3;                                                                                  \
+  typedef struct {                                                                                 \
+    T x;                                                                                           \
+    T y;                                                                                           \
+    T z;                                                                                           \
+    T w;                                                                                           \
+  } CUDA_name##4;
+#endif
+
+__MAKE_VECTOR_TYPE__(uchar, unsigned char);  // each line defines <name>1..<name>4, here uchar1..uchar4
+__MAKE_VECTOR_TYPE__(char, char);
+__MAKE_VECTOR_TYPE__(ushort, unsigned short);
+__MAKE_VECTOR_TYPE__(short, short);
+__MAKE_VECTOR_TYPE__(uint, unsigned int);
+__MAKE_VECTOR_TYPE__(int, int);
+__MAKE_VECTOR_TYPE__(ulong, unsigned long);
+__MAKE_VECTOR_TYPE__(long, long);
+__MAKE_VECTOR_TYPE__(ulonglong, unsigned long long);
+__MAKE_VECTOR_TYPE__(longlong, long long);
+__MAKE_VECTOR_TYPE__(float, float);
+__MAKE_VECTOR_TYPE__(double, double);  // the macro body already ends in ';', so the trailing ';' here is an empty declaration
+
+#else  // !defined(__has_attribute)
+
+#if defined(_MSC_VER)
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <immintrin.h>
+
+/*
+This is for compatibility with CUDA, which allows vector components to be accessed
+directly in C++ programs built with MSVC.
+The structs are wrapped in templates so that mangled names match the templated implementation.
+*/
+
+template <typename T, unsigned int n> struct HIP_vector_type;  // primary template is only declared; each supported size is a specialization
+
+// One template per vector size
+template <typename T> struct HIP_vector_type<T, 1> {  // 1 component: x shares storage with data
+  union {
+    struct {
+      T x;
+    };
+    T data;  // a single T, not an array, in the 1-component case
+  };
+};
+template <typename T> struct HIP_vector_type<T, 2> {  // 2 components: x, y share storage with data[2]
+  union {
+    struct {
+      T x;
+      T y;
+    };
+    T data[2];
+  };
+};
+template <typename T> struct HIP_vector_type<T, 3> {  // 3 components: x, y, z share storage with data[3]
+  union {
+    struct {
+      T x;
+      T y;
+      T z;
+    };
+    T data[3];
+  };
+};
+template <typename T> struct HIP_vector_type<T, 4> {  // 4 components: x, y, z, w share storage with data[4]
+  union {
+    struct {
+      T x;
+      T y;
+      T z;
+      T w;
+    };
+    T data[4];
+  };
+};
+// 8- and 16-length vectors do not have CUDA-style accessible components
+template <typename T> struct HIP_vector_type<T, 8> {  // array storage only
+  union {
+    T data[8];
+  };
+};
+template <typename T> struct HIP_vector_type<T, 16> {  // array storage only
+  union {
+    T data[16];
+  };
+};
+
+// Explicit specializations whose storage is an x86 SIMD intrinsic type (MSVC-specific definitions)
+template <> struct HIP_vector_type<char, 8> {  // 8 x char stored in one 64-bit __m64
+  union {
+    __m64 data;
+  };
+};
+template <> struct HIP_vector_type<char, 16> {  // 16 x char stored in one 128-bit __m128i
+  union {
+    __m128i data;
+  };
+};
+
+template <> struct HIP_vector_type<unsigned char, 8> {  // 8 x unsigned char stored in one 64-bit __m64
+  union {
+    __m64 data;
+  };
+};
+template <> struct HIP_vector_type<unsigned char, 16> {  // 16 x unsigned char stored in one 128-bit __m128i
+  union {
+    __m128i data;
+  };
+};
+
+template <> struct HIP_vector_type<short, 4> {  // x, y, z, w share storage with one 64-bit __m64
+  union {
+    struct {
+      short x;
+      short y;
+      short z;
+      short w;
+    };
+    __m64 data;
+  };
+};
+template <> struct HIP_vector_type<short, 8> {  // 8 x short stored in one 128-bit __m128i; no named components
+  union {
+    __m128i data;
+  };
+};
+template <> struct HIP_vector_type<short, 16> {  // 16 x short stored in two __m128i; no named components
+  union {
+    __m128i data[2];
+  };
+};
+
+template <> struct HIP_vector_type<unsigned short, 4> {  // x, y, z, w share storage with one 64-bit __m64
+  union {
+    struct {
+      unsigned short x;
+      unsigned short y;
+      unsigned short z;
+      unsigned short w;
+    };
+    __m64 data;
+  };
+};
+template <> struct HIP_vector_type<unsigned short, 8> {  // 8 x unsigned short stored in one 128-bit __m128i
+  union {
+    __m128i data;
+  };
+};
+template <> struct HIP_vector_type<unsigned short, 16> {  // 16 x unsigned short stored in two __m128i
+  union {
+    __m128i data[2];
+  };
+};
+
+template <> struct HIP_vector_type<int, 2> {  // x, y share storage with one 64-bit __m64
+  union {
+    struct {
+      int x;
+      int y;
+    };
+    __m64 data;
+  };
+};
+template <> struct HIP_vector_type<int, 4> {  // x, y, z, w share storage with one 128-bit __m128i
+  union {
+    struct {
+      int x;
+      int y;
+      int z;
+      int w;
+    };
+    __m128i data;
+  };
+};
+template <> struct HIP_vector_type<int, 8> {  // 8 x int stored in two __m128i; no named components
+  union {
+    __m128i data[2];
+  };
+};
+template <> struct HIP_vector_type<int, 16> {  // 16 x int stored in four __m128i; no named components
+  union {
+    __m128i data[4];
+  };
+};
+
+template <> struct HIP_vector_type<unsigned int, 2> {  // x, y share storage with one 64-bit __m64
+  union {
+    struct {
+      unsigned int x;
+      unsigned int y;
+    };
+    __m64 data;
+  };
+};
+template <> struct HIP_vector_type<unsigned int, 4> {  // x, y, z, w share storage with one 128-bit __m128i
+  union {
+    struct {
+      unsigned int x;
+      unsigned int y;
+      unsigned int z;
+      unsigned int w;
+    };
+    __m128i data;
+  };
+};
+template <> struct HIP_vector_type<unsigned int, 8> {  // 8 x unsigned int stored in two __m128i
+  union {
+    __m128i data[2];
+  };
+};
+template <> struct HIP_vector_type<unsigned int, 16> {  // 16 x unsigned int stored in four __m128i
+  union {
+    __m128i data[4];
+  };
+};
+
+// MSVC uses 32-bit longs and 64-bit long longs, so the long vectors below spell their fields as std::int32_t
+template <> struct HIP_vector_type<long, 1> {  // NOTE(review): fields are std::int32_t, not long - confirm the type difference is intended
+  union {
+    struct {
+      std::int32_t x;
+    };
+    std::int32_t data;
+  };
+};
+template <> struct HIP_vector_type<long, 2> {  // x, y share storage with one 64-bit __m64
+  union {
+    struct {
+      std::int32_t x;
+      std::int32_t y;
+    };
+    __m64 data;
+  };
+};
+template <> struct HIP_vector_type<long, 3> {  // no SIMD type is used for 3 components; plain array storage
+  union {
+    struct {
+      std::int32_t x;
+      std::int32_t y;
+      std::int32_t z;
+    };
+    std::int32_t data[3];
+  };
+};
+template <> struct HIP_vector_type<long, 4> {  // x, y, z, w share storage with one 128-bit __m128i
+  union {
+    struct {
+      std::int32_t x;
+      std::int32_t y;
+      std::int32_t z;
+      std::int32_t w;
+    };
+    __m128i data;
+  };
+};
+template <> struct HIP_vector_type<long, 8> {  // 8 x 32-bit stored in two __m128i; no named components
+  union {
+    __m128i data[2];
+  };
+};
+template <> struct HIP_vector_type<long, 16> {  // 16 x 32-bit stored in four __m128i; no named components
+  union {
+    __m128i data[4];
+  };
+};
+
+template <> struct HIP_vector_type<unsigned long, 1> {  // NOTE(review): fields are std::uint32_t, not unsigned long - confirm the type difference is intended
+  union {
+    struct {
+      std::uint32_t x;
+    };
+    std::uint32_t data;
+  };
+};
+template <> struct HIP_vector_type<unsigned long, 2> {  // x, y share storage with one 64-bit __m64
+  union {
+    struct {
+      std::uint32_t x;
+      std::uint32_t y;
+    };
+    __m64 data;
+  };
+};
+template <> struct HIP_vector_type<unsigned long, 3> {  // no SIMD type is used for 3 components; plain array storage
+  union {
+    struct {
+      std::uint32_t x;
+      std::uint32_t y;
+      std::uint32_t z;
+    };
+    std::uint32_t data[3];
+  };
+};
+template <> struct HIP_vector_type<unsigned long, 4> {  // x, y, z, w share storage with one 128-bit __m128i
+  union {
+    struct {
+      std::uint32_t x;
+      std::uint32_t y;
+      std::uint32_t z;
+      std::uint32_t w;
+    };
+    __m128i data;
+  };
+};
+template <> struct HIP_vector_type<unsigned long, 8> {  // 8 x 32-bit stored in two __m128i; no named components
+  union {
+    __m128i data[2];
+  };
+};
+template <> struct HIP_vector_type<unsigned long, 16> {  // 16 x 32-bit stored in four __m128i; no named components
+  union {
+    __m128i data[4];
+  };
+};
+
+template <> struct HIP_vector_type<long long, 1> {  // one 64-bit x sharing storage with one __m64
+  union {
+    struct {
+      std::int64_t x;
+    };
+    __m64 data;
+  };
+};
+template <> struct HIP_vector_type<long long, 2> {  // x, y share storage with one 128-bit __m128i
+  union {
+    struct {
+      std::int64_t x;
+      std::int64_t y;
+    };
+    __m128i data;
+  };
+};
+template <> struct HIP_vector_type<long long, 3> {  // x, y, z share storage with three 64-bit __m64
+  union {
+    struct {
+      std::int64_t x;
+      std::int64_t y;
+      std::int64_t z;
+    };
+    __m64 data[3];
+  };
+};
+template <> struct HIP_vector_type<long long, 4> {  // x, y, z, w share storage with two __m128i
+  union {
+    struct {
+      std::int64_t x;
+      std::int64_t y;
+      std::int64_t z;
+      std::int64_t w;
+    };
+    __m128i data[2];
+  };
+};
+template <> struct HIP_vector_type<long long, 8> {  // 8 x 64-bit stored in four __m128i; no named components
+  union {
+    __m128i data[4];
+  };
+};
+template <> struct HIP_vector_type<long long, 16> {  // 16 x 64-bit stored in eight __m128i; no named components
+  union {
+    __m128i data[8];
+  };
+};
+
+template <> struct HIP_vector_type<unsigned long long, 1> {  // one 64-bit x sharing storage with one __m64
+  union {
+    struct {
+      std::uint64_t x;
+    };
+    __m64 data;
+  };
+};
+template <> struct HIP_vector_type<unsigned long long, 2> {  // x, y share storage with one 128-bit __m128i
+  union {
+    struct {
+      std::uint64_t x;
+      std::uint64_t y;
+    };
+    __m128i data;
+  };
+};
+template <> struct HIP_vector_type<unsigned long long, 3> {  // x, y, z share storage with three 64-bit __m64
+  union {
+    struct {
+      std::uint64_t x;
+      std::uint64_t y;
+      std::uint64_t z;
+    };
+    __m64 data[3];
+  };
+};
+template <> struct HIP_vector_type<unsigned long long, 4> {  // x, y, z, w share storage with two __m128i
+  union {
+    struct {
+      std::uint64_t x;
+      std::uint64_t y;
+      std::uint64_t z;
+      std::uint64_t w;
+    };
+    __m128i data[2];
+  };
+};
+template <> struct HIP_vector_type<unsigned long long, 8> {  // 8 x 64-bit stored in four __m128i; no named components
+  union {
+    __m128i data[4];
+  };
+};
+template <> struct HIP_vector_type<unsigned long long, 16> {  // 16 x 64-bit stored in eight __m128i; no named components
+  union {
+    __m128i data[8];
+  };
+};
+
+template <> struct HIP_vector_type<float, 2> {  // x, y share storage with one 64-bit __m64
+  union {
+    struct {
+      float x;
+      float y;
+    };
+    __m64 data;
+  };
+};
+template <> struct HIP_vector_type<float, 4> {  // x, y, z, w share storage with one 128-bit __m128
+  union {
+    struct {
+      float x;
+      float y;
+      float z;
+      float w;
+    };
+    __m128 data;
+  };
+};
+template <> struct HIP_vector_type<float, 8> {  // 8 x float stored in one 256-bit __m256; no named components
+  union {
+    __m256 data;
+  };
+};
+template <> struct HIP_vector_type<float, 16> {  // 16 x float stored in two __m256; no named components
+  union {
+    __m256 data[2];
+  };
+};
+
+template <> struct HIP_vector_type<double, 2> {  // x, y share storage with one 128-bit __m128d
+  union {
+    struct {
+      double x;
+      double y;
+    };
+    __m128d data;
+  };
+};
+template <> struct HIP_vector_type<double, 4> {  // x, y, z, w share storage with one 256-bit __m256d
+  union {
+    struct {
+      double x;
+      double y;
+      double z;
+      double w;
+    };
+    __m256d data;
+  };
+};
+template <> struct HIP_vector_type<double, 8> {  // 8 x double stored in two __m256d; no named components
+  union {
+    __m256d data[2];
+  };
+};
+template <> struct HIP_vector_type<double, 16> {  // 16 x double stored in four __m256d; no named components
+  union {
+    __m256d data[4];
+  };
+};
+
+// Type aliasing: CUDA-style names <type><N> (N = 1, 2, 3, 4, 8, 16) for the HIP_vector_type specializations
+using char1 = HIP_vector_type<char, 1>;
+using char2 = HIP_vector_type<char, 2>;
+using char3 = HIP_vector_type<char, 3>;
+using char4 = HIP_vector_type<char, 4>;
+using char8 = HIP_vector_type<char, 8>;
+using char16 = HIP_vector_type<char, 16>;
+using uchar1 = HIP_vector_type<unsigned char, 1>;
+using uchar2 = HIP_vector_type<unsigned char, 2>;
+using uchar3 = HIP_vector_type<unsigned char, 3>;
+using uchar4 = HIP_vector_type<unsigned char, 4>;
+using uchar8 = HIP_vector_type<unsigned char, 8>;
+using uchar16 = HIP_vector_type<unsigned char, 16>;
+using short1 = HIP_vector_type<short, 1>;
+using short2 = HIP_vector_type<short, 2>;
+using short3 = HIP_vector_type<short, 3>;
+using short4 = HIP_vector_type<short, 4>;
+using short8 = HIP_vector_type<short, 8>;
+using short16 = HIP_vector_type<short, 16>;
+using ushort1 = HIP_vector_type<unsigned short, 1>;
+using ushort2 = HIP_vector_type<unsigned short, 2>;
+using ushort3 = HIP_vector_type<unsigned short, 3>;
+using ushort4 = HIP_vector_type<unsigned short, 4>;
+using ushort8 = HIP_vector_type<unsigned short, 8>;
+using ushort16 = HIP_vector_type<unsigned short, 16>;
+using int1 = HIP_vector_type<int, 1>;
+using int2 = HIP_vector_type<int, 2>;
+using int3 = HIP_vector_type<int, 3>;
+using int4 = HIP_vector_type<int, 4>;
+using int8 = HIP_vector_type<int, 8>;  // NOTE(review): int8/int16/uint8/uint16 here are 8- and 16-lane vectors, unlike the stdint-style names
+using int16 = HIP_vector_type<int, 16>;
+using uint1 = HIP_vector_type<unsigned int, 1>;
+using uint2 = HIP_vector_type<unsigned int, 2>;
+using uint3 = HIP_vector_type<unsigned int, 3>;
+using uint4 = HIP_vector_type<unsigned int, 4>;
+using uint8 = HIP_vector_type<unsigned int, 8>;
+using uint16 = HIP_vector_type<unsigned int, 16>;
+using long1 = HIP_vector_type<long, 1>;
+using long2 = HIP_vector_type<long, 2>;
+using long3 = HIP_vector_type<long, 3>;
+using long4 = HIP_vector_type<long, 4>;
+using long8 = HIP_vector_type<long, 8>;
+using long16 = HIP_vector_type<long, 16>;
+using ulong1 = HIP_vector_type<unsigned long, 1>;
+using ulong2 = HIP_vector_type<unsigned long, 2>;
+using ulong3 = HIP_vector_type<unsigned long, 3>;
+using ulong4 = HIP_vector_type<unsigned long, 4>;
+using ulong8 = HIP_vector_type<unsigned long, 8>;
+using ulong16 = HIP_vector_type<unsigned long, 16>;
+using longlong1 = HIP_vector_type<long long, 1>;
+using longlong2 = HIP_vector_type<long long, 2>;
+using longlong3 = HIP_vector_type<long long, 3>;
+using longlong4 = HIP_vector_type<long long, 4>;
+using longlong8 = HIP_vector_type<long long, 8>;
+using longlong16 = HIP_vector_type<long long, 16>;
+using ulonglong1 = HIP_vector_type<unsigned long long, 1>;
+using ulonglong2 = HIP_vector_type<unsigned long long, 2>;
+using ulonglong3 = HIP_vector_type<unsigned long long, 3>;
+using ulonglong4 = HIP_vector_type<unsigned long long, 4>;
+using ulonglong8 = HIP_vector_type<unsigned long long, 8>;
+using ulonglong16 = HIP_vector_type<unsigned long long, 16>;
+using float1 = HIP_vector_type<float, 1>;
+using float2 = HIP_vector_type<float, 2>;
+using float3 = HIP_vector_type<float, 3>;
+using float4 = HIP_vector_type<float, 4>;
+using float8 = HIP_vector_type<float, 8>;
+using float16 = HIP_vector_type<float, 16>;  // 16-lane float vector, not a half-precision scalar
+using double1 = HIP_vector_type<double, 1>;
+using double2 = HIP_vector_type<double, 2>;
+using double3 = HIP_vector_type<double, 3>;
+using double4 = HIP_vector_type<double, 4>;
+using double8 = HIP_vector_type<double, 8>;
+using double16 = HIP_vector_type<double, 16>;
+
+#else  // !defined(_MSC_VER)
+
+/*
+This is for compatibility with CUDA, which allows vector components to be accessed
+directly in C++ programs; this branch is the fallback for compilers other than MSVC.
+The structs are wrapped in templates so that mangled names match the templated implementation.
+*/
+
+template <typename T, unsigned int n> struct HIP_vector_type;  // primary template is only declared; each supported size is a specialization
+
+// One template per vector size
+template <typename T> struct HIP_vector_type<T, 1> {  // 1 component: x shares storage with data
+  union {
+    struct {
+      T x;
+    };
+    T data;  // a single T, not an array, in the 1-component case
+  };
+};
+template <typename T> struct HIP_vector_type<T, 2> {  // 2 components: x, y share storage with data[2]
+  union {
+    struct {
+      T x;
+      T y;
+    };
+    T data[2];
+  };
+};
+template <typename T> struct HIP_vector_type<T, 3> {  // 3 components: x, y, z share storage with data[3]
+  union {
+    struct {
+      T x;
+      T y;
+      T z;
+    };
+    T data[3];
+  };
+};
+template <typename T> struct HIP_vector_type<T, 4> {  // 4 components: x, y, z, w share storage with data[4]
+  union {
+    struct {
+      T x;
+      T y;
+      T z;
+      T w;
+    };
+    T data[4];
+  };
+};
+// 8- and 16-length vectors do not have CUDA-style accessible components
+template <typename T> struct HIP_vector_type<T, 8> {  // array storage only
+  union {
+    T data[8];
+  };
+};
+template <typename T> struct HIP_vector_type<T, 16> {  // array storage only
+  union {
+    T data[16];
+  };
+};
+
+// Type aliasing: CUDA-style names <type><N> (N = 1, 2, 3, 4, 8, 16) for the generic HIP_vector_type specializations
+using char1 = HIP_vector_type<char, 1>;
+using char2 = HIP_vector_type<char, 2>;
+using char3 = HIP_vector_type<char, 3>;
+using char4 = HIP_vector_type<char, 4>;
+using char8 = HIP_vector_type<char, 8>;
+using char16 = HIP_vector_type<char, 16>;
+using uchar1 = HIP_vector_type<unsigned char, 1>;
+using uchar2 = HIP_vector_type<unsigned char, 2>;
+using uchar3 = HIP_vector_type<unsigned char, 3>;
+using uchar4 = HIP_vector_type<unsigned char, 4>;
+using uchar8 = HIP_vector_type<unsigned char, 8>;
+using uchar16 = HIP_vector_type<unsigned char, 16>;
+using short1 = HIP_vector_type<short, 1>;
+using short2 = HIP_vector_type<short, 2>;
+using short3 = HIP_vector_type<short, 3>;
+using short4 = HIP_vector_type<short, 4>;
+using short8 = HIP_vector_type<short, 8>;
+using short16 = HIP_vector_type<short, 16>;
+using ushort1 = HIP_vector_type<unsigned short, 1>;
+using ushort2 = HIP_vector_type<unsigned short, 2>;
+using ushort3 = HIP_vector_type<unsigned short, 3>;
+using ushort4 = HIP_vector_type<unsigned short, 4>;
+using ushort8 = HIP_vector_type<unsigned short, 8>;
+using ushort16 = HIP_vector_type<unsigned short, 16>;
+using int1 = HIP_vector_type<int, 1>;
+using int2 = HIP_vector_type<int, 2>;
+using int3 = HIP_vector_type<int, 3>;
+using int4 = HIP_vector_type<int, 4>;
+using int8 = HIP_vector_type<int, 8>;  // NOTE(review): int8/int16/uint8/uint16 here are 8- and 16-lane vectors, unlike the stdint-style names
+using int16 = HIP_vector_type<int, 16>;
+using uint1 = HIP_vector_type<unsigned int, 1>;
+using uint2 = HIP_vector_type<unsigned int, 2>;
+using uint3 = HIP_vector_type<unsigned int, 3>;
+using uint4 = HIP_vector_type<unsigned int, 4>;
+using uint8 = HIP_vector_type<unsigned int, 8>;
+using uint16 = HIP_vector_type<unsigned int, 16>;
+using long1 = HIP_vector_type<long, 1>;
+using long2 = HIP_vector_type<long, 2>;
+using long3 = HIP_vector_type<long, 3>;
+using long4 = HIP_vector_type<long, 4>;
+using long8 = HIP_vector_type<long, 8>;
+using long16 = HIP_vector_type<long, 16>;
+using ulong1 = HIP_vector_type<unsigned long, 1>;
+using ulong2 = HIP_vector_type<unsigned long, 2>;
+using ulong3 = HIP_vector_type<unsigned long, 3>;
+using ulong4 = HIP_vector_type<unsigned long, 4>;
+using ulong8 = HIP_vector_type<unsigned long, 8>;
+using ulong16 = HIP_vector_type<unsigned long, 16>;
+using longlong1 = HIP_vector_type<long long, 1>;
+using longlong2 = HIP_vector_type<long long, 2>;
+using longlong3 = HIP_vector_type<long long, 3>;
+using longlong4 = HIP_vector_type<long long, 4>;
+using longlong8 = HIP_vector_type<long long, 8>;
+using longlong16 = HIP_vector_type<long long, 16>;
+using ulonglong1 = HIP_vector_type<unsigned long long, 1>;
+using ulonglong2 = HIP_vector_type<unsigned long long, 2>;
+using ulonglong3 = HIP_vector_type<unsigned long long, 3>;
+using ulonglong4 = HIP_vector_type<unsigned long long, 4>;
+using ulonglong8 = HIP_vector_type<unsigned long long, 8>;
+using ulonglong16 = HIP_vector_type<unsigned long long, 16>;
+using float1 = HIP_vector_type<float, 1>;
+using float2 = HIP_vector_type<float, 2>;
+using float3 = HIP_vector_type<float, 3>;
+using float4 = HIP_vector_type<float, 4>;
+using float8 = HIP_vector_type<float, 8>;
+using float16 = HIP_vector_type<float, 16>;  // 16-lane float vector, not a half-precision scalar
+using double1 = HIP_vector_type<double, 1>;
+using double2 = HIP_vector_type<double, 2>;
+using double3 = HIP_vector_type<double, 3>;
+using double4 = HIP_vector_type<double, 4>;
+using double8 = HIP_vector_type<double, 8>;
+using double16 = HIP_vector_type<double, 16>;
+
+#endif  // defined(_MSC_VER)
+#endif  // defined(__has_attribute)
+
+#ifdef __cplusplus
+#define DECLOP_MAKE_ONE_COMPONENT(comp, type)                                                      \
+  /* make_<type>(x): brace-initialize a one-component vector and return it. */                     \
+  static inline __HOST_DEVICE__ type make_##type(comp x) {                                         \
+    return type{x};                                                                                \
+  }
+
+#define DECLOP_MAKE_TWO_COMPONENT(comp, type)                                                      \
+  /* make_<type>(x, y): brace-initialize a two-component vector and return it. */                  \
+  static inline __HOST_DEVICE__ type make_##type(comp x, comp y) {                                 \
+    return type{x, y};                                                                             \
+  }
+
+#define DECLOP_MAKE_THREE_COMPONENT(comp, type)                                                    \
+  /* make_<type>(x, y, z): brace-initialize a three-component vector and return it. */             \
+  static inline __HOST_DEVICE__ type make_##type(comp x, comp y, comp z) {                         \
+    return type{x, y, z};                                                                          \
+  }
+
+#define DECLOP_MAKE_FOUR_COMPONENT(comp, type)                                                     \
+  /* make_<type>(x, y, z, w): brace-initialize a four-component vector and return it. */           \
+  static inline __HOST_DEVICE__ type make_##type(comp x, comp y, comp z, comp w) {                 \
+    return type{x, y, z, w};                                                                       \
+  }
+#else
+#define DECLOP_MAKE_ONE_COMPONENT(comp, type) /* C path: defines make_<type>(x), assigning the field instead of brace-init */ \
+  static inline __HOST_DEVICE__ type make_##type(comp x) {                                         \
+    type r;                                                                                        \
+    r.x = x;                                                                                       \
+    return r;                                                                                      \
+  }
+
+#define DECLOP_MAKE_TWO_COMPONENT(comp, type) /* C path: defines make_<type>(x, y) */               \
+  static inline __HOST_DEVICE__ type make_##type(comp x, comp y) {                                 \
+    type r;                                                                                        \
+    r.x = x;                                                                                       \
+    r.y = y;                                                                                       \
+    return r;                                                                                      \
+  }
+
+#define DECLOP_MAKE_THREE_COMPONENT(comp, type) /* C path: defines make_<type>(x, y, z) */          \
+  static inline __HOST_DEVICE__ type make_##type(comp x, comp y, comp z) {                         \
+    type r;                                                                                        \
+    r.x = x;                                                                                       \
+    r.y = y;                                                                                       \
+    r.z = z;                                                                                       \
+    return r;                                                                                      \
+  }
+
+#define DECLOP_MAKE_FOUR_COMPONENT(comp, type) /* C path: defines make_<type>(x, y, z, w) */        \
+  static inline __HOST_DEVICE__ type make_##type(comp x, comp y, comp z, comp w) {                 \
+    type r;                                                                                        \
+    r.x = x;                                                                                       \
+    r.y = y;                                                                                       \
+    r.z = z;                                                                                       \
+    r.w = w;                                                                                       \
+    return r;                                                                                      \
+  }
+#endif
+
+DECLOP_MAKE_ONE_COMPONENT(unsigned char, uchar1);  // each line defines one make_<type>() factory, here make_uchar1(unsigned char)
+DECLOP_MAKE_TWO_COMPONENT(unsigned char, uchar2);
+DECLOP_MAKE_THREE_COMPONENT(unsigned char, uchar3);
+DECLOP_MAKE_FOUR_COMPONENT(unsigned char, uchar4);
+
+DECLOP_MAKE_ONE_COMPONENT(signed char, char1);  // note: parameters are signed char, while the char vectors are declared over plain char
+DECLOP_MAKE_TWO_COMPONENT(signed char, char2);
+DECLOP_MAKE_THREE_COMPONENT(signed char, char3);
+DECLOP_MAKE_FOUR_COMPONENT(signed char, char4);
+
+DECLOP_MAKE_ONE_COMPONENT(unsigned short, ushort1);
+DECLOP_MAKE_TWO_COMPONENT(unsigned short, ushort2);
+DECLOP_MAKE_THREE_COMPONENT(unsigned short, ushort3);
+DECLOP_MAKE_FOUR_COMPONENT(unsigned short, ushort4);
+
+DECLOP_MAKE_ONE_COMPONENT(signed short, short1);
+DECLOP_MAKE_TWO_COMPONENT(signed short, short2);
+DECLOP_MAKE_THREE_COMPONENT(signed short, short3);
+DECLOP_MAKE_FOUR_COMPONENT(signed short, short4);
+
+DECLOP_MAKE_ONE_COMPONENT(unsigned int, uint1);
+DECLOP_MAKE_TWO_COMPONENT(unsigned int, uint2);
+DECLOP_MAKE_THREE_COMPONENT(unsigned int, uint3);
+DECLOP_MAKE_FOUR_COMPONENT(unsigned int, uint4);
+
+DECLOP_MAKE_ONE_COMPONENT(signed int, int1);
+DECLOP_MAKE_TWO_COMPONENT(signed int, int2);
+DECLOP_MAKE_THREE_COMPONENT(signed int, int3);
+DECLOP_MAKE_FOUR_COMPONENT(signed int, int4);
+
+DECLOP_MAKE_ONE_COMPONENT(float, float1);
+DECLOP_MAKE_TWO_COMPONENT(float, float2);
+DECLOP_MAKE_THREE_COMPONENT(float, float3);
+DECLOP_MAKE_FOUR_COMPONENT(float, float4);
+
+DECLOP_MAKE_ONE_COMPONENT(double, double1);
+DECLOP_MAKE_TWO_COMPONENT(double, double2);
+DECLOP_MAKE_THREE_COMPONENT(double, double3);
+DECLOP_MAKE_FOUR_COMPONENT(double, double4);
+
+DECLOP_MAKE_ONE_COMPONENT(unsigned long, ulong1);
+DECLOP_MAKE_TWO_COMPONENT(unsigned long, ulong2);
+DECLOP_MAKE_THREE_COMPONENT(unsigned long, ulong3);
+DECLOP_MAKE_FOUR_COMPONENT(unsigned long, ulong4);
+
+DECLOP_MAKE_ONE_COMPONENT(signed long, long1);
+DECLOP_MAKE_TWO_COMPONENT(signed long, long2);
+DECLOP_MAKE_THREE_COMPONENT(signed long, long3);
+DECLOP_MAKE_FOUR_COMPONENT(signed long, long4);
+
+DECLOP_MAKE_ONE_COMPONENT(unsigned long long, ulonglong1);
+DECLOP_MAKE_TWO_COMPONENT(unsigned long long, ulonglong2);
+DECLOP_MAKE_THREE_COMPONENT(unsigned long long, ulonglong3);
+DECLOP_MAKE_FOUR_COMPONENT(unsigned long long, ulonglong4);
+
+DECLOP_MAKE_ONE_COMPONENT(signed long long, longlong1);
+DECLOP_MAKE_TWO_COMPONENT(signed long long, longlong2);
+DECLOP_MAKE_THREE_COMPONENT(signed long long, longlong3);
+DECLOP_MAKE_FOUR_COMPONENT(signed long long, longlong4);  // no factories are generated here for the 8- and 16-component types
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_math_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_math_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..11e811d3b744fcbb373bae2043e2aa7507abef59
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_math_functions.h
@@ -0,0 +1,85 @@
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#if !defined(__HIPCC_RTC__)
+#include "hip_fp16_math_fwd.h"
+#include "amd_hip_vector_types.h"
+#include "math_fwd.h"
+
+#include <hip/amd_detail/host_defines.h>
+
+#include <algorithm>
+// assert.h is only for the host version of assert.
+// The device version of assert is implemented in hip/amd_detail/hip_runtime.h.
+// Users should include hip_runtime.h for the device version of assert.
+#if !__HIP_DEVICE_COMPILE__
+#include <assert.h>
+#endif
+#include <limits.h>
+#include <limits>
+#include <stdint.h>
+#endif  // !defined(__HIPCC_RTC__)
+
+#pragma push_macro("__DEVICE__")
+#pragma push_macro("__RETURN_TYPE")
+
+#define __DEVICE__ static __device__
+#define __RETURN_TYPE bool
+
+// DOT FUNCTIONS
+#if defined(__clang__) && defined(__HIP__)
+__DEVICE__  // short2 overload. NOTE(review): presumably dot(a, b) + c, clamped when saturate is set -- confirm against OCKL.
+inline int amd_mixed_dot(short2 a, short2 b, int c, bool saturate) {
+  return __ockl_sdot2(get_native_vector(a), get_native_vector(b), c, saturate);  // operands go in as native vectors
+}
+__DEVICE__  // ushort2 overload: forwards to __ockl_udot2 with a uint accumulator and uint result.
+inline uint amd_mixed_dot(ushort2 a, ushort2 b, uint c, bool saturate) {
+  return __ockl_udot2(get_native_vector(a), get_native_vector(b), c, saturate);
+}
+__DEVICE__  // char4 overload: forwards to __ockl_sdot4.
+inline int amd_mixed_dot(char4 a, char4 b, int c, bool saturate) {
+  return __ockl_sdot4(get_native_vector(a), get_native_vector(b), c, saturate);
+}
+__DEVICE__  // uchar4 overload: forwards to __ockl_udot4.
+inline uint amd_mixed_dot(uchar4 a, uchar4 b, uint c, bool saturate) {
+  return __ockl_udot4(get_native_vector(a), get_native_vector(b), c, saturate);
+}
+__DEVICE__  // int overload: a and b are handed to __ockl_sdot8 as plain ints (no vector unwrapping).
+inline int amd_mixed_dot(int a, int b, int c, bool saturate) {
+  return __ockl_sdot8(a, b, c, saturate);  // NOTE(review): presumably each int packs eight 4-bit lanes -- confirm
+}
+__DEVICE__  // uint overload: unsigned counterpart, forwarded to __ockl_udot8.
+inline uint amd_mixed_dot(uint a, uint b, uint c, bool saturate) {
+  return __ockl_udot8(a, b, c, saturate);
+}
+#endif
+
+#pragma pop_macro("__DEVICE__")
+#pragma pop_macro("__RETURN_TYPE")
+// For backward compatibility.
+// Some HIP applications (e.g. TensorFlow) expect the __HIP_ARCH_* macros
+// to be defined after including math_functions.h.
+#if !defined(__HIPCC_RTC__)
+#include <hip/amd_detail/amd_hip_runtime.h>
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..ac17b1bf7954df515f79b531891c87fd8edd7dfe
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h
@@ -0,0 +1,394 @@
+/*
+Copyright (c) 2018 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_SURFACE_FUNCTIONS_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_SURFACE_FUNCTIONS_H
+
+#if defined(__cplusplus)
+
+#if !defined(__HIPCC_RTC__)
+#include <hip/surface_types.h>
+#include <hip/hip_vector_types.h>
+#include <hip/amd_detail/texture_fetch_functions.h>
+#include <hip/amd_detail/ockl_image.h>
+#endif
+
+#if defined(__HIPCC_RTC__)
+#define __HOST_DEVICE__ __device__
+#else
+#define __HOST_DEVICE__ __host__ __device__
+#endif
+
+#define __HIP_SURFACE_OBJECT_PARAMETERS_INIT                                                       \
+  unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)surfObj;  // declares local `i` by casting an in-scope `surfObj`
+
+/**
+ *  @defgroup SurfaceAPI Surface API
+ *  @{
+ */
+
+// CUDA-style callers pass x as a byte address; the image ops below need a pixel address, so convert.
+static __HOST_DEVICE__ __forceinline__ int __hipGetPixelAddr(int x, int format, int order) {
+  /*
+  * Step 1 table is indexed by the channel type value; the index meanings are:
+    typedef enum {
+      HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0,
+      HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1,
+      HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2,
+      HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3,
+      HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4,
+      HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 = 5,
+      HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 = 6,
+      HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010 = 7,
+      HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8,
+      HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9,
+      HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10,
+      HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11,
+      HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12,
+      HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13,
+      HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14,
+      HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT = 15
+    } hsa_ext_image_channel_type_t;
+  */
+  static const int kTypeLUT[] = {0, 1, 0, 1, 3, 1, 1, 1, 0, 1, 2, 0, 1, 2, 1, 2};
+  x = (kTypeLUT[format] != 3) ? x >> kTypeLUT[format] : x / 3;  // entry 3 means divide by 3; any other entry is a shift count
+
+  /*
+  * Step 2 table is indexed by the channel order value; the index meanings are:
+    typedef enum {
+      HSA_EXT_IMAGE_CHANNEL_ORDER_A = 0,
+      HSA_EXT_IMAGE_CHANNEL_ORDER_R = 1,
+      HSA_EXT_IMAGE_CHANNEL_ORDER_RX = 2,
+      HSA_EXT_IMAGE_CHANNEL_ORDER_RG = 3,
+      HSA_EXT_IMAGE_CHANNEL_ORDER_RGX = 4,
+      HSA_EXT_IMAGE_CHANNEL_ORDER_RA = 5,
+      HSA_EXT_IMAGE_CHANNEL_ORDER_RGB = 6,
+      HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX = 7,
+      HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA = 8,
+      HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA = 9,
+      HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB = 10,
+      HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR = 11,
+      HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB = 12,
+      HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX = 13,
+      HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA = 14,
+      HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA = 15,
+      HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY = 16,
+      HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE = 17,
+      HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH = 18,
+      HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19
+    } hsa_ext_image_channel_order_t;
+  */
+  static const int kOrderLUT[] = {0, 0, 1, 1, 3, 1, 3, 2, 2, 2, 2, 2, 3, 2, 2, 2, 0, 0, 0, 0};
+  return (kOrderLUT[order] != 3) ? x >> kOrderLUT[order] : x / 3;  // same rule, applied to the step 1 result
+}
+
+/** \brief Loads one element from a one-dimensional surface and stores it through \p data.
+ *
+ *  \tparam T Element type; the overload is enabled only if __hip_is_tex_surf_channel_type<T> holds.
+ *  \param data [out] Receives the loaded value after conversion with __hipMapFrom<T>.
+ *  \param surfObj [in] The surface descriptor.
+ *  \param x [in] x coordinate as a byte address; mapped to a pixel address by __hipGetPixelAddr.
+ *  \param boundaryMode [in] Unused: the value is explicitly discarded.
+ */
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surf1Dread(T* data, hipSurfaceObject_t surfObj, int x,
+                                                  int boundaryMode = hipBoundaryModeZero) {
+  __HIP_SURFACE_OBJECT_PARAMETERS_INIT;  // declares `i`, the descriptor pointer cast from surfObj
+  (void)boundaryMode;
+  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
+  auto tmp = __ockl_image_load_1D(i, x);
+  *data = __hipMapFrom<T>(tmp);
+}
+
+/** \brief Stores \p data into a one-dimensional surface at coordinate x.
+ *
+ *  \tparam T Element type; the overload is enabled only if __hip_is_tex_surf_channel_type<T> holds.
+ *  \param data [in] Value to store; converted with __hipMapTo<float4::Native_vec_> first.
+ *  \param surfObj [in] The surface descriptor.
+ *  \param x [in] x coordinate as a byte address; mapped to a pixel address by __hipGetPixelAddr.
+ */
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surf1Dwrite(T data, hipSurfaceObject_t surfObj, int x) {
+  __HIP_SURFACE_OBJECT_PARAMETERS_INIT  // declares `i`, the descriptor pointer cast from surfObj
+  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
+  auto tmp = __hipMapTo<float4::Native_vec_>(data);
+  __ockl_image_store_1D(i, x, tmp);
+}
+
+
+/** \brief Loads one element from a two-dimensional surface and stores it through \p data.
+ *
+ *  \tparam T Element type; the overload is enabled only if __hip_is_tex_surf_channel_type<T> holds.
+ *  \param data [out] Receives the loaded value after conversion with __hipMapFrom<T>.
+ *  \param surfObj [in] The surface descriptor.
+ *  \param x [in] x coordinate as a byte address; mapped to a pixel address by __hipGetPixelAddr.
+ *  \param y [in] y coordinate; used as given (only x is converted).
+ */
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surf2Dread(T* data, hipSurfaceObject_t surfObj, int x,
+                                                  int y) {
+  __HIP_SURFACE_OBJECT_PARAMETERS_INIT  // declares `i`, the descriptor pointer cast from surfObj
+  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
+  int2 coords{x, y};
+  auto tmp = __ockl_image_load_2D(i, get_native_vector(coords));
+  *data = __hipMapFrom<T>(tmp);
+}
+
+/** \brief Stores \p data into a two-dimensional surface at coordinate
+ *         x, y.
+ *
+ *  \tparam T Element type; the overload is enabled only if __hip_is_tex_surf_channel_type<T> holds.
+ *  \param data [in] Value to store; converted with __hipMapTo<float4::Native_vec_> first.
+ *  \param surfObj [in] The surface descriptor.
+ *  \param x [in] x coordinate as a byte address; mapped to a pixel address by __hipGetPixelAddr.
+ *  \param y [in] y coordinate; used as given (only x is converted).
+ */
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surf2Dwrite(T data, hipSurfaceObject_t surfObj, int x,
+                                                   int y) {
+  __HIP_SURFACE_OBJECT_PARAMETERS_INIT  // declares `i`, the descriptor pointer cast from surfObj
+  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
+  int2 coords{x, y};
+  auto tmp = __hipMapTo<float4::Native_vec_>(data);
+  __ockl_image_store_2D(i, get_native_vector(coords), tmp);
+}
+
+/** \brief Loads one element from a three-dimensional surface at coordinate
+ *         x, y, z and stores it through \p data.
+ *
+ *  \tparam T Element type; the overload is enabled only if __hip_is_tex_surf_channel_type<T> holds.
+ *  \param data [out] Receives the loaded value after conversion with __hipMapFrom<T>.
+ *  \param surfObj [in] The surface descriptor.
+ *  \param x [in] x coordinate as a byte address; mapped to a pixel address by __hipGetPixelAddr.
+ *  \param y [in] y coordinate; used as given (only x is converted).
+ *  \param z [in] z coordinate; used as given (only x is converted).
+ */
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surf3Dread(T* data, hipSurfaceObject_t surfObj, int x, int y,
+                                                  int z) {
+  __HIP_SURFACE_OBJECT_PARAMETERS_INIT  // declares `i`, the descriptor pointer cast from surfObj
+  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i));
+  int4 coords{x, y, z, 0};  // the 3D image op takes a four-component vector; the last component is set to 0
+  auto tmp = __ockl_image_load_3D(i, get_native_vector(coords));
+  *data = __hipMapFrom<T>(tmp);
+}
+
+/** \brief Stores \p data into a three-dimensional surface at coordinate
+ *         x, y, z.
+ *
+ *  \tparam T Element type; the overload is enabled only if __hip_is_tex_surf_channel_type<T> holds.
+ *  \param data [in] Value to store; converted with __hipMapTo<float4::Native_vec_> first.
+ *  \param surfObj [in] The surface descriptor.
+ *  \param x [in] x coordinate as a byte address; mapped to a pixel address by __hipGetPixelAddr.
+ *  \param y [in] y coordinate; used as given (only x is converted).
+ *  \param z [in] z coordinate; used as given (only x is converted).
+ */
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surf3Dwrite(T data, hipSurfaceObject_t surfObj, int x, int y,
+                                                   int z) {
+  __HIP_SURFACE_OBJECT_PARAMETERS_INIT  // declares `i`, the descriptor pointer cast from surfObj
+  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i));
+  int4 coords{x, y, z, 0};  // the 3D image op takes a four-component vector; the last component is set to 0
+  auto tmp = __hipMapTo<float4::Native_vec_>(data);
+  __ockl_image_store_3D(i, get_native_vector(coords), tmp);
+}
+
+/** \brief Loads one element from a one-dimensional layered surface at
+ *         coordinate x and layer index, and stores it through \p data.
+ *
+ *  \tparam T Element type; the overload is enabled only if __hip_is_tex_surf_channel_type<T> holds.
+ *  \param data [out] Receives the loaded value after conversion with __hipMapFrom<T>.
+ *  \param surfObj [in] The surface descriptor.
+ *  \param x [in] x coordinate as a byte address; mapped to a pixel address by __hipGetPixelAddr.
+ *  \param layer [in] Layer index; forwarded as the last argument of the image load.
+ */
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surf1DLayeredread(T* data, hipSurfaceObject_t surfObj, int x,
+                                                         int layer) {
+  __HIP_SURFACE_OBJECT_PARAMETERS_INIT  // declares `i`, the descriptor pointer cast from surfObj
+  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
+  auto tmp = __ockl_image_load_lod_1D(i, x, layer);  // NOTE(review): layer goes to the `_lod_` loader -- confirm it selects a layer, not a mip level
+  *data = __hipMapFrom<T>(tmp);
+}
+
+/** \brief Stores \p data into a one-dimensional layered surface at
+ *         coordinate x and layer index.
+ *
+ *  \tparam T Element type; the overload is enabled only if __hip_is_tex_surf_channel_type<T> holds.
+ *  \param data [in] Value to store; converted with __hipMapTo<float4::Native_vec_> first.
+ *  \param surfObj [in] The surface descriptor.
+ *  \param x [in] x coordinate as a byte address; mapped to a pixel address by __hipGetPixelAddr.
+ *  \param layer [in] Layer index; forwarded to the image store after the coordinate.
+ */
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surf1DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x,
+                                                          int layer) {
+  __HIP_SURFACE_OBJECT_PARAMETERS_INIT  // declares `i`, the descriptor pointer cast from surfObj
+  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
+  auto tmp = __hipMapTo<float4::Native_vec_>(data);
+  __ockl_image_store_lod_1D(i, x, layer, tmp);  // NOTE(review): layer goes to the `_lod_` store -- confirm it selects a layer, not a mip level
+}
+
+/** \brief Loads one element from a two-dimensional layered surface at
+ *         coordinate x, y and layer index, and stores it through \p data.
+ *
+ *  \tparam T Element type; the overload is enabled only if __hip_is_tex_surf_channel_type<T> holds.
+ *  \param data [out] Receives the loaded value after conversion with __hipMapFrom<T>.
+ *  \param surfObj [in] The surface descriptor.
+ *  \param x [in] x coordinate as a byte address; mapped to a pixel address by __hipGetPixelAddr.
+ *  \param y [in] y coordinate; used as given (only x is converted).
+ *  \param layer [in] Layer index; forwarded as the last argument of the image load.
+ */
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surf2DLayeredread(T* data, hipSurfaceObject_t surfObj, int x,
+                                                         int y, int layer) {
+  __HIP_SURFACE_OBJECT_PARAMETERS_INIT  // declares `i`, the descriptor pointer cast from surfObj
+  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
+  int2 coords{x, y};
+  auto tmp = __ockl_image_load_lod_2D(i, get_native_vector(coords), layer);  // NOTE(review): `_lod_` loader receives layer -- confirm intent
+  *data = __hipMapFrom<T>(tmp);
+}
+
+/** \brief Stores \p data into a two-dimensional layered surface at
+ *         coordinate x, y and layer index.
+ *
+ *  \tparam T Element type; the overload is enabled only if __hip_is_tex_surf_channel_type<T> holds.
+ *  \param data [in] Value to store; converted with __hipMapTo<float4::Native_vec_> first.
+ *  \param surfObj [in] The surface descriptor.
+ *  \param x [in] x coordinate as a byte address; mapped to a pixel address by __hipGetPixelAddr.
+ *  \param y [in] y coordinate; used as given (only x is converted).
+ *  \param layer [in] Layer index; forwarded to the image store after the coordinates.
+ */
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surf2DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x,
+                                                          int y, int layer) {
+  __HIP_SURFACE_OBJECT_PARAMETERS_INIT  // declares `i`, the descriptor pointer cast from surfObj
+  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
+  int2 coords{x, y};
+  auto tmp = __hipMapTo<float4::Native_vec_>(data);
+  __ockl_image_store_lod_2D(i, get_native_vector(coords), layer, tmp);  // NOTE(review): `_lod_` store receives layer -- confirm intent
+}
+
+/** \brief Loads one element from a cubemap surface at coordinate x, y and
+ *         face index, and stores it through \p data.
+ *
+ *  \tparam T Element type; the overload is enabled only if __hip_is_tex_surf_channel_type<T> holds.
+ *  \param data [out] Receives the loaded value after conversion with __hipMapFrom<T>.
+ *  \param surfObj [in] The surface descriptor.
+ *  \param x [in] x coordinate as a byte address; mapped to a pixel address by __hipGetPixelAddr.
+ *  \param y [in] y coordinate; used as given (only x is converted).
+ *  \param face [in] Face index; forwarded as the last argument of the cubemap load.
+ */
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surfCubemapread(T* data, hipSurfaceObject_t surfObj, int x,
+                                                       int y, int face) {
+  __HIP_SURFACE_OBJECT_PARAMETERS_INIT  // declares `i`, the descriptor pointer cast from surfObj
+  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));  // uses the 2D format queries
+  int2 coords{x, y};
+  auto tmp = __ockl_image_load_CM(i, get_native_vector(coords), face);
+  *data = __hipMapFrom<T>(tmp);
+}
+
+/** \brief Stores \p data into a cubemap surface at coordinate x, y and
+ *         face index.
+ *
+ *  \tparam T Element type; the overload is enabled only if __hip_is_tex_surf_channel_type<T> holds.
+ *  \param data [in] Value to store; converted with __hipMapTo<float4::Native_vec_> first.
+ *  \param surfObj [in] The surface descriptor.
+ *  \param x [in] x coordinate as a byte address; mapped to a pixel address by __hipGetPixelAddr.
+ *  \param y [in] y coordinate; used as given (only x is converted).
+ *  \param face [in] Face index; forwarded to the cubemap store after the coordinates.
+ */
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surfCubemapwrite(T data, hipSurfaceObject_t surfObj, int x,
+                                                        int y, int face) {
+  __HIP_SURFACE_OBJECT_PARAMETERS_INIT  // declares `i`, the descriptor pointer cast from surfObj
+  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));  // uses the 2D format queries
+  int2 coords{x, y};
+  auto tmp = __hipMapTo<float4::Native_vec_>(data);
+  __ockl_image_store_CM(i, get_native_vector(coords), face, tmp);
+}
+
+/** \brief Loads one element from a layered cubemap surface at coordinate x, y
+ *         and face, layer index, and stores it through \p data.
+ *
+ *  \tparam T Element type; the overload is enabled only if __hip_is_tex_surf_channel_type<T> holds.
+ *  \param data [out] Receives the loaded value after conversion with __hipMapFrom<T>.
+ *  \param surfObj [in] The surface descriptor.
+ *  \param x [in] x coordinate as a byte address; mapped to a pixel address by __hipGetPixelAddr.
+ *  \param y [in] y coordinate; used as given (only x is converted).
+ *  \param face [in] The face index where the value will be read out.
+ *  \param layer [in] The layer index where the value will be read out.
+ */
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surfCubemapLayeredread(T* data, hipSurfaceObject_t surfObj,
+                                                              int x, int y, int face, int layer) {
+  __HIP_SURFACE_OBJECT_PARAMETERS_INIT  // declares `i`, the descriptor pointer cast from surfObj
+  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));  // uses the 2D format queries
+  int2 coords{x, y};
+  auto tmp = __ockl_image_load_lod_CM(i, get_native_vector(coords), face, layer);  // NOTE(review): `_lod_` loader receives layer -- confirm intent
+  *data = __hipMapFrom<T>(tmp);
+}
+
+/** \brief Stores \p data into a layered cubemap surface at coordinate
+ *         x, y and face, layer index.
+ *
+ *  \tparam T Element type; the overload is enabled only if __hip_is_tex_surf_channel_type<T> holds.
+ *  \param data [in] Value to store (by value, like every other surf*write); converted via __hipMapTo.
+ *  \param surfObj [in] The surface descriptor.
+ *  \param x [in] x coordinate as a byte address; mapped to a pixel address by __hipGetPixelAddr.
+ *  \param y [in] y coordinate; used as given (only x is converted).
+ *  \param face [in] The face index where the data will be written.
+ *  \param layer [in] The layer index where the data will be written.
+ */
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surfCubemapLayeredwrite(T data, hipSurfaceObject_t surfObj,
+                                                               int x, int y, int face, int layer) {
+  __HIP_SURFACE_OBJECT_PARAMETERS_INIT  // declares `i`, the descriptor pointer cast from surfObj
+  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));  // uses the 2D format queries
+  int2 coords{x, y};
+  auto tmp = __hipMapTo<float4::Native_vec_>(data);  // the value itself is converted; the original passed a T* here
+  __ockl_image_store_lod_CM(i, get_native_vector(coords), face, layer, tmp);  // NOTE(review): `_lod_` store receives layer -- confirm intent
+}
+
+// Doxygen end group SurfaceAPI
+/**
+ * @}
+ */
+
+#endif
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..efd13acdbe480ac0614229054d866ea9502a3549
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h
@@ -0,0 +1,603 @@
+/*
+Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
+
+#if !defined(__HIPCC_RTC__)
+#include "device_library_decls.h"  // ockl warp functions
+#endif                             // !defined(__HIPCC_RTC__)
+
+#if defined(__has_attribute) && __has_attribute(maybe_undef)
+#define MAYBE_UNDEF __attribute__((maybe_undef))
+#else
+#define MAYBE_UNDEF
+#endif
+
+__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {  // unsigned-typed wrapper over __builtin_amdgcn_ds_bpermute
+  union {  // one storage slot viewed as int, unsigned or float
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.u = src;
+  tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);  // builtin is fed the int view; index is passed through unmodified
+  return tmp.u;  // result handed back through the unsigned view
+}
+
+__device__ static inline float __hip_ds_bpermutef(int index, float src) {  // float-typed wrapper over __builtin_amdgcn_ds_bpermute
+  union {  // one storage slot viewed as int, unsigned or float
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.f = src;
+  tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);  // the float's storage is read through the int view: no numeric conversion
+  return tmp.f;  // result handed back through the float view
+}
+
+__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {  // unsigned-typed wrapper over __builtin_amdgcn_ds_permute
+  union {  // one storage slot viewed as int, unsigned or float
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.u = src;
+  tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);  // builtin is fed the int view; index is passed through unmodified
+  return tmp.u;  // result handed back through the unsigned view
+}
+
+__device__ static inline float __hip_ds_permutef(int index, float src) {  // float-typed wrapper over __builtin_amdgcn_ds_permute
+  union {  // one storage slot viewed as int, unsigned or float
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.f = src;
+  tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);  // the float's storage is read through the int view: no numeric conversion
+  return tmp.f;  // result handed back through the float view
+}
+
+#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))  // pattern becomes a template argument, so it must be a constant expression
+#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))  // float counterpart of the macro above
+
+template <int pattern> __device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {  // unsigned wrapper; pattern fixed at compile time
+  union {  // one storage slot viewed as int, unsigned or float
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.u = src;
+  tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);  // builtin is fed the int view plus the template constant
+  return tmp.u;  // result handed back through the unsigned view
+}
+
+template <int pattern> __device__ static inline float __hip_ds_swizzlef_N(float src) {  // float wrapper; pattern fixed at compile time
+  union {  // one storage slot viewed as int, unsigned or float
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.f = src;
+  tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);  // the float's storage is read through the int view: no numeric conversion
+  return tmp.f;  // result handed back through the float view
+}
+
+#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl)                             \
+  __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))  // the four controls become template arguments (constant expressions)
+
+template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
+__device__ static inline int __hip_move_dpp_N(int src) {  // forwards src and the four compile-time controls to the builtin
+  return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
+}
+
+inline __device__ const struct final {  // unnamed type whose only instance is `warpSize`, declared at the closing brace
+  __device__ __attribute__((always_inline, const)) operator int() const noexcept {  // implicit conversion wherever an int is expected
+    return __builtin_amdgcn_wavefrontsize();  // value comes from the builtin on each conversion; the struct stores nothing
+  }
+} warpSize{};
+
+// warp vote function __all __any __ballot
+__device__ inline int __all(int predicate) { return __ockl_wfall_i32(predicate); }  // forwards to OCKL; presumably nonzero iff predicate holds on every active lane -- confirm
+
+__device__ inline int __any(int predicate) { return __ockl_wfany_i32(predicate); }  // forwards to OCKL; presumably nonzero iff predicate holds on some active lane -- confirm
+
+__device__ inline unsigned long long int __ballot(int predicate) {  // returns the mask produced by the ballot builtin
+  return __builtin_amdgcn_ballot_w64(predicate);  // the w64 variant is used unconditionally, with no check of warpSize
+}
+
+__device__ inline unsigned long long int __ballot64(int predicate) { return __ballot(predicate); }  // plain alias of __ballot
+
+// See amd_warp_sync_functions.h for an explanation of this preprocessor flag.
+#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
+// Since threads in a wave do not make independent progress, __activemask()
+// always returns the exact active mask, i.e., all active threads in the wave.
+__device__ inline unsigned long long __activemask() { return __ballot(true); }  // ballot over a constant-true predicate
+#endif  // HIP_DISABLE_WARP_SYNC_BUILTINS
+
+__device__ static inline unsigned int __lane_id() {  // lane index from the mbcnt builtins
+  const unsigned int low_count = __builtin_amdgcn_mbcnt_lo(-1, 0);  // used alone when warpSize is 32
+  return (static_cast<int>(warpSize) == 32) ? low_count : __builtin_amdgcn_mbcnt_hi(-1, low_count);  // otherwise chained through mbcnt_hi
+}
+
+__device__ inline int __shfl(MAYBE_UNDEF int var, int src_lane, int width = warpSize) {  // base overload; the others funnel into it
+  const int lane = __lane_id(), offset_bits = width - 1;  // offset_bits acts as a mask when width is a power of two
+  const int source = (lane & ~offset_bits) + (src_lane & offset_bits);  // this lane's segment base + wrapped source offset
+  return __builtin_amdgcn_ds_bpermute(source << 2, var);  // the lane index is multiplied by 4 before reaching the builtin
+}
+__device__ inline unsigned int __shfl(MAYBE_UNDEF unsigned int var, int src_lane,
+                                      int width = warpSize) {  // unsigned overload: delegates to the int overload
+  union {  // one storage slot viewed as int, unsigned or float
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.u = var;
+  tmp.i = __shfl(tmp.i, src_lane, width);  // passing the int member selects the int overload
+  return tmp.u;
+}
+__device__ inline float __shfl(MAYBE_UNDEF float var, int src_lane, int width = warpSize) {  // float overload: delegates to the int overload
+  union {  // one storage slot viewed as int, unsigned or float
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.f = var;
+  tmp.i = __shfl(tmp.i, src_lane, width);  // the float's storage travels through the int view: no numeric conversion
+  return tmp.f;
+}
+__device__ inline double __shfl(MAYBE_UNDEF double var, int src_lane, int width = warpSize) {  // double overload: shuffled as two ints
+  static_assert(sizeof(double) == 2 * sizeof(int), "");
+  static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");
+
+  int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));  // split var's bytes into two int halves
+  tmp[0] = __shfl(tmp[0], src_lane, width);  // each half goes through the int overload separately
+  tmp[1] = __shfl(tmp[1], src_lane, width);
+
+  __hip_uint64_t tmp0 =  // tmp[1] -> upper 32 bits, tmp[0] -> lower bits (matches the memcpy split only on a little-endian target)
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  double tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));  // reinterpret the recombined bits as double
+  return tmp1;
+}
+__device__ inline long __shfl(MAYBE_UNDEF long var, int src_lane, int width = warpSize) {  // long overload: path depends on _MSC_VER
+#ifndef _MSC_VER
+  static_assert(sizeof(long) == 2 * sizeof(int), "");  // this branch requires long to be twice the size of int
+  static_assert(sizeof(long) == sizeof(__hip_uint64_t), "");
+
+  int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));  // split var's bytes into two int halves
+  tmp[0] = __shfl(tmp[0], src_lane, width);  // each half goes through the int overload separately
+  tmp[1] = __shfl(tmp[1], src_lane, width);
+
+  __hip_uint64_t tmp0 =  // tmp[1] -> upper 32 bits, tmp[0] -> lower bits
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  long tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));  // reinterpret the recombined bits as long
+  return tmp1;
+#else
+  static_assert(sizeof(long) == sizeof(int), "");  // with _MSC_VER defined, long must be int-sized
+  return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));  // so a single int shuffle suffices
+#endif
+}
+__device__ inline unsigned long __shfl(MAYBE_UNDEF unsigned long var, int src_lane,
+                                       int width = warpSize) {  // unsigned long overload: path depends on _MSC_VER
+#ifndef _MSC_VER
+  static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");  // this branch requires twice the size of unsigned int
+  static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), "");
+
+  unsigned int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));  // split var's bytes into two unsigned halves
+  tmp[0] = __shfl(tmp[0], src_lane, width);  // unsigned elements select the unsigned int overload
+  tmp[1] = __shfl(tmp[1], src_lane, width);
+
+  __hip_uint64_t tmp0 =  // tmp[1] -> upper 32 bits, tmp[0] -> lower bits
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  unsigned long tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));  // reinterpret the recombined bits as unsigned long
+  return tmp1;
+#else
+  static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");  // with _MSC_VER defined, it must be unsigned-int-sized
+  return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));  // single shuffle suffices
+#endif
+}
+__device__ inline long long __shfl(MAYBE_UNDEF long long var, int src_lane, int width = warpSize) {  // shuffled as two ints
+  static_assert(sizeof(long long) == 2 * sizeof(int), "");
+  static_assert(sizeof(long long) == sizeof(__hip_uint64_t), "");
+
+  int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));  // split var's bytes into two int halves
+  tmp[0] = __shfl(tmp[0], src_lane, width);  // each half goes through the int overload separately
+  tmp[1] = __shfl(tmp[1], src_lane, width);
+
+  __hip_uint64_t tmp0 =  // tmp[1] -> upper 32 bits, tmp[0] -> lower bits
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  long long tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));  // reinterpret the recombined bits as long long
+  return tmp1;
+}
+__device__ inline unsigned long long __shfl(MAYBE_UNDEF unsigned long long var, int src_lane,
+                                            int width = warpSize) {  // shuffled as two unsigned ints
+  static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+  static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), "");
+
+  unsigned int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));  // split var's bytes into two unsigned halves
+  tmp[0] = __shfl(tmp[0], src_lane, width);  // unsigned elements select the unsigned int overload
+  tmp[1] = __shfl(tmp[1], src_lane, width);
+
+  __hip_uint64_t tmp0 =  // tmp[1] -> upper 32 bits, tmp[0] -> lower bits
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  unsigned long long tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));  // reinterpret the recombined bits as unsigned long long
+  return tmp1;
+}
+
+__device__ inline int __shfl_up(MAYBE_UNDEF int var, unsigned int lane_delta,
+                                int width = warpSize) {
+  const int lane = __lane_id();
+  const int group_start = lane & ~(width - 1);  // group start if width is a power of two
+  const int wanted = lane - lane_delta;         // computed unsigned, then converted to int
+  return __builtin_amdgcn_ds_bpermute((wanted < group_start ? lane : wanted) << 2, var);
+}
+__device__ inline unsigned int __shfl_up(MAYBE_UNDEF unsigned int var, unsigned int lane_delta,
+                                         int width = warpSize) {
+  union {
+    int i;       // view passed to and returned by the int overload
+    unsigned u;  // view exchanged with the caller
+    float f;
+  } bits;
+  bits.u = var;
+  bits.i = __shfl_up(bits.i, lane_delta, width);
+  return bits.u;
+}
+__device__ inline float __shfl_up(MAYBE_UNDEF float var, unsigned int lane_delta,
+                                  int width = warpSize) {
+  union {
+    int i;  // view passed to and returned by the int overload
+    unsigned u;
+    float f;  // view exchanged with the caller
+  } bits;
+  bits.f = var;
+  bits.i = __shfl_up(bits.i, lane_delta, width);
+  return bits.f;
+}
+__device__ inline double __shfl_up(MAYBE_UNDEF double var, unsigned int lane_delta,
+                                   int width = warpSize) {
+  static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");
+  static_assert(sizeof(double) == 2 * sizeof(int), "");
+  // Shuffle the double's 64 bits as two independent 32-bit words, then recombine them.
+  int words[2];
+  __builtin_memcpy(words, &var, sizeof(words));
+  words[0] = __shfl_up(words[0], lane_delta, width);
+  words[1] = __shfl_up(words[1], lane_delta, width);
+  // words[0] becomes bits 0..31 of the result, words[1] bits 32..63.
+  const __hip_uint64_t hi = static_cast<__hip_uint64_t>(words[1]) << 32ull;
+  const __hip_uint64_t packed = hi | static_cast<__hip_uint32_t>(words[0]);
+  double out;
+  __builtin_memcpy(&out, &packed, sizeof(packed));
+  return out;
+}
+__device__ inline long __shfl_up(MAYBE_UNDEF long var, unsigned int lane_delta,
+                                 int width = warpSize) {
+#ifdef _MSC_VER
+  static_assert(sizeof(long) == sizeof(int), "");
+  return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
+#else
+  static_assert(sizeof(long) == sizeof(__hip_uint64_t), "");
+  static_assert(sizeof(long) == 2 * sizeof(int), "");
+  // long is 64 bits wide here: shuffle it as two 32-bit words and recombine.
+  int words[2];
+  __builtin_memcpy(words, &var, sizeof(words));
+  words[0] = __shfl_up(words[0], lane_delta, width);
+  words[1] = __shfl_up(words[1], lane_delta, width);
+  // words[0] becomes bits 0..31 of the result, words[1] bits 32..63.
+  const __hip_uint64_t hi = static_cast<__hip_uint64_t>(words[1]) << 32ull;
+  const __hip_uint64_t packed = hi | static_cast<__hip_uint32_t>(words[0]);
+  long out;
+  __builtin_memcpy(&out, &packed, sizeof(packed));
+  return out;
+#endif
+}
+
+__device__ inline unsigned long __shfl_up(MAYBE_UNDEF unsigned long var, unsigned int lane_delta,
+                                          int width = warpSize) {
+#ifdef _MSC_VER
+  static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+  return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
+#else
+  static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), "");
+  static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+  // unsigned long is 64 bits wide here: shuffle it as two 32-bit words and recombine.
+  unsigned int words[2];
+  __builtin_memcpy(words, &var, sizeof(words));
+  words[0] = __shfl_up(words[0], lane_delta, width);
+  words[1] = __shfl_up(words[1], lane_delta, width);
+  // words[0] becomes bits 0..31 of the result, words[1] bits 32..63.
+  const __hip_uint64_t hi = static_cast<__hip_uint64_t>(words[1]) << 32ull;
+  const __hip_uint64_t packed = hi | static_cast<__hip_uint32_t>(words[0]);
+  unsigned long out;
+  __builtin_memcpy(&out, &packed, sizeof(packed));
+  return out;
+#endif
+}
+
+__device__ inline long long __shfl_up(MAYBE_UNDEF long long var, unsigned int lane_delta,
+                                      int width = warpSize) {
+  static_assert(sizeof(long long) == sizeof(__hip_uint64_t), "");
+  static_assert(sizeof(long long) == 2 * sizeof(int), "");
+  int words[2];  // the value viewed as two 32-bit words, each shuffled on its own
+  __builtin_memcpy(words, &var, sizeof(words));
+  words[0] = __shfl_up(words[0], lane_delta, width);
+  words[1] = __shfl_up(words[1], lane_delta, width);
+  const __hip_uint64_t hi = static_cast<__hip_uint64_t>(words[1]) << 32ull;  // bits 32..63
+  const __hip_uint64_t packed = hi | static_cast<__hip_uint32_t>(words[0]);  // bits 0..31
+  long long out;
+  __builtin_memcpy(&out, &packed, sizeof(packed));
+  return out;
+}
+
+__device__ inline unsigned long long __shfl_up(MAYBE_UNDEF unsigned long long var,
+                                               unsigned int lane_delta, int width = warpSize) {
+  static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), "");
+  static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+  unsigned int words[2];  // the value viewed as two 32-bit words, each shuffled on its own
+  __builtin_memcpy(words, &var, sizeof(words));
+  words[0] = __shfl_up(words[0], lane_delta, width);
+  words[1] = __shfl_up(words[1], lane_delta, width);
+  const __hip_uint64_t hi = static_cast<__hip_uint64_t>(words[1]) << 32ull;  // bits 32..63
+  const __hip_uint64_t packed = hi | static_cast<__hip_uint32_t>(words[0]);  // bits 0..31
+  unsigned long long out;
+  __builtin_memcpy(&out, &packed, sizeof(packed));
+  return out;
+}
+
+__device__ inline int __shfl_down(MAYBE_UNDEF int var, unsigned int lane_delta,
+                                  int width = warpSize) {
+  const int lane = __lane_id();
+  const int reach = (int)((lane & (width - 1)) + lane_delta);  // (lane & (width - 1)) + delta
+  const int wanted = lane + lane_delta;  // computed unsigned, then converted to int
+  return __builtin_amdgcn_ds_bpermute((reach >= width ? lane : wanted) << 2, var);
+}
+__device__ inline unsigned int __shfl_down(MAYBE_UNDEF unsigned int var, unsigned int lane_delta,
+                                           int width = warpSize) {
+  union {
+    int i;       // view passed to and returned by the int overload
+    unsigned u;  // view exchanged with the caller
+    float f;
+  } bits;
+  bits.u = var;
+  bits.i = __shfl_down(bits.i, lane_delta, width);
+  return bits.u;
+}
+__device__ inline float __shfl_down(MAYBE_UNDEF float var, unsigned int lane_delta,
+                                    int width = warpSize) {
+  union {
+    int i;  // view passed to and returned by the int overload
+    unsigned u;
+    float f;  // view exchanged with the caller
+  } bits;
+  bits.f = var;
+  bits.i = __shfl_down(bits.i, lane_delta, width);
+  return bits.f;
+}
+__device__ inline double __shfl_down(MAYBE_UNDEF double var, unsigned int lane_delta,
+                                     int width = warpSize) {
+  static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");
+  static_assert(sizeof(double) == 2 * sizeof(int), "");
+  // Shuffle the double's 64 bits as two independent 32-bit words, then recombine them.
+  int words[2];
+  __builtin_memcpy(words, &var, sizeof(words));
+  words[0] = __shfl_down(words[0], lane_delta, width);
+  words[1] = __shfl_down(words[1], lane_delta, width);
+  // words[0] becomes bits 0..31 of the result, words[1] bits 32..63.
+  const __hip_uint64_t hi = static_cast<__hip_uint64_t>(words[1]) << 32ull;
+  const __hip_uint64_t packed = hi | static_cast<__hip_uint32_t>(words[0]);
+  double out;
+  __builtin_memcpy(&out, &packed, sizeof(packed));
+  return out;
+}
+__device__ inline long __shfl_down(MAYBE_UNDEF long var, unsigned int lane_delta,
+                                   int width = warpSize) {
+#ifdef _MSC_VER
+  static_assert(sizeof(long) == sizeof(int), "");
+  return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
+#else
+  static_assert(sizeof(long) == sizeof(__hip_uint64_t), "");
+  static_assert(sizeof(long) == 2 * sizeof(int), "");
+  // long is 64 bits wide here: shuffle it as two 32-bit words and recombine.
+  int words[2];
+  __builtin_memcpy(words, &var, sizeof(words));
+  words[0] = __shfl_down(words[0], lane_delta, width);
+  words[1] = __shfl_down(words[1], lane_delta, width);
+  // words[0] becomes bits 0..31 of the result, words[1] bits 32..63.
+  const __hip_uint64_t hi = static_cast<__hip_uint64_t>(words[1]) << 32ull;
+  const __hip_uint64_t packed = hi | static_cast<__hip_uint32_t>(words[0]);
+  long out;
+  __builtin_memcpy(&out, &packed, sizeof(packed));
+  return out;
+#endif
+}
+__device__ inline unsigned long __shfl_down(MAYBE_UNDEF unsigned long var, unsigned int lane_delta,
+                                            int width = warpSize) {
+#ifdef _MSC_VER
+  static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+  return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
+#else
+  static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), "");
+  static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+  // unsigned long is 64 bits wide here: shuffle it as two 32-bit words and recombine.
+  unsigned int words[2];
+  __builtin_memcpy(words, &var, sizeof(words));
+  words[0] = __shfl_down(words[0], lane_delta, width);
+  words[1] = __shfl_down(words[1], lane_delta, width);
+  // words[0] becomes bits 0..31 of the result, words[1] bits 32..63.
+  const __hip_uint64_t hi = static_cast<__hip_uint64_t>(words[1]) << 32ull;
+  const __hip_uint64_t packed = hi | static_cast<__hip_uint32_t>(words[0]);
+  unsigned long out;
+  __builtin_memcpy(&out, &packed, sizeof(packed));
+  return out;
+#endif
+}
+__device__ inline long long __shfl_down(MAYBE_UNDEF long long var, unsigned int lane_delta,
+                                        int width = warpSize) {
+  static_assert(sizeof(long long) == sizeof(__hip_uint64_t), "");
+  static_assert(sizeof(long long) == 2 * sizeof(int), "");
+  int words[2];  // the value viewed as two 32-bit words, each shuffled on its own
+  __builtin_memcpy(words, &var, sizeof(words));
+  words[0] = __shfl_down(words[0], lane_delta, width);
+  words[1] = __shfl_down(words[1], lane_delta, width);
+  const __hip_uint64_t hi = static_cast<__hip_uint64_t>(words[1]) << 32ull;  // bits 32..63
+  const __hip_uint64_t packed = hi | static_cast<__hip_uint32_t>(words[0]);  // bits 0..31
+  long long out;
+  __builtin_memcpy(&out, &packed, sizeof(packed));
+  return out;
+}
+__device__ inline unsigned long long __shfl_down(MAYBE_UNDEF unsigned long long var,
+                                                 unsigned int lane_delta, int width = warpSize) {
+  static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), "");
+  static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+  unsigned int words[2];  // the value viewed as two 32-bit words, each shuffled on its own
+  __builtin_memcpy(words, &var, sizeof(words));
+  words[0] = __shfl_down(words[0], lane_delta, width);
+  words[1] = __shfl_down(words[1], lane_delta, width);
+  const __hip_uint64_t hi = static_cast<__hip_uint64_t>(words[1]) << 32ull;  // bits 32..63
+  const __hip_uint64_t packed = hi | static_cast<__hip_uint32_t>(words[0]);  // bits 0..31
+  unsigned long long out;
+  __builtin_memcpy(&out, &packed, sizeof(packed));
+  return out;
+}
+
+__device__ inline int __shfl_xor(MAYBE_UNDEF int var, int lane_mask, int width = warpSize) {
+  const int lane = __lane_id();
+  const int limit = (lane + width) & ~(width - 1);  // partner lanes must stay below this bound
+  const int partner = lane ^ lane_mask;
+  return __builtin_amdgcn_ds_bpermute((partner >= limit ? lane : partner) << 2, var);
+}
+__device__ inline unsigned int __shfl_xor(MAYBE_UNDEF unsigned int var, int lane_mask,
+                                          int width = warpSize) {
+  union {
+    int i;       // view passed to and returned by the int overload
+    unsigned u;  // view exchanged with the caller
+    float f;
+  } bits;
+  bits.u = var;
+  bits.i = __shfl_xor(bits.i, lane_mask, width);
+  return bits.u;
+}
+__device__ inline float __shfl_xor(MAYBE_UNDEF float var, int lane_mask, int width = warpSize) {
+  union {
+    int i;  // view passed to and returned by the int overload
+    unsigned u;
+    float f;  // view exchanged with the caller
+  } bits;
+  bits.f = var;
+  bits.i = __shfl_xor(bits.i, lane_mask, width);
+  return bits.f;
+}
+__device__ inline double __shfl_xor(MAYBE_UNDEF double var, int lane_mask, int width = warpSize) {
+  static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");
+  static_assert(sizeof(double) == 2 * sizeof(int), "");
+  // Shuffle the double's 64 bits as two independent 32-bit words, then recombine them.
+  int words[2];
+  __builtin_memcpy(words, &var, sizeof(words));
+  words[0] = __shfl_xor(words[0], lane_mask, width);
+  words[1] = __shfl_xor(words[1], lane_mask, width);
+  // words[0] becomes bits 0..31 of the result, words[1] bits 32..63.
+  const __hip_uint64_t hi = static_cast<__hip_uint64_t>(words[1]) << 32ull;
+  const __hip_uint64_t packed = hi | static_cast<__hip_uint32_t>(words[0]);
+  double out;
+  __builtin_memcpy(&out, &packed, sizeof(packed));
+  return out;
+}
+__device__ inline long __shfl_xor(MAYBE_UNDEF long var, int lane_mask, int width = warpSize) {
+#ifdef _MSC_VER
+  static_assert(sizeof(long) == sizeof(int), "");
+  return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
+#else
+  static_assert(sizeof(long) == sizeof(__hip_uint64_t), "");
+  static_assert(sizeof(long) == 2 * sizeof(int), "");
+  // long is 64 bits wide here: shuffle it as two 32-bit words and recombine.
+  int words[2];
+  __builtin_memcpy(words, &var, sizeof(words));
+  words[0] = __shfl_xor(words[0], lane_mask, width);
+  words[1] = __shfl_xor(words[1], lane_mask, width);
+  // words[0] becomes bits 0..31 of the result, words[1] bits 32..63.
+  const __hip_uint64_t hi = static_cast<__hip_uint64_t>(words[1]) << 32ull;
+  const __hip_uint64_t packed = hi | static_cast<__hip_uint32_t>(words[0]);
+  long out;
+  __builtin_memcpy(&out, &packed, sizeof(packed));
+  return out;
+#endif
+}
+__device__ inline unsigned long __shfl_xor(MAYBE_UNDEF unsigned long var, int lane_mask,
+                                           int width = warpSize) {
+#ifdef _MSC_VER
+  static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+  return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
+#else
+  static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), "");
+  static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+  // unsigned long is 64 bits wide here: shuffle it as two 32-bit words and recombine.
+  unsigned int words[2];
+  __builtin_memcpy(words, &var, sizeof(words));
+  words[0] = __shfl_xor(words[0], lane_mask, width);
+  words[1] = __shfl_xor(words[1], lane_mask, width);
+  // words[0] becomes bits 0..31 of the result, words[1] bits 32..63.
+  const __hip_uint64_t hi = static_cast<__hip_uint64_t>(words[1]) << 32ull;
+  const __hip_uint64_t packed = hi | static_cast<__hip_uint32_t>(words[0]);
+  unsigned long out;
+  __builtin_memcpy(&out, &packed, sizeof(packed));
+  return out;
+#endif
+}
+__device__ inline long long __shfl_xor(MAYBE_UNDEF long long var, int lane_mask,
+                                       int width = warpSize) {
+  static_assert(sizeof(long long) == sizeof(__hip_uint64_t), "");
+  static_assert(sizeof(long long) == 2 * sizeof(int), "");
+  int words[2];  // the value viewed as two 32-bit words, each shuffled on its own
+  __builtin_memcpy(words, &var, sizeof(words));
+  words[0] = __shfl_xor(words[0], lane_mask, width);
+  words[1] = __shfl_xor(words[1], lane_mask, width);
+  const __hip_uint64_t hi = static_cast<__hip_uint64_t>(words[1]) << 32ull;  // bits 32..63
+  const __hip_uint64_t packed = hi | static_cast<__hip_uint32_t>(words[0]);  // bits 0..31
+  long long out;
+  __builtin_memcpy(&out, &packed, sizeof(packed));
+  return out;
+}
+__device__ inline unsigned long long __shfl_xor(MAYBE_UNDEF unsigned long long var, int lane_mask,
+                                                int width = warpSize) {
+  static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), "");
+  static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+  unsigned int words[2];  // the value viewed as two 32-bit words, each shuffled on its own
+  __builtin_memcpy(words, &var, sizeof(words));
+  words[0] = __shfl_xor(words[0], lane_mask, width);
+  words[1] = __shfl_xor(words[1], lane_mask, width);
+  const __hip_uint64_t hi = static_cast<__hip_uint64_t>(words[1]) << 32ull;  // bits 32..63
+  const __hip_uint64_t packed = hi | static_cast<__hip_uint32_t>(words[0]);  // bits 0..31
+  unsigned long long out;
+  __builtin_memcpy(&out, &packed, sizeof(packed));
+  return out;
+}
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..aaad1827e29efaa40ad2a02c6cd8cd66d0f33448
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h
@@ -0,0 +1,682 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+// Warp sync builtins (with explicit mask argument) introduced in ROCm 6.2 as a
+// preview to allow end-users to adapt to the new interface involving 64-bit
+// masks. These are enabled by default, and can be disabled by setting the macro
+// "HIP_DISABLE_WARP_SYNC_BUILTINS". This arrangement also applies to the
+// __activemask() builtin defined in amd_warp_functions.h.
+#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
+
+#if !defined(__HIPCC_RTC__)
+#include "amd_warp_functions.h"
+#include "amd_device_functions.h"
+#include "hip_assert.h"
+#include <functional>
+#include <algorithm>
+#endif
+
+extern "C" __device__ __attribute__((const)) int __ockl_wfred_add_i32(int);
+extern "C" __device__ __attribute__((const)) unsigned int __ockl_wfred_add_u32(unsigned int);
+extern "C" __device__ __attribute__((const)) int __ockl_wfred_min_i32(int);
+extern "C" __device__ __attribute__((const)) unsigned int __ockl_wfred_min_u32(unsigned int);
+extern "C" __device__ __attribute__((const)) int __ockl_wfred_max_i32(int);
+extern "C" __device__ __attribute__((const)) unsigned int __ockl_wfred_max_u32(unsigned int);
+extern "C" __device__ __attribute__((const)) unsigned int __ockl_wfred_and_u32(unsigned int);
+extern "C" __device__ __attribute__((const)) unsigned int __ockl_wfred_or_u32(unsigned int);
+extern "C" __device__ __attribute__((const)) unsigned int __ockl_wfred_xor_u32(unsigned int);
+
+#ifdef HIP_ENABLE_EXTRA_WARP_SYNC_TYPES
+// this macro enables types that are not in CUDA
+extern "C" __device__ __attribute__((const)) long long __ockl_wfred_add_i64(long long);
+extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_add_u64(
+    unsigned long long);
+extern "C" __device__ __attribute__((const)) float __ockl_wfred_add_f32(float);
+extern "C" __device__ __attribute__((const)) double __ockl_wfred_add_f64(double);
+
+extern "C" __device__ __attribute__((const)) long long __ockl_wfred_min_i64(long long);
+extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_min_u64(
+    unsigned long long);
+extern "C" __device__ __attribute__((const)) float __ockl_wfred_min_f32(float);
+extern "C" __device__ __attribute__((const)) double __ockl_wfred_min_f64(double);
+
+extern "C" __device__ __attribute__((const)) long long __ockl_wfred_max_i64(long long);
+extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_max_u64(
+    unsigned long long);
+extern "C" __device__ __attribute__((const)) float __ockl_wfred_max_f32(float);
+extern "C" __device__ __attribute__((const)) double __ockl_wfred_max_f64(double);
+
+extern "C" __device__ __attribute__((const)) int __ockl_wfred_and_i32(int);
+extern "C" __device__ __attribute__((const)) long long __ockl_wfred_and_i64(long long);
+extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_and_u64(
+    unsigned long long);
+
+extern "C" __device__ __attribute__((const)) int __ockl_wfred_or_i32(int);
+extern "C" __device__ __attribute__((const)) long long __ockl_wfred_or_i64(long long);
+extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_or_u64(
+    unsigned long long);
+
+extern "C" __device__ __attribute__((const)) int __ockl_wfred_xor_i32(int);
+extern "C" __device__ __attribute__((const)) long long __ockl_wfred_xor_i64(long long);
+extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_xor_u64(
+    unsigned long long);
+
+#endif
+
+template <typename T> __device__ inline T __hip_readfirstlane(T val) {
+  // Type punning through a union: reading a member other than the one written
+  // last is formally undefined behaviour, but the compiler is relied upon to do
+  // the reasonable thing, and in practice it does.
+  union {
+    unsigned long long l;
+    T d;
+  } pun;
+  pun.d = val;
+  // NOTE: The builtin returns int; each half is cast to unsigned int before it
+  // is widened to 64 bits, so the widening cannot sign-extend.
+  const unsigned long long lo = (unsigned)__builtin_amdgcn_readfirstlane(pun.l);
+  const unsigned long long hi = (unsigned)__builtin_amdgcn_readfirstlane(pun.l >> 32);
+  pun.l = (hi << 32) | lo;
+  return pun.d;
+}
+
+// When warpSize is 32 (wave32 mode), clear the upper 32 bits of MASK in place; else keep MASK.
+#define __hip_adjust_mask_for_wave32(MASK)                                                         \
+  do {                                                                                             \
+    if (static_cast<int>(warpSize) == 32) MASK &= 0xFFFFFFFF;                                      \
+  } while (0)
+
+// We use a macro to expand each builtin into a waterfall that implements the
+// mask semantics:
+//
+// 1. The mask argument may be divergent.
+// 2. Each active thread must have its own bit set in its own mask value.
+// 3. For a given mask value, all threads that are mentioned in the mask must
+//    execute the same static instance of the builtin with the same mask.
+// 4. The union of all mask values supplied at a static instance must be equal
+//    to the activemask at the program point.
+//
+// Thus, the mask argument partitions the set of currently active threads in the
+// wave into disjoint subsets that cover all active threads.
+//
+// Implementation notes:
+// ---------------------
+//
+// We implement this as a waterfall loop that executes the builtin for each
+// subset separately. The return value is a divergent value across the active
+// threads. The value for inactive threads is defined by each builtin
+// separately.
+//
+// As long as every mask value is non-zero, we don't need to check if a lane
+// specifies itself in the mask; that is done by the later assertion where all
+// chosen lanes must be in the chosen mask.
+
+#define __hip_check_mask(MASK)                                                                     \
+  do {                                                                                             \
+    __hip_assert(MASK && "mask must be non-zero");                                                 \
+    bool done = false;  /* true once this lane's mask group has been checked */                    \
+    while (__any(!done)) {  /* loop while any lane is still unchecked */                           \
+      if (!done) {                                                                                 \
+        auto chosen_mask = __hip_readfirstlane(MASK);  /* one mask value per round */              \
+        if (MASK == chosen_mask) {                                                                 \
+          __hip_assert(MASK == __ballot(true) &&                                                   \
+                       "all threads specified in the mask"                                         \
+                       " must execute the same operation with the same mask");                     \
+          done = true;                                                                             \
+        }                                                                                          \
+      }                                                                                            \
+    }                                                                                              \
+  } while (0)
+
+#define __hip_do_sync(RETVAL, FUNC, MASK, ...)                                                     \
+  do {                                                                                             \
+    __hip_assert(MASK && "mask must be non-zero");                                                 \
+    bool done = false;  /* true once this lane has run FUNC */                                     \
+    while (__any(!done)) {  /* loop while any lane is still waiting */                             \
+      if (!done) {                                                                                 \
+        auto chosen_mask = __hip_readfirstlane(MASK);  /* one mask value per round */              \
+        if (MASK == chosen_mask) {                                                                 \
+          __hip_assert(MASK == __ballot(true) &&                                                   \
+                       "all threads specified in the mask"                                         \
+                       " must execute the same operation with the same mask");                     \
+          RETVAL = FUNC(__VA_ARGS__);  /* MASK itself is not passed to FUNC */                     \
+          done = true;                                                                             \
+        }                                                                                          \
+      }                                                                                            \
+    }                                                                                              \
+  } while (0)
+
+__device__ inline void __syncwarp() {
+  __builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront");  // release fence, "wavefront" scope
+  __builtin_amdgcn_wave_barrier();  // barrier sits between the release and acquire fences
+  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront");  // acquire fence, "wavefront" scope
+}
+
+template <typename MaskT> __device__ inline void __syncwarp(MaskT mask) {
+  static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
+                "The mask must be a 64-bit integer. "
+                "Implicitly promoting a smaller integer is almost always an error.");
+  __hip_adjust_mask_for_wave32(mask); __hip_check_mask(mask);  // wave32: drop upper half first
+  return __syncwarp();  // the mask only feeds the checks; the barrier itself takes no mask
+}
+
+// __all_sync, __any_sync, __ballot_sync
+
+template <typename MaskT>
+__device__ inline unsigned long long __ballot_sync(MaskT mask, int predicate) {
+  static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
+                "The mask must be a 64-bit integer. "
+                "Implicitly promoting a smaller integer is almost always an error.");
+  __hip_adjust_mask_for_wave32(mask);  // drop the upper 32 mask bits when warpSize is 32
+  __hip_check_mask(mask);              // assertion-only validation of the mask
+  return __ballot(predicate) & mask;   // keep only the ballot bits that are set in mask
+}
+
+template <typename MaskT> __device__ inline int __all_sync(MaskT mask, int predicate) {
+  static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
+                "The mask must be a 64-bit integer. "
+                "Implicitly promoting a smaller integer is almost always an error.");
+  __hip_adjust_mask_for_wave32(mask);  // adjusted here too: the comparison below uses this copy
+  return __ballot_sync(mask, predicate) == mask;  // nonzero when every mask bit is in the ballot
+}
+
+template <typename MaskT> __device__ inline int __any_sync(MaskT mask, int predicate) {
+  static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
+                "The mask must be a 64-bit integer. "
+                "Implicitly promoting a smaller integer is almost always an error.");
+  __hip_adjust_mask_for_wave32(mask);  // __ballot_sync repeats this on its own copy of mask
+  return __ballot_sync(mask, predicate) != 0;  // nonzero when any masked ballot bit is set
+}
+
+// __match_any, __match_all and sync variants
+
+template <typename T> __device__ inline unsigned long long __match_any(T value) {
+  static_assert(
+      (__hip_internal::is_integral<T>::value || __hip_internal::is_floating_point<T>::value) &&
+          (sizeof(T) == 4 || sizeof(T) == 8),
+      "T can be int, unsigned int, long, unsigned long, long long, unsigned "
+      "long long, float or double.");
+  bool done = false;  // set once this lane has found its matching group
+  unsigned long long retval = 0;
+  // Each round compares the remaining lanes against the value from __hip_readfirstlane.
+  while (__any(!done)) {
+    if (!done) {
+      T chosen = __hip_readfirstlane(value);
+      if (chosen == value) {
+        retval = __activemask();  // presumably the lanes that took this branch -- TODO confirm
+        done = true;
+      }
+    }
+  }
+  // retval was assigned in the round in which this lane's value matched.
+  return retval;
+}
+
+template <typename MaskT, typename T>
+__device__ inline unsigned long long __match_any_sync(MaskT mask, T value) {
+  static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
+                "The mask must be a 64-bit integer. "
+                "Implicitly promoting a smaller integer is almost always an error.");
+  __hip_adjust_mask_for_wave32(mask);  // drop the upper 32 mask bits when warpSize is 32
+  __hip_check_mask(mask);              // assertion-only validation of the mask
+  return __match_any(value) & mask;    // restrict the match result to the bits set in mask
+}
+
+template <typename T> __device__ inline unsigned long long __match_all(T value, int* pred) {
+  static_assert(
+      (__hip_internal::is_integral<T>::value || __hip_internal::is_floating_point<T>::value) &&
+          (sizeof(T) == 4 || sizeof(T) == 8),
+      "T can be int, unsigned int, long, unsigned long, long long, unsigned "
+      "long long, float or double.");
+  T first = __hip_readfirstlane(value);  // reference value every lane is compared against
+  if (__all(first == value)) {  // every lane compared equal to the reference value
+    *pred = true;
+    return __activemask();
+  } else {
+    *pred = false;  // *pred is written on both paths
+    return 0;
+  }
+}
+
+template <typename MaskT, typename T>
+__device__ inline unsigned long long __match_all_sync(MaskT mask, T value, int* pred) {
+  static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
+                "The mask must be a 64-bit integer. "
+                "Implicitly promoting a smaller integer is almost always an error.");
+  MaskT retval = 0;
+  __hip_adjust_mask_for_wave32(mask);  // drop the upper 32 mask bits when warpSize is 32
+  __hip_do_sync(retval, __match_all, mask, value, pred);  // retval = __match_all(value, pred)
+  return retval;
+}
+
+// various variants of shfl
+
+template <typename MaskT, typename T>
+__device__ inline T __shfl_sync(MaskT mask, T var, int srcLane, int width = warpSize) {
+  static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
+                "The mask must be a 64-bit integer. "
+                "Implicitly promoting a smaller integer is almost always an error.");
+  __hip_adjust_mask_for_wave32(mask);  // drop the upper 32 mask bits when warpSize is 32
+  __hip_check_mask(mask);              // assertion-only validation of the mask
+  return __shfl(var, srcLane, width);  // plain shuffle; the mask is not passed on
+}
+
+template <typename MaskT, typename T>
+__device__ inline T __shfl_up_sync(MaskT mask, T var, unsigned int delta, int width = warpSize) {
+  static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
+                "The mask must be a 64-bit integer. "
+                "Implicitly promoting a smaller integer is almost always an error.");
+  __hip_adjust_mask_for_wave32(mask);   // drop the upper 32 mask bits when warpSize is 32
+  __hip_check_mask(mask);               // assertion-only validation of the mask
+  return __shfl_up(var, delta, width);  // plain shuffle; the mask is not passed on
+}
+
+template <typename MaskT, typename T>
+__device__ inline T __shfl_down_sync(MaskT mask, T var, unsigned int delta, int width = warpSize) {
+  static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
+                "The mask must be a 64-bit integer. "
+                "Implicitly promoting a smaller integer is almost always an error.");
+  __hip_adjust_mask_for_wave32(mask);     // drop the upper 32 mask bits when warpSize is 32
+  __hip_check_mask(mask);                 // assertion-only validation of the mask
+  return __shfl_down(var, delta, width);  // plain shuffle; the mask is not passed on
+}
+
+template <typename MaskT, typename T>
+__device__ inline T __shfl_xor_sync(MaskT mask, T var, int laneMask, int width = warpSize) {
+  static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
+                "The mask must be a 64-bit integer. "
+                "Implicitly promoting a smaller integer is almost always an error.");
+  __hip_adjust_mask_for_wave32(mask);       // drop the upper 32 mask bits when warpSize is 32
+  __hip_check_mask(mask);                   // assertion-only validation of the mask
+  return __shfl_xor(var, laneMask, width);  // plain shuffle; the mask is not passed on
+}
+
+/**
+ * Reduces `val` across the lanes selected by `mask` with the binary operator `op`.
+ *
+ * When the mask has no holes and starts at lane 0 (and the code is built with optimization), the
+ * result comes straight from `wfReduce(val)`. Otherwise the selected lanes are combined in a
+ * reduction tree built from backward permutes, and the value accumulated by the lowest lane set
+ * in the mask is fetched with a final permute and returned.
+ *
+ * @param mask     64-bit integer; each set bit selects a participating lane.
+ * @param val      this lane's contribution (2-, 4- and 8-byte types are handled explicitly).
+ * @param op       callable combining two values of type T; it is invoked with lvalue arguments.
+ * @param wfReduce callable used for the hole-free fast path; receives this lane's `val`.
+ * @return the reduced value read from the first lane set in `mask`.
+ */
+template <typename MaskT, typename T, typename BinaryOp, typename WfReduce>
+__device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wfReduce) {
+  using permuteType =
+      typename __hip_internal::conditional<sizeof(T) == 4 || sizeof(T) == 2, T, unsigned int>::type;
+  static constexpr auto kMaskNumBits = sizeof(MaskT) * 8;
+  static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
+                "The mask must be a 64-bit integer. "
+                "Implicitly promoting a smaller integer is almost always an error.");
+  __hip_adjust_mask_for_wave32(mask);
+  unsigned int laneId;
+  unsigned int maskIdx;
+  // next bit to aggregate with
+  int nextBit;
+
+  // if doing the binary reduction tree, this will increase by two in every iteration
+  int modulo = 1;
+  int leadingZeroes = __clzll(mask);
+  int firstLane;
+  int lastLane = kMaskNumBits - leadingZeroes - 1;
+  int maskNumBits;
+  int numIterations;
+  // unsigned int[2] is used when T is 64-bit wide
+  typename __hip_internal::conditional<sizeof(T) == 4 || sizeof(T) == 2, permuteType,
+                                       permuteType[2]>::type result,
+      permuteResult;
+  // integral types and the 32-bit halves of a double go through the integer permute;
+  // everything else uses the float variant
+  auto backwardPermute = [](int index, permuteType val) {
+    if constexpr (__hip_internal::is_integral<T>::value ||
+                  __hip_internal::is_same<T, double>::value)
+      return __hip_ds_bpermute(index, val);
+    else
+      return __hip_ds_bpermutef(index, val);
+  };
+
+  __hip_check_mask(mask);
+  maskNumBits = __popcll(mask);
+
+#ifdef __OPTIMIZE__  // at the time of this writing the ockl wfred functions do not compile when
+                     // using -O0
+  if (maskNumBits == lastLane + 1)
+    // this means the mask "does not have holes", and starts from 0; we can use a specific intrinsic
+    // to calculate the aggregated result
+    return wfReduce(val);
+#endif
+
+  firstLane = __builtin_ctzll(mask);
+  laneId = __ockl_lane_u32();
+  nextBit = laneId;
+  // the number of iterations needs to be at least log2(number of bits on)
+  numIterations = sizeof(int) * 8 - __clz(maskNumBits);
+
+  if (!(maskNumBits & (maskNumBits - 1)))
+    // the number of bits in the mask is a power of 2
+    numIterations -= 1;
+
+  // position of this lane among the set bits of the mask; the literal must be 64 bits wide
+  // because `unsigned long` is only 32 bits on some targets and laneId can reach 63
+  maskIdx = __popcll(((1ull << laneId) - 1) & mask);
+  // drop this lane and everything below it; done in two steps so the shift never equals 64
+  mask >>= laneId;
+  mask >>= 1ul;
+
+  if constexpr (sizeof(T) == 4 || sizeof(T) == 2)
+    result = val;
+  else
+    __builtin_memcpy(&result, &val, sizeof(T));
+
+  // add the values from the lanes using a reduction tree (first the threads with even-numbered
+  // lanes, then multiples of 4, then 8, ...
+  while (numIterations) {
+    int offset = modulo >> 1;
+    int increment = modulo - offset;
+    int nextPos = maskIdx + offset + increment;
+    bool insideLanes = nextPos < maskNumBits;
+
+    if (insideLanes) {
+      int next;
+
+      // find the position to aggregate with; although we could just call fns64() that will probably
+      // be very slow when called multiple times in this for loop; this is equivalent
+      for (int i = 0; i < increment; i++) {
+        next = __builtin_ctzll(mask) + 1;
+        mask >>= next;
+        nextBit += next;
+      }
+    }
+
+    // the permute is issued by every lane, including those without a partner this round
+    if constexpr (sizeof(T) == 2) {
+      union {
+        int i;
+        T f;
+      } tmp;
+
+      tmp.f = result;
+      tmp.i = __hip_ds_bpermute(nextBit << 2, tmp.i);
+      permuteResult = tmp.f;
+    } else if constexpr (sizeof(T) == 4)
+      permuteResult = backwardPermute(nextBit << 2, result);
+    else {
+      // ds_bpermute only deals with 32-bit sizes, so for 64-bit types
+      // we need to call the permute twice for each half
+      permuteResult[0] = backwardPermute(nextBit << 2, result[0]);
+      permuteResult[1] = backwardPermute(nextBit << 2, result[1]);
+    }
+
+    if (insideLanes) {
+      if constexpr (sizeof(T) == 4 || sizeof(T) == 2)
+        result = op(result, permuteResult);
+      else {
+        T lhsValue;
+        T rhsValue;
+        unsigned long long rhsBits =
+            (static_cast<unsigned long long>(permuteResult[1]) << 32) | permuteResult[0];
+
+        // copy the bit patterns instead of casting pointers, which would break strict aliasing
+        __builtin_memcpy(&lhsValue, &result, sizeof(T));
+        __builtin_memcpy(&rhsValue, &rhsBits, sizeof(T));
+        lhsValue = op(lhsValue, rhsValue);
+        __builtin_memcpy(&result, &lhsValue, sizeof(T));
+      }
+    }
+
+    modulo <<= 1;
+    numIterations--;
+  }
+
+  // every lane reads the value accumulated by the first lane of the mask
+  if constexpr (sizeof(T) == 2) {
+    union {
+      int i;
+      T f;
+    } tmp;
+    tmp.f = result;
+    tmp.i = __hip_ds_bpermute(firstLane << 2, tmp.i);
+    return tmp.f;
+  } else if constexpr (sizeof(T) == 4)
+    return backwardPermute(firstLane << 2, result);
+  else {
+    unsigned long long bits =
+        (static_cast<unsigned long long>(backwardPermute(firstLane << 2, result[1])) << 32) |
+        static_cast<unsigned int>(backwardPermute(firstLane << 2, result[0]));
+    T reduced;
+    __builtin_memcpy(&reduced, &bits, sizeof(T));
+    return reduced;
+  }
+}
+
+// Sums `val` over the lanes selected by `mask` (signed 32-bit overload).
+template <typename MaskT> __device__ inline int __reduce_add_sync(MaskT mask, int val) {
+  // Hand-written lambdas stand in for std::plus and friends: those live in <functional>,
+  // which was causing problems with hipRTC when this was written.
+  auto sum = [](int& lhs, int& rhs) { return lhs + rhs; };
+  auto waveSum = [](int lane) { return __ockl_wfred_add_i32(lane); };
+
+  return __reduce_op_sync(mask, val, sum, waveSum);
+}
+
+// Sums `val` over the lanes selected by `mask` (unsigned 32-bit overload).
+template <typename MaskT>
+__device__ inline unsigned int __reduce_add_sync(MaskT mask, unsigned int val) {
+  auto sum = [](unsigned int& lhs, unsigned int& rhs) { return lhs + rhs; };
+  auto waveSum = [](unsigned int lane) { return __ockl_wfred_add_u32(lane); };
+
+  return __reduce_op_sync(mask, val, sum, waveSum);
+}
+
+// Returns the smallest `val` among the lanes selected by `mask` (signed 32-bit overload).
+template <typename MaskT> __device__ inline int __reduce_min_sync(MaskT mask, int val) {
+  auto smaller = [](int lhs, int rhs) {
+    if (rhs < lhs) return rhs;
+    return lhs;
+  };
+  auto waveMin = [](int lane) { return __ockl_wfred_min_i32(lane); };
+
+  return __reduce_op_sync(mask, val, smaller, waveMin);
+}
+
+// Returns the smallest `val` among the lanes selected by `mask` (unsigned 32-bit overload).
+template <typename MaskT>
+__device__ inline unsigned int __reduce_min_sync(MaskT mask, unsigned int val) {
+  auto smaller = [](unsigned int lhs, unsigned int rhs) {
+    if (rhs < lhs) return rhs;
+    return lhs;
+  };
+  auto waveMin = [](unsigned int lane) { return __ockl_wfred_min_u32(lane); };
+
+  return __reduce_op_sync(mask, val, smaller, waveMin);
+}
+
+// Returns the largest `val` among the lanes selected by `mask` (signed 32-bit overload).
+template <typename MaskT> __device__ inline int __reduce_max_sync(MaskT mask, int val) {
+  auto larger = [](int lhs, int rhs) {
+    if (lhs < rhs) return rhs;
+    return lhs;
+  };
+  auto waveMax = [](int lane) { return __ockl_wfred_max_i32(lane); };
+
+  return __reduce_op_sync(mask, val, larger, waveMax);
+}
+
+// Returns the largest `val` among the lanes selected by `mask` (unsigned 32-bit overload).
+template <typename MaskT>
+__device__ inline unsigned int __reduce_max_sync(MaskT mask, unsigned int val) {
+  auto larger = [](unsigned int lhs, unsigned int rhs) {
+    if (lhs < rhs) return rhs;
+    return lhs;
+  };
+  auto waveMax = [](unsigned int lane) { return __ockl_wfred_max_u32(lane); };
+
+  return __reduce_op_sync(mask, val, larger, waveMax);
+}
+
+// ORs `val` together over the lanes selected by `mask` (unsigned 32-bit overload).
+//
+// The combining operator is the bitwise `|`, so the reduction-tree path of __reduce_op_sync
+// yields the same kind of result as the __ockl_wfred_or_u32 fast path. A logical `||` would
+// collapse the result to 0/1 whenever the mask has holes or the code is built without
+// optimization.
+template <typename MaskT>
+__device__ inline unsigned int __reduce_or_sync(MaskT mask, unsigned int val) {
+  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs | rhs; };
+  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_or_u32(v); };
+
+  return __reduce_op_sync(mask, val, op, wfReduce);
+}
+
+// ANDs `val` together over the lanes selected by `mask` (unsigned 32-bit overload).
+//
+// The combining operator is the bitwise `&`, so the reduction-tree path of __reduce_op_sync
+// yields the same kind of result as the __ockl_wfred_and_u32 fast path. A logical `&&` would
+// collapse the result to 0/1 whenever the mask has holes or the code is built without
+// optimization.
+template <typename MaskT>
+__device__ inline unsigned int __reduce_and_sync(MaskT mask, unsigned int val) {
+  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs & rhs; };
+  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_and_u32(v); };
+
+  return __reduce_op_sync(mask, val, op, wfReduce);
+}
+
+// XORs `val` together over the lanes selected by `mask` (unsigned 32-bit overload).
+//
+// The combining operator is the bitwise `^`, so the reduction-tree path of __reduce_op_sync
+// yields the same kind of result as the __ockl_wfred_xor_u32 fast path. A logical exclusive-or
+// would collapse the result to 0/1 whenever the mask has holes or the code is built without
+// optimization.
+template <typename MaskT>
+__device__ inline unsigned int __reduce_xor_sync(MaskT mask, unsigned int val) {
+  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs ^ rhs; };
+  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_xor_u32(v); };
+
+  return __reduce_op_sync(mask, val, op, wfReduce);
+}
+
+#ifdef HIP_ENABLE_EXTRA_WARP_SYNC_TYPES
+// Sums `val` over the lanes selected by `mask` (signed 64-bit overload).
+template <typename MaskT> __device__ inline long long __reduce_add_sync(MaskT mask, long long val) {
+  auto sum = [](long long& lhs, long long& rhs) { return lhs + rhs; };
+  auto waveSum = [](long long lane) { return __ockl_wfred_add_i64(lane); };
+
+  return __reduce_op_sync(mask, val, sum, waveSum);
+}
+
+// Sums `val` over the lanes selected by `mask` (unsigned 64-bit overload).
+template <typename MaskT>
+__device__ inline unsigned long long __reduce_add_sync(MaskT mask, unsigned long long val) {
+  auto sum = [](unsigned long long& lhs, unsigned long long& rhs) { return lhs + rhs; };
+  auto waveSum = [](unsigned long long lane) { return __ockl_wfred_add_u64(lane); };
+
+  return __reduce_op_sync(mask, val, sum, waveSum);
+}
+
+// Sums `val` over the lanes selected by `mask` (single-precision overload).
+template <typename MaskT> __device__ inline float __reduce_add_sync(MaskT mask, float val) {
+  auto sum = [](float& lhs, float& rhs) { return lhs + rhs; };
+  auto waveSum = [](float lane) { return __ockl_wfred_add_f32(lane); };
+
+  return __reduce_op_sync(mask, val, sum, waveSum);
+}
+
+// Sums `val` over the lanes selected by `mask` (double-precision overload).
+template <typename MaskT> __device__ inline double __reduce_add_sync(MaskT mask, double val) {
+  auto sum = [](double& lhs, double& rhs) { return lhs + rhs; };
+  auto waveSum = [](double lane) { return __ockl_wfred_add_f64(lane); };
+
+  return __reduce_op_sync(mask, val, sum, waveSum);
+}
+
+// Returns the smallest `val` among the lanes selected by `mask` (signed 64-bit overload).
+template <typename MaskT> __device__ inline long long __reduce_min_sync(MaskT mask, long long val) {
+  auto smaller = [](long long lhs, long long rhs) {
+    if (rhs < lhs) return rhs;
+    return lhs;
+  };
+  auto waveMin = [](long long lane) { return __ockl_wfred_min_i64(lane); };
+
+  return __reduce_op_sync(mask, val, smaller, waveMin);
+}
+
+// Returns the smallest `val` among the lanes selected by `mask` (unsigned 64-bit overload).
+template <typename MaskT>
+__device__ inline unsigned long long __reduce_min_sync(MaskT mask, unsigned long long val) {
+  auto smaller = [](unsigned long long lhs, unsigned long long rhs) {
+    if (rhs < lhs) return rhs;
+    return lhs;
+  };
+  auto waveMin = [](unsigned long long lane) { return __ockl_wfred_min_u64(lane); };
+
+  return __reduce_op_sync(mask, val, smaller, waveMin);
+}
+
+// Returns the smallest `val` among the lanes selected by `mask` (single-precision overload).
+// The left operand is kept whenever `rhs < lhs` is false.
+template <typename MaskT> __device__ inline float __reduce_min_sync(MaskT mask, float val) {
+  auto smaller = [](float lhs, float rhs) {
+    if (rhs < lhs) return rhs;
+    return lhs;
+  };
+  auto waveMin = [](float lane) { return __ockl_wfred_min_f32(lane); };
+
+  return __reduce_op_sync(mask, val, smaller, waveMin);
+}
+
+// Returns the smallest `val` among the lanes selected by `mask` (double-precision overload).
+// The left operand is kept whenever `rhs < lhs` is false.
+template <typename MaskT> __device__ inline double __reduce_min_sync(MaskT mask, double val) {
+  auto smaller = [](double lhs, double rhs) {
+    if (rhs < lhs) return rhs;
+    return lhs;
+  };
+  auto waveMin = [](double lane) { return __ockl_wfred_min_f64(lane); };
+
+  return __reduce_op_sync(mask, val, smaller, waveMin);
+}
+
+// Returns the largest `val` among the lanes selected by `mask` (signed 64-bit overload).
+template <typename MaskT> __device__ inline long long __reduce_max_sync(MaskT mask, long long val) {
+  auto larger = [](long long lhs, long long rhs) {
+    if (lhs < rhs) return rhs;
+    return lhs;
+  };
+  auto waveMax = [](long long lane) { return __ockl_wfred_max_i64(lane); };
+
+  return __reduce_op_sync(mask, val, larger, waveMax);
+}
+
+// Returns the largest `val` among the lanes selected by `mask` (unsigned 64-bit overload).
+template <typename MaskT>
+__device__ inline unsigned long long __reduce_max_sync(MaskT mask, unsigned long long val) {
+  auto larger = [](unsigned long long lhs, unsigned long long rhs) {
+    if (lhs < rhs) return rhs;
+    return lhs;
+  };
+  auto waveMax = [](unsigned long long lane) { return __ockl_wfred_max_u64(lane); };
+
+  return __reduce_op_sync(mask, val, larger, waveMax);
+}
+
+// Returns the largest `val` among the lanes selected by `mask` (single-precision overload).
+// The left operand is kept whenever `lhs < rhs` is false.
+template <typename MaskT> __device__ inline float __reduce_max_sync(MaskT mask, float val) {
+  auto larger = [](float lhs, float rhs) {
+    if (lhs < rhs) return rhs;
+    return lhs;
+  };
+  auto waveMax = [](float lane) { return __ockl_wfred_max_f32(lane); };
+
+  return __reduce_op_sync(mask, val, larger, waveMax);
+}
+
+// Returns the largest `val` among the lanes selected by `mask` (double-precision overload).
+// The left operand is kept whenever `lhs < rhs` is false.
+template <typename MaskT> __device__ inline double __reduce_max_sync(MaskT mask, double val) {
+  auto larger = [](double lhs, double rhs) {
+    if (lhs < rhs) return rhs;
+    return lhs;
+  };
+  auto waveMax = [](double lane) { return __ockl_wfred_max_f64(lane); };
+
+  return __reduce_op_sync(mask, val, larger, waveMax);
+}
+
+// ANDs `val` together over the lanes selected by `mask` (signed 32-bit overload).
+//
+// All overloads below combine with the bitwise `&` so that the reduction-tree path of
+// __reduce_op_sync yields the same kind of result as the __ockl_wfred_and_* fast path; a logical
+// `&&` would collapse the result to 0/1 whenever the mask has holes or the code is built without
+// optimization.
+template <typename MaskT> __device__ inline int __reduce_and_sync(MaskT mask, int val) {
+  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs & rhs; };
+  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_and_i32(v); };
+
+  return __reduce_op_sync(mask, val, op, wfReduce);
+}
+
+// ANDs `val` together over the lanes selected by `mask` (signed 64-bit overload).
+template <typename MaskT> __device__ inline long long __reduce_and_sync(MaskT mask, long long val) {
+  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs & rhs; };
+  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_and_i64(v); };
+
+  return __reduce_op_sync(mask, val, op, wfReduce);
+}
+
+// ANDs `val` together over the lanes selected by `mask` (unsigned 64-bit overload).
+template <typename MaskT>
+__device__ inline unsigned long long __reduce_and_sync(MaskT mask, unsigned long long val) {
+  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs & rhs; };
+  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_and_u64(v); };
+
+  return __reduce_op_sync(mask, val, op, wfReduce);
+}
+
+// ORs `val` together over the lanes selected by `mask` (signed 32-bit overload).
+//
+// All overloads below combine with the bitwise `|` so that the reduction-tree path of
+// __reduce_op_sync yields the same kind of result as the __ockl_wfred_or_* fast path; a logical
+// `||` would collapse the result to 0/1 whenever the mask has holes or the code is built without
+// optimization.
+template <typename MaskT> __device__ inline int __reduce_or_sync(MaskT mask, int val) {
+  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs | rhs; };
+  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_or_i32(v); };
+
+  return __reduce_op_sync(mask, val, op, wfReduce);
+}
+
+// ORs `val` together over the lanes selected by `mask` (signed 64-bit overload).
+template <typename MaskT> __device__ inline long long __reduce_or_sync(MaskT mask, long long val) {
+  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs | rhs; };
+  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_or_i64(v); };
+
+  return __reduce_op_sync(mask, val, op, wfReduce);
+}
+
+// ORs `val` together over the lanes selected by `mask` (unsigned 64-bit overload).
+template <typename MaskT>
+__device__ inline unsigned long long __reduce_or_sync(MaskT mask, unsigned long long val) {
+  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs | rhs; };
+  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_or_u64(v); };
+
+  return __reduce_op_sync(mask, val, op, wfReduce);
+}
+
+// XORs `val` together over the lanes selected by `mask` (signed 32-bit overload).
+//
+// All overloads below combine with the bitwise `^` so that the reduction-tree path of
+// __reduce_op_sync yields the same kind of result as the __ockl_wfred_xor_* fast path; a logical
+// exclusive-or would collapse the result to 0/1 whenever the mask has holes or the code is built
+// without optimization.
+template <typename MaskT> __device__ inline int __reduce_xor_sync(MaskT mask, int val) {
+  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs ^ rhs; };
+  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_xor_i32(v); };
+
+  return __reduce_op_sync(mask, val, op, wfReduce);
+}
+
+// XORs `val` together over the lanes selected by `mask` (signed 64-bit overload).
+template <typename MaskT> __device__ inline long long __reduce_xor_sync(MaskT mask, long long val) {
+  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs ^ rhs; };
+  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_xor_i64(v); };
+
+  return __reduce_op_sync(mask, val, op, wfReduce);
+}
+
+// XORs `val` together over the lanes selected by `mask` (unsigned 64-bit overload).
+template <typename MaskT>
+__device__ inline unsigned long long __reduce_xor_sync(MaskT mask, unsigned long long val) {
+  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs ^ rhs; };
+  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_xor_u64(v); };
+
+  return __reduce_op_sync(mask, val, op, wfReduce);
+}
+
+#endif  // HIP_ENABLE_EXTRA_WARP_SYNC_TYPES
+
+// Drop the helper macros unconditionally. They are used by the functions above regardless of
+// HIP_ENABLE_EXTRA_WARP_SYNC_TYPES, so removing them only inside that conditional would leak
+// them into the including translation unit whenever the extra types are disabled.
+#undef __hip_do_sync
+#undef __hip_check_mask
+#undef __hip_adjust_mask_for_wave32
+
+#endif  // HIP_DISABLE_WARP_SYNC_BUILTINS
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/device_library_decls.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/device_library_decls.h
new file mode 100644
index 0000000000000000000000000000000000000000..33623f5881d7bbc784bcba28a67158efa58b1cc2
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/device_library_decls.h
@@ -0,0 +1,135 @@
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ *  @file  amd_detail/device_library_decls.h
+ *  @brief Contains declarations for types and functions in device library.
+ *         Uses __hip_int64_t and __hip_uint64_t instead of long, long long, unsigned
+ *         long and unsigned long long types for device library API
+ *         declarations.
+ */
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_LIBRARY_DECLS_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_LIBRARY_DECLS_H
+
+#if !defined(__HIPCC_RTC__)
+#include "hip/amd_detail/host_defines.h"
+#if __cplusplus
+#include <cstdint>
+#else
+#include <stdint.h>
+#endif
+#endif
+
+// Short aliases for the unsigned integer types, used by the declarations in this header.
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+typedef unsigned long ulong;
+typedef unsigned long long ullong;
+
+// Declarations of device-library entry points. Only the prototypes are given here; the
+// definitions are expected to come from the device library at link time. Functions marked
+// `const` promise a result that depends only on their arguments; functions marked `convergent`
+// must not have their call sites made control-dependent on additional conditions.
+
+// Wavefront-level queries (presumably any/all votes and the active-lane index, going by the
+// names -- confirm against the device-library documentation).
+extern "C" __device__ __attribute__((const)) bool __ockl_wfany_i32(int);
+extern "C" __device__ __attribute__((const)) bool __ockl_wfall_i32(int);
+extern "C" __device__ uint __ockl_activelane_u32(void);
+
+// 32-bit integer arithmetic helpers.
+extern "C" __device__ __attribute__((const)) uint __ockl_mul24_u32(uint, uint);
+extern "C" __device__ __attribute__((const)) int __ockl_mul24_i32(int, int);
+extern "C" __device__ __attribute__((const)) uint __ockl_mul_hi_u32(uint, uint);
+extern "C" __device__ __attribute__((const)) int __ockl_mul_hi_i32(int, int);
+extern "C" __device__ __attribute__((const)) uint __ockl_sadd_u32(uint, uint, uint);
+
+// `clz` helpers, one per operand width (8, 16, 32 and 64 bits).
+extern "C" __device__ __attribute__((const)) uchar __ockl_clz_u8(uchar);
+extern "C" __device__ __attribute__((const)) ushort __ockl_clz_u16(ushort);
+extern "C" __device__ __attribute__((const)) uint __ockl_clz_u32(uint);
+extern "C" __device__ __attribute__((const)) __hip_uint64_t __ockl_clz_u64(__hip_uint64_t);
+
+// Single-precision min/max.
+extern "C" __device__ __attribute__((const)) float __ocml_fmin_f32(float, float);
+extern "C" __device__ __attribute__((const)) float __ocml_fmax_f32(float, float);
+
+// Conversions. The name encodes <destination>_<source>; the rtn/rtp/rtz suffixes presumably
+// select the rounding mode (toward negative infinity / positive infinity / zero) -- confirm
+// against the device-library documentation.
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_f64(double);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_f64(double);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_f64(double);
+
+extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
+extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
+extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
+
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s32(int);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s32(int);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s32(int);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u32(__hip_uint32_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u32(__hip_uint32_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u32(__hip_uint32_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s64(__hip_int64_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s64(__hip_int64_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s64(__hip_int64_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u64(__hip_uint64_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u64(__hip_uint64_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u64(__hip_uint64_t);
+extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_s64(__hip_int64_t);
+extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_s64(__hip_int64_t);
+extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_s64(__hip_int64_t);
+extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_u64(__hip_uint64_t);
+extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_u64(__hip_uint64_t);
+extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_u64(__hip_uint64_t);
+
+// GWS init/barrier entry points; both are convergent.
+extern "C" __device__ __attribute__((convergent)) void __ockl_gws_init(uint nwm1, uint rid);
+extern "C" __device__ __attribute__((convergent)) void __ockl_gws_barrier(uint nwm1, uint rid);
+
+// Lane id plus grid and multi-grid queries and synchronisation.
+extern "C" __device__ __attribute__((const)) __hip_uint32_t __ockl_lane_u32();
+extern "C" __device__ __attribute__((const)) int __ockl_grid_is_valid(void);
+extern "C" __device__ __attribute__((convergent)) void __ockl_grid_sync(void);
+extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_num_grids(void);
+extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_grid_rank(void);
+extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_size(void);
+extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(void);
+extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void);
+extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void);
+
+// Float atomic add that returns no value.
+extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float);
+
+// `wgred` reductions (add / and / or) on int; all are convergent.
+extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_add_i32(int a);
+extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_and_i32(int a);
+extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_or_i32(int a);
+
+// Device-side fprintf(stderr, ...) building blocks: begin a message, then append arguments or
+// strings; each call returns a 64-bit descriptor that is passed to the next call, and `is_last`
+// marks the final append.
+extern "C" __device__ __hip_uint64_t __ockl_fprintf_stderr_begin();
+extern "C" __device__ __hip_uint64_t __ockl_fprintf_append_args(
+    __hip_uint64_t msg_desc, __hip_uint32_t num_args, __hip_uint64_t value0, __hip_uint64_t value1,
+    __hip_uint64_t value2, __hip_uint64_t value3, __hip_uint64_t value4, __hip_uint64_t value5,
+    __hip_uint64_t value6, __hip_uint32_t is_last);
+extern "C" __device__ __hip_uint64_t __ockl_fprintf_append_string_n(__hip_uint64_t msg_desc,
+                                                                    const char* data,
+                                                                    __hip_uint64_t length,
+                                                                    __hip_uint32_t is_last);
+
+// Introduce local address space: `__local` qualifies a pointer as pointing into address space 3.
+#define __local __attribute__((address_space(3)))
+
+#ifdef __HIP_DEVICE_COMPILE__
+// Reinterprets the unsigned integer `x` as a pointer into the `__local` address space.
+// Only available when compiling device code.
+__device__ inline static __local void* __to_local(unsigned x) { return (__local void*)x; }
+#endif  //__HIP_DEVICE_COMPILE__
+
+// Using hip.amdgcn.bc - sync threads
+// Bit flags (0x01 local, 0x02 global) that can be combined into a __cl_mem_fence_flags value.
+#define __CLK_LOCAL_MEM_FENCE  0x01
+#define __CLK_GLOBAL_MEM_FENCE 0x02
+typedef unsigned __cl_mem_fence_flags;
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/hip_assert.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/hip_assert.h
new file mode 100644
index 0000000000000000000000000000000000000000..00ed9efa389c30fb6d77b4f6dfb42b7d11f4d88e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/hip_assert.h
@@ -0,0 +1,95 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#if defined(__clang__) and defined(__HIP__)
+
+// Device-side abort(): executes a trap. Declared weak and inline so that the definition can be
+// included in every compilation unit.
+extern "C" __device__ inline __attribute__((weak)) void abort() { __builtin_trap(); }
+
+// The noinline attribute helps encapsulate the printf expansion,
+// which otherwise has a performance impact just by increasing the
+// size of the calling function. Additionally, the weak attribute
+// allows the function to exist as a global although its definition is
+// included in every compilation unit.
+#if defined(_WIN32) || defined(_WIN64)
+// Windows assertion hook. The message, file and line arguments are currently ignored and the
+// function traps immediately.
+extern "C" __device__ __attribute__((noinline)) __attribute__((weak)) void _wassert(
+    const wchar_t* _msg, const wchar_t* _file, unsigned _line) {
+  // FIXME: Need `wchar_t` support to generate assertion message.
+  __builtin_trap();
+}
+#else /* defined(_WIN32) || defined(_WIN64) */
+// Non-Windows assertion hook: prints "<file>:<line>: <function>: Device-side assertion
+// `<assertion>' failed." through the device-side fprintf helpers and then traps.
+extern "C" __device__ __attribute__((noinline)) __attribute__((weak)) void __assert_fail(
+    const char* assertion, const char* file, unsigned int line, const char* function) {
+  const char fmt[] = "%s:%u: %s: Device-side assertion `%s' failed.\n";
+
+  // strlen is not available as a built-in yet, so the length is computed by a small local
+  // helper. With a string literal argument, the compiler usually manages to replace the loop
+  // with a constant.
+  //
+  // The helper does not check for a null pointer, since all the string arguments are defined
+  // to be constant literals when called from the assert() macro.
+  //
+  // NOTE: The returned length includes the null terminator, as required by
+  // append_string_n().
+  auto lengthWithTerminator = [](const char* text) -> int {
+    const char* cursor = text;
+    while (*cursor++) {
+    }
+    return cursor - text;
+  };
+
+  // The message is built piecewise: format string first, then each argument in format order.
+  auto msg = __ockl_fprintf_stderr_begin();
+
+  int fmtLength = lengthWithTerminator(fmt);
+  msg = __ockl_fprintf_append_string_n(msg, fmt, fmtLength, 0);
+
+  int fileLength = lengthWithTerminator(file);
+  msg = __ockl_fprintf_append_string_n(msg, file, fileLength, 0);
+
+  msg = __ockl_fprintf_append_args(msg, 1, line, 0, 0, 0, 0, 0, 0, 0);
+
+  int functionLength = lengthWithTerminator(function);
+  msg = __ockl_fprintf_append_string_n(msg, function, functionLength, 0);
+
+  int assertionLength = lengthWithTerminator(assertion);
+  __ockl_fprintf_append_string_n(msg, assertion, assertionLength, /* is_last = */ 1);
+
+  __builtin_trap();
+}
+
+// Argument-less assertion hook: emits no message and simply traps.
+extern "C" __device__ __attribute__((noinline)) __attribute__((weak)) void __assertfail() {
+  // ignore all the args for now.
+  __builtin_trap();
+}
+#endif /* defined(_WIN32) || defined(_WIN64) */
+
+// __hip_assert(COND): expands to nothing when NDEBUG is defined (COND is then not evaluated);
+// otherwise it traps when COND evaluates to false. No message is printed.
+#if defined(NDEBUG)
+#define __hip_assert(COND)
+#else
+#define __hip_assert(COND)                                                                         \
+  do {                                                                                             \
+    if (!(COND)) __builtin_trap();                                                                 \
+  } while (0)
+#endif
+
+#endif  // defined(__clang__) and defined(__HIP__)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..46759c8a5afdfb79bd7c57ec233e5b78f397e17d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h
@@ -0,0 +1,93 @@
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+// /*
+// Half Math Functions
+// */
+#if !defined(__HIPCC_RTC__)
+#include "host_defines.h"
+#endif
+#ifndef __CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+// Forward declarations of the half-precision math entry points. They are only emitted when the
+// clang HIP runtime wrapper has not already been included.
+extern "C" {
+// Scalar _Float16 functions.
+__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
+__device__ _Float16 __ocml_cos_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_fma_f16(_Float16, _Float16, _Float16);
+__device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
+__device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
+__device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int);
+__device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
+__device__ _Float16 __ocml_sin_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
+__device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
+
+// Two-element vector types: a pair of _Float16 values and a pair of shorts.
+typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
+typedef short __2i16 __attribute__((ext_vector_type(2)));
+
+#if defined(__clang__) && defined(__HIP__)
+__device__ __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b, float c, bool s);
+#endif
+
+// Two-element vector counterparts of the scalar functions above; the isinf/isnan variants
+// return a __2i16.
+__device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
+__device__ __2f16 __ocml_cos_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16);
+__device__ __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16);
+__device__ __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
+__device__ __2f16 __ocml_sin_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
+
+// float -> _Float16 conversions (rtn / rtp / rtz variants).
+__device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
+__device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
+__device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
+}
+#endif  // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+// TODO: remove these after they get into the clang header __clang_hip_libdevice_declares.h.
+// These declarations are emitted unconditionally, so they are visible even when the clang HIP
+// runtime wrapper is in use and the guarded declarations earlier in this file are skipped.
+extern "C" {
+__device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
+__device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
+__device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
+__device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
+__device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
+}
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/hip_ldg.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/hip_ldg.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce1fb51f464c4d8c9be1baa52512e356ba01a216
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/hip_ldg.h
@@ -0,0 +1,100 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_LDG_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_LDG_H
+
+#if __HIP_CLANG_ONLY__
+#include "amd_hip_vector_types.h"
+#include "host_defines.h"
+// __ldg overloads: every variant below simply returns the value that ptr points to.
+__device__ static inline char __ldg(const char* ptr) { return ptr[0]; }
+
+__device__ static inline char2 __ldg(const char2* ptr) { return ptr[0]; }
+
+__device__ static inline char4 __ldg(const char4* ptr) { return ptr[0]; }
+
+__device__ static inline signed char __ldg(const signed char* ptr) { return *ptr; }
+
+__device__ static inline unsigned char __ldg(const unsigned char* ptr) { return *ptr; }
+
+// Overloads for short, short2, short4 and unsigned short.
+__device__ static inline short __ldg(const short* ptr) { return *ptr; }
+
+__device__ static inline short2 __ldg(const short2* ptr) { return *ptr; }
+
+__device__ static inline short4 __ldg(const short4* ptr) { return *ptr; }
+
+__device__ static inline unsigned short __ldg(const unsigned short* ptr) { return *ptr; }
+
+// Overloads for int, int2, int4 and unsigned int.
+__device__ static inline int __ldg(const int* ptr) { return *ptr; }
+
+__device__ static inline int2 __ldg(const int2* ptr) { return *ptr; }
+
+__device__ static inline int4 __ldg(const int4* ptr) { return *ptr; }
+
+__device__ static inline unsigned int __ldg(const unsigned int* ptr) { return *ptr; }
+
+// Overloads for long and unsigned long.
+__device__ static inline long __ldg(const long* ptr) { return *ptr; }
+
+__device__ static inline unsigned long __ldg(const unsigned long* ptr) { return *ptr; }
+
+// Overloads for long long, longlong2 and unsigned long long.
+__device__ static inline long long __ldg(const long long* ptr) { return *ptr; }
+
+__device__ static inline longlong2 __ldg(const longlong2* ptr) { return *ptr; }
+
+__device__ static inline unsigned long long __ldg(const unsigned long long* ptr) { return *ptr; }
+
+// Overloads for uchar2 and uchar4.
+__device__ static inline uchar2 __ldg(const uchar2* ptr) { return *ptr; }
+
+__device__ static inline uchar4 __ldg(const uchar4* ptr) { return *ptr; }
+
+// Overload for ushort2.
+__device__ static inline ushort2 __ldg(const ushort2* ptr) { return *ptr; }
+
+// Overloads for uint2 and uint4.
+__device__ static inline uint2 __ldg(const uint2* ptr) { return *ptr; }
+
+__device__ static inline uint4 __ldg(const uint4* ptr) { return *ptr; }
+
+// Overload for ulonglong2.
+__device__ static inline ulonglong2 __ldg(const ulonglong2* ptr) { return *ptr; }
+
+// Overloads for float, float2 and float4.
+__device__ static inline float __ldg(const float* ptr) { return *ptr; }
+
+__device__ static inline float2 __ldg(const float2* ptr) { return *ptr; }
+
+__device__ static inline float4 __ldg(const float4* ptr) { return *ptr; }
+
+// Overloads for double and double2.
+__device__ static inline double __ldg(const double* ptr) { return *ptr; }
+
+__device__ static inline double2 __ldg(const double2* ptr) { return *ptr; }
+
+#endif  // __HIP_CLANG_ONLY__
+
+#endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_LDG_H
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/hip_prof_str.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/hip_prof_str.h
new file mode 100644
index 0000000000000000000000000000000000000000..13d85b3dd00fb8227d714ca31310b4a5e937bbd3
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/hip_prof_str.h
@@ -0,0 +1,11944 @@
+// Generated file. DO NOT EDIT.
+//
+// This file is automatically generated by the hip_prof_gen.py script.
+// If changes are required, run the script and commit the updated file.
+
+#ifndef _HIP_PROF_STR_H
+#define _HIP_PROF_STR_H
+#define HIP_PROF_VER 1
+
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_deprecated.h>
+#include "amd_hip_gl_interop.h"
+// Two-step token pasting: going through the helper lets macro arguments expand before ## joins them.
+#define HIP_API_ID_CONCAT_HELPER(a,b) a##b
+#define HIP_API_ID_CONCAT(a,b) HIP_API_ID_CONCAT_HELPER(a,b)
+
+// HIP API callback ID enumeration: numbered entries from HIP_API_ID_FIRST to HIP_API_ID_LAST, followed by aliases.
+enum hip_api_id_t {
+  HIP_API_ID_NONE = 0,  // also the value given to the HIP_API_ID_NONE aliases at the end of this enum
+  HIP_API_ID_FIRST = 1,  // same value as the first numbered entry on the next line
+  HIP_API_ID___hipPopCallConfiguration = 1,
+  HIP_API_ID___hipPushCallConfiguration = 2,
+  HIP_API_ID_hipArray3DCreate = 3,
+  HIP_API_ID_hipArrayCreate = 4,
+  HIP_API_ID_hipArrayDestroy = 5,
+  HIP_API_ID_hipChooseDeviceR0000 = 6,
+  HIP_API_ID_hipConfigureCall = 7,
+  HIP_API_ID_hipCtxCreate = 8,
+  HIP_API_ID_hipCtxDestroy = 9,
+  HIP_API_ID_hipCtxDisablePeerAccess = 10,
+  HIP_API_ID_hipCtxEnablePeerAccess = 11,
+  HIP_API_ID_hipCtxGetApiVersion = 12,
+  HIP_API_ID_hipCtxGetCacheConfig = 13,
+  HIP_API_ID_hipCtxGetCurrent = 14,
+  HIP_API_ID_hipCtxGetDevice = 15,
+  HIP_API_ID_hipCtxGetFlags = 16,
+  HIP_API_ID_hipCtxGetSharedMemConfig = 17,
+  HIP_API_ID_hipCtxPopCurrent = 18,
+  HIP_API_ID_hipCtxPushCurrent = 19,
+  HIP_API_ID_hipCtxSetCacheConfig = 20,
+  HIP_API_ID_hipCtxSetCurrent = 21,
+  HIP_API_ID_hipCtxSetSharedMemConfig = 22,
+  HIP_API_ID_hipCtxSynchronize = 23,
+  HIP_API_ID_hipDestroyExternalMemory = 24,
+  HIP_API_ID_hipDestroyExternalSemaphore = 25,
+  HIP_API_ID_hipDeviceCanAccessPeer = 26,
+  HIP_API_ID_hipDeviceComputeCapability = 27,
+  HIP_API_ID_hipDeviceDisablePeerAccess = 28,
+  HIP_API_ID_hipDeviceEnablePeerAccess = 29,
+  HIP_API_ID_hipDeviceGet = 30,
+  HIP_API_ID_hipDeviceGetAttribute = 31,
+  HIP_API_ID_hipDeviceGetByPCIBusId = 32,
+  HIP_API_ID_hipDeviceGetCacheConfig = 33,
+  HIP_API_ID_hipDeviceGetLimit = 34,
+  HIP_API_ID_hipDeviceGetName = 35,
+  HIP_API_ID_hipDeviceGetP2PAttribute = 36,
+  HIP_API_ID_hipDeviceGetPCIBusId = 37,
+  HIP_API_ID_hipDeviceGetSharedMemConfig = 38,
+  HIP_API_ID_hipDeviceGetStreamPriorityRange = 39,
+  HIP_API_ID_hipDevicePrimaryCtxGetState = 40,
+  HIP_API_ID_hipDevicePrimaryCtxRelease = 41,
+  HIP_API_ID_hipDevicePrimaryCtxReset = 42,
+  HIP_API_ID_hipDevicePrimaryCtxRetain = 43,
+  HIP_API_ID_hipDevicePrimaryCtxSetFlags = 44,
+  HIP_API_ID_hipDeviceReset = 45,
+  HIP_API_ID_hipDeviceSynchronize = 48,
+  HIP_API_ID_hipDeviceSetCacheConfig = 46,
+  HIP_API_ID_hipDeviceSetSharedMemConfig = 47,
+  HIP_API_ID_hipDeviceTotalMem = 49,
+  HIP_API_ID_RESERVED_50 = 50,  // RESERVED_<n> entries occupy ID n without naming an API (presumably retired or unassigned - confirm)
+  HIP_API_ID_hipDrvMemcpy2DUnaligned = 51,
+  HIP_API_ID_hipDrvMemcpy3D = 52,
+  HIP_API_ID_hipDrvMemcpy3DAsync = 53,
+  HIP_API_ID_hipEventCreate = 54,
+  HIP_API_ID_hipEventCreateWithFlags = 55,
+  HIP_API_ID_hipEventDestroy = 56,
+  HIP_API_ID_hipEventElapsedTime = 57,
+  HIP_API_ID_hipEventQuery = 58,
+  HIP_API_ID_hipEventRecord = 59,
+  HIP_API_ID_hipEventSynchronize = 60,
+  HIP_API_ID_hipExtGetLinkTypeAndHopCount = 61,
+  HIP_API_ID_hipExtLaunchKernel = 62,
+  HIP_API_ID_hipExtLaunchMultiKernelMultiDevice = 63,
+  HIP_API_ID_hipExtMallocWithFlags = 64,
+  HIP_API_ID_hipExtModuleLaunchKernel = 65,
+  HIP_API_ID_hipExtStreamCreateWithCUMask = 66,
+  HIP_API_ID_hipExtStreamGetCUMask = 67,
+  HIP_API_ID_hipExternalMemoryGetMappedBuffer = 68,
+  HIP_API_ID_hipFree = 69,
+  HIP_API_ID_hipFreeArray = 70,
+  HIP_API_ID_hipFreeHost = 71,
+  HIP_API_ID_hipFreeMipmappedArray = 72,
+  HIP_API_ID_hipFuncGetAttribute = 73,
+  HIP_API_ID_hipFuncGetAttributes = 74,
+  HIP_API_ID_hipFuncSetAttribute = 75,
+  HIP_API_ID_hipFuncSetCacheConfig = 76,
+  HIP_API_ID_hipFuncSetSharedMemConfig = 77,
+  HIP_API_ID_hipGetDevice = 78,
+  HIP_API_ID_hipGetDeviceCount = 79,
+  HIP_API_ID_hipGetDeviceFlags = 80,
+  HIP_API_ID_hipGetDevicePropertiesR0000 = 81,
+  HIP_API_ID_RESERVED_82 = 82,
+  HIP_API_ID_RESERVED_83 = 83,
+  HIP_API_ID_hipGetLastError = 84,
+  HIP_API_ID_hipGetMipmappedArrayLevel = 85,
+  HIP_API_ID_hipGetSymbolAddress = 86,
+  HIP_API_ID_hipGetSymbolSize = 87,
+  HIP_API_ID_hipHccModuleLaunchKernel = 88,
+  HIP_API_ID_hipHostAlloc = 89,
+  HIP_API_ID_hipHostFree = 90,
+  HIP_API_ID_hipHostGetDevicePointer = 91,
+  HIP_API_ID_hipHostGetFlags = 92,
+  HIP_API_ID_hipHostMalloc = 93,
+  HIP_API_ID_hipHostRegister = 94,
+  HIP_API_ID_hipHostUnregister = 95,
+  HIP_API_ID_hipImportExternalMemory = 96,
+  HIP_API_ID_hipImportExternalSemaphore = 97,
+  HIP_API_ID_hipInit = 98,
+  HIP_API_ID_hipIpcCloseMemHandle = 99,
+  HIP_API_ID_hipIpcGetEventHandle = 100,
+  HIP_API_ID_hipIpcGetMemHandle = 101,
+  HIP_API_ID_hipIpcOpenEventHandle = 102,
+  HIP_API_ID_hipIpcOpenMemHandle = 103,
+  HIP_API_ID_hipLaunchByPtr = 104,
+  HIP_API_ID_hipLaunchCooperativeKernel = 105,
+  HIP_API_ID_hipLaunchCooperativeKernelMultiDevice = 106,
+  HIP_API_ID_hipLaunchKernel = 107,
+  HIP_API_ID_hipMalloc = 108,
+  HIP_API_ID_hipMalloc3D = 109,
+  HIP_API_ID_hipMalloc3DArray = 110,
+  HIP_API_ID_hipMallocArray = 111,
+  HIP_API_ID_hipMallocHost = 112,
+  HIP_API_ID_hipMallocManaged = 113,
+  HIP_API_ID_hipMallocMipmappedArray = 114,
+  HIP_API_ID_hipMallocPitch = 115,
+  HIP_API_ID_hipMemAdvise = 116,
+  HIP_API_ID_hipMemAllocHost = 117,
+  HIP_API_ID_hipMemAllocPitch = 118,
+  HIP_API_ID_hipMemGetAddressRange = 119,
+  HIP_API_ID_hipMemGetInfo = 120,
+  HIP_API_ID_hipMemPrefetchAsync = 121,
+  HIP_API_ID_hipMemPtrGetInfo = 122,
+  HIP_API_ID_hipMemRangeGetAttribute = 123,
+  HIP_API_ID_hipMemRangeGetAttributes = 124,
+  HIP_API_ID_hipMemcpy = 125,
+  HIP_API_ID_hipMemcpy2D = 126,
+  HIP_API_ID_hipMemcpy2DAsync = 127,
+  HIP_API_ID_hipMemcpy2DFromArray = 128,
+  HIP_API_ID_hipMemcpy2DFromArrayAsync = 129,
+  HIP_API_ID_hipMemcpy2DToArray = 130,
+  HIP_API_ID_hipMemcpy2DToArrayAsync = 131,
+  HIP_API_ID_hipMemcpy3D = 132,
+  HIP_API_ID_hipMemcpy3DAsync = 133,
+  HIP_API_ID_hipMemcpyAsync = 134,
+  HIP_API_ID_hipMemcpyAtoH = 135,
+  HIP_API_ID_hipMemcpyDtoD = 136,
+  HIP_API_ID_hipMemcpyDtoDAsync = 137,
+  HIP_API_ID_hipMemcpyDtoH = 138,
+  HIP_API_ID_hipMemcpyDtoHAsync = 139,
+  HIP_API_ID_hipMemcpyFromArray = 140,
+  HIP_API_ID_hipMemcpyFromSymbol = 141,
+  HIP_API_ID_hipMemcpyFromSymbolAsync = 142,
+  HIP_API_ID_hipMemcpyHtoA = 143,
+  HIP_API_ID_hipMemcpyHtoD = 144,
+  HIP_API_ID_hipMemcpyHtoDAsync = 145,
+  HIP_API_ID_hipMemcpyParam2D = 146,
+  HIP_API_ID_hipMemcpyParam2DAsync = 147,
+  HIP_API_ID_hipMemcpyPeer = 148,
+  HIP_API_ID_hipMemcpyPeerAsync = 149,
+  HIP_API_ID_hipMemcpyToArray = 150,
+  HIP_API_ID_hipMemcpyToSymbol = 151,
+  HIP_API_ID_hipMemcpyToSymbolAsync = 152,
+  HIP_API_ID_hipMemcpyWithStream = 153,
+  HIP_API_ID_hipMemset = 154,
+  HIP_API_ID_hipMemset2D = 155,
+  HIP_API_ID_hipMemset2DAsync = 156,
+  HIP_API_ID_hipMemset3D = 157,
+  HIP_API_ID_hipMemset3DAsync = 158,
+  HIP_API_ID_hipMemsetAsync = 159,
+  HIP_API_ID_hipMemsetD16 = 160,
+  HIP_API_ID_hipMemsetD16Async = 161,
+  HIP_API_ID_hipMemsetD32 = 162,
+  HIP_API_ID_hipMemsetD32Async = 163,
+  HIP_API_ID_hipMemsetD8 = 164,
+  HIP_API_ID_hipMemsetD8Async = 165,
+  HIP_API_ID_hipModuleGetFunction = 166,
+  HIP_API_ID_hipModuleGetGlobal = 167,
+  HIP_API_ID_hipModuleGetTexRef = 168,
+  HIP_API_ID_hipModuleLaunchKernel = 169,
+  HIP_API_ID_hipModuleLoad = 170,
+  HIP_API_ID_hipModuleLoadData = 171,
+  HIP_API_ID_hipModuleLoadDataEx = 172,
+  HIP_API_ID_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor = 173,
+  HIP_API_ID_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags = 174,
+  HIP_API_ID_hipModuleOccupancyMaxPotentialBlockSize = 175,
+  HIP_API_ID_hipModuleOccupancyMaxPotentialBlockSizeWithFlags = 176,
+  HIP_API_ID_hipModuleUnload = 177,
+  HIP_API_ID_hipOccupancyMaxActiveBlocksPerMultiprocessor = 178,
+  HIP_API_ID_hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags = 179,
+  HIP_API_ID_hipOccupancyMaxPotentialBlockSize = 180,
+  HIP_API_ID_hipPeekAtLastError = 181,
+  HIP_API_ID_hipPointerGetAttributes = 182,
+  HIP_API_ID_hipProfilerStart = 183,
+  HIP_API_ID_hipProfilerStop = 184,
+  HIP_API_ID_RESERVED_185 = 185,
+  HIP_API_ID_hipSetDevice = 186,
+  HIP_API_ID_hipSetDeviceFlags = 187,
+  HIP_API_ID_hipSetupArgument = 188,
+  HIP_API_ID_hipSignalExternalSemaphoresAsync = 189,
+  HIP_API_ID_hipStreamAddCallback = 190,
+  HIP_API_ID_hipStreamAttachMemAsync = 191,
+  HIP_API_ID_hipStreamCreate = 192,
+  HIP_API_ID_hipStreamCreateWithFlags = 193,
+  HIP_API_ID_hipStreamCreateWithPriority = 194,
+  HIP_API_ID_hipStreamDestroy = 195,
+  HIP_API_ID_hipStreamGetFlags = 196,
+  HIP_API_ID_hipStreamGetPriority = 197,
+  HIP_API_ID_hipStreamQuery = 198,
+  HIP_API_ID_hipStreamSynchronize = 199,
+  HIP_API_ID_hipStreamWaitEvent = 200,
+  HIP_API_ID_hipStreamWaitValue32 = 201,
+  HIP_API_ID_hipStreamWaitValue64 = 202,
+  HIP_API_ID_hipStreamWriteValue32 = 203,
+  HIP_API_ID_hipStreamWriteValue64 = 204,
+  HIP_API_ID_hipWaitExternalSemaphoresAsync = 205,
+  HIP_API_ID_hipCreateSurfaceObject = 206,
+  HIP_API_ID_hipDestroySurfaceObject = 207,
+  HIP_API_ID_hipGraphAddKernelNode = 208,
+  HIP_API_ID_hipGraphAddMemcpyNode = 209,
+  HIP_API_ID_hipGraphAddMemsetNode = 210,
+  HIP_API_ID_hipGraphCreate = 211,
+  HIP_API_ID_hipGraphDestroy = 212,
+  HIP_API_ID_hipGraphExecDestroy = 213,
+  HIP_API_ID_hipGraphInstantiate = 214,
+  HIP_API_ID_hipGraphLaunch = 215,
+  HIP_API_ID_hipMipmappedArrayCreate = 216,
+  HIP_API_ID_hipMipmappedArrayDestroy = 217,
+  HIP_API_ID_hipMipmappedArrayGetLevel = 218,
+  HIP_API_ID_hipStreamBeginCapture = 219,
+  HIP_API_ID_hipStreamEndCapture = 220,
+  HIP_API_ID_hipTexRefGetAddress = 221,
+  HIP_API_ID_hipTexRefGetFlags = 222,
+  HIP_API_ID_hipTexRefGetFormat = 223,
+  HIP_API_ID_hipTexRefGetMaxAnisotropy = 224,
+  HIP_API_ID_hipTexRefGetMipMappedArray = 225,
+  HIP_API_ID_hipTexRefGetMipmapLevelBias = 226,
+  HIP_API_ID_hipTexRefGetMipmapLevelClamp = 227,
+  HIP_API_ID_hipTexRefSetAddress = 228,
+  HIP_API_ID_hipTexRefSetAddress2D = 229,
+  HIP_API_ID_hipTexRefSetBorderColor = 230,
+  HIP_API_ID_hipTexRefSetFormat = 231,
+  HIP_API_ID_hipTexRefSetMaxAnisotropy = 232,
+  HIP_API_ID_hipTexRefSetMipmapLevelClamp = 233,
+  HIP_API_ID_hipTexRefSetMipmappedArray = 234,
+  HIP_API_ID_hipGLGetDevices = 235,
+  HIP_API_ID_hipGraphAddDependencies = 236,
+  HIP_API_ID_hipGraphAddEmptyNode = 237,
+  HIP_API_ID_hipGraphExecKernelNodeSetParams = 238,
+  HIP_API_ID_hipGraphGetNodes = 239,
+  HIP_API_ID_hipGraphGetRootNodes = 240,
+  HIP_API_ID_hipGraphKernelNodeGetParams = 241,
+  HIP_API_ID_hipGraphKernelNodeSetParams = 242,
+  HIP_API_ID_hipGraphMemcpyNodeGetParams = 243,
+  HIP_API_ID_hipGraphMemcpyNodeSetParams = 244,
+  HIP_API_ID_hipGraphMemsetNodeGetParams = 245,
+  HIP_API_ID_hipGraphMemsetNodeSetParams = 246,
+  HIP_API_ID_hipGraphicsGLRegisterBuffer = 247,
+  HIP_API_ID_hipGraphicsMapResources = 248,
+  HIP_API_ID_hipGraphicsResourceGetMappedPointer = 249,
+  HIP_API_ID_hipGraphicsUnmapResources = 250,
+  HIP_API_ID_hipGraphicsUnregisterResource = 251,
+  HIP_API_ID_hipGraphAddChildGraphNode = 252,
+  HIP_API_ID_hipGraphAddEventRecordNode = 253,
+  HIP_API_ID_hipGraphAddEventWaitNode = 254,
+  HIP_API_ID_hipGraphAddHostNode = 255,
+  HIP_API_ID_hipGraphAddMemcpyNode1D = 256,
+  HIP_API_ID_hipGraphAddMemcpyNodeFromSymbol = 257,
+  HIP_API_ID_hipGraphAddMemcpyNodeToSymbol = 258,
+  HIP_API_ID_hipGraphChildGraphNodeGetGraph = 259,
+  HIP_API_ID_hipGraphClone = 260,
+  HIP_API_ID_hipGraphDestroyNode = 261,
+  HIP_API_ID_hipGraphEventRecordNodeGetEvent = 262,
+  HIP_API_ID_hipGraphEventRecordNodeSetEvent = 263,
+  HIP_API_ID_hipGraphEventWaitNodeGetEvent = 264,
+  HIP_API_ID_hipGraphEventWaitNodeSetEvent = 265,
+  HIP_API_ID_hipGraphExecChildGraphNodeSetParams = 266,
+  HIP_API_ID_hipGraphExecEventRecordNodeSetEvent = 267,
+  HIP_API_ID_hipGraphExecEventWaitNodeSetEvent = 268,
+  HIP_API_ID_hipGraphExecHostNodeSetParams = 269,
+  HIP_API_ID_hipGraphExecMemcpyNodeSetParams = 270,
+  HIP_API_ID_hipGraphExecMemcpyNodeSetParams1D = 271,
+  HIP_API_ID_hipGraphExecMemcpyNodeSetParamsFromSymbol = 272,
+  HIP_API_ID_hipGraphExecMemcpyNodeSetParamsToSymbol = 273,
+  HIP_API_ID_hipGraphExecMemsetNodeSetParams = 274,
+  HIP_API_ID_hipGraphExecUpdate = 275,
+  HIP_API_ID_hipGraphGetEdges = 276,
+  HIP_API_ID_hipGraphHostNodeGetParams = 277,
+  HIP_API_ID_hipGraphHostNodeSetParams = 278,
+  HIP_API_ID_hipGraphInstantiateWithFlags = 279,
+  HIP_API_ID_hipGraphMemcpyNodeSetParams1D = 280,
+  HIP_API_ID_hipGraphMemcpyNodeSetParamsFromSymbol = 281,
+  HIP_API_ID_hipGraphMemcpyNodeSetParamsToSymbol = 282,
+  HIP_API_ID_hipGraphNodeFindInClone = 283,
+  HIP_API_ID_hipGraphNodeGetDependencies = 284,
+  HIP_API_ID_hipGraphNodeGetDependentNodes = 285,
+  HIP_API_ID_hipGraphNodeGetType = 286,
+  HIP_API_ID_hipGraphRemoveDependencies = 287,
+  HIP_API_ID_hipStreamGetCaptureInfo = 288,
+  HIP_API_ID_hipStreamGetCaptureInfo_v2 = 289,
+  HIP_API_ID_hipStreamIsCapturing = 290,
+  HIP_API_ID_hipStreamUpdateCaptureDependencies = 291,
+  HIP_API_ID_hipDrvPointerGetAttributes = 292,
+  HIP_API_ID_hipGraphicsGLRegisterImage = 293,
+  HIP_API_ID_hipGraphicsSubResourceGetMappedArray = 294,
+  HIP_API_ID_hipPointerGetAttribute = 295,
+  HIP_API_ID_RESERVED_296 = 296,
+  HIP_API_ID_hipThreadExchangeStreamCaptureMode = 297,
+  HIP_API_ID_hipDeviceGetUuid = 298,
+  HIP_API_ID_hipGetChannelDesc = 299,
+  HIP_API_ID_hipGraphKernelNodeGetAttribute = 300,
+  HIP_API_ID_hipGraphKernelNodeSetAttribute = 301,
+  HIP_API_ID_hipLaunchHostFunc = 302,
+  HIP_API_ID_hipDeviceGetDefaultMemPool = 303,
+  HIP_API_ID_hipDeviceGetMemPool = 304,
+  HIP_API_ID_hipDeviceSetMemPool = 305,
+  HIP_API_ID_hipFreeAsync = 306,
+  HIP_API_ID_hipMallocAsync = 307,
+  HIP_API_ID_hipMallocFromPoolAsync = 308,
+  HIP_API_ID_hipMemPoolCreate = 309,
+  HIP_API_ID_hipMemPoolDestroy = 310,
+  HIP_API_ID_hipMemPoolExportPointer = 311,
+  HIP_API_ID_hipMemPoolExportToShareableHandle = 312,
+  HIP_API_ID_hipMemPoolGetAccess = 313,
+  HIP_API_ID_hipMemPoolGetAttribute = 314,
+  HIP_API_ID_hipMemPoolImportFromShareableHandle = 315,
+  HIP_API_ID_hipMemPoolImportPointer = 316,
+  HIP_API_ID_hipMemPoolSetAccess = 317,
+  HIP_API_ID_hipMemPoolSetAttribute = 318,
+  HIP_API_ID_hipMemPoolTrimTo = 319,
+  HIP_API_ID_hipMemAddressFree = 320,
+  HIP_API_ID_hipMemAddressReserve = 321,
+  HIP_API_ID_hipMemCreate = 322,
+  HIP_API_ID_hipMemExportToShareableHandle = 323,
+  HIP_API_ID_hipMemGetAccess = 324,
+  HIP_API_ID_hipMemGetAllocationGranularity = 325,
+  HIP_API_ID_hipMemGetAllocationPropertiesFromHandle = 326,
+  HIP_API_ID_hipMemImportFromShareableHandle = 327,
+  HIP_API_ID_hipMemMap = 328,
+  HIP_API_ID_hipMemMapArrayAsync = 329,
+  HIP_API_ID_hipMemRelease = 330,
+  HIP_API_ID_hipMemRetainAllocationHandle = 331,
+  HIP_API_ID_hipMemSetAccess = 332,
+  HIP_API_ID_hipMemUnmap = 333,
+  HIP_API_ID_hipDeviceSetGraphMemAttribute = 334,
+  HIP_API_ID_hipDeviceGetGraphMemAttribute = 335,
+  HIP_API_ID_hipDeviceGraphMemTrim = 336,
+  HIP_API_ID_hipDeviceSetLimit = 337,
+  HIP_API_ID_hipTexRefSetArray = 338,
+  HIP_API_ID_hipTexRefSetFlags = 339,
+  HIP_API_ID_hipTexRefSetMipmapLevelBias = 340,
+  HIP_API_ID_hipDriverGetVersion = 341,
+  HIP_API_ID_hipGraphUpload = 342,
+  HIP_API_ID_hipRuntimeGetVersion = 343,
+  HIP_API_ID_hipUserObjectCreate = 344,
+  HIP_API_ID_hipUserObjectRelease = 345,
+  HIP_API_ID_hipUserObjectRetain = 346,
+  HIP_API_ID_hipGraphRetainUserObject = 347,
+  HIP_API_ID_hipGraphReleaseUserObject = 348,
+  HIP_API_ID_hipGraphDebugDotPrint = 349,
+  HIP_API_ID_hipGraphKernelNodeCopyAttributes = 350,
+  HIP_API_ID_hipGraphNodeGetEnabled = 351,
+  HIP_API_ID_hipGraphNodeSetEnabled = 352,
+  HIP_API_ID_hipPointerSetAttribute = 353,
+  HIP_API_ID_hipGraphAddMemAllocNode = 354,
+  HIP_API_ID_hipGraphAddMemFreeNode = 355,
+  HIP_API_ID_hipGraphMemAllocNodeGetParams = 356,
+  HIP_API_ID_hipGraphMemFreeNodeGetParams = 357,
+  HIP_API_ID_hipModuleLaunchCooperativeKernel = 358,
+  HIP_API_ID_hipModuleLaunchCooperativeKernelMultiDevice = 359,
+  HIP_API_ID_hipArray3DGetDescriptor = 360,
+  HIP_API_ID_hipArrayGetDescriptor = 361,
+  HIP_API_ID_hipArrayGetInfo = 362,
+  HIP_API_ID_hipStreamGetDevice = 363,
+  HIP_API_ID_hipExternalMemoryGetMappedMipmappedArray = 364,
+  HIP_API_ID_hipChooseDeviceR0600 = 365,
+  HIP_API_ID_hipDrvGraphAddMemcpyNode = 366,
+  HIP_API_ID_hipDrvGraphAddMemsetNode = 367,
+  HIP_API_ID_hipDrvGraphMemcpyNodeGetParams = 368,
+  HIP_API_ID_hipDrvGraphMemcpyNodeSetParams = 369,
+  HIP_API_ID_hipGetDevicePropertiesR0600 = 370,
+  HIP_API_ID_hipGraphAddExternalSemaphoresSignalNode = 371,
+  HIP_API_ID_hipGraphAddExternalSemaphoresWaitNode = 372,
+  HIP_API_ID_hipGraphExecExternalSemaphoresSignalNodeSetParams = 373,
+  HIP_API_ID_hipGraphExecExternalSemaphoresWaitNodeSetParams = 374,
+  HIP_API_ID_hipGraphExternalSemaphoresSignalNodeGetParams = 375,
+  HIP_API_ID_hipGraphExternalSemaphoresSignalNodeSetParams = 376,
+  HIP_API_ID_hipGraphExternalSemaphoresWaitNodeGetParams = 377,
+  HIP_API_ID_hipGraphExternalSemaphoresWaitNodeSetParams = 378,
+  HIP_API_ID_hipExtGetLastError = 379,
+  HIP_API_ID_hipGraphAddNode = 380,
+  HIP_API_ID_hipGetProcAddress = 381,
+  HIP_API_ID_hipGraphExecGetFlags = 382,
+  HIP_API_ID_hipGraphExecNodeSetParams = 383,
+  HIP_API_ID_hipGraphInstantiateWithParams = 384,
+  HIP_API_ID_hipGraphNodeSetParams = 385,
+  HIP_API_ID_hipDrvGraphAddMemFreeNode = 386,
+  HIP_API_ID_hipDrvGraphExecMemcpyNodeSetParams = 387,
+  HIP_API_ID_hipDrvGraphExecMemsetNodeSetParams = 388,
+  HIP_API_ID_hipTexRefGetArray = 389,
+  HIP_API_ID_hipTexRefGetBorderColor = 390,
+  HIP_API_ID_hipStreamBeginCaptureToGraph = 391,
+  HIP_API_ID_hipGetFuncBySymbol = 392,
+  HIP_API_ID_RESERVED_393 = 393,
+  HIP_API_ID_RESERVED_394 = 394,
+  HIP_API_ID_RESERVED_395 = 395,
+  HIP_API_ID_RESERVED_396 = 396,
+  HIP_API_ID_RESERVED_397 = 397,
+  HIP_API_ID_RESERVED_398 = 398,
+  HIP_API_ID_RESERVED_399 = 399,
+  HIP_API_ID_hipMemcpy2DArrayToArray = 400,
+  HIP_API_ID_hipMemcpyAtoA = 401,
+  HIP_API_ID_hipMemcpyAtoD = 402,
+  HIP_API_ID_hipMemcpyAtoHAsync = 403,
+  HIP_API_ID_hipMemcpyDtoA = 404,
+  HIP_API_ID_hipMemcpyHtoAAsync = 405,
+  HIP_API_ID_hipSetValidDevices = 406,
+  HIP_API_ID_RESERVED_407 = 407,
+  HIP_API_ID_hipStreamBatchMemOp = 408,
+  HIP_API_ID_hipGraphAddBatchMemOpNode = 409,
+  HIP_API_ID_hipGraphBatchMemOpNodeGetParams = 410,
+  HIP_API_ID_hipGraphBatchMemOpNodeSetParams = 411,
+  HIP_API_ID_hipGraphExecBatchMemOpNodeSetParams = 412,
+  HIP_API_ID_hipEventRecordWithFlags = 413,
+  HIP_API_ID_hipLinkAddData = 414,
+  HIP_API_ID_hipLinkAddFile = 415,
+  HIP_API_ID_hipLinkComplete = 416,
+  HIP_API_ID_hipLinkCreate = 417,
+  HIP_API_ID_hipLinkDestroy = 418,
+  HIP_API_ID_hipLaunchKernelExC = 419,
+  HIP_API_ID_hipDrvLaunchKernelEx = 420,
+  HIP_API_ID_hipModuleGetFunctionCount = 421,
+  HIP_API_ID_hipMemsetD2D16 = 422,
+  HIP_API_ID_hipMemsetD2D16Async = 423,
+  HIP_API_ID_hipMemsetD2D32 = 424,
+  HIP_API_ID_hipMemsetD2D32Async = 425,
+  HIP_API_ID_hipMemsetD2D8 = 426,
+  HIP_API_ID_hipMemsetD2D8Async = 427,
+  HIP_API_ID_hipStreamGetAttribute = 428,
+  HIP_API_ID_hipStreamSetAttribute = 429,
+  HIP_API_ID_hipModuleLoadFatBinary = 430,
+  HIP_API_ID_hipMemcpy3DBatchAsync = 431,
+  HIP_API_ID_hipMemcpy3DPeer = 432,
+  HIP_API_ID_hipMemcpy3DPeerAsync = 433,
+  HIP_API_ID_hipMemcpyBatchAsync = 434,
+  HIP_API_ID_hipGetDriverEntryPoint = 435,
+  HIP_API_ID_hipMemPrefetchAsync_v2 = 436,
+  HIP_API_ID_hipMemAdvise_v2 = 437,
+  HIP_API_ID_hipStreamGetId = 438,
+  HIP_API_ID_hipLibraryLoadData = 439,
+  HIP_API_ID_hipLibraryLoadFromFile = 440,
+  HIP_API_ID_hipLibraryUnload = 441,
+  HIP_API_ID_hipLibraryGetKernel = 442,
+  HIP_API_ID_hipLibraryGetKernelCount = 443,
+  HIP_API_ID_hipMemGetHandleForAddressRange = 444,
+  HIP_API_ID_LAST = 444,  // same value as the highest-numbered entry on the previous line
+  // Unversioned aliases: HIP_API_ID_CONCAT expands its second argument before pasting (presumably to an ...R0600 name from the included headers - confirm).
+  HIP_API_ID_hipChooseDevice = HIP_API_ID_CONCAT(HIP_API_ID_,hipChooseDevice),
+  HIP_API_ID_hipGetDeviceProperties = HIP_API_ID_CONCAT(HIP_API_ID_,hipGetDeviceProperties),
+  // Every API name below is given the value HIP_API_ID_NONE (0), so these entries cannot be told apart by ID.
+  HIP_API_ID_hipBindTexture = HIP_API_ID_NONE,
+  HIP_API_ID_hipBindTexture2D = HIP_API_ID_NONE,
+  HIP_API_ID_hipBindTextureToArray = HIP_API_ID_NONE,
+  HIP_API_ID_hipBindTextureToMipmappedArray = HIP_API_ID_NONE,
+  HIP_API_ID_hipCreateTextureObject = HIP_API_ID_NONE,
+  HIP_API_ID_hipDestroyTextureObject = HIP_API_ID_NONE,
+  HIP_API_ID_hipDeviceGetCount = HIP_API_ID_NONE,
+  HIP_API_ID_hipDeviceGetTexture1DLinearMaxWidth = HIP_API_ID_NONE,
+  HIP_API_ID_hipGetTextureAlignmentOffset = HIP_API_ID_NONE,
+  HIP_API_ID_hipGetTextureObjectResourceDesc = HIP_API_ID_NONE,
+  HIP_API_ID_hipGetTextureObjectResourceViewDesc = HIP_API_ID_NONE,
+  HIP_API_ID_hipGetTextureObjectTextureDesc = HIP_API_ID_NONE,
+  HIP_API_ID_hipGetTextureReference = HIP_API_ID_NONE,
+  HIP_API_ID_hipTexObjectCreate = HIP_API_ID_NONE,
+  HIP_API_ID_hipTexObjectDestroy = HIP_API_ID_NONE,
+  HIP_API_ID_hipTexObjectGetResourceDesc = HIP_API_ID_NONE,
+  HIP_API_ID_hipTexObjectGetResourceViewDesc = HIP_API_ID_NONE,
+  HIP_API_ID_hipTexObjectGetTextureDesc = HIP_API_ID_NONE,
+  HIP_API_ID_hipTexRefGetAddressMode = HIP_API_ID_NONE,
+  HIP_API_ID_hipTexRefGetFilterMode = HIP_API_ID_NONE,
+  HIP_API_ID_hipTexRefGetMipmapFilterMode = HIP_API_ID_NONE,
+  HIP_API_ID_hipTexRefSetAddressMode = HIP_API_ID_NONE,
+  HIP_API_ID_hipTexRefSetFilterMode = HIP_API_ID_NONE,
+  HIP_API_ID_hipTexRefSetMipmapFilterMode = HIP_API_ID_NONE,
+  HIP_API_ID_hipUnbindTexture = HIP_API_ID_NONE,
+};
+
+#undef HIP_API_ID_CONCAT_HELPER
+#undef HIP_API_ID_CONCAT
+
+// Return the HIP API string for a given callback ID
+static inline const char* hip_api_name(const uint32_t id) {
+  switch(id) {
+    case HIP_API_ID___hipPopCallConfiguration: return "__hipPopCallConfiguration";
+    case HIP_API_ID___hipPushCallConfiguration: return "__hipPushCallConfiguration";
+    case HIP_API_ID_hipArray3DCreate: return "hipArray3DCreate";
+    case HIP_API_ID_hipArray3DGetDescriptor: return "hipArray3DGetDescriptor";
+    case HIP_API_ID_hipArrayCreate: return "hipArrayCreate";
+    case HIP_API_ID_hipArrayDestroy: return "hipArrayDestroy";
+    case HIP_API_ID_hipArrayGetDescriptor: return "hipArrayGetDescriptor";
+    case HIP_API_ID_hipArrayGetInfo: return "hipArrayGetInfo";
+    case HIP_API_ID_hipChooseDeviceR0000: return "hipChooseDeviceR0000";
+    case HIP_API_ID_hipChooseDeviceR0600: return "hipChooseDeviceR0600";
+    case HIP_API_ID_hipConfigureCall: return "hipConfigureCall";
+    case HIP_API_ID_hipCreateSurfaceObject: return "hipCreateSurfaceObject";
+    case HIP_API_ID_hipCtxCreate: return "hipCtxCreate";
+    case HIP_API_ID_hipCtxDestroy: return "hipCtxDestroy";
+    case HIP_API_ID_hipCtxDisablePeerAccess: return "hipCtxDisablePeerAccess";
+    case HIP_API_ID_hipCtxEnablePeerAccess: return "hipCtxEnablePeerAccess";
+    case HIP_API_ID_hipCtxGetApiVersion: return "hipCtxGetApiVersion";
+    case HIP_API_ID_hipCtxGetCacheConfig: return "hipCtxGetCacheConfig";
+    case HIP_API_ID_hipCtxGetCurrent: return "hipCtxGetCurrent";
+    case HIP_API_ID_hipCtxGetDevice: return "hipCtxGetDevice";
+    case HIP_API_ID_hipCtxGetFlags: return "hipCtxGetFlags";
+    case HIP_API_ID_hipCtxGetSharedMemConfig: return "hipCtxGetSharedMemConfig";
+    case HIP_API_ID_hipCtxPopCurrent: return "hipCtxPopCurrent";
+    case HIP_API_ID_hipCtxPushCurrent: return "hipCtxPushCurrent";
+    case HIP_API_ID_hipCtxSetCacheConfig: return "hipCtxSetCacheConfig";
+    case HIP_API_ID_hipCtxSetCurrent: return "hipCtxSetCurrent";
+    case HIP_API_ID_hipCtxSetSharedMemConfig: return "hipCtxSetSharedMemConfig";
+    case HIP_API_ID_hipCtxSynchronize: return "hipCtxSynchronize";
+    case HIP_API_ID_hipDestroyExternalMemory: return "hipDestroyExternalMemory";
+    case HIP_API_ID_hipDestroyExternalSemaphore: return "hipDestroyExternalSemaphore";
+    case HIP_API_ID_hipDestroySurfaceObject: return "hipDestroySurfaceObject";
+    case HIP_API_ID_hipDeviceCanAccessPeer: return "hipDeviceCanAccessPeer";
+    case HIP_API_ID_hipDeviceComputeCapability: return "hipDeviceComputeCapability";
+    case HIP_API_ID_hipDeviceDisablePeerAccess: return "hipDeviceDisablePeerAccess";
+    case HIP_API_ID_hipDeviceEnablePeerAccess: return "hipDeviceEnablePeerAccess";
+    case HIP_API_ID_hipDeviceGet: return "hipDeviceGet";
+    case HIP_API_ID_hipDeviceGetAttribute: return "hipDeviceGetAttribute";
+    case HIP_API_ID_hipDeviceGetByPCIBusId: return "hipDeviceGetByPCIBusId";
+    case HIP_API_ID_hipDeviceGetCacheConfig: return "hipDeviceGetCacheConfig";
+    case HIP_API_ID_hipDeviceGetDefaultMemPool: return "hipDeviceGetDefaultMemPool";
+    case HIP_API_ID_hipDeviceGetGraphMemAttribute: return "hipDeviceGetGraphMemAttribute";
+    case HIP_API_ID_hipDeviceGetLimit: return "hipDeviceGetLimit";
+    case HIP_API_ID_hipDeviceGetMemPool: return "hipDeviceGetMemPool";
+    case HIP_API_ID_hipDeviceGetName: return "hipDeviceGetName";
+    case HIP_API_ID_hipDeviceGetP2PAttribute: return "hipDeviceGetP2PAttribute";
+    case HIP_API_ID_hipDeviceGetPCIBusId: return "hipDeviceGetPCIBusId";
+    case HIP_API_ID_hipDeviceGetSharedMemConfig: return "hipDeviceGetSharedMemConfig";
+    case HIP_API_ID_hipDeviceGetStreamPriorityRange: return "hipDeviceGetStreamPriorityRange";
+    case HIP_API_ID_hipDeviceGetUuid: return "hipDeviceGetUuid";
+    case HIP_API_ID_hipDeviceGraphMemTrim: return "hipDeviceGraphMemTrim";
+    case HIP_API_ID_hipDevicePrimaryCtxGetState: return "hipDevicePrimaryCtxGetState";
+    case HIP_API_ID_hipDevicePrimaryCtxRelease: return "hipDevicePrimaryCtxRelease";
+    case HIP_API_ID_hipDevicePrimaryCtxReset: return "hipDevicePrimaryCtxReset";
+    case HIP_API_ID_hipDevicePrimaryCtxRetain: return "hipDevicePrimaryCtxRetain";
+    case HIP_API_ID_hipDevicePrimaryCtxSetFlags: return "hipDevicePrimaryCtxSetFlags";
+    case HIP_API_ID_hipDeviceReset: return "hipDeviceReset";
+    case HIP_API_ID_hipDeviceSetCacheConfig: return "hipDeviceSetCacheConfig";
+    case HIP_API_ID_hipDeviceSetGraphMemAttribute: return "hipDeviceSetGraphMemAttribute";
+    case HIP_API_ID_hipDeviceSetLimit: return "hipDeviceSetLimit";
+    case HIP_API_ID_hipDeviceSetMemPool: return "hipDeviceSetMemPool";
+    case HIP_API_ID_hipDeviceSetSharedMemConfig: return "hipDeviceSetSharedMemConfig";
+    case HIP_API_ID_hipDeviceSynchronize: return "hipDeviceSynchronize";
+    case HIP_API_ID_hipDeviceTotalMem: return "hipDeviceTotalMem";
+    case HIP_API_ID_hipDriverGetVersion: return "hipDriverGetVersion";
+    case HIP_API_ID_hipDrvGraphAddMemFreeNode: return "hipDrvGraphAddMemFreeNode";
+    case HIP_API_ID_hipDrvGraphAddMemcpyNode: return "hipDrvGraphAddMemcpyNode";
+    case HIP_API_ID_hipDrvGraphAddMemsetNode: return "hipDrvGraphAddMemsetNode";
+    case HIP_API_ID_hipDrvGraphExecMemcpyNodeSetParams: return "hipDrvGraphExecMemcpyNodeSetParams";
+    case HIP_API_ID_hipDrvGraphExecMemsetNodeSetParams: return "hipDrvGraphExecMemsetNodeSetParams";
+    case HIP_API_ID_hipDrvGraphMemcpyNodeGetParams: return "hipDrvGraphMemcpyNodeGetParams";
+    case HIP_API_ID_hipDrvGraphMemcpyNodeSetParams: return "hipDrvGraphMemcpyNodeSetParams";
+    case HIP_API_ID_hipDrvLaunchKernelEx: return "hipDrvLaunchKernelEx";
+    case HIP_API_ID_hipDrvMemcpy2DUnaligned: return "hipDrvMemcpy2DUnaligned";
+    case HIP_API_ID_hipDrvMemcpy3D: return "hipDrvMemcpy3D";
+    case HIP_API_ID_hipDrvMemcpy3DAsync: return "hipDrvMemcpy3DAsync";
+    case HIP_API_ID_hipDrvPointerGetAttributes: return "hipDrvPointerGetAttributes";
+    case HIP_API_ID_hipEventCreate: return "hipEventCreate";
+    case HIP_API_ID_hipEventCreateWithFlags: return "hipEventCreateWithFlags";
+    case HIP_API_ID_hipEventDestroy: return "hipEventDestroy";
+    case HIP_API_ID_hipEventElapsedTime: return "hipEventElapsedTime";
+    case HIP_API_ID_hipEventQuery: return "hipEventQuery";
+    case HIP_API_ID_hipEventRecord: return "hipEventRecord";
+    case HIP_API_ID_hipEventRecordWithFlags: return "hipEventRecordWithFlags";
+    case HIP_API_ID_hipEventSynchronize: return "hipEventSynchronize";
+    case HIP_API_ID_hipExtGetLastError: return "hipExtGetLastError";
+    case HIP_API_ID_hipExtGetLinkTypeAndHopCount: return "hipExtGetLinkTypeAndHopCount";
+    case HIP_API_ID_hipExtLaunchKernel: return "hipExtLaunchKernel";
+    case HIP_API_ID_hipExtLaunchMultiKernelMultiDevice: return "hipExtLaunchMultiKernelMultiDevice";
+    case HIP_API_ID_hipExtMallocWithFlags: return "hipExtMallocWithFlags";
+    case HIP_API_ID_hipExtModuleLaunchKernel: return "hipExtModuleLaunchKernel";
+    case HIP_API_ID_hipExtStreamCreateWithCUMask: return "hipExtStreamCreateWithCUMask";
+    case HIP_API_ID_hipExtStreamGetCUMask: return "hipExtStreamGetCUMask";
+    case HIP_API_ID_hipExternalMemoryGetMappedBuffer: return "hipExternalMemoryGetMappedBuffer";
+    case HIP_API_ID_hipExternalMemoryGetMappedMipmappedArray: return "hipExternalMemoryGetMappedMipmappedArray";
+    case HIP_API_ID_hipFree: return "hipFree";
+    case HIP_API_ID_hipFreeArray: return "hipFreeArray";
+    case HIP_API_ID_hipFreeAsync: return "hipFreeAsync";
+    case HIP_API_ID_hipFreeHost: return "hipFreeHost";
+    case HIP_API_ID_hipFreeMipmappedArray: return "hipFreeMipmappedArray";
+    case HIP_API_ID_hipFuncGetAttribute: return "hipFuncGetAttribute";
+    case HIP_API_ID_hipFuncGetAttributes: return "hipFuncGetAttributes";
+    case HIP_API_ID_hipFuncSetAttribute: return "hipFuncSetAttribute";
+    case HIP_API_ID_hipFuncSetCacheConfig: return "hipFuncSetCacheConfig";
+    case HIP_API_ID_hipFuncSetSharedMemConfig: return "hipFuncSetSharedMemConfig";
+    case HIP_API_ID_hipGLGetDevices: return "hipGLGetDevices";
+    case HIP_API_ID_hipGetChannelDesc: return "hipGetChannelDesc";
+    case HIP_API_ID_hipGetDevice: return "hipGetDevice";
+    case HIP_API_ID_hipGetDeviceCount: return "hipGetDeviceCount";
+    case HIP_API_ID_hipGetDeviceFlags: return "hipGetDeviceFlags";
+    case HIP_API_ID_hipGetDevicePropertiesR0000: return "hipGetDevicePropertiesR0000";
+    case HIP_API_ID_hipGetDevicePropertiesR0600: return "hipGetDevicePropertiesR0600";
+    case HIP_API_ID_hipGetDriverEntryPoint: return "hipGetDriverEntryPoint";
+    case HIP_API_ID_hipGetFuncBySymbol: return "hipGetFuncBySymbol";
+    case HIP_API_ID_hipGetLastError: return "hipGetLastError";
+    case HIP_API_ID_hipGetMipmappedArrayLevel: return "hipGetMipmappedArrayLevel";
+    case HIP_API_ID_hipGetProcAddress: return "hipGetProcAddress";
+    case HIP_API_ID_hipGetSymbolAddress: return "hipGetSymbolAddress";
+    case HIP_API_ID_hipGetSymbolSize: return "hipGetSymbolSize";
+    case HIP_API_ID_hipGraphAddBatchMemOpNode: return "hipGraphAddBatchMemOpNode";
+    case HIP_API_ID_hipGraphAddChildGraphNode: return "hipGraphAddChildGraphNode";
+    case HIP_API_ID_hipGraphAddDependencies: return "hipGraphAddDependencies";
+    case HIP_API_ID_hipGraphAddEmptyNode: return "hipGraphAddEmptyNode";
+    case HIP_API_ID_hipGraphAddEventRecordNode: return "hipGraphAddEventRecordNode";
+    case HIP_API_ID_hipGraphAddEventWaitNode: return "hipGraphAddEventWaitNode";
+    case HIP_API_ID_hipGraphAddExternalSemaphoresSignalNode: return "hipGraphAddExternalSemaphoresSignalNode";
+    case HIP_API_ID_hipGraphAddExternalSemaphoresWaitNode: return "hipGraphAddExternalSemaphoresWaitNode";
+    case HIP_API_ID_hipGraphAddHostNode: return "hipGraphAddHostNode";
+    case HIP_API_ID_hipGraphAddKernelNode: return "hipGraphAddKernelNode";
+    case HIP_API_ID_hipGraphAddMemAllocNode: return "hipGraphAddMemAllocNode";
+    case HIP_API_ID_hipGraphAddMemFreeNode: return "hipGraphAddMemFreeNode";
+    case HIP_API_ID_hipGraphAddMemcpyNode: return "hipGraphAddMemcpyNode";
+    case HIP_API_ID_hipGraphAddMemcpyNode1D: return "hipGraphAddMemcpyNode1D";
+    case HIP_API_ID_hipGraphAddMemcpyNodeFromSymbol: return "hipGraphAddMemcpyNodeFromSymbol";
+    case HIP_API_ID_hipGraphAddMemcpyNodeToSymbol: return "hipGraphAddMemcpyNodeToSymbol";
+    case HIP_API_ID_hipGraphAddMemsetNode: return "hipGraphAddMemsetNode";
+    case HIP_API_ID_hipGraphAddNode: return "hipGraphAddNode";
+    case HIP_API_ID_hipGraphBatchMemOpNodeGetParams: return "hipGraphBatchMemOpNodeGetParams";
+    case HIP_API_ID_hipGraphBatchMemOpNodeSetParams: return "hipGraphBatchMemOpNodeSetParams";
+    case HIP_API_ID_hipGraphChildGraphNodeGetGraph: return "hipGraphChildGraphNodeGetGraph";
+    case HIP_API_ID_hipGraphClone: return "hipGraphClone";
+    case HIP_API_ID_hipGraphCreate: return "hipGraphCreate";
+    case HIP_API_ID_hipGraphDebugDotPrint: return "hipGraphDebugDotPrint";
+    case HIP_API_ID_hipGraphDestroy: return "hipGraphDestroy";
+    case HIP_API_ID_hipGraphDestroyNode: return "hipGraphDestroyNode";
+    case HIP_API_ID_hipGraphEventRecordNodeGetEvent: return "hipGraphEventRecordNodeGetEvent";
+    case HIP_API_ID_hipGraphEventRecordNodeSetEvent: return "hipGraphEventRecordNodeSetEvent";
+    case HIP_API_ID_hipGraphEventWaitNodeGetEvent: return "hipGraphEventWaitNodeGetEvent";
+    case HIP_API_ID_hipGraphEventWaitNodeSetEvent: return "hipGraphEventWaitNodeSetEvent";
+    case HIP_API_ID_hipGraphExecBatchMemOpNodeSetParams: return "hipGraphExecBatchMemOpNodeSetParams";
+    case HIP_API_ID_hipGraphExecChildGraphNodeSetParams: return "hipGraphExecChildGraphNodeSetParams";
+    case HIP_API_ID_hipGraphExecDestroy: return "hipGraphExecDestroy";
+    case HIP_API_ID_hipGraphExecEventRecordNodeSetEvent: return "hipGraphExecEventRecordNodeSetEvent";
+    case HIP_API_ID_hipGraphExecEventWaitNodeSetEvent: return "hipGraphExecEventWaitNodeSetEvent";
+    case HIP_API_ID_hipGraphExecExternalSemaphoresSignalNodeSetParams: return "hipGraphExecExternalSemaphoresSignalNodeSetParams";
+    case HIP_API_ID_hipGraphExecExternalSemaphoresWaitNodeSetParams: return "hipGraphExecExternalSemaphoresWaitNodeSetParams";
+    case HIP_API_ID_hipGraphExecGetFlags: return "hipGraphExecGetFlags";
+    case HIP_API_ID_hipGraphExecHostNodeSetParams: return "hipGraphExecHostNodeSetParams";
+    case HIP_API_ID_hipGraphExecKernelNodeSetParams: return "hipGraphExecKernelNodeSetParams";
+    case HIP_API_ID_hipGraphExecMemcpyNodeSetParams: return "hipGraphExecMemcpyNodeSetParams";
+    case HIP_API_ID_hipGraphExecMemcpyNodeSetParams1D: return "hipGraphExecMemcpyNodeSetParams1D";
+    case HIP_API_ID_hipGraphExecMemcpyNodeSetParamsFromSymbol: return "hipGraphExecMemcpyNodeSetParamsFromSymbol";
+    case HIP_API_ID_hipGraphExecMemcpyNodeSetParamsToSymbol: return "hipGraphExecMemcpyNodeSetParamsToSymbol";
+    case HIP_API_ID_hipGraphExecMemsetNodeSetParams: return "hipGraphExecMemsetNodeSetParams";
+    case HIP_API_ID_hipGraphExecNodeSetParams: return "hipGraphExecNodeSetParams";
+    case HIP_API_ID_hipGraphExecUpdate: return "hipGraphExecUpdate";
+    case HIP_API_ID_hipGraphExternalSemaphoresSignalNodeGetParams: return "hipGraphExternalSemaphoresSignalNodeGetParams";
+    case HIP_API_ID_hipGraphExternalSemaphoresSignalNodeSetParams: return "hipGraphExternalSemaphoresSignalNodeSetParams";
+    case HIP_API_ID_hipGraphExternalSemaphoresWaitNodeGetParams: return "hipGraphExternalSemaphoresWaitNodeGetParams";
+    case HIP_API_ID_hipGraphExternalSemaphoresWaitNodeSetParams: return "hipGraphExternalSemaphoresWaitNodeSetParams";
+    case HIP_API_ID_hipGraphGetEdges: return "hipGraphGetEdges";
+    case HIP_API_ID_hipGraphGetNodes: return "hipGraphGetNodes";
+    case HIP_API_ID_hipGraphGetRootNodes: return "hipGraphGetRootNodes";
+    case HIP_API_ID_hipGraphHostNodeGetParams: return "hipGraphHostNodeGetParams";
+    case HIP_API_ID_hipGraphHostNodeSetParams: return "hipGraphHostNodeSetParams";
+    case HIP_API_ID_hipGraphInstantiate: return "hipGraphInstantiate";
+    case HIP_API_ID_hipGraphInstantiateWithFlags: return "hipGraphInstantiateWithFlags";
+    case HIP_API_ID_hipGraphInstantiateWithParams: return "hipGraphInstantiateWithParams";
+    case HIP_API_ID_hipGraphKernelNodeCopyAttributes: return "hipGraphKernelNodeCopyAttributes";
+    case HIP_API_ID_hipGraphKernelNodeGetAttribute: return "hipGraphKernelNodeGetAttribute";
+    case HIP_API_ID_hipGraphKernelNodeGetParams: return "hipGraphKernelNodeGetParams";
+    case HIP_API_ID_hipGraphKernelNodeSetAttribute: return "hipGraphKernelNodeSetAttribute";
+    case HIP_API_ID_hipGraphKernelNodeSetParams: return "hipGraphKernelNodeSetParams";
+    case HIP_API_ID_hipGraphLaunch: return "hipGraphLaunch";
+    case HIP_API_ID_hipGraphMemAllocNodeGetParams: return "hipGraphMemAllocNodeGetParams";
+    case HIP_API_ID_hipGraphMemFreeNodeGetParams: return "hipGraphMemFreeNodeGetParams";
+    case HIP_API_ID_hipGraphMemcpyNodeGetParams: return "hipGraphMemcpyNodeGetParams";
+    case HIP_API_ID_hipGraphMemcpyNodeSetParams: return "hipGraphMemcpyNodeSetParams";
+    case HIP_API_ID_hipGraphMemcpyNodeSetParams1D: return "hipGraphMemcpyNodeSetParams1D";
+    case HIP_API_ID_hipGraphMemcpyNodeSetParamsFromSymbol: return "hipGraphMemcpyNodeSetParamsFromSymbol";
+    case HIP_API_ID_hipGraphMemcpyNodeSetParamsToSymbol: return "hipGraphMemcpyNodeSetParamsToSymbol";
+    case HIP_API_ID_hipGraphMemsetNodeGetParams: return "hipGraphMemsetNodeGetParams";
+    case HIP_API_ID_hipGraphMemsetNodeSetParams: return "hipGraphMemsetNodeSetParams";
+    case HIP_API_ID_hipGraphNodeFindInClone: return "hipGraphNodeFindInClone";
+    case HIP_API_ID_hipGraphNodeGetDependencies: return "hipGraphNodeGetDependencies";
+    case HIP_API_ID_hipGraphNodeGetDependentNodes: return "hipGraphNodeGetDependentNodes";
+    case HIP_API_ID_hipGraphNodeGetEnabled: return "hipGraphNodeGetEnabled";
+    case HIP_API_ID_hipGraphNodeGetType: return "hipGraphNodeGetType";
+    case HIP_API_ID_hipGraphNodeSetEnabled: return "hipGraphNodeSetEnabled";
+    case HIP_API_ID_hipGraphNodeSetParams: return "hipGraphNodeSetParams";
+    case HIP_API_ID_hipGraphReleaseUserObject: return "hipGraphReleaseUserObject";
+    case HIP_API_ID_hipGraphRemoveDependencies: return "hipGraphRemoveDependencies";
+    case HIP_API_ID_hipGraphRetainUserObject: return "hipGraphRetainUserObject";
+    case HIP_API_ID_hipGraphUpload: return "hipGraphUpload";
+    case HIP_API_ID_hipGraphicsGLRegisterBuffer: return "hipGraphicsGLRegisterBuffer";
+    case HIP_API_ID_hipGraphicsGLRegisterImage: return "hipGraphicsGLRegisterImage";
+    case HIP_API_ID_hipGraphicsMapResources: return "hipGraphicsMapResources";
+    case HIP_API_ID_hipGraphicsResourceGetMappedPointer: return "hipGraphicsResourceGetMappedPointer";
+    case HIP_API_ID_hipGraphicsSubResourceGetMappedArray: return "hipGraphicsSubResourceGetMappedArray";
+    case HIP_API_ID_hipGraphicsUnmapResources: return "hipGraphicsUnmapResources";
+    case HIP_API_ID_hipGraphicsUnregisterResource: return "hipGraphicsUnregisterResource";
+    case HIP_API_ID_hipHccModuleLaunchKernel: return "hipHccModuleLaunchKernel";
+    case HIP_API_ID_hipHostAlloc: return "hipHostAlloc";
+    case HIP_API_ID_hipHostFree: return "hipHostFree";
+    case HIP_API_ID_hipHostGetDevicePointer: return "hipHostGetDevicePointer";
+    case HIP_API_ID_hipHostGetFlags: return "hipHostGetFlags";
+    case HIP_API_ID_hipHostMalloc: return "hipHostMalloc";
+    case HIP_API_ID_hipHostRegister: return "hipHostRegister";
+    case HIP_API_ID_hipHostUnregister: return "hipHostUnregister";
+    case HIP_API_ID_hipImportExternalMemory: return "hipImportExternalMemory";
+    case HIP_API_ID_hipImportExternalSemaphore: return "hipImportExternalSemaphore";
+    case HIP_API_ID_hipInit: return "hipInit";
+    case HIP_API_ID_hipIpcCloseMemHandle: return "hipIpcCloseMemHandle";
+    case HIP_API_ID_hipIpcGetEventHandle: return "hipIpcGetEventHandle";
+    case HIP_API_ID_hipIpcGetMemHandle: return "hipIpcGetMemHandle";
+    case HIP_API_ID_hipIpcOpenEventHandle: return "hipIpcOpenEventHandle";
+    case HIP_API_ID_hipIpcOpenMemHandle: return "hipIpcOpenMemHandle";
+    case HIP_API_ID_hipLaunchByPtr: return "hipLaunchByPtr";
+    case HIP_API_ID_hipLaunchCooperativeKernel: return "hipLaunchCooperativeKernel";
+    case HIP_API_ID_hipLaunchCooperativeKernelMultiDevice: return "hipLaunchCooperativeKernelMultiDevice";
+    case HIP_API_ID_hipLaunchHostFunc: return "hipLaunchHostFunc";
+    case HIP_API_ID_hipLaunchKernel: return "hipLaunchKernel";
+    case HIP_API_ID_hipLaunchKernelExC: return "hipLaunchKernelExC";
+    case HIP_API_ID_hipLibraryGetKernel: return "hipLibraryGetKernel";
+    case HIP_API_ID_hipLibraryGetKernelCount: return "hipLibraryGetKernelCount";
+    case HIP_API_ID_hipLibraryLoadData: return "hipLibraryLoadData";
+    case HIP_API_ID_hipLibraryLoadFromFile: return "hipLibraryLoadFromFile";
+    case HIP_API_ID_hipLibraryUnload: return "hipLibraryUnload";
+    case HIP_API_ID_hipLinkAddData: return "hipLinkAddData";
+    case HIP_API_ID_hipLinkAddFile: return "hipLinkAddFile";
+    case HIP_API_ID_hipLinkComplete: return "hipLinkComplete";
+    case HIP_API_ID_hipLinkCreate: return "hipLinkCreate";
+    case HIP_API_ID_hipLinkDestroy: return "hipLinkDestroy";
+    case HIP_API_ID_hipMalloc: return "hipMalloc";
+    case HIP_API_ID_hipMalloc3D: return "hipMalloc3D";
+    case HIP_API_ID_hipMalloc3DArray: return "hipMalloc3DArray";
+    case HIP_API_ID_hipMallocArray: return "hipMallocArray";
+    case HIP_API_ID_hipMallocAsync: return "hipMallocAsync";
+    case HIP_API_ID_hipMallocFromPoolAsync: return "hipMallocFromPoolAsync";
+    case HIP_API_ID_hipMallocHost: return "hipMallocHost";
+    case HIP_API_ID_hipMallocManaged: return "hipMallocManaged";
+    case HIP_API_ID_hipMallocMipmappedArray: return "hipMallocMipmappedArray";
+    case HIP_API_ID_hipMallocPitch: return "hipMallocPitch";
+    case HIP_API_ID_hipMemAddressFree: return "hipMemAddressFree";
+    case HIP_API_ID_hipMemAddressReserve: return "hipMemAddressReserve";
+    case HIP_API_ID_hipMemAdvise: return "hipMemAdvise";
+    case HIP_API_ID_hipMemAdvise_v2: return "hipMemAdvise_v2";
+    case HIP_API_ID_hipMemAllocHost: return "hipMemAllocHost";
+    case HIP_API_ID_hipMemAllocPitch: return "hipMemAllocPitch";
+    case HIP_API_ID_hipMemCreate: return "hipMemCreate";
+    case HIP_API_ID_hipMemExportToShareableHandle: return "hipMemExportToShareableHandle";
+    case HIP_API_ID_hipMemGetAccess: return "hipMemGetAccess";
+    case HIP_API_ID_hipMemGetAddressRange: return "hipMemGetAddressRange";
+    case HIP_API_ID_hipMemGetAllocationGranularity: return "hipMemGetAllocationGranularity";
+    case HIP_API_ID_hipMemGetAllocationPropertiesFromHandle: return "hipMemGetAllocationPropertiesFromHandle";
+    case HIP_API_ID_hipMemGetHandleForAddressRange: return "hipMemGetHandleForAddressRange";
+    case HIP_API_ID_hipMemGetInfo: return "hipMemGetInfo";
+    case HIP_API_ID_hipMemImportFromShareableHandle: return "hipMemImportFromShareableHandle";
+    case HIP_API_ID_hipMemMap: return "hipMemMap";
+    case HIP_API_ID_hipMemMapArrayAsync: return "hipMemMapArrayAsync";
+    case HIP_API_ID_hipMemPoolCreate: return "hipMemPoolCreate";
+    case HIP_API_ID_hipMemPoolDestroy: return "hipMemPoolDestroy";
+    case HIP_API_ID_hipMemPoolExportPointer: return "hipMemPoolExportPointer";
+    case HIP_API_ID_hipMemPoolExportToShareableHandle: return "hipMemPoolExportToShareableHandle";
+    case HIP_API_ID_hipMemPoolGetAccess: return "hipMemPoolGetAccess";
+    case HIP_API_ID_hipMemPoolGetAttribute: return "hipMemPoolGetAttribute";
+    case HIP_API_ID_hipMemPoolImportFromShareableHandle: return "hipMemPoolImportFromShareableHandle";
+    case HIP_API_ID_hipMemPoolImportPointer: return "hipMemPoolImportPointer";
+    case HIP_API_ID_hipMemPoolSetAccess: return "hipMemPoolSetAccess";
+    case HIP_API_ID_hipMemPoolSetAttribute: return "hipMemPoolSetAttribute";
+    case HIP_API_ID_hipMemPoolTrimTo: return "hipMemPoolTrimTo";
+    case HIP_API_ID_hipMemPrefetchAsync: return "hipMemPrefetchAsync";
+    case HIP_API_ID_hipMemPrefetchAsync_v2: return "hipMemPrefetchAsync_v2";
+    case HIP_API_ID_hipMemPtrGetInfo: return "hipMemPtrGetInfo";
+    case HIP_API_ID_hipMemRangeGetAttribute: return "hipMemRangeGetAttribute";
+    case HIP_API_ID_hipMemRangeGetAttributes: return "hipMemRangeGetAttributes";
+    case HIP_API_ID_hipMemRelease: return "hipMemRelease";
+    case HIP_API_ID_hipMemRetainAllocationHandle: return "hipMemRetainAllocationHandle";
+    case HIP_API_ID_hipMemSetAccess: return "hipMemSetAccess";
+    case HIP_API_ID_hipMemUnmap: return "hipMemUnmap";
+    case HIP_API_ID_hipMemcpy: return "hipMemcpy";
+    case HIP_API_ID_hipMemcpy2D: return "hipMemcpy2D";
+    case HIP_API_ID_hipMemcpy2DArrayToArray: return "hipMemcpy2DArrayToArray";
+    case HIP_API_ID_hipMemcpy2DAsync: return "hipMemcpy2DAsync";
+    case HIP_API_ID_hipMemcpy2DFromArray: return "hipMemcpy2DFromArray";
+    case HIP_API_ID_hipMemcpy2DFromArrayAsync: return "hipMemcpy2DFromArrayAsync";
+    case HIP_API_ID_hipMemcpy2DToArray: return "hipMemcpy2DToArray";
+    case HIP_API_ID_hipMemcpy2DToArrayAsync: return "hipMemcpy2DToArrayAsync";
+    case HIP_API_ID_hipMemcpy3D: return "hipMemcpy3D";
+    case HIP_API_ID_hipMemcpy3DAsync: return "hipMemcpy3DAsync";
+    case HIP_API_ID_hipMemcpy3DBatchAsync: return "hipMemcpy3DBatchAsync";
+    case HIP_API_ID_hipMemcpy3DPeer: return "hipMemcpy3DPeer";
+    case HIP_API_ID_hipMemcpy3DPeerAsync: return "hipMemcpy3DPeerAsync";
+    case HIP_API_ID_hipMemcpyAsync: return "hipMemcpyAsync";
+    case HIP_API_ID_hipMemcpyAtoA: return "hipMemcpyAtoA";
+    case HIP_API_ID_hipMemcpyAtoD: return "hipMemcpyAtoD";
+    case HIP_API_ID_hipMemcpyAtoH: return "hipMemcpyAtoH";
+    case HIP_API_ID_hipMemcpyAtoHAsync: return "hipMemcpyAtoHAsync";
+    case HIP_API_ID_hipMemcpyBatchAsync: return "hipMemcpyBatchAsync";
+    case HIP_API_ID_hipMemcpyDtoA: return "hipMemcpyDtoA";
+    case HIP_API_ID_hipMemcpyDtoD: return "hipMemcpyDtoD";
+    case HIP_API_ID_hipMemcpyDtoDAsync: return "hipMemcpyDtoDAsync";
+    case HIP_API_ID_hipMemcpyDtoH: return "hipMemcpyDtoH";
+    case HIP_API_ID_hipMemcpyDtoHAsync: return "hipMemcpyDtoHAsync";
+    case HIP_API_ID_hipMemcpyFromArray: return "hipMemcpyFromArray";
+    case HIP_API_ID_hipMemcpyFromSymbol: return "hipMemcpyFromSymbol";
+    case HIP_API_ID_hipMemcpyFromSymbolAsync: return "hipMemcpyFromSymbolAsync";
+    case HIP_API_ID_hipMemcpyHtoA: return "hipMemcpyHtoA";
+    case HIP_API_ID_hipMemcpyHtoAAsync: return "hipMemcpyHtoAAsync";
+    case HIP_API_ID_hipMemcpyHtoD: return "hipMemcpyHtoD";
+    case HIP_API_ID_hipMemcpyHtoDAsync: return "hipMemcpyHtoDAsync";
+    case HIP_API_ID_hipMemcpyParam2D: return "hipMemcpyParam2D";
+    case HIP_API_ID_hipMemcpyParam2DAsync: return "hipMemcpyParam2DAsync";
+    case HIP_API_ID_hipMemcpyPeer: return "hipMemcpyPeer";
+    case HIP_API_ID_hipMemcpyPeerAsync: return "hipMemcpyPeerAsync";
+    case HIP_API_ID_hipMemcpyToArray: return "hipMemcpyToArray";
+    case HIP_API_ID_hipMemcpyToSymbol: return "hipMemcpyToSymbol";
+    case HIP_API_ID_hipMemcpyToSymbolAsync: return "hipMemcpyToSymbolAsync";
+    case HIP_API_ID_hipMemcpyWithStream: return "hipMemcpyWithStream";
+    case HIP_API_ID_hipMemset: return "hipMemset";
+    case HIP_API_ID_hipMemset2D: return "hipMemset2D";
+    case HIP_API_ID_hipMemset2DAsync: return "hipMemset2DAsync";
+    case HIP_API_ID_hipMemset3D: return "hipMemset3D";
+    case HIP_API_ID_hipMemset3DAsync: return "hipMemset3DAsync";
+    case HIP_API_ID_hipMemsetAsync: return "hipMemsetAsync";
+    case HIP_API_ID_hipMemsetD16: return "hipMemsetD16";
+    case HIP_API_ID_hipMemsetD16Async: return "hipMemsetD16Async";
+    case HIP_API_ID_hipMemsetD2D16: return "hipMemsetD2D16";
+    case HIP_API_ID_hipMemsetD2D16Async: return "hipMemsetD2D16Async";
+    case HIP_API_ID_hipMemsetD2D32: return "hipMemsetD2D32";
+    case HIP_API_ID_hipMemsetD2D32Async: return "hipMemsetD2D32Async";
+    case HIP_API_ID_hipMemsetD2D8: return "hipMemsetD2D8";
+    case HIP_API_ID_hipMemsetD2D8Async: return "hipMemsetD2D8Async";
+    case HIP_API_ID_hipMemsetD32: return "hipMemsetD32";
+    case HIP_API_ID_hipMemsetD32Async: return "hipMemsetD32Async";
+    case HIP_API_ID_hipMemsetD8: return "hipMemsetD8";
+    case HIP_API_ID_hipMemsetD8Async: return "hipMemsetD8Async";
+    case HIP_API_ID_hipMipmappedArrayCreate: return "hipMipmappedArrayCreate";
+    case HIP_API_ID_hipMipmappedArrayDestroy: return "hipMipmappedArrayDestroy";
+    case HIP_API_ID_hipMipmappedArrayGetLevel: return "hipMipmappedArrayGetLevel";
+    case HIP_API_ID_hipModuleGetFunction: return "hipModuleGetFunction";
+    case HIP_API_ID_hipModuleGetFunctionCount: return "hipModuleGetFunctionCount";
+    case HIP_API_ID_hipModuleGetGlobal: return "hipModuleGetGlobal";
+    case HIP_API_ID_hipModuleGetTexRef: return "hipModuleGetTexRef";
+    case HIP_API_ID_hipModuleLaunchCooperativeKernel: return "hipModuleLaunchCooperativeKernel";
+    case HIP_API_ID_hipModuleLaunchCooperativeKernelMultiDevice: return "hipModuleLaunchCooperativeKernelMultiDevice";
+    case HIP_API_ID_hipModuleLaunchKernel: return "hipModuleLaunchKernel";
+    case HIP_API_ID_hipModuleLoad: return "hipModuleLoad";
+    case HIP_API_ID_hipModuleLoadData: return "hipModuleLoadData";
+    case HIP_API_ID_hipModuleLoadDataEx: return "hipModuleLoadDataEx";
+    case HIP_API_ID_hipModuleLoadFatBinary: return "hipModuleLoadFatBinary";
+    case HIP_API_ID_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor: return "hipModuleOccupancyMaxActiveBlocksPerMultiprocessor";
+    case HIP_API_ID_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: return "hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags";
+    case HIP_API_ID_hipModuleOccupancyMaxPotentialBlockSize: return "hipModuleOccupancyMaxPotentialBlockSize";
+    case HIP_API_ID_hipModuleOccupancyMaxPotentialBlockSizeWithFlags: return "hipModuleOccupancyMaxPotentialBlockSizeWithFlags";
+    case HIP_API_ID_hipModuleUnload: return "hipModuleUnload";
+    case HIP_API_ID_hipOccupancyMaxActiveBlocksPerMultiprocessor: return "hipOccupancyMaxActiveBlocksPerMultiprocessor";
+    case HIP_API_ID_hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: return "hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags";
+    case HIP_API_ID_hipOccupancyMaxPotentialBlockSize: return "hipOccupancyMaxPotentialBlockSize";
+    case HIP_API_ID_hipPeekAtLastError: return "hipPeekAtLastError";
+    case HIP_API_ID_hipPointerGetAttribute: return "hipPointerGetAttribute";
+    case HIP_API_ID_hipPointerGetAttributes: return "hipPointerGetAttributes";
+    case HIP_API_ID_hipPointerSetAttribute: return "hipPointerSetAttribute";
+    case HIP_API_ID_hipProfilerStart: return "hipProfilerStart";
+    case HIP_API_ID_hipProfilerStop: return "hipProfilerStop";
+    case HIP_API_ID_hipRuntimeGetVersion: return "hipRuntimeGetVersion";
+    case HIP_API_ID_hipSetDevice: return "hipSetDevice";
+    case HIP_API_ID_hipSetDeviceFlags: return "hipSetDeviceFlags";
+    case HIP_API_ID_hipSetValidDevices: return "hipSetValidDevices";
+    case HIP_API_ID_hipSetupArgument: return "hipSetupArgument";
+    case HIP_API_ID_hipSignalExternalSemaphoresAsync: return "hipSignalExternalSemaphoresAsync";
+    case HIP_API_ID_hipStreamAddCallback: return "hipStreamAddCallback";
+    case HIP_API_ID_hipStreamAttachMemAsync: return "hipStreamAttachMemAsync";
+    case HIP_API_ID_hipStreamBatchMemOp: return "hipStreamBatchMemOp";
+    case HIP_API_ID_hipStreamBeginCapture: return "hipStreamBeginCapture";
+    case HIP_API_ID_hipStreamBeginCaptureToGraph: return "hipStreamBeginCaptureToGraph";
+    case HIP_API_ID_hipStreamCreate: return "hipStreamCreate";
+    case HIP_API_ID_hipStreamCreateWithFlags: return "hipStreamCreateWithFlags";
+    case HIP_API_ID_hipStreamCreateWithPriority: return "hipStreamCreateWithPriority";
+    case HIP_API_ID_hipStreamDestroy: return "hipStreamDestroy";
+    case HIP_API_ID_hipStreamEndCapture: return "hipStreamEndCapture";
+    case HIP_API_ID_hipStreamGetAttribute: return "hipStreamGetAttribute";
+    case HIP_API_ID_hipStreamGetCaptureInfo: return "hipStreamGetCaptureInfo";
+    case HIP_API_ID_hipStreamGetCaptureInfo_v2: return "hipStreamGetCaptureInfo_v2";
+    case HIP_API_ID_hipStreamGetDevice: return "hipStreamGetDevice";
+    case HIP_API_ID_hipStreamGetFlags: return "hipStreamGetFlags";
+    case HIP_API_ID_hipStreamGetId: return "hipStreamGetId";
+    case HIP_API_ID_hipStreamGetPriority: return "hipStreamGetPriority";
+    case HIP_API_ID_hipStreamIsCapturing: return "hipStreamIsCapturing";
+    case HIP_API_ID_hipStreamQuery: return "hipStreamQuery";
+    case HIP_API_ID_hipStreamSetAttribute: return "hipStreamSetAttribute";
+    case HIP_API_ID_hipStreamSynchronize: return "hipStreamSynchronize";
+    case HIP_API_ID_hipStreamUpdateCaptureDependencies: return "hipStreamUpdateCaptureDependencies";
+    case HIP_API_ID_hipStreamWaitEvent: return "hipStreamWaitEvent";
+    case HIP_API_ID_hipStreamWaitValue32: return "hipStreamWaitValue32";
+    case HIP_API_ID_hipStreamWaitValue64: return "hipStreamWaitValue64";
+    case HIP_API_ID_hipStreamWriteValue32: return "hipStreamWriteValue32";
+    case HIP_API_ID_hipStreamWriteValue64: return "hipStreamWriteValue64";
+    case HIP_API_ID_hipTexRefGetAddress: return "hipTexRefGetAddress";
+    case HIP_API_ID_hipTexRefGetArray: return "hipTexRefGetArray";
+    case HIP_API_ID_hipTexRefGetBorderColor: return "hipTexRefGetBorderColor";
+    case HIP_API_ID_hipTexRefGetFlags: return "hipTexRefGetFlags";
+    case HIP_API_ID_hipTexRefGetFormat: return "hipTexRefGetFormat";
+    case HIP_API_ID_hipTexRefGetMaxAnisotropy: return "hipTexRefGetMaxAnisotropy";
+    case HIP_API_ID_hipTexRefGetMipMappedArray: return "hipTexRefGetMipMappedArray";
+    case HIP_API_ID_hipTexRefGetMipmapLevelBias: return "hipTexRefGetMipmapLevelBias";
+    case HIP_API_ID_hipTexRefGetMipmapLevelClamp: return "hipTexRefGetMipmapLevelClamp";
+    case HIP_API_ID_hipTexRefSetAddress: return "hipTexRefSetAddress";
+    case HIP_API_ID_hipTexRefSetAddress2D: return "hipTexRefSetAddress2D";
+    case HIP_API_ID_hipTexRefSetArray: return "hipTexRefSetArray";
+    case HIP_API_ID_hipTexRefSetBorderColor: return "hipTexRefSetBorderColor";
+    case HIP_API_ID_hipTexRefSetFlags: return "hipTexRefSetFlags";
+    case HIP_API_ID_hipTexRefSetFormat: return "hipTexRefSetFormat";
+    case HIP_API_ID_hipTexRefSetMaxAnisotropy: return "hipTexRefSetMaxAnisotropy";
+    case HIP_API_ID_hipTexRefSetMipmapLevelBias: return "hipTexRefSetMipmapLevelBias";
+    case HIP_API_ID_hipTexRefSetMipmapLevelClamp: return "hipTexRefSetMipmapLevelClamp";
+    case HIP_API_ID_hipTexRefSetMipmappedArray: return "hipTexRefSetMipmappedArray";
+    case HIP_API_ID_hipThreadExchangeStreamCaptureMode: return "hipThreadExchangeStreamCaptureMode";
+    case HIP_API_ID_hipUserObjectCreate: return "hipUserObjectCreate";
+    case HIP_API_ID_hipUserObjectRelease: return "hipUserObjectRelease";
+    case HIP_API_ID_hipUserObjectRetain: return "hipUserObjectRetain";
+    case HIP_API_ID_hipWaitExternalSemaphoresAsync: return "hipWaitExternalSemaphoresAsync";
+  };
+  return "unknown";
+};
+
+#include <string.h>
+// Return the HIP API callback ID for a given name
+static inline uint32_t hipApiIdByName(const char* name) {
+  if (strcmp("__hipPopCallConfiguration", name) == 0) return HIP_API_ID___hipPopCallConfiguration;
+  if (strcmp("__hipPushCallConfiguration", name) == 0) return HIP_API_ID___hipPushCallConfiguration;
+  if (strcmp("hipArray3DCreate", name) == 0) return HIP_API_ID_hipArray3DCreate;
+  if (strcmp("hipArray3DGetDescriptor", name) == 0) return HIP_API_ID_hipArray3DGetDescriptor;
+  if (strcmp("hipArrayCreate", name) == 0) return HIP_API_ID_hipArrayCreate;
+  if (strcmp("hipArrayDestroy", name) == 0) return HIP_API_ID_hipArrayDestroy;
+  if (strcmp("hipArrayGetDescriptor", name) == 0) return HIP_API_ID_hipArrayGetDescriptor;
+  if (strcmp("hipArrayGetInfo", name) == 0) return HIP_API_ID_hipArrayGetInfo;
+  if (strcmp("hipChooseDeviceR0000", name) == 0) return HIP_API_ID_hipChooseDeviceR0000;
+  if (strcmp("hipChooseDeviceR0600", name) == 0) return HIP_API_ID_hipChooseDeviceR0600;
+  if (strcmp("hipConfigureCall", name) == 0) return HIP_API_ID_hipConfigureCall;
+  if (strcmp("hipCreateSurfaceObject", name) == 0) return HIP_API_ID_hipCreateSurfaceObject;
+  if (strcmp("hipCtxCreate", name) == 0) return HIP_API_ID_hipCtxCreate;
+  if (strcmp("hipCtxDestroy", name) == 0) return HIP_API_ID_hipCtxDestroy;
+  if (strcmp("hipCtxDisablePeerAccess", name) == 0) return HIP_API_ID_hipCtxDisablePeerAccess;
+  if (strcmp("hipCtxEnablePeerAccess", name) == 0) return HIP_API_ID_hipCtxEnablePeerAccess;
+  if (strcmp("hipCtxGetApiVersion", name) == 0) return HIP_API_ID_hipCtxGetApiVersion;
+  if (strcmp("hipCtxGetCacheConfig", name) == 0) return HIP_API_ID_hipCtxGetCacheConfig;
+  if (strcmp("hipCtxGetCurrent", name) == 0) return HIP_API_ID_hipCtxGetCurrent;
+  if (strcmp("hipCtxGetDevice", name) == 0) return HIP_API_ID_hipCtxGetDevice;
+  if (strcmp("hipCtxGetFlags", name) == 0) return HIP_API_ID_hipCtxGetFlags;
+  if (strcmp("hipCtxGetSharedMemConfig", name) == 0) return HIP_API_ID_hipCtxGetSharedMemConfig;
+  if (strcmp("hipCtxPopCurrent", name) == 0) return HIP_API_ID_hipCtxPopCurrent;
+  if (strcmp("hipCtxPushCurrent", name) == 0) return HIP_API_ID_hipCtxPushCurrent;
+  if (strcmp("hipCtxSetCacheConfig", name) == 0) return HIP_API_ID_hipCtxSetCacheConfig;
+  if (strcmp("hipCtxSetCurrent", name) == 0) return HIP_API_ID_hipCtxSetCurrent;
+  if (strcmp("hipCtxSetSharedMemConfig", name) == 0) return HIP_API_ID_hipCtxSetSharedMemConfig;
+  if (strcmp("hipCtxSynchronize", name) == 0) return HIP_API_ID_hipCtxSynchronize;
+  if (strcmp("hipDestroyExternalMemory", name) == 0) return HIP_API_ID_hipDestroyExternalMemory;
+  if (strcmp("hipDestroyExternalSemaphore", name) == 0) return HIP_API_ID_hipDestroyExternalSemaphore;
+  if (strcmp("hipDestroySurfaceObject", name) == 0) return HIP_API_ID_hipDestroySurfaceObject;
+  if (strcmp("hipDeviceCanAccessPeer", name) == 0) return HIP_API_ID_hipDeviceCanAccessPeer;
+  if (strcmp("hipDeviceComputeCapability", name) == 0) return HIP_API_ID_hipDeviceComputeCapability;
+  if (strcmp("hipDeviceDisablePeerAccess", name) == 0) return HIP_API_ID_hipDeviceDisablePeerAccess;
+  if (strcmp("hipDeviceEnablePeerAccess", name) == 0) return HIP_API_ID_hipDeviceEnablePeerAccess;
+  if (strcmp("hipDeviceGet", name) == 0) return HIP_API_ID_hipDeviceGet;
+  if (strcmp("hipDeviceGetAttribute", name) == 0) return HIP_API_ID_hipDeviceGetAttribute;
+  if (strcmp("hipDeviceGetByPCIBusId", name) == 0) return HIP_API_ID_hipDeviceGetByPCIBusId;
+  if (strcmp("hipDeviceGetCacheConfig", name) == 0) return HIP_API_ID_hipDeviceGetCacheConfig;
+  if (strcmp("hipDeviceGetDefaultMemPool", name) == 0) return HIP_API_ID_hipDeviceGetDefaultMemPool;
+  if (strcmp("hipDeviceGetGraphMemAttribute", name) == 0) return HIP_API_ID_hipDeviceGetGraphMemAttribute;
+  if (strcmp("hipDeviceGetLimit", name) == 0) return HIP_API_ID_hipDeviceGetLimit;
+  if (strcmp("hipDeviceGetMemPool", name) == 0) return HIP_API_ID_hipDeviceGetMemPool;
+  if (strcmp("hipDeviceGetName", name) == 0) return HIP_API_ID_hipDeviceGetName;
+  if (strcmp("hipDeviceGetP2PAttribute", name) == 0) return HIP_API_ID_hipDeviceGetP2PAttribute;
+  if (strcmp("hipDeviceGetPCIBusId", name) == 0) return HIP_API_ID_hipDeviceGetPCIBusId;
+  if (strcmp("hipDeviceGetSharedMemConfig", name) == 0) return HIP_API_ID_hipDeviceGetSharedMemConfig;
+  if (strcmp("hipDeviceGetStreamPriorityRange", name) == 0) return HIP_API_ID_hipDeviceGetStreamPriorityRange;
+  if (strcmp("hipDeviceGetUuid", name) == 0) return HIP_API_ID_hipDeviceGetUuid;
+  if (strcmp("hipDeviceGraphMemTrim", name) == 0) return HIP_API_ID_hipDeviceGraphMemTrim;
+  if (strcmp("hipDevicePrimaryCtxGetState", name) == 0) return HIP_API_ID_hipDevicePrimaryCtxGetState;
+  if (strcmp("hipDevicePrimaryCtxRelease", name) == 0) return HIP_API_ID_hipDevicePrimaryCtxRelease;
+  if (strcmp("hipDevicePrimaryCtxReset", name) == 0) return HIP_API_ID_hipDevicePrimaryCtxReset;
+  if (strcmp("hipDevicePrimaryCtxRetain", name) == 0) return HIP_API_ID_hipDevicePrimaryCtxRetain;
+  if (strcmp("hipDevicePrimaryCtxSetFlags", name) == 0) return HIP_API_ID_hipDevicePrimaryCtxSetFlags;
+  if (strcmp("hipDeviceReset", name) == 0) return HIP_API_ID_hipDeviceReset;
+  if (strcmp("hipDeviceSetCacheConfig", name) == 0) return HIP_API_ID_hipDeviceSetCacheConfig;
+  if (strcmp("hipDeviceSetGraphMemAttribute", name) == 0) return HIP_API_ID_hipDeviceSetGraphMemAttribute;
+  if (strcmp("hipDeviceSetLimit", name) == 0) return HIP_API_ID_hipDeviceSetLimit;
+  if (strcmp("hipDeviceSetMemPool", name) == 0) return HIP_API_ID_hipDeviceSetMemPool;
+  if (strcmp("hipDeviceSetSharedMemConfig", name) == 0) return HIP_API_ID_hipDeviceSetSharedMemConfig;
+  if (strcmp("hipDeviceSynchronize", name) == 0) return HIP_API_ID_hipDeviceSynchronize;
+  if (strcmp("hipDeviceTotalMem", name) == 0) return HIP_API_ID_hipDeviceTotalMem;
+  if (strcmp("hipDriverGetVersion", name) == 0) return HIP_API_ID_hipDriverGetVersion;
+  if (strcmp("hipDrvGraphAddMemFreeNode", name) == 0) return HIP_API_ID_hipDrvGraphAddMemFreeNode;
+  if (strcmp("hipDrvGraphAddMemcpyNode", name) == 0) return HIP_API_ID_hipDrvGraphAddMemcpyNode;
+  if (strcmp("hipDrvGraphAddMemsetNode", name) == 0) return HIP_API_ID_hipDrvGraphAddMemsetNode;
+  if (strcmp("hipDrvGraphExecMemcpyNodeSetParams", name) == 0) return HIP_API_ID_hipDrvGraphExecMemcpyNodeSetParams;
+  if (strcmp("hipDrvGraphExecMemsetNodeSetParams", name) == 0) return HIP_API_ID_hipDrvGraphExecMemsetNodeSetParams;
+  if (strcmp("hipDrvGraphMemcpyNodeGetParams", name) == 0) return HIP_API_ID_hipDrvGraphMemcpyNodeGetParams;
+  if (strcmp("hipDrvGraphMemcpyNodeSetParams", name) == 0) return HIP_API_ID_hipDrvGraphMemcpyNodeSetParams;
+  if (strcmp("hipDrvLaunchKernelEx", name) == 0) return HIP_API_ID_hipDrvLaunchKernelEx;
+  if (strcmp("hipDrvMemcpy2DUnaligned", name) == 0) return HIP_API_ID_hipDrvMemcpy2DUnaligned;
+  if (strcmp("hipDrvMemcpy3D", name) == 0) return HIP_API_ID_hipDrvMemcpy3D;
+  if (strcmp("hipDrvMemcpy3DAsync", name) == 0) return HIP_API_ID_hipDrvMemcpy3DAsync;
+  if (strcmp("hipDrvPointerGetAttributes", name) == 0) return HIP_API_ID_hipDrvPointerGetAttributes;
+  if (strcmp("hipEventCreate", name) == 0) return HIP_API_ID_hipEventCreate;
+  if (strcmp("hipEventCreateWithFlags", name) == 0) return HIP_API_ID_hipEventCreateWithFlags;
+  if (strcmp("hipEventDestroy", name) == 0) return HIP_API_ID_hipEventDestroy;
+  if (strcmp("hipEventElapsedTime", name) == 0) return HIP_API_ID_hipEventElapsedTime;
+  if (strcmp("hipEventQuery", name) == 0) return HIP_API_ID_hipEventQuery;
+  if (strcmp("hipEventRecord", name) == 0) return HIP_API_ID_hipEventRecord;
+  if (strcmp("hipEventRecordWithFlags", name) == 0) return HIP_API_ID_hipEventRecordWithFlags;
+  if (strcmp("hipEventSynchronize", name) == 0) return HIP_API_ID_hipEventSynchronize;
+  if (strcmp("hipExtGetLastError", name) == 0) return HIP_API_ID_hipExtGetLastError;
+  if (strcmp("hipExtGetLinkTypeAndHopCount", name) == 0) return HIP_API_ID_hipExtGetLinkTypeAndHopCount;
+  if (strcmp("hipExtLaunchKernel", name) == 0) return HIP_API_ID_hipExtLaunchKernel;
+  if (strcmp("hipExtLaunchMultiKernelMultiDevice", name) == 0) return HIP_API_ID_hipExtLaunchMultiKernelMultiDevice;
+  if (strcmp("hipExtMallocWithFlags", name) == 0) return HIP_API_ID_hipExtMallocWithFlags;
+  if (strcmp("hipExtModuleLaunchKernel", name) == 0) return HIP_API_ID_hipExtModuleLaunchKernel;
+  if (strcmp("hipExtStreamCreateWithCUMask", name) == 0) return HIP_API_ID_hipExtStreamCreateWithCUMask;
+  if (strcmp("hipExtStreamGetCUMask", name) == 0) return HIP_API_ID_hipExtStreamGetCUMask;
+  if (strcmp("hipExternalMemoryGetMappedBuffer", name) == 0) return HIP_API_ID_hipExternalMemoryGetMappedBuffer;
+  if (strcmp("hipExternalMemoryGetMappedMipmappedArray", name) == 0) return HIP_API_ID_hipExternalMemoryGetMappedMipmappedArray;
+  if (strcmp("hipFree", name) == 0) return HIP_API_ID_hipFree;
+  if (strcmp("hipFreeArray", name) == 0) return HIP_API_ID_hipFreeArray;
+  if (strcmp("hipFreeAsync", name) == 0) return HIP_API_ID_hipFreeAsync;
+  if (strcmp("hipFreeHost", name) == 0) return HIP_API_ID_hipFreeHost;
+  if (strcmp("hipFreeMipmappedArray", name) == 0) return HIP_API_ID_hipFreeMipmappedArray;
+  if (strcmp("hipFuncGetAttribute", name) == 0) return HIP_API_ID_hipFuncGetAttribute;
+  if (strcmp("hipFuncGetAttributes", name) == 0) return HIP_API_ID_hipFuncGetAttributes;
+  if (strcmp("hipFuncSetAttribute", name) == 0) return HIP_API_ID_hipFuncSetAttribute;
+  if (strcmp("hipFuncSetCacheConfig", name) == 0) return HIP_API_ID_hipFuncSetCacheConfig;
+  if (strcmp("hipFuncSetSharedMemConfig", name) == 0) return HIP_API_ID_hipFuncSetSharedMemConfig;
+  if (strcmp("hipGLGetDevices", name) == 0) return HIP_API_ID_hipGLGetDevices;
+  if (strcmp("hipGetChannelDesc", name) == 0) return HIP_API_ID_hipGetChannelDesc;
+  if (strcmp("hipGetDevice", name) == 0) return HIP_API_ID_hipGetDevice;
+  if (strcmp("hipGetDeviceCount", name) == 0) return HIP_API_ID_hipGetDeviceCount;
+  if (strcmp("hipGetDeviceFlags", name) == 0) return HIP_API_ID_hipGetDeviceFlags;
+  if (strcmp("hipGetDevicePropertiesR0000", name) == 0) return HIP_API_ID_hipGetDevicePropertiesR0000;
+  if (strcmp("hipGetDevicePropertiesR0600", name) == 0) return HIP_API_ID_hipGetDevicePropertiesR0600;
+  if (strcmp("hipGetDriverEntryPoint", name) == 0) return HIP_API_ID_hipGetDriverEntryPoint;
+  if (strcmp("hipGetFuncBySymbol", name) == 0) return HIP_API_ID_hipGetFuncBySymbol;
+  if (strcmp("hipGetLastError", name) == 0) return HIP_API_ID_hipGetLastError;
+  if (strcmp("hipGetMipmappedArrayLevel", name) == 0) return HIP_API_ID_hipGetMipmappedArrayLevel;
+  if (strcmp("hipGetProcAddress", name) == 0) return HIP_API_ID_hipGetProcAddress;
+  if (strcmp("hipGetSymbolAddress", name) == 0) return HIP_API_ID_hipGetSymbolAddress;
+  if (strcmp("hipGetSymbolSize", name) == 0) return HIP_API_ID_hipGetSymbolSize;
+  if (strcmp("hipGraphAddBatchMemOpNode", name) == 0) return HIP_API_ID_hipGraphAddBatchMemOpNode;
+  if (strcmp("hipGraphAddChildGraphNode", name) == 0) return HIP_API_ID_hipGraphAddChildGraphNode;
+  if (strcmp("hipGraphAddDependencies", name) == 0) return HIP_API_ID_hipGraphAddDependencies;
+  if (strcmp("hipGraphAddEmptyNode", name) == 0) return HIP_API_ID_hipGraphAddEmptyNode;
+  if (strcmp("hipGraphAddEventRecordNode", name) == 0) return HIP_API_ID_hipGraphAddEventRecordNode;
+  if (strcmp("hipGraphAddEventWaitNode", name) == 0) return HIP_API_ID_hipGraphAddEventWaitNode;
+  if (strcmp("hipGraphAddExternalSemaphoresSignalNode", name) == 0) return HIP_API_ID_hipGraphAddExternalSemaphoresSignalNode;
+  if (strcmp("hipGraphAddExternalSemaphoresWaitNode", name) == 0) return HIP_API_ID_hipGraphAddExternalSemaphoresWaitNode;
+  if (strcmp("hipGraphAddHostNode", name) == 0) return HIP_API_ID_hipGraphAddHostNode;
+  if (strcmp("hipGraphAddKernelNode", name) == 0) return HIP_API_ID_hipGraphAddKernelNode;
+  if (strcmp("hipGraphAddMemAllocNode", name) == 0) return HIP_API_ID_hipGraphAddMemAllocNode;
+  if (strcmp("hipGraphAddMemFreeNode", name) == 0) return HIP_API_ID_hipGraphAddMemFreeNode;
+  if (strcmp("hipGraphAddMemcpyNode", name) == 0) return HIP_API_ID_hipGraphAddMemcpyNode;
+  if (strcmp("hipGraphAddMemcpyNode1D", name) == 0) return HIP_API_ID_hipGraphAddMemcpyNode1D;
+  if (strcmp("hipGraphAddMemcpyNodeFromSymbol", name) == 0) return HIP_API_ID_hipGraphAddMemcpyNodeFromSymbol;
+  if (strcmp("hipGraphAddMemcpyNodeToSymbol", name) == 0) return HIP_API_ID_hipGraphAddMemcpyNodeToSymbol;
+  if (strcmp("hipGraphAddMemsetNode", name) == 0) return HIP_API_ID_hipGraphAddMemsetNode;
+  if (strcmp("hipGraphAddNode", name) == 0) return HIP_API_ID_hipGraphAddNode;
+  if (strcmp("hipGraphBatchMemOpNodeGetParams", name) == 0) return HIP_API_ID_hipGraphBatchMemOpNodeGetParams;
+  if (strcmp("hipGraphBatchMemOpNodeSetParams", name) == 0) return HIP_API_ID_hipGraphBatchMemOpNodeSetParams;
+  if (strcmp("hipGraphChildGraphNodeGetGraph", name) == 0) return HIP_API_ID_hipGraphChildGraphNodeGetGraph;
+  if (strcmp("hipGraphClone", name) == 0) return HIP_API_ID_hipGraphClone;
+  if (strcmp("hipGraphCreate", name) == 0) return HIP_API_ID_hipGraphCreate;
+  if (strcmp("hipGraphDebugDotPrint", name) == 0) return HIP_API_ID_hipGraphDebugDotPrint;
+  if (strcmp("hipGraphDestroy", name) == 0) return HIP_API_ID_hipGraphDestroy;
+  if (strcmp("hipGraphDestroyNode", name) == 0) return HIP_API_ID_hipGraphDestroyNode;
+  if (strcmp("hipGraphEventRecordNodeGetEvent", name) == 0) return HIP_API_ID_hipGraphEventRecordNodeGetEvent;
+  if (strcmp("hipGraphEventRecordNodeSetEvent", name) == 0) return HIP_API_ID_hipGraphEventRecordNodeSetEvent;
+  if (strcmp("hipGraphEventWaitNodeGetEvent", name) == 0) return HIP_API_ID_hipGraphEventWaitNodeGetEvent;
+  if (strcmp("hipGraphEventWaitNodeSetEvent", name) == 0) return HIP_API_ID_hipGraphEventWaitNodeSetEvent;
+  if (strcmp("hipGraphExecBatchMemOpNodeSetParams", name) == 0) return HIP_API_ID_hipGraphExecBatchMemOpNodeSetParams;
+  if (strcmp("hipGraphExecChildGraphNodeSetParams", name) == 0) return HIP_API_ID_hipGraphExecChildGraphNodeSetParams;
+  if (strcmp("hipGraphExecDestroy", name) == 0) return HIP_API_ID_hipGraphExecDestroy;
+  if (strcmp("hipGraphExecEventRecordNodeSetEvent", name) == 0) return HIP_API_ID_hipGraphExecEventRecordNodeSetEvent;
+  if (strcmp("hipGraphExecEventWaitNodeSetEvent", name) == 0) return HIP_API_ID_hipGraphExecEventWaitNodeSetEvent;
+  if (strcmp("hipGraphExecExternalSemaphoresSignalNodeSetParams", name) == 0) return HIP_API_ID_hipGraphExecExternalSemaphoresSignalNodeSetParams;
+  if (strcmp("hipGraphExecExternalSemaphoresWaitNodeSetParams", name) == 0) return HIP_API_ID_hipGraphExecExternalSemaphoresWaitNodeSetParams;
+  if (strcmp("hipGraphExecGetFlags", name) == 0) return HIP_API_ID_hipGraphExecGetFlags;
+  if (strcmp("hipGraphExecHostNodeSetParams", name) == 0) return HIP_API_ID_hipGraphExecHostNodeSetParams;
+  if (strcmp("hipGraphExecKernelNodeSetParams", name) == 0) return HIP_API_ID_hipGraphExecKernelNodeSetParams;
+  if (strcmp("hipGraphExecMemcpyNodeSetParams", name) == 0) return HIP_API_ID_hipGraphExecMemcpyNodeSetParams;
+  if (strcmp("hipGraphExecMemcpyNodeSetParams1D", name) == 0) return HIP_API_ID_hipGraphExecMemcpyNodeSetParams1D;
+  if (strcmp("hipGraphExecMemcpyNodeSetParamsFromSymbol", name) == 0) return HIP_API_ID_hipGraphExecMemcpyNodeSetParamsFromSymbol;
+  if (strcmp("hipGraphExecMemcpyNodeSetParamsToSymbol", name) == 0) return HIP_API_ID_hipGraphExecMemcpyNodeSetParamsToSymbol;
+  if (strcmp("hipGraphExecMemsetNodeSetParams", name) == 0) return HIP_API_ID_hipGraphExecMemsetNodeSetParams;
+  if (strcmp("hipGraphExecNodeSetParams", name) == 0) return HIP_API_ID_hipGraphExecNodeSetParams;
+  if (strcmp("hipGraphExecUpdate", name) == 0) return HIP_API_ID_hipGraphExecUpdate;
+  if (strcmp("hipGraphExternalSemaphoresSignalNodeGetParams", name) == 0) return HIP_API_ID_hipGraphExternalSemaphoresSignalNodeGetParams;
+  if (strcmp("hipGraphExternalSemaphoresSignalNodeSetParams", name) == 0) return HIP_API_ID_hipGraphExternalSemaphoresSignalNodeSetParams;
+  if (strcmp("hipGraphExternalSemaphoresWaitNodeGetParams", name) == 0) return HIP_API_ID_hipGraphExternalSemaphoresWaitNodeGetParams;
+  if (strcmp("hipGraphExternalSemaphoresWaitNodeSetParams", name) == 0) return HIP_API_ID_hipGraphExternalSemaphoresWaitNodeSetParams;
+  if (strcmp("hipGraphGetEdges", name) == 0) return HIP_API_ID_hipGraphGetEdges;
+  if (strcmp("hipGraphGetNodes", name) == 0) return HIP_API_ID_hipGraphGetNodes;
+  if (strcmp("hipGraphGetRootNodes", name) == 0) return HIP_API_ID_hipGraphGetRootNodes;
+  if (strcmp("hipGraphHostNodeGetParams", name) == 0) return HIP_API_ID_hipGraphHostNodeGetParams;
+  if (strcmp("hipGraphHostNodeSetParams", name) == 0) return HIP_API_ID_hipGraphHostNodeSetParams;
+  if (strcmp("hipGraphInstantiate", name) == 0) return HIP_API_ID_hipGraphInstantiate;
+  if (strcmp("hipGraphInstantiateWithFlags", name) == 0) return HIP_API_ID_hipGraphInstantiateWithFlags;
+  if (strcmp("hipGraphInstantiateWithParams", name) == 0) return HIP_API_ID_hipGraphInstantiateWithParams;
+  if (strcmp("hipGraphKernelNodeCopyAttributes", name) == 0) return HIP_API_ID_hipGraphKernelNodeCopyAttributes;
+  if (strcmp("hipGraphKernelNodeGetAttribute", name) == 0) return HIP_API_ID_hipGraphKernelNodeGetAttribute;
+  if (strcmp("hipGraphKernelNodeGetParams", name) == 0) return HIP_API_ID_hipGraphKernelNodeGetParams;
+  if (strcmp("hipGraphKernelNodeSetAttribute", name) == 0) return HIP_API_ID_hipGraphKernelNodeSetAttribute;
+  if (strcmp("hipGraphKernelNodeSetParams", name) == 0) return HIP_API_ID_hipGraphKernelNodeSetParams;
+  if (strcmp("hipGraphLaunch", name) == 0) return HIP_API_ID_hipGraphLaunch;
+  if (strcmp("hipGraphMemAllocNodeGetParams", name) == 0) return HIP_API_ID_hipGraphMemAllocNodeGetParams;
+  if (strcmp("hipGraphMemFreeNodeGetParams", name) == 0) return HIP_API_ID_hipGraphMemFreeNodeGetParams;
+  if (strcmp("hipGraphMemcpyNodeGetParams", name) == 0) return HIP_API_ID_hipGraphMemcpyNodeGetParams;
+  if (strcmp("hipGraphMemcpyNodeSetParams", name) == 0) return HIP_API_ID_hipGraphMemcpyNodeSetParams;
+  if (strcmp("hipGraphMemcpyNodeSetParams1D", name) == 0) return HIP_API_ID_hipGraphMemcpyNodeSetParams1D;
+  if (strcmp("hipGraphMemcpyNodeSetParamsFromSymbol", name) == 0) return HIP_API_ID_hipGraphMemcpyNodeSetParamsFromSymbol;
+  if (strcmp("hipGraphMemcpyNodeSetParamsToSymbol", name) == 0) return HIP_API_ID_hipGraphMemcpyNodeSetParamsToSymbol;
+  if (strcmp("hipGraphMemsetNodeGetParams", name) == 0) return HIP_API_ID_hipGraphMemsetNodeGetParams;
+  if (strcmp("hipGraphMemsetNodeSetParams", name) == 0) return HIP_API_ID_hipGraphMemsetNodeSetParams;
+  if (strcmp("hipGraphNodeFindInClone", name) == 0) return HIP_API_ID_hipGraphNodeFindInClone;
+  if (strcmp("hipGraphNodeGetDependencies", name) == 0) return HIP_API_ID_hipGraphNodeGetDependencies;
+  if (strcmp("hipGraphNodeGetDependentNodes", name) == 0) return HIP_API_ID_hipGraphNodeGetDependentNodes;
+  if (strcmp("hipGraphNodeGetEnabled", name) == 0) return HIP_API_ID_hipGraphNodeGetEnabled;
+  if (strcmp("hipGraphNodeGetType", name) == 0) return HIP_API_ID_hipGraphNodeGetType;
+  if (strcmp("hipGraphNodeSetEnabled", name) == 0) return HIP_API_ID_hipGraphNodeSetEnabled;
+  if (strcmp("hipGraphNodeSetParams", name) == 0) return HIP_API_ID_hipGraphNodeSetParams;
+  if (strcmp("hipGraphReleaseUserObject", name) == 0) return HIP_API_ID_hipGraphReleaseUserObject;
+  if (strcmp("hipGraphRemoveDependencies", name) == 0) return HIP_API_ID_hipGraphRemoveDependencies;
+  if (strcmp("hipGraphRetainUserObject", name) == 0) return HIP_API_ID_hipGraphRetainUserObject;
+  if (strcmp("hipGraphUpload", name) == 0) return HIP_API_ID_hipGraphUpload;
+  if (strcmp("hipGraphicsGLRegisterBuffer", name) == 0) return HIP_API_ID_hipGraphicsGLRegisterBuffer;
+  if (strcmp("hipGraphicsGLRegisterImage", name) == 0) return HIP_API_ID_hipGraphicsGLRegisterImage;
+  if (strcmp("hipGraphicsMapResources", name) == 0) return HIP_API_ID_hipGraphicsMapResources;
+  if (strcmp("hipGraphicsResourceGetMappedPointer", name) == 0) return HIP_API_ID_hipGraphicsResourceGetMappedPointer;
+  if (strcmp("hipGraphicsSubResourceGetMappedArray", name) == 0) return HIP_API_ID_hipGraphicsSubResourceGetMappedArray;
+  if (strcmp("hipGraphicsUnmapResources", name) == 0) return HIP_API_ID_hipGraphicsUnmapResources;
+  if (strcmp("hipGraphicsUnregisterResource", name) == 0) return HIP_API_ID_hipGraphicsUnregisterResource;
+  if (strcmp("hipHccModuleLaunchKernel", name) == 0) return HIP_API_ID_hipHccModuleLaunchKernel;
+  if (strcmp("hipHostAlloc", name) == 0) return HIP_API_ID_hipHostAlloc;
+  if (strcmp("hipHostFree", name) == 0) return HIP_API_ID_hipHostFree;
+  if (strcmp("hipHostGetDevicePointer", name) == 0) return HIP_API_ID_hipHostGetDevicePointer;
+  if (strcmp("hipHostGetFlags", name) == 0) return HIP_API_ID_hipHostGetFlags;
+  if (strcmp("hipHostMalloc", name) == 0) return HIP_API_ID_hipHostMalloc;
+  if (strcmp("hipHostRegister", name) == 0) return HIP_API_ID_hipHostRegister;
+  if (strcmp("hipHostUnregister", name) == 0) return HIP_API_ID_hipHostUnregister;
+  if (strcmp("hipImportExternalMemory", name) == 0) return HIP_API_ID_hipImportExternalMemory;
+  if (strcmp("hipImportExternalSemaphore", name) == 0) return HIP_API_ID_hipImportExternalSemaphore;
+  if (strcmp("hipInit", name) == 0) return HIP_API_ID_hipInit;
+  if (strcmp("hipIpcCloseMemHandle", name) == 0) return HIP_API_ID_hipIpcCloseMemHandle;
+  if (strcmp("hipIpcGetEventHandle", name) == 0) return HIP_API_ID_hipIpcGetEventHandle;
+  if (strcmp("hipIpcGetMemHandle", name) == 0) return HIP_API_ID_hipIpcGetMemHandle;
+  if (strcmp("hipIpcOpenEventHandle", name) == 0) return HIP_API_ID_hipIpcOpenEventHandle;
+  if (strcmp("hipIpcOpenMemHandle", name) == 0) return HIP_API_ID_hipIpcOpenMemHandle;
+  if (strcmp("hipLaunchByPtr", name) == 0) return HIP_API_ID_hipLaunchByPtr;
+  if (strcmp("hipLaunchCooperativeKernel", name) == 0) return HIP_API_ID_hipLaunchCooperativeKernel;
+  if (strcmp("hipLaunchCooperativeKernelMultiDevice", name) == 0) return HIP_API_ID_hipLaunchCooperativeKernelMultiDevice;
+  if (strcmp("hipLaunchHostFunc", name) == 0) return HIP_API_ID_hipLaunchHostFunc;
+  if (strcmp("hipLaunchKernel", name) == 0) return HIP_API_ID_hipLaunchKernel;
+  if (strcmp("hipLaunchKernelExC", name) == 0) return HIP_API_ID_hipLaunchKernelExC;
+  if (strcmp("hipLibraryGetKernel", name) == 0) return HIP_API_ID_hipLibraryGetKernel;
+  if (strcmp("hipLibraryGetKernelCount", name) == 0) return HIP_API_ID_hipLibraryGetKernelCount;
+  if (strcmp("hipLibraryLoadData", name) == 0) return HIP_API_ID_hipLibraryLoadData;
+  if (strcmp("hipLibraryLoadFromFile", name) == 0) return HIP_API_ID_hipLibraryLoadFromFile;
+  if (strcmp("hipLibraryUnload", name) == 0) return HIP_API_ID_hipLibraryUnload;
+  if (strcmp("hipLinkAddData", name) == 0) return HIP_API_ID_hipLinkAddData;
+  if (strcmp("hipLinkAddFile", name) == 0) return HIP_API_ID_hipLinkAddFile;
+  if (strcmp("hipLinkComplete", name) == 0) return HIP_API_ID_hipLinkComplete;
+  if (strcmp("hipLinkCreate", name) == 0) return HIP_API_ID_hipLinkCreate;
+  if (strcmp("hipLinkDestroy", name) == 0) return HIP_API_ID_hipLinkDestroy;
+  if (strcmp("hipMalloc", name) == 0) return HIP_API_ID_hipMalloc;
+  if (strcmp("hipMalloc3D", name) == 0) return HIP_API_ID_hipMalloc3D;
+  if (strcmp("hipMalloc3DArray", name) == 0) return HIP_API_ID_hipMalloc3DArray;
+  if (strcmp("hipMallocArray", name) == 0) return HIP_API_ID_hipMallocArray;
+  if (strcmp("hipMallocAsync", name) == 0) return HIP_API_ID_hipMallocAsync;
+  if (strcmp("hipMallocFromPoolAsync", name) == 0) return HIP_API_ID_hipMallocFromPoolAsync;
+  if (strcmp("hipMallocHost", name) == 0) return HIP_API_ID_hipMallocHost;
+  if (strcmp("hipMallocManaged", name) == 0) return HIP_API_ID_hipMallocManaged;
+  if (strcmp("hipMallocMipmappedArray", name) == 0) return HIP_API_ID_hipMallocMipmappedArray;
+  if (strcmp("hipMallocPitch", name) == 0) return HIP_API_ID_hipMallocPitch;
+  if (strcmp("hipMemAddressFree", name) == 0) return HIP_API_ID_hipMemAddressFree;
+  if (strcmp("hipMemAddressReserve", name) == 0) return HIP_API_ID_hipMemAddressReserve;
+  if (strcmp("hipMemAdvise", name) == 0) return HIP_API_ID_hipMemAdvise;
+  if (strcmp("hipMemAdvise_v2", name) == 0) return HIP_API_ID_hipMemAdvise_v2;
+  if (strcmp("hipMemAllocHost", name) == 0) return HIP_API_ID_hipMemAllocHost;
+  if (strcmp("hipMemAllocPitch", name) == 0) return HIP_API_ID_hipMemAllocPitch;
+  if (strcmp("hipMemCreate", name) == 0) return HIP_API_ID_hipMemCreate;
+  if (strcmp("hipMemExportToShareableHandle", name) == 0) return HIP_API_ID_hipMemExportToShareableHandle;
+  if (strcmp("hipMemGetAccess", name) == 0) return HIP_API_ID_hipMemGetAccess;
+  if (strcmp("hipMemGetAddressRange", name) == 0) return HIP_API_ID_hipMemGetAddressRange;
+  if (strcmp("hipMemGetAllocationGranularity", name) == 0) return HIP_API_ID_hipMemGetAllocationGranularity;
+  if (strcmp("hipMemGetAllocationPropertiesFromHandle", name) == 0) return HIP_API_ID_hipMemGetAllocationPropertiesFromHandle;
+  if (strcmp("hipMemGetHandleForAddressRange", name) == 0) return HIP_API_ID_hipMemGetHandleForAddressRange;
+  if (strcmp("hipMemGetInfo", name) == 0) return HIP_API_ID_hipMemGetInfo;
+  if (strcmp("hipMemImportFromShareableHandle", name) == 0) return HIP_API_ID_hipMemImportFromShareableHandle;
+  if (strcmp("hipMemMap", name) == 0) return HIP_API_ID_hipMemMap;
+  if (strcmp("hipMemMapArrayAsync", name) == 0) return HIP_API_ID_hipMemMapArrayAsync;
+  if (strcmp("hipMemPoolCreate", name) == 0) return HIP_API_ID_hipMemPoolCreate;
+  if (strcmp("hipMemPoolDestroy", name) == 0) return HIP_API_ID_hipMemPoolDestroy;
+  if (strcmp("hipMemPoolExportPointer", name) == 0) return HIP_API_ID_hipMemPoolExportPointer;
+  if (strcmp("hipMemPoolExportToShareableHandle", name) == 0) return HIP_API_ID_hipMemPoolExportToShareableHandle;
+  if (strcmp("hipMemPoolGetAccess", name) == 0) return HIP_API_ID_hipMemPoolGetAccess;
+  if (strcmp("hipMemPoolGetAttribute", name) == 0) return HIP_API_ID_hipMemPoolGetAttribute;
+  if (strcmp("hipMemPoolImportFromShareableHandle", name) == 0) return HIP_API_ID_hipMemPoolImportFromShareableHandle;
+  if (strcmp("hipMemPoolImportPointer", name) == 0) return HIP_API_ID_hipMemPoolImportPointer;
+  if (strcmp("hipMemPoolSetAccess", name) == 0) return HIP_API_ID_hipMemPoolSetAccess;
+  if (strcmp("hipMemPoolSetAttribute", name) == 0) return HIP_API_ID_hipMemPoolSetAttribute;
+  if (strcmp("hipMemPoolTrimTo", name) == 0) return HIP_API_ID_hipMemPoolTrimTo;
+  if (strcmp("hipMemPrefetchAsync", name) == 0) return HIP_API_ID_hipMemPrefetchAsync;
+  if (strcmp("hipMemPrefetchAsync_v2", name) == 0) return HIP_API_ID_hipMemPrefetchAsync_v2;
+  if (strcmp("hipMemPtrGetInfo", name) == 0) return HIP_API_ID_hipMemPtrGetInfo;
+  if (strcmp("hipMemRangeGetAttribute", name) == 0) return HIP_API_ID_hipMemRangeGetAttribute;
+  if (strcmp("hipMemRangeGetAttributes", name) == 0) return HIP_API_ID_hipMemRangeGetAttributes;
+  if (strcmp("hipMemRelease", name) == 0) return HIP_API_ID_hipMemRelease;
+  if (strcmp("hipMemRetainAllocationHandle", name) == 0) return HIP_API_ID_hipMemRetainAllocationHandle;
+  if (strcmp("hipMemSetAccess", name) == 0) return HIP_API_ID_hipMemSetAccess;
+  if (strcmp("hipMemUnmap", name) == 0) return HIP_API_ID_hipMemUnmap;
+  if (strcmp("hipMemcpy", name) == 0) return HIP_API_ID_hipMemcpy;
+  if (strcmp("hipMemcpy2D", name) == 0) return HIP_API_ID_hipMemcpy2D;
+  if (strcmp("hipMemcpy2DArrayToArray", name) == 0) return HIP_API_ID_hipMemcpy2DArrayToArray;
+  if (strcmp("hipMemcpy2DAsync", name) == 0) return HIP_API_ID_hipMemcpy2DAsync;
+  if (strcmp("hipMemcpy2DFromArray", name) == 0) return HIP_API_ID_hipMemcpy2DFromArray;
+  if (strcmp("hipMemcpy2DFromArrayAsync", name) == 0) return HIP_API_ID_hipMemcpy2DFromArrayAsync;
+  if (strcmp("hipMemcpy2DToArray", name) == 0) return HIP_API_ID_hipMemcpy2DToArray;
+  if (strcmp("hipMemcpy2DToArrayAsync", name) == 0) return HIP_API_ID_hipMemcpy2DToArrayAsync;
+  if (strcmp("hipMemcpy3D", name) == 0) return HIP_API_ID_hipMemcpy3D;
+  if (strcmp("hipMemcpy3DAsync", name) == 0) return HIP_API_ID_hipMemcpy3DAsync;
+  if (strcmp("hipMemcpy3DBatchAsync", name) == 0) return HIP_API_ID_hipMemcpy3DBatchAsync;
+  if (strcmp("hipMemcpy3DPeer", name) == 0) return HIP_API_ID_hipMemcpy3DPeer;
+  if (strcmp("hipMemcpy3DPeerAsync", name) == 0) return HIP_API_ID_hipMemcpy3DPeerAsync;
+  if (strcmp("hipMemcpyAsync", name) == 0) return HIP_API_ID_hipMemcpyAsync;
+  if (strcmp("hipMemcpyAtoA", name) == 0) return HIP_API_ID_hipMemcpyAtoA;
+  if (strcmp("hipMemcpyAtoD", name) == 0) return HIP_API_ID_hipMemcpyAtoD;
+  if (strcmp("hipMemcpyAtoH", name) == 0) return HIP_API_ID_hipMemcpyAtoH;
+  if (strcmp("hipMemcpyAtoHAsync", name) == 0) return HIP_API_ID_hipMemcpyAtoHAsync;
+  if (strcmp("hipMemcpyBatchAsync", name) == 0) return HIP_API_ID_hipMemcpyBatchAsync;
+  if (strcmp("hipMemcpyDtoA", name) == 0) return HIP_API_ID_hipMemcpyDtoA;
+  if (strcmp("hipMemcpyDtoD", name) == 0) return HIP_API_ID_hipMemcpyDtoD;
+  if (strcmp("hipMemcpyDtoDAsync", name) == 0) return HIP_API_ID_hipMemcpyDtoDAsync;
+  if (strcmp("hipMemcpyDtoH", name) == 0) return HIP_API_ID_hipMemcpyDtoH;
+  if (strcmp("hipMemcpyDtoHAsync", name) == 0) return HIP_API_ID_hipMemcpyDtoHAsync;
+  if (strcmp("hipMemcpyFromArray", name) == 0) return HIP_API_ID_hipMemcpyFromArray;
+  if (strcmp("hipMemcpyFromSymbol", name) == 0) return HIP_API_ID_hipMemcpyFromSymbol;
+  if (strcmp("hipMemcpyFromSymbolAsync", name) == 0) return HIP_API_ID_hipMemcpyFromSymbolAsync;
+  if (strcmp("hipMemcpyHtoA", name) == 0) return HIP_API_ID_hipMemcpyHtoA;
+  if (strcmp("hipMemcpyHtoAAsync", name) == 0) return HIP_API_ID_hipMemcpyHtoAAsync;
+  if (strcmp("hipMemcpyHtoD", name) == 0) return HIP_API_ID_hipMemcpyHtoD;
+  if (strcmp("hipMemcpyHtoDAsync", name) == 0) return HIP_API_ID_hipMemcpyHtoDAsync;
+  if (strcmp("hipMemcpyParam2D", name) == 0) return HIP_API_ID_hipMemcpyParam2D;
+  if (strcmp("hipMemcpyParam2DAsync", name) == 0) return HIP_API_ID_hipMemcpyParam2DAsync;
+  if (strcmp("hipMemcpyPeer", name) == 0) return HIP_API_ID_hipMemcpyPeer;
+  if (strcmp("hipMemcpyPeerAsync", name) == 0) return HIP_API_ID_hipMemcpyPeerAsync;
+  if (strcmp("hipMemcpyToArray", name) == 0) return HIP_API_ID_hipMemcpyToArray;
+  if (strcmp("hipMemcpyToSymbol", name) == 0) return HIP_API_ID_hipMemcpyToSymbol;
+  if (strcmp("hipMemcpyToSymbolAsync", name) == 0) return HIP_API_ID_hipMemcpyToSymbolAsync;
+  if (strcmp("hipMemcpyWithStream", name) == 0) return HIP_API_ID_hipMemcpyWithStream;
+  if (strcmp("hipMemset", name) == 0) return HIP_API_ID_hipMemset;
+  if (strcmp("hipMemset2D", name) == 0) return HIP_API_ID_hipMemset2D;
+  if (strcmp("hipMemset2DAsync", name) == 0) return HIP_API_ID_hipMemset2DAsync;
+  if (strcmp("hipMemset3D", name) == 0) return HIP_API_ID_hipMemset3D;
+  if (strcmp("hipMemset3DAsync", name) == 0) return HIP_API_ID_hipMemset3DAsync;
+  if (strcmp("hipMemsetAsync", name) == 0) return HIP_API_ID_hipMemsetAsync;
+  if (strcmp("hipMemsetD16", name) == 0) return HIP_API_ID_hipMemsetD16;
+  if (strcmp("hipMemsetD16Async", name) == 0) return HIP_API_ID_hipMemsetD16Async;
+  if (strcmp("hipMemsetD2D16", name) == 0) return HIP_API_ID_hipMemsetD2D16;
+  if (strcmp("hipMemsetD2D16Async", name) == 0) return HIP_API_ID_hipMemsetD2D16Async;
+  if (strcmp("hipMemsetD2D32", name) == 0) return HIP_API_ID_hipMemsetD2D32;
+  if (strcmp("hipMemsetD2D32Async", name) == 0) return HIP_API_ID_hipMemsetD2D32Async;
+  if (strcmp("hipMemsetD2D8", name) == 0) return HIP_API_ID_hipMemsetD2D8;
+  if (strcmp("hipMemsetD2D8Async", name) == 0) return HIP_API_ID_hipMemsetD2D8Async;
+  if (strcmp("hipMemsetD32", name) == 0) return HIP_API_ID_hipMemsetD32;
+  if (strcmp("hipMemsetD32Async", name) == 0) return HIP_API_ID_hipMemsetD32Async;
+  if (strcmp("hipMemsetD8", name) == 0) return HIP_API_ID_hipMemsetD8;
+  if (strcmp("hipMemsetD8Async", name) == 0) return HIP_API_ID_hipMemsetD8Async;
+  if (strcmp("hipMipmappedArrayCreate", name) == 0) return HIP_API_ID_hipMipmappedArrayCreate;
+  if (strcmp("hipMipmappedArrayDestroy", name) == 0) return HIP_API_ID_hipMipmappedArrayDestroy;
+  if (strcmp("hipMipmappedArrayGetLevel", name) == 0) return HIP_API_ID_hipMipmappedArrayGetLevel;
+  if (strcmp("hipModuleGetFunction", name) == 0) return HIP_API_ID_hipModuleGetFunction;
+  if (strcmp("hipModuleGetFunctionCount", name) == 0) return HIP_API_ID_hipModuleGetFunctionCount;
+  if (strcmp("hipModuleGetGlobal", name) == 0) return HIP_API_ID_hipModuleGetGlobal;
+  if (strcmp("hipModuleGetTexRef", name) == 0) return HIP_API_ID_hipModuleGetTexRef;
+  if (strcmp("hipModuleLaunchCooperativeKernel", name) == 0) return HIP_API_ID_hipModuleLaunchCooperativeKernel;
+  if (strcmp("hipModuleLaunchCooperativeKernelMultiDevice", name) == 0) return HIP_API_ID_hipModuleLaunchCooperativeKernelMultiDevice;
+  if (strcmp("hipModuleLaunchKernel", name) == 0) return HIP_API_ID_hipModuleLaunchKernel;
+  if (strcmp("hipModuleLoad", name) == 0) return HIP_API_ID_hipModuleLoad;
+  if (strcmp("hipModuleLoadData", name) == 0) return HIP_API_ID_hipModuleLoadData;
+  if (strcmp("hipModuleLoadDataEx", name) == 0) return HIP_API_ID_hipModuleLoadDataEx;
+  if (strcmp("hipModuleLoadFatBinary", name) == 0) return HIP_API_ID_hipModuleLoadFatBinary;
+  if (strcmp("hipModuleOccupancyMaxActiveBlocksPerMultiprocessor", name) == 0) return HIP_API_ID_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor;
+  if (strcmp("hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", name) == 0) return HIP_API_ID_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags;
+  if (strcmp("hipModuleOccupancyMaxPotentialBlockSize", name) == 0) return HIP_API_ID_hipModuleOccupancyMaxPotentialBlockSize;
+  if (strcmp("hipModuleOccupancyMaxPotentialBlockSizeWithFlags", name) == 0) return HIP_API_ID_hipModuleOccupancyMaxPotentialBlockSizeWithFlags;
+  if (strcmp("hipModuleUnload", name) == 0) return HIP_API_ID_hipModuleUnload;
+  if (strcmp("hipOccupancyMaxActiveBlocksPerMultiprocessor", name) == 0) return HIP_API_ID_hipOccupancyMaxActiveBlocksPerMultiprocessor;
+  if (strcmp("hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", name) == 0) return HIP_API_ID_hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags;
+  if (strcmp("hipOccupancyMaxPotentialBlockSize", name) == 0) return HIP_API_ID_hipOccupancyMaxPotentialBlockSize;
+  if (strcmp("hipPeekAtLastError", name) == 0) return HIP_API_ID_hipPeekAtLastError;
+  if (strcmp("hipPointerGetAttribute", name) == 0) return HIP_API_ID_hipPointerGetAttribute;
+  if (strcmp("hipPointerGetAttributes", name) == 0) return HIP_API_ID_hipPointerGetAttributes;
+  if (strcmp("hipPointerSetAttribute", name) == 0) return HIP_API_ID_hipPointerSetAttribute;
+  if (strcmp("hipProfilerStart", name) == 0) return HIP_API_ID_hipProfilerStart;
+  if (strcmp("hipProfilerStop", name) == 0) return HIP_API_ID_hipProfilerStop;
+  if (strcmp("hipRuntimeGetVersion", name) == 0) return HIP_API_ID_hipRuntimeGetVersion;
+  if (strcmp("hipSetDevice", name) == 0) return HIP_API_ID_hipSetDevice;
+  if (strcmp("hipSetDeviceFlags", name) == 0) return HIP_API_ID_hipSetDeviceFlags;
+  if (strcmp("hipSetValidDevices", name) == 0) return HIP_API_ID_hipSetValidDevices;
+  if (strcmp("hipSetupArgument", name) == 0) return HIP_API_ID_hipSetupArgument;
+  if (strcmp("hipSignalExternalSemaphoresAsync", name) == 0) return HIP_API_ID_hipSignalExternalSemaphoresAsync;
+  if (strcmp("hipStreamAddCallback", name) == 0) return HIP_API_ID_hipStreamAddCallback;
+  if (strcmp("hipStreamAttachMemAsync", name) == 0) return HIP_API_ID_hipStreamAttachMemAsync;
+  if (strcmp("hipStreamBatchMemOp", name) == 0) return HIP_API_ID_hipStreamBatchMemOp;
+  if (strcmp("hipStreamBeginCapture", name) == 0) return HIP_API_ID_hipStreamBeginCapture;
+  if (strcmp("hipStreamBeginCaptureToGraph", name) == 0) return HIP_API_ID_hipStreamBeginCaptureToGraph;
+  if (strcmp("hipStreamCreate", name) == 0) return HIP_API_ID_hipStreamCreate;
+  if (strcmp("hipStreamCreateWithFlags", name) == 0) return HIP_API_ID_hipStreamCreateWithFlags;
+  if (strcmp("hipStreamCreateWithPriority", name) == 0) return HIP_API_ID_hipStreamCreateWithPriority;
+  if (strcmp("hipStreamDestroy", name) == 0) return HIP_API_ID_hipStreamDestroy;
+  if (strcmp("hipStreamEndCapture", name) == 0) return HIP_API_ID_hipStreamEndCapture;
+  if (strcmp("hipStreamGetAttribute", name) == 0) return HIP_API_ID_hipStreamGetAttribute;
+  if (strcmp("hipStreamGetCaptureInfo", name) == 0) return HIP_API_ID_hipStreamGetCaptureInfo;
+  if (strcmp("hipStreamGetCaptureInfo_v2", name) == 0) return HIP_API_ID_hipStreamGetCaptureInfo_v2;
+  if (strcmp("hipStreamGetDevice", name) == 0) return HIP_API_ID_hipStreamGetDevice;
+  if (strcmp("hipStreamGetFlags", name) == 0) return HIP_API_ID_hipStreamGetFlags;
+  if (strcmp("hipStreamGetId", name) == 0) return HIP_API_ID_hipStreamGetId;
+  if (strcmp("hipStreamGetPriority", name) == 0) return HIP_API_ID_hipStreamGetPriority;
+  if (strcmp("hipStreamIsCapturing", name) == 0) return HIP_API_ID_hipStreamIsCapturing;
+  if (strcmp("hipStreamQuery", name) == 0) return HIP_API_ID_hipStreamQuery;
+  if (strcmp("hipStreamSetAttribute", name) == 0) return HIP_API_ID_hipStreamSetAttribute;
+  if (strcmp("hipStreamSynchronize", name) == 0) return HIP_API_ID_hipStreamSynchronize;
+  if (strcmp("hipStreamUpdateCaptureDependencies", name) == 0) return HIP_API_ID_hipStreamUpdateCaptureDependencies;
+  if (strcmp("hipStreamWaitEvent", name) == 0) return HIP_API_ID_hipStreamWaitEvent;
+  if (strcmp("hipStreamWaitValue32", name) == 0) return HIP_API_ID_hipStreamWaitValue32;
+  if (strcmp("hipStreamWaitValue64", name) == 0) return HIP_API_ID_hipStreamWaitValue64;
+  if (strcmp("hipStreamWriteValue32", name) == 0) return HIP_API_ID_hipStreamWriteValue32;
+  if (strcmp("hipStreamWriteValue64", name) == 0) return HIP_API_ID_hipStreamWriteValue64;
+  if (strcmp("hipTexRefGetAddress", name) == 0) return HIP_API_ID_hipTexRefGetAddress;
+  if (strcmp("hipTexRefGetArray", name) == 0) return HIP_API_ID_hipTexRefGetArray;
+  if (strcmp("hipTexRefGetBorderColor", name) == 0) return HIP_API_ID_hipTexRefGetBorderColor;
+  if (strcmp("hipTexRefGetFlags", name) == 0) return HIP_API_ID_hipTexRefGetFlags;
+  if (strcmp("hipTexRefGetFormat", name) == 0) return HIP_API_ID_hipTexRefGetFormat;
+  if (strcmp("hipTexRefGetMaxAnisotropy", name) == 0) return HIP_API_ID_hipTexRefGetMaxAnisotropy;
+  if (strcmp("hipTexRefGetMipMappedArray", name) == 0) return HIP_API_ID_hipTexRefGetMipMappedArray;
+  if (strcmp("hipTexRefGetMipmapLevelBias", name) == 0) return HIP_API_ID_hipTexRefGetMipmapLevelBias;
+  if (strcmp("hipTexRefGetMipmapLevelClamp", name) == 0) return HIP_API_ID_hipTexRefGetMipmapLevelClamp;
+  if (strcmp("hipTexRefSetAddress", name) == 0) return HIP_API_ID_hipTexRefSetAddress;
+  if (strcmp("hipTexRefSetAddress2D", name) == 0) return HIP_API_ID_hipTexRefSetAddress2D;
+  if (strcmp("hipTexRefSetArray", name) == 0) return HIP_API_ID_hipTexRefSetArray;
+  if (strcmp("hipTexRefSetBorderColor", name) == 0) return HIP_API_ID_hipTexRefSetBorderColor;
+  if (strcmp("hipTexRefSetFlags", name) == 0) return HIP_API_ID_hipTexRefSetFlags;
+  if (strcmp("hipTexRefSetFormat", name) == 0) return HIP_API_ID_hipTexRefSetFormat;
+  if (strcmp("hipTexRefSetMaxAnisotropy", name) == 0) return HIP_API_ID_hipTexRefSetMaxAnisotropy;
+  if (strcmp("hipTexRefSetMipmapLevelBias", name) == 0) return HIP_API_ID_hipTexRefSetMipmapLevelBias;
+  if (strcmp("hipTexRefSetMipmapLevelClamp", name) == 0) return HIP_API_ID_hipTexRefSetMipmapLevelClamp;
+  if (strcmp("hipTexRefSetMipmappedArray", name) == 0) return HIP_API_ID_hipTexRefSetMipmappedArray;
+  if (strcmp("hipThreadExchangeStreamCaptureMode", name) == 0) return HIP_API_ID_hipThreadExchangeStreamCaptureMode;
+  if (strcmp("hipUserObjectCreate", name) == 0) return HIP_API_ID_hipUserObjectCreate;
+  if (strcmp("hipUserObjectRelease", name) == 0) return HIP_API_ID_hipUserObjectRelease;
+  if (strcmp("hipUserObjectRetain", name) == 0) return HIP_API_ID_hipUserObjectRetain;
+  if (strcmp("hipWaitExternalSemaphoresAsync", name) == 0) return HIP_API_ID_hipWaitExternalSemaphoresAsync;
+  return HIP_API_ID_NONE;
+}
+
+// HIP API callback data structure (one union member per HIP API function)
+typedef struct hip_api_data_s {
+  uint64_t correlation_id;
+  uint32_t phase;
+  union {
+    struct {
+      dim3* gridDim;
+      dim3 gridDim__val;
+      dim3* blockDim;
+      dim3 blockDim__val;
+      size_t* sharedMem;
+      size_t sharedMem__val;
+      hipStream_t* stream;
+      hipStream_t stream__val;
+    } __hipPopCallConfiguration;
+    struct {
+      dim3 gridDim;
+      dim3 blockDim;
+      size_t sharedMem;
+      hipStream_t stream;
+    } __hipPushCallConfiguration;
+    struct {
+      hipArray_t* array;
+      hipArray_t array__val;
+      const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray;
+      HIP_ARRAY3D_DESCRIPTOR pAllocateArray__val;
+    } hipArray3DCreate;
+    struct {
+      HIP_ARRAY3D_DESCRIPTOR* pArrayDescriptor;
+      HIP_ARRAY3D_DESCRIPTOR pArrayDescriptor__val;
+      hipArray_t array;
+    } hipArray3DGetDescriptor;
+    struct {
+      hipArray_t* pHandle;
+      hipArray_t pHandle__val;
+      const HIP_ARRAY_DESCRIPTOR* pAllocateArray;
+      HIP_ARRAY_DESCRIPTOR pAllocateArray__val;
+    } hipArrayCreate;
+    struct {
+      hipArray_t array;
+    } hipArrayDestroy;
+    struct {
+      HIP_ARRAY_DESCRIPTOR* pArrayDescriptor;
+      HIP_ARRAY_DESCRIPTOR pArrayDescriptor__val;
+      hipArray_t array;
+    } hipArrayGetDescriptor;
+    struct {
+      hipChannelFormatDesc* desc;
+      hipChannelFormatDesc desc__val;
+      hipExtent* extent;
+      hipExtent extent__val;
+      unsigned int* flags;
+      unsigned int flags__val;
+      hipArray_t array;
+    } hipArrayGetInfo;
+    struct {
+      int* device;
+      int device__val;
+      const hipDeviceProp_tR0000* prop;
+      hipDeviceProp_tR0000 prop__val;
+    } hipChooseDeviceR0000;
+    struct {
+      int* device;
+      int device__val;
+      const hipDeviceProp_tR0600* prop;
+      hipDeviceProp_tR0600 prop__val;
+    } hipChooseDeviceR0600;
+    struct {
+      dim3 gridDim;
+      dim3 blockDim;
+      size_t sharedMem;
+      hipStream_t stream;
+    } hipConfigureCall;
+    struct {
+      hipSurfaceObject_t* pSurfObject;
+      hipSurfaceObject_t pSurfObject__val;
+      const hipResourceDesc* pResDesc;
+      hipResourceDesc pResDesc__val;
+    } hipCreateSurfaceObject;
+    struct {
+      hipCtx_t* ctx;
+      hipCtx_t ctx__val;
+      unsigned int flags;
+      hipDevice_t device;
+    } hipCtxCreate;
+    struct {
+      hipCtx_t ctx;
+    } hipCtxDestroy;
+    struct {
+      hipCtx_t peerCtx;
+    } hipCtxDisablePeerAccess;
+    struct {
+      hipCtx_t peerCtx;
+      unsigned int flags;
+    } hipCtxEnablePeerAccess;
+    struct {
+      hipCtx_t ctx;
+      unsigned int* apiVersion;
+      unsigned int apiVersion__val;
+    } hipCtxGetApiVersion;
+    struct {
+      hipFuncCache_t* cacheConfig;
+      hipFuncCache_t cacheConfig__val;
+    } hipCtxGetCacheConfig;
+    struct {
+      hipCtx_t* ctx;
+      hipCtx_t ctx__val;
+    } hipCtxGetCurrent;
+    struct {
+      hipDevice_t* device;
+      hipDevice_t device__val;
+    } hipCtxGetDevice;
+    struct {
+      unsigned int* flags;
+      unsigned int flags__val;
+    } hipCtxGetFlags;
+    struct {
+      hipSharedMemConfig* pConfig;
+      hipSharedMemConfig pConfig__val;
+    } hipCtxGetSharedMemConfig;
+    struct {
+      hipCtx_t* ctx;
+      hipCtx_t ctx__val;
+    } hipCtxPopCurrent;
+    struct {
+      hipCtx_t ctx;
+    } hipCtxPushCurrent;
+    struct {
+      hipFuncCache_t cacheConfig;
+    } hipCtxSetCacheConfig;
+    struct {
+      hipCtx_t ctx;
+    } hipCtxSetCurrent;
+    struct {
+      hipSharedMemConfig config;
+    } hipCtxSetSharedMemConfig;
+    struct {
+      hipExternalMemory_t extMem;
+    } hipDestroyExternalMemory;
+    struct {
+      hipExternalSemaphore_t extSem;
+    } hipDestroyExternalSemaphore;
+    struct {
+      hipSurfaceObject_t surfaceObject;
+    } hipDestroySurfaceObject;
+    struct {
+      int* canAccessPeer;
+      int canAccessPeer__val;
+      int deviceId;
+      int peerDeviceId;
+    } hipDeviceCanAccessPeer;
+    struct {
+      int* major;
+      int major__val;
+      int* minor;
+      int minor__val;
+      hipDevice_t device;
+    } hipDeviceComputeCapability;
+    struct {
+      int peerDeviceId;
+    } hipDeviceDisablePeerAccess;
+    struct {
+      int peerDeviceId;
+      unsigned int flags;
+    } hipDeviceEnablePeerAccess;
+    struct {
+      hipDevice_t* device;
+      hipDevice_t device__val;
+      int ordinal;
+    } hipDeviceGet;
+    struct {
+      int* pi;
+      int pi__val;
+      hipDeviceAttribute_t attr;
+      int deviceId;
+    } hipDeviceGetAttribute;
+    struct {
+      int* device;
+      int device__val;
+      const char* pciBusId;
+      char pciBusId__val;
+    } hipDeviceGetByPCIBusId;
+    struct {
+      hipFuncCache_t* cacheConfig;
+      hipFuncCache_t cacheConfig__val;
+    } hipDeviceGetCacheConfig;
+    struct {
+      hipMemPool_t* mem_pool;
+      hipMemPool_t mem_pool__val;
+      int device;
+    } hipDeviceGetDefaultMemPool;
+    struct {
+      int device;
+      hipGraphMemAttributeType attr;
+      void* value;
+    } hipDeviceGetGraphMemAttribute;
+    struct {
+      size_t* pValue;
+      size_t pValue__val;
+      enum hipLimit_t limit;
+    } hipDeviceGetLimit;
+    struct {
+      hipMemPool_t* mem_pool;
+      hipMemPool_t mem_pool__val;
+      int device;
+    } hipDeviceGetMemPool;
+    struct {
+      char* name;
+      char name__val;
+      int len;
+      hipDevice_t device;
+    } hipDeviceGetName;
+    struct {
+      int* value;
+      int value__val;
+      hipDeviceP2PAttr attr;
+      int srcDevice;
+      int dstDevice;
+    } hipDeviceGetP2PAttribute;
+    struct {
+      char* pciBusId;
+      char pciBusId__val;
+      int len;
+      int device;
+    } hipDeviceGetPCIBusId;
+    struct {
+      hipSharedMemConfig* pConfig;
+      hipSharedMemConfig pConfig__val;
+    } hipDeviceGetSharedMemConfig;
+    struct {
+      int* leastPriority;
+      int leastPriority__val;
+      int* greatestPriority;
+      int greatestPriority__val;
+    } hipDeviceGetStreamPriorityRange;
+    struct {
+      hipUUID* uuid;
+      hipUUID uuid__val;
+      hipDevice_t device;
+    } hipDeviceGetUuid;
+    struct {
+      int device;
+    } hipDeviceGraphMemTrim;
+    struct {
+      hipDevice_t dev;
+      unsigned int* flags;
+      unsigned int flags__val;
+      int* active;
+      int active__val;
+    } hipDevicePrimaryCtxGetState;
+    struct {
+      hipDevice_t dev;
+    } hipDevicePrimaryCtxRelease;
+    struct {
+      hipDevice_t dev;
+    } hipDevicePrimaryCtxReset;
+    struct {
+      hipCtx_t* pctx;
+      hipCtx_t pctx__val;
+      hipDevice_t dev;
+    } hipDevicePrimaryCtxRetain;
+    struct {
+      hipDevice_t dev;
+      unsigned int flags;
+    } hipDevicePrimaryCtxSetFlags;
+    struct {
+      hipFuncCache_t cacheConfig;
+    } hipDeviceSetCacheConfig;
+    struct {
+      int device;
+      hipGraphMemAttributeType attr;
+      void* value;
+    } hipDeviceSetGraphMemAttribute;
+    struct {
+      enum hipLimit_t limit;
+      size_t value;
+    } hipDeviceSetLimit;
+    struct {
+      int device;
+      hipMemPool_t mem_pool;
+    } hipDeviceSetMemPool;
+    struct {
+      hipSharedMemConfig config;
+    } hipDeviceSetSharedMemConfig;
+    struct {
+      size_t* bytes;
+      size_t bytes__val;
+      hipDevice_t device;
+    } hipDeviceTotalMem;
+    struct {
+      int* driverVersion;
+      int driverVersion__val;
+    } hipDriverGetVersion;
+    struct {
+      hipGraphNode_t* phGraphNode;
+      hipGraphNode_t phGraphNode__val;
+      hipGraph_t hGraph;
+      const hipGraphNode_t* dependencies;
+      hipGraphNode_t dependencies__val;
+      size_t numDependencies;
+      hipDeviceptr_t dptr;
+    } hipDrvGraphAddMemFreeNode;
+    struct {
+      hipGraphNode_t* phGraphNode;
+      hipGraphNode_t phGraphNode__val;
+      hipGraph_t hGraph;
+      const hipGraphNode_t* dependencies;
+      hipGraphNode_t dependencies__val;
+      size_t numDependencies;
+      const HIP_MEMCPY3D* copyParams;
+      HIP_MEMCPY3D copyParams__val;
+      hipCtx_t ctx;
+    } hipDrvGraphAddMemcpyNode;
+    struct {
+      hipGraphNode_t* phGraphNode;
+      hipGraphNode_t phGraphNode__val;
+      hipGraph_t hGraph;
+      const hipGraphNode_t* dependencies;
+      hipGraphNode_t dependencies__val;
+      size_t numDependencies;
+      const hipMemsetParams* memsetParams;
+      hipMemsetParams memsetParams__val;
+      hipCtx_t ctx;
+    } hipDrvGraphAddMemsetNode;
+    struct {
+      hipGraphExec_t hGraphExec;
+      hipGraphNode_t hNode;
+      const HIP_MEMCPY3D* copyParams;
+      HIP_MEMCPY3D copyParams__val;
+      hipCtx_t ctx;
+    } hipDrvGraphExecMemcpyNodeSetParams;
+    struct {
+      hipGraphExec_t hGraphExec;
+      hipGraphNode_t hNode;
+      const hipMemsetParams* memsetParams;
+      hipMemsetParams memsetParams__val;
+      hipCtx_t ctx;
+    } hipDrvGraphExecMemsetNodeSetParams;
+    struct {
+      hipGraphNode_t hNode;
+      HIP_MEMCPY3D* nodeParams;
+      HIP_MEMCPY3D nodeParams__val;
+    } hipDrvGraphMemcpyNodeGetParams;
+    struct {
+      hipGraphNode_t hNode;
+      const HIP_MEMCPY3D* nodeParams;
+      HIP_MEMCPY3D nodeParams__val;
+    } hipDrvGraphMemcpyNodeSetParams;
+    struct {
+      const HIP_LAUNCH_CONFIG* config;
+      HIP_LAUNCH_CONFIG config__val;
+      hipFunction_t f;
+      void** params;
+      void* params__val;
+      void** extra;
+      void* extra__val;
+    } hipDrvLaunchKernelEx;
+    struct {
+      const hip_Memcpy2D* pCopy;
+      hip_Memcpy2D pCopy__val;
+    } hipDrvMemcpy2DUnaligned;
+    struct {
+      const HIP_MEMCPY3D* pCopy;
+      HIP_MEMCPY3D pCopy__val;
+    } hipDrvMemcpy3D;
+    struct {
+      const HIP_MEMCPY3D* pCopy;
+      HIP_MEMCPY3D pCopy__val;
+      hipStream_t stream;
+    } hipDrvMemcpy3DAsync;
+    struct {
+      unsigned int numAttributes;
+      hipPointer_attribute* attributes;
+      hipPointer_attribute attributes__val;
+      void** data;
+      void* data__val;
+      hipDeviceptr_t ptr;
+    } hipDrvPointerGetAttributes;
+    struct {
+      hipEvent_t* event;
+      hipEvent_t event__val;
+    } hipEventCreate;
+    struct {
+      hipEvent_t* event;
+      hipEvent_t event__val;
+      unsigned int flags;
+    } hipEventCreateWithFlags;
+    struct {
+      hipEvent_t event;
+    } hipEventDestroy;
+    struct {
+      float* ms;
+      float ms__val;
+      hipEvent_t start;
+      hipEvent_t stop;
+    } hipEventElapsedTime;
+    struct {
+      hipEvent_t event;
+    } hipEventQuery;
+    struct {
+      hipEvent_t event;
+      hipStream_t stream;
+    } hipEventRecord;
+    struct {
+      hipEvent_t event;
+      hipStream_t stream;
+      unsigned int flags;
+    } hipEventRecordWithFlags;
+    struct {
+      hipEvent_t event;
+    } hipEventSynchronize;
+    struct {
+      int device1;
+      int device2;
+      unsigned int* linktype;
+      unsigned int linktype__val;
+      unsigned int* hopcount;
+      unsigned int hopcount__val;
+    } hipExtGetLinkTypeAndHopCount;
+    struct {
+      const void* function_address;
+      dim3 numBlocks;
+      dim3 dimBlocks;
+      void** args;
+      void* args__val;
+      size_t sharedMemBytes;
+      hipStream_t stream;
+      hipEvent_t startEvent;
+      hipEvent_t stopEvent;
+      int flags;
+    } hipExtLaunchKernel;
+    struct {
+      hipLaunchParams* launchParamsList;
+      hipLaunchParams launchParamsList__val;
+      int numDevices;
+      unsigned int flags;
+    } hipExtLaunchMultiKernelMultiDevice;
+    struct {
+      void** ptr;
+      void* ptr__val;
+      size_t sizeBytes;
+      unsigned int flags;
+    } hipExtMallocWithFlags;
+    struct {
+      hipFunction_t f;
+      unsigned int globalWorkSizeX;
+      unsigned int globalWorkSizeY;
+      unsigned int globalWorkSizeZ;
+      unsigned int localWorkSizeX;
+      unsigned int localWorkSizeY;
+      unsigned int localWorkSizeZ;
+      size_t sharedMemBytes;
+      hipStream_t hStream;
+      void** kernelParams;
+      void* kernelParams__val;
+      void** extra;
+      void* extra__val;
+      hipEvent_t startEvent;
+      hipEvent_t stopEvent;
+      unsigned int flags;
+    } hipExtModuleLaunchKernel;
+    struct {
+      hipStream_t* stream;
+      hipStream_t stream__val;
+      unsigned int cuMaskSize;
+      const unsigned int* cuMask;
+      unsigned int cuMask__val;
+    } hipExtStreamCreateWithCUMask;
+    struct {
+      hipStream_t stream;
+      unsigned int cuMaskSize;
+      unsigned int* cuMask;
+      unsigned int cuMask__val;
+    } hipExtStreamGetCUMask;
+    struct {
+      void** devPtr;
+      void* devPtr__val;
+      hipExternalMemory_t extMem;
+      const hipExternalMemoryBufferDesc* bufferDesc;
+      hipExternalMemoryBufferDesc bufferDesc__val;
+    } hipExternalMemoryGetMappedBuffer;
+    struct {
+      hipMipmappedArray_t* mipmap;
+      hipMipmappedArray_t mipmap__val;
+      hipExternalMemory_t extMem;
+      const hipExternalMemoryMipmappedArrayDesc* mipmapDesc;
+      hipExternalMemoryMipmappedArrayDesc mipmapDesc__val;
+    } hipExternalMemoryGetMappedMipmappedArray;
+    struct {
+      void* ptr;
+    } hipFree;
+    struct {
+      hipArray_t array;
+    } hipFreeArray;
+    struct {
+      void* dev_ptr;
+      hipStream_t stream;
+    } hipFreeAsync;
+    struct {
+      void* ptr;
+    } hipFreeHost;
+    struct {
+      hipMipmappedArray_t mipmappedArray;
+    } hipFreeMipmappedArray;
+    struct {
+      int* value;
+      int value__val;
+      hipFunction_attribute attrib;
+      hipFunction_t hfunc;
+    } hipFuncGetAttribute;
+    struct {
+      hipFuncAttributes* attr;
+      hipFuncAttributes attr__val;
+      const void* func;
+    } hipFuncGetAttributes;
+    struct {
+      const void* func;
+      hipFuncAttribute attr;
+      int value;
+    } hipFuncSetAttribute;
+    struct {
+      const void* func;
+      hipFuncCache_t config;
+    } hipFuncSetCacheConfig;
+    struct {
+      const void* func;
+      hipSharedMemConfig config;
+    } hipFuncSetSharedMemConfig;
+    struct {
+      unsigned int* pHipDeviceCount;
+      unsigned int pHipDeviceCount__val;
+      int* pHipDevices;
+      int pHipDevices__val;
+      unsigned int hipDeviceCount;
+      hipGLDeviceList deviceList;
+    } hipGLGetDevices;
+    struct {
+      hipChannelFormatDesc* desc;
+      hipChannelFormatDesc desc__val;
+      hipArray_const_t array;
+    } hipGetChannelDesc;
+    struct {
+      int* deviceId;
+      int deviceId__val;
+    } hipGetDevice;
+    struct {
+      int* count;
+      int count__val;
+    } hipGetDeviceCount;
+    struct {
+      unsigned int* flags;
+      unsigned int flags__val;
+    } hipGetDeviceFlags;
+    struct {
+      hipDeviceProp_tR0000* prop;
+      hipDeviceProp_tR0000 prop__val;
+      int device;
+    } hipGetDevicePropertiesR0000;
+    struct {
+      hipDeviceProp_tR0600* prop;
+      hipDeviceProp_tR0600 prop__val;
+      int deviceId;
+    } hipGetDevicePropertiesR0600;
+    struct {
+      const char* symbol;
+      char symbol__val;
+      void** funcPtr;
+      void* funcPtr__val;
+      unsigned long long flags;
+      hipDriverEntryPointQueryResult* driverStatus;
+      hipDriverEntryPointQueryResult driverStatus__val;
+    } hipGetDriverEntryPoint;
+    struct {
+      hipFunction_t* functionPtr;
+      hipFunction_t functionPtr__val;
+      const void* symbolPtr;
+    } hipGetFuncBySymbol;
+    struct {
+      hipArray_t* levelArray;
+      hipArray_t levelArray__val;
+      hipMipmappedArray_const_t mipmappedArray;
+      unsigned int level;
+    } hipGetMipmappedArrayLevel;
+    struct {
+      const char* symbol;
+      char symbol__val;
+      void** pfn;
+      void* pfn__val;
+      int hipVersion;
+      uint64_t flags;
+      hipDriverProcAddressQueryResult* symbolStatus;
+      hipDriverProcAddressQueryResult symbolStatus__val;
+    } hipGetProcAddress;
+    struct {
+      void** devPtr;
+      void* devPtr__val;
+      const void* symbol;
+    } hipGetSymbolAddress;
+    struct {
+      size_t* size;
+      size_t size__val;
+      const void* symbol;
+    } hipGetSymbolSize;
+    struct {
+      hipGraphNode_t* phGraphNode;
+      hipGraphNode_t phGraphNode__val;
+      hipGraph_t hGraph;
+      const hipGraphNode_t* dependencies;
+      hipGraphNode_t dependencies__val;
+      size_t numDependencies;
+      const hipBatchMemOpNodeParams* nodeParams;
+      hipBatchMemOpNodeParams nodeParams__val;
+    } hipGraphAddBatchMemOpNode;
+    struct {
+      hipGraphNode_t* pGraphNode;
+      hipGraphNode_t pGraphNode__val;
+      hipGraph_t graph;
+      const hipGraphNode_t* pDependencies;
+      hipGraphNode_t pDependencies__val;
+      size_t numDependencies;
+      hipGraph_t childGraph;
+    } hipGraphAddChildGraphNode;
+    struct {
+      hipGraph_t graph;
+      const hipGraphNode_t* from;
+      hipGraphNode_t from__val;
+      const hipGraphNode_t* to;
+      hipGraphNode_t to__val;
+      size_t numDependencies;
+    } hipGraphAddDependencies;
+    struct {
+      hipGraphNode_t* pGraphNode;
+      hipGraphNode_t pGraphNode__val;
+      hipGraph_t graph;
+      const hipGraphNode_t* pDependencies;
+      hipGraphNode_t pDependencies__val;
+      size_t numDependencies;
+    } hipGraphAddEmptyNode;
+    struct {
+      hipGraphNode_t* pGraphNode;
+      hipGraphNode_t pGraphNode__val;
+      hipGraph_t graph;
+      const hipGraphNode_t* pDependencies;
+      hipGraphNode_t pDependencies__val;
+      size_t numDependencies;
+      hipEvent_t event;
+    } hipGraphAddEventRecordNode;
+    struct {
+      hipGraphNode_t* pGraphNode;
+      hipGraphNode_t pGraphNode__val;
+      hipGraph_t graph;
+      const hipGraphNode_t* pDependencies;
+      hipGraphNode_t pDependencies__val;
+      size_t numDependencies;
+      hipEvent_t event;
+    } hipGraphAddEventWaitNode;
+    struct {
+      hipGraphNode_t* pGraphNode;
+      hipGraphNode_t pGraphNode__val;
+      hipGraph_t graph;
+      const hipGraphNode_t* pDependencies;
+      hipGraphNode_t pDependencies__val;
+      size_t numDependencies;
+      const hipExternalSemaphoreSignalNodeParams* nodeParams;
+      hipExternalSemaphoreSignalNodeParams nodeParams__val;
+    } hipGraphAddExternalSemaphoresSignalNode;
+    struct {
+      hipGraphNode_t* pGraphNode;
+      hipGraphNode_t pGraphNode__val;
+      hipGraph_t graph;
+      const hipGraphNode_t* pDependencies;
+      hipGraphNode_t pDependencies__val;
+      size_t numDependencies;
+      const hipExternalSemaphoreWaitNodeParams* nodeParams;
+      hipExternalSemaphoreWaitNodeParams nodeParams__val;
+    } hipGraphAddExternalSemaphoresWaitNode;
+    struct {
+      hipGraphNode_t* pGraphNode;
+      hipGraphNode_t pGraphNode__val;
+      hipGraph_t graph;
+      const hipGraphNode_t* pDependencies;
+      hipGraphNode_t pDependencies__val;
+      size_t numDependencies;
+      const hipHostNodeParams* pNodeParams;
+      hipHostNodeParams pNodeParams__val;
+    } hipGraphAddHostNode;
+    struct {
+      hipGraphNode_t* pGraphNode;
+      hipGraphNode_t pGraphNode__val;
+      hipGraph_t graph;
+      const hipGraphNode_t* pDependencies;
+      hipGraphNode_t pDependencies__val;
+      size_t numDependencies;
+      const hipKernelNodeParams* pNodeParams;
+      hipKernelNodeParams pNodeParams__val;
+    } hipGraphAddKernelNode;
+    struct {
+      hipGraphNode_t* pGraphNode;
+      hipGraphNode_t pGraphNode__val;
+      hipGraph_t graph;
+      const hipGraphNode_t* pDependencies;
+      hipGraphNode_t pDependencies__val;
+      size_t numDependencies;
+      hipMemAllocNodeParams* pNodeParams;
+      hipMemAllocNodeParams pNodeParams__val;
+    } hipGraphAddMemAllocNode;
+    struct {
+      hipGraphNode_t* pGraphNode;
+      hipGraphNode_t pGraphNode__val;
+      hipGraph_t graph;
+      const hipGraphNode_t* pDependencies;
+      hipGraphNode_t pDependencies__val;
+      size_t numDependencies;
+      void* dev_ptr;
+    } hipGraphAddMemFreeNode;
+    struct {
+      hipGraphNode_t* pGraphNode;
+      hipGraphNode_t pGraphNode__val;
+      hipGraph_t graph;
+      const hipGraphNode_t* pDependencies;
+      hipGraphNode_t pDependencies__val;
+      size_t numDependencies;
+      const hipMemcpy3DParms* pCopyParams;
+      hipMemcpy3DParms pCopyParams__val;
+    } hipGraphAddMemcpyNode;
+    struct {
+      hipGraphNode_t* pGraphNode;
+      hipGraphNode_t pGraphNode__val;
+      hipGraph_t graph;
+      const hipGraphNode_t* pDependencies;
+      hipGraphNode_t pDependencies__val;
+      size_t numDependencies;
+      void* dst;
+      const void* src;
+      size_t count;
+      hipMemcpyKind kind;
+    } hipGraphAddMemcpyNode1D;
+    struct {
+      hipGraphNode_t* pGraphNode;
+      hipGraphNode_t pGraphNode__val;
+      hipGraph_t graph;
+      const hipGraphNode_t* pDependencies;
+      hipGraphNode_t pDependencies__val;
+      size_t numDependencies;
+      void* dst;
+      const void* symbol;
+      size_t count;
+      size_t offset;
+      hipMemcpyKind kind;
+    } hipGraphAddMemcpyNodeFromSymbol;
+    struct {
+      hipGraphNode_t* pGraphNode;
+      hipGraphNode_t pGraphNode__val;
+      hipGraph_t graph;
+      const hipGraphNode_t* pDependencies;
+      hipGraphNode_t pDependencies__val;
+      size_t numDependencies;
+      const void* symbol;
+      const void* src;
+      size_t count;
+      size_t offset;
+      hipMemcpyKind kind;
+    } hipGraphAddMemcpyNodeToSymbol;
+    struct {
+      hipGraphNode_t* pGraphNode;
+      hipGraphNode_t pGraphNode__val;
+      hipGraph_t graph;
+      const hipGraphNode_t* pDependencies;
+      hipGraphNode_t pDependencies__val;
+      size_t numDependencies;
+      const hipMemsetParams* pMemsetParams;
+      hipMemsetParams pMemsetParams__val;
+    } hipGraphAddMemsetNode;
+    struct {
+      hipGraphNode_t* pGraphNode;
+      hipGraphNode_t pGraphNode__val;
+      hipGraph_t graph;
+      const hipGraphNode_t* pDependencies;
+      hipGraphNode_t pDependencies__val;
+      size_t numDependencies;
+      hipGraphNodeParams* nodeParams;
+      hipGraphNodeParams nodeParams__val;
+    } hipGraphAddNode;
+    struct {
+      hipGraphNode_t hNode;
+      hipBatchMemOpNodeParams* nodeParams_out;
+      hipBatchMemOpNodeParams nodeParams_out__val;
+    } hipGraphBatchMemOpNodeGetParams;
+    struct {
+      hipGraphNode_t hNode;
+      hipBatchMemOpNodeParams* nodeParams;
+      hipBatchMemOpNodeParams nodeParams__val;
+    } hipGraphBatchMemOpNodeSetParams;
+    struct {
+      hipGraphNode_t node;
+      hipGraph_t* pGraph;
+      hipGraph_t pGraph__val;
+    } hipGraphChildGraphNodeGetGraph;
+    struct {
+      hipGraph_t* pGraphClone;
+      hipGraph_t pGraphClone__val;
+      hipGraph_t originalGraph;
+    } hipGraphClone;
+    struct {
+      hipGraph_t* pGraph;
+      hipGraph_t pGraph__val;
+      unsigned int flags;
+    } hipGraphCreate;
+    struct {
+      hipGraph_t graph;
+      const char* path;
+      char path__val;
+      unsigned int flags;
+    } hipGraphDebugDotPrint;
+    struct {
+      hipGraph_t graph;
+    } hipGraphDestroy;
+    struct {
+      hipGraphNode_t node;
+    } hipGraphDestroyNode;
+    struct {
+      hipGraphNode_t node;
+      hipEvent_t* event_out;
+      hipEvent_t event_out__val;
+    } hipGraphEventRecordNodeGetEvent;
+    struct {
+      hipGraphNode_t node;
+      hipEvent_t event;
+    } hipGraphEventRecordNodeSetEvent;
+    struct {
+      hipGraphNode_t node;
+      hipEvent_t* event_out;
+      hipEvent_t event_out__val;
+    } hipGraphEventWaitNodeGetEvent;
+    struct {
+      hipGraphNode_t node;
+      hipEvent_t event;
+    } hipGraphEventWaitNodeSetEvent;
+    struct {
+      hipGraphExec_t hGraphExec;
+      hipGraphNode_t hNode;
+      const hipBatchMemOpNodeParams* nodeParams;
+      hipBatchMemOpNodeParams nodeParams__val;
+    } hipGraphExecBatchMemOpNodeSetParams;
+    struct {
+      hipGraphExec_t hGraphExec;
+      hipGraphNode_t node;
+      hipGraph_t childGraph;
+    } hipGraphExecChildGraphNodeSetParams;
+    struct {
+      hipGraphExec_t graphExec;
+    } hipGraphExecDestroy;
+    struct {
+      hipGraphExec_t hGraphExec;
+      hipGraphNode_t hNode;
+      hipEvent_t event;
+    } hipGraphExecEventRecordNodeSetEvent;
+    struct {
+      hipGraphExec_t hGraphExec;
+      hipGraphNode_t hNode;
+      hipEvent_t event;
+    } hipGraphExecEventWaitNodeSetEvent;
+    struct {
+      hipGraphExec_t hGraphExec;
+      hipGraphNode_t hNode;
+      const hipExternalSemaphoreSignalNodeParams* nodeParams;
+      hipExternalSemaphoreSignalNodeParams nodeParams__val;
+    } hipGraphExecExternalSemaphoresSignalNodeSetParams;
+    struct {
+      hipGraphExec_t hGraphExec;
+      hipGraphNode_t hNode;
+      const hipExternalSemaphoreWaitNodeParams* nodeParams;
+      hipExternalSemaphoreWaitNodeParams nodeParams__val;
+    } hipGraphExecExternalSemaphoresWaitNodeSetParams;
+    struct {
+      hipGraphExec_t graphExec;
+      unsigned long long* flags;
+      unsigned long long flags__val;
+    } hipGraphExecGetFlags;
+    struct {
+      hipGraphExec_t hGraphExec;
+      hipGraphNode_t node;
+      const hipHostNodeParams* pNodeParams;
+      hipHostNodeParams pNodeParams__val;
+    } hipGraphExecHostNodeSetParams;
+    struct {
+      hipGraphExec_t hGraphExec;
+      hipGraphNode_t node;
+      const hipKernelNodeParams* pNodeParams;
+      hipKernelNodeParams pNodeParams__val;
+    } hipGraphExecKernelNodeSetParams;
+    struct {
+      hipGraphExec_t hGraphExec;
+      hipGraphNode_t node;
+      hipMemcpy3DParms* pNodeParams;
+      hipMemcpy3DParms pNodeParams__val;
+    } hipGraphExecMemcpyNodeSetParams;
+    struct {
+      hipGraphExec_t hGraphExec;
+      hipGraphNode_t node;
+      void* dst;
+      const void* src;
+      size_t count;
+      hipMemcpyKind kind;
+    } hipGraphExecMemcpyNodeSetParams1D;
+    struct {
+      hipGraphExec_t hGraphExec;
+      hipGraphNode_t node;
+      void* dst;
+      const void* symbol;
+      size_t count;
+      size_t offset;
+      hipMemcpyKind kind;
+    } hipGraphExecMemcpyNodeSetParamsFromSymbol;
+    struct {
+      hipGraphExec_t hGraphExec;
+      hipGraphNode_t node;
+      const void* symbol;
+      const void* src;
+      size_t count;
+      size_t offset;
+      hipMemcpyKind kind;
+    } hipGraphExecMemcpyNodeSetParamsToSymbol;
+    struct {
+      hipGraphExec_t hGraphExec;
+      hipGraphNode_t node;
+      const hipMemsetParams* pNodeParams;
+      hipMemsetParams pNodeParams__val;
+    } hipGraphExecMemsetNodeSetParams;
+    struct {
+      hipGraphExec_t graphExec;
+      hipGraphNode_t node;
+      hipGraphNodeParams* nodeParams;
+      hipGraphNodeParams nodeParams__val;
+    } hipGraphExecNodeSetParams;
+    struct {
+      hipGraphExec_t hGraphExec;
+      hipGraph_t hGraph;
+      hipGraphNode_t* hErrorNode_out;
+      hipGraphNode_t hErrorNode_out__val;
+      hipGraphExecUpdateResult* updateResult_out;
+      hipGraphExecUpdateResult updateResult_out__val;
+    } hipGraphExecUpdate;
+    struct {
+      hipGraphNode_t hNode;
+      hipExternalSemaphoreSignalNodeParams* params_out;
+      hipExternalSemaphoreSignalNodeParams params_out__val;
+    } hipGraphExternalSemaphoresSignalNodeGetParams;
+    struct {
+      hipGraphNode_t hNode;
+      const hipExternalSemaphoreSignalNodeParams* nodeParams;
+      hipExternalSemaphoreSignalNodeParams nodeParams__val;
+    } hipGraphExternalSemaphoresSignalNodeSetParams;
+    struct {
+      hipGraphNode_t hNode;
+      hipExternalSemaphoreWaitNodeParams* params_out;
+      hipExternalSemaphoreWaitNodeParams params_out__val;
+    } hipGraphExternalSemaphoresWaitNodeGetParams;
+    struct {
+      hipGraphNode_t hNode;
+      const hipExternalSemaphoreWaitNodeParams* nodeParams;
+      hipExternalSemaphoreWaitNodeParams nodeParams__val;
+    } hipGraphExternalSemaphoresWaitNodeSetParams;
+    struct {
+      hipGraph_t graph;
+      hipGraphNode_t* from;
+      hipGraphNode_t from__val;
+      hipGraphNode_t* to;
+      hipGraphNode_t to__val;
+      size_t* numEdges;
+      size_t numEdges__val;
+    } hipGraphGetEdges;
+    struct {
+      hipGraph_t graph;
+      hipGraphNode_t* nodes;
+      hipGraphNode_t nodes__val;
+      size_t* numNodes;
+      size_t numNodes__val;
+    } hipGraphGetNodes;
+    struct {
+      hipGraph_t graph;
+      hipGraphNode_t* pRootNodes;
+      hipGraphNode_t pRootNodes__val;
+      size_t* pNumRootNodes;
+      size_t pNumRootNodes__val;
+    } hipGraphGetRootNodes;
+    struct {
+      hipGraphNode_t node;
+      hipHostNodeParams* pNodeParams;
+      hipHostNodeParams pNodeParams__val;
+    } hipGraphHostNodeGetParams;
+    struct {
+      hipGraphNode_t node;
+      const hipHostNodeParams* pNodeParams;
+      hipHostNodeParams pNodeParams__val;
+    } hipGraphHostNodeSetParams;
+    struct {
+      hipGraphExec_t* pGraphExec;
+      hipGraphExec_t pGraphExec__val;
+      hipGraph_t graph;
+      hipGraphNode_t* pErrorNode;
+      hipGraphNode_t pErrorNode__val;
+      char* pLogBuffer;
+      char pLogBuffer__val;
+      size_t bufferSize;
+    } hipGraphInstantiate;
+    struct {
+      hipGraphExec_t* pGraphExec;
+      hipGraphExec_t pGraphExec__val;
+      hipGraph_t graph;
+      unsigned long long flags;
+    } hipGraphInstantiateWithFlags;
+    struct {
+      hipGraphExec_t* pGraphExec;
+      hipGraphExec_t pGraphExec__val;
+      hipGraph_t graph;
+      hipGraphInstantiateParams* instantiateParams;
+      hipGraphInstantiateParams instantiateParams__val;
+    } hipGraphInstantiateWithParams;
+    struct {
+      hipGraphNode_t hSrc;
+      hipGraphNode_t hDst;
+    } hipGraphKernelNodeCopyAttributes;
+    struct {
+      hipGraphNode_t hNode;
+      hipLaunchAttributeID attr;
+      hipLaunchAttributeValue* value;
+      hipLaunchAttributeValue value__val;
+    } hipGraphKernelNodeGetAttribute;
+    struct {
+      hipGraphNode_t node;
+      hipKernelNodeParams* pNodeParams;
+      hipKernelNodeParams pNodeParams__val;
+    } hipGraphKernelNodeGetParams;
+    struct {
+      hipGraphNode_t hNode;
+      hipLaunchAttributeID attr;
+      const hipLaunchAttributeValue* value;
+      hipLaunchAttributeValue value__val;
+    } hipGraphKernelNodeSetAttribute;
+    struct {
+      hipGraphNode_t node;
+      const hipKernelNodeParams* pNodeParams;
+      hipKernelNodeParams pNodeParams__val;
+    } hipGraphKernelNodeSetParams;
+    struct {
+      hipGraphExec_t graphExec;
+      hipStream_t stream;
+    } hipGraphLaunch;
+    struct {
+      hipGraphNode_t node;
+      hipMemAllocNodeParams* pNodeParams;
+      hipMemAllocNodeParams pNodeParams__val;
+    } hipGraphMemAllocNodeGetParams;
+    struct {
+      hipGraphNode_t node;
+      void* dev_ptr;
+    } hipGraphMemFreeNodeGetParams;
+    struct {
+      hipGraphNode_t node;
+      hipMemcpy3DParms* pNodeParams;
+      hipMemcpy3DParms pNodeParams__val;
+    } hipGraphMemcpyNodeGetParams;
+    struct {
+      hipGraphNode_t node;
+      const hipMemcpy3DParms* pNodeParams;
+      hipMemcpy3DParms pNodeParams__val;
+    } hipGraphMemcpyNodeSetParams;
+    struct {
+      hipGraphNode_t node;
+      void* dst;
+      const void* src;
+      size_t count;
+      hipMemcpyKind kind;
+    } hipGraphMemcpyNodeSetParams1D;
+    struct {
+      hipGraphNode_t node;
+      void* dst;
+      const void* symbol;
+      size_t count;
+      size_t offset;
+      hipMemcpyKind kind;
+    } hipGraphMemcpyNodeSetParamsFromSymbol;
+    struct {
+      hipGraphNode_t node;
+      const void* symbol;
+      const void* src;
+      size_t count;
+      size_t offset;
+      hipMemcpyKind kind;
+    } hipGraphMemcpyNodeSetParamsToSymbol;
+    struct {
+      hipGraphNode_t node;
+      hipMemsetParams* pNodeParams;
+      hipMemsetParams pNodeParams__val;
+    } hipGraphMemsetNodeGetParams;
+    struct {
+      hipGraphNode_t node;
+      const hipMemsetParams* pNodeParams;
+      hipMemsetParams pNodeParams__val;
+    } hipGraphMemsetNodeSetParams;
+    struct {
+      hipGraphNode_t* pNode;
+      hipGraphNode_t pNode__val;
+      hipGraphNode_t originalNode;
+      hipGraph_t clonedGraph;
+    } hipGraphNodeFindInClone;
+    struct {
+      hipGraphNode_t node;
+      hipGraphNode_t* pDependencies;
+      hipGraphNode_t pDependencies__val;
+      size_t* pNumDependencies;
+      size_t pNumDependencies__val;
+    } hipGraphNodeGetDependencies;
+    struct {
+      hipGraphNode_t node;
+      hipGraphNode_t* pDependentNodes;
+      hipGraphNode_t pDependentNodes__val;
+      size_t* pNumDependentNodes;
+      size_t pNumDependentNodes__val;
+    } hipGraphNodeGetDependentNodes;
+    struct {
+      hipGraphExec_t hGraphExec;
+      hipGraphNode_t hNode;
+      unsigned int* isEnabled;
+      unsigned int isEnabled__val;
+    } hipGraphNodeGetEnabled;
+    struct {
+      hipGraphNode_t node;
+      hipGraphNodeType* pType;
+      hipGraphNodeType pType__val;
+    } hipGraphNodeGetType;
+    struct {
+      hipGraphExec_t hGraphExec;
+      hipGraphNode_t hNode;
+      unsigned int isEnabled;
+    } hipGraphNodeSetEnabled;
+    struct {
+      hipGraphNode_t node;
+      hipGraphNodeParams* nodeParams;
+      hipGraphNodeParams nodeParams__val;
+    } hipGraphNodeSetParams;
+    struct {
+      hipGraph_t graph;
+      hipUserObject_t object;
+      unsigned int count;
+    } hipGraphReleaseUserObject;
+    struct {
+      hipGraph_t graph;
+      const hipGraphNode_t* from;
+      hipGraphNode_t from__val;
+      const hipGraphNode_t* to;
+      hipGraphNode_t to__val;
+      size_t numDependencies;
+    } hipGraphRemoveDependencies;
+    struct {
+      hipGraph_t graph;
+      hipUserObject_t object;
+      unsigned int count;
+      unsigned int flags;
+    } hipGraphRetainUserObject;
+    struct {
+      hipGraphExec_t graphExec;
+      hipStream_t stream;
+    } hipGraphUpload;
+    struct {
+      hipGraphicsResource** resource;
+      hipGraphicsResource* resource__val;
+      GLuint buffer;
+      unsigned int flags;
+    } hipGraphicsGLRegisterBuffer;
+    struct {
+      hipGraphicsResource** resource;
+      hipGraphicsResource* resource__val;
+      GLuint image;
+      GLenum target;
+      unsigned int flags;
+    } hipGraphicsGLRegisterImage;
+    struct {
+      int count;
+      hipGraphicsResource_t* resources;
+      hipGraphicsResource_t resources__val;
+      hipStream_t stream;
+    } hipGraphicsMapResources;
+    struct {
+      void** devPtr;
+      void* devPtr__val;
+      size_t* size;
+      size_t size__val;
+      hipGraphicsResource_t resource;
+    } hipGraphicsResourceGetMappedPointer;
+    struct {
+      hipArray_t* array;
+      hipArray_t array__val;
+      hipGraphicsResource_t resource;
+      unsigned int arrayIndex;
+      unsigned int mipLevel;
+    } hipGraphicsSubResourceGetMappedArray;
+    struct {
+      int count;
+      hipGraphicsResource_t* resources;
+      hipGraphicsResource_t resources__val;
+      hipStream_t stream;
+    } hipGraphicsUnmapResources;
+    struct {
+      hipGraphicsResource_t resource;
+    } hipGraphicsUnregisterResource;
+    struct {
+      hipFunction_t f;
+      unsigned int globalWorkSizeX;
+      unsigned int globalWorkSizeY;
+      unsigned int globalWorkSizeZ;
+      unsigned int blockDimX;
+      unsigned int blockDimY;
+      unsigned int blockDimZ;
+      size_t sharedMemBytes;
+      hipStream_t hStream;
+      void** kernelParams;
+      void* kernelParams__val;
+      void** extra;
+      void* extra__val;
+      hipEvent_t startEvent;
+      hipEvent_t stopEvent;
+    } hipHccModuleLaunchKernel;
+    struct {
+      void** ptr;
+      void* ptr__val;
+      size_t size;
+      unsigned int flags;
+    } hipHostAlloc;
+    struct {
+      void* ptr;
+    } hipHostFree;
+    struct {
+      void** devPtr;
+      void* devPtr__val;
+      void* hstPtr;
+      unsigned int flags;
+    } hipHostGetDevicePointer;
+    struct {
+      unsigned int* flagsPtr;
+      unsigned int flagsPtr__val;
+      void* hostPtr;
+    } hipHostGetFlags;
+    struct {
+      void** ptr;
+      void* ptr__val;
+      size_t size;
+      unsigned int flags;
+    } hipHostMalloc;
+    struct {
+      void* hostPtr;
+      size_t sizeBytes;
+      unsigned int flags;
+    } hipHostRegister;
+    struct {
+      void* hostPtr;
+    } hipHostUnregister;
+    struct {
+      hipExternalMemory_t* extMem_out;
+      hipExternalMemory_t extMem_out__val;
+      const hipExternalMemoryHandleDesc* memHandleDesc;
+      hipExternalMemoryHandleDesc memHandleDesc__val;
+    } hipImportExternalMemory;
+    struct {
+      hipExternalSemaphore_t* extSem_out;
+      hipExternalSemaphore_t extSem_out__val;
+      const hipExternalSemaphoreHandleDesc* semHandleDesc;
+      hipExternalSemaphoreHandleDesc semHandleDesc__val;
+    } hipImportExternalSemaphore;
+    struct {
+      unsigned int flags;
+    } hipInit;
+    struct {
+      void* devPtr;
+    } hipIpcCloseMemHandle;
+    struct {
+      hipIpcEventHandle_t* handle;
+      hipIpcEventHandle_t handle__val;
+      hipEvent_t event;
+    } hipIpcGetEventHandle;
+    struct {
+      hipIpcMemHandle_t* handle;
+      hipIpcMemHandle_t handle__val;
+      void* devPtr;
+    } hipIpcGetMemHandle;
+    struct {
+      hipEvent_t* event;
+      hipEvent_t event__val;
+      hipIpcEventHandle_t handle;
+    } hipIpcOpenEventHandle;
+    struct {
+      void** devPtr;
+      void* devPtr__val;
+      hipIpcMemHandle_t handle;
+      unsigned int flags;
+    } hipIpcOpenMemHandle;
+    struct {
+      const void* hostFunction;
+    } hipLaunchByPtr;
+    struct {
+      const void* f;
+      dim3 gridDim;
+      dim3 blockDimX;
+      void** kernelParams;
+      void* kernelParams__val;
+      unsigned int sharedMemBytes;
+      hipStream_t stream;
+    } hipLaunchCooperativeKernel;
+    struct {
+      hipLaunchParams* launchParamsList;
+      hipLaunchParams launchParamsList__val;
+      int numDevices;
+      unsigned int flags;
+    } hipLaunchCooperativeKernelMultiDevice;
+    struct {
+      hipStream_t stream;
+      hipHostFn_t fn;
+      void* userData;
+    } hipLaunchHostFunc;
+    struct {
+      const void* function_address;
+      dim3 numBlocks;
+      dim3 dimBlocks;
+      void** args;
+      void* args__val;
+      size_t sharedMemBytes;
+      hipStream_t stream;
+    } hipLaunchKernel;
+    struct {
+      const hipLaunchConfig_t* config;
+      hipLaunchConfig_t config__val;
+      const void* fPtr;
+      void** args;
+      void* args__val;
+    } hipLaunchKernelExC;
+    struct {
+      hipKernel_t* pKernel;
+      hipKernel_t pKernel__val;
+      hipLibrary_t library;
+      const char* name;
+      char name__val;
+    } hipLibraryGetKernel;
+    struct {
+      unsigned int* count;
+      unsigned int count__val;
+      hipLibrary_t library;
+    } hipLibraryGetKernelCount;
+    struct {
+      hipLibrary_t* library;
+      hipLibrary_t library__val;
+      const void* code;
+      hipJitOption** jitOptions;
+      hipJitOption* jitOptions__val;
+      void** jitOptionsValues;
+      void* jitOptionsValues__val;
+      unsigned int numJitOptions;
+      hipLibraryOption** libraryOptions;
+      hipLibraryOption* libraryOptions__val;
+      void** libraryOptionValues;
+      void* libraryOptionValues__val;
+      unsigned int numLibraryOptions;
+    } hipLibraryLoadData;
+    struct {
+      hipLibrary_t* library;
+      hipLibrary_t library__val;
+      const char* fileName;
+      char fileName__val;
+      hipJitOption** jitOptions;
+      hipJitOption* jitOptions__val;
+      void** jitOptionsValues;
+      void* jitOptionsValues__val;
+      unsigned int numJitOptions;
+      hipLibraryOption** libraryOptions;
+      hipLibraryOption* libraryOptions__val;
+      void** libraryOptionValues;
+      void* libraryOptionValues__val;
+      unsigned int numLibraryOptions;
+    } hipLibraryLoadFromFile;
+    struct {
+      hipLibrary_t library;
+    } hipLibraryUnload;
+    struct {
+      hipLinkState_t state;
+      hipJitInputType type;
+      void* data;
+      size_t size;
+      const char* name;
+      char name__val;
+      unsigned int numOptions;
+      hipJitOption* options;
+      hipJitOption options__val;
+      void** optionValues;
+      void* optionValues__val;
+    } hipLinkAddData;
+    struct {
+      hipLinkState_t state;
+      hipJitInputType type;
+      const char* path;
+      char path__val;
+      unsigned int numOptions;
+      hipJitOption* options;
+      hipJitOption options__val;
+      void** optionValues;
+      void* optionValues__val;
+    } hipLinkAddFile;
+    struct {
+      hipLinkState_t state;
+      void** hipBinOut;
+      void* hipBinOut__val;
+      size_t* sizeOut;
+      size_t sizeOut__val;
+    } hipLinkComplete;
+    struct {
+      unsigned int numOptions;
+      hipJitOption* options;
+      hipJitOption options__val;
+      void** optionValues;
+      void* optionValues__val;
+      hipLinkState_t* stateOut;
+      hipLinkState_t stateOut__val;
+    } hipLinkCreate;
+    struct {
+      hipLinkState_t state;
+    } hipLinkDestroy;
+    struct {
+      void** ptr;
+      void* ptr__val;
+      size_t size;
+    } hipMalloc;
+    struct {
+      hipPitchedPtr* pitchedDevPtr;
+      hipPitchedPtr pitchedDevPtr__val;
+      hipExtent extent;
+    } hipMalloc3D;
+    struct {
+      hipArray_t* array;
+      hipArray_t array__val;
+      const hipChannelFormatDesc* desc;
+      hipChannelFormatDesc desc__val;
+      hipExtent extent;
+      unsigned int flags;
+    } hipMalloc3DArray;
+    struct {
+      hipArray_t* array;
+      hipArray_t array__val;
+      const hipChannelFormatDesc* desc;
+      hipChannelFormatDesc desc__val;
+      size_t width;
+      size_t height;
+      unsigned int flags;
+    } hipMallocArray;
+    struct {
+      void** dev_ptr;
+      void* dev_ptr__val;
+      size_t size;
+      hipStream_t stream;
+    } hipMallocAsync;
+    struct {
+      void** dev_ptr;
+      void* dev_ptr__val;
+      size_t size;
+      hipMemPool_t mem_pool;
+      hipStream_t stream;
+    } hipMallocFromPoolAsync;
+    struct {
+      void** ptr;
+      void* ptr__val;
+      size_t size;
+    } hipMallocHost;
+    struct {
+      void** dev_ptr;
+      void* dev_ptr__val;
+      size_t size;
+      unsigned int flags;
+    } hipMallocManaged;
+    struct {
+      hipMipmappedArray_t* mipmappedArray;
+      hipMipmappedArray_t mipmappedArray__val;
+      const hipChannelFormatDesc* desc;
+      hipChannelFormatDesc desc__val;
+      hipExtent extent;
+      unsigned int numLevels;
+      unsigned int flags;
+    } hipMallocMipmappedArray;
+    struct {
+      void** ptr;
+      void* ptr__val;
+      size_t* pitch;
+      size_t pitch__val;
+      size_t width;
+      size_t height;
+    } hipMallocPitch;
+    struct {
+      void* devPtr;
+      size_t size;
+    } hipMemAddressFree;
+    struct {
+      void** ptr;
+      void* ptr__val;
+      size_t size;
+      size_t alignment;
+      void* addr;
+      unsigned long long flags;
+    } hipMemAddressReserve;
+    struct {
+      const void* dev_ptr;
+      size_t count;
+      hipMemoryAdvise advice;
+      int device;
+    } hipMemAdvise;
+    struct {
+      const void* dev_ptr;
+      size_t count;
+      hipMemoryAdvise advice;
+      hipMemLocation location;
+    } hipMemAdvise_v2;
+    struct {
+      void** ptr;
+      void* ptr__val;
+      size_t size;
+    } hipMemAllocHost;
+    struct {
+      hipDeviceptr_t* dptr;
+      hipDeviceptr_t dptr__val;
+      size_t* pitch;
+      size_t pitch__val;
+      size_t widthInBytes;
+      size_t height;
+      unsigned int elementSizeBytes;
+    } hipMemAllocPitch;
+    struct {
+      hipMemGenericAllocationHandle_t* handle;
+      hipMemGenericAllocationHandle_t handle__val;
+      size_t size;
+      const hipMemAllocationProp* prop;
+      hipMemAllocationProp prop__val;
+      unsigned long long flags;
+    } hipMemCreate;
+    struct {
+      void* shareableHandle;
+      hipMemGenericAllocationHandle_t handle;
+      hipMemAllocationHandleType handleType;
+      unsigned long long flags;
+    } hipMemExportToShareableHandle;
+    struct {
+      unsigned long long* flags;
+      unsigned long long flags__val;
+      const hipMemLocation* location;
+      hipMemLocation location__val;
+      void* ptr;
+    } hipMemGetAccess;
+    struct {
+      hipDeviceptr_t* pbase;
+      hipDeviceptr_t pbase__val;
+      size_t* psize;
+      size_t psize__val;
+      hipDeviceptr_t dptr;
+    } hipMemGetAddressRange;
+    struct {
+      size_t* granularity;
+      size_t granularity__val;
+      const hipMemAllocationProp* prop;
+      hipMemAllocationProp prop__val;
+      hipMemAllocationGranularity_flags option;
+    } hipMemGetAllocationGranularity;
+    struct {
+      hipMemAllocationProp* prop;
+      hipMemAllocationProp prop__val;
+      hipMemGenericAllocationHandle_t handle;
+    } hipMemGetAllocationPropertiesFromHandle;
+    struct {
+      void* handle;
+      hipDeviceptr_t dptr;
+      size_t size;
+      hipMemRangeHandleType handleType;
+      unsigned long long flags;
+    } hipMemGetHandleForAddressRange;
+    struct {
+      size_t* free;
+      size_t free__val;
+      size_t* total;
+      size_t total__val;
+    } hipMemGetInfo;
+    struct {
+      hipMemGenericAllocationHandle_t* handle;
+      hipMemGenericAllocationHandle_t handle__val;
+      void* osHandle;
+      hipMemAllocationHandleType shHandleType;
+    } hipMemImportFromShareableHandle;
+    struct {
+      void* ptr;
+      size_t size;
+      size_t offset;
+      hipMemGenericAllocationHandle_t handle;
+      unsigned long long flags;
+    } hipMemMap;
+    struct {
+      hipArrayMapInfo* mapInfoList;
+      hipArrayMapInfo mapInfoList__val;
+      unsigned int count;
+      hipStream_t stream;
+    } hipMemMapArrayAsync;
+    struct {
+      hipMemPool_t* mem_pool;
+      hipMemPool_t mem_pool__val;
+      const hipMemPoolProps* pool_props;
+      hipMemPoolProps pool_props__val;
+    } hipMemPoolCreate;
+    struct {
+      hipMemPool_t mem_pool;
+    } hipMemPoolDestroy;
+    struct {
+      hipMemPoolPtrExportData* export_data;
+      hipMemPoolPtrExportData export_data__val;
+      void* dev_ptr;
+    } hipMemPoolExportPointer;
+    struct {
+      void* shared_handle;
+      hipMemPool_t mem_pool;
+      hipMemAllocationHandleType handle_type;
+      unsigned int flags;
+    } hipMemPoolExportToShareableHandle;
+    struct {
+      hipMemAccessFlags* flags;
+      hipMemAccessFlags flags__val;
+      hipMemPool_t mem_pool;
+      hipMemLocation* location;
+      hipMemLocation location__val;
+    } hipMemPoolGetAccess;
+    struct {
+      hipMemPool_t mem_pool;
+      hipMemPoolAttr attr;
+      void* value;
+    } hipMemPoolGetAttribute;
+    struct {
+      hipMemPool_t* mem_pool;
+      hipMemPool_t mem_pool__val;
+      void* shared_handle;
+      hipMemAllocationHandleType handle_type;
+      unsigned int flags;
+    } hipMemPoolImportFromShareableHandle;
+    struct {
+      void** dev_ptr;
+      void* dev_ptr__val;
+      hipMemPool_t mem_pool;
+      hipMemPoolPtrExportData* export_data;
+      hipMemPoolPtrExportData export_data__val;
+    } hipMemPoolImportPointer;
+    struct {
+      hipMemPool_t mem_pool;
+      const hipMemAccessDesc* desc_list;
+      hipMemAccessDesc desc_list__val;
+      size_t count;
+    } hipMemPoolSetAccess;
+    struct {
+      hipMemPool_t mem_pool;
+      hipMemPoolAttr attr;
+      void* value;
+    } hipMemPoolSetAttribute;
+    struct {
+      hipMemPool_t mem_pool;
+      size_t min_bytes_to_hold;
+    } hipMemPoolTrimTo;
+    struct {
+      const void* dev_ptr;
+      size_t count;
+      int device;
+      hipStream_t stream;
+    } hipMemPrefetchAsync;
+    struct {
+      const void* dev_ptr;
+      size_t count;
+      hipMemLocation location;
+      unsigned int flags;
+      hipStream_t stream;
+    } hipMemPrefetchAsync_v2;
+    struct {
+      void* ptr;
+      size_t* size;
+      size_t size__val;
+    } hipMemPtrGetInfo;
+    struct {
+      void* data;
+      size_t data_size;
+      hipMemRangeAttribute attribute;
+      const void* dev_ptr;
+      size_t count;
+    } hipMemRangeGetAttribute;
+    struct {
+      void** data;
+      void* data__val;
+      size_t* data_sizes;
+      size_t data_sizes__val;
+      hipMemRangeAttribute* attributes;
+      hipMemRangeAttribute attributes__val;
+      size_t num_attributes;
+      const void* dev_ptr;
+      size_t count;
+    } hipMemRangeGetAttributes;
+    struct {
+      hipMemGenericAllocationHandle_t handle;
+    } hipMemRelease;
+    struct {
+      hipMemGenericAllocationHandle_t* handle;
+      hipMemGenericAllocationHandle_t handle__val;
+      void* addr;
+    } hipMemRetainAllocationHandle;
+    struct {
+      void* ptr;
+      size_t size;
+      const hipMemAccessDesc* desc;
+      hipMemAccessDesc desc__val;
+      size_t count;
+    } hipMemSetAccess;
+    struct {
+      void* ptr;
+      size_t size;
+    } hipMemUnmap;
+    struct {
+      void* dst;
+      const void* src;
+      size_t sizeBytes;
+      hipMemcpyKind kind;
+    } hipMemcpy;
+    struct {
+      void* dst;
+      size_t dpitch;
+      const void* src;
+      size_t spitch;
+      size_t width;
+      size_t height;
+      hipMemcpyKind kind;
+    } hipMemcpy2D;
+    struct {
+      hipArray_t dst;
+      size_t wOffsetDst;
+      size_t hOffsetDst;
+      hipArray_const_t src;
+      size_t wOffsetSrc;
+      size_t hOffsetSrc;
+      size_t width;
+      size_t height;
+      hipMemcpyKind kind;
+    } hipMemcpy2DArrayToArray;
+    struct {
+      void* dst;
+      size_t dpitch;
+      const void* src;
+      size_t spitch;
+      size_t width;
+      size_t height;
+      hipMemcpyKind kind;
+      hipStream_t stream;
+    } hipMemcpy2DAsync;
+    struct {
+      void* dst;
+      size_t dpitch;
+      hipArray_const_t src;
+      size_t wOffset;
+      size_t hOffset;
+      size_t width;
+      size_t height;
+      hipMemcpyKind kind;
+    } hipMemcpy2DFromArray;
+    struct {
+      void* dst;
+      size_t dpitch;
+      hipArray_const_t src;
+      size_t wOffset;
+      size_t hOffset;
+      size_t width;
+      size_t height;
+      hipMemcpyKind kind;
+      hipStream_t stream;
+    } hipMemcpy2DFromArrayAsync;
+    struct {
+      hipArray_t dst;
+      size_t wOffset;
+      size_t hOffset;
+      const void* src;
+      size_t spitch;
+      size_t width;
+      size_t height;
+      hipMemcpyKind kind;
+    } hipMemcpy2DToArray;
+    struct {
+      hipArray_t dst;
+      size_t wOffset;
+      size_t hOffset;
+      const void* src;
+      size_t spitch;
+      size_t width;
+      size_t height;
+      hipMemcpyKind kind;
+      hipStream_t stream;
+    } hipMemcpy2DToArrayAsync;
+    struct {
+      const hipMemcpy3DParms* p;
+      hipMemcpy3DParms p__val;
+    } hipMemcpy3D;
+    struct {
+      const hipMemcpy3DParms* p;
+      hipMemcpy3DParms p__val;
+      hipStream_t stream;
+    } hipMemcpy3DAsync;
+    struct {
+      size_t numOps;
+      hipMemcpy3DBatchOp* opList;
+      hipMemcpy3DBatchOp opList__val;
+      size_t* failIdx;
+      size_t failIdx__val;
+      unsigned long long flags;
+      hipStream_t stream;
+    } hipMemcpy3DBatchAsync;
+    struct {
+      hipMemcpy3DPeerParms* p;
+      hipMemcpy3DPeerParms p__val;
+    } hipMemcpy3DPeer;
+    struct {
+      hipMemcpy3DPeerParms* p;
+      hipMemcpy3DPeerParms p__val;
+      hipStream_t stream;
+    } hipMemcpy3DPeerAsync;
+    struct {
+      void* dst;
+      const void* src;
+      size_t sizeBytes;
+      hipMemcpyKind kind;
+      hipStream_t stream;
+    } hipMemcpyAsync;
+    struct {
+      hipArray_t dstArray;
+      size_t dstOffset;
+      hipArray_t srcArray;
+      size_t srcOffset;
+      size_t ByteCount;
+    } hipMemcpyAtoA;
+    struct {
+      hipDeviceptr_t dstDevice;
+      hipArray_t srcArray;
+      size_t srcOffset;
+      size_t ByteCount;
+    } hipMemcpyAtoD;
+    struct {
+      void* dst;
+      hipArray_t srcArray;
+      size_t srcOffset;
+      size_t count;
+    } hipMemcpyAtoH;
+    struct {
+      void* dstHost;
+      hipArray_t srcArray;
+      size_t srcOffset;
+      size_t ByteCount;
+      hipStream_t stream;
+    } hipMemcpyAtoHAsync;
+    struct {
+      void** dsts;
+      void* dsts__val;
+      void** srcs;
+      void* srcs__val;
+      size_t* sizes;
+      size_t sizes__val;
+      size_t count;
+      hipMemcpyAttributes* attrs;
+      hipMemcpyAttributes attrs__val;
+      size_t* attrsIdxs;
+      size_t attrsIdxs__val;
+      size_t numAttrs;
+      size_t* failIdx;
+      size_t failIdx__val;
+      hipStream_t stream;
+    } hipMemcpyBatchAsync;
+    struct {
+      hipArray_t dstArray;
+      size_t dstOffset;
+      hipDeviceptr_t srcDevice;
+      size_t ByteCount;
+    } hipMemcpyDtoA;
+    struct {
+      hipDeviceptr_t dst;
+      hipDeviceptr_t src;
+      size_t sizeBytes;
+    } hipMemcpyDtoD;
+    struct {
+      hipDeviceptr_t dst;
+      hipDeviceptr_t src;
+      size_t sizeBytes;
+      hipStream_t stream;
+    } hipMemcpyDtoDAsync;
+    struct {
+      void* dst;
+      hipDeviceptr_t src;
+      size_t sizeBytes;
+    } hipMemcpyDtoH;
+    struct {
+      void* dst;
+      hipDeviceptr_t src;
+      size_t sizeBytes;
+      hipStream_t stream;
+    } hipMemcpyDtoHAsync;
+    struct {
+      void* dst;
+      hipArray_const_t srcArray;
+      size_t wOffset;
+      size_t hOffset;
+      size_t count;
+      hipMemcpyKind kind;
+    } hipMemcpyFromArray;
+    struct {
+      void* dst;
+      const void* symbol;
+      size_t sizeBytes;
+      size_t offset;
+      hipMemcpyKind kind;
+    } hipMemcpyFromSymbol;
+    struct {
+      void* dst;
+      const void* symbol;
+      size_t sizeBytes;
+      size_t offset;
+      hipMemcpyKind kind;
+      hipStream_t stream;
+    } hipMemcpyFromSymbolAsync;
+    struct {
+      hipArray_t dstArray;
+      size_t dstOffset;
+      const void* srcHost;
+      size_t count;
+    } hipMemcpyHtoA;
+    struct {
+      hipArray_t dstArray;
+      size_t dstOffset;
+      const void* srcHost;
+      size_t ByteCount;
+      hipStream_t stream;
+    } hipMemcpyHtoAAsync;
+    struct {
+      hipDeviceptr_t dst;
+      const void* src;
+      size_t sizeBytes;
+    } hipMemcpyHtoD;
+    struct {
+      hipDeviceptr_t dst;
+      const void* src;
+      size_t sizeBytes;
+      hipStream_t stream;
+    } hipMemcpyHtoDAsync;
+    struct {
+      const hip_Memcpy2D* pCopy;
+      hip_Memcpy2D pCopy__val;
+    } hipMemcpyParam2D;
+    struct {
+      const hip_Memcpy2D* pCopy;
+      hip_Memcpy2D pCopy__val;
+      hipStream_t stream;
+    } hipMemcpyParam2DAsync;
+    struct {
+      void* dst;
+      int dstDeviceId;
+      const void* src;
+      int srcDeviceId;
+      size_t sizeBytes;
+    } hipMemcpyPeer;
+    struct {
+      void* dst;
+      int dstDeviceId;
+      const void* src;
+      int srcDevice;
+      size_t sizeBytes;
+      hipStream_t stream;
+    } hipMemcpyPeerAsync;
+    struct {
+      hipArray_t dst;
+      size_t wOffset;
+      size_t hOffset;
+      const void* src;
+      size_t count;
+      hipMemcpyKind kind;
+    } hipMemcpyToArray;
+    struct {
+      const void* symbol;
+      const void* src;
+      size_t sizeBytes;
+      size_t offset;
+      hipMemcpyKind kind;
+    } hipMemcpyToSymbol;
+    struct {
+      const void* symbol;
+      const void* src;
+      size_t sizeBytes;
+      size_t offset;
+      hipMemcpyKind kind;
+      hipStream_t stream;
+    } hipMemcpyToSymbolAsync;
+    struct {
+      void* dst;
+      const void* src;
+      size_t sizeBytes;
+      hipMemcpyKind kind;
+      hipStream_t stream;
+    } hipMemcpyWithStream;
+    struct {
+      void* dst;
+      int value;
+      size_t sizeBytes;
+    } hipMemset;
+    struct {
+      void* dst;
+      size_t pitch;
+      int value;
+      size_t width;
+      size_t height;
+    } hipMemset2D;
+    struct {
+      void* dst;
+      size_t pitch;
+      int value;
+      size_t width;
+      size_t height;
+      hipStream_t stream;
+    } hipMemset2DAsync;
+    struct {
+      hipPitchedPtr pitchedDevPtr;
+      int value;
+      hipExtent extent;
+    } hipMemset3D;
+    struct {
+      hipPitchedPtr pitchedDevPtr;
+      int value;
+      hipExtent extent;
+      hipStream_t stream;
+    } hipMemset3DAsync;
+    struct {
+      void* dst;
+      int value;
+      size_t sizeBytes;
+      hipStream_t stream;
+    } hipMemsetAsync;
+    struct {
+      hipDeviceptr_t dest;
+      unsigned short value;
+      size_t count;
+    } hipMemsetD16;
+    struct {
+      hipDeviceptr_t dest;
+      unsigned short value;
+      size_t count;
+      hipStream_t stream;
+    } hipMemsetD16Async;
+    struct {
+      hipDeviceptr_t dst;
+      size_t dstPitch;
+      unsigned short value;
+      size_t width;
+      size_t height;
+    } hipMemsetD2D16;
+    struct {
+      hipDeviceptr_t dst;
+      size_t dstPitch;
+      unsigned short value;
+      size_t width;
+      size_t height;
+      hipStream_t stream;
+    } hipMemsetD2D16Async;
+    struct {
+      hipDeviceptr_t dst;
+      size_t dstPitch;
+      unsigned int value;
+      size_t width;
+      size_t height;
+    } hipMemsetD2D32;
+    struct {
+      hipDeviceptr_t dst;
+      size_t dstPitch;
+      unsigned int value;
+      size_t width;
+      size_t height;
+      hipStream_t stream;
+    } hipMemsetD2D32Async;
+    struct {
+      hipDeviceptr_t dst;
+      size_t dstPitch;
+      unsigned char value;
+      size_t width;
+      size_t height;
+    } hipMemsetD2D8;
+    struct {
+      hipDeviceptr_t dst;
+      size_t dstPitch;
+      unsigned char value;
+      size_t width;
+      size_t height;
+      hipStream_t stream;
+    } hipMemsetD2D8Async;
+    struct {
+      hipDeviceptr_t dest;
+      int value;
+      size_t count;
+    } hipMemsetD32;
+    struct {
+      hipDeviceptr_t dst;
+      int value;
+      size_t count;
+      hipStream_t stream;
+    } hipMemsetD32Async;
+    struct {
+      hipDeviceptr_t dest;
+      unsigned char value;
+      size_t count;
+    } hipMemsetD8;
+    struct {
+      hipDeviceptr_t dest;
+      unsigned char value;
+      size_t count;
+      hipStream_t stream;
+    } hipMemsetD8Async;
+    struct {
+      hipMipmappedArray_t* pHandle;
+      hipMipmappedArray_t pHandle__val;
+      HIP_ARRAY3D_DESCRIPTOR* pMipmappedArrayDesc;
+      HIP_ARRAY3D_DESCRIPTOR pMipmappedArrayDesc__val;
+      unsigned int numMipmapLevels;
+    } hipMipmappedArrayCreate;
+    struct {
+      hipMipmappedArray_t hMipmappedArray;
+    } hipMipmappedArrayDestroy;
+    struct {
+      hipArray_t* pLevelArray;
+      hipArray_t pLevelArray__val;
+      hipMipmappedArray_t hMipMappedArray;
+      unsigned int level;
+    } hipMipmappedArrayGetLevel;
+    struct {
+      hipFunction_t* function;
+      hipFunction_t function__val;
+      hipModule_t module;
+      const char* kname;
+      char kname__val;
+    } hipModuleGetFunction;
+    struct {
+      unsigned int* count;
+      unsigned int count__val;
+      hipModule_t mod;
+    } hipModuleGetFunctionCount;
+    struct {
+      hipDeviceptr_t* dptr;
+      hipDeviceptr_t dptr__val;
+      size_t* bytes;
+      size_t bytes__val;
+      hipModule_t hmod;
+      const char* name;
+      char name__val;
+    } hipModuleGetGlobal;
+    struct {
+      textureReference** texRef;
+      textureReference* texRef__val;
+      hipModule_t hmod;
+      const char* name;
+      char name__val;
+    } hipModuleGetTexRef;
+    struct {
+      hipFunction_t f;
+      unsigned int gridDimX;
+      unsigned int gridDimY;
+      unsigned int gridDimZ;
+      unsigned int blockDimX;
+      unsigned int blockDimY;
+      unsigned int blockDimZ;
+      unsigned int sharedMemBytes;
+      hipStream_t stream;
+      void** kernelParams;
+      void* kernelParams__val;
+    } hipModuleLaunchCooperativeKernel;
+    struct {
+      hipFunctionLaunchParams* launchParamsList;
+      hipFunctionLaunchParams launchParamsList__val;
+      unsigned int numDevices;
+      unsigned int flags;
+    } hipModuleLaunchCooperativeKernelMultiDevice;
+    struct {
+      hipFunction_t f;
+      unsigned int gridDimX;
+      unsigned int gridDimY;
+      unsigned int gridDimZ;
+      unsigned int blockDimX;
+      unsigned int blockDimY;
+      unsigned int blockDimZ;
+      unsigned int sharedMemBytes;
+      hipStream_t stream;
+      void** kernelParams;
+      void* kernelParams__val;
+      void** extra;
+      void* extra__val;
+    } hipModuleLaunchKernel;
+    struct {
+      hipModule_t* module;
+      hipModule_t module__val;
+      const char* fname;
+      char fname__val;
+    } hipModuleLoad;
+    struct {
+      hipModule_t* module;
+      hipModule_t module__val;
+      const void* image;
+    } hipModuleLoadData;
+    struct {
+      hipModule_t* module;
+      hipModule_t module__val;
+      const void* image;
+      unsigned int numOptions;
+      hipJitOption* options;
+      hipJitOption options__val;
+      void** optionsValues;
+      void* optionsValues__val;
+    } hipModuleLoadDataEx;
+    struct {
+      hipModule_t* module;
+      hipModule_t module__val;
+      const void* fatbin;
+    } hipModuleLoadFatBinary;
+    struct {
+      int* numBlocks;
+      int numBlocks__val;
+      hipFunction_t f;
+      int blockSize;
+      size_t dynSharedMemPerBlk;
+    } hipModuleOccupancyMaxActiveBlocksPerMultiprocessor;
+    struct {
+      int* numBlocks;
+      int numBlocks__val;
+      hipFunction_t f;
+      int blockSize;
+      size_t dynSharedMemPerBlk;
+      unsigned int flags;
+    } hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags;
+    struct {
+      int* gridSize;
+      int gridSize__val;
+      int* blockSize;
+      int blockSize__val;
+      hipFunction_t f;
+      size_t dynSharedMemPerBlk;
+      int blockSizeLimit;
+    } hipModuleOccupancyMaxPotentialBlockSize;
+    struct {
+      int* gridSize;
+      int gridSize__val;
+      int* blockSize;
+      int blockSize__val;
+      hipFunction_t f;
+      size_t dynSharedMemPerBlk;
+      int blockSizeLimit;
+      unsigned int flags;
+    } hipModuleOccupancyMaxPotentialBlockSizeWithFlags;
+    struct {
+      hipModule_t module;
+    } hipModuleUnload;
+    struct {
+      int* numBlocks;
+      int numBlocks__val;
+      const void* f;
+      int blockSize;
+      size_t dynamicSMemSize;
+    } hipOccupancyMaxActiveBlocksPerMultiprocessor;
+    struct {
+      int* numBlocks;
+      int numBlocks__val;
+      const void* f;
+      int blockSize;
+      size_t dynamicSMemSize;
+      unsigned int flags;
+    } hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags;
+    struct {
+      int* gridSize;
+      int gridSize__val;
+      int* blockSize;
+      int blockSize__val;
+      const void* f;
+      size_t dynSharedMemPerBlk;
+      int blockSizeLimit;
+    } hipOccupancyMaxPotentialBlockSize;
+    struct {
+      void* data;
+      hipPointer_attribute attribute;
+      hipDeviceptr_t ptr;
+    } hipPointerGetAttribute;
+    struct {
+      hipPointerAttribute_t* attributes;
+      hipPointerAttribute_t attributes__val;
+      const void* ptr;
+    } hipPointerGetAttributes;
+    struct {
+      const void* value;
+      hipPointer_attribute attribute;
+      hipDeviceptr_t ptr;
+    } hipPointerSetAttribute;
+    struct {
+      int* runtimeVersion;
+      int runtimeVersion__val;
+    } hipRuntimeGetVersion;
+    struct {
+      int deviceId;
+    } hipSetDevice;
+    struct {
+      unsigned int flags;
+    } hipSetDeviceFlags;
+    struct {
+      int* device_arr;
+      int device_arr__val;
+      int len;
+    } hipSetValidDevices;
+    struct {
+      const void* arg;
+      size_t size;
+      size_t offset;
+    } hipSetupArgument;
+    struct {
+      const hipExternalSemaphore_t* extSemArray;
+      hipExternalSemaphore_t extSemArray__val;
+      const hipExternalSemaphoreSignalParams* paramsArray;
+      hipExternalSemaphoreSignalParams paramsArray__val;
+      unsigned int numExtSems;
+      hipStream_t stream;
+    } hipSignalExternalSemaphoresAsync;
+    struct {
+      hipStream_t stream;
+      hipStreamCallback_t callback;
+      void* userData;
+      unsigned int flags;
+    } hipStreamAddCallback;
+    struct {
+      hipStream_t stream;
+      void* dev_ptr;
+      size_t length;
+      unsigned int flags;
+    } hipStreamAttachMemAsync;
+    struct {
+      hipStream_t stream;
+      unsigned int count;
+      hipStreamBatchMemOpParams* paramArray;
+      hipStreamBatchMemOpParams paramArray__val;
+      unsigned int flags;
+    } hipStreamBatchMemOp;
+    struct {
+      hipStream_t stream;
+      hipStreamCaptureMode mode;
+    } hipStreamBeginCapture;
+    struct {
+      hipStream_t stream;
+      hipGraph_t graph;
+      const hipGraphNode_t* dependencies;
+      hipGraphNode_t dependencies__val;
+      const hipGraphEdgeData* dependencyData;
+      hipGraphEdgeData dependencyData__val;
+      size_t numDependencies;
+      hipStreamCaptureMode mode;
+    } hipStreamBeginCaptureToGraph;
+    struct {
+      hipStream_t* stream;
+      hipStream_t stream__val;
+    } hipStreamCreate;
+    struct {
+      hipStream_t* stream;
+      hipStream_t stream__val;
+      unsigned int flags;
+    } hipStreamCreateWithFlags;
+    struct {
+      hipStream_t* stream;
+      hipStream_t stream__val;
+      unsigned int flags;
+      int priority;
+    } hipStreamCreateWithPriority;
+    struct {
+      hipStream_t stream;
+    } hipStreamDestroy;
+    struct {
+      hipStream_t stream;
+      hipGraph_t* pGraph;
+      hipGraph_t pGraph__val;
+    } hipStreamEndCapture;
+    struct {
+      hipStream_t stream;
+      hipLaunchAttributeID attr;
+      hipLaunchAttributeValue* value_out;
+      hipLaunchAttributeValue value_out__val;
+    } hipStreamGetAttribute;
+    struct {
+      hipStream_t stream;
+      hipStreamCaptureStatus* pCaptureStatus;
+      hipStreamCaptureStatus pCaptureStatus__val;
+      unsigned long long* pId;
+      unsigned long long pId__val;
+    } hipStreamGetCaptureInfo;
+    struct {
+      hipStream_t stream;
+      hipStreamCaptureStatus* captureStatus_out;
+      hipStreamCaptureStatus captureStatus_out__val;
+      unsigned long long* id_out;
+      unsigned long long id_out__val;
+      hipGraph_t* graph_out;
+      hipGraph_t graph_out__val;
+      const hipGraphNode_t** dependencies_out;
+      const hipGraphNode_t* dependencies_out__val;
+      size_t* numDependencies_out;
+      size_t numDependencies_out__val;
+    } hipStreamGetCaptureInfo_v2;
+    struct {
+      hipStream_t stream;
+      hipDevice_t* device;
+      hipDevice_t device__val;
+    } hipStreamGetDevice;
+    struct {
+      hipStream_t stream;
+      unsigned int* flags;
+      unsigned int flags__val;
+    } hipStreamGetFlags;
+    struct {
+      hipStream_t stream;
+      unsigned long long* streamId;
+      unsigned long long streamId__val;
+    } hipStreamGetId;
+    struct {
+      hipStream_t stream;
+      int* priority;
+      int priority__val;
+    } hipStreamGetPriority;
+    struct {
+      hipStream_t stream;
+      hipStreamCaptureStatus* pCaptureStatus;
+      hipStreamCaptureStatus pCaptureStatus__val;
+    } hipStreamIsCapturing;
+    struct {
+      hipStream_t stream;
+    } hipStreamQuery;
+    struct {
+      hipStream_t stream;
+      hipLaunchAttributeID attr;
+      const hipLaunchAttributeValue* value;
+      hipLaunchAttributeValue value__val;
+    } hipStreamSetAttribute;
+    struct {
+      hipStream_t stream;
+    } hipStreamSynchronize;
+    struct {
+      hipStream_t stream;
+      hipGraphNode_t* dependencies;
+      hipGraphNode_t dependencies__val;
+      size_t numDependencies;
+      unsigned int flags;
+    } hipStreamUpdateCaptureDependencies;
+    struct {
+      hipStream_t stream;
+      hipEvent_t event;
+      unsigned int flags;
+    } hipStreamWaitEvent;
+    struct {
+      hipStream_t stream;
+      void* ptr;
+      unsigned int value;
+      unsigned int flags;
+      unsigned int mask;
+    } hipStreamWaitValue32;
+    struct {
+      hipStream_t stream;
+      void* ptr;
+      uint64_t value;
+      unsigned int flags;
+      uint64_t mask;
+    } hipStreamWaitValue64;
+    struct {
+      hipStream_t stream;
+      void* ptr;
+      unsigned int value;
+      unsigned int flags;
+    } hipStreamWriteValue32;
+    struct {
+      hipStream_t stream;
+      void* ptr;
+      uint64_t value;
+      unsigned int flags;
+    } hipStreamWriteValue64;
+    struct {
+      hipDeviceptr_t* dev_ptr;
+      hipDeviceptr_t dev_ptr__val;
+      const textureReference* texRef;
+      textureReference texRef__val;
+    } hipTexRefGetAddress;
+    struct {
+      hipArray_t* pArray;
+      hipArray_t pArray__val;
+      const textureReference* texRef;
+      textureReference texRef__val;
+    } hipTexRefGetArray;
+    struct {
+      float* pBorderColor;
+      float pBorderColor__val;
+      const textureReference* texRef;
+      textureReference texRef__val;
+    } hipTexRefGetBorderColor;
+    struct {
+      unsigned int* pFlags;
+      unsigned int pFlags__val;
+      const textureReference* texRef;
+      textureReference texRef__val;
+    } hipTexRefGetFlags;
+    struct {
+      hipArray_Format* pFormat;
+      hipArray_Format pFormat__val;
+      int* pNumChannels;
+      int pNumChannels__val;
+      const textureReference* texRef;
+      textureReference texRef__val;
+    } hipTexRefGetFormat;
+    struct {
+      int* pmaxAnsio;
+      int pmaxAnsio__val;
+      const textureReference* texRef;
+      textureReference texRef__val;
+    } hipTexRefGetMaxAnisotropy;
+    struct {
+      hipMipmappedArray_t* pArray;
+      hipMipmappedArray_t pArray__val;
+      const textureReference* texRef;
+      textureReference texRef__val;
+    } hipTexRefGetMipMappedArray;
+    struct {
+      float* pbias;
+      float pbias__val;
+      const textureReference* texRef;
+      textureReference texRef__val;
+    } hipTexRefGetMipmapLevelBias;
+    struct {
+      float* pminMipmapLevelClamp;
+      float pminMipmapLevelClamp__val;
+      float* pmaxMipmapLevelClamp;
+      float pmaxMipmapLevelClamp__val;
+      const textureReference* texRef;
+      textureReference texRef__val;
+    } hipTexRefGetMipmapLevelClamp;
+    struct {
+      size_t* ByteOffset;
+      size_t ByteOffset__val;
+      textureReference* texRef;
+      textureReference texRef__val;
+      hipDeviceptr_t dptr;
+      size_t bytes;
+    } hipTexRefSetAddress;
+    struct {
+      textureReference* texRef;
+      textureReference texRef__val;
+      const HIP_ARRAY_DESCRIPTOR* desc;
+      HIP_ARRAY_DESCRIPTOR desc__val;
+      hipDeviceptr_t dptr;
+      size_t Pitch;
+    } hipTexRefSetAddress2D;
+    struct {
+      textureReference* tex;
+      textureReference tex__val;
+      hipArray_const_t array;
+      unsigned int flags;
+    } hipTexRefSetArray;
+    struct {
+      textureReference* texRef;
+      textureReference texRef__val;
+      float* pBorderColor;
+      float pBorderColor__val;
+    } hipTexRefSetBorderColor;
+    struct {
+      textureReference* texRef;
+      textureReference texRef__val;
+      unsigned int Flags;
+    } hipTexRefSetFlags;
+    struct {
+      textureReference* texRef;
+      textureReference texRef__val;
+      hipArray_Format fmt;
+      int NumPackedComponents;
+    } hipTexRefSetFormat;
+    struct {
+      textureReference* texRef;
+      textureReference texRef__val;
+      unsigned int maxAniso;
+    } hipTexRefSetMaxAnisotropy;
+    struct {
+      textureReference* texRef;
+      textureReference texRef__val;
+      float bias;
+    } hipTexRefSetMipmapLevelBias;
+    struct {
+      textureReference* texRef;
+      textureReference texRef__val;
+      float minMipMapLevelClamp;
+      float maxMipMapLevelClamp;
+    } hipTexRefSetMipmapLevelClamp;
+    struct {
+      textureReference* texRef;
+      textureReference texRef__val;
+      hipMipmappedArray* mipmappedArray;
+      hipMipmappedArray mipmappedArray__val;
+      unsigned int Flags;
+    } hipTexRefSetMipmappedArray;
+    struct {
+      hipStreamCaptureMode* mode;
+      hipStreamCaptureMode mode__val;
+    } hipThreadExchangeStreamCaptureMode;
+    struct {
+      hipUserObject_t* object_out;
+      hipUserObject_t object_out__val;
+      void* ptr;
+      hipHostFn_t destroy;
+      unsigned int initialRefcount;
+      unsigned int flags;
+    } hipUserObjectCreate;
+    struct {
+      hipUserObject_t object;
+      unsigned int count;
+    } hipUserObjectRelease;
+    struct {
+      hipUserObject_t object;
+      unsigned int count;
+    } hipUserObjectRetain;
+    struct {
+      const hipExternalSemaphore_t* extSemArray;
+      hipExternalSemaphore_t extSemArray__val;
+      const hipExternalSemaphoreWaitParams* paramsArray;
+      hipExternalSemaphoreWaitParams paramsArray__val;
+      unsigned int numExtSems;
+      hipStream_t stream;
+    } hipWaitExternalSemaphoresAsync;
+  } args;
+  uint64_t *phase_data;
+} hip_api_data_t;
+
+// Macros that fill the per-API argument record (cb_data.args.<api>) used by HIP API callbacks
+// __hipPopCallConfiguration(dim3* gridDim, dim3* blockDim, size_t* sharedMem, hipStream_t* stream): stores the four pointers (not the pointees) in cb_data.args.__hipPopCallConfiguration; the identifiers are resolved at the expansion site.
+#define INIT___hipPopCallConfiguration_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.__hipPopCallConfiguration.gridDim = (dim3*)gridDim; \
+  cb_data.args.__hipPopCallConfiguration.blockDim = (dim3*)blockDim; \
+  cb_data.args.__hipPopCallConfiguration.sharedMem = (size_t*)sharedMem; \
+  cb_data.args.__hipPopCallConfiguration.stream = (hipStream_t*)stream; \
+};
+// __hipPushCallConfiguration(dim3 gridDim, dim3 blockDim, size_t sharedMem, hipStream_t stream): copies the four values into cb_data.args.__hipPushCallConfiguration; the identifiers are resolved at the expansion site.
+#define INIT___hipPushCallConfiguration_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.__hipPushCallConfiguration.gridDim = (dim3)gridDim; \
+  cb_data.args.__hipPushCallConfiguration.blockDim = (dim3)blockDim; \
+  cb_data.args.__hipPushCallConfiguration.sharedMem = (size_t)sharedMem; \
+  cb_data.args.__hipPushCallConfiguration.stream = (hipStream_t)stream; \
+};
+// hipArray3DCreate(hipArray_t* array, const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray): stores both pointers in cb_data.args.hipArray3DCreate from the same-named identifiers at the expansion site.
+#define INIT_hipArray3DCreate_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipArray3DCreate.array = (hipArray_t*)array; \
+  cb_data.args.hipArray3DCreate.pAllocateArray = (const HIP_ARRAY3D_DESCRIPTOR*)pAllocateArray; \
+};
+// hipArray3DGetDescriptor(HIP_ARRAY3D_DESCRIPTOR* pArrayDescriptor, hipArray_t array): fills cb_data.args.hipArray3DGetDescriptor from the same-named identifiers at the expansion site.
+#define INIT_hipArray3DGetDescriptor_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipArray3DGetDescriptor.pArrayDescriptor = (HIP_ARRAY3D_DESCRIPTOR*)pArrayDescriptor; \
+  cb_data.args.hipArray3DGetDescriptor.array = (hipArray_t)array; \
+};
+// hipArrayCreate(hipArray_t* pHandle, const HIP_ARRAY_DESCRIPTOR* pAllocateArray): fills cb_data.args.hipArrayCreate; note the pHandle field is read from an identifier named array.
+#define INIT_hipArrayCreate_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipArrayCreate.pHandle = (hipArray_t*)array; /* source identifier is array, not pHandle */ \
+  cb_data.args.hipArrayCreate.pAllocateArray = (const HIP_ARRAY_DESCRIPTOR*)pAllocateArray; \
+};
+// hipArrayDestroy(hipArray_t array): copies the array handle into cb_data.args.hipArrayDestroy from the same-named identifier at the expansion site.
+#define INIT_hipArrayDestroy_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipArrayDestroy.array = (hipArray_t)array; \
+};
+// hipArrayGetDescriptor(HIP_ARRAY_DESCRIPTOR* pArrayDescriptor, hipArray_t array): fills cb_data.args.hipArrayGetDescriptor from the same-named identifiers at the expansion site.
+#define INIT_hipArrayGetDescriptor_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipArrayGetDescriptor.pArrayDescriptor = (HIP_ARRAY_DESCRIPTOR*)pArrayDescriptor; \
+  cb_data.args.hipArrayGetDescriptor.array = (hipArray_t)array; \
+};
+// hipArrayGetInfo(hipChannelFormatDesc* desc, hipExtent* extent, unsigned int* flags, hipArray_t array): stores the three pointers and the array handle in cb_data.args.hipArrayGetInfo.
+#define INIT_hipArrayGetInfo_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipArrayGetInfo.desc = (hipChannelFormatDesc*)desc; \
+  cb_data.args.hipArrayGetInfo.extent = (hipExtent*)extent; \
+  cb_data.args.hipArrayGetInfo.flags = (unsigned int*)flags; \
+  cb_data.args.hipArrayGetInfo.array = (hipArray_t)array; \
+};
+// hipChooseDeviceR0000(int* device, const hipDeviceProp_tR0000* prop): fills cb_data.args.hipChooseDeviceR0000; note the prop field is read from an identifier named properties.
+#define INIT_hipChooseDeviceR0000_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipChooseDeviceR0000.device = (int*)device; \
+  cb_data.args.hipChooseDeviceR0000.prop = (const hipDeviceProp_tR0000*)properties; /* source identifier is properties, not prop */ \
+};
+// hipChooseDeviceR0600(int* device, const hipDeviceProp_tR0600* prop): fills cb_data.args.hipChooseDeviceR0600; note the prop field is read from an identifier named properties.
+#define INIT_hipChooseDeviceR0600_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipChooseDeviceR0600.device = (int*)device; \
+  cb_data.args.hipChooseDeviceR0600.prop = (const hipDeviceProp_tR0600*)properties; /* source identifier is properties, not prop */ \
+};
+// hipConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem, hipStream_t stream): copies the four values into cb_data.args.hipConfigureCall from the same-named identifiers at the expansion site.
+#define INIT_hipConfigureCall_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipConfigureCall.gridDim = (dim3)gridDim; \
+  cb_data.args.hipConfigureCall.blockDim = (dim3)blockDim; \
+  cb_data.args.hipConfigureCall.sharedMem = (size_t)sharedMem; \
+  cb_data.args.hipConfigureCall.stream = (hipStream_t)stream; \
+};
+// hipCreateSurfaceObject(hipSurfaceObject_t* pSurfObject, const hipResourceDesc* pResDesc): stores both pointers in cb_data.args.hipCreateSurfaceObject from the same-named identifiers at the expansion site.
+#define INIT_hipCreateSurfaceObject_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipCreateSurfaceObject.pSurfObject = (hipSurfaceObject_t*)pSurfObject; \
+  cb_data.args.hipCreateSurfaceObject.pResDesc = (const hipResourceDesc*)pResDesc; \
+};
+// hipCtxCreate(hipCtx_t* ctx, unsigned int flags, hipDevice_t device): fills cb_data.args.hipCtxCreate from the same-named identifiers at the expansion site; ctx is stored as a pointer.
+#define INIT_hipCtxCreate_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipCtxCreate.ctx = (hipCtx_t*)ctx; \
+  cb_data.args.hipCtxCreate.flags = (unsigned int)flags; \
+  cb_data.args.hipCtxCreate.device = (hipDevice_t)device; \
+};
+// hipCtxDestroy(hipCtx_t ctx): copies the context handle into cb_data.args.hipCtxDestroy from the same-named identifier at the expansion site.
+#define INIT_hipCtxDestroy_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipCtxDestroy.ctx = (hipCtx_t)ctx; \
+};
+// hipCtxDisablePeerAccess(hipCtx_t peerCtx): copies the peer context handle into cb_data.args.hipCtxDisablePeerAccess from the same-named identifier at the expansion site.
+#define INIT_hipCtxDisablePeerAccess_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipCtxDisablePeerAccess.peerCtx = (hipCtx_t)peerCtx; \
+};
+// hipCtxEnablePeerAccess(hipCtx_t peerCtx, unsigned int flags): fills cb_data.args.hipCtxEnablePeerAccess from the same-named identifiers at the expansion site.
+#define INIT_hipCtxEnablePeerAccess_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipCtxEnablePeerAccess.peerCtx = (hipCtx_t)peerCtx; \
+  cb_data.args.hipCtxEnablePeerAccess.flags = (unsigned int)flags; \
+};
+// hipCtxGetApiVersion(hipCtx_t ctx, unsigned int* apiVersion): fills cb_data.args.hipCtxGetApiVersion from the same-named identifiers at the expansion site; apiVersion is stored as a pointer.
+#define INIT_hipCtxGetApiVersion_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipCtxGetApiVersion.ctx = (hipCtx_t)ctx; \
+  cb_data.args.hipCtxGetApiVersion.apiVersion = (unsigned int*)apiVersion; \
+};
+// hipCtxGetCacheConfig(hipFuncCache_t* cacheConfig): stores the pointer in cb_data.args.hipCtxGetCacheConfig from the same-named identifier at the expansion site.
+#define INIT_hipCtxGetCacheConfig_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipCtxGetCacheConfig.cacheConfig = (hipFuncCache_t*)cacheConfig; \
+};
+// hipCtxGetCurrent(hipCtx_t* ctx): stores the pointer in cb_data.args.hipCtxGetCurrent from the same-named identifier at the expansion site.
+#define INIT_hipCtxGetCurrent_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipCtxGetCurrent.ctx = (hipCtx_t*)ctx; \
+};
+// hipCtxGetDevice(hipDevice_t* device): stores the pointer in cb_data.args.hipCtxGetDevice from the same-named identifier at the expansion site.
+#define INIT_hipCtxGetDevice_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipCtxGetDevice.device = (hipDevice_t*)device; \
+};
+// hipCtxGetFlags(unsigned int* flags): stores the pointer in cb_data.args.hipCtxGetFlags from the same-named identifier at the expansion site.
+#define INIT_hipCtxGetFlags_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipCtxGetFlags.flags = (unsigned int*)flags; \
+};
+// hipCtxGetSharedMemConfig(hipSharedMemConfig* pConfig): stores the pointer in cb_data.args.hipCtxGetSharedMemConfig from the same-named identifier at the expansion site.
+#define INIT_hipCtxGetSharedMemConfig_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipCtxGetSharedMemConfig.pConfig = (hipSharedMemConfig*)pConfig; \
+};
+// hipCtxPopCurrent(hipCtx_t* ctx): stores the pointer in cb_data.args.hipCtxPopCurrent from the same-named identifier at the expansion site.
+#define INIT_hipCtxPopCurrent_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipCtxPopCurrent.ctx = (hipCtx_t*)ctx; \
+};
+// hipCtxPushCurrent(hipCtx_t ctx): copies the context handle into cb_data.args.hipCtxPushCurrent from the same-named identifier at the expansion site.
+#define INIT_hipCtxPushCurrent_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipCtxPushCurrent.ctx = (hipCtx_t)ctx; \
+};
+// hipCtxSetCacheConfig(hipFuncCache_t cacheConfig): copies the value into cb_data.args.hipCtxSetCacheConfig from the same-named identifier at the expansion site.
+#define INIT_hipCtxSetCacheConfig_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipCtxSetCacheConfig.cacheConfig = (hipFuncCache_t)cacheConfig; \
+};
+// hipCtxSetCurrent(hipCtx_t ctx): copies the context handle into cb_data.args.hipCtxSetCurrent from the same-named identifier at the expansion site.
+#define INIT_hipCtxSetCurrent_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipCtxSetCurrent.ctx = (hipCtx_t)ctx; \
+};
+// hipCtxSetSharedMemConfig(hipSharedMemConfig config): copies the value into cb_data.args.hipCtxSetSharedMemConfig from the same-named identifier at the expansion site.
+#define INIT_hipCtxSetSharedMemConfig_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipCtxSetSharedMemConfig.config = (hipSharedMemConfig)config; \
+};
+// hipCtxSynchronize(): the API takes no arguments, so this macro expands to an empty block and leaves cb_data untouched.
+#define INIT_hipCtxSynchronize_CB_ARGS_DATA(cb_data) { \
+};
+// hipDestroyExternalMemory(hipExternalMemory_t extMem): copies the handle into cb_data.args.hipDestroyExternalMemory from the same-named identifier at the expansion site.
+#define INIT_hipDestroyExternalMemory_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDestroyExternalMemory.extMem = (hipExternalMemory_t)extMem; \
+};
+// hipDestroyExternalSemaphore(hipExternalSemaphore_t extSem): copies the handle into cb_data.args.hipDestroyExternalSemaphore from the same-named identifier at the expansion site.
+#define INIT_hipDestroyExternalSemaphore_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDestroyExternalSemaphore.extSem = (hipExternalSemaphore_t)extSem; \
+};
+// hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject): copies the handle into cb_data.args.hipDestroySurfaceObject from the same-named identifier at the expansion site.
+#define INIT_hipDestroySurfaceObject_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDestroySurfaceObject.surfaceObject = (hipSurfaceObject_t)surfaceObject; \
+};
+// hipDeviceCanAccessPeer(int* canAccessPeer, int deviceId, int peerDeviceId): fills cb_data.args.hipDeviceCanAccessPeer; note the canAccessPeer field is read from an identifier named canAccess.
+#define INIT_hipDeviceCanAccessPeer_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceCanAccessPeer.canAccessPeer = (int*)canAccess; /* source identifier is canAccess, not canAccessPeer */ \
+  cb_data.args.hipDeviceCanAccessPeer.deviceId = (int)deviceId; \
+  cb_data.args.hipDeviceCanAccessPeer.peerDeviceId = (int)peerDeviceId; \
+};
+// hipDeviceComputeCapability(int* major, int* minor, hipDevice_t device): stores the two pointers and the device value in cb_data.args.hipDeviceComputeCapability.
+#define INIT_hipDeviceComputeCapability_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceComputeCapability.major = (int*)major; \
+  cb_data.args.hipDeviceComputeCapability.minor = (int*)minor; \
+  cb_data.args.hipDeviceComputeCapability.device = (hipDevice_t)device; \
+};
+// hipDeviceDisablePeerAccess(int peerDeviceId): copies the value into cb_data.args.hipDeviceDisablePeerAccess from the same-named identifier at the expansion site.
+#define INIT_hipDeviceDisablePeerAccess_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceDisablePeerAccess.peerDeviceId = (int)peerDeviceId; \
+};
+// hipDeviceEnablePeerAccess(int peerDeviceId, unsigned int flags): fills cb_data.args.hipDeviceEnablePeerAccess from the same-named identifiers at the expansion site.
+#define INIT_hipDeviceEnablePeerAccess_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceEnablePeerAccess.peerDeviceId = (int)peerDeviceId; \
+  cb_data.args.hipDeviceEnablePeerAccess.flags = (unsigned int)flags; \
+};
+// hipDeviceGet(hipDevice_t* device, int ordinal): fills cb_data.args.hipDeviceGet; note the ordinal field is read from an identifier named deviceId.
+#define INIT_hipDeviceGet_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceGet.device = (hipDevice_t*)device; \
+  cb_data.args.hipDeviceGet.ordinal = (int)deviceId; /* source identifier is deviceId, not ordinal */ \
+};
+// hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int deviceId): fills cb_data.args.hipDeviceGetAttribute; note the deviceId field is read from an identifier named device.
+#define INIT_hipDeviceGetAttribute_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceGetAttribute.pi = (int*)pi; \
+  cb_data.args.hipDeviceGetAttribute.attr = (hipDeviceAttribute_t)attr; \
+  cb_data.args.hipDeviceGetAttribute.deviceId = (int)device; /* source identifier is device, not deviceId */ \
+};
+// hipDeviceGetByPCIBusId(int* device, const char* pciBusId): fills cb_data.args.hipDeviceGetByPCIBusId; the string is read from an identifier named pciBusIdstr and duplicated with strdup rather than stored by pointer.
+#define INIT_hipDeviceGetByPCIBusId_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceGetByPCIBusId.device = (int*)device; \
+  cb_data.args.hipDeviceGetByPCIBusId.pciBusId = (pciBusIdstr) ? strdup(pciBusIdstr) : NULL; /* heap copy when non-NULL; NOTE(review): the matching free is not visible here - confirm who owns it */ \
+};
+// hipDeviceGetCacheConfig(hipFuncCache_t* cacheConfig): stores the pointer in cb_data.args.hipDeviceGetCacheConfig from the same-named identifier at the expansion site.
+#define INIT_hipDeviceGetCacheConfig_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceGetCacheConfig.cacheConfig = (hipFuncCache_t*)cacheConfig; \
+};
+// hipDeviceGetDefaultMemPool(hipMemPool_t* mem_pool, int device): fills cb_data.args.hipDeviceGetDefaultMemPool from the same-named identifiers at the expansion site; mem_pool is stored as a pointer.
+#define INIT_hipDeviceGetDefaultMemPool_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceGetDefaultMemPool.mem_pool = (hipMemPool_t*)mem_pool; \
+  cb_data.args.hipDeviceGetDefaultMemPool.device = (int)device; \
+};
+// hipDeviceGetGraphMemAttribute(int device, hipGraphMemAttributeType attr, void* value): fills cb_data.args.hipDeviceGetGraphMemAttribute; value is stored as an untyped pointer.
+#define INIT_hipDeviceGetGraphMemAttribute_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceGetGraphMemAttribute.device = (int)device; \
+  cb_data.args.hipDeviceGetGraphMemAttribute.attr = (hipGraphMemAttributeType)attr; \
+  cb_data.args.hipDeviceGetGraphMemAttribute.value = (void*)value; \
+};
+// hipDeviceGetLimit(size_t* pValue, hipLimit_t limit): fills cb_data.args.hipDeviceGetLimit from the same-named identifiers at the expansion site; pValue is stored as a pointer.
+#define INIT_hipDeviceGetLimit_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceGetLimit.pValue = (size_t*)pValue; \
+  cb_data.args.hipDeviceGetLimit.limit = (hipLimit_t)limit; \
+};
+// hipDeviceGetMemPool(hipMemPool_t* mem_pool, int device): fills cb_data.args.hipDeviceGetMemPool from the same-named identifiers at the expansion site; mem_pool is stored as a pointer.
+#define INIT_hipDeviceGetMemPool_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceGetMemPool.mem_pool = (hipMemPool_t*)mem_pool; \
+  cb_data.args.hipDeviceGetMemPool.device = (int)device; \
+};
+// hipDeviceGetName(char* name, int len, hipDevice_t device): fills cb_data.args.hipDeviceGetName; name is stored as the caller's pointer (no string copy is made here).
+#define INIT_hipDeviceGetName_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceGetName.name = (char*)name; \
+  cb_data.args.hipDeviceGetName.len = (int)len; \
+  cb_data.args.hipDeviceGetName.device = (hipDevice_t)device; \
+};
+// hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr, int srcDevice, int dstDevice): fills cb_data.args.hipDeviceGetP2PAttribute; value is stored as a pointer.
+#define INIT_hipDeviceGetP2PAttribute_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceGetP2PAttribute.value = (int*)value; \
+  cb_data.args.hipDeviceGetP2PAttribute.attr = (hipDeviceP2PAttr)attr; \
+  cb_data.args.hipDeviceGetP2PAttribute.srcDevice = (int)srcDevice; \
+  cb_data.args.hipDeviceGetP2PAttribute.dstDevice = (int)dstDevice; \
+};
+// hipDeviceGetPCIBusId(char* pciBusId, int len, int device): fills cb_data.args.hipDeviceGetPCIBusId; pciBusId is stored as the caller's pointer (no string copy is made here).
+#define INIT_hipDeviceGetPCIBusId_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceGetPCIBusId.pciBusId = (char*)pciBusId; \
+  cb_data.args.hipDeviceGetPCIBusId.len = (int)len; \
+  cb_data.args.hipDeviceGetPCIBusId.device = (int)device; \
+};
+// hipDeviceGetSharedMemConfig(hipSharedMemConfig* pConfig): stores the pointer in cb_data.args.hipDeviceGetSharedMemConfig from the same-named identifier at the expansion site.
+#define INIT_hipDeviceGetSharedMemConfig_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceGetSharedMemConfig.pConfig = (hipSharedMemConfig*)pConfig; \
+};
+// hipDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority): stores both pointers in cb_data.args.hipDeviceGetStreamPriorityRange from the same-named identifiers at the expansion site.
+#define INIT_hipDeviceGetStreamPriorityRange_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceGetStreamPriorityRange.leastPriority = (int*)leastPriority; \
+  cb_data.args.hipDeviceGetStreamPriorityRange.greatestPriority = (int*)greatestPriority; \
+};
+// hipDeviceGetUuid(hipUUID* uuid, hipDevice_t device): fills cb_data.args.hipDeviceGetUuid from the same-named identifiers at the expansion site; uuid is stored as a pointer.
+#define INIT_hipDeviceGetUuid_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceGetUuid.uuid = (hipUUID*)uuid; \
+  cb_data.args.hipDeviceGetUuid.device = (hipDevice_t)device; \
+};
+// hipDeviceGraphMemTrim(int device): copies the value into cb_data.args.hipDeviceGraphMemTrim from the same-named identifier at the expansion site.
+#define INIT_hipDeviceGraphMemTrim_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceGraphMemTrim.device = (int)device; \
+};
+// hipDevicePrimaryCtxGetState(hipDevice_t dev, unsigned int* flags, int* active): fills cb_data.args.hipDevicePrimaryCtxGetState; flags and active are stored as pointers.
+#define INIT_hipDevicePrimaryCtxGetState_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDevicePrimaryCtxGetState.dev = (hipDevice_t)dev; \
+  cb_data.args.hipDevicePrimaryCtxGetState.flags = (unsigned int*)flags; \
+  cb_data.args.hipDevicePrimaryCtxGetState.active = (int*)active; \
+};
+// hipDevicePrimaryCtxRelease(hipDevice_t dev): copies the value into cb_data.args.hipDevicePrimaryCtxRelease from the same-named identifier at the expansion site.
+#define INIT_hipDevicePrimaryCtxRelease_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDevicePrimaryCtxRelease.dev = (hipDevice_t)dev; \
+};
+// hipDevicePrimaryCtxReset(hipDevice_t dev): copies the value into cb_data.args.hipDevicePrimaryCtxReset from the same-named identifier at the expansion site.
+#define INIT_hipDevicePrimaryCtxReset_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDevicePrimaryCtxReset.dev = (hipDevice_t)dev; \
+};
+// hipDevicePrimaryCtxRetain(hipCtx_t* pctx, hipDevice_t dev): fills cb_data.args.hipDevicePrimaryCtxRetain from the same-named identifiers at the expansion site; pctx is stored as a pointer.
+#define INIT_hipDevicePrimaryCtxRetain_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDevicePrimaryCtxRetain.pctx = (hipCtx_t*)pctx; \
+  cb_data.args.hipDevicePrimaryCtxRetain.dev = (hipDevice_t)dev; \
+};
+// hipDevicePrimaryCtxSetFlags(hipDevice_t dev, unsigned int flags): fills cb_data.args.hipDevicePrimaryCtxSetFlags from the same-named identifiers at the expansion site.
+#define INIT_hipDevicePrimaryCtxSetFlags_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDevicePrimaryCtxSetFlags.dev = (hipDevice_t)dev; \
+  cb_data.args.hipDevicePrimaryCtxSetFlags.flags = (unsigned int)flags; \
+};
+// hipDeviceReset(): the API takes no arguments, so this macro expands to an empty block and leaves cb_data untouched.
+#define INIT_hipDeviceReset_CB_ARGS_DATA(cb_data) { \
+};
+// hipDeviceSetCacheConfig(hipFuncCache_t cacheConfig): copies the value into cb_data.args.hipDeviceSetCacheConfig from the same-named identifier at the expansion site.
+#define INIT_hipDeviceSetCacheConfig_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceSetCacheConfig.cacheConfig = (hipFuncCache_t)cacheConfig; \
+};
+// hipDeviceSetGraphMemAttribute(int device, hipGraphMemAttributeType attr, void* value): fills cb_data.args.hipDeviceSetGraphMemAttribute; value is stored as an untyped pointer.
+#define INIT_hipDeviceSetGraphMemAttribute_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceSetGraphMemAttribute.device = (int)device; \
+  cb_data.args.hipDeviceSetGraphMemAttribute.attr = (hipGraphMemAttributeType)attr; \
+  cb_data.args.hipDeviceSetGraphMemAttribute.value = (void*)value; \
+};
+// hipDeviceSetLimit(hipLimit_t limit, size_t value): copies both values into cb_data.args.hipDeviceSetLimit from the same-named identifiers at the expansion site.
+#define INIT_hipDeviceSetLimit_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceSetLimit.limit = (hipLimit_t)limit; \
+  cb_data.args.hipDeviceSetLimit.value = (size_t)value; \
+};
+// hipDeviceSetMemPool(int device, hipMemPool_t mem_pool): copies both values into cb_data.args.hipDeviceSetMemPool from the same-named identifiers at the expansion site.
+#define INIT_hipDeviceSetMemPool_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceSetMemPool.device = (int)device; \
+  cb_data.args.hipDeviceSetMemPool.mem_pool = (hipMemPool_t)mem_pool; \
+};
+// hipDeviceSetSharedMemConfig(hipSharedMemConfig config): copies the value into cb_data.args.hipDeviceSetSharedMemConfig from the same-named identifier at the expansion site.
+#define INIT_hipDeviceSetSharedMemConfig_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceSetSharedMemConfig.config = (hipSharedMemConfig)config; \
+};
+// hipDeviceSynchronize(): the API takes no arguments, so this macro expands to an empty block and leaves cb_data untouched.
+#define INIT_hipDeviceSynchronize_CB_ARGS_DATA(cb_data) { \
+};
+// hipDeviceTotalMem(size_t* bytes, hipDevice_t device): fills cb_data.args.hipDeviceTotalMem from the same-named identifiers at the expansion site; bytes is stored as a pointer.
+#define INIT_hipDeviceTotalMem_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDeviceTotalMem.bytes = (size_t*)bytes; \
+  cb_data.args.hipDeviceTotalMem.device = (hipDevice_t)device; \
+};
+// hipDriverGetVersion(int* driverVersion): stores the pointer in cb_data.args.hipDriverGetVersion from the same-named identifier at the expansion site.
+#define INIT_hipDriverGetVersion_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDriverGetVersion.driverVersion = (int*)driverVersion; \
+};
+// hipDrvGraphAddMemFreeNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph, const hipGraphNode_t* dependencies, size_t numDependencies, hipDeviceptr_t dptr): fills cb_data.args.hipDrvGraphAddMemFreeNode; the dependencies array is stored by pointer, not copied.
+#define INIT_hipDrvGraphAddMemFreeNode_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDrvGraphAddMemFreeNode.phGraphNode = (hipGraphNode_t*)phGraphNode; \
+  cb_data.args.hipDrvGraphAddMemFreeNode.hGraph = (hipGraph_t)hGraph; \
+  cb_data.args.hipDrvGraphAddMemFreeNode.dependencies = (const hipGraphNode_t*)dependencies; \
+  cb_data.args.hipDrvGraphAddMemFreeNode.numDependencies = (size_t)numDependencies; \
+  cb_data.args.hipDrvGraphAddMemFreeNode.dptr = (hipDeviceptr_t)dptr; \
+};
+// hipDrvGraphAddMemcpyNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph, const hipGraphNode_t* dependencies, size_t numDependencies, const HIP_MEMCPY3D* copyParams, hipCtx_t ctx): fills cb_data.args.hipDrvGraphAddMemcpyNode; dependencies and copyParams are stored by pointer, not copied.
+#define INIT_hipDrvGraphAddMemcpyNode_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDrvGraphAddMemcpyNode.phGraphNode = (hipGraphNode_t*)phGraphNode; \
+  cb_data.args.hipDrvGraphAddMemcpyNode.hGraph = (hipGraph_t)hGraph; \
+  cb_data.args.hipDrvGraphAddMemcpyNode.dependencies = (const hipGraphNode_t*)dependencies; \
+  cb_data.args.hipDrvGraphAddMemcpyNode.numDependencies = (size_t)numDependencies; \
+  cb_data.args.hipDrvGraphAddMemcpyNode.copyParams = (const HIP_MEMCPY3D*)copyParams; \
+  cb_data.args.hipDrvGraphAddMemcpyNode.ctx = (hipCtx_t)ctx; \
+};
+// hipDrvGraphAddMemsetNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph, const hipGraphNode_t* dependencies, size_t numDependencies, const hipMemsetParams* memsetParams, hipCtx_t ctx): fills cb_data.args.hipDrvGraphAddMemsetNode; dependencies and memsetParams are stored by pointer, not copied.
+#define INIT_hipDrvGraphAddMemsetNode_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDrvGraphAddMemsetNode.phGraphNode = (hipGraphNode_t*)phGraphNode; \
+  cb_data.args.hipDrvGraphAddMemsetNode.hGraph = (hipGraph_t)hGraph; \
+  cb_data.args.hipDrvGraphAddMemsetNode.dependencies = (const hipGraphNode_t*)dependencies; \
+  cb_data.args.hipDrvGraphAddMemsetNode.numDependencies = (size_t)numDependencies; \
+  cb_data.args.hipDrvGraphAddMemsetNode.memsetParams = (const hipMemsetParams*)memsetParams; \
+  cb_data.args.hipDrvGraphAddMemsetNode.ctx = (hipCtx_t)ctx; \
+};
+// hipDrvGraphExecMemcpyNodeSetParams[('hipGraphExec_t', 'hGraphExec'), ('hipGraphNode_t', 'hNode'), ('const HIP_MEMCPY3D*', 'copyParams'), ('hipCtx_t', 'ctx')] -- fills cb_data.args.hipDrvGraphExecMemcpyNodeSetParams from same-named variables at the expansion site.
+#define INIT_hipDrvGraphExecMemcpyNodeSetParams_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDrvGraphExecMemcpyNodeSetParams.hGraphExec = (hipGraphExec_t)hGraphExec; \
+  cb_data.args.hipDrvGraphExecMemcpyNodeSetParams.hNode = (hipGraphNode_t)hNode; \
+  cb_data.args.hipDrvGraphExecMemcpyNodeSetParams.copyParams = (const HIP_MEMCPY3D*)copyParams; \
+  cb_data.args.hipDrvGraphExecMemcpyNodeSetParams.ctx = (hipCtx_t)ctx; \
+};
+// hipDrvGraphExecMemsetNodeSetParams[('hipGraphExec_t', 'hGraphExec'), ('hipGraphNode_t', 'hNode'), ('const hipMemsetParams*', 'memsetParams'), ('hipCtx_t', 'ctx')] -- fills cb_data.args.hipDrvGraphExecMemsetNodeSetParams from same-named variables at the expansion site.
+#define INIT_hipDrvGraphExecMemsetNodeSetParams_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDrvGraphExecMemsetNodeSetParams.hGraphExec = (hipGraphExec_t)hGraphExec; \
+  cb_data.args.hipDrvGraphExecMemsetNodeSetParams.hNode = (hipGraphNode_t)hNode; \
+  cb_data.args.hipDrvGraphExecMemsetNodeSetParams.memsetParams = (const hipMemsetParams*)memsetParams; \
+  cb_data.args.hipDrvGraphExecMemsetNodeSetParams.ctx = (hipCtx_t)ctx; \
+};
+// hipDrvGraphMemcpyNodeGetParams[('hipGraphNode_t', 'hNode'), ('HIP_MEMCPY3D*', 'nodeParams')] -- fills cb_data.args.hipDrvGraphMemcpyNodeGetParams from same-named variables at the expansion site.
+#define INIT_hipDrvGraphMemcpyNodeGetParams_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDrvGraphMemcpyNodeGetParams.hNode = (hipGraphNode_t)hNode; \
+  cb_data.args.hipDrvGraphMemcpyNodeGetParams.nodeParams = (HIP_MEMCPY3D*)nodeParams; \
+};
+// hipDrvGraphMemcpyNodeSetParams[('hipGraphNode_t', 'hNode'), ('const HIP_MEMCPY3D*', 'nodeParams')] -- fills cb_data.args.hipDrvGraphMemcpyNodeSetParams from same-named variables at the expansion site.
+#define INIT_hipDrvGraphMemcpyNodeSetParams_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDrvGraphMemcpyNodeSetParams.hNode = (hipGraphNode_t)hNode; \
+  cb_data.args.hipDrvGraphMemcpyNodeSetParams.nodeParams = (const HIP_MEMCPY3D*)nodeParams; \
+};
+// hipDrvLaunchKernelEx[('const HIP_LAUNCH_CONFIG*', 'config'), ('hipFunction_t', 'f'), ('void**', 'params'), ('void**', 'extra')] -- fills cb_data.args.hipDrvLaunchKernelEx; one source variable is named differently from its field (flagged inline).
+#define INIT_hipDrvLaunchKernelEx_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDrvLaunchKernelEx.config = (const HIP_LAUNCH_CONFIG*)config; \
+  cb_data.args.hipDrvLaunchKernelEx.f = (hipFunction_t)f; \
+  cb_data.args.hipDrvLaunchKernelEx.params = (void**)kernelParams; /* read from `kernelParams`, not `params`; the expansion site must declare that name */ \
+  cb_data.args.hipDrvLaunchKernelEx.extra = (void**)extra; \
+};
+// hipDrvMemcpy2DUnaligned[('const hip_Memcpy2D*', 'pCopy')] -- fills cb_data.args.hipDrvMemcpy2DUnaligned from the same-named variable at the expansion site.
+#define INIT_hipDrvMemcpy2DUnaligned_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDrvMemcpy2DUnaligned.pCopy = (const hip_Memcpy2D*)pCopy; \
+};
+// hipDrvMemcpy3D[('const HIP_MEMCPY3D*', 'pCopy')] -- fills cb_data.args.hipDrvMemcpy3D from the same-named variable at the expansion site.
+#define INIT_hipDrvMemcpy3D_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDrvMemcpy3D.pCopy = (const HIP_MEMCPY3D*)pCopy; \
+};
+// hipDrvMemcpy3DAsync[('const HIP_MEMCPY3D*', 'pCopy'), ('hipStream_t', 'stream')] -- fills cb_data.args.hipDrvMemcpy3DAsync from same-named variables at the expansion site.
+#define INIT_hipDrvMemcpy3DAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDrvMemcpy3DAsync.pCopy = (const HIP_MEMCPY3D*)pCopy; \
+  cb_data.args.hipDrvMemcpy3DAsync.stream = (hipStream_t)stream; \
+};
+// hipDrvPointerGetAttributes[('unsigned int', 'numAttributes'), ('hipPointer_attribute*', 'attributes'), ('void**', 'data'), ('hipDeviceptr_t', 'ptr')] -- fills cb_data.args.hipDrvPointerGetAttributes from same-named variables at the expansion site.
+#define INIT_hipDrvPointerGetAttributes_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipDrvPointerGetAttributes.numAttributes = (unsigned int)numAttributes; \
+  cb_data.args.hipDrvPointerGetAttributes.attributes = (hipPointer_attribute*)attributes; \
+  cb_data.args.hipDrvPointerGetAttributes.data = (void**)data; \
+  cb_data.args.hipDrvPointerGetAttributes.ptr = (hipDeviceptr_t)ptr; \
+};
+// hipEventCreate[('hipEvent_t*', 'event')] -- fills cb_data.args.hipEventCreate from the same-named variable at the expansion site.
+#define INIT_hipEventCreate_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipEventCreate.event = (hipEvent_t*)event; \
+};
+// hipEventCreateWithFlags[('hipEvent_t*', 'event'), ('unsigned int', 'flags')] -- fills cb_data.args.hipEventCreateWithFlags from same-named variables at the expansion site.
+#define INIT_hipEventCreateWithFlags_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipEventCreateWithFlags.event = (hipEvent_t*)event; \
+  cb_data.args.hipEventCreateWithFlags.flags = (unsigned int)flags; \
+};
+// hipEventDestroy[('hipEvent_t', 'event')] -- fills cb_data.args.hipEventDestroy from the same-named variable at the expansion site.
+#define INIT_hipEventDestroy_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipEventDestroy.event = (hipEvent_t)event; \
+};
+// hipEventElapsedTime[('float*', 'ms'), ('hipEvent_t', 'start'), ('hipEvent_t', 'stop')] -- fills cb_data.args.hipEventElapsedTime from same-named variables at the expansion site.
+#define INIT_hipEventElapsedTime_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipEventElapsedTime.ms = (float*)ms; \
+  cb_data.args.hipEventElapsedTime.start = (hipEvent_t)start; \
+  cb_data.args.hipEventElapsedTime.stop = (hipEvent_t)stop; \
+};
+// hipEventQuery[('hipEvent_t', 'event')] -- fills cb_data.args.hipEventQuery from the same-named variable at the expansion site.
+#define INIT_hipEventQuery_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipEventQuery.event = (hipEvent_t)event; \
+};
+// hipEventRecord[('hipEvent_t', 'event'), ('hipStream_t', 'stream')] -- fills cb_data.args.hipEventRecord from same-named variables at the expansion site.
+#define INIT_hipEventRecord_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipEventRecord.event = (hipEvent_t)event; \
+  cb_data.args.hipEventRecord.stream = (hipStream_t)stream; \
+};
+// hipEventRecordWithFlags[('hipEvent_t', 'event'), ('hipStream_t', 'stream'), ('unsigned int', 'flags')] -- fills cb_data.args.hipEventRecordWithFlags from same-named variables at the expansion site.
+#define INIT_hipEventRecordWithFlags_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipEventRecordWithFlags.event = (hipEvent_t)event; \
+  cb_data.args.hipEventRecordWithFlags.stream = (hipStream_t)stream; \
+  cb_data.args.hipEventRecordWithFlags.flags = (unsigned int)flags; \
+};
+// hipEventSynchronize[('hipEvent_t', 'event')] -- fills cb_data.args.hipEventSynchronize from the same-named variable at the expansion site.
+#define INIT_hipEventSynchronize_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipEventSynchronize.event = (hipEvent_t)event; \
+};
+// hipExtGetLastError[] -- the signature lists no arguments, so the macro expands to an empty block and leaves cb_data untouched.
+#define INIT_hipExtGetLastError_CB_ARGS_DATA(cb_data) { \
+};
+// hipExtGetLinkTypeAndHopCount[('int', 'device1'), ('int', 'device2'), ('unsigned int*', 'linktype'), ('unsigned int*', 'hopcount')] -- fills cb_data.args.hipExtGetLinkTypeAndHopCount from same-named variables at the expansion site.
+#define INIT_hipExtGetLinkTypeAndHopCount_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipExtGetLinkTypeAndHopCount.device1 = (int)device1; \
+  cb_data.args.hipExtGetLinkTypeAndHopCount.device2 = (int)device2; \
+  cb_data.args.hipExtGetLinkTypeAndHopCount.linktype = (unsigned int*)linktype; \
+  cb_data.args.hipExtGetLinkTypeAndHopCount.hopcount = (unsigned int*)hopcount; \
+};
+// hipExtLaunchKernel[('const void*', 'function_address'), ('dim3', 'numBlocks'), ('dim3', 'dimBlocks'), ('void**', 'args'), ('size_t', 'sharedMemBytes'), ('hipStream_t', 'stream'), ('hipEvent_t', 'startEvent'), ('hipEvent_t', 'stopEvent'), ('int', 'flags')] -- fills cb_data.args.hipExtLaunchKernel; three source variables are named differently from their fields (flagged inline).
+#define INIT_hipExtLaunchKernel_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipExtLaunchKernel.function_address = (const void*)hostFunction; /* read from `hostFunction`, not `function_address` */ \
+  cb_data.args.hipExtLaunchKernel.numBlocks = (dim3)gridDim; /* read from `gridDim`, not `numBlocks` */ \
+  cb_data.args.hipExtLaunchKernel.dimBlocks = (dim3)blockDim; /* read from `blockDim`, not `dimBlocks` */ \
+  cb_data.args.hipExtLaunchKernel.args = (void**)args; \
+  cb_data.args.hipExtLaunchKernel.sharedMemBytes = (size_t)sharedMemBytes; \
+  cb_data.args.hipExtLaunchKernel.stream = (hipStream_t)stream; \
+  cb_data.args.hipExtLaunchKernel.startEvent = (hipEvent_t)startEvent; \
+  cb_data.args.hipExtLaunchKernel.stopEvent = (hipEvent_t)stopEvent; \
+  cb_data.args.hipExtLaunchKernel.flags = (int)flags; \
+};
+// hipExtLaunchMultiKernelMultiDevice[('hipLaunchParams*', 'launchParamsList'), ('int', 'numDevices'), ('unsigned int', 'flags')] -- fills cb_data.args.hipExtLaunchMultiKernelMultiDevice from same-named variables at the expansion site.
+#define INIT_hipExtLaunchMultiKernelMultiDevice_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipExtLaunchMultiKernelMultiDevice.launchParamsList = (hipLaunchParams*)launchParamsList; \
+  cb_data.args.hipExtLaunchMultiKernelMultiDevice.numDevices = (int)numDevices; \
+  cb_data.args.hipExtLaunchMultiKernelMultiDevice.flags = (unsigned int)flags; \
+};
+// hipExtMallocWithFlags[('void**', 'ptr'), ('size_t', 'sizeBytes'), ('unsigned int', 'flags')] -- fills cb_data.args.hipExtMallocWithFlags from same-named variables at the expansion site.
+#define INIT_hipExtMallocWithFlags_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipExtMallocWithFlags.ptr = (void**)ptr; \
+  cb_data.args.hipExtMallocWithFlags.sizeBytes = (size_t)sizeBytes; \
+  cb_data.args.hipExtMallocWithFlags.flags = (unsigned int)flags; \
+};
+// hipExtModuleLaunchKernel[('hipFunction_t', 'f'), ('unsigned int', 'globalWorkSizeX'), ('unsigned int', 'globalWorkSizeY'), ('unsigned int', 'globalWorkSizeZ'), ('unsigned int', 'localWorkSizeX'), ('unsigned int', 'localWorkSizeY'), ('unsigned int', 'localWorkSizeZ'), ('size_t', 'sharedMemBytes'), ('hipStream_t', 'hStream'), ('void**', 'kernelParams'), ('void**', 'extra'), ('hipEvent_t', 'startEvent'), ('hipEvent_t', 'stopEvent'), ('unsigned int', 'flags')] -- fills cb_data.args.hipExtModuleLaunchKernel from same-named variables at the expansion site.
+#define INIT_hipExtModuleLaunchKernel_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipExtModuleLaunchKernel.f = (hipFunction_t)f; \
+  cb_data.args.hipExtModuleLaunchKernel.globalWorkSizeX = (unsigned int)globalWorkSizeX; \
+  cb_data.args.hipExtModuleLaunchKernel.globalWorkSizeY = (unsigned int)globalWorkSizeY; \
+  cb_data.args.hipExtModuleLaunchKernel.globalWorkSizeZ = (unsigned int)globalWorkSizeZ; \
+  cb_data.args.hipExtModuleLaunchKernel.localWorkSizeX = (unsigned int)localWorkSizeX; \
+  cb_data.args.hipExtModuleLaunchKernel.localWorkSizeY = (unsigned int)localWorkSizeY; \
+  cb_data.args.hipExtModuleLaunchKernel.localWorkSizeZ = (unsigned int)localWorkSizeZ; \
+  cb_data.args.hipExtModuleLaunchKernel.sharedMemBytes = (size_t)sharedMemBytes; \
+  cb_data.args.hipExtModuleLaunchKernel.hStream = (hipStream_t)hStream; \
+  cb_data.args.hipExtModuleLaunchKernel.kernelParams = (void**)kernelParams; \
+  cb_data.args.hipExtModuleLaunchKernel.extra = (void**)extra; \
+  cb_data.args.hipExtModuleLaunchKernel.startEvent = (hipEvent_t)startEvent; \
+  cb_data.args.hipExtModuleLaunchKernel.stopEvent = (hipEvent_t)stopEvent; \
+  cb_data.args.hipExtModuleLaunchKernel.flags = (unsigned int)flags; \
+};
+// hipExtStreamCreateWithCUMask[('hipStream_t*', 'stream'), ('unsigned int', 'cuMaskSize'), ('const unsigned int*', 'cuMask')] -- fills cb_data.args.hipExtStreamCreateWithCUMask from same-named variables at the expansion site.
+#define INIT_hipExtStreamCreateWithCUMask_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipExtStreamCreateWithCUMask.stream = (hipStream_t*)stream; \
+  cb_data.args.hipExtStreamCreateWithCUMask.cuMaskSize = (unsigned int)cuMaskSize; \
+  cb_data.args.hipExtStreamCreateWithCUMask.cuMask = (const unsigned int*)cuMask; \
+};
+// hipExtStreamGetCUMask[('hipStream_t', 'stream'), ('unsigned int', 'cuMaskSize'), ('unsigned int*', 'cuMask')] -- fills cb_data.args.hipExtStreamGetCUMask from same-named variables at the expansion site.
+#define INIT_hipExtStreamGetCUMask_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipExtStreamGetCUMask.stream = (hipStream_t)stream; \
+  cb_data.args.hipExtStreamGetCUMask.cuMaskSize = (unsigned int)cuMaskSize; \
+  cb_data.args.hipExtStreamGetCUMask.cuMask = (unsigned int*)cuMask; \
+};
+// hipExternalMemoryGetMappedBuffer[('void**', 'devPtr'), ('hipExternalMemory_t', 'extMem'), ('const hipExternalMemoryBufferDesc*', 'bufferDesc')] -- fills cb_data.args.hipExternalMemoryGetMappedBuffer from same-named variables at the expansion site.
+#define INIT_hipExternalMemoryGetMappedBuffer_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipExternalMemoryGetMappedBuffer.devPtr = (void**)devPtr; \
+  cb_data.args.hipExternalMemoryGetMappedBuffer.extMem = (hipExternalMemory_t)extMem; \
+  cb_data.args.hipExternalMemoryGetMappedBuffer.bufferDesc = (const hipExternalMemoryBufferDesc*)bufferDesc; \
+};
+// hipExternalMemoryGetMappedMipmappedArray[('hipMipmappedArray_t*', 'mipmap'), ('hipExternalMemory_t', 'extMem'), ('const hipExternalMemoryMipmappedArrayDesc*', 'mipmapDesc')] -- fills cb_data.args.hipExternalMemoryGetMappedMipmappedArray from same-named variables at the expansion site.
+#define INIT_hipExternalMemoryGetMappedMipmappedArray_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipExternalMemoryGetMappedMipmappedArray.mipmap = (hipMipmappedArray_t*)mipmap; \
+  cb_data.args.hipExternalMemoryGetMappedMipmappedArray.extMem = (hipExternalMemory_t)extMem; \
+  cb_data.args.hipExternalMemoryGetMappedMipmappedArray.mipmapDesc = (const hipExternalMemoryMipmappedArrayDesc*)mipmapDesc; \
+};
+// hipFree[('void*', 'ptr')] -- fills cb_data.args.hipFree from the same-named variable at the expansion site.
+#define INIT_hipFree_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipFree.ptr = (void*)ptr; \
+};
+// hipFreeArray[('hipArray_t', 'array')] -- fills cb_data.args.hipFreeArray from the same-named variable at the expansion site.
+#define INIT_hipFreeArray_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipFreeArray.array = (hipArray_t)array; \
+};
+// hipFreeAsync[('void*', 'dev_ptr'), ('hipStream_t', 'stream')] -- fills cb_data.args.hipFreeAsync from same-named variables at the expansion site.
+#define INIT_hipFreeAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipFreeAsync.dev_ptr = (void*)dev_ptr; \
+  cb_data.args.hipFreeAsync.stream = (hipStream_t)stream; \
+};
+// hipFreeHost[('void*', 'ptr')] -- fills cb_data.args.hipFreeHost from the same-named variable at the expansion site.
+#define INIT_hipFreeHost_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipFreeHost.ptr = (void*)ptr; \
+};
+// hipFreeMipmappedArray[('hipMipmappedArray_t', 'mipmappedArray')] -- fills cb_data.args.hipFreeMipmappedArray from the same-named variable at the expansion site.
+#define INIT_hipFreeMipmappedArray_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipFreeMipmappedArray.mipmappedArray = (hipMipmappedArray_t)mipmappedArray; \
+};
+// hipFuncGetAttribute[('int*', 'value'), ('hipFunction_attribute', 'attrib'), ('hipFunction_t', 'hfunc')] -- fills cb_data.args.hipFuncGetAttribute from same-named variables at the expansion site.
+#define INIT_hipFuncGetAttribute_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipFuncGetAttribute.value = (int*)value; \
+  cb_data.args.hipFuncGetAttribute.attrib = (hipFunction_attribute)attrib; \
+  cb_data.args.hipFuncGetAttribute.hfunc = (hipFunction_t)hfunc; \
+};
+// hipFuncGetAttributes[('hipFuncAttributes*', 'attr'), ('const void*', 'func')] -- fills cb_data.args.hipFuncGetAttributes from same-named variables at the expansion site.
+#define INIT_hipFuncGetAttributes_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipFuncGetAttributes.attr = (hipFuncAttributes*)attr; \
+  cb_data.args.hipFuncGetAttributes.func = (const void*)func; \
+};
+// hipFuncSetAttribute[('const void*', 'func'), ('hipFuncAttribute', 'attr'), ('int', 'value')] -- fills cb_data.args.hipFuncSetAttribute from same-named variables at the expansion site.
+#define INIT_hipFuncSetAttribute_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipFuncSetAttribute.func = (const void*)func; \
+  cb_data.args.hipFuncSetAttribute.attr = (hipFuncAttribute)attr; \
+  cb_data.args.hipFuncSetAttribute.value = (int)value; \
+};
+// hipFuncSetCacheConfig[('const void*', 'func'), ('hipFuncCache_t', 'config')] -- fills cb_data.args.hipFuncSetCacheConfig; one source variable is named differently from its field (flagged inline).
+#define INIT_hipFuncSetCacheConfig_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipFuncSetCacheConfig.func = (const void*)func; \
+  cb_data.args.hipFuncSetCacheConfig.config = (hipFuncCache_t)cacheConfig; /* read from `cacheConfig`, not `config` */ \
+};
+// hipFuncSetSharedMemConfig[('const void*', 'func'), ('hipSharedMemConfig', 'config')] -- fills cb_data.args.hipFuncSetSharedMemConfig from same-named variables at the expansion site.
+#define INIT_hipFuncSetSharedMemConfig_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipFuncSetSharedMemConfig.func = (const void*)func; \
+  cb_data.args.hipFuncSetSharedMemConfig.config = (hipSharedMemConfig)config; \
+};
+// hipGLGetDevices[('unsigned int*', 'pHipDeviceCount'), ('int*', 'pHipDevices'), ('unsigned int', 'hipDeviceCount'), ('hipGLDeviceList', 'deviceList')] -- fills cb_data.args.hipGLGetDevices from same-named variables at the expansion site.
+#define INIT_hipGLGetDevices_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGLGetDevices.pHipDeviceCount = (unsigned int*)pHipDeviceCount; \
+  cb_data.args.hipGLGetDevices.pHipDevices = (int*)pHipDevices; \
+  cb_data.args.hipGLGetDevices.hipDeviceCount = (unsigned int)hipDeviceCount; \
+  cb_data.args.hipGLGetDevices.deviceList = (hipGLDeviceList)deviceList; \
+};
+// hipGetChannelDesc[('hipChannelFormatDesc*', 'desc'), ('hipArray_const_t', 'array')] -- fills cb_data.args.hipGetChannelDesc from same-named variables at the expansion site.
+#define INIT_hipGetChannelDesc_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGetChannelDesc.desc = (hipChannelFormatDesc*)desc; \
+  cb_data.args.hipGetChannelDesc.array = (hipArray_const_t)array; \
+};
+// hipGetDevice[('int*', 'deviceId')] -- fills cb_data.args.hipGetDevice from the same-named variable at the expansion site.
+#define INIT_hipGetDevice_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGetDevice.deviceId = (int*)deviceId; \
+};
+// hipGetDeviceCount[('int*', 'count')] -- fills cb_data.args.hipGetDeviceCount from the same-named variable at the expansion site.
+#define INIT_hipGetDeviceCount_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGetDeviceCount.count = (int*)count; \
+};
+// hipGetDeviceFlags[('unsigned int*', 'flags')] -- fills cb_data.args.hipGetDeviceFlags from the same-named variable at the expansion site.
+#define INIT_hipGetDeviceFlags_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGetDeviceFlags.flags = (unsigned int*)flags; \
+};
+// hipGetDevicePropertiesR0000[('hipDeviceProp_tR0000*', 'prop'), ('int', 'device')] -- fills cb_data.args.hipGetDevicePropertiesR0000 from same-named variables at the expansion site.
+#define INIT_hipGetDevicePropertiesR0000_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGetDevicePropertiesR0000.prop = (hipDeviceProp_tR0000*)prop; \
+  cb_data.args.hipGetDevicePropertiesR0000.device = (int)device; \
+};
+// hipGetDevicePropertiesR0600[('hipDeviceProp_tR0600*', 'prop'), ('int', 'deviceId')] -- fills cb_data.args.hipGetDevicePropertiesR0600; one source variable is named differently from its field (flagged inline).
+#define INIT_hipGetDevicePropertiesR0600_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGetDevicePropertiesR0600.prop = (hipDeviceProp_tR0600*)prop; \
+  cb_data.args.hipGetDevicePropertiesR0600.deviceId = (int)device; /* read from `device`, not `deviceId` */ \
+};
+// hipGetDriverEntryPoint[('const char*', 'symbol'), ('void**', 'funcPtr'), ('unsigned long long', 'flags'), ('hipDriverEntryPointQueryResult*', 'driverStatus')] -- fills cb_data.args.hipGetDriverEntryPoint; `symbol` is duplicated and `driverStatus` comes from a differently named variable (flagged inline).
+#define INIT_hipGetDriverEntryPoint_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGetDriverEntryPoint.symbol = (symbol) ? strdup(symbol) : NULL; /* stores a strdup() copy (NULL stays NULL); nothing here frees it -- TODO confirm who releases it */ \
+  cb_data.args.hipGetDriverEntryPoint.funcPtr = (void**)funcPtr; \
+  cb_data.args.hipGetDriverEntryPoint.flags = (unsigned long long)flags; \
+  cb_data.args.hipGetDriverEntryPoint.driverStatus = (hipDriverEntryPointQueryResult*)status; /* read from `status`, not `driverStatus` */ \
+};
+// hipGetFuncBySymbol[('hipFunction_t*', 'functionPtr'), ('const void*', 'symbolPtr')] -- fills cb_data.args.hipGetFuncBySymbol from same-named variables at the expansion site.
+#define INIT_hipGetFuncBySymbol_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGetFuncBySymbol.functionPtr = (hipFunction_t*)functionPtr; \
+  cb_data.args.hipGetFuncBySymbol.symbolPtr = (const void*)symbolPtr; \
+};
+// hipGetLastError[] -- the signature lists no arguments, so the macro expands to an empty block and leaves cb_data untouched.
+#define INIT_hipGetLastError_CB_ARGS_DATA(cb_data) { \
+};
+// hipGetMipmappedArrayLevel[('hipArray_t*', 'levelArray'), ('hipMipmappedArray_const_t', 'mipmappedArray'), ('unsigned int', 'level')] -- fills cb_data.args.hipGetMipmappedArrayLevel from same-named variables at the expansion site.
+#define INIT_hipGetMipmappedArrayLevel_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGetMipmappedArrayLevel.levelArray = (hipArray_t*)levelArray; \
+  cb_data.args.hipGetMipmappedArrayLevel.mipmappedArray = (hipMipmappedArray_const_t)mipmappedArray; \
+  cb_data.args.hipGetMipmappedArrayLevel.level = (unsigned int)level; \
+};
+// hipGetProcAddress[('const char*', 'symbol'), ('void**', 'pfn'), ('int', 'hipVersion'), ('uint64_t', 'flags'), ('hipDriverProcAddressQueryResult*', 'symbolStatus')] -- fills cb_data.args.hipGetProcAddress from same-named variables; `symbol` is duplicated rather than aliased (flagged inline).
+#define INIT_hipGetProcAddress_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGetProcAddress.symbol = (symbol) ? strdup(symbol) : NULL; /* stores a strdup() copy (NULL stays NULL); nothing here frees it -- TODO confirm who releases it */ \
+  cb_data.args.hipGetProcAddress.pfn = (void**)pfn; \
+  cb_data.args.hipGetProcAddress.hipVersion = (int)hipVersion; \
+  cb_data.args.hipGetProcAddress.flags = (uint64_t)flags; \
+  cb_data.args.hipGetProcAddress.symbolStatus = (hipDriverProcAddressQueryResult*)symbolStatus; \
+};
+// hipGetSymbolAddress[('void**', 'devPtr'), ('const void*', 'symbol')] -- fills cb_data.args.hipGetSymbolAddress from same-named variables at the expansion site.
+#define INIT_hipGetSymbolAddress_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGetSymbolAddress.devPtr = (void**)devPtr; \
+  cb_data.args.hipGetSymbolAddress.symbol = (const void*)symbol; \
+};
+// hipGetSymbolSize[('size_t*', 'size'), ('const void*', 'symbol')] -- fills cb_data.args.hipGetSymbolSize; one source variable is named differently from its field (flagged inline).
+#define INIT_hipGetSymbolSize_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGetSymbolSize.size = (size_t*)sizePtr; /* read from `sizePtr`, not `size` */ \
+  cb_data.args.hipGetSymbolSize.symbol = (const void*)symbol; \
+};
+// hipGraphAddBatchMemOpNode[('hipGraphNode_t*', 'phGraphNode'), ('hipGraph_t', 'hGraph'), ('const hipGraphNode_t*', 'dependencies'), ('size_t', 'numDependencies'), ('const hipBatchMemOpNodeParams*', 'nodeParams')] -- fills cb_data.args.hipGraphAddBatchMemOpNode from same-named variables at the expansion site.
+#define INIT_hipGraphAddBatchMemOpNode_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGraphAddBatchMemOpNode.phGraphNode = (hipGraphNode_t*)phGraphNode; \
+  cb_data.args.hipGraphAddBatchMemOpNode.hGraph = (hipGraph_t)hGraph; \
+  cb_data.args.hipGraphAddBatchMemOpNode.dependencies = (const hipGraphNode_t*)dependencies; \
+  cb_data.args.hipGraphAddBatchMemOpNode.numDependencies = (size_t)numDependencies; \
+  cb_data.args.hipGraphAddBatchMemOpNode.nodeParams = (const hipBatchMemOpNodeParams*)nodeParams; \
+};
+// hipGraphAddChildGraphNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('hipGraph_t', 'childGraph')] -- fills cb_data.args.hipGraphAddChildGraphNode from same-named variables at the expansion site.
+#define INIT_hipGraphAddChildGraphNode_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGraphAddChildGraphNode.pGraphNode = (hipGraphNode_t*)pGraphNode; \
+  cb_data.args.hipGraphAddChildGraphNode.graph = (hipGraph_t)graph; \
+  cb_data.args.hipGraphAddChildGraphNode.pDependencies = (const hipGraphNode_t*)pDependencies; \
+  cb_data.args.hipGraphAddChildGraphNode.numDependencies = (size_t)numDependencies; \
+  cb_data.args.hipGraphAddChildGraphNode.childGraph = (hipGraph_t)childGraph; \
+};
+// hipGraphAddDependencies[('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'from'), ('const hipGraphNode_t*', 'to'), ('size_t', 'numDependencies')] -- fills cb_data.args.hipGraphAddDependencies from same-named variables at the expansion site.
+#define INIT_hipGraphAddDependencies_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGraphAddDependencies.graph = (hipGraph_t)graph; \
+  cb_data.args.hipGraphAddDependencies.from = (const hipGraphNode_t*)from; \
+  cb_data.args.hipGraphAddDependencies.to = (const hipGraphNode_t*)to; \
+  cb_data.args.hipGraphAddDependencies.numDependencies = (size_t)numDependencies; \
+};
+// hipGraphAddEmptyNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies')] -- fills cb_data.args.hipGraphAddEmptyNode from same-named variables at the expansion site.
+#define INIT_hipGraphAddEmptyNode_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGraphAddEmptyNode.pGraphNode = (hipGraphNode_t*)pGraphNode; \
+  cb_data.args.hipGraphAddEmptyNode.graph = (hipGraph_t)graph; \
+  cb_data.args.hipGraphAddEmptyNode.pDependencies = (const hipGraphNode_t*)pDependencies; \
+  cb_data.args.hipGraphAddEmptyNode.numDependencies = (size_t)numDependencies; \
+};
+// hipGraphAddEventRecordNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('hipEvent_t', 'event')] -- fills cb_data.args.hipGraphAddEventRecordNode from same-named variables at the expansion site.
+#define INIT_hipGraphAddEventRecordNode_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGraphAddEventRecordNode.pGraphNode = (hipGraphNode_t*)pGraphNode; \
+  cb_data.args.hipGraphAddEventRecordNode.graph = (hipGraph_t)graph; \
+  cb_data.args.hipGraphAddEventRecordNode.pDependencies = (const hipGraphNode_t*)pDependencies; \
+  cb_data.args.hipGraphAddEventRecordNode.numDependencies = (size_t)numDependencies; \
+  cb_data.args.hipGraphAddEventRecordNode.event = (hipEvent_t)event; \
+};
+// hipGraphAddEventWaitNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('hipEvent_t', 'event')] -- fills cb_data.args.hipGraphAddEventWaitNode from same-named variables at the expansion site.
+#define INIT_hipGraphAddEventWaitNode_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGraphAddEventWaitNode.pGraphNode = (hipGraphNode_t*)pGraphNode; \
+  cb_data.args.hipGraphAddEventWaitNode.graph = (hipGraph_t)graph; \
+  cb_data.args.hipGraphAddEventWaitNode.pDependencies = (const hipGraphNode_t*)pDependencies; \
+  cb_data.args.hipGraphAddEventWaitNode.numDependencies = (size_t)numDependencies; \
+  cb_data.args.hipGraphAddEventWaitNode.event = (hipEvent_t)event; \
+};
+// hipGraphAddExternalSemaphoresSignalNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('const hipExternalSemaphoreSignalNodeParams*', 'nodeParams')] -- fills cb_data.args.hipGraphAddExternalSemaphoresSignalNode from same-named variables at the expansion site.
+#define INIT_hipGraphAddExternalSemaphoresSignalNode_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGraphAddExternalSemaphoresSignalNode.pGraphNode = (hipGraphNode_t*)pGraphNode; \
+  cb_data.args.hipGraphAddExternalSemaphoresSignalNode.graph = (hipGraph_t)graph; \
+  cb_data.args.hipGraphAddExternalSemaphoresSignalNode.pDependencies = (const hipGraphNode_t*)pDependencies; \
+  cb_data.args.hipGraphAddExternalSemaphoresSignalNode.numDependencies = (size_t)numDependencies; \
+  cb_data.args.hipGraphAddExternalSemaphoresSignalNode.nodeParams = (const hipExternalSemaphoreSignalNodeParams*)nodeParams; \
+};
+// hipGraphAddExternalSemaphoresWaitNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('const hipExternalSemaphoreWaitNodeParams*', 'nodeParams')] -- fills cb_data.args.hipGraphAddExternalSemaphoresWaitNode from same-named variables at the expansion site.
+#define INIT_hipGraphAddExternalSemaphoresWaitNode_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGraphAddExternalSemaphoresWaitNode.pGraphNode = (hipGraphNode_t*)pGraphNode; \
+  cb_data.args.hipGraphAddExternalSemaphoresWaitNode.graph = (hipGraph_t)graph; \
+  cb_data.args.hipGraphAddExternalSemaphoresWaitNode.pDependencies = (const hipGraphNode_t*)pDependencies; \
+  cb_data.args.hipGraphAddExternalSemaphoresWaitNode.numDependencies = (size_t)numDependencies; \
+  cb_data.args.hipGraphAddExternalSemaphoresWaitNode.nodeParams = (const hipExternalSemaphoreWaitNodeParams*)nodeParams; \
+};
+// hipGraphAddHostNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('const hipHostNodeParams*', 'pNodeParams')] -- fills cb_data.args.hipGraphAddHostNode from same-named variables at the expansion site.
+#define INIT_hipGraphAddHostNode_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGraphAddHostNode.pGraphNode = (hipGraphNode_t*)pGraphNode; \
+  cb_data.args.hipGraphAddHostNode.graph = (hipGraph_t)graph; \
+  cb_data.args.hipGraphAddHostNode.pDependencies = (const hipGraphNode_t*)pDependencies; \
+  cb_data.args.hipGraphAddHostNode.numDependencies = (size_t)numDependencies; \
+  cb_data.args.hipGraphAddHostNode.pNodeParams = (const hipHostNodeParams*)pNodeParams; \
+};
+// hipGraphAddKernelNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('const hipKernelNodeParams*', 'pNodeParams')] -- fills cb_data.args.hipGraphAddKernelNode from same-named variables at the expansion site.
+#define INIT_hipGraphAddKernelNode_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGraphAddKernelNode.pGraphNode = (hipGraphNode_t*)pGraphNode; \
+  cb_data.args.hipGraphAddKernelNode.graph = (hipGraph_t)graph; \
+  cb_data.args.hipGraphAddKernelNode.pDependencies = (const hipGraphNode_t*)pDependencies; \
+  cb_data.args.hipGraphAddKernelNode.numDependencies = (size_t)numDependencies; \
+  cb_data.args.hipGraphAddKernelNode.pNodeParams = (const hipKernelNodeParams*)pNodeParams; \
+};
+// hipGraphAddMemAllocNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('hipMemAllocNodeParams*', 'pNodeParams')] -- fills cb_data.args.hipGraphAddMemAllocNode from same-named variables at the expansion site.
+#define INIT_hipGraphAddMemAllocNode_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGraphAddMemAllocNode.pGraphNode = (hipGraphNode_t*)pGraphNode; \
+  cb_data.args.hipGraphAddMemAllocNode.graph = (hipGraph_t)graph; \
+  cb_data.args.hipGraphAddMemAllocNode.pDependencies = (const hipGraphNode_t*)pDependencies; \
+  cb_data.args.hipGraphAddMemAllocNode.numDependencies = (size_t)numDependencies; \
+  cb_data.args.hipGraphAddMemAllocNode.pNodeParams = (hipMemAllocNodeParams*)pNodeParams; \
+};
+// hipGraphAddMemFreeNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('void*', 'dev_ptr')] -- fills cb_data.args.hipGraphAddMemFreeNode from same-named variables at the expansion site.
+#define INIT_hipGraphAddMemFreeNode_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGraphAddMemFreeNode.pGraphNode = (hipGraphNode_t*)pGraphNode; \
+  cb_data.args.hipGraphAddMemFreeNode.graph = (hipGraph_t)graph; \
+  cb_data.args.hipGraphAddMemFreeNode.pDependencies = (const hipGraphNode_t*)pDependencies; \
+  cb_data.args.hipGraphAddMemFreeNode.numDependencies = (size_t)numDependencies; \
+  cb_data.args.hipGraphAddMemFreeNode.dev_ptr = (void*)dev_ptr; \
+};
+// hipGraphAddMemcpyNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('const hipMemcpy3DParms*', 'pCopyParams')] -- fills cb_data.args.hipGraphAddMemcpyNode from same-named variables at the expansion site.
+#define INIT_hipGraphAddMemcpyNode_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGraphAddMemcpyNode.pGraphNode = (hipGraphNode_t*)pGraphNode; \
+  cb_data.args.hipGraphAddMemcpyNode.graph = (hipGraph_t)graph; \
+  cb_data.args.hipGraphAddMemcpyNode.pDependencies = (const hipGraphNode_t*)pDependencies; \
+  cb_data.args.hipGraphAddMemcpyNode.numDependencies = (size_t)numDependencies; \
+  cb_data.args.hipGraphAddMemcpyNode.pCopyParams = (const hipMemcpy3DParms*)pCopyParams; \
+};
+// hipGraphAddMemcpyNode1D[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('void*', 'dst'), ('const void*', 'src'), ('size_t', 'count'), ('hipMemcpyKind', 'kind')] -- fills cb_data.args.hipGraphAddMemcpyNode1D from same-named variables at the expansion site.
+#define INIT_hipGraphAddMemcpyNode1D_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGraphAddMemcpyNode1D.pGraphNode = (hipGraphNode_t*)pGraphNode; \
+  cb_data.args.hipGraphAddMemcpyNode1D.graph = (hipGraph_t)graph; \
+  cb_data.args.hipGraphAddMemcpyNode1D.pDependencies = (const hipGraphNode_t*)pDependencies; \
+  cb_data.args.hipGraphAddMemcpyNode1D.numDependencies = (size_t)numDependencies; \
+  cb_data.args.hipGraphAddMemcpyNode1D.dst = (void*)dst; \
+  cb_data.args.hipGraphAddMemcpyNode1D.src = (const void*)src; \
+  cb_data.args.hipGraphAddMemcpyNode1D.count = (size_t)count; \
+  cb_data.args.hipGraphAddMemcpyNode1D.kind = (hipMemcpyKind)kind; \
+};
+// hipGraphAddMemcpyNodeFromSymbol[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('void*', 'dst'), ('const void*', 'symbol'), ('size_t', 'count'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind')] -- fills cb_data.args.hipGraphAddMemcpyNodeFromSymbol from same-named variables at the expansion site.
+#define INIT_hipGraphAddMemcpyNodeFromSymbol_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGraphAddMemcpyNodeFromSymbol.pGraphNode = (hipGraphNode_t*)pGraphNode; \
+  cb_data.args.hipGraphAddMemcpyNodeFromSymbol.graph = (hipGraph_t)graph; \
+  cb_data.args.hipGraphAddMemcpyNodeFromSymbol.pDependencies = (const hipGraphNode_t*)pDependencies; \
+  cb_data.args.hipGraphAddMemcpyNodeFromSymbol.numDependencies = (size_t)numDependencies; \
+  cb_data.args.hipGraphAddMemcpyNodeFromSymbol.dst = (void*)dst; \
+  cb_data.args.hipGraphAddMemcpyNodeFromSymbol.symbol = (const void*)symbol; \
+  cb_data.args.hipGraphAddMemcpyNodeFromSymbol.count = (size_t)count; \
+  cb_data.args.hipGraphAddMemcpyNodeFromSymbol.offset = (size_t)offset; \
+  cb_data.args.hipGraphAddMemcpyNodeFromSymbol.kind = (hipMemcpyKind)kind; \
+};
+// hipGraphAddMemcpyNodeToSymbol[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('const void*', 'symbol'), ('const void*', 'src'), ('size_t', 'count'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind')] -- fills cb_data.args.hipGraphAddMemcpyNodeToSymbol from same-named variables at the expansion site.
+#define INIT_hipGraphAddMemcpyNodeToSymbol_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGraphAddMemcpyNodeToSymbol.pGraphNode = (hipGraphNode_t*)pGraphNode; \
+  cb_data.args.hipGraphAddMemcpyNodeToSymbol.graph = (hipGraph_t)graph; \
+  cb_data.args.hipGraphAddMemcpyNodeToSymbol.pDependencies = (const hipGraphNode_t*)pDependencies; \
+  cb_data.args.hipGraphAddMemcpyNodeToSymbol.numDependencies = (size_t)numDependencies; \
+  cb_data.args.hipGraphAddMemcpyNodeToSymbol.symbol = (const void*)symbol; \
+  cb_data.args.hipGraphAddMemcpyNodeToSymbol.src = (const void*)src; \
+  cb_data.args.hipGraphAddMemcpyNodeToSymbol.count = (size_t)count; \
+  cb_data.args.hipGraphAddMemcpyNodeToSymbol.offset = (size_t)offset; \
+  cb_data.args.hipGraphAddMemcpyNodeToSymbol.kind = (hipMemcpyKind)kind; \
+};
+// hipGraphAddMemsetNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('const hipMemsetParams*', 'pMemsetParams')] -- fills cb_data.args.hipGraphAddMemsetNode from same-named variables at the expansion site.
+#define INIT_hipGraphAddMemsetNode_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGraphAddMemsetNode.pGraphNode = (hipGraphNode_t*)pGraphNode; \
+  cb_data.args.hipGraphAddMemsetNode.graph = (hipGraph_t)graph; \
+  cb_data.args.hipGraphAddMemsetNode.pDependencies = (const hipGraphNode_t*)pDependencies; \
+  cb_data.args.hipGraphAddMemsetNode.numDependencies = (size_t)numDependencies; \
+  cb_data.args.hipGraphAddMemsetNode.pMemsetParams = (const hipMemsetParams*)pMemsetParams; \
+};
+// hipGraphAddNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('hipGraphNodeParams*', 'nodeParams')] -- fills cb_data.args.hipGraphAddNode from same-named variables at the expansion site.
+#define INIT_hipGraphAddNode_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipGraphAddNode.pGraphNode = (hipGraphNode_t*)pGraphNode; \
+  cb_data.args.hipGraphAddNode.graph = (hipGraph_t)graph; \
+  cb_data.args.hipGraphAddNode.pDependencies = (const hipGraphNode_t*)pDependencies; \
+  cb_data.args.hipGraphAddNode.numDependencies = (size_t)numDependencies; \
+  cb_data.args.hipGraphAddNode.nodeParams = (hipGraphNodeParams*)nodeParams; \
+};
+// hipGraphBatchMemOpNodeGetParams(hipGraphNode_t hNode, hipBatchMemOpNodeParams* nodeParams_out): stores each argument, cast to its declared type, in cb_rec.args.hipGraphBatchMemOpNodeGetParams.
+#define INIT_hipGraphBatchMemOpNodeGetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphBatchMemOpNodeGetParams.hNode = (hipGraphNode_t)hNode; \
+  cb_rec.args.hipGraphBatchMemOpNodeGetParams.nodeParams_out = (hipBatchMemOpNodeParams*)nodeParams_out; \
+};
+// hipGraphBatchMemOpNodeSetParams(hipGraphNode_t hNode, hipBatchMemOpNodeParams* nodeParams): stores each argument, cast to its declared type, in cb_rec.args.hipGraphBatchMemOpNodeSetParams.
+#define INIT_hipGraphBatchMemOpNodeSetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphBatchMemOpNodeSetParams.hNode = (hipGraphNode_t)hNode; \
+  cb_rec.args.hipGraphBatchMemOpNodeSetParams.nodeParams = (hipBatchMemOpNodeParams*)nodeParams; \
+};
+// hipGraphChildGraphNodeGetGraph(hipGraphNode_t node, hipGraph_t* pGraph): stores each argument, cast to its declared type, in cb_rec.args.hipGraphChildGraphNodeGetGraph.
+#define INIT_hipGraphChildGraphNodeGetGraph_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphChildGraphNodeGetGraph.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphChildGraphNodeGetGraph.pGraph = (hipGraph_t*)pGraph; \
+};
+// hipGraphClone(hipGraph_t* pGraphClone, hipGraph_t originalGraph): stores each argument, cast to its declared type, in cb_rec.args.hipGraphClone.
+#define INIT_hipGraphClone_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphClone.pGraphClone = (hipGraph_t*)pGraphClone; \
+  cb_rec.args.hipGraphClone.originalGraph = (hipGraph_t)originalGraph; \
+};
+// hipGraphCreate(hipGraph_t* pGraph, unsigned int flags): stores each argument, cast to its declared type, in cb_rec.args.hipGraphCreate.
+#define INIT_hipGraphCreate_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphCreate.pGraph = (hipGraph_t*)pGraph; \
+  cb_rec.args.hipGraphCreate.flags = (unsigned int)flags; \
+};
+// hipGraphDebugDotPrint(hipGraph_t graph, const char* path, unsigned int flags): stores graph and flags by value and a strdup() copy of path (NULL when path is NULL) in cb_rec.args.hipGraphDebugDotPrint. NOTE(review): the copy is heap-allocated; where it is freed is not visible here -- confirm.
+#define INIT_hipGraphDebugDotPrint_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphDebugDotPrint.graph = (hipGraph_t)graph; \
+  cb_rec.args.hipGraphDebugDotPrint.path = (path) ? strdup(path) : NULL; \
+  cb_rec.args.hipGraphDebugDotPrint.flags = (unsigned int)flags; \
+};
+// hipGraphDestroy(hipGraph_t graph): stores the argument, cast to its declared type, in cb_rec.args.hipGraphDestroy.
+#define INIT_hipGraphDestroy_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphDestroy.graph = (hipGraph_t)graph; \
+};
+// hipGraphDestroyNode(hipGraphNode_t node): stores the argument, cast to its declared type, in cb_rec.args.hipGraphDestroyNode.
+#define INIT_hipGraphDestroyNode_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphDestroyNode.node = (hipGraphNode_t)node; \
+};
+// hipGraphEventRecordNodeGetEvent(hipGraphNode_t node, hipEvent_t* event_out): stores each argument, cast to its declared type, in cb_rec.args.hipGraphEventRecordNodeGetEvent.
+#define INIT_hipGraphEventRecordNodeGetEvent_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphEventRecordNodeGetEvent.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphEventRecordNodeGetEvent.event_out = (hipEvent_t*)event_out; \
+};
+// hipGraphEventRecordNodeSetEvent(hipGraphNode_t node, hipEvent_t event): stores each argument, cast to its declared type, in cb_rec.args.hipGraphEventRecordNodeSetEvent.
+#define INIT_hipGraphEventRecordNodeSetEvent_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphEventRecordNodeSetEvent.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphEventRecordNodeSetEvent.event = (hipEvent_t)event; \
+};
+// hipGraphEventWaitNodeGetEvent(hipGraphNode_t node, hipEvent_t* event_out): stores each argument, cast to its declared type, in cb_rec.args.hipGraphEventWaitNodeGetEvent.
+#define INIT_hipGraphEventWaitNodeGetEvent_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphEventWaitNodeGetEvent.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphEventWaitNodeGetEvent.event_out = (hipEvent_t*)event_out; \
+};
+// hipGraphEventWaitNodeSetEvent(hipGraphNode_t node, hipEvent_t event): stores each argument, cast to its declared type, in cb_rec.args.hipGraphEventWaitNodeSetEvent.
+#define INIT_hipGraphEventWaitNodeSetEvent_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphEventWaitNodeSetEvent.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphEventWaitNodeSetEvent.event = (hipEvent_t)event; \
+};
+// hipGraphExecBatchMemOpNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, const hipBatchMemOpNodeParams* nodeParams): stores each argument, cast to its declared type, in cb_rec.args.hipGraphExecBatchMemOpNodeSetParams.
+#define INIT_hipGraphExecBatchMemOpNodeSetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphExecBatchMemOpNodeSetParams.hGraphExec = (hipGraphExec_t)hGraphExec; \
+  cb_rec.args.hipGraphExecBatchMemOpNodeSetParams.hNode = (hipGraphNode_t)hNode; \
+  cb_rec.args.hipGraphExecBatchMemOpNodeSetParams.nodeParams = (const hipBatchMemOpNodeParams*)nodeParams; \
+};
+// hipGraphExecChildGraphNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node, hipGraph_t childGraph): stores each argument, cast to its declared type, in cb_rec.args.hipGraphExecChildGraphNodeSetParams.
+#define INIT_hipGraphExecChildGraphNodeSetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphExecChildGraphNodeSetParams.hGraphExec = (hipGraphExec_t)hGraphExec; \
+  cb_rec.args.hipGraphExecChildGraphNodeSetParams.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphExecChildGraphNodeSetParams.childGraph = (hipGraph_t)childGraph; \
+};
+// hipGraphExecDestroy(hipGraphExec_t graphExec): stores the handle in cb_rec.args.hipGraphExecDestroy.graphExec. NOTE(review): the value is read from an identifier named pGraphExec, not graphExec as in the signature -- confirm that matches the name in scope where this macro is expanded.
+#define INIT_hipGraphExecDestroy_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphExecDestroy.graphExec = (hipGraphExec_t)pGraphExec; \
+};
+// hipGraphExecEventRecordNodeSetEvent(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, hipEvent_t event): stores each argument, cast to its declared type, in cb_rec.args.hipGraphExecEventRecordNodeSetEvent.
+#define INIT_hipGraphExecEventRecordNodeSetEvent_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphExecEventRecordNodeSetEvent.hGraphExec = (hipGraphExec_t)hGraphExec; \
+  cb_rec.args.hipGraphExecEventRecordNodeSetEvent.hNode = (hipGraphNode_t)hNode; \
+  cb_rec.args.hipGraphExecEventRecordNodeSetEvent.event = (hipEvent_t)event; \
+};
+// hipGraphExecEventWaitNodeSetEvent(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, hipEvent_t event): stores each argument, cast to its declared type, in cb_rec.args.hipGraphExecEventWaitNodeSetEvent.
+#define INIT_hipGraphExecEventWaitNodeSetEvent_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphExecEventWaitNodeSetEvent.hGraphExec = (hipGraphExec_t)hGraphExec; \
+  cb_rec.args.hipGraphExecEventWaitNodeSetEvent.hNode = (hipGraphNode_t)hNode; \
+  cb_rec.args.hipGraphExecEventWaitNodeSetEvent.event = (hipEvent_t)event; \
+};
+// hipGraphExecExternalSemaphoresSignalNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, const hipExternalSemaphoreSignalNodeParams* nodeParams): stores each argument, cast to its declared type, in cb_rec.args.hipGraphExecExternalSemaphoresSignalNodeSetParams.
+#define INIT_hipGraphExecExternalSemaphoresSignalNodeSetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphExecExternalSemaphoresSignalNodeSetParams.hGraphExec = (hipGraphExec_t)hGraphExec; \
+  cb_rec.args.hipGraphExecExternalSemaphoresSignalNodeSetParams.hNode = (hipGraphNode_t)hNode; \
+  cb_rec.args.hipGraphExecExternalSemaphoresSignalNodeSetParams.nodeParams = (const hipExternalSemaphoreSignalNodeParams*)nodeParams; \
+};
+// hipGraphExecExternalSemaphoresWaitNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, const hipExternalSemaphoreWaitNodeParams* nodeParams): stores each argument, cast to its declared type, in cb_rec.args.hipGraphExecExternalSemaphoresWaitNodeSetParams.
+#define INIT_hipGraphExecExternalSemaphoresWaitNodeSetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphExecExternalSemaphoresWaitNodeSetParams.hGraphExec = (hipGraphExec_t)hGraphExec; \
+  cb_rec.args.hipGraphExecExternalSemaphoresWaitNodeSetParams.hNode = (hipGraphNode_t)hNode; \
+  cb_rec.args.hipGraphExecExternalSemaphoresWaitNodeSetParams.nodeParams = (const hipExternalSemaphoreWaitNodeParams*)nodeParams; \
+};
+// hipGraphExecGetFlags(hipGraphExec_t graphExec, unsigned long long* flags): stores each argument, cast to its declared type, in cb_rec.args.hipGraphExecGetFlags.
+#define INIT_hipGraphExecGetFlags_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphExecGetFlags.graphExec = (hipGraphExec_t)graphExec; \
+  cb_rec.args.hipGraphExecGetFlags.flags = (unsigned long long*)flags; \
+};
+// hipGraphExecHostNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node, const hipHostNodeParams* pNodeParams): stores each argument, cast to its declared type, in cb_rec.args.hipGraphExecHostNodeSetParams.
+#define INIT_hipGraphExecHostNodeSetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphExecHostNodeSetParams.hGraphExec = (hipGraphExec_t)hGraphExec; \
+  cb_rec.args.hipGraphExecHostNodeSetParams.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphExecHostNodeSetParams.pNodeParams = (const hipHostNodeParams*)pNodeParams; \
+};
+// hipGraphExecKernelNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node, const hipKernelNodeParams* pNodeParams): stores each argument, cast to its declared type, in cb_rec.args.hipGraphExecKernelNodeSetParams.
+#define INIT_hipGraphExecKernelNodeSetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphExecKernelNodeSetParams.hGraphExec = (hipGraphExec_t)hGraphExec; \
+  cb_rec.args.hipGraphExecKernelNodeSetParams.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphExecKernelNodeSetParams.pNodeParams = (const hipKernelNodeParams*)pNodeParams; \
+};
+// hipGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node, hipMemcpy3DParms* pNodeParams): stores each argument, cast to its declared type, in cb_rec.args.hipGraphExecMemcpyNodeSetParams.
+#define INIT_hipGraphExecMemcpyNodeSetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParams.hGraphExec = (hipGraphExec_t)hGraphExec; \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParams.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParams.pNodeParams = (hipMemcpy3DParms*)pNodeParams; \
+};
+// hipGraphExecMemcpyNodeSetParams1D(hipGraphExec_t hGraphExec, hipGraphNode_t node, void* dst, const void* src, size_t count, hipMemcpyKind kind): stores each argument, cast to its declared type, in cb_rec.args.hipGraphExecMemcpyNodeSetParams1D.
+#define INIT_hipGraphExecMemcpyNodeSetParams1D_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParams1D.hGraphExec = (hipGraphExec_t)hGraphExec; \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParams1D.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParams1D.dst = (void*)dst; \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParams1D.src = (const void*)src; \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParams1D.count = (size_t)count; \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParams1D.kind = (hipMemcpyKind)kind; \
+};
+// hipGraphExecMemcpyNodeSetParamsFromSymbol(hipGraphExec_t hGraphExec, hipGraphNode_t node, void* dst, const void* symbol, size_t count, size_t offset, hipMemcpyKind kind): stores each argument, cast to its declared type, in cb_rec.args.hipGraphExecMemcpyNodeSetParamsFromSymbol.
+#define INIT_hipGraphExecMemcpyNodeSetParamsFromSymbol_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParamsFromSymbol.hGraphExec = (hipGraphExec_t)hGraphExec; \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParamsFromSymbol.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParamsFromSymbol.dst = (void*)dst; \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParamsFromSymbol.symbol = (const void*)symbol; \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParamsFromSymbol.count = (size_t)count; \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParamsFromSymbol.offset = (size_t)offset; \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParamsFromSymbol.kind = (hipMemcpyKind)kind; \
+};
+// hipGraphExecMemcpyNodeSetParamsToSymbol(hipGraphExec_t hGraphExec, hipGraphNode_t node, const void* symbol, const void* src, size_t count, size_t offset, hipMemcpyKind kind): stores each argument, cast to its declared type, in cb_rec.args.hipGraphExecMemcpyNodeSetParamsToSymbol.
+#define INIT_hipGraphExecMemcpyNodeSetParamsToSymbol_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParamsToSymbol.hGraphExec = (hipGraphExec_t)hGraphExec; \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParamsToSymbol.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParamsToSymbol.symbol = (const void*)symbol; \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParamsToSymbol.src = (const void*)src; \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParamsToSymbol.count = (size_t)count; \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParamsToSymbol.offset = (size_t)offset; \
+  cb_rec.args.hipGraphExecMemcpyNodeSetParamsToSymbol.kind = (hipMemcpyKind)kind; \
+};
+// hipGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node, const hipMemsetParams* pNodeParams): stores each argument, cast to its declared type, in cb_rec.args.hipGraphExecMemsetNodeSetParams.
+#define INIT_hipGraphExecMemsetNodeSetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphExecMemsetNodeSetParams.hGraphExec = (hipGraphExec_t)hGraphExec; \
+  cb_rec.args.hipGraphExecMemsetNodeSetParams.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphExecMemsetNodeSetParams.pNodeParams = (const hipMemsetParams*)pNodeParams; \
+};
+// hipGraphExecNodeSetParams(hipGraphExec_t graphExec, hipGraphNode_t node, hipGraphNodeParams* nodeParams): stores each argument, cast to its declared type, in cb_rec.args.hipGraphExecNodeSetParams.
+#define INIT_hipGraphExecNodeSetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphExecNodeSetParams.graphExec = (hipGraphExec_t)graphExec; \
+  cb_rec.args.hipGraphExecNodeSetParams.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphExecNodeSetParams.nodeParams = (hipGraphNodeParams*)nodeParams; \
+};
+// hipGraphExecUpdate(hipGraphExec_t hGraphExec, hipGraph_t hGraph, hipGraphNode_t* hErrorNode_out, hipGraphExecUpdateResult* updateResult_out): stores each argument, cast to its declared type, in cb_rec.args.hipGraphExecUpdate.
+#define INIT_hipGraphExecUpdate_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphExecUpdate.hGraphExec = (hipGraphExec_t)hGraphExec; \
+  cb_rec.args.hipGraphExecUpdate.hGraph = (hipGraph_t)hGraph; \
+  cb_rec.args.hipGraphExecUpdate.hErrorNode_out = (hipGraphNode_t*)hErrorNode_out; \
+  cb_rec.args.hipGraphExecUpdate.updateResult_out = (hipGraphExecUpdateResult*)updateResult_out; \
+};
+// hipGraphExternalSemaphoresSignalNodeGetParams(hipGraphNode_t hNode, hipExternalSemaphoreSignalNodeParams* params_out): stores each argument, cast to its declared type, in cb_rec.args.hipGraphExternalSemaphoresSignalNodeGetParams.
+#define INIT_hipGraphExternalSemaphoresSignalNodeGetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphExternalSemaphoresSignalNodeGetParams.hNode = (hipGraphNode_t)hNode; \
+  cb_rec.args.hipGraphExternalSemaphoresSignalNodeGetParams.params_out = (hipExternalSemaphoreSignalNodeParams*)params_out; \
+};
+// hipGraphExternalSemaphoresSignalNodeSetParams(hipGraphNode_t hNode, const hipExternalSemaphoreSignalNodeParams* nodeParams): stores each argument, cast to its declared type, in cb_rec.args.hipGraphExternalSemaphoresSignalNodeSetParams.
+#define INIT_hipGraphExternalSemaphoresSignalNodeSetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphExternalSemaphoresSignalNodeSetParams.hNode = (hipGraphNode_t)hNode; \
+  cb_rec.args.hipGraphExternalSemaphoresSignalNodeSetParams.nodeParams = (const hipExternalSemaphoreSignalNodeParams*)nodeParams; \
+};
+// hipGraphExternalSemaphoresWaitNodeGetParams(hipGraphNode_t hNode, hipExternalSemaphoreWaitNodeParams* params_out): stores each argument, cast to its declared type, in cb_rec.args.hipGraphExternalSemaphoresWaitNodeGetParams.
+#define INIT_hipGraphExternalSemaphoresWaitNodeGetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphExternalSemaphoresWaitNodeGetParams.hNode = (hipGraphNode_t)hNode; \
+  cb_rec.args.hipGraphExternalSemaphoresWaitNodeGetParams.params_out = (hipExternalSemaphoreWaitNodeParams*)params_out; \
+};
+// hipGraphExternalSemaphoresWaitNodeSetParams(hipGraphNode_t hNode, const hipExternalSemaphoreWaitNodeParams* nodeParams): stores each argument, cast to its declared type, in cb_rec.args.hipGraphExternalSemaphoresWaitNodeSetParams.
+#define INIT_hipGraphExternalSemaphoresWaitNodeSetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphExternalSemaphoresWaitNodeSetParams.hNode = (hipGraphNode_t)hNode; \
+  cb_rec.args.hipGraphExternalSemaphoresWaitNodeSetParams.nodeParams = (const hipExternalSemaphoreWaitNodeParams*)nodeParams; \
+};
+// hipGraphGetEdges(hipGraph_t graph, hipGraphNode_t* from, hipGraphNode_t* to, size_t* numEdges): stores each argument, cast to its declared type, in cb_rec.args.hipGraphGetEdges.
+#define INIT_hipGraphGetEdges_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphGetEdges.graph = (hipGraph_t)graph; \
+  cb_rec.args.hipGraphGetEdges.from = (hipGraphNode_t*)from; \
+  cb_rec.args.hipGraphGetEdges.to = (hipGraphNode_t*)to; \
+  cb_rec.args.hipGraphGetEdges.numEdges = (size_t*)numEdges; \
+};
+// hipGraphGetNodes(hipGraph_t graph, hipGraphNode_t* nodes, size_t* numNodes): stores each argument, cast to its declared type, in cb_rec.args.hipGraphGetNodes.
+#define INIT_hipGraphGetNodes_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphGetNodes.graph = (hipGraph_t)graph; \
+  cb_rec.args.hipGraphGetNodes.nodes = (hipGraphNode_t*)nodes; \
+  cb_rec.args.hipGraphGetNodes.numNodes = (size_t*)numNodes; \
+};
+// hipGraphGetRootNodes(hipGraph_t graph, hipGraphNode_t* pRootNodes, size_t* pNumRootNodes): stores each argument, cast to its declared type, in cb_rec.args.hipGraphGetRootNodes.
+#define INIT_hipGraphGetRootNodes_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphGetRootNodes.graph = (hipGraph_t)graph; \
+  cb_rec.args.hipGraphGetRootNodes.pRootNodes = (hipGraphNode_t*)pRootNodes; \
+  cb_rec.args.hipGraphGetRootNodes.pNumRootNodes = (size_t*)pNumRootNodes; \
+};
+// hipGraphHostNodeGetParams(hipGraphNode_t node, hipHostNodeParams* pNodeParams): stores each argument, cast to its declared type, in cb_rec.args.hipGraphHostNodeGetParams.
+#define INIT_hipGraphHostNodeGetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphHostNodeGetParams.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphHostNodeGetParams.pNodeParams = (hipHostNodeParams*)pNodeParams; \
+};
+// hipGraphHostNodeSetParams(hipGraphNode_t node, const hipHostNodeParams* pNodeParams): stores each argument, cast to its declared type, in cb_rec.args.hipGraphHostNodeSetParams.
+#define INIT_hipGraphHostNodeSetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphHostNodeSetParams.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphHostNodeSetParams.pNodeParams = (const hipHostNodeParams*)pNodeParams; \
+};
+// hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph, hipGraphNode_t* pErrorNode, char* pLogBuffer, size_t bufferSize): stores each argument, cast to its declared type, in cb_rec.args.hipGraphInstantiate; pLogBuffer is kept as the raw pointer, no string copy is made.
+#define INIT_hipGraphInstantiate_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphInstantiate.pGraphExec = (hipGraphExec_t*)pGraphExec; \
+  cb_rec.args.hipGraphInstantiate.graph = (hipGraph_t)graph; \
+  cb_rec.args.hipGraphInstantiate.pErrorNode = (hipGraphNode_t*)pErrorNode; \
+  cb_rec.args.hipGraphInstantiate.pLogBuffer = (char*)pLogBuffer; \
+  cb_rec.args.hipGraphInstantiate.bufferSize = (size_t)bufferSize; \
+};
+// hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t graph, unsigned long long flags): stores each argument, cast to its declared type, in cb_rec.args.hipGraphInstantiateWithFlags.
+#define INIT_hipGraphInstantiateWithFlags_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphInstantiateWithFlags.pGraphExec = (hipGraphExec_t*)pGraphExec; \
+  cb_rec.args.hipGraphInstantiateWithFlags.graph = (hipGraph_t)graph; \
+  cb_rec.args.hipGraphInstantiateWithFlags.flags = (unsigned long long)flags; \
+};
+// hipGraphInstantiateWithParams(hipGraphExec_t* pGraphExec, hipGraph_t graph, hipGraphInstantiateParams* instantiateParams): stores each argument, cast to its declared type, in cb_rec.args.hipGraphInstantiateWithParams.
+#define INIT_hipGraphInstantiateWithParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphInstantiateWithParams.pGraphExec = (hipGraphExec_t*)pGraphExec; \
+  cb_rec.args.hipGraphInstantiateWithParams.graph = (hipGraph_t)graph; \
+  cb_rec.args.hipGraphInstantiateWithParams.instantiateParams = (hipGraphInstantiateParams*)instantiateParams; \
+};
+// hipGraphKernelNodeCopyAttributes(hipGraphNode_t hSrc, hipGraphNode_t hDst): stores each argument, cast to its declared type, in cb_rec.args.hipGraphKernelNodeCopyAttributes.
+#define INIT_hipGraphKernelNodeCopyAttributes_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphKernelNodeCopyAttributes.hSrc = (hipGraphNode_t)hSrc; \
+  cb_rec.args.hipGraphKernelNodeCopyAttributes.hDst = (hipGraphNode_t)hDst; \
+};
+// hipGraphKernelNodeGetAttribute(hipGraphNode_t hNode, hipLaunchAttributeID attr, hipLaunchAttributeValue* value): expands to an empty block, so none of hNode, attr, value is stored in cb_rec. NOTE(review): confirm the omission is intentional.
+#define INIT_hipGraphKernelNodeGetAttribute_CB_ARGS_DATA(cb_rec) { \
+};
+// hipGraphKernelNodeGetParams(hipGraphNode_t node, hipKernelNodeParams* pNodeParams): stores each argument, cast to its declared type, in cb_rec.args.hipGraphKernelNodeGetParams.
+#define INIT_hipGraphKernelNodeGetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphKernelNodeGetParams.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphKernelNodeGetParams.pNodeParams = (hipKernelNodeParams*)pNodeParams; \
+};
+// hipGraphKernelNodeSetAttribute(hipGraphNode_t hNode, hipLaunchAttributeID attr, const hipLaunchAttributeValue* value): expands to an empty block, so none of hNode, attr, value is stored in cb_rec. NOTE(review): confirm the omission is intentional.
+#define INIT_hipGraphKernelNodeSetAttribute_CB_ARGS_DATA(cb_rec) { \
+};
+// hipGraphKernelNodeSetParams(hipGraphNode_t node, const hipKernelNodeParams* pNodeParams): stores each argument, cast to its declared type, in cb_rec.args.hipGraphKernelNodeSetParams.
+#define INIT_hipGraphKernelNodeSetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphKernelNodeSetParams.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphKernelNodeSetParams.pNodeParams = (const hipKernelNodeParams*)pNodeParams; \
+};
+// hipGraphLaunch(hipGraphExec_t graphExec, hipStream_t stream): stores each argument, cast to its declared type, in cb_rec.args.hipGraphLaunch.
+#define INIT_hipGraphLaunch_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphLaunch.graphExec = (hipGraphExec_t)graphExec; \
+  cb_rec.args.hipGraphLaunch.stream = (hipStream_t)stream; \
+};
+// hipGraphMemAllocNodeGetParams(hipGraphNode_t node, hipMemAllocNodeParams* pNodeParams): stores each argument, cast to its declared type, in cb_rec.args.hipGraphMemAllocNodeGetParams.
+#define INIT_hipGraphMemAllocNodeGetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphMemAllocNodeGetParams.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphMemAllocNodeGetParams.pNodeParams = (hipMemAllocNodeParams*)pNodeParams; \
+};
+// hipGraphMemFreeNodeGetParams(hipGraphNode_t node, void* dev_ptr): stores each argument, cast to its declared type, in cb_rec.args.hipGraphMemFreeNodeGetParams.
+#define INIT_hipGraphMemFreeNodeGetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphMemFreeNodeGetParams.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphMemFreeNodeGetParams.dev_ptr = (void*)dev_ptr; \
+};
+// hipGraphMemcpyNodeGetParams(hipGraphNode_t node, hipMemcpy3DParms* pNodeParams): stores each argument, cast to its declared type, in cb_rec.args.hipGraphMemcpyNodeGetParams.
+#define INIT_hipGraphMemcpyNodeGetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphMemcpyNodeGetParams.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphMemcpyNodeGetParams.pNodeParams = (hipMemcpy3DParms*)pNodeParams; \
+};
+// hipGraphMemcpyNodeSetParams(hipGraphNode_t node, const hipMemcpy3DParms* pNodeParams): stores each argument, cast to its declared type, in cb_rec.args.hipGraphMemcpyNodeSetParams.
+#define INIT_hipGraphMemcpyNodeSetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphMemcpyNodeSetParams.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphMemcpyNodeSetParams.pNodeParams = (const hipMemcpy3DParms*)pNodeParams; \
+};
+// hipGraphMemcpyNodeSetParams1D(hipGraphNode_t node, void* dst, const void* src, size_t count, hipMemcpyKind kind): stores each argument, cast to its declared type, in cb_rec.args.hipGraphMemcpyNodeSetParams1D.
+#define INIT_hipGraphMemcpyNodeSetParams1D_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphMemcpyNodeSetParams1D.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphMemcpyNodeSetParams1D.dst = (void*)dst; \
+  cb_rec.args.hipGraphMemcpyNodeSetParams1D.src = (const void*)src; \
+  cb_rec.args.hipGraphMemcpyNodeSetParams1D.count = (size_t)count; \
+  cb_rec.args.hipGraphMemcpyNodeSetParams1D.kind = (hipMemcpyKind)kind; \
+};
+// hipGraphMemcpyNodeSetParamsFromSymbol(hipGraphNode_t node, void* dst, const void* symbol, size_t count, size_t offset, hipMemcpyKind kind): stores each argument, cast to its declared type, in cb_rec.args.hipGraphMemcpyNodeSetParamsFromSymbol.
+#define INIT_hipGraphMemcpyNodeSetParamsFromSymbol_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphMemcpyNodeSetParamsFromSymbol.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphMemcpyNodeSetParamsFromSymbol.dst = (void*)dst; \
+  cb_rec.args.hipGraphMemcpyNodeSetParamsFromSymbol.symbol = (const void*)symbol; \
+  cb_rec.args.hipGraphMemcpyNodeSetParamsFromSymbol.count = (size_t)count; \
+  cb_rec.args.hipGraphMemcpyNodeSetParamsFromSymbol.offset = (size_t)offset; \
+  cb_rec.args.hipGraphMemcpyNodeSetParamsFromSymbol.kind = (hipMemcpyKind)kind; \
+};
+// hipGraphMemcpyNodeSetParamsToSymbol(hipGraphNode_t node, const void* symbol, const void* src, size_t count, size_t offset, hipMemcpyKind kind): stores each argument, cast to its declared type, in cb_rec.args.hipGraphMemcpyNodeSetParamsToSymbol.
+#define INIT_hipGraphMemcpyNodeSetParamsToSymbol_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphMemcpyNodeSetParamsToSymbol.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphMemcpyNodeSetParamsToSymbol.symbol = (const void*)symbol; \
+  cb_rec.args.hipGraphMemcpyNodeSetParamsToSymbol.src = (const void*)src; \
+  cb_rec.args.hipGraphMemcpyNodeSetParamsToSymbol.count = (size_t)count; \
+  cb_rec.args.hipGraphMemcpyNodeSetParamsToSymbol.offset = (size_t)offset; \
+  cb_rec.args.hipGraphMemcpyNodeSetParamsToSymbol.kind = (hipMemcpyKind)kind; \
+};
+// hipGraphMemsetNodeGetParams(hipGraphNode_t node, hipMemsetParams* pNodeParams): stores each argument, cast to its declared type, in cb_rec.args.hipGraphMemsetNodeGetParams.
+#define INIT_hipGraphMemsetNodeGetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphMemsetNodeGetParams.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphMemsetNodeGetParams.pNodeParams = (hipMemsetParams*)pNodeParams; \
+};
+// hipGraphMemsetNodeSetParams(hipGraphNode_t node, const hipMemsetParams* pNodeParams): stores each argument, cast to its declared type, in cb_rec.args.hipGraphMemsetNodeSetParams.
+#define INIT_hipGraphMemsetNodeSetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphMemsetNodeSetParams.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphMemsetNodeSetParams.pNodeParams = (const hipMemsetParams*)pNodeParams; \
+};
+// hipGraphNodeFindInClone(hipGraphNode_t* pNode, hipGraphNode_t originalNode, hipGraph_t clonedGraph): stores each argument, cast to its declared type, in cb_rec.args.hipGraphNodeFindInClone.
+#define INIT_hipGraphNodeFindInClone_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphNodeFindInClone.pNode = (hipGraphNode_t*)pNode; \
+  cb_rec.args.hipGraphNodeFindInClone.originalNode = (hipGraphNode_t)originalNode; \
+  cb_rec.args.hipGraphNodeFindInClone.clonedGraph = (hipGraph_t)clonedGraph; \
+};
+// hipGraphNodeGetDependencies(hipGraphNode_t node, hipGraphNode_t* pDependencies, size_t* pNumDependencies): stores each argument, cast to its declared type, in cb_rec.args.hipGraphNodeGetDependencies.
+#define INIT_hipGraphNodeGetDependencies_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphNodeGetDependencies.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphNodeGetDependencies.pDependencies = (hipGraphNode_t*)pDependencies; \
+  cb_rec.args.hipGraphNodeGetDependencies.pNumDependencies = (size_t*)pNumDependencies; \
+};
+// hipGraphNodeGetDependentNodes(hipGraphNode_t node, hipGraphNode_t* pDependentNodes, size_t* pNumDependentNodes): stores each argument, cast to its declared type, in cb_rec.args.hipGraphNodeGetDependentNodes.
+#define INIT_hipGraphNodeGetDependentNodes_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphNodeGetDependentNodes.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphNodeGetDependentNodes.pDependentNodes = (hipGraphNode_t*)pDependentNodes; \
+  cb_rec.args.hipGraphNodeGetDependentNodes.pNumDependentNodes = (size_t*)pNumDependentNodes; \
+};
+// hipGraphNodeGetEnabled(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, unsigned int* isEnabled): stores each argument, cast to its declared type, in cb_rec.args.hipGraphNodeGetEnabled.
+#define INIT_hipGraphNodeGetEnabled_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphNodeGetEnabled.hGraphExec = (hipGraphExec_t)hGraphExec; \
+  cb_rec.args.hipGraphNodeGetEnabled.hNode = (hipGraphNode_t)hNode; \
+  cb_rec.args.hipGraphNodeGetEnabled.isEnabled = (unsigned int*)isEnabled; \
+};
+// hipGraphNodeGetType(hipGraphNode_t node, hipGraphNodeType* pType): stores each argument, cast to its declared type, in cb_rec.args.hipGraphNodeGetType.
+#define INIT_hipGraphNodeGetType_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphNodeGetType.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphNodeGetType.pType = (hipGraphNodeType*)pType; \
+};
+// hipGraphNodeSetEnabled(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, unsigned int isEnabled): stores each argument, cast to its declared type, in cb_rec.args.hipGraphNodeSetEnabled.
+#define INIT_hipGraphNodeSetEnabled_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphNodeSetEnabled.hGraphExec = (hipGraphExec_t)hGraphExec; \
+  cb_rec.args.hipGraphNodeSetEnabled.hNode = (hipGraphNode_t)hNode; \
+  cb_rec.args.hipGraphNodeSetEnabled.isEnabled = (unsigned int)isEnabled; \
+};
+// hipGraphNodeSetParams(hipGraphNode_t node, hipGraphNodeParams* nodeParams): stores each argument, cast to its declared type, in cb_rec.args.hipGraphNodeSetParams.
+#define INIT_hipGraphNodeSetParams_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphNodeSetParams.node = (hipGraphNode_t)node; \
+  cb_rec.args.hipGraphNodeSetParams.nodeParams = (hipGraphNodeParams*)nodeParams; \
+};
+// hipGraphReleaseUserObject(hipGraph_t graph, hipUserObject_t object, unsigned int count): stores each argument, cast to its declared type, in cb_rec.args.hipGraphReleaseUserObject.
+#define INIT_hipGraphReleaseUserObject_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphReleaseUserObject.graph = (hipGraph_t)graph; \
+  cb_rec.args.hipGraphReleaseUserObject.object = (hipUserObject_t)object; \
+  cb_rec.args.hipGraphReleaseUserObject.count = (unsigned int)count; \
+};
+// hipGraphRemoveDependencies(hipGraph_t graph, const hipGraphNode_t* from, const hipGraphNode_t* to, size_t numDependencies): stores each argument, cast to its declared type, in cb_rec.args.hipGraphRemoveDependencies.
+#define INIT_hipGraphRemoveDependencies_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphRemoveDependencies.graph = (hipGraph_t)graph; \
+  cb_rec.args.hipGraphRemoveDependencies.from = (const hipGraphNode_t*)from; \
+  cb_rec.args.hipGraphRemoveDependencies.to = (const hipGraphNode_t*)to; \
+  cb_rec.args.hipGraphRemoveDependencies.numDependencies = (size_t)numDependencies; \
+};
+// hipGraphRetainUserObject(hipGraph_t graph, hipUserObject_t object, unsigned int count, unsigned int flags): stores each argument, cast to its declared type, in cb_rec.args.hipGraphRetainUserObject.
+#define INIT_hipGraphRetainUserObject_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphRetainUserObject.graph = (hipGraph_t)graph; \
+  cb_rec.args.hipGraphRetainUserObject.object = (hipUserObject_t)object; \
+  cb_rec.args.hipGraphRetainUserObject.count = (unsigned int)count; \
+  cb_rec.args.hipGraphRetainUserObject.flags = (unsigned int)flags; \
+};
+// hipGraphUpload(hipGraphExec_t graphExec, hipStream_t stream): stores each argument, cast to its declared type, in cb_rec.args.hipGraphUpload.
+#define INIT_hipGraphUpload_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphUpload.graphExec = (hipGraphExec_t)graphExec; \
+  cb_rec.args.hipGraphUpload.stream = (hipStream_t)stream; \
+};
+// hipGraphicsGLRegisterBuffer(hipGraphicsResource** resource, GLuint buffer, unsigned int flags): stores each argument, cast to its declared type, in cb_rec.args.hipGraphicsGLRegisterBuffer.
+#define INIT_hipGraphicsGLRegisterBuffer_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphicsGLRegisterBuffer.resource = (hipGraphicsResource**)resource; \
+  cb_rec.args.hipGraphicsGLRegisterBuffer.buffer = (GLuint)buffer; \
+  cb_rec.args.hipGraphicsGLRegisterBuffer.flags = (unsigned int)flags; \
+};
+// hipGraphicsGLRegisterImage(hipGraphicsResource** resource, GLuint image, GLenum target, unsigned int flags): stores each argument, cast to its declared type, in cb_rec.args.hipGraphicsGLRegisterImage.
+#define INIT_hipGraphicsGLRegisterImage_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphicsGLRegisterImage.resource = (hipGraphicsResource**)resource; \
+  cb_rec.args.hipGraphicsGLRegisterImage.image = (GLuint)image; \
+  cb_rec.args.hipGraphicsGLRegisterImage.target = (GLenum)target; \
+  cb_rec.args.hipGraphicsGLRegisterImage.flags = (unsigned int)flags; \
+};
+// hipGraphicsMapResources(int count, hipGraphicsResource_t* resources, hipStream_t stream): stores each argument, cast to its declared type, in cb_rec.args.hipGraphicsMapResources.
+#define INIT_hipGraphicsMapResources_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphicsMapResources.count = (int)count; \
+  cb_rec.args.hipGraphicsMapResources.resources = (hipGraphicsResource_t*)resources; \
+  cb_rec.args.hipGraphicsMapResources.stream = (hipStream_t)stream; \
+};
+// hipGraphicsResourceGetMappedPointer(void** devPtr, size_t* size, hipGraphicsResource_t resource): stores each argument, cast to its declared type, in cb_rec.args.hipGraphicsResourceGetMappedPointer.
+#define INIT_hipGraphicsResourceGetMappedPointer_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphicsResourceGetMappedPointer.devPtr = (void**)devPtr; \
+  cb_rec.args.hipGraphicsResourceGetMappedPointer.size = (size_t*)size; \
+  cb_rec.args.hipGraphicsResourceGetMappedPointer.resource = (hipGraphicsResource_t)resource; \
+};
+// hipGraphicsSubResourceGetMappedArray(hipArray_t* array, hipGraphicsResource_t resource, unsigned int arrayIndex, unsigned int mipLevel): stores each argument, cast to its declared type, in cb_rec.args.hipGraphicsSubResourceGetMappedArray.
+#define INIT_hipGraphicsSubResourceGetMappedArray_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphicsSubResourceGetMappedArray.array = (hipArray_t*)array; \
+  cb_rec.args.hipGraphicsSubResourceGetMappedArray.resource = (hipGraphicsResource_t)resource; \
+  cb_rec.args.hipGraphicsSubResourceGetMappedArray.arrayIndex = (unsigned int)arrayIndex; \
+  cb_rec.args.hipGraphicsSubResourceGetMappedArray.mipLevel = (unsigned int)mipLevel; \
+};
+// hipGraphicsUnmapResources(int count, hipGraphicsResource_t* resources, hipStream_t stream): stores each argument, cast to its declared type, in cb_rec.args.hipGraphicsUnmapResources.
+#define INIT_hipGraphicsUnmapResources_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphicsUnmapResources.count = (int)count; \
+  cb_rec.args.hipGraphicsUnmapResources.resources = (hipGraphicsResource_t*)resources; \
+  cb_rec.args.hipGraphicsUnmapResources.stream = (hipStream_t)stream; \
+};
+// hipGraphicsUnregisterResource(hipGraphicsResource_t resource): stores the argument, cast to its declared type, in cb_rec.args.hipGraphicsUnregisterResource.
+#define INIT_hipGraphicsUnregisterResource_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipGraphicsUnregisterResource.resource = (hipGraphicsResource_t)resource; \
+};
+// hipHccModuleLaunchKernel(hipFunction_t f, unsigned int globalWorkSizeX, unsigned int globalWorkSizeY, unsigned int globalWorkSizeZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, size_t sharedMemBytes, hipStream_t hStream, void** kernelParams, void** extra, hipEvent_t startEvent, hipEvent_t stopEvent): stores each argument, cast to its declared type, in cb_rec.args.hipHccModuleLaunchKernel.
+#define INIT_hipHccModuleLaunchKernel_CB_ARGS_DATA(cb_rec) { \
+  cb_rec.args.hipHccModuleLaunchKernel.f = (hipFunction_t)f; \
+  cb_rec.args.hipHccModuleLaunchKernel.globalWorkSizeX = (unsigned int)globalWorkSizeX; \
+  cb_rec.args.hipHccModuleLaunchKernel.globalWorkSizeY = (unsigned int)globalWorkSizeY; \
+  cb_rec.args.hipHccModuleLaunchKernel.globalWorkSizeZ = (unsigned int)globalWorkSizeZ; \
+  cb_rec.args.hipHccModuleLaunchKernel.blockDimX = (unsigned int)blockDimX; \
+  cb_rec.args.hipHccModuleLaunchKernel.blockDimY = (unsigned int)blockDimY; \
+  cb_rec.args.hipHccModuleLaunchKernel.blockDimZ = (unsigned int)blockDimZ; \
+  cb_rec.args.hipHccModuleLaunchKernel.sharedMemBytes = (size_t)sharedMemBytes; \
+  cb_rec.args.hipHccModuleLaunchKernel.hStream = (hipStream_t)hStream; \
+  cb_rec.args.hipHccModuleLaunchKernel.kernelParams = (void**)kernelParams; \
+  cb_rec.args.hipHccModuleLaunchKernel.extra = (void**)extra; \
+  cb_rec.args.hipHccModuleLaunchKernel.startEvent = (hipEvent_t)startEvent; \
+  cb_rec.args.hipHccModuleLaunchKernel.stopEvent = (hipEvent_t)stopEvent; \
+};
+// hipHostAlloc[('void**', 'ptr'), ('size_t', 'size'), ('unsigned int', 'flags')] -- fills cb_data.args.hipHostAlloc from identifiers visible where the macro is expanded; note the field 'size' is read from 'sizeBytes', not from an identifier named 'size'
+#define INIT_hipHostAlloc_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipHostAlloc.ptr = (void**)ptr; \
+  cb_data.args.hipHostAlloc.size = (size_t)sizeBytes; \
+  cb_data.args.hipHostAlloc.flags = (unsigned int)flags; \
+};
+// hipHostFree[('void*', 'ptr')] -- copies the identifier 'ptr' visible where the macro is expanded into cb_data.args.hipHostFree.ptr
+#define INIT_hipHostFree_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipHostFree.ptr = (void*)ptr; \
+};
+// hipHostGetDevicePointer[('void**', 'devPtr'), ('void*', 'hstPtr'), ('unsigned int', 'flags')] -- fills cb_data.args.hipHostGetDevicePointer from identifiers visible where the macro is expanded; note 'devPtr' is read from 'devicePointer' and 'hstPtr' from 'hostPointer'
+#define INIT_hipHostGetDevicePointer_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipHostGetDevicePointer.devPtr = (void**)devicePointer; \
+  cb_data.args.hipHostGetDevicePointer.hstPtr = (void*)hostPointer; \
+  cb_data.args.hipHostGetDevicePointer.flags = (unsigned int)flags; \
+};
+// hipHostGetFlags[('unsigned int*', 'flagsPtr'), ('void*', 'hostPtr')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipHostGetFlags, cast to the type listed here
+#define INIT_hipHostGetFlags_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipHostGetFlags.flagsPtr = (unsigned int*)flagsPtr; \
+  cb_data.args.hipHostGetFlags.hostPtr = (void*)hostPtr; \
+};
+// hipHostMalloc[('void**', 'ptr'), ('size_t', 'size'), ('unsigned int', 'flags')] -- fills cb_data.args.hipHostMalloc from identifiers visible where the macro is expanded; note the field 'size' is read from 'sizeBytes', not from an identifier named 'size'
+#define INIT_hipHostMalloc_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipHostMalloc.ptr = (void**)ptr; \
+  cb_data.args.hipHostMalloc.size = (size_t)sizeBytes; \
+  cb_data.args.hipHostMalloc.flags = (unsigned int)flags; \
+};
+// hipHostRegister[('void*', 'hostPtr'), ('size_t', 'sizeBytes'), ('unsigned int', 'flags')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipHostRegister, cast to the type listed here
+#define INIT_hipHostRegister_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipHostRegister.hostPtr = (void*)hostPtr; \
+  cb_data.args.hipHostRegister.sizeBytes = (size_t)sizeBytes; \
+  cb_data.args.hipHostRegister.flags = (unsigned int)flags; \
+};
+// hipHostUnregister[('void*', 'hostPtr')] -- copies the identifier 'hostPtr' visible where the macro is expanded into cb_data.args.hipHostUnregister.hostPtr
+#define INIT_hipHostUnregister_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipHostUnregister.hostPtr = (void*)hostPtr; \
+};
+// hipImportExternalMemory[('hipExternalMemory_t*', 'extMem_out'), ('const hipExternalMemoryHandleDesc*', 'memHandleDesc')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipImportExternalMemory; only the pointers are stored, the pointed-to descriptor is not copied
+#define INIT_hipImportExternalMemory_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipImportExternalMemory.extMem_out = (hipExternalMemory_t*)extMem_out; \
+  cb_data.args.hipImportExternalMemory.memHandleDesc = (const hipExternalMemoryHandleDesc*)memHandleDesc; \
+};
+// hipImportExternalSemaphore[('hipExternalSemaphore_t*', 'extSem_out'), ('const hipExternalSemaphoreHandleDesc*', 'semHandleDesc')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipImportExternalSemaphore; only the pointers are stored, the pointed-to descriptor is not copied
+#define INIT_hipImportExternalSemaphore_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipImportExternalSemaphore.extSem_out = (hipExternalSemaphore_t*)extSem_out; \
+  cb_data.args.hipImportExternalSemaphore.semHandleDesc = (const hipExternalSemaphoreHandleDesc*)semHandleDesc; \
+};
+// hipInit[('unsigned int', 'flags')] -- copies the identifier 'flags' visible where the macro is expanded into cb_data.args.hipInit.flags
+#define INIT_hipInit_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipInit.flags = (unsigned int)flags; \
+};
+// hipIpcCloseMemHandle[('void*', 'devPtr')] -- fills cb_data.args.hipIpcCloseMemHandle.devPtr from the identifier 'dev_ptr' (not 'devPtr') visible where the macro is expanded
+#define INIT_hipIpcCloseMemHandle_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipIpcCloseMemHandle.devPtr = (void*)dev_ptr; \
+};
+// hipIpcGetEventHandle[('hipIpcEventHandle_t*', 'handle'), ('hipEvent_t', 'event')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipIpcGetEventHandle, cast to the type listed here
+#define INIT_hipIpcGetEventHandle_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipIpcGetEventHandle.handle = (hipIpcEventHandle_t*)handle; \
+  cb_data.args.hipIpcGetEventHandle.event = (hipEvent_t)event; \
+};
+// hipIpcGetMemHandle[('hipIpcMemHandle_t*', 'handle'), ('void*', 'devPtr')] -- fills cb_data.args.hipIpcGetMemHandle from identifiers visible where the macro is expanded; note the field 'devPtr' is read from 'dev_ptr'
+#define INIT_hipIpcGetMemHandle_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipIpcGetMemHandle.handle = (hipIpcMemHandle_t*)handle; \
+  cb_data.args.hipIpcGetMemHandle.devPtr = (void*)dev_ptr; \
+};
+// hipIpcOpenEventHandle[('hipEvent_t*', 'event'), ('hipIpcEventHandle_t', 'handle')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipIpcOpenEventHandle; 'handle' is stored by value
+#define INIT_hipIpcOpenEventHandle_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipIpcOpenEventHandle.event = (hipEvent_t*)event; \
+  cb_data.args.hipIpcOpenEventHandle.handle = (hipIpcEventHandle_t)handle; \
+};
+// hipIpcOpenMemHandle[('void**', 'devPtr'), ('hipIpcMemHandle_t', 'handle'), ('unsigned int', 'flags')] -- fills cb_data.args.hipIpcOpenMemHandle from identifiers visible where the macro is expanded; note the field 'devPtr' is read from 'dev_ptr', and 'handle' is stored by value
+#define INIT_hipIpcOpenMemHandle_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipIpcOpenMemHandle.devPtr = (void**)dev_ptr; \
+  cb_data.args.hipIpcOpenMemHandle.handle = (hipIpcMemHandle_t)handle; \
+  cb_data.args.hipIpcOpenMemHandle.flags = (unsigned int)flags; \
+};
+// hipLaunchByPtr[('const void*', 'hostFunction')] -- copies the identifier 'hostFunction' visible where the macro is expanded into cb_data.args.hipLaunchByPtr.hostFunction
+#define INIT_hipLaunchByPtr_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipLaunchByPtr.hostFunction = (const void*)hostFunction; \
+};
+// hipLaunchCooperativeKernel[('const void*', 'f'), ('dim3', 'gridDim'), ('dim3', 'blockDimX'), ('void**', 'kernelParams'), ('unsigned int', 'sharedMemBytes'), ('hipStream_t', 'stream')] -- fills cb_data.args.hipLaunchCooperativeKernel from identifiers visible where the macro is expanded; note the dim3 field 'blockDimX' is read from 'blockDim' and the field 'stream' is read from 'hStream'
+#define INIT_hipLaunchCooperativeKernel_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipLaunchCooperativeKernel.f = (const void*)f; \
+  cb_data.args.hipLaunchCooperativeKernel.gridDim = (dim3)gridDim; \
+  cb_data.args.hipLaunchCooperativeKernel.blockDimX = (dim3)blockDim; \
+  cb_data.args.hipLaunchCooperativeKernel.kernelParams = (void**)kernelParams; \
+  cb_data.args.hipLaunchCooperativeKernel.sharedMemBytes = (unsigned int)sharedMemBytes; \
+  cb_data.args.hipLaunchCooperativeKernel.stream = (hipStream_t)hStream; \
+};
+// hipLaunchCooperativeKernelMultiDevice[('hipLaunchParams*', 'launchParamsList'), ('int', 'numDevices'), ('unsigned int', 'flags')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipLaunchCooperativeKernelMultiDevice; only the list pointer is stored, its elements are not copied
+#define INIT_hipLaunchCooperativeKernelMultiDevice_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipLaunchCooperativeKernelMultiDevice.launchParamsList = (hipLaunchParams*)launchParamsList; \
+  cb_data.args.hipLaunchCooperativeKernelMultiDevice.numDevices = (int)numDevices; \
+  cb_data.args.hipLaunchCooperativeKernelMultiDevice.flags = (unsigned int)flags; \
+};
+// hipLaunchHostFunc[('hipStream_t', 'stream'), ('hipHostFn_t', 'fn'), ('void*', 'userData')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipLaunchHostFunc, cast to the type listed here
+#define INIT_hipLaunchHostFunc_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipLaunchHostFunc.stream = (hipStream_t)stream; \
+  cb_data.args.hipLaunchHostFunc.fn = (hipHostFn_t)fn; \
+  cb_data.args.hipLaunchHostFunc.userData = (void*)userData; \
+};
+// hipLaunchKernel[('const void*', 'function_address'), ('dim3', 'numBlocks'), ('dim3', 'dimBlocks'), ('void**', 'args'), ('size_t', 'sharedMemBytes'), ('hipStream_t', 'stream')] -- fills cb_data.args.hipLaunchKernel from identifiers visible where the macro is expanded; note 'function_address' is read from 'hostFunction', 'numBlocks' from 'gridDim' and 'dimBlocks' from 'blockDim'
+#define INIT_hipLaunchKernel_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipLaunchKernel.function_address = (const void*)hostFunction; \
+  cb_data.args.hipLaunchKernel.numBlocks = (dim3)gridDim; \
+  cb_data.args.hipLaunchKernel.dimBlocks = (dim3)blockDim; \
+  cb_data.args.hipLaunchKernel.args = (void**)args; \
+  cb_data.args.hipLaunchKernel.sharedMemBytes = (size_t)sharedMemBytes; \
+  cb_data.args.hipLaunchKernel.stream = (hipStream_t)stream; \
+};
+// hipLaunchKernelExC[('const hipLaunchConfig_t*', 'config'), ('const void*', 'fPtr'), ('void**', 'args')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipLaunchKernelExC; only the config pointer is stored, the structure is not copied
+#define INIT_hipLaunchKernelExC_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipLaunchKernelExC.config = (const hipLaunchConfig_t*)config; \
+  cb_data.args.hipLaunchKernelExC.fPtr = (const void*)fPtr; \
+  cb_data.args.hipLaunchKernelExC.args = (void**)args; \
+};
+// hipLibraryGetKernel[('hipKernel_t*', 'pKernel'), ('hipLibrary_t', 'library'), ('const char*', 'name')] -- fills cb_data.args.hipLibraryGetKernel from identifiers visible where the macro is expanded; 'pKernel' is read from 'kernel', and 'name' receives a strdup() copy of 'kname' (NULL when 'kname' is NULL). This macro does not free that copy -- NOTE(review): confirm where it is released
+#define INIT_hipLibraryGetKernel_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipLibraryGetKernel.pKernel = (hipKernel_t*)kernel; \
+  cb_data.args.hipLibraryGetKernel.library = (hipLibrary_t)library; \
+  cb_data.args.hipLibraryGetKernel.name = (kname) ? strdup(kname) : NULL; \
+};
+// hipLibraryGetKernelCount[('unsigned int*', 'count'), ('hipLibrary_t', 'library')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipLibraryGetKernelCount, cast to the type listed here
+#define INIT_hipLibraryGetKernelCount_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipLibraryGetKernelCount.count = (unsigned int*)count; \
+  cb_data.args.hipLibraryGetKernelCount.library = (hipLibrary_t)library; \
+};
+// hipLibraryLoadData[('hipLibrary_t*', 'library'), ('const void*', 'code'), ('hipJitOption**', 'jitOptions'), ('void**', 'jitOptionsValues'), ('unsigned int', 'numJitOptions'), ('hipLibraryOption**', 'libraryOptions'), ('void**', 'libraryOptionValues'), ('unsigned int', 'numLibraryOptions')] -- fills cb_data.args.hipLibraryLoadData from identifiers visible where the macro is expanded; note the field 'code' is read from 'image', all other fields from same-named identifiers
+#define INIT_hipLibraryLoadData_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipLibraryLoadData.library = (hipLibrary_t*)library; \
+  cb_data.args.hipLibraryLoadData.code = (const void*)image; \
+  cb_data.args.hipLibraryLoadData.jitOptions = (hipJitOption**)jitOptions; \
+  cb_data.args.hipLibraryLoadData.jitOptionsValues = (void**)jitOptionsValues; \
+  cb_data.args.hipLibraryLoadData.numJitOptions = (unsigned int)numJitOptions; \
+  cb_data.args.hipLibraryLoadData.libraryOptions = (hipLibraryOption**)libraryOptions; \
+  cb_data.args.hipLibraryLoadData.libraryOptionValues = (void**)libraryOptionValues; \
+  cb_data.args.hipLibraryLoadData.numLibraryOptions = (unsigned int)numLibraryOptions; \
+};
+// hipLibraryLoadFromFile[('hipLibrary_t*', 'library'), ('const char*', 'fileName'), ('hipJitOption**', 'jitOptions'), ('void**', 'jitOptionsValues'), ('unsigned int', 'numJitOptions'), ('hipLibraryOption**', 'libraryOptions'), ('void**', 'libraryOptionValues'), ('unsigned int', 'numLibraryOptions')] -- fills cb_data.args.hipLibraryLoadFromFile from identifiers visible where the macro is expanded; 'fileName' receives a strdup() copy of 'fname' (NULL when 'fname' is NULL). This macro does not free that copy -- NOTE(review): confirm where it is released
+#define INIT_hipLibraryLoadFromFile_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipLibraryLoadFromFile.library = (hipLibrary_t*)library; \
+  cb_data.args.hipLibraryLoadFromFile.fileName = (fname) ? strdup(fname) : NULL; \
+  cb_data.args.hipLibraryLoadFromFile.jitOptions = (hipJitOption**)jitOptions; \
+  cb_data.args.hipLibraryLoadFromFile.jitOptionsValues = (void**)jitOptionsValues; \
+  cb_data.args.hipLibraryLoadFromFile.numJitOptions = (unsigned int)numJitOptions; \
+  cb_data.args.hipLibraryLoadFromFile.libraryOptions = (hipLibraryOption**)libraryOptions; \
+  cb_data.args.hipLibraryLoadFromFile.libraryOptionValues = (void**)libraryOptionValues; \
+  cb_data.args.hipLibraryLoadFromFile.numLibraryOptions = (unsigned int)numLibraryOptions; \
+};
+// hipLibraryUnload[('hipLibrary_t', 'library')] -- copies the identifier 'library' visible where the macro is expanded into cb_data.args.hipLibraryUnload.library
+#define INIT_hipLibraryUnload_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipLibraryUnload.library = (hipLibrary_t)library; \
+};
+// hipLinkAddData[('hipLinkState_t', 'state'), ('hipJitInputType', 'type'), ('void*', 'data'), ('size_t', 'size'), ('const char*', 'name'), ('unsigned int', 'numOptions'), ('hipJitOption*', 'options'), ('void**', 'optionValues')] -- fills cb_data.args.hipLinkAddData from identifiers visible where the macro is expanded, none of which share the field names: state<-hip_link_state, type<-input_type, data<-image, size<-image_size, numOptions<-num_options, options<-options_ptr, optionValues<-option_values; 'name' receives a strdup() copy of 'name' (NULL when 'name' is NULL), which this macro does not free -- NOTE(review): confirm where it is released
+#define INIT_hipLinkAddData_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipLinkAddData.state = (hipLinkState_t)hip_link_state; \
+  cb_data.args.hipLinkAddData.type = (hipJitInputType)input_type; \
+  cb_data.args.hipLinkAddData.data = (void*)image; \
+  cb_data.args.hipLinkAddData.size = (size_t)image_size; \
+  cb_data.args.hipLinkAddData.name = (name) ? strdup(name) : NULL; \
+  cb_data.args.hipLinkAddData.numOptions = (unsigned int)num_options; \
+  cb_data.args.hipLinkAddData.options = (hipJitOption*)options_ptr; \
+  cb_data.args.hipLinkAddData.optionValues = (void**)option_values; \
+};
+// hipLinkAddFile[('hipLinkState_t', 'state'), ('hipJitInputType', 'type'), ('const char*', 'path'), ('unsigned int', 'numOptions'), ('hipJitOption*', 'options'), ('void**', 'optionValues')] -- fills cb_data.args.hipLinkAddFile from identifiers visible where the macro is expanded: state<-hip_link_state, type<-input_type, numOptions<-num_options, options<-options_ptr, optionValues<-option_values; 'path' receives a strdup() copy of 'file_path' (NULL when 'file_path' is NULL), which this macro does not free -- NOTE(review): confirm where it is released
+#define INIT_hipLinkAddFile_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipLinkAddFile.state = (hipLinkState_t)hip_link_state; \
+  cb_data.args.hipLinkAddFile.type = (hipJitInputType)input_type; \
+  cb_data.args.hipLinkAddFile.path = (file_path) ? strdup(file_path) : NULL; \
+  cb_data.args.hipLinkAddFile.numOptions = (unsigned int)num_options; \
+  cb_data.args.hipLinkAddFile.options = (hipJitOption*)options_ptr; \
+  cb_data.args.hipLinkAddFile.optionValues = (void**)option_values; \
+};
+// hipLinkComplete[('hipLinkState_t', 'state'), ('void**', 'hipBinOut'), ('size_t*', 'sizeOut')] -- fills cb_data.args.hipLinkComplete from identifiers visible where the macro is expanded: state<-hip_link_state, hipBinOut<-bin_out, sizeOut<-size_out
+#define INIT_hipLinkComplete_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipLinkComplete.state = (hipLinkState_t)hip_link_state; \
+  cb_data.args.hipLinkComplete.hipBinOut = (void**)bin_out; \
+  cb_data.args.hipLinkComplete.sizeOut = (size_t*)size_out; \
+};
+// hipLinkCreate[('unsigned int', 'numOptions'), ('hipJitOption*', 'options'), ('void**', 'optionValues'), ('hipLinkState_t*', 'stateOut')] -- fills cb_data.args.hipLinkCreate from identifiers visible where the macro is expanded: numOptions<-num_options, options<-options_ptr, optionValues<-options_vals_pptr, stateOut<-hip_link_state_ptr
+#define INIT_hipLinkCreate_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipLinkCreate.numOptions = (unsigned int)num_options; \
+  cb_data.args.hipLinkCreate.options = (hipJitOption*)options_ptr; \
+  cb_data.args.hipLinkCreate.optionValues = (void**)options_vals_pptr; \
+  cb_data.args.hipLinkCreate.stateOut = (hipLinkState_t*)hip_link_state_ptr; \
+};
+// hipLinkDestroy[('hipLinkState_t', 'state')] -- fills cb_data.args.hipLinkDestroy.state from the identifier 'hip_link_state' (not 'state') visible where the macro is expanded
+#define INIT_hipLinkDestroy_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipLinkDestroy.state = (hipLinkState_t)hip_link_state; \
+};
+// hipMalloc[('void**', 'ptr'), ('size_t', 'size')] -- fills cb_data.args.hipMalloc from identifiers visible where the macro is expanded; note the field 'size' is read from 'sizeBytes', not from an identifier named 'size'
+#define INIT_hipMalloc_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMalloc.ptr = (void**)ptr; \
+  cb_data.args.hipMalloc.size = (size_t)sizeBytes; \
+};
+// hipMalloc3D[('hipPitchedPtr*', 'pitchedDevPtr'), ('hipExtent', 'extent')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMalloc3D; 'extent' is stored by value
+#define INIT_hipMalloc3D_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMalloc3D.pitchedDevPtr = (hipPitchedPtr*)pitchedDevPtr; \
+  cb_data.args.hipMalloc3D.extent = (hipExtent)extent; \
+};
+// hipMalloc3DArray[('hipArray_t*', 'array'), ('const hipChannelFormatDesc*', 'desc'), ('hipExtent', 'extent'), ('unsigned int', 'flags')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMalloc3DArray; 'extent' is stored by value, 'desc' as a pointer only
+#define INIT_hipMalloc3DArray_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMalloc3DArray.array = (hipArray_t*)array; \
+  cb_data.args.hipMalloc3DArray.desc = (const hipChannelFormatDesc*)desc; \
+  cb_data.args.hipMalloc3DArray.extent = (hipExtent)extent; \
+  cb_data.args.hipMalloc3DArray.flags = (unsigned int)flags; \
+};
+// hipMallocArray[('hipArray_t*', 'array'), ('const hipChannelFormatDesc*', 'desc'), ('size_t', 'width'), ('size_t', 'height'), ('unsigned int', 'flags')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMallocArray, cast to the type listed here
+#define INIT_hipMallocArray_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMallocArray.array = (hipArray_t*)array; \
+  cb_data.args.hipMallocArray.desc = (const hipChannelFormatDesc*)desc; \
+  cb_data.args.hipMallocArray.width = (size_t)width; \
+  cb_data.args.hipMallocArray.height = (size_t)height; \
+  cb_data.args.hipMallocArray.flags = (unsigned int)flags; \
+};
+// hipMallocAsync[('void**', 'dev_ptr'), ('size_t', 'size'), ('hipStream_t', 'stream')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMallocAsync, cast to the type listed here
+#define INIT_hipMallocAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMallocAsync.dev_ptr = (void**)dev_ptr; \
+  cb_data.args.hipMallocAsync.size = (size_t)size; \
+  cb_data.args.hipMallocAsync.stream = (hipStream_t)stream; \
+};
+// hipMallocFromPoolAsync[('void**', 'dev_ptr'), ('size_t', 'size'), ('hipMemPool_t', 'mem_pool'), ('hipStream_t', 'stream')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMallocFromPoolAsync, cast to the type listed here
+#define INIT_hipMallocFromPoolAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMallocFromPoolAsync.dev_ptr = (void**)dev_ptr; \
+  cb_data.args.hipMallocFromPoolAsync.size = (size_t)size; \
+  cb_data.args.hipMallocFromPoolAsync.mem_pool = (hipMemPool_t)mem_pool; \
+  cb_data.args.hipMallocFromPoolAsync.stream = (hipStream_t)stream; \
+};
+// hipMallocHost[('void**', 'ptr'), ('size_t', 'size')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMallocHost, cast to the type listed here
+#define INIT_hipMallocHost_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMallocHost.ptr = (void**)ptr; \
+  cb_data.args.hipMallocHost.size = (size_t)size; \
+};
+// hipMallocManaged[('void**', 'dev_ptr'), ('size_t', 'size'), ('unsigned int', 'flags')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMallocManaged, cast to the type listed here
+#define INIT_hipMallocManaged_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMallocManaged.dev_ptr = (void**)dev_ptr; \
+  cb_data.args.hipMallocManaged.size = (size_t)size; \
+  cb_data.args.hipMallocManaged.flags = (unsigned int)flags; \
+};
+// hipMallocMipmappedArray[('hipMipmappedArray_t*', 'mipmappedArray'), ('const hipChannelFormatDesc*', 'desc'), ('hipExtent', 'extent'), ('unsigned int', 'numLevels'), ('unsigned int', 'flags')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMallocMipmappedArray; 'extent' is stored by value, 'desc' as a pointer only
+#define INIT_hipMallocMipmappedArray_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMallocMipmappedArray.mipmappedArray = (hipMipmappedArray_t*)mipmappedArray; \
+  cb_data.args.hipMallocMipmappedArray.desc = (const hipChannelFormatDesc*)desc; \
+  cb_data.args.hipMallocMipmappedArray.extent = (hipExtent)extent; \
+  cb_data.args.hipMallocMipmappedArray.numLevels = (unsigned int)numLevels; \
+  cb_data.args.hipMallocMipmappedArray.flags = (unsigned int)flags; \
+};
+// hipMallocPitch[('void**', 'ptr'), ('size_t*', 'pitch'), ('size_t', 'width'), ('size_t', 'height')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMallocPitch, cast to the type listed here
+#define INIT_hipMallocPitch_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMallocPitch.ptr = (void**)ptr; \
+  cb_data.args.hipMallocPitch.pitch = (size_t*)pitch; \
+  cb_data.args.hipMallocPitch.width = (size_t)width; \
+  cb_data.args.hipMallocPitch.height = (size_t)height; \
+};
+// hipMemAddressFree[('void*', 'devPtr'), ('size_t', 'size')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemAddressFree, cast to the type listed here
+#define INIT_hipMemAddressFree_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemAddressFree.devPtr = (void*)devPtr; \
+  cb_data.args.hipMemAddressFree.size = (size_t)size; \
+};
+// hipMemAddressReserve[('void**', 'ptr'), ('size_t', 'size'), ('size_t', 'alignment'), ('void*', 'addr'), ('unsigned long long', 'flags')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemAddressReserve, cast to the type listed here
+#define INIT_hipMemAddressReserve_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemAddressReserve.ptr = (void**)ptr; \
+  cb_data.args.hipMemAddressReserve.size = (size_t)size; \
+  cb_data.args.hipMemAddressReserve.alignment = (size_t)alignment; \
+  cb_data.args.hipMemAddressReserve.addr = (void*)addr; \
+  cb_data.args.hipMemAddressReserve.flags = (unsigned long long)flags; \
+};
+// hipMemAdvise[('const void*', 'dev_ptr'), ('size_t', 'count'), ('hipMemoryAdvise', 'advice'), ('int', 'device')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemAdvise, cast to the type listed here
+#define INIT_hipMemAdvise_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemAdvise.dev_ptr = (const void*)dev_ptr; \
+  cb_data.args.hipMemAdvise.count = (size_t)count; \
+  cb_data.args.hipMemAdvise.advice = (hipMemoryAdvise)advice; \
+  cb_data.args.hipMemAdvise.device = (int)device; \
+};
+// hipMemAdvise_v2[('const void*', 'dev_ptr'), ('size_t', 'count'), ('hipMemoryAdvise', 'advice'), ('hipMemLocation', 'location')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemAdvise_v2; 'location' is stored by value
+#define INIT_hipMemAdvise_v2_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemAdvise_v2.dev_ptr = (const void*)dev_ptr; \
+  cb_data.args.hipMemAdvise_v2.count = (size_t)count; \
+  cb_data.args.hipMemAdvise_v2.advice = (hipMemoryAdvise)advice; \
+  cb_data.args.hipMemAdvise_v2.location = (hipMemLocation)location; \
+};
+// hipMemAllocHost[('void**', 'ptr'), ('size_t', 'size')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemAllocHost, cast to the type listed here
+#define INIT_hipMemAllocHost_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemAllocHost.ptr = (void**)ptr; \
+  cb_data.args.hipMemAllocHost.size = (size_t)size; \
+};
+// hipMemAllocPitch[('hipDeviceptr_t*', 'dptr'), ('size_t*', 'pitch'), ('size_t', 'widthInBytes'), ('size_t', 'height'), ('unsigned int', 'elementSizeBytes')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemAllocPitch, cast to the type listed here
+#define INIT_hipMemAllocPitch_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemAllocPitch.dptr = (hipDeviceptr_t*)dptr; \
+  cb_data.args.hipMemAllocPitch.pitch = (size_t*)pitch; \
+  cb_data.args.hipMemAllocPitch.widthInBytes = (size_t)widthInBytes; \
+  cb_data.args.hipMemAllocPitch.height = (size_t)height; \
+  cb_data.args.hipMemAllocPitch.elementSizeBytes = (unsigned int)elementSizeBytes; \
+};
+// hipMemCreate[('hipMemGenericAllocationHandle_t*', 'handle'), ('size_t', 'size'), ('const hipMemAllocationProp*', 'prop'), ('unsigned long long', 'flags')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemCreate; 'prop' is stored as a pointer only
+#define INIT_hipMemCreate_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemCreate.handle = (hipMemGenericAllocationHandle_t*)handle; \
+  cb_data.args.hipMemCreate.size = (size_t)size; \
+  cb_data.args.hipMemCreate.prop = (const hipMemAllocationProp*)prop; \
+  cb_data.args.hipMemCreate.flags = (unsigned long long)flags; \
+};
+// hipMemExportToShareableHandle[('void*', 'shareableHandle'), ('hipMemGenericAllocationHandle_t', 'handle'), ('hipMemAllocationHandleType', 'handleType'), ('unsigned long long', 'flags')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemExportToShareableHandle, cast to the type listed here
+#define INIT_hipMemExportToShareableHandle_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemExportToShareableHandle.shareableHandle = (void*)shareableHandle; \
+  cb_data.args.hipMemExportToShareableHandle.handle = (hipMemGenericAllocationHandle_t)handle; \
+  cb_data.args.hipMemExportToShareableHandle.handleType = (hipMemAllocationHandleType)handleType; \
+  cb_data.args.hipMemExportToShareableHandle.flags = (unsigned long long)flags; \
+};
+// hipMemGetAccess[('unsigned long long*', 'flags'), ('const hipMemLocation*', 'location'), ('void*', 'ptr')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemGetAccess; here 'flags' is a pointer, not a value
+#define INIT_hipMemGetAccess_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemGetAccess.flags = (unsigned long long*)flags; \
+  cb_data.args.hipMemGetAccess.location = (const hipMemLocation*)location; \
+  cb_data.args.hipMemGetAccess.ptr = (void*)ptr; \
+};
+// hipMemGetAddressRange[('hipDeviceptr_t*', 'pbase'), ('size_t*', 'psize'), ('hipDeviceptr_t', 'dptr')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemGetAddressRange, cast to the type listed here
+#define INIT_hipMemGetAddressRange_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemGetAddressRange.pbase = (hipDeviceptr_t*)pbase; \
+  cb_data.args.hipMemGetAddressRange.psize = (size_t*)psize; \
+  cb_data.args.hipMemGetAddressRange.dptr = (hipDeviceptr_t)dptr; \
+};
+// hipMemGetAllocationGranularity[('size_t*', 'granularity'), ('const hipMemAllocationProp*', 'prop'), ('hipMemAllocationGranularity_flags', 'option')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemGetAllocationGranularity, cast to the type listed here
+#define INIT_hipMemGetAllocationGranularity_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemGetAllocationGranularity.granularity = (size_t*)granularity; \
+  cb_data.args.hipMemGetAllocationGranularity.prop = (const hipMemAllocationProp*)prop; \
+  cb_data.args.hipMemGetAllocationGranularity.option = (hipMemAllocationGranularity_flags)option; \
+};
+// hipMemGetAllocationPropertiesFromHandle[('hipMemAllocationProp*', 'prop'), ('hipMemGenericAllocationHandle_t', 'handle')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemGetAllocationPropertiesFromHandle, cast to the type listed here
+#define INIT_hipMemGetAllocationPropertiesFromHandle_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemGetAllocationPropertiesFromHandle.prop = (hipMemAllocationProp*)prop; \
+  cb_data.args.hipMemGetAllocationPropertiesFromHandle.handle = (hipMemGenericAllocationHandle_t)handle; \
+};
+// hipMemGetHandleForAddressRange[('void*', 'handle'), ('hipDeviceptr_t', 'dptr'), ('size_t', 'size'), ('hipMemRangeHandleType', 'handleType'), ('unsigned long long', 'flags')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemGetHandleForAddressRange, cast to the type listed here
+#define INIT_hipMemGetHandleForAddressRange_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemGetHandleForAddressRange.handle = (void*)handle; \
+  cb_data.args.hipMemGetHandleForAddressRange.dptr = (hipDeviceptr_t)dptr; \
+  cb_data.args.hipMemGetHandleForAddressRange.size = (size_t)size; \
+  cb_data.args.hipMemGetHandleForAddressRange.handleType = (hipMemRangeHandleType)handleType; \
+  cb_data.args.hipMemGetHandleForAddressRange.flags = (unsigned long long)flags; \
+};
+// hipMemGetInfo[('size_t*', 'free'), ('size_t*', 'total')] -- copies the identifiers 'free' and 'total' visible where the macro is expanded into cb_data.args.hipMemGetInfo; NOTE(review): 'free' is presumably a local size_t* that shadows the C library free() at the expansion site -- confirm
+#define INIT_hipMemGetInfo_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemGetInfo.free = (size_t*)free; \
+  cb_data.args.hipMemGetInfo.total = (size_t*)total; \
+};
+// hipMemImportFromShareableHandle[('hipMemGenericAllocationHandle_t*', 'handle'), ('void*', 'osHandle'), ('hipMemAllocationHandleType', 'shHandleType')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemImportFromShareableHandle, cast to the type listed here
+#define INIT_hipMemImportFromShareableHandle_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemImportFromShareableHandle.handle = (hipMemGenericAllocationHandle_t*)handle; \
+  cb_data.args.hipMemImportFromShareableHandle.osHandle = (void*)osHandle; \
+  cb_data.args.hipMemImportFromShareableHandle.shHandleType = (hipMemAllocationHandleType)shHandleType; \
+};
+// hipMemMap[('void*', 'ptr'), ('size_t', 'size'), ('size_t', 'offset'), ('hipMemGenericAllocationHandle_t', 'handle'), ('unsigned long long', 'flags')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemMap, cast to the type listed here
+#define INIT_hipMemMap_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemMap.ptr = (void*)ptr; \
+  cb_data.args.hipMemMap.size = (size_t)size; \
+  cb_data.args.hipMemMap.offset = (size_t)offset; \
+  cb_data.args.hipMemMap.handle = (hipMemGenericAllocationHandle_t)handle; \
+  cb_data.args.hipMemMap.flags = (unsigned long long)flags; \
+};
+// hipMemMapArrayAsync[('hipArrayMapInfo*', 'mapInfoList'), ('unsigned int', 'count'), ('hipStream_t', 'stream')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemMapArrayAsync; only the list pointer is stored, its elements are not copied
+#define INIT_hipMemMapArrayAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemMapArrayAsync.mapInfoList = (hipArrayMapInfo*)mapInfoList; \
+  cb_data.args.hipMemMapArrayAsync.count = (unsigned int)count; \
+  cb_data.args.hipMemMapArrayAsync.stream = (hipStream_t)stream; \
+};
+// hipMemPoolCreate[('hipMemPool_t*', 'mem_pool'), ('const hipMemPoolProps*', 'pool_props')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemPoolCreate; 'pool_props' is stored as a pointer only
+#define INIT_hipMemPoolCreate_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemPoolCreate.mem_pool = (hipMemPool_t*)mem_pool; \
+  cb_data.args.hipMemPoolCreate.pool_props = (const hipMemPoolProps*)pool_props; \
+};
+// hipMemPoolDestroy[('hipMemPool_t', 'mem_pool')] -- copies the identifier 'mem_pool' visible where the macro is expanded into cb_data.args.hipMemPoolDestroy.mem_pool
+#define INIT_hipMemPoolDestroy_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemPoolDestroy.mem_pool = (hipMemPool_t)mem_pool; \
+};
+// hipMemPoolExportPointer[('hipMemPoolPtrExportData*', 'export_data'), ('void*', 'dev_ptr')] -- fills cb_data.args.hipMemPoolExportPointer from identifiers visible where the macro is expanded; note the field 'dev_ptr' is read from 'ptr'
+#define INIT_hipMemPoolExportPointer_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemPoolExportPointer.export_data = (hipMemPoolPtrExportData*)export_data; \
+  cb_data.args.hipMemPoolExportPointer.dev_ptr = (void*)ptr; \
+};
+// hipMemPoolExportToShareableHandle[('void*', 'shared_handle'), ('hipMemPool_t', 'mem_pool'), ('hipMemAllocationHandleType', 'handle_type'), ('unsigned int', 'flags')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemPoolExportToShareableHandle, cast to the type listed here
+#define INIT_hipMemPoolExportToShareableHandle_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemPoolExportToShareableHandle.shared_handle = (void*)shared_handle; \
+  cb_data.args.hipMemPoolExportToShareableHandle.mem_pool = (hipMemPool_t)mem_pool; \
+  cb_data.args.hipMemPoolExportToShareableHandle.handle_type = (hipMemAllocationHandleType)handle_type; \
+  cb_data.args.hipMemPoolExportToShareableHandle.flags = (unsigned int)flags; \
+};
+// hipMemPoolGetAccess[('hipMemAccessFlags*', 'flags'), ('hipMemPool_t', 'mem_pool'), ('hipMemLocation*', 'location')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemPoolGetAccess; here 'flags' is a pointer, not a value
+#define INIT_hipMemPoolGetAccess_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemPoolGetAccess.flags = (hipMemAccessFlags*)flags; \
+  cb_data.args.hipMemPoolGetAccess.mem_pool = (hipMemPool_t)mem_pool; \
+  cb_data.args.hipMemPoolGetAccess.location = (hipMemLocation*)location; \
+};
+// hipMemPoolGetAttribute[('hipMemPool_t', 'mem_pool'), ('hipMemPoolAttr', 'attr'), ('void*', 'value')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemPoolGetAttribute, cast to the type listed here
+#define INIT_hipMemPoolGetAttribute_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemPoolGetAttribute.mem_pool = (hipMemPool_t)mem_pool; \
+  cb_data.args.hipMemPoolGetAttribute.attr = (hipMemPoolAttr)attr; \
+  cb_data.args.hipMemPoolGetAttribute.value = (void*)value; \
+};
+// hipMemPoolImportFromShareableHandle[('hipMemPool_t*', 'mem_pool'), ('void*', 'shared_handle'), ('hipMemAllocationHandleType', 'handle_type'), ('unsigned int', 'flags')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemPoolImportFromShareableHandle, cast to the type listed here
+#define INIT_hipMemPoolImportFromShareableHandle_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemPoolImportFromShareableHandle.mem_pool = (hipMemPool_t*)mem_pool; \
+  cb_data.args.hipMemPoolImportFromShareableHandle.shared_handle = (void*)shared_handle; \
+  cb_data.args.hipMemPoolImportFromShareableHandle.handle_type = (hipMemAllocationHandleType)handle_type; \
+  cb_data.args.hipMemPoolImportFromShareableHandle.flags = (unsigned int)flags; \
+};
+// hipMemPoolImportPointer[('void**', 'dev_ptr'), ('hipMemPool_t', 'mem_pool'), ('hipMemPoolPtrExportData*', 'export_data')] -- fills cb_data.args.hipMemPoolImportPointer from identifiers visible where the macro is expanded; note the field 'dev_ptr' is read from 'ptr'
+#define INIT_hipMemPoolImportPointer_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemPoolImportPointer.dev_ptr = (void**)ptr; \
+  cb_data.args.hipMemPoolImportPointer.mem_pool = (hipMemPool_t)mem_pool; \
+  cb_data.args.hipMemPoolImportPointer.export_data = (hipMemPoolPtrExportData*)export_data; \
+};
+// hipMemPoolSetAccess[('hipMemPool_t', 'mem_pool'), ('const hipMemAccessDesc*', 'desc_list'), ('size_t', 'count')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemPoolSetAccess; only the 'desc_list' pointer is stored, its elements are not copied
+#define INIT_hipMemPoolSetAccess_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemPoolSetAccess.mem_pool = (hipMemPool_t)mem_pool; \
+  cb_data.args.hipMemPoolSetAccess.desc_list = (const hipMemAccessDesc*)desc_list; \
+  cb_data.args.hipMemPoolSetAccess.count = (size_t)count; \
+};
+// hipMemPoolSetAttribute[('hipMemPool_t', 'mem_pool'), ('hipMemPoolAttr', 'attr'), ('void*', 'value')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemPoolSetAttribute, cast to the type listed here
+#define INIT_hipMemPoolSetAttribute_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemPoolSetAttribute.mem_pool = (hipMemPool_t)mem_pool; \
+  cb_data.args.hipMemPoolSetAttribute.attr = (hipMemPoolAttr)attr; \
+  cb_data.args.hipMemPoolSetAttribute.value = (void*)value; \
+};
+// hipMemPoolTrimTo[('hipMemPool_t', 'mem_pool'), ('size_t', 'min_bytes_to_hold')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemPoolTrimTo, cast to the type listed here
+#define INIT_hipMemPoolTrimTo_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemPoolTrimTo.mem_pool = (hipMemPool_t)mem_pool; \
+  cb_data.args.hipMemPoolTrimTo.min_bytes_to_hold = (size_t)min_bytes_to_hold; \
+};
+// hipMemPrefetchAsync[('const void*', 'dev_ptr'), ('size_t', 'count'), ('int', 'device'), ('hipStream_t', 'stream')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemPrefetchAsync, cast to the type listed here
+#define INIT_hipMemPrefetchAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemPrefetchAsync.dev_ptr = (const void*)dev_ptr; \
+  cb_data.args.hipMemPrefetchAsync.count = (size_t)count; \
+  cb_data.args.hipMemPrefetchAsync.device = (int)device; \
+  cb_data.args.hipMemPrefetchAsync.stream = (hipStream_t)stream; \
+};
+// hipMemPrefetchAsync_v2[('const void*', 'dev_ptr'), ('size_t', 'count'), ('hipMemLocation', 'location'), ('unsigned int', 'flags'), ('hipStream_t', 'stream')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemPrefetchAsync_v2; 'location' is stored by value
+#define INIT_hipMemPrefetchAsync_v2_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemPrefetchAsync_v2.dev_ptr = (const void*)dev_ptr; \
+  cb_data.args.hipMemPrefetchAsync_v2.count = (size_t)count; \
+  cb_data.args.hipMemPrefetchAsync_v2.location = (hipMemLocation)location; \
+  cb_data.args.hipMemPrefetchAsync_v2.flags = (unsigned int)flags; \
+  cb_data.args.hipMemPrefetchAsync_v2.stream = (hipStream_t)stream; \
+};
+// hipMemPtrGetInfo[('void*', 'ptr'), ('size_t*', 'size')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemPtrGetInfo; here 'size' is a pointer, not a value
+#define INIT_hipMemPtrGetInfo_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemPtrGetInfo.ptr = (void*)ptr; \
+  cb_data.args.hipMemPtrGetInfo.size = (size_t*)size; \
+};
+// hipMemRangeGetAttribute[('void*', 'data'), ('size_t', 'data_size'), ('hipMemRangeAttribute', 'attribute'), ('const void*', 'dev_ptr'), ('size_t', 'count')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemRangeGetAttribute, cast to the type listed here
+#define INIT_hipMemRangeGetAttribute_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemRangeGetAttribute.data = (void*)data; \
+  cb_data.args.hipMemRangeGetAttribute.data_size = (size_t)data_size; \
+  cb_data.args.hipMemRangeGetAttribute.attribute = (hipMemRangeAttribute)attribute; \
+  cb_data.args.hipMemRangeGetAttribute.dev_ptr = (const void*)dev_ptr; \
+  cb_data.args.hipMemRangeGetAttribute.count = (size_t)count; \
+};
+// hipMemRangeGetAttributes[('void**', 'data'), ('size_t*', 'data_sizes'), ('hipMemRangeAttribute*', 'attributes'), ('size_t', 'num_attributes'), ('const void*', 'dev_ptr'), ('size_t', 'count')] -- copies each same-named identifier visible where the macro is expanded into cb_data.args.hipMemRangeGetAttributes; the three array arguments are stored as pointers only
+#define INIT_hipMemRangeGetAttributes_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemRangeGetAttributes.data = (void**)data; \
+  cb_data.args.hipMemRangeGetAttributes.data_sizes = (size_t*)data_sizes; \
+  cb_data.args.hipMemRangeGetAttributes.attributes = (hipMemRangeAttribute*)attributes; \
+  cb_data.args.hipMemRangeGetAttributes.num_attributes = (size_t)num_attributes; \
+  cb_data.args.hipMemRangeGetAttributes.dev_ptr = (const void*)dev_ptr; \
+  cb_data.args.hipMemRangeGetAttributes.count = (size_t)count; \
+};
+// hipMemRelease[('hipMemGenericAllocationHandle_t', 'handle')] -- copies the identifier 'handle' visible where the macro is expanded into cb_data.args.hipMemRelease.handle
+#define INIT_hipMemRelease_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemRelease.handle = (hipMemGenericAllocationHandle_t)handle; \
+};
+// hipMemRetainAllocationHandle[('hipMemGenericAllocationHandle_t*', 'handle'), ('void*', 'addr')]
+// Stores handle and addr in cb_data.args.hipMemRetainAllocationHandle.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemRetainAllocationHandle_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemRetainAllocationHandle.handle = (hipMemGenericAllocationHandle_t*)handle; \
+  cb_data.args.hipMemRetainAllocationHandle.addr = (void*)addr; \
+};
+// hipMemSetAccess[('void*', 'ptr'), ('size_t', 'size'), ('const hipMemAccessDesc*', 'desc'), ('size_t', 'count')]
+// Stores ptr, size, desc and count in cb_data.args.hipMemSetAccess.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemSetAccess_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemSetAccess.ptr = (void*)ptr; \
+  cb_data.args.hipMemSetAccess.size = (size_t)size; \
+  cb_data.args.hipMemSetAccess.desc = (const hipMemAccessDesc*)desc; \
+  cb_data.args.hipMemSetAccess.count = (size_t)count; \
+};
+// hipMemUnmap[('void*', 'ptr'), ('size_t', 'size')]
+// Stores ptr and size in cb_data.args.hipMemUnmap.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemUnmap_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemUnmap.ptr = (void*)ptr; \
+  cb_data.args.hipMemUnmap.size = (size_t)size; \
+};
+// hipMemcpy[('void*', 'dst'), ('const void*', 'src'), ('size_t', 'sizeBytes'), ('hipMemcpyKind', 'kind')]
+// Stores dst, src, sizeBytes and kind in cb_data.args.hipMemcpy.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpy_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpy.dst = (void*)dst; \
+  cb_data.args.hipMemcpy.src = (const void*)src; \
+  cb_data.args.hipMemcpy.sizeBytes = (size_t)sizeBytes; \
+  cb_data.args.hipMemcpy.kind = (hipMemcpyKind)kind; \
+};
+// hipMemcpy2D[('void*', 'dst'), ('size_t', 'dpitch'), ('const void*', 'src'), ('size_t', 'spitch'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind')]
+// Stores dst, dpitch, src, spitch, width, height and kind in cb_data.args.hipMemcpy2D.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpy2D_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpy2D.dst = (void*)dst; \
+  cb_data.args.hipMemcpy2D.dpitch = (size_t)dpitch; \
+  cb_data.args.hipMemcpy2D.src = (const void*)src; \
+  cb_data.args.hipMemcpy2D.spitch = (size_t)spitch; \
+  cb_data.args.hipMemcpy2D.width = (size_t)width; \
+  cb_data.args.hipMemcpy2D.height = (size_t)height; \
+  cb_data.args.hipMemcpy2D.kind = (hipMemcpyKind)kind; \
+};
+// hipMemcpy2DArrayToArray[('hipArray_t', 'dst'), ('size_t', 'wOffsetDst'), ('size_t', 'hOffsetDst'), ('hipArray_const_t', 'src'), ('size_t', 'wOffsetSrc'), ('size_t', 'hOffsetSrc'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind')]
+// Stores the destination array and offsets, source array and offsets, width, height and kind
+// in cb_data.args.hipMemcpy2DArrayToArray.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpy2DArrayToArray_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpy2DArrayToArray.dst = (hipArray_t)dst; \
+  cb_data.args.hipMemcpy2DArrayToArray.wOffsetDst = (size_t)wOffsetDst; \
+  cb_data.args.hipMemcpy2DArrayToArray.hOffsetDst = (size_t)hOffsetDst; \
+  cb_data.args.hipMemcpy2DArrayToArray.src = (hipArray_const_t)src; \
+  cb_data.args.hipMemcpy2DArrayToArray.wOffsetSrc = (size_t)wOffsetSrc; \
+  cb_data.args.hipMemcpy2DArrayToArray.hOffsetSrc = (size_t)hOffsetSrc; \
+  cb_data.args.hipMemcpy2DArrayToArray.width = (size_t)width; \
+  cb_data.args.hipMemcpy2DArrayToArray.height = (size_t)height; \
+  cb_data.args.hipMemcpy2DArrayToArray.kind = (hipMemcpyKind)kind; \
+};
+// hipMemcpy2DAsync[('void*', 'dst'), ('size_t', 'dpitch'), ('const void*', 'src'), ('size_t', 'spitch'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]
+// Stores dst, dpitch, src, spitch, width, height, kind and stream in cb_data.args.hipMemcpy2DAsync.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpy2DAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpy2DAsync.dst = (void*)dst; \
+  cb_data.args.hipMemcpy2DAsync.dpitch = (size_t)dpitch; \
+  cb_data.args.hipMemcpy2DAsync.src = (const void*)src; \
+  cb_data.args.hipMemcpy2DAsync.spitch = (size_t)spitch; \
+  cb_data.args.hipMemcpy2DAsync.width = (size_t)width; \
+  cb_data.args.hipMemcpy2DAsync.height = (size_t)height; \
+  cb_data.args.hipMemcpy2DAsync.kind = (hipMemcpyKind)kind; \
+  cb_data.args.hipMemcpy2DAsync.stream = (hipStream_t)stream; \
+};
+// hipMemcpy2DFromArray[('void*', 'dst'), ('size_t', 'dpitch'), ('hipArray_const_t', 'src'), ('size_t', 'wOffset'), ('size_t', 'hOffset'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind')]
+// Stores dst, dpitch, src, the two source offsets, width, height and kind in
+// cb_data.args.hipMemcpy2DFromArray.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+// NOTE(review): the wOffset field is read from wOffsetSrc while the hOffset field is read from
+// hOffset. The two source names follow different conventions - confirm both exist at the expansion site.
+#define INIT_hipMemcpy2DFromArray_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpy2DFromArray.dst = (void*)dst; \
+  cb_data.args.hipMemcpy2DFromArray.dpitch = (size_t)dpitch; \
+  cb_data.args.hipMemcpy2DFromArray.src = (hipArray_const_t)src; \
+  cb_data.args.hipMemcpy2DFromArray.wOffset = (size_t)wOffsetSrc; \
+  cb_data.args.hipMemcpy2DFromArray.hOffset = (size_t)hOffset; \
+  cb_data.args.hipMemcpy2DFromArray.width = (size_t)width; \
+  cb_data.args.hipMemcpy2DFromArray.height = (size_t)height; \
+  cb_data.args.hipMemcpy2DFromArray.kind = (hipMemcpyKind)kind; \
+};
+// hipMemcpy2DFromArrayAsync[('void*', 'dst'), ('size_t', 'dpitch'), ('hipArray_const_t', 'src'), ('size_t', 'wOffset'), ('size_t', 'hOffset'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]
+// Stores dst, dpitch, src, the two source offsets, width, height, kind and stream in
+// cb_data.args.hipMemcpy2DFromArrayAsync.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+// NOTE(review): the wOffset/hOffset fields are read from wOffsetSrc/hOffsetSrc, which differ from
+// the names in the signature comment - confirm against the expansion site.
+#define INIT_hipMemcpy2DFromArrayAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpy2DFromArrayAsync.dst = (void*)dst; \
+  cb_data.args.hipMemcpy2DFromArrayAsync.dpitch = (size_t)dpitch; \
+  cb_data.args.hipMemcpy2DFromArrayAsync.src = (hipArray_const_t)src; \
+  cb_data.args.hipMemcpy2DFromArrayAsync.wOffset = (size_t)wOffsetSrc; \
+  cb_data.args.hipMemcpy2DFromArrayAsync.hOffset = (size_t)hOffsetSrc; \
+  cb_data.args.hipMemcpy2DFromArrayAsync.width = (size_t)width; \
+  cb_data.args.hipMemcpy2DFromArrayAsync.height = (size_t)height; \
+  cb_data.args.hipMemcpy2DFromArrayAsync.kind = (hipMemcpyKind)kind; \
+  cb_data.args.hipMemcpy2DFromArrayAsync.stream = (hipStream_t)stream; \
+};
+// hipMemcpy2DToArray[('hipArray_t', 'dst'), ('size_t', 'wOffset'), ('size_t', 'hOffset'), ('const void*', 'src'), ('size_t', 'spitch'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind')]
+// Stores dst, wOffset, hOffset, src, spitch, width, height and kind in cb_data.args.hipMemcpy2DToArray.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpy2DToArray_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpy2DToArray.dst = (hipArray_t)dst; \
+  cb_data.args.hipMemcpy2DToArray.wOffset = (size_t)wOffset; \
+  cb_data.args.hipMemcpy2DToArray.hOffset = (size_t)hOffset; \
+  cb_data.args.hipMemcpy2DToArray.src = (const void*)src; \
+  cb_data.args.hipMemcpy2DToArray.spitch = (size_t)spitch; \
+  cb_data.args.hipMemcpy2DToArray.width = (size_t)width; \
+  cb_data.args.hipMemcpy2DToArray.height = (size_t)height; \
+  cb_data.args.hipMemcpy2DToArray.kind = (hipMemcpyKind)kind; \
+};
+// hipMemcpy2DToArrayAsync[('hipArray_t', 'dst'), ('size_t', 'wOffset'), ('size_t', 'hOffset'), ('const void*', 'src'), ('size_t', 'spitch'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]
+// Stores dst, wOffset, hOffset, src, spitch, width, height, kind and stream in
+// cb_data.args.hipMemcpy2DToArrayAsync.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpy2DToArrayAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpy2DToArrayAsync.dst = (hipArray_t)dst; \
+  cb_data.args.hipMemcpy2DToArrayAsync.wOffset = (size_t)wOffset; \
+  cb_data.args.hipMemcpy2DToArrayAsync.hOffset = (size_t)hOffset; \
+  cb_data.args.hipMemcpy2DToArrayAsync.src = (const void*)src; \
+  cb_data.args.hipMemcpy2DToArrayAsync.spitch = (size_t)spitch; \
+  cb_data.args.hipMemcpy2DToArrayAsync.width = (size_t)width; \
+  cb_data.args.hipMemcpy2DToArrayAsync.height = (size_t)height; \
+  cb_data.args.hipMemcpy2DToArrayAsync.kind = (hipMemcpyKind)kind; \
+  cb_data.args.hipMemcpy2DToArrayAsync.stream = (hipStream_t)stream; \
+};
+// hipMemcpy3D[('const hipMemcpy3DParms*', 'p')]
+// Stores the parameter-struct pointer p in cb_data.args.hipMemcpy3D; the struct itself is not copied here.
+// The right-hand name is not a macro parameter; it is resolved where the macro is expanded.
+#define INIT_hipMemcpy3D_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpy3D.p = (const hipMemcpy3DParms*)p; \
+};
+// hipMemcpy3DAsync[('const hipMemcpy3DParms*', 'p'), ('hipStream_t', 'stream')]
+// Stores the parameter-struct pointer p and stream in cb_data.args.hipMemcpy3DAsync.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpy3DAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpy3DAsync.p = (const hipMemcpy3DParms*)p; \
+  cb_data.args.hipMemcpy3DAsync.stream = (hipStream_t)stream; \
+};
+// hipMemcpy3DBatchAsync[('size_t', 'numOps'), ('hipMemcpy3DBatchOp*', 'opList'), ('size_t*', 'failIdx'), ('unsigned long long', 'flags'), ('hipStream_t', 'stream')]
+// Stores numOps, opList, failIdx, flags and stream in cb_data.args.hipMemcpy3DBatchAsync.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpy3DBatchAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpy3DBatchAsync.numOps = (size_t)numOps; \
+  cb_data.args.hipMemcpy3DBatchAsync.opList = (hipMemcpy3DBatchOp*)opList; \
+  cb_data.args.hipMemcpy3DBatchAsync.failIdx = (size_t*)failIdx; \
+  cb_data.args.hipMemcpy3DBatchAsync.flags = (unsigned long long)flags; \
+  cb_data.args.hipMemcpy3DBatchAsync.stream = (hipStream_t)stream; \
+};
+// hipMemcpy3DPeer[('hipMemcpy3DPeerParms*', 'p')]
+// Stores the parameter-struct pointer p in cb_data.args.hipMemcpy3DPeer.
+// The right-hand name is not a macro parameter; it is resolved where the macro is expanded.
+#define INIT_hipMemcpy3DPeer_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpy3DPeer.p = (hipMemcpy3DPeerParms*)p; \
+};
+// hipMemcpy3DPeerAsync[('hipMemcpy3DPeerParms*', 'p'), ('hipStream_t', 'stream')]
+// Stores the parameter-struct pointer p and stream in cb_data.args.hipMemcpy3DPeerAsync.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpy3DPeerAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpy3DPeerAsync.p = (hipMemcpy3DPeerParms*)p; \
+  cb_data.args.hipMemcpy3DPeerAsync.stream = (hipStream_t)stream; \
+};
+// hipMemcpyAsync[('void*', 'dst'), ('const void*', 'src'), ('size_t', 'sizeBytes'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]
+// Stores dst, src, sizeBytes, kind and stream in cb_data.args.hipMemcpyAsync.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpyAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyAsync.dst = (void*)dst; \
+  cb_data.args.hipMemcpyAsync.src = (const void*)src; \
+  cb_data.args.hipMemcpyAsync.sizeBytes = (size_t)sizeBytes; \
+  cb_data.args.hipMemcpyAsync.kind = (hipMemcpyKind)kind; \
+  cb_data.args.hipMemcpyAsync.stream = (hipStream_t)stream; \
+};
+// hipMemcpyAtoA[('hipArray_t', 'dstArray'), ('size_t', 'dstOffset'), ('hipArray_t', 'srcArray'), ('size_t', 'srcOffset'), ('size_t', 'ByteCount')]
+// Stores dstArray, dstOffset, srcArray, srcOffset and ByteCount in cb_data.args.hipMemcpyAtoA.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpyAtoA_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyAtoA.dstArray = (hipArray_t)dstArray; \
+  cb_data.args.hipMemcpyAtoA.dstOffset = (size_t)dstOffset; \
+  cb_data.args.hipMemcpyAtoA.srcArray = (hipArray_t)srcArray; \
+  cb_data.args.hipMemcpyAtoA.srcOffset = (size_t)srcOffset; \
+  cb_data.args.hipMemcpyAtoA.ByteCount = (size_t)ByteCount; \
+};
+// hipMemcpyAtoD[('hipDeviceptr_t', 'dstDevice'), ('hipArray_t', 'srcArray'), ('size_t', 'srcOffset'), ('size_t', 'ByteCount')]
+// Stores dstDevice, srcArray, srcOffset and ByteCount in cb_data.args.hipMemcpyAtoD.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpyAtoD_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyAtoD.dstDevice = (hipDeviceptr_t)dstDevice; \
+  cb_data.args.hipMemcpyAtoD.srcArray = (hipArray_t)srcArray; \
+  cb_data.args.hipMemcpyAtoD.srcOffset = (size_t)srcOffset; \
+  cb_data.args.hipMemcpyAtoD.ByteCount = (size_t)ByteCount; \
+};
+// hipMemcpyAtoH[('void*', 'dst'), ('hipArray_t', 'srcArray'), ('size_t', 'srcOffset'), ('size_t', 'count')]
+// Stores the destination pointer, srcArray, srcOffset and byte count in cb_data.args.hipMemcpyAtoH.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+// NOTE(review): the dst/count fields are read from dstHost/ByteCount, which differ from the names
+// in the signature comment - confirm against the expansion site.
+#define INIT_hipMemcpyAtoH_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyAtoH.dst = (void*)dstHost; \
+  cb_data.args.hipMemcpyAtoH.srcArray = (hipArray_t)srcArray; \
+  cb_data.args.hipMemcpyAtoH.srcOffset = (size_t)srcOffset; \
+  cb_data.args.hipMemcpyAtoH.count = (size_t)ByteCount; \
+};
+// hipMemcpyAtoHAsync[('void*', 'dstHost'), ('hipArray_t', 'srcArray'), ('size_t', 'srcOffset'), ('size_t', 'ByteCount'), ('hipStream_t', 'stream')]
+// Stores dstHost, srcArray, srcOffset, ByteCount and stream in cb_data.args.hipMemcpyAtoHAsync.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpyAtoHAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyAtoHAsync.dstHost = (void*)dstHost; \
+  cb_data.args.hipMemcpyAtoHAsync.srcArray = (hipArray_t)srcArray; \
+  cb_data.args.hipMemcpyAtoHAsync.srcOffset = (size_t)srcOffset; \
+  cb_data.args.hipMemcpyAtoHAsync.ByteCount = (size_t)ByteCount; \
+  cb_data.args.hipMemcpyAtoHAsync.stream = (hipStream_t)stream; \
+};
+// hipMemcpyBatchAsync[('void**', 'dsts'), ('void**', 'srcs'), ('size_t*', 'sizes'), ('size_t', 'count'), ('hipMemcpyAttributes*', 'attrs'), ('size_t*', 'attrsIdxs'), ('size_t', 'numAttrs'), ('size_t*', 'failIdx'), ('hipStream_t', 'stream')]
+// Stores dsts, srcs, sizes, count, attrs, attrsIdxs, numAttrs, failIdx and stream in
+// cb_data.args.hipMemcpyBatchAsync. Only the pointer values are stored, not what they point to.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpyBatchAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyBatchAsync.dsts = (void**)dsts; \
+  cb_data.args.hipMemcpyBatchAsync.srcs = (void**)srcs; \
+  cb_data.args.hipMemcpyBatchAsync.sizes = (size_t*)sizes; \
+  cb_data.args.hipMemcpyBatchAsync.count = (size_t)count; \
+  cb_data.args.hipMemcpyBatchAsync.attrs = (hipMemcpyAttributes*)attrs; \
+  cb_data.args.hipMemcpyBatchAsync.attrsIdxs = (size_t*)attrsIdxs; \
+  cb_data.args.hipMemcpyBatchAsync.numAttrs = (size_t)numAttrs; \
+  cb_data.args.hipMemcpyBatchAsync.failIdx = (size_t*)failIdx; \
+  cb_data.args.hipMemcpyBatchAsync.stream = (hipStream_t)stream; \
+};
+// hipMemcpyDtoA[('hipArray_t', 'dstArray'), ('size_t', 'dstOffset'), ('hipDeviceptr_t', 'srcDevice'), ('size_t', 'ByteCount')]
+// Stores dstArray, dstOffset, srcDevice and ByteCount in cb_data.args.hipMemcpyDtoA.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpyDtoA_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyDtoA.dstArray = (hipArray_t)dstArray; \
+  cb_data.args.hipMemcpyDtoA.dstOffset = (size_t)dstOffset; \
+  cb_data.args.hipMemcpyDtoA.srcDevice = (hipDeviceptr_t)srcDevice; \
+  cb_data.args.hipMemcpyDtoA.ByteCount = (size_t)ByteCount; \
+};
+// hipMemcpyDtoD[('hipDeviceptr_t', 'dst'), ('hipDeviceptr_t', 'src'), ('size_t', 'sizeBytes')]
+// Stores the destination pointer, source pointer and byte count in cb_data.args.hipMemcpyDtoD.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+// NOTE(review): the dst/src/sizeBytes fields are read from dstDevice/srcDevice/ByteCount, which
+// differ from the names in the signature comment - confirm against the expansion site.
+#define INIT_hipMemcpyDtoD_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyDtoD.dst = (hipDeviceptr_t)dstDevice; \
+  cb_data.args.hipMemcpyDtoD.src = (hipDeviceptr_t)srcDevice; \
+  cb_data.args.hipMemcpyDtoD.sizeBytes = (size_t)ByteCount; \
+};
+// hipMemcpyDtoDAsync[('hipDeviceptr_t', 'dst'), ('hipDeviceptr_t', 'src'), ('size_t', 'sizeBytes'), ('hipStream_t', 'stream')]
+// Stores the destination pointer, source pointer, byte count and stream in
+// cb_data.args.hipMemcpyDtoDAsync.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+// NOTE(review): the dst/src/sizeBytes fields are read from dstDevice/srcDevice/ByteCount, which
+// differ from the names in the signature comment - confirm against the expansion site.
+#define INIT_hipMemcpyDtoDAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyDtoDAsync.dst = (hipDeviceptr_t)dstDevice; \
+  cb_data.args.hipMemcpyDtoDAsync.src = (hipDeviceptr_t)srcDevice; \
+  cb_data.args.hipMemcpyDtoDAsync.sizeBytes = (size_t)ByteCount; \
+  cb_data.args.hipMemcpyDtoDAsync.stream = (hipStream_t)stream; \
+};
+// hipMemcpyDtoH[('void*', 'dst'), ('hipDeviceptr_t', 'src'), ('size_t', 'sizeBytes')]
+// Stores the destination pointer, source pointer and byte count in cb_data.args.hipMemcpyDtoH.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+// NOTE(review): the dst/src/sizeBytes fields are read from dstHost/srcDevice/ByteCount, which
+// differ from the names in the signature comment - confirm against the expansion site.
+#define INIT_hipMemcpyDtoH_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyDtoH.dst = (void*)dstHost; \
+  cb_data.args.hipMemcpyDtoH.src = (hipDeviceptr_t)srcDevice; \
+  cb_data.args.hipMemcpyDtoH.sizeBytes = (size_t)ByteCount; \
+};
+// hipMemcpyDtoHAsync[('void*', 'dst'), ('hipDeviceptr_t', 'src'), ('size_t', 'sizeBytes'), ('hipStream_t', 'stream')]
+// Stores the destination pointer, source pointer, byte count and stream in
+// cb_data.args.hipMemcpyDtoHAsync.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+// NOTE(review): the dst/src/sizeBytes fields are read from dstHost/srcDevice/ByteCount, which
+// differ from the names in the signature comment - confirm against the expansion site.
+#define INIT_hipMemcpyDtoHAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyDtoHAsync.dst = (void*)dstHost; \
+  cb_data.args.hipMemcpyDtoHAsync.src = (hipDeviceptr_t)srcDevice; \
+  cb_data.args.hipMemcpyDtoHAsync.sizeBytes = (size_t)ByteCount; \
+  cb_data.args.hipMemcpyDtoHAsync.stream = (hipStream_t)stream; \
+};
+// hipMemcpyFromArray[('void*', 'dst'), ('hipArray_const_t', 'srcArray'), ('size_t', 'wOffset'), ('size_t', 'hOffset'), ('size_t', 'count'), ('hipMemcpyKind', 'kind')]
+// Stores dst, the source array, the two source offsets, count and kind in
+// cb_data.args.hipMemcpyFromArray.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+// NOTE(review): the srcArray field is read from src and wOffset from wOffsetSrc, while hOffset is
+// read from hOffset. The source names follow different conventions - confirm all exist at the
+// expansion site.
+#define INIT_hipMemcpyFromArray_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyFromArray.dst = (void*)dst; \
+  cb_data.args.hipMemcpyFromArray.srcArray = (hipArray_const_t)src; \
+  cb_data.args.hipMemcpyFromArray.wOffset = (size_t)wOffsetSrc; \
+  cb_data.args.hipMemcpyFromArray.hOffset = (size_t)hOffset; \
+  cb_data.args.hipMemcpyFromArray.count = (size_t)count; \
+  cb_data.args.hipMemcpyFromArray.kind = (hipMemcpyKind)kind; \
+};
+// hipMemcpyFromSymbol[('void*', 'dst'), ('const void*', 'symbol'), ('size_t', 'sizeBytes'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind')]
+// Stores dst, symbol, sizeBytes, offset and kind in cb_data.args.hipMemcpyFromSymbol.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpyFromSymbol_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyFromSymbol.dst = (void*)dst; \
+  cb_data.args.hipMemcpyFromSymbol.symbol = (const void*)symbol; \
+  cb_data.args.hipMemcpyFromSymbol.sizeBytes = (size_t)sizeBytes; \
+  cb_data.args.hipMemcpyFromSymbol.offset = (size_t)offset; \
+  cb_data.args.hipMemcpyFromSymbol.kind = (hipMemcpyKind)kind; \
+};
+// hipMemcpyFromSymbolAsync[('void*', 'dst'), ('const void*', 'symbol'), ('size_t', 'sizeBytes'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]
+// Stores dst, symbol, sizeBytes, offset, kind and stream in cb_data.args.hipMemcpyFromSymbolAsync.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpyFromSymbolAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyFromSymbolAsync.dst = (void*)dst; \
+  cb_data.args.hipMemcpyFromSymbolAsync.symbol = (const void*)symbol; \
+  cb_data.args.hipMemcpyFromSymbolAsync.sizeBytes = (size_t)sizeBytes; \
+  cb_data.args.hipMemcpyFromSymbolAsync.offset = (size_t)offset; \
+  cb_data.args.hipMemcpyFromSymbolAsync.kind = (hipMemcpyKind)kind; \
+  cb_data.args.hipMemcpyFromSymbolAsync.stream = (hipStream_t)stream; \
+};
+// hipMemcpyHtoA[('hipArray_t', 'dstArray'), ('size_t', 'dstOffset'), ('const void*', 'srcHost'), ('size_t', 'count')]
+// Stores dstArray, dstOffset, srcHost and the byte count in cb_data.args.hipMemcpyHtoA.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+// NOTE(review): the count field is read from ByteCount, which differs from the name in the
+// signature comment - confirm against the expansion site.
+#define INIT_hipMemcpyHtoA_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyHtoA.dstArray = (hipArray_t)dstArray; \
+  cb_data.args.hipMemcpyHtoA.dstOffset = (size_t)dstOffset; \
+  cb_data.args.hipMemcpyHtoA.srcHost = (const void*)srcHost; \
+  cb_data.args.hipMemcpyHtoA.count = (size_t)ByteCount; \
+};
+// hipMemcpyHtoAAsync[('hipArray_t', 'dstArray'), ('size_t', 'dstOffset'), ('const void*', 'srcHost'), ('size_t', 'ByteCount'), ('hipStream_t', 'stream')]
+// Stores dstArray, dstOffset, srcHost, ByteCount and stream in cb_data.args.hipMemcpyHtoAAsync.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpyHtoAAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyHtoAAsync.dstArray = (hipArray_t)dstArray; \
+  cb_data.args.hipMemcpyHtoAAsync.dstOffset = (size_t)dstOffset; \
+  cb_data.args.hipMemcpyHtoAAsync.srcHost = (const void*)srcHost; \
+  cb_data.args.hipMemcpyHtoAAsync.ByteCount = (size_t)ByteCount; \
+  cb_data.args.hipMemcpyHtoAAsync.stream = (hipStream_t)stream; \
+};
+// hipMemcpyHtoD[('hipDeviceptr_t', 'dst'), ('const void*', 'src'), ('size_t', 'sizeBytes')]
+// Stores the destination pointer, source pointer and byte count in cb_data.args.hipMemcpyHtoD.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+// NOTE(review): the dst/src/sizeBytes fields are read from dstDevice/srcHost/ByteCount, which
+// differ from the names in the signature comment - confirm against the expansion site.
+#define INIT_hipMemcpyHtoD_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyHtoD.dst = (hipDeviceptr_t)dstDevice; \
+  cb_data.args.hipMemcpyHtoD.src = (const void*)srcHost; \
+  cb_data.args.hipMemcpyHtoD.sizeBytes = (size_t)ByteCount; \
+};
+// hipMemcpyHtoDAsync[('hipDeviceptr_t', 'dst'), ('const void*', 'src'), ('size_t', 'sizeBytes'), ('hipStream_t', 'stream')]
+// Stores the destination pointer, source pointer, byte count and stream in
+// cb_data.args.hipMemcpyHtoDAsync.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+// NOTE(review): the dst/src/sizeBytes fields are read from dstDevice/srcHost/ByteCount, which
+// differ from the names in the signature comment - confirm against the expansion site.
+#define INIT_hipMemcpyHtoDAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyHtoDAsync.dst = (hipDeviceptr_t)dstDevice; \
+  cb_data.args.hipMemcpyHtoDAsync.src = (const void*)srcHost; \
+  cb_data.args.hipMemcpyHtoDAsync.sizeBytes = (size_t)ByteCount; \
+  cb_data.args.hipMemcpyHtoDAsync.stream = (hipStream_t)stream; \
+};
+// hipMemcpyParam2D[('const hip_Memcpy2D*', 'pCopy')]
+// Stores the descriptor pointer pCopy in cb_data.args.hipMemcpyParam2D; the descriptor is not copied here.
+// The right-hand name is not a macro parameter; it is resolved where the macro is expanded.
+#define INIT_hipMemcpyParam2D_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyParam2D.pCopy = (const hip_Memcpy2D*)pCopy; \
+};
+// hipMemcpyParam2DAsync[('const hip_Memcpy2D*', 'pCopy'), ('hipStream_t', 'stream')]
+// Stores the descriptor pointer pCopy and stream in cb_data.args.hipMemcpyParam2DAsync.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpyParam2DAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyParam2DAsync.pCopy = (const hip_Memcpy2D*)pCopy; \
+  cb_data.args.hipMemcpyParam2DAsync.stream = (hipStream_t)stream; \
+};
+// hipMemcpyPeer[('void*', 'dst'), ('int', 'dstDeviceId'), ('const void*', 'src'), ('int', 'srcDeviceId'), ('size_t', 'sizeBytes')]
+// Stores dst, the destination device id, src, the source device id and sizeBytes in
+// cb_data.args.hipMemcpyPeer.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+// NOTE(review): the dstDeviceId/srcDeviceId fields are read from dstDevice/srcDevice, which differ
+// from the names in the signature comment - confirm against the expansion site.
+#define INIT_hipMemcpyPeer_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyPeer.dst = (void*)dst; \
+  cb_data.args.hipMemcpyPeer.dstDeviceId = (int)dstDevice; \
+  cb_data.args.hipMemcpyPeer.src = (const void*)src; \
+  cb_data.args.hipMemcpyPeer.srcDeviceId = (int)srcDevice; \
+  cb_data.args.hipMemcpyPeer.sizeBytes = (size_t)sizeBytes; \
+};
+// hipMemcpyPeerAsync[('void*', 'dst'), ('int', 'dstDeviceId'), ('const void*', 'src'), ('int', 'srcDevice'), ('size_t', 'sizeBytes'), ('hipStream_t', 'stream')]
+// Stores dst, the destination device id, src, srcDevice, sizeBytes and stream in
+// cb_data.args.hipMemcpyPeerAsync.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+// NOTE(review): the dstDeviceId field is read from dstDevice, which differs from the name in the
+// signature comment - confirm against the expansion site.
+#define INIT_hipMemcpyPeerAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyPeerAsync.dst = (void*)dst; \
+  cb_data.args.hipMemcpyPeerAsync.dstDeviceId = (int)dstDevice; \
+  cb_data.args.hipMemcpyPeerAsync.src = (const void*)src; \
+  cb_data.args.hipMemcpyPeerAsync.srcDevice = (int)srcDevice; \
+  cb_data.args.hipMemcpyPeerAsync.sizeBytes = (size_t)sizeBytes; \
+  cb_data.args.hipMemcpyPeerAsync.stream = (hipStream_t)stream; \
+};
+// hipMemcpyToArray[('hipArray_t', 'dst'), ('size_t', 'wOffset'), ('size_t', 'hOffset'), ('const void*', 'src'), ('size_t', 'count'), ('hipMemcpyKind', 'kind')]
+// Stores dst, wOffset, hOffset, src, count and kind in cb_data.args.hipMemcpyToArray.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpyToArray_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyToArray.dst = (hipArray_t)dst; \
+  cb_data.args.hipMemcpyToArray.wOffset = (size_t)wOffset; \
+  cb_data.args.hipMemcpyToArray.hOffset = (size_t)hOffset; \
+  cb_data.args.hipMemcpyToArray.src = (const void*)src; \
+  cb_data.args.hipMemcpyToArray.count = (size_t)count; \
+  cb_data.args.hipMemcpyToArray.kind = (hipMemcpyKind)kind; \
+};
+// hipMemcpyToSymbol[('const void*', 'symbol'), ('const void*', 'src'), ('size_t', 'sizeBytes'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind')]
+// Stores symbol, src, sizeBytes, offset and kind in cb_data.args.hipMemcpyToSymbol.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpyToSymbol_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyToSymbol.symbol = (const void*)symbol; \
+  cb_data.args.hipMemcpyToSymbol.src = (const void*)src; \
+  cb_data.args.hipMemcpyToSymbol.sizeBytes = (size_t)sizeBytes; \
+  cb_data.args.hipMemcpyToSymbol.offset = (size_t)offset; \
+  cb_data.args.hipMemcpyToSymbol.kind = (hipMemcpyKind)kind; \
+};
+// hipMemcpyToSymbolAsync[('const void*', 'symbol'), ('const void*', 'src'), ('size_t', 'sizeBytes'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]
+// Stores symbol, src, sizeBytes, offset, kind and stream in cb_data.args.hipMemcpyToSymbolAsync.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpyToSymbolAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyToSymbolAsync.symbol = (const void*)symbol; \
+  cb_data.args.hipMemcpyToSymbolAsync.src = (const void*)src; \
+  cb_data.args.hipMemcpyToSymbolAsync.sizeBytes = (size_t)sizeBytes; \
+  cb_data.args.hipMemcpyToSymbolAsync.offset = (size_t)offset; \
+  cb_data.args.hipMemcpyToSymbolAsync.kind = (hipMemcpyKind)kind; \
+  cb_data.args.hipMemcpyToSymbolAsync.stream = (hipStream_t)stream; \
+};
+// hipMemcpyWithStream[('void*', 'dst'), ('const void*', 'src'), ('size_t', 'sizeBytes'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]
+// Stores dst, src, sizeBytes, kind and stream in cb_data.args.hipMemcpyWithStream.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemcpyWithStream_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemcpyWithStream.dst = (void*)dst; \
+  cb_data.args.hipMemcpyWithStream.src = (const void*)src; \
+  cb_data.args.hipMemcpyWithStream.sizeBytes = (size_t)sizeBytes; \
+  cb_data.args.hipMemcpyWithStream.kind = (hipMemcpyKind)kind; \
+  cb_data.args.hipMemcpyWithStream.stream = (hipStream_t)stream; \
+};
+// hipMemset[('void*', 'dst'), ('int', 'value'), ('size_t', 'sizeBytes')]
+// Stores dst, value and sizeBytes in cb_data.args.hipMemset.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemset_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemset.dst = (void*)dst; \
+  cb_data.args.hipMemset.value = (int)value; \
+  cb_data.args.hipMemset.sizeBytes = (size_t)sizeBytes; \
+};
+// hipMemset2D[('void*', 'dst'), ('size_t', 'pitch'), ('int', 'value'), ('size_t', 'width'), ('size_t', 'height')]
+// Stores dst, pitch, value, width and height in cb_data.args.hipMemset2D.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemset2D_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemset2D.dst = (void*)dst; \
+  cb_data.args.hipMemset2D.pitch = (size_t)pitch; \
+  cb_data.args.hipMemset2D.value = (int)value; \
+  cb_data.args.hipMemset2D.width = (size_t)width; \
+  cb_data.args.hipMemset2D.height = (size_t)height; \
+};
+// hipMemset2DAsync[('void*', 'dst'), ('size_t', 'pitch'), ('int', 'value'), ('size_t', 'width'), ('size_t', 'height'), ('hipStream_t', 'stream')]
+// Stores dst, pitch, value, width, height and stream in cb_data.args.hipMemset2DAsync.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemset2DAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemset2DAsync.dst = (void*)dst; \
+  cb_data.args.hipMemset2DAsync.pitch = (size_t)pitch; \
+  cb_data.args.hipMemset2DAsync.value = (int)value; \
+  cb_data.args.hipMemset2DAsync.width = (size_t)width; \
+  cb_data.args.hipMemset2DAsync.height = (size_t)height; \
+  cb_data.args.hipMemset2DAsync.stream = (hipStream_t)stream; \
+};
+// hipMemset3D[('hipPitchedPtr', 'pitchedDevPtr'), ('int', 'value'), ('hipExtent', 'extent')]
+// Stores pitchedDevPtr, value and extent in cb_data.args.hipMemset3D. Per the casts, pitchedDevPtr
+// and extent are assigned as hipPitchedPtr / hipExtent values, not as pointers.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemset3D_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemset3D.pitchedDevPtr = (hipPitchedPtr)pitchedDevPtr; \
+  cb_data.args.hipMemset3D.value = (int)value; \
+  cb_data.args.hipMemset3D.extent = (hipExtent)extent; \
+};
+// hipMemset3DAsync[('hipPitchedPtr', 'pitchedDevPtr'), ('int', 'value'), ('hipExtent', 'extent'), ('hipStream_t', 'stream')]
+// Stores pitchedDevPtr, value, extent and stream in cb_data.args.hipMemset3DAsync. Per the casts,
+// pitchedDevPtr and extent are assigned as hipPitchedPtr / hipExtent values, not as pointers.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemset3DAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemset3DAsync.pitchedDevPtr = (hipPitchedPtr)pitchedDevPtr; \
+  cb_data.args.hipMemset3DAsync.value = (int)value; \
+  cb_data.args.hipMemset3DAsync.extent = (hipExtent)extent; \
+  cb_data.args.hipMemset3DAsync.stream = (hipStream_t)stream; \
+};
+// hipMemsetAsync[('void*', 'dst'), ('int', 'value'), ('size_t', 'sizeBytes'), ('hipStream_t', 'stream')]
+// Stores dst, value, sizeBytes and stream in cb_data.args.hipMemsetAsync.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemsetAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemsetAsync.dst = (void*)dst; \
+  cb_data.args.hipMemsetAsync.value = (int)value; \
+  cb_data.args.hipMemsetAsync.sizeBytes = (size_t)sizeBytes; \
+  cb_data.args.hipMemsetAsync.stream = (hipStream_t)stream; \
+};
+// hipMemsetD16[('hipDeviceptr_t', 'dest'), ('unsigned short', 'value'), ('size_t', 'count')]
+// Stores the destination pointer, value and count in cb_data.args.hipMemsetD16.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+// NOTE(review): the dest field is read from dst, which differs from the name in the signature
+// comment - confirm against the expansion site.
+#define INIT_hipMemsetD16_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemsetD16.dest = (hipDeviceptr_t)dst; \
+  cb_data.args.hipMemsetD16.value = (unsigned short)value; \
+  cb_data.args.hipMemsetD16.count = (size_t)count; \
+};
+// hipMemsetD16Async[('hipDeviceptr_t', 'dest'), ('unsigned short', 'value'), ('size_t', 'count'), ('hipStream_t', 'stream')]
+// Stores the destination pointer, value, count and stream in cb_data.args.hipMemsetD16Async.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+// NOTE(review): the dest field is read from dst, which differs from the name in the signature
+// comment - confirm against the expansion site.
+#define INIT_hipMemsetD16Async_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemsetD16Async.dest = (hipDeviceptr_t)dst; \
+  cb_data.args.hipMemsetD16Async.value = (unsigned short)value; \
+  cb_data.args.hipMemsetD16Async.count = (size_t)count; \
+  cb_data.args.hipMemsetD16Async.stream = (hipStream_t)stream; \
+};
+// hipMemsetD2D16[('hipDeviceptr_t', 'dst'), ('size_t', 'dstPitch'), ('unsigned short', 'value'), ('size_t', 'width'), ('size_t', 'height')]
+// Stores dst, dstPitch, value (cast to unsigned short), width and height in cb_data.args.hipMemsetD2D16.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemsetD2D16_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemsetD2D16.dst = (hipDeviceptr_t)dst; \
+  cb_data.args.hipMemsetD2D16.dstPitch = (size_t)dstPitch; \
+  cb_data.args.hipMemsetD2D16.value = (unsigned short)value; \
+  cb_data.args.hipMemsetD2D16.width = (size_t)width; \
+  cb_data.args.hipMemsetD2D16.height = (size_t)height; \
+};
+// hipMemsetD2D16Async[('hipDeviceptr_t', 'dst'), ('size_t', 'dstPitch'), ('unsigned short', 'value'), ('size_t', 'width'), ('size_t', 'height'), ('hipStream_t', 'stream')]
+// Stores dst, dstPitch, value (cast to unsigned short), width, height and stream in
+// cb_data.args.hipMemsetD2D16Async.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemsetD2D16Async_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemsetD2D16Async.dst = (hipDeviceptr_t)dst; \
+  cb_data.args.hipMemsetD2D16Async.dstPitch = (size_t)dstPitch; \
+  cb_data.args.hipMemsetD2D16Async.value = (unsigned short)value; \
+  cb_data.args.hipMemsetD2D16Async.width = (size_t)width; \
+  cb_data.args.hipMemsetD2D16Async.height = (size_t)height; \
+  cb_data.args.hipMemsetD2D16Async.stream = (hipStream_t)stream; \
+};
+// hipMemsetD2D32[('hipDeviceptr_t', 'dst'), ('size_t', 'dstPitch'), ('unsigned int', 'value'), ('size_t', 'width'), ('size_t', 'height')]
+// Stores dst, dstPitch, value (cast to unsigned int), width and height in cb_data.args.hipMemsetD2D32.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemsetD2D32_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemsetD2D32.dst = (hipDeviceptr_t)dst; \
+  cb_data.args.hipMemsetD2D32.dstPitch = (size_t)dstPitch; \
+  cb_data.args.hipMemsetD2D32.value = (unsigned int)value; \
+  cb_data.args.hipMemsetD2D32.width = (size_t)width; \
+  cb_data.args.hipMemsetD2D32.height = (size_t)height; \
+};
+// hipMemsetD2D32Async[('hipDeviceptr_t', 'dst'), ('size_t', 'dstPitch'), ('unsigned int', 'value'), ('size_t', 'width'), ('size_t', 'height'), ('hipStream_t', 'stream')]
+// Stores dst, dstPitch, value (cast to unsigned int), width, height and stream in
+// cb_data.args.hipMemsetD2D32Async.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemsetD2D32Async_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemsetD2D32Async.dst = (hipDeviceptr_t)dst; \
+  cb_data.args.hipMemsetD2D32Async.dstPitch = (size_t)dstPitch; \
+  cb_data.args.hipMemsetD2D32Async.value = (unsigned int)value; \
+  cb_data.args.hipMemsetD2D32Async.width = (size_t)width; \
+  cb_data.args.hipMemsetD2D32Async.height = (size_t)height; \
+  cb_data.args.hipMemsetD2D32Async.stream = (hipStream_t)stream; \
+};
+// hipMemsetD2D8[('hipDeviceptr_t', 'dst'), ('size_t', 'dstPitch'), ('unsigned char', 'value'), ('size_t', 'width'), ('size_t', 'height')]
+// Stores dst, dstPitch, value (cast to unsigned char), width and height in cb_data.args.hipMemsetD2D8.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemsetD2D8_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemsetD2D8.dst = (hipDeviceptr_t)dst; \
+  cb_data.args.hipMemsetD2D8.dstPitch = (size_t)dstPitch; \
+  cb_data.args.hipMemsetD2D8.value = (unsigned char)value; \
+  cb_data.args.hipMemsetD2D8.width = (size_t)width; \
+  cb_data.args.hipMemsetD2D8.height = (size_t)height; \
+};
+// hipMemsetD2D8Async[('hipDeviceptr_t', 'dst'), ('size_t', 'dstPitch'), ('unsigned char', 'value'), ('size_t', 'width'), ('size_t', 'height'), ('hipStream_t', 'stream')]
+// Stores dst, dstPitch, value (cast to unsigned char), width, height and stream in
+// cb_data.args.hipMemsetD2D8Async.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemsetD2D8Async_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemsetD2D8Async.dst = (hipDeviceptr_t)dst; \
+  cb_data.args.hipMemsetD2D8Async.dstPitch = (size_t)dstPitch; \
+  cb_data.args.hipMemsetD2D8Async.value = (unsigned char)value; \
+  cb_data.args.hipMemsetD2D8Async.width = (size_t)width; \
+  cb_data.args.hipMemsetD2D8Async.height = (size_t)height; \
+  cb_data.args.hipMemsetD2D8Async.stream = (hipStream_t)stream; \
+};
+// hipMemsetD32[('hipDeviceptr_t', 'dest'), ('int', 'value'), ('size_t', 'count')]
+// Stores the destination pointer, value and count in cb_data.args.hipMemsetD32.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+// NOTE(review): the dest field is read from dst, which differs from the name in the signature
+// comment - confirm against the expansion site.
+#define INIT_hipMemsetD32_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemsetD32.dest = (hipDeviceptr_t)dst; \
+  cb_data.args.hipMemsetD32.value = (int)value; \
+  cb_data.args.hipMemsetD32.count = (size_t)count; \
+};
+// hipMemsetD32Async[('hipDeviceptr_t', 'dst'), ('int', 'value'), ('size_t', 'count'), ('hipStream_t', 'stream')]
+// Stores dst, value, count and stream in cb_data.args.hipMemsetD32Async.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipMemsetD32Async_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemsetD32Async.dst = (hipDeviceptr_t)dst; \
+  cb_data.args.hipMemsetD32Async.value = (int)value; \
+  cb_data.args.hipMemsetD32Async.count = (size_t)count; \
+  cb_data.args.hipMemsetD32Async.stream = (hipStream_t)stream; \
+};
+// hipMemsetD8[('hipDeviceptr_t', 'dest'), ('unsigned char', 'value'), ('size_t', 'count')]
+// Stores the destination pointer, value and count in cb_data.args.hipMemsetD8.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+// NOTE(review): the dest field is read from dst, which differs from the name in the signature
+// comment - confirm against the expansion site.
+#define INIT_hipMemsetD8_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemsetD8.dest = (hipDeviceptr_t)dst; \
+  cb_data.args.hipMemsetD8.value = (unsigned char)value; \
+  cb_data.args.hipMemsetD8.count = (size_t)count; \
+};
+// hipMemsetD8Async[('hipDeviceptr_t', 'dest'), ('unsigned char', 'value'), ('size_t', 'count'), ('hipStream_t', 'stream')]
+// Stores the destination pointer, value, count and stream in cb_data.args.hipMemsetD8Async.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+// NOTE(review): the dest field is read from dst, which differs from the name in the signature
+// comment - confirm against the expansion site.
+#define INIT_hipMemsetD8Async_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMemsetD8Async.dest = (hipDeviceptr_t)dst; \
+  cb_data.args.hipMemsetD8Async.value = (unsigned char)value; \
+  cb_data.args.hipMemsetD8Async.count = (size_t)count; \
+  cb_data.args.hipMemsetD8Async.stream = (hipStream_t)stream; \
+};
+// hipMipmappedArrayCreate[('hipMipmappedArray_t*', 'pHandle'), ('HIP_ARRAY3D_DESCRIPTOR*', 'pMipmappedArrayDesc'), ('unsigned int', 'numMipmapLevels')]
+// Stores the handle pointer, descriptor pointer and level count in cb_data.args.hipMipmappedArrayCreate.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+// NOTE(review): every field is read from a snake_case name (mipmapped_array_pptr,
+// mipmapped_array_desc_ptr, num_mipmap_levels) that differs from the signature comment - confirm
+// against the expansion site.
+#define INIT_hipMipmappedArrayCreate_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMipmappedArrayCreate.pHandle = (hipMipmappedArray_t*)mipmapped_array_pptr; \
+  cb_data.args.hipMipmappedArrayCreate.pMipmappedArrayDesc = (HIP_ARRAY3D_DESCRIPTOR*)mipmapped_array_desc_ptr; \
+  cb_data.args.hipMipmappedArrayCreate.numMipmapLevels = (unsigned int)num_mipmap_levels; \
+};
+// hipMipmappedArrayDestroy[('hipMipmappedArray_t', 'hMipmappedArray')]
+// Stores the mipmapped-array handle in cb_data.args.hipMipmappedArrayDestroy.
+// The right-hand name is not a macro parameter; it is resolved where the macro is expanded.
+// NOTE(review): the hMipmappedArray field is read from mipmapped_array_ptr, which differs from
+// the name in the signature comment - confirm against the expansion site.
+#define INIT_hipMipmappedArrayDestroy_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMipmappedArrayDestroy.hMipmappedArray = (hipMipmappedArray_t)mipmapped_array_ptr; \
+};
+// hipMipmappedArrayGetLevel[('hipArray_t*', 'pLevelArray'), ('hipMipmappedArray_t', 'hMipMappedArray'), ('unsigned int', 'level')]
+// Stores the level-array pointer, mipmapped-array handle and level in
+// cb_data.args.hipMipmappedArrayGetLevel.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+// NOTE(review): every field is read from a snake_case name (level_array_pptr, mipmapped_array_ptr,
+// mip_level) that differs from the signature comment - confirm against the expansion site.
+#define INIT_hipMipmappedArrayGetLevel_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipMipmappedArrayGetLevel.pLevelArray = (hipArray_t*)level_array_pptr; \
+  cb_data.args.hipMipmappedArrayGetLevel.hMipMappedArray = (hipMipmappedArray_t)mipmapped_array_ptr; \
+  cb_data.args.hipMipmappedArrayGetLevel.level = (unsigned int)mip_level; \
+};
+// hipModuleGetFunction[('hipFunction_t*', 'function'), ('hipModule_t', 'module'), ('const char*', 'kname')]
+// Stores the function pointer and module handle in cb_data.args.hipModuleGetFunction, and sets
+// kname to a strdup() copy of name (or NULL when name is NULL).
+// The right-hand names (hfunc, hmod, name) are not macro parameters; they are resolved where the
+// macro is expanded, and they differ from the names in the signature comment.
+// NOTE(review): the strdup() copy is heap-allocated and no matching free is visible in this macro -
+// confirm which code releases kname.
+#define INIT_hipModuleGetFunction_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipModuleGetFunction.function = (hipFunction_t*)hfunc; \
+  cb_data.args.hipModuleGetFunction.module = (hipModule_t)hmod; \
+  cb_data.args.hipModuleGetFunction.kname = (name) ? strdup(name) : NULL; \
+};
+// hipModuleGetFunctionCount[('unsigned int*', 'count'), ('hipModule_t', 'mod')]
+// Stores count and mod in cb_data.args.hipModuleGetFunctionCount.
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+#define INIT_hipModuleGetFunctionCount_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipModuleGetFunctionCount.count = (unsigned int*)count; \
+  cb_data.args.hipModuleGetFunctionCount.mod = (hipModule_t)mod; \
+};
+// hipModuleGetGlobal[('hipDeviceptr_t*', 'dptr'), ('size_t*', 'bytes'), ('hipModule_t', 'hmod'), ('const char*', 'name')]
+// Stores dptr, bytes and hmod in cb_data.args.hipModuleGetGlobal, and sets name to a strdup() copy
+// of name (or NULL when name is NULL).
+// The right-hand names are not macro parameters; they are resolved where the macro is expanded.
+// NOTE(review): the strdup() copy is heap-allocated and no matching free is visible in this macro -
+// confirm which code releases name.
+#define INIT_hipModuleGetGlobal_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipModuleGetGlobal.dptr = (hipDeviceptr_t*)dptr; \
+  cb_data.args.hipModuleGetGlobal.bytes = (size_t*)bytes; \
+  cb_data.args.hipModuleGetGlobal.hmod = (hipModule_t)hmod; \
+  cb_data.args.hipModuleGetGlobal.name = (name) ? strdup(name) : NULL; \
+};
+// hipModuleGetTexRef[('textureReference**', 'texRef'), ('hipModule_t', 'hmod'), ('const char*', 'name')] -- each field is assigned from the like-named in-scope identifier; 'name' holds a strdup() copy (NULL when 'name' is NULL) and the release of that copy is not visible here.
+#define INIT_hipModuleGetTexRef_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipModuleGetTexRef.texRef = (textureReference**)texRef; \
+  cb_data.args.hipModuleGetTexRef.hmod = (hipModule_t)hmod; \
+  cb_data.args.hipModuleGetTexRef.name = (name) ? strdup(name) : NULL; \
+};
+// hipModuleLaunchCooperativeKernel[('hipFunction_t', 'f'), ('unsigned int', 'gridDimX'), ('unsigned int', 'gridDimY'), ('unsigned int', 'gridDimZ'), ('unsigned int', 'blockDimX'), ('unsigned int', 'blockDimY'), ('unsigned int', 'blockDimZ'), ('unsigned int', 'sharedMemBytes'), ('hipStream_t', 'stream'), ('void**', 'kernelParams')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipModuleLaunchCooperativeKernel_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipModuleLaunchCooperativeKernel.f = (hipFunction_t)f; \
+  cb_data.args.hipModuleLaunchCooperativeKernel.gridDimX = (unsigned int)gridDimX; \
+  cb_data.args.hipModuleLaunchCooperativeKernel.gridDimY = (unsigned int)gridDimY; \
+  cb_data.args.hipModuleLaunchCooperativeKernel.gridDimZ = (unsigned int)gridDimZ; \
+  cb_data.args.hipModuleLaunchCooperativeKernel.blockDimX = (unsigned int)blockDimX; \
+  cb_data.args.hipModuleLaunchCooperativeKernel.blockDimY = (unsigned int)blockDimY; \
+  cb_data.args.hipModuleLaunchCooperativeKernel.blockDimZ = (unsigned int)blockDimZ; \
+  cb_data.args.hipModuleLaunchCooperativeKernel.sharedMemBytes = (unsigned int)sharedMemBytes; \
+  cb_data.args.hipModuleLaunchCooperativeKernel.stream = (hipStream_t)stream; \
+  cb_data.args.hipModuleLaunchCooperativeKernel.kernelParams = (void**)kernelParams; \
+};
+// hipModuleLaunchCooperativeKernelMultiDevice[('hipFunctionLaunchParams*', 'launchParamsList'), ('unsigned int', 'numDevices'), ('unsigned int', 'flags')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipModuleLaunchCooperativeKernelMultiDevice_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipModuleLaunchCooperativeKernelMultiDevice.launchParamsList = (hipFunctionLaunchParams*)launchParamsList; \
+  cb_data.args.hipModuleLaunchCooperativeKernelMultiDevice.numDevices = (unsigned int)numDevices; \
+  cb_data.args.hipModuleLaunchCooperativeKernelMultiDevice.flags = (unsigned int)flags; \
+};
+// hipModuleLaunchKernel[('hipFunction_t', 'f'), ('unsigned int', 'gridDimX'), ('unsigned int', 'gridDimY'), ('unsigned int', 'gridDimZ'), ('unsigned int', 'blockDimX'), ('unsigned int', 'blockDimY'), ('unsigned int', 'blockDimZ'), ('unsigned int', 'sharedMemBytes'), ('hipStream_t', 'stream'), ('void**', 'kernelParams'), ('void**', 'extra')] -- fields are assigned from like-named in-scope identifiers; NOTE: 'stream' is read from 'hStream', not 'stream'.
+#define INIT_hipModuleLaunchKernel_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipModuleLaunchKernel.f = (hipFunction_t)f; \
+  cb_data.args.hipModuleLaunchKernel.gridDimX = (unsigned int)gridDimX; \
+  cb_data.args.hipModuleLaunchKernel.gridDimY = (unsigned int)gridDimY; \
+  cb_data.args.hipModuleLaunchKernel.gridDimZ = (unsigned int)gridDimZ; \
+  cb_data.args.hipModuleLaunchKernel.blockDimX = (unsigned int)blockDimX; \
+  cb_data.args.hipModuleLaunchKernel.blockDimY = (unsigned int)blockDimY; \
+  cb_data.args.hipModuleLaunchKernel.blockDimZ = (unsigned int)blockDimZ; \
+  cb_data.args.hipModuleLaunchKernel.sharedMemBytes = (unsigned int)sharedMemBytes; \
+  cb_data.args.hipModuleLaunchKernel.stream = (hipStream_t)hStream; \
+  cb_data.args.hipModuleLaunchKernel.kernelParams = (void**)kernelParams; \
+  cb_data.args.hipModuleLaunchKernel.extra = (void**)extra; \
+};
+// hipModuleLoad[('hipModule_t*', 'module'), ('const char*', 'fname')] -- each field is assigned from the like-named in-scope identifier; 'fname' holds a strdup() copy (NULL when 'fname' is NULL) and the release of that copy is not visible here.
+#define INIT_hipModuleLoad_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipModuleLoad.module = (hipModule_t*)module; \
+  cb_data.args.hipModuleLoad.fname = (fname) ? strdup(fname) : NULL; \
+};
+// hipModuleLoadData[('hipModule_t*', 'module'), ('const void*', 'image')] -- each field is assigned from the like-named identifier in scope where the macro is expanded; only the 'image' pointer is stored, not the data it points to.
+#define INIT_hipModuleLoadData_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipModuleLoadData.module = (hipModule_t*)module; \
+  cb_data.args.hipModuleLoadData.image = (const void*)image; \
+};
+// hipModuleLoadDataEx[('hipModule_t*', 'module'), ('const void*', 'image'), ('unsigned int', 'numOptions'), ('hipJitOption*', 'options'), ('void**', 'optionsValues')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipModuleLoadDataEx_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipModuleLoadDataEx.module = (hipModule_t*)module; \
+  cb_data.args.hipModuleLoadDataEx.image = (const void*)image; \
+  cb_data.args.hipModuleLoadDataEx.numOptions = (unsigned int)numOptions; \
+  cb_data.args.hipModuleLoadDataEx.options = (hipJitOption*)options; \
+  cb_data.args.hipModuleLoadDataEx.optionsValues = (void**)optionsValues; \
+};
+// hipModuleLoadFatBinary[('hipModule_t*', 'module'), ('const void*', 'fatbin')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipModuleLoadFatBinary_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipModuleLoadFatBinary.module = (hipModule_t*)module; \
+  cb_data.args.hipModuleLoadFatBinary.fatbin = (const void*)fatbin; \
+};
+// hipModuleOccupancyMaxActiveBlocksPerMultiprocessor[('int*', 'numBlocks'), ('hipFunction_t', 'f'), ('int', 'blockSize'), ('size_t', 'dynSharedMemPerBlk')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks = (int*)numBlocks; \
+  cb_data.args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.f = (hipFunction_t)f; \
+  cb_data.args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.blockSize = (int)blockSize; \
+  cb_data.args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.dynSharedMemPerBlk = (size_t)dynSharedMemPerBlk; \
+};
+// hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags[('int*', 'numBlocks'), ('hipFunction_t', 'f'), ('int', 'blockSize'), ('size_t', 'dynSharedMemPerBlk'), ('unsigned int', 'flags')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks = (int*)numBlocks; \
+  cb_data.args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.f = (hipFunction_t)f; \
+  cb_data.args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.blockSize = (int)blockSize; \
+  cb_data.args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.dynSharedMemPerBlk = (size_t)dynSharedMemPerBlk; \
+  cb_data.args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.flags = (unsigned int)flags; \
+};
+// hipModuleOccupancyMaxPotentialBlockSize[('int*', 'gridSize'), ('int*', 'blockSize'), ('hipFunction_t', 'f'), ('size_t', 'dynSharedMemPerBlk'), ('int', 'blockSizeLimit')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipModuleOccupancyMaxPotentialBlockSize_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipModuleOccupancyMaxPotentialBlockSize.gridSize = (int*)gridSize; \
+  cb_data.args.hipModuleOccupancyMaxPotentialBlockSize.blockSize = (int*)blockSize; \
+  cb_data.args.hipModuleOccupancyMaxPotentialBlockSize.f = (hipFunction_t)f; \
+  cb_data.args.hipModuleOccupancyMaxPotentialBlockSize.dynSharedMemPerBlk = (size_t)dynSharedMemPerBlk; \
+  cb_data.args.hipModuleOccupancyMaxPotentialBlockSize.blockSizeLimit = (int)blockSizeLimit; \
+};
+// hipModuleOccupancyMaxPotentialBlockSizeWithFlags[('int*', 'gridSize'), ('int*', 'blockSize'), ('hipFunction_t', 'f'), ('size_t', 'dynSharedMemPerBlk'), ('int', 'blockSizeLimit'), ('unsigned int', 'flags')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.gridSize = (int*)gridSize; \
+  cb_data.args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.blockSize = (int*)blockSize; \
+  cb_data.args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.f = (hipFunction_t)f; \
+  cb_data.args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.dynSharedMemPerBlk = (size_t)dynSharedMemPerBlk; \
+  cb_data.args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.blockSizeLimit = (int)blockSizeLimit; \
+  cb_data.args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.flags = (unsigned int)flags; \
+};
+// hipModuleUnload[('hipModule_t', 'module')] -- NOTE: 'module' is read from the identifier 'hmod', which must be in scope where the macro is expanded.
+#define INIT_hipModuleUnload_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipModuleUnload.module = (hipModule_t)hmod; \
+};
+// hipOccupancyMaxActiveBlocksPerMultiprocessor[('int*', 'numBlocks'), ('const void*', 'f'), ('int', 'blockSize'), ('size_t', 'dynamicSMemSize')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipOccupancyMaxActiveBlocksPerMultiprocessor_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks = (int*)numBlocks; \
+  cb_data.args.hipOccupancyMaxActiveBlocksPerMultiprocessor.f = (const void*)f; \
+  cb_data.args.hipOccupancyMaxActiveBlocksPerMultiprocessor.blockSize = (int)blockSize; \
+  cb_data.args.hipOccupancyMaxActiveBlocksPerMultiprocessor.dynamicSMemSize = (size_t)dynamicSMemSize; \
+};
+// hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags[('int*', 'numBlocks'), ('const void*', 'f'), ('int', 'blockSize'), ('size_t', 'dynamicSMemSize'), ('unsigned int', 'flags')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks = (int*)numBlocks; \
+  cb_data.args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.f = (const void*)f; \
+  cb_data.args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.blockSize = (int)blockSize; \
+  cb_data.args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.dynamicSMemSize = (size_t)dynamicSMemSize; \
+  cb_data.args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.flags = (unsigned int)flags; \
+};
+// hipOccupancyMaxPotentialBlockSize[('int*', 'gridSize'), ('int*', 'blockSize'), ('const void*', 'f'), ('size_t', 'dynSharedMemPerBlk'), ('int', 'blockSizeLimit')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipOccupancyMaxPotentialBlockSize_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipOccupancyMaxPotentialBlockSize.gridSize = (int*)gridSize; \
+  cb_data.args.hipOccupancyMaxPotentialBlockSize.blockSize = (int*)blockSize; \
+  cb_data.args.hipOccupancyMaxPotentialBlockSize.f = (const void*)f; \
+  cb_data.args.hipOccupancyMaxPotentialBlockSize.dynSharedMemPerBlk = (size_t)dynSharedMemPerBlk; \
+  cb_data.args.hipOccupancyMaxPotentialBlockSize.blockSizeLimit = (int)blockSizeLimit; \
+};
+// hipPeekAtLastError[] -- the parameter list is empty, so the macro expands to an empty block and leaves cb_data untouched.
+#define INIT_hipPeekAtLastError_CB_ARGS_DATA(cb_data) { \
+};
+// hipPointerGetAttribute[('void*', 'data'), ('hipPointer_attribute', 'attribute'), ('hipDeviceptr_t', 'ptr')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipPointerGetAttribute_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipPointerGetAttribute.data = (void*)data; \
+  cb_data.args.hipPointerGetAttribute.attribute = (hipPointer_attribute)attribute; \
+  cb_data.args.hipPointerGetAttribute.ptr = (hipDeviceptr_t)ptr; \
+};
+// hipPointerGetAttributes[('hipPointerAttribute_t*', 'attributes'), ('const void*', 'ptr')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipPointerGetAttributes_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipPointerGetAttributes.attributes = (hipPointerAttribute_t*)attributes; \
+  cb_data.args.hipPointerGetAttributes.ptr = (const void*)ptr; \
+};
+// hipPointerSetAttribute[('const void*', 'value'), ('hipPointer_attribute', 'attribute'), ('hipDeviceptr_t', 'ptr')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipPointerSetAttribute_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipPointerSetAttribute.value = (const void*)value; \
+  cb_data.args.hipPointerSetAttribute.attribute = (hipPointer_attribute)attribute; \
+  cb_data.args.hipPointerSetAttribute.ptr = (hipDeviceptr_t)ptr; \
+};
+// hipProfilerStart[] -- the parameter list is empty, so the macro expands to an empty block and leaves cb_data untouched.
+#define INIT_hipProfilerStart_CB_ARGS_DATA(cb_data) { \
+};
+// hipProfilerStop[] -- the parameter list is empty, so the macro expands to an empty block and leaves cb_data untouched.
+#define INIT_hipProfilerStop_CB_ARGS_DATA(cb_data) { \
+};
+// hipRuntimeGetVersion[('int*', 'runtimeVersion')] -- the field is assigned from the like-named identifier in scope where the macro is expanded; only the pointer is stored.
+#define INIT_hipRuntimeGetVersion_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipRuntimeGetVersion.runtimeVersion = (int*)runtimeVersion; \
+};
+// hipSetDevice[('int', 'deviceId')] -- NOTE: 'deviceId' is read from the identifier 'device', which must be in scope where the macro is expanded.
+#define INIT_hipSetDevice_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipSetDevice.deviceId = (int)device; \
+};
+// hipSetDeviceFlags[('unsigned int', 'flags')] -- the field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipSetDeviceFlags_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipSetDeviceFlags.flags = (unsigned int)flags; \
+};
+// hipSetValidDevices[('int*', 'device_arr'), ('int', 'len')] -- each field is assigned from the like-named identifier in scope where the macro is expanded; the array is stored by pointer, not copied.
+#define INIT_hipSetValidDevices_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipSetValidDevices.device_arr = (int*)device_arr; \
+  cb_data.args.hipSetValidDevices.len = (int)len; \
+};
+// hipSetupArgument[('const void*', 'arg'), ('size_t', 'size'), ('size_t', 'offset')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipSetupArgument_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipSetupArgument.arg = (const void*)arg; \
+  cb_data.args.hipSetupArgument.size = (size_t)size; \
+  cb_data.args.hipSetupArgument.offset = (size_t)offset; \
+};
+// hipSignalExternalSemaphoresAsync[('const hipExternalSemaphore_t*', 'extSemArray'), ('const hipExternalSemaphoreSignalParams*', 'paramsArray'), ('unsigned int', 'numExtSems'), ('hipStream_t', 'stream')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipSignalExternalSemaphoresAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipSignalExternalSemaphoresAsync.extSemArray = (const hipExternalSemaphore_t*)extSemArray; \
+  cb_data.args.hipSignalExternalSemaphoresAsync.paramsArray = (const hipExternalSemaphoreSignalParams*)paramsArray; \
+  cb_data.args.hipSignalExternalSemaphoresAsync.numExtSems = (unsigned int)numExtSems; \
+  cb_data.args.hipSignalExternalSemaphoresAsync.stream = (hipStream_t)stream; \
+};
+// hipStreamAddCallback[('hipStream_t', 'stream'), ('hipStreamCallback_t', 'callback'), ('void*', 'userData'), ('unsigned int', 'flags')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamAddCallback_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamAddCallback.stream = (hipStream_t)stream; \
+  cb_data.args.hipStreamAddCallback.callback = (hipStreamCallback_t)callback; \
+  cb_data.args.hipStreamAddCallback.userData = (void*)userData; \
+  cb_data.args.hipStreamAddCallback.flags = (unsigned int)flags; \
+};
+// hipStreamAttachMemAsync[('hipStream_t', 'stream'), ('void*', 'dev_ptr'), ('size_t', 'length'), ('unsigned int', 'flags')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamAttachMemAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamAttachMemAsync.stream = (hipStream_t)stream; \
+  cb_data.args.hipStreamAttachMemAsync.dev_ptr = (void*)dev_ptr; \
+  cb_data.args.hipStreamAttachMemAsync.length = (size_t)length; \
+  cb_data.args.hipStreamAttachMemAsync.flags = (unsigned int)flags; \
+};
+// hipStreamBatchMemOp[('hipStream_t', 'stream'), ('unsigned int', 'count'), ('hipStreamBatchMemOpParams*', 'paramArray'), ('unsigned int', 'flags')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamBatchMemOp_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamBatchMemOp.stream = (hipStream_t)stream; \
+  cb_data.args.hipStreamBatchMemOp.count = (unsigned int)count; \
+  cb_data.args.hipStreamBatchMemOp.paramArray = (hipStreamBatchMemOpParams*)paramArray; \
+  cb_data.args.hipStreamBatchMemOp.flags = (unsigned int)flags; \
+};
+// hipStreamBeginCapture[('hipStream_t', 'stream'), ('hipStreamCaptureMode', 'mode')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamBeginCapture_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamBeginCapture.stream = (hipStream_t)stream; \
+  cb_data.args.hipStreamBeginCapture.mode = (hipStreamCaptureMode)mode; \
+};
+// hipStreamBeginCaptureToGraph[('hipStream_t', 'stream'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'dependencies'), ('const hipGraphEdgeData*', 'dependencyData'), ('size_t', 'numDependencies'), ('hipStreamCaptureMode', 'mode')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamBeginCaptureToGraph_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamBeginCaptureToGraph.stream = (hipStream_t)stream; \
+  cb_data.args.hipStreamBeginCaptureToGraph.graph = (hipGraph_t)graph; \
+  cb_data.args.hipStreamBeginCaptureToGraph.dependencies = (const hipGraphNode_t*)dependencies; \
+  cb_data.args.hipStreamBeginCaptureToGraph.dependencyData = (const hipGraphEdgeData*)dependencyData; \
+  cb_data.args.hipStreamBeginCaptureToGraph.numDependencies = (size_t)numDependencies; \
+  cb_data.args.hipStreamBeginCaptureToGraph.mode = (hipStreamCaptureMode)mode; \
+};
+// hipStreamCreate[('hipStream_t*', 'stream')] -- the field is assigned from the like-named identifier in scope where the macro is expanded; only the pointer is stored.
+#define INIT_hipStreamCreate_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamCreate.stream = (hipStream_t*)stream; \
+};
+// hipStreamCreateWithFlags[('hipStream_t*', 'stream'), ('unsigned int', 'flags')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamCreateWithFlags_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamCreateWithFlags.stream = (hipStream_t*)stream; \
+  cb_data.args.hipStreamCreateWithFlags.flags = (unsigned int)flags; \
+};
+// hipStreamCreateWithPriority[('hipStream_t*', 'stream'), ('unsigned int', 'flags'), ('int', 'priority')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamCreateWithPriority_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamCreateWithPriority.stream = (hipStream_t*)stream; \
+  cb_data.args.hipStreamCreateWithPriority.flags = (unsigned int)flags; \
+  cb_data.args.hipStreamCreateWithPriority.priority = (int)priority; \
+};
+// hipStreamDestroy[('hipStream_t', 'stream')] -- the field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamDestroy_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamDestroy.stream = (hipStream_t)stream; \
+};
+// hipStreamEndCapture[('hipStream_t', 'stream'), ('hipGraph_t*', 'pGraph')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamEndCapture_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamEndCapture.stream = (hipStream_t)stream; \
+  cb_data.args.hipStreamEndCapture.pGraph = (hipGraph_t*)pGraph; \
+};
+// hipStreamGetAttribute[('hipStream_t', 'stream'), ('hipLaunchAttributeID', 'attr'), ('hipLaunchAttributeValue*', 'value_out')] -- NOTE(review): three parameters are listed but the body is empty, so none of them is stored in cb_data; confirm whether this omission is intended.
+#define INIT_hipStreamGetAttribute_CB_ARGS_DATA(cb_data) { \
+};
+// hipStreamGetCaptureInfo[('hipStream_t', 'stream'), ('hipStreamCaptureStatus*', 'pCaptureStatus'), ('unsigned long long*', 'pId')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamGetCaptureInfo_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamGetCaptureInfo.stream = (hipStream_t)stream; \
+  cb_data.args.hipStreamGetCaptureInfo.pCaptureStatus = (hipStreamCaptureStatus*)pCaptureStatus; \
+  cb_data.args.hipStreamGetCaptureInfo.pId = (unsigned long long*)pId; \
+};
+// hipStreamGetCaptureInfo_v2[('hipStream_t', 'stream'), ('hipStreamCaptureStatus*', 'captureStatus_out'), ('unsigned long long*', 'id_out'), ('hipGraph_t*', 'graph_out'), ('const hipGraphNode_t**', 'dependencies_out'), ('size_t*', 'numDependencies_out')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamGetCaptureInfo_v2_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamGetCaptureInfo_v2.stream = (hipStream_t)stream; \
+  cb_data.args.hipStreamGetCaptureInfo_v2.captureStatus_out = (hipStreamCaptureStatus*)captureStatus_out; \
+  cb_data.args.hipStreamGetCaptureInfo_v2.id_out = (unsigned long long*)id_out; \
+  cb_data.args.hipStreamGetCaptureInfo_v2.graph_out = (hipGraph_t*)graph_out; \
+  cb_data.args.hipStreamGetCaptureInfo_v2.dependencies_out = (const hipGraphNode_t**)dependencies_out; \
+  cb_data.args.hipStreamGetCaptureInfo_v2.numDependencies_out = (size_t*)numDependencies_out; \
+};
+// hipStreamGetDevice[('hipStream_t', 'stream'), ('hipDevice_t*', 'device')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamGetDevice_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamGetDevice.stream = (hipStream_t)stream; \
+  cb_data.args.hipStreamGetDevice.device = (hipDevice_t*)device; \
+};
+// hipStreamGetFlags[('hipStream_t', 'stream'), ('unsigned int*', 'flags')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamGetFlags_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamGetFlags.stream = (hipStream_t)stream; \
+  cb_data.args.hipStreamGetFlags.flags = (unsigned int*)flags; \
+};
+// hipStreamGetId[('hipStream_t', 'stream'), ('unsigned long long*', 'streamId')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamGetId_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamGetId.stream = (hipStream_t)stream; \
+  cb_data.args.hipStreamGetId.streamId = (unsigned long long*)streamId; \
+};
+// hipStreamGetPriority[('hipStream_t', 'stream'), ('int*', 'priority')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamGetPriority_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamGetPriority.stream = (hipStream_t)stream; \
+  cb_data.args.hipStreamGetPriority.priority = (int*)priority; \
+};
+// hipStreamIsCapturing[('hipStream_t', 'stream'), ('hipStreamCaptureStatus*', 'pCaptureStatus')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamIsCapturing_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamIsCapturing.stream = (hipStream_t)stream; \
+  cb_data.args.hipStreamIsCapturing.pCaptureStatus = (hipStreamCaptureStatus*)pCaptureStatus; \
+};
+// hipStreamQuery[('hipStream_t', 'stream')] -- the field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamQuery_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamQuery.stream = (hipStream_t)stream; \
+};
+// hipStreamSetAttribute[('hipStream_t', 'stream'), ('hipLaunchAttributeID', 'attr'), ('const hipLaunchAttributeValue*', 'value')] -- NOTE(review): three parameters are listed but the body is empty, so none of them is stored in cb_data; confirm whether this omission is intended.
+#define INIT_hipStreamSetAttribute_CB_ARGS_DATA(cb_data) { \
+};
+// hipStreamSynchronize[('hipStream_t', 'stream')] -- the field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamSynchronize_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamSynchronize.stream = (hipStream_t)stream; \
+};
+// hipStreamUpdateCaptureDependencies[('hipStream_t', 'stream'), ('hipGraphNode_t*', 'dependencies'), ('size_t', 'numDependencies'), ('unsigned int', 'flags')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamUpdateCaptureDependencies_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamUpdateCaptureDependencies.stream = (hipStream_t)stream; \
+  cb_data.args.hipStreamUpdateCaptureDependencies.dependencies = (hipGraphNode_t*)dependencies; \
+  cb_data.args.hipStreamUpdateCaptureDependencies.numDependencies = (size_t)numDependencies; \
+  cb_data.args.hipStreamUpdateCaptureDependencies.flags = (unsigned int)flags; \
+};
+// hipStreamWaitEvent[('hipStream_t', 'stream'), ('hipEvent_t', 'event'), ('unsigned int', 'flags')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamWaitEvent_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamWaitEvent.stream = (hipStream_t)stream; \
+  cb_data.args.hipStreamWaitEvent.event = (hipEvent_t)event; \
+  cb_data.args.hipStreamWaitEvent.flags = (unsigned int)flags; \
+};
+// hipStreamWaitValue32[('hipStream_t', 'stream'), ('void*', 'ptr'), ('unsigned int', 'value'), ('unsigned int', 'flags'), ('unsigned int', 'mask')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamWaitValue32_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamWaitValue32.stream = (hipStream_t)stream; \
+  cb_data.args.hipStreamWaitValue32.ptr = (void*)ptr; \
+  cb_data.args.hipStreamWaitValue32.value = (unsigned int)value; \
+  cb_data.args.hipStreamWaitValue32.flags = (unsigned int)flags; \
+  cb_data.args.hipStreamWaitValue32.mask = (unsigned int)mask; \
+};
+// hipStreamWaitValue64[('hipStream_t', 'stream'), ('void*', 'ptr'), ('uint64_t', 'value'), ('unsigned int', 'flags'), ('uint64_t', 'mask')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamWaitValue64_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamWaitValue64.stream = (hipStream_t)stream; \
+  cb_data.args.hipStreamWaitValue64.ptr = (void*)ptr; \
+  cb_data.args.hipStreamWaitValue64.value = (uint64_t)value; \
+  cb_data.args.hipStreamWaitValue64.flags = (unsigned int)flags; \
+  cb_data.args.hipStreamWaitValue64.mask = (uint64_t)mask; \
+};
+// hipStreamWriteValue32[('hipStream_t', 'stream'), ('void*', 'ptr'), ('unsigned int', 'value'), ('unsigned int', 'flags')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamWriteValue32_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamWriteValue32.stream = (hipStream_t)stream; \
+  cb_data.args.hipStreamWriteValue32.ptr = (void*)ptr; \
+  cb_data.args.hipStreamWriteValue32.value = (unsigned int)value; \
+  cb_data.args.hipStreamWriteValue32.flags = (unsigned int)flags; \
+};
+// hipStreamWriteValue64[('hipStream_t', 'stream'), ('void*', 'ptr'), ('uint64_t', 'value'), ('unsigned int', 'flags')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipStreamWriteValue64_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamWriteValue64.stream = (hipStream_t)stream; \
+  cb_data.args.hipStreamWriteValue64.ptr = (void*)ptr; \
+  cb_data.args.hipStreamWriteValue64.value = (uint64_t)value; \
+  cb_data.args.hipStreamWriteValue64.flags = (unsigned int)flags; \
+};
+// hipTexRefGetAddress[('hipDeviceptr_t*', 'dev_ptr'), ('const textureReference*', 'texRef')] -- NOTE: 'dev_ptr' is read from the in-scope identifier 'dptr'; 'texRef' is read from the like-named identifier.
+#define INIT_hipTexRefGetAddress_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipTexRefGetAddress.dev_ptr = (hipDeviceptr_t*)dptr; \
+  cb_data.args.hipTexRefGetAddress.texRef = (const textureReference*)texRef; \
+};
+// hipTexRefGetArray[('hipArray_t*', 'pArray'), ('const textureReference*', 'texRef')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipTexRefGetArray_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipTexRefGetArray.pArray = (hipArray_t*)pArray; \
+  cb_data.args.hipTexRefGetArray.texRef = (const textureReference*)texRef; \
+};
+// hipTexRefGetBorderColor[('float*', 'pBorderColor'), ('const textureReference*', 'texRef')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipTexRefGetBorderColor_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipTexRefGetBorderColor.pBorderColor = (float*)pBorderColor; \
+  cb_data.args.hipTexRefGetBorderColor.texRef = (const textureReference*)texRef; \
+};
+// hipTexRefGetFlags[('unsigned int*', 'pFlags'), ('const textureReference*', 'texRef')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipTexRefGetFlags_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipTexRefGetFlags.pFlags = (unsigned int*)pFlags; \
+  cb_data.args.hipTexRefGetFlags.texRef = (const textureReference*)texRef; \
+};
+// hipTexRefGetFormat[('hipArray_Format*', 'pFormat'), ('int*', 'pNumChannels'), ('const textureReference*', 'texRef')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipTexRefGetFormat_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipTexRefGetFormat.pFormat = (hipArray_Format*)pFormat; \
+  cb_data.args.hipTexRefGetFormat.pNumChannels = (int*)pNumChannels; \
+  cb_data.args.hipTexRefGetFormat.texRef = (const textureReference*)texRef; \
+};
+// hipTexRefGetMaxAnisotropy[('int*', 'pmaxAnsio'), ('const textureReference*', 'texRef')] -- each field is assigned from the like-named identifier in scope where the macro is expanded ('pmaxAnsio' is the spelling used by both the field and the identifier).
+#define INIT_hipTexRefGetMaxAnisotropy_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipTexRefGetMaxAnisotropy.pmaxAnsio = (int*)pmaxAnsio; \
+  cb_data.args.hipTexRefGetMaxAnisotropy.texRef = (const textureReference*)texRef; \
+};
+// hipTexRefGetMipMappedArray[('hipMipmappedArray_t*', 'pArray'), ('const textureReference*', 'texRef')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipTexRefGetMipMappedArray_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipTexRefGetMipMappedArray.pArray = (hipMipmappedArray_t*)pArray; \
+  cb_data.args.hipTexRefGetMipMappedArray.texRef = (const textureReference*)texRef; \
+};
+// hipTexRefGetMipmapLevelBias[('float*', 'pbias'), ('const textureReference*', 'texRef')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipTexRefGetMipmapLevelBias_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipTexRefGetMipmapLevelBias.pbias = (float*)pbias; \
+  cb_data.args.hipTexRefGetMipmapLevelBias.texRef = (const textureReference*)texRef; \
+};
+// hipTexRefGetMipmapLevelClamp[('float*', 'pminMipmapLevelClamp'), ('float*', 'pmaxMipmapLevelClamp'), ('const textureReference*', 'texRef')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipTexRefGetMipmapLevelClamp_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipTexRefGetMipmapLevelClamp.pminMipmapLevelClamp = (float*)pminMipmapLevelClamp; \
+  cb_data.args.hipTexRefGetMipmapLevelClamp.pmaxMipmapLevelClamp = (float*)pmaxMipmapLevelClamp; \
+  cb_data.args.hipTexRefGetMipmapLevelClamp.texRef = (const textureReference*)texRef; \
+};
+// hipTexRefSetAddress[('size_t*', 'ByteOffset'), ('textureReference*', 'texRef'), ('hipDeviceptr_t', 'dptr'), ('size_t', 'bytes')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipTexRefSetAddress_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipTexRefSetAddress.ByteOffset = (size_t*)ByteOffset; \
+  cb_data.args.hipTexRefSetAddress.texRef = (textureReference*)texRef; \
+  cb_data.args.hipTexRefSetAddress.dptr = (hipDeviceptr_t)dptr; \
+  cb_data.args.hipTexRefSetAddress.bytes = (size_t)bytes; \
+};
+// hipTexRefSetAddress2D[('textureReference*', 'texRef'), ('const HIP_ARRAY_DESCRIPTOR*', 'desc'), ('hipDeviceptr_t', 'dptr'), ('size_t', 'Pitch')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipTexRefSetAddress2D_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipTexRefSetAddress2D.texRef = (textureReference*)texRef; \
+  cb_data.args.hipTexRefSetAddress2D.desc = (const HIP_ARRAY_DESCRIPTOR*)desc; \
+  cb_data.args.hipTexRefSetAddress2D.dptr = (hipDeviceptr_t)dptr; \
+  cb_data.args.hipTexRefSetAddress2D.Pitch = (size_t)Pitch; \
+};
+// hipTexRefSetArray[('textureReference*', 'tex'), ('hipArray_const_t', 'array'), ('unsigned int', 'flags')] -- NOTE: 'tex' is read from the in-scope identifier 'texRef'; the other fields are read from like-named identifiers.
+#define INIT_hipTexRefSetArray_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipTexRefSetArray.tex = (textureReference*)texRef; \
+  cb_data.args.hipTexRefSetArray.array = (hipArray_const_t)array; \
+  cb_data.args.hipTexRefSetArray.flags = (unsigned int)flags; \
+};
+// hipTexRefSetBorderColor[('textureReference*', 'texRef'), ('float*', 'pBorderColor')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipTexRefSetBorderColor_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipTexRefSetBorderColor.texRef = (textureReference*)texRef; \
+  cb_data.args.hipTexRefSetBorderColor.pBorderColor = (float*)pBorderColor; \
+};
+// hipTexRefSetFlags[('textureReference*', 'texRef'), ('unsigned int', 'Flags')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipTexRefSetFlags_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipTexRefSetFlags.texRef = (textureReference*)texRef; \
+  cb_data.args.hipTexRefSetFlags.Flags = (unsigned int)Flags; \
+};
+// hipTexRefSetFormat[('textureReference*', 'texRef'), ('hipArray_Format', 'fmt'), ('int', 'NumPackedComponents')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipTexRefSetFormat_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipTexRefSetFormat.texRef = (textureReference*)texRef; \
+  cb_data.args.hipTexRefSetFormat.fmt = (hipArray_Format)fmt; \
+  cb_data.args.hipTexRefSetFormat.NumPackedComponents = (int)NumPackedComponents; \
+};
+// hipTexRefSetMaxAnisotropy[('textureReference*', 'texRef'), ('unsigned int', 'maxAniso')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipTexRefSetMaxAnisotropy_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipTexRefSetMaxAnisotropy.texRef = (textureReference*)texRef; \
+  cb_data.args.hipTexRefSetMaxAnisotropy.maxAniso = (unsigned int)maxAniso; \
+};
+// hipTexRefSetMipmapLevelBias[('textureReference*', 'texRef'), ('float', 'bias')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipTexRefSetMipmapLevelBias_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipTexRefSetMipmapLevelBias.texRef = (textureReference*)texRef; \
+  cb_data.args.hipTexRefSetMipmapLevelBias.bias = (float)bias; \
+};
+// hipTexRefSetMipmapLevelClamp[('textureReference*', 'texRef'), ('float', 'minMipMapLevelClamp'), ('float', 'maxMipMapLevelClamp')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipTexRefSetMipmapLevelClamp_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipTexRefSetMipmapLevelClamp.texRef = (textureReference*)texRef; \
+  cb_data.args.hipTexRefSetMipmapLevelClamp.minMipMapLevelClamp = (float)minMipMapLevelClamp; \
+  cb_data.args.hipTexRefSetMipmapLevelClamp.maxMipMapLevelClamp = (float)maxMipMapLevelClamp; \
+};
+// hipTexRefSetMipmappedArray[('textureReference*', 'texRef'), ('hipMipmappedArray*', 'mipmappedArray'), ('unsigned int', 'Flags')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipTexRefSetMipmappedArray_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipTexRefSetMipmappedArray.texRef = (textureReference*)texRef; \
+  cb_data.args.hipTexRefSetMipmappedArray.mipmappedArray = (hipMipmappedArray*)mipmappedArray; \
+  cb_data.args.hipTexRefSetMipmappedArray.Flags = (unsigned int)Flags; \
+};
+// hipThreadExchangeStreamCaptureMode[('hipStreamCaptureMode*', 'mode')] -- the field is assigned from the like-named identifier in scope where the macro is expanded; only the pointer is stored.
+#define INIT_hipThreadExchangeStreamCaptureMode_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipThreadExchangeStreamCaptureMode.mode = (hipStreamCaptureMode*)mode; \
+};
+// hipUserObjectCreate[('hipUserObject_t*', 'object_out'), ('void*', 'ptr'), ('hipHostFn_t', 'destroy'), ('unsigned int', 'initialRefcount'), ('unsigned int', 'flags')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipUserObjectCreate_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipUserObjectCreate.object_out = (hipUserObject_t*)object_out; \
+  cb_data.args.hipUserObjectCreate.ptr = (void*)ptr; \
+  cb_data.args.hipUserObjectCreate.destroy = (hipHostFn_t)destroy; \
+  cb_data.args.hipUserObjectCreate.initialRefcount = (unsigned int)initialRefcount; \
+  cb_data.args.hipUserObjectCreate.flags = (unsigned int)flags; \
+};
+// hipUserObjectRelease[('hipUserObject_t', 'object'), ('unsigned int', 'count')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipUserObjectRelease_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipUserObjectRelease.object = (hipUserObject_t)object; \
+  cb_data.args.hipUserObjectRelease.count = (unsigned int)count; \
+};
+// hipUserObjectRetain[('hipUserObject_t', 'object'), ('unsigned int', 'count')] -- each field is assigned from the like-named identifier in scope where the macro is expanded.
+#define INIT_hipUserObjectRetain_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipUserObjectRetain.object = (hipUserObject_t)object; \
+  cb_data.args.hipUserObjectRetain.count = (unsigned int)count; \
+};
+// hipWaitExternalSemaphoresAsync[('const hipExternalSemaphore_t*', 'extSemArray'), ('const hipExternalSemaphoreWaitParams*', 'paramsArray'), ('unsigned int', 'numExtSems'), ('hipStream_t', 'stream')] -- stores each listed argument, referenced by name at the expansion site and cast to the listed type, into cb_data.args.hipWaitExternalSemaphoresAsync; the two array arguments are stored as pointers only.
+#define INIT_hipWaitExternalSemaphoresAsync_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipWaitExternalSemaphoresAsync.extSemArray = (const hipExternalSemaphore_t*)extSemArray; \
+  cb_data.args.hipWaitExternalSemaphoresAsync.paramsArray = (const hipExternalSemaphoreWaitParams*)paramsArray; \
+  cb_data.args.hipWaitExternalSemaphoresAsync.numExtSems = (unsigned int)numExtSems; \
+  cb_data.args.hipWaitExternalSemaphoresAsync.stream = (hipStream_t)stream; \
+};
+#define INIT_CB_ARGS_DATA(cb_id, cb_data) INIT_##cb_id##_CB_ARGS_DATA(cb_data)  // dispatcher: token-pastes cb_id into the name INIT_<cb_id>_CB_ARGS_DATA and invokes that macro with cb_data
+
+// Macros for non-public API primitives: every macro in this group expands to an empty block followed by ';', so none of them writes anything into cb_data.
+// hipBindTexture()
+#define INIT_hipBindTexture_CB_ARGS_DATA(cb_data) {};
+// hipBindTexture2D()
+#define INIT_hipBindTexture2D_CB_ARGS_DATA(cb_data) {};
+// hipBindTextureToArray()
+#define INIT_hipBindTextureToArray_CB_ARGS_DATA(cb_data) {};
+// hipBindTextureToMipmappedArray()
+#define INIT_hipBindTextureToMipmappedArray_CB_ARGS_DATA(cb_data) {};
+// hipCreateTextureObject()
+#define INIT_hipCreateTextureObject_CB_ARGS_DATA(cb_data) {};
+// hipDestroyTextureObject()
+#define INIT_hipDestroyTextureObject_CB_ARGS_DATA(cb_data) {};
+// hipDeviceGetCount()
+#define INIT_hipDeviceGetCount_CB_ARGS_DATA(cb_data) {};
+// hipDeviceGetTexture1DLinearMaxWidth()
+#define INIT_hipDeviceGetTexture1DLinearMaxWidth_CB_ARGS_DATA(cb_data) {};
+// hipGetTextureAlignmentOffset()
+#define INIT_hipGetTextureAlignmentOffset_CB_ARGS_DATA(cb_data) {};
+// hipGetTextureObjectResourceDesc()
+#define INIT_hipGetTextureObjectResourceDesc_CB_ARGS_DATA(cb_data) {};
+// hipGetTextureObjectResourceViewDesc()
+#define INIT_hipGetTextureObjectResourceViewDesc_CB_ARGS_DATA(cb_data) {};
+// hipGetTextureObjectTextureDesc()
+#define INIT_hipGetTextureObjectTextureDesc_CB_ARGS_DATA(cb_data) {};
+// hipGetTextureReference()
+#define INIT_hipGetTextureReference_CB_ARGS_DATA(cb_data) {};
+// hipTexObjectCreate()
+#define INIT_hipTexObjectCreate_CB_ARGS_DATA(cb_data) {};
+// hipTexObjectDestroy()
+#define INIT_hipTexObjectDestroy_CB_ARGS_DATA(cb_data) {};
+// hipTexObjectGetResourceDesc()
+#define INIT_hipTexObjectGetResourceDesc_CB_ARGS_DATA(cb_data) {};
+// hipTexObjectGetResourceViewDesc()
+#define INIT_hipTexObjectGetResourceViewDesc_CB_ARGS_DATA(cb_data) {};
+// hipTexObjectGetTextureDesc()
+#define INIT_hipTexObjectGetTextureDesc_CB_ARGS_DATA(cb_data) {};
+// hipTexRefGetAddressMode()
+#define INIT_hipTexRefGetAddressMode_CB_ARGS_DATA(cb_data) {};
+// hipTexRefGetFilterMode()
+#define INIT_hipTexRefGetFilterMode_CB_ARGS_DATA(cb_data) {};
+// hipTexRefGetMipmapFilterMode()
+#define INIT_hipTexRefGetMipmapFilterMode_CB_ARGS_DATA(cb_data) {};
+// hipTexRefSetAddressMode()
+#define INIT_hipTexRefSetAddressMode_CB_ARGS_DATA(cb_data) {};
+// hipTexRefSetFilterMode()
+#define INIT_hipTexRefSetFilterMode_CB_ARGS_DATA(cb_data) {};
+// hipTexRefSetMipmapFilterMode()
+#define INIT_hipTexRefSetMipmapFilterMode_CB_ARGS_DATA(cb_data) {};
+// hipUnbindTexture()
+#define INIT_hipUnbindTexture_CB_ARGS_DATA(cb_data) {};
+
+#define INIT_NONE_CB_ARGS_DATA(cb_data) {};  // what INIT_CB_ARGS_DATA(NONE, cb_data) resolves to: an empty block, so nothing is written to cb_data
+
+#if HIP_PROF_HIP_API_STRING
+// HIP API args filling helper
+static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) {
+  switch (id) {
+// __hipPopCallConfiguration[('dim3*', 'gridDim'), ('dim3*', 'blockDim'), ('size_t*', 'sharedMem'), ('hipStream_t*', 'stream')]
+    case HIP_API_ID___hipPopCallConfiguration:
+      if (data->args.__hipPopCallConfiguration.gridDim) data->args.__hipPopCallConfiguration.gridDim__val = *(data->args.__hipPopCallConfiguration.gridDim);
+      if (data->args.__hipPopCallConfiguration.blockDim) data->args.__hipPopCallConfiguration.blockDim__val = *(data->args.__hipPopCallConfiguration.blockDim);
+      if (data->args.__hipPopCallConfiguration.sharedMem) data->args.__hipPopCallConfiguration.sharedMem__val = *(data->args.__hipPopCallConfiguration.sharedMem);
+      if (data->args.__hipPopCallConfiguration.stream) data->args.__hipPopCallConfiguration.stream__val = *(data->args.__hipPopCallConfiguration.stream);
+      break;
+// __hipPushCallConfiguration[('dim3', 'gridDim'), ('dim3', 'blockDim'), ('size_t', 'sharedMem'), ('hipStream_t', 'stream')]
+    case HIP_API_ID___hipPushCallConfiguration:
+      break;
+// hipArray3DCreate[('hipArray_t*', 'array'), ('const HIP_ARRAY3D_DESCRIPTOR*', 'pAllocateArray')]
+    case HIP_API_ID_hipArray3DCreate:
+      if (data->args.hipArray3DCreate.array) data->args.hipArray3DCreate.array__val = *(data->args.hipArray3DCreate.array);
+      if (data->args.hipArray3DCreate.pAllocateArray) data->args.hipArray3DCreate.pAllocateArray__val = *(data->args.hipArray3DCreate.pAllocateArray);
+      break;
+// hipArray3DGetDescriptor[('HIP_ARRAY3D_DESCRIPTOR*', 'pArrayDescriptor'), ('hipArray_t', 'array')]
+    case HIP_API_ID_hipArray3DGetDescriptor:
+      if (data->args.hipArray3DGetDescriptor.pArrayDescriptor) data->args.hipArray3DGetDescriptor.pArrayDescriptor__val = *(data->args.hipArray3DGetDescriptor.pArrayDescriptor);
+      break;
+// hipArrayCreate[('hipArray_t*', 'pHandle'), ('const HIP_ARRAY_DESCRIPTOR*', 'pAllocateArray')]
+    case HIP_API_ID_hipArrayCreate:
+      if (data->args.hipArrayCreate.pHandle) data->args.hipArrayCreate.pHandle__val = *(data->args.hipArrayCreate.pHandle);
+      if (data->args.hipArrayCreate.pAllocateArray) data->args.hipArrayCreate.pAllocateArray__val = *(data->args.hipArrayCreate.pAllocateArray);
+      break;
+// hipArrayDestroy[('hipArray_t', 'array')]
+    case HIP_API_ID_hipArrayDestroy:
+      break;
+// hipArrayGetDescriptor[('HIP_ARRAY_DESCRIPTOR*', 'pArrayDescriptor'), ('hipArray_t', 'array')]
+    case HIP_API_ID_hipArrayGetDescriptor:
+      if (data->args.hipArrayGetDescriptor.pArrayDescriptor) data->args.hipArrayGetDescriptor.pArrayDescriptor__val = *(data->args.hipArrayGetDescriptor.pArrayDescriptor);
+      break;
+// hipArrayGetInfo[('hipChannelFormatDesc*', 'desc'), ('hipExtent*', 'extent'), ('unsigned int*', 'flags'), ('hipArray_t', 'array')]
+    case HIP_API_ID_hipArrayGetInfo:
+      if (data->args.hipArrayGetInfo.desc) data->args.hipArrayGetInfo.desc__val = *(data->args.hipArrayGetInfo.desc);
+      if (data->args.hipArrayGetInfo.extent) data->args.hipArrayGetInfo.extent__val = *(data->args.hipArrayGetInfo.extent);
+      if (data->args.hipArrayGetInfo.flags) data->args.hipArrayGetInfo.flags__val = *(data->args.hipArrayGetInfo.flags);
+      break;
+// hipChooseDeviceR0000[('int*', 'device'), ('const hipDeviceProp_tR0000*', 'prop')]
+    case HIP_API_ID_hipChooseDeviceR0000:
+      if (data->args.hipChooseDeviceR0000.device) data->args.hipChooseDeviceR0000.device__val = *(data->args.hipChooseDeviceR0000.device);
+      if (data->args.hipChooseDeviceR0000.prop) data->args.hipChooseDeviceR0000.prop__val = *(data->args.hipChooseDeviceR0000.prop);
+      break;
+// hipChooseDeviceR0600[('int*', 'device'), ('const hipDeviceProp_tR0600*', 'prop')]
+    case HIP_API_ID_hipChooseDeviceR0600:
+      if (data->args.hipChooseDeviceR0600.device) data->args.hipChooseDeviceR0600.device__val = *(data->args.hipChooseDeviceR0600.device);
+      if (data->args.hipChooseDeviceR0600.prop) data->args.hipChooseDeviceR0600.prop__val = *(data->args.hipChooseDeviceR0600.prop);
+      break;
+// hipConfigureCall[('dim3', 'gridDim'), ('dim3', 'blockDim'), ('size_t', 'sharedMem'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipConfigureCall:
+      break;
+// hipCreateSurfaceObject[('hipSurfaceObject_t*', 'pSurfObject'), ('const hipResourceDesc*', 'pResDesc')]
+    case HIP_API_ID_hipCreateSurfaceObject:
+      if (data->args.hipCreateSurfaceObject.pSurfObject) data->args.hipCreateSurfaceObject.pSurfObject__val = *(data->args.hipCreateSurfaceObject.pSurfObject);
+      if (data->args.hipCreateSurfaceObject.pResDesc) data->args.hipCreateSurfaceObject.pResDesc__val = *(data->args.hipCreateSurfaceObject.pResDesc);
+      break;
+// hipCtxCreate[('hipCtx_t*', 'ctx'), ('unsigned int', 'flags'), ('hipDevice_t', 'device')]
+    case HIP_API_ID_hipCtxCreate:
+      if (data->args.hipCtxCreate.ctx) data->args.hipCtxCreate.ctx__val = *(data->args.hipCtxCreate.ctx);
+      break;
+// hipCtxDestroy[('hipCtx_t', 'ctx')]
+    case HIP_API_ID_hipCtxDestroy:
+      break;
+// hipCtxDisablePeerAccess[('hipCtx_t', 'peerCtx')]
+    case HIP_API_ID_hipCtxDisablePeerAccess:
+      break;
+// hipCtxEnablePeerAccess[('hipCtx_t', 'peerCtx'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipCtxEnablePeerAccess:
+      break;
+// hipCtxGetApiVersion[('hipCtx_t', 'ctx'), ('unsigned int*', 'apiVersion')]
+    case HIP_API_ID_hipCtxGetApiVersion:
+      if (data->args.hipCtxGetApiVersion.apiVersion) data->args.hipCtxGetApiVersion.apiVersion__val = *(data->args.hipCtxGetApiVersion.apiVersion);
+      break;
+// hipCtxGetCacheConfig[('hipFuncCache_t*', 'cacheConfig')]
+    case HIP_API_ID_hipCtxGetCacheConfig:
+      if (data->args.hipCtxGetCacheConfig.cacheConfig) data->args.hipCtxGetCacheConfig.cacheConfig__val = *(data->args.hipCtxGetCacheConfig.cacheConfig);
+      break;
+// hipCtxGetCurrent[('hipCtx_t*', 'ctx')]
+    case HIP_API_ID_hipCtxGetCurrent:
+      if (data->args.hipCtxGetCurrent.ctx) data->args.hipCtxGetCurrent.ctx__val = *(data->args.hipCtxGetCurrent.ctx);
+      break;
+// hipCtxGetDevice[('hipDevice_t*', 'device')]
+    case HIP_API_ID_hipCtxGetDevice:
+      if (data->args.hipCtxGetDevice.device) data->args.hipCtxGetDevice.device__val = *(data->args.hipCtxGetDevice.device);
+      break;
+// hipCtxGetFlags[('unsigned int*', 'flags')]
+    case HIP_API_ID_hipCtxGetFlags:
+      if (data->args.hipCtxGetFlags.flags) data->args.hipCtxGetFlags.flags__val = *(data->args.hipCtxGetFlags.flags);
+      break;
+// hipCtxGetSharedMemConfig[('hipSharedMemConfig*', 'pConfig')]
+    case HIP_API_ID_hipCtxGetSharedMemConfig:
+      if (data->args.hipCtxGetSharedMemConfig.pConfig) data->args.hipCtxGetSharedMemConfig.pConfig__val = *(data->args.hipCtxGetSharedMemConfig.pConfig);
+      break;
+// hipCtxPopCurrent[('hipCtx_t*', 'ctx')]
+    case HIP_API_ID_hipCtxPopCurrent:
+      if (data->args.hipCtxPopCurrent.ctx) data->args.hipCtxPopCurrent.ctx__val = *(data->args.hipCtxPopCurrent.ctx);
+      break;
+// hipCtxPushCurrent[('hipCtx_t', 'ctx')]
+    case HIP_API_ID_hipCtxPushCurrent:
+      break;
+// hipCtxSetCacheConfig[('hipFuncCache_t', 'cacheConfig')]
+    case HIP_API_ID_hipCtxSetCacheConfig:
+      break;
+// hipCtxSetCurrent[('hipCtx_t', 'ctx')]
+    case HIP_API_ID_hipCtxSetCurrent:
+      break;
+// hipCtxSetSharedMemConfig[('hipSharedMemConfig', 'config')]
+    case HIP_API_ID_hipCtxSetSharedMemConfig:
+      break;
+// hipCtxSynchronize[]
+    case HIP_API_ID_hipCtxSynchronize:
+      break;
+// hipDestroyExternalMemory[('hipExternalMemory_t', 'extMem')]
+    case HIP_API_ID_hipDestroyExternalMemory:
+      break;
+// hipDestroyExternalSemaphore[('hipExternalSemaphore_t', 'extSem')]
+    case HIP_API_ID_hipDestroyExternalSemaphore:
+      break;
+// hipDestroySurfaceObject[('hipSurfaceObject_t', 'surfaceObject')]
+    case HIP_API_ID_hipDestroySurfaceObject:
+      break;
+// hipDeviceCanAccessPeer[('int*', 'canAccessPeer'), ('int', 'deviceId'), ('int', 'peerDeviceId')]
+    case HIP_API_ID_hipDeviceCanAccessPeer:
+      if (data->args.hipDeviceCanAccessPeer.canAccessPeer) data->args.hipDeviceCanAccessPeer.canAccessPeer__val = *(data->args.hipDeviceCanAccessPeer.canAccessPeer);
+      break;
+// hipDeviceComputeCapability[('int*', 'major'), ('int*', 'minor'), ('hipDevice_t', 'device')]
+    case HIP_API_ID_hipDeviceComputeCapability:
+      if (data->args.hipDeviceComputeCapability.major) data->args.hipDeviceComputeCapability.major__val = *(data->args.hipDeviceComputeCapability.major);
+      if (data->args.hipDeviceComputeCapability.minor) data->args.hipDeviceComputeCapability.minor__val = *(data->args.hipDeviceComputeCapability.minor);
+      break;
+// hipDeviceDisablePeerAccess[('int', 'peerDeviceId')]
+    case HIP_API_ID_hipDeviceDisablePeerAccess:
+      break;
+// hipDeviceEnablePeerAccess[('int', 'peerDeviceId'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipDeviceEnablePeerAccess:
+      break;
+// hipDeviceGet[('hipDevice_t*', 'device'), ('int', 'ordinal')]
+    case HIP_API_ID_hipDeviceGet:
+      if (data->args.hipDeviceGet.device) data->args.hipDeviceGet.device__val = *(data->args.hipDeviceGet.device);
+      break;
+// hipDeviceGetAttribute[('int*', 'pi'), ('hipDeviceAttribute_t', 'attr'), ('int', 'deviceId')]
+    case HIP_API_ID_hipDeviceGetAttribute:
+      if (data->args.hipDeviceGetAttribute.pi) data->args.hipDeviceGetAttribute.pi__val = *(data->args.hipDeviceGetAttribute.pi);
+      break;
+// hipDeviceGetByPCIBusId[('int*', 'device'), ('const char*', 'pciBusId')]
+    case HIP_API_ID_hipDeviceGetByPCIBusId:
+      if (data->args.hipDeviceGetByPCIBusId.device) data->args.hipDeviceGetByPCIBusId.device__val = *(data->args.hipDeviceGetByPCIBusId.device);
+      if (data->args.hipDeviceGetByPCIBusId.pciBusId) data->args.hipDeviceGetByPCIBusId.pciBusId__val = *(data->args.hipDeviceGetByPCIBusId.pciBusId);
+      break;
+// hipDeviceGetCacheConfig[('hipFuncCache_t*', 'cacheConfig')]
+    case HIP_API_ID_hipDeviceGetCacheConfig:
+      if (data->args.hipDeviceGetCacheConfig.cacheConfig) data->args.hipDeviceGetCacheConfig.cacheConfig__val = *(data->args.hipDeviceGetCacheConfig.cacheConfig);
+      break;
+// hipDeviceGetDefaultMemPool[('hipMemPool_t*', 'mem_pool'), ('int', 'device')]
+    case HIP_API_ID_hipDeviceGetDefaultMemPool:
+      if (data->args.hipDeviceGetDefaultMemPool.mem_pool) data->args.hipDeviceGetDefaultMemPool.mem_pool__val = *(data->args.hipDeviceGetDefaultMemPool.mem_pool);
+      break;
+// hipDeviceGetGraphMemAttribute[('int', 'device'), ('hipGraphMemAttributeType', 'attr'), ('void*', 'value')]
+    case HIP_API_ID_hipDeviceGetGraphMemAttribute:
+      break;
+// hipDeviceGetLimit[('size_t*', 'pValue'), ('hipLimit_t', 'limit')]
+    case HIP_API_ID_hipDeviceGetLimit:
+      if (data->args.hipDeviceGetLimit.pValue) data->args.hipDeviceGetLimit.pValue__val = *(data->args.hipDeviceGetLimit.pValue);
+      break;
+// hipDeviceGetMemPool[('hipMemPool_t*', 'mem_pool'), ('int', 'device')]
+    case HIP_API_ID_hipDeviceGetMemPool:
+      if (data->args.hipDeviceGetMemPool.mem_pool) data->args.hipDeviceGetMemPool.mem_pool__val = *(data->args.hipDeviceGetMemPool.mem_pool);
+      break;
+// hipDeviceGetName[('char*', 'name'), ('int', 'len'), ('hipDevice_t', 'device')]
+    case HIP_API_ID_hipDeviceGetName:
+      data->args.hipDeviceGetName.name = (data->args.hipDeviceGetName.name) ? strdup(data->args.hipDeviceGetName.name) : NULL;
+      break;
+// hipDeviceGetP2PAttribute[('int*', 'value'), ('hipDeviceP2PAttr', 'attr'), ('int', 'srcDevice'), ('int', 'dstDevice')]
+    case HIP_API_ID_hipDeviceGetP2PAttribute:
+      if (data->args.hipDeviceGetP2PAttribute.value) data->args.hipDeviceGetP2PAttribute.value__val = *(data->args.hipDeviceGetP2PAttribute.value);
+      break;
+// hipDeviceGetPCIBusId[('char*', 'pciBusId'), ('int', 'len'), ('int', 'device')]
+    case HIP_API_ID_hipDeviceGetPCIBusId:
+      data->args.hipDeviceGetPCIBusId.pciBusId = (data->args.hipDeviceGetPCIBusId.pciBusId) ? strdup(data->args.hipDeviceGetPCIBusId.pciBusId) : NULL;
+      break;
+// hipDeviceGetSharedMemConfig[('hipSharedMemConfig*', 'pConfig')]
+    case HIP_API_ID_hipDeviceGetSharedMemConfig:
+      if (data->args.hipDeviceGetSharedMemConfig.pConfig) data->args.hipDeviceGetSharedMemConfig.pConfig__val = *(data->args.hipDeviceGetSharedMemConfig.pConfig);
+      break;
+// hipDeviceGetStreamPriorityRange[('int*', 'leastPriority'), ('int*', 'greatestPriority')]
+    case HIP_API_ID_hipDeviceGetStreamPriorityRange:
+      if (data->args.hipDeviceGetStreamPriorityRange.leastPriority) data->args.hipDeviceGetStreamPriorityRange.leastPriority__val = *(data->args.hipDeviceGetStreamPriorityRange.leastPriority);
+      if (data->args.hipDeviceGetStreamPriorityRange.greatestPriority) data->args.hipDeviceGetStreamPriorityRange.greatestPriority__val = *(data->args.hipDeviceGetStreamPriorityRange.greatestPriority);
+      break;
+// hipDeviceGetUuid[('hipUUID*', 'uuid'), ('hipDevice_t', 'device')]
+    case HIP_API_ID_hipDeviceGetUuid:
+      if (data->args.hipDeviceGetUuid.uuid) data->args.hipDeviceGetUuid.uuid__val = *(data->args.hipDeviceGetUuid.uuid);
+      break;
+// hipDeviceGraphMemTrim[('int', 'device')]
+    case HIP_API_ID_hipDeviceGraphMemTrim:
+      break;
+// hipDevicePrimaryCtxGetState[('hipDevice_t', 'dev'), ('unsigned int*', 'flags'), ('int*', 'active')]
+    case HIP_API_ID_hipDevicePrimaryCtxGetState:
+      if (data->args.hipDevicePrimaryCtxGetState.flags) data->args.hipDevicePrimaryCtxGetState.flags__val = *(data->args.hipDevicePrimaryCtxGetState.flags);
+      if (data->args.hipDevicePrimaryCtxGetState.active) data->args.hipDevicePrimaryCtxGetState.active__val = *(data->args.hipDevicePrimaryCtxGetState.active);
+      break;
+// hipDevicePrimaryCtxRelease[('hipDevice_t', 'dev')]
+    case HIP_API_ID_hipDevicePrimaryCtxRelease:
+      break;
+// hipDevicePrimaryCtxReset[('hipDevice_t', 'dev')]
+    case HIP_API_ID_hipDevicePrimaryCtxReset:
+      break;
+// hipDevicePrimaryCtxRetain[('hipCtx_t*', 'pctx'), ('hipDevice_t', 'dev')]
+    case HIP_API_ID_hipDevicePrimaryCtxRetain:
+      if (data->args.hipDevicePrimaryCtxRetain.pctx) data->args.hipDevicePrimaryCtxRetain.pctx__val = *(data->args.hipDevicePrimaryCtxRetain.pctx);
+      break;
+// hipDevicePrimaryCtxSetFlags[('hipDevice_t', 'dev'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipDevicePrimaryCtxSetFlags:
+      break;
+// hipDeviceReset[]
+    case HIP_API_ID_hipDeviceReset:
+      break;
+// hipDeviceSetCacheConfig[('hipFuncCache_t', 'cacheConfig')]
+    case HIP_API_ID_hipDeviceSetCacheConfig:
+      break;
+// hipDeviceSetGraphMemAttribute[('int', 'device'), ('hipGraphMemAttributeType', 'attr'), ('void*', 'value')]
+    case HIP_API_ID_hipDeviceSetGraphMemAttribute:
+      break;
+// hipDeviceSetLimit[('hipLimit_t', 'limit'), ('size_t', 'value')]
+    case HIP_API_ID_hipDeviceSetLimit:
+      break;
+// hipDeviceSetMemPool[('int', 'device'), ('hipMemPool_t', 'mem_pool')]
+    case HIP_API_ID_hipDeviceSetMemPool:
+      break;
+// hipDeviceSetSharedMemConfig[('hipSharedMemConfig', 'config')]
+    case HIP_API_ID_hipDeviceSetSharedMemConfig:
+      break;
+// hipDeviceSynchronize[]
+    case HIP_API_ID_hipDeviceSynchronize:
+      break;
+// hipDeviceTotalMem[('size_t*', 'bytes'), ('hipDevice_t', 'device')]
+    case HIP_API_ID_hipDeviceTotalMem:
+      if (data->args.hipDeviceTotalMem.bytes) data->args.hipDeviceTotalMem.bytes__val = *(data->args.hipDeviceTotalMem.bytes);
+      break;
+// hipDriverGetVersion[('int*', 'driverVersion')]
+    case HIP_API_ID_hipDriverGetVersion:
+      if (data->args.hipDriverGetVersion.driverVersion) data->args.hipDriverGetVersion.driverVersion__val = *(data->args.hipDriverGetVersion.driverVersion);
+      break;
+// hipDrvGraphAddMemFreeNode[('hipGraphNode_t*', 'phGraphNode'), ('hipGraph_t', 'hGraph'), ('const hipGraphNode_t*', 'dependencies'), ('size_t', 'numDependencies'), ('hipDeviceptr_t', 'dptr')]
+    case HIP_API_ID_hipDrvGraphAddMemFreeNode:
+      if (data->args.hipDrvGraphAddMemFreeNode.phGraphNode) data->args.hipDrvGraphAddMemFreeNode.phGraphNode__val = *(data->args.hipDrvGraphAddMemFreeNode.phGraphNode);
+      if (data->args.hipDrvGraphAddMemFreeNode.dependencies) data->args.hipDrvGraphAddMemFreeNode.dependencies__val = *(data->args.hipDrvGraphAddMemFreeNode.dependencies);
+      break;
+// hipDrvGraphAddMemcpyNode[('hipGraphNode_t*', 'phGraphNode'), ('hipGraph_t', 'hGraph'), ('const hipGraphNode_t*', 'dependencies'), ('size_t', 'numDependencies'), ('const HIP_MEMCPY3D*', 'copyParams'), ('hipCtx_t', 'ctx')]
+    case HIP_API_ID_hipDrvGraphAddMemcpyNode:
+      if (data->args.hipDrvGraphAddMemcpyNode.phGraphNode) data->args.hipDrvGraphAddMemcpyNode.phGraphNode__val = *(data->args.hipDrvGraphAddMemcpyNode.phGraphNode);
+      if (data->args.hipDrvGraphAddMemcpyNode.dependencies) data->args.hipDrvGraphAddMemcpyNode.dependencies__val = *(data->args.hipDrvGraphAddMemcpyNode.dependencies);
+      if (data->args.hipDrvGraphAddMemcpyNode.copyParams) data->args.hipDrvGraphAddMemcpyNode.copyParams__val = *(data->args.hipDrvGraphAddMemcpyNode.copyParams);
+      break;
+// hipDrvGraphAddMemsetNode[('hipGraphNode_t*', 'phGraphNode'), ('hipGraph_t', 'hGraph'), ('const hipGraphNode_t*', 'dependencies'), ('size_t', 'numDependencies'), ('const hipMemsetParams*', 'memsetParams'), ('hipCtx_t', 'ctx')]
+    case HIP_API_ID_hipDrvGraphAddMemsetNode:
+      if (data->args.hipDrvGraphAddMemsetNode.phGraphNode) data->args.hipDrvGraphAddMemsetNode.phGraphNode__val = *(data->args.hipDrvGraphAddMemsetNode.phGraphNode);
+      if (data->args.hipDrvGraphAddMemsetNode.dependencies) data->args.hipDrvGraphAddMemsetNode.dependencies__val = *(data->args.hipDrvGraphAddMemsetNode.dependencies);
+      if (data->args.hipDrvGraphAddMemsetNode.memsetParams) data->args.hipDrvGraphAddMemsetNode.memsetParams__val = *(data->args.hipDrvGraphAddMemsetNode.memsetParams);
+      break;
+// hipDrvGraphExecMemcpyNodeSetParams[('hipGraphExec_t', 'hGraphExec'), ('hipGraphNode_t', 'hNode'), ('const HIP_MEMCPY3D*', 'copyParams'), ('hipCtx_t', 'ctx')]
+    case HIP_API_ID_hipDrvGraphExecMemcpyNodeSetParams:
+      if (data->args.hipDrvGraphExecMemcpyNodeSetParams.copyParams) data->args.hipDrvGraphExecMemcpyNodeSetParams.copyParams__val = *(data->args.hipDrvGraphExecMemcpyNodeSetParams.copyParams);
+      break;
+// hipDrvGraphExecMemsetNodeSetParams[('hipGraphExec_t', 'hGraphExec'), ('hipGraphNode_t', 'hNode'), ('const hipMemsetParams*', 'memsetParams'), ('hipCtx_t', 'ctx')]
+    case HIP_API_ID_hipDrvGraphExecMemsetNodeSetParams:
+      if (data->args.hipDrvGraphExecMemsetNodeSetParams.memsetParams) data->args.hipDrvGraphExecMemsetNodeSetParams.memsetParams__val = *(data->args.hipDrvGraphExecMemsetNodeSetParams.memsetParams);
+      break;
+// hipDrvGraphMemcpyNodeGetParams[('hipGraphNode_t', 'hNode'), ('HIP_MEMCPY3D*', 'nodeParams')]
+    case HIP_API_ID_hipDrvGraphMemcpyNodeGetParams:
+      if (data->args.hipDrvGraphMemcpyNodeGetParams.nodeParams) data->args.hipDrvGraphMemcpyNodeGetParams.nodeParams__val = *(data->args.hipDrvGraphMemcpyNodeGetParams.nodeParams);
+      break;
+// hipDrvGraphMemcpyNodeSetParams[('hipGraphNode_t', 'hNode'), ('const HIP_MEMCPY3D*', 'nodeParams')]
+    case HIP_API_ID_hipDrvGraphMemcpyNodeSetParams:
+      if (data->args.hipDrvGraphMemcpyNodeSetParams.nodeParams) data->args.hipDrvGraphMemcpyNodeSetParams.nodeParams__val = *(data->args.hipDrvGraphMemcpyNodeSetParams.nodeParams);
+      break;
+// hipDrvLaunchKernelEx[('const HIP_LAUNCH_CONFIG*', 'config'), ('hipFunction_t', 'f'), ('void**', 'params'), ('void**', 'extra')]
+    case HIP_API_ID_hipDrvLaunchKernelEx:
+      if (data->args.hipDrvLaunchKernelEx.config) data->args.hipDrvLaunchKernelEx.config__val = *(data->args.hipDrvLaunchKernelEx.config);
+      if (data->args.hipDrvLaunchKernelEx.params) data->args.hipDrvLaunchKernelEx.params__val = *(data->args.hipDrvLaunchKernelEx.params);
+      if (data->args.hipDrvLaunchKernelEx.extra) data->args.hipDrvLaunchKernelEx.extra__val = *(data->args.hipDrvLaunchKernelEx.extra);
+      break;
+// hipDrvMemcpy2DUnaligned[('const hip_Memcpy2D*', 'pCopy')]
+    case HIP_API_ID_hipDrvMemcpy2DUnaligned:
+      if (data->args.hipDrvMemcpy2DUnaligned.pCopy) data->args.hipDrvMemcpy2DUnaligned.pCopy__val = *(data->args.hipDrvMemcpy2DUnaligned.pCopy);
+      break;
+// hipDrvMemcpy3D[('const HIP_MEMCPY3D*', 'pCopy')]
+    case HIP_API_ID_hipDrvMemcpy3D:
+      if (data->args.hipDrvMemcpy3D.pCopy) data->args.hipDrvMemcpy3D.pCopy__val = *(data->args.hipDrvMemcpy3D.pCopy);
+      break;
+// hipDrvMemcpy3DAsync[('const HIP_MEMCPY3D*', 'pCopy'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipDrvMemcpy3DAsync:
+      if (data->args.hipDrvMemcpy3DAsync.pCopy) data->args.hipDrvMemcpy3DAsync.pCopy__val = *(data->args.hipDrvMemcpy3DAsync.pCopy);
+      break;
+// hipDrvPointerGetAttributes[('unsigned int', 'numAttributes'), ('hipPointer_attribute*', 'attributes'), ('void**', 'data'), ('hipDeviceptr_t', 'ptr')]
+    case HIP_API_ID_hipDrvPointerGetAttributes:
+      if (data->args.hipDrvPointerGetAttributes.attributes) data->args.hipDrvPointerGetAttributes.attributes__val = *(data->args.hipDrvPointerGetAttributes.attributes);
+      if (data->args.hipDrvPointerGetAttributes.data) data->args.hipDrvPointerGetAttributes.data__val = *(data->args.hipDrvPointerGetAttributes.data);
+      break;
+// hipEventCreate[('hipEvent_t*', 'event')]
+    case HIP_API_ID_hipEventCreate:
+      if (data->args.hipEventCreate.event) data->args.hipEventCreate.event__val = *(data->args.hipEventCreate.event);
+      break;
+// hipEventCreateWithFlags[('hipEvent_t*', 'event'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipEventCreateWithFlags:
+      if (data->args.hipEventCreateWithFlags.event) data->args.hipEventCreateWithFlags.event__val = *(data->args.hipEventCreateWithFlags.event);
+      break;
+// hipEventDestroy[('hipEvent_t', 'event')]
+    case HIP_API_ID_hipEventDestroy:
+      break;
+// hipEventElapsedTime[('float*', 'ms'), ('hipEvent_t', 'start'), ('hipEvent_t', 'stop')]
+    case HIP_API_ID_hipEventElapsedTime:
+      if (data->args.hipEventElapsedTime.ms) data->args.hipEventElapsedTime.ms__val = *(data->args.hipEventElapsedTime.ms);
+      break;
+// hipEventQuery[('hipEvent_t', 'event')]
+    case HIP_API_ID_hipEventQuery:
+      break;
+// hipEventRecord[('hipEvent_t', 'event'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipEventRecord:
+      break;
+// hipEventRecordWithFlags[('hipEvent_t', 'event'), ('hipStream_t', 'stream'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipEventRecordWithFlags:
+      break;
+// hipEventSynchronize[('hipEvent_t', 'event')]
+    case HIP_API_ID_hipEventSynchronize:
+      break;
+// hipExtGetLastError[]
+    case HIP_API_ID_hipExtGetLastError:
+      break;
+// hipExtGetLinkTypeAndHopCount[('int', 'device1'), ('int', 'device2'), ('unsigned int*', 'linktype'), ('unsigned int*', 'hopcount')]
+    case HIP_API_ID_hipExtGetLinkTypeAndHopCount:
+      if (data->args.hipExtGetLinkTypeAndHopCount.linktype) data->args.hipExtGetLinkTypeAndHopCount.linktype__val = *(data->args.hipExtGetLinkTypeAndHopCount.linktype);
+      if (data->args.hipExtGetLinkTypeAndHopCount.hopcount) data->args.hipExtGetLinkTypeAndHopCount.hopcount__val = *(data->args.hipExtGetLinkTypeAndHopCount.hopcount);
+      break;
+// hipExtLaunchKernel[('const void*', 'function_address'), ('dim3', 'numBlocks'), ('dim3', 'dimBlocks'), ('void**', 'args'), ('size_t', 'sharedMemBytes'), ('hipStream_t', 'stream'), ('hipEvent_t', 'startEvent'), ('hipEvent_t', 'stopEvent'), ('int', 'flags')]
+    case HIP_API_ID_hipExtLaunchKernel:
+      if (data->args.hipExtLaunchKernel.args) data->args.hipExtLaunchKernel.args__val = *(data->args.hipExtLaunchKernel.args);
+      break;
+// hipExtLaunchMultiKernelMultiDevice[('hipLaunchParams*', 'launchParamsList'), ('int', 'numDevices'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipExtLaunchMultiKernelMultiDevice:
+      if (data->args.hipExtLaunchMultiKernelMultiDevice.launchParamsList) data->args.hipExtLaunchMultiKernelMultiDevice.launchParamsList__val = *(data->args.hipExtLaunchMultiKernelMultiDevice.launchParamsList);
+      break;
+// hipExtMallocWithFlags[('void**', 'ptr'), ('size_t', 'sizeBytes'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipExtMallocWithFlags:
+      if (data->args.hipExtMallocWithFlags.ptr) data->args.hipExtMallocWithFlags.ptr__val = *(data->args.hipExtMallocWithFlags.ptr);
+      break;
+// hipExtModuleLaunchKernel[('hipFunction_t', 'f'), ('unsigned int', 'globalWorkSizeX'), ('unsigned int', 'globalWorkSizeY'), ('unsigned int', 'globalWorkSizeZ'), ('unsigned int', 'localWorkSizeX'), ('unsigned int', 'localWorkSizeY'), ('unsigned int', 'localWorkSizeZ'), ('size_t', 'sharedMemBytes'), ('hipStream_t', 'hStream'), ('void**', 'kernelParams'), ('void**', 'extra'), ('hipEvent_t', 'startEvent'), ('hipEvent_t', 'stopEvent'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipExtModuleLaunchKernel:
+      if (data->args.hipExtModuleLaunchKernel.kernelParams) data->args.hipExtModuleLaunchKernel.kernelParams__val = *(data->args.hipExtModuleLaunchKernel.kernelParams);
+      if (data->args.hipExtModuleLaunchKernel.extra) data->args.hipExtModuleLaunchKernel.extra__val = *(data->args.hipExtModuleLaunchKernel.extra);
+      break;
+// hipExtStreamCreateWithCUMask[('hipStream_t*', 'stream'), ('unsigned int', 'cuMaskSize'), ('const unsigned int*', 'cuMask')]
+    case HIP_API_ID_hipExtStreamCreateWithCUMask:
+      if (data->args.hipExtStreamCreateWithCUMask.stream) data->args.hipExtStreamCreateWithCUMask.stream__val = *(data->args.hipExtStreamCreateWithCUMask.stream);
+      if (data->args.hipExtStreamCreateWithCUMask.cuMask) data->args.hipExtStreamCreateWithCUMask.cuMask__val = *(data->args.hipExtStreamCreateWithCUMask.cuMask);
+      break;
+// hipExtStreamGetCUMask[('hipStream_t', 'stream'), ('unsigned int', 'cuMaskSize'), ('unsigned int*', 'cuMask')]
+    case HIP_API_ID_hipExtStreamGetCUMask:
+      if (data->args.hipExtStreamGetCUMask.cuMask) data->args.hipExtStreamGetCUMask.cuMask__val = *(data->args.hipExtStreamGetCUMask.cuMask);
+      break;
+// hipExternalMemoryGetMappedBuffer[('void**', 'devPtr'), ('hipExternalMemory_t', 'extMem'), ('const hipExternalMemoryBufferDesc*', 'bufferDesc')]
+    case HIP_API_ID_hipExternalMemoryGetMappedBuffer:
+      if (data->args.hipExternalMemoryGetMappedBuffer.devPtr) data->args.hipExternalMemoryGetMappedBuffer.devPtr__val = *(data->args.hipExternalMemoryGetMappedBuffer.devPtr);
+      if (data->args.hipExternalMemoryGetMappedBuffer.bufferDesc) data->args.hipExternalMemoryGetMappedBuffer.bufferDesc__val = *(data->args.hipExternalMemoryGetMappedBuffer.bufferDesc);
+      break;
+// hipExternalMemoryGetMappedMipmappedArray[('hipMipmappedArray_t*', 'mipmap'), ('hipExternalMemory_t', 'extMem'), ('const hipExternalMemoryMipmappedArrayDesc*', 'mipmapDesc')]
+    case HIP_API_ID_hipExternalMemoryGetMappedMipmappedArray:
+      if (data->args.hipExternalMemoryGetMappedMipmappedArray.mipmap) data->args.hipExternalMemoryGetMappedMipmappedArray.mipmap__val = *(data->args.hipExternalMemoryGetMappedMipmappedArray.mipmap);
+      if (data->args.hipExternalMemoryGetMappedMipmappedArray.mipmapDesc) data->args.hipExternalMemoryGetMappedMipmappedArray.mipmapDesc__val = *(data->args.hipExternalMemoryGetMappedMipmappedArray.mipmapDesc);
+      break;
+// hipFree[('void*', 'ptr')]
+    case HIP_API_ID_hipFree:
+      break;
+// hipFreeArray[('hipArray_t', 'array')]
+    case HIP_API_ID_hipFreeArray:
+      break;
+// hipFreeAsync[('void*', 'dev_ptr'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipFreeAsync:
+      break;
+// hipFreeHost[('void*', 'ptr')]
+    case HIP_API_ID_hipFreeHost:
+      break;
+// hipFreeMipmappedArray[('hipMipmappedArray_t', 'mipmappedArray')]
+    case HIP_API_ID_hipFreeMipmappedArray:
+      break;
+// hipFuncGetAttribute[('int*', 'value'), ('hipFunction_attribute', 'attrib'), ('hipFunction_t', 'hfunc')]
+    case HIP_API_ID_hipFuncGetAttribute:
+      if (data->args.hipFuncGetAttribute.value) data->args.hipFuncGetAttribute.value__val = *(data->args.hipFuncGetAttribute.value);
+      break;
+// hipFuncGetAttributes[('hipFuncAttributes*', 'attr'), ('const void*', 'func')]
+    case HIP_API_ID_hipFuncGetAttributes:
+      if (data->args.hipFuncGetAttributes.attr) data->args.hipFuncGetAttributes.attr__val = *(data->args.hipFuncGetAttributes.attr);
+      break;
+// hipFuncSetAttribute[('const void*', 'func'), ('hipFuncAttribute', 'attr'), ('int', 'value')]
+    case HIP_API_ID_hipFuncSetAttribute:
+      break;
+// hipFuncSetCacheConfig[('const void*', 'func'), ('hipFuncCache_t', 'config')]
+    case HIP_API_ID_hipFuncSetCacheConfig:
+      break;
+// hipFuncSetSharedMemConfig[('const void*', 'func'), ('hipSharedMemConfig', 'config')]
+    case HIP_API_ID_hipFuncSetSharedMemConfig:
+      break;
+// hipGLGetDevices[('unsigned int*', 'pHipDeviceCount'), ('int*', 'pHipDevices'), ('unsigned int', 'hipDeviceCount'), ('hipGLDeviceList', 'deviceList')]
+    case HIP_API_ID_hipGLGetDevices:
+      if (data->args.hipGLGetDevices.pHipDeviceCount) data->args.hipGLGetDevices.pHipDeviceCount__val = *(data->args.hipGLGetDevices.pHipDeviceCount);
+      if (data->args.hipGLGetDevices.pHipDevices) data->args.hipGLGetDevices.pHipDevices__val = *(data->args.hipGLGetDevices.pHipDevices);
+      break;
+// hipGetChannelDesc[('hipChannelFormatDesc*', 'desc'), ('hipArray_const_t', 'array')]
+    case HIP_API_ID_hipGetChannelDesc:
+      if (data->args.hipGetChannelDesc.desc) data->args.hipGetChannelDesc.desc__val = *(data->args.hipGetChannelDesc.desc);
+      break;
+// hipGetDevice[('int*', 'deviceId')]
+    case HIP_API_ID_hipGetDevice:
+      if (data->args.hipGetDevice.deviceId) data->args.hipGetDevice.deviceId__val = *(data->args.hipGetDevice.deviceId);
+      break;
+// hipGetDeviceCount[('int*', 'count')]
+    case HIP_API_ID_hipGetDeviceCount:
+      if (data->args.hipGetDeviceCount.count) data->args.hipGetDeviceCount.count__val = *(data->args.hipGetDeviceCount.count);
+      break;
+// hipGetDeviceFlags[('unsigned int*', 'flags')]
+    case HIP_API_ID_hipGetDeviceFlags:
+      if (data->args.hipGetDeviceFlags.flags) data->args.hipGetDeviceFlags.flags__val = *(data->args.hipGetDeviceFlags.flags);
+      break;
+// hipGetDevicePropertiesR0000[('hipDeviceProp_tR0000*', 'prop'), ('int', 'device')]
+    case HIP_API_ID_hipGetDevicePropertiesR0000:
+      if (data->args.hipGetDevicePropertiesR0000.prop) data->args.hipGetDevicePropertiesR0000.prop__val = *(data->args.hipGetDevicePropertiesR0000.prop);
+      break;
+// hipGetDevicePropertiesR0600[('hipDeviceProp_tR0600*', 'prop'), ('int', 'deviceId')]
+    case HIP_API_ID_hipGetDevicePropertiesR0600:
+      if (data->args.hipGetDevicePropertiesR0600.prop) data->args.hipGetDevicePropertiesR0600.prop__val = *(data->args.hipGetDevicePropertiesR0600.prop);
+      break;
+// hipGetDriverEntryPoint[('const char*', 'symbol'), ('void**', 'funcPtr'), ('unsigned long long', 'flags'), ('hipDriverEntryPointQueryResult*', 'driverStatus')]
+    case HIP_API_ID_hipGetDriverEntryPoint:
+      if (data->args.hipGetDriverEntryPoint.symbol) data->args.hipGetDriverEntryPoint.symbol__val = *(data->args.hipGetDriverEntryPoint.symbol);
+      if (data->args.hipGetDriverEntryPoint.funcPtr) data->args.hipGetDriverEntryPoint.funcPtr__val = *(data->args.hipGetDriverEntryPoint.funcPtr);
+      if (data->args.hipGetDriverEntryPoint.driverStatus) data->args.hipGetDriverEntryPoint.driverStatus__val = *(data->args.hipGetDriverEntryPoint.driverStatus);
+      break;
+// hipGetFuncBySymbol[('hipFunction_t*', 'functionPtr'), ('const void*', 'symbolPtr')]
+    case HIP_API_ID_hipGetFuncBySymbol:
+      if (data->args.hipGetFuncBySymbol.functionPtr) data->args.hipGetFuncBySymbol.functionPtr__val = *(data->args.hipGetFuncBySymbol.functionPtr);
+      break;
+// hipGetLastError[]
+    case HIP_API_ID_hipGetLastError:
+      break;
+// hipGetMipmappedArrayLevel[('hipArray_t*', 'levelArray'), ('hipMipmappedArray_const_t', 'mipmappedArray'), ('unsigned int', 'level')]
+    case HIP_API_ID_hipGetMipmappedArrayLevel:
+      if (data->args.hipGetMipmappedArrayLevel.levelArray) data->args.hipGetMipmappedArrayLevel.levelArray__val = *(data->args.hipGetMipmappedArrayLevel.levelArray);
+      break;
+// hipGetProcAddress[('const char*', 'symbol'), ('void**', 'pfn'), ('int', 'hipVersion'), ('uint64_t', 'flags'), ('hipDriverProcAddressQueryResult*', 'symbolStatus')]
+    case HIP_API_ID_hipGetProcAddress:
+      if (data->args.hipGetProcAddress.symbol) data->args.hipGetProcAddress.symbol__val = *(data->args.hipGetProcAddress.symbol);
+      if (data->args.hipGetProcAddress.pfn) data->args.hipGetProcAddress.pfn__val = *(data->args.hipGetProcAddress.pfn);
+      if (data->args.hipGetProcAddress.symbolStatus) data->args.hipGetProcAddress.symbolStatus__val = *(data->args.hipGetProcAddress.symbolStatus);
+      break;
+// hipGetSymbolAddress[('void**', 'devPtr'), ('const void*', 'symbol')]
+    case HIP_API_ID_hipGetSymbolAddress:
+      if (data->args.hipGetSymbolAddress.devPtr) data->args.hipGetSymbolAddress.devPtr__val = *(data->args.hipGetSymbolAddress.devPtr);
+      break;
+// hipGetSymbolSize[('size_t*', 'size'), ('const void*', 'symbol')]
+    case HIP_API_ID_hipGetSymbolSize:
+      if (data->args.hipGetSymbolSize.size) data->args.hipGetSymbolSize.size__val = *(data->args.hipGetSymbolSize.size);
+      break;
+// hipGraphAddBatchMemOpNode[('hipGraphNode_t*', 'phGraphNode'), ('hipGraph_t', 'hGraph'), ('const hipGraphNode_t*', 'dependencies'), ('size_t', 'numDependencies'), ('const hipBatchMemOpNodeParams*', 'nodeParams')]
+    case HIP_API_ID_hipGraphAddBatchMemOpNode:
+      if (data->args.hipGraphAddBatchMemOpNode.phGraphNode) data->args.hipGraphAddBatchMemOpNode.phGraphNode__val = *(data->args.hipGraphAddBatchMemOpNode.phGraphNode);
+      if (data->args.hipGraphAddBatchMemOpNode.dependencies) data->args.hipGraphAddBatchMemOpNode.dependencies__val = *(data->args.hipGraphAddBatchMemOpNode.dependencies);
+      if (data->args.hipGraphAddBatchMemOpNode.nodeParams) data->args.hipGraphAddBatchMemOpNode.nodeParams__val = *(data->args.hipGraphAddBatchMemOpNode.nodeParams);
+      break;
+// hipGraphAddChildGraphNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('hipGraph_t', 'childGraph')]
+    case HIP_API_ID_hipGraphAddChildGraphNode:
+      if (data->args.hipGraphAddChildGraphNode.pGraphNode) data->args.hipGraphAddChildGraphNode.pGraphNode__val = *(data->args.hipGraphAddChildGraphNode.pGraphNode);
+      if (data->args.hipGraphAddChildGraphNode.pDependencies) data->args.hipGraphAddChildGraphNode.pDependencies__val = *(data->args.hipGraphAddChildGraphNode.pDependencies);
+      break;
+// hipGraphAddDependencies[('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'from'), ('const hipGraphNode_t*', 'to'), ('size_t', 'numDependencies')]
+    case HIP_API_ID_hipGraphAddDependencies:
+      if (data->args.hipGraphAddDependencies.from) data->args.hipGraphAddDependencies.from__val = *(data->args.hipGraphAddDependencies.from);
+      if (data->args.hipGraphAddDependencies.to) data->args.hipGraphAddDependencies.to__val = *(data->args.hipGraphAddDependencies.to);
+      break;
+// hipGraphAddEmptyNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies')]
+    case HIP_API_ID_hipGraphAddEmptyNode:
+      if (data->args.hipGraphAddEmptyNode.pGraphNode) data->args.hipGraphAddEmptyNode.pGraphNode__val = *(data->args.hipGraphAddEmptyNode.pGraphNode);
+      if (data->args.hipGraphAddEmptyNode.pDependencies) data->args.hipGraphAddEmptyNode.pDependencies__val = *(data->args.hipGraphAddEmptyNode.pDependencies);
+      break;
+// hipGraphAddEventRecordNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('hipEvent_t', 'event')]
+    case HIP_API_ID_hipGraphAddEventRecordNode:
+      if (data->args.hipGraphAddEventRecordNode.pGraphNode) data->args.hipGraphAddEventRecordNode.pGraphNode__val = *(data->args.hipGraphAddEventRecordNode.pGraphNode);
+      if (data->args.hipGraphAddEventRecordNode.pDependencies) data->args.hipGraphAddEventRecordNode.pDependencies__val = *(data->args.hipGraphAddEventRecordNode.pDependencies);
+      break;
+// hipGraphAddEventWaitNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('hipEvent_t', 'event')]
+    case HIP_API_ID_hipGraphAddEventWaitNode:
+      if (data->args.hipGraphAddEventWaitNode.pGraphNode) data->args.hipGraphAddEventWaitNode.pGraphNode__val = *(data->args.hipGraphAddEventWaitNode.pGraphNode);
+      if (data->args.hipGraphAddEventWaitNode.pDependencies) data->args.hipGraphAddEventWaitNode.pDependencies__val = *(data->args.hipGraphAddEventWaitNode.pDependencies);
+      break;
+// hipGraphAddExternalSemaphoresSignalNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('const hipExternalSemaphoreSignalNodeParams*', 'nodeParams')]
+    case HIP_API_ID_hipGraphAddExternalSemaphoresSignalNode:
+      if (data->args.hipGraphAddExternalSemaphoresSignalNode.pGraphNode) data->args.hipGraphAddExternalSemaphoresSignalNode.pGraphNode__val = *(data->args.hipGraphAddExternalSemaphoresSignalNode.pGraphNode);
+      if (data->args.hipGraphAddExternalSemaphoresSignalNode.pDependencies) data->args.hipGraphAddExternalSemaphoresSignalNode.pDependencies__val = *(data->args.hipGraphAddExternalSemaphoresSignalNode.pDependencies);
+      if (data->args.hipGraphAddExternalSemaphoresSignalNode.nodeParams) data->args.hipGraphAddExternalSemaphoresSignalNode.nodeParams__val = *(data->args.hipGraphAddExternalSemaphoresSignalNode.nodeParams);
+      break;
+// hipGraphAddExternalSemaphoresWaitNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('const hipExternalSemaphoreWaitNodeParams*', 'nodeParams')]
+    case HIP_API_ID_hipGraphAddExternalSemaphoresWaitNode:
+      if (data->args.hipGraphAddExternalSemaphoresWaitNode.pGraphNode) data->args.hipGraphAddExternalSemaphoresWaitNode.pGraphNode__val = *(data->args.hipGraphAddExternalSemaphoresWaitNode.pGraphNode);
+      if (data->args.hipGraphAddExternalSemaphoresWaitNode.pDependencies) data->args.hipGraphAddExternalSemaphoresWaitNode.pDependencies__val = *(data->args.hipGraphAddExternalSemaphoresWaitNode.pDependencies);
+      if (data->args.hipGraphAddExternalSemaphoresWaitNode.nodeParams) data->args.hipGraphAddExternalSemaphoresWaitNode.nodeParams__val = *(data->args.hipGraphAddExternalSemaphoresWaitNode.nodeParams);
+      break;
+// hipGraphAddHostNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('const hipHostNodeParams*', 'pNodeParams')]
+    case HIP_API_ID_hipGraphAddHostNode:
+      if (data->args.hipGraphAddHostNode.pGraphNode) data->args.hipGraphAddHostNode.pGraphNode__val = *(data->args.hipGraphAddHostNode.pGraphNode);
+      if (data->args.hipGraphAddHostNode.pDependencies) data->args.hipGraphAddHostNode.pDependencies__val = *(data->args.hipGraphAddHostNode.pDependencies);
+      if (data->args.hipGraphAddHostNode.pNodeParams) data->args.hipGraphAddHostNode.pNodeParams__val = *(data->args.hipGraphAddHostNode.pNodeParams);
+      break;
+// hipGraphAddKernelNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('const hipKernelNodeParams*', 'pNodeParams')]
+    case HIP_API_ID_hipGraphAddKernelNode:
+      if (data->args.hipGraphAddKernelNode.pGraphNode) data->args.hipGraphAddKernelNode.pGraphNode__val = *(data->args.hipGraphAddKernelNode.pGraphNode);
+      if (data->args.hipGraphAddKernelNode.pDependencies) data->args.hipGraphAddKernelNode.pDependencies__val = *(data->args.hipGraphAddKernelNode.pDependencies);
+      if (data->args.hipGraphAddKernelNode.pNodeParams) data->args.hipGraphAddKernelNode.pNodeParams__val = *(data->args.hipGraphAddKernelNode.pNodeParams);
+      break;
+// hipGraphAddMemAllocNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('hipMemAllocNodeParams*', 'pNodeParams')]
+    case HIP_API_ID_hipGraphAddMemAllocNode:
+      if (data->args.hipGraphAddMemAllocNode.pGraphNode) data->args.hipGraphAddMemAllocNode.pGraphNode__val = *(data->args.hipGraphAddMemAllocNode.pGraphNode);
+      if (data->args.hipGraphAddMemAllocNode.pDependencies) data->args.hipGraphAddMemAllocNode.pDependencies__val = *(data->args.hipGraphAddMemAllocNode.pDependencies);
+      if (data->args.hipGraphAddMemAllocNode.pNodeParams) data->args.hipGraphAddMemAllocNode.pNodeParams__val = *(data->args.hipGraphAddMemAllocNode.pNodeParams);
+      break;
+// hipGraphAddMemFreeNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('void*', 'dev_ptr')]
+    case HIP_API_ID_hipGraphAddMemFreeNode:
+      if (data->args.hipGraphAddMemFreeNode.pGraphNode) data->args.hipGraphAddMemFreeNode.pGraphNode__val = *(data->args.hipGraphAddMemFreeNode.pGraphNode);
+      if (data->args.hipGraphAddMemFreeNode.pDependencies) data->args.hipGraphAddMemFreeNode.pDependencies__val = *(data->args.hipGraphAddMemFreeNode.pDependencies);
+      break;
+// hipGraphAddMemcpyNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('const hipMemcpy3DParms*', 'pCopyParams')]
+    case HIP_API_ID_hipGraphAddMemcpyNode:
+      if (data->args.hipGraphAddMemcpyNode.pGraphNode) data->args.hipGraphAddMemcpyNode.pGraphNode__val = *(data->args.hipGraphAddMemcpyNode.pGraphNode);
+      if (data->args.hipGraphAddMemcpyNode.pDependencies) data->args.hipGraphAddMemcpyNode.pDependencies__val = *(data->args.hipGraphAddMemcpyNode.pDependencies);
+      if (data->args.hipGraphAddMemcpyNode.pCopyParams) data->args.hipGraphAddMemcpyNode.pCopyParams__val = *(data->args.hipGraphAddMemcpyNode.pCopyParams);
+      break;
+// hipGraphAddMemcpyNode1D[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('void*', 'dst'), ('const void*', 'src'), ('size_t', 'count'), ('hipMemcpyKind', 'kind')]
+    case HIP_API_ID_hipGraphAddMemcpyNode1D:
+      if (data->args.hipGraphAddMemcpyNode1D.pGraphNode) data->args.hipGraphAddMemcpyNode1D.pGraphNode__val = *(data->args.hipGraphAddMemcpyNode1D.pGraphNode);
+      if (data->args.hipGraphAddMemcpyNode1D.pDependencies) data->args.hipGraphAddMemcpyNode1D.pDependencies__val = *(data->args.hipGraphAddMemcpyNode1D.pDependencies);
+      break;
+// hipGraphAddMemcpyNodeFromSymbol[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('void*', 'dst'), ('const void*', 'symbol'), ('size_t', 'count'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind')]
+    case HIP_API_ID_hipGraphAddMemcpyNodeFromSymbol:
+      if (data->args.hipGraphAddMemcpyNodeFromSymbol.pGraphNode) data->args.hipGraphAddMemcpyNodeFromSymbol.pGraphNode__val = *(data->args.hipGraphAddMemcpyNodeFromSymbol.pGraphNode);
+      if (data->args.hipGraphAddMemcpyNodeFromSymbol.pDependencies) data->args.hipGraphAddMemcpyNodeFromSymbol.pDependencies__val = *(data->args.hipGraphAddMemcpyNodeFromSymbol.pDependencies);
+      break;
+// hipGraphAddMemcpyNodeToSymbol[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('const void*', 'symbol'), ('const void*', 'src'), ('size_t', 'count'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind')]
+    case HIP_API_ID_hipGraphAddMemcpyNodeToSymbol:
+      if (data->args.hipGraphAddMemcpyNodeToSymbol.pGraphNode) data->args.hipGraphAddMemcpyNodeToSymbol.pGraphNode__val = *(data->args.hipGraphAddMemcpyNodeToSymbol.pGraphNode);
+      if (data->args.hipGraphAddMemcpyNodeToSymbol.pDependencies) data->args.hipGraphAddMemcpyNodeToSymbol.pDependencies__val = *(data->args.hipGraphAddMemcpyNodeToSymbol.pDependencies);
+      break;
+// hipGraphAddMemsetNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('const hipMemsetParams*', 'pMemsetParams')]
+    case HIP_API_ID_hipGraphAddMemsetNode:
+      if (data->args.hipGraphAddMemsetNode.pGraphNode) data->args.hipGraphAddMemsetNode.pGraphNode__val = *(data->args.hipGraphAddMemsetNode.pGraphNode);
+      if (data->args.hipGraphAddMemsetNode.pDependencies) data->args.hipGraphAddMemsetNode.pDependencies__val = *(data->args.hipGraphAddMemsetNode.pDependencies);
+      if (data->args.hipGraphAddMemsetNode.pMemsetParams) data->args.hipGraphAddMemsetNode.pMemsetParams__val = *(data->args.hipGraphAddMemsetNode.pMemsetParams);
+      break;
+// hipGraphAddNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('hipGraphNodeParams*', 'nodeParams')]
+    case HIP_API_ID_hipGraphAddNode:
+      if (data->args.hipGraphAddNode.pGraphNode) data->args.hipGraphAddNode.pGraphNode__val = *(data->args.hipGraphAddNode.pGraphNode);
+      if (data->args.hipGraphAddNode.pDependencies) data->args.hipGraphAddNode.pDependencies__val = *(data->args.hipGraphAddNode.pDependencies);
+      if (data->args.hipGraphAddNode.nodeParams) data->args.hipGraphAddNode.nodeParams__val = *(data->args.hipGraphAddNode.nodeParams);
+      break;
+// hipGraphBatchMemOpNodeGetParams[('hipGraphNode_t', 'hNode'), ('hipBatchMemOpNodeParams*', 'nodeParams_out')]
+    case HIP_API_ID_hipGraphBatchMemOpNodeGetParams:
+      if (data->args.hipGraphBatchMemOpNodeGetParams.nodeParams_out) data->args.hipGraphBatchMemOpNodeGetParams.nodeParams_out__val = *(data->args.hipGraphBatchMemOpNodeGetParams.nodeParams_out);
+      break;
+// hipGraphBatchMemOpNodeSetParams[('hipGraphNode_t', 'hNode'), ('hipBatchMemOpNodeParams*', 'nodeParams')]
+    case HIP_API_ID_hipGraphBatchMemOpNodeSetParams:
+      if (data->args.hipGraphBatchMemOpNodeSetParams.nodeParams) data->args.hipGraphBatchMemOpNodeSetParams.nodeParams__val = *(data->args.hipGraphBatchMemOpNodeSetParams.nodeParams);
+      break;
+// hipGraphChildGraphNodeGetGraph[('hipGraphNode_t', 'node'), ('hipGraph_t*', 'pGraph')]
+    case HIP_API_ID_hipGraphChildGraphNodeGetGraph:
+      if (data->args.hipGraphChildGraphNodeGetGraph.pGraph) data->args.hipGraphChildGraphNodeGetGraph.pGraph__val = *(data->args.hipGraphChildGraphNodeGetGraph.pGraph);
+      break;
+// hipGraphClone[('hipGraph_t*', 'pGraphClone'), ('hipGraph_t', 'originalGraph')]
+    case HIP_API_ID_hipGraphClone:
+      if (data->args.hipGraphClone.pGraphClone) data->args.hipGraphClone.pGraphClone__val = *(data->args.hipGraphClone.pGraphClone);
+      break;
+// hipGraphCreate[('hipGraph_t*', 'pGraph'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipGraphCreate:
+      if (data->args.hipGraphCreate.pGraph) data->args.hipGraphCreate.pGraph__val = *(data->args.hipGraphCreate.pGraph);
+      break;
+// hipGraphDebugDotPrint[('hipGraph_t', 'graph'), ('const char*', 'path'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipGraphDebugDotPrint:
+      if (data->args.hipGraphDebugDotPrint.path) data->args.hipGraphDebugDotPrint.path__val = *(data->args.hipGraphDebugDotPrint.path);
+      break;
+// hipGraphDestroy[('hipGraph_t', 'graph')]
+    case HIP_API_ID_hipGraphDestroy:
+      break;
+// hipGraphDestroyNode[('hipGraphNode_t', 'node')]
+    case HIP_API_ID_hipGraphDestroyNode:
+      break;
+// hipGraphEventRecordNodeGetEvent[('hipGraphNode_t', 'node'), ('hipEvent_t*', 'event_out')]
+    case HIP_API_ID_hipGraphEventRecordNodeGetEvent:
+      if (data->args.hipGraphEventRecordNodeGetEvent.event_out) data->args.hipGraphEventRecordNodeGetEvent.event_out__val = *(data->args.hipGraphEventRecordNodeGetEvent.event_out);
+      break;
+// hipGraphEventRecordNodeSetEvent[('hipGraphNode_t', 'node'), ('hipEvent_t', 'event')]
+    case HIP_API_ID_hipGraphEventRecordNodeSetEvent:
+      break;
+// hipGraphEventWaitNodeGetEvent[('hipGraphNode_t', 'node'), ('hipEvent_t*', 'event_out')]
+    case HIP_API_ID_hipGraphEventWaitNodeGetEvent:
+      if (data->args.hipGraphEventWaitNodeGetEvent.event_out) data->args.hipGraphEventWaitNodeGetEvent.event_out__val = *(data->args.hipGraphEventWaitNodeGetEvent.event_out);
+      break;
+// hipGraphEventWaitNodeSetEvent[('hipGraphNode_t', 'node'), ('hipEvent_t', 'event')]
+    case HIP_API_ID_hipGraphEventWaitNodeSetEvent:
+      break;
+// hipGraphExecBatchMemOpNodeSetParams[('hipGraphExec_t', 'hGraphExec'), ('hipGraphNode_t', 'hNode'), ('const hipBatchMemOpNodeParams*', 'nodeParams')]
+    case HIP_API_ID_hipGraphExecBatchMemOpNodeSetParams:
+      if (data->args.hipGraphExecBatchMemOpNodeSetParams.nodeParams) data->args.hipGraphExecBatchMemOpNodeSetParams.nodeParams__val = *(data->args.hipGraphExecBatchMemOpNodeSetParams.nodeParams);
+      break;
+// hipGraphExecChildGraphNodeSetParams[('hipGraphExec_t', 'hGraphExec'), ('hipGraphNode_t', 'node'), ('hipGraph_t', 'childGraph')]
+    case HIP_API_ID_hipGraphExecChildGraphNodeSetParams:
+      break;
+// hipGraphExecDestroy[('hipGraphExec_t', 'graphExec')]
+    case HIP_API_ID_hipGraphExecDestroy:
+      break;
+// hipGraphExecEventRecordNodeSetEvent[('hipGraphExec_t', 'hGraphExec'), ('hipGraphNode_t', 'hNode'), ('hipEvent_t', 'event')]
+    case HIP_API_ID_hipGraphExecEventRecordNodeSetEvent:
+      break;
+// hipGraphExecEventWaitNodeSetEvent[('hipGraphExec_t', 'hGraphExec'), ('hipGraphNode_t', 'hNode'), ('hipEvent_t', 'event')]
+    case HIP_API_ID_hipGraphExecEventWaitNodeSetEvent:
+      break;
+// hipGraphExecExternalSemaphoresSignalNodeSetParams[('hipGraphExec_t', 'hGraphExec'), ('hipGraphNode_t', 'hNode'), ('const hipExternalSemaphoreSignalNodeParams*', 'nodeParams')]
+    case HIP_API_ID_hipGraphExecExternalSemaphoresSignalNodeSetParams:
+      if (data->args.hipGraphExecExternalSemaphoresSignalNodeSetParams.nodeParams) data->args.hipGraphExecExternalSemaphoresSignalNodeSetParams.nodeParams__val = *(data->args.hipGraphExecExternalSemaphoresSignalNodeSetParams.nodeParams);
+      break;
+// hipGraphExecExternalSemaphoresWaitNodeSetParams[('hipGraphExec_t', 'hGraphExec'), ('hipGraphNode_t', 'hNode'), ('const hipExternalSemaphoreWaitNodeParams*', 'nodeParams')]
+    case HIP_API_ID_hipGraphExecExternalSemaphoresWaitNodeSetParams:
+      if (data->args.hipGraphExecExternalSemaphoresWaitNodeSetParams.nodeParams) data->args.hipGraphExecExternalSemaphoresWaitNodeSetParams.nodeParams__val = *(data->args.hipGraphExecExternalSemaphoresWaitNodeSetParams.nodeParams);
+      break;
+// hipGraphExecGetFlags[('hipGraphExec_t', 'graphExec'), ('unsigned long long*', 'flags')]
+    case HIP_API_ID_hipGraphExecGetFlags:
+      if (data->args.hipGraphExecGetFlags.flags) data->args.hipGraphExecGetFlags.flags__val = *(data->args.hipGraphExecGetFlags.flags);
+      break;
+// hipGraphExecHostNodeSetParams[('hipGraphExec_t', 'hGraphExec'), ('hipGraphNode_t', 'node'), ('const hipHostNodeParams*', 'pNodeParams')]
+    case HIP_API_ID_hipGraphExecHostNodeSetParams:
+      if (data->args.hipGraphExecHostNodeSetParams.pNodeParams) data->args.hipGraphExecHostNodeSetParams.pNodeParams__val = *(data->args.hipGraphExecHostNodeSetParams.pNodeParams);
+      break;
+// hipGraphExecKernelNodeSetParams[('hipGraphExec_t', 'hGraphExec'), ('hipGraphNode_t', 'node'), ('const hipKernelNodeParams*', 'pNodeParams')]
+    case HIP_API_ID_hipGraphExecKernelNodeSetParams:
+      if (data->args.hipGraphExecKernelNodeSetParams.pNodeParams) data->args.hipGraphExecKernelNodeSetParams.pNodeParams__val = *(data->args.hipGraphExecKernelNodeSetParams.pNodeParams);
+      break;
+// hipGraphExecMemcpyNodeSetParams[('hipGraphExec_t', 'hGraphExec'), ('hipGraphNode_t', 'node'), ('hipMemcpy3DParms*', 'pNodeParams')]
+    case HIP_API_ID_hipGraphExecMemcpyNodeSetParams:
+      if (data->args.hipGraphExecMemcpyNodeSetParams.pNodeParams) data->args.hipGraphExecMemcpyNodeSetParams.pNodeParams__val = *(data->args.hipGraphExecMemcpyNodeSetParams.pNodeParams);
+      break;
+// hipGraphExecMemcpyNodeSetParams1D[('hipGraphExec_t', 'hGraphExec'), ('hipGraphNode_t', 'node'), ('void*', 'dst'), ('const void*', 'src'), ('size_t', 'count'), ('hipMemcpyKind', 'kind')]
+    case HIP_API_ID_hipGraphExecMemcpyNodeSetParams1D:
+      break;
+// hipGraphExecMemcpyNodeSetParamsFromSymbol[('hipGraphExec_t', 'hGraphExec'), ('hipGraphNode_t', 'node'), ('void*', 'dst'), ('const void*', 'symbol'), ('size_t', 'count'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind')]
+    case HIP_API_ID_hipGraphExecMemcpyNodeSetParamsFromSymbol:
+      break;
+// hipGraphExecMemcpyNodeSetParamsToSymbol[('hipGraphExec_t', 'hGraphExec'), ('hipGraphNode_t', 'node'), ('const void*', 'symbol'), ('const void*', 'src'), ('size_t', 'count'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind')]
+    case HIP_API_ID_hipGraphExecMemcpyNodeSetParamsToSymbol:
+      break;
+// hipGraphExecMemsetNodeSetParams[('hipGraphExec_t', 'hGraphExec'), ('hipGraphNode_t', 'node'), ('const hipMemsetParams*', 'pNodeParams')]
+    case HIP_API_ID_hipGraphExecMemsetNodeSetParams:
+      if (data->args.hipGraphExecMemsetNodeSetParams.pNodeParams) data->args.hipGraphExecMemsetNodeSetParams.pNodeParams__val = *(data->args.hipGraphExecMemsetNodeSetParams.pNodeParams);
+      break;
+// hipGraphExecNodeSetParams[('hipGraphExec_t', 'graphExec'), ('hipGraphNode_t', 'node'), ('hipGraphNodeParams*', 'nodeParams')]
+    case HIP_API_ID_hipGraphExecNodeSetParams:
+      if (data->args.hipGraphExecNodeSetParams.nodeParams) data->args.hipGraphExecNodeSetParams.nodeParams__val = *(data->args.hipGraphExecNodeSetParams.nodeParams);
+      break;
+// hipGraphExecUpdate[('hipGraphExec_t', 'hGraphExec'), ('hipGraph_t', 'hGraph'), ('hipGraphNode_t*', 'hErrorNode_out'), ('hipGraphExecUpdateResult*', 'updateResult_out')]
+    case HIP_API_ID_hipGraphExecUpdate:
+      if (data->args.hipGraphExecUpdate.hErrorNode_out) data->args.hipGraphExecUpdate.hErrorNode_out__val = *(data->args.hipGraphExecUpdate.hErrorNode_out);
+      if (data->args.hipGraphExecUpdate.updateResult_out) data->args.hipGraphExecUpdate.updateResult_out__val = *(data->args.hipGraphExecUpdate.updateResult_out);
+      break;
+// hipGraphExternalSemaphoresSignalNodeGetParams[('hipGraphNode_t', 'hNode'), ('hipExternalSemaphoreSignalNodeParams*', 'params_out')]
+    case HIP_API_ID_hipGraphExternalSemaphoresSignalNodeGetParams:
+      if (data->args.hipGraphExternalSemaphoresSignalNodeGetParams.params_out) data->args.hipGraphExternalSemaphoresSignalNodeGetParams.params_out__val = *(data->args.hipGraphExternalSemaphoresSignalNodeGetParams.params_out);
+      break;
+// hipGraphExternalSemaphoresSignalNodeSetParams[('hipGraphNode_t', 'hNode'), ('const hipExternalSemaphoreSignalNodeParams*', 'nodeParams')]
+    case HIP_API_ID_hipGraphExternalSemaphoresSignalNodeSetParams:
+      if (data->args.hipGraphExternalSemaphoresSignalNodeSetParams.nodeParams) data->args.hipGraphExternalSemaphoresSignalNodeSetParams.nodeParams__val = *(data->args.hipGraphExternalSemaphoresSignalNodeSetParams.nodeParams);
+      break;
+// hipGraphExternalSemaphoresWaitNodeGetParams[('hipGraphNode_t', 'hNode'), ('hipExternalSemaphoreWaitNodeParams*', 'params_out')]
+    case HIP_API_ID_hipGraphExternalSemaphoresWaitNodeGetParams:
+      if (data->args.hipGraphExternalSemaphoresWaitNodeGetParams.params_out) data->args.hipGraphExternalSemaphoresWaitNodeGetParams.params_out__val = *(data->args.hipGraphExternalSemaphoresWaitNodeGetParams.params_out);
+      break;
+// hipGraphExternalSemaphoresWaitNodeSetParams[('hipGraphNode_t', 'hNode'), ('const hipExternalSemaphoreWaitNodeParams*', 'nodeParams')]
+    case HIP_API_ID_hipGraphExternalSemaphoresWaitNodeSetParams:
+      if (data->args.hipGraphExternalSemaphoresWaitNodeSetParams.nodeParams) data->args.hipGraphExternalSemaphoresWaitNodeSetParams.nodeParams__val = *(data->args.hipGraphExternalSemaphoresWaitNodeSetParams.nodeParams);
+      break;
+// hipGraphGetEdges[('hipGraph_t', 'graph'), ('hipGraphNode_t*', 'from'), ('hipGraphNode_t*', 'to'), ('size_t*', 'numEdges')]
+    case HIP_API_ID_hipGraphGetEdges:
+      if (data->args.hipGraphGetEdges.from) data->args.hipGraphGetEdges.from__val = *(data->args.hipGraphGetEdges.from);
+      if (data->args.hipGraphGetEdges.to) data->args.hipGraphGetEdges.to__val = *(data->args.hipGraphGetEdges.to);
+      if (data->args.hipGraphGetEdges.numEdges) data->args.hipGraphGetEdges.numEdges__val = *(data->args.hipGraphGetEdges.numEdges);
+      break;
+// hipGraphGetNodes[('hipGraph_t', 'graph'), ('hipGraphNode_t*', 'nodes'), ('size_t*', 'numNodes')]
+    case HIP_API_ID_hipGraphGetNodes:
+      if (data->args.hipGraphGetNodes.nodes) data->args.hipGraphGetNodes.nodes__val = *(data->args.hipGraphGetNodes.nodes);
+      if (data->args.hipGraphGetNodes.numNodes) data->args.hipGraphGetNodes.numNodes__val = *(data->args.hipGraphGetNodes.numNodes);
+      break;
+// hipGraphGetRootNodes[('hipGraph_t', 'graph'), ('hipGraphNode_t*', 'pRootNodes'), ('size_t*', 'pNumRootNodes')]
+    case HIP_API_ID_hipGraphGetRootNodes:
+      if (data->args.hipGraphGetRootNodes.pRootNodes) data->args.hipGraphGetRootNodes.pRootNodes__val = *(data->args.hipGraphGetRootNodes.pRootNodes);
+      if (data->args.hipGraphGetRootNodes.pNumRootNodes) data->args.hipGraphGetRootNodes.pNumRootNodes__val = *(data->args.hipGraphGetRootNodes.pNumRootNodes);
+      break;
+// hipGraphHostNodeGetParams[('hipGraphNode_t', 'node'), ('hipHostNodeParams*', 'pNodeParams')]
+    case HIP_API_ID_hipGraphHostNodeGetParams:
+      if (data->args.hipGraphHostNodeGetParams.pNodeParams) data->args.hipGraphHostNodeGetParams.pNodeParams__val = *(data->args.hipGraphHostNodeGetParams.pNodeParams);
+      break;
+// hipGraphHostNodeSetParams[('hipGraphNode_t', 'node'), ('const hipHostNodeParams*', 'pNodeParams')]
+    case HIP_API_ID_hipGraphHostNodeSetParams:
+      if (data->args.hipGraphHostNodeSetParams.pNodeParams) data->args.hipGraphHostNodeSetParams.pNodeParams__val = *(data->args.hipGraphHostNodeSetParams.pNodeParams);
+      break;
+// hipGraphInstantiate[('hipGraphExec_t*', 'pGraphExec'), ('hipGraph_t', 'graph'), ('hipGraphNode_t*', 'pErrorNode'), ('char*', 'pLogBuffer'), ('size_t', 'bufferSize')]
+    case HIP_API_ID_hipGraphInstantiate:
+      if (data->args.hipGraphInstantiate.pGraphExec) data->args.hipGraphInstantiate.pGraphExec__val = *(data->args.hipGraphInstantiate.pGraphExec);
+      if (data->args.hipGraphInstantiate.pErrorNode) data->args.hipGraphInstantiate.pErrorNode__val = *(data->args.hipGraphInstantiate.pErrorNode);
+      data->args.hipGraphInstantiate.pLogBuffer = (data->args.hipGraphInstantiate.pLogBuffer) ? strdup(data->args.hipGraphInstantiate.pLogBuffer) : NULL;
+      break;
+// hipGraphInstantiateWithFlags[('hipGraphExec_t*', 'pGraphExec'), ('hipGraph_t', 'graph'), ('unsigned long long', 'flags')]
+    case HIP_API_ID_hipGraphInstantiateWithFlags:
+      if (data->args.hipGraphInstantiateWithFlags.pGraphExec) data->args.hipGraphInstantiateWithFlags.pGraphExec__val = *(data->args.hipGraphInstantiateWithFlags.pGraphExec);
+      break;
+// hipGraphInstantiateWithParams[('hipGraphExec_t*', 'pGraphExec'), ('hipGraph_t', 'graph'), ('hipGraphInstantiateParams*', 'instantiateParams')]
+    case HIP_API_ID_hipGraphInstantiateWithParams:
+      if (data->args.hipGraphInstantiateWithParams.pGraphExec) data->args.hipGraphInstantiateWithParams.pGraphExec__val = *(data->args.hipGraphInstantiateWithParams.pGraphExec);
+      if (data->args.hipGraphInstantiateWithParams.instantiateParams) data->args.hipGraphInstantiateWithParams.instantiateParams__val = *(data->args.hipGraphInstantiateWithParams.instantiateParams);
+      break;
+// hipGraphKernelNodeCopyAttributes[('hipGraphNode_t', 'hSrc'), ('hipGraphNode_t', 'hDst')]
+    case HIP_API_ID_hipGraphKernelNodeCopyAttributes:
+      break;
+// hipGraphKernelNodeGetAttribute[('hipGraphNode_t', 'hNode'), ('hipLaunchAttributeID', 'attr'), ('hipLaunchAttributeValue*', 'value')]
+    case HIP_API_ID_hipGraphKernelNodeGetAttribute:
+      if (data->args.hipGraphKernelNodeGetAttribute.value) data->args.hipGraphKernelNodeGetAttribute.value__val = *(data->args.hipGraphKernelNodeGetAttribute.value);
+      break;
+// hipGraphKernelNodeGetParams[('hipGraphNode_t', 'node'), ('hipKernelNodeParams*', 'pNodeParams')]
+    case HIP_API_ID_hipGraphKernelNodeGetParams:
+      if (data->args.hipGraphKernelNodeGetParams.pNodeParams) data->args.hipGraphKernelNodeGetParams.pNodeParams__val = *(data->args.hipGraphKernelNodeGetParams.pNodeParams);
+      break;
+// hipGraphKernelNodeSetAttribute[('hipGraphNode_t', 'hNode'), ('hipLaunchAttributeID', 'attr'), ('const hipLaunchAttributeValue*', 'value')]
+    case HIP_API_ID_hipGraphKernelNodeSetAttribute:
+      if (data->args.hipGraphKernelNodeSetAttribute.value) data->args.hipGraphKernelNodeSetAttribute.value__val = *(data->args.hipGraphKernelNodeSetAttribute.value);
+      break;
+// hipGraphKernelNodeSetParams[('hipGraphNode_t', 'node'), ('const hipKernelNodeParams*', 'pNodeParams')]
+    case HIP_API_ID_hipGraphKernelNodeSetParams:
+      if (data->args.hipGraphKernelNodeSetParams.pNodeParams) data->args.hipGraphKernelNodeSetParams.pNodeParams__val = *(data->args.hipGraphKernelNodeSetParams.pNodeParams);
+      break;
+// hipGraphLaunch[('hipGraphExec_t', 'graphExec'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipGraphLaunch:
+      break;
+// hipGraphMemAllocNodeGetParams[('hipGraphNode_t', 'node'), ('hipMemAllocNodeParams*', 'pNodeParams')]
+    case HIP_API_ID_hipGraphMemAllocNodeGetParams:
+      if (data->args.hipGraphMemAllocNodeGetParams.pNodeParams) data->args.hipGraphMemAllocNodeGetParams.pNodeParams__val = *(data->args.hipGraphMemAllocNodeGetParams.pNodeParams);
+      break;
+// hipGraphMemFreeNodeGetParams[('hipGraphNode_t', 'node'), ('void*', 'dev_ptr')]
+    case HIP_API_ID_hipGraphMemFreeNodeGetParams:
+      break;
+// hipGraphMemcpyNodeGetParams[('hipGraphNode_t', 'node'), ('hipMemcpy3DParms*', 'pNodeParams')]
+    case HIP_API_ID_hipGraphMemcpyNodeGetParams:
+      if (data->args.hipGraphMemcpyNodeGetParams.pNodeParams) data->args.hipGraphMemcpyNodeGetParams.pNodeParams__val = *(data->args.hipGraphMemcpyNodeGetParams.pNodeParams);
+      break;
+// hipGraphMemcpyNodeSetParams[('hipGraphNode_t', 'node'), ('const hipMemcpy3DParms*', 'pNodeParams')]
+    case HIP_API_ID_hipGraphMemcpyNodeSetParams:
+      if (data->args.hipGraphMemcpyNodeSetParams.pNodeParams) data->args.hipGraphMemcpyNodeSetParams.pNodeParams__val = *(data->args.hipGraphMemcpyNodeSetParams.pNodeParams);
+      break;
+// hipGraphMemcpyNodeSetParams1D[('hipGraphNode_t', 'node'), ('void*', 'dst'), ('const void*', 'src'), ('size_t', 'count'), ('hipMemcpyKind', 'kind')]
+    case HIP_API_ID_hipGraphMemcpyNodeSetParams1D:
+      break;
+// hipGraphMemcpyNodeSetParamsFromSymbol[('hipGraphNode_t', 'node'), ('void*', 'dst'), ('const void*', 'symbol'), ('size_t', 'count'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind')]
+    case HIP_API_ID_hipGraphMemcpyNodeSetParamsFromSymbol:
+      break;
+// hipGraphMemcpyNodeSetParamsToSymbol[('hipGraphNode_t', 'node'), ('const void*', 'symbol'), ('const void*', 'src'), ('size_t', 'count'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind')]
+    case HIP_API_ID_hipGraphMemcpyNodeSetParamsToSymbol:
+      break;
+// hipGraphMemsetNodeGetParams[('hipGraphNode_t', 'node'), ('hipMemsetParams*', 'pNodeParams')]
+    case HIP_API_ID_hipGraphMemsetNodeGetParams:
+      if (data->args.hipGraphMemsetNodeGetParams.pNodeParams) data->args.hipGraphMemsetNodeGetParams.pNodeParams__val = *(data->args.hipGraphMemsetNodeGetParams.pNodeParams);
+      break;
+// hipGraphMemsetNodeSetParams[('hipGraphNode_t', 'node'), ('const hipMemsetParams*', 'pNodeParams')]
+    case HIP_API_ID_hipGraphMemsetNodeSetParams:
+      if (data->args.hipGraphMemsetNodeSetParams.pNodeParams) data->args.hipGraphMemsetNodeSetParams.pNodeParams__val = *(data->args.hipGraphMemsetNodeSetParams.pNodeParams);
+      break;
+// hipGraphNodeFindInClone[('hipGraphNode_t*', 'pNode'), ('hipGraphNode_t', 'originalNode'), ('hipGraph_t', 'clonedGraph')]
+    case HIP_API_ID_hipGraphNodeFindInClone:
+      if (data->args.hipGraphNodeFindInClone.pNode) data->args.hipGraphNodeFindInClone.pNode__val = *(data->args.hipGraphNodeFindInClone.pNode);
+      break;
+// hipGraphNodeGetDependencies[('hipGraphNode_t', 'node'), ('hipGraphNode_t*', 'pDependencies'), ('size_t*', 'pNumDependencies')]
+    case HIP_API_ID_hipGraphNodeGetDependencies:
+      if (data->args.hipGraphNodeGetDependencies.pDependencies) data->args.hipGraphNodeGetDependencies.pDependencies__val = *(data->args.hipGraphNodeGetDependencies.pDependencies);
+      if (data->args.hipGraphNodeGetDependencies.pNumDependencies) data->args.hipGraphNodeGetDependencies.pNumDependencies__val = *(data->args.hipGraphNodeGetDependencies.pNumDependencies);
+      break;
+// hipGraphNodeGetDependentNodes[('hipGraphNode_t', 'node'), ('hipGraphNode_t*', 'pDependentNodes'), ('size_t*', 'pNumDependentNodes')]
+    case HIP_API_ID_hipGraphNodeGetDependentNodes:
+      if (data->args.hipGraphNodeGetDependentNodes.pDependentNodes) data->args.hipGraphNodeGetDependentNodes.pDependentNodes__val = *(data->args.hipGraphNodeGetDependentNodes.pDependentNodes);
+      if (data->args.hipGraphNodeGetDependentNodes.pNumDependentNodes) data->args.hipGraphNodeGetDependentNodes.pNumDependentNodes__val = *(data->args.hipGraphNodeGetDependentNodes.pNumDependentNodes);
+      break;
+// hipGraphNodeGetEnabled[('hipGraphExec_t', 'hGraphExec'), ('hipGraphNode_t', 'hNode'), ('unsigned int*', 'isEnabled')]
+    case HIP_API_ID_hipGraphNodeGetEnabled:
+      if (data->args.hipGraphNodeGetEnabled.isEnabled) data->args.hipGraphNodeGetEnabled.isEnabled__val = *(data->args.hipGraphNodeGetEnabled.isEnabled);
+      break;
+// hipGraphNodeGetType[('hipGraphNode_t', 'node'), ('hipGraphNodeType*', 'pType')]
+    case HIP_API_ID_hipGraphNodeGetType:
+      if (data->args.hipGraphNodeGetType.pType) data->args.hipGraphNodeGetType.pType__val = *(data->args.hipGraphNodeGetType.pType);
+      break;
+// hipGraphNodeSetEnabled[('hipGraphExec_t', 'hGraphExec'), ('hipGraphNode_t', 'hNode'), ('unsigned int', 'isEnabled')]
+    case HIP_API_ID_hipGraphNodeSetEnabled:
+      break;
+// hipGraphNodeSetParams[('hipGraphNode_t', 'node'), ('hipGraphNodeParams*', 'nodeParams')]
+    case HIP_API_ID_hipGraphNodeSetParams:
+      if (data->args.hipGraphNodeSetParams.nodeParams) data->args.hipGraphNodeSetParams.nodeParams__val = *(data->args.hipGraphNodeSetParams.nodeParams);
+      break;
+// hipGraphReleaseUserObject[('hipGraph_t', 'graph'), ('hipUserObject_t', 'object'), ('unsigned int', 'count')]
+    case HIP_API_ID_hipGraphReleaseUserObject:
+      break;
+// hipGraphRemoveDependencies[('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'from'), ('const hipGraphNode_t*', 'to'), ('size_t', 'numDependencies')]
+    case HIP_API_ID_hipGraphRemoveDependencies:
+      if (data->args.hipGraphRemoveDependencies.from) data->args.hipGraphRemoveDependencies.from__val = *(data->args.hipGraphRemoveDependencies.from);
+      if (data->args.hipGraphRemoveDependencies.to) data->args.hipGraphRemoveDependencies.to__val = *(data->args.hipGraphRemoveDependencies.to);
+      break;
+// hipGraphRetainUserObject[('hipGraph_t', 'graph'), ('hipUserObject_t', 'object'), ('unsigned int', 'count'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipGraphRetainUserObject:
+      break;
+// hipGraphUpload[('hipGraphExec_t', 'graphExec'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipGraphUpload:
+      break;
+// hipGraphicsGLRegisterBuffer[('hipGraphicsResource**', 'resource'), ('GLuint', 'buffer'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipGraphicsGLRegisterBuffer:
+      if (data->args.hipGraphicsGLRegisterBuffer.resource) data->args.hipGraphicsGLRegisterBuffer.resource__val = *(data->args.hipGraphicsGLRegisterBuffer.resource);
+      break;
+// hipGraphicsGLRegisterImage[('hipGraphicsResource**', 'resource'), ('GLuint', 'image'), ('GLenum', 'target'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipGraphicsGLRegisterImage:
+      if (data->args.hipGraphicsGLRegisterImage.resource) data->args.hipGraphicsGLRegisterImage.resource__val = *(data->args.hipGraphicsGLRegisterImage.resource);
+      break;
+// hipGraphicsMapResources[('int', 'count'), ('hipGraphicsResource_t*', 'resources'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipGraphicsMapResources:
+      if (data->args.hipGraphicsMapResources.resources) data->args.hipGraphicsMapResources.resources__val = *(data->args.hipGraphicsMapResources.resources);
+      break;
+// hipGraphicsResourceGetMappedPointer[('void**', 'devPtr'), ('size_t*', 'size'), ('hipGraphicsResource_t', 'resource')]
+    case HIP_API_ID_hipGraphicsResourceGetMappedPointer:
+      if (data->args.hipGraphicsResourceGetMappedPointer.devPtr) data->args.hipGraphicsResourceGetMappedPointer.devPtr__val = *(data->args.hipGraphicsResourceGetMappedPointer.devPtr);
+      if (data->args.hipGraphicsResourceGetMappedPointer.size) data->args.hipGraphicsResourceGetMappedPointer.size__val = *(data->args.hipGraphicsResourceGetMappedPointer.size);
+      break;
+// hipGraphicsSubResourceGetMappedArray[('hipArray_t*', 'array'), ('hipGraphicsResource_t', 'resource'), ('unsigned int', 'arrayIndex'), ('unsigned int', 'mipLevel')]
+    case HIP_API_ID_hipGraphicsSubResourceGetMappedArray:
+      if (data->args.hipGraphicsSubResourceGetMappedArray.array) data->args.hipGraphicsSubResourceGetMappedArray.array__val = *(data->args.hipGraphicsSubResourceGetMappedArray.array);
+      break;
+// hipGraphicsUnmapResources[('int', 'count'), ('hipGraphicsResource_t*', 'resources'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipGraphicsUnmapResources:
+      if (data->args.hipGraphicsUnmapResources.resources) data->args.hipGraphicsUnmapResources.resources__val = *(data->args.hipGraphicsUnmapResources.resources);
+      break;
+// hipGraphicsUnregisterResource[('hipGraphicsResource_t', 'resource')]
+    case HIP_API_ID_hipGraphicsUnregisterResource:
+      break;
+// hipHccModuleLaunchKernel[('hipFunction_t', 'f'), ('unsigned int', 'globalWorkSizeX'), ('unsigned int', 'globalWorkSizeY'), ('unsigned int', 'globalWorkSizeZ'), ('unsigned int', 'blockDimX'), ('unsigned int', 'blockDimY'), ('unsigned int', 'blockDimZ'), ('size_t', 'sharedMemBytes'), ('hipStream_t', 'hStream'), ('void**', 'kernelParams'), ('void**', 'extra'), ('hipEvent_t', 'startEvent'), ('hipEvent_t', 'stopEvent')]
+    case HIP_API_ID_hipHccModuleLaunchKernel:
+      if (data->args.hipHccModuleLaunchKernel.kernelParams) data->args.hipHccModuleLaunchKernel.kernelParams__val = *(data->args.hipHccModuleLaunchKernel.kernelParams);
+      if (data->args.hipHccModuleLaunchKernel.extra) data->args.hipHccModuleLaunchKernel.extra__val = *(data->args.hipHccModuleLaunchKernel.extra);
+      break;
+// hipHostAlloc[('void**', 'ptr'), ('size_t', 'size'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipHostAlloc:
+      if (data->args.hipHostAlloc.ptr) data->args.hipHostAlloc.ptr__val = *(data->args.hipHostAlloc.ptr);
+      break;
+// hipHostFree[('void*', 'ptr')]
+    case HIP_API_ID_hipHostFree:
+      break;
+// hipHostGetDevicePointer[('void**', 'devPtr'), ('void*', 'hstPtr'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipHostGetDevicePointer:
+      if (data->args.hipHostGetDevicePointer.devPtr) data->args.hipHostGetDevicePointer.devPtr__val = *(data->args.hipHostGetDevicePointer.devPtr);
+      break;
+// hipHostGetFlags[('unsigned int*', 'flagsPtr'), ('void*', 'hostPtr')]
+    case HIP_API_ID_hipHostGetFlags:
+      if (data->args.hipHostGetFlags.flagsPtr) data->args.hipHostGetFlags.flagsPtr__val = *(data->args.hipHostGetFlags.flagsPtr);
+      break;
+// hipHostMalloc[('void**', 'ptr'), ('size_t', 'size'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipHostMalloc:
+      if (data->args.hipHostMalloc.ptr) data->args.hipHostMalloc.ptr__val = *(data->args.hipHostMalloc.ptr);
+      break;
+// hipHostRegister[('void*', 'hostPtr'), ('size_t', 'sizeBytes'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipHostRegister:
+      break;
+// hipHostUnregister[('void*', 'hostPtr')]
+    case HIP_API_ID_hipHostUnregister:
+      break;
+// hipImportExternalMemory[('hipExternalMemory_t*', 'extMem_out'), ('const hipExternalMemoryHandleDesc*', 'memHandleDesc')]
+    case HIP_API_ID_hipImportExternalMemory:
+      if (data->args.hipImportExternalMemory.extMem_out) data->args.hipImportExternalMemory.extMem_out__val = *(data->args.hipImportExternalMemory.extMem_out);
+      if (data->args.hipImportExternalMemory.memHandleDesc) data->args.hipImportExternalMemory.memHandleDesc__val = *(data->args.hipImportExternalMemory.memHandleDesc);
+      break;
+// hipImportExternalSemaphore[('hipExternalSemaphore_t*', 'extSem_out'), ('const hipExternalSemaphoreHandleDesc*', 'semHandleDesc')]
+    case HIP_API_ID_hipImportExternalSemaphore:
+      if (data->args.hipImportExternalSemaphore.extSem_out) data->args.hipImportExternalSemaphore.extSem_out__val = *(data->args.hipImportExternalSemaphore.extSem_out);
+      if (data->args.hipImportExternalSemaphore.semHandleDesc) data->args.hipImportExternalSemaphore.semHandleDesc__val = *(data->args.hipImportExternalSemaphore.semHandleDesc);
+      break;
+// hipInit[('unsigned int', 'flags')]
+    case HIP_API_ID_hipInit:
+      break;
+// hipIpcCloseMemHandle[('void*', 'devPtr')]
+    case HIP_API_ID_hipIpcCloseMemHandle:
+      break;
+// hipIpcGetEventHandle[('hipIpcEventHandle_t*', 'handle'), ('hipEvent_t', 'event')]
+    case HIP_API_ID_hipIpcGetEventHandle:
+      if (data->args.hipIpcGetEventHandle.handle) data->args.hipIpcGetEventHandle.handle__val = *(data->args.hipIpcGetEventHandle.handle);
+      break;
+// hipIpcGetMemHandle[('hipIpcMemHandle_t*', 'handle'), ('void*', 'devPtr')]
+    case HIP_API_ID_hipIpcGetMemHandle:
+      if (data->args.hipIpcGetMemHandle.handle) data->args.hipIpcGetMemHandle.handle__val = *(data->args.hipIpcGetMemHandle.handle);
+      break;
+// hipIpcOpenEventHandle[('hipEvent_t*', 'event'), ('hipIpcEventHandle_t', 'handle')]
+    case HIP_API_ID_hipIpcOpenEventHandle:
+      if (data->args.hipIpcOpenEventHandle.event) data->args.hipIpcOpenEventHandle.event__val = *(data->args.hipIpcOpenEventHandle.event);
+      break;
+// hipIpcOpenMemHandle[('void**', 'devPtr'), ('hipIpcMemHandle_t', 'handle'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipIpcOpenMemHandle:
+      if (data->args.hipIpcOpenMemHandle.devPtr) data->args.hipIpcOpenMemHandle.devPtr__val = *(data->args.hipIpcOpenMemHandle.devPtr);
+      break;
+// hipLaunchByPtr[('const void*', 'hostFunction')]
+    case HIP_API_ID_hipLaunchByPtr:
+      break;
+// hipLaunchCooperativeKernel[('const void*', 'f'), ('dim3', 'gridDim'), ('dim3', 'blockDimX'), ('void**', 'kernelParams'), ('unsigned int', 'sharedMemBytes'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipLaunchCooperativeKernel:
+      if (data->args.hipLaunchCooperativeKernel.kernelParams) data->args.hipLaunchCooperativeKernel.kernelParams__val = *(data->args.hipLaunchCooperativeKernel.kernelParams);
+      break;
+// hipLaunchCooperativeKernelMultiDevice[('hipLaunchParams*', 'launchParamsList'), ('int', 'numDevices'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipLaunchCooperativeKernelMultiDevice:
+      if (data->args.hipLaunchCooperativeKernelMultiDevice.launchParamsList) data->args.hipLaunchCooperativeKernelMultiDevice.launchParamsList__val = *(data->args.hipLaunchCooperativeKernelMultiDevice.launchParamsList);
+      break;
+// hipLaunchHostFunc[('hipStream_t', 'stream'), ('hipHostFn_t', 'fn'), ('void*', 'userData')]
+    case HIP_API_ID_hipLaunchHostFunc:
+      break;
+// hipLaunchKernel[('const void*', 'function_address'), ('dim3', 'numBlocks'), ('dim3', 'dimBlocks'), ('void**', 'args'), ('size_t', 'sharedMemBytes'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipLaunchKernel:
+      if (data->args.hipLaunchKernel.args) data->args.hipLaunchKernel.args__val = *(data->args.hipLaunchKernel.args);
+      break;
+// hipLaunchKernelExC[('const hipLaunchConfig_t*', 'config'), ('const void*', 'fPtr'), ('void**', 'args')]
+    case HIP_API_ID_hipLaunchKernelExC:
+      if (data->args.hipLaunchKernelExC.config) data->args.hipLaunchKernelExC.config__val = *(data->args.hipLaunchKernelExC.config);
+      if (data->args.hipLaunchKernelExC.args) data->args.hipLaunchKernelExC.args__val = *(data->args.hipLaunchKernelExC.args);
+      break;
+// hipLibraryGetKernel[('hipKernel_t*', 'pKernel'), ('hipLibrary_t', 'library'), ('const char*', 'name')]
+    case HIP_API_ID_hipLibraryGetKernel:
+      if (data->args.hipLibraryGetKernel.pKernel) data->args.hipLibraryGetKernel.pKernel__val = *(data->args.hipLibraryGetKernel.pKernel);
+      if (data->args.hipLibraryGetKernel.name) data->args.hipLibraryGetKernel.name__val = *(data->args.hipLibraryGetKernel.name);
+      break;
+// hipLibraryGetKernelCount[('unsigned int*', 'count'), ('hipLibrary_t', 'library')]
+    case HIP_API_ID_hipLibraryGetKernelCount:
+      if (data->args.hipLibraryGetKernelCount.count) data->args.hipLibraryGetKernelCount.count__val = *(data->args.hipLibraryGetKernelCount.count);
+      break;
+// hipLibraryLoadData[('hipLibrary_t*', 'library'), ('const void*', 'code'), ('hipJitOption**', 'jitOptions'), ('void**', 'jitOptionsValues'), ('unsigned int', 'numJitOptions'), ('hipLibraryOption**', 'libraryOptions'), ('void**', 'libraryOptionValues'), ('unsigned int', 'numLibraryOptions')]
+    case HIP_API_ID_hipLibraryLoadData:
+      if (data->args.hipLibraryLoadData.library) data->args.hipLibraryLoadData.library__val = *(data->args.hipLibraryLoadData.library);
+      if (data->args.hipLibraryLoadData.jitOptions) data->args.hipLibraryLoadData.jitOptions__val = *(data->args.hipLibraryLoadData.jitOptions);
+      if (data->args.hipLibraryLoadData.jitOptionsValues) data->args.hipLibraryLoadData.jitOptionsValues__val = *(data->args.hipLibraryLoadData.jitOptionsValues);
+      if (data->args.hipLibraryLoadData.libraryOptions) data->args.hipLibraryLoadData.libraryOptions__val = *(data->args.hipLibraryLoadData.libraryOptions);
+      if (data->args.hipLibraryLoadData.libraryOptionValues) data->args.hipLibraryLoadData.libraryOptionValues__val = *(data->args.hipLibraryLoadData.libraryOptionValues);
+      break;
+// hipLibraryLoadFromFile[('hipLibrary_t*', 'library'), ('const char*', 'fileName'), ('hipJitOption**', 'jitOptions'), ('void**', 'jitOptionsValues'), ('unsigned int', 'numJitOptions'), ('hipLibraryOption**', 'libraryOptions'), ('void**', 'libraryOptionValues'), ('unsigned int', 'numLibraryOptions')]
+    case HIP_API_ID_hipLibraryLoadFromFile:
+      if (data->args.hipLibraryLoadFromFile.library) data->args.hipLibraryLoadFromFile.library__val = *(data->args.hipLibraryLoadFromFile.library);
+      if (data->args.hipLibraryLoadFromFile.fileName) data->args.hipLibraryLoadFromFile.fileName__val = *(data->args.hipLibraryLoadFromFile.fileName);
+      if (data->args.hipLibraryLoadFromFile.jitOptions) data->args.hipLibraryLoadFromFile.jitOptions__val = *(data->args.hipLibraryLoadFromFile.jitOptions);
+      if (data->args.hipLibraryLoadFromFile.jitOptionsValues) data->args.hipLibraryLoadFromFile.jitOptionsValues__val = *(data->args.hipLibraryLoadFromFile.jitOptionsValues);
+      if (data->args.hipLibraryLoadFromFile.libraryOptions) data->args.hipLibraryLoadFromFile.libraryOptions__val = *(data->args.hipLibraryLoadFromFile.libraryOptions);
+      if (data->args.hipLibraryLoadFromFile.libraryOptionValues) data->args.hipLibraryLoadFromFile.libraryOptionValues__val = *(data->args.hipLibraryLoadFromFile.libraryOptionValues);
+      break;
+// hipLibraryUnload[('hipLibrary_t', 'library')]
+    case HIP_API_ID_hipLibraryUnload:
+      break;
+// hipLinkAddData[('hipLinkState_t', 'state'), ('hipJitInputType', 'type'), ('void*', 'data'), ('size_t', 'size'), ('const char*', 'name'), ('unsigned int', 'numOptions'), ('hipJitOption*', 'options'), ('void**', 'optionValues')]
+    case HIP_API_ID_hipLinkAddData:
+      if (data->args.hipLinkAddData.name) data->args.hipLinkAddData.name__val = *(data->args.hipLinkAddData.name);
+      if (data->args.hipLinkAddData.options) data->args.hipLinkAddData.options__val = *(data->args.hipLinkAddData.options);
+      if (data->args.hipLinkAddData.optionValues) data->args.hipLinkAddData.optionValues__val = *(data->args.hipLinkAddData.optionValues);
+      break;
+// hipLinkAddFile[('hipLinkState_t', 'state'), ('hipJitInputType', 'type'), ('const char*', 'path'), ('unsigned int', 'numOptions'), ('hipJitOption*', 'options'), ('void**', 'optionValues')]
+    case HIP_API_ID_hipLinkAddFile:
+      if (data->args.hipLinkAddFile.path) data->args.hipLinkAddFile.path__val = *(data->args.hipLinkAddFile.path);
+      if (data->args.hipLinkAddFile.options) data->args.hipLinkAddFile.options__val = *(data->args.hipLinkAddFile.options);
+      if (data->args.hipLinkAddFile.optionValues) data->args.hipLinkAddFile.optionValues__val = *(data->args.hipLinkAddFile.optionValues);
+      break;
+// hipLinkComplete[('hipLinkState_t', 'state'), ('void**', 'hipBinOut'), ('size_t*', 'sizeOut')]
+    case HIP_API_ID_hipLinkComplete:
+      if (data->args.hipLinkComplete.hipBinOut) data->args.hipLinkComplete.hipBinOut__val = *(data->args.hipLinkComplete.hipBinOut);
+      if (data->args.hipLinkComplete.sizeOut) data->args.hipLinkComplete.sizeOut__val = *(data->args.hipLinkComplete.sizeOut);
+      break;
+// hipLinkCreate[('unsigned int', 'numOptions'), ('hipJitOption*', 'options'), ('void**', 'optionValues'), ('hipLinkState_t*', 'stateOut')]
+    case HIP_API_ID_hipLinkCreate:
+      if (data->args.hipLinkCreate.options) data->args.hipLinkCreate.options__val = *(data->args.hipLinkCreate.options);
+      if (data->args.hipLinkCreate.optionValues) data->args.hipLinkCreate.optionValues__val = *(data->args.hipLinkCreate.optionValues);
+      if (data->args.hipLinkCreate.stateOut) data->args.hipLinkCreate.stateOut__val = *(data->args.hipLinkCreate.stateOut);
+      break;
+// hipLinkDestroy[('hipLinkState_t', 'state')]
+    case HIP_API_ID_hipLinkDestroy:
+      break;
+// hipMalloc[('void**', 'ptr'), ('size_t', 'size')]
+    case HIP_API_ID_hipMalloc:
+      if (data->args.hipMalloc.ptr) data->args.hipMalloc.ptr__val = *(data->args.hipMalloc.ptr);
+      break;
+// hipMalloc3D[('hipPitchedPtr*', 'pitchedDevPtr'), ('hipExtent', 'extent')]
+    case HIP_API_ID_hipMalloc3D:
+      if (data->args.hipMalloc3D.pitchedDevPtr) data->args.hipMalloc3D.pitchedDevPtr__val = *(data->args.hipMalloc3D.pitchedDevPtr);
+      break;
+// hipMalloc3DArray[('hipArray_t*', 'array'), ('const hipChannelFormatDesc*', 'desc'), ('hipExtent', 'extent'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipMalloc3DArray:
+      if (data->args.hipMalloc3DArray.array) data->args.hipMalloc3DArray.array__val = *(data->args.hipMalloc3DArray.array);
+      if (data->args.hipMalloc3DArray.desc) data->args.hipMalloc3DArray.desc__val = *(data->args.hipMalloc3DArray.desc);
+      break;
+// hipMallocArray[('hipArray_t*', 'array'), ('const hipChannelFormatDesc*', 'desc'), ('size_t', 'width'), ('size_t', 'height'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipMallocArray:
+      if (data->args.hipMallocArray.array) data->args.hipMallocArray.array__val = *(data->args.hipMallocArray.array);
+      if (data->args.hipMallocArray.desc) data->args.hipMallocArray.desc__val = *(data->args.hipMallocArray.desc);
+      break;
+// hipMallocAsync[('void**', 'dev_ptr'), ('size_t', 'size'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMallocAsync:
+      if (data->args.hipMallocAsync.dev_ptr) data->args.hipMallocAsync.dev_ptr__val = *(data->args.hipMallocAsync.dev_ptr);
+      break;
+// hipMallocFromPoolAsync[('void**', 'dev_ptr'), ('size_t', 'size'), ('hipMemPool_t', 'mem_pool'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMallocFromPoolAsync:
+      if (data->args.hipMallocFromPoolAsync.dev_ptr) data->args.hipMallocFromPoolAsync.dev_ptr__val = *(data->args.hipMallocFromPoolAsync.dev_ptr);
+      break;
+// hipMallocHost[('void**', 'ptr'), ('size_t', 'size')]
+    case HIP_API_ID_hipMallocHost:
+      if (data->args.hipMallocHost.ptr) data->args.hipMallocHost.ptr__val = *(data->args.hipMallocHost.ptr);
+      break;
+// hipMallocManaged[('void**', 'dev_ptr'), ('size_t', 'size'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipMallocManaged:
+      if (data->args.hipMallocManaged.dev_ptr) data->args.hipMallocManaged.dev_ptr__val = *(data->args.hipMallocManaged.dev_ptr);
+      break;
+// hipMallocMipmappedArray[('hipMipmappedArray_t*', 'mipmappedArray'), ('const hipChannelFormatDesc*', 'desc'), ('hipExtent', 'extent'), ('unsigned int', 'numLevels'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipMallocMipmappedArray:
+      if (data->args.hipMallocMipmappedArray.mipmappedArray) data->args.hipMallocMipmappedArray.mipmappedArray__val = *(data->args.hipMallocMipmappedArray.mipmappedArray);
+      if (data->args.hipMallocMipmappedArray.desc) data->args.hipMallocMipmappedArray.desc__val = *(data->args.hipMallocMipmappedArray.desc);
+      break;
+// hipMallocPitch[('void**', 'ptr'), ('size_t*', 'pitch'), ('size_t', 'width'), ('size_t', 'height')]
+    case HIP_API_ID_hipMallocPitch:
+      if (data->args.hipMallocPitch.ptr) data->args.hipMallocPitch.ptr__val = *(data->args.hipMallocPitch.ptr);
+      if (data->args.hipMallocPitch.pitch) data->args.hipMallocPitch.pitch__val = *(data->args.hipMallocPitch.pitch);
+      break;
+// hipMemAddressFree[('void*', 'devPtr'), ('size_t', 'size')]
+    case HIP_API_ID_hipMemAddressFree:
+      break;
+// hipMemAddressReserve[('void**', 'ptr'), ('size_t', 'size'), ('size_t', 'alignment'), ('void*', 'addr'), ('unsigned long long', 'flags')]
+    case HIP_API_ID_hipMemAddressReserve:
+      if (data->args.hipMemAddressReserve.ptr) data->args.hipMemAddressReserve.ptr__val = *(data->args.hipMemAddressReserve.ptr);
+      break;
+// hipMemAdvise[('const void*', 'dev_ptr'), ('size_t', 'count'), ('hipMemoryAdvise', 'advice'), ('int', 'device')]
+    case HIP_API_ID_hipMemAdvise:
+      break;
+// hipMemAdvise_v2[('const void*', 'dev_ptr'), ('size_t', 'count'), ('hipMemoryAdvise', 'advice'), ('hipMemLocation', 'location')]
+    case HIP_API_ID_hipMemAdvise_v2:
+      break;
+// hipMemAllocHost[('void**', 'ptr'), ('size_t', 'size')]
+    case HIP_API_ID_hipMemAllocHost:
+      if (data->args.hipMemAllocHost.ptr) data->args.hipMemAllocHost.ptr__val = *(data->args.hipMemAllocHost.ptr);
+      break;
+// hipMemAllocPitch[('hipDeviceptr_t*', 'dptr'), ('size_t*', 'pitch'), ('size_t', 'widthInBytes'), ('size_t', 'height'), ('unsigned int', 'elementSizeBytes')]
+    case HIP_API_ID_hipMemAllocPitch:
+      if (data->args.hipMemAllocPitch.dptr) data->args.hipMemAllocPitch.dptr__val = *(data->args.hipMemAllocPitch.dptr);
+      if (data->args.hipMemAllocPitch.pitch) data->args.hipMemAllocPitch.pitch__val = *(data->args.hipMemAllocPitch.pitch);
+      break;
+// hipMemCreate[('hipMemGenericAllocationHandle_t*', 'handle'), ('size_t', 'size'), ('const hipMemAllocationProp*', 'prop'), ('unsigned long long', 'flags')]
+    case HIP_API_ID_hipMemCreate:
+      if (data->args.hipMemCreate.handle) data->args.hipMemCreate.handle__val = *(data->args.hipMemCreate.handle);
+      if (data->args.hipMemCreate.prop) data->args.hipMemCreate.prop__val = *(data->args.hipMemCreate.prop);
+      break;
+// hipMemExportToShareableHandle[('void*', 'shareableHandle'), ('hipMemGenericAllocationHandle_t', 'handle'), ('hipMemAllocationHandleType', 'handleType'), ('unsigned long long', 'flags')]
+    case HIP_API_ID_hipMemExportToShareableHandle:
+      break;
+// hipMemGetAccess[('unsigned long long*', 'flags'), ('const hipMemLocation*', 'location'), ('void*', 'ptr')]
+    case HIP_API_ID_hipMemGetAccess:
+      if (data->args.hipMemGetAccess.flags) data->args.hipMemGetAccess.flags__val = *(data->args.hipMemGetAccess.flags);
+      if (data->args.hipMemGetAccess.location) data->args.hipMemGetAccess.location__val = *(data->args.hipMemGetAccess.location);
+      break;
+// hipMemGetAddressRange[('hipDeviceptr_t*', 'pbase'), ('size_t*', 'psize'), ('hipDeviceptr_t', 'dptr')]
+    case HIP_API_ID_hipMemGetAddressRange:
+      if (data->args.hipMemGetAddressRange.pbase) data->args.hipMemGetAddressRange.pbase__val = *(data->args.hipMemGetAddressRange.pbase);
+      if (data->args.hipMemGetAddressRange.psize) data->args.hipMemGetAddressRange.psize__val = *(data->args.hipMemGetAddressRange.psize);
+      break;
+// hipMemGetAllocationGranularity[('size_t*', 'granularity'), ('const hipMemAllocationProp*', 'prop'), ('hipMemAllocationGranularity_flags', 'option')]
+    case HIP_API_ID_hipMemGetAllocationGranularity:
+      if (data->args.hipMemGetAllocationGranularity.granularity) data->args.hipMemGetAllocationGranularity.granularity__val = *(data->args.hipMemGetAllocationGranularity.granularity);
+      if (data->args.hipMemGetAllocationGranularity.prop) data->args.hipMemGetAllocationGranularity.prop__val = *(data->args.hipMemGetAllocationGranularity.prop);
+      break;
+// hipMemGetAllocationPropertiesFromHandle[('hipMemAllocationProp*', 'prop'), ('hipMemGenericAllocationHandle_t', 'handle')]
+    case HIP_API_ID_hipMemGetAllocationPropertiesFromHandle:
+      if (data->args.hipMemGetAllocationPropertiesFromHandle.prop) data->args.hipMemGetAllocationPropertiesFromHandle.prop__val = *(data->args.hipMemGetAllocationPropertiesFromHandle.prop);
+      break;
+// hipMemGetHandleForAddressRange[('void*', 'handle'), ('hipDeviceptr_t', 'dptr'), ('size_t', 'size'), ('hipMemRangeHandleType', 'handleType'), ('unsigned long long', 'flags')]
+    case HIP_API_ID_hipMemGetHandleForAddressRange:
+      break;
+// hipMemGetInfo[('size_t*', 'free'), ('size_t*', 'total')]
+    case HIP_API_ID_hipMemGetInfo:
+      if (data->args.hipMemGetInfo.free) data->args.hipMemGetInfo.free__val = *(data->args.hipMemGetInfo.free);
+      if (data->args.hipMemGetInfo.total) data->args.hipMemGetInfo.total__val = *(data->args.hipMemGetInfo.total);
+      break;
+// hipMemImportFromShareableHandle[('hipMemGenericAllocationHandle_t*', 'handle'), ('void*', 'osHandle'), ('hipMemAllocationHandleType', 'shHandleType')]
+    case HIP_API_ID_hipMemImportFromShareableHandle:
+      if (data->args.hipMemImportFromShareableHandle.handle) data->args.hipMemImportFromShareableHandle.handle__val = *(data->args.hipMemImportFromShareableHandle.handle);
+      break;
+// hipMemMap[('void*', 'ptr'), ('size_t', 'size'), ('size_t', 'offset'), ('hipMemGenericAllocationHandle_t', 'handle'), ('unsigned long long', 'flags')]
+    case HIP_API_ID_hipMemMap:
+      break;
+// hipMemMapArrayAsync[('hipArrayMapInfo*', 'mapInfoList'), ('unsigned int', 'count'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemMapArrayAsync:
+      if (data->args.hipMemMapArrayAsync.mapInfoList) data->args.hipMemMapArrayAsync.mapInfoList__val = *(data->args.hipMemMapArrayAsync.mapInfoList);
+      break;
+// hipMemPoolCreate[('hipMemPool_t*', 'mem_pool'), ('const hipMemPoolProps*', 'pool_props')]
+    case HIP_API_ID_hipMemPoolCreate:
+      if (data->args.hipMemPoolCreate.mem_pool) data->args.hipMemPoolCreate.mem_pool__val = *(data->args.hipMemPoolCreate.mem_pool);
+      if (data->args.hipMemPoolCreate.pool_props) data->args.hipMemPoolCreate.pool_props__val = *(data->args.hipMemPoolCreate.pool_props);
+      break;
+// hipMemPoolDestroy[('hipMemPool_t', 'mem_pool')]
+    case HIP_API_ID_hipMemPoolDestroy:
+      break;
+// hipMemPoolExportPointer[('hipMemPoolPtrExportData*', 'export_data'), ('void*', 'dev_ptr')]
+    case HIP_API_ID_hipMemPoolExportPointer:
+      if (data->args.hipMemPoolExportPointer.export_data) data->args.hipMemPoolExportPointer.export_data__val = *(data->args.hipMemPoolExportPointer.export_data);
+      break;
+// hipMemPoolExportToShareableHandle[('void*', 'shared_handle'), ('hipMemPool_t', 'mem_pool'), ('hipMemAllocationHandleType', 'handle_type'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipMemPoolExportToShareableHandle:
+      break;
+// hipMemPoolGetAccess[('hipMemAccessFlags*', 'flags'), ('hipMemPool_t', 'mem_pool'), ('hipMemLocation*', 'location')]
+    case HIP_API_ID_hipMemPoolGetAccess:
+      if (data->args.hipMemPoolGetAccess.flags) data->args.hipMemPoolGetAccess.flags__val = *(data->args.hipMemPoolGetAccess.flags);
+      if (data->args.hipMemPoolGetAccess.location) data->args.hipMemPoolGetAccess.location__val = *(data->args.hipMemPoolGetAccess.location);
+      break;
+// hipMemPoolGetAttribute[('hipMemPool_t', 'mem_pool'), ('hipMemPoolAttr', 'attr'), ('void*', 'value')]
+    case HIP_API_ID_hipMemPoolGetAttribute:
+      break;
+// hipMemPoolImportFromShareableHandle[('hipMemPool_t*', 'mem_pool'), ('void*', 'shared_handle'), ('hipMemAllocationHandleType', 'handle_type'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipMemPoolImportFromShareableHandle:
+      if (data->args.hipMemPoolImportFromShareableHandle.mem_pool) data->args.hipMemPoolImportFromShareableHandle.mem_pool__val = *(data->args.hipMemPoolImportFromShareableHandle.mem_pool);
+      break;
+// hipMemPoolImportPointer[('void**', 'dev_ptr'), ('hipMemPool_t', 'mem_pool'), ('hipMemPoolPtrExportData*', 'export_data')]
+    case HIP_API_ID_hipMemPoolImportPointer:
+      if (data->args.hipMemPoolImportPointer.dev_ptr) data->args.hipMemPoolImportPointer.dev_ptr__val = *(data->args.hipMemPoolImportPointer.dev_ptr);
+      if (data->args.hipMemPoolImportPointer.export_data) data->args.hipMemPoolImportPointer.export_data__val = *(data->args.hipMemPoolImportPointer.export_data);
+      break;
+// hipMemPoolSetAccess[('hipMemPool_t', 'mem_pool'), ('const hipMemAccessDesc*', 'desc_list'), ('size_t', 'count')]
+    case HIP_API_ID_hipMemPoolSetAccess:
+      if (data->args.hipMemPoolSetAccess.desc_list) data->args.hipMemPoolSetAccess.desc_list__val = *(data->args.hipMemPoolSetAccess.desc_list);
+      break;
+// hipMemPoolSetAttribute[('hipMemPool_t', 'mem_pool'), ('hipMemPoolAttr', 'attr'), ('void*', 'value')]
+    case HIP_API_ID_hipMemPoolSetAttribute:
+      break;
+// hipMemPoolTrimTo[('hipMemPool_t', 'mem_pool'), ('size_t', 'min_bytes_to_hold')]
+    case HIP_API_ID_hipMemPoolTrimTo:
+      break;
+// hipMemPrefetchAsync[('const void*', 'dev_ptr'), ('size_t', 'count'), ('int', 'device'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemPrefetchAsync:
+      break;
+// hipMemPrefetchAsync_v2[('const void*', 'dev_ptr'), ('size_t', 'count'), ('hipMemLocation', 'location'), ('unsigned int', 'flags'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemPrefetchAsync_v2:
+      break;
+// hipMemPtrGetInfo[('void*', 'ptr'), ('size_t*', 'size')]
+    case HIP_API_ID_hipMemPtrGetInfo:
+      if (data->args.hipMemPtrGetInfo.size) data->args.hipMemPtrGetInfo.size__val = *(data->args.hipMemPtrGetInfo.size);
+      break;
+// hipMemRangeGetAttribute[('void*', 'data'), ('size_t', 'data_size'), ('hipMemRangeAttribute', 'attribute'), ('const void*', 'dev_ptr'), ('size_t', 'count')]
+    case HIP_API_ID_hipMemRangeGetAttribute:
+      break;
+// hipMemRangeGetAttributes[('void**', 'data'), ('size_t*', 'data_sizes'), ('hipMemRangeAttribute*', 'attributes'), ('size_t', 'num_attributes'), ('const void*', 'dev_ptr'), ('size_t', 'count')]
+    case HIP_API_ID_hipMemRangeGetAttributes:
+      if (data->args.hipMemRangeGetAttributes.data) data->args.hipMemRangeGetAttributes.data__val = *(data->args.hipMemRangeGetAttributes.data);
+      if (data->args.hipMemRangeGetAttributes.data_sizes) data->args.hipMemRangeGetAttributes.data_sizes__val = *(data->args.hipMemRangeGetAttributes.data_sizes);
+      if (data->args.hipMemRangeGetAttributes.attributes) data->args.hipMemRangeGetAttributes.attributes__val = *(data->args.hipMemRangeGetAttributes.attributes);
+      break;
+// hipMemRelease[('hipMemGenericAllocationHandle_t', 'handle')]
+    case HIP_API_ID_hipMemRelease:
+      break;
+// hipMemRetainAllocationHandle[('hipMemGenericAllocationHandle_t*', 'handle'), ('void*', 'addr')]
+    case HIP_API_ID_hipMemRetainAllocationHandle:
+      if (data->args.hipMemRetainAllocationHandle.handle) data->args.hipMemRetainAllocationHandle.handle__val = *(data->args.hipMemRetainAllocationHandle.handle);
+      break;
+// hipMemSetAccess[('void*', 'ptr'), ('size_t', 'size'), ('const hipMemAccessDesc*', 'desc'), ('size_t', 'count')]
+    case HIP_API_ID_hipMemSetAccess:
+      if (data->args.hipMemSetAccess.desc) data->args.hipMemSetAccess.desc__val = *(data->args.hipMemSetAccess.desc);
+      break;
+// hipMemUnmap[('void*', 'ptr'), ('size_t', 'size')]
+    case HIP_API_ID_hipMemUnmap:
+      break;
+// hipMemcpy[('void*', 'dst'), ('const void*', 'src'), ('size_t', 'sizeBytes'), ('hipMemcpyKind', 'kind')]
+    case HIP_API_ID_hipMemcpy:
+      break;
+// hipMemcpy2D[('void*', 'dst'), ('size_t', 'dpitch'), ('const void*', 'src'), ('size_t', 'spitch'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind')]
+    case HIP_API_ID_hipMemcpy2D:
+      break;
+// hipMemcpy2DArrayToArray[('hipArray_t', 'dst'), ('size_t', 'wOffsetDst'), ('size_t', 'hOffsetDst'), ('hipArray_const_t', 'src'), ('size_t', 'wOffsetSrc'), ('size_t', 'hOffsetSrc'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind')]
+    case HIP_API_ID_hipMemcpy2DArrayToArray:
+      break;
+// hipMemcpy2DAsync[('void*', 'dst'), ('size_t', 'dpitch'), ('const void*', 'src'), ('size_t', 'spitch'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemcpy2DAsync:
+      break;
+// hipMemcpy2DFromArray[('void*', 'dst'), ('size_t', 'dpitch'), ('hipArray_const_t', 'src'), ('size_t', 'wOffset'), ('size_t', 'hOffset'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind')]
+    case HIP_API_ID_hipMemcpy2DFromArray:
+      break;
+// hipMemcpy2DFromArrayAsync[('void*', 'dst'), ('size_t', 'dpitch'), ('hipArray_const_t', 'src'), ('size_t', 'wOffset'), ('size_t', 'hOffset'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemcpy2DFromArrayAsync:
+      break;
+// hipMemcpy2DToArray[('hipArray_t', 'dst'), ('size_t', 'wOffset'), ('size_t', 'hOffset'), ('const void*', 'src'), ('size_t', 'spitch'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind')]
+    case HIP_API_ID_hipMemcpy2DToArray:
+      break;
+// hipMemcpy2DToArrayAsync[('hipArray_t', 'dst'), ('size_t', 'wOffset'), ('size_t', 'hOffset'), ('const void*', 'src'), ('size_t', 'spitch'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemcpy2DToArrayAsync:
+      break;
+// hipMemcpy3D[('const hipMemcpy3DParms*', 'p')]
+    case HIP_API_ID_hipMemcpy3D:
+      if (data->args.hipMemcpy3D.p) data->args.hipMemcpy3D.p__val = *(data->args.hipMemcpy3D.p);
+      break;
+// hipMemcpy3DAsync[('const hipMemcpy3DParms*', 'p'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemcpy3DAsync:
+      if (data->args.hipMemcpy3DAsync.p) data->args.hipMemcpy3DAsync.p__val = *(data->args.hipMemcpy3DAsync.p);
+      break;
+// hipMemcpy3DBatchAsync[('size_t', 'numOps'), ('hipMemcpy3DBatchOp*', 'opList'), ('size_t*', 'failIdx'), ('unsigned long long', 'flags'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemcpy3DBatchAsync:
+      if (data->args.hipMemcpy3DBatchAsync.opList) data->args.hipMemcpy3DBatchAsync.opList__val = *(data->args.hipMemcpy3DBatchAsync.opList);
+      if (data->args.hipMemcpy3DBatchAsync.failIdx) data->args.hipMemcpy3DBatchAsync.failIdx__val = *(data->args.hipMemcpy3DBatchAsync.failIdx);
+      break;
+// hipMemcpy3DPeer[('hipMemcpy3DPeerParms*', 'p')]
+    case HIP_API_ID_hipMemcpy3DPeer:
+      if (data->args.hipMemcpy3DPeer.p) data->args.hipMemcpy3DPeer.p__val = *(data->args.hipMemcpy3DPeer.p);
+      break;
+// hipMemcpy3DPeerAsync[('hipMemcpy3DPeerParms*', 'p'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemcpy3DPeerAsync:
+      if (data->args.hipMemcpy3DPeerAsync.p) data->args.hipMemcpy3DPeerAsync.p__val = *(data->args.hipMemcpy3DPeerAsync.p);
+      break;
+// hipMemcpyAsync[('void*', 'dst'), ('const void*', 'src'), ('size_t', 'sizeBytes'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemcpyAsync:
+      break;
+// hipMemcpyAtoA[('hipArray_t', 'dstArray'), ('size_t', 'dstOffset'), ('hipArray_t', 'srcArray'), ('size_t', 'srcOffset'), ('size_t', 'ByteCount')]
+    case HIP_API_ID_hipMemcpyAtoA:
+      break;
+// hipMemcpyAtoD[('hipDeviceptr_t', 'dstDevice'), ('hipArray_t', 'srcArray'), ('size_t', 'srcOffset'), ('size_t', 'ByteCount')]
+    case HIP_API_ID_hipMemcpyAtoD:
+      break;
+// hipMemcpyAtoH[('void*', 'dst'), ('hipArray_t', 'srcArray'), ('size_t', 'srcOffset'), ('size_t', 'count')]
+    case HIP_API_ID_hipMemcpyAtoH:
+      break;
+// hipMemcpyAtoHAsync[('void*', 'dstHost'), ('hipArray_t', 'srcArray'), ('size_t', 'srcOffset'), ('size_t', 'ByteCount'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemcpyAtoHAsync:
+      break;
+// hipMemcpyBatchAsync[('void**', 'dsts'), ('void**', 'srcs'), ('size_t*', 'sizes'), ('size_t', 'count'), ('hipMemcpyAttributes*', 'attrs'), ('size_t*', 'attrsIdxs'), ('size_t', 'numAttrs'), ('size_t*', 'failIdx'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemcpyBatchAsync:
+      if (data->args.hipMemcpyBatchAsync.dsts) data->args.hipMemcpyBatchAsync.dsts__val = *(data->args.hipMemcpyBatchAsync.dsts);
+      if (data->args.hipMemcpyBatchAsync.srcs) data->args.hipMemcpyBatchAsync.srcs__val = *(data->args.hipMemcpyBatchAsync.srcs);
+      if (data->args.hipMemcpyBatchAsync.sizes) data->args.hipMemcpyBatchAsync.sizes__val = *(data->args.hipMemcpyBatchAsync.sizes);
+      if (data->args.hipMemcpyBatchAsync.attrs) data->args.hipMemcpyBatchAsync.attrs__val = *(data->args.hipMemcpyBatchAsync.attrs);
+      if (data->args.hipMemcpyBatchAsync.attrsIdxs) data->args.hipMemcpyBatchAsync.attrsIdxs__val = *(data->args.hipMemcpyBatchAsync.attrsIdxs);
+      if (data->args.hipMemcpyBatchAsync.failIdx) data->args.hipMemcpyBatchAsync.failIdx__val = *(data->args.hipMemcpyBatchAsync.failIdx);
+      break;
+// hipMemcpyDtoA[('hipArray_t', 'dstArray'), ('size_t', 'dstOffset'), ('hipDeviceptr_t', 'srcDevice'), ('size_t', 'ByteCount')]
+    case HIP_API_ID_hipMemcpyDtoA:
+      break;
+// hipMemcpyDtoD[('hipDeviceptr_t', 'dst'), ('hipDeviceptr_t', 'src'), ('size_t', 'sizeBytes')]
+    case HIP_API_ID_hipMemcpyDtoD:
+      break;
+// hipMemcpyDtoDAsync[('hipDeviceptr_t', 'dst'), ('hipDeviceptr_t', 'src'), ('size_t', 'sizeBytes'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemcpyDtoDAsync:
+      break;
+// hipMemcpyDtoH[('void*', 'dst'), ('hipDeviceptr_t', 'src'), ('size_t', 'sizeBytes')]
+    case HIP_API_ID_hipMemcpyDtoH:
+      break;
+// hipMemcpyDtoHAsync[('void*', 'dst'), ('hipDeviceptr_t', 'src'), ('size_t', 'sizeBytes'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemcpyDtoHAsync:
+      break;
+// hipMemcpyFromArray[('void*', 'dst'), ('hipArray_const_t', 'srcArray'), ('size_t', 'wOffset'), ('size_t', 'hOffset'), ('size_t', 'count'), ('hipMemcpyKind', 'kind')]
+    case HIP_API_ID_hipMemcpyFromArray:
+      break;
+// hipMemcpyFromSymbol[('void*', 'dst'), ('const void*', 'symbol'), ('size_t', 'sizeBytes'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind')]
+    case HIP_API_ID_hipMemcpyFromSymbol:
+      break;
+// hipMemcpyFromSymbolAsync[('void*', 'dst'), ('const void*', 'symbol'), ('size_t', 'sizeBytes'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemcpyFromSymbolAsync:
+      break;
+// hipMemcpyHtoA[('hipArray_t', 'dstArray'), ('size_t', 'dstOffset'), ('const void*', 'srcHost'), ('size_t', 'count')]
+    case HIP_API_ID_hipMemcpyHtoA:
+      break;
+// hipMemcpyHtoAAsync[('hipArray_t', 'dstArray'), ('size_t', 'dstOffset'), ('const void*', 'srcHost'), ('size_t', 'ByteCount'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemcpyHtoAAsync:
+      break;
+// hipMemcpyHtoD[('hipDeviceptr_t', 'dst'), ('const void*', 'src'), ('size_t', 'sizeBytes')]
+    case HIP_API_ID_hipMemcpyHtoD:
+      break;
+// hipMemcpyHtoDAsync[('hipDeviceptr_t', 'dst'), ('const void*', 'src'), ('size_t', 'sizeBytes'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemcpyHtoDAsync:
+      break;
+// hipMemcpyParam2D[('const hip_Memcpy2D*', 'pCopy')]
+    case HIP_API_ID_hipMemcpyParam2D:
+      if (data->args.hipMemcpyParam2D.pCopy) data->args.hipMemcpyParam2D.pCopy__val = *(data->args.hipMemcpyParam2D.pCopy);
+      break;
+// hipMemcpyParam2DAsync[('const hip_Memcpy2D*', 'pCopy'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemcpyParam2DAsync:
+      if (data->args.hipMemcpyParam2DAsync.pCopy) data->args.hipMemcpyParam2DAsync.pCopy__val = *(data->args.hipMemcpyParam2DAsync.pCopy);
+      break;
+// hipMemcpyPeer[('void*', 'dst'), ('int', 'dstDeviceId'), ('const void*', 'src'), ('int', 'srcDeviceId'), ('size_t', 'sizeBytes')]
+    case HIP_API_ID_hipMemcpyPeer:
+      break;
+// hipMemcpyPeerAsync[('void*', 'dst'), ('int', 'dstDeviceId'), ('const void*', 'src'), ('int', 'srcDevice'), ('size_t', 'sizeBytes'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemcpyPeerAsync:
+      break;
+// hipMemcpyToArray[('hipArray_t', 'dst'), ('size_t', 'wOffset'), ('size_t', 'hOffset'), ('const void*', 'src'), ('size_t', 'count'), ('hipMemcpyKind', 'kind')]
+    case HIP_API_ID_hipMemcpyToArray:
+      break;
+// hipMemcpyToSymbol[('const void*', 'symbol'), ('const void*', 'src'), ('size_t', 'sizeBytes'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind')]
+    case HIP_API_ID_hipMemcpyToSymbol:
+      break;
+// hipMemcpyToSymbolAsync[('const void*', 'symbol'), ('const void*', 'src'), ('size_t', 'sizeBytes'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemcpyToSymbolAsync:
+      break;
+// hipMemcpyWithStream[('void*', 'dst'), ('const void*', 'src'), ('size_t', 'sizeBytes'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemcpyWithStream:
+      break;
+// hipMemset[('void*', 'dst'), ('int', 'value'), ('size_t', 'sizeBytes')]
+    case HIP_API_ID_hipMemset:
+      break;
+// hipMemset2D[('void*', 'dst'), ('size_t', 'pitch'), ('int', 'value'), ('size_t', 'width'), ('size_t', 'height')]
+    case HIP_API_ID_hipMemset2D:
+      break;
+// hipMemset2DAsync[('void*', 'dst'), ('size_t', 'pitch'), ('int', 'value'), ('size_t', 'width'), ('size_t', 'height'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemset2DAsync:
+      break;
+// hipMemset3D[('hipPitchedPtr', 'pitchedDevPtr'), ('int', 'value'), ('hipExtent', 'extent')]
+    case HIP_API_ID_hipMemset3D:
+      break;
+// hipMemset3DAsync[('hipPitchedPtr', 'pitchedDevPtr'), ('int', 'value'), ('hipExtent', 'extent'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemset3DAsync:
+      break;
+// hipMemsetAsync[('void*', 'dst'), ('int', 'value'), ('size_t', 'sizeBytes'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemsetAsync:
+      break;
+// hipMemsetD16[('hipDeviceptr_t', 'dest'), ('unsigned short', 'value'), ('size_t', 'count')]
+    case HIP_API_ID_hipMemsetD16:
+      break;
+// hipMemsetD16Async[('hipDeviceptr_t', 'dest'), ('unsigned short', 'value'), ('size_t', 'count'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemsetD16Async:
+      break;
+// hipMemsetD2D16[('hipDeviceptr_t', 'dst'), ('size_t', 'dstPitch'), ('unsigned short', 'value'), ('size_t', 'width'), ('size_t', 'height')]
+    case HIP_API_ID_hipMemsetD2D16:
+      break;
+// hipMemsetD2D16Async[('hipDeviceptr_t', 'dst'), ('size_t', 'dstPitch'), ('unsigned short', 'value'), ('size_t', 'width'), ('size_t', 'height'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemsetD2D16Async:
+      break;
+// hipMemsetD2D32[('hipDeviceptr_t', 'dst'), ('size_t', 'dstPitch'), ('unsigned int', 'value'), ('size_t', 'width'), ('size_t', 'height')]
+    case HIP_API_ID_hipMemsetD2D32:
+      break;
+// hipMemsetD2D32Async[('hipDeviceptr_t', 'dst'), ('size_t', 'dstPitch'), ('unsigned int', 'value'), ('size_t', 'width'), ('size_t', 'height'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemsetD2D32Async:
+      break;
+// hipMemsetD2D8[('hipDeviceptr_t', 'dst'), ('size_t', 'dstPitch'), ('unsigned char', 'value'), ('size_t', 'width'), ('size_t', 'height')]
+    case HIP_API_ID_hipMemsetD2D8:
+      break;
+// hipMemsetD2D8Async[('hipDeviceptr_t', 'dst'), ('size_t', 'dstPitch'), ('unsigned char', 'value'), ('size_t', 'width'), ('size_t', 'height'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemsetD2D8Async:
+      break;
+// hipMemsetD32[('hipDeviceptr_t', 'dest'), ('int', 'value'), ('size_t', 'count')]
+    case HIP_API_ID_hipMemsetD32:
+      break;
+// hipMemsetD32Async[('hipDeviceptr_t', 'dst'), ('int', 'value'), ('size_t', 'count'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemsetD32Async:
+      break;
+// hipMemsetD8[('hipDeviceptr_t', 'dest'), ('unsigned char', 'value'), ('size_t', 'count')]
+    case HIP_API_ID_hipMemsetD8:
+      break;
+// hipMemsetD8Async[('hipDeviceptr_t', 'dest'), ('unsigned char', 'value'), ('size_t', 'count'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipMemsetD8Async:
+      break;
+// hipMipmappedArrayCreate[('hipMipmappedArray_t*', 'pHandle'), ('HIP_ARRAY3D_DESCRIPTOR*', 'pMipmappedArrayDesc'), ('unsigned int', 'numMipmapLevels')]
+    case HIP_API_ID_hipMipmappedArrayCreate:
+      if (data->args.hipMipmappedArrayCreate.pHandle) data->args.hipMipmappedArrayCreate.pHandle__val = *(data->args.hipMipmappedArrayCreate.pHandle);
+      if (data->args.hipMipmappedArrayCreate.pMipmappedArrayDesc) data->args.hipMipmappedArrayCreate.pMipmappedArrayDesc__val = *(data->args.hipMipmappedArrayCreate.pMipmappedArrayDesc);
+      break;
+// hipMipmappedArrayDestroy[('hipMipmappedArray_t', 'hMipmappedArray')]
+    case HIP_API_ID_hipMipmappedArrayDestroy:
+      break;
+// hipMipmappedArrayGetLevel[('hipArray_t*', 'pLevelArray'), ('hipMipmappedArray_t', 'hMipMappedArray'), ('unsigned int', 'level')]
+    case HIP_API_ID_hipMipmappedArrayGetLevel:
+      if (data->args.hipMipmappedArrayGetLevel.pLevelArray) data->args.hipMipmappedArrayGetLevel.pLevelArray__val = *(data->args.hipMipmappedArrayGetLevel.pLevelArray);
+      break;
+// hipModuleGetFunction[('hipFunction_t*', 'function'), ('hipModule_t', 'module'), ('const char*', 'kname')]
+    case HIP_API_ID_hipModuleGetFunction:
+      if (data->args.hipModuleGetFunction.function) data->args.hipModuleGetFunction.function__val = *(data->args.hipModuleGetFunction.function);
+      if (data->args.hipModuleGetFunction.kname) data->args.hipModuleGetFunction.kname__val = *(data->args.hipModuleGetFunction.kname);
+      break;
+// hipModuleGetFunctionCount[('unsigned int*', 'count'), ('hipModule_t', 'mod')]
+    case HIP_API_ID_hipModuleGetFunctionCount:
+      if (data->args.hipModuleGetFunctionCount.count) data->args.hipModuleGetFunctionCount.count__val = *(data->args.hipModuleGetFunctionCount.count);
+      break;
+// hipModuleGetGlobal[('hipDeviceptr_t*', 'dptr'), ('size_t*', 'bytes'), ('hipModule_t', 'hmod'), ('const char*', 'name')]
+    case HIP_API_ID_hipModuleGetGlobal:
+      if (data->args.hipModuleGetGlobal.dptr) data->args.hipModuleGetGlobal.dptr__val = *(data->args.hipModuleGetGlobal.dptr);
+      if (data->args.hipModuleGetGlobal.bytes) data->args.hipModuleGetGlobal.bytes__val = *(data->args.hipModuleGetGlobal.bytes);
+      if (data->args.hipModuleGetGlobal.name) data->args.hipModuleGetGlobal.name__val = *(data->args.hipModuleGetGlobal.name);
+      break;
+// hipModuleGetTexRef[('textureReference**', 'texRef'), ('hipModule_t', 'hmod'), ('const char*', 'name')]
+    case HIP_API_ID_hipModuleGetTexRef:
+      if (data->args.hipModuleGetTexRef.texRef) data->args.hipModuleGetTexRef.texRef__val = *(data->args.hipModuleGetTexRef.texRef);
+      if (data->args.hipModuleGetTexRef.name) data->args.hipModuleGetTexRef.name__val = *(data->args.hipModuleGetTexRef.name);
+      break;
+// hipModuleLaunchCooperativeKernel[('hipFunction_t', 'f'), ('unsigned int', 'gridDimX'), ('unsigned int', 'gridDimY'), ('unsigned int', 'gridDimZ'), ('unsigned int', 'blockDimX'), ('unsigned int', 'blockDimY'), ('unsigned int', 'blockDimZ'), ('unsigned int', 'sharedMemBytes'), ('hipStream_t', 'stream'), ('void**', 'kernelParams')]
+    case HIP_API_ID_hipModuleLaunchCooperativeKernel:
+      if (data->args.hipModuleLaunchCooperativeKernel.kernelParams) data->args.hipModuleLaunchCooperativeKernel.kernelParams__val = *(data->args.hipModuleLaunchCooperativeKernel.kernelParams);
+      break;
+// hipModuleLaunchCooperativeKernelMultiDevice[('hipFunctionLaunchParams*', 'launchParamsList'), ('unsigned int', 'numDevices'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipModuleLaunchCooperativeKernelMultiDevice:
+      if (data->args.hipModuleLaunchCooperativeKernelMultiDevice.launchParamsList) data->args.hipModuleLaunchCooperativeKernelMultiDevice.launchParamsList__val = *(data->args.hipModuleLaunchCooperativeKernelMultiDevice.launchParamsList);
+      break;
+// hipModuleLaunchKernel[('hipFunction_t', 'f'), ('unsigned int', 'gridDimX'), ('unsigned int', 'gridDimY'), ('unsigned int', 'gridDimZ'), ('unsigned int', 'blockDimX'), ('unsigned int', 'blockDimY'), ('unsigned int', 'blockDimZ'), ('unsigned int', 'sharedMemBytes'), ('hipStream_t', 'stream'), ('void**', 'kernelParams'), ('void**', 'extra')]
+    case HIP_API_ID_hipModuleLaunchKernel:
+      if (data->args.hipModuleLaunchKernel.kernelParams) data->args.hipModuleLaunchKernel.kernelParams__val = *(data->args.hipModuleLaunchKernel.kernelParams);
+      if (data->args.hipModuleLaunchKernel.extra) data->args.hipModuleLaunchKernel.extra__val = *(data->args.hipModuleLaunchKernel.extra);
+      break;
+// hipModuleLoad[('hipModule_t*', 'module'), ('const char*', 'fname')]
+    case HIP_API_ID_hipModuleLoad:
+      if (data->args.hipModuleLoad.module) data->args.hipModuleLoad.module__val = *(data->args.hipModuleLoad.module);
+      if (data->args.hipModuleLoad.fname) data->args.hipModuleLoad.fname__val = *(data->args.hipModuleLoad.fname);
+      break;
+// hipModuleLoadData[('hipModule_t*', 'module'), ('const void*', 'image')]
+    case HIP_API_ID_hipModuleLoadData:
+      if (data->args.hipModuleLoadData.module) data->args.hipModuleLoadData.module__val = *(data->args.hipModuleLoadData.module);
+      break;
+// hipModuleLoadDataEx[('hipModule_t*', 'module'), ('const void*', 'image'), ('unsigned int', 'numOptions'), ('hipJitOption*', 'options'), ('void**', 'optionsValues')]
+    case HIP_API_ID_hipModuleLoadDataEx:
+      if (data->args.hipModuleLoadDataEx.module) data->args.hipModuleLoadDataEx.module__val = *(data->args.hipModuleLoadDataEx.module);
+      if (data->args.hipModuleLoadDataEx.options) data->args.hipModuleLoadDataEx.options__val = *(data->args.hipModuleLoadDataEx.options);
+      if (data->args.hipModuleLoadDataEx.optionsValues) data->args.hipModuleLoadDataEx.optionsValues__val = *(data->args.hipModuleLoadDataEx.optionsValues);
+      break;
+// hipModuleLoadFatBinary[('hipModule_t*', 'module'), ('const void*', 'fatbin')]
+    case HIP_API_ID_hipModuleLoadFatBinary:
+      if (data->args.hipModuleLoadFatBinary.module) data->args.hipModuleLoadFatBinary.module__val = *(data->args.hipModuleLoadFatBinary.module);
+      break;
+// hipModuleOccupancyMaxActiveBlocksPerMultiprocessor[('int*', 'numBlocks'), ('hipFunction_t', 'f'), ('int', 'blockSize'), ('size_t', 'dynSharedMemPerBlk')]
+    case HIP_API_ID_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor:
+      if (data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks) data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks__val = *(data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks);
+      break;
+// hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags[('int*', 'numBlocks'), ('hipFunction_t', 'f'), ('int', 'blockSize'), ('size_t', 'dynSharedMemPerBlk'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags:
+      if (data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks) data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks__val = *(data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks);
+      break;
+// hipModuleOccupancyMaxPotentialBlockSize[('int*', 'gridSize'), ('int*', 'blockSize'), ('hipFunction_t', 'f'), ('size_t', 'dynSharedMemPerBlk'), ('int', 'blockSizeLimit')]
+    case HIP_API_ID_hipModuleOccupancyMaxPotentialBlockSize:
+      if (data->args.hipModuleOccupancyMaxPotentialBlockSize.gridSize) data->args.hipModuleOccupancyMaxPotentialBlockSize.gridSize__val = *(data->args.hipModuleOccupancyMaxPotentialBlockSize.gridSize);
+      if (data->args.hipModuleOccupancyMaxPotentialBlockSize.blockSize) data->args.hipModuleOccupancyMaxPotentialBlockSize.blockSize__val = *(data->args.hipModuleOccupancyMaxPotentialBlockSize.blockSize);
+      break;
+// hipModuleOccupancyMaxPotentialBlockSizeWithFlags[('int*', 'gridSize'), ('int*', 'blockSize'), ('hipFunction_t', 'f'), ('size_t', 'dynSharedMemPerBlk'), ('int', 'blockSizeLimit'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipModuleOccupancyMaxPotentialBlockSizeWithFlags:
+      if (data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.gridSize) data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.gridSize__val = *(data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.gridSize);
+      if (data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.blockSize) data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.blockSize__val = *(data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.blockSize);
+      break;
+// hipModuleUnload[('hipModule_t', 'module')]
+    case HIP_API_ID_hipModuleUnload:
+      break;
+// hipOccupancyMaxActiveBlocksPerMultiprocessor[('int*', 'numBlocks'), ('const void*', 'f'), ('int', 'blockSize'), ('size_t', 'dynamicSMemSize')]
+    case HIP_API_ID_hipOccupancyMaxActiveBlocksPerMultiprocessor:
+      if (data->args.hipOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks) data->args.hipOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks__val = *(data->args.hipOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks);
+      break;
+// hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags[('int*', 'numBlocks'), ('const void*', 'f'), ('int', 'blockSize'), ('size_t', 'dynamicSMemSize'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags:
+      if (data->args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks) data->args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks__val = *(data->args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks);
+      break;
+// hipOccupancyMaxPotentialBlockSize[('int*', 'gridSize'), ('int*', 'blockSize'), ('const void*', 'f'), ('size_t', 'dynSharedMemPerBlk'), ('int', 'blockSizeLimit')]
+    case HIP_API_ID_hipOccupancyMaxPotentialBlockSize:
+      if (data->args.hipOccupancyMaxPotentialBlockSize.gridSize) data->args.hipOccupancyMaxPotentialBlockSize.gridSize__val = *(data->args.hipOccupancyMaxPotentialBlockSize.gridSize);
+      if (data->args.hipOccupancyMaxPotentialBlockSize.blockSize) data->args.hipOccupancyMaxPotentialBlockSize.blockSize__val = *(data->args.hipOccupancyMaxPotentialBlockSize.blockSize);
+      break;
+// hipPeekAtLastError[]
+    case HIP_API_ID_hipPeekAtLastError:
+      break;
+// hipPointerGetAttribute[('void*', 'data'), ('hipPointer_attribute', 'attribute'), ('hipDeviceptr_t', 'ptr')]
+    case HIP_API_ID_hipPointerGetAttribute:
+      break;
+// hipPointerGetAttributes[('hipPointerAttribute_t*', 'attributes'), ('const void*', 'ptr')]
+    case HIP_API_ID_hipPointerGetAttributes:
+      if (data->args.hipPointerGetAttributes.attributes) data->args.hipPointerGetAttributes.attributes__val = *(data->args.hipPointerGetAttributes.attributes);
+      break;
+// hipPointerSetAttribute[('const void*', 'value'), ('hipPointer_attribute', 'attribute'), ('hipDeviceptr_t', 'ptr')]
+    case HIP_API_ID_hipPointerSetAttribute:
+      break;
+// hipProfilerStart[]
+    case HIP_API_ID_hipProfilerStart:
+      break;
+// hipProfilerStop[]
+    case HIP_API_ID_hipProfilerStop:
+      break;
+// hipRuntimeGetVersion[('int*', 'runtimeVersion')]
+    case HIP_API_ID_hipRuntimeGetVersion:
+      if (data->args.hipRuntimeGetVersion.runtimeVersion) data->args.hipRuntimeGetVersion.runtimeVersion__val = *(data->args.hipRuntimeGetVersion.runtimeVersion);
+      break;
+// hipSetDevice[('int', 'deviceId')]
+    case HIP_API_ID_hipSetDevice:
+      break;
+// hipSetDeviceFlags[('unsigned int', 'flags')]
+    case HIP_API_ID_hipSetDeviceFlags:
+      break;
+// hipSetValidDevices[('int*', 'device_arr'), ('int', 'len')]
+    case HIP_API_ID_hipSetValidDevices:
+      if (data->args.hipSetValidDevices.device_arr) data->args.hipSetValidDevices.device_arr__val = *(data->args.hipSetValidDevices.device_arr);
+      break;
+// hipSetupArgument[('const void*', 'arg'), ('size_t', 'size'), ('size_t', 'offset')]
+    case HIP_API_ID_hipSetupArgument:
+      break;
+// hipSignalExternalSemaphoresAsync[('const hipExternalSemaphore_t*', 'extSemArray'), ('const hipExternalSemaphoreSignalParams*', 'paramsArray'), ('unsigned int', 'numExtSems'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipSignalExternalSemaphoresAsync:
+      if (data->args.hipSignalExternalSemaphoresAsync.extSemArray) data->args.hipSignalExternalSemaphoresAsync.extSemArray__val = *(data->args.hipSignalExternalSemaphoresAsync.extSemArray);
+      if (data->args.hipSignalExternalSemaphoresAsync.paramsArray) data->args.hipSignalExternalSemaphoresAsync.paramsArray__val = *(data->args.hipSignalExternalSemaphoresAsync.paramsArray);
+      break;
+// hipStreamAddCallback[('hipStream_t', 'stream'), ('hipStreamCallback_t', 'callback'), ('void*', 'userData'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipStreamAddCallback:
+      break;
+// hipStreamAttachMemAsync[('hipStream_t', 'stream'), ('void*', 'dev_ptr'), ('size_t', 'length'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipStreamAttachMemAsync:
+      break;
+// hipStreamBatchMemOp[('hipStream_t', 'stream'), ('unsigned int', 'count'), ('hipStreamBatchMemOpParams*', 'paramArray'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipStreamBatchMemOp:
+      if (data->args.hipStreamBatchMemOp.paramArray) data->args.hipStreamBatchMemOp.paramArray__val = *(data->args.hipStreamBatchMemOp.paramArray);
+      break;
+// hipStreamBeginCapture[('hipStream_t', 'stream'), ('hipStreamCaptureMode', 'mode')]
+    case HIP_API_ID_hipStreamBeginCapture:
+      break;
+// hipStreamBeginCaptureToGraph[('hipStream_t', 'stream'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'dependencies'), ('const hipGraphEdgeData*', 'dependencyData'), ('size_t', 'numDependencies'), ('hipStreamCaptureMode', 'mode')]
+    case HIP_API_ID_hipStreamBeginCaptureToGraph:
+      if (data->args.hipStreamBeginCaptureToGraph.dependencies) data->args.hipStreamBeginCaptureToGraph.dependencies__val = *(data->args.hipStreamBeginCaptureToGraph.dependencies);
+      if (data->args.hipStreamBeginCaptureToGraph.dependencyData) data->args.hipStreamBeginCaptureToGraph.dependencyData__val = *(data->args.hipStreamBeginCaptureToGraph.dependencyData);
+      break;
+// hipStreamCreate[('hipStream_t*', 'stream')]
+    case HIP_API_ID_hipStreamCreate:
+      if (data->args.hipStreamCreate.stream) data->args.hipStreamCreate.stream__val = *(data->args.hipStreamCreate.stream);
+      break;
+// hipStreamCreateWithFlags[('hipStream_t*', 'stream'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipStreamCreateWithFlags:
+      if (data->args.hipStreamCreateWithFlags.stream) data->args.hipStreamCreateWithFlags.stream__val = *(data->args.hipStreamCreateWithFlags.stream);
+      break;
+// hipStreamCreateWithPriority[('hipStream_t*', 'stream'), ('unsigned int', 'flags'), ('int', 'priority')]
+    case HIP_API_ID_hipStreamCreateWithPriority:
+      if (data->args.hipStreamCreateWithPriority.stream) data->args.hipStreamCreateWithPriority.stream__val = *(data->args.hipStreamCreateWithPriority.stream);
+      break;
+// hipStreamDestroy[('hipStream_t', 'stream')]
+    case HIP_API_ID_hipStreamDestroy:
+      break;
+// hipStreamEndCapture[('hipStream_t', 'stream'), ('hipGraph_t*', 'pGraph')]
+    case HIP_API_ID_hipStreamEndCapture:
+      if (data->args.hipStreamEndCapture.pGraph) data->args.hipStreamEndCapture.pGraph__val = *(data->args.hipStreamEndCapture.pGraph);
+      break;
+// hipStreamGetAttribute[('hipStream_t', 'stream'), ('hipLaunchAttributeID', 'attr'), ('hipLaunchAttributeValue*', 'value_out')]
+    case HIP_API_ID_hipStreamGetAttribute:
+      if (data->args.hipStreamGetAttribute.value_out) data->args.hipStreamGetAttribute.value_out__val = *(data->args.hipStreamGetAttribute.value_out);
+      break;
+// hipStreamGetCaptureInfo[('hipStream_t', 'stream'), ('hipStreamCaptureStatus*', 'pCaptureStatus'), ('unsigned long long*', 'pId')]
+    case HIP_API_ID_hipStreamGetCaptureInfo:
+      if (data->args.hipStreamGetCaptureInfo.pCaptureStatus) data->args.hipStreamGetCaptureInfo.pCaptureStatus__val = *(data->args.hipStreamGetCaptureInfo.pCaptureStatus);
+      if (data->args.hipStreamGetCaptureInfo.pId) data->args.hipStreamGetCaptureInfo.pId__val = *(data->args.hipStreamGetCaptureInfo.pId);
+      break;
+// hipStreamGetCaptureInfo_v2[('hipStream_t', 'stream'), ('hipStreamCaptureStatus*', 'captureStatus_out'), ('unsigned long long*', 'id_out'), ('hipGraph_t*', 'graph_out'), ('const hipGraphNode_t**', 'dependencies_out'), ('size_t*', 'numDependencies_out')]
+    case HIP_API_ID_hipStreamGetCaptureInfo_v2:
+      if (data->args.hipStreamGetCaptureInfo_v2.captureStatus_out) data->args.hipStreamGetCaptureInfo_v2.captureStatus_out__val = *(data->args.hipStreamGetCaptureInfo_v2.captureStatus_out);
+      if (data->args.hipStreamGetCaptureInfo_v2.id_out) data->args.hipStreamGetCaptureInfo_v2.id_out__val = *(data->args.hipStreamGetCaptureInfo_v2.id_out);
+      if (data->args.hipStreamGetCaptureInfo_v2.graph_out) data->args.hipStreamGetCaptureInfo_v2.graph_out__val = *(data->args.hipStreamGetCaptureInfo_v2.graph_out);
+      if (data->args.hipStreamGetCaptureInfo_v2.dependencies_out) data->args.hipStreamGetCaptureInfo_v2.dependencies_out__val = *(data->args.hipStreamGetCaptureInfo_v2.dependencies_out);
+      if (data->args.hipStreamGetCaptureInfo_v2.numDependencies_out) data->args.hipStreamGetCaptureInfo_v2.numDependencies_out__val = *(data->args.hipStreamGetCaptureInfo_v2.numDependencies_out);
+      break;
+// hipStreamGetDevice[('hipStream_t', 'stream'), ('hipDevice_t*', 'device')]
+    case HIP_API_ID_hipStreamGetDevice:
+      if (data->args.hipStreamGetDevice.device) data->args.hipStreamGetDevice.device__val = *(data->args.hipStreamGetDevice.device);
+      break;
+// hipStreamGetFlags[('hipStream_t', 'stream'), ('unsigned int*', 'flags')]
+    case HIP_API_ID_hipStreamGetFlags:
+      if (data->args.hipStreamGetFlags.flags) data->args.hipStreamGetFlags.flags__val = *(data->args.hipStreamGetFlags.flags);
+      break;
+// hipStreamGetId[('hipStream_t', 'stream'), ('unsigned long long*', 'streamId')]
+    case HIP_API_ID_hipStreamGetId:
+      if (data->args.hipStreamGetId.streamId) data->args.hipStreamGetId.streamId__val = *(data->args.hipStreamGetId.streamId);
+      break;
+// hipStreamGetPriority[('hipStream_t', 'stream'), ('int*', 'priority')]
+    case HIP_API_ID_hipStreamGetPriority:
+      if (data->args.hipStreamGetPriority.priority) data->args.hipStreamGetPriority.priority__val = *(data->args.hipStreamGetPriority.priority);
+      break;
+// hipStreamIsCapturing[('hipStream_t', 'stream'), ('hipStreamCaptureStatus*', 'pCaptureStatus')]
+    case HIP_API_ID_hipStreamIsCapturing:
+      if (data->args.hipStreamIsCapturing.pCaptureStatus) data->args.hipStreamIsCapturing.pCaptureStatus__val = *(data->args.hipStreamIsCapturing.pCaptureStatus);
+      break;
+// hipStreamQuery[('hipStream_t', 'stream')]
+    case HIP_API_ID_hipStreamQuery:
+      break;
+// hipStreamSetAttribute[('hipStream_t', 'stream'), ('hipLaunchAttributeID', 'attr'), ('const hipLaunchAttributeValue*', 'value')]
+    case HIP_API_ID_hipStreamSetAttribute:
+      if (data->args.hipStreamSetAttribute.value) data->args.hipStreamSetAttribute.value__val = *(data->args.hipStreamSetAttribute.value);
+      break;
+// hipStreamSynchronize[('hipStream_t', 'stream')]
+    case HIP_API_ID_hipStreamSynchronize:
+      break;
+// hipStreamUpdateCaptureDependencies[('hipStream_t', 'stream'), ('hipGraphNode_t*', 'dependencies'), ('size_t', 'numDependencies'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipStreamUpdateCaptureDependencies:
+      if (data->args.hipStreamUpdateCaptureDependencies.dependencies) data->args.hipStreamUpdateCaptureDependencies.dependencies__val = *(data->args.hipStreamUpdateCaptureDependencies.dependencies);
+      break;
+// hipStreamWaitEvent[('hipStream_t', 'stream'), ('hipEvent_t', 'event'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipStreamWaitEvent:
+      break;
+// hipStreamWaitValue32[('hipStream_t', 'stream'), ('void*', 'ptr'), ('unsigned int', 'value'), ('unsigned int', 'flags'), ('unsigned int', 'mask')]
+    case HIP_API_ID_hipStreamWaitValue32:
+      break;
+// hipStreamWaitValue64[('hipStream_t', 'stream'), ('void*', 'ptr'), ('uint64_t', 'value'), ('unsigned int', 'flags'), ('uint64_t', 'mask')]
+    case HIP_API_ID_hipStreamWaitValue64:
+      break;
+// hipStreamWriteValue32[('hipStream_t', 'stream'), ('void*', 'ptr'), ('unsigned int', 'value'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipStreamWriteValue32:
+      break;
+// hipStreamWriteValue64[('hipStream_t', 'stream'), ('void*', 'ptr'), ('uint64_t', 'value'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipStreamWriteValue64:
+      break;
+// hipTexRefGetAddress[('hipDeviceptr_t*', 'dev_ptr'), ('const textureReference*', 'texRef')]
+    case HIP_API_ID_hipTexRefGetAddress:
+      if (data->args.hipTexRefGetAddress.dev_ptr) data->args.hipTexRefGetAddress.dev_ptr__val = *(data->args.hipTexRefGetAddress.dev_ptr);
+      if (data->args.hipTexRefGetAddress.texRef) data->args.hipTexRefGetAddress.texRef__val = *(data->args.hipTexRefGetAddress.texRef);
+      break;
+// hipTexRefGetArray[('hipArray_t*', 'pArray'), ('const textureReference*', 'texRef')]
+    case HIP_API_ID_hipTexRefGetArray:
+      if (data->args.hipTexRefGetArray.pArray) data->args.hipTexRefGetArray.pArray__val = *(data->args.hipTexRefGetArray.pArray);
+      if (data->args.hipTexRefGetArray.texRef) data->args.hipTexRefGetArray.texRef__val = *(data->args.hipTexRefGetArray.texRef);
+      break;
+// hipTexRefGetBorderColor[('float*', 'pBorderColor'), ('const textureReference*', 'texRef')]
+    case HIP_API_ID_hipTexRefGetBorderColor:
+      if (data->args.hipTexRefGetBorderColor.pBorderColor) data->args.hipTexRefGetBorderColor.pBorderColor__val = *(data->args.hipTexRefGetBorderColor.pBorderColor);
+      if (data->args.hipTexRefGetBorderColor.texRef) data->args.hipTexRefGetBorderColor.texRef__val = *(data->args.hipTexRefGetBorderColor.texRef);
+      break;
+// hipTexRefGetFlags[('unsigned int*', 'pFlags'), ('const textureReference*', 'texRef')]
+    case HIP_API_ID_hipTexRefGetFlags:
+      if (data->args.hipTexRefGetFlags.pFlags) data->args.hipTexRefGetFlags.pFlags__val = *(data->args.hipTexRefGetFlags.pFlags);
+      if (data->args.hipTexRefGetFlags.texRef) data->args.hipTexRefGetFlags.texRef__val = *(data->args.hipTexRefGetFlags.texRef);
+      break;
+// hipTexRefGetFormat[('hipArray_Format*', 'pFormat'), ('int*', 'pNumChannels'), ('const textureReference*', 'texRef')]
+    case HIP_API_ID_hipTexRefGetFormat:
+      if (data->args.hipTexRefGetFormat.pFormat) data->args.hipTexRefGetFormat.pFormat__val = *(data->args.hipTexRefGetFormat.pFormat);
+      if (data->args.hipTexRefGetFormat.pNumChannels) data->args.hipTexRefGetFormat.pNumChannels__val = *(data->args.hipTexRefGetFormat.pNumChannels);
+      if (data->args.hipTexRefGetFormat.texRef) data->args.hipTexRefGetFormat.texRef__val = *(data->args.hipTexRefGetFormat.texRef);
+      break;
+// hipTexRefGetMaxAnisotropy[('int*', 'pmaxAnsio'), ('const textureReference*', 'texRef')]
+    case HIP_API_ID_hipTexRefGetMaxAnisotropy:
+      if (data->args.hipTexRefGetMaxAnisotropy.pmaxAnsio) data->args.hipTexRefGetMaxAnisotropy.pmaxAnsio__val = *(data->args.hipTexRefGetMaxAnisotropy.pmaxAnsio);
+      if (data->args.hipTexRefGetMaxAnisotropy.texRef) data->args.hipTexRefGetMaxAnisotropy.texRef__val = *(data->args.hipTexRefGetMaxAnisotropy.texRef);
+      break;
+// hipTexRefGetMipMappedArray[('hipMipmappedArray_t*', 'pArray'), ('const textureReference*', 'texRef')]
+    case HIP_API_ID_hipTexRefGetMipMappedArray:
+      if (data->args.hipTexRefGetMipMappedArray.pArray) data->args.hipTexRefGetMipMappedArray.pArray__val = *(data->args.hipTexRefGetMipMappedArray.pArray);
+      if (data->args.hipTexRefGetMipMappedArray.texRef) data->args.hipTexRefGetMipMappedArray.texRef__val = *(data->args.hipTexRefGetMipMappedArray.texRef);
+      break;
+// hipTexRefGetMipmapLevelBias[('float*', 'pbias'), ('const textureReference*', 'texRef')]
+    case HIP_API_ID_hipTexRefGetMipmapLevelBias:
+      if (data->args.hipTexRefGetMipmapLevelBias.pbias) data->args.hipTexRefGetMipmapLevelBias.pbias__val = *(data->args.hipTexRefGetMipmapLevelBias.pbias);
+      if (data->args.hipTexRefGetMipmapLevelBias.texRef) data->args.hipTexRefGetMipmapLevelBias.texRef__val = *(data->args.hipTexRefGetMipmapLevelBias.texRef);
+      break;
+// hipTexRefGetMipmapLevelClamp[('float*', 'pminMipmapLevelClamp'), ('float*', 'pmaxMipmapLevelClamp'), ('const textureReference*', 'texRef')]
+    case HIP_API_ID_hipTexRefGetMipmapLevelClamp:
+      if (data->args.hipTexRefGetMipmapLevelClamp.pminMipmapLevelClamp) data->args.hipTexRefGetMipmapLevelClamp.pminMipmapLevelClamp__val = *(data->args.hipTexRefGetMipmapLevelClamp.pminMipmapLevelClamp);
+      if (data->args.hipTexRefGetMipmapLevelClamp.pmaxMipmapLevelClamp) data->args.hipTexRefGetMipmapLevelClamp.pmaxMipmapLevelClamp__val = *(data->args.hipTexRefGetMipmapLevelClamp.pmaxMipmapLevelClamp);
+      if (data->args.hipTexRefGetMipmapLevelClamp.texRef) data->args.hipTexRefGetMipmapLevelClamp.texRef__val = *(data->args.hipTexRefGetMipmapLevelClamp.texRef);
+      break;
+// hipTexRefSetAddress[('size_t*', 'ByteOffset'), ('textureReference*', 'texRef'), ('hipDeviceptr_t', 'dptr'), ('size_t', 'bytes')]
+    case HIP_API_ID_hipTexRefSetAddress:
+      if (data->args.hipTexRefSetAddress.ByteOffset) data->args.hipTexRefSetAddress.ByteOffset__val = *(data->args.hipTexRefSetAddress.ByteOffset);
+      if (data->args.hipTexRefSetAddress.texRef) data->args.hipTexRefSetAddress.texRef__val = *(data->args.hipTexRefSetAddress.texRef);
+      break;
+// hipTexRefSetAddress2D[('textureReference*', 'texRef'), ('const HIP_ARRAY_DESCRIPTOR*', 'desc'), ('hipDeviceptr_t', 'dptr'), ('size_t', 'Pitch')]
+    case HIP_API_ID_hipTexRefSetAddress2D:
+      if (data->args.hipTexRefSetAddress2D.texRef) data->args.hipTexRefSetAddress2D.texRef__val = *(data->args.hipTexRefSetAddress2D.texRef);
+      if (data->args.hipTexRefSetAddress2D.desc) data->args.hipTexRefSetAddress2D.desc__val = *(data->args.hipTexRefSetAddress2D.desc);
+      break;
+// hipTexRefSetArray[('textureReference*', 'tex'), ('hipArray_const_t', 'array'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipTexRefSetArray:
+      if (data->args.hipTexRefSetArray.tex) data->args.hipTexRefSetArray.tex__val = *(data->args.hipTexRefSetArray.tex);
+      break;
+// hipTexRefSetBorderColor[('textureReference*', 'texRef'), ('float*', 'pBorderColor')]
+    case HIP_API_ID_hipTexRefSetBorderColor:
+      if (data->args.hipTexRefSetBorderColor.texRef) data->args.hipTexRefSetBorderColor.texRef__val = *(data->args.hipTexRefSetBorderColor.texRef);
+      if (data->args.hipTexRefSetBorderColor.pBorderColor) data->args.hipTexRefSetBorderColor.pBorderColor__val = *(data->args.hipTexRefSetBorderColor.pBorderColor);
+      break;
+// hipTexRefSetFlags[('textureReference*', 'texRef'), ('unsigned int', 'Flags')]
+    case HIP_API_ID_hipTexRefSetFlags:
+      if (data->args.hipTexRefSetFlags.texRef) data->args.hipTexRefSetFlags.texRef__val = *(data->args.hipTexRefSetFlags.texRef);
+      break;
+// hipTexRefSetFormat[('textureReference*', 'texRef'), ('hipArray_Format', 'fmt'), ('int', 'NumPackedComponents')]
+    case HIP_API_ID_hipTexRefSetFormat:
+      if (data->args.hipTexRefSetFormat.texRef) data->args.hipTexRefSetFormat.texRef__val = *(data->args.hipTexRefSetFormat.texRef);
+      break;
+// hipTexRefSetMaxAnisotropy[('textureReference*', 'texRef'), ('unsigned int', 'maxAniso')]
+    case HIP_API_ID_hipTexRefSetMaxAnisotropy:
+      if (data->args.hipTexRefSetMaxAnisotropy.texRef) data->args.hipTexRefSetMaxAnisotropy.texRef__val = *(data->args.hipTexRefSetMaxAnisotropy.texRef);
+      break;
+// hipTexRefSetMipmapLevelBias[('textureReference*', 'texRef'), ('float', 'bias')]
+    case HIP_API_ID_hipTexRefSetMipmapLevelBias:
+      if (data->args.hipTexRefSetMipmapLevelBias.texRef) data->args.hipTexRefSetMipmapLevelBias.texRef__val = *(data->args.hipTexRefSetMipmapLevelBias.texRef);
+      break;
+// hipTexRefSetMipmapLevelClamp[('textureReference*', 'texRef'), ('float', 'minMipMapLevelClamp'), ('float', 'maxMipMapLevelClamp')]
+    case HIP_API_ID_hipTexRefSetMipmapLevelClamp:
+      if (data->args.hipTexRefSetMipmapLevelClamp.texRef) data->args.hipTexRefSetMipmapLevelClamp.texRef__val = *(data->args.hipTexRefSetMipmapLevelClamp.texRef);
+      break;
+// hipTexRefSetMipmappedArray[('textureReference*', 'texRef'), ('hipMipmappedArray*', 'mipmappedArray'), ('unsigned int', 'Flags')]
+    case HIP_API_ID_hipTexRefSetMipmappedArray:
+      if (data->args.hipTexRefSetMipmappedArray.texRef) data->args.hipTexRefSetMipmappedArray.texRef__val = *(data->args.hipTexRefSetMipmappedArray.texRef);
+      if (data->args.hipTexRefSetMipmappedArray.mipmappedArray) data->args.hipTexRefSetMipmappedArray.mipmappedArray__val = *(data->args.hipTexRefSetMipmappedArray.mipmappedArray);
+      break;
+// hipThreadExchangeStreamCaptureMode[('hipStreamCaptureMode*', 'mode')]
+    case HIP_API_ID_hipThreadExchangeStreamCaptureMode:
+      if (data->args.hipThreadExchangeStreamCaptureMode.mode) data->args.hipThreadExchangeStreamCaptureMode.mode__val = *(data->args.hipThreadExchangeStreamCaptureMode.mode);
+      break;
+// hipUserObjectCreate[('hipUserObject_t*', 'object_out'), ('void*', 'ptr'), ('hipHostFn_t', 'destroy'), ('unsigned int', 'initialRefcount'), ('unsigned int', 'flags')]
+    case HIP_API_ID_hipUserObjectCreate:
+      if (data->args.hipUserObjectCreate.object_out) data->args.hipUserObjectCreate.object_out__val = *(data->args.hipUserObjectCreate.object_out);
+      break;
+// hipUserObjectRelease[('hipUserObject_t', 'object'), ('unsigned int', 'count')]
+    case HIP_API_ID_hipUserObjectRelease:
+      break;
+// hipUserObjectRetain[('hipUserObject_t', 'object'), ('unsigned int', 'count')]
+    case HIP_API_ID_hipUserObjectRetain:
+      break;
+// hipWaitExternalSemaphoresAsync[('const hipExternalSemaphore_t*', 'extSemArray'), ('const hipExternalSemaphoreWaitParams*', 'paramsArray'), ('unsigned int', 'numExtSems'), ('hipStream_t', 'stream')]
+    case HIP_API_ID_hipWaitExternalSemaphoresAsync:
+      if (data->args.hipWaitExternalSemaphoresAsync.extSemArray) data->args.hipWaitExternalSemaphoresAsync.extSemArray__val = *(data->args.hipWaitExternalSemaphoresAsync.extSemArray);
+      if (data->args.hipWaitExternalSemaphoresAsync.paramsArray) data->args.hipWaitExternalSemaphoresAsync.paramsArray__val = *(data->args.hipWaitExternalSemaphoresAsync.paramsArray);
+      break;
+    default: break;
+  };
+}
+
+#include <sstream>
+#include <string>
+// Formats a HIP API call as a string: the method name followed by its parameter names and values
+static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* data) {
+  std::ostringstream oss;
+  switch (id) {
+    case HIP_API_ID___hipPopCallConfiguration:
+      oss << "__hipPopCallConfiguration(";
+      if (data->args.__hipPopCallConfiguration.gridDim == NULL) oss << "gridDim=NULL";
+      else { oss << "gridDim="; roctracer::hip_support::detail::operator<<(oss, data->args.__hipPopCallConfiguration.gridDim__val); }
+      if (data->args.__hipPopCallConfiguration.blockDim == NULL) oss << ", blockDim=NULL";
+      else { oss << ", blockDim="; roctracer::hip_support::detail::operator<<(oss, data->args.__hipPopCallConfiguration.blockDim__val); }
+      if (data->args.__hipPopCallConfiguration.sharedMem == NULL) oss << ", sharedMem=NULL";
+      else { oss << ", sharedMem="; roctracer::hip_support::detail::operator<<(oss, data->args.__hipPopCallConfiguration.sharedMem__val); }
+      if (data->args.__hipPopCallConfiguration.stream == NULL) oss << ", stream=NULL";
+      else { oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.__hipPopCallConfiguration.stream__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID___hipPushCallConfiguration:
+      oss << "__hipPushCallConfiguration(";
+      oss << "gridDim="; roctracer::hip_support::detail::operator<<(oss, data->args.__hipPushCallConfiguration.gridDim);
+      oss << ", blockDim="; roctracer::hip_support::detail::operator<<(oss, data->args.__hipPushCallConfiguration.blockDim);
+      oss << ", sharedMem="; roctracer::hip_support::detail::operator<<(oss, data->args.__hipPushCallConfiguration.sharedMem);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.__hipPushCallConfiguration.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipArray3DCreate:
+      oss << "hipArray3DCreate(";
+      if (data->args.hipArray3DCreate.array == NULL) oss << "array=NULL";
+      else { oss << "array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArray3DCreate.array__val); }
+      if (data->args.hipArray3DCreate.pAllocateArray == NULL) oss << ", pAllocateArray=NULL";
+      else { oss << ", pAllocateArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArray3DCreate.pAllocateArray__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipArray3DGetDescriptor:
+      oss << "hipArray3DGetDescriptor(";
+      if (data->args.hipArray3DGetDescriptor.pArrayDescriptor == NULL) oss << "pArrayDescriptor=NULL";
+      else { oss << "pArrayDescriptor="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArray3DGetDescriptor.pArrayDescriptor__val); }
+      oss << ", array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArray3DGetDescriptor.array);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipArrayCreate:
+      oss << "hipArrayCreate(";
+      if (data->args.hipArrayCreate.pHandle == NULL) oss << "pHandle=NULL";
+      else { oss << "pHandle="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayCreate.pHandle__val); }
+      if (data->args.hipArrayCreate.pAllocateArray == NULL) oss << ", pAllocateArray=NULL";
+      else { oss << ", pAllocateArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayCreate.pAllocateArray__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipArrayDestroy:
+      oss << "hipArrayDestroy(";
+      oss << "array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayDestroy.array);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipArrayGetDescriptor:
+      oss << "hipArrayGetDescriptor(";
+      if (data->args.hipArrayGetDescriptor.pArrayDescriptor == NULL) oss << "pArrayDescriptor=NULL";
+      else { oss << "pArrayDescriptor="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetDescriptor.pArrayDescriptor__val); }
+      oss << ", array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetDescriptor.array);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipArrayGetInfo:
+      oss << "hipArrayGetInfo(";
+      if (data->args.hipArrayGetInfo.desc == NULL) oss << "desc=NULL";
+      else { oss << "desc="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetInfo.desc__val); }
+      if (data->args.hipArrayGetInfo.extent == NULL) oss << ", extent=NULL";
+      else { oss << ", extent="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetInfo.extent__val); }
+      if (data->args.hipArrayGetInfo.flags == NULL) oss << ", flags=NULL";
+      else { oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetInfo.flags__val); }
+      oss << ", array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetInfo.array);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipChooseDeviceR0000:
+      oss << "hipChooseDeviceR0000(";
+      if (data->args.hipChooseDeviceR0000.device == NULL) oss << "device=NULL";
+      else { oss << "device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipChooseDeviceR0000.device__val); }
+      if (data->args.hipChooseDeviceR0000.prop == NULL) oss << ", prop=NULL";
+      else { oss << ", prop="; roctracer::hip_support::detail::operator<<(oss, data->args.hipChooseDeviceR0000.prop__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipChooseDeviceR0600:
+      oss << "hipChooseDeviceR0600(";
+      if (data->args.hipChooseDeviceR0600.device == NULL) oss << "device=NULL";
+      else { oss << "device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipChooseDeviceR0600.device__val); }
+      if (data->args.hipChooseDeviceR0600.prop == NULL) oss << ", prop=NULL";
+      else { oss << ", prop="; roctracer::hip_support::detail::operator<<(oss, data->args.hipChooseDeviceR0600.prop__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipConfigureCall:
+      oss << "hipConfigureCall(";
+      oss << "gridDim="; roctracer::hip_support::detail::operator<<(oss, data->args.hipConfigureCall.gridDim);
+      oss << ", blockDim="; roctracer::hip_support::detail::operator<<(oss, data->args.hipConfigureCall.blockDim);
+      oss << ", sharedMem="; roctracer::hip_support::detail::operator<<(oss, data->args.hipConfigureCall.sharedMem);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipConfigureCall.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipCreateSurfaceObject:
+      oss << "hipCreateSurfaceObject(";
+      if (data->args.hipCreateSurfaceObject.pSurfObject == NULL) oss << "pSurfObject=NULL";
+      else { oss << "pSurfObject="; roctracer::hip_support::detail::operator<<(oss, data->args.hipCreateSurfaceObject.pSurfObject__val); }
+      if (data->args.hipCreateSurfaceObject.pResDesc == NULL) oss << ", pResDesc=NULL";
+      else { oss << ", pResDesc="; roctracer::hip_support::detail::operator<<(oss, data->args.hipCreateSurfaceObject.pResDesc__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipCtxCreate:
+      oss << "hipCtxCreate(";
+      if (data->args.hipCtxCreate.ctx == NULL) oss << "ctx=NULL";
+      else { oss << "ctx="; roctracer::hip_support::detail::operator<<(oss, data->args.hipCtxCreate.ctx__val); }
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipCtxCreate.flags);
+      oss << ", device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipCtxCreate.device);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipCtxDestroy:
+      oss << "hipCtxDestroy(";
+      oss << "ctx="; roctracer::hip_support::detail::operator<<(oss, data->args.hipCtxDestroy.ctx);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipCtxDisablePeerAccess:
+      oss << "hipCtxDisablePeerAccess(";
+      oss << "peerCtx="; roctracer::hip_support::detail::operator<<(oss, data->args.hipCtxDisablePeerAccess.peerCtx);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipCtxEnablePeerAccess:
+      oss << "hipCtxEnablePeerAccess(";
+      oss << "peerCtx="; roctracer::hip_support::detail::operator<<(oss, data->args.hipCtxEnablePeerAccess.peerCtx);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipCtxEnablePeerAccess.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipCtxGetApiVersion:
+      oss << "hipCtxGetApiVersion(";
+      oss << "ctx="; roctracer::hip_support::detail::operator<<(oss, data->args.hipCtxGetApiVersion.ctx);
+      if (data->args.hipCtxGetApiVersion.apiVersion == NULL) oss << ", apiVersion=NULL";
+      else { oss << ", apiVersion="; roctracer::hip_support::detail::operator<<(oss, data->args.hipCtxGetApiVersion.apiVersion__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipCtxGetCacheConfig:
+      oss << "hipCtxGetCacheConfig(";
+      if (data->args.hipCtxGetCacheConfig.cacheConfig == NULL) oss << "cacheConfig=NULL";
+      else { oss << "cacheConfig="; roctracer::hip_support::detail::operator<<(oss, data->args.hipCtxGetCacheConfig.cacheConfig__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipCtxGetCurrent:
+      oss << "hipCtxGetCurrent(";
+      if (data->args.hipCtxGetCurrent.ctx == NULL) oss << "ctx=NULL";
+      else { oss << "ctx="; roctracer::hip_support::detail::operator<<(oss, data->args.hipCtxGetCurrent.ctx__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipCtxGetDevice:
+      oss << "hipCtxGetDevice(";
+      if (data->args.hipCtxGetDevice.device == NULL) oss << "device=NULL";
+      else { oss << "device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipCtxGetDevice.device__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipCtxGetFlags:
+      oss << "hipCtxGetFlags(";
+      if (data->args.hipCtxGetFlags.flags == NULL) oss << "flags=NULL";
+      else { oss << "flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipCtxGetFlags.flags__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipCtxGetSharedMemConfig:
+      oss << "hipCtxGetSharedMemConfig(";
+      if (data->args.hipCtxGetSharedMemConfig.pConfig == NULL) oss << "pConfig=NULL";
+      else { oss << "pConfig="; roctracer::hip_support::detail::operator<<(oss, data->args.hipCtxGetSharedMemConfig.pConfig__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipCtxPopCurrent:
+      oss << "hipCtxPopCurrent(";
+      if (data->args.hipCtxPopCurrent.ctx == NULL) oss << "ctx=NULL";
+      else { oss << "ctx="; roctracer::hip_support::detail::operator<<(oss, data->args.hipCtxPopCurrent.ctx__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipCtxPushCurrent:
+      oss << "hipCtxPushCurrent(";
+      oss << "ctx="; roctracer::hip_support::detail::operator<<(oss, data->args.hipCtxPushCurrent.ctx);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipCtxSetCacheConfig:
+      oss << "hipCtxSetCacheConfig(";
+      oss << "cacheConfig="; roctracer::hip_support::detail::operator<<(oss, data->args.hipCtxSetCacheConfig.cacheConfig);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipCtxSetCurrent:
+      oss << "hipCtxSetCurrent(";
+      oss << "ctx="; roctracer::hip_support::detail::operator<<(oss, data->args.hipCtxSetCurrent.ctx);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipCtxSetSharedMemConfig:
+      oss << "hipCtxSetSharedMemConfig(";
+      oss << "config="; roctracer::hip_support::detail::operator<<(oss, data->args.hipCtxSetSharedMemConfig.config);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipCtxSynchronize:
+      oss << "hipCtxSynchronize(";
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDestroyExternalMemory:
+      oss << "hipDestroyExternalMemory(";
+      oss << "extMem="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDestroyExternalMemory.extMem);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDestroyExternalSemaphore:
+      oss << "hipDestroyExternalSemaphore(";
+      oss << "extSem="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDestroyExternalSemaphore.extSem);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDestroySurfaceObject:
+      oss << "hipDestroySurfaceObject(";
+      oss << "surfaceObject="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDestroySurfaceObject.surfaceObject);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceCanAccessPeer:
+      oss << "hipDeviceCanAccessPeer(";
+      if (data->args.hipDeviceCanAccessPeer.canAccessPeer == NULL) oss << "canAccessPeer=NULL";
+      else { oss << "canAccessPeer="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceCanAccessPeer.canAccessPeer__val); }
+      oss << ", deviceId="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceCanAccessPeer.deviceId);
+      oss << ", peerDeviceId="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceCanAccessPeer.peerDeviceId);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceComputeCapability:
+      oss << "hipDeviceComputeCapability(";
+      if (data->args.hipDeviceComputeCapability.major == NULL) oss << "major=NULL";
+      else { oss << "major="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceComputeCapability.major__val); }
+      if (data->args.hipDeviceComputeCapability.minor == NULL) oss << ", minor=NULL";
+      else { oss << ", minor="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceComputeCapability.minor__val); }
+      oss << ", device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceComputeCapability.device);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceDisablePeerAccess:
+      oss << "hipDeviceDisablePeerAccess(";
+      oss << "peerDeviceId="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceDisablePeerAccess.peerDeviceId);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceEnablePeerAccess:
+      oss << "hipDeviceEnablePeerAccess(";
+      oss << "peerDeviceId="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceEnablePeerAccess.peerDeviceId);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceEnablePeerAccess.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceGet:
+      oss << "hipDeviceGet(";
+      if (data->args.hipDeviceGet.device == NULL) oss << "device=NULL";
+      else { oss << "device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGet.device__val); }
+      oss << ", ordinal="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGet.ordinal);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceGetAttribute:
+      oss << "hipDeviceGetAttribute(";
+      if (data->args.hipDeviceGetAttribute.pi == NULL) oss << "pi=NULL";
+      else { oss << "pi="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetAttribute.pi__val); }
+      oss << ", attr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetAttribute.attr);
+      oss << ", deviceId="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetAttribute.deviceId);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceGetByPCIBusId:
+      oss << "hipDeviceGetByPCIBusId(";
+      if (data->args.hipDeviceGetByPCIBusId.device == NULL) oss << "device=NULL";
+      else { oss << "device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetByPCIBusId.device__val); }
+      if (data->args.hipDeviceGetByPCIBusId.pciBusId == NULL) oss << ", pciBusId=NULL";
+      else { oss << ", pciBusId="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetByPCIBusId.pciBusId__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceGetCacheConfig:
+      oss << "hipDeviceGetCacheConfig(";
+      if (data->args.hipDeviceGetCacheConfig.cacheConfig == NULL) oss << "cacheConfig=NULL";
+      else { oss << "cacheConfig="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetCacheConfig.cacheConfig__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceGetDefaultMemPool:
+      oss << "hipDeviceGetDefaultMemPool(";
+      if (data->args.hipDeviceGetDefaultMemPool.mem_pool == NULL) oss << "mem_pool=NULL";
+      else { oss << "mem_pool="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetDefaultMemPool.mem_pool__val); }
+      oss << ", device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetDefaultMemPool.device);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceGetGraphMemAttribute:
+      oss << "hipDeviceGetGraphMemAttribute(";
+      oss << "device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetGraphMemAttribute.device);
+      oss << ", attr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetGraphMemAttribute.attr);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetGraphMemAttribute.value);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceGetLimit:
+      oss << "hipDeviceGetLimit(";
+      if (data->args.hipDeviceGetLimit.pValue == NULL) oss << "pValue=NULL";
+      else { oss << "pValue="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetLimit.pValue__val); }
+      oss << ", limit="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetLimit.limit);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceGetMemPool:
+      oss << "hipDeviceGetMemPool(";
+      if (data->args.hipDeviceGetMemPool.mem_pool == NULL) oss << "mem_pool=NULL";
+      else { oss << "mem_pool="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetMemPool.mem_pool__val); }
+      oss << ", device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetMemPool.device);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceGetName:
+      oss << "hipDeviceGetName(";
+      if (data->args.hipDeviceGetName.name == NULL) oss << "name=NULL";
+      else { oss << "name="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetName.name__val); }
+      oss << ", len="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetName.len);
+      oss << ", device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetName.device);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceGetP2PAttribute:
+      oss << "hipDeviceGetP2PAttribute(";
+      if (data->args.hipDeviceGetP2PAttribute.value == NULL) oss << "value=NULL";
+      else { oss << "value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetP2PAttribute.value__val); }
+      oss << ", attr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetP2PAttribute.attr);
+      oss << ", srcDevice="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetP2PAttribute.srcDevice);
+      oss << ", dstDevice="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetP2PAttribute.dstDevice);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceGetPCIBusId:
+      oss << "hipDeviceGetPCIBusId(";
+      if (data->args.hipDeviceGetPCIBusId.pciBusId == NULL) oss << "pciBusId=NULL";
+      else { oss << "pciBusId="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetPCIBusId.pciBusId__val); }
+      oss << ", len="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetPCIBusId.len);
+      oss << ", device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetPCIBusId.device);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceGetSharedMemConfig:
+      oss << "hipDeviceGetSharedMemConfig(";
+      if (data->args.hipDeviceGetSharedMemConfig.pConfig == NULL) oss << "pConfig=NULL";
+      else { oss << "pConfig="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetSharedMemConfig.pConfig__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceGetStreamPriorityRange:
+      oss << "hipDeviceGetStreamPriorityRange(";
+      if (data->args.hipDeviceGetStreamPriorityRange.leastPriority == NULL) oss << "leastPriority=NULL";
+      else { oss << "leastPriority="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetStreamPriorityRange.leastPriority__val); }
+      if (data->args.hipDeviceGetStreamPriorityRange.greatestPriority == NULL) oss << ", greatestPriority=NULL";
+      else { oss << ", greatestPriority="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetStreamPriorityRange.greatestPriority__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceGetUuid:
+      oss << "hipDeviceGetUuid(";
+      if (data->args.hipDeviceGetUuid.uuid == NULL) oss << "uuid=NULL";
+      else { oss << "uuid="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetUuid.uuid__val); }
+      oss << ", device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGetUuid.device);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceGraphMemTrim:
+      oss << "hipDeviceGraphMemTrim(";
+      oss << "device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceGraphMemTrim.device);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDevicePrimaryCtxGetState:
+      oss << "hipDevicePrimaryCtxGetState(";
+      oss << "dev="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDevicePrimaryCtxGetState.dev);
+      if (data->args.hipDevicePrimaryCtxGetState.flags == NULL) oss << ", flags=NULL";
+      else { oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDevicePrimaryCtxGetState.flags__val); }
+      if (data->args.hipDevicePrimaryCtxGetState.active == NULL) oss << ", active=NULL";
+      else { oss << ", active="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDevicePrimaryCtxGetState.active__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDevicePrimaryCtxRelease:
+      oss << "hipDevicePrimaryCtxRelease(";
+      oss << "dev="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDevicePrimaryCtxRelease.dev);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDevicePrimaryCtxReset:
+      oss << "hipDevicePrimaryCtxReset(";
+      oss << "dev="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDevicePrimaryCtxReset.dev);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDevicePrimaryCtxRetain:
+      oss << "hipDevicePrimaryCtxRetain(";
+      if (data->args.hipDevicePrimaryCtxRetain.pctx == NULL) oss << "pctx=NULL";
+      else { oss << "pctx="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDevicePrimaryCtxRetain.pctx__val); }
+      oss << ", dev="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDevicePrimaryCtxRetain.dev);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDevicePrimaryCtxSetFlags:
+      oss << "hipDevicePrimaryCtxSetFlags(";
+      oss << "dev="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDevicePrimaryCtxSetFlags.dev);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDevicePrimaryCtxSetFlags.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceReset:
+      oss << "hipDeviceReset(";
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceSetCacheConfig:
+      oss << "hipDeviceSetCacheConfig(";
+      oss << "cacheConfig="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceSetCacheConfig.cacheConfig);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceSetGraphMemAttribute:
+      oss << "hipDeviceSetGraphMemAttribute(";
+      oss << "device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceSetGraphMemAttribute.device);
+      oss << ", attr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceSetGraphMemAttribute.attr);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceSetGraphMemAttribute.value);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceSetLimit:
+      oss << "hipDeviceSetLimit(";
+      oss << "limit="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceSetLimit.limit);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceSetLimit.value);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceSetMemPool:
+      oss << "hipDeviceSetMemPool(";
+      oss << "device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceSetMemPool.device);
+      oss << ", mem_pool="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceSetMemPool.mem_pool);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceSetSharedMemConfig:
+      oss << "hipDeviceSetSharedMemConfig(";
+      oss << "config="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceSetSharedMemConfig.config);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceSynchronize:
+      oss << "hipDeviceSynchronize(";
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDeviceTotalMem:
+      oss << "hipDeviceTotalMem(";
+      if (data->args.hipDeviceTotalMem.bytes == NULL) oss << "bytes=NULL";
+      else { oss << "bytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceTotalMem.bytes__val); }
+      oss << ", device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDeviceTotalMem.device);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDriverGetVersion:
+      oss << "hipDriverGetVersion(";
+      if (data->args.hipDriverGetVersion.driverVersion == NULL) oss << "driverVersion=NULL";
+      else { oss << "driverVersion="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDriverGetVersion.driverVersion__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDrvGraphAddMemFreeNode:
+      oss << "hipDrvGraphAddMemFreeNode(";
+      if (data->args.hipDrvGraphAddMemFreeNode.phGraphNode == NULL) oss << "phGraphNode=NULL";
+      else { oss << "phGraphNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphAddMemFreeNode.phGraphNode__val); }
+      oss << ", hGraph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphAddMemFreeNode.hGraph);
+      if (data->args.hipDrvGraphAddMemFreeNode.dependencies == NULL) oss << ", dependencies=NULL";
+      else { oss << ", dependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphAddMemFreeNode.dependencies__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphAddMemFreeNode.numDependencies);
+      oss << ", dptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphAddMemFreeNode.dptr);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDrvGraphAddMemcpyNode:
+      oss << "hipDrvGraphAddMemcpyNode(";
+      if (data->args.hipDrvGraphAddMemcpyNode.phGraphNode == NULL) oss << "phGraphNode=NULL";
+      else { oss << "phGraphNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphAddMemcpyNode.phGraphNode__val); }
+      oss << ", hGraph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphAddMemcpyNode.hGraph);
+      if (data->args.hipDrvGraphAddMemcpyNode.dependencies == NULL) oss << ", dependencies=NULL";
+      else { oss << ", dependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphAddMemcpyNode.dependencies__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphAddMemcpyNode.numDependencies);
+      if (data->args.hipDrvGraphAddMemcpyNode.copyParams == NULL) oss << ", copyParams=NULL";
+      else { oss << ", copyParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphAddMemcpyNode.copyParams__val); }
+      oss << ", ctx="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphAddMemcpyNode.ctx);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDrvGraphAddMemsetNode:
+      oss << "hipDrvGraphAddMemsetNode(";
+      if (data->args.hipDrvGraphAddMemsetNode.phGraphNode == NULL) oss << "phGraphNode=NULL";
+      else { oss << "phGraphNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphAddMemsetNode.phGraphNode__val); }
+      oss << ", hGraph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphAddMemsetNode.hGraph);
+      if (data->args.hipDrvGraphAddMemsetNode.dependencies == NULL) oss << ", dependencies=NULL";
+      else { oss << ", dependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphAddMemsetNode.dependencies__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphAddMemsetNode.numDependencies);
+      if (data->args.hipDrvGraphAddMemsetNode.memsetParams == NULL) oss << ", memsetParams=NULL";
+      else { oss << ", memsetParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphAddMemsetNode.memsetParams__val); }
+      oss << ", ctx="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphAddMemsetNode.ctx);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDrvGraphExecMemcpyNodeSetParams:
+      oss << "hipDrvGraphExecMemcpyNodeSetParams(";
+      oss << "hGraphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphExecMemcpyNodeSetParams.hGraphExec);
+      oss << ", hNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphExecMemcpyNodeSetParams.hNode);
+      if (data->args.hipDrvGraphExecMemcpyNodeSetParams.copyParams == NULL) oss << ", copyParams=NULL";
+      else { oss << ", copyParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphExecMemcpyNodeSetParams.copyParams__val); }
+      oss << ", ctx="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphExecMemcpyNodeSetParams.ctx);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDrvGraphExecMemsetNodeSetParams:
+      oss << "hipDrvGraphExecMemsetNodeSetParams(";
+      oss << "hGraphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphExecMemsetNodeSetParams.hGraphExec);
+      oss << ", hNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphExecMemsetNodeSetParams.hNode);
+      if (data->args.hipDrvGraphExecMemsetNodeSetParams.memsetParams == NULL) oss << ", memsetParams=NULL";
+      else { oss << ", memsetParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphExecMemsetNodeSetParams.memsetParams__val); }
+      oss << ", ctx="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphExecMemsetNodeSetParams.ctx);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDrvGraphMemcpyNodeGetParams:
+      oss << "hipDrvGraphMemcpyNodeGetParams(";
+      oss << "hNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphMemcpyNodeGetParams.hNode);
+      if (data->args.hipDrvGraphMemcpyNodeGetParams.nodeParams == NULL) oss << ", nodeParams=NULL";
+      else { oss << ", nodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphMemcpyNodeGetParams.nodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDrvGraphMemcpyNodeSetParams:
+      oss << "hipDrvGraphMemcpyNodeSetParams(";
+      oss << "hNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphMemcpyNodeSetParams.hNode);
+      if (data->args.hipDrvGraphMemcpyNodeSetParams.nodeParams == NULL) oss << ", nodeParams=NULL";
+      else { oss << ", nodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphMemcpyNodeSetParams.nodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDrvLaunchKernelEx:
+      oss << "hipDrvLaunchKernelEx(";
+      if (data->args.hipDrvLaunchKernelEx.config == NULL) oss << "config=NULL";
+      else { oss << "config="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvLaunchKernelEx.config__val); }
+      oss << ", f="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvLaunchKernelEx.f);
+      if (data->args.hipDrvLaunchKernelEx.params == NULL) oss << ", params=NULL";
+      else { oss << ", params="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvLaunchKernelEx.params__val); }
+      if (data->args.hipDrvLaunchKernelEx.extra == NULL) oss << ", extra=NULL";
+      else { oss << ", extra="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvLaunchKernelEx.extra__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDrvMemcpy2DUnaligned:
+      oss << "hipDrvMemcpy2DUnaligned(";
+      if (data->args.hipDrvMemcpy2DUnaligned.pCopy == NULL) oss << "pCopy=NULL";
+      else { oss << "pCopy="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvMemcpy2DUnaligned.pCopy__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDrvMemcpy3D:
+      oss << "hipDrvMemcpy3D(";
+      if (data->args.hipDrvMemcpy3D.pCopy == NULL) oss << "pCopy=NULL";
+      else { oss << "pCopy="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvMemcpy3D.pCopy__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDrvMemcpy3DAsync:
+      oss << "hipDrvMemcpy3DAsync(";
+      if (data->args.hipDrvMemcpy3DAsync.pCopy == NULL) oss << "pCopy=NULL";
+      else { oss << "pCopy="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvMemcpy3DAsync.pCopy__val); }
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvMemcpy3DAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipDrvPointerGetAttributes:
+      oss << "hipDrvPointerGetAttributes(";
+      oss << "numAttributes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvPointerGetAttributes.numAttributes);
+      if (data->args.hipDrvPointerGetAttributes.attributes == NULL) oss << ", attributes=NULL";
+      else { oss << ", attributes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvPointerGetAttributes.attributes__val); }
+      if (data->args.hipDrvPointerGetAttributes.data == NULL) oss << ", data=NULL";
+      else { oss << ", data="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvPointerGetAttributes.data__val); }
+      oss << ", ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvPointerGetAttributes.ptr);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipEventCreate:
+      oss << "hipEventCreate(";
+      if (data->args.hipEventCreate.event == NULL) oss << "event=NULL";
+      else { oss << "event="; roctracer::hip_support::detail::operator<<(oss, data->args.hipEventCreate.event__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipEventCreateWithFlags:
+      oss << "hipEventCreateWithFlags(";
+      if (data->args.hipEventCreateWithFlags.event == NULL) oss << "event=NULL";
+      else { oss << "event="; roctracer::hip_support::detail::operator<<(oss, data->args.hipEventCreateWithFlags.event__val); }
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipEventCreateWithFlags.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipEventDestroy:
+      oss << "hipEventDestroy(";
+      oss << "event="; roctracer::hip_support::detail::operator<<(oss, data->args.hipEventDestroy.event);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipEventElapsedTime:
+      oss << "hipEventElapsedTime(";
+      if (data->args.hipEventElapsedTime.ms == NULL) oss << "ms=NULL";
+      else { oss << "ms="; roctracer::hip_support::detail::operator<<(oss, data->args.hipEventElapsedTime.ms__val); }
+      oss << ", start="; roctracer::hip_support::detail::operator<<(oss, data->args.hipEventElapsedTime.start);
+      oss << ", stop="; roctracer::hip_support::detail::operator<<(oss, data->args.hipEventElapsedTime.stop);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipEventQuery:
+      oss << "hipEventQuery(";
+      oss << "event="; roctracer::hip_support::detail::operator<<(oss, data->args.hipEventQuery.event);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipEventRecord:
+      oss << "hipEventRecord(";
+      oss << "event="; roctracer::hip_support::detail::operator<<(oss, data->args.hipEventRecord.event);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipEventRecord.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipEventRecordWithFlags:
+      oss << "hipEventRecordWithFlags(";
+      oss << "event="; roctracer::hip_support::detail::operator<<(oss, data->args.hipEventRecordWithFlags.event);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipEventRecordWithFlags.stream);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipEventRecordWithFlags.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipEventSynchronize:
+      oss << "hipEventSynchronize(";
+      oss << "event="; roctracer::hip_support::detail::operator<<(oss, data->args.hipEventSynchronize.event);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipExtGetLastError:
+      oss << "hipExtGetLastError(";
+      oss << ")";
+    break;
+    case HIP_API_ID_hipExtGetLinkTypeAndHopCount:
+      oss << "hipExtGetLinkTypeAndHopCount(";
+      oss << "device1="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtGetLinkTypeAndHopCount.device1);
+      oss << ", device2="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtGetLinkTypeAndHopCount.device2);
+      if (data->args.hipExtGetLinkTypeAndHopCount.linktype == NULL) oss << ", linktype=NULL";
+      else { oss << ", linktype="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtGetLinkTypeAndHopCount.linktype__val); }
+      if (data->args.hipExtGetLinkTypeAndHopCount.hopcount == NULL) oss << ", hopcount=NULL";
+      else { oss << ", hopcount="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtGetLinkTypeAndHopCount.hopcount__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipExtLaunchKernel:
+      oss << "hipExtLaunchKernel(";
+      oss << "function_address="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtLaunchKernel.function_address);
+      oss << ", numBlocks="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtLaunchKernel.numBlocks);
+      oss << ", dimBlocks="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtLaunchKernel.dimBlocks);
+      if (data->args.hipExtLaunchKernel.args == NULL) oss << ", args=NULL";
+      else { oss << ", args="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtLaunchKernel.args__val); }
+      oss << ", sharedMemBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtLaunchKernel.sharedMemBytes);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtLaunchKernel.stream);
+      oss << ", startEvent="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtLaunchKernel.startEvent);
+      oss << ", stopEvent="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtLaunchKernel.stopEvent);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtLaunchKernel.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipExtLaunchMultiKernelMultiDevice:
+      oss << "hipExtLaunchMultiKernelMultiDevice(";
+      if (data->args.hipExtLaunchMultiKernelMultiDevice.launchParamsList == NULL) oss << "launchParamsList=NULL";
+      else { oss << "launchParamsList="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtLaunchMultiKernelMultiDevice.launchParamsList__val); }
+      oss << ", numDevices="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtLaunchMultiKernelMultiDevice.numDevices);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtLaunchMultiKernelMultiDevice.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipExtMallocWithFlags:
+      oss << "hipExtMallocWithFlags(";
+      if (data->args.hipExtMallocWithFlags.ptr == NULL) oss << "ptr=NULL";
+      else { oss << "ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtMallocWithFlags.ptr__val); }
+      oss << ", sizeBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtMallocWithFlags.sizeBytes);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtMallocWithFlags.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipExtModuleLaunchKernel:
+      oss << "hipExtModuleLaunchKernel(";
+      oss << "f="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtModuleLaunchKernel.f);
+      oss << ", globalWorkSizeX="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtModuleLaunchKernel.globalWorkSizeX);
+      oss << ", globalWorkSizeY="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtModuleLaunchKernel.globalWorkSizeY);
+      oss << ", globalWorkSizeZ="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtModuleLaunchKernel.globalWorkSizeZ);
+      oss << ", localWorkSizeX="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtModuleLaunchKernel.localWorkSizeX);
+      oss << ", localWorkSizeY="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtModuleLaunchKernel.localWorkSizeY);
+      oss << ", localWorkSizeZ="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtModuleLaunchKernel.localWorkSizeZ);
+      oss << ", sharedMemBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtModuleLaunchKernel.sharedMemBytes);
+      oss << ", hStream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtModuleLaunchKernel.hStream);
+      if (data->args.hipExtModuleLaunchKernel.kernelParams == NULL) oss << ", kernelParams=NULL";
+      else { oss << ", kernelParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtModuleLaunchKernel.kernelParams__val); }
+      if (data->args.hipExtModuleLaunchKernel.extra == NULL) oss << ", extra=NULL";
+      else { oss << ", extra="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtModuleLaunchKernel.extra__val); }
+      oss << ", startEvent="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtModuleLaunchKernel.startEvent);
+      oss << ", stopEvent="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtModuleLaunchKernel.stopEvent);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtModuleLaunchKernel.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipExtStreamCreateWithCUMask:
+      oss << "hipExtStreamCreateWithCUMask(";
+      if (data->args.hipExtStreamCreateWithCUMask.stream == NULL) oss << "stream=NULL";
+      else { oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtStreamCreateWithCUMask.stream__val); }
+      oss << ", cuMaskSize="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtStreamCreateWithCUMask.cuMaskSize);
+      if (data->args.hipExtStreamCreateWithCUMask.cuMask == NULL) oss << ", cuMask=NULL";
+      else { oss << ", cuMask="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtStreamCreateWithCUMask.cuMask__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipExtStreamGetCUMask:
+      oss << "hipExtStreamGetCUMask(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtStreamGetCUMask.stream);
+      oss << ", cuMaskSize="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtStreamGetCUMask.cuMaskSize);
+      if (data->args.hipExtStreamGetCUMask.cuMask == NULL) oss << ", cuMask=NULL";
+      else { oss << ", cuMask="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtStreamGetCUMask.cuMask__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipExternalMemoryGetMappedBuffer:
+      oss << "hipExternalMemoryGetMappedBuffer(";
+      if (data->args.hipExternalMemoryGetMappedBuffer.devPtr == NULL) oss << "devPtr=NULL";
+      else { oss << "devPtr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExternalMemoryGetMappedBuffer.devPtr__val); }
+      oss << ", extMem="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExternalMemoryGetMappedBuffer.extMem);
+      if (data->args.hipExternalMemoryGetMappedBuffer.bufferDesc == NULL) oss << ", bufferDesc=NULL";
+      else { oss << ", bufferDesc="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExternalMemoryGetMappedBuffer.bufferDesc__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipExternalMemoryGetMappedMipmappedArray:
+      oss << "hipExternalMemoryGetMappedMipmappedArray(";
+      if (data->args.hipExternalMemoryGetMappedMipmappedArray.mipmap == NULL) oss << "mipmap=NULL";
+      else { oss << "mipmap="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExternalMemoryGetMappedMipmappedArray.mipmap__val); }
+      oss << ", extMem="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExternalMemoryGetMappedMipmappedArray.extMem);
+      if (data->args.hipExternalMemoryGetMappedMipmappedArray.mipmapDesc == NULL) oss << ", mipmapDesc=NULL";
+      else { oss << ", mipmapDesc="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExternalMemoryGetMappedMipmappedArray.mipmapDesc__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipFree:
+      oss << "hipFree(";
+      oss << "ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipFree.ptr);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipFreeArray:
+      oss << "hipFreeArray(";
+      oss << "array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipFreeArray.array);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipFreeAsync:
+      oss << "hipFreeAsync(";
+      oss << "dev_ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipFreeAsync.dev_ptr);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipFreeAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipFreeHost:
+      oss << "hipFreeHost(";
+      oss << "ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipFreeHost.ptr);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipFreeMipmappedArray:
+      oss << "hipFreeMipmappedArray(";
+      oss << "mipmappedArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipFreeMipmappedArray.mipmappedArray);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipFuncGetAttribute:
+      oss << "hipFuncGetAttribute(";
+      if (data->args.hipFuncGetAttribute.value == NULL) oss << "value=NULL";
+      else { oss << "value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipFuncGetAttribute.value__val); }
+      oss << ", attrib="; roctracer::hip_support::detail::operator<<(oss, data->args.hipFuncGetAttribute.attrib);
+      oss << ", hfunc="; roctracer::hip_support::detail::operator<<(oss, data->args.hipFuncGetAttribute.hfunc);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipFuncGetAttributes:
+      oss << "hipFuncGetAttributes(";
+      if (data->args.hipFuncGetAttributes.attr == NULL) oss << "attr=NULL";
+      else { oss << "attr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipFuncGetAttributes.attr__val); }
+      oss << ", func="; roctracer::hip_support::detail::operator<<(oss, data->args.hipFuncGetAttributes.func);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipFuncSetAttribute:
+      oss << "hipFuncSetAttribute(";
+      oss << "func="; roctracer::hip_support::detail::operator<<(oss, data->args.hipFuncSetAttribute.func);
+      oss << ", attr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipFuncSetAttribute.attr);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipFuncSetAttribute.value);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipFuncSetCacheConfig:
+      oss << "hipFuncSetCacheConfig(";
+      oss << "func="; roctracer::hip_support::detail::operator<<(oss, data->args.hipFuncSetCacheConfig.func);
+      oss << ", config="; roctracer::hip_support::detail::operator<<(oss, data->args.hipFuncSetCacheConfig.config);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipFuncSetSharedMemConfig:
+      oss << "hipFuncSetSharedMemConfig(";
+      oss << "func="; roctracer::hip_support::detail::operator<<(oss, data->args.hipFuncSetSharedMemConfig.func);
+      oss << ", config="; roctracer::hip_support::detail::operator<<(oss, data->args.hipFuncSetSharedMemConfig.config);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGLGetDevices:
+      oss << "hipGLGetDevices(";
+      if (data->args.hipGLGetDevices.pHipDeviceCount == NULL) oss << "pHipDeviceCount=NULL";
+      else { oss << "pHipDeviceCount="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGLGetDevices.pHipDeviceCount__val); }
+      if (data->args.hipGLGetDevices.pHipDevices == NULL) oss << ", pHipDevices=NULL";
+      else { oss << ", pHipDevices="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGLGetDevices.pHipDevices__val); }
+      oss << ", hipDeviceCount="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGLGetDevices.hipDeviceCount);
+      oss << ", deviceList="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGLGetDevices.deviceList);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGetChannelDesc:
+      oss << "hipGetChannelDesc(";
+      if (data->args.hipGetChannelDesc.desc == NULL) oss << "desc=NULL";
+      else { oss << "desc="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetChannelDesc.desc__val); }
+      oss << ", array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetChannelDesc.array);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGetDevice:
+      oss << "hipGetDevice(";
+      if (data->args.hipGetDevice.deviceId == NULL) oss << "deviceId=NULL";
+      else { oss << "deviceId="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetDevice.deviceId__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGetDeviceCount:
+      oss << "hipGetDeviceCount(";
+      if (data->args.hipGetDeviceCount.count == NULL) oss << "count=NULL";
+      else { oss << "count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetDeviceCount.count__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGetDeviceFlags:
+      oss << "hipGetDeviceFlags(";
+      if (data->args.hipGetDeviceFlags.flags == NULL) oss << "flags=NULL";
+      else { oss << "flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetDeviceFlags.flags__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGetDevicePropertiesR0000:
+      oss << "hipGetDevicePropertiesR0000(";
+      if (data->args.hipGetDevicePropertiesR0000.prop == NULL) oss << "prop=NULL";
+      else { oss << "prop="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetDevicePropertiesR0000.prop__val); }
+      oss << ", device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetDevicePropertiesR0000.device);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGetDevicePropertiesR0600:
+      oss << "hipGetDevicePropertiesR0600(";
+      if (data->args.hipGetDevicePropertiesR0600.prop == NULL) oss << "prop=NULL";
+      else { oss << "prop="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetDevicePropertiesR0600.prop__val); }
+      oss << ", deviceId="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetDevicePropertiesR0600.deviceId);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGetDriverEntryPoint:
+      oss << "hipGetDriverEntryPoint(";
+      if (data->args.hipGetDriverEntryPoint.symbol == NULL) oss << "symbol=NULL";
+      else { oss << "symbol="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetDriverEntryPoint.symbol__val); }
+      if (data->args.hipGetDriverEntryPoint.funcPtr == NULL) oss << ", funcPtr=NULL";
+      else { oss << ", funcPtr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetDriverEntryPoint.funcPtr__val); }
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetDriverEntryPoint.flags);
+      if (data->args.hipGetDriverEntryPoint.driverStatus == NULL) oss << ", driverStatus=NULL";
+      else { oss << ", driverStatus="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetDriverEntryPoint.driverStatus__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGetFuncBySymbol:
+      oss << "hipGetFuncBySymbol(";
+      if (data->args.hipGetFuncBySymbol.functionPtr == NULL) oss << "functionPtr=NULL";
+      else { oss << "functionPtr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetFuncBySymbol.functionPtr__val); }
+      oss << ", symbolPtr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetFuncBySymbol.symbolPtr);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGetLastError:
+      oss << "hipGetLastError(";
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGetMipmappedArrayLevel:
+      oss << "hipGetMipmappedArrayLevel(";
+      if (data->args.hipGetMipmappedArrayLevel.levelArray == NULL) oss << "levelArray=NULL";
+      else { oss << "levelArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetMipmappedArrayLevel.levelArray__val); }
+      oss << ", mipmappedArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetMipmappedArrayLevel.mipmappedArray);
+      oss << ", level="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetMipmappedArrayLevel.level);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGetProcAddress:
+      oss << "hipGetProcAddress(";
+      if (data->args.hipGetProcAddress.symbol == NULL) oss << "symbol=NULL";
+      else { oss << "symbol="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetProcAddress.symbol__val); }
+      if (data->args.hipGetProcAddress.pfn == NULL) oss << ", pfn=NULL";
+      else { oss << ", pfn="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetProcAddress.pfn__val); }
+      oss << ", hipVersion="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetProcAddress.hipVersion);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetProcAddress.flags);
+      if (data->args.hipGetProcAddress.symbolStatus == NULL) oss << ", symbolStatus=NULL";
+      else { oss << ", symbolStatus="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetProcAddress.symbolStatus__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGetSymbolAddress:
+      oss << "hipGetSymbolAddress(";
+      if (data->args.hipGetSymbolAddress.devPtr == NULL) oss << "devPtr=NULL";
+      else { oss << "devPtr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetSymbolAddress.devPtr__val); }
+      oss << ", symbol="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetSymbolAddress.symbol);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGetSymbolSize:
+      oss << "hipGetSymbolSize(";
+      if (data->args.hipGetSymbolSize.size == NULL) oss << "size=NULL";
+      else { oss << "size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetSymbolSize.size__val); }
+      oss << ", symbol="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetSymbolSize.symbol);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphAddBatchMemOpNode:
+      oss << "hipGraphAddBatchMemOpNode(";
+      if (data->args.hipGraphAddBatchMemOpNode.phGraphNode == NULL) oss << "phGraphNode=NULL";
+      else { oss << "phGraphNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddBatchMemOpNode.phGraphNode__val); }
+      oss << ", hGraph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddBatchMemOpNode.hGraph);
+      if (data->args.hipGraphAddBatchMemOpNode.dependencies == NULL) oss << ", dependencies=NULL";
+      else { oss << ", dependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddBatchMemOpNode.dependencies__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddBatchMemOpNode.numDependencies);
+      if (data->args.hipGraphAddBatchMemOpNode.nodeParams == NULL) oss << ", nodeParams=NULL";
+      else { oss << ", nodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddBatchMemOpNode.nodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphAddChildGraphNode:
+      oss << "hipGraphAddChildGraphNode(";
+      if (data->args.hipGraphAddChildGraphNode.pGraphNode == NULL) oss << "pGraphNode=NULL";
+      else { oss << "pGraphNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddChildGraphNode.pGraphNode__val); }
+      oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddChildGraphNode.graph);
+      if (data->args.hipGraphAddChildGraphNode.pDependencies == NULL) oss << ", pDependencies=NULL";
+      else { oss << ", pDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddChildGraphNode.pDependencies__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddChildGraphNode.numDependencies);
+      oss << ", childGraph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddChildGraphNode.childGraph);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphAddDependencies:
+      oss << "hipGraphAddDependencies(";
+      oss << "graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddDependencies.graph);
+      if (data->args.hipGraphAddDependencies.from == NULL) oss << ", from=NULL";
+      else { oss << ", from="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddDependencies.from__val); }
+      if (data->args.hipGraphAddDependencies.to == NULL) oss << ", to=NULL";
+      else { oss << ", to="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddDependencies.to__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddDependencies.numDependencies);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphAddEmptyNode:
+      oss << "hipGraphAddEmptyNode(";
+      if (data->args.hipGraphAddEmptyNode.pGraphNode == NULL) oss << "pGraphNode=NULL";
+      else { oss << "pGraphNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddEmptyNode.pGraphNode__val); }
+      oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddEmptyNode.graph);
+      if (data->args.hipGraphAddEmptyNode.pDependencies == NULL) oss << ", pDependencies=NULL";
+      else { oss << ", pDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddEmptyNode.pDependencies__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddEmptyNode.numDependencies);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphAddEventRecordNode:
+      oss << "hipGraphAddEventRecordNode(";
+      if (data->args.hipGraphAddEventRecordNode.pGraphNode == NULL) oss << "pGraphNode=NULL";
+      else { oss << "pGraphNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddEventRecordNode.pGraphNode__val); }
+      oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddEventRecordNode.graph);
+      if (data->args.hipGraphAddEventRecordNode.pDependencies == NULL) oss << ", pDependencies=NULL";
+      else { oss << ", pDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddEventRecordNode.pDependencies__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddEventRecordNode.numDependencies);
+      oss << ", event="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddEventRecordNode.event);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphAddEventWaitNode:
+      oss << "hipGraphAddEventWaitNode(";
+      if (data->args.hipGraphAddEventWaitNode.pGraphNode == NULL) oss << "pGraphNode=NULL";
+      else { oss << "pGraphNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddEventWaitNode.pGraphNode__val); }
+      oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddEventWaitNode.graph);
+      if (data->args.hipGraphAddEventWaitNode.pDependencies == NULL) oss << ", pDependencies=NULL";
+      else { oss << ", pDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddEventWaitNode.pDependencies__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddEventWaitNode.numDependencies);
+      oss << ", event="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddEventWaitNode.event);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphAddExternalSemaphoresSignalNode:
+      oss << "hipGraphAddExternalSemaphoresSignalNode(";
+      if (data->args.hipGraphAddExternalSemaphoresSignalNode.pGraphNode == NULL) oss << "pGraphNode=NULL";
+      else { oss << "pGraphNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddExternalSemaphoresSignalNode.pGraphNode__val); }
+      oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddExternalSemaphoresSignalNode.graph);
+      if (data->args.hipGraphAddExternalSemaphoresSignalNode.pDependencies == NULL) oss << ", pDependencies=NULL";
+      else { oss << ", pDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddExternalSemaphoresSignalNode.pDependencies__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddExternalSemaphoresSignalNode.numDependencies);
+      if (data->args.hipGraphAddExternalSemaphoresSignalNode.nodeParams == NULL) oss << ", nodeParams=NULL";
+      else { oss << ", nodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddExternalSemaphoresSignalNode.nodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphAddExternalSemaphoresWaitNode:
+      oss << "hipGraphAddExternalSemaphoresWaitNode(";
+      if (data->args.hipGraphAddExternalSemaphoresWaitNode.pGraphNode == NULL) oss << "pGraphNode=NULL";
+      else { oss << "pGraphNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddExternalSemaphoresWaitNode.pGraphNode__val); }
+      oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddExternalSemaphoresWaitNode.graph);
+      if (data->args.hipGraphAddExternalSemaphoresWaitNode.pDependencies == NULL) oss << ", pDependencies=NULL";
+      else { oss << ", pDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddExternalSemaphoresWaitNode.pDependencies__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddExternalSemaphoresWaitNode.numDependencies);
+      if (data->args.hipGraphAddExternalSemaphoresWaitNode.nodeParams == NULL) oss << ", nodeParams=NULL";
+      else { oss << ", nodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddExternalSemaphoresWaitNode.nodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphAddHostNode:
+      oss << "hipGraphAddHostNode(";
+      if (data->args.hipGraphAddHostNode.pGraphNode == NULL) oss << "pGraphNode=NULL";
+      else { oss << "pGraphNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddHostNode.pGraphNode__val); }
+      oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddHostNode.graph);
+      if (data->args.hipGraphAddHostNode.pDependencies == NULL) oss << ", pDependencies=NULL";
+      else { oss << ", pDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddHostNode.pDependencies__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddHostNode.numDependencies);
+      if (data->args.hipGraphAddHostNode.pNodeParams == NULL) oss << ", pNodeParams=NULL";
+      else { oss << ", pNodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddHostNode.pNodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphAddKernelNode:
+      oss << "hipGraphAddKernelNode(";
+      if (data->args.hipGraphAddKernelNode.pGraphNode == NULL) oss << "pGraphNode=NULL";
+      else { oss << "pGraphNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddKernelNode.pGraphNode__val); }
+      oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddKernelNode.graph);
+      if (data->args.hipGraphAddKernelNode.pDependencies == NULL) oss << ", pDependencies=NULL";
+      else { oss << ", pDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddKernelNode.pDependencies__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddKernelNode.numDependencies);
+      if (data->args.hipGraphAddKernelNode.pNodeParams == NULL) oss << ", pNodeParams=NULL";
+      else { oss << ", pNodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddKernelNode.pNodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphAddMemAllocNode:
+      oss << "hipGraphAddMemAllocNode(";
+      if (data->args.hipGraphAddMemAllocNode.pGraphNode == NULL) oss << "pGraphNode=NULL";
+      else { oss << "pGraphNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemAllocNode.pGraphNode__val); }
+      oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemAllocNode.graph);
+      if (data->args.hipGraphAddMemAllocNode.pDependencies == NULL) oss << ", pDependencies=NULL";
+      else { oss << ", pDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemAllocNode.pDependencies__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemAllocNode.numDependencies);
+      if (data->args.hipGraphAddMemAllocNode.pNodeParams == NULL) oss << ", pNodeParams=NULL";
+      else { oss << ", pNodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemAllocNode.pNodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphAddMemFreeNode:
+      oss << "hipGraphAddMemFreeNode(";
+      if (data->args.hipGraphAddMemFreeNode.pGraphNode == NULL) oss << "pGraphNode=NULL";
+      else { oss << "pGraphNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemFreeNode.pGraphNode__val); }
+      oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemFreeNode.graph);
+      if (data->args.hipGraphAddMemFreeNode.pDependencies == NULL) oss << ", pDependencies=NULL";
+      else { oss << ", pDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemFreeNode.pDependencies__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemFreeNode.numDependencies);
+      oss << ", dev_ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemFreeNode.dev_ptr);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphAddMemcpyNode:
+      oss << "hipGraphAddMemcpyNode(";
+      if (data->args.hipGraphAddMemcpyNode.pGraphNode == NULL) oss << "pGraphNode=NULL";
+      else { oss << "pGraphNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNode.pGraphNode__val); }
+      oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNode.graph);
+      if (data->args.hipGraphAddMemcpyNode.pDependencies == NULL) oss << ", pDependencies=NULL";
+      else { oss << ", pDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNode.pDependencies__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNode.numDependencies);
+      if (data->args.hipGraphAddMemcpyNode.pCopyParams == NULL) oss << ", pCopyParams=NULL";
+      else { oss << ", pCopyParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNode.pCopyParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphAddMemcpyNode1D:
+      oss << "hipGraphAddMemcpyNode1D(";
+      if (data->args.hipGraphAddMemcpyNode1D.pGraphNode == NULL) oss << "pGraphNode=NULL";
+      else { oss << "pGraphNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNode1D.pGraphNode__val); }
+      oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNode1D.graph);
+      if (data->args.hipGraphAddMemcpyNode1D.pDependencies == NULL) oss << ", pDependencies=NULL";
+      else { oss << ", pDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNode1D.pDependencies__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNode1D.numDependencies);
+      oss << ", dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNode1D.dst);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNode1D.src);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNode1D.count);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNode1D.kind);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphAddMemcpyNodeFromSymbol:
+      oss << "hipGraphAddMemcpyNodeFromSymbol(";
+      if (data->args.hipGraphAddMemcpyNodeFromSymbol.pGraphNode == NULL) oss << "pGraphNode=NULL";
+      else { oss << "pGraphNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNodeFromSymbol.pGraphNode__val); }
+      oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNodeFromSymbol.graph);
+      if (data->args.hipGraphAddMemcpyNodeFromSymbol.pDependencies == NULL) oss << ", pDependencies=NULL";
+      else { oss << ", pDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNodeFromSymbol.pDependencies__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNodeFromSymbol.numDependencies);
+      oss << ", dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNodeFromSymbol.dst);
+      oss << ", symbol="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNodeFromSymbol.symbol);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNodeFromSymbol.count);
+      oss << ", offset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNodeFromSymbol.offset);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNodeFromSymbol.kind);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphAddMemcpyNodeToSymbol:
+      oss << "hipGraphAddMemcpyNodeToSymbol(";
+      if (data->args.hipGraphAddMemcpyNodeToSymbol.pGraphNode == NULL) oss << "pGraphNode=NULL";
+      else { oss << "pGraphNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNodeToSymbol.pGraphNode__val); }
+      oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNodeToSymbol.graph);
+      if (data->args.hipGraphAddMemcpyNodeToSymbol.pDependencies == NULL) oss << ", pDependencies=NULL";
+      else { oss << ", pDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNodeToSymbol.pDependencies__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNodeToSymbol.numDependencies);
+      oss << ", symbol="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNodeToSymbol.symbol);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNodeToSymbol.src);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNodeToSymbol.count);
+      oss << ", offset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNodeToSymbol.offset);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemcpyNodeToSymbol.kind);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphAddMemsetNode:
+      oss << "hipGraphAddMemsetNode(";
+      if (data->args.hipGraphAddMemsetNode.pGraphNode == NULL) oss << "pGraphNode=NULL";
+      else { oss << "pGraphNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemsetNode.pGraphNode__val); }
+      oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemsetNode.graph);
+      if (data->args.hipGraphAddMemsetNode.pDependencies == NULL) oss << ", pDependencies=NULL";
+      else { oss << ", pDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemsetNode.pDependencies__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemsetNode.numDependencies);
+      if (data->args.hipGraphAddMemsetNode.pMemsetParams == NULL) oss << ", pMemsetParams=NULL";
+      else { oss << ", pMemsetParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemsetNode.pMemsetParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphAddNode:
+      oss << "hipGraphAddNode(";
+      if (data->args.hipGraphAddNode.pGraphNode == NULL) oss << "pGraphNode=NULL";
+      else { oss << "pGraphNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddNode.pGraphNode__val); }
+      oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddNode.graph);
+      if (data->args.hipGraphAddNode.pDependencies == NULL) oss << ", pDependencies=NULL";
+      else { oss << ", pDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddNode.pDependencies__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddNode.numDependencies);
+      if (data->args.hipGraphAddNode.nodeParams == NULL) oss << ", nodeParams=NULL";
+      else { oss << ", nodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddNode.nodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphBatchMemOpNodeGetParams:
+      oss << "hipGraphBatchMemOpNodeGetParams(";
+      oss << "hNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphBatchMemOpNodeGetParams.hNode);
+      if (data->args.hipGraphBatchMemOpNodeGetParams.nodeParams_out == NULL) oss << ", nodeParams_out=NULL";
+      else { oss << ", nodeParams_out="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphBatchMemOpNodeGetParams.nodeParams_out__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphBatchMemOpNodeSetParams:
+      oss << "hipGraphBatchMemOpNodeSetParams(";
+      oss << "hNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphBatchMemOpNodeSetParams.hNode);
+      if (data->args.hipGraphBatchMemOpNodeSetParams.nodeParams == NULL) oss << ", nodeParams=NULL";
+      else { oss << ", nodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphBatchMemOpNodeSetParams.nodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphChildGraphNodeGetGraph:
+      oss << "hipGraphChildGraphNodeGetGraph(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphChildGraphNodeGetGraph.node);
+      if (data->args.hipGraphChildGraphNodeGetGraph.pGraph == NULL) oss << ", pGraph=NULL";
+      else { oss << ", pGraph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphChildGraphNodeGetGraph.pGraph__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphClone:
+      oss << "hipGraphClone(";
+      if (data->args.hipGraphClone.pGraphClone == NULL) oss << "pGraphClone=NULL";
+      else { oss << "pGraphClone="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphClone.pGraphClone__val); }
+      oss << ", originalGraph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphClone.originalGraph);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphCreate:
+      oss << "hipGraphCreate(";
+      if (data->args.hipGraphCreate.pGraph == NULL) oss << "pGraph=NULL";
+      else { oss << "pGraph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphCreate.pGraph__val); }
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphCreate.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphDebugDotPrint:
+      oss << "hipGraphDebugDotPrint(";
+      oss << "graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphDebugDotPrint.graph);
+      if (data->args.hipGraphDebugDotPrint.path == NULL) oss << ", path=NULL";
+      else { oss << ", path="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphDebugDotPrint.path__val); }
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphDebugDotPrint.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphDestroy:
+      oss << "hipGraphDestroy(";
+      oss << "graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphDestroy.graph);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphDestroyNode:
+      oss << "hipGraphDestroyNode(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphDestroyNode.node);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphEventRecordNodeGetEvent:
+      oss << "hipGraphEventRecordNodeGetEvent(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphEventRecordNodeGetEvent.node);
+      if (data->args.hipGraphEventRecordNodeGetEvent.event_out == NULL) oss << ", event_out=NULL";
+      else { oss << ", event_out="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphEventRecordNodeGetEvent.event_out__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphEventRecordNodeSetEvent:
+      oss << "hipGraphEventRecordNodeSetEvent(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphEventRecordNodeSetEvent.node);
+      oss << ", event="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphEventRecordNodeSetEvent.event);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphEventWaitNodeGetEvent:
+      oss << "hipGraphEventWaitNodeGetEvent(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphEventWaitNodeGetEvent.node);
+      if (data->args.hipGraphEventWaitNodeGetEvent.event_out == NULL) oss << ", event_out=NULL";
+      else { oss << ", event_out="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphEventWaitNodeGetEvent.event_out__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphEventWaitNodeSetEvent:
+      oss << "hipGraphEventWaitNodeSetEvent(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphEventWaitNodeSetEvent.node);
+      oss << ", event="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphEventWaitNodeSetEvent.event);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphExecBatchMemOpNodeSetParams:
+      oss << "hipGraphExecBatchMemOpNodeSetParams(";
+      oss << "hGraphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecBatchMemOpNodeSetParams.hGraphExec);
+      oss << ", hNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecBatchMemOpNodeSetParams.hNode);
+      if (data->args.hipGraphExecBatchMemOpNodeSetParams.nodeParams == NULL) oss << ", nodeParams=NULL";
+      else { oss << ", nodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecBatchMemOpNodeSetParams.nodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphExecChildGraphNodeSetParams:
+      oss << "hipGraphExecChildGraphNodeSetParams(";
+      oss << "hGraphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecChildGraphNodeSetParams.hGraphExec);
+      oss << ", node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecChildGraphNodeSetParams.node);
+      oss << ", childGraph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecChildGraphNodeSetParams.childGraph);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphExecDestroy:
+      oss << "hipGraphExecDestroy(";
+      oss << "graphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecDestroy.graphExec);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphExecEventRecordNodeSetEvent:
+      oss << "hipGraphExecEventRecordNodeSetEvent(";
+      oss << "hGraphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecEventRecordNodeSetEvent.hGraphExec);
+      oss << ", hNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecEventRecordNodeSetEvent.hNode);
+      oss << ", event="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecEventRecordNodeSetEvent.event);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphExecEventWaitNodeSetEvent:
+      oss << "hipGraphExecEventWaitNodeSetEvent(";
+      oss << "hGraphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecEventWaitNodeSetEvent.hGraphExec);
+      oss << ", hNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecEventWaitNodeSetEvent.hNode);
+      oss << ", event="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecEventWaitNodeSetEvent.event);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphExecExternalSemaphoresSignalNodeSetParams:
+      oss << "hipGraphExecExternalSemaphoresSignalNodeSetParams(";
+      oss << "hGraphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecExternalSemaphoresSignalNodeSetParams.hGraphExec);
+      oss << ", hNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecExternalSemaphoresSignalNodeSetParams.hNode);
+      if (data->args.hipGraphExecExternalSemaphoresSignalNodeSetParams.nodeParams == NULL) oss << ", nodeParams=NULL";
+      else { oss << ", nodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecExternalSemaphoresSignalNodeSetParams.nodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphExecExternalSemaphoresWaitNodeSetParams:
+      oss << "hipGraphExecExternalSemaphoresWaitNodeSetParams(";
+      oss << "hGraphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecExternalSemaphoresWaitNodeSetParams.hGraphExec);
+      oss << ", hNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecExternalSemaphoresWaitNodeSetParams.hNode);
+      if (data->args.hipGraphExecExternalSemaphoresWaitNodeSetParams.nodeParams == NULL) oss << ", nodeParams=NULL";
+      else { oss << ", nodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecExternalSemaphoresWaitNodeSetParams.nodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphExecGetFlags:
+      oss << "hipGraphExecGetFlags(";
+      oss << "graphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecGetFlags.graphExec);
+      if (data->args.hipGraphExecGetFlags.flags == NULL) oss << ", flags=NULL";
+      else { oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecGetFlags.flags__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphExecHostNodeSetParams:
+      oss << "hipGraphExecHostNodeSetParams(";
+      oss << "hGraphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecHostNodeSetParams.hGraphExec);
+      oss << ", node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecHostNodeSetParams.node);
+      if (data->args.hipGraphExecHostNodeSetParams.pNodeParams == NULL) oss << ", pNodeParams=NULL";
+      else { oss << ", pNodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecHostNodeSetParams.pNodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphExecKernelNodeSetParams:
+      oss << "hipGraphExecKernelNodeSetParams(";
+      oss << "hGraphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecKernelNodeSetParams.hGraphExec);
+      oss << ", node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecKernelNodeSetParams.node);
+      if (data->args.hipGraphExecKernelNodeSetParams.pNodeParams == NULL) oss << ", pNodeParams=NULL";
+      else { oss << ", pNodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecKernelNodeSetParams.pNodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphExecMemcpyNodeSetParams:
+      oss << "hipGraphExecMemcpyNodeSetParams(";
+      oss << "hGraphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParams.hGraphExec);
+      oss << ", node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParams.node);
+      if (data->args.hipGraphExecMemcpyNodeSetParams.pNodeParams == NULL) oss << ", pNodeParams=NULL";
+      else { oss << ", pNodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParams.pNodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphExecMemcpyNodeSetParams1D:
+      oss << "hipGraphExecMemcpyNodeSetParams1D(";
+      oss << "hGraphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParams1D.hGraphExec);
+      oss << ", node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParams1D.node);
+      oss << ", dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParams1D.dst);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParams1D.src);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParams1D.count);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParams1D.kind);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphExecMemcpyNodeSetParamsFromSymbol:
+      oss << "hipGraphExecMemcpyNodeSetParamsFromSymbol(";
+      oss << "hGraphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParamsFromSymbol.hGraphExec);
+      oss << ", node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParamsFromSymbol.node);
+      oss << ", dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParamsFromSymbol.dst);
+      oss << ", symbol="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParamsFromSymbol.symbol);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParamsFromSymbol.count);
+      oss << ", offset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParamsFromSymbol.offset);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParamsFromSymbol.kind);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphExecMemcpyNodeSetParamsToSymbol:
+      oss << "hipGraphExecMemcpyNodeSetParamsToSymbol(";
+      oss << "hGraphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParamsToSymbol.hGraphExec);
+      oss << ", node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParamsToSymbol.node);
+      oss << ", symbol="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParamsToSymbol.symbol);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParamsToSymbol.src);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParamsToSymbol.count);
+      oss << ", offset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParamsToSymbol.offset);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemcpyNodeSetParamsToSymbol.kind);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphExecMemsetNodeSetParams:
+      oss << "hipGraphExecMemsetNodeSetParams(";
+      oss << "hGraphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemsetNodeSetParams.hGraphExec);
+      oss << ", node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemsetNodeSetParams.node);
+      if (data->args.hipGraphExecMemsetNodeSetParams.pNodeParams == NULL) oss << ", pNodeParams=NULL";
+      else { oss << ", pNodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecMemsetNodeSetParams.pNodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphExecNodeSetParams:
+      oss << "hipGraphExecNodeSetParams(";
+      oss << "graphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecNodeSetParams.graphExec);
+      oss << ", node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecNodeSetParams.node);
+      if (data->args.hipGraphExecNodeSetParams.nodeParams == NULL) oss << ", nodeParams=NULL";
+      else { oss << ", nodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecNodeSetParams.nodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphExecUpdate:
+      oss << "hipGraphExecUpdate(";
+      oss << "hGraphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecUpdate.hGraphExec);
+      oss << ", hGraph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecUpdate.hGraph);
+      if (data->args.hipGraphExecUpdate.hErrorNode_out == NULL) oss << ", hErrorNode_out=NULL";
+      else { oss << ", hErrorNode_out="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecUpdate.hErrorNode_out__val); }
+      if (data->args.hipGraphExecUpdate.updateResult_out == NULL) oss << ", updateResult_out=NULL";
+      else { oss << ", updateResult_out="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExecUpdate.updateResult_out__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphExternalSemaphoresSignalNodeGetParams:
+      oss << "hipGraphExternalSemaphoresSignalNodeGetParams(";
+      oss << "hNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExternalSemaphoresSignalNodeGetParams.hNode);
+      if (data->args.hipGraphExternalSemaphoresSignalNodeGetParams.params_out == NULL) oss << ", params_out=NULL";
+      else { oss << ", params_out="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExternalSemaphoresSignalNodeGetParams.params_out__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphExternalSemaphoresSignalNodeSetParams:
+      oss << "hipGraphExternalSemaphoresSignalNodeSetParams(";
+      oss << "hNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExternalSemaphoresSignalNodeSetParams.hNode);
+      if (data->args.hipGraphExternalSemaphoresSignalNodeSetParams.nodeParams == NULL) oss << ", nodeParams=NULL";
+      else { oss << ", nodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExternalSemaphoresSignalNodeSetParams.nodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphExternalSemaphoresWaitNodeGetParams:
+      oss << "hipGraphExternalSemaphoresWaitNodeGetParams(";
+      oss << "hNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExternalSemaphoresWaitNodeGetParams.hNode);
+      if (data->args.hipGraphExternalSemaphoresWaitNodeGetParams.params_out == NULL) oss << ", params_out=NULL";
+      else { oss << ", params_out="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExternalSemaphoresWaitNodeGetParams.params_out__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphExternalSemaphoresWaitNodeSetParams:
+      oss << "hipGraphExternalSemaphoresWaitNodeSetParams(";
+      oss << "hNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExternalSemaphoresWaitNodeSetParams.hNode);
+      if (data->args.hipGraphExternalSemaphoresWaitNodeSetParams.nodeParams == NULL) oss << ", nodeParams=NULL";
+      else { oss << ", nodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphExternalSemaphoresWaitNodeSetParams.nodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphGetEdges:
+      oss << "hipGraphGetEdges(";
+      oss << "graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphGetEdges.graph);
+      if (data->args.hipGraphGetEdges.from == NULL) oss << ", from=NULL";
+      else { oss << ", from="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphGetEdges.from__val); }
+      if (data->args.hipGraphGetEdges.to == NULL) oss << ", to=NULL";
+      else { oss << ", to="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphGetEdges.to__val); }
+      if (data->args.hipGraphGetEdges.numEdges == NULL) oss << ", numEdges=NULL";
+      else { oss << ", numEdges="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphGetEdges.numEdges__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphGetNodes:
+      oss << "hipGraphGetNodes(";
+      oss << "graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphGetNodes.graph);
+      if (data->args.hipGraphGetNodes.nodes == NULL) oss << ", nodes=NULL";
+      else { oss << ", nodes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphGetNodes.nodes__val); }
+      if (data->args.hipGraphGetNodes.numNodes == NULL) oss << ", numNodes=NULL";
+      else { oss << ", numNodes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphGetNodes.numNodes__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphGetRootNodes:
+      oss << "hipGraphGetRootNodes(";
+      oss << "graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphGetRootNodes.graph);
+      if (data->args.hipGraphGetRootNodes.pRootNodes == NULL) oss << ", pRootNodes=NULL";
+      else { oss << ", pRootNodes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphGetRootNodes.pRootNodes__val); }
+      if (data->args.hipGraphGetRootNodes.pNumRootNodes == NULL) oss << ", pNumRootNodes=NULL";
+      else { oss << ", pNumRootNodes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphGetRootNodes.pNumRootNodes__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphHostNodeGetParams:
+      oss << "hipGraphHostNodeGetParams(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphHostNodeGetParams.node);
+      if (data->args.hipGraphHostNodeGetParams.pNodeParams == NULL) oss << ", pNodeParams=NULL";
+      else { oss << ", pNodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphHostNodeGetParams.pNodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphHostNodeSetParams:
+      oss << "hipGraphHostNodeSetParams(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphHostNodeSetParams.node);
+      if (data->args.hipGraphHostNodeSetParams.pNodeParams == NULL) oss << ", pNodeParams=NULL";
+      else { oss << ", pNodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphHostNodeSetParams.pNodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphInstantiate:
+      oss << "hipGraphInstantiate(";
+      if (data->args.hipGraphInstantiate.pGraphExec == NULL) oss << "pGraphExec=NULL";
+      else { oss << "pGraphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphInstantiate.pGraphExec__val); }
+      oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphInstantiate.graph);
+      if (data->args.hipGraphInstantiate.pErrorNode == NULL) oss << ", pErrorNode=NULL";
+      else { oss << ", pErrorNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphInstantiate.pErrorNode__val); }
+      if (data->args.hipGraphInstantiate.pLogBuffer == NULL) oss << ", pLogBuffer=NULL";
+      else { oss << ", pLogBuffer="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphInstantiate.pLogBuffer__val); }
+      oss << ", bufferSize="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphInstantiate.bufferSize);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphInstantiateWithFlags:
+      oss << "hipGraphInstantiateWithFlags(";
+      if (data->args.hipGraphInstantiateWithFlags.pGraphExec == NULL) oss << "pGraphExec=NULL";
+      else { oss << "pGraphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphInstantiateWithFlags.pGraphExec__val); }
+      oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphInstantiateWithFlags.graph);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphInstantiateWithFlags.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphInstantiateWithParams:
+      oss << "hipGraphInstantiateWithParams(";
+      if (data->args.hipGraphInstantiateWithParams.pGraphExec == NULL) oss << "pGraphExec=NULL";
+      else { oss << "pGraphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphInstantiateWithParams.pGraphExec__val); }
+      oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphInstantiateWithParams.graph);
+      if (data->args.hipGraphInstantiateWithParams.instantiateParams == NULL) oss << ", instantiateParams=NULL";
+      else { oss << ", instantiateParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphInstantiateWithParams.instantiateParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphKernelNodeCopyAttributes:
+      oss << "hipGraphKernelNodeCopyAttributes(";
+      oss << "hSrc="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphKernelNodeCopyAttributes.hSrc);
+      oss << ", hDst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphKernelNodeCopyAttributes.hDst);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphKernelNodeGetAttribute:
+      oss << "hipGraphKernelNodeGetAttribute(";
+      oss << "hNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphKernelNodeGetAttribute.hNode);
+      oss << ", attr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphKernelNodeGetAttribute.attr);
+      if (data->args.hipGraphKernelNodeGetAttribute.value == NULL) oss << ", value=NULL";
+      else { oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphKernelNodeGetAttribute.value__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphKernelNodeGetParams:
+      oss << "hipGraphKernelNodeGetParams(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphKernelNodeGetParams.node);
+      if (data->args.hipGraphKernelNodeGetParams.pNodeParams == NULL) oss << ", pNodeParams=NULL";
+      else { oss << ", pNodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphKernelNodeGetParams.pNodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphKernelNodeSetAttribute:
+      oss << "hipGraphKernelNodeSetAttribute(";
+      oss << "hNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphKernelNodeSetAttribute.hNode);
+      oss << ", attr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphKernelNodeSetAttribute.attr);
+      if (data->args.hipGraphKernelNodeSetAttribute.value == NULL) oss << ", value=NULL";
+      else { oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphKernelNodeSetAttribute.value__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphKernelNodeSetParams:
+      oss << "hipGraphKernelNodeSetParams(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphKernelNodeSetParams.node);
+      if (data->args.hipGraphKernelNodeSetParams.pNodeParams == NULL) oss << ", pNodeParams=NULL";
+      else { oss << ", pNodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphKernelNodeSetParams.pNodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphLaunch:
+      oss << "hipGraphLaunch(";
+      oss << "graphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphLaunch.graphExec);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphLaunch.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphMemAllocNodeGetParams:
+      oss << "hipGraphMemAllocNodeGetParams(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemAllocNodeGetParams.node);
+      if (data->args.hipGraphMemAllocNodeGetParams.pNodeParams == NULL) oss << ", pNodeParams=NULL";
+      else { oss << ", pNodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemAllocNodeGetParams.pNodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphMemFreeNodeGetParams:
+      oss << "hipGraphMemFreeNodeGetParams(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemFreeNodeGetParams.node);
+      oss << ", dev_ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemFreeNodeGetParams.dev_ptr);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphMemcpyNodeGetParams:
+      oss << "hipGraphMemcpyNodeGetParams(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemcpyNodeGetParams.node);
+      if (data->args.hipGraphMemcpyNodeGetParams.pNodeParams == NULL) oss << ", pNodeParams=NULL";
+      else { oss << ", pNodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemcpyNodeGetParams.pNodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphMemcpyNodeSetParams:
+      oss << "hipGraphMemcpyNodeSetParams(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemcpyNodeSetParams.node);
+      if (data->args.hipGraphMemcpyNodeSetParams.pNodeParams == NULL) oss << ", pNodeParams=NULL";
+      else { oss << ", pNodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemcpyNodeSetParams.pNodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphMemcpyNodeSetParams1D:
+      oss << "hipGraphMemcpyNodeSetParams1D(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemcpyNodeSetParams1D.node);
+      oss << ", dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemcpyNodeSetParams1D.dst);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemcpyNodeSetParams1D.src);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemcpyNodeSetParams1D.count);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemcpyNodeSetParams1D.kind);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphMemcpyNodeSetParamsFromSymbol:
+      oss << "hipGraphMemcpyNodeSetParamsFromSymbol(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemcpyNodeSetParamsFromSymbol.node);
+      oss << ", dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemcpyNodeSetParamsFromSymbol.dst);
+      oss << ", symbol="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemcpyNodeSetParamsFromSymbol.symbol);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemcpyNodeSetParamsFromSymbol.count);
+      oss << ", offset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemcpyNodeSetParamsFromSymbol.offset);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemcpyNodeSetParamsFromSymbol.kind);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphMemcpyNodeSetParamsToSymbol:
+      oss << "hipGraphMemcpyNodeSetParamsToSymbol(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemcpyNodeSetParamsToSymbol.node);
+      oss << ", symbol="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemcpyNodeSetParamsToSymbol.symbol);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemcpyNodeSetParamsToSymbol.src);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemcpyNodeSetParamsToSymbol.count);
+      oss << ", offset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemcpyNodeSetParamsToSymbol.offset);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemcpyNodeSetParamsToSymbol.kind);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphMemsetNodeGetParams:
+      oss << "hipGraphMemsetNodeGetParams(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemsetNodeGetParams.node);
+      if (data->args.hipGraphMemsetNodeGetParams.pNodeParams == NULL) oss << ", pNodeParams=NULL";
+      else { oss << ", pNodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemsetNodeGetParams.pNodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphMemsetNodeSetParams:
+      oss << "hipGraphMemsetNodeSetParams(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemsetNodeSetParams.node);
+      if (data->args.hipGraphMemsetNodeSetParams.pNodeParams == NULL) oss << ", pNodeParams=NULL";
+      else { oss << ", pNodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphMemsetNodeSetParams.pNodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphNodeFindInClone:
+      oss << "hipGraphNodeFindInClone(";
+      if (data->args.hipGraphNodeFindInClone.pNode == NULL) oss << "pNode=NULL";
+      else { oss << "pNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphNodeFindInClone.pNode__val); }
+      oss << ", originalNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphNodeFindInClone.originalNode);
+      oss << ", clonedGraph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphNodeFindInClone.clonedGraph);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphNodeGetDependencies:
+      oss << "hipGraphNodeGetDependencies(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphNodeGetDependencies.node);
+      if (data->args.hipGraphNodeGetDependencies.pDependencies == NULL) oss << ", pDependencies=NULL";
+      else { oss << ", pDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphNodeGetDependencies.pDependencies__val); }
+      if (data->args.hipGraphNodeGetDependencies.pNumDependencies == NULL) oss << ", pNumDependencies=NULL";
+      else { oss << ", pNumDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphNodeGetDependencies.pNumDependencies__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphNodeGetDependentNodes:
+      oss << "hipGraphNodeGetDependentNodes(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphNodeGetDependentNodes.node);
+      if (data->args.hipGraphNodeGetDependentNodes.pDependentNodes == NULL) oss << ", pDependentNodes=NULL";
+      else { oss << ", pDependentNodes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphNodeGetDependentNodes.pDependentNodes__val); }
+      if (data->args.hipGraphNodeGetDependentNodes.pNumDependentNodes == NULL) oss << ", pNumDependentNodes=NULL";
+      else { oss << ", pNumDependentNodes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphNodeGetDependentNodes.pNumDependentNodes__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphNodeGetEnabled:
+      oss << "hipGraphNodeGetEnabled(";
+      oss << "hGraphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphNodeGetEnabled.hGraphExec);
+      oss << ", hNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphNodeGetEnabled.hNode);
+      if (data->args.hipGraphNodeGetEnabled.isEnabled == NULL) oss << ", isEnabled=NULL";
+      else { oss << ", isEnabled="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphNodeGetEnabled.isEnabled__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphNodeGetType:
+      oss << "hipGraphNodeGetType(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphNodeGetType.node);
+      if (data->args.hipGraphNodeGetType.pType == NULL) oss << ", pType=NULL";
+      else { oss << ", pType="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphNodeGetType.pType__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphNodeSetEnabled:
+      oss << "hipGraphNodeSetEnabled(";
+      oss << "hGraphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphNodeSetEnabled.hGraphExec);
+      oss << ", hNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphNodeSetEnabled.hNode);
+      oss << ", isEnabled="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphNodeSetEnabled.isEnabled);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphNodeSetParams:
+      oss << "hipGraphNodeSetParams(";
+      oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphNodeSetParams.node);
+      if (data->args.hipGraphNodeSetParams.nodeParams == NULL) oss << ", nodeParams=NULL";
+      else { oss << ", nodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphNodeSetParams.nodeParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphReleaseUserObject:
+      oss << "hipGraphReleaseUserObject(";
+      oss << "graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphReleaseUserObject.graph);
+      oss << ", object="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphReleaseUserObject.object);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphReleaseUserObject.count);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphRemoveDependencies:
+      oss << "hipGraphRemoveDependencies(";
+      oss << "graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphRemoveDependencies.graph);
+      if (data->args.hipGraphRemoveDependencies.from == NULL) oss << ", from=NULL";
+      else { oss << ", from="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphRemoveDependencies.from__val); }
+      if (data->args.hipGraphRemoveDependencies.to == NULL) oss << ", to=NULL";
+      else { oss << ", to="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphRemoveDependencies.to__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphRemoveDependencies.numDependencies);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphRetainUserObject:
+      oss << "hipGraphRetainUserObject(";
+      oss << "graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphRetainUserObject.graph);
+      oss << ", object="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphRetainUserObject.object);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphRetainUserObject.count);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphRetainUserObject.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphUpload:
+      oss << "hipGraphUpload(";
+      oss << "graphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphUpload.graphExec);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphUpload.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphicsGLRegisterBuffer:
+      oss << "hipGraphicsGLRegisterBuffer(";
+      if (data->args.hipGraphicsGLRegisterBuffer.resource == NULL) oss << "resource=NULL";
+      else { oss << "resource="; roctracer::hip_support::detail::operator<<(oss, (void*)data->args.hipGraphicsGLRegisterBuffer.resource__val); }
+      oss << ", buffer="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphicsGLRegisterBuffer.buffer);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphicsGLRegisterBuffer.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphicsGLRegisterImage:
+      oss << "hipGraphicsGLRegisterImage(";
+      if (data->args.hipGraphicsGLRegisterImage.resource == NULL) oss << "resource=NULL";
+      else { oss << "resource="; roctracer::hip_support::detail::operator<<(oss, (void*)data->args.hipGraphicsGLRegisterImage.resource__val); }
+      oss << ", image="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphicsGLRegisterImage.image);
+      oss << ", target="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphicsGLRegisterImage.target);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphicsGLRegisterImage.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphicsMapResources:
+      oss << "hipGraphicsMapResources(";
+      oss << "count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphicsMapResources.count);
+      if (data->args.hipGraphicsMapResources.resources == NULL) oss << ", resources=NULL";
+      else { oss << ", resources="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphicsMapResources.resources__val); }
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphicsMapResources.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphicsResourceGetMappedPointer:
+      oss << "hipGraphicsResourceGetMappedPointer(";
+      if (data->args.hipGraphicsResourceGetMappedPointer.devPtr == NULL) oss << "devPtr=NULL";
+      else { oss << "devPtr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphicsResourceGetMappedPointer.devPtr__val); }
+      if (data->args.hipGraphicsResourceGetMappedPointer.size == NULL) oss << ", size=NULL";
+      else { oss << ", size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphicsResourceGetMappedPointer.size__val); }
+      oss << ", resource="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphicsResourceGetMappedPointer.resource);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphicsSubResourceGetMappedArray:
+      oss << "hipGraphicsSubResourceGetMappedArray(";
+      if (data->args.hipGraphicsSubResourceGetMappedArray.array == NULL) oss << "array=NULL";
+      else { oss << "array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphicsSubResourceGetMappedArray.array__val); }
+      oss << ", resource="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphicsSubResourceGetMappedArray.resource);
+      oss << ", arrayIndex="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphicsSubResourceGetMappedArray.arrayIndex);
+      oss << ", mipLevel="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphicsSubResourceGetMappedArray.mipLevel);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphicsUnmapResources:
+      oss << "hipGraphicsUnmapResources(";
+      oss << "count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphicsUnmapResources.count);
+      if (data->args.hipGraphicsUnmapResources.resources == NULL) oss << ", resources=NULL";
+      else { oss << ", resources="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphicsUnmapResources.resources__val); }
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphicsUnmapResources.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipGraphicsUnregisterResource:
+      oss << "hipGraphicsUnregisterResource(";
+      oss << "resource="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphicsUnregisterResource.resource);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipHccModuleLaunchKernel:
+      oss << "hipHccModuleLaunchKernel(";
+      oss << "f="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHccModuleLaunchKernel.f);
+      oss << ", globalWorkSizeX="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHccModuleLaunchKernel.globalWorkSizeX);
+      oss << ", globalWorkSizeY="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHccModuleLaunchKernel.globalWorkSizeY);
+      oss << ", globalWorkSizeZ="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHccModuleLaunchKernel.globalWorkSizeZ);
+      oss << ", blockDimX="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHccModuleLaunchKernel.blockDimX);
+      oss << ", blockDimY="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHccModuleLaunchKernel.blockDimY);
+      oss << ", blockDimZ="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHccModuleLaunchKernel.blockDimZ);
+      oss << ", sharedMemBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHccModuleLaunchKernel.sharedMemBytes);
+      oss << ", hStream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHccModuleLaunchKernel.hStream);
+      if (data->args.hipHccModuleLaunchKernel.kernelParams == NULL) oss << ", kernelParams=NULL";
+      else { oss << ", kernelParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHccModuleLaunchKernel.kernelParams__val); }
+      if (data->args.hipHccModuleLaunchKernel.extra == NULL) oss << ", extra=NULL";
+      else { oss << ", extra="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHccModuleLaunchKernel.extra__val); }
+      oss << ", startEvent="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHccModuleLaunchKernel.startEvent);
+      oss << ", stopEvent="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHccModuleLaunchKernel.stopEvent);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipHostAlloc:
+      oss << "hipHostAlloc(";
+      if (data->args.hipHostAlloc.ptr == NULL) oss << "ptr=NULL";
+      else { oss << "ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHostAlloc.ptr__val); }
+      oss << ", size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHostAlloc.size);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHostAlloc.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipHostFree:
+      oss << "hipHostFree(";
+      oss << "ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHostFree.ptr);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipHostGetDevicePointer:
+      oss << "hipHostGetDevicePointer(";
+      if (data->args.hipHostGetDevicePointer.devPtr == NULL) oss << "devPtr=NULL";
+      else { oss << "devPtr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHostGetDevicePointer.devPtr__val); }
+      oss << ", hstPtr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHostGetDevicePointer.hstPtr);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHostGetDevicePointer.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipHostGetFlags:
+      oss << "hipHostGetFlags(";
+      if (data->args.hipHostGetFlags.flagsPtr == NULL) oss << "flagsPtr=NULL";
+      else { oss << "flagsPtr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHostGetFlags.flagsPtr__val); }
+      oss << ", hostPtr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHostGetFlags.hostPtr);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipHostMalloc:
+      oss << "hipHostMalloc(";
+      if (data->args.hipHostMalloc.ptr == NULL) oss << "ptr=NULL";
+      else { oss << "ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHostMalloc.ptr__val); }
+      oss << ", size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHostMalloc.size);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHostMalloc.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipHostRegister:
+      oss << "hipHostRegister(";
+      oss << "hostPtr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHostRegister.hostPtr);
+      oss << ", sizeBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHostRegister.sizeBytes);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHostRegister.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipHostUnregister:
+      oss << "hipHostUnregister(";
+      oss << "hostPtr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipHostUnregister.hostPtr);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipImportExternalMemory:
+      oss << "hipImportExternalMemory(";
+      if (data->args.hipImportExternalMemory.extMem_out == NULL) oss << "extMem_out=NULL";
+      else { oss << "extMem_out="; roctracer::hip_support::detail::operator<<(oss, data->args.hipImportExternalMemory.extMem_out__val); }
+      if (data->args.hipImportExternalMemory.memHandleDesc == NULL) oss << ", memHandleDesc=NULL";
+      else { oss << ", memHandleDesc="; roctracer::hip_support::detail::operator<<(oss, data->args.hipImportExternalMemory.memHandleDesc__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipImportExternalSemaphore:
+      oss << "hipImportExternalSemaphore(";
+      if (data->args.hipImportExternalSemaphore.extSem_out == NULL) oss << "extSem_out=NULL";
+      else { oss << "extSem_out="; roctracer::hip_support::detail::operator<<(oss, data->args.hipImportExternalSemaphore.extSem_out__val); }
+      if (data->args.hipImportExternalSemaphore.semHandleDesc == NULL) oss << ", semHandleDesc=NULL";
+      else { oss << ", semHandleDesc="; roctracer::hip_support::detail::operator<<(oss, data->args.hipImportExternalSemaphore.semHandleDesc__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipInit:
+      oss << "hipInit(";
+      oss << "flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipInit.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipIpcCloseMemHandle:
+      oss << "hipIpcCloseMemHandle(";
+      oss << "devPtr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipIpcCloseMemHandle.devPtr);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipIpcGetEventHandle:
+      oss << "hipIpcGetEventHandle(";
+      if (data->args.hipIpcGetEventHandle.handle == NULL) oss << "handle=NULL";
+      else { oss << "handle="; roctracer::hip_support::detail::operator<<(oss, data->args.hipIpcGetEventHandle.handle__val); }
+      oss << ", event="; roctracer::hip_support::detail::operator<<(oss, data->args.hipIpcGetEventHandle.event);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipIpcGetMemHandle:
+      oss << "hipIpcGetMemHandle(";
+      if (data->args.hipIpcGetMemHandle.handle == NULL) oss << "handle=NULL";
+      else { oss << "handle="; roctracer::hip_support::detail::operator<<(oss, data->args.hipIpcGetMemHandle.handle__val); }
+      oss << ", devPtr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipIpcGetMemHandle.devPtr);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipIpcOpenEventHandle:
+      oss << "hipIpcOpenEventHandle(";
+      if (data->args.hipIpcOpenEventHandle.event == NULL) oss << "event=NULL";
+      else { oss << "event="; roctracer::hip_support::detail::operator<<(oss, data->args.hipIpcOpenEventHandle.event__val); }
+      oss << ", handle="; roctracer::hip_support::detail::operator<<(oss, data->args.hipIpcOpenEventHandle.handle);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipIpcOpenMemHandle:
+      oss << "hipIpcOpenMemHandle(";
+      if (data->args.hipIpcOpenMemHandle.devPtr == NULL) oss << "devPtr=NULL";
+      else { oss << "devPtr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipIpcOpenMemHandle.devPtr__val); }
+      oss << ", handle="; roctracer::hip_support::detail::operator<<(oss, data->args.hipIpcOpenMemHandle.handle);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipIpcOpenMemHandle.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipLaunchByPtr:
+      oss << "hipLaunchByPtr(";
+      oss << "hostFunction="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLaunchByPtr.hostFunction);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipLaunchCooperativeKernel:
+      oss << "hipLaunchCooperativeKernel(";
+      oss << "f="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLaunchCooperativeKernel.f);
+      oss << ", gridDim="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLaunchCooperativeKernel.gridDim);
+      oss << ", blockDimX="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLaunchCooperativeKernel.blockDimX);
+      if (data->args.hipLaunchCooperativeKernel.kernelParams == NULL) oss << ", kernelParams=NULL";
+      else { oss << ", kernelParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLaunchCooperativeKernel.kernelParams__val); }
+      oss << ", sharedMemBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLaunchCooperativeKernel.sharedMemBytes);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLaunchCooperativeKernel.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipLaunchCooperativeKernelMultiDevice:
+      oss << "hipLaunchCooperativeKernelMultiDevice(";
+      if (data->args.hipLaunchCooperativeKernelMultiDevice.launchParamsList == NULL) oss << "launchParamsList=NULL";
+      else { oss << "launchParamsList="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLaunchCooperativeKernelMultiDevice.launchParamsList__val); }
+      oss << ", numDevices="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLaunchCooperativeKernelMultiDevice.numDevices);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLaunchCooperativeKernelMultiDevice.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipLaunchHostFunc:
+      oss << "hipLaunchHostFunc(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLaunchHostFunc.stream);
+      oss << ", fn="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLaunchHostFunc.fn);
+      oss << ", userData="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLaunchHostFunc.userData);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipLaunchKernel:
+      oss << "hipLaunchKernel(";
+      oss << "function_address="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLaunchKernel.function_address);
+      oss << ", numBlocks="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLaunchKernel.numBlocks);
+      oss << ", dimBlocks="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLaunchKernel.dimBlocks);
+      if (data->args.hipLaunchKernel.args == NULL) oss << ", args=NULL";
+      else { oss << ", args="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLaunchKernel.args__val); }
+      oss << ", sharedMemBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLaunchKernel.sharedMemBytes);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLaunchKernel.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipLaunchKernelExC:
+      oss << "hipLaunchKernelExC(";
+      if (data->args.hipLaunchKernelExC.config == NULL) oss << "config=NULL";
+      else { oss << "config="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLaunchKernelExC.config__val); }
+      oss << ", fPtr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLaunchKernelExC.fPtr);
+      if (data->args.hipLaunchKernelExC.args == NULL) oss << ", args=NULL";
+      else { oss << ", args="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLaunchKernelExC.args__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipLibraryGetKernel:
+      oss << "hipLibraryGetKernel(";
+      if (data->args.hipLibraryGetKernel.pKernel == NULL) oss << "pKernel=NULL";
+      else { oss << "pKernel="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLibraryGetKernel.pKernel__val); }
+      oss << ", library="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLibraryGetKernel.library);
+      if (data->args.hipLibraryGetKernel.name == NULL) oss << ", name=NULL";
+      else { oss << ", name="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLibraryGetKernel.name__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipLibraryGetKernelCount:
+      oss << "hipLibraryGetKernelCount(";
+      if (data->args.hipLibraryGetKernelCount.count == NULL) oss << "count=NULL";
+      else { oss << "count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLibraryGetKernelCount.count__val); }
+      oss << ", library="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLibraryGetKernelCount.library);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipLibraryLoadData:
+      oss << "hipLibraryLoadData(";
+      if (data->args.hipLibraryLoadData.library == NULL) oss << "library=NULL";
+      else { oss << "library="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLibraryLoadData.library__val); }
+      oss << ", code="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLibraryLoadData.code);
+      if (data->args.hipLibraryLoadData.jitOptions == NULL) oss << ", jitOptions=NULL";
+      else { oss << ", jitOptions="; roctracer::hip_support::detail::operator<<(oss, (void*)data->args.hipLibraryLoadData.jitOptions__val); }
+      if (data->args.hipLibraryLoadData.jitOptionsValues == NULL) oss << ", jitOptionsValues=NULL";
+      else { oss << ", jitOptionsValues="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLibraryLoadData.jitOptionsValues__val); }
+      oss << ", numJitOptions="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLibraryLoadData.numJitOptions);
+      if (data->args.hipLibraryLoadData.libraryOptions == NULL) oss << ", libraryOptions=NULL";
+      else { oss << ", libraryOptions="; roctracer::hip_support::detail::operator<<(oss, (void*)data->args.hipLibraryLoadData.libraryOptions__val); }
+      if (data->args.hipLibraryLoadData.libraryOptionValues == NULL) oss << ", libraryOptionValues=NULL";
+      else { oss << ", libraryOptionValues="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLibraryLoadData.libraryOptionValues__val); }
+      oss << ", numLibraryOptions="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLibraryLoadData.numLibraryOptions);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipLibraryLoadFromFile:
+      oss << "hipLibraryLoadFromFile(";
+      if (data->args.hipLibraryLoadFromFile.library == NULL) oss << "library=NULL";
+      else { oss << "library="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLibraryLoadFromFile.library__val); }
+      if (data->args.hipLibraryLoadFromFile.fileName == NULL) oss << ", fileName=NULL";
+      else { oss << ", fileName="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLibraryLoadFromFile.fileName__val); }
+      if (data->args.hipLibraryLoadFromFile.jitOptions == NULL) oss << ", jitOptions=NULL";
+      else { oss << ", jitOptions="; roctracer::hip_support::detail::operator<<(oss, (void*)data->args.hipLibraryLoadFromFile.jitOptions__val); }
+      if (data->args.hipLibraryLoadFromFile.jitOptionsValues == NULL) oss << ", jitOptionsValues=NULL";
+      else { oss << ", jitOptionsValues="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLibraryLoadFromFile.jitOptionsValues__val); }
+      oss << ", numJitOptions="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLibraryLoadFromFile.numJitOptions);
+      if (data->args.hipLibraryLoadFromFile.libraryOptions == NULL) oss << ", libraryOptions=NULL";
+      else { oss << ", libraryOptions="; roctracer::hip_support::detail::operator<<(oss, (void*)data->args.hipLibraryLoadFromFile.libraryOptions__val); }
+      if (data->args.hipLibraryLoadFromFile.libraryOptionValues == NULL) oss << ", libraryOptionValues=NULL";
+      else { oss << ", libraryOptionValues="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLibraryLoadFromFile.libraryOptionValues__val); }
+      oss << ", numLibraryOptions="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLibraryLoadFromFile.numLibraryOptions);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipLibraryUnload:
+      oss << "hipLibraryUnload(";
+      oss << "library="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLibraryUnload.library);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipLinkAddData:
+      oss << "hipLinkAddData(";
+      oss << "state="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLinkAddData.state);
+      oss << ", type="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLinkAddData.type);
+      oss << ", data="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLinkAddData.data);
+      oss << ", size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLinkAddData.size);
+      if (data->args.hipLinkAddData.name == NULL) oss << ", name=NULL";
+      else { oss << ", name="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLinkAddData.name__val); }
+      oss << ", numOptions="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLinkAddData.numOptions);
+      if (data->args.hipLinkAddData.options == NULL) oss << ", options=NULL";
+      else { oss << ", options="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLinkAddData.options__val); }
+      if (data->args.hipLinkAddData.optionValues == NULL) oss << ", optionValues=NULL";
+      else { oss << ", optionValues="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLinkAddData.optionValues__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipLinkAddFile:
+      oss << "hipLinkAddFile(";
+      oss << "state="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLinkAddFile.state);
+      oss << ", type="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLinkAddFile.type);
+      if (data->args.hipLinkAddFile.path == NULL) oss << ", path=NULL";
+      else { oss << ", path="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLinkAddFile.path__val); }
+      oss << ", numOptions="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLinkAddFile.numOptions);
+      if (data->args.hipLinkAddFile.options == NULL) oss << ", options=NULL";
+      else { oss << ", options="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLinkAddFile.options__val); }
+      if (data->args.hipLinkAddFile.optionValues == NULL) oss << ", optionValues=NULL";
+      else { oss << ", optionValues="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLinkAddFile.optionValues__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipLinkComplete:
+      oss << "hipLinkComplete(";
+      oss << "state="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLinkComplete.state);
+      if (data->args.hipLinkComplete.hipBinOut == NULL) oss << ", hipBinOut=NULL";
+      else { oss << ", hipBinOut="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLinkComplete.hipBinOut__val); }
+      if (data->args.hipLinkComplete.sizeOut == NULL) oss << ", sizeOut=NULL";
+      else { oss << ", sizeOut="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLinkComplete.sizeOut__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipLinkCreate:
+      oss << "hipLinkCreate(";
+      oss << "numOptions="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLinkCreate.numOptions);
+      if (data->args.hipLinkCreate.options == NULL) oss << ", options=NULL";
+      else { oss << ", options="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLinkCreate.options__val); }
+      if (data->args.hipLinkCreate.optionValues == NULL) oss << ", optionValues=NULL";
+      else { oss << ", optionValues="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLinkCreate.optionValues__val); }
+      if (data->args.hipLinkCreate.stateOut == NULL) oss << ", stateOut=NULL";
+      else { oss << ", stateOut="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLinkCreate.stateOut__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipLinkDestroy:
+      oss << "hipLinkDestroy(";
+      oss << "state="; roctracer::hip_support::detail::operator<<(oss, data->args.hipLinkDestroy.state);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMalloc:
+      oss << "hipMalloc(";
+      if (data->args.hipMalloc.ptr == NULL) oss << "ptr=NULL";
+      else { oss << "ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMalloc.ptr__val); }
+      oss << ", size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMalloc.size);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMalloc3D:
+      oss << "hipMalloc3D(";
+      if (data->args.hipMalloc3D.pitchedDevPtr == NULL) oss << "pitchedDevPtr=NULL";
+      else { oss << "pitchedDevPtr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMalloc3D.pitchedDevPtr__val); }
+      oss << ", extent="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMalloc3D.extent);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMalloc3DArray:
+      oss << "hipMalloc3DArray(";
+      if (data->args.hipMalloc3DArray.array == NULL) oss << "array=NULL";
+      else { oss << "array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMalloc3DArray.array__val); }
+      if (data->args.hipMalloc3DArray.desc == NULL) oss << ", desc=NULL";
+      else { oss << ", desc="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMalloc3DArray.desc__val); }
+      oss << ", extent="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMalloc3DArray.extent);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMalloc3DArray.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMallocArray:
+      oss << "hipMallocArray(";
+      if (data->args.hipMallocArray.array == NULL) oss << "array=NULL";
+      else { oss << "array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocArray.array__val); }
+      if (data->args.hipMallocArray.desc == NULL) oss << ", desc=NULL";
+      else { oss << ", desc="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocArray.desc__val); }
+      oss << ", width="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocArray.width);
+      oss << ", height="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocArray.height);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocArray.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMallocAsync:
+      oss << "hipMallocAsync(";
+      if (data->args.hipMallocAsync.dev_ptr == NULL) oss << "dev_ptr=NULL";
+      else { oss << "dev_ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocAsync.dev_ptr__val); }
+      oss << ", size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocAsync.size);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMallocFromPoolAsync:
+      oss << "hipMallocFromPoolAsync(";
+      if (data->args.hipMallocFromPoolAsync.dev_ptr == NULL) oss << "dev_ptr=NULL";
+      else { oss << "dev_ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocFromPoolAsync.dev_ptr__val); }
+      oss << ", size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocFromPoolAsync.size);
+      oss << ", mem_pool="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocFromPoolAsync.mem_pool);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocFromPoolAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMallocHost:
+      oss << "hipMallocHost(";
+      if (data->args.hipMallocHost.ptr == NULL) oss << "ptr=NULL";
+      else { oss << "ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocHost.ptr__val); }
+      oss << ", size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocHost.size);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMallocManaged:
+      oss << "hipMallocManaged(";
+      if (data->args.hipMallocManaged.dev_ptr == NULL) oss << "dev_ptr=NULL";
+      else { oss << "dev_ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocManaged.dev_ptr__val); }
+      oss << ", size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocManaged.size);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocManaged.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMallocMipmappedArray:
+      oss << "hipMallocMipmappedArray(";
+      if (data->args.hipMallocMipmappedArray.mipmappedArray == NULL) oss << "mipmappedArray=NULL";
+      else { oss << "mipmappedArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocMipmappedArray.mipmappedArray__val); }
+      if (data->args.hipMallocMipmappedArray.desc == NULL) oss << ", desc=NULL";
+      else { oss << ", desc="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocMipmappedArray.desc__val); }
+      oss << ", extent="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocMipmappedArray.extent);
+      oss << ", numLevels="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocMipmappedArray.numLevels);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocMipmappedArray.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMallocPitch:
+      oss << "hipMallocPitch(";
+      if (data->args.hipMallocPitch.ptr == NULL) oss << "ptr=NULL";
+      else { oss << "ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocPitch.ptr__val); }
+      if (data->args.hipMallocPitch.pitch == NULL) oss << ", pitch=NULL";
+      else { oss << ", pitch="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocPitch.pitch__val); }
+      oss << ", width="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocPitch.width);
+      oss << ", height="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMallocPitch.height);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemAddressFree:
+      oss << "hipMemAddressFree(";
+      oss << "devPtr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemAddressFree.devPtr);
+      oss << ", size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemAddressFree.size);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemAddressReserve:
+      oss << "hipMemAddressReserve(";
+      if (data->args.hipMemAddressReserve.ptr == NULL) oss << "ptr=NULL";
+      else { oss << "ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemAddressReserve.ptr__val); }
+      oss << ", size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemAddressReserve.size);
+      oss << ", alignment="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemAddressReserve.alignment);
+      oss << ", addr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemAddressReserve.addr);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemAddressReserve.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemAdvise:
+      oss << "hipMemAdvise(";
+      oss << "dev_ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemAdvise.dev_ptr);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemAdvise.count);
+      oss << ", advice="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemAdvise.advice);
+      oss << ", device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemAdvise.device);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemAdvise_v2:
+      oss << "hipMemAdvise_v2(";
+      oss << "dev_ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemAdvise_v2.dev_ptr);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemAdvise_v2.count);
+      oss << ", advice="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemAdvise_v2.advice);
+      oss << ", location="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemAdvise_v2.location);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemAllocHost:
+      oss << "hipMemAllocHost(";
+      if (data->args.hipMemAllocHost.ptr == NULL) oss << "ptr=NULL";
+      else { oss << "ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemAllocHost.ptr__val); }
+      oss << ", size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemAllocHost.size);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemAllocPitch:
+      oss << "hipMemAllocPitch(";
+      if (data->args.hipMemAllocPitch.dptr == NULL) oss << "dptr=NULL";
+      else { oss << "dptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemAllocPitch.dptr__val); }
+      if (data->args.hipMemAllocPitch.pitch == NULL) oss << ", pitch=NULL";
+      else { oss << ", pitch="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemAllocPitch.pitch__val); }
+      oss << ", widthInBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemAllocPitch.widthInBytes);
+      oss << ", height="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemAllocPitch.height);
+      oss << ", elementSizeBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemAllocPitch.elementSizeBytes);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemCreate:
+      oss << "hipMemCreate(";
+      if (data->args.hipMemCreate.handle == NULL) oss << "handle=NULL";
+      else { oss << "handle="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemCreate.handle__val); }
+      oss << ", size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemCreate.size);
+      if (data->args.hipMemCreate.prop == NULL) oss << ", prop=NULL";
+      else { oss << ", prop="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemCreate.prop__val); }
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemCreate.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemExportToShareableHandle:
+      oss << "hipMemExportToShareableHandle(";
+      oss << "shareableHandle="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemExportToShareableHandle.shareableHandle);
+      oss << ", handle="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemExportToShareableHandle.handle);
+      oss << ", handleType="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemExportToShareableHandle.handleType);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemExportToShareableHandle.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemGetAccess:
+      oss << "hipMemGetAccess(";
+      if (data->args.hipMemGetAccess.flags == NULL) oss << "flags=NULL";
+      else { oss << "flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemGetAccess.flags__val); }
+      if (data->args.hipMemGetAccess.location == NULL) oss << ", location=NULL";
+      else { oss << ", location="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemGetAccess.location__val); }
+      oss << ", ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemGetAccess.ptr);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemGetAddressRange:
+      oss << "hipMemGetAddressRange(";
+      if (data->args.hipMemGetAddressRange.pbase == NULL) oss << "pbase=NULL";
+      else { oss << "pbase="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemGetAddressRange.pbase__val); }
+      if (data->args.hipMemGetAddressRange.psize == NULL) oss << ", psize=NULL";
+      else { oss << ", psize="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemGetAddressRange.psize__val); }
+      oss << ", dptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemGetAddressRange.dptr);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemGetAllocationGranularity:
+      oss << "hipMemGetAllocationGranularity(";
+      if (data->args.hipMemGetAllocationGranularity.granularity == NULL) oss << "granularity=NULL";
+      else { oss << "granularity="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemGetAllocationGranularity.granularity__val); }
+      if (data->args.hipMemGetAllocationGranularity.prop == NULL) oss << ", prop=NULL";
+      else { oss << ", prop="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemGetAllocationGranularity.prop__val); }
+      oss << ", option="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemGetAllocationGranularity.option);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemGetAllocationPropertiesFromHandle:
+      oss << "hipMemGetAllocationPropertiesFromHandle(";
+      if (data->args.hipMemGetAllocationPropertiesFromHandle.prop == NULL) oss << "prop=NULL";
+      else { oss << "prop="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemGetAllocationPropertiesFromHandle.prop__val); }
+      oss << ", handle="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemGetAllocationPropertiesFromHandle.handle);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemGetHandleForAddressRange:
+      oss << "hipMemGetHandleForAddressRange(";
+      oss << "handle="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemGetHandleForAddressRange.handle);
+      oss << ", dptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemGetHandleForAddressRange.dptr);
+      oss << ", size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemGetHandleForAddressRange.size);
+      oss << ", handleType="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemGetHandleForAddressRange.handleType);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemGetHandleForAddressRange.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemGetInfo:
+      oss << "hipMemGetInfo(";
+      if (data->args.hipMemGetInfo.free == NULL) oss << "free=NULL";
+      else { oss << "free="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemGetInfo.free__val); }
+      if (data->args.hipMemGetInfo.total == NULL) oss << ", total=NULL";
+      else { oss << ", total="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemGetInfo.total__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemImportFromShareableHandle:
+      oss << "hipMemImportFromShareableHandle(";
+      if (data->args.hipMemImportFromShareableHandle.handle == NULL) oss << "handle=NULL";
+      else { oss << "handle="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemImportFromShareableHandle.handle__val); }
+      oss << ", osHandle="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemImportFromShareableHandle.osHandle);
+      oss << ", shHandleType="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemImportFromShareableHandle.shHandleType);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemMap:
+      oss << "hipMemMap(";
+      oss << "ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemMap.ptr);
+      oss << ", size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemMap.size);
+      oss << ", offset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemMap.offset);
+      oss << ", handle="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemMap.handle);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemMap.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemMapArrayAsync:
+      oss << "hipMemMapArrayAsync(";
+      if (data->args.hipMemMapArrayAsync.mapInfoList == NULL) oss << "mapInfoList=NULL";
+      else { oss << "mapInfoList="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemMapArrayAsync.mapInfoList__val); }
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemMapArrayAsync.count);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemMapArrayAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemPoolCreate:
+      oss << "hipMemPoolCreate(";
+      if (data->args.hipMemPoolCreate.mem_pool == NULL) oss << "mem_pool=NULL";
+      else { oss << "mem_pool="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolCreate.mem_pool__val); }
+      if (data->args.hipMemPoolCreate.pool_props == NULL) oss << ", pool_props=NULL";
+      else { oss << ", pool_props="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolCreate.pool_props__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemPoolDestroy:
+      oss << "hipMemPoolDestroy(";
+      oss << "mem_pool="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolDestroy.mem_pool);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemPoolExportPointer:
+      oss << "hipMemPoolExportPointer(";
+      if (data->args.hipMemPoolExportPointer.export_data == NULL) oss << "export_data=NULL";
+      else { oss << "export_data="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolExportPointer.export_data__val); }
+      oss << ", dev_ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolExportPointer.dev_ptr);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemPoolExportToShareableHandle:
+      oss << "hipMemPoolExportToShareableHandle(";
+      oss << "shared_handle="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolExportToShareableHandle.shared_handle);
+      oss << ", mem_pool="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolExportToShareableHandle.mem_pool);
+      oss << ", handle_type="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolExportToShareableHandle.handle_type);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolExportToShareableHandle.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemPoolGetAccess:
+      oss << "hipMemPoolGetAccess(";
+      if (data->args.hipMemPoolGetAccess.flags == NULL) oss << "flags=NULL";
+      else { oss << "flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolGetAccess.flags__val); }
+      oss << ", mem_pool="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolGetAccess.mem_pool);
+      if (data->args.hipMemPoolGetAccess.location == NULL) oss << ", location=NULL";
+      else { oss << ", location="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolGetAccess.location__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemPoolGetAttribute:
+      oss << "hipMemPoolGetAttribute(";
+      oss << "mem_pool="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolGetAttribute.mem_pool);
+      oss << ", attr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolGetAttribute.attr);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolGetAttribute.value);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemPoolImportFromShareableHandle:
+      oss << "hipMemPoolImportFromShareableHandle(";
+      if (data->args.hipMemPoolImportFromShareableHandle.mem_pool == NULL) oss << "mem_pool=NULL";
+      else { oss << "mem_pool="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolImportFromShareableHandle.mem_pool__val); }
+      oss << ", shared_handle="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolImportFromShareableHandle.shared_handle);
+      oss << ", handle_type="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolImportFromShareableHandle.handle_type);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolImportFromShareableHandle.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemPoolImportPointer:
+      oss << "hipMemPoolImportPointer(";
+      if (data->args.hipMemPoolImportPointer.dev_ptr == NULL) oss << "dev_ptr=NULL";
+      else { oss << "dev_ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolImportPointer.dev_ptr__val); }
+      oss << ", mem_pool="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolImportPointer.mem_pool);
+      if (data->args.hipMemPoolImportPointer.export_data == NULL) oss << ", export_data=NULL";
+      else { oss << ", export_data="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolImportPointer.export_data__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemPoolSetAccess:
+      oss << "hipMemPoolSetAccess(";
+      oss << "mem_pool="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolSetAccess.mem_pool);
+      if (data->args.hipMemPoolSetAccess.desc_list == NULL) oss << ", desc_list=NULL";
+      else { oss << ", desc_list="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolSetAccess.desc_list__val); }
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolSetAccess.count);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemPoolSetAttribute:
+      oss << "hipMemPoolSetAttribute(";
+      oss << "mem_pool="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolSetAttribute.mem_pool);
+      oss << ", attr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolSetAttribute.attr);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolSetAttribute.value);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemPoolTrimTo:
+      oss << "hipMemPoolTrimTo(";
+      oss << "mem_pool="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolTrimTo.mem_pool);
+      oss << ", min_bytes_to_hold="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPoolTrimTo.min_bytes_to_hold);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemPrefetchAsync:
+      oss << "hipMemPrefetchAsync(";
+      oss << "dev_ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPrefetchAsync.dev_ptr);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPrefetchAsync.count);
+      oss << ", device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPrefetchAsync.device);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPrefetchAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemPrefetchAsync_v2:
+      oss << "hipMemPrefetchAsync_v2(";
+      oss << "dev_ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPrefetchAsync_v2.dev_ptr);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPrefetchAsync_v2.count);
+      oss << ", location="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPrefetchAsync_v2.location);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPrefetchAsync_v2.flags);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPrefetchAsync_v2.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemPtrGetInfo:
+      oss << "hipMemPtrGetInfo(";
+      oss << "ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPtrGetInfo.ptr);
+      if (data->args.hipMemPtrGetInfo.size == NULL) oss << ", size=NULL";
+      else { oss << ", size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemPtrGetInfo.size__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemRangeGetAttribute:
+      oss << "hipMemRangeGetAttribute(";
+      oss << "data="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemRangeGetAttribute.data);
+      oss << ", data_size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemRangeGetAttribute.data_size);
+      oss << ", attribute="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemRangeGetAttribute.attribute);
+      oss << ", dev_ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemRangeGetAttribute.dev_ptr);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemRangeGetAttribute.count);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemRangeGetAttributes:
+      oss << "hipMemRangeGetAttributes(";
+      if (data->args.hipMemRangeGetAttributes.data == NULL) oss << "data=NULL";
+      else { oss << "data="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemRangeGetAttributes.data__val); }
+      if (data->args.hipMemRangeGetAttributes.data_sizes == NULL) oss << ", data_sizes=NULL";
+      else { oss << ", data_sizes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemRangeGetAttributes.data_sizes__val); }
+      if (data->args.hipMemRangeGetAttributes.attributes == NULL) oss << ", attributes=NULL";
+      else { oss << ", attributes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemRangeGetAttributes.attributes__val); }
+      oss << ", num_attributes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemRangeGetAttributes.num_attributes);
+      oss << ", dev_ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemRangeGetAttributes.dev_ptr);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemRangeGetAttributes.count);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemRelease:
+      oss << "hipMemRelease(";
+      oss << "handle="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemRelease.handle);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemRetainAllocationHandle:
+      oss << "hipMemRetainAllocationHandle(";
+      if (data->args.hipMemRetainAllocationHandle.handle == NULL) oss << "handle=NULL";
+      else { oss << "handle="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemRetainAllocationHandle.handle__val); }
+      oss << ", addr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemRetainAllocationHandle.addr);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemSetAccess:
+      oss << "hipMemSetAccess(";
+      oss << "ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemSetAccess.ptr);
+      oss << ", size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemSetAccess.size);
+      if (data->args.hipMemSetAccess.desc == NULL) oss << ", desc=NULL";
+      else { oss << ", desc="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemSetAccess.desc__val); }
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemSetAccess.count);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemUnmap:
+      oss << "hipMemUnmap(";
+      oss << "ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemUnmap.ptr);
+      oss << ", size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemUnmap.size);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpy:
+      oss << "hipMemcpy(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy.dst);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy.src);
+      oss << ", sizeBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy.sizeBytes);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy.kind);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpy2D:
+      oss << "hipMemcpy2D(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2D.dst);
+      oss << ", dpitch="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2D.dpitch);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2D.src);
+      oss << ", spitch="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2D.spitch);
+      oss << ", width="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2D.width);
+      oss << ", height="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2D.height);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2D.kind);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpy2DArrayToArray:
+      oss << "hipMemcpy2DArrayToArray(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DArrayToArray.dst);
+      oss << ", wOffsetDst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DArrayToArray.wOffsetDst);
+      oss << ", hOffsetDst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DArrayToArray.hOffsetDst);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DArrayToArray.src);
+      oss << ", wOffsetSrc="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DArrayToArray.wOffsetSrc);
+      oss << ", hOffsetSrc="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DArrayToArray.hOffsetSrc);
+      oss << ", width="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DArrayToArray.width);
+      oss << ", height="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DArrayToArray.height);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DArrayToArray.kind);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpy2DAsync:
+      oss << "hipMemcpy2DAsync(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DAsync.dst);
+      oss << ", dpitch="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DAsync.dpitch);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DAsync.src);
+      oss << ", spitch="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DAsync.spitch);
+      oss << ", width="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DAsync.width);
+      oss << ", height="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DAsync.height);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DAsync.kind);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpy2DFromArray:
+      oss << "hipMemcpy2DFromArray(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DFromArray.dst);
+      oss << ", dpitch="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DFromArray.dpitch);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DFromArray.src);
+      oss << ", wOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DFromArray.wOffset);
+      oss << ", hOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DFromArray.hOffset);
+      oss << ", width="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DFromArray.width);
+      oss << ", height="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DFromArray.height);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DFromArray.kind);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpy2DFromArrayAsync:
+      oss << "hipMemcpy2DFromArrayAsync(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DFromArrayAsync.dst);
+      oss << ", dpitch="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DFromArrayAsync.dpitch);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DFromArrayAsync.src);
+      oss << ", wOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DFromArrayAsync.wOffset);
+      oss << ", hOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DFromArrayAsync.hOffset);
+      oss << ", width="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DFromArrayAsync.width);
+      oss << ", height="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DFromArrayAsync.height);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DFromArrayAsync.kind);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DFromArrayAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpy2DToArray:
+      oss << "hipMemcpy2DToArray(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DToArray.dst);
+      oss << ", wOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DToArray.wOffset);
+      oss << ", hOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DToArray.hOffset);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DToArray.src);
+      oss << ", spitch="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DToArray.spitch);
+      oss << ", width="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DToArray.width);
+      oss << ", height="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DToArray.height);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DToArray.kind);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpy2DToArrayAsync:
+      oss << "hipMemcpy2DToArrayAsync(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DToArrayAsync.dst);
+      oss << ", wOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DToArrayAsync.wOffset);
+      oss << ", hOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DToArrayAsync.hOffset);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DToArrayAsync.src);
+      oss << ", spitch="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DToArrayAsync.spitch);
+      oss << ", width="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DToArrayAsync.width);
+      oss << ", height="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DToArrayAsync.height);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DToArrayAsync.kind);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DToArrayAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpy3D:
+      oss << "hipMemcpy3D(";
+      if (data->args.hipMemcpy3D.p == NULL) oss << "p=NULL";
+      else { oss << "p="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy3D.p__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpy3DAsync:
+      oss << "hipMemcpy3DAsync(";
+      if (data->args.hipMemcpy3DAsync.p == NULL) oss << "p=NULL";
+      else { oss << "p="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy3DAsync.p__val); }
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy3DAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpy3DBatchAsync:
+      oss << "hipMemcpy3DBatchAsync(";
+      oss << "numOps="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy3DBatchAsync.numOps);
+      if (data->args.hipMemcpy3DBatchAsync.opList == NULL) oss << ", opList=NULL";
+      else { oss << ", opList="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy3DBatchAsync.opList__val); }
+      if (data->args.hipMemcpy3DBatchAsync.failIdx == NULL) oss << ", failIdx=NULL";
+      else { oss << ", failIdx="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy3DBatchAsync.failIdx__val); }
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy3DBatchAsync.flags);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy3DBatchAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpy3DPeer:
+      oss << "hipMemcpy3DPeer(";
+      if (data->args.hipMemcpy3DPeer.p == NULL) oss << "p=NULL";
+      else { oss << "p="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy3DPeer.p__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpy3DPeerAsync:
+      oss << "hipMemcpy3DPeerAsync(";
+      if (data->args.hipMemcpy3DPeerAsync.p == NULL) oss << "p=NULL";
+      else { oss << "p="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy3DPeerAsync.p__val); }
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy3DPeerAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyAsync:
+      oss << "hipMemcpyAsync(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAsync.dst);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAsync.src);
+      oss << ", sizeBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAsync.sizeBytes);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAsync.kind);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyAtoA:
+      oss << "hipMemcpyAtoA(";
+      oss << "dstArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoA.dstArray);
+      oss << ", dstOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoA.dstOffset);
+      oss << ", srcArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoA.srcArray);
+      oss << ", srcOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoA.srcOffset);
+      oss << ", ByteCount="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoA.ByteCount);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyAtoD:
+      oss << "hipMemcpyAtoD(";
+      oss << "dstDevice="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoD.dstDevice);
+      oss << ", srcArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoD.srcArray);
+      oss << ", srcOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoD.srcOffset);
+      oss << ", ByteCount="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoD.ByteCount);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyAtoH:
+      oss << "hipMemcpyAtoH(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoH.dst);
+      oss << ", srcArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoH.srcArray);
+      oss << ", srcOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoH.srcOffset);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoH.count);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyAtoHAsync:
+      oss << "hipMemcpyAtoHAsync(";
+      oss << "dstHost="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoHAsync.dstHost);
+      oss << ", srcArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoHAsync.srcArray);
+      oss << ", srcOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoHAsync.srcOffset);
+      oss << ", ByteCount="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoHAsync.ByteCount);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoHAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyBatchAsync:
+      oss << "hipMemcpyBatchAsync(";
+      if (data->args.hipMemcpyBatchAsync.dsts == NULL) oss << "dsts=NULL";
+      else { oss << "dsts="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyBatchAsync.dsts__val); }
+      if (data->args.hipMemcpyBatchAsync.srcs == NULL) oss << ", srcs=NULL";
+      else { oss << ", srcs="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyBatchAsync.srcs__val); }
+      if (data->args.hipMemcpyBatchAsync.sizes == NULL) oss << ", sizes=NULL";
+      else { oss << ", sizes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyBatchAsync.sizes__val); }
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyBatchAsync.count);
+      if (data->args.hipMemcpyBatchAsync.attrs == NULL) oss << ", attrs=NULL";
+      else { oss << ", attrs="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyBatchAsync.attrs__val); }
+      if (data->args.hipMemcpyBatchAsync.attrsIdxs == NULL) oss << ", attrsIdxs=NULL";
+      else { oss << ", attrsIdxs="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyBatchAsync.attrsIdxs__val); }
+      oss << ", numAttrs="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyBatchAsync.numAttrs);
+      if (data->args.hipMemcpyBatchAsync.failIdx == NULL) oss << ", failIdx=NULL";
+      else { oss << ", failIdx="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyBatchAsync.failIdx__val); }
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyBatchAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyDtoA:
+      oss << "hipMemcpyDtoA(";
+      oss << "dstArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoA.dstArray);
+      oss << ", dstOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoA.dstOffset);
+      oss << ", srcDevice="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoA.srcDevice);
+      oss << ", ByteCount="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoA.ByteCount);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyDtoD:
+      oss << "hipMemcpyDtoD(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoD.dst);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoD.src);
+      oss << ", sizeBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoD.sizeBytes);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyDtoDAsync:
+      oss << "hipMemcpyDtoDAsync(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoDAsync.dst);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoDAsync.src);
+      oss << ", sizeBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoDAsync.sizeBytes);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoDAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyDtoH:
+      oss << "hipMemcpyDtoH(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoH.dst);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoH.src);
+      oss << ", sizeBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoH.sizeBytes);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyDtoHAsync:
+      oss << "hipMemcpyDtoHAsync(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoHAsync.dst);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoHAsync.src);
+      oss << ", sizeBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoHAsync.sizeBytes);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoHAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyFromArray:
+      oss << "hipMemcpyFromArray(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyFromArray.dst);
+      oss << ", srcArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyFromArray.srcArray);
+      oss << ", wOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyFromArray.wOffset);
+      oss << ", hOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyFromArray.hOffset);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyFromArray.count);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyFromArray.kind);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyFromSymbol:
+      oss << "hipMemcpyFromSymbol(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyFromSymbol.dst);
+      oss << ", symbol="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyFromSymbol.symbol);
+      oss << ", sizeBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyFromSymbol.sizeBytes);
+      oss << ", offset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyFromSymbol.offset);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyFromSymbol.kind);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyFromSymbolAsync:
+      oss << "hipMemcpyFromSymbolAsync(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyFromSymbolAsync.dst);
+      oss << ", symbol="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyFromSymbolAsync.symbol);
+      oss << ", sizeBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyFromSymbolAsync.sizeBytes);
+      oss << ", offset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyFromSymbolAsync.offset);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyFromSymbolAsync.kind);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyFromSymbolAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyHtoA:
+      oss << "hipMemcpyHtoA(";
+      oss << "dstArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoA.dstArray);
+      oss << ", dstOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoA.dstOffset);
+      oss << ", srcHost="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoA.srcHost);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoA.count);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyHtoAAsync:
+      oss << "hipMemcpyHtoAAsync(";
+      oss << "dstArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoAAsync.dstArray);
+      oss << ", dstOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoAAsync.dstOffset);
+      oss << ", srcHost="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoAAsync.srcHost);
+      oss << ", ByteCount="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoAAsync.ByteCount);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoAAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyHtoD:
+      oss << "hipMemcpyHtoD(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoD.dst);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoD.src);
+      oss << ", sizeBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoD.sizeBytes);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyHtoDAsync:
+      oss << "hipMemcpyHtoDAsync(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoDAsync.dst);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoDAsync.src);
+      oss << ", sizeBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoDAsync.sizeBytes);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoDAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyParam2D:
+      oss << "hipMemcpyParam2D(";
+      if (data->args.hipMemcpyParam2D.pCopy == NULL) oss << "pCopy=NULL";
+      else { oss << "pCopy="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyParam2D.pCopy__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyParam2DAsync:
+      oss << "hipMemcpyParam2DAsync(";
+      if (data->args.hipMemcpyParam2DAsync.pCopy == NULL) oss << "pCopy=NULL";
+      else { oss << "pCopy="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyParam2DAsync.pCopy__val); }
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyParam2DAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyPeer:
+      oss << "hipMemcpyPeer(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyPeer.dst);
+      oss << ", dstDeviceId="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyPeer.dstDeviceId);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyPeer.src);
+      oss << ", srcDeviceId="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyPeer.srcDeviceId);
+      oss << ", sizeBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyPeer.sizeBytes);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyPeerAsync:
+      oss << "hipMemcpyPeerAsync(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyPeerAsync.dst);
+      oss << ", dstDeviceId="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyPeerAsync.dstDeviceId);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyPeerAsync.src);
+      oss << ", srcDevice="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyPeerAsync.srcDevice);
+      oss << ", sizeBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyPeerAsync.sizeBytes);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyPeerAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyToArray:
+      oss << "hipMemcpyToArray(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyToArray.dst);
+      oss << ", wOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyToArray.wOffset);
+      oss << ", hOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyToArray.hOffset);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyToArray.src);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyToArray.count);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyToArray.kind);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyToSymbol:
+      oss << "hipMemcpyToSymbol(";
+      oss << "symbol="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyToSymbol.symbol);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyToSymbol.src);
+      oss << ", sizeBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyToSymbol.sizeBytes);
+      oss << ", offset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyToSymbol.offset);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyToSymbol.kind);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyToSymbolAsync:
+      oss << "hipMemcpyToSymbolAsync(";
+      oss << "symbol="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyToSymbolAsync.symbol);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyToSymbolAsync.src);
+      oss << ", sizeBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyToSymbolAsync.sizeBytes);
+      oss << ", offset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyToSymbolAsync.offset);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyToSymbolAsync.kind);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyToSymbolAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemcpyWithStream:
+      oss << "hipMemcpyWithStream(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyWithStream.dst);
+      oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyWithStream.src);
+      oss << ", sizeBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyWithStream.sizeBytes);
+      oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyWithStream.kind);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyWithStream.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemset:
+      oss << "hipMemset(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemset.dst);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemset.value);
+      oss << ", sizeBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemset.sizeBytes);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemset2D:
+      oss << "hipMemset2D(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemset2D.dst);
+      oss << ", pitch="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemset2D.pitch);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemset2D.value);
+      oss << ", width="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemset2D.width);
+      oss << ", height="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemset2D.height);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemset2DAsync:
+      oss << "hipMemset2DAsync(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemset2DAsync.dst);
+      oss << ", pitch="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemset2DAsync.pitch);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemset2DAsync.value);
+      oss << ", width="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemset2DAsync.width);
+      oss << ", height="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemset2DAsync.height);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemset2DAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemset3D:
+      oss << "hipMemset3D(";
+      oss << "pitchedDevPtr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemset3D.pitchedDevPtr);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemset3D.value);
+      oss << ", extent="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemset3D.extent);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemset3DAsync:
+      oss << "hipMemset3DAsync(";
+      oss << "pitchedDevPtr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemset3DAsync.pitchedDevPtr);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemset3DAsync.value);
+      oss << ", extent="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemset3DAsync.extent);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemset3DAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemsetAsync:
+      oss << "hipMemsetAsync(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetAsync.dst);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetAsync.value);
+      oss << ", sizeBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetAsync.sizeBytes);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemsetD16:
+      oss << "hipMemsetD16(";
+      oss << "dest="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD16.dest);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD16.value);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD16.count);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemsetD16Async:
+      oss << "hipMemsetD16Async(";
+      oss << "dest="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD16Async.dest);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD16Async.value);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD16Async.count);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD16Async.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemsetD2D16:
+      oss << "hipMemsetD2D16(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D16.dst);
+      oss << ", dstPitch="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D16.dstPitch);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D16.value);
+      oss << ", width="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D16.width);
+      oss << ", height="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D16.height);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemsetD2D16Async:
+      oss << "hipMemsetD2D16Async(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D16Async.dst);
+      oss << ", dstPitch="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D16Async.dstPitch);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D16Async.value);
+      oss << ", width="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D16Async.width);
+      oss << ", height="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D16Async.height);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D16Async.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemsetD2D32:
+      oss << "hipMemsetD2D32(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D32.dst);
+      oss << ", dstPitch="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D32.dstPitch);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D32.value);
+      oss << ", width="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D32.width);
+      oss << ", height="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D32.height);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemsetD2D32Async:
+      oss << "hipMemsetD2D32Async(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D32Async.dst);
+      oss << ", dstPitch="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D32Async.dstPitch);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D32Async.value);
+      oss << ", width="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D32Async.width);
+      oss << ", height="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D32Async.height);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D32Async.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemsetD2D8:
+      oss << "hipMemsetD2D8(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D8.dst);
+      oss << ", dstPitch="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D8.dstPitch);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D8.value);
+      oss << ", width="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D8.width);
+      oss << ", height="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D8.height);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemsetD2D8Async:
+      oss << "hipMemsetD2D8Async(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D8Async.dst);
+      oss << ", dstPitch="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D8Async.dstPitch);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D8Async.value);
+      oss << ", width="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D8Async.width);
+      oss << ", height="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D8Async.height);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD2D8Async.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemsetD32:
+      oss << "hipMemsetD32(";
+      oss << "dest="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD32.dest);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD32.value);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD32.count);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemsetD32Async:
+      oss << "hipMemsetD32Async(";
+      oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD32Async.dst);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD32Async.value);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD32Async.count);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD32Async.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemsetD8:
+      oss << "hipMemsetD8(";
+      oss << "dest="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD8.dest);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD8.value);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD8.count);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMemsetD8Async:
+      oss << "hipMemsetD8Async(";
+      oss << "dest="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD8Async.dest);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD8Async.value);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD8Async.count);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemsetD8Async.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMipmappedArrayCreate:
+      oss << "hipMipmappedArrayCreate(";
+      if (data->args.hipMipmappedArrayCreate.pHandle == NULL) oss << "pHandle=NULL";
+      else { oss << "pHandle="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMipmappedArrayCreate.pHandle__val); }
+      if (data->args.hipMipmappedArrayCreate.pMipmappedArrayDesc == NULL) oss << ", pMipmappedArrayDesc=NULL";
+      else { oss << ", pMipmappedArrayDesc="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMipmappedArrayCreate.pMipmappedArrayDesc__val); }
+      oss << ", numMipmapLevels="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMipmappedArrayCreate.numMipmapLevels);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMipmappedArrayDestroy:
+      oss << "hipMipmappedArrayDestroy(";
+      oss << "hMipmappedArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMipmappedArrayDestroy.hMipmappedArray);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipMipmappedArrayGetLevel:
+      oss << "hipMipmappedArrayGetLevel(";
+      if (data->args.hipMipmappedArrayGetLevel.pLevelArray == NULL) oss << "pLevelArray=NULL";
+      else { oss << "pLevelArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMipmappedArrayGetLevel.pLevelArray__val); }
+      oss << ", hMipMappedArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMipmappedArrayGetLevel.hMipMappedArray);
+      oss << ", level="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMipmappedArrayGetLevel.level);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipModuleGetFunction:
+      oss << "hipModuleGetFunction(";
+      if (data->args.hipModuleGetFunction.function == NULL) oss << "function=NULL";
+      else { oss << "function="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleGetFunction.function__val); }
+      oss << ", module="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleGetFunction.module);
+      if (data->args.hipModuleGetFunction.kname == NULL) oss << ", kname=NULL";
+      else { oss << ", kname="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleGetFunction.kname__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipModuleGetFunctionCount:
+      oss << "hipModuleGetFunctionCount(";
+      if (data->args.hipModuleGetFunctionCount.count == NULL) oss << "count=NULL";
+      else { oss << "count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleGetFunctionCount.count__val); }
+      oss << ", mod="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleGetFunctionCount.mod);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipModuleGetGlobal:
+      oss << "hipModuleGetGlobal(";
+      if (data->args.hipModuleGetGlobal.dptr == NULL) oss << "dptr=NULL";
+      else { oss << "dptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleGetGlobal.dptr__val); }
+      if (data->args.hipModuleGetGlobal.bytes == NULL) oss << ", bytes=NULL";
+      else { oss << ", bytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleGetGlobal.bytes__val); }
+      oss << ", hmod="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleGetGlobal.hmod);
+      if (data->args.hipModuleGetGlobal.name == NULL) oss << ", name=NULL";
+      else { oss << ", name="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleGetGlobal.name__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipModuleGetTexRef:
+      oss << "hipModuleGetTexRef(";
+      if (data->args.hipModuleGetTexRef.texRef == NULL) oss << "texRef=NULL";
+      else { oss << "texRef="; roctracer::hip_support::detail::operator<<(oss, (void*)data->args.hipModuleGetTexRef.texRef__val); }
+      oss << ", hmod="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleGetTexRef.hmod);
+      if (data->args.hipModuleGetTexRef.name == NULL) oss << ", name=NULL";
+      else { oss << ", name="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleGetTexRef.name__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipModuleLaunchCooperativeKernel:
+      oss << "hipModuleLaunchCooperativeKernel(";
+      oss << "f="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchCooperativeKernel.f);
+      oss << ", gridDimX="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchCooperativeKernel.gridDimX);
+      oss << ", gridDimY="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchCooperativeKernel.gridDimY);
+      oss << ", gridDimZ="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchCooperativeKernel.gridDimZ);
+      oss << ", blockDimX="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchCooperativeKernel.blockDimX);
+      oss << ", blockDimY="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchCooperativeKernel.blockDimY);
+      oss << ", blockDimZ="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchCooperativeKernel.blockDimZ);
+      oss << ", sharedMemBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchCooperativeKernel.sharedMemBytes);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchCooperativeKernel.stream);
+      if (data->args.hipModuleLaunchCooperativeKernel.kernelParams == NULL) oss << ", kernelParams=NULL";
+      else { oss << ", kernelParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchCooperativeKernel.kernelParams__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipModuleLaunchCooperativeKernelMultiDevice:
+      oss << "hipModuleLaunchCooperativeKernelMultiDevice(";
+      if (data->args.hipModuleLaunchCooperativeKernelMultiDevice.launchParamsList == NULL) oss << "launchParamsList=NULL";
+      else { oss << "launchParamsList="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchCooperativeKernelMultiDevice.launchParamsList__val); }
+      oss << ", numDevices="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchCooperativeKernelMultiDevice.numDevices);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchCooperativeKernelMultiDevice.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipModuleLaunchKernel:
+      oss << "hipModuleLaunchKernel(";
+      oss << "f="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchKernel.f);
+      oss << ", gridDimX="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchKernel.gridDimX);
+      oss << ", gridDimY="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchKernel.gridDimY);
+      oss << ", gridDimZ="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchKernel.gridDimZ);
+      oss << ", blockDimX="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchKernel.blockDimX);
+      oss << ", blockDimY="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchKernel.blockDimY);
+      oss << ", blockDimZ="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchKernel.blockDimZ);
+      oss << ", sharedMemBytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchKernel.sharedMemBytes);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchKernel.stream);
+      if (data->args.hipModuleLaunchKernel.kernelParams == NULL) oss << ", kernelParams=NULL";
+      else { oss << ", kernelParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchKernel.kernelParams__val); }
+      if (data->args.hipModuleLaunchKernel.extra == NULL) oss << ", extra=NULL";
+      else { oss << ", extra="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLaunchKernel.extra__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipModuleLoad:
+      oss << "hipModuleLoad(";
+      if (data->args.hipModuleLoad.module == NULL) oss << "module=NULL";
+      else { oss << "module="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLoad.module__val); }
+      if (data->args.hipModuleLoad.fname == NULL) oss << ", fname=NULL";
+      else { oss << ", fname="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLoad.fname__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipModuleLoadData:
+      oss << "hipModuleLoadData(";
+      if (data->args.hipModuleLoadData.module == NULL) oss << "module=NULL";
+      else { oss << "module="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLoadData.module__val); }
+      oss << ", image="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLoadData.image);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipModuleLoadDataEx:
+      oss << "hipModuleLoadDataEx(";
+      if (data->args.hipModuleLoadDataEx.module == NULL) oss << "module=NULL";
+      else { oss << "module="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLoadDataEx.module__val); }
+      oss << ", image="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLoadDataEx.image);
+      oss << ", numOptions="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLoadDataEx.numOptions);
+      if (data->args.hipModuleLoadDataEx.options == NULL) oss << ", options=NULL";
+      else { oss << ", options="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLoadDataEx.options__val); }
+      if (data->args.hipModuleLoadDataEx.optionsValues == NULL) oss << ", optionsValues=NULL";
+      else { oss << ", optionsValues="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLoadDataEx.optionsValues__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipModuleLoadFatBinary:
+      oss << "hipModuleLoadFatBinary(";
+      if (data->args.hipModuleLoadFatBinary.module == NULL) oss << "module=NULL";
+      else { oss << "module="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLoadFatBinary.module__val); }
+      oss << ", fatbin="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleLoadFatBinary.fatbin);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor:
+      oss << "hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(";
+      if (data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks == NULL) oss << "numBlocks=NULL";
+      else { oss << "numBlocks="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks__val); }
+      oss << ", f="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.f);
+      oss << ", blockSize="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.blockSize);
+      oss << ", dynSharedMemPerBlk="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.dynSharedMemPerBlk);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags:
+      oss << "hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(";
+      if (data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks == NULL) oss << "numBlocks=NULL";
+      else { oss << "numBlocks="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks__val); }
+      oss << ", f="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.f);
+      oss << ", blockSize="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.blockSize);
+      oss << ", dynSharedMemPerBlk="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.dynSharedMemPerBlk);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipModuleOccupancyMaxPotentialBlockSize:
+      oss << "hipModuleOccupancyMaxPotentialBlockSize(";
+      if (data->args.hipModuleOccupancyMaxPotentialBlockSize.gridSize == NULL) oss << "gridSize=NULL";
+      else { oss << "gridSize="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleOccupancyMaxPotentialBlockSize.gridSize__val); }
+      if (data->args.hipModuleOccupancyMaxPotentialBlockSize.blockSize == NULL) oss << ", blockSize=NULL";
+      else { oss << ", blockSize="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleOccupancyMaxPotentialBlockSize.blockSize__val); }
+      oss << ", f="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleOccupancyMaxPotentialBlockSize.f);
+      oss << ", dynSharedMemPerBlk="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleOccupancyMaxPotentialBlockSize.dynSharedMemPerBlk);
+      oss << ", blockSizeLimit="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleOccupancyMaxPotentialBlockSize.blockSizeLimit);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipModuleOccupancyMaxPotentialBlockSizeWithFlags:
+      oss << "hipModuleOccupancyMaxPotentialBlockSizeWithFlags(";
+      if (data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.gridSize == NULL) oss << "gridSize=NULL";
+      else { oss << "gridSize="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.gridSize__val); }
+      if (data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.blockSize == NULL) oss << ", blockSize=NULL";
+      else { oss << ", blockSize="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.blockSize__val); }
+      oss << ", f="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.f);
+      oss << ", dynSharedMemPerBlk="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.dynSharedMemPerBlk);
+      oss << ", blockSizeLimit="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.blockSizeLimit);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipModuleUnload:
+      oss << "hipModuleUnload(";
+      oss << "module="; roctracer::hip_support::detail::operator<<(oss, data->args.hipModuleUnload.module);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipOccupancyMaxActiveBlocksPerMultiprocessor:
+      oss << "hipOccupancyMaxActiveBlocksPerMultiprocessor(";
+      if (data->args.hipOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks == NULL) oss << "numBlocks=NULL";
+      else { oss << "numBlocks="; roctracer::hip_support::detail::operator<<(oss, data->args.hipOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks__val); }
+      oss << ", f="; roctracer::hip_support::detail::operator<<(oss, data->args.hipOccupancyMaxActiveBlocksPerMultiprocessor.f);
+      oss << ", blockSize="; roctracer::hip_support::detail::operator<<(oss, data->args.hipOccupancyMaxActiveBlocksPerMultiprocessor.blockSize);
+      oss << ", dynamicSMemSize="; roctracer::hip_support::detail::operator<<(oss, data->args.hipOccupancyMaxActiveBlocksPerMultiprocessor.dynamicSMemSize);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags:
+      oss << "hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(";
+      if (data->args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks == NULL) oss << "numBlocks=NULL";
+      else { oss << "numBlocks="; roctracer::hip_support::detail::operator<<(oss, data->args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks__val); }
+      oss << ", f="; roctracer::hip_support::detail::operator<<(oss, data->args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.f);
+      oss << ", blockSize="; roctracer::hip_support::detail::operator<<(oss, data->args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.blockSize);
+      oss << ", dynamicSMemSize="; roctracer::hip_support::detail::operator<<(oss, data->args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.dynamicSMemSize);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipOccupancyMaxPotentialBlockSize:
+      oss << "hipOccupancyMaxPotentialBlockSize(";
+      if (data->args.hipOccupancyMaxPotentialBlockSize.gridSize == NULL) oss << "gridSize=NULL";
+      else { oss << "gridSize="; roctracer::hip_support::detail::operator<<(oss, data->args.hipOccupancyMaxPotentialBlockSize.gridSize__val); }
+      if (data->args.hipOccupancyMaxPotentialBlockSize.blockSize == NULL) oss << ", blockSize=NULL";
+      else { oss << ", blockSize="; roctracer::hip_support::detail::operator<<(oss, data->args.hipOccupancyMaxPotentialBlockSize.blockSize__val); }
+      oss << ", f="; roctracer::hip_support::detail::operator<<(oss, data->args.hipOccupancyMaxPotentialBlockSize.f);
+      oss << ", dynSharedMemPerBlk="; roctracer::hip_support::detail::operator<<(oss, data->args.hipOccupancyMaxPotentialBlockSize.dynSharedMemPerBlk);
+      oss << ", blockSizeLimit="; roctracer::hip_support::detail::operator<<(oss, data->args.hipOccupancyMaxPotentialBlockSize.blockSizeLimit);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipPeekAtLastError:
+      oss << "hipPeekAtLastError(";
+      oss << ")";
+    break;
+    case HIP_API_ID_hipPointerGetAttribute:
+      oss << "hipPointerGetAttribute(";
+      oss << "data="; roctracer::hip_support::detail::operator<<(oss, data->args.hipPointerGetAttribute.data);
+      oss << ", attribute="; roctracer::hip_support::detail::operator<<(oss, data->args.hipPointerGetAttribute.attribute);
+      oss << ", ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipPointerGetAttribute.ptr);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipPointerGetAttributes:
+      oss << "hipPointerGetAttributes(";
+      if (data->args.hipPointerGetAttributes.attributes == NULL) oss << "attributes=NULL";
+      else { oss << "attributes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipPointerGetAttributes.attributes__val); }
+      oss << ", ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipPointerGetAttributes.ptr);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipPointerSetAttribute:
+      oss << "hipPointerSetAttribute(";
+      oss << "value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipPointerSetAttribute.value);
+      oss << ", attribute="; roctracer::hip_support::detail::operator<<(oss, data->args.hipPointerSetAttribute.attribute);
+      oss << ", ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipPointerSetAttribute.ptr);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipProfilerStart:
+      oss << "hipProfilerStart(";
+      oss << ")";
+    break;
+    case HIP_API_ID_hipProfilerStop:
+      oss << "hipProfilerStop(";
+      oss << ")";
+    break;
+    case HIP_API_ID_hipRuntimeGetVersion:
+      oss << "hipRuntimeGetVersion(";
+      if (data->args.hipRuntimeGetVersion.runtimeVersion == NULL) oss << "runtimeVersion=NULL";
+      else { oss << "runtimeVersion="; roctracer::hip_support::detail::operator<<(oss, data->args.hipRuntimeGetVersion.runtimeVersion__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipSetDevice:
+      oss << "hipSetDevice(";
+      oss << "deviceId="; roctracer::hip_support::detail::operator<<(oss, data->args.hipSetDevice.deviceId);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipSetDeviceFlags:
+      oss << "hipSetDeviceFlags(";
+      oss << "flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipSetDeviceFlags.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipSetValidDevices:
+      oss << "hipSetValidDevices(";
+      if (data->args.hipSetValidDevices.device_arr == NULL) oss << "device_arr=NULL";
+      else { oss << "device_arr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipSetValidDevices.device_arr__val); }
+      oss << ", len="; roctracer::hip_support::detail::operator<<(oss, data->args.hipSetValidDevices.len);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipSetupArgument:
+      oss << "hipSetupArgument(";
+      oss << "arg="; roctracer::hip_support::detail::operator<<(oss, data->args.hipSetupArgument.arg);
+      oss << ", size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipSetupArgument.size);
+      oss << ", offset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipSetupArgument.offset);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipSignalExternalSemaphoresAsync:
+      oss << "hipSignalExternalSemaphoresAsync(";
+      if (data->args.hipSignalExternalSemaphoresAsync.extSemArray == NULL) oss << "extSemArray=NULL";
+      else { oss << "extSemArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipSignalExternalSemaphoresAsync.extSemArray__val); }
+      if (data->args.hipSignalExternalSemaphoresAsync.paramsArray == NULL) oss << ", paramsArray=NULL";
+      else { oss << ", paramsArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipSignalExternalSemaphoresAsync.paramsArray__val); }
+      oss << ", numExtSems="; roctracer::hip_support::detail::operator<<(oss, data->args.hipSignalExternalSemaphoresAsync.numExtSems);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipSignalExternalSemaphoresAsync.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamAddCallback:
+      oss << "hipStreamAddCallback(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamAddCallback.stream);
+      oss << ", callback="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamAddCallback.callback);
+      oss << ", userData="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamAddCallback.userData);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamAddCallback.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamAttachMemAsync:
+      oss << "hipStreamAttachMemAsync(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamAttachMemAsync.stream);
+      oss << ", dev_ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamAttachMemAsync.dev_ptr);
+      oss << ", length="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamAttachMemAsync.length);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamAttachMemAsync.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamBatchMemOp:
+      oss << "hipStreamBatchMemOp(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBatchMemOp.stream);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBatchMemOp.count);
+      if (data->args.hipStreamBatchMemOp.paramArray == NULL) oss << ", paramArray=NULL";
+      else { oss << ", paramArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBatchMemOp.paramArray__val); }
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBatchMemOp.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamBeginCapture:
+      oss << "hipStreamBeginCapture(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBeginCapture.stream);
+      oss << ", mode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBeginCapture.mode);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamBeginCaptureToGraph:
+      oss << "hipStreamBeginCaptureToGraph(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBeginCaptureToGraph.stream);
+      oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBeginCaptureToGraph.graph);
+      if (data->args.hipStreamBeginCaptureToGraph.dependencies == NULL) oss << ", dependencies=NULL";
+      else { oss << ", dependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBeginCaptureToGraph.dependencies__val); }
+      if (data->args.hipStreamBeginCaptureToGraph.dependencyData == NULL) oss << ", dependencyData=NULL";
+      else { oss << ", dependencyData="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBeginCaptureToGraph.dependencyData__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBeginCaptureToGraph.numDependencies);
+      oss << ", mode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBeginCaptureToGraph.mode);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamCreate:
+      oss << "hipStreamCreate(";
+      if (data->args.hipStreamCreate.stream == NULL) oss << "stream=NULL";
+      else { oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamCreate.stream__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamCreateWithFlags:
+      oss << "hipStreamCreateWithFlags(";
+      if (data->args.hipStreamCreateWithFlags.stream == NULL) oss << "stream=NULL";
+      else { oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamCreateWithFlags.stream__val); }
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamCreateWithFlags.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamCreateWithPriority:
+      oss << "hipStreamCreateWithPriority(";
+      if (data->args.hipStreamCreateWithPriority.stream == NULL) oss << "stream=NULL";
+      else { oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamCreateWithPriority.stream__val); }
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamCreateWithPriority.flags);
+      oss << ", priority="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamCreateWithPriority.priority);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamDestroy:
+      oss << "hipStreamDestroy(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamDestroy.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamEndCapture:
+      oss << "hipStreamEndCapture(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamEndCapture.stream);
+      if (data->args.hipStreamEndCapture.pGraph == NULL) oss << ", pGraph=NULL";
+      else { oss << ", pGraph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamEndCapture.pGraph__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamGetAttribute:
+      oss << "hipStreamGetAttribute(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetAttribute.stream);
+      oss << ", attr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetAttribute.attr);
+      if (data->args.hipStreamGetAttribute.value_out == NULL) oss << ", value_out=NULL";
+      else { oss << ", value_out="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetAttribute.value_out__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamGetCaptureInfo:
+      oss << "hipStreamGetCaptureInfo(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetCaptureInfo.stream);
+      if (data->args.hipStreamGetCaptureInfo.pCaptureStatus == NULL) oss << ", pCaptureStatus=NULL";
+      else { oss << ", pCaptureStatus="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetCaptureInfo.pCaptureStatus__val); }
+      if (data->args.hipStreamGetCaptureInfo.pId == NULL) oss << ", pId=NULL";
+      else { oss << ", pId="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetCaptureInfo.pId__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamGetCaptureInfo_v2:
+      oss << "hipStreamGetCaptureInfo_v2(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetCaptureInfo_v2.stream);
+      if (data->args.hipStreamGetCaptureInfo_v2.captureStatus_out == NULL) oss << ", captureStatus_out=NULL";
+      else { oss << ", captureStatus_out="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetCaptureInfo_v2.captureStatus_out__val); }
+      if (data->args.hipStreamGetCaptureInfo_v2.id_out == NULL) oss << ", id_out=NULL";
+      else { oss << ", id_out="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetCaptureInfo_v2.id_out__val); }
+      if (data->args.hipStreamGetCaptureInfo_v2.graph_out == NULL) oss << ", graph_out=NULL";
+      else { oss << ", graph_out="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetCaptureInfo_v2.graph_out__val); }
+      if (data->args.hipStreamGetCaptureInfo_v2.dependencies_out == NULL) oss << ", dependencies_out=NULL";
+      else { oss << ", dependencies_out="; roctracer::hip_support::detail::operator<<(oss, (void*)data->args.hipStreamGetCaptureInfo_v2.dependencies_out__val); }
+      if (data->args.hipStreamGetCaptureInfo_v2.numDependencies_out == NULL) oss << ", numDependencies_out=NULL";
+      else { oss << ", numDependencies_out="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetCaptureInfo_v2.numDependencies_out__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamGetDevice:
+      oss << "hipStreamGetDevice(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetDevice.stream);
+      if (data->args.hipStreamGetDevice.device == NULL) oss << ", device=NULL";
+      else { oss << ", device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetDevice.device__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamGetFlags:
+      oss << "hipStreamGetFlags(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetFlags.stream);
+      if (data->args.hipStreamGetFlags.flags == NULL) oss << ", flags=NULL";
+      else { oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetFlags.flags__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamGetId:
+      oss << "hipStreamGetId(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetId.stream);
+      if (data->args.hipStreamGetId.streamId == NULL) oss << ", streamId=NULL";
+      else { oss << ", streamId="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetId.streamId__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamGetPriority:
+      oss << "hipStreamGetPriority(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetPriority.stream);
+      if (data->args.hipStreamGetPriority.priority == NULL) oss << ", priority=NULL";
+      else { oss << ", priority="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetPriority.priority__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamIsCapturing:
+      oss << "hipStreamIsCapturing(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamIsCapturing.stream);
+      if (data->args.hipStreamIsCapturing.pCaptureStatus == NULL) oss << ", pCaptureStatus=NULL";
+      else { oss << ", pCaptureStatus="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamIsCapturing.pCaptureStatus__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamQuery:
+      oss << "hipStreamQuery(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamQuery.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamSetAttribute:
+      oss << "hipStreamSetAttribute(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamSetAttribute.stream);
+      oss << ", attr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamSetAttribute.attr);
+      if (data->args.hipStreamSetAttribute.value == NULL) oss << ", value=NULL";
+      else { oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamSetAttribute.value__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamSynchronize:
+      oss << "hipStreamSynchronize(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamSynchronize.stream);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamUpdateCaptureDependencies:
+      oss << "hipStreamUpdateCaptureDependencies(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamUpdateCaptureDependencies.stream);
+      if (data->args.hipStreamUpdateCaptureDependencies.dependencies == NULL) oss << ", dependencies=NULL";
+      else { oss << ", dependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamUpdateCaptureDependencies.dependencies__val); }
+      oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamUpdateCaptureDependencies.numDependencies);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamUpdateCaptureDependencies.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamWaitEvent:
+      oss << "hipStreamWaitEvent(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWaitEvent.stream);
+      oss << ", event="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWaitEvent.event);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWaitEvent.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamWaitValue32:
+      oss << "hipStreamWaitValue32(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWaitValue32.stream);
+      oss << ", ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWaitValue32.ptr);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWaitValue32.value);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWaitValue32.flags);
+      oss << ", mask="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWaitValue32.mask);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamWaitValue64:
+      oss << "hipStreamWaitValue64(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWaitValue64.stream);
+      oss << ", ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWaitValue64.ptr);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWaitValue64.value);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWaitValue64.flags);
+      oss << ", mask="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWaitValue64.mask);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamWriteValue32:
+      oss << "hipStreamWriteValue32(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWriteValue32.stream);
+      oss << ", ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWriteValue32.ptr);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWriteValue32.value);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWriteValue32.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipStreamWriteValue64:
+      oss << "hipStreamWriteValue64(";
+      oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWriteValue64.stream);
+      oss << ", ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWriteValue64.ptr);
+      oss << ", value="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWriteValue64.value);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWriteValue64.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipTexRefGetAddress:
+      oss << "hipTexRefGetAddress(";
+      if (data->args.hipTexRefGetAddress.dev_ptr == NULL) oss << "dev_ptr=NULL";
+      else { oss << "dev_ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetAddress.dev_ptr__val); }
+      if (data->args.hipTexRefGetAddress.texRef == NULL) oss << ", texRef=NULL";
+      else { oss << ", texRef="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetAddress.texRef__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipTexRefGetArray:
+      oss << "hipTexRefGetArray(";
+      if (data->args.hipTexRefGetArray.pArray == NULL) oss << "pArray=NULL";
+      else { oss << "pArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetArray.pArray__val); }
+      if (data->args.hipTexRefGetArray.texRef == NULL) oss << ", texRef=NULL";
+      else { oss << ", texRef="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetArray.texRef__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipTexRefGetBorderColor:
+      oss << "hipTexRefGetBorderColor(";
+      if (data->args.hipTexRefGetBorderColor.pBorderColor == NULL) oss << "pBorderColor=NULL";
+      else { oss << "pBorderColor="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetBorderColor.pBorderColor__val); }
+      if (data->args.hipTexRefGetBorderColor.texRef == NULL) oss << ", texRef=NULL";
+      else { oss << ", texRef="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetBorderColor.texRef__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipTexRefGetFlags:
+      oss << "hipTexRefGetFlags(";
+      if (data->args.hipTexRefGetFlags.pFlags == NULL) oss << "pFlags=NULL";
+      else { oss << "pFlags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetFlags.pFlags__val); }
+      if (data->args.hipTexRefGetFlags.texRef == NULL) oss << ", texRef=NULL";
+      else { oss << ", texRef="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetFlags.texRef__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipTexRefGetFormat:
+      oss << "hipTexRefGetFormat(";
+      if (data->args.hipTexRefGetFormat.pFormat == NULL) oss << "pFormat=NULL";
+      else { oss << "pFormat="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetFormat.pFormat__val); }
+      if (data->args.hipTexRefGetFormat.pNumChannels == NULL) oss << ", pNumChannels=NULL";
+      else { oss << ", pNumChannels="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetFormat.pNumChannels__val); }
+      if (data->args.hipTexRefGetFormat.texRef == NULL) oss << ", texRef=NULL";
+      else { oss << ", texRef="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetFormat.texRef__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipTexRefGetMaxAnisotropy:
+      oss << "hipTexRefGetMaxAnisotropy(";
+      if (data->args.hipTexRefGetMaxAnisotropy.pmaxAnsio == NULL) oss << "pmaxAnsio=NULL";
+      else { oss << "pmaxAnsio="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetMaxAnisotropy.pmaxAnsio__val); }
+      if (data->args.hipTexRefGetMaxAnisotropy.texRef == NULL) oss << ", texRef=NULL";
+      else { oss << ", texRef="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetMaxAnisotropy.texRef__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipTexRefGetMipMappedArray:
+      oss << "hipTexRefGetMipMappedArray(";
+      if (data->args.hipTexRefGetMipMappedArray.pArray == NULL) oss << "pArray=NULL";
+      else { oss << "pArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetMipMappedArray.pArray__val); }
+      if (data->args.hipTexRefGetMipMappedArray.texRef == NULL) oss << ", texRef=NULL";
+      else { oss << ", texRef="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetMipMappedArray.texRef__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipTexRefGetMipmapLevelBias:
+      oss << "hipTexRefGetMipmapLevelBias(";
+      if (data->args.hipTexRefGetMipmapLevelBias.pbias == NULL) oss << "pbias=NULL";
+      else { oss << "pbias="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetMipmapLevelBias.pbias__val); }
+      if (data->args.hipTexRefGetMipmapLevelBias.texRef == NULL) oss << ", texRef=NULL";
+      else { oss << ", texRef="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetMipmapLevelBias.texRef__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipTexRefGetMipmapLevelClamp:
+      oss << "hipTexRefGetMipmapLevelClamp(";
+      if (data->args.hipTexRefGetMipmapLevelClamp.pminMipmapLevelClamp == NULL) oss << "pminMipmapLevelClamp=NULL";
+      else { oss << "pminMipmapLevelClamp="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetMipmapLevelClamp.pminMipmapLevelClamp__val); }
+      if (data->args.hipTexRefGetMipmapLevelClamp.pmaxMipmapLevelClamp == NULL) oss << ", pmaxMipmapLevelClamp=NULL";
+      else { oss << ", pmaxMipmapLevelClamp="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetMipmapLevelClamp.pmaxMipmapLevelClamp__val); }
+      if (data->args.hipTexRefGetMipmapLevelClamp.texRef == NULL) oss << ", texRef=NULL";
+      else { oss << ", texRef="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetMipmapLevelClamp.texRef__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipTexRefSetAddress:
+      oss << "hipTexRefSetAddress(";
+      if (data->args.hipTexRefSetAddress.ByteOffset == NULL) oss << "ByteOffset=NULL";
+      else { oss << "ByteOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetAddress.ByteOffset__val); }
+      if (data->args.hipTexRefSetAddress.texRef == NULL) oss << ", texRef=NULL";
+      else { oss << ", texRef="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetAddress.texRef__val); }
+      oss << ", dptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetAddress.dptr);
+      oss << ", bytes="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetAddress.bytes);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipTexRefSetAddress2D:
+      oss << "hipTexRefSetAddress2D(";
+      if (data->args.hipTexRefSetAddress2D.texRef == NULL) oss << "texRef=NULL";
+      else { oss << "texRef="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetAddress2D.texRef__val); }
+      if (data->args.hipTexRefSetAddress2D.desc == NULL) oss << ", desc=NULL";
+      else { oss << ", desc="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetAddress2D.desc__val); }
+      oss << ", dptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetAddress2D.dptr);
+      oss << ", Pitch="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetAddress2D.Pitch);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipTexRefSetArray:
+      oss << "hipTexRefSetArray(";
+      if (data->args.hipTexRefSetArray.tex == NULL) oss << "tex=NULL";
+      else { oss << "tex="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetArray.tex__val); }
+      oss << ", array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetArray.array);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetArray.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipTexRefSetBorderColor:
+      oss << "hipTexRefSetBorderColor(";
+      if (data->args.hipTexRefSetBorderColor.texRef == NULL) oss << "texRef=NULL";
+      else { oss << "texRef="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetBorderColor.texRef__val); }
+      if (data->args.hipTexRefSetBorderColor.pBorderColor == NULL) oss << ", pBorderColor=NULL";
+      else { oss << ", pBorderColor="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetBorderColor.pBorderColor__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipTexRefSetFlags:
+      oss << "hipTexRefSetFlags(";
+      if (data->args.hipTexRefSetFlags.texRef == NULL) oss << "texRef=NULL";
+      else { oss << "texRef="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetFlags.texRef__val); }
+      oss << ", Flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetFlags.Flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipTexRefSetFormat:
+      oss << "hipTexRefSetFormat(";
+      if (data->args.hipTexRefSetFormat.texRef == NULL) oss << "texRef=NULL";
+      else { oss << "texRef="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetFormat.texRef__val); }
+      oss << ", fmt="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetFormat.fmt);
+      oss << ", NumPackedComponents="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetFormat.NumPackedComponents);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipTexRefSetMaxAnisotropy:
+      oss << "hipTexRefSetMaxAnisotropy(";
+      if (data->args.hipTexRefSetMaxAnisotropy.texRef == NULL) oss << "texRef=NULL";
+      else { oss << "texRef="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetMaxAnisotropy.texRef__val); }
+      oss << ", maxAniso="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetMaxAnisotropy.maxAniso);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipTexRefSetMipmapLevelBias:
+      oss << "hipTexRefSetMipmapLevelBias(";
+      if (data->args.hipTexRefSetMipmapLevelBias.texRef == NULL) oss << "texRef=NULL";
+      else { oss << "texRef="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetMipmapLevelBias.texRef__val); }
+      oss << ", bias="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetMipmapLevelBias.bias);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipTexRefSetMipmapLevelClamp:
+      oss << "hipTexRefSetMipmapLevelClamp(";
+      if (data->args.hipTexRefSetMipmapLevelClamp.texRef == NULL) oss << "texRef=NULL";
+      else { oss << "texRef="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetMipmapLevelClamp.texRef__val); }
+      oss << ", minMipMapLevelClamp="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetMipmapLevelClamp.minMipMapLevelClamp);
+      oss << ", maxMipMapLevelClamp="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetMipmapLevelClamp.maxMipMapLevelClamp);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipTexRefSetMipmappedArray:
+      oss << "hipTexRefSetMipmappedArray(";
+      if (data->args.hipTexRefSetMipmappedArray.texRef == NULL) oss << "texRef=NULL";
+      else { oss << "texRef="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetMipmappedArray.texRef__val); }
+      if (data->args.hipTexRefSetMipmappedArray.mipmappedArray == NULL) oss << ", mipmappedArray=NULL";
+      else { oss << ", mipmappedArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetMipmappedArray.mipmappedArray__val); }
+      oss << ", Flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefSetMipmappedArray.Flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipThreadExchangeStreamCaptureMode:
+      oss << "hipThreadExchangeStreamCaptureMode(";
+      if (data->args.hipThreadExchangeStreamCaptureMode.mode == NULL) oss << "mode=NULL";
+      else { oss << "mode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipThreadExchangeStreamCaptureMode.mode__val); }
+      oss << ")";
+    break;
+    case HIP_API_ID_hipUserObjectCreate:
+      oss << "hipUserObjectCreate(";
+      if (data->args.hipUserObjectCreate.object_out == NULL) oss << "object_out=NULL";
+      else { oss << "object_out="; roctracer::hip_support::detail::operator<<(oss, data->args.hipUserObjectCreate.object_out__val); }
+      oss << ", ptr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipUserObjectCreate.ptr);
+      oss << ", destroy="; roctracer::hip_support::detail::operator<<(oss, data->args.hipUserObjectCreate.destroy);
+      oss << ", initialRefcount="; roctracer::hip_support::detail::operator<<(oss, data->args.hipUserObjectCreate.initialRefcount);
+      oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipUserObjectCreate.flags);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipUserObjectRelease:
+      oss << "hipUserObjectRelease(";
+      oss << "object="; roctracer::hip_support::detail::operator<<(oss, data->args.hipUserObjectRelease.object);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipUserObjectRelease.count);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipUserObjectRetain:
+      oss << "hipUserObjectRetain(";
+      oss << "object="; roctracer::hip_support::detail::operator<<(oss, data->args.hipUserObjectRetain.object);
+      oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipUserObjectRetain.count);
+      oss << ")";
+    break;
+    case HIP_API_ID_hipWaitExternalSemaphoresAsync:
+      oss << "hipWaitExternalSemaphoresAsync(";
+      if (data->args.hipWaitExternalSemaphoresAsync.extSemArray == NULL) oss << "extSemArray=NULL";
+      else { oss << "extSemArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipWaitExternalSemaphoresAsync.extSemArray__val); }
+      if (data->args.hipWaitExternalSemaphoresAsync.paramsArray == NULL) oss << ", paramsArray=NULL";
+      else { oss << ", paramsArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipWaitExternalSemaphoresAsync.paramsArray__val); }
+      oss << ", numExtSems="; roctracer::hip_support::detail::operator<<(oss, data->args.hipWaitExternalSemaphoresAsync.numExtSems);
+      oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipWaitExternalSemaphoresAsync.stream);
+      oss << ")";
+    break;
+    default: oss << "unknown";
+  };
+  return strdup(oss.str().c_str());
+}
+#endif  // HIP_PROF_HIP_API_STRING
+#endif  // _HIP_PROF_STR_H
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h
new file mode 100644
index 0000000000000000000000000000000000000000..d53eeffca6a64c5c343293222b4cc1c663aed6cd
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h
@@ -0,0 +1,77 @@
+/*
+Copyright (c) 2019 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_PROF_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_PROF_H
+
+// HIP ROCclr op IDs; the `op` argument of hipEnableActivityCallback refers to these values
+enum HipVdiOpId {
+  kHipVdiOpIdDispatch = 0,
+  kHipVdiOpIdCopy = 1,
+  kHipVdiOpIdBarrier = 2,
+  kHipVdiOpIdNumber = 3  // one past the last ID above; presumably the count of op IDs - confirm
+};
+
+// Types of ROCclr commands (ids for hipGetCmdName); values look like OpenCL CL_COMMAND_* codes - TODO confirm
+enum HipVdiCommandKind {
+  kHipVdiCommandKernel = 0x11F0,
+  kHipVdiCommandTask = 0x11F1,
+  kHipVdiMemcpyDeviceToHost = 0x11F3,
+  kHipHipVdiMemcpyHostToDevice = 0x11F4,  // NOTE(review): "kHipHipVdi" differs from the sibling "kHipVdi" names; renaming would change the identifier
+  kHipVdiMemcpyDeviceToDevice = 0x11F5,
+  kHipVidMemcpyDeviceToHostRect = 0x1201,  // NOTE(review): "Vid" here vs "Vdi" in the sibling names; renaming would change the identifier
+  kHipVdiMemcpyHostToDeviceRect = 0x1202,
+  kHipVdiMemcpyDeviceToDeviceRect = 0x1203,
+  kHipVdiFillMemory = 0x1207,
+};
+
+/**
+ * @brief Initializes activity callback
+ *
+ * @param [in] id_callback Event ID callback function
+ * @param [in] op_callback Event operation callback function
+ * @param [in] arg         Arguments passed into callback
+ * @note Both callbacks are taken as untyped void*; their expected signatures are not declared in this header.
+ * @returns None
+ */
+void hipInitActivityCallback(void* id_callback, void* op_callback, void* arg);
+
+/**
+ * @brief Enables activity callback
+ *
+ * @param [in] op      Operation, which will trigger a callback (@see HipVdiOpId)
+ * @param [in] enable  Enable state for the callback
+ * @note This header includes nothing itself, so uint32_t (and bool in C) must already be declared by the includer.
+ * @returns True if successful
+ */
+bool hipEnableActivityCallback(uint32_t op, bool enable);
+
+/**
+ * @brief Returns the description string for the operation kind
+ *
+ * @param [in] id      Command kind id (@see HipVdiCommandKind)
+ *
+ * @returns A pointer to a const string with the command description
+ */
+const char* hipGetCmdName(uint32_t id);
+
+#endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_PROF_H
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/host_defines.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/host_defines.h
new file mode 100644
index 0000000000000000000000000000000000000000..8081966cf7afb515f676fdd5613909b1840ad08a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/host_defines.h
@@ -0,0 +1,262 @@
+/*
+Copyright (c) 2015 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ *  @file  amd_detail/host_defines.h
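+ *  @brief Host-side definitions: minimal type traits in namespace __hip_internal and the __host__/__device__/__global__ marker macros.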
+ */
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HOST_DEFINES_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HOST_DEFINES_H
+
+// Add guard to Generic Grid Launch method
+#ifndef GENERIC_GRID_LAUNCH
+#define GENERIC_GRID_LAUNCH 1
+#endif
+
+#if defined(__cplusplus)
+namespace __hip_internal {  // self-contained counterparts of std fixed-width types, type traits and index sequences
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+typedef signed char int8_t;
+typedef signed short int16_t;
+typedef signed int int32_t;
+typedef signed long long int64_t;
+#if defined(_MSC_VER)
+typedef unsigned long long size_t;  // MSVC: unsigned long is not used for size_t here
+#else
+typedef unsigned long size_t;  // assumes unsigned long matches the platform size_t - confirm for non-LP64 targets
+#endif
+
+template <class _Tp, _Tp __v> struct integral_constant {  // compile-time constant __v of type _Tp
+  static constexpr const _Tp value = __v;
+  typedef _Tp value_type;
+  typedef integral_constant type;
+  constexpr operator value_type() const { return value; }
+  constexpr value_type operator()() const { return value; }
+};
+template <class _Tp, _Tp __v> constexpr const _Tp integral_constant<_Tp, __v>::value;  // out-of-class definition of the static member
+
+typedef integral_constant<bool, true> true_type;
+typedef integral_constant<bool, false> false_type;
+
+template <bool B> using bool_constant = integral_constant<bool, B>;
+typedef bool_constant<true> true_type;  // repeats the typedef above with the same underlying type
+typedef bool_constant<false> false_type;  // repeats the typedef above with the same underlying type
+
+template <bool __B, class __T = void> struct enable_if {};  // no `type` member unless the condition is true
+template <class __T> struct enable_if<true, __T> {
+  typedef __T type;
+};
+
+template <bool _B> struct true_or_false_type : public false_type {};  // maps a bool to false_type/true_type by inheritance
+template <> struct true_or_false_type<true> : public true_type {};
+
+template <class _Tp> struct is_integral : public false_type {};  // true only for the exact types specialized below (no cv-qualified forms)
+template <> struct is_integral<bool> : public true_type {};
+template <> struct is_integral<char> : public true_type {};
+template <> struct is_integral<signed char> : public true_type {};
+template <> struct is_integral<unsigned char> : public true_type {};
+template <> struct is_integral<wchar_t> : public true_type {};
+template <> struct is_integral<short> : public true_type {};
+template <> struct is_integral<unsigned short> : public true_type {};
+template <> struct is_integral<int> : public true_type {};
+template <> struct is_integral<unsigned int> : public true_type {};
+template <> struct is_integral<long> : public true_type {};
+template <> struct is_integral<unsigned long> : public true_type {};
+template <> struct is_integral<long long> : public true_type {};
+template <> struct is_integral<unsigned long long> : public true_type {};
+
+template <class _Tp> struct is_arithmetic : public false_type {};  // NOTE(review): no long double specialization, unlike is_floating_point below - confirm intended
+template <> struct is_arithmetic<bool> : public true_type {};
+template <> struct is_arithmetic<char> : public true_type {};
+template <> struct is_arithmetic<signed char> : public true_type {};
+template <> struct is_arithmetic<unsigned char> : public true_type {};
+template <> struct is_arithmetic<wchar_t> : public true_type {};
+template <> struct is_arithmetic<short> : public true_type {};
+template <> struct is_arithmetic<unsigned short> : public true_type {};
+template <> struct is_arithmetic<int> : public true_type {};
+template <> struct is_arithmetic<unsigned int> : public true_type {};
+template <> struct is_arithmetic<long> : public true_type {};
+template <> struct is_arithmetic<unsigned long> : public true_type {};
+template <> struct is_arithmetic<long long> : public true_type {};
+template <> struct is_arithmetic<unsigned long long> : public true_type {};
+template <> struct is_arithmetic<float> : public true_type {};
+template <> struct is_arithmetic<double> : public true_type {};
+
+template <typename _Tp> struct is_floating_point : public false_type {};
+template <> struct is_floating_point<float> : public true_type {};
+template <> struct is_floating_point<double> : public true_type {};
+template <> struct is_floating_point<long double> : public true_type {};
+
+template <typename __T, typename __U> struct is_same : public false_type {};
+template <typename __T> struct is_same<__T, __T> : public true_type {};
+
+template <typename _Tp, bool = is_arithmetic<_Tp>::value> struct is_signed : public false_type {};  // non-arithmetic types (per is_arithmetic above) are never signed
+template <typename _Tp> struct is_signed<_Tp, true> : public true_or_false_type<_Tp(-1) < _Tp(0)> {
+};
+
+template <class T> auto test_returnable(int)  // chosen when "function returning T" is a valid type
+    -> decltype(void(static_cast<T (*)()>(nullptr)), true_type{});
+template <class> auto test_returnable(...) -> false_type;  // fallback overload
+
+template <class T> struct type_identity {
+  using type = T;
+};
+
+template <class T>  // Note that `cv void&` is a substitution failure
+auto try_add_lvalue_reference(int) -> type_identity<T&>;
+template <class T>  // Handle T = cv void case
+auto try_add_lvalue_reference(...) -> type_identity<T>;
+
+template <class T> auto try_add_rvalue_reference(int) -> type_identity<T&&>;
+template <class T> auto try_add_rvalue_reference(...) -> type_identity<T>;
+
+template <class T> struct add_lvalue_reference : decltype(try_add_lvalue_reference<T>(0)) {};
+
+template <class T> struct add_rvalue_reference : decltype(try_add_rvalue_reference<T>(0)) {};
+
+template <typename T> typename add_rvalue_reference<T>::type declval() noexcept;  // declaration only; no definition in this namespace
+
+template <class From, class To> auto test_implicitly_convertible(int)  // chosen when a From value can be passed to a function taking To
+    -> decltype(void(declval<void (&)(To)>()(declval<From>())), true_type{});
+
+template <class, class> auto test_implicitly_convertible(...) -> false_type;
+
+template <class T> struct remove_cv {
+  typedef T type;
+};
+template <class T> struct remove_cv<const T> {
+  typedef T type;
+};
+template <class T> struct remove_cv<volatile T> {
+  typedef T type;
+};
+template <class T> struct remove_cv<const volatile T> {
+  typedef T type;
+};
+
+template <class T> struct is_void : public is_same<void, typename remove_cv<T>::type> {};
+
+template <class From, class To> struct is_convertible  // true if To is returnable and From converts implicitly, or both are void
+    : public integral_constant<bool, (decltype(test_returnable<To>(0))::value &&
+                                      decltype(test_implicitly_convertible<From, To>(0))::value) ||
+                                         (is_void<From>::value && is_void<To>::value)> {};
+
+template <typename _CharT> struct char_traits;  // forward declarations only: the stream templates are not defined in this namespace
+template <typename _CharT, typename _Traits = char_traits<_CharT>> class basic_istream;
+template <typename _CharT, typename _Traits = char_traits<_CharT>> class basic_ostream;
+typedef basic_istream<char> istream;
+typedef basic_ostream<char> ostream;
+
+template <typename _Tp> struct is_standard_layout  // wraps the compiler builtin __is_standard_layout
+    : public integral_constant<bool, __is_standard_layout(_Tp)> {};
+
+template <typename _Tp> struct is_trivial : public integral_constant<bool, __is_trivial(_Tp)> {};
+
+
+template <bool B, class T, class F> struct conditional {  // primary template selects T; the <false> partial specialization selects F
+  using type = T;
+};
+template <class T, class F> struct conditional<false, T, F> {
+  using type = F;
+};
+
+template <class T> struct alignment_of : integral_constant<size_t, alignof(T)> {};
+
+template <typename T, T... Ints> struct integer_sequence {
+  using value_type = T;
+  static constexpr size_t size() noexcept { return sizeof...(Ints); }
+};
+
+template <size_t... Ints> using index_sequence = integer_sequence<size_t, Ints...>;
+
+template <size_t _hip_N, size_t... Ints> struct make_index_sequence_impl  // each step prepends _hip_N - 1 and recurses until the 0 specialization
+    : make_index_sequence_impl<_hip_N - 1, _hip_N - 1, Ints...> {};
+
+template <size_t... Ints> struct make_index_sequence_impl<0, Ints...> {
+  using type = index_sequence<Ints...>;
+};
+
+template <size_t _hip_N> using make_index_sequence =  // index_sequence<0, 1, ..., _hip_N - 1>
+    typename make_index_sequence_impl<_hip_N>::type;
+
+template <size_t... Ints>
+constexpr index_sequence<Ints...> make_index_sequence_value(index_sequence<Ints...>) {  // returns a value-initialized sequence of the argument's type
+  return {};
+}
+}  // namespace __hip_internal
+typedef __hip_internal::uint8_t __hip_uint8_t;  // global-scope __hip_-prefixed aliases for the __hip_internal fixed-width types
+typedef __hip_internal::uint16_t __hip_uint16_t;
+typedef __hip_internal::uint32_t __hip_uint32_t;
+typedef __hip_internal::uint64_t __hip_uint64_t;
+typedef __hip_internal::int8_t __hip_int8_t;
+typedef __hip_internal::int16_t __hip_int16_t;
+typedef __hip_internal::int32_t __hip_int32_t;
+typedef __hip_internal::int64_t __hip_int64_t;
+#endif  // defined(__cplusplus)
+
+#if defined(__clang__) && defined(__HIP__)  // clang with __HIP__ defined: markers map to real attributes
+#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__  // presumably set by clang's own HIP wrapper header, which would already define these - confirm
+#define __host__ __attribute__((host))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+#endif  // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+
+#if !defined(__has_feature) || !__has_feature(cuda_noinline_keyword)  // define the macro only when the compiler does not report cuda_noinline_keyword
+#define __noinline__ __attribute__((noinline))
+#endif
+
+#define __forceinline__ inline __attribute__((always_inline))
+
+#if __HIP_NO_IMAGE_SUPPORT  // when set, __hip_img_chk__ marks declarations unavailable with the message below
+#define __hip_img_chk__                                                                            \
+  __attribute__((unavailable("The image/texture API not supported on the device")))
+#else
+#define __hip_img_chk__
+#endif
+
+#else
+
+// Any other compiler: the markers below expand to nothing (__forceinline__ to plain inline)
+/**
+ * Function and kernel markers
+ */
+#define __host__
+#define __device__
+
+#define __global__
+
+#define __noinline__
+#define __forceinline__ inline
+
+#define __shared__
+#define __constant__
+
+#define __hip_img_chk__
+#endif  // defined(__clang__) && defined(__HIP__)
+
+#endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HOST_DEFINES_H
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/math_fwd.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/math_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..a20594d6170ecbdc5ccfd151ad36fc57dba9f67d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/math_fwd.h
@@ -0,0 +1,289 @@
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#if !defined(__HIPCC_RTC__)
+#include "host_defines.h"
+#include "amd_hip_vector_types.h"  // For Native_vec_
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// DOT FUNCTIONS: declared only when compiling with clang in HIP mode
+#if defined(__clang__) && defined(__HIP__)
+__device__ __attribute__((const)) int __ockl_sdot2(HIP_vector_base<short, 2>::Native_vec_,
+                                                   HIP_vector_base<short, 2>::Native_vec_, int,
+                                                   bool);
+
+__device__ __attribute__((const)) unsigned int __ockl_udot2(
+    HIP_vector_base<unsigned short, 2>::Native_vec_,
+    HIP_vector_base<unsigned short, 2>::Native_vec_, unsigned int, bool);
+
+__device__ __attribute__((const)) int __ockl_sdot4(HIP_vector_base<char, 4>::Native_vec_,
+                                                   HIP_vector_base<char, 4>::Native_vec_, int,
+                                                   bool);
+
+__device__ __attribute__((const)) unsigned int __ockl_udot4(
+    HIP_vector_base<unsigned char, 4>::Native_vec_, HIP_vector_base<unsigned char, 4>::Native_vec_,
+    unsigned int, bool);
+
+__device__ __attribute__((const)) int __ockl_sdot8(int, int, int, bool);
+
+__device__ __attribute__((const)) unsigned int __ockl_udot8(unsigned int, unsigned int,
+                                                            unsigned int, bool);
+#endif  // defined(__clang__) && defined(__HIP__)
+
+#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+// BEGIN FLOAT
+__device__ __attribute__((const)) float __ocml_acos_f32(float);
+__device__ __attribute__((pure)) float __ocml_acosh_f32(float);
+__device__ __attribute__((const)) float __ocml_asin_f32(float);
+__device__ __attribute__((pure)) float __ocml_asinh_f32(float);
+__device__ __attribute__((const)) float __ocml_atan2_f32(float, float);
+__device__ __attribute__((const)) float __ocml_atan_f32(float);
+__device__ __attribute__((pure)) float __ocml_atanh_f32(float);
+__device__ __attribute__((pure)) float __ocml_cbrt_f32(float);
+__device__ __attribute__((const)) float __ocml_ceil_f32(float);
+__device__ __attribute__((const)) float __ocml_copysign_f32(float, float);
+__device__ float __ocml_cos_f32(float);
+__device__ float __ocml_native_cos_f32(float);
+__device__ __attribute__((pure)) float __ocml_cosh_f32(float);
+__device__ float __ocml_cospi_f32(float);
+__device__ float __ocml_i0_f32(float);
+__device__ float __ocml_i1_f32(float);
+__device__ __attribute__((pure)) float __ocml_erfc_f32(float);
+__device__ __attribute__((pure)) float __ocml_erfcinv_f32(float);
+__device__ __attribute__((pure)) float __ocml_erfcx_f32(float);
+__device__ __attribute__((pure)) float __ocml_erf_f32(float);
+__device__ __attribute__((pure)) float __ocml_erfinv_f32(float);
+__device__ __attribute__((pure)) float __ocml_exp10_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_exp10_f32(float);
+__device__ __attribute__((pure)) float __ocml_exp2_f32(float);
+__device__ __attribute__((pure)) float __ocml_exp_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_exp_f32(float);
+__device__ __attribute__((pure)) float __ocml_expm1_f32(float);
+__device__ __attribute__((const)) float __ocml_fabs_f32(float);
+__device__ __attribute__((const)) float __ocml_fdim_f32(float, float);
+__device__ __attribute__((const)) float __ocml_floor_f32(float);
+__device__ __attribute__((const)) float __ocml_fma_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_fmax_f32(float, float);
+__device__ __attribute__((const)) float __ocml_fmin_f32(float, float);
+__device__ __attribute__((const)) float __ocml_fmod_f32(float, float);
+__device__ float __ocml_frexp_f32(float, __attribute__((address_space(5))) int*);
+__device__ __attribute__((const)) float __ocml_hypot_f32(float, float);
+__device__ __attribute__((const)) int __ocml_ilogb_f32(float);
+__device__ __attribute__((const)) int __ocml_isfinite_f32(float);
+__device__ __attribute__((const)) int __ocml_isinf_f32(float);
+__device__ __attribute__((const)) int __ocml_isnan_f32(float);
+__device__ float __ocml_j0_f32(float);
+__device__ float __ocml_j1_f32(float);
+__device__ __attribute__((const)) float __ocml_ldexp_f32(float, int);
+__device__ float __ocml_lgamma_f32(float);
+__device__ __attribute__((pure)) float __ocml_log10_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_log10_f32(float);
+__device__ __attribute__((pure)) float __ocml_log1p_f32(float);
+__device__ __attribute__((pure)) float __ocml_log2_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_log2_f32(float);
+__device__ __attribute__((const)) float __ocml_logb_f32(float);
+__device__ __attribute__((pure)) float __ocml_log_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_log_f32(float);
+__device__ float __ocml_modf_f32(float, __attribute__((address_space(5))) float*);
+__device__ __attribute__((const)) float __ocml_nearbyint_f32(float);
+__device__ __attribute__((const)) float __ocml_nextafter_f32(float, float);
+__device__ __attribute__((const)) float __ocml_len3_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_len4_f32(float, float, float, float);
+__device__ __attribute__((pure)) float __ocml_ncdf_f32(float);
+__device__ __attribute__((pure)) float __ocml_ncdfinv_f32(float);
+__device__ __attribute__((pure)) float __ocml_pow_f32(float, float);
+__device__ __attribute__((pure)) float __ocml_pown_f32(float, int);
+__device__ __attribute__((pure)) float __ocml_rcbrt_f32(float);
+__device__ __attribute__((const)) float __ocml_remainder_f32(float, float);
+__device__ float __ocml_remquo_f32(float, float, __attribute__((address_space(5))) int*);
+__device__ __attribute__((const)) float __ocml_rhypot_f32(float, float);
+__device__ __attribute__((const)) float __ocml_rint_f32(float);
+__device__ __attribute__((const)) float __ocml_rlen3_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_rlen4_f32(float, float, float, float);
+__device__ __attribute__((const)) float __ocml_round_f32(float);
+__device__ __attribute__((pure)) float __ocml_rsqrt_f32(float);
+__device__ __attribute__((const)) float __ocml_scalb_f32(float, float);
+__device__ __attribute__((const)) float __ocml_scalbn_f32(float, int);
+__device__ __attribute__((const)) int __ocml_signbit_f32(float);
+__device__ float __ocml_sincos_f32(float, __attribute__((address_space(5))) float*);
+__device__ float __ocml_sincospi_f32(float, __attribute__((address_space(5))) float*);
+__device__ float __ocml_sin_f32(float);
+__device__ float __ocml_native_sin_f32(float);
+__device__ __attribute__((pure)) float __ocml_sinh_f32(float);
+__device__ float __ocml_sinpi_f32(float);
+__device__ __attribute__((const)) float __ocml_sqrt_f32(float);
+__device__ __attribute__((const)) float __ocml_native_sqrt_f32(float);
+__device__ float __ocml_tan_f32(float);
+__device__ __attribute__((pure)) float __ocml_tanh_f32(float);
+__device__ float __ocml_tgamma_f32(float);
+__device__ __attribute__((const)) float __ocml_trunc_f32(float);
+__device__ float __ocml_y0_f32(float);
+__device__ float __ocml_y1_f32(float);
+
+// BEGIN INTRINSICS
+__device__ __attribute__((const)) float __ocml_add_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_add_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_add_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_add_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sub_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sub_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sub_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sub_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_mul_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_mul_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_mul_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_mul_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_div_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_div_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_div_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_div_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sqrt_rte_f32(float);
+__device__ __attribute__((const)) float __ocml_sqrt_rtn_f32(float);
+__device__ __attribute__((const)) float __ocml_sqrt_rtp_f32(float);
+__device__ __attribute__((const)) float __ocml_sqrt_rtz_f32(float);
+__device__ __attribute__((const)) float __ocml_fma_rte_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_fma_rtz_f32(float, float, float);
+// END INTRINSICS
+// END FLOAT
+
+// BEGIN DOUBLE
+__device__ __attribute__((const)) double __ocml_acos_f64(double);
+__device__ __attribute__((pure)) double __ocml_acosh_f64(double);
+__device__ __attribute__((const)) double __ocml_asin_f64(double);
+__device__ __attribute__((pure)) double __ocml_asinh_f64(double);
+__device__ __attribute__((const)) double __ocml_atan2_f64(double, double);
+__device__ __attribute__((const)) double __ocml_atan_f64(double);
+__device__ __attribute__((pure)) double __ocml_atanh_f64(double);
+__device__ __attribute__((pure)) double __ocml_cbrt_f64(double);
+__device__ __attribute__((const)) double __ocml_ceil_f64(double);
+__device__ __attribute__((const)) double __ocml_copysign_f64(double, double);
+__device__ double __ocml_cos_f64(double);
+__device__ __attribute__((pure)) double __ocml_cosh_f64(double);
+__device__ double __ocml_cospi_f64(double);
+__device__ double __ocml_i0_f64(double);
+__device__ double __ocml_i1_f64(double);
+__device__ __attribute__((pure)) double __ocml_erfc_f64(double);
+__device__ __attribute__((pure)) double __ocml_erfcinv_f64(double);
+__device__ __attribute__((pure)) double __ocml_erfcx_f64(double);
+__device__ __attribute__((pure)) double __ocml_erf_f64(double);
+__device__ __attribute__((pure)) double __ocml_erfinv_f64(double);
+__device__ __attribute__((pure)) double __ocml_exp10_f64(double);
+__device__ __attribute__((pure)) double __ocml_exp2_f64(double);
+__device__ __attribute__((pure)) double __ocml_exp_f64(double);
+__device__ __attribute__((pure)) double __ocml_expm1_f64(double);
+__device__ __attribute__((const)) double __ocml_fabs_f64(double);
+__device__ __attribute__((const)) double __ocml_fdim_f64(double, double);
+__device__ __attribute__((const)) double __ocml_floor_f64(double);
+__device__ __attribute__((const)) double __ocml_fma_f64(double, double, double);
+__device__ __attribute__((const)) double __ocml_fmax_f64(double, double);
+__device__ __attribute__((const)) double __ocml_fmin_f64(double, double);
+__device__ __attribute__((const)) double __ocml_fmod_f64(double, double);
+__device__ double __ocml_frexp_f64(double, __attribute__((address_space(5))) int*);
+__device__ __attribute__((const)) double __ocml_hypot_f64(double, double);
+__device__ __attribute__((const)) int __ocml_ilogb_f64(double);
+__device__ __attribute__((const)) int __ocml_isfinite_f64(double);
+__device__ __attribute__((const)) int __ocml_isinf_f64(double);
+__device__ __attribute__((const)) int __ocml_isnan_f64(double);
+__device__ double __ocml_j0_f64(double);
+__device__ double __ocml_j1_f64(double);
+__device__ __attribute__((const)) double __ocml_ldexp_f64(double, int);
+__device__ double __ocml_lgamma_f64(double);
+__device__ __attribute__((pure)) double __ocml_log10_f64(double);
+__device__ __attribute__((pure)) double __ocml_log1p_f64(double);
+__device__ __attribute__((pure)) double __ocml_log2_f64(double);
+__device__ __attribute__((const)) double __ocml_logb_f64(double);
+__device__ __attribute__((pure)) double __ocml_log_f64(double);
+__device__ double __ocml_modf_f64(double, __attribute__((address_space(5))) double*);
+__device__ __attribute__((const)) double __ocml_nearbyint_f64(double);
+__device__ __attribute__((const)) double __ocml_nextafter_f64(double, double);
+__device__ __attribute__((const)) double __ocml_len3_f64(double, double, double);
+__device__ __attribute__((const)) double __ocml_len4_f64(double, double, double, double);
+__device__ __attribute__((pure)) double __ocml_ncdf_f64(double);
+__device__ __attribute__((pure)) double __ocml_ncdfinv_f64(double);
+__device__ __attribute__((pure)) double __ocml_pow_f64(double, double);
+__device__ __attribute__((pure)) double __ocml_pown_f64(double, int);
+__device__ __attribute__((pure)) double __ocml_rcbrt_f64(double);
+__device__ __attribute__((const)) double __ocml_remainder_f64(double, double);
+__device__ double __ocml_remquo_f64(double, double, __attribute__((address_space(5))) int*);
+__device__ __attribute__((const)) double __ocml_rhypot_f64(double, double);
+__device__ __attribute__((const)) double __ocml_rint_f64(double);
+__device__ __attribute__((const)) double __ocml_rlen3_f64(double, double, double);
+__device__ __attribute__((const)) double __ocml_rlen4_f64(double, double, double, double);
+__device__ __attribute__((const)) double __ocml_round_f64(double);
+__device__ __attribute__((pure)) double __ocml_rsqrt_f64(double);
+__device__ __attribute__((const)) double __ocml_scalb_f64(double, double);
+__device__ __attribute__((const)) double __ocml_scalbn_f64(double, int);
+__device__ __attribute__((const)) int __ocml_signbit_f64(double);
+__device__ double __ocml_sincos_f64(double, __attribute__((address_space(5))) double*);
+__device__ double __ocml_sincospi_f64(double, __attribute__((address_space(5))) double*);
+__device__ double __ocml_sin_f64(double);
+__device__ __attribute__((pure)) double __ocml_sinh_f64(double);
+__device__ double __ocml_sinpi_f64(double);
+__device__ __attribute__((const)) double __ocml_sqrt_f64(double);
+__device__ double __ocml_tan_f64(double);
+__device__ __attribute__((pure)) double __ocml_tanh_f64(double);
+__device__ double __ocml_tgamma_f64(double);
+__device__ __attribute__((const)) double __ocml_trunc_f64(double);
+__device__ double __ocml_y0_f64(double);
+__device__ double __ocml_y1_f64(double);
+
+// BEGIN INTRINSICS
+__device__ __attribute__((const)) double __ocml_add_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_add_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_add_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_add_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sub_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sub_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sub_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sub_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_mul_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_mul_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_mul_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_mul_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_div_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_div_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_div_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_div_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sqrt_rte_f64(double);
+__device__ __attribute__((const)) double __ocml_sqrt_rtn_f64(double);
+__device__ __attribute__((const)) double __ocml_sqrt_rtp_f64(double);
+__device__ __attribute__((const)) double __ocml_sqrt_rtz_f64(double);
+__device__ __attribute__((const)) double __ocml_fma_rte_f64(double, double, double);
+__device__ __attribute__((const)) double __ocml_fma_rtn_f64(double, double, double);
+__device__ __attribute__((const)) double __ocml_fma_rtp_f64(double, double, double);
+__device__ __attribute__((const)) double __ocml_fma_rtz_f64(double, double, double);
+// END INTRINSICS
+// END DOUBLE
+
+#endif  // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+
+#if defined(__cplusplus)
+}  // extern "C"
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/ockl_image.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/ockl_image.h
new file mode 100644
index 0000000000000000000000000000000000000000..d874bee4873c111960eb463c0ef2aea03c086fa0
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/ockl_image.h
@@ -0,0 +1,257 @@
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#if !defined(__HIPCC_RTC__)
+#include <hip/hip_vector_types.h>
+#endif
+
+extern "C" {
+
+#define ADDRESS_SPACE_CONSTANT __attribute__((address_space(4)))
+
+__device__ float4::Native_vec_ __ockl_image_load_1D(unsigned int ADDRESS_SPACE_CONSTANT* i, int c);
+
+__device__ float4::Native_vec_ __ockl_image_load_1Db(unsigned int ADDRESS_SPACE_CONSTANT* i, int c);
+
+__device__ float4::Native_vec_ __ockl_image_load_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                     int2::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_load_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                    int2::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_load_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                     int4::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_load_3D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                    int4::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_load_CM(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                    int2::Native_vec_ c, int f);
+
+__device__ float4::Native_vec_ __ockl_image_load_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                     int4::Native_vec_ c, int f);
+
+__device__ float4::Native_vec_ __ockl_image_load_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                        int c, int l);
+
+__device__ float4::Native_vec_ __ockl_image_load_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                         int2::Native_vec_ c, int l);
+
+__device__ float4::Native_vec_ __ockl_image_load_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                        int2::Native_vec_ c, int l);
+
+__device__ float4::Native_vec_ __ockl_image_load_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                         int4::Native_vec_ c, int l);
+
+__device__ float4::Native_vec_ __ockl_image_load_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                        int4::Native_vec_ c, int l);
+
+__device__ float4::Native_vec_ __ockl_image_load_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                        int2::Native_vec_ c, int f, int l);
+
+__device__ float4::Native_vec_ __ockl_image_load_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                         int4::Native_vec_ c, int f, int l);
+
+__device__ void __ockl_image_store_1D(unsigned int ADDRESS_SPACE_CONSTANT* i, int c,
+                                      float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i, int2::Native_vec_ c,
+                                       float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_2D(unsigned int ADDRESS_SPACE_CONSTANT* i, int2::Native_vec_ c,
+                                      float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i, int4::Native_vec_ c,
+                                       float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_3D(unsigned int ADDRESS_SPACE_CONSTANT* i, int4::Native_vec_ c,
+                                      float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_CM(unsigned int ADDRESS_SPACE_CONSTANT* i, int2::Native_vec_ c,
+                                      int f, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i, int4::Native_vec_ c,
+                                       int f, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT* i, int c, int l,
+                                          float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                           int2::Native_vec_ c, int l, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                          int2::Native_vec_ c, int l, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                           int4::Native_vec_ c, int l, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                          int4::Native_vec_ c, int l, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                          int2::Native_vec_ c, int f, int l, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                           int4::Native_vec_ c, int f, int l,
+                                           float4::Native_vec_ p);
+
+__device__ float4::Native_vec_ __ockl_image_sample_1D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                      unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                      float c);
+
+__device__ float4::Native_vec_ __ockl_image_sample_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                       unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                       float2::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_sample_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                      unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                      float2::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_sample_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                       unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                       float4::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_sample_3D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                      unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                      float4::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_sample_CM(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                      unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                      float4::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_sample_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                       unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                       float4::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_sample_grad_1D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                           unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                           float c, float dx, float dy);
+
+__device__ float4::Native_vec_ __ockl_image_sample_grad_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                            unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                            float2::Native_vec_ c, float dx,
+                                                            float dy);
+
+__device__ float4::Native_vec_ __ockl_image_sample_grad_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                           unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                           float2::Native_vec_ c,
+                                                           float2::Native_vec_ dx,
+                                                           float2::Native_vec_ dy);
+
+__device__ float4::Native_vec_ __ockl_image_sample_grad_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                            unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                            float4::Native_vec_ c,
+                                                            float2::Native_vec_ dx,
+                                                            float2::Native_vec_ dy);
+
+__device__ float4::Native_vec_ __ockl_image_sample_grad_3D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                           unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                           float4::Native_vec_ c,
+                                                           float4::Native_vec_ dx,
+                                                           float4::Native_vec_ dy);
+
+__device__ float4::Native_vec_ __ockl_image_sample_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                          unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                          float c, float l);
+
+__device__ float4::Native_vec_ __ockl_image_sample_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                           unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                           float2::Native_vec_ c, float l);
+
+__device__ float4::Native_vec_ __ockl_image_sample_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                          unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                          float2::Native_vec_ c, float l);
+
+__device__ float4::Native_vec_ __ockl_image_sample_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                           unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                           float4::Native_vec_ c, float l);
+
+__device__ float4::Native_vec_ __ockl_image_sample_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                          unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                          float4::Native_vec_ c, float l);
+
+__device__ float4::Native_vec_ __ockl_image_sample_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                          unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                          float4::Native_vec_ c, float l);
+
+__device__ float4::Native_vec_ __ockl_image_sample_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                           unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                           float4::Native_vec_ c, float l);
+
+__device__ float4::Native_vec_ __ockl_image_gather4r_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                        unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                        float2::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_gather4g_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                        unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                        float2::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_gather4b_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                        unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                        float2::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_gather4a_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                        unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                        float2::Native_vec_ c);
+
+__device__ int __ockl_image_channel_data_type_1D(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_data_type_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_data_type_1Db(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_data_type_2D(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_data_type_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_data_type_2Dad(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_data_type_2Dd(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_data_type_3D(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_data_type_CM(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_data_type_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_order_1D(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_order_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_order_1Db(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_order_2D(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_order_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_order_2Dad(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_order_2Dd(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_order_3D(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_order_CM(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_order_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i);
+}
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd1580c1ed900b602f5a11d2c62f4dc3391604df
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h
@@ -0,0 +1,466 @@
+/*
+Copyright (c) 2015 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#if defined(__cplusplus)
+
+#if !defined(__HIPCC_RTC__)
+#include <hip/hip_vector_types.h>
+#include <hip/hip_texture_types.h>
+#include <hip/amd_detail/ockl_image.h>
+#include <type_traits>
+#endif  // !defined(__HIPCC_RTC__)
+
+#define TEXTURE_PARAMETERS_INIT                                                                    \
+  unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)t.textureObject;  \
+  unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD;                    \
+  (void)s;  // declares locals i and s from the enclosing `t`; the cast marks s as used
+
+template <typename T> struct __hip_is_tex_surf_scalar_channel_type {  // scalar channel check
+  static constexpr bool value =  // true only when T is exactly one of the seven types below
+      __hip_internal::is_same<T, char>::value || __hip_internal::is_same<T, unsigned char>::value ||
+      __hip_internal::is_same<T, short>::value ||
+      __hip_internal::is_same<T, unsigned short>::value || __hip_internal::is_same<T, int>::value ||
+      __hip_internal::is_same<T, unsigned int>::value || __hip_internal::is_same<T, float>::value;
+};  // used below as the enable_if condition of __hipMapFrom / __hipMapTo
+
+template <typename T> struct __hip_is_tex_surf_channel_type {  // primary: defers to scalar check
+  static constexpr bool value = __hip_is_tex_surf_scalar_channel_type<T>::value;
+};
+
+template <typename T, unsigned int rank>
+struct __hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>> {  // HIP_vector_type overload
+  static constexpr bool value = __hip_is_tex_surf_scalar_channel_type<T>::value &&
+                                ((rank == 1) || (rank == 2) || (rank == 4));  // rank 3 is rejected
+};
+
+template <typename T> struct __hip_is_tex_normalized_channel_type {  // normalized-read check
+  static constexpr bool value =  // only char / short, signed or unsigned; int and float excluded
+      __hip_internal::is_same<T, char>::value || __hip_internal::is_same<T, unsigned char>::value ||
+      __hip_internal::is_same<T, short>::value || __hip_internal::is_same<T, unsigned short>::value;
+};
+
+template <typename T, unsigned int rank>
+struct __hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>> {  // vector overload
+  static constexpr bool value =  // element type must qualify and rank must be 1, 2 or 4
+      __hip_is_tex_normalized_channel_type<T>::value && ((rank == 1) || (rank == 2) || (rank == 4));
+};
+
+template <typename T, hipTextureReadMode readMode, typename Enable = void> struct __hip_tex_ret {
+  static_assert(__hip_internal::is_same<Enable, void>::value, "Invalid channel type!");
+};  // fallback: __hip_tex_ret_t passes Enable = bool, so landing here trips the static_assert
+
+/* Map the device-function return value `u` (type U) to the scalar texture type T.
+ * Selected only when T is a scalar channel type. When T is narrower than float, the leading int
+ * of u is value-converted to T; otherwise the leading sizeof(T) bytes of u are read back as T. */
+template <typename T, typename U> __forceinline__ __device__
+    typename __hip_internal::enable_if<__hip_is_tex_surf_scalar_channel_type<T>::value,
+                                       const T>::type
+    __hipMapFrom(const U& u) {
+  if constexpr (sizeof(T) < sizeof(float)) {
+    union {
+      U u;
+      int i;
+    } d = {u};  // initializes the first member (u); d.i overlays its leading bytes
+    return static_cast<T>(d.i);  // numeric conversion int -> T, not a bit copy
+  } else {  // sizeof(T) == sizeof(float)
+    union {
+      U u;
+      T t;
+    } d = {u};  // initializes the first member (u); d.t overlays its leading bytes
+    return d.t;  // bit-level reinterpretation, no numeric conversion
+  }
+}
+
+/* Map the device-function return value `u` (type U) to the vector texture type T.
+ * Selected when T::value_type is a scalar channel type. Elements narrower than float go through
+ * an int4 view of u and __hipMapVector; otherwise u's leading bytes are read back as T. */
+template <typename T, typename U> __forceinline__ __device__ typename __hip_internal::enable_if<
+    __hip_is_tex_surf_scalar_channel_type<typename T::value_type>::value, const T>::type
+__hipMapFrom(const U& u) {
+  if constexpr (sizeof(typename T::value_type) < sizeof(float)) {
+    union {
+      U u;
+      int4 i4;
+    } d = {u};  // initializes the first member (u); d.i4 overlays it
+    return __hipMapVector<typename T::value_type, sizeof(T) / sizeof(typename T::value_type)>(d.i4);
+  } else {  // sizeof(typename T::value_type) == sizeof(float)
+    union {
+      U u;
+      T t;
+    } d = {u};  // initializes the first member (u); d.t overlays its leading bytes
+    return d.t;  // bit-level reinterpretation, no numeric conversion
+  }
+}
+
+/* Map the scalar texture value `t` (type T) to the device-function input type U.
+ * Selected only when T is a scalar channel type. When T is narrower than float, t is widened to
+ * int and stored over the start of a U; otherwise t's bytes are stored there unchanged. */
+template <typename U, typename T> __forceinline__ __device__
+    typename __hip_internal::enable_if<__hip_is_tex_surf_scalar_channel_type<T>::value,
+                                       const U>::type
+    __hipMapTo(const T& t) {
+  if constexpr (sizeof(T) < sizeof(float)) {
+    union {
+      U u;
+      int i;
+    } d = {0};  // initializes the first member (u) from 0 before the overlay write
+    d.i = static_cast<int>(t);  // numeric conversion T -> int
+    return d.u;
+  } else {  // sizeof(T) == sizeof(float)
+    union {
+      U u;
+      T t;
+    } d = {0};  // initializes the first member (u) from 0 before the overlay write
+    d.t = t;  // bit copy of t into the start of u
+    return d.u;
+  }
+}
+
+/* Map the vector texture value `t` (type T) to the device-function input type U.
+ * Selected when T::value_type is a scalar channel type. Elements narrower than float are
+ * converted to an int4 with __hipMapVector<int, 4>; otherwise t's bytes are stored unchanged. */
+template <typename U, typename T> __forceinline__ __device__ typename __hip_internal::enable_if<
+    __hip_is_tex_surf_scalar_channel_type<typename T::value_type>::value, const U>::type
+__hipMapTo(const T& t) {
+  if constexpr (sizeof(typename T::value_type) < sizeof(float)) {
+    union {
+      U u;
+      int4 i4;
+    } d = {0};  // initializes the first member (u) from 0 before the overlay write
+    d.i4 = __hipMapVector<int, 4>(t);  // always four int lanes, whatever T's rank is
+    return d.u;
+  } else {  // sizeof(typename T::value_type) == sizeof(float)
+    union {
+      U u;
+      T t;
+    } d = {0};  // initializes the first member (u) from 0 before the overlay write
+    d.t = t;  // bit copy of t into the start of u
+    return d.u;
+  }
+}
+
+template <typename T, hipTextureReadMode readMode> using __hip_tex_ret_t =
+    typename __hip_tex_ret<T, readMode, bool>::type;  // Enable = bool picks an enable_if'd match
+
+template <typename T> struct __hip_tex_ret<
+    T, hipReadModeElementType,
+    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value, bool>::type> {
+  using type = T;  // element read mode: the texel type is returned unchanged
+};
+
+template <typename T, unsigned int rank> struct __hip_tex_ret<
+    HIP_vector_type<T, rank>, hipReadModeElementType,
+    typename __hip_internal::enable_if<
+        __hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>::value, bool>::type> {
+  using type = HIP_vector_type<__hip_tex_ret_t<T, hipReadModeElementType>, rank>;  // same rank
+};
+
+template <typename T>
+struct __hip_tex_ret<T, hipReadModeNormalizedFloat,
+                     typename __hip_internal::enable_if<
+                         __hip_is_tex_normalized_channel_type<T>::value, bool>::type> {
+  using type = float;  // normalized read mode: qualifying scalar types are returned as float
+};
+
+template <typename T, unsigned int rank> struct __hip_tex_ret<
+    HIP_vector_type<T, rank>, hipReadModeNormalizedFloat,
+    typename __hip_internal::enable_if<
+        __hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>>::value, bool>::type> {
+  using type = HIP_vector_type<__hip_tex_ret_t<T, hipReadModeNormalizedFloat>, rank>;  // floatN
+};
+
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1Dfetch(
+    texture<T, hipTextureType1D, readMode> t, int x) {
+  TEXTURE_PARAMETERS_INIT;  // brings i and s into scope; this load only passes i
+  const auto texel = __ockl_image_load_1Db(i, x);  // integer-indexed load, no sampler argument
+  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(texel);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1D(
+    texture<T, hipTextureType1D, readMode> t, float x) {
+  TEXTURE_PARAMETERS_INIT;  // brings i and s into scope from t.textureObject
+  const auto texel = __ockl_image_sample_1D(i, s, x);  // sampled read at float coordinate x
+  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(texel);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2D(
+    texture<T, hipTextureType2D, readMode> t, float x, float y) {
+  TEXTURE_PARAMETERS_INIT;  // brings i and s into scope from t.textureObject
+  float2 uv{x, y};          // pack the coordinates for the native-vector call
+  const auto texel = __ockl_image_sample_2D(i, s, get_native_vector(uv));
+  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(texel);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayered(
+    texture<T, hipTextureType1DLayered, readMode> t, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;    // brings i and s into scope from t.textureObject
+  float2 xLayer{x, layer};    // the layer index travels as the second float component
+  const auto texel = __ockl_image_sample_1Da(i, s, get_native_vector(xLayer));
+  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(texel);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayered(
+    texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer) {
+  TEXTURE_PARAMETERS_INIT;             // brings i and s into scope from t.textureObject
+  float4 xyLayer{x, y, layer, 0.0f};   // layer in the third lane, fourth lane zeroed
+  const auto texel = __ockl_image_sample_2Da(i, s, get_native_vector(xyLayer));
+  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(texel);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3D(
+    texture<T, hipTextureType3D, readMode> t, float x, float y, float z) {
+  TEXTURE_PARAMETERS_INIT;       // brings i and s into scope from t.textureObject
+  float4 xyz{x, y, z, 0.0f};     // fourth lane is padding for the 4-wide native vector
+  const auto texel = __ockl_image_sample_3D(i, s, get_native_vector(xyz));
+  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(texel);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemap(
+    texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z) {
+  TEXTURE_PARAMETERS_INIT;       // brings i and s into scope from t.textureObject
+  float4 dir{x, y, z, 0.0f};     // fourth lane is padding for the 4-wide native vector
+  const auto texel = __ockl_image_sample_CM(i, s, get_native_vector(dir));
+  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(texel);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLod(
+    texture<T, hipTextureType1D, readMode> t, float x, float level) {
+  TEXTURE_PARAMETERS_INIT;  // brings i and s into scope from t.textureObject
+  const auto texel = __ockl_image_sample_lod_1D(i, s, x, level);  // level forwarded unchanged
+  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(texel);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLod(
+    texture<T, hipTextureType2D, readMode> t, float x, float y, float level) {
+  TEXTURE_PARAMETERS_INIT;  // brings i and s into scope from t.textureObject
+  float2 uv{x, y};          // level is passed separately, not packed with the coordinates
+  const auto texel = __ockl_image_sample_lod_2D(i, s, get_native_vector(uv), level);
+  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(texel);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayeredLod(
+    texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float level) {
+  TEXTURE_PARAMETERS_INIT;    // brings i and s into scope from t.textureObject
+  float2 xLayer{x, layer};    // the layer index travels as the second float component
+  const auto texel = __ockl_image_sample_lod_1Da(i, s, get_native_vector(xLayer), level);
+  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(texel);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayeredLod(
+    texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float level) {
+  TEXTURE_PARAMETERS_INIT;             // brings i and s into scope from t.textureObject
+  float4 xyLayer{x, y, layer, 0.0f};   // layer in the third lane, fourth lane zeroed
+  const auto texel = __ockl_image_sample_lod_2Da(i, s, get_native_vector(xyLayer), level);
+  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(texel);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3DLod(
+    texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float level) {
+  TEXTURE_PARAMETERS_INIT;       // brings i and s into scope from t.textureObject
+  float4 xyz{x, y, z, 0.0f};     // fourth lane is padding for the 4-wide native vector
+  const auto texel = __ockl_image_sample_lod_3D(i, s, get_native_vector(xyz), level);
+  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(texel);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLod(
+    texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float level) {
+  TEXTURE_PARAMETERS_INIT;       // brings i and s into scope from t.textureObject
+  float4 dir{x, y, z, 0.0f};     // fourth lane is padding for the 4-wide native vector
+  const auto texel = __ockl_image_sample_lod_CM(i, s, get_native_vector(dir), level);
+  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(texel);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayered(
+    texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer) {
+  TEXTURE_PARAMETERS_INIT;           // brings i and s into scope from t.textureObject
+  float4 dirLayer{x, y, z, layer};   // the layer index rides in the fourth lane
+  const auto texel = __ockl_image_sample_CMa(i, s, get_native_vector(dirLayer));
+  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(texel);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayeredLod(
+    texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer,
+    float level) {
+  TEXTURE_PARAMETERS_INIT;           // brings i and s into scope from t.textureObject
+  float4 dirLayer{x, y, z, layer};   // the layer index rides in the fourth lane
+  const auto texel = __ockl_image_sample_lod_CMa(i, s, get_native_vector(dirLayer), level);
+  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(texel);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapGrad(
+    texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float4 dPdx,
+    float4 dPdy) {
+  TEXTURE_PARAMETERS_INIT;
+  (void)x;
+  (void)y;
+  (void)z;
+  (void)dPdx;
+  (void)dPdy;
+  // TODO missing in device libs: no sampling call is made here, so every argument is discarded
+  // above and the value-initialized result below is returned instead of a texel.
+  // Intended call, kept for reference: __ockl_image_sample_grad_CM(i, s, <x, y, z, 0>,
+  // <dPdx.xyz, 0>, <dPdy.xyz, 0>), then __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp).
+  return {};
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+texCubemapLayeredGrad(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y,
+                      float z, int layer, float4 dPdx, float4 dPdy) {
+  TEXTURE_PARAMETERS_INIT;
+  (void)x;
+  (void)y;
+  (void)z;
+  (void)layer;
+  (void)dPdx;
+  (void)dPdy;
+  // TODO missing in device libs: no sampling call is made here, so every argument is discarded
+  // above and the value-initialized result below is returned instead of a texel.
+  // Intended call, kept for reference: __ockl_image_sample_grad_CMa(i, s, <x, y, z, layer>,
+  // <dPdx.xyz, 0>, <dPdy.xyz, 0>), then __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp).
+  return {};
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DGrad(
+    texture<T, hipTextureType1D, readMode> t, float x, float dPdx, float dPdy) {
+  TEXTURE_PARAMETERS_INIT;  // brings i and s into scope from t.textureObject
+  const auto texel = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);  // scalar gradients
+  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(texel);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DGrad(
+    texture<T, hipTextureType2D, readMode> t, float x, float y, float2 dPdx, float2 dPdy) {
+  TEXTURE_PARAMETERS_INIT;  // brings i and s into scope from t.textureObject
+  float2 uv{x, y};          // gradients are already float2 and are forwarded as given
+  const auto texel = __ockl_image_sample_grad_2D(i, s, get_native_vector(uv),
+                                                 get_native_vector(dPdx), get_native_vector(dPdy));
+  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(texel);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayeredGrad(
+    texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float dPdx, float dPdy) {
+  TEXTURE_PARAMETERS_INIT;    // brings i and s into scope from t.textureObject
+  float2 xLayer{x, layer};    // the layer index travels as the second float component
+  const auto texel = __ockl_image_sample_grad_1Da(i, s, get_native_vector(xLayer), dPdx, dPdy);
+  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(texel);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayeredGrad(
+    texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float2 dPdx,
+    float2 dPdy) {
+  TEXTURE_PARAMETERS_INIT;             // brings i and s into scope from t.textureObject
+  float4 xyLayer{x, y, layer, 0.0f};   // layer in the third lane, fourth lane zeroed
+  const auto texel = __ockl_image_sample_grad_2Da(
+      i, s, get_native_vector(xyLayer), get_native_vector(dPdx), get_native_vector(dPdy));
+  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(texel);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3DGrad(
+    texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float4 dPdx, float4 dPdy) {
+  TEXTURE_PARAMETERS_INIT;  // brings i and s into scope from t.textureObject
+  float4 xyz{x, y, z, 0.0f};
+  float4 ddx{dPdx.x, dPdx.y, dPdx.z, 0.0f};  // caller's w lane is replaced by zero
+  float4 ddy{dPdy.x, dPdy.y, dPdy.z, 0.0f};  // caller's w lane is replaced by zero
+  const auto texel = __ockl_image_sample_grad_3D(i, s, get_native_vector(xyz),
+                                                 get_native_vector(ddx), get_native_vector(ddy));
+  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(texel);
+}
+
+template <typename T, hipTextureReadMode readMode, typename Enable = void>
+struct __hip_tex2dgather_ret {  // fallback: reached with Enable = bool, so the assert fires
+  static_assert(__hip_internal::is_same<Enable, void>::value, "Invalid channel type!");
+};
+
+template <typename T, hipTextureReadMode readMode> using __hip_tex2dgather_ret_t =
+    typename __hip_tex2dgather_ret<T, readMode, bool>::type;  // Enable = bool picks a match below
+
+template <typename T> struct __hip_tex2dgather_ret<
+    T, hipReadModeElementType,
+    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value, bool>::type> {
+  using type = HIP_vector_type<T, 4>;  // element mode: four values of the texel type
+};
+
+template <typename T, unsigned int rank> struct __hip_tex2dgather_ret<
+    HIP_vector_type<T, rank>, hipReadModeElementType,
+    typename __hip_internal::enable_if<
+        __hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>::value, bool>::type> {
+  using type = HIP_vector_type<T, 4>;  // vector texel: four values of the element type T
+};
+
+template <typename T>
+struct __hip_tex2dgather_ret<T, hipReadModeNormalizedFloat,
+                             typename __hip_internal::enable_if<
+                                 __hip_is_tex_normalized_channel_type<T>::value, bool>::type> {
+  using type = float4;  // normalized mode: always float4
+};
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex2dgather_ret_t<T, readMode> tex2Dgather(
+    texture<T, hipTextureType2D, readMode> t, float x, float y, int comp = 0) {
+  TEXTURE_PARAMETERS_INIT;  // brings i and s into scope from t.textureObject
+  float2 uv{x, y};
+  switch (comp) {  // comp picks the gather4 variant: 1 -> g, 2 -> b, 3 -> a, otherwise r
+    case 1: {
+      const auto quad = __ockl_image_gather4g_2D(i, s, get_native_vector(uv));
+      return __hipMapFrom<__hip_tex2dgather_ret_t<T, readMode>>(quad);
+    }
+    case 2: {
+      const auto quad = __ockl_image_gather4b_2D(i, s, get_native_vector(uv));
+      return __hipMapFrom<__hip_tex2dgather_ret_t<T, readMode>>(quad);
+    }
+    case 3: {
+      const auto quad = __ockl_image_gather4a_2D(i, s, get_native_vector(uv));
+      return __hipMapFrom<__hip_tex2dgather_ret_t<T, readMode>>(quad);
+    }
+    default: {  // includes the default argument comp = 0
+      const auto quad = __ockl_image_gather4r_2D(i, s, get_native_vector(uv));
+      return __hipMapFrom<__hip_tex2dgather_ret_t<T, readMode>>(quad);
+    }
+  }
+  return {};  // not reached: every switch arm returns
+}
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..f48b3bcf8c2e3e26a423ae254b195a200a623407
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h
@@ -0,0 +1,474 @@
+/*
+Copyright (c) 2015 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#if defined(__cplusplus)
+
+#if !defined(__HIPCC_RTC__)
+#include <hip/hip_vector_types.h>
+#include <hip/hip_texture_types.h>
+#include <hip/amd_detail/texture_fetch_functions.h>
+#include <hip/amd_detail/ockl_image.h>
+#include <type_traits>
+#endif  // !defined(__HIPCC_RTC__)
+
+#define TEXTURE_OBJECT_PARAMETERS_INIT                                                             \
+  unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)textureObject;    \
+  unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD;                    \
+  (void)s;  // declares locals i and s from the enclosing `textureObject`; the cast marks s as used
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex1Dfetch(hipTextureObject_t textureObject, int x) {
+  TEXTURE_OBJECT_PARAMETERS_INIT  // brings i and s into scope; this load only passes i
+  const auto texel = __ockl_image_load_1Db(i, x);  // integer-indexed load, no sampler argument
+  return __hipMapFrom<T>(texel);
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex1Dfetch(T* ptr, hipTextureObject_t textureObject, int x) {
+  *ptr = tex1Dfetch<T>(textureObject, x);  // out-parameter form of the value-returning overload
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex1D(hipTextureObject_t textureObject, float x) {
+  TEXTURE_OBJECT_PARAMETERS_INIT  // brings i and s into scope from textureObject
+  const auto texel = __ockl_image_sample_1D(i, s, x);  // sampled read at float coordinate x
+  return __hipMapFrom<T>(texel);
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex1D(T* ptr, hipTextureObject_t textureObject, float x) {
+  *ptr = tex1D<T>(textureObject, x);  // out-parameter form of the value-returning overload
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex2D(hipTextureObject_t textureObject, float x, float y) {
+  TEXTURE_OBJECT_PARAMETERS_INIT  // brings i and s into scope from textureObject
+  float2 uv{x, y};                // pack the coordinates for the native-vector call
+  const auto texel = __ockl_image_sample_2D(i, s, get_native_vector(uv));
+  return __hipMapFrom<T>(texel);
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex2D(T* ptr, hipTextureObject_t textureObject, float x,
+                                             float y) {
+  *ptr = tex2D<T>(textureObject, x, y);  // out-parameter form of the value-returning overload
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex3D(hipTextureObject_t textureObject, float x, float y,
+                                          float z) {
+  TEXTURE_OBJECT_PARAMETERS_INIT  // brings i and s into scope from textureObject
+  float4 xyz{x, y, z, 0.0f};      // fourth lane is padding for the 4-wide native vector
+  const auto texel = __ockl_image_sample_3D(i, s, get_native_vector(xyz));
+  return __hipMapFrom<T>(texel);
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex3D(T* ptr, hipTextureObject_t textureObject, float x,
+                                             float y, float z) {
+  *ptr = tex3D<T>(textureObject, x, y, z);  // out-parameter form of the value-returning overload
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex1DLayered(hipTextureObject_t textureObject, float x,
+                                                 int layer) {
+  TEXTURE_OBJECT_PARAMETERS_INIT  // brings i and s into scope from textureObject
+  float2 xLayer{x, layer};        // the layer index travels as the second float component
+  const auto texel = __ockl_image_sample_1Da(i, s, get_native_vector(xLayer));
+  return __hipMapFrom<T>(texel);
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex1DLayered(T* ptr, hipTextureObject_t textureObject,
+                                                    float x, int layer) {
+  *ptr = tex1DLayered<T>(textureObject, x, layer);  // out-parameter form of the value overload
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex2DLayered(hipTextureObject_t textureObject, float x, float y,
+                                                 int layer) {
+  TEXTURE_OBJECT_PARAMETERS_INIT      // brings i and s into scope from textureObject
+  float4 xyLayer{x, y, layer, 0.0f};  // layer in the third lane, fourth lane zeroed
+  const auto texel = __ockl_image_sample_2Da(i, s, get_native_vector(xyLayer));
+  return __hipMapFrom<T>(texel);
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex2DLayered(T* ptr, hipTextureObject_t textureObject,
+                                                    float x, float y, int layer) {
+  *ptr = tex2DLayered<T>(textureObject, x, y, layer);  // fixed: was tex1DLayered<T>, no such overload
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T texCubemap(hipTextureObject_t textureObject, float x, float y,
+                                               float z) {
+  TEXTURE_OBJECT_PARAMETERS_INIT  // brings i and s into scope from textureObject
+  float4 dir{x, y, z, 0.0f};      // fourth lane is padding for the 4-wide native vector
+  const auto texel = __ockl_image_sample_CM(i, s, get_native_vector(dir));
+  return __hipMapFrom<T>(texel);
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void texCubemap(T* ptr, hipTextureObject_t textureObject, float x,
+                                                  float y, float z) {
+  *ptr = texCubemap<T>(textureObject, x, y, z);  // out-parameter form of the value overload
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T texCubemapLayered(hipTextureObject_t textureObject, float x,
+                                                      float y, float z, int layer) {
+  TEXTURE_OBJECT_PARAMETERS_INIT    // brings i and s into scope from textureObject
+  float4 dirLayer{x, y, z, layer};  // the layer index rides in the fourth lane
+  const auto texel = __ockl_image_sample_CMa(i, s, get_native_vector(dirLayer));
+  return __hipMapFrom<T>(texel);
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void texCubemapLayered(T* ptr, hipTextureObject_t textureObject,
+                                                         float x, float y, float z, int layer) {
+  *ptr = texCubemapLayered<T>(textureObject, x, y, z, layer);  // out-parameter form
+}
+
+// Object-API tex2Dgather: calls the __ockl_image_gather4{r,g,b,a}_2D variant selected by `comp`
+// at (x, y) and maps the result to T. The mapping now matches the texture-reference tex2Dgather:
+// 1 -> g, 2 -> b, 3 -> a, anything else (including the default comp = 0) -> r.
+// Previously it was shifted by one (1 -> r, 2 -> g, 3 -> b, default -> a).
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex2Dgather(hipTextureObject_t textureObject, float x, float y,
+                                                int comp = 0) {
+  TEXTURE_OBJECT_PARAMETERS_INIT
+  float2 coords{x, y};
+  switch (comp) {
+    case 1: {
+      auto tmp = __ockl_image_gather4g_2D(i, s, get_native_vector(coords));
+      return __hipMapFrom<T>(tmp);
+    }
+    case 2: {
+      auto tmp = __ockl_image_gather4b_2D(i, s, get_native_vector(coords));
+      return __hipMapFrom<T>(tmp);
+    }
+    case 3: {
+      auto tmp = __ockl_image_gather4a_2D(i, s, get_native_vector(coords));
+      return __hipMapFrom<T>(tmp);
+    }
+    default: {
+      auto tmp = __ockl_image_gather4r_2D(i, s, get_native_vector(coords));
+      return __hipMapFrom<T>(tmp);
+    }
+  }
+  return {};
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex2Dgather(T* ptr, hipTextureObject_t textureObject,
+                                                   float x, float y, int comp = 0) {
+  *ptr = tex2Dgather<T>(textureObject, x, y, comp);  // fixed: was texCubemapLayered<T>, no such overload
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex1DLod(hipTextureObject_t textureObject, float x,
+                                             float level) {
+  TEXTURE_OBJECT_PARAMETERS_INIT  // brings i and s into scope from textureObject
+  const auto texel = __ockl_image_sample_lod_1D(i, s, x, level);  // level forwarded unchanged
+  return __hipMapFrom<T>(texel);
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex1DLod(T* ptr, hipTextureObject_t textureObject, float x,
+                                                float level) {
+  *ptr = tex1DLod<T>(textureObject, x, level);  // out-parameter form of the value overload
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex2DLod(hipTextureObject_t textureObject, float x, float y,
+                                             float level) {
+  TEXTURE_OBJECT_PARAMETERS_INIT  // brings i and s into scope from textureObject
+  float2 uv{x, y};                // level is passed separately, not packed with the coordinates
+  const auto texel = __ockl_image_sample_lod_2D(i, s, get_native_vector(uv), level);
+  return __hipMapFrom<T>(texel);
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex2DLod(T* ptr, hipTextureObject_t textureObject, float x,
+                                                float y, float level) {
+  *ptr = tex2DLod<T>(textureObject, x, y, level);  // out-parameter form of the value overload
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex3DLod(hipTextureObject_t textureObject, float x, float y,
+                                             float z, float level) {
+  TEXTURE_OBJECT_PARAMETERS_INIT  // brings i and s into scope from textureObject
+  float4 xyz{x, y, z, 0.0f};      // fourth lane is padding for the 4-wide native vector
+  const auto texel = __ockl_image_sample_lod_3D(i, s, get_native_vector(xyz), level);
+  return __hipMapFrom<T>(texel);
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex3DLod(T* ptr, hipTextureObject_t textureObject, float x,
+                                                float y, float z, float level) {
+  *ptr = tex3DLod<T>(textureObject, x, y, z, level);  // out-parameter form of the value overload
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex1DLayeredLod(hipTextureObject_t textureObject, float x,
+                                                    int layer, float level) {
+  TEXTURE_OBJECT_PARAMETERS_INIT;  // presumably declares i and s (used below) - confirm
+  (void)level;  // NOTE(review): level is discarded; the sampler call below takes no LOD argument
+  float2 pos{x, layer};  // x coordinate plus the layer index
+  auto texel = __ockl_image_sample_1Da(i, s, get_native_vector(pos));
+  return __hipMapFrom<T>(texel);  // convert the sampled value to the requested type T
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex1DLayeredLod(T* ptr, hipTextureObject_t textureObject,
+                                                       float x, int layer, float level) {
+  *ptr = tex1DLayeredLod<T>(textureObject, x, layer, level);  // forward to by-value overload
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex2DLayeredLod(hipTextureObject_t textureObject, float x,
+                                                    float y, int layer, float level) {
+  TEXTURE_OBJECT_PARAMETERS_INIT;  // presumably declares i and s (used below) - confirm
+  (void)level;  // NOTE(review): level is discarded; the sampler call below takes no LOD argument
+  float4 pos{x, y, layer, 0.0f};  // (x, y), then the layer index; fourth lane is zero-filled
+  auto texel = __ockl_image_sample_2Da(i, s, get_native_vector(pos));
+  return __hipMapFrom<T>(texel);  // convert the sampled value to the requested type T
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex2DLayeredLod(T* ptr, hipTextureObject_t textureObject,
+                                                       float x, float y, int layer, float level) {
+  *ptr = tex2DLayeredLod<T>(textureObject, x, y, layer, level);  // forward to by-value overload
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T texCubemapLod(hipTextureObject_t textureObject, float x,
+                                                  float y, float z, float level) {
+  TEXTURE_OBJECT_PARAMETERS_INIT  // presumably declares i and s (used below) - confirm
+  float4 pos{x, y, z, 0.0f};      // (x, y, z) lookup coordinate; fourth lane is zero-filled
+  auto texel = __ockl_image_sample_lod_CM(i, s, get_native_vector(pos), level);
+  return __hipMapFrom<T>(texel);  // convert the sampled value to the requested type T
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void texCubemapLod(T* ptr, hipTextureObject_t textureObject,
+                                                     float x, float y, float z, float level) {
+  *ptr = texCubemapLod<T>(textureObject, x, y, z, level);  // forward to the by-value overload
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T texCubemapGrad(hipTextureObject_t textureObject, float x,
+                                                   float y, float z, float4 dPdx, float4 dPdy) {
+  TEXTURE_OBJECT_PARAMETERS_INIT;
+  (void)x;  // every argument is explicitly discarded: this function is a stub (see TODO below)
+  (void)y;
+  (void)z;
+  (void)dPdx;
+  (void)dPdy;
+  // TODO missing in device libs.
+  // auto tmp = __ockl_image_sample_grad_CM(i, s, get_native_vector(float4(x, y, z, 0.0f)),
+  // get_native_vector(float4(dPdx.x, dPdx.y, dPdx.z, 0.0f)), get_native_vector(float4(dPdy.x,
+  // dPdy.y, dPdy.z, 0.0f))); return __hipMapFrom<T>(tmp);
+  return {};  // placeholder: no texture is sampled; a value-initialized T is returned
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void texCubemapGrad(T* ptr, hipTextureObject_t textureObject,
+                                                      float x, float y, float z, float4 dPdx,
+                                                      float4 dPdy) {
+  *ptr = texCubemapGrad<T>(textureObject, x, y, z, dPdx, dPdy);  // forward to by-value overload
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T texCubemapLayeredLod(hipTextureObject_t textureObject, float x,
+                                                         float y, float z, int layer, float level) {
+  TEXTURE_OBJECT_PARAMETERS_INIT  // presumably declares i and s (used below) - confirm
+  float4 pos{x, y, z, layer};     // (x, y, z) lookup coordinate with the layer in the fourth lane
+  auto texel = __ockl_image_sample_lod_CMa(i, s, get_native_vector(pos), level);
+  return __hipMapFrom<T>(texel);  // convert the sampled value to the requested type T
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void texCubemapLayeredLod(T* ptr,
+                                                            hipTextureObject_t textureObject,
+                                                            float x, float y, float z, int layer,
+                                                            float level) {
+  *ptr = texCubemapLayeredLod<T>(textureObject, x, y, z, layer, level);  // by-value overload
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex1DGrad(hipTextureObject_t textureObject, float x, float dPdx,
+                                              float dPdy) {
+  TEXTURE_OBJECT_PARAMETERS_INIT  // presumably declares i and s (used below) - confirm
+  auto texel = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);  // scalar coordinate + gradients
+  return __hipMapFrom<T>(texel);  // convert the sampled value to the requested type T
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex1DGrad(T* ptr, hipTextureObject_t textureObject, float x,
+                                                 float dPdx, float dPdy) {
+  *ptr = tex1DGrad<T>(textureObject, x, dPdx, dPdy);  // store the by-value overload's result
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex2DGrad(hipTextureObject_t textureObject, float x, float y,
+                                              float2 dPdx, float2 dPdy) {
+  TEXTURE_OBJECT_PARAMETERS_INIT  // presumably declares i and s (used below) - confirm
+  float2 pos{x, y};               // 2D sample position
+  auto texel = __ockl_image_sample_grad_2D(i, s, get_native_vector(pos), get_native_vector(dPdx),
+                                           get_native_vector(dPdy));
+  return __hipMapFrom<T>(texel);  // convert the sampled value to the requested type T
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex2DGrad(T* ptr, hipTextureObject_t textureObject, float x,
+                                                 float y, float2 dPdx, float2 dPdy) {
+  *ptr = tex2DGrad<T>(textureObject, x, y, dPdx, dPdy);  // store the by-value overload's result
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex3DGrad(hipTextureObject_t textureObject, float x, float y,
+                                              float z, float4 dPdx, float4 dPdy) {
+  TEXTURE_OBJECT_PARAMETERS_INIT;  // presumably declares i and s (used below) - confirm
+  // Fix: gradx is now built from dPdx; it used to copy dPdy, so dPdx was silently ignored.
+  float4 coords{x, y, z, 0.0f};                // 3D sample position; fourth lane is zero-filled
+  float4 gradx{dPdx.x, dPdx.y, dPdx.z, 0.0f};  // only x/y/z of the X gradient are forwarded
+  float4 grady{dPdy.x, dPdy.y, dPdy.z, 0.0f};  // only x/y/z of the Y gradient are forwarded
+  auto tmp = __ockl_image_sample_grad_3D(i, s, get_native_vector(coords), get_native_vector(gradx),
+                                         get_native_vector(grady));
+  return __hipMapFrom<T>(tmp);  // convert the sampled value to the requested type T
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex3DGrad(T* ptr, hipTextureObject_t textureObject, float x,
+                                                 float y, float z, float4 dPdx, float4 dPdy) {
+  *ptr = tex3DGrad<T>(textureObject, x, y, z, dPdx, dPdy);  // forward to the by-value overload
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex1DLayeredGrad(hipTextureObject_t textureObject, float x,
+                                                     int layer, float dPdx, float dPdy) {
+  TEXTURE_OBJECT_PARAMETERS_INIT  // presumably declares i and s (used below) - confirm
+  float2 pos{x, layer};           // x coordinate plus the layer index
+  auto texel = __ockl_image_sample_grad_1Da(i, s, get_native_vector(pos), dPdx, dPdy);
+  return __hipMapFrom<T>(texel);  // convert the sampled value to the requested type T
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex1DLayeredGrad(T* ptr, hipTextureObject_t textureObject,
+                                                        float x, int layer, float dPdx,
+                                                        float dPdy) {
+  *ptr = tex1DLayeredGrad<T>(textureObject, x, layer, dPdx, dPdy);  // by-value overload
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex2DLayeredGrad(hipTextureObject_t textureObject, float x,
+                                                     float y, int layer, float2 dPdx, float2 dPdy) {
+  TEXTURE_OBJECT_PARAMETERS_INIT  // presumably declares i and s (used below) - confirm
+  float4 pos{x, y, layer, 0.0f};  // (x, y), then the layer index; fourth lane is zero-filled
+  auto texel = __ockl_image_sample_grad_2Da(i, s, get_native_vector(pos), get_native_vector(dPdx),
+                                            get_native_vector(dPdy));
+  return __hipMapFrom<T>(texel);  // convert the sampled value to the requested type T
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex2DLayeredGrad(T* ptr, hipTextureObject_t textureObject,
+                                                        float x, float y, int layer, float2 dPdx,
+                                                        float2 dPdy) {
+  *ptr = tex2DLayeredGrad<T>(textureObject, x, y, layer, dPdx, dPdy);  // by-value overload
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T texCubemapLayeredGrad(hipTextureObject_t textureObject, float x,
+                                                          float y, float z, int layer, float4 dPdx,
+                                                          float4 dPdy) {
+  TEXTURE_OBJECT_PARAMETERS_INIT;
+  (void)x;  // every argument is explicitly discarded: this function is a stub (see TODO below)
+  (void)y;
+  (void)z;
+  (void)layer;
+  (void)dPdx;
+  (void)dPdy;
+  // TODO missing in device libs.
+  // auto tmp = __ockl_image_sample_grad_CMa(i, s, get_native_vector(float4(x, y, z, layer)),
+  // get_native_vector(float4(dPdx.x, dPdx.y, dPdx.z, 0.0f)), get_native_vector(float4(dPdy.x,
+  // dPdy.y, dPdy.z, 0.0f))); return __hipMapFrom<T>(tmp);
+  return {};  // placeholder: no texture is sampled; a value-initialized T is returned
+}
+
+template <typename T, typename __hip_internal::enable_if<
+                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void texCubemapLayeredGrad(T* ptr,
+                                                             hipTextureObject_t textureObject,
+                                                             float x, float y, float z, int layer,
+                                                             float4 dPdx, float4 dPdy) {
+  *ptr = texCubemapLayeredGrad<T>(textureObject, x, y, z, layer, dPdx, dPdy);  // by-value overload
+}
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/channel_descriptor.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/channel_descriptor.h
new file mode 100644
index 0000000000000000000000000000000000000000..21d5f2052e2bed51f8be063fb9315466ab745efa
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/channel_descriptor.h
@@ -0,0 +1,39 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_CHANNEL_DESCRIPTOR_H
+#define HIP_INCLUDE_HIP_CHANNEL_DESCRIPTOR_H
+
+// Some standard header files. These are included by hc.hpp, so we want to make them available on
+// both paths to provide a consistent include env and avoid "missing symbol" errors that only
+// appear on the NVCC path:
+
+
+#if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
+#include <hip/amd_detail/amd_channel_descriptor.h>
+#elif !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
+#include <hip/nvidia_detail/nvidia_channel_descriptor.h>
+#else
+#error ("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
+#endif
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/driver_types.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/driver_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..1b64d165a40fbaa58efff8e6d8f9940fed192def
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/driver_types.h
@@ -0,0 +1,681 @@
+/*
+Copyright (c) 2015 - 2024 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_DRIVER_TYPES_H
+#define HIP_INCLUDE_HIP_DRIVER_TYPES_H
+
+#if !defined(__HIPCC_RTC__)
+#include <hip/hip_common.h>
+#if __cplusplus
+#include <cstdlib>
+#else
+#include <stdlib.h>  // size_t
+#endif
+#endif
+
+#if !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
+#include "driver_types.h"
+#elif defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
+
+/**
+ *  @defgroup DriverTypes Driver Types
+ *  @{
+ *  This section describes the driver data types.
+ *
+ */
+
+typedef void* hipDeviceptr_t;  ///< Untyped pointer alias (used e.g. by hip_Memcpy2D::srcDevice)
+/**
+ * HIP channel format kinds (the type of hipChannelFormatDesc::f)
+ */
+typedef enum hipChannelFormatKind {
+  hipChannelFormatKindSigned = 0,    ///< Signed channel format
+  hipChannelFormatKindUnsigned = 1,  ///< Unsigned channel format
+  hipChannelFormatKindFloat = 2,     ///< Float channel format
+  hipChannelFormatKindNone = 3       ///< No channel format
+} hipChannelFormatKind;
+/**
+ * HIP channel format descriptor: four per-channel integers (x, y, z, w) plus a format kind
+ */
+typedef struct hipChannelFormatDesc {
+  int x;  ///< X channel value (presumably its size in bits - TODO confirm)
+  int y;  ///< Y channel value (presumably its size in bits - TODO confirm)
+  int z;  ///< Z channel value (presumably its size in bits - TODO confirm)
+  int w;  ///< W channel value (presumably its size in bits - TODO confirm)
+  enum hipChannelFormatKind f;  ///< Channel format kind
+} hipChannelFormatDesc;
+/** @brief The hipTexRefSetArray function flags parameter override format value*/
+#define HIP_TRSA_OVERRIDE_FORMAT 0x01
+/** @brief The hipTexRefSetFlags function flags parameter read as integer value*/
+#define HIP_TRSF_READ_AS_INTEGER 0x01
+/** @brief The hipTexRefSetFlags function flags parameter normalized coordinate value*/
+#define HIP_TRSF_NORMALIZED_COORDINATES 0x02
+/** @brief The hipTexRefSetFlags function flags parameter srgb value*/
+#define HIP_TRSF_SRGB 0x10
+
+typedef struct hipArray* hipArray_t;              ///< Pointer to a HIP array
+typedef const struct hipArray* hipArray_const_t;  ///< Pointer to a const HIP array
+/**
+ * HIP array formats (type of the Format field of the HIP_ARRAY*_DESCRIPTOR structs)
+ */
+typedef enum hipArray_Format {
+  HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01,   ///< Unsigned 8-bit array format
+  HIP_AD_FORMAT_UNSIGNED_INT16 = 0x02,  ///< Unsigned 16-bit array format
+  HIP_AD_FORMAT_UNSIGNED_INT32 = 0x03,  ///< Unsigned 32-bit array format
+  HIP_AD_FORMAT_SIGNED_INT8 = 0x08,     ///< Signed 8-bit array format
+  HIP_AD_FORMAT_SIGNED_INT16 = 0x09,    ///< Signed 16-bit array format
+  HIP_AD_FORMAT_SIGNED_INT32 = 0x0a,    ///< Signed 32-bit array format
+  HIP_AD_FORMAT_HALF = 0x10,            ///< Half array format
+  HIP_AD_FORMAT_FLOAT = 0x20            ///< Float array format
+} hipArray_Format;
+/**
+ * HIP array descriptor (width and height only; HIP_ARRAY3D_DESCRIPTOR adds Depth and Flags)
+ */
+typedef struct HIP_ARRAY_DESCRIPTOR {
+  size_t Width;                 ///< Width of the array
+  size_t Height;                ///< Height of the array
+  enum hipArray_Format Format;  ///< Format of the array
+  unsigned int NumChannels;     ///< Number of channels of the array
+} HIP_ARRAY_DESCRIPTOR;
+
+/**
+ * HIP 3D array descriptor (the HIP_ARRAY_DESCRIPTOR fields plus Depth and Flags)
+ */
+typedef struct HIP_ARRAY3D_DESCRIPTOR {
+  size_t Width;                 ///< Width of the array
+  size_t Height;                ///< Height of the array
+  size_t Depth;                 ///< Depth of the array
+  enum hipArray_Format Format;  ///< Format of the array
+  unsigned int NumChannels;     ///< Number of channels of the array
+  unsigned int Flags;           ///< Flags of the array
+} HIP_ARRAY3D_DESCRIPTOR;
+#if !defined(__HIPCC_RTC__)
+/**
+ * HIP 2D memory copy parameters (source fields, destination fields, then the copy size)
+ */
+typedef struct hip_Memcpy2D {
+  size_t srcXInBytes;           ///< Source X in bytes (presumably an offset, cf. WidthInBytes)
+  size_t srcY;                  ///< Source Y (presumably a row offset, cf. Height)
+  hipMemoryType srcMemoryType;  ///< Source memory type
+  const void* srcHost;          ///< Source host pointer
+  hipDeviceptr_t srcDevice;     ///< Source device pointer
+  hipArray_t srcArray;          ///< Source array
+  size_t srcPitch;              ///< Source pitch
+  size_t dstXInBytes;           ///< Destination X in bytes (presumably an offset - confirm)
+  size_t dstY;                  ///< Destination Y (presumably a row offset - confirm)
+  hipMemoryType dstMemoryType;  ///< Destination memory type
+  void* dstHost;                ///< Destination host pointer
+  hipDeviceptr_t dstDevice;     ///< Destination device pointer
+  hipArray_t dstArray;          ///< Destination array
+  size_t dstPitch;              ///< Destination pitch
+  size_t WidthInBytes;          ///< Width in bytes of the 2D memory copy
+  size_t Height;                ///< Height of the 2D memory copy
+} hip_Memcpy2D;
+#endif  // !defined(__HIPCC_RTC__)
+/**
+ * HIP mipmapped array (the struct that hipMipmappedArray_t points to)
+ */
+typedef struct hipMipmappedArray {
+  void* data;                        ///< Data pointer of the mipmapped array
+  struct hipChannelFormatDesc desc;  ///< Channel format descriptor of the mipmapped array
+  unsigned int type;                 ///< Type of the mipmapped array
+  unsigned int width;                ///< Width of the mipmapped array
+  unsigned int height;               ///< Height of the mipmapped array
+  unsigned int depth;                ///< Depth of the mipmapped array
+  unsigned int min_mipmap_level;     ///< Minimum level of the mipmapped array
+  unsigned int max_mipmap_level;     ///< Maximum level of the mipmapped array
+  unsigned int flags;                ///< Flags of the mipmapped array
+  enum hipArray_Format format;       ///< Format of the mipmapped array
+  unsigned int num_channels;         ///< Number of channels of the mipmapped array
+} hipMipmappedArray;
+/**
+ * HIP mipmapped array pointer types
+ */
+typedef struct hipMipmappedArray* hipMipmappedArray_t;
+typedef hipMipmappedArray_t hipmipmappedArray;  ///< Alias of hipMipmappedArray_t (lower-case 'm')
+typedef const struct hipMipmappedArray* hipMipmappedArray_const_t;  ///< Pointer-to-const variant
+/**
+ * HIP resource types (the type of hipResourceDesc::resType)
+ */
+typedef enum hipResourceType {
+  hipResourceTypeArray = 0x00,           ///< Array resource
+  hipResourceTypeMipmappedArray = 0x01,  ///< Mipmapped array resource
+  hipResourceTypeLinear = 0x02,          ///< Linear resource
+  hipResourceTypePitch2D = 0x03          ///< Pitch 2D resource
+} hipResourceType;
+typedef enum HIPresourcetype_enum {
+  HIP_RESOURCE_TYPE_ARRAY = 0x00,            ///< Array resource
+  HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01,  ///< Mipmapped array resource
+  HIP_RESOURCE_TYPE_LINEAR = 0x02,           ///< Linear resource
+  HIP_RESOURCE_TYPE_PITCH2D = 0x03           ///< Pitch 2D resource
+} HIPresourcetype,  ///< Type of HIP_RESOURCE_DESC::resType; same values as hipResourceType
+    hipResourcetype;  ///< Second typedef name for the same enum
+/**
+ * HIP texture address modes (element type of HIP_TEXTURE_DESC::addressMode)
+ */
+typedef enum HIPaddress_mode_enum {
+  HIP_TR_ADDRESS_MODE_WRAP = 0,    ///< Wrap address mode
+  HIP_TR_ADDRESS_MODE_CLAMP = 1,   ///< Clamp address mode
+  HIP_TR_ADDRESS_MODE_MIRROR = 2,  ///< Mirror address mode
+  HIP_TR_ADDRESS_MODE_BORDER = 3   ///< Border address mode
+} HIPaddress_mode;
+/**
+ * HIP filter modes (type of HIP_TEXTURE_DESC::filterMode and ::mipmapFilterMode)
+ */
+typedef enum HIPfilter_mode_enum {
+  HIP_TR_FILTER_MODE_POINT = 0,  ///< Filter mode point
+  HIP_TR_FILTER_MODE_LINEAR = 1  ///< Filter mode linear
+} HIPfilter_mode;
+/**
+ * HIP texture descriptor
+ */
+typedef struct HIP_TEXTURE_DESC_st {
+  HIPaddress_mode addressMode[3];   ///< Address modes (3 entries, presumably one per dimension)
+  HIPfilter_mode filterMode;        ///< Filter mode
+  unsigned int flags;               ///< Flags
+  unsigned int maxAnisotropy;       ///< Maximum anisotropy ratio
+  HIPfilter_mode mipmapFilterMode;  ///< Mipmap filter mode
+  float mipmapLevelBias;            ///< Mipmap level bias
+  float minMipmapLevelClamp;        ///< Mipmap minimum level clamp
+  float maxMipmapLevelClamp;        ///< Mipmap maximum level clamp
+  float borderColor[4];             ///< Border Color
+  int reserved[12];                 ///< Reserved
+} HIP_TEXTURE_DESC;
+/**
+ * HIP texture resource view formats (the type of hipResourceViewDesc::format)
+ */
+typedef enum hipResourceViewFormat {
+  hipResViewFormatNone = 0x00,  ///< No resource view format (use underlying resource format)
+  hipResViewFormatUnsignedChar1 = 0x01,              ///< 1 channel, unsigned 8-bit integers
+  hipResViewFormatUnsignedChar2 = 0x02,              ///< 2 channels, unsigned 8-bit integers
+  hipResViewFormatUnsignedChar4 = 0x03,              ///< 4 channels, unsigned 8-bit integers
+  hipResViewFormatSignedChar1 = 0x04,                ///< 1 channel, signed 8-bit integers
+  hipResViewFormatSignedChar2 = 0x05,                ///< 2 channels, signed 8-bit integers
+  hipResViewFormatSignedChar4 = 0x06,                ///< 4 channels, signed 8-bit integers
+  hipResViewFormatUnsignedShort1 = 0x07,             ///< 1 channel, unsigned 16-bit integers
+  hipResViewFormatUnsignedShort2 = 0x08,             ///< 2 channels, unsigned 16-bit integers
+  hipResViewFormatUnsignedShort4 = 0x09,             ///< 4 channels, unsigned 16-bit integers
+  hipResViewFormatSignedShort1 = 0x0a,               ///< 1 channel, signed 16-bit integers
+  hipResViewFormatSignedShort2 = 0x0b,               ///< 2 channels, signed 16-bit integers
+  hipResViewFormatSignedShort4 = 0x0c,               ///< 4 channels, signed 16-bit integers
+  hipResViewFormatUnsignedInt1 = 0x0d,               ///< 1 channel, unsigned 32-bit integers
+  hipResViewFormatUnsignedInt2 = 0x0e,               ///< 2 channels, unsigned 32-bit integers
+  hipResViewFormatUnsignedInt4 = 0x0f,               ///< 4 channels, unsigned 32-bit integers
+  hipResViewFormatSignedInt1 = 0x10,                 ///< 1 channel, signed 32-bit integers
+  hipResViewFormatSignedInt2 = 0x11,                 ///< 2 channels, signed 32-bit integers
+  hipResViewFormatSignedInt4 = 0x12,                 ///< 4 channels, signed 32-bit integers
+  hipResViewFormatHalf1 = 0x13,                      ///< 1 channel, 16-bit floating point
+  hipResViewFormatHalf2 = 0x14,                      ///< 2 channels, 16-bit floating point
+  hipResViewFormatHalf4 = 0x15,                      ///< 4 channels, 16-bit floating point
+  hipResViewFormatFloat1 = 0x16,                     ///< 1 channel, 32-bit floating point
+  hipResViewFormatFloat2 = 0x17,                     ///< 2 channels, 32-bit floating point
+  hipResViewFormatFloat4 = 0x18,                     ///< 4 channels, 32-bit floating point
+  hipResViewFormatUnsignedBlockCompressed1 = 0x19,   ///< Block-compressed 1
+  hipResViewFormatUnsignedBlockCompressed2 = 0x1a,   ///< Block-compressed 2
+  hipResViewFormatUnsignedBlockCompressed3 = 0x1b,   ///< Block-compressed 3
+  hipResViewFormatUnsignedBlockCompressed4 = 0x1c,   ///< Block-compressed 4 unsigned
+  hipResViewFormatSignedBlockCompressed4 = 0x1d,     ///< Block-compressed 4 signed
+  hipResViewFormatUnsignedBlockCompressed5 = 0x1e,   ///< Block-compressed 5 unsigned
+  hipResViewFormatSignedBlockCompressed5 = 0x1f,     ///< Block-compressed 5 signed
+  hipResViewFormatUnsignedBlockCompressed6H = 0x20,  ///< Block-compressed 6 unsigned half-float
+  hipResViewFormatSignedBlockCompressed6H = 0x21,    ///< Block-compressed 6 signed half-float
+  hipResViewFormatUnsignedBlockCompressed7 = 0x22    ///< Block-compressed 7
+} hipResourceViewFormat;
+/**
+ * HIP texture resource view formats, HIP_RES_VIEW_FORMAT_* spelling (type of HIP_RESOURCE_VIEW_DESC::format)
+ */
+typedef enum HIPresourceViewFormat_enum {
+  HIP_RES_VIEW_FORMAT_NONE = 0x00,  ///< No resource view format (use underlying resource format)
+  HIP_RES_VIEW_FORMAT_UINT_1X8 = 0x01,       ///< 1 channel, unsigned 8-bit integers
+  HIP_RES_VIEW_FORMAT_UINT_2X8 = 0x02,       ///< 2 channels, unsigned 8-bit integers
+  HIP_RES_VIEW_FORMAT_UINT_4X8 = 0x03,       ///< 4 channels, unsigned 8-bit integers
+  HIP_RES_VIEW_FORMAT_SINT_1X8 = 0x04,       ///< 1 channel, signed 8-bit integers
+  HIP_RES_VIEW_FORMAT_SINT_2X8 = 0x05,       ///< 2 channels, signed 8-bit integers
+  HIP_RES_VIEW_FORMAT_SINT_4X8 = 0x06,       ///< 4 channels, signed 8-bit integers
+  HIP_RES_VIEW_FORMAT_UINT_1X16 = 0x07,      ///< 1 channel, unsigned 16-bit integers
+  HIP_RES_VIEW_FORMAT_UINT_2X16 = 0x08,      ///< 2 channels, unsigned 16-bit integers
+  HIP_RES_VIEW_FORMAT_UINT_4X16 = 0x09,      ///< 4 channels, unsigned 16-bit integers
+  HIP_RES_VIEW_FORMAT_SINT_1X16 = 0x0a,      ///< 1 channel, signed 16-bit integers
+  HIP_RES_VIEW_FORMAT_SINT_2X16 = 0x0b,      ///< 2 channels, signed 16-bit integers
+  HIP_RES_VIEW_FORMAT_SINT_4X16 = 0x0c,      ///< 4 channels, signed 16-bit integers
+  HIP_RES_VIEW_FORMAT_UINT_1X32 = 0x0d,      ///< 1 channel, unsigned 32-bit integers
+  HIP_RES_VIEW_FORMAT_UINT_2X32 = 0x0e,      ///< 2 channels, unsigned 32-bit integers
+  HIP_RES_VIEW_FORMAT_UINT_4X32 = 0x0f,      ///< 4 channels, unsigned 32-bit integers
+  HIP_RES_VIEW_FORMAT_SINT_1X32 = 0x10,      ///< 1 channel, signed 32-bit integers
+  HIP_RES_VIEW_FORMAT_SINT_2X32 = 0x11,      ///< 2 channels, signed 32-bit integers
+  HIP_RES_VIEW_FORMAT_SINT_4X32 = 0x12,      ///< 4 channels, signed 32-bit integers
+  HIP_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13,     ///< 1 channel, 16-bit floating point
+  HIP_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14,     ///< 2 channels, 16-bit floating point
+  HIP_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15,     ///< 4 channels, 16-bit floating point
+  HIP_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16,     ///< 1 channel, 32-bit floating point
+  HIP_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17,     ///< 2 channels, 32-bit floating point
+  HIP_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18,     ///< 4 channels, 32-bit floating point
+  HIP_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19,   ///< Block-compressed 1
+  HIP_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a,   ///< Block-compressed 2
+  HIP_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b,   ///< Block-compressed 3
+  HIP_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c,   ///< Block-compressed 4 unsigned
+  HIP_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d,     ///< Block-compressed 4 signed
+  HIP_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e,   ///< Block-compressed 5 unsigned
+  HIP_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f,     ///< Block-compressed 5 signed
+  HIP_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20,  ///< Block-compressed 6 unsigned half-float
+  HIP_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21,    ///< Block-compressed 6 signed half-float
+  HIP_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22    ///< Block-compressed 7
+} HIPresourceViewFormat;
+/**
+ * HIP resource descriptor: a resource type plus a union with one member per resource type
+ */
+typedef struct hipResourceDesc {
+  enum hipResourceType resType;  ///< Resource type
+  union {
+    struct {
+      hipArray_t array;  ///< HIP array
+    } array;
+    struct {
+      hipMipmappedArray_t mipmap;  ///< HIP mipmapped array
+    } mipmap;
+    struct {
+      void* devPtr;                      ///< Device pointer
+      struct hipChannelFormatDesc desc;  ///< Channel format description
+      size_t sizeInBytes;                ///< Size in bytes
+    } linear;
+    struct {
+      void* devPtr;                      ///< Device pointer
+      struct hipChannelFormatDesc desc;  ///< Channel format description
+      size_t width;                      ///< Width of the array in elements
+      size_t height;                     ///< Height of the array in elements
+      size_t pitchInBytes;               ///< Pitch between two rows in bytes
+    } pitch2D;
+  } res;  ///< Resource payload (active member presumably selected by resType - confirm)
+} hipResourceDesc;
+
+/**
+ * HIP resource descriptor struct (HIP_RESOURCE_DESC)
+ */
+typedef struct HIP_RESOURCE_DESC_st {
+  HIPresourcetype resType;  ///< Resource type
+  union {
+    struct {
+      hipArray_t hArray;  ///< HIP array
+    } array;
+    struct {
+      hipMipmappedArray_t hMipmappedArray;  ///< HIP mipmapped array
+    } mipmap;
+    struct {
+      hipDeviceptr_t devPtr;     ///< Device pointer
+      hipArray_Format format;    ///< Array format
+      unsigned int numChannels;  ///< Channels per array element
+      size_t sizeInBytes;        ///< Size in bytes
+    } linear;
+    struct {
+      hipDeviceptr_t devPtr;     ///< Device pointer
+      hipArray_Format format;    ///< Array format
+      unsigned int numChannels;  ///< Channels per array element
+      size_t width;              ///< Width of the array in elements
+      size_t height;             ///< Height of the array in elements
+      size_t pitchInBytes;       ///< Pitch between two rows in bytes
+    } pitch2D;
+    struct {
+      int reserved[32];  ///< Reserved
+    } reserved;
+  } res;
+  unsigned int flags;  ///< Flags (must be zero)
+} HIP_RESOURCE_DESC;
+/**
+ * HIP resource view descriptor (declared without a typedef: `struct hipResourceViewDesc` in C)
+ */
+struct hipResourceViewDesc {
+  enum hipResourceViewFormat format;  ///< Resource view format
+  size_t width;                       ///< Width of the resource view
+  size_t height;                      ///< Height of the resource view
+  size_t depth;                       ///< Depth of the resource view
+  unsigned int firstMipmapLevel;      ///< First defined mipmap level
+  unsigned int lastMipmapLevel;       ///< Last defined mipmap level
+  unsigned int firstLayer;            ///< First layer index
+  unsigned int lastLayer;             ///< Last layer index
+};
+/**
+ * Resource view descriptor (same fields as hipResourceViewDesc plus a reserved tail)
+ */
+typedef struct HIP_RESOURCE_VIEW_DESC_st {
+  HIPresourceViewFormat format;   ///< Resource view format
+  size_t width;                   ///< Width of the resource view
+  size_t height;                  ///< Height of the resource view
+  size_t depth;                   ///< Depth of the resource view
+  unsigned int firstMipmapLevel;  ///< First defined mipmap level
+  unsigned int lastMipmapLevel;   ///< Last defined mipmap level
+  unsigned int firstLayer;        ///< First layer index
+  unsigned int lastLayer;         ///< Last layer index
+  unsigned int reserved[16];      ///< Reserved
+} HIP_RESOURCE_VIEW_DESC;
+/**
+ * Memory copy types (declared only when __HIPCC_RTC__ is not defined)
+ */
+#if !defined(__HIPCC_RTC__)
+typedef enum hipMemcpyKind {
+  hipMemcpyHostToHost = 0,            ///< Host-to-Host Copy
+  hipMemcpyHostToDevice = 1,          ///< Host-to-Device Copy
+  hipMemcpyDeviceToHost = 2,          ///< Device-to-Host Copy
+  hipMemcpyDeviceToDevice = 3,        ///< Device-to-Device Copy
+  hipMemcpyDefault = 4,               ///< Runtime will automatically determine
+                                      ///< copy-kind based on virtual addresses.
+  hipMemcpyDeviceToDeviceNoCU = 1024  ///< Device-to-Device Copy without using compute units
+} hipMemcpyKind;
+/**
+ * HIP pitched pointer
+ */
+typedef struct hipPitchedPtr {
+  void* ptr;     ///< Pointer to the allocated memory
+  size_t pitch;  ///< Pitch in bytes
+  size_t xsize;  ///< Logical size of the first dimension of allocation in elements
+  size_t ysize;  ///< Logical size of the second dimension of allocation in elements
+} hipPitchedPtr;
+/**
+ * HIP extent (width, height and depth of a region)
+ */
+typedef struct hipExtent {
+  size_t width;  ///< Width in elements when referring to array memory, in bytes when referring to
+                 ///< linear memory
+  size_t height;  ///< Height of the extent
+  size_t depth;  ///< Depth of the extent
+} hipExtent;
+/**
+ * HIP position (x, y, z coordinates)
+ */
+typedef struct hipPos {
+  size_t x;  ///< X coordinate
+  size_t y;  ///< Y coordinate
+  size_t z;  ///< Z coordinate
+} hipPos;
+/**
+ * HIP 3D memory copy parameters (array or pitched-pointer source/destination, extent and kind)
+ */
+typedef struct hipMemcpy3DParms {
+  hipArray_t srcArray;          ///< Source array
+  struct hipPos srcPos;         ///< Source position
+  struct hipPitchedPtr srcPtr;  ///< Source pointer
+  hipArray_t dstArray;          ///< Destination array
+  struct hipPos dstPos;         ///< Destination position
+  struct hipPitchedPtr dstPtr;  ///< Destination pointer
+  struct hipExtent extent;      ///< Extent of 3D memory copy
+  enum hipMemcpyKind kind;      ///< Kind of 3D memory copy
+} hipMemcpy3DParms;
+/**
+ * HIP 3D memory copy descriptor
+ */
+typedef struct HIP_MEMCPY3D {
+  size_t srcXInBytes;           ///< Source X in bytes
+  size_t srcY;                  ///< Source Y
+  size_t srcZ;                  ///< Source Z
+  size_t srcLOD;                ///< Source LOD
+  hipMemoryType srcMemoryType;  ///< Source memory type
+  const void* srcHost;          ///< Source host pointer
+  hipDeviceptr_t srcDevice;     ///< Source device
+  hipArray_t srcArray;          ///< Source array
+  size_t srcPitch;              ///< Source pitch
+  size_t srcHeight;             ///< Source height
+  size_t dstXInBytes;           ///< Destination X in bytes
+  size_t dstY;                  ///< Destination Y
+  size_t dstZ;                  ///< Destination Z
+  size_t dstLOD;                ///< Destination LOD
+  hipMemoryType dstMemoryType;  ///< Destination memory type
+  void* dstHost;                ///< Destination host pointer
+  hipDeviceptr_t dstDevice;     ///< Destination device
+  hipArray_t dstArray;          ///< Destination array
+  size_t dstPitch;              ///< Destination pitch
+  size_t dstHeight;             ///< Destination height
+  size_t WidthInBytes;          ///< Width in bytes of 3D memory copy
+  size_t Height;                ///< Height of 3D memory copy (was "in bytes"; unit - TODO confirm)
+  size_t Depth;                 ///< Depth of 3D memory copy (was "in bytes"; unit - TODO confirm)
+} HIP_MEMCPY3D;
+/**
+ * Specifies the type of location
+ */
+typedef enum hipMemLocationType {
+  hipMemLocationTypeInvalid = 0,  ///< Invalid location type (same value as hipMemLocationTypeNone)
+  hipMemLocationTypeNone = 0,  ///< No location (same value as hipMemLocationTypeInvalid)
+  hipMemLocationTypeDevice = 1,    ///< Device location, thus it's HIP device ID
+  hipMemLocationTypeHost = 2,      ///< Host location, id is ignored
+  hipMemLocationTypeHostNuma = 3,  ///< Host NUMA node location, id is host NUMA node id
+  hipMemLocationTypeHostNumaCurrent =
+      4  ///< Host NUMA node closest to current thread's CPU, id is ignored
+} hipMemLocationType;
+/**
+ * Specifies a memory location.
+ *
+ * To specify a GPU, set type = @p hipMemLocationTypeDevice and set id = the GPU's device ID
+ */
+typedef struct hipMemLocation {
+  hipMemLocationType type;  ///< Specifies the location type, which describes the meaning of id
+  int id;                   ///< Identifier for the provided location type @p hipMemLocationType
+} hipMemLocation;
+
+/**
+ * Flags that can be specified for copies within a batch. Used with hipMemcpyBatchAsync.
+ */
+typedef enum hipMemcpyFlags {
+  hipMemcpyFlagDefault = 0x0,                  ///< Default flag
+  hipMemcpyFlagPreferOverlapWithCompute = 0x1  ///< Tries to overlap copy with compute work.
+} hipMemcpyFlags;
+
+/**
+ * Flags to specify the order in which the source pointer is accessed by batch memcpy
+ */
+typedef enum hipMemcpySrcAccessOrder {
+  hipMemcpySrcAccessOrderInvalid = 0x0,  ///< Default Invalid.
+  hipMemcpySrcAccessOrderStream = 0x1,   ///< Access to source pointer must be in stream order.
+  hipMemcpySrcAccessOrderDuringApiCall =
+      0x2,  ///< Access to source pointer can be out of stream order and all accesses must be
+            ///< complete before API call returns.
+  hipMemcpySrcAccessOrderAny =
+      0x3,  ///< Access to the source pointer can be out of stream order and the accesses can happen
+            ///< even after the API call returns.
+  hipMemcpySrcAccessOrderMax = 0x7FFFFFFF  ///< Upper bound (largest positive 32-bit signed value)
+} hipMemcpySrcAccessOrder;
+
+/**
+ * Attributes for copies within a batch (source access order, location hints and flags).
+ */
+typedef struct hipMemcpyAttributes {
+  hipMemcpySrcAccessOrder
+      srcAccessOrder;  ///< Source access ordering to be observed for copies with this attribute.
+  hipMemLocation srcLocHint;  ///< Location hint for src operand.
+  hipMemLocation dstLocHint;  ///< Location hint for destination operand.
+  unsigned int flags;         ///< Additional Flags for copies. See hipMemcpyFlags.
+} hipMemcpyAttributes;
+/**
+ * Operand types for individual copies within a batch
+ */
+typedef enum hipMemcpy3DOperandType {
+  hipMemcpyOperandTypePointer = 0x1,  ///< Memcpy operand is a valid pointer.
+  hipMemcpyOperandTypeArray = 0x2,    ///< Memcpy operand is a valid hipArray.
+  hipMemcpyOperandTypeMax = 0x7FFFFFFF  ///< Upper bound (largest positive 32-bit signed value)
+} hipMemcpy3DOperandType;
+
+/**
+ * Struct representing offset into a hipArray_t in elements.
+ */
+typedef struct hipOffset3D {
+  size_t x;  ///< X offset in elements
+  size_t y;  ///< Y offset in elements
+  size_t z;  ///< Z offset in elements
+} hipOffset3D;
+/**
+ * Struct representing an operand for copy with hipMemcpy3DBatchAsync.
+ */
+typedef struct hipMemcpy3DOperand {
+  hipMemcpy3DOperandType type;  ///< Operand type; see hipMemcpy3DOperandType.
+  union {
+    struct {
+      void* ptr;               ///< Pointer to the operand memory.
+      size_t rowLength;        ///< Length of each row in elements.
+      size_t layerHeight;      ///< Height of each layer in elements.
+      hipMemLocation locHint;  ///< Location Hint for the operand.
+    } ptr;  ///< Pointer operand; presumably used with hipMemcpyOperandTypePointer (TODO confirm).
+    struct {
+      hipArray_t array;    ///< Array struct for hipMemcpyOperandTypeArray.
+      hipOffset3D offset;  ///< Offset into array in elements.
+    } array;  ///< Array operand.
+  } op;  ///< Operand payload; the ptr and array members overlap.
+} hipMemcpy3DOperand;
+
+/**
+ * HIP 3D Batch Op: one copy (source operand, destination operand, extent) within a 3D batch
+ */
+typedef struct hipMemcpy3DBatchOp {
+  hipMemcpy3DOperand src;  ///< Source operand
+  hipMemcpy3DOperand dst;  ///< Destination operand
+  hipExtent extent;  ///< Extent of the copy
+  hipMemcpySrcAccessOrder srcAccessOrder;  ///< Source access ordering; see hipMemcpySrcAccessOrder
+  unsigned int flags;  ///< Flags; presumably hipMemcpyFlags values (TODO confirm)
+} hipMemcpy3DBatchOp;
+
+typedef struct hipMemcpy3DPeerParms {  // 3D copy parameters naming a source and a destination device
+  hipArray_t srcArray;   ///< Source memory address
+  hipPos srcPos;         ///< Source position offset
+  hipPitchedPtr srcPtr;  ///< Pitched source memory address
+  int srcDevice;         ///< Source device
+  hipArray_t dstArray;   ///< Destination memory address
+  hipPos dstPos;         ///< Destination position offset
+  hipPitchedPtr dstPtr;  ///< Pitched destination memory address
+  int dstDevice;         ///< Destination device
+  hipExtent extent;      ///< Requested memory copy size
+} hipMemcpy3DPeerParms;
+
+/**
+ * @brief Make a hipPitchedPtr from a pointer, its pitch and its logical sizes
+ *
+ * @param [in] d Pointer to the allocated memory
+ * @param [in] p Pitch in bytes
+ * @param [in] xsz Logical size of the first dimension of allocation in elements
+ * @param [in] ysz Logical size of the second dimension of allocation in elements
+ *
+ * @returns The created hipPitchedPtr (returned by value)
+ */
+static inline struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t xsz, size_t ysz) {
+  struct hipPitchedPtr s;
+  s.ptr = d;
+  s.pitch = p;
+  s.xsize = xsz;
+  s.ysize = ysz;
+  return s;
+}
+/**
+ * @brief Make a hipPos struct from x, y and z coordinates
+ *
+ * @param [in] x X coordinate of the new hipPos
+ * @param [in] y Y coordinate of the new hipPos
+ * @param [in] z Z coordinate of the new hipPos
+ *
+ * @returns The created hipPos struct (returned by value)
+ */
+static inline struct hipPos make_hipPos(size_t x, size_t y, size_t z) {
+  struct hipPos p;
+  p.x = x;
+  p.y = y;
+  p.z = z;
+  return p;
+}
+/**
+ * @brief Make a hipExtent struct from width, height and depth
+ *
+ * @param [in] w Width of the new hipExtent
+ * @param [in] h Height of the new hipExtent
+ * @param [in] d Depth of the new hipExtent
+ *
+ * @returns The created hipExtent struct (returned by value)
+ */
+static inline struct hipExtent make_hipExtent(size_t w, size_t h, size_t d) {
+  struct hipExtent e;
+  e.width = w;
+  e.height = h;
+  e.depth = d;
+  return e;
+}
+typedef enum hipFunction_attribute {  // Function attributes; values are implicit, starting at 0
+  HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,  ///< The maximum number of threads per block. Depends
+                                             ///< on function and device.
+  HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,  ///< The statically allocated shared memory size in bytes
+                                         ///< per block required by the function.
+  HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES,   ///< The user-allocated constant memory by the function in
+                                         ///< bytes.
+  HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,   ///< The local memory usage of each thread by this function
+                                         ///< in bytes.
+  HIP_FUNC_ATTRIBUTE_NUM_REGS,  ///< The number of registers used by each thread of this function.
+  HIP_FUNC_ATTRIBUTE_PTX_VERSION,                       ///< PTX version
+  HIP_FUNC_ATTRIBUTE_BINARY_VERSION,                    ///< Binary version
+  HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA,                     ///< Cache mode
+  HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,     ///< The maximum dynamic shared memory per
+                                                        ///< block for this function in bytes.
+  HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT,  ///< The shared memory carveout preference
+                                                        ///< in percent of the maximum shared
+                                                        ///< memory.
+  HIP_FUNC_ATTRIBUTE_MAX  ///< Last enumerator; equals the number of attributes listed above
+} hipFunction_attribute;
+
+typedef enum hipPointer_attribute {  // Pointer attributes; values are implicit, starting at 1
+  HIP_POINTER_ATTRIBUTE_CONTEXT = 1,     ///< The context on which a pointer was allocated
+                                         ///< @warning This attribute is not supported in HIP
+  HIP_POINTER_ATTRIBUTE_MEMORY_TYPE,     ///< memory type describing the location of a pointer
+  HIP_POINTER_ATTRIBUTE_DEVICE_POINTER,  ///< address at which the pointer is allocated on the
+                                         ///< device
+  HIP_POINTER_ATTRIBUTE_HOST_POINTER,    ///< address at which the pointer is allocated on the host
+  HIP_POINTER_ATTRIBUTE_P2P_TOKENS,      ///< A pair of tokens for use with Linux kernel interface
+                                         ///< @warning This attribute is not supported in HIP
+  HIP_POINTER_ATTRIBUTE_SYNC_MEMOPS,     ///< Synchronize every synchronous memory operation
+                                         ///< initiated on this region
+  HIP_POINTER_ATTRIBUTE_BUFFER_ID,       ///< Unique ID for an allocated memory region
+  HIP_POINTER_ATTRIBUTE_IS_MANAGED,      ///< Indicates if the pointer points to managed memory
+  HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL,  ///< device ordinal of a device on which a pointer
+                                         ///< was allocated or registered
+  HIP_POINTER_ATTRIBUTE_IS_LEGACY_HIP_IPC_CAPABLE,  ///< if this pointer maps to an allocation
+                                                    ///< that is suitable for hipIpcGetMemHandle
+                                                    ///< @warning This attribute is not supported in
+                                                    ///< HIP
+  HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR,           ///< Starting address for this requested pointer
+  HIP_POINTER_ATTRIBUTE_RANGE_SIZE,  ///< Size of the address range for this requested pointer
+  HIP_POINTER_ATTRIBUTE_MAPPED,      ///< tells if this pointer is in a valid address range
+                                     ///< that is mapped to a backing allocation
+  HIP_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES,  ///< Bitmask of allowed hipMemAllocationHandleType
+                                               ///< for this allocation @warning This attribute is
+                                               ///< not supported in HIP
+  HIP_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE,  ///< returns if the memory referenced by
+                                                     ///< this pointer can be used with the
+                                                     ///< GPUDirect RDMA API
+                                                     ///< @warning This attribute is not supported
+                                                     ///< in HIP
+  HIP_POINTER_ATTRIBUTE_ACCESS_FLAGS,   ///< Returns the access flags that the associated device
+                                        ///< has on the corresponding memory referenced by the ptr
+  HIP_POINTER_ATTRIBUTE_MEMPOOL_HANDLE  ///< Returns the mempool handle for the allocation if
+                                        ///< it was allocated from a mempool
+                                        ///< @warning This attribute is not supported in HIP
+} hipPointer_attribute;
+
+// doxygen end DriverTypes
+/**
+ * @}
+ */
+
+#endif  // !defined(__HIPCC_RTC__)
+#else
+#error ("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
+#endif
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/hip_common.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/hip_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a7dcff6cbd5f530e3ac23f3c82c33cda92d67e0
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/hip_common.h
@@ -0,0 +1,100 @@
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_HIP_COMMON_H
+#define HIP_INCLUDE_HIP_HIP_COMMON_H
+
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
+#endif
+// Common code included at the start of every HIP file.
+// Auto enable __HIP_PLATFORM_AMD__ if compiling on AMD platform (clang with __HIP__ defined).
+// Other compilers (GCC, ICC, etc.) need to set one of these macros explicitly.
+#if defined(__clang__) && defined(__HIP__)
+#ifndef __HIP_PLATFORM_AMD__
+#define __HIP_PLATFORM_AMD__
+#endif  // __HIP_PLATFORM_AMD__
+#endif  // defined(__clang__) && defined(__HIP__)
+
+// Auto enable __HIP_PLATFORM_NVIDIA__ if compiling with NVIDIA platform
+#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__) && !defined(__HIP__))
+#ifndef __HIP_PLATFORM_NVIDIA__
+#define __HIP_PLATFORM_NVIDIA__
+#endif  // __HIP_PLATFORM_NVIDIA__
+
+#ifdef __CUDACC__
+#define __HIPCC__  // defined whenever __CUDACC__ is defined on this path
+#endif  // __CUDACC__
+
+#endif  // defined(__NVCC__) || (clang compiling CUDA without __HIP__)
+
+// Auto enable __HIP_DEVICE_COMPILE__ if compiled in HCC or NVCC device path
+#if (defined(__HCC_ACCELERATOR__) && __HCC_ACCELERATOR__ != 0) ||                                  \
+    (defined(__CUDA_ARCH__) && __CUDA_ARCH__ != 0)
+#define __HIP_DEVICE_COMPILE__ 1
+#endif  // device compilation path
+
+#ifdef __GNUC__  // compilers defining __GNUC__: mark API symbols with default visibility
+#define HIP_PUBLIC_API __attribute__((visibility("default")))
+#define HIP_INTERNAL_EXPORTED_API __attribute__((visibility("default")))
+#else  // other compilers: both macros expand to nothing
+#define HIP_PUBLIC_API
+#define HIP_INTERNAL_EXPORTED_API
+#endif  // __GNUC__
+
+#if __HIP_DEVICE_COMPILE__ == 0  // also true when the macro is undefined (evaluates to 0 in #if)
+// 32-bit Atomics (every feature macro in this block is defined as 0)
+#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (0)
+#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (0)
+#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (0)
+#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (0)
+#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0)
+
+// 64-bit Atomics
+#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (0)
+#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
+
+// Doubles
+#define __HIP_ARCH_HAS_DOUBLES__ (0)
+
+// Warp cross-lane operations
+#define __HIP_ARCH_HAS_WARP_VOTE__ (0)
+#define __HIP_ARCH_HAS_WARP_BALLOT__ (0)
+#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (0)
+#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
+
+// Sync
+#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0)
+#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
+
+// Misc
+#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
+#define __HIP_ARCH_HAS_3DGRID__ (0)
+#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
+#endif  // __HIP_DEVICE_COMPILE__ == 0
+
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/hip_deprecated.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/hip_deprecated.h
new file mode 100644
index 0000000000000000000000000000000000000000..91c58e28b587db34f72daefc0bcbddc576da1735
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/hip_deprecated.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+// This file will add older hip functions used in the versioning system
+// Find the deprecated functions and structs in hip_device.cpp
+
+// This struct is also kept in hip_device.cpp
+typedef struct hipDeviceProp_tR0000 {
+  char name[256];            ///< Device name.
+  size_t totalGlobalMem;     ///< Size of global memory region (in bytes).
+  size_t sharedMemPerBlock;  ///< Size of shared memory region (in bytes).
+  int regsPerBlock;          ///< Registers per block.
+  int warpSize;              ///< Warp size.
+  int maxThreadsPerBlock;    ///< Max work items per work group or workgroup max size.
+  int maxThreadsDim[3];      ///< Max number of threads in each dimension (XYZ) of a block.
+  int maxGridSize[3];        ///< Max grid dimensions (XYZ).
+  int clockRate;             ///< Max clock frequency of the multiProcessors in khz.
+  int memoryClockRate;       ///< Max global memory clock frequency in khz.
+  int memoryBusWidth;        ///< Global memory bus width in bits.
+  size_t totalConstMem;      ///< Size of constant memory region (in bytes).
+  int major;  ///< Major compute capability.  On HCC, this is an approximation and features may
+              ///< differ from CUDA CC.  See the arch feature flags for portable ways to query
+              ///< feature caps.
+  int minor;  ///< Minor compute capability.  On HCC, this is an approximation and features may
+              ///< differ from CUDA CC.  See the arch feature flags for portable ways to query
+              ///< feature caps.
+  int multiProcessorCount;          ///< Number of multi-processors. When the GPU works in Compute
+                                    ///< Unit (CU) mode, this value equals the number of CUs;
+                                    ///< when in Workgroup Processor (WGP) mode, this value equals
+                                    ///< half of CUs, because a single WGP contains two CUs.
+  int l2CacheSize;                  ///< L2 cache size.
+  int maxThreadsPerMultiProcessor;  ///< Maximum resident threads per multi-processor.
+  int computeMode;                  ///< Compute mode.
+  int clockInstructionRate;  ///< Frequency in khz of the timer used by the device-side "clock*"
+                             ///< instructions.  New for HIP.
+  hipDeviceArch_t arch;      ///< Architectural feature flags.  New for HIP.
+  int concurrentKernels;     ///< Device can possibly execute multiple kernels concurrently.
+  int pciDomainID;           ///< PCI Domain ID
+  int pciBusID;              ///< PCI Bus ID.
+  int pciDeviceID;           ///< PCI Device ID.
+  size_t maxSharedMemoryPerMultiProcessor;  ///< Maximum Shared Memory Per Multiprocessor.
+  int isMultiGpuBoard;                      ///< 1 if device is on a multi-GPU board, 0 if not.
+  int canMapHostMemory;                     ///< Check whether HIP can map host memory
+  int gcnArch;                              ///< DEPRECATED: use gcnArchName instead
+  char gcnArchName[256];                    ///< AMD GCN Arch Name.
+  int integrated;                           ///< APU vs dGPU
+  int cooperativeLaunch;                    ///< HIP device supports cooperative launch
+  int cooperativeMultiDeviceLaunch;         ///< HIP device supports cooperative launch on multiple
+                                            ///< devices
+  int maxTexture1DLinear;                   ///< Maximum size for 1D textures bound to linear memory
+  int maxTexture1D;                         ///< Maximum number of elements in 1D images
+  int maxTexture2D[2];  ///< Maximum dimensions (width, height) of 2D images, in image elements
+  int maxTexture3D[3];  ///< Maximum dimensions (width, height, depth) of 3D images, in image
+                        ///< elements
+  unsigned int* hdpMemFlushCntl;  ///< Address of HDP_MEM_COHERENCY_FLUSH_CNTL register
+  unsigned int* hdpRegFlushCntl;  ///< Address of HDP_REG_COHERENCY_FLUSH_CNTL register
+  size_t memPitch;                ///< Maximum pitch in bytes allowed by memory copies
+  size_t textureAlignment;        ///< Alignment requirement for textures
+  size_t texturePitchAlignment;   ///< Pitch alignment requirement for texture references bound to
+                                  ///< pitched memory
+  int kernelExecTimeoutEnabled;   ///< Run time limit for kernels executed on the device
+  int ECCEnabled;                 ///< Device has ECC support enabled
+  int tccDriver;                  ///< 1:If device is Tesla device using TCC driver, else 0
+  int cooperativeMultiDeviceUnmatchedFunc;       ///< HIP device supports cooperative launch on
+                                                 ///< multiple
+                                                 ///< devices with unmatched functions
+  int cooperativeMultiDeviceUnmatchedGridDim;    ///< HIP device supports cooperative launch on
+                                                 ///< multiple
+                                                 ///< devices with unmatched grid dimensions
+  int cooperativeMultiDeviceUnmatchedBlockDim;   ///< HIP device supports cooperative launch on
+                                                 ///< multiple
+                                                 ///< devices with unmatched block dimensions
+  int cooperativeMultiDeviceUnmatchedSharedMem;  ///< HIP device supports cooperative launch on
+                                                 ///< multiple
+                                                 ///< devices with unmatched shared memories
+  int isLargeBar;                                ///< 1: if it is a large PCI bar device, else 0
+  int asicRevision;                              ///< Revision of the GPU in this device
+  int managedMemory;                   ///< Device supports allocating managed memory on this system
+  int directManagedMemAccessFromHost;  ///< Host can directly access managed memory on the device
+                                       ///< without migration
+  int concurrentManagedAccess;  ///< Device can coherently access managed memory concurrently with
+                                ///< the CPU
+  int pageableMemoryAccess;     ///< Device supports coherently accessing pageable memory
+                                ///< without calling hipHostRegister on it
+  int pageableMemoryAccessUsesHostPageTables;  ///< Device accesses pageable memory via the host's
+                                               ///< page tables
+} hipDeviceProp_tR0000;
+
+
+#ifdef __cplusplus
+extern "C" {  // give the declarations below C linkage when compiled as C++
+#endif
+
+hipError_t hipGetDevicePropertiesR0000(hipDeviceProp_tR0000* prop, int device);  // older R0000 API
+hipError_t hipChooseDeviceR0000(int* device, const hipDeviceProp_tR0000* prop);  // older R0000 API
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/hip_runtime.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/hip_runtime.h
new file mode 100644
index 0000000000000000000000000000000000000000..7834d0e0d9d081ec7a7327e4b61fd6a84c03e969
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/hip_runtime.h
@@ -0,0 +1,70 @@
+/*
+Copyright (c) 2015 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+//! HIP = Heterogeneous-compute Interface for Portability
+//!
+//! Define an extremely thin runtime layer that allows source code to be compiled unmodified
+//! through either AMD CLANG or NVCC.   Key features tend to be in the spirit
+//! and terminology of CUDA, but with a portable path to other accelerators as well:
+//
+//! Both paths support rich C++ features including classes, templates, lambdas, etc.
+//! Runtime API is C
+//! Memory management is based on pure pointers and resembles malloc/free/copy.
+//
+//! hip_runtime.h     : includes everything in hip_api.h, plus math builtins and kernel launch
+//!                     macros.
+//! hip_runtime_api.h : Defines HIP API.  This is a C header file and does not use any C++ features.
+
+#ifndef HIP_INCLUDE_HIP_HIP_RUNTIME_H
+#define HIP_INCLUDE_HIP_HIP_RUNTIME_H
+
+#if !defined(__HIPCC_RTC__)
+// Some standard header files, these are included by hc.hpp and so want to make them avail on both
+// paths to provide a consistent include env and avoid "missing symbol" errors that only appears
+// on NVCC path:
+#if __cplusplus
+#include <cstdint>
+#include <cstdlib>
+#else
+#include <stdint.h>
+#include <stdlib.h>
+#endif  // __cplusplus
+#endif  // !defined(__HIPCC_RTC__)
+
+#include <hip/hip_version.h>
+#include <hip/hip_common.h>
+
+#if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
+#include <hip/amd_detail/amd_hip_runtime.h>
+#elif !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
+#include <hip/nvidia_detail/nvidia_hip_runtime.h>
+#else
+#error ("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
+#endif
+
+#if !defined(__HIPCC_RTC__)
+#include <hip/hip_runtime_api.h>
+#include <hip/library_types.h>
+#endif  // !defined(__HIPCC_RTC__)
+#include <hip/hip_vector_types.h>
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/hip_runtime_api.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/hip_runtime_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..d91d9eeb5a03b15296b2af85b55354af660a5f72
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/hip_runtime_api.h
@@ -0,0 +1,10267 @@
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ *
+ * @file hip_runtime_api.h
+ *
+ * @brief Defines the API signatures for HIP runtime.
+ * This file can be compiled with a standard compiler.
+ */
+
+#ifndef HIP_INCLUDE_HIP_HIP_RUNTIME_API_H
+#define HIP_INCLUDE_HIP_HIP_RUNTIME_API_H
+
+#if __cplusplus
+#include <climits>
+#include <cstdint>
+#include <cstdlib>
+#include <utility>
+#else
+#include <limits.h>
+#include <stdint.h>
+#include <stdlib.h>
+#endif
+
+#include <hip/hip_version.h>
+#include <hip/hip_common.h>
+#include <hip/linker_types.h>
+
+enum {  // Anonymous enum of upper-case status constants; values count up from 0.
+  HIP_SUCCESS = 0,
+  HIP_ERROR_INVALID_VALUE,           // Implicitly 1.
+  HIP_ERROR_NOT_INITIALIZED,         // Implicitly 2.
+  HIP_ERROR_LAUNCH_OUT_OF_RESOURCES  // Implicitly 3; not equal to hipErrorLaunchOutOfResources (701).
+};
+// hack to get these to show up in Doxygen:
+/**
+ * @defgroup GlobalDefs Global enum and defines
+ * @{
+ *
+ */
+/**
+ * hipDeviceArch_t
+ *
+ */
+typedef struct {  // Every member below is a 1-bit unsigned bitfield used as a feature flag.
+  // 32-bit Atomics
+  unsigned hasGlobalInt32Atomics : 1;     ///< 32-bit integer atomics for global memory.
+  unsigned hasGlobalFloatAtomicExch : 1;  ///< 32-bit float atomic exch for global memory.
+  unsigned hasSharedInt32Atomics : 1;     ///< 32-bit integer atomics for shared memory.
+  unsigned hasSharedFloatAtomicExch : 1;  ///< 32-bit float atomic exch for shared memory.
+  unsigned hasFloatAtomicAdd : 1;         ///< 32-bit float atomic add in global and shared memory.
+
+  // 64-bit Atomics
+  unsigned hasGlobalInt64Atomics : 1;  ///< 64-bit integer atomics for global memory.
+  unsigned hasSharedInt64Atomics : 1;  ///< 64-bit integer atomics for shared memory.
+
+  // Doubles
+  unsigned hasDoubles : 1;  ///< Double-precision floating point.
+
+  // Warp cross-lane operations
+  unsigned hasWarpVote : 1;     ///< Warp vote instructions (__any, __all).
+  unsigned hasWarpBallot : 1;   ///< Warp ballot instructions (__ballot).
+  unsigned hasWarpShuffle : 1;  ///< Warp shuffle operations. (__shfl_*).
+  unsigned hasFunnelShift : 1;  ///< Funnel two words into one with shift&mask caps.
+
+  // Sync
+  unsigned hasThreadFenceSystem : 1;  ///< __threadfence_system.
+  unsigned hasSyncThreadsExt : 1;     ///< __syncthreads_count, __syncthreads_and, __syncthreads_or.
+
+  // Misc
+  unsigned hasSurfaceFuncs : 1;        ///< Surface functions.
+  unsigned has3dGrid : 1;              ///< Grid and group dims are 3D (rather than 2D).
+  unsigned hasDynamicParallelism : 1;  ///< Dynamic parallelism.
+} hipDeviceArch_t;
+
+typedef struct hipUUID_t {
+  char bytes[16];  ///< Raw identifier storage (16 bytes).
+} hipUUID;
+
+//---
+// Common headers for both NVCC and HIP-Clang paths:
+
+#define hipGetDeviceProperties hipGetDevicePropertiesR0600  // Alias to the R0600-suffixed name.
+#define hipDeviceProp_t hipDeviceProp_tR0600                // Alias to the R0600-suffixed name.
+#define hipChooseDevice hipChooseDeviceR0600                // Alias to the R0600-suffixed name.
+
+/**
+ * hipDeviceProp
+ *
+ */
+typedef struct hipDeviceProp_t {
+  char name[256];                   ///< Device name.
+  hipUUID uuid;                     ///< UUID of a device
+  char luid[8];                     ///< 8-byte unique identifier. Only valid on windows
+  unsigned int luidDeviceNodeMask;  ///< LUID node mask
+  size_t totalGlobalMem;            ///< Size of global memory region (in bytes).
+  size_t sharedMemPerBlock;         ///< Size of shared memory per block (in bytes).
+  int regsPerBlock;                 ///< Registers per block.
+  int warpSize;                     ///< Warp size.
+  size_t memPitch;                  ///< Maximum pitch in bytes allowed by memory copies
+                                    ///< pitched memory
+  int maxThreadsPerBlock;           ///< Max work items per work group or workgroup max size.
+  int maxThreadsDim[3];             ///< Max number of threads in each dimension (XYZ) of a block.
+  int maxGridSize[3];               ///< Max grid dimensions (XYZ).
+  int clockRate;                    ///< Max clock frequency of the multiProcessors in khz.
+  size_t totalConstMem;             ///< Size of shared constant memory region on the device
+                                    ///< (in bytes).
+  int major;  ///< Major compute capability version.  This indicates the core instruction set
+              ///< of the GPU architecture.  For example, a value of 11 would correspond to
+              ///< Navi III (RDNA3).  See the arch feature flags for portable ways to query
+              ///< feature caps.
+  int minor;  ///< Minor compute capability version.  This indicates a particular configuration,
+              ///< feature set, or variation within the group represented by the major compute
+              ///< capability version.  For example, different models within the same major version
+              ///< might have varying levels of support for certain features or optimizations.
+              ///< See the arch feature flags for portable ways to query feature caps.
+  size_t textureAlignment;       ///< Alignment requirement for textures
+  size_t texturePitchAlignment;  ///< Pitch alignment requirement for textures bound to pitched memory
+  int deviceOverlap;             ///< Deprecated. Use asyncEngineCount instead
+  int multiProcessorCount;       ///< Number of multi-processors. When the GPU works in Compute
+                                 ///< Unit (CU) mode, this value equals the number of CUs;
+                                 ///< when in Workgroup Processor (WGP) mode, this value equals
+                                 ///< half of CUs, because a single WGP contains two CUs.
+  int kernelExecTimeoutEnabled;  ///< Run time limit for kernels executed on the device
+  int integrated;                ///< APU vs dGPU
+  int canMapHostMemory;          ///< Check whether HIP can map host memory
+  int computeMode;               ///< Compute mode.
+  int maxTexture1D;              ///< Maximum number of elements in 1D images
+  int maxTexture1DMipmap;        ///< Maximum 1D mipmap texture size
+  int maxTexture1DLinear;        ///< Maximum size for 1D textures bound to linear memory
+  int maxTexture2D[2];  ///< Maximum dimensions (width, height) of 2D images, in image elements
+  int maxTexture2DMipmap[2];   ///< Maximum number of elements in 2D array mipmap of images
+  int maxTexture2DLinear[3];   ///< Maximum 2D tex dimensions if tex are bound to pitched memory
+  int maxTexture2DGather[2];   ///< Maximum 2D tex dimensions if gather has to be performed
+  int maxTexture3D[3];         ///< Maximum dimensions (width, height, depth) of 3D images, in image
+                               ///< elements
+  int maxTexture3DAlt[3];      ///< Maximum alternate 3D texture dims
+  int maxTextureCubemap;       ///< Maximum cubemap texture dims
+  int maxTexture1DLayered[2];  ///< Maximum number of elements in 1D array images
+  int maxTexture2DLayered[3];  ///< Maximum number of elements in 2D array images
+  int maxTextureCubemapLayered[2];  ///< Maximum cubemaps layered texture dims
+  int maxSurface1D;                 ///< Maximum 1D surface size
+  int maxSurface2D[2];              ///< Maximum 2D surface size
+  int maxSurface3D[3];              ///< Maximum 3D surface size
+  int maxSurface1DLayered[2];       ///< Maximum 1D layered surface size
+  int maxSurface2DLayered[3];       ///< Maximum 2D layered surface size
+  int maxSurfaceCubemap;            ///< Maximum cubemap surface size
+  int maxSurfaceCubemapLayered[2];  ///< Maximum cubemap layered surface size
+  size_t surfaceAlignment;          ///< Alignment requirement for surface
+  int concurrentKernels;            ///< Device can possibly execute multiple kernels concurrently.
+  int ECCEnabled;                   ///< Device has ECC support enabled
+  int pciBusID;                     ///< PCI Bus ID.
+  int pciDeviceID;                  ///< PCI Device ID
+  int pciDomainID;                  ///< PCI Domain ID
+  int tccDriver;                    ///< 1:If device is Tesla device using TCC driver, else 0
+  int asyncEngineCount;             ///< Number of async engines
+  int unifiedAddressing;            ///< Does device and host share unified address space
+  int memoryClockRate;              ///< Max global memory clock frequency in khz.
+  int memoryBusWidth;               ///< Global memory bus width in bits.
+  int l2CacheSize;                  ///< L2 cache size.
+  int persistingL2CacheMaxSize;     ///< Device's max L2 persisting lines in bytes
+  int maxThreadsPerMultiProcessor;  ///< Maximum resident threads per multi-processor.
+  int streamPrioritiesSupported;    ///< Device supports stream priority
+  int globalL1CacheSupported;       ///< Indicates globals are cached in L1
+  int localL1CacheSupported;        ///< Locals are cached in L1
+  size_t sharedMemPerMultiprocessor;  ///< Amount of shared memory available per multiprocessor.
+  int regsPerMultiprocessor;          ///< registers available per multiprocessor
+  int managedMemory;                  ///< Device supports allocating managed memory on this system
+  int isMultiGpuBoard;                ///< 1 if device is on a multi-GPU board, 0 if not.
+  int multiGpuBoardGroupID;  ///< Unique identifier for a group of devices on same multiboard GPU
+  int hostNativeAtomicSupported;         ///< Link between host and device supports native atomics
+  int singleToDoublePrecisionPerfRatio;  ///< Deprecated. CUDA only.
+  int pageableMemoryAccess;              ///< Device supports coherently accessing pageable memory
+                                         ///< without calling hipHostRegister on it
+  int concurrentManagedAccess;  ///< Device can coherently access managed memory concurrently with
+                                ///< the CPU
+  int computePreemptionSupported;         ///< Is compute preemption supported on the device
+  int canUseHostPointerForRegisteredMem;  ///< Device can access host registered memory with same
+                                          ///< address as the host
+  int cooperativeLaunch;                  ///< HIP device supports cooperative launch
+  int cooperativeMultiDeviceLaunch;       ///< HIP device supports cooperative launch on multiple
+                                          ///< devices
+  size_t sharedMemPerBlockOptin;  ///< Per device max shared mem per block usable by special opt in
+  int pageableMemoryAccessUsesHostPageTables;  ///< Device accesses pageable memory via the host's
+                                               ///< page tables
+  int directManagedMemAccessFromHost;  ///< Host can directly access managed memory on the device
+                                       ///< without migration
+  int maxBlocksPerMultiProcessor;      ///< Max number of blocks on CU
+  int accessPolicyMaxWindowSize;       ///< Max value of access policy window
+  size_t reservedSharedMemPerBlock;    ///< Shared memory reserved by driver per block
+  int hostRegisterSupported;           ///< Device supports hipHostRegister
+  int sparseHipArraySupported;         ///< Indicates if device supports sparse hip arrays
+  int hostRegisterReadOnlySupported;   ///< Device supports using the hipHostRegisterReadOnly flag
+                                       ///< with hipHostRegister
+  int timelineSemaphoreInteropSupported;  ///< Indicates external timeline semaphore support
+  int memoryPoolsSupported;    ///< Indicates if device supports hipMallocAsync and hipMemPool APIs
+  int gpuDirectRDMASupported;  ///< Indicates device support of RDMA APIs
+  unsigned int gpuDirectRDMAFlushWritesOptions;  ///< Bitmask to be interpreted according to
+                                                 ///< hipFlushGPUDirectRDMAWritesOptions
+  int gpuDirectRDMAWritesOrdering;               ///< value of hipGPUDirectRDMAWritesOrdering
+  unsigned int
+      memoryPoolSupportedHandleTypes;    ///< Bitmask of handle types support with mempool based IPC
+  int deferredMappingHipArraySupported;  ///< Device supports deferred mapping HIP arrays and HIP
+                                         ///< mipmapped arrays
+  int ipcEventSupported;                 ///< Device supports IPC events
+  int clusterLaunch;                     ///< Device supports cluster launch
+  int unifiedFunctionPointers;           ///< Indicates device supports unified function pointers
+  int reserved[63];                      ///< CUDA Reserved.
+
+  int hipReserved[32];  ///< Reserved for adding new entries for HIP/CUDA.
+
+  /* HIP Only struct members */
+  char gcnArchName[256];                    ///< AMD GCN Arch Name. HIP Only.
+  size_t maxSharedMemoryPerMultiProcessor;  ///< Maximum Shared Memory Per CU. HIP Only.
+  int clockInstructionRate;  ///< Frequency in khz of the timer used by the device-side "clock*"
+                             ///< instructions.  New for HIP.
+  hipDeviceArch_t arch;      ///< Architectural feature flags.  New for HIP.
+  unsigned int* hdpMemFlushCntl;                ///< Address of HDP_MEM_COHERENCY_FLUSH_CNTL register
+  unsigned int* hdpRegFlushCntl;                ///< Address of HDP_REG_COHERENCY_FLUSH_CNTL register
+  int cooperativeMultiDeviceUnmatchedFunc;      ///< HIP device supports cooperative launch on
+                                                ///< multiple
+                                                ///< devices with unmatched functions
+  int cooperativeMultiDeviceUnmatchedGridDim;   ///< HIP device supports cooperative launch on
+                                                ///< multiple
+                                                ///< devices with unmatched grid dimensions
+  int cooperativeMultiDeviceUnmatchedBlockDim;  ///< HIP device supports cooperative launch on
+                                                ///< multiple
+                                                ///< devices with unmatched block dimensions
+  int cooperativeMultiDeviceUnmatchedSharedMem;  ///< HIP device supports cooperative launch on
+                                                 ///< multiple
+                                                 ///< devices with unmatched shared memories
+  int isLargeBar;                                ///< 1: if it is a large PCI bar device, else 0
+  int asicRevision;                              ///< Revision of the GPU in this device
+} hipDeviceProp_t;
+
+/**
+ * hipMemoryType (for pointer attributes)
+ *
+ * @note hipMemoryType enum values are combination of cudaMemoryType and cuMemoryType and AMD
+ * specific enum values.
+ *
+ */
+typedef enum hipMemoryType {
+  hipMemoryTypeUnregistered = 0,  ///< Unregistered memory
+  hipMemoryTypeHost = 1,          ///< Memory is physically located on host
+  hipMemoryTypeDevice = 2,        ///< Memory is physically located on device. (see deviceId for
+                                  ///< specific device)
+  hipMemoryTypeManaged = 3,       ///< Managed memory, automatically managed by the unified
+                                  ///< memory system
+                                  ///< place holder for new values.
+  hipMemoryTypeArray = 10,        ///< Array memory, physically located on device. (see deviceId for
+                                  ///< specific device)
+  hipMemoryTypeUnified = 11       ///< unified address space
+
+} hipMemoryType;
+
+/**
+ * Pointer attributes
+ */
+typedef struct hipPointerAttribute_t {
+  enum hipMemoryType type;  ///< Memory type of the pointer (a hipMemoryType value).
+  int device;               ///< Device for the pointer (presumably a device ordinal; confirm).
+  void* devicePointer;      ///< Device-side pointer for the allocation (semantics: see HIP docs).
+  void* hostPointer;        ///< Host-side pointer for the allocation (semantics: see HIP docs).
+  int isManaged;            ///< Presumably nonzero for managed memory; confirm against HIP docs.
+  unsigned allocationFlags; /* flags specified when memory was allocated*/
+                            /* peers? */
+} hipPointerAttribute_t;
+
+// Ignoring error-code return values from hip APIs is discouraged. On C++17,
+// we can make that yield a warning
+#if __cplusplus >= 201703L
+#define __HIP_NODISCARD [[nodiscard]]
+#else  // Pre-C++17 or C: expands to nothing.
+#define __HIP_NODISCARD
+#endif  // __cplusplus >= 201703L
+
+/**
+ * HIP error type
+ *
+ */
+// Developer note - when updating these, update the hipErrorName and hipErrorString functions in
+// NVCC and HIP-Clang paths. Also update the hipCUDAErrorTohipError function in NVCC path.
+
+typedef enum __HIP_NODISCARD hipError_t {
+  hipSuccess = 0,            ///< Successful completion.
+  hipErrorInvalidValue = 1,  ///< One or more of the parameters passed to the API call is NULL
+                             ///< or not in an acceptable range.
+  hipErrorOutOfMemory = 2,   ///< out of memory range.
+  // Deprecated
+  hipErrorMemoryAllocation = 2,  ///< Memory allocation error.
+  hipErrorNotInitialized = 3,    ///< Not initialized
+  // Deprecated
+  hipErrorInitializationError = 3,
+  hipErrorDeinitialized = 4,  ///< Deinitialized
+  hipErrorProfilerDisabled = 5,
+  hipErrorProfilerNotInitialized = 6,
+  hipErrorProfilerAlreadyStarted = 7,
+  hipErrorProfilerAlreadyStopped = 8,
+  hipErrorInvalidConfiguration = 9,     ///< Invalid configuration
+  hipErrorInvalidPitchValue = 12,       ///< Invalid pitch value
+  hipErrorInvalidSymbol = 13,           ///< Invalid symbol
+  hipErrorInvalidDevicePointer = 17,    ///< Invalid Device Pointer
+  hipErrorInvalidMemcpyDirection = 21,  ///< Invalid memory copy direction
+  hipErrorInsufficientDriver = 35,
+  hipErrorMissingConfiguration = 52,
+  hipErrorPriorLaunchFailure = 53,
+  hipErrorInvalidDeviceFunction = 98,  ///< Invalid device function
+  hipErrorNoDevice = 100,              ///< Call to hipGetDeviceCount returned 0 devices
+  hipErrorInvalidDevice = 101,         ///< DeviceID must be in range from 0 to compute-devices.
+  hipErrorInvalidImage = 200,          ///< Invalid image
+  hipErrorInvalidContext = 201,        ///< Produced when input context is invalid.
+  hipErrorContextAlreadyCurrent = 202,
+  hipErrorMapFailed = 205,
+  // Deprecated
+  hipErrorMapBufferObjectFailed = 205,  ///< Produced when the IPC memory attach failed from ROCr.
+  hipErrorUnmapFailed = 206,
+  hipErrorArrayIsMapped = 207,
+  hipErrorAlreadyMapped = 208,
+  hipErrorNoBinaryForGpu = 209,
+  hipErrorAlreadyAcquired = 210,
+  hipErrorNotMapped = 211,
+  hipErrorNotMappedAsArray = 212,
+  hipErrorNotMappedAsPointer = 213,
+  hipErrorECCNotCorrectable = 214,
+  hipErrorUnsupportedLimit = 215,     ///< Unsupported limit
+  hipErrorContextAlreadyInUse = 216,  ///< The context is already in use
+  hipErrorPeerAccessUnsupported = 217,
+  hipErrorInvalidKernelFile = 218,  ///< In CUDA DRV, it is CUDA_ERROR_INVALID_PTX
+  hipErrorInvalidGraphicsContext = 219,
+  hipErrorInvalidSource = 300,  ///< Invalid source.
+  hipErrorFileNotFound = 301,   ///< the file is not found.
+  hipErrorSharedObjectSymbolNotFound = 302,
+  hipErrorSharedObjectInitFailed = 303,  ///< Failed to initialize shared object.
+  hipErrorOperatingSystem = 304,         ///< Not the correct operating system
+  hipErrorInvalidHandle = 400,           ///< Invalid handle
+  // Deprecated
+  hipErrorInvalidResourceHandle = 400,  ///< Resource handle (hipEvent_t or hipStream_t) invalid.
+  hipErrorIllegalState = 401,  ///< Resource required is not in a valid state to perform operation.
+  hipErrorNotFound = 500,      ///< Not found
+  hipErrorNotReady = 600,      ///< Indicates that asynchronous operations enqueued earlier are not
+                           ///< ready.  This is not actually an error, but is used to distinguish
+                           ///< from hipSuccess (which indicates completion).  APIs that return
+                           ///< this error include hipEventQuery and hipStreamQuery.
+  hipErrorIllegalAddress = 700,
+  hipErrorLaunchOutOfResources = 701,      ///< Out of resources error.
+  hipErrorLaunchTimeOut = 702,             ///< Timeout for the launch.
+  hipErrorPeerAccessAlreadyEnabled = 704,  ///< Peer access was already enabled from the current
+                                           ///< device.
+  hipErrorPeerAccessNotEnabled = 705,  ///< Peer access was never enabled from the current device.
+  hipErrorSetOnActiveProcess = 708,    ///< The process is active.
+  hipErrorContextIsDestroyed = 709,    ///< The context is already destroyed
+  hipErrorAssert = 710,                ///< Produced when the kernel calls assert.
+  hipErrorHostMemoryAlreadyRegistered = 712,  ///< Produced when trying to lock a page-locked
+                                              ///< memory.
+  hipErrorHostMemoryNotRegistered = 713,      ///< Produced when trying to unlock a non-page-locked
+                                              ///< memory.
+  hipErrorLaunchFailure = 719,  ///< An exception occurred on the device while executing a kernel.
+  hipErrorCooperativeLaunchTooLarge = 720,  ///< This error indicates that the number of blocks
+                                            ///< launched per grid for a kernel that was launched
+                                            ///< via cooperative launch APIs exceeds the maximum
+                                            ///< number of allowed blocks for the current device.
+  hipErrorNotSupported = 801,  ///< Produced when the hip API is not supported/implemented
+  hipErrorStreamCaptureUnsupported = 900,  ///< The operation is not permitted when the stream
+                                           ///< is capturing.
+  hipErrorStreamCaptureInvalidated = 901,  ///< The current capture sequence on the stream
+                                           ///< has been invalidated due to a previous error.
+  hipErrorStreamCaptureMerge = 902,        ///< The operation would have resulted in a merge of
+                                           ///< two independent capture sequences.
+  hipErrorStreamCaptureUnmatched = 903,    ///< The capture was not initiated in this stream.
+  hipErrorStreamCaptureUnjoined = 904,     ///< The capture sequence contains a fork that was not
+                                           ///< joined to the primary stream.
+  hipErrorStreamCaptureIsolation = 905,    ///< A dependency would have been created which crosses
+                                           ///< the capture sequence boundary. Only implicit
+                                           ///< in-stream ordering dependencies are allowed
+                                           ///< to cross the boundary
+  hipErrorStreamCaptureImplicit = 906,     ///< The operation would have resulted in a disallowed
+                                           ///< implicit dependency on a current capture sequence
+                                           ///< from hipStreamLegacy.
+  hipErrorCapturedEvent = 907,  ///< The operation is not permitted on an event which was last
+                                ///< recorded in a capturing stream.
+  hipErrorStreamCaptureWrongThread = 908,  ///< A stream capture sequence not initiated with
+                                           ///< the hipStreamCaptureModeRelaxed argument to
+                                           ///< hipStreamBeginCapture was passed to
+                                           ///< hipStreamEndCapture in a different thread.
+  hipErrorGraphExecUpdateFailure = 910,    ///< This error indicates that the graph update was
+                                           ///< not performed because it included changes which
+                                           ///< violated constraints specific to instantiated graph
+                                           ///< update.
+  hipErrorInvalidChannelDescriptor = 911,  ///< Invalid channel descriptor.
+  hipErrorInvalidTexture = 912,            ///< Invalid texture.
+  hipErrorUnknown = 999,                   ///< Unknown error.
+  // HSA Runtime Error Codes start here.
+  hipErrorRuntimeMemory = 1052,  ///< HSA runtime memory call returned error.  Typically not seen
+                                 ///< in production systems.
+  hipErrorRuntimeOther = 1053,   ///< HSA runtime call other than memory returned error.  Typically
+                                 ///< not seen in production systems.
+  hipErrorTbd                    ///< Marker that more error codes are needed.
+} hipError_t;
+
+#undef __HIP_NODISCARD
+
+/**
+ * hipDeviceAttribute_t
+ * hipDeviceAttributeUnused number: 5
+ */
+typedef enum hipDeviceAttribute_t {
+  hipDeviceAttributeCudaCompatibleBegin = 0,
+
+  hipDeviceAttributeEccEnabled =
+      hipDeviceAttributeCudaCompatibleBegin,    ///< Whether ECC support is enabled.
+  hipDeviceAttributeAccessPolicyMaxWindowSize,  ///< Cuda only. The maximum size of the window
+                                                ///< policy in bytes.
+  hipDeviceAttributeAsyncEngineCount,           ///< Asynchronous engines number.
+  hipDeviceAttributeCanMapHostMemory,  ///< Whether host memory can be mapped into device address
+                                       ///< space
+  hipDeviceAttributeCanUseHostPointerForRegisteredMem,  ///< Device can access host registered
+                                                        ///< memory at the same virtual address as
+                                                        ///< the CPU
+  hipDeviceAttributeClockRate,                          ///< Peak clock frequency in kilohertz.
+  hipDeviceAttributeComputeMode,                   ///< Compute mode that device is currently in.
+  hipDeviceAttributeComputePreemptionSupported,    ///< Device supports Compute Preemption.
+  hipDeviceAttributeConcurrentKernels,             ///< Device can possibly execute multiple kernels
+                                                   ///< concurrently.
+  hipDeviceAttributeConcurrentManagedAccess,       ///< Device can coherently access managed memory
+                                                   ///< concurrently with the CPU
+  hipDeviceAttributeCooperativeLaunch,             ///< Support cooperative launch
+  hipDeviceAttributeCooperativeMultiDeviceLaunch,  ///< Support cooperative launch on multiple
+                                                   ///< devices
+  hipDeviceAttributeDeviceOverlap,  ///< Device can concurrently copy memory and execute a kernel.
+                                    ///< Deprecated. Use instead asyncEngineCount.
+  hipDeviceAttributeDirectManagedMemAccessFromHost,  ///< Host can directly access managed memory on
+                                                     ///< the device without migration
+  hipDeviceAttributeGlobalL1CacheSupported,          ///< Device supports caching globals in L1
+  hipDeviceAttributeHostNativeAtomicSupported,  ///< Link between the device and the host supports
+                                                ///< native atomic operations
+  hipDeviceAttributeIntegrated,                 ///< Device is integrated GPU
+  hipDeviceAttributeIsMultiGpuBoard,            ///< Multiple GPU devices.
+  hipDeviceAttributeKernelExecTimeout,  ///< Run time limit for kernels executed on the device
+  hipDeviceAttributeL2CacheSize,  ///< Size of L2 cache in bytes. 0 if the device doesn't have L2
+                                  ///< cache.
+  hipDeviceAttributeLocalL1CacheSupported,  ///< caching locals in L1 is supported
+  hipDeviceAttributeLuid,  ///< 8-byte locally unique identifier in 8 bytes. Undefined on TCC and
+                           ///< non-Windows platforms
+  hipDeviceAttributeLuidDeviceNodeMask,      ///< Luid device node mask. Undefined on TCC and
+                                             ///< non-Windows platforms
+  hipDeviceAttributeComputeCapabilityMajor,  ///< Major compute capability version number.
+  hipDeviceAttributeManagedMemory,  ///< Device supports allocating managed memory on this system
+  hipDeviceAttributeMaxBlocksPerMultiProcessor,  ///< Max block size per multiprocessor
+  hipDeviceAttributeMaxBlockDimX,                ///< Max block size in width.
+  hipDeviceAttributeMaxBlockDimY,                ///< Max block size in height.
+  hipDeviceAttributeMaxBlockDimZ,                ///< Max block size in depth.
+  hipDeviceAttributeMaxGridDimX,                 ///< Max grid size  in width.
+  hipDeviceAttributeMaxGridDimY,                 ///< Max grid size  in height.
+  hipDeviceAttributeMaxGridDimZ,                 ///< Max grid size  in depth.
+  hipDeviceAttributeMaxSurface1D,                ///< Maximum size of 1D surface.
+  hipDeviceAttributeMaxSurface1DLayered,  ///< Cuda only. Maximum dimensions of 1D layered surface.
+  hipDeviceAttributeMaxSurface2D,         ///< Maximum dimension (width, height) of 2D surface.
+  hipDeviceAttributeMaxSurface2DLayered,  ///< Cuda only. Maximum dimensions of 2D layered surface.
+  hipDeviceAttributeMaxSurface3D,       ///< Maximum dimension (width, height, depth) of 3D surface.
+  hipDeviceAttributeMaxSurfaceCubemap,  ///< Cuda only. Maximum dimensions of Cubemap surface.
+  hipDeviceAttributeMaxSurfaceCubemapLayered,  ///< Cuda only. Maximum dimension of Cubemap layered
+                                               ///< surface.
+  hipDeviceAttributeMaxTexture1DWidth,         ///< Maximum size of 1D texture.
+  hipDeviceAttributeMaxTexture1DLayered,       ///< Maximum dimensions of 1D layered texture.
+  hipDeviceAttributeMaxTexture1DLinear,   ///< Maximum number of elements allocatable in a 1D linear
+                                          ///< texture. Use cudaDeviceGetTexture1DLinearMaxWidth()
+                                          ///< instead on Cuda.
+  hipDeviceAttributeMaxTexture1DMipmap,   ///< Maximum size of 1D mipmapped texture.
+  hipDeviceAttributeMaxTexture2DWidth,    ///< Maximum dimension width of 2D texture.
+  hipDeviceAttributeMaxTexture2DHeight,   ///< Maximum dimension hight of 2D texture.
+  hipDeviceAttributeMaxTexture2DGather,   ///< Maximum dimensions of 2D texture if gather operations
+                                          ///< performed.
+  hipDeviceAttributeMaxTexture2DLayered,  ///< Maximum dimensions of 2D layered texture.
+  hipDeviceAttributeMaxTexture2DLinear,   ///< Maximum dimensions (width, height, pitch) of 2D
+                                          ///< textures bound to pitched memory.
+  hipDeviceAttributeMaxTexture2DMipmap,   ///< Maximum dimensions of 2D mipmapped texture.
+  hipDeviceAttributeMaxTexture3DWidth,    ///< Maximum dimension width of 3D texture.
+  hipDeviceAttributeMaxTexture3DHeight,   ///< Maximum dimension height of 3D texture.
+  hipDeviceAttributeMaxTexture3DDepth,    ///< Maximum dimension depth of 3D texture.
+  hipDeviceAttributeMaxTexture3DAlt,      ///< Maximum dimensions of alternate 3D texture.
+  hipDeviceAttributeMaxTextureCubemap,    ///< Maximum dimensions of Cubemap texture
+  hipDeviceAttributeMaxTextureCubemapLayered,  ///< Maximum dimensions of Cubemap layered texture.
+  hipDeviceAttributeMaxThreadsDim,             ///< Maximum dimension of a block
+  hipDeviceAttributeMaxThreadsPerBlock,        ///< Maximum number of threads per block.
+  hipDeviceAttributeMaxThreadsPerMultiProcessor,  ///< Maximum resident threads per multiprocessor.
+  hipDeviceAttributeMaxPitch,                ///< Maximum pitch in bytes allowed by memory copies
+  hipDeviceAttributeMemoryBusWidth,          ///< Global memory bus width in bits.
+  hipDeviceAttributeMemoryClockRate,         ///< Peak memory clock frequency in kilohertz.
+  hipDeviceAttributeComputeCapabilityMinor,  ///< Minor compute capability version number.
+  hipDeviceAttributeMultiGpuBoardGroupID,    ///< Unique ID of device group on the same multi-GPU
+                                             ///< board
+  hipDeviceAttributeMultiprocessorCount,     ///< Number of multi-processors. When the GPU works in Compute
+                                             ///< Unit (CU) mode, this value equals the number of CUs;
+                                             ///< when in Workgroup Processor (WGP) mode, this value equals
+                                             ///< half of CUs, because a single WGP contains two CUs.
+  hipDeviceAttributeUnused1,                 ///< Previously hipDeviceAttributeName
+  hipDeviceAttributePageableMemoryAccess,  ///< Device supports coherently accessing pageable memory
+                                           ///< without calling hipHostRegister on it
+  hipDeviceAttributePageableMemoryAccessUsesHostPageTables,  ///< Device accesses pageable memory
+                                                             ///< via the host's page tables
+  hipDeviceAttributePciBusId,                                ///< PCI Bus ID.
+  hipDeviceAttributePciDeviceId,  ///< PCI Device ID. Returns pcie slot id
+  hipDeviceAttributePciDomainId,  ///< PCI Domain Id.
+  hipDeviceAttributePciDomainID =
+      hipDeviceAttributePciDomainId,           ///< PCI Domain ID, for backward compatibility.
+  hipDeviceAttributePersistingL2CacheMaxSize,  ///< Maximum l2 persisting lines capacity in bytes
+  hipDeviceAttributeMaxRegistersPerBlock,  ///< 32-bit registers available to a thread block. This
+                                           ///< number is shared by all thread blocks simultaneously
+                                           ///< resident on a multiprocessor.
+  hipDeviceAttributeMaxRegistersPerMultiprocessor,  ///< 32-bit registers available per block.
+  hipDeviceAttributeReservedSharedMemPerBlock,      ///< Shared memory reserved by CUDA driver per
+                                                    ///< block.
+  hipDeviceAttributeMaxSharedMemoryPerBlock,  ///< Maximum shared memory available per block in
+                                              ///< bytes.
+  hipDeviceAttributeSharedMemPerBlockOptin,   ///< Maximum shared memory per block usable by special
+                                              ///< opt in.
+  hipDeviceAttributeSharedMemPerMultiprocessor,  ///< Shared memory available per multiprocessor.
+  hipDeviceAttributeSingleToDoublePrecisionPerfRatio,  ///< Cuda only. Performance ratio of single
+                                                       ///< precision to double precision.
+  hipDeviceAttributeStreamPrioritiesSupported,         ///< Whether to support stream priorities.
+  hipDeviceAttributeSurfaceAlignment,                  ///< Alignment requirement for surfaces
+  hipDeviceAttributeTccDriver,  ///< Cuda only. Whether device is a Tesla device using TCC driver
+  hipDeviceAttributeTextureAlignment,       ///< Alignment requirement for textures
+  hipDeviceAttributeTexturePitchAlignment,  ///< Pitch alignment requirement for 2D texture
+                                            ///< references bound to pitched memory;
+  hipDeviceAttributeTotalConstantMemory,    ///< Constant memory size in bytes.
+  hipDeviceAttributeTotalGlobalMem,         ///< Global memory available on device.
+  hipDeviceAttributeUnifiedAddressing,      ///< Cuda only. A unified address space shared with the
+                                            ///< host.
+  hipDeviceAttributeUnused2,                ///< Previously hipDeviceAttributeUuid
+  hipDeviceAttributeWarpSize,               ///< Warp size in threads.
+  hipDeviceAttributeMemoryPoolsSupported,   ///< Device supports HIP Stream Ordered Memory Allocator
+  hipDeviceAttributeVirtualMemoryManagementSupported,  ///< Device supports HIP virtual memory
+                                                       ///< management
+  hipDeviceAttributeHostRegisterSupported,  ///< Can device support host memory registration via
+                                            ///< hipHostRegister
+  hipDeviceAttributeMemoryPoolSupportedHandleTypes,  ///< Supported handle mask for HIP Stream
+                                                     ///< Ordered Memory Allocator
+
+  hipDeviceAttributeCudaCompatibleEnd = 9999,
+  hipDeviceAttributeAmdSpecificBegin = 10000,
+
+  hipDeviceAttributeClockInstructionRate =
+      hipDeviceAttributeAmdSpecificBegin,  ///< Frequency in khz of the timer used by the
+                                           ///< device-side "clock*"
+  hipDeviceAttributeUnused3,               ///< Previously hipDeviceAttributeArch
+  hipDeviceAttributeMaxSharedMemoryPerMultiprocessor,  ///< Maximum Shared Memory PerMultiprocessor.
+  hipDeviceAttributeUnused4,                           ///< Previously hipDeviceAttributeGcnArch
+  hipDeviceAttributeUnused5,                           ///< Previously hipDeviceAttributeGcnArchName
+  hipDeviceAttributeHdpMemFlushCntl,  ///< Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register
+  hipDeviceAttributeHdpRegFlushCntl,  ///< Address of the HDP_REG_COHERENCY_FLUSH_CNTL register
+  hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc,       ///< Supports cooperative launch on
+                                                               ///< multiple devices with unmatched
+                                                               ///< functions
+  hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim,    ///< Supports cooperative launch on
+                                                               ///< multiple devices with unmatched
+                                                               ///< grid dimensions
+  hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim,   ///< Supports cooperative launch on
+                                                               ///< multiple devices with unmatched
+                                                               ///< block dimensions
+  hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem,  ///< Supports cooperative launch on
+                                                               ///< multiple devices with unmatched
+                                                               ///< shared memories
+  hipDeviceAttributeIsLargeBar,                                ///< Whether it is LargeBar
+  hipDeviceAttributeAsicRevision,           ///< Revision of the GPU in this device
+  hipDeviceAttributeCanUseStreamWaitValue,  ///< '1' if Device supports hipStreamWaitValue32() and
+                                            ///< hipStreamWaitValue64(), '0' otherwise.
+  hipDeviceAttributeImageSupport,           ///< '1' if Device supports image, '0' otherwise.
+  hipDeviceAttributePhysicalMultiProcessorCount,  ///< All available physical compute
+                                                  ///< units for the device
+  hipDeviceAttributeFineGrainSupport,  ///< '1' if Device supports fine grain, '0' otherwise
+  hipDeviceAttributeWallClockRate,     ///< Constant frequency of wall clock in kilohertz.
+  hipDeviceAttributeNumberOfXccs,      ///< The number of XCC(s) on the device
+  hipDeviceAttributeMaxAvailableVgprsPerThread,  ///< Max number of available (directly or
+                                                 ///< indirectly addressable) VGPRs per thread in
+                                                 ///< DWORDs.
+  hipDeviceAttributePciChipId,                   ///< GPU Manufacturer device id
+
+  hipDeviceAttributeAmdSpecificEnd = 19999,
+  hipDeviceAttributeVendorSpecificBegin = 20000,
+  // Extended attributes for vendors
+} hipDeviceAttribute_t;
+
+typedef enum hipDriverProcAddressQueryResult {  // Status codes of a proc-address query
+  HIP_GET_PROC_ADDRESS_SUCCESS = 0,                ///< Presumably: symbol resolved (TODO confirm)
+  HIP_GET_PROC_ADDRESS_SYMBOL_NOT_FOUND = 1,       ///< Presumably: no such symbol (TODO confirm)
+  HIP_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT = 2  ///< Presumably: version too low (TODO confirm)
+} hipDriverProcAddressQueryResult;
+
+enum hipComputeMode {  // NOTE(review): presumably mirrors the CUDA compute modes -- confirm
+  hipComputeModeDefault = 0,          ///< Default mode (zero value)
+  hipComputeModeExclusive = 1,        ///< Exclusive mode (exact scope: confirm in HIP docs)
+  hipComputeModeProhibited = 2,       ///< Prohibited mode (exact scope: confirm in HIP docs)
+  hipComputeModeExclusiveProcess = 3  ///< Exclusive-process mode (confirm in HIP docs)
+};
+
+enum hipFlushGPUDirectRDMAWritesOptions {  // Bit flags: each option occupies its own bit
+  hipFlushGPUDirectRDMAWritesOptionHost = 1 << 0,   ///< Bit 0 (value 1)
+  hipFlushGPUDirectRDMAWritesOptionMemOps = 1 << 1  ///< Bit 1 (value 2)
+};
+
+enum hipGPUDirectRDMAWritesOrdering {  // Values are spaced (0/100/200); they are not bit flags
+  hipGPUDirectRDMAWritesOrderingNone = 0,          ///< Zero value
+  hipGPUDirectRDMAWritesOrderingOwner = 100,       ///< Scope implied by name: owner (confirm)
+  hipGPUDirectRDMAWritesOrderingAllDevices = 200   ///< Scope implied by name: all devices (confirm)
+};
+
+#if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
+
+#ifndef GENERIC_GRID_LAUNCH  // Defaults to 1 unless something defined it before this point
+#define GENERIC_GRID_LAUNCH 1
+#endif
+#include <hip/amd_detail/host_defines.h>
+#include <hip/driver_types.h>
+#include <hip/texture_types.h>
+#include <hip/surface_types.h>
+#if defined(_MSC_VER)  // HIP_DEPRECATED(msg): compiler-specific spelling of "deprecated"
+#define HIP_DEPRECATED(msg) __declspec(deprecated(msg))
+#else  // !defined(_MSC_VER)
+#define HIP_DEPRECATED(msg) __attribute__((deprecated(msg)))
+#endif  // !defined(_MSC_VER)
+#define HIP_DEPRECATED_MSG                                                                         \
+  "This API is marked as deprecated and might not be supported in future releases. For more "      \
+  "details please refer "                                                                          \
+  "https://github.com/ROCm/HIP/blob/develop/docs/reference/deprecated_api_list.md"
+#define HIP_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)  // Tag values 0x01..0x03 cast to void*
+#define HIP_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02)
+#define HIP_LAUNCH_PARAM_END ((void*)0x03)
+#ifdef __cplusplus
+#define __dparm(x) = x  // C++: expands to a default-argument initializer "= x"
+#else
+#define __dparm(x)  // Non-C++: expands to nothing, so declarations carry no default
+#endif
+#ifdef __GNUC__  // When __GNUC__ is defined, push default symbol visibility for what follows
+#pragma GCC visibility push(default)
+#endif
+#ifdef __cplusplus
+namespace hip_impl {
+hipError_t hip_init();  // Declared for C++ translation units only; defined elsewhere
+}  // namespace hip_impl
+#endif
+// Structure definitions:
+#ifdef __cplusplus
+extern "C" {  // C linkage for the declarations below when compiled as C++
+#endif
+//---
+// API-visible structures
+typedef struct ihipCtx_t* hipCtx_t;  // Pointer to ihipCtx_t, which is only forward-declared here
+// Note many APIs also use integer deviceIds as an alternative to the device pointer:
+typedef int hipDevice_t;
+typedef enum hipDeviceP2PAttr {  // Peer-to-peer attribute selectors; values are sequential from 0
+  hipDevP2PAttrPerformanceRank = 0,
+  hipDevP2PAttrAccessSupported,         ///< Implicit value 1
+  hipDevP2PAttrNativeAtomicSupported,   ///< Implicit value 2
+  hipDevP2PAttrHipArrayAccessSupported  ///< Implicit value 3
+} hipDeviceP2PAttr;
+typedef enum hipDriverEntryPointQueryResult {  // Status codes of a driver entry point query
+  hipDriverEntryPointSuccess = 0,
+  hipDriverEntryPointSymbolNotFound = 1,
+  hipDriverEntryPointVersionNotSufficent = 2  // NOTE: "Sufficent" misspelling is part of the API name
+} hipDriverEntryPointQueryResult;
+typedef struct ihipStream_t* hipStream_t;  // Pointer to ihipStream_t, only forward-declared here
+#define hipIpcMemLazyEnablePeerAccess 0x01
+#define HIP_IPC_HANDLE_SIZE 64  // Byte size of the `reserved` payload in both IPC handle structs
+typedef struct hipIpcMemHandle_st {
+  char reserved[HIP_IPC_HANDLE_SIZE];  // Only member; no named fields are exposed
+} hipIpcMemHandle_t;
+typedef struct hipIpcEventHandle_st {
+  char reserved[HIP_IPC_HANDLE_SIZE];  // Only member; no named fields are exposed
+} hipIpcEventHandle_t;
+typedef struct ihipModule_t* hipModule_t;
+typedef struct ihipModuleSymbol_t* hipFunction_t;  // Note: points to ihipModuleSymbol_t
+typedef struct ihipLinkState_t* hipLinkState_t;
+typedef struct ihipLibrary_t* hipLibrary_t;
+typedef struct ihipKernel_t* hipKernel_t;
+/**
+ * HIP memory pool handle (pointer to the forward-declared ihipMemPoolHandle_t)
+ */
+typedef struct ihipMemPoolHandle_t* hipMemPool_t;
+
+typedef struct hipFuncAttributes {  // Function attribute record; field meanings: confirm in HIP docs
+  int binaryVersion;
+  int cacheModeCA;
+  size_t constSizeBytes;          // Size in bytes (per name), stored as size_t
+  size_t localSizeBytes;          // Size in bytes (per name), stored as size_t
+  int maxDynamicSharedSizeBytes;  // Size in bytes (per name); note int, unlike the size_t fields
+  int maxThreadsPerBlock;
+  int numRegs;
+  int preferredShmemCarveout;
+  int ptxVersion;
+  size_t sharedSizeBytes;         // Size in bytes (per name), stored as size_t
+} hipFuncAttributes;
+typedef struct ihipEvent_t* hipEvent_t;  // Pointer to ihipEvent_t, only forward-declared here
+
+/**
+ * hipLimit
+ *
+ * @note In HIP device limit-related APIs, any input limit value other than those defined in the
+ * enum is treated as "UnsupportedLimit" by default.
+ */
+enum hipLimit_t {
+  hipLimitStackSize = 0x0,         ///< Limit of stack size in bytes on the current device, per
+                                   ///< thread. The size is in units of 256 dwords, up to the
+                                   ///< limit of (128K - 16)
+  hipLimitPrintfFifoSize = 0x01,   ///< Size limit in bytes of fifo used by printf call on the
+                                   ///< device. Currently not supported
+  hipLimitMallocHeapSize = 0x02,   ///< Limit of heap size in bytes on the current device, should
+                                   ///< be less than the global memory size on the device
+  hipExtLimitScratchMin = 0x1000,  ///< Minimum allowed value in bytes for scratch limit on this
+                                   ///< device. Valid only on Rocm device. This is read only.
+  hipExtLimitScratchMax = 0x1001,  ///< Maximum allowed value in bytes for scratch limit on this
+                                   ///< device. Valid only on Rocm device. This is read only.
+  hipExtLimitScratchCurrent = 0x1002,  ///< Current scratch limit threshold in bytes on this
+                                       ///< device. Must be between hipExtLimitScratchMin and
+                                       ///< hipExtLimitScratchMax values. Valid only on Rocm
+                                       ///< device. This can be modified.
+  hipLimitRange                        ///< Supported limit range (implicit value 0x1003)
+};
+
+/**
+ * Flags that can be used with hipStreamCreateWithFlags.
+ */
+// Flags that can be used with hipStreamCreateWithFlags.
+/** Default stream creation flags. These are used with hipStreamCreate().*/
+#define hipStreamDefault 0x00
+
+/** Stream does not implicitly synchronize with the null stream.*/
+#define hipStreamNonBlocking 0x01
+
+// Flags that can be used with hipEventCreateWithFlags.
+/** Default flags.*/
+#define hipEventDefault 0x0
+
+/** Waiting will yield CPU. Power-friendly and usage-friendly but may increase latency.*/
+#define hipEventBlockingSync 0x1
+
+/** Disable event's capability to record timing information. May improve performance.*/
+#define hipEventDisableTiming 0x2
+
+/** Event can support IPC. hipEventDisableTiming also must be set.*/
+#define hipEventInterprocess 0x4
+
+// Flags that can be used with hipEventRecordWithFlags.
+/** Default flag. */
+#define hipEventRecordDefault 0x00
+
+/** Event is captured in the graph as an external event node when performing stream capture. */
+#define hipEventRecordExternal 0x01
+
+// Flags that can be used with hipStreamWaitEvent.
+/** Default flag. */
+#define hipEventWaitDefault 0x00
+
+/** Wait is captured in the graph as an external event node when performing stream capture. */
+#define hipEventWaitExternal 0x01
+
+/** Disable performing a system scope sequentially consistent memory fence when the event
+ * transitions from recording to recorded.  This can be used for events that are only being
+ * used to measure timing, and do not require the event inspection operations
+ * (see ::hipEventSynchronize, ::hipEventQuery, and ::hipEventElapsedTime) to synchronize-with
+ * the work on which the recorded event (see ::hipEventRecord) is waiting.
+ * On some AMD GPU devices this can improve the accuracy of timing measurements by avoiding the
+ * cost of cache writeback and invalidation, and the performance impact of those actions on the
+ * execution of following work. */
+#define hipEventDisableSystemFence 0x20000000
+
+/** Use a device-scope release when recording this event. This flag is useful to obtain more
+ * precise timings of commands between events.  The flag is a no-op on CUDA platforms.*/
+#define hipEventReleaseToDevice 0x40000000
+
+/** Use a system-scope release when recording this event. This flag is useful to make
+ * non-coherent host memory visible to the host. The flag is a no-op on CUDA platforms.*/
+#define hipEventReleaseToSystem 0x80000000
+
+// Flags that can be used with hipGetDriverEntryPoint.
+/** Default flag. Equivalent to hipEnablePerThreadDefaultStream if compiled with the
+ * -fgpu-default-stream=per-thread flag or if the HIP_API_PER_THREAD_DEFAULT_STREAM macro is
+ * defined.*/
+#define hipEnableDefault 0x0
+
+/** Search for all symbols except the corresponding per-thread versions.*/
+#define hipEnableLegacyStream 0x1
+
+/** Search for all symbols including the per-thread versions. If a per-thread version cannot be
+ * found, returns the legacy version.*/
+#define hipEnablePerThreadDefaultStream 0x2
+
+// Flags that can be used with hipHostMalloc/hipHostAlloc.
+/** Default pinned memory allocation on the host.*/
+#define hipHostAllocDefault 0x0
+
+/** Default pinned memory allocation on the host.
+ * @note This is the same definition as #hipHostAllocDefault.*/
+#define hipHostMallocDefault 0x0
+
+/** Memory is considered allocated by all contexts.*/
+#define hipHostAllocPortable 0x1
+
+/** Memory is considered allocated by all contexts.
+ * @note This is the same definition as #hipHostAllocPortable.*/
+#define hipHostMallocPortable 0x1
+
+/** Map the allocation into the address space for the current device. The device pointer
+ * can be obtained with #hipHostGetDevicePointer.*/
+#define hipHostAllocMapped 0x2
+
+/** Map the allocation into the address space for the current device. The device pointer
+ * can be obtained with #hipHostGetDevicePointer.
+ * @note This is the same definition as #hipHostAllocMapped.*/
+#define hipHostMallocMapped 0x2
+
+/** Allocates the memory as write-combined. On some system configurations, write-combined allocation
+ * may be transferred faster across the PCI Express bus, however, could have low read efficiency by
+ * most CPUs. It's a good option for data transfer from host to device via mapped pinned memory.
+ * @note  This flag is only for CUDA source compatibility but not functional within HIP runtime,
+ * because the allocation path is currently not supported on the AMD platform.*/
+#define hipHostAllocWriteCombined 0x4
+
+/** Allocates the memory as write-combined. On some system configurations, write-combined allocation
+ * may be transferred faster across the PCI Express bus, however, could have low read efficiency by
+ * most CPUs. It's a good option for data transfer from host to device via mapped pinned memory.
+ * @note  This flag is the same definition as #hipHostAllocWriteCombined which is equivalent to
+ * cudaHostAllocWriteCombined. It is only for CUDA source compatibility but not functional within
+ * HIP runtime, because the allocation path is currently not supported on the AMD platform.*/
+#define hipHostMallocWriteCombined 0x4
+
+/**
+ * Host memory will be forcibly allocated on extended fine grained system memory
+ * pool which is with MTYPE_UC.
+ * @note  This allocation flag is applicable on AMD devices, except for Navi4X, in Linux only.
+ */
+#define hipHostMallocUncached 0x10000000
+#define hipHostAllocUncached hipHostMallocUncached
+
+/**
+ * Host memory allocation will follow numa policy set by user.
+ * @note  This numa allocation flag is applicable on Linux, under development on Windows.
+ */
+#define hipHostMallocNumaUser 0x20000000
+
+/** Allocate coherent memory. Overrides HIP_HOST_COHERENT for specific allocation.*/
+#define hipHostMallocCoherent 0x40000000
+
+/** Allocate non-coherent memory. Overrides HIP_HOST_COHERENT for specific allocation.*/
+#define hipHostMallocNonCoherent 0x80000000
+
+/** Memory can be accessed by any stream on any device.*/
+#define hipMemAttachGlobal 0x01
+
+/** Memory cannot be accessed by any stream on any device.*/
+#define hipMemAttachHost 0x02
+
+/** Memory can only be accessed by a single stream on the associated device.*/
+#define hipMemAttachSingle 0x04
+
+#define hipDeviceMallocDefault 0x0
+
+/** Memory is allocated in fine grained region of device.*/
+#define hipDeviceMallocFinegrained 0x1
+
+/** Memory represents a HSA signal.*/
+#define hipMallocSignalMemory 0x2
+
+/** Memory allocated will be uncached. NOTE(review): 0x3 == 0x1 | 0x2, so not a distinct bit. */
+#define hipDeviceMallocUncached 0x3
+
+/** Memory allocated will be contiguous. */
+#define hipDeviceMallocContiguous 0x4
+
+// Flags that can be used with hipHostRegister.
+/** Memory is Mapped and Portable.*/
+#define hipHostRegisterDefault 0x0
+
+/** Memory is considered registered by all contexts.*/
+#define hipHostRegisterPortable 0x1
+
+/** Map the allocation into the address space for the current device. The device pointer
+ * can be obtained with #hipHostGetDevicePointer.*/
+#define hipHostRegisterMapped 0x2
+
+/** Not supported.*/
+#define hipHostRegisterIoMemory 0x4
+
+/** This flag is ignored on AMD devices.*/
+#define hipHostRegisterReadOnly 0x08
+
+/** Coarse Grained host memory lock. NOTE(review): same value (0x8) as hipHostRegisterReadOnly.*/
+#define hipExtHostRegisterCoarseGrained 0x8
+
+/** Map host memory onto extended fine grained access host memory pool when enabled.
+ * It is applicable on AMD devices, except for Navi4X, in Linux only.
+ */
+#define hipExtHostRegisterUncached 0x80000000
+
+/** Automatically select between Spin and Yield.*/
+#define hipDeviceScheduleAuto 0x0
+
+/** Dedicate a CPU core to spin-wait. Provides lowest latency, but burns a CPU core and may
+ * consume more power.*/
+#define hipDeviceScheduleSpin 0x1
+
+/** Yield the CPU to the operating system when waiting. May increase latency, but lowers power
+ * and is friendlier to other threads in the system.*/
+#define hipDeviceScheduleYield 0x2
+#define hipDeviceScheduleBlockingSync 0x4
+#define hipDeviceScheduleMask 0x7  // 0x1 | 0x2 | 0x4: the Spin, Yield and BlockingSync bits
+#define hipDeviceMapHost 0x8
+#define hipDeviceLmemResizeToMax 0x10
+/** Default HIP array allocation flag.*/
+#define hipArrayDefault 0x00
+#define hipArrayLayered 0x01
+#define hipArraySurfaceLoadStore 0x02
+#define hipArrayCubemap 0x04
+#define hipArrayTextureGather 0x08
+#define hipOccupancyDefault 0x00
+#define hipOccupancyDisableCachingOverride 0x01
+#define hipCooperativeLaunchMultiDeviceNoPreSync 0x01
+#define hipCooperativeLaunchMultiDeviceNoPostSync 0x02
+#define hipCpuDeviceId ((int)-1)
+#define hipInvalidDeviceId ((int)-2)
+// Flags that can be used with the hipExtLaunch set of APIs.
+/** AnyOrderLaunch of kernels.*/
+#define hipExtAnyOrderLaunch 0x01
+// Flags to be used with hipStreamWaitValue32 and hipStreamWaitValue64.
+#define hipStreamWaitValueGte 0x0  // Wait condition: greater than or equal
+#define hipStreamWaitValueEq 0x1   // Wait condition: equality
+#define hipStreamWaitValueAnd 0x2  // Wait condition: bitwise AND
+#define hipStreamWaitValueNor 0x3  // Wait condition: bitwise NOR
+
+/** Operations for hipStreamBatchMemOp. Enumerators are not listed in numeric order. */
+typedef enum hipStreamBatchMemOpType {
+  hipStreamMemOpWaitValue32 = 0x1,
+  hipStreamMemOpWriteValue32 = 0x2,
+  hipStreamMemOpWaitValue64 = 0x4,
+  hipStreamMemOpWriteValue64 = 0x5,
+  hipStreamMemOpBarrier = 0x6,           ///< Currently not supported
+  hipStreamMemOpFlushRemoteWrites = 0x3  ///< Currently not supported
+} hipStreamBatchMemOpType;
+
+/**
+ * @brief Union representing batch memory operation parameters for HIP streams.
+ *
+ * hipStreamBatchMemOpParams is used to specify the parameters for batch memory
+ * operations in a HIP stream. This union supports various operations including
+ * waiting for a specific value, writing a value, and different flags for wait conditions.
+ *
+ * @details
+ * The union includes fields for different types of operations defined in the
+ * enum hipStreamBatchMemOpType:
+ * - hipStreamMemOpWaitValue32:  Wait for a 32-bit value.
+ * - hipStreamMemOpWriteValue32: Write a 32-bit value.
+ * - hipStreamMemOpWaitValue64:  Wait for a 64-bit value.
+ * - hipStreamMemOpWriteValue64: Write a 64-bit value.
+ *
+ * Each operation type includes an address, the value to wait for or write, flags, and an
+ * optional alias that is not relevant on AMD GPUs. Flags can be used to specify different
+ * wait conditions such as equality, bitwise AND, greater than or equal, and bitwise NOR.
+ *
+ * Example usage:
+ * @code
+ * hipStreamBatchMemOpParams myArray[2];
+ * myArray[0].operation = hipStreamMemOpWaitValue32;
+ * myArray[0].waitValue.address = waitAddr1;
+ * myArray[0].waitValue.value = 0x1;
+ * myArray[0].waitValue.flags = hipStreamWaitValueEq;
+ *
+ * myArray[1].operation = hipStreamMemOpWriteValue32;
+ * myArray[1].writeValue.address = writeAddr1;
+ * myArray[1].writeValue.value = 0x1;
+ * myArray[1].writeValue.flags = 0x0;
+ *
+ * result = hipStreamBatchMemOp(stream, 2, myArray, 0);
+ * @endcode
+ */
+
+typedef union hipStreamBatchMemOpParams_union {
+  hipStreamBatchMemOpType operation;  ///< Overlays the leading `operation` of each struct below
+  struct hipStreamMemOpWaitValueParams_t {
+    hipStreamBatchMemOpType operation;
+    hipDeviceptr_t address;
+    union {
+      uint32_t value;    ///< 32-bit view of the value
+      uint64_t value64;  ///< 64-bit view; shares storage with `value`
+    };
+    unsigned int flags;
+    hipDeviceptr_t alias;  ///< Not valid for AMD backend. Initial value is unimportant
+  } waitValue;
+  struct hipStreamMemOpWriteValueParams_t {
+    hipStreamBatchMemOpType operation;
+    hipDeviceptr_t address;
+    union {
+      uint32_t value;    ///< 32-bit view of the value
+      uint64_t value64;  ///< 64-bit view; shares storage with `value`
+    };
+    unsigned int flags;
+    hipDeviceptr_t alias;  ///< Not valid for AMD backend. Initial value is unimportant
+  } writeValue;
+  struct hipStreamMemOpFlushRemoteWritesParams_t {
+    hipStreamBatchMemOpType operation;
+    unsigned int flags;
+  } flushRemoteWrites;  ///< Currently not supported on AMD
+  struct hipStreamMemOpMemoryBarrierParams_t {
+    hipStreamBatchMemOpType operation;
+    unsigned int flags;
+  } memoryBarrier;  ///< Currently not supported on AMD
+  uint64_t pad[6];  ///< 6 x 8 = 48 bytes, so the union is at least that large
+} hipStreamBatchMemOpParams;
+
+/**
+ * @brief Structure representing node parameters for batch memory operations in HIP graphs.
+ *
+ * hipBatchMemOpNodeParams is used to specify the parameters for batch memory
+ * operations in HIP graphs. This struct includes the context to use for the operations, the
+ * number of operations, and an array of hipStreamBatchMemOpParams that describe the operations.
+ *
+ * @details
+ * The structure includes the following fields:
+ * - ctx: The HIP context to use for the operations.
+ * - count: The number of operations in the paramArray.
+ * - paramArray: A pointer to an array of hipStreamBatchMemOpParams.
+ * - flags: Flags to control the node.
+ *
+ * Example usage:
+ * @code
+ * hipBatchMemOpNodeParams nodeParams;
+ * nodeParams.ctx = context;
+ * nodeParams.count = ARRAY_SIZE;
+ * nodeParams.paramArray = myArray;
+ * nodeParams.flags = 0;
+ *
+ * // Pass nodeParams to hipGraphAddBatchMemOpNode, hipGraphBatchMemOpNodeGetParams,
+ * // hipGraphBatchMemOpNodeSetParams, or hipGraphExecBatchMemOpNodeSetParams.
+ * @endcode
+ */
+
+typedef struct hipBatchMemOpNodeParams {
+  hipCtx_t ctx;                           ///< HIP context to use for the operations
+  unsigned int count;                     ///< Number of operations in paramArray
+  hipStreamBatchMemOpParams* paramArray;  ///< Pointer to an array of operation descriptors
+  unsigned int flags;                     ///< Flags to control the node
+} hipBatchMemOpNodeParams;
+
+// Stream per thread
+/** Implicit stream per application thread.*/
+#define hipStreamPerThread ((hipStream_t)2)
+
+#define hipStreamLegacy ((hipStream_t)1)  // Constant 1 cast to hipStream_t, like the value 2 above
+
+// Indicates that the external memory object is a dedicated resource
+#define hipExternalMemoryDedicated 0x1
+/**
+ * HIP Memory Advise values
+ *
+ * @note This memory advise enumeration is used on Linux, not Windows.
+ */
+typedef enum hipMemoryAdvise {
+  hipMemAdviseSetReadMostly = 1,           ///< Data will mostly be read and only occasionally
+                                           ///< be written to
+  hipMemAdviseUnsetReadMostly = 2,         ///< Undo the effect of hipMemAdviseSetReadMostly
+  hipMemAdviseSetPreferredLocation = 3,    ///< Set the preferred location for the data as
+                                           ///< the specified device
+  hipMemAdviseUnsetPreferredLocation = 4,  ///< Clear the preferred location for the data
+  hipMemAdviseSetAccessedBy = 5,           ///< Data will be accessed by the specified device
+                                           ///< so prevent page faults as much as possible
+  hipMemAdviseUnsetAccessedBy = 6,         ///< Let HIP decide on the page faulting policy
+                                           ///< for the specified device
+  hipMemAdviseSetCoarseGrain = 100,        ///< The default memory model is fine-grain. That allows
+                                           ///< coherent operations between host and device, while
+                                           ///< executing kernels. The coarse-grain can be used
+                                           ///< for data that only needs to be coherent at dispatch
+                                           ///< boundaries for better performance
+  hipMemAdviseUnsetCoarseGrain = 101       ///< Restores cache coherency policy back to fine-grain
+} hipMemoryAdvise;
+/**
+ * HIP coherency mode of a memory range (the value of hipMemRangeAttributeCoherencyMode)
+ */
+typedef enum hipMemRangeCoherencyMode {
+  hipMemRangeCoherencyModeFineGrain = 0,     ///< Updates to memory with this attribute can be
+                                             ///< done coherently from all devices
+  hipMemRangeCoherencyModeCoarseGrain = 1,   ///< Writes to memory with this attribute can be
+                                             ///< performed by a single device at a time
+  hipMemRangeCoherencyModeIndeterminate = 2  ///< Memory region queried contains subregions with
+                                             ///< both hipMemRangeCoherencyModeFineGrain and
+                                             ///< hipMemRangeCoherencyModeCoarseGrain attributes
+} hipMemRangeCoherencyMode;
+/**
+ * HIP range attributes
+ */
+typedef enum hipMemRangeAttribute {
+  hipMemRangeAttributeReadMostly = 1,            ///< Whether the range will mostly be read and
+                                                 ///< only occasionally be written to
+  hipMemRangeAttributePreferredLocation = 2,     ///< The preferred location of the range
+  hipMemRangeAttributeAccessedBy = 3,            ///< Memory range has hipMemAdviseSetAccessedBy
+                                                 ///< set for the specified device
+  hipMemRangeAttributeLastPrefetchLocation = 4,  ///< The last location to where the range was
+                                                 ///< prefetched
+  hipMemRangeAttributeCoherencyMode = 100,       ///< Returns coherency mode
+                                                 ///< @ref hipMemRangeCoherencyMode for the range
+} hipMemRangeAttribute;
+
+/**
+ * HIP memory pool attributes
+ */
+typedef enum hipMemPoolAttr {
+  /**
+   * (value type = int)
+   * Allow @p hipMemAllocAsync to use memory asynchronously freed
+   * in other streams as long as a stream ordering dependency
+   * of the allocating stream on the free action exists.
+   * HIP events and null stream interactions can create the required
+   * stream ordered dependencies. (default enabled)
+   */
+  hipMemPoolReuseFollowEventDependencies = 0x1,
+  /**
+   * (value type = int)
+   * Allow reuse of already completed frees when there is no dependency
+   * between the free and allocation. (default enabled)
+   */
+  hipMemPoolReuseAllowOpportunistic = 0x2,
+  /**
+   * (value type = int)
+   * Allow @p hipMemAllocAsync to insert new stream dependencies
+   * in order to establish the stream ordering required to reuse
+   * a piece of memory released by hipFreeAsync (default enabled).
+   */
+  hipMemPoolReuseAllowInternalDependencies = 0x3,
+  /**
+   * (value type = uint64_t)
+   * Amount of reserved memory in bytes to hold onto before trying
+   * to release memory back to the OS. When more than the release
+   * threshold bytes of memory are held by the memory pool, the
+   * allocator will try to release memory back to the OS on the
+   * next call to stream, event or context synchronize. (default 0)
+   */
+  hipMemPoolAttrReleaseThreshold = 0x4,
+  /**
+   * (value type = uint64_t)
+   * Amount of backing memory currently allocated for the mempool.
+   */
+  hipMemPoolAttrReservedMemCurrent = 0x5,
+  /**
+   * (value type = uint64_t)
+   * High watermark of backing memory allocated for the mempool since the
+   * last time it was reset. High watermark can only be reset to zero.
+   */
+  hipMemPoolAttrReservedMemHigh = 0x6,
+  /**
+   * (value type = uint64_t)
+   * Amount of memory from the pool that is currently in use by the application.
+   */
+  hipMemPoolAttrUsedMemCurrent = 0x7,
+  /**
+   * (value type = uint64_t)
+   * High watermark of the amount of memory from the pool that was in use by the application since
+   * the last time it was reset. High watermark can only be reset to zero.
+   */
+  hipMemPoolAttrUsedMemHigh = 0x8
+} hipMemPoolAttr;
+
+/**
+ * Specifies the memory protection flags for mapping
+ * (the type of the `flags` member of hipMemAccessDesc).
+ */
+typedef enum hipMemAccessFlags {
+  hipMemAccessFlagsProtNone = 0,      ///< Default, make the address range not accessible
+  hipMemAccessFlagsProtRead = 1,      ///< Set the address range read accessible
+  hipMemAccessFlagsProtReadWrite = 3  ///< Set the address range read-write accessible
+} hipMemAccessFlags;
+/**
+ * Memory access descriptor structure is used to specify memory access
+ * permissions for a virtual memory region in Virtual Memory Management API.
+ * This structure changes read, and write permissions for
+ * specific memory regions.
+ */
+typedef struct hipMemAccessDesc {
+  hipMemLocation location;  ///< Location on which the accessibility has to change
+  hipMemAccessFlags flags;  ///< Accessibility flags to set
+} hipMemAccessDesc;
+/**
+ * Defines the allocation types
+ */
+typedef enum hipMemAllocationType {
+  hipMemAllocationTypeInvalid = 0x0,
+  /** This allocation type is 'pinned', i.e. cannot migrate from its current
+   * location while the application is actively using it
+   */
+  hipMemAllocationTypePinned = 0x1,
+  hipMemAllocationTypeUncached = 0x40000000,
+  hipMemAllocationTypeMax = 0x7FFFFFFF
+} hipMemAllocationType;
+/**
+ * Flags for specifying handle types for memory pool allocations
+ *
+ */
+typedef enum hipMemAllocationHandleType {
+  hipMemHandleTypeNone = 0x0,  ///< Does not allow any export mechanism
+  hipMemHandleTypePosixFileDescriptor =
+      0x1,  ///< Allows a file descriptor for exporting. Permitted only on POSIX systems
+  hipMemHandleTypeWin32 = 0x2,    ///< Allows a Win32 NT handle for exporting. (HANDLE)
+  hipMemHandleTypeWin32Kmt = 0x4  ///< Allows a Win32 KMT handle for exporting. (D3DKMT_HANDLE)
+} hipMemAllocationHandleType;
+/**
+ * Specifies the properties of allocations made from the pool.
+ */
+typedef struct hipMemPoolProps {
+  hipMemAllocationType
+      allocType;  ///< Allocation type. Currently must be specified as @p hipMemAllocationTypePinned
+  hipMemAllocationHandleType
+      handleTypes;          ///< Handle types that will be supported by allocations from the pool
+  hipMemLocation location;  ///< Location where allocations should reside
+  /**
+   * Windows-specific LPSECURITYATTRIBUTES required when @p hipMemHandleTypeWin32 is specified
+   */
+  void* win32SecurityAttributes;
+  size_t maxSize;  ///< Maximum pool size. When set to 0, defaults to a system dependent value
+  unsigned char reserved[56];  ///< Reserved for future use, must be 0
+} hipMemPoolProps;
+/**
+ * Opaque data structure for exporting a pool allocation
+ */
+typedef struct hipMemPoolPtrExportData {
+  unsigned char reserved[64];
+} hipMemPoolPtrExportData;
+
+/**
+ * @warning On AMD devices and some Nvidia devices, these hints and controls are ignored.
+ */
+typedef enum hipFuncAttribute {
+  hipFuncAttributeMaxDynamicSharedMemorySize =
+      8,  ///< The maximum number of bytes requested for dynamically allocated shared memory
+  hipFuncAttributePreferredSharedMemoryCarveout =
+      9,  ///< Sets the percentage of total shared memory allocated as the shared memory carveout
+  hipFuncAttributeMax
+} hipFuncAttribute;
+/**
+ * @warning On AMD devices and some Nvidia devices, these hints and controls are ignored.
+ */
+typedef enum hipFuncCache_t {
+  hipFuncCachePreferNone,    ///< no preference for shared memory or L1 (default)
+  hipFuncCachePreferShared,  ///< prefer larger shared memory and smaller L1 cache
+  hipFuncCachePreferL1,      ///< prefer larger L1 cache and smaller shared memory
+  hipFuncCachePreferEqual,   ///< prefer equal size L1 cache and shared memory
+} hipFuncCache_t;
+/**
+ * @warning On AMD devices and some Nvidia devices, these hints and controls are ignored.
+ */
+typedef enum hipSharedMemConfig {
+  hipSharedMemBankSizeDefault,   ///< The compiler selects a device-specific value for the banking.
+  hipSharedMemBankSizeFourByte,  ///< Shared mem is banked at 4-byte intervals and performs best
+                                 ///< when adjacent threads access data 4 bytes apart.
+  hipSharedMemBankSizeEightByte  ///< Shared mem is banked at 8-byte intervals and performs best
+                                 ///< when adjacent threads access data 8 bytes apart.
+} hipSharedMemConfig;
+/**
+ * Struct for data in 3D
+ */
+typedef struct dim3 {
+  uint32_t x;  ///< x component
+  uint32_t y;  ///< y component
+  uint32_t z;  ///< z component
+#ifdef __cplusplus
+  constexpr __host__ __device__ dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1)
+      : x(_x), y(_y), z(_z) {};  // C++ only: each omitted component defaults to 1
+#endif
+} dim3;
+/**
+ * struct hipLaunchParams_t
+ */
+typedef struct hipLaunchParams_t {
+  void* func;          ///< Device function symbol
+  dim3 gridDim;        ///< Grid dimensions
+  dim3 blockDim;       ///< Block dimensions
+  void** args;         ///< Arguments
+  size_t sharedMem;    ///< Shared memory
+  hipStream_t stream;  ///< Stream identifier
+} hipLaunchParams;
+/**
+ * struct hipFunctionLaunchParams_t
+ */
+typedef struct hipFunctionLaunchParams_t {
+  hipFunction_t function;       ///< Kernel to launch
+  unsigned int gridDimX;        ///< Width(X) of grid in blocks
+  unsigned int gridDimY;        ///< Height(Y) of grid in blocks
+  unsigned int gridDimZ;        ///< Depth(Z) of grid in blocks
+  unsigned int blockDimX;       ///< X dimension of each thread block
+  unsigned int blockDimY;       ///< Y dimension of each thread block
+  unsigned int blockDimZ;       ///< Z dimension of each thread block
+  unsigned int sharedMemBytes;  ///< Shared memory
+  hipStream_t hStream;          ///< Stream identifier
+  void** kernelParams;          ///< Kernel parameters
+} hipFunctionLaunchParams;
+typedef enum hipExternalMemoryHandleType_enum {
+  hipExternalMemoryHandleTypeOpaqueFd = 1,
+  hipExternalMemoryHandleTypeOpaqueWin32 = 2,
+  hipExternalMemoryHandleTypeOpaqueWin32Kmt = 3,
+  hipExternalMemoryHandleTypeD3D12Heap = 4,
+  hipExternalMemoryHandleTypeD3D12Resource = 5,
+  hipExternalMemoryHandleTypeD3D11Resource = 6,
+  hipExternalMemoryHandleTypeD3D11ResourceKmt = 7,
+  hipExternalMemoryHandleTypeNvSciBuf = 8
+} hipExternalMemoryHandleType;
+typedef struct hipExternalMemoryHandleDesc_st {
+  hipExternalMemoryHandleType type;
+  union {
+    int fd;
+    struct {
+      void* handle;
+      const void* name;
+    } win32;
+    const void* nvSciBufObject;
+  } handle;
+  unsigned long long size;
+  unsigned int flags;
+  unsigned int reserved[16];
+} hipExternalMemoryHandleDesc;
+typedef struct hipExternalMemoryBufferDesc_st {
+  unsigned long long offset;
+  unsigned long long size;
+  unsigned int flags;
+  unsigned int reserved[16];
+} hipExternalMemoryBufferDesc;
+typedef struct hipExternalMemoryMipmappedArrayDesc_st {
+  unsigned long long offset;
+  hipChannelFormatDesc formatDesc;
+  hipExtent extent;
+  unsigned int flags;
+  unsigned int numLevels;
+} hipExternalMemoryMipmappedArrayDesc;
+typedef void* hipExternalMemory_t;
+typedef enum hipExternalSemaphoreHandleType_enum {
+  hipExternalSemaphoreHandleTypeOpaqueFd = 1,
+  hipExternalSemaphoreHandleTypeOpaqueWin32 = 2,
+  hipExternalSemaphoreHandleTypeOpaqueWin32Kmt = 3,
+  hipExternalSemaphoreHandleTypeD3D12Fence = 4,
+  hipExternalSemaphoreHandleTypeD3D11Fence = 5,
+  hipExternalSemaphoreHandleTypeNvSciSync = 6,
+  hipExternalSemaphoreHandleTypeKeyedMutex = 7,
+  hipExternalSemaphoreHandleTypeKeyedMutexKmt = 8,
+  hipExternalSemaphoreHandleTypeTimelineSemaphoreFd = 9,
+  hipExternalSemaphoreHandleTypeTimelineSemaphoreWin32 = 10
+} hipExternalSemaphoreHandleType;
+typedef struct hipExternalSemaphoreHandleDesc_st {
+  hipExternalSemaphoreHandleType type;
+  union {
+    int fd;
+    struct {
+      void* handle;
+      const void* name;
+    } win32;
+    const void* NvSciSyncObj;
+  } handle;
+  unsigned int flags;
+  unsigned int reserved[16];
+} hipExternalSemaphoreHandleDesc;
+typedef void* hipExternalSemaphore_t;
+typedef struct hipExternalSemaphoreSignalParams_st {
+  struct {
+    struct {
+      unsigned long long value;
+    } fence;
+    union {
+      void* fence;
+      unsigned long long reserved;
+    } nvSciSync;
+    struct {
+      unsigned long long key;
+    } keyedMutex;
+    unsigned int reserved[12];
+  } params;
+  unsigned int flags;
+  unsigned int reserved[16];
+} hipExternalSemaphoreSignalParams;
+/**
+ * External semaphore wait parameters, compatible with driver type
+ */
+typedef struct hipExternalSemaphoreWaitParams_st {
+  struct {
+    struct {
+      unsigned long long value;
+    } fence;
+    union {
+      void* fence;
+      unsigned long long reserved;
+    } nvSciSync;
+    struct {
+      unsigned long long key;
+      unsigned int timeoutMs;
+    } keyedMutex;
+    unsigned int reserved[10];
+  } params;
+  unsigned int flags;
+  unsigned int reserved[16];
+} hipExternalSemaphoreWaitParams;
+
+#if __HIP_HAS_GET_PCH
+/**
+ * Internal use only. This API may change in the future
+ * Pre-Compiled header for online compilation
+ */
+void __hipGetPCH(const char** pch, unsigned int* size);
+#endif
+
+/**
+ * HIP Access flags for Interop resources.
+ */
+typedef enum hipGraphicsRegisterFlags {
+  hipGraphicsRegisterFlagsNone = 0,
+  hipGraphicsRegisterFlagsReadOnly = 1,  ///< HIP will not write to this registered resource
+  hipGraphicsRegisterFlagsWriteDiscard =
+      2,  ///< HIP will only write and will not read from this registered resource
+  hipGraphicsRegisterFlagsSurfaceLoadStore = 4,  ///< HIP will bind this resource to a surface
+  hipGraphicsRegisterFlagsTextureGather =
+      8  ///< HIP will perform texture gather operations on this registered resource
+} hipGraphicsRegisterFlags;
+
+typedef struct _hipGraphicsResource hipGraphicsResource;
+
+typedef hipGraphicsResource* hipGraphicsResource_t;
+
+/**
+ * An opaque value that represents a hip graph
+ */
+typedef struct ihipGraph* hipGraph_t;
+/**
+ * An opaque value that represents a hip graph node
+ */
+typedef struct hipGraphNode* hipGraphNode_t;
+/**
+ * An opaque value that represents a hip graph Exec
+ */
+typedef struct hipGraphExec* hipGraphExec_t;
+
+/**
+ * An opaque value that represents a user obj
+ */
+typedef struct hipUserObject* hipUserObject_t;
+
+
+/**
+ * hipGraphNodeType
+ */
+typedef enum hipGraphNodeType {
+  hipGraphNodeTypeKernel = 0,              ///< GPU kernel node
+  hipGraphNodeTypeMemcpy = 1,              ///< Memcpy node
+  hipGraphNodeTypeMemset = 2,              ///< Memset node
+  hipGraphNodeTypeHost = 3,                ///< Host (executable) node
+  hipGraphNodeTypeGraph = 4,               ///< Node which executes an embedded graph
+  hipGraphNodeTypeEmpty = 5,               ///< Empty (no-op) node
+  hipGraphNodeTypeWaitEvent = 6,           ///< External event wait node
+  hipGraphNodeTypeEventRecord = 7,         ///< External event record node
+  hipGraphNodeTypeExtSemaphoreSignal = 8,  ///< External Semaphore signal node
+  hipGraphNodeTypeExtSemaphoreWait = 9,    ///< External Semaphore wait node
+  hipGraphNodeTypeMemAlloc = 10,           ///< Memory alloc node
+  hipGraphNodeTypeMemFree = 11,            ///< Memory free node
+  hipGraphNodeTypeMemcpyFromSymbol = 12,   ///< MemcpyFromSymbol node
+  hipGraphNodeTypeMemcpyToSymbol = 13,     ///< MemcpyToSymbol node
+  hipGraphNodeTypeBatchMemOp = 14,         ///< BatchMemOp node
+  hipGraphNodeTypeCount
+} hipGraphNodeType;
+
+typedef void (*hipHostFn_t)(void* userData);  ///< Host callback taking a single opaque pointer
+typedef struct hipHostNodeParams {
+  hipHostFn_t fn;  ///< Host callback for the node
+  void* userData;  ///< Opaque pointer; presumably handed to fn when it runs (TODO confirm)
+} hipHostNodeParams;
+typedef struct hipKernelNodeParams {
+  dim3 blockDim;                ///< Block dimensions
+  void** extra;                 ///< NOTE(review): layout/meaning not shown in this header; confirm
+  void* func;                   ///< Kernel to launch
+  dim3 gridDim;                 ///< Grid dimensions
+  void** kernelParams;          ///< Kernel parameters
+  unsigned int sharedMemBytes;  ///< Shared memory size in bytes
+} hipKernelNodeParams;
+typedef struct hipMemsetParams {
+  void* dst;
+  unsigned int elementSize;
+  size_t height;
+  size_t pitch;
+  unsigned int value;
+  size_t width;
+} hipMemsetParams;
+
+typedef struct hipMemAllocNodeParams {
+  hipMemPoolProps poolProps;            ///< Pool properties, which contain where
+                                        ///< the location should reside
+  const hipMemAccessDesc* accessDescs;  ///< Array of memory access descriptors.
+  size_t accessDescCount;               ///< The number of access descriptors.
+                                        ///< Must not be bigger than the number of GPUs
+  size_t bytesize;                      ///< The size of the requested allocation in bytes
+  void* dptr;                           ///< Returned device address of the allocation
+} hipMemAllocNodeParams;
+
+/**
+ * Specifies performance hint with hipAccessPolicyWindow
+ */
+typedef enum hipAccessProperty {
+  hipAccessPropertyNormal = 0,      ///< Normal cache persistence.
+  hipAccessPropertyStreaming = 1,   ///< Streaming access is less likely to persist from cache
+  hipAccessPropertyPersisting = 2,  ///< Persisting access is more likely to persist in cache
+} hipAccessProperty;
+
+/**
+ * Specifies access policy for a window, a contiguous extent of memory
+ * beginning at base_ptr and ending at base_ptr + num_bytes.
+ */
+typedef struct hipAccessPolicyWindow {
+  void* base_ptr;              ///< Starting address of the access policy window
+  hipAccessProperty hitProp;   ///< hipAccessProperty set for hit
+  float hitRatio;              ///< hitRatio specifies percentage of lines assigned hitProp
+  hipAccessProperty missProp;  ///< hipAccessProperty set for miss
+  size_t num_bytes;            ///< Size in bytes of the window policy.
+} hipAccessPolicyWindow;
+
+/**
+ * Memory Synchronization Domain map
+ */
+typedef struct hipLaunchMemSyncDomainMap {
+  unsigned char default_; /**< The default domain ID to use for designated kernels */
+  unsigned char remote;   /**< The remote domain ID to use for designated kernels */
+} hipLaunchMemSyncDomainMap;
+
+/**
+ * Memory Synchronization Domain
+ */
+typedef enum hipLaunchMemSyncDomain {
+  hipLaunchMemSyncDomainDefault = 0, /**< Launch kernels in the default domain */
+  hipLaunchMemSyncDomainRemote = 1   /**< Launch kernels in the remote domain */
+} hipLaunchMemSyncDomain;
+
+/**
+ * Stream Synchronization Policy.
+ * Can be set with hipStreamSetAttribute
+ */
+typedef enum hipSynchronizationPolicy {
+  hipSyncPolicyAuto = 1,        /**< Default Synchronization Policy. Host thread waits actively */
+  hipSyncPolicySpin = 2,        /**< Host thread spins in tight loop waiting for completion */
+  hipSyncPolicyYield = 3,       /**< Host spins but yields to other threads, reducing CPU usage */
+  hipSyncPolicyBlockingSync = 4 /**< Host thread blocks (sleeps) until the stream completes */
+} hipSynchronizationPolicy;
+
+/**
+ *  Launch Attribute ID
+ */
+typedef enum hipLaunchAttributeID {
+  hipLaunchAttributeAccessPolicyWindow = 1,     ///< Valid for Streams, graph nodes, launches
+  hipLaunchAttributeCooperative = 2,            ///< Valid for graph nodes, launches
+  hipLaunchAttributeSynchronizationPolicy = 3,  ///< Valid for streams
+  hipLaunchAttributePriority = 8,               ///< Valid for graph node, streams, launches
+  hipLaunchAttributeMemSyncDomainMap = 9,       ///< Valid for streams, graph nodes, launches
+  hipLaunchAttributeMemSyncDomain = 10,         ///< Valid for streams, graph nodes, launches
+  hipLaunchAttributeMax
+} hipLaunchAttributeID;
+
+
+/**
+ *  Launch Attribute Value
+ */
+typedef union hipLaunchAttributeValue {
+  char pad[64];  ///< 64 byte padding
+  hipAccessPolicyWindow
+      accessPolicyWindow;  ///< Value of launch attribute ::hipLaunchAttributeAccessPolicyWindow.
+  int cooperative;         ///< Value of launch attribute ::hipLaunchAttributeCooperative. Indicates
+                           ///< whether the kernel is cooperative.
+  int priority;  ///< Value of launch attribute ::hipLaunchAttributePriority. Execution priority of
+                 ///< kernel
+  hipSynchronizationPolicy
+      syncPolicy;  ///< Value of launch attribute ::hipLaunchAttributeSynchronizationPolicy.
+                   ///< Synchronization policy for work queued up in the stream
+  hipLaunchMemSyncDomainMap
+      memSyncDomainMap;  ///< Value of launch attribute ::hipLaunchAttributeMemSyncDomainMap
+  hipLaunchMemSyncDomain
+      memSyncDomain;  ///< Value of launch attribute ::hipLaunchAttributeMemSyncDomain
+} hipLaunchAttributeValue;
+
+/**
+ * Stream attributes
+ */
+#define hipStreamAttrID hipLaunchAttributeID
+#define hipStreamAttributeAccessPolicyWindow hipLaunchAttributeAccessPolicyWindow
+#define hipStreamAttributeSynchronizationPolicy hipLaunchAttributeSynchronizationPolicy
+#define hipStreamAttributeMemSyncDomainMap hipLaunchAttributeMemSyncDomainMap
+#define hipStreamAttributeMemSyncDomain hipLaunchAttributeMemSyncDomain
+#define hipStreamAttributePriority hipLaunchAttributePriority
+
+#define hipStreamAttrValue hipLaunchAttributeValue
+
+/**
+ * Kernel node attributeID
+ */
+#define hipKernelNodeAttrID hipLaunchAttributeID
+#define hipKernelNodeAttributeAccessPolicyWindow hipLaunchAttributeAccessPolicyWindow
+#define hipKernelNodeAttributeCooperative hipLaunchAttributeCooperative
+#define hipKernelNodeAttributePriority hipLaunchAttributePriority
+
+/**
+ * Kernel node attribute value
+ */
+#define hipKernelNodeAttrValue hipLaunchAttributeValue
+
+/**
+ * hip Drv attributes
+ */
+#define hipDrvLaunchAttributeCooperative hipLaunchAttributeCooperative
+
+#define hipDrvLaunchAttributeID hipLaunchAttributeID
+#define hipDrvLaunchAttributeValue hipLaunchAttributeValue
+#define hipDrvLaunchAttribute hipLaunchAttribute
+
+/**
+ * Graph execution update result
+ */
+typedef enum hipGraphExecUpdateResult {
+  hipGraphExecUpdateSuccess = 0x0,  ///< The update succeeded
+  hipGraphExecUpdateError = 0x1,  ///< The update failed for an unexpected reason which is described
+                                  ///< in the return value of the function
+  hipGraphExecUpdateErrorTopologyChanged = 0x2,  ///< The update failed because the topology changed
+  hipGraphExecUpdateErrorNodeTypeChanged = 0x3,  ///< The update failed because a node type changed
+  hipGraphExecUpdateErrorFunctionChanged =
+      0x4,  ///< The update failed because the function of a kernel node changed
+  hipGraphExecUpdateErrorParametersChanged =
+      0x5,  ///< The update failed because the parameters changed in a way that is not supported
+  hipGraphExecUpdateErrorNotSupported =
+      0x6,  ///< The update failed because something about the node is not supported
+  hipGraphExecUpdateErrorUnsupportedFunctionChange = 0x7  ///< Kernel node function changed in an unsupported way
+} hipGraphExecUpdateResult;
+
+typedef enum hipStreamCaptureMode {
+  hipStreamCaptureModeGlobal = 0,
+  hipStreamCaptureModeThreadLocal,
+  hipStreamCaptureModeRelaxed
+} hipStreamCaptureMode;
+typedef enum hipStreamCaptureStatus {
+  hipStreamCaptureStatusNone = 0,    ///< Stream is not capturing
+  hipStreamCaptureStatusActive,      ///< Stream is actively capturing
+  hipStreamCaptureStatusInvalidated  ///< Stream is part of a capture sequence that has been
+                                     ///< invalidated, but not terminated
+} hipStreamCaptureStatus;
+
+typedef enum hipStreamUpdateCaptureDependenciesFlags {
+  hipStreamAddCaptureDependencies = 0,  ///< Add new nodes to the dependency set
+  hipStreamSetCaptureDependencies,      ///< Replace the dependency set with the new nodes
+} hipStreamUpdateCaptureDependenciesFlags;
+
+typedef enum hipGraphMemAttributeType {
+  hipGraphMemAttrUsedMemCurrent =
+      0,                       ///< Amount of memory, in bytes, currently associated with graphs
+  hipGraphMemAttrUsedMemHigh,  ///< High watermark of memory, in bytes, associated with graphs since
+                               ///< the last time it was reset.
+  hipGraphMemAttrReservedMemCurrent,  ///< Amount of memory, in bytes, currently allocated for
+                                      ///< graphs.
+  hipGraphMemAttrReservedMemHigh,  ///< High watermark of memory, in bytes, currently allocated for
+                                   ///< graphs
+} hipGraphMemAttributeType;
+typedef enum hipUserObjectFlags {
+  hipUserObjectNoDestructorSync = 0x1,  ///< Destructor execution is not synchronized.
+} hipUserObjectFlags;
+
+typedef enum hipUserObjectRetainFlags {
+  hipGraphUserObjectMove = 0x1,  ///< Add new reference or retain.
+} hipUserObjectRetainFlags;
+
+typedef enum hipGraphInstantiateFlags {
+  hipGraphInstantiateFlagAutoFreeOnLaunch =
+      1,  ///< Automatically free memory allocated in a graph before relaunching.
+  hipGraphInstantiateFlagUpload = 2,  ///< Automatically upload the graph after instantiation.
+  hipGraphInstantiateFlagDeviceLaunch =
+      4,  ///< Instantiate the graph to be launched from the device.
+  hipGraphInstantiateFlagUseNodePriority =
+      8,  ///< Run the graph using the per-node priority attributes rather than the priority of the
+          ///< stream it is launched into.
+} hipGraphInstantiateFlags;
+
+enum hipGraphDebugDotFlags {
+  hipGraphDebugDotFlagsVerbose =
+      1 << 0, /**< Output all debug data as if every debug flag is enabled */
+  hipGraphDebugDotFlagsKernelNodeParams = 1 << 2, /**< Adds hipKernelNodeParams to output */
+  hipGraphDebugDotFlagsMemcpyNodeParams = 1 << 3, /**< Adds hipMemcpy3DParms to output */
+  hipGraphDebugDotFlagsMemsetNodeParams = 1 << 4, /**< Adds hipMemsetParams to output */
+  hipGraphDebugDotFlagsHostNodeParams = 1 << 5,   /**< Adds hipHostNodeParams to output */
+  hipGraphDebugDotFlagsEventNodeParams =
+      1 << 6, /**< Adds hipEvent_t handle from record and wait nodes to output */
+  hipGraphDebugDotFlagsExtSemasSignalNodeParams =
+      1 << 7, /**< Adds hipExternalSemaphoreSignalNodeParams values to output */
+  hipGraphDebugDotFlagsExtSemasWaitNodeParams =
+      1 << 8, /**< Adds hipExternalSemaphoreWaitNodeParams to output */
+  hipGraphDebugDotFlagsKernelNodeAttributes =
+      1 << 9, /**< Adds hipKernelNodeAttrID values to output */
+  hipGraphDebugDotFlagsHandles =
+      1 << 10 /**< Adds node handles and every kernel function handle to output */
+};
+
+/**
+ * hipGraphInstantiateWithParams results
+ */
+typedef enum hipGraphInstantiateResult {
+  hipGraphInstantiateSuccess = 0,                     /**< Instantiation Success */
+  hipGraphInstantiateError = 1,                       /**< Instantiation failed for an
+                        unexpected reason which is described in the return value of the function */
+  hipGraphInstantiateInvalidStructure = 2,            /**< Instantiation failed due
+             to invalid structure, such as cycles */
+  hipGraphInstantiateNodeOperationNotSupported = 3,   /**< Instantiation for device launch failed
+    because the graph contained an unsupported operation */
+  hipGraphInstantiateMultipleDevicesNotSupported = 4, /**< Instantiation for device launch failed
+  due to the nodes belonging to different contexts */
+} hipGraphInstantiateResult;
+
+/**
+ * Graph Instantiation parameters
+ */
+typedef struct hipGraphInstantiateParams {
+  hipGraphNode_t errNode_out;           /**< The node which caused instantiation to fail, if any*/
+  unsigned long long flags;             /**< Instantiation flags */
+  hipGraphInstantiateResult result_out; /**< Whether instantiation was successful.
+  If it failed, the reason why */
+  hipStream_t uploadStream;             /**< Upload stream */
+} hipGraphInstantiateParams;
+
+
+/**
+ * Memory allocation properties
+ */
+typedef struct hipMemAllocationProp {
+  hipMemAllocationType type;  ///< Memory allocation type
+  union {
+    hipMemAllocationHandleType requestedHandleType;   ///< Requested handle type
+    hipMemAllocationHandleType requestedHandleTypes;  ///< Requested handle types
+  };
+  hipMemLocation location;    ///< Memory location
+  void* win32HandleMetaData;  ///< Metadata for Win32 handles
+  struct {
+    unsigned char compressionType;       ///< Compression type
+    unsigned char gpuDirectRDMACapable;  ///< RDMA capable
+    unsigned short usage;                ///< Usage
+  } allocFlags;
+} hipMemAllocationProp;
+
+/**
+ * External semaphore signal node parameters
+ */
+typedef struct hipExternalSemaphoreSignalNodeParams {
+  ///< Array containing external semaphore handles.
+  hipExternalSemaphore_t* extSemArray;
+  ///< Array containing parameters of external signal semaphore.
+  const hipExternalSemaphoreSignalParams* paramsArray;
+  ///< Total number of handles and parameters contained in extSemArray and paramsArray.
+  unsigned int numExtSems;
+} hipExternalSemaphoreSignalNodeParams;
+
+/**
+ * External semaphore wait node parameters
+ */
+typedef struct hipExternalSemaphoreWaitNodeParams {
+  ///< Array containing external semaphore handles.
+  hipExternalSemaphore_t* extSemArray;
+  ///< Array containing parameters of external wait semaphore.
+  const hipExternalSemaphoreWaitParams* paramsArray;
+  ///< Total number of handles and parameters contained in extSemArray and paramsArray.
+  unsigned int numExtSems;
+} hipExternalSemaphoreWaitNodeParams;
+
+/**
+ * Generic handle for memory allocation
+ */
+typedef struct ihipMemGenericAllocationHandle* hipMemGenericAllocationHandle_t;
+
+/**
+ * Flags for granularity
+ */
+typedef enum hipMemAllocationGranularity_flags {
+  hipMemAllocationGranularityMinimum = 0x0,     ///< Minimum granularity
+  hipMemAllocationGranularityRecommended = 0x1  ///< Recommended granularity for performance
+} hipMemAllocationGranularity_flags;
+
+/**
+ * Memory handle type
+ */
+typedef enum hipMemHandleType {
+  hipMemHandleTypeGeneric = 0x0  ///< Generic handle type
+} hipMemHandleType;
+
+/**
+ * Memory operation types
+ */
+typedef enum hipMemOperationType {
+  hipMemOperationTypeMap = 0x1,   ///< Map operation
+  hipMemOperationTypeUnmap = 0x2  ///< Unmap operation
+} hipMemOperationType;
+
+/**
+ * Subresource types for sparse arrays
+ */
+typedef enum hipArraySparseSubresourceType {
+  hipArraySparseSubresourceTypeSparseLevel = 0x0,  ///< Sparse level
+  hipArraySparseSubresourceTypeMiptail = 0x1       ///< Miptail
+} hipArraySparseSubresourceType;
+
+/**
+ * Map info for arrays
+ */
+typedef struct hipArrayMapInfo {
+  hipResourceType resourceType;  ///< Resource type
+  union {
+    hipMipmappedArray mipmap;
+    hipArray_t array;
+  } resource;
+  hipArraySparseSubresourceType subresourceType;  ///< Sparse subresource type
+  union {
+    struct {
+      unsigned int
+          level;  ///< For mipmapped arrays must be a valid mipmap level. For arrays must be zero
+      unsigned int
+          layer;  ///< For layered arrays must be a valid layer index. Otherwise, must be zero
+      unsigned int offsetX;       ///< X offset in elements
+      unsigned int offsetY;       ///< Y offset in elements
+      unsigned int offsetZ;       ///< Z offset in elements
+      unsigned int extentWidth;   ///< Width in elements
+      unsigned int extentHeight;  ///< Height in elements
+      unsigned int extentDepth;   ///< Depth in elements
+    } sparseLevel;
+    struct {
+      unsigned int
+          layer;  ///< For layered arrays must be a valid layer index. Otherwise, must be zero
+      unsigned long long offset;  ///< Offset within mip tail
+      unsigned long long size;    ///< Extent in bytes
+    } miptail;
+  } subresource;
+  hipMemOperationType memOperationType;  ///< Memory operation type
+  hipMemHandleType memHandleType;        ///< Memory handle type
+  union {
+    hipMemGenericAllocationHandle_t memHandle;
+  } memHandle;
+  unsigned long long offset;   ///< Offset within the memory
+  unsigned int deviceBitMask;  ///< Device ordinal bit mask
+  unsigned int flags;          ///< flags for future use, must be zero now.
+  unsigned int reserved[2];    ///< Reserved for future use, must be zero now.
+} hipArrayMapInfo;
+
+/**
+ * Memcpy node params
+ */
+typedef struct hipMemcpyNodeParams {
+  int flags;                    ///< Must be zero.
+  int reserved[3];              ///< Must be zero.
+  hipMemcpy3DParms copyParams;  ///< Params set for the memory copy.
+} hipMemcpyNodeParams;
+
+/**
+ * Child graph node params
+ */
+typedef struct hipChildGraphNodeParams {
+  hipGraph_t graph;  ///< Either the child graph to clone into the node, or
+                     ///< a handle to the graph possessed by the node used during query
+} hipChildGraphNodeParams;
+
+/**
+ * Event wait node params
+ */
+typedef struct hipEventWaitNodeParams {
+  hipEvent_t event;  ///< Event to wait on
+} hipEventWaitNodeParams;
+
+/**
+ * Event record node params
+ */
+typedef struct hipEventRecordNodeParams {
+  hipEvent_t event;  ///< The event to be recorded when node executes
+} hipEventRecordNodeParams;
+
+/**
+ * Memory free node params
+ */
+typedef struct hipMemFreeNodeParams {
+  void* dptr;  ///< the pointer to be freed
+} hipMemFreeNodeParams;
+
+/**
+ * Params for different graph nodes
+ */
+typedef struct hipGraphNodeParams {
+  hipGraphNodeType type;  ///< Node type; presumably selects the meaningful union member (confirm)
+  int reserved0[3];       ///< Reserved
+  union {
+    long long reserved1[29];  ///< Reserved; keeps the union at least 29 * sizeof(long long) bytes
+    hipKernelNodeParams kernel;                         ///< Kernel node params
+    hipMemcpyNodeParams memcpy;                         ///< Memcpy node params
+    hipMemsetParams memset;                             ///< Memset node params
+    hipHostNodeParams host;                             ///< Host node params
+    hipChildGraphNodeParams graph;                      ///< Child graph node params
+    hipEventWaitNodeParams eventWait;                   ///< Event wait node params
+    hipEventRecordNodeParams eventRecord;               ///< Event record node params
+    hipExternalSemaphoreSignalNodeParams extSemSignal;  ///< External semaphore signal node params
+    hipExternalSemaphoreWaitNodeParams extSemWait;      ///< External semaphore wait node params
+    hipMemAllocNodeParams alloc;                        ///< Memory alloc node params
+    hipMemFreeNodeParams free;                          ///< Memory free node params
+  };
+
+  long long reserved2;  ///< Reserved
+} hipGraphNodeParams;
+
+/**
+ * This port activates when the kernel has finished executing.
+ */
+#define hipGraphKernelNodePortDefault 0
+
+/**
+ * This port activates when all blocks of the kernel have begun execution.
+ */
+#define hipGraphKernelNodePortLaunchCompletion 2
+
+/**
+ * This port activates when all blocks of the kernel have performed
+ * hipTriggerProgrammaticLaunchCompletion() or have terminated.
+ * It must be used with edge type hipGraphDependencyTypeProgrammatic.
+ */
+#define hipGraphKernelNodePortProgrammatic 1
+
+typedef enum hipGraphDependencyType {
+  hipGraphDependencyTypeDefault = 0,      ///< Default dependency type
+  hipGraphDependencyTypeProgrammatic = 1  ///< Edge type to use with hipGraphKernelNodePortProgrammatic
+} hipGraphDependencyType;
+
+typedef struct hipGraphEdgeData {
+  unsigned char
+      from_port;  ///< This indicates when the dependency is triggered from the upstream node on the
+                  ///< edge. The meaning is specific to the node type. A value of 0 in all cases
+                  ///< means full completion of the upstream node, with memory visibility to the
+                  ///< downstream node or portion thereof (indicated by to_port). Only kernel nodes
+                  ///< define non-zero ports. A kernel node can use the following output port types:
+                  ///< hipGraphKernelNodePortDefault, hipGraphKernelNodePortProgrammatic, or
+                  ///< hipGraphKernelNodePortLaunchCompletion.
+  unsigned char reserved[5];  ///< These bytes are unused and must be zeroed
+  unsigned char
+      to_port;  ///< Currently no node types define non-zero ports. This field must be set to zero.
+  unsigned char type;  ///< This should be populated with a value from hipGraphDependencyType
+} hipGraphEdgeData;
+
+
+/**
+ * Used to specify custom attributes for launching kernels
+ */
+typedef struct hipLaunchAttribute_st {
+  hipLaunchAttributeID id;                     ///< Identifier of the launch attribute
+  char pad[8 - sizeof(hipLaunchAttributeID)];  ///< Padding to align the structure to 8 bytes
+  union {
+    hipLaunchAttributeValue val;    ///< Value associated with the launch attribute
+    hipLaunchAttributeValue value;  ///< Value associated with the launch attribute
+  };
+} hipLaunchAttribute;
+
+/**
+ * HIP extensible launch configuration: launch geometry, shared memory, stream and attributes
+ */
+typedef struct hipLaunchConfig_st {
+  dim3 gridDim;               ///< Grid dimensions
+  dim3 blockDim;              ///< Block dimensions
+  size_t dynamicSmemBytes;    ///< Dynamic shared-memory size in bytes per thread block
+  hipStream_t stream;         ///< Stream identifier
+  hipLaunchAttribute* attrs;  ///< Attributes list (see numAttrs for its length)
+  unsigned int numAttrs;      ///< Number of attributes in attrs
+} hipLaunchConfig_t;
+
+/**
+ * HIP driver extensible launch configuration (scalar grid/block dimensions rather than dim3)
+ */
+typedef struct HIP_LAUNCH_CONFIG_st {
+  unsigned int gridDimX;        ///< Grid width in blocks
+  unsigned int gridDimY;        ///< Grid height in blocks
+  unsigned int gridDimZ;        ///< Grid depth in blocks
+  unsigned int blockDimX;       ///< Thread block dimension in X
+  unsigned int blockDimY;       ///< Thread block dimension in Y
+  unsigned int blockDimZ;       ///< Thread block dimension in Z
+  unsigned int sharedMemBytes;  ///< Dynamic shared-memory size in bytes per block
+  hipStream_t hStream;          ///< HIP stream identifier
+  hipLaunchAttribute* attrs;    ///< Attribute list (see numAttrs for its length)
+  unsigned int numAttrs;        ///< Number of attributes in attrs
+} HIP_LAUNCH_CONFIG;
+
+/**
+ * Requested handle type for address range.
+ */
+typedef enum hipMemRangeHandleType {
+  hipMemRangeHandleTypeDmaBufFd = 0x1,  ///< Presumably a dma-buf file descriptor handle (confirm)
+  hipMemRangeHandleTypeMax = 0x7fffffff  ///< Upper bound: largest positive 32-bit signed value
+} hipMemRangeHandleType;
+
+/**
+ * Mem Range Flags used in hipMemGetHandleForAddressRange.
+ */
+typedef enum hipMemRangeFlags {
+  hipMemRangeFlagDmaBufMappingTypePcie = 0x1,  ///< Presumably selects a PCIe dma-buf mapping (confirm)
+  hipMemRangeFlagsMax = 0x7fffffff  ///< Upper bound: largest positive 32-bit signed value
+} hipMemRangeFlags;
+
+// Doxygen end group GlobalDefs
+/**
+ * @}
+ */
+/**
+ *  @defgroup API HIP API
+ *  @{
+ *
+ *  Defines the HIP API.  See the individual sections for more information.
+ */
+/**
+ *  @defgroup Driver Initialization and Version
+ *  @{
+ *  This section describes the initialization and version functions of HIP runtime API.
+ *
+ */
+/**
+ * @brief Explicitly initializes the HIP runtime.
+ *
+ * @param [in] flags  Initialization flag, should be zero.
+ *
+ * Most HIP APIs implicitly initialize the HIP runtime.
+ * This API provides control over the timing of the initialization.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+// TODO-ctx - more description on error codes.
+hipError_t hipInit(unsigned int flags);
+
+/**
+ * @brief Returns the approximate HIP driver version.
+ *
+ * @param [out] driverVersion driver version
+ *
+ * HIP driver version shows up in the format:
+ * HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @warning The HIP driver version does not correspond to an exact CUDA driver revision.
+ * On AMD platform, the API returns the HIP driver version, while on NVIDIA platform, it calls
+ * the corresponding CUDA runtime API and returns the CUDA driver version.
+ * There is no mapping/correlation between HIP driver version and CUDA driver version.
+ *
+ * @see hipRuntimeGetVersion
+ */
+hipError_t hipDriverGetVersion(int* driverVersion);
+/**
+ * @brief Returns the approximate HIP Runtime version.
+ *
+ * @param [out] runtimeVersion HIP runtime version
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @warning The version definition of HIP runtime is different from CUDA.
+ * On AMD platform, the function returns HIP runtime version,
+ * while on NVIDIA platform, it returns CUDA runtime version.
+ * And there is no mapping/correlation between HIP version and CUDA version.
+ *
+ * @see hipDriverGetVersion
+ */
+hipError_t hipRuntimeGetVersion(int* runtimeVersion);
+/**
+ * @brief Returns a handle to a compute device
+ * @param [out] device Handle of device
+ * @param [in] ordinal Device ordinal
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ */
+hipError_t hipDeviceGet(hipDevice_t* device, int ordinal);
+
+/**
+ * @brief Returns the compute capability of the device
+ * @param [out] major Major compute capability version number
+ * @param [out] minor Minor compute capability version number
+ * @param [in] device Device ordinal
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ */
+hipError_t hipDeviceComputeCapability(int* major, int* minor, hipDevice_t device);
+/**
+ * @brief Returns an identifier string for the device.
+ * @param [out] name String of the device name
+ * @param [in] len Maximum length of string to store in device name
+ * @param [in] device Device ordinal
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ */
+hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device);
+/**
+ * @brief Returns a UUID for the device. [BETA]
+ * @param [out] uuid UUID for the device
+ * @param [in] device device ordinal
+ *
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue, #hipErrorNotInitialized,
+ * #hipErrorDeinitialized
+ */
+hipError_t hipDeviceGetUuid(hipUUID* uuid, hipDevice_t device);
+/**
+ * @brief Returns a value for attribute of link between two devices
+ * @param [out] value Pointer to the value for the attribute
+ * @param [in] attr enum of hipDeviceP2PAttr to query
+ * @param [in] srcDevice The source device of the link
+ * @param [in] dstDevice The destination device of the link
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ */
+hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr, int srcDevice,
+                                    int dstDevice);
+/**
+ * @brief Returns a PCI Bus Id string for the device, overloaded to take int device ID.
+ * @param [out] pciBusId The string of PCI Bus Id format for the device
+ * @param [in] len Maximum length of string
+ * @param [in] device The device ordinal
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ */
+hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, int device);
+/**
+ * @brief Returns a handle to a compute device.
+ * @param [out] device The handle of the device
+ * @param [in] pciBusId The string of PCI Bus Id for the device
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ */
+hipError_t hipDeviceGetByPCIBusId(int* device, const char* pciBusId);
+/**
+ * @brief Returns the total amount of memory on the device.
+ * @param [out] bytes The size of memory in bytes, on the device
+ * @param [in] device The ordinal of the device
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ */
+hipError_t hipDeviceTotalMem(size_t* bytes, hipDevice_t device);
+// doxygen end initialization
+/**
+ * @}
+ */
+/**
+ *  @defgroup Device Device Management
+ *  @{
+ *  This section describes the device management functions of HIP runtime API.
+ */
+/**
+ * @brief Waits on all active streams on current device
+ *
+ * When this command is invoked, the host thread is blocked until all the commands in the streams
+ * associated with the device have completed. HIP does not support multiple blocking modes (yet!).
+ *
+ * @returns #hipSuccess
+ *
+ * @see hipSetDevice, hipDeviceReset
+ */
+hipError_t hipDeviceSynchronize(void);
+/**
+ * @brief The state of current device is discarded and updated to a fresh state.
+ *
+ * Calling this function deletes all streams created, memory allocated, kernels running, events
+ * created. Make sure that no other thread is using the device or streams, memory, kernels, events
+ * associated with the current device.
+ *
+ * @returns #hipSuccess
+ *
+ * @see hipDeviceSynchronize
+ */
+hipError_t hipDeviceReset(void);
+/**
+ * @brief Set default device to be used for subsequent hip API calls from this thread.
+ *
+ * @param[in] deviceId Valid device in range 0...(hipGetDeviceCount()-1).
+ *
+ * Sets @p deviceId as the default device for the calling host thread.  Valid device ids are 0...
+ * (hipGetDeviceCount()-1).
+ *
+ * Many HIP APIs implicitly use the "default device" :
+ *
+ * - Any device memory subsequently allocated from this host thread (using hipMalloc) will be
+ * allocated on device.
+ * - Any streams or events created from this host thread will be associated with device.
+ * - Any kernels launched from this host thread (using hipLaunchKernel) will be executed on device
+ * (unless a specific stream is specified, in which case the device associated with that stream will
+ * be used).
+ *
+ * This function may be called from any host thread.  Multiple host threads may use the same device.
+ * This function does no synchronization with the previous or new device, and has very little
+ * runtime overhead. Applications can use hipSetDevice to quickly switch the default device before
+ * making a HIP runtime call which uses the default device.
+ *
+ * The default device is stored in thread-local-storage for each thread.
+ * Thread-pool implementations may inherit the default device of the previous thread.  A good
+ * practice is to always call hipSetDevice at the start of HIP coding sequence to establish a known
+ * standard device.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorNoDevice
+ *
+ * @see #hipGetDevice, #hipGetDeviceCount
+ */
+hipError_t hipSetDevice(int deviceId);
+/**
+ * @brief Set a list of devices that can be used.
+ *
+ * @param[in] device_arr List of devices to try
+ * @param[in] len Number of devices in specified list
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ *
+ * @see #hipGetDevice, #hipGetDeviceCount, #hipSetDevice, #hipGetDeviceProperties,
+ * #hipSetDeviceFlags, #hipChooseDevice
+ *
+ * */
+hipError_t hipSetValidDevices(int* device_arr, int len);
+/**
+ * @brief Return the default device id for the calling host thread.
+ *
+ * @param [out] deviceId *deviceId is written with the default device
+ *
+ * HIP maintains a default device for each thread using thread-local-storage.
+ * This device is used implicitly for HIP runtime APIs called by this thread.
+ * hipGetDevice returns in * @p deviceId the default device for the calling host thread.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ *
+ * @see hipSetDevice, hipGetDeviceCount
+ */
+hipError_t hipGetDevice(int* deviceId);
+/**
+ * @brief Return number of compute-capable devices.
+ *
+ * @param [out] count Returns number of compute-capable devices.
+ *
+ * @returns #hipSuccess, #hipErrorNoDevice
+ *
+ *
+ * Returns in @p *count the number of devices that have ability to run compute commands.  If there
+ * are no such devices, then @ref hipGetDeviceCount will return #hipErrorNoDevice. If 1 or more
+ * devices can be found, then hipGetDeviceCount returns #hipSuccess.
+ */
+hipError_t hipGetDeviceCount(int* count);
+/**
+ * @brief Query for a specific device attribute.
+ *
+ * @param [out] pi pointer to value to return
+ * @param [in] attr attribute to query
+ * @param [in] deviceId which device to query for information
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ */
+hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int deviceId);
+/**
+ * @brief Returns the default memory pool of the specified device
+ *
+ * @param [out] mem_pool Default memory pool to return
+ * @param [in] device    Device index to query for the default memory pool
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @see hipDeviceGetDefaultMemPool, hipMallocAsync, hipMemPoolTrimTo, hipMemPoolGetAttribute,
+ * hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess
+ *
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ */
+hipError_t hipDeviceGetDefaultMemPool(hipMemPool_t* mem_pool, int device);
+/**
+ * @brief Sets the current memory pool of a device
+ *
+ * The memory pool must be local to the specified device.
+ * @p hipMallocAsync allocates from the current mempool of the provided stream's device.
+ * By default, a device's current memory pool is its default memory pool.
+ *
+ * @note Use @p hipMallocFromPoolAsync for asynchronous memory allocations from a device
+ * different than the one the stream runs on.
+ *
+ * @param [in] device   Device index for the update
+ * @param [in] mem_pool Memory pool to set as the current pool on the specified device
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice, #hipErrorNotSupported
+ *
+ * @see hipDeviceGetDefaultMemPool, hipMallocAsync, hipMemPoolTrimTo, hipMemPoolGetAttribute,
+ * hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess
+ *
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ */
+hipError_t hipDeviceSetMemPool(int device, hipMemPool_t mem_pool);
+/**
+ * @brief Gets the current memory pool for the specified device
+ *
+ * Returns the last pool provided to @p hipDeviceSetMemPool for this device
+ * or the device's default memory pool if @p hipDeviceSetMemPool has never been called.
+ * By default the current mempool is the default mempool for a device,
+ * otherwise the returned pool must have been set with @p hipDeviceSetMemPool.
+ *
+ * @param [out] mem_pool Current memory pool on the specified device
+ * @param [in] device    Device index to query the current memory pool
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @see hipDeviceGetDefaultMemPool, hipMallocAsync, hipMemPoolTrimTo, hipMemPoolGetAttribute,
+ * hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess
+ *
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ */
+hipError_t hipDeviceGetMemPool(hipMemPool_t* mem_pool, int device);
+/**
+ * @brief Returns device properties.
+ *
+ * @param [out] prop written with device properties
+ * @param [in]  deviceId which device to query for information
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ * @bug HIP-Clang always returns 0 for maxThreadsPerMultiProcessor
+ * @bug HIP-Clang always returns 0 for regsPerBlock
+ * @bug HIP-Clang always returns 0 for l2CacheSize
+ *
+ * Populates @p prop with information for the specified device.
+ */
+hipError_t hipGetDeviceProperties(hipDeviceProp_t* prop, int deviceId);
+/**
+ * @brief Gets the maximum width for 1D linear textures on the specified device
+ *
+ * This function queries the maximum width, in elements, of 1D linear textures that can be allocated
+ * on the specified device. The maximum width depends on the texture element size and the hardware
+ * limitations of the device.
+ *
+ * @param [out] max_width Maximum width, in elements, of 1D linear textures that the device can
+ * support
+ * @param [in] desc       Requested channel format
+ * @param [in] device     Device index to query for maximum 1D texture width
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice
+ *
+ * @see hipDeviceGetAttribute, hipMalloc, hipTexRefSetAddressMode
+ */
+hipError_t hipDeviceGetTexture1DLinearMaxWidth(size_t* max_width, const hipChannelFormatDesc* desc,
+                                               int device);
+/**
+ * @brief Set L1/Shared cache partition.
+ *
+ * @param [in] cacheConfig Cache configuration
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorNotSupported
+ *
+ * Note: AMD devices do not support reconfigurable cache. This API is not implemented
+ * on AMD platform. If the function is called, it will return hipErrorNotSupported.
+ *
+ */
+hipError_t hipDeviceSetCacheConfig(hipFuncCache_t cacheConfig);
+/**
+ * @brief Get Cache configuration for a specific Device
+ *
+ * @param [out] cacheConfig Pointer of cache configuration
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized
+ * Note: AMD devices do not support reconfigurable cache. This hint is ignored
+ * on these architectures.
+ *
+ */
+hipError_t hipDeviceGetCacheConfig(hipFuncCache_t* cacheConfig);
+/**
+ * @brief Gets resource limits of current device
+ *
+ * The function queries the value of the limit selected by the input enum value hipLimit_t,
+ * which can be either #hipLimitStackSize or #hipLimitMallocHeapSize. For any other input,
+ * the function will return #hipErrorUnsupportedLimit.
+ *
+ * @param [out] pValue Returns the size of the limit in bytes
+ * @param [in]  limit The limit to query
+ *
+ * @returns #hipSuccess, #hipErrorUnsupportedLimit, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipDeviceGetLimit(size_t* pValue, enum hipLimit_t limit);
+/**
+ * @brief Sets resource limits of current device.
+ *
+ * As the input enum limit,
+ * #hipLimitStackSize sets the limit value of the stack size on the current GPU device, per thread.
+ * The limit can be read via hipDeviceGetLimit. The size is in units of 256 dwords, up to the limit
+ * (128K - 16).
+ *
+ * #hipLimitMallocHeapSize sets the limit value of the heap used by the malloc()/free()
+ * calls. For limit size, use the #hipDeviceGetLimit API.
+ *
+ * For any other input, the function will return hipErrorUnsupportedLimit.
+ *
+ * @param [in] limit Enum of hipLimit_t to set
+ * @param [in] value The size of limit value in bytes
+ *
+ * @returns #hipSuccess, #hipErrorUnsupportedLimit, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipDeviceSetLimit(enum hipLimit_t limit, size_t value);
+/**
+ * @brief Returns bank width of shared memory for current device
+ *
+ * @param [out] pConfig The pointer of the bank width for shared memory
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ *
+ * Note: AMD devices and some Nvidia GPUS do not support shared cache banking, and the hint is
+ * ignored on those architectures.
+ *
+ */
+hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig* pConfig);
+/**
+ * @brief Gets the flags set for current device
+ *
+ * @param [out] flags Pointer of the flags
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ */
+hipError_t hipGetDeviceFlags(unsigned int* flags);
+/**
+ * @brief The bank width of shared memory on current device is set
+ *
+ * @param [in] config Configuration for the bank width of shared memory
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ *
+ * Note: AMD devices and some Nvidia GPUS do not support shared cache banking, and the hint is
+ * ignored on those architectures.
+ *
+ */
+hipError_t hipDeviceSetSharedMemConfig(hipSharedMemConfig config);
+/**
+ * @brief The current device behavior is changed according to the flags passed.
+ *
+ * @param [in] flags Flag to set on the current device
+ *
+ * The schedule flags impact how HIP waits for the completion of a command running on a device.
+ *
+ * #hipDeviceScheduleSpin         : HIP runtime will actively spin in the thread which submitted
+ * the work until the command completes.  This offers the lowest latency, but will consume a CPU
+ * core and may increase power.
+ *
+ * #hipDeviceScheduleYield        : The HIP runtime will yield the CPU to system so that other
+ * tasks can use it. This may increase latency to detect the completion but will consume less
+ * power and is friendlier to other tasks in the system.
+ *
+ * #hipDeviceScheduleBlockingSync : On ROCm platform, this is a synonym for hipDeviceScheduleYield.
+ *
+ * #hipDeviceScheduleAuto         : This is the default value if the input 'flags' is zero.
+ * Uses a heuristic to select between Spin and Yield modes. If the number of HIP contexts is
+ * greater than the number of logical processors in the system, uses Spin scheduling, otherwise
+ * uses Yield scheduling.
+ *
+ * #hipDeviceMapHost              : Allows mapping host memory. On ROCm, this is always allowed and
+ * the flag is ignored.
+ *
+ * #hipDeviceLmemResizeToMax      : This flag is silently ignored on ROCm.
+ *
+ * @returns #hipSuccess, #hipErrorNoDevice, #hipErrorInvalidDevice, #hipErrorSetOnActiveProcess
+ *
+ *
+ */
+hipError_t hipSetDeviceFlags(unsigned flags);
+/**
+ * @brief Device which matches hipDeviceProp_t is returned
+ *
+ * @param [out] device Pointer of the device
+ * @param [in]  prop Pointer of the properties
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipChooseDevice(int* device, const hipDeviceProp_t* prop);
+/**
+ * @brief Returns the link type and hop count between two devices
+ *
+ * @param [in] device1 Ordinal for device1
+ * @param [in] device2 Ordinal for device2
+ * @param [out] linktype Returns the link type (See hsa_amd_link_info_type_t) between the two
+ * devices
+ * @param [out] hopcount Returns the hop count between the two devices
+ *
+ * Queries and returns the HSA link type and the hop count between the two specified devices.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipExtGetLinkTypeAndHopCount(int device1, int device2, uint32_t* linktype,
+                                        uint32_t* hopcount);
+// TODO: implement IPC apis
+/**
+ * @brief Gets an interprocess memory handle for an existing device memory
+ *          allocation
+ *
+ * Takes a pointer to the base of an existing device memory allocation created
+ * with hipMalloc and exports it for use in another process. This is a
+ * lightweight operation and may be called multiple times on an allocation
+ * without adverse effects.
+ *
+ * If a region of memory is freed with hipFree and a subsequent call
+ * to hipMalloc returns memory with the same device address,
+ * hipIpcGetMemHandle will return a unique handle for the
+ * new memory.
+ *
+ * @param handle - Pointer to user allocated hipIpcMemHandle to return
+ *                    the handle in.
+ * @param devPtr - Base pointer to previously allocated device memory
+ *
+ * @returns #hipSuccess, #hipErrorInvalidHandle, #hipErrorOutOfMemory, #hipErrorMapFailed
+ *
+ * @note This IPC memory related feature API on Windows may behave differently from Linux.
+ *
+ */
+hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr);
+/**
+ * @brief Opens an interprocess memory handle exported from another process
+ *          and returns a device pointer usable in the local process.
+ *
+ * Maps memory exported from another process with hipIpcGetMemHandle into
+ * the current device address space. For contexts on different devices
+ * hipIpcOpenMemHandle can attempt to enable peer access between the
+ * devices as if the user called hipDeviceEnablePeerAccess. This behavior is
+ * controlled by the hipIpcMemLazyEnablePeerAccess flag.
+ * hipDeviceCanAccessPeer can determine if a mapping is possible.
+ *
+ * Contexts that may open hipIpcMemHandles are restricted in the following way.
+ * hipIpcMemHandles from each device in a given process may only be opened
+ * by one context per device per other process.
+ *
+ * Memory returned from hipIpcOpenMemHandle must be freed with
+ * hipIpcCloseMemHandle.
+ *
+ * Calling hipFree on an exported memory region before calling
+ * hipIpcCloseMemHandle in the importing context will result in undefined
+ * behavior.
+ *
+ * @param devPtr - Returned device pointer
+ * @param handle - hipIpcMemHandle to open
+ * @param flags  - Flags for this operation. Must be specified as hipIpcMemLazyEnablePeerAccess
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidContext,
+ *  #hipErrorInvalidDevicePointer
+ *
+ * @note With multiple processes using the same memory handle opened by the current context,
+ * there is no guarantee that the same device pointer will be returned in @p *devPtr.
+ * This is different from CUDA.
+ * @note This IPC memory related feature API on Windows may behave differently from Linux.
+ *
+ */
+hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned int flags);
+/**
+ * @brief Close memory mapped with hipIpcOpenMemHandle
+ *
+ * Unmaps memory returned by hipIpcOpenMemHandle. The original allocation
+ * in the exporting process as well as imported mappings in other processes
+ * will be unaffected.
+ *
+ * Any resources used to enable peer access will be freed if this is the
+ * last mapping using them.
+ *
+ * @param devPtr - Device pointer returned by hipIpcOpenMemHandle
+ *
+ * @returns #hipSuccess, #hipErrorMapFailed, #hipErrorInvalidHandle
+ *
+ * @note This IPC memory related feature API on Windows may behave differently from Linux.
+ *
+ */
+hipError_t hipIpcCloseMemHandle(void* devPtr);
+
+/**
+ * @brief Gets an opaque interprocess handle for an event.
+ *
+ * This opaque handle may be copied into other processes and opened with hipIpcOpenEventHandle.
+ * Then hipEventRecord, hipEventSynchronize, hipStreamWaitEvent and hipEventQuery may be used in
+ * either process. Operations on the imported event after the exported event has been freed with
+ * hipEventDestroy will result in undefined behavior.
+ *
+ * @param[out]  handle Pointer to hipIpcEventHandle to return the opaque event handle
+ * @param[in]   event  Event allocated with hipEventInterprocess and hipEventDisableTiming flags
+ *
+ * @returns #hipSuccess, #hipErrorInvalidConfiguration, #hipErrorInvalidValue
+ *
+ * @note This IPC event related feature API is currently applicable on Linux.
+ *
+ */
+hipError_t hipIpcGetEventHandle(hipIpcEventHandle_t* handle, hipEvent_t event);
+
+/**
+ * @brief Opens an interprocess event handle.
+ *
+ * Opens an interprocess event handle exported from another process with hipIpcGetEventHandle. The
+ * returned hipEvent_t behaves like a locally created event with the hipEventDisableTiming flag
+ * specified. This event must be freed with hipEventDestroy. Operations on the imported event after
+ * the exported event has been freed with hipEventDestroy will result in undefined behavior. If the
+ * function is called within the same process where handle is returned by hipIpcGetEventHandle, it
+ * will return hipErrorInvalidContext.
+ *
+ * @param[out]  event  Pointer to hipEvent_t to return the event
+ * @param[in]   handle The opaque interprocess handle to open
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidContext
+ *
+ * @note This IPC event related feature API is currently applicable on Linux.
+ *
+ */
+hipError_t hipIpcOpenEventHandle(hipEvent_t* event, hipIpcEventHandle_t handle);
+
+// end doxygen Device
+/**
+ * @}
+ */
+/**
+ *
+ *  @defgroup Execution Execution Control
+ *  @{
+ *  This section describes the execution control functions of HIP runtime API.
+ *
+ */
+/**
+ * @brief Set attribute for a specific function
+ *
+ * @param [in] func Pointer of the function
+ * @param [in] attr Attribute to set
+ * @param [in] value Value to set
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDeviceFunction, #hipErrorInvalidValue
+ *
+ * Note: AMD devices and some Nvidia GPUS do not support shared cache banking, and the hint is
+ * ignored on those architectures.
+ *
+ */
+hipError_t hipFuncSetAttribute(const void* func, hipFuncAttribute attr, int value);
+/**
+ * @brief Set Cache configuration for a specific function
+ *
+ * @param [in] func Pointer of the function.
+ * @param [in] config Configuration to set.
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized
+ * Note: AMD devices and some Nvidia GPUS do not support reconfigurable cache.  This hint is ignored
+ * on those architectures.
+ *
+ */
+hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t config);
+/**
+ * @brief Set shared memory configuration for a specific function
+ *
+ * @param [in] func Pointer of the function
+ * @param [in] config Configuration
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDeviceFunction, #hipErrorInvalidValue
+ *
+ * Note: AMD devices and some Nvidia GPUS do not support shared cache banking, and the hint is
+ * ignored on those architectures.
+ *
+ */
+hipError_t hipFuncSetSharedMemConfig(const void* func, hipSharedMemConfig config);
+// doxygen end execution
+/**
+ * @}
+ */
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup Error Error Handling
+ *  @{
+ *  This section describes the error handling functions of HIP runtime API.
+ */
+/**
+ * @brief Return last error returned by any HIP runtime API call and resets the stored error code to
+ * #hipSuccess
+ *
+ * @returns return code from last HIP called from the active host thread
+ *
+ * Returns the last error that has been returned by any of the runtime calls in the same host
+ * thread, and then resets the saved error to #hipSuccess.
+ *
+ * @see hipGetErrorString, hipGetLastError, hipPeekAtLastError, hipError_t
+ */
+hipError_t hipGetLastError(void);
+
+/**
+ * @brief Return last error returned by any HIP runtime API call and resets the stored error code to
+ * #hipSuccess
+ *
+ * @returns return code from last HIP called from the active host thread
+ *
+ * Returns the last error that has been returned by any of the runtime calls in the same host
+ * thread, and then resets the saved error to #hipSuccess.
+ *
+ * @see hipGetErrorString, hipGetLastError, hipPeekAtLastError, hipError_t
+ */
+hipError_t hipExtGetLastError(void);
+
+/**
+ * @brief Return last error returned by any HIP runtime API call.
+ *
+ * @returns The last error returned by any HIP runtime call in the active host thread
+ *
+ * Returns the last error that has been returned by any of the runtime calls in the same host
+ * thread. Unlike hipGetLastError, this function does not reset the saved error code.
+ *
+ * @see hipGetErrorString, hipGetLastError, hipPeekAtLastError, hipError_t
+ */
+hipError_t hipPeekAtLastError(void);
+/**
+ * @brief Return hip error as text string form.
+ *
+ * @param hip_error Error code to convert to name.
+ * @returns const char pointer to the NULL-terminated error name
+ *
+ * @see hipGetErrorString, hipGetLastError, hipPeekAtLastError, hipError_t
+ */
+const char* hipGetErrorName(hipError_t hip_error);
+/**
+ * @brief Return handy text string message to explain the error which occurred
+ *
+ * @param hipError Error code to convert to string.
+ * @returns const char pointer to the NULL-terminated error string
+ *
+ * @see hipGetErrorName, hipGetLastError, hipPeekAtLastError, hipError_t
+ */
+const char* hipGetErrorString(hipError_t hipError);
+/**
+ * @brief Return hip error as text string form.
+ *
+ * @param [in] hipError Error code to convert to string.
+ * @param [out] errorString char pointer to the NULL-terminated error string
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @see hipGetErrorName, hipGetLastError, hipPeekAtLastError, hipError_t
+ */
+hipError_t hipDrvGetErrorName(hipError_t hipError, const char** errorString);
+/**
+ * @brief Return handy text string message to explain the error which occurred
+ *
+ * @param [in] hipError Error code to convert to string.
+ * @param [out] errorString char pointer to the NULL-terminated error string
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @see hipGetErrorName, hipGetLastError, hipPeekAtLastError, hipError_t
+ */
+hipError_t hipDrvGetErrorString(hipError_t hipError, const char** errorString);
+// end doxygen Error
+/**
+ * @}
+ */
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup Stream Stream Management
+ *  @{
+ *  This section describes the stream management functions of HIP runtime API.
+ *  The following Stream APIs are not (yet) supported in HIP:
+ *  - hipStreamAttachMemAsync is a nop
+ *  - hipDeviceGetStreamPriorityRange returns #hipSuccess
+ */
+
+/**
+ * @brief Creates an asynchronous stream.
+ *
+ * @param[in, out] stream  Valid pointer to hipStream_t.  This function writes the memory with the
+ * newly created stream.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * Creates a new asynchronous stream with its associated current device. The @p stream returns an
+ * opaque handle that can be used to reference the newly created stream in subsequent hipStream*
+ * commands. The stream is allocated on the heap and will remain allocated even if the handle goes
+ * out-of-scope. To release the memory used by the stream, the application must call
+ * hipStreamDestroy.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @see hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamSynchronize,
+ * hipStreamWaitEvent, hipStreamDestroy
+ */
+hipError_t hipStreamCreate(hipStream_t* stream);
+/**
+ * @brief Creates an asynchronous stream with flag.
+ *
+ * @param[in, out] stream  Pointer to new stream
+ * @param[in] flags  Parameters to control stream creation
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * Creates a new asynchronous stream with its associated current device. @p stream returns an
+ * opaque handle that can be used to reference the newly created stream in subsequent hipStream*
+ * commands. The stream is allocated on the heap and will remain allocated even if the handle
+ * goes out-of-scope. To release the memory used by the stream, application must call
+ * hipStreamDestroy.
+ *
+ * The @p flags parameter controls behavior of the stream. The valid values are #hipStreamDefault
+ * and #hipStreamNonBlocking.
+ *
+ * @see hipStreamCreate, hipStreamCreateWithPriority, hipStreamSynchronize, hipStreamWaitEvent,
+ * hipStreamDestroy.
+ *
+ */
+hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags);
+/**
+ * @brief Creates an asynchronous stream with the specified priority.
+ *
+ * @param[in, out] stream  Pointer to new stream
+ * @param[in] flags  Parameters to control stream creation
+ * @param[in] priority  Priority of the stream. Lower numbers represent higher priorities.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * Creates a new asynchronous stream with the specified priority, with its associated current
+ * device.
+ * @p stream returns an opaque handle that can be used to reference the newly created stream in
+ * subsequent hipStream* commands. The stream is allocated on the heap and will remain allocated
+ * even if the handle goes out-of-scope. To release the memory used by the stream, application must
+ * call hipStreamDestroy.
+ *
+ * The @p flags parameter controls behavior of the stream. The valid values are #hipStreamDefault
+ * and #hipStreamNonBlocking.
+ *
+ * @see hipStreamCreate, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy
+ *
+ */
+hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags, int priority);
+/**
+ * @brief Returns numerical values that correspond to the least and greatest stream priority.
+ *
+ * @param[in, out] leastPriority  Pointer in which a value corresponding to least priority
+ * is returned.
+ * @param[in, out] greatestPriority  Pointer in which a value corresponding to greatest priority
+ * is returned.
+ * @returns #hipSuccess
+ *
+ * Returns in *leastPriority and *greatestPriority the numerical values that correspond to the
+ * least and greatest stream priority respectively. Stream priorities follow a convention where
+ * lower numbers imply greater priorities. The range of meaningful stream priorities is given by
+ * [*leastPriority,*greatestPriority]. If the user attempts to create a stream with a priority
+ * value that is outside the meaningful range as specified by this API, the priority is
+ * automatically clamped to within the valid range.
+ *
+ * @warning This API is under development on AMD GPUs and simply returns #hipSuccess.
+ */
+hipError_t hipDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority);
+/**
+ * @brief Destroys the specified stream.
+ *
+ * @param[in] stream  Stream identifier
+ * @returns #hipSuccess, #hipErrorInvalidHandle
+ *
+ * Destroys the specified stream.
+ *
+ * If commands are still executing on the specified stream, some may complete execution before the
+ * queue is deleted.
+ *
+ * The queue may be destroyed while some commands are still inflight, or may wait for all commands
+ * queued to the stream before destroying it.
+ *
+ * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamQuery,
+ * hipStreamWaitEvent, hipStreamSynchronize
+ */
+hipError_t hipStreamDestroy(hipStream_t stream);
+/**
+ * @brief Returns #hipSuccess if all of the operations in the specified @p stream have completed, or
+ * #hipErrorNotReady if not.
+ *
+ * @param[in] stream  Stream to query
+ *
+ * @returns #hipSuccess, #hipErrorNotReady, #hipErrorInvalidHandle
+ *
+ * This is thread-safe and returns a snapshot of the current state of the queue.  However, if other
+ * host threads are sending work to the stream, the status may change immediately after the function
+ * is called.  It is typically used for debug.
+ *
+ * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamWaitEvent,
+ * hipStreamSynchronize, hipStreamDestroy
+ */
+hipError_t hipStreamQuery(hipStream_t stream);
+/**
+ * @brief Waits for all commands in the stream to complete.
+ *
+ * @param[in] stream  Stream identifier.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidHandle
+ *
+ * This command is host-synchronous : the host will block until all operations on the specified
+ * stream with its associated device are completed. On multiple device systems, the @p stream is
+ * associated with its device, no need to call hipSetDevice before this API.
+ *
+ * This command follows standard null-stream semantics. Specifying the null stream will cause the
+ * command to wait for other streams on the same device to complete all pending operations.
+ *
+ * This command honors the #hipDeviceScheduleBlockingSync flag, which controls whether the wait is
+ * active or blocking.
+ *
+ * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamWaitEvent,
+ * hipStreamDestroy
+ *
+ */
+hipError_t hipStreamSynchronize(hipStream_t stream);
+/**
+ * @brief Makes the specified compute stream wait for the specified event
+ *
+ * @param[in] stream  Stream to make wait
+ * @param[in] event  Event to wait on
+ * @param[in] flags  Parameters to control the operation
+ *
+ * @returns #hipSuccess, #hipErrorInvalidHandle, #hipErrorInvalidValue,
+ * #hipErrorStreamCaptureIsolation
+ *
+ * This function inserts a wait operation into the specified stream.
+ * All future work submitted to @p stream will wait until @p event reports completion before
+ * beginning execution.
+ *
+ * Flags include:
+ *   hipEventWaitDefault: Default event wait flag.
+ *   hipEventWaitExternal: Wait is captured in the graph as an external event node when
+ *                           performing stream capture
+ *
+ * This function only waits for commands in the current stream to complete.  Notably, this function
+ * does not implicitly wait for commands in the default stream to complete, even if the specified
+ * stream is created with hipStreamNonBlocking = 0.
+ *
+ * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamCreateWithPriority,
+ * hipStreamSynchronize, hipStreamDestroy
+ */
+hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags __dparm(0));
+/**
+ * @brief Returns flags associated with this stream.
+ *
+ * @param[in] stream  Stream to be queried
+ * @param[in,out] flags  Pointer to an unsigned integer in which the stream's flags are returned
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidHandle.
+ *
+ * @see hipStreamCreateWithFlags
+ */
+hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int* flags);
+/**
+ * @brief Queries the Id of a stream.
+ *
+ * @param[in] stream  Stream to be queried
+ * @param[in,out] streamId  Pointer to an unsigned long long in which the stream's id is returned
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidHandle.
+ *
+ * @see hipStreamCreateWithFlags, hipStreamGetFlags, hipStreamCreateWithPriority, hipStreamGetPriority
+ */
+hipError_t hipStreamGetId(hipStream_t stream, unsigned long long* streamId);
+/**
+ * @brief Queries the priority of a stream.
+ *
+ * @param[in] stream  Stream to be queried
+ * @param[in,out] priority  Pointer to an integer in which the stream's priority is
+ * returned
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidHandle.
+ *
+ * @see hipStreamCreateWithPriority
+ */
+hipError_t hipStreamGetPriority(hipStream_t stream, int* priority);
+/**
+ * @brief Gets the device associated with the stream.
+ *
+ * @param[in] stream  Stream to be queried
+ * @param[out] device  Device associated with the stream
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorContextIsDestroyed, #hipErrorInvalidHandle,
+ * #hipErrorNotInitialized, #hipErrorDeinitialized, #hipErrorInvalidContext
+ *
+ * @see hipStreamCreate, hipStreamDestroy, hipDeviceGetStreamPriorityRange
+ */
+hipError_t hipStreamGetDevice(hipStream_t stream, hipDevice_t* device);
+/**
+ * @brief Creates an asynchronous stream with the specified CU mask.
+ *
+ * @param[in, out] stream  Pointer to new stream
+ * @param[in] cuMaskSize  Size of CU mask bit array passed in.
+ * @param[in] cuMask Bit-vector representing the CU mask. Each active bit represents using one CU.
+ * The first 32 bits represent the first 32 CUs, and so on. If its size is greater than physical
+ * CU number (i.e., multiProcessorCount member of hipDeviceProp_t), the extra elements are ignored.
+ * It is user's responsibility to make sure the input is meaningful.
+ * @returns #hipSuccess, #hipErrorInvalidHandle, #hipErrorInvalidValue
+ *
+ * Creates  a new asynchronous stream with the specified CU mask.  @p stream returns an opaque
+ * handle that can be used to reference the newly created stream in subsequent hipStream* commands.
+ * The stream is allocated on the heap and will remain allocated even if the handle goes
+ * out-of-scope. To release the memory used by the stream, application must call hipStreamDestroy.
+ *
+ * @see hipStreamCreate, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy
+ */
+hipError_t hipExtStreamCreateWithCUMask(hipStream_t* stream, uint32_t cuMaskSize,
+                                        const uint32_t* cuMask);
+/**
+ * @brief Gets CU mask associated with an asynchronous stream
+ *
+ * @param[in] stream  Stream to be queried
+ * @param[in] cuMaskSize  Number of the block of memories (uint32_t *) allocated by user
+ * @param[out] cuMask  Pointer to a pre-allocated block of memories (uint32_t *) in which
+ * the stream's CU mask is returned. The CU mask is returned in chunks of 32 bits where
+ * each active bit represents one active CU.
+ * @returns #hipSuccess, #hipErrorInvalidHandle, #hipErrorInvalidValue
+ *
+ * @see hipStreamCreate, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy
+ */
+hipError_t hipExtStreamGetCUMask(hipStream_t stream, uint32_t cuMaskSize, uint32_t* cuMask);
+/**
+ * Stream callback function pointer type. The callback receives the stream, a status code and
+ * the user data pointer.
+ */
+typedef void (*hipStreamCallback_t)(hipStream_t stream, hipError_t status, void* userData);
+/**
+ * @brief Adds a callback to be called on the host after all currently enqueued items in the stream
+ * have completed.  For each hipStreamAddCallback call, a callback will be executed exactly once.
+ * The callback will block later work in the stream until it is finished.
+ *
+ * @param[in] stream   - Stream to add callback to
+ * @param[in] callback - The function to call once preceding stream operations are complete
+ * @param[in] userData - User specified data to be passed to the callback function
+ * @param[in] flags    - Reserved for future use, must be 0
+ * @returns #hipSuccess, #hipErrorInvalidHandle, #hipErrorNotSupported
+ *
+ * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamQuery, hipStreamSynchronize,
+ * hipStreamWaitEvent, hipStreamDestroy, hipStreamCreateWithPriority
+ *
+ */
+hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback, void* userData,
+                                unsigned int flags);
+
+/**
+ * @brief Sets stream attribute. Updated attribute is applied to work submitted to the stream.
+ * @param[in] stream - Stream to set attributes to
+ * @param[in] attr   - Attribute ID for the attribute to set
+ * @param[in] value  - Attribute value for the attribute to set
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidResourceHandle
+ */
+hipError_t hipStreamSetAttribute(hipStream_t stream, hipStreamAttrID attr,
+                                 const hipStreamAttrValue* value);
+
+/**
+ * @brief Queries stream attribute.
+ * @param[in] stream - Stream to get attributes from
+ * @param[in] attr   - Attribute ID for the attribute to query
+ * @param[out] value_out - Attribute value output
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidResourceHandle
+ */
+hipError_t hipStreamGetAttribute(hipStream_t stream, hipStreamAttrID attr,
+                                 hipStreamAttrValue* value_out);
+
+// end doxygen Stream
+/**
+ * @}
+ */
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup StreamM Stream Memory Operations
+ *  @{
+ *  This section describes Stream Memory Wait and Write functions of HIP runtime API.
+ */
+
+/**
+ * @brief Enqueues a wait command to the stream.[BETA]
+ *
+ * @param [in] stream - Stream identifier
+ * @param [in] ptr    - Pointer to memory object allocated using #hipMallocSignalMemory flag
+ * @param [in] value  - Value to be used in compare operation
+ * @param [in] flags  - Defines the compare operation, supported values are #hipStreamWaitValueGte,
+ * #hipStreamWaitValueEq, #hipStreamWaitValueAnd and #hipStreamWaitValueNor
+ * @param [in] mask   - Mask to be applied on value at memory before it is compared with value,
+ * default value is set to enable every bit
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * Enqueues a wait command to the stream, all operations enqueued  on this stream after this, will
+ * not execute until the defined wait condition is true.
+ *
+ * #hipStreamWaitValueGte: waits until *ptr&mask >= value
+ *
+ * #hipStreamWaitValueEq : waits until *ptr&mask == value
+ *
+ * #hipStreamWaitValueAnd: waits until ((*ptr&mask) & value) != 0
+ *
+ * #hipStreamWaitValueNor: waits until ~((*ptr&mask) | (value&mask)) != 0
+ *
+ * @note when using #hipStreamWaitValueNor, mask is applied on both 'value' and '*ptr'.
+ *
+ * @note Support for #hipStreamWaitValue32 can be queried using 'hipDeviceGetAttribute()' and
+ * 'hipDeviceAttributeCanUseStreamWaitValue' flag.
+ *
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @see hipExtMallocWithFlags, hipFree, hipStreamWaitValue64, hipStreamWriteValue64,
+ * hipStreamWriteValue32, hipDeviceGetAttribute
+ */
+
+hipError_t hipStreamWaitValue32(hipStream_t stream, void* ptr, uint32_t value, unsigned int flags,
+                                uint32_t mask __dparm(0xFFFFFFFF));
+
+/**
+ * @brief Enqueues a wait command to the stream.[BETA]
+ *
+ * @param [in] stream - Stream identifier
+ * @param [in] ptr    - Pointer to memory object allocated using 'hipMallocSignalMemory' flag
+ * @param [in] value  - Value to be used in compare operation
+ * @param [in] flags  - Defines the compare operation, supported values are #hipStreamWaitValueGte,
+ * #hipStreamWaitValueEq, #hipStreamWaitValueAnd and #hipStreamWaitValueNor.
+ * @param [in] mask   - Mask to be applied on value at memory before it is compared with value
+ * default value is set to enable every bit
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * Enqueues a wait command to the stream, all operations enqueued  on this stream after this, will
+ * not execute until the defined wait condition is true.
+ *
+ * #hipStreamWaitValueGte: waits until *ptr&mask >= value
+ *
+ * #hipStreamWaitValueEq : waits until *ptr&mask == value
+ *
+ * #hipStreamWaitValueAnd: waits until ((*ptr&mask) & value) != 0
+ *
+ * #hipStreamWaitValueNor: waits until ~((*ptr&mask) | (value&mask)) != 0
+ *
+ * @note when using #hipStreamWaitValueNor, mask is applied on both 'value' and '*ptr'.
+ *
+ * @note Support for hipStreamWaitValue64 can be queried using 'hipDeviceGetAttribute()' and
+ * 'hipDeviceAttributeCanUseStreamWaitValue' flag.
+ *
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @see hipExtMallocWithFlags, hipFree, hipStreamWaitValue32, hipStreamWriteValue64,
+ * hipStreamWriteValue32, hipDeviceGetAttribute
+ */
+
+hipError_t hipStreamWaitValue64(hipStream_t stream, void* ptr, uint64_t value, unsigned int flags,
+                                uint64_t mask __dparm(0xFFFFFFFFFFFFFFFF));
+
+/**
+ * @brief Enqueues a write command to the stream.[BETA]
+ *
+ * @param [in] stream - Stream identifier
+ * @param [in] ptr    - Pointer to a GPU accessible memory object
+ * @param [in] value  - Value to be written
+ * @param [in] flags  - reserved, ignored for now, will be used in future releases
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * Enqueues a write command to the stream, write operation is performed after all earlier commands
+ * on this stream have completed the execution.
+ *
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @see hipExtMallocWithFlags, hipFree, hipStreamWriteValue32, hipStreamWaitValue32,
+ * hipStreamWaitValue64
+ */
+
+hipError_t hipStreamWriteValue32(hipStream_t stream, void* ptr, uint32_t value, unsigned int flags);
+/**
+ * @brief Enqueues a write command to the stream.[BETA]
+ *
+ * @param [in] stream - Stream identifier
+ * @param [in] ptr    - Pointer to a GPU accessible memory object
+ * @param [in] value  - Value to be written
+ * @param [in] flags  - reserved, ignored for now, will be used in future releases
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * Enqueues a write command to the stream, write operation is performed after all earlier commands
+ * on this stream have completed the execution.
+ *
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @see hipExtMallocWithFlags, hipFree, hipStreamWriteValue32, hipStreamWaitValue32,
+ * hipStreamWaitValue64
+ */
+
+hipError_t hipStreamWriteValue64(hipStream_t stream, void* ptr, uint64_t value, unsigned int flags);
+
+/**
+ * @brief Enqueues an array of stream memory operations in the stream.[BETA]
+ *
+ * @param [in] stream      - Stream identifier
+ * @param [in] count       - The number of operations in the array. Must be less than 256
+ * @param [in] paramArray  - The types and parameters of the individual operations.
+ * @param [in] flags       - Reserved for future expansion; must be 0.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * Batch operations to synchronize the stream via memory operations.
+ *
+ * @warning This API is marked as beta, meaning, while this is feature complete,
+ * it is still open to changes and may have outstanding issues.
+ *
+ * @see hipStreamWriteValue32, hipStreamWaitValue32,
+ * hipStreamWaitValue64, hipStreamWriteValue64
+ */
+
+hipError_t hipStreamBatchMemOp(hipStream_t stream, unsigned int count,
+                               hipStreamBatchMemOpParams* paramArray, unsigned int flags);
+
+/**
+ * @brief Creates a batch memory operation node and adds it to a graph.[BETA]
+ *
+ * @param [out] phGraphNode     - Returns the newly created node
+ * @param [in] hGraph           - Graph to which to add the node
+ * @param [in] dependencies     -  Dependencies of the node
+ * @param [in] numDependencies  - Number of dependencies
+ * @param [in] nodeParams       - Parameters for the node
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @warning This API is marked as beta, meaning, while this is feature complete,
+ * it is still open to changes and may have outstanding issues.
+ *
+ * @see hipStreamWriteValue32, hipStreamWaitValue32,
+ * hipStreamWaitValue64, hipStreamWriteValue64, hipStreamBatchMemOp
+ */
+hipError_t hipGraphAddBatchMemOpNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
+                                     const hipGraphNode_t* dependencies, size_t numDependencies,
+                                     const hipBatchMemOpNodeParams* nodeParams);
+
+/**
+ * @brief Returns a batch mem op node's parameters.[BETA]
+ *
+ * @param [in] hNode           - Node to get the parameters for
+ * @param [out] nodeParams_out - Pointer to return the parameters
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * Returns the parameters of batch mem op node hNode in nodeParams_out.
+ * The paramArray returned in nodeParams_out is owned by the node.
+ * This memory remains valid until the node is destroyed or its parameters are modified,
+ * and should not be modified directly.
+ *
+ * @warning This API is marked as beta, meaning, while this is feature complete,
+ * it is still open to changes and may have outstanding issues.
+ *
+ * @see hipStreamWriteValue32, hipStreamWaitValue32,
+ * hipStreamWaitValue64, hipStreamWriteValue64, hipGraphBatchMemOpNodeSetParams
+ */
+
+hipError_t hipGraphBatchMemOpNodeGetParams(hipGraphNode_t hNode,
+                                           hipBatchMemOpNodeParams* nodeParams_out);
+
+/**
+ * @brief Sets the batch mem op node's parameters.[BETA]
+ *
+ * @param [in] hNode       - Node to set the parameters for
+ * @param [in] nodeParams  - Parameters to copy
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * Sets the parameters of batch mem op node hNode to nodeParams.
+ *
+ * @warning This API is marked as beta, meaning, while this is feature complete,
+ * it is still open to changes and may have outstanding issues.
+ *
+ * @see hipStreamWriteValue32, hipStreamWaitValue32,
+ * hipStreamWaitValue64, hipStreamWriteValue64, hipGraphBatchMemOpNodeGetParams
+ */
+
+hipError_t hipGraphBatchMemOpNodeSetParams(hipGraphNode_t hNode,
+                                           hipBatchMemOpNodeParams* nodeParams);
+
+/**
+ * @brief Sets the parameters for a batch mem op node in the given graphExec.[BETA]
+ *
+ * @param [in] hGraphExec  - The executable graph in which to set the specified node
+ * @param [in] hNode       - Batch mem op node from the graph from which graphExec was instantiated
+ * @param [in] nodeParams  - Updated Parameters to set
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * Sets the parameters of a batch mem op node in an executable graph hGraphExec.
+ * The node is identified by the corresponding node hNode in the non-executable graph,
+ * from which the executable graph was instantiated.
+ *
+ * @warning This API is marked as beta, meaning, while this is feature complete,
+ * it is still open to changes and may have outstanding issues.
+ *
+ * @see hipStreamWriteValue32, hipStreamWaitValue32,
+ * hipStreamWaitValue64, hipStreamWriteValue64, hipStreamBatchMemOp
+ */
+hipError_t hipGraphExecBatchMemOpNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
+                                               const hipBatchMemOpNodeParams* nodeParams);
+
+// end doxygen Stream Memory Operations
+/**
+ * @}
+ */
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup Event Event Management
+ *  @{
+ *  This section describes the event management functions of HIP runtime API.
+ */
+/**
+ * @brief Create an event with the specified flags
+ *
+ * @param[in,out] event Returns the newly created event.
+ * @param[in] flags     Flags to control event behavior.  Valid values are #hipEventDefault,
+ * #hipEventBlockingSync, #hipEventDisableTiming, #hipEventInterprocess.
+ * #hipEventDefault : Default flag.  The event will use active synchronization and will support
+ * timing.  Active synchronization provides lowest possible latency at the expense of dedicating
+ * a CPU to poll on the event.
+ * #hipEventBlockingSync : The event will use blocking synchronization : if hipEventSynchronize is
+ * called on this event, the thread will block until the event completes.  This can increase
+ * latency for the synchronization but can result in lower power and more resources for other
+ * CPU threads.
+ * #hipEventDisableTiming : Disable recording of timing information. Events created with this flag
+ * would not record profiling data and provide best performance if used for synchronization.
+ * #hipEventInterprocess : The event can be used as an interprocess event. hipEventDisableTiming
+ * flag also must be set when hipEventInterprocess flag is set.
+ * #hipEventDisableSystemFence : Disable acquire and release system scope fence. This may
+ * improve performance but device memory may not be visible to the host and other devices
+ * if this flag is set.
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue,
+ * #hipErrorLaunchFailure, #hipErrorOutOfMemory
+ *
+ * @see hipEventCreate, hipEventSynchronize, hipEventDestroy, hipEventElapsedTime
+ */
+hipError_t hipEventCreateWithFlags(hipEvent_t* event, unsigned flags);
+/**
+ * @brief Create an event
+ *
+ * @param[in,out] event Returns the newly created event.
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue,
+ * #hipErrorLaunchFailure, #hipErrorOutOfMemory
+ *
+ * @see hipEventCreateWithFlags, hipEventRecord, hipEventQuery, hipEventSynchronize,
+ * hipEventDestroy, hipEventElapsedTime
+ */
+hipError_t hipEventCreate(hipEvent_t* event);
+/**
+ * @brief Record an event in the specified stream.
+ *
+ * @param[in] event event to record.
+ * @param[in] stream stream in which to record event.
+ * @param[in] flags parameter for operations
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized,
+ * #hipErrorInvalidHandle, #hipErrorLaunchFailure
+ *
+ * hipEventQuery() or hipEventSynchronize() must be used to determine when the event
+ * transitions from "recording" (after hipEventRecord() is called) to "recorded"
+ * (when timestamps are set, if requested).
+ *
+ * Events which are recorded in a non-NULL stream will transition
+ * from "recording" to "recorded" state when they reach the head of
+ * the specified stream, after all previous
+ * commands in that stream have completed executing.
+ *
+ * Flags include:
+ *   hipEventRecordDefault: Default event record flag.
+ *   hipEventRecordExternal: Event is captured in the graph as an external event node when
+ *                           performing stream capture
+ *
+ * If hipEventRecord() has been previously called on this event, then this call will overwrite any
+ * existing state in event.
+ *
+ * If this function is called on an event that is currently being recorded, results are undefined
+ * - either outstanding recording may save state into the event, and the order is not guaranteed.
+ *
+ * @note If this function is not called before using hipEventQuery() or hipEventSynchronize(),
+ * #hipSuccess is returned, meaning no pending event in the stream.
+ *
+ * @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventSynchronize,
+ * hipEventDestroy, hipEventElapsedTime
+ *
+ */
+hipError_t hipEventRecordWithFlags(hipEvent_t event, hipStream_t stream __dparm(0),
+                                   unsigned int flags __dparm(0));
+/**
+ * @brief Record an event in the specified stream.
+ *
+ * @param[in] event event to record.
+ * @param[in] stream stream in which to record event.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized,
+ * #hipErrorInvalidHandle, #hipErrorLaunchFailure
+ *
+ * hipEventQuery() or hipEventSynchronize() must be used to determine when the event
+ * transitions from "recording" (after hipEventRecord() is called) to "recorded"
+ * (when timestamps are set, if requested).
+ *
+ * Events which are recorded in a non-NULL stream will transition
+ * from "recording" to "recorded" state when they reach the head of
+ * the specified stream, after all previous
+ * commands in that stream have completed executing.
+ *
+ * If hipEventRecord() has been previously called on this event, then this call will overwrite any
+ * existing state in event.
+ *
+ * If this function is called on an event that is currently being recorded, results are undefined
+ * - either outstanding recording may save state into the event, and the order is not guaranteed.
+ *
+ * @note If this function is not called before using hipEventQuery() or hipEventSynchronize(),
+ * #hipSuccess is returned, meaning no pending event in the stream.
+ *
+ * @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventSynchronize,
+ * hipEventDestroy, hipEventElapsedTime
+ *
+ */
+#ifdef __cplusplus
+hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream = NULL);
+#else
+hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream);
+#endif
+/**
+ *  @brief Destroy the specified event.
+ *
+ *  @param[in] event Event to destroy.
+ *  @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue,
+ * #hipErrorLaunchFailure
+ *
+ *  Releases memory associated with the event.  If the event is recording but has not completed
+ * recording when hipEventDestroy() is called, the function will return immediately and the
+ * completion_future resources will be released later, when the hipDevice is synchronized.
+ *
+ * @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventSynchronize, hipEventRecord,
+ * hipEventElapsedTime
+ */
+hipError_t hipEventDestroy(hipEvent_t event);
+/**
+ *  @brief Wait for an event to complete.
+ *
+ *  This function will block until the event is ready, waiting for all previous work in the stream
+ * specified when event was recorded with hipEventRecord().
+ *
+ *  If hipEventRecord() has not been called on @p event, this function returns #hipSuccess when no
+ *  event is captured.
+ *
+ *
+ *  @param[in] event Event on which to wait.
+ *
+ *  @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized,
+ * #hipErrorInvalidHandle, #hipErrorLaunchFailure
+ *
+ *  @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventDestroy, hipEventRecord,
+ * hipEventElapsedTime
+ */
+hipError_t hipEventSynchronize(hipEvent_t event);
+/**
+ * @brief Return the elapsed time between two events.
+ *
+ * @param[out] ms : Return time between start and stop in ms.
+ * @param[in]   start : Start event.
+ * @param[in]   stop  : Stop event.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotReady, #hipErrorInvalidHandle,
+ * #hipErrorNotInitialized, #hipErrorLaunchFailure
+ *
+ * Computes the elapsed time between two events. Time is computed in ms, with
+ * a resolution of approximately 1 us.
+ *
+ * Events which are recorded in a NULL stream will block until all commands
+ * on all other streams complete execution, and then record the timestamp.
+ *
+ * Events which are recorded in a non-NULL stream will record their timestamp
+ * when they reach the head of the specified stream, after all previous
+ * commands in that stream have completed executing.  Thus the time that
+ * the event recorded may be significantly after the host calls hipEventRecord().
+ *
+ * If hipEventRecord() has not been called on either event, then #hipErrorInvalidHandle is
+ * returned. If hipEventRecord() has been called on both events, but the timestamp has not yet been
+ * recorded on one or both events (that is, hipEventQuery() would return #hipErrorNotReady on at
+ * least one of the events), then #hipErrorNotReady is returned.
+ *
+ * @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventDestroy, hipEventRecord,
+ * hipEventSynchronize
+ */
+hipError_t hipEventElapsedTime(float* ms, hipEvent_t start, hipEvent_t stop);
+/**
+ * @brief Query event status
+ *
+ * @param[in] event Event to query.
+ * @returns #hipSuccess, #hipErrorNotReady, #hipErrorInvalidHandle, #hipErrorInvalidValue,
+ * #hipErrorNotInitialized, #hipErrorLaunchFailure
+ *
+ * Query the status of the specified event.  This function will return #hipSuccess if all
+ * commands in the appropriate stream (specified to hipEventRecord()) have completed.  If any
+ * execution has not completed, then #hipErrorNotReady is returned.
+ *
+ * @note This API returns #hipSuccess, if hipEventRecord() is not called before this API.
+ *
+ * @see hipEventCreate, hipEventCreateWithFlags, hipEventRecord, hipEventDestroy,
+ * hipEventSynchronize, hipEventElapsedTime
+ */
+hipError_t hipEventQuery(hipEvent_t event);
+// end doxygen Events
+/**
+ * @}
+ */
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup Memory Memory Management
+ *  @{
+ *  This section describes the memory management functions of HIP runtime API.
+ *  The following CUDA APIs are not currently supported:
+ *  - cudaMalloc3D
+ *  - cudaMalloc3DArray
+ *  - TODO - more 2D, 3D, array APIs here.
+ *
+ *
+ */
+
+/**
+ *  @brief Sets information on the specified pointer.[BETA]
+ *
+ *  @param [in]      value     Sets pointer attribute value
+ *  @param [in]      attribute  Attribute to set
+ *  @param [in]      ptr      Pointer to set attributes for
+ *
+ *  @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ *
+ *  @warning This API is marked as Beta. While this feature is complete, it can
+ *           change and might have outstanding issues.
+ *
+ */
+hipError_t hipPointerSetAttribute(const void* value, hipPointer_attribute attribute,
+                                  hipDeviceptr_t ptr);
+
+
+/**
+ *  @brief Returns attributes for the specified pointer
+ *
+ *  @param [out]  attributes  attributes for the specified pointer
+ *  @param [in]   ptr         pointer to get attributes for
+ *
+ *  The output parameter 'attributes' has a member named 'type' that describes what memory the
+ *  pointer is associated with, such as device memory, host memory, managed memory, and others.
+ *  Otherwise, the API cannot handle the pointer and returns #hipErrorInvalidValue.
+ *
+ *  @note  The unrecognized memory type is unsupported to keep the HIP functionality backward
+ *  compatibility due to #hipMemoryType enum values.
+ *
+ *  @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ *
+ *  @note  The current behavior of this HIP API corresponds to the CUDA API before version 11.0.
+ *
+ *  @see hipPointerGetAttribute
+ */
+hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attributes, const void* ptr);
+/**
+ *  @brief Returns information about the specified pointer.[BETA]
+ *
+ *  @param [in, out] data     Returned pointer attribute value
+ *  @param [in]      attribute  Attribute to query for
+ *  @param [in]      ptr      Pointer to get attributes for
+ *
+ *  @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ *
+ *  @warning This API is marked as Beta. While this feature is complete, it can
+ *           change and might have outstanding issues.
+ *
+ *  @see hipPointerGetAttributes
+ */
+hipError_t hipPointerGetAttribute(void* data, hipPointer_attribute attribute, hipDeviceptr_t ptr);
+/**
+ *  @brief Returns information about the specified pointer.[BETA]
+ *
+ *  @param [in]  numAttributes   number of attributes to query for
+ *  @param [in]  attributes      attributes to query for
+ *  @param [in, out] data        a two-dimensional array containing pointers to memory locations
+ *                               where the result of each attribute query will be written to
+ *  @param [in]  ptr             pointer to get attributes for
+ *
+ *  @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ *
+ *  @warning This API is marked as Beta. While this feature is complete, it can
+ *           change and might have outstanding issues.
+ *
+ *  @see hipPointerGetAttribute
+ */
+hipError_t hipDrvPointerGetAttributes(unsigned int numAttributes, hipPointer_attribute* attributes,
+                                      void** data, hipDeviceptr_t ptr);
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup External External Resource Interoperability
+ *  @{
+ *  @ingroup API
+ *
+ *  This section describes the external resource interoperability functions of HIP runtime API.
+ *
+ */
+/**
+ *  @brief Imports an external semaphore.
+ *
+ *  @param[out] extSem_out  Returned handle to an external semaphore
+ *  @param[in] semHandleDesc Semaphore import handle descriptor
+ *
+ *  @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ *
+ *  @see
+ *
+ *  @note  This API is currently not supported on Linux.
+ *
+ */
+hipError_t hipImportExternalSemaphore(hipExternalSemaphore_t* extSem_out,
+                                      const hipExternalSemaphoreHandleDesc* semHandleDesc);
+/**
+ *  @brief Signals a set of external semaphore objects.
+ *
+ *  @param[in] extSemArray  External semaphores to be signaled
+ *  @param[in] paramsArray Array of semaphore parameters
+ *  @param[in] numExtSems Number of semaphores to signal
+ *  @param[in] stream Stream to enqueue the signal operations in
+ *
+ *  @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ *
+ *  @see hipImportExternalSemaphore, hipWaitExternalSemaphoresAsync, hipDestroyExternalSemaphore
+ *
+ *  @note  This API is currently not supported on Linux.
+ *
+ */
+hipError_t hipSignalExternalSemaphoresAsync(const hipExternalSemaphore_t* extSemArray,
+                                            const hipExternalSemaphoreSignalParams* paramsArray,
+                                            unsigned int numExtSems, hipStream_t stream);
+/**
+ *  @brief Waits on a set of external semaphore objects
+ *
+ *  @param[in] extSemArray  External semaphores to be waited on
+ *  @param[in] paramsArray Array of semaphore parameters
+ *  @param[in] numExtSems Number of semaphores to wait on
+ *  @param[in] stream Stream to enqueue the wait operations in
+ *
+ *  @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ *
+ *  @see
+ *
+ *  @note  This API is currently not supported on Linux.
+ *
+ */
+hipError_t hipWaitExternalSemaphoresAsync(const hipExternalSemaphore_t* extSemArray,
+                                          const hipExternalSemaphoreWaitParams* paramsArray,
+                                          unsigned int numExtSems, hipStream_t stream);
+/**
+ *  @brief Destroys an external semaphore object and releases any references to the underlying
+ * resource. Any outstanding signals or waits must have completed before the semaphore is destroyed.
+ *
+ *  @param[in] extSem handle to the external semaphore object to be destroyed
+ *
+ *  @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ *
+ *  @see
+ *
+ *  @note  This API is currently not supported on Linux.
+ *
+ */
+hipError_t hipDestroyExternalSemaphore(hipExternalSemaphore_t extSem);
+
+/**
+ *  @brief Imports an external memory object.
+ *
+ *  @param[out] extMem_out  Returned handle to an external memory object
+ *  @param[in]  memHandleDesc Memory import handle descriptor
+ *
+ *  @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ *
+ *  @see
+ *
+ */
+hipError_t hipImportExternalMemory(hipExternalMemory_t* extMem_out,
+                                   const hipExternalMemoryHandleDesc* memHandleDesc);
+/**
+ *  @brief Maps a buffer onto an imported memory object.
+ *
+ *  @param[out] devPtr Returned device pointer to buffer
+ *  @param[in]  extMem  Handle to external memory object
+ *  @param[in]  bufferDesc  Buffer descriptor
+ *
+ *  @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ *
+ *  @see
+ */
+hipError_t hipExternalMemoryGetMappedBuffer(void** devPtr, hipExternalMemory_t extMem,
+                                            const hipExternalMemoryBufferDesc* bufferDesc);
+/**
+ *  @brief Destroys an external memory object.
+ *
+ *  @param[in] extMem  External memory object to be destroyed
+ *
+ *  @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ *
+ *  @see
+ */
+hipError_t hipDestroyExternalMemory(hipExternalMemory_t extMem);
+/**
+ *  @brief Maps a mipmapped array onto an external memory object.
+ *
+ *  @param[out] mipmap mipmapped array to return
+ *  @param[in]  extMem external memory object handle
+ *  @param[in]  mipmapDesc external mipmapped array descriptor
+ *
+ *  Returned mipmapped array must be freed using hipFreeMipmappedArray.
+ *
+ *  @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidResourceHandle
+ *
+ *  @see hipImportExternalMemory, hipDestroyExternalMemory, hipExternalMemoryGetMappedBuffer,
+ * hipFreeMipmappedArray
+ */
+hipError_t hipExternalMemoryGetMappedMipmappedArray(
+    hipMipmappedArray_t* mipmap, hipExternalMemory_t extMem,
+    const hipExternalMemoryMipmappedArrayDesc* mipmapDesc);
+// end of external resource
+/**
+ * @}
+ */
+/**
+ *  @brief Allocate memory on the default accelerator
+ *
+ *  @param[out] ptr Pointer to the allocated memory
+ *  @param[in]  size Requested memory size
+ *
+ *  If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *
+ *  @returns #hipSuccess, #hipErrorOutOfMemory, #hipErrorInvalidValue (bad context, null *ptr)
+ *
+ *  @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray,
+ * hipHostFree, hipHostMalloc
+ */
+hipError_t hipMalloc(void** ptr, size_t size);
+/**
+ *  @brief Allocate memory on the default accelerator
+ *
+ *  @param[out] ptr  Pointer to the allocated memory
+ *  @param[in]  sizeBytes  Requested memory size
+ *  @param[in]  flags  Type of memory allocation
+ *
+ *  If requested memory size is 0, no memory is allocated, *ptr returns nullptr, and #hipSuccess
+ *  is returned.
+ *
+ *  The memory allocation flag should be either #hipDeviceMallocDefault,
+ *  #hipDeviceMallocFinegrained, #hipDeviceMallocUncached, or #hipMallocSignalMemory.
+ *  If the flag is any other value, the API returns #hipErrorInvalidValue.
+ *
+ *  @returns #hipSuccess, #hipErrorOutOfMemory, #hipErrorInvalidValue (bad context, null *ptr)
+ *
+ *  @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray,
+ * hipHostFree, hipHostMalloc
+ */
+hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flags);
+
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup MemoryD Memory Management [Deprecated]
+ *  @ingroup Memory
+ *  @{
+ *  This section describes the deprecated memory management functions of HIP runtime API.
+ *
+ */
+
+/**
+ *  @brief Allocate pinned host memory [Deprecated]
+ *
+ *  @param[out] ptr Pointer to the allocated host pinned memory
+ *  @param[in]  size Requested memory size
+ *
+ *  If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *
+ *  @returns #hipSuccess, #hipErrorOutOfMemory
+ *
+ *  @warning  This API is deprecated, use hipHostMalloc() instead
+ */
+HIP_DEPRECATED("use hipHostMalloc instead")
+hipError_t hipMallocHost(void** ptr, size_t size);
+/**
+ *  @brief Allocate pinned host memory [Deprecated]
+ *
+ *  @param[out] ptr Pointer to the allocated host pinned memory
+ *  @param[in]  size Requested memory size
+ *
+ *  If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *
+ *  @returns #hipSuccess, #hipErrorOutOfMemory
+ *
+ *  @warning  This API is deprecated, use hipHostMalloc() instead
+ */
+HIP_DEPRECATED("use hipHostMalloc instead")
+hipError_t hipMemAllocHost(void** ptr, size_t size);
+// end doxygen deprecated management memory
+/**
+ * @}
+ */
+/**
+ *  @brief Allocates device accessible page locked (pinned) host memory
+ *
+ *  This API allocates pinned host memory which is mapped into the address space of all GPUs
+ *  in the system, the memory can be accessed directly by the GPU device, and can be read or
+ *  written with much higher bandwidth than pageable memory obtained with functions such as
+ *  malloc().
+ *
+ *  Using the pinned host memory, applications can implement faster data transfers for HostToDevice
+ *  and DeviceToHost. The runtime tracks the hipHostMalloc allocations and can avoid some of the
+ *  setup required for regular unpinned memory.
+ *
+ *  When the memory accesses are infrequent, zero-copy memory can be a good choice, for coherent
+ *  allocation. GPU can directly access the host memory over the CPU/GPU interconnect, without need
+ *  to copy the data.
+ *
+ *  Currently the allocation granularity is 4KB for the API.
+ *
+ *  Developers need to choose proper allocation flag with consideration of synchronization.
+ *
+ *  @param[out] ptr Pointer to the allocated host pinned memory
+ *  @param[in]  size Requested memory size in bytes
+ *  If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *  @param[in]  flags Type of host memory allocation. See the description of flags in
+ *  hipSetDeviceFlags.
+ *
+ *  If no input for flags, it will be the default pinned memory allocation on the host.
+ *
+ *  @returns #hipSuccess, #hipErrorOutOfMemory
+ *
+ *
+ *  @see hipSetDeviceFlags, hipHostFree
+ */
+hipError_t hipHostMalloc(void** ptr, size_t size, unsigned int flags);
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup MemoryM Managed Memory
+ *
+ *  @ingroup Memory
+ * @{
+ *  This section describes the managed memory management functions of HIP runtime API.
+ *
+ *  @note  The managed memory management APIs are implemented on Linux, under development
+ *  on Windows.
+ *
+ */
+/**
+ * @brief Allocates memory that will be automatically managed by HIP.
+ *
+ * This API is used for managed memory; it allows data to be shared and accessible to both CPU
+ * and GPU using a single pointer.
+ *
+ * The API returns the allocation pointer, managed by HMM, can be used further to execute kernels
+ * on device and fetch data between the host and device as needed.
+ *
+ * If HMM is not supported, the function behaves the same as @p hipMallocHost .
+ *
+ * @note   It is recommended to do the capability check before calling this API.
+ *
+ * @param [out] dev_ptr - pointer to allocated device memory
+ * @param [in]  size    - requested allocation size in bytes, it should be granularity of 4KB
+ * @param [in]  flags   - must be either hipMemAttachGlobal or hipMemAttachHost
+ *                        (defaults to hipMemAttachGlobal)
+ *
+ * @returns #hipSuccess, #hipErrorMemoryAllocation, #hipErrorNotSupported, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipMallocManaged(void** dev_ptr, size_t size,
+                            unsigned int flags __dparm(hipMemAttachGlobal));
+/**
+ * @brief Prefetches memory to the specified destination device using HIP.
+ *
+ * @param [in] dev_ptr  pointer to be prefetched
+ * @param [in] count    size in bytes for prefetching
+ * @param [in] device   destination device to prefetch to
+ * @param [in] stream   stream to enqueue prefetch operation
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemPrefetchAsync(const void* dev_ptr, size_t count, int device,
+                               hipStream_t stream __dparm(0));
+/**
+ * @brief Prefetches memory to the specified destination device using HIP.
+ *
+ * @param [in] dev_ptr    pointer to be prefetched
+ * @param [in] count      size in bytes for prefetching
+ * @param [in] location   destination location to prefetch to
+ * @param [in] flags      flags for future use, must be zero now.
+ * @param [in] stream     stream to enqueue prefetch operation
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemPrefetchAsync_v2(const void* dev_ptr, size_t count, hipMemLocation location,
+                                  unsigned int flags, hipStream_t stream __dparm(0));
+/**
+ * @brief Advise about the usage of a given memory range to HIP.
+ *
+ * @param [in] dev_ptr  pointer to memory to set the advice for
+ * @param [in] count    size in bytes of the memory range, it should be CPU page size aligned.
+ * @param [in] advice   advice to be applied for the specified memory range
+ * @param [in] device   device to apply the advice for
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * This HIP API advises about the usage to be applied on unified memory allocation in the
+ * range starting from the pointer address dev_ptr, with the size of count bytes.
+ * The memory range must refer to managed memory allocated via the API hipMallocManaged, and the
+ * range will be handled with proper round down and round up respectively in the driver to
+ * be aligned to CPU page size, the same way as corresponding CUDA API behaves in CUDA version 8.0
+ * and afterwards.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemAdvise(const void* dev_ptr, size_t count, hipMemoryAdvise advice, int device);
+/**
+ * @brief Advise about the usage of a given memory range to HIP.
+ *
+ * @param [in] dev_ptr    pointer to memory to set the advice for
+ * @param [in] count      size in bytes of the memory range, it should be CPU page size aligned.
+ * @param [in] advice     advice to be applied for the specified memory range
+ * @param [in] location   location to apply the advice for
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * This HIP API advises about the usage to be applied on unified memory allocation in the
+ * range starting from the pointer address dev_ptr, with the size of count bytes.
+ * The memory range must refer to managed memory allocated via the API hipMallocManaged, and the
+ * range will be handled with proper round down and round up respectively in the driver to
+ * be aligned to CPU page size, the same way as corresponding CUDA API behaves in CUDA version 8.0
+ * and afterwards.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemAdvise_v2(const void* dev_ptr, size_t count, hipMemoryAdvise advice,
+                           hipMemLocation location);
+/**
+ * @brief Query an attribute of a given memory range in HIP.
+ *
+ * @param [in,out] data   a pointer to a memory location where the result of each
+ *                        attribute query will be written to
+ * @param [in] data_size  the size of data
+ * @param [in] attribute  the attribute to query
+ * @param [in] dev_ptr    start of the range to query
+ * @param [in] count      size of the range to query
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemRangeGetAttribute(void* data, size_t data_size, hipMemRangeAttribute attribute,
+                                   const void* dev_ptr, size_t count);
+/**
+ * @brief Query attributes of a given memory range in HIP.
+ *
+ * @param [in,out] data     a two-dimensional array containing pointers to memory locations
+ *                          where the result of each attribute query will be written to
+ * @param [in] data_sizes   an array, containing the sizes of each result
+ * @param [in] attributes   an array of attributes to query (num_attributes and the number
+ *                          of attributes in this array should match)
+ * @param [in] num_attributes  the number of attributes to query
+ * @param [in] dev_ptr      start of the range to query
+ * @param [in] count        size of the range to query
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemRangeGetAttributes(void** data, size_t* data_sizes,
+                                    hipMemRangeAttribute* attributes, size_t num_attributes,
+                                    const void* dev_ptr, size_t count);
+/**
+ * @brief Attach memory to a stream asynchronously in HIP.
+ *
+ * @param [in] stream     - stream in which to enqueue the attach operation
+ * @param [in] dev_ptr    - pointer to memory (must be a pointer to managed memory or
+ *                          to a valid host-accessible region of system-allocated memory)
+ * @param [in] length     - length of memory (defaults to zero)
+ * @param [in] flags      - must be one of hipMemAttachGlobal, hipMemAttachHost or
+ *                          hipMemAttachSingle (defaults to hipMemAttachSingle)
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @warning This API is under development. Currently it is a no-operation (NOP)
+ *          function on AMD GPUs and returns #hipSuccess.
+ */
+hipError_t hipStreamAttachMemAsync(hipStream_t stream, void* dev_ptr, size_t length __dparm(0),
+                                   unsigned int flags __dparm(hipMemAttachSingle));
+// end doxygen Managed Memory
+/**
+ * @}
+ */
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ * @defgroup StreamO Stream Ordered Memory Allocator
+ * @{
+ * @ingroup Memory
+ * This section describes Stream Ordered Memory Allocator functions of HIP runtime API.
+ *
+ * The asynchronous allocator allows the user to allocate and free in stream order.
+ * All asynchronous accesses of the allocation must happen between the stream executions of
+ * the allocation and the free. If the memory is accessed outside of the promised stream order,
+ * a use before allocation / use after free error  will cause undefined behavior.
+ *
+ * The allocator is free to reallocate the memory as long as it can guarantee that compliant memory
+ * accesses will not overlap temporally. The allocator may refer to internal stream ordering as well
+ * as inter-stream dependencies (such as HIP events and null stream dependencies) when establishing
+ * the temporal guarantee. The allocator may also insert inter-stream dependencies to establish
+ * the temporal guarantee.  Whether or not a device supports the integrated stream ordered memory
+ * allocator may be queried by calling @p hipDeviceGetAttribute with the device attribute
+ * @p hipDeviceAttributeMemoryPoolsSupported
+ *
+ * @note  APIs in this section are implemented on Linux, under development on Windows.
+ */
+
+/**
+ * @brief Allocates memory with stream ordered semantics
+ *
+ * Inserts a memory allocation operation into @p stream.
+ * A pointer to the allocated memory is returned immediately in *dptr.
+ * The allocation must not be accessed until the allocation operation completes.
+ * The allocation comes from the memory pool associated with the stream's device.
+ *
+ * @note The default memory pool of a device contains device memory from that device.
+ * @note Basic stream ordering allows future work submitted into the same stream to use the
+ *  allocation. Stream query, stream synchronize, and HIP events can be used to guarantee that
+ *  the allocation operation completes before work submitted in a separate stream runs.
+ * @note During stream capture, this function results in the creation of an allocation node.
+ *  In this case, the allocation is owned by the graph instead of the memory pool. The memory
+ *  pool's properties are used to set the node's creation parameters.
+ *
+ * @param [out] dev_ptr  Returned device pointer of memory allocation
+ * @param [in] size      Number of bytes to allocate
+ * @param [in] stream    The stream establishing the stream ordering contract and
+ *                       the memory pool to allocate from
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported, #hipErrorOutOfMemory
+ *
+ * @see hipMallocFromPoolAsync, hipFreeAsync, hipMemPoolTrimTo, hipMemPoolGetAttribute,
+ * hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess
+ *
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMallocAsync(void** dev_ptr, size_t size, hipStream_t stream);
+/**
+ * @brief Frees memory with stream ordered semantics
+ *
+ * Inserts a free operation into @p stream.
+ * The allocation must not be used after stream execution reaches the free.
+ * After this API returns, accessing the memory from any subsequent work launched on the GPU
+ * or querying its pointer attributes results in undefined behavior.
+ *
+ * @note During stream capture, this function results in the creation of a free node and
+ * must therefore be passed the address of a graph allocation.
+ *
+ * @param [in] dev_ptr Pointer to device memory to free
+ * @param [in] stream  The stream, where the destruction will occur according to the execution order
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @see hipMallocFromPoolAsync, hipMallocAsync, hipMemPoolTrimTo, hipMemPoolGetAttribute,
+ * hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess
+ *
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipFreeAsync(void* dev_ptr, hipStream_t stream);
+/**
+ * @brief Releases freed memory back to the OS
+ *
+ * Releases memory back to the OS until the pool contains fewer than @p min_bytes_to_hold
+ * reserved bytes, or there is no more memory that the allocator can safely release.
+ * The allocator cannot release OS allocations that back outstanding asynchronous allocations.
+ * The OS allocations may happen at different granularity from the user allocations.
+ *
+ * @note Allocations that have not been freed count as outstanding.
+ * @note Allocations that have been asynchronously freed but whose completion has
+ * not been observed on the host (eg. by a synchronize) can count as outstanding.
+ *
+ * @param[in] mem_pool          The memory pool to trim allocations
+ * @param[in] min_bytes_to_hold If the pool has less than min_bytes_to_hold reserved,
+ * then the TrimTo operation is a no-op.  Otherwise the memory pool will contain
+ * at least min_bytes_to_hold bytes reserved after the operation.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @see hipMallocFromPoolAsync, hipMallocAsync, hipFreeAsync, hipMemPoolGetAttribute,
+ * hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess
+ *
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemPoolTrimTo(hipMemPool_t mem_pool, size_t min_bytes_to_hold);
+/**
+ * @brief Sets attributes of a memory pool
+ *
+ * Supported attributes are:
+ * - @p hipMemPoolAttrReleaseThreshold: (value type = cuuint64_t)
+ *                                  Amount of reserved memory in bytes to hold onto before trying
+ *                                  to release memory back to the OS. When more than the release
+ *                                  threshold bytes of memory are held by the memory pool, the
+ *                                  allocator will try to release memory back to the OS on the
+ *                                  next call to stream, event or context synchronize. (default 0)
+ * - @p hipMemPoolReuseFollowEventDependencies: (value type = int)
+ *                                  Allow @p hipMallocAsync to use memory asynchronously freed
+ *                                  in another stream as long as a stream ordering dependency
+ *                                  of the allocating stream on the free action exists.
+ *                                  HIP events and null stream interactions can create the required
+ *                                  stream ordered dependencies. (default enabled)
+ * - @p hipMemPoolReuseAllowOpportunistic: (value type = int)
+ *                                  Allow reuse of already completed frees when there is no
+ *                                  dependency between the free and allocation. (default enabled)
+ * - @p hipMemPoolReuseAllowInternalDependencies: (value type = int)
+ *                                  Allow @p hipMallocAsync to insert new stream dependencies
+ *                                  in order to establish the stream ordering required to reuse
+ *                                  a piece of memory released by @p hipFreeAsync (default enabled).
+ *
+ * @param [in] mem_pool The memory pool to modify
+ * @param [in] attr     The attribute to modify
+ * @param [in] value    Pointer to the value to assign
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @see hipMallocFromPoolAsync, hipMallocAsync, hipFreeAsync, hipMemPoolGetAttribute,
+ * hipMemPoolTrimTo, hipDeviceSetMemPool, hipMemPoolSetAccess, hipMemPoolGetAccess
+ *
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemPoolSetAttribute(hipMemPool_t mem_pool, hipMemPoolAttr attr, void* value);
+/**
+ * @brief Gets attributes of a memory pool
+ *
+ * Supported attributes are:
+ * - @p hipMemPoolAttrReleaseThreshold: (value type = cuuint64_t)
+ *                                  Amount of reserved memory in bytes to hold onto before trying
+ *                                  to release memory back to the OS. When more than the release
+ *                                  threshold bytes of memory are held by the memory pool, the
+ *                                  allocator will try to release memory back to the OS on the
+ *                                  next call to stream, event or context synchronize. (default 0)
+ * - @p hipMemPoolReuseFollowEventDependencies: (value type = int)
+ *                                  Allow @p hipMallocAsync to use memory asynchronously freed
+ *                                  in another stream as long as a stream ordering dependency
+ *                                  of the allocating stream on the free action exists.
+ *                                  HIP events and null stream interactions can create the required
+ *                                  stream ordered dependencies. (default enabled)
+ * - @p hipMemPoolReuseAllowOpportunistic: (value type = int)
+ *                                  Allow reuse of already completed frees when there is no
+ *                                  dependency between the free and allocation. (default enabled)
+ * - @p hipMemPoolReuseAllowInternalDependencies: (value type = int)
+ *                                  Allow @p hipMallocAsync to insert new stream dependencies
+ *                                  in order to establish the stream ordering required to reuse
+ *                                  a piece of memory released by @p hipFreeAsync (default enabled).
+ *
+ * @param [in] mem_pool The memory pool to get attributes of
+ * @param [in] attr     The attribute to get
+ * @param [out] value   Retrieved value
+ *
+ * @returns  #hipSuccess, #hipErrorInvalidValue
+ *
+ * @see hipMallocFromPoolAsync, hipMallocAsync, hipFreeAsync,
+ * hipMemPoolTrimTo, hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess,
+ * hipMemPoolGetAccess
+ *
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemPoolGetAttribute(hipMemPool_t mem_pool, hipMemPoolAttr attr, void* value);
+/**
+ * @brief Controls visibility of the specified pool between devices
+ *
+ * @param [in] mem_pool   Memory pool for access change
+ * @param [in] desc_list  Array of access descriptors. Each descriptor instructs the access to
+ * enable for a single gpu
+ * @param [in] count  Number of descriptors in the map array.
+ *
+ * @returns  #hipSuccess, #hipErrorInvalidValue
+ *
+ * @see hipMallocFromPoolAsync, hipMallocAsync, hipFreeAsync, hipMemPoolGetAttribute,
+ * hipMemPoolTrimTo, hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolGetAccess
+ *
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemPoolSetAccess(hipMemPool_t mem_pool, const hipMemAccessDesc* desc_list,
+                               size_t count);
+/**
+ * @brief Returns the accessibility of a pool from a device
+ *
+ * Returns the accessibility of the pool's memory from the specified location.
+ *
+ * @param [out] flags    Accessibility of the memory pool from the specified location/device
+ * @param [in] mem_pool   Memory pool being queried
+ * @param [in] location  Location/device for memory pool access
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @see hipMallocFromPoolAsync, hipMallocAsync, hipFreeAsync, hipMemPoolGetAttribute,
+ * hipMemPoolTrimTo, hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess
+ *
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemPoolGetAccess(hipMemAccessFlags* flags, hipMemPool_t mem_pool,
+                               hipMemLocation* location);
+/**
+ * @brief Creates a memory pool
+ *
+ * Creates a HIP memory pool and returns the handle in @p mem_pool. The @p pool_props determines
+ * the properties of the pool such as the backing device and IPC capabilities.
+ *
+ * By default, the memory pool will be accessible from the device it is allocated on.
+ *
+ * @param [out] mem_pool    Contains created memory pool
+ * @param [in] pool_props   Memory pool properties
+ *
+ * @note Specifying hipMemHandleTypeNone creates a memory pool that will not support IPC.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @see hipMallocFromPoolAsync, hipMallocAsync, hipFreeAsync, hipMemPoolGetAttribute,
+ * hipMemPoolDestroy, hipMemPoolTrimTo, hipDeviceSetMemPool, hipMemPoolSetAttribute,
+ * hipMemPoolSetAccess, hipMemPoolGetAccess
+ *
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemPoolCreate(hipMemPool_t* mem_pool, const hipMemPoolProps* pool_props);
+/**
+ * @brief Destroys the specified memory pool
+ *
+ * If any pointers obtained from this pool haven't been freed or
+ * the pool has free operations that haven't completed
+ * when @p hipMemPoolDestroy is invoked, the function will return immediately and the
+ * resources associated with the pool will be released automatically
+ * once there are no more outstanding allocations.
+ *
+ * Destroying the current mempool of a device sets the default mempool of
+ * that device as the current mempool for that device.
+ *
+ * @param [in] mem_pool Memory pool for destruction
+ *
+ * @note A device's default memory pool cannot be destroyed.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @see hipMallocFromPoolAsync, hipMallocAsync, hipFreeAsync, hipMemPoolGetAttribute,
+ * hipMemPoolCreate, hipMemPoolTrimTo, hipDeviceSetMemPool, hipMemPoolSetAttribute,
+ * hipMemPoolSetAccess, hipMemPoolGetAccess
+ *
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemPoolDestroy(hipMemPool_t mem_pool);
+/**
+ * @brief Allocates memory from a specified pool with stream ordered semantics.
+ *
+ * Inserts an allocation operation into @p stream.
+ * A pointer to the allocated memory is returned immediately in @p dev_ptr.
+ * The allocation must not be accessed until the allocation operation completes.
+ * The allocation comes from the specified memory pool.
+ *
+ * @note The specified memory pool may be from a device different than that of the specified @p
+ * stream.
+ *
+ * Basic stream ordering allows future work submitted into the same stream to use the allocation.
+ * Stream query, stream synchronize, and HIP events can be used to guarantee that the allocation
+ * operation completes before work submitted in a separate stream runs.
+ *
+ * @note During stream capture, this function results in the creation of an allocation node. In this
+ * case, the allocation is owned by the graph instead of the memory pool. The memory pool's
+ * properties are used to set the node's creation parameters.
+ *
+ * @param [out] dev_ptr Returned device pointer
+ * @param [in] size     Number of bytes to allocate
+ * @param [in] mem_pool The pool to allocate from
+ * @param [in] stream   The stream establishing the stream ordering semantic
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported, #hipErrorOutOfMemory
+ *
+ * @see hipMallocAsync, hipFreeAsync, hipMemPoolGetAttribute, hipMemPoolCreate,
+ * hipMemPoolTrimTo, hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess,
+ * hipMemPoolGetAccess
+ *
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMallocFromPoolAsync(void** dev_ptr, size_t size, hipMemPool_t mem_pool,
+                                  hipStream_t stream);
+/**
+ * @brief Exports a memory pool to the requested handle type.
+ *
+ * Given an IPC capable mempool, create an OS handle to share the pool with another process.
+ * A recipient process can convert the shareable handle into a mempool with @p
+ * hipMemPoolImportFromShareableHandle. Individual pointers can then be shared with the @p
+ * hipMemPoolExportPointer and @p hipMemPoolImportPointer APIs. The implementation of what the
+ * shareable handle is and how it can be transferred is defined by the requested handle type.
+ *
+ * @note To create an IPC capable mempool, create a mempool with a @p hipMemAllocationHandleType
+ * other than @p hipMemHandleTypeNone.
+ *
+ * @param [out] shared_handle Pointer to the location in which to store the requested handle
+ * @param [in] mem_pool       Pool to export
+ * @param [in] handle_type    The type of handle to create
+ * @param [in] flags          Must be 0
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorOutOfMemory
+ *
+ * @see hipMemPoolImportFromShareableHandle
+ *
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemPoolExportToShareableHandle(void* shared_handle, hipMemPool_t mem_pool,
+                                             hipMemAllocationHandleType handle_type,
+                                             unsigned int flags);
+/**
+ * @brief Imports a memory pool from a shared handle.
+ *
+ * Specific allocations can be imported from the imported pool with @p hipMemPoolImportPointer.
+ *
+ * @note Imported memory pools do not support creating new allocations.
+ * As such imported memory pools may not be used in @p hipDeviceSetMemPool
+ * or @p hipMallocFromPoolAsync calls.
+ *
+ * @param [out] mem_pool     Returned memory pool
+ * @param [in] shared_handle OS handle of the pool to open
+ * @param [in] handle_type   The type of handle being imported
+ * @param [in] flags         Must be 0
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorOutOfMemory
+ *
+ * @see hipMemPoolExportToShareableHandle
+ *
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemPoolImportFromShareableHandle(hipMemPool_t* mem_pool, void* shared_handle,
+                                               hipMemAllocationHandleType handle_type,
+                                               unsigned int flags);
+/**
+ * @brief Export data to share a memory pool allocation between processes.
+ *
+ * Constructs @p export_data for sharing a specific allocation from an already shared memory pool.
+ * The recipient process can import the allocation with the @p hipMemPoolImportPointer api.
+ * The data is not a handle and may be shared through any IPC mechanism.
+ *
+ * @param[out] export_data  Returned export data
+ * @param[in] dev_ptr       Pointer to memory being exported
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorOutOfMemory
+ *
+ * @see hipMemPoolImportPointer
+ *
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemPoolExportPointer(hipMemPoolPtrExportData* export_data, void* dev_ptr);
+/**
+ * @brief Import a memory pool allocation from another process.
+ *
+ * Returns in @p dev_ptr a pointer to the imported memory.
+ * The imported memory must not be accessed before the allocation operation completes
+ * in the exporting process. The imported memory must be freed from all importing processes before
+ * being freed in the exporting process. The pointer may be freed with @p hipFree
+ * or @p hipFreeAsync. If @p hipFreeAsync is used, the free must be completed
+ * on the importing process before the free operation on the exporting process.
+ *
+ * @note The @p hipFreeAsync api may be used in the exporting process before
+ * the @p hipFreeAsync operation completes in its stream as long as the
+ * @p hipFreeAsync in the exporting process specifies a stream with
+ * a stream dependency on the importing process's @p hipFreeAsync.
+ *
+ * @param [out] dev_ptr     Pointer to imported memory
+ * @param [in] mem_pool     Memory pool from which to import a pointer
+ * @param [in] export_data  Data specifying the memory to import
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized, #hipErrorOutOfMemory
+ *
+ * @see hipMemPoolExportPointer
+ *
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemPoolImportPointer(void** dev_ptr, hipMemPool_t mem_pool,
+                                   hipMemPoolPtrExportData* export_data);
+// Doxygen end of ordered memory allocator
+/**
+ * @}
+ */
+
+/**
+ *  @brief Allocate device accessible page locked host memory
+ *
+ *  @param[out] ptr Pointer to the allocated host pinned memory
+ *  @param[in]  size Requested memory size in bytes
+ *  @param[in]  flags Type of host memory allocation see below
+ *
+ *  If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *
+ *  Flags:
+ *  - #hipHostAllocDefault   Default pinned memory allocation on the host.
+ *  - #hipHostAllocPortable  Memory is considered allocated by all contexts.
+ *  - #hipHostAllocMapped    Map the allocation into the address space for the current device.
+ *  - #hipHostAllocWriteCombined  Allocates the memory as write-combined.
+ *  - #hipHostAllocUncached  Allocate the host memory on extended fine grained access system
+ *                           memory pool
+ *
+ *  @return #hipSuccess, #hipErrorOutOfMemory, #hipErrorInvalidValue
+ */
+hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags);
+/**
+ *  @brief Get Device pointer from Host Pointer allocated through hipHostMalloc
+ *
+ *  @param[out] devPtr Device Pointer mapped to passed host pointer
+ *  @param[in]  hstPtr Host Pointer allocated through hipHostMalloc
+ *  @param[in]  flags Flags to be passed for extension
+ *
+ *  @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorOutOfMemory
+ *
+ *  @see hipSetDeviceFlags, hipHostMalloc
+ */
+hipError_t hipHostGetDevicePointer(void** devPtr, void* hstPtr, unsigned int flags);
+/**
+ *  @brief Return flags associated with host pointer
+ *
+ *  @param[out] flagsPtr Memory location to store flags
+ *  @param[in]  hostPtr Host Pointer allocated through hipHostMalloc
+ *  @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ *  @see hipHostMalloc
+ */
+hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr);
+/**
+ *  @brief Register host memory so it can be accessed from the current device.
+ *
+ *  @param[in] hostPtr Pointer to host memory to be registered.
+ *  @param[in] sizeBytes Size of the host memory
+ *  @param[in] flags  See below.
+ *
+ *  Flags:
+ *  - #hipHostRegisterDefault   Memory is Mapped and Portable
+ *  - #hipHostRegisterPortable  Memory is considered registered by all contexts.  HIP only supports
+ * one context so this is always assumed true.
+ *  - #hipHostRegisterMapped    Map the allocation into the address space for the current device.
+ * The device pointer can be obtained with #hipHostGetDevicePointer.
+ *  - #hipExtHostRegisterUncached  Map the host memory onto extended fine grained access system
+ * memory pool.
+ *
+ *  After registering the memory, use #hipHostGetDevicePointer to obtain the mapped device pointer.
+ *  On many systems, the mapped device pointer will have a different value than the mapped host
+ * pointer.  Applications must use the device pointer in device code, and the host pointer in host
+ * code.
+ *
+ *  On some systems, registered memory is pinned.  On some systems, registered memory may not
+ * actually be pinned but uses OS or hardware facilities to allow GPU access to the host memory.
+ *
+ *  Developers are strongly encouraged to register memory blocks which are aligned to the host
+ * cache-line size (typically 64 bytes, but this can be obtained from the CPUID instruction).
+ *
+ *  If registering non-aligned pointers, the application must take care when registering pointers
+ * from the same cache line on different devices.  HIP's coarse-grained synchronization model does
+ * not guarantee correct results if different devices write to different parts of the same cache
+ * block - typically one of the writes will "win" and overwrite data from the other registered
+ * memory region.
+ *
+ *  @returns #hipSuccess, #hipErrorOutOfMemory
+ *
+ *  @see hipHostUnregister, hipHostGetFlags, hipHostGetDevicePointer
+ */
+hipError_t hipHostRegister(void* hostPtr, size_t sizeBytes, unsigned int flags);
+/**
+ *  @brief Un-register host pointer
+ *
+ *  @param[in] hostPtr Host pointer previously registered with #hipHostRegister
+ *  @returns Error code
+ *
+ *  @see hipHostRegister
+ */
+hipError_t hipHostUnregister(void* hostPtr);
+/**
+ *  Allocates at least width (in bytes) * height bytes of linear memory
+ *  Padding may occur to ensure alignment requirements are met for the given row
+ *  The change in width size due to padding will be returned in *pitch.
+ *  Currently the alignment is set to 128 bytes
+ *
+ *  @param[out] ptr Pointer to the allocated device memory
+ *  @param[out] pitch Pitch for allocation (in bytes)
+ *  @param[in]  width Requested pitched allocation width (in bytes)
+ *  @param[in]  height Requested pitched allocation height
+ *
+ *  If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *
+ *  @returns Error code
+ *
+ *  @see hipMalloc, hipFree, hipMallocArray, hipFreeArray, hipHostFree, hipMalloc3D,
+ * hipMalloc3DArray, hipHostMalloc
+ */
+hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height);
+/**
+ *  Allocates at least width (in bytes) * height bytes of linear memory
+ *  Padding may occur to ensure alignment requirements are met for the given row
+ *  The change in width size due to padding will be returned in *pitch.
+ *  Currently the alignment is set to 128 bytes
+ *
+ *  @param[out] dptr  Pointer to the allocated device memory
+ *  @param[out] pitch  Pitch for allocation (in bytes)
+ *  @param[in]  widthInBytes  Requested pitched allocation width (in bytes)
+ *  @param[in]  height  Requested pitched allocation height
+ *  @param[in]  elementSizeBytes  Size of an element in bytes; should be 4, 8 or 16
+ *
+ *  If size is 0, no memory is allocated, *dptr returns nullptr, and hipSuccess is returned.
+ *  The intended usage of pitch is as a separate parameter of the allocation, used to compute
+ * addresses within the 2D array. Given the row and column of an array element of type T, the
+ * address is computed as: T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column;
+ *
+ *  @returns Error code
+ *
+ *  @see hipMalloc, hipFree, hipMallocArray, hipFreeArray, hipHostFree, hipMalloc3D,
+ * hipMalloc3DArray, hipHostMalloc
+ */
+hipError_t hipMemAllocPitch(hipDeviceptr_t* dptr, size_t* pitch, size_t widthInBytes, size_t height,
+                            unsigned int elementSizeBytes);
+/**
+ *  @brief Free memory allocated by the HIP-Clang hip memory allocation API.
+ *  This API performs an implicit hipDeviceSynchronize() call.
+ *  If pointer is NULL, the hip runtime is initialized and hipSuccess is returned.
+ *
+ *  @param[in] ptr Pointer to memory to be freed
+ *  @returns #hipSuccess
+ *  @returns #hipErrorInvalidDevicePointer (if pointer is invalid, including host pointers allocated
+ * with hipHostMalloc)
+ *
+ *  @see hipMalloc, hipMallocPitch, hipMallocArray, hipFreeArray, hipHostFree, hipMalloc3D,
+ * hipMalloc3DArray, hipHostMalloc
+ */
+hipError_t hipFree(void* ptr);
+/**
+ *  @brief Frees page-locked memory
+ *  This API performs an implicit hipDeviceSynchronize() call.
+ *  If pointer is NULL, the hip runtime is initialized and hipSuccess is returned.
+ *
+ *  @param[in] ptr Pointer to memory to be freed
+ *  @returns #hipSuccess,
+ *          #hipErrorInvalidValue (if pointer is invalid, including device pointers allocated
+ *  with hipMalloc)
+ *
+ */
+hipError_t hipFreeHost(void* ptr);
+/**
+ *  @brief Free memory allocated by the HIP-Clang hip host memory allocation API
+ *  This API performs an implicit hipDeviceSynchronize() call.
+ *  If pointer is NULL, the hip runtime is initialized and hipSuccess is returned.
+ *
+ *  @ingroup MemoryD
+ *
+ *  @param[in] ptr Pointer to memory to be freed
+ *  @returns #hipSuccess,
+ *          #hipErrorInvalidValue (if pointer is invalid, including device pointers allocated with
+ * hipMalloc)
+ *
+ *  @see hipMalloc, hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D,
+ * hipMalloc3DArray, hipHostMalloc
+ *
+ */
+hipError_t hipHostFree(void* ptr);
+/**
+ *  @brief Copy data from src to dst.
+ *
+ *  It supports memory from host to device,
+ *  device to host, device to device and host to host.
+ *  The src and dst must not overlap.
+ *
+ *  For hipMemcpy, the copy is always performed by the current device (set by hipSetDevice).
+ *  For multi-gpu or peer-to-peer configurations, it is recommended to set the current device to the
+ *  device where the src data is physically located. For optimal peer-to-peer copies, the copy
+ * device must be able to access the src and dst pointers (by calling hipDeviceEnablePeerAccess with
+ * copy agent as the current device and src/dst as the peerDevice argument). If this is not done,
+ * the hipMemcpy will still work, but will perform the copy using a staging buffer on the host.
+ *  Calling hipMemcpy with dst and src pointers that do not match the hipMemcpyKind results in
+ *  undefined behavior.
+ *
+ *  @param[out]  dst Data being copied to
+ *  @param[in]  src Data being copied from
+ *  @param[in]  sizeBytes Data size in bytes
+ *  @param[in]  kind Kind of transfer
+ *  @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorUnknown
+ *
+ *  @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind);
+/**
+ *  @brief Memory copy on the stream.
+ *  It allows single or multiple devices to do memory copy on single or multiple streams.
+ *
+ *  @param[out]  dst Data being copied to
+ *  @param[in]  src Data being copied from
+ *  @param[in]  sizeBytes Data size in bytes
+ *  @param[in]  kind Kind of transfer
+ *  @param[in]  stream Valid stream
+ *  @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorUnknown, #hipErrorContextIsDestroyed
+ *
+ *  @see hipMemcpy, hipStreamCreate, hipStreamSynchronize, hipStreamDestroy, hipSetDevice,
+ * hipLaunchKernelGGL
+ *
+ */
+hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
+                               hipStream_t stream);
+/**
+ *  @brief Copy data from Host to Device
+ *
+ *  @param[out]  dst Data being copied to
+ *  @param[in]   src Data being copied from
+ *  @param[in]   sizeBytes Data size in bytes
+ *
+ *  @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ *  @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, const void* src, size_t sizeBytes);
+/**
+ *  @brief Copy data from Device to Host
+ *
+ *  @param[out]  dst Data being copied to
+ *  @param[in]   src Data being copied from
+ *  @param[in]   sizeBytes Data size in bytes
+ *
+ *  @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ *  @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes);
+/**
+ *  @brief Copy data from Device to Device
+ *
+ *  @param[out]  dst Data being copied to
+ *  @param[in]   src Data being copied from
+ *  @param[in]   sizeBytes Data size in bytes
+ *
+ *  @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ *  @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes);
+/**
+ *  @brief Copies from one 1D array to device memory.
+ *
+ *  @param[out]  dstDevice Destination device pointer
+ *  @param[in]   srcArray Source array
+ *  @param[in]   srcOffset Offset in bytes of source array
+ *  @param[in]   ByteCount Size of memory copy in bytes
+ *
+ *  @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ *  @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyAtoD(hipDeviceptr_t dstDevice, hipArray_t srcArray, size_t srcOffset,
+                         size_t ByteCount);
+/**
+ *  @brief Copies from device memory to a 1D array.
+ *
+ *  @param[out]  dstArray Destination array
+ *  @param[in]   dstOffset Offset in bytes of destination array
+ *  @param[in]   srcDevice Source device pointer
+ *  @param[in]   ByteCount Size of memory copy in bytes
+ *
+ *  @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ *  @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyDtoA(hipArray_t dstArray, size_t dstOffset, hipDeviceptr_t srcDevice,
+                         size_t ByteCount);
+
+/**
+ *  @brief Copies from one 1D array to another.
+ *
+ *  @param[out]  dstArray Destination array
+ *  @param[in]   dstOffset Offset in bytes of destination array
+ *  @param[in]   srcArray Source array
+ *  @param[in]   srcOffset Offset in bytes of source array
+ *  @param[in]   ByteCount Size of memory copy in bytes
+ *
+ *  @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ *  @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyAtoA(hipArray_t dstArray, size_t dstOffset, hipArray_t srcArray,
+                         size_t srcOffset, size_t ByteCount);
+/**
+ *  @brief Copy data from Host to Device asynchronously
+ *
+ *  @param[out]  dst  Data being copied to
+ *  @param[in]   src  Data being copied from
+ *  @param[in]   sizeBytes  Data size in bytes
+ *  @param[in]   stream  Stream identifier
+ *
+ *  @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ *  @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dst, const void* src, size_t sizeBytes,
+                              hipStream_t stream);
+/**
+ *  @brief Copy data from Device to Host asynchronously
+ *
+ *  @param[out]  dst Data being copied to
+ *  @param[in]   src Data being copied from
+ *  @param[in]   sizeBytes Data size in bytes
+ *  @param[in]   stream  Stream identifier
+ *
+ *  @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ *  @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t sizeBytes, hipStream_t stream);
+/**
+ *  @brief Copy data from Device to Device asynchronously
+ *
+ *  @param[out]  dst  Data being copied to
+ *  @param[in]   src  Data being copied from
+ *  @param[in]   sizeBytes  Data size in bytes
+ *  @param[in]   stream  Stream identifier
+ *
+ *  @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ *  @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes,
+                              hipStream_t stream);
+/**
+ * @brief Copies from one 1D array to host memory.
+ *
+ *  @param[out]  dstHost Destination pointer
+ *  @param[in]   srcArray Source array
+ *  @param[in]   srcOffset Offset in bytes of source array
+ *  @param[in]   ByteCount Size of memory copy in bytes
+ *  @param[in]   stream Stream identifier
+ *
+ *  @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ *  @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyAtoHAsync(void* dstHost, hipArray_t srcArray, size_t srcOffset,
+                              size_t ByteCount, hipStream_t stream);
+/**
+ * @brief Copies from host memory to a 1D array.
+ *
+ *  @param[out]  dstArray Destination array
+ *  @param[in]   dstOffset Offset in bytes of destination array
+ *  @param[in]   srcHost Source host pointer
+ *  @param[in]   ByteCount Size of memory copy in bytes
+ *  @param[in]   stream Stream identifier
+ *
+ *  @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ *  @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyHtoAAsync(hipArray_t dstArray, size_t dstOffset, const void* srcHost,
+                              size_t ByteCount, hipStream_t stream);
+/**
+ *  @brief Returns a global pointer from a module.
+ *  @ingroup Module
+ *
+ *  Returns in *dptr and *bytes the pointer and size of the global of name name located in module
+ * hmod. If no variable of that name exists, it returns hipErrorNotFound. Both parameters dptr and
+ * bytes are optional. If one of them is NULL, it is ignored and hipSuccess is returned.
+ *
+ *  @param[out]  dptr  Returns global device pointer
+ *  @param[out]  bytes Returns global size in bytes
+ *  @param[in]   hmod  Module to retrieve global from
+ *  @param[in]   name  Name of global to retrieve
+ *
+ *  @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotFound, #hipErrorInvalidContext
+ *
+ */
+hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t hmod,
+                              const char* name);
+
+/**
+ *  @brief Gets device pointer associated with symbol on the device.
+ *
+ *  @param[out]  devPtr  pointer to the device address associated with the symbol
+ *  @param[in]   symbol  pointer to the symbol on the device
+ *
+ *  @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGetSymbolAddress(void** devPtr, const void* symbol);
+
+
+/**
+ *  @brief Gets the size of the given symbol on the device.
+ *
+ *  @param[in]   symbol  pointer to the device symbol
+ *  @param[out]  size  pointer to the size
+ *
+ *  @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGetSymbolSize(size_t* size, const void* symbol);
+
+/**
+ * @brief Gets the pointer of requested HIP driver function.
+ *
+ * @param[in] symbol  The Symbol name of the driver function to request.
+ * @param[out] pfn  Output pointer to the requested driver function.
+ * @param[in] hipVersion  The HIP version for the requested driver function symbol.
+ * HIP version is defined as 100*version_major + version_minor. For example, in HIP 6.1, the
+ * hipversion is 601, for the symbol function "hipGetDeviceProperties", the specified hipVersion 601
+ * is greater or equal to the version 600, the symbol function will be handled properly as backend
+ * compatible function.
+ *
+ * @param[in] flags  Currently only default flag is supported.
+ * @param[out] symbolStatus  Optional enumeration for returned status of searching for symbol driver
+ * function based on the input hipVersion.
+ *
+ * Returns hipSuccess if the returned pfn is addressed to the pointer of found driver function.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue.
+ */
+hipError_t hipGetProcAddress(const char* symbol, void** pfn, int hipVersion, uint64_t flags,
+                             hipDriverProcAddressQueryResult* symbolStatus);
+
+/**
+ *  @brief Copies data to the given symbol on the device.
+ * Symbol HIP APIs allow a kernel to define a device-side data symbol which can be accessed on
+ * the host side. The symbol can be in __constant or device space.
+ * Note that the symbol name needs to be encased in the HIP_SYMBOL macro.
+ * This also applies to hipMemcpyFromSymbol, hipGetSymbolAddress, and hipGetSymbolSize.
+ * For detailed usage, see the
+ * <a
+ * href="https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/hip_porting_guide.html#memcpytosymbol">memcpyToSymbol
+ * example</a> in the HIP Porting Guide.
+ *
+ *
+ *  @param[out]  symbol  pointer to the device symbol
+ *  @param[in]   src  pointer to the source address
+ *  @param[in]   sizeBytes  size in bytes to copy
+ *  @param[in]   offset  offset in bytes from start of symbol
+ *  @param[in]   kind  type of memory transfer
+ *
+ *  @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipMemcpyToSymbol(const void* symbol, const void* src, size_t sizeBytes,
+                             size_t offset __dparm(0),
+                             hipMemcpyKind kind __dparm(hipMemcpyHostToDevice));
+
+/**
+ *  @brief Copies data to the given symbol on the device asynchronously.
+ *
+ *  @param[out]  symbol  pointer to the device symbol
+ *  @param[in]   src  pointer to the source address
+ *  @param[in]   sizeBytes  size in bytes to copy
+ *  @param[in]   offset  offset in bytes from start of symbol
+ *  @param[in]   kind  type of memory transfer
+ *  @param[in]   stream  stream identifier
+ *
+ *  @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src, size_t sizeBytes,
+                                  size_t offset, hipMemcpyKind kind, hipStream_t stream __dparm(0));
+
+/**
+ *  @brief Copies data from the given symbol on the device.
+ *
+ *  @param[out]  dst  Returns pointer to destination memory address
+ *  @param[in]   symbol  Pointer to the symbol address on the device
+ *  @param[in]   sizeBytes  Size in bytes to copy
+ *  @param[in]   offset  Offset in bytes from the start of symbol
+ *  @param[in]   kind  Type of memory transfer
+ *
+ *  @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipMemcpyFromSymbol(void* dst, const void* symbol, size_t sizeBytes,
+                               size_t offset __dparm(0),
+                               hipMemcpyKind kind __dparm(hipMemcpyDeviceToHost));
+
+/**
+ *  @brief Copies data from the given symbol on the device asynchronously.
+ *
+ *  @param[out]  dst  Returns pointer to destination memory address
+ *  @param[in]   symbol  pointer to the symbol address on the device
+ *  @param[in]   sizeBytes  size in bytes to copy
+ *  @param[in]   offset  offset in bytes from the start of symbol
+ *  @param[in]   kind  type of memory transfer
+ *  @param[in]   stream  stream identifier
+ *
+ *  @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbol, size_t sizeBytes, size_t offset,
+                                    hipMemcpyKind kind, hipStream_t stream __dparm(0));
+/**
+ *  @brief Copies data from src to dst asynchronously.
+ *
+ *  The copy is always performed by the device associated with the specified stream.
+ *
+ *  For multi-gpu or peer-to-peer configurations, it is recommended to use a stream which is
+ * attached to the device where the src data is physically located.
+ *  For optimal peer-to-peer copies, the copy device must be able to access the src and dst
+ * pointers (by calling hipDeviceEnablePeerAccess) with copy agent as the current device and
+ * src/dest as the peerDevice argument. If enabling device peer access is not done, the memory copy
+ * will still work, but will perform the copy using a staging buffer on the host.
+ *
+ *  @note If host or dst are not pinned, the memory copy will be performed synchronously. For
+ * best performance, use hipHostMalloc to allocate host memory that is transferred asynchronously.
+ *
+ *  @param[out] dst Data being copied to
+ *  @param[in]  src Data being copied from
+ *  @param[in]  sizeBytes Data size in bytes
+ *  @param[in]  kind  Type of memory transfer
+ *  @param[in]  stream  Stream identifier
+ *  @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorUnknown
+ *
+ *  @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray,
+ * hipMemcpy2DFromArray, hipMemcpyArrayToArray, hipMemcpy2DArrayToArray, hipMemcpyToSymbol,
+ * hipMemcpyFromSymbol, hipMemcpy2DAsync, hipMemcpyToArrayAsync, hipMemcpy2DToArrayAsync,
+ * hipMemcpyFromArrayAsync, hipMemcpy2DFromArrayAsync, hipMemcpyToSymbolAsync,
+ * hipMemcpyFromSymbolAsync
+ */
+hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
+                          hipStream_t stream __dparm(0));
+/**
+ *  @brief Fills the first sizeBytes bytes of the memory area pointed to by dst with the constant
+ * byte value value.
+ *
+ *  @param[out] dst  Data being filled
+ *  @param[in]  value  Value to be set
+ *  @param[in]  sizeBytes  Data size in bytes
+ *  @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ */
+hipError_t hipMemset(void* dst, int value, size_t sizeBytes);
+/**
+ *  @brief Fills the first count bytes of the memory area pointed to by dest with the constant
+ * byte value value.
+ *
+ *  @param[out] dest  Data ptr to be filled
+ *  @param[in]  value  Value to be set
+ *  @param[in]  count  Number of values to be set
+ *  @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ */
+hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t count);
+/**
+ *  @brief Fills the first count bytes of the memory area pointed to by dest with the constant
+ * byte value value.
+ *
+ * hipMemsetD8Async() is asynchronous with respect to the host, so the call may return before the
+ * memset is complete. The operation can optionally be associated to a stream by passing a non-zero
+ * stream argument. If stream is non-zero, the operation may overlap with operations in other
+ * streams.
+ *
+ *  @param[out] dest  Data ptr to be filled
+ *  @param[in]  value  Constant value to be set
+ *  @param[in]  count  Number of values to be set
+ *  @param[in]  stream  Stream identifier
+ *  @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ */
+hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t count,
+                            hipStream_t stream __dparm(0));
+/**
+ *  @brief Fills the first count 16-bit values of the memory area pointed to by dest with the
+ * constant short value value.
+ *
+ *  @param[out] dest  Data ptr to be filled
+ *  @param[in]  value  Constant value to be set
+ *  @param[in]  count  Number of values to be set
+ *  @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ */
+hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t count);
+/**
+ *  @brief Fills the first count 16-bit values of the memory area pointed to by dest with the
+ * constant short value value.
+ *
+ * hipMemsetD16Async() is asynchronous with respect to the host, so the call may return before the
+ * memset is complete. The operation can optionally be associated to a stream by passing a non-zero
+ * stream argument. If stream is non-zero, the operation may overlap with operations in other
+ * streams.
+ *
+ *  @param[out] dest  Data ptr to be filled
+ *  @param[in]  value  Constant value to be set
+ *  @param[in]  count  Number of values to be set
+ *  @param[in]  stream  Stream identifier
+ *  @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ */
+hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t count,
+                             hipStream_t stream __dparm(0));
+/**
+ *  @brief Fills the memory area pointed to by dest with the constant integer
+ * value for specified number of times.
+ *
+ *  @param[out] dest  Data being filled
+ *  @param[in]  value  Constant value to be set
+ *  @param[in]  count  Number of values to be set
+ *  @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ */
+hipError_t hipMemsetD32(hipDeviceptr_t dest, int value, size_t count);
+/**
+ *  @brief Fills the first sizeBytes bytes of the memory area pointed to by dst with the constant
+ * byte value value.
+ *
+ * hipMemsetAsync() is asynchronous with respect to the host, so the call may return before the
+ * memset is complete. The operation can optionally be associated to a stream by passing a non-zero
+ * stream argument. If stream is non-zero, the operation may overlap with operations in other
+ * streams.
+ *
+ *  @param[out] dst Pointer to device memory
+ *  @param[in]  value  Value to set for each byte of specified memory
+ *  @param[in]  sizeBytes  Size in bytes to set
+ *  @param[in]  stream  Stream identifier
+ *  @return #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t stream __dparm(0));
+/**
+ *  @brief Fills the memory area pointed to by dst with the constant integer
+ * value for specified number of times.
+ *
+ *  hipMemsetD32Async() is asynchronous with respect to the host, so the call may return before the
+ * memset is complete. The operation can optionally be associated to a stream by passing a non-zero
+ * stream argument. If stream is non-zero, the operation may overlap with operations in other
+ * streams.
+ *
+ *  @param[out] dst Pointer to device memory
+ *  @param[in]  value  Constant value to be set
+ *  @param[in]  count  Number of values to be set
+ *  @param[in]  stream  Stream identifier
+ *  @return #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipMemsetD32Async(hipDeviceptr_t dst, int value, size_t count,
+                             hipStream_t stream __dparm(0));
+/**
+ *  @brief Fills the memory area pointed to by dst with the constant value.
+ *
+ *  @param[out] dst Pointer to 2D device memory
+ *  @param[in]  pitch  Pitch size in bytes of 2D device memory, unused if height equals 1
+ *  @param[in]  value  Constant value to set for each byte of specified memory
+ *  @param[in]  width  Width size in bytes in 2D memory
+ *  @param[in]  height  Height size in bytes in 2D memory
+ *  @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t height);
+/**
+ *  @brief Fills asynchronously the memory area pointed to by dst with the constant value.
+ *
+ *  @param[out] dst Pointer to 2D device memory
+ *  @param[in]  pitch  Pitch size in bytes of 2D device memory, unused if height equals 1
+ *  @param[in]  value  Value to set for each byte of specified memory
+ *  @param[in]  width  Width size in bytes in 2D memory
+ *  @param[in]  height  Height size in bytes in 2D memory
+ *  @param[in]  stream  Stream identifier
+ *  @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipMemset2DAsync(void* dst, size_t pitch, int value, size_t width, size_t height,
+                            hipStream_t stream __dparm(0));
+/**
+ *  @brief Fills synchronously the memory area pointed to by pitchedDevPtr with the constant value.
+ *
+ *  @param[in] pitchedDevPtr  Pointer to pitched device memory
+ *  @param[in]  value  Value to set for each byte of specified memory
+ *  @param[in]  extent  Size parameters for width field in bytes in device memory
+ *  @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipMemset3D(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent);
+/**
+ *  @brief Fills asynchronously the memory area pointed to by pitchedDevPtr with the constant value.
+ *
+ *  @param[in] pitchedDevPtr  Pointer to pitched device memory
+ *  @param[in]  value  Value to set for each byte of specified memory
+ *  @param[in]  extent  Size parameters for width field in bytes in device memory
+ *  @param[in]  stream  Stream identifier
+ *  @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipMemset3DAsync(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent,
+                            hipStream_t stream __dparm(0));
+
+/**
+ *  @brief Fills 2D memory range of 'width' 8-bit values synchronously to the specified char value.
+ * Height specifies the number of rows to set and dstPitch specifies the number of bytes between each
+ * row.
+ *  @param[in] dst       Pointer to device memory
+ *  @param[in] dstPitch  Pitch of dst device pointer
+ *  @param[in] value     value to set
+ *  @param[in] width     Width of row
+ *  @param[in] height    Number of rows
+ *  @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipMemsetD2D8(hipDeviceptr_t dst, size_t dstPitch, unsigned char value, size_t width,
+                         size_t height);
+/**
+ *  @brief Fills 2D memory range of 'width' 8-bit values asynchronously to the specified char value.
+ * Height specifies the number of rows to set and dstPitch specifies the number of bytes between each
+ * row.
+ *  @param[in] dst       Pointer to device memory
+ *  @param[in] dstPitch  Pitch of dst device pointer
+ *  @param[in] value     value to set
+ *  @param[in] width     Width of row
+ *  @param[in] height    Number of rows
+ *  @param[in] stream    Stream Identifier
+ *  @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipMemsetD2D8Async(hipDeviceptr_t dst, size_t dstPitch, unsigned char value,
+                              size_t width, size_t height, hipStream_t stream __dparm(0));
+
+/**
+ *  @brief Fills 2D memory range of 'width' 16-bit values synchronously to the specified short
+ * value. Height specifies the number of rows to set and dstPitch specifies the number of bytes
+ * between each row.
+ *  @param[in] dst       Pointer to device memory
+ *  @param[in] dstPitch  Pitch of dst device pointer
+ *  @param[in] value     value to set
+ *  @param[in] width     Width of row
+ *  @param[in] height    Number of rows
+ *  @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipMemsetD2D16(hipDeviceptr_t dst, size_t dstPitch, unsigned short value, size_t width,
+                          size_t height);
+/**
+ *  @brief Fills 2D memory range of 'width' 16-bit values asynchronously to the specified short
+ * value. Height specifies the number of rows to set and dstPitch specifies the number of bytes
+ * between each row.
+ *  @param[in] dst       Pointer to device memory
+ *  @param[in] dstPitch  Pitch of dst device pointer
+ *  @param[in] value     value to set
+ *  @param[in] width     Width of row
+ *  @param[in] height    Number of rows
+ *  @param[in] stream    Stream Identifier
+ *  @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipMemsetD2D16Async(hipDeviceptr_t dst, size_t dstPitch, unsigned short value,
+                               size_t width, size_t height, hipStream_t stream __dparm(0));
+/**
+ *  @brief Fills 2D memory range of 'width' 32-bit values synchronously to the specified int value.
+ * Height specifies the number of rows to set and dstPitch specifies the number of bytes between each
+ * row.
+ *  @param[in] dst       Pointer to device memory
+ *  @param[in] dstPitch  Pitch of dst device pointer
+ *  @param[in] value     value to set
+ *  @param[in] width     Width of row
+ *  @param[in] height    Number of rows
+ *  @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipMemsetD2D32(hipDeviceptr_t dst, size_t dstPitch, unsigned int value, size_t width,
+                          size_t height);
+/**
+ *  @brief Fills 2D memory range of 'width' 32-bit values asynchronously to the specified int
+ * value. Height specifies the number of rows to set and dstPitch specifies the number of bytes
+ * between each row.
+ *  @param[in] dst       Pointer to device memory
+ *  @param[in] dstPitch  Pitch of dst device pointer
+ *  @param[in] value     value to set
+ *  @param[in] width     Width of row
+ *  @param[in] height    Number of rows
+ *  @param[in] stream    Stream Identifier
+ *  @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipMemsetD2D32Async(hipDeviceptr_t dst, size_t dstPitch, unsigned int value,
+                               size_t width, size_t height, hipStream_t stream __dparm(0));
+
+/**
+ * @brief Query memory info.
+ *
+ * On ROCM, this function gets the actual free memory left on the current device, so supports
+ * the cases while running multi-workload (such as multiple processes, multiple threads, and
+ * multiple GPUs).
+ *
+ * @warning On Windows, the free memory only accounts for memory allocated by this process and may
+ * be optimistic.
+ *
+ * @param[out] free Returns free memory on the current device in bytes
+ * @param[out] total Returns total allocatable memory on the current device in bytes
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ *
+ **/
+hipError_t hipMemGetInfo(size_t* free, size_t* total);
+
+/**
+ * @brief Get allocated memory size via memory pointer.
+ *
+ * This function gets the allocated shared virtual memory size from memory pointer.
+ *
+ * @param[in] ptr Pointer to allocated memory
+ * @param[out] size Returns the allocated memory size in bytes
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ **/
+hipError_t hipMemPtrGetInfo(void* ptr, size_t* size);
+/**
+ *  @brief Allocate an array on the device.
+ *
+ *  @param[out]  array  Pointer to allocated array in device memory
+ *  @param[in]   desc   Requested channel format
+ *  @param[in]   width  Requested array allocation width
+ *  @param[in]   height Requested array allocation height
+ *  @param[in]   flags  Requested properties of allocated array
+ *  @returns     #hipSuccess, #hipErrorOutOfMemory
+ *
+ *  @see hipMalloc, hipMallocPitch, hipFree, hipFreeArray, hipHostMalloc, hipHostFree
+ */
+hipError_t hipMallocArray(hipArray_t* array, const hipChannelFormatDesc* desc, size_t width,
+                          size_t height __dparm(0), unsigned int flags __dparm(hipArrayDefault));
+/**
+ *  @brief Create an array memory pointer on the device.
+ *
+ *  @param[out]  pHandle  Pointer to the array memory
+ *  @param[in]   pAllocateArray   Requested array descriptor
+ *
+ *  @returns     #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ *  @see hipMallocArray, hipArrayDestroy, hipFreeArray
+ */
+hipError_t hipArrayCreate(hipArray_t* pHandle, const HIP_ARRAY_DESCRIPTOR* pAllocateArray);
+/**
+ *  @brief Destroy an array memory pointer on the device.
+ *
+ *  @param[in]  array  Pointer to the array memory
+ *
+ *  @returns     #hipSuccess, #hipErrorInvalidValue
+ *
+ *  @see hipArrayCreate, hipArrayDestroy, hipFreeArray
+ */
+hipError_t hipArrayDestroy(hipArray_t array);
+/**
+ *  @brief Create a 3D array memory pointer on the device.
+ *
+ *  @param[out]  array  Pointer to the 3D array memory
+ *  @param[in]   pAllocateArray   Requested array descriptor
+ *
+ *  @returns     #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ *  @see hipMallocArray, hipArrayDestroy, hipFreeArray
+ */
+hipError_t hipArray3DCreate(hipArray_t* array, const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray);
+/**
+ *  @brief Create a 3D memory pointer on the device.
+ *
+ *  @param[out]  pitchedDevPtr  Pointer to the 3D memory
+ *  @param[in]   extent   Requested extent
+ *
+ *  @returns     #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ *  @see hipMallocPitch, hipMemGetInfo, hipFree
+ */
+hipError_t hipMalloc3D(hipPitchedPtr* pitchedDevPtr, hipExtent extent);
+/**
+ *  @brief Frees an array on the device.
+ *
+ *  @param[in]  array  Pointer to array to free
+ *  @returns    #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ *
+ *  @see hipMalloc, hipMallocPitch, hipFree, hipMallocArray, hipHostMalloc, hipHostFree
+ */
+hipError_t hipFreeArray(hipArray_t array);
+/**
+ *  @brief Allocate an array on the device.
+ *
+ *  @param[out]  array  Pointer to allocated array in device memory
+ *  @param[in]   desc   Requested channel format
+ *  @param[in]   extent Requested array allocation width, height and depth
+ *  @param[in]   flags  Requested properties of allocated array
+ *  @returns     #hipSuccess, #hipErrorOutOfMemory
+ *
+ *  @see hipMalloc, hipMallocPitch, hipFree, hipFreeArray, hipHostMalloc, hipHostFree
+ */
+hipError_t hipMalloc3DArray(hipArray_t* array, const struct hipChannelFormatDesc* desc,
+                            struct hipExtent extent, unsigned int flags);
+/**
+ * @brief Gets info about the specified array
+ *
+ * @param[out] desc   - Returned array type
+ * @param[out] extent - Returned array shape. 2D arrays will have depth of zero
+ * @param[out] flags  - Returned array flags
+ * @param[in]  array  - The HIP array to get info for
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidHandle
+ *
+ * @see hipArrayGetDescriptor, hipArray3DGetDescriptor
+ */
+hipError_t hipArrayGetInfo(hipChannelFormatDesc* desc, hipExtent* extent, unsigned int* flags,
+                           hipArray_t array);
+/**
+ * @brief Gets a 1D or 2D array descriptor
+ *
+ * @param[out] pArrayDescriptor - Returned array descriptor
+ * @param[in]  array            - Array to get descriptor of
+ *
+ * @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue, #hipErrorInvalidHandle
+ *
+ * @see hipArray3DCreate, hipArray3DGetDescriptor, hipArrayCreate, hipArrayDestroy, hipMemAlloc,
+ * hipMemAllocHost, hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned,
+ * hipMemcpy3D, hipMemcpy3DAsync, hipMemcpyAtoA, hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync,
+ * hipMemcpyDtoA, hipMemcpyDtoD, hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync,
+ * hipMemcpyHtoA, hipMemcpyHtoAAsync, hipMemcpyHtoD, hipMemcpyHtoDAsync, hipMemFree,
+ * hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo, hipMemHostAlloc,
+ * hipMemHostGetDevicePointer, hipMemsetD8, hipMemsetD16, hipMemsetD32, hipArrayGetInfo
+ */
+hipError_t hipArrayGetDescriptor(HIP_ARRAY_DESCRIPTOR* pArrayDescriptor, hipArray_t array);
+/**
+ * @brief Gets a 3D array descriptor
+ *
+ * @param[out] pArrayDescriptor - Returned 3D array descriptor
+ * @param[in]  array            - 3D array to get descriptor of
+ *
+ * @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue, #hipErrorInvalidHandle, #hipErrorContextIsDestroyed
+ *
+ * @see hipArray3DCreate, hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc,
+ * hipMemAllocHost, hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned,
+ * hipMemcpy3D, hipMemcpy3DAsync, hipMemcpyAtoA, hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync,
+ * hipMemcpyDtoA, hipMemcpyDtoD, hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync,
+ * hipMemcpyHtoA, hipMemcpyHtoAAsync, hipMemcpyHtoD, hipMemcpyHtoDAsync, hipMemFree,
+ * hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo, hipMemHostAlloc,
+ * hipMemHostGetDevicePointer, hipMemsetD8, hipMemsetD16, hipMemsetD32, hipArrayGetInfo
+ */
+hipError_t hipArray3DGetDescriptor(HIP_ARRAY3D_DESCRIPTOR* pArrayDescriptor, hipArray_t array);
+/**
+ *  @brief Copies data between host and device.
+ *
+ * hipMemcpy2D supports memory matrix copy from the pointed area src to the pointed area dst.
+ * The copy direction is defined by kind which must be one of #hipMemcpyHostToDevice,
+ * #hipMemcpyDeviceToHost, #hipMemcpyDeviceToDevice or #hipMemcpyDefault.
+ * Device to Device copies don't need to wait for host synchronization.
+ * The copy is executed on the default null stream. The src and dst must not overlap.
+ * dpitch and spitch are the widths in bytes in memory matrix, width cannot exceed dpitch or
+ * spitch.
+ *
+ * For hipMemcpy2D, the copy is always performed by the current device (set by hipSetDevice).
+ * For multi-gpu or peer-to-peer configurations, it is recommended to set the current device to the
+ * device where the src data is physically located. For optimal peer-to-peer copies, the copy device
+ * must be able to access the src and dst pointers (by calling hipDeviceEnablePeerAccess) with
+ * copy agent as the current device and src/dst as the peerDevice argument. If this is not done,
+ * hipMemcpy2D will still work, but will perform the copy using a staging buffer on the host.
+ *
+ *  @warning  Calling hipMemcpy2D with dst and src pointers that do not match the hipMemcpyKind
+ * results in undefined behavior.
+ *
+ *  @param[out]  dst    Destination memory address
+ *  @param[in]   dpitch Pitch size in bytes of destination memory
+ *  @param[in]   src    Source memory address
+ *  @param[in]   spitch Pitch size in bytes of source memory
+ *  @param[in]   width  Width size in bytes of matrix transfer (columns)
+ *  @param[in]   height Height size in bytes of matrix transfer (rows)
+ *  @param[in]   kind   Type of transfer
+ *  @returns     #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
+                       size_t height, hipMemcpyKind kind);
+/**
+ *  @brief Copies memory for 2D arrays.
+ *  @param[in]   pCopy Parameters for the memory copy
+ *  @returns     #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ *  #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray,
+ * hipMemcpyToSymbol, hipMemcpyAsync
+ */
+hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy);
+/**
+ *  @brief Copies memory for 2D arrays.
+ *  @param[in]   pCopy Parameters for the memory copy
+ *  @param[in]   stream Stream to use
+ *  @returns     #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray,
+ * hipMemcpyToSymbol, hipMemcpyAsync
+ */
+hipError_t hipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy, hipStream_t stream __dparm(0));
+/**
+ *  @brief Copies data between host and device asynchronously.
+ *
+ *  hipMemcpy2DAsync supports memory matrix copy from the pointed area src to the pointed area dst.
+ * The copy direction is defined by kind which must be one of #hipMemcpyHostToDevice,
+ * #hipMemcpyDeviceToHost, #hipMemcpyDeviceToDevice or #hipMemcpyDefault.
+ * dpitch and spitch are the widths in bytes for memory matrix corresponds to dst and src.
+ * width cannot exceed dpitch or spitch.
+ *
+ * The copy is always performed by the device associated with the specified stream.
+ * The API is asynchronous with respect to the host, so the call may return before the copy is
+ * complete. The copy can optionally be executed in a specific stream by passing a non-zero stream
+ * argument, for HostToDevice or DeviceToHost copies, the copy can overlap with operations
+ * in other streams.
+ *
+ * For multi-gpu or peer-to-peer configurations, it is recommended to use a stream which is
+ * attached to the device where the src data is physically located.
+ *
+ * For optimal peer-to-peer copies, the copy device must be able to access the src and dst pointers
+ * (by calling hipDeviceEnablePeerAccess) with copy agent as the current device and src/dst as the
+ * peerDevice argument. If enabling device peer access is not done, the API will still work, but
+ * will perform the copy using a staging buffer on the host.
+ *
+ *  @note If host or dst are not pinned, the memory copy will be performed synchronously.  For
+ * best performance, use hipHostMalloc to allocate host memory that is transferred asynchronously.
+ *
+ *  @param[in]   dst    Pointer to destination memory address
+ *  @param[in]   dpitch Pitch size in bytes of destination memory
+ *  @param[in]   src    Pointer to source memory address
+ *  @param[in]   spitch Pitch size in bytes of source memory
+ *  @param[in]   width  Width of matrix transfer (columns in bytes)
+ *  @param[in]   height Height of matrix transfer (rows)
+ *  @param[in]   kind   Type of transfer
+ *  @param[in]   stream Stream to use
+ *  @returns     #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
+                            size_t height, hipMemcpyKind kind, hipStream_t stream __dparm(0));
+/**
+ *  @brief Copies data between host and device.
+ *
+ *  @param[in]   dst     Destination memory address
+ *  @param[in]   wOffset Destination starting X offset
+ *  @param[in]   hOffset Destination starting Y offset
+ *  @param[in]   src     Source memory address
+ *  @param[in]   spitch  Pitch of source memory
+ *  @param[in]   width   Width of matrix transfer (columns in bytes)
+ *  @param[in]   height  Height of matrix transfer (rows)
+ *  @param[in]   kind    Type of transfer
+ *  @returns     #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpyToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpy2DToArray(hipArray_t dst, size_t wOffset, size_t hOffset, const void* src,
+                              size_t spitch, size_t width, size_t height, hipMemcpyKind kind);
+/**
+ *  @brief Copies data between host and device asynchronously.
+ *
+ *  @param[in]   dst     Destination memory address
+ *  @param[in]   wOffset Destination starting X offset
+ *  @param[in]   hOffset Destination starting Y offset
+ *  @param[in]   src     Source memory address
+ *  @param[in]   spitch  Pitch of source memory
+ *  @param[in]   width   Width of matrix transfer (columns in bytes)
+ *  @param[in]   height  Height of matrix transfer (rows)
+ *  @param[in]   kind    Type of transfer
+ *  @param[in]   stream  Stream on which the copy is enqueued
+ *  @returns     #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpyToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpy2DToArrayAsync(hipArray_t dst, size_t wOffset, size_t hOffset, const void* src,
+                                   size_t spitch, size_t width, size_t height, hipMemcpyKind kind,
+                                   hipStream_t stream __dparm(0));
+/**
+ *  @brief Copies data between host and device.
+ *
+ *  @param[in]   dst Destination memory address
+ *  @param[in]   wOffsetDst Destination starting X offset
+ *  @param[in]   hOffsetDst Destination starting Y offset
+ *  @param[in]   src  Source memory address
+ *  @param[in]   wOffsetSrc Source starting X offset
+ *  @param[in]   hOffsetSrc Source starting Y offset (rows)
+ *  @param[in]   width  Width of matrix transfer (columns in bytes)
+ *  @param[in]   height  Height of matrix transfer (rows)
+ *  @param[in]   kind Type of transfer
+ *
+ *  @returns     #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpyToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpy2DArrayToArray(hipArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
+                                   hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc,
+                                   size_t width, size_t height, hipMemcpyKind kind);
+/**
+ *  @brief Copies data between host and device [Deprecated]
+ *
+ *  @ingroup MemoryD
+ *
+ *  @param[in]   dst     Destination memory address
+ *  @param[in]   wOffset Destination starting X offset
+ *  @param[in]   hOffset Destination starting Y offset
+ *  @param[in]   src     Source memory address
+ *  @param[in]   count   size in bytes to copy
+ *  @param[in]   kind    Type of transfer
+ *  @returns     #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ *  hipMemcpyAsync
+ *  @warning  This API is deprecated.
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipMemcpyToArray(hipArray_t dst, size_t wOffset, size_t hOffset, const void* src,
+                            size_t count, hipMemcpyKind kind);
+/**
+ *  @brief Copies data between host and device [Deprecated]
+ *
+ *  @ingroup MemoryD
+ *
+ *  @param[in]   dst       Destination memory address
+ *  @param[in]   srcArray  Source memory address
+ *  @param[in]   wOffset   Source starting X offset
+ *  @param[in]   hOffset   Source starting Y offset
+ *  @param[in]   count     Size in bytes to copy
+ *  @param[in]   kind      Type of transfer
+ *  @returns     #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ * @warning  This API is deprecated.
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, size_t wOffset, size_t hOffset,
+                              size_t count, hipMemcpyKind kind);
+/**
+ *  @brief Copies data between host and device.
+ *
+ *  @param[in]   dst       Destination memory address
+ *  @param[in]   dpitch    Pitch of destination memory
+ *  @param[in]   src       Source memory address
+ *  @param[in]   wOffset   Source starting X offset
+ *  @param[in]   hOffset   Source starting Y offset
+ *  @param[in]   width     Width of matrix transfer (columns in bytes)
+ *  @param[in]   height    Height of matrix transfer (rows)
+ *  @param[in]   kind      Type of transfer
+ *  @returns     #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpy2DFromArray(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffset,
+                                size_t hOffset, size_t width, size_t height, hipMemcpyKind kind);
+/**
+ *  @brief Copies data between host and device asynchronously.
+ *
+ *  @param[in]   dst       Destination memory address
+ *  @param[in]   dpitch    Pitch of destination memory
+ *  @param[in]   src       Source memory address
+ *  @param[in]   wOffset   Source starting X offset
+ *  @param[in]   hOffset   Source starting Y offset
+ *  @param[in]   width     Width of matrix transfer (columns in bytes)
+ *  @param[in]   height    Height of matrix transfer (rows)
+ *  @param[in]   kind      Type of transfer
+ *  @param[in]   stream    Accelerator view which the copy is being enqueued
+ *  @returns     #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpy2DFromArrayAsync(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffset,
+                                     size_t hOffset, size_t width, size_t height,
+                                     hipMemcpyKind kind, hipStream_t stream __dparm(0));
+/**
+ *  @brief Copies data between host and device.
+ *
+ *  @param[in]   dst       Destination memory address
+ *  @param[in]   srcArray  Source array
+ *  @param[in]   srcOffset Offset in bytes of source array
+ *  @param[in]   count     Size of memory copy in bytes
+ *  @returns     #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpyAtoH(void* dst, hipArray_t srcArray, size_t srcOffset, size_t count);
+/**
+ *  @brief Copies data between host and device.
+ *
+ *  @param[in]   dstArray   Destination memory address
+ *  @param[in]   dstOffset  Offset in bytes of destination array
+ *  @param[in]   srcHost    Source host pointer
+ *  @param[in]   count      Size of memory copy in bytes
+ *  @returns     #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpyHtoA(hipArray_t dstArray, size_t dstOffset, const void* srcHost, size_t count);
+/**
+ *  @brief Copies data between host and device.
+ *
+ *  @param[in]   p   3D memory copy parameters
+ *  @returns     #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpy3D(const struct hipMemcpy3DParms* p);
+/**
+ *  @brief Copies data between host and device asynchronously.
+ *
+ *  @param[in]   p        3D memory copy parameters
+ *  @param[in]   stream   Stream to use
+ *  @returns     #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpy3DAsync(const struct hipMemcpy3DParms* p, hipStream_t stream __dparm(0));
+/**
+ *  @brief Copies data between host and device.
+ *
+ *  @param[in]   pCopy   3D memory copy parameters
+ *  @returns     #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ *  #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipDrvMemcpy3D(const HIP_MEMCPY3D* pCopy);
+/**
+ *  @brief Copies data between host and device asynchronously.
+ *
+ *  @param[in]   pCopy    3D memory copy parameters
+ *  @param[in]   stream   Stream to use
+ *  @returns     #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ *  #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipDrvMemcpy3DAsync(const HIP_MEMCPY3D* pCopy, hipStream_t stream);
+/**
+ * @brief Get information on memory allocations.
+ *
+ * @param [out] pbase - Base pointer address
+ * @param [out] psize - Size of allocation
+ * @param [in]  dptr  - Device pointer
+ *
+ * @returns #hipSuccess, #hipErrorNotFound
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+hipError_t hipMemGetAddressRange(hipDeviceptr_t* pbase, size_t* psize, hipDeviceptr_t dptr);
+
+/**
+ * @brief Perform Batch of 1D copies
+ *
+ * @param [in] dsts      - Array of destination pointers
+ * @param [in] srcs      - Array of source pointers.
+ * @param [in] sizes     - Array of sizes for memcpy operations
+ * @param [in] count     - Size of dsts, srcs and sizes arrays
+ * @param [in] attrs     - Array of memcpy attributes (not supported)
+ * @param [in] attrsIdxs - Array of indices to map attrs to copies (not supported)
+ * @param [in] numAttrs  - Size of attrs and attrsIdxs arrays (not supported)
+ * @param [out] failIdx  - Pointer to a location to return failure index inside the batch
+ * @param [in] stream    - stream used to enqueue operations in.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipMemcpyBatchAsync(void** dsts, void** srcs, size_t* sizes, size_t count,
+                               hipMemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs,
+                               size_t* failIdx, hipStream_t stream __dparm(0));
+
+/**
+ * @brief Perform Batch of 3D copies
+ *
+ * @param [in] numOps  - Total number of memcpy operations.
+ * @param [in] opList  - Array of size numOps containing the actual memcpy operations.
+ * @param [out] failIdx - Pointer to a location to return the index of the copy where a failure
+ *                        was encountered.
+ * @param [in] flags   - Flags for future use, must be zero now.
+ * @param [in] stream  - The stream to enqueue the operations in.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipMemcpy3DBatchAsync(size_t numOps, struct hipMemcpy3DBatchOp* opList, size_t* failIdx,
+                                 unsigned long long flags, hipStream_t stream __dparm(0));
+
+/**
+ * @brief Performs 3D memory copies between devices
+ * This API is asynchronous with respect to host
+ *
+ * @param [in] p  - Parameters for memory copy
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice
+ */
+hipError_t hipMemcpy3DPeer(hipMemcpy3DPeerParms* p);
+
+/**
+ * @brief Performs 3D memory copies between devices asynchronously
+ *
+ * @param [in] p  - Parameters for memory copy
+ * @param [in] stream - Stream to enqueue operation in.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice
+ */
+hipError_t hipMemcpy3DPeerAsync(hipMemcpy3DPeerParms* p, hipStream_t stream __dparm(0));
+// doxygen end Memory
+/**
+ * @}
+ */
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup PeerToPeer PeerToPeer Device Memory Access
+ *  @{
+ *  @ingroup API
+ *  This section describes the PeerToPeer device memory access functions of HIP runtime API.
+ */
+/**
+ * @brief Determines if a device can access a peer device's memory.
+ *
+ * @param [out] canAccessPeer - Returns the peer access capability (0 or 1)
+ * @param [in] deviceId - The device accessing the peer device memory.
+ * @param [in] peerDeviceId - Peer device where memory is physically located
+ *
+ * The value of @p canAccessPeer,
+ *
+ * Returns "1" if the specified @p deviceId is capable of directly accessing memory physically
+ * located on @p peerDeviceId,
+ *
+ * Returns "0" if the specified @p deviceId is not capable of directly accessing memory physically
+ * located on @p peerDeviceId.
+ *
+ * Returns "0" if @p deviceId == @p peerDeviceId, both are valid devices,
+ * however, a device is not a peer of itself.
+ *
+ * Returns #hipErrorInvalidDevice if deviceId or peerDeviceId are not valid devices
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ *
+ */
+hipError_t hipDeviceCanAccessPeer(int* canAccessPeer, int deviceId, int peerDeviceId);
+/**
+ * @brief Enables direct access to memory allocations on a peer device.
+ *
+ * When this API is successful, all memory allocations on peer device will be mapped into the
+ * address space of the current device. In addition, any future memory allocation on the
+ * peer device will remain accessible from the current device, until the access is disabled using
+ * hipDeviceDisablePeerAccess or device is reset using hipDeviceReset.
+ *
+ * @param [in] peerDeviceId - Peer device to enable direct access to from the current device
+ * @param [in] flags - Reserved for future use, must be zero
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue,
+ * #hipErrorPeerAccessAlreadyEnabled if peer access is already enabled for this device.
+ */
+hipError_t hipDeviceEnablePeerAccess(int peerDeviceId, unsigned int flags);
+/**
+ * @brief Disables direct access to memory allocations on a peer device.
+ *
+ * If direct access to memory allocations on peer device has not been enabled yet from the current
+ * device, it returns #hipErrorPeerAccessNotEnabled.
+ *
+ * @param [in] peerDeviceId  Peer device to disable direct access to
+ *
+ * @returns #hipSuccess, #hipErrorPeerAccessNotEnabled
+ */
+hipError_t hipDeviceDisablePeerAccess(int peerDeviceId);
+
+/**
+ * @brief Copies memory between two peer accessible devices.
+ *
+ * @param [out] dst - Destination device pointer
+ * @param [in] dstDeviceId - Destination device
+ * @param [in] src - Source device pointer
+ * @param [in] srcDeviceId - Source device
+ * @param [in] sizeBytes - Size of memory copy in bytes
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice
+ */
+hipError_t hipMemcpyPeer(void* dst, int dstDeviceId, const void* src, int srcDeviceId,
+                         size_t sizeBytes);
+/**
+ * @brief Copies memory between two peer accessible devices asynchronously.
+ *
+ * @param [out] dst - Destination device pointer
+ * @param [in] dstDeviceId - Destination device
+ * @param [in] src - Source device pointer
+ * @param [in] srcDevice - Source device
+ * @param [in] sizeBytes - Size of memory copy in bytes
+ * @param [in] stream - Stream identifier
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice
+ */
+hipError_t hipMemcpyPeerAsync(void* dst, int dstDeviceId, const void* src, int srcDevice,
+                              size_t sizeBytes, hipStream_t stream __dparm(0));
+
+// doxygen end PeerToPeer
+/**
+ * @}
+ */
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup Context Context Management [Deprecated]
+ *  @{
+ *  This section describes the context management functions of HIP runtime API.
+ *
+ *  @warning
+ *
+ *  On the AMD platform, context management APIs are deprecated as there are better alternate
+ *  interfaces, such as using hipSetDevice and stream APIs to achieve the required functionality.
+ *
+ *  On the NVIDIA platform, CUDA supports the driver API that defines "Context" and "Devices" as
+ *  separate entities. Each context contains a single device, which can theoretically have multiple
+ *  contexts. HIP initially added limited support for these APIs to facilitate easy porting from
+ *  existing driver codes.
+ *
+ *  These APIs are only for equivalent driver APIs on the NVIDIA platform.
+ *
+ */
+
+/**
+ * @brief Create a context and set it as current/default context [Deprecated]
+ *
+ * @param [out] ctx  Context to create
+ * @param [in] flags  Context creation flags
+ * @param [in] device  device handle
+ *
+ * @returns #hipSuccess
+ *
+ * @see hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent, hipCtxPushCurrent,
+ * hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ *
+ * @warning  This API is deprecated on the AMD platform, only for equivalent cuCtx driver API on the
+ * NVIDIA platform.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipCtxCreate(hipCtx_t* ctx, unsigned int flags, hipDevice_t device);
+/**
+ * @brief Destroy a HIP context [Deprecated]
+ *
+ * @param [in] ctx Context to destroy
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @see hipCtxCreate, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent, hipCtxSetCurrent,
+ * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ *
+ * @warning  This API is deprecated on the AMD platform, only for equivalent cuCtx driver API on the
+ * NVIDIA platform.
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipCtxDestroy(hipCtx_t ctx);
+/**
+ * @brief Pop the current/default context and return the popped context [Deprecated]
+ *
+ * @param [out] ctx  The current context to pop
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxSetCurrent, hipCtxGetCurrent,
+ * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ *
+ * @warning  This API is deprecated on the AMD platform, only for equivalent cuCtx driver API on the
+ * NVIDIA platform.
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipCtxPopCurrent(hipCtx_t* ctx);
+/**
+ * @brief Push the context to be set as current/default context [Deprecated]
+ *
+ * @param [in] ctx  The current context to push
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ *
+ * @warning  This API is deprecated on the AMD platform, only for equivalent cuCtx driver API on the
+ * NVIDIA platform.
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipCtxPushCurrent(hipCtx_t ctx);
+/**
+ * @brief Set the passed context as current/default [Deprecated]
+ *
+ * @param [in] ctx The context to set as current
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ *
+ * @warning  This API is deprecated on the AMD platform, only for equivalent cuCtx driver API on the
+ * NVIDIA platform.
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipCtxSetCurrent(hipCtx_t ctx);
+/**
+ * @brief Get the handle of the current/default context [Deprecated]
+ *
+ * @param [out] ctx  The context to get as current
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetDevice, hipCtxGetFlags, hipCtxPopCurrent,
+ * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize
+ *
+ * @warning  This API is deprecated on the AMD platform, only for equivalent cuCtx driver API on the
+ * NVIDIA platform.
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipCtxGetCurrent(hipCtx_t* ctx);
+/**
+ * @brief Get the handle of the device associated with current/default context [Deprecated]
+ *
+ * @param [out] device The device from the current context
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize
+ *
+ * @warning  This API is deprecated on the AMD platform, only for equivalent cuCtx driver API on the
+ * NVIDIA platform.
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipCtxGetDevice(hipDevice_t* device);
+/**
+ * @brief Returns the approximate HIP api version [Deprecated]
+ *
+ * @param [in]  ctx Context to check
+ * @param [out] apiVersion API version to get
+ *
+ * @returns #hipSuccess
+ *
+ * @warning The HIP feature set does not correspond to an exact CUDA SDK api revision.
+ * This function always sets *apiVersion to 4 as an approximation though HIP supports
+ * some features which were introduced in later CUDA SDK revisions.
+ * HIP apps code should not rely on the api revision number here and should
+ * use arch feature flags to test device capabilities or conditional compilation.
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetDevice, hipCtxGetFlags, hipCtxPopCurrent,
+ * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize
+ *
+ * @warning  This API is deprecated on the AMD platform, only for equivalent cuCtx driver API on the
+ * NVIDIA platform.
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipCtxGetApiVersion(hipCtx_t ctx, unsigned int* apiVersion);
+/**
+ * @brief Get the cache configuration [Deprecated]
+ *
+ * @param [out] cacheConfig  Cache configuration
+ *
+ * @returns #hipSuccess
+ *
+ * @warning AMD devices and some Nvidia GPUS do not support reconfigurable cache.  This hint is
+ * ignored on those architectures.
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ *
+ * @warning  This API is deprecated on the AMD platform, only for equivalent cuCtx driver API on the
+ * NVIDIA platform.
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipCtxGetCacheConfig(hipFuncCache_t* cacheConfig);
+/**
+ * @brief Set L1/Shared cache partition [Deprecated]
+ *
+ * @param [in] cacheConfig  Cache configuration to set
+ *
+ * @return #hipSuccess
+ *
+ * @warning AMD devices and some Nvidia GPUS do not support reconfigurable cache.  This hint is
+ * ignored on those architectures.
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ *
+ * @warning  This API is deprecated on the AMD platform, only for equivalent cuCtx driver API on the
+ * NVIDIA platform.
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipCtxSetCacheConfig(hipFuncCache_t cacheConfig);
+/**
+ * @brief Set Shared memory bank configuration  [Deprecated]
+ *
+ * @param [in] config  Shared memory configuration to set
+ *
+ * @return #hipSuccess
+ *
+ * @warning AMD devices and some Nvidia GPUS do not support shared cache banking, and the hint is
+ * ignored on those architectures.
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ *
+ * @warning  This API is deprecated on the AMD platform, only for equivalent cuCtx driver API on the
+ * NVIDIA platform.
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipCtxSetSharedMemConfig(hipSharedMemConfig config);
+/**
+ * @brief Get Shared memory bank configuration [Deprecated]
+ *
+ * @param [out] pConfig  Pointer of shared memory configuration
+ *
+ * @return #hipSuccess
+ *
+ * @warning AMD devices and some Nvidia GPUS do not support shared cache banking, and the hint is
+ * ignored on those architectures.
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ *
+ * @warning  This API is deprecated on the AMD platform, only for equivalent cuCtx driver API on the
+ * NVIDIA platform.
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipCtxGetSharedMemConfig(hipSharedMemConfig* pConfig);
+/**
+ * @brief Blocks until the default context has completed all preceding requested tasks [Deprecated]
+ *
+ * @return #hipSuccess
+ *
+ * @warning This function waits for all streams on the default context to complete execution, and
+ * then returns.
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxGetDevice
+ *
+ * @warning  This API is deprecated on the AMD platform, only for equivalent cuCtx driver API on the
+ * NVIDIA platform.
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipCtxSynchronize(void);
+/**
+ * @brief Return flags used for creating default context [Deprecated]
+ *
+ * @param [out] flags  Pointer of flags
+ *
+ * @returns #hipSuccess
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ *
+ * @warning  This API is deprecated on the AMD platform, only for equivalent cuCtx driver API on the
+ * NVIDIA platform.
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipCtxGetFlags(unsigned int* flags);
+/**
+ * @brief Enables direct access to memory allocations in a peer context [Deprecated]
+ *
+ * Memory which is already allocated on peer device will be mapped into the address space of the
+ * current device.  In addition, all future memory allocations on peerDeviceId will be mapped into
+ * the address space of the current device when the memory is allocated. The peer memory remains
+ * accessible from the current device until a call to hipDeviceDisablePeerAccess or hipDeviceReset.
+ *
+ *
+ * @param [in] peerCtx  Peer context
+ * @param [in] flags  flags, need to set as 0
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue,
+ * #hipErrorPeerAccessAlreadyEnabled
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ * @warning PeerToPeer support is experimental.
+ *
+ * @warning  This API is deprecated on the AMD platform, only for equivalent cuCtx driver API on the
+ * NVIDIA platform.
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipCtxEnablePeerAccess(hipCtx_t peerCtx, unsigned int flags);
+/**
+ * @brief Disable direct access from current context's virtual address space to memory allocations
+ * physically located on a peer context. Disables direct access to memory allocations in a peer
+ * context and unregisters any registered allocations [Deprecated]
+ *
+ * Returns #hipErrorPeerAccessNotEnabled if direct access to memory on peerDevice has not yet been
+ * enabled from the current device.
+ *
+ * @param [in] peerCtx  Peer context to be disabled
+ *
+ * @returns #hipSuccess, #hipErrorPeerAccessNotEnabled
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ * @warning PeerToPeer support is experimental.
+ *
+ * @warning  This API is deprecated on the AMD platform, only for equivalent cuCtx driver API on the
+ * NVIDIA platform.
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipCtxDisablePeerAccess(hipCtx_t peerCtx);
+
+/**
+ * @brief Get the state of the primary context [Deprecated]
+ *
+ * @param [in] dev  Device to get primary context flags for
+ * @param [out] flags  Pointer to store flags
+ * @param [out] active  Pointer to store context state; 0 = inactive, 1 = active
+ *
+ * @returns #hipSuccess
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ *
+ * @warning  This API is deprecated on the AMD platform, only for equivalent driver API on the
+ * NVIDIA platform.
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipDevicePrimaryCtxGetState(hipDevice_t dev, unsigned int* flags, int* active);
+/**
+ * @brief Release the primary context on the GPU [Deprecated]
+ *
+ * @param [in] dev  Device whose primary context is released
+ *
+ * @returns #hipSuccess
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ * @warning This function returns #hipSuccess though it doesn't release the primaryCtx by design on
+ * HIP/HIP-CLANG path.
+ *
+ * @warning  This API is deprecated on the AMD platform, only for equivalent driver API on the
+ * NVIDIA platform.
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipDevicePrimaryCtxRelease(hipDevice_t dev);
+/**
+ * @brief Retain the primary context on the GPU [Deprecated]
+ *
+ * @param [out] pctx  Returned context handle of the new context
+ * @param [in] dev  Device which primary context is retained
+ *
+ * @returns #hipSuccess
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ *
+ * @warning  This API is deprecated on the AMD platform, only for equivalent driver API on the
+ * NVIDIA platform.
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipDevicePrimaryCtxRetain(hipCtx_t* pctx, hipDevice_t dev);
+/**
+ * @brief Resets the primary context on the GPU [Deprecated]
+ *
+ * @param [in] dev  Device which primary context is reset
+ *
+ * @returns #hipSuccess
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ *
+ * @warning  This API is deprecated on the AMD platform, only for equivalent driver API on the
+ * NVIDIA platform.
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipDevicePrimaryCtxReset(hipDevice_t dev);
+/**
+ * @brief Set flags for the primary context [Deprecated]
+ *
+ * @param [in] dev  Device for which the primary context flags are set
+ * @param [in] flags  New flags for the device
+ *
+ * @returns #hipSuccess, #hipErrorContextAlreadyInUse
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ *
+ * @warning  This API is deprecated on the AMD platform, only for equivalent driver API on the
+ * NVIDIA platform.
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipDevicePrimaryCtxSetFlags(hipDevice_t dev, unsigned int flags);
+// doxygen end Context Management
+/**
+ * @}
+ */
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *
+ *  @defgroup Module Module Management
+ *  @{
+ *  @ingroup API
+ *  This section describes the module management functions of HIP runtime API.
+ *
+ */
+/**
+ * @brief Loads fatbin object
+ *
+ * @param [in] fatbin  fatbin to be loaded as a module
+ * @param [out] module  Module
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidContext, #hipErrorFileNotFound,
+ * #hipErrorOutOfMemory, #hipErrorSharedObjectInitFailed, #hipErrorNotInitialized
+ *
+ */
+hipError_t hipModuleLoadFatBinary(hipModule_t* module, const void* fatbin);
+/**
+ * @brief Loads code object from file into a module in the current context.
+ *
+ * @param [in] fname  Filename of code object to load
+ * @param [out] module  Module
+ *
+ * @warning File/memory resources allocated in this function are released only in hipModuleUnload.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidContext, #hipErrorFileNotFound,
+ * #hipErrorOutOfMemory, #hipErrorSharedObjectInitFailed, #hipErrorNotInitialized
+ *
+ */
+hipError_t hipModuleLoad(hipModule_t* module, const char* fname);
+/**
+ * @brief Frees the module
+ *
+ * @param [in] module  Module to free
+ *
+ * @returns #hipSuccess, #hipErrorInvalidResourceHandle
+ *
+ * The module is freed, and the code objects associated with it are destroyed.
+ */
+hipError_t hipModuleUnload(hipModule_t module);
+/**
+ * @brief Function with kname will be extracted if present in module
+ *
+ * @param [in] module  Module to get function from
+ * @param [in] kname  Pointer to the name of function
+ * @param [out] function  Pointer to function handle
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidContext, #hipErrorNotInitialized,
+ * #hipErrorNotFound
+ */
+hipError_t hipModuleGetFunction(hipFunction_t* function, hipModule_t module, const char* kname);
+
+/**
+ * @brief Returns the number of functions within a module.
+ *
+ * @param [in] mod  Module to get function count from
+ * @param [out] count  function count from module
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidContext, #hipErrorNotInitialized,
+ * #hipErrorNotFound
+ */
+hipError_t hipModuleGetFunctionCount(unsigned int* count, hipModule_t mod);
+
+/**
+ * @brief Load hip Library from in-memory object
+ *
+ * @param [out] library Output Library
+ * @param [in] code In memory object
+ * @param [in] jitOptions JIT options, CUDA only
+ * @param [in] jitOptionsValues JIT options values, CUDA only
+ * @param [in] numJitOptions Number of JIT options
+ * @param [in] libraryOptions Library options
+ * @param [in] libraryOptionValues Library options values
+ * @param [in] numLibraryOptions Number of library options
+ * @return #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipLibraryLoadData(hipLibrary_t* library, const void* code, hipJitOption** jitOptions,
+                              void** jitOptionsValues, unsigned int numJitOptions,
+                              hipLibraryOption** libraryOptions, void** libraryOptionValues,
+                              unsigned int numLibraryOptions);
+
+/**
+ * @brief Load hip Library from file
+ *
+ * @param [out] library Output Library
+ * @param [in] fileName file which contains code object
+ * @param [in] jitOptions JIT options, CUDA only
+ * @param [in] jitOptionsValues JIT options values, CUDA only
+ * @param [in] numJitOptions Number of JIT options
+ * @param [in] libraryOptions Library options
+ * @param [in] libraryOptionValues Library options values
+ * @param [in] numLibraryOptions Number of library options
+ * @return #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipLibraryLoadFromFile(hipLibrary_t* library, const char* fileName,
+                                  hipJitOption** jitOptions, void** jitOptionsValues,
+                                  unsigned int numJitOptions, hipLibraryOption** libraryOptions,
+                                  void** libraryOptionValues, unsigned int numLibraryOptions);
+
+/**
+ * @brief Unload HIP Library
+ *
+ * @param [in] library Input created hip library
+ * @return #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipLibraryUnload(hipLibrary_t library);
+
+/**
+ * @brief Get Kernel object from library
+ *
+ * @param [out] pKernel Output kernel object
+ * @param [in] library Input hip library
+ * @param [in] name kernel name to be searched for
+ * @return #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipLibraryGetKernel(hipKernel_t* pKernel, hipLibrary_t library, const char* name);
+
+/**
+ * @brief Get Kernel count in library
+ *
+ * @param [out] count Count of kernels in library
+ * @param [in] library Input created hip library
+ * @return #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipLibraryGetKernelCount(unsigned int *count, hipLibrary_t library);
+
+/**
+ * @brief Find out attributes for a given function.
+ * @ingroup Execution
+ * @param [out] attr  Attributes of function
+ * @param [in] func  Pointer to the function handle
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDeviceFunction
+ */
+hipError_t hipFuncGetAttributes(struct hipFuncAttributes* attr, const void* func);
+/**
+ * @brief Find out a specific attribute for a given function.
+ * @ingroup Execution
+ * @param [out] value  Pointer to the value
+ * @param [in]  attrib  Attributes of the given function
+ * @param [in]  hfunc  Function to get attributes from
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDeviceFunction
+ */
+hipError_t hipFuncGetAttribute(int* value, hipFunction_attribute attrib, hipFunction_t hfunc);
+/**
+ * @brief Gets pointer to device entry function that matches entry function symbolPtr.
+ *
+ * @param [out] functionPtr  Device entry function
+ * @param [in]  symbolPtr  Pointer to device entry function to search for
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDeviceFunction
+ *
+ */
+hipError_t hipGetFuncBySymbol(hipFunction_t* functionPtr, const void* symbolPtr);
+/**
+ * @brief Gets function pointer of a requested HIP API
+ *
+ * @param [in]  symbol  The API base name
+ * @param [out] funcPtr  Pointer to the requested function
+ * @param [in]  flags  Flags for the search
+ * @param [out] driverStatus  Optional returned status of the search
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGetDriverEntryPoint(const char* symbol, void** funcPtr, unsigned long long flags,
+                                  hipDriverEntryPointQueryResult* driverStatus);
+/**
+ * @brief returns the handle of the texture reference with the name from the module.
+ *
+ * @param [in] hmod  Module
+ * @param [in] name  Pointer of name of texture reference
+ * @param [out] texRef  Pointer of texture reference
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorNotFound, #hipErrorInvalidValue
+ */
+hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const char* name);
+/**
+ * @brief builds module from code object data which resides in host memory.
+ *
+ * The "image" is a pointer to the location of code object data. This data can be either
+ * a single code object or a fat binary (fatbin), which serves as the entry point for loading and
+ * launching device-specific kernel executions.
+ *
+ * By default, the following command generates a fatbin:
+ *
+ * "amdclang++ -O3 -c --offload-device-only --offload-arch=<GPU_ARCH> <input_file> -o <output_file>"
+ *
+ * For more details, refer to:
+ * <a
+ * href= "https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/kernel_language_cpp_support.html#kernel-compilation">
+ * Kernel Compilation</a> in the HIP kernel language C++ support, or
+ * <a
+ * href="https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/hip_rtc.html">HIP runtime compilation (HIP RTC)</a>.
+ *
+ * @param [in] image  The pointer to the location of data
+ * @param [out] module  Returned module
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorOutOfMemory
+ */
+hipError_t hipModuleLoadData(hipModule_t* module, const void* image);
+/**
+ * @brief builds module from code object which resides in host memory. Image is pointer to that
+ * location. Options are not used. hipModuleLoadData is called.
+ *
+ * @param [in] image  The pointer to the location of data
+ * @param [out] module  Returned module
+ * @param [in] numOptions Number of options
+ * @param [in] options Options for JIT
+ * @param [in] optionValues  Option values for JIT
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorOutOfMemory
+ */
+hipError_t hipModuleLoadDataEx(hipModule_t* module, const void* image, unsigned int numOptions,
+                               hipJitOption* options, void** optionValues);
+/**
+ * @brief Adds bitcode data to be linked with options.
+ * @param [in] state hip link state
+ * @param [in] type  Type of the input data or bitcode
+ * @param [in] data  Input data which is null terminated
+ * @param [in] size  Size of the input data
+ * @param [in] name  Optional name for this input
+ * @param [in] numOptions  Size of the options
+ * @param [in] options  Array of options applied to this input
+ * @param [in] optionValues  Array of option values cast to void*
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidHandle
+ *
+ * If adding the data fails, it will
+ * @return #hipErrorInvalidConfiguration
+ *
+ * @see hipError_t
+ */
+hipError_t hipLinkAddData(hipLinkState_t state, hipJitInputType type, void* data, size_t size,
+                          const char* name, unsigned int numOptions, hipJitOption* options,
+                          void** optionValues);
+
+/**
+ * @brief Adds a file with bitcode to be linked with options.
+ * @param [in] state hip link state
+ * @param [in] type  Type of the input data or bitcode
+ * @param [in] path  Path to the input file where bitcode is present
+ * @param [in] numOptions  Size of the options
+ * @param [in] options  Array of options applied to this input
+ * @param [in] optionValues  Array of option values cast to void*
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * If adding the file fails, it will
+ * @return #hipErrorInvalidConfiguration
+ *
+ * @see hipError_t
+ */
+hipError_t hipLinkAddFile(hipLinkState_t state, hipJitInputType type, const char* path,
+                          unsigned int numOptions, hipJitOption* options, void** optionValues);
+
+/**
+ * @brief Completes the linking of the given program.
+ * @param [in]   state hip link state
+ * @param [out]  hipBinOut  Upon success, points to the output binary
+ * @param [out]  sizeOut  Size of the binary is stored (optional)
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * If the link cannot be completed, it will
+ * @return #hipErrorInvalidConfiguration
+ *
+ * @see hipError_t
+ */
+
+hipError_t hipLinkComplete(hipLinkState_t state, void** hipBinOut, size_t* sizeOut);
+
+/**
+ * @brief Creates a linker instance with options.
+ * @param [in] numOptions  Number of options
+ * @param [in] options  Array of options
+ * @param [in] optionValues  Array of option values cast to void*
+ * @param [out] stateOut  hip link state created upon success
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidConfiguration
+ *
+ * @see hipSuccess
+ */
+hipError_t hipLinkCreate(unsigned int numOptions, hipJitOption* options, void** optionValues,
+                         hipLinkState_t* stateOut);
+/**
+ * @brief Deletes the linker instance.
+ * @param [in] state link state instance
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @see hipSuccess
+ */
+hipError_t hipLinkDestroy(hipLinkState_t state);
+
+/**
+ * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed
+ * to kernelParams or extra
+ * @ingroup Execution
+ * @param [in] f         Kernel to launch.
+ * @param [in] gridDimX  X grid dimension specified as multiple of blockDimX.
+ * @param [in] gridDimY  Y grid dimension specified as multiple of blockDimY.
+ * @param [in] gridDimZ  Z grid dimension specified as multiple of blockDimZ.
+ * @param [in] blockDimX X block dimension specified in work-items
+ * @param [in] blockDimY Y block dimension specified in work-items
+ * @param [in] blockDimZ Z block dimension specified in work-items
+ * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The
+ * HIP-Clang compiler provides support for extern shared declarations.
+ * @param [in] stream    Stream where the kernel should be dispatched.  May be 0, in which case
+ * the default stream is used with associated synchronization rules.
+ * @param [in] kernelParams  Kernel parameters to launch
+ * @param [in] extra     Pointer to kernel arguments.   These are passed directly to the kernel and
+ * must be in the memory layout and alignment expected by the kernel.
+ * All passed arguments must be naturally aligned according to their type. The memory address of
+ * each argument should be a multiple of its size in bytes. Please refer to
+ * hip_porting_driver_api.md for sample usage.
+ *
+ * Please note, HIP does not support kernel launch with total work items defined in dimension with
+ * size gridDim x blockDim >= 2^32. So gridDim.x * blockDim.x, gridDim.y * blockDim.y
+ * and gridDim.z * blockDim.z are always less than 2^32.
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue
+ */
+hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY,
+                                 unsigned int gridDimZ, unsigned int blockDimX,
+                                 unsigned int blockDimY, unsigned int blockDimZ,
+                                 unsigned int sharedMemBytes, hipStream_t stream,
+                                 void** kernelParams, void** extra);
+/** \addtogroup ModuleCooperativeG Cooperative groups kernel launch of Module management.
+ * \ingroup Module
+ *  @{ */
+/**
+ * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed
+ * to kernelParams, where thread blocks can cooperate and synchronize as they execute
+ *
+ * @param [in] f              Kernel to launch.
+ * @param [in] gridDimX       X grid dimension specified as multiple of blockDimX.
+ * @param [in] gridDimY       Y grid dimension specified as multiple of blockDimY.
+ * @param [in] gridDimZ       Z grid dimension specified as multiple of blockDimZ.
+ * @param [in] blockDimX      X block dimension specified in work-items.
+ * @param [in] blockDimY      Y block dimension specified in work-items.
+ * @param [in] blockDimZ      Z block dimension specified in work-items.
+ * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The
+ * HIP-Clang compiler provides support for extern shared declarations.
+ * @param [in] stream         Stream where the kernel should be dispatched. May be 0,
+ * in which case the default stream is used with associated synchronization rules.
+ * @param [in] kernelParams   A list of kernel arguments.
+ *
+ * Please note, HIP does not support kernel launch with total work items defined in dimension with
+ * size \f$ gridDim \cdot blockDim \geq 2^{32} \f$.
+ *
+ * @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidHandle, #hipErrorInvalidImage, #hipErrorInvalidValue,
+ * #hipErrorInvalidConfiguration, #hipErrorLaunchFailure, #hipErrorLaunchOutOfResources,
+ * #hipErrorLaunchTimeOut, #hipErrorCooperativeLaunchTooLarge, #hipErrorSharedObjectInitFailed
+ */
+hipError_t hipModuleLaunchCooperativeKernel(hipFunction_t f, unsigned int gridDimX,
+                                            unsigned int gridDimY, unsigned int gridDimZ,
+                                            unsigned int blockDimX, unsigned int blockDimY,
+                                            unsigned int blockDimZ, unsigned int sharedMemBytes,
+                                            hipStream_t stream, void** kernelParams);
+/**
+ * @brief Launches kernels on multiple devices where thread blocks can cooperate and
+ * synchronize as they execute.
+ *
+ * @param [in] launchParamsList         List of launch parameters, one per device.
+ * @param [in] numDevices               Size of the launchParamsList array.
+ * @param [in] flags                    Flags to control launch behavior.
+ *
+ * @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidHandle, #hipErrorInvalidImage, #hipErrorInvalidValue,
+ * #hipErrorInvalidConfiguration, #hipErrorInvalidResourceHandle, #hipErrorLaunchFailure,
+ * #hipErrorLaunchOutOfResources, #hipErrorLaunchTimeOut, #hipErrorCooperativeLaunchTooLarge,
+ * #hipErrorSharedObjectInitFailed
+ */
+hipError_t hipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* launchParamsList,
+                                                       unsigned int numDevices, unsigned int flags);
+/**
+ * @brief Launches kernel f with launch parameters and shared memory on stream with arguments passed
+ * to kernelParams, where thread blocks can cooperate and synchronize as they execute.
+ *
+ * @param [in] f - Kernel to launch.
+ * @param [in] gridDim - Grid dimensions specified as multiple of blockDim.
+ * @param [in] blockDimX - Block dimensions specified in work-items
+ * @param [in] kernelParams - Pointer of arguments passed to the kernel. If the kernel has multiple
+ * parameters, 'kernelParams' should be array of pointers, each points the corresponding argument.
+ * @param [in] sharedMemBytes - Amount of dynamic shared memory to allocate for this kernel. The
+ * HIP-Clang compiler provides support for extern shared declarations.
+ * @param [in] stream - Stream where the kernel should be dispatched.  May be 0, in which case
+ * the default stream is used with associated synchronization rules.
+ *
+ * Please note, HIP does not support kernel launch with total work items defined in dimension with
+ * size \f$ gridDim \cdot blockDim \geq 2^{32} \f$.
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue,
+ * #hipErrorCooperativeLaunchTooLarge
+ */
+hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDimX,
+                                      void** kernelParams, unsigned int sharedMemBytes,
+                                      hipStream_t stream);
+/**
+ * @brief Launches kernels on multiple devices where thread blocks can cooperate and
+ * synchronize as they execute.
+ *
+ * @param [in] launchParamsList         List of launch parameters, one per device.
+ * @param [in] numDevices               Size of the launchParamsList array.
+ * @param [in] flags                    Flags to control launch behavior.
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue,
+ *  #hipErrorCooperativeLaunchTooLarge
+ */
+hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
+                                                 unsigned int flags);
+
+// Doxygen end group ModuleCooperativeG
+/** @} */
+
+/**
+ * @brief Launches kernels on multiple devices and guarantees all specified kernels are dispatched
+ * on respective streams before enqueuing any other work on the specified streams from any other
+ * threads
+ * @ingroup Execution
+ * @param [in] launchParamsList          List of launch parameters, one per device.
+ * @param [in] numDevices               Size of the launchParamsList array.
+ * @param [in] flags                    Flags to control launch behavior.
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue
+ */
+hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
+                                              unsigned int flags);
+/**
+ * @brief Launches a HIP kernel using a generic function pointer and the specified configuration.
+ * @ingroup Execution
+ *
+ * This function is equivalent to hipLaunchKernelEx but accepts the kernel as a generic function
+ * pointer.
+ *
+ * @param [in] config                 Pointer to the kernel launch configuration structure.
+ * @param [in] fPtr                   Pointer to the device kernel function.
+ * @param [in] args                   Array of pointers to the kernel arguments.
+ *
+ * @returns #hipSuccess if the kernel is launched successfully, otherwise an appropriate error code.
+ */
+hipError_t hipLaunchKernelExC(const hipLaunchConfig_t* config, const void* fPtr, void** args);
+/**
+ * @brief Launches a HIP kernel using the driver API with the specified configuration.
+ * @ingroup Execution
+ *
+ * This function dispatches the device kernel represented by a HIP function object.
+ * It passes both the kernel parameters and any extra configuration arguments to the kernel launch.
+ *
+ * @param [in] config  Pointer to the kernel launch configuration structure.
+ * @param [in] f       HIP function object representing the device kernel to be launched.
+ * @param [in] params  Array of pointers to the kernel parameters.
+ * @param [in] extra   Array of pointers for additional launch parameters or extra configuration
+ * data.
+ *
+ * @returns #hipSuccess if the kernel is launched successfully, otherwise an appropriate error code.
+ */
+hipError_t hipDrvLaunchKernelEx(const HIP_LAUNCH_CONFIG* config, hipFunction_t f, void** params,
+                                void** extra);
+/**
+ * @brief Returns a handle for the address range requested.
+ *
+ * This function returns a handle to a device pointer created using either hipMalloc set of APIs
+ * or through hipMemAddressReserve (as long as the ptr is mapped).
+ *
+ * @param [out] handle     Ptr to the handle where the fd or other types will be returned.
+ * @param [in] dptr        Device ptr for which we get the handle.
+ * @param [in] size        Size of the address range.
+ * @param [in] handleType  Type of the handle requested for the address range.
+ * @param [in] flags       Any flags set regarding the handle requested.
+ *
+ * @returns #hipSuccess on success, otherwise an appropriate error code.
+ */
+hipError_t hipMemGetHandleForAddressRange(void* handle, hipDeviceptr_t dptr, size_t size,
+                                          hipMemRangeHandleType handleType,
+                                          unsigned long long flags);
+// doxygen end Module
+/**
+ * @}
+ */
+
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup Occupancy Occupancy
+ *  @{
+ *  This section describes the occupancy functions of HIP runtime API.
+ *
+ */
+/**
+ * @brief determine the grid and block sizes to achieve maximum occupancy for a kernel
+ *
+ * @param [out] gridSize           minimum grid size for maximum potential occupancy
+ * @param [out] blockSize          block size for maximum potential occupancy
+ * @param [in]  f                  kernel function for which occupancy is calculated
+ * @param [in]  dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ * @param [in]  blockSizeLimit     the maximum block size for the kernel, use 0 for no limit
+ *
+ * Please note, HIP does not support kernel launch with total work items defined in dimension with
+ * size gridDim x blockDim >= 2^32.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+// TODO - Match CUoccupancyB2DSize
+hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, hipFunction_t f,
+                                                   size_t dynSharedMemPerBlk, int blockSizeLimit);
+/**
+ * @brief determine the grid and block sizes to achieve maximum occupancy for a kernel
+ *
+ * @param [out] gridSize           minimum grid size for maximum potential occupancy
+ * @param [out] blockSize          block size for maximum potential occupancy
+ * @param [in]  f                  kernel function for which occupancy is calculated
+ * @param [in]  dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ * @param [in]  blockSizeLimit     the maximum block size for the kernel, use 0 for no limit
+ * @param [in]  flags            Extra flags for occupancy calculation (only default supported)
+ *
+ * Please note, HIP does not support kernel launch with total work items defined in dimension with
+ * size gridDim x blockDim >= 2^32.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+// TODO - Match CUoccupancyB2DSize
+hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize,
+                                                            hipFunction_t f,
+                                                            size_t dynSharedMemPerBlk,
+                                                            int blockSizeLimit, unsigned int flags);
+/**
+ * @brief Returns occupancy for a device function.
+ *
+ * @param [out] numBlocks        Returned occupancy
+ * @param [in]  f                Kernel function (hipFunction) for which occupancy is calculated
+ * @param [in]  blockSize        Block size the kernel is intended to be launched with
+ * @param [in]  dynSharedMemPerBlk Dynamic shared memory usage (in bytes) intended for each block
+ * @returns  #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, hipFunction_t f,
+                                                              int blockSize,
+                                                              size_t dynSharedMemPerBlk);
+/**
+ * @brief Returns occupancy for a device function.
+ *
+ * @param [out] numBlocks        Returned occupancy
+ * @param [in]  f                Kernel function(hipFunction_t) for which occupancy is calculated
+ * @param [in]  blockSize        Block size the kernel is intended to be launched with
+ * @param [in]  dynSharedMemPerBlk Dynamic shared memory usage (in bytes) intended for each block
+ * @param [in]  flags            Extra flags for occupancy calculation (only default supported)
+ * @returns  #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+    int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags);
+/**
+ * @brief Returns occupancy for a device function.
+ *
+ * @param [out] numBlocks        Returned occupancy
+ * @param [in]  f                Kernel function for which occupancy is calculated
+ * @param [in]  blockSize        Block size the kernel is intended to be launched with
+ * @param [in]  dynSharedMemPerBlk Dynamic shared memory usage (in bytes) intended for each block
+ * @returns  #hipSuccess, #hipErrorInvalidDeviceFunction, #hipErrorInvalidValue
+ */
+hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, const void* f,
+                                                        int blockSize, size_t dynSharedMemPerBlk);
+/**
+ * @brief Returns occupancy for a device function.
+ *
+ * @param [out] numBlocks        Returned occupancy
+ * @param [in]  f                Kernel function for which occupancy is calculated
+ * @param [in]  blockSize        Block size the kernel is intended to be launched with
+ * @param [in]  dynSharedMemPerBlk Dynamic shared memory usage (in bytes) intended for each block
+ * @param [in]  flags            Extra flags for occupancy calculation (currently ignored)
+ * @returns  #hipSuccess, #hipErrorInvalidDeviceFunction, #hipErrorInvalidValue
+ */
+hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+    int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk,
+    unsigned int flags __dparm(hipOccupancyDefault));
+/**
+ * @brief determine the grid and block sizes to achieve maximum occupancy for a kernel
+ *
+ * @param [out] gridSize           minimum grid size for maximum potential occupancy
+ * @param [out] blockSize          block size for maximum potential occupancy
+ * @param [in]  f                  kernel function for which occupancy is calculated
+ * @param [in]  dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ * @param [in]  blockSizeLimit     the maximum block size for the kernel, use 0 for no limit
+ *
+ * Please note, HIP does not support kernel launch with total work items defined in dimension with
+ * size gridDim x blockDim >= 2^32.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, const void* f,
+                                             size_t dynSharedMemPerBlk, int blockSizeLimit);
+// doxygen end Occupancy
+/**
+ * @}
+ */
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup Profiler Profiler Control [Deprecated]
+ *  @{
+ *  This section describes the profiler control functions of HIP runtime API.
+ *
+ *  @warning The cudaProfilerInitialize API format for "configFile" is not supported.
+ *
+ */
+// TODO - expand descriptions:
+/**
+ * @brief Start recording of profiling information [Deprecated]
+ * When using this API, start the profiler with profiling disabled.  (--startdisabled)
+ * @returns  #hipErrorNotSupported
+ * @warning hipProfilerStart API is deprecated, use roctracer/rocTX instead.
+ */
+HIP_DEPRECATED("use roctracer/rocTX instead")
+hipError_t hipProfilerStart();
+/**
+ * @brief Stop recording of profiling information [Deprecated]
+ * When using this API, start the profiler with profiling disabled.  (--startdisabled)
+ * @returns  #hipErrorNotSupported
+ * @warning  hipProfilerStop API is deprecated, use roctracer/rocTX instead.
+ */
+HIP_DEPRECATED("use roctracer/rocTX instead")
+hipError_t hipProfilerStop();
+// doxygen end profiler
+/**
+ * @}
+ */
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup Clang Launch API to support the triple-chevron syntax
+ *  @{
+ *  This section describes the API to support the triple-chevron syntax.
+ */
+/**
+ * @brief Configure a kernel launch.
+ *
+ * @param [in] gridDim   grid dimension specified as multiple of blockDim.
+ * @param [in] blockDim  block dimensions specified in work-items
+ * @param [in] sharedMem Amount of dynamic shared memory to allocate for this kernel. The
+ * HIP-Clang compiler provides support for extern shared declarations.
+ * @param [in] stream    Stream where the kernel should be dispatched.  May be 0, in which case the
+ * default stream is used with associated synchronization rules.
+ *
+ * Please note, HIP does not support kernel launch with total work items defined in dimension with
+ * size gridDim x blockDim >= 2^32.
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dparm(0),
+                            hipStream_t stream __dparm(0));
+/**
+ * @brief Set a kernel argument.
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue
+ *
+ * @param [in] arg    Pointer to the argument in host memory.
+ * @param [in] size   Size of the argument.
+ * @param [in] offset Offset of the argument on the argument stack.
+ *
+ */
+hipError_t hipSetupArgument(const void* arg, size_t size, size_t offset);
+/**
+ * @brief Launch a kernel.
+ *
+ * @param [in] func Kernel to launch.
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipLaunchByPtr(const void* func);
+/**
+ * @brief Push configuration of a kernel launch.
+ *
+ * @param [in] gridDim   grid dimension specified as multiple of blockDim.
+ * @param [in] blockDim  block dimensions specified in work-items
+ * @param [in] sharedMem Amount of dynamic shared memory to allocate for this kernel. The
+ * HIP-Clang compiler provides support for extern shared declarations.
+ * @param [in] stream    Stream where the kernel should be dispatched.  May be 0, in which case the
+ * default stream is used with associated synchronization rules.
+ *
+ * Please note, HIP does not support kernel launch with total work items defined in dimension with
+ * size gridDim x blockDim >= 2^32.
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue
+ *
+ */
+hipError_t __hipPushCallConfiguration(dim3 gridDim, dim3 blockDim, size_t sharedMem __dparm(0),
+                                      hipStream_t stream __dparm(0));
+/**
+ * @brief Pop configuration of a kernel launch.
+ *
+ * @param [out] gridDim   grid dimension specified as multiple of blockDim.
+ * @param [out] blockDim  block dimensions specified in work-items
+ * @param [out] sharedMem Amount of dynamic shared memory to allocate for this kernel.  The
+ * HIP-Clang compiler provides support for extern shared declarations.
+ * @param [out] stream    Stream where the kernel should be dispatched.  May be 0, in which case the
+ * default stream is used with associated synchronization rules.
+ *
+ * Please note, HIP does not support kernel launch with total work items defined in dimension with
+ * size gridDim x blockDim >= 2^32.
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue
+ *
+ */
+hipError_t __hipPopCallConfiguration(dim3* gridDim, dim3* blockDim, size_t* sharedMem,
+                                     hipStream_t* stream);
+/**
+ * @brief C compliant kernel launch API
+ *
+ * @param [in] function_address - Kernel stub function pointer.
+ * @param [in] numBlocks - Number of blocks.
+ * @param [in] dimBlocks - Dimension of a block
+ * @param [in] args - Pointer of arguments passed to the kernel. If the kernel has multiple
+ * parameters, 'args' should be array of pointers, each points the corresponding argument.
+ * @param [in] sharedMemBytes - Amount of dynamic shared memory to allocate for this kernel. The
+ * HIP-Clang compiler provides support for extern shared declarations.
+ * @param [in] stream - Stream where the kernel should be dispatched.  May be 0, in which case the
+ * default stream is used with associated synchronization rules.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipLaunchKernel(const void* function_address, dim3 numBlocks, dim3 dimBlocks,
+                           void** args, size_t sharedMemBytes __dparm(0),
+                           hipStream_t stream __dparm(0));
+
+/**
+ * @brief Enqueues a host function call in a stream.
+ *
+ * @param [in] stream - The stream to enqueue work in.
+ * @param [in] fn - The function to call once enqueued preceding operations are complete.
+ * @param [in] userData - User-specified data to be passed to the function.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidResourceHandle, #hipErrorInvalidValue,
+ * #hipErrorNotSupported
+ *
+ * The host function to call in this API will be executed after the preceding operations in
+ * the stream are complete. The function is a blocking operation that blocks operations in the
+ * stream that follow it, until the function is returned.
+ * Event synchronization and internal callback functions make sure enqueued operations will
+ * execute in order, in the stream.
+ *
+ * The host function must not make any HIP API calls. The host function is non-reentrant. It must
+ * not perform synchronization with any operation that may depend on other processing execution
+ * but is not enqueued to run earlier in the stream.
+ *
+ * Host functions that are enqueued respectively in different non-blocking streams can run
+ * concurrently.
+ *
+ * @warning  This API is marked as beta, meaning, while this is feature complete,
+ * it is still open to changes and may have outstanding issues.
+ */
+hipError_t hipLaunchHostFunc(hipStream_t stream, hipHostFn_t fn, void* userData);
+
+/**
+ * @brief Copies memory for 2D arrays.
+ *
+ * @param [in] pCopy      - Parameters for the memory copy
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipDrvMemcpy2DUnaligned(const hip_Memcpy2D* pCopy);
+// TODO: Move this to hip_ext.h
+/**
+ * @brief Launches kernel from the pointer address, with arguments and shared memory on stream.
+ *
+ * @param [in] function_address - Pointer to the Kernel to launch.
+ * @param [in] numBlocks -  Number of blocks.
+ * @param [in] dimBlocks - Dimension of a block.
+ * @param [in] args - Pointer of arguments passed to the kernel. If the kernel has multiple
+ * parameters, 'args' should be array of pointers, each points the corresponding argument.
+ * @param [in] sharedMemBytes - Amount of dynamic shared memory to allocate for this kernel.
+ * HIP-Clang compiler provides support for extern shared declarations.
+ * @param [in] stream - Stream where the kernel should be dispatched.
+ * May be 0, in which case the default stream is used with associated synchronization rules.
+ * @param [in] startEvent - If non-null, specified event will be updated to track the start time of
+ * the kernel launch. The event must be created before calling this API.
+ * @param [in] stopEvent - If non-null, specified event will be updated to track the stop time of
+ * the kernel launch. The event must be created before calling this API.
+ * @param [in] flags - The value of hipExtAnyOrderLaunch, signifies if kernel can be
+ * launched in any order.
+ * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue.
+ *
+ */
+hipError_t hipExtLaunchKernel(const void* function_address, dim3 numBlocks, dim3 dimBlocks,
+                              void** args, size_t sharedMemBytes, hipStream_t stream,
+                              hipEvent_t startEvent, hipEvent_t stopEvent, int flags);
+// doxygen end Clang launch
+/**
+ * @}
+ */
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup Texture Texture Management
+ *  @{
+ *  This section describes the texture management functions of HIP runtime API.
+ */
+
+/**
+ * @brief Creates a texture object.
+ *
+ * @param [out] pTexObject  pointer to the texture object to create
+ * @param [in] pResDesc  pointer to resource descriptor
+ * @param [in] pTexDesc  pointer to texture descriptor
+ * @param [in] pResViewDesc  pointer to resource view descriptor
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported, #hipErrorOutOfMemory
+ *
+ * @note 3D linear filter isn't supported on GFX90A boards, on which the API @p
+ * hipCreateTextureObject will return hipErrorNotSupported.
+ *
+ */
+hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject, const hipResourceDesc* pResDesc,
+                                  const hipTextureDesc* pTexDesc,
+                                  const struct hipResourceViewDesc* pResViewDesc);
+
+/**
+ * @brief Destroys a texture object.
+ *
+ * @param [in] textureObject  texture object to destroy
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipDestroyTextureObject(hipTextureObject_t textureObject);
+
+/**
+ * @brief Gets the channel descriptor in an array.
+ *
+ * @param [out] desc  pointer to channel format descriptor
+ * @param [in] array  memory array on the device
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_const_t array);
+
+/**
+ * @brief Gets resource descriptor for the texture object.
+ *
+ * @param [out] pResDesc  pointer to resource descriptor
+ * @param [in] textureObject  texture object
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDesc,
+                                           hipTextureObject_t textureObject);
+
+/**
+ * @brief Gets resource view descriptor for the texture object.
+ *
+ * @param [out] pResViewDesc  pointer to resource view descriptor
+ * @param [in] textureObject  texture object
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGetTextureObjectResourceViewDesc(struct hipResourceViewDesc* pResViewDesc,
+                                               hipTextureObject_t textureObject);
+
+/**
+ * @brief Gets texture descriptor for the texture object.
+ *
+ * @param [out] pTexDesc  pointer to texture descriptor
+ * @param [in] textureObject  texture object
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGetTextureObjectTextureDesc(hipTextureDesc* pTexDesc,
+                                          hipTextureObject_t textureObject);
+
+/**
+ * @brief Creates a texture object.
+ *
+ * @param [out] pTexObject  pointer to texture object to create
+ * @param [in] pResDesc  pointer to resource descriptor
+ * @param [in] pTexDesc  pointer to texture descriptor
+ * @param [in] pResViewDesc  pointer to resource view descriptor
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipTexObjectCreate(hipTextureObject_t* pTexObject, const HIP_RESOURCE_DESC* pResDesc,
+                              const HIP_TEXTURE_DESC* pTexDesc,
+                              const HIP_RESOURCE_VIEW_DESC* pResViewDesc);
+
+/**
+ * @brief Destroys a texture object.
+ *
+ * @param [in] texObject  texture object to destroy
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipTexObjectDestroy(hipTextureObject_t texObject);
+
+/**
+ * @brief Gets resource descriptor of a texture object.
+ *
+ * @param [out] pResDesc  pointer to resource descriptor
+ * @param [in] texObject  texture object
+ *
+ * @returns #hipSuccess, #hipErrorNotSupported, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipTexObjectGetResourceDesc(HIP_RESOURCE_DESC* pResDesc, hipTextureObject_t texObject);
+
+/**
+ * @brief Gets resource view descriptor of a texture object.
+ *
+ * @param [out] pResViewDesc  pointer to resource view descriptor
+ * @param [in] texObject  texture object
+ *
+ * @returns #hipSuccess, #hipErrorNotSupported, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipTexObjectGetResourceViewDesc(HIP_RESOURCE_VIEW_DESC* pResViewDesc,
+                                           hipTextureObject_t texObject);
+
+/**
+ * @brief Gets texture descriptor of a texture object.
+ *
+ * @param [out] pTexDesc  pointer to texture descriptor
+ * @param [in] texObject  texture object
+ *
+ * @returns #hipSuccess, #hipErrorNotSupported, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipTexObjectGetTextureDesc(HIP_TEXTURE_DESC* pTexDesc, hipTextureObject_t texObject);
+
+/**
+ * @brief Allocate a mipmapped array on the device.
+ *
+ * @param[out] mipmappedArray  - Pointer to allocated mipmapped array in device memory
+ * @param[in]  desc            - Requested channel format
+ * @param[in]  extent          - Requested allocation size (width field in elements)
+ * @param[in]  numLevels       - Number of mipmap levels to allocate
+ * @param[in]  flags           - Flags for extensions
+ *
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryAllocation
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ *
+ */
+hipError_t hipMallocMipmappedArray(hipMipmappedArray_t* mipmappedArray,
+                                   const struct hipChannelFormatDesc* desc, struct hipExtent extent,
+                                   unsigned int numLevels, unsigned int flags __dparm(0));
+
+/**
+ * @brief Frees a mipmapped array on the device.
+ *
+ * @param[in] mipmappedArray - Pointer to mipmapped array to free
+ *
+ * @return #hipSuccess, #hipErrorInvalidValue
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ *
+ */
+hipError_t hipFreeMipmappedArray(hipMipmappedArray_t mipmappedArray);
+
+/**
+ * @brief Gets a mipmap level of a HIP mipmapped array.
+ *
+ * @param[out] levelArray     - Returned mipmap level HIP array
+ * @param[in]  mipmappedArray - HIP mipmapped array
+ * @param[in]  level          - Mipmap level
+ *
+ * @return #hipSuccess, #hipErrorInvalidValue
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ *
+ */
+hipError_t hipGetMipmappedArrayLevel(hipArray_t* levelArray,
+                                     hipMipmappedArray_const_t mipmappedArray, unsigned int level);
+
+/**
+ * @brief Create a mipmapped array.
+ *
+ * @param [out] pHandle  pointer to mipmapped array
+ * @param [in] pMipmappedArrayDesc  mipmapped array descriptor
+ * @param [in] numMipmapLevels  mipmap level
+ *
+ * @returns #hipSuccess, #hipErrorNotSupported, #hipErrorInvalidValue
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMipmappedArrayCreate(hipMipmappedArray_t* pHandle,
+                                   HIP_ARRAY3D_DESCRIPTOR* pMipmappedArrayDesc,
+                                   unsigned int numMipmapLevels);
+
+/**
+ * @brief Destroy a mipmapped array.
+ *
+ * @param [in] hMipmappedArray  mipmapped array to destroy
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ *
+ */
+hipError_t hipMipmappedArrayDestroy(hipMipmappedArray_t hMipmappedArray);
+
+/**
+ * @brief Gets the array for a given mipmap level of a mipmapped array.
+ *
+ * @param [out] pLevelArray Pointer to the returned array for the requested mipmap level
+ * @param [in] hMipMappedArray Mipmapped array to query
+ * @param [in] level  Mipmap level
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ *
+ */
+hipError_t hipMipmappedArrayGetLevel(hipArray_t* pLevelArray, hipMipmappedArray_t hMipMappedArray,
+                                     unsigned int level);
+
+/**
+ *
+ *  @addtogroup TextureD Texture Management [Deprecated]
+ *  @{
+ *  @ingroup Texture
+ *  This section describes the deprecated texture management functions of HIP runtime API.
+ */
+
+/**
+ * @brief  Binds a mipmapped array to a texture [Deprecated]
+ *
+ * @param [in] tex  pointer to the texture reference to bind
+ * @param [in] mipmappedArray memory mipmapped array on the device
+ * @param [in] desc  pointer to the channel format
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipBindTextureToMipmappedArray(const textureReference* tex,
+                                          hipMipmappedArray_const_t mipmappedArray,
+                                          const hipChannelFormatDesc* desc);
+
+/**
+ * @brief Gets the texture reference related with the symbol [Deprecated]
+ *
+ * @param [out] texref  texture reference
+ * @param [in] symbol  pointer to the symbol related with the texture for the reference
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipGetTextureReference(const textureReference** texref, const void* symbol);
+
+/**
+ * @brief Gets the border color used by a texture reference [Deprecated]
+ *
+ * @param [out] pBorderColor  Returned Type and Value of RGBA color.
+ * @param [in] texRef  Texture reference.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefGetBorderColor(float* pBorderColor, const textureReference* texRef);
+
+/**
+ * @brief Gets the array bound to a texture reference [Deprecated]
+ *
+ * @param [out] pArray  Returned array.
+ * @param [in] texRef  texture reference.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefGetArray(hipArray_t* pArray, const textureReference* texRef);
+
+/**
+ * @brief Sets address mode for a texture reference [Deprecated]
+ *
+ * @param [in] texRef  texture reference.
+ * @param [in] dim  Dimension of the texture.
+ * @param [in] am  Value of the texture address mode.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefSetAddressMode(textureReference* texRef, int dim,
+                                   enum hipTextureAddressMode am);
+/**
+ * @brief Binds an array as a texture reference [Deprecated]
+ *
+ * @param [in] tex  Pointer texture reference.
+ * @param [in] array  Array to bind.
+ * @param [in] flags  Flags should be set as HIP_TRSA_OVERRIDE_FORMAT, as a valid value.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefSetArray(textureReference* tex, hipArray_const_t array, unsigned int flags);
+/**
+ * @brief Set filter mode for a texture reference [Deprecated]
+ *
+ * @param [in] texRef  Pointer texture reference.
+ * @param [in] fm  Value of texture filter mode.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefSetFilterMode(textureReference* texRef, enum hipTextureFilterMode fm);
+/**
+ * @brief Set flags for a texture reference [Deprecated]
+ *
+ * @param [in] texRef  Pointer texture reference.
+ * @param [in] Flags  Value of flags.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefSetFlags(textureReference* texRef, unsigned int Flags);
+/**
+ * @brief Set format for a texture reference [Deprecated]
+ *
+ * @param [in] texRef  Pointer texture reference.
+ * @param [in] fmt  Value of format.
+ * @param [in] NumPackedComponents  Number of components per array.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefSetFormat(textureReference* texRef, hipArray_Format fmt,
+                              int NumPackedComponents);
+/**
+ * @brief Binds a memory area to a texture [Deprecated]
+ *
+ * @param [out] offset  Offset in bytes.
+ * @param [in] tex  Texture to bind.
+ * @param [in] devPtr  Pointer of memory on the device.
+ * @param [in] desc  Pointer of channel format descriptor.
+ * @param [in] size  Size of memory in bytes.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipBindTexture(size_t* offset, const textureReference* tex, const void* devPtr,
+                          const hipChannelFormatDesc* desc, size_t size __dparm(UINT_MAX));
+/**
+ * @brief Binds a 2D memory area to a texture [Deprecated]
+ *
+ * @param [out] offset  Offset in bytes.
+ * @param [in] tex  Texture to bind.
+ * @param [in] devPtr  Pointer of 2D memory area on the device.
+ * @param [in] desc  Pointer of channel format descriptor.
+ * @param [in] width  Width in texel units.
+ * @param [in] height  Height in texel units.
+ * @param [in] pitch  Pitch in bytes.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipBindTexture2D(size_t* offset, const textureReference* tex, const void* devPtr,
+                            const hipChannelFormatDesc* desc, size_t width, size_t height,
+                            size_t pitch);
+/**
+ * @brief Binds an array to a texture [Deprecated]
+ *
+ * @param [in] tex  Pointer of texture reference.
+ * @param [in] array  Array to bind.
+ * @param [in] desc  Pointer of channel format descriptor.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipBindTextureToArray(const textureReference* tex, hipArray_const_t array,
+                                 const hipChannelFormatDesc* desc);
+/**
+ * @brief Get the offset of the alignment in a texture [Deprecated]
+ *
+ * @param [out] offset  Offset in bytes.
+ * @param [in] texref  Pointer of texture reference.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipGetTextureAlignmentOffset(size_t* offset, const textureReference* texref);
+/**
+ * @brief Unbinds a texture [Deprecated]
+ *
+ * @param [in] tex  Texture to unbind.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipUnbindTexture(const textureReference* tex);
+/**
+ * @brief Gets the address for a texture reference [Deprecated]
+ *
+ * @param [out] dev_ptr  Pointer of device address.
+ * @param [in] texRef  Pointer of texture reference.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefGetAddress(hipDeviceptr_t* dev_ptr, const textureReference* texRef);
+/**
+ * @brief Gets the address mode for a texture reference [Deprecated]
+ *
+ * @param [out] pam  Pointer of address mode.
+ * @param [in] texRef  Pointer of texture reference.
+ * @param [in] dim  Dimension.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefGetAddressMode(enum hipTextureAddressMode* pam, const textureReference* texRef,
+                                   int dim);
+/**
+ * @brief Gets filter mode for a texture reference [Deprecated]
+ *
+ * @param [out] pfm  Pointer of filter mode.
+ * @param [in] texRef  Pointer of texture reference.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefGetFilterMode(enum hipTextureFilterMode* pfm, const textureReference* texRef);
+/**
+ * @brief Gets flags for a texture reference [Deprecated]
+ *
+ * @param [out] pFlags  Pointer of flags.
+ * @param [in] texRef  Pointer of texture reference.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefGetFlags(unsigned int* pFlags, const textureReference* texRef);
+/**
+ * @brief Gets texture format for a texture reference [Deprecated]
+ *
+ * @param [out] pFormat  Pointer of the format.
+ * @param [out] pNumChannels  Pointer of number of channels.
+ * @param [in] texRef  Pointer of texture reference.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefGetFormat(hipArray_Format* pFormat, int* pNumChannels,
+                              const textureReference* texRef);
+/**
+ * @brief Gets the maximum anisotropy for a texture reference [Deprecated]
+ *
+ * @param [out] pmaxAnsio  Pointer of the maximum anisotropy.
+ * @param [in] texRef  Pointer of texture reference.
+ *
+ * @returns #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefGetMaxAnisotropy(int* pmaxAnsio, const textureReference* texRef);
+/**
+ * @brief Gets the mipmap filter mode for a texture reference [Deprecated]
+ *
+ * @param [out] pfm  Pointer of the mipmap filter mode.
+ * @param [in] texRef  Pointer of texture reference.
+ *
+ * @returns #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefGetMipmapFilterMode(enum hipTextureFilterMode* pfm,
+                                        const textureReference* texRef);
+/**
+ * @brief Gets the mipmap level bias for a texture reference [Deprecated]
+ *
+ * @param [out] pbias  Pointer of the mipmap level bias.
+ * @param [in] texRef  Pointer of texture reference.
+ *
+ * @returns #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefGetMipmapLevelBias(float* pbias, const textureReference* texRef);
+/**
+ * @brief Gets the minimum and maximum mipmap level clamps for a texture reference [Deprecated]
+ *
+ * @param [out] pminMipmapLevelClamp  Pointer of the minimum mipmap level clamp.
+ * @param [out] pmaxMipmapLevelClamp  Pointer of the maximum mipmap level clamp.
+ * @param [in] texRef  Pointer of texture reference.
+ *
+ * @returns #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefGetMipmapLevelClamp(float* pminMipmapLevelClamp, float* pmaxMipmapLevelClamp,
+                                        const textureReference* texRef);
+/**
+ * @brief Gets the mipmapped array bound to a texture reference [Deprecated]
+ *
+ * @param [out] pArray  Pointer of the mipmapped array.
+ * @param [in] texRef  Pointer of texture reference.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefGetMipMappedArray(hipMipmappedArray_t* pArray, const textureReference* texRef);
+/**
+ * @brief Sets a bound address for a texture reference [Deprecated]
+ *
+ * @param [out] ByteOffset  Pointer of the offset in bytes.
+ * @param [in] texRef  Pointer of texture reference.
+ * @param [in] dptr  Pointer of device address to bind.
+ * @param [in] bytes  Size in bytes.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefSetAddress(size_t* ByteOffset, textureReference* texRef, hipDeviceptr_t dptr,
+                               size_t bytes);
+/**
+ * @brief Binds an address as a 2D texture reference [Deprecated]
+ *
+ * @param [in] texRef  Pointer of texture reference.
+ * @param [in] desc  Pointer of array descriptor.
+ * @param [in] dptr  Pointer of device address to bind.
+ * @param [in] Pitch  Pitch in bytes.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefSetAddress2D(textureReference* texRef, const HIP_ARRAY_DESCRIPTOR* desc,
+                                 hipDeviceptr_t dptr, size_t Pitch);
+/**
+ * @brief Sets the maximum anisotropy for a texture reference [Deprecated]
+ *
+ * @param [in] texRef  Pointer of texture reference.
+ * @param [in] maxAniso  Value of the maximum anisotropy.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefSetMaxAnisotropy(textureReference* texRef, unsigned int maxAniso);
+/**
+ * @brief Sets border color for a texture reference [Deprecated]
+ *
+ * @param [in] texRef  Pointer of texture reference.
+ * @param [in] pBorderColor  Pointer of border color.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefSetBorderColor(textureReference* texRef, float* pBorderColor);
+/**
+ * @brief Sets mipmap filter mode for a texture reference [Deprecated]
+ *
+ * @param [in] texRef  Pointer of texture reference.
+ * @param [in] fm  Value of filter mode.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefSetMipmapFilterMode(textureReference* texRef, enum hipTextureFilterMode fm);
+/**
+ * @brief Sets mipmap level bias for a texture reference [Deprecated]
+ *
+ * @param [in] texRef  Pointer of texture reference.
+ * @param [in] bias  Value of mipmap bias.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefSetMipmapLevelBias(textureReference* texRef, float bias);
+/**
+ * @brief Sets mipmap level clamp for a texture reference [Deprecated]
+ *
+ * @param [in] texRef  Pointer of texture reference.
+ * @param [in] minMipMapLevelClamp  Value of minimum mipmap level clamp.
+ * @param [in] maxMipMapLevelClamp  Value of maximum mipmap level clamp.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefSetMipmapLevelClamp(textureReference* texRef, float minMipMapLevelClamp,
+                                        float maxMipMapLevelClamp);
+/**
+ * @brief Binds mipmapped array to a texture reference [Deprecated]
+ *
+ * @param [in] texRef  Pointer of texture reference to bind.
+ * @param [in] mipmappedArray  Pointer of mipmapped array to bind.
+ * @param [in] Flags  Flags should be set as HIP_TRSA_OVERRIDE_FORMAT, as a valid value.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @warning This API is deprecated.
+ *
+ */
+HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+hipError_t hipTexRefSetMipmappedArray(textureReference* texRef,
+                                      struct hipMipmappedArray* mipmappedArray, unsigned int Flags);
+
+// doxygen end deprecated texture management
+/**
+ * @}
+ */
+
+// doxygen end Texture management
+/**
+ * @}
+ */
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup Runtime Runtime Compilation
+ *  @{
+ *  This section describes the runtime compilation functions of HIP runtime API.
+ *
+ */
+// This group is for HIPrtc
+
+// doxygen end Runtime
+/**
+ * @}
+ */
+
+/**
+ *
+ *  @defgroup Callback Callback Activity APIs
+ *  @{
+ *  This section describes the callback/Activity of HIP runtime API.
+ */
+/**
+ * @brief Returns HIP API name by ID.
+ *
+ * @param [in] id ID of HIP API
+ *
+ * @returns Pointer to the name string of the HIP API identified by id.
+ *
+ */
+const char* hipApiName(uint32_t id);
+/**
+ * @brief Returns kernel name reference for a function.
+ *
+ * @param [in] f Function (hipFunction_t) to get the kernel name of.
+ *
+ * @returns Pointer to the kernel name string.
+ *
+ */
+const char* hipKernelNameRef(const hipFunction_t f);
+/**
+ * @brief Retrieves kernel name reference for a given host pointer, unless stated otherwise.
+ *
+ * @param [in] hostFunction Pointer of host function.
+ * @param [in] stream Stream the kernel is executed on.
+ *
+ * @returns Pointer to the kernel name string.
+ *
+ */
+const char* hipKernelNameRefByPtr(const void* hostFunction, hipStream_t stream);
+/**
+ * @brief Returns device ID on the stream.
+ *
+ * @param [in] stream Stream of device executed on.
+ *
+ * @returns Device ID of the given stream.
+ *
+ */
+int hipGetStreamDeviceId(hipStream_t stream);
+
+// doxygen end Callback
+/**
+ * @}
+ */
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup Graph Graph Management
+ *  @{
+ *  This section describes the graph management types & functions of HIP runtime API.
+ */
+
+/**
+ * @brief Begins graph capture on a stream.
+ *
+ * @param [in] stream - Stream to initiate capture.
+ * @param [in] mode - Controls the interaction of this capture sequence with other API calls that
+ * are not safe.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipStreamBeginCapture(hipStream_t stream, hipStreamCaptureMode mode);
+
+/**
+ * @brief Begins graph capture on a stream to an existing graph.
+ *
+ * @param [in] stream - Stream to initiate capture.
+ * @param [in] graph - Graph to capture into.
+ * @param [in] dependencies - Dependencies of the first node captured in the stream. Can be NULL if
+ * numDependencies is 0.
+ * @param [in] dependencyData - Optional array of data associated with each dependency.
+ * @param [in] numDependencies - Number of dependencies.
+ * @param [in] mode - Controls the interaction of this capture sequence with other API calls that
+ * are not safe.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @warning param "const hipGraphEdgeData* dependencyData" is currently not supported and has to be
+ * passed as nullptr. This API is marked as beta, meaning, while this is feature complete, it is
+ * still open to changes and may have outstanding issues.
+ *
+ */
+hipError_t hipStreamBeginCaptureToGraph(hipStream_t stream, hipGraph_t graph,
+                                        const hipGraphNode_t* dependencies,
+                                        const hipGraphEdgeData* dependencyData,
+                                        size_t numDependencies, hipStreamCaptureMode mode);
+
+/**
+ * @brief Ends capture on a stream, returning the captured graph.
+ *
+ * @param [in] stream - Stream to end capture.
+ * @param [out] pGraph - Captured graph.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipStreamEndCapture(hipStream_t stream, hipGraph_t* pGraph);
+
+/**
+ * @brief Get capture status of a stream.
+ *
+ * @param [in] stream - Stream of which to get capture status from.
+ * @param [out] pCaptureStatus - Returns current capture status.
+ * @param [out] pId - Unique capture ID.
+ *
+ * @returns #hipSuccess, #hipErrorStreamCaptureImplicit
+ *
+ */
+hipError_t hipStreamGetCaptureInfo(hipStream_t stream, hipStreamCaptureStatus* pCaptureStatus,
+                                   unsigned long long* pId);
+
+/**
+ * @brief Get stream's capture state
+ *
+ * @param [in] stream - Stream of which to get capture status from.
+ * @param [out] captureStatus_out - Returns current capture status.
+ * @param [out] id_out - Unique capture ID.
+ * @param [out] graph_out - Returns the graph being captured into.
+ * @param [out] dependencies_out - Pointer to an array of nodes representing the graphs
+ * dependencies.
+ * @param [out] numDependencies_out - Returns size of the array returned in dependencies_out.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorStreamCaptureImplicit
+ *
+ */
+hipError_t hipStreamGetCaptureInfo_v2(hipStream_t stream, hipStreamCaptureStatus* captureStatus_out,
+                                      unsigned long long* id_out __dparm(0),
+                                      hipGraph_t* graph_out __dparm(0),
+                                      const hipGraphNode_t** dependencies_out __dparm(0),
+                                      size_t* numDependencies_out __dparm(0));
+
+/**
+ * @brief Get stream's capture state
+ *
+ * @param [in] stream - Stream of which to get capture status from.
+ * @param [out] pCaptureStatus - Returns current capture status.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorStreamCaptureImplicit
+ *
+ */
+hipError_t hipStreamIsCapturing(hipStream_t stream, hipStreamCaptureStatus* pCaptureStatus);
+
+/**
+ * @brief Update the set of dependencies in a capturing stream
+ *
+ * @param [in] stream  Stream that is being captured.
+ * @param [in] dependencies  Pointer to an array of nodes to add/replace.
+ * @param [in] numDependencies  Size of the dependencies array.
+ * @param [in] flags  Flag to update dependency set. Should be one of the values
+ * in enum #hipStreamUpdateCaptureDependenciesFlags.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorIllegalState
+ *
+ */
+hipError_t hipStreamUpdateCaptureDependencies(hipStream_t stream, hipGraphNode_t* dependencies,
+                                              size_t numDependencies,
+                                              unsigned int flags __dparm(0));
+
+/**
+ * @brief Swaps the stream capture mode of a thread.
+ *
+ * @param [in,out] mode - Pointer to mode value to swap with the current mode.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipThreadExchangeStreamCaptureMode(hipStreamCaptureMode* mode);
+
+/**
+ * @brief Creates a graph
+ *
+ * @param [out] pGraph - pointer to graph to create.
+ * @param [in] flags - flags for graph creation, must be 0.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryAllocation
+ *
+ */
+hipError_t hipGraphCreate(hipGraph_t* pGraph, unsigned int flags);
+
+/**
+ * @brief Destroys a graph
+ *
+ * @param [in] graph - instance of graph to destroy.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphDestroy(hipGraph_t graph);
+
+/**
+ * @brief Adds dependency edges to a graph.
+ *
+ * @param [in] graph - Instance of the graph to add dependencies to.
+ * @param [in] from - Pointer to the graph nodes with dependencies to add from.
+ * @param [in] to - Pointer to the graph nodes to add dependencies to.
+ * @param [in] numDependencies - Number of dependencies to add.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphAddDependencies(hipGraph_t graph, const hipGraphNode_t* from,
+                                   const hipGraphNode_t* to, size_t numDependencies);
+
+/**
+ * @brief Removes dependency edges from a graph.
+ *
+ * @param [in] graph - Instance of the graph to remove dependencies from.
+ * @param [in] from - Array of nodes that provide the dependencies.
+ * @param [in] to - Array of dependent nodes.
+ * @param [in] numDependencies - Number of dependencies to remove.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphRemoveDependencies(hipGraph_t graph, const hipGraphNode_t* from,
+                                      const hipGraphNode_t* to, size_t numDependencies);
+
+/**
+ * @brief Returns a graph's dependency edges.
+ *
+ * @param [in] graph - Instance of the graph to get the edges from.
+ * @param [out] from - Pointer to the graph nodes to return edge endpoints.
+ * @param [out] to - Pointer to the graph nodes to return edge endpoints.
+ * @param [in,out] numEdges - Number of entries to fill in when from and to are not NULL; returns
+ * number of edges.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * from and to may both be NULL, in which case this function only returns the number of edges in
+ * numEdges. Otherwise, numEdges entries will be filled in. If numEdges is higher than the actual
+ * number of edges, the remaining entries in from and to will be set to NULL, and the number of
+ * edges actually returned will be written to numEdges.
+ *
+ */
+hipError_t hipGraphGetEdges(hipGraph_t graph, hipGraphNode_t* from, hipGraphNode_t* to,
+                            size_t* numEdges);
+
+/**
+ * @brief Returns a graph's nodes.
+ *
+ * @param [in] graph - Instance of graph to get the nodes from.
+ * @param [out] nodes - Pointer to return the graph nodes.
+ * @param [in,out] numNodes - Number of entries to fill in when nodes is not NULL; returns the
+ * number of graph nodes.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * nodes may be NULL, in which case this function will return the number of nodes in numNodes.
+ * Otherwise, numNodes entries will be filled in. If numNodes is higher than the actual number of
+ * nodes, the remaining entries in nodes will be set to NULL, and the number of nodes actually
+ * obtained will be returned in numNodes.
+ *
+ */
+hipError_t hipGraphGetNodes(hipGraph_t graph, hipGraphNode_t* nodes, size_t* numNodes);
+
+/**
+ * @brief Returns a graph's root nodes.
+ *
+ * @param [in] graph - Instance of the graph to get the nodes from.
+ * @param [out] pRootNodes - Pointer to return the graph's root nodes.
+ * @param [in,out] pNumRootNodes - Number of entries to fill in when pRootNodes is not NULL;
+ * returns the number of graph's root nodes.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * pRootNodes may be NULL, in which case this function will return the number of root nodes in
+ * pNumRootNodes. Otherwise, pNumRootNodes entries will be filled in. If pNumRootNodes is higher
+ * than the actual number of root nodes, the remaining entries in pRootNodes will be set to NULL,
+ * and the number of nodes actually obtained will be returned in pNumRootNodes.
+ *
+ */
+hipError_t hipGraphGetRootNodes(hipGraph_t graph, hipGraphNode_t* pRootNodes,
+                                size_t* pNumRootNodes);
+
+/**
+ * @brief Returns a node's dependencies.
+ *
+ * @param [in] node - Graph node to get the dependencies from.
+ * @param [out] pDependencies - Pointer to return the dependencies.
+ * @param [in,out] pNumDependencies - Number of entries to fill in when pDependencies is not NULL;
+ * returns the number of graph node dependencies.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * pDependencies may be NULL, in which case this function will return the number of dependencies in
+ * pNumDependencies. Otherwise, pNumDependencies entries will be filled in. If pNumDependencies is
+ * higher than the actual number of dependencies, the remaining entries in pDependencies will be set
+ * to NULL, and the number of nodes actually obtained will be returned in pNumDependencies.
+ *
+ */
+hipError_t hipGraphNodeGetDependencies(hipGraphNode_t node, hipGraphNode_t* pDependencies,
+                                       size_t* pNumDependencies);
+
+/**
+ * @brief Returns a node's dependent nodes.
+ *
+ * @param [in] node - Graph node to get the dependent nodes from.
+ * @param [out] pDependentNodes - Pointer to return the graph dependent nodes.
+ * @param [in,out] pNumDependentNodes - Number of entries to fill in when pDependentNodes is not
+ * NULL; returns the number of graph node dependent nodes.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * pDependentNodes may be NULL, in which case this function will return the number of dependent
+ * nodes in pNumDependentNodes. Otherwise, pNumDependentNodes entries will be filled in. If
+ * pNumDependentNodes is higher than the actual number of dependent nodes, the remaining entries in
+ * pDependentNodes will be set to NULL, and the number of nodes actually obtained will be returned
+ * in pNumDependentNodes.
+ *
+ */
+hipError_t hipGraphNodeGetDependentNodes(hipGraphNode_t node, hipGraphNode_t* pDependentNodes,
+                                         size_t* pNumDependentNodes);
+
+/**
+ * @brief Returns a node's type.
+ *
+ * @param [in] node - Node to get type of.
+ * @param [out] pType - Returns the node's type.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphNodeGetType(hipGraphNode_t node, hipGraphNodeType* pType);
+
+/**
+ * @brief Remove a node from the graph.
+ *
+ * @param [in] node - graph node to remove
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphDestroyNode(hipGraphNode_t node);
+
+/**
+ * @brief Clones a graph.
+ *
+ * @param [out] pGraphClone - Returns newly created cloned graph.
+ * @param [in] originalGraph - original graph to clone from.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryAllocation
+ *
+ */
+hipError_t hipGraphClone(hipGraph_t* pGraphClone, hipGraph_t originalGraph);
+
+/**
+ * @brief Finds a cloned version of a node.
+ *
+ * @param [out] pNode - Returns the cloned node.
+ * @param [in] originalNode - original node handle.
+ * @param [in] clonedGraph - Cloned graph to query.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphNodeFindInClone(hipGraphNode_t* pNode, hipGraphNode_t originalNode,
+                                   hipGraph_t clonedGraph);
+
+/**
+ * @brief Creates an executable graph from a graph
+ *
+ * @param [out] pGraphExec - Pointer to instantiated executable graph.
+ * @param [in] graph - Instance of graph to instantiate.
+ * @param [out] pErrorNode - Pointer to error node. In case an error occurred during
+ * graph instantiation, it could modify the corresponding node.
+ * @param [out] pLogBuffer - Pointer to log buffer.
+ * @param [in] bufferSize - Size of the log buffer.
+ *
+ * @returns #hipSuccess, #hipErrorOutOfMemory
+ *
+ */
+hipError_t hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph,
+                               hipGraphNode_t* pErrorNode, char* pLogBuffer, size_t bufferSize);
+
+/**
+ * @brief Creates an executable graph from a graph.
+ *
+ * @param [out] pGraphExec - Pointer to instantiated executable graph.
+ * @param [in] graph - Instance of graph to instantiate.
+ * @param [in] flags - Flags to control instantiation.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @warning This API does not support any of the flags and behaves as hipGraphInstantiate.
+ */
+hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t graph,
+                                        unsigned long long flags);
+
+/**
+ * @brief Creates an executable graph from a graph.
+ *
+ * @param [out] pGraphExec - Pointer to instantiated executable graph.
+ * @param [in] graph - Instance of graph to instantiate.
+ * @param [in] instantiateParams - Graph instantiation Params
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphInstantiateWithParams(hipGraphExec_t* pGraphExec, hipGraph_t graph,
+                                         hipGraphInstantiateParams* instantiateParams);
+/**
+ * @brief Launches an executable graph in the specified stream.
+ *
+ * @param [in] graphExec - Instance of executable graph to launch.
+ * @param [in] stream - Instance of stream in which to launch executable graph.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphLaunch(hipGraphExec_t graphExec, hipStream_t stream);
+
+/**
+ * @brief Uploads an executable graph to a stream
+ *
+ * @param [in] graphExec - Instance of executable graph to be uploaded.
+ * @param [in] stream - Instance of stream to which the executable graph is uploaded to.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphUpload(hipGraphExec_t graphExec, hipStream_t stream);
+
+/**
+ * @brief Creates a graph node and adds it to a graph.
+ *
+ * @param [out] pGraphNode - Pointer to graph node that is created.
+ * @param [in] graph - Instance of graph to add the created node to.
+ * @param [in] pDependencies - Pointer to the dependencies of the node.
+ * @param [in] numDependencies - Number of dependencies.
+ * @param [in] nodeParams - Pointer to the node parameters.
+ * @returns #hipSuccess, #hipErrorInvalidValue.
+ *
+ */
+hipError_t hipGraphAddNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                           const hipGraphNode_t* pDependencies, size_t numDependencies,
+                           hipGraphNodeParams* nodeParams);
+
+/**
+ * @brief Return the flags of an executable graph.
+ *
+ * @param [in] graphExec - Executable graph to get the flags from.
+ * @param [out] flags - Flags used to instantiate this executable graph.
+ * @returns #hipSuccess, #hipErrorInvalidValue.
+ *
+ */
+hipError_t hipGraphExecGetFlags(hipGraphExec_t graphExec, unsigned long long* flags);
+
+/**
+ * @brief Updates parameters of a graph's node.
+ *
+ * @param [in] node - Instance of the node to set parameters for.
+ * @param [in] nodeParams - Pointer to the parameters to be set.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDeviceFunction,
+ * #hipErrorNotSupported.
+ *
+ */
+hipError_t hipGraphNodeSetParams(hipGraphNode_t node, hipGraphNodeParams* nodeParams);
+
+/**
+ * @brief Updates parameters of an executable graph's node.
+ *
+ * @param [in] graphExec - Instance of the executable graph.
+ * @param [in] node - Instance of the node to set parameters to.
+ * @param [in] nodeParams - Pointer to the parameters to be set.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDeviceFunction,
+ * #hipErrorNotSupported.
+ *
+ */
+hipError_t hipGraphExecNodeSetParams(hipGraphExec_t graphExec, hipGraphNode_t node,
+                                     hipGraphNodeParams* nodeParams);
+
+/**
+ * @brief Destroys an executable graph
+ *
+ * @param [in] graphExec - Instance of executable graph to destroy.
+ *
+ * @returns #hipSuccess.
+ *
+ */
+hipError_t hipGraphExecDestroy(hipGraphExec_t graphExec);
+
+// Check whether an executable graph can be updated with a graph and perform the update if possible.
+/**
+ * @brief Check whether an executable graph can be updated with a graph and perform the update if
+ * possible.
+ *
+ * @param [in] hGraphExec - instance of executable graph to update.
+ * @param [in] hGraph - graph that contains the updated parameters.
+ * @param [out] hErrorNode_out - node which caused the permissibility check to forbid the update.
+ * @param [out] updateResult_out - Return code whether the graph update was performed.
+ * @returns #hipSuccess, #hipErrorGraphExecUpdateFailure
+ *
+ */
+hipError_t hipGraphExecUpdate(hipGraphExec_t hGraphExec, hipGraph_t hGraph,
+                              hipGraphNode_t* hErrorNode_out,
+                              hipGraphExecUpdateResult* updateResult_out);
+
+/**
+ * @brief Creates a kernel execution node and adds it to a graph.
+ *
+ * @param [out] pGraphNode - Pointer to graph node that is created
+ * @param [in] graph - Instance of graph to add the created node to.
+ * @param [in] pDependencies - Pointer to the dependencies of the kernel execution node.
+ * @param [in] numDependencies - The number of the dependencies.
+ * @param [in] pNodeParams - Pointer to the parameters of the kernel execution node.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDeviceFunction
+ *
+ */
+hipError_t hipGraphAddKernelNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                 const hipGraphNode_t* pDependencies, size_t numDependencies,
+                                 const hipKernelNodeParams* pNodeParams);
+
+/**
+ * @brief Gets kernel node's parameters.
+ *
+ * @param [in] node - instance of the node to get parameters from.
+ * @param [out] pNodeParams - pointer to the parameters
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphKernelNodeGetParams(hipGraphNode_t node, hipKernelNodeParams* pNodeParams);
+
+/**
+ * @brief Sets a kernel node's parameters.
+ *
+ * @param [in] node - Instance of the node to set parameters of.
+ * @param [in] pNodeParams - const pointer to the parameters.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphKernelNodeSetParams(hipGraphNode_t node, const hipKernelNodeParams* pNodeParams);
+
+/**
+ * @brief Sets the parameters for a kernel node in the given graphExec.
+ *
+ * @param [in] hGraphExec - Instance of the executable graph with the node.
+ * @param [in] node - Instance of the node to set parameters of.
+ * @param [in] pNodeParams - const pointer to the kernel node parameters.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphExecKernelNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node,
+                                           const hipKernelNodeParams* pNodeParams);
+
+/**
+ * @brief Creates a memcpy node and adds it to a graph.
+ *
+ * @param [out] phGraphNode - Pointer to graph node that is created.
+ * @param [in] hGraph - Instance of graph to add the created node to.
+ * @param [in] dependencies - const pointer to the dependencies of the memcpy execution node.
+ * @param [in] numDependencies - The number of dependencies.
+ * @param [in] copyParams - const pointer to the parameters for the memory copy.
+ * @param [in] ctx - context related to current device.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipDrvGraphAddMemcpyNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
+                                    const hipGraphNode_t* dependencies, size_t numDependencies,
+                                    const HIP_MEMCPY3D* copyParams, hipCtx_t ctx);
+/**
+ * @brief Creates a memcpy node and adds it to a graph.
+ *
+ * @param [out] pGraphNode - Pointer to graph node that is created.
+ * @param [in] graph - Instance of graph to add the created node to.
+ * @param [in] pDependencies - const pointer to the dependencies of the memcpy execution node.
+ * @param [in] numDependencies - The number of dependencies.
+ * @param [in] pCopyParams - const pointer to the parameters for the memory copy.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphAddMemcpyNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                 const hipGraphNode_t* pDependencies, size_t numDependencies,
+                                 const hipMemcpy3DParms* pCopyParams);
+/**
+ * @brief Gets a memcpy node's parameters.
+ *
+ * @param [in] node - instance of the node to get parameters from.
+ * @param [out] pNodeParams - pointer to the parameters.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphMemcpyNodeGetParams(hipGraphNode_t node, hipMemcpy3DParms* pNodeParams);
+
+/**
+ * @brief Sets a memcpy node's parameters.
+ *
+ * @param [in] node - instance of the node to set parameters to.
+ * @param [in] pNodeParams - const pointer to the parameters.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphMemcpyNodeSetParams(hipGraphNode_t node, const hipMemcpy3DParms* pNodeParams);
+
+/**
+ * @brief Sets a node's attribute.
+ *
+ * @param [in] hNode - Instance of the node to set the attribute of.
+ * @param [in] attr - The attribute type to be set.
+ * @param [in] value - const pointer to the attribute value to set.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphKernelNodeSetAttribute(hipGraphNode_t hNode, hipKernelNodeAttrID attr,
+                                          const hipKernelNodeAttrValue* value);
+/**
+ * @brief Gets a node's attribute.
+ *
+ * @param [in] hNode - Instance of the node to get the attribute from.
+ * @param [in] attr - The attribute type to get.
+ * @param [out] value - Pointer to return the attribute value.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphKernelNodeGetAttribute(hipGraphNode_t hNode, hipKernelNodeAttrID attr,
+                                          hipKernelNodeAttrValue* value);
+/**
+ * @brief Sets the parameters of a memcpy node in the given graphExec.
+ *
+ * @param [in] hGraphExec - Instance of the executable graph with the node.
+ * @param [in] node - Instance of the node to set parameters of.
+ * @param [in] pNodeParams - pointer to the memcpy node parameters.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node,
+                                           hipMemcpy3DParms* pNodeParams);
+
+/**
+ * @brief Creates a 1D memcpy node and adds it to a graph.
+ *
+ * @param [out] pGraphNode - Pointer to graph node that is created.
+ * @param [in] graph - Instance of graph to add the created node to.
+ * @param [in] pDependencies - const pointer to the dependencies of the memcpy execution node.
+ * @param [in] numDependencies - The number of dependencies.
+ * @param [in] dst - Pointer to memory address of the destination.
+ * @param [in] src - Pointer to memory address of the source.
+ * @param [in] count - Size of the memory to copy.
+ * @param [in] kind - Type of memory copy.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphAddMemcpyNode1D(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                   const hipGraphNode_t* pDependencies, size_t numDependencies,
+                                   void* dst, const void* src, size_t count, hipMemcpyKind kind);
+
+/**
+ * @brief Sets a memcpy node's parameters to perform a 1-dimensional copy.
+ *
+ * @param [in] node - Instance of the node to set parameters of.
+ * @param [in] dst - Pointer to memory address of the destination.
+ * @param [in] src - Pointer to memory address of the source.
+ * @param [in] count - Size of the memory to copy.
+ * @param [in] kind - Type of memory copy.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphMemcpyNodeSetParams1D(hipGraphNode_t node, void* dst, const void* src,
+                                         size_t count, hipMemcpyKind kind);
+
+/**
+ * @brief Sets the parameters for a memcpy node in the given graphExec to perform a 1-dimensional
+ * copy.
+ *
+ * @param [in] hGraphExec - Instance of the executable graph with the node.
+ * @param [in] node - Instance of the node to set parameters of.
+ * @param [in] dst - Pointer to memory address of the destination.
+ * @param [in] src - Pointer to memory address of the source.
+ * @param [in] count - Size of the memory to copy.
+ * @param [in] kind - Type of memory copy.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphExecMemcpyNodeSetParams1D(hipGraphExec_t hGraphExec, hipGraphNode_t node,
+                                             void* dst, const void* src, size_t count,
+                                             hipMemcpyKind kind);
+
+/**
+ * @brief Creates a memcpy node to copy from a symbol on the device and adds it to a graph.
+ *
+ * @param [out] pGraphNode - Pointer to graph node that is created.
+ * @param [in] graph - Instance of graph to add the created node to.
+ * @param [in] pDependencies - const pointer to the dependencies of the memcpy execution node.
+ * @param [in] numDependencies - Number of the dependencies.
+ * @param [in] dst - Pointer to memory address of the destination.
+ * @param [in] symbol - Device symbol address.
+ * @param [in] count - Size of the memory to copy.
+ * @param [in] offset - Offset from start of symbol in bytes.
+ * @param [in] kind - Type of memory copy.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphAddMemcpyNodeFromSymbol(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                           const hipGraphNode_t* pDependencies,
+                                           size_t numDependencies, void* dst, const void* symbol,
+                                           size_t count, size_t offset, hipMemcpyKind kind);
+
+/**
+ * @brief Sets a memcpy node's parameters to copy from a symbol on the device.
+ *
+ * @param [in] node - Instance of the node to set parameters of.
+ * @param [in] dst - Pointer to memory address of the destination.
+ * @param [in] symbol - Device symbol address.
+ * @param [in] count - Size of the memory to copy.
+ * @param [in] offset - Offset from start of symbol in bytes.
+ * @param [in] kind - Type of memory copy.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphMemcpyNodeSetParamsFromSymbol(hipGraphNode_t node, void* dst, const void* symbol,
+                                                 size_t count, size_t offset, hipMemcpyKind kind);
+
+/**
+ * @brief Sets the parameters for a memcpy node in the given graphExec to copy from a symbol on the
+ * device.
+ *
+ * @param [in] hGraphExec - Instance of the executable graph with the node.
+ * @param [in] node - Instance of the node to set parameters of.
+ * @param [in] dst - Pointer to memory address of the destination.
+ * @param [in] symbol - Device symbol address.
+ * @param [in] count - Size of the memory to copy.
+ * @param [in] offset - Offset from start of symbol in bytes.
+ * @param [in] kind - Type of memory copy.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphExecMemcpyNodeSetParamsFromSymbol(hipGraphExec_t hGraphExec, hipGraphNode_t node,
+                                                     void* dst, const void* symbol, size_t count,
+                                                     size_t offset, hipMemcpyKind kind);
+
+/**
+ * @brief Creates a memcpy node to copy to a symbol on the device and adds it to a graph.
+ *
+ * @param [out] pGraphNode - Pointer to graph node that is created.
+ * @param [in] graph - Instance of graph to add the created node to.
+ * @param [in] pDependencies - const pointer to the dependencies on the memcpy execution node.
+ * @param [in] numDependencies - Number of dependencies.
+ * @param [in] symbol - Device symbol address.
+ * @param [in] src - Pointer to memory address of the src.
+ * @param [in] count - Size of the memory to copy.
+ * @param [in] offset - Offset from start of symbol in bytes.
+ * @param [in] kind - Type of memory copy.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphAddMemcpyNodeToSymbol(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                         const hipGraphNode_t* pDependencies,
+                                         size_t numDependencies, const void* symbol,
+                                         const void* src, size_t count, size_t offset,
+                                         hipMemcpyKind kind);
+
+/**
+ * @brief Sets a memcpy node's parameters to copy to a symbol on the device.
+ *
+ * @param [in] node - Instance of the node to set parameters of.
+ * @param [in] symbol - Device symbol address.
+ * @param [in] src - Pointer to memory address of the src.
+ * @param [in] count - Size of the memory to copy.
+ * @param [in] offset - Offset from start of symbol in bytes.
+ * @param [in] kind - Type of memory copy.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphMemcpyNodeSetParamsToSymbol(hipGraphNode_t node, const void* symbol,
+                                               const void* src, size_t count, size_t offset,
+                                               hipMemcpyKind kind);
+
+
+/**
+ * @brief Sets the parameters for a memcpy node in the given graphExec to copy to a symbol on the
+ * device.
+ * @param [in] hGraphExec - Instance of the executable graph with the node.
+ * @param [in] node - Instance of the node to set parameters of.
+ * @param [in] symbol - Device symbol address.
+ * @param [in] src - Pointer to memory address of the src.
+ * @param [in] count - Size of the memory to copy.
+ * @param [in] offset - Offset from start of symbol in bytes.
+ * @param [in] kind - Type of memory copy.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphExecMemcpyNodeSetParamsToSymbol(hipGraphExec_t hGraphExec, hipGraphNode_t node,
+                                                   const void* symbol, const void* src,
+                                                   size_t count, size_t offset, hipMemcpyKind kind);
+
+/**
+ * @brief Creates a memset node and adds it to a graph.
+ *
+ * @param [out] pGraphNode - Pointer to graph node that is created.
+ * @param [in] graph - Instance of the graph to add the created node to.
+ * @param [in] pDependencies - const pointer to the dependencies on the memset execution node.
+ * @param [in] numDependencies - Number of dependencies.
+ * @param [in] pMemsetParams - const pointer to the parameters for the memory set.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphAddMemsetNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                 const hipGraphNode_t* pDependencies, size_t numDependencies,
+                                 const hipMemsetParams* pMemsetParams);
+
+/**
+ * @brief Gets a memset node's parameters.
+ *
+ * @param [in] node - Instance of the node to get parameters of.
+ * @param [out] pNodeParams - Pointer to the parameters.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphMemsetNodeGetParams(hipGraphNode_t node, hipMemsetParams* pNodeParams);
+
+/**
+ * @brief Sets a memset node's parameters.
+ *
+ * @param [in] node - Instance of the node to set parameters of.
+ * @param [in] pNodeParams - Pointer to the parameters.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphMemsetNodeSetParams(hipGraphNode_t node, const hipMemsetParams* pNodeParams);
+
+/**
+ * @brief Sets the parameters for a memset node in the given graphExec.
+ *
+ * @param [in] hGraphExec - Instance of the executable graph with the node.
+ * @param [in] node - Instance of the node to set parameters of.
+ * @param [in] pNodeParams - Pointer to the parameters.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node,
+                                           const hipMemsetParams* pNodeParams);
+
+/**
+ * @brief Creates a host execution node and adds it to a graph.
+ *
+ * @param [out] pGraphNode - Pointer to graph node that is created.
+ * @param [in] graph - Instance of the graph to add the created node to.
+ * @param [in] pDependencies - const pointer to the dependencies of the host execution node.
+ * @param [in] numDependencies - Number of dependencies.
+ * @param [in] pNodeParams - Pointer to the parameters.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphAddHostNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                               const hipGraphNode_t* pDependencies, size_t numDependencies,
+                               const hipHostNodeParams* pNodeParams);
+
+/**
+ * @brief Returns a host node's parameters.
+ *
+ * @param [in] node - Instance of the node to get parameters of.
+ * @param [out] pNodeParams - Pointer to the parameters.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphHostNodeGetParams(hipGraphNode_t node, hipHostNodeParams* pNodeParams);
+
+/**
+ * @brief Sets a host node's parameters.
+ *
+ * @param [in] node - Instance of the node to set parameters of.
+ * @param [in] pNodeParams - Pointer to the parameters.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphHostNodeSetParams(hipGraphNode_t node, const hipHostNodeParams* pNodeParams);
+
+/**
+ * @brief Sets the parameters for a host node in the given graphExec.
+ *
+ * @param [in] hGraphExec - Instance of the executable graph with the node.
+ * @param [in] node - Instance of the node to set parameters of.
+ * @param [in] pNodeParams - Pointer to the parameters.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphExecHostNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node,
+                                         const hipHostNodeParams* pNodeParams);
+
+/**
+ * @brief Creates a child graph node and adds it to a graph.
+ *
+ * @param [out] pGraphNode - Pointer to graph node that is created.
+ * @param [in] graph - Instance of the graph to add the created node.
+ * @param [in] pDependencies - const pointer to the dependencies of the child graph node.
+ * @param [in] numDependencies - Number of dependencies.
+ * @param [in] childGraph - Graph to clone into this node
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphAddChildGraphNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                     const hipGraphNode_t* pDependencies, size_t numDependencies,
+                                     hipGraph_t childGraph);
+
+/**
+ * @brief Gets a handle to the embedded graph of a child graph node.
+ *
+ * @param [in] node - Instance of the node to get child graph of.
+ * @param [out] pGraph - Pointer to get the graph.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphChildGraphNodeGetGraph(hipGraphNode_t node, hipGraph_t* pGraph);
+
+/**
+ * @brief Updates node parameters in the child graph node in the given graphExec.
+ *
+ * @param [in] hGraphExec - instance of the executable graph with the node.
+ * @param [in] node - node from the graph which was used to instantiate graphExec.
+ * @param [in] childGraph - child graph with updated parameters.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphExecChildGraphNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node,
+                                               hipGraph_t childGraph);
+
+/**
+ * @brief Creates an empty node and adds it to a graph.
+ *
+ * @param [out] pGraphNode - Pointer to graph node that is created.
+ * @param [in] graph - Instance of the graph the node is added to.
+ * @param [in] pDependencies - const pointer to the node dependencies.
+ * @param [in] numDependencies - Number of dependencies.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphAddEmptyNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                const hipGraphNode_t* pDependencies, size_t numDependencies);
+
+
+/**
+ * @brief Creates an event record node and adds it to a graph.
+ *
+ * @param [out] pGraphNode - Pointer to graph node that is created.
+ * @param [in] graph - Instance of the graph the node is added to.
+ * @param [in] pDependencies - const pointer to the node dependencies.
+ * @param [in] numDependencies - Number of dependencies.
+ * @param [in] event - Event of the node.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphAddEventRecordNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                      const hipGraphNode_t* pDependencies, size_t numDependencies,
+                                      hipEvent_t event);
+
+/**
+ * @brief Returns the event associated with an event record node.
+ *
+ * @param [in] node -  Instance of the node to get event of.
+ * @param [out] event_out - Pointer to return the event.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphEventRecordNodeGetEvent(hipGraphNode_t node, hipEvent_t* event_out);
+
+/**
+ * @brief Sets an event record node's event.
+ *
+ * @param [in] node - Instance of the node to set event to.
+ * @param [in] event - Pointer to the event.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphEventRecordNodeSetEvent(hipGraphNode_t node, hipEvent_t event);
+
+/**
+ * @brief Sets the event for an event record node in the given graphExec.
+ *
+ * @param [in] hGraphExec - instance of the executable graph with the node.
+ * @param [in] hNode - node from the graph which was used to instantiate graphExec.
+ * @param [in] event - pointer to the event.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphExecEventRecordNodeSetEvent(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
+                                               hipEvent_t event);
+
+/**
+ * @brief Creates an event wait node and adds it to a graph.
+ *
+ * @param [out] pGraphNode - Pointer to graph node that is created.
+ * @param [in] graph - Instance of the graph the node is added to.
+ * @param [in] pDependencies - const pointer to the node dependencies.
+ * @param [in] numDependencies - Number of dependencies.
+ * @param [in] event - Event for the node.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphAddEventWaitNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                    const hipGraphNode_t* pDependencies, size_t numDependencies,
+                                    hipEvent_t event);
+
+
+/**
+ * @brief Returns the event associated with an event wait node.
+ *
+ * @param [in] node -  Instance of the node to get event of.
+ * @param [out] event_out - Pointer to return the event.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphEventWaitNodeGetEvent(hipGraphNode_t node, hipEvent_t* event_out);
+
+/**
+ * @brief Sets an event wait node's event.
+ *
+ * @param [in] node - Instance of the node to set event of.
+ * @param [in] event - Pointer to the event.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphEventWaitNodeSetEvent(hipGraphNode_t node, hipEvent_t event);
+
+/**
+ * @brief Sets the event for an event wait node in the given graphExec.
+ *
+ * @param [in] hGraphExec - instance of the executable graph with the node.
+ * @param [in] hNode - node from the graph which was used to instantiate graphExec.
+ * @param [in] event - pointer to the event.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphExecEventWaitNodeSetEvent(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
+                                             hipEvent_t event);
+
+/**
+ * @brief Creates a memory allocation node and adds it to a graph
+ *
+ * @param [out] pGraphNode      - Pointer to the graph node to create and add to the graph
+ * @param [in] graph            - Instance of the graph the node is added to
+ * @param [in] pDependencies    - Const pointer to the node dependencies
+ * @param [in] numDependencies  - The number of dependencies
+ * @param [in, out] pNodeParams - Node parameters for memory allocation, returns a pointer to the
+ * allocated memory.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphAddMemAllocNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                   const hipGraphNode_t* pDependencies, size_t numDependencies,
+                                   hipMemAllocNodeParams* pNodeParams);
+
+/**
+ * @brief Returns parameters for memory allocation node
+ *
+ * @param [in] node         - Memory allocation node to query
+ * @param [out] pNodeParams - Parameters for the specified memory allocation node
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphMemAllocNodeGetParams(hipGraphNode_t node, hipMemAllocNodeParams* pNodeParams);
+
+/**
+ * @brief Creates a memory free node and adds it to a graph
+ *
+ * @param [out] pGraphNode      - Pointer to the graph node to create and add to the graph
+ * @param [in] graph            - Instance of the graph the node is added to
+ * @param [in] pDependencies    - Const pointer to the node dependencies
+ * @param [in] numDependencies  - The number of dependencies
+ * @param [in] dev_ptr          - Pointer to the memory to be freed
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphAddMemFreeNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                  const hipGraphNode_t* pDependencies, size_t numDependencies,
+                                  void* dev_ptr);
+
+/**
+ * @brief Returns parameters for memory free node
+ *
+ * @param [in] node     - Memory free node to query
+ * @param [out] dev_ptr - Device pointer of the specified memory free node
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphMemFreeNodeGetParams(hipGraphNode_t node, void* dev_ptr);
+
+/**
+ * @brief Get the mem attribute for graphs.
+ *
+ * @param [in] device - Device to get attributes from
+ * @param [in] attr - Attribute type to be queried
+ * @param [out] value - Value of the queried attribute
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ *
+ */
+hipError_t hipDeviceGetGraphMemAttribute(int device, hipGraphMemAttributeType attr, void* value);
+
+/**
+ * @brief Set the mem attribute for graphs.
+ *
+ * @param [in] device - Device to set attribute of.
+ * @param [in] attr - Attribute type to be set.
+ * @param [in] value - Value of the attribute.
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ *
+ */
+hipError_t hipDeviceSetGraphMemAttribute(int device, hipGraphMemAttributeType attr, void* value);
+
+/**
+ * @brief Free unused memory reserved for graphs on a specific device and return it back to the OS.
+ *
+ * @param [in] device - Device for which memory should be trimmed
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ *
+ */
+hipError_t hipDeviceGraphMemTrim(int device);
+
+/**
+ * @brief Create an instance of userObject to manage lifetime of a resource.
+ *
+ * @param [out] object_out - pointer to instance of userobj.
+ * @param [in] ptr - pointer to pass to destroy function.
+ * @param [in] destroy - destroy callback to remove resource.
+ * @param [in] initialRefcount - initial number of references to the resource.
+ * @param [in] flags - flags passed to API.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipUserObjectCreate(hipUserObject_t* object_out, void* ptr, hipHostFn_t destroy,
+                               unsigned int initialRefcount, unsigned int flags);
+
+/**
+ * @brief Release number of references to resource.
+ *
+ * @param [in] object - pointer to instance of userobj.
+ * @param [in] count - number of references to the resource to be released.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipUserObjectRelease(hipUserObject_t object, unsigned int count __dparm(1));
+
+/**
+ * @brief Retain number of references to resource.
+ *
+ * @param [in] object - pointer to instance of userobj.
+ * @param [in] count - number of references to the resource to be retained.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipUserObjectRetain(hipUserObject_t object, unsigned int count __dparm(1));
+
+/**
+ * @brief Retain user object for graphs.
+ *
+ * @param [in] graph - pointer to graph to retain the user object for.
+ * @param [in] object - pointer to instance of userobj.
+ * @param [in] count - number of references to the resource to be retained.
+ * @param [in] flags - flags passed to API.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphRetainUserObject(hipGraph_t graph, hipUserObject_t object,
+                                    unsigned int count __dparm(1), unsigned int flags __dparm(0));
+
+/**
+ * @brief Release user object from graphs.
+ *
+ * @param [in] graph - pointer to graph to release the user object from.
+ * @param [in] object - pointer to instance of userobj.
+ * @param [in] count - number of references to the resource to be released.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphReleaseUserObject(hipGraph_t graph, hipUserObject_t object,
+                                     unsigned int count __dparm(1));
+
+/**
+ * @brief Write a DOT file describing graph structure.
+ *
+ * @param [in] graph - graph object for which DOT file has to be generated.
+ * @param [in] path - path to write the DOT file.
+ * @param [in] flags - Flags from hipGraphDebugDotFlags to get additional node information.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorOperatingSystem
+ *
+ */
+hipError_t hipGraphDebugDotPrint(hipGraph_t graph, const char* path, unsigned int flags);
+
+/**
+ * @brief Copies attributes from source node to destination node.
+ *
+ * Copies attributes from source node to destination node.
+ * Both nodes must have the same context.
+ *
+ * @param [out] hDst - Destination node.
+ * @param [in] hSrc - Source node.
+ * For list of attributes see ::hipKernelNodeAttrID.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ *
+ */
+hipError_t hipGraphKernelNodeCopyAttributes(hipGraphNode_t hSrc, hipGraphNode_t hDst);
+
+/**
+ * @brief Enables or disables the specified node in the given graphExec
+ *
+ * Sets hNode to be either enabled or disabled. Disabled nodes are functionally equivalent
+ * to empty nodes until they are reenabled. Existing node parameters are not affected by
+ * disabling/enabling the node.
+ *
+ * The node is identified by the corresponding hNode in the non-executable graph, from which the
+ * executable graph was instantiated.
+ *
+ * hNode must not have been removed from the original graph.
+ *
+ * @note Currently only kernel, memset and memcpy nodes are supported.
+ *
+ * @param [in] hGraphExec - The executable graph in which to set the specified node.
+ * @param [in] hNode      - Node from the graph from which graphExec was instantiated.
+ * @param [in] isEnabled  - Node is enabled if != 0, otherwise the node is disabled.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphNodeSetEnabled(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
+                                  unsigned int isEnabled);
+/**
+ * @brief Query whether a node in the given graphExec is enabled
+ *
+ * Sets isEnabled to 1 if hNode is enabled, or 0 if it is disabled.
+ *
+ * The node is identified by the corresponding node in the non-executable graph, from which the
+ * executable graph was instantiated.
+ *
+ * hNode must not have been removed from the original graph.
+ *
+ * @note Currently only kernel, memset and memcpy nodes are supported.
+ *
+ * @param [in]  hGraphExec - The executable graph in which to query the specified node.
+ * @param [in]  hNode      - Node from the graph from which graphExec was instantiated.
+ * @param [out] isEnabled  - Location to return the enabled status of the node.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphNodeGetEnabled(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
+                                  unsigned int* isEnabled);
+
+/**
+ * @brief Creates an external semaphore wait node and adds it to a graph.
+ *
+ * @param [out] pGraphNode - pointer to the graph node to create.
+ * @param [in] graph - instance of the graph to add the created node.
+ * @param [in] pDependencies - const pointer to the node dependencies.
+ * @param [in] numDependencies - the number of the dependencies.
+ * @param [in] nodeParams - pointer to the parameters.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphAddExternalSemaphoresWaitNode(
+    hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies,
+    size_t numDependencies, const hipExternalSemaphoreWaitNodeParams* nodeParams);
+
+/**
+ * @brief Creates an external semaphore signal node and adds it to a graph.
+ *
+ * @param [out] pGraphNode - pointer to the graph node to create.
+ * @param [in] graph - instance of the graph to add the created node.
+ * @param [in] pDependencies - const pointer to the node dependencies.
+ * @param [in] numDependencies - the number of the dependencies.
+ * @param [in] nodeParams - pointer to the parameters.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphAddExternalSemaphoresSignalNode(
+    hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies,
+    size_t numDependencies, const hipExternalSemaphoreSignalNodeParams* nodeParams);
+/**
+ * @brief Updates node parameters in the external semaphore signal node.
+ *
+ * @param [in]  hNode      - Node from the graph from which graphExec was instantiated.
+ * @param [in]  nodeParams  - Pointer to the params to be set.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphExternalSemaphoresSignalNodeSetParams(
+    hipGraphNode_t hNode, const hipExternalSemaphoreSignalNodeParams* nodeParams);
+/**
+ * @brief Updates node parameters in the external semaphore wait node.
+ *
+ * @param [in]  hNode      - Node from the graph from which graphExec was instantiated.
+ * @param [in]  nodeParams  - Pointer to the params to be set.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphExternalSemaphoresWaitNodeSetParams(
+    hipGraphNode_t hNode, const hipExternalSemaphoreWaitNodeParams* nodeParams);
+/**
+ * @brief Returns external semaphore signal node params.
+ *
+ * @param [in]   hNode       - Node from the graph from which graphExec was instantiated.
+ * @param [out]  params_out  - Pointer to params.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphExternalSemaphoresSignalNodeGetParams(
+    hipGraphNode_t hNode, hipExternalSemaphoreSignalNodeParams* params_out);
+/**
+ * @brief Returns external semaphore wait node params.
+ *
+ * @param [in]   hNode       - Node from the graph from which graphExec was instantiated.
+ * @param [out]  params_out  - Pointer to params.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphExternalSemaphoresWaitNodeGetParams(
+    hipGraphNode_t hNode, hipExternalSemaphoreWaitNodeParams* params_out);
+/**
+ * @brief Updates node parameters in the external semaphore signal node in the given graphExec.
+ *
+ * @param [in]  hGraphExec - The executable graph in which to set the specified node.
+ * @param [in]  hNode      - Node from the graph from which graphExec was instantiated.
+ * @param [in]  nodeParams  - Pointer to the params to be set.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphExecExternalSemaphoresSignalNodeSetParams(
+    hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
+    const hipExternalSemaphoreSignalNodeParams* nodeParams);
+/**
+ * @brief Updates node parameters in the external semaphore wait node in the given graphExec.
+ *
+ * @param [in]  hGraphExec - The executable graph in which to set the specified node.
+ * @param [in]  hNode      - Node from the graph from which graphExec was instantiated.
+ * @param [in]  nodeParams  - Pointer to the params to be set.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphExecExternalSemaphoresWaitNodeSetParams(
+    hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
+    const hipExternalSemaphoreWaitNodeParams* nodeParams);
+
+/**
+ * @brief Gets a memcpy node's parameters.
+ *
+ * @param [in] hNode - instance of the node to get parameters from.
+ * @param [out] nodeParams - pointer to the parameters.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipDrvGraphMemcpyNodeGetParams(hipGraphNode_t hNode, HIP_MEMCPY3D* nodeParams);
+
+/**
+ * @brief Sets a memcpy node's parameters.
+ *
+ * @param [in] hNode - instance of the node to set parameters for.
+ * @param [in] nodeParams - pointer to the parameters.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipDrvGraphMemcpyNodeSetParams(hipGraphNode_t hNode, const HIP_MEMCPY3D* nodeParams);
+
+/**
+ * @brief Creates a memset node and adds it to a graph.
+ *
+ * @param [out] phGraphNode - pointer to graph node to create.
+ * @param [in] hGraph - instance of graph to add the created node to.
+ * @param [in] dependencies - const pointer to the dependencies on the memset execution node.
+ * @param [in] numDependencies - number of the dependencies.
+ * @param [in] memsetParams - const pointer to the parameters for the memory set.
+ * @param [in] ctx - context related to current device.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipDrvGraphAddMemsetNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
+                                    const hipGraphNode_t* dependencies, size_t numDependencies,
+                                    const hipMemsetParams* memsetParams, hipCtx_t ctx);
+
+/**
+ * @brief Creates a memory free node and adds it to a graph
+ *
+ * @param [out] phGraphNode - Pointer to the graph node to create and add to the graph
+ * @param [in]  hGraph - Instance of the graph the node is added to
+ * @param [in]  dependencies - Const pointer to the node dependencies
+ * @param [in]  numDependencies - The number of dependencies
+ * @param [in]  dptr - Pointer to the memory to be freed
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipDrvGraphAddMemFreeNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
+                                     const hipGraphNode_t* dependencies, size_t numDependencies,
+                                     hipDeviceptr_t dptr);
+
+/**
+ * @brief Sets the parameters for a memcpy node in the given graphExec.
+ *
+ * @param [in] hGraphExec - instance of the executable graph with the node.
+ * @param [in] hNode - instance of the node to set parameters to.
+ * @param [in] copyParams - const pointer to the memcpy node params.
+ * @param [in] ctx - context related to current device.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipDrvGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
+                                              const HIP_MEMCPY3D* copyParams, hipCtx_t ctx);
+
+/**
+ * @brief Sets the parameters for a memset node in the given graphExec.
+ *
+ * @param [in] hGraphExec - instance of the executable graph with the node.
+ * @param [in] hNode - instance of the node to set parameters to.
+ * @param [in] memsetParams - pointer to the parameters.
+ * @param [in] ctx - context related to current device.
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipDrvGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
+                                              const hipMemsetParams* memsetParams, hipCtx_t ctx);
+
+// doxygen end graph API
+/**
+ * @}
+ */
+
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup Virtual Virtual Memory Management
+ *  @{
+ *  This section describes the virtual memory management functions of HIP runtime API.
+ *
+ *  @note  Please note, the virtual memory management functions of HIP runtime
+ *         API are implemented on Linux, under development on Windows. The
+ *         following Virtual Memory Management APIs are not (yet)
+ *         supported in HIP:
+ *          - hipMemMapArrayAsync
+ */
+
+/**
+ * @brief Frees an address range reservation made via hipMemAddressReserve
+ *
+ * @param [in] devPtr - starting address of the range.
+ * @param [in] size - size of the range.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemAddressFree(void* devPtr, size_t size);
+
+/**
+ * @brief Reserves an address range
+ *
+ * @param [out] ptr - starting address of the reserved range.
+ * @param [in] size - size of the reservation.
+ * @param [in] alignment - alignment of the address.
+ * @param [in] addr - requested starting address of the range.
+ * @param [in] flags - currently unused, must be zero.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemAddressReserve(void** ptr, size_t size, size_t alignment, void* addr,
+                                unsigned long long flags);
+
+/**
+ * @brief Creates a memory allocation described by the properties and size
+ *
+ * @param [out] handle - value of the returned handle.
+ * @param [in] size - size of the allocation.
+ * @param [in] prop - properties of the allocation.
+ * @param [in] flags - currently unused, must be zero.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size,
+                        const hipMemAllocationProp* prop, unsigned long long flags);
+
+/**
+ * @brief Exports an allocation to a requested shareable handle type.
+ *
+ * @param [out] shareableHandle - value of the returned handle.
+ * @param [in] handle - handle to share.
+ * @param [in] handleType - type of the shareable handle.
+ * @param [in] flags - currently unused, must be zero.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemExportToShareableHandle(void* shareableHandle,
+                                         hipMemGenericAllocationHandle_t handle,
+                                         hipMemAllocationHandleType handleType,
+                                         unsigned long long flags);
+
+/**
+ * @brief Get the access flags set for the given location and ptr.
+ *
+ * @param [out] flags - flags for this location.
+ * @param [in] location - target location.
+ * @param [in] ptr - address to check the access flags.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemGetAccess(unsigned long long* flags, const hipMemLocation* location, void* ptr);
+
+/**
+ * @brief Calculates either the minimal or recommended granularity.
+ *
+ * @param [out] granularity - returned granularity.
+ * @param [in] prop - location properties.
+ * @param [in] option - determines which granularity to return.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ *
+ */
+hipError_t hipMemGetAllocationGranularity(size_t* granularity, const hipMemAllocationProp* prop,
+                                          hipMemAllocationGranularity_flags option);
+
+/**
+ * @brief Retrieve the property structure of the given handle.
+ *
+ * @param [out] prop - properties of the given handle.
+ * @param [in] handle - handle to perform the query on.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemGetAllocationPropertiesFromHandle(hipMemAllocationProp* prop,
+                                                   hipMemGenericAllocationHandle_t handle);
+
+/**
+ * @brief Imports an allocation from a requested shareable handle type.
+ *
+ * @param [out] handle - returned value.
+ * @param [in] osHandle - shareable handle representing the memory allocation.
+ * @param [in] shHandleType - handle type.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemImportFromShareableHandle(hipMemGenericAllocationHandle_t* handle, void* osHandle,
+                                           hipMemAllocationHandleType shHandleType);
+
+/**
+ * @brief Maps an allocation handle to a reserved virtual address range.
+ *
+ * @param [in] ptr - address where the memory will be mapped.
+ * @param [in] size - size of the mapping.
+ * @param [in] offset - offset into the memory, currently must be zero.
+ * @param [in] handle - memory allocation to be mapped.
+ * @param [in] flags - currently unused, must be zero.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemMap(void* ptr, size_t size, size_t offset, hipMemGenericAllocationHandle_t handle,
+                     unsigned long long flags);
+
+/**
+ * @brief Maps or unmaps subregions of sparse HIP arrays and sparse HIP mipmapped arrays.
+ *
+ * @param [in] mapInfoList - list of hipArrayMapInfo.
+ * @param [in] count - number of hipArrayMapInfo in mapInfoList.
+ * @param [in] stream - stream identifier for the stream to use for map or unmap operations.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ * @warning This API is under development. Currently it is not supported on AMD
+ *          GPUs and returns #hipErrorNotSupported.
+ */
+hipError_t hipMemMapArrayAsync(hipArrayMapInfo* mapInfoList, unsigned int count,
+                               hipStream_t stream);
+
+/**
+ * @brief Release a memory handle representing a memory allocation which was previously allocated
+ * through hipMemCreate.
+ *
+ * @param [in] handle - handle of the memory allocation.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemRelease(hipMemGenericAllocationHandle_t handle);
+
+/**
+ * @brief Returns the allocation handle of the backing memory allocation given the address.
+ *
+ * @param [out] handle - handle representing addr.
+ * @param [in] addr - address to look up.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemRetainAllocationHandle(hipMemGenericAllocationHandle_t* handle, void* addr);
+
+/**
+ * @brief Set the access flags for each location specified in desc for the given virtual address
+ * range.
+ *
+ * @param [in] ptr - starting address of the virtual address range.
+ * @param [in] size - size of the range.
+ * @param [in] desc - array of hipMemAccessDesc.
+ * @param [in] count - number of hipMemAccessDesc in desc.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemSetAccess(void* ptr, size_t size, const hipMemAccessDesc* desc, size_t count);
+
+/**
+ * @brief Unmap memory allocation of a given address range.
+ *
+ * @param [in] ptr - starting address of the range to unmap.
+ * @param [in] size - size of the virtual address range.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ *          change and might have outstanding issues.
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+hipError_t hipMemUnmap(void* ptr, size_t size);
+
+// doxygen end virtual memory management API
+/**
+ * @}
+ */
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ * @defgroup GraphicsInterop Graphics Interoperability
+ * @{
+ * This section describes graphics interoperability functions of HIP runtime API.
+ */
+
+/**
+ * @brief Maps a graphics resource for access.
+ *
+ * @param [in] count - Number of resources to map.
+ * @param [in] resources - Pointer of resources to map.
+ * @param [in] stream - Stream for synchronization.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorUnknown, #hipErrorInvalidResourceHandle
+ *
+ */
+hipError_t hipGraphicsMapResources(int count, hipGraphicsResource_t* resources,
+                                   hipStream_t stream __dparm(0));
+/**
+ * @brief Get an array through which to access a subresource of a mapped graphics resource.
+ *
+ * @param [out] array - Pointer of array through which a subresource of resource may be accessed.
+ * @param [in] resource - Mapped resource to access.
+ * @param [in] arrayIndex - Array index for the subresource to access.
+ * @param [in] mipLevel - Mipmap level for the subresource to access.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @note  In this API, the value of arrayIndex higher than zero is currently not supported.
+ *
+ */
+hipError_t hipGraphicsSubResourceGetMappedArray(hipArray_t* array, hipGraphicsResource_t resource,
+                                                unsigned int arrayIndex, unsigned int mipLevel);
+/**
+ * @brief Gets device accessible address of a graphics resource.
+ *
+ * @param [out] devPtr - Pointer of device through which graphic resource may be accessed.
+ * @param [out] size - Size of the buffer accessible from devPtr.
+ * @param [in] resource - Mapped resource to access.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipGraphicsResourceGetMappedPointer(void** devPtr, size_t* size,
+                                               hipGraphicsResource_t resource);
+/**
+ * @brief Unmaps graphics resources.
+ *
+ * @param [in] count - Number of resources to unmap.
+ * @param [in] resources - Pointer of resources to unmap.
+ * @param [in] stream - Stream for synchronization.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorUnknown, #hipErrorContextIsDestroyed
+ *
+ */
+hipError_t hipGraphicsUnmapResources(int count, hipGraphicsResource_t* resources,
+                                     hipStream_t stream __dparm(0));
+/**
+ * @brief Unregisters a graphics resource.
+ *
+ * @param [in] resource - Graphics resources to unregister.
+ *
+ * @returns #hipSuccess
+ *
+ */
+hipError_t hipGraphicsUnregisterResource(hipGraphicsResource_t resource);
+// doxygen end GraphicsInterop
+/**
+ * @}
+ */
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ * @defgroup Surface Surface Object
+ * @{
+ *
+ *  This section describes surface object functions of HIP runtime API.
+ *
+ *  @note  APIs in this section are under development.
+ *
+ */
+
+/**
+ * @brief Create a surface object.
+ *
+ * @param [out] pSurfObject  Pointer of surface object to be created.
+ * @param [in] pResDesc  Pointer of surface object descriptor.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+hipError_t hipCreateSurfaceObject(hipSurfaceObject_t* pSurfObject, const hipResourceDesc* pResDesc);
+/**
+ * @brief Destroy a surface object.
+ *
+ * @param [in] surfaceObject  Surface object to be destroyed.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject);
+// end of surface
+/**
+ * @}
+ */
+#ifdef __cplusplus
+} /* extern "c" */
+#endif
+#ifdef __cplusplus
+#if defined(__clang__) && defined(__HIP__)
+// Typed-kernel convenience overload: erases the kernel's type to const void* and forwards to
+// the untyped hipOccupancyMaxPotentialBlockSize overload.
+template <typename T>
+static hipError_t __host__ inline hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
+                                                                    T f,
+                                                                    size_t dynSharedMemPerBlk = 0,
+                                                                    int blockSizeLimit = 0) {
+  const void* kernel_addr = reinterpret_cast<const void*>(f);
+  return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize, kernel_addr, dynSharedMemPerBlk,
+                                           blockSizeLimit);
+}
+// Flag-accepting variant. `flags` is accepted but not consulted; the request is forwarded to
+// hipOccupancyMaxPotentialBlockSize with the kernel passed as const void*.
+template <typename T>
+static hipError_t __host__ inline hipOccupancyMaxPotentialBlockSizeWithFlags(
+    int* gridSize, int* blockSize, T f, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0,
+    unsigned int flags = 0) {
+  (void)flags;  // Deliberately unused.
+  const void* kernel_addr = reinterpret_cast<const void*>(f);
+  return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize, kernel_addr, dynSharedMemPerBlk,
+                                           blockSizeLimit);
+}
+#endif  // defined(__clang__) && defined(__HIP__)
+
+/**
+ * @brief Gets the address of a symbol.
+ * @ingroup Memory
+ * @param [out] devPtr - Returns device pointer associated with symbol.
+ * @param [in] symbol - Device symbol.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+template <typename T>
+hipError_t hipGetSymbolAddress(void** devPtr, const T& symbol) {
+  // Pass the symbol's own address to the untyped global overload.
+  const void* symbol_addr = (const void*)&symbol;
+  return ::hipGetSymbolAddress(devPtr, symbol_addr);
+}
+/**
+ * @ingroup Memory
+ * @brief Gets the size of a symbol.
+ *
+ * @param [out] size - Returns the size of a symbol.
+ * @param [in] symbol - Device symbol address.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+template <typename T>
+hipError_t hipGetSymbolSize(size_t* size, const T& symbol) {
+  // Pass the symbol's own address to the untyped global overload.
+  const void* symbol_addr = (const void*)&symbol;
+  return ::hipGetSymbolSize(size, symbol_addr);
+}
+
+/**
+ * @ingroup Memory
+ * @brief Copies data to the given symbol on the device.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidMemcpyDirection, #hipErrorInvalidValue
+ *
+ * @see hipMemcpyToSymbol
+ */
+template <typename T>
+hipError_t hipMemcpyToSymbol(const T& symbol, const void* src, size_t sizeBytes,
+                             size_t offset __dparm(0),
+                             hipMemcpyKind kind __dparm(hipMemcpyHostToDevice)) {
+  // Pass the symbol's own address to the untyped global overload.
+  const void* symbol_addr = (const void*)&symbol;
+  return ::hipMemcpyToSymbol(symbol_addr, src, sizeBytes, offset, kind);
+}
+/**
+ * @ingroup Memory
+ * @brief Copies data to the given symbol on the device asynchronously on the stream.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidMemcpyDirection, #hipErrorInvalidValue
+ *
+ * @see hipMemcpyToSymbolAsync
+ */
+template <typename T>
+hipError_t hipMemcpyToSymbolAsync(const T& symbol, const void* src, size_t sizeBytes, size_t offset,
+                                  hipMemcpyKind kind, hipStream_t stream __dparm(0)) {
+  // Pass the symbol's own address to the untyped global overload.
+  const void* symbol_addr = (const void*)&symbol;
+  return ::hipMemcpyToSymbolAsync(symbol_addr, src, sizeBytes, offset, kind, stream);
+}
+/**
+ * @brief Copies data from the given symbol on the device.
+ * @ingroup Memory
+ * @returns #hipSuccess, #hipErrorInvalidMemcpyDirection, #hipErrorInvalidValue
+ *
+ * @see hipMemcpyFromSymbol
+ */
+template <typename T>
+hipError_t hipMemcpyFromSymbol(void* dst, const T& symbol, size_t sizeBytes,
+                               size_t offset __dparm(0),
+                               hipMemcpyKind kind __dparm(hipMemcpyDeviceToHost)) {
+  // Pass the symbol's own address to the untyped global overload.
+  const void* symbol_addr = (const void*)&symbol;
+  return ::hipMemcpyFromSymbol(dst, symbol_addr, sizeBytes, offset, kind);
+}
+/**
+ * @brief Copies data from the given symbol on the device asynchronously on the stream.
+ * @ingroup Memory
+ * @returns #hipSuccess, #hipErrorInvalidMemcpyDirection, #hipErrorInvalidValue
+ *
+ * @see hipMemcpyFromSymbolAsync
+ */
+template <typename T>
+hipError_t hipMemcpyFromSymbolAsync(void* dst, const T& symbol, size_t sizeBytes, size_t offset,
+                                    hipMemcpyKind kind, hipStream_t stream __dparm(0)) {
+  // Pass the symbol's own address to the untyped global overload.
+  const void* symbol_addr = (const void*)&symbol;
+  return ::hipMemcpyFromSymbolAsync(dst, symbol_addr, sizeBytes, offset, kind, stream);
+}
+
+/**
+ * @brief Returns occupancy for a kernel function.
+ * @ingroup Occupancy
+ * @param [out] numBlocks - Pointer of occupancy in number of blocks.
+ * @param [in] f - The kernel function to launch on the device.
+ * @param [in] blockSize - The block size as kernel launched.
+ * @param [in] dynSharedMemPerBlk - Dynamic shared memory in bytes per block.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+template <class T>
+inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, T f, int blockSize,
+                                                               size_t dynSharedMemPerBlk) {
+  // Erase the kernel's type so the call resolves to the const void* overload.
+  const void* kernel_addr = reinterpret_cast<const void*>(f);
+  return hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, kernel_addr, blockSize,
+                                                      dynSharedMemPerBlk);
+}
+/**
+ * @brief Returns occupancy for a device function with the specified flags.
+ *
+ * @ingroup Occupancy
+ * @param [out] numBlocks - Pointer of occupancy in number of blocks.
+ * @param [in] f - The kernel function to launch on the device.
+ * @param [in] blockSize - The block size as kernel launched.
+ * @param [in] dynSharedMemPerBlk - Dynamic shared memory in bytes per block.
+ * @param [in] flags - Flag to handle the behavior for the occupancy calculator.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ */
+template <class T>
+inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, T f,
+                                                                        int blockSize,
+                                                                        size_t dynSharedMemPerBlk,
+                                                                        unsigned int flags) {
+  // Erase the kernel's type so the call resolves to the const void* overload.
+  const void* kernel_addr = reinterpret_cast<const void*>(f);
+  return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, kernel_addr, blockSize,
+                                                               dynSharedMemPerBlk, flags);
+}
+/**
+ * @brief Returns grid and block size that achieves maximum potential occupancy for a device
+ * function
+ *
+ * @ingroup Occupancy
+ * Returns in \p *min_grid_size and \p *block_size a suggested grid /
+ * block size pair that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps on the current device with the smallest number
+ * of blocks for a particular function).
+ *
+ * @param [out] min_grid_size minimum grid size needed to achieve the best potential occupancy
+ * @param [out] block_size    block size required for the best potential occupancy
+ * @param [in]  func          device function symbol
+ * @param [in]  block_size_to_dynamic_smem_size - a unary function/functor that takes block size,
+ * and returns the size, in bytes, of dynamic shared memory needed for a block
+ * @param [in]  block_size_limit the maximum block size \p func is designed to work with. 0 means no
+ * limit.
+ * @param [in]  flags         reserved
+ *
+ * @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidDeviceFunction,
+ * #hipErrorInvalidValue, #hipErrorUnknown
+ */
+template <typename UnaryFunction, class T>
+static hipError_t __host__ inline hipOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(
+    int* min_grid_size, int* block_size, T func, UnaryFunction block_size_to_dynamic_smem_size,
+    int block_size_limit = 0, unsigned int flags = 0) {
+  // Both outputs and the kernel itself must be non-null.
+  const bool outputs_present = (min_grid_size != nullptr) && (block_size != nullptr);
+  if (!outputs_present || reinterpret_cast<const void*>(func) == nullptr) {
+    return hipErrorInvalidValue;
+  }
+
+  // Query the current device and the attributes the search below depends on. Any failing
+  // query aborts the computation and its error code is returned unchanged.
+  int device_id;
+  hipError_t err = hipGetDevice(&device_id);
+  if (err != hipSuccess) {
+    return err;
+  }
+
+  int threads_per_cu_cap;
+  err = hipDeviceGetAttribute(&threads_per_cu_cap, hipDeviceAttributeMaxThreadsPerMultiProcessor,
+                              device_id);
+  if (err != hipSuccess) {
+    return err;
+  }
+
+  int warp_width;
+  err = hipDeviceGetAttribute(&warp_width, hipDeviceAttributeWarpSize, device_id);
+  if (err != hipSuccess) {
+    return err;
+  }
+
+  int cu_count;
+  err = hipDeviceGetAttribute(&cu_count, hipDeviceAttributeMultiprocessorCount, device_id);
+  if (err != hipSuccess) {
+    return err;
+  }
+
+  struct hipFuncAttributes kernel_attr;
+  err = hipFuncGetAttributes(&kernel_attr, reinterpret_cast<const void*>(func));
+  if (err != hipSuccess) {
+    return err;
+  }
+
+  // A limit of 0 means "no limit"; in every case the limit never exceeds what the kernel
+  // itself reports as its maximum threads per block.
+  const int kernel_block_cap = kernel_attr.maxThreadsPerBlock;
+  if (block_size_limit == 0 || kernel_block_cap < block_size_limit) {
+    block_size_limit = kernel_block_cap;
+  }
+
+  // Round the limit up to a whole number of warps; candidates step down one warp at a time.
+  const int limit_rounded_up = ((block_size_limit + (warp_width - 1)) / warp_width) * warp_width;
+
+  // Best candidate found so far (largest block_size * blocks-per-CU product).
+  int best_threads = 0;
+  int best_block_size = 0;
+  int best_blocks_per_cu = 0;
+
+  int candidate_aligned = limit_rounded_up;
+  while (candidate_aligned > 0) {
+    // The first candidate uses the requested limit itself rather than its rounded-up value.
+    int candidate = candidate_aligned;
+    if (block_size_limit < candidate_aligned) {
+      candidate = block_size_limit;
+    }
+
+    size_t smem_bytes = block_size_to_dynamic_smem_size(candidate);
+    int blocks_per_cu;
+    err = hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&blocks_per_cu, func, candidate,
+                                                                smem_bytes, flags);
+    if (err != hipSuccess) {
+      return err;
+    }
+
+    const int resident_threads = candidate * blocks_per_cu;
+    if (resident_threads > best_threads) {
+      best_block_size = candidate;
+      best_blocks_per_cu = blocks_per_cu;
+      best_threads = resident_threads;
+    }
+
+    // Nothing can beat the per-CU thread maximum, so stop as soon as it is reached.
+    if (best_threads == threads_per_cu_cap) {
+      break;
+    }
+
+    candidate_aligned -= warp_width;
+  }
+
+  // Grid size is the number of blocks per CU multiplied by the CU count.
+  *min_grid_size = best_blocks_per_cu * cu_count;
+  *block_size = best_block_size;
+
+  return err;
+}
+
+/**
+ * @brief Returns grid and block size that achieves maximum potential occupancy for a device
+ * function
+ *
+ * @ingroup Occupancy
+ * Returns in \p *min_grid_size and \p *block_size a suggested grid /
+ * block size pair that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps on the current device with the smallest number
+ * of blocks for a particular function).
+ *
+ * @param [out] min_grid_size minimum grid size needed to achieve the best potential occupancy
+ * @param [out] block_size    block size required for the best potential occupancy
+ * @param [in]  func          device function symbol
+ * @param [in]  block_size_to_dynamic_smem_size - a unary function/functor that takes block size,
+ * and returns the size, in bytes, of dynamic shared memory needed for a block
+ * @param [in]  block_size_limit the maximum block size \p func is designed to work with. 0 means no
+ * limit.
+ *
+ * @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidDeviceFunction,
+ * #hipErrorInvalidValue, #hipErrorUnknown
+ */
+template <typename UnaryFunction, class T>
+static hipError_t __host__ inline hipOccupancyMaxPotentialBlockSizeVariableSMem(
+    int* min_grid_size, int* block_size, T func, UnaryFunction block_size_to_dynamic_smem_size,
+    int block_size_limit = 0) {
+  // Same computation as the WithFlags variant, relying on its default value for `flags`.
+  const hipError_t result = hipOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(
+      min_grid_size, block_size, func, block_size_to_dynamic_smem_size, block_size_limit);
+  return result;
+}
+/**
+ * @brief Returns grid and block size that achieves maximum potential occupancy for a device
+ * function
+ *
+ * @ingroup Occupancy
+ *
+ * Returns in \p *gridSize and \p *blockSize a suggested grid /
+ * block size pair that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps on the current device with the smallest number
+ * of blocks for a particular function).
+ *
+ * @param [out] gridSize           suggested grid size
+ * @param [out] blockSize          suggested block size
+ * @param [in]  kernel             kernel to query; it is reinterpreted as an untyped pointer
+ * @param [in]  dynSharedMemPerBlk dynamic shared memory in bytes per block
+ * @param [in]  blockSizeLimit     block size limit, converted to int before being forwarded
+ *
+ * @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ *
+ * @see hipOccupancyMaxPotentialBlockSize
+ */
+template <typename F> inline hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize,
+                                                                          int* blockSize, F kernel,
+                                                                          size_t dynSharedMemPerBlk,
+                                                                          uint32_t blockSizeLimit) {
+  // Forward with the (const void*, int) argument types used for the untyped overload elsewhere
+  // in this header. Forwarding a hipFunction_t together with the uint32_t limit would make this
+  // very template the best overload match (F = hipFunction_t, exact match on every argument),
+  // so the call would recurse into itself instead of reaching the runtime.
+  return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize,
+                                           reinterpret_cast<const void*>(kernel),
+                                           dynSharedMemPerBlk, static_cast<int>(blockSizeLimit));
+}
+/**
+ * @brief Launches a device function
+ *
+ * @ingroup Execution
+ * @ingroup ModuleCooperativeG
+ *
+ * \tparam T                  The type of the kernel function.
+ *
+ * @param [in] f              Kernel function to launch.
+ * @param [in] gridDim        Grid dimensions specified as multiple of blockDim.
+ * @param [in] blockDim       Block dimensions specified in work-items.
+ * @param [in] kernelParams   A list of kernel arguments.
+ * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for
+ *                            this kernel. The HIP-Clang compiler provides
+ *                            support for extern shared declarations.
+ * @param [in] stream         Stream on which the kernel is launched.
+ *
+ * @return #hipSuccess, #hipErrorLaunchFailure, #hipErrorInvalidValue,
+ * #hipErrorInvalidResourceHandle
+ *
+ */
+template <class T>
+inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim, void** kernelParams,
+                                             unsigned int sharedMemBytes, hipStream_t stream) {
+  // Erase the kernel's type so the call resolves to the const void* overload.
+  const void* kernel_addr = reinterpret_cast<const void*>(f);
+  return hipLaunchCooperativeKernel(kernel_addr, gridDim, blockDim, kernelParams, sharedMemBytes,
+                                    stream);
+}
+/**
+ * @brief Launches kernel function on multiple devices, where thread blocks can
+ *        cooperate and synchronize on execution.
+ *
+ * @ingroup Execution
+ * @ingroup ModuleCooperativeG
+ *
+ * @param [in] launchParamsList List of kernel launch parameters, one per device.
+ * @param [in] numDevices       Size of launchParamsList array.
+ * @param [in] flags            Flag to handle launch behavior.
+ *
+ * @return #hipSuccess, #hipErrorLaunchFailure, #hipErrorInvalidValue,
+ * #hipErrorInvalidResourceHandle
+ *
+ */
+// Note: T does not appear in the parameter list, so it cannot be deduced from the arguments.
+template <class T>
+inline hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
+                                                        unsigned int numDevices,
+                                                        unsigned int flags = 0) {
+  const hipError_t launch_status =
+      hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags);
+  return launch_status;
+}
+/**
+ * @brief Launches kernels on multiple devices and guarantees all specified kernels are dispatched
+ * on respective streams before enqueuing any other work on the specified streams from any other
+ * threads
+ * @ingroup Execution
+ *
+ * @param [in] launchParamsList         List of launch parameters, one per device.
+ * @param [in] numDevices               Size of the launchParamsList array.
+ * @param [in] flags                    Flags to control launch behavior.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+// Note: T does not appear in the parameter list, so it cannot be deduced from the arguments.
+template <class T>
+inline hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
+                                                     unsigned int numDevices,
+                                                     unsigned int flags = 0) {
+  const hipError_t launch_status =
+      hipExtLaunchMultiKernelMultiDevice(launchParamsList, numDevices, flags);
+  return launch_status;
+}
+/**
+ * @brief Binds a memory area to a texture [Deprecated]
+ *
+ * @ingroup TextureD
+ *
+ * @param [in] offset  Offset in bytes.
+ * @param [in] tex  Texture to bind.
+ * @param [in] devPtr  Pointer of memory on the device.
+ * @param [in] size  Size of memory in bytes.
+ *
+ * @warning This API is deprecated.
+ *
+ */
+template <class T, int dim, enum hipTextureReadMode readMode> HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+static inline hipError_t hipBindTexture(size_t* offset, const struct texture<T, dim, readMode>& tex,
+                                        const void* devPtr, size_t size = UINT_MAX) {
+  // The channel format comes from the texture's own channelDesc member.
+  const auto* tex_ptr = &tex;
+  const auto* channel_desc = &tex.channelDesc;
+  return hipBindTexture(offset, tex_ptr, devPtr, channel_desc, size);
+}
+/**
+ * @brief Binds a memory area to a texture [Deprecated]
+ *
+ * @ingroup TextureD
+ *
+ * @param [in] offset  Offset in bytes.
+ * @param [in] tex  Texture to bind.
+ * @param [in] devPtr  Pointer of memory on the device.
+ * @param [in] desc  Texture channel format.
+ * @param [in] size  Size of memory in bytes.
+ *
+ * @warning This API is deprecated.
+ *
+ */
+template <class T, int dim, enum hipTextureReadMode readMode> HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+static inline hipError_t
+    hipBindTexture(size_t* offset, const struct texture<T, dim, readMode>& tex, const void* devPtr,
+                   const struct hipChannelFormatDesc& desc, size_t size = UINT_MAX) {
+  // Reference arguments are forwarded by address to the pointer-based overload.
+  const auto* tex_ptr = &tex;
+  const auto* channel_desc = &desc;
+  return hipBindTexture(offset, tex_ptr, devPtr, channel_desc, size);
+}
+/**
+ * @brief Binds a 2D memory area to a texture [Deprecated]
+ *
+ * @ingroup TextureD
+ *
+ * @param [in] offset  Offset in bytes.
+ * @param [in] tex  Texture to bind.
+ * @param [in] devPtr  Pointer of 2D memory area on the device.
+ * @param [in] width  Width in texel units.
+ * @param [in] height  Height in texel units.
+ * @param [in] pitch  Pitch in bytes.
+ *
+ * @warning This API is deprecated.
+ *
+ */
+template <class T, int dim, enum hipTextureReadMode readMode> HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+static inline hipError_t
+    hipBindTexture2D(size_t* offset, const struct texture<T, dim, readMode>& tex,
+                     const void* devPtr, size_t width, size_t height, size_t pitch) {
+  // The channel format comes from the texture's own channelDesc member.
+  const auto* tex_ptr = &tex;
+  const auto* channel_desc = &tex.channelDesc;
+  return hipBindTexture2D(offset, tex_ptr, devPtr, channel_desc, width, height, pitch);
+}
+/**
+ * @brief Binds a 2D memory area to a texture [Deprecated]
+ *
+ * @ingroup TextureD
+ *
+ * @param [in] offset  Offset in bytes.
+ * @param [in] tex  Texture to bind.
+ * @param [in] devPtr  Pointer of 2D memory area on the device.
+ * @param [in] desc  Texture channel format.
+ * @param [in] width  Width in texel units.
+ * @param [in] height  Height in texel units.
+ * @param [in] pitch  Pitch in bytes.
+ *
+ * @warning This API is deprecated.
+ *
+ */
+template <class T, int dim, enum hipTextureReadMode readMode> HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+static inline hipError_t
+    hipBindTexture2D(size_t* offset, const struct texture<T, dim, readMode>& tex,
+                     const void* devPtr, const struct hipChannelFormatDesc& desc, size_t width,
+                     size_t height, size_t pitch) {
+  // Reference arguments are forwarded by address to the pointer-based overload.
+  const auto* tex_ptr = &tex;
+  const auto* channel_desc = &desc;
+  return hipBindTexture2D(offset, tex_ptr, devPtr, channel_desc, width, height, pitch);
+}
+/**
+ * @brief Binds an array to a texture [Deprecated]
+ *
+ * @ingroup TextureD
+ *
+ * @param [in] tex  Texture to bind.
+ * @param [in] array  Array of memory on the device.
+ *
+ * @warning This API is deprecated.
+ *
+ */
+template <class T, int dim, enum hipTextureReadMode readMode> HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+static inline hipError_t
+    hipBindTextureToArray(const struct texture<T, dim, readMode>& tex, hipArray_const_t array) {
+  // Look up the array's channel format first; a failed lookup is reported without binding.
+  struct hipChannelFormatDesc array_desc;
+  const hipError_t query_status = hipGetChannelDesc(&array_desc, array);
+  if (query_status != hipSuccess) {
+    return query_status;
+  }
+  return hipBindTextureToArray(&tex, array, &array_desc);
+}
+/**
+ * @brief Binds an array to a texture [Deprecated]
+ *
+ * @ingroup TextureD
+ *
+ * @param [in] tex  Texture to bind.
+ * @param [in] array  Array of memory on the device.
+ * @param [in] desc  Texture channel format.
+ *
+ * @warning This API is deprecated.
+ *
+ */
+template <class T, int dim, enum hipTextureReadMode readMode> HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+static inline hipError_t
+    hipBindTextureToArray(const struct texture<T, dim, readMode>& tex, hipArray_const_t array,
+                          const struct hipChannelFormatDesc& desc) {
+  // Reference arguments are forwarded by address to the pointer-based overload.
+  const auto* tex_ptr = &tex;
+  const auto* channel_desc = &desc;
+  return hipBindTextureToArray(tex_ptr, array, channel_desc);
+}
+/**
+ * @brief Binds a mipmapped array to a texture [Deprecated]
+ *
+ * @ingroup TextureD
+ *
+ * @param [in] tex  Texture to bind.
+ * @param [in] mipmappedArray  Mipmapped Array of memory on the device.
+ *
+ * @warning This API is deprecated.
+ *
+ */
+template <class T, int dim, enum hipTextureReadMode readMode> HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+static inline hipError_t hipBindTextureToMipmappedArray(const struct texture<T, dim, readMode>& tex,
+                                                        hipMipmappedArray_const_t mipmappedArray) {
+  // The channel format is read from level 0 of the mipmapped array. Each step runs only if the
+  // previous one succeeded; the first failure is returned as-is.
+  hipArray_t base_level;
+  hipError_t status = hipGetMipmappedArrayLevel(&base_level, mipmappedArray, 0);
+  if (status == hipSuccess) {
+    struct hipChannelFormatDesc level_desc;
+    status = hipGetChannelDesc(&level_desc, base_level);
+    if (status == hipSuccess) {
+      status = hipBindTextureToMipmappedArray(&tex, mipmappedArray, &level_desc);
+    }
+  }
+  return status;
+}
+/**
+ * @brief Binds a mipmapped array to a texture [Deprecated]
+ *
+ * @ingroup TextureD
+ *
+ * @param [in] tex  Texture to bind.
+ * @param [in] mipmappedArray  Mipmapped Array of memory on the device.
+ * @param [in] desc  Texture channel format.
+ *
+ * @warning This API is deprecated.
+ *
+ */
+template <class T, int dim, enum hipTextureReadMode readMode> HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+static inline hipError_t hipBindTextureToMipmappedArray(const struct texture<T, dim, readMode>& tex,
+                                                        hipMipmappedArray_const_t mipmappedArray,
+                                                        const struct hipChannelFormatDesc& desc) {
+  // Reference arguments are forwarded by address to the pointer-based overload.
+  const auto* tex_ptr = &tex;
+  const auto* channel_desc = &desc;
+  return hipBindTextureToMipmappedArray(tex_ptr, mipmappedArray, channel_desc);
+}
+/**
+ * @brief Unbinds a texture [Deprecated]
+ *
+ * @ingroup TextureD
+ *
+ * @param [in] tex  Texture to unbind.
+ *
+ * @warning This API is deprecated.
+ *
+ */
+template <class T, int dim, enum hipTextureReadMode readMode> HIP_DEPRECATED(HIP_DEPRECATED_MSG)
+static inline hipError_t hipUnbindTexture(const struct texture<T, dim, readMode>& tex) {
+  // The texture reference is forwarded by address to the pointer-based overload.
+  const auto* tex_ptr = &tex;
+  return hipUnbindTexture(tex_ptr);
+}
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ * @ingroup StreamO
+ * @{
+ *
+ *  This section describes wrappers for stream Ordered allocation from memory pool functions of
+ *  HIP runtime API.
+ *
+ *  @note  APIs in this section are implemented on Linux, under development on Windows.
+ *
+ */
+
+/**
+ * @brief C++ wrappers for allocations from a memory pool
+ *
+ * This is an alternate C++ call for @p hipMallocFromPoolAsync made available through
+ * function overloading.
+ *
+ * @see hipMallocFromPoolAsync
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+static inline hipError_t hipMallocAsync(void** dev_ptr, size_t size, hipMemPool_t mem_pool,
+                                        hipStream_t stream) {
+  // With an explicit pool argument this overload is a plain alias for hipMallocFromPoolAsync.
+  const hipError_t alloc_status = hipMallocFromPoolAsync(dev_ptr, size, mem_pool, stream);
+  return alloc_status;
+}
+/**
+ * @brief C++ wrappers for allocations from a memory pool on the stream
+ *
+ * This is an alternate C++ call for @p hipMallocFromPoolAsync made available through
+ * function overloading.
+ *
+ * @see hipMallocFromPoolAsync
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+template <class T> static inline hipError_t hipMallocAsync(T** dev_ptr, size_t size,
+                                                           hipMemPool_t mem_pool,
+                                                           hipStream_t stream) {
+  return hipMallocFromPoolAsync(reinterpret_cast<void**>(dev_ptr), size, mem_pool, stream);
+}
+/**
+ * @brief Typed C++ overload of hipMallocAsync without a memory pool argument
+ *
+ * This is an alternate C++ call for the three-argument @p hipMallocAsync, made available through
+ * function overloading. @p dev_ptr is cast to void** before the arguments are forwarded.
+ *
+ * @see hipMallocAsync
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+template <class T>
+static inline hipError_t hipMallocAsync(T** dev_ptr, size_t size, hipStream_t stream) {
+  return hipMallocAsync(reinterpret_cast<void**>(dev_ptr), size, stream);
+}
+/**
+ * @brief Typed C++ overload of hipMallocFromPoolAsync
+ *
+ * This is an alternate C++ call for the void** form of @p hipMallocFromPoolAsync, made available
+ * through function overloading. @p dev_ptr is cast to void** before the arguments are forwarded.
+ *
+ * @see hipMallocFromPoolAsync
+ *
+ * @note  This API is implemented on Linux and is under development on Microsoft Windows.
+ */
+template <class T> static inline hipError_t hipMallocFromPoolAsync(T** dev_ptr, size_t size,
+                                                                   hipMemPool_t mem_pool,
+                                                                   hipStream_t stream) {
+  return hipMallocFromPoolAsync(reinterpret_cast<void**>(dev_ptr), size, mem_pool, stream);
+}
+/**
+ * @brief Launches a HIP kernel using the specified configuration.
+ * @ingroup Execution
+ *
+ * The arguments are first converted to the kernel's parameter types (by-value lambda parameters);
+ * an array holding the addresses of those copies is then handed to @p hipLaunchKernelExC.
+ *
+ * @param [in] config                 Pointer to the kernel launch configuration structure.
+ * @param [in] kernel                 Pointer to the device kernel function to be launched.
+ * @param [in] args                   Variadic list of arguments to be passed to the kernel.
+ *
+ * @returns The status returned by @p hipLaunchKernelExC for this launch.
+ */
+template <typename... KernelArgs, typename... Params>
+static inline __host__ hipError_t hipLaunchKernelEx(const hipLaunchConfig_t* config,
+                                                    void (*kernel)(KernelArgs...),
+                                                    Params&&... args) {
+  return [&](KernelArgs... convertedArgs) {  // parameters are copies typed as the kernel expects
+    void* pArgs[] = {&convertedArgs...};  // NOTE(review): empty pack => zero-length array; confirm
+    return ::hipLaunchKernelExC(config, reinterpret_cast<void*>(kernel), pArgs);
+  }(std::forward<Params>(args)...);  // lambda is invoked immediately with the forwarded args
+}
+/**
+ * @}
+ */
+
+
+#endif  // __cplusplus
+
+#ifdef __GNUC__
+#pragma GCC visibility pop
+#endif
+
+
+#elif !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
+#include "hip/nvidia_detail/nvidia_hip_runtime_api.h"
+#else
+#error ("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
+#endif
+
+
+/**
+ * @brief C++ wrapper for hipMalloc
+ * @ingroup Memory
+ * Casts the typed @p devPtr to void** so that callers do not need an explicit cast (i.e. void**)
+ *
+ * __HIP_DISABLE_CPP_FUNCTIONS__ macro can be defined to suppress these
+ * wrappers. It is useful for applications which need to obtain decltypes of
+ * HIP runtime APIs.
+ *
+ * @see hipMalloc
+ */
+#if defined(__cplusplus) && !defined(__HIP_DISABLE_CPP_FUNCTIONS__)
+template <class T> static inline hipError_t hipMalloc(T** devPtr, size_t size) {
+  return hipMalloc((void**)devPtr, size);
+}
+/**
+ * @brief C++ wrapper for hipMallocPitch
+ * @ingroup Memory
+ * Casts the typed @p devPtr to void** so that callers do not need an explicit cast (i.e. void**)
+ *
+ * __HIP_DISABLE_CPP_FUNCTIONS__ macro can be defined to suppress these
+ * wrappers. It is useful for applications which need to obtain decltypes of
+ * HIP runtime APIs.
+ *
+ * @see hipMallocPitch
+ */
+template <class T>
+static inline hipError_t hipMallocPitch(T** devPtr, size_t* pitch, size_t width, size_t height) {
+  return hipMallocPitch((void**)devPtr, pitch, width, height);
+}
+/**
+ * @brief C++ wrapper for hipHostMalloc
+ * @ingroup Memory
+ * Provide an overload that casts the typed pointer to void**, and also provide a default
+ * (hipHostMallocDefault) for the flags.
+ *
+ * __HIP_DISABLE_CPP_FUNCTIONS__ macro can be defined to suppress these
+ * wrappers. It is useful for applications which need to obtain decltypes of
+ * HIP runtime APIs.
+ *
+ * @see hipHostMalloc
+ */
+template <class T>
+static inline hipError_t hipHostMalloc(T** ptr, size_t size,
+                                       unsigned int flags = hipHostMallocDefault) {
+  return hipHostMalloc((void**)ptr, size, flags);
+}
+/**
+ * @brief C++ wrapper for hipHostAlloc
+ * @ingroup Memory
+ * Provide an overload that casts the typed pointer to void**, and also provide a default
+ * (hipHostAllocDefault) for the flags.
+ *
+ * __HIP_DISABLE_CPP_FUNCTIONS__ macro can be defined to suppress these
+ * wrappers. It is useful for applications which need to obtain decltypes of
+ * HIP runtime APIs.
+ *
+ * @see hipHostAlloc
+ */
+template <class T> static inline hipError_t hipHostAlloc(T** ptr, size_t size,
+                                                         unsigned int flags = hipHostAllocDefault) {
+  return hipHostAlloc((void**)ptr, size, flags);
+}
+/**
+ * @brief C++ wrapper for hipMallocManaged
+ *
+ * @ingroup MemoryM
+ * Provide an overload that casts the typed pointer to void**, and also provide a default
+ * (hipMemAttachGlobal) for the flags.
+ *
+ * __HIP_DISABLE_CPP_FUNCTIONS__ macro can be defined to suppress these
+ * wrappers. It is useful for applications which need to obtain decltypes of
+ * HIP runtime APIs.
+ *
+ * @see hipMallocManaged
+ *
+ */
+template <class T>
+static inline hipError_t hipMallocManaged(T** devPtr, size_t size,
+                                          unsigned int flags = hipMemAttachGlobal) {
+  return hipMallocManaged((void**)devPtr, size, flags);
+}
+
+
+#endif
+#endif
+// doxygen end HIP API
+/**
+ * @}
+ */
+#include <hip/amd_detail/amd_hip_runtime_pt_api.h>
+
+#if USE_PROF_API
+#include <hip/amd_detail/hip_prof_str.h>
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/hip_texture_types.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/hip_texture_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..9cefbe674b2108da2807dd979654c526ee92316e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/hip_texture_types.h
@@ -0,0 +1,29 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+#ifndef HIP_INCLUDE_HIP_HIP_TEXTURE_TYPES_H
+#define HIP_INCLUDE_HIP_HIP_TEXTURE_TYPES_H
+
+#include <hip/texture_types.h>
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/hip_vector_types.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/hip_vector_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..98a0bcdea179bef4bbd7a365c4f4977f536bc839
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/hip_vector_types.h
@@ -0,0 +1,41 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+//! hip_vector_types.h : Defines the HIP vector types.
+
+#ifndef HIP_INCLUDE_HIP_HIP_VECTOR_TYPES_H
+#define HIP_INCLUDE_HIP_HIP_VECTOR_TYPES_H
+
+#include <hip/hip_common.h>
+
+
+#if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
+#if __cplusplus
+#include <hip/amd_detail/amd_hip_vector_types.h>
+#endif
+#elif !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
+#include <vector_types.h>
+#else
+#error ("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
+#endif
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/hip_version.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/hip_version.h
new file mode 100644
index 0000000000000000000000000000000000000000..bd3cbbc3a36fa3f30a8eed7d1fc249889b7e439f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/hip_version.h
@@ -0,0 +1,17 @@
+// Auto-generated by cmake
+
+#ifndef HIP_VERSION_H
+#define HIP_VERSION_H
+
+#define HIP_VERSION_MAJOR 7  // scaled by 10000000 in HIP_VERSION
+#define HIP_VERSION_MINOR 1  // scaled by 100000 in HIP_VERSION
+#define HIP_VERSION_PATCH 25424  // added unscaled; HIP_VERSION evaluates to 70125424 here
+#define HIP_VERSION_GITHASH "4179531dcd"
+#define HIP_VERSION_BUILD_ID 0
+#define HIP_VERSION_BUILD_NAME ""
+#define HIP_VERSION    (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH)
+
+#define __HIP_HAS_GET_PCH 1
+
+#endif
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/library_types.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/library_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..c3c8d5d835371ccc41ca26630f337ce2191be78b
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/library_types.h
@@ -0,0 +1,84 @@
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_LIBRARY_TYPES_H
+#define HIP_INCLUDE_HIP_LIBRARY_TYPES_H
+
+#if !defined(__HIPCC_RTC__)
+#include <hip/hip_common.h>
+#endif
+
+#if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
+
+typedef enum hipDataType {  // explicit values: 0..33 contiguous, HIP-specific entries from 1000
+  HIP_R_32F = 0,  // NOTE(review): names look like <R|C>_<bits><F|I|U|BF> (real/complex) - confirm
+  HIP_R_64F = 1,
+  HIP_R_16F = 2,
+  HIP_R_8I = 3,
+  HIP_C_32F = 4,
+  HIP_C_64F = 5,
+  HIP_C_16F = 6,
+  HIP_C_8I = 7,
+  HIP_R_8U = 8,
+  HIP_C_8U = 9,
+  HIP_R_32I = 10,
+  HIP_C_32I = 11,
+  HIP_R_32U = 12,
+  HIP_C_32U = 13,
+  HIP_R_16BF = 14,
+  HIP_C_16BF = 15,
+  HIP_R_4I = 16,
+  HIP_C_4I = 17,
+  HIP_R_4U = 18,
+  HIP_C_4U = 19,
+  HIP_R_16I = 20,
+  HIP_C_16I = 21,
+  HIP_R_16U = 22,
+  HIP_C_16U = 23,
+  HIP_R_64I = 24,
+  HIP_C_64I = 25,
+  HIP_R_64U = 26,
+  HIP_C_64U = 27,
+  HIP_R_8F_E4M3 = 28,  // entries 28..33 have no HIP_C_ counterpart in this enum
+  HIP_R_8F_E5M2 = 29,
+  HIP_R_8F_UE8M0 = 30,
+  HIP_R_6F_E2M3 = 31,
+  HIP_R_6F_E3M2 = 32,
+  HIP_R_4F_E2M1 = 33,
+  // HIP specific Data Types (numbered from 1000, apart from the contiguous 0..33 range above)
+  HIP_R_8F_E4M3_FNUZ = 1000,
+  HIP_R_8F_E5M2_FNUZ = 1001,
+} hipDataType;
+
+typedef enum hipLibraryPropertyType {  // selectors for a library version component
+  HIP_LIBRARY_MAJOR_VERSION,  // implicit value 0
+  HIP_LIBRARY_MINOR_VERSION,  // implicit value 1
+  HIP_LIBRARY_PATCH_LEVEL  // implicit value 2
+} hipLibraryPropertyType;
+
+#elif !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
+#include "library_types.h"
+#else
+#error ("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
+#endif
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/linker_types.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/linker_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..1131910322196a2903d5bad665baef9378607e9c
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/linker_types.h
@@ -0,0 +1,138 @@
+
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_LINKER_TYPES_H
+#define HIP_INCLUDE_HIP_LINKER_TYPES_H
+
+#if defined(__clang__)  // NOTE(review): this header never issues "#pragma clang diagnostic pop",
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-identifier"
+#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
+#endif  // so the push and both suppressions above stay in effect for code that includes it.
+
+
+#if !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
+#elif defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
+
+/**
+ *  @defgroup LinkerTypes Jit Linker Data Types
+ *  @{
+ *  This section describes the Jit Linker data types.
+ *
+ */
+
+/**
+ * hipJitOption - JIT option identifiers; values run 0..29 implicitly, Ext entries start at 10000
+ */
+typedef enum hipJitOption {
+  hipJitOptionMaxRegisters = 0,         ///< CUDA Only Maximum registers may be used in a thread,
+                                        ///< passed to compiler
+  hipJitOptionThreadsPerBlock,          ///< CUDA Only Number of threads per block
+  hipJitOptionWallTime,                 ///< CUDA Only Value for total wall clock time
+  hipJitOptionInfoLogBuffer,            ///< CUDA Only Pointer to the buffer with logged information
+  hipJitOptionInfoLogBufferSizeBytes,   ///< CUDA Only Size of the buffer in bytes for logged info
+  hipJitOptionErrorLogBuffer,           ///< CUDA Only Pointer to the buffer with logged error(s)
+  hipJitOptionErrorLogBufferSizeBytes,  ///< CUDA Only Size of the buffer in bytes for logged
+                                        ///< error(s)
+  hipJitOptionOptimizationLevel,  ///< Value of optimization level for generated code, acceptable
+                                  ///< options -O0, -O1, -O2, -O3
+  hipJitOptionTargetFromContext,  ///< CUDA Only The target context, which is the default
+  hipJitOptionTarget,             ///< CUDA Only JIT target
+  hipJitOptionFallbackStrategy,   ///< CUDA Only Fallback strategy
+  hipJitOptionGenerateDebugInfo,  ///< CUDA Only Generate debug information
+  hipJitOptionLogVerbose,         ///< CUDA Only Generate verbose log
+  hipJitOptionGenerateLineInfo,   ///< CUDA Only Generate line number information
+  hipJitOptionCacheMode,          ///< CUDA Only Set cache mode
+  hipJitOptionSm3xOpt,            ///< @deprecated CUDA Only New SM3X option.
+  hipJitOptionFastCompile,        ///< CUDA Only Set fast compile
+  hipJitOptionGlobalSymbolNames,  ///< CUDA Only Array of device symbol names to be relocated to the
+                                  ///< host
+  hipJitOptionGlobalSymbolAddresses,  ///< CUDA Only Array of host addresses to be relocated to the
+                                      ///< device
+  hipJitOptionGlobalSymbolCount,      ///< CUDA Only Number of global symbols.
+  hipJitOptionLto,       ///< @deprecated CUDA Only Enable link-time optimization for device code
+  hipJitOptionFtz,       ///< @deprecated CUDA Only Set single-precision denormals.
+  hipJitOptionPrecDiv,   ///< @deprecated CUDA Only Set single-precision floating-point division
+                         ///< and reciprocals
+  hipJitOptionPrecSqrt,  ///< @deprecated CUDA Only Set single-precision floating-point square root
+  hipJitOptionFma,       ///< @deprecated CUDA Only Enable floating-point multiplies and
+                         ///< adds/subtracts operations
+  hipJitOptionPositionIndependentCode,  ///< CUDA Only Generates Position Independent code
+  hipJitOptionMinCTAPerSM,  ///< CUDA Only Hints to JIT compiler the minimum number of CTAs from
+                            ///< kernel's grid to be mapped to SM
+  hipJitOptionMaxThreadsPerBlock,       ///< CUDA Only Maximum number of threads in a thread block
+  hipJitOptionOverrideDirectiveValues,  ///< CUDA Only Override Directive values
+  hipJitOptionNumOptions,               ///< Number of options (implicit value 29)
+  hipJitOptionIRtoISAOptExt = 10000,    ///< HIP Only Linker options to be passed on to compiler
+  hipJitOptionIRtoISAOptCountExt,  ///< HIP Only Count of linker options to be passed on to compiler
+} hipJitOption;
+/**
+ * hipJitInputType - JIT input kinds; legacy entries are 0..5, HIP-only entries are 100..103
+ */
+typedef enum hipJitInputType {
+  hipJitInputCubin = 0,                 ///< CUDA Only Input cubin
+  hipJitInputPtx,                       ///< CUDA Only Input PTX
+  hipJitInputFatBinary,                 ///< CUDA Only Input FAT Binary
+  hipJitInputObject,                    ///< CUDA Only Host Object with embedded device code
+  hipJitInputLibrary,                   ///< CUDA Only Archive of Host Objects with embedded
+                                        ///< device code
+  hipJitInputNvvm,                      ///< @deprecated CUDA Only High Level intermediate
+                                        ///< code for LTO
+  hipJitNumLegacyInputTypes,            ///< Count of Legacy Input Types (implicit value 6)
+  hipJitInputLLVMBitcode = 100,         ///< HIP Only LLVM Bitcode or IR assembly
+  hipJitInputLLVMBundledBitcode = 101,  ///< HIP Only LLVM Clang Bundled Code
+  hipJitInputLLVMArchivesOfBundledBitcode = 102,  ///< HIP Only LLVM Archive of Bundled Bitcode
+  hipJitInputSpirv = 103,                         ///< HIP Only SPIRV Code Object
+  hipJitNumInputTypes = 10                        ///< Count of Input Types (6 legacy + 4 HIP)
+} hipJitInputType;
+/**
+ * hipJitCacheMode - cache mode selectors (implicit values 0..2)
+ */
+typedef enum hipJitCacheMode {
+  hipJitCacheOptionNone = 0,
+  hipJitCacheOptionCG,  // implicit value 1; NOTE(review): meaning of "CG" not stated here
+  hipJitCacheOptionCA  // implicit value 2; NOTE(review): meaning of "CA" not stated here
+} hipJitCacheMode;
+/**
+ * hipJitFallback - fallback preference selectors (values 0 and 1)
+ */
+typedef enum hipJitFallback {
+  hipJitPreferPTX = 0,
+  hipJitPreferBinary,  // implicit value 1
+} hipJitFallback;
+
+typedef enum hipLibraryOption_e {  // enum tag carries an _e suffix; the typedef name does not
+  hipLibraryHostUniversalFunctionAndDataTable = 0,
+  hipLibraryBinaryIsPreserved = 1
+} hipLibraryOption;
+
+// doxygen end LinkerTypes
+/**
+ * @}
+ */
+
+#else
+#error ("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
+#endif
+
+#endif  // HIP_INCLUDE_HIP_LINKER_TYPES_H
\ No newline at end of file
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/surface_types.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/surface_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5cc457b8593654b65d3f0a5baa08c409cd00afe
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/surface_types.h
@@ -0,0 +1,65 @@
+/*
+Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ *  @file  surface_types.h
+ *  @brief Defines surface types for HIP runtime.
+ */
+
+#ifndef HIP_INCLUDE_HIP_SURFACE_TYPES_H
+#define HIP_INCLUDE_HIP_SURFACE_TYPES_H
+
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-identifier"
+#endif
+
+#if !defined(__HIPCC_RTC__)
+#include <hip/driver_types.h>
+#endif
+
+/**
+ * Opaque handle for a hip surface object: a pointer to the forward-declared struct __hip_surface
+ */
+struct __hip_surface;
+typedef struct __hip_surface* hipSurfaceObject_t;
+
+/**
+ * hip surface reference: holds a single hipSurfaceObject_t handle
+ */
+struct surfaceReference {
+  hipSurfaceObject_t surfaceObject;
+};
+
+/**
+ * hip surface boundary modes (explicit values 0..2; no typedef, so C code needs the enum keyword)
+ */
+enum hipSurfaceBoundaryMode {
+  hipBoundaryModeZero = 0,
+  hipBoundaryModeTrap = 1,
+  hipBoundaryModeClamp = 2
+};
+
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif
+
+#endif /* !HIP_INCLUDE_HIP_SURFACE_TYPES_H */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/texture_types.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/texture_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..65290cd52c72974292765aa95e7065adb9ee090f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hip/texture_types.h
@@ -0,0 +1,193 @@
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_TEXTURE_TYPES_H
+#define HIP_INCLUDE_HIP_TEXTURE_TYPES_H
+
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-identifier"
+#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
+#pragma clang diagnostic ignored "-Wc++98-compat"
+#endif
+
+#if !defined(__HIPCC_RTC__)
+#include <hip/hip_common.h>
+#endif
+
+#if !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
+#include "texture_types.h"
+#elif defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
+/*******************************************************************************
+ *                                                                              *
+ *                                                                              *
+ *                                                                              *
+ *******************************************************************************/
+#if !defined(__HIPCC_RTC__)
+#include <hip/channel_descriptor.h>
+#include <hip/driver_types.h>
+#endif  // !defined(__HIPCC_RTC__)
+
+#define hipTextureType1D 0x01
+#define hipTextureType2D 0x02
+#define hipTextureType3D 0x03
+#define hipTextureTypeCubemap 0x0C
+#define hipTextureType1DLayered 0xF1  // equals 0xF0 | hipTextureType1D
+#define hipTextureType2DLayered 0xF2  // equals 0xF0 | hipTextureType2D
+#define hipTextureTypeCubemapLayered 0xFC  // equals 0xF0 | hipTextureTypeCubemap
+
+/**
+ * Should be same as HSA_IMAGE_OBJECT_SIZE_DWORD/HSA_SAMPLER_OBJECT_SIZE_DWORD
+ */
+#define HIP_IMAGE_OBJECT_SIZE_DWORD 12
+#define HIP_SAMPLER_OBJECT_SIZE_DWORD 8
+#define HIP_SAMPLER_OBJECT_OFFSET_DWORD HIP_IMAGE_OBJECT_SIZE_DWORD  // offset == image size (12)
+#define HIP_TEXTURE_OBJECT_SIZE_DWORD (HIP_IMAGE_OBJECT_SIZE_DWORD + HIP_SAMPLER_OBJECT_SIZE_DWORD)  // 12 + 8 = 20
+
+/**
+ * Opaque handle for a hip texture object: a pointer to the forward-declared struct __hip_texture
+ */
+struct __hip_texture;
+typedef struct __hip_texture* hipTextureObject_t;
+
+/**
+ * hip texture address modes (explicit values 0..3); texture<> below defaults to Clamp
+ */
+enum hipTextureAddressMode {
+  hipAddressModeWrap = 0,
+  hipAddressModeClamp = 1,
+  hipAddressModeMirror = 2,
+  hipAddressModeBorder = 3
+};
+
+/**
+ * hip texture filter modes (Point = 0, Linear = 1); texture<> below defaults to Point
+ */
+enum hipTextureFilterMode { hipFilterModePoint = 0, hipFilterModeLinear = 1 };
+
+/**
+ * hip texture read modes (ElementType = 0, NormalizedFloat = 1); texture<> defaults to ElementType
+ */
+enum hipTextureReadMode { hipReadModeElementType = 0, hipReadModeNormalizedFloat = 1 };
+
+/**
+ * hip texture reference: plain C struct that texture<> (C++ only, below) derives from
+ */
+typedef struct textureReference {
+  int normalized;
+  enum hipTextureReadMode readMode;  // used only for driver API's
+  enum hipTextureFilterMode filterMode;
+  enum hipTextureAddressMode addressMode[3];  // Texture address mode for up to 3 dimensions
+  struct hipChannelFormatDesc channelDesc;
+  int sRGB;                    // Perform sRGB->linear conversion during texture read
+  unsigned int maxAnisotropy;  // Limit to the anisotropy ratio
+  enum hipTextureFilterMode mipmapFilterMode;  // not assigned by the texture<> constructors
+  float mipmapLevelBias;
+  float minMipmapLevelClamp;
+  float maxMipmapLevelClamp;
+
+  hipTextureObject_t textureObject;  // opaque handle; texture<> constructors set it to nullptr
+  int numChannels;  // not assigned by the texture<> constructors
+  enum hipArray_Format format;  // not assigned by the texture<> constructors
+} textureReference;
+
+/**
+ * hip texture descriptor: unlike textureReference it has borderColor and no object handle
+ */
+typedef struct hipTextureDesc {
+  enum hipTextureAddressMode addressMode[3];  // Texture address mode for up to 3 dimensions
+  enum hipTextureFilterMode filterMode;
+  enum hipTextureReadMode readMode;
+  int sRGB;  // Perform sRGB->linear conversion during texture read
+  float borderColor[4];  // NOTE(review): presumably one value per channel - confirm
+  int normalizedCoords;
+  unsigned int maxAnisotropy;
+  enum hipTextureFilterMode mipmapFilterMode;
+  float mipmapLevelBias;
+  float minMipmapLevelClamp;
+  float maxMipmapLevelClamp;
+} hipTextureDesc;
+
+#if __cplusplus
+
+/*******************************************************************************
+ *                                                                              *
+ *                                                                              *
+ *                                                                              *
+ *******************************************************************************/
+#if __HIP__  // the attribute is applied only when __HIP__ is set; otherwise the macro is empty
+#define __HIP_TEXTURE_ATTRIB __attribute__((device_builtin_texture_type))
+#else
+#define __HIP_TEXTURE_ATTRIB
+#endif
+
+typedef textureReference* hipTexRef;  // pointer alias; declared only inside the __cplusplus block
+
+template <class T, int texType = hipTextureType1D,  // texType is not referenced in the body
+          enum hipTextureReadMode mode = hipReadModeElementType>
+struct __HIP_TEXTURE_ATTRIB texture : public textureReference {
+  texture(int norm = 0, enum hipTextureFilterMode fMode = hipFilterModePoint,  // default ctor
+          enum hipTextureAddressMode aMode = hipAddressModeClamp) {
+    normalized = norm;
+    readMode = mode;  // taken from the template argument, not from a constructor parameter
+    filterMode = fMode;
+    addressMode[0] = aMode;  // the same address mode is applied to all three dimensions
+    addressMode[1] = aMode;
+    addressMode[2] = aMode;
+    channelDesc = hipCreateChannelDesc<T>();  // channel descriptor derived from T
+    sRGB = 0;
+    textureObject = nullptr;
+    maxAnisotropy = 0;
+    mipmapLevelBias = 0;
+    minMipmapLevelClamp = 0;
+    maxMipmapLevelClamp = 0;  // NOTE(review): mipmapFilterMode, numChannels, format left unset
+  }
+
+  texture(int norm, enum hipTextureFilterMode fMode, enum hipTextureAddressMode aMode,
+          struct hipChannelFormatDesc desc) {  // same as above, but with a caller-supplied desc
+    normalized = norm;
+    readMode = mode;
+    filterMode = fMode;
+    addressMode[0] = aMode;
+    addressMode[1] = aMode;
+    addressMode[2] = aMode;
+    channelDesc = desc;  // only difference from the first constructor
+    sRGB = 0;
+    textureObject = nullptr;
+    maxAnisotropy = 0;
+    mipmapLevelBias = 0;
+    minMipmapLevelClamp = 0;
+    maxMipmapLevelClamp = 0;  // NOTE(review): mipmapFilterMode, numChannels, format left unset
+  }
+};
+
+#endif /* __cplusplus */
+
+#else
+#error ("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
+#endif
+
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hipblas-common/hipblas-common.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hipblas-common/hipblas-common.h
new file mode 100644
index 0000000000000000000000000000000000000000..9ce438c201d0d115ab9f7912bd844cf05dcbf930
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hipblas-common/hipblas-common.h
@@ -0,0 +1,93 @@
+/* ************************************************************************
+ * Copyright (C) 2016-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * ************************************************************************ */
+
+//! HIP = Heterogeneous-compute Interface for Portability
+//!
+//! Define an extremely thin runtime layer that allows source code to be compiled unmodified
+//! through either AMD HCC or NVCC.   Key features tend to be in the spirit
+//! and terminology of CUDA, but with a portable path to other accelerators as well.
+//!
+//!  This is the master include file for hipblas-common, providing shared functionality
+//!  between hipBLAS and hipBLASLt.
+
+#ifndef HIPBLAS_COMMON_H
+#define HIPBLAS_COMMON_H
+
+/*! \brief hipblas status codes definition */
+typedef enum
+{
+    HIPBLAS_STATUS_SUCCESS           = 0, /**< Function succeeds */
+    HIPBLAS_STATUS_NOT_INITIALIZED   = 1, /**< HIPBLAS library not initialized */
+    HIPBLAS_STATUS_ALLOC_FAILED      = 2, /**< resource allocation failed */
+    HIPBLAS_STATUS_INVALID_VALUE     = 3, /**< unsupported numerical value was passed to function */
+    HIPBLAS_STATUS_MAPPING_ERROR     = 4, /**< access to GPU memory space failed */
+    HIPBLAS_STATUS_EXECUTION_FAILED  = 5, /**< GPU program failed to execute */
+    HIPBLAS_STATUS_INTERNAL_ERROR    = 6, /**< an internal HIPBLAS operation failed */
+    HIPBLAS_STATUS_NOT_SUPPORTED     = 7, /**< function not implemented */
+    HIPBLAS_STATUS_ARCH_MISMATCH     = 8, /**< architecture mismatch */
+    HIPBLAS_STATUS_HANDLE_IS_NULLPTR = 9, /**< hipBLAS handle is null pointer */
+    HIPBLAS_STATUS_INVALID_ENUM      = 10, /**<  unsupported enum value was passed to function */
+    HIPBLAS_STATUS_UNKNOWN           = 11, /**<  back-end returned an unsupported status code */
+} hipblasStatus_t;
+
+#ifndef HIPBLAS_OPERATION_DECLARED
+#define HIPBLAS_OPERATION_DECLARED
+/*! \brief Used to specify whether the matrix is to be transposed or not. */
+typedef enum
+{
+    HIPBLAS_OP_N = 111, /**<  Operate with the matrix. */
+    HIPBLAS_OP_T = 112, /**<  Operate with the transpose of the matrix. */
+    HIPBLAS_OP_C = 113 /**< Operate with the conjugate transpose of the matrix. */
+} hipblasOperation_t;
+
+#elif __cplusplus >= 201103L
+static_assert(HIPBLAS_OP_N == 111, "Inconsistent declaration of HIPBLAS_OP_N");
+static_assert(HIPBLAS_OP_T == 112, "Inconsistent declaration of HIPBLAS_OP_T");
+static_assert(HIPBLAS_OP_C == 113, "Inconsistent declaration of HIPBLAS_OP_C");
+#endif // HIPBLAS_OPERATION_DECLARED
+
+/*! \brief The compute type to be used. Currently only used with GemmEx with the HIPBLAS_V2 interface.
+ *         Note that support for compute types is largely dependent on backend. */
+typedef enum
+{
+    // Note that these types are taken from cuBLAS. With the rocBLAS backend, currently hipBLAS will
+    // convert to rocBLAS types to get equivalent functionality where supported.
+    HIPBLAS_COMPUTE_16F           = 0, /**< compute will be at least 16-bit precision */
+    HIPBLAS_COMPUTE_16F_PEDANTIC  = 1, /**< compute will be exactly 16-bit precision */
+    HIPBLAS_COMPUTE_32F           = 2, /**< compute will be at least 32-bit precision */
+    HIPBLAS_COMPUTE_32F_PEDANTIC  = 3, /**< compute will be exactly 32-bit precision */
+    HIPBLAS_COMPUTE_32F_FAST_16F  = 4, /**< 32-bit input can use 16-bit compute */
+    HIPBLAS_COMPUTE_32F_FAST_16BF = 5, /**< 32-bit input can use bf16 compute */
+    HIPBLAS_COMPUTE_32F_FAST_TF32
+    = 6, /**< 32-bit input can use tensor cores w/ TF32 compute. Only supported with cuBLAS and hipBLASLT backend currently */
+    HIPBLAS_COMPUTE_64F          = 7, /**< compute will be at least 64-bit precision */
+    HIPBLAS_COMPUTE_64F_PEDANTIC = 8, /**< compute will be exactly 64-bit precision */
+    HIPBLAS_COMPUTE_32I          = 9, /**< compute will be at least 32-bit integer precision */
+    HIPBLAS_COMPUTE_32I_PEDANTIC = 10, /**< compute will be exactly 32-bit integer precision */
+    HIPBLAS_COMPUTE_32F_FAST_8F_FNUZ    = 100, /**< 32-bit compute using fp8 mfma instruction */
+    HIPBLAS_COMPUTE_32F_FAST_8BF_FNUZ   = 101, /**< 32-bit compute using bf8 mfma instruction */
+    HIPBLAS_COMPUTE_32F_FAST_8F8BF_FNUZ = 102, /**< 32-bit compute using f8bf8 mfma instruction */
+    HIPBLAS_COMPUTE_32F_FAST_8BF8F_FNUZ = 103, /**< 32-bit compute using bf8f8 mfma instruction */
+} hipblasComputeType_t;
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hsa/amd_hsa_kernel_code.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hsa/amd_hsa_kernel_code.h
new file mode 100644
index 0000000000000000000000000000000000000000..c00c88c02432a56254e06fdcb5d5923d529f6414
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hsa/amd_hsa_kernel_code.h
@@ -0,0 +1,270 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef AMD_HSA_KERNEL_CODE_H
+#define AMD_HSA_KERNEL_CODE_H
+
+#include "amd_hsa_common.h"
+#include "hsa.h"
+
+// AMD Kernel Code Version Enumeration Values.
+typedef uint32_t amd_kernel_code_version32_t;
+enum amd_kernel_code_version_t {
+  AMD_KERNEL_CODE_VERSION_MAJOR = 1,
+  AMD_KERNEL_CODE_VERSION_MINOR = 1
+};
+
+// AMD Machine Kind Enumeration Values.
+typedef uint16_t amd_machine_kind16_t;
+enum amd_machine_kind_t {
+  AMD_MACHINE_KIND_UNDEFINED = 0,
+  AMD_MACHINE_KIND_AMDGPU = 1
+};
+
+// AMD Machine Version.
+typedef uint16_t amd_machine_version16_t;
+
+// AMD Float Round Mode Enumeration Values.
+enum amd_float_round_mode_t {
+  AMD_FLOAT_ROUND_MODE_NEAREST_EVEN = 0,
+  AMD_FLOAT_ROUND_MODE_PLUS_INFINITY = 1,
+  AMD_FLOAT_ROUND_MODE_MINUS_INFINITY = 2,
+  AMD_FLOAT_ROUND_MODE_ZERO = 3
+};
+
+// AMD Float Denorm Mode Enumeration Values.
+enum amd_float_denorm_mode_t {
+  AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE_OUTPUT = 0,
+  AMD_FLOAT_DENORM_MODE_FLUSH_OUTPUT = 1,
+  AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE = 2,
+  AMD_FLOAT_DENORM_MODE_NO_FLUSH = 3
+};
+
+// AMD Compute Program Resource Register One.
+typedef uint32_t amd_compute_pgm_rsrc_one32_t;
+enum amd_compute_pgm_rsrc_one_t {
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT, 0, 6),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT, 6, 4),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY, 10, 2),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32, 12, 2),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64, 14, 2),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32, 16, 2),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64, 18, 2),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_PRIV, 20, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP, 21, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE, 22, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE, 23, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_BULKY, 24, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER, 25, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1, 26, 6)
+};
+
+// AMD System VGPR Workitem ID Enumeration Values.
+enum amd_system_vgpr_workitem_id_t {
+  AMD_SYSTEM_VGPR_WORKITEM_ID_X = 0,
+  AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y = 1,
+  AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y_Z = 2,
+  AMD_SYSTEM_VGPR_WORKITEM_ID_UNDEFINED = 3
+};
+
+// AMD Compute Program Resource Register Two.
+typedef uint32_t amd_compute_pgm_rsrc_two32_t;
+enum amd_compute_pgm_rsrc_two_t {
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET, 0, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 1, 5),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER, 6, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 7, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y, 8, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z, 9, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO, 10, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID, 11, 2),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH, 13, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION, 14, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE, 15, 9),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION, 24, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE, 25, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO, 26, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW, 27, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW, 28, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT, 29, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO, 30, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1, 31, 1)
+};
+
+// AMD Element Byte Size Enumeration Values: a size of N bytes is encoded as log2(N) - 1.
+enum amd_element_byte_size_t {
+  AMD_ELEMENT_BYTE_SIZE_2 = 0,
+  AMD_ELEMENT_BYTE_SIZE_4 = 1,
+  AMD_ELEMENT_BYTE_SIZE_8 = 2,
+  AMD_ELEMENT_BYTE_SIZE_16 = 3
+};
+
+// AMD Kernel Code Properties.
+typedef uint32_t amd_kernel_code_properties32_t;
+enum amd_kernel_code_properties_t {
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, 0, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR, 1, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR, 2, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR, 3, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID, 4, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, 6, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X, 7, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y, 8, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z, 9, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_WAVEFRONT_SIZE32, 10, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_RESERVED1, 11, 5),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS, 16, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE, 17, 2),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_PTR64, 19, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK, 20, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED, 21, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED, 22, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_RESERVED2, 23, 9)
+};
+
+// AMD Power Of Two Enumeration Values: AMD_POWERTWO_<N> has the value log2(N).
+typedef uint8_t amd_powertwo8_t;
+enum amd_powertwo_t {
+  AMD_POWERTWO_1 = 0,
+  AMD_POWERTWO_2 = 1,
+  AMD_POWERTWO_4 = 2,
+  AMD_POWERTWO_8 = 3,
+  AMD_POWERTWO_16 = 4,
+  AMD_POWERTWO_32 = 5,
+  AMD_POWERTWO_64 = 6,
+  AMD_POWERTWO_128 = 7,
+  AMD_POWERTWO_256 = 8
+};
+
+// AMD Enabled Control Directive Enumeration Values: each enumerator is a distinct single-bit value (1 through 256).
+typedef uint64_t amd_enabled_control_directive64_t;
+enum amd_enabled_control_directive_t {
+  AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_BREAK_EXCEPTIONS = 1,
+  AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_DETECT_EXCEPTIONS = 2,
+  AMD_ENABLED_CONTROL_DIRECTIVE_MAX_DYNAMIC_GROUP_SIZE = 4,
+  AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_GRID_SIZE = 8,
+  AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_WORKGROUP_SIZE = 16,
+  AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_DIM = 32,
+  AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_GRID_SIZE = 64,
+  AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_WORKGROUP_SIZE = 128,
+  AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRE_NO_PARTIAL_WORKGROUPS = 256
+};
+
+// AMD Exception Kind Enumeration Values: each enumerator is a distinct single-bit value (1 through 16).
+typedef uint16_t amd_exception_kind16_t;
+enum amd_exception_kind_t {
+  AMD_EXCEPTION_KIND_INVALID_OPERATION = 1,
+  AMD_EXCEPTION_KIND_DIVISION_BY_ZERO = 2,
+  AMD_EXCEPTION_KIND_OVERFLOW = 4,
+  AMD_EXCEPTION_KIND_UNDERFLOW = 8,
+  AMD_EXCEPTION_KIND_INEXACT = 16
+};
+
+// AMD Control Directives.
+#define AMD_CONTROL_DIRECTIVES_ALIGN_BYTES 64
+#define AMD_CONTROL_DIRECTIVES_ALIGN __ALIGNED__(AMD_CONTROL_DIRECTIVES_ALIGN_BYTES)
+typedef AMD_CONTROL_DIRECTIVES_ALIGN struct amd_control_directives_s {
+  amd_enabled_control_directive64_t enabled_control_directives;
+  uint16_t enable_break_exceptions;
+  uint16_t enable_detect_exceptions;
+  uint32_t max_dynamic_group_size;
+  uint64_t max_flat_grid_size;
+  uint32_t max_flat_workgroup_size;
+  uint8_t required_dim;
+  uint8_t reserved1[3];
+  uint64_t required_grid_size[3];
+  uint32_t required_workgroup_size[3];
+  uint8_t reserved2[60];
+} amd_control_directives_t;
+
+// AMD Kernel Code.
+#define AMD_ISA_ALIGN_BYTES 256
+#define AMD_KERNEL_CODE_ALIGN_BYTES 64
+#define AMD_KERNEL_CODE_ALIGN __ALIGNED__(AMD_KERNEL_CODE_ALIGN_BYTES)
+typedef AMD_KERNEL_CODE_ALIGN struct amd_kernel_code_s {
+  amd_kernel_code_version32_t amd_kernel_code_version_major;
+  amd_kernel_code_version32_t amd_kernel_code_version_minor;
+  amd_machine_kind16_t amd_machine_kind;
+  amd_machine_version16_t amd_machine_version_major;
+  amd_machine_version16_t amd_machine_version_minor;
+  amd_machine_version16_t amd_machine_version_stepping;
+  int64_t kernel_code_entry_byte_offset;
+  int64_t kernel_code_prefetch_byte_offset;
+  uint64_t kernel_code_prefetch_byte_size;
+  uint64_t max_scratch_backing_memory_byte_size;
+  amd_compute_pgm_rsrc_one32_t compute_pgm_rsrc1;
+  amd_compute_pgm_rsrc_two32_t compute_pgm_rsrc2;
+  amd_kernel_code_properties32_t kernel_code_properties;
+  uint32_t workitem_private_segment_byte_size;
+  uint32_t workgroup_group_segment_byte_size;
+  uint32_t gds_segment_byte_size;
+  uint64_t kernarg_segment_byte_size;
+  uint32_t workgroup_fbarrier_count;
+  uint16_t wavefront_sgpr_count;
+  uint16_t workitem_vgpr_count;
+  uint16_t reserved_vgpr_first;
+  uint16_t reserved_vgpr_count;
+  uint16_t reserved_sgpr_first;
+  uint16_t reserved_sgpr_count;
+  uint16_t debug_wavefront_private_segment_offset_sgpr;
+  uint16_t debug_private_segment_buffer_sgpr;
+  amd_powertwo8_t kernarg_segment_alignment;
+  amd_powertwo8_t group_segment_alignment;
+  amd_powertwo8_t private_segment_alignment;
+  amd_powertwo8_t wavefront_size;
+  int32_t call_convention;
+  uint8_t reserved1[12];
+  uint64_t runtime_loader_kernel_symbol;
+  amd_control_directives_t control_directives;
+} amd_kernel_code_t;
+
+// TODO: this struct should be completely gone once debugger designs/implements
+// Debugger APIs.
+typedef struct amd_runtime_loader_debug_info_s {
+  const void* elf_raw;
+  size_t elf_size;
+  const char *kernel_name;
+  const void *owning_segment;
+} amd_runtime_loader_debug_info_t;
+
+#endif // AMD_HSA_KERNEL_CODE_H
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hsa/hsa.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hsa/hsa.h
new file mode 100644
index 0000000000000000000000000000000000000000..00753e992e9a640e0835f201d5ef11c14c607f95
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hsa/hsa.h
@@ -0,0 +1,5752 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_RUNTIME_INC_HSA_H_
+#define HSA_RUNTIME_INC_HSA_H_
+
+#include <stddef.h>   /* size_t */
+#include <stdint.h>   /* uintXX_t */
+
+#ifndef __cplusplus
+#include <stdbool.h>  /* bool */
+#endif /* __cplusplus */
+
+// Placeholder for calling convention and import/export macros
+#ifndef HSA_CALL
+#define HSA_CALL
+#endif
+
+#ifndef HSA_EXPORT_DECORATOR
+#ifdef __GNUC__
+#define HSA_EXPORT_DECORATOR __attribute__ ((visibility ("default")))
+#else
+#define HSA_EXPORT_DECORATOR
+#endif
+#endif
+#define HSA_API_EXPORT HSA_EXPORT_DECORATOR HSA_CALL
+#define HSA_API_IMPORT HSA_CALL
+
+#if !defined(HSA_API) && defined(HSA_EXPORT)
+#define HSA_API HSA_API_EXPORT
+#else
+#define HSA_API HSA_API_IMPORT
+#endif
+
+// Detect and set large model builds.
+#undef HSA_LARGE_MODEL
+#if defined(__LP64__) || defined(_M_X64)
+#define HSA_LARGE_MODEL
+#endif
+
+// Try to detect CPU endianness
+#if !defined(LITTLEENDIAN_CPU) && !defined(BIGENDIAN_CPU)
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define LITTLEENDIAN_CPU
+#elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define BIGENDIAN_CPU
+#elif defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
+      defined(_M_X64) || defined(__loongarch64) || defined(__riscv)
+#define LITTLEENDIAN_CPU
+#endif
+#endif
+
+#undef HSA_LITTLE_ENDIAN
+#if defined(LITTLEENDIAN_CPU)
+#define HSA_LITTLE_ENDIAN
+#elif defined(BIGENDIAN_CPU)
+#else
+#error "BIGENDIAN_CPU or LITTLEENDIAN_CPU must be defined"
+#endif
+
+#ifndef HSA_DEPRECATED
+#define HSA_DEPRECATED
+//#ifdef __GNUC__
+//#define HSA_DEPRECATED __attribute__((deprecated))
+//#else
+//#define HSA_DEPRECATED __declspec(deprecated)
+//#endif
+#endif
+
+#define HSA_VERSION_1_0                              1
+
+#ifdef __cplusplus
+extern "C" {
+#endif  /* __cplusplus */
+
+/** \addtogroup error-codes Error codes
+ *  @{
+ */
+
+/**
+ * @brief Status codes.
+ */
+typedef enum {
+  /**
+   * The function has been executed successfully.
+   */
+  HSA_STATUS_SUCCESS = 0x0,
+  /**
+   * A traversal over a list of elements has been interrupted by the
+   * application before completing.
+   */
+  HSA_STATUS_INFO_BREAK = 0x1,
+  /**
+   * A generic error has occurred.
+   */
+  HSA_STATUS_ERROR = 0x1000,
+  /**
+   * One of the actual arguments does not meet a precondition stated in the
+   * documentation of the corresponding formal argument.
+   */
+  HSA_STATUS_ERROR_INVALID_ARGUMENT = 0x1001,
+  /**
+   * The requested queue creation is not valid.
+   */
+  HSA_STATUS_ERROR_INVALID_QUEUE_CREATION = 0x1002,
+  /**
+   * The requested allocation is not valid.
+   */
+  HSA_STATUS_ERROR_INVALID_ALLOCATION = 0x1003,
+  /**
+   * The agent is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_AGENT = 0x1004,
+  /**
+   * The memory region is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_REGION = 0x1005,
+  /**
+   * The signal is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_SIGNAL = 0x1006,
+  /**
+   * The queue is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_QUEUE = 0x1007,
+  /**
+   * The HSA runtime failed to allocate the necessary resources. This error
+   * may also occur when the HSA runtime needs to spawn threads or create
+   * internal OS-specific events.
+   */
+  HSA_STATUS_ERROR_OUT_OF_RESOURCES = 0x1008,
+  /**
+   * The AQL packet is malformed.
+   */
+  HSA_STATUS_ERROR_INVALID_PACKET_FORMAT = 0x1009,
+  /**
+   * An error has been detected while releasing a resource.
+   */
+  HSA_STATUS_ERROR_RESOURCE_FREE = 0x100A,
+  /**
+   * An API other than ::hsa_init has been invoked while the reference count
+   * of the HSA runtime is 0.
+   */
+  HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B,
+  /**
+   * The maximum reference count for the object has been reached.
+   */
+  HSA_STATUS_ERROR_REFCOUNT_OVERFLOW = 0x100C,
+  /**
+   * The arguments passed to a function are not compatible.
+   */
+  HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS = 0x100D,
+  /**
+   * The index is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_INDEX = 0x100E,
+  /**
+   * The instruction set architecture is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_ISA = 0x100F,
+  /**
+   * The instruction set architecture name is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_ISA_NAME = 0x1017,
+  /**
+   * The code object is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010,
+  /**
+   * The executable is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_EXECUTABLE = 0x1011,
+  /**
+   * The executable is frozen.
+   */
+  HSA_STATUS_ERROR_FROZEN_EXECUTABLE = 0x1012,
+  /**
+   * There is no symbol with the given name.
+   */
+  HSA_STATUS_ERROR_INVALID_SYMBOL_NAME = 0x1013,
+  /**
+   * The variable is already defined.
+   */
+  HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED = 0x1014,
+  /**
+   * The variable is undefined.
+   */
+  HSA_STATUS_ERROR_VARIABLE_UNDEFINED = 0x1015,
+  /**
+   * An HSAIL operation resulted in a hardware exception.
+   */
+  HSA_STATUS_ERROR_EXCEPTION = 0x1016,
+  /**
+   * The code object symbol is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_CODE_SYMBOL = 0x1018,
+  /**
+   * The executable symbol is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_EXECUTABLE_SYMBOL = 0x1019,
+  /**
+   * The file descriptor is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_FILE = 0x1020,
+  /**
+   * The code object reader is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER = 0x1021,
+  /**
+   * The cache is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_CACHE = 0x1022,
+  /**
+   * The wavefront is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_WAVEFRONT = 0x1023,
+  /**
+   * The signal group is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP = 0x1024,
+  /**
+   * The HSA runtime is not in the configuration state.
+   */
+  HSA_STATUS_ERROR_INVALID_RUNTIME_STATE = 0x1025,
+  /**
+  * The queue received an error that may require process termination.
+  */
+  HSA_STATUS_ERROR_FATAL = 0x1026
+} hsa_status_t;
+
+/**
+ * @brief Query additional information about a status code.
+ *
+ * @param[in] status Status code.
+ *
+ * @param[out] status_string A NUL-terminated string that describes the error
+ * status.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p status is an invalid
+ * status code, or @p status_string is NULL.
+ */
+hsa_status_t HSA_API hsa_status_string(
+    hsa_status_t status,
+    const char ** status_string);
+
+/** @} */
+
+/** \defgroup common Common Definitions
+ *  @{
+ */
+
+/**
+ * @brief Three-dimensional coordinate.
+ */
+typedef struct hsa_dim3_s {
+  /**
+   * X dimension.
+   */
+   uint32_t x;
+
+  /**
+   * Y dimension.
+   */
+   uint32_t y;
+
+   /**
+    * Z dimension.
+    */
+   uint32_t z;
+} hsa_dim3_t;
+
+/**
+ * @brief Access permissions.
+ */
+typedef enum {
+  /**
+   * Used to remove existing access
+   */
+  HSA_ACCESS_PERMISSION_NONE = 0,
+  /**
+   * Read-only access.
+   */
+  HSA_ACCESS_PERMISSION_RO = 1,
+  /**
+   * Write-only access.
+   */
+  HSA_ACCESS_PERMISSION_WO = 2,
+  /**
+   * Read and write access.
+   */
+  HSA_ACCESS_PERMISSION_RW = 3
+} hsa_access_permission_t;
+
+/**
+ * @brief POSIX file descriptor.
+ */
+typedef int hsa_file_t;
+
+/** @} **/
+
+
+/** \defgroup initshutdown Initialization and Shut Down
+ *  @{
+ */
+
+/**
+ * @brief Initialize the HSA runtime.
+ *
+ * @details Initializes the HSA runtime if it is not already initialized, and
+ * increases the reference counter associated with the HSA runtime for the
+ * current process. Invocation of any HSA function other than ::hsa_init results
+ * in undefined behavior if the current HSA runtime reference counter is less
+ * than one.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_REFCOUNT_OVERFLOW The HSA runtime reference
+ * count reaches INT32_MAX.
+ */
+hsa_status_t HSA_API hsa_init();
+
+/**
+ * @brief Shut down the HSA runtime.
+ *
+ * @details Decreases the reference count of the HSA runtime instance. When the
+ * reference count reaches 0, the HSA runtime is no longer considered valid
+ * but the application might call ::hsa_init to initialize the HSA runtime
+ * again.
+ *
+ * Once the reference count of the HSA runtime reaches 0, all the resources
+ * associated with it (queues, signals, agent information, etc.) are
+ * considered invalid and any attempt to reference them in subsequent API calls
+ * results in undefined behavior. When the reference count reaches 0, the HSA
+ * runtime may release resources associated with it.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ */
+hsa_status_t HSA_API hsa_shut_down();
+
+/** @} **/
+
+/** \defgroup agentinfo System and Agent Information
+ *  @{
+ */
+
+/**
+ * @brief Endianness. A convention used to interpret the bytes making up a data
+ * word.
+ */
+typedef enum {
+    /**
+     * The least significant byte is stored in the smallest address.
+     */
+    HSA_ENDIANNESS_LITTLE = 0,
+    /**
+     * The most significant byte is stored in the smallest address.
+     */
+    HSA_ENDIANNESS_BIG = 1
+} hsa_endianness_t;
+
+/**
+ * @brief Machine model. A machine model determines the size of certain data
+ * types in HSA runtime and an agent.
+ */
+typedef enum {
+    /**
+     * Small machine model. Addresses use 32 bits.
+     */
+    HSA_MACHINE_MODEL_SMALL = 0,
+    /**
+     * Large machine model. Addresses use 64 bits.
+     */
+    HSA_MACHINE_MODEL_LARGE = 1
+} hsa_machine_model_t;
+
+/**
+ * @brief Profile. A profile indicates a particular level of feature
+ * support. For example, in the base profile the application must use the HSA
+ * runtime allocator to reserve shared virtual memory, while in the full profile
+ * any host pointer can be shared across all the agents.
+ */
+typedef enum {
+    /**
+     * Base profile.
+     */
+    HSA_PROFILE_BASE = 0,
+    /**
+     * Full profile.
+     */
+    HSA_PROFILE_FULL = 1
+} hsa_profile_t;
+
+/**
+ * @brief System attributes that can be queried with ::hsa_system_get_info.
+ */
+typedef enum {
+  /**
+   * Major version of the HSA runtime specification supported by the
+   * implementation. The type of this attribute is uint16_t.
+   */
+  HSA_SYSTEM_INFO_VERSION_MAJOR = 0,
+  /**
+   * Minor version of the HSA runtime specification supported by the
+   * implementation. The type of this attribute is uint16_t.
+   */
+  HSA_SYSTEM_INFO_VERSION_MINOR = 1,
+  /**
+   * Current timestamp. The value of this attribute monotonically increases at a
+   * constant rate. The type of this attribute is uint64_t.
+   */
+  HSA_SYSTEM_INFO_TIMESTAMP = 2,
+  /**
+   * Timestamp value increase rate, in Hz. The timestamp (clock) frequency is
+   * in the range 1-400MHz. The type of this attribute is uint64_t.
+   */
+  HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY = 3,
+  /**
+   * Maximum duration of a signal wait operation. Expressed as a count based on
+   * the timestamp frequency. The type of this attribute is uint64_t.
+   */
+  HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT = 4,
+  /**
+   * Endianness of the system. The type of this attribute is ::hsa_endianness_t.
+   */
+  HSA_SYSTEM_INFO_ENDIANNESS = 5,
+  /**
+   * Machine model supported by the HSA runtime. The type of this attribute is
+   * ::hsa_machine_model_t.
+   */
+  HSA_SYSTEM_INFO_MACHINE_MODEL = 6,
+  /**
+   * Bit-mask indicating which extensions are supported by the
+   * implementation. An extension with an ID of @p i is supported if the bit at
+   * position @p i is set. The type of this attribute is uint8_t[128].
+   */
+  HSA_SYSTEM_INFO_EXTENSIONS = 7,
+  /**
+   * String containing the ROCr build identifier.
+   */
+  HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200,
+  /**
+   * Returns true if hsa_amd_svm_* APIs are supported by the driver. The type of
+   * this attribute is bool.
+   */
+  HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED = 0x201,
+  // TODO: Should this be per Agent?
+  /**
+   * Returns true if all Agents have access to system allocated memory (such as
+   * that allocated by mmap, malloc, or new) by default.
+   * If false then system allocated memory may only be made SVM accessible to
+   * an Agent by declaration of accessibility with hsa_amd_svm_set_attributes.
+   * The type of this attribute is bool.
+   */
+  HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT = 0x202,
+  /**
+   * Returns true if mwaitx is enabled on this system.
+   * The type of this attribute is bool.
+   */
+  HSA_AMD_SYSTEM_INFO_MWAITX_ENABLED = 0x203,
+  /**
+   * Returns true if DMABUF APIs are supported by the driver. The type of
+   * this attribute is bool.
+   */
+  HSA_AMD_SYSTEM_INFO_DMABUF_SUPPORTED = 0x204,
+  /**
+   * Returns true if Virtual Memory APIs are supported by the driver. The type
+   * of this attribute is bool.
+   */
+  HSA_AMD_SYSTEM_INFO_VIRTUAL_MEM_API_SUPPORTED = 0x205,
+  /**
+   * Returns true if XNACK is enabled on this system. The type of
+   * this attribute is bool.
+   */
+  HSA_AMD_SYSTEM_INFO_XNACK_ENABLED = 0x206,
+  /**
+   * Major version of the HSA runtime extension specification supported by the
+   * implementation. The type of this attribute is uint16_t.
+   */
+  HSA_AMD_SYSTEM_INFO_EXT_VERSION_MAJOR = 0x207,
+  /**
+   * Minor version of the HSA runtime extension specification supported by the
+   * implementation. The type of this attribute is uint16_t.
+   */
+  HSA_AMD_SYSTEM_INFO_EXT_VERSION_MINOR = 0x208,
+} hsa_system_info_t;
+
+/**
+ * @brief Get the current value of a system attribute.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer that receives
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * system attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_system_get_info(
+    hsa_system_info_t attribute,
+    void* value);
+
+/**
+ * @brief HSA extension identifiers.
+ */
+typedef enum {
+  /**
+   * Finalizer extension.
+   */
+  HSA_EXTENSION_FINALIZER = 0,
+  /**
+   * Images extension.
+   */
+  HSA_EXTENSION_IMAGES = 1,
+
+  /**
+   * Performance counter extension.
+   */
+  HSA_EXTENSION_PERFORMANCE_COUNTERS = 2,
+
+  /**
+   * Profiling events extension.
+   */
+  HSA_EXTENSION_PROFILING_EVENTS = 3,
+  /**
+   * Last standard extension ID (equal to ::HSA_EXTENSION_PROFILING_EVENTS).
+   */
+  HSA_EXTENSION_STD_LAST = 3,
+  /**
+   * First AMD extension number.
+   */
+  HSA_AMD_FIRST_EXTENSION = 0x200,
+  /**
+   * Profiler extension (same ID as ::HSA_AMD_FIRST_EXTENSION).
+   */
+  HSA_EXTENSION_AMD_PROFILER = 0x200,
+  /**
+   * Loader extension.
+   */
+  HSA_EXTENSION_AMD_LOADER = 0x201,
+  /**
+   * AqlProfile extension.
+   */
+  HSA_EXTENSION_AMD_AQLPROFILE = 0x202,
+  /**
+   * PC Sampling extension.
+   */
+  HSA_EXTENSION_AMD_PC_SAMPLING = 0x203,
+  /**
+   * Last AMD extension ID (equal to ::HSA_EXTENSION_AMD_PC_SAMPLING).
+   */
+  HSA_AMD_LAST_EXTENSION = 0x203
+} hsa_extension_t;
+
+/**
+ * @brief Query the name of a given extension.
+ *
+ * @param[in] extension Extension identifier (see ::hsa_extension_t). If the
+ * extension is not supported by the implementation (see
+ * ::HSA_SYSTEM_INFO_EXTENSIONS), the behavior is undefined.
+ *
+ * @param[out] name Pointer to a memory location where the HSA runtime stores
+ * the extension name. The extension name is a NUL-terminated string.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid
+ * extension, or @p name is NULL.
+ */
+hsa_status_t HSA_API hsa_extension_get_name(
+    uint16_t extension,
+    const char **name);
+
+/**
+ * @deprecated See ::hsa_system_major_extension_supported.
+ *
+ * @brief Query if a given version of an extension is supported by the HSA
+ * implementation.
+ *
+ * @param[in] extension Extension identifier.
+ *
+ * @param[in] version_major Major version number.
+ *
+ * @param[in] version_minor Minor version number.
+ *
+ * @param[out] result Pointer to a memory location where the HSA runtime stores
+ * the result of the check. The result is true if the specified version of the
+ * extension is supported, and false otherwise.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid
+ * extension, or @p result is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_system_extension_supported(
+    uint16_t extension,
+    uint16_t version_major,
+    uint16_t version_minor,
+    bool* result);
+
+/**
+ * @brief Query if a given version of an extension is supported by the HSA
+ * implementation. All minor versions from 0 up to the returned @p version_minor
+ * must be supported by the implementation.
+ *
+ * @param[in] extension Extension identifier.
+ *
+ * @param[in] version_major Major version number.
+ *
+ * @param[out] version_minor Pointer to the returned minor version number.
+ *
+ * @param[out] result Pointer to a memory location where the HSA runtime stores
+ * the result of the check. The result is true if the specified version of the
+ * extension is supported, and false otherwise.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid
+ * extension, or @p version_minor is NULL, or @p result is NULL.
+ */
+hsa_status_t HSA_API hsa_system_major_extension_supported(
+    uint16_t extension,
+    uint16_t version_major,
+    uint16_t *version_minor,
+    bool* result);
+
+
+/**
+ * @deprecated See ::hsa_system_get_major_extension_table.
+ *
+ * @brief Retrieve the function pointers corresponding to a given version of an
+ * extension. Portable applications are expected to invoke the extension API
+ * using the returned function pointers.
+ *
+ * @details The application is responsible for verifying that the given version
+ * of the extension is supported by the HSA implementation (see
+ * ::hsa_system_extension_supported). If the given combination of extension,
+ * major version, and minor version is not supported by the implementation, the
+ * behavior is undefined.
+ *
+ * @param[in] extension Extension identifier.
+ *
+ * @param[in] version_major Major version number for which to retrieve the
+ * function pointer table.
+ *
+ * @param[in] version_minor Minor version number for which to retrieve the
+ * function pointer table.
+ *
+ * @param[out] table Pointer to an application-allocated function pointer table
+ * that is populated by the HSA runtime. Must not be NULL. The memory associated
+ * with table can be reused or freed after the function returns.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid
+ * extension, or @p table is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_system_get_extension_table(
+    uint16_t extension,
+    uint16_t version_major,
+    uint16_t version_minor,
+    void *table);
+
+/**
+ * @brief Retrieve the function pointers corresponding to a given major version
+ * of an extension. Portable applications are expected to invoke the extension
+ * API using the returned function pointers.
+ *
+ * @details The application is responsible for verifying that the given major
+ * version of the extension is supported by the HSA implementation (see
+ * ::hsa_system_major_extension_supported). If the given combination of extension
+ * and major version is not supported by the implementation, the behavior is
+ * undefined. Additionally, if @p table_length does not allow space for a full
+ * minor version, it is implementation-defined whether only some of the function
+ * pointers for that minor version get written.
+ *
+ * @param[in] extension Extension identifier.
+ *
+ * @param[in] version_major Major version number for which to retrieve the
+ * function pointer table.
+ *
+ * @param[in] table_length Size in bytes of the function pointer table to be
+ * populated. The implementation will not write more than this many bytes to the
+ * table.
+ *
+ * @param[out] table Pointer to an application-allocated function pointer table
+ * that is populated by the HSA runtime. Must not be NULL. The memory associated
+ * with table can be reused or freed after the function returns.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid
+ * extension, or @p table is NULL.
+ */
+hsa_status_t HSA_API hsa_system_get_major_extension_table(
+    uint16_t extension,
+    uint16_t version_major,
+    size_t table_length,
+    void *table);
+
+/**
+ * @brief Struct containing an opaque handle to an agent, a device that
+ * participates in the HSA memory model. An agent can submit AQL packets for
+ * execution, and may also accept AQL packets for execution (agent dispatch
+ * packets or kernel dispatch packets launching HSAIL-derived binaries).
+ */
+typedef struct hsa_agent_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_agent_t;
+
+/**
+ * @brief Agent features (the type of the ::HSA_AGENT_INFO_FEATURE attribute).
+ */
+typedef enum {
+    /**
+     * The agent supports AQL packets of kernel dispatch type. If this
+     * feature is enabled, the agent is also a kernel agent.
+     */
+    HSA_AGENT_FEATURE_KERNEL_DISPATCH = 1,
+    /**
+     * The agent supports AQL packets of agent dispatch type.
+     */
+    HSA_AGENT_FEATURE_AGENT_DISPATCH = 2
+} hsa_agent_feature_t;
+
+/**
+ * @brief Hardware device type (the type of ::HSA_AGENT_INFO_DEVICE).
+ */
+typedef enum {
+  /**
+   * CPU device.
+   */
+  HSA_DEVICE_TYPE_CPU = 0,
+  /**
+   * GPU device.
+   */
+  HSA_DEVICE_TYPE_GPU = 1,
+  /**
+   * DSP device.
+   */
+  HSA_DEVICE_TYPE_DSP = 2,
+  /**
+   * AI Engine (AIE) device.
+   */
+  HSA_DEVICE_TYPE_AIE = 3
+} hsa_device_type_t;
+
+/**
+ * @brief Default floating-point rounding mode.
+ */
+typedef enum {
+  /**
+   * Use a default floating-point rounding mode specified elsewhere.
+   */
+  HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT = 0,
+  /**
+   * Operations that specify the default floating-point mode are rounded to zero
+   * by default.
+   */
+  HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO = 1,
+  /**
+   * Operations that specify the default floating-point mode are rounded to the
+   * nearest representable number, and ties are broken by selecting the value
+   * with an even least significant bit.
+   */
+  HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR = 2
+} hsa_default_float_rounding_mode_t;
+
+/**
+ * @brief Agent attributes.
+ */
+typedef enum {
+  /**
+   * Agent name. The type of this attribute is a NUL-terminated char[64]. The
+   * name must be at most 63 characters long (not including the NUL terminator)
+   * and all array elements not used for the name must be NUL.
+   */
+  HSA_AGENT_INFO_NAME = 0,
+  /**
+   * Name of vendor. The type of this attribute is a NUL-terminated char[64].
+   * The name must be at most 63 characters long (not including the NUL
+   * terminator) and all array elements not used for the name must be NUL.
+   */
+  HSA_AGENT_INFO_VENDOR_NAME = 1,
+  /**
+   * Agent capability. The type of this attribute is ::hsa_agent_feature_t.
+   */
+  HSA_AGENT_INFO_FEATURE = 2,
+  /**
+   * @deprecated Query ::HSA_ISA_INFO_MACHINE_MODELS for a given instruction set
+   * architecture supported by the agent instead.  If more than one ISA is
+   * supported by the agent, the returned value corresponds to the first ISA
+   * enumerated by ::hsa_agent_iterate_isas.
+   *
+   * Machine model supported by the agent. The type of this attribute is
+   * ::hsa_machine_model_t.
+   */
+  HSA_AGENT_INFO_MACHINE_MODEL = 3,
+  /**
+   * @deprecated Query ::HSA_ISA_INFO_PROFILES for a given instruction set
+   * architecture supported by the agent instead.  If more than one ISA is
+   * supported by the agent, the returned value corresponds to the first ISA
+   * enumerated by ::hsa_agent_iterate_isas.
+   *
+   * Profile supported by the agent. The type of this attribute is
+   * ::hsa_profile_t.
+   */
+  HSA_AGENT_INFO_PROFILE = 4,
+  /**
+   * @deprecated Query ::HSA_ISA_INFO_DEFAULT_FLOAT_ROUNDING_MODES for a given
+   * instruction set architecture supported by the agent instead.  If more than
+   * one ISA is supported by the agent, the returned value corresponds to the
+   * first ISA enumerated by ::hsa_agent_iterate_isas.
+   *
+   * Default floating-point rounding mode. The type of this attribute is
+   * ::hsa_default_float_rounding_mode_t, but the value
+   * ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT is not allowed.
+   */
+  HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 5,
+  /**
+   * @deprecated Query ::HSA_ISA_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES
+   * for a given instruction set architecture supported by the agent instead.  If
+   * more than one ISA is supported by the agent, the returned value corresponds
+   * to the first ISA enumerated by ::hsa_agent_iterate_isas.
+   *
+   * A bit-mask of ::hsa_default_float_rounding_mode_t values, representing the
+   * default floating-point rounding modes supported by the agent in the Base
+   * profile. The type of this attribute is uint32_t. The default floating-point
+   * rounding mode (::HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE) bit must not
+   * be set.
+   */
+  HSA_AGENT_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES = 23,
+  /**
+   * @deprecated Query ::HSA_ISA_INFO_FAST_F16_OPERATION for a given instruction
+   * set architecture supported by the agent instead.  If more than one ISA is
+   * supported by the agent, the returned value corresponds to the first ISA
+   * enumerated by ::hsa_agent_iterate_isas.
+   *
+   * Flag indicating that the f16 HSAIL operation is at least as fast as the
+   * f32 operation in the current agent. The value of this attribute is
+   * undefined if the agent is not a kernel agent. The type of this
+   * attribute is bool.
+   */
+  HSA_AGENT_INFO_FAST_F16_OPERATION = 24,
+  /**
+   * @deprecated Query ::HSA_WAVEFRONT_INFO_SIZE for a given wavefront and
+   * instruction set architecture supported by the agent instead.  If more than
+   * one ISA is supported by the agent, the returned value corresponds to the
+   * first ISA enumerated by ::hsa_agent_iterate_isas and the first wavefront
+   * enumerated by ::hsa_isa_iterate_wavefronts for that ISA.
+   *
+   * Number of work-items in a wavefront. Must be a power of 2 in the range
+   * [1,256]. The value of this attribute is undefined if the agent is not
+   * a kernel agent. The type of this attribute is uint32_t.
+   */
+  HSA_AGENT_INFO_WAVEFRONT_SIZE = 6,
+  /**
+   * @deprecated Query ::HSA_ISA_INFO_WORKGROUP_MAX_DIM for a given instruction
+   * set architecture supported by the agent instead.  If more than one ISA is
+   * supported by the agent, the returned value corresponds to the first ISA
+   * enumerated by ::hsa_agent_iterate_isas.
+   *
+   * Maximum number of work-items of each dimension of a work-group.  Each
+   * maximum must be greater than 0. No maximum can exceed the value of
+   * ::HSA_AGENT_INFO_WORKGROUP_MAX_SIZE. The value of this attribute is
+   * undefined if the agent is not a kernel agent. The type of this
+   * attribute is uint16_t[3].
+   */
+  HSA_AGENT_INFO_WORKGROUP_MAX_DIM = 7,
+  /**
+   * @deprecated Query ::HSA_ISA_INFO_WORKGROUP_MAX_SIZE for a given instruction
+   * set architecture supported by the agent instead.  If more than one ISA is
+   * supported by the agent, the returned value corresponds to the first ISA
+   * enumerated by ::hsa_agent_iterate_isas.
+   *
+   * Maximum total number of work-items in a work-group. The value of this
+   * attribute is undefined if the agent is not a kernel agent. The type
+   * of this attribute is uint32_t.
+   */
+  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE = 8,
+  /**
+   * @deprecated Query ::HSA_ISA_INFO_GRID_MAX_DIM for a given instruction set
+   * architecture supported by the agent instead.
+   *
+   * Maximum number of work-items of each dimension of a grid. Each maximum must
+   * be greater than 0, and must not be smaller than the corresponding value in
+   * ::HSA_AGENT_INFO_WORKGROUP_MAX_DIM. No maximum can exceed the value of
+   * ::HSA_AGENT_INFO_GRID_MAX_SIZE. The value of this attribute is undefined
+   * if the agent is not a kernel agent. The type of this attribute is
+   * ::hsa_dim3_t.
+   */
+  HSA_AGENT_INFO_GRID_MAX_DIM = 9,
+  /**
+   * @deprecated Query ::HSA_ISA_INFO_GRID_MAX_SIZE for a given instruction set
+   * architecture supported by the agent instead.  If more than one ISA is
+   * supported by the agent, the returned value corresponds to the first ISA
+   * enumerated by ::hsa_agent_iterate_isas.
+   *
+   * Maximum total number of work-items in a grid. The value of this attribute
+   * is undefined if the agent is not a kernel agent. The type of this
+   * attribute is uint32_t.
+   */
+  HSA_AGENT_INFO_GRID_MAX_SIZE = 10,
+  /**
+   * @deprecated Query ::HSA_ISA_INFO_FBARRIER_MAX_SIZE for a given instruction
+   * set architecture supported by the agent instead.  If more than one ISA is
+   * supported by the agent, the returned value corresponds to the first ISA
+   * enumerated by ::hsa_agent_iterate_isas.
+   *
+   * Maximum number of fbarriers per work-group. Must be at least 32. The value
+   * of this attribute is undefined if the agent is not a kernel agent. The
+   * type of this attribute is uint32_t.
+   */
+  HSA_AGENT_INFO_FBARRIER_MAX_SIZE = 11,
+  /**
+   * @deprecated The maximum number of queues is not statically determined.
+   *
+   * Maximum number of queues that can be active (created but not destroyed) at
+   * one time in the agent. The type of this attribute is uint32_t.
+   */
+  HSA_AGENT_INFO_QUEUES_MAX = 12,
+  /**
+   * Minimum number of packets that a queue created in the agent
+   * can hold. Must be a power of 2 greater than 0. Must not exceed
+   * the value of ::HSA_AGENT_INFO_QUEUE_MAX_SIZE. The type of this
+   * attribute is uint32_t.
+   */
+  HSA_AGENT_INFO_QUEUE_MIN_SIZE = 13,
+  /**
+   * Maximum number of packets that a queue created in the agent can
+   * hold. Must be a power of 2 greater than 0. The type of this attribute
+   * is uint32_t.
+   */
+  HSA_AGENT_INFO_QUEUE_MAX_SIZE = 14,
+  /**
+   * Type of a queue created in the agent. The type of this attribute is
+   * ::hsa_queue_type32_t.
+   */
+  HSA_AGENT_INFO_QUEUE_TYPE = 15,
+  /**
+   * @deprecated NUMA information is not exposed anywhere else in the API.
+   *
+   * Identifier of the NUMA node associated with the agent. The type of this
+   * attribute is uint32_t.
+   */
+  HSA_AGENT_INFO_NODE = 16,
+  /**
+   * Type of hardware device associated with the agent. The type of this
+   * attribute is ::hsa_device_type_t.
+   */
+  HSA_AGENT_INFO_DEVICE = 17,
+  /**
+   * @deprecated Query ::hsa_agent_iterate_caches to retrieve information about
+   * the caches present in a given agent.
+   *
+   * Array of data cache sizes (L1..L4). Each size is expressed in bytes. A size
+   * of 0 for a particular level indicates that there is no cache information
+   * for that level. The type of this attribute is uint32_t[4].
+   */
+  HSA_AGENT_INFO_CACHE_SIZE = 18,
+  /**
+   * @deprecated An agent may support multiple instruction set
+   * architectures. See ::hsa_agent_iterate_isas.  If more than one ISA is
+   * supported by the agent, the returned value corresponds to the first ISA
+   * enumerated by ::hsa_agent_iterate_isas.
+   *
+   * Instruction set architecture of the agent. The type of this attribute
+   * is ::hsa_isa_t.
+   */
+  HSA_AGENT_INFO_ISA = 19,
+  /**
+   * Bit-mask indicating which extensions are supported by the agent. An
+   * extension with an ID of @p i is supported if the bit at position @p i is
+   * set. The type of this attribute is uint8_t[128].
+   */
+  HSA_AGENT_INFO_EXTENSIONS = 20,
+  /**
+   * Major version of the HSA runtime specification supported by the
+   * agent. The type of this attribute is uint16_t.
+   */
+  HSA_AGENT_INFO_VERSION_MAJOR = 21,
+  /**
+   * Minor version of the HSA runtime specification supported by the
+   * agent. The type of this attribute is uint16_t.
+   */
+  HSA_AGENT_INFO_VERSION_MINOR = 22,
+  /**
+   * This enum does not have a fixed underlying type, thus in C++ post D2338:
+   * If the enumeration type does not have a fixed underlying type, the value is
+   * unchanged if the original value is within the range of the enumeration
+   * values (9.7.1 [dcl.enum]), and otherwise, the behavior is
+   * undefined.
+   * Thus increase the range of this enum to encompass vendor extensions.
+   */
+  HSA_AGENT_INFO_LAST = INT32_MAX
+} hsa_agent_info_t;
+
+/**
+ * @brief Get the current value of an attribute for a given agent.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer that receives
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * agent attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_agent_get_info(
+    hsa_agent_t agent,
+    hsa_agent_info_t attribute,
+    void* value);
+
+/**
+ * @brief Iterate over the available agents, and invoke an
+ * application-defined callback on every iteration.
+ *
+ * @param[in] callback Callback to be invoked once per agent. The HSA
+ * runtime passes two arguments to the callback: the agent and the
+ * application data. If @p callback returns a status other than
+ * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
+ * ::hsa_iterate_agents returns that status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_iterate_agents(
+    hsa_status_t (*callback)(hsa_agent_t agent, void* data),
+    void* data);
+
+/*
+
+// If we do not know the size of an attribute, we need to query it first
+// Note: this API will not be in the spec unless needed
+hsa_status_t HSA_API hsa_agent_get_info_size(
+    hsa_agent_t agent,
+    hsa_agent_info_t attribute,
+    size_t* size);
+
+// Set the value of an agent's attribute
+// Note: this API will not be in the spec unless needed
+hsa_status_t HSA_API hsa_agent_set_info(
+    hsa_agent_t agent,
+    hsa_agent_info_t attribute,
+    void* value);
+
+*/
+
+/**
+ * @brief Exception policies for hardware exceptions. Values combine as a mask.
+ */
+typedef enum {
+    /**
+     * If a hardware exception is detected, a work-item signals an exception.
+     */
+    HSA_EXCEPTION_POLICY_BREAK = 1,
+    /**
+     * If a hardware exception is detected, a hardware status bit is set.
+     */
+    HSA_EXCEPTION_POLICY_DETECT = 2
+} hsa_exception_policy_t;
+
+/**
+ * @deprecated Use ::hsa_isa_get_exception_policies for a given instruction set
+ * architecture supported by the agent instead. If more than one ISA is
+ * supported by the agent, this function uses the first value returned by
+ * ::hsa_agent_iterate_isas.
+ *
+ * @brief Retrieve the exception policy support for a given combination of
+ * agent and profile.
+ *
+ * @param[in] agent Agent.
+ *
+ * @param[in] profile Profile.
+ *
+ * @param[out] mask Pointer to a memory location where the HSA runtime stores a
+ * mask of ::hsa_exception_policy_t values. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is not a valid
+ * profile, or @p mask is NULL.
+ *
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_agent_get_exception_policies(
+    hsa_agent_t agent,
+    hsa_profile_t profile,
+    uint16_t *mask);
+
+/**
+ * @brief Cache handle, as passed to the ::hsa_agent_iterate_caches callback.
+ */
+typedef struct hsa_cache_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_cache_t;
+
+/**
+ * @brief Cache attributes.
+ */
+typedef enum {
+  /**
+   * The length of the cache name in bytes, not including the NUL terminator.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_CACHE_INFO_NAME_LENGTH = 0,
+  /**
+   * Human-readable description. The type of this attribute is a NUL-terminated
+   * character array with the length equal to the value of
+   * ::HSA_CACHE_INFO_NAME_LENGTH attribute.
+   */
+  HSA_CACHE_INFO_NAME = 1,
+  /**
+   * Cache level. An L1 cache must return a value of 1, an L2 must return a
+   * value of 2, and so on. The type of this attribute is uint8_t.
+   */
+  HSA_CACHE_INFO_LEVEL = 2,
+  /**
+   * Cache size, in bytes. A value of 0 indicates that there is no size
+   * information available. The type of this attribute is uint32_t.
+   */
+  HSA_CACHE_INFO_SIZE = 3
+} hsa_cache_info_t;
+
+/**
+ * @brief Get the current value of an attribute for a given cache object.
+ *
+ * @param[in] cache Cache.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer that receives
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CACHE The cache is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is not a valid
+ * cache attribute (that is, not an ::hsa_cache_info_t value), or @p value is
+ * NULL.
+ */
+hsa_status_t HSA_API hsa_cache_get_info(
+    hsa_cache_t cache,
+    hsa_cache_info_t attribute,
+    void* value);
+
+/**
+ * @brief Iterate over the memory caches of a given agent, and
+ * invoke an application-defined callback on every iteration.
+ *
+ * @details Caches are visited in ascending order according to the value of the
+ * ::HSA_CACHE_INFO_LEVEL attribute.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] callback Callback to be invoked once per cache that is present in
+ * the agent. The HSA runtime passes two arguments to the callback: the cache
+ * and the application data. If @p callback returns a status other than
+ * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
+ * ::hsa_agent_iterate_caches returns that status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_agent_iterate_caches(
+    hsa_agent_t agent,
+    hsa_status_t (*callback)(hsa_cache_t cache, void* data),
+    void* data);
+
+/**
+ * @deprecated See ::hsa_agent_major_extension_supported.
+ *
+ * @brief Query if a given version of an extension is supported by an agent.
+ *
+ * @param[in] extension Extension identifier.
+ *
+ * @param[in] agent Agent.
+ *
+ * @param[in] version_major Major version number.
+ *
+ * @param[in] version_minor Minor version number.
+ *
+ * @param[out] result Pointer to a memory location where the HSA runtime stores
+ * the result of the check. The result is true if the specified version of the
+ * extension is supported, and false otherwise. The result must be false if
+ * ::hsa_system_extension_supported returns false for the same extension
+ * version.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid
+ * extension, or @p result is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_agent_extension_supported(
+    uint16_t extension,
+    hsa_agent_t agent,
+    uint16_t version_major,
+    uint16_t version_minor,
+    bool* result);
+
+/**
+ * @brief Query if a given version of an extension is supported by an agent. All
+ * minor versions from 0 up to the returned @p version_minor must be supported.
+ *
+ * @param[in] extension Extension identifier.
+ *
+ * @param[in] agent Agent.
+ *
+ * @param[in] version_major Major version number.
+ *
+ * @param[out] version_minor Pointer to the returned minor version number.
+ *
+ * @param[out] result Pointer to a memory location where the HSA runtime stores
+ * the result of the check. The result is true if the specified version of the
+ * extension is supported, and false otherwise. The result must be false if
+ * ::hsa_system_extension_supported returns false for the same extension
+ * version.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid
+ * extension, or @p version_minor is NULL, or @p result is NULL.
+ */
+hsa_status_t HSA_API hsa_agent_major_extension_supported(
+    uint16_t extension,
+    hsa_agent_t agent,
+    uint16_t version_major,
+    uint16_t *version_minor,
+    bool* result);
+
+
+/** @} */
+
+
+/** \defgroup signals Signals
+ *  @{
+ */
+
+/**
+ * @brief Signal handle.
+ */
+typedef struct hsa_signal_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal. The value 0 is reserved.
+   */
+  uint64_t handle;
+} hsa_signal_t;
+
+/**
+ * @brief Signal value. The value occupies 32 bits in small machine mode, and 64
+ * bits in large machine mode.
+ */
+#ifdef HSA_LARGE_MODEL
+  typedef int64_t hsa_signal_value_t;
+#else
+  typedef int32_t hsa_signal_value_t;
+#endif
+
+/**
+ * @brief Create a signal.
+ *
+ * @param[in] initial_value Initial value of the signal.
+ *
+ * @param[in] num_consumers Size of @p consumers. A value of 0 indicates that
+ * any agent might wait on the signal.
+ *
+ * @param[in] consumers List of agents that might consume (wait on) the
+ * signal. If @p num_consumers is 0, this argument is ignored; otherwise, the
+ * HSA runtime might use the list to optimize the handling of the signal
+ * object. If an agent not listed in @p consumers waits on the returned
+ * signal, the behavior is undefined. The memory associated with @p consumers
+ * can be reused or freed after the function returns.
+ *
+ * @param[out] signal Pointer to a memory location where the HSA runtime will
+ * store the newly created signal handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p signal is NULL, @p
+ * num_consumers is greater than 0 but @p consumers is NULL, or @p consumers
+ * contains duplicates.
+ */
+hsa_status_t HSA_API hsa_signal_create(
+    hsa_signal_value_t initial_value,
+    uint32_t num_consumers,
+    const hsa_agent_t *consumers,
+    hsa_signal_t *signal);
+
+/**
+ * @brief Destroy a signal previously created by ::hsa_signal_create.
+ *
+ * @param[in] signal Signal.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p signal is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The handle in @p signal is 0.
+ */
+hsa_status_t HSA_API hsa_signal_destroy(
+    hsa_signal_t signal);
+
+/**
+ * @brief Atomically read the current value of a signal.
+ *
+ * @param[in] signal Signal.
+ *
+ * @return Value of the signal.
+*/
+hsa_signal_value_t HSA_API hsa_signal_load_scacquire(
+    hsa_signal_t signal);
+
+/**
+ * @copydoc hsa_signal_load_scacquire
+ */
+hsa_signal_value_t HSA_API hsa_signal_load_relaxed(
+    hsa_signal_t signal);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_load_scacquire.
+ *
+ * @copydoc hsa_signal_load_scacquire
+*/
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_load_acquire(
+    hsa_signal_t signal);
+
+/**
+ * @brief Atomically set the value of a signal.
+ *
+ * @details If the value of the signal is changed, all the agents waiting
+ * on @p signal for which @p value satisfies their wait condition are awakened.
+ *
+ * @param[in] signal Signal.
+ *
+ * @param[in] value New signal value.
+ */
+void HSA_API hsa_signal_store_relaxed(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_store_relaxed
+ */
+void HSA_API hsa_signal_store_screlease(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_store_screlease.
+ *
+ * @copydoc hsa_signal_store_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_store_release(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @brief Atomically set the value of a signal without necessarily notifying the
+ * agents waiting on it.
+ *
+ * @details The agents waiting on @p signal may not wake up even when the new
+ * value satisfies their wait condition. If the application wants to update the
+ * signal and there is no need to notify any agent, invoking this function can
+ * be more efficient than calling the non-silent counterpart.
+ *
+ * @param[in] signal Signal.
+ *
+ * @param[in] value New signal value.
+ */
+void HSA_API hsa_signal_silent_store_relaxed(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_silent_store_relaxed
+ */
+void HSA_API hsa_signal_silent_store_screlease(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @brief Atomically set the value of a signal and return its previous value.
+ *
+ * @details If the value of the signal is changed, all the agents waiting
+ * on @p signal for which @p value satisfies their wait condition are awakened.
+ *
+ * @param[in] signal Signal. If @p signal is a queue doorbell signal, the
+ * behavior is undefined.
+ *
+ * @param[in] value New value.
+ *
+ * @return Value of the signal prior to the exchange.
+ *
+ */
+hsa_signal_value_t HSA_API hsa_signal_exchange_scacq_screl(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_exchange_scacq_screl.
+ *
+ * @copydoc hsa_signal_exchange_scacq_screl
+ */
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_exchange_acq_rel(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_exchange_scacq_screl
+ */
+hsa_signal_value_t HSA_API hsa_signal_exchange_scacquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_exchange_scacquire.
+ *
+ * @copydoc hsa_signal_exchange_scacquire
+ */
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_exchange_acquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_exchange_scacq_screl
+ */
+hsa_signal_value_t HSA_API hsa_signal_exchange_relaxed(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+/**
+ * @copydoc hsa_signal_exchange_scacq_screl
+ */
+hsa_signal_value_t HSA_API hsa_signal_exchange_screlease(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_exchange_screlease.
+ *
+ * @copydoc hsa_signal_exchange_screlease
+ */
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_exchange_release(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @brief Atomically set the value of a signal if the observed value is equal to
+ * the expected value. The observed value is returned regardless of whether the
+ * replacement was done.
+ *
+ * @details If the value of the signal is changed, all the agents waiting
+ * on @p signal for which @p value satisfies their wait condition are awakened.
+ *
+ * @param[in] signal Signal. If @p signal is a queue
+ * doorbell signal, the behavior is undefined.
+ *
+ * @param[in] expected Value to compare with.
+ *
+ * @param[in] value New value.
+ *
+ * @return Observed value of the signal.
+ *
+ */
+hsa_signal_value_t HSA_API hsa_signal_cas_scacq_screl(
+    hsa_signal_t signal,
+    hsa_signal_value_t expected,
+    hsa_signal_value_t value);
+
+
+/**
+ * @deprecated Renamed as ::hsa_signal_cas_scacq_screl.
+ *
+ * @copydoc hsa_signal_cas_scacq_screl
+ */
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_cas_acq_rel(
+    hsa_signal_t signal,
+    hsa_signal_value_t expected,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_cas_scacq_screl
+ */
+hsa_signal_value_t HSA_API hsa_signal_cas_scacquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t expected,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_cas_scacquire.
+ *
+ * @copydoc hsa_signal_cas_scacquire
+ */
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_cas_acquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t expected,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_cas_scacq_screl
+ */
+hsa_signal_value_t HSA_API hsa_signal_cas_relaxed(
+    hsa_signal_t signal,
+    hsa_signal_value_t expected,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_cas_scacq_screl
+ */
+hsa_signal_value_t HSA_API hsa_signal_cas_screlease(
+    hsa_signal_t signal,
+    hsa_signal_value_t expected,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_cas_screlease.
+ *
+ * @copydoc hsa_signal_cas_screlease
+ */
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_cas_release(
+    hsa_signal_t signal,
+    hsa_signal_value_t expected,
+    hsa_signal_value_t value);
+
+/**
+ * @brief Atomically increment the value of a signal by a given amount.
+ *
+ * @details If the value of the signal is changed, all the agents waiting on
+ * @p signal for which @p value satisfies their wait condition are awakened.
+ *
+ * @param[in] signal Signal. If @p signal is a queue doorbell signal, the
+ * behavior is undefined.
+ *
+ * @param[in] value Value to add to the value of the signal.
+ *
+ */
+void HSA_API hsa_signal_add_scacq_screl(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_add_scacq_screl.
+ *
+ * @copydoc hsa_signal_add_scacq_screl
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_add_acq_rel(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_add_scacq_screl
+ */
+void HSA_API hsa_signal_add_scacquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_add_scacquire.
+ *
+ * @copydoc hsa_signal_add_scacquire
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_add_acquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_add_scacq_screl
+ */
+void HSA_API hsa_signal_add_relaxed(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_add_scacq_screl
+ */
+void HSA_API hsa_signal_add_screlease(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+
+/**
+ * @deprecated Renamed as ::hsa_signal_add_screlease.
+ *
+ * @copydoc hsa_signal_add_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_add_release(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @brief Atomically decrement the value of a signal by a given amount.
+ *
+ * @details If the value of the signal is changed, all the agents waiting on
+ * @p signal for which @p value satisfies their wait condition are awakened.
+ *
+ * @param[in] signal Signal. If @p signal is a queue doorbell signal, the
+ * behavior is undefined.
+ *
+ * @param[in] value Value to subtract from the value of the signal.
+ *
+ */
+void HSA_API hsa_signal_subtract_scacq_screl(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+
+/**
+ * @deprecated Renamed as ::hsa_signal_subtract_scacq_screl.
+ *
+ * @copydoc hsa_signal_subtract_scacq_screl
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_subtract_acq_rel(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_subtract_scacq_screl
+ */
+void HSA_API hsa_signal_subtract_scacquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_subtract_scacquire.
+ *
+ * @copydoc hsa_signal_subtract_scacquire
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_subtract_acquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_subtract_scacq_screl
+ */
+void HSA_API hsa_signal_subtract_relaxed(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_subtract_scacq_screl
+ */
+void HSA_API hsa_signal_subtract_screlease(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+
+/**
+ * @deprecated Renamed as ::hsa_signal_subtract_screlease.
+ *
+ * @copydoc hsa_signal_subtract_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_subtract_release(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @brief Atomically perform a bitwise AND operation between the value of a
+ * signal and a given value.
+ *
+ * @details If the value of the signal is changed, all the agents waiting on
+ * @p signal for which @p value satisfies their wait condition are awakened.
+ *
+ * @param[in] signal Signal. If @p signal is a queue doorbell signal, the
+ * behavior is undefined.
+ *
+ * @param[in] value Value to AND with the value of the signal.
+ *
+ */
+void HSA_API hsa_signal_and_scacq_screl(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_and_scacq_screl.
+ *
+ * @copydoc hsa_signal_and_scacq_screl
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_and_acq_rel(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_and_scacq_screl
+ */
+void HSA_API hsa_signal_and_scacquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_and_scacquire.
+ *
+ * @copydoc hsa_signal_and_scacquire
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_and_acquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_and_scacq_screl
+ */
+void HSA_API hsa_signal_and_relaxed(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_and_scacq_screl
+ */
+void HSA_API hsa_signal_and_screlease(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+
+/**
+ * @deprecated Renamed as ::hsa_signal_and_screlease.
+ *
+ * @copydoc hsa_signal_and_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_and_release(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @brief Atomically perform a bitwise OR operation between the value of a
+ * signal and a given value.
+ *
+ * @details If the value of the signal is changed, all the agents waiting on
+ * @p signal for which @p value satisfies their wait condition are awakened.
+ *
+ * @param[in] signal Signal. If @p signal is a queue doorbell signal, the
+ * behavior is undefined.
+ *
+ * @param[in] value Value to OR with the value of the signal.
+ */
+void HSA_API hsa_signal_or_scacq_screl(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+
+/**
+ * @deprecated Renamed as ::hsa_signal_or_scacq_screl.
+ *
+ * @copydoc hsa_signal_or_scacq_screl
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_or_acq_rel(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_or_scacq_screl
+ */
+void HSA_API hsa_signal_or_scacquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_or_scacquire.
+ *
+ * @copydoc hsa_signal_or_scacquire
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_or_acquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_or_scacq_screl
+ */
+void HSA_API hsa_signal_or_relaxed(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_or_scacq_screl
+ */
+void HSA_API hsa_signal_or_screlease(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_or_screlease.
+ *
+ * @copydoc hsa_signal_or_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_or_release(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @brief Atomically perform a bitwise XOR operation between the value of a
+ * signal and a given value.
+ *
+ * @details If the value of the signal is changed, all the agents waiting on
+ * @p signal for which @p value satisfies their wait condition are awakened.
+ *
+ * @param[in] signal Signal. If @p signal is a queue doorbell signal, the
+ * behavior is undefined.
+ *
+ * @param[in] value Value to XOR with the value of the signal.
+ *
+ */
+void HSA_API hsa_signal_xor_scacq_screl(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+
+/**
+ * @deprecated Renamed as ::hsa_signal_xor_scacq_screl.
+ *
+ * @copydoc hsa_signal_xor_scacq_screl
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_xor_acq_rel(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_xor_scacq_screl
+ */
+void HSA_API hsa_signal_xor_scacquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_xor_scacquire.
+ *
+ * @copydoc hsa_signal_xor_scacquire
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_xor_acquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_xor_scacq_screl
+ */
+void HSA_API hsa_signal_xor_relaxed(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_xor_scacq_screl
+ */
+void HSA_API hsa_signal_xor_screlease(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_xor_screlease.
+ *
+ * @copydoc hsa_signal_xor_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_xor_release(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @brief Wait condition operator.
+ */
+typedef enum {
+    /**
+     * The two operands are equal.
+     */
+    HSA_SIGNAL_CONDITION_EQ = 0,
+    /**
+     * The two operands are not equal.
+     */
+    HSA_SIGNAL_CONDITION_NE = 1,
+    /**
+     * The first operand is less than the second operand.
+     */
+    HSA_SIGNAL_CONDITION_LT = 2,
+    /**
+     * The first operand is greater than or equal to the second operand.
+     */
+    HSA_SIGNAL_CONDITION_GTE = 3
+} hsa_signal_condition_t;
+
+/**
+ * @brief State of the application thread during a signal wait.
+ */
+typedef enum {
+    /**
+     * The application thread may be rescheduled while waiting on the signal.
+     */
+    HSA_WAIT_STATE_BLOCKED = 0,
+    /**
+     * The application thread stays active while waiting on a signal.
+     */
+    HSA_WAIT_STATE_ACTIVE = 1
+} hsa_wait_state_t;
+
+
+/**
+ * @brief Wait until a signal value satisfies a specified condition, or a
+ * certain amount of time has elapsed.
+ *
+ * @details A wait operation can spuriously resume at any time sooner than the
+ * timeout (for example, due to system or other external factors) even when the
+ * condition has not been met.
+ *
+ * The function is guaranteed to return if the signal value satisfies the
+ * condition at some point in time during the wait, but the value returned to
+ * the application might not satisfy the condition. The application must ensure
+ * that signals are used in such a way that wait wakeup conditions are not
+ * invalidated before dependent threads have woken up.
+ *
+ * When the wait operation internally loads the value of the passed signal, it
+ * uses the memory order indicated in the function name.
+ *
+ * @param[in] signal Signal.
+ *
+ * @param[in] condition Condition used to compare the signal value with @p
+ * compare_value.
+ *
+ * @param[in] compare_value Value to compare with.
+ *
+ * @param[in] timeout_hint Maximum duration of the wait.  Specified in the same
+ * unit as the system timestamp. The operation might block for a shorter or
+ * longer time even if the condition is not met. A value of UINT64_MAX indicates
+ * no maximum.
+ *
+ * @param[in] wait_state_hint Hint used by the application to indicate the
+ * preferred waiting state. The actual waiting state is ultimately decided by
+ * HSA runtime and may not match the provided hint. A value of
+ * ::HSA_WAIT_STATE_ACTIVE may improve the latency of response to a signal
+ * update by avoiding rescheduling overhead.
+ *
+ * @return Observed value of the signal, which might not satisfy the specified
+ * condition.
+ *
+*/
+hsa_signal_value_t HSA_API hsa_signal_wait_scacquire(
+    hsa_signal_t signal,
+    hsa_signal_condition_t condition,
+    hsa_signal_value_t compare_value,
+    uint64_t timeout_hint,
+    hsa_wait_state_t wait_state_hint);
+
+/**
+ * @copydoc hsa_signal_wait_scacquire
+ */
+hsa_signal_value_t HSA_API hsa_signal_wait_relaxed(
+    hsa_signal_t signal,
+    hsa_signal_condition_t condition,
+    hsa_signal_value_t compare_value,
+    uint64_t timeout_hint,
+    hsa_wait_state_t wait_state_hint);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_wait_scacquire.
+ *
+ * @copydoc hsa_signal_wait_scacquire
+ */
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_wait_acquire(
+    hsa_signal_t signal,
+    hsa_signal_condition_t condition,
+    hsa_signal_value_t compare_value,
+    uint64_t timeout_hint,
+    hsa_wait_state_t wait_state_hint);
+
+/**
+ * @brief Group of signals.
+ */
+typedef struct hsa_signal_group_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_signal_group_t;
+
+/**
+ * @brief Create a signal group.
+ *
+ * @param[in] num_signals Number of elements in @p signals. Must not be 0.
+ *
+ * @param[in] signals List of signals in the group. The list must not contain
+ * any repeated elements. Must not be NULL.
+ *
+ * @param[in] num_consumers Number of elements in @p consumers. Must not be 0.
+ *
+ * @param[in] consumers List of agents that might consume (wait on) the signal
+ * group. The list must not contain repeated elements, and must be a subset of
+ * the set of agents that are allowed to wait on all the signals in the
+ * group. If an agent not listed in @p consumers waits on the returned group,
+ * the behavior is undefined. The memory associated with @p consumers can be
+ * reused or freed after the function returns. Must not be NULL.
+ *
+ * @param[out] signal_group Pointer to newly created signal group. Must not be
+ * NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_signals is 0, @p signals
+ * is NULL, @p num_consumers is 0, @p consumers is NULL, or @p signal_group is
+ * NULL.
+ */
+hsa_status_t HSA_API hsa_signal_group_create(
+    uint32_t num_signals,
+    const hsa_signal_t *signals,
+    uint32_t num_consumers,
+    const hsa_agent_t *consumers,
+    hsa_signal_group_t *signal_group);
+
+/**
+ * @brief Destroy a signal group previously created by
+ * ::hsa_signal_group_create.
+ *
+ * @param[in] signal_group Signal group.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP @p signal_group is invalid.
+ */
+hsa_status_t HSA_API hsa_signal_group_destroy(
+    hsa_signal_group_t signal_group);
+
+/**
+ * @brief Wait until the value of at least one of the signals in a signal group
+ * satisfies its associated condition.
+ *
+ * @details The function is guaranteed to return if the value of at least one of
+ * the signals in the group satisfies its associated condition at some point in
+ * time during the wait, but the signal value returned to the application may no
+ * longer satisfy the condition. The application must ensure that signals in the
+ * group are used in such a way that wait wakeup conditions are not invalidated
+ * before dependent threads have woken up.
+ *
+ * When this operation internally loads the value of the passed signal, it uses
+ * the memory order indicated in the function name.
+ *
+ * @param[in] signal_group Signal group.
+ *
+ * @param[in] conditions List of conditions. Each condition, and the value at
+ * the same index in @p compare_values, is used to compare the value of the
+ * signal at that index in @p signal_group (the signal passed by the application
+ * to ::hsa_signal_group_create at that particular index). The size of @p
+ * conditions must not be smaller than the number of signals in @p signal_group;
+ * any extra elements are ignored. Must not be NULL.
+ *
+ * @param[in] compare_values List of comparison values.  The size of @p
+ * compare_values must not be smaller than the number of signals in @p
+ * signal_group; any extra elements are ignored. Must not be NULL.
+ *
+ * @param[in] wait_state_hint Hint used by the application to indicate the
+ * preferred waiting state. The actual waiting state is decided by the HSA runtime
+ * and may not match the provided hint. A value of ::HSA_WAIT_STATE_ACTIVE may
+ * improve the latency of response to a signal update by avoiding rescheduling
+ * overhead.
+ *
+ * @param[out] signal Signal in the group that satisfied the associated
+ * condition. If several signals satisfied their condition, the function can
+ * return any of those signals. Must not be NULL.
+ *
+ * @param[out] value Observed value for @p signal, which might no longer satisfy
+ * the specified condition. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP @p signal_group is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p conditions is NULL, @p
+ * compare_values is NULL, @p signal is NULL, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_signal_group_wait_any_scacquire(
+    hsa_signal_group_t signal_group,
+    const hsa_signal_condition_t *conditions,
+    const hsa_signal_value_t *compare_values,
+    hsa_wait_state_t wait_state_hint,
+    hsa_signal_t *signal,
+    hsa_signal_value_t *value);
+
+/**
+ * @copydoc hsa_signal_group_wait_any_scacquire
+ */
+hsa_status_t HSA_API hsa_signal_group_wait_any_relaxed(
+    hsa_signal_group_t signal_group,
+    const hsa_signal_condition_t *conditions,
+    const hsa_signal_value_t *compare_values,
+    hsa_wait_state_t wait_state_hint,
+    hsa_signal_t *signal,
+    hsa_signal_value_t *value);
+
+/** @} */
+
+/** \defgroup memory Memory
+ *  @{
+ */
+
+/**
+ * @brief A memory region represents a block of virtual memory with certain
+ * properties. For example, the HSA runtime represents fine-grained memory in
+ * the global segment using a region. A region might be associated with more
+ * than one agent.
+ */
+typedef struct hsa_region_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_region_t;
+
+/** @} */
+
+
+/** \defgroup queue Queues
+ *  @{
+ */
+
+/**
+ * @brief Queue type. Intended to be used for dynamic queue protocol
+ * determination.
+ */
+typedef enum {
+  /**
+   * Queue supports multiple producers. Use of multiproducer queue mechanics is
+   * required.
+   */
+  HSA_QUEUE_TYPE_MULTI = 0,
+  /**
+   * Queue only supports a single producer. In some scenarios, the application
+   * may want to limit the submission of AQL packets to a single agent. Queues
+   * that support a single producer may be more efficient than queues supporting
+   * multiple producers. Use of multiproducer queue mechanics is not supported.
+   */
+  HSA_QUEUE_TYPE_SINGLE = 1,
+  /**
+   * Queue supports multiple producers and cooperative dispatches. Cooperative
+   * dispatches are able to use GWS synchronization. Queues of this type may be
+   * limited in number. The runtime may return the same queue to serve multiple
+   * ::hsa_queue_create calls when this type is given. Callers must inspect the
+   * returned queue to discover queue size. Queues of this type are reference
+   * counted and require a matching number of ::hsa_queue_destroy calls to
+   * release. Use of multiproducer queue mechanics is required. See
+   * ::HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES to query agent support for this
+   * type.
+   */
+  HSA_QUEUE_TYPE_COOPERATIVE = 2
+} hsa_queue_type_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_queue_type_t constants.
+ */
+typedef uint32_t hsa_queue_type32_t;
+
+/**
+ * @brief Queue features.
+ */
+typedef enum {
+  /**
+   * Queue supports kernel dispatch packets.
+   */
+  HSA_QUEUE_FEATURE_KERNEL_DISPATCH = 1,
+
+  /**
+   * Queue supports agent dispatch packets.
+   */
+  HSA_QUEUE_FEATURE_AGENT_DISPATCH = 2
+} hsa_queue_feature_t;
+
+/**
+ * @brief User mode queue.
+ *
+ * @details The queue structure is read-only and allocated by the HSA runtime,
+ * but agents can directly modify the contents of the buffer pointed by @a
+ * base_address, or use HSA runtime APIs to access the doorbell signal.
+ *
+ */
+typedef struct hsa_queue_s {
+  /**
+   * Queue type.
+   */
+  hsa_queue_type32_t type;
+
+  /**
+   * Queue features mask. This is a bit-field of ::hsa_queue_feature_t
+   * values. Applications should ignore any unknown set bits.
+   */
+  uint32_t features;
+
+  /*
+   * The declaration of base_address depends on the machine model and
+   * endianness macros. When HSA_LARGE_MODEL is not defined, a 32-bit reserved0
+   * field is declared next to the pointer: after it if HSA_LITTLE_ENDIAN is
+   * defined, before it otherwise.
+   */
+#ifdef HSA_LARGE_MODEL
+  /**
+   * Starting address of the HSA runtime-allocated buffer used to store the AQL
+   * packets. Must be aligned to the size of an AQL packet.
+   */
+  void* base_address;
+#elif defined HSA_LITTLE_ENDIAN
+  /**
+   * Starting address of the HSA runtime-allocated buffer used to store the AQL
+   * packets. Must be aligned to the size of an AQL packet.
+   */
+  void* base_address;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved0;
+#else
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved0;
+  /**
+   * Starting address of the HSA runtime-allocated buffer used to store the AQL
+   * packets. Must be aligned to the size of an AQL packet.
+   */
+  void* base_address;
+#endif
+
+  /**
+   * Signal object used by the application to indicate the ID of a packet that
+   * is ready to be processed. The HSA runtime manages the doorbell signal. If
+   * the application tries to replace or destroy this signal, the behavior is
+   * undefined.
+   *
+   * If @a type is ::HSA_QUEUE_TYPE_SINGLE, the doorbell signal value must be
+   * updated in a monotonically increasing fashion. If @a type is
+   * ::HSA_QUEUE_TYPE_MULTI, the doorbell signal value can be updated with any
+   * value.
+   */
+  hsa_signal_t doorbell_signal;
+
+  /**
+   * Maximum number of packets the queue can hold. Must be a power of 2.
+   */
+  uint32_t size;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved1;
+  /**
+   * Queue identifier, which is unique over the lifetime of the application.
+   */
+  uint64_t id;
+
+} hsa_queue_t;
+
+/**
+ * @brief Create a user mode queue.
+ *
+ * @details The HSA runtime creates the queue structure, the underlying packet
+ * buffer, the completion signal, and the write and read indexes. The initial
+ * value of the write and read indexes is 0. The type of every packet in the
+ * buffer is initialized to ::HSA_PACKET_TYPE_INVALID.
+ *
+ * The application should only rely on the error code returned to determine if
+ * the queue is valid.
+ *
+ * @param[in] agent Agent where to create the queue.
+ *
+ * @param[in] size Number of packets the queue is expected to
+ * hold. Must be a power of 2 between 1 and the value of
+ * ::HSA_AGENT_INFO_QUEUE_MAX_SIZE in @p agent. The size of the newly
+ * created queue is the maximum of @p size and the value of
+ * ::HSA_AGENT_INFO_QUEUE_MIN_SIZE in @p agent.
+ *
+ * @param[in] type Type of the queue, a bitwise OR of hsa_queue_type_t values.
+ * If the value of ::HSA_AGENT_INFO_QUEUE_TYPE in @p agent is ::HSA_QUEUE_TYPE_SINGLE,
+ * then @p type must also be ::HSA_QUEUE_TYPE_SINGLE.
+ *
+ * @param[in] callback Callback invoked by the HSA runtime for every
+ * asynchronous event related to the newly created queue. May be NULL. The HSA
+ * runtime passes three arguments to the callback: a code identifying the event
+ * that triggered the invocation, a pointer to the queue where the event
+ * originated, and the application data.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * invocation. May be NULL.
+ *
+ * @param[in] private_segment_size Hint indicating the maximum
+ * expected private segment usage per work-item, in bytes. There may
+ * be performance degradation if the application places a kernel
+ * dispatch packet in the queue and the corresponding private segment
+ * usage exceeds @p private_segment_size. If the application does not
+ * want to specify any particular value for this argument, @p
+ * private_segment_size must be UINT32_MAX. If the queue does not
+ * support kernel dispatch packets, this argument is ignored.
+ *
+ * @param[in] group_segment_size Hint indicating the maximum expected
+ * group segment usage per work-group, in bytes. There may be
+ * performance degradation if the application places a kernel dispatch
+ * packet in the queue and the corresponding group segment usage
+ * exceeds @p group_segment_size. If the application does not want to
+ * specify any particular value for this argument, @p
+ * group_segment_size must be UINT32_MAX. If the queue does not
+ * support kernel dispatch packets, this argument is ignored.
+ *
+ * @param[out] queue Memory location where the HSA runtime stores a pointer to
+ * the newly created queue.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE_CREATION @p agent does not
+ * support queues of the given type.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is not a power of two,
+ * @p size is 0, @p type is an invalid queue type, or @p queue is NULL.
+ *
+ */
+hsa_status_t HSA_API hsa_queue_create(
+    hsa_agent_t agent,
+    uint32_t size,
+    hsa_queue_type32_t type,
+    void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
+    void *data,
+    uint32_t private_segment_size,
+    uint32_t group_segment_size,
+    hsa_queue_t **queue);
+
+/**
+ * @brief Create a queue for which the application or a kernel is responsible
+ * for processing the AQL packets.
+ *
+ * @details The application can use this function to create queues where AQL
+ * packets are not parsed by the packet processor associated with an agent,
+ * but rather by a unit of execution running on that agent (for example, a
+ * thread in the host application).
+ *
+ * The application is responsible for ensuring that all the producers and
+ * consumers of the resulting queue can access the provided doorbell signal
+ * and memory region. The application is also responsible for ensuring that the
+ * unit of execution processing the queue packets supports the indicated
+ * features (AQL packet types).
+ *
+ * When the queue is created, the HSA runtime allocates the packet buffer using
+ * @p region, and the write and read indexes. The initial value of the write and
+ * read indexes is 0, and the type of every packet in the buffer is initialized
+ * to ::HSA_PACKET_TYPE_INVALID. The values of the @e size, @e type, @e features,
+ * and @e doorbell_signal fields in the returned queue match the values passed
+ * by the application.
+ *
+ * @param[in] region Memory region that the HSA runtime should use to allocate
+ * the AQL packet buffer and any other queue metadata.
+ *
+ * @param[in] size Number of packets the queue is expected to hold. Must be a
+ * power of 2 greater than 0.
+ *
+ * @param[in] type Queue type.
+ *
+ * @param[in] features Supported queue features. This is a bit-field of
+ * ::hsa_queue_feature_t values.
+ *
+ * @param[in] doorbell_signal Doorbell signal that the HSA runtime must
+ * associate with the returned queue. The signal handle must not be 0.
+ *
+ * @param[out] queue Memory location where the HSA runtime stores a pointer to
+ * the newly created queue. The application should not rely on the value
+ * returned for this argument but only on the status code to determine if the
+ * queue is valid. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is not a power of two, @p
+ * size is 0, @p type is an invalid queue type, the doorbell signal handle is
+ * 0, or @p queue is NULL.
+ *
+ */
+hsa_status_t HSA_API hsa_soft_queue_create(
+    hsa_region_t region,
+    uint32_t size,
+    hsa_queue_type32_t type,
+    uint32_t features,
+    hsa_signal_t doorbell_signal,
+    hsa_queue_t **queue);
+
+/**
+ * @brief Destroy a user mode queue.
+ *
+ * @details When a queue is destroyed, the state of the AQL packets that have
+ * not yet been fully processed (their completion phase has not finished)
+ * becomes undefined. It is the responsibility of the application to ensure that
+ * all pending queue operations are finished if their results are required.
+ *
+ * The resources allocated by the HSA runtime during queue creation (queue
+ * structure, ring buffer, doorbell signal) are released.  The queue should not
+ * be accessed after being destroyed.
+ *
+ * @param[in] queue Pointer to a queue created using ::hsa_queue_create.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL.
+ */
+hsa_status_t HSA_API hsa_queue_destroy(
+    hsa_queue_t *queue);
+
+/**
+ * @brief Inactivate a queue.
+ *
+ * @details Inactivating the queue aborts any pending executions and prevents any
+ * new packets from being processed. Any more packets written to the queue once
+ * it is inactivated will be ignored by the packet processor.
+ *
+ * @param[in] queue Pointer to a queue.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL.
+ */
+hsa_status_t HSA_API hsa_queue_inactivate(
+    hsa_queue_t *queue);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_load_read_index_scacquire.
+ *
+ * @copydoc hsa_queue_load_read_index_scacquire
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_load_read_index_acquire(
+    const hsa_queue_t *queue);
+
+/**
+ * @brief Atomically load the read index of a queue.
+ *
+ * @param[in] queue Pointer to a queue.
+ *
+ * @return Read index of the queue pointed to by @p queue.
+ */
+uint64_t HSA_API hsa_queue_load_read_index_scacquire(
+    const hsa_queue_t *queue);
+
+/**
+ * @copydoc hsa_queue_load_read_index_scacquire
+ */
+uint64_t HSA_API hsa_queue_load_read_index_relaxed(
+    const hsa_queue_t *queue);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_load_write_index_scacquire.
+ *
+ * @copydoc hsa_queue_load_write_index_scacquire
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_load_write_index_acquire(
+    const hsa_queue_t *queue);
+
+/**
+ * @brief Atomically load the write index of a queue.
+ *
+ * @param[in] queue Pointer to a queue.
+ *
+ * @return Write index of the queue pointed to by @p queue.
+ */
+uint64_t HSA_API hsa_queue_load_write_index_scacquire(
+    const hsa_queue_t *queue);
+
+/**
+ * @copydoc hsa_queue_load_write_index_scacquire
+ */
+uint64_t HSA_API hsa_queue_load_write_index_relaxed(
+    const hsa_queue_t *queue);
+
+/**
+ * @brief Atomically set the write index of a queue.
+ *
+ * @details It is recommended that the application uses this function to update
+ * the write index when there is a single agent submitting work to the queue
+ * (the queue type is ::HSA_QUEUE_TYPE_SINGLE).
+ *
+ * @param[in] queue Pointer to a queue.
+ *
+ * @param[in] value Value to assign to the write index.
+ *
+ */
+void HSA_API hsa_queue_store_write_index_relaxed(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_store_write_index_screlease.
+ *
+ * @copydoc hsa_queue_store_write_index_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_queue_store_write_index_release(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @copydoc hsa_queue_store_write_index_relaxed
+ */
+void HSA_API hsa_queue_store_write_index_screlease(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_cas_write_index_scacq_screl.
+ *
+ * @copydoc hsa_queue_cas_write_index_scacq_screl
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_cas_write_index_acq_rel(
+    const hsa_queue_t *queue,
+    uint64_t expected,
+    uint64_t value);
+
+/**
+ * @brief Atomically set the write index of a queue if the observed value is
+ * equal to the expected value. The application can inspect the returned value
+ * to determine if the replacement was done.
+ *
+ * @param[in] queue Pointer to a queue.
+ *
+ * @param[in] expected Expected value.
+ *
+ * @param[in] value Value to assign to the write index if @p expected matches
+ * the observed write index. Must be greater than @p expected.
+ *
+ * @return Previous value of the write index.
+ */
+uint64_t HSA_API hsa_queue_cas_write_index_scacq_screl(
+    const hsa_queue_t *queue,
+    uint64_t expected,
+    uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_cas_write_index_scacquire.
+ *
+ * @copydoc hsa_queue_cas_write_index_scacquire
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_cas_write_index_acquire(
+    const hsa_queue_t *queue,
+    uint64_t expected,
+    uint64_t value);
+
+/**
+ * @copydoc hsa_queue_cas_write_index_scacq_screl
+ */
+uint64_t HSA_API hsa_queue_cas_write_index_scacquire(
+    const hsa_queue_t *queue,
+    uint64_t expected,
+    uint64_t value);
+
+/**
+ * @copydoc hsa_queue_cas_write_index_scacq_screl
+ */
+uint64_t HSA_API hsa_queue_cas_write_index_relaxed(
+    const hsa_queue_t *queue,
+    uint64_t expected,
+    uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_cas_write_index_screlease.
+ *
+ * @copydoc hsa_queue_cas_write_index_screlease
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_cas_write_index_release(
+    const hsa_queue_t *queue,
+    uint64_t expected,
+    uint64_t value);
+
+/**
+ * @copydoc hsa_queue_cas_write_index_scacq_screl
+ */
+uint64_t HSA_API hsa_queue_cas_write_index_screlease(
+    const hsa_queue_t *queue,
+    uint64_t expected,
+    uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_add_write_index_scacq_screl.
+ *
+ * @copydoc hsa_queue_add_write_index_scacq_screl
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_add_write_index_acq_rel(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @brief Atomically increment the write index of a queue by an offset.
+ *
+ * @param[in] queue Pointer to a queue.
+ *
+ * @param[in] value Value to add to the write index.
+ *
+ * @return Previous value of the write index.
+ */
+uint64_t HSA_API hsa_queue_add_write_index_scacq_screl(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_add_write_index_scacquire.
+ *
+ * @copydoc hsa_queue_add_write_index_scacquire
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_add_write_index_acquire(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @copydoc hsa_queue_add_write_index_scacq_screl
+ */
+uint64_t HSA_API hsa_queue_add_write_index_scacquire(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @copydoc hsa_queue_add_write_index_scacq_screl
+ */
+uint64_t HSA_API hsa_queue_add_write_index_relaxed(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_add_write_index_screlease.
+ *
+ * @copydoc hsa_queue_add_write_index_screlease
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_add_write_index_release(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @copydoc hsa_queue_add_write_index_scacq_screl
+ */
+uint64_t HSA_API hsa_queue_add_write_index_screlease(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @brief Atomically set the read index of a queue.
+ *
+ * @details Modifications of the read index are not allowed and result in
+ * undefined behavior if the queue is associated with an agent for which
+ * only the corresponding packet processor is permitted to update the read
+ * index.
+ *
+ * @param[in] queue Pointer to a queue.
+ *
+ * @param[in] value Value to assign to the read index.
+ *
+ */
+void HSA_API hsa_queue_store_read_index_relaxed(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_store_read_index_screlease.
+ *
+ * @copydoc hsa_queue_store_read_index_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_queue_store_read_index_release(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @copydoc hsa_queue_store_read_index_relaxed
+ */
+void HSA_API hsa_queue_store_read_index_screlease(
+   const hsa_queue_t *queue,
+   uint64_t value);
+/** @} */
+
+
+/** \defgroup aql Architected Queuing Language
+ *  @{
+ */
+
+/**
+ * @brief Packet type. Value of the ::HSA_PACKET_HEADER_TYPE header sub-field.
+ */
+typedef enum {
+  /**
+   * Vendor-specific packet.
+   */
+  HSA_PACKET_TYPE_VENDOR_SPECIFIC = 0,
+  /**
+   * The packet has been processed in the past, but has not been reassigned to
+   * the packet processor. A packet processor must not process a packet of this
+   * type. All queues support this packet type.
+   */
+  HSA_PACKET_TYPE_INVALID = 1,
+  /**
+   * Packet used by agents for dispatching jobs to kernel agents. Not all
+   * queues support packets of this type (see ::hsa_queue_feature_t).
+   */
+  HSA_PACKET_TYPE_KERNEL_DISPATCH = 2,
+  /**
+   * Packet used by agents to delay processing of subsequent packets, and to
+   * express complex dependencies between multiple packets. All queues support
+   * this packet type.
+   */
+  HSA_PACKET_TYPE_BARRIER_AND = 3,
+  /**
+   * Packet used by agents for dispatching jobs to agents. Not all
+   * queues support packets of this type (see ::hsa_queue_feature_t).
+   */
+  HSA_PACKET_TYPE_AGENT_DISPATCH = 4,
+  /**
+   * Packet used by agents to delay processing of subsequent packets, and to
+   * express complex dependencies between multiple packets. All queues support
+   * this packet type.
+   */
+  HSA_PACKET_TYPE_BARRIER_OR = 5
+} hsa_packet_type_t;
+
+/**
+ * @brief Scope of a packet's memory fence operation (see ::hsa_packet_header_t).
+ */
+typedef enum {
+  /**
+   * No scope (no fence is applied). The packet relies on external fences to
+   * ensure visibility of memory updates.
+   */
+  HSA_FENCE_SCOPE_NONE = 0,
+  /**
+   * The fence is applied with agent scope for the global segment.
+   */
+  HSA_FENCE_SCOPE_AGENT = 1,
+  /**
+   * The fence is applied across both agent and system scope for the global
+   * segment.
+   */
+  HSA_FENCE_SCOPE_SYSTEM = 2
+} hsa_fence_scope_t;
+
+/**
+ * @brief Sub-fields of the @a header field that is present in any AQL
+ * packet. The offset (with respect to the address of @a header) of a sub-field
+ * is identical to its enumeration constant. The width of each sub-field is
+ * determined by the corresponding value in ::hsa_packet_header_width_t. The
+ * offset and the width are expressed in bits.
+ */
+ typedef enum {
+  /**
+   * Packet type. The value of this sub-field must be one of
+   * ::hsa_packet_type_t. If the type is ::HSA_PACKET_TYPE_VENDOR_SPECIFIC, the
+   * packet layout is vendor-specific.
+   */
+   HSA_PACKET_HEADER_TYPE = 0,
+  /**
+   * Barrier bit. If the barrier bit is set, the processing of the current
+   * packet only launches when all preceding packets (within the same queue) are
+   * complete.
+   */
+   HSA_PACKET_HEADER_BARRIER = 8,
+  /**
+   * Acquire fence scope. The value of this sub-field determines the scope and
+   * type of the memory fence operation applied before the packet enters the
+   * active phase. An acquire fence ensures that any subsequent global segment
+   * or image loads by any unit of execution that belongs to a dispatch that has
+   * not yet entered the active phase on any queue of the same kernel agent,
+   * sees any data previously released at the scopes specified by the acquire
+   * fence. The value of this sub-field must be one of ::hsa_fence_scope_t.
+   */
+   HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE = 9,
+   /**
+    * @deprecated Renamed as ::HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE.
+    */
+   HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE = 9,
+  /**
+   * Release fence scope. The value of this sub-field determines the scope and
+   * type of the memory fence operation applied after kernel completion but
+   * before the packet is completed. A release fence makes any global segment or
+   * image data that was stored by any unit of execution that belonged to a
+   * dispatch that has completed the active phase on any queue of the same
+   * kernel agent visible in all the scopes specified by the release fence. The
+   * value of this sub-field must be one of ::hsa_fence_scope_t.
+   */
+   HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE = 11,
+   /**
+    * @deprecated Renamed as ::HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE.
+    */
+   HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE = 11
+ } hsa_packet_header_t;
+
+/**
+ * @brief Width (in bits) of the sub-fields in ::hsa_packet_header_t.
+ */
+ typedef enum {
+   HSA_PACKET_HEADER_WIDTH_TYPE = 8, /**< Width of the packet type sub-field. */
+   HSA_PACKET_HEADER_WIDTH_BARRIER = 1, /**< Width of the barrier bit. */
+   HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE = 2, /**< Width of the acquire fence scope. */
+   /**
+    * @deprecated Use HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE.
+    */
+   HSA_PACKET_HEADER_WIDTH_ACQUIRE_FENCE_SCOPE = 2,
+   HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE = 2, /**< Width of the release fence scope. */
+   /**
+    * @deprecated Use HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE.
+    */
+   HSA_PACKET_HEADER_WIDTH_RELEASE_FENCE_SCOPE = 2
+ } hsa_packet_header_width_t;
+
+/**
+ * @brief Sub-fields of the kernel dispatch packet @a setup field. The offset
+ * (with respect to the address of @a setup) of a sub-field is identical to its
+ * enumeration constant. The width of each sub-field is determined by the
+ * corresponding value in ::hsa_kernel_dispatch_packet_setup_width_t. The
+ * offset and the width are expressed in bits.
+ */
+ typedef enum {
+  /**
+   * Number of dimensions of the grid. Valid values are 1, 2, or 3.
+   * Its width is ::HSA_KERNEL_DISPATCH_PACKET_SETUP_WIDTH_DIMENSIONS bits.
+   */
+   HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS = 0
+ } hsa_kernel_dispatch_packet_setup_t;
+
+/**
+ * @brief Width (in bits) of the sub-fields in
+ * ::hsa_kernel_dispatch_packet_setup_t.
+ */
+ typedef enum {
+   HSA_KERNEL_DISPATCH_PACKET_SETUP_WIDTH_DIMENSIONS = 2 /**< Width of the dimensions sub-field. */
+ } hsa_kernel_dispatch_packet_setup_width_t;
+
+/**
+ * @brief AQL kernel dispatch packet.
+ */
+typedef struct hsa_kernel_dispatch_packet_s {
+  union {
+    struct {
+        /**
+         * Packet header. Used to configure multiple packet parameters such as the
+         * packet type. The parameters are described by ::hsa_packet_header_t.
+         */
+        uint16_t header;
+
+        /**
+         * Dispatch setup parameters. Used to configure kernel dispatch parameters
+         * such as the number of dimensions in the grid. The parameters are described
+         * by ::hsa_kernel_dispatch_packet_setup_t.
+         */
+        uint16_t setup;
+    };
+    uint32_t full_header; /**< @a header and @a setup viewed as one 32-bit word. */
+  };
+
+  /**
+   * X dimension of work-group, in work-items. Must be greater than 0.
+   */
+  uint16_t workgroup_size_x;
+
+  /**
+   * Y dimension of work-group, in work-items. Must be greater than
+   * 0. If the grid has 1 dimension, the only valid value is 1.
+   */
+  uint16_t workgroup_size_y;
+
+  /**
+   * Z dimension of work-group, in work-items. Must be greater than
+   * 0. If the grid has 1 or 2 dimensions, the only valid value is 1.
+   */
+  uint16_t workgroup_size_z;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint16_t reserved0;
+
+  /**
+   * X dimension of grid, in work-items. Must be greater than 0. Must
+   * not be smaller than @a workgroup_size_x.
+   */
+  uint32_t grid_size_x;
+
+  /**
+   * Y dimension of grid, in work-items. Must be greater than 0. If the grid has
+   * 1 dimension, the only valid value is 1. Must not be smaller than @a
+   * workgroup_size_y.
+   */
+  uint32_t grid_size_y;
+
+  /**
+   * Z dimension of grid, in work-items. Must be greater than 0. If the grid has
+   * 1 or 2 dimensions, the only valid value is 1. Must not be smaller than @a
+   * workgroup_size_z.
+   */
+  uint32_t grid_size_z;
+
+  /**
+   * Size in bytes of private memory allocation request (per work-item).
+   */
+  uint32_t private_segment_size;
+
+  /**
+   * Size in bytes of group memory allocation request (per work-group). Must not
+   * be less than the sum of the group memory used by the kernel (and the
+   * functions it calls directly or indirectly) and the dynamically allocated
+   * group segment variables.
+   */
+  uint32_t group_segment_size;
+
+  /**
+   * Opaque handle to a code object that includes an implementation-defined
+   * executable code for the kernel.
+   */
+  uint64_t kernel_object;
+
+#ifdef HSA_LARGE_MODEL
+  void* kernarg_address; /**< Pointer to the kernel argument buffer. May be NULL. */
+#elif defined HSA_LITTLE_ENDIAN
+  /**
+   * Pointer to a buffer containing the kernel arguments. May be NULL.
+   *
+   * The buffer must be allocated using ::hsa_memory_allocate, and must not be
+   * modified once the kernel dispatch packet is enqueued until the dispatch has
+   * completed execution.
+   */
+  void* kernarg_address;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved1;
+#else
+  uint32_t reserved1; /**< Reserved. Must be 0. */
+  void* kernarg_address; /**< Pointer to the kernel argument buffer. May be NULL. */
+#endif
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved2;
+
+  /**
+   * Signal used to indicate completion of the job. The application can use the
+   * special signal handle 0 to indicate that no signal is used.
+   */
+  hsa_signal_t completion_signal;
+
+} hsa_kernel_dispatch_packet_t;
+
+/**
+ * @brief Agent dispatch packet.
+ */
+typedef struct hsa_agent_dispatch_packet_s {
+  /**
+   * Packet header. Used to configure multiple packet parameters such as the
+   * packet type. The parameters are described by ::hsa_packet_header_t.
+   */
+  uint16_t header;
+
+  /**
+   * Application-defined function to be performed by the destination agent.
+   */
+  uint16_t type;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved0;
+
+#ifdef HSA_LARGE_MODEL
+  void* return_address; /**< Address where to store the function return values, if any. */
+#elif defined HSA_LITTLE_ENDIAN
+  /**
+   * Address where to store the function return values, if any.
+   */
+  void* return_address;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved1;
+#else
+  uint32_t reserved1; /**< Reserved. Must be 0. */
+  void* return_address; /**< Address where to store the function return values, if any. */
+#endif
+
+  /**
+   * Function arguments.
+   */
+  uint64_t arg[4];
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved2;
+
+  /**
+   * Signal used to indicate completion of the job. The application can use the
+   * special signal handle 0 to indicate that no signal is used.
+   */
+  hsa_signal_t completion_signal;
+
+} hsa_agent_dispatch_packet_t;
+
+/**
+ * @brief Barrier-AND packet. Carries up to five dependency signals.
+ */
+typedef struct hsa_barrier_and_packet_s {
+  /**
+   * Packet header. Used to configure multiple packet parameters such as the
+   * packet type. The parameters are described by ::hsa_packet_header_t.
+   */
+  uint16_t header;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint16_t reserved0;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved1;
+
+  /**
+   * Array of dependent signal objects. Signals with a handle value of 0 are
+   * allowed and are interpreted by the packet processor as satisfied
+   * dependencies.
+   */
+  hsa_signal_t dep_signal[5];
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved2;
+
+  /**
+   * Signal used to indicate completion of the job. The application can use the
+   * special signal handle 0 to indicate that no signal is used.
+   */
+  hsa_signal_t completion_signal;
+
+} hsa_barrier_and_packet_t;
+
+/**
+ * @brief Barrier-OR packet. Carries up to five dependency signals.
+ */
+typedef struct hsa_barrier_or_packet_s {
+  /**
+   * Packet header. Used to configure multiple packet parameters such as the
+   * packet type. The parameters are described by ::hsa_packet_header_t.
+   */
+  uint16_t header;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint16_t reserved0;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved1;
+
+  /**
+   * Array of dependent signal objects. Signals with a handle value of 0 are
+   * allowed and are interpreted by the packet processor as dependencies not
+   * satisfied.
+   */
+  hsa_signal_t dep_signal[5];
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved2;
+
+  /**
+   * Signal used to indicate completion of the job. The application can use the
+   * special signal handle 0 to indicate that no signal is used.
+   */
+  hsa_signal_t completion_signal;
+
+} hsa_barrier_or_packet_t;
+
+/** @} */
+
+/** \addtogroup memory Memory
+ *  @{
+ */
+
+/**
+ * @brief Memory segments associated with a region.
+ */
+typedef enum {
+  /**
+   * Global segment. Used to hold data that is shared by all agents.
+   */
+  HSA_REGION_SEGMENT_GLOBAL = 0,
+  /**
+   * Read-only segment. Used to hold data that remains constant during the
+   * execution of a kernel.
+   */
+  HSA_REGION_SEGMENT_READONLY = 1,
+  /**
+   * Private segment. Used to hold data that is local to a single work-item.
+   */
+  HSA_REGION_SEGMENT_PRIVATE = 2,
+  /**
+   * Group segment. Used to hold data that is shared by the work-items of a
+   * work-group.
+   */
+  HSA_REGION_SEGMENT_GROUP = 3,
+  /**
+   * Kernarg segment. Used to store kernel arguments.
+   */
+  HSA_REGION_SEGMENT_KERNARG = 4
+} hsa_region_segment_t;
+
+/**
+ * @brief Global region flags.
+ */
+typedef enum {
+  /**
+   * The application can use memory in the region to store kernel arguments, and
+   * provide the values for the kernarg segment of a kernel dispatch. If this
+   * flag is set, then ::HSA_REGION_GLOBAL_FLAG_FINE_GRAINED must be set.
+   */
+  HSA_REGION_GLOBAL_FLAG_KERNARG = 1,
+  /**
+   * Updates to memory in this region are immediately visible to all the
+   * agents under the terms of the HSA memory model. If this
+   * flag is set, then ::HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED must not be set.
+   */
+  HSA_REGION_GLOBAL_FLAG_FINE_GRAINED = 2,
+  /**
+   * Updates to memory in this region can be performed by a single agent at
+   * a time. If a different agent in the system is allowed to access the
+   * region, the application must explicitly invoke ::hsa_memory_assign_agent
+   * in order to transfer ownership to that agent for a particular buffer.
+   */
+  HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED = 4,
+
+  /**
+   * Updates to memory in this region have extended scope, where the device-scope atomics
+   * to this memory type act as system-scope with respect to all variables located in
+   * memory regions of this type.
+   * Note: On non-compliant systems, the application may still be responsible for performing
+   * device-specific actions necessary to achieve system-scope coherence.
+   */
+  HSA_REGION_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED = 8
+} hsa_region_global_flag_t;
+
+/**
+ * @brief Attributes of a memory region.
+ */
+
+#ifdef __cplusplus
+typedef enum : int { /* C++ only: fixes the enum's underlying type to int */
+#else
+typedef enum {
+#endif
+  /**
+   * Segment where memory in the region can be used. The type of this
+   * attribute is ::hsa_region_segment_t.
+   */
+  HSA_REGION_INFO_SEGMENT = 0,
+  /**
+   * Flag mask. The value of this attribute is undefined if the value of
+   * ::HSA_REGION_INFO_SEGMENT is not ::HSA_REGION_SEGMENT_GLOBAL. The type of
+   * this attribute is uint32_t, a bit-field of ::hsa_region_global_flag_t
+   * values.
+   */
+  HSA_REGION_INFO_GLOBAL_FLAGS = 1,
+  /**
+   * Size of this region, in bytes. The type of this attribute is size_t.
+   */
+  HSA_REGION_INFO_SIZE = 2,
+  /**
+   * Maximum allocation size in this region, in bytes. Must not exceed the value
+   * of ::HSA_REGION_INFO_SIZE. The type of this attribute is size_t.
+   *
+   * If the region is in the global or readonly segments, this is the maximum
+   * size that the application can pass to ::hsa_memory_allocate.
+   *
+   * If the region is in the group segment, this is the maximum size (per
+   * work-group) that can be requested for a given kernel dispatch. If the
+   * region is in the private segment, this is the maximum size (per work-item)
+   * that can be requested for a specific kernel dispatch, and must be at least
+   * 256 bytes.
+   */
+  HSA_REGION_INFO_ALLOC_MAX_SIZE = 4,
+  /**
+   * Maximum size (per work-group) of private memory that can be requested for a
+   * specific kernel dispatch. Must be at least 65536 bytes. The type of this
+   * attribute is uint32_t. The value of this attribute is undefined if the
+   * region is not in the private segment.
+   */
+  HSA_REGION_INFO_ALLOC_MAX_PRIVATE_WORKGROUP_SIZE = 8,
+  /**
+   * Indicates whether memory in this region can be allocated using
+   * ::hsa_memory_allocate. The type of this attribute is bool.
+   *
+   * The value of this flag is always false for regions in the group and private
+   * segments.
+   */
+  HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED = 5,
+  /**
+   * Allocation granularity of buffers allocated by ::hsa_memory_allocate in
+   * this region. The size of a buffer allocated in this region is a multiple of
+   * the value of this attribute. The value of this attribute is only defined if
+   * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED is true for this region. The type
+   * of this attribute is size_t.
+   */
+  HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE = 6,
+  /**
+   * Alignment of buffers allocated by ::hsa_memory_allocate in this region. The
+   * value of this attribute is only defined if
+   * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED is true for this region, and must be
+   * a power of 2. The type of this attribute is size_t.
+   */
+  HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT = 7
+} hsa_region_info_t;
+
+/**
+ * @brief Get the current value of an attribute of a region.
+ *
+ * @param[in] region A valid region.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_REGION The region is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * region attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_region_get_info(
+    hsa_region_t region,
+    hsa_region_info_t attribute,
+    void* value);
+
+/**
+ * @brief Iterate over the memory regions associated with a given agent, and
+ * invoke an application-defined callback on every iteration.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] callback Callback to be invoked once per region that is
+ * accessible from the agent.  The HSA runtime passes two arguments to the
+ * callback, the region and the application data.  If @p callback returns a
+ * status other than ::HSA_STATUS_SUCCESS for a particular iteration, the
+ * traversal stops and ::hsa_agent_iterate_regions returns that status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_agent_iterate_regions(
+    hsa_agent_t agent,
+    hsa_status_t (*callback)(hsa_region_t region, void* data),
+    void* data);
+
+/**
+ * @brief Allocate a block of memory in a given region.
+ *
+ * @param[in] region Region where to allocate memory from. The region must have
+ * the ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED flag set.
+ *
+ * @param[in] size Allocation size, in bytes. Must not be zero. This value is
+ * rounded up to the nearest multiple of ::HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE
+ * in @p region.
+ *
+ * @param[out] ptr Pointer to the location where to store the base address of
+ * the allocated block. The returned base address is aligned to the value of
+ * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT in @p region. If the allocation
+ * fails, the returned value is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_REGION The region is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The host is not allowed to
+ * allocate memory in @p region, or @p size is greater than the value of
+ * ::HSA_REGION_INFO_ALLOC_MAX_SIZE in @p region.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p size is 0.
+ */
+hsa_status_t HSA_API hsa_memory_allocate(hsa_region_t region,
+    size_t size,
+    void** ptr);
+
+/**
+ * @brief Deallocate a block of memory previously allocated using
+ * ::hsa_memory_allocate.
+ *
+ * @param[in] ptr Pointer to a memory block. If @p ptr does not match a value
+ * previously returned by ::hsa_memory_allocate, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ */
+hsa_status_t HSA_API hsa_memory_free(void* ptr);
+
+/**
+ * @brief Copy a block of memory from the location pointed to by @p src to the
+ * memory block pointed to by @p dst.
+ *
+ * @param[out] dst Buffer where the content is to be copied. If @p dst is in
+ * coarse-grained memory, the copied data is only visible to the agent currently
+ * assigned (::hsa_memory_assign_agent) to @p dst.
+ *
+ * @param[in] src A valid pointer to the source of data to be copied. The source
+ * buffer must not overlap with the destination buffer. If the source buffer is
+ * in coarse-grained memory then it must be assigned to an agent, from which the
+ * data will be retrieved.
+ *
+ * @param[in] size Number of bytes to copy. If @p size is 0, no copy is
+ * performed and the function returns success. Copying a number of bytes larger
+ * than the size of the buffers pointed to by @p dst or @p src results in
+ * undefined behavior.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The source or destination
+ * pointers are NULL.
+ */
+hsa_status_t HSA_API hsa_memory_copy(
+    void *dst,
+    const void *src,
+    size_t size);
+
+/**
+ * @brief Change the ownership of a global, coarse-grained buffer.
+ *
+ * @details The contents of a coarse-grained buffer are visible to an agent
+ * only after ownership has been explicitly transferred to that agent. Once the
+ * operation completes, the previous owner can no longer access the data in the
+ * buffer.
+ *
+ * An implementation of the HSA runtime is allowed, but not required, to change
+ * the physical location of the buffer when ownership is transferred to a
+ * different agent. In general the application must not assume this
+ * behavior. The virtual location (address) of the passed buffer is never
+ * modified.
+ *
+ * @param[in] ptr Base address of a global buffer. The pointer must match an
+ * address previously returned by ::hsa_memory_allocate. The size of the buffer
+ * affected by the ownership change is identical to the size of that previous
+ * allocation. If @p ptr points to a fine-grained global buffer, no operation is
+ * performed and the function returns success. If @p ptr does not point to
+ * global memory, the behavior is undefined.
+ *
+ * @param[in] agent Agent that becomes the owner of the buffer. The
+ * application is responsible for ensuring that @p agent has access to the
+ * region that contains the buffer. It is allowed to change ownership to an
+ * agent that is already the owner of the buffer, with the same or different
+ * access permissions.
+ *
+ * @param[in] access Access permissions requested for the new owner.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p access is
+ * not a valid access value.
+ */
+hsa_status_t HSA_API hsa_memory_assign_agent(
+    void *ptr,
+    hsa_agent_t agent,
+    hsa_access_permission_t access);
+
+/**
+ *
+ * @brief Register a global, fine-grained buffer.
+ *
+ * @details Registering a buffer serves as an indication to the HSA runtime that
+ * the memory might be accessed from a kernel agent other than the
+ * host. Registration is a performance hint that allows the HSA runtime
+ * implementation to know which buffers will be accessed by some of the kernel
+ * agents ahead of time.
+ *
+ * Registration is only recommended for buffers in the global segment that have
+ * not been allocated using the HSA allocator (::hsa_memory_allocate), but an OS
+ * allocator instead. Registering an OS-allocated buffer in the base profile is
+ * equivalent to a no-op.
+ *
+ * Registrations should not overlap.
+ *
+ * @param[in] ptr A buffer in global, fine-grained memory. If a NULL pointer is
+ * passed, no operation is performed. If the buffer has been allocated using
+ * ::hsa_memory_allocate, or has already been registered, no operation is
+ * performed.
+ *
+ * @param[in] size Requested registration size in bytes. A size of 0 is
+ * only allowed if @p ptr is NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0 but @p ptr
+ * is not NULL.
+ */
+hsa_status_t HSA_API hsa_memory_register(
+    void *ptr,
+    size_t size);
+
+/**
+ *
+ * @brief Deregister memory previously registered using ::hsa_memory_register.
+ *
+ * @details If the memory interval being deregistered does not match a previous
+ * registration (start and end addresses), the behavior is undefined.
+ *
+ * @param[in] ptr A pointer to the base of the buffer to be deregistered. If
+ * a NULL pointer is passed, no operation is performed.
+ *
+ * @param[in] size Size of the buffer to be deregistered.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ */
+hsa_status_t HSA_API hsa_memory_deregister(
+    void *ptr,
+    size_t size);
+
+/** @} */
+
+
+/** \defgroup instruction-set-architecture Instruction Set Architecture.
+ *  @{
+ */
+
+/**
+ * @brief Instruction set architecture.
+ */
+typedef struct hsa_isa_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_isa_t;
+
+/**
+ * @brief Retrieve a reference to an instruction set architecture handle out of
+ * a symbolic name.
+ *
+ * @param[in] name Vendor-specific name associated with a particular
+ * instruction set architecture. @p name must start with the vendor name and a
+ * colon (for example, "AMD:"). The rest of the name is vendor-specific. Must be
+ * a NUL-terminated string.
+ *
+ * @param[out] isa Memory location where the HSA runtime stores the ISA handle
+ * corresponding to the given name. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA_NAME The given name does not
+ * correspond to any instruction set architecture.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p name is NULL, or @p isa is
+ * NULL.
+ */
+hsa_status_t HSA_API hsa_isa_from_name(
+    const char *name,
+    hsa_isa_t *isa);
+
+/**
+ * @brief Iterate over the instruction sets supported by the given agent, and
+ * invoke an application-defined callback on every iteration. The iterator is
+ * deterministic: if an agent supports several instruction set architectures,
+ * they are traversed in the same order in every invocation of this function.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] callback Callback to be invoked once per instruction set
+ * architecture.  The HSA runtime passes two arguments to the callback: the
+ * ISA and the application data.  If @p callback returns a status other than
+ * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
+ * that status value is returned.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_agent_iterate_isas(
+    hsa_agent_t agent,
+    hsa_status_t (*callback)(hsa_isa_t isa, void *data),
+    void *data);
+
+/**
+ * @brief Instruction set architecture attributes.
+ *
+ * @note The enumerator values are not contiguous: 10, 11, and 15 are not
+ * assigned in this enumeration.
+ */
+typedef enum {
+  /**
+   * The length of the ISA name in bytes, not including the NUL terminator. The
+   * type of this attribute is uint32_t.
+   */
+  HSA_ISA_INFO_NAME_LENGTH = 0,
+  /**
+   * Human-readable description.  The type of this attribute is character array
+   * with the length equal to the value of ::HSA_ISA_INFO_NAME_LENGTH attribute.
+   */
+  HSA_ISA_INFO_NAME = 1,
+  /**
+   * @deprecated
+   *
+   * Number of call conventions supported by the instruction set architecture.
+   * Must be greater than zero. The type of this attribute is uint32_t.
+   */
+  HSA_ISA_INFO_CALL_CONVENTION_COUNT = 2,
+  /**
+   * @deprecated
+   *
+   * Number of work-items in a wavefront for a given call convention. Must be a
+   * power of 2 in the range [1,256]. The type of this attribute is uint32_t.
+   */
+  HSA_ISA_INFO_CALL_CONVENTION_INFO_WAVEFRONT_SIZE = 3,
+  /**
+   * @deprecated
+   *
+   * Number of wavefronts per compute unit for a given call convention. In
+   * practice, other factors (for example, the amount of group memory used by a
+   * work-group) may further limit the number of wavefronts per compute
+   * unit. The type of this attribute is uint32_t.
+   */
+  HSA_ISA_INFO_CALL_CONVENTION_INFO_WAVEFRONTS_PER_COMPUTE_UNIT = 4,
+  /**
+   * Machine models supported by the instruction set architecture. The type of
+   * this attribute is a bool[2]. If the ISA supports the small machine model,
+   * the element at index ::HSA_MACHINE_MODEL_SMALL is true. If the ISA supports
+   * the large model, the element at index ::HSA_MACHINE_MODEL_LARGE is true.
+   */
+  HSA_ISA_INFO_MACHINE_MODELS = 5,
+  /**
+   * Profiles supported by the instruction set architecture. The type of this
+   * attribute is a bool[2]. If the ISA supports the base profile, the element
+   * at index ::HSA_PROFILE_BASE is true. If the ISA supports the full profile,
+   * the element at index ::HSA_PROFILE_FULL is true.
+   */
+  HSA_ISA_INFO_PROFILES = 6,
+  /**
+   * Default floating-point rounding modes supported by the instruction set
+   * architecture. The type of this attribute is a bool[3]. The value at a given
+   * index is true if the corresponding rounding mode in
+   * ::hsa_default_float_rounding_mode_t is supported. At least one default mode
+   * has to be supported.
+   *
+   * If the default mode is supported, then
+   * ::HSA_ISA_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES must report that
+   * both the zero and the near rounding modes are supported.
+   */
+  HSA_ISA_INFO_DEFAULT_FLOAT_ROUNDING_MODES = 7,
+  /**
+   * Default floating-point rounding modes supported by the instruction set
+   * architecture in the Base profile. The type of this attribute is a
+   * bool[3]. The value at a given index is true if the corresponding rounding
+   * mode in ::hsa_default_float_rounding_mode_t is supported. The value at
+   * index ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT must be false.  At least
+   * one of the values at indexes ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO or
+   * ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR must be true.
+   */
+  HSA_ISA_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES = 8,
+  /**
+   * Flag indicating that the f16 HSAIL operation is at least as fast as the
+   * f32 operation in the instruction set architecture. The type of this
+   * attribute is bool.
+   */
+  HSA_ISA_INFO_FAST_F16_OPERATION = 9,
+  /**
+   * Maximum number of work-items of each dimension of a work-group.  Each
+   * maximum must be greater than 0. No maximum can exceed the value of
+   * ::HSA_ISA_INFO_WORKGROUP_MAX_SIZE. The type of this attribute is
+   * uint16_t[3].
+   */
+  HSA_ISA_INFO_WORKGROUP_MAX_DIM = 12,
+  /**
+   * Maximum total number of work-items in a work-group. The type
+   * of this attribute is uint32_t.
+   */
+  HSA_ISA_INFO_WORKGROUP_MAX_SIZE = 13,
+  /**
+   * Maximum number of work-items of each dimension of a grid. Each maximum must
+   * be greater than 0, and must not be smaller than the corresponding value in
+   * ::HSA_ISA_INFO_WORKGROUP_MAX_DIM. No maximum can exceed the value of
+   * ::HSA_ISA_INFO_GRID_MAX_SIZE. The type of this attribute is
+   * ::hsa_dim3_t.
+   */
+  HSA_ISA_INFO_GRID_MAX_DIM = 14,
+  /**
+   * Maximum total number of work-items in a grid. The type of this
+   * attribute is uint64_t.
+   */
+  HSA_ISA_INFO_GRID_MAX_SIZE = 16,
+  /**
+   * Maximum number of fbarriers per work-group. Must be at least 32. The
+   * type of this attribute is uint32_t.
+   */
+  HSA_ISA_INFO_FBARRIER_MAX_SIZE = 17
+} hsa_isa_info_t;
+
+/**
+ * @deprecated The concept of call convention has been deprecated. If the
+ * application wants to query the value of an attribute for a given instruction
+ * set architecture, use ::hsa_isa_get_info_alt instead. If the application
+ * wants to query an attribute that is specific to a given combination of ISA
+ * and wavefront, use ::hsa_wavefront_get_info.
+ *
+ * @brief Get the current value of an attribute for a given instruction set
+ * architecture (ISA).
+ *
+ * @param[in] isa A valid instruction set architecture.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[in] index Call convention index. Used only for call convention
+ * attributes, otherwise ignored. Must have a value between 0 (inclusive) and
+ * the value of the attribute ::HSA_ISA_INFO_CALL_CONVENTION_COUNT (not
+ * inclusive) in @p isa.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_INDEX The index is out of range.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * instruction set architecture attribute, or @p value is
+ * NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_isa_get_info(
+    hsa_isa_t isa,
+    hsa_isa_info_t attribute,
+    uint32_t index,
+    void *value);
+
+/**
+ * @brief Get the current value of an attribute for a given instruction set
+ * architecture (ISA).
+ *
+ * @param[in] isa A valid instruction set architecture.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * instruction set architecture attribute, or @p value is
+ * NULL.
+ */
+hsa_status_t HSA_API hsa_isa_get_info_alt(
+    hsa_isa_t isa,
+    hsa_isa_info_t attribute,
+    void *value);
+
+/**
+ * @brief Retrieve the exception policy support for a given combination of
+ * instruction set architecture and profile.
+ *
+ * @param[in] isa A valid instruction set architecture.
+ *
+ * @param[in] profile Profile.
+ *
+ * @param[out] mask Pointer to a memory location where the HSA runtime stores a
+ * mask of ::hsa_exception_policy_t values. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is not a valid
+ * profile, or @p mask is NULL.
+ */
+hsa_status_t HSA_API hsa_isa_get_exception_policies(
+    hsa_isa_t isa,
+    hsa_profile_t profile,
+    uint16_t *mask);
+
+/**
+ * @brief Floating-point types.
+ */
+typedef enum {
+  /**
+   * 16-bit floating-point type.
+   */
+  HSA_FP_TYPE_16 = 1,
+  /**
+   * 32-bit floating-point type.
+   */
+  HSA_FP_TYPE_32 = 2,
+  /**
+   * 64-bit floating-point type.
+   */
+  HSA_FP_TYPE_64 = 4
+} hsa_fp_type_t;
+
+/**
+ * @brief Flush to zero modes.
+ */
+typedef enum {
+  /**
+   * Flush to zero.
+   */
+  HSA_FLUSH_MODE_FTZ = 1,
+  /**
+   * Do not flush to zero.
+   */
+  HSA_FLUSH_MODE_NON_FTZ = 2
+} hsa_flush_mode_t;
+
+/**
+ * @brief Round methods.
+ */
+typedef enum {
+  /**
+   * Single round method.
+   */
+  HSA_ROUND_METHOD_SINGLE = 1,
+  /**
+   * Double round method.
+   */
+  HSA_ROUND_METHOD_DOUBLE = 2
+} hsa_round_method_t;
+
+/**
+ * @brief Retrieve the round method (single or double) used to implement the
+ * floating-point multiply add instruction (mad) for a given combination of
+ * instruction set architecture, floating-point type, and flush to zero
+ * modifier.
+ *
+ * @param[in] isa Instruction set architecture.
+ *
+ * @param[in] fp_type Floating-point type.
+ *
+ * @param[in] flush_mode Flush to zero modifier.
+ *
+ * @param[out] round_method Pointer to a memory location where the HSA
+ * runtime stores the round method used by the implementation. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p fp_type is not a valid
+ * floating-point type, or @p flush_mode is not a valid flush to zero modifier,
+ * or @p round_method is NULL.
+ */
+hsa_status_t HSA_API hsa_isa_get_round_method(
+    hsa_isa_t isa,
+    hsa_fp_type_t fp_type,
+    hsa_flush_mode_t flush_mode,
+    hsa_round_method_t *round_method);
+
+/**
+ * @brief Wavefront handle
+ */
+typedef struct hsa_wavefront_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_wavefront_t;
+
+/**
+ * @brief Wavefront attributes.
+ */
+typedef enum {
+  /**
+   * Number of work-items in the wavefront. Must be a power of 2 in the range
+   * [1,256]. The type of this attribute is uint32_t.
+   */
+  HSA_WAVEFRONT_INFO_SIZE = 0
+} hsa_wavefront_info_t;
+
+/**
+ * @brief Get the current value of a wavefront attribute.
+ *
+ * @param[in] wavefront A wavefront.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_WAVEFRONT The wavefront is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * wavefront attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_wavefront_get_info(
+    hsa_wavefront_t wavefront,
+    hsa_wavefront_info_t attribute,
+    void *value);
+
+/**
+ * @brief Iterate over the different wavefronts supported by an instruction set
+ * architecture, and invoke an application-defined callback on every iteration.
+ *
+ * @param[in] isa Instruction set architecture.
+ *
+ * @param[in] callback Callback to be invoked once per wavefront that is
+ * supported by @p isa. The HSA runtime passes two arguments to the callback:
+ * the wavefront handle and the application data.  If @p callback returns a
+ * status other than ::HSA_STATUS_SUCCESS for a particular iteration, the
+ * traversal stops and that value is returned.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_isa_iterate_wavefronts(
+    hsa_isa_t isa,
+    hsa_status_t (*callback)(hsa_wavefront_t wavefront, void *data),
+    void *data);
+
+/**
+ * @deprecated Use ::hsa_agent_iterate_isas to query which instruction set
+ * architectures are supported by a given agent.
+ *
+ * @brief Check if the instruction set architecture of a code object can be
+ * executed on an agent associated with another architecture.
+ *
+ * @param[in] code_object_isa Instruction set architecture associated with a
+ * code object.
+ *
+ * @param[in] agent_isa Instruction set architecture associated with an agent.
+ *
+ * @param[out] result Pointer to a memory location where the HSA runtime stores
+ * the result of the check. If the two architectures are compatible, the result
+ * is true; if they are incompatible, the result is false.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA @p code_object_isa or @p agent_isa are
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_isa_compatible(
+    hsa_isa_t code_object_isa,
+    hsa_isa_t agent_isa,
+    bool *result);
+
+/** @} */
+
+
+/** \defgroup executable Executable
+ *  @{
+ */
+
+/**
+ * @brief Code object reader handle. A code object reader is used to
+ * load a code object from file (when created using
+ * ::hsa_code_object_reader_create_from_file), or from memory (if created using
+ * ::hsa_code_object_reader_create_from_memory).
+ */
+typedef struct hsa_code_object_reader_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_code_object_reader_t;
+
+/**
+ * @brief Create a code object reader to operate on a file.
+ *
+ * @param[in] file File descriptor. The file must have been opened by the
+ * application with at least read permissions prior to calling this function.
+ * The file must contain a vendor-specific code object.
+ *
+ * The file is owned and managed by the application; the lifetime of the file
+ * descriptor must exceed that of any associated code object reader.
+ *
+ * @param[out] code_object_reader Memory location to store the newly created
+ * code object reader handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_FILE @p file is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p code_object_reader is NULL.
+ */
+hsa_status_t HSA_API hsa_code_object_reader_create_from_file(
+    hsa_file_t file,
+    hsa_code_object_reader_t *code_object_reader);
+
+/**
+ * @brief Create a code object reader to operate on memory.
+ *
+ * @param[in] code_object Memory buffer that contains a vendor-specific code
+ * object. The buffer is owned and managed by the application; the lifetime of
+ * the buffer must exceed that of any associated code object reader.
+ *
+ * @param[in] size Size of the buffer pointed to by @p code_object. Must not be
+ * 0.
+ *
+ * @param[out] code_object_reader Memory location to store the newly created
+ * code object reader handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p code_object is NULL, @p size
+ * is zero, or @p code_object_reader is NULL.
+ */
+hsa_status_t HSA_API hsa_code_object_reader_create_from_memory(
+    const void *code_object,
+    size_t size,
+    hsa_code_object_reader_t *code_object_reader);
+
+/**
+ * @brief Destroy a code object reader.
+ *
+ * @details The code object reader handle becomes invalid after completion of
+ * this function. Any file or memory used to create the code object reader is
+ * not closed, removed, or deallocated by this function.
+ *
+ * @param[in] code_object_reader Code object reader to destroy.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER @p code_object_reader
+ * is invalid.
+ */
+hsa_status_t HSA_API hsa_code_object_reader_destroy(
+    hsa_code_object_reader_t code_object_reader);
+
+/**
+ * @brief Struct containing an opaque handle to an executable, which contains
+ * ISA for finalized kernels and indirect functions together with the allocated
+ * global or readonly segment variables they reference.
+ */
+typedef struct hsa_executable_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_executable_t;
+
+/**
+ * @brief Executable state.
+ */
+typedef enum {
+  /**
+   * Executable state, which allows the user to load code objects and define
+   * external variables. Variable addresses, kernel code handles, and
+   * indirect function code handles are not available in query operations until
+   * the executable is frozen (zero always returned).
+   */
+  HSA_EXECUTABLE_STATE_UNFROZEN = 0,
+  /**
+   * Executable state, which allows the user to query variable addresses,
+   * kernel code handles, and indirect function code handles using query
+   * operations. Loading new code objects, as well as defining external
+   * variables, is not allowed in this state.
+   */
+  HSA_EXECUTABLE_STATE_FROZEN = 1
+} hsa_executable_state_t;
+
+/**
+ * @deprecated Use ::hsa_executable_create_alt instead, which allows the
+ * application to specify the default floating-point rounding mode of the
+ * executable and assumes an unfrozen initial state.
+ *
+ * @brief Create an empty executable.
+ *
+ * @param[in] profile Profile used in the executable.
+ *
+ * @param[in] executable_state Executable state. If the state is
+ * ::HSA_EXECUTABLE_STATE_FROZEN, the resulting executable is useless because no
+ * code objects can be loaded, and no variables can be defined.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @param[out] executable Memory location where the HSA runtime stores the newly
+ * created executable handle.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is invalid, or
+ * @p executable is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_create(
+    hsa_profile_t profile,
+    hsa_executable_state_t executable_state,
+    const char *options,
+    hsa_executable_t *executable);
+
+/**
+ * @brief Create an empty executable.
+ *
+ * @param[in] profile Profile used in the executable.
+ *
+ * @param[in] default_float_rounding_mode Default floating-point rounding mode
+ * used in the executable. Allowed rounding modes are near and zero (default is
+ * not allowed).
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @param[out] executable Memory location where the HSA runtime stores the
+ * newly created executable handle. The initial state of the executable is
+ * ::HSA_EXECUTABLE_STATE_UNFROZEN.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is invalid, or
+ * @p executable is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_create_alt(
+    hsa_profile_t profile,
+    hsa_default_float_rounding_mode_t default_float_rounding_mode,
+    const char *options,
+    hsa_executable_t *executable);
+
+/**
+ * @brief Destroy an executable.
+ *
+ * @details An executable handle becomes invalid after the executable has been
+ * destroyed. Code object handles that were loaded into this executable are
+ * still valid after the executable has been destroyed, and can be used as
+ * intended. Resources allocated outside and associated with this executable
+ * (such as external global or readonly variables) can be released after the
+ * executable has been destroyed.
+ *
+ * Executable should not be destroyed while kernels are in flight.
+ *
+ * @param[in] executable Executable.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ */
+hsa_status_t HSA_API hsa_executable_destroy(
+    hsa_executable_t executable);
+
+/**
+ * @brief Loaded code object handle.
+ */
+typedef struct hsa_loaded_code_object_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_loaded_code_object_t;
+
+/**
+ * @brief Load a program code object into an executable.
+ *
+ * @details A program code object contains information about resources that are
+ * accessible by all kernel agents that run the executable, and can be loaded
+ * at most once into an executable.
+ *
+ * If the program code object uses extensions, the implementation must support
+ * them for this operation to return successfully.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] code_object_reader A code object reader that holds the program
+ * code object to load. If a code object reader is destroyed before all the
+ * associated executables are destroyed, the behavior is undefined.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @param[out] loaded_code_object Pointer to a memory location where the HSA
+ * runtime stores the loaded code object handle. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE The executable is frozen.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER @p code_object_reader
+ * is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS The program code object is
+ * not compatible with the executable or the implementation (for example, the
+ * code object uses an extension that is not supported by the implementation).
+ */
+hsa_status_t HSA_API hsa_executable_load_program_code_object(
+    hsa_executable_t executable,
+    hsa_code_object_reader_t code_object_reader,
+    const char *options,
+    hsa_loaded_code_object_t *loaded_code_object);
+
+/**
+ * @brief Load an agent code object into an executable.
+ *
+ * @details The agent code object contains all defined agent
+ * allocation variables, functions, indirect functions, and kernels in a given
+ * program for a given instruction set architecture.
+ *
+ * Any module linkage declaration must have been defined either by a define
+ * variable or by loading a code object that has a symbol with module linkage
+ * definition.
+ *
+ * The default floating-point rounding mode of the code object associated with
+ * @p code_object_reader must match that of the executable
+ * (::HSA_EXECUTABLE_INFO_DEFAULT_FLOAT_ROUNDING_MODE), or be default (in which
+ * case the value of ::HSA_EXECUTABLE_INFO_DEFAULT_FLOAT_ROUNDING_MODE is used).
+ * If the agent code object uses extensions, the implementation and the agent
+ * must support them for this operation to return successfully.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] agent Agent to load code object for. A code object can be loaded
+ * into an executable at most once for a given agent. The instruction set
+ * architecture of the code object must be supported by the agent.
+ *
+ * @param[in] code_object_reader A code object reader that holds the code object
+ * to load. If a code object reader is destroyed before all the associated
+ * executables are destroyed, the behavior is undefined.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @param[out] loaded_code_object Pointer to a memory location where the HSA
+ * runtime stores the loaded code object handle. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE The executable is frozen.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER @p code_object_reader
+ * is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS The code object read by @p
+ * code_object_reader is not compatible with the agent (for example, the agent
+ * does not support the instruction set architecture of the code object), the
+ * executable (for example, there is a default floating-point mode mismatch
+ * between the two), or the implementation.
+ */
+hsa_status_t HSA_API hsa_executable_load_agent_code_object(
+    hsa_executable_t executable,
+    hsa_agent_t agent,
+    hsa_code_object_reader_t code_object_reader,
+    const char *options,
+    hsa_loaded_code_object_t *loaded_code_object);
+
+/**
+ * @brief Freeze the executable.
+ *
+ * @details No modifications to executable can be made after freezing: no code
+ * objects can be loaded to the executable, and no external variables can be
+ * defined. Freezing the executable does not prevent querying the executable's
+ * attributes. The application must define all the external variables in an
+ * executable before freezing it.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_VARIABLE_UNDEFINED One or more variables are
+ * undefined in the executable.
+ *
+ * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is already frozen.
+ */
+hsa_status_t HSA_API hsa_executable_freeze(
+    hsa_executable_t executable,
+    const char *options);
+
+/**
+ * @brief Executable attributes.
+ */
+typedef enum {
+  /**
+   * Profile this executable is created for. The type of this attribute is
+   * ::hsa_profile_t.
+   */
+  HSA_EXECUTABLE_INFO_PROFILE = 1,
+  /**
+   * Executable state. The type of this attribute is ::hsa_executable_state_t.
+   */
+  HSA_EXECUTABLE_INFO_STATE = 2,
+  /**
+   * Default floating-point rounding mode specified when executable was created.
+   * The type of this attribute is ::hsa_default_float_rounding_mode_t.
+   */
+  HSA_EXECUTABLE_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 3
+} hsa_executable_info_t;
+
+/**
+ * @brief Get the current value of an attribute for a given executable.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * executable attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_get_info(
+    hsa_executable_t executable,
+    hsa_executable_info_t attribute,
+    void *value);
+
+/**
+ * @brief Define an external global variable with program allocation.
+ *
+ * @details This function allows the application to provide the definition
+ * of a variable in the global segment memory with program allocation. The
+ * variable must be defined before loading a code object into an executable.
+ * In addition, code objects loaded must not define the variable.
+ *
+ * @param[in] executable Executable. Must not be in frozen state.
+ *
+ * @param[in] variable_name Name of the variable. The Programmer's Reference
+ * Manual describes the standard name mangling scheme.
+ *
+ * @param[in] address Address where the variable is defined. This address must
+ * be in global memory and can be read and written by any agent in the
+ * system. The application cannot deallocate the buffer pointed to by @p address
+ * before @p executable is destroyed.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is
+ * already defined.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the
+ * @p variable_name.
+ *
+ * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_global_variable_define(
+    hsa_executable_t executable,
+    const char *variable_name,
+    void *address);
+
+/**
+ * @brief Define an external global variable with agent allocation.
+ *
+ * @details This function allows the application to provide the definition
+ * of a variable in the global segment memory with agent allocation. The
+ * variable must be defined before loading a code object into an executable.
+ * In addition, code objects loaded must not define the variable.
+ *
+ * @param[in] executable Executable. Must not be in frozen state.
+ *
+ * @param[in] agent Agent for which the variable is being defined.
+ *
+ * @param[in] variable_name Name of the variable. The Programmer's Reference
+ * Manual describes the standard name mangling scheme.
+ *
+ * @param[in] address Address where the variable is defined. This address must
+ * have been previously allocated using ::hsa_memory_allocate in a global region
+ * that is only visible to @p agent. The application cannot deallocate the
+ * buffer pointed to by @p address before @p executable is destroyed.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT @p agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is
+ * already defined.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the
+ * @p variable_name.
+ *
+ * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_agent_global_variable_define(
+    hsa_executable_t executable,
+    hsa_agent_t agent,
+    const char *variable_name,
+    void *address);
+
+/**
+ * @brief Define an external readonly variable.
+ *
+ * @details This function allows the application to provide the definition
+ * of a variable in the readonly segment memory. The variable must be defined
+ * before loading a code object into an executable. In addition, code objects
+ * loaded must not define the variable.
+ *
+ * @param[in] executable Executable. Must not be in frozen state.
+ *
+ * @param[in] agent Agent for which the variable is being defined.
+ *
+ * @param[in] variable_name Name of the variable. The Programmer's Reference
+ * Manual describes the standard name mangling scheme.
+ *
+ * @param[in] address Address where the variable is defined. This address must
+ * have been previously allocated using ::hsa_memory_allocate in a readonly
+ * region associated with @p agent. The application cannot deallocate the buffer
+ * pointed to by @p address before @p executable is destroyed.
+ *
+ * @note The buffer pointed to by @p address is owned by the application, and
+ * the application must not deallocate it before @p executable has been
+ * destroyed.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE Executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT @p agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is
+ * already defined.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the
+ * @p variable_name.
+ *
+ * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_readonly_variable_define(
+    hsa_executable_t executable,
+    hsa_agent_t agent,
+    const char *variable_name,
+    void *address);
+
+/**
+ * @brief Validate an executable. Checks that all code objects have matching
+ * machine model, profile, and default floating-point rounding mode. Checks that
+ * all declarations have definitions. Checks declaration-definition
+ * compatibility (see the HSA Programmer's Reference Manual for compatibility
+ * rules). Invoking this function is equivalent to invoking
+ * ::hsa_executable_validate_alt with no options.
+ *
+ * @param[in] executable Executable. Must be in frozen state.
+ *
+ * @param[out] result Memory location where the HSA runtime stores the
+ * validation result. If the executable passes validation, the result is 0.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE @p executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_validate(
+    hsa_executable_t executable,
+    uint32_t *result);
+
+/**
+ * @brief Validate an executable. Checks that all code objects have matching
+ * machine model, profile, and default floating-point rounding mode. Checks that
+ * all declarations have definitions. Checks declaration-definition
+ * compatibility (see the HSA Programmer's Reference Manual for compatibility
+ * rules).
+ *
+ * @param[in] executable Executable. Must be in frozen state.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @param[out] result Memory location where the HSA runtime stores the
+ * validation result. If the executable passes validation, the result is 0.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE @p executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_validate_alt(
+    hsa_executable_t executable,
+    const char *options,
+    uint32_t *result);
+
+/**
+ * @brief Executable symbol handle.
+ *
+ * The lifetime of an executable object symbol matches that of the executable
+ * associated with it. An operation on a symbol whose associated executable has
+ * been destroyed results in undefined behavior.
+ */
+typedef struct hsa_executable_symbol_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_executable_symbol_t;
+
+/**
+ * @deprecated Use ::hsa_executable_get_symbol_by_name instead.
+ *
+ * @brief Get the symbol handle for a given symbol name.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] module_name Module name. Must be NULL if the symbol has
+ * program linkage.
+ *
+ * @param[in] symbol_name Symbol name.
+ *
+ * @param[in] agent Agent associated with the symbol. If the symbol is
+ * independent of any agent (for example, a variable with program
+ * allocation), this argument is ignored.
+ *
+ * @param[in] call_convention Call convention associated with the symbol. If the
+ * symbol does not correspond to an indirect function, this argument is ignored.
+ *
+ * @param[out] symbol Memory location where the HSA runtime stores the symbol
+ * handle.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name
+ * that matches @p symbol_name.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or
+ * @p symbol is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_get_symbol(
+    hsa_executable_t executable,
+    const char *module_name,
+    const char *symbol_name,
+    hsa_agent_t agent,
+    int32_t call_convention,
+    hsa_executable_symbol_t *symbol);
+
+/**
+ * @brief Retrieve the symbol handle corresponding to a given symbol name.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] symbol_name Symbol name. Must be a NUL-terminated character
+ * array. The Programmer's Reference Manual describes the standard name mangling
+ * scheme.
+ *
+ * @param[in] agent Pointer to the agent for which the symbol with the given
+ * name is defined. If the symbol corresponding to the given name has program
+ * allocation, @p agent must be NULL.
+ *
+ * @param[out] symbol Memory location where the HSA runtime stores the symbol
+ * handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name
+ * that matches @p symbol_name.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or @p
+ * symbol is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_get_symbol_by_name(
+    hsa_executable_t executable,
+    const char *symbol_name,
+    const hsa_agent_t *agent,
+    hsa_executable_symbol_t *symbol);
+
+/**
+ * @brief Symbol type.
+ */
+typedef enum {
+  /**
+   * Variable.
+   */
+  HSA_SYMBOL_KIND_VARIABLE = 0,
+  /**
+   * Kernel.
+   */
+  HSA_SYMBOL_KIND_KERNEL = 1,
+  /**
+   * Indirect function.
+   */
+  HSA_SYMBOL_KIND_INDIRECT_FUNCTION = 2
+} hsa_symbol_kind_t;
+
+/**
+ * @brief Linkage type of a symbol.
+ */
+typedef enum {
+  /**
+   * Module linkage.
+   */
+  HSA_SYMBOL_LINKAGE_MODULE = 0,
+  /**
+   * Program linkage.
+   */
+  HSA_SYMBOL_LINKAGE_PROGRAM = 1
+} hsa_symbol_linkage_t;
+
+/**
+ * @brief Allocation type of a variable.
+ */
+typedef enum {
+  /**
+   * Agent allocation.
+   */
+  HSA_VARIABLE_ALLOCATION_AGENT = 0,
+  /**
+   * Program allocation.
+   */
+  HSA_VARIABLE_ALLOCATION_PROGRAM = 1
+} hsa_variable_allocation_t;
+
+/**
+ * @brief Memory segment associated with a variable.
+ */
+typedef enum {
+  /**
+   * Global memory segment.
+   */
+  HSA_VARIABLE_SEGMENT_GLOBAL = 0,
+  /**
+   * Readonly memory segment.
+   */
+  HSA_VARIABLE_SEGMENT_READONLY = 1
+} hsa_variable_segment_t;
+
+/**
+ * @brief Executable symbol attributes.
+ */
+typedef enum {
+  /**
+   * The kind of the symbol. The type of this attribute is ::hsa_symbol_kind_t.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_TYPE = 0,
+  /**
+   * The length of the symbol name in bytes, not including the NUL terminator.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH = 1,
+  /**
+   * The name of the symbol. The type of this attribute is character array with
+   * the length equal to the value of ::HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH
+   * attribute.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_NAME = 2,
+  /**
+   * @deprecated
+   *
+   * The length of the module name in bytes (not including the NUL terminator)
+   * to which this symbol belongs if this symbol has module linkage, otherwise 0
+   * is returned. The type of this attribute is uint32_t.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH = 3,
+  /**
+   * @deprecated
+   *
+   * The module name to which this symbol belongs if this symbol has module
+   * linkage, otherwise an empty string is returned. The type of this attribute
+   * is character array with the length equal to the value of
+   * ::HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH attribute.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME = 4,
+  /**
+   * @deprecated
+   *
+   * Agent associated with this symbol. If the symbol is a variable, the
+   * value of this attribute is only defined if
+   * ::HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION is
+   * ::HSA_VARIABLE_ALLOCATION_AGENT. The type of this attribute is hsa_agent_t.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_AGENT = 20,
+  /**
+   * The address of the variable. The value of this attribute is undefined if
+   * the symbol is not a variable. The type of this attribute is uint64_t.
+   *
+   * If executable's state is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0 is
+   * returned.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS = 21,
+  /**
+   * The linkage kind of the symbol. The type of this attribute is
+   * ::hsa_symbol_linkage_t.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_LINKAGE = 5,
+  /**
+   * Indicates whether the symbol corresponds to a definition. The type of this
+   * attribute is bool.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_IS_DEFINITION = 17,
+  /**
+   * @deprecated
+   *
+   * The allocation kind of the variable. The value of this attribute is
+   * undefined if the symbol is not a variable.  The type of this attribute is
+   * ::hsa_variable_allocation_t.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION = 6,
+  /**
+   * @deprecated
+   *
+   * The segment kind of the variable. The value of this attribute is undefined
+   * if the symbol is not a variable. The type of this attribute is
+   * ::hsa_variable_segment_t.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SEGMENT = 7,
+  /**
+   * @deprecated
+   *
+   * Alignment of the symbol in memory. The value of this attribute is undefined
+   * if the symbol is not a variable. The type of this attribute is uint32_t.
+   *
+   * The current alignment of the variable in memory may be greater than the
+   * value specified in the source program variable declaration.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALIGNMENT = 8,
+  /**
+   * @deprecated
+   *
+   * Size of the variable. The value of this attribute is undefined if
+   * the symbol is not a variable. The type of this attribute is uint32_t.
+   *
+   * A value of 0 is returned if the variable is an external variable and has an
+   * unknown dimension.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE = 9,
+  /**
+   * @deprecated
+   *
+   * Indicates whether the variable is constant. The value of this attribute is
+   * undefined if the symbol is not a variable. The type of this attribute is
+   * bool.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_IS_CONST = 10,
+  /**
+   * Kernel object handle, used in the kernel dispatch packet. The value of this
+   * attribute is undefined if the symbol is not a kernel. The type of this
+   * attribute is uint64_t.
+   *
+   * If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0
+   * is returned.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT = 22,
+  /**
+   * Size of kernarg segment memory that is required to hold the values of the
+   * kernel arguments, in bytes. Must be a multiple of 16. The value of this
+   * attribute is undefined if the symbol is not a kernel. The type of this
+   * attribute is uint32_t.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11,
+  /**
+   * Alignment (in bytes) of the buffer used to pass arguments to the kernel,
+   * which is the maximum of 16 and the maximum alignment of any of the kernel
+   * arguments. The value of this attribute is undefined if the symbol is not a
+   * kernel. The type of this attribute is uint32_t.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT = 12,
+  /**
+   * Size of static group segment memory required by the kernel (per
+   * work-group), in bytes. The value of this attribute is undefined
+   * if the symbol is not a kernel. The type of this attribute is uint32_t.
+   *
+   * The reported amount does not include any dynamically allocated group
+   * segment memory that may be requested by the application when a kernel is
+   * dispatched.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13,
+  /**
+   * Size of static private, spill, and arg segment memory required by
+   * this kernel (per work-item), in bytes. The value of this attribute is
+   * undefined if the symbol is not a kernel. The type of this attribute is
+   * uint32_t.
+   *
+   * If the value of ::HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK is
+   * true, the kernel may use more private memory than the reported value, and
+   * the application must add the dynamic call stack usage to @a
+   * private_segment_size when populating a kernel dispatch packet.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14,
+  /**
+   * Dynamic callstack flag. The value of this attribute is undefined if the
+   * symbol is not a kernel. The type of this attribute is bool.
+   *
+   * If this flag is set (the value is true), the kernel uses a dynamically
+   * sized call stack. This can happen if recursive calls, calls to indirect
+   * functions, or the HSAIL alloca instruction are present in the kernel.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15,
+  /**
+   * @deprecated
+   *
+   * Call convention of the kernel. The value of this attribute is undefined if
+   * the symbol is not a kernel. The type of this attribute is uint32_t.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_CALL_CONVENTION = 18,
+  /**
+   * Indirect function object handle. The value of this attribute is undefined
+   * if the symbol is not an indirect function, or the associated agent does
+   * not support the Full Profile. The type of this attribute depends on the
+   * machine model: the type is uint32_t for small machine model, and uint64_t
+   * for large model.
+   *
+   * If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0
+   * is returned.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_OBJECT = 23,
+  /**
+   * @deprecated
+   *
+   * Call convention of the indirect function. The value of this attribute is
+   * undefined if the symbol is not an indirect function, or the associated
+   * agent does not support the Full Profile. The type of this attribute is
+   * uint32_t.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16
+} hsa_executable_symbol_info_t;
+
+/**
+ * @brief Get the current value of an attribute for a given executable symbol.
+ *
+ * @param[in] executable_symbol Executable symbol.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE_SYMBOL The executable symbol is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * executable symbol attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_symbol_get_info(
+    hsa_executable_symbol_t executable_symbol,
+    hsa_executable_symbol_info_t attribute,
+    void *value);
+
+/**
+ * @deprecated
+ *
+ * @brief Iterate over the symbols in an executable, and invoke an
+ * application-defined callback on every iteration.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] callback Callback to be invoked once per executable symbol. The
+ * HSA runtime passes three arguments to the callback: the executable, a symbol,
+ * and the application data.  If @p callback returns a status other than
+ * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
+ * ::hsa_executable_iterate_symbols returns that status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_iterate_symbols(
+    hsa_executable_t executable,
+    hsa_status_t (*callback)(hsa_executable_t exec,
+                             hsa_executable_symbol_t symbol,
+                             void *data),
+    void *data);
+
+/**
+ * @brief Iterate over the kernels, indirect functions, and agent allocation
+ * variables in an executable for a given agent, and invoke an application-
+ * defined callback on every iteration.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] agent Agent.
+ *
+ * @param[in] callback Callback to be invoked once per executable symbol. The
+ * HSA runtime passes four arguments to the callback: the executable, the agent,
+ * a symbol, and the application data.  If @p callback returns a status other
+ * than ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops
+ * and ::hsa_executable_iterate_agent_symbols returns that status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_iterate_agent_symbols(
+    hsa_executable_t executable,
+    hsa_agent_t agent,
+    hsa_status_t (*callback)(hsa_executable_t exec,
+                             hsa_agent_t agent,
+                             hsa_executable_symbol_t symbol,
+                             void *data),
+    void *data);
+
+/**
+ * @brief Iterate over the program allocation variables in an executable, and
+ * invoke an application-defined callback on every iteration.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] callback Callback to be invoked once per executable symbol. The
+ * HSA runtime passes three arguments to the callback: the executable, a symbol,
+ * and the application data.  If @p callback returns a status other than
+ * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
+ * ::hsa_executable_iterate_program_symbols returns that status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_iterate_program_symbols(
+    hsa_executable_t executable,
+    hsa_status_t (*callback)(hsa_executable_t exec,
+                             hsa_executable_symbol_t symbol,
+                             void *data),
+    void *data);
+
+/** @} */
+
+
+/** \defgroup code-object Code Objects (deprecated).
+ *  @{
+ */
+
+/**
+ * @deprecated
+ *
+ * @brief Struct containing an opaque handle to a code object, which contains
+ * ISA for finalized kernels and indirect functions together with information
+ * about the global or readonly segment variables they reference.
+ */
+typedef struct hsa_code_object_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_code_object_t;
+
+/**
+ * @deprecated
+ *
+ * @brief Application data handle that is passed to the serialization
+ * and deserialization functions.
+ */
+typedef struct hsa_callback_data_s {
+  /**
+   * Opaque handle.
+   */
+  uint64_t handle;
+} hsa_callback_data_t;
+
+/**
+ * @deprecated
+ *
+ * @brief Serialize a code object. Can be used for offline finalization,
+ * install-time finalization, disk code caching, etc.
+ *
+ * @param[in] code_object Code object.
+ *
+ * @param[in] alloc_callback Callback function for memory allocation. Must not
+ * be NULL. The HSA runtime passes three arguments to the callback: the
+ * allocation size, the application data, and a pointer to a memory location
+ * where the application stores the allocation result. The HSA runtime invokes
+ * @p alloc_callback once to allocate a buffer that contains the serialized
+ * version of @p code_object.  If the callback returns a status code other than
+ * ::HSA_STATUS_SUCCESS, this function returns the same code.
+ *
+ * @param[in] callback_data Application data that is passed to @p
+ * alloc_callback. May be NULL.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @param[out] serialized_code_object Memory location where the HSA runtime
+ * stores a pointer to the serialized code object. Must not be NULL.
+ *
+ * @param[out] serialized_code_object_size Memory location where the HSA runtime
+ * stores the size (in bytes) of @p serialized_code_object. The returned value
+ * matches the allocation size passed by the HSA runtime to @p
+ * alloc_callback. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p alloc_callback, @p
+ * serialized_code_object, or @p serialized_code_object_size are NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_serialize(
+    hsa_code_object_t code_object,
+    hsa_status_t (*alloc_callback)(size_t size,
+                                   hsa_callback_data_t data,
+                                   void **address),
+    hsa_callback_data_t callback_data,
+    const char *options,
+    void **serialized_code_object,
+    size_t *serialized_code_object_size);
+
+/**
+ * @deprecated
+ *
+ * @brief Deserialize a code object.
+ *
+ * @param[in] serialized_code_object A serialized code object. Must not be NULL.
+ *
+ * @param[in] serialized_code_object_size The size (in bytes) of @p
+ * serialized_code_object. Must not be 0.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @param[out] code_object Memory location where the HSA runtime stores the
+ * deserialized code object.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p serialized_code_object, or @p
+ * code_object are NULL, or @p serialized_code_object_size is 0.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_deserialize(
+    void *serialized_code_object,
+    size_t serialized_code_object_size,
+    const char *options,
+    hsa_code_object_t *code_object);
+
+/**
+ * @deprecated
+ *
+ * @brief Destroy a code object.
+ *
+ * @details The lifetime of a code object must exceed that of any executable
+ * where it has been loaded. If an executable that loaded @p code_object has not
+ * been destroyed, the behavior is undefined.
+ *
+ * @param[in] code_object Code object. The handle becomes invalid after it has
+ * been destroyed.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_destroy(
+    hsa_code_object_t code_object);
+
+/**
+ * @deprecated
+ *
+ * @brief Code object type.
+ */
+typedef enum {
+  /**
+   * Produces code object that contains ISA for all kernels and indirect
+   * functions in HSA source.
+   */
+  HSA_CODE_OBJECT_TYPE_PROGRAM = 0
+} hsa_code_object_type_t;
+
+/**
+ * @deprecated
+ *
+ * @brief Code object attributes.
+ */
+typedef enum {
+  /**
+   * The version of the code object. The type of this attribute is a
+   * NUL-terminated char[64]. The version string must be at most 63 characters
+   * long (not including the NUL terminator) and all array elements not used for
+   * the version string must be NUL.
+   */
+  HSA_CODE_OBJECT_INFO_VERSION = 0,
+  /**
+   * Type of code object. The type of this attribute is
+   * ::hsa_code_object_type_t.
+   */
+  HSA_CODE_OBJECT_INFO_TYPE = 1,
+  /**
+   * Instruction set architecture this code object is produced for. The type of
+   * this attribute is ::hsa_isa_t.
+   */
+  HSA_CODE_OBJECT_INFO_ISA = 2,
+  /**
+   * Machine model this code object is produced for. The type of this attribute
+   * is ::hsa_machine_model_t.
+   */
+  HSA_CODE_OBJECT_INFO_MACHINE_MODEL = 3,
+  /**
+   * Profile this code object is produced for. The type of this attribute is
+   * ::hsa_profile_t.
+   */
+  HSA_CODE_OBJECT_INFO_PROFILE = 4,
+  /**
+   * Default floating-point rounding mode used when the code object is
+   * produced. The type of this attribute is
+   * ::hsa_default_float_rounding_mode_t.
+   */
+  HSA_CODE_OBJECT_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 5
+} hsa_code_object_info_t;
+
+/**
+ * @deprecated
+ *
+ * @brief Get the current value of an attribute for a given code object.
+ *
+ * @param[in] code_object Code object.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * code object attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_get_info(
+    hsa_code_object_t code_object,
+    hsa_code_object_info_t attribute,
+    void *value);
+
+/**
+ * @deprecated
+ *
+ * @brief Load code object into the executable.
+ *
+ * @details Every global or readonly variable that is external must be defined
+ * before loading the code object. An internal global or readonly variable is
+ * allocated once the code object, that is being loaded, references this
+ * variable and this variable is not allocated.
+ *
+ * Any module linkage declaration must have been defined either by a define
+ * variable or by loading a code object that has a symbol with module linkage
+ * definition.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] agent Agent to load code object for. The agent must support the
+ * default floating-point rounding mode used by @p code_object.
+ *
+ * @param[in] code_object Code object to load.  The lifetime of the code object
+ * must exceed that of the executable: if @p code_object is destroyed before @p
+ * executable, the behavior is undefined.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS @p agent is not compatible
+ * with @p code_object (for example, @p agent does not support the default
+ * floating-point rounding mode specified by @p code_object), or @p code_object
+ * is not compatible with @p executable (for example, @p code_object and @p
+ * executable have different machine models or profiles).
+ *
+ * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_load_code_object(
+    hsa_executable_t executable,
+    hsa_agent_t agent,
+    hsa_code_object_t code_object,
+    const char *options);
+
+/**
+ * @deprecated
+ *
+ * @brief Code object symbol handle.
+ *
+ * The lifetime of a code object symbol matches that of the code object
+ * associated with it. An operation on a symbol whose associated code object has
+ * been destroyed results in undefined behavior.
+ */
+typedef struct hsa_code_symbol_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_code_symbol_t;
+
+/**
+ * @deprecated
+ *
+ * @brief Get the symbol handle within a code object for a given symbol name.
+ *
+ * @param[in] code_object Code object.
+ *
+ * @param[in] symbol_name Symbol name.
+ *
+ * @param[out] symbol Memory location where the HSA runtime stores the symbol
+ * handle.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name
+ * that matches @p symbol_name.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or
+ * @p symbol is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_get_symbol(
+    hsa_code_object_t code_object,
+    const char *symbol_name,
+    hsa_code_symbol_t *symbol);
+
+/**
+ * @deprecated
+ *
+ * @brief Get the symbol handle within a code object for a given symbol name.
+ *
+ * @param[in] code_object Code object.
+ *
+ * @param[in] module_name Module name. Must be NULL if the symbol has
+ * program linkage.
+ *
+ * @param[in] symbol_name Symbol name.
+ *
+ * @param[out] symbol Memory location where the HSA runtime stores the symbol
+ * handle.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name
+ * that matches @p symbol_name.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or
+ * @p symbol is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_get_symbol_from_name(
+    hsa_code_object_t code_object,
+    const char *module_name,
+    const char *symbol_name,
+    hsa_code_symbol_t *symbol);
+
+/**
+ * @deprecated
+ *
+ * @brief Code object symbol attributes.
+ */
+typedef enum {
+  /**
+   * The type of the symbol. The type of this attribute is ::hsa_symbol_kind_t.
+   */
+  HSA_CODE_SYMBOL_INFO_TYPE = 0,
+  /**
+   * The length of the symbol name in bytes, not including the NUL terminator.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_CODE_SYMBOL_INFO_NAME_LENGTH = 1,
+  /**
+   * The name of the symbol. The type of this attribute is character array with
+   * the length equal to the value of ::HSA_CODE_SYMBOL_INFO_NAME_LENGTH
+   * attribute.
+   */
+  HSA_CODE_SYMBOL_INFO_NAME = 2,
+  /**
+   * The length of the module name in bytes (not including the NUL terminator)
+   * to which this symbol belongs if this symbol has module linkage, otherwise 0
+   * is returned. The type of this attribute is uint32_t.
+   */
+  HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH = 3,
+  /**
+   * The module name to which this symbol belongs if this symbol has module
+   * linkage, otherwise an empty string is returned. The type of this attribute
+   * is character array with the length equal to the value of
+   * ::HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH attribute.
+   */
+  HSA_CODE_SYMBOL_INFO_MODULE_NAME = 4,
+  /**
+   * The linkage kind of the symbol. The type of this attribute is
+   * ::hsa_symbol_linkage_t.
+   */
+  HSA_CODE_SYMBOL_INFO_LINKAGE = 5,
+  /**
+   * Indicates whether the symbol corresponds to a definition. The type of this
+   * attribute is bool.
+   */
+  HSA_CODE_SYMBOL_INFO_IS_DEFINITION = 17,
+  /**
+   * The allocation kind of the variable. The value of this attribute is
+   * undefined if the symbol is not a variable. The type of this attribute is
+   * ::hsa_variable_allocation_t.
+   */
+  HSA_CODE_SYMBOL_INFO_VARIABLE_ALLOCATION = 6,
+  /**
+   * The segment kind of the variable. The value of this attribute is
+   * undefined if the symbol is not a variable. The type of this attribute is
+   * ::hsa_variable_segment_t.
+   */
+  HSA_CODE_SYMBOL_INFO_VARIABLE_SEGMENT = 7,
+  /**
+   * Alignment of the symbol in memory. The value of this attribute is undefined
+   * if the symbol is not a variable. The type of this attribute is uint32_t.
+   *
+   * The current alignment of the variable in memory may be greater than the
+   * value specified in the source program variable declaration.
+   */
+  HSA_CODE_SYMBOL_INFO_VARIABLE_ALIGNMENT = 8,
+  /**
+   * Size of the variable. The value of this attribute is undefined if the
+   * symbol is not a variable. The type of this attribute is uint32_t.
+   *
+   * A size of 0 is returned if the variable is an external variable and has an
+   * unknown dimension.
+   */
+  HSA_CODE_SYMBOL_INFO_VARIABLE_SIZE = 9,
+  /**
+   * Indicates whether the variable is constant. The value of this attribute is
+   * undefined if the symbol is not a variable. The type of this attribute is
+   * bool.
+   */
+  HSA_CODE_SYMBOL_INFO_VARIABLE_IS_CONST = 10,
+  /**
+   * Size of kernarg segment memory that is required to hold the values of the
+   * kernel arguments, in bytes. Must be a multiple of 16. The value of this
+   * attribute is undefined if the symbol is not a kernel. The type of this
+   * attribute is uint32_t.
+   */
+  HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11,
+  /**
+   * Alignment (in bytes) of the buffer used to pass arguments to the kernel,
+   * which is the maximum of 16 and the maximum alignment of any of the kernel
+   * arguments. The value of this attribute is undefined if the symbol is not a
+   * kernel. The type of this attribute is uint32_t.
+   */
+  HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT = 12,
+  /**
+   * Size of static group segment memory required by the kernel (per
+   * work-group), in bytes. The value of this attribute is undefined
+   * if the symbol is not a kernel. The type of this attribute is uint32_t.
+   *
+   * The reported amount does not include any dynamically allocated group
+   * segment memory that may be requested by the application when a kernel is
+   * dispatched.
+   */
+  HSA_CODE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13,
+  /**
+   * Size of static private, spill, and arg segment memory required by
+   * this kernel (per work-item), in bytes. The value of this attribute is
+   * undefined if the symbol is not a kernel. The type of this attribute is
+   * uint32_t.
+   *
+   * If the value of ::HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK is true,
+   * the kernel may use more private memory than the reported value, and the
+   * application must add the dynamic call stack usage to @a
+   * private_segment_size when populating a kernel dispatch packet.
+   */
+  HSA_CODE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14,
+  /**
+   * Dynamic callstack flag. The value of this attribute is undefined if the
+   * symbol is not a kernel. The type of this attribute is bool.
+   *
+   * If this flag is set (the value is true), the kernel uses a dynamically
+   * sized call stack. This can happen if recursive calls, calls to indirect
+   * functions, or the HSAIL alloca instruction are present in the kernel.
+   */
+  HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15,
+  /**
+   * Call convention of the kernel. The value of this attribute is undefined if
+   * the symbol is not a kernel. The type of this attribute is uint32_t.
+   */
+  HSA_CODE_SYMBOL_INFO_KERNEL_CALL_CONVENTION = 18,
+  /**
+   * Call convention of the indirect function. The value of this attribute is
+   * undefined if the symbol is not an indirect function. The type of this
+   * attribute is uint32_t.
+   */
+  HSA_CODE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16,
+  /**
+   * Wavefront size used by the kernel. The value of this attribute is either
+   * 32 or 64. The type of this attribute is uint32_t.
+   */
+  HSA_CODE_SYMBOL_INFO_KERNEL_WAVEFRONT_SIZE = 19
+} hsa_code_symbol_info_t;
+
+/**
+ * @deprecated
+ *
+ * @brief Get the current value of an attribute for a given code symbol.
+ *
+ * @param[in] code_symbol Code symbol.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_SYMBOL The code symbol is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * code symbol attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_symbol_get_info(
+    hsa_code_symbol_t code_symbol,
+    hsa_code_symbol_info_t attribute,
+    void *value);
+
+/**
+ * @deprecated
+ *
+ * @brief Iterate over the symbols in a code object, and invoke an
+ * application-defined callback on every iteration.
+ *
+ * @param[in] code_object Code object.
+ *
+ * @param[in] callback Callback to be invoked once per code object symbol. The
+ * HSA runtime passes three arguments to the callback: the code object, a
+ * symbol, and the application data.  If @p callback returns a status other than
+ * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
+ * ::hsa_code_object_iterate_symbols returns that status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_iterate_symbols(
+    hsa_code_object_t code_object,
+    hsa_status_t (*callback)(hsa_code_object_t code_object,
+                             hsa_code_symbol_t symbol,
+                             void *data),
+    void *data);
+
+/** @} */
+
+#ifdef __cplusplus
+}  // end extern "C" block
+#endif
+
+#endif  // header guard
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hsa/hsa_ext_amd.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hsa/hsa_ext_amd.h
new file mode 100644
index 0000000000000000000000000000000000000000..3fd1f9348ec24dd88c41b32f2e05f38be7e9dacf
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hsa/hsa_ext_amd.h
@@ -0,0 +1,3782 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// HSA AMD extension.
+
+#ifndef HSA_RUNTIME_EXT_AMD_H_
+#define HSA_RUNTIME_EXT_AMD_H_
+
+#include "hsa.h"
+#include "hsa_ext_image.h"
+#include "hsa_ven_amd_pc_sampling.h"
+
+/**
+ * - 1.0 - initial version
+ * - 1.1 - dmabuf export
+ * - 1.2 - hsa_amd_memory_async_copy_on_engine
+ * - 1.3 - HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED pool
+ * - 1.4 - Virtual Memory API
+ * - 1.5 - hsa_amd_agent_info: HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES
+ * - 1.6 - Virtual Memory API: hsa_amd_vmem_address_reserve_align
+ * - 1.7 - hsa_amd_signal_wait_all
+ * - 1.8 - hsa_amd_memory_get_preferred_copy_engine
+ * - 1.9 - hsa_amd_portable_export_dmabuf_v2
+ * - 1.10 - hsa_amd_vmem_address_reserve: HSA_AMD_VMEM_ADDRESS_NO_REGISTER
+ * - 1.11 - hsa_amd_agent_info_t: HSA_AMD_AGENT_INFO_CLOCK_COUNTERS
+ * - 1.12 - hsa_amd_pointer_info: HSA_EXT_POINTER_TYPE_HSA_VMEM and HSA_EXT_POINTER_TYPE_RESERVED_ADDR
+ * - 1.13 - hsa_amd_pointer_info: Added new registered field to hsa_amd_pointer_info_t
+ * - 1.14 - hsa_amd_ais_file_write, hsa_amd_ais_file_read
+ */
+#define HSA_AMD_INTERFACE_VERSION_MAJOR 1
+#define HSA_AMD_INTERFACE_VERSION_MINOR 14
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** \addtogroup aql Architected Queuing Language
+ *  @{
+ */
+
+/**
+ * @brief Set flag number @p bit in a bit field stored as a uint8_t[8] array.
+ */
+static inline void hsa_flag_set64(uint8_t* value, uint32_t bit) {
+  /* bit / 8 selects the byte; bit % 8 selects the position inside that byte. */
+  uint8_t* slot = &value[bit / 8];
+  *slot |= (1 << (bit % 8));
+}
+
+/**
+ * @brief Test whether flag number @p bit is set in a uint8_t[8] bit field.
+ */
+static inline bool hsa_flag_isset64(uint8_t* value, uint32_t bit) {
+  /* Mask selecting the flag inside the byte located at offset bit / 8. */
+  const int mask = 1 << (bit % 8);
+  return (value[bit / 8] & mask) != 0;
+}
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_signal_condition_t constants.
+ */
+typedef uint32_t hsa_signal_condition32_t;
+
+/**
+ * @brief AMD vendor specific packet type.
+ */
+typedef enum {
+  /**
+   * Packet used by agents to delay processing of subsequent packets until a
+   * configurable condition is satisfied by an HSA signal.  Only kernel dispatch
+   * queues created from AMD GPU Agents support this packet.
+   */
+  HSA_AMD_PACKET_TYPE_BARRIER_VALUE = 2,
+  /**
+   * Packet used to send commands to an AIE agent's embedded runtime (ERT). The
+   * ERT is responsible for, among other things, handling dispatches. Only
+   * queues created on AIE agents support this packet.
+   */
+  HSA_AMD_PACKET_TYPE_AIE_ERT = 3
+} hsa_amd_packet_type_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_amd_packet_type_t constants.
+ */
+typedef uint8_t hsa_amd_packet_type8_t;
+
+/**
+ * @brief AMD vendor specific AQL packet header
+ */
+typedef struct hsa_amd_packet_header_s {
+  /**
+   * Packet header. Used to configure multiple packet parameters such as the
+   * packet type. The parameters are described by ::hsa_packet_header_t.
+   */
+  uint16_t header;
+
+  /**
+   * Format of the vendor specific packet.
+   */
+  hsa_amd_packet_type8_t AmdFormat;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint8_t reserved;
+} hsa_amd_vendor_packet_header_t;
+
+/**
+ * @brief AMD barrier value packet.  Halts packet processing and waits for
+ * (signal_value & ::mask) ::cond ::value to be satisfied, where signal_value
+ * is the value of the signal ::signal.
+ */
+typedef struct hsa_amd_barrier_value_packet_s {
+  /**
+   * AMD vendor specific packet header.
+   */
+  hsa_amd_vendor_packet_header_t header;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved0;
+
+  /**
+   * Dependent signal object. A signal with a handle value of 0 is
+   * allowed and is interpreted by the packet processor as a satisfied
+   * dependency.
+   */
+  hsa_signal_t signal;
+
+  /**
+   * Value to compare against.
+   */
+  hsa_signal_value_t value;
+
+  /**
+   * Bit mask to be combined by bitwise AND with ::signal's value.
+   */
+  hsa_signal_value_t mask;
+
+  /**
+   * Comparison operation.  See ::hsa_signal_condition_t.
+   */
+  hsa_signal_condition32_t cond;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved1;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved2;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved3;
+
+  /**
+   * Signal used to indicate completion of the job. The application can use the
+   * special signal handle 0 to indicate that no signal is used.
+   */
+  hsa_signal_t completion_signal;
+} hsa_amd_barrier_value_packet_t;
+
+/**
+ * State of an AIE ERT command.
+ */
+typedef enum {
+  /**
+   * Set by the host before submitting a command to the scheduler.
+   */
+  HSA_AMD_AIE_ERT_STATE_NEW = 1,
+  /**
+   * Internal scheduler state.
+   */
+  HSA_AMD_AIE_ERT_STATE_QUEUED = 2,
+  /**
+   * Internal scheduler state.
+   */
+  HSA_AMD_AIE_ERT_STATE_RUNNING = 3,
+  /**
+   * Set by the scheduler when a command completes.
+   */
+  HSA_AMD_AIE_ERT_STATE_COMPLETED = 4,
+  /**
+   * Set by the scheduler if a command failed.
+   */
+  HSA_AMD_AIE_ERT_STATE_ERROR = 5,
+  /**
+   * Set by the scheduler if a command aborted.
+   */
+  HSA_AMD_AIE_ERT_STATE_ABORT = 6,
+  /**
+   * Internal scheduler state.
+   */
+  HSA_AMD_AIE_ERT_STATE_SUBMITTED = 7,
+  /**
+   * Set by the scheduler on a timeout and reset.
+   */
+  HSA_AMD_AIE_ERT_STATE_TIMEOUT = 8,
+  /**
+   * Set by the scheduler on a timeout followed by a failure to reset.
+   */
+  HSA_AMD_AIE_ERT_STATE_NORESPONSE = 9,
+  HSA_AMD_AIE_ERT_STATE_SKERROR = 10,   /* NOTE(review): undocumented; "SK" presumably = soft kernel - confirm. */
+  HSA_AMD_AIE_ERT_STATE_SKCRASHED = 11, /* NOTE(review): undocumented; "SK" presumably = soft kernel - confirm. */
+  HSA_AMD_AIE_ERT_STATE_MAX /* Implicitly 12: one past the last defined state. */
+} hsa_amd_aie_ert_state;
+
+/**
+ * Opcode types for HSA AIE ERT commands. Value 1 is not assigned to any opcode.
+ */
+typedef enum {
+  /**
+   * Start a workgroup on a compute unit (CU).
+   */
+  HSA_AMD_AIE_ERT_START_CU = 0,
+  /**
+   * Currently aliased to HSA_AMD_AIE_ERT_START_CU.
+   */
+  HSA_AMD_AIE_ERT_START_KERNEL = 0,
+  /**
+   * Configure command scheduler.
+   */
+  HSA_AMD_AIE_ERT_CONFIGURE = 2,
+  HSA_AMD_AIE_ERT_EXIT = 3,
+  HSA_AMD_AIE_ERT_ABORT = 4,
+  /**
+   * Execute a specified CU after writing.
+   */
+  HSA_AMD_AIE_ERT_EXEC_WRITE = 5,
+  /**
+   * Get stats about a CU's execution.
+   */
+  HSA_AMD_AIE_ERT_CU_STAT = 6,
+  /**
+   * Start KDMA CU or P2P.
+   */
+  HSA_AMD_AIE_ERT_START_COPYBO = 7,
+  /**
+   * Configure a soft kernel.
+   */
+  HSA_AMD_AIE_ERT_SK_CONFIG = 8,
+  /**
+   * Start a soft kernel.
+   */
+  HSA_AMD_AIE_ERT_SK_START = 9,
+  /**
+   * Unconfigure a soft kernel.
+   */
+  HSA_AMD_AIE_ERT_SK_UNCONFIG = 10,
+  /**
+   * Initialize a CU.
+   */
+  HSA_AMD_AIE_ERT_INIT_CU = 11,
+  HSA_AMD_AIE_ERT_START_FA = 12,
+  HSA_AMD_AIE_ERT_CLK_CALIB = 13,
+  HSA_AMD_AIE_ERT_MB_VALIDATE = 14,
+  /**
+   * Same as HSA_AMD_AIE_ERT_START_CU but with a key-value pair.
+   */
+  HSA_AMD_AIE_ERT_START_KEY_VAL = 15,
+  HSA_AMD_AIE_ERT_ACCESS_TEST_C = 16,
+  HSA_AMD_AIE_ERT_ACCESS_TEST = 17,
+  /**
+   * Instruction buffer command format.
+   */
+  HSA_AMD_AIE_ERT_START_DPU = 18,
+  /**
+   * Command chain.
+   */
+  HSA_AMD_AIE_ERT_CMD_CHAIN = 19,
+  /**
+   * Instruction buffer command format on the NPU.
+   */
+  HSA_AMD_AIE_ERT_START_NPU = 20,
+  /**
+   * Instruction buffer command with pre-emption format on the NPU.
+   */
+  HSA_AMD_AIE_ERT_START_NPU_PREEMPT = 21
+} hsa_amd_aie_ert_cmd_opcode_t;
+
+/**
+ * Payload data for AIE ERT start kernel packets (i.e., when the opcode is
+ * HSA_AMD_AIE_ERT_START_KERNEL).
+ */
+typedef struct hsa_amd_aie_ert_start_kernel_data_s {
+  /**
+   * Address of the PDI.
+   */
+  void* pdi_addr;
+  /**
+   * Opcode, instructions and kernel arguments (flexible array member; must stay last).
+   */
+  uint32_t data[];
+} hsa_amd_aie_ert_start_kernel_data_t;
+
+/**
+ * AMD AIE ERT packet. Used for sending a command to an AIE agent.
+ */
+typedef struct hsa_amd_aie_ert_packet_s {
+  /**
+   * AMD vendor-specific packet header.
+   */
+  hsa_amd_vendor_packet_header_t header;
+  /**
+   * Format for packets interpreted by the ERT to understand the command and
+   * payload data. The bit-fields below total 32 bits (4 + 8 + 11 + 5 + 4).
+   */
+  struct {
+    /**
+     * Current state of a command. See ::hsa_amd_aie_ert_state.
+     */
+    uint32_t state : 4;
+    /**
+     * Flexible field that can be interpreted on a per-command basis.
+     */
+    uint32_t custom : 8;
+    /**
+     * Number of DWORDs in the payload data (11-bit field, so at most 2047).
+     */
+    uint32_t count : 11;
+    /**
+     * Opcode identifying the command. See ::hsa_amd_aie_ert_cmd_opcode_t.
+     */
+    uint32_t opcode : 5;
+    /**
+     * Type of a command (currently 0).
+     */
+    uint32_t type : 4;
+  };
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved0;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved1;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved2;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved3;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved4;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved5;
+  /**
+   * Address of the packet data payload. ERT commands contain arbitrarily sized
+   * data payloads.
+   */
+  uint64_t payload_data;
+} hsa_amd_aie_ert_packet_t;
+
+/** @} */
+
+/** \defgroup error-codes Error codes
+ *  @{
+ */
+
+/**
+ * @brief Enumeration constants added to ::hsa_status_t.
+ *
+ * @remark Additions to hsa_status_t
+ */
+enum {
+  /**
+   * The memory pool is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_MEMORY_POOL = 40,
+
+  /**
+   * Agent accessed memory beyond the maximum legal address.
+   */
+  HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION = 41,
+
+  /**
+   * Agent executed an invalid shader instruction.
+   */
+  HSA_STATUS_ERROR_ILLEGAL_INSTRUCTION = 42,
+
+  /**
+   * Agent attempted to access an inaccessible address.
+   * See hsa_amd_register_system_event_handler and
+   * HSA_AMD_GPU_MEMORY_FAULT_EVENT for more information on illegal accesses.
+   */
+  HSA_STATUS_ERROR_MEMORY_FAULT = 43,
+
+  /**
+   * The CU mask was successfully set but the mask attempted to enable a CU
+   * which was disabled for the process.  CUs disabled for the process remain
+   * disabled.
+   */
+  HSA_STATUS_CU_MASK_REDUCED = 44,
+
+  /**
+   * Exceeded the number of VGPRs available on this agent.
+   */
+  HSA_STATUS_ERROR_OUT_OF_REGISTERS = 45,
+
+  /**
+   * Resource is busy or temporarily unavailable.
+   */
+  HSA_STATUS_ERROR_RESOURCE_BUSY = 46,
+
+  /**
+   * Request is not supported by this system.
+   */
+  HSA_STATUS_ERROR_NOT_SUPPORTED = 47,
+};
+
+/** @} */
+
+/** \addtogroup memory Memory
+ *  @{
+ */
+
+/**
+ * @brief IOMMU version supported by an agent (see ::HSA_AMD_AGENT_INFO_IOMMU_SUPPORT).
+ */
+typedef enum {
+  /**
+   * IOMMU not supported.
+   */
+  HSA_IOMMU_SUPPORT_NONE = 0,
+  /* IOMMU V1 support is not relevant to user applications, so it is not reported. */
+  /**
+   * IOMMU V2 supported.
+   */
+  HSA_IOMMU_SUPPORT_V2 = 1,
+} hsa_amd_iommu_version_t;
+
+/**
+ * @brief Structure containing information on the agent's clock counters (value type of ::HSA_AMD_AGENT_INFO_CLOCK_COUNTERS).
+ */
+typedef struct hsa_amd_clock_counters_s {
+  uint64_t gpu_clock_counter;      /* NOTE(review): units/clock domain not documented here - confirm. */
+  uint64_t cpu_clock_counter;      /* NOTE(review): units/clock domain not documented here - confirm. */
+  uint64_t system_clock_counter;   /* NOTE(review): units/clock domain not documented here - confirm. */
+  uint64_t system_clock_frequency; /* NOTE(review): presumably in Hz - confirm. */
+} hsa_amd_clock_counters_t;
+
+/**
+ * @brief Agent attributes.
+ */
+typedef enum hsa_amd_agent_info_s {
+  /**
+   * Chip identifier. The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_CHIP_ID = 0xA000,
+  /**
+   * Size of a cacheline in bytes. The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001,
+  /**
+   * The number of compute units available in the agent. The type of this
+   * attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002,
+  /**
+   * The maximum clock frequency of the agent in MHz. The type of this
+   * attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003,
+  /**
+   * Internal driver node identifier. The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_DRIVER_NODE_ID = 0xA004,
+  /**
+   * Max number of watch points on memory address ranges to generate exception
+   * events when the watched addresses are accessed.  The type of this
+   * attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_MAX_ADDRESS_WATCH_POINTS = 0xA005,
+  /**
+   * Agent BDF_ID, named LocationID in thunk. The type of this attribute is
+   * uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_BDFID = 0xA006,
+  /**
+   * Memory Interface width, the return value type is uint32_t.
+   * This attribute is deprecated.
+   */
+  HSA_AMD_AGENT_INFO_MEMORY_WIDTH = 0xA007,
+  /**
+   * Max Memory Clock, the return value type is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY = 0xA008,
+  /**
+   * Board name of Agent - populated from MarketingName of Kfd Node
+   * The value is an Ascii string of 64 chars.
+   */
+  HSA_AMD_AGENT_INFO_PRODUCT_NAME = 0xA009,
+  /**
+   * Maximum number of waves possible in a Compute Unit.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU = 0xA00A,
+  /**
+   * Number of SIMDs per compute unit (CU).
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU = 0xA00B,
+  /**
+   * Number of Shader Engines (SE) in Gpu
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES = 0xA00C,
+  /**
+   * Number of Shader Arrays Per Shader Engines in Gpu
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE = 0xA00D,
+  /**
+   * Address of the HDP flush registers.  Use of these registers does not conform to the HSA memory
+   * model and should be treated with caution.
+   * The type of this attribute is hsa_amd_hdp_flush_t.
+   */
+  HSA_AMD_AGENT_INFO_HDP_FLUSH = 0xA00E,
+  /**
+   * PCIe domain for the agent.  Pairs with HSA_AMD_AGENT_INFO_BDFID
+   * to give the full physical location of the Agent.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_DOMAIN = 0xA00F,
+  /**
+   * Queries for support of cooperative queues.  See ::HSA_QUEUE_TYPE_COOPERATIVE.
+   * The type of this attribute is bool.
+   */
+  HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES = 0xA010,
+  /**
+   * Queries UUID of an agent. The value is an Ascii string with a maximum
+   * of 21 chars including NUL. The string value consists of two parts: header
+   * and body. The header identifies device type (GPU, CPU, DSP) while body
+   * encodes UUID as a 16 digit hex string
+   *
+   * Agents that do not support UUID will return the string "GPU-XX" or
+   * "CPU-XX" or "DSP-XX" depending upon their device type ::hsa_device_type_t
+   */
+  HSA_AMD_AGENT_INFO_UUID = 0xA011,
+  /**
+   * Queries for the ASIC revision of an agent. The value is an integer that
+   * increments for each revision. This can be used by user-level software to
+   * change how it operates, depending on the hardware version. This allows
+   * selective workarounds for hardware errata.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_ASIC_REVISION = 0xA012,
+  /**
+   * Queries whether or not the host can directly access SVM memory that is
+   * physically resident in the agent's local memory.
+   * The type of this attribute is bool.
+   */
+  HSA_AMD_AGENT_INFO_SVM_DIRECT_HOST_ACCESS = 0xA013,
+  /**
+   * Some processors support more CUs than can reliably be used in a cooperative
+   * dispatch.  This queries the count of CUs which are fully enabled for
+   * cooperative dispatch.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_COOPERATIVE_COMPUTE_UNIT_COUNT = 0xA014,
+  /**
+   * Queries the amount of memory available in bytes across all global pools
+   * owned by the agent.
+   * The type of this attribute is uint64_t.
+   */
+  HSA_AMD_AGENT_INFO_MEMORY_AVAIL = 0xA015,
+  /**
+   * Timestamp value increase rate, in Hz. The timestamp (clock) frequency is
+   * in the range 1-400MHz.
+   * The type of this attribute is uint64_t.
+   */
+  HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY = 0xA016,
+  /**
+   * Queries for the ASIC family ID of an agent.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_ASIC_FAMILY_ID = 0xA107, /* Note: values jump from 0xA016 to 0xA107 here. */
+  /**
+   * Queries for the Packet Processor(CP Firmware) ucode version of an agent.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_UCODE_VERSION = 0xA108,
+  /**
+   * Queries for the SDMA engine ucode of an agent.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_SDMA_UCODE_VERSION = 0xA109,
+  /**
+   * Queries the number of SDMA engines.
+   * If HSA_AMD_AGENT_INFO_NUM_SDMA_XGMI_ENG query returns non-zero,
+   * this query returns the number of SDMA engines optimized for
+   * host to device bidirectional traffic.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_NUM_SDMA_ENG = 0xA10A,
+  /**
+   * Queries the number of additional SDMA engines optimized for D2D xGMI copies.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_NUM_SDMA_XGMI_ENG = 0xA10B,
+  /**
+   * Queries for version of IOMMU supported by agent.
+   * The type of this attribute is hsa_amd_iommu_version_t.
+   */
+  HSA_AMD_AGENT_INFO_IOMMU_SUPPORT = 0xA110,
+  /**
+   * Queries for number of XCCs within the agent.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_NUM_XCC = 0xA111,
+  /**
+   * Queries for driver unique identifier.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_DRIVER_UID = 0xA112,
+  /**
+   * Returns the hsa_agent_t of the nearest CPU agent
+   * The type of this attribute is hsa_agent_t.
+   */
+  HSA_AMD_AGENT_INFO_NEAREST_CPU = 0xA113,
+  /**
+   * Bit-mask indicating memory properties of this agent. A memory property is set if the flag bit
+   * is set at that position. User may use the hsa_flag_isset64 macro to verify whether a flag
+   * is set. The type of this attribute is uint8_t[8].
+   */
+  HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES = 0xA114,
+  /**
+   * Bit-mask indicating AQL Extensions supported by this agent. An AQL extension is set if the flag
+   * bit is set at that position. User may use the hsa_flag_isset64 macro to verify whether a flag
+   * is set. The type of this attribute is uint8_t[8].
+   */
+  HSA_AMD_AGENT_INFO_AQL_EXTENSIONS = 0xA115, /* Not implemented yet */
+  /**
+   * Maximum allowed value in bytes for scratch limit for this agent. This amount
+   * is shared across all queues created on this agent.
+   * The type of this attribute is uint64_t.
+   */
+  HSA_AMD_AGENT_INFO_SCRATCH_LIMIT_MAX = 0xA116,
+  /**
+   * Current scratch limit threshold in bytes for this agent. This limit can be
+   * modified using the hsa_amd_agent_set_async_scratch_limit call.
+   * - AQL dispatches that require scratch-memory above this threshold will trigger a
+   *   scratch use-once.
+   * - For AQL dispatches using less scratch-memory than this threshold, ROCr will
+   *   permanently assign the allocated scratch memory to the queue handling the dispatch.
+   *   This memory can be reclaimed by calling hsa_amd_agent_set_async_scratch_limit
+   *   with a threshold lower than the current value.
+   *
+   * The type of this attribute is uint64_t.
+   */
+  HSA_AMD_AGENT_INFO_SCRATCH_LIMIT_CURRENT = 0xA117,
+  /**
+   * Queries the driver for clock counters of the agent.
+   * The type of this attribute is hsa_amd_clock_counters_t.
+   */
+  HSA_AMD_AGENT_INFO_CLOCK_COUNTERS = 0xA118
+} hsa_amd_agent_info_t;
+
+/**
+ * @brief Agent memory property flag bits (see ::HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES).
+ */
+typedef enum hsa_amd_agent_memory_properties_s {
+  HSA_AMD_MEMORY_PROPERTY_AGENT_IS_APU = (1 << 0), /* Bit 0. */
+} hsa_amd_agent_memory_properties_t;
+
+/**
+ * @brief SDMA engine IDs. Each ID is a distinct single set bit: engine N has the value (1 << N).
+ */
+typedef enum hsa_amd_sdma_engine_id {
+  HSA_AMD_SDMA_ENGINE_0 = 0x1,
+  HSA_AMD_SDMA_ENGINE_1 = 0x2,
+  HSA_AMD_SDMA_ENGINE_2 = 0x4,
+  HSA_AMD_SDMA_ENGINE_3 = 0x8,
+  HSA_AMD_SDMA_ENGINE_4 = 0x10,
+  HSA_AMD_SDMA_ENGINE_5 = 0x20,
+  HSA_AMD_SDMA_ENGINE_6 = 0x40,
+  HSA_AMD_SDMA_ENGINE_7 = 0x80,
+  HSA_AMD_SDMA_ENGINE_8 = 0x100,
+  HSA_AMD_SDMA_ENGINE_9 = 0x200,
+  HSA_AMD_SDMA_ENGINE_10 = 0x400,
+  HSA_AMD_SDMA_ENGINE_11 = 0x800,
+  HSA_AMD_SDMA_ENGINE_12 = 0x1000,
+  HSA_AMD_SDMA_ENGINE_13 = 0x2000,
+  HSA_AMD_SDMA_ENGINE_14 = 0x4000,
+  HSA_AMD_SDMA_ENGINE_15 = 0x8000
+} hsa_amd_sdma_engine_id_t;
+
+typedef struct hsa_amd_hdp_flush_s { /* Value type of ::HSA_AMD_AGENT_INFO_HDP_FLUSH (addresses of the HDP flush registers). */
+  uint32_t* HDP_MEM_FLUSH_CNTL; /* NOTE(review): presumably the memory-flush control register address - confirm. */
+  uint32_t* HDP_REG_FLUSH_CNTL; /* NOTE(review): presumably the register-flush control register address - confirm. */
+} hsa_amd_hdp_flush_t;
+
+/**
+ * @brief Region attributes. In C++ the underlying type is fixed to int.
+ */
+#ifdef __cplusplus
+typedef enum hsa_amd_region_info_s : int {
+#else
+typedef enum hsa_amd_region_info_s {
+#endif
+  /**
+   * Whether the host can access the region. The type of this attribute
+   * is bool.
+   */
+  HSA_AMD_REGION_INFO_HOST_ACCESSIBLE = 0xA000,
+  /**
+   * Base address of the region in flat address space.
+   */
+  HSA_AMD_REGION_INFO_BASE = 0xA001,
+  /**
+   * Memory Interface width, the return value type is uint32_t.
+   * This attribute is deprecated. Use HSA_AMD_AGENT_INFO_MEMORY_WIDTH.
+   */
+  HSA_AMD_REGION_INFO_BUS_WIDTH = 0xA002,
+  /**
+   * Max Memory Clock, the return value type is uint32_t.
+   * This attribute is deprecated. Use HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY.
+   */
+  HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY = 0xA003,
+} hsa_amd_region_info_t;
+
+/**
+ * @brief Coherency attributes of a fine grain region.
+ */
+typedef enum hsa_amd_coherency_type_s {
+  /**
+   * Coherent region.
+   */
+  HSA_AMD_COHERENCY_TYPE_COHERENT = 0,
+  /**
+   * Non-coherent region.
+   */
+  HSA_AMD_COHERENCY_TYPE_NONCOHERENT = 1
+} hsa_amd_coherency_type_t;
+
+
+/**
+ * @brief dmabuf mapping types. In C++ the underlying type is fixed to int.
+ */
+#ifdef __cplusplus
+typedef enum hsa_amd_dma_buf_mapping_type_s : int {
+#else
+typedef enum hsa_amd_dma_buf_mapping_type_s {
+#endif
+  HSA_AMD_DMABUF_MAPPING_TYPE_NONE = 0,
+  HSA_AMD_DMABUF_MAPPING_TYPE_PCIE = 1
+} hsa_amd_dma_buf_mapping_type_t;
+/**
+ * @brief Get the coherency type of the fine grain region of an agent.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[out] type Pointer to a memory location where the HSA runtime will
+ * store the coherency type of the fine grain region. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p type is NULL.
+ */
+hsa_status_t HSA_API hsa_amd_coherency_get_type(hsa_agent_t agent,
+                                                hsa_amd_coherency_type_t* type);
+
+/**
+ * @brief Set the coherency type of the fine grain region of an agent.
+ * Deprecated.  This is supported on KV platforms.  For backward compatibility,
+ * calls on other platforms will spuriously succeed.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] type The coherency type to be set.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p type is invalid.
+ */
+hsa_status_t HSA_API hsa_amd_coherency_set_type(hsa_agent_t agent,
+                                                hsa_amd_coherency_type_t type);
+
+/** @} */
+
+/** \defgroup profile Profiling
+ *  @{
+ */
+
+/**
+ * @brief Structure containing profiling dispatch time information.
+ *
+ * Times are reported as ticks in the domain of the HSA system clock.
+ * The HSA system clock tick and frequency are obtained via hsa_system_get_info.
+ */
+typedef struct hsa_amd_profiling_dispatch_time_s {
+  /**
+   * Dispatch packet processing start time.
+   */
+  uint64_t start;
+  /**
+   * Dispatch packet completion time.
+   */
+  uint64_t end;
+} hsa_amd_profiling_dispatch_time_t;
+
+/**
+ * @brief Structure containing profiling async copy time information.
+ *
+ * Times are reported as ticks in the domain of the HSA system clock.
+ * The HSA system clock tick and frequency are obtained via hsa_system_get_info.
+ */
+typedef struct hsa_amd_profiling_async_copy_time_s {
+  /**
+   * Async copy processing start time.
+   */
+  uint64_t start;
+  /**
+   * Async copy completion time.
+   */
+  uint64_t end;
+} hsa_amd_profiling_async_copy_time_t;
+
+/**
+ * @brief Enable or disable the profiling capability of a queue.
+ *
+ * @param[in] queue A valid queue. Must not be NULL.
+ *
+ * @param[in] enable 1 to enable profiling. 0 to disable profiling.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL.
+ */
+hsa_status_t HSA_API
+    hsa_amd_profiling_set_profiler_enabled(hsa_queue_t* queue, int enable);
+
+/**
+ * @brief Enable or disable asynchronous memory copy profiling.
+ *
+ * @details The runtime will provide the copy processing start timestamp and
+ * completion timestamp of each call to hsa_amd_memory_async_copy if the
+ * async copy profiling is enabled prior to the call to
+ * hsa_amd_memory_async_copy. The completion signal object is used to
+ * hold the last async copy start and end timestamps. The client can retrieve
+ * these timestamps via a call to hsa_amd_profiling_get_async_copy_time.
+ *
+ * @param[in] enable True to enable profiling. False to disable profiling.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed on allocating resources
+ * needed to profile the asynchronous copy.
+ */
+hsa_status_t HSA_API
+    hsa_amd_profiling_async_copy_enable(bool enable);
+
+/**
+ * @brief Retrieve packet processing time stamps.
+ *
+ * @param[in] agent The agent with which the signal was last used.  For
+ * instance, if the profiled dispatch packet is dispatched onto queue Q,
+ * which was created on agent A, then this parameter must be A.
+ *
+ * @param[in] signal A signal used as the completion signal of the dispatch
+ * packet to retrieve time stamps from.  This dispatch packet must have been
+ * issued to a queue with profiling enabled and have already completed.  Also
+ * the signal must not have yet been used in any other packet following the
+ * completion of the profiled dispatch packet.
+ *
+ * @param[out] time Packet processing timestamps in the HSA system clock
+ * domain. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL The signal is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p time is NULL.
+ */
+hsa_status_t HSA_API hsa_amd_profiling_get_dispatch_time(
+    hsa_agent_t agent, hsa_signal_t signal,
+    hsa_amd_profiling_dispatch_time_t* time);
+
+/**
+ * @brief Retrieve asynchronous copy timestamps.
+ *
+ * @details Async copy profiling is enabled via a call to
+ * hsa_amd_profiling_async_copy_enable.
+ *
+ * @param[in] signal A signal used as the completion signal of the call to
+ * hsa_amd_memory_async_copy.
+ *
+ * @param[out] time Async copy processing timestamps in the HSA system clock
+ * domain. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL The signal is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p time is NULL.
+ */
+hsa_status_t HSA_API hsa_amd_profiling_get_async_copy_time(
+    hsa_signal_t signal, hsa_amd_profiling_async_copy_time_t* time);
+
+/**
+ * @brief Computes the frequency ratio and offset between the agent clock and
+ * HSA system clock and converts the agent's tick to HSA system domain tick.
+ *
+ * @param[in] agent The agent used to retrieve the agent_tick. It is the user's
+ * responsibility to make sure the tick number is from this agent; otherwise,
+ * the behavior is undefined.
+ *
+ * @param[in] agent_tick The tick count retrieved from the specified @p agent.
+ *
+ * @param[out] system_tick The translated HSA system domain clock counter tick.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p system_tick is NULL.
+ */
+hsa_status_t HSA_API
+    hsa_amd_profiling_convert_tick_to_system_domain(hsa_agent_t agent,
+                                                    uint64_t agent_tick,
+                                                    uint64_t* system_tick);
+
+/** @} */
+
+/** \defgroup status Runtime notifications
+ *  @{
+ */
+
+/**
+ * @brief Signal attribute flags.
+ */
+typedef enum {
+  /**
+   * Signal will only be consumed by AMD GPUs.  Limits signal consumption to
+   * AMD GPU agents only.  Ignored if @p num_consumers is not zero (zero means all agents).
+   */
+  HSA_AMD_SIGNAL_AMD_GPU_ONLY = 1,
+  /**
+   * Signal may be used for interprocess communication.
+   * IPC signals can be read, written, and waited on from any process.
+   * Profiling using an IPC enabled signal is only supported in a single process
+   * at a time.  Producing profiling data in one process and consuming it in
+   * another process is undefined.
+   */
+  HSA_AMD_SIGNAL_IPC = 2,
+} hsa_amd_signal_attribute_t;
+
+/**
+ * @brief Create a signal with specific attributes.
+ *
+ * @param[in] initial_value Initial value of the signal.
+ *
+ * @param[in] num_consumers Size of @p consumers. A value of 0 indicates that
+ * any agent might wait on the signal.
+ *
+ * @param[in] consumers List of agents that might consume (wait on) the
+ * signal. If @p num_consumers is 0, this argument is ignored; otherwise, the
+ * HSA runtime might use the list to optimize the handling of the signal
+ * object. If an agent not listed in @p consumers waits on the returned
+ * signal, the behavior is undefined. The memory associated with @p consumers
+ * can be reused or freed after the function returns.
+ *
+ * @param[in] attributes Requested signal attributes (see ::hsa_amd_signal_attribute_t).  Multiple
+ * signal attributes may be requested by combining them with bitwise OR.  Requesting no attributes
+ * (@p attributes == 0) results in the same signal as would have been obtained
+ * via hsa_signal_create.
+ *
+ * @param[out] signal Pointer to a memory location where the HSA runtime will
+ * store the newly created signal handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p signal is NULL, @p
+ * num_consumers is greater than 0 but @p consumers is NULL, or @p consumers
+ * contains duplicates.
+ */
+hsa_status_t HSA_API hsa_amd_signal_create(hsa_signal_value_t initial_value, uint32_t num_consumers,
+                                           const hsa_agent_t* consumers, uint64_t attributes,
+                                           hsa_signal_t* signal);
+
+/**
+ * @brief Returns a pointer to the value of a signal.
+ *
+ * Use of this API does not modify the lifetime of ::signal and any
+ * hsa_signal_value_t retrieved by this API has lifetime equal to that of
+ * ::signal.
+ *
+ * This API is intended for partial interoperability with non-HSA compatible
+ * devices and should not be used where HSA interfaces are available.
+ *
+ * Use of the signal value must comply with use restrictions of ::signal.
+ * Use may result in data races if the operations performed are not platform
+ * atomic.  Use with HSA_AMD_SIGNAL_AMD_GPU_ONLY or HSA_AMD_SIGNAL_IPC
+ * attributed signals is required.
+ *
+ * @param[in] signal Signal handle to extract the signal value pointer from.
+ *
+ * @param[out] value_ptr Location where the extracted signal value pointer will be placed.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p signal is not a valid hsa_signal_t.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p value_ptr is NULL.
+ */
+hsa_status_t hsa_amd_signal_value_pointer(hsa_signal_t signal, /* NOTE(review): no HSA_API here, unlike sibling prototypes - confirm intent. */
+                                          volatile hsa_signal_value_t** value_ptr);
+
+/**
+ * @brief Asynchronous signal handler function type.
+ *
+ * @details Type definition of callback function to be used with
+ * hsa_amd_signal_async_handler. This callback is invoked if the associated
+ * signal and condition are met. The callback receives the value of the signal
+ * which satisfied the associated wait condition and a user provided value. If
+ * the callback returns true then the callback will be called again if the
+ * associated signal and condition are satisfied again. If the callback returns
+ * false then it will not be called again.
+ *
+ * @param[in] value Contains the value of the signal observed by
+ * hsa_amd_signal_async_handler which caused the signal handler to be invoked.
+ *
+ * @param[in] arg Contains the user provided value given when the signal handler
+ * was registered with hsa_amd_signal_async_handler.
+ *
+ * @retval true resumes monitoring the signal with this handler (as if calling
+ * hsa_amd_signal_async_handler again with identical parameters)
+ *
+ * @retval false stops monitoring the signal with this handler (handler will
+ * not be called again for this signal)
+ *
+ */
+typedef bool (*hsa_amd_signal_handler)(hsa_signal_value_t value, void* arg);
+
+/**
+ * @brief Register asynchronous signal handler function.
+ *
+ * @details Allows registering a callback function and user provided value with
+ * a signal and wait condition. The callback will be invoked if the associated
+ * signal and wait condition are satisfied. Callbacks will be invoked serially
+ * but in an arbitrary order so callbacks should be independent of each other.
+ * After being invoked a callback may continue to wait for its associated signal
+ * and condition and, possibly, be invoked again. Or the callback may stop
+ * waiting. If the callback returns true then it will continue waiting and may
+ * be called again. If false then the callback will not wait again and will not
+ * be called again for the associated signal and condition. It is possible to
+ * register the same callback multiple times with the same or different signals
+ * and/or conditions. Each registration of the callback will be treated entirely
+ * independently.
+ *
+ * @param[in] signal HSA signal to be asynchronously monitored
+ *
+ * @param[in] cond condition value to monitor for
+ *
+ * @param[in] value signal value used in condition expression
+ *
+ * @param[in] handler asynchronous signal handler invoked when signal's
+ * condition is met
+ *
+ * @param[in] arg user provided value which is provided to handler when handler
+ * is invoked
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p signal is not a valid hsa_signal_t
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p handler is invalid (NULL)
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime is out of
+ * resources or blocking signals are not supported by the HSA driver component.
+ *
+ */
+hsa_status_t HSA_API
+    hsa_amd_signal_async_handler(hsa_signal_t signal,
+                                 hsa_signal_condition_t cond,
+                                 hsa_signal_value_t value,
+                                 hsa_amd_signal_handler handler, void* arg);
+
+/**
+ * @brief Wait for all signal-condition pairs to be satisfied.
+ *
+ * @details Allows waiting for all of several signal and condition pairs to be
+ * satisfied. The function returns 0 if all signals met their conditions and -1
+ * (i.e. UINT32_MAX, since the return type is uint32_t) on a timeout. Each signal's satisfying
+ * value is returned in @p satisfying_values unless @p satisfying_values is nullptr. NULL and invalid
+ * signals are considered to have value 0 and their conditions already satisfied. This
+ * function provides only relaxed memory semantics.
+ */
+uint32_t HSA_API hsa_amd_signal_wait_all(uint32_t signal_count, hsa_signal_t* signals,
+                                         hsa_signal_condition_t* conds, hsa_signal_value_t* values,
+                                         uint64_t timeout_hint, hsa_wait_state_t wait_hint,
+                                         hsa_signal_value_t* satisfying_values);
+
+/**
+ * @brief Wait for any signal-condition pair to be satisfied.
+ *
+ * @details Allows waiting for any of several signal and condition pairs to be
+ * satisfied. The function returns the index into the list of signals of the
+ * first satisfying signal-condition pair. The function returns
+ * std::numeric_limits<uint32_t>::max() (UINT32_MAX in C) if no valid signal is
+ * provided. The satisfying signal's value is returned in @p satisfying_value,
+ * unless @p satisfying_value is NULL or there's no valid signal in the
+ * signal-condition pairs. NULL and invalid signals are ignored. This function
+ * provides only relaxed memory semantics.
+ */
+uint32_t HSA_API
+    hsa_amd_signal_wait_any(uint32_t signal_count, hsa_signal_t* signals,
+                            hsa_signal_condition_t* conds,
+                            hsa_signal_value_t* values, uint64_t timeout_hint,
+                            hsa_wait_state_t wait_hint,
+                            hsa_signal_value_t* satisfying_value);
+
+/** @} */
+
+/**
+ * @brief Call a function asynchronously
+ *
+ * @details Provides access to the runtime's asynchronous event handling thread
+ * for general asynchronous functions.  Functions queued this way are executed
+ * in the same manner as if they were a signal handler whose signal is
+ * satisfied.
+ *
+ * @param[in] callback asynchronous function to be invoked
+ *
+ * @param[in] arg user provided value which is passed to @p callback when it
+ * is invoked
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is invalid (NULL)
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime is out of
+ * resources or blocking signals are not supported by the HSA driver component.
+ *
+ */
+hsa_status_t HSA_API
+    hsa_amd_async_function(void (*callback)(void* arg), void* arg);
+
+/** \addtogroup ext-images Images and samplers
+ *  @{
+ */
+
+/**
+ * @brief Encodes an opaque vendor specific image format.  The length of data
+ * depends on the underlying format.  This structure must not be copied as its
+ * true length can not be determined.
+ */
+typedef struct hsa_amd_image_descriptor_s {
+  /*
+  Version number of the descriptor
+  */
+  uint32_t version;
+
+  /*
+  Vendor and device PCI IDs for the format as VENDOR_ID<<16|DEVICE_ID.
+  */
+  uint32_t deviceID;
+
+  /*
+  Start of vendor specific data.
+  */
+  uint32_t data[1];
+} hsa_amd_image_descriptor_t;
+
+/**
+ * @brief Creates an image from an opaque vendor specific image format.
+ * Does not modify data at image_data.  Intended initially for
+ * accessing interop images.
+ *
+ * @param[in] agent Agent on which to create the image
+ *
+ * @param[in] image_descriptor Image descriptor
+ * (::hsa_ext_image_descriptor_t) of the image to create
+ *
+ * @param[in] image_layout Vendor specific image format
+ * (::hsa_amd_image_descriptor_t)
+ *
+ * @param[in] image_data Pointer to image backing store
+ *
+ * @param[in] access_permission Access permissions for the image object
+ *
+ * @param[out] image Created image object.
+ *
+ * @retval HSA_STATUS_SUCCESS Image created successfully
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating
+ * necessary resources
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT Bad or mismatched descriptor,
+ * null image_data, or mismatched access_permission.
+ */
+hsa_status_t HSA_API hsa_amd_image_create(
+    hsa_agent_t agent,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    const hsa_amd_image_descriptor_t *image_layout,
+    const void *image_data,
+    hsa_access_permission_t access_permission,
+    hsa_ext_image_t *image
+);
+
+/**
+ * @brief Query image limits.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] attribute HSA image info attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p value is NULL or @p attribute <
+ * HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS or @p attribute >
+ * HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS.
+ * NOTE(review): this function takes no queue, so
+ * ::HSA_STATUS_ERROR_INVALID_ARGUMENT may be the status actually intended
+ * here -- confirm against the implementation.
+ *
+ */
+hsa_status_t HSA_API hsa_amd_image_get_info_max_dim(hsa_agent_t agent,
+                                                    hsa_agent_info_t attribute,
+                                                    void* value);
+
+/** @} */
+
+/** \addtogroup queue Queues
+ *  @{
+ */
+
+/**
+ * @brief Set a queue's CU affinity mask.
+ *
+ * @details Enables the queue to run on only selected CUs.  The given mask is
+ * combined by bitwise AND with any device wide mask in HSA_CU_MASK before
+ * being applied.
+ * If num_cu_mask_count is 0 then the request is interpreted as a request to
+ * enable all CUs and no cu_mask array need be given.
+ *
+ * @param[in] queue A pointer to HSA queue.
+ *
+ * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits.
+ *
+ * @param[in] cu_mask Bit-vector representing the CU mask.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_CU_MASK_REDUCED The function was successfully executed
+ * but the given mask attempted to enable a CU which was disabled by
+ * HSA_CU_MASK.  CUs disabled by HSA_CU_MASK remain disabled.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is not
+ * a multiple of 32 or @p num_cu_mask_count is not 0 and cu_mask is NULL.
+ * Devices with work group processors must enable CUs in contiguous pairs
+ * starting at an even index, e.g. 0x33 (b'110011) is valid while 0x5 (b'101)
+ * and 0x6 (b'0110) are invalid.
+ *
+ */
+hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
+                                               uint32_t num_cu_mask_count,
+                                               const uint32_t* cu_mask);
+
+/**
+ * @brief Retrieve a queue's CU affinity mask.
+ *
+ * @details Returns the first num_cu_mask_count bits of a queue's CU mask.
+ * Ensure that num_cu_mask_count is at least as large as
+ * HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT to retrieve the entire mask.
+ *
+ * @param[in] queue A pointer to HSA queue.
+ *
+ * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits.
+ *
+ * @param[out] cu_mask Bit-vector representing the CU mask.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is 0, not
+ * a multiple of 32 or @p cu_mask is NULL.
+ *
+ */
+hsa_status_t HSA_API hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count,
+                                               uint32_t* cu_mask);
+
+/** @} */
+
+/** \addtogroup memory Memory
+ *  @{
+ */
+
+/**
+ * @brief Memory segments associated with a memory pool.
+ */
+typedef enum {
+  /**
+   * Global segment. Used to hold data that is shared by all agents.
+   */
+  HSA_AMD_SEGMENT_GLOBAL = 0,
+  /**
+   * Read-only segment. Used to hold data that remains constant during the
+   * execution of a kernel.
+   */
+  HSA_AMD_SEGMENT_READONLY = 1,
+  /**
+   * Private segment. Used to hold data that is local to a single work-item.
+   */
+  HSA_AMD_SEGMENT_PRIVATE = 2,
+  /**
+   * Group segment. Used to hold data that is shared by the work-items of a
+   * work-group.
+   */
+  HSA_AMD_SEGMENT_GROUP = 3,
+} hsa_amd_segment_t;
+
+/**
+ * @brief A memory pool encapsulates physical storage on an agent
+ * along with a memory access model.
+ *
+ * @details A memory pool encapsulates a physical partition of an agent's
+ * memory system along with a memory access model.  Division of a single
+ * memory system into separate pools allows querying each partition's access
+ * path properties (see ::hsa_amd_agent_memory_pool_get_info). Allocations
+ * from a pool are preferentially bound to that pool's physical partition.
+ * Binding to the pool's preferential physical partition may not be
+ * possible or persistent depending on the system's memory policy
+ * and/or state which is beyond the scope of HSA APIs.
+ *
+ * For example, a multi-node NUMA memory system may be represented by multiple
+ * pools with each pool providing size and access path information for the
+ * partition it represents.  Allocations from a pool are preferentially bound
+ * to the pool's partition (which in this example is a NUMA node) while
+ * following its memory access model. The actual placement may vary or migrate
+ * due to the system's NUMA policy and state, which is beyond the scope of
+ * HSA APIs.
+ */
+typedef struct hsa_amd_memory_pool_s {
+  /**
+   * Opaque handle.
+   */
+  uint64_t handle;
+} hsa_amd_memory_pool_t;
+
+typedef enum hsa_amd_memory_pool_global_flag_s {
+  /**
+   * The application can use allocations in the memory pool to store kernel
+   * arguments, and provide the values for the kernarg segment of
+   * a kernel dispatch.
+   */
+  HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT = 1,
+  /**
+   * Updates to memory in this pool conform to HSA memory consistency model.
+   * If this flag is set, then ::HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED
+   * must not be set.
+   */
+  HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED = 2,
+  /**
+   * Writes to memory in this pool can be performed by a single agent at a time.
+   */
+  HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED = 4,
+
+  /** Updates to memory in this memory pool have extended scope, acting as
+   * system-scope atomics for variables in memory regions of this type.
+   * Note: On non-compliant systems, device-specific actions may be required
+   * for system-scope coherence. */
+  HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED = 8,
+
+} hsa_amd_memory_pool_global_flag_t;
+
+typedef enum hsa_amd_memory_pool_location_s {
+    /**
+     * This memory pool resides on the host (CPU)
+     */
+    HSA_AMD_MEMORY_POOL_LOCATION_CPU = 0,
+    /**
+     * This memory pool resides on a GPU
+     */
+    HSA_AMD_MEMORY_POOL_LOCATION_GPU = 1
+} hsa_amd_memory_pool_location_t;
+
+/**
+ * @brief Memory pool features.
+ */
+typedef enum {
+  /**
+  * Segment where the memory pool resides. The type of this attribute is
+  * ::hsa_amd_segment_t.
+  */
+  HSA_AMD_MEMORY_POOL_INFO_SEGMENT = 0,
+  /**
+  * Flag mask. The value of this attribute is undefined if the value of
+  * ::HSA_AMD_MEMORY_POOL_INFO_SEGMENT is not ::HSA_AMD_SEGMENT_GLOBAL. The type
+  * of this attribute is uint32_t, a bit-field of
+  * ::hsa_amd_memory_pool_global_flag_t values.
+  */
+  HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS = 1,
+  /**
+  * Size of this pool, in bytes. The type of this attribute is size_t.
+  */
+  HSA_AMD_MEMORY_POOL_INFO_SIZE = 2,
+  /**
+  * Indicates whether memory in this pool can be allocated using
+  * ::hsa_amd_memory_pool_allocate. The type of this attribute is bool.
+  *
+  * The value of this flag is always false for memory pools in the group and
+  * private segments.
+  */
+  HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED = 5,
+  /**
+   * Allocation granularity of buffers allocated by
+   * ::hsa_amd_memory_pool_allocate
+   * in this memory pool. The size of a buffer allocated in this pool is a
+   * multiple of the value of this attribute. While this is the minimum size of
+   * allocation allowed, it is recommended to use
+   * HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_REC_GRANULE to obtain the recommended
+   * allocation granularity size for this pool.
+   * The value of this attribute is only defined if
+   * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for
+   * this pool. The type of this attribute is size_t.
+   */
+  HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6,
+  /**
+   * Alignment of buffers allocated by ::hsa_amd_memory_pool_allocate in this
+   * pool. The value of this attribute is only defined if
+   * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for this pool, and
+   * must be a power of 2. The type of this attribute is size_t.
+   */
+  HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7,
+  /**
+   * This memory_pool can be made directly accessible by all the agents in the
+   * system (::hsa_amd_agent_memory_pool_get_info does not return
+   * ::HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED for any agent). The type of this
+   * attribute is bool.
+   */
+  HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15,
+  /**
+   * Maximum aggregate allocation size in bytes. The type of this attribute
+   * is size_t.
+   */
+  HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE = 16,
+  /**
+   * Location of this memory pool. The type of this attribute
+   * is hsa_amd_memory_pool_location_t.
+   */
+  HSA_AMD_MEMORY_POOL_INFO_LOCATION = 17,
+  /**
+   * Internal block size for allocations. This would also be the recommended
+   * granularity size for allocations as this prevents internal fragmentation.
+   * The value of this attribute is only defined if
+   * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for this pool.
+   * The type of this attribute is size_t.
+   */
+  HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_REC_GRANULE = 18,
+} hsa_amd_memory_pool_info_t;
+
+/**
+ * @brief Memory pool flag used to specify allocation directives
+ *
+ */
+typedef enum hsa_amd_memory_pool_flag_s {
+  /**
+   * Allocates memory that conforms to standard HSA memory consistency model
+   */
+  HSA_AMD_MEMORY_POOL_STANDARD_FLAG = 0,
+  /**
+   * Allocates fine grain memory type where memory ordering is per point to point
+   * connection. Atomic memory operations on these memory buffers are not
+   * guaranteed to be visible at system scope.
+   */
+  HSA_AMD_MEMORY_POOL_PCIE_FLAG = (1 << 0),
+  /**
+   *  Allocates physically contiguous memory
+   */
+  HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG = (1 << 1),
+  /**
+   *  Allocates executable memory
+   */
+  HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG = (1 << 2),
+  /**
+   *  Allocates uncached memory
+   */
+  HSA_AMD_MEMORY_POOL_UNCACHED_FLAG = (1 << 3),
+} hsa_amd_memory_pool_flag_t;
+
+/**
+ * @brief Get the current value of an attribute of a memory pool.
+ *
+ * @param[in] memory_pool A valid memory pool.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ */
+hsa_status_t HSA_API
+    hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool,
+                                 hsa_amd_memory_pool_info_t attribute,
+                                 void* value);
+
+/**
+ * @brief Iterate over the memory pools associated with a given agent, and
+ * invoke an application-defined callback on every iteration.
+ *
+ * @details An agent can directly access buffers located in some memory pool, or
+ * be enabled to access them by the application (see ::hsa_amd_agents_allow_access),
+ * yet that memory pool may not be returned by this function for that given
+ * agent.
+ *
+ * A memory pool of fine-grained type must be associated only with the host.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] callback Callback to be invoked on the same thread that called
+ * ::hsa_amd_agent_iterate_memory_pools, serially, once per memory pool that is
+ * associated with the agent.  The HSA runtime passes two arguments to the
+ * callback: the memory pool, and the application data.  If @p callback
+ * returns a status other than ::HSA_STATUS_SUCCESS for a particular iteration,
+ * the traversal stops and ::hsa_amd_agent_iterate_memory_pools returns that status
+ * value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools(
+    hsa_agent_t agent,
+    hsa_status_t (*callback)(hsa_amd_memory_pool_t memory_pool, void* data),
+    void* data);
+
+/**
+ * @brief Allocate a block of memory (or buffer) in the specified pool.
+ *
+ * @param[in] memory_pool Memory pool where to allocate memory from. The memory
+ * pool must have the ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED flag set.
+ *
+ * @param[in] size Allocation size, in bytes. Must not be zero. This value is
+ * rounded up to the nearest multiple of
+ * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE in @p memory_pool.
+ *
+ * @param[in] flags A bit-field that is used to specify allocation
+ * directives (presumably a combination of ::hsa_amd_memory_pool_flag_t
+ * values -- confirm against the implementation).
+ *
+ * @param[out] ptr Pointer to the location where to store the base virtual
+ * address of the allocated block. The returned base address is aligned to the
+ * value of ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT in
+ * @p memory_pool. If the allocation fails, the returned value is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES No memory is available.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL The memory pool is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The host is not allowed to
+ * allocate memory in @p memory_pool, or @p size is greater than
+ * the value of HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE in @p memory_pool.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p size is 0,
+ * or flags is not 0.
+ * NOTE(review): "flags is not 0" appears to conflict with the non-zero
+ * allocation directives declared in hsa_amd_memory_pool_flag_t; this clause
+ * may be stale -- confirm against the implementation.
+ *
+ */
+hsa_status_t HSA_API
+    hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool, size_t size,
+                                 uint32_t flags, void** ptr);
+
+/**
+ * @brief Deallocate a block of memory previously allocated using
+ * ::hsa_amd_memory_pool_allocate.
+ *
+ * @param[in] ptr Pointer to a memory block. If @p ptr does not match a value
+ * previously returned by ::hsa_amd_memory_pool_allocate, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ */
+hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr);
+
+/**
+ * @brief Asynchronously copy a block of memory from the location pointed to by
+ * @p src on the @p src_agent to the memory block pointed to by @p dst on the @p
+ * dst_agent.
+ * Because the DMA engines used may not be in the same coherency domain, the caller must ensure
+ * that buffers are system-level coherent. In general this requires the sending device to have
+ * released the buffer to system scope prior to executing the copy API and the receiving device
+ * must execute a system scope acquire fence prior to use of the destination buffer.
+ *
+ * @param[out] dst Buffer where the content is to be copied.
+ *
+ * @param[in] dst_agent Agent associated with the @p dst. The agent must be able to directly
+ * access both the source and destination buffers in their current locations.
+ * May be zero in which case the runtime will attempt to discover the destination agent.
+ * Discovery may have variable and/or high latency.
+ *
+ * @param[in] src A valid pointer to the source of data to be copied. The source
+ * buffer must not overlap with the destination buffer, otherwise the copy will succeed
+ * but the contents of @p dst are undefined.
+ *
+ * @param[in] src_agent Agent associated with the @p src. The agent must be able to directly
+ * access both the source and destination buffers in their current locations.
+ * May be zero in which case the runtime will attempt to discover the source agent.
+ * Discovery may have variable and/or high latency.
+ *
+ * @param[in] size Number of bytes to copy. If @p size is 0, no copy is
+ * performed and the function returns success. Copying a number of bytes larger
+ * than the size of the buffers pointed by @p dst or @p src results in undefined
+ * behavior.
+ *
+ * @param[in] num_dep_signals Number of dependent signals. Can be 0.
+ *
+ * @param[in] dep_signals List of signals that must be waited on before the copy
+ * operation starts. The copy will start after every signal has been observed with
+ * the value 0. The dependent signal should not include completion signal from
+ * hsa_amd_memory_async_copy operation to be issued in future as that can result
+ * in a deadlock. If @p num_dep_signals is 0, this argument is ignored.
+ *
+ * @param[in] completion_signal Signal used to indicate completion of the copy
+ * operation. When the copy operation is finished, the value of the signal is
+ * decremented. The runtime indicates that an error has occurred during the copy
+ * operation by setting the value of the completion signal to a negative
+ * number. The signal handle must not be 0.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. The
+ * application is responsible for checking for asynchronous error conditions
+ * (see the description of @p completion_signal).
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT An agent is invalid or no discovered agent has access.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p completion_signal is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The source or destination
+ * pointers are NULL, or the completion signal is 0.
+ */
+hsa_status_t HSA_API
+    hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent, const void* src,
+                              hsa_agent_t src_agent, size_t size,
+                              uint32_t num_dep_signals,
+                              const hsa_signal_t* dep_signals,
+                              hsa_signal_t completion_signal);
+
+/**
+ * @brief Asynchronously copy a block of memory from the location pointed to by
+ * @p src on the @p src_agent to the memory block pointed to by @p dst on the @p
+ * dst_agent, using the engine selected by @p engine_id.
+ *
+ * WARNING: Concurrent use of this call with hsa_amd_memory_async_copy can result
+ * in resource conflicts as HSA runtime will auto assign engines with the latter
+ * call.  Approach using both calls concurrently with caution.
+ *
+ * All param definitions are identical to hsa_amd_memory_async_copy with the
+ * exception of engine_id and force_copy_on_sdma.
+ *
+ * @param[in] engine_id Target engine defined by hsa_amd_sdma_engine_id_t.
+ * Client should use hsa_amd_memory_copy_engine_status first to get the ID
+ * availability.
+ *
+ * @param[in] force_copy_on_sdma By default, blit kernel copies are used when
+ * dst_agent == src_agent.  Setting this to true will force the copy over SDMA1.
+ *
+ * All return definitions are identical to hsa_amd_memory_async_copy with the
+ * following amendments:
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The source or destination
+ * pointers are NULL, or the completion signal is 0 or engine_id is improperly
+ * bounded.
+ */
+hsa_status_t HSA_API
+    hsa_amd_memory_async_copy_on_engine(void* dst, hsa_agent_t dst_agent, const void* src,
+                              hsa_agent_t src_agent, size_t size,
+                              uint32_t num_dep_signals,
+                              const hsa_signal_t* dep_signals,
+                              hsa_signal_t completion_signal,
+                              hsa_amd_sdma_engine_id_t engine_id,
+                              bool force_copy_on_sdma);
+/**
+ * @brief Reports the availability of SDMA copy engines.
+ *
+ * @param[in] dst_agent Destination agent of copy status direction.
+ *
+ * @param[in] src_agent Source agent of copy status direction.
+ *
+ * @param[out] engine_ids_mask returns available SDMA engine IDs that can be masked
+ * with hsa_amd_sdma_engine_id_t.
+ *
+ * @retval ::HSA_STATUS_SUCCESS Agent has available SDMA engines.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Agent does not have available SDMA engines.
+ *
+ */
+hsa_status_t HSA_API
+hsa_amd_memory_copy_engine_status(hsa_agent_t dst_agent, hsa_agent_t src_agent,
+                                      uint32_t *engine_ids_mask);
+ /**
+ * @brief Returns the preferred SDMA engine mask.
+ *
+ * @param[in] dst_agent Destination agent of copy status direction.
+ *
+ * @param[in] src_agent Source agent of copy status direction.
+ *
+ * @param[out] recommended_ids_mask returns available SDMA engine IDs for max bandwidth
+ * that can be masked with hsa_amd_sdma_engine_id_t. Can be 0 if there is no preference.
+ *
+ * @retval ::HSA_STATUS_SUCCESS For mask returned
+ *
+ */
+hsa_status_t HSA_API
+hsa_amd_memory_get_preferred_copy_engine(hsa_agent_t dst_agent, hsa_agent_t src_agent,
+                                         uint32_t* recommended_ids_mask);
+
+/*
+[Provisional API]
+Pitched memory descriptor.
+All elements must be 4 byte aligned.  Pitch and slice are in bytes.
+*/
+typedef struct hsa_pitched_ptr_s {
+  void* base;
+  size_t pitch;
+  size_t slice;
+} hsa_pitched_ptr_t;
+
+/*
+[Provisional API]
+Copy direction flag.
+*/
+typedef enum {
+  hsaHostToHost = 0,
+  hsaHostToDevice = 1,
+  hsaDeviceToHost = 2,
+  hsaDeviceToDevice = 3
+} hsa_amd_copy_direction_t;
+
+/*
+[Provisional API]
+SDMA 3D memory copy API.  The same requirements must be met by src and dst as in
+hsa_amd_memory_async_copy.
+Both src and dst must be directly accessible to the copy_agent during the copy, src and dst rects
+must not overlap.
+CPU agents are not supported.  API requires SDMA and will return an error if SDMA is not available.
+Offsets and range carry x in bytes, y and z in rows and layers.
+*/
+hsa_status_t HSA_API hsa_amd_memory_async_copy_rect(
+    const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src,
+    const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent,
+    hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
+    hsa_signal_t completion_signal);
+
+/**
+ * @brief Type of accesses to a memory pool from a given agent.
+ */
+typedef enum {
+  /**
+  * The agent cannot directly access any buffer in the memory pool.
+  */
+  HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED = 0,
+  /**
+  * The agent can directly access a buffer located in the pool; the application
+  * does not need to invoke ::hsa_amd_agents_allow_access.
+  */
+  HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT = 1,
+  /**
+  * The agent can directly access a buffer located in the pool, but only if the
+  * application has previously requested access to that buffer using
+  * ::hsa_amd_agents_allow_access.
+  */
+  HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT = 2
+} hsa_amd_memory_pool_access_t;
+
+/**
+ * @brief Bus/link types reported in ::hsa_amd_memory_pool_link_info_t.
+ */
+typedef enum {
+  /**
+  * Hyper-transport bus type.
+  */
+  HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT = 0,
+
+  /**
+  * QPI bus type.
+  */
+  HSA_AMD_LINK_INFO_TYPE_QPI = 1,
+
+  /**
+  * PCIe bus type.
+  */
+  HSA_AMD_LINK_INFO_TYPE_PCIE = 2,
+
+  /**
+  * Infiniband bus type.
+  */
+  HSA_AMD_LINK_INFO_TYPE_INFINBAND = 3,
+
+  /**
+  * xGMI link type.
+  */
+  HSA_AMD_LINK_INFO_TYPE_XGMI = 4
+
+} hsa_amd_link_info_type_t;
+
+/**
+ * @brief Link properties when accessing the memory pool from the specified
+ * agent.
+ */
+typedef struct hsa_amd_memory_pool_link_info_s {
+  /**
+  * Minimum transfer latency (rounded to ns).
+  */
+  uint32_t min_latency;
+
+  /**
+  * Maximum transfer latency (rounded to ns).
+  */
+  uint32_t max_latency;
+
+  /**
+  * Minimum link interface bandwidth in MB/s.
+  */
+  uint32_t min_bandwidth;
+
+  /**
+  * Maximum link interface bandwidth in MB/s.
+  */
+  uint32_t max_bandwidth;
+
+  /**
+  * Support for 32-bit atomic transactions.
+  */
+  bool atomic_support_32bit;
+
+  /**
+  * Support for 64-bit atomic transactions.
+  */
+  bool atomic_support_64bit;
+
+  /**
+  * Support for cache coherent transactions.
+  */
+  bool coherent_support;
+
+  /**
+  * The type of bus/link.
+  */
+  hsa_amd_link_info_type_t link_type;
+
+  /**
+   * NUMA distance of memory pool relative to querying agent
+   */
+  uint32_t numa_distance;
+} hsa_amd_memory_pool_link_info_t;
+
+/**
+ * @brief Properties of the relationship between an agent and a memory pool.
+ */
+typedef enum {
+  /**
+  * Access to buffers located in the memory pool. The type of this attribute
+  * is ::hsa_amd_memory_pool_access_t.
+  *
+  * An agent can always directly access buffers currently located in a memory
+  * pool that is associated (the memory_pool is one of the values returned by
+  * ::hsa_amd_agent_iterate_memory_pools on the agent) with that agent. If the
+  * buffer is currently located in a memory pool that is not associated with
+  * the agent, and the value returned by this function for the given
+  * combination of agent and memory pool is not
+  * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED, the application still needs to invoke
+  * ::hsa_amd_agents_allow_access in order to gain direct access to the buffer.
+  *
+  * If the given agent can directly access buffers in the pool, the result is not
+  * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. If the memory pool is associated with
+  * the agent, or it is of fine-grained type, the result must not be
+  * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. If the memory pool is not associated
+  * with the agent, and does not reside in the global segment, the result must
+  * be HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED.
+  */
+  HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS = 0,
+
+  /**
+  * Number of links to hop when accessing the memory pool from the specified
+  * agent. The value of this attribute is zero if the memory pool is associated
+  * with the agent, or if the access type is
+  * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. The type of this attribute is
+  * uint32_t.
+  */
+  HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS = 1,
+
+  /**
+  * Details of each link hop when accessing the memory pool starting from the
+  * specified agent. The type of this attribute is an array size of
+  * HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS with each element containing
+  * ::hsa_amd_memory_pool_link_info_t.
+  */
+  HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO = 2
+
+} hsa_amd_agent_memory_pool_info_t;
+
+/**
+ * @brief Get the current value of an attribute of the relationship between an
+ * agent and a memory pool.
+ *
+ * @param[in] agent Agent.
+ *
+ * @param[in] memory_pool Memory pool.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ */
+hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info(
+    hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool,
+    hsa_amd_agent_memory_pool_info_t attribute, void* value);
+
+/**
+ * @brief Enable direct access to a buffer from a given set of agents.
+ *
+ * @details
+ *
+ * Upon return, only the listed agents and the agent associated with the
+ * buffer's memory pool have direct access to the @p ptr.
+ *
+ * Any agent that has access to the buffer before and after the call to
+ * ::hsa_amd_agents_allow_access will also have access while
+ * ::hsa_amd_agents_allow_access is in progress.
+ *
+ * The caller is responsible for ensuring that each agent in the list
+ * is able to access the memory pool containing @p ptr
+ * (using ::hsa_amd_agent_memory_pool_get_info with ::HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS attribute),
+ * otherwise an error code is returned.
+ *
+ * @param[in] num_agents Size of @p agents.
+ *
+ * @param[in] agents List of agents. If @p num_agents is 0, this argument is
+ * ignored.
+ *
+ * @param[in] flags A list of bit-fields that is used to specify access
+ * information on a per-agent basis. This is currently reserved and must be NULL.
+ *
+ * @param[in] ptr A buffer previously allocated using ::hsa_amd_memory_pool_allocate.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_agents is 0, @p agents
+ * is NULL, @p flags is not NULL, or access cannot be enabled for one or more
+ * agents because @p ptr is allocated from a pool inaccessible to them.
+ *
+ */
+hsa_status_t HSA_API
+    hsa_amd_agents_allow_access(uint32_t num_agents, const hsa_agent_t* agents,
+                                const uint32_t* flags, const void* ptr);
+
+/**
+ * @brief Query if buffers currently located in some memory pool can be
+ * relocated to a destination memory pool.
+ *
+ * @details If the returned value is non-zero, a migration of a buffer to @p
+ * dst_memory_pool using ::hsa_amd_memory_migrate may nevertheless fail due to
+ * resource limitations.
+ *
+ * @param[in] src_memory_pool Source memory pool.
+ *
+ * @param[in] dst_memory_pool Destination memory pool.
+ *
+ * @param[out] result Pointer to a memory location where the result of the query
+ * is stored. Must not be NULL. If buffers currently located in @p
+ * src_memory_pool can be relocated to @p dst_memory_pool, the result is
+ * true.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL One of the memory pools is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL.
+ */
+hsa_status_t HSA_API
+    hsa_amd_memory_pool_can_migrate(hsa_amd_memory_pool_t src_memory_pool,
+                                    hsa_amd_memory_pool_t dst_memory_pool,
+                                    bool* result);
+
+/**
+ * @brief Relocate a buffer to a new memory pool.
+ *
+ * @details When a buffer is migrated, its virtual address remains the same but
+ * its physical contents are moved to the indicated memory pool.
+ *
+ * After migration, only the agent associated with the destination pool will have access.
+ *
+ * The caller is also responsible for ensuring that the allocation in the
+ * source memory pool where the buffer is currently located can be migrated to the
+ * specified destination memory pool (i.e. ::hsa_amd_memory_pool_can_migrate returns a value of true
+ * for the source and destination memory pools), otherwise behavior is undefined.
+ *
+ * The caller must ensure that the buffer is not accessed while it is migrated.
+ *
+ * @param[in] ptr Buffer to be relocated. The buffer must have been released to system
+ * prior to calling this API.  The buffer will be released to system upon completion.
+ *
+ * @param[in] memory_pool Memory pool where to place the buffer.
+ *
+ * @param[in] flags A bit-field that is used to specify migration
+ * information. Must be zero.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL The destination memory pool is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in
+ * allocating the necessary resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p flags is not 0.
+ */
+hsa_status_t HSA_API hsa_amd_memory_migrate(const void* ptr,
+                                            hsa_amd_memory_pool_t memory_pool,
+                                            uint32_t flags);
+
+/**
+ *
+ * @brief Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and
+ * return a new pointer accessible by the @p agents. If the @p host_ptr overlaps with previously
+ * locked memory, then the overlap area is kept locked (i.e. multiple mappings are permitted). In
+ * this case, the same input @p host_ptr may give different locked @p agent_ptr and when it does,
+ * they are not necessarily coherent (i.e. accessing either @p agent_ptr is not equivalent).
+ * Accesses to @p agent_ptr are coarse grained.
+ *
+ * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator.
+ *
+ * @param[in] size The size to be locked.
+ *
+ * @param[in] agents Array of agent handles to gain access to the @p host_ptr.
+ * If this parameter is NULL and the @p num_agent is 0, all agents
+ * in the platform will gain access to the @p host_ptr.
+ *
+ * @param[out] agent_ptr Pointer to the location where to store the new address.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in
+ * allocating the necessary resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT One or more agents in @p agents are
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0, or @p host_ptr or
+ * @p agent_ptr is NULL, or @p agents is not NULL but @p num_agent is 0, or
+ * @p agents is NULL but @p num_agent is not 0.
+ */
+hsa_status_t HSA_API hsa_amd_memory_lock(void* host_ptr, size_t size,
+                                         hsa_agent_t* agents, int num_agent,
+                                         void** agent_ptr);
+
+/**
+ *
+ * @brief Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and
+ * return a new pointer accessible by the @p agents. If the @p host_ptr overlaps with previously
+ * locked memory, then the overlap area is kept locked (i.e. multiple mappings are permitted).
+ * In this case, the same input @p host_ptr may give different locked @p agent_ptr and when it
+ * does, they are not necessarily coherent (i.e. accessing either @p agent_ptr is not equivalent).
+ * Accesses to the memory via @p agent_ptr have the same access properties as memory allocated from
+ * @p pool as determined by ::hsa_amd_memory_pool_get_info and ::hsa_amd_agent_memory_pool_get_info
+ * (ex. coarse/fine grain, platform atomic support, link info).  Physical composition and placement
+ * of the memory (ex. page size, NUMA binding) is not changed.
+ *
+ * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator.
+ *
+ * @param[in] size The size to be locked.
+ *
+ * @param[in] agents Array of agent handles to gain access to the @p host_ptr.
+ * If this parameter is NULL and the @p num_agent is 0, all agents
+ * in the platform will gain access to the @p host_ptr.
+ *
+ * @param[in] pool Global memory pool owned by a CPU agent.
+ *
+ * @param[in] flags A bit-field that is used to specify allocation
+ * directives. Reserved parameter, must be 0.
+ *
+ * @param[out] agent_ptr Pointer to the location where to store the new address.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in
+ * allocating the necessary resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT One or more agents in @p agents are
+ * invalid or cannot access @p pool.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL @p pool is invalid or not owned
+ * by a CPU agent.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0, or @p host_ptr or
+ * @p agent_ptr is NULL, or @p agents is not NULL but @p num_agent is 0, or
+ * @p agents is NULL but @p num_agent is not 0, or @p flags is not 0.
+ */
+hsa_status_t HSA_API hsa_amd_memory_lock_to_pool(void* host_ptr, size_t size, hsa_agent_t* agents,
+                                                 int num_agent, hsa_amd_memory_pool_t pool,
+                                                 uint32_t flags, void** agent_ptr);
+
+/**
+ *
+ * @brief Unpin the host pointer previously pinned via ::hsa_amd_memory_lock or
+ * ::hsa_amd_memory_lock_to_pool.
+ *
+ * @details The behavior is undefined if the host pointer being unpinned does not
+ * match previous pinned address or if the host pointer was already deallocated.
+ *
+ * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator that was
+ * pinned previously via ::hsa_amd_memory_lock or ::hsa_amd_memory_lock_to_pool.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ */
+hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr);
+
+/**
+ * @brief Sets the first @p count uint32_t elements of the block of memory
+ * pointed to by @p ptr to the specified @p value.
+ *
+ * @param[in] ptr Pointer to the block of memory to fill.
+ *
+ * @param[in] value Value to be set.
+ *
+ * @param[in] count Number of uint32_t elements to be set to the value.
+ *
+ * @retval HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL or
+ * not 4-byte aligned.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ALLOCATION if the given memory
+ * region was not allocated with HSA runtime APIs.
+ *
+ */
+hsa_status_t HSA_API
+    hsa_amd_memory_fill(void* ptr, uint32_t value, size_t count);
+
+/**
+ * @brief Maps an interop object into the HSA flat address space and establishes
+ * memory residency.  The metadata pointer is valid during the lifetime of the
+ * map (until hsa_amd_interop_unmap_buffer is called).
+ * Multiple calls to hsa_amd_interop_map_buffer with the same interop_handle
+ * result in multiple mappings with potentially different addresses and
+ * different metadata pointers.  Concurrent operations on these addresses are
+ * not coherent.  Memory must be fenced to system scope to ensure consistency,
+ * between mappings and with any views of this buffer in the originating
+ * software stack.
+ *
+ * @param[in] num_agents Number of agents which require access to the memory
+ *
+ * @param[in] agents List of accessing agents.
+ *
+ * @param[in] interop_handle Handle of interop buffer (dmabuf handle in Linux)
+ *
+ * @param[in] flags Reserved, must be 0
+ *
+ * @param[out] size Size in bytes of the mapped object
+ *
+ * @param[out] ptr Base address of the mapped object
+ *
+ * @param[out] metadata_size Size of metadata in bytes, may be NULL
+ *
+ * @param[out] metadata Pointer to metadata, may be NULL
+ *
+ * @retval HSA_STATUS_SUCCESS if successfully mapped
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating
+ * necessary resources
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT all other errors
+ */
+hsa_status_t HSA_API hsa_amd_interop_map_buffer(uint32_t num_agents,
+                                        hsa_agent_t* agents,
+                                        int interop_handle,
+                                        uint32_t flags,
+                                        size_t* size,
+                                        void** ptr,
+                                        size_t* metadata_size,
+                                        const void** metadata);
+
+/**
+ * @brief Removes a previously mapped interop object from HSA's flat address space.
+ * Ends lifetime for the mapping's associated metadata pointer.
+ */
+hsa_status_t HSA_API hsa_amd_interop_unmap_buffer(void* ptr);
+
+/**
+ * @brief Denotes the type of memory in a pointer info query.
+ */
+typedef enum {
+  /*
+  Memory is not known to the HSA driver.  Unallocated or unlocked system memory.
+  */
+  HSA_EXT_POINTER_TYPE_UNKNOWN = 0,
+  /*
+  Memory was allocated with an HSA memory allocator.
+  */
+  HSA_EXT_POINTER_TYPE_HSA = 1,
+  /*
+  System memory which has been locked for use with an HSA agent.
+
+  Memory of this type is normal malloc'd memory and is always accessible to
+  the CPU.  Pointer info queries may not include CPU agents in the accessible
+  agents list as the CPU has implicit access.
+  */
+  HSA_EXT_POINTER_TYPE_LOCKED = 2,
+  /*
+  Memory originated in a graphics component and is shared with ROCr.
+  */
+  HSA_EXT_POINTER_TYPE_GRAPHICS = 3,
+  /*
+  Memory has been shared with the local process via ROCr IPC APIs.
+  */
+  HSA_EXT_POINTER_TYPE_IPC = 4,
+  /*
+  No backend memory but virtual address
+  */
+  HSA_EXT_POINTER_TYPE_RESERVED_ADDR = 5,
+  /*
+  Memory was allocated with an HSA virtual memory allocator
+  */
+  HSA_EXT_POINTER_TYPE_HSA_VMEM = 6
+} hsa_amd_pointer_type_t;
+
+/**
+ * @brief Describes a memory allocation known to ROCr.
+ * Within a ROCr major version this structure can only grow.
+ */
+typedef struct hsa_amd_pointer_info_s {
+  /*
+  Size in bytes of this structure.  Used for version control within a major ROCr
+  revision.  Set to sizeof(hsa_amd_pointer_info_t) prior to calling
+  hsa_amd_pointer_info.  If the runtime supports an older version of pointer
+  info then size will be smaller on return.  Members starting after the return
+  value of size will not be updated by hsa_amd_pointer_info.
+  */
+  uint32_t size;
+  /*
+  The type of allocation referenced.
+  */
+  hsa_amd_pointer_type_t type;
+  /*
+  Base address at which non-host agents may access the allocation. This field is
+  not meaningful if the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN.
+  */
+  void* agentBaseAddress;
+  /*
+  Base address at which the host agent may access the allocation. This field is
+  not meaningful if the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN.
+  */
+  void* hostBaseAddress;
+  /*
+  Size of the allocation. This field is not meaningful if the type of the allocation
+  is HSA_EXT_POINTER_TYPE_UNKNOWN.
+  */
+  size_t sizeInBytes;
+  /*
+  Application provided value. This field is not meaningful if the type of the
+  allocation is HSA_EXT_POINTER_TYPE_UNKNOWN.
+  */
+  void* userData;
+  /*
+  Reports an agent which "owns" (i.e. has preferred access to) the pool in which the
+  allocation was made.  When multiple agents share equal access to a pool
+  (ex: multiple CPU agents, or multi-die GPU boards) any such agent may be returned.
+  This field is not meaningful if the type of the allocation is
+  HSA_EXT_POINTER_TYPE_UNKNOWN or if this agent is not available in this process,
+  e.g. if this agent is masked using ROCR_VISIBLE_DEVICES.
+  */
+  hsa_agent_t agentOwner;
+  /*
+  Contains a bitfield of hsa_amd_memory_pool_global_flag_t values.
+  Reports the effective global flags bitmask for the allocation.  This field is not
+  meaningful if the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN.
+  */
+  uint32_t global_flags;
+
+  /*
+  Set to true if this allocation was registered with the underlying driver.
+  This field is not meaningful if the type of the allocation is
+  HSA_EXT_POINTER_TYPE_UNKNOWN.
+  */
+  bool registered;
+} hsa_amd_pointer_info_t;
+
+/**
+ * @brief Retrieves information about the allocation referenced by the given
+ * pointer.  Optionally returns the number and list of agents which can
+ * directly access the allocation. In case this virtual address is unknown, the
+ * pointer type returned will be HSA_EXT_POINTER_TYPE_UNKNOWN and the only fields
+ * that are valid after hsa_amd_pointer_info returns are size and type.
+ *
+ * @param[in] ptr Pointer which references the allocation to retrieve info for.
+ *
+ * @param[in, out] info Pointer to structure to be filled with allocation info.
+ * Data member size must be set to the size of the structure prior to calling
+ * hsa_amd_pointer_info.  On return size will be set to the size of the
+ * pointer info structure supported by the runtime, if smaller.  Members
+ * beyond the returned value of size will not be updated by the API.
+ * Must not be NULL.
+ *
+ * @param[in] alloc Function pointer to an allocator used to allocate the
+ * @p accessible array.  If NULL @p accessible will not be returned.
+ *
+ * @param[out] num_agents_accessible Receives the count of agents in
+ * @p accessible.  If NULL @p accessible will not be returned.
+ *
+ * @param[out] accessible Receives a pointer to the array, allocated by @p alloc,
+ * holding the list of agents which may directly access the allocation.
+ * May be NULL.
+ *
+ * @retval HSA_STATUS_SUCCESS Info retrieved successfully
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating
+ * necessary resources
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT NULL in @p ptr or @p info.
+ */
+hsa_status_t HSA_API hsa_amd_pointer_info(const void* ptr,
+                                          hsa_amd_pointer_info_t* info,
+                                          void* (*alloc)(size_t),
+                                          uint32_t* num_agents_accessible,
+                                          hsa_agent_t** accessible);
+
+/**
+ * @brief Associates an arbitrary pointer with an allocation known to ROCr.
+ * The pointer can be fetched by hsa_amd_pointer_info in the userData field.
+ *
+ * @param[in] ptr Pointer to the first byte of an allocation known to ROCr
+ * with which to associate @p userdata.
+ *
+ * @param[in] userdata Arbitrary pointer to associate with the allocation.
+ *
+ * @retval HSA_STATUS_SUCCESS @p userdata successfully stored.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating
+ * necessary resources
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is not known to ROCr.
+ */
+hsa_status_t HSA_API hsa_amd_pointer_info_set_userdata(const void* ptr,
+                                                       void* userdata);
+
+/**
+ * @brief 256-bit process independent identifier for a ROCr shared memory
+ * allocation.
+ */
+typedef struct hsa_amd_ipc_memory_s {
+  uint32_t handle[8];
+} hsa_amd_ipc_memory_t;
+
+/**
+ * @brief Prepares an allocation for interprocess sharing and creates a
+ * handle of type hsa_amd_ipc_memory_t uniquely identifying the allocation.  A
+ * handle is valid while the allocation it references remains accessible in
+ * any process.  In general applications should confirm that a shared memory
+ * region has been attached (via hsa_amd_ipc_memory_attach) in the remote
+ * process prior to releasing that memory in the local process.
+ * Repeated calls for the same allocation may, but are not required to, return
+ * unique handles. The allocation needs to be on memory on an agent of type
+ * HSA_DEVICE_TYPE_GPU.
+ *
+ * @param[in] ptr Pointer to device memory allocated via ROCr APIs to prepare for
+ * sharing.
+ *
+ * @param[in] len Length in bytes of the allocation to share.
+ *
+ * @param[out] handle Process independent identifier referencing the shared
+ * allocation.
+ *
+ * @retval HSA_STATUS_SUCCESS allocation is prepared for interprocess sharing.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating
+ * necessary resources
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr does not point to the
+ * first byte of an allocation made through ROCr, or len is not the full length
+ * of the allocation or handle is NULL.
+ */
+hsa_status_t HSA_API hsa_amd_ipc_memory_create(void* ptr, size_t len,
+                                               hsa_amd_ipc_memory_t* handle);
+
+/**
+ * @brief Imports shared memory into the local process and makes it accessible
+ * by the given agents.  If a shared memory handle is attached multiple times
+ * in a process each attach may return a different address.  Each returned
+ * address is refcounted and requires a matching number of calls to
+ * hsa_amd_ipc_memory_detach to release the shared memory mapping.
+ *
+ * @param[in] handle Pointer to the identifier for the shared memory.
+ *
+ * @param[in] len Length of the shared memory to import.
+ * Reserved.  Must be the full length of the shared allocation in this version.
+ *
+ * @param[in] num_agents Count of agents in @p mapping_agents.
+ * May be zero if all agents are to be allowed access.
+ *
+ * @param[in] mapping_agents List of agents to access the shared memory.
+ * Ignored if @p num_agents is zero.
+ *
+ * @param[out] mapped_ptr Receives a process local pointer to the shared memory.
+ *
+ * @retval HSA_STATUS_SUCCESS if memory is successfully imported.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating
+ * necessary resources
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p handle is not valid, @p len is
+ * incorrect, @p mapped_ptr is NULL, or some agent for which access was
+ * requested can not access the shared memory.
+ */
+hsa_status_t HSA_API hsa_amd_ipc_memory_attach(
+    const hsa_amd_ipc_memory_t* handle, size_t len,
+    uint32_t num_agents,
+    const hsa_agent_t* mapping_agents,
+    void** mapped_ptr);
+
+/**
+ * @brief Decrements the reference count for the shared memory mapping and
+ * releases access to shared memory imported with hsa_amd_ipc_memory_attach.
+ *
+ * @param[in] mapped_ptr Pointer to the first byte of a shared allocation
+ * imported with hsa_amd_ipc_memory_attach.
+ *
+ * @retval HSA_STATUS_SUCCESS if @p mapped_ptr was imported with
+ * hsa_amd_ipc_memory_attach.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p mapped_ptr was not imported
+ * with hsa_amd_ipc_memory_attach.
+ */
+hsa_status_t HSA_API hsa_amd_ipc_memory_detach(void* mapped_ptr);
+
+/** @} */
+
+/** \addtogroup status Runtime notifications
+ *  @{
+ */
+
+/**
+ * @brief 256-bit process independent identifier for a ROCr IPC signal.
+ */
+typedef hsa_amd_ipc_memory_t hsa_amd_ipc_signal_t;
+
+/**
+ * @brief Obtains an interprocess sharing handle for a signal.  The handle is
+ * valid while the signal it references remains valid in any process.  In
+ * general applications should confirm that the signal has been attached (via
+ * hsa_amd_ipc_signal_attach) in the remote process prior to destroying that
+ * signal in the local process.
+ * Repeated calls for the same signal may, but are not required to, return
+ * unique handles.
+ *
+ * @param[in] signal Signal created with attribute HSA_AMD_SIGNAL_IPC.
+ *
+ * @param[out] handle Process independent identifier referencing the shared
+ * signal.
+ *
+ * @retval HSA_STATUS_SUCCESS @p handle is ready to use for interprocess sharing.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating
+ * necessary resources
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p signal is not a valid signal
+ * created with attribute HSA_AMD_SIGNAL_IPC or handle is NULL.
+ */
+hsa_status_t HSA_API hsa_amd_ipc_signal_create(hsa_signal_t signal, hsa_amd_ipc_signal_t* handle);
+
+/**
+ * @brief Imports an IPC capable signal into the local process.  If an IPC
+ * signal handle is attached multiple times in a process each attach may return
+ * a different signal handle.  Each returned signal handle is refcounted and
+ * requires a matching number of calls to hsa_signal_destroy to release the
+ * shared signal.
+ *
+ * @param[in] handle Pointer to the identifier for the shared signal.
+ *
+ * @param[out] signal Receives a process local signal handle to the shared signal.
+ *
+ * @retval HSA_STATUS_SUCCESS if the signal is successfully imported.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating
+ * necessary resources
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p handle is not valid.
+ */
+hsa_status_t HSA_API hsa_amd_ipc_signal_attach(const hsa_amd_ipc_signal_t* handle,
+                                               hsa_signal_t* signal);
+
+/**
+ * @brief GPU system event type.
+ */
+typedef enum hsa_amd_event_type_s {
+  /*
+   AMD GPU memory fault.
+   */
+  HSA_AMD_GPU_MEMORY_FAULT_EVENT = 0,
+  /*
+   AMD GPU HW Exception.
+   */
+  HSA_AMD_GPU_HW_EXCEPTION_EVENT,
+  /*
+   AMD GPU memory error.
+   */
+  HSA_AMD_GPU_MEMORY_ERROR_EVENT,
+} hsa_amd_event_type_t;
+
+/**
+ * @brief Flags denoting the cause of a memory fault.
+ */
+typedef enum {
+  // Page not present or supervisor privilege.
+  HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT = 1 << 0,
+  // Write access to a read-only page.
+  HSA_AMD_MEMORY_FAULT_READ_ONLY = 1 << 1,
+  // Execute access to a page marked NX.
+  HSA_AMD_MEMORY_FAULT_NX = 1 << 2,
+  // GPU attempted access to a host only page.
+  HSA_AMD_MEMORY_FAULT_HOST_ONLY = 1 << 3,
+  // DRAM ECC failure.
+  HSA_AMD_MEMORY_FAULT_DRAMECC = 1 << 4,
+  // Can't determine the exact fault address.
+  HSA_AMD_MEMORY_FAULT_IMPRECISE = 1 << 5,
+  // SRAM ECC failure (i.e. registers, no fault address).
+  HSA_AMD_MEMORY_FAULT_SRAMECC = 1 << 6,
+  // GPU reset following unspecified hang.
+  HSA_AMD_MEMORY_FAULT_HANG = 1U << 31
+} hsa_amd_memory_fault_reason_t;
+
+/**
+ * @brief AMD GPU memory fault event data.
+ */
+typedef struct hsa_amd_gpu_memory_fault_info_s {
+  /*
+  The agent where the memory fault occurred.
+  */
+  hsa_agent_t agent;
+  /*
+  Virtual address accessed.
+  */
+  uint64_t virtual_address;
+  /*
+  Bit field encoding the memory access failure reasons. There could be multiple bits set
+  for one fault.  Bits are defined in hsa_amd_memory_fault_reason_t.
+  */
+  uint32_t fault_reason_mask;
+} hsa_amd_gpu_memory_fault_info_t;
+
+/**
+ * @brief Flags denoting the cause of a memory error.
+ */
+typedef enum {
+  // Memory was in use by low-level HW component and cannot be released
+  HSA_AMD_MEMORY_ERROR_MEMORY_IN_USE = (1 << 0),
+} hsa_amd_memory_error_reason_t;
+
+/**
+ * @brief AMD GPU memory error event data.
+ */
+typedef struct hsa_amd_gpu_memory_error_info_s {
+  /*
+  The agent where the memory error occurred.
+  */
+  hsa_agent_t agent;
+  /*
+  Virtual address involved.
+  */
+  uint64_t virtual_address;
+  /*
+  Bit field encoding the memory error failure reasons. There could be multiple bits set
+  for one error.  Bits are defined in hsa_amd_memory_error_reason_t.
+  */
+  uint32_t error_reason_mask;
+} hsa_amd_gpu_memory_error_info_t;
+
+/**
+ * @brief Flags denoting the type of a HW exception
+ */
+typedef enum {
+  // Unused for now
+  HSA_AMD_HW_EXCEPTION_RESET_TYPE_OTHER = 1 << 0,
+} hsa_amd_hw_exception_reset_type_t;
+
+/**
+ * @brief Flags denoting the cause of a HW exception
+ */
+typedef enum {
+  // GPU Hang
+  HSA_AMD_HW_EXCEPTION_CAUSE_GPU_HANG = 1 << 0,
+  // SRAM ECC
+  HSA_AMD_HW_EXCEPTION_CAUSE_ECC = 1 << 1,
+} hsa_amd_hw_exception_reset_cause_t;
+
+/**
+ * @brief AMD GPU HW Exception event data.
+ */
+typedef struct hsa_amd_gpu_hw_exception_info_s {
+  /*
+  The agent where the HW exception occurred.
+  */
+  hsa_agent_t agent;
+  hsa_amd_hw_exception_reset_type_t reset_type;
+  hsa_amd_hw_exception_reset_cause_t reset_cause;
+} hsa_amd_gpu_hw_exception_info_t;
+
+/**
+ * @brief AMD GPU event data passed to event handler.
+ */
+typedef struct hsa_amd_event_s {
+  /*
+  The event type.
+  */
+  hsa_amd_event_type_t event_type;
+  union {
+    /*
+    The memory fault info, only valid when @p event_type is HSA_AMD_GPU_MEMORY_FAULT_EVENT.
+    */
+    hsa_amd_gpu_memory_fault_info_t memory_fault;
+    /*
+    The HW exception info, only valid when @p event_type is HSA_AMD_GPU_HW_EXCEPTION_EVENT.
+    */
+    hsa_amd_gpu_hw_exception_info_t hw_exception;
+    /*
+    The memory error info, only valid when @p event_type is HSA_AMD_GPU_MEMORY_ERROR_EVENT.
+    */
+    hsa_amd_gpu_memory_error_info_t memory_error;
+  };
+} hsa_amd_event_t;
+
+typedef hsa_status_t (*hsa_amd_system_event_callback_t)(const hsa_amd_event_t* event, void* data);
+
+/**
+ * @brief Register AMD GPU event handler.
+ *
+ * @param[in] callback Callback to be invoked when an event is triggered.
+ * The HSA runtime passes two arguments to the callback: @p event
+ * is defined per event by the HSA runtime, and @p data is the user data.
+ *
+ * @param[in] data User data that is passed to @p callback. May be NULL.
+ *
+ * @retval HSA_STATUS_SUCCESS The handler has been registered successfully.
+ *
+ * @retval HSA_STATUS_ERROR An event handler has already been registered.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p event is invalid.
+ */
+hsa_status_t HSA_API hsa_amd_register_system_event_handler(hsa_amd_system_event_callback_t callback,
+                                                   void* data);
+
+/** @} */
+
+/** \addtogroup queue Queues
+ *  @{
+ */
+
+/**
+ * @brief Per-queue dispatch and wavefront scheduling priority.
+ */
+typedef enum hsa_amd_queue_priority_s {
+  /*
+  Below normal/high priority compute and all graphics
+  */
+  HSA_AMD_QUEUE_PRIORITY_LOW = 0,
+  /*
+  Above low priority compute, below high priority compute and all graphics
+  */
+  HSA_AMD_QUEUE_PRIORITY_NORMAL = 1,
+  /*
+  Above low/normal priority compute and all graphics
+  */
+  HSA_AMD_QUEUE_PRIORITY_HIGH = 2,
+} hsa_amd_queue_priority_t;
+
+/**
+ * @brief Modifies the dispatch and wavefront scheduling priority for a
+ * given compute queue. The default is HSA_AMD_QUEUE_PRIORITY_NORMAL.
+ *
+ * @param[in] queue Compute queue to apply new priority to.
+ *
+ * @param[in] priority Priority to associate with queue.
+ *
+ * @retval HSA_STATUS_SUCCESS if priority was changed successfully.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_QUEUE if queue is not a valid
+ * compute queue handle.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT if priority is not a valid
+ * value from hsa_amd_queue_priority_t.
+ */
+hsa_status_t HSA_API hsa_amd_queue_set_priority(hsa_queue_t* queue,
+                                                hsa_amd_queue_priority_t priority);
+
+/**
+ * @brief Queue creation attributes.
+ */
+typedef enum {
+  /**
+   * The queue's packet buffer and queue descriptor struct should be
+   * allocated in system memory (default). Mutually exclusive with
+   * HSA_AMD_QUEUE_CREATE_DEVICE_MEM_RING_BUF and
+   * HSA_AMD_QUEUE_CREATE_DEVICE_MEM_QUEUE_DESCRIPTOR.
+   */
+  HSA_AMD_QUEUE_CREATE_SYSTEM_MEM = 0,
+  /**
+   * The queue's packet buffer should be allocated in the agent's
+   * fine-grain device memory region.
+   */
+  HSA_AMD_QUEUE_CREATE_DEVICE_MEM_RING_BUF = (1 << 0),
+  /**
+   * The queue descriptor struct should be allocated in the agent's
+   * fine-grain device memory region. Not supported for devices
+   * connected via PCIe because the CPU's atomic read-modify-write
+   * operations cannot be promoted to PCIe atomic read-modify-write
+   * operations.
+   */
+  HSA_AMD_QUEUE_CREATE_DEVICE_MEM_QUEUE_DESCRIPTOR = (1 << 1),
+} hsa_amd_queue_create_flag_t;
+
+/** @} */
+
+/** \addtogroup memory Memory
+ *  @{
+ */
+
+/**
+ * @brief Deallocation notifier function type.
+ */
+typedef void (*hsa_amd_deallocation_callback_t)(void* ptr, void* user_data);
+
+/**
+ * @brief Registers a deallocation notifier monitoring for release of agent
+ * accessible address @p ptr.  If successful, @p callback will be invoked when
+ * @p ptr is removed from accessibility from all agents.
+ *
+ * Notification callbacks are automatically deregistered when they are invoked.
+ *
+ * Note: The current version supports notifications of address release
+ * originating from ::hsa_amd_memory_pool_free.  Support for other address
+ * release APIs will follow.
+ *
+ * @param[in] ptr Agent accessible address to monitor for deallocation.  Passed
+ * to @p callback.
+ *
+ * @param[in] callback Notifier to be invoked when @p ptr is released from
+ * agent accessibility.
+ *
+ * @param[in] user_data User provided value passed to @p callback.  May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The notifier registered successfully
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION @p ptr does not refer to a valid agent accessible
+ * address.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL or @p ptr is NULL.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating
+ * necessary resources
+ */
+hsa_status_t HSA_API hsa_amd_register_deallocation_callback(void* ptr,
+                                                    hsa_amd_deallocation_callback_t callback,
+                                                    void* user_data);
+
+/**
+ * @brief Removes a deallocation notifier previously registered with
+ * ::hsa_amd_register_deallocation_callback.  Arguments must be identical to
+ * those given in ::hsa_amd_register_deallocation_callback.
+ *
+ * @param[in] ptr Agent accessible address which was monitored for deallocation.
+ *
+ * @param[in] callback Notifier to be removed.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The notifier has been removed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The given notifier was not registered.
+ */
+hsa_status_t HSA_API hsa_amd_deregister_deallocation_callback(void* ptr,
+                                                      hsa_amd_deallocation_callback_t callback);
+
+typedef enum hsa_amd_svm_model_s {
+  /**
+   * Updates to memory with this attribute conform to HSA memory consistency
+   * model.
+   */
+  HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED = 0,
+  /**
+   * Writes to memory with this attribute can be performed by a single agent
+   * at a time.
+   */
+  HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED = 1,
+  /**
+   * Memory region queried contains subregions with both
+   * HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED and
+   * HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED attributes.
+   *
+   * This attribute can not be used in hsa_amd_svm_attributes_set.  It is a
+   * possible return from hsa_amd_svm_attributes_get indicating that the query
+   * region contains both coarse and fine grained memory.
+   */
+  HSA_AMD_SVM_GLOBAL_FLAG_INDETERMINATE = 2
+} hsa_amd_svm_model_t;
+
+typedef enum hsa_amd_svm_attribute_s {
+  // Memory model attribute.
+  // Type of this attribute is hsa_amd_svm_model_t.
+  HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG = 0,
+  // Marks the range read only.  This allows multiple physical copies to be
+  // placed local to each accessing device.
+  // Type of this attribute is bool.
+  HSA_AMD_SVM_ATTRIB_READ_ONLY = 1,
+  // Automatic migrations should attempt to keep the memory within the xgmi hive
+  // containing accessible agents.
+  // Type of this attribute is bool.
+  HSA_AMD_SVM_ATTRIB_HIVE_LOCAL = 2,
+  // Page granularity to migrate at once.  Page granularity is specified as
+  // log2(page_count).
+  // Type of this attribute is uint64_t.
+  HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY = 3,
+  // Physical location to prefer when automatic migration occurs.
+  // Set to the null agent handle (handle == 0) to indicate there
+  // is no preferred location.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION = 4,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_set (see
+  // ::hsa_amd_svm_prefetch_async).
+  // Queries the physical location of most recent prefetch command.
+  // If the prefetch location has not been set or is not uniform across the
+  // address range then returned hsa_agent_t::handle will be 0.
+  // Querying this attribute will return the destination agent of the most
+  // recent ::hsa_amd_svm_prefetch_async targeting the address range.  If
+  // multiple async prefetches have been issued targeting the region and the
+  // most recently issued prefetch has completed then the query will return
+  // the location of the most recently completed prefetch.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION = 5,
+  // Optimizes with the anticipation that the majority of operations to the
+  // range will be read operations.
+  // Type of this attribute is bool.
+  HSA_AMD_SVM_ATTRIB_READ_MOSTLY = 6,
+  // Allows the execution on GPU.
+  // Type of this attribute is bool.
+  HSA_AMD_SVM_ATTRIB_GPU_EXEC = 7,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_get.
+  // Enables an agent for access to the range.  Access may incur a page fault
+  // and associated memory migration.  Either this or
+  // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE is required prior to SVM
+  // access if HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE = 0x200,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_get.
+  // Enables an agent for access to the range without page faults.  Access
+  // will not incur a page fault or the associated memory migration, and will
+  // not cause access based migration.  Either this or
+  // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE is required prior to SVM access if
+  // HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE = 0x201,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_get.
+  // Denies an agent access to the memory range.  Access will cause a terminal
+  // segfault.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS = 0x202,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_set.
+  // Returns the access attribute associated with the agent.
+  // The agent to query must be set in the attribute value field.
+  // The attribute enum will be replaced with the agent's current access
+  // attribute for the address range.
+  // TODO: Clarify KFD return value for non-uniform access attribute.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_ACCESS_QUERY = 0x203,
+} hsa_amd_svm_attribute_t;
+
+// List type for hsa_amd_svm_attributes_set/get.
+typedef struct hsa_amd_svm_attribute_pair_s {
+  // hsa_amd_svm_attribute_t value.
+  uint64_t attribute;
+  // Attribute value.  Bit values should be interpreted according to the type
+  // given in the associated attribute description.
+  uint64_t value;
+} hsa_amd_svm_attribute_pair_t;
+
+/**
+ * @brief Sets SVM memory attributes.
+ *
+ * If HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT returns false then enabling
+ * access to an Agent via this API (setting HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE
+ * or HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE) is required prior to SVM
+ * memory access by that Agent.
+ *
+ * Attributes HSA_AMD_SVM_ATTRIB_ACCESS_QUERY and HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION
+ * may not be used with this API.
+ *
+ * @param[in] ptr Will be aligned down to nearest page boundary.
+ *
+ * @param[in] size Will be aligned up to nearest page boundary.
+ *
+ * @param[in] attribute_list List of attributes to set for the address range.
+ *
+ * @param[in] attribute_count Length of @p attribute_list.
+ */
+hsa_status_t hsa_amd_svm_attributes_set(void* ptr, size_t size,
+                                        hsa_amd_svm_attribute_pair_t* attribute_list,
+                                        size_t attribute_count);
+
+/**
+ * @brief Gets SVM memory attributes.
+ *
+ * Attributes HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE,
+ * HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE and
+ * HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS may not be used with this API.
+ *
+ * Note that attribute HSA_AMD_SVM_ATTRIB_ACCESS_QUERY takes as input an
+ * hsa_agent_t and returns the current access type through its attribute field.
+ *
+ * @param[in] ptr Will be aligned down to nearest page boundary.
+ *
+ * @param[in] size Will be aligned up to nearest page boundary.
+ *
+ * @param[in,out] attribute_list List of attributes to query for the address range.
+ *
+ * @param[in] attribute_count Length of @p attribute_list.
+ */
+hsa_status_t hsa_amd_svm_attributes_get(void* ptr, size_t size,
+                                        hsa_amd_svm_attribute_pair_t* attribute_list,
+                                        size_t attribute_count);
+
+/**
+ * @brief Asynchronously migrates memory to an agent.
+ *
+ * Schedules memory migration to @p agent when @p dep_signals have been observed equal to zero.
+ * @p completion_signal will decrement when the migration is complete.
+ *
+ * @param[in] ptr Will be aligned down to nearest page boundary.
+ *
+ * @param[in] size Will be aligned up to nearest page boundary.
+ *
+ * @param[in] agent Agent to migrate to.
+ *
+ * @param[in] num_dep_signals Number of dependent signals. Can be 0.
+ *
+ * @param[in] dep_signals List of signals that must be waited on before the migration
+ * operation starts. The migration will start after every signal has been observed with
+ * the value 0. If @p num_dep_signals is 0, this argument is ignored.
+ *
+ * @param[in] completion_signal Signal used to indicate completion of the migration
+ * operation. When the migration operation is finished, the value of the signal is
+ * decremented. The runtime indicates that an error has occurred during the migration
+ * operation by setting the value of the completion signal to a negative
+ * number. If no completion signal is required this handle may be null.
+ */
+hsa_status_t hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent,
+                                        uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
+                                        hsa_signal_t completion_signal);
+
+/** @} */
+
+/** \addtogroup profile Profiling
+ *  @{
+ */
+
+/**
+ * @brief Acquire Stream Performance Monitor on an agent
+ *
+ * Acquire exclusive use of SPM on @p preferred_agent.
+ * See hsa_amd_spm_set_dest_buffer to provide a destination buffer to KFD to start recording and
+ * retrieve this data.
+ * @param[in] preferred_agent Agent on which to acquire SPM
+ */
+hsa_status_t hsa_amd_spm_acquire(hsa_agent_t preferred_agent);
+
+/**
+ * @brief Release Stream Performance Monitor on an agent
+ *
+ * Release exclusive use of SPM on @p preferred_agent. This will stop KFD writing SPM data.
+ * If a destination buffer is set, then data in the destination buffer is available to user
+ * when this function returns.
+ *
+ * @param[in] preferred_agent Agent on which to release SPM
+ */
+hsa_status_t hsa_amd_spm_release(hsa_agent_t preferred_agent);
+
+/**
+ * @brief Set up the current destination user mode buffer for stream performance
+ * counter data. KFD will start writing SPM data into the destination buffer. KFD will continue
+ * to copy data into the current destination buffer until any of the following functions are called
+ * - hsa_amd_spm_release
+ * - hsa_amd_spm_set_dest_buffer with dest set to NULL
+ * - hsa_amd_spm_set_dest_buffer with dest set to a new buffer
+ *
+ * If @p timeout is non-0, the call will wait for up to @p timeout ms for the previous
+ * buffer to be filled. If the previous buffer is filled before the timeout, @p timeout
+ * will be updated with the time remaining. If the timeout is exceeded, the function
+ * copies any partial data available into the previous user buffer and returns success.
+ * User should not access destination data while KFD is copying data.
+ * If the previous destination buffer was full, then @p is_data_loss flag is set.
+ * @p dest is CPU accessible memory. It could be malloc'ed memory or host allocated memory.
+ *
+ * @param[in] preferred_agent Agent on which to set the dest buffer
+ *
+ * @param[in] size_in_bytes size of the buffer
+ *
+ * @param[in,out] timeout timeout in milliseconds
+ *
+ * @param[out] size_copied number of bytes copied
+ *
+ * @param[in] dest destination address. Set to NULL to stop copy on previous buffer
+ *
+ * @param[out] is_data_loss true if data was lost
+ */
+hsa_status_t hsa_amd_spm_set_dest_buffer(hsa_agent_t preferred_agent, size_t size_in_bytes,
+                                         uint32_t* timeout, uint32_t* size_copied, void* dest,
+                                         bool* is_data_loss);
+
+/** @} */
+
+/** \addtogroup memory Memory
+ *  @{
+ */
+
+/**
+ * @brief Older version of export dmabuf
+ *
+ * This is the same as calling the v2 version of export dmabuf with the
+ * flags argument set to HSA_AMD_DMABUF_MAPPING_TYPE_NONE.
+ *
+ * @param[in] ptr Pointer to the allocation being exported.
+ *
+ * @param[in] size Size in bytes to export following @p ptr.  The entire range
+ * being exported must be contained within a single allocation.
+ *
+ * @param[out] dmabuf Pointer to a dma-buf file descriptor holding a reference to the
+ * allocation.  Contents will not be altered in the event of failure.
+ *
+ * @param[out] offset Offset in bytes into the memory referenced by the dma-buf
+ * object at which @p ptr resides.  Contents will not be altered in the event
+ * of failure.
+ *
+ * @retval ::HSA_STATUS_SUCCESS Export completed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT One or more arguments is NULL.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The address range described by
+ * @p ptr and @p size are not contained within a single allocation.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The allocation described by @p ptr
+ * and @p size was allocated on a device which can not export memory.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The return file descriptor,
+ * @p dmabuf, could not be created.
+ */
+hsa_status_t hsa_amd_portable_export_dmabuf(const void* ptr, size_t size, int* dmabuf,
+                                            uint64_t* offset);
+
+/**
+ * @brief Obtains an OS specific, vendor neutral, handle to a memory allocation.
+ *
+ * Obtains an OS specific handle to GPU agent memory.  The memory must be part
+ * of a single allocation from an hsa_amd_memory_pool_t exposed by a GPU Agent.
+ * The handle may be used with other APIs (e.g. Vulkan) to obtain shared access
+ * to the allocation.
+ *
+ * Shared access to the memory is not guaranteed to be fine grain coherent even
+ * if the allocation exported is from a fine grain pool.  The shared memory
+ * consistency model will be no stronger than the model exported from, consult
+ * the importing API to determine the final consistency model.
+ *
+ * The allocation's memory remains valid as long as the handle and any mapping
+ * of the handle remains valid.  When the handle and all mappings are closed
+ * the backing memory will be released for reuse.
+ *
+ * @param[in] ptr Pointer to the allocation being exported.
+ *
+ * @param[in] size Size in bytes to export following @p ptr.  The entire range
+ * being exported must be contained within a single allocation.
+ *
+ * @param[out] dmabuf Pointer to a dma-buf file descriptor holding a reference to the
+ * allocation.  Contents will not be altered in the event of failure.
+ *
+ * @param[out] offset Offset in bytes into the memory referenced by the dma-buf
+ * object at which @p ptr resides.  Contents will not be altered in the event
+ * of failure.
+ *
+ * @param[in] flags Bitmask of hsa_amd_dma_buf_mapping_type_t flags.
+ *
+ * @retval ::HSA_STATUS_SUCCESS Export completed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT One or more arguments is NULL.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The address range described by
+ * @p ptr and @p size are not contained within a single allocation.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The allocation described by @p ptr
+ * and @p size was allocated on a device which can not export memory.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The return file descriptor,
+ * @p dmabuf, could not be created.
+ */
+hsa_status_t hsa_amd_portable_export_dmabuf_v2(const void* ptr, size_t size,
+                               int* dmabuf, uint64_t* offset, uint64_t flags);
+
+/**
+ * @brief Closes an OS specific, vendor neutral, handle to a memory allocation.
+ *
+ * Closes an OS specific handle to GPU agent memory.
+ *
+ * Applications should close a handle after imports are complete.  The handle
+ * is not required to remain open for the lifetime of imported mappings.  The
+ * referenced allocation will remain valid until all handles and mappings
+ * are closed.
+ *
+ * @param[in] dmabuf Handle to be closed.
+ *
+ * @retval ::HSA_STATUS_SUCCESS Handle closed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_RESOURCE_FREE A generic error was encountered
+ * when closing the handle.  The handle may have been closed already or an
+ * async IO error may have occurred.
+ */
+hsa_status_t hsa_amd_portable_close_dmabuf(int dmabuf);
+
+typedef enum hsa_amd_vmem_address_reserve_flag_s {
+  // Only reserve a VA range without registering it to the underlying driver
+  HSA_AMD_VMEM_ADDRESS_NO_REGISTER = (1UL << 0),
+} hsa_amd_vmem_address_reserve_flag_t;
+
+/**
+ * @brief Allocate a reserved address range
+ *
+ * Reserve a virtual address range. The size must be a multiple of the system page size.
+ * If it is not possible to allocate the address specified by @p address, then @p va will be
+ * a different address range.
+ * Address range should be released by calling hsa_amd_vmem_address_free.
+ *
+ * @param[out] va virtual address allocated
+ * @param[in] size of address range requested
+ * @param[in] address requested
+ * @param[in] flags optional hsa_amd_vmem_address_reserve_flag_t
+ *
+ * @retval ::HSA_STATUS_SUCCESS Address range allocated successfully
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate an address
+ * range of this size.
+ *
+ * Note that this API will be deprecated in a future release and replaced by
+ * hsa_amd_vmem_address_reserve_align
+ */
+hsa_status_t hsa_amd_vmem_address_reserve(void** va, size_t size, uint64_t address,
+                                          uint64_t flags);
+
+/**
+ * @brief Allocate a reserved address range
+ *
+ * Reserve a virtual address range. The size must be a multiple of the system page size.
+ * If it is not possible to allocate the address specified by @p address, then @p va will be
+ * a different address range.
+ * Address range should be released by calling hsa_amd_vmem_address_free.
+ *
+ * @param[out] va virtual address allocated
+ * @param[in] size of address range requested
+ * @param[in] address requested
+ * @param[in] alignment requested. 0 for default. Must be >= page-size and a power of 2
+ * @param[in] flags optional hsa_amd_vmem_address_reserve_flag_t
+ *
+ * @retval ::HSA_STATUS_SUCCESS Address range allocated successfully
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate an address
+ * range of this size.
+ */
+hsa_status_t hsa_amd_vmem_address_reserve_align(void** va, size_t size, uint64_t address,
+                                          uint64_t alignment, uint64_t flags);
+
+/**
+ * @brief Free a reserved address range
+ *
+ * Free a previously allocated address range. The size must match the size of a previously
+ * allocated address range.
+ *
+ * @param[in] va virtual address to be freed
+ * @param[in] size of address range
+ *
+ * @retval ::HSA_STATUS_SUCCESS Address range released successfully
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid va specified
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid size specified
+ * @retval ::HSA_STATUS_ERROR_RESOURCE_FREE Address range is still in use
+ * @retval ::HSA_STATUS_ERROR Internal unexpected error
+ */
+hsa_status_t hsa_amd_vmem_address_free(void* va, size_t size);
+
+/**
+ * @brief Struct containing an opaque handle to a memory allocation handle
+ */
+typedef struct hsa_amd_vmem_alloc_handle_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_amd_vmem_alloc_handle_t;
+
+typedef enum {
+  MEMORY_TYPE_NONE,
+  MEMORY_TYPE_PINNED,
+} hsa_amd_memory_type_t;
+
+/**
+ * @brief Create a virtual memory handle
+ *
+ * Create a virtual memory handle within this pool.
+ * @p size must be aligned to the allocation granule size for this memory pool, see
+ * HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE
+ * To minimize internal memory fragmentation, align the size to the recommended allocation granule
+ * size, see HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_REC_GRANULE
+ *
+ * @param[in] pool memory to use
+ * @param[in] size of the memory allocation
+ * @param[in] type of memory
+ * @param[in] flags - currently unsupported
+ * @param[out] memory_handle - handle for the allocation
+ *
+ * @retval ::HSA_STATUS_SUCCESS memory allocated successfully
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid arguments
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION This memory pool does not support allocations
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate this memory
+ */
+hsa_status_t hsa_amd_vmem_handle_create(hsa_amd_memory_pool_t pool, size_t size,
+                                        hsa_amd_memory_type_t type, uint64_t flags,
+                                        hsa_amd_vmem_alloc_handle_t* memory_handle);
+
+/**
+ * @brief Release a virtual memory handle
+ *
+ * @param[in] memory_handle handle that was previously allocated
+ *
+ * @retval ::HSA_STATUS_SUCCESS Memory handle released successfully
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory handle
+ */
+hsa_status_t hsa_amd_vmem_handle_release(hsa_amd_vmem_alloc_handle_t memory_handle);
+
+/**
+ * @brief Map a virtual memory handle
+ *
+ * Map a virtual memory handle to a reserved address range. The virtual address requested must be
+ * within a previously reserved address range. @p va and (@p va + size) must be within
+ * (va + size) of the previously allocated address range.
+ * @p size must be equal to size of the @p memory_handle
+ * hsa_amd_vmem_set_access needs to be called to make the memory accessible to specific agents
+ *
+ * @param[in] va virtual address range where memory will be mapped
+ * @param[in] size of memory mapping
+ * @param[in] in_offset offset into memory. Currently unsupported
+ * @param[in] memory_handle virtual memory handle to be mapped
+ * @param[in] flags Currently unsupported
+ *
+ * @retval ::HSA_STATUS_SUCCESS Memory mapped successfully
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT va, size or memory_handle are invalid
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources
+ *
+ * @retval ::HSA_STATUS_ERROR Unexpected internal error
+ */
+hsa_status_t hsa_amd_vmem_map(void* va, size_t size, size_t in_offset,
+                              hsa_amd_vmem_alloc_handle_t memory_handle, uint64_t flags);
+
+/**
+ * @brief Unmap a virtual memory handle
+ *
+ * Unmap previously mapped virtual address range
+ *
+ * @param[in] va previously mapped virtual address range to be unmapped
+ * @param[in] size of memory mapping
+ *
+ * @retval ::HSA_STATUS_SUCCESS Memory backing unmapped successfully
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION memory_handle is invalid
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT size is invalid
+ *
+ * @retval ::HSA_STATUS_ERROR Unexpected internal error
+ */
+hsa_status_t hsa_amd_vmem_unmap(void* va, size_t size);
+
+typedef struct hsa_amd_memory_access_desc_s {
+  hsa_access_permission_t permissions;
+  hsa_agent_t agent_handle;
+} hsa_amd_memory_access_desc_t;
+
+/**
+ * @brief Make a memory mapping accessible
+ *
+ * Make previously mapped virtual address accessible to specific agents. @p size must be equal to
+ * size of previously mapped virtual memory handle.
+ * Calling hsa_amd_vmem_set_access multiple times on the same @p va:
+ *  - Will overwrite permissions for agents specified in @p desc
+ *  - Will leave permissions unchanged for agents not specified in @p desc
+ *
+ * @param[in] va previously mapped virtual address
+ * @param[in] size of memory mapping
+ * @param[in] desc list of access permissions for each agent
+ * @param[in] desc_cnt number of elements in desc
+ *
+ * @retval ::HSA_STATUS_SUCCESS
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT va, size or memory_handle are invalid
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION memory_handle is invalid
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT Invalid agent in desc
+ *
+ * @retval ::HSA_STATUS_ERROR Unexpected internal error
+ */
+hsa_status_t hsa_amd_vmem_set_access(void* va, size_t size,
+                                     const hsa_amd_memory_access_desc_t* desc,
+                                     size_t desc_cnt);
+
+/**
+ * @brief Get current access permissions for memory mapping
+ *
+ * Get access permissions for memory mapping for specific agent.
+ *
+ * @param[in] va previously mapped virtual address
+ * @param[out] perms current permissions
+ * @param[in] agent_handle agent
+ *
+ * @retval ::HSA_STATUS_SUCCESS
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT Invalid agent
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION va is not mapped or permissions never set for this
+ * agent
+ *
+ * @retval ::HSA_STATUS_ERROR Unexpected internal error
+ */
+hsa_status_t hsa_amd_vmem_get_access(void* va, hsa_access_permission_t* perms,
+                                     hsa_agent_t agent_handle);
+
+/**
+ * @brief Get an exportable shareable handle
+ *
+ * Get an exportable shareable handle for a memory_handle. This shareable handle can then be used to
+ * re-create a virtual memory handle using hsa_amd_vmem_import_shareable_handle. The shareable
+ * handle can be transferred using mechanisms that support posix file descriptors. Once all shareable
+ * handles are closed, the memory_handle is released.
+ *
+ * @param[out] dmabuf_fd shareable handle
+ * @param[in] handle previously allocated virtual memory handle
+ * @param[in] flags Currently unsupported
+ *
+ * @retval ::HSA_STATUS_SUCCESS
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory handle
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Out of resources
+ *
+ * @retval ::HSA_STATUS_ERROR Unexpected internal error
+ */
+hsa_status_t hsa_amd_vmem_export_shareable_handle(int* dmabuf_fd,
+                                                  hsa_amd_vmem_alloc_handle_t handle,
+                                                  uint64_t flags);
+/**
+ * @brief Import a shareable handle
+ *
+ * Import a shareable handle for a memory handle. Importing a shareable handle that has been closed
+ * and released results in undefined behavior.
+ *
+ * @param[in] dmabuf_fd shareable handle exported with hsa_amd_vmem_export_shareable_handle
+ * @param[out] handle virtual memory handle
+ *
+ * @retval ::HSA_STATUS_SUCCESS
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory handle
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Out of resources
+ *
+ * @retval ::HSA_STATUS_ERROR Unexpected internal error
+ */
+hsa_status_t hsa_amd_vmem_import_shareable_handle(int dmabuf_fd,
+                                                  hsa_amd_vmem_alloc_handle_t* handle);
+
+/**
+ * @brief Returns memory handle for mapped memory
+ *
+ * Return a memory handle for previously mapped memory. The handle will be the same value of handle
+ * used to map the memory. The returned handle must be released with corresponding number of calls
+ * to hsa_amd_vmem_handle_release.
+ *
+ * @param[out] memory_handle memory handle for this mapped address
+ * @param[in] addr mapped address
+ *
+ * @retval ::HSA_STATUS_SUCCESS
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid address
+ */
+hsa_status_t hsa_amd_vmem_retain_alloc_handle(hsa_amd_vmem_alloc_handle_t* memory_handle,
+                                              void* addr);
+
+/**
+ * @brief Returns the current allocation properties of a handle
+ *
+ * Returns the allocation properties of an existing handle
+ *
+ * @param[in] memory_handle memory handle to be queried
+ * @param[out] pool memory pool that owns this handle
+ * @param[out] type memory type
+ *
+ * @retval ::HSA_STATUS_SUCCESS
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory_handle
+ */
+hsa_status_t hsa_amd_vmem_get_alloc_properties_from_handle(
+    hsa_amd_vmem_alloc_handle_t memory_handle, hsa_amd_memory_pool_t* pool,
+    hsa_amd_memory_type_t* type);
+
+/** @} */
+
+/** \addtogroup queue Queues
+ *  @{
+ */
+
+/**
+ * @brief Set the asynchronous scratch limit threshold on all the queues for this agent.
+ * Dispatches that are enqueued on HW queues on this agent that are smaller than threshold will not
+ * result in a scratch use-once method.
+ *
+ * Increasing this threshold will only increase the internal limit and not cause immediate allocation
+ * of additional scratch memory. Decreasing this threshold will result in a release in scratch memory
+ * on queues where the current amount of allocated scratch exceeds the new limit.
+ *
+ * If this API call would result in a release in scratch memory and there are dispatches that are
+ * currently using scratch memory on this agent, this will result in a blocking call until the
+ * current dispatches are completed.
+ *
+ * This API is only supported on devices that support asynchronous scratch reclaim.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] threshold Threshold size in bytes
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT This agent does not support asynchronous scratch
+ * reclaim
+ */
+hsa_status_t HSA_API hsa_amd_agent_set_async_scratch_limit(hsa_agent_t agent, size_t threshold);
+
+/**
+ * @brief Queue attributes that can be queried through ::hsa_amd_queue_get_info.
+ */
+typedef enum {
+  /*
+   * Returns the agent that owns the underlying HW queue.
+   * The type of this attribute is hsa_agent_t.
+   */
+  HSA_AMD_QUEUE_INFO_AGENT,
+  /*
+   * Returns the doorbell ID of the completion signal of the queue.
+   * The type of this attribute is uint64_t.
+   */
+  HSA_AMD_QUEUE_INFO_DOORBELL_ID,
+} hsa_queue_info_attribute_t;
+
+/**
+ * @brief Query an attribute of a queue.
+ *
+ * @param[in] queue Queue to query.
+ *
+ * @param[in] attribute Attribute to query (see ::hsa_queue_info_attribute_t).
+ *
+ * @param[out] value Location that receives the attribute value. The type expected for each
+ * attribute is documented on ::hsa_queue_info_attribute_t. NOTE(review): presumably a
+ * caller-provided buffer large enough for that type -- confirm against the implementation.
+ *
+ * @return An ::hsa_status_t status code. The specific error codes are not listed in this
+ * header.
+ */
+hsa_status_t hsa_amd_queue_get_info(hsa_queue_t* queue, hsa_queue_info_attribute_t attribute,
+                                    void* value);
+
+typedef struct hsa_amd_ais_file_handle_s {
+  /*
+   * File handle for AIS read & write. Linux will use fd.
+   * pad keeps the size consistent across different platforms.
+   */
+  union {
+    void*      handle;
+    int        fd;
+    uint8_t    pad[8];
+  };
+} hsa_amd_ais_file_handle_t;
+
+/**
+ * @brief Write data from device memory to a file
+ *
+ * Writes data from a device memory buffer to a file at the specified offset.
+ * The device memory pointer must be accessible from the host and point to
+ * a valid allocation.
+ *
+ * EXPERIMENTAL: AIS read and write calls are currently in an experimental phase and
+ *  the APIs may be modified.
+ *
+ * @param[in] handle Handle of the file to write to.
+ *
+ * @param[in] devicePtr Device memory buffer pointer containing data to write.
+ *
+ * @param[in] size Size in bytes of the data to write.
+ *
+ * @param[in] file_offset Offset in bytes into the file where data will be written.
+ *
+ * @param[in,out] size_copied Actual number of bytes copied
+ *
+ * @param[in,out] status Additional status if any
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p handle is invalid, @p devicePtr
+ * is NULL, or @p size is 0.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION @p devicePtr does not refer to
+ * a valid allocation.
+ *
+ * @retval ::HSA_STATUS_ERROR An error occurred during the write operation.
+ */
+hsa_status_t HSA_API hsa_amd_ais_file_write(hsa_amd_ais_file_handle_t handle, void *devicePtr,
+                                            uint64_t size, int64_t file_offset,
+                                            uint64_t *size_copied, int32_t *status);
+
+/**
+ * @brief Read data from a file to device memory
+ *
+ * Reads data from a file at the specified offset into a device memory buffer.
+ * The device memory pointer must be accessible from the host and point to
+ * a valid allocation.
+ *
+ * EXPERIMENTAL: AIS read and write calls are currently in an experimental phase and
+ *  the APIs may be modified.
+ *
+ * @param[in] handle Handle of the file to read from.
+ *
+ * @param[in] devicePtr Device memory buffer pointer to store the read data.
+ *
+ * @param[in] size Size in bytes of the data to read.
+ *
+ * @param[in] file_offset Offset in bytes into the file where data will be read from.
+ *
+ * @param[in,out] size_copied Actual number of bytes copied
+ *
+ * @param[in,out] status Additional status if any
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p handle is invalid, @p devicePtr
+ * is NULL, or @p size is 0.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION @p devicePtr does not refer to
+ * a valid allocation.
+ *
+ * @retval ::HSA_STATUS_ERROR An error occurred during the read operation.
+ */
+hsa_status_t HSA_API hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, void *devicePtr,
+                                           uint64_t size, int64_t file_offset,
+                                           uint64_t *size_copied, int32_t *status);
+
+/**
+ * @brief Logging types.
+ *
+ * NOTE(review): ::hsa_amd_enable_logging documents its flags as settable via the
+ * hsa_flag_set64 macro, so these values are presumably bit indices rather than
+ * bit masks -- confirm against the macro definition.
+ */
+typedef enum hsa_amd_log_flag_s {
+  /* Log AQL packets internally enqueued by ROCr */
+  HSA_AMD_LOG_FLAG_BLIT_KERNEL_PKTS = 0,
+  /* Same value (0) as HSA_AMD_LOG_FLAG_BLIT_KERNEL_PKTS; the two names are interchangeable */
+  HSA_AMD_LOG_FLAG_AQL = 0,
+  /* Log SDMA packets */
+  HSA_AMD_LOG_FLAG_SDMA = 1,
+  /* Log INFO */
+  HSA_AMD_LOG_FLAG_INFO = 2,
+} hsa_amd_log_flag_t;
+
+/**
+ * @brief Enable logging via external file
+ * If this function is called multiple times, the last call to this function will overwrite the
+ * previous @p flags and @p file.
+ *
+ * @param[in] flags Used to filter the types of logging (see ::hsa_amd_log_flag_t). Type is
+ * uint8_t[8]. Can be set using the hsa_flag_set64 macro. Setting @p flags to 0 will disable
+ * logging.
+ * @param[in] file File stream to output logging. If file is NULL, prints are sent to stderr.
+ * NOTE(review): the parameter is declared as void*; presumably a FILE* is expected -- confirm.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ */
+hsa_status_t hsa_amd_enable_logging(uint8_t* flags, void* file);
+
+/** @} */
+
+#ifdef __cplusplus
+}  // end extern "C" block
+#endif
+
+#endif  // header guard
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hsa/hsa_ext_image.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hsa/hsa_ext_image.h
new file mode 100644
index 0000000000000000000000000000000000000000..cad9b50820e0b3051b18f6f015704010ffd6775a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hsa/hsa_ext_image.h
@@ -0,0 +1,1515 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_EXT_IMAGE_H
+#define HSA_EXT_IMAGE_H
+
+#include "hsa.h"
+
+#undef HSA_API
+#ifdef HSA_EXPORT_IMAGES
+#define HSA_API HSA_API_EXPORT
+#else
+#define HSA_API HSA_API_IMPORT
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif /*__cplusplus*/ 
+
+/** \defgroup ext-images Images and Samplers
+ *  @{
+ */
+
+/**
+ * @brief Enumeration constants added to ::hsa_status_t by this extension.
+ *
+ * @remark Additions to hsa_status_t
+ */
+enum {
+    /**
+     * Image format is not supported.
+     */
+    HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED = 0x3000,
+    /**
+     * Image size is not supported.
+     */
+    HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED = 0x3001,
+    /**
+     * Image pitch is not supported or invalid.
+     */
+    HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED = 0x3002,
+    /**
+     * Sampler descriptor is not supported or invalid.
+     */
+    HSA_EXT_STATUS_ERROR_SAMPLER_DESCRIPTOR_UNSUPPORTED = 0x3003
+};
+
+/**
+ * @brief Enumeration constants added to ::hsa_agent_info_t by this
+ * extension.
+ *
+ * @remark Additions to hsa_agent_info_t
+ */
+enum {
+  /**
+   * Maximum number of elements in 1D images. Must be at least 16384. The type
+   * of this attribute is size_t.
+   */
+  HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS = 0x3000,
+  /**
+   * Maximum number of elements in 1DA images. Must be at least 16384. The type
+   * of this attribute is size_t.
+   */
+  HSA_EXT_AGENT_INFO_IMAGE_1DA_MAX_ELEMENTS = 0x3001,
+  /**
+   * Maximum number of elements in 1DB images. Must be at least 65536. The type
+   * of this attribute is size_t.
+   */
+  HSA_EXT_AGENT_INFO_IMAGE_1DB_MAX_ELEMENTS = 0x3002,
+  /**
+   * Maximum dimensions (width, height) of 2D images, in image elements. The X
+   * and Y maximums must be at least 16384. The type of this attribute is
+   * size_t[2].
+   */
+  HSA_EXT_AGENT_INFO_IMAGE_2D_MAX_ELEMENTS = 0x3003,
+  /**
+   * Maximum dimensions (width, height) of 2DA images, in image elements. The X
+   * and Y maximums must be at least 16384. The type of this attribute is
+   * size_t[2].
+   */
+  HSA_EXT_AGENT_INFO_IMAGE_2DA_MAX_ELEMENTS = 0x3004,
+  /**
+   * Maximum dimensions (width, height) of 2DDEPTH images, in image
+   * elements. The X and Y maximums must be at least 16384. The type of this
+   * attribute is size_t[2].
+   */
+  HSA_EXT_AGENT_INFO_IMAGE_2DDEPTH_MAX_ELEMENTS = 0x3005,
+  /**
+   * Maximum dimensions (width, height) of 2DADEPTH images, in image
+   * elements. The X and Y maximums must be at least 16384. The type of this
+   * attribute is size_t[2].
+   */
+  HSA_EXT_AGENT_INFO_IMAGE_2DADEPTH_MAX_ELEMENTS = 0x3006,
+  /**
+   * Maximum dimensions (width, height, depth) of 3D images, in image
+   * elements. The maximum along any dimension must be at least 2048. The type
+   * of this attribute is size_t[3].
+   */
+  HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS = 0x3007,
+  /**
+   * Maximum number of image layers in a image array. Must be at least 2048. The
+   * type of this attribute is size_t.
+   */
+  HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS = 0x3008,
+  /**
+   * Maximum number of read-only image handles that can be created for an agent at any one
+   * time. Must be at least 128. The type of this attribute is size_t.
+   */
+  HSA_EXT_AGENT_INFO_MAX_IMAGE_RD_HANDLES = 0x3009,
+  /**
+   * Maximum number of write-only and read-write image handles (combined) that
+   * can be created for an agent at any one time. Must be at least 64. The type of this
+   * attribute is size_t.
+   */
+  HSA_EXT_AGENT_INFO_MAX_IMAGE_RORW_HANDLES = 0x300A,
+  /**
+   * Maximum number of sampler handlers that can be created for an agent at any one
+   * time. Must be at least 16. The type of this attribute is size_t.
+   */
+  HSA_EXT_AGENT_INFO_MAX_SAMPLER_HANDLERS = 0x300B,
+  /**
+   * Image pitch alignment. The agent only supports linear image data
+   * layouts with a row pitch that is a multiple of this value. Must be
+   * a power of 2. The type of this attribute is size_t.
+   */
+  HSA_EXT_AGENT_INFO_IMAGE_LINEAR_ROW_PITCH_ALIGNMENT = 0x300C
+};
+
+/**
+ * @brief Image handle, populated by ::hsa_ext_image_create or
+ * ::hsa_ext_image_create_with_layout. Image
+ * handles are only unique within an agent, not across agents.
+ *
+ */
+typedef struct hsa_ext_image_s {
+  /**
+   *  Opaque handle. For a given agent, two handles reference the same object of
+   *  the enclosing type if and only if they are equal.
+   */
+    uint64_t handle;
+
+} hsa_ext_image_t;
+
+/**
+ * @brief Geometry associated with the image. This specifies the
+ * number of image dimensions and whether the image is an image
+ * array. See the <em>Image Geometry</em> section in the <em>HSA
+ * Programming Reference Manual</em> for definitions on each
+ * geometry. The enumeration values match the BRIG type @p
+ * hsa_ext_brig_image_geometry_t.
+ */
+typedef enum {
+  /**
+   * One-dimensional image addressed by width coordinate.
+   */
+  HSA_EXT_IMAGE_GEOMETRY_1D = 0,
+
+  /**
+   * Two-dimensional image addressed by width and height coordinates.
+   */
+  HSA_EXT_IMAGE_GEOMETRY_2D = 1,
+
+  /**
+   * Three-dimensional image addressed by width, height, and depth coordinates.
+   */
+  HSA_EXT_IMAGE_GEOMETRY_3D = 2,
+
+  /**
+   * Array of one-dimensional images with the same size and format. 1D arrays
+   * are addressed by width and index coordinates.
+   */
+  HSA_EXT_IMAGE_GEOMETRY_1DA = 3,
+
+  /**
+   * Array of two-dimensional images with the same size and format. 2D arrays
+   * are addressed by width, height, and index coordinates.
+   */
+  HSA_EXT_IMAGE_GEOMETRY_2DA = 4,
+
+  /**
+   * One-dimensional image addressed by width coordinate. It has
+   * specific restrictions compared to ::HSA_EXT_IMAGE_GEOMETRY_1D. An
+   * image with an opaque image data layout will always use a linear
+   * image data layout, and one with an explicit image data layout
+   * must specify ::HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR.
+   */
+  HSA_EXT_IMAGE_GEOMETRY_1DB = 5,
+
+  /**
+   * Two-dimensional depth image addressed by width and height coordinates.
+   */
+  HSA_EXT_IMAGE_GEOMETRY_2DDEPTH = 6,
+
+  /**
+   * Array of two-dimensional depth images with the same size and format. 2D
+   * arrays are addressed by width, height, and index coordinates.
+   */
+  HSA_EXT_IMAGE_GEOMETRY_2DADEPTH = 7
+} hsa_ext_image_geometry_t;
+
+/**
+ * @brief Channel type associated with the elements of an image. See
+ * the <em>Channel Type</em> section in the <em>HSA Programming Reference
+ * Manual</em> for definitions on each channel type. The
+ * enumeration values and definition match the BRIG type @p
+ * hsa_ext_brig_image_channel_type_t.
+ */
+typedef enum {
+    HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 = 5,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 = 6,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010 = 7,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT = 15
+} hsa_ext_image_channel_type_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_ext_image_channel_type_t constants.
+ */
+typedef uint32_t hsa_ext_image_channel_type32_t;
+    
+/**
+ *
+ * @brief Channel order associated with the elements of an image. See
+ * the <em>Channel Order</em> section in the <em>HSA Programming Reference
+ * Manual</em> for definitions on each channel order. The
+ * enumeration values match the BRIG type @p
+ * hsa_ext_brig_image_channel_order_t.
+ */
+typedef enum {
+    HSA_EXT_IMAGE_CHANNEL_ORDER_A = 0,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_R = 1,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_RX = 2,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_RG = 3,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_RGX = 4,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_RA = 5,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_RGB = 6,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX = 7,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA = 8,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA = 9,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB = 10,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR = 11,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB = 12,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX = 13,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA = 14,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA = 15,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY = 16,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE = 17,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH = 18,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19
+} hsa_ext_image_channel_order_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_ext_image_channel_order_t constants.
+ */
+typedef uint32_t hsa_ext_image_channel_order32_t;
+    
+
+/**
+ * @brief Image format.
+ */
+typedef struct hsa_ext_image_format_s {
+    /**
+     * Channel type, stored as a fixed-size ::hsa_ext_image_channel_type_t constant.
+     */
+    hsa_ext_image_channel_type32_t channel_type;
+
+    /**
+     * Channel order, stored as a fixed-size ::hsa_ext_image_channel_order_t constant.
+     */
+    hsa_ext_image_channel_order32_t channel_order;
+} hsa_ext_image_format_t;
+
+/**
+ * @brief Implementation independent image descriptor.
+ */
+typedef struct hsa_ext_image_descriptor_s {
+    /**
+     * Image geometry.
+     */
+    hsa_ext_image_geometry_t geometry;
+    /**
+     * Width of the image, in components.
+     */
+    size_t width;
+    /**
+     * Height of the image, in components. Only used if the geometry is
+     * ::HSA_EXT_IMAGE_GEOMETRY_2D, ::HSA_EXT_IMAGE_GEOMETRY_3D,
+     * ::HSA_EXT_IMAGE_GEOMETRY_2DA, ::HSA_EXT_IMAGE_GEOMETRY_2DDEPTH, or
+     * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH, otherwise must be 0.
+     */
+    size_t height;
+    /**
+     * Depth of the image, in components. Only used if the geometry is
+     * ::HSA_EXT_IMAGE_GEOMETRY_3D, otherwise must be 0.
+     */
+    size_t depth;
+    /**
+     * Number of image layers in the image array. Only used if the geometry is
+     * ::HSA_EXT_IMAGE_GEOMETRY_1DA, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or
+     * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH, otherwise must be 0.
+     */
+    size_t array_size;
+    /**
+     * Image format.
+     */
+    hsa_ext_image_format_t format;
+} hsa_ext_image_descriptor_t;
+
+/**
+ * @brief Image capability.
+ */
+typedef enum  {
+   /**
+    * Images of this geometry, format, and layout are not supported by
+    * the agent.
+    */
+    HSA_EXT_IMAGE_CAPABILITY_NOT_SUPPORTED = 0x0,
+   /**
+    * Read-only images of this geometry, format, and layout are
+    * supported by the agent.
+    */
+    HSA_EXT_IMAGE_CAPABILITY_READ_ONLY = 0x1,
+   /**
+    * Write-only images of this geometry, format, and layout are
+    * supported by the agent.
+    */
+    HSA_EXT_IMAGE_CAPABILITY_WRITE_ONLY = 0x2,
+   /**
+    * Read-write images of this geometry, format, and layout are
+    * supported by the agent.
+    */
+    HSA_EXT_IMAGE_CAPABILITY_READ_WRITE = 0x4,
+   /**
+    * @deprecated Images of this geometry, format, and layout can be accessed from
+    * read-modify-write atomic operations in the agent.
+    */
+    HSA_EXT_IMAGE_CAPABILITY_READ_MODIFY_WRITE = 0x8,
+    /**
+    * Images of this geometry, format, and layout are guaranteed to
+    * have a consistent data layout regardless of how they are
+    * accessed by the associated agent.
+    */
+    HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT = 0x10
+} hsa_ext_image_capability_t;
+
+/**
+ * @brief Image data layout.
+ *
+ * @details An image data layout denotes such aspects of image data
+ * layout as tiling and organization of channels in memory. Some image
+ * data layouts may only apply to specific image geometries, formats,
+ * and access permissions. Different agents may support different
+ * image layout identifiers, including vendor specific layouts. Note
+ * that an agent may not support the same image data layout for
+ * different access permissions to images with the same image
+ * geometry, size, and format. If multiple agents support the same
+ * image data layout then it is possible to use separate image handles
+ * for each agent that references the same image data.
+ */
+
+typedef enum  {
+   /**
+    * An implementation specific opaque image data layout which can
+    * vary depending on the agent, geometry, image format, image size,
+    * and access permissions.
+    */
+    HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE = 0x0,
+   /**
+    * The image data layout is specified by the following rules in
+    * ascending byte address order. For a 3D image, 2DA image array,
+    * or 1DA image array, the image data is stored as a linear sequence
+    * of adjacent 2D image slices, 2D images, or 1D images
+    * respectively, spaced according to the slice pitch. Each 2D image
+    * is stored as a linear sequence of adjacent image rows, spaced
+    * according to the row pitch. Each 1D or 1DB image is stored as a
+    * single image row. Each image row is stored as a linear sequence
+    * of image elements. Each image element is stored as a linear
+    * sequence of image components specified by the left to right
+    * channel order definition. Each image component is stored using
+    * the memory type specified by the channel type.
+    *
+    * The 1DB image geometry always uses the linear image data layout.
+    */
+    HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR = 0x1
+} hsa_ext_image_data_layout_t;
+
+/**
+ * @brief Retrieve the supported image capabilities for a given combination of
+ * agent, geometry, and image format for an image created with an opaque image
+ * data layout.
+ *
+ * @param[in] agent Agent to be associated with the image handle.
+ *
+ * @param[in] geometry Geometry.
+ *
+ * @param[in] image_format Pointer to an image format. Must not be NULL.
+ *
+ * @param[out] capability_mask Pointer to a memory location where the HSA
+ * runtime stores a bit-mask of supported image capability
+ * (::hsa_ext_image_capability_t) values. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_format is
+ * NULL, or @p capability_mask is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_get_capability(
+    hsa_agent_t agent,
+    hsa_ext_image_geometry_t geometry,
+    const hsa_ext_image_format_t *image_format,
+    uint32_t *capability_mask);
+
+/**
+ * @brief Retrieve the supported image capabilities for a given combination of
+ * agent, geometry, image format, and image layout for an image created with
+ * an explicit image data layout.
+ *
+ * @param[in] agent Agent to be associated with the image handle.
+ *
+ * @param[in] geometry Geometry.
+ *
+ * @param[in] image_format Pointer to an image format. Must not be NULL.
+ *
+ * @param[in] image_data_layout The image data layout.
+ * It is invalid to use ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE; use
+ * ::hsa_ext_image_get_capability instead.
+ *
+ * @param[out] capability_mask Pointer to a memory location where the HSA
+ * runtime stores a bit-mask of supported image capability
+ * (::hsa_ext_image_capability_t) values. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_format is
+ * NULL, @p image_data_layout is ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE,
+ * or @p capability_mask is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_get_capability_with_layout(
+    hsa_agent_t agent,
+    hsa_ext_image_geometry_t geometry,
+    const hsa_ext_image_format_t *image_format,
+    hsa_ext_image_data_layout_t image_data_layout,
+    uint32_t *capability_mask);
+
+/**
+ * @brief Agent specific image size and alignment requirements, populated by
+ * ::hsa_ext_image_data_get_info and ::hsa_ext_image_data_get_info_with_layout.
+ */
+typedef struct hsa_ext_image_data_info_s {
+  /**
+   * Image data size, in bytes.
+   */
+  size_t size;
+
+  /**
+   * Image data alignment, in bytes. Must always be a power of 2.
+   */
+  size_t alignment;
+
+} hsa_ext_image_data_info_t;
+
+/**
+ * @brief Retrieve the image data requirements for a given combination of agent, image
+ * descriptor, and access permission for an image created with an opaque image
+ * data layout.
+ *
+ * @details The optimal image data size and alignment requirements may
+ * vary depending on the image attributes specified in @p
+ * image_descriptor, the @p access_permission, and the @p agent. Also,
+ * different implementations of the HSA runtime may return different
+ * requirements for the same input values.
+ *
+ * The implementation must return the same image data requirements for
+ * different access permissions with matching image descriptors as long
+ * as ::hsa_ext_image_get_capability reports
+ * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT. Image
+ * descriptors match if they have the same values, with the exception
+ * that s-form channel orders match the corresponding non-s-form
+ * channel order and vice versa.
+ *
+ * @param[in] agent Agent to be associated with the image handle.
+ *
+ * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL.
+ *
+ * @param[in] access_permission Access permission of the image when
+ * accessed by @p agent. The access permission defines how the agent
+ * is allowed to access the image and must match the corresponding
+ * HSAIL image handle type. The @p agent must support the image format
+ * specified in @p image_descriptor for the given @p
+ * access_permission.
+ *
+ * @param[out] image_data_info Memory location where the runtime stores the
+ * size and alignment requirements. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The @p
+ * agent does not support the image format specified by @p
+ * image_descriptor with the specified @p access_permission.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The agent
+ * does not support the image dimensions specified by @p
+ * image_descriptor with the specified @p access_permission.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p
+ * access_permission is not a valid access permission value, or @p
+ * image_data_info is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_data_get_info(
+    hsa_agent_t agent,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    hsa_access_permission_t access_permission,
+    hsa_ext_image_data_info_t *image_data_info);
+
+/**
+ * @brief Retrieve the image data requirements for a given combination of
+ * image descriptor, access permission, image data layout, image data row pitch,
+ * and image data slice pitch for an image created with an explicit image
+ * data layout.
+ *
+ * @details The image data size and alignment requirements may vary
+ * depending on the image attributes specified in @p image_descriptor,
+ * the @p access_permission, and the image layout. However, different
+ * implementations of the HSA runtime will return the same
+ * requirements for the same input values.
+ *
+ * The implementation must return the same image data requirements for
+ * different access permissions with matching image descriptors and
+ * matching image layouts as long as ::hsa_ext_image_get_capability
+ * reports
+ * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT. Image
+ * descriptors match if they have the same values, with the exception
+ * that s-form channel orders match the corresponding non-s-form
+ * channel order and vice versa. Image layouts match if they are the
+ * same image data layout and use the same image row and slice pitch
+ * values.
+ *
+ * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL.
+ *
+ * @param[in] access_permission Access permission of the image when
+ * accessed by an agent. The access permission defines how the agent
+ * is allowed to access the image and must match the corresponding
+ * HSAIL image handle type.
+ *
+ * @param[in] image_data_layout The image data layout to use.
+ * It is invalid to use ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE; use
+ * ::hsa_ext_image_data_get_info instead.
+ *
+ * @param[in] image_data_row_pitch The size in bytes for a single row
+ * of the image in the image data. If 0 is specified then the default
+ * row pitch value is used: image width * image element byte size.
+ * The value used must be greater than or equal to the default row
+ * pitch, and be a multiple of the image element byte size. For the
+ * linear image layout it must also be a multiple of the image linear
+ * row pitch alignment for the agents that will access the image data
+ * using image instructions.
+ *
+ * @param[in] image_data_slice_pitch The size in bytes of a single
+ * slice of a 3D image, or the size in bytes of each image layer in an
+ * image array in the image data. If 0 is specified then the default
+ * slice pitch value is used: row pitch * height if geometry is
+ * ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or
+ * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH; row pitch if geometry is
+ * ::HSA_EXT_IMAGE_GEOMETRY_1DA; and 0 otherwise. The value used must
+ * be 0 if the default slice pitch is 0, be greater than or equal to
+ * the default slice pitch, and be a multiple of the row pitch.
+ *
+ * @param[out] image_data_info Memory location where the runtime stores the
+ * size and alignment requirements. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The image
+ * format specified by @p image_descriptor is not supported for the
+ * @p access_permission and @p image_data_layout specified.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The image
+ * dimensions specified by @p image_descriptor are not supported for
+ * the @p access_permission and @p image_data_layout specified.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED The row and
+ * slice pitch specified by @p image_data_row_pitch and @p
+ * image_data_slice_pitch are invalid or not supported.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is
+ * NULL, @p image_data_layout is ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE,
+ * or @p image_data_info is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_data_get_info_with_layout(
+    hsa_agent_t agent,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    hsa_access_permission_t access_permission,
+    hsa_ext_image_data_layout_t image_data_layout,
+    size_t image_data_row_pitch,
+    size_t image_data_slice_pitch,
+    hsa_ext_image_data_info_t *image_data_info);
+
+/**
+ * @brief Creates an agent specific image handle to an image with an
+ * opaque image data layout.
+ *
+ * @details Images with an opaque image data layout created with
+ * different access permissions but matching image descriptors and
+ * the same agent can share the same image data if
+ * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT is reported
+ * by ::hsa_ext_image_get_capability for the image format specified in
+ * the image descriptor. Image descriptors match if they have the same
+ * values, with the exception that s-form channel orders match the
+ * corresponding non-s-form channel order and vice versa.
+ *
+ * If necessary, an application can use image operations (import,
+ * export, copy, clear) to prepare the image for the intended use
+ * regardless of the access permissions.
+ *
+ * @param[in] agent Agent to be associated with the image handle created.
+ *
+ * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL.
+ *
+ * @param[in] image_data Image data buffer that must have been allocated
+ * according to the size and alignment requirements dictated by
+ * ::hsa_ext_image_data_get_info. Must not be NULL.
+ *
+ * Any previous memory contents are preserved upon creation. The application is
+ * responsible for ensuring that the lifetime of the image data exceeds that of
+ * all the associated images.
+ *
+ * @param[in] access_permission Access permission of the image when
+ * accessed by agent. The access permission defines how the agent
+ * is allowed to access the image using the image handle created and
+ * must match the corresponding HSAIL image handle type. The agent
+ * must support the image format specified in @p image_descriptor for
+ * the given @p access_permission.
+ *
+ * @param[out] image Pointer to a memory location where the HSA runtime stores
+ * the newly created image handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The agent
+ * does not have the capability to support the image format contained
+ * in @p image_descriptor using the specified @p access_permission.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The agent
+ * does not support the image dimensions specified by @p
+ * image_descriptor using the specified @p access_permission.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources (for example, the agent does not
+ * support the creation of more image handles with the given
+ * @p access_permission).
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p
+ * image_data is NULL, @p image_data does not have a valid alignment,
+ * @p access_permission is not a valid access permission
+ * value, or @p image is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_create(
+    hsa_agent_t agent,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    const void *image_data,
+    hsa_access_permission_t access_permission,
+    hsa_ext_image_t *image);
+
+/**
+ * @brief Creates an agent specific image handle to an image with an explicit
+ * image data layout.
+ *
+ * @details Images with an explicit image data layout created with
+ * different access permissions but matching image descriptors and
+ * matching image layout can share the same image data if
+ * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT is reported
+ * by ::hsa_ext_image_get_capability_with_layout for the image format
+ * specified in the image descriptor and specified image data
+ * layout. Image descriptors match if they have the same values, with
+ * the exception that s-form channel orders match the corresponding
+ * non-s-form channel order and vice versa. Image layouts match if
+ * they are the same image data layout and use the same image row and
+ * slice values.
+ *
+ * If necessary, an application can use image operations (import, export, copy,
+ * clear) to prepare the image for the intended use regardless of the access
+ * permissions.
+ *
+ * @param[in] agent Agent to be associated with the image handle created.
+ *
+ * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL.
+ *
+ * @param[in] image_data Image data buffer that must have been allocated
+ * according to the size and alignment requirements dictated by
+ * ::hsa_ext_image_data_get_info_with_layout. Must not be NULL.
+ *
+ * Any previous memory contents are preserved upon creation. The application is
+ * responsible for ensuring that the lifetime of the image data exceeds that of
+ * all the associated images.
+ *
+ * @param[in] access_permission Access permission of the image when
+ * accessed by the agent. The access permission defines how the agent
+ * is allowed to access the image and must match the corresponding
+ * HSAIL image handle type. The agent must support the image format
+ * specified in @p image_descriptor for the given @p access_permission
+ * and @p image_data_layout.
+ *
+ * @param[in] image_data_layout The image data layout to use for the
+ * @p image_data. It is invalid to use
+ * ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE; use ::hsa_ext_image_create
+ * instead.
+ *
+ * @param[in] image_data_row_pitch The size in bytes for a single row
+ * of the image in the image data. If 0 is specified then the default
+ * row pitch value is used: image width * image element byte size.
+ * The value used must be greater than or equal to the default row
+ * pitch, and be a multiple of the image element byte size. For the
+ * linear image layout it must also be a multiple of the image linear
+ * row pitch alignment for the agents that will access the image data
+ * using image instructions.
+ *
+ * @param[in] image_data_slice_pitch The size in bytes of a single
+ * slice of a 3D image, or the size in bytes of each image layer in an
+ * image array in the image data. If 0 is specified then the default
+ * slice pitch value is used: row pitch * height if geometry is
+ * ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or
+ * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH; row pitch if geometry is
+ * ::HSA_EXT_IMAGE_GEOMETRY_1DA; and 0 otherwise. The value used must
+ * be 0 if the default slice pitch is 0, be greater than or equal to
+ * the default slice pitch, and be a multiple of the row pitch.
+ *
+ * @param[out] image Pointer to a memory location where the HSA runtime stores
+ * the newly created image handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The agent does
+ * not have the capability to support the image format contained in the image
+ * descriptor using the specified @p access_permission and @p image_data_layout.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The agent
+ * does not support the image dimensions specified by @p
+ * image_descriptor using the specified @p access_permission and @p
+ * image_data_layout.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED The agent does
+ * not support the row and slice pitch specified by @p image_data_row_pitch
+ * and @p image_data_slice_pitch, or the values are invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources (for example, the agent does not
+ * support the creation of more image handles with the given
+ * @p access_permission).
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p
+ * image_data is NULL, @p image_data does not have a valid alignment,
+ * @p image_data_layout is ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE,
+ * or @p image is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_create_with_layout(
+    hsa_agent_t agent,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    const void *image_data,
+    hsa_access_permission_t access_permission,
+    hsa_ext_image_data_layout_t image_data_layout,
+    size_t image_data_row_pitch,
+    size_t image_data_slice_pitch,
+    hsa_ext_image_t *image);
+
+/**
+ * @brief Destroy an image handle previously created using ::hsa_ext_image_create or
+ * ::hsa_ext_image_create_with_layout.
+ *
+ * @details Destroying the image handle does not free the associated image data,
+ * or modify its contents. The application should not destroy an image handle while
+ * there are references to it queued for execution or currently being used in a
+ * kernel dispatch.
+ *
+ * @param[in] agent Agent associated with the image handle.
+ *
+ * @param[in] image Image handle to destroy.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ */
+hsa_status_t HSA_API hsa_ext_image_destroy(
+    hsa_agent_t agent,
+    hsa_ext_image_t image);
+
+/**
+ * @brief Copies a portion of one image (the source) to another image (the
+ * destination).
+ *
+ * @details The source and destination image formats should be the
+ * same, with the exception that s-form channel orders match the
+ * corresponding non-s-form channel order and vice versa. For example,
+ * it is allowed to copy a source image with a channel order of
+ * HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB to a destination image with a
+ * channel order of HSA_EXT_IMAGE_CHANNEL_ORDER_RGB.
+ *
+ * The source and destination images do not have to be of the same geometry and
+ * appropriate scaling is performed by the HSA runtime. It is possible to copy
+ * subregions between any combinations of source and destination geometries, provided
+ * that the dimensions of the subregions are the same. For example, it is
+ * allowed to copy a rectangular region from a 2D image to a slice of a 3D
+ * image.
+ *
+ * If the source and destination image data overlap, or the combination of
+ * offset and range references an out-of-bounds element in any of the images,
+ * the behavior is undefined.
+ *
+ * @param[in] agent Agent associated with both the source and destination image handles.
+ *
+ * @param[in] src_image Image handle of source image. The agent associated with the source
+ * image handle must be identical to that of the destination image.
+ *
+ * @param[in] src_offset Pointer to the offset within the source image where to
+ * copy the data from. Must not be NULL.
+ *
+ * @param[in] dst_image Image handle of destination image.
+ *
+ * @param[in] dst_offset Pointer to the offset within the destination
+ * image where to copy the data. Must not be NULL.
+ *
+ * @param[in] range Dimensions of the image portion to be copied. The HSA
+ * runtime computes the size of the image data to be copied using this
+ * argument. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p src_offset is
+ * NULL, @p dst_offset is NULL, or @p range is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_copy(
+    hsa_agent_t agent,
+    hsa_ext_image_t src_image,
+    const hsa_dim3_t* src_offset,
+    hsa_ext_image_t dst_image,
+    const hsa_dim3_t* dst_offset,
+    const hsa_dim3_t* range);
+
+/**
+ * @brief Image region.
+ */
+typedef struct hsa_ext_image_region_s {
+   /**
+    * Offset within an image (in coordinates).
+    */
+    hsa_dim3_t offset;
+
+   /**
+    * Dimension size of the image range (in coordinates). The x, y, and z dimensions
+    * correspond to width, height, and depth or index respectively.
+    */
+    hsa_dim3_t range;
+} hsa_ext_image_region_t;
+
+/**
+ * @brief Import linearly organized image data from memory directly to an
+ * image handle.
+ *
+ * @details This operation updates the image data referenced by the image handle
+ * from the source memory. The size of the data imported from memory is
+ * implicitly derived from the image region.
+ *
+ * It is the application's responsibility to avoid out of bounds memory access.
+ *
+ * None of the source memory or destination image data memory can
+ * overlap. Overlapping of any of the source and destination image
+ * data memory within the import operation produces undefined results.
+ *
+ * @param[in] agent Agent associated with the image handle.
+ *
+ * @param[in] src_memory Source memory. Must not be NULL.
+ *
+ * @param[in] src_row_pitch The size in bytes of a single row of the image in the
+ * source memory. If the value is smaller than the destination image region
+ * width * image element byte size, then region width * image element byte
+ * size is used.
+ *
+ * @param[in] src_slice_pitch The size in bytes of a single 2D slice of a 3D image,
+ * or the size in bytes of each image layer in an image array in the source memory.
+ * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_1DA and the value is smaller than the
+ * value used for @p src_row_pitch, then the value used for @p src_row_pitch is used.
+ * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or
+ * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH and the value is smaller than the value used for
+ * @p src_row_pitch * destination image region height, then the value used for
+ * @p src_row_pitch * destination image region height is used.
+ * Otherwise, the value is not used.
+ *
+ * @param[in] dst_image Image handle of destination image.
+ *
+ * @param[in] image_region Pointer to the image region to be updated. Must not
+ * be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p src_memory is NULL, or @p
+ * image_region is NULL.
+ *
+ */
+hsa_status_t HSA_API hsa_ext_image_import(
+    hsa_agent_t agent,
+    const void *src_memory,
+    size_t src_row_pitch,
+    size_t src_slice_pitch,
+    hsa_ext_image_t dst_image,
+    const hsa_ext_image_region_t *image_region);
+
+/**
+ * @brief Export the image data to linearly organized memory.
+ *
+ * @details The operation updates the destination memory with the image data of
+ * @p src_image. The size of the data exported to memory is implicitly derived
+ * from the image region.
+ *
+ * It is the application's responsibility to avoid out of bounds memory access.
+ *
+ * None of the destination memory or source image data memory can
+ * overlap. Overlapping of any of the source and destination image
+ * data memory within the export operation produces undefined results.
+ *
+ * @param[in] agent Agent associated with the image handle.
+ *
+ * @param[in] src_image Image handle of source image.
+ *
+ * @param[in] dst_memory Destination memory. Must not be NULL.
+ *
+ * @param[in] dst_row_pitch The size in bytes of a single row of the image in the
+ * destination memory. If the value is smaller than the source image region
+ * width * image element byte size, then region width * image element byte
+ * size is used.
+ *
+ * @param[in] dst_slice_pitch The size in bytes of a single 2D slice of a 3D image,
+ * or the size in bytes of each image layer in an image array in the destination memory.
+ * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_1DA and the value is smaller than the
+ * value used for @p dst_row_pitch, then the value used for @p dst_row_pitch is used.
+ * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or
+ * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH and the value is smaller than the value used for
+ * @p dst_row_pitch * source image region height, then the value used for
+ * @p dst_row_pitch * source image region height is used.
+ * Otherwise, the value is not used.
+ *
+ * @param[in] image_region Pointer to the image region to be exported. Must not
+ * be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p dst_memory is NULL, or @p
+ * image_region is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_export(
+    hsa_agent_t agent,
+    hsa_ext_image_t src_image,
+    void *dst_memory,
+    size_t dst_row_pitch,
+    size_t dst_slice_pitch,
+    const hsa_ext_image_region_t *image_region);
+
+/**
+ * @brief Clear a region of an image so that every image element has
+ * the specified value.
+ *
+ * @param[in] agent Agent associated with the image handle.
+ *
+ * @param[in] image Image handle for image to be cleared.
+ *
+ * @param[in] data The value to which to set each image element being
+ * cleared. It is specified as an array of image component values. The
+ * number of array elements must match the number of access components
+ * for the image channel order. The type of each array element must
+ * match the image access type of the image channel type. When the
+ * value is used to set the value of an image element, the conversion
+ * method corresponding to the image channel type is used. See the
+ * <em>Channel Order</em> section and <em>Channel Type</em> section in
+ * the <em>HSA Programming Reference Manual</em> for more
+ * information. Must not be NULL.
+ *
+ * @param[in] image_region Pointer to the image region to clear. Must not be
+ * NULL. If the region references an out-of-bounds element, the behavior is
+ * undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p data is NULL, or @p
+ * image_region is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_clear(
+    hsa_agent_t agent,
+    hsa_ext_image_t image,
+    const void* data,
+    const hsa_ext_image_region_t *image_region);
+
+/**
+ * @brief Sampler handle. Samplers are populated by
+ * ::hsa_ext_sampler_create or ::hsa_ext_sampler_create_v2. Sampler handles are only unique
+ *  within an agent, not across agents.
+ */
+typedef struct hsa_ext_sampler_s {
+  /**
+   *  Opaque handle. For a given agent, two handles reference the same object of
+   *  the enclosing type if and only if they are equal.
+   */
+    uint64_t handle;
+} hsa_ext_sampler_t;
+
+/**
+ * @brief Sampler address modes. The sampler address mode describes
+ * the processing of out-of-range image coordinates. See the
+ * <em>Addressing Mode</em> section in the <em>HSA Programming Reference
+ * Manual</em> for definitions on each address mode. The values
+ * match the BRIG type @p hsa_ext_brig_sampler_addressing_t.
+ */
+typedef enum {
+  /**
+   * Out-of-range coordinates are not handled.
+   */
+  HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED = 0,
+
+  /**
+   * Clamp out-of-range coordinates to the image edge.
+   */
+  HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE = 1,
+
+  /**
+   * Clamp out-of-range coordinates to the image border color.
+   */
+  HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER = 2,
+
+  /**
+   * Wrap out-of-range coordinates back into the valid coordinate
+   * range so the image appears as repeated tiles.
+   */
+  HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT = 3,
+
+  /**
+   * Mirror out-of-range coordinates back into the valid coordinate
+   * range so the image appears as repeated tiles with every other
+   * tile a reflection.
+   */
+  HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT = 4
+
+} hsa_ext_sampler_addressing_mode_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_ext_sampler_addressing_mode_t constants.
+ */
+typedef uint32_t hsa_ext_sampler_addressing_mode32_t;
+
+/**
+ * @brief Sampler coordinate normalization modes. See the
+ * <em>Coordinate Normalization Mode</em> section in the <em>HSA
+ * Programming Reference Manual</em> for definitions on each
+ * coordinate normalization mode. The values match the BRIG type @p
+ * hsa_ext_brig_sampler_coord_normalization_t.
+ */
+typedef enum {
+
+  /**
+   * Coordinates are used to directly address an image element.
+   */
+  HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED = 0,
+
+  /**
+   * Coordinates are scaled by the image dimension size before being
+   * used to address an image element.
+   */
+  HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED = 1
+
+} hsa_ext_sampler_coordinate_mode_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_ext_sampler_coordinate_mode_t constants.
+ */
+typedef uint32_t hsa_ext_sampler_coordinate_mode32_t;
+    
+
+/**
+ * @brief Sampler filter modes. See the <em>Filter Mode</em> section
+ * in the <em>HSA Programming Reference Manual</em> for definitions
+ * on each filter mode. The enumeration values match the BRIG type @p
+ * hsa_ext_brig_sampler_filter_t.
+ */
+typedef enum {
+  /**
+   * Filter to the image element nearest (in Manhattan distance) to the
+   * specified coordinate.
+   */
+  HSA_EXT_SAMPLER_FILTER_MODE_NEAREST = 0,
+
+  /**
+   * Filter to the image element calculated by combining the elements in a 2x2
+   * square block or 2x2x2 cube block around the specified coordinate. The
+   * elements are combined using linear interpolation.
+   */
+  HSA_EXT_SAMPLER_FILTER_MODE_LINEAR = 1
+
+} hsa_ext_sampler_filter_mode_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_ext_sampler_filter_mode_t constants.
+ */
+typedef uint32_t hsa_ext_sampler_filter_mode32_t;
+
+/**
+ * @brief Implementation independent sampler descriptor.
+ */
+typedef struct hsa_ext_sampler_descriptor_s {
+  /**
+   * Sampler coordinate mode describes the normalization of image coordinates.
+   */
+  hsa_ext_sampler_coordinate_mode32_t coordinate_mode;
+
+  /**
+   * Sampler filter type describes the type of sampling performed.
+   */
+  hsa_ext_sampler_filter_mode32_t filter_mode;
+
+  /**
+   * Sampler address mode describes the processing of out-of-range image
+   * coordinates.
+   */
+  hsa_ext_sampler_addressing_mode32_t address_mode;
+} hsa_ext_sampler_descriptor_t;
+
+/**
+ * @brief Implementation independent sampler descriptor v2 which supports
+ *  different address modes in X, Y and Z axes.
+ */
+typedef struct hsa_ext_sampler_descriptor_v2_s {
+  /**
+   * Sampler coordinate mode describes the normalization of image coordinates.
+   */
+  hsa_ext_sampler_coordinate_mode32_t coordinate_mode;
+
+  /**
+   * Sampler filter type describes the type of sampling performed.
+   */
+  hsa_ext_sampler_filter_mode32_t filter_mode;
+
+  /**
+   * Sampler address modes describe the processing of out-of-range image
+   * coordinates along each axis.
+   */
+  hsa_ext_sampler_addressing_mode32_t address_modes[3]; // in X, Y and Z axes
+} hsa_ext_sampler_descriptor_v2_t;
+
+/**
+ * @brief Create an agent specific sampler handle for a given agent
+ * independent sampler descriptor and agent.
+ *
+ * @param[in] agent Agent to be associated with the sampler handle created.
+ *
+ * @param[in] sampler_descriptor Pointer to a sampler descriptor. Must not be
+ * NULL.
+ *
+ * @param[out] sampler Memory location where the HSA runtime stores the newly
+ * created sampler handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_SAMPLER_DESCRIPTOR_UNSUPPORTED The
+ * @p agent does not have the capability to support the properties
+ * specified by @p sampler_descriptor or it is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p sampler_descriptor is NULL, or
+ * @p sampler is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_sampler_create(
+    hsa_agent_t agent,
+    const hsa_ext_sampler_descriptor_t *sampler_descriptor,
+    hsa_ext_sampler_t *sampler);
+
+/**
+ * @brief Create an agent specific sampler handle for a given agent
+ * independent sampler descriptor v2 and agent.
+ *
+ * @param[in] agent Agent to be associated with the sampler handle created.
+ *
+ * @param[in] sampler_descriptor Pointer to a v2 sampler descriptor. Must not
+ * be NULL.
+ *
+ * @param[out] sampler Memory location where the HSA runtime stores the newly
+ * created sampler handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_SAMPLER_DESCRIPTOR_UNSUPPORTED The
+ * @p agent does not have the capability to support the properties
+ * specified by @p sampler_descriptor or it is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p sampler_descriptor is NULL, or
+ * @p sampler is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_sampler_create_v2(
+    hsa_agent_t agent,
+    const hsa_ext_sampler_descriptor_v2_t *sampler_descriptor,
+    hsa_ext_sampler_t *sampler);
+
+/**
+ * @brief Destroy a sampler handle previously created using ::hsa_ext_sampler_create or
+ * ::hsa_ext_sampler_create_v2.
+ *
+ * @details The sampler handle should not be destroyed while there are
+ * references to it queued for execution or currently being used in a
+ * kernel dispatch.
+ *
+ * @param[in] agent Agent associated with the sampler handle.
+ *
+ * @param[in] sampler Sampler handle to destroy.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ */
+hsa_status_t HSA_API hsa_ext_sampler_destroy(
+    hsa_agent_t agent,
+    hsa_ext_sampler_t sampler);
+
+
+#define hsa_ext_images_1_00
+
+/**
+ * @brief The function pointer table for the images v1.00 extension. Can be returned by ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table.
+ */
+typedef struct hsa_ext_images_1_00_pfn_s {
+
+  hsa_status_t (*hsa_ext_image_get_capability)(
+    hsa_agent_t agent,
+    hsa_ext_image_geometry_t geometry,
+    const hsa_ext_image_format_t *image_format,
+    uint32_t *capability_mask);
+
+  hsa_status_t (*hsa_ext_image_data_get_info)(
+    hsa_agent_t agent,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    hsa_access_permission_t access_permission,
+    hsa_ext_image_data_info_t *image_data_info);
+
+  hsa_status_t (*hsa_ext_image_create)(
+    hsa_agent_t agent,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    const void *image_data,
+    hsa_access_permission_t access_permission,
+    hsa_ext_image_t *image);
+
+  hsa_status_t (*hsa_ext_image_destroy)(
+    hsa_agent_t agent,
+    hsa_ext_image_t image);
+
+  hsa_status_t (*hsa_ext_image_copy)(
+    hsa_agent_t agent,
+    hsa_ext_image_t src_image,
+    const hsa_dim3_t* src_offset,
+    hsa_ext_image_t dst_image,
+    const hsa_dim3_t* dst_offset,
+    const hsa_dim3_t* range);
+
+  hsa_status_t (*hsa_ext_image_import)(
+    hsa_agent_t agent,
+    const void *src_memory,
+    size_t src_row_pitch,
+    size_t src_slice_pitch,
+    hsa_ext_image_t dst_image,
+    const hsa_ext_image_region_t *image_region);
+
+  hsa_status_t (*hsa_ext_image_export)(
+    hsa_agent_t agent,
+    hsa_ext_image_t src_image,
+    void *dst_memory,
+    size_t dst_row_pitch,
+    size_t dst_slice_pitch,
+    const hsa_ext_image_region_t *image_region);
+
+  hsa_status_t (*hsa_ext_image_clear)(
+    hsa_agent_t agent,
+    hsa_ext_image_t image,
+    const void* data,
+    const hsa_ext_image_region_t *image_region);
+
+  hsa_status_t (*hsa_ext_sampler_create)(
+    hsa_agent_t agent,
+    const hsa_ext_sampler_descriptor_t *sampler_descriptor,
+    hsa_ext_sampler_t *sampler);
+
+  hsa_status_t (*hsa_ext_sampler_destroy)(
+    hsa_agent_t agent,
+    hsa_ext_sampler_t sampler);
+
+} hsa_ext_images_1_00_pfn_t;
+
+#define hsa_ext_images_1
+
+/**
+ * @brief The function pointer table for the images v1 extension. Can be returned by ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table.
+ */
+typedef struct hsa_ext_images_1_pfn_s {
+
+  hsa_status_t (*hsa_ext_image_get_capability)(
+    hsa_agent_t agent,
+    hsa_ext_image_geometry_t geometry,
+    const hsa_ext_image_format_t *image_format,
+    uint32_t *capability_mask);
+
+  hsa_status_t (*hsa_ext_image_data_get_info)(
+    hsa_agent_t agent,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    hsa_access_permission_t access_permission,
+    hsa_ext_image_data_info_t *image_data_info);
+
+  hsa_status_t (*hsa_ext_image_create)(
+    hsa_agent_t agent,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    const void *image_data,
+    hsa_access_permission_t access_permission,
+    hsa_ext_image_t *image);
+
+  hsa_status_t (*hsa_ext_image_destroy)(
+    hsa_agent_t agent,
+    hsa_ext_image_t image);
+
+  hsa_status_t (*hsa_ext_image_copy)(
+    hsa_agent_t agent,
+    hsa_ext_image_t src_image,
+    const hsa_dim3_t* src_offset,
+    hsa_ext_image_t dst_image,
+    const hsa_dim3_t* dst_offset,
+    const hsa_dim3_t* range);
+
+  hsa_status_t (*hsa_ext_image_import)(
+    hsa_agent_t agent,
+    const void *src_memory,
+    size_t src_row_pitch,
+    size_t src_slice_pitch,
+    hsa_ext_image_t dst_image,
+    const hsa_ext_image_region_t *image_region);
+
+  hsa_status_t (*hsa_ext_image_export)(
+    hsa_agent_t agent,
+    hsa_ext_image_t src_image,
+    void *dst_memory,
+    size_t dst_row_pitch,
+    size_t dst_slice_pitch,
+    const hsa_ext_image_region_t *image_region);
+
+  hsa_status_t (*hsa_ext_image_clear)(
+    hsa_agent_t agent,
+    hsa_ext_image_t image,
+    const void* data,
+    const hsa_ext_image_region_t *image_region);
+
+  hsa_status_t (*hsa_ext_sampler_create)(
+    hsa_agent_t agent,
+    const hsa_ext_sampler_descriptor_t *sampler_descriptor,
+    hsa_ext_sampler_t *sampler);
+
+  hsa_status_t (*hsa_ext_sampler_destroy)(
+    hsa_agent_t agent,
+    hsa_ext_sampler_t sampler);
+
+  hsa_status_t (*hsa_ext_image_get_capability_with_layout)(
+    hsa_agent_t agent,
+    hsa_ext_image_geometry_t geometry,
+    const hsa_ext_image_format_t *image_format,
+    hsa_ext_image_data_layout_t image_data_layout,
+    uint32_t *capability_mask);
+
+  hsa_status_t (*hsa_ext_image_data_get_info_with_layout)(
+    hsa_agent_t agent,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    hsa_access_permission_t access_permission,
+    hsa_ext_image_data_layout_t image_data_layout,
+    size_t image_data_row_pitch,
+    size_t image_data_slice_pitch,
+    hsa_ext_image_data_info_t *image_data_info);
+
+  hsa_status_t (*hsa_ext_image_create_with_layout)(
+    hsa_agent_t agent,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    const void *image_data,
+    hsa_access_permission_t access_permission,
+    hsa_ext_image_data_layout_t image_data_layout,
+    size_t image_data_row_pitch,
+    size_t image_data_slice_pitch,
+    hsa_ext_image_t *image);
+
+  hsa_status_t (*hsa_ext_sampler_create_v2)(
+    hsa_agent_t agent,
+    const hsa_ext_sampler_descriptor_v2_t *sampler_descriptor,
+    hsa_ext_sampler_t *sampler);
+
+} hsa_ext_images_1_pfn_t;
+/** @} */
+    
+#ifdef __cplusplus
+}  // end extern "C" block
+#endif /*__cplusplus*/ 
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hsa/hsa_ven_amd_loader.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hsa/hsa_ven_amd_loader.h
new file mode 100644
index 0000000000000000000000000000000000000000..47236c86e994706c726c0f062a8f5ba036f9ce7d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hsa/hsa_ven_amd_loader.h
@@ -0,0 +1,667 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// HSA AMD extension for additional loader functionality.
+
+#ifndef HSA_VEN_AMD_LOADER_H
+#define HSA_VEN_AMD_LOADER_H
+
+#include "hsa.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * @brief Queries equivalent host address for given @p device_address, and
+ * records it in @p host_address.
+ *
+ *
+ * @details Contents of memory pointed to by @p host_address would be identical
+ * to contents of memory pointed to by @p device_address. Only difference
+ * between the two is host accessibility: @p host_address is always accessible
+ * from host, @p device_address might not be accessible from host.
+ *
+ * If @p device_address already points to host accessible memory, then the value
+ * of @p device_address is simply copied into @p host_address.
+ *
+ * The lifetime of @p host_address is the same as the lifetime of @p
+ * device_address, and both lifetimes are limited by the lifetime of the
+ * executable that is managing these addresses.
+ *
+ *
+ * @param[in] device_address Device address to query equivalent host address
+ * for.
+ *
+ * @param[out] host_address Pointer to application-allocated buffer to record
+ * queried equivalent host address in.
+ *
+ *
+ * @retval HSA_STATUS_SUCCESS Function is executed successfully.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p device_address is invalid or
+ * null, or @p host_address is null.
+ */
+hsa_status_t hsa_ven_amd_loader_query_host_address(
+  const void *device_address,  /* [in] device address to translate */
+  const void **host_address);  /* [out] receives the equivalent host address */
+
+/**
+ * @brief The storage type of the code object that is backing loaded memory
+ * segment.
+ */
+typedef enum {
+  /**
+   * Loaded memory segment is not backed by any code object (anonymous), as
+   * is the case with BSS (uninitialized data).
+   */
+  HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE = 0,
+  /**
+   * Loaded memory segment is backed by a code object that is stored in a
+   * file.
+   */
+  HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE = 1,
+  /**
+   * Loaded memory segment is backed by a code object that is stored in
+   * memory.
+   */
+  HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY = 2
+} hsa_ven_amd_loader_code_object_storage_type_t;
+
+/**
+ * @brief Loaded memory segment descriptor.
+ *
+ *
+ * @details Loaded memory segment descriptor describes underlying loaded memory
+ * segment. Loaded memory segment is created/allocated by the executable during
+ * the loading of the code object that is backing underlying memory segment.
+ *
+ * The lifetime of underlying memory segment is limited by the lifetime of the
+ * executable that is managing underlying memory segment.
+ */
+typedef struct hsa_ven_amd_loader_segment_descriptor_s {
+  /**
+   * Agent the underlying memory segment is allocated on. If the code object
+   * backing the underlying memory segment is a program code object, then 0.
+   */
+  hsa_agent_t agent;
+  /**
+   * Executable that is managing this underlying memory segment.
+   */
+  hsa_executable_t executable;
+  /**
+   * Storage type of the code object that is backing underlying memory segment.
+   */
+  hsa_ven_amd_loader_code_object_storage_type_t code_object_storage_type;
+  /**
+   * If the storage type of the code object that is backing underlying memory
+   * segment is:
+   *   - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then null;
+   *   - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE, then null-terminated
+   *     filepath to the code object;
+   *   - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY, then host
+   *     accessible pointer to the first byte of the code object.
+   */
+  const void *code_object_storage_base;
+  /**
+   * If the storage type of the code object that is backing underlying memory
+   * segment is:
+   *   - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then 0;
+   *   - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE, then the length of
+   *     the filepath to the code object (including null-terminating character);
+   *   - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY, then the size, in
+   *     bytes, of the memory occupied by the code object.
+   */
+  size_t code_object_storage_size;
+  /**
+   * If the storage type of the code object that is backing underlying memory
+   * segment is:
+   *   - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then 0;
+   *   - other, then offset, in bytes, from the beginning of the code object to
+   *     the first byte of the code object from which data is copied.
+   */
+  size_t code_object_storage_offset;
+  /**
+   * Starting address of the underlying memory segment.
+   */
+  const void *segment_base;
+  /**
+   * Size, in bytes, of the underlying memory segment.
+   */
+  size_t segment_size;
+} hsa_ven_amd_loader_segment_descriptor_t;
+
+/**
+ * @brief Either queries loaded memory segment descriptors, or total number of
+ * loaded memory segment descriptors.
+ *
+ *
+ * @details If @p segment_descriptors is not null and @p num_segment_descriptors
+ * points to number that exactly matches total number of loaded memory segment
+ * descriptors, then queries loaded memory segment descriptors, and records them
+ * in @p segment_descriptors. If @p segment_descriptors is null and @p
+ * num_segment_descriptors points to zero, then queries total number of loaded
+ * memory segment descriptors, and records it in @p num_segment_descriptors. In
+ * all other cases returns appropriate error code (see below).
+ *
+ * The caller of this function is responsible for the allocation/deallocation
+ * and the lifetime of @p segment_descriptors and @p num_segment_descriptors.
+ *
+ * The lifetime of loaded memory segments that are described by queried loaded
+ * memory segment descriptors is limited by the lifetime of the executable that
+ * is managing loaded memory segments.
+ *
+ * Queried loaded memory segment descriptors are always self-consistent: they
+ * describe a complete set of loaded memory segments that are being backed by
+ * fully loaded code objects that are present at the time (i.e. this function
+ * is blocked until all executable manipulations are fully complete).
+ *
+ *
+ * @param[out] segment_descriptors Pointer to application-allocated buffer to
+ * record queried loaded memory segment descriptors in. Can be null if @p
+ * num_segment_descriptors points to zero.
+ *
+ * @param[in,out] num_segment_descriptors Pointer to application-allocated
+ * buffer that contains either total number of loaded memory segment descriptors
+ * or zero.
+ *
+ *
+ * @retval HSA_STATUS_SUCCESS Function is executed successfully.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p segment_descriptors is null
+ * while @p num_segment_descriptors points to non-zero number, @p
+ * segment_descriptors is not null while @p num_segment_descriptors points to
+ * zero, or @p num_segment_descriptors is null.
+ *
+ * @retval HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS @p num_segment_descriptors
+ * does not point to number that exactly matches total number of loaded memory
+ * segment descriptors.
+ */
+hsa_status_t hsa_ven_amd_loader_query_segment_descriptors(
+  hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,  /* [out] */
+  size_t *num_segment_descriptors);  /* [in,out] expected count, or zero */
+
+/**
+ * @brief Obtains the handle of executable to which the device address belongs.
+ *
+ * @details This method should not be used to obtain executable handle by using
+ * a host address. The executable returned is expected to be alive until it is
+ * destroyed by the user.
+ *
+ * @retval HSA_STATUS_SUCCESS Function is executed successfully.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT The input is invalid or there
+ * is no executable found for this kernel code object.
+ */
+hsa_status_t hsa_ven_amd_loader_query_executable(
+  const void *device_address,     /* [in] device (not host) address to look up */
+  hsa_executable_t *executable);  /* [out] receives the owning executable handle */
+
+//===----------------------------------------------------------------------===//
+
+/**
+ * @brief Iterate over the loaded code objects in an executable, and invoke
+ * an application-defined callback on every iteration.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] callback Callback to be invoked once per loaded code object. The
+ * HSA runtime passes three arguments to the callback: the executable, a
+ * loaded code object, and the application data. If @p callback returns a
+ * status other than ::HSA_STATUS_SUCCESS for a particular iteration, the
+ * traversal stops and
+ * ::hsa_ven_amd_loader_executable_iterate_loaded_code_objects returns that
+ * status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t hsa_ven_amd_loader_executable_iterate_loaded_code_objects(
+  hsa_executable_t executable,
+  hsa_status_t (*callback)(
+    hsa_executable_t executable,
+    hsa_loaded_code_object_t loaded_code_object,
+    void *data),
+  void *data);  /* [in] application data passed to callback; may be NULL */
+
+/**
+ * @brief Loaded code object kind; the value of the ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_KIND attribute.
+ */
+typedef enum {
+  /**
+   * Program code object.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_PROGRAM = 1,
+  /**
+   * Agent code object.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_AGENT = 2
+} hsa_ven_amd_loader_loaded_code_object_kind_t;
+
+/**
+ * @brief Loaded code object attributes.
+ */
+typedef enum hsa_ven_amd_loader_loaded_code_object_info_e {
+  /**
+   * The executable in which this loaded code object is loaded. The
+   * type of this attribute is ::hsa_executable_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_EXECUTABLE = 1,
+  /**
+   * The kind of this loaded code object. The type of this attribute is
+   * ::uint32_t interpreted as ::hsa_ven_amd_loader_loaded_code_object_kind_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_KIND = 2,
+  /**
+   * The agent on which this loaded code object is loaded. The
+   * value of this attribute is only defined if
+   * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_KIND is
+   * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_AGENT. The type of this
+   * attribute is ::hsa_agent_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_AGENT = 3,
+  /**
+   * The storage type of the code object reader used to load the loaded code object.
+   * The type of this attribute is ::uint32_t interpreted as a
+   * ::hsa_ven_amd_loader_code_object_storage_type_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE = 4,
+  /**
+   * The memory address of the first byte of the code object that was loaded.
+   * The value of this attribute is only defined if
+   * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is
+   * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY. The type of this
+   * attribute is ::uint64_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_BASE = 5,
+  /**
+   * The memory size in bytes of the code object that was loaded.
+   * The value of this attribute is only defined if
+   * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is
+   * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY. The type of this
+   * attribute is ::uint64_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_SIZE = 6,
+  /**
+   * The file descriptor of the code object that was loaded.
+   * The value of this attribute is only defined if
+   * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is
+   * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE. The type of this
+   * attribute is ::int.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_FILE = 7,
+  /**
+   * The signed byte address difference of the memory address at which the code
+   * object is loaded minus the virtual address specified in the code object
+   * that is loaded. The value of this attribute is only defined if the
+   * executable in which the code object is loaded is frozen. The type of this
+   * attribute is ::int64_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_DELTA = 8,
+  /**
+   * The base memory address at which the code object is loaded. This is the
+   * base address of the allocation for the lowest addressed segment of the code
+   * object that is loaded. Note that any non-loaded segments before the first
+   * loaded segment are ignored. The value of this attribute is only defined if
+   * the executable in which the code object is loaded is frozen. The type of
+   * this attribute is ::uint64_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE = 9,
+  /**
+   * The byte size of the loaded code object's contiguous memory allocation. The
+   * value of this attribute is only defined if the executable in which the code
+   * object is loaded is frozen. The type of this attribute is ::uint64_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE = 10,
+  /**
+   * The length of the URI in bytes, not including the NUL terminator. The type
+   * of this attribute is uint32_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH = 11,
+  /**
+   * The URI name from which the code object was loaded. The type of this
+   * attribute is a NUL terminated \p char* with the length equal to the value
+   * of ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH attribute.
+   * The URI name syntax is defined by the following BNF syntax:
+   *
+   *     code_object_uri ::== file_uri | memory_uri
+   *     file_uri        ::== "file://" file_path [ range_specifier ]
+   *     memory_uri      ::== "memory://" process_id range_specifier
+   *     range_specifier ::== [ "#" | "?" ] "offset=" number "&" "size=" number
+   *     file_path       ::== URI_ENCODED_OS_FILE_PATH
+   *     process_id      ::== DECIMAL_NUMBER
+   *     number          ::== HEX_NUMBER | DECIMAL_NUMBER | OCTAL_NUMBER
+   *
+   * ``number`` is a C integral literal where hexadecimal values are prefixed by
+   * "0x" or "0X", and octal values by "0".
+   *
+   * ``file_path`` is the file's path specified as a URI encoded UTF-8 string.
+   * In URI encoding, every character that is not in the regular expression
+   * ``[a-zA-Z0-9/_.~-]`` is encoded as two uppercase hexadecimal digits
+   * preceded by "%".  Directories in the path are separated by "/".
+   *
+   * ``offset`` is a 0-based byte offset to the start of the code object.  For a
+   * file URI, it is from the start of the file specified by the ``file_path``,
+   * and if omitted defaults to 0. For a memory URI, it is the memory address
+   * and is required.
+   *
+   * ``size`` is the number of bytes in the code object.  For a file URI, if
+   * omitted it defaults to the size of the file.  It is required for a memory
+   * URI.
+   *
+   * ``process_id`` is the identity of the process owning the memory.  For Linux
+   * it is the C unsigned integral decimal literal for the process ID (PID).
+   *
+   * For example:
+   *
+   *     file:///dir1/dir2/file1
+   *     file:///dir3/dir4/file2#offset=0x2000&size=3000
+   *     memory://1234#offset=0x20000&size=3000
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI = 12,
+} hsa_ven_amd_loader_loaded_code_object_info_t;
+
+/**
+ * @brief Get the current value of an attribute for a given loaded code
+ * object.
+ *
+ * @param[in] loaded_code_object Loaded code object.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT The loaded code object is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * loaded code object attribute, or @p value is NULL.
+ */
+hsa_status_t hsa_ven_amd_loader_loaded_code_object_get_info(
+  hsa_loaded_code_object_t loaded_code_object,
+  hsa_ven_amd_loader_loaded_code_object_info_t attribute,
+  void *value);  /* [out] caller-allocated; must be large enough for attribute */
+
+//===----------------------------------------------------------------------===//
+
+/**
+ * @brief Create a code object reader to operate on a file with size and offset.
+ *
+ * @param[in] file File descriptor. The file must have been opened by the
+ * application with at least read permissions prior to calling this function.
+ * The file must contain a vendor-specific code object.
+ *
+ * The file is owned and managed by the application; the lifetime of the file
+ * descriptor must exceed that of any associated code object reader.
+ *
+ * @param[in] offset 0-based offset relative to the beginning of the @p file
+ * that denotes the beginning of the code object embedded within the @p file.
+ *
+ * @param[in] size Size of the code object embedded in @p file.
+ *
+ * @param[out] code_object_reader Memory location to store the newly created
+ * code object reader handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_FILE @p file is not opened with at least
+ * read permissions. This condition may also be reported as
+ * ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER by the
+ * ::hsa_executable_load_agent_code_object function.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT The bytes starting at offset
+ * do not form a valid code object, the file size is 0, or offset > file size.
+ * This condition may also be reported as
+ * ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT by the
+ * ::hsa_executable_load_agent_code_object function.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p code_object_reader is NULL.
+ */
+hsa_status_t
+hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size(
+    hsa_file_t file,
+    size_t offset,
+    size_t size,
+    hsa_code_object_reader_t *code_object_reader);
+
+//===----------------------------------------------------------------------===//
+
+/**
+ * @brief Iterate over the available executables, and invoke an
+ * application-defined callback on every iteration. While
+ * ::hsa_ven_amd_loader_iterate_executables is executing any calls to
+ * ::hsa_executable_create, ::hsa_executable_create_alt, or
+ * ::hsa_executable_destroy will be blocked.
+ *
+ * @param[in] callback Callback to be invoked once per executable. The HSA
+ * runtime passes two arguments to the callback: the executable and the
+ * application data. If @p callback returns a status other than
+ * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
+ * ::hsa_ven_amd_loader_iterate_executables returns that status value. If
+ * @p callback invokes ::hsa_executable_create, ::hsa_executable_create_alt, or
+ * ::hsa_executable_destroy then the behavior is undefined.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t
+hsa_ven_amd_loader_iterate_executables(
+    hsa_status_t (*callback)(
+      hsa_executable_t executable,
+      void *data),
+    void *data);  /* [in] application data passed to callback; may be NULL */
+
+//===----------------------------------------------------------------------===//
+
+/**
+ * @brief Extension version. NOTE(review): written with leading zeros, so it
+ * is an octal literal (decimal 515) if evaluated as a C integer constant. */
+#define hsa_ven_amd_loader 001003
+
+/**
+ * @brief Extension function table version 1.00 (the three query functions).
+ */
+typedef struct hsa_ven_amd_loader_1_00_pfn_s {
+  hsa_status_t (*hsa_ven_amd_loader_query_host_address)(
+    const void *device_address,
+    const void **host_address);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)(
+    hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+    size_t *num_segment_descriptors);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_executable)(
+    const void *device_address,
+    hsa_executable_t *executable);
+} hsa_ven_amd_loader_1_00_pfn_t;
+
+/**
+ * @brief Extension function table version 1.01.
+ */
+typedef struct hsa_ven_amd_loader_1_01_pfn_s {
+  hsa_status_t (*hsa_ven_amd_loader_query_host_address)(
+    const void *device_address,
+    const void **host_address);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)(
+    hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+    size_t *num_segment_descriptors);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_executable)(
+    const void *device_address,
+    hsa_executable_t *executable);
+  /* The two entries below are not present in the 1.00 table. */
+  hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)(
+    hsa_executable_t executable,
+    hsa_status_t (*callback)(
+      hsa_executable_t executable,
+      hsa_loaded_code_object_t loaded_code_object,
+      void *data),
+    void *data);
+
+  hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)(
+    hsa_loaded_code_object_t loaded_code_object,
+    hsa_ven_amd_loader_loaded_code_object_info_t attribute,
+    void *value);
+} hsa_ven_amd_loader_1_01_pfn_t;
+
+/**
+ * @brief Extension function table version 1.02.
+ */
+typedef struct hsa_ven_amd_loader_1_02_pfn_s {
+  hsa_status_t (*hsa_ven_amd_loader_query_host_address)(
+    const void *device_address,
+    const void **host_address);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)(
+    hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+    size_t *num_segment_descriptors);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_executable)(
+    const void *device_address,
+    hsa_executable_t *executable);
+  /* The two entries below first appear in the 1.01 table. */
+  hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)(
+    hsa_executable_t executable,
+    hsa_status_t (*callback)(
+      hsa_executable_t executable,
+      hsa_loaded_code_object_t loaded_code_object,
+      void *data),
+    void *data);
+
+  hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)(
+    hsa_loaded_code_object_t loaded_code_object,
+    hsa_ven_amd_loader_loaded_code_object_info_t attribute,
+    void *value);
+  /* The entry below is not present in the 1.01 table. */
+  hsa_status_t
+    (*hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size)(
+      hsa_file_t file,
+      size_t offset,
+      size_t size,
+      hsa_code_object_reader_t *code_object_reader);
+} hsa_ven_amd_loader_1_02_pfn_t;
+
+/**
+ * @brief Extension function table version 1.03.
+ */
+typedef struct hsa_ven_amd_loader_1_03_pfn_s {
+  hsa_status_t (*hsa_ven_amd_loader_query_host_address)(
+    const void *device_address,
+    const void **host_address);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)(
+    hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+    size_t *num_segment_descriptors);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_executable)(
+    const void *device_address,
+    hsa_executable_t *executable);
+  /* The two entries below first appear in the 1.01 table. */
+  hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)(
+    hsa_executable_t executable,
+    hsa_status_t (*callback)(
+      hsa_executable_t executable,
+      hsa_loaded_code_object_t loaded_code_object,
+      void *data),
+    void *data);
+
+  hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)(
+    hsa_loaded_code_object_t loaded_code_object,
+    hsa_ven_amd_loader_loaded_code_object_info_t attribute,
+    void *value);
+  /* The entry below first appears in the 1.02 table. */
+  hsa_status_t
+    (*hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size)(
+      hsa_file_t file,
+      size_t offset,
+      size_t size,
+      hsa_code_object_reader_t *code_object_reader);
+  /* The entry below is not present in the 1.02 table. */
+  hsa_status_t
+    (*hsa_ven_amd_loader_iterate_executables)(
+      hsa_status_t (*callback)(
+        hsa_executable_t executable,
+        void *data),
+      void *data);
+} hsa_ven_amd_loader_1_03_pfn_t;
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* HSA_VEN_AMD_LOADER_H */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h
new file mode 100644
index 0000000000000000000000000000000000000000..019f0ea5c960d442cf330bc5798daeb9bf6c01e5
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h
@@ -0,0 +1,416 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_VEN_AMD_PC_SAMPLING_H
+#define HSA_VEN_AMD_PC_SAMPLING_H
+
+#include "hsa.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif /*__cplusplus*/
+
+
+/**
+ * @brief HSA AMD Vendor PC Sampling APIs
+ * EXPERIMENTAL: All PC Sampling APIs are currently in an experimental phase and the APIs may be
+ * modified extensively in the future
+ */
+
+/**
+ * @brief PC Sampling sample data for hosttrap sampling method
+ */
+typedef struct {
+  uint64_t pc;
+  uint64_t exec_mask;
+  uint32_t workgroup_id_x;
+  uint32_t workgroup_id_y;
+  uint32_t workgroup_id_z;
+  uint32_t wave_in_wg : 6;
+  uint32_t chiplet    : 3;   // Currently not used
+  uint32_t reserved   : 23;
+  uint32_t hw_id;
+  uint32_t reserved0;
+  uint64_t reserved1;
+  uint64_t timestamp;
+  uint64_t correlation_id;
+} perf_sample_hosttrap_v1_t;
+
+/**
+ * @brief PC Sampling sample data for stochastic sampling method
+ */
+typedef struct {
+  uint64_t pc;
+  uint64_t exec_mask;
+  uint32_t workgroup_id_x;
+  uint32_t workgroup_id_y;
+  uint32_t workgroup_id_z;
+  uint32_t wave_in_wg : 6;
+  uint32_t chiplet    : 3;   // Currently not used
+  uint32_t reserved   : 23;
+  uint32_t hw_id;
+  uint32_t perf_snapshot_data;
+  uint32_t perf_snapshot_data1;
+  uint32_t perf_snapshot_data2;
+  uint64_t timestamp;
+  uint64_t correlation_id;
+} perf_sample_snapshot_v1_t;
+
+/**
+ * @brief PC Sampling method kinds
+ */
+typedef enum {
+  HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1,
+  HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1
+} hsa_ven_amd_pcs_method_kind_t;
+
+/**
+ * @brief PC Sampling interval unit type
+ */
+typedef enum {
+  HSA_VEN_AMD_PCS_INTERVAL_UNITS_MICRO_SECONDS,
+  HSA_VEN_AMD_PCS_INTERVAL_UNITS_CLOCK_CYCLES,
+  HSA_VEN_AMD_PCS_INTERVAL_UNITS_INSTRUCTIONS
+} hsa_ven_amd_pcs_units_t;
+
+/**
+ * @brief HSA callback function to perform the copy onto a destination buffer
+ *
+ * If data_size is 0, HSA will stop current copy operation and keep remaining data in internal
+ * buffers. Remaining contents of HSA internal buffers will be included in next
+ * hsa_ven_amd_pcs_data_ready_callback_t. HSA internal buffers can also be drained by calling
+ * hsa_ven_amd_pcs_flush.
+ *
+ * @param[in] hsa_callback_data private data to pass back to HSA. Provided in
+ * hsa_ven_amd_pcs_data_ready_callback_t
+ *
+ * @param[in] data_size size of destination buffer in bytes.
+ * @param[in] destination destination buffer
+ * @retval    TBD: but could be used to indicate that there is no more data to be read.
+ * Or indicate an error and abort of current copy operations
+ */
+typedef hsa_status_t (*hsa_ven_amd_pcs_data_copy_callback_t)(void* hsa_callback_data,
+                                                             size_t data_size, void* destination);
+
+/**
+ * @brief HSA callback function to indicate that there is data ready to be copied
+ *
+ * When the client receives this callback, the client should call back @p data_copy_callback for HSA
+ * to perform the copy operation into an available buffer. @p data_copy_callback can be called back
+ * multiple times with smaller @p data_size to split the copy operation.
+ *
+ * This callback must not call ::hsa_ven_amd_pcs_flush.
+ *
+ * @param[in] client_callback_data client private data passed in via
+ * hsa_ven_amd_pcs_create/hsa_ven_amd_pcs_create_from_id
+ * @param[in] data_size size of data available to be copied
+ * @param[in] lost_sample_count number of lost samples since last call to
+ * hsa_ven_amd_pcs_data_ready_callback_t.
+ * @param[in] data_copy_callback callback function for HSA to perform the actual copy
+ * @param[in] hsa_callback_data private data to pass back to HSA
+ */
+typedef void (*hsa_ven_amd_pcs_data_ready_callback_t)(
+    void* client_callback_data, size_t data_size, size_t lost_sample_count,
+    hsa_ven_amd_pcs_data_copy_callback_t data_copy_callback, void* hsa_callback_data);
+
+/**
+ * @brief Opaque handle representing a sampling session.
+ * Two sessions having same handle value represent the same session
+ */
+typedef struct {
+  uint64_t handle;
+} hsa_ven_amd_pcs_t;
+
+/**
+ * @brief PC Sampling configuration flag options
+ */
+typedef enum {
+  /* The interval for this sampling method has to be a power of 2 */
+  HSA_VEN_AMD_PCS_CONFIGURATION_FLAGS_INTERVAL_POWER_OF_2 = (1 << 0)
+} hsa_ven_amd_pcs_configuration_flags_t;
+
+/**
+ * @brief PC Sampling method information
+ * Used to provide client with list of supported PC Sampling methods
+ */
+typedef struct {
+  hsa_ven_amd_pcs_method_kind_t method;
+  hsa_ven_amd_pcs_units_t units;
+  size_t min_interval;
+  size_t max_interval;
+  uint64_t flags;
+} hsa_ven_amd_pcs_configuration_t;
+
+/**
+ * @brief Callback function to iterate through list of supported PC Sampling configurations
+ *
+ * @param[in] configuration one entry for supported PC Sampling method and configuration options
+ * @param[in] callback_data client private callback data that was passed in when calling
+ * hsa_ven_amd_pcs_iterate_configuration
+ */
+typedef hsa_status_t (*hsa_ven_amd_pcs_iterate_configuration_callback_t)(
+    const hsa_ven_amd_pcs_configuration_t* configuration, void* callback_data);
+
+/**
+ * @brief Iterate through list of current supported PC Sampling configurations for this @p agent
+ *
+ * HSA will callback @p configuration_callback for each currently available PC Sampling
+ * configuration. The list of currently available configurations may not be the complete list of
+ * configurations supported on the @p agent. The list of currently available configurations may be
+ * reduced if the @p agent is currently handling other PC sampling sessions.
+ *
+ * @param[in] agent target agent
+ * @param[in] configuration_callback callback function to iterate through list of configurations
+ * @param[in] callback_data client private callback data
+ **/
+hsa_status_t hsa_ven_amd_pcs_iterate_configuration(
+    hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback,
+    void* callback_data);
+
+/**
+ * @brief  Create a PC Sampling session on @p agent
+ *
+ * Allocate the resources required for a PC Sampling session. The @p method, @p units, @p interval
+ * parameters must be a legal configuration value, as described by the
+ * hsa_ven_amd_pcs_configuration_t configurations passed to the callbacks of
+ * hsa_ven_amd_pcs_iterate_configuration for this @p agent.
+ * A successful call may restrict the list of possible PC sampling methods available to subsequent
+ * calls to hsa_ven_amd_pcs_iterate_configuration on the same agent as agents have limitations
+ * on what types of PC sampling they can perform concurrently.
+ * For all successful calls, hsa_ven_amd_pcs_destroy should be called to free this session.
+ * The session will be in a stopped/inactive state after this call
+ *
+ * @param[in] agent target agent
+ * @param[in] method method to use
+ * @param[in] units sampling units
+ * @param[in] interval sampling interval in @p units
+ * @param[in] latency expected latency in microseconds for client to provide a buffer for the data
+ * copy callback once HSA calls @p data_ready_callback. This is a performance hint to avoid the
+ * buffer filling up before the client is notified that data is ready. HSA-runtime will estimate
+ * how many samples are received within @p latency and call @p data_ready_callback ahead of time so
+ * that the client has @p latency time to allocate the buffer before the HSA-runtime internal
+ * buffers are full. The value of latency can be 0.
+ * @param[in] buffer_size size of client buffer in bytes. @p data_ready_callback will be called once
+ * HSA-runtime has enough samples to fill @p buffer_size. This needs to be a multiple of size of
+ * perf_sample_hosttrap_v1_t or size of perf_sample_snapshot_v1_t.
+ * @param[in] data_ready_callback client callback function that will be called when:
+ *   1. There are enough samples to fill a buffer with @p buffer_size - estimated samples received
+ *      within @p latency period.
+ * OR
+ *   2. When hsa_ven_amd_pcs_flush is called.
+ * @param[in] client_callback_data client private data to be provided back when data_ready_callback
+ * is called.
+ * @param[out] pc_sampling PC sampling session handle used to reference this session when calling
+ * hsa_ven_amd_pcs_start, hsa_ven_amd_pcs_stop, hsa_ven_amd_pcs_destroy
+ *
+ * @retval ::HSA_STATUS_SUCCESS session created successfully
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT invalid parameters
+ * @retval ::HSA_STATUS_ERROR_RESOURCE_BUSY agent currently handling another PC Sampling session and
+ * cannot handle the type requested.
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed to allocate resources
+ * @retval ::HSA_STATUS_ERROR Unexpected error
+ **/
+hsa_status_t hsa_ven_amd_pcs_create(hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method,
+                                    hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency,
+                                    size_t buffer_size,
+                                    hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback,
+                                    void* client_callback_data, hsa_ven_amd_pcs_t* pc_sampling);
+
+
+/**
+ * @brief  Creates a PC Sampling session on @p agent. Assumes that the caller provides the
+ * @p pcs_id generated by the previous call to the underlying driver that reserved PC sampling
+ * on the @p agent.
+ *
+ * Similar to the @ref hsa_ven_amd_pcs_create with the difference that it inherits an existing
+ * PC sampling session that was previously created in the underlying driver.
+ *
+ * Allocate the resources required for a PC Sampling session. The @p method, @p units, @p interval
+ * parameters must be a legal configuration value, and match the parameters that we used to create
+ * the underlying PC Sampling session in the underlying driver.
+ * A successful call may restrict the list of possible PC sampling methods available to subsequent
+ * calls to hsa_ven_amd_pcs_iterate_configuration on the same agent as agents have limitations
+ * on what types of PC sampling they can perform concurrently.
+ * For all successful calls, hsa_ven_amd_pcs_destroy should be called to free this session.
+ * The session will be in a stopped/inactive state after this call
+ *
+ * @param[in] pcs_id ID that uniquely identifies the PC sampling session within underlying driver
+ * @param[in] agent target agent
+ * @param[in] method method to use
+ * @param[in] units sampling units
+ * @param[in] interval sampling interval in @p units
+ * @param[in] latency expected latency in microseconds for client to provide a buffer for the data
+ * copy callback once HSA calls @p data_ready_callback. This is a performance hint to avoid the
+ * buffer filling up before the client is notified that data is ready. HSA-runtime will estimate
+ * how many samples are received within @p latency and call @p data_ready_callback ahead of time so
+ * that the client has @p latency time to allocate the buffer before the HSA-runtime internal
+ * buffers are full. The value of latency can be 0.
+ * @param[in] buffer_size size of client buffer in bytes. @p data_ready_callback will be called once
+ * HSA-runtime has enough samples to fill @p buffer_size. This needs to be a multiple of size of
+ * perf_sample_hosttrap_v1_t or size of perf_sample_snapshot_v1_t.
+ * @param[in] data_ready_callback client callback function that will be called when:
+ *   1. There are enough samples to fill a buffer with @p buffer_size - estimated samples received
+ *      within @p latency period.
+ * OR
+ *   2. When hsa_ven_amd_pcs_flush is called.
+ * @param[in] client_callback_data client private data to be provided back when data_ready_callback
+ * is called.
+ * @param[out] pc_sampling PC sampling session handle used to reference this session when calling
+ * hsa_ven_amd_pcs_start, hsa_ven_amd_pcs_stop, hsa_ven_amd_pcs_destroy
+ *
+ * @retval ::HSA_STATUS_SUCCESS session created successfully
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT invalid parameters
+ * @retval ::HSA_STATUS_ERROR_RESOURCE_BUSY agent currently handling another PC Sampling session and
+ * cannot handle the type requested.
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed to allocate resources
+ * @retval ::HSA_STATUS_ERROR Unexpected error
+ **/
+hsa_status_t hsa_ven_amd_pcs_create_from_id(
+    uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method,
+    hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size,
+    hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data,
+    hsa_ven_amd_pcs_t* pc_sampling);
+
+/**
+ * @brief  Free a PC Sampling session on @p agent
+ *
+ * Free all the resources allocated for a PC Sampling session on @p agent
+ * Internal buffers for this session will be lost.
+ * If the session was active, the session will be stopped before it is destroyed.
+ *
+ * @param[in] pc_sampling PC sampling session handle
+ *
+ * @retval ::HSA_STATUS_SUCCESS Session destroyed successfully
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle
+ * @retval ::HSA_STATUS_ERROR unexpected error
+ */
+hsa_status_t hsa_ven_amd_pcs_destroy(hsa_ven_amd_pcs_t pc_sampling);
+
+/**
+ * @brief  Start a PC Sampling session
+ *
+ * Activate a PC Sampling session that was previously created.
+ * The session will be in an active state after this call.
+ * If the session was already active, this will result in a no-op and will return HSA_STATUS_SUCCESS
+ *
+ * @param[in] pc_sampling PC sampling session handle
+ *
+ * @retval ::HSA_STATUS_SUCCESS Session started successfully
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle
+ * @retval ::HSA_STATUS_ERROR unexpected error
+ */
+hsa_status_t hsa_ven_amd_pcs_start(hsa_ven_amd_pcs_t pc_sampling);
+
+/**
+ * @brief  Stop a PC Sampling session
+ *
+ * Stop a session that is currently active
+ * After a session is stopped HSA may still have some PC Sampling data in its internal buffers.
+ * The internal buffers can be drained using hsa_ven_amd_pcs_flush. If the internal
+ * buffers are not drained and the session is started again, the internal buffers will be available
+ * on the next data_ready_callback.
+ * If the session was already inactive, this will result in a no-op and will return
+ * HSA_STATUS_SUCCESS
+ *
+ * @param[in] pc_sampling PC sampling session handle
+ *
+ * @retval ::HSA_STATUS_SUCCESS Session stopped successfully
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle
+ */
+hsa_status_t hsa_ven_amd_pcs_stop(hsa_ven_amd_pcs_t pc_sampling);
+
+/**
+ * @brief  Flush internal buffers for a PC Sampling session
+ *
+ * Drain internal buffers for a PC Sampling session. If internal buffers have available data,
+ * this triggers a data_ready_callback.
+ *
+ * The function blocks until all PC samples associated with the @p pc_sampling session
+ * generated prior to the function call have been communicated by invocations of
+ * @p data_ready_callback having completed execution.
+ *
+ * @param[in] pc_sampling PC sampling session handle
+ *
+ * @retval ::HSA_STATUS_SUCCESS Session flushed successfully
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle
+ */
+hsa_status_t hsa_ven_amd_pcs_flush(hsa_ven_amd_pcs_t pc_sampling);
+
+#define hsa_ven_amd_pc_sampling_1_00
+
+/**
+ * @brief The function pointer table for the PC Sampling v1.00 extension. Can be returned by
+ * ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table.
+ */
+typedef struct hsa_ven_amd_pc_sampling_1_00_pfn_t {
+  hsa_status_t (*hsa_ven_amd_pcs_iterate_configuration)(
+      hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback,
+      void* callback_data);
+
+  hsa_status_t (*hsa_ven_amd_pcs_create)(hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method,
+                                         hsa_ven_amd_pcs_units_t units, size_t interval,
+                                         size_t latency, size_t buffer_size,
+                                         hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback,
+                                         void* client_callback_data,
+                                         hsa_ven_amd_pcs_t* pc_sampling);
+
+  hsa_status_t (*hsa_ven_amd_pcs_create_from_id)(
+      uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method,
+      hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size,
+      hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data,
+      hsa_ven_amd_pcs_t* pc_sampling);
+
+  hsa_status_t (*hsa_ven_amd_pcs_destroy)(hsa_ven_amd_pcs_t pc_sampling);
+
+  hsa_status_t (*hsa_ven_amd_pcs_start)(hsa_ven_amd_pcs_t pc_sampling);
+
+  hsa_status_t (*hsa_ven_amd_pcs_stop)(hsa_ven_amd_pcs_t pc_sampling);
+
+  hsa_status_t (*hsa_ven_amd_pcs_flush)(hsa_ven_amd_pcs_t pc_sampling);
+
+} hsa_ven_amd_pc_sampling_1_00_pfn_t;
+
+#ifdef __cplusplus
+}  // end extern "C" block
+#endif /*__cplusplus*/
+
+#endif /* HSA_VEN_AMD_PC_SAMPLING_H */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/roctracer/ext/prof_protocol.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/roctracer/ext/prof_protocol.h
new file mode 100644
index 0000000000000000000000000000000000000000..69a2b0b3471602c6b66cd2a9cf483b3d0ec0b91a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/roctracer/ext/prof_protocol.h
@@ -0,0 +1,107 @@
+/* Copyright (c) 2018-2022 Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef EXT_PROF_PROTOCOL_H_
+#define EXT_PROF_PROTOCOL_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+/* Traced API domains */
+typedef enum {
+  ACTIVITY_DOMAIN_HSA_API = 0, /* HSA API domain */
+  ACTIVITY_DOMAIN_HSA_OPS = 1, /* HSA async activity domain */
+  ACTIVITY_DOMAIN_HIP_OPS = 2, /* HIP async activity domain */
+  ACTIVITY_DOMAIN_HCC_OPS =
+      ACTIVITY_DOMAIN_HIP_OPS, /* HCC async activity domain */
+  ACTIVITY_DOMAIN_HIP_VDI =
+      ACTIVITY_DOMAIN_HIP_OPS, /* HIP VDI async activity domain */
+  ACTIVITY_DOMAIN_HIP_API = 3, /* HIP API domain */
+  ACTIVITY_DOMAIN_KFD_API = 4, /* KFD API domain */
+  ACTIVITY_DOMAIN_EXT_API = 5, /* External ID domain */
+  ACTIVITY_DOMAIN_ROCTX = 6,   /* ROCTX domain */
+  ACTIVITY_DOMAIN_HSA_EVT = 7, /* HSA events */
+  ACTIVITY_DOMAIN_NUMBER
+} activity_domain_t;
+
+/* API callback type */
+typedef void (*activity_rtapi_callback_t)(uint32_t domain, uint32_t cid,
+                                          const void* data, void* arg);
+typedef uint32_t activity_kind_t;
+typedef uint32_t activity_op_t;
+
+/* API callback phase */
+typedef enum {
+  ACTIVITY_API_PHASE_ENTER = 0,
+  ACTIVITY_API_PHASE_EXIT = 1
+} activity_api_phase_t;
+
+/* Trace record types */
+
+/* Correlation id */
+typedef uint64_t activity_correlation_id_t;
+
+/* Timestamp in nanoseconds */
+typedef uint64_t roctracer_timestamp_t;
+
+/* Activity record type */
+typedef struct activity_record_s {
+  uint32_t domain;      /* activity domain id */
+  activity_kind_t kind; /* activity kind */
+  activity_op_t op;     /* activity op */
+  union {
+    struct {
+      activity_correlation_id_t correlation_id; /* activity ID */
+      roctracer_timestamp_t begin_ns;           /* host begin timestamp */
+      roctracer_timestamp_t end_ns;             /* host end timestamp */
+    };
+    struct {
+      uint32_t se;    /* sampled SE */
+      uint64_t cycle; /* sample cycle */
+      uint64_t pc;    /* sample PC */
+    } pc_sample;
+  };
+  union {
+    struct {
+      int device_id;     /* device id */
+      uint64_t queue_id; /* queue id */
+    };
+    struct {
+      uint32_t process_id; /* process id */
+      uint32_t thread_id;  /* thread id */
+    };
+    struct {
+      activity_correlation_id_t external_id; /* external correlation id */
+    };
+  };
+  union {
+    size_t bytes;            /* data size bytes */
+    const char* kernel_name; /* kernel name */
+    const char* mark_message;
+  };
+} activity_record_t;
+
+/* Activity sync callback type */
+typedef void (*activity_sync_callback_t)(uint32_t cid, activity_record_t* record, const void* data,
+                                         void* arg);
+/* Activity async callback type */
+typedef void (*activity_async_callback_t)(uint32_t op, void* record, void* arg);
+
+#endif /* EXT_PROF_PROTOCOL_H_ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/roctracer/roctracer.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/roctracer/roctracer.h
new file mode 100644
index 0000000000000000000000000000000000000000..cb9b13faccaf78fbb873e54163df930c7f0b4b3f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/roctracer/roctracer.h
@@ -0,0 +1,779 @@
+/* Copyright (c) 2018-2022 Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+/** \mainpage ROC Tracer API Specification
+ *
+ * \section introduction Introduction
+ *
+ * ROCtracer library, Runtimes Generic Callback/Activity APIs.
+ *
+ * The goal of the implementation is to provide a generic profiler, independent
+ * of any specific runtime, to trace API and asynchronous activity.
+ *
+ * The API provides functionality for registering the runtimes API callbacks
+ * and asynchronous activity records pool support.
+ *
+ * \section known_limitations Known Limitations and Restrictions
+ *
+ * The ROCtracer API library implementation currently has the following
+ * restrictions.  Future releases aim to address these restrictions.
+ *
+ * 1. The ACTIVITY_DOMAIN_HSA_OPS operations HSA_OP_ID_DISPATCH,
+ *    HSA_OP_ID_BARRIER, and HSA_OP_ID_RESERVED1 are not currently implemented.
+ */
+
+/**
+ * \file
+ * ROCtracer API interface.
+ */
+
+#ifndef ROCTRACER_H_
+#define ROCTRACER_H_
+
+/* Placeholder for calling convention and import/export macros */
+#if !defined(ROCTRACER_CALL)
+#define ROCTRACER_CALL
+#endif /* !defined (ROCTRACER_CALL) */
+
+#if !defined(ROCTRACER_EXPORT_DECORATOR)
+#if defined(__GNUC__)
+#define ROCTRACER_EXPORT_DECORATOR __attribute__((visibility("default")))
+#elif defined(_MSC_VER)
+#define ROCTRACER_EXPORT_DECORATOR __declspec(dllexport)
+#endif /* defined (_MSC_VER) */
+#endif /* !defined (ROCTRACER_EXPORT_DECORATOR) */
+
+#if !defined(ROCTRACER_IMPORT_DECORATOR)
+#if defined(__GNUC__)
+#define ROCTRACER_IMPORT_DECORATOR
+#elif defined(_MSC_VER)
+#define ROCTRACER_IMPORT_DECORATOR __declspec(dllimport)
+#endif /* defined (_MSC_VER) */
+#endif /* !defined (ROCTRACER_IMPORT_DECORATOR) */
+
+#define ROCTRACER_EXPORT ROCTRACER_EXPORT_DECORATOR ROCTRACER_CALL
+#define ROCTRACER_IMPORT ROCTRACER_IMPORT_DECORATOR ROCTRACER_CALL
+
+#if !defined(ROCTRACER)
+#if defined(ROCTRACER_EXPORTS)
+#define ROCTRACER_API ROCTRACER_EXPORT
+#else /* !defined (ROCTRACER_EXPORTS) */
+#define ROCTRACER_API ROCTRACER_IMPORT
+#endif /* !defined (ROCTRACER_EXPORTS) */
+#endif /* !defined (ROCTRACER) */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "ext/prof_protocol.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/** \defgroup symbol_versions_group Symbol Versions
+ *
+ * The names used for the shared library versioned symbols.
+ *
+ * Every function is annotated with one of the version macros defined in this
+ * section.  Each macro specifies a corresponding symbol version string.  After
+ * dynamically loading the shared library with \p dlopen, the address of each
+ * function can be obtained using \p dlvsym with the name of the function and
+ * its corresponding symbol version string.  An error will be reported by \p
+ * dlvsym if the installed library does not support the version for the
+ * function specified in this version of the interface.
+ *
+ * @{
+ */
+
+/**
+ * The function was introduced in version 4.1 of the interface and has the
+ * symbol version string of ``"ROCTRACER_4.1"``.
+ */
+#define ROCTRACER_VERSION_4_1
+
+/** @} */
+
+/** \defgroup versioning_group Versioning
+ *
+ * Version information about the interface and the associated installed
+ * library.
+ *
+ * The semantic version of the interface following semver.org rules. A client
+ * that uses this interface is only compatible with the installed library if
+ * the major version numbers match and the interface minor version number is
+ * less than or equal to the installed library minor version number.
+ *
+ * @{
+ */
+
+/**
+ * The major version of the interface as a macro so it can be used by the
+ * preprocessor.
+ */
+#define ROCTRACER_VERSION_MAJOR 4
+
+/**
+ * The minor version of the interface as a macro so it can be used by the
+ * preprocessor.
+ */
+#define ROCTRACER_VERSION_MINOR 1
+
+/**
+ * Query the major version of the installed library.
+ *
+ * Return the major version of the installed library.  This can be used to
+ * check if it is compatible with this interface version.  This function can be
+ * used even when the library is not initialized.
+ */
+ROCTRACER_API uint32_t roctracer_version_major() ROCTRACER_VERSION_4_1;
+
+/**
+ * Query the minor version of the installed library.
+ *
+ * Return the minor version of the installed library.  This can be used to
+ * check if it is compatible with this interface version.  This function can be
+ * used even when the library is not initialized.
+ */
+ROCTRACER_API uint32_t roctracer_version_minor() ROCTRACER_VERSION_4_1;
+
+/** @} */
+
+/** \defgroup status_codes_group Status Codes
+ *
+ * Most operations return a status code to indicate success or error.
+ *
+ * @{
+ */
+
+/**
+ * ROC Tracer API status codes.
+ */
+typedef enum {
+  /**
+   * The function has executed successfully.
+   */
+  ROCTRACER_STATUS_SUCCESS = 0,
+  /**
+   * A generic error has occurred.
+   */
+  ROCTRACER_STATUS_ERROR = -1,
+  /**
+   * The domain ID is invalid.
+   */
+  ROCTRACER_STATUS_ERROR_INVALID_DOMAIN_ID = -2,
+  /**
+   * An invalid argument was given to the function.
+   */
+  ROCTRACER_STATUS_ERROR_INVALID_ARGUMENT = -3,
+  /**
+   * No default pool is defined.
+   */
+  ROCTRACER_STATUS_ERROR_DEFAULT_POOL_UNDEFINED = -4,
+  /**
+   * The default pool is already defined.
+   */
+  ROCTRACER_STATUS_ERROR_DEFAULT_POOL_ALREADY_DEFINED = -5,
+  /**
+   * Memory allocation error.
+   */
+  ROCTRACER_STATUS_ERROR_MEMORY_ALLOCATION = -6,
+  /**
+   * External correlation ID pop mismatch.
+   */
+  ROCTRACER_STATUS_ERROR_MISMATCHED_EXTERNAL_CORRELATION_ID = -7,
+  /**
+   * The operation is not currently implemented.  This error may be reported by
+   * any function.  Check the \ref known_limitations section to determine the
+   * status of the library implementation of the interface.
+   */
+  ROCTRACER_STATUS_ERROR_NOT_IMPLEMENTED = -8,
+  /**
+   * Deprecated error code.
+   */
+  ROCTRACER_STATUS_UNINIT = 2,
+  /**
+   * Deprecated error code.
+   */
+  ROCTRACER_STATUS_BREAK = 3,
+  /**
+   * Deprecated error code.  Alias of ::ROCTRACER_STATUS_ERROR_INVALID_DOMAIN_ID.
+   */
+  ROCTRACER_STATUS_BAD_DOMAIN = ROCTRACER_STATUS_ERROR_INVALID_DOMAIN_ID,
+  /**
+   * Deprecated error code.  Alias of ::ROCTRACER_STATUS_ERROR_INVALID_ARGUMENT.
+   */
+  ROCTRACER_STATUS_BAD_PARAMETER = ROCTRACER_STATUS_ERROR_INVALID_ARGUMENT,
+  /**
+   * Deprecated error code.
+   */
+  ROCTRACER_STATUS_HIP_API_ERR = 6,
+  /**
+   * Deprecated error code.
+   */
+  ROCTRACER_STATUS_HIP_OPS_ERR = 7,
+  /**
+   * Deprecated error code.  Alias of ::ROCTRACER_STATUS_HIP_OPS_ERR.
+   */
+  ROCTRACER_STATUS_HCC_OPS_ERR = ROCTRACER_STATUS_HIP_OPS_ERR,
+  /**
+   * Deprecated error code.  Shares the value 7 with ::ROCTRACER_STATUS_HIP_OPS_ERR.
+   */
+  ROCTRACER_STATUS_HSA_ERR = 7,
+  /**
+   * Deprecated error code.
+   */
+  ROCTRACER_STATUS_ROCTX_ERR = 8,
+} roctracer_status_t;
+
+/**
+ * Query the textual description of the last error for the current thread.
+ *
+ * Returns a NUL terminated string describing the error of the last ROC Tracer
+ * API call by the calling thread that did not return success.  The empty
+ * string is returned if there is no previous error.  The last error is not
+ * cleared.
+ *
+ * \return Return the error string.  The caller owns the returned string and
+ * should use \p free() to deallocate it.
+ */
+ROCTRACER_API const char* roctracer_error_string() ROCTRACER_VERSION_4_1;
+
+/** @} */
+
+/** \defgroup domain_group Traced Runtime Domains
+ *
+ * The ROC Tracer API can trace multiple runtime libraries.  Each library can
+ * have API operations and asynchronous operations that can be traced.
+ *
+ * @{
+ */
+
+/**
+ * Enumeration of domains that can be traced.
+ */
+typedef activity_domain_t roctracer_domain_t;
+
+/**
+ * Query textual name of an operation of a domain.
+ *
+ * @param[in] domain Domain being queried.
+ *
+ * @param[in] op Operation within \p domain.
+ *
+ * @param[in] kind \todo Define kind.
+ *
+ * @return Returns the NUL terminated string for the operation name, or NULL if
+ * the domain or operation are invalid.  The string is owned by the ROC Tracer
+ * library.
+ */
+ROCTRACER_API const char* roctracer_op_string(
+    uint32_t domain, uint32_t op, uint32_t kind) ROCTRACER_VERSION_4_1;
+
+/**
+ * Query the operation code given a domain and the name of an operation.
+ *
+ * @param[in] domain The domain being queried.
+ *
+ * @param[in] str The NUL terminated name of the operation being queried.
+ *
+ * @param[out] op The operation code.
+ *
+ * @param[out] kind If not NULL then the operation kind code.
+ *
+ * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed
+ * successfully.  \p op and \p kind have been updated.
+ *
+ * @retval ::ROCTRACER_STATUS_ERROR_INVALID_ARGUMENT The \p op is invalid for
+ * \p domain.
+ *
+ * @retval ::ROCTRACER_STATUS_ERROR_INVALID_DOMAIN_ID The domain is invalid or
+ * not supported.
+ */
+ROCTRACER_API roctracer_status_t
+roctracer_op_code(uint32_t domain, const char* str, uint32_t* op,
+                  uint32_t* kind) ROCTRACER_VERSION_4_1;
+
+/**
+ * Set the properties of a domain.
+ *
+ * @param[in] domain The domain.
+ *
+ * @param[in] properties The properties. Each domain defines its own type for
+ * the properties. Some domains require the properties to be set before they
+ * can be enabled.
+ *
+ * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed
+ * successfully.
+ */
+ROCTRACER_API roctracer_status_t roctracer_set_properties(
+    roctracer_domain_t domain, void* properties) ROCTRACER_VERSION_4_1;
+
+/** @} */
+
+/** \defgroup callback_api_group Callback API
+ *
+ * ROC tracer provides support for runtime API callbacks and activity
+ * record logging. The API callbacks provide the API call arguments and are
+ * called at different phases: on entry, on exit, and on kernel completion.
+ *
+ * @{
+ */
+
+/**
+ * Runtime API callback type.
+ *
+ * The callback that will be invoked when an enabled runtime API is called. The
+ * callback is invoked on entry and on exit.
+ */
+typedef activity_rtapi_callback_t roctracer_rtapi_callback_t;
+
+/**
+ * Enable runtime API callback for a specific operation of a domain.
+ *
+ * @param domain The domain.
+ *
+ * @param op The operation ID in \p domain.
+ *
+ * @param callback The callback to invoke each time the operation is performed
+ * on entry and exit.
+ *
+ * @param arg Value to pass as last argument of \p callback.
+ *
+ * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed
+ * successfully.
+ *
+ * @retval ::ROCTRACER_STATUS_ERROR_INVALID_DOMAIN_ID \p domain is invalid.
+ *
+ * @retval ::ROCTRACER_STATUS_ERROR_INVALID_ARGUMENT \p op is invalid for \p
+ * domain.
+ */
+ROCTRACER_API roctracer_status_t roctracer_enable_op_callback(
+    activity_domain_t domain, uint32_t op, activity_rtapi_callback_t callback,
+    void* arg) ROCTRACER_VERSION_4_1;
+
+/**
+ * Enable runtime API callback for all operations of a domain.
+ *
+ * @param domain The domain
+ *
+ * @param callback The callback to invoke each time the operation is performed
+ * on entry and exit.
+ *
+ * @param arg Value to pass as last argument of \p callback.
+ *
+ * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed
+ * successfully.
+ *
+ * @retval ::ROCTRACER_STATUS_ERROR_INVALID_DOMAIN_ID \p domain is invalid.
+ */
+ROCTRACER_API roctracer_status_t roctracer_enable_domain_callback(
+    activity_domain_t domain, activity_rtapi_callback_t callback,
+    void* arg) ROCTRACER_VERSION_4_1;
+
+/**
+ * Disable runtime API callback for a specific operation of a domain.
+ *
+ * @param domain The domain
+ *
+ * @param op The operation in \p domain.
+ *
+ * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed
+ * successfully.
+ *
+ * @retval ::ROCTRACER_STATUS_ERROR_INVALID_DOMAIN_ID \p domain is invalid.
+ *
+ * @retval ::ROCTRACER_STATUS_ERROR_INVALID_ARGUMENT \p op is invalid for \p
+ * domain.
+ */
+ROCTRACER_API roctracer_status_t roctracer_disable_op_callback(
+    activity_domain_t domain, uint32_t op) ROCTRACER_VERSION_4_1;
+
+/**
+ * Disable runtime API callback for all operations of a domain.
+ *
+ * @param domain The domain
+ *
+ * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed
+ * successfully.
+ *
+ * @retval ::ROCTRACER_STATUS_ERROR_INVALID_DOMAIN_ID \p domain is invalid.
+ */
+ROCTRACER_API roctracer_status_t roctracer_disable_domain_callback(
+    activity_domain_t domain) ROCTRACER_VERSION_4_1;
+
+/** @} */
+
+/** \defgroup activity_api_group Activity API
+ *
+ * The activity records are asynchronously logged to the pool and can be
+ * associated with the respective API callbacks using the correlation ID.
+ * Activity API can be used to enable collecting of the records with
+ * timestamping data for API calls and the kernel submits.
+ *
+ * @{
+ */
+
+/**
+ * Activity record.
+ *
+ * Asynchronous activity events generate activity records.
+ */
+typedef activity_record_t roctracer_record_t;
+
+/**
+ * Get a pointer to the next activity record.
+ *
+ * A memory pool generates buffers that contain multiple activity records.
+ * This function steps to the next activity record.
+ *
+ * @param[in] record Pointer to an activity record in a memory pool buffer.
+ *
+ * @param[out] next Pointer to the following activity record in the memory pool
+ * buffer.
+ *
+ * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed
+ * successfully.
+ */
+ROCTRACER_API roctracer_status_t
+roctracer_next_record(const activity_record_t* record,
+                      const activity_record_t** next) ROCTRACER_VERSION_4_1;
+
+/**
+ * Memory pool allocator callback.
+ *
+ * If \p *ptr is NULL, then allocate memory of \p size bytes and save address
+ * in \p *ptr.
+ *
+ * If \p *ptr is non-NULL and size is non-0, then reallocate the memory at \p
+ * *ptr with size \p size and save the address in \p *ptr. The memory will have
+ * been allocated by the same callback.
+ *
+ * If \p *ptr is non-NULL and size is 0, then deallocate the memory at \p *ptr.
+ * The memory will have been allocated by the same callback.
+ *
+ * \p size is the size of the memory allocation or reallocation, or 0 if
+ * deallocating.
+ *
+ * \p arg Argument provided in the ::roctracer_properties_t passed to the
+ * ::roctracer_open_pool function.
+ */
+typedef void (*roctracer_allocator_t)(char** ptr, size_t size, void* arg);
+
+/**
+ * Memory pool buffer callback.
+ *
+ * The callback that will be invoked when a memory pool buffer becomes full or
+ * is flushed.
+ *
+ * \p begin pointer to the first entry in the buffer.
+ *
+ * \p end pointer to one past the end entry in the buffer.
+ *
+ * \p arg the argument specified when the callback was defined.
+ */
+typedef void (*roctracer_buffer_callback_t)(const char* begin, const char* end,
+                                            void* arg);
+
+/**
+ * Memory pool properties.
+ *
+ * Defines the properties when a tracer memory pool is created.
+ */
+typedef struct {
+  /**
+   * ROC Tracer mode.
+   */
+  uint32_t mode;
+
+  /**
+   * Size of buffer in bytes.
+   */
+  size_t buffer_size;
+
+  /**
+   * The allocator function to use to allocate and deallocate the buffer. If
+   * NULL then \p malloc, \p realloc, and \p free are used.
+   */
+  roctracer_allocator_t alloc_fun;
+
+  /**
+   * The argument to pass when invoking the \p alloc_fun allocator.
+   */
+  void* alloc_arg;
+
+  /**
+   * The function to call when a buffer becomes full or is flushed.
+   */
+  roctracer_buffer_callback_t buffer_callback_fun;
+
+  /**
+   * The argument to pass when invoking the \p buffer_callback_fun callback.
+   */
+  void* buffer_callback_arg;
+} roctracer_properties_t;
+
+/**
+ * Tracer memory pool type.
+ */
+typedef void roctracer_pool_t;
+
+/**
+ * Create tracer memory pool.
+ *
+ * If \p pool is not NULL, returns the created memory pool. Does not change the
+ * default memory pool.
+ *
+ * If \p pool is NULL, sets the default memory pool to the created pool if not
+ * already defined. Otherwise, return an error.
+ *
+ * @param[in] properties Tracer memory pool properties.
+ *
+ * @param[out] pool Tracer memory pool created if not NULL.
+ *
+ * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed
+ * successfully.
+ *
+ * @retval ROCTRACER_STATUS_ERROR_DEFAULT_POOL_ALREADY_DEFINED \p pool is NULL
+ * and the default pool is already defined. Unable to create the pool.
+ *
+ * @retval ROCTRACER_STATUS_ERROR_MEMORY_ALLOCATION Unable to allocate memory
+ * for the \p pool. Unable to create the pool.
+ */
+ROCTRACER_API roctracer_status_t
+roctracer_open_pool_expl(const roctracer_properties_t* properties,
+                         roctracer_pool_t** pool) ROCTRACER_VERSION_4_1;
+
+/**
+ * Create tracer memory pool.
+ *
+ * Sets the default memory pool to the created pool if not already defined.
+ * Otherwise, return an error.
+ *
+ * @param[in] properties Tracer memory pool properties.
+ *
+ * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed
+ * successfully.
+ *
+ * @retval ROCTRACER_STATUS_ERROR_DEFAULT_POOL_ALREADY_DEFINED The default pool
+ * is already defined. Unable to create the pool.
+ *
+ * @retval ROCTRACER_STATUS_ERROR_MEMORY_ALLOCATION Unable to allocate memory
+ * for the \p pool. Unable to create the pool.
+ */
+ROCTRACER_API roctracer_status_t roctracer_open_pool(
+    const roctracer_properties_t* properties) ROCTRACER_VERSION_4_1;
+
+/**
+ * Close tracer memory pool.
+ *
+ * All enabled activities that use the pool must have completed writing to the
+ * pool, before deleting the pool. Deleting a pool automatically disables any
+ * activities that specify the pool, and flushes it.
+ *
+ * @param[in] pool Memory pool to close. If NULL, the default memory pool is
+ * closed if defined. The default memory pool is set to undefined if closed.
+ *
+ * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed
+ * successfully or pool was NULL and there is no default pool.
+ */
+ROCTRACER_API roctracer_status_t
+roctracer_close_pool_expl(roctracer_pool_t* pool) ROCTRACER_VERSION_4_1;
+
+/**
+ * Close default tracer memory pool, if defined, and set to undefined.
+ *
+ * All enabled activities that use the pool must have completed writing to the
+ * pool, before deleting the pool. Deleting a pool automatically disables any
+ * activities that specify the pool, and flushes it.
+ *
+ * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed
+ * successfully or there is no default pool.
+ */
+ROCTRACER_API roctracer_status_t roctracer_close_pool() ROCTRACER_VERSION_4_1;
+
+/**
+ * Query and set the default memory pool.
+ *
+ * @param[in] pool If not NULL, change the current default pool to \p pool. If
+ * NULL, the default pool is not changed.
+ *
+ * @return Return the current default memory pool before any change, or NULL if
+ * none is defined.
+ */
+ROCTRACER_API roctracer_pool_t* roctracer_default_pool_expl(
+    roctracer_pool_t* pool) ROCTRACER_VERSION_4_1;
+
+/**
+ * Query the current default memory pool.
+ *
+ * @return Return the current default memory pool, or NULL if none is defined.
+ */
+ROCTRACER_API roctracer_pool_t* roctracer_default_pool() ROCTRACER_VERSION_4_1;
+
+/**
+ * Enable activity record logging for a specified operation of a domain
+ * providing a memory pool.
+ *
+ * @param[in] domain The domain.
+ *
+ * @param[in] op The activity operation ID in \p domain.
+ *
+ * @param[in] pool The memory pool to write the activity record. If NULL, use
+ * the default memory pool.
+ *
+ * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed
+ * successfully.
+ *
+ * @retval ROCTRACER_STATUS_ERROR \p pool is NULL and no default pool is
+ * defined.
+ */
+ROCTRACER_API roctracer_status_t roctracer_enable_op_activity_expl(
+    activity_domain_t domain, uint32_t op,
+    roctracer_pool_t* pool) ROCTRACER_VERSION_4_1;
+
+/**
+ * Enable activity record logging for a specified operation of a domain using
+ * the default memory pool.
+ *
+ * @param[in] domain The domain.
+ *
+ * @param[in] op The activity operation ID in \p domain.
+ *
+ * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed
+ * successfully.
+ *
+ * @retval ROCTRACER_STATUS_ERROR No default pool is defined.
+ */
+ROCTRACER_API roctracer_status_t roctracer_enable_op_activity(
+    activity_domain_t domain, uint32_t op) ROCTRACER_VERSION_4_1;
+
+/**
+ * Enable activity record logging for all operations of a domain providing a
+ * memory pool.
+ *
+ * @param[in] domain The domain.
+ *
+ * @param[in] pool The memory pool to write the activity record. If NULL, use
+ * the default memory pool.
+ *
+ * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed
+ * successfully.
+ *
+ * @retval ROCTRACER_STATUS_ERROR \p pool is NULL and no default pool is
+ * defined.
+ */
+ROCTRACER_API roctracer_status_t roctracer_enable_domain_activity_expl(
+    activity_domain_t domain, roctracer_pool_t* pool) ROCTRACER_VERSION_4_1;
+
+/**
+ * Enable activity record logging for all operations of a domain using the
+ * default memory pool.
+ *
+ * @param[in] domain The domain.
+ *
+ * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed
+ * successfully.
+ *
+ * @retval ROCTRACER_STATUS_ERROR No default pool is defined.
+ */
+ROCTRACER_API roctracer_status_t roctracer_enable_domain_activity(
+    activity_domain_t domain) ROCTRACER_VERSION_4_1;
+
+/**
+ * Disable activity record logging for a specified operation of a domain.
+ *
+ * @param[in] domain The domain.
+ *
+ * @param[in] op The activity operation ID in \p domain.
+ *
+ * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed
+ * successfully.
+ */
+ROCTRACER_API roctracer_status_t roctracer_disable_op_activity(
+    activity_domain_t domain, uint32_t op) ROCTRACER_VERSION_4_1;
+
+/**
+ * Disable activity record logging for all operations of a domain.
+ *
+ * @param[in] domain The domain.
+ *
+ * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed
+ * successfully.
+ */
+ROCTRACER_API roctracer_status_t roctracer_disable_domain_activity(
+    activity_domain_t domain) ROCTRACER_VERSION_4_1;
+
+/**
+ * Flush available activity records for a memory pool.
+ *
+ * If flushing encounters an activity record still being written, flushing
+ * stops. Use a subsequent flush when the record has completed being written to
+ * resume the flush.
+ *
+ * @param[in] pool The memory pool to flush. If NULL, flushes the default
+ * memory pool.
+ *
+ * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed
+ * successfully.
+ */
+ROCTRACER_API roctracer_status_t
+roctracer_flush_activity_expl(roctracer_pool_t* pool) ROCTRACER_VERSION_4_1;
+
+/**
+ * Flush available activity records for the default memory pool.
+ *
+ * If flushing encounters an activity record still being written, flushing
+ * stops. Use a subsequent flush when the record has completed being written to
+ * resume the flush.
+ *
+ * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed
+ * successfully.
+ */
+ROCTRACER_API roctracer_status_t roctracer_flush_activity()
+    ROCTRACER_VERSION_4_1;
+
+/** @} */
+
+/** \defgroup timestamp_group Timestamp Operations
+ *
+ *
+ *
+ * @{
+ */
+
+/**
+ * Get the system clock timestamp.
+ *
+ * @param[out] timestamp The system clock timestamp in nanoseconds.
+ *
+ * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed
+ * successfully.
+ */
+ROCTRACER_API roctracer_status_t roctracer_get_timestamp(
+    roctracer_timestamp_t* timestamp) ROCTRACER_VERSION_4_1;
+
+/** @} */
+
+#ifdef __cplusplus
+} /* extern "C" block */
+#endif /* __cplusplus */
+
+#endif /* ROCTRACER_H_ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/roctracer/roctracer_ext.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/roctracer/roctracer_ext.h
new file mode 100644
index 0000000000000000000000000000000000000000..9fc4f6a67dc6e46053ac9a929b584cab00bc0368
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/roctracer/roctracer_ext.h
@@ -0,0 +1,81 @@
+/* Copyright (c) 2018-2022 Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// ROC Tracer Extension API
+//
+// The API provides functionality for application annotation with event and
+// external ranges correlation
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef ROCTRACER_EXT_H_
+#define ROCTRACER_EXT_H_
+
+#include "roctracer.h"
+
+/* Extension API opcodes */
+typedef enum {
+  ACTIVITY_EXT_OP_MARK = 0,
+  ACTIVITY_EXT_OP_EXTERN_ID = 1
+} activity_ext_op_t;
+
+typedef void (*roctracer_start_cb_t)();
+typedef void (*roctracer_stop_cb_t)();
+typedef struct {
+  roctracer_start_cb_t start_cb;
+  roctracer_stop_cb_t stop_cb;
+} roctracer_ext_properties_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+////////////////////////////////////////////////////////////////////////////////
+// Application annotation API
+
+// Tracing start API
+void ROCTRACER_API roctracer_start() ROCTRACER_VERSION_4_1;
+
+// Tracing stop API
+void ROCTRACER_API roctracer_stop() ROCTRACER_VERSION_4_1;
+
+////////////////////////////////////////////////////////////////////////////////
+// External correlation id API
+
+// Notifies that the calling thread is entering an external API region.
+// Push an external correlation id for the calling thread.
+roctracer_status_t ROCTRACER_API
+roctracer_activity_push_external_correlation_id(activity_correlation_id_t id)
+    ROCTRACER_VERSION_4_1;
+
+// Notifies that the calling thread is leaving an external API region.
+// Pop an external correlation id for the calling thread.
+// 'last_id' returns the last external correlation id if not NULL
+roctracer_status_t ROCTRACER_API
+roctracer_activity_pop_external_correlation_id(
+    activity_correlation_id_t* last_id) ROCTRACER_VERSION_4_1;
+
+#ifdef __cplusplus
+}  // extern "C" block
+#endif  // __cplusplus
+
+#endif  // ROCTRACER_EXT_H_
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/roctracer/roctracer_hip.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/roctracer/roctracer_hip.h
new file mode 100644
index 0000000000000000000000000000000000000000..83ed7fc957e6bd1232887a4248c513668d0fcd61
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/roctracer/roctracer_hip.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018-2022 Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef ROCTRACER_HIP_H_
+#define ROCTRACER_HIP_H_
+
+#include "roctracer.h"
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_deprecated.h>
+#include <hip/amd_detail/hip_prof_str.h>
+
+typedef enum {
+  HIP_OP_ID_DISPATCH = 0,
+  HIP_OP_ID_COPY = 1,
+  HIP_OP_ID_BARRIER = 2,
+  HIP_OP_ID_NUMBER = 3
+} hip_op_id_t;
+
+#endif  // ROCTRACER_HIP_H_
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/roctracer/roctracer_roctx.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/roctracer/roctracer_roctx.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ff3190f6336247c41b65c22680eff41c6b50d97
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/roctracer/roctracer_roctx.h
@@ -0,0 +1,67 @@
+/* Copyright (c) 2018-2022 Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef ROCTRACER_ROCTX_H_
+#define ROCTRACER_ROCTX_H_
+
+#include "roctx.h"
+
+/**
+ *  ROCTX API ID enumeration
+ */
+enum roctx_api_id_t {
+  ROCTX_API_ID_roctxMarkA = 0,
+  ROCTX_API_ID_roctxRangePushA = 1,
+  ROCTX_API_ID_roctxRangePop = 2,
+  ROCTX_API_ID_roctxRangeStartA = 3,
+  ROCTX_API_ID_roctxRangeStop = 4,
+  ROCTX_API_ID_NUMBER,
+};
+
+/**
+ *  ROCTX callbacks data type
+ */
+typedef struct roctx_api_data_s {
+  union {
+    struct {
+      const char* message;
+      roctx_range_id_t id;
+    };
+    struct {
+      const char* message;
+    } roctxMarkA;
+    struct {
+      const char* message;
+    } roctxRangePushA;
+    struct {
+      const char* message;
+    } roctxRangePop;
+    struct {
+      const char* message;
+      roctx_range_id_t id;
+    } roctxRangeStartA;
+    struct {
+      const char* message;
+      roctx_range_id_t id;
+    } roctxRangeStop;
+  } args;
+} roctx_api_data_t;
+
+#endif /* ROCTRACER_ROCTX_H_ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/roctracer/roctx.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/roctracer/roctx.h
new file mode 100644
index 0000000000000000000000000000000000000000..ccec5a185badb7fc43d625c8987a65ecbe17a6e3
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/include/roctracer/roctx.h
@@ -0,0 +1,229 @@
+/* Copyright (c) 2018-2022 Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+/** \mainpage ROCTX API Specification
+ *
+ * \section introduction Introduction
+ * ROCTX is a library that implements the AMD code annotation API.  It provides
+ * the support necessary to annotate events and code ranges in applications.
+ */
+
+/**
+ * \file
+ * ROCTX API interface.
+ */
+
+#ifndef ROCTX_H_
+#define ROCTX_H_ 1
+
+/* Placeholder for calling convention and import/export macros */
+#if !defined(ROCTX_CALL)
+#define ROCTX_CALL
+#endif /* !defined (ROCTX_CALL) */
+
+#if !defined(ROCTX_EXPORT_DECORATOR)
+#if defined(__GNUC__)
+#define ROCTX_EXPORT_DECORATOR __attribute__((visibility("default")))
+#elif defined(_MSC_VER)
+#define ROCTX_EXPORT_DECORATOR __declspec(dllexport)
+#endif /* defined (_MSC_VER) */
+#endif /* !defined (ROCTX_EXPORT_DECORATOR) */
+
+#if !defined(ROCTX_IMPORT_DECORATOR)
+#if defined(__GNUC__)
+#define ROCTX_IMPORT_DECORATOR
+#elif defined(_MSC_VER)
+#define ROCTX_IMPORT_DECORATOR __declspec(dllimport)
+#endif /* defined (_MSC_VER) */
+#endif /* !defined (ROCTX_IMPORT_DECORATOR) */
+
+#define ROCTX_EXPORT ROCTX_EXPORT_DECORATOR ROCTX_CALL
+#define ROCTX_IMPORT ROCTX_IMPORT_DECORATOR ROCTX_CALL
+
+#if !defined(ROCTX)
+#if defined(ROCTX_EXPORTS)
+#define ROCTX_API ROCTX_EXPORT
+#else /* !defined (ROCTX_EXPORTS) */
+#define ROCTX_API ROCTX_IMPORT
+#endif /* !defined (ROCTX_EXPORTS) */
+#endif /* !defined (ROCTX) */
+
+#include <stdint.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* defined(__cplusplus) */
+
+/** \defgroup symbol_versions_group Symbol Versions
+ *
+ * The names used for the shared library versioned symbols.
+ *
+ * Every function is annotated with one of the version macros defined in this
+ * section.  Each macro specifies a corresponding symbol version string.  After
+ * dynamically loading the shared library with \p dlopen, the address of each
+ * function can be obtained using \p dlvsym with the name of the function and
+ * its corresponding symbol version string.  An error will be reported by \p
+ * dlvsym if the installed library does not support the version for the
+ * function specified in this version of the interface.
+ *
+ * @{
+ */
+
+/**
+ * The function was introduced in version 4.1 of the interface and has the
+ * symbol version string of ``"ROCTX_4.1"``.
+ */
+#define ROCTX_VERSION_4_1
+
+/** @} */
+
+/** \defgroup versioning_group Versioning
+ *
+ * Version information about the interface and the associated installed
+ * library.
+ *
+ * @{
+ */
+
+/**
+ * The semantic version of the interface following
+ * [semver.org][semver] rules.
+ *
+ * A client that uses this interface is only compatible with the installed
+ * library if the major version numbers match and the interface minor version
+ * number is less than or equal to the installed library minor version number.
+ */
+
+/**
+ * The major version of the interface as a macro so it can be used by the
+ * preprocessor.
+ */
+#define ROCTX_VERSION_MAJOR 4
+
+/**
+ * The minor version of the interface as a macro so it can be used by the
+ * preprocessor.
+ */
+#define ROCTX_VERSION_MINOR 1
+
+/**
+ * Query the major version of the installed library.
+ *
+ * Return the major version of the installed library. This can be used to check
+ * if it is compatible with this interface version.
+ *
+ * \return Returns the major version number.
+ */
+ROCTX_API uint32_t roctx_version_major() ROCTX_VERSION_4_1;
+
+/**
+ * Query the minor version of the installed library.
+ *
+ * Return the minor version of the installed library. This can be used to check
+ * if it is compatible with this interface version.
+ *
+ * \return Returns the minor version number.
+ */
+ROCTX_API uint32_t roctx_version_minor() ROCTX_VERSION_4_1;
+
+/** @} */
+
+/** \defgroup marker_group ROCTX Markers
+ *
+ * Marker annotations are used to describe events in a ROCm application.
+ *
+ * @{
+ */
+
+/**
+ * Mark an event.
+ *
+ * \param[in] message The message associated with the event.
+ */
+ROCTX_API void roctxMarkA(const char* message) ROCTX_VERSION_4_1;
+#define roctxMark(message) roctxMarkA(message)
+
+/** @} */
+
+/** \defgroup range_group ROCTX Ranges
+ *
+ * Range annotations are used to describe events in a ROCm application.
+ *
+ * @{
+ */
+
+/**
+ * Start a new nested range.
+ *
+ * Nested ranges are stacked and local to the current CPU thread.
+ *
+ * \param[in] message The message associated with this range.
+ *
+ * \return Returns the level this nested range is started at. Nested range
+ * levels are 0 based.
+ */
+ROCTX_API int roctxRangePushA(const char* message) ROCTX_VERSION_4_1;
+#define roctxRangePush(message) roctxRangePushA(message)
+
+/**
+ * Stop the current nested range.
+ *
+ * Stop the current nested range, and pop it from the stack. If a nested range
+ * was active before the last one was started, it becomes again the current
+ * nested range.
+ *
+ * \return Returns the level the stopped nested range was started at, or a
+ * negative value if there was no nested range active.
+ */
+ROCTX_API int roctxRangePop() ROCTX_VERSION_4_1;
+
+/**
+ * ROCTX range ID.
+ *
+ * This is the range ID used to identify start/end ranges.
+ */
+typedef uint64_t roctx_range_id_t;
+
+/**
+ * Starts a process range.
+ *
+ * Start/stop ranges can be started and stopped in different threads. Each
+ * timespan is assigned a unique range ID.
+ *
+ * \param[in] message The message associated with this range.
+ *
+ * \return Returns the ID of the new range.
+ */
+ROCTX_API roctx_range_id_t roctxRangeStartA(const char* message)
+    ROCTX_VERSION_4_1;
+#define roctxRangeStart(message) roctxRangeStartA(message)
+
+/**
+ * Stop a process range.
+ */
+ROCTX_API void roctxRangeStop(roctx_range_id_t id) ROCTX_VERSION_4_1;
+
+/** @} */
+
+#if defined(__cplusplus)
+} /* extern "C" */
+#endif /* defined (__cplusplus) */
+
+#endif /* ROCTX_H_ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/lib/asanrtl.bc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/lib/asanrtl.bc
new file mode 100644
index 0000000000000000000000000000000000000000..eb4432074e720c2ba09c1375588cb368afcbd418
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/amd/lib/asanrtl.bc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/compiler.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/compiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..10754e71579bd40c5399aa3a22f86582acca5657
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/compiler.py
@@ -0,0 +1,92 @@
+from abc import ABCMeta, abstractmethod
+from dataclasses import dataclass
+from enum import Enum
+from typing import Dict, Union
+from types import ModuleType
+
+
+@dataclass(frozen=True)
+class GPUTarget(object):
+    # Target backend, e.g., cuda, hip
+    backend: str
+    # Target architecture, e.g., 90 (for cuda compute capability), gfx940 (for hip)
+    arch: Union[int, str]
+    warp_size: int
+
+
+class Language(Enum):
+    """The input language being compiled by the backend."""
+    TRITON = 0
+    GLUON = 1
+
+
+class BaseBackend(metaclass=ABCMeta):
+    supports_native_tensor_specialization = True
+
+    def __init__(self, target: GPUTarget) -> None:
+        self.target = target
+        assert self.supports_target(target)
+
+    @staticmethod
+    @abstractmethod
+    def supports_target(target: GPUTarget):
+        raise NotImplementedError
+
+    @abstractmethod
+    def hash(self) -> str:
+        """Returns a unique identifier for this backend"""
+        raise NotImplementedError
+
+    @abstractmethod
+    def parse_options(self, options: dict) -> object:
+        """
+        Converts an `options` dictionary into an arbitrary object and returns it.
+        This function may contain target-specific heuristics and check the legality of the provided options
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def add_stages(self, stages: dict, options: object) -> None:
+        """
+        Populates `stages` dictionary with entries of the form:
+        ir_name [str] => Function[(src: str, metadata: dict) -> str|bytes]
+        The value of each entry may populate a `metadata` dictionary.
+        Stages will be run sequentially (in inseriton order) and can communicate using `metadata`.
+        All stages are expected to return a `str` object, except for the last stage which returns
+        a `bytes` object for execution by the launcher.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def load_dialects(self, context):
+        """
+        Load additional MLIR dialects into the provided `context`
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_module_map(self) -> Dict[str, ModuleType]:
+        """
+        Return a map of interface modules to their device-specific implementations
+        """
+        raise NotImplementedError
+
+    @staticmethod
+    def parse_attr(desc):
+        assert isinstance(desc, str)
+        ret = []
+        if "D" in desc:
+            ret += [["tt.divisibility", 16]]
+        return ret
+
+    @staticmethod
+    def get_int_specialization(arg, **kwargs):
+        if arg % 16 == 0 and kwargs.get("align", False):
+            return "D"
+        return ""
+
+    @staticmethod
+    def get_tensor_specialization(arg, **kwargs):
+        if arg.data_ptr() % 16 == 0 and kwargs.get("align", False):
+            return "D"
+        return ""
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/driver.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/driver.py
new file mode 100644
index 0000000000000000000000000000000000000000..13a658b47e48b00fb575d27f6b1c7d59107f6c7c
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/driver.py
@@ -0,0 +1,66 @@
+from abc import ABCMeta, abstractmethod
+from typing import Callable, List, Protocol, Sequence
+
+
+class Benchmarker(Protocol):
+
+    def __call__(self, kernel_call: Callable, *, quantiles: List[float], **kwargs) -> Sequence[float]:
+        pass
+
+
+class DriverBase(metaclass=ABCMeta):
+
+    @classmethod
+    @abstractmethod
+    def is_active(self):
+        pass
+
+    @abstractmethod
+    def map_python_to_cpp_type(self, ty: str) -> str:
+        """
+        Converts a Triton type string to its corresponding C++ type string for this backend.
+
+        Args:
+            ty (str): The Triton type string. e.g., 'i32', '*fp16', 'fp32'.
+
+        Returns:
+            str: The C++ type string.
+        """
+        pass
+
+    @abstractmethod
+    def get_current_target(self):
+        pass
+
+    @abstractmethod
+    def get_active_torch_device(self):
+        pass
+
+    @abstractmethod
+    def get_benchmarker(self) -> Benchmarker:
+        """
+        Return the benchmarking function that this backend should use by default.
+        """
+        raise NotImplementedError
+
+    def __init__(self) -> None:
+        pass
+
+
+class GPUDriver(DriverBase):
+
+    def __init__(self):
+        # TODO: support other frameworks than torch
+        import torch
+        self.get_device_capability = torch.cuda.get_device_capability
+        try:
+            from torch._C import _cuda_getCurrentRawStream
+            self.get_current_stream = _cuda_getCurrentRawStream
+        except ImportError:
+            self.get_current_stream = lambda idx: torch.cuda.current_stream(idx).cuda_stream
+        self.get_current_device = torch.cuda.current_device
+        self.set_current_device = torch.cuda.set_device
+
+    # TODO: remove once TMA is cleaned up
+    def assemble_tensormap_to_arg(self, tensormaps_info, args):
+        return args
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f6fba1ce71ae3c9074f7307677fcb0827b07880c
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/__pycache__/compiler.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/__pycache__/compiler.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..837461b368c4d004443c262e6bdb7b708cc4f966
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/__pycache__/compiler.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/__pycache__/driver.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/__pycache__/driver.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d73322476bc4c89cb5d4b58ff372c4a69dffa76
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/__pycache__/driver.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/compiler.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/compiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..1366e70ab7cf1474a5759b842ae56b7b72d7eb20
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/compiler.py
@@ -0,0 +1,553 @@
+from triton.backends.compiler import BaseBackend, GPUTarget, Language
+from triton._C.libtriton import ir, passes, llvm, nvidia
+from triton import knobs
+from triton.runtime.errors import PTXASError
+
+from dataclasses import dataclass
+import functools
+from typing import Any, Dict, Tuple, Optional
+from types import ModuleType
+import hashlib
+import re
+import tempfile
+import signal
+import os
+import subprocess
+from pathlib import Path
+
+
+def min_dot_size(target: GPUTarget):
+
+    def check_dot_compatibility(lhs_type, rhs_type) -> Tuple[int, int, int]:  # [m, n, k]
+        lhs_bitwidth = lhs_type.scalar.primitive_bitwidth
+        rhs_bitwidth = rhs_type.scalar.primitive_bitwidth
+        assert lhs_bitwidth == rhs_bitwidth, "lhs and rhs bitwidth must be the same"
+        # For small M/N the input we can still use tensorcores with padding.
+        if lhs_bitwidth == 8:
+            return (1, 1, 32)
+        else:
+            return (1, 1, 16)
+
+    return check_dot_compatibility
+
+
+def get_ptxas(arch: int) -> knobs.NvidiaTool:
+    return knobs.nvidia.ptxas_blackwell if arch >= 100 else knobs.nvidia.ptxas
+
+
+@functools.lru_cache()
+def get_ptxas_version(arch: int = 80):
+    mock_ver = knobs.nvidia.mock_ptx_version
+    if mock_ver is not None:
+        return mock_ver  # This is not really a version of ptxas, but it is good enough for testing
+    version = subprocess.check_output([get_ptxas(arch).path, "--version"]).decode("utf-8")
+    return version
+
+
+@functools.lru_cache()
+def ptx_get_version(cuda_version) -> int:
+    '''
+    Get the highest PTX version supported by the current CUDA driver.
+    '''
+    assert isinstance(cuda_version, str)
+    major, minor = map(int, cuda_version.split('.'))
+    if major == 12:
+        if minor < 6:
+            return 80 + minor
+        else:
+            return 80 + minor - 1
+    if major == 11:
+        return 70 + minor
+    if major == 10:
+        return 63 + minor
+
+    if major >= 13:
+        base_ptx = 90
+        return base_ptx + (major - 13) * 10 + minor
+
+    raise RuntimeError("Triton only support CUDA 10.0 or higher, but got CUDA version: " + cuda_version)
+
+
+def get_ptx_version_from_options(options, arch: int):
+    ptx_version = options.ptx_version
+    if ptx_version is None:
+        cuda_version = get_ptxas(arch).version
+        ptx_version = ptx_get_version(cuda_version)
+    return ptx_version
+
+
+@functools.lru_cache()
+def get_features(options, arch: int):
+    ptx_version = get_ptx_version_from_options(options, arch)
+
+    # PTX 8.6 is the max version supported by llvm c1188642.
+    #
+    # To check if a newer PTX version is supported, increase this value
+    # and run a test.  If it's not supported, LLVM will print a warning
+    # like "+ptx8.4 is not a recognized feature for this target".
+    llvm_ptx_version = min(86, ptx_version)
+    features = f'+ptx{llvm_ptx_version}'
+    return features
+
+
+@functools.lru_cache(None)
+def file_hash(path):
+    with open(path, "rb") as f:
+        return hashlib.sha256(f.read()).hexdigest()
+
+
+def sm_arch_from_capability(capability: int):
+    # TODO: Handle non-"a" sms
+    suffix = "a" if capability >= 90 else ""
+    return f"sm_{capability}{suffix}"
+
+
+@dataclass(frozen=True)
+class CUDAOptions:
+    num_warps: int = 4
+    num_ctas: int = 1
+    num_stages: int = 3
+    warp_size: int = 32
+    # maxnreg corresponds to the ptx parameter .maxnreg, which controls the
+    # maximum number of 32-bit registers used by one thread.
+    maxnreg: Optional[int] = None
+    ptx_version: int = None
+    ptx_options: Optional[str] = knobs.nvidia.ptxas_options
+    ir_override: Optional[str] = None  # filename of a user-defined IR (*.{ttir|ttgir|llir|ptx})
+    enable_fp_fusion: bool = True
+    enable_reflect_ftz: bool = True  # ftz in libdevice
+    launch_cooperative_grid: bool = False
+    launch_pdl: bool = False
+    supported_fp8_dtypes: Tuple[str] = ("fp8e5", "fp8e4b15")
+    deprecated_fp8_dot_operand_dtypes: Tuple[str] = ()
+    default_dot_input_precision: str = "tf32"
+    allowed_dot_input_precisions: Tuple[str] = ("tf32", "tf32x3", "ieee", 'bf16x3', 'bf16x6')
+    max_num_imprecise_acc_default: bool = None
+    extern_libs: dict = None
+    debug: bool = False
+    backend_name: str = 'cuda'
+    sanitize_overflow: bool = True
+    arch: str = None
+    instrumentation_mode: str = ""
+
+    def __post_init__(self):
+        default_libdir = Path(__file__).parent / 'lib'
+        extern_libs = {} if self.extern_libs is None else dict(self.extern_libs)
+        if not extern_libs.get('libdevice', None):
+            extern_libs['libdevice'] = knobs.nvidia.libdevice_path or str(default_libdir / 'libdevice.10.bc')
+
+        object.__setattr__(self, 'extern_libs', tuple(extern_libs.items()))
+        assert self.num_warps > 0 and (self.num_warps & (self.num_warps - 1)) == 0, \
+               "num_warps must be a power of 2"
+
+    def hash(self):
+        hash_dict = dict(self.__dict__)
+        hash_dict["extern_libs"] = tuple((k, file_hash(v)) for k, v in sorted(hash_dict["extern_libs"]))
+        key = "_".join([f"{name}-{val}" for name, val in sorted(hash_dict.items())])
+        return hashlib.sha256(key.encode("utf-8")).hexdigest()
+
+
+class CUDABackend(BaseBackend):
+    instrumentation = None
+
+    @staticmethod
+    def supports_target(target: GPUTarget):
+        return target.backend == 'cuda'
+
+    def _parse_arch(self, arch):
+        pattern = r"^sm(\d+)$"
+        match = re.fullmatch(pattern, arch)
+        if not match:
+            raise ValueError(f"TRITON_OVERRIDE_ARCH must have the form {pattern}")
+        return int(match.group(1))
+
+    def get_target_name(self, options) -> str:
+        capability = self._parse_arch(options.arch)
+        return f"cuda:{capability}"
+
+    def __init__(self, target: GPUTarget) -> None:
+        """Bind the backend to `target` and record the extension of the final binary."""
+        super().__init__(target)
+        # Extension associated with the output of the last compilation stage.
+        self.binary_ext = "cubin"
+
+    def parse_options(self, opts) -> Any:
+        # Enable debug mode for ConSan, so device-side assertions are not optimized out
+        if "instrumentation_mode" in opts and opts["instrumentation_mode"] == "consan":
+            opts["debug"] = True
+
+        args = {'arch': knobs.runtime.override_arch or f"sm{self.target.arch}"}
+        args.update({k: opts[k] for k in CUDAOptions.__dataclass_fields__.keys() if k in opts if opts[k] is not None})
+        capability = int(self._parse_arch(args["arch"]))
+
+        if args.get("num_ctas", 1) > 1 and capability < 90:
+            raise ValueError((f"num_ctas > 1 requires NVIDIA SM90+ (Hopper). "
+                              f"Current target is sm_{capability}. This configuration will fail. "
+                              f"Please set num_ctas=1 or target an SM90+ GPU."))
+
+        if "supported_fp8_dtypes" not in args:
+            supported_fp8_dtypes = set(CUDAOptions.supported_fp8_dtypes)
+            if capability >= 89:
+                supported_fp8_dtypes.add("fp8e4nv")
+            args["supported_fp8_dtypes"] = tuple(sorted(supported_fp8_dtypes))
+
+        if "deprecated_fp8_dot_operand_dtypes" not in args:
+            if capability >= 90:
+                args["deprecated_fp8_dot_operand_dtypes"] = ("fp8e4b15", )
+
+        if "enable_fp_fusion" not in args:
+            args["enable_fp_fusion"] = knobs.language.default_fp_fusion
+
+        args["max_num_imprecise_acc_default"] = 2**30 if capability == 90 else 0
+
+        return CUDAOptions(**args)
+
+    def pack_metadata(self, metadata):
+        return (
+            metadata.num_warps,
+            metadata.num_ctas,
+            metadata.shared,
+        )
+
+    def get_codegen_implementation(self, options):
+        import triton.language.extra.cuda as cuda
+        capability = int(self._parse_arch(options.arch))
+        codegen_fns = {
+            "convert_custom_types":
+            cuda.convert_custom_float8_sm80 if capability >= 80 else cuda.convert_custom_float8_sm70, "min_dot_size":
+            min_dot_size(self.target)
+        }
+        return codegen_fns
+
+    def get_module_map(self) -> Dict[str, ModuleType]:
+        from triton.language.extra.cuda import libdevice
+        return {"triton.language.extra.libdevice": libdevice}
+
+    def load_dialects(self, ctx):
+        nvidia.load_dialects(ctx)
+        if CUDABackend.instrumentation:
+            CUDABackend.instrumentation.load_dialects(ctx)
+
+    @staticmethod
+    def make_ttir(mod, metadata, opt, capability):
+        """Run the TTIR pass pipeline over `mod` and return the same module.
+
+        `metadata` and `opt` are part of the stage signature but are not read
+        here. `capability` only decides whether the tensor-descriptor-to-pointer
+        rewrite is added.
+        """
+        pm = ir.pass_manager(mod.context)
+        pm.enable_debug()
+        passes.common.add_inliner(pm)
+        passes.ttir.add_rewrite_tensor_pointer(pm)
+        # Added only when the major capability (capability // 10) is below 9.
+        if capability // 10 < 9:
+            passes.ttir.add_rewrite_tensor_descriptor_to_pointer(pm)
+        passes.common.add_canonicalizer(pm)
+        passes.ttir.add_combine(pm)
+        passes.ttir.add_reorder_broadcast(pm)
+        passes.common.add_cse(pm)
+        passes.common.add_symbol_dce(pm)
+        passes.ttir.add_loop_unroll(pm)
+        pm.run(mod, 'make_ttir')
+        return mod
+
+    @staticmethod
+    def make_ttgir(mod, metadata, opt, capability):
+        """Convert `mod` to TTGIR, run the optimisation pipeline, and return it.
+
+        Reads `opt.maxnreg`, `opt.num_warps`, `opt.num_ctas` and
+        `opt.num_stages`. The loop/pipelining section of the pipeline is chosen
+        by major capability (`capability // 10`): 8 or 9, 10 and above, or
+        anything lower. On return `metadata["tensordesc_meta"]` holds the
+        module's tensor-descriptor metadata.
+        """
+        # Set maxnreg on all kernels, if it was provided.
+        if opt.maxnreg is not None:
+            mod.set_attr("ttg.maxnreg", ir.builder(mod.context).get_int32_attr(opt.maxnreg))
+
+        pm = ir.pass_manager(mod.context)
+        # The value returned by enable_debug() is forwarded to the passes below
+        # that take a dump flag.
+        dump_enabled = pm.enable_debug()
+        emuTF32 = (capability // 10 >= 8)
+        # The literal 32 is passed in the position following num_warps
+        # (presumably the warp size; opt.warp_size is not consulted here).
+        passes.ttir.add_convert_to_ttgpuir(pm, f"cuda:{capability}", opt.num_warps, 32, opt.num_ctas)
+        # optimize TTGIR
+        passes.ttgpuir.add_coalesce(pm)
+        passes.ttgpuir.add_f32_dot_tc(pm, emuTF32)
+        # TODO(Qingyi): Move PlanCTAPass to the front of CoalescePass
+        nvidia.passes.ttnvgpuir.add_plan_cta(pm)
+        passes.ttgpuir.add_remove_layout_conversions(pm)
+        passes.ttgpuir.add_optimize_thread_locality(pm)
+        passes.ttgpuir.add_accelerate_matmul(pm)
+        passes.ttgpuir.add_remove_layout_conversions(pm)
+        passes.ttgpuir.add_optimize_dot_operands(pm, capability >= 80)
+        nvidia.passes.ttnvgpuir.add_optimize_descriptor_encoding(pm)
+        passes.ttir.add_loop_aware_cse(pm)
+        # Major capability 8 or 9.
+        if capability // 10 in [8, 9]:
+            passes.ttgpuir.add_fuse_nested_loops(pm)
+            passes.common.add_canonicalizer(pm)
+            passes.ttir.add_triton_licm(pm)
+            passes.common.add_canonicalizer(pm)
+            passes.ttgpuir.add_combine_tensor_select_and_if(pm)
+            nvidia.passes.hopper.add_hopper_warpspec(pm, opt.num_stages, dump_enabled)
+            passes.ttgpuir.add_assign_latencies(pm, opt.num_stages)
+            passes.ttgpuir.add_schedule_loops(pm)
+            passes.ttgpuir.add_pipeline(pm, opt.num_stages, dump_enabled)
+        # Major capability 10 and above.
+        elif capability // 10 >= 10:
+            passes.ttgpuir.add_fuse_nested_loops(pm)
+            passes.common.add_canonicalizer(pm)
+            passes.ttir.add_triton_licm(pm)
+            passes.ttgpuir.add_optimize_accumulator_init(pm)
+            passes.ttgpuir.add_hoist_tmem_alloc(pm, False)
+            nvidia.passes.ttnvgpuir.add_promote_lhs_to_tmem(pm)
+            passes.ttgpuir.add_assign_latencies(pm, opt.num_stages)
+            passes.ttgpuir.add_schedule_loops(pm)
+            passes.ttgpuir.add_warp_specialize(pm, opt.num_stages)
+            passes.ttgpuir.add_pipeline(pm, opt.num_stages, dump_enabled)
+            passes.ttgpuir.add_optimize_partition_warps(pm)
+            passes.ttgpuir.add_combine_tensor_select_and_if(pm)
+            # hoist again and allow hoisting out of if statements
+            passes.ttgpuir.add_hoist_tmem_alloc(pm, True)
+            nvidia.passes.ttnvgpuir.add_remove_tmem_tokens(pm)
+        # Major capability below 8: only loop-invariant code motion is added.
+        else:
+            passes.ttir.add_triton_licm(pm)
+        passes.common.add_canonicalizer(pm)
+        passes.ttir.add_loop_aware_cse(pm)
+        passes.ttgpuir.add_prefetch(pm)
+        passes.ttgpuir.add_optimize_dot_operands(pm, capability >= 80)
+        passes.ttgpuir.add_coalesce_async_copy(pm)
+        nvidia.passes.ttnvgpuir.add_optimize_tmem_layouts(pm)
+        # TMA lowering is added only for major capability 9 and above.
+        if capability // 10 >= 9:
+            nvidia.passes.ttnvgpuir.add_tma_lowering(pm)
+        passes.ttgpuir.add_remove_layout_conversions(pm)
+        nvidia.passes.ttnvgpuir.add_interleave_tmem(pm)
+        passes.ttgpuir.add_reduce_data_duplication(pm)
+        passes.ttgpuir.add_reorder_instructions(pm)
+        passes.ttir.add_loop_aware_cse(pm)
+        passes.common.add_symbol_dce(pm)
+        nvidia.passes.ttnvgpuir.add_fence_insertion(pm, capability)
+        nvidia.passes.ttnvgpuir.add_lower_mma(pm)
+        passes.common.add_sccp(pm)
+        passes.common.add_cse(pm)
+        passes.common.add_canonicalizer(pm)
+
+        pm.run(mod, 'make_ttgir')
+        metadata["tensordesc_meta"] = mod.get_tensordesc_metadata()
+        return mod
+
+    def gluon_to_ttgir(self, src, metadata, options, capability):
+        """Run the Gluon pass pipeline over `src` and return the same module.
+
+        `options` and `capability` are part of the stage signature but are not
+        read here. On return `metadata["tensordesc_meta"]` holds the module's
+        tensor-descriptor metadata.
+        """
+        mod = src
+        pm = ir.pass_manager(mod.context)
+        pm.enable_debug()
+
+        passes.gluon.add_inliner(pm)
+        passes.gluon.add_infer_coalesced_encodings(pm)
+        passes.gluon.add_resolve_auto_encodings(pm)
+        # Unlike make_ttgir, TMA lowering is added here without a capability check.
+        nvidia.passes.ttnvgpuir.add_tma_lowering(pm)
+        passes.gluon.add_canonicalizer(pm)
+        passes.common.add_sccp(pm)
+        passes.ttir.add_loop_aware_cse(pm)
+        passes.gluon.add_canonicalizer(pm)
+        passes.ttgpuir.add_combine_tensor_select_and_if(pm)
+
+        pm.run(mod, 'gluon_to_ttgir')
+        metadata["tensordesc_meta"] = mod.get_tensordesc_metadata()
+        return mod
+
+    def make_llir(self, src, metadata, options, capability):
+        """Lower a TTGIR module to LLVM IR and return it as text.
+
+        The module is first lowered to the LLVM dialect through an MLIR pass
+        pipeline, then translated to an LLVM module, linked against the
+        external libraries when it has external dependencies, and optimised at
+        O3. Several `ttg.*` module attributes are copied into `metadata`.
+
+        Raises:
+            RuntimeError: If the address-sanitizer knob is enabled.
+        """
+        # NOTE(review): the PTX version and feature string are derived from
+        # self.target.arch, while `capability` (passed in by the caller) is
+        # used for the pass arguments and the processor name.
+        ptx_version = get_ptx_version_from_options(options, self.target.arch)
+
+        mod = src
+        # TritonGPU -> LLVM-IR (MLIR)
+        pm = ir.pass_manager(mod.context)
+        pm.enable_debug()
+
+        passes.ttgpuir.add_combine_tensor_select_and_if(pm)
+        passes.ttgpuir.add_allocate_warp_groups(pm)
+        passes.convert.add_scf_to_cf(pm)
+        passes.gluon.add_inliner(pm)
+        nvidia.passes.ttgpuir.add_allocate_shared_memory_nv(pm, capability, ptx_version)
+        nvidia.passes.ttnvgpuir.add_allocate_tensor_memory(pm)
+        nvidia.passes.ttnvgpuir.add_check_matmul_two_cta(pm)
+        # NOTE(review): this reads the global knob, not options.instrumentation_mode.
+        if knobs.compilation.instrumentation_mode == "consan":
+            # Call ConcurrencySanitizerPass here, before allocating global scratch memory but after allocating tensor and shared
+            passes.ttgpuir.add_concurrency_sanitizer(pm)
+        passes.ttgpuir.add_allocate_global_scratch_memory(pm)
+        nvidia.passes.ttnvgpuir.add_proxy_fence_insertion(pm, capability)
+        # instrumentation point here so we can override IRs above (e.g., ttir and ttgir)
+        if CUDABackend.instrumentation:
+            CUDABackend.instrumentation.patch("ttgpuir_to_llvmir", pm, mod.context)
+        nvidia.passes.ttgpuir.add_to_llvmir(pm, capability, ptx_version)
+        passes.common.add_canonicalizer(pm)
+        passes.common.add_cse(pm)
+        nvidia.passes.ttnvgpuir.add_nvgpu_to_llvm(pm)
+        nvidia.passes.ttnvgpuir.add_warp_specialize_to_llvm(pm)
+        passes.common.add_canonicalizer(pm)
+        passes.common.add_cse(pm)
+        passes.common.add_symbol_dce(pm)
+        passes.convert.add_nvvm_to_llvm(pm)
+
+        # The DI scope pass joins this pipeline only when local-variable
+        # extraction is off; otherwise it runs in its own pipeline below.
+        if not knobs.compilation.disable_line_info and not knobs.compilation.dump_ir_extract_di_local_variables:
+            passes.llvmir.add_di_scope(pm)
+
+        if CUDABackend.instrumentation:
+            CUDABackend.instrumentation.patch("llvmir_to_llvm", pm, mod.context)
+
+        pm.run(mod, 'make_llir')
+
+        if knobs.compilation.dump_ir_extract_di_local_variables:
+            # comments below on why separate it
+            if not knobs.compilation.disable_line_info:
+                pm = ir.pass_manager(mod.context)
+                pm.enable_debug()
+                passes.llvmir.add_di_scope(pm)
+                pm.run(mod, 'make_llir.disable_line_info')
+
+            # Insert dbg intrinsics carrying several DI attributes, including
+            # the source variable name and type info. Note: for a reason not
+            # yet understood, this pass and add_di_scope have to be run
+            # separately; putting them into the previous pipeline triggers a
+            # segmentation fault without any error message. This could be due
+            # to a bug in mlir or pybind11.
+            pm = ir.pass_manager(mod.context)
+            pm.enable_debug()
+            passes.llvmir.add_di_local_variable(pm)
+            pm.run(mod, 'make_llir.dump_ir_extract_di_local_variables')
+
+        # LLVM-IR (MLIR) -> LLVM-IR (LLVM)
+        llvm.init_targets()
+        context = llvm.context()
+        if knobs.compilation.enable_asan:
+            raise RuntimeError(
+                "Address Sanitizer Error: Address sanitizer is currently only supported on the AMD backend")
+        llvm_mod = llvm.to_module(mod, context)
+        proc = sm_arch_from_capability(capability)
+        features = get_features(options, self.target.arch)
+        triple = 'nvptx64-nvidia-cuda'
+        nvidia.set_short_ptr()
+        llvm.attach_datalayout(llvm_mod, triple, proc, features)
+        if options.enable_reflect_ftz:
+            nvidia.set_nvvm_reflect_ftz(llvm_mod)
+
+        # Link only when libraries are configured and the module has external dependencies.
+        if options.extern_libs and nvidia.has_extern_deps(llvm_mod):
+            paths = [path for (name, path) in options.extern_libs]
+            llvm.link_extern_libs(llvm_mod, paths)
+
+        llvm.optimize_module(llvm_mod, llvm.OPTIMIZE_O3)
+
+        # Get some metadata
+        # warp-specialization mutates num_warps
+        total_num_warps = src.get_int_attr("ttg.total-num-warps")
+        if total_num_warps is not None:
+            metadata["num_warps"] = total_num_warps
+        metadata["shared"] = src.get_int_attr("ttg.shared")
+        metadata["tmem_size"] = src.get_int_attr("ttg.tensor_memory_size")
+        metadata["global_scratch_size"] = src.get_int_attr("ttg.global_scratch_memory_size")
+        metadata["global_scratch_align"] = src.get_int_attr("ttg.global_scratch_memory_alignment")
+        # The profile scratch attributes fall back to 0 / 1 when the lookup is falsy.
+        metadata["profile_scratch_size"] = src.get_int_attr("ttg.profile_scratch_memory_size") or 0
+        metadata["profile_scratch_align"] = src.get_int_attr("ttg.profile_scratch_memory_alignment") or 1
+        ret = str(llvm_mod)
+        # Drop the module before the context it was created in.
+        del llvm_mod
+        del context
+        return ret
+
+    def make_ptx(self, src, metadata, opt, capability):
+        ptx_version = get_ptx_version_from_options(opt, self.target.arch)
+
+        triple = 'nvptx64-nvidia-cuda'
+        proc = sm_arch_from_capability(capability)
+        features = get_features(opt, self.target.arch)
+        flags = ["nvptx-mad-wide-opt"]
+        ret = llvm.translate_to_asm(src, triple, proc, features, flags, opt.enable_fp_fusion, False)
+        # Find kernel names (there should only be one)
+        names = re.findall(r".visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)", ret)
+        assert len(names) == 1
+        metadata["name"] = names[0]
+        # post-process
+        ptx_version = f'{ptx_version//10}.{ptx_version%10}'
+        ret = re.sub(r'\.version \d+\.\d+', f'.version {ptx_version}', ret, flags=re.MULTILINE)
+        ret = re.sub(r'\.target sm_\d+', f'.target sm_{capability}', ret, flags=re.MULTILINE)
+        if not knobs.compilation.dump_ir_extract_di_local_variables:
+            # Remove the debug flag that prevents ptxas from optimizing the code
+            # Note: if this flag is removed, the source var name and type info will be lost when ptx was compiled into cubin
+            #           and we may not be able to see them in cuda-gdb
+            ret = re.sub(r",\s*debug|debug,\s*", "", ret)
+        if knobs.nvidia.dump_nvptx:
+            print("// -----// NVPTX Dump //----- //")
+            print(ret)
+        return ret
+
+    def make_cubin(self, src, metadata, opt, capability):
+        ptxas = get_ptxas(self.target.arch).path
+        with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.ptx') as fsrc, \
+            tempfile.NamedTemporaryFile(delete=False, mode='r', suffix='.log') as flog:
+            fsrc.write(src)
+            fsrc.flush()
+            fbin = fsrc.name + '.o'
+
+            debug_info = []
+            if knobs.compilation.disable_line_info:
+                # This option is ignored if used without -lineinfo
+                debug_info += ["-lineinfo", "-suppress-debug-info"]
+            elif knobs.nvidia.disable_ptxas_opt:
+                # Synthesize complete debug info
+                debug_info += ["-g"]
+            else:
+                # Only emit line info
+                debug_info += ["-lineinfo"]
+
+            fmad = [] if opt.enable_fp_fusion else ["--fmad=false"]
+            arch = sm_arch_from_capability(capability)
+
+            # Disable ptxas optimizations if requested
+            disable_opt = ['--opt-level', '0'] if knobs.nvidia.disable_ptxas_opt else []
+
+            # Accept more ptxas options if provided
+            ptx_extra_options = opt.ptx_options.split(" ") if opt.ptx_options else []
+
+            ptxas_cmd = [
+                ptxas, *debug_info, *fmad, '-v', *disable_opt, *ptx_extra_options, f'--gpu-name={arch}', fsrc.name,
+                '-o', fbin
+            ]
+            try:
+                subprocess.run(ptxas_cmd, check=True, close_fds=False, stderr=flog)
+                if knobs.nvidia.dump_ptxas_log:
+                    with open(flog.name) as log_file:
+                        print(log_file.read())
+
+                if os.path.exists(fsrc.name):
+                    os.remove(fsrc.name)
+                if os.path.exists(flog.name):
+                    os.remove(flog.name)
+            except subprocess.CalledProcessError as e:
+                with open(flog.name) as log_file:
+                    log = log_file.read()
+                if os.path.exists(flog.name):
+                    os.remove(flog.name)
+
+                if e.returncode == 255:
+                    error = 'Internal Triton PTX codegen error'
+                elif e.returncode == 128 + signal.SIGSEGV:
+                    error = '`ptxas` raised SIGSEGV'
+                else:
+                    error = f'`ptxas` failed with error code {e.returncode}'
+
+                error = (f"{error}\n"
+                         f"`ptxas` stderr:\n{log}\n"
+                         f'Repro command: {" ".join(ptxas_cmd)}\n')
+
+                print(f"""
+
+================================================================
+{error}
+
+{src}
+================================================================
+please share the reproducer above with Triton project.
+""")
+                raise PTXASError(error)
+
+            with open(fbin, 'rb') as f:
+                cubin = f.read()
+            if os.path.exists(fbin):
+                os.remove(fbin)
+        return cubin
+
+    def add_stages(self, stages, options, language):
+        """Register this backend's compilation stages in `stages`, in run order.
+
+        For `Language.TRITON` the "ttir" and "ttgir" stages are added; for
+        `Language.GLUON` only a "ttgir" stage is. "llir", "ptx" and "cubin"
+        follow in every case. Each stage is a callable taking `(src, metadata)`.
+        When the stage-inspection hook knob is set, it is called last with the
+        populated `stages`.
+        """
+        capability = self._parse_arch(options.arch)
+        if language == Language.TRITON:
+            stages["ttir"] = lambda src, metadata: self.make_ttir(src, metadata, options, capability)
+            stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options, capability)
+        elif language == Language.GLUON:
+            stages["ttgir"] = lambda src, metadata: self.gluon_to_ttgir(src, metadata, options, capability)
+        stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options, capability)
+        # NOTE(review): the ptx and cubin stages receive self.target.arch where
+        # the earlier stages receive the capability parsed from options.arch;
+        # the two differ whenever options.arch does not match the target —
+        # confirm this is intended.
+        stages["ptx"] = lambda src, metadata: self.make_ptx(src, metadata, options, self.target.arch)
+        stages["cubin"] = lambda src, metadata: self.make_cubin(src, metadata, options, self.target.arch)
+        if knobs.runtime.add_stages_inspection_hook is not None:
+            knobs.runtime.add_stages_inspection_hook(self, stages, options, language, capability)
+
+    @functools.lru_cache()
+    def hash(self):
+        version = get_ptxas_version(self.target.arch)
+        return f'{version}-{self.target.arch}'
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/driver.c b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/driver.c
new file mode 100644
index 0000000000000000000000000000000000000000..8e06e6369128a730698fe233bf6f0898447213a6
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/driver.c
@@ -0,0 +1,518 @@
+#include "cuda.h"
+#include <dlfcn.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+// Python object that embeds a CUtensorMap. The tensorMap member is declared
+// with 128-byte alignment, so instances need storage aligned accordingly.
+typedef struct {
+  PyObject_HEAD;
+  _Alignas(128) CUtensorMap tensorMap;
+} PyCUtensorMapObject;
+
+// Converts a CUDA driver result into a Python exception.
+//
+// Returns true when `code` is CUDA_SUCCESS. Otherwise acquires the GIL, sets a
+// RuntimeError of the form "Triton Error [CUDA]: <description>" and returns
+// false. `file` and `line` are accepted for the call-site macros but are not
+// included in the message.
+static bool gpuAssert(CUresult code, const char *file, int line) {
+  if (code == CUDA_SUCCESS)
+    return true;
+
+  // cuGetErrorString can fail for codes it does not recognize and then yields
+  // no usable description; fall back to a fixed string instead of reading it.
+  const char *str = NULL;
+  if (cuGetErrorString(code, &str) != CUDA_SUCCESS || str == NULL)
+    str = "Unknown error";
+
+  // snprintf bounds the write to the buffer and always NUL-terminates it.
+  char err[1024];
+  snprintf(err, sizeof(err), "Triton Error [CUDA]: %s", str);
+
+  PyGILState_STATE gil_state = PyGILState_Ensure();
+  PyErr_SetString(PyExc_RuntimeError, err);
+  PyGILState_Release(gil_state);
+  return false;
+}
+
+// Checks the result of a CUDA driver call. On failure gpuAssert has set a
+// Python exception and control jumps to the enclosing function's `cleanup`
+// label, which must therefore exist.
+// To be used only *outside* a Py_{BEGIN,END}_ALLOW_THREADS block.
+#define CUDA_CHECK_AND_RETURN_NULL(ans)                                        \
+  do {                                                                         \
+    if (!gpuAssert((ans), __FILE__, __LINE__))                                 \
+      goto cleanup;                                                            \
+  } while (0)
+
+// Same check, but on failure it restores the thread state held in `_save`
+// and returns NULL directly instead of jumping to `cleanup`.
+// To be used inside a Py_{BEGIN,END}_ALLOW_THREADS block.
+#define CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(ans)                          \
+  do {                                                                         \
+    if (!gpuAssert((ans), __FILE__, __LINE__)) {                               \
+      PyEval_RestoreThread(_save);                                             \
+      return NULL;                                                             \
+    }                                                                          \
+  } while (0)
+
+// Lazily fills `funcPointer` by calling `initializerFunction` while it is
+// still NULL, and jumps to `cleanup` if the initializer returns NULL.
+// Used to check if functions exist in old CUDA driver versions.
+#define INITIALIZE_FUNCTION_POINTER_IF_NULL(funcPointer, initializerFunction)  \
+  do {                                                                         \
+    if ((funcPointer) == NULL) {                                               \
+      (funcPointer) = (initializerFunction)();                                 \
+      if ((funcPointer) == NULL) {                                             \
+        goto cleanup;                                                          \
+      }                                                                        \
+    }                                                                          \
+  } while (0)
+
+// Python args: (device_id: int).
+//
+// Returns a dict with the integer device attributes "max_shared_mem",
+// "max_num_regs", "multiprocessor_count", "warpSize", "sm_clock_rate",
+// "mem_clock_rate" and "mem_bus_width". Returns NULL with a Python exception
+// set if argument parsing or any driver query fails.
+static PyObject *getDeviceProperties(PyObject *self, PyObject *args) {
+  int device_id;
+  if (!PyArg_ParseTuple(args, "i", &device_id))
+    return NULL;
+  // Get device handle. The result is checked so that an invalid device id is
+  // reported instead of querying attributes through an uninitialized handle.
+  CUdevice device;
+  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGet(&device, device_id));
+
+  // create a struct to hold device properties
+  int max_shared_mem;
+  int max_num_regs;
+  int multiprocessor_count;
+  int warp_size;
+  int sm_clock_rate;
+  int mem_clock_rate;
+  int mem_bus_width;
+  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
+      &max_shared_mem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
+      device));
+  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
+      &max_num_regs, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, device));
+  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
+      &multiprocessor_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
+  CUDA_CHECK_AND_RETURN_NULL(
+      cuDeviceGetAttribute(&warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device));
+  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
+      &sm_clock_rate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device));
+  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
+      &mem_clock_rate, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device));
+  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
+      &mem_bus_width, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device));
+
+  return Py_BuildValue("{s:i, s:i, s:i, s:i, s:i, s:i, s:i}", "max_shared_mem",
+                       max_shared_mem, "max_num_regs", max_num_regs,
+                       "multiprocessor_count", multiprocessor_count, "warpSize",
+                       warp_size, "sm_clock_rate", sm_clock_rate,
+                       "mem_clock_rate", mem_clock_rate, "mem_bus_width",
+                       mem_bus_width);
+
+cleanup:
+  // Reached only through CUDA_CHECK_AND_RETURN_NULL; the exception is set.
+  return NULL;
+}
+
+// Python args: (name: str, data: str/bytes, shared: int, device: int).
+//
+// Loads `data` as a CUDA module, looks up the function `name` in it and
+// returns the tuple (module handle, function handle, n_regs, n_spills,
+// n_max_threads). Returns NULL with a Python exception set on failure.
+// The driver calls run with the GIL released.
+static PyObject *loadBinary(PyObject *self, PyObject *args) {
+  const char *name;
+  const char *data;
+  Py_ssize_t data_size;
+  int shared;
+  int device;
+  if (!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared,
+                        &device)) {
+    return NULL;
+  }
+  CUfunction fun;
+  CUmodule mod;
+  int32_t n_regs = 0;
+  int32_t n_spills = 0;
+  int32_t n_max_threads = 0;
+  // create driver handles
+  CUcontext pctx = 0;
+
+  Py_BEGIN_ALLOW_THREADS;
+  // If no context is current, retain the device's primary context and make it
+  // current.
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxGetCurrent(&pctx));
+  if (!pctx) {
+    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+        cuDevicePrimaryCtxRetain(&pctx, device));
+    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxSetCurrent(pctx));
+  }
+
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuModuleLoadData(&mod, data));
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+      cuModuleGetFunction(&fun, mod, name));
+  // get allocated registers and spilled registers from the function
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+      cuFuncGetAttribute(&n_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, fun));
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+      cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun));
+  // The attribute is a byte count; dividing by 4 reports it in 4-byte units.
+  n_spills /= 4;
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncGetAttribute(
+      &n_max_threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, fun));
+  // set dynamic shared memory if necessary
+  int shared_optin;
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuDeviceGetAttribute(
+      &shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
+      device));
+  // Only when both the request and the opt-in maximum exceed 49152 bytes is
+  // the function's dynamic shared memory limit raised, to the opt-in maximum
+  // minus the function's static shared memory.
+  if (shared > 49152 && shared_optin > 49152) {
+    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+        cuFuncSetCacheConfig(fun, CU_FUNC_CACHE_PREFER_SHARED));
+    int shared_total, shared_static;
+    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuDeviceGetAttribute(
+        &shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
+        device));
+    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncGetAttribute(
+        &shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun));
+    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+        cuFuncSetAttribute(fun, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
+                           shared_optin - shared_static));
+  }
+  Py_END_ALLOW_THREADS;
+
+  if (PyErr_Occurred()) {
+    return NULL;
+  }
+  // The handles are returned to Python as unsigned 64-bit integers.
+  return Py_BuildValue("(KKiii)", (uint64_t)mod, (uint64_t)fun, n_regs,
+                       n_spills, n_max_threads);
+}
+
+// Function-pointer type for cuOccupancyMaxActiveClusters, which is resolved
+// at runtime with dlsym rather than linked directly.
+typedef CUresult (*cuOccupancyMaxActiveClusters_t)(
+    int *numClusters, CUfunction func, const CUlaunchConfig *config);
+
+// Function-pointer type for cuTensorMapEncodeTiled, likewise resolved at
+// runtime with dlsym.
+typedef CUresult (*cuTensorMapEncodeTiled_t)(
+    CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType,
+    cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim,
+    const cuuint64_t *globalStrides, const cuuint32_t *boxDim,
+    const cuuint32_t *elementStrides, CUtensorMapInterleave interleave,
+    CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion,
+    CUtensorMapFloatOOBfill oobFill);
+
+// Defines `static symbolName##_t name()`, which opens libcuda.so.1 and looks
+// up `symbolName` with dlsym.
+//
+// The generated function returns the function pointer on success. On any
+// failure (library cannot be opened, dlsym reports an error, or the symbol
+// resolves to NULL) it sets a Python RuntimeError and returns NULL, so a NULL
+// result always comes with an exception set. The library handle is left open
+// on success, which keeps the returned pointer usable.
+#define defineGetFunctionHandle(name, symbolName)                              \
+  static symbolName##_t name() {                                               \
+    /* Open the shared library */                                              \
+    void *libHandle = dlopen("libcuda.so.1", RTLD_LAZY);                       \
+    if (!libHandle) {                                                          \
+      PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1");      \
+      return NULL;                                                             \
+    }                                                                          \
+    /* Clear any existing error */                                             \
+    dlerror();                                                                 \
+    symbolName##_t funcHandle = (symbolName##_t)dlsym(libHandle, #symbolName); \
+    /* Check for errors */                                                     \
+    const char *err = dlerror();                                               \
+    if (err || !funcHandle) {                                                  \
+      PyErr_SetString(PyExc_RuntimeError,                                      \
+                      "Failed to retrieve " #symbolName " from libcuda.so.1"); \
+      dlclose(libHandle);                                                      \
+      return NULL;                                                             \
+    }                                                                          \
+    return funcHandle;                                                         \
+  }
+
+// Loader for cuOccupancyMaxActiveClusters.
+defineGetFunctionHandle(getCuOccupancyMaxActiveClustersHandle,
+                        cuOccupancyMaxActiveClusters);
+
+// Loader for cuTensorMapEncodeTiled.
+defineGetFunctionHandle(getCuTensorMapEncodeTiledHandle,
+                        cuTensorMapEncodeTiled);
+
+// Python args: (function handle: int, shared: int, clusterDim: int).
+//
+// Sets the function's max dynamic shared memory to `shared`, allows
+// non-portable cluster sizes, and returns the cluster count reported by
+// cuOccupancyMaxActiveClusters for a launch config of `clusterDim` blocks of
+// 128 threads using `shared` bytes of shared memory. Returns NULL with a
+// Python exception set on failure.
+static PyObject *occupancyMaxActiveClusters(PyObject *self, PyObject *args) {
+  int clusterDim = -1, maxActiveClusters = -1;
+  int shared = 0;
+  CUfunction func;
+
+  if (!PyArg_ParseTuple(args, "Kii", &func, &shared, &clusterDim)) {
+    return NULL;
+  }
+
+  // Let each SM have one block
+  int maxActiveBlocks = 1;
+  Py_BEGIN_ALLOW_THREADS;
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncSetAttribute(
+      func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared));
+  Py_END_ALLOW_THREADS;
+
+  // Single launch attribute: a clusterDim x 1 x 1 cluster.
+  CUlaunchAttribute launchAttr[1];
+  launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+  launchAttr[0].value.clusterDim.x = clusterDim;
+  launchAttr[0].value.clusterDim.y = 1;
+  launchAttr[0].value.clusterDim.z = 1;
+  CUlaunchConfig config;
+  config.gridDimX = clusterDim * maxActiveBlocks;
+  config.gridDimY = 1;
+  config.gridDimZ = 1;
+  config.blockDimX = 128;
+  config.blockDimY = 1;
+  config.blockDimZ = 1;
+  config.sharedMemBytes = shared;
+  config.hStream = 0;
+  config.numAttrs = 1;
+  config.attrs = launchAttr;
+
+  // Resolved once via dlsym and cached in the static pointer; jumps to
+  // `cleanup` if the symbol cannot be loaded.
+  static cuOccupancyMaxActiveClusters_t cuOccupancyMaxActiveClusters = NULL;
+  INITIALIZE_FUNCTION_POINTER_IF_NULL(cuOccupancyMaxActiveClusters,
+                                      getCuOccupancyMaxActiveClustersHandle);
+
+  Py_BEGIN_ALLOW_THREADS;
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncSetAttribute(
+      func, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1));
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+      cuOccupancyMaxActiveClusters(&maxActiveClusters, func, &config));
+  Py_END_ALLOW_THREADS;
+  return PyLong_FromLong(maxActiveClusters);
+
+cleanup:
+  return NULL;
+}
+
+// Python args: (size: int, must be >= 0).
+//
+// Sets CU_LIMIT_PRINTF_FIFO_SIZE for the current context to `size`, unless
+// the limit already has that value. Returns None, or NULL with a Python
+// exception set on failure.
+static PyObject *setPrintfFifoSize(PyObject *self, PyObject *args) {
+  long size;
+  if (!PyArg_ParseTuple(args, "l", &size))
+    return NULL;
+
+  if (size < 0) {
+    PyErr_SetString(PyExc_ValueError, "fifo size must be non-negative");
+    return NULL;
+  }
+
+  Py_BEGIN_ALLOW_THREADS;
+
+  // A current context is required; when there is none, retain the primary
+  // context of device 0 and make it current.
+  CUcontext current = NULL;
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxGetCurrent(&current));
+  if (current == NULL) {
+    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+        cuDevicePrimaryCtxRetain(&current, /*device=*/0));
+    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxSetCurrent(current));
+  }
+
+  // The fifo size cannot be set once a kernel that calls printf has run, not
+  // even to the value it already has. To stay friendly, read the present
+  // limit first and only issue the set() call when the value really changes.
+  size_t presentLimit = 0;
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+      cuCtxGetLimit(&presentLimit, CU_LIMIT_PRINTF_FIFO_SIZE));
+  const bool alreadySet = (presentLimit == size);
+  if (!alreadySet) {
+    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+        cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, size));
+  }
+
+  Py_END_ALLOW_THREADS;
+  Py_RETURN_NONE;
+}
+
+// tp_alloc slot: obtains tp_basicsize bytes of 128-byte-aligned storage with
+// posix_memalign and initializes the object header. `n_items` is not used.
+// Returns NULL with MemoryError set if the allocation fails.
+static PyObject *PyCUtensorMap_alloc(PyTypeObject *type, Py_ssize_t n_items) {
+  void *storage = NULL;
+  const size_t nbytes = type->tp_basicsize;
+
+  if (posix_memalign(&storage, 128, nbytes) != 0)
+    return PyErr_NoMemory();
+
+  PyCUtensorMapObject *obj = (PyCUtensorMapObject *)storage;
+  PyObject_INIT(obj, type);
+  return (PyObject *)obj;
+}
+
+// tp_dealloc slot: hands the object to its type's tp_free slot.
+static void PyCUtensorMap_dealloc(PyObject *self) {
+  PyTypeObject *type = Py_TYPE(self);
+  type->tp_free(self);
+}
+
+// tp_free slot: releases the object's storage with free().
+static void PyCUtensorMap_free(void *ptr) { free(ptr); }
+
+// clang-format off
+// Type object for PyCUtensorMap. Instances are created with
+// PyType_GenericNew; allocation, deallocation and freeing use the custom
+// PyCUtensorMap_alloc / PyCUtensorMap_dealloc / PyCUtensorMap_free slots.
+static PyTypeObject PyCUtensorMapType = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "triton.backends.nvidia.PyCUtensorMap",
+    .tp_basicsize = sizeof(PyCUtensorMapObject),
+    .tp_itemsize = 0,
+    .tp_flags = Py_TPFLAGS_DEFAULT,
+    .tp_doc = "<PyCUtensorMap object>",
+    .tp_new = PyType_GenericNew,
+    .tp_alloc = PyCUtensorMap_alloc,
+    .tp_dealloc = (destructor)PyCUtensorMap_dealloc,
+    .tp_free = PyCUtensorMap_free,
+};
+// clang-format on
+
+// Python args: (global_address, swizzle, elemSize, elemType, blockSize,
+// shape, strides, padding).
+//
+// Encodes a tiled tensor map with cuTensorMapEncodeTiled and returns it
+// wrapped in a new PyCUtensorMap object. `blockSize`, `shape` and `strides`
+// are sequences of ints of equal length (the rank, which must be 1..5 because
+// the local arrays hold five entries). The sequences are reversed before
+// being passed to the driver, and each stride is multiplied by `elemSize`.
+// `padding == 1` selects the NaN out-of-bounds fill mode, anything else
+// selects no fill.
+//
+// Returns NULL with a Python exception set on failure.
+static PyObject *fillTMADescriptor(PyObject *self, PyObject *args) {
+  unsigned long long global_address;
+  int swizzle;
+  int elemSize;
+  int elemType;
+  PyObject *blockSize;
+  PyObject *shape;
+  PyObject *strides;
+  int padding;
+
+  if (!PyArg_ParseTuple(args, "KiiiOOOi", &global_address, &swizzle, &elemSize,
+                        &elemType, &blockSize, &shape, &strides, &padding)) {
+    return NULL;
+  }
+
+  PyCUtensorMapObject *desc = (PyCUtensorMapObject *)PyObject_CallObject(
+      (PyObject *)&PyCUtensorMapType, NULL);
+  if (!desc) {
+    return NULL;
+  }
+
+  PyObject *blockSizeFast = NULL;
+  PyObject *shapeFast = NULL;
+  PyObject *stridesFast = NULL;
+
+  uint32_t blockSizeInt[5];
+  uint64_t shapeInt[5];
+  uint64_t stridesLL[5];
+
+  blockSizeFast = PySequence_Fast(blockSize, "blockSize must be a sequence");
+  if (!blockSizeFast)
+    goto cleanup;
+
+  // The rank indexes the fixed-size arrays above, so it must be validated
+  // before any element is written: 0 would index [-1], >5 would overflow.
+  Py_ssize_t rankSize = PySequence_Fast_GET_SIZE(blockSizeFast);
+  if (rankSize < 1 || rankSize > 5) {
+    PyErr_SetString(PyExc_ValueError, "rank must be between 1 and 5");
+    goto cleanup;
+  }
+  int rank = (int)rankSize;
+
+  // Inputs are stored in reverse order relative to the Python sequences.
+  for (int i = 0; i < rank; ++i) {
+    PyObject *item = PySequence_Fast_GET_ITEM(blockSizeFast, i);
+    if (!PyLong_Check(item)) {
+      PyErr_SetString(PyExc_TypeError, "block size must be an int");
+      goto cleanup;
+    }
+    blockSizeInt[rank - i - 1] = PyLong_AsLongLong(item);
+  }
+  // A failed conversion (e.g. overflow) leaves an exception pending.
+  if (PyErr_Occurred())
+    goto cleanup;
+
+  shapeFast = PySequence_Fast(shape, "shape must be a sequence");
+  if (!shapeFast)
+    goto cleanup;
+
+  if (rank != PySequence_Fast_GET_SIZE(shapeFast)) {
+    PyErr_SetString(PyExc_RuntimeError, "Rank mismatch");
+    goto cleanup;
+  }
+  for (int i = 0; i < rank; ++i) {
+    PyObject *item = PySequence_Fast_GET_ITEM(shapeFast, i);
+    if (!PyLong_Check(item)) {
+      PyErr_SetString(PyExc_TypeError, "shape must be an int");
+      goto cleanup;
+    }
+    shapeInt[rank - i - 1] = PyLong_AsLong(item);
+  }
+  if (PyErr_Occurred())
+    goto cleanup;
+
+  stridesFast = PySequence_Fast(strides, "strides must be a sequence");
+  if (!stridesFast)
+    goto cleanup;
+
+  if (rank != PySequence_Fast_GET_SIZE(stridesFast)) {
+    PyErr_SetString(PyExc_RuntimeError, "Rank mismatch");
+    goto cleanup;
+  }
+  // The last stride of the Python sequence is not read; the remaining ones
+  // are scaled by elemSize and stored reversed in slots 0..rank-2.
+  for (int i = 0; i + 1 < rank; ++i) {
+    PyObject *item = PySequence_Fast_GET_ITEM(stridesFast, i);
+    if (!PyLong_Check(item)) {
+      PyErr_SetString(PyExc_TypeError, "strides must be an int");
+      goto cleanup;
+    }
+    stridesLL[rank - i - 2] = elemSize * PyLong_AsLongLong(item);
+  }
+  if (PyErr_Occurred())
+    goto cleanup;
+  // The final slot is derived from the outermost extent and the previous
+  // stride (or elemSize when rank is 1).
+  stridesLL[rank - 1] =
+      shapeInt[rank - 1] * (rank == 1 ? elemSize : stridesLL[rank - 2]);
+  Py_DECREF(blockSizeFast);
+  blockSizeFast = NULL;
+  Py_DECREF(shapeFast);
+  shapeFast = NULL;
+  Py_DECREF(stridesFast);
+  stridesFast = NULL;
+
+  CUtensorMapFloatOOBfill fill =
+      (padding == 1) ? CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
+                     : CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE;
+
+  uint32_t elementStrides[5] = {1, 1, 1, 1, 1};
+  // Resolved once via dlsym and cached in the static pointer.
+  static cuTensorMapEncodeTiled_t cuTensorMapEncodeTiled = NULL;
+  INITIALIZE_FUNCTION_POINTER_IF_NULL(cuTensorMapEncodeTiled,
+                                      getCuTensorMapEncodeTiledHandle);
+  CUresult res = cuTensorMapEncodeTiled(
+      &desc->tensorMap, elemType, rank, (void *)global_address, shapeInt,
+      stridesLL, blockSizeInt, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE,
+      swizzle, CU_TENSOR_MAP_L2_PROMOTION_L2_128B, fill);
+  if (res != CUDA_SUCCESS) {
+    // Build a detailed message listing every argument given to the driver.
+    const char *str = NULL;
+    cuGetErrorString(res, &str);
+    char err[4096] = {0};
+    size_t off = 0;
+    off += snprintf(
+        err + off, sizeof(err) - off,
+        "Triton Error [CUDA]: Failed to create tensor map descriptor: %s\n",
+        str ? str : "Unknown error");
+    off += snprintf(err + off, sizeof(err) - off,
+                    "elemType=%d rank=%d global_address=0x%llx elemSize=%d "
+                    "swizzle=%d padding=%d\n",
+                    elemType, rank, (unsigned long long)global_address,
+                    elemSize, swizzle, padding);
+    off += snprintf(err + off, sizeof(err) - off, "shape=[");
+    for (int i = 0; i < rank; ++i) {
+      off +=
+          snprintf(err + off, sizeof(err) - off, "%llu%s",
+                   (unsigned long long)shapeInt[i], (i + 1 < rank) ? ", " : "");
+    }
+    off += snprintf(err + off, sizeof(err) - off, "]\n");
+    off += snprintf(err + off, sizeof(err) - off, "strides=[");
+    for (int i = 0; i < rank; ++i) {
+      off += snprintf(err + off, sizeof(err) - off, "%llu%s",
+                      (unsigned long long)stridesLL[i],
+                      (i + 1 < rank) ? ", " : "");
+    }
+    off += snprintf(err + off, sizeof(err) - off, "]\n");
+    off += snprintf(err + off, sizeof(err) - off, "blockSize=[");
+    for (int i = 0; i < rank; ++i) {
+      off += snprintf(err + off, sizeof(err) - off, "%u%s",
+                      (unsigned)blockSizeInt[i], (i + 1 < rank) ? ", " : "");
+    }
+    off += snprintf(err + off, sizeof(err) - off, "] elementStrides=[");
+    for (int i = 0; i < rank; ++i) {
+      off += snprintf(err + off, sizeof(err) - off, "%u%s",
+                      (unsigned)elementStrides[i], (i + 1 < rank) ? ", " : "");
+    }
+    off += snprintf(err + off, sizeof(err) - off, "]\n");
+    PyErr_SetString(PyExc_RuntimeError, err);
+
+    goto cleanup;
+  }
+
+  return (PyObject *)desc;
+
+cleanup:
+  // Every pointer is either NULL or owned here, so XDECREF is safe for all.
+  Py_XDECREF(blockSizeFast);
+  Py_XDECREF(shapeFast);
+  Py_XDECREF(stridesFast);
+  Py_XDECREF(desc);
+  return NULL;
+}
+
+// Method table of the cuda_utils module: maps each Python-visible name to its
+// C implementation. All entries use the METH_VARARGS calling convention.
+static PyMethodDef ModuleMethods[] = {
+    {"load_binary", loadBinary, METH_VARARGS,
+     "Load provided cubin into CUDA driver"},
+    {"get_device_properties", getDeviceProperties, METH_VARARGS,
+     "Get the properties for a given device"},
+    {"cuOccupancyMaxActiveClusters", occupancyMaxActiveClusters, METH_VARARGS,
+     "Python interface for cuOccupancyMaxActiveClusters function"},
+    {"set_printf_fifo_size", setPrintfFifoSize, METH_VARARGS,
+     "Python interface for cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, x), which "
+     "controls how many bytes can be streamed from kernels before data starts "
+     "being dropped.  This inherits all the limitations of this call; in "
+     "particular it's an error to change this value after launching any kernel "
+     "that calls printf()."},
+    {"fill_tma_descriptor", fillTMADescriptor, METH_VARARGS, "doc"},
+
+    {NULL, NULL, 0, NULL} // sentinel
+};
+
+// Module definition for "cuda_utils": no docstring, size -1, and
+// ModuleMethods as its method table.
+static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "cuda_utils",
+                                       NULL, // documentation
+                                       -1,   // size
+                                       ModuleMethods};
+
+// Module initializer: readies the PyCUtensorMap type, creates the cuda_utils
+// module and exposes the type on it as "PyCUtensorMap".
+// Returns the module, or NULL with a Python exception set on failure.
+PyMODINIT_FUNC PyInit_cuda_utils(void) {
+  if (PyType_Ready(&PyCUtensorMapType) < 0) {
+    return NULL;
+  }
+
+  PyObject *m = PyModule_Create(&ModuleDef);
+  if (m == NULL) {
+    return NULL;
+  }
+
+  if (PyModule_AddFunctions(m, ModuleMethods) < 0) {
+    Py_DECREF(m);
+    return NULL;
+  }
+
+  // PyModule_AddObject steals the reference only on success, so on failure
+  // both the extra type reference and the module must be released here.
+  Py_INCREF(&PyCUtensorMapType);
+  if (PyModule_AddObject(m, "PyCUtensorMap", (PyObject *)&PyCUtensorMapType) <
+      0) {
+    Py_DECREF(&PyCUtensorMapType);
+    Py_DECREF(m);
+    return NULL;
+  }
+
+  return m;
+}
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/driver.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/driver.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a2ddb378d32165b53653e4540135fdc0080a1ba
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/driver.py
@@ -0,0 +1,764 @@
+import functools
+import os
+import subprocess
+import triton
+import re
+from pathlib import Path
+from triton import knobs
+from triton.runtime.build import compile_module_from_src
+from triton.runtime import _allocation
+from triton.backends.compiler import GPUTarget
+from triton.backends.driver import GPUDriver
+
+# Directory containing this file (symlinks resolved); the paths below are
+# derived from it.
+dirname = os.path.dirname(os.path.realpath(__file__))
+include_dirs = [os.path.join(dirname, "include")]
+libdevice_dir = os.path.join(dirname, "lib")
+# Libraries passed to compile_module_from_src when CudaUtils builds driver.c.
+libraries = ['libcuda.so.1']
+# Replaced with the compiled module's PyCUtensorMap type by CudaUtils.__init__.
+PyCUtensorMap = None
+
+
+@functools.lru_cache()
+def libcuda_dirs():
+    if env_libcuda_path := knobs.nvidia.libcuda_path:
+        return [env_libcuda_path]
+
+    libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode(errors="ignore")
+    # each line looks like the following:
+    # libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1
+    locs = [line.split()[-1] for line in libs.splitlines() if "libcuda.so.1" in line]
+    dirs = [os.path.dirname(loc) for loc in locs]
+    env_ld_library_path = os.getenv("LD_LIBRARY_PATH")
+    if env_ld_library_path and not dirs:
+        dirs = [dir for dir in env_ld_library_path.split(":") if os.path.exists(os.path.join(dir, "libcuda.so.1"))]
+    msg = 'libcuda.so cannot found!\n'
+    if locs:
+        msg += 'Possible files are located at %s.' % str(locs)
+        msg += 'Please create a symlink of libcuda.so to any of the files.'
+    else:
+        msg += 'Please make sure GPU is set up and then run "/sbin/ldconfig"'
+        msg += ' (requires sudo) to refresh the linker cache.'
+    assert any(os.path.exists(os.path.join(path, 'libcuda.so.1')) for path in dirs), msg
+    return dirs
+
+
+@functools.lru_cache()
+def library_dirs():
+    return [libdevice_dir, *libcuda_dirs()]
+
+
+# ------------------------
+# Utils
+# ------------------------
+
+
+class CudaUtils(object):
+
+    def __new__(cls):
+        if not hasattr(cls, "instance"):
+            cls.instance = super(CudaUtils, cls).__new__(cls)
+        return cls.instance
+
+    def __init__(self):
+        mod = compile_module_from_src(
+            src=Path(os.path.join(dirname, "driver.c")).read_text(),
+            name="cuda_utils",
+            library_dirs=library_dirs(),
+            include_dirs=include_dirs,
+            libraries=libraries,
+        )
+        global PyCUtensorMap
+        PyCUtensorMap = mod.PyCUtensorMap
+        self.load_binary = mod.load_binary
+        self.get_device_properties = mod.get_device_properties
+        self.cuOccupancyMaxActiveClusters = mod.cuOccupancyMaxActiveClusters
+        self.set_printf_fifo_size = mod.set_printf_fifo_size
+        self.fill_tma_descriptor = mod.fill_tma_descriptor
+
+
+# ------------------------
+# Launcher
+# ------------------------
+
+
+def ty_to_cpp(ty):
+    if ty[0] == '*':
+        return "CUdeviceptr"
+    if ty.startswith("tensordesc"):
+        return "CUtensorMap"
+    return {
+        "i1": "int8_t",
+        "i8": "int8_t",
+        "i16": "int16_t",
+        "i32": "int32_t",
+        "i64": "int64_t",
+        "u1": "uint8_t",
+        "u8": "uint8_t",
+        "u16": "uint16_t",
+        "u32": "uint32_t",
+        "u64": "uint64_t",
+        "fp16": "double",
+        "bf16": "double",
+        "fp32": "double",
+        "f32": "double",
+        "fp64": "double",
+        "nvTmaDesc": "CUtensorMap",
+    }[ty]
+
+
+# Floating-point type name -> unsigned integer C type that make_launcher uses
+# to declare the corresponding kernel argument and its packed storage.
+FLOAT_STORAGE_TYPE = {
+    "fp16": "uint16_t",
+    "bf16": "uint16_t",
+    "fp32": "uint32_t",
+    "f32": "uint32_t",
+    "fp64": "uint64_t",
+}
+# Floating-point type name -> name of the pack function that the generated
+# launcher code calls to produce the storage value.
+FLOAT_PACK_FUNCTION = {
+    "fp16": "pack_fp16",
+    "bf16": "pack_bf16",
+    "fp32": "pack_fp32",
+    "f32": "pack_fp32",
+    "fp64": "pack_fp64",
+}
+
+# Argument-format prefix that make_launcher prepends to the per-kernel
+# argument format string, and its length.
+_BASE_ARGS_FORMAT = "iiiKKppOOOOOO"
+_BASE_ARGS_FORMAT_LEN = len(_BASE_ARGS_FORMAT)
+
+
+def make_launcher(constants, signature, tensordesc_meta):
+
+    def _expand_signature(signature):
+        output = []
+        tensordesc_idx = 0
+        # Expand tensor descriptor arguments into either nvTmaDesc, shape and
+        # strides, or base pointer, shape and strides depending on whether the
+        # kernel was lowered to use the nvTmaDesc or not.
+        for sig in signature:
+            if isinstance(sig, str) and sig.startswith("tensordesc"):
+                meta = tensordesc_meta[tensordesc_idx] if tensordesc_meta else None
+                tensordesc_idx += 1
+
+                match = re.match("tensordesc<([^[>]*)\\[([^]]*)\\]", sig)
+                dtype = match.group(1)
+                shape = match.group(2)
+                ndim = shape.count(",") + 1
+
+                if meta is None:
+                    output.append("*" + dtype)
+                    # Currently the host side tensor descriptors get passed in as a
+                    # tensor desc, shape, and strides. We have no way to use these
+                    # shape and strides when processing tensor descriptors which is
+                    # why we provide our own decomposition above. Sadly this means
+                    # we have to pass the shape and strides twice.
+                    for _ in range(2 * ndim):
+                        output.append("i64")
+                    output.append("i1")
+                else:
+                    output.append("nvTmaDesc")
+
+                for _ in range(ndim):
+                    output.append("i32")
+                for _ in range(ndim):
+                    output.append("i64")
+            else:
+                output.append(sig)
+
+        assert not tensordesc_meta or tensordesc_idx == len(tensordesc_meta)
+        return output
+
+    def _flatten_signature(sig, output):
+        # Flatten tuples
+        if isinstance(sig, tuple):
+            for x in sig:
+                _flatten_signature(x, output)
+        else:
+            output.append(sig)
+
+    def _extracted_type(ty):
+        if isinstance(ty, tuple):
+            val = ','.join(map(_extracted_type, ty))
+            return f"[{val}]"
+        if ty[0] == '*':
+            return "PyObject*"
+        if ty in ("constexpr", "nvTmaDesc"):
+            return "PyObject*"
+        return ty_to_cpp(ty)
+
+    def format_of(ty):
+        if isinstance(ty, tuple):
+            val = ''.join(map(format_of, ty))
+            return f"({val})"
+        if ty[0] == '*':
+            return "O"
+        if ty in ("constexpr", "nvTmaDesc"):
+            return "O"
+        if ty.startswith("tensordesc"):
+            return "O"
+        return {
+            "double": "d",
+            "long": "l",
+            "int8_t": "b",
+            "int16_t": "h",
+            "int32_t": "i",
+            "int64_t": "L",
+            "uint8_t": "B",
+            "uint16_t": "H",
+            "uint32_t": "I",
+            "uint64_t": "K",
+        }[ty_to_cpp(ty)]
+
+    expand_signature = _expand_signature(signature.values())
+    signature = {i: s for i, s in enumerate(expand_signature)}
+
+    args_format = ''.join([format_of(ty) for ty in signature.values()])
+    format = _BASE_ARGS_FORMAT + args_format
+
+    flat_signature = []
+    for sig in signature.values():
+        _flatten_signature(sig, flat_signature)
+    signature = {i: s for i, s in enumerate(flat_signature)}
+    args_list = ', ' + ', '.join(f"&_arg{i}" for i, ty in signature.items()) if len(signature) > 0 else ''
+    # Record the end of regular arguments;
+    # subsequent arguments are architecture-specific descriptors, such as tensor descriptors for CUDA.
+    arg_decl_list = []
+    for i, ty in signature.items():
+        if ty == "constexpr":
+            continue
+        if ty in FLOAT_STORAGE_TYPE:
+            arg_decl_list.append(f"{FLOAT_STORAGE_TYPE[ty]} arg{i}")
+        else:
+            arg_decl_list.append(f"{ty_to_cpp(ty)} arg{i}")
+    arg_decls = ', '.join(arg_decl_list)
+    internal_args_list = []
+    for i, ty in signature.items():
+        if ty[0] == "*":
+            internal_args_list.append(f"ptr_info{i}.dev_ptr")
+        elif ty in FLOAT_STORAGE_TYPE:
+            internal_args_list.append(f"_arg{i}_storage")
+        elif ty == "nvTmaDesc":
+            # Note: we have to dereference the pointer
+            internal_args_list.append(f"*tma_ptr{i}")
+        elif ty != "constexpr":
+            internal_args_list.append(f"_arg{i}")
+    params = range(len(signature))
+
+    # generate glue code
+    newline = '\n  '
+    ptr_decls = [
+        f"DevicePtrInfo ptr_info{i} = getPointer(_arg{i}, {i}); if (!ptr_info{i}.valid) return NULL;"
+        for i, ty in signature.items()
+        if ty[0] == "*"
+    ]
+    tma_decls = [
+        f"CUtensorMap* tma_ptr{i} = getTmaDesc(_arg{i}); if (!tma_ptr{i}) return NULL;" for i, ty in signature.items()
+        if ty == "nvTmaDesc"
+    ]
+    float_storage_decls = [
+        f"{FLOAT_STORAGE_TYPE[ty]} _arg{i}_storage = {FLOAT_PACK_FUNCTION[ty]}(_arg{i});"
+        for i, ty in signature.items()
+        if ty in FLOAT_STORAGE_TYPE
+    ]
+    params = [f"&arg{i}" for i, ty in signature.items() if ty != "constexpr"]
+    params.append("&global_scratch")
+    params.append("&profile_scratch")
+    src = f"""
+#include \"cuda.h\"
+#include <dlfcn.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+typedef struct {{
+  PyObject_HEAD;
+  _Alignas(128) CUtensorMap tensorMap;
+}} PyCUtensorMapObject;
+
+static inline void gpuAssert(CUresult code, const char *file, int line)
+{{
+   if (code != CUDA_SUCCESS)
+   {{
+      const char* prefix = "Triton Error [CUDA]: ";
+      const char* str;
+      cuGetErrorString(code, &str);
+      char err[1024] = {{0}};
+      strcat(err, prefix);
+      strcat(err, str);
+      PyGILState_STATE gil_state;
+      gil_state = PyGILState_Ensure();
+      PyErr_SetString(PyExc_RuntimeError, err);
+      PyGILState_Release(gil_state);
+   }}
+}}
+
+#define CUDA_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }}
+
+typedef CUresult (*cuLaunchKernelEx_t)(const CUlaunchConfig* config, CUfunction f, void** kernelParams, void** extra);
+
+static cuLaunchKernelEx_t getLaunchKernelExHandle() {{
+  // Open the shared library
+  void* handle = dlopen("libcuda.so.1", RTLD_LAZY);
+  if (!handle) {{
+    PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1");
+    return NULL;
+  }}
+  // Clear any existing error
+  dlerror();
+  cuLaunchKernelEx_t cuLaunchKernelExHandle = (cuLaunchKernelEx_t)dlsym(handle, "cuLaunchKernelEx");
+  // Check for errors
+  const char *dlsym_error = dlerror();
+  if (dlsym_error) {{
+    PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from libcuda.so.1");
+    return NULL;
+  }}
+  return cuLaunchKernelExHandle;
+}}
+
+static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int launch_cooperative_grid, int launch_pdl, int shared_memory, CUstream stream, CUfunction function, CUdeviceptr global_scratch, CUdeviceptr profile_scratch{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{
+  void *params[] = {{ {', '.join(params)} }};
+  if (gridX*gridY*gridZ > 0) {{
+    // 4 attributes that we can currently pass maximum
+    CUlaunchAttribute launchAttr[4];
+    static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
+    if (cuLaunchKernelExHandle == NULL) {{
+      cuLaunchKernelExHandle = getLaunchKernelExHandle();
+    }}
+    CUlaunchConfig config;
+    config.gridDimX = gridX * num_ctas;
+    config.gridDimY = gridY;
+    config.gridDimZ = gridZ;
+
+    config.blockDimX = 32 * num_warps;
+    config.blockDimY = 1;
+    config.blockDimZ = 1;
+    config.sharedMemBytes = shared_memory;
+    config.hStream = stream;
+    config.attrs = launchAttr;
+    int num_attrs = 0;
+
+    if (launch_pdl != 0) {{
+      CUlaunchAttribute pdlAttr = {{ .id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION, .value = 1}};
+      launchAttr[num_attrs] = pdlAttr;
+      ++num_attrs;
+    }}
+
+    if (launch_cooperative_grid != 0) {{
+      CUlaunchAttribute coopAttr = {{ .id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE, .value = 1}};
+      launchAttr[num_attrs] = coopAttr;
+      ++num_attrs;
+    }}
+
+    if (num_ctas != 1) {{
+      CUlaunchAttribute clusterAttr = {{}};
+      clusterAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+      clusterAttr.value.clusterDim.x = num_ctas;
+      clusterAttr.value.clusterDim.y = 1;
+      clusterAttr.value.clusterDim.z = 1;
+      launchAttr[num_attrs] = clusterAttr;
+      ++num_attrs;
+
+      CUlaunchAttribute clusterSchedulingAttr = {{}};
+      clusterSchedulingAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
+      clusterSchedulingAttr.value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
+      launchAttr[num_attrs] = clusterSchedulingAttr;
+      ++num_attrs;
+    }}
+
+    // num_ctas == 16 is non-portable. Does work for H100 and B200 tho
+    config.numAttrs = num_attrs;
+    if (num_ctas == 16) {{
+      CUDA_CHECK(cuFuncSetAttribute(
+          function,
+          CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED,
+          1
+      ));
+    }}
+
+    CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
+  }}
+}}
+
+typedef struct _DevicePtrInfo {{
+    CUdeviceptr dev_ptr;
+    bool valid;
+}} DevicePtrInfo;
+
+static PyObject* data_ptr_str = NULL;
+static PyObject* py_tensor_map_type = NULL;
+
+static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {{
+  DevicePtrInfo ptr_info;
+  ptr_info.dev_ptr = 0;
+  ptr_info.valid = true;
+  if (PyLong_Check(obj)) {{
+    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(obj);
+    return ptr_info;
+  }}
+  if (obj == Py_None) {{
+    // valid nullptr
+    return ptr_info;
+  }}
+  PyObject *ret = PyObject_CallMethodNoArgs(obj, data_ptr_str);
+  if (!ret) {{
+    PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
+    ptr_info.valid = false;
+    goto cleanup;
+  }}
+  if (!PyLong_Check(ret)) {{
+    PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
+    ptr_info.valid = false;
+    goto cleanup;
+  }}
+  ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(ret);
+  if(!ptr_info.dev_ptr)
+    return ptr_info;
+  uint64_t dev_ptr;
+  int status = cuPointerGetAttribute(&dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
+  if (status == CUDA_ERROR_INVALID_VALUE) {{
+      PyErr_Format(PyExc_ValueError,
+                   "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
+      ptr_info.valid = false;
+  }} else if (status != CUDA_SUCCESS) {{
+      CUDA_CHECK(status);  // Catch any other cuda API errors
+      ptr_info.valid = false;
+  }}
+  ptr_info.dev_ptr = dev_ptr;
+cleanup:
+  Py_XDECREF(ret);
+  return ptr_info;
+
+}}
+
+static inline CUtensorMap* getTmaDesc(PyObject *obj) {{
+  if (sizeof(CUtensorMap*) != 8) {{
+    PyErr_SetString(PyExc_SystemError, "getTmaDesc() requires 64-bit compilation");
+    return NULL;
+  }}
+
+if (Py_TYPE(obj) != (PyTypeObject*)py_tensor_map_type) {{
+    PyErr_Format(PyExc_TypeError, "object must be of type PyCUtensorMap, got %s", Py_TYPE(obj)->tp_name);
+    return NULL;
+}}
+
+  CUtensorMap* map = &((PyCUtensorMapObject*)obj)->tensorMap;
+  uintptr_t align_128 = (uintptr_t)map & (128 - 1);
+  if (align_128 != 0) {{
+    PyErr_Format(PyExc_ValueError, "CUtensorMap must be aligned to 128B, but got (&map) mod 128 = %ld", align_128);
+    return NULL;
+  }}
+  return map;
+}}
+
+static void ensureCudaContext() {{
+  CUcontext pctx;
+  CUDA_CHECK(cuCtxGetCurrent(&pctx));
+  if (!pctx) {{
+    // Ensure device context.
+    CUdevice device;
+    CUDA_CHECK(cuDeviceGet(&device, 0));
+    CUDA_CHECK(cuDevicePrimaryCtxRetain(&pctx, device));
+    CUDA_CHECK(cuCtxSetCurrent(pctx));
+  }}
+}}
+
+static uint16_t pack_fp16(double f) {{
+    uint16_t result;
+    // from https://github.com/python/pythoncapi-compat
+#if 0x030600B1 <= PY_VERSION_HEX && PY_VERSION_HEX <= 0x030B00A1 && !defined(PYPY_VERSION)
+    _PyFloat_Pack2(f, (unsigned char*)&result, 1);
+#else
+    PyFloat_Pack2(f, (unsigned char*)&result, 1);
+#endif
+    return result;
+}}
+
+static uint16_t pack_bf16(double f) {{
+    float f32 = (float)f;
+    uint32_t u32 = *(uint32_t*)&f32;
+    return (uint16_t)(u32 >> 16);
+}}
+
+static uint32_t pack_fp32(double f) {{
+    float f32 = (float)f;
+    return *(uint32_t*)&f32;
+}}
+
+static uint64_t pack_fp64(double f) {{
+    return *(uint64_t*)&f;
+}}
+
+static PyObject* launch(PyObject* self, PyObject* args) {{
+  // ensure cuda context is valid before calling any CUDA APIs, e.g. before getPointer calls cuPointerGetAttributes
+  ensureCudaContext();
+
+  int gridX, gridY, gridZ;
+  uint64_t _stream;
+  uint64_t _function;
+  int launch_cooperative_grid;
+  int launch_pdl;
+  PyObject *launch_enter_hook = NULL;
+  PyObject *launch_exit_hook = NULL;
+  PyObject *kernel_metadata = NULL;
+  PyObject *launch_metadata = NULL;
+  PyObject *global_scratch_obj = NULL;
+  PyObject *profile_scratch_obj = NULL;
+  {newline.join([f"{_extracted_type(ty)} _arg{i};" for i, ty in signature.items()])}
+  if(!PyArg_ParseTuple(args, \"{format}\", &gridX, &gridY, &gridZ,
+                                           &_stream, &_function, &launch_cooperative_grid, &launch_pdl, &global_scratch_obj, &profile_scratch_obj,
+                                           &kernel_metadata, &launch_metadata,
+                                           &launch_enter_hook, &launch_exit_hook{args_list})) {{
+    return NULL;
+  }}
+
+  int num_warps, num_ctas, shared_memory;
+  if (!PyArg_ParseTuple(kernel_metadata, \"iii\", &num_warps, &num_ctas, &shared_memory)) {{
+    PyErr_SetString(PyExc_TypeError, "kernel_metadata must be a tuple");
+    return NULL;
+  }}
+
+  // extract launch metadata
+  if (launch_enter_hook != Py_None){{
+    PyObject* ret = PyObject_CallOneArg(launch_enter_hook, launch_metadata);
+    if (!ret)
+      return NULL;
+    Py_DECREF(ret);
+  }}
+
+  CUdeviceptr global_scratch = 0;
+  if (global_scratch_obj != Py_None) {{
+    DevicePtrInfo global_scratch_info = getPointer(global_scratch_obj, -1);
+    if (!global_scratch_info.valid) {{
+      return NULL;
+    }}
+    global_scratch = global_scratch_info.dev_ptr;
+  }}
+
+  CUdeviceptr profile_scratch = 0;
+  if (profile_scratch_obj != Py_None) {{
+    DevicePtrInfo profile_scratch_info = getPointer(profile_scratch_obj, -1);
+    if (!profile_scratch_info.valid) {{
+      return NULL;
+    }}
+    profile_scratch = profile_scratch_info.dev_ptr;
+  }}
+
+  // raise exception asap
+  {newline.join(ptr_decls)}
+  {newline.join(tma_decls)}
+  {newline.join(float_storage_decls)}
+  Py_BEGIN_ALLOW_THREADS;
+  _launch(gridX, gridY, gridZ, num_warps, num_ctas, launch_cooperative_grid, launch_pdl, shared_memory, (CUstream)_stream, (CUfunction)_function, global_scratch, profile_scratch{', ' + ', '.join(internal_args_list) if len(internal_args_list) > 0 else ''});
+  Py_END_ALLOW_THREADS;
+  if (PyErr_Occurred()) {{
+    return NULL;
+  }}
+
+  if(launch_exit_hook != Py_None){{
+    PyObject* ret = PyObject_CallOneArg(launch_exit_hook, launch_metadata);
+    if (!ret)
+      return NULL;
+    Py_DECREF(ret);
+  }}
+
+  Py_RETURN_NONE;
+}}
+
+static PyMethodDef ModuleMethods[] = {{
+  {{"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}},
+  {{NULL, NULL, 0, NULL}} // sentinel
+}};
+
+static struct PyModuleDef ModuleDef = {{
+  PyModuleDef_HEAD_INIT,
+  \"__triton_launcher\",
+  NULL, //documentation
+  -1, //size
+  ModuleMethods
+}};
+
+PyMODINIT_FUNC PyInit___triton_launcher(void) {{
+  data_ptr_str = PyUnicode_InternFromString("data_ptr");
+  if(data_ptr_str == NULL) {{
+    return NULL;
+  }}
+  PyObject* driver_mod = PyImport_ImportModule("triton.backends.nvidia.driver");
+  if (driver_mod == NULL) {{
+    return NULL;
+  }}
+  py_tensor_map_type = PyObject_GetAttrString(driver_mod, "PyCUtensorMap");
+  if (py_tensor_map_type == NULL) {{
+    return NULL;
+  }}
+
+  PyObject *m = PyModule_Create(&ModuleDef);
+  if(m == NULL) {{
+    return NULL;
+  }}
+  PyModule_AddFunctions(m, ModuleMethods);
+  return m;
+}}
+"""
+    return src
+
+
+# The TMA dtype enum values are slightly different on host vs device...
+TMA_DTYPE_DEVICE_TO_HOST = dict((i, i) for i in range(16))
+TMA_DTYPE_DEVICE_TO_HOST[8] = 10
+TMA_DTYPE_DEVICE_TO_HOST[9] = 8
+TMA_DTYPE_DEVICE_TO_HOST[10] = 9
+
+
+def make_tensordesc_arg(arg, metadata):
+    if metadata is None:
+        # Currently the host side tensor descriptors get decomposed in
+        # the frontend to tensor desc, shape, and strides. We have no
+        # way to use these shape and strides when processing tensor
+        # descriptors which is why we provide our own decomposition
+        # above. Sadly this means we have to pass the shape and strides
+        # twice.
+        return [arg.base, *arg.shape, *arg.strides, arg.padding == "nan", *arg.shape, *arg.strides]
+
+    swizzle = metadata["swizzle"]
+    elem_size = metadata["elem_size"]
+    elem_type = metadata["elem_type"]
+    block_size = metadata["block_size"]
+    fp4_padded = metadata["fp4_padded"]
+
+    shape = arg.shape
+    strides = arg.strides
+    assert strides[-1] == 1
+    padding = 1 if arg.padding == "nan" else 0
+
+    if fp4_padded:
+        shape = list(shape)
+        shape[-1] *= 2
+
+    cu_tensor_map = triton.runtime.driver.active.utils.fill_tma_descriptor(
+        arg.base.data_ptr(),
+        swizzle,
+        elem_size,
+        TMA_DTYPE_DEVICE_TO_HOST[elem_type],
+        block_size,
+        shape,
+        strides,
+        padding,
+    )
+
+    return [cu_tensor_map, *shape, *strides]
+
+
+def wrap_handle_tensordesc(launcher, signature, tensordesc_meta):
+    has_tensor_desc_arg = any(isinstance(sig, str) and sig.startswith("tensordesc") for sig in signature.values())
+    if not has_tensor_desc_arg:
+        return launcher
+
+    tensordesc_indices = set(
+        [i for i, sig in enumerate(signature.values()) if isinstance(sig, str) and sig.startswith("tensordesc")])
+    assert not tensordesc_meta or len(tensordesc_meta) == len(tensordesc_indices)
+    if not tensordesc_meta:
+        tensordesc_meta = [None] * len(tensordesc_indices)
+
+    def inner(*args):
+        final_args = list(args[:_BASE_ARGS_FORMAT_LEN])
+        tensordesc_idx = 0
+        for i, arg in enumerate(args[_BASE_ARGS_FORMAT_LEN:]):
+            if i in tensordesc_indices:
+                final_args.extend(make_tensordesc_arg(arg, tensordesc_meta[tensordesc_idx]))
+                tensordesc_idx += 1
+            else:
+                final_args.append(arg)
+        return launcher(*final_args)
+
+    return inner
+
+
+class CudaLauncher(object):
+
+    def __init__(self, src, metadata):
+        constants = src.constants if hasattr(src, "constants") else dict()
+        arg_idx = lambda x: (src.fn.arg_names.index(x), ) if isinstance(x, str) else x
+        constants = {arg_idx(idx): value for idx, value in constants.items()}
+        signature = {idx: value for idx, value in src.signature.items()}
+        tensordesc_meta = getattr(metadata, "tensordesc_meta", None)
+        src = make_launcher(constants, signature, tensordesc_meta)
+        mod = compile_module_from_src(
+            src=src,
+            name="__triton_launcher",
+            library_dirs=library_dirs(),
+            include_dirs=include_dirs,
+            libraries=libraries,
+        )
+
+        self.num_ctas = getattr(metadata, "num_ctas", 1)
+        self.launch = wrap_handle_tensordesc(mod.launch, signature, tensordesc_meta)
+        self.global_scratch_size = metadata.global_scratch_size
+        self.global_scratch_align = metadata.global_scratch_align
+        self.profile_scratch_size = metadata.profile_scratch_size
+        self.profile_scratch_align = metadata.profile_scratch_align
+        self.launch_cooperative_grid = metadata.launch_cooperative_grid
+        self.launch_pdl = metadata.launch_pdl
+
+    def __call__(self, gridX, gridY, gridZ, stream, function, *args):
+
+        def allocate_scratch(size, align, allocator):
+            if size > 0:
+                grid_size = gridX * gridY * gridZ
+                alloc_size = grid_size * self.num_ctas * size
+                alloc_fn = allocator.get()
+                return alloc_fn(alloc_size, align, stream)
+            return None
+
+        global_scratch = allocate_scratch(self.global_scratch_size, self.global_scratch_align, _allocation._allocator)
+        profile_scratch = allocate_scratch(self.profile_scratch_size, self.profile_scratch_align,
+                                           _allocation._profile_allocator)
+        self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,
+                    global_scratch, profile_scratch, *args)
+
+
+class CudaDriver(GPUDriver):
+
+    def __init__(self):
+        self.utils = CudaUtils()  # TODO: make static
+        self.launcher_cls = CudaLauncher
+        super().__init__()
+
+    def get_current_target(self):
+        device = self.get_current_device()
+        capability = self.get_device_capability(device)
+        capability = capability[0] * 10 + capability[1]
+        warp_size = 32
+        return GPUTarget("cuda", capability, warp_size)
+
+    def get_active_torch_device(self):
+        import torch
+        return torch.device("cuda", self.get_current_device())
+
+    def get_device_interface(self):
+        import torch
+        return torch.cuda
+
+    @staticmethod
+    def is_active():
+        try:
+            import torch
+            return torch.cuda.is_available() and (torch.version.hip is None)
+        except ImportError:
+            return False
+
+    def map_python_to_cpp_type(self, ty: str) -> str:
+        return ty_to_cpp(ty)
+
+    def get_benchmarker(self):
+        from triton.testing import do_bench
+        return do_bench
+
+    def get_empty_cache_for_benchmark(self):
+        import torch
+
+        # We maintain a buffer of 256 MB that we clear
+        # before each kernel call to make sure that the L2 cache
+        # doesn't contain any input data before the run
+        cache_size = 256 * 1024 * 1024
+        return torch.empty(int(cache_size // 4), dtype=torch.int, device='cuda')
+
+    def clear_cache(self, cache):
+        cache.zero_()
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/Openacc/cupti_openacc.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/Openacc/cupti_openacc.h
new file mode 100644
index 0000000000000000000000000000000000000000..b7ea50da7beb2187e77f7606dd70faed0e4b4add
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/Openacc/cupti_openacc.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#include <cuda_stdint.h>
+
+#if !defined(_CUPTI_OPENACC_H_)
+#define _CUPTI_OPENACC_H_
+
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+#if defined(__LP64__)
+#define CUPTILP64 1
+#elif defined(_WIN64)
+#define CUPTILP64 1
+#else
+#undef CUPTILP64
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \brief Initialize OpenACC support
+ *
+ * \param profRegister function of type acc_prof_reg as obtained from acc_register_library
+ * \param profUnregister function of type acc_prof_reg as obtained from acc_register_library
+ * \param profLookup function of type acc_prof_lookup as obtained from acc_register_library
+ */
+CUptiResult CUPTIAPI
+cuptiOpenACCInitialize(void *profRegister, void *profUnregister, void *profLookup);
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_OPENACC_H_*/
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/Openmp/cupti_openmp.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/Openmp/cupti_openmp.h
new file mode 100644
index 0000000000000000000000000000000000000000..303dd42878fb02774d872c197ccc27b17f2af69e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/Openmp/cupti_openmp.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#include <cuda_stdint.h>
+#include "Openmp/omp-tools.h"
+
+#if !defined(_CUPTI_OPENMP_H_)
+#define _CUPTI_OPENMP_H_
+
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+#if defined(__LP64__)
+#define CUPTILP64 1
+#elif defined(_WIN64)
+#define CUPTILP64 1
+#else
+#undef CUPTILP64
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \brief Initialize OPENMP support (deprecated, used before OpenMP 5.0)
+ *
+ */
+int CUPTIAPI cuptiOpenMpInitialize(ompt_function_lookup_t ompt_fn_lookup, const char *runtime_version, unsigned int ompt_version);
+
+/**
+ * \brief Initialize OPENMP support
+ *
+ */
+int CUPTIAPI cuptiOpenMpInitialize_v2(ompt_function_lookup_t lookup, int initial_device_num, ompt_data_t *tool_data);
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_OPENMP_H_*/
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/Openmp/omp-tools.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/Openmp/omp-tools.h
new file mode 100644
index 0000000000000000000000000000000000000000..276967d07e8f8c0f7686e5b3b15151edf2415ae7
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/Openmp/omp-tools.h
@@ -0,0 +1,1083 @@
+/*
+ * include/50/omp-tools.h.var
+ */
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __OMPT__
+#define __OMPT__
+
+/*****************************************************************************
+ * system include files
+ *****************************************************************************/
+
+#include <stdint.h>
+#include <stddef.h>
+
+/*****************************************************************************
+ * iteration macros
+ *****************************************************************************/
+/* Invokes macro(name) once per OMPT inquiry function name; each name has a matching <name>_t function-pointer typedef in this header. */
+#define FOREACH_OMPT_INQUIRY_FN(macro)      \
+    macro (ompt_enumerate_states)           \
+    macro (ompt_enumerate_mutex_impls)      \
+                                            \
+    macro (ompt_set_callback)               \
+    macro (ompt_get_callback)               \
+                                            \
+    macro (ompt_get_state)                  \
+                                            \
+    macro (ompt_get_parallel_info)          \
+    macro (ompt_get_task_info)              \
+    macro (ompt_get_task_memory)            \
+    macro (ompt_get_thread_data)            \
+    macro (ompt_get_unique_id)              \
+    macro (ompt_finalize_tool)              \
+                                            \
+    macro(ompt_get_num_procs)               \
+    macro(ompt_get_num_places)              \
+    macro(ompt_get_place_proc_ids)          \
+    macro(ompt_get_place_num)               \
+    macro(ompt_get_partition_place_nums)    \
+    macro(ompt_get_proc_id)                 \
+                                            \
+    macro(ompt_get_target_info)             \
+    macro(ompt_get_num_devices)
+/* Invokes macro(name, value) once per thread state; names and values are the same as in enum ompt_state_t later in this header. */
+#define FOREACH_OMPT_STATE(macro)                                                                \
+                                                                                                \
+    /* first available state */                                                                 \
+    macro (ompt_state_undefined, 0x102)      /* undefined thread state */                        \
+                                                                                                \
+    /* work states (0..15) */                                                                   \
+    macro (ompt_state_work_serial, 0x000)    /* working outside parallel */                      \
+    macro (ompt_state_work_parallel, 0x001)  /* working within parallel */                       \
+    macro (ompt_state_work_reduction, 0x002) /* performing a reduction */                        \
+                                                                                                \
+    /* barrier wait states (16..31) */                                                          \
+    macro (ompt_state_wait_barrier, 0x010)   /* waiting at a barrier */                          \
+    macro (ompt_state_wait_barrier_implicit_parallel, 0x011)                                     \
+                                            /* implicit barrier at the end of parallel region */\
+    macro (ompt_state_wait_barrier_implicit_workshare, 0x012)                                    \
+                                            /* implicit barrier at the end of worksharing */    \
+    macro (ompt_state_wait_barrier_implicit, 0x013)  /* implicit barrier */                      \
+    macro (ompt_state_wait_barrier_explicit, 0x014)  /* explicit barrier */                      \
+                                                                                                \
+    /* task wait states (32..63) */                                                             \
+    macro (ompt_state_wait_taskwait, 0x020)  /* waiting at a taskwait */                         \
+    macro (ompt_state_wait_taskgroup, 0x021) /* waiting at a taskgroup */                        \
+                                                                                                \
+    /* mutex wait states (64..127) */                                                           \
+    macro (ompt_state_wait_mutex, 0x040)                                                         \
+    macro (ompt_state_wait_lock, 0x041)      /* waiting for lock */                              \
+    macro (ompt_state_wait_critical, 0x042)  /* waiting for critical */                          \
+    macro (ompt_state_wait_atomic, 0x043)    /* waiting for atomic */                            \
+    macro (ompt_state_wait_ordered, 0x044)   /* waiting for ordered */                           \
+                                                                                                \
+    /* target wait states (128..255) */                                                         \
+    macro (ompt_state_wait_target, 0x080)        /* waiting for target region */                 \
+    macro (ompt_state_wait_target_map, 0x081)    /* waiting for target data mapping operation */ \
+    macro (ompt_state_wait_target_update, 0x082) /* waiting for target update operation */       \
+                                                                                                \
+    /* misc (256..511) */                                                                       \
+    macro (ompt_state_idle, 0x100)           /* waiting for work */                              \
+    macro (ompt_state_overhead, 0x101)       /* overhead excluding wait states */                \
+                                                                                                \
+    /* implementation-specific states (512..) */
+
+/* Invokes macro(name, code) once per mutex implementation kind; expanded further down to define enum kmp_mutex_impl_t. */
+#define FOREACH_KMP_MUTEX_IMPL(macro)                                                \
+    macro (kmp_mutex_impl_none, 0)         /* unknown implementation */              \
+    macro (kmp_mutex_impl_spin, 1)         /* based on spin */                       \
+    macro (kmp_mutex_impl_queuing, 2)      /* based on some fair policy */           \
+    macro (kmp_mutex_impl_speculative, 3)  /* based on HW-supported speculation */
+/* Invokes macro(event, callback_type, id) once per event; event names and ids are the same as in enum ompt_callbacks_t later in this header. */
+#define FOREACH_OMPT_EVENT(macro)                                                                                        \
+                                                                                                                         \
+    /*--- Mandatory Events ---*/                                                                                         \
+    macro (ompt_callback_thread_begin,      ompt_callback_thread_begin_t,       1) /* thread begin                    */ \
+    macro (ompt_callback_thread_end,        ompt_callback_thread_end_t,         2) /* thread end                      */ \
+                                                                                                                         \
+    macro (ompt_callback_parallel_begin,    ompt_callback_parallel_begin_t,     3) /* parallel begin                  */ \
+    macro (ompt_callback_parallel_end,      ompt_callback_parallel_end_t,       4) /* parallel end                    */ \
+                                                                                                                         \
+    macro (ompt_callback_task_create,       ompt_callback_task_create_t,        5) /* task begin                      */ \
+    macro (ompt_callback_task_schedule,     ompt_callback_task_schedule_t,      6) /* task schedule                   */ \
+    macro (ompt_callback_implicit_task,     ompt_callback_implicit_task_t,      7) /* implicit task                   */ \
+                                                                                                                         \
+    macro (ompt_callback_target,            ompt_callback_target_t,             8) /* target                          */ \
+    macro (ompt_callback_target_data_op,    ompt_callback_target_data_op_t,     9) /* target data op                  */ \
+    macro (ompt_callback_target_submit,     ompt_callback_target_submit_t,     10) /* target  submit                  */ \
+                                                                                                                         \
+    macro (ompt_callback_control_tool,      ompt_callback_control_tool_t,      11) /* control tool                    */ \
+                                                                                                                         \
+    macro (ompt_callback_device_initialize, ompt_callback_device_initialize_t, 12) /* device initialize               */ \
+    macro (ompt_callback_device_finalize,   ompt_callback_device_finalize_t,   13) /* device finalize                 */ \
+                                                                                                                         \
+    macro (ompt_callback_device_load,       ompt_callback_device_load_t,       14) /* device load                     */ \
+    macro (ompt_callback_device_unload,     ompt_callback_device_unload_t,     15) /* device unload                   */ \
+                                                                                                                         \
+    /* Optional Events */                                                                                                \
+    macro (ompt_callback_sync_region_wait,  ompt_callback_sync_region_t,       16) /* sync region wait begin or end   */ \
+                                                                                                                         \
+    macro (ompt_callback_mutex_released,    ompt_callback_mutex_t,             17) /* mutex released                  */ \
+                                                                                                                         \
+    macro (ompt_callback_dependences,       ompt_callback_dependences_t,       18) /* report task dependences         */ \
+    macro (ompt_callback_task_dependence,   ompt_callback_task_dependence_t,   19) /* report task dependence          */ \
+                                                                                                                         \
+    macro (ompt_callback_work,              ompt_callback_work_t,              20) /* task at work begin or end       */ \
+                                                                                                                         \
+    macro (ompt_callback_master,            ompt_callback_master_t,            21) /* task at master begin or end     */ \
+                                                                                                                         \
+    macro (ompt_callback_target_map,        ompt_callback_target_map_t,        22) /* target map                      */ \
+                                                                                                                         \
+    macro (ompt_callback_sync_region,       ompt_callback_sync_region_t,       23) /* sync region begin or end        */ \
+                                                                                                                         \
+    macro (ompt_callback_lock_init,         ompt_callback_mutex_acquire_t,     24) /* lock init                       */ \
+    macro (ompt_callback_lock_destroy,      ompt_callback_mutex_t,             25) /* lock destroy                    */ \
+                                                                                                                         \
+    macro (ompt_callback_mutex_acquire,     ompt_callback_mutex_acquire_t,     26) /* mutex acquire                   */ \
+    macro (ompt_callback_mutex_acquired,    ompt_callback_mutex_t,             27) /* mutex acquired                  */ \
+                                                                                                                         \
+    macro (ompt_callback_nest_lock,         ompt_callback_nest_lock_t,         28) /* nest lock                       */ \
+                                                                                                                         \
+    macro (ompt_callback_flush,             ompt_callback_flush_t,             29) /* after executing flush           */ \
+                                                                                                                         \
+    macro (ompt_callback_cancel,            ompt_callback_cancel_t,            30) /* cancel innermost binding region */ \
+                                                                                                                         \
+    macro (ompt_callback_reduction,         ompt_callback_sync_region_t,       31) /* reduction                       */ \
+                                                                                                                         \
+    macro (ompt_callback_dispatch,          ompt_callback_dispatch_t,          32) /* dispatch of work                */
+
+/*****************************************************************************
+ * implementation specific types
+ *****************************************************************************/
+
+typedef enum kmp_mutex_impl_t { /* enumerators are generated from FOREACH_KMP_MUTEX_IMPL */
+#define kmp_mutex_impl_macro(impl, code) impl = code, /* emits one "name = code," entry per list item */
+    FOREACH_KMP_MUTEX_IMPL(kmp_mutex_impl_macro)
+#undef kmp_mutex_impl_macro
+} kmp_mutex_impl_t;
+
+/*****************************************************************************
+ * definitions generated from spec
+ *****************************************************************************/
+
+typedef enum ompt_callbacks_t { /* event ids taken by ompt_set_callback_t / ompt_get_callback_t; same names and values as FOREACH_OMPT_EVENT */
+  ompt_callback_thread_begin             = 1,
+  ompt_callback_thread_end               = 2,
+  ompt_callback_parallel_begin           = 3,
+  ompt_callback_parallel_end             = 4,
+  ompt_callback_task_create              = 5,
+  ompt_callback_task_schedule            = 6,
+  ompt_callback_implicit_task            = 7,
+  ompt_callback_target                   = 8,
+  ompt_callback_target_data_op           = 9,
+  ompt_callback_target_submit            = 10,
+  ompt_callback_control_tool             = 11,
+  ompt_callback_device_initialize        = 12,
+  ompt_callback_device_finalize          = 13,
+  ompt_callback_device_load              = 14,
+  ompt_callback_device_unload            = 15,
+  ompt_callback_sync_region_wait         = 16,
+  ompt_callback_mutex_released           = 17,
+  ompt_callback_dependences              = 18,
+  ompt_callback_task_dependence          = 19,
+  ompt_callback_work                     = 20,
+  ompt_callback_master                   = 21,
+  ompt_callback_target_map               = 22,
+  ompt_callback_sync_region              = 23,
+  ompt_callback_lock_init                = 24,
+  ompt_callback_lock_destroy             = 25,
+  ompt_callback_mutex_acquire            = 26,
+  ompt_callback_mutex_acquired           = 27,
+  ompt_callback_nest_lock                = 28,
+  ompt_callback_flush                    = 29,
+  ompt_callback_cancel                   = 30,
+  ompt_callback_reduction                = 31,
+  ompt_callback_dispatch                 = 32
+} ompt_callbacks_t;
+
+typedef enum ompt_record_t { /* record kind; return type of ompt_get_record_type_t */
+  ompt_record_ompt               = 1,
+  ompt_record_native             = 2,
+  ompt_record_invalid            = 3
+} ompt_record_t;
+
+typedef enum ompt_record_native_t { /* type of the rclass field of ompt_record_abstract_t */
+  ompt_record_native_info  = 1,
+  ompt_record_native_event = 2
+} ompt_record_native_t;
+
+typedef enum ompt_set_result_t { /* return type of ompt_set_callback_t, ompt_set_trace_ompt_t and ompt_set_trace_native_t */
+  ompt_set_error            = 0,
+  ompt_set_never            = 1,
+  ompt_set_impossible       = 2,
+  ompt_set_sometimes        = 3,
+  ompt_set_sometimes_paired = 4,
+  ompt_set_always           = 5
+} ompt_set_result_t;
+
+typedef uint64_t ompt_id_t; /* 64-bit id; type of the *_id fields in the ompt_record_* structs */
+
+typedef uint64_t ompt_device_time_t; /* 64-bit device time; returned by ompt_get_device_time_t, accepted by ompt_translate_time_t */
+
+typedef uint64_t ompt_buffer_cursor_t; /* 64-bit cursor passed alongside an ompt_buffer_t to the buffer/record accessors */
+
+typedef enum ompt_thread_t { /* type of the thread_type argument of ompt_callback_thread_begin_t */
+  ompt_thread_initial                 = 1,
+  ompt_thread_worker                  = 2,
+  ompt_thread_other                   = 3,
+  ompt_thread_unknown                 = 4
+} ompt_thread_t;
+
+typedef enum ompt_scope_endpoint_t { /* type of the endpoint argument of the work, implicit_task, master, sync_region and nest_lock callbacks */
+  ompt_scope_begin                    = 1,
+  ompt_scope_end                      = 2
+} ompt_scope_endpoint_t;
+
+typedef enum ompt_dispatch_t { /* type of the kind argument of ompt_callback_dispatch_t */
+  ompt_dispatch_iteration             = 1,
+  ompt_dispatch_section               = 2
+} ompt_dispatch_t;
+
+typedef enum ompt_sync_region_t { /* type of the kind argument of ompt_callback_sync_region_t */
+  ompt_sync_region_barrier                = 1,
+  ompt_sync_region_barrier_implicit       = 2,
+  ompt_sync_region_barrier_explicit       = 3,
+  ompt_sync_region_barrier_implementation = 4,
+  ompt_sync_region_taskwait               = 5,
+  ompt_sync_region_taskgroup              = 6,
+  ompt_sync_region_reduction              = 7
+} ompt_sync_region_t;
+
+typedef enum ompt_target_data_op_t { /* NOTE(review): presumably the operation kind reported by the target_data_op callback, declared outside this excerpt - confirm */
+  ompt_target_data_alloc                = 1,
+  ompt_target_data_transfer_to_device   = 2,
+  ompt_target_data_transfer_from_device = 3,
+  ompt_target_data_delete               = 4,
+  ompt_target_data_associate            = 5,
+  ompt_target_data_disassociate         = 6
+} ompt_target_data_op_t;
+
+typedef enum ompt_work_t { /* type of the wstype argument of ompt_callback_work_t */
+  ompt_work_loop               = 1,
+  ompt_work_sections           = 2,
+  ompt_work_single_executor    = 3,
+  ompt_work_single_other       = 4,
+  ompt_work_workshare          = 5,
+  ompt_work_distribute         = 6,
+  ompt_work_taskloop           = 7
+} ompt_work_t;
+
+typedef enum ompt_mutex_t { /* type of the kind argument of ompt_callback_mutex_acquire_t and ompt_callback_mutex_t */
+  ompt_mutex_lock                     = 1,
+  ompt_mutex_test_lock                = 2,
+  ompt_mutex_nest_lock                = 3,
+  ompt_mutex_test_nest_lock           = 4,
+  ompt_mutex_critical                 = 5,
+  ompt_mutex_atomic                   = 6,
+  ompt_mutex_ordered                  = 7
+} ompt_mutex_t;
+
+typedef enum ompt_native_mon_flag_t { /* each value is a distinct single bit (0x01..0x80); NOTE(review): presumably the flags argument of ompt_set_trace_native_t - confirm */
+  ompt_native_data_motion_explicit    = 0x01,
+  ompt_native_data_motion_implicit    = 0x02,
+  ompt_native_kernel_invocation       = 0x04,
+  ompt_native_kernel_execution        = 0x08,
+  ompt_native_driver                  = 0x10,
+  ompt_native_runtime                 = 0x20,
+  ompt_native_overhead                = 0x40,
+  ompt_native_idleness                = 0x80
+} ompt_native_mon_flag_t;
+
+typedef enum ompt_task_flag_t { /* each value is a distinct single bit; NOTE(review): presumably carried in the int flags of the task callbacks - confirm */
+  ompt_task_initial                   = 0x00000001,
+  ompt_task_implicit                  = 0x00000002,
+  ompt_task_explicit                  = 0x00000004,
+  ompt_task_target                    = 0x00000008,
+  ompt_task_undeferred                = 0x08000000,
+  ompt_task_untied                    = 0x10000000,
+  ompt_task_final                     = 0x20000000,
+  ompt_task_mergeable                 = 0x40000000,
+  ompt_task_merged                    = 0x80000000 /* NOTE(review): does not fit in a 32-bit signed int; relies on the compiler accepting such an enumerator */
+} ompt_task_flag_t;
+
+typedef enum ompt_task_status_t { /* type of the prior_task_status argument of ompt_callback_task_schedule_t */
+  ompt_task_complete      = 1,
+  ompt_task_yield         = 2,
+  ompt_task_cancel        = 3,
+  ompt_task_detach        = 4,
+  ompt_task_early_fulfill = 5,
+  ompt_task_late_fulfill  = 6,
+  ompt_task_switch        = 7
+} ompt_task_status_t;
+
+typedef enum ompt_target_t { /* NOTE(review): presumably the kind reported by ompt_callback_target_t, declared outside this excerpt - confirm */
+  ompt_target                         = 1,
+  ompt_target_enter_data              = 2,
+  ompt_target_exit_data               = 3,
+  ompt_target_update                  = 4
+} ompt_target_t;
+
+typedef enum ompt_parallel_flag_t { /* each value is a distinct single bit; NOTE(review): presumably carried in the int flags of the parallel_begin/parallel_end callbacks - confirm */
+  ompt_parallel_invoker_program = 0x00000001,
+  ompt_parallel_invoker_runtime = 0x00000002,
+  ompt_parallel_league          = 0x40000000,
+  ompt_parallel_team            = 0x80000000 /* NOTE(review): does not fit in a 32-bit signed int; relies on the compiler accepting such an enumerator */
+} ompt_parallel_flag_t;
+
+typedef enum ompt_target_map_flag_t { /* each value is a distinct single bit (0x01..0x20); NOTE(review): presumably used by the target_map callback, declared outside this excerpt - confirm */
+  ompt_target_map_flag_to             = 0x01,
+  ompt_target_map_flag_from           = 0x02,
+  ompt_target_map_flag_alloc          = 0x04,
+  ompt_target_map_flag_release        = 0x08,
+  ompt_target_map_flag_delete         = 0x10,
+  ompt_target_map_flag_implicit       = 0x20
+} ompt_target_map_flag_t;
+
+typedef enum ompt_dependence_type_t { /* type of the dependence_type field of ompt_dependence_t */
+  ompt_dependence_type_in              = 1,
+  ompt_dependence_type_out             = 2,
+  ompt_dependence_type_inout           = 3,
+  ompt_dependence_type_mutexinoutset   = 4,
+  ompt_dependence_type_source          = 5,
+  ompt_dependence_type_sink            = 6
+} ompt_dependence_type_t;
+
+typedef enum ompt_cancel_flag_t { /* each value is a distinct single bit (0x01..0x40); NOTE(review): presumably carried in the int flags of ompt_callback_cancel_t - confirm */
+  ompt_cancel_parallel       = 0x01,
+  ompt_cancel_sections       = 0x02,
+  ompt_cancel_loop           = 0x04,
+  ompt_cancel_taskgroup      = 0x08,
+  ompt_cancel_activated      = 0x10,
+  ompt_cancel_detected       = 0x20,
+  ompt_cancel_discarded_task = 0x40
+} ompt_cancel_flag_t;
+
+typedef uint64_t ompt_hwid_t; /* 64-bit value; type of the hwid field of ompt_record_abstract_t */
+
+typedef uint64_t ompt_wait_id_t; /* 64-bit value; wait_id type used by ompt_get_state_t and the mutex_acquire, mutex and nest_lock callbacks */
+
+typedef enum ompt_frame_flag_t { /* NOTE(review): presumably the values stored in ompt_frame_t exit_frame_flags / enter_frame_flags - confirm */
+  ompt_frame_runtime        = 0x00,
+  ompt_frame_application    = 0x01,
+  ompt_frame_cfa            = 0x10,
+  ompt_frame_framepointer   = 0x20,
+  ompt_frame_stackaddress   = 0x30 /* equals 0x10 | 0x20, so the three 0x?0 values are not independent bits */
+} ompt_frame_flag_t; 
+
+typedef enum ompt_state_t { /* thread states; same names and values as the FOREACH_OMPT_STATE list */
+  ompt_state_work_serial                      = 0x000,
+  ompt_state_work_parallel                    = 0x001,
+  ompt_state_work_reduction                   = 0x002,
+
+  ompt_state_wait_barrier                     = 0x010,
+  ompt_state_wait_barrier_implicit_parallel   = 0x011,
+  ompt_state_wait_barrier_implicit_workshare  = 0x012,
+  ompt_state_wait_barrier_implicit            = 0x013,
+  ompt_state_wait_barrier_explicit            = 0x014,
+
+  ompt_state_wait_taskwait                    = 0x020,
+  ompt_state_wait_taskgroup                   = 0x021,
+
+  ompt_state_wait_mutex                       = 0x040,
+  ompt_state_wait_lock                        = 0x041,
+  ompt_state_wait_critical                    = 0x042,
+  ompt_state_wait_atomic                      = 0x043,
+  ompt_state_wait_ordered                     = 0x044,
+
+  ompt_state_wait_target                      = 0x080,
+  ompt_state_wait_target_map                  = 0x081,
+  ompt_state_wait_target_update               = 0x082,
+
+  ompt_state_idle                             = 0x100,
+  ompt_state_overhead                         = 0x101,
+  ompt_state_undefined                        = 0x102
+} ompt_state_t;
+
+typedef uint64_t (*ompt_get_unique_id_t) (void); /* signature of ompt_get_unique_id from FOREACH_OMPT_INQUIRY_FN: no arguments, returns uint64_t */
+
+typedef uint64_t ompd_size_t; /* the ompd_* scalar types in this group are all unsigned 64-bit except ompd_word_t */
+
+typedef uint64_t ompd_wait_id_t;
+
+typedef uint64_t ompd_addr_t;
+typedef int64_t  ompd_word_t; /* the only signed type of the group */
+typedef uint64_t ompd_seg_t;
+
+typedef uint64_t ompd_device_t;
+
+typedef uint64_t ompd_thread_id_t;
+
+typedef enum ompd_scope_t { /* OMPD scope kinds, numbered 1..6; no user of this type is visible in this excerpt */
+  ompd_scope_global = 1,
+  ompd_scope_address_space = 2,
+  ompd_scope_thread = 3,
+  ompd_scope_parallel = 4,
+  ompd_scope_implicit_task = 5,
+  ompd_scope_task = 6
+} ompd_scope_t;
+
+typedef uint64_t ompd_icv_id_t; /* unsigned 64-bit id; NOTE(review): presumably identifies an ICV - confirm */
+
+typedef enum ompd_rc_t { /* OMPD return codes: ompd_rc_ok is 0, every other code is non-zero */
+  ompd_rc_ok = 0,
+  ompd_rc_unavailable = 1,
+  ompd_rc_stale_handle = 2,
+  ompd_rc_bad_input = 3,
+  ompd_rc_error = 4,
+  ompd_rc_unsupported = 5,
+  ompd_rc_needs_state_tracking = 6,
+  ompd_rc_incompatible = 7,
+  ompd_rc_device_read_error = 8,
+  ompd_rc_device_write_error = 9,
+  ompd_rc_nomem = 10, /* NOTE(review): only enum in this header with a trailing comma; strict C89 / C++03 compilers reject it */
+} ompd_rc_t;
+
+typedef void (*ompt_interface_fn_t) (void); /* generic function-pointer type; return type of ompt_function_lookup_t */
+
+typedef ompt_interface_fn_t (*ompt_function_lookup_t) ( /* takes a function name string and returns a generic function pointer */
+  const char *interface_function_name
+);
+
+typedef union ompt_data_t { /* one slot that can be read either as a 64-bit integer or as a pointer */
+  uint64_t value;
+  void *ptr;
+} ompt_data_t;
+
+typedef struct ompt_frame_t { /* two ompt_data_t frame slots, each with its own int of flags */
+  ompt_data_t exit_frame;
+  ompt_data_t enter_frame;
+  int exit_frame_flags; /* NOTE(review): presumably ompt_frame_flag_t values - confirm */
+  int enter_frame_flags;
+} ompt_frame_t;
+
+typedef void (*ompt_callback_t) (void); /* generic callback pointer type taken by ompt_set_callback_t and written through by ompt_get_callback_t */
+
+typedef void ompt_device_t; /* alias of void: only ever used through a pointer in the declarations visible here */
+
+typedef void ompt_buffer_t; /* alias of void: only ever used through a pointer in the declarations visible here */
+
+typedef void (*ompt_callback_buffer_request_t) ( /* type of the request argument of ompt_start_trace_t */
+  int device_num,
+  ompt_buffer_t **buffer, /* NOTE(review): buffer and bytes are non-const pointers, presumably outputs filled by the callback - confirm */
+  size_t *bytes
+);
+
+typedef void (*ompt_callback_buffer_complete_t) ( /* type of the complete argument of ompt_start_trace_t */
+  int device_num,
+  ompt_buffer_t *buffer,
+  size_t bytes,
+  ompt_buffer_cursor_t begin,
+  int buffer_owned
+);
+
+typedef void (*ompt_finalize_t) ( /* type of the finalize member of ompt_start_tool_result_t */
+  ompt_data_t *tool_data
+);
+
+typedef int (*ompt_initialize_t) ( /* type of the initialize member of ompt_start_tool_result_t; returns int */
+  ompt_function_lookup_t lookup,
+  int initial_device_num,
+  ompt_data_t *tool_data
+);
+
+typedef struct ompt_start_tool_result_t { /* bundles the initialize and finalize function pointers with a tool_data slot */
+  ompt_initialize_t initialize;
+  ompt_finalize_t finalize;
+  ompt_data_t tool_data; /* NOTE(review): presumably what the tool_data pointer given to initialize/finalize refers to - confirm */
+} ompt_start_tool_result_t;
+
+typedef struct ompt_record_abstract_t { /* returned by pointer from ompt_get_record_abstract_t */
+  ompt_record_native_t rclass;
+  const char *type;
+  ompt_device_time_t start_time;
+  ompt_device_time_t end_time;
+  ompt_hwid_t hwid;
+} ompt_record_abstract_t;
+
+typedef struct ompt_dependence_t { /* pointee type of the deps argument of ompt_callback_dependences_t */
+  ompt_data_t variable;
+  ompt_dependence_type_t dependence_type;
+} ompt_dependence_t;
+
+typedef int (*ompt_enumerate_states_t) ( /* signature of ompt_enumerate_states from FOREACH_OMPT_INQUIRY_FN */
+  int current_state,
+  int *next_state, /* NOTE(review): next_state and next_state_name are non-const pointers, presumably outputs - confirm */
+  const char **next_state_name
+);
+
+typedef int (*ompt_enumerate_mutex_impls_t) ( /* signature of ompt_enumerate_mutex_impls from FOREACH_OMPT_INQUIRY_FN; same shape as ompt_enumerate_states_t */
+  int current_impl,
+  int *next_impl,
+  const char **next_impl_name
+);
+
+typedef ompt_set_result_t (*ompt_set_callback_t) ( /* signature of ompt_set_callback from FOREACH_OMPT_INQUIRY_FN */
+  ompt_callbacks_t event,
+  ompt_callback_t callback /* generic pointer type, not the event-specific ompt_callback_*_t type */
+);
+
+typedef int (*ompt_get_callback_t) ( /* signature of ompt_get_callback from FOREACH_OMPT_INQUIRY_FN; returns int, callback goes through a pointer */
+  ompt_callbacks_t event,
+  ompt_callback_t *callback
+);
+
+typedef ompt_data_t *(*ompt_get_thread_data_t) (void); /* signature of ompt_get_thread_data: no arguments, returns an ompt_data_t pointer */
+
+typedef int (*ompt_get_num_procs_t) (void); /* signature of ompt_get_num_procs: no arguments, returns int */
+
+typedef int (*ompt_get_num_places_t) (void); /* signature of ompt_get_num_places: no arguments, returns int */
+
+typedef int (*ompt_get_place_proc_ids_t) ( /* signature of ompt_get_place_proc_ids from FOREACH_OMPT_INQUIRY_FN */
+  int place_num,
+  int ids_size, /* NOTE(review): presumably the capacity of the ids array - confirm */
+  int *ids
+);
+
+typedef int (*ompt_get_place_num_t) (void); /* signature of ompt_get_place_num: no arguments, returns int */
+
+typedef int (*ompt_get_partition_place_nums_t) ( /* signature of ompt_get_partition_place_nums from FOREACH_OMPT_INQUIRY_FN */
+  int place_nums_size, /* NOTE(review): presumably the capacity of the place_nums array - confirm */
+  int *place_nums
+);
+
+typedef int (*ompt_get_proc_id_t) (void); /* signature of ompt_get_proc_id: no arguments, returns int */
+
+typedef int (*ompt_get_state_t) ( /* signature of ompt_get_state; NOTE(review): the int result is presumably an ompt_state_t value - confirm */
+  ompt_wait_id_t *wait_id
+);
+
+typedef int (*ompt_get_parallel_info_t) ( /* signature of ompt_get_parallel_info from FOREACH_OMPT_INQUIRY_FN */
+  int ancestor_level,
+  ompt_data_t **parallel_data, /* pointer-to-pointer: the callee can hand back an ompt_data_t pointer */
+  int *team_size
+);
+
+typedef int (*ompt_get_task_info_t) ( /* signature of ompt_get_task_info; every argument after ancestor_level is a non-const pointer */
+  int ancestor_level,
+  int *flags,
+  ompt_data_t **task_data,
+  ompt_frame_t **task_frame,
+  ompt_data_t **parallel_data,
+  int *thread_num
+);
+
+typedef int (*ompt_get_task_memory_t)( /* signature of ompt_get_task_memory from FOREACH_OMPT_INQUIRY_FN */
+  void **addr, /* addr and size are non-const pointers; block is passed by value */
+  size_t *size,
+  int block
+);
+
+typedef int (*ompt_get_target_info_t) ( /* signature of ompt_get_target_info; all three arguments are non-const pointers */
+  uint64_t *device_num,
+  ompt_id_t *target_id,
+  ompt_id_t *host_op_id
+);
+
+typedef int (*ompt_get_num_devices_t) (void); /* signature of ompt_get_num_devices: no arguments, returns int */
+
+typedef void (*ompt_finalize_tool_t) (void); /* signature of ompt_finalize_tool: no arguments, no result */
+
+typedef int (*ompt_get_device_num_procs_t) ( /* per-device function type: takes a device pointer, returns int */
+  ompt_device_t *device
+);
+
+typedef ompt_device_time_t (*ompt_get_device_time_t) ( /* per-device function type: takes a device pointer, returns an ompt_device_time_t */
+  ompt_device_t *device
+);
+
+typedef double (*ompt_translate_time_t) ( /* per-device function type: takes a device pointer and an ompt_device_time_t, returns double */
+  ompt_device_t *device,
+  ompt_device_time_t time
+);
+
+typedef ompt_set_result_t (*ompt_set_trace_ompt_t) ( /* per-device function type returning an ompt_set_result_t */
+  ompt_device_t *device,
+  unsigned int enable, /* unsigned here, whereas ompt_set_trace_native_t declares enable as plain int */
+  unsigned int etype
+);
+
+typedef ompt_set_result_t (*ompt_set_trace_native_t) ( /* per-device function type returning an ompt_set_result_t */
+  ompt_device_t *device,
+  int enable,
+  int flags /* NOTE(review): presumably a combination of ompt_native_mon_flag_t bits - confirm */
+);
+
+typedef int (*ompt_start_trace_t) ( /* per-device function type: takes a device pointer plus the two buffer callbacks, returns int */
+  ompt_device_t *device,
+  ompt_callback_buffer_request_t request,
+  ompt_callback_buffer_complete_t complete
+);
+
+typedef int (*ompt_pause_trace_t) ( /* per-device function type: takes a device pointer and an int begin_pause, returns int */
+  ompt_device_t *device,
+  int begin_pause
+);
+
+typedef int (*ompt_flush_trace_t) ( /* per-device function type: takes a device pointer, returns int */
+  ompt_device_t *device
+);
+
+typedef int (*ompt_stop_trace_t) ( /* per-device function type: same signature as ompt_flush_trace_t */
+  ompt_device_t *device
+);
+
+typedef int (*ompt_advance_buffer_cursor_t) ( /* takes a device, a buffer with its size and a current cursor; returns int */
+  ompt_device_t *device,
+  ompt_buffer_t *buffer,
+  size_t size,
+  ompt_buffer_cursor_t current,
+  ompt_buffer_cursor_t *next /* the only non-const pointer to a cursor: NOTE(review): presumably receives the following cursor - confirm */
+);
+
+typedef ompt_record_t (*ompt_get_record_type_t) ( /* takes a buffer and a cursor, returns an ompt_record_t */
+  ompt_buffer_t *buffer,
+  ompt_buffer_cursor_t current
+);
+
+typedef void *(*ompt_get_record_native_t) ( /* takes a buffer and a cursor, returns an untyped pointer; host_op_id goes through a pointer */
+  ompt_buffer_t *buffer,
+  ompt_buffer_cursor_t current,
+  ompt_id_t *host_op_id
+);
+
+typedef ompt_record_abstract_t * /* takes an untyped native_record pointer, returns an ompt_record_abstract_t pointer */
+(*ompt_get_record_abstract_t) (
+  void *native_record
+);
+
+typedef void (*ompt_callback_thread_begin_t) ( /* callback type for ompt_callback_thread_begin (id 1 in FOREACH_OMPT_EVENT) */
+  ompt_thread_t thread_type,
+  ompt_data_t *thread_data
+);
+
+typedef struct ompt_record_thread_begin_t { /* record form: keeps thread_type only, thread_data has no counterpart */
+  ompt_thread_t thread_type;
+} ompt_record_thread_begin_t;
+
+typedef void (*ompt_callback_thread_end_t) ( /* callback type for ompt_callback_thread_end (id 2 in FOREACH_OMPT_EVENT) */
+  ompt_data_t *thread_data
+);
+
+typedef void (*ompt_callback_parallel_begin_t) ( /* callback type for ompt_callback_parallel_begin (id 3 in FOREACH_OMPT_EVENT) */
+  ompt_data_t *encountering_task_data,
+  const ompt_frame_t *encountering_task_frame,
+  ompt_data_t *parallel_data,
+  unsigned int requested_parallelism,
+  int flags,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_parallel_begin_t { /* record form: ompt_id_t ids replace the data pointers; the frame argument has no counterpart */
+  ompt_id_t encountering_task_id;
+  ompt_id_t parallel_id;
+  unsigned int requested_parallelism;
+  int flags;
+  const void *codeptr_ra;
+} ompt_record_parallel_begin_t;
+
+typedef void (*ompt_callback_parallel_end_t) ( /* callback type for ompt_callback_parallel_end (id 4 in FOREACH_OMPT_EVENT) */
+  ompt_data_t *parallel_data,
+  ompt_data_t *encountering_task_data,
+  int flags,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_parallel_end_t { /* record form: ompt_id_t ids replace the two data pointers */
+  ompt_id_t parallel_id;
+  ompt_id_t encountering_task_id;
+  int flags;
+  const void *codeptr_ra;
+} ompt_record_parallel_end_t;
+
+typedef void (*ompt_callback_work_t) ( /* callback type for ompt_callback_work (id 20 in FOREACH_OMPT_EVENT) */
+  ompt_work_t wstype,
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  uint64_t count,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_work_t { /* record form: ompt_id_t ids replace parallel_data and task_data */
+  ompt_work_t wstype;
+  ompt_scope_endpoint_t endpoint;
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  uint64_t count;
+  const void *codeptr_ra;
+} ompt_record_work_t;
+
+typedef void (*ompt_callback_dispatch_t) ( /* callback type for ompt_callback_dispatch (id 32 in FOREACH_OMPT_EVENT) */
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  ompt_dispatch_t kind,
+  ompt_data_t instance /* passed by value, unlike the two data pointers above */
+);
+
+typedef struct ompt_record_dispatch_t { /* record form: ompt_id_t ids replace parallel_data and task_data */
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  ompt_dispatch_t kind;
+  ompt_data_t instance; /* stored by value */
+} ompt_record_dispatch_t;
+
+typedef void (*ompt_callback_task_create_t) ( /* callback type for ompt_callback_task_create (id 5 in FOREACH_OMPT_EVENT) */
+  ompt_data_t *encountering_task_data,
+  const ompt_frame_t *encountering_task_frame,
+  ompt_data_t *new_task_data,
+  int flags,
+  int has_dependences,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_task_create_t { /* record form: ompt_id_t ids replace the data pointers; the frame argument has no counterpart */
+  ompt_id_t encountering_task_id;
+  ompt_id_t new_task_id;
+  int flags;
+  int has_dependences;
+  const void *codeptr_ra;
+} ompt_record_task_create_t;
+
+typedef void (*ompt_callback_dependences_t) ( /* callback type for ompt_callback_dependences (id 18 in FOREACH_OMPT_EVENT) */
+  ompt_data_t *task_data,
+  const ompt_dependence_t *deps, /* NOTE(review): presumably an array of ndeps entries - confirm */
+  int ndeps
+);
+
+typedef struct ompt_record_dependences_t { /* record form: task_id replaces task_data */
+  ompt_id_t task_id;
+  ompt_dependence_t dep; /* a single ompt_dependence_t by value, where the callback takes a pointer */
+  int ndeps;
+} ompt_record_dependences_t;
+
+typedef void (*ompt_callback_task_dependence_t) ( /* callback type for ompt_callback_task_dependence (id 19 in FOREACH_OMPT_EVENT) */
+  ompt_data_t *src_task_data,
+  ompt_data_t *sink_task_data
+);
+
+typedef struct ompt_record_task_dependence_t { /* record form: ompt_id_t ids replace the two data pointers */
+  ompt_id_t src_task_id;
+  ompt_id_t sink_task_id;
+} ompt_record_task_dependence_t;
+
+typedef void (*ompt_callback_task_schedule_t) ( /* callback type for ompt_callback_task_schedule (id 6 in FOREACH_OMPT_EVENT) */
+  ompt_data_t *prior_task_data,
+  ompt_task_status_t prior_task_status,
+  ompt_data_t *next_task_data
+);
+
+typedef struct ompt_record_task_schedule_t { /* record form: ompt_id_t ids replace the two data pointers */
+  ompt_id_t prior_task_id;
+  ompt_task_status_t prior_task_status;
+  ompt_id_t next_task_id;
+} ompt_record_task_schedule_t;
+
+typedef void (*ompt_callback_implicit_task_t) ( /* callback type for ompt_callback_implicit_task (id 7 in FOREACH_OMPT_EVENT) */
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  unsigned int actual_parallelism,
+  unsigned int index,
+  int flags
+);
+
+typedef struct ompt_record_implicit_task_t { /* record form: ompt_id_t ids replace parallel_data and task_data */
+  ompt_scope_endpoint_t endpoint;
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  unsigned int actual_parallelism;
+  unsigned int index;
+  int flags;
+} ompt_record_implicit_task_t;
+
+typedef void (*ompt_callback_master_t) ( /* callback type for ompt_callback_master (id 21 in FOREACH_OMPT_EVENT) */
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_master_t { /* record form: ompt_id_t ids replace parallel_data and task_data */
+  ompt_scope_endpoint_t endpoint;
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  const void *codeptr_ra;
+} ompt_record_master_t;
+
+typedef void (*ompt_callback_sync_region_t) ( /* shared by three events in FOREACH_OMPT_EVENT: sync_region_wait (16), sync_region (23) and reduction (31) */
+  ompt_sync_region_t kind,
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_sync_region_t { /* record form: ompt_id_t ids replace parallel_data and task_data */
+  ompt_sync_region_t kind;
+  ompt_scope_endpoint_t endpoint;
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  const void *codeptr_ra;
+} ompt_record_sync_region_t;
+
+typedef void (*ompt_callback_mutex_acquire_t) (  /* function-pointer type; returns nothing */
+  ompt_mutex_t kind,
+  unsigned int hint,
+  unsigned int impl,
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_mutex_acquire_t {  /* field list is identical (names, types, order) to the callback arguments */
+  ompt_mutex_t kind;
+  unsigned int hint;
+  unsigned int impl;
+  ompt_wait_id_t wait_id;
+  const void *codeptr_ra;
+} ompt_record_mutex_acquire_t;
+
+typedef void (*ompt_callback_mutex_t) (  /* like ompt_callback_mutex_acquire_t but without the hint and impl arguments */
+  ompt_mutex_t kind,
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_mutex_t {  /* field list is identical (names, types, order) to the callback arguments */
+  ompt_mutex_t kind;
+  ompt_wait_id_t wait_id;
+  const void *codeptr_ra;
+} ompt_record_mutex_t;
+
+typedef void (*ompt_callback_nest_lock_t) (  /* takes a scope endpoint where ompt_callback_mutex_t takes a mutex kind */
+  ompt_scope_endpoint_t endpoint,
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_nest_lock_t {  /* field list is identical (names, types, order) to the callback arguments */
+  ompt_scope_endpoint_t endpoint;
+  ompt_wait_id_t wait_id;
+  const void *codeptr_ra;
+} ompt_record_nest_lock_t;
+
+typedef void (*ompt_callback_flush_t) (  /* function-pointer type; returns nothing */
+  ompt_data_t *thread_data,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_flush_t {  /* NOTE: has no counterpart for the callback's thread_data argument */
+  const void *codeptr_ra;
+} ompt_record_flush_t;
+
+typedef void (*ompt_callback_cancel_t) (  /* function-pointer type; returns nothing */
+  ompt_data_t *task_data,
+  int flags,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_cancel_t {  /* one field per callback argument, in the same order */
+  ompt_id_t task_id;  /* ompt_id_t in place of the callback's ompt_data_t *task_data */
+  int flags;
+  const void *codeptr_ra;
+} ompt_record_cancel_t;
+
+typedef void (*ompt_callback_device_initialize_t) (  /* first of four device callback types; none has a member in ompt_record_ompt_t's union */
+  int device_num,
+  const char *type,
+  ompt_device_t *device,
+  ompt_function_lookup_t lookup,
+  const char *documentation
+);
+
+typedef void (*ompt_callback_device_finalize_t) (  /* takes only the device number */
+  int device_num
+);
+
+typedef void (*ompt_callback_device_load_t) (  /* function-pointer type; returns nothing */
+  int device_num,
+  const char *filename,
+  int64_t offset_in_file,  /* signed 64-bit, unlike the unsigned module_id below */
+  void *vma_in_file,
+  size_t bytes,
+  void *host_addr,
+  void *device_addr,
+  uint64_t module_id
+);
+
+typedef void (*ompt_callback_device_unload_t) (  /* function-pointer type; returns nothing */
+  int device_num,
+  uint64_t module_id  /* same type as the module_id argument of ompt_callback_device_load_t */
+);
+
+typedef void (*ompt_callback_target_data_op_t) (  /* function-pointer type; returns nothing */
+  ompt_id_t target_id,
+  ompt_id_t host_op_id,
+  ompt_target_data_op_t optype,
+  void *src_addr,
+  int src_device_num,
+  void *dest_addr,
+  int dest_device_num,
+  size_t bytes,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_target_data_op_t {  /* NOTE: no target_id field here; ompt_record_ompt_t has one at its top level */
+  ompt_id_t host_op_id;
+  ompt_target_data_op_t optype;
+  void *src_addr;
+  int src_device_num;
+  void *dest_addr;
+  int dest_device_num;
+  size_t bytes;
+  ompt_device_time_t end_time;  /* record-only field; the callback has no matching argument */
+  const void *codeptr_ra;
+} ompt_record_target_data_op_t;
+
+typedef void (*ompt_callback_target_t) (  /* function-pointer type; returns nothing */
+  ompt_target_t kind,
+  ompt_scope_endpoint_t endpoint,
+  int device_num,
+  ompt_data_t *task_data,
+  ompt_id_t target_id,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_target_t {  /* one field per callback argument, in the same order */
+  ompt_target_t kind;
+  ompt_scope_endpoint_t endpoint;
+  int device_num;
+  ompt_id_t task_id;  /* ompt_id_t in place of the callback's ompt_data_t *task_data */
+  ompt_id_t target_id;
+  const void *codeptr_ra;
+} ompt_record_target_t;
+
+typedef void (*ompt_callback_target_map_t) (  /* function-pointer type; returns nothing */
+  ompt_id_t target_id,
+  unsigned int nitems,  /* presumably the element count of the four pointer arguments below -- confirm against the OMPT spec */
+  void **host_addr,
+  void **device_addr,
+  size_t *bytes,
+  unsigned int *mapping_flags,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_target_map_t {  /* field list is identical (names, types, order) to the callback arguments */
+  ompt_id_t target_id;
+  unsigned int nitems;
+  void **host_addr;  /* the record holds pointers, not copies; who owns the pointed-to storage is not visible here */
+  void **device_addr;
+  size_t *bytes;
+  unsigned int *mapping_flags;
+  const void *codeptr_ra;
+} ompt_record_target_map_t;
+
+typedef void (*ompt_callback_target_submit_t) (  /* function-pointer type; returns nothing */
+  ompt_id_t target_id,
+  ompt_id_t host_op_id,
+  unsigned int requested_num_teams
+);
+
+typedef struct ompt_record_target_kernel_t {  /* NOTE: named "target_kernel", not "target_submit"; it has no target_id field */
+  ompt_id_t host_op_id;
+  unsigned int requested_num_teams;
+  unsigned int granted_num_teams;  /* record-only field; the callback has no matching argument */
+  ompt_device_time_t end_time;  /* record-only field; the callback has no matching argument */
+} ompt_record_target_kernel_t;
+
+typedef int (*ompt_callback_control_tool_t) (  /* NOTE: returns int, unlike the other callback types in this part of the header, which return void */
+  uint64_t command,
+  uint64_t modifier,
+  void *arg,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_control_tool_t {  /* NOTE: has no counterpart for the callback's arg pointer */
+  uint64_t command;
+  uint64_t modifier;
+  const void *codeptr_ra;
+} ompt_record_control_tool_t;
+
+typedef struct ompd_address_t {  /* pairs a segment value with an address value */
+  ompd_seg_t segment;
+  ompd_addr_t address;
+} ompd_address_t;
+
+typedef struct ompd_frame_info_t {  /* an ompd_address_t (embedded by value) plus one flag word */
+  ompd_address_t frame_address;
+  ompd_word_t frame_flag;
+} ompd_frame_info_t;
+
+typedef struct _ompd_aspace_handle ompd_address_space_handle_t;  /* struct body is not defined in this part of the header */
+typedef struct _ompd_thread_handle ompd_thread_handle_t;  /* struct body is not defined in this part of the header */
+typedef struct _ompd_parallel_handle ompd_parallel_handle_t;  /* struct body is not defined in this part of the header */
+typedef struct _ompd_task_handle ompd_task_handle_t;  /* struct body is not defined in this part of the header */
+
+typedef struct _ompd_aspace_cont ompd_address_space_context_t;  /* struct body is not defined in this part of the header */
+typedef struct _ompd_thread_cont ompd_thread_context_t;  /* struct body is not defined in this part of the header */
+
+typedef struct ompd_device_type_sizes_t {  /* six uint8_t fields, each named after a C type; presumably sizes in bytes -- confirm against the OMPD spec */
+  uint8_t sizeof_char;
+  uint8_t sizeof_short;
+  uint8_t sizeof_int;
+  uint8_t sizeof_long;
+  uint8_t sizeof_long_long;
+  uint8_t sizeof_pointer;
+} ompd_device_type_sizes_t;
+
+typedef struct ompt_record_ompt_t {  /* common header fields followed by a union of the per-event record structs */
+  ompt_callbacks_t type;  /* presumably selects which member of the union below is meaningful -- confirm against the OMPT spec */
+  ompt_device_time_t time;
+  ompt_id_t thread_id;
+  ompt_id_t target_id;  /* held here once; the target_data_op and target_kernel records have no target_id field of their own */
+  union {  /* all members share storage, so at most one is meaningful for a given record */
+    ompt_record_thread_begin_t thread_begin;
+    ompt_record_parallel_begin_t parallel_begin;
+    ompt_record_parallel_end_t parallel_end;
+    ompt_record_work_t work;
+    ompt_record_dispatch_t dispatch;
+    ompt_record_task_create_t task_create;
+    ompt_record_dependences_t dependences;
+    ompt_record_task_dependence_t task_dependence;
+    ompt_record_task_schedule_t task_schedule;
+    ompt_record_implicit_task_t implicit_task;
+    ompt_record_master_t master;
+    ompt_record_sync_region_t sync_region;
+    ompt_record_mutex_acquire_t mutex_acquire;
+    ompt_record_mutex_t mutex;
+    ompt_record_nest_lock_t nest_lock;
+    ompt_record_flush_t flush;
+    ompt_record_cancel_t cancel;
+    ompt_record_target_t target;
+    ompt_record_target_data_op_t target_data_op;
+    ompt_record_target_map_t target_map;
+    ompt_record_target_kernel_t target_kernel;
+    ompt_record_control_tool_t control_tool;
+  } record;
+} ompt_record_ompt_t;
+
+typedef ompt_record_ompt_t *(*ompt_get_record_ompt_t) (  /* function-pointer type returning a pointer to an ompt_record_ompt_t */
+  ompt_buffer_t *buffer,
+  ompt_buffer_cursor_t current  /* cursor is passed by value */
+);
+
+#define ompt_id_none 0
+#define ompt_data_none {0}  /* brace initializer: usable only where an initializer list is accepted, not as an expression */
+#define ompt_time_none 0
+#define ompt_hwid_none 0
+#define ompt_addr_none ~0  /* int with every bit set; NOTE the expansion is not parenthesized */
+#define ompt_mutex_impl_none 0
+#define ompt_wait_id_none 0
+
+#define ompd_segment_none 0
+
+#endif /* __OMPT__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/builtin_types.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/builtin_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..5247c40807f0dd36a886513ab1bff5d2977364db
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/builtin_types.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "device_types.h"
+#if !defined(__CUDACC_RTC__)
+#define EXCLUDE_FROM_RTC
+#include "driver_types.h"
+#undef EXCLUDE_FROM_RTC
+#endif /* !__CUDACC_RTC__ */
+#include "surface_types.h"
+#include "texture_types.h"
+#include "vector_types.h"
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/channel_descriptor.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/channel_descriptor.h
new file mode 100644
index 0000000000000000000000000000000000000000..e4fba89435ec69efeddaaaacfe2b6e2f4144dd34
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/channel_descriptor.h
@@ -0,0 +1,597 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CHANNEL_DESCRIPTOR_H__)
+#define __CHANNEL_DESCRIPTOR_H__
+
+#if defined(__cplusplus)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+/**
+ * \addtogroup CUDART_HIGHLEVEL
+ *
+ * @{
+ */
+
+/**
+ * \brief \hl Returns a channel descriptor using the specified format
+ *
+ * Returns a channel descriptor with format \p f and number of bits of each
+ * component \p x, \p y, \p z, and \p w.  The ::cudaChannelFormatDesc is
+ * defined as:
+ * \code
+  struct cudaChannelFormatDesc {
+    int x, y, z, w;
+    enum cudaChannelFormatKind f;
+  };
+ * \endcode
+ *
+ * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
+ * ::cudaChannelFormatKindUnsigned, ::cudaChannelFormatKindFloat,
+ * ::cudaChannelFormatKindSignedNormalized8X1, ::cudaChannelFormatKindSignedNormalized8X2,
+ * ::cudaChannelFormatKindSignedNormalized8X4,
+ * ::cudaChannelFormatKindUnsignedNormalized8X1, ::cudaChannelFormatKindUnsignedNormalized8X2,
+ * ::cudaChannelFormatKindUnsignedNormalized8X4,
+ * ::cudaChannelFormatKindSignedNormalized16X1, ::cudaChannelFormatKindSignedNormalized16X2,
+ * ::cudaChannelFormatKindSignedNormalized16X4,
+ * ::cudaChannelFormatKindUnsignedNormalized16X1, ::cudaChannelFormatKindUnsignedNormalized16X2,
+ * ::cudaChannelFormatKindUnsignedNormalized16X4, 
+ * ::cudaChannelFormatKindUnsignedNormalized1010102
+ * or ::cudaChannelFormatKindNV12.
+ *
+ * The format is specified by the template specialization.
+ *
+ * The template function specializes for the following scalar types:
+ * char, signed char, unsigned char, short, unsigned short, int, unsigned int, long, unsigned long, and float (long and unsigned long only when __LP64__ is not defined).
+ * The template function specializes for the following vector types:
+ * char{1|2|4}, uchar{1|2|4}, short{1|2|4}, ushort{1|2|4}, int{1|2|4}, uint{1|2|4}, long{1|2|4}, ulong{1|2|4}, float{1|2|4} (long and ulong variants only when __LP64__ is not defined).
+ * The template function specializes for the following cudaChannelFormatKind enum values:
+ * ::cudaChannelFormatKind{Uns|S}ignedNormalized{8|16}X{1|2|4},
+ * ::cudaChannelFormatKindUnsignedNormalized1010102, ::cudaChannelFormatKindNV12
+ * and the block-compressed kinds (BC1 through BC7, including their SRGB and signed variants).
+ *
+ * Invoking the function on a type without a specialization defaults to creating a channel format of kind ::cudaChannelFormatKindNone
+ *
+ * \return
+ * Channel descriptor with format \p f
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
+ * ::cudaGetChannelDesc
+ */
+template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)  /* primary template: used for any T without an explicit specialization */
+{
+  return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);  /* calls the 5-argument overload: all widths 0, kind None */
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
+{
+  /* float-kind descriptor: x is as wide as unsigned short, y/z/w are 0 */
+  const int bits = static_cast<int>(sizeof(unsigned short) * 8);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
+{
+  /* same result as cudaCreateChannelDescHalf(): x only */
+  const int bits = static_cast<int>(sizeof(unsigned short) * 8);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
+{
+  /* float-kind descriptor: x and y as wide as unsigned short, z/w are 0 */
+  const int bits = static_cast<int>(sizeof(unsigned short) * 8);
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
+{
+  /* float-kind descriptor: all four channels as wide as unsigned short */
+  const int bits = static_cast<int>(sizeof(unsigned short) * 8);
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
+{
+  const int bits = static_cast<int>(sizeof(char) * 8);
+#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
+  const cudaChannelFormatKind kind = cudaChannelFormatKindUnsigned;
+#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
+  const cudaChannelFormatKind kind = cudaChannelFormatKindSigned;
+#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
+  return cudaCreateChannelDesc(bits, 0, 0, 0, kind);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
+{
+  /* signed, x only */
+  const int bits = static_cast<int>(sizeof(signed char) * 8);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
+{
+  /* unsigned, x only */
+  const int bits = static_cast<int>(sizeof(unsigned char) * 8);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
+{
+  /* signed, x only; width taken from signed char */
+  const int bits = static_cast<int>(sizeof(signed char) * 8);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
+{
+  /* unsigned, x only */
+  const int bits = static_cast<int>(sizeof(unsigned char) * 8);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
+{
+  /* signed, x and y */
+  const int bits = static_cast<int>(sizeof(signed char) * 8);
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
+{
+  /* unsigned, x and y */
+  const int bits = static_cast<int>(sizeof(unsigned char) * 8);
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
+{
+  /* signed, all four channels */
+  const int bits = static_cast<int>(sizeof(signed char) * 8);
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
+{
+  /* unsigned, all four channels */
+  const int bits = static_cast<int>(sizeof(unsigned char) * 8);
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
+{
+  /* signed, x only */
+  const int bits = static_cast<int>(sizeof(short) * 8);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
+{
+  /* unsigned, x only */
+  const int bits = static_cast<int>(sizeof(unsigned short) * 8);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
+{
+  /* signed, x only */
+  const int bits = static_cast<int>(sizeof(short) * 8);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
+{
+  /* unsigned, x only */
+  const int bits = static_cast<int>(sizeof(unsigned short) * 8);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
+{
+  /* signed, x and y */
+  const int bits = static_cast<int>(sizeof(short) * 8);
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
+{
+  /* unsigned, x and y */
+  const int bits = static_cast<int>(sizeof(unsigned short) * 8);
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
+{
+  /* signed, all four channels */
+  const int bits = static_cast<int>(sizeof(short) * 8);
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
+{
+  /* unsigned, all four channels */
+  const int bits = static_cast<int>(sizeof(unsigned short) * 8);
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
+{
+  /* signed, x only */
+  const int bits = static_cast<int>(sizeof(int) * 8);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
+{
+  /* unsigned, x only */
+  const int bits = static_cast<int>(sizeof(unsigned int) * 8);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
+{
+  /* signed, x only */
+  const int bits = static_cast<int>(sizeof(int) * 8);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
+{
+  /* unsigned, x only */
+  const int bits = static_cast<int>(sizeof(unsigned int) * 8);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
+{
+  /* signed, x and y */
+  const int bits = static_cast<int>(sizeof(int) * 8);
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
+{
+  /* unsigned, x and y */
+  const int bits = static_cast<int>(sizeof(unsigned int) * 8);
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
+{
+  /* signed, all four channels */
+  const int bits = static_cast<int>(sizeof(int) * 8);
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
+{
+  /* unsigned, all four channels */
+  const int bits = static_cast<int>(sizeof(unsigned int) * 8);
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
+}
+
+#if !defined(__LP64__)
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
+{
+  /* signed, x only */
+  const int bits = static_cast<int>(sizeof(long) * 8);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
+{
+  /* unsigned, x only */
+  const int bits = static_cast<int>(sizeof(unsigned long) * 8);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
+{
+  /* signed, x only */
+  const int bits = static_cast<int>(sizeof(long) * 8);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
+{
+  /* unsigned, x only */
+  const int bits = static_cast<int>(sizeof(unsigned long) * 8);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
+{
+  /* signed, x and y */
+  const int bits = static_cast<int>(sizeof(long) * 8);
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
+{
+  /* unsigned, x and y */
+  const int bits = static_cast<int>(sizeof(unsigned long) * 8);
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
+{
+  /* signed, all four channels */
+  const int bits = static_cast<int>(sizeof(long) * 8);
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
+{
+  /* unsigned, all four channels */
+  const int bits = static_cast<int>(sizeof(unsigned long) * 8);
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
+}
+
+#endif /* !__LP64__ */
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
+{
+  /* float kind, x only */
+  const int bits = static_cast<int>(sizeof(float) * 8);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
+{
+  /* float kind, x only */
+  const int bits = static_cast<int>(sizeof(float) * 8);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
+{
+  /* float kind, x and y */
+  const int bits = static_cast<int>(sizeof(float) * 8);
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
+{
+  /* float kind, all four channels */
+  const int bits = static_cast<int>(sizeof(float) * 8);
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescNV12(void)
+{
+    /* NV12 kind: x, y and z as wide as char, w is 0 */
+    const int bits = static_cast<int>(sizeof(char) * 8);
+    return cudaCreateChannelDesc(bits, bits, bits, 0, cudaChannelFormatKindNV12);
+}
+
+template<cudaChannelFormatKind> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)  /* primary template over an enum value: used for any kind without an explicit specialization */
+{
+    return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);  /* all widths 0, kind None */
+}
+
+/* Signed 8-bit normalized integer formats: 8 bits in the first 1, 2 or 4 channels (X1/X2/X4), 0 in the rest */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X1>(void)
+{
+    return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedNormalized8X1);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X2>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedNormalized8X2);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X4>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindSignedNormalized8X4);
+}
+
+/* Unsigned 8-bit normalized integer formats: 8 bits in the first 1, 2 or 4 channels (X1/X2/X4), 0 in the rest */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X1>(void)
+{
+    return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized8X1);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X2>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedNormalized8X2);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X4>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedNormalized8X4);
+}
+
+/* Signed 16-bit normalized integer formats: 16 bits in the first 1, 2 or 4 channels (X1/X2/X4), 0 in the rest */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X1>(void)
+{
+    return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindSignedNormalized16X1);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X2>(void)
+{
+    return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindSignedNormalized16X2);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X4>(void)
+{
+    return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindSignedNormalized16X4);
+}
+
+/* Unsigned 16-bit normalized integer formats: 16 bits in the first 1, 2 or 4 channels (X1/X2/X4), 0 in the rest */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X1>(void)
+{
+    return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized16X1);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X2>(void)
+{
+    return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindUnsignedNormalized16X2);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X4>(void)
+{
+    return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindUnsignedNormalized16X4);
+}
+
+/* NV12 format: 8 bits in x, y and z, 0 in w (the same values cudaCreateChannelDescNV12() produces) */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindNV12>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 0, cudaChannelFormatKindNV12);
+}
+
+/* UnsignedNormalized1010102 format: 10 bits in x, y and z, 2 bits in w */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized1010102>(void)
+{
+    return cudaCreateChannelDesc(10, 10, 10, 2, cudaChannelFormatKindUnsignedNormalized1010102);
+}
+
+/* BC1 format: descriptor reports 8 bits in each of x, y, z, w */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1);
+}
+
+/* BC1sRGB format: same widths as BC1, SRGB kind */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1SRGB>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1SRGB);
+}
+
+/* BC2 format: descriptor reports 8 bits in each of x, y, z, w */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2);
+}
+
+/* BC2sRGB format: same widths as BC2, SRGB kind */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2SRGB>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2SRGB);
+}
+
+/* BC3 format: descriptor reports 8 bits in each of x, y, z, w */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3);
+}
+
+/* BC3sRGB format: same widths as BC3, SRGB kind */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3SRGB>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3SRGB);
+}
+
+/* BC4 unsigned format: descriptor reports 8 bits in x only */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed4>(void)
+{
+    return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed4);
+}
+
+/* BC4 signed format: descriptor reports 8 bits in x only */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed4>(void)
+{
+    return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedBlockCompressed4);
+}
+
+/* BC5 unsigned format: descriptor reports 8 bits in x and y */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed5>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed5);
+}
+
+/* BC5 signed format: descriptor reports 8 bits in x and y */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed5>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedBlockCompressed5);
+}
+
+/* BC6H unsigned format: descriptor reports 16 bits in x, y and z, 0 in w */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed6H>(void)
+{
+    return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindUnsignedBlockCompressed6H);
+}
+
+/* BC6H signed format: descriptor reports 16 bits in x, y and z, 0 in w */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed6H>(void)
+{
+    return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindSignedBlockCompressed6H);
+}
+
+/* BC7 format: descriptor reports 8 bits in each of x, y, z, w */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7);
+}
+
+/* BC7sRGB format: same widths as BC7, SRGB kind */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7SRGB>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7SRGB);
+}
+
+#endif /* __cplusplus */
+
+/** @} */ /* END CUDART_HIGHLEVEL */
+
+
+#endif /* !__CHANNEL_DESCRIPTOR_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/common_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/common_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f8ea3d242640f2196b789c7da6c05d2ed1bed3e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/common_functions.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) /* guard macro not set by the includer: warn, then set it for the include below */
+#if defined(_MSC_VER) /* MSVC branch: the notice is emitted with #pragma message */
+#pragma message("common_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else /* every other compiler: the notice is emitted with #warning */
+#warning "common_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif /* _MSC_VER */
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
+#endif /* !__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ */
+
+#include "crt/common_functions.h"
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__) /* this wrapper set the guard itself, so remove both macros again */
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
+#endif /* __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups.h
new file mode 100644
index 0000000000000000000000000000000000000000..0532a97bbaba37b6aa8540426d9d89adef6f4612
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups.h
@@ -0,0 +1,1743 @@
+/*
+ * Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _COOPERATIVE_GROUPS_H_
+#define _COOPERATIVE_GROUPS_H_
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#include "cooperative_groups/details/info.h"
+#include "cooperative_groups/details/driver_abi.h"
+#include "cooperative_groups/details/helpers.h"
+#include "cooperative_groups/details/memory.h"
+
+#if defined(_CG_HAS_STL_ATOMICS) // with this macro set, _CG_THREAD_SCOPE(scope) declares a static `thread_scope` member
+#include <cuda/atomic>
+#define _CG_THREAD_SCOPE(scope) _CG_STATIC_CONST_DECL cuda::thread_scope thread_scope = scope;
+#else // otherwise _CG_THREAD_SCOPE(scope) expands to nothing
+#define _CG_THREAD_SCOPE(scope)
+#endif // _CG_HAS_STL_ATOMICS
+
+_CG_BEGIN_NAMESPACE
+
+namespace details { // type tags that the group classes pass to thread_group_base<>
+    _CG_CONST_DECL unsigned int coalesced_group_id = 1; // coalesced_group; also used for tiles built by thread_block::_get_tiled_threads
+    _CG_CONST_DECL unsigned int multi_grid_group_id = 2; // multi_grid_group (experimental-ABI variant)
+    _CG_CONST_DECL unsigned int grid_group_id = 3; // grid_group
+    _CG_CONST_DECL unsigned int thread_block_id = 4; // thread_block
+    _CG_CONST_DECL unsigned int multi_tile_group_id = 5; // no user of this tag is visible in this part of the header
+    _CG_CONST_DECL unsigned int cluster_group_id = 6; // cluster_group
+}
+
+/**
+ * class thread_group;
+ *
+ * Generic thread group type, into which all groups are convertible.
+ * It acts as a container for all storage necessary for the derived groups,
+ * and will dispatch the API calls to the correct derived group. This means
+ * that all derived groups must implement the same interface as thread_group.
+ */
+class thread_group
+{
+protected:
+    struct group_data { // smallest view of the shared storage: only the type tag
+        unsigned int _unused : 1;
+        unsigned int type : 7, : 0; // 7-bit tag; the unnamed zero-width bit-field closes the allocation unit
+    };
+
+    struct gg_data  { // payload written by grid_group; NOTE(review): has no type field, so gridWs overlays the tag bits of group_data
+        details::grid_workspace *gridWs;
+    };
+
+#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+    struct mg_data  { // payload written by the experimental-ABI multi_grid_group
+        unsigned long long _unused : 1;
+        unsigned long long type    : 7;
+        unsigned long long handle  : 56;
+        const details::multi_grid::multi_grid_functions *functions;
+    };
+#endif
+
+    struct tg_data { // payload for coalesced groups and tiles
+        unsigned int is_tiled : 1;
+        unsigned int type : 7;
+        unsigned int size : 24;
+        // the three bit-fields above add up to 32 bits (4 bytes)
+        unsigned int metaGroupSize : 16;
+        unsigned int metaGroupRank : 16;
+        // another 32 bits, 8 bytes so far
+        unsigned int mask;
+        // 12 bytes so far
+        unsigned int _res;
+    };
+
+    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
+    friend class thread_block;
+
+    union __align__(8) { // every payload variant shares this 8-byte-aligned storage
+        group_data  group;
+        tg_data     coalesced;
+        gg_data     grid;
+#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+        mg_data     multi_grid;
+#endif
+    } _data;
+
+    _CG_QUALIFIER thread_group operator=(const thread_group& src); // protected and only declared here; returns by value
+
+    _CG_QUALIFIER thread_group(unsigned int type) { // records the type tag and clears the spare bit
+        _data.group.type = type;
+        _data.group._unused = false;
+    }
+
+#ifdef _CG_CPP11_FEATURES
+    static_assert(sizeof(tg_data) <= 16, "Failed size check"); // each payload struct must fit in 16 bytes
+    static_assert(sizeof(gg_data) <= 16, "Failed size check");
+#  ifdef _CG_ABI_EXPERIMENTAL
+    static_assert(sizeof(mg_data) <= 16, "Failed size check");
+#  endif
+#endif
+
+public:
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
+
+    _CG_QUALIFIER unsigned long long size() const; // declaration only; no definition in this part of the header
+    _CG_QUALIFIER unsigned long long num_threads() const; // declaration only
+    _CG_QUALIFIER unsigned long long thread_rank() const; // declaration only
+    _CG_QUALIFIER void sync() const; // declaration only
+    _CG_QUALIFIER unsigned int get_type() const { // returns the 7-bit tag stored by the constructor
+        return _data.group.type;
+    }
+
+};
+
+template <unsigned int TyId>
+struct thread_group_base : public thread_group { // thread_group with its type tag fixed at compile time
+    _CG_QUALIFIER thread_group_base() : thread_group(TyId) {} // stores TyId as the runtime tag
+    _CG_STATIC_CONST_DECL unsigned int id = TyId; // the same tag as a static constant
+};
+
+#if defined(_CG_HAS_MULTI_GRID_GROUP)
+
+/**
+ * class multi_grid_group;
+ *
+ * Threads within this group are guaranteed to be co-resident on the
+ * same system, on multiple devices within the same launched kernels.
+ * To use this group, the kernel must have been launched with
+ * cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent),
+ * and the device must support it (queryable device attribute).
+ *
+ * Constructed via this_multi_grid();
+ */
+
+
+# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) // variant 1: state lives in thread_group::_data.multi_grid
+class multi_grid_group;
+
+// Multi grid group requires these functions to be templated to prevent ptxas from trying to use CG syscalls
+template <typename = void>
+__device__ _CG_DEPRECATED multi_grid_group this_multi_grid();
+
+class multi_grid_group : public thread_group_base<details::multi_grid_group_id>
+{
+private:
+    template <typename = void>
+    _CG_QUALIFIER multi_grid_group() { // private: reachable only through the friend this_multi_grid<void>()
+        _data.multi_grid.functions = details::multi_grid::load_grid_intrinsics(); // function table used by every member below
+        _data.multi_grid.handle = _data.multi_grid.functions->get_intrinsic_handle();
+    }
+
+    friend multi_grid_group this_multi_grid<void>();
+
+public:
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
+
+    _CG_QUALIFIER bool is_valid() const { // a zero handle marks an invalid group
+        return (_data.multi_grid.handle != 0);
+    }
+
+    _CG_QUALIFIER void sync() const { // aborts on an invalid group, otherwise syncs through the function table
+        if (!is_valid()) {
+            _CG_ABORT();
+        }
+        _data.multi_grid.functions->sync(_data.multi_grid.handle);
+    }
+
+    _CG_QUALIFIER unsigned long long num_threads() const { // queried through the function table on every call
+        _CG_ASSERT(is_valid());
+        return _data.multi_grid.functions->size(_data.multi_grid.handle);
+    }
+
+    _CG_QUALIFIER unsigned long long size() const { // same value as num_threads()
+        return num_threads();
+    }
+
+    _CG_QUALIFIER unsigned long long thread_rank() const { // queried through the function table on every call
+        _CG_ASSERT(is_valid());
+        return _data.multi_grid.functions->thread_rank(_data.multi_grid.handle);
+    }
+
+    _CG_QUALIFIER unsigned int grid_rank() const { // forwards to the function table's grid_rank
+        _CG_ASSERT(is_valid());
+        return (_data.multi_grid.functions->grid_rank(_data.multi_grid.handle));
+    }
+
+    _CG_QUALIFIER unsigned int num_grids() const { // forwards to the function table's num_grids
+        _CG_ASSERT(is_valid());
+        return (_data.multi_grid.functions->num_grids(_data.multi_grid.handle));
+    }
+};
+# else // variant 2: standalone class (no thread_group base) that caches handle, size and rank
+class multi_grid_group
+{
+private:
+    unsigned long long _handle;
+    unsigned int _size;
+    unsigned int _rank;
+
+    friend _CG_QUALIFIER multi_grid_group this_multi_grid();
+
+    _CG_QUALIFIER multi_grid_group() { // private: reachable only through the friend this_multi_grid()
+        _handle = details::multi_grid::get_intrinsic_handle();
+        _size = details::multi_grid::size(_handle); // size and rank are read once here and cached
+        _rank = details::multi_grid::thread_rank(_handle);
+    }
+
+public:
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
+
+    _CG_QUALIFIER _CG_DEPRECATED bool is_valid() const { // a zero handle marks an invalid group
+        return (_handle != 0);
+    }
+
+    _CG_QUALIFIER _CG_DEPRECATED void sync() const { // aborts on an invalid group, otherwise syncs on the handle
+        if (!is_valid()) {
+            _CG_ABORT();
+        }
+        details::multi_grid::sync(_handle);
+    }
+
+    _CG_QUALIFIER _CG_DEPRECATED unsigned long long num_threads() const { // returns the size cached by the constructor
+        _CG_ASSERT(is_valid());
+        return _size;
+    }
+
+    _CG_QUALIFIER _CG_DEPRECATED unsigned long long size() const { // same value as num_threads()
+        return num_threads();
+    }
+
+    _CG_QUALIFIER _CG_DEPRECATED unsigned long long thread_rank() const { // returns the rank cached by the constructor
+        _CG_ASSERT(is_valid());
+        return _rank;
+    }
+
+    _CG_QUALIFIER _CG_DEPRECATED unsigned int grid_rank() const { // not cached: queried from the handle on each call
+        _CG_ASSERT(is_valid());
+        return (details::multi_grid::grid_rank(_handle));
+    }
+
+    _CG_QUALIFIER _CG_DEPRECATED unsigned int num_grids() const { // not cached: queried from the handle on each call
+        _CG_ASSERT(is_valid());
+        return (details::multi_grid::num_grids(_handle));
+    }
+};
+# endif // _CG_CPP11_FEATURES && _CG_ABI_EXPERIMENTAL
+
+/**
+ * multi_grid_group this_multi_grid()
+ *
+ * Constructs a multi_grid_group
+ */
+# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) // templated __device__ definition, matching the forward declaration
+template <typename>
+__device__
+#else // otherwise a plain _CG_QUALIFIER function
+_CG_QUALIFIER
+# endif // _CG_CPP11_FEATURES && _CG_ABI_EXPERIMENTAL
+_CG_DEPRECATED
+multi_grid_group this_multi_grid()
+{
+    return multi_grid_group(); // uses the private constructor via friendship
+}
+#endif
+
+/**
+ * class grid_group;
+ *
+ * Threads within this group are guaranteed to be co-resident on the
+ * same device within the same launched kernel. To use this group, the kernel
+ * must have been launched with cuLaunchCooperativeKernel (or the CUDA Runtime equivalent),
+ * and the device must support it (queryable device attribute).
+ *
+ * Constructed via this_grid();
+ */
+class grid_group : public thread_group_base<details::grid_group_id>
+{
+    _CG_STATIC_CONST_DECL unsigned int _group_id = details::grid_group_id;
+    friend _CG_QUALIFIER grid_group this_grid();
+
+private:
+    _CG_QUALIFIER grid_group(details::grid_workspace *gridWs) { // private: built only by the friend this_grid()
+        _data.grid.gridWs = gridWs;
+    }
+
+ public:
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
+
+    _CG_QUALIFIER bool is_valid() const { // valid when a workspace pointer was obtained
+        return (_data.grid.gridWs != NULL);
+    }
+
+    _CG_QUALIFIER void sync() const { // aborts on an invalid group, otherwise syncs on the workspace barrier
+        if (!is_valid()) {
+            _CG_ABORT();
+        }
+        details::grid::sync(&_data.grid.gridWs->barrier);
+    }
+
+#if defined(_CG_CPP11_FEATURES)
+    using arrival_token = unsigned int;
+
+    _CG_QUALIFIER arrival_token barrier_arrive() const { // same validity check as sync(); returns the token for barrier_wait
+        if (!is_valid()) {
+            _CG_ABORT();
+        }
+        return details::grid::barrier_arrive(&_data.grid.gridWs->barrier);
+    }
+
+    _CG_QUALIFIER void barrier_wait(arrival_token&& token) const { // NOTE(review): no is_valid() check here, unlike sync()/barrier_arrive()
+        details::grid::barrier_wait(token, &_data.grid.gridWs->barrier);
+    }
+#endif
+
+    _CG_STATIC_QUALIFIER unsigned long long size() { // the static members below forward to details::grid and read no member state
+        return details::grid::size();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 group_dim() { // forwards to details::grid::grid_dim()
+        return details::grid::grid_dim();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 dim_threads() {
+        return details::grid::dim_threads();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long num_threads() {
+        return details::grid::num_threads();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 thread_index() {
+        return details::grid::thread_index();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long thread_rank() {
+        return details::grid::thread_rank();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 dim_blocks() {
+        return details::grid::dim_blocks();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long num_blocks() {
+        return details::grid::num_blocks();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 block_index() {
+        return details::grid::block_index();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long block_rank() {
+        return details::grid::block_rank();
+    }
+
+# if defined(_CG_HAS_CLUSTER_GROUP) // cluster queries exist only when cluster groups are available
+    _CG_STATIC_QUALIFIER dim3 dim_clusters() {
+        return details::grid::dim_clusters();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long num_clusters() {
+        return details::grid::num_clusters();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 cluster_index() {
+        return details::grid::cluster_index();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
+        return details::grid::cluster_rank();
+    }
+# endif // _CG_HAS_CLUSTER_GROUP
+};
+
+_CG_QUALIFIER grid_group this_grid() { // builds a grid_group around the workspace from details::get_grid_workspace()
+    // Load a workspace from the driver
+    grid_group gg(details::get_grid_workspace());
+#ifdef _CG_DEBUG
+    // *all* threads must be available to synchronize
+    gg.sync(); // debug builds only; sync() aborts if the group is invalid
+#endif // _CG_DEBUG
+    return gg;
+}
+
+#if defined(_CG_HAS_CLUSTER_GROUP)
+/**
+ * class cluster_group
+ *
+ * Every GPU kernel is executed by a grid of thread blocks. A grid can be evenly
+ * divided along all dimensions to form groups of blocks, each group of which is
+ * a block cluster. Clustered grids are subject to various restrictions and
+ * limitations. Primarily, a cluster consists of at most 8 blocks by default
+ * (although the user is allowed to opt-in to non-standard sizes,) and clustered
+ * grids are subject to additional occupancy limitations due to per-cluster
+ * hardware resource consumption. In exchange, a block cluster is guaranteed to
+ * be a cooperative group, with access to all cooperative group capabilities, as
+ * well as cluster specific capabilities and accelerations. A cluster_group
+ * represents a block cluster.
+ *
+ * Constructed via this_cluster();
+ */
+class cluster_group : public thread_group_base<details::cluster_group_id>
+{
+    // Friends
+    friend _CG_QUALIFIER cluster_group this_cluster();
+
+    // Private constructor: only the friend this_cluster() can create a cluster_group
+    _CG_QUALIFIER cluster_group()
+    {
+    }
+
+ public:
+    //_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_cluster)
+
+    using arrival_token = struct {}; // empty type: the token carries no state
+
+    // Functionality exposed by the group
+    _CG_STATIC_QUALIFIER void sync() // all members are static and forward to details::cluster
+    {
+        return details::cluster::sync();
+    }
+
+    _CG_STATIC_QUALIFIER arrival_token barrier_arrive() // arrives, then hands back an empty token
+    {
+        details::cluster::barrier_arrive();
+        return arrival_token();
+    }
+
+    _CG_STATIC_QUALIFIER void barrier_wait() // token-less overload
+    {
+        return details::cluster::barrier_wait();
+    }
+
+    _CG_STATIC_QUALIFIER void barrier_wait(arrival_token&&) // the token is ignored; same call as the overload above
+    {
+        return details::cluster::barrier_wait();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr)
+    {
+        return details::cluster::query_shared_rank(addr);
+    }
+
+    template <typename T>
+    _CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank)
+    {
+        return details::cluster::map_shared_rank(addr, rank);
+    }
+
+    _CG_STATIC_QUALIFIER dim3 block_index()
+    {
+        return details::cluster::block_index();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int block_rank()
+    {
+        return details::cluster::block_rank();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 thread_index()
+    {
+        return details::cluster::thread_index();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int thread_rank()
+    {
+        return details::cluster::thread_rank();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 dim_blocks()
+    {
+        return details::cluster::dim_blocks();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int num_blocks()
+    {
+        return details::cluster::num_blocks();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 dim_threads()
+    {
+        return details::cluster::dim_threads();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int num_threads()
+    {
+        return details::cluster::num_threads();
+    }
+
+    // Legacy aliases
+    _CG_STATIC_QUALIFIER unsigned int size() // same value as num_threads()
+    {
+        return num_threads();
+    }
+};
+
+/*
+ * cluster_group this_cluster()
+ *
+ * Constructs a cluster_group
+ */
+_CG_QUALIFIER cluster_group this_cluster() // factory; uses cluster_group's private constructor via friendship
+{
+    cluster_group cg;
+#ifdef _CG_DEBUG
+    cg.sync(); // debug builds only: synchronize once on creation
+#endif
+    return cg;
+}
+#endif
+
+#if defined(_CG_CPP11_FEATURES) // forward declarations so thread_block can befriend the scratch-memory factory
+class thread_block;
+template <unsigned int MaxBlockSize>
+_CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch);
+#endif // _CG_CPP11_FEATURES
+
+/**
+ * class thread_block
+ *
+ * Every GPU kernel is executed by a grid of thread blocks, and threads within
+ * each block are guaranteed to reside on the same streaming multiprocessor.
+ * A thread_block represents a thread block whose dimensions are not known until runtime.
+ *
+ * Constructed via this_thread_block();
+ */
+class thread_block : public thread_group_base<details::thread_block_id>
+{
+    // Friends
+    friend _CG_QUALIFIER thread_block this_thread_block();
+    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
+    friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);
+
+#if defined(_CG_CPP11_FEATURES)
+    template <unsigned int MaxBlockSize>
+    friend _CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch);
+    template <unsigned int Size>
+    friend class __static_size_multi_warp_tile_base;
+
+    details::multi_warp_scratch* const tile_memory; // scratch pointer, fixed at construction
+
+    template <unsigned int MaxBlockSize>
+    _CG_QUALIFIER thread_block(block_tile_memory<MaxBlockSize>& scratch) : // private: used by this_thread_block(scratch)
+        tile_memory(details::get_scratch_ptr(&scratch)) {
+#ifdef _CG_DEBUG
+        if (num_threads() > MaxBlockSize) { // debug builds: abort when the block has more threads than MaxBlockSize
+            details::abort();
+        }
+#endif
+
+
+#if defined(_CG_USER_PROVIDED_SHARED_MEMORY)
+#define _CG_SKIP_BARRIER_INIT_TARGET NV_NO_TARGET
+#else
+#define _CG_SKIP_BARRIER_INIT_TARGET NV_PROVIDES_SM_80
+#endif
+        NV_IF_ELSE_TARGET(
+            _CG_SKIP_BARRIER_INIT_TARGET,
+            // skip if clause
+        ,
+            (tile_memory->init_barriers(thread_rank());
+            sync();)
+        )
+    }
+#endif
+#undef _CG_SKIP_BARRIER_INIT_TARGET
+
+    // Private default constructor: used by this_thread_block() and by the error path of _get_tiled_threads()
+    _CG_QUALIFIER thread_block()
+#if defined(_CG_CPP11_FEATURES)
+    : tile_memory(details::get_scratch_ptr(NULL))
+#endif
+    { }
+
+    // Internal Use
+    _CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const { // builds the caller's tile of up to tilesz threads
+        const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0); // also true for 0, which is rejected separately below
+
+        // Invalid, immediately fail
+        if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
+            details::abort();
+            return (thread_block());
+        }
+
+        unsigned int mask;
+        unsigned int base_offset = thread_rank() & (~(tilesz - 1)); // block rank of the first thread in the caller's tile
+        unsigned int masklength = min((unsigned int)size() - base_offset, tilesz); // the last tile may hold fewer than tilesz threads
+
+        mask = (unsigned int)(-1) >> (32 - masklength); // masklength low bits set
+        mask <<= (details::laneid() & ~(tilesz - 1)); // shift to the tile's first lane
+        thread_group tile = thread_group(details::coalesced_group_id); // the tile is tagged as a coalesced group
+        tile._data.coalesced.mask = mask;
+        tile._data.coalesced.size = __popc(mask);
+        tile._data.coalesced.metaGroupSize = (details::cta::size() + tilesz - 1) / tilesz; // tile count, rounded up
+        tile._data.coalesced.metaGroupRank = details::cta::thread_rank() / tilesz;
+        tile._data.coalesced.is_tiled = true;
+        return (tile);
+    }
+
+ public:
+    _CG_STATIC_CONST_DECL unsigned int _group_id = details::thread_block_id;
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
+
+    _CG_STATIC_QUALIFIER void sync() { // the static members below forward to details::cta
+        details::cta::sync();
+    }
+
+#if defined(_CG_CPP11_FEATURES)
+    struct arrival_token {}; // empty type: the token carries no state
+
+    _CG_QUALIFIER arrival_token barrier_arrive() const { // only returns an empty token
+        return arrival_token();
+    }
+
+    _CG_QUALIFIER void barrier_wait(arrival_token&&) const { // the token is ignored; performs a full details::cta::sync()
+        details::cta::sync();
+    }
+#endif
+
+    _CG_STATIC_QUALIFIER unsigned int size() {
+        return details::cta::size();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int thread_rank() {
+        return details::cta::thread_rank();
+    }
+
+    // Additional functionality exposed by the group
+    _CG_STATIC_QUALIFIER dim3 group_index() {
+        return details::cta::group_index();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 thread_index() {
+        return details::cta::thread_index();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 group_dim() { // forwards to details::cta::block_dim()
+        return details::cta::block_dim();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 dim_threads() {
+        return details::cta::dim_threads();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int num_threads() {
+        return details::cta::num_threads();
+    }
+
+};
+
+/**
+ * thread_block this_thread_block()
+ *
+ * Constructs a thread_block group
+ */
+_CG_QUALIFIER thread_block this_thread_block() // factory; uses thread_block's private default constructor via friendship
+{
+    return (thread_block());
+}
+
+#if defined(_CG_CPP11_FEATURES)
+template <unsigned int MaxBlockSize>
+_CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch) { // overload taking caller-provided scratch memory
+    return (thread_block(scratch)); // uses the private templated constructor via friendship
+}
+#endif
+
+/**
+ * class coalesced_group
+ *
+ * A group representing the current set of converged threads in a warp.
+ * The size of the group is not guaranteed and it may return a group of
+ * only one thread (itself).
+ *
+ * This group exposes warp-synchronous builtins.
+ * Constructed via coalesced_threads();
+ */
+class coalesced_group : public thread_group_base<details::coalesced_group_id>
+{
+private:
+    friend _CG_QUALIFIER coalesced_group coalesced_threads();
+    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
+    friend _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz);
+    friend class details::_coalesced_group_data_access;
+
+    _CG_QUALIFIER unsigned int _packLanes(unsigned laneMask) const { // re-indexes laneMask from lane numbers to ranks within this group
+        unsigned int member_pack = 0; // result: bit r is set when the member of rank r is in laneMask
+        unsigned int member_rank = 0; // rank of the next member lane found while scanning upward
+        for (int bit_idx = 0; bit_idx < 32; bit_idx++) {
+            unsigned int lane_bit = _data.coalesced.mask & (1u << bit_idx); // unsigned literal: bit_idx reaches 31
+            if (lane_bit) { // lanes outside the group are skipped and consume no rank
+                if (laneMask & lane_bit)
+                    member_pack |= 1u << member_rank; // unsigned literal: member_rank reaches 31 for a full warp
+                member_rank++;
+            }
+        }
+        return (member_pack);
+    }
+
+    // Internal Use
+    _CG_QUALIFIER coalesced_group _get_tiled_threads(unsigned int tilesz) const { // returns the caller's tile of up to tilesz members
+        const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0); // also true for 0, which is rejected separately below
+
+        // Invalid, immediately fail
+        if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
+            details::abort();
+            return (coalesced_group(0));
+        }
+        if (size() <= tilesz) { // the group already fits in one tile: return it unchanged
+            return (*this);
+        }
+
+        if ((_data.coalesced.is_tiled == true) && pow2_tilesz) { // parent is itself a tile: member lanes are contiguous
+            unsigned int base_offset = (thread_rank() & (~(tilesz - 1))); // rank of the first member of the caller's tile
+            unsigned int masklength = min((unsigned int)size() - base_offset, tilesz); // the last tile may be partial
+            unsigned int mask = (unsigned int)(-1) >> (32 - masklength); // masklength low bits set
+
+            mask <<= (details::laneid() & ~(tilesz - 1)); // shift to the tile's first lane
+            coalesced_group coalesced_tile = coalesced_group(mask);
+            coalesced_tile._data.coalesced.metaGroupSize = (size() + tilesz - 1) / tilesz; // round up so a trailing partial tile is counted and metaGroupRank stays below metaGroupSize
+            coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
+            coalesced_tile._data.coalesced.is_tiled = true;
+            return (coalesced_tile);
+        }
+        else if ((_data.coalesced.is_tiled == false) && pow2_tilesz) { // arbitrary lane set: pick members by scanning the mask
+            unsigned int mask = 0;
+            unsigned int member_rank = 0; // members collected for the caller's tile so far
+            int seen_lanes = (thread_rank() / tilesz) * tilesz; // member lanes to skip before the caller's tile begins
+            for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) {
+                unsigned int lane_bit = _data.coalesced.mask & (1u << bit_idx); // unsigned literal: bit_idx reaches 31
+                if (lane_bit) {
+                    if (seen_lanes <= 0 && member_rank < tilesz) { // past the skipped members and the tile is not yet full
+                        mask |= lane_bit;
+                        member_rank++;
+                    }
+                    seen_lanes--;
+                }
+            }
+            coalesced_group coalesced_tile = coalesced_group(mask);
+            // Override parent with the size of this group
+            coalesced_tile._data.coalesced.metaGroupSize = (size() + tilesz - 1) / tilesz; // tile count, rounded up
+            coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
+            return coalesced_tile;
+        }
+        else {
+            // None in _CG_VERSION 1000
+            details::abort(); // not reachable today: non-power-of-two sizes were rejected above
+        }
+
+        return (coalesced_group(0)); // fallback return after the abort path
+    }
+
+ protected:
+    _CG_QUALIFIER coalesced_group(unsigned int mask) { // protected: builds an untiled group from a lane mask
+        _data.coalesced.mask = mask;
+        _data.coalesced.size = __popc(mask); // size is the number of set bits in the mask
+        _data.coalesced.metaGroupRank = 0; // defaults describe a single partition: rank 0 of 1
+        _data.coalesced.metaGroupSize = 1;
+        _data.coalesced.is_tiled = false;
+    }
+
+    _CG_QUALIFIER unsigned int get_mask() const { // protected accessor for the stored lane mask
+        return (_data.coalesced.mask);
+    }
+
+ public:
+    _CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
+
+    _CG_QUALIFIER unsigned int num_threads() const { // returns the stored size field (set from __popc(mask) at construction)
+        return _data.coalesced.size;
+    }
+
+    _CG_QUALIFIER unsigned int size() const { // same value as num_threads()
+        return num_threads();
+    }
+
+    _CG_QUALIFIER unsigned int thread_rank() const { // counts member lanes selected by lanemask32_lt() (presumably the lanes below the caller)
+        return (__popc(_data.coalesced.mask & details::lanemask32_lt()));
+    }
+
+    // Rank of this group in the upper level of the hierarchy
+    _CG_QUALIFIER unsigned int meta_group_rank() const { // returns the stored metaGroupRank field
+        return _data.coalesced.metaGroupRank;
+    }
+
+    // Total num partitions created out of all CTAs when the group was created
+    _CG_QUALIFIER unsigned int meta_group_size() const { // returns the stored metaGroupSize field
+        return _data.coalesced.metaGroupSize;
+    }
+
+    _CG_QUALIFIER void sync() const { // __syncwarp restricted to this group's lane mask
+        __syncwarp(_data.coalesced.mask);
+    }
+
+#ifdef _CG_CPP11_FEATURES
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const { // reads elem from the member whose rank is srcRank
+        unsigned int lane = (srcRank == 0) ? __ffs(_data.coalesced.mask) - 1 : // rank 0: lowest set lane (__ffs is 1-based)
+            (size() == 32) ? srcRank : __fns(_data.coalesced.mask, 0, (srcRank + 1)); // full warp: rank equals lane; else find the (srcRank+1)-th set bit
+
+        return details::tile::shuffle_dispatch<TyElem>::shfl(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
+    }
+
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const { // reads elem from the member delta ranks above the caller
+        if (size() == 32) { // full warp: use the shfl_down dispatch directly
+            return details::tile::shuffle_dispatch<TyElem>::shfl_down(
+                _CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
+        }
+
+        unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1); // member lane delta positions above the caller's lane
+
+        if (lane >= 32) // no such member: fall back to the caller's own lane
+            lane = details::laneid();
+
+        return details::tile::shuffle_dispatch<TyElem>::shfl(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
+    }
+
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl_up(TyElem&& elem, int delta) const { // reads elem from the member delta ranks below the caller
+        if (size() == 32) { // full warp: use the shfl_up dispatch directly
+            return details::tile::shuffle_dispatch<TyElem>::shfl_up(
+                _CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
+        }
+
+        unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1)); // negative offset: search downward from the caller's lane
+        if (lane >= 32) // no such member: fall back to the caller's own lane
+            lane = details::laneid();
+
+        return details::tile::shuffle_dispatch<TyElem>::shfl(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
+    }
+#else
+    template <typename TyIntegral>  // Pre-C++11 variant: takes the value by copy.
+    _CG_QUALIFIER TyIntegral shfl(TyIntegral var, unsigned int src_rank) const {  // Fetch var from the member whose rank in this group is src_rank.
+        details::assert_if_not_arithmetic<TyIntegral>();  // Type check; helper is defined outside this view.
+        unsigned int lane = (src_rank == 0) ? __ffs(_data.coalesced.mask) - 1 :  // Rank 0: lowest set bit of the mask (__ffs is 1-based).
+            (size() == 32) ? src_rank : __fns(_data.coalesced.mask, 0, (src_rank + 1));  // All 32 lanes present: rank is the lane; else find the (src_rank+1)-th set bit.
+        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));  // Shuffle from the resolved hardware lane.
+    }
+
+    template <typename TyIntegral>  // Pre-C++11 variant: takes the value by copy.
+    _CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, int delta) const {  // Read var from the member delta ranks below the caller.
+        details::assert_if_not_arithmetic<TyIntegral>();  // Type check; helper is defined outside this view.
+        if (size() == 32) {  // Fast path: every lane is a member, so the native intrinsic on the full mask is used.
+            return (__shfl_up_sync(0xFFFFFFFF, var, delta, 32));
+        }
+        unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));  // Negative offset: search set bits downward from own lane.
+        if (lane >= 32) lane = details::laneid();  // No member that far below: keep the caller's own lane.
+        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
+    }
+
+    template <typename TyIntegral>  // Pre-C++11 variant: takes the value by copy.
+    _CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, int delta) const {  // Read var from the member delta ranks above the caller.
+        details::assert_if_not_arithmetic<TyIntegral>();  // Type check; helper is defined outside this view.
+        if (size() == 32) {  // Fast path: every lane is a member, so the native intrinsic on the full mask is used.
+            return (__shfl_down_sync(0xFFFFFFFF, var, delta, 32));
+        }
+        unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);  // (delta+1)-th set bit counting upward from own lane.
+        if (lane >= 32) lane = details::laneid();  // No member that far above: keep the caller's own lane.
+        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
+    }
+#endif
+
+    _CG_QUALIFIER int any(int predicate) const {  // Nonzero when the ballot over the group's mask has at least one bit set.
+        return (__ballot_sync(_data.coalesced.mask, predicate) != 0);  // Ballot is taken over the group's lanes only.
+    }
+    _CG_QUALIFIER int all(int predicate) const {  // Nonzero when the ballot result equals the group's mask.
+        return (__ballot_sync(_data.coalesced.mask, predicate) == _data.coalesced.mask);  // Every member bit must be set for equality.
+    }
+    _CG_QUALIFIER unsigned int ballot(int predicate) const {  // Collect the members' predicate votes into a bitmask.
+        if (size() == 32) {  // Full warp: the raw ballot is returned as is.
+            return (__ballot_sync(0xFFFFFFFF, predicate));
+        }
+        unsigned int lane_ballot = __ballot_sync(_data.coalesced.mask, predicate);  // Raw result, indexed by hardware lane.
+        return (_packLanes(lane_ballot));  // presumably compacts lane bits into rank order; _packLanes is defined outside this view
+    }
+
+#ifdef _CG_HAS_MATCH_COLLECTIVE
+
+    template <typename TyIntegral>  // Only compiled when _CG_HAS_MATCH_COLLECTIVE is defined.
+    _CG_QUALIFIER unsigned int match_any(TyIntegral val) const {  // Wraps __match_any_sync for this group.
+        details::assert_if_not_arithmetic<TyIntegral>();  // Type check; helper is defined outside this view.
+        if (size() == 32) {  // Full warp: the raw intrinsic result is returned as is.
+            return (__match_any_sync(0xFFFFFFFF, val));
+        }
+        unsigned int lane_match = __match_any_sync(_data.coalesced.mask, val);  // Raw result, indexed by hardware lane.
+        return (_packLanes(lane_match));  // presumably compacts lane bits into rank order; _packLanes is defined outside this view
+    }
+
+    template <typename TyIntegral>  // Only compiled when _CG_HAS_MATCH_COLLECTIVE is defined.
+    _CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {  // Wraps __match_all_sync; pred is written by the intrinsic.
+        details::assert_if_not_arithmetic<TyIntegral>();  // Type check; helper is defined outside this view.
+        if (size() == 32) {  // Full warp: the raw intrinsic result is returned as is.
+            return (__match_all_sync(0xFFFFFFFF, val, &pred));
+        }
+        unsigned int lane_match = __match_all_sync(_data.coalesced.mask, val, &pred);  // Raw result, indexed by hardware lane.
+        return (_packLanes(lane_match));  // presumably compacts lane bits into rank order; _packLanes is defined outside this view
+    }
+
+#endif /* !_CG_HAS_MATCH_COLLECTIVE */
+
+};
+
+_CG_QUALIFIER coalesced_group coalesced_threads()  // Factory: builds a coalesced_group from the current active-lane mask.
+{
+    return (coalesced_group(__activemask()));  // The mask is sampled at the call site via __activemask().
+}
+
+namespace details {  // Compile-time helpers that validate and classify tile sizes.
+    template <unsigned int Size> struct verify_thread_block_tile_size;  // Primary template is left undefined, so unsupported sizes have no OK member.
+    template <> struct verify_thread_block_tile_size<32> { typedef void OK; };
+    template <> struct verify_thread_block_tile_size<16> { typedef void OK; };
+    template <> struct verify_thread_block_tile_size<8>  { typedef void OK; };
+    template <> struct verify_thread_block_tile_size<4>  { typedef void OK; };
+    template <> struct verify_thread_block_tile_size<2>  { typedef void OK; };
+    template <> struct verify_thread_block_tile_size<1>  { typedef void OK; };
+
+#ifdef _CG_CPP11_FEATURES
+    template <unsigned int Size>
+    using _is_power_of_2 = _CG_STL_NAMESPACE::integral_constant<bool, (Size & (Size - 1)) == 0>;  // Single-bit test; note it also holds for Size == 0.
+
+    template <unsigned int Size>
+    using _is_single_warp = _CG_STL_NAMESPACE::integral_constant<bool, Size <= 32>;  // Tile fits inside one 32-lane warp.
+    template <unsigned int Size>
+    using _is_multi_warp =  // Tile spans more than one warp, up to 1024 threads.
+    _CG_STL_NAMESPACE::integral_constant<bool, (Size > 32) && (Size <= 1024)>;
+
+    template <unsigned int Size>
+    using _is_valid_single_warp_tile =  // Power of two and at most 32.
+        _CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_single_warp<Size>::value>;
+    template <unsigned int Size>
+    using _is_valid_multi_warp_tile =  // Power of two in the range (32, 1024].
+        _CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_multi_warp<Size>::value>;
+#else
+    template <unsigned int Size>
+    struct _is_multi_warp {  // Without C++11 features no size is classified as multi-warp.
+        static const bool value = false;
+    };
+#endif
+}
+
+template <unsigned int Size>
+class __static_size_tile_base  // Shared base for statically sized tiles: exposes rank and size derived from Size.
+{
+protected:
+    _CG_STATIC_CONST_DECL unsigned int numThreads = Size;  // Tile size as a compile-time constant.
+
+public:
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
+
+    // Rank of thread within tile
+    _CG_STATIC_QUALIFIER unsigned int thread_rank() {
+        return (details::cta::thread_rank() & (numThreads - 1));  // Bitmask of the block rank; equals rank % numThreads when numThreads is a power of two.
+    }
+
+    // Number of threads within tile
+    _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int num_threads() {
+        return numThreads;
+    }
+
+    _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int size() {  // Same value as num_threads().
+        return num_threads();
+    }
+};
+
+template <unsigned int Size>
+class __static_size_thread_block_tile_base : public __static_size_tile_base<Size>  // Single-warp tile base: warp collectives over a mask computed from the lane id.
+{
+    friend class details::_coalesced_group_data_access;
+    typedef details::tile::tile_helpers<Size> th;  // Supplies tileMask and laneMask; defined outside this view.
+
+#ifdef _CG_CPP11_FEATURES
+    static_assert(details::_is_valid_single_warp_tile<Size>::value, "Size must be one of 1/2/4/8/16/32");
+#else
+    typedef typename details::verify_thread_block_tile_size<Size>::OK valid;  // Fails to compile for sizes that have no OK typedef.
+#endif
+    using __static_size_tile_base<Size>::numThreads;
+    _CG_STATIC_CONST_DECL unsigned int fullMask = 0xFFFFFFFF;  // All 32 lanes of a warp.
+
+ protected:
+    _CG_STATIC_QUALIFIER unsigned int build_mask() {  // Compute the lane mask of the tile that contains the calling thread.
+        unsigned int mask = fullMask;
+        if (numThreads != 32) {  // A 32-wide tile is the whole warp, so fullMask is already right.
+            // [0,31] representing the current active thread in the warp
+            unsigned int laneId = details::laneid();
+            // shift mask according to the partition it belongs to
+            mask = th::tileMask << (laneId & ~(th::laneMask));
+        }
+        return (mask);
+    }
+
+public:
+    _CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;  // These tiles report the coalesced-group type id.
+
+    _CG_STATIC_QUALIFIER void sync() {  // Barrier across the lanes of this tile only.
+        __syncwarp(build_mask());
+    }
+
+#ifdef _CG_CPP11_FEATURES
+    // PTX supported collectives
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {  // Read elem from tile rank srcRank; numThreads is passed as the shuffle width.
+        return details::tile::shuffle_dispatch<TyElem>::shfl(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), srcRank, numThreads);
+    }
+
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {  // Forwards to shuffle_dispatch::shfl_down with the tile mask and width.
+        return details::tile::shuffle_dispatch<TyElem>::shfl_down(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
+    }
+
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int delta) const {  // Forwards to shuffle_dispatch::shfl_up with the tile mask and width.
+        return details::tile::shuffle_dispatch<TyElem>::shfl_up(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
+    }
+
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int laneMask) const {  // Forwards to shuffle_dispatch::shfl_xor with the tile mask and width.
+        return details::tile::shuffle_dispatch<TyElem>::shfl_xor(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), laneMask, numThreads);
+    }
+#else
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl(TyIntegral var, int srcRank) const {  // Pre-C++11 variant: calls __shfl_sync directly.
+        details::assert_if_not_arithmetic<TyIntegral>();
+        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, unsigned int delta) const {  // Pre-C++11 variant: calls __shfl_down_sync directly.
+        details::assert_if_not_arithmetic<TyIntegral>();
+        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, unsigned int delta) const {  // Pre-C++11 variant: calls __shfl_up_sync directly.
+        details::assert_if_not_arithmetic<TyIntegral>();
+        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl_xor(TyIntegral var, unsigned int laneMask) const {  // Pre-C++11 variant: calls __shfl_xor_sync directly.
+        details::assert_if_not_arithmetic<TyIntegral>();
+        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
+    }
+#endif //_CG_CPP11_FEATURES
+
+    _CG_QUALIFIER int any(int predicate) const {  // Nonzero when the tile's ballot has at least one bit set.
+        unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
+        return (lane_ballot != 0);
+    }
+    _CG_QUALIFIER int all(int predicate) const {  // Nonzero when the tile's ballot equals the tile mask.
+        unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
+        return (lane_ballot == build_mask());
+    }
+    _CG_QUALIFIER unsigned int ballot(int predicate) const {  // Tile ballot, shifted right by the same amount build_mask() shifts left.
+        unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
+        return (lane_ballot >> (details::laneid() & (~(th::laneMask))));  // presumably moves the tile's first lane to bit 0; depends on th::laneMask
+    }
+
+#ifdef _CG_HAS_MATCH_COLLECTIVE
+    template <typename TyIntegral>
+    _CG_QUALIFIER unsigned int match_any(TyIntegral val) const {  // __match_any_sync over the tile, shifted like ballot().
+        details::assert_if_not_arithmetic<TyIntegral>();
+        unsigned int lane_match = __match_any_sync(build_mask(), val);
+        return (lane_match >> (details::laneid() & (~(th::laneMask))));
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {  // __match_all_sync over the tile; pred is written by the intrinsic.
+        details::assert_if_not_arithmetic<TyIntegral>();
+        unsigned int lane_match = __match_all_sync(build_mask(), val, &pred);
+        return (lane_match >> (details::laneid() & (~(th::laneMask))));
+    }
+#endif
+
+};
+
+template <unsigned int Size, typename ParentT>
+class __static_parent_thread_block_tile_base  // Derives meta-group rank and size statically from the parent type.
+{
+public:
+    // Rank of this group in the upper level of the hierarchy
+    _CG_STATIC_QUALIFIER unsigned int meta_group_rank() {
+        return ParentT::thread_rank() / Size;  // Index of the Size-wide slice of the parent that holds this thread.
+    }
+
+    // Total num partitions created out of all CTAs when the group was created
+    _CG_STATIC_QUALIFIER unsigned int meta_group_size() {
+        return (ParentT::size() + Size - 1) / Size;  // Parent size divided by Size, rounded up.
+    }
+};
+
+/**
+ * class thread_block_tile<unsigned int Size, ParentT = void>
+ *
+ * Statically-sized group type, representing one tile of a thread block.
+ * The only specializations currently supported are those with native
+ * hardware support (1/2/4/8/16/32)
+ *
+ * This group exposes warp-synchronous builtins.
+ * Can only be constructed via tiled_partition<Size>(ParentT&)
+ */
+
+template <unsigned int Size, typename ParentT = void>
+class __single_warp_thread_block_tile :  // Tile with a statically known parent; the class declares no data members of its own.
+    public __static_size_thread_block_tile_base<Size>,
+    public __static_parent_thread_block_tile_base<Size, ParentT>
+{
+    typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
+    friend class details::_coalesced_group_data_access;
+
+protected:
+    _CG_QUALIFIER __single_warp_thread_block_tile() { };
+    _CG_QUALIFIER __single_warp_thread_block_tile(unsigned int, unsigned int) { };  // Both arguments are ignored; rank and size come from ParentT.
+
+    _CG_STATIC_QUALIFIER unsigned int get_mask() {  // Recomputes the tile mask on every call.
+        return __static_size_thread_block_tile_base<Size>::build_mask();
+    }
+};
+
+template <unsigned int Size>
+class __single_warp_thread_block_tile<Size, void> :  // Parent-erased tile: mask and meta-group info are stored in _data.coalesced.
+    public __static_size_thread_block_tile_base<Size>,
+    public thread_group_base<details::coalesced_group_id>
+{
+    _CG_STATIC_CONST_DECL unsigned int numThreads = Size;
+
+    template <unsigned int, typename ParentT> friend class __single_warp_thread_block_tile;
+    friend class details::_coalesced_group_data_access;
+
+    typedef __static_size_thread_block_tile_base<numThreads> staticSizeBaseT;
+
+protected:
+    _CG_QUALIFIER __single_warp_thread_block_tile(unsigned int meta_group_rank = 0, unsigned int meta_group_size = 1) {  // Defaults describe a group that is its own only partition.
+        _data.coalesced.mask = staticSizeBaseT::build_mask();  // Mask is captured once, at construction.
+        _data.coalesced.size = numThreads;
+        _data.coalesced.metaGroupRank = meta_group_rank;
+        _data.coalesced.metaGroupSize = meta_group_size;
+        _data.coalesced.is_tiled = true;
+    }
+
+    _CG_QUALIFIER unsigned int get_mask() const {  // Returns the mask stored at construction.
+        return (_data.coalesced.mask);
+    }
+
+public:
+    using staticSizeBaseT::sync;  // Name these members from the static-size base.
+    using staticSizeBaseT::size;
+    using staticSizeBaseT::num_threads;
+    using staticSizeBaseT::thread_rank;
+
+    _CG_QUALIFIER unsigned int meta_group_rank() const {  // Value passed to the constructor.
+        return _data.coalesced.metaGroupRank;
+    }
+
+    _CG_QUALIFIER unsigned int meta_group_size() const {  // Value passed to the constructor.
+        return _data.coalesced.metaGroupSize;
+    }
+};
+
+/**
+ * Outer level API calls
+ * void sync(GroupT) - see <group_type>.sync()
+ * thread_rank(GroupT) - returns <group_type>.thread_rank()
+ * group_size(GroupT) - returns <group_type>.num_threads()
+ */
+template <class GroupT>
+_CG_QUALIFIER void sync(GroupT const &g)  // Free-function form of g.sync().
+{
+    g.sync();
+}
+
+// TODO: Use a static dispatch to determine appropriate return type
+// C++03 is stuck with unsigned long long for now
+#ifdef _CG_CPP11_FEATURES
+template <class GroupT>
+_CG_QUALIFIER auto thread_rank(GroupT const& g) -> decltype(g.thread_rank()) {  // Return type follows the group's own thread_rank().
+    return g.thread_rank();
+}
+
+
+template <class GroupT>
+_CG_QUALIFIER auto group_size(GroupT const &g) -> decltype(g.num_threads()) {  // Return type follows the group's own num_threads().
+    return g.num_threads();
+}
+#else
+template <class GroupT>
+_CG_QUALIFIER unsigned long long thread_rank(GroupT const& g) {  // C++03 fallback: always widens to unsigned long long.
+    return static_cast<unsigned long long>(g.thread_rank());
+}
+
+
+template <class GroupT>
+_CG_QUALIFIER unsigned long long group_size(GroupT const &g) {  // C++03 fallback: always widens to unsigned long long.
+    return static_cast<unsigned long long>(g.num_threads());
+}
+#endif
+
+
+/**
+ * tiled_partition
+ *
+ * The tiled_partition(parent, tilesz) method is a collective operation that
+ * partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
+ *
+ * A total of ((size(parent)+tilesz-1)/tilesz) subgroups will
+ * be created where threads having identical k = (thread_rank(parent)/tilesz)
+ * will be members of the same subgroup.
+ *
+ * The implementation may cause the calling thread to wait until all the members
+ * of the parent group have invoked the operation before resuming execution.
+ *
+ * Functionality is limited to power-of-two sized subgroup instances of at most
+ * 32 threads. Only thread_block, thread_block_tile<>, and their subgroups can be
+ * tiled_partition() in _CG_VERSION 1000.
+ */
+_CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz)  // Runtime-sized partition of a type-erased group.
+{
+    if (parent.get_type() == details::coalesced_group_id) {  // Dispatch on the runtime type tag.
+        const coalesced_group *_cg = static_cast<const coalesced_group*>(&parent);
+        return _cg->_get_tiled_threads(tilesz);
+    }
+    else {  // NOTE(review): every other type tag is cast to thread_block without a check.
+        const thread_block *_tb = static_cast<const thread_block*>(&parent);
+        return _tb->_get_tiled_threads(tilesz);
+    }
+}
+
+// Thread block type overload: returns a basic thread_group for now (may be specialized later)
+_CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz)  // Runtime-sized partition of a thread block.
+{
+    return (parent._get_tiled_threads(tilesz));  // Delegates to the block's own helper, defined outside this view.
+}
+
+// Coalesced group type overload: retains its ability to stay coalesced
+_CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz)  // Runtime-sized partition that returns a coalesced_group.
+{
+    return (parent._get_tiled_threads(tilesz));  // Delegates to the group's own helper, defined outside this view.
+}
+
+namespace details {  // Internal helpers used by the multi-warp tile implementation below.
+    template <unsigned int Size, typename ParentT>
+    class internal_thread_block_tile : public __single_warp_thread_block_tile<Size, ParentT> {};  // Adds nothing to the single-warp tile; gives internal code a type it can construct.
+
+    template <unsigned int Size, typename ParentT>
+    _CG_QUALIFIER internal_thread_block_tile<Size, ParentT> tiled_partition_internal() {  // Builds an internal tile without needing a parent object.
+        return internal_thread_block_tile<Size, ParentT>();
+    }
+
+    template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
+    _CG_QUALIFIER TyVal multi_warp_collectives_helper(  // Forwards to group.collectives_scheme<TyVal>(...); declared a friend of the multi-warp tile base.
+            const GroupT& group,
+            WarpLambda warp_lambda,
+            InterWarpLambda inter_warp_lambda) {
+                return group.template collectives_scheme<TyVal>(warp_lambda, inter_warp_lambda);
+            }
+
+    template <typename T, typename GroupT>
+    _CG_QUALIFIER T* multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id) {  // Forwards to group.get_scratch_location<T>(warp_id).
+        return group.template get_scratch_location<T>(warp_id);
+    }
+
+    template <typename GroupT>
+    _CG_QUALIFIER details::barrier_t* multi_warp_sync_location_getter(const GroupT& group) {  // Forwards to group.get_sync_location().
+        return group.get_sync_location();
+    }
+
+}
+/**
+ * tiled_partition<tilesz>
+ *
+ * The tiled_partition<tilesz>(parent) method is a collective operation that
+ * partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
+ *
+ * A total of (size(parent)/tilesz) subgroups will be created,
+ * therefore the parent group size must be evenly divisible by the tilesz.
+ * The allowed parent groups are thread_block or thread_block_tile<size>.
+ *
+ * The implementation may cause the calling thread to wait until all the members
+ * of the parent group have invoked the operation before resuming execution.
+ *
+ * Functionality is limited to native hardware sizes, 1/2/4/8/16/32.
+ * The size(parent) must be greater than the template Size parameter
+ * otherwise the results are undefined.
+ */
+
+#if defined(_CG_CPP11_FEATURES)
+template <unsigned int Size>
+class __static_size_multi_warp_tile_base : public __static_size_tile_base<Size>  // Base for tiles wider than one warp; warps exchange data through tile_memory.
+{
+    static_assert(details::_is_valid_multi_warp_tile<Size>::value, "Size must be one of 64/128/256/512");
+
+    template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
+    friend __device__ TyVal details::multi_warp_collectives_helper(
+            const GroupT& group,
+            WarpLambda warp_lambda,
+            InterWarpLambda inter_warp_lambda);
+    template <typename T, typename GroupT>
+    friend __device__ T* details::multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id);
+    template <typename GroupT>
+    friend __device__ details::barrier_t* details::multi_warp_sync_location_getter(const GroupT& group);
+    template <unsigned int OtherSize>
+    friend class __static_size_multi_warp_tile_base;  // Other sizes need access to tile_memory in the converting constructor.
+    using WarpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;  // One warp of this tile.
+    using ThisType = __static_size_multi_warp_tile_base<Size>;
+    _CG_STATIC_CONST_DECL int numWarps = Size / 32;  // Number of 32-thread warps in the tile.
+
+protected:
+    details::multi_warp_scratch* const tile_memory;  // Scratch area holding the barriers and per-warp communication slots used below.
+
+    template <typename GroupT>
+    _CG_QUALIFIER __static_size_multi_warp_tile_base(const GroupT& g) : tile_memory(g.tile_memory) {  // Uses the same scratch pointer as the parent group g.
+#if !defined(_CG_USER_PROVIDED_SHARED_MEMORY)
+        NV_IF_TARGET(NV_PROVIDES_SM_80,
+            details::sync_warps_reset(get_sync_location(), details::cta::thread_rank());
+            g.sync();
+        )
+#endif
+    }
+
+
+private:
+    _CG_QUALIFIER details::barrier_t* get_sync_location() const {  // Barrier slot used by tiles of this Size.
+        // Different group sizes use different barriers, all groups of a given size share one barrier.
+        unsigned int sync_id = details::log2(Size / 64);  // Size 64 selects slot 0; each doubling selects the next slot.
+        return &tile_memory->barriers[sync_id];
+    }
+
+    template <typename T>
+    _CG_QUALIFIER T* get_scratch_location(unsigned int warp_id) const {  // Slot of warp number warp_id within this tile.
+        unsigned int scratch_id = (details::cta::thread_rank() - thread_rank()) / 32 + warp_id;  // Block-wide index of the tile's first warp, plus warp_id.
+        return reinterpret_cast<T*>(&tile_memory->communication_memory[scratch_id]);
+    }
+
+    template <typename T>
+    _CG_QUALIFIER T* get_scratch_location() const {  // Slot of the calling thread's own warp.
+        unsigned int scratch_id = details::cta::thread_rank() / 32;  // Block-wide warp index of the caller.
+        return reinterpret_cast<T*>(&tile_memory->communication_memory[scratch_id]);
+    }
+
+    template <typename TyVal>
+    _CG_QUALIFIER TyVal shfl_impl(TyVal val, unsigned int src) const {  // Broadcast val from tile rank src through the source warp's scratch slot.
+        unsigned int src_warp = src / 32;  // Warp, within the tile, that holds the source thread.
+        auto warp = details::tiled_partition_internal<32, ThisType>();
+        details::barrier_t* sync_location = get_sync_location();
+
+        // Get warp slot of the source threads warp.
+        TyVal* warp_scratch_location = get_scratch_location<TyVal>(src_warp);
+
+        if (warp.meta_group_rank() == src_warp) {  // Caller belongs to the source warp.
+            warp.sync();
+            // Put shuffled value into my warp slot and let my warp arrive at the barrier.
+            if (thread_rank() == src) {
+                *warp_scratch_location = val;
+            }
+            details::sync_warps_arrive(sync_location, details::cta::thread_rank(), numWarps);
+            TyVal result = *warp_scratch_location;  // Read back the slot before waiting on the barrier.
+            details::sync_warps_wait(sync_location, details::cta::thread_rank());
+            return result;
+        }
+        else {
+            // Wait for the source warp to arrive on the barrier.
+            details::sync_warps_wait_for_specific_warp(sync_location,
+                    (details::cta::thread_rank() / 32 - warp.meta_group_rank() + src_warp));  // Block-wide index of the source warp.
+            TyVal result = *warp_scratch_location;
+            details::sync_warps(sync_location, details::cta::thread_rank(), numWarps);
+            return result;
+        }
+    }
+
+    template <typename TyVal, typename WarpLambda, typename InterWarpLambda>
+    _CG_QUALIFIER TyVal collectives_scheme(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const {  // Two-stage collective: per-warp step, then a combine step across warp slots.
+        static_assert(sizeof(TyVal) <= details::multi_warp_scratch::communication_size,
+                      "Collectives with tiles larger than 32 threads are limited to types smaller then 8 bytes");
+        auto warp = details::tiled_partition_internal<32, ThisType>();
+        details::barrier_t* sync_location = get_sync_location();
+        TyVal* warp_scratch_location = get_scratch_location<TyVal>();  // The calling warp's own slot.
+
+        warp_lambda(warp, warp_scratch_location);  // Stage 1: the lambda receives the warp and its slot.
+
+        if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), numWarps)) {  // presumably true only in the last warp to arrive; helper is outside this view
+            auto subwarp = details::tiled_partition_internal<numWarps, decltype(warp)>();  // Split this warp into numWarps-wide tiles.
+            if (subwarp.meta_group_rank() == 0) {  // Only the first such tile runs the combine step.
+                TyVal* thread_scratch_location = get_scratch_location<TyVal>(subwarp.thread_rank());  // Each thread takes the slot of the warp matching its rank.
+                inter_warp_lambda(subwarp, thread_scratch_location);
+            }
+            warp.sync();
+            details::sync_warps_release(sync_location, warp.thread_rank() == 0, details::cta::thread_rank(), numWarps);
+        }
+        TyVal result = *warp_scratch_location;  // Each thread returns the value in its own warp's slot.
+        return result;
+    }
+
+public:
+    _CG_STATIC_CONST_DECL unsigned int _group_id = details::multi_tile_group_id;
+
+    using __static_size_tile_base<Size>::thread_rank;
+
+    template <typename TyVal>
+    _CG_QUALIFIER TyVal shfl(TyVal val, unsigned int src) const {  // Public shuffle: checks the value size, then defers to shfl_impl.
+        static_assert(sizeof(TyVal) <= details::multi_warp_scratch::communication_size,
+                      "Collectives with tiles larger than 32 threads are limited to types smaller then 8 bytes");
+        return shfl_impl(val, src);
+    }
+
+    _CG_QUALIFIER void sync() const {  // Barrier across all numWarps warps of the tile.
+        details::sync_warps(get_sync_location(), details::cta::thread_rank(), numWarps);
+    }
+
+    _CG_QUALIFIER int any(int predicate) const {  // Tile-wide any: __any_sync per warp, then __any_sync over the warp slots.
+        auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
+                *warp_scratch_location = __any_sync(0xFFFFFFFF, predicate);  // Per-warp result written to the warp's slot.
+        };
+        auto inter_warp_lambda =
+            [] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
+                *thread_scratch_location = __any_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);  // Mask covers the low numWarps lanes.
+        };
+        return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
+    }
+
+    _CG_QUALIFIER int all(int predicate) const {  // Tile-wide all: __all_sync per warp, then __all_sync over the warp slots.
+        auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
+                *warp_scratch_location = __all_sync(0xFFFFFFFF, predicate);  // Per-warp result written to the warp's slot.
+        };
+        auto inter_warp_lambda =
+            [] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
+                *thread_scratch_location = __all_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);  // Mask covers the low numWarps lanes.
+        };
+        return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
+    }
+};
+
+
+template <unsigned int Size, typename ParentT = void>
+class __multi_warp_thread_block_tile :  // Multi-warp tile with a statically known parent type.
+    public __static_size_multi_warp_tile_base<Size>,
+    public __static_parent_thread_block_tile_base<Size, ParentT>  // Supplies static meta_group_rank() and meta_group_size().
+{
+    typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
+    typedef __static_size_multi_warp_tile_base<Size> staticTileBaseT;
+protected:
+    _CG_QUALIFIER __multi_warp_thread_block_tile(const ParentT& g) :  // Passes the parent to the tile base, which copies its tile_memory pointer.
+        __static_size_multi_warp_tile_base<Size>(g) {}
+};
+
+template <unsigned int Size>
+class __multi_warp_thread_block_tile<Size, void> : public __static_size_multi_warp_tile_base<Size>  // Parent-erased form: meta-group info is stored in members.
+{
+    const unsigned int metaGroupRank;  // Copied from the source tile at construction.
+    const unsigned int metaGroupSize;  // Copied from the source tile at construction.
+
+protected:
+    template <unsigned int OtherSize, typename ParentT>
+    _CG_QUALIFIER __multi_warp_thread_block_tile(const __multi_warp_thread_block_tile<OtherSize, ParentT>& g) :  // Converting constructor from a tile with any parent type.
+        __static_size_multi_warp_tile_base<Size>(g), metaGroupRank(g.meta_group_rank()), metaGroupSize(g.meta_group_size()) {}
+
+public:
+    _CG_QUALIFIER unsigned int meta_group_rank() const {  // Returns the stored rank.
+        return metaGroupRank;
+    }
+
+    _CG_QUALIFIER unsigned int meta_group_size() const {  // Returns the stored size.
+        return metaGroupSize;
+    }
+};
+#endif
+
+template <unsigned int Size, typename ParentT = void>
+class thread_block_tile;
+
+namespace details {  // Chooses the single-warp or multi-warp implementation for thread_block_tile.
+    template <unsigned int Size, typename ParentT, bool IsMultiWarp>
+    class thread_block_tile_impl;  // Primary template; only the two specializations on IsMultiWarp are defined.
+
+    template <unsigned int Size, typename ParentT>
+    class thread_block_tile_impl<Size, ParentT, false>: public __single_warp_thread_block_tile<Size, ParentT>  // Single-warp case.
+    {
+    protected:
+        template <unsigned int OtherSize, typename OtherParentT, bool OtherIsMultiWarp>
+        _CG_QUALIFIER thread_block_tile_impl(const thread_block_tile_impl<OtherSize, OtherParentT, OtherIsMultiWarp>& g) :  // From another tile: pass on its meta-group rank and size.
+            __single_warp_thread_block_tile<Size, ParentT>(g.meta_group_rank(), g.meta_group_size()) {}
+
+        _CG_QUALIFIER thread_block_tile_impl(const thread_block& g) :  // From a thread block: g itself is not read.
+            __single_warp_thread_block_tile<Size, ParentT>() {}
+    };
+
+#if defined(_CG_CPP11_FEATURES)
+    template <unsigned int Size, typename ParentT>
+    class thread_block_tile_impl<Size, ParentT, true> : public __multi_warp_thread_block_tile<Size, ParentT>  // Multi-warp case.
+    {
+        protected:
+        template <typename GroupT>
+        _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) :  // Hands the parent group to the multi-warp tile.
+            __multi_warp_thread_block_tile<Size, ParentT>(g) {}
+    };
+#else
+    template <unsigned int Size, typename ParentT>
+    class thread_block_tile_impl<Size, ParentT, true>  // Empty stand-in for builds without C++11 features.
+    {
+        protected:
+        template <typename GroupT>
+        _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) {}
+    };
+#endif
+}
+
+template <unsigned int Size, typename ParentT>
+class thread_block_tile : public details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>  // Public tile type; the bool argument selects the implementation.
+{
+    friend _CG_QUALIFIER thread_block_tile<1, void> this_thread();  // this_thread() uses the protected constructor.
+
+protected:
+    _CG_QUALIFIER thread_block_tile(const ParentT& g) :  // Built from the parent group.
+        details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>(g) {}
+
+public:
+    _CG_QUALIFIER operator thread_block_tile<Size, void>() const {  // Conversion to the parent-erased tile of the same size.
+        return thread_block_tile<Size, void>(*this);
+    }
+};
+
+template <unsigned int Size>
+class thread_block_tile<Size, void> : public details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>  // Parent-erased tile.
+{
+    template <unsigned int, typename ParentT>
+    friend class thread_block_tile;  // Lets parent-typed tiles reach the protected constructor.
+
+protected:
+    template <unsigned int OtherSize, typename OtherParentT>
+    _CG_QUALIFIER thread_block_tile(const thread_block_tile<OtherSize, OtherParentT>& g) :  // From a tile of any size; not public.
+        details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
+
+public:
+    template <typename ParentT>
+    _CG_QUALIFIER thread_block_tile(const thread_block_tile<Size, ParentT>& g) :  // Public conversion from a same-size tile with any parent.
+        details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
+};
+
+namespace details {  // Per-parent-type implementations behind tiled_partition<Size>().
+    template <unsigned int Size, typename ParentT>
+    struct tiled_partition_impl;  // Primary template is undefined; only the parent types specialized below are accepted.
+
+    template <unsigned int Size>
+    struct tiled_partition_impl<Size, thread_block> : public thread_block_tile<Size, thread_block> {  // Parent is a thread block.
+        _CG_QUALIFIER tiled_partition_impl(const thread_block& g) :
+            thread_block_tile<Size, thread_block>(g) {}
+    };
+
+    // ParentT = static thread_block_tile<ParentSize, GrandParent> specialization
+    template <unsigned int Size, unsigned int ParentSize, typename GrandParent>
+    struct tiled_partition_impl<Size, thread_block_tile<ParentSize, GrandParent> > :
+        public thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> > {
+#ifdef _CG_CPP11_FEATURES
+        static_assert(Size < ParentSize, "Tile size bigger or equal to the parent group size");  // A sub-tile must be strictly smaller than its parent.
+#endif
+        _CG_QUALIFIER tiled_partition_impl(const thread_block_tile<ParentSize, GrandParent>& g) :
+            thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> >(g) {}
+    };
+
+}
+
+template <unsigned int Size, typename ParentT>
+_CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g)  // Statically sized partition of g into tiles of Size threads.
+{
+    return details::tiled_partition_impl<Size, ParentT>(g);  // The impl derives from thread_block_tile, so it converts to the return type.
+}
+
+/**
+ * thread_group this_thread()
+ *
+ * Constructs a generic thread_group containing only the calling thread
+ */
+_CG_QUALIFIER thread_block_tile<1, void> this_thread()  // Returns a one-thread tile for the calling thread.
+{
+    // Make thread_block_tile<1, thread_block> parent of the returned group, so it will have its
+    // meta group rank and size set to 0 and 1 respectively.
+    return thread_block_tile<1, thread_block_tile<1, thread_block> >(this_thread_block());  // Converted to thread_block_tile<1, void> on return.
+}
+
+/**
+ * <group_type>.sync()
+ *
+ * Executes a barrier across the group
+ *
+ * Implements both a compiler fence and an architectural fence to prevent
+ * memory reordering around the barrier.
+ */
+_CG_QUALIFIER void thread_group::sync() const  // Type-erased sync: dispatches on the stored type tag.
+{
+    switch (_data.group.type) {  // Each case downcasts this to the concrete group type and calls the free sync().
+    case details::coalesced_group_id:
+        cooperative_groups::sync(*static_cast<const coalesced_group*>(this));
+        break;
+    case details::thread_block_id:
+        cooperative_groups::sync(*static_cast<const thread_block*>(this));
+        break;
+    case details::grid_group_id:
+        cooperative_groups::sync(*static_cast<const grid_group*>(this));
+        break;
+#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+    case details::multi_grid_group_id:
+        cooperative_groups::sync(*static_cast<const multi_grid_group*>(this));
+        break;
+#endif
+#if defined(_CG_HAS_CLUSTER_GROUP)
+    case details::cluster_group_id:
+        cooperative_groups::sync(*static_cast<const cluster_group*>(this));
+        break;
+#endif
+    default:  // Unrecognized type tag: nothing is synchronized.
+        break;
+    }
+}
+
+/**
+ * <group_type>.size()
+ *
+ * Returns the total number of threads in the group.
+ */
+_CG_QUALIFIER unsigned long long thread_group::size() const  // Type-erased size: dispatches on the stored type tag.
+{
+    unsigned long long size = 0;  // Result stays 0 when the type tag is not recognized.
+    switch (_data.group.type) {  // Each case downcasts this to the concrete group type and calls group_size().
+    case details::coalesced_group_id:
+        size = cooperative_groups::group_size(*static_cast<const coalesced_group*>(this));
+        break;
+    case details::thread_block_id:
+        size = cooperative_groups::group_size(*static_cast<const thread_block*>(this));
+        break;
+    case details::grid_group_id:
+        size = cooperative_groups::group_size(*static_cast<const grid_group*>(this));
+        break;
+#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+    case details::multi_grid_group_id:
+        size = cooperative_groups::group_size(*static_cast<const multi_grid_group*>(this));
+        break;
+#endif
+#if defined(_CG_HAS_CLUSTER_GROUP)
+    case details::cluster_group_id:
+        size = cooperative_groups::group_size(*static_cast<const cluster_group*>(this));
+        break;
+#endif
+    default:
+        break;
+    }
+    return size;
+}
+
+/**
+ * <group_type>.thread_rank()
+ *
+ * Returns the linearized rank of the calling thread along the interval [0, size()).
+ */
+_CG_QUALIFIER unsigned long long thread_group::thread_rank() const  // Type-erased rank: dispatches on the stored type tag.
+{
+    unsigned long long rank = 0;  // Result stays 0 when the type tag is not recognized.
+    switch (_data.group.type) {  // Each case downcasts this to the concrete group type and calls the free thread_rank().
+    case details::coalesced_group_id:
+        rank = cooperative_groups::thread_rank(*static_cast<const coalesced_group*>(this));
+        break;
+    case details::thread_block_id:
+        rank = cooperative_groups::thread_rank(*static_cast<const thread_block*>(this));
+        break;
+    case details::grid_group_id:
+        rank = cooperative_groups::thread_rank(*static_cast<const grid_group*>(this));
+        break;
+#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+    case details::multi_grid_group_id:
+        rank = cooperative_groups::thread_rank(*static_cast<const multi_grid_group*>(this));
+        break;
+#endif
+#if defined(_CG_HAS_CLUSTER_GROUP)
+    case details::cluster_group_id:
+        rank = cooperative_groups::thread_rank(*static_cast<const cluster_group*>(this));
+        break;
+#endif
+    default:
+        break;
+    }
+    return rank;
+}
+
+_CG_END_NAMESPACE
+
+#include <cooperative_groups/details/partitioning.h>
+#if (!defined(_MSC_VER) || defined(_WIN64))
+# include <cooperative_groups/details/invoke.h>
+#endif
+
+# endif /* ! (__cplusplus, __CUDACC__) */
+
+#endif /* !_COOPERATIVE_GROUPS_H_ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/async.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/async.h
new file mode 100644
index 0000000000000000000000000000000000000000..1b7dcb2433f2cb7d1ef61290995ac871a901b1e8
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/async.h
@@ -0,0 +1,452 @@
+/* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * The source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * The Licensed Deliverables contained herein are PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CG_ASYNC_H
+#define _CG_ASYNC_H
+
+#include "helpers.h"
+#include "info.h"
+
+#include <cuda_pipeline.h>
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+// Groups supported by memcpy_async: compile-time trait that is true only for thread_block_tile, coalesced_group and thread_block
+template <class TyGroup>
+struct _async_copy_group_supported : public _CG_STL_NAMESPACE::false_type {}; // primary template: every other group type is rejected
+
+template <unsigned int Sz, typename TyPar>
+struct _async_copy_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>>
+    : public _CG_STL_NAMESPACE::true_type {}; // any tile size / parent combination
+template <>
+struct _async_copy_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
+template <>
+struct _async_copy_group_supported<cooperative_groups::thread_block> : public _CG_STL_NAMESPACE::true_type {};
+
+template <class TyGroup>
+using async_copy_group_supported = _async_copy_group_supported<details::remove_qual<TyGroup>>; // remove_qual presumably strips cv/ref qualifiers so qualified types hit the same specializations - confirm in helpers.h
+
+// Groups that require optimization: trait used to pick the tile-specific accelerated_async_copy overload
+template <class TyGroup>
+struct _async_copy_optimize_tile : public _CG_STL_NAMESPACE::false_type {}; // primary template: non-tile groups use the generic overload
+
+template <typename TyPar>
+struct _async_copy_optimize_tile<cooperative_groups::thread_block_tile<1, TyPar>>
+    : public _CG_STL_NAMESPACE::false_type {}; // a tile of size 1 is explicitly excluded
+
+template <unsigned int Sz, typename TyPar>
+struct _async_copy_optimize_tile<cooperative_groups::thread_block_tile<Sz, TyPar>>
+    : public _CG_STL_NAMESPACE::true_type {}; // every other thread_block_tile size is optimized
+
+template <class TyGroup>
+using async_copy_optimize_tile = _async_copy_optimize_tile<details::remove_qual<TyGroup>>; // trait evaluated on the remove_qual'd group type
+
+// SFINAE helpers for tile optimizations: each alias is `void *` when its condition holds and ill-formed otherwise, so exactly one accelerated_async_copy overload is viable per group type
+template <class TyGroup>
+using enable_tile_optimization =
+    typename _CG_STL_NAMESPACE::enable_if<async_copy_optimize_tile<TyGroup>::value, void *>::type; // valid only for optimized tiles
+
+template <class TyGroup>
+using disable_tile_optimization =
+    typename _CG_STL_NAMESPACE::enable_if<!async_copy_optimize_tile<TyGroup>::value, void *>::type; // valid only for all other groups
+
+// Segment for punning to aligned types: plain storage of N ints with no alignment attribute of its own
+template <unsigned int N>
+struct _Segment {
+    int _seg[N];
+};
+
+// Trivial layout guaranteed-aligned copy-async compatible segments (only N = 1, 2 and 4 are defined)
+template <unsigned int N>
+struct Segment; // primary template intentionally left undefined
+template <>
+struct __align__(4) Segment<1> : public _Segment<1>{}; // one int, declared 4-byte aligned
+template <>
+struct __align__(8) Segment<2> : public _Segment<2>{}; // two ints, declared 8-byte aligned
+template <>
+struct __align__(16) Segment<4> : public _Segment<4>{}; // four ints, declared 16-byte aligned
+
+// Interleaved element by element copies from source to dest using plain assignment; `count` is a number of TyElem elements, not bytes
+template <typename TyGroup, typename TyElem>
+_CG_STATIC_QUALIFIER void inline_copy(TyGroup &group, TyElem *__restrict__ dst, const TyElem *__restrict__ src,
+                                      size_t count) {
+    const unsigned int rank = group.thread_rank(); // first element handled by this thread
+    const unsigned int stride = group.size(); // distance between consecutive elements handled by this thread
+
+    for (size_t idx = rank; idx < count; idx += stride) { // a thread whose rank >= count copies nothing
+        dst[idx] = src[idx];
+    }
+}
+
+template <typename TyGroup, typename TyElem, enable_tile_optimization<TyGroup> = nullptr> // overload selected for thread_block_tile groups of size != 1
+_CG_STATIC_QUALIFIER void accelerated_async_copy(TyGroup &group, TyElem *__restrict__ dst,
+                                                 const TyElem *__restrict__ src, size_t count) { // count is in TyElem elements
+    static_assert(async_copy_group_supported<TyGroup>::value,
+                  "Async copy is only supported for groups that represent private shared memory");
+
+    if (count == 0) {
+        return; // nothing to issue
+    }
+
+    const bool dstIsNotShared = !__isShared(dst);
+    const bool srcIsNotGlobal = !__isGlobal(src);
+
+    if (dstIsNotShared || srcIsNotGlobal) { // the async path below is only used for global -> shared copies
+        inline_copy(group, dst, src, count); // otherwise fall back to plain element-wise assignment
+        return;
+    }
+
+    const unsigned int stride = group.size();
+    const unsigned int rank = group.thread_rank();
+    // Efficient copies require warps to operate on the same amount of work at each step.
+    // remainders are handled in a separate stage to prevent branching
+    const unsigned int subWarpMask = (stride - 1); // used as a bit mask; assumes the tile size is a power of two - TODO confirm
+    const unsigned int subwarpCopies = (subWarpMask & (unsigned int)count); // leftover elements after the full-stride stage
+    const unsigned int maxSubwarpRank = min(rank, subwarpCopies - 1); // rank clamped to the last leftover element; wraps when subwarpCopies == 0 but is then unused
+
+    const size_t warpCopies = (count & (~subWarpMask)); // element count covered by the full-stride stage
+
+    for (size_t idx = 0; idx < warpCopies; idx += stride) { // every thread issues one copy per iteration
+        size_t _srcIdx = rank + idx;
+        size_t _dstIdx = rank + idx;
+        __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
+    }
+
+    if (subwarpCopies) { // tail stage: threads with rank >= subwarpCopies re-issue the copy of the last leftover element instead of idling
+        size_t _srcIdx = warpCopies + maxSubwarpRank;
+        size_t _dstIdx = warpCopies + maxSubwarpRank;
+        __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
+    }
+}
+
+template <typename TyGroup, typename TyElem, disable_tile_optimization<TyGroup> = nullptr> // overload selected for every group that is not an optimized tile
+_CG_STATIC_QUALIFIER void accelerated_async_copy(TyGroup &group, TyElem *__restrict__ dst,
+                                                 const TyElem *__restrict__ src, size_t count) { // count is in TyElem elements
+    static_assert(async_copy_group_supported<TyGroup>::value,
+                  "Async copy is only supported for groups that represent private shared memory");
+
+    const bool dstIsNotShared = !__isShared(dst);
+    const bool srcIsNotGlobal = !__isGlobal(src);
+
+    if (dstIsNotShared || srcIsNotGlobal) { // the async path below is only used for global -> shared copies
+        inline_copy(group, dst, src, count); // otherwise fall back to plain element-wise assignment
+        return;
+    }
+
+    unsigned int stride = group.size();
+    unsigned int rank = group.thread_rank();
+
+    for (size_t idx = rank; idx < count; idx += stride) { // same interleaving as inline_copy, one async copy per element; count == 0 simply skips the loop
+        size_t _srcIdx = idx;
+        size_t _dstIdx = idx;
+        __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
+    }
+}
+
+// Determine best possible alignment given an input and initial conditions: returns the smallest tested alignment whose bit differs between src and dst, or MaxAlignment when none differs
+// Attempts to generate as little code as possible, most likely should only be used with 1 and 2 byte alignments
+template <unsigned int MinAlignment, unsigned int MaxAlignment>
+_CG_STATIC_QUALIFIER uint32_t find_best_alignment(void *__restrict__ dst, const void *__restrict__ src) {
+    // Narrowing conversion intentional
+    uint32_t base1 = (uint32_t) reinterpret_cast<uintptr_t>(src);
+    uint32_t base2 = (uint32_t) reinterpret_cast<uintptr_t>(dst);
+
+    uint32_t diff = ((base1) ^ (base2)) & (MaxAlignment - 1); // address bits below MaxAlignment in which src and dst disagree
+
+    // range [MaxAlignment, alignof(elem)], step: x >> 1
+    // over range of possible alignments, choose best available out of range
+    uint32_t out = MaxAlignment; // kept when no tested bit of diff is set
+#pragma unroll
+    for (uint32_t alignment = (MaxAlignment >> 1); alignment >= MinAlignment; alignment >>= 1) { // walks downward, so the last (smallest) matching alignment wins; would not terminate for MinAlignment == 0
+        if (alignment & diff)
+            out = alignment;
+    }
+
+    return out;
+}
+
+// Copy `count` bytes in three stages: a byte-wise head until src reaches alignof(TyType), an async body of whole TyType elements, then a byte-wise tail
+// Only src is realigned here; NOTE(review): presumably the caller has already checked that dst shares src's offset modulo alignof(TyType) and that count >= that offset - confirm
+template <typename TyType, typename TyGroup>
+_CG_STATIC_QUALIFIER void copy_like(const TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
+                                    size_t count) {
+    const char *src = reinterpret_cast<const char *>(_src);
+    char *dst = reinterpret_cast<char *>(_dst);
+
+    constexpr uint32_t targetAlignment = (uint32_t)alignof(TyType);
+
+    uint32_t base = (uint32_t) reinterpret_cast<uintptr_t>(src);
+    uint32_t alignOffset = ((~base) + 1) & (targetAlignment - 1); // (-base) masked: bytes from src up to the next targetAlignment boundary, 0 if already aligned
+
+    inline_copy(group, dst, src, alignOffset); // head: unaligned leading bytes
+    count -= alignOffset; // size_t subtraction: would wrap if count < alignOffset
+    src += alignOffset;
+    dst += alignOffset;
+
+    // Copy using the best available alignment, async_copy expects n-datums, not bytes
+    size_t asyncCount = count / sizeof(TyType);
+    accelerated_async_copy(group, reinterpret_cast<TyType *>(dst), reinterpret_cast<const TyType *>(src), asyncCount);
+    asyncCount *= sizeof(TyType); // convert back to the number of bytes handled by the async body
+
+    count -= asyncCount;
+    src += asyncCount;
+    dst += asyncCount;
+    inline_copy(group, dst, src, count); // tail: remaining bytes that do not fill a whole TyType
+}
+
+// We must determine alignment and manually align src/dst ourselves (primary template; 4, 8 and 16 have dedicated specializations, so this handles the remaining hints)
+template <size_t AlignHint>
+struct _memcpy_async_align_dispatch {
+    template <typename TyGroup>
+    _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ dst, const void *__restrict__ src, size_t count) { // count is in bytes
+        uint32_t alignment = find_best_alignment<AlignHint, 16>(dst, src); // runtime probe of the pointer pair, capped at 16
+
+        // Avoid copying the extra bytes if desired copy count is smaller
+        alignment = count < alignment ? AlignHint : alignment; // also guarantees count >= alignment before copy_like subtracts its head offset
+
+        switch (alignment) {
+        default: // any value without its own case is treated like 1
+        case 1:
+            inline_copy(group, reinterpret_cast<char *>(dst), reinterpret_cast<const char *>(src), count);
+            break;
+        case 2: // NOTE(review): unlike copy_like there is no head/tail handling here - an odd count drops its last byte (count >> 1) and the pointers are not realigned; confirm callers guarantee 2-byte alignment and an even count
+            inline_copy(group, reinterpret_cast<short *>(dst), reinterpret_cast<const short *>(src), count >> 1);
+            break;
+        case 4:
+            copy_like<Segment<1>>(group, dst, src, count);
+            break;
+        case 8:
+            copy_like<Segment<2>>(group, dst, src, count);
+            break;
+        case 16:
+            copy_like<Segment<4>>(group, dst, src, count);
+            break;
+        }
+    }
+};
+
+// Specialization for 4 byte alignments: no runtime alignment probing, the pointers are reinterpreted directly as 4-byte segments
+template <>
+struct _memcpy_async_align_dispatch<4> {
+    template <typename TyGroup>
+    _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
+                                   size_t count) { // count is in bytes
+        const Segment<1> *src = reinterpret_cast<const Segment<1> *>(_src);
+        Segment<1> *dst = reinterpret_cast<Segment<1> *>(_dst);
+
+        // Dispatch straight to aligned LDGSTS calls
+        accelerated_async_copy(group, dst, src, count / sizeof(*dst)); // integer division: bytes beyond a whole number of segments are not copied - assumes count is a multiple of the segment size, TODO confirm
+    }
+};
+
+// Specialization for 8 byte alignments: no runtime alignment probing, the pointers are reinterpreted directly as 8-byte segments
+template <>
+struct _memcpy_async_align_dispatch<8> {
+    template <typename TyGroup>
+    _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
+                                   size_t count) { // count is in bytes
+        const Segment<2> *src = reinterpret_cast<const Segment<2> *>(_src);
+        Segment<2> *dst = reinterpret_cast<Segment<2> *>(_dst);
+
+        // Dispatch straight to aligned LDGSTS calls
+        accelerated_async_copy(group, dst, src, count / sizeof(*dst)); // integer division: bytes beyond a whole number of segments are not copied - assumes count is a multiple of the segment size, TODO confirm
+    }
+};
+
+// Alignments over 16 are truncated to 16 (by _memcpy_async_bytes) and bypass alignment probing
+// This is the highest performing memcpy available
+template <>
+struct _memcpy_async_align_dispatch<16> {
+    template <typename TyGroup>
+    _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
+                                   size_t count) { // count is in bytes
+        const Segment<4> *src = reinterpret_cast<const Segment<4> *>(_src);
+        Segment<4> *dst = reinterpret_cast<Segment<4> *>(_dst);
+
+        // Dispatch straight to aligned LDGSTS calls
+        accelerated_async_copy(group, dst, src, count / sizeof(*dst)); // integer division: bytes beyond a whole number of segments are not copied - assumes count is a multiple of the segment size, TODO confirm
+    }
+};
+
+// byte-wide API: forwards a byte count to the _memcpy_async_align_dispatch variant chosen by the compile-time Alignment
+template <size_t Alignment, class TyGroup>
+_CG_STATIC_QUALIFIER void _memcpy_async_dispatch_to_aligned_copy(const TyGroup &group, void *__restrict__ _dst,
+                                                                 const void *__restrict__ _src, size_t count) {
+    static_assert(!(Alignment & (Alignment - 1)), "Known static alignment dispatch must be a power of 2"); // bit test; note that it also accepts Alignment == 0
+    details::_memcpy_async_align_dispatch<Alignment>::copy(group, _dst, _src, count);
+}
+
+// Internal dispatch APIs
+// These deduce the alignments and sizes necessary to invoke the underlying copy engine
+template <typename Ty>
+using is_void = _CG_STL_NAMESPACE::is_same<Ty, void>; // exact match on `void` only (no cv handling)
+
+template <typename Ty>
+using enable_if_not_void = typename _CG_STL_NAMESPACE::enable_if<!is_void<Ty>::value, void *>::type; // `void *` when Ty is not void, otherwise ill-formed
+
+template <typename Ty>
+using enable_if_void = typename _CG_STL_NAMESPACE::enable_if<is_void<Ty>::value, void *>::type; // `void *` when Ty is void, otherwise ill-formed
+
+template <typename Ty>
+using enable_if_integral =
+    typename _CG_STL_NAMESPACE::enable_if<_CG_STL_NAMESPACE::is_integral<Ty>::value, void *>::type; // `void *` when Ty is an integral type, otherwise ill-formed
+
+// byte-wide API using aligned_size_t: the alignment hint comes from the count's type, Alignment<Hint>
+template <class TyGroup, template <size_t> typename Alignment, size_t Hint>
+_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, void *__restrict__ _dst,
+                                              const void *__restrict__ _src, const Alignment<Hint> &count) {
+    constexpr size_t _align = (Hint > 16) ? 16 : Hint; // hints above 16 are clamped to 16
+
+    details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, _dst, _src, (size_t)count); // count is converted to a plain byte count
+}
+
+// byte-wide API using type for alignment: the hint defaults to alignof(TyElem); selected for non-void element types with an integral count
+template <class TyGroup, typename TyElem, typename TySize, size_t Hint = alignof(TyElem),
+          enable_if_not_void<TyElem> = nullptr, enable_if_integral<TySize> = nullptr>
+_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, TyElem *__restrict__ _dst,
+                                              const TyElem *__restrict__ _src, const TySize& count) {
+    constexpr size_t _align = (Hint > 16) ? 16 : Hint; // hints above 16 are clamped to 16
+
+    details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, _dst, _src, count); // count is forwarded unchanged as a byte count, not scaled by sizeof(TyElem)
+}
+
+// byte-wide API with full alignment deduction required: selected for void pointers with an integral count
+template <class TyGroup, typename TyElem, typename TySize, enable_if_void<TyElem> = nullptr,
+          enable_if_integral<TySize> = nullptr>
+_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, TyElem *__restrict__ _dst,
+                                              const TyElem *__restrict__ _src, const TySize& count) {
+    details::_memcpy_async_dispatch_to_aligned_copy<1>(group, _dst, _src, count); // hint of 1: the primary dispatch probes the pointers at run time
+}
+
+// 1d-datum API: counts are in TyElem elements; copies min(dstCount, srcCount) elements
+template <class TyGroup, typename TyElem, size_t Hint = alignof(TyElem)>
+_CG_STATIC_QUALIFIER void _memcpy_async_datum(const TyGroup &group, TyElem *__restrict__ dst, const size_t dstCount,
+                                              const TyElem *__restrict__ src, const size_t srcCount) {
+    constexpr unsigned int _align = Hint; // NOTE(review): not clamped to 16 as in _memcpy_async_bytes, so hints above 16 select the primary dispatch template - confirm this is intended
+    const size_t totalCount = min(dstCount, srcCount) * sizeof(TyElem); // element count converted to bytes
+
+    details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, dst, src, totalCount);
+}
+
+// 1d-datum API using aligned_size_t: counts are in TyElem elements and both carry the same alignment hint in their type
+template <class TyGroup, typename TyElem, template <size_t> typename Alignment, size_t Hint>
+_CG_STATIC_QUALIFIER void _memcpy_async_datum(const TyGroup &group, TyElem *__restrict__ dst, const Alignment<Hint> &dstCount,
+                                              const TyElem *__restrict__ src, const Alignment<Hint> &srcCount) {
+    constexpr unsigned int _align = Hint; // NOTE(review): not clamped to 16 as in _memcpy_async_bytes, so hints above 16 select the primary dispatch template - confirm this is intended
+    const size_t totalCount = min((size_t)dstCount, (size_t)srcCount) * sizeof(TyElem); // smaller element count converted to bytes
+
+    details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, dst, src, totalCount);
+}
+
+} // namespace details
+
+/*
+ * Group submit batch of async-copy to cover contiguous 1D array
+ * and commit that batch to eventually wait for completion. `count` is forwarded to the byte-wide dispatch, i.e. it is a size in bytes.
+ */
+template <class TyGroup, typename TyElem, typename TySizeT>
+_CG_STATIC_QUALIFIER void memcpy_async(const TyGroup &group, TyElem *__restrict__ _dst, const TyElem *__restrict__ _src,
+                                       const TySizeT &count) {
+    details::_memcpy_async_bytes(group, _dst, _src, count); // overload chosen from TyElem (void or not) and the type of count
+    __pipeline_commit(); // commits the copies issued above as one batch
+}
+
+/*
+ * Group submit batch of async-copy to cover contiguous 1D array
+ * and commit that batch to eventually wait for completion.
+ * Object counts are in datum sized chunks, not bytes; the smaller of the two counts is copied.
+ */
+template <class TyGroup, class TyElem, typename DstLayout, typename SrcLayout>
+_CG_STATIC_QUALIFIER void memcpy_async(const TyGroup &group, TyElem *__restrict__ dst, const DstLayout &dstLayout,
+                                       const TyElem *__restrict__ src, const SrcLayout &srcLayout) {
+    details::_memcpy_async_datum(group, dst, dstLayout, src, srcLayout); // layouts are the element counts taken by the _memcpy_async_datum overloads
+    __pipeline_commit(); // commits the copies issued above as one batch
+}
+
+/* Group wait for prior Nth stage of memcpy_async to complete: calls __pipeline_wait_prior(Stage), then synchronizes the group. */
+template <unsigned int Stage, class TyGroup>
+_CG_STATIC_QUALIFIER void wait_prior(const TyGroup &group) {
+    __pipeline_wait_prior(Stage);
+    group.sync(); // presumably so the copied data is visible to every member on return - confirm against the CUDA documentation
+}
+
+/* Group wait for all previously submitted memcpy_async to complete: same as wait_prior with a stage of 0. */
+template <class TyGroup>
+_CG_STATIC_QUALIFIER void wait(const TyGroup &group) {
+    __pipeline_wait_prior(0);
+    group.sync(); // presumably so the copied data is visible to every member on return - confirm against the CUDA documentation
+}
+
+/***************** CG APIs including pipeline are deprecated *****************/
+
+/* Group submit batch of async-copy to cover a contiguous 1D array (counts are in elements)
+   to a pipeline and commit the batch. Deprecated pipeline-object variant. */
+template <class TyGroup, class TyElem>
+_CG_DEPRECATED _CG_STATIC_QUALIFIER void memcpy_async(TyGroup &group, TyElem *dst, size_t dstCount, const TyElem *src, size_t srcCount,
+                                       nvcuda::experimental::pipeline &pipe) {
+    details::_memcpy_async_datum(group, dst, dstCount, src, srcCount); // copies min(dstCount, srcCount) elements
+    pipe.commit(); // commits through the supplied pipeline object rather than __pipeline_commit()
+}
+
+/* Group wait for prior Nth stage of memcpy_async to complete. Deprecated pipeline-object variant. */
+template <unsigned int Stage, class TyGroup>
+_CG_DEPRECATED _CG_STATIC_QUALIFIER void wait_prior(TyGroup &group, nvcuda::experimental::pipeline &pipe) {
+    pipe.wait_prior<Stage>(); // waits on the supplied pipeline object
+    group.sync(); // then synchronizes the whole group
+}
+
+/* Group wait for stage-S of memcpy_async to complete. Deprecated pipeline-object variant; the stage is a run-time argument. */
+template <class TyGroup>
+_CG_DEPRECATED _CG_STATIC_QUALIFIER void wait(TyGroup &group, nvcuda::experimental::pipeline &pipe, size_t stage) {
+    pipe.wait(stage); // waits on the supplied pipeline object
+    group.sync(); // then synchronizes the whole group
+}
+_CG_END_NAMESPACE
+
+#endif // _CG_ASYNC_H
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/coalesced_reduce.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/coalesced_reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..7ba03fc9e4d0c78f07e3e5e1f97aff03e7a3d6f8
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/coalesced_reduce.h
@@ -0,0 +1,95 @@
+ /* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_COALESCED_REDUCE_H_
+#define _CG_COALESCED_REDUCE_H_
+
+#include "info.h"
+#include "helpers.h"
+#include "cooperative_groups.h"
+#include "partitioning.h"
+#include "coalesced_scan.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+
+template <typename TyVal, typename TyOp, unsigned int TySize, typename ParentT> // reduction over a single-warp tile using XOR shuffles
+_CG_QUALIFIER auto coalesced_reduce(const __single_warp_thread_block_tile<TySize, ParentT>& group, 
+                                    TyVal&& val,
+                                    TyOp&& op) -> decltype(op(val, val)) { // every thread returns its accumulated value
+    auto out = val; // local copy used as the running result
+    for (int mask = TySize >> 1; mask > 0; mask >>= 1) { // mask starts at half the tile size and halves each step
+        out = op(out, group.shfl_xor(out, mask)); // combine with the value held by the thread whose rank differs in bit `mask`
+    }
+
+    return out;
+}
+
+template <typename TyVal, typename TyOp> // reduction over a coalesced_group: tile path for a full group, otherwise scan plus broadcast
+_CG_QUALIFIER auto coalesced_reduce(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
+    if (group.size() == 32) {
+        // Full coalesced group can go through faster path by being treated as a tile of size 32
+        auto tile = details::tiled_partition_internal<32, void>();
+        return coalesced_reduce(tile, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
+    }
+    else {
+        auto scan_result =
+            inclusive_scan_non_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op)); // the highest-ranked member ends up holding the combination of all values
+        unsigned int group_mask = _coalesced_group_data_access::get_mask(group); // lane mask of the group's members
+        unsigned int last_thread_id = 31 - __clz(group_mask); // index of the highest set bit, i.e. the last member lane
+        return details::tile::shuffle_dispatch<TyVal>::shfl(
+            _CG_STL_NAMESPACE::forward<TyVal>(scan_result), group_mask, last_thread_id, 32); // every member reads the scan result of the last member lane
+    }
+}
+
+} // details
+
+_CG_END_NAMESPACE
+
+#endif // _CG_COALESCED_REDUCE_H_
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/coalesced_scan.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/coalesced_scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..383f4bde059dd8daad7d1c56e99152ea7ee28a08
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/coalesced_scan.h
@@ -0,0 +1,174 @@
+/* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_COALESCED_SCAN_H_
+#define _CG_COALESCED_SCAN_H_
+
+#include "info.h"
+#include "helpers.h"
+#include "cooperative_groups.h"
+#include "partitioning.h"
+#include "functional.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+
+template <typename TyGroup, typename TyVal, typename TyOp> // inclusive scan for groups whose ranks can be addressed directly with shfl_up
+_CG_QUALIFIER auto inclusive_scan_contiguous(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
+    auto out = val; // running inclusive result for this thread
+    for (int mask = 1; mask < group.size(); mask <<= 1) { // shuffle distance doubles each step: 1, 2, 4, ...
+        auto tmp = group.shfl_up(out, mask); // every thread takes part in the shuffle, whether or not it uses the value
+        if (mask <= group.thread_rank()) { // only threads that have a source `mask` ranks below them combine
+            out = op(out, tmp);
+        }
+    }
+
+    return out;
+}
+
+template <typename TyGroup, typename TyVal, typename TyOp> // inclusive scan for groups whose member lanes may have gaps: each step first locates the lane that holds rank - i
+_CG_QUALIFIER auto inclusive_scan_non_contiguous(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
+    const unsigned int groupSize = group.size();
+    auto out = val; // running inclusive result for this thread
+
+    const unsigned int mask = details::_coalesced_group_data_access::get_mask(group); // lane mask of the group's members
+    unsigned int lanemask = details::lanemask32_lt() & mask; // member lanes below the calling lane
+    unsigned int srcLane = details::laneid(); // starts at the calling lane and moves downward as i grows
+
+    const unsigned int base = __ffs(mask)-1; /* lane with rank == 0 */
+    const unsigned int rank = __popc(lanemask); // number of members below this lane, i.e. this thread's rank
+
+    for (unsigned int i = 1, j = 1; i < groupSize; i <<= 1) { // i is the rank distance for this step; j is the previous step's distance
+        if (i <= rank) { // only threads that have a source i ranks below them search for it
+            srcLane -= j;
+            j = i; /* maximum possible lane */
+
+            unsigned int begLane = base + rank - i; /* minimum possible lane */
+
+            /*  Next source lane is in the range [ begLane .. srcLane ]
+                *  If begLane < srcLane then do a binary search.
+                */
+            while (begLane < srcLane) {
+                const unsigned int halfLane = (begLane + srcLane) >> 1; // midpoint candidate
+                const unsigned int halfMask = lanemask >> halfLane; // member lanes from halfLane up to (not including) the calling lane
+                const unsigned int d = __popc(halfMask); // how many members lie in that span
+                if (d < i) {
+                    srcLane = halfLane - 1; /* halfLane too large */
+                }
+                else if ((i < d) || !(halfMask & 0x01)) { // too many members in the span, or halfLane itself is not a member
+                    begLane = halfLane + 1; /* halfLane too small */
+                }
+                else {
+                    begLane = srcLane = halfLane; /* happen to hit */
+                }
+            }
+        }
+
+        auto tmp = details::tile::shuffle_dispatch<TyVal>::shfl(out, mask, srcLane, 32); // executed by every member, outside the rank check
+        if (i <= rank) { // threads without a source discard the shuffled value
+            out = op(out, tmp);
+        }
+    }
+    return out;
+}
+
+template <unsigned int TySize, typename ParentT, typename TyVal, typename TyOp> // single-warp tile overload: always uses the contiguous scan
+_CG_QUALIFIER auto coalesced_inclusive_scan(const __single_warp_thread_block_tile<TySize, ParentT>& group,
+                                            TyVal&& val,
+                                            TyOp&& op) -> decltype(op(val, val)) {
+    return inclusive_scan_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op)); // arguments are perfectly forwarded
+}
+
+template <typename TyVal, typename TyOp> // coalesced_group overload: picks the scan variant from the group size
+_CG_QUALIFIER auto coalesced_inclusive_scan(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
+    if (group.size() == 32) { // a group of 32 members is scanned with the contiguous variant
+        return inclusive_scan_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
+    }
+    else { // any smaller group goes through the lane-searching variant
+        return inclusive_scan_non_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
+    }
+}
+
+template <bool IntegralOptimized> // strategy selector for turning an inclusive scan result into an exclusive one
+struct scan_choose_convertion; // only the true/false specializations below are defined
+
+template<>
+struct scan_choose_convertion<true> {
+    template <typename TyGroup, typename TyRes, typename TyVal>
+    _CG_STATIC_QUALIFIER details::remove_qual<TyVal> convert_inclusive_to_exclusive(const TyGroup& group, TyRes& result, TyVal&& val) {
+        return result - val; // subtracts this thread's own input from its inclusive result; `group` is unused on this path
+    }
+};
+
+template<>
+struct scan_choose_convertion<false> {
+    template <typename TyGroup, typename TyRes, typename TyVal>
+    _CG_STATIC_QUALIFIER details::remove_qual<TyVal> convert_inclusive_to_exclusive(const TyGroup& group, TyRes& result, TyVal&& val) {
+        auto ret = group.shfl_up(result, 1); // inclusive result of the thread one rank below; issued by every thread before branching
+        if (group.thread_rank() == 0) {
+            return {}; // rank 0 has no predecessor and returns a value-initialized result
+        }
+        else {
+            return ret;
+        }
+    }
+};
+
+template <typename TyGroup, typename TyRes, typename TyVal, typename TyFn> // converts an inclusive scan result to an exclusive one; `op` is only inspected for its type
+_CG_QUALIFIER auto convert_inclusive_to_exclusive(const TyGroup& group, TyRes& result, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+    using conversion = scan_choose_convertion<_CG_STL_NAMESPACE::is_same<remove_qual<TyFn>, cooperative_groups::plus<remove_qual<TyVal>>>::value
+                                 && _CG_STL_NAMESPACE::is_integral<remove_qual<TyVal>>::value>; // subtraction path only for cooperative_groups::plus on an integral value type; everything else uses the shuffle path
+    return conversion::convert_inclusive_to_exclusive(group, result, _CG_STL_NAMESPACE::forward<TyVal>(val));
+}
+
+} // details
+
+_CG_END_NAMESPACE
+
+#endif // _CG_COALESCED_SCAN_H_
\ No newline at end of file
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/driver_abi.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/driver_abi.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c866fcf740beb709a106057d28e8a2a1ac37924
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/driver_abi.h
@@ -0,0 +1,99 @@
+ /* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_DRIVER_API_H
+#define _CG_DRIVER_API_H
+
+#include "info.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+    template <unsigned int RegId>
+    _CG_QUALIFIER unsigned int load_env_reg() {
+        // Abort by default: only the register ids specialized through LOAD_ENVREG can be read
+        _CG_ABORT();
+        return 0;  // placed after _CG_ABORT(); presumably never reached -- TODO confirm _CG_ABORT semantics
+    }
+
+    template <unsigned int HiReg, unsigned int LoReg>
+    _CG_QUALIFIER unsigned long long load_env_reg64() {
+        // Combine two 32-bit environment registers: HiReg becomes bits 63..32, LoReg bits 31..0.
+        const unsigned long long lowWord = load_env_reg<LoReg>();
+        const unsigned long long highWord = load_env_reg<HiReg>();
+        return lowWord | (highWord << 32);
+    }
+
+// Inline PTX needs the envreg number as an immediate, so each register gets its own load_env_reg specialization
+# define LOAD_ENVREG(NUMBER) \
+    template <> _CG_QUALIFIER unsigned int load_env_reg<NUMBER>() { \
+        unsigned int r; \
+        asm ("mov.u32 %0, %%envreg" #NUMBER ";" : "=r"(r)); \
+        return r; \
+    }
+
+    // Instantiate loaders for the registers used (envreg0, envreg1, envreg2)
+    LOAD_ENVREG(0);
+    LOAD_ENVREG(1);
+    LOAD_ENVREG(2);
+# undef LOAD_ENVREG
+
+    struct grid_workspace {  // type that get_grid_workspace() returns a pointer to
+        unsigned int wsSize;   // presumably the workspace size; units are not visible here -- TODO confirm
+        unsigned int barrier;  // presumably the word used for grid-wide barriers -- TODO confirm against users
+    };
+
+    _CG_QUALIFIER grid_workspace* get_grid_workspace() {
+        // envreg1 supplies the upper 32 bits and envreg2 the lower 32 bits of the driver's grid workspace address
+        const unsigned long long wsAddress = load_env_reg64<1, 2>();
+        return reinterpret_cast<grid_workspace*>(wsAddress);
+    }
+}
+_CG_END_NAMESPACE
+
+#endif // _CG_DRIVER_API_H
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/functional.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/functional.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f151fe2c270421ba56e22935e84c4bf93790eff
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/functional.h
@@ -0,0 +1,212 @@
+ /* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_FUNCTIONAL_H
+#define _CG_FUNCTIONAL_H
+
+#include "info.h"
+#include "helpers.h"
+
+#ifdef _CG_CPP11_FEATURES
+#ifdef _CG_USE_CUDA_STL
+# include <cuda/std/functional>
+#endif
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+#ifdef _CG_USE_CUDA_STL
+    using cuda::std::plus;  // with the CUDA STL enabled, reuse its functors
+    using cuda::std::bit_and;
+    using cuda::std::bit_xor;
+    using cuda::std::bit_or;
+#else  // otherwise provide device-only functors exposing the same operator() shape
+    template <typename Ty> struct plus {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 + arg2;}};
+    template <typename Ty> struct bit_and {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 & arg2;}};
+    template <typename Ty> struct bit_xor {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 ^ arg2;}};
+    template <typename Ty> struct bit_or {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 | arg2;}};
+#endif // _CG_USE_CUDA_STL
+} // details
+
+template <typename Ty>
+struct plus : public details::plus<Ty> {};  // distinct cooperative_groups type wrapping whichever details::plus is active
+
+template <typename Ty>
+struct less {  // returns a value (the smaller argument), not a bool
+    __device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {
+        return (arg2 < arg1) ? arg2 : arg1;  // arg2 only when strictly smaller; otherwise arg1
+    }
+};
+
+template <typename Ty>
+struct greater {  // returns a value (the larger argument), not a bool
+    __device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {
+        return (arg1 < arg2) ? arg2 : arg1;  // arg2 only when strictly larger; otherwise arg1
+    }
+};
+
+template <typename Ty>
+struct bit_and : public details::bit_and<Ty> {};  // cooperative_groups wrapper over details::bit_and (arg1 & arg2)
+
+template <typename Ty>
+struct bit_xor : public details::bit_xor<Ty> {};  // cooperative_groups wrapper over details::bit_xor (arg1 ^ arg2)
+
+template <typename Ty>
+struct bit_or : public details::bit_or<Ty> {};  // cooperative_groups wrapper over details::bit_or (arg1 | arg2)
+
+#if defined(_CG_HAS_STL_ATOMICS)
+namespace details {
+    template <class Ty>
+    using _atomic_is_type_supported = _CG_STL_NAMESPACE::integral_constant<bool,  // true only for 4- or 8-byte integral types
+            _CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) == 4 || sizeof(Ty) == 8)>;
+
+    template <typename TyOp> struct _atomic_op_supported                                : public _CG_STL_NAMESPACE::false_type {};  // default: unknown op, not supported
+    template <typename Ty> struct _atomic_op_supported<cooperative_groups::plus<Ty>>    : public _atomic_is_type_supported<Ty> {};  // known ops: decided by the value type
+    template <typename Ty> struct _atomic_op_supported<cooperative_groups::less<Ty>>    : public _atomic_is_type_supported<Ty> {};
+    template <typename Ty> struct _atomic_op_supported<cooperative_groups::greater<Ty>> : public _atomic_is_type_supported<Ty> {};
+    template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_and<Ty>> : public _atomic_is_type_supported<Ty> {};
+    template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_or<Ty>>  : public _atomic_is_type_supported<Ty> {};
+    template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_xor<Ty>> : public _atomic_is_type_supported<Ty> {};
+
+    template<typename TyAtomic, typename TyVal, typename TyOp>
+    _CG_QUALIFIER remove_qual<TyVal> atomic_cas_fallback(TyAtomic&& atomic, TyVal&& val, TyOp&& op) {
+        auto observed = atomic.load(cuda::std::memory_order_relaxed);  // initial expected value for the retry loop
+        while (!atomic.compare_exchange_weak(observed, op(observed, val), cuda::std::memory_order_relaxed)) {}  // op is re-applied on every attempt
+        return observed;  // the value that was replaced by the successful exchange
+    }
+
+    template<typename TyOp>
+    struct op_picker;  // maps a cooperative_groups functor type to the matching atomic fetch_* member
+
+    template<typename TyVal>
+    struct op_picker<cooperative_groups::plus<TyVal>> {  // plus -> fetch_add
+        template<typename TyAtomic>
+        _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
+            return atomic.fetch_add(val, cuda::std::memory_order_relaxed);
+        }
+    };
+
+    template<typename TyVal>
+    struct op_picker<cooperative_groups::less<TyVal>> {  // less -> fetch_min
+        template<typename TyAtomic>
+        _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
+            return atomic.fetch_min(val, cuda::std::memory_order_relaxed);
+        }
+    };
+
+    template<typename TyVal>
+    struct op_picker<cooperative_groups::greater<TyVal>> {  // greater -> fetch_max
+        template<typename TyAtomic>
+        _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
+            return atomic.fetch_max(val, cuda::std::memory_order_relaxed);
+        }
+    };
+
+    template<typename TyVal>
+    struct op_picker<cooperative_groups::bit_and<TyVal>> {  // bit_and -> fetch_and
+        template<typename TyAtomic>
+        _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
+            return atomic.fetch_and(val, cuda::std::memory_order_relaxed);
+        }
+    };
+
+    template<typename TyVal>
+    struct op_picker<cooperative_groups::bit_xor<TyVal>> {  // bit_xor -> fetch_xor
+        template<typename TyAtomic>
+        _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
+            return atomic.fetch_xor(val, cuda::std::memory_order_relaxed);
+        }
+    };
+
+    template<typename TyVal>
+    struct op_picker<cooperative_groups::bit_or<TyVal>> {  // bit_or -> fetch_or
+        template<typename TyAtomic>
+        _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
+            return atomic.fetch_or(val, cuda::std::memory_order_relaxed);
+        }
+    };
+
+    template<bool atomic_supported>  // true: use a native fetch_* member; false: use the compare-exchange loop
+    struct atomic_update_dispatch {};
+
+    template<>
+    struct atomic_update_dispatch<false> {  // op/type pair without a native atomic member
+        template<typename TyAtomic, typename TyVal, typename TyOp>
+        _CG_STATIC_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) {
+            return atomic_cas_fallback(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));  // returns the value held before the update
+        }
+    };
+
+    template<>
+    struct atomic_update_dispatch<true> {
+        // Supported op/type pair: hand off to the op_picker specialization for the
+        // unqualified operation type. The op object itself is not invoked here.
+        template<typename TyAtomic, typename TyVal, typename TyOp>
+        _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val, TyOp&& op) {
+            return op_picker<details::remove_qual<TyOp>>::atomic_update(atomic, val);
+        }
+    };
+
+    template<typename TyAtomic, typename TyVal, typename TyOp>
+    _CG_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) {
+        // _atomic_op_supported picks between the native fetch_* path and the compare-exchange fallback.
+        using selected = atomic_update_dispatch<_atomic_op_supported<details::remove_qual<TyOp>>::value>;
+        return selected::atomic_update(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
+    }
+
+    template<typename TyAtomic, typename TyVal>
+    _CG_QUALIFIER void atomic_store(TyAtomic& atomic, TyVal&& val) {
+        atomic.store(val, cuda::std::memory_order_relaxed);  // relaxed store; val is passed as an lvalue, not forwarded
+    }
+}
+#endif
+
+_CG_END_NAMESPACE
+
+#endif
+#endif //_CG_FUNCTIONAL_H
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/helpers.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..1485d9f503daa8d518af75775f7a7a415cb031d4
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/helpers.h
@@ -0,0 +1,693 @@
+ /* Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _COOPERATIVE_GROUPS_HELPERS_H_
+# define _COOPERATIVE_GROUPS_HELPERS_H_
+
+#include "info.h"
+#include "sync.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+#ifdef _CG_CPP11_FEATURES
+    template <typename Ty> struct _is_float_or_half          : public _CG_STL_NAMESPACE::is_floating_point<Ty> {};  // default: follow is_floating_point
+# ifdef _CG_HAS_FP16_COLLECTIVE
+    template <>            struct _is_float_or_half<__half>  : public _CG_STL_NAMESPACE::true_type {};  // half types count as float-like when FP16 collectives are available
+    template <>            struct _is_float_or_half<__half2> : public _CG_STL_NAMESPACE::true_type {};
+# endif
+    template <typename Ty>
+    using  is_float_or_half = _is_float_or_half<typename _CG_STL_NAMESPACE::remove_cv<Ty>::type>;  // cv-qualifiers are stripped before the lookup
+
+    // Non-STL utility templates: remove_qual strips the reference first, then the cv-qualifiers
+    template <typename Ty>
+    using remove_qual = typename _CG_STL_NAMESPACE::remove_cv<typename _CG_STL_NAMESPACE::remove_reference<Ty>::type>::type;
+
+    template <typename TyLhs, typename TyRhs>
+    using is_op_type_same = _CG_STL_NAMESPACE::is_same<remove_qual<TyLhs>, remove_qual<TyRhs>
+    >;  // compares the two types after references and cv-qualifiers are removed
+#endif
+
+    template <typename TyTrunc>
+    _CG_STATIC_QUALIFIER TyTrunc vec3_to_linear(dim3 index, dim3 nIndex) {  // flatten a 3D index within extent nIndex, x varying fastest
+        return ((TyTrunc)index.z * nIndex.y * nIndex.x) +  // each term starts from a TyTrunc cast, so the products are formed from a TyTrunc operand
+               ((TyTrunc)index.y * nIndex.x) +
+                (TyTrunc)index.x;  // nIndex.z is never needed
+    }
+
+    namespace cta {  // thread-block level helpers built on threadIdx / blockIdx / blockDim
+
+        _CG_STATIC_QUALIFIER void sync()  // synchronizes by calling __barrier_sync with argument 0
+        {
+            __barrier_sync(0);
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int num_threads()  // product of the three blockDim components
+        {
+            return static_cast<unsigned int>(blockDim.x * blockDim.y * blockDim.z);
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int thread_rank()  // threadIdx flattened within blockDim, x varying fastest
+        {
+            return vec3_to_linear<unsigned int>(threadIdx, blockDim);
+        }
+
+        _CG_STATIC_QUALIFIER dim3 group_index()  // blockIdx copied into a dim3
+        {
+            return dim3(blockIdx.x, blockIdx.y, blockIdx.z);
+        }
+
+        _CG_STATIC_QUALIFIER dim3 thread_index()  // threadIdx copied into a dim3
+        {
+            return dim3(threadIdx.x, threadIdx.y, threadIdx.z);
+        }
+
+        _CG_STATIC_QUALIFIER dim3 dim_threads()  // blockDim copied into a dim3
+        {
+            return dim3(blockDim.x, blockDim.y, blockDim.z);
+        }
+
+        // Legacy aliases
+        _CG_STATIC_QUALIFIER unsigned int size()  // same value as num_threads()
+        {
+            return num_threads();
+        }
+
+        _CG_STATIC_QUALIFIER dim3 block_dim()  // same value as dim_threads()
+        {
+            return dim_threads();
+        }
+
+    };
+
+    class _coalesced_group_data_access {  // static accessors for group internals; presumably befriended by the group types -- TODO confirm
+    public:
+        // Retrieve mask of coalesced groups and tiles
+        template <typename TyGroup>
+        _CG_STATIC_QUALIFIER unsigned int get_mask(const TyGroup &group) {
+            return group.get_mask();
+        }
+
+        template <typename TyGroup>
+        _CG_STATIC_QUALIFIER TyGroup construct_from_mask(unsigned int mask) {  // builds a TyGroup through its constructor taking a mask
+            return TyGroup(mask);
+        }
+
+        template <typename TyGroup>
+        _CG_STATIC_QUALIFIER void modify_meta_group(TyGroup &group, unsigned int mgRank, unsigned int mgSize) {  // overwrites the group's stored meta-group rank and size
+            group._data.coalesced.metaGroupRank = mgRank;
+            group._data.coalesced.metaGroupSize = mgSize;
+        }
+    };
+
+    namespace tile {
+        template <unsigned int TileCount, unsigned int TileMask, unsigned int LaneMask, unsigned int ShiftCount>
+        struct _tile_helpers{  // exposes the four template arguments as static constants
+            _CG_STATIC_CONST_DECL unsigned int tileCount = TileCount;
+            _CG_STATIC_CONST_DECL unsigned int tileMask = TileMask;
+            _CG_STATIC_CONST_DECL unsigned int laneMask = LaneMask;
+            _CG_STATIC_CONST_DECL unsigned int shiftCount = ShiftCount;
+        };
+
+        template <unsigned int> struct tile_helpers;  // keyed by tile size; arguments below are <tileCount, tileMask, laneMask, shiftCount>
+        template <> struct tile_helpers<32> : public _tile_helpers<1,  0xFFFFFFFF, 0x1F, 5> {};
+        template <> struct tile_helpers<16> : public _tile_helpers<2,  0x0000FFFF, 0x0F, 4> {};
+        template <> struct tile_helpers<8>  : public _tile_helpers<4,  0x000000FF, 0x07, 3> {};
+        template <> struct tile_helpers<4>  : public _tile_helpers<8,  0x0000000F, 0x03, 2> {};
+        template <> struct tile_helpers<2>  : public _tile_helpers<16, 0x00000003, 0x01, 1> {};
+        template <> struct tile_helpers<1>  : public _tile_helpers<32, 0x00000001, 0x00, 0> {};  // in every row tileCount * size == 32 and shiftCount == log2(size)
+
+#ifdef _CG_CPP11_FEATURES
+        namespace shfl {
+            /***********************************************************************************
+             * Recursively Sliced Shuffle
+             *  Purpose:
+             *      Slices an input type a number of times into integral types so that shuffles
+             *      are well defined
+             *  Expectations:
+             *      This object *should not* be used from a reinterpret_cast pointer unless
+             *      some alignment guarantees can be met. Use a memcpy to guarantee that loads
+             *      from the integral types stored within are aligned and correct.
+             **********************************************************************************/
+            template <unsigned int count, bool intSized = (count <= sizeof(int))>  // count is the number of bytes still to be covered
+            struct recursive_sliced_shuffle_helper;
+
+            template <unsigned int count>
+            struct recursive_sliced_shuffle_helper<count, true> {  // terminal slice: the remaining bytes fit in one int
+                int val;
+
+                template <typename TyFn>
+                _CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) {
+                    val = shfl(val);
+                }
+            };
+
+            template <unsigned int count>
+            struct recursive_sliced_shuffle_helper<count, false> {  // one int slice followed by a helper for the rest
+                int val;
+                recursive_sliced_shuffle_helper<count - sizeof(int)> next;
+
+                template <typename TyFn>
+                _CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) {
+                    val = shfl(val);  // shuffle this slice, then recurse into the remaining slices
+                    next.invoke_shuffle(shfl);
+                }
+            };
+        }
+
+        struct _memory_shuffle {  // stand-in for types above _MemoryShuffleCutoff: no data is moved, and _shfl_internal rejects oversized types at compile time
+            template <typename TyElem, typename TyShflFn>
+            _CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) {
+                static_assert(sizeof(TyElem) <= 32, "Cooperative groups collectives are limited to types smaller than 32B");  // note: the condition itself accepts exactly 32 bytes
+                return TyElem{};  // elem and fn are ignored
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
+                auto shfl = [=](int val) -> int {  // placeholder callback: always yields 0
+                    return 0;
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
+                auto shfl = [=](int val) -> int {  // placeholder callback: always yields 0
+                    return 0;
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
+                auto shfl = [=](int val) -> int {  // placeholder callback: always yields 0
+                    return 0;
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
+                auto shfl = [=](int val) -> int {  // placeholder callback: always yields 0
+                    return 0;
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+        };
+
+        /***********************************************************************************
+         * Intrinsic Device Function Shuffle
+         *  Purpose:
+         *      Uses a shuffle helper that has characteristics best suited for moving
+         *      elements between threads
+         *  Expectations:
+         *      Object given will be forced into an l-value type so that it can be used
+         *      with a helper structure that reinterprets the data into intrinsic compatible
+         *      types
+         *  Notes:
+         *      !! TyRet is required so that objects are returned by value and not as
+         *      dangling references depending on the value category of the passed object
+         **********************************************************************************/
+        struct _intrinsic_compat_shuffle {
+            template <unsigned int count>
+            using shfl_helper = shfl::recursive_sliced_shuffle_helper<count>;
+
+            template <typename TyElem, typename TyShflFn>
+            _CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) {
+                static_assert(__is_trivially_copyable(TyElem), "Type is not compatible with device shuffle");
+                shfl_helper<sizeof(TyElem)> helper;
+                memcpy(&helper, &elem, sizeof(TyElem));  // copy the object's bytes into the int-sliced helper
+                helper.invoke_shuffle(fn);               // apply fn to every int slice
+                memcpy(&elem, &helper, sizeof(TyElem));  // copy the shuffled bytes back into elem
+                return elem;
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
+                auto shfl = [=](int val) -> int {  // per-slice callback wrapping __shfl_sync
+                    return __shfl_sync(gMask, val, srcRank, threads);
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
+                auto shfl = [=](int val) -> int {  // per-slice callback wrapping __shfl_down_sync
+                    return __shfl_down_sync(gMask, val, delta, threads);
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
+                auto shfl = [=](int val) -> int {  // per-slice callback wrapping __shfl_up_sync
+                    return __shfl_up_sync(gMask, val, delta, threads);
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
+                auto shfl = [=](int val) -> int {  // per-slice callback wrapping __shfl_xor_sync
+                    return __shfl_xor_sync(gMask, val, lMask, threads);
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+        };
+
+        struct _native_shuffle {  // passes the element straight to the __shfl_*_sync intrinsics and casts the result back to TyElem
+            template <typename TyElem>
+            _CG_STATIC_QUALIFIER TyElem shfl(
+                    TyElem elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
+                return static_cast<TyElem>(__shfl_sync(gMask, elem, srcRank, threads));
+            }
+
+            template <typename TyElem>
+            _CG_STATIC_QUALIFIER TyElem shfl_down(
+                    TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
+                return static_cast<TyElem>(__shfl_down_sync(gMask, elem, delta, threads));
+            }
+
+            template <typename TyElem>
+            _CG_STATIC_QUALIFIER TyElem shfl_up(
+                    TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
+                return static_cast<TyElem>(__shfl_up_sync(gMask, elem, delta, threads));
+            }
+
+            template <typename TyElem>
+            _CG_STATIC_QUALIFIER TyElem shfl_xor(
+                    TyElem elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
+                return static_cast<TyElem>(__shfl_xor_sync(gMask, elem, lMask, threads));
+            }
+        };
+
+        // Almost all arithmetic types are supported by native shuffle
+        // Vector types are the exception; the trait is true for integral and float-or-half types only
+        template <typename TyElem>
+        using use_native_shuffle = _CG_STL_NAMESPACE::integral_constant<
+            bool,
+            _CG_STL_NAMESPACE::is_integral<
+                remove_qual<TyElem>>::value ||
+            details::is_float_or_half<
+                remove_qual<TyElem>>::value
+        >;
+
+        constexpr unsigned long long _MemoryShuffleCutoff = 32;  // size in bytes above which InMem becomes true
+
+        template <typename TyElem,
+                  bool IsNative = use_native_shuffle<TyElem>::value,
+                  bool InMem = (sizeof(TyElem) > _MemoryShuffleCutoff)>
+        struct shuffle_dispatch;  // no specialization exists for <true, true>
+
+        template <typename TyElem>
+        struct shuffle_dispatch<TyElem, true, false> :  public _native_shuffle {};  // native type within the cutoff
+
+        template <typename TyElem>
+        struct shuffle_dispatch<TyElem, false, false> : public _intrinsic_compat_shuffle {};  // other type within the cutoff: sliced shuffle
+
+        template <typename TyElem>
+        struct shuffle_dispatch<TyElem, false, true> :  public _memory_shuffle {};  // other type above the cutoff
+
+#endif //_CG_CPP11_FEATURES
+    };
+
+    namespace multi_grid {
+        struct multi_grid_functions;  // forward declaration only; the definition is not in this part of the file
+    };
+
+    namespace grid {
+        _CG_STATIC_QUALIFIER unsigned int barrier_arrive(unsigned int *bar) {  // forwards to details::sync_grids_arrive
+            return details::sync_grids_arrive(bar);  // the result is the token later passed to barrier_wait
+        }
+
+        _CG_STATIC_QUALIFIER void barrier_wait(unsigned int token, unsigned int *bar) {  // forwards to details::sync_grids_wait
+            details::sync_grids_wait(token, bar);
+        }
+
+        _CG_STATIC_QUALIFIER void sync(unsigned int *bar) {
+            // Arrive at the barrier, then wait on the token that the arrival returned.
+            details::sync_grids_wait(details::sync_grids_arrive(bar), bar);
+        }
+
+        _CG_STATIC_QUALIFIER unsigned long long num_blocks()
+        {
+            // grid.y * grid.z -> [max(65535) * max(65535)] fits within 4 bytes (32 bits), promote after multiplication
+            // grid.x * (grid.y * grid.z) -> [max(2^31-1) * max(65535 * 65535)] exceeds 4 bytes, promote before multiplication
+            return (unsigned long long)gridDim.x * (gridDim.y * gridDim.z);
+        }
+
+        _CG_STATIC_QUALIFIER unsigned long long num_threads()  // block count times threads per block
+        {
+            return num_blocks() * cta::num_threads();
+        }
+
+        _CG_STATIC_QUALIFIER unsigned long long block_rank()  // blockIdx flattened within gridDim, x varying fastest
+        {
+            return vec3_to_linear<unsigned long long>(blockIdx, gridDim);
+        }
+
+        _CG_STATIC_QUALIFIER unsigned long long thread_rank()  // block rank scaled by block size, plus the rank inside the block
+        {
+            return block_rank() * cta::num_threads() + cta::thread_rank();
+        }
+
+        _CG_STATIC_QUALIFIER dim3 dim_blocks()  // gridDim copied into a dim3
+        {
+            return dim3(gridDim.x, gridDim.y, gridDim.z);
+        }
+
+        _CG_STATIC_QUALIFIER dim3 block_index()
+        {
+            return dim3(blockIdx.x, blockIdx.y, blockIdx.z);
+        }
+
+        _CG_STATIC_QUALIFIER dim3 dim_threads()
+        {
+            return dim3(gridDim.x * blockDim.x, gridDim.y * blockDim.y, gridDim.z * blockDim.z);
+        }
+
+        _CG_STATIC_QUALIFIER dim3 thread_index()
+        {
+            return dim3(blockIdx.x * blockDim.x + threadIdx.x,
+                        blockIdx.y * blockDim.y + threadIdx.y,
+                        blockIdx.z * blockDim.z + threadIdx.z);
+        }
+
+#if defined(_CG_HAS_CLUSTER_GROUP)
+        _CG_STATIC_QUALIFIER dim3 dim_clusters() {
+            return __clusterGridDimInClusters();
+        }
+
+        _CG_STATIC_QUALIFIER unsigned long long num_clusters() {
+            const dim3 dimClusters = dim_clusters();
+            return dimClusters.x * dimClusters.y * dimClusters.z;
+        }
+
+        _CG_STATIC_QUALIFIER dim3 cluster_index() {
+            return __clusterIdx();
+        }
+
+        _CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
+            return vec3_to_linear<unsigned long long>(cluster_index(), dim_clusters());
+        }
+#endif
+
+        // Legacy aliases
+        _CG_STATIC_QUALIFIER unsigned long long size()
+        {
+            return num_threads();
+        }
+
+        _CG_STATIC_QUALIFIER dim3 grid_dim()
+        {
+            return dim_blocks();
+        }
+    };
+
+
+#if defined(_CG_HAS_MULTI_GRID_GROUP)
+
+    namespace multi_grid { // multi-grid queries; every device-runtime call is compiled only under __CUDACC_RDC__ or __CUDACC_EWP__
+        _CG_STATIC_QUALIFIER unsigned long long get_intrinsic_handle() // returns 0 when the device runtime call is compiled out
+        {
+#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
+            //this function is defined in device runtime library
+            //which requires separate compilation mode (__CUDACC_RDC__)
+            //or extended whole program mode (__CUDACC_EWP__)
+            return (cudaCGGetIntrinsicHandle(cudaCGScopeMultiGrid));
+#else   /* !(__CUDACC_RDC__ || __CUDACC_EWP__) */
+            return 0;
+#endif  /* __CUDACC_RDC__ || __CUDACC_EWP__ */
+        }
+
+        _CG_STATIC_QUALIFIER void sync(const unsigned long long handle) // does nothing when the device runtime call is compiled out
+        {
+#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
+            //this function is defined in device runtime library
+            //which requires separate compilation mode (__CUDACC_RDC__)
+            //or extended whole program mode (__CUDACC_EWP__)
+            (void)cudaCGSynchronize(handle, 0); // status is not propagated (sync() returns void); the explicit discard replaces an unused local
+#endif  /* __CUDACC_RDC__ || __CUDACC_EWP__ */
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int size(const unsigned long long handle) // thread count from cudaCGGetSize; 0 when the call is compiled out
+        {
+            unsigned int numThreads = 0;
+#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
+            //this function is defined in device runtime library
+            //which requires separate compilation mode (__CUDACC_RDC__)
+            //or extended whole program mode (__CUDACC_EWP__)
+            cudaCGGetSize(&numThreads, NULL, handle);
+#endif  /* __CUDACC_RDC__ || __CUDACC_EWP__ */
+            return numThreads;
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int thread_rank(const unsigned long long handle) // thread rank from cudaCGGetRank; 0 when the call is compiled out
+        {
+            unsigned int threadRank = 0;
+#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
+            //this function is defined in device runtime library
+            //which requires separate compilation mode (__CUDACC_RDC__)
+            //or extended whole program mode (__CUDACC_EWP__)
+            cudaCGGetRank(&threadRank, NULL, handle);
+#endif  /* __CUDACC_RDC__ || __CUDACC_EWP__ */
+            return threadRank;
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int grid_rank(const unsigned long long handle) // grid rank from cudaCGGetRank; 0 when the call is compiled out
+        {
+            unsigned int gridRank = 0;
+#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
+            //this function is defined in device runtime library
+            //which requires separate compilation mode (__CUDACC_RDC__)
+            //or extended whole program mode (__CUDACC_EWP__)
+            cudaCGGetRank(NULL, &gridRank, handle);
+#endif  /* __CUDACC_RDC__ || __CUDACC_EWP__ */
+            return gridRank;
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int num_grids(const unsigned long long handle) // grid count from cudaCGGetSize; 0 when the call is compiled out
+        {
+            unsigned int numGrids = 0;
+#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
+            //this function is defined in device runtime library
+            //which requires separate compilation mode (__CUDACC_RDC__)
+            //or extended whole program mode (__CUDACC_EWP__)
+            cudaCGGetSize(NULL, &numGrids, handle);
+#endif  /* __CUDACC_RDC__ || __CUDACC_EWP__ */
+            return numGrids;
+        }
+
+# ifdef _CG_CPP11_FEATURES
+        struct multi_grid_functions { // one function pointer per multi_grid helper above, typed with decltype so signatures stay in sync
+            decltype(multi_grid::get_intrinsic_handle) *get_intrinsic_handle;
+            decltype(multi_grid::sync) *sync;
+            decltype(multi_grid::size) *size;
+            decltype(multi_grid::thread_rank) *thread_rank;
+            decltype(multi_grid::grid_rank) *grid_rank;
+            decltype(multi_grid::num_grids) *num_grids;
+        };
+
+        template <typename = void>
+        _CG_STATIC_QUALIFIER const multi_grid_functions* load_grid_intrinsics() { // returns the address of a function-local __constant__ table of the helpers
+            __constant__ static const multi_grid_functions mgf { // initializer order must match the member order of multi_grid_functions
+                &multi_grid::get_intrinsic_handle,
+                &multi_grid::sync,
+                &multi_grid::size,
+                &multi_grid::thread_rank,
+                &multi_grid::grid_rank,
+                &multi_grid::num_grids
+            };
+
+            return &mgf;
+        }
+# endif
+    };
+#endif
+
+#if defined(_CG_HAS_CLUSTER_GROUP)
+    namespace cluster { // cluster-scope helpers layered on the __cluster* intrinsics
+
+        _CG_STATIC_QUALIFIER bool isReal() // result of __clusterDimIsSpecified()
+        {
+            return __clusterDimIsSpecified();
+        }
+
+        _CG_STATIC_QUALIFIER void barrier_arrive() // arrive half of the cluster barrier
+        {
+            __cluster_barrier_arrive();
+        }
+
+        _CG_STATIC_QUALIFIER void barrier_wait() // wait half of the cluster barrier
+        {
+            __cluster_barrier_wait();
+        }
+
+        _CG_STATIC_QUALIFIER void sync() // arrive immediately followed by wait
+        {
+            __cluster_barrier_arrive();
+            __cluster_barrier_wait();
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr) // forwards addr to __cluster_query_shared_rank
+        {
+            return __cluster_query_shared_rank(addr);
+        }
+
+        template <typename T>
+        _CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank) // forwards to __cluster_map_shared_rank and restores the pointee type
+        {
+            return static_cast<T*>(__cluster_map_shared_rank(addr, rank));
+        }
+
+        _CG_STATIC_QUALIFIER dim3 block_index() // result of __clusterRelativeBlockIdx()
+        {
+            return __clusterRelativeBlockIdx();
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int block_rank() // result of __clusterRelativeBlockRank()
+        {
+            return __clusterRelativeBlockRank();
+        }
+
+        _CG_STATIC_QUALIFIER dim3 thread_index()
+        {
+            const dim3 rel = block_index(); // block_index() scaled by blockDim, plus threadIdx, per axis
+            const unsigned int tx = rel.x * blockDim.x + threadIdx.x;
+            const unsigned int ty = rel.y * blockDim.y + threadIdx.y;
+            return dim3(tx, ty, rel.z * blockDim.z + threadIdx.z);
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int thread_rank() // rank inside this block plus the threads of lower-ranked blocks
+        {
+            return cta::thread_rank() + block_rank() * cta::num_threads();
+        }
+
+        _CG_STATIC_QUALIFIER dim3 dim_blocks() // result of __clusterDim()
+        {
+            return __clusterDim();
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int num_blocks() // result of __clusterSizeInBlocks()
+        {
+            return __clusterSizeInBlocks();
+        }
+
+        _CG_STATIC_QUALIFIER dim3 dim_threads()
+        {
+            // Per axis: dim_blocks() multiplied by blockDim.
+            const dim3 clusterShape = dim_blocks();
+            return dim3(clusterShape.x * blockDim.x,
+                        clusterShape.y * blockDim.y,
+                        clusterShape.z * blockDim.z);
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int num_threads() // threads per block times num_blocks()
+        {
+            return cta::num_threads() * num_blocks();
+        }
+
+    };
+#endif
+
+    _CG_STATIC_QUALIFIER unsigned int laneid()
+    {
+        unsigned int lane; // receives the PTX %laneid special register
+        asm ("mov.u32 %0, %%laneid;" : "=r"(lane));
+        return lane;
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int lanemask32_eq()
+    {
+        unsigned int mask; // receives the PTX %lanemask_eq special register
+        asm ("mov.u32 %0, %%lanemask_eq;" : "=r"(mask));
+        return mask;
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int lanemask32_lt()
+    {
+        unsigned int mask; // receives the PTX %lanemask_lt special register
+        asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
+        return mask;
+    }
+
+    _CG_STATIC_QUALIFIER void abort() // function wrapper around the _CG_ABORT() macro
+    {
+        _CG_ABORT(); // info.h defines this as assert(0) under _CG_DEBUG and as __trap() otherwise
+    }
+
+    template <typename Ty>
+    _CG_QUALIFIER void assert_if_not_arithmetic() { // compile-time check only; the body is empty without _CG_CPP11_FEATURES
+#ifdef _CG_CPP11_FEATURES
+        static_assert( // rejects any Ty that is neither integral nor accepted by details::is_float_or_half
+            _CG_STL_NAMESPACE::is_integral<Ty>::value ||
+            details::is_float_or_half<Ty>::value,
+            "Error: Ty is neither integer or float"
+        );
+#endif //_CG_CPP11_FEATURES
+    }
+
+#ifdef _CG_CPP11_FEATURES
+    _CG_STATIC_QUALIFIER constexpr unsigned int log2(unsigned int x) { // floor(log2(x)) for x >= 1; x == 0 yields 0 instead of recursing without end (0 / 2 never reaches 1)
+        return x <= 1 ? 0 : 1 + log2(x / 2); // kept as a single return expression, as C++11 constexpr requires
+    }
+#endif //_CG_CPP11_FEATURES
+
+}; // !Namespace internal
+
+_CG_END_NAMESPACE
+
+#endif /* !_COOPERATIVE_GROUPS_HELPERS_H_ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/info.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/info.h
new file mode 100644
index 0000000000000000000000000000000000000000..6abbea1e1ec5b0a92c18baabec7485929fe91c0d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/info.h
@@ -0,0 +1,345 @@
+ /* Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+
+#include <nv/target>
+
+#ifndef _CG_INFO_H_
+#define _CG_INFO_H_
+/*
+** Define: _CG_VERSION
+*/
+#define _CG_VERSION 1000
+
+/*
+** Define: _CG_ABI_VERSION
+*/
+#ifndef _CG_ABI_VERSION
+# define _CG_ABI_VERSION 1
+#endif
+
+/*
+** Define: _CG_ABI_EXPERIMENTAL
+** Desc: If defined, enables all ABI-breaking or experimental features
+*/
+#if defined(_CG_ABI_EXPERIMENTAL)
+#endif
+
+#define _CG_CONCAT_INNER(x, y) x ## y // raw token paste; arguments are not macro-expanded first
+#define _CG_CONCAT_OUTER(x, y) _CG_CONCAT_INNER(x, y) // extra indirection so the arguments are expanded before pasting
+#define _CG_NAMESPACE _CG_CONCAT_OUTER(__v, _CG_ABI_VERSION) // versioned namespace name, __v1 with the default _CG_ABI_VERSION of 1
+
+#define _CG_BEGIN_NAMESPACE \
+    namespace cooperative_groups { namespace _CG_NAMESPACE {
+#define _CG_END_NAMESPACE \
+    }; using namespace _CG_NAMESPACE; };
+
+#if (defined(__cplusplus) && (__cplusplus >= 201103L)) || (defined(_MSC_VER) && (_MSC_VER >= 1900))
+# define _CG_CPP11_FEATURES
+#endif
+
+#if !defined(_CG_QUALIFIER)
+# define _CG_QUALIFIER __forceinline__ __device__
+#endif
+#if !defined(_CG_STATIC_QUALIFIER)
+# define _CG_STATIC_QUALIFIER static __forceinline__ __device__
+#endif
+#if !defined(_CG_CONSTEXPR_QUALIFIER)
+# if defined(_CG_CPP11_FEATURES)
+#  define _CG_CONSTEXPR_QUALIFIER constexpr __forceinline__ __device__
+# else
+#  define _CG_CONSTEXPR_QUALIFIER _CG_QUALIFIER
+# endif
+#endif
+#if !defined(_CG_STATIC_CONSTEXPR_QUALIFIER)
+# if defined(_CG_CPP11_FEATURES)
+#  define _CG_STATIC_CONSTEXPR_QUALIFIER static constexpr __forceinline__ __device__
+# else
+#  define _CG_STATIC_CONSTEXPR_QUALIFIER _CG_STATIC_QUALIFIER
+# endif
+#endif
+
+#if defined(_MSC_VER)
+# define _CG_DEPRECATED __declspec(deprecated)
+#else
+# define _CG_DEPRECATED __attribute__((deprecated))
+#endif
+
+#if defined(__CUDA_MINIMUM_ARCH__) // __CUDA_MINIMUM_ARCH__ takes precedence over __CUDA_ARCH__
+# define _CG_CUDA_ARCH __CUDA_MINIMUM_ARCH__
+#elif defined(__CUDA_ARCH__)
+# define _CG_CUDA_ARCH __CUDA_ARCH__
+#endif // _CG_CUDA_ARCH stays undefined when neither macro is set; the arch test of each gate below then passes via !defined(_CG_CUDA_ARCH)
+
+#if (_CG_CUDA_ARCH >= 600) || !defined(_CG_CUDA_ARCH) // arch >= 600, or arch unknown
+# define _CG_HAS_GRID_GROUP
+#endif
+#if (_CG_CUDA_ARCH >= 600) || !defined(_CG_CUDA_ARCH) // same condition as _CG_HAS_GRID_GROUP
+# define _CG_HAS_MULTI_GRID_GROUP
+#endif
+#if (_CG_CUDA_ARCH >= 700) || !defined(_CG_CUDA_ARCH) // arch >= 700, or arch unknown
+# define _CG_HAS_MATCH_COLLECTIVE
+#endif
+
+#if ((_CG_CUDA_ARCH >= 800) || !defined(_CG_CUDA_ARCH)) && !defined(_CG_USER_PROVIDED_SHARED_MEMORY) // arch >= 800 (or unknown), unless the user opts out
+# define _CG_HAS_RESERVED_SHARED
+#endif
+
+#if ((_CG_CUDA_ARCH >= 900) || !defined(_CG_CUDA_ARCH)) && \
+    (defined(__NVCC__) || defined(__CUDACC_RTC__) || defined(_CG_CLUSTER_INTRINSICS_AVAILABLE)) && \
+    defined(_CG_CPP11_FEATURES)
+# define _CG_HAS_CLUSTER_GROUP // needs arch >= 900 (or unknown), a toolchain providing cluster intrinsics, and C++11
+#endif
+
+#if (_CG_CUDA_ARCH >= 900) || !defined(_CG_CUDA_ARCH) // arch >= 900, or arch unknown
+# define _CG_HAS_INSTR_ELECT
+#endif
+
+// Has __half and __half2
+// Only usable if you include the cuda_fp16.h extension, and
+// _before_ including cooperative_groups.h
+#ifdef __CUDA_FP16_TYPES_EXIST__
+# define _CG_HAS_FP16_COLLECTIVE
+#endif
+
+// Include libcu++ where supported.
+#if defined(_CG_CPP11_FEATURES) && !defined(__ibmxl__) && (!defined(_MSC_VER) || defined(_WIN64)) && \
+    !defined(_CG_LIMIT_INCLUDED_DEPENDENCIES)
+# define _CG_USE_CUDA_STL
+#else
+# define _CG_USE_OWN_TRAITS
+#endif
+
+#if defined(_CG_USE_CUDA_STL) && !defined(__QNX__) && (!defined(__CUDA_ARCH__) || \
+    ((!defined(_MSC_VER) && __CUDA_ARCH__ >= 600) || (defined(_MSC_VER) && __CUDA_ARCH__ >= 700)))
+# define _CG_HAS_STL_ATOMICS
+#endif
+
+#ifdef _CG_CPP11_FEATURES
+// Use cuda::std:: for type_traits
+# if defined(_CG_USE_CUDA_STL)
+#  define _CG_STL_NAMESPACE cuda::std
+#  include <cuda/std/type_traits>
+// Use CG's implementation of type traits
+# else
+#  define _CG_STL_NAMESPACE cooperative_groups::details::templates
+# endif
+#endif
+
+#ifdef _CG_CPP11_FEATURES
+# define _CG_STATIC_CONST_DECL static constexpr
+# define _CG_CONST_DECL constexpr
+#else
+# define _CG_STATIC_CONST_DECL static const
+# define _CG_CONST_DECL const
+#endif
+
+#if (defined(_MSC_VER) && !defined(_WIN64)) || defined(__arm__)
+# define _CG_ASM_PTR_CONSTRAINT "r"
+#else
+#  define _CG_ASM_PTR_CONSTRAINT "l"
+#endif
+
+/*
+** Define: CG_DEBUG
+** What: Enables various runtime safety checks (sets _CG_DEBUG; also needs __CUDACC_DEBUG__ and no NDEBUG)
+*/
+#if defined(__CUDACC_DEBUG__) && defined(CG_DEBUG) && !defined(NDEBUG)
+# define _CG_DEBUG
+#endif
+
+#if defined(_CG_DEBUG) // debug configuration: both macros go through assert()
+# include <assert.h>
+# define _CG_ASSERT(x) assert((x)); // NOTE: the expansion already ends in ';'
+# define _CG_ABORT() assert(0); // NOTE: the expansion already ends in ';'
+#else // non-debug configuration: checks compile away and abort traps
+# define _CG_ASSERT(x)
+# define _CG_ABORT() __trap();
+#endif
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+    _CG_STATIC_CONST_DECL unsigned int default_max_block_size = 1024;
+
+#if defined(_CG_CPP11_FEATURES) && !defined(_CG_USE_CUDA_STL) // fallback type traits, compiled only when cuda::std is not used
+namespace templates {
+
+/**
+ * Integral constants
+ **/
+template <typename Ty, Ty Val>
+struct integral_constant { // compile-time constant Val of type Ty, exposed as ::value
+    static constexpr Ty value = Val;
+    typedef Ty type;
+
+    _CG_QUALIFIER constexpr operator type() const noexcept { return value; }
+    _CG_QUALIFIER constexpr type operator()() const noexcept { return value; }
+};
+
+typedef integral_constant<bool, true>  true_type;
+typedef integral_constant<bool, false> false_type;
+
+/**
+ * Reference and CV-qualifier traits
+ **/
+template <class Ty> struct is_lvalue_reference       : public details::templates::false_type {};
+template <class Ty> struct is_lvalue_reference<Ty&>  : public details::templates::true_type {};
+
+template <class Ty> struct remove_reference       {typedef Ty type;};
+template <class Ty> struct remove_reference<Ty&>  {typedef Ty type;};
+template <class Ty> struct remove_reference<Ty&&> {typedef Ty type;};
+
+template <class Ty>
+using remove_reference_t = typename details::templates::remove_reference<Ty>::type;
+
+template <class Ty> struct remove_const           {typedef Ty type;};
+template <class Ty> struct remove_const<const Ty> {typedef Ty type;};
+
+template <class Ty> struct remove_volatile              {typedef Ty type;};
+template <class Ty> struct remove_volatile<volatile Ty> {typedef Ty type;};
+
+template <class Ty> struct remove_cv {typedef typename details::templates::remove_volatile<typename details::templates::remove_const<Ty>::type>::type type;};
+
+template <class Ty>
+using remove_cv_t = typename details::templates::remove_cv<Ty>::type;
+
+template <class Ty>
+_CG_QUALIFIER Ty&& forward(remove_reference_t<Ty> &t) noexcept { // lvalue overload: casts t to Ty&&
+    return static_cast<Ty&&>(t);
+}
+
+template <class Ty>
+_CG_QUALIFIER Ty&& forward(remove_reference_t<Ty> &&t) noexcept { // rvalue overload: rejects Ty being an lvalue reference
+    static_assert(!details::templates::is_lvalue_reference<Ty>::value, "Forwarding an rvalue as an lvalue is not allowed.");
+    return static_cast<Ty&&>(t);
+}
+
+/**
+ * is_integral
+ **/
+template <class Ty> struct _is_integral                     : public details::templates::false_type {};
+template <>         struct _is_integral<bool>               : public details::templates::true_type {};
+template <>         struct _is_integral<char>               : public details::templates::true_type {}; // NOTE(review): no specialization for signed char is listed, so is_integral<signed char> is false here - confirm intended
+template <>         struct _is_integral<unsigned char>      : public details::templates::true_type {};
+template <>         struct _is_integral<short>              : public details::templates::true_type {};
+template <>         struct _is_integral<unsigned short>     : public details::templates::true_type {};
+template <>         struct _is_integral<int>                : public details::templates::true_type {};
+template <>         struct _is_integral<unsigned int>       : public details::templates::true_type {};
+template <>         struct _is_integral<long>               : public details::templates::true_type {};
+template <>         struct _is_integral<long long>          : public details::templates::true_type {};
+template <>         struct _is_integral<unsigned long>      : public details::templates::true_type {};
+template <>         struct _is_integral<unsigned long long> : public details::templates::true_type {};
+//Vector type support?
+
+template <typename Ty>
+struct is_integral : public details::templates::_is_integral<typename details::templates::remove_cv<Ty>::type> {}; // strips cv-qualifiers, then consults the table above
+
+/**
+ * is_floating_point
+ **/
+template <class Ty> struct _is_floating_point              : public details::templates::false_type {};
+template <>         struct _is_floating_point<float>       : public details::templates::true_type {};
+template <>         struct _is_floating_point<double>      : public details::templates::true_type {};
+template <>         struct _is_floating_point<long double> : public details::templates::true_type {};
+# ifdef __CUDA_FP16_TYPES_EXIST__
+template <>         struct _is_floating_point<__half>      : public details::templates::true_type {};
+template <>         struct _is_floating_point<__half2>     : public details::templates::true_type {};
+# endif
+//Vector type support?
+
+template <typename Ty>
+struct is_floating_point : public details::templates::_is_floating_point<typename details::templates::remove_cv<Ty>::type> {}; // strips cv-qualifiers, then consults the table above
+
+template <class T>
+struct is_arithmetic : details::templates::integral_constant<
+    bool,
+    details::templates::is_integral<T>::value ||
+    details::templates::is_floating_point<T>::value> {};
+
+template <typename Ty, bool = details::templates::is_arithmetic<Ty>::value>
+struct _is_unsigned : details::templates::integral_constant<bool, Ty(0) < Ty(-1)> {}; // arithmetic types only: true when Ty(-1) compares greater than Ty(0)
+
+template <typename Ty>
+struct _is_unsigned<Ty,false> : details::templates::false_type {}; // non-arithmetic types are never unsigned
+
+template <typename Ty>
+struct is_unsigned : _is_unsigned<typename details::templates::remove_cv<Ty>::type> {};
+
+template <typename Ty> struct _is_pointer      : public details::templates::false_type {};
+template <typename Ty> struct _is_pointer<Ty*> : public details::templates::true_type {};
+
+template <typename Ty>
+struct is_pointer : _is_pointer<typename details::templates::remove_cv<Ty>::type> {};
+
+/**
+ * programmatic type traits
+ **/
+template<bool B, class Ty = void>
+struct enable_if {}; // no ::type member when B is false
+
+template<class Ty>
+struct enable_if<true, Ty> { typedef Ty type; };
+
+template<bool Cond, typename Ty = void>
+using enable_if_t = typename details::templates::enable_if<Cond, Ty>::type;
+
+template<class Ty1, class Ty2>
+struct is_same : details::templates::false_type {};
+
+template<class Ty>
+struct is_same<Ty, Ty> : details::templates::true_type {};
+
+} // templates
+#endif // _CG_CPP11_FEATURES && !_CG_USE_CUDA_STL
+
+} // details
+_CG_END_NAMESPACE
+
+
+#endif // _CG_INFO_H_
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/invoke.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/invoke.h
new file mode 100644
index 0000000000000000000000000000000000000000..f00314ce140e390be90a1ab3c328fd73d73c0d46
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/invoke.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright 1993-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CG_INVOKE_H
+#define _CG_INVOKE_H
+
+#include "info.h"
+#include "helpers.h"
+
+#if defined(_CG_CPP11_FEATURES)
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+
+    template <typename Group>
+    struct _elect_group_supported : _CG_STL_NAMESPACE::false_type {}; // default: the group type cannot use the elect path
+#ifdef _CG_HAS_INSTR_ELECT
+    template<>
+    struct _elect_group_supported<coalesced_group> : _CG_STL_NAMESPACE::true_type {};
+    template<unsigned int Size, typename Parent>
+    struct _elect_group_supported<thread_block_tile<Size, Parent>> :
+        _CG_STL_NAMESPACE::integral_constant<bool, (Size <= 32)> {}; // tiles qualify only when Size <= 32
+#endif
+
+    template <typename Group>
+    struct elect_group_supported : public _elect_group_supported<details::remove_qual<Group>> {}; // applies details::remove_qual to Group before the lookup
+
+    template<typename Group>
+    _CG_STATIC_QUALIFIER bool elect_one(const Group& group, unsigned int mask, unsigned int& leader_lane) { // `group` is unused in the body; only `mask` feeds the instruction
+        int is_leader = 0; // stays 0, so the function returns false, when _CG_HAS_INSTR_ELECT is not defined
+#ifdef _CG_HAS_INSTR_ELECT
+        asm("{\n\t"
+          " .reg .pred p;\n\t"
+          "  elect.sync %0|p, %2;\n\t"
+          " @p mov.s32 %1, 1;\n\t"
+          "}"
+          : "+r"(leader_lane), "+r"(is_leader) : "r" (mask)); // %0 = leader_lane, %1 = is_leader (set to 1 only where predicate p holds), %2 = mask
+#endif
+        return is_leader;
+    }
+
+    template<bool UseElect>
+    struct invoke_one_impl {}; // primary template is empty; the two specializations below hold the implementations
+
+    template<>
+    struct invoke_one_impl<true> { // path that picks the calling thread with elect_one()
+        template<typename Group, typename Fn, typename... Args>
+        _CG_STATIC_QUALIFIER void invoke_one(const Group& group, Fn&& fn, Args&&... args) { // calls fn(args...) on the thread for which elect_one() returns true
+            auto mask = details::_coalesced_group_data_access::get_mask(group);
+            unsigned int leader_lane = 0;
+
+            if (elect_one(group, mask, leader_lane)) {
+                _CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...);
+            }
+        }
+
+        template<typename Group, typename Fn, typename... Args>
+        _CG_STATIC_QUALIFIER auto invoke_one_broadcast(const Group& group, Fn&& fn, Args&&... args) // as invoke_one, then shuffles the result from leader_lane
+                -> typename _CG_STL_NAMESPACE::remove_reference<
+                    decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...))>::type {
+
+            using ResultType = decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...));
+            details::remove_qual<ResultType> result; // assigned only on the elected thread before the shuffle
+            auto mask = details::_coalesced_group_data_access::get_mask(group);
+            unsigned int leader_lane = 0;
+
+            if (elect_one(group, mask, leader_lane)) {
+                result = _CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...);
+            }
+
+            // Need to use low level api instead of group.shfl, because elect_one returns lane id, not group rank.
+            return tile::shuffle_dispatch<ResultType>::shfl(result, mask, leader_lane, 32);
+        }
+    };
+
+    template<>
+    struct invoke_one_impl<false> { // fallback path: the thread with thread_rank() == 0 makes the call
+        template<typename Group, typename Fn, typename... Args>
+        _CG_STATIC_QUALIFIER void invoke_one(const Group& group, Fn&& fn, Args&&... args) { // calls fn(args...) on rank 0 only
+            if (group.thread_rank() == 0) {
+                _CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...);
+            }
+        }
+
+        template<typename Group, typename Fn, typename... Args>
+        _CG_STATIC_QUALIFIER auto invoke_one_broadcast(const Group& group, Fn&& fn, Args&&... args) // rank 0 calls fn; the result is returned through group.shfl(result, 0)
+                -> typename _CG_STL_NAMESPACE::remove_reference<
+                    decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...))>::type {
+
+            using ResultType = decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...));
+            details::remove_qual<ResultType> result; // assigned only on rank 0 before the shuffle
+
+            if (group.thread_rank() == 0) {
+                result = _CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...);
+            }
+
+            return group.shfl(result, 0);
+        }
+    };
+
+
+}; // namespace details
+
+template<typename Group, typename Fn, typename... Args>
+_CG_QUALIFIER void invoke_one(const Group& group, Fn&& fn, Args&&... args) { // runs fn(args...) on one thread of `group`; the result of fn, if any, is discarded
+    using impl = details::invoke_one_impl<details::elect_group_supported<Group>::value>; // elect path when the group type supports it, rank-0 path otherwise
+    impl::invoke_one(group, _CG_STL_NAMESPACE::forward<Fn>(fn), _CG_STL_NAMESPACE::forward<Args>(args)...);
+}
+
+template<typename Fn, typename... Args>
+_CG_QUALIFIER auto invoke_one_broadcast(const coalesced_group& group, Fn&& fn, Args&&... args) // coalesced_group overload: one thread calls fn, the value is returned via a shuffle
+        -> typename _CG_STL_NAMESPACE::remove_reference<
+            decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...))>::type {
+
+    using ResultType = decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...));
+    static_assert(!_CG_STL_NAMESPACE::is_same<ResultType, void>::value, // void results are rejected at compile time
+                  "For invocables returning void invoke_one should be used instead");
+    using impl = details::invoke_one_impl<details::elect_group_supported<coalesced_group>::value>; // elect path only when _CG_HAS_INSTR_ELECT is defined
+    return impl::invoke_one_broadcast(group,
+                                      _CG_STL_NAMESPACE::forward<Fn>(fn),
+                                      _CG_STL_NAMESPACE::forward<Args>(args)...);
+}
+
+template<unsigned int Size, typename Parent, typename Fn, typename... Args>
+_CG_QUALIFIER auto invoke_one_broadcast(const thread_block_tile<Size, Parent>& group, Fn&& fn, Args&&... args) // thread_block_tile overload: one thread calls fn, the value is returned via a shuffle
+        -> typename _CG_STL_NAMESPACE::remove_reference<
+            decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...))>::type {
+
+    using ResultType = decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...));
+    static_assert(!_CG_STL_NAMESPACE::is_same<ResultType, void>::value, // void results are rejected at compile time
+                  "For invocables returning void invoke_one should be used instead");
+    using impl = details::invoke_one_impl<details::elect_group_supported<thread_block_tile<Size, Parent>>::value>; // elect path only for Size <= 32 with _CG_HAS_INSTR_ELECT defined
+    return impl::invoke_one_broadcast(group,
+                                      _CG_STL_NAMESPACE::forward<Fn>(fn),
+                                      _CG_STL_NAMESPACE::forward<Args>(args)...);
+}
+
+_CG_END_NAMESPACE
+
+#endif //_CG_CPP11_FEATURES
+
+#endif // _CG_INVOKE_H
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/memory.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/memory.h
new file mode 100644
index 0000000000000000000000000000000000000000..25fdc7646932889a4992b8d3a6249a610c76709a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/memory.h
@@ -0,0 +1,136 @@
+/* Copyright 1993-2022 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _COOPERATIVE_GROUPS_MEMORY_H_
+# define _COOPERATIVE_GROUPS_MEMORY_H_
+
+#include "info.h"
+
+_CG_BEGIN_NAMESPACE
+
+#if defined(_CG_CPP11_FEATURES)
+namespace details {
+    // Number of bytes occupied by the `reserved` member of multi_warp_scratch; also
+    // one of the terms summed by multi_warp_scratch::scratch_size_needed().
+    _CG_STATIC_CONST_DECL int scratch_num_reserved_bytes = 12;
+
+    // Should only be called for SM80+
+    //
+    // Builds a pointer from the %reserved_smem_offset_1 PTX special register: the
+    // 32-bit value is widened to 64 bits (cvt.u64.u32) and then passed through
+    // cvta.shared.u64, whose result is returned as a void*.
+    // `ptr` starts at 0, so a null pointer is returned whenever the asm statement
+    // is not compiled in (presumably on targets where NV_PROVIDES_SM_80 does not
+    // hold -- NV_IF_TARGET is defined outside this file; confirm there).
+    _CG_STATIC_QUALIFIER void* reserved_shared_ptr()
+    {
+        unsigned long long ptr = 0;
+        NV_IF_TARGET(NV_PROVIDES_SM_80,
+        (asm ("{\n\t"
+             " .reg .u32 start;\n\t"
+             " .reg .u64 extended;\n\t"
+             " mov.u32 start, %%reserved_smem_offset_1;\n\t"
+             " cvt.u64.u32 extended, start;\n\t"
+             " cvta.shared.u64 %0, extended;\n\t"
+             "}"
+             : "=l"(ptr));)
+        )
+        return reinterpret_cast<void*>(ptr);
+    }
+
+    // Scratch area used by multi-warp tiles: a small array of barriers, a reserved
+    // padding region, and one communication slot per group of 32 threads.
+    // The data members below define the in-memory layout, so their order matters.
+    struct multi_warp_scratch {
+        // One barrier per possible size of the group.
+        _CG_STATIC_CONST_DECL unsigned int memory_barriers_count = 5;
+        // Total bytes taken by the barrier array.
+        _CG_STATIC_CONST_DECL size_t sync_memory_size = memory_barriers_count * sizeof(barrier_t);
+
+        // Element type of the per-warp communication slots, and its size in bytes.
+        using communication_type = unsigned long long;
+        _CG_STATIC_CONST_DECL size_t communication_size = sizeof(communication_type);
+
+        // Layout of the scratch space:
+        barrier_t barriers[memory_barriers_count];
+        char reserved[scratch_num_reserved_bytes]; // Reserve 12 bytes for future use
+        communication_type communication_memory[default_max_block_size / 32];
+
+        // Bytes of scratch required for a block of at most `max_block_size` threads:
+        // reserved bytes + barrier bytes + one communication slot per 32 threads
+        // (integer division, so a trailing partial group of 32 is not counted).
+        _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int scratch_size_needed(unsigned int max_block_size) {
+            // One slot of collectives memory per warp.
+            return scratch_num_reserved_bytes + (unsigned int)sync_memory_size + max_block_size / 32 * (unsigned int)communication_size;
+        }
+
+        // Zeroes the barriers: each thread whose rank is below memory_barriers_count
+        // clears the barrier at index `thread_rank`; other threads do nothing.
+        _CG_QUALIFIER void init_barriers(unsigned int thread_rank) {
+            if (thread_rank < memory_barriers_count) {
+                barriers[thread_rank] = 0;
+            }
+        }
+    };
+
+// Compile-time size checks for multi_warp_scratch.
+#if defined(_CG_HAS_RESERVED_SHARED)
+    // CG can expect at least 288 bytes available in reserved shared
+    static_assert(sizeof(multi_warp_scratch) <= 288, "multi-warp scratch size is too large");
+#endif
+
+    // Make sure the structure can fit into the user provided memory
+    // (scratch_size_needed(default_max_block_size) is the size of the buffer held by
+    // block_tile_memory with its default template argument).
+    static_assert(sizeof(multi_warp_scratch) <= multi_warp_scratch::scratch_size_needed(default_max_block_size),
+                  "multi-warp scratch size is too large");
+
+
+    // Returns the scratch area to use, viewed as a multi_warp_scratch: the pointer
+    // from reserved_shared_ptr() on the NV_PROVIDES_SM_80 branch, otherwise the
+    // caller-supplied `user_scratch`.
+    _CG_QUALIFIER multi_warp_scratch* get_scratch_ptr(void* user_scratch) {
+        void* scratch_base;
+        NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+            (scratch_base = reserved_shared_ptr();)
+        ,
+            (scratch_base = user_scratch;)
+        )
+        return static_cast<multi_warp_scratch*>(scratch_base);
+    }
+
+}
+
+// Storage object for multi-warp tile scratch space.
+// When _CG_HAS_RESERVED_SHARED is not defined it owns a char buffer of
+// details::multi_warp_scratch::scratch_size_needed(MaxBlockSize) bytes; when the
+// macro is defined the struct has no data members. The type is aligned to
+// details::multi_warp_scratch::communication_size.
+template <unsigned int MaxBlockSize = details::default_max_block_size>
+struct __align__(details::multi_warp_scratch::communication_size) block_tile_memory {
+private:
+#if !defined(_CG_HAS_RESERVED_SHARED)
+    char scratch[details::multi_warp_scratch::scratch_size_needed(MaxBlockSize)];
+#endif
+};
+#endif
+
+_CG_END_NAMESPACE
+
+#endif /* !_COOPERATIVE_GROUPS_MEMORY_H_ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/partitioning.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/partitioning.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe66fb926876c045b49a2c8f0b83379dae7599fa
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/partitioning.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CG_PARTITIONING_H
+#define _CG_PARTITIONING_H
+
+#include "info.h"
+#include "helpers.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+
+    // Splits `tile` by a boolean predicate and returns the coalesced_group holding
+    // the lanes that voted the same way as the calling thread.
+    //  - If every lane of the tile voted alike, the result covers the whole tile and
+    //    its meta group is set to (0, 1).
+    //  - Otherwise the result covers only the like-voting lanes and its meta group
+    //    is set to (pred, 2).
+    template <typename TyGroup>
+    _CG_STATIC_QUALIFIER coalesced_group _binary_partition(const TyGroup &tile, bool pred) {
+        const unsigned int memberMask = _coalesced_group_data_access::get_mask(tile);
+        // Lanes of the tile whose predicate is true.
+        const unsigned int trueMask = __ballot_sync(memberMask, pred);
+
+        // Unanimous vote: there is nothing to split.
+        if (trueMask == memberMask || trueMask == 0) {
+            coalesced_group whole = _coalesced_group_data_access::construct_from_mask<coalesced_group>(memberMask);
+            _coalesced_group_data_access::modify_meta_group(whole, 0, 1);
+            return whole;
+        }
+
+        // Keep the lanes that agree with this thread: the true-voters when pred is
+        // set, the remaining members otherwise.
+        const unsigned int sameVoteMask = memberMask & (pred ? trueMask : ~trueMask);
+        coalesced_group part = _coalesced_group_data_access::construct_from_mask<coalesced_group>(sameVoteMask);
+        _coalesced_group_data_access::modify_meta_group(part, pred, 2);
+        return part;
+    }
+
+#if defined(_CG_HAS_MATCH_COLLECTIVE) && defined(_CG_CPP11_FEATURES)
+    // Generic dispatcher behind labeled_partition(): returns the coalesced_group of
+    // the lanes in `tile` that passed the same `pred` value as the calling thread.
+    // The group's meta rank is the number of sub-group leaders in lower lanes, and
+    // its meta size is the total number of leaders.
+    template <typename TyPredicate>
+    struct _labeled_partition_dispatch {
+        template <typename TyGroup>
+        _CG_QUALIFIER coalesced_group operator()(const TyGroup &tile, TyPredicate pred) {
+            unsigned int thisMask = _coalesced_group_data_access::get_mask(tile);
+            // Lanes of the tile holding the same label as this thread.
+            unsigned int subMask  = __match_any_sync(thisMask, pred);
+            unsigned int laneId   = details::laneid();
+
+            coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(subMask);
+
+            // The lowest set lane of each sub-group is its leader. Kept unsigned so
+            // that the comparison with laneId and the shift below stay in unsigned
+            // arithmetic.
+            unsigned int leaderLaneId = static_cast<unsigned int>(__ffs(subMask) - 1);
+            bool isLeader = leaderLaneId == laneId;
+            unsigned int leaderMask = __ballot_sync(thisMask, isLeader);
+
+            // Count leaders with lower laneid, that will be the meta rank of this tile.
+            // The mask is built from 1u: with a signed 1, a leader in lane 31 would
+            // make (1 << 31) - 1 overflow int.
+            unsigned int tileRank = __popc(leaderMask & ((1u << leaderLaneId) - 1u));
+
+            _coalesced_group_data_access::modify_meta_group(subTile, tileRank, __popc(leaderMask));
+
+            return subTile;
+        }
+    };
+
+    // bool labels: there are only two possible values, so the call is forwarded to
+    // _binary_partition instead of using the generic dispatcher.
+    template <>
+    struct _labeled_partition_dispatch<bool> {
+        template <typename TyGroup>
+        _CG_QUALIFIER coalesced_group operator()(const TyGroup &tile, bool pred) {
+            return _binary_partition(tile, pred);
+        }
+    };
+
+    // Pointer labels: the pointer value is reinterpreted as an unsigned long long
+    // and handled by the unsigned long long dispatcher.
+    template <typename TyPredicate>
+    struct _labeled_partition_dispatch<TyPredicate*> {
+        template <typename TyGroup>
+        _CG_QUALIFIER coalesced_group operator()(const TyGroup &tile, TyPredicate* pred) {
+            using IntegerDispatch = _labeled_partition_dispatch<unsigned long long>;
+            const unsigned long long label = reinterpret_cast<unsigned long long>(pred);
+            return IntegerDispatch()(tile, label);
+        }
+    };
+#endif
+}; // namespace details
+
+// binary_partition for a coalesced_group: forwards to details::_binary_partition,
+// which returns the sub-group of threads sharing the caller's `pred` value.
+_CG_STATIC_QUALIFIER coalesced_group binary_partition(const coalesced_group &tile, bool pred) {
+    return details::_binary_partition(tile, pred);
+}
+
+// binary_partition for a thread_block_tile: forwards to details::_binary_partition.
+// When _CG_CPP11_FEATURES is defined, tiles with Size greater than 32 are rejected
+// at compile time.
+template <unsigned int Size, typename ParentT>
+_CG_STATIC_QUALIFIER coalesced_group binary_partition(const thread_block_tile<Size, ParentT> &tile, bool pred) {
+#ifdef _CG_CPP11_FEATURES
+    static_assert(Size <= 32, "Binary partition is available only for tiles of size smaller or equal to 32");
+#endif
+    return details::_binary_partition(tile, pred);
+}
+
+
+#if defined(_CG_HAS_MATCH_COLLECTIVE) && defined(_CG_CPP11_FEATURES)
+// labeled_partition for a coalesced_group.
+// `pred` must be of integral or pointer type (checked at compile time). The work is
+// done by the details::_labeled_partition_dispatch specialization chosen for
+// details::remove_qual<TyPredicate>.
+template <typename TyPredicate>
+_CG_STATIC_QUALIFIER coalesced_group labeled_partition(const coalesced_group &tile, TyPredicate pred) {
+    static_assert(_CG_STL_NAMESPACE::is_integral<TyPredicate>::value ||
+                  _CG_STL_NAMESPACE::is_pointer<TyPredicate>::value,
+                  "labeled_partition predicate must be an integral or pointer type");
+    using Dispatcher = details::_labeled_partition_dispatch<details::remove_qual<TyPredicate>>;
+    return Dispatcher()(tile, pred);
+}
+
+// labeled_partition for a thread_block_tile.
+// `pred` must be of integral or pointer type and Size must not exceed 32 (both
+// checked at compile time). The work is done by the
+// details::_labeled_partition_dispatch specialization chosen for
+// details::remove_qual<TyPredicate>.
+template <typename TyPredicate, unsigned int Size, typename ParentT>
+_CG_STATIC_QUALIFIER coalesced_group labeled_partition(const thread_block_tile<Size, ParentT> &tile, TyPredicate pred) {
+    static_assert(_CG_STL_NAMESPACE::is_integral<TyPredicate>::value ||
+                  _CG_STL_NAMESPACE::is_pointer<TyPredicate>::value,
+                  "labeled_partition predicate must be an integral or pointer type");
+    static_assert(Size <= 32, "Labeled partition is available only for tiles of size smaller or equal to 32");
+    using Dispatcher = details::_labeled_partition_dispatch<details::remove_qual<TyPredicate>>;
+    return Dispatcher()(tile, pred);
+}
+#endif
+
+_CG_END_NAMESPACE
+
+#endif // _CG_PARTITIONING_H
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/reduce.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..b89d7fbaf340c1febd6c8a7de13f346daa1a1371
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/reduce.h
@@ -0,0 +1,424 @@
+ /* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_REDUCE_H_
+#define _CG_REDUCE_H_
+
+#include "info.h"
+#include "helpers.h"
+#include "coalesced_reduce.h"
+#include "functional.h"
+#include "cooperative_groups.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+
+    // Type predicates for the redux fast path: true for integral types of at most
+    // 4 bytes, false for everything else.
+    template <class Ty>
+    using _redux_is_add_supported = _CG_STL_NAMESPACE::integral_constant<
+            bool,
+            _CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) <= 4)>;
+
+    template <class Ty>
+    using redux_is_add_supported = _redux_is_add_supported<Ty>;
+
+    // A specialization for 64 bit logical operations is possible
+    // but for now only accelerate 32 bit bitwise ops
+    template <class Ty>
+    using redux_is_logical_supported = redux_is_add_supported<Ty>;
+
+    // Base operator support case
+    // _redux_op_supported<TyOp, Ty> is false by default. It is specialised for the
+    // cooperative_groups functors plus, less, greater, bit_and, bit_or and bit_xor
+    // instantiated on Ty, where it takes the value of the matching type predicate.
+    template <class TyOp, class Ty> struct _redux_op_supported                 : public _CG_STL_NAMESPACE::false_type {};
+    template <class Ty> struct _redux_op_supported<cooperative_groups::plus<Ty>,    Ty> : public redux_is_add_supported<Ty> {};
+    template <class Ty> struct _redux_op_supported<cooperative_groups::less<Ty>,    Ty> : public redux_is_add_supported<Ty> {};
+    template <class Ty> struct _redux_op_supported<cooperative_groups::greater<Ty>, Ty> : public redux_is_add_supported<Ty> {};
+    template <class Ty> struct _redux_op_supported<cooperative_groups::bit_and<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
+    template <class Ty> struct _redux_op_supported<cooperative_groups::bit_or<Ty>,  Ty> : public redux_is_logical_supported<Ty> {};
+    template <class Ty> struct _redux_op_supported<cooperative_groups::bit_xor<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
+
+    // Convenience form taking the functor template and value type separately;
+    // qualifiers are stripped from TyOp<Ty> before the lookup.
+    template <class Ty, template <class> class TyOp>
+    using redux_op_supported = _redux_op_supported<
+            typename details::remove_qual<TyOp<Ty>>,
+            Ty>;
+
+    // Groups smaller than 16 actually have worse performance characteristics when used with redux
+    // tiles of size 16 and 32 perform the same or better and have better code generation profiles
+    //
+    // _redux_group_optimized<TyGroup> is false by default, true for
+    // thread_block_tile / internal_thread_block_tile with Sz >= 16, and always true
+    // for coalesced_group.
+    template <class TyGroup> struct _redux_group_optimized : public _CG_STL_NAMESPACE::false_type {};
+
+    template <unsigned int Sz, typename TyPar>
+    struct _redux_group_optimized<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::integral_constant<
+                                                                                            bool,
+                                                                                            (Sz >= 16)> {};
+    template <unsigned int Sz, typename TyPar>
+    struct _redux_group_optimized<internal_thread_block_tile<Sz, TyPar>>            : public _CG_STL_NAMESPACE::integral_constant<
+                                                                                            bool,
+                                                                                            (Sz >= 16)> {};
+    template <>
+    struct _redux_group_optimized<cooperative_groups::coalesced_group>              : public _CG_STL_NAMESPACE::true_type  {};
+
+    // Same trait, applied after stripping qualifiers from TyGroup.
+    template <typename TyGroup>
+    using redux_group_optimized = _redux_group_optimized<details::remove_qual<TyGroup>>;
+
+    // pick_redux<TyOp>(mask, val): maps a cooperative_groups functor template onto
+    // the matching __reduce_*_sync intrinsic. Overloads are declared for int and
+    // unsigned int values and specialised below for plus (add), less (min),
+    // greater (max), bit_and, bit_xor and bit_or.
+    // Each specialisation calls the intrinsic on the NV_PROVIDES_SM_80 branch and
+    // returns 0 on the other branch.
+    template <template <class> class TyOp>
+    _CG_STATIC_QUALIFIER int pick_redux(int mask, int val);
+    template <template <class> class TyOp>
+    _CG_STATIC_QUALIFIER unsigned int pick_redux(int mask, unsigned int val);
+
+    // Signed (int) specialisations. The bitwise variants cast the intrinsic's
+    // result back to int explicitly.
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::plus>(int mask, int val) {
+        NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_add_sync(mask, val);), return 0;)
+    }
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::less>(int mask, int val) {
+        NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_min_sync(mask, val);), return 0;)
+    }
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::greater>(int mask, int val) {
+        NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_max_sync(mask, val);), return 0;)
+    }
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_and>(int mask, int val) {
+        NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return static_cast<int>(__reduce_and_sync(mask, val));), return 0;)
+    }
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_xor>(int mask, int val) {
+        NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return static_cast<int>(__reduce_xor_sync(mask, val));), return 0;)
+    }
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_or>(int mask, int val) {
+        NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return static_cast<int>(__reduce_or_sync(mask, val));), return 0;)
+    }
+
+    // Unsigned (unsigned int) specialisations.
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::plus>(int mask, unsigned int val) {
+        NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_add_sync(mask, val);), return 0;)
+    }
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::less>(int mask, unsigned int val) {
+        NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_min_sync(mask, val);), return 0;)
+    }
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::greater>(int mask, unsigned int val) {
+        NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_max_sync(mask, val);), return 0;)
+    }
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_and>(int mask, unsigned int val) {
+        NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_and_sync(mask, val);), return 0;)
+    }
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_xor>(int mask, unsigned int val) {
+        NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_xor_sync(mask, val);), return 0;)
+    }
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_or>(int mask, unsigned int val) {
+        NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_or_sync(mask, val);), return 0;)
+    }
+
+    // _accelerated_op<TyVal>::redux<TyOp>(mask, val) converts `val` to int (signed
+    // TyVal) or unsigned int (unsigned TyVal), runs pick_redux<TyOp> on it and casts
+    // the result back to TyVal. The second template parameter defaults to
+    // is_unsigned<TyVal>::value and selects between the two specialisations.
+    template <typename TyVal, bool = _CG_STL_NAMESPACE::is_unsigned<TyVal>::value>
+    struct _accelerated_op;
+
+    // Signed type redux intrinsic dispatch
+    template <typename TyVal>
+    struct _accelerated_op<TyVal, false> {
+        template <template <class> class TyOp>
+        _CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
+            return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<int>(val)));
+        }
+    };
+
+    // Unsigned type redux intrinsic dispatch
+    template <typename TyVal>
+    struct _accelerated_op<TyVal, true> {
+        template <template <class> class TyOp>
+        _CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
+            return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<unsigned int>(val)));
+        }
+    };
+
+    // Public spelling that relies on the defaulted signedness parameter.
+    template <typename TyVal>
+    using accelerated_op = _accelerated_op<TyVal>;
+
+
+    // Compile-time dispatcher between the redux-intrinsic path and the
+    // coalesced_reduce path.
+    //   TyVal     - (possibly reference-qualified) type of the value being reduced
+    //   TyFnInput - value type the operator functor is instantiated on
+    //   TyGroup   - group type performing the reduction
+    // The overloads of reduce() are selected with enable_if on whether the operator
+    // is redux-supported for TyFnInput and the group type is redux-optimized.
+    template <typename TyVal, typename TyFnInput, typename TyGroup>
+    class _redux_dispatch {
+        // True when both the (operator, type) pair and the group qualify for redux.
+        template <class Ty, template <class> class TyOp>
+        using _redux_is_usable = _CG_STL_NAMESPACE::integral_constant<bool,
+            redux_op_supported<Ty, TyOp>::value &&
+            redux_group_optimized<TyGroup>::value>;
+
+        // SFINAE helpers: each names void* only when its condition holds, so they
+        // can be used as defaulted non-type template parameters (= nullptr).
+        template <class Ty, template <class> class TyOp>
+        using redux_is_usable = typename _CG_STL_NAMESPACE::enable_if<_redux_is_usable<Ty, TyOp>::value, void>::type*;
+
+        template <class Ty, template <class> class TyOp>
+        using redux_is_not_usable = typename _CG_STL_NAMESPACE::enable_if<!_redux_is_usable<Ty, TyOp>::value, void>::type*;
+
+    public:
+        // Dispatch to redux if the combination of op and args are supported
+        // (overload taking the operator as an rvalue).
+        template<
+            template <class> class TyOp,
+            redux_is_usable<TyFnInput, TyOp> = nullptr>
+        _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
+            NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+                // Retrieve the mask for the group and dispatch to redux
+                return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
+            ,
+                // Arch does not support redux, fallback to shuffles
+                return coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
+            )
+        }
+
+        // Same as above for an operator passed as a non-const lvalue.
+        template<
+            template <class> class TyOp,
+            redux_is_usable<TyFnInput, TyOp> = nullptr>
+        _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
+            NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+                // Retrieve the mask for the group and dispatch to redux
+                return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
+            ,
+                // Arch does not support redux, fallback to shuffles
+                return coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
+            )
+        }
+
+        // Fallback shuffle sync reduction
+        // (chosen when redux is not usable; only an rvalue-operator overload exists).
+        template <
+            template <class> class TyOp,
+            redux_is_not_usable<TyFnInput, TyOp> = nullptr>
+        _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
+            //Dispatch to fallback shuffle sync accelerated reduction
+            return coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
+        }
+
+    };
+
+    // Group support for reduce.
+    // False by default; true for thread_block_tile and internal_thread_block_tile of
+    // any size and for coalesced_group.
+    template <class TyGroup> struct _reduce_group_supported : public _CG_STL_NAMESPACE::false_type {};
+
+    template <unsigned int Sz, typename TyPar>
+    struct _reduce_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
+    template <unsigned int Sz, typename TyPar>
+    struct _reduce_group_supported<internal_thread_block_tile<Sz, TyPar>>            : public _CG_STL_NAMESPACE::true_type {};
+    template <>
+    struct _reduce_group_supported<cooperative_groups::coalesced_group>              : public _CG_STL_NAMESPACE::true_type {};
+
+    // Same trait, applied after stripping qualifiers from TyGroup.
+    template <typename TyGroup>
+    using reduce_group_supported = _reduce_group_supported<details::remove_qual<TyGroup>>;
+
+    // Reduction entry for operators of the form TyOp<TyFnInput> passed as rvalues.
+    // Verifies at compile time that the operator's value type matches the argument
+    // type, then lets _redux_dispatch choose the implementation.
+    template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
+    _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
+        static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
+
+        return details::_redux_dispatch<TyVal, TyFnInput, TyGroup>::reduce(
+            group,
+            _CG_STL_NAMESPACE::forward<TyVal>(val),
+            _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
+    }
+
+    // Reduction entry for operators of the form TyOp<TyFnInput> passed as non-const
+    // lvalues. Verifies at compile time that the operator's value type matches the
+    // argument type, then lets _redux_dispatch choose the implementation; the
+    // operator is handed on through forward<TyOp<TyFnInput>>.
+    template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
+    _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
+        static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
+
+        return details::_redux_dispatch<TyVal, TyFnInput, TyGroup>::reduce(
+            group,
+            _CG_STL_NAMESPACE::forward<TyVal>(val),
+            _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
+    }
+
+
+    // Catch-all overload for any other callable `op`: goes straight to
+    // details::coalesced_reduce without consulting _redux_dispatch.
+    template <typename TyVal, typename TyOp, typename TyGroup>
+    _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
+        return details::coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
+    }
+
+    // Selects the reduce implementation by group id; specialised per id below.
+    template <unsigned int GroupId>
+    struct tile_reduce_dispatch;
+
+    // coalesced_group_id: the reduction is delegated directly to details::reduce.
+    template <>
+    struct tile_reduce_dispatch<details::coalesced_group_id> {
+        template <typename TyGroup, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            return details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+        }
+    };
+
+#if defined(_CG_CPP11_FEATURES)
+    // multi_tile_group_id: reduction for thread_block_tile groups handled as
+    // Size / 32 warps. Two callbacks are built and passed to
+    // details::multi_warp_collectives_helper (defined outside this file):
+    //   warp_lambda       - reduces `val` within one 32-thread tile and stores the
+    //                       result at the scratch location it is given;
+    //   inter_warp_lambda - reduces the values found at the scratch locations across
+    //                       a tile of num_warps threads, writing the result back.
+    template <>
+    struct tile_reduce_dispatch<details::multi_tile_group_id> {
+        template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER auto reduce(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
+            using TyRet = details::remove_qual<TyVal>;
+            const unsigned int num_warps = Size / 32;
+
+            // `op` is passed as an lvalue here because it is used again below.
+            auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
+                    *warp_scratch_location =
+                        details::reduce(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
+            };
+            auto inter_warp_lambda =
+                [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
+                    *thread_scratch_location =
+                        details::reduce(subwarp, *thread_scratch_location, _CG_STL_NAMESPACE::forward<TyFn>(op));
+            };
+            return details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
+        }
+    };
+
+    // Selects the asynchronous reduce implementation by group id; specialised per
+    // id below.
+    template <unsigned int GroupId>
+    struct tile_async_reduce_dispatch;
+
+    // coalesced_group_id: performs the ordinary in-group reduction, then the thread
+    // with rank 0 passes the result to `res_handler`. `dst` is not used in this body.
+    template <>
+    struct tile_async_reduce_dispatch<details::coalesced_group_id> {
+        template <typename GroupT, typename TyDst, typename TyVal, typename TyFn, typename TyResHandler>
+        _CG_STATIC_QUALIFIER void reduce(const GroupT& group, TyDst& dst, TyVal&& val, TyFn&& op, TyResHandler& res_handler) {
+            // Do regular, in group reduction
+            auto result = details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
+
+            // One thread stores/updates the destination
+            if (group.thread_rank() == 0) {
+                res_handler(result);
+            }
+        }
+    };
+
+    // multi_tile_group_id: asynchronous reduction for thread_block_tile groups
+    // handled as TySize / 32 warps.
+    //   1. Each 32-thread tile reduces its values and writes the partial result to
+    //      the scratch slot indexed by group.thread_rank() / 32.
+    //   2. Threads for which details::sync_warps_last_releases() returns true run
+    //      the final step: a sub-tile of num_warps threads (meta_group_rank 0) reads
+    //      the per-warp partials, releases the other warps, reduces the partials
+    //      and has its rank-0 thread call `res_handler` with the result.
+    // `dst` is not used in this body. The order of the sync calls is significant;
+    // do not reorder them.
+    template <>
+    struct tile_async_reduce_dispatch<details::multi_tile_group_id> {
+        template <unsigned int TySize, typename ParentT, typename TyDst, typename TyInputVal, typename TyFn, typename TyResHandler>
+        _CG_STATIC_QUALIFIER void reduce(const thread_block_tile<TySize, ParentT>& group, TyDst& dst, TyInputVal&& val, TyFn&& op, TyResHandler& res_handler) {
+            using TyVal = remove_qual<TyInputVal>;
+            const unsigned int num_warps = TySize / 32;
+            details::barrier_t* sync_location = multi_warp_sync_location_getter(group);
+            auto warp_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, group.thread_rank() / 32);
+
+            // Do in warp reduce
+            auto warp = details::tiled_partition_internal<32, thread_block_tile<TySize, ParentT>>();
+            *warp_scratch_location = details::reduce(warp, _CG_STL_NAMESPACE::forward<TyInputVal>(val), op);
+
+            // Tile of size num_warps from the last warp to arrive does final reduction step
+            if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), num_warps)) {
+                auto subwarp = details::tiled_partition_internal<num_warps, decltype(warp)>();
+                if (subwarp.meta_group_rank() == 0) {
+                    auto thread_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, subwarp.thread_rank());
+                    // Copy the partial out of scratch before the other warps are released.
+                    auto thread_val = *thread_scratch_location;
+                    // Release other warps, we read their contribution already.
+                    subwarp.sync();
+                    details::sync_warps_release(sync_location, subwarp.thread_rank() == 0, details::cta::thread_rank(), num_warps);
+                    TyVal result = details::reduce(subwarp, thread_val, op);
+                    // One thread stores the result or updates the atomic
+                    if (subwarp.thread_rank() == 0) {
+                        res_handler(result);
+                    }
+                }
+                warp.sync();
+            }
+        }
+    };
+#endif
+
+    template <typename TyGroup, typename TyInputVal, typename TyRetVal>
+    _CG_QUALIFIER void check_reduce_params() { // Compile-time validation only: the body consists solely of static_asserts
+        static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ"); // op must return the type it consumes
+        static_assert(details::reduce_group_supported<TyGroup>::value, "This group does not exclusively represent a tile"); // group kind must be supported by reduce
+    }
+
+    template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>
+    _CG_QUALIFIER void check_async_reduce_params() { // Compile-time checks for the async variants: the reduce checks plus a destination type check
+        check_reduce_params<TyGroup, TyInputVal, TyRetVal>(); // operator type and group kind
+        static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ"); // dst must hold the same type as the input
+    }
+} // details
+
+template <typename TyGroup, typename TyVal, typename TyFn>
+_CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) { // Public entry point: returns the dispatch's reduction of val over group with op
+    details::check_reduce_params<TyGroup, details::remove_qual<TyVal>, decltype(op(val, val))>(); // compile-time type/group checks
+
+    using dispatch = details::tile_reduce_dispatch<TyGroup::_group_id>; // implementation chosen by the group's _group_id
+    return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+
+#if defined(_CG_CPP11_FEATURES)
+
+# if defined(_CG_HAS_STL_ATOMICS)
+template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
+void _CG_QUALIFIER reduce_update_async(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) { // Reduces val over group with op, then folds the result into the cuda::atomic dst
+    details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>(); // compile-time type/group checks
+    auto update_lambda = [&] (TyVal& result) { // result handler the dispatch calls with the reduced value
+        details::atomic_update(dst, result, op);
+    };
+    using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>; // implementation chosen by the group's _group_id
+    dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), update_lambda);
+}
+
+template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
+void _CG_QUALIFIER reduce_update_async(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) { // Same as the cuda::atomic overload, but the destination is a cuda::atomic_ref
+    details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>(); // compile-time type/group checks
+    auto update_lambda = [&] (TyVal& result) { // result handler the dispatch calls with the reduced value
+        details::atomic_update(dst, result, op);
+    };
+    using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>; // implementation chosen by the group's _group_id
+    dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), update_lambda);
+}
+
+template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
+void _CG_QUALIFIER reduce_store_async(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) { // Reduces val over group with op, then stores the result into the cuda::atomic dst
+    details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>(); // compile-time type/group checks
+    auto store_lambda = [&] (TyVal& result) { // result handler: stores via atomic_store, without combining with dst's prior value through op
+        details::atomic_store(dst, result);
+    };
+    using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>; // implementation chosen by the group's _group_id
+    dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), store_lambda);
+}
+
+template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
+void _CG_QUALIFIER reduce_store_async(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) { // Same as the cuda::atomic overload, but the destination is a cuda::atomic_ref
+    details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>(); // compile-time type/group checks
+    auto store_lambda = [&] (TyVal& result) { // result handler: stores via atomic_store, without combining with dst's prior value through op
+        details::atomic_store(dst, result);
+    };
+    using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>; // implementation chosen by the group's _group_id
+    dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), store_lambda);
+}
+# endif
+
+template<typename TyGroup, typename TyVal, typename TyInputVal, typename TyFn>
+void _CG_QUALIFIER reduce_store_async(const TyGroup& group, TyVal* dst, TyInputVal&& val, TyFn&& op) { // Raw-pointer overload: the reduced value is written with a plain assignment through dst
+    details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>(); // compile-time type/group checks
+    auto store_lambda = [&] (TyVal& result) { // result handler the dispatch calls with the reduced value
+        *dst = result;
+    };
+    using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>; // implementation chosen by the group's _group_id
+    dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), store_lambda);
+}
+#endif
+
+_CG_END_NAMESPACE
+
+#endif // _CG_REDUCE_H_
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/scan.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..96d68350e48307d120289e22872abc66f5188115
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/scan.h
@@ -0,0 +1,320 @@
+/* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_SCAN_H_
+#define _CG_SCAN_H_
+
+#include "info.h"
+#include "helpers.h"
+#include "functional.h"
+#include "coalesced_scan.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+
+    // Group support for scan: false by default, true only for the group types specialized below.
+    template <class TyGroup> struct _scan_group_supported : public _CG_STL_NAMESPACE::false_type {};
+
+    template <unsigned int Sz, typename TyPar>
+    struct _scan_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
+    template <unsigned int Sz, typename TyPar>
+    struct _scan_group_supported<internal_thread_block_tile<Sz, TyPar>>            : public _CG_STL_NAMESPACE::true_type {};
+    template <>
+    struct _scan_group_supported<cooperative_groups::coalesced_group>              : public _CG_STL_NAMESPACE::true_type {};
+
+    template <typename TyGroup>
+    using scan_group_supported = _scan_group_supported<details::remove_qual<TyGroup>>; // TyGroup goes through remove_qual before the trait lookup
+
+    template <bool IsIntegralPlus>
+    struct integral_optimized_scan;
+
+    enum class ScanType { exclusive, inclusive };
+
+    template <unsigned int GroupId,  ScanType TyScan>
+    struct scan_dispatch;
+
+    template <ScanType TyScan>
+    struct scan_dispatch<details::coalesced_group_id, TyScan> { // Scan implementation for groups whose _group_id is coalesced_group_id
+        template <typename TyGroup, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            auto scan_result = coalesced_inclusive_scan(group, val, op); // the inclusive scan is always computed first
+            if (TyScan == ScanType::exclusive) { // TyScan is a template parameter, so this condition is a compile-time constant
+                scan_result = convert_inclusive_to_exclusive(group,
+                                                             scan_result,
+                                                             _CG_STL_NAMESPACE::forward<TyVal>(val),
+                                                             _CG_STL_NAMESPACE::forward<TyFn>(op));
+            }
+            return scan_result;
+        }
+    };
+
+#if defined(_CG_CPP11_FEATURES)
+    template <ScanType TyScan>
+    struct scan_dispatch<details::multi_tile_group_id, TyScan> { // Scan implementation taking a thread_block_tile, processed as 32-thread warps
+        template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
+            using TyRet = details::remove_qual<TyVal>;
+            const unsigned int num_warps = Size / 32; // number of 32-thread warps in the tile
+            // In-warp scan result, assigned inside warp_lambda (captured by reference)
+            TyRet warp_scan;
+
+            // In-warp scan; the last thread of each warp writes the warp total to warp_scratch_location
+            auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
+                warp_scan = // inclusive scan within this warp
+                    details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
+                if (warp.thread_rank() + 1 == warp.size()) {
+                    *warp_scratch_location = warp_scan;
+                }
+                if (TyScan == ScanType::exclusive) {
+                    warp_scan = warp.shfl_up(warp_scan, 1); // shift results up by one rank for the exclusive variant
+                }
+            };
+
+            // Tile of size num_warps performing the final scan part (exclusive scan of warp sums); other threads will add it
+            // to their in-warp scan result
+            auto inter_warp_lambda =
+                [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
+                    auto thread_val = *thread_scratch_location;
+                    auto result = coalesced_inclusive_scan(subwarp, thread_val, op);
+                    *thread_scratch_location = convert_inclusive_to_exclusive(subwarp, result, thread_val, op);
+            };
+
+            TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda); // presumably the scratch value produced for this thread's warp - confirm in the helper
+            if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) { // exclusive scan, rank 0 of a warp: only the preceding warps contribute
+                return previous_warps_sum;
+            }
+            if (warpType::meta_group_rank() == 0) { // first warp: no preceding warps to add
+                return warp_scan;
+            }
+            else {
+                return op(warp_scan, previous_warps_sum);
+            }
+        }
+    };
+
+#if defined(_CG_HAS_STL_ATOMICS)
+    template <unsigned int GroupId,  ScanType TyScan>
+    struct scan_update_dispatch;
+
+    template <ScanType TyScan>
+    struct scan_update_dispatch<details::coalesced_group_id, TyScan> { // Scan that also merges the group total into the atomic dst
+        template <typename TyGroup, typename TyAtomic, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            details::remove_qual<TyVal> old; // only assigned by the last thread below, then broadcast to the others
+
+            // Do regular in-group scan
+            auto scan_result = details::coalesced_inclusive_scan(group, val, op);
+
+            // Last thread updates the atomic and distributes the value returned by atomic_update to other threads
+            if (group.thread_rank() == group.size() - 1) { // this thread's inclusive result is what gets merged into dst
+                old = atomic_update(dst, scan_result, _CG_STL_NAMESPACE::forward<TyFn>(op));
+            }
+            old = group.shfl(old, group.size() - 1);
+            if (TyScan == ScanType::exclusive) {
+                scan_result = convert_inclusive_to_exclusive(group, scan_result, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
+            }
+            scan_result = op(old, scan_result); // offset every thread's result by the value obtained from the atomic
+            return scan_result;
+        }
+    };
+
+    template <ScanType TyScan>
+    struct scan_update_dispatch<details::multi_tile_group_id, TyScan> { // Multi-warp scan that also merges the tile total into the atomic dst
+        template <unsigned int Size, typename ParentT, typename TyAtomic, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
+            using TyRet = details::remove_qual<TyVal>;
+            const unsigned int num_warps = Size / 32; // number of 32-thread warps in the tile
+            // In-warp scan result, assigned inside warp_lambda (captured by reference)
+            TyRet warp_scan;
+
+            // In-warp scan; the last thread of each warp writes the warp total to warp_scratch_location
+            auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
+                warp_scan = // inclusive scan within this warp
+                    details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
+                if (warp.thread_rank() + 1 == warp.size()) {
+                    *warp_scratch_location = warp_scan;
+                }
+                if (TyScan == ScanType::exclusive) {
+                    warp_scan = warp.shfl_up(warp_scan, 1); // shift results up by one rank for the exclusive variant
+                }
+            };
+
+            // Tile of size num_warps performing the final scan part (exclusive scan of warp sums); other threads will add it
+            // to their in-warp scan result
+            auto inter_warp_lambda =
+                [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
+                    auto thread_val = *thread_scratch_location;
+                    auto scan_result = details::coalesced_inclusive_scan(subwarp, thread_val, op);
+                    TyRet offset;
+                    // A single thread does the atomic update with the sum of all contributions and reads the old value.
+                    if (subwarp.thread_rank() == subwarp.size() - 1) {
+                        offset = details::atomic_update(dst, scan_result, op);
+                    }
+                    offset = subwarp.shfl(offset, subwarp.size() - 1);
+                    scan_result = convert_inclusive_to_exclusive(subwarp, scan_result, thread_val, op);
+                    // Add the offset read from the atomic to the scanned warp sum.
+                    // Skip the first thread: it got a default-constructed value from the conversion,
+                    // so it should just return the offset received from the thread that did the atomic update.
+                    if (subwarp.thread_rank() != 0) {
+                        offset = op(scan_result, offset);
+                    }
+                    *thread_scratch_location = offset;
+            };
+
+            TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda); // presumably the scratch value produced for this thread's warp - confirm in the helper
+            if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) { // exclusive scan, rank 0 of a warp: no in-warp contribution to add
+                return previous_warps_sum;
+            }
+            return op(warp_scan, previous_warps_sum);
+        }
+    };
+#endif
+#endif
+
+    template <typename TyGroup, typename TyInputVal, typename TyRetVal>
+    _CG_QUALIFIER void check_scan_params() { // Compile-time validation only: the body consists solely of static_asserts
+        static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ"); // op must return the type it consumes
+        static_assert(details::scan_group_supported<TyGroup>::value, "This group does not exclusively represent a tile"); // group kind must be supported by scan
+    }
+
+#if defined(_CG_HAS_STL_ATOMICS)
+    template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>
+    _CG_QUALIFIER void check_scan_update_params() { // Compile-time checks for the *_scan_update variants: the scan checks plus a destination type check
+        check_scan_params<TyGroup, TyInputVal, TyRetVal>(); // operator type and group kind
+        static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ"); // dst must hold the same type as the input
+    }
+#endif
+
+} // details
+
+template <typename TyGroup, typename TyVal, typename TyFn>
+_CG_QUALIFIER auto inclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) { // Public entry point: inclusive scan of val over group with op
+    details::check_scan_params<TyGroup, details::remove_qual<TyVal>, decltype(op(val, val))>(); // value type is stripped with remove_qual first, as reduce() and the *_scan_update entry points do
+
+    using dispatch = details::scan_dispatch<TyGroup::_group_id, details::ScanType::inclusive>; // implementation chosen by the group's _group_id
+    return dispatch::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+
+template <typename TyGroup, typename TyVal>
+_CG_QUALIFIER details::remove_qual<TyVal> inclusive_scan(const TyGroup& group, TyVal&& val) { // Convenience overload: op defaults to cooperative_groups::plus
+    return inclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<details::remove_qual<TyVal>>());
+}
+
+template <typename TyGroup, typename TyVal, typename TyFn>
+_CG_QUALIFIER auto exclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) { // Public entry point: exclusive scan of val over group with op
+    details::check_scan_params<TyGroup, details::remove_qual<TyVal>, decltype(op(val, val))>(); // value type is stripped with remove_qual first, as reduce() and the *_scan_update entry points do
+
+    using dispatch = details::scan_dispatch<TyGroup::_group_id, details::ScanType::exclusive>; // implementation chosen by the group's _group_id
+    return dispatch::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+
+template <typename TyGroup, typename TyVal>
+_CG_QUALIFIER details::remove_qual<TyVal> exclusive_scan(const TyGroup& group, TyVal&& val) { // Convenience overload: op defaults to cooperative_groups::plus
+    return exclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<details::remove_qual<TyVal>>());
+}
+
+#if defined(_CG_HAS_STL_ATOMICS)
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
+_CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) { // Inclusive scan that also updates the cuda::atomic dst
+    details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>(); // compile-time type/group checks
+
+    using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>; // implementation chosen by the group's _group_id
+    return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
+_CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco> & dst, TyInputVal&& val) { // Convenience overload: op defaults to cooperative_groups::plus<TyVal>
+    return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
+}
+
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
+_CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) { // Exclusive scan that also updates the cuda::atomic dst
+    details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>(); // compile-time type/group checks
+
+    using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>; // implementation chosen by the group's _group_id
+    return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
+_CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val) { // Convenience overload: op defaults to cooperative_groups::plus<TyVal>
+    return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
+}
+
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
+_CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) { // Inclusive scan that also updates the cuda::atomic_ref dst
+    details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>(); // compile-time type/group checks
+
+    using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>; // implementation chosen by the group's _group_id
+    return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
+_CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco> & dst, TyInputVal&& val) { // Convenience overload: op defaults to cooperative_groups::plus<TyVal>
+    return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
+}
+
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
+_CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) { // Exclusive scan that also updates the cuda::atomic_ref dst
+    details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>(); // compile-time type/group checks
+
+    using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>; // implementation chosen by the group's _group_id
+    return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
+_CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val) { // Convenience overload: op defaults to cooperative_groups::plus<TyVal>
+    return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
+}
+#endif
+
+_CG_END_NAMESPACE
+
+#endif // _CG_SCAN_H_
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/sync.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/sync.h
new file mode 100644
index 0000000000000000000000000000000000000000..07ec3736c71fd6095b661dc0a66602d8e384f83e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/details/sync.h
@@ -0,0 +1,281 @@
+ /* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_GRID_H
+#define _CG_GRID_H
+
+#include "info.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details
+{
+typedef unsigned int barrier_t;
+
+_CG_STATIC_QUALIFIER bool bar_has_flipped(unsigned int old_arrive, unsigned int current_arrive) { // True when bit 31 differs between the two barrier values
+    return (((old_arrive ^ current_arrive) & 0x80000000) != 0);
+}
+
+_CG_STATIC_QUALIFIER bool is_cta_master() { // True only for the thread whose threadIdx.x, .y and .z are all zero
+    return (threadIdx.x + threadIdx.y + threadIdx.z == 0);
+}
+
+_CG_STATIC_QUALIFIER unsigned int sync_grids_arrive(volatile barrier_t *arrived) { // Arrive at the grid barrier; returns the pre-arrival barrier value (0 on non-master threads)
+    unsigned int oldArrive = 0;
+
+    __barrier_sync(0);
+
+    if (is_cta_master()) {
+        unsigned int expected = gridDim.x * gridDim.y * gridDim.z; // total number of blocks in the grid
+        bool gpu_master = (blockIdx.x + blockIdx.y + blockIdx.z == 0); // the block whose blockIdx is (0,0,0)
+        unsigned int nb = 1;
+
+        if (gpu_master) {
+            nb = 0x80000000 - (expected - 1); // with the other blocks' +1 each, the contributions total 0x80000000 and flip bit 31
+        }
+
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
+        // Barrier update with release; polling (in sync_grids_wait) with acquire
+        asm volatile("atom.add.release.gpu.u32 %0,[%1],%2;" : "=r"(oldArrive) : _CG_ASM_PTR_CONSTRAINT((unsigned int*)arrived), "r"(nb) : "memory");
+        ,
+        // Fence; barrier update; volatile polling and trailing fence happen in sync_grids_wait
+        __threadfence();
+        oldArrive = atomicAdd((unsigned int*)arrived, nb);
+        );
+    }
+
+    return oldArrive;
+}
+
+
+_CG_STATIC_QUALIFIER void sync_grids_wait(unsigned int oldArrive, volatile barrier_t *arrived) { // Block until bit 31 of *arrived differs from oldArrive
+    if (is_cta_master()) { // only the CTA master polls; the other threads wait at the __barrier_sync below
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
+        unsigned int current_arrive;
+        do {
+            asm volatile("ld.acquire.gpu.u32 %0,[%1];" : "=r"(current_arrive) : _CG_ASM_PTR_CONSTRAINT((unsigned int *)arrived) : "memory");
+        } while (!bar_has_flipped(oldArrive, current_arrive));
+        ,
+        while (!bar_has_flipped(oldArrive, *arrived));
+        __threadfence();
+        );
+    }
+
+    __barrier_sync(0);
+}
+
+/* - Multi warp groups synchronization routines - */
+
+#ifdef _CG_CPP11_FEATURES
+// Atomic OR that returns the previous value. Needs both acquire and release for the last warp, since it won't be able to acquire with red.and
+_CG_STATIC_QUALIFIER unsigned int atom_or_acq_rel_cta(unsigned int *addr, unsigned int val) {
+    unsigned int old;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
+    (asm volatile("atom.or.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(old) : _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");)
+    ,
+    (__threadfence_block();
+    old = atomicOr(addr, val);)
+    );
+    return old;
+}
+
+// Atomic OR without a return value, for the special case where the barrier is arrived at but not waited on
+_CG_STATIC_QUALIFIER void red_or_release_cta(unsigned int *addr, unsigned int val) {
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
+    (asm volatile("red.or.release.cta.b32 [%0],%1;" :: _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");)
+    ,
+    (__threadfence_block();
+    atomicOr(addr, val);)
+    );
+}
+
+// Usually called by the last arriving warp to release the other warps; can be relaxed, since the preceding OR was already acq_rel
+_CG_STATIC_QUALIFIER void red_and_relaxed_cta(unsigned int *addr, unsigned int val) {
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
+    (asm volatile("red.and.relaxed.cta.b32 [%0],%1;" :: _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");)
+    ,
+    (atomicAnd(addr, val);)
+    );
+}
+
+// Special case of release, where the last warp did extra work before releasing the others; this needs to be a release
+//  to ensure that the extra work is visible
+_CG_STATIC_QUALIFIER void red_and_release_cta(unsigned int *addr, unsigned int val) {
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
+    (asm volatile("red.and.release.cta.b32 [%0],%1;" :: _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");)
+    ,
+    (__threadfence_block();
+    atomicAnd(addr, val);)
+    );
+}
+
+// Read the barrier with acquire, so that all memory operations following the sync are performed after the barrier is released
+_CG_STATIC_QUALIFIER unsigned int ld_acquire_cta(unsigned int *addr) {
+    unsigned int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
+    (asm volatile("ld.acquire.cta.u32 %0,[%1];" : "=r"(val) : _CG_ASM_PTR_CONSTRAINT(addr) : "memory");)
+    ,
+    (val = *((volatile unsigned int*) addr);
+    __threadfence_block();)
+    );
+    return val;
+}
+
+// Get the synchronization bit mask of my thread_block_tile of size num_warps. Thread ranks 0..31 have the first bit assigned to them,
+// thread ranks 32..63 the second, etc.
+// Bit masks are unique for each group: groups of the same size have the same number of bits set, but at different positions.
+_CG_STATIC_QUALIFIER unsigned int get_group_mask(unsigned int thread_rank, unsigned int num_warps) {
+    return num_warps == 32 ? ~0u : ((1u << num_warps) - 1u) << (num_warps * (thread_rank / (num_warps * 32))); // unsigned literals: the mask may occupy bit 31, which a signed int shift must not reach
+}
+
+_CG_STATIC_QUALIFIER void barrier_wait(barrier_t *arrived, unsigned int warp_bit) { // Spin until warp_bit is cleared in *arrived
+    while(ld_acquire_cta(arrived) & warp_bit);
+}
+
+// Default blocking sync: each warp sets its bit; the warp that completes the group mask clears it, the others spin until then.
+_CG_STATIC_QUALIFIER void sync_warps(barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
+    unsigned int warp_id = thread_rank / 32;
+    bool warp_master = (thread_rank % 32 == 0);
+    unsigned int warp_bit = 1u << warp_id; // unsigned literal: warp_id 31 (thread_rank >= 992) would shift a signed 1 into the sign bit
+    unsigned int group_mask = get_group_mask(thread_rank, num_warps);
+
+    __syncwarp(0xFFFFFFFF);
+
+    if (warp_master) { // one thread per warp touches the barrier word
+        unsigned int old = atom_or_acq_rel_cta(arrived, warp_bit);
+        if (((old | warp_bit) & group_mask) == group_mask) { // every warp of the group has arrived: release them
+            red_and_relaxed_cta(arrived, ~group_mask);
+        }
+        else {
+            barrier_wait(arrived, warp_bit);
+        }
+    }
+
+    __syncwarp(0xFFFFFFFF);
+}
+
+// Blocking sync, except for the last arriving warp: it returns true without waiting so it can do other work before releasing the others.
+// A warp returning true from this function needs to call sync_warps_release.
+_CG_STATIC_QUALIFIER bool sync_warps_last_releases(barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
+    unsigned int warp_id = thread_rank / 32;
+    bool warp_master = (thread_rank % 32 == 0);
+    unsigned int warp_bit = 1u << warp_id; // unsigned literal: warp_id 31 (thread_rank >= 992) would shift a signed 1 into the sign bit
+    unsigned int group_mask = get_group_mask(thread_rank, num_warps);
+
+    __syncwarp(0xFFFFFFFF);
+
+    unsigned int old = 0;
+    if (warp_master) { // one thread per warp touches the barrier word
+        old = atom_or_acq_rel_cta(arrived, warp_bit);
+    }
+    old = __shfl_sync(0xFFFFFFFF, old, 0); // broadcast lane 0's value so the whole warp takes the same branch
+    if (((old | warp_bit) & group_mask) == group_mask) {
+        return true;
+    }
+    barrier_wait(arrived, warp_bit);
+
+    return false;
+}
+
+// Release my group from the barrier by clearing the group's bits; only the caller passing is_master == true performs the update.
+_CG_STATIC_QUALIFIER void sync_warps_release(barrier_t *arrived, bool is_master, unsigned int thread_rank, unsigned int num_warps) {
+    unsigned int group_mask = get_group_mask(thread_rank, num_warps);
+    if (is_master) {
+        red_and_release_cta(arrived, ~group_mask);
+    }
+}
+
+// Arrive at my group barrier, but don't block or release the barrier, even if everyone arrives.
+// sync_warps_release needs to be called by some warp after this one to reset the barrier.
+_CG_STATIC_QUALIFIER void sync_warps_arrive(barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
+    unsigned int warp_id = thread_rank / 32;
+    bool warp_master = (thread_rank % 32 == 0);
+    unsigned int warp_bit = 1u << warp_id; // unsigned literal: warp_id 31 (thread_rank >= 992) would shift a signed 1 into the sign bit
+    // num_warps is not needed to arrive (no group mask is checked here); the parameter is kept so the signature stays unchanged
+
+    __syncwarp(0xFFFFFFFF);
+
+    if (warp_master) { // one thread per warp sets the warp's bit
+        red_or_release_cta(arrived, warp_bit);
+    }
+}
+
+// Wait for my warp to be released from the barrier. Warp must have arrived first.
+_CG_STATIC_QUALIFIER void sync_warps_wait(barrier_t *arrived, unsigned int thread_rank) {
+    unsigned int warp_id = thread_rank / 32;
+    unsigned int warp_bit = 1u << warp_id; // unsigned literal: warp_id 31 (thread_rank >= 992) would shift a signed 1 into the sign bit
+
+    barrier_wait(arrived, warp_bit);
+}
+
+// Wait for a specific warp to arrive at the barrier, i.e. spin until that warp's bit is set
+_CG_QUALIFIER void sync_warps_wait_for_specific_warp(barrier_t *arrived, unsigned int wait_warp_id) {
+    unsigned int wait_mask = 1u << wait_warp_id; // unsigned literal: a wait_warp_id of 31 would shift a signed 1 into the sign bit
+    while((ld_acquire_cta(arrived) & wait_mask) != wait_mask);
+}
+
+// Initialize (clear) the bit corresponding to my warp in the barrier
+_CG_QUALIFIER void sync_warps_reset(barrier_t *arrived, unsigned int thread_rank) {
+    unsigned int warp_id = thread_rank / 32;
+    unsigned int warp_bit = 1u << warp_id; // unsigned literal: warp_id 31 (thread_rank >= 992) would shift a signed 1 into the sign bit
+
+    __syncwarp(0xFFFFFFFF);
+
+    if (thread_rank % 32 == 0) { // one thread per warp clears the warp's bit
+        red_and_release_cta(arrived, ~warp_bit);
+    }
+    // No need to sync after the atomic, there will be a sync of the group that is being partitioned right after this.
+}
+
+#endif
+
+} // details
+
+_CG_END_NAMESPACE
+
+#endif // _CG_GRID_H
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/memcpy_async.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/memcpy_async.h
new file mode 100644
index 0000000000000000000000000000000000000000..50b907d9a1fe45cdc411891a20d8fd035118e5be
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/memcpy_async.h
@@ -0,0 +1,62 @@
+ /* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _COOPERATIVE_GROUPS_MEMCPY_ASYNC
+#define _COOPERATIVE_GROUPS_MEMCPY_ASYNC
+
+#include "../cooperative_groups.h"
+#include "details/info.h"
+
+#ifdef _CG_CPP11_FEATURES
+# include "details/async.h"
+#else
+# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
+         -std=c++11 compiler option.
+#endif
+
+#endif // _COOPERATIVE_GROUPS_MEMCPY_ASYNC
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/reduce.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c87d780db0b437f1ae06e0ef8d60137233795c0
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/reduce.h
@@ -0,0 +1,63 @@
+ /* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _COOPERATIVE_GROUPS_REDUCE_H
+#define _COOPERATIVE_GROUPS_REDUCE_H
+
+#include "../cooperative_groups.h"
+#include "details/info.h"
+
+#ifdef _CG_CPP11_FEATURES
+# include "details/reduce.h"
+#else
+# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
+         -std=c++11 compiler option.
+#endif
+
+
+#endif //_COOPERATIVE_GROUPS_REDUCE_H
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/scan.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..9bc27078028318ada00cbcccd052e0d6cc930cfe
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups/scan.h
@@ -0,0 +1,63 @@
+/* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _COOPERATIVE_GROUPS_SCAN_H
+#define _COOPERATIVE_GROUPS_SCAN_H
+
+#include "../cooperative_groups.h"
+#include "details/info.h"
+
+#ifdef _CG_CPP11_FEATURES
+# include "details/scan.h"
+#else
+# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
+         -std=c++11 compiler option.
+#endif
+
+
+#endif //_COOPERATIVE_GROUPS_SCAN_H
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/common_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/common_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..b7e70950fb51d0d58f8dd99239e6b36ba89c4779
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/common_functions.h
@@ -0,0 +1,310 @@
+/*
+ * Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/common_functions.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/common_functions.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H__
+#endif
+
+#if !defined(__COMMON_FUNCTIONS_H__)
+#define __COMMON_FUNCTIONS_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#include "builtin_types.h"
+#include "host_defines.h"
+
+#define __CUDACC_VER__ "__CUDACC_VER__ is no longer supported.  Use __CUDACC_VER_MAJOR__, __CUDACC_VER_MINOR__, and __CUDACC_VER_BUILD__ instead."
+
+#ifndef __CUDA_API_VER_MAJOR__
+#define __CUDA_API_VER_MAJOR__ __CUDACC_VER_MAJOR__
+#endif /* __CUDA_API_VER_MAJOR__ */
+
+#ifndef __CUDA_API_VER_MINOR__
+#define __CUDA_API_VER_MINOR__ __CUDACC_VER_MINOR__
+#endif /* __CUDA_API_VER_MINOR__ */
+
+#if !defined(__CUDACC_RTC__)
+#include <string.h>
+#include <time.h>
+
+extern "C"
+{
+#endif /* !__CUDACC_RTC__ */
+extern _CRTIMP __host__ __device__ __device_builtin__ __cudart_builtin__ clock_t __cdecl clock(void)
+#if defined(__QNX__)
+asm("clock32")
+#endif
+__THROW;
+extern         __host__ __device__ __device_builtin__ __cudart_builtin__ void*   __cdecl memset(void*, int, size_t) __THROW;
+extern         __host__ __device__ __device_builtin__ __cudart_builtin__ void*   __cdecl memcpy(void*, const void*, size_t) __THROW;
+#if !defined(__CUDACC_RTC__)
+}
+#endif /* !__CUDACC_RTC__ */
+
+#if defined(__CUDA_ARCH__)
+
+#if defined(__CUDACC_RTC__)
+inline __host__ __device__ void* operator new(size_t, void *place) { return place; }  // placement new: hands back the caller-supplied address
+inline __host__ __device__ void* operator new[](size_t, void *place) { return place; }  // array placement new: same, size argument is ignored
+inline __host__ __device__ void operator delete(void*, void*) { }  // matching placement delete: empty body, both arguments ignored
+inline __host__ __device__ void operator delete[](void*, void*) { }  // matching array placement delete: empty body, both arguments ignored
+#else /* !__CUDACC_RTC__ */
+#ifndef __CUDA_INTERNAL_SKIP_CPP_HEADERS__
+#include <new>
+#endif
+
+#if defined (__GNUC__)
+
+#define STD \
+        std::
+        
+#else /* __GNUC__ */
+
+#define STD
+
+#endif /* __GNUC__ */
+
+extern         __host__ __device__ __cudart_builtin__ void*   __cdecl operator new(STD size_t, void*) throw();
+extern         __host__ __device__ __cudart_builtin__ void*   __cdecl operator new[](STD size_t, void*) throw();
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete(void*, void*) throw();
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete[](void*, void*) throw();
+# if __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__)
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete(void*, STD size_t) throw();
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete[](void*, STD size_t) throw();
+#endif /* __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__)  || defined(__CUDA_ICC_CPP14__) */
+#endif /* __CUDACC_RTC__ */
+
+#if !defined(__CUDACC_RTC__)
+#include <stdio.h>
+#include <stdlib.h>
+#endif /* !__CUDACC_RTC__ */
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+extern "C"
+{
+extern
+#if !defined(_MSC_VER) || _MSC_VER < 1900
+_CRTIMP
+#endif
+            
+#if defined(__GLIBC__) && defined(__GLIBC_MINOR__) && ( (__GLIBC__ < 2) || ( (__GLIBC__ == 2) && (__GLIBC_MINOR__ < 3) ) ) 
+__host__ __device__ __device_builtin__ __cudart_builtin__ int     __cdecl printf(const char*, ...) __THROW;
+#else /* newer glibc */
+__host__ __device__ __device_builtin__ __cudart_builtin__ int     __cdecl printf(const char*, ...);
+#endif /* defined(__GLIBC__) && defined(__GLIBC_MINOR__) && ( (__GLIBC__ < 2) || ( (__GLIBC__ == 2) && (__GLIBC_MINOR__ < 3) ) ) */
+
+
+extern _CRTIMP __host__ __device__ __cudart_builtin__ void*   __cdecl malloc(size_t) __THROW;
+extern _CRTIMP __host__ __device__ __cudart_builtin__ void    __cdecl free(void*) __THROW;
+
+#if defined(_MSC_VER)
+extern  __host__ __device__ __cudart_builtin__ void*   __cdecl _alloca(size_t);
+#endif
+
+#if defined(__QNX__)
+#undef alloca
+#define alloca(__S) __builtin_alloca(__S)
+#endif
+}
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+#if !defined(__CUDACC_RTC__)
+#include <assert.h>
+#endif /* !__CUDACC_RTC__ */
+
+extern "C"
+{
+#if defined(__CUDACC_RTC__)
+extern __host__ __device__ void __assertfail(const char * __assertion, 
+                                             const char *__file,
+                                             unsigned int __line,
+                                             const char *__function,
+                                             size_t charsize);
+#elif defined(__APPLE__)
+#define __builtin_expect(exp,c) (exp)
+extern __host__ __device__ __cudart_builtin__ void __assert_rtn(
+  const char *, const char *, int, const char *);
+#elif defined(__ANDROID__)
+extern __host__ __device__ __cudart_builtin__ void __assert2(
+  const char *, int, const char *, const char *);
+#elif defined(__QNX__)
+#if !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+extern __host__ __device__ __cudart_builtin__ void __assert(
+  const char *, const char *, unsigned int, const char *);
+#if !defined(_LIBCPP_VERSION)
+}
+#endif
+#elif defined(__HORIZON__)
+extern __host__ __device__ __cudart_builtin__ void __assert_fail(
+  const char *, const char *, int, const char *);
+#elif defined(__GNUC__)
+extern __host__ __device__ __cudart_builtin__ void __assert_fail(
+  const char *, const char *, unsigned int, const char *)
+  __THROW; 
+#elif defined(_WIN32)
+extern __host__ __device__ __cudart_builtin__ _CRTIMP void __cdecl _wassert(
+  const wchar_t *, const wchar_t *, unsigned);
+#endif
+}
+
+#if defined(__CUDACC_RTC__)
+#ifdef NDEBUG
+#define assert(e) (static_cast<void>(0))  /* NDEBUG build: the argument is dropped and never evaluated */
+#else /* !NDEBUG */
+#define __ASSERT_STR_HELPER(x) #x  /* stringifies its argument; note e is macro-expanded before it reaches this helper */
+#define assert(e) ((e) ? static_cast<void>(0)\
+                       : __assertfail(__ASSERT_STR_HELPER(e), __FILE__,\
+                                      __LINE__, __PRETTY_FUNCTION__,\
+                                      sizeof(char)))  /* on failure: passes expression text, file, line, function, and sizeof(char) as charsize */
+#endif /* NDEBUG */
+__host__ __device__  void* operator new(size_t);
+__host__ __device__  void* operator new[](size_t);
+__host__ __device__  void operator delete(void*);
+__host__ __device__  void operator delete[](void*);
+# if __cplusplus >= 201402L
+__host__ __device__  void operator delete(void*, size_t);
+__host__ __device__  void operator delete[](void*, size_t);
+#endif /* __cplusplus >= 201402L */
+
+#if __cplusplus >= 201703L
+namespace std { enum class align_val_t : size_t {}; }
+__host__ __device__ void*   __cdecl operator new(size_t sz, std::align_val_t) noexcept;
+__host__ __device__ void*   __cdecl operator new[](size_t sz, std::align_val_t) noexcept;
+__host__ __device__ void    __cdecl operator delete(void* ptr, std::align_val_t) noexcept;
+__host__ __device__ void    __cdecl operator delete[](void* ptr, std::align_val_t) noexcept;
+__host__ __device__ void    __cdecl operator delete(void* ptr, size_t, std::align_val_t) noexcept;
+__host__ __device__ void    __cdecl operator delete[](void* ptr, size_t, std::align_val_t) noexcept;
+#endif  /* __cplusplus >= 201703L */
+
+#else /* !__CUDACC_RTC__ */
+#if defined (__GNUC__)
+
+#define __NV_GLIBCXX_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) 
+
+#if (__cplusplus >= 201103L)  && ((!(defined(__QNX__) && defined(_LIBCPP_VERSION))) || (defined(__QNX__) && __NV_GLIBCXX_VERSION >= 80300))
+#define THROWBADALLOC 
+#else
+#if defined(__ANDROID__) && !defined(_LIBCPP_VERSION) && (defined(__BIONIC__) || __NV_GLIBCXX_VERSION < 40900)
+#define THROWBADALLOC
+#else
+#define THROWBADALLOC  throw(STD bad_alloc)
+#endif
+#endif
+#define __DELETE_THROW throw()
+
+#undef __NV_GLIBCXX_VERSION
+
+#else /* __GNUC__ */
+
+#define THROWBADALLOC  throw(...)
+
+#endif /* __GNUC__ */
+
+extern         __host__ __device__ __cudart_builtin__ void*   __cdecl operator new(STD size_t) THROWBADALLOC;
+extern         __host__ __device__ __cudart_builtin__ void*   __cdecl operator new[](STD size_t) THROWBADALLOC;
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete(void*) throw();
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete[](void*) throw();
+# if __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__)
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete(void*, STD size_t) throw();
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete[](void*, STD size_t) throw();
+#endif /* __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__)  */
+
+#if __cpp_aligned_new
+extern         __host__ __device__ __cudart_builtin__ void*   __cdecl operator new(STD size_t, std::align_val_t);
+extern         __host__ __device__ __cudart_builtin__ void*   __cdecl operator new[](STD size_t, std::align_val_t);
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete(void*, std::align_val_t) noexcept;
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete[](void*, std::align_val_t) noexcept;
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete(void*, STD size_t, std::align_val_t) noexcept;
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete[](void*, STD size_t, std::align_val_t) noexcept;
+#endif  /* __cpp_aligned_new */
+
+#undef THROWBADALLOC
+#undef STD
+#endif /* __CUDACC_RTC__ */
+
+#endif /* __CUDA_ARCH__ */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__CUDACC_RTC__) && (__CUDA_ARCH__ >= 350)
+#include "cuda_device_runtime_api.h"
+#endif
+
+#include "math_functions.h"
+
+#endif /* !__COMMON_FUNCTIONS_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/cudacc_ext.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/cudacc_ext.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d244463e73f0f7569a4707002c8e059bca67c6d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/cudacc_ext.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2021-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/cudacc_ext.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/cudacc_ext.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDACC_EXT_H__
+#endif
+
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDACC_EXT_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDACC_EXT_H__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_double_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_double_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..7849c6c6e099e85a4676e7c9c38c05b5a5b02d26
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_double_functions.h
@@ -0,0 +1,1192 @@
+/*
+ * Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/device_double_functions.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/device_double_functions.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H__
+#endif
+
+#if !defined(__DEVICE_DOUBLE_FUNCTIONS_H__)
+#define __DEVICE_DOUBLE_FUNCTIONS_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__CUDACC_RTC__)
+#define __DEVICE_DOUBLE_FUNCTIONS_DECL__ __device__
+#else
+#define __DEVICE_DOUBLE_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+extern "C"
+{
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret bits in a double as a 64-bit signed integer.
+ *
+ * Reinterpret the bits in the double-precision floating-point value \p x
+ * as a signed 64-bit integer.
+ * \return Returns reinterpreted value.
+ */
+extern __device__ __device_builtin__ long long int         __double_as_longlong(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret bits in a 64-bit signed integer as a double.
+ *
+ * Reinterpret the bits in the 64-bit signed integer value \p x as
+ * a double-precision floating-point value.
+ * \return Returns reinterpreted value.
+ */
+extern __device__ __device_builtin__ double                __longlong_as_double(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation in round-to-nearest-even mode.
+ *
+ * Computes the value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single ternary operation, rounding the
+ * result once in round-to-nearest-even mode.
+ *
+ * \return Returns the rounded value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation.
+ * - __fma_rn(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fma_rn(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fma_rn(\p x, \p y, 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ * - __fma_rn(\p x, \p y, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double                __fma_rn(double x, double y, double z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation in round-towards-zero mode.
+ *
+ * Computes the value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single ternary operation, rounding the
+ * result once in round-towards-zero mode.
+ *
+ * \return Returns the rounded value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation.
+ * - __fma_rz(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fma_rz(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fma_rz(\p x, \p y, 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ * - __fma_rz(\p x, \p y, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double                __fma_rz(double x, double y, double z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation in round-up mode.
+ *
+ * Computes the value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single ternary operation, rounding the
+ * result once in round-up (to positive infinity) mode.
+ *
+ * \return Returns the rounded value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation.
+ * - __fma_ru(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fma_ru(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fma_ru(\p x, \p y, 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ * - __fma_ru(\p x, \p y, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double                __fma_ru(double x, double y, double z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation in round-down mode.
+ *
+ * Computes the value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single ternary operation, rounding the
+ * result once in round-down (to negative infinity) mode.
+ *
+ * \return Returns the rounded value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation.
+ * - __fma_rd(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fma_rd(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fma_rd(\p x, \p y, 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ * - __fma_rd(\p x, \p y, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double                __fma_rd(double x, double y, double z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Add two floating-point values in round-to-nearest-even mode.
+ *
+ * Adds two floating-point values \p x and \p y in round-to-nearest-even mode.
+ *
+ * \return Returns \p x + \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dadd_rn(double x, double y);
+/**      
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Add two floating-point values in round-towards-zero mode.
+ *
+ * Adds two floating-point values \p x and \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x + \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dadd_rz(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Add two floating-point values in round-up mode.
+ * 
+ * Adds two floating-point values \p x and \p y in round-up (to positive infinity) mode.
+ *    
+ * \return Returns \p x + \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */ 
+extern __device__ __device_builtin__ double                __dadd_ru(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Add two floating-point values in round-down mode.
+ *
+ * Adds two floating-point values \p x and \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x + \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dadd_rd(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Subtract two floating-point values in round-to-nearest-even mode.
+ *
+ * Subtracts two floating-point values \p x and \p y in round-to-nearest-even mode.
+ *
+ * \return Returns \p x - \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dsub_rn(double x, double y);
+/**      
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Subtract two floating-point values in round-towards-zero mode.
+ *
+ * Subtracts two floating-point values \p x and \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x - \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dsub_rz(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Subtract two floating-point values in round-up mode.
+ * 
+ * Subtracts two floating-point values \p x and \p y in round-up (to positive infinity) mode.
+ *    
+ * \return Returns \p x - \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */ 
+extern __device__ __device_builtin__ double                __dsub_ru(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Subtract two floating-point values in round-down mode.
+ *
+ * Subtracts two floating-point values \p x and \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x - \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dsub_rd(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Multiply two floating-point values in round-to-nearest-even mode.
+ *
+ * Multiplies two floating-point values \p x and \p y in round-to-nearest-even mode.
+ *
+ * \return Returns \p x * \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dmul_rn(double x, double y);
+/**      
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Multiply two floating-point values in round-towards-zero mode.
+ *
+ * Multiplies two floating-point values \p x and \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x * \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dmul_rz(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Multiply two floating-point values in round-up mode.
+ * 
+ * Multiplies two floating-point values \p x and \p y in round-up (to positive infinity) mode.
+ *    
+ * \return Returns \p x * \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dmul_ru(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Multiply two floating-point values in round-down mode.
+ *
+ * Multiplies two floating-point values \p x and \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x * \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dmul_rd(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a float in round-to-nearest-even mode.
+ *
+ * Convert the double-precision floating-point value \p x to a single-precision
+ * floating-point value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ float                 __double2float_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a float in round-towards-zero mode.
+ *
+ * Convert the double-precision floating-point value \p x to a single-precision
+ * floating-point value in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ float                 __double2float_rz(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a float in round-up mode.
+ *
+ * Convert the double-precision floating-point value \p x to a single-precision
+ * floating-point value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ float                 __double2float_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a float in round-down mode.
+ *
+ * Convert the double-precision floating-point value \p x to a single-precision
+ * floating-point value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ float                 __double2float_rd(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed int in round-to-nearest-even mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed integer value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ int                   __double2int_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed int in round-up mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed integer value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ int                   __double2int_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed int in round-down mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed integer value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ int                   __double2int_rd(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned int in round-to-nearest-even mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned integer value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ unsigned int          __double2uint_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned int in round-up mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned integer value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ unsigned int          __double2uint_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned int in round-down mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned integer value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ unsigned int          __double2uint_rd(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed 64-bit int in round-to-nearest-even mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed 64-bit integer value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ long long int          __double2ll_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed 64-bit int in round-up mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed 64-bit integer value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ long long int          __double2ll_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed 64-bit int in round-down mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed 64-bit integer value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ long long int          __double2ll_rd(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned 64-bit int in round-to-nearest-even mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned 64-bit integer value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ unsigned long long int __double2ull_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned 64-bit int in round-up mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned 64-bit integer value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ unsigned long long int __double2ull_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned 64-bit int in round-down mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned 64-bit integer value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ unsigned long long int __double2ull_rd(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed int to a double.
+ *
+ * Convert the signed integer value \p x to a double-precision floating-point value.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __int2double_rn(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned int to a double.
+ *
+ * Convert the unsigned integer value \p x to a double-precision floating-point value.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __uint2double_rn(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit int to a double in round-to-nearest-even mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
+ * value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ll2double_rn(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit int to a double in round-towards-zero mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
+ * value in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ll2double_rz(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit int to a double in round-up mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
+ * value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ll2double_ru(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit int to a double in round-down mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
+ * value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ll2double_rd(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit int to a double in round-to-nearest-even mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
+ * value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ull2double_rn(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit int to a double in round-towards-zero mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
+ * value in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ull2double_rz(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit int to a double in round-up mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
+ * value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ull2double_ru(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit int to a double in round-down mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
+ * value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ull2double_rd(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret high 32 bits in a double as a signed integer.
+ *
+ * Reinterpret the high 32 bits in the double-precision floating-point value \p x
+ * as a signed integer.
+ * \return Returns reinterpreted value.
+ */
+extern __device__ __device_builtin__ int                    __double2hiint(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret low 32 bits in a double as a signed integer.
+ *
+ * Reinterpret the low 32 bits in the double-precision floating-point value \p x
+ * as a signed integer.
+ * \return Returns reinterpreted value.
+ */
+extern __device__ __device_builtin__ int                    __double2loint(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret high and low 32-bit integer values as a double.
+ *
+ * Reinterpret the integer value of \p hi as the high 32 bits of a 
+ * double-precision floating-point value and the integer value of \p lo
+ * as the low 32 bits of the same double-precision floating-point value.
+ * \return Returns reinterpreted value.
+ */
+extern __device__ __device_builtin__ double                 __hiloint2double(int hi, int lo);
+}
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double fma(double a, double b, double c, enum cudaRoundMode mode); /* only wrapper without a default: mode must always be passed */
+
+#undef EXCLUDE_FROM_RTC
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dmul(double a, double b, enum cudaRoundMode mode = cudaRoundNearest); /* defined in device_double_functions.hpp; forwards to __dmul_rn/_rz/_ru/_rd */
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dadd(double a, double b, enum cudaRoundMode mode = cudaRoundNearest); /* defined in device_double_functions.hpp; forwards to __dadd_rn/_rz/_ru/_rd */
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dsub(double a, double b, enum cudaRoundMode mode = cudaRoundNearest); /* defined in device_double_functions.hpp; forwards to __dsub_rn/_rz/_ru/_rd */
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ int double2int(double a, enum cudaRoundMode mode = cudaRoundZero); /* double -> int wrappers default to round-towards-zero; forwards to __double2int_r* */
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ unsigned int double2uint(double a, enum cudaRoundMode mode = cudaRoundZero); /* forwards to __double2uint_rn/_rz/_ru/_rd */
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ long long int double2ll(double a, enum cudaRoundMode mode = cudaRoundZero); /* forwards to __double2ll_rn/_rz/_ru/_rd */
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ unsigned long long int double2ull(double a, enum cudaRoundMode mode = cudaRoundZero); /* forwards to __double2ull_rn/_rz/_ru/_rd */
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double ll2double(long long int a, enum cudaRoundMode mode = cudaRoundNearest); /* integer -> double wrappers default to round-to-nearest; forwards to __ll2double_r* */
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double ull2double(unsigned long long int a, enum cudaRoundMode mode = cudaRoundNearest); /* forwards to __ull2double_rn/_rz/_ru/_rd */
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double int2double(int a, enum cudaRoundMode mode = cudaRoundNearest); /* the .hpp definition is a plain (double) cast and never reads mode */
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double uint2double(unsigned int a, enum cudaRoundMode mode = cudaRoundNearest); /* the .hpp definition is a plain (double) cast and never reads mode */
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double float2double(float a, enum cudaRoundMode mode = cudaRoundNearest); /* the .hpp definition is a plain (double) cast and never reads mode */
+
+#undef __DEVICE_DOUBLE_FUNCTIONS_DECL__
+
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#if !defined(__CUDACC_RTC__)
+#include "device_double_functions.hpp"
+#endif /* !__CUDACC_RTC__ */
+
+#endif /* !__DEVICE_DOUBLE_FUNCTIONS_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_double_functions.hpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_double_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f63063689d65c4a1dffb9a823ddaf6a5b353cba3
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_double_functions.hpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/device_double_functions.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/device_double_functions.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_HPP__
+#endif
+
+#if !defined(__DEVICE_DOUBLE_FUNCTIONS_HPP__)
+#define __DEVICE_DOUBLE_FUNCTIONS_HPP__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__CUDACC_RTC__)
+#define __DEVICE_DOUBLE_FUNCTIONS_DECL__ __device__
+#else
+#define __DEVICE_DOUBLE_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double fma(double a, double b, double c, enum cudaRoundMode mode)
+{ /* Forward (a, b, c) to the __fma_r* intrinsic selected by the requested rounding mode. */
+  if (mode == cudaRoundZero)   return __fma_rz(a, b, c); /* round towards zero */
+  if (mode == cudaRoundPosInf) return __fma_ru(a, b, c); /* round up (to positive infinity) */
+  if (mode == cudaRoundMinInf) return __fma_rd(a, b, c); /* round down (to negative infinity) */
+  return __fma_rn(a, b, c);                              /* any other mode: round to nearest even */
+}
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dmul(double a, double b, enum cudaRoundMode mode)
+{ /* Forward (a, b) to the __dmul_r* intrinsic selected by the requested rounding mode. */
+  if (mode == cudaRoundZero)   return __dmul_rz(a, b); /* round towards zero */
+  if (mode == cudaRoundPosInf) return __dmul_ru(a, b); /* round up (to positive infinity) */
+  if (mode == cudaRoundMinInf) return __dmul_rd(a, b); /* round down (to negative infinity) */
+  return __dmul_rn(a, b);                              /* any other mode: round to nearest even */
+}
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dadd(double a, double b, enum cudaRoundMode mode)
+{ /* Forward (a, b) to the __dadd_r* intrinsic selected by the requested rounding mode. */
+  if (mode == cudaRoundZero)   return __dadd_rz(a, b); /* round towards zero */
+  if (mode == cudaRoundPosInf) return __dadd_ru(a, b); /* round up (to positive infinity) */
+  if (mode == cudaRoundMinInf) return __dadd_rd(a, b); /* round down (to negative infinity) */
+  return __dadd_rn(a, b);                              /* any other mode: round to nearest even */
+}
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dsub(double a, double b, enum cudaRoundMode mode)
+{ /* Forward (a, b) to the __dsub_r* intrinsic selected by the requested rounding mode. */
+  if (mode == cudaRoundZero)   return __dsub_rz(a, b); /* round towards zero */
+  if (mode == cudaRoundPosInf) return __dsub_ru(a, b); /* round up (to positive infinity) */
+  if (mode == cudaRoundMinInf) return __dsub_rd(a, b); /* round down (to negative infinity) */
+  return __dsub_rn(a, b);                              /* any other mode: round to nearest even */
+}
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ int double2int(double a, enum cudaRoundMode mode)
+{ /* Convert a to a signed int through the __double2int_r* intrinsic selected by mode. */
+  if (mode == cudaRoundNearest) return __double2int_rn(a); /* round to nearest even */
+  if (mode == cudaRoundPosInf)  return __double2int_ru(a); /* round up (to positive infinity) */
+  if (mode == cudaRoundMinInf)  return __double2int_rd(a); /* round down (to negative infinity) */
+  return __double2int_rz(a);                               /* any other mode: round towards zero */
+}
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ unsigned int double2uint(double a, enum cudaRoundMode mode)
+{ /* Convert a to an unsigned int through the __double2uint_r* intrinsic selected by mode. */
+  if (mode == cudaRoundNearest) return __double2uint_rn(a); /* round to nearest even */
+  if (mode == cudaRoundPosInf)  return __double2uint_ru(a); /* round up (to positive infinity) */
+  if (mode == cudaRoundMinInf)  return __double2uint_rd(a); /* round down (to negative infinity) */
+  return __double2uint_rz(a);                               /* any other mode: round towards zero */
+}
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ long long int double2ll(double a, enum cudaRoundMode mode)
+{ /* Convert a to a signed 64-bit int through the __double2ll_r* intrinsic selected by mode. */
+  if (mode == cudaRoundNearest) return __double2ll_rn(a); /* round to nearest even */
+  if (mode == cudaRoundPosInf)  return __double2ll_ru(a); /* round up (to positive infinity) */
+  if (mode == cudaRoundMinInf)  return __double2ll_rd(a); /* round down (to negative infinity) */
+  return __double2ll_rz(a);                               /* any other mode: round towards zero */
+}
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ unsigned long long int double2ull(double a, enum cudaRoundMode mode)
+{ /* Convert a to an unsigned 64-bit int through the __double2ull_r* intrinsic selected by mode. */
+  if (mode == cudaRoundNearest) return __double2ull_rn(a); /* round to nearest even */
+  if (mode == cudaRoundPosInf)  return __double2ull_ru(a); /* round up (to positive infinity) */
+  if (mode == cudaRoundMinInf)  return __double2ull_rd(a); /* round down (to negative infinity) */
+  return __double2ull_rz(a);                               /* any other mode: round towards zero */
+}
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double ll2double(long long int a, enum cudaRoundMode mode)
+{ /* Convert the signed 64-bit a to double through the __ll2double_r* intrinsic selected by mode. */
+  if (mode == cudaRoundZero)   return __ll2double_rz(a); /* round towards zero */
+  if (mode == cudaRoundPosInf) return __ll2double_ru(a); /* round up (to positive infinity) */
+  if (mode == cudaRoundMinInf) return __ll2double_rd(a); /* round down (to negative infinity) */
+  return __ll2double_rn(a);                              /* any other mode: round to nearest even */
+}
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double ull2double(unsigned long long int a, enum cudaRoundMode mode)
+{ /* Convert the unsigned 64-bit a to double through the __ull2double_r* intrinsic selected by mode. */
+  if (mode == cudaRoundZero)   return __ull2double_rz(a); /* round towards zero */
+  if (mode == cudaRoundPosInf) return __ull2double_ru(a); /* round up (to positive infinity) */
+  if (mode == cudaRoundMinInf) return __ull2double_rd(a); /* round down (to negative infinity) */
+  return __ull2double_rn(a);                              /* any other mode: round to nearest even */
+}
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double int2double(int a, enum cudaRoundMode mode)
+{ /* Plain int -> double cast; mode is part of the signature but is never read. */
+  return static_cast<double>(a);
+}
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double uint2double(unsigned int a, enum cudaRoundMode mode)
+{ /* Plain unsigned int -> double cast; mode is part of the signature but is never read. */
+  return static_cast<double>(a);
+}
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double float2double(float a, enum cudaRoundMode mode)
+{ /* Plain float -> double cast; mode is part of the signature but is never read. */
+  return static_cast<double>(a);
+}
+
+#undef __DEVICE_DOUBLE_FUNCTIONS_DECL__
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#endif /* !__DEVICE_DOUBLE_FUNCTIONS_HPP__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_HPP__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_HPP__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_fp128_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_fp128_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..715220e121f790ab8ff2aeaed25620fe9759236f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_fp128_functions.h
@@ -0,0 +1,1217 @@
+/*
+ * Copyright 2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+// to easily switch off fp128 device functions if needed
+#ifndef __NV_DISABLE_DEVICE_FP128_FUNCTIONS__
+
+#if !defined(__DEVICE_FP128_FUNCTIONS_H__)
+#define __DEVICE_FP128_FUNCTIONS_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "device_types.h"
+
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#define __INLINE_IF_HOST__ __inline__
+#else  /* !__CUDA_ARCH__ */
+#define __DEF_IF_HOST ;
+#define __INLINE_IF_HOST__
+#endif /* __CUDA_ARCH__ */
+
+#define __DEVICE_FP128_FUNCTIONS_DECL__ __device__ __cudart_builtin__ __INLINE_IF_HOST__
+
+/*******************************************************************************
+*                                                                              *
+* Support for __float128 on:                                                   *
+*    - NVRTC on Linux                                                          *
+*    - GCC version 4.1 or later on x86_64/amd64                                *
+*    - Clang version 3.9 or later on x86_64/amd64                              *
+*    - NVHPC version 21.1 or later on x86_64/amd64                             *
+*                                                                              *
+*******************************************************************************/
+#if defined(__CUDACC_RTC__)
+#if !_WIN64
+#define __FLOAT128_CPP_SPELLING_ENABLED__
+#endif
+#else /* !__CUDACC_RTC__ */
+
+#if (defined __NVCOMPILER_MAJOR__)
+    #if (defined(__x86_64__) || defined(__amd64__)) && \
+        ((__NVCOMPILER_MAJOR__ > 21) || \
+            (__NVCOMPILER_MAJOR__ == 21 && __NVCOMPILER_MINOR__ >= 1))
+        #define __FLOAT128_CPP_SPELLING_ENABLED__
+    #endif
+#elif defined(__clang__)
+    #if (defined(__x86_64__) || defined(__amd64__)) && \
+        ((__clang_major__ > 3) || \
+            (__clang_major__ == 3 && __clang_minor__ >= 9))
+        #define __FLOAT128_CPP_SPELLING_ENABLED__
+    #endif
+#elif defined(__GNUC__)
+    // check gcc version if no other host compiler is used
+    #if (defined(__x86_64__) || defined(__amd64__)) && \
+        ((__GNUC__ > 4) || \
+            (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
+        #define __FLOAT128_CPP_SPELLING_ENABLED__
+    #endif
+#endif /* (defined __NVCOMPILER_MAJOR__) */
+
+#endif /* !__CUDACC_RTC__ */
+
+/*******************************************************************************
+*                                                                              *
+* Support for _Float128 on:                                                    *
+*    - GCC version 13.1 or later on x86_64/amd64/aarch64                       *
+*                                                                              *
+*******************************************************************************/
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__NVCOMPILER_MAJOR__)
+    // check gcc version if no other host compiler is used
+    #if (defined(__x86_64__) || defined(__amd64__) || defined(__aarch64__)) && \
+        ((__GNUC__ > 13) || \
+            (__GNUC__ == 13 && __GNUC_MINOR__ >= 1))
+        #define __FLOAT128_C_SPELLING_ENABLED__
+    #endif
+#endif /* defined(__GNUC__) && !defined(__clang__) && !defined(__NVCOMPILER_MAJOR__) */
+
+/**
+ * \defgroup CUDA_MATH_QUAD FP128 Quad Precision Mathematical Functions
+ * This section describes quad precision mathematical functions.
+ * To use these functions, include the header file \p device_fp128_functions.h in your program.
+ * 
+ * Functions declared here have \p __nv_fp128_ prefix to distinguish them
+ * from other global namespace symbols.
+ *
+ * Note that FP128 CUDA Math functions are only available to device programs
+ * on platforms where host compiler supports the basic quad precision datatype
+ * \p __float128 or \p _Float128.
+ * 
+ * Every FP128 CUDA Math function name is overloaded to support either of these
+ * host-compiler-specific types, whenever the types are available. See for example:
+ * \code
+ * #ifdef __FLOAT128_CPP_SPELLING_ENABLED__
+ *     __float128 __nv_fp128_sqrt(__float128 x);
+ * #endif
+ * #ifdef __FLOAT128_C_SPELLING_ENABLED__
+ *     _Float128 __nv_fp128_sqrt(_Float128 x);
+ * #endif
+ * \endcode
+ *
+ * \note_fp128_target_arch
+ */
+
+#ifdef __FLOAT128_CPP_SPELLING_ENABLED__
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \sqrt{x} \end_cuda_math_formula, the square root of the input argument.
+ *
+ * \return 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula.
+ * - __nv_fp128_sqrt(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_sqrt(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_sqrt(\p x) returns NaN if \p x is less than 0.
+ * - __nv_fp128_sqrt(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_sqrt(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \sin{x} \end_cuda_math_formula, the sine of input argument (measured in radians).
+ * 
+ * \return 
+ * \cuda_math_formula \sin{x} \end_cuda_math_formula.
+ * - __nv_fp128_sin(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_sin(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - __nv_fp128_sin(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_sin(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \cos{x} \end_cuda_math_formula, the cosine of input argument (measured in radians).
+ * 
+ * \return 
+ * \cuda_math_formula \cos{x} \end_cuda_math_formula.
+ * - __nv_fp128_cos(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula 1 \end_cuda_math_formula.
+ * - __nv_fp128_cos(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - __nv_fp128_cos(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_cos(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \tan{x} \end_cuda_math_formula, the tangent of input argument (measured in radians).
+ * 
+ * \return 
+ * \cuda_math_formula \tan{x} \end_cuda_math_formula.
+ * - __nv_fp128_tan(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_tan(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - __nv_fp128_tan(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_tan(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \sin^{-1}{x} \end_cuda_math_formula, the arc sine of input argument.
+ * 
+ * \return 
+ * The principal value of the arc sine of the input argument \p x.
+ * Result will be in radians, in the interval [-
+ * \cuda_math_formula \pi/2 \end_cuda_math_formula
+ * , +
+ * \cuda_math_formula \pi/2 \end_cuda_math_formula
+ * ] for \p x inside [-1, +1].
+ * - __nv_fp128_asin(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_asin(\p x) returns NaN for \p x outside [-1, +1].
+ * - __nv_fp128_asin(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_asin(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \cos^{-1}{x} \end_cuda_math_formula, the arc cosine of input argument.
+ *
+ * \return 
+ * The principal value of the arc cosine of the input argument \p x.
+ * Result will be in radians, in the interval [0, 
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * ] for \p x inside [-1, +1].
+ * - __nv_fp128_acos(1) returns +0.
+ * - __nv_fp128_acos(\p x) returns NaN for \p x outside [-1, +1].
+ * - __nv_fp128_acos(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_acos(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \tan^{-1}{x} \end_cuda_math_formula, the arc tangent of input argument.
+ *
+ * \return 
+ * The principal value of the arc tangent of the input argument \p x.
+ * Result will be in radians, in the interval [-
+ * \cuda_math_formula \pi/2 \end_cuda_math_formula
+ * , +
+ * \cuda_math_formula \pi/2 \end_cuda_math_formula
+ * ].
+ * - __nv_fp128_atan(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_atan(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula
+ * /2.
+ * - __nv_fp128_atan(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_atan(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula e^x \end_cuda_math_formula, the base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  exponential of the input argument.
+ *
+ * \return
+ * - __nv_fp128_exp(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - __nv_fp128_exp(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - __nv_fp128_exp(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_exp(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_exp(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula 2^x \end_cuda_math_formula, the base 2 exponential of the input argument.
+ *
+ * \return
+ * - __nv_fp128_exp2(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - __nv_fp128_exp2(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - __nv_fp128_exp2(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_exp2(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_exp2(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula 10^x \end_cuda_math_formula, the base 10 exponential of the input argument.
+ *
+ * \return
+ * - __nv_fp128_exp10(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - __nv_fp128_exp10(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - __nv_fp128_exp10(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_exp10(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_exp10(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate 
+ * \cuda_math_formula e^x - 1 \end_cuda_math_formula,
+ * the base e exponential of the input argument, minus 1.
+ *
+ * \return
+ * - __nv_fp128_expm1(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_expm1(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns -1.
+ * - __nv_fp128_expm1(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_expm1(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_expm1(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \log_{e}{x} \end_cuda_math_formula, the base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  logarithm of the input argument.
+ *
+ * \return
+ * - __nv_fp128_log(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - __nv_fp128_log(1) returns +0.
+ * - __nv_fp128_log(\p x) returns NaN for \p x < 0.
+ * - __nv_fp128_log(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_log(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_log(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \log_{2}{x} \end_cuda_math_formula, the base 2 logarithm of the input argument.
+ *
+ * \return 
+ * - __nv_fp128_log2(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - __nv_fp128_log2(1) returns +0.
+ * - __nv_fp128_log2(\p x) returns NaN for \p x < 0.
+ * - __nv_fp128_log2(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_log2(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_log2(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \log_{10}{x} \end_cuda_math_formula, the base 10 logarithm of the input argument.
+ *
+ * \return 
+ * - __nv_fp128_log10(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - __nv_fp128_log10(1) returns +0.
+ * - __nv_fp128_log10(\p x) returns NaN for \p x < 0.
+ * - __nv_fp128_log10(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_log10(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_log10(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate the value of 
+ * \cuda_math_formula \log_{e}(1+x) \end_cuda_math_formula.
+ *
+ * \return
+ * - __nv_fp128_log1p(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_log1p(-1) returns
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - __nv_fp128_log1p(\p x) returns NaN for \p x < -1.
+ * - __nv_fp128_log1p(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_log1p(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_log1p(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate the value of \cuda_math_formula x^{y} \end_cuda_math_formula, first argument to the power of second argument.
+ *
+ * \return 
+ * - __nv_fp128_pow(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ *  for \p y an odd integer less than 0.
+ * - __nv_fp128_pow(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for \p y less than 0 and not an odd integer.
+ * - __nv_fp128_pow(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  for \p y an odd integer greater than 0.
+ * - __nv_fp128_pow(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns +0 for \p y > 0 and not an odd integer.
+ * - __nv_fp128_pow(-1, 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 1.
+ * - __nv_fp128_pow(+1, \p y) returns 1 for any \p y, even a NaN.
+ * - __nv_fp128_pow(\p x, 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1 for any \p x, even a NaN.
+ * - __nv_fp128_pow(\p x, \p y) returns a NaN for finite \p x < 0 and finite non-integer \p y.
+ * - __nv_fp128_pow(\p x, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for 
+ * \cuda_math_formula | x | < 1 \end_cuda_math_formula.
+ * - __nv_fp128_pow(\p x, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0 for 
+ * \cuda_math_formula | x | > 1 \end_cuda_math_formula.
+ * - __nv_fp128_pow(\p x, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0 for 
+ * \cuda_math_formula | x | < 1 \end_cuda_math_formula.
+ * - __nv_fp128_pow(\p x, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for 
+ * \cuda_math_formula | x | > 1 \end_cuda_math_formula.
+ * - __nv_fp128_pow(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns -0 for \p y an odd integer less than 0.
+ * - __nv_fp128_pow(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns +0 for \p y < 0 and not an odd integer.
+ * - __nv_fp128_pow(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ *  for \p y an odd integer greater than 0.
+ * - __nv_fp128_pow(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for \p y > 0 and not an odd integer.
+ * - __nv_fp128_pow(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * , \p y) returns +0 for \p y < 0.
+ * - __nv_fp128_pow(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for \p y > 0.
+ * - __nv_fp128_pow(\p x, \p y) returns NaN if either \p x or \p y or both are NaN and \p x \cuda_math_formula \neq \end_cuda_math_formula +1 and \p y \cuda_math_formula \neq\pm 0 \end_cuda_math_formula.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_pow(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \sinh{x} \end_cuda_math_formula, the hyperbolic sine of the input argument.
+ *
+ * Calculate \cuda_math_formula \sinh{x} \end_cuda_math_formula, the hyperbolic sine of the input argument \p x.
+ *
+ * \return
+ * - __nv_fp128_sinh(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_sinh(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - __nv_fp128_sinh(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_sinh(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \cosh{x} \end_cuda_math_formula, the hyperbolic cosine of the input argument.
+ *
+ * \return
+ * - __nv_fp128_cosh(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - __nv_fp128_cosh(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_cosh(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_cosh(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \tanh{x} \end_cuda_math_formula, the hyperbolic tangent of the input argument.
+ *
+ * \return
+ * - __nv_fp128_tanh(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_tanh( 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 1 \end_cuda_math_formula.
+ * - __nv_fp128_tanh(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_tanh(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \sinh^{-1}{x} \end_cuda_math_formula, the inverse hyperbolic sine of the input argument.
+ *
+ * \return
+ * - __nv_fp128_asinh(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_asinh(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula. 
+ * - __nv_fp128_asinh(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_asinh(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \cosh^{-1}{x} \end_cuda_math_formula, the nonnegative inverse hyperbolic cosine of the input argument.
+ *
+ * \return 
+ * Result will be in the interval [0, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ].
+ * - __nv_fp128_acosh(1) returns 0.
+ * - __nv_fp128_acosh(\p x) returns NaN for \p x in the interval [
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , 1).
+ * - __nv_fp128_acosh( 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_acosh(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_acosh(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \tanh^{-1}{x} \end_cuda_math_formula, the inverse hyperbolic tangent of the input argument.
+ *
+ * \return 
+ * - __nv_fp128_atanh(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_atanh(
+ * \cuda_math_formula \pm 1 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - __nv_fp128_atanh(\p x) returns NaN for \p x outside interval [-1, 1].
+ * - __nv_fp128_atanh(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_atanh(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Truncate input argument to the integral part.
+ *
+ * \return 
+ * Rounded \p x to the nearest integer value in floating-point format, that does not exceed \p x in 
+ * magnitude.
+ * - __nv_fp128_trunc(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_trunc(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - __nv_fp128_trunc(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_trunc(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \lfloor x \rfloor \end_cuda_math_formula, the largest integer less than or equal to \p x.
+ * 
+ * \return
+ * \cuda_math_formula \lfloor x \rfloor \end_cuda_math_formula
+ *  expressed as a floating-point number.
+ * - __nv_fp128_floor(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - __nv_fp128_floor(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_floor(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_floor(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \lceil x \rceil \end_cuda_math_formula, the smallest integer greater than or equal to \p x.
+ * 
+ * \return
+ * \cuda_math_formula \lceil x \rceil \end_cuda_math_formula
+ *  expressed as a floating-point number.
+ * - __nv_fp128_ceil(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - __nv_fp128_ceil(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_ceil(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_ceil(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Round to nearest integer value in floating-point format,
+ * with halfway cases rounded away from zero.
+ *
+ * \return 
+ * - __nv_fp128_round(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_round(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - __nv_fp128_round(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_round(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Round to nearest integer value in floating-point format,
+ * with halfway cases rounded to the nearest even integer value.
+ *
+ * \return 
+ * - __nv_fp128_rint(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_rint(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - __nv_fp128_rint(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_rint(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula |x| \end_cuda_math_formula, the absolute value of the input argument.
+ *
+ * \return
+ * - __nv_fp128_fabs(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_fabs(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns +0.
+ * - __nv_fp128_fabs(NaN) returns an unspecified NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_fabs(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Create value with the magnitude of the first argument \p x, and the sign of the second argument \p y.
+ *
+ * \return
+ * - __nv_fp128_copysign(NaN, \p y) returns a NaN with the sign of \p y.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_copysign(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Determine the maximum numeric value of the arguments.
+ *
+ * \return
+ * The maximum numeric value of the arguments \p x and \p y. Treats NaN 
+ * arguments as missing data.
+ * - If both arguments are NaN, returns NaN.
+ * - If one argument is NaN, returns the numeric argument.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_fmax(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Determine the minimum numeric value of the arguments.
+ *
+ * \return
+ * The minimum numeric value of the arguments \p x and \p y. Treats NaN 
+ * arguments as missing data.
+ * - If both arguments are NaN, returns NaN.
+ * - If one argument is NaN, returns the numeric argument.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_fmin(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Compute the positive difference between \p x and \p y.
+ *
+ * \return
+ * - __nv_fp128_fdim(\p x, \p y) returns \p x - \p y if \cuda_math_formula x > y \end_cuda_math_formula.
+ * - __nv_fp128_fdim(\p x, \p y) returns +0 if \cuda_math_formula x \leq y \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_fdim(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate the floating-point remainder of \p x / \p y.
+ *
+ * \return
+ * The floating-point remainder of the division operation \p x / \p y calculated
+ * by this function is exactly the value <tt>x - n*y</tt>, where \p n is \p x / \p y with its fractional part truncated.
+ * - The computed value will have the same sign as \p x, and its magnitude will be less than the magnitude of \p y.
+ * - __nv_fp128_fmod(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  if \p y is not zero.
+ * - __nv_fp128_fmod(\p x, 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns \p x if \p x is finite.
+ * - __nv_fp128_fmod(\p x, \p y) returns NaN if \p x is 
+ * \cuda_math_formula \pm\infty \end_cuda_math_formula
+ *  or \p y is zero.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_fmod(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Compute the floating-point remainder function.
+ *
+ * \return 
+ * The floating-point remainder \p r of dividing 
+ * \p x by \p y for nonzero \p y is defined as 
+ * \cuda_math_formula r = x - n y \end_cuda_math_formula.
+ * The value \p n is the integer value nearest 
+ * \cuda_math_formula \frac{x}{y} \end_cuda_math_formula. 
+ * In the halfway cases when 
+ * \cuda_math_formula | n -\frac{x}{y} | = \frac{1}{2} \end_cuda_math_formula
+ * , the
+ * even \p n value is chosen.
+ * - __nv_fp128_remainder(\p x,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns NaN.
+ * - __nv_fp128_remainder(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p y) returns NaN.
+ * - __nv_fp128_remainder(\p x, 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns \p x for finite \p x.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_remainder(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Extract mantissa and exponent of the floating-point input argument.
+ * 
+ * Decompose the floating-point value \p x into a component \p m for the 
+ * normalized fraction element and an integral term \p n for the exponent.
+ * The absolute value of \p m will be greater than or equal to 0.5 and 
+ * less than 1.0 or it will be equal to 0; 
+ * \cuda_math_formula x = m\cdot 2^n \end_cuda_math_formula.
+ * The integer exponent \p n will be stored in the location to which \p nptr points.
+ *
+ * \return
+ * The fractional component \p m.
+ * - __nv_fp128_frexp(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p nptr) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  and stores zero in the location pointed to by \p nptr.
+ * - __nv_fp128_frexp(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p nptr) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ *  and stores an unspecified value in the 
+ * location to which \p nptr points.
+ * - __nv_fp128_frexp(NaN, \p nptr) returns a NaN and stores an unspecified value in the location to which \p nptr points.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_frexp(__float128 x, int* nptr) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Break down the input argument into fractional and integral parts.
+ *
+ * Break down the argument \p x into fractional and integral parts. The 
+ * integral part is stored in floating-point format in the location to which \p iptr points.
+ * Fractional and integral parts are given the same sign as the argument \p x.
+ *
+ * \return 
+ * - __nv_fp128_modf(
+ * \cuda_math_formula \pm x \end_cuda_math_formula
+ * , \p iptr) returns a result with the same sign as \p x.
+ * - __nv_fp128_modf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p iptr) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  and stores 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ *   in the object pointed to by \p iptr.
+ * - __nv_fp128_modf(NaN, \p iptr) stores a NaN in the object pointed to by \p iptr and returns a NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_modf(__float128 x, __float128* iptr) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \sqrt{x^2+y^2} \end_cuda_math_formula, the square root of the sum of squares of two arguments.
+ *
+ * \return
+ * The length of the hypotenuse of a right triangle whose two sides have lengths 
+ * \cuda_math_formula |x| \end_cuda_math_formula and \cuda_math_formula |y| \end_cuda_math_formula without undue overflow or underflow.
+ * - __nv_fp128_hypot(\p x,\p y), __nv_fp128_hypot(\p y,\p x), and __nv_fp128_hypot(\p x, \p -y) are equivalent.
+ * - __nv_fp128_hypot(\p x,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) is equivalent to __nv_fp128_fabs(\p x).
+ * - __nv_fp128_hypot(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ,\p y) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula,
+ * even if \p y is a NaN.
+ * - __nv_fp128_hypot(NaN, \p y) returns NaN, when \p y is not \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_hypot(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Compute 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ * as a single operation using round-to-nearest-even rounding mode.
+ *
+ * \return
+ * The value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ * as a single ternary operation, rounded once using round-to-nearest,
+ * ties-to-even rounding mode.
+ * - __nv_fp128_fma(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __nv_fp128_fma(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __nv_fp128_fma(\p x, \p y, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_fma(\p x, \p y, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - __nv_fp128_fma(\p x, \p y, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_fma(\p x, \p y, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_fma(\p x, \p y, \p z) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y + z \end_cuda_math_formula is exactly zero and \cuda_math_formula z \neq 0 \end_cuda_math_formula.
+ * - If any argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_fma(__float128 x, __float128 y, __float128 c) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate the value of 
+ * \cuda_math_formula x\cdot 2^{exp} \end_cuda_math_formula.
+ *
+ * \return
+ * - __nv_fp128_ldexp(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p exp) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_ldexp(\p x, 0) returns \p x.
+ * - __nv_fp128_ldexp(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p exp) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - __nv_fp128_ldexp(NaN, \p exp) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_ldexp(__float128 x, int exp) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Compute the unbiased integer exponent of the input argument.
+ *
+ * \return
+ * - If successful, returns the unbiased exponent of the argument.
+ * - __nv_fp128_ilogb(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns <tt>INT_MIN</tt>.
+ * - __nv_fp128_ilogb(NaN) returns <tt>INT_MIN</tt>.
+ * - __nv_fp128_ilogb(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns <tt>INT_MAX</tt>.
+ * - Note: above behavior does not take into account <tt>FP_ILOGB0</tt> nor <tt>FP_ILOGBNAN</tt>.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ int __nv_fp128_ilogb(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Compute \cuda_math_formula x \cdot y \end_cuda_math_formula, the product of the two floating-point inputs using round-to-nearest-even rounding mode.
+ *
+ * \return Returns \p x * \p y.
+ * - sign of the product \p x * \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __nv_fp128_mul(\p x, \p y) is equivalent to __nv_fp128_mul(\p y, \p x).
+ * - __nv_fp128_mul(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __nv_fp128_mul(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __nv_fp128_mul(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_mul(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Compute \cuda_math_formula x + y \end_cuda_math_formula, the sum of the two floating-point inputs using round-to-nearest-even rounding mode.
+ *
+ * \return Returns \p x + \p y.
+ * - __nv_fp128_add(\p x, \p y) is equivalent to __nv_fp128_add(\p y, \p x).
+ * - __nv_fp128_add(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p x.
+ * - __nv_fp128_add(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __nv_fp128_add(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns NaN.
+ * - __nv_fp128_add(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_add(\p x, \p -x) returns \cuda_math_formula +0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_add(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Compute \cuda_math_formula x - y \end_cuda_math_formula, the difference of the two floating-point inputs using round-to-nearest-even rounding mode.
+ *
+ * \return Returns \p x - \p y.
+ * - __nv_fp128_sub(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p y.
+ * - __nv_fp128_sub(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \mp\infty \end_cuda_math_formula for finite \p x.
+ * - __nv_fp128_sub(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __nv_fp128_sub(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __nv_fp128_sub(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_sub(\p x, \p x) returns \cuda_math_formula +0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_sub(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Compute \cuda_math_formula \frac{x}{y} \end_cuda_math_formula, the quotient of the two floating-point inputs using round-to-nearest-even rounding mode.
+ *
+ * \return
+ * - sign of the quotient \p x / \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __nv_fp128_div(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns NaN.
+ * - __nv_fp128_div(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __nv_fp128_div(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p x.
+ * - __nv_fp128_div(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - __nv_fp128_div(\p x, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __nv_fp128_div(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for \p y \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_div(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Determine whether the input argument is a NaN.
+ *
+ * \return
+ * A nonzero value if and only if \p x is a NaN value.
+ *
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ int __nv_fp128_isnan(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Determine whether the pair of inputs is unordered.
+ *
+ * \return
+ * - nonzero value if at least one of the input values is a NaN.
+ * - zero otherwise
+ *
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ int __nv_fp128_isunordered(__float128 x, __float128 y) __DEF_IF_HOST
+#endif /* __FLOAT128_CPP_SPELLING_ENABLED__ */
+
+
+#ifdef __FLOAT128_C_SPELLING_ENABLED__
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_sqrt(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_sin(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_cos(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_tan(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_asin(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_acos(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_atan(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_exp(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_exp2(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_exp10(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_expm1(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_log(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_log2(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_log10(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_log1p(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_pow(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_sinh(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_cosh(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_tanh(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_asinh(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_acosh(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_atanh(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_trunc(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_floor(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_ceil(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_round(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_rint(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_fabs(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_copysign(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_fmax(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_fmin(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_fdim(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_fmod(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_remainder(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_frexp(_Float128 x, int* nptr) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_modf(_Float128 x, _Float128* iptr) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_hypot(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_fma(_Float128 x, _Float128 y, _Float128 c) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_ldexp(_Float128 x, int exp) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ int __nv_fp128_ilogb(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_mul(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_add(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_sub(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_div(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ int __nv_fp128_isnan(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ int __nv_fp128_isunordered(_Float128 x, _Float128 y) __DEF_IF_HOST
+#endif /* __FLOAT128_C_SPELLING_ENABLED__ */
+
+
+#undef __DEVICE_FP128_FUNCTIONS_DECL__
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#endif /* !__DEVICE_FP128_FUNCTIONS_H__ */
+
+#endif /* !__NV_DISABLE_DEVICE_FP128_FUNCTIONS__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae9de40d680c6e50c25b0c4a01c00679bd0c8fe4
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_functions.h
@@ -0,0 +1,2993 @@
+/*
+ * Copyright 1993-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/device_functions.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/device_functions.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H__
+#endif
+
+#if !defined(__DEVICE_FUNCTIONS_H__)
+#define __DEVICE_FUNCTIONS_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if defined(__CUDACC_RTC__)
+#define __DEVICE_FUNCTIONS_DECL__ __device__ __cudart_builtin__
+#define __DEVICE_FUNCTIONS_STATIC_DECL__ __device__ __cudart_builtin__
+#define __DEVICE_HOST_FUNCTIONS_STATIC_DECL__ __device__ __host__ __cudart_builtin__
+#else
+#define __DEVICE_FUNCTIONS_DECL__ __device__ __cudart_builtin__
+#define __DEVICE_FUNCTIONS_STATIC_DECL__ static __inline__ __device__ __cudart_builtin__
+#define __DEVICE_HOST_FUNCTIONS_STATIC_DECL__ static __inline__ __device__ __host__ __cudart_builtin__
+#endif /* __CUDACC_RTC__ */
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+extern "C"
+{
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate the most significant 32 bits of the product of the two 32-bit integers.
+ *
+ * Calculate the most significant 32 bits of the 64-bit product \p x * \p y, where \p x and \p y
+ * are 32-bit integers.
+ *
+ * \return Returns the most significant 32 bits of the product \p x * \p y.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __mulhi(int x, int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate the most significant 32 bits of the product of the two 32-bit unsigned integers.
+ *
+ * Calculate the most significant 32 bits of the 64-bit product \p x * \p y, where \p x and \p y
+ * are 32-bit unsigned integers. 
+ *
+ * \return Returns the most significant 32 bits of the product \p x * \p y.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __umulhi(unsigned int x, unsigned int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate the most significant 64 bits of the product of the two 64-bit integers.
+ *
+ * Calculate the most significant 64 bits of the 128-bit product \p x * \p y, where \p x and \p y
+ * are 64-bit integers. 
+ *
+ * \return Returns the most significant 64 bits of the product \p x * \p y.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          __mul64hi(long long int x, long long int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate the most significant 64 bits of the product of the two 64-bit unsigned integers.
+ *
+ * Calculate the most significant 64 bits of the 128-bit product \p x * \p y, where \p x and \p y
+ * are 64-bit unsigned integers. 
+ *
+ * \return Returns the most significant 64 bits of the product \p x * \p y.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret bits in an integer as a float.
+ *
+ * Reinterpret the bits in the signed integer value \p x as a single-precision
+ * floating-point value.
+ * \return Returns reinterpreted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __int_as_float(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret bits in a float as a signed integer.
+ *
+ * Reinterpret the bits in the single-precision floating-point value \p x
+ * as a signed integer.
+ * \return Returns reinterpreted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __float_as_int(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret bits in an unsigned integer as a float.
+ *
+ * Reinterpret the bits in the unsigned integer value \p x as a single-precision
+ * floating-point value.
+ * \return Returns reinterpreted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __uint_as_float(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret bits in a float as an unsigned integer.
+ *
+ * Reinterpret the bits in the single-precision floating-point value \p x
+ * as an unsigned integer.
+ * \return Returns reinterpreted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __float_as_uint(float x);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   __syncthreads(void);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   __prof_trigger(int);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   __threadfence(void);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   __threadfence_block(void);
+__DEVICE_FUNCTIONS_DECL__ 
+#if defined(__GNUC__) || defined(__CUDACC_RTC__)
+__attribute__((__noreturn__))
+#elif defined(_MSC_VER)
+__declspec(noreturn)
+#endif  /* defined(__GNUC__) || defined(__CUDACC_RTC__) */
+__device_builtin__ void                   __trap(void);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   __brkpt();
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Clamp the input argument to [+0.0, 1.0].
+ *
+ * Clamp the input argument \p x to be within the interval [+0.0, 1.0].
+ * \return 
+ * - __saturatef(\p x) returns +0 if \cuda_math_formula x \le 0 \end_cuda_math_formula.
+ * - __saturatef(\p x) returns 1 if \cuda_math_formula x \ge 1 \end_cuda_math_formula.
+ * - __saturatef(\p x) returns \p x if \cuda_math_formula 0 < x < 1 \end_cuda_math_formula.
+ * - __saturatef(NaN) returns +0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __saturatef(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate 
+ * \cuda_math_formula |x - y| + z \end_cuda_math_formula
+ * , the sum of absolute difference.
+ *
+ * Calculate 
+ * \cuda_math_formula |x - y| + z \end_cuda_math_formula
+ * , the 32-bit sum of the third argument \p z and the absolute
+ * value of the difference between the first argument, \p x, and second 
+ * argument, \p y.
+ * 
+ * Inputs \p x and \p y are signed 32-bit integers, input \p z is 
+ * a 32-bit unsigned integer.
+ *
+ * \return Returns 
+ * \cuda_math_formula |x - y| + z \end_cuda_math_formula.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __sad(int x, int y, unsigned int z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate 
+ * \cuda_math_formula |x - y| + z \end_cuda_math_formula
+ * , the sum of absolute difference.
+ *
+ * Calculate 
+ * \cuda_math_formula |x - y| + z \end_cuda_math_formula
+ * , the 32-bit sum of the third argument \p z and the absolute
+ * value of the difference between the first argument, \p x, and second 
+ * argument, \p y.
+ * 
+ * Inputs \p x, \p y, and \p z are unsigned 32-bit integers.
+ * 
+ * \return Returns 
+ * \cuda_math_formula |x - y| + z \end_cuda_math_formula.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __usad(unsigned int x, unsigned int y, unsigned int z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate the least significant 32 bits of the product of the least significant 24 bits of two integers.
+ *
+ * Calculate the least significant 32 bits of the product of the least significant 24 bits of \p x and \p y.
+ * The high order 8 bits of \p x and \p y are ignored.
+ *
+ * \return Returns the least significant 32 bits of the product \p x * \p y.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __mul24(int x, int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate the least significant 32 bits of the product of the least significant 24 bits of two unsigned integers.
+ *
+ * Calculate the least significant 32 bits of the product of the least significant 24 bits of \p x and \p y.
+ * The high order 8 bits of  \p x and  \p y are ignored. 
+ *
+ * \return Returns the least significant 32 bits of the product \p x * \p y.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __umul24(unsigned int x, unsigned int y);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Divide two floating-point values.
+ *
+ * Compute \p x divided by \p y.
+ *
+ * \return Returns \p x / \p y.
+ * - Follows the regular division operation behavior by default.
+ * - If \p -use_fast_math is specified and is not amended by
+ * an explicit \p -prec_div=true, uses ::__fdividef() for higher
+ * performance
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  fdividef(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate division of the input arguments.
+ *
+ * Calculate the fast approximate division of \p x by \p y.
+ *
+ * \return Returns \p x / \p y.
+ * - __fdividef(
+ * \cuda_math_formula \infty \end_cuda_math_formula
+ * , \p y) returns NaN for 
+ * \cuda_math_formula 2^{126} < |y| < 2^{128} \end_cuda_math_formula.
+ * - __fdividef(\p x, \p y) returns 0 for 
+ * \cuda_math_formula 2^{126} < |y| < 2^{128} \end_cuda_math_formula
+ *  and finite
+ * \cuda_math_formula x \end_cuda_math_formula.
+ * \see __fdiv_rn() for further special case behavior specification.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fdividef(float x, float y);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 fdivide(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate sine of the input argument.
+ *
+ * Calculate the fast approximate sine of the input argument \p x, measured in radians.
+ *
+ * \return Returns the approximate sine of \p x.
+ *
+ * \see sinf() for further special case behavior specification.
+ * \note_accuracy_single_intrinsic
+ * \note Output in the denormal range is flushed to sign preserving 0.0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __sinf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate cosine of the input argument.
+ *
+ * Calculate the fast approximate cosine of the input argument \p x, measured in radians.
+ *
+ * \return Returns the approximate cosine of \p x.
+ *
+ * \see cosf() for further special case behavior specification.
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __cosf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate tangent of the input argument.
+ *
+ * Calculate the fast approximate tangent of the input argument \p x, measured in radians.
+ *
+ * \return Returns the approximate tangent of \p x.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note The result is computed as the fast divide of ::__sinf()
+ * by ::__cosf(). Denormal output is flushed to sign-preserving 0.0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __tanf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate hyperbolic tangent of the input argument.
+ *
+ * Calculate the fast approximate hyperbolic tangent of the input argument \p x, measured in radians.
+ *
+ * \return Returns the approximate hyperbolic tangent of \p x.
+ *
+ * \see tanhf() for further special case behavior specification.
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __tanhf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximation of the sine and cosine of the first input argument.
+ *
+ * Calculate the fast approximation of the sine and cosine of the first input argument \p x (measured
+ * in radians). The results for sine and cosine are written into the second
+ * argument, \p sptr, and the third argument, \p cptr, respectively.
+ *
+ * \see __sinf() and __cosf().
+ * \note_accuracy_single_intrinsic
+ * \note Denorm input/output is flushed to sign preserving 0.0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ void                   __sincosf(float x, float *sptr, float *cptr) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  exponential of the input argument.
+ *
+ * Calculate the fast approximate base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  exponential of the input argument \p x, 
+ * \cuda_math_formula e^x \end_cuda_math_formula.
+ *
+ * \return Returns an approximation to 
+ * \cuda_math_formula e^x \end_cuda_math_formula.
+ * \see expf() for further special case behavior specification.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __expf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate base 10 exponential of the input argument.
+ *
+ * Calculate the fast approximate base 10 exponential of the input argument \p x, 
+ * \cuda_math_formula 10^x \end_cuda_math_formula.
+ *
+ * \return Returns an approximation to 
+ * \cuda_math_formula 10^x \end_cuda_math_formula.
+ * \see exp10f() for further special case behavior specification.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __exp10f(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate base 2 logarithm of the input argument.
+ *
+ * Calculate the fast approximate base 2 logarithm of the input argument \p x.
+ *
+ * \return Returns an approximation to 
+ * \cuda_math_formula \log_2(x) \end_cuda_math_formula.
+ * \see log2f() for further special case behavior specification.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __log2f(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate base 10 logarithm of the input argument.
+ *
+ * Calculate the fast approximate base 10 logarithm of the input argument \p x.
+ *
+ * \return Returns an approximation to 
+ * \cuda_math_formula \log_{10}(x) \end_cuda_math_formula.
+ * \see log10f() for further special case behavior specification.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __log10f(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  logarithm of the input argument.
+ *
+ * Calculate the fast approximate base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  logarithm of the input argument \p x.
+ *
+ * \return Returns an approximation to 
+ * \cuda_math_formula \log_e(x) \end_cuda_math_formula.
+ * \see logf() for further special case behavior specification.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __logf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximation of
+ * \cuda_math_formula x^y \end_cuda_math_formula.
+ *
+ * Calculate the fast approximation of \p x, the first input argument,
+ * raised to the power of \p y, the second input argument, 
+ * \cuda_math_formula x^y \end_cuda_math_formula.
+ *
+ * \return Returns an approximation to 
+ * \cuda_math_formula x^y \end_cuda_math_formula.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __powf(float x, float y) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed integer in round-to-nearest-even mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed integer
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __float2int_rn(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed integer in round-towards-zero mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed integer
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __float2int_rz(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed integer in round-up mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed integer
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __float2int_ru(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed integer in round-down mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed integer
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __float2int_rd(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned integer in round-to-nearest-even mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned integer
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __float2uint_rn(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned integer in round-towards-zero mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned integer
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __float2uint_rz(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned integer in round-up mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned integer
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __float2uint_ru(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned integer in round-down mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned integer
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __float2uint_rd(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed integer to a float in round-to-nearest-even mode.
+ *
+ * Convert the signed integer value \p x to a single-precision floating-point value
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __int2float_rn(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed integer to a float in round-towards-zero mode.
+ *
+ * Convert the signed integer value \p x to a single-precision floating-point value
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __int2float_rz(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed integer to a float in round-up mode.
+ *
+ * Convert the signed integer value \p x to a single-precision floating-point value
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __int2float_ru(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed integer to a float in round-down mode.
+ *
+ * Convert the signed integer value \p x to a single-precision floating-point value
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __int2float_rd(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned integer to a float in round-to-nearest-even mode.
+ *
+ * Convert the unsigned integer value \p x to a single-precision floating-point value
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __uint2float_rn(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned integer to a float in round-towards-zero mode.
+ *
+ * Convert the unsigned integer value \p x to a single-precision floating-point value
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __uint2float_rz(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned integer to a float in round-up mode.
+ *
+ * Convert the unsigned integer value \p x to a single-precision floating-point value
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __uint2float_ru(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned integer to a float in round-down mode.
+ *
+ * Convert the unsigned integer value \p x to a single-precision floating-point value
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __uint2float_rd(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed 64-bit integer in round-to-nearest-even mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed 64-bit integer
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          __float2ll_rn(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed 64-bit integer in round-towards-zero mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed 64-bit integer
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          __float2ll_rz(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed 64-bit integer in round-up mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed 64-bit integer
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          __float2ll_ru(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed 64-bit integer in round-down mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed 64-bit integer
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          __float2ll_rd(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned 64-bit integer in round-to-nearest-even mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned 64-bit integer
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __float2ull_rn(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned 64-bit integer in round-towards-zero mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned 64-bit integer
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __float2ull_rz(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned 64-bit integer in round-up mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned 64-bit integer
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __float2ull_ru(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned 64-bit integer in round-down mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned 64-bit integer
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __float2ull_rd(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit integer to a float in round-to-nearest-even mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a single-precision floating-point value
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ll2float_rn(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit integer to a float in round-towards-zero mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a single-precision floating-point value
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ll2float_rz(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit integer to a float in round-up mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a single-precision floating-point value
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ll2float_ru(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit integer to a float in round-down mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a single-precision floating-point value
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ll2float_rd(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit integer to a float in round-to-nearest-even mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a single-precision floating-point value
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ull2float_rn(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit integer to a float in round-towards-zero mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a single-precision floating-point value
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ull2float_rz(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit integer to a float in round-up mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a single-precision floating-point value
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ull2float_ru(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit integer to a float in round-down mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a single-precision floating-point value
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ull2float_rd(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Add two floating-point values in round-to-nearest-even mode.
+ * 
+ * Compute the sum of \p x and \p y in round-to-nearest-even rounding mode.
+ *
+ * \return Returns \p x + \p y.
+ * - __fadd_rn(\p x, \p y) is equivalent to __fadd_rn(\p y, \p x).
+ * - __fadd_rn(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p x.
+ * - __fadd_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __fadd_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns NaN.
+ * - __fadd_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fadd_rn(\p x, \p -x) returns \cuda_math_formula +0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fadd_rn(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Add two floating-point values in round-towards-zero mode.
+ * 
+ * Compute the sum of \p x and \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x + \p y.
+ * - __fadd_rz(\p x, \p y) is equivalent to __fadd_rz(\p y, \p x).
+ * - __fadd_rz(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p x.
+ * - __fadd_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __fadd_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns NaN.
+ * - __fadd_rz(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fadd_rz(\p x, \p -x) returns \cuda_math_formula +0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fadd_rz(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Add two floating-point values in round-up mode.
+ * 
+ * Compute the sum of \p x and \p y in round-up (to positive infinity) mode.
+ *
+ * \return Returns \p x + \p y.
+ * - __fadd_ru(\p x, \p y) is equivalent to __fadd_ru(\p y, \p x).
+ * - __fadd_ru(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p x.
+ * - __fadd_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __fadd_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns NaN.
+ * - __fadd_ru(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fadd_ru(\p x, \p -x) returns \cuda_math_formula +0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fadd_ru(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Add two floating-point values in round-down mode.
+ * 
+ * Compute the sum of \p x and \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x + \p y.
+ * - __fadd_rd(\p x, \p y) is equivalent to __fadd_rd(\p y, \p x).
+ * - __fadd_rd(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p x.
+ * - __fadd_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __fadd_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns NaN.
+ * - __fadd_rd(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fadd_rd(\p x, \p -x) returns \cuda_math_formula -0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fadd_rd(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Subtract two floating-point values in round-to-nearest-even mode.
+ * 
+ * Compute the difference of \p x and \p y in round-to-nearest-even rounding mode.
+ *
+ * \return Returns \p x - \p y.
+ * - __fsub_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p y.
+ * - __fsub_rn(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \mp\infty \end_cuda_math_formula for finite \p x.
+ * - __fsub_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fsub_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __fsub_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fsub_rn(\p x, \p x) returns \cuda_math_formula +0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsub_rn(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Subtract two floating-point values in round-towards-zero mode.
+ * 
+ * Compute the difference of \p x and \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x - \p y.
+ * - __fsub_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p y.
+ * - __fsub_rz(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \mp\infty \end_cuda_math_formula for finite \p x.
+ * - __fsub_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fsub_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __fsub_rz(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fsub_rz(\p x, \p x) returns \cuda_math_formula +0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsub_rz(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Subtract two floating-point values in round-up mode.
+ * 
+ * Compute the difference of \p x and \p y in round-up (to positive infinity) mode.
+ *
+ * \return Returns \p x - \p y.
+ * - __fsub_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p y.
+ * - __fsub_ru(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \mp\infty \end_cuda_math_formula for finite \p x.
+ * - __fsub_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fsub_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __fsub_ru(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fsub_ru(\p x, \p x) returns \cuda_math_formula +0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsub_ru(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Subtract two floating-point values in round-down mode.
+ * 
+ * Compute the difference of \p x and \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x - \p y.
+ * - __fsub_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p y.
+ * - __fsub_rd(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \mp\infty \end_cuda_math_formula for finite \p x.
+ * - __fsub_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fsub_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __fsub_rd(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fsub_rd(\p x, \p x) returns \cuda_math_formula -0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsub_rd(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Multiply two floating-point values in round-to-nearest-even mode.
+ * 
+ * Compute the product of \p x and \p y in round-to-nearest-even mode.
+ *
+ * \return Returns \p x * \p y.
+ * - sign of the product \p x * \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __fmul_rn(\p x, \p y) is equivalent to __fmul_rn(\p y, \p x).
+ * - __fmul_rn(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __fmul_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fmul_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmul_rn(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Multiply two floating-point values in round-towards-zero mode.
+ * 
+ * Compute the product of \p x and \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x * \p y.
+ * - sign of the product \p x * \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __fmul_rz(\p x, \p y) is equivalent to __fmul_rz(\p y, \p x).
+ * - __fmul_rz(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __fmul_rz(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fmul_rz(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmul_rz(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Multiply two floating-point values in round-up mode.
+ * 
+ * Compute the product of \p x and \p y in round-up (to positive infinity) mode.
+ *
+ * \return Returns \p x * \p y.
+ * - sign of the product \p x * \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __fmul_ru(\p x, \p y) is equivalent to __fmul_ru(\p y, \p x).
+ * - __fmul_ru(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __fmul_ru(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fmul_ru(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmul_ru(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Multiply two floating-point values in round-down mode.
+ * 
+ * Compute the product of \p x and \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x * \p y.
+ * - sign of the product \p x * \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __fmul_rd(\p x, \p y) is equivalent to __fmul_rd(\p y, \p x).
+ * - __fmul_rd(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __fmul_rd(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fmul_rd(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmul_rd(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation, in round-to-nearest-even mode.
+ * 
+ * Computes the value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single ternary operation, rounding the
+ * result once in round-to-nearest-even mode.
+ *
+ * \return Returns the rounded value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation.
+ * - __fmaf_rn(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fmaf_rn(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fmaf_rn(\p x, \p y, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __fmaf_rn(\p x, \p y, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - __fmaf_rn(\p x, \p y, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fmaf_rn(\p x, \p y, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fmaf_rn(\p x, \p y, \p z) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y + z \end_cuda_math_formula is exactly zero and \cuda_math_formula z \neq 0 \end_cuda_math_formula.
+ * - If any argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmaf_rn(float x, float y, float z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation, in round-towards-zero mode.
+ * 
+ * Computes the value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single ternary operation, rounding the
+ * result once in round-towards-zero mode.
+ *
+ * \return Returns the rounded value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation.
+ * - __fmaf_rz(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fmaf_rz(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fmaf_rz(\p x, \p y, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __fmaf_rz(\p x, \p y, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - __fmaf_rz(\p x, \p y, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fmaf_rz(\p x, \p y, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fmaf_rz(\p x, \p y, \p z) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y + z \end_cuda_math_formula is exactly zero and \cuda_math_formula z \neq 0 \end_cuda_math_formula.
+ * - If any argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmaf_rz(float x, float y, float z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation, in round-up mode.
+ * 
+ * Computes the value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single ternary operation, rounding the
+ * result once in round-up (to positive infinity) mode.
+ *
+ * \return Returns the rounded value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation.
+ * - __fmaf_ru(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fmaf_ru(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fmaf_ru(\p x, \p y, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __fmaf_ru(\p x, \p y, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - __fmaf_ru(\p x, \p y, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fmaf_ru(\p x, \p y, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fmaf_ru(\p x, \p y, \p z) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y + z \end_cuda_math_formula is exactly zero and \cuda_math_formula z \neq 0 \end_cuda_math_formula.
+ * - If any argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmaf_ru(float x, float y, float z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation, in round-down mode.
+ * 
+ * Computes the value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single ternary operation, rounding the
+ * result once in round-down (to negative infinity) mode.
+ *
+ * \return Returns the rounded value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation.
+ * - __fmaf_rd(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fmaf_rd(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fmaf_rd(\p x, \p y, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __fmaf_rd(\p x, \p y, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - __fmaf_rd(\p x, \p y, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fmaf_rd(\p x, \p y, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula -0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fmaf_rd(\p x, \p y, \p z) returns \cuda_math_formula -0 \end_cuda_math_formula if \cuda_math_formula x \times y + z \end_cuda_math_formula is exactly zero and \cuda_math_formula z \neq 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmaf_rd(float x, float y, float z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula
+ *  in round-to-nearest-even mode.
+ * 
+ * Compute the reciprocal of \p x in round-to-nearest-even mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula.
+ * - __frcp_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __frcp_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __frcp_rn(NaN) returns NaN.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __frcp_rn(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula
+ *  in round-towards-zero mode.
+ * 
+ * Compute the reciprocal of \p x in round-towards-zero mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula.
+ * - __frcp_rz(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __frcp_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __frcp_rz(NaN) returns NaN.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __frcp_rz(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula
+ *  in round-up mode.
+ * 
+ * Compute the reciprocal of \p x in round-up (to positive infinity) mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula.
+ * - __frcp_ru(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __frcp_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __frcp_ru(NaN) returns NaN.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __frcp_ru(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula
+ *  in round-down mode.
+ * 
+ * Compute the reciprocal of \p x in round-down (to negative infinity) mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula.
+ * - __frcp_rd(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __frcp_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __frcp_rd(NaN) returns NaN.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __frcp_rd(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula
+ *  in round-to-nearest-even mode.
+ * 
+ * Compute the square root of \p x in round-to-nearest-even mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula.
+ * - __fsqrt_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fsqrt_rn(\cuda_math_formula +\infty \end_cuda_math_formula) returns \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __fsqrt_rn(\p x) returns NaN for \p x < 0.
+ * - __fsqrt_rn(NaN) returns NaN.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsqrt_rn(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula
+ *  in round-towards-zero mode.
+ * 
+ * Compute the square root of \p x in round-towards-zero mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula.
+ * - __fsqrt_rz(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fsqrt_rz(\cuda_math_formula +\infty \end_cuda_math_formula) returns \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __fsqrt_rz(\p x) returns NaN for \p x < 0.
+ * - __fsqrt_rz(NaN) returns NaN.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsqrt_rz(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula
+ *  in round-up mode.
+ * 
+ * Compute the square root of \p x in round-up (to positive infinity) mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula.
+ * - __fsqrt_ru(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fsqrt_ru(\cuda_math_formula +\infty \end_cuda_math_formula) returns \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __fsqrt_ru(\p x) returns NaN for \p x < 0.
+ * - __fsqrt_ru(NaN) returns NaN.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsqrt_ru(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula
+ *  in round-down mode.
+ * 
+ * Compute the square root of \p x in round-down (to negative infinity) mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula.
+ * - __fsqrt_rd(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fsqrt_rd(\cuda_math_formula +\infty \end_cuda_math_formula) returns \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __fsqrt_rd(\p x) returns NaN for \p x < 0.
+ * - __fsqrt_rd(NaN) returns NaN.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsqrt_rd(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute
+ * \cuda_math_formula 1/\sqrt{x} \end_cuda_math_formula
+ *  in round-to-nearest-even mode.
+ * 
+ * Compute the reciprocal square root of \p x in round-to-nearest-even mode.
+ *
+ * \return Returns
+ * \cuda_math_formula 1/\sqrt{x} \end_cuda_math_formula.
+ * - __frsqrt_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __frsqrt_rn(\cuda_math_formula +\infty \end_cuda_math_formula) returns \cuda_math_formula +0 \end_cuda_math_formula.
+ * - __frsqrt_rn(\p x) returns NaN for \p x < 0.
+ * - __frsqrt_rn(NaN) returns NaN.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __frsqrt_rn(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Divide two floating-point values in round-to-nearest-even mode.
+ *
+ * Divide two floating-point values \p x by \p y in round-to-nearest-even mode.
+ *
+ * \return Returns \p x / \p y.
+ * - sign of the quotient \p x / \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __fdiv_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns NaN.
+ * - __fdiv_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fdiv_rn(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p x.
+ * - __fdiv_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - __fdiv_rn(\p x, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __fdiv_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for \p y \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fdiv_rn(float x, float y);
+/**      
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Divide two floating-point values in round-towards-zero mode.
+ *
+ * Divide two floating-point values \p x by \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x / \p y.
+ * - sign of the quotient \p x / \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __fdiv_rz(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns NaN.
+ * - __fdiv_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fdiv_rz(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p x.
+ * - __fdiv_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - __fdiv_rz(\p x, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __fdiv_rz(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for \p y \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fdiv_rz(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Divide two floating-point values in round-up mode.
+ * 
+ * Divide two floating-point values \p x by \p y in round-up (to positive infinity) mode.
+ *    
+ * \return Returns \p x / \p y.
+ * - sign of the quotient \p x / \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __fdiv_ru(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns NaN.
+ * - __fdiv_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fdiv_ru(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p x.
+ * - __fdiv_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - __fdiv_ru(\p x, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __fdiv_ru(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for \p y \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fdiv_ru(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Divide two floating-point values in round-down mode.
+ *
+ * Divide two floating-point values \p x by \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x / \p y.
+ * - sign of the quotient \p x / \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __fdiv_rd(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns NaN.
+ * - __fdiv_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fdiv_rd(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p x.
+ * - __fdiv_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - __fdiv_rd(\p x, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __fdiv_rd(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for \p y \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fdiv_rd(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Return the number of consecutive high-order zero bits in a 32-bit integer.
+ *
+ * Count the number of consecutive leading zero bits, starting at the most significant bit (bit 31) of \p x.
+ *
+ * \return Returns a value between 0 and 32 inclusive representing the number of zero bits.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __clz(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Find the position of the least significant bit set to 1 in a 32-bit integer.
+ *
+ * Find the position of the first (least significant) bit set to 1 in \p x, where the least significant
+ * bit position is 1. 
+ *
+ * \return Returns a value between 0 and 32 inclusive representing the position of the first bit set.
+ * - __ffs(0) returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __ffs(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Count the number of bits that are set to 1 in a 32-bit integer.
+ *
+ * Count the number of bits that are set to 1 in \p x.
+ *
+ * \return Returns a value between 0 and 32 inclusive representing the number of set bits.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __popc(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Reverse the bit order of a 32-bit unsigned integer.
+ *
+ * Reverses the bit order of the 32-bit unsigned integer \p x.
+ *
+ * \return Returns the bit-reversed value of \p x, i.e., bit N of the return value corresponds to bit 31-N of \p x.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __brev(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Count the number of consecutive high-order zero bits in a 64-bit integer.
+ *
+ * Count the number of consecutive leading zero bits, starting at the most significant bit (bit 63) of \p x.
+ *
+ * \return Returns a value between 0 and 64 inclusive representing the number of zero bits.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __clzll(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Find the position of the least significant bit set to 1 in a 64-bit integer.
+ *
+ * Find the position of the first (least significant) bit set to 1 in \p x, where the least significant
+ * bit position is 1. 
+ *
+ * \return Returns a value between 0 and 64 inclusive representing the position of the first bit set.
+ * - __ffsll(0) returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __ffsll(long long int x);
+
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Count the number of bits that are set to 1 in a 64-bit integer.
+ *
+ * Count the number of bits that are set to 1 in \p x.
+ *
+ * \return Returns a value between 0 and 64 inclusive representing the number of set bits.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __popcll(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Reverse the bit order of a 64-bit unsigned integer.
+ *
+ * Reverses the bit order of the 64-bit unsigned integer \p x.
+ *
+ * \return Returns the bit-reversed value of \p x, i.e., bit N of the return value corresponds to bit 63-N of \p x.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __brevll(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Return selected bytes from two 32-bit unsigned integers.
+ *
+ * \return Returns a 32-bit integer consisting of four bytes from eight input bytes provided in the two
+ * input integers \p x and \p y, as specified by a selector, \p s.
+ *
+ * Create 8-byte source
+ * - uint64_t \p tmp64 = ((uint64_t)\p y << 32) | \p x;
+ *
+ * Extract selector bits
+ * - \p selector0 = (\p s >>  0) & 0x7;
+ * - \p selector1 = (\p s >>  4) & 0x7;
+ * - \p selector2 = (\p s >>  8) & 0x7;
+ * - \p selector3 = (\p s >> 12) & 0x7;
+ *
+ * Return 4 selected bytes from 8-byte source:
+ * - \p res[07:00] = \p tmp64[\p selector0];
+ * - \p res[15:08] = \p tmp64[\p selector1];
+ * - \p res[23:16] = \p tmp64[\p selector2];
+ * - \p res[31:24] = \p tmp64[\p selector3];
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __byte_perm(unsigned int x, unsigned int y, unsigned int s);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Compute average of signed input arguments, avoiding overflow
+ * in the intermediate sum.
+ *
+ * Compute average of signed input arguments \p x and \p y 
+ * as ( \p x + \p y ) >> 1, avoiding overflow in the intermediate sum.
+ *
+ * \return Returns a signed integer value representing the signed 
+ * average value of the two inputs.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __hadd(int x, int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Compute rounded average of signed input arguments, avoiding
+ * overflow in the intermediate sum.
+ *
+ * Compute average of signed input arguments \p x and \p y 
+ * as ( \p x + \p y + 1 ) >> 1, avoiding overflow in the intermediate
+ * sum.
+ *
+ * \return Returns a signed integer value representing the signed 
+ * rounded average value of the two inputs.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __rhadd(int x, int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Compute average of unsigned input arguments, avoiding overflow
+ * in the intermediate sum.
+ *
+ * Compute average of unsigned input arguments \p x and \p y 
+ * as ( \p x + \p y ) >> 1, avoiding overflow in the intermediate sum.
+ *
+ * \return Returns an unsigned integer value representing the unsigned 
+ * average value of the two inputs.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __uhadd(unsigned int x, unsigned int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Compute rounded average of unsigned input arguments, avoiding
+ * overflow in the intermediate sum.
+ *
+ * Compute average of unsigned input arguments \p x and \p y 
+ * as ( \p x + \p y + 1 ) >> 1, avoiding overflow in the intermediate
+ * sum.
+ *
+ * \return Returns an unsigned integer value representing the unsigned 
+ * rounded average value of the two inputs.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __urhadd(unsigned int x, unsigned int y);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed int in round-towards-zero mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed integer value in round-towards-zero mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __double2int_rz(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned int in round-towards-zero mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned integer value in round-towards-zero mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __double2uint_rz(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed 64-bit int in round-towards-zero mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed 64-bit integer value in round-towards-zero mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          __double2ll_rz(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned 64-bit int in round-towards-zero mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned 64-bit integer value in round-towards-zero mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __double2ull_rz(double x);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __pm0(void);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __pm1(void);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __pm2(void);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __pm3(void);
+
+/*******************************************************************************
+ *                                                                             *
+ *                        FP16 SIMD functions                                  *
+ *                                                                             *
+ *******************************************************************************/
+
+ //  #include "fp16.h"
+
+
+/*******************************************************************************
+ *                                                                             *
+ *                                SIMD functions                               *
+ *                                                                             *
+ *******************************************************************************/
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-halfword absolute value: |a|.
+ *
+ * Splits 4 bytes of argument into 2 parts, each consisting of 2 bytes,
+ * then computes absolute value for each of parts.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabs2(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-halfword absolute value with signed saturation: |a|.
+ *
+ * Splits 4 bytes of argument into 2 parts, each consisting of 2 bytes,
+ * then computes absolute value with signed saturation for each of parts.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsss2(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword (un)signed addition, with wrap-around: a + b.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes,
+ * then performs unsigned addition on corresponding parts.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vadd2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword addition with signed saturation: a + b.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes,
+ * then performs addition with signed saturation on corresponding parts.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vaddss2 (unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword addition with unsigned saturation: a + b.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes,
+ * then performs addition with unsigned saturation on corresponding parts.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vaddus2 (unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed rounded average computation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes,
+ * then computes signed rounded average of corresponding parts. Partial results are
+ * recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vavgs2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned rounded average computation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes,
+ * then computes unsigned rounded average of corresponding parts. Partial results are
+ * recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vavgu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned average computation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes,
+ * then computes unsigned average of corresponding parts. Partial results are
+ * recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vhaddu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword (un)signed comparison: a == b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if they are equal, and 0000 otherwise.
+ * For example __vcmpeq2(0x1234aba5, 0x1234aba6) returns 0xffff0000.
+ * \return Returns 0xffff if a == b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpeq2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison: a >= b ? 0xffff : 0.
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part >= 'b' part, and 0000 otherwise.
+ * For example __vcmpges2(0x1234aba5, 0x1234aba6) returns 0xffff0000.
+ * \return Returns 0xffff if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpges2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison: a >= b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part >= 'b' part, and 0000 otherwise.
+ * For example __vcmpgeu2(0x1234aba5, 0x1234aba6) returns 0xffff0000.
+ * \return Returns 0xffff if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgeu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison: a > b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part > 'b' part, and 0000 otherwise.
+ * For example __vcmpgts2(0x1234aba5, 0x1234aba6) returns 0x00000000.
+ * \return Returns 0xffff if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgts2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison: a > b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part > 'b' part, and 0000 otherwise.
+ * For example __vcmpgtu2(0x1234aba5, 0x1234aba6) returns 0x00000000.
+ * \return Returns 0xffff if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgtu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison: a <= b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part <= 'b' part, and 0000 otherwise.
+ * For example __vcmples2(0x1234aba5, 0x1234aba6) returns 0xffffffff.
+ * \return Returns 0xffff if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmples2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison: a <= b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part <= 'b' part, and 0000 otherwise.
+ * For example __vcmpleu2(0x1234aba5, 0x1234aba6) returns 0xffffffff.
+ * \return Returns 0xffff if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpleu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison: a < b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part < 'b' part, and 0000 otherwise.
+ * For example __vcmplts2(0x1234aba5, 0x1234aba6) returns 0x0000ffff.
+ * \return Returns 0xffff if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmplts2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison: a < b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part < 'b' part, and 0000 otherwise.
+ * For example __vcmpltu2(0x1234aba5, 0x1234aba6) returns 0x0000ffff.
+ * \return Returns 0xffff if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpltu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword (un)signed comparison: a != b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part != 'b' part, and 0000 otherwise.
+ * For example __vcmpne2(0x1234aba5, 0x1234aba6) returns 0x0000ffff.
+ * \return Returns 0xffff if a != b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpne2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-halfword absolute difference of unsigned integer: |a - b|.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes absolute difference. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsdiffu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed maximum computation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes signed maximum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmaxs2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned maximum computation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes unsigned maximum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmaxu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed minimum computation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes signed minimum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmins2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned minimum computation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes unsigned minimum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vminu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword (un)signed comparison: returns 1 if both parts compare equal.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part == 'b' part.
+ * If both equalities are satisfied, function returns 1.
+ * \return Returns 1 if a = b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vseteq2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison: returns 1 if both parts compare greater than or equal.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part >= 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetges2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison: returns 1 if both parts compare greater than or equal.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part >= 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgeu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison: returns 1 if both parts compare greater than.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part > 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgts2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison: returns 1 if both parts compare greater than.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part > 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgtu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison: returns 1 if both parts compare less than or equal.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part <= 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetles2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison: returns 1 if both parts compare less than or equal.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part <= 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetleu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison: returns 1 if both parts compare less than.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part < 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetlts2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison: returns 1 if both parts compare less than.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part < 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetltu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword (un)signed comparison: returns 1 if both parts compare not equal.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part != 'b' part.
+ * If both conditions are satisfied, function returns 1.
+ * \return Returns 1 if a != b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetne2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-halfword sum of abs diff of unsigned.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes absolute differences and returns
+ * sum of those differences.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsadu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword (un)signed subtraction, with wrap-around: a - b.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs subtraction. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsub2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword (un)signed subtraction, with signed saturation: a - b.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs subtraction with signed saturation.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsubss2 (unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword subtraction with unsigned saturation: a - b.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs subtraction with unsigned saturation.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsubus2 (unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-halfword negation.
+ *
+ * Splits 4 bytes of argument into 2 parts, each consisting of 2 bytes.
+ * For each part function computes negation. Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vneg2(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-halfword negation with signed saturation.
+ *
+ * Splits 4 bytes of argument into 2 parts, each consisting of 2 bytes.
+ * For each part function computes negation with signed saturation. Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vnegss2(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-halfword absolute difference of signed integer: |a - b|.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes absolute difference.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsdiffs2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword sum of absolute difference of signed.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes absolute differences and sums them up.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsads2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte absolute value: |a|.
+ *
+ * Splits argument by bytes. Computes absolute value of each byte.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabs4(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte absolute value with signed saturation: |a|.
+ *
+ * Splits 4 bytes of argument into 4 parts, each consisting of 1 byte,
+ * then computes absolute value with signed saturation for each of parts.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsss4(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte (un)signed addition: a + b.
+ *
+ * Splits 'a' into 4 bytes, then performs unsigned addition on each of these
+ * bytes with the corresponding byte from 'b', ignoring overflow.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vadd4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte addition with signed saturation: a + b.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte,
+ * then performs addition with signed saturation on corresponding parts.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vaddss4 (unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte addition with unsigned saturation: a + b.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte,
+ * then performs addition with unsigned saturation on corresponding parts.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vaddus4 (unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte signed rounded average.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte,
+ * then computes signed rounded average of corresponding parts. Partial results are
+ * recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vavgs4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned rounded average.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte,
+ * then computes unsigned rounded average of corresponding parts. Partial results are
+ * recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vavgu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte unsigned average.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte,
+ * then computes unsigned average of corresponding parts. Partial results are
+ * recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vhaddu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte (un)signed comparison: a == b ? 0xff : 0.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if they are equal, and 00 otherwise.
+ * For example __vcmpeq4(0x1234aba5, 0x1234aba6) returns 0xffffff00.
+ * \return Returns 0xff if a = b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpeq4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison: a >= b ? 0xff : 0.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part >= 'b' part, and 00 otherwise.
+ * For example __vcmpges4(0x1234aba5, 0x1234aba6) returns 0xffffff00.
+ * \return Returns 0xff if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpges4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison: a >= b ? 0xff : 0.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part >= 'b' part, and 00 otherwise.
+ * For example __vcmpgeu4(0x1234aba5, 0x1234aba6) returns 0xffffff00.
+ * \return Returns 0xff if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgeu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison: a > b ? 0xff : 0.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part > 'b' part, and 00 otherwise.
+ * For example __vcmpgts4(0x1234aba5, 0x1234aba6) returns 0x00000000.
+ * \return Returns 0xff if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgts4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison: a > b ? 0xff : 0.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part > 'b' part, and 00 otherwise.
+ * For example __vcmpgtu4(0x1234aba5, 0x1234aba6) returns 0x00000000.
+ * \return Returns 0xff if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgtu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison: a <= b ? 0xff : 0.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part <= 'b' part, and 00 otherwise.
+ * For example __vcmples4(0x1234aba5, 0x1234aba6) returns 0xffffffff.
+ * \return Returns 0xff if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmples4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison: a <= b ? 0xff : 0.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part <= 'b' part, and 00 otherwise.
+ * For example __vcmpleu4(0x1234aba5, 0x1234aba6) returns 0xffffffff.
+ * \return Returns 0xff if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpleu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison: a < b ? 0xff : 0.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part < 'b' part, and 00 otherwise.
+ * For example __vcmplts4(0x1234aba5, 0x1234aba6) returns 0x000000ff.
+ * \return Returns 0xff if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmplts4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison: a < b ? 0xff : 0.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part < 'b' part, and 00 otherwise.
+ * For example __vcmpltu4(0x1234aba5, 0x1234aba6) returns 0x000000ff.
+ * \return Returns 0xff if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpltu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte (un)signed comparison: a != b ? 0xff : 0.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part != 'b' part, and 00 otherwise.
+ * For example __vcmpne4(0x1234aba5, 0x1234aba6) returns 0x000000ff.
+ * \return Returns 0xff if a != b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpne4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte absolute difference of unsigned integer: |a - b|.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes absolute difference. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsdiffu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte signed maximum.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes signed maximum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmaxs4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte unsigned maximum.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes unsigned maximum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmaxu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte signed minimum.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes signed minimum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmins4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte unsigned minimum.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes unsigned minimum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vminu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte (un)signed comparison: returns 1 if all 4 pairs compare equal.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part == 'b' part.
+ * If all four equalities are satisfied, function returns 1.
+ * \return Returns 1 if a = b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vseteq4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison: returns 1 if all 4 pairs compare less than or equal.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part <= 'b' part.
+ * If all four inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetles4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison: returns 1 if all 4 pairs compare less than or equal.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part <= 'b' part.
+ * If all four inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetleu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison: returns 1 if all 4 pairs compare less than.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part < 'b' part.
+ * If all four inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetlts4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison: returns 1 if all 4 pairs compare less than.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part < 'b' part.
+ * If all four inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetltu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison: returns 1 if all 4 pairs compare greater than or equal.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part >= 'b' part.
+ * If all four inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetges4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison: returns 1 if all 4 pairs compare greater than or equal.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part >= 'b' part.
+ * If all four inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgeu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison: returns 1 if all 4 pairs compare greater than.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part > 'b' part.
+ * If all four inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgts4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison: returns 1 if all 4 pairs compare greater than.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part > 'b' part.
+ * If all four inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgtu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte (un)signed comparison: returns 1 if all 4 pairs compare not equal.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part != 'b' part.
+ * If all four conditions are satisfied, function returns 1.
+ * \return Returns 1 if a != b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetne4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte sum of abs difference of unsigned.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes absolute differences and returns
+ * sum of those differences.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsadu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte subtraction: a - b.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs subtraction. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsub4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte subtraction with signed saturation: a - b.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs subtraction with signed saturation.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsubss4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte subtraction with unsigned saturation: a - b.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs subtraction with unsigned saturation.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsubus4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte negation.
+ *
+ * Splits 4 bytes of argument into 4 parts, each consisting of 1 byte.
+ * For each part function computes negation. Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vneg4(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte negation with signed saturation.
+ *
+ * Splits 4 bytes of argument into 4 parts, each consisting of 1 byte.
+ * For each part function computes negation with signed saturation. Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vnegss4(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte absolute difference of signed integer: |a - b|.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes absolute difference.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsdiffs4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte sum of abs difference of signed.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes absolute differences and sums them up.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsads4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(max(a, b), 0)
+ *
+ * Calculates the maximum of \p a and \p b of two signed ints, if this is less than \p 0 then \p 0 is returned.
+ * \return Returns computed value.
+ */
+
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimax_s32_relu(const int a, const int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(max(a, b), 0)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a max with relu ( = max(a_part, b_part, 0) ). Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimax_s16x2_relu(const unsigned int a, const unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(min(a, b), 0)
+ *
+ * Calculates the minimum of \p a and \p b of two signed ints, if this is less than \p 0 then \p 0 is returned.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __vimin_s32_relu(const int a, const int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(min(a, b), 0)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a min with relu ( = max(min(a_part, b_part), 0) ). Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimin_s16x2_relu(const unsigned int a, const unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(max(a, b), c)
+ * 
+ * Calculates the 3-way max of signed integers \p a, \p b and \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __vimax3_s32(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(max(a, b), c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a 3-way max ( = max(max(a_part, b_part), c_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimax3_s16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(max(a, b), c)
+ * 
+ * Calculates the 3-way max of unsigned integers \p a, \p b and \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimax3_u32(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(max(a, b), c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as unsigned shorts.
+ * For corresponding parts function performs a 3-way max ( = max(max(a_part, b_part), c_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimax3_u16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes min(min(a, b), c)
+ * 
+ * Calculates the 3-way min of signed integers \p a, \p b and \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __vimin3_s32(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword min(min(a, b), c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a 3-way min ( = min(min(a_part, b_part), c_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimin3_s16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes min(min(a, b), c)
+ * 
+ * Calculates the 3-way min of unsigned integers \p a, \p b and \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimin3_u32(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword min(min(a, b), c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as unsigned shorts.
+ * For corresponding parts function performs a 3-way min ( = min(min(a_part, b_part), c_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimin3_u16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(max(max(a, b), c), 0)
+ *
+ * Calculates the maximum of three signed ints, if this is less than \p 0 then \p 0 is returned.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __vimax3_s32_relu(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(max(max(a, b), c), 0)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a three-way max with relu ( = max(a_part, b_part, c_part, 0) ).
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimax3_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(min(min(a, b), c), 0)
+ *
+ * Calculates the minimum of three signed ints, if this is less than \p 0 then \p 0 is returned.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __vimin3_s32_relu(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(min(min(a, b), c), 0)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a three-way min with relu ( = max(min(a_part, b_part, c_part), 0) ).
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimin3_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(a + b, c)
+ *
+ * Calculates the sum of signed integers \p a and \p b and takes the max with \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __viaddmax_s32(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(a + b, c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs an add and compare: max(a_part + b_part, c_part)
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmax_s16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(a + b, c)
+ *
+ * Calculates the sum of unsigned integers \p a and \p b and takes the max with \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmax_u32(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(a + b, c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as unsigned shorts.
+ * For corresponding parts function performs an add and compare: max(a_part + b_part, c_part)
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmax_u16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes min(a + b, c)
+ *
+ * Calculates the sum of signed integers \p a and \p b and takes the min with \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __viaddmin_s32(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword min(a + b, c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs an add and compare: min(a_part + b_part, c_part)
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmin_s16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes min(a + b, c)
+ *
+ * Calculates the sum of unsigned integers \p a and \p b and takes the min with \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmin_u32(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword min(a + b, c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as unsigned shorts.
+ * For corresponding parts function performs an add and compare: min(a_part + b_part, c_part)
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmin_u16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(max(a + b, c), 0)
+ *
+ * Calculates the sum of signed integers \p a and \p b and takes the max with \p c.
+ * If the result is less than \p 0 then \p 0 is returned.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __viaddmax_s32_relu(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(max(a + b, c), 0)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs an add, followed by a max with relu: max(max(a_part + b_part, c_part), 0)
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmax_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(min(a + b, c), 0)
+ *
+ * Calculates the sum of signed integers \p a and \p b and takes the min with \p c.
+ * If the result is less than \p 0 then \p 0 is returned.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __viaddmin_s32_relu(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(min(a + b, c), 0)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs an add, followed by a min with relu: max(min(a_part + b_part, c_part), 0)
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmin_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(a, b), also sets the value pointed to by pred to (a >= b).
+ *
+ * Calculates the maximum of \p a and \p b of two signed ints. Also sets the value pointed to by \p pred to the value (a >= b).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __vibmax_s32(const int a, const int b, bool* const pred);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(a, b), also sets the value pointed to by pred to (a >= b).
+ *
+ * Calculates the maximum of \p a and \p b of two unsigned ints. Also sets the value pointed to by \p pred to the value (a >= b).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vibmax_u32(const unsigned int a, const unsigned int b, bool* const pred);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes min(a, b), also sets the value pointed to by pred to (a <= b).
+ *
+ * Calculates the minimum of \p a and \p b of two signed ints. Also sets the value pointed to by \p pred to the value (a <= b).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __vibmin_s32(const int a, const int b, bool* const pred);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes min(a, b), also sets the value pointed to by pred to (a <= b).
+ *
+ * Calculates the minimum of \p a and \p b of two unsigned ints. Also sets the value pointed to by \p pred to the value (a <= b).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vibmin_u32(const unsigned int a, const unsigned int b, bool* const pred);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(a, b), also sets the value pointed to by pred_hi and pred_lo to the per-halfword result of (a >= b).
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a maximum ( = max(a_part, b_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * Sets the value pointed to by \p pred_hi to the value (a_high_part >= b_high_part).
+ * Sets the value pointed to by \p pred_lo to the value (a_low_part >= b_low_part).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vibmax_s16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(a, b), also sets the value pointed to by pred_hi and pred_lo to the per-halfword result of (a >= b).
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as unsigned shorts.
+ * For corresponding parts function performs a maximum ( = max(a_part, b_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * Sets the value pointed to by \p pred_hi to the value (a_high_part >= b_high_part).
+ * Sets the value pointed to by \p pred_lo to the value (a_low_part >= b_low_part).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vibmax_u16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword min(a, b), also sets the value pointed to by pred_hi and pred_lo to the per-halfword result of (a <= b).
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a minimum ( = min(a_part, b_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * Sets the value pointed to by \p pred_hi to the value (a_high_part <= b_high_part).
+ * Sets the value pointed to by \p pred_lo to the value (a_low_part <= b_low_part).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vibmin_s16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword min(a, b), also sets the value pointed to by pred_hi and pred_lo to the per-halfword result of (a <= b).
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as unsigned shorts.
+ * For corresponding parts function performs a minimum ( = min(a_part, b_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * Sets the value pointed to by \p pred_hi to the value (a_high_part <= b_high_part).
+ * Sets the value pointed to by \p pred_lo to the value (a_low_part <= b_low_part).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vibmin_u16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo);
+
+/*******************************************************************************
+ *                                                                             *
+ *                            END SIMD functions                               *
+ *                                                                             *
+ *******************************************************************************/
+} //extern "c"
+#undef EXCLUDE_FROM_RTC
+
+#undef __DEVICE_FUNCTIONS_DECL__
+#undef __DEVICE_FUNCTIONS_STATIC_DECL__
+#undef __DEVICE_HOST_FUNCTIONS_STATIC_DECL__
+
+#endif /* __cplusplus && __CUDACC__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined(__CUDACC_RTC__)
+#include "device_functions.hpp"
+#endif /* !defined(__CUDACC_RTC__) */
+
+#include "device_atomic_functions.h"
+#include "device_double_functions.h"
+#include "sm_20_atomic_functions.h"
+#include "sm_32_atomic_functions.h"
+#include "sm_35_atomic_functions.h"
+#include "sm_60_atomic_functions.h"
+#include "sm_20_intrinsics.h"
+#include "sm_30_intrinsics.h"
+#include "sm_32_intrinsics.h"
+#include "sm_35_intrinsics.h"
+#include "sm_61_intrinsics.h"
+#include "sm_70_rt.h"
+#include "sm_80_rt.h"
+#include "sm_90_rt.h"
+#include "sm_100_rt.h"
+#ifndef __CUDACC_RTC_MINIMAL__
+#include "texture_indirect_functions.h"
+#include "surface_indirect_functions.h"
+#endif  /* !__CUDACC_RTC_MINIMAL__ */
+#include "cudacc_ext.h"
+
+#ifdef __CUDACC__
+extern "C" __host__ __device__  unsigned CUDARTAPI __cudaPushCallConfiguration(dim3 gridDim,
+                                      dim3 blockDim, 
+                                      size_t sharedMem = 0, 
+                                      struct CUstream_st *stream = 0);
+
+#if !defined(__CUDACC_RTC__) &&!defined(__NV_LEGACY_LAUNCH)
+extern "C" cudaError_t CUDARTAPI __cudaGetKernel(cudaKernel_t *, const void *);
+
+extern "C"  cudaError_t CUDARTAPI __cudaLaunchKernel(
+        cudaKernel_t kernel,
+        dim3 gridDim,
+        dim3 blockDim,
+        void **args,
+        size_t sharedMem,
+        cudaStream_t stream
+);
+
+extern "C" cudaError_t CUDARTAPI __cudaLaunchKernel_ptsz(
+        cudaKernel_t kernel,
+        dim3 gridDim,
+        dim3 blockDim,
+        void **args,
+        size_t sharedMem,
+        cudaStream_t stream
+);
+
+// Referenced from compiler-generated kernel launch code. Forwards all six
+// arguments unchanged to __cudaLaunchKernel_ptsz when
+// __CUDART_API_PER_THREAD_DEFAULT_STREAM is defined, else to __cudaLaunchKernel.
+static inline cudaError_t __cudaLaunchKernel_helper(
+        cudaKernel_t kernel,
+        dim3 gridDim,
+        dim3 blockDim,
+        void **args,
+        size_t sharedMem,
+        cudaStream_t stream)
+{
+#if !defined(__CUDART_API_PER_THREAD_DEFAULT_STREAM)
+  return __cudaLaunchKernel(kernel, gridDim, blockDim, args, sharedMem, stream);
+#else  /* __CUDART_API_PER_THREAD_DEFAULT_STREAM */
+  return __cudaLaunchKernel_ptsz(kernel, gridDim, blockDim, args, sharedMem, stream);
+#endif  /* !__CUDART_API_PER_THREAD_DEFAULT_STREAM */
+}
+#endif  /* !defined(__CUDACC_RTC__) && !defined(__NV_LEGACY_LAUNCH) */
+
+enum {                  /* unnamed enum: implicit values 0..5 in declaration order */
+  __NV_ATOMIC_RELAXED,  /* 0 */
+  __NV_ATOMIC_CONSUME,  /* 1 */
+  __NV_ATOMIC_ACQUIRE,  /* 2 */
+  __NV_ATOMIC_RELEASE,  /* 3 */
+  __NV_ATOMIC_ACQ_REL,  /* 4 */
+  __NV_ATOMIC_SEQ_CST   /* 5; NOTE(review): names suggest memory-order selectors -- confirm against users */
+};
+
+enum {                        /* unnamed enum: implicit values 0..4 in declaration order */
+  __NV_THREAD_SCOPE_THREAD,   /* 0 */
+  __NV_THREAD_SCOPE_BLOCK,    /* 1 */
+  __NV_THREAD_SCOPE_CLUSTER,  /* 2 */
+  __NV_THREAD_SCOPE_DEVICE,   /* 3 */
+  __NV_THREAD_SCOPE_SYSTEM    /* 4; NOTE(review): names suggest thread-scope selectors -- confirm against users */
+};
+
+#endif  /* __CUDACC__ */
+
+#endif /* !__DEVICE_FUNCTIONS_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_functions.hpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..429b2298a8fdd95338c132996b1d9dca74130193
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_functions.hpp
@@ -0,0 +1,1163 @@
+/*
+ * Copyright 1993-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/device_functions.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/device_functions.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_HPP__
+#endif
+
+#if !defined(__DEVICE_FUNCTIONS_HPP__)
+#define __DEVICE_FUNCTIONS_HPP__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if defined(__CUDACC_RTC__)
+#define __DEVICE_FUNCTIONS_DECL__ __device__
+#define __DEVICE_FUNCTIONS_STATIC_DECL__ __device__
+#define __DEVICE_HOST_FUNCTIONS_STATIC_DECL__ __device__ __host__ __cudart_builtin__
+#else
+#define __DEVICE_FUNCTIONS_DECL__ __device__
+#define __DEVICE_FUNCTIONS_STATIC_DECL__ static __inline__ __device__
+#define __DEVICE_HOST_FUNCTIONS_STATIC_DECL__ static __inline__ __device__ __host__ __cudart_builtin__
+#endif /* __CUDACC_RTC__ */
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+#undef __DEVICE_FUNCTIONS_DECL__
+#undef __DEVICE_FUNCTIONS_STATIC_DECL__
+
+#endif /* __cplusplus && __CUDACC__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#ifdef __CUDACC__
+# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+#define __CUDA_AND_AT_LEAST_SM_90__
+#endif /* defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) */
+# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)
+#define __CUDA_AND_AT_LEAST_SM_70__
+#endif /* defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700) */
+# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)
+#define __CUDA_AND_AT_LEAST_SM_75__
+#endif /* defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) */
+#endif /* __CUDACC__ */
+
+/* C++ header for std::memcpy (used for type punning in host-side implementations).
+ * When compiling as a CUDA source file memcpy is provided implicitly.
+ * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
+ */
+#if defined(__cplusplus) && !defined(__CUDACC__)
+#include <cstring>
+#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
+
+// Returns the bits of x reinterpreted as a short (memcpy-based type pun).
+static __host__ __device__ short __internal_cast_u2s(unsigned short x) {
+  short bits;
+#if !defined(__CUDACC__)
+  (void)std::memcpy(&bits, &x, sizeof(x));
+#else
+  (void)memcpy(&bits, &x, sizeof(x));
+#endif
+  return bits;
+}
+
+// max(max(a, b), 0) on signed 32-bit ints: one PTX max.s32.relu when
+// __CUDA_AND_AT_LEAST_SM_90__ is defined, plain C++ otherwise.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimax_s32_relu(const int a, const int b){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  int clamped;
+  asm("{max.s32.relu %0, %1, %2;}" : "=r"(clamped) : "r"(a), "r"(b));
+  return clamped;
+#else
+  const int larger = max(a, b);
+  return (larger <= 0) ? 0 : larger;   // relu: never return a negative value
+#endif
+}
+
+/*
+ * Per-halfword max(max(a, b), 0).
+ *
+ * Each argument is treated as two signed 16-bit lanes (bits 0-15 and bits
+ * 16-31). Every lane result is clamped at zero and written back to the same
+ * bit positions of the returned unsigned int.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax_s16x2_relu(const unsigned int a, const unsigned int b){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  // One PTX instruction does the whole operation.
+  unsigned int packed;
+  asm("{max.s16x2.relu %0, %1, %2;}" : "=r"(packed) : "r"(a), "r"(b));
+  return packed;
+#elif defined(__CUDA_ARCH__)
+  // Any other build with __CUDA_ARCH__ defined: two SIMD max intrinsics.
+  return __vmaxs2(__vmaxs2(a, b), 0U);
+#else
+  // __CUDA_ARCH__ is not defined here: scalar code, one lane at a time.
+  unsigned short a_lo_bits = (unsigned short)(a & 0xFFFFU);
+  unsigned short a_hi_bits = (unsigned short)(a >> 16);
+  unsigned short b_lo_bits = (unsigned short)(b & 0xFFFFU);
+  unsigned short b_hi_bits = (unsigned short)(b >> 16);
+
+  // Reinterpret every half as a signed short.
+  short a_lo = __internal_cast_u2s(a_lo_bits);
+  short a_hi = __internal_cast_u2s(a_hi_bits);
+  short b_lo = __internal_cast_u2s(b_lo_bits);
+  short b_hi = __internal_cast_u2s(b_hi_bits);
+
+  // Lane-wise maximum.
+  int lane_lo = max(a_lo, b_lo);
+  int lane_hi = max(a_hi, b_hi);
+
+  // relu: a negative lane becomes 0, so each lane now fits in 16 bits.
+  lane_lo = (lane_lo < 0) ? 0 : lane_lo;
+  lane_hi = (lane_hi < 0) ? 0 : lane_hi;
+
+  return (unsigned)lane_lo | ((unsigned)lane_hi << 16);
+#endif
+}
+
+// max(min(a, b), 0) on signed 32-bit ints: one PTX min.s32.relu when
+// __CUDA_AND_AT_LEAST_SM_90__ is defined, plain C++ otherwise.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimin_s32_relu(const int a, const int b){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  int clamped;
+  asm("{min.s32.relu %0, %1, %2;}" : "=r"(clamped) : "r"(a), "r"(b));
+  return clamped;
+#else
+  const int smaller = min(a, b);
+  return (smaller <= 0) ? 0 : smaller;   // relu: never return a negative value
+#endif
+}
+
+/*
+ * Per-halfword max(min(a, b), 0).
+ *
+ * Each argument is treated as two signed 16-bit lanes (bits 0-15 and bits
+ * 16-31). Every lane result is clamped at zero and written back to the same
+ * bit positions of the returned unsigned int.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin_s16x2_relu(const unsigned int a, const unsigned int b){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  // One PTX instruction does the whole operation.
+  unsigned int packed;
+  asm("{min.s16x2.relu %0, %1, %2;}" : "=r"(packed) : "r"(a), "r"(b));
+  return packed;
+#elif defined(__CUDA_ARCH__)
+  // Any other build with __CUDA_ARCH__ defined: SIMD min, then SIMD max with 0.
+  return __vmaxs2(__vmins2(a, b), 0U);
+#else
+  // __CUDA_ARCH__ is not defined here: scalar code, one lane at a time.
+  unsigned short a_lo_bits = (unsigned short)(a & 0xFFFFU);
+  unsigned short a_hi_bits = (unsigned short)(a >> 16);
+  unsigned short b_lo_bits = (unsigned short)(b & 0xFFFFU);
+  unsigned short b_hi_bits = (unsigned short)(b >> 16);
+
+  // Reinterpret every half as a signed short.
+  short a_lo = __internal_cast_u2s(a_lo_bits);
+  short a_hi = __internal_cast_u2s(a_hi_bits);
+  short b_lo = __internal_cast_u2s(b_lo_bits);
+  short b_hi = __internal_cast_u2s(b_hi_bits);
+
+  // Lane-wise minimum.
+  int lane_lo = min(a_lo, b_lo);
+  int lane_hi = min(a_hi, b_hi);
+
+  // relu: a negative lane becomes 0, so each lane now fits in 16 bits.
+  lane_lo = (lane_lo < 0) ? 0 : lane_lo;
+  lane_hi = (lane_hi < 0) ? 0 : lane_hi;
+
+  return (unsigned)lane_lo | ((unsigned)lane_hi << 16);
+#endif
+}
+
+// Three-way maximum of signed 32-bit ints: max(max(a, b), c).
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimax3_s32(const int a, const int b, const int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  int largest;
+  asm ("{.reg .s32 t1; \n\t"
+      "max.s32 t1, %1, %2; \n\t"
+      "max.s32 %0, t1, %3;}\n\t" : "=r"(largest) : "r"(a), "r"(b), "r"(c));
+  return largest;
+#else
+  const int ab = max(a, b);   // taken when __CUDA_AND_AT_LEAST_SM_90__ is not defined
+  return max(ab, c);
+#endif
+}
+
+/*
+ * Per-halfword three-way maximum: max(max(a, b), c).
+ *
+ * Each argument is treated as two signed 16-bit lanes (bits 0-15 and bits
+ * 16-31); the lane results are packed back into the returned unsigned int.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax3_s16x2(const unsigned int a, const unsigned int b, const unsigned int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  // Two chained PTX max.s16x2 instructions through a scratch register.
+  unsigned int packed;
+  asm ("{.reg .b32 t1; \n\t"
+      "max.s16x2 t1, %1, %2; \n\t"
+      "max.s16x2 %0, t1, %3;}\n\t"
+      : "=r"(packed) : "r"(a), "r"(b), "r"(c));
+  return packed;
+#elif defined(__CUDA_AND_AT_LEAST_SM_70__)
+  return __vmaxs2(__vmaxs2(a, b), c);
+#else
+  // Neither macro above is defined: scalar code, one lane at a time.
+  unsigned short a_lo_bits = (unsigned short)(a & 0xFFFFU);
+  unsigned short a_hi_bits = (unsigned short)(a >> 16);
+  unsigned short b_lo_bits = (unsigned short)(b & 0xFFFFU);
+  unsigned short b_hi_bits = (unsigned short)(b >> 16);
+  unsigned short c_lo_bits = (unsigned short)(c & 0xFFFFU);
+  unsigned short c_hi_bits = (unsigned short)(c >> 16);
+
+  // Reinterpret every half as a signed short.
+  short a_lo = __internal_cast_u2s(a_lo_bits);
+  short a_hi = __internal_cast_u2s(a_hi_bits);
+  short b_lo = __internal_cast_u2s(b_lo_bits);
+  short b_hi = __internal_cast_u2s(b_hi_bits);
+  short c_lo = __internal_cast_u2s(c_lo_bits);
+  short c_hi = __internal_cast_u2s(c_hi_bits);
+
+  // The cast to unsigned int sign-extends a negative lane, which is why the
+  // low lane is masked before the two lanes are merged.
+  unsigned int lane_lo = (unsigned int)max(max(a_lo, b_lo), c_lo);
+  unsigned int lane_hi = (unsigned int)max(max(a_hi, b_hi), c_hi);
+  return (lane_lo & 0x0000FFFFU) | (lane_hi << 16);
+#endif
+}
+
+// Three-way maximum of unsigned 32-bit ints: returns max(max(a, b), c).
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax3_u32(const unsigned int a, const unsigned int b, const unsigned int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  unsigned int res;  // unsigned, matching the .u32 operation and the return type
+  asm ("{.reg .u32 t1; \n\t"
+      "max.u32 t1, %1, %2; \n\t"
+      "max.u32 %0, t1, %3;}\n\t"
+      : "=r"(res) : "r"(a), "r"(b), "r"(c));
+  return res;
+#else
+  return max(max(a, b), c);  // taken when __CUDA_AND_AT_LEAST_SM_90__ is not defined
+#endif
+}
+
+/*
+ * Per-halfword three-way maximum: max(max(a, b), c).
+ *
+ * Each argument is treated as two unsigned 16-bit lanes (bits 0-15 and bits
+ * 16-31); the lane results are packed back into the returned unsigned int.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax3_u16x2(const unsigned int a, const unsigned int b, const unsigned int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  unsigned int packed;
+  asm ("{.reg .b32 t1; \n\t"
+      "max.u16x2 t1, %1, %2; \n\t"
+      "max.u16x2 %0, t1, %3;}\n\t"
+      : "=r"(packed) : "r"(a), "r"(b), "r"(c));
+  return packed;
+#elif defined(__CUDA_ARCH__)
+  return __vmaxu2(__vmaxu2(a, b), c);
+#else
+  // __CUDA_ARCH__ is not defined here: scalar code, one lane at a time.
+  unsigned short a_lo = (unsigned short)(a & 0xFFFFU);
+  unsigned short a_hi = (unsigned short)(a >> 16);
+  unsigned short b_lo = (unsigned short)(b & 0xFFFFU);
+  unsigned short b_hi = (unsigned short)(b >> 16);
+  unsigned short c_lo = (unsigned short)(c & 0xFFFFU);
+  unsigned short c_hi = (unsigned short)(c >> 16);
+
+  unsigned short lane_lo = (unsigned short)max(max(a_lo, b_lo), c_lo);
+  unsigned short lane_hi = (unsigned short)max(max(a_hi, b_hi), c_hi);
+
+  return ((unsigned int) lane_lo) | (((unsigned int) lane_hi) << 16);
+#endif
+}
+
+// Signed 32-bit minimum of three values: returns min(a, b, c).
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimin3_s32(const int a, const int b, const int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  int smallest;
+  asm ("{.reg .s32 t1; \n\t"
+      "min.s32 t1, %1, %2; \n\t"
+      "min.s32 %0, t1, %3;}\n\t"
+      : "=r"(smallest) : "r"(a), "r"(b), "r"(c));
+  return smallest;
+#else
+  // Host and older architecture code
+  const int ab = min(a, b);
+  return min(ab, c);
+#endif
+}
+
+// Per-halfword signed minimum of three packed operands.
+// Every argument carries two signed 16-bit values (bits 0-15 and bits 16-31);
+// the result carries min(a, b, c) evaluated independently for each half.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin3_s16x2(const unsigned int a, const unsigned int b, const unsigned int c){
+  unsigned int res;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm ("{.reg .b32 t1; \n\t"
+      "min.s16x2 t1, %1, %2; \n\t"
+      "min.s16x2 %0, t1, %3;}\n\t"
+      : "=r"(res) : "r"(a), "r"(b), "r"(c));
+#elif defined(__CUDA_AND_AT_LEAST_SM_70__)
+  // Chain two of the two-operand packed intrinsics.
+  const unsigned int ab = __vmins2(a, b);
+  res = __vmins2(ab, c);
+#else
+  // Host and older architecture code: emulate one half at a time.
+  // Reinterpret the low and high 16 bits of each operand as signed shorts.
+  const short lo_a = __internal_cast_u2s((unsigned short)(a & 0xFFFFU));
+  const short lo_b = __internal_cast_u2s((unsigned short)(b & 0xFFFFU));
+  const short lo_c = __internal_cast_u2s((unsigned short)(c & 0xFFFFU));
+
+  const short hi_a = __internal_cast_u2s((unsigned short)(a >> 16));
+  const short hi_b = __internal_cast_u2s((unsigned short)(b >> 16));
+  const short hi_c = __internal_cast_u2s((unsigned short)(c >> 16));
+
+  // A negative result sign-extends when widened to unsigned int; the mask
+  // (low half) and the shift (high half) discard the extension bits.
+  const unsigned int packed_lo = (unsigned int)min(min(lo_a, lo_b), lo_c);
+  const unsigned int packed_hi = (unsigned int)min(min(hi_a, hi_b), hi_c);
+
+  res = (packed_hi << 16) | (packed_lo & 0x0000FFFFU);
+#endif
+
+  return res;
+}
+
+// Unsigned 32-bit minimum of three values: returns min(a, b, c).
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin3_u32(const unsigned int a, const unsigned int b, const unsigned int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  // The destination is unsigned so that it matches both the min.u32
+  // instruction and the function's return type (no signed round trip).
+  unsigned int res;
+  asm ("{.reg .u32 t1; \n\t"
+      "min.u32 t1, %1, %2; \n\t"
+      "min.u32 %0, t1, %3;}\n\t"
+      : "=r"(res) : "r"(a), "r"(b), "r"(c));
+  return res;
+#else
+  // Host and older architecture code
+  return min(min(a, b), c);
+#endif
+}
+
+// Per-halfword unsigned minimum of three packed operands.
+// Every argument carries two unsigned 16-bit values (bits 0-15 and bits
+// 16-31); the result carries min(a, b, c) for each half independently.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin3_u16x2(const unsigned int a, const unsigned int b, const unsigned int c){
+  unsigned int res;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm ("{.reg .b32 t1; \n\t"
+      "min.u16x2 t1, %1, %2; \n\t"
+      "min.u16x2 %0, t1, %3;}\n\t"
+      : "=r"(res) : "r"(a), "r"(b), "r"(c));
+#elif defined(__CUDA_ARCH__)
+  // Chain two of the two-operand packed intrinsics.
+  const unsigned int ab = __vminu2(a, b);
+  res = __vminu2(ab, c);
+#else
+  // Host and older architecture code: emulate one half at a time.
+  const unsigned short lo_a = (unsigned short)(a & 0xFFFFU);
+  const unsigned short lo_b = (unsigned short)(b & 0xFFFFU);
+  const unsigned short lo_c = (unsigned short)(c & 0xFFFFU);
+
+  const unsigned short hi_a = (unsigned short)(a >> 16);
+  const unsigned short hi_b = (unsigned short)(b >> 16);
+  const unsigned short hi_c = (unsigned short)(c >> 16);
+
+  const unsigned short lo_min = (unsigned short)min(min(lo_a, lo_b), lo_c);
+  const unsigned short hi_min = (unsigned short)min(min(hi_a, hi_b), hi_c);
+
+  // Reassemble: high half in bits 16-31, low half in bits 0-15.
+  res = (((unsigned int) hi_min) << 16) | ((unsigned int) lo_min);
+#endif
+
+  return res;
+}
+
+// Signed 32-bit maximum of three values, clamped below at zero:
+// returns max(a, b, c, 0).
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimax3_s32_relu(const int a, const int b, const int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  int clamped;
+  asm ("{.reg .s32 t1; \n\t"
+      "max.s32.relu t1, %1, %2; \n\t"
+      "max.s32.relu %0, t1, %3;}\n\t"
+      : "=r"(clamped) : "r"(a), "r"(b), "r"(c));
+  return clamped;
+#else
+  // Host and older architecture code
+  const int largest = max(max(a, b), c);
+
+  // ReLU step: anything not strictly positive becomes zero.
+  if (largest <= 0) {
+    return 0;
+  }
+  return largest;
+#endif
+}
+
+// Per-halfword signed maximum of three packed operands, clamped below at
+// zero. Each half of the result is max(a, b, c, 0) of the corresponding
+// signed 16-bit halves of the arguments.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax3_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c){
+  unsigned int res;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm ("{.reg .b32 t1; \n\t"
+      "max.s16x2.relu t1, %1, %2; \n\t"
+      "max.s16x2.relu %0, t1, %3;}\n\t"
+      : "=r"(res) : "r"(a), "r"(b), "r"(c));
+#elif defined(__CUDA_AND_AT_LEAST_SM_75__)
+  // Plain packed max first, then the clamping two-operand max.
+  const unsigned int ab = __vmaxs2(a, b);
+  res = __vimax_s16x2_relu(ab, c);
+#else
+  // Host and older architecture code: emulate one half at a time.
+  // Reinterpret the low and high 16 bits of each operand as signed shorts.
+  const short lo_a = __internal_cast_u2s((unsigned short)(a & 0xFFFFU));
+  const short lo_b = __internal_cast_u2s((unsigned short)(b & 0xFFFFU));
+  const short lo_c = __internal_cast_u2s((unsigned short)(c & 0xFFFFU));
+
+  const short hi_a = __internal_cast_u2s((unsigned short)(a >> 16));
+  const short hi_b = __internal_cast_u2s((unsigned short)(b >> 16));
+  const short hi_c = __internal_cast_u2s((unsigned short)(c >> 16));
+
+  // The clamp makes each half non-negative, so it fits in 16 bits and the
+  // low half needs no masking before packing.
+  const unsigned packed_lo = (unsigned)max(0, max(max(lo_a, lo_b), lo_c));
+  const unsigned packed_hi = (unsigned)max(0, max(max(hi_a, hi_b), hi_c));
+
+  res = (packed_hi << 16) | packed_lo;
+#endif
+
+  return res;
+}
+
+// Signed 32-bit minimum of three values, clamped below at zero:
+// returns max(min(a, b, c), 0).
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimin3_s32_relu(const int a, const int b, const int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  int clamped;
+  asm ("{.reg .s32 t1; \n\t"
+      "min.s32.relu t1, %1, %2; \n\t"
+      "min.s32.relu %0, t1, %3;}\n\t"
+      : "=r"(clamped) : "r"(a), "r"(b), "r"(c));
+  return clamped;
+#else
+  // Host and older architecture code
+  const int smallest = min(min(a, b), c);
+
+  // ReLU step: anything not strictly positive becomes zero.
+  if (smallest <= 0) {
+    return 0;
+  }
+  return smallest;
+#endif
+}
+
+// Per-halfword signed minimum of three packed operands, clamped below at
+// zero. Each half of the result is max(min(a, b, c), 0) of the corresponding
+// signed 16-bit halves of the arguments.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin3_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c){
+  unsigned res;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm ("{.reg .b32 t1; \n\t"
+      "min.s16x2.relu t1, %1, %2; \n\t"
+      "min.s16x2.relu %0, t1, %3;}\n\t"
+      : "=r"(res) : "r"(a), "r"(b), "r"(c));
+#elif defined(__CUDA_AND_AT_LEAST_SM_75__)
+  // Plain packed min first, then the clamping two-operand min.
+  const unsigned int ab = __vmins2(a, b);
+  res = __vimin_s16x2_relu(ab, c);
+#else
+  // Host and older architecture code: emulate one half at a time.
+  // Reinterpret the low and high 16 bits of each operand as signed shorts.
+  const short lo_a = __internal_cast_u2s((unsigned short)(a & 0xFFFFU));
+  const short lo_b = __internal_cast_u2s((unsigned short)(b & 0xFFFFU));
+  const short lo_c = __internal_cast_u2s((unsigned short)(c & 0xFFFFU));
+
+  const short hi_a = __internal_cast_u2s((unsigned short)(a >> 16));
+  const short hi_b = __internal_cast_u2s((unsigned short)(b >> 16));
+  const short hi_c = __internal_cast_u2s((unsigned short)(c >> 16));
+
+  // The clamp makes each half non-negative, so it fits in 16 bits and the
+  // low half needs no masking before packing.
+  const unsigned packed_lo = (unsigned)max(0, min(min(lo_a, lo_b), lo_c));
+  const unsigned packed_hi = (unsigned)max(0, min(min(hi_a, hi_b), hi_c));
+
+  res = (packed_hi << 16) | packed_lo;
+
+#endif
+
+  return res;
+}
+
+// Signed 32-bit fused add-then-max: returns max(a + b, c).
+// The sum wraps around on overflow, matching the add.s32 instruction used on
+// the SM_90 path.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __viaddmax_s32(const int a, const int b, const int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  int res;
+  asm ("{.reg .s32 t1; \n\t"
+      "add.s32 t1, %1, %2; \n\t"
+      "max.s32 %0, t1, %3;}\n\t"
+      : "=r"(res) : "r"(a), "r"(b), "r"(c));
+  return res;
+#else
+  // Host and older architecture code
+  // Add in unsigned arithmetic: a plain signed 'a + b' is undefined behavior
+  // on overflow, whereas the unsigned sum wraps like the device path. The
+  // conversion back to int is modular on the compilers CUDA supports.
+  const int sum = (int)((unsigned int)a + (unsigned int)b);
+  return max(sum, c);
+#endif
+}
+
+// Per-halfword signed add-then-max on packed operands.
+// Each half of the result is max(a + b, c) of the corresponding 16-bit
+// halves, with the sum wrapping to 16 bits before the signed comparison.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmax_s16x2(const unsigned int a, const unsigned int b, const unsigned int c){
+  unsigned int res;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm ("{.reg .b32 t1; \n\t"
+      "add.s16x2 t1, %1, %2; \n\t"
+      "max.s16x2 %0, t1, %3;}\n\t"
+      : "=r"(res) : "r"(a), "r"(b), "r"(c));
+#elif defined(__CUDA_ARCH__)
+  // Packed add followed by packed signed max.
+  const unsigned int sum = __vadd2(a, b);
+  res = __vmaxs2(sum, c);
+#else
+  // Host and older architecture code: emulate one half at a time.
+  const unsigned short lo_a = (unsigned short)(a & 0xFFFFU);
+  const unsigned short lo_b = (unsigned short)(b & 0xFFFFU);
+  const unsigned short hi_a = (unsigned short)(a >> 16);
+  const unsigned short hi_b = (unsigned short)(b >> 16);
+
+  // Sum as unsigned 16-bit (wraps), then reinterpret as signed.
+  const short lo_sum = __internal_cast_u2s((unsigned short)(lo_a + lo_b));
+  const short hi_sum = __internal_cast_u2s((unsigned short)(hi_a + hi_b));
+
+  const short lo_c = __internal_cast_u2s((unsigned short)(c & 0xFFFFU));
+  const short hi_c = __internal_cast_u2s((unsigned short)(c >> 16));
+
+  // A negative result sign-extends when widened; the mask (low half) and
+  // the shift (high half) discard the extension bits.
+  const unsigned packed_lo = (unsigned)max(lo_sum, lo_c);
+  const unsigned packed_hi = (unsigned)max(hi_sum, hi_c);
+
+  res = (packed_hi << 16) | (packed_lo & 0x0000FFFFU);
+#endif
+
+  return res;
+}
+
+// Unsigned 32-bit fused add-then-max: returns max(a + b, c), where the
+// unsigned sum wraps around on overflow.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmax_u32(const unsigned int a, const unsigned int b, const unsigned int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  unsigned int larger;
+  asm ("{.reg .u32 t1; \n\t"
+      "add.u32 t1, %1, %2; \n\t"
+      "max.u32 %0, t1, %3;}\n\t"
+      : "=r"(larger) : "r"(a), "r"(b), "r"(c));
+  return larger;
+#else
+  // Host and older architecture code
+  const unsigned int sum = a + b;
+  return max(sum, c);
+#endif
+}
+
+// Per-halfword unsigned add-then-max on packed operands.
+// Each half of the result is max(a + b, c) of the corresponding unsigned
+// 16-bit halves, with the sum wrapping to 16 bits.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmax_u16x2(const unsigned int a, const unsigned int b, const unsigned int c){
+  unsigned int res;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm ("{.reg .b32 t1; \n\t"
+      "add.u16x2 t1, %1, %2; \n\t"
+      "max.u16x2 %0, t1, %3;}\n\t"
+      : "=r"(res) : "r"(a), "r"(b), "r"(c));
+#elif defined(__CUDA_ARCH__)
+  // Packed add followed by packed unsigned max.
+  const unsigned int sum = __vadd2(a, b);
+  res = __vmaxu2(sum, c);
+#else
+  // Host and older architecture code: emulate one half at a time.
+  const unsigned short lo_a = (unsigned short)(a & 0xFFFFU);
+  const unsigned short lo_b = (unsigned short)(b & 0xFFFFU);
+  const unsigned short lo_c = (unsigned short)(c & 0xFFFFU);
+
+  const unsigned short hi_a = (unsigned short)(a >> 16);
+  const unsigned short hi_b = (unsigned short)(b >> 16);
+  const unsigned short hi_c = (unsigned short)(c >> 16);
+
+  // The cast truncates each sum to 16 bits before it is compared with c.
+  const unsigned short lo_out = (unsigned short)max((unsigned short)(lo_a + lo_b), lo_c);
+  const unsigned short hi_out = (unsigned short)max((unsigned short)(hi_a + hi_b), hi_c);
+
+  // Reassemble: high half in bits 16-31, low half in bits 0-15.
+  res = (((unsigned int) hi_out) << 16) | ((unsigned int) lo_out);
+#endif
+
+  return res;
+}
+
+// Signed 32-bit fused add-then-min: returns min(a + b, c).
+// The sum wraps around on overflow, matching the add.s32 instruction used on
+// the SM_90 path.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __viaddmin_s32(const int a, const int b, const int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  int res;
+  asm ("{.reg .s32 t1; \n\t"
+      "add.s32 t1, %1, %2; \n\t"
+      "min.s32 %0, t1, %3;}\n\t"
+      : "=r"(res) : "r"(a), "r"(b), "r"(c));
+  return res;
+#else
+  // Host and older architecture code
+  // Add in unsigned arithmetic: a plain signed 'a + b' is undefined behavior
+  // on overflow, whereas the unsigned sum wraps like the device path. The
+  // conversion back to int is modular on the compilers CUDA supports.
+  const int sum = (int)((unsigned int)a + (unsigned int)b);
+  return min(sum, c);
+#endif
+}
+
+// Per-halfword signed add-then-min on packed operands.
+// Each half of the result is min(a + b, c) of the corresponding 16-bit
+// halves, with the sum wrapping to 16 bits before the signed comparison.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmin_s16x2(const unsigned int a, const unsigned int b, const unsigned int c){
+  unsigned int res;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm ("{.reg .b32 t1; \n\t"
+      "add.s16x2 t1, %1, %2; \n\t"
+      "min.s16x2 %0, t1, %3;}\n\t"
+      : "=r"(res) : "r"(a), "r"(b), "r"(c));
+#elif defined(__CUDA_ARCH__)
+  // Packed add followed by packed signed min.
+  const unsigned int sum = __vadd2(a, b);
+  res = __vmins2(sum, c);
+#else
+  // Host and older architecture code: emulate one half at a time.
+  const unsigned short lo_a = (unsigned short)(a & 0xFFFFU);
+  const unsigned short lo_b = (unsigned short)(b & 0xFFFFU);
+  const unsigned short hi_a = (unsigned short)(a >> 16);
+  const unsigned short hi_b = (unsigned short)(b >> 16);
+
+  // Sum as unsigned 16-bit (wraps), then reinterpret as signed.
+  const short lo_sum = __internal_cast_u2s((unsigned short)(lo_a + lo_b));
+  const short hi_sum = __internal_cast_u2s((unsigned short)(hi_a + hi_b));
+
+  const short lo_c = __internal_cast_u2s((unsigned short)(c & 0xFFFFU));
+  const short hi_c = __internal_cast_u2s((unsigned short)(c >> 16));
+
+  // A negative result sign-extends when widened; the mask (low half) and
+  // the shift (high half) discard the extension bits.
+  const unsigned packed_lo = (unsigned)min(lo_sum, lo_c);
+  const unsigned packed_hi = (unsigned)min(hi_sum, hi_c);
+
+  res = (packed_hi << 16) | (packed_lo & 0x0000FFFFU);
+#endif
+
+  return res;
+}
+
+// Unsigned 32-bit fused add-then-min: returns min(a + b, c), where the
+// unsigned sum wraps around on overflow.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmin_u32(const unsigned int a, const unsigned int b, const unsigned int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  unsigned int smaller;
+  asm ("{.reg .u32 t1; \n\t"
+      "add.u32 t1, %1, %2; \n\t"
+      "min.u32 %0, t1, %3;}\n\t"
+      : "=r"(smaller) : "r"(a), "r"(b), "r"(c));
+  return smaller;
+#else
+  // Host and older architecture code
+  const unsigned int sum = a + b;
+  return min(sum, c);
+#endif
+}
+
+// Per-halfword unsigned add-then-min on packed operands.
+// Each half of the result is min(a + b, c) of the corresponding unsigned
+// 16-bit halves, with the sum wrapping to 16 bits.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmin_u16x2(const unsigned int a, const unsigned int b, const unsigned int c){
+  unsigned int res;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm ("{.reg .b32 t1; \n\t"
+      "add.u16x2 t1, %1, %2; \n\t"
+      "min.u16x2 %0, t1, %3;}\n\t"
+      : "=r"(res) : "r"(a), "r"(b), "r"(c));
+#elif defined(__CUDA_ARCH__)
+  // Packed add followed by packed unsigned min.
+  const unsigned int sum = __vadd2(a, b);
+  res = __vminu2(sum, c);
+#else
+  // Host and older architecture code: emulate one half at a time.
+  const unsigned short lo_a = (unsigned short)(a & 0xFFFFU);
+  const unsigned short lo_b = (unsigned short)(b & 0xFFFFU);
+  const unsigned short lo_c = (unsigned short)(c & 0xFFFFU);
+
+  const unsigned short hi_a = (unsigned short)(a >> 16);
+  const unsigned short hi_b = (unsigned short)(b >> 16);
+  const unsigned short hi_c = (unsigned short)(c >> 16);
+
+  // The cast truncates each sum to 16 bits before it is compared with c.
+  const unsigned short lo_out = (unsigned short)min((unsigned short)(lo_a + lo_b), lo_c);
+  const unsigned short hi_out = (unsigned short)min((unsigned short)(hi_a + hi_b), hi_c);
+
+  // Reassemble: high half in bits 16-31, low half in bits 0-15.
+  res = (((unsigned int) hi_out) << 16) | ((unsigned int) lo_out);
+#endif
+
+  return res;
+}
+
+// Signed 32-bit fused add-then-max, clamped below at zero:
+// returns max(a + b, c, 0). The sum wraps around on overflow, matching the
+// add.s32 instruction used on the SM_90 path.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __viaddmax_s32_relu(const int a, const int b, const int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  int res;
+  asm ("{.reg .s32 t1; \n\t"
+      "add.s32 t1, %1, %2; \n\t"
+      "max.s32.relu %0, t1, %3;}\n\t"
+      : "=r"(res) : "r"(a), "r"(b), "r"(c));
+  return res;
+#else
+  // Host and older architecture code
+  // Add in unsigned arithmetic: a plain signed 'a + b' is undefined behavior
+  // on overflow, whereas the unsigned sum wraps like the device path. The
+  // conversion back to int is modular on the compilers CUDA supports.
+  const int sum = (int)((unsigned int)a + (unsigned int)b);
+  const int ans = max(sum, c);
+
+  // ReLU step: anything not strictly positive becomes zero.
+  return (ans > 0) ? ans : 0;
+#endif
+}
+
+// Per-halfword signed add-then-max on packed operands, clamped below at
+// zero. Each half of the result is max(a + b, c, 0) of the corresponding
+// 16-bit halves, with the sum wrapping to 16 bits before the comparison.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmax_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c){
+  unsigned int res;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm ("{.reg .b32 t1; \n\t"
+      "add.s16x2 t1, %1, %2; \n\t"
+      "max.s16x2.relu %0, t1, %3;}\n\t"
+      : "=r"(res) : "r"(a), "r"(b), "r"(c));
+#elif defined(__CUDA_ARCH__)
+  // Packed add followed by the clamping packed signed max.
+  const unsigned int sum = __vadd2(a, b);
+  res = __vimax_s16x2_relu(sum, c);
+#else
+  // Host and older architecture code: emulate one half at a time.
+  const unsigned short lo_a = (unsigned short)(a & 0xFFFFU);
+  const unsigned short lo_b = (unsigned short)(b & 0xFFFFU);
+  const unsigned short hi_a = (unsigned short)(a >> 16);
+  const unsigned short hi_b = (unsigned short)(b >> 16);
+
+  // Sum as unsigned 16-bit (wraps), then reinterpret as signed.
+  const short lo_sum = __internal_cast_u2s((unsigned short)(lo_a + lo_b));
+  const short hi_sum = __internal_cast_u2s((unsigned short)(hi_a + hi_b));
+
+  const short lo_c = __internal_cast_u2s((unsigned short)(c & 0xFFFFU));
+  const short hi_c = __internal_cast_u2s((unsigned short)(c >> 16));
+
+  // The clamp makes each half non-negative, so it fits in 16 bits and the
+  // low half needs no masking before packing.
+  const unsigned packed_lo = (unsigned)max(0, max(lo_sum, lo_c));
+  const unsigned packed_hi = (unsigned)max(0, max(hi_sum, hi_c));
+
+  res = (packed_hi << 16) | packed_lo;
+#endif
+
+  return res;
+}
+
+// Signed 32-bit fused add-then-min, clamped below at zero:
+// returns max(min(a + b, c), 0). The sum wraps around on overflow, matching
+// the add.s32 instruction used on the SM_90 path.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __viaddmin_s32_relu(const int a, const int b, const int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  int res;
+  asm ("{.reg .s32 t1; \n\t"
+      "add.s32 t1, %1, %2; \n\t"
+      "min.s32.relu %0, t1, %3;}\n\t"
+      : "=r"(res) : "r"(a), "r"(b), "r"(c));
+  return res;
+#else
+  // Host and older architecture code
+  // Add in unsigned arithmetic: a plain signed 'a + b' is undefined behavior
+  // on overflow, whereas the unsigned sum wraps like the device path. The
+  // conversion back to int is modular on the compilers CUDA supports.
+  const int sum = (int)((unsigned int)a + (unsigned int)b);
+  const int ans = min(sum, c);
+
+  // ReLU step: anything not strictly positive becomes zero.
+  return (ans > 0) ? ans : 0;
+#endif
+}
+
+// Per-halfword signed add-then-min on packed operands, clamped below at
+// zero. Each half of the result is max(min(a + b, c), 0) of the
+// corresponding 16-bit halves, with the sum wrapping to 16 bits first.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmin_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c){
+  unsigned int res;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm ("{.reg .b32 t1; \n\t"
+      "add.s16x2 t1, %1, %2; \n\t"
+      "min.s16x2.relu %0, t1, %3;}\n\t"
+      : "=r"(res) : "r"(a), "r"(b), "r"(c));
+#elif defined(__CUDA_ARCH__)
+  // Packed add followed by the clamping packed signed min.
+  const unsigned int sum = __vadd2(a, b);
+  res = __vimin_s16x2_relu(sum, c);
+#else
+  // Host and older architecture code: emulate one half at a time.
+  const unsigned short lo_a = (unsigned short)(a & 0xFFFFU);
+  const unsigned short lo_b = (unsigned short)(b & 0xFFFFU);
+  const unsigned short hi_a = (unsigned short)(a >> 16);
+  const unsigned short hi_b = (unsigned short)(b >> 16);
+
+  // Sum as unsigned 16-bit (wraps), then reinterpret as signed.
+  const short lo_sum = __internal_cast_u2s((unsigned short)(lo_a + lo_b));
+  const short hi_sum = __internal_cast_u2s((unsigned short)(hi_a + hi_b));
+
+  const short lo_c = __internal_cast_u2s((unsigned short)(c & 0xFFFFU));
+  const short hi_c = __internal_cast_u2s((unsigned short)(c >> 16));
+
+  // The clamp makes each half non-negative, so it fits in 16 bits and the
+  // low half needs no masking before packing.
+  const unsigned packed_lo = (unsigned)max(0, min(lo_sum, lo_c));
+  const unsigned packed_hi = (unsigned)max(0, min(hi_sum, hi_c));
+
+  res = (packed_hi << 16) | packed_lo;
+#endif
+
+  return res;
+}
+
+// vimax vimin with predicate
+// *pred gets set to '(a >= b)'
+// Signed 32-bit maximum of a and b that also reports which operand won.
+// Returns max(a, b) and stores (a >= b) through pred.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vibmax_s32(const int a, const int b, bool* const pred){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  int selected;
+  unsigned int a_wins;
+  asm ("{ .reg .pred __$temp1;\n\t"
+      "  setp.ge.s32  __$temp1, %2, %3;\n\t"
+      "  selp.s32 %0, %2, %3, __$temp1;\n\t"
+      "  selp.s32 %1, 1, 0, __$temp1;}\n\t"
+      : "=r"(selected), "=r"(a_wins) : "r"(a), "r"(b));
+
+  // The asm materializes the predicate as 1 or 0.
+  *pred = (a_wins != 0U);
+  return selected;
+#else
+  // Host and older architecture code
+  *pred = (a >= b);
+  return max(a, b);
+#endif
+}
+
+// Unsigned 32-bit maximum of a and b that also reports which operand won.
+// Returns max(a, b) and stores (a >= b) through pred.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmax_u32(const unsigned int a, const unsigned int b, bool* const pred){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  unsigned int selected;
+  unsigned int a_wins;
+  asm ("{ .reg .pred __$temp1;\n\t"
+      "  setp.ge.u32  __$temp1, %2, %3;\n\t"
+      "  selp.u32 %0, %2, %3, __$temp1;\n\t"
+      "  selp.u32 %1, 1, 0, __$temp1;}\n\t"
+      : "=r"(selected), "=r"(a_wins) : "r"(a), "r"(b));
+
+  // The asm materializes the predicate as 1 or 0.
+  *pred = (a_wins != 0U);
+  return selected;
+#else
+  // Host and older architecture code
+  *pred = (a >= b);
+  return max(a, b);
+#endif
+}
+
+// *pred gets set to '(a <= b)'
+// Signed 32-bit minimum of a and b that also reports which operand won.
+// Returns min(a, b) and stores (a <= b) through pred.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vibmin_s32(const int a, const int b, bool* const pred){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  int selected;
+  unsigned int a_wins;
+  asm ("{ .reg .pred __$temp1;\n\t"
+      "  setp.le.s32  __$temp1, %2, %3;\n\t"
+      "  selp.s32 %0, %2, %3, __$temp1;\n\t"
+      "  selp.s32 %1, 1, 0, __$temp1;}\n\t"
+      : "=r"(selected), "=r"(a_wins) : "r"(a), "r"(b));
+
+  // The asm materializes the predicate as 1 or 0.
+  *pred = (a_wins != 0U);
+  return selected;
+#else
+  // Host and older architecture code
+  *pred = (a <= b);
+  return min(a, b);
+#endif
+}
+
+// *pred gets set to '(a <= b)'
+// Unsigned 32-bit minimum of a and b that also reports which operand won.
+// Returns min(a, b) and stores (a <= b) through pred.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmin_u32(const unsigned int a, const unsigned int b, bool* const pred){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  unsigned int selected;
+  unsigned int a_wins;
+  asm ("{ .reg .pred __$temp1;\n\t"
+      "  setp.le.u32  __$temp1, %2, %3;\n\t"
+      "  selp.u32 %0, %2, %3, __$temp1;\n\t"
+      "  selp.u32 %1, 1, 0, __$temp1;}\n\t"
+      : "=r"(selected), "=r"(a_wins) : "r"(a), "r"(b));
+
+  // The asm materializes the predicate as 1 or 0.
+  *pred = (a_wins != 0U);
+  return selected;
+#else
+  // Host and older architecture code
+  *pred = (a <= b);
+  return min(a, b);
+#endif
+}
+
+// Per-halfword signed maximum of two packed operands with per-half flags.
+// Returns max(a, b) for each signed 16-bit half. *pred_hi / *pred_lo are set
+// to whether a's high / low half is >= b's (on the SM_90 path: whether that
+// half of the result equals a's half).
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmax_s16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  unsigned int packed;
+  unsigned int hi_flag;
+  unsigned int lo_flag;
+  asm ("{.reg .pred pu, pv; \n\t"
+      ".reg .s16 rs0, rs1, rs2, rs3; \n\t"
+      "max.s16x2 %0, %3, %4; \n\t"
+      "mov.b32 {rs0, rs1}, %0; \n\t"
+      "mov.b32 {rs2, rs3}, %3; \n\t"
+      "setp.eq.s16 pv, rs0, rs2; \n\t"
+      "setp.eq.s16 pu, rs1, rs3; \n\t"
+      "selp.b32 %1, 1, 0, pu; \n\t"
+      "selp.b32 %2, 1, 0, pv;} \n\t"
+      : "=r"(packed), "=r"(hi_flag),"=r"(lo_flag) : "r"(a), "r"(b));
+
+  // The asm materializes each predicate as 1 or 0.
+  *pred_hi = (hi_flag != 0U);
+  *pred_lo = (lo_flag != 0U);
+  return packed;
+#else
+  // Host and older architecture code: emulate one half at a time.
+  // Reinterpret the low and high 16 bits of each operand as signed shorts.
+  const short lo_a = __internal_cast_u2s((unsigned short)(a & 0xFFFFU));
+  const short lo_b = __internal_cast_u2s((unsigned short)(b & 0xFFFFU));
+
+  const short hi_a = __internal_cast_u2s((unsigned short)(a >> 16));
+  const short hi_b = __internal_cast_u2s((unsigned short)(b >> 16));
+
+  *pred_hi = (hi_a >= hi_b);
+  *pred_lo = (lo_a >= lo_b);
+
+  // A negative result sign-extends when widened; the mask (low half) and
+  // the shift (high half) discard the extension bits.
+  const unsigned int packed_lo = (unsigned int)max(lo_a, lo_b);
+  const unsigned int packed_hi = (unsigned int)max(hi_a, hi_b);
+
+  return (packed_hi << 16) | (packed_lo & 0x0000FFFFU);
+#endif
+}
+
+// Per-halfword unsigned maximum of two packed operands with per-half flags.
+// Returns max(a, b) for each unsigned 16-bit half. *pred_hi / *pred_lo are
+// set to whether a's high / low half is >= b's (on the SM_90 path: whether
+// that half of the result equals a's half).
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmax_u16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  unsigned int packed;
+  unsigned int hi_flag;
+  unsigned int lo_flag;
+  asm ("{.reg .pred pu, pv; \n\t"
+      ".reg .u16 rs0, rs1, rs2, rs3; \n\t"
+      "max.u16x2 %0, %3, %4; \n\t"
+      "mov.b32 {rs0, rs1}, %0; \n\t"
+      "mov.b32 {rs2, rs3}, %3; \n\t"
+      "setp.eq.u16 pv, rs0, rs2; \n\t"
+      "setp.eq.u16 pu, rs1, rs3; \n\t"
+      "selp.b32 %1, 1, 0, pu; \n\t"
+      "selp.b32 %2, 1, 0, pv;} \n\t"
+      : "=r"(packed), "=r"(hi_flag),"=r"(lo_flag) : "r"(a), "r"(b));
+
+  // The asm materializes each predicate as 1 or 0.
+  *pred_hi = (hi_flag != 0U);
+  *pred_lo = (lo_flag != 0U);
+  return packed;
+#else
+  // Host and older architecture code: emulate one half at a time.
+  const unsigned short lo_a = (unsigned short)(a & 0xFFFFU);
+  const unsigned short lo_b = (unsigned short)(b & 0xFFFFU);
+
+  const unsigned short hi_a = (unsigned short)(a >> 16);
+  const unsigned short hi_b = (unsigned short)(b >> 16);
+
+  *pred_hi = (hi_a >= hi_b);
+  *pred_lo = (lo_a >= lo_b);
+
+  const unsigned short lo_max = (unsigned short)max(lo_a, lo_b);
+  const unsigned short hi_max = (unsigned short)max(hi_a, hi_b);
+
+  // Reassemble: high half in bits 16-31, low half in bits 0-15.
+  return (((unsigned int) hi_max) << 16) | ((unsigned int) lo_max);
+#endif
+}
+
+// Per-halfword signed minimum of two packed operands with per-half flags.
+// Returns min(a, b) for each signed 16-bit half. *pred_hi / *pred_lo are set
+// to whether a's high / low half is <= b's (on the SM_90 path: whether that
+// half of the result equals a's half).
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmin_s16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  unsigned int packed;
+  unsigned int hi_flag;
+  unsigned int lo_flag;
+  // NOTE(review): the temporaries are declared .u16 here while
+  // __vibmax_s16x2 declares them .s16; they only feed equality tests, so
+  // this looks harmless - confirm against the PTX type rules.
+  asm ("{.reg .pred pu, pv; \n\t"
+      ".reg .u16 rs0, rs1, rs2, rs3; \n\t"
+      "min.s16x2 %0, %3, %4; \n\t"
+      "mov.b32 {rs0, rs1}, %0; \n\t"
+      "mov.b32 {rs2, rs3}, %3; \n\t"
+      "setp.eq.s16 pv, rs0, rs2; \n\t"
+      "setp.eq.s16 pu, rs1, rs3; \n\t"
+      "selp.b32 %1, 1, 0, pu; \n\t"
+      "selp.b32 %2, 1, 0, pv;} \n\t"
+      : "=r"(packed), "=r"(hi_flag),"=r"(lo_flag) : "r"(a), "r"(b));
+
+  // The asm materializes each predicate as 1 or 0.
+  *pred_hi = (hi_flag != 0U);
+  *pred_lo = (lo_flag != 0U);
+  return packed;
+#else
+  // Host and older architecture code: emulate one half at a time.
+  // Reinterpret the low and high 16 bits of each operand as signed shorts.
+  const short lo_a = __internal_cast_u2s((unsigned short)(a & 0xFFFFU));
+  const short lo_b = __internal_cast_u2s((unsigned short)(b & 0xFFFFU));
+
+  const short hi_a = __internal_cast_u2s((unsigned short)(a >> 16));
+  const short hi_b = __internal_cast_u2s((unsigned short)(b >> 16));
+
+  *pred_hi = (hi_a <= hi_b);
+  *pred_lo = (lo_a <= lo_b);
+
+  // A negative result sign-extends when widened; the mask (low half) and
+  // the shift (high half) discard the extension bits.
+  const unsigned int packed_lo = (unsigned int)min(lo_a, lo_b);
+  const unsigned int packed_hi = (unsigned int)min(hi_a, hi_b);
+
+  return (packed_hi << 16) | (packed_lo & 0x0000FFFFU);
+#endif
+}
+
+// Per-halfword unsigned minimum of two packed operands with per-half flags.
+// Returns min(a, b) for each unsigned 16-bit half. *pred_hi / *pred_lo are
+// set to whether a's high / low half is <= b's (on the SM_90 path: whether
+// that half of the result equals a's half).
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmin_u16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  unsigned int packed;
+  unsigned int hi_flag;
+  unsigned int lo_flag;
+  asm ("{.reg .pred pu, pv; \n\t"
+      ".reg .u16 rs0, rs1, rs2, rs3; \n\t"
+      "min.u16x2 %0, %3, %4; \n\t"
+      "mov.b32 {rs0, rs1}, %0; \n\t"
+      "mov.b32 {rs2, rs3}, %3; \n\t"
+      "setp.eq.u16 pv, rs0, rs2; \n\t"
+      "setp.eq.u16 pu, rs1, rs3; \n\t"
+      "selp.b32 %1, 1, 0, pu; \n\t"
+      "selp.b32 %2, 1, 0, pv;} \n\t"
+      : "=r"(packed), "=r"(hi_flag),"=r"(lo_flag) : "r"(a), "r"(b));
+
+  // The asm materializes each predicate as 1 or 0.
+  *pred_hi = (hi_flag != 0U);
+  *pred_lo = (lo_flag != 0U);
+  return packed;
+#else
+  // Host and older architecture code: emulate one half at a time.
+  const unsigned short lo_a = (unsigned short)(a & 0xFFFFU);
+  const unsigned short lo_b = (unsigned short)(b & 0xFFFFU);
+
+  const unsigned short hi_a = (unsigned short)(a >> 16);
+  const unsigned short hi_b = (unsigned short)(b >> 16);
+
+  *pred_hi = (hi_a <= hi_b);
+  *pred_lo = (lo_a <= lo_b);
+
+  const unsigned short lo_min = (unsigned short)min(lo_a, lo_b);
+  const unsigned short hi_min = (unsigned short)min(hi_a, hi_b);
+
+  // Reassemble: high half in bits 16-31, low half in bits 0-15.
+  return (((unsigned int) hi_min) << 16) | ((unsigned int) lo_min);
+#endif
+}
+
+// Keep the SM_90 feature-test macro from leaking past this point.
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+#undef __CUDA_AND_AT_LEAST_SM_90__
+#endif
+
+// Likewise retire the declaration-specifier macro used by the functions above.
+#undef __DEVICE_HOST_FUNCTIONS_STATIC_DECL__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#endif /* !__DEVICE_FUNCTIONS_HPP__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_HPP__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_HPP__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/func_macro.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/func_macro.h
new file mode 100644
index 0000000000000000000000000000000000000000..633554a01aaabd1bca5ae278c276710f323d5d7b
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/func_macro.h
@@ -0,0 +1,57 @@
+/*
+ * NVIDIA_COPYRIGHT_BEGIN
+ *
+ * Copyright (c) 2008-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ * NVIDIA_COPYRIGHT_END
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/func_macro.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/func_macro.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_FUNC_MACRO_H__
+#endif
+
+#if !defined(__FUNC_MACRO_H__)
+#define __FUNC_MACRO_H__
+
+#if !defined(__CUDA_INTERNAL_COMPILATION__)
+
+#error -- incorrect inclusion of a cudart header file
+
+#endif /* !__CUDA_INTERNAL_COMPILATION__ */
+
+/*
+ * Declaration-wrapper macros. Each takes a complete declaration `decl` and
+ * prefixes it with compiler-specific specifiers. No definition is provided
+ * when neither __GNUC__ nor _WIN32 is defined.
+ */
+#if defined(__GNUC__)
+
+/* GCC-compatible compilers: declare `decl` as inline. */
+#define __func__(decl) \
+        inline decl
+
+/* GCC-compatible compilers: declare `decl` as static and flag it as
+ * possibly unused. */
+#define __device_func__(decl) \
+        static __attribute__((__unused__)) decl
+
+#elif defined(_WIN32)
+
+/* Windows toolchains: declare `decl` as static inline. */
+#define __func__(decl) \
+        static inline decl
+
+/* Windows toolchains: declare `decl` as static. */
+#define __device_func__(decl) \
+        static decl
+
+#endif /* __GNUC__ */
+
+#endif /* __FUNC_MACRO_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_FUNC_MACRO_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_FUNC_MACRO_H__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/host_config.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/host_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..820b81c2945d8dcc241329673a558090a4922e52
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/host_config.h
@@ -0,0 +1,310 @@
+/*
+ * Copyright 1993-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/host_config.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/host_config.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__
+#endif
+
+#if !defined(__HOST_CONFIG_H__)
+#define __HOST_CONFIG_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__CUDACC__)
+
+#if defined(__CUDACC_RTC__)
+
+#define _CRTIMP
+#define __THROW
+
+#else /* __CUDACC_RTC__ */
+
+/* check for host compilers that are compatible with nvcc */
+#if !defined(__GNUC__) && !defined(_WIN32)
+
+#error --- !!! UNSUPPORTED COMPILER !!! ---
+
+#endif /* !__GNUC__ && !_WIN32 */
+
+/* check invalid configurations */
+#if defined(__PGIC__)
+#if !defined(__GNUC__) || !defined(__LP64__) || !defined(__linux__)
+#error -- unsupported pgc++ configuration! pgc++ is supported only on Linux x86_64!
+#endif /* !defined(__GNUC__) || !defined(__LP64__) || !defined(__linux__) */
+#endif  /* defined(__PGIC__) */
+
+#if defined(__powerpc__)
+#if !defined(__powerpc64__) || !defined(__LITTLE_ENDIAN__)
+#error -- unsupported PPC platform! Only 64-bit little endian PPC is supported!
+#endif /* !__powerpc64__ || !__LITTLE_ENDIAN__ */
+#endif /* __powerpc__ */
+
+#if defined(__APPLE__) && defined(__MACH__) && !defined(__clang__)
+#error -- clang and clang++ are the only supported host compilers on Mac OS X!
+#endif /* __APPLE__ && __MACH__ && !__clang__ */
+
+
+/* check host compiler version  */
+#if !__NV_NO_HOST_COMPILER_CHECK
+
+#if defined(__ICC)
+
+#if (__ICC != 1500 && __ICC != 1600 && __ICC != 1700 && __ICC != 1800 && !(__ICC >= 1900 && __ICC <= 2021)) || !defined(__GNUC__) || !defined(__LP64__)
+
+#error -- unsupported ICC configuration! Only ICC 15.0, ICC 16.0, ICC 17.0, ICC 18.0, ICC 19.x and 20.x on Linux x86_64 are supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+ 
+#endif /* (__ICC != 1500 && __ICC != 1600 && __ICC != 1700 && __ICC != 1800 && !(__ICC >= 1900 && __ICC <= 2021)) || !__GNUC__ || !__LP64__ */
+
+#endif /* __ICC */
+
+#if defined(__GRCO_CLANG_COMPILER__)
+#if (__GRCO_CLANG_COMPILER__ == 1) && ((__clang_major__ < 16) || (__clang_major__ > 19))
+#error -- unsupported Grace clang version! The version must be 16.x to 19.x. The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+#endif  /* (__GRCO_CLANG_COMPILER__ == 1) && ((__clang_major__ < 16) || (__clang_major__ > 19)) */
+
+#endif /* __GRCO_CLANG_COMPILER__  */
+
+#if defined(__INTEL_CLANG_COMPILER)
+#error -- unsupported Intel ICX compiler! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+#endif /* __INTEL_CLANG_COMPILER */
+
+#if defined(__powerpc__)
+
+#if defined(__ibmxl_vrm__) && !(__ibmxl_vrm__ >= 0x0d010000 && __ibmxl_vrm__ < 0x0d020000) && \
+                              !(__ibmxl_vrm__ >= 0x10010000 && __ibmxl_vrm__ < 0x10020000)
+
+#error -- unsupported xlC version! only xlC 13.1 and 16.1 are supported. The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+
+#endif /* __ibmxl_vrm__ && !(__ibmxl_vrm__ >= 0x0d010000 && __ibmxl_vrm__ < 0x0d020000) &&
+                           !(__ibmxl_vrm__ >= 0x10010000 && __ibmxl_vrm__ < 0x10020000) */
+
+#endif /* __powerpc__ */
+
+#if defined(__GNUC__)
+
+#if __GNUC__ > 14
+
+#error -- unsupported GNU version! gcc versions later than 14 are not supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+
+#endif /* __GNUC__ > 14 */
+
+
+#if defined(__HORIZON__)
+#if (__clang_major__ >= 20) || (__clang_major__ < 3) || ((__clang_major__ == 3) &&  (__clang_minor__ < 3))
+#error -- unsupported HOS clang version! The version must be must be less than 20 and greater than 3.2 . The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+#endif  /* (__clang_major__ >= 20) || (__clang_major__ < 3) || ((__clang_major__ == 3) &&  (__clang_minor__ < 3)) */
+#endif /* __HORIZON__  */
+
+#if defined(__clang__) && !defined(__ibmxl_vrm__) && !defined(__ICC) && !defined(__HORIZON__) && !defined(__APPLE__) && !defined(__GRCO_CLANG_COMPILER__)
+
+#if (__clang_major__ >= 20) || (__clang_major__ < 3) || ((__clang_major__ == 3) &&  (__clang_minor__ < 3))
+#error -- unsupported clang version! clang version must be less than 20 and greater than 3.2 . The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+
+#endif  /* (__clang_major__ >=  20) || (__clang_major__ < 3) || ((__clang_major__ == 3) &&  (__clang_minor__ < 3)) */
+
+#endif /* defined(__clang__) && !defined(__ibmxl_vrm__) && !defined(__ICC) && !defined(__HORIZON__) && !defined(__APPLE__) && !defined(__GRCO_CLANG_COMPILER__) */
+
+
+#endif /* __GNUC__ */
+
+#if defined(_WIN32)
+
+#if _MSC_VER < 1910 || _MSC_VER >= 1950
+
+#error -- unsupported Microsoft Visual Studio version! Only the versions between 2017 and 2022 (inclusive) are supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+
+#elif _MSC_VER >= 1910 && _MSC_VER < 1910 /* empty range: this branch can never be taken, so no version currently triggers the deprecation notice */
+
+#pragma message("support for this version of Microsoft Visual Studio has been deprecated! Only the versions between 2017 and 2022 (inclusive) are supported!")
+
+#endif /* (_MSC_VER < 1910 || _MSC_VER >= 1950) || (_MSC_VER >= 1910 && _MSC_VER < 1910) */
+
+#endif /* _WIN32 */
+#endif  /* !__NV_NO_HOST_COMPILER_CHECK */
+
+
+/* configure host compiler */
+#if defined(__APPLE__)
+
+#define _CRTIMP
+#define _ACRTIMP
+#define __THROW
+
+#if defined(__BLOCKS__) /* nvcc does not support closures */
+
+#undef __BLOCKS__
+
+#endif /* __BLOCKS__ */
+
+#elif defined(__ANDROID__)
+
+#define _CRTIMP
+#define _ACRTIMP
+#define __THROW
+
+#elif defined(__QNX__)
+
+#define _CRTIMP
+#define _ACRTIMP
+#define __THROW
+
+#elif defined(__HORIZON__)
+
+#define _CRTIMP
+#define _ACRTIMP
+#define __THROW
+
+#elif defined(__GNUC__)
+
+#define _CRTIMP
+#define _ACRTIMP
+
+#include <features.h> /* for __THROW */
+
+#elif defined(_WIN32)
+
+#if _MSC_VER >= 1500
+
+#undef _USE_DECLSPECS_FOR_SAL
+#define _USE_DECLSPECS_FOR_SAL \
+        1
+
+#endif /* _MSC_VER >= 1500 */
+
+#if !defined(_CRT_NONSTDC_NO_WARNINGS)
+
+#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */
+
+#endif /* !_CRT_NONSTDC_NO_WARNINGS */
+
+#if !defined(_CRT_SECURE_NO_WARNINGS)
+
+#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */
+
+#endif /* !_CRT_SECURE_NO_WARNINGS */
+
+#if !defined(NOMINMAX)
+
+#define NOMINMAX /* min and max are part of cuda runtime */
+
+#endif /* !NOMINMAX */
+
+#include <crtdefs.h> /* for _CRTIMP */
+#if _MSC_VER >= 1900
+#include <corecrt.h> /* for _ACRTIMP */
+#endif /* _MSC_VER >= 1900 */
+
+#define __THROW
+
+#endif /* __APPLE__ */
+
+#endif /* __CUDACC_RTC__ */
+
+
+#if defined(__cplusplus) && defined(__CUDA_ARCH__) && (defined(__PGIC__) || defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER)))
+
+#if __CUDACC_RTC__
+typedef char *va_list;
+#else /* !__CUDACC_RTC__ */
+#include <cstdarg>
+#endif /* __CUDACC_RTC__ */
+
+
+#undef va_start
+#undef va_end
+#undef va_arg
+
+#ifdef __PGIC__
+
+#undef __builtin_va_end
+
+#define va_start(v,l) __builtin_alt_va_start(v,l)
+#define va_end(v) __builtin_va_end(v)
+#define va_arg(v,l) __builtin_alt_va_arg(v,l)
+
+#if (__cplusplus >= 201103L)
+#undef va_copy
+#define va_copy(d,s)  __builtin_va_copy(d,s)
+#endif
+
+#else /* !__PGIC__ */
+
+
+#define va_start(ap, x) (__cu_va_start(&ap, x))
+#define va_end(ap) (__cu_va_end(&ap))
+#define va_arg(ap, t)  (*((t *)__cu_va_arg(&ap, (t *)0)))
+
+#if (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L))
+#undef va_copy
+#define va_copy(apd, aps) (__cu_va_copy(&(apd), &(aps)))
+#endif /* (_MSC_VER >= 1800)  || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L)) */
+#endif /* __PGIC__ */
+
+#endif /* defined(__cplusplus) && defined(__CUDA_ARCH__) && (defined(__PGIC__) || defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER))) */
+
+
+
+#endif /* __CUDACC__ */
+
+#endif /* !__HOST_CONFIG_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/host_defines.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/host_defines.h
new file mode 100644
index 0000000000000000000000000000000000000000..b58cb3cc1086bdc6e0376f042c86b2755ef2ff00
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/host_defines.h
@@ -0,0 +1,283 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/host_defines.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/host_defines.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H__
+#endif
+
+#if !defined(__HOST_DEFINES_H__)
+#define __HOST_DEFINES_H__
+
+#if defined(__CUDACC__) && !defined(__CUDACC_RTC__) && !defined(__CUDADEVRT_INTERNAL__) && !defined(_ALLOW_UNSUPPORTED_LIBCPP)
+#include <ctype.h>
+#if ((defined(_MSC_VER ) && (defined(_M_X64) || defined(_M_AMD64))) ||\
+     (defined(__x86_64__) || defined(__amd64__))) && defined(_LIBCPP_VERSION) && !(defined(__HORIZON__) || defined(__ANDROID__) || defined(__QNX__))
+#error "libc++ is not supported on x86 system"
+#endif
+#endif
+
+/* CUDA JIT mode (__CUDACC_RTC__) also uses GNU style attributes */
+#if defined(__GNUC__) || (defined(__PGIC__) && defined(__linux__)) || defined(__CUDA_LIBDEVICE__) || defined(__CUDACC_RTC__)
+
+#if defined(__CUDACC_RTC__)
+#define __volatile__ volatile
+#endif /* __CUDACC_RTC__ */
+
+#define __no_return__ \
+        __attribute__((noreturn))
+        
+#if defined(__CUDACC__) || defined(__CUDA_ARCH__) || defined(__CUDA_LIBDEVICE__)
+/* gcc allows users to define attributes with underscores, 
+   e.g., __attribute__((__noinline__)).
+   Consider a non-CUDA source file (e.g. .cpp) that has the 
+   above attribute specification, and includes this header file. In that case,
+   defining __noinline__ as below  would cause a gcc compilation error.
+   Hence, only define __noinline__ when the code is being processed
+   by a  CUDA compiler component.
+*/   
+#define __noinline__ \
+        __attribute__((noinline))
+#endif /* __CUDACC__  || __CUDA_ARCH__ || __CUDA_LIBDEVICE__ */
+
+#undef __forceinline__
+#define __forceinline__ \
+        __inline__ __attribute__((always_inline))
+#define __inline_hint__ \
+        __attribute__((nv_inline_hint))
+#define __align__(n) \
+        __attribute__((aligned(n)))
+#define __maxnreg__(a) \
+        __attribute__((maxnreg(a)))
+#define __thread__ \
+        __thread
+#define __import__
+#define __export__
+#define __cdecl
+#define __annotate__(a) \
+        __attribute__((a))
+#define __location__(a) \
+        __annotate__(a)
+#define CUDARTAPI
+#define CUDARTAPI_CDECL
+
+#elif defined(_MSC_VER)
+
+#if _MSC_VER >= 1400
+
+#define __restrict__ \
+        __restrict
+
+#else /* _MSC_VER >= 1400 */
+
+#define __restrict__
+
+#endif /* _MSC_VER >= 1400 */
+
+#define __inline__ \
+        __inline
+#define __no_return__ \
+        __declspec(noreturn)
+#define __noinline__ \
+        __declspec(noinline)
+#define __forceinline__ \
+        __forceinline
+#define __inline_hint__ \
+        __declspec(nv_inline_hint)
+#define __align__(n) \
+        __declspec(align(n))
+#define __maxnreg__(n) \
+        __declspec(maxnreg(n))
+#define __thread__ \
+        __declspec(thread)
+#define __import__ \
+        __declspec(dllimport)
+#define __export__ \
+        __declspec(dllexport)
+#define __annotate__(a) \
+        __declspec(a)
+#define __location__(a) \
+        __annotate__(__##a##__)
+#define CUDARTAPI \
+        __stdcall
+#define CUDARTAPI_CDECL \
+        __cdecl
+
+#else /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
+
+#define __inline__
+
+#if !defined(__align__)
+
+#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for '__align__' !!! ---
+
+#endif /* !__align__ */
+
+#if !defined(CUDARTAPI)
+
+#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for 'CUDARTAPI' !!! ---
+
+#endif /* !CUDARTAPI */
+
+#endif /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
+
+#if (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !defined(__clang__)))) || \
+    (defined(_MSC_VER) && _MSC_VER < 1900) || \
+    (!defined(__GNUC__) && !defined(_MSC_VER))
+
+#define __specialization_static \
+        static
+
+#else /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
+         (_MSC_VER && _MSC_VER < 1900) ||
+         (!__GNUC__ && !_MSC_VER) */
+
+#define __specialization_static
+
+#endif /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
+         (_MSC_VER && _MSC_VER < 1900) ||
+         (!__GNUC__ && !_MSC_VER) */
+
+#if !defined(__CUDACC__) && !defined(__CUDA_LIBDEVICE__)
+
+#undef __annotate__
+#define __annotate__(a)
+
+#else /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */
+
+#define __launch_bounds__(...) \
+        __annotate__(launch_bounds(__VA_ARGS__))
+
+#endif /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */
+
+#if defined(__CUDACC__) || defined(__CUDA_LIBDEVICE__) || \
+    defined(__GNUC__) || defined(_WIN64)
+
+#define __builtin_align__(a) \
+        __align__(a)
+
+#else /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */
+
+#define __builtin_align__(a)
+
+#endif /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__  || _WIN64 */
+
+#if defined(__CUDACC__) || !defined(__grid_constant__)
+#define __grid_constant__ \
+        __location__(grid_constant)
+#endif /* defined(__CUDACC__) || !defined(__grid_constant__) */
+        
+#if defined(__CUDACC__) || !defined(__host__)
+#define __host__ \
+        __location__(host)
+#endif /* defined(__CUDACC__) || !defined(__host__) */
+#if defined(__CUDACC__) || !defined(__device__)
+#define __device__ \
+        __location__(device)
+#endif /* defined(__CUDACC__) || !defined(__device__) */
+#if defined(__CUDACC__) || !defined(__global__)
+#define __global__ \
+        __location__(global)
+#endif /* defined(__CUDACC__) || !defined(__global__) */
+#if defined(__CUDACC__) || !defined(__shared__)
+#define __shared__ \
+        __location__(shared)
+#endif /* defined(__CUDACC__) || !defined(__shared__) */
+#if defined(__CUDACC__) || !defined(__constant__)
+#define __constant__ \
+        __location__(constant)
+#endif /* defined(__CUDACC__) || !defined(__constant__) */
+#if defined(__CUDACC__) || !defined(__managed__)
+#define __managed__ \
+        __location__(managed)
+#endif /* defined(__CUDACC__) || !defined(__managed__) */
+#if defined(__CUDACC__) || !defined(__nv_pure__)
+#define __nv_pure__ \
+        __location__(nv_pure)
+#endif /* defined(__CUDACC__) || !defined(__nv_pure__) */  
+#if !defined(__CUDACC__)
+#define __device_builtin__
+#define __device_builtin_texture_type__
+#define __device_builtin_surface_type__
+#define __cudart_builtin__
+#else /* defined(__CUDACC__) */
+#define __device_builtin__ \
+        __location__(device_builtin)
+#define __device_builtin_texture_type__ \
+        __location__(device_builtin_texture_type)
+#define __device_builtin_surface_type__ \
+        __location__(device_builtin_surface_type)
+#define __cudart_builtin__ \
+        __location__(cudart_builtin)
+#endif /* !defined(__CUDACC__) */
+
+#if defined(__CUDACC__) || !defined(__cluster_dims__)
+#if defined(_MSC_VER)        
+#define __cluster_dims__(...) \
+        __declspec(__cluster_dims__(__VA_ARGS__))
+        
+#else  /* !defined(_MSC_VER) */
+#define __cluster_dims__(...) \
+        __attribute__((cluster_dims(__VA_ARGS__)))
+#endif  /* defined(_MSC_VER) */
+#endif  /* defined(__CUDACC__) || !defined(__cluster_dims__) */
+
+#define __CUDA_ARCH_HAS_FEATURE__(_FEAT) __CUDA_ARCH_FEAT_##_FEAT
+
+#endif /* !__HOST_DEFINES_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/host_runtime.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/host_runtime.h
new file mode 100644
index 0000000000000000000000000000000000000000..22e3a1bea875ddb2a15075f6e0ecb10b7ce1a6a7
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/host_runtime.h
@@ -0,0 +1,306 @@
+/*
+ * NVIDIA_COPYRIGHT_BEGIN
+ *
+ * Copyright (c) 2008-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ * NVIDIA_COPYRIGHT_END
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) /* guard macro not set by the includer: warn, then set it temporarily (undone at end of file) */
+#if defined(_MSC_VER)
+#pragma message("crt/host_runtime.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/host_runtime.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_RUNTIME_H__
+#endif
+
+#if !defined(__CUDA_INTERNAL_COMPILATION__)
+
+#define __CUDA_INTERNAL_COMPILATION__
+#define __text__
+#define __surf__
+#define __name__shadow_var(c, cpp) \
+        #c
+#define __name__text_var(c, cpp) \
+        #cpp
+#define __host__shadow_var(c, cpp) \
+        cpp
+#define __text_var(c, cpp) \
+        cpp
+#define __device_fun(fun) \
+        #fun
+#define __device_var(var) \
+        #var
+#define __device__text_var(c, cpp) \
+        #c
+#define __device__shadow_var(c, cpp) \
+        #c
+
+#if defined(_WIN32) && !defined(_WIN64)
+
+#define __pad__(f) \
+        f
+
+#else /* _WIN32 && !_WIN64 */
+
+#define __pad__(f)
+
+#endif /* _WIN32 && !_WIN64 */
+
+#include "builtin_types.h"
+#include "storage_class.h"
+
+#else /* !__CUDA_INTERNAL_COMPILATION__ */
+
+template <typename T>
+static inline T *__cudaAddressOf(T &val) 
+{   /* Returns the address of val; the char-reference cast means the built-in unary & is applied, not any operator& overloaded for T. */
+    return (T *)((void *)(&(const_cast<char &>(reinterpret_cast<const volatile char &>(val)))));
+}
+
+#define __cudaRegisterBinary(X)                                                   \
+        __cudaFatCubinHandle = __cudaRegisterFatBinary((void*)&__fatDeviceText); \
+        { void (*callback_fp)(void **) =  (void (*)(void **))(X); (*callback_fp)(__cudaFatCubinHandle); __cudaRegisterFatBinaryEnd(__cudaFatCubinHandle); }\
+        atexit(__cudaUnregisterBinaryUtil)
+        
+#define __cudaRegisterVariable(handle, var, ext, size, constant, global) \
+        __cudaRegisterVar(handle, (char*)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global)
+#define __cudaRegisterManagedVariable(handle, var, ext, size, constant, global) \
+        __cudaRegisterManagedVar(handle, (void **)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global)
+
+#define __cudaRegisterGlobalTexture(handle, tex, dim, norm, ext) \
+        __cudaRegisterTexture(handle, (const struct textureReference*)&tex, (const void**)(void*)__device##tex, __name##tex, dim, norm, ext)
+#define __cudaRegisterGlobalSurface(handle, surf, dim, ext) \
+        __cudaRegisterSurface(handle, (const struct surfaceReference*)&surf, (const void**)(void*)__device##surf, __name##surf, dim, ext)
+#define __cudaRegisterEntry(handle, funptr, fun, thread_limit) \
+        __cudaRegisterFunction(handle, (const char*)funptr, (char*)__device_fun(fun), #fun, -1, (uint3*)0, (uint3*)0, (dim3*)0, (dim3*)0, (int*)0)
+
+extern "C" cudaError_t CUDARTAPI __cudaPopCallConfiguration(
+  dim3         *gridDim,
+  dim3         *blockDim,
+  size_t       *sharedMem,
+  void         *stream
+);
+
+#define __cudaLaunchPrologue(size) \
+        void * __args_arr[size]; \
+        int __args_idx = 0
+        
+#define __cudaSetupArg(arg, offset) \
+        __args_arr[__args_idx] = (void *)__cudaAddressOf(arg); ++__args_idx
+          
+#define __cudaSetupArgSimple(arg, offset) \
+        __args_arr[__args_idx] = (void *)(char *)&arg; ++__args_idx
+        
+#if defined(__GNUC__)
+#define __NV_ATTR_UNUSED_FOR_LAUNCH __attribute__((unused))
+#else  /* !__GNUC__ */
+#define __NV_ATTR_UNUSED_FOR_LAUNCH
+#endif  /* __GNUC__ */
+
+#ifdef __NV_LEGACY_LAUNCH
+/* the use of __args_idx in the expression below avoids host compiler warning about it being an
+   unused variable when the launch has no arguments */
+#define __cudaLaunch(fun) \
+        { volatile static char *__f __NV_ATTR_UNUSED_FOR_LAUNCH;  __f = fun; \
+          dim3 __gridDim, __blockDim;\
+          size_t __sharedMem; \
+          cudaStream_t __stream; \
+          if (__cudaPopCallConfiguration(&__gridDim, &__blockDim, &__sharedMem, &__stream) != cudaSuccess) \
+            return; \
+          if (__args_idx == 0) {\
+            (void)cudaLaunchKernel(fun, __gridDim, __blockDim, &__args_arr[__args_idx], __sharedMem, __stream);\
+          } else { \
+            (void)cudaLaunchKernel(fun, __gridDim, __blockDim, &__args_arr[0], __sharedMem, __stream);\
+          }\
+        }
+#else  /* !__NV_LEGACY_LAUNCH */
+#define __cudaLaunch(fun) \
+        { volatile static char *__f __NV_ATTR_UNUSED_FOR_LAUNCH;  __f = fun; \
+          static cudaKernel_t __handle = 0; \
+          volatile static bool __tmp __NV_ATTR_UNUSED_FOR_LAUNCH = (__cudaGetKernel(&__handle, (const void *)fun) == cudaSuccess); \
+          dim3 __gridDim, __blockDim;\
+          size_t __sharedMem; \
+          cudaStream_t __stream; \
+          if (__cudaPopCallConfiguration(&__gridDim, &__blockDim, &__sharedMem, &__stream) != cudaSuccess) \
+            return; \
+          if (__args_idx == 0) {\
+            (void)__cudaLaunchKernel_helper(__handle, __gridDim, __blockDim, &__args_arr[__args_idx], __sharedMem, __stream);\
+          } else { \
+            (void)__cudaLaunchKernel_helper(__handle, __gridDim, __blockDim, &__args_arr[0], __sharedMem, __stream);\
+          }\
+        }
+#endif  /* __NV_LEGACY_LAUNCH */
+
+#if defined(__GNUC__)
+#define __nv_dummy_param_ref(param) \
+        { volatile static void **__ref __attribute__((unused)); __ref = (volatile void **)param; }
+#else /* __GNUC__ */
+#define __nv_dummy_param_ref(param) \
+        { volatile static void **__ref; __ref = (volatile void **)param; }
+#endif /* __GNUC__ */
+
+static void ____nv_dummy_param_ref(void *param) __nv_dummy_param_ref(param)
+
+#define __REGISTERFUNCNAME_CORE(X) __cudaRegisterLinkedBinary##X
+#define __REGISTERFUNCNAME(X) __REGISTERFUNCNAME_CORE(X)
+
+extern "C" {
+void __REGISTERFUNCNAME( __NV_MODULE_ID ) ( void (*)(void **), void *, void *, void (*)(void *));
+}
+
+#define __TO_STRING_CORE(X) #X
+#define __TO_STRING(X) __TO_STRING_CORE(X)
+
+extern "C" {
+#if defined(_WIN32)
+#pragma data_seg("__nv_module_id")
+  static const __declspec(allocate("__nv_module_id")) unsigned char __module_id_str[] = __TO_STRING(__NV_MODULE_ID);
+#pragma data_seg()
+#elif defined(__APPLE__)
+  static const unsigned char __module_id_str[] __attribute__((section ("__NV_CUDA,__nv_module_id"))) = __TO_STRING(__NV_MODULE_ID);
+#else
+  static const unsigned char __module_id_str[] __attribute__((section ("__nv_module_id"))) = __TO_STRING(__NV_MODULE_ID);
+#endif
+
+#undef __FATIDNAME_CORE
+#undef __FATIDNAME
+#define __FATIDNAME_CORE(X) __fatbinwrap##X
+#define __FATIDNAME(X) __FATIDNAME_CORE(X)
+
+#define  ____cudaRegisterLinkedBinary(X) \
+{ __REGISTERFUNCNAME(__NV_MODULE_ID) (( void (*)(void **))(X), (void *)&__FATIDNAME(__NV_MODULE_ID), (void *)&__module_id_str, (void (*)(void *))&____nv_dummy_param_ref); }
+
+}
+
+extern "C" {
+extern void** CUDARTAPI __cudaRegisterFatBinary(
+  void *fatCubin
+);
+
+extern void CUDARTAPI __cudaRegisterFatBinaryEnd(
+  void **fatCubinHandle
+);
+
+extern void CUDARTAPI __cudaUnregisterFatBinary(
+  void **fatCubinHandle
+);
+
+extern void CUDARTAPI __cudaRegisterVar(
+        void **fatCubinHandle,
+        char  *hostVar,
+        char  *deviceAddress,
+  const char  *deviceName,
+        int    ext,
+        size_t size,
+        int    constant,
+        int    global
+);
+
+extern void CUDARTAPI __cudaRegisterManagedVar(
+        void **fatCubinHandle,
+        void **hostVarPtrAddress,
+        char  *deviceAddress,
+  const char  *deviceName,
+        int    ext,
+        size_t size,
+        int    constant,
+        int    global
+);
+
+extern char CUDARTAPI __cudaInitModule(
+        void **fatCubinHandle
+);
+
+extern void CUDARTAPI __cudaRegisterTexture(
+        void                    **fatCubinHandle,
+  const struct textureReference  *hostVar,
+  const void                    **deviceAddress,
+  const char                     *deviceName,
+        int                       dim,       
+        int                       norm,      
+        int                        ext        
+);
+
+extern void CUDARTAPI __cudaRegisterSurface(
+        void                    **fatCubinHandle,
+  const struct surfaceReference  *hostVar,
+  const void                    **deviceAddress,
+  const char                     *deviceName,
+        int                       dim,       
+        int                       ext        
+);
+
+extern void CUDARTAPI __cudaRegisterFunction(
+        void   **fatCubinHandle,
+  const char    *hostFun,
+        char    *deviceFun,
+  const char    *deviceName,
+        int      thread_limit,
+        uint3   *tid,
+        uint3   *bid,
+        dim3    *bDim,
+        dim3    *gDim,
+        int     *wSize
+);
+
+#if defined(__APPLE__)
+extern "C" int atexit(void (*)(void));
+
+#elif  defined(__GNUC__) && !defined(__ANDROID__) && !defined(__HORIZON__)
+extern int atexit(void(*)(void)) throw();
+
+#elif defined(__HORIZON__)
+
+// __TEMP_WAR__ 200132570 HOS : Disable atexit call until it works
+#define atexit(p)
+
+#else /* __GNUC__ && !__ANDROID__ */
+extern int __cdecl atexit(void(__cdecl *)(void));
+#endif
+
+}
+
+static void **__cudaFatCubinHandle;
+
+static void __cdecl __cudaUnregisterBinaryUtil(void)
+{   /* Unregisters the fat binary held in __cudaFatCubinHandle; the __cudaRegisterBinary macro passes this function to atexit. */
+  ____nv_dummy_param_ref((void *)&__cudaFatCubinHandle); /* stores the handle's address in a volatile static (see __nv_dummy_param_ref) */
+  __cudaUnregisterFatBinary(__cudaFatCubinHandle);
+}
+
+static char __nv_init_managed_rt_with_module(void **handle)
+{   /* Thin wrapper: forwards the fat-binary handle to __cudaInitModule and returns its result unchanged. */
+  return __cudaInitModule(handle);
+}
+
+#include "common_functions.h"
+
+#pragma pack()
+
+#if defined(_WIN32)
+
+#pragma warning(disable: 4099)
+
+#if !defined(_WIN64)
+
+#pragma warning(disable: 4408)
+
+#endif /* !_WIN64 */
+
+#endif /* _WIN32 */
+
+#endif /* !__CUDA_INTERNAL_COMPILATION__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_RUNTIME_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_RUNTIME_H__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/math_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/math_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..d8201f97efb3aed940f62360d90899a5171eeb0d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/math_functions.h
@@ -0,0 +1,6257 @@
+/*
+ * Copyright 1993-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/math_functions.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/math_functions.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H__
+#endif
+
+#if !defined(__MATH_FUNCTIONS_H__)
+#define __MATH_FUNCTIONS_H__
+
+#if defined(__QNX__) && (__GNUC__ >= 5) && defined(__CUDACC__)
+#if __has_include(<__config>)
+#include <__config>
+#endif
+#endif
+
+/**
+ * \defgroup CUDA_MATH Mathematical Functions
+ *
+ * CUDA mathematical functions are always available in device code.
+ *
+ * Host implementations of the common mathematical functions are mapped
+ * in a platform-specific way to standard math library functions, provided
+ * by the host compiler and respective host libm where available.
+ * Some functions, not available with the host compilers, are implemented
+ * in crt/math_functions.hpp header file.
+ * For example, see ::erfinv(). Other, less common functions,
+ * like ::rhypot(), ::cyl_bessel_i0() are only available in device code.
+ *
+ * CUDA Math device functions are no-throw for well-formed CUDA programs.
+ *
+ * Note that many floating-point and integer function names are
+ * overloaded for different argument types. For example, the ::log()
+ * function has the following prototypes:
+ * \code
+ * double log(double x);
+ * float log(float x);
+ * float logf(float x);
+ * \endcode
+ *
+ * Note also that due to implementation constraints, certain math functions
+ * from std:: namespace may be callable in device code even via explicitly
+ * qualified std:: names. However, such use is discouraged, since this
+ * capability is unsupported, unverified, undocumented, not portable, and
+ * may change without notice.
+ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "host_defines.h"
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+extern "C"
+{
+
+/**
+ * @{
+ */
+
+/* Define math function DOXYGEN toplevel groups, functions will
+   be added to these groups later.
+*/
+/**
+ * \defgroup CUDA_MATH_SINGLE Single Precision Mathematical Functions
+ * This section describes single precision mathematical functions.
+ * To use these functions, you do not need to include any additional 
+ * header file in your program.
+ */
+
+/**
+ * \defgroup CUDA_MATH_DOUBLE Double Precision Mathematical Functions
+ * This section describes double precision mathematical functions.
+ * To use these functions, you do not need to include any additional 
+ * header file in your program.
+ */
+
+/**
+ * \defgroup CUDA_MATH_INT Integer Mathematical Functions
+ * This section describes integer mathematical functions.
+ * To use these functions, you do not need to include any additional
+ * header file in your program.
+ */
+
+/**
+ * \defgroup CUDA_MATH_INTRINSIC_SINGLE Single Precision Intrinsics
+ * This section describes single precision intrinsic functions that are
+ * only supported in device code.
+ * To use these functions, you do not need to include any additional 
+ * header file in your program.
+ */
+
+/**
+ * \defgroup CUDA_MATH_INTRINSIC_DOUBLE Double Precision Intrinsics
+ * This section describes double precision intrinsic functions that are
+ * only supported in device code.
+ * To use these functions, you do not need to include any additional 
+ * header file in your program.
+ */
+
+/**
+ * \defgroup CUDA_MATH_INTRINSIC_INT Integer Intrinsics
+ * This section describes integer intrinsic functions. All of these
+ * functions are supported in device code. For some of the functions,
+ * host-specific implementations are also provided. For example, 
+ * see `::__nv_bswap16()`.
+ * To use these functions, you do not need to include any additional 
+ * header file in your program.
+ */
+
+/**
+ * \defgroup CUDA_MATH_INTRINSIC_CAST Type Casting Intrinsics
+ * This section describes type casting intrinsic functions that are
+ * only supported in device code.
+ * To use these functions, you do not need to include any additional 
+ * header file in your program.
+ */
+
+/**
+ *
+ * \defgroup CUDA_MATH_INTRINSIC_SIMD SIMD Intrinsics
+ * This section describes SIMD intrinsic functions that are
+ * only supported in device code.
+ * To use these functions, you do not need to include any additional 
+ * header file in your program.
+ */
+
+
+/**
+ * @}
+ */
+#define __DEVICE_FUNCTIONS_DECL__ __host__ __device__
+#if !defined(_MSC_VER)
+#define __CUDA_MATH_CRTIMP
+#else
+#if _MSC_VER < 1900
+#define __CUDA_MATH_CRTIMP _CRTIMP
+#else
+#define __CUDA_MATH_CRTIMP _ACRTIMP
+#endif
+#endif
+
+#if defined(__ANDROID__) && (__ANDROID_API__ <= 20) && !defined(__aarch64__)
+static __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ int                    abs(int);
+static __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ long int               labs(long int);
+static __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ long long int          llabs(long long int);
+#else /* __ANDROID__ */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the absolute value of the input \p int argument.
+ *
+ * Calculate the absolute value of the input argument \p a.
+ *
+ * \return
+ * Returns the absolute value of the input argument.
+ * - abs(\p INT_MIN) is \p Undefined
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ int            __cdecl abs(int a) __THROW;
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the absolute value of the input \p long \p int argument.
+ *
+ * Calculate the absolute value of the input argument \p a.
+ *
+ * \return
+ * Returns the absolute value of the input argument.
+ * - labs(\p LONG_MIN) is \p Undefined
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ long int       __cdecl labs(long int a) __THROW;
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the absolute value of the input \p long \p long \p int argument.
+ *
+ * Calculate the absolute value of the input argument \p a.
+ *
+ * \return
+ * Returns the absolute value of the input argument.
+ * - llabs(\p LLONG_MIN) is \p Undefined
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ long long int          llabs(long long int a) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+}
+#endif
+#endif /* __ANDROID__ */
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+/* put all math functions in std */
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the absolute value of the input argument.
+ *
+ * Calculate the absolute value of the input argument \p x.
+ *
+ * \return
+ * Returns the absolute value of the input argument.
+ * - fabs(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - fabs(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns +0.
+ * - fabs(NaN) returns an unspecified NaN.
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl fabs(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the absolute value of its argument
+ *
+ * Calculate the absolute value of the input argument \p x.
+ *
+ * \return
+ * Returns the absolute value of its argument.
+ * - fabsf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - fabsf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns +0.
+ * - fabsf(NaN) returns an unspecified NaN.
+ *
+ * \note_accuracy_single
+ */
+#if defined(_WIN32) && defined(_M_ARM64)
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float  __cdecl    fabsf(float x) __THROW;
+#else
+extern                    __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float             fabsf(float x) __THROW;
+#endif
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    min(const int a, const int b);
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           umin(const unsigned int a, const unsigned int b);
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p long \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          llmin(const long long int a, const long long int b);
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p long \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int ullmin(const unsigned long long int a, const unsigned long long int b);
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Determine the minimum numeric value of the arguments.
+ *
+ * Determines the minimum numeric value of the arguments \p x and \p y. Treats NaN 
+ * arguments as missing data. If one argument is a NaN and the other is a legitimate numeric
+ * value, the numeric value is chosen.
+ *
+ * \return
+ * Returns the minimum numeric value of the arguments \p x and \p y.
+ * - If both arguments are NaN, returns NaN.
+ * - If one argument is NaN, returns the numeric argument.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  fminf(float x, float y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl fminf(float x, float y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Determine the minimum numeric value of the arguments.
+ *
+ * Determines the minimum numeric value of the arguments \p x and \p y. Treats NaN 
+ * arguments as missing data. If one argument is a NaN and the other is a legitimate numeric
+ * value, the numeric value is chosen.
+ *
+ * \return
+ * Returns the minimum numeric value of the arguments \p x and \p y.
+ * - If both arguments are NaN, returns NaN.
+ * - If one argument is NaN, returns the numeric argument.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 fmin(double x, double y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl fmin(double x, double y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    max(const int a, const int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           umax(const unsigned int a, const unsigned int b);
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p long \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          llmax(const long long int a, const long long int b);
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p long \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int ullmax(const unsigned long long int a, const unsigned long long int b);
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Determine the maximum numeric value of the arguments.
+ *
+ * Determines the maximum numeric value of the arguments \p x and \p y. Treats NaN 
+ * arguments as missing data. If one argument is a NaN and the other is a legitimate numeric
+ * value, the numeric value is chosen.
+ *
+ * \return
+ * Returns the maximum numeric value of the arguments \p x and \p y.
+ * - If both arguments are NaN, returns NaN.
+ * - If one argument is NaN, returns the numeric argument.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  fmaxf(float x, float y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl fmaxf(float x, float y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Determine the maximum numeric value of the arguments.
+ *
+ * Determines the maximum numeric value of the arguments \p x and \p y. Treats NaN 
+ * arguments as missing data. If one argument is a NaN and the other is a legitimate numeric
+ * value, the numeric value is chosen.
+ *
+ * \return
+ * Returns the maximum numeric value of the arguments \p x and \p y.
+ * - If both arguments are NaN, returns NaN.
+ * - If one argument is NaN, returns the numeric argument.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 fmax(double x, double y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl fmax(double x, double y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the sine of the input argument.
+ *
+ * Calculate the sine of the input argument \p x (measured in radians).
+ *
+ * \return 
+ * - sin(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - sin(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - sin(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl sin(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the cosine of the input argument.
+ *
+ * Calculate the cosine of the input argument \p x (measured in radians).
+ *
+ * \return 
+ * - cos(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - cos(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - cos(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl cos(double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the sine and cosine of the first input argument.
+ *
+ * Calculate the sine and cosine of the first input argument \p x (measured 
+ * in radians). The results for sine and cosine are written into the
+ * second argument, \p sptr, and, respectively, third argument, \p cptr.
+ *
+ * \see ::sin() and ::cos().
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   sincos(double x, double *sptr, double *cptr) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the sine and cosine of the first input argument.
+ *
+ * Calculate the sine and cosine of the first input argument \p x (measured
+ * in radians). The results for sine and cosine are written into the second 
+ * argument, \p sptr, and, respectively, third argument, \p cptr.
+ *
+ * \see ::sinf() and ::cosf().
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   sincosf(float x, float *sptr, float *cptr) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the tangent of the input argument.
+ *
+ * Calculate the tangent of the input argument \p x (measured in radians).
+ *
+ * \return 
+ * - tan(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - tan(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - tan(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl tan(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the square root of the input argument.
+ *
+ * Calculate the nonnegative square root of \p x, 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula.
+ *
+ * \return 
+ * Returns 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula.
+ * - sqrt(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - sqrt(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - sqrt(\p x) returns NaN if \p x is less than 0.
+ * - sqrt(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl sqrt(double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the reciprocal of the square root of the input argument.
+ *
+ * Calculate the reciprocal of the nonnegative square root of \p x, 
+ * \cuda_math_formula 1/\sqrt{x} \end_cuda_math_formula.
+ *
+ * \return 
+ * Returns 
+ * \cuda_math_formula 1/\sqrt{x} \end_cuda_math_formula.
+ * - rsqrt(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - rsqrt(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - rsqrt(\p x) returns NaN if \p x is less than 0.
+ * - rsqrt(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 rsqrt(double x);
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the reciprocal of the square root of the input argument.
+ *
+ * Calculate the reciprocal of the nonnegative square root of \p x, 
+ * \cuda_math_formula 1/\sqrt{x} \end_cuda_math_formula.
+ *
+ * \return 
+ * Returns 
+ * \cuda_math_formula 1/\sqrt{x} \end_cuda_math_formula.
+ * - rsqrtf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - rsqrtf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - rsqrtf(\p x) returns NaN if \p x is less than 0.
+ * - rsqrtf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  rsqrtf(float x);
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the base 2 logarithm of the input argument.
+ *
+ * Calculate the base 2 logarithm of the input argument \p x.
+ *
+ * \return 
+ * - log2(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - log2(1) returns +0.
+ * - log2(\p x) returns NaN for \p x < 0.
+ * - log2(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - log2(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 log2(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl log2(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the base 2 exponential of the input argument.
+ * 
+ * Calculate
+ * \cuda_math_formula 2^x \end_cuda_math_formula
+ * ,
+ * the base 2 exponential of the input argument \p x.
+ *
+ * \return
+ * - exp2(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - exp2(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - exp2(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - exp2(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 exp2(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl exp2(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the base 2 exponential of the input argument.
+ *
+ * Calculate
+ * \cuda_math_formula 2^x \end_cuda_math_formula
+ * ,
+ * the base 2 exponential of the input argument \p x.
+ *
+ * \return
+ * - exp2f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - exp2f(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - exp2f(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - exp2f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  exp2f(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl exp2f(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the base 10 exponential of the input argument.
+ *
+ * Calculate
+ * \cuda_math_formula 10^x \end_cuda_math_formula
+ * ,
+ * the base 10 exponential of the input argument \p x.
+ *
+ * \return
+ * - exp10(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - exp10(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - exp10(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - exp10(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */         
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 exp10(double x) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the base 10 exponential of the input argument.
+ *
+ * Calculate
+ * \cuda_math_formula 10^x \end_cuda_math_formula
+ * ,
+ * the base 10 exponential of the input argument \p x.
+ *
+ * \return
+ * - exp10f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - exp10f(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - exp10f(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - exp10f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  exp10f(float x) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  exponential of the input argument, minus 1.
+ *
+ * Calculate
+ * \cuda_math_formula e^x \end_cuda_math_formula
+ * -1, the base
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  exponential of the input argument \p x, minus 1.
+ *
+ * \return
+ * - expm1(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - expm1(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns -1.
+ * - expm1(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - expm1(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 expm1(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl expm1(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  exponential of the input argument, minus 1.
+ *
+ * Calculate
+ * \cuda_math_formula e^x \end_cuda_math_formula
+ * -1, the base
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  exponential of the input argument \p x, minus 1.
+ *
+ * \return
+ * - expm1f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - expm1f(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns -1.
+ * - expm1f(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - expm1f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  expm1f(float x) __THROW;        
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl expm1f(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the base 2 logarithm of the input argument.
+ *
+ * Calculate the base 2 logarithm of the input argument \p x.
+ *
+ * \return
+ * - log2f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - log2f(1) returns +0.
+ * - log2f(\p x) returns NaN for \p x < 0.
+ * - log2f(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - log2f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  log2f(float x) __THROW;         
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl log2f(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the base 10 logarithm of the input argument.
+ *
+ * Calculate the base 10 logarithm of the input argument \p x.
+ *
+ * \return
+ * - log10(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - log10(1) returns +0.
+ * - log10(\p x) returns NaN for \p x < 0.
+ * - log10(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - log10(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl log10(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  logarithm of the input argument.
+ *
+ * Calculate the base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  logarithm of the input argument \p x.
+ *
+ * \return
+ * - log(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - log(1) returns +0.
+ * - log(\p x) returns NaN for \p x < 0.
+ * - log(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - log(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl log(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of 
+ * \cuda_math_formula \log_{e}(1+x) \end_cuda_math_formula.
+ *
+ * Calculate the value of 
+ * \cuda_math_formula \log_{e}(1+x) \end_cuda_math_formula
+ * of the input argument \p x.
+ *
+ * \return
+ * - log1p(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - log1p(-1) returns
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - log1p(\p x) returns NaN for \p x < -1.
+ * - log1p(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - log1p(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 log1p(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl log1p(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of 
+ * \cuda_math_formula \log_{e}(1+x) \end_cuda_math_formula.
+ *
+ * Calculate the value of 
+ * \cuda_math_formula \log_{e}(1+x) \end_cuda_math_formula
+ * of the input argument \p x.
+ *
+ * \return
+ * - log1pf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - log1pf(-1) returns
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - log1pf(\p x) returns NaN for \p x < -1.
+ * - log1pf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - log1pf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  log1pf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl log1pf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the largest integer less than or equal to \p x.
+ * 
+ * Calculates the largest integer value which is less than or equal to \p x.
+ * 
+ * \return
+ * Returns 
+ * \cuda_math_formula \lfloor x \rfloor \end_cuda_math_formula
+ *  expressed as a floating-point number.
+ * - floor(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - floor(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - floor(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl floor(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  exponential of the input argument.
+ *
+ * Calculate
+ * \cuda_math_formula e^x \end_cuda_math_formula
+ * ,
+ * the base
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  exponential of the input argument \p x.
+ *
+ * \return
+ * - exp(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - exp(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - exp(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - exp(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl exp(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the hyperbolic cosine of the input argument.
+ *
+ * Calculate the hyperbolic cosine of the input argument \p x.
+ *
+ * \return
+ * - cosh(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - cosh(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - cosh(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl cosh(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the hyperbolic sine of the input argument.
+ *
+ * Calculate the hyperbolic sine of the input argument \p x.
+ *
+ * \return
+ * - sinh(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - sinh(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - sinh(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl sinh(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the hyperbolic tangent of the input argument.
+ *
+ * Calculate the hyperbolic tangent of the input argument \p x.
+ *
+ * \return
+ * - tanh(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - tanh( 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 1 \end_cuda_math_formula.
+ * - tanh(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl tanh(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the nonnegative inverse hyperbolic cosine of the input argument.
+ *
+ * Calculate the nonnegative inverse hyperbolic cosine of the input argument \p x.
+ *
+ * \return 
+ * Result will be in the interval [0, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ].
+ * - acosh(1) returns 0.
+ * - acosh(\p x) returns NaN for \p x in the interval [
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , 1).
+ * - acosh( 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - acosh(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 acosh(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl acosh(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the nonnegative inverse hyperbolic cosine of the input argument.
+ *
+ * Calculate the nonnegative inverse hyperbolic cosine of the input argument \p x.
+ *
+ * \return 
+ * Result will be in the interval [0, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ].
+ * - acoshf(1) returns 0.
+ * - acoshf(\p x) returns NaN for \p x in the interval [
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , 1).
+ * - acoshf( 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - acoshf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  acoshf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl acoshf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the inverse hyperbolic sine of the input argument.
+ *
+ * Calculate the inverse hyperbolic sine of the input argument \p x.
+ *
+ * \return
+ * - asinh(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - asinh(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula. 
+ * - asinh(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 asinh(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl asinh(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the inverse hyperbolic sine of the input argument.
+ *
+ * Calculate the inverse hyperbolic sine of the input argument \p x.
+ *
+ * \return 
+ * - asinhf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula. 
+ * - asinhf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - asinhf(NaN) returns NaN.
+ * 
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  asinhf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl asinhf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the inverse hyperbolic tangent of the input argument.
+ *
+ * Calculate the inverse hyperbolic tangent of the input argument \p x.
+ *
+ * \return 
+ * - atanh(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - atanh(
+ * \cuda_math_formula \pm 1 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - atanh(\p x) returns NaN for \p x outside interval [-1, 1].
+ * - atanh(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 atanh(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl atanh(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the inverse hyperbolic tangent of the input argument.
+ *
+ * Calculate the inverse hyperbolic tangent of the input argument \p x.
+ *
+ * \return 
+ * - atanhf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - atanhf(
+ * \cuda_math_formula \pm 1 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - atanhf(\p x) returns NaN for \p x outside interval [-1, 1].
+ * - atanhf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  atanhf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl atanhf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of 
+ * \cuda_math_formula x\cdot 2^{exp} \end_cuda_math_formula.
+ *
+ * Calculate the value of 
+ * \cuda_math_formula x\cdot 2^{exp} \end_cuda_math_formula
+ *  of the input arguments \p x and \p exp.
+ *
+ * \return 
+ * - ldexp(\p x, \p exp) is equivalent to scalbn(\p x, \p exp).
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl ldexp(double x, int exp) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of 
+ * \cuda_math_formula x\cdot 2^{exp} \end_cuda_math_formula.
+ *
+ * Calculate the value of 
+ * \cuda_math_formula x\cdot 2^{exp} \end_cuda_math_formula
+ *  of the input arguments \p x and \p exp.
+ *
+ * \return 
+ * - ldexpf(\p x, \p exp) is equivalent to scalbnf(\p x, \p exp).
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  ldexpf(float x, int exp) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the floating-point representation of the exponent of the input argument.
+ *
+ * Calculate the floating-point representation of the exponent of the input argument \p x.
+ *
+ * \return 
+ * - logb(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - logb(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - logb(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 logb(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl logb(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the floating-point representation of the exponent of the input argument.
+ *
+ * Calculate the floating-point representation of the exponent of the input argument \p x.
+ *
+ * \return 
+ * - logbf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - logbf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - logbf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  logbf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl logbf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Compute the unbiased integer exponent of the argument.
+ *
+ * Calculates the unbiased integer exponent of the input argument \p x.
+ *
+ * \return
+ * - If successful, returns the unbiased exponent of the argument.
+ * - ilogb(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns <tt>INT_MIN</tt>.
+ * - ilogb(NaN) returns <tt>INT_MIN</tt>.
+ * - ilogb(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns <tt>INT_MAX</tt>.
+ * - Note: above behavior does not take into account <tt>FP_ILOGB0</tt> nor <tt>FP_ILOGBNAN</tt>.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    ilogb(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP int    __cdecl ilogb(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Compute the unbiased integer exponent of the argument.
+ *
+ * Calculates the unbiased integer exponent of the input argument \p x.
+ *
+ * \return
+ * - If successful, returns the unbiased exponent of the argument.
+ * - ilogbf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns <tt>INT_MIN</tt>.
+ * - ilogbf(NaN) returns <tt>INT_MIN</tt>.
+ * - ilogbf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns <tt>INT_MAX</tt>.
+ * - Note: above behavior does not take into account <tt>FP_ILOGB0</tt> nor <tt>FP_ILOGBNAN</tt>.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    ilogbf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP int    __cdecl ilogbf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Scale floating-point input by integer power of two.
+ *
+ * Scale \p x by 
+ * \cuda_math_formula 2^n \end_cuda_math_formula
+ *  by efficient manipulation of the floating-point
+ * exponent.
+ *
+ * \return 
+ * Returns \p x * 
+ * \cuda_math_formula 2^n \end_cuda_math_formula.
+ * - scalbn(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p n) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - scalbn(\p x, 0) returns \p x.
+ * - scalbn(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p n) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - scalbn(NaN, \p n) returns NaN.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 scalbn(double x, int n) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl scalbn(double x, int n);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Scale floating-point input by integer power of two.
+ *
+ * Scale \p x by 
+ * \cuda_math_formula 2^n \end_cuda_math_formula
+ *  by efficient manipulation of the floating-point
+ * exponent.
+ *
+ * \return 
+ * Returns \p x * 
+ * \cuda_math_formula 2^n \end_cuda_math_formula.
+ * - scalbnf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p n) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - scalbnf(\p x, 0) returns \p x.
+ * - scalbnf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p n) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - scalbnf(NaN, \p n) returns NaN.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  scalbnf(float x, int n) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl scalbnf(float x, int n);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Scale floating-point input by integer power of two.
+ *
+ * Scale \p x by 
+ * \cuda_math_formula 2^n \end_cuda_math_formula
+ *  by efficient manipulation of the floating-point
+ * exponent.
+ *
+ * \return 
+ * Returns \p x * 
+ * \cuda_math_formula 2^n \end_cuda_math_formula.
+ * - scalbln(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p n) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - scalbln(\p x, 0) returns \p x.
+ * - scalbln(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p n) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - scalbln(NaN, \p n) returns NaN.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 scalbln(double x, long int n) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl scalbln(double x, long int n);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Scale floating-point input by integer power of two.
+ *
+ * Scale \p x by 
+ * \cuda_math_formula 2^n \end_cuda_math_formula
+ *  by efficient manipulation of the floating-point
+ * exponent.
+ *
+ * \return 
+ * Returns \p x * 
+ * \cuda_math_formula 2^n \end_cuda_math_formula.
+ * - scalblnf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p n) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - scalblnf(\p x, 0) returns \p x.
+ * - scalblnf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p n) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - scalblnf(NaN, \p n) returns NaN.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  scalblnf(float x, long int n) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl scalblnf(float x, long int n);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Extract mantissa and exponent of a floating-point value
+ * 
+ * Decompose the floating-point value \p x into a component \p m for the 
+ * normalized fraction element and another term \p n for the exponent.
+ * The absolute value of \p m will be greater than or equal to  0.5 and 
+ * less than 1.0 or it will be equal to 0; 
+ * \cuda_math_formula x = m\cdot 2^n \end_cuda_math_formula.
+ * The integer exponent \p n will be stored in the location to which \p nptr points.
+ *
+ * \return
+ * Returns the fractional component \p m.
+ * - frexp(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p nptr) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  and stores zero in the location pointed to by \p nptr.
+ * - frexp(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p nptr) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ *  and stores an unspecified value in the 
+ * location to which \p nptr points.
+ * - frexp(NaN, \p nptr) returns a NaN and stores an unspecified value in the location to which \p nptr points.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl frexp(double x, int *nptr) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Extract mantissa and exponent of a floating-point value
+ * 
+ * Decomposes the floating-point value \p x into a component \p m for the 
+ * normalized fraction element and another term \p n for the exponent.
+ * The absolute value of \p m will be greater than or equal to  0.5 and 
+ * less than 1.0 or it will be equal to 0; 
+ * \cuda_math_formula x = m\cdot 2^n \end_cuda_math_formula.
+ * The integer exponent \p n will be stored in the location to which \p nptr points.
+ *
+ * \return
+ * Returns the fractional component \p m.
+ * - frexpf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p nptr) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  and stores zero in the location pointed to by \p nptr.
+ * - frexpf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p nptr) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ *  and stores an unspecified value in the 
+ * location to which \p nptr points.
+ * - frexpf(NaN, \p nptr) returns a NaN and stores an unspecified value in the location to which \p nptr points.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  frexpf(float x, int *nptr) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Round to nearest integer value in floating-point.
+ *
+ * Round \p x to the nearest integer value in floating-point format,
+ * with halfway cases rounded away from zero.
+ *
+ * \return 
+ * Returns rounded integer value.
+ * - round(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - round(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - round(NaN) returns NaN.
+ *
+ * \note_slow_round See ::rint().
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 round(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl round(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Round to nearest integer value in floating-point.
+ *
+ * Round \p x to the nearest integer value in floating-point format,
+ * with halfway cases rounded away from zero.
+ *
+ * \return
+ * Returns rounded integer value.
+ * - roundf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - roundf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - roundf(NaN) returns NaN.
+ *
+ * \note_slow_round See ::rintf().
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  roundf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl roundf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Round to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, with halfway cases rounded 
+ * away from zero.  If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ *
+ * \note_slow_round See ::lrint().
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long int               lround(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long int __cdecl lround(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Round to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, with halfway cases rounded 
+ * away from zero.  If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ *
+ * \note_slow_round See ::lrintf().
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long int               lroundf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long int __cdecl lroundf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Round to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, with halfway cases rounded 
+ * away from zero.  If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ *
+ * \note_slow_round See ::llrint().
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          llround(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long long int __cdecl llround(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Round to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, with halfway cases rounded 
+ * away from zero.  If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ *
+ * \note_slow_round See ::llrintf().
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          llroundf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long long int __cdecl llroundf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Round to nearest integer value in floating-point.
+ *
+ * Round \p x to the nearest integer value in floating-point format,
+ * with halfway cases rounded to the nearest even integer value.
+ *
+ * \return 
+ * Returns rounded integer value.
+ * - rint(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - rint(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - rint(NaN) returns NaN.
+ */
+#if defined(__CUDA_ARCH__) || defined(__DOXYGEN_ONLY__)
+/*
+ * We don't generate the declaration of rint for host compilation.
+ * This is actually a workaround to compile the boost header file when
+ * Clang 3.8 is used as the host compiler. The boost header file has
+ * the following example code:
+ *   namespace NS { extern "C" { double rint(double); }
+ *   }
+ *
+ * After preprocessing, we get something like below:
+ *
+ * extern "C" { double rint(double x) throw(); }
+ * # 30 "/usr/include/math.h" 3
+ * extern "C" { double rint(double x) throw(); }
+ * namespace NS { extern "C" { double rint(double); } }
+ *
+ * Although GCC accepts this output, Clang 3.8 doesn't.
+ * Furthermore, we cannot change the boost header file by adding "throw()"
+ * to rint's declaration there. So, as a workaround, we just don't generate
+ * our re-declaration for the host compilation.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 rint(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl rint(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#endif /* __CUDA_ARCH__ || __DOXYGEN_ONLY__ */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Round input to nearest integer value in floating-point.
+ *
+ * Round \p x to the nearest integer value in floating-point format,
+ * with halfway cases rounded to the nearest even integer value.
+ *
+ * \return 
+ * Returns rounded integer value.
+ * - rintf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - rintf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - rintf(NaN) returns NaN.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  rintf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl rintf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Round input to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, 
+ * with halfway cases rounded to the nearest even integer value.
+ * If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long int               lrint(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long int __cdecl lrint(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Round input to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, 
+ * with halfway cases rounded to the nearest even integer value.
+ * If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long int               lrintf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long int __cdecl lrintf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Round input to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, 
+ * with halfway cases rounded to the nearest even integer value.
+ * If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          llrint(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long long int __cdecl llrint(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Round input to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, 
+ * with halfway cases rounded to the nearest even integer value.
+ * If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          llrintf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long long int __cdecl llrintf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Round the input argument to the nearest integer.
+ *
+ * Round argument \p x to an integer value in double precision floating-point format. Uses round to nearest rounding, with ties rounding to even.
+ *
+ * \return 
+ * - nearbyint(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - nearbyint(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - nearbyint(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 nearbyint(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl nearbyint(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Round the input argument to the nearest integer.
+ *
+ * Round argument \p x to an integer value in single precision floating-point format. Uses round to nearest rounding, with ties rounding to even.
+ *
+ * \return 
+ * - nearbyintf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - nearbyintf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - nearbyintf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  nearbyintf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl nearbyintf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate ceiling of the input argument.
+ *
+ * Compute the smallest integer value not less than \p x.
+ *
+ * \return
+ * Returns 
+ * \cuda_math_formula \lceil x \rceil \end_cuda_math_formula
+ * expressed as a floating-point number.
+ * - ceil(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - ceil(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - ceil(NaN) returns NaN.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl ceil(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Truncate input argument to the integral part.
+ *
+ * Round \p x to the nearest integer value that does not exceed \p x in 
+ * magnitude.
+ *
+ * \return 
+ * Returns truncated integer value.
+ * - trunc(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - trunc(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - trunc(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 trunc(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl trunc(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Truncate input argument to the integral part.
+ *
+ * Round \p x to the nearest integer value that does not exceed \p x in 
+ * magnitude.
+ *
+ * \return 
+ * Returns truncated integer value.
+ * - truncf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - truncf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - truncf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  truncf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl truncf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Compute the positive difference between \p x and \p y.
+ *
+ * Compute the positive difference between \p x and \p y.  The positive
+ * difference is \p x - \p y when \p x > \p y and +0 otherwise.
+ *
+ * \return 
+ * Returns the positive difference between \p x and \p y.
+ * - fdim(\p x, \p y) returns \p x - \p y if \p x > \p y.
+ * - fdim(\p x, \p y) returns +0 if \p x 
+ * \cuda_math_formula \leq \end_cuda_math_formula
+ * \p y.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 fdim(double x, double y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl fdim(double x, double y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Compute the positive difference between \p x and \p y.
+ *
+ * Compute the positive difference between \p x and \p y.  The positive
+ * difference is \p x - \p y when \p x > \p y and +0 otherwise.
+ *
+ * \return 
+ * Returns the positive difference between \p x and \p y.
+ * - fdimf(\p x, \p y) returns \p x - \p y if \p x > \p y.
+ * - fdimf(\p x, \p y) returns +0 if \p x 
+ * \cuda_math_formula \leq \end_cuda_math_formula
+ * \p y.
+ * - If either argument is NaN, NaN is returned.
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  fdimf(float x, float y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl fdimf(float x, float y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the arc tangent of the ratio of first and second input arguments.
+ *
+ * Calculate the principal value of the arc tangent of the ratio of first
+ * and second input arguments \p y / \p x. The quadrant of the result is
+ * determined by the signs of inputs \p y and \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [-
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * , +
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * ].
+ * - atan2(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , -0) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula.
+ * - atan2(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , +0) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - atan2(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p x) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula
+ * for \p x < 0.
+ * - atan2(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p x) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * for \p x > 0.
+ * - atan2(\p y,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula -\pi \end_cuda_math_formula
+ * /2 for \p y < 0.
+ * - atan2(\p y,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * /2 for \p y > 0.
+ * - atan2(
+ * \cuda_math_formula \pm y \end_cuda_math_formula
+ * ,
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula
+ * for finite \p y > 0.
+ * - atan2(
+ * \cuda_math_formula \pm y \end_cuda_math_formula
+ * ,
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * for finite \p y > 0.
+ * - atan2(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p x) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula
+ * /2 for finite \p x.
+ * - atan2(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ,
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 3\pi \end_cuda_math_formula
+ * /4.
+ * - atan2(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ,
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula
+ * /4.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl atan2(double y, double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the arc tangent of the input argument.
+ *
+ * Calculate the principal value of the arc tangent of the input argument \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [-
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * /2, +
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * /2].
+ * - atan(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - atan(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula
+ * /2.
+ * - atan(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl atan(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the arc cosine of the input argument.
+ *
+ * Calculate the principal value of the arc cosine of the input argument \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [0, 
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * ] for \p x inside [-1, +1].
+ * - acos(1) returns +0.
+ * - acos(\p x) returns NaN for \p x outside [-1, +1].
+ * - acos(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl acos(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the arc sine of the input argument.
+ *
+ * Calculate the principal value of the arc sine of the input argument \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [-
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * /2, +
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * /2] for \p x inside [-1, +1].
+ * - asin(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - asin(\p x) returns NaN for \p x outside [-1, +1].
+ * - asin(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl asin(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the square root of the sum of squares of two arguments.
+ *
+ * Calculate the length of the hypotenuse of a right triangle whose two sides have lengths 
+ * \p x and \p y without undue overflow or underflow.
+ *
+ * \return Returns the length of the hypotenuse 
+ * \cuda_math_formula \sqrt{x^2+y^2} \end_cuda_math_formula. 
+ * - hypot(\p x,\p y), hypot(\p y,\p x), and hypot(\p x, \p -y) are equivalent.
+ * - hypot(\p x,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) is equivalent to fabs(\p x).
+ * - hypot(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ,\p y) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ,
+ * even if \p y is a NaN.
+ * - hypot(NaN, \p y) returns NaN, when \p y is not \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ *
+ * \note_accuracy_double
+ */
+#if defined(_WIN32)
+#if defined(_MSC_VER) && _MSC_VER < 1900
+static __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double __CRTDECL hypot(double x, double y);
+#else
+extern _ACRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double __cdecl hypot(double x, double y);
+#endif
+#else /* _WIN32 */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double           hypot(double x, double y) __THROW;
+#endif /* _WIN32 */
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate one over the square root of the sum of squares of two arguments.
+ *
+ * Calculate one over the length of the hypotenuse of a right triangle whose two sides have 
+ * lengths \p x and \p y without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the hypotenuse 
+ * \cuda_math_formula \frac{1}{\sqrt{x^2+y^2}} \end_cuda_math_formula. 
+ * - rhypot(\p x,\p y), rhypot(\p y,\p x), and rhypot(\p x, \p -y) are equivalent.
+ * - rhypot(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ,\p y) returns +0,
+ * even if \p y is a NaN.
+ * - rhypot(\cuda_math_formula \pm 0, \pm 0 \end_cuda_math_formula) returns \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - rhypot(NaN, \p y) returns NaN, when \p y is not \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double                rhypot(double x, double y) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the square root of the sum of squares of two arguments.
+ *
+ * Calculates the length of the hypotenuse of a right triangle whose two sides have lengths 
+ * \p x and \p y without undue overflow or underflow.
+ *
+ * \return Returns the length of the hypotenuse 
+ * \cuda_math_formula \sqrt{x^2+y^2} \end_cuda_math_formula. 
+ * - hypotf(\p x,\p y), hypotf(\p y,\p x), and hypotf(\p x, \p -y) are equivalent.
+ * - hypotf(\p x,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) is equivalent to fabsf(\p x).
+ * - hypotf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ,\p y) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ,
+ * even if \p y is a NaN.
+ * - hypotf(NaN, \p y) returns NaN, when \p y is not \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ *
+ * \note_accuracy_single
+ */
+#if defined(_WIN32)
+static __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __CRTDECL hypotf(float x, float y);
+#else /* _WIN32 */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float           hypotf(float x, float y) __THROW;
+#endif /* _WIN32 */
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate one over the square root of the sum of squares of two arguments.
+ *
+ * Calculates one over the length of the hypotenuse of a right triangle whose two sides have 
+ * lengths \p x and \p y without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the hypotenuse 
+ * \cuda_math_formula \frac{1}{\sqrt{x^2+y^2}} \end_cuda_math_formula. 
+ * - rhypotf(\p x,\p y), rhypotf(\p y,\p x), and rhypotf(\p x, \p -y) are equivalent.
+ * - rhypotf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ,\p y) returns +0,
+ * even if \p y is a NaN.
+ * - rhypotf(\cuda_math_formula \pm 0, \pm 0 \end_cuda_math_formula) returns \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - rhypotf(NaN, \p y) returns NaN, when \p y is not \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ *
+ * \note_accuracy_single
+ */
+extern __device__ __device_builtin__ float                 rhypotf(float x, float y) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the square root of the sum of squares of three coordinates of the argument.
+ *
+ * Calculate the length of three dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns the length of the 3D vector
+ * \cuda_math_formula \sqrt{a^2+b^2+c^2} \end_cuda_math_formula. 
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns +0, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl norm3d(double a, double b, double c) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate one over the square root of the sum of squares of three coordinates.
+ *
+ * Calculate one over the length of three dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the 3D vector 
+ * \cuda_math_formula \frac{1}{\sqrt{a^2+b^2+c^2}} \end_cuda_math_formula. 
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +0 \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns \cuda_math_formula +\infty \end_cuda_math_formula, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double                rnorm3d(double a, double b, double c) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the square root of the sum of squares of four coordinates of the argument.
+ *
+ * Calculate the length of four dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns the length of the 4D vector
+ * \cuda_math_formula \sqrt{a^2+b^2+c^2+d^2} \end_cuda_math_formula. 
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns +0, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl norm4d(double a, double b, double c, double d) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate one over the square root of the sum of squares of four coordinates.
+ *
+ * Calculate one over the length of four dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the 4D vector 
+ * \cuda_math_formula \frac{1}{\sqrt{a^2+b^2+c^2+d^2}} \end_cuda_math_formula. 
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +0 \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns \cuda_math_formula +\infty \end_cuda_math_formula, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double rnorm4d(double a, double b, double c, double d) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the square root of the sum of squares of any number of coordinates.
+ *
+ * Calculate the length of a vector \p p, dimension of which is passed as an argument, without undue overflow or underflow.
+ *
+ * \return Returns the length of the dim-D vector 
+ * \cuda_math_formula \sqrt{\sum_{i=0}^{dim-1} p_i^2} \end_cuda_math_formula.
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns +0, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_double
+ */
+__device__ __device_builtin__  double norm(int dim, double const * p) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the reciprocal of square root of the sum of squares of any number of coordinates.
+ *
+ * Calculates one over the length of vector \p p, dimension of which is passed as an argument, in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the vector
+ * \cuda_math_formula \frac{1}{\sqrt{\sum_{i=0}^{dim-1} p_i^2}} \end_cuda_math_formula. 
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +0 \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns \cuda_math_formula +\infty \end_cuda_math_formula, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double rnorm(int dim, double const * p) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the reciprocal of square root of the sum of squares of any number of coordinates.
+ *
+ * Calculates one over the length of vector \p p, dimension of which is passed as an argument, in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the vector
+ * \cuda_math_formula \frac{1}{\sqrt{\sum_{i=0}^{dim-1} p_i^2}} \end_cuda_math_formula. 
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +0 \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns \cuda_math_formula +\infty \end_cuda_math_formula, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_single
+ */
+
+extern __device__ __device_builtin__ float rnormf(int dim, float const * p) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the square root of the sum of squares of any number of coordinates.
+ *
+ * Calculates the length of a vector \p p, dimension of which is passed as an argument, without undue overflow or underflow.
+ *
+ * \return Returns the length of the dim-D vector 
+ * \cuda_math_formula \sqrt{\sum_{i=0}^{dim-1} p_i^2} \end_cuda_math_formula.
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns +0, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_single
+ */
+__device__ __device_builtin__  float normf(int dim, float const * p) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the square root of the sum of squares of three coordinates of the argument.
+ *
+ * Calculates the length of three dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns the length of the 3D vector 
+ * \cuda_math_formula \sqrt{a^2+b^2+c^2} \end_cuda_math_formula. 
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns +0, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_single
+ */
+
+extern __device__ __device_builtin__ float norm3df(float a, float b, float c) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate one over the square root of the sum of squares of three coordinates.
+ *
+ * Calculates one over the length of three dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the 3D vector
+ * \cuda_math_formula \frac{1}{\sqrt{a^2+b^2+c^2}} \end_cuda_math_formula. 
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +0 \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns \cuda_math_formula +\infty \end_cuda_math_formula, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_single
+ */
+extern __device__ __device_builtin__ float rnorm3df(float a, float b, float c) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the square root of the sum of squares of four coordinates of the argument.
+ *
+ * Calculates the length of four dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns the length of the 4D vector
+ * \cuda_math_formula \sqrt{a^2+b^2+c^2+d^2} \end_cuda_math_formula. 
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns +0, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_single
+ */
+extern __device__ __device_builtin__ float norm4df(float a, float b, float c, float d) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate one over the square root of the sum of squares of four coordinates.
+ *
+ * Calculates one over the length of four dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the 4D vector
+ * \cuda_math_formula \frac{1}{\sqrt{a^2+b^2+c^2+d^2}} \end_cuda_math_formula. 
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +0 \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns \cuda_math_formula +\infty \end_cuda_math_formula, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_single
+ */
+extern __device__ __device_builtin__ float rnorm4df(float a, float b, float c, float d) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the cube root of the input argument.
+ *
+ * Calculate the cube root of \p x, 
+ * \cuda_math_formula x^{1/3} \end_cuda_math_formula.
+ *
+ * \return 
+ * Returns 
+ * \cuda_math_formula x^{1/3} \end_cuda_math_formula.
+ * - cbrt(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - cbrt(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - cbrt(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 cbrt(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl cbrt(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the cube root of the input argument.
+ *
+ * Calculate the cube root of \p x, 
+ * \cuda_math_formula x^{1/3} \end_cuda_math_formula.
+ *
+ * \return 
+ * Returns 
+ * \cuda_math_formula x^{1/3} \end_cuda_math_formula.
+ * - cbrtf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - cbrtf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - cbrtf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  cbrtf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl cbrtf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate reciprocal cube root function.
+ *
+ * Calculate reciprocal cube root function of \p x.
+ *
+ * \return 
+ * - rcbrt(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - rcbrt(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - rcbrt(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 rcbrt(double x);
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate reciprocal cube root function.
+ *
+ * Calculate reciprocal cube root function of \p x.
+ *
+ * \return 
+ * - rcbrtf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - rcbrtf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - rcbrtf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  rcbrtf(float x);
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the sine of the input argument 
+ * \cuda_math_formula \times \pi \end_cuda_math_formula.
+ *
+ * Calculate the sine of \p x
+ * \cuda_math_formula \times \pi \end_cuda_math_formula
+ *  (measured in radians), 
+ * where \p x is the input argument.
+ *
+ * \return 
+ * - sinpi(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - sinpi(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - sinpi(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 sinpi(double x);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the sine of the input argument 
+ * \cuda_math_formula \times \pi \end_cuda_math_formula.
+ *
+ * Calculate the sine of \p x
+ * \cuda_math_formula \times \pi \end_cuda_math_formula
+ *  (measured in radians), 
+ * where \p x is the input argument.
+ *
+ * \return 
+ * - sinpif(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - sinpif(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - sinpif(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  sinpif(float x);
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the cosine of the input argument 
+ * \cuda_math_formula \times \pi \end_cuda_math_formula.
+ *
+ * Calculate the cosine of \p x
+ * \cuda_math_formula \times \pi \end_cuda_math_formula
+ *  (measured in radians), 
+ * where \p x is the input argument.
+ *
+ * \return 
+ * - cospi(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - cospi(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - cospi(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 cospi(double x);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the cosine of the input argument 
+ * \cuda_math_formula \times \pi \end_cuda_math_formula.
+ *
+ * Calculate the cosine of \p x
+ * \cuda_math_formula \times \pi \end_cuda_math_formula
+ *  (measured in radians),
+ * where \p x is the input argument.
+ *
+ * \return 
+ * - cospif(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - cospif(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - cospif(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  cospif(float x);
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief  Calculate the sine and cosine of the first input argument 
+ * \cuda_math_formula \times \pi \end_cuda_math_formula.
+ *
+ * Calculate the sine and cosine of \p x
+ * \cuda_math_formula \times \pi \end_cuda_math_formula (measured in radians), where \p x is the first input argument.
+ * The sine is written to the location pointed to by the second argument, \p sptr, and the cosine to the location pointed to by the third argument, \p cptr.
+ *
+ * \see ::sinpi() and ::cospi().
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   sincospi(double x, double *sptr, double *cptr);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief  Calculate the sine and cosine of the first input argument 
+ * \cuda_math_formula \times \pi \end_cuda_math_formula.
+ *
+ * Calculate the sine and cosine of \p x
+ * \cuda_math_formula \times \pi \end_cuda_math_formula (measured in radians), where \p x is the first input argument.
+ * The sine is written to the location pointed to by the second argument, \p sptr, and the cosine to the location pointed to by the third argument, \p cptr.
+ *
+ * \see ::sinpif() and ::cospif().
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   sincospif(float x, float *sptr, float *cptr);
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of first argument to the power of second argument.
+ *
+ * Calculate the value of \p x to the power of \p y.
+ *
+ * \return 
+ * - pow(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ *  for \p y an odd integer less than 0.
+ * - pow(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for \p y less than 0 and not an odd integer.
+ * - pow(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  for \p y an odd integer greater than 0.
+ * - pow(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns +0 for \p y > 0 and not an odd integer.
+ * - pow(-1, 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 1.
+ * - pow(+1, \p y) returns 1 for any \p y, even a NaN.
+ * - pow(\p x, 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1 for any \p x, even a NaN.
+ * - pow(\p x, \p y) returns a NaN for finite \p x < 0 and finite non-integer \p y.
+ * - pow(\p x, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for 
+ * \cuda_math_formula | x | < 1 \end_cuda_math_formula.
+ * - pow(\p x, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0 for 
+ * \cuda_math_formula | x | > 1 \end_cuda_math_formula.
+ * - pow(\p x, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0 for 
+ * \cuda_math_formula | x | < 1 \end_cuda_math_formula.
+ * - pow(\p x, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for 
+ * \cuda_math_formula | x | > 1 \end_cuda_math_formula.
+ * - pow(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns -0 for \p y an odd integer less than 0.
+ * - pow(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns +0 for \p y < 0 and not an odd integer.
+ * - pow(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ *  for \p y an odd integer greater than 0.
+ * - pow(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for \p y > 0 and not an odd integer.
+ * - pow(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * , \p y) returns +0 for \p y < 0.
+ * - pow(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for \p y > 0.
+ * - pow(\p x, \p y) returns NaN if either \p x or \p y or both are NaN and \p x \cuda_math_formula \neq \end_cuda_math_formula +1 and \p y \cuda_math_formula \neq\pm 0 \end_cuda_math_formula.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl pow(double x, double y) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Break down the input argument into fractional and integral parts.
+ *
+ * Break down the argument \p x into fractional and integral parts. The 
+ * integral part is stored in the argument \p iptr.
+ * Fractional and integral parts are given the same sign as the argument \p x.
+ *
+ * \return 
+ * - modf(
+ * \cuda_math_formula \pm x \end_cuda_math_formula
+ * , \p iptr) returns a result with the same sign as \p x.
+ * - modf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p iptr) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  and stores 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ *   in the object pointed to by \p iptr.
+ * - modf(NaN, \p iptr) stores a NaN in the object pointed to by \p iptr and returns a NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl modf(double x, double *iptr) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the double-precision floating-point remainder of \p x / \p y.
+ *
+ * Calculate the double-precision floating-point remainder of \p x / \p y.
+ * The floating-point remainder of the division operation \p x / \p y calculated
+ * by this function is exactly the value <tt>x - n*y</tt>, where \p n is \p x / \p y with its fractional part truncated.
+ * The computed value will have the same sign as \p x, and its magnitude will be less than the magnitude of \p y.
+ *
+ * \return
+ * - Returns the floating-point remainder of \p x / \p y.
+ * - fmod(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  if \p y is not zero.
+ * - fmod(\p x, 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns \p x if \p x is finite.
+ * - fmod(\p x, \p y) returns NaN if \p x is 
+ * \cuda_math_formula \pm\infty \end_cuda_math_formula
+ *  or \p y is zero.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl fmod(double x, double y) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Compute double-precision floating-point remainder.
+ *
+ * Compute double-precision floating-point remainder \p r of dividing 
+ * \p x by \p y for nonzero \p y. Thus 
+ * \cuda_math_formula  r = x - n y \end_cuda_math_formula.
+ * The value \p n is the integer value nearest 
+ * \cuda_math_formula  \frac{x}{y}  \end_cuda_math_formula. 
+ * In the case when 
+ * \cuda_math_formula  | n -\frac{x}{y} | = \frac{1}{2}  \end_cuda_math_formula
+ * , the
+ * even \p n value is chosen.
+ *
+ * \return 
+ * - remainder(\p x,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns NaN.
+ * - remainder(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p y) returns NaN.
+ * - remainder(\p x, 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns \p x for finite \p x.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 remainder(double x, double y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl remainder(double x, double y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Compute single-precision floating-point remainder.
+ *
+ * Compute single-precision floating-point remainder \p r of dividing 
+ * \p x by \p y for nonzero \p y. Thus 
+ * \cuda_math_formula  r = x - n y \end_cuda_math_formula.
+ * The value \p n is the integer value nearest 
+ * \cuda_math_formula  \frac{x}{y}  \end_cuda_math_formula. 
+ * In the case when 
+ * \cuda_math_formula  | n -\frac{x}{y} | = \frac{1}{2}  \end_cuda_math_formula
+ * , the
+ * even \p n value is chosen.
+ *
+ * \return 
+ * - remainderf(\p x,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns NaN.
+ * - remainderf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p y) returns NaN.
+ * - remainderf(\p x, 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns \p x for finite \p x.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  remainderf(float x, float y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl remainderf(float x, float y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Compute double-precision floating-point remainder and part of quotient.
+ *
+ * Compute a double-precision floating-point remainder in the same way as the
+ * ::remainder() function. Part of the quotient of the division of \p x by \p y
+ * is stored in the location to which \p quo points. The stored value has the same sign as 
+ * \cuda_math_formula  \frac{x}{y}  \end_cuda_math_formula
+ * and may not be the exact quotient, but agrees with the exact quotient
+ * in the low order 3 bits.
+ *
+ * \return 
+ * Returns the remainder.
+ * - remquo(\p x,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p quo) returns NaN
+ * and stores an unspecified value in the 
+ * location to which \p quo points.
+ * - remquo(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p y, \p quo) returns NaN
+ * and stores an unspecified value in the 
+ * location to which \p quo points.
+ * - remquo(\p x, \p y, \p quo) returns NaN
+ * and stores an unspecified value in the 
+ * location to which \p quo points if either of \p x or \p y is NaN.
+ * - remquo(\p x, 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p quo) returns \p x and stores zero
+ * in the location to which \p quo points for finite \p x.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 remquo(double x, double y, int *quo) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl remquo(double x, double y, int *quo);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Compute single-precision floating-point remainder and part of quotient.
+ *
+ * Compute a single-precision floating-point remainder in the same way as the 
+ * ::remainderf() function. Part of the quotient of the division of \p x by \p y
+ * is stored in the location to which \p quo points. The stored value has the same sign as 
+ * \cuda_math_formula  \frac{x}{y}  \end_cuda_math_formula
+ * and may not be the exact quotient, but agrees with the exact quotient
+ * in the low order 3 bits.
+ *
+ * \return 
+ * Returns the remainder.
+ * - remquof(\p x,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p quo) returns NaN
+ * and stores an unspecified value in the 
+ * location to which \p quo points.
+ * - remquof(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p y, \p quo) returns NaN
+ * and stores an unspecified value in the 
+ * location to which \p quo points.
+ * - remquof(\p x, \p y, \p quo) returns NaN
+ * and stores an unspecified value in the 
+ * location to which \p quo points if either of \p x or \p y is NaN.
+ * - remquof(\p x, 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p quo) returns \p x and stores zero
+ * in the location to which \p quo points for finite \p x.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  remquof(float x, float y, int *quo) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl remquof(float x, float y, int *quo);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the Bessel function of the first kind of order 0 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the first kind of order 0 for
+ * the input argument \p x, 
+ * \cuda_math_formula J_0(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the first kind of order 0.
+ * - j0(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns +0.
+ * - j0(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl j0(double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the Bessel function of the first kind of order 0 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the first kind of order 0 for
+ * the input argument \p x, 
+ * \cuda_math_formula J_0(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the first kind of order 0.
+ * - j0f(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns +0.
+ * - j0f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  j0f(float x) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the Bessel function of the first kind of order 1 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the first kind of order 1 for
+ * the input argument \p x, 
+ * \cuda_math_formula J_1(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the first kind of order 1.
+ * - j1(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - j1(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - j1(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl j1(double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the Bessel function of the first kind of order 1 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the first kind of order 1 for
+ * the input argument \p x, 
+ * \cuda_math_formula J_1(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the first kind of order 1.
+ * - j1f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - j1f(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - j1f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  j1f(float x) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the Bessel function of the first kind of order n for the input argument.
+ *
+ * Calculate the value of the Bessel function of the first kind of order \p n for
+ * the input argument \p x, 
+ * \cuda_math_formula J_n(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the first kind of order \p n.
+ * - jn(\p n, NaN) returns NaN.
+ * - jn(\p n, \p x) returns NaN for \p n < 0.
+ * - jn(\p n, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl jn(int n, double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the Bessel function of the first kind of order n for the input argument.
+ *
+ * Calculate the value of the Bessel function of the first kind of order \p n for
+ * the input argument \p x, 
+ * \cuda_math_formula J_n(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the first kind of order \p n.
+ * - jnf(\p n, NaN) returns NaN.
+ * - jnf(\p n, \p x) returns NaN for \p n < 0.
+ * - jnf(\p n, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  jnf(int n, float x) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the Bessel function of the second kind of order 0 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the second kind of order 0 for
+ * the input argument \p x, 
+ * \cuda_math_formula Y_0(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the second kind of order 0.
+ * - y0(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - y0(\p x) returns NaN for \p x < 0.
+ * - y0(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - y0(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl y0(double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the Bessel function of the second kind of order 0 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the second kind of order 0 for
+ * the input argument \p x, 
+ * \cuda_math_formula Y_0(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the second kind of order 0.
+ * - y0f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - y0f(\p x) returns NaN for \p x < 0.
+ * - y0f(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - y0f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  y0f(float x) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the Bessel function of the second kind of order 1 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the second kind of order 1 for
+ * the input argument \p x, 
+ * \cuda_math_formula Y_1(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the second kind of order 1.
+ * - y1(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - y1(\p x) returns NaN for \p x < 0.
+ * - y1(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - y1(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl y1(double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the Bessel function of the second kind of order 1 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the second kind of order 1 for
+ * the input argument \p x, 
+ * \cuda_math_formula Y_1(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the second kind of order 1.
+ * - y1f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - y1f(\p x) returns NaN for \p x < 0.
+ * - y1f(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - y1f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  y1f(float x) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the Bessel function of the second kind of order n for the input argument.
+ *
+ * Calculate the value of the Bessel function of the second kind of order \p n for
+ * the input argument \p x, 
+ * \cuda_math_formula Y_n(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the second kind of order \p n.
+ * - yn(\p n, \p x) returns NaN for \p n < 0.
+ * - yn(\p n, 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - yn(\p n, \p x) returns NaN for \p x < 0.
+ * - yn(\p n, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - yn(\p n, NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl yn(int n, double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the Bessel function of the second kind of order n for the input argument.
+ *
+ * Calculate the value of the Bessel function of the second kind of order \p n for
+ * the input argument \p x, 
+ * \cuda_math_formula Y_n(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the second kind of order \p n.
+ * - ynf(\p n, \p x) returns NaN for \p n < 0.
+ * - ynf(\p n, 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - ynf(\p n, \p x) returns NaN for \p x < 0.
+ * - ynf(\p n, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - ynf(\p n, NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  ynf(int n, float x) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the regular modified cylindrical Bessel function of order 0 for the input argument.
+ *
+ * Calculate the value of the regular modified cylindrical Bessel function of order 0 for
+ * the input argument \p x, 
+ * \cuda_math_formula I_0(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the regular modified cylindrical Bessel function of order 0.
+ * - cyl_bessel_i0(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns +1.
+ * - cyl_bessel_i0(\cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - cyl_bessel_i0(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl cyl_bessel_i0(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the regular modified cylindrical Bessel function of order 0 for the input argument.
+ *
+ * Calculate the value of the regular modified cylindrical Bessel function of order 0 for
+ * the input argument \p x, 
+ * \cuda_math_formula I_0(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the regular modified cylindrical Bessel function of order 0.
+ * - cyl_bessel_i0f(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns +1.
+ * - cyl_bessel_i0f(\cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - cyl_bessel_i0f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __device__ __device_builtin__ float                  cyl_bessel_i0f(float x) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the regular modified cylindrical Bessel function of order 1 for the input argument.
+ *
+ * Calculate the value of the regular modified cylindrical Bessel function of order 1 for
+ * the input argument \p x, 
+ * \cuda_math_formula I_1(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the regular modified cylindrical Bessel function of order 1.
+ * - cyl_bessel_i1(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - cyl_bessel_i1(\cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - cyl_bessel_i1(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl cyl_bessel_i1(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the regular modified cylindrical Bessel function of order 1 for the input argument.
+ *
+ * Calculate the value of the regular modified cylindrical Bessel function of order 1 for
+ * the input argument \p x, 
+ * \cuda_math_formula I_1(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the regular modified cylindrical Bessel function of order 1.
+ * - cyl_bessel_i1f(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - cyl_bessel_i1f(\cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - cyl_bessel_i1f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __device__ __device_builtin__ float                  cyl_bessel_i1f(float x) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the error function of the input argument.
+ *
+ * Calculate the value of the error function for the input argument \p x,
+ * \cuda_math_formula \frac{2}{\sqrt \pi} \int_0^x e^{-t^2} dt \end_cuda_math_formula.
+ *
+ * \return 
+ * - erf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - erf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 1 \end_cuda_math_formula.
+ * - erf(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 erf(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl erf(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the error function of the input argument.
+ *
+ * Calculate the value of the error function for the input argument \p x,
+ * \cuda_math_formula \frac{2}{\sqrt \pi} \int_0^x e^{-t^2} dt \end_cuda_math_formula.
+ *
+ * \return  
+ * - erff(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - erff(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 1 \end_cuda_math_formula.
+ * - erff(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  erff(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl erff(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the inverse error function of the input argument.
+ *
+ * Calculate the inverse error function
+ * \cuda_math_formula \operatorname{erf}^{-1} \end_cuda_math_formula
+ * (\p x), of the input argument \p x in the interval [-1, 1].
+ *
+ * \return
+ * - erfinv(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - erfinv(1) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - erfinv(-1) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - erfinv(\p x) returns NaN for \p x outside [-1, +1].
+ * - erfinv(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 erfinv(double x);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the inverse error function of the input argument.
+ *
+ * Calculate the inverse error function
+ * \cuda_math_formula \operatorname{erf}^{-1} \end_cuda_math_formula
+ * (\p x), of the input argument \p x in the interval [-1, 1].
+ *
+ * \return 
+ * - erfinvf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - erfinvf(1) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - erfinvf(-1) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - erfinvf(\p x) returns NaN for \p x outside [-1, +1].
+ * - erfinvf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  erfinvf(float x);
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the complementary error function of the input argument.
+ *
+ * Calculate the complementary error function of the input argument \p x,
+ * 1 - erf(\p x).
+ *
+ * \return 
+ * - erfc(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns 2.
+ * - erfc(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - erfc(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 erfc(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl erfc(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the complementary error function of the input argument.
+ *
+ * Calculate the complementary error function of the input argument \p x,
+ * 1 - erf(\p x).
+ *
+ * \return 
+ * - erfcf(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns 2.
+ * - erfcf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - erfcf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  erfcf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl erfcf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the natural logarithm of the absolute value of the gamma function of the input argument.
+ *
+ * Calculate the natural logarithm of the absolute value of the gamma function of the input argument \p x, namely the value of
+ * \cuda_math_formula \log_{e}\left|\Gamma(x)\right| \end_cuda_math_formula.
+ *
+ * \return 
+ * - lgamma(1) returns +0.
+ * - lgamma(2) returns +0.
+ * - lgamma(\p x) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  if \p x 
+ * \cuda_math_formula \leq \end_cuda_math_formula
+ *  0 and \p x is an integer.
+ * - lgamma(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - lgamma(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - lgamma(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 lgamma(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl lgamma(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the inverse complementary error function of the input argument.
+ *
+ * Calculate the inverse complementary error function
+ * \cuda_math_formula \operatorname{erfc}^{-1} \end_cuda_math_formula
+ * (\p x), of the input argument \p x in the interval [0, 2].
+ *
+ * \return 
+ * - erfcinv(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - erfcinv(2) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - erfcinv(\p x) returns NaN for \p x outside [0, 2].
+ * - erfcinv(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 erfcinv(double x);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the inverse complementary error function of the input argument.
+ *
+ * Calculate the inverse complementary error function
+ * \cuda_math_formula \operatorname{erfc}^{-1} \end_cuda_math_formula
+ * (\p x), of the input argument \p x in the interval [0, 2].
+ *
+ * \return 
+ * - erfcinvf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - erfcinvf(2) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - erfcinvf(\p x) returns NaN for \p x outside [0, 2].
+ * - erfcinvf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  erfcinvf(float x);
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the inverse of the standard normal cumulative distribution function.
+ *
+ * Calculate the inverse of the standard normal cumulative distribution function for input argument \p x,
+ * \cuda_math_formula \Phi^{-1}(x) \end_cuda_math_formula. The function is defined for input values in the interval 
+ * \cuda_math_formula (0, 1) \end_cuda_math_formula.
+ *
+ * \return 
+ * - normcdfinv(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - normcdfinv(1) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - normcdfinv(\p x) returns NaN
+ *  if \p x is not in the interval [0,1].
+ * - normcdfinv(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 normcdfinv(double x);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the inverse of the standard normal cumulative distribution function.
+ *
+ * Calculate the inverse of the standard normal cumulative distribution function for input argument \p x,
+ * \cuda_math_formula \Phi^{-1}(x) \end_cuda_math_formula. The function is defined for input values in the interval 
+ * \cuda_math_formula (0, 1) \end_cuda_math_formula.
+ *
+ * \return 
+ * - normcdfinvf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - normcdfinvf(1) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - normcdfinvf(\p x) returns NaN
+ *  if \p x is not in the interval [0,1].
+ * - normcdfinvf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  normcdfinvf(float x);
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the standard normal cumulative distribution function.
+ *
+ * Calculate the cumulative distribution function of the standard normal distribution for input argument \p x,
+ * \cuda_math_formula \Phi(x) \end_cuda_math_formula.
+ *
+ * \return 
+ * - normcdf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 1.
+ * - normcdf(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - normcdf(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 normcdf(double x);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the standard normal cumulative distribution function.
+ *
+ * Calculate the cumulative distribution function of the standard normal distribution for input argument \p x,
+ * \cuda_math_formula \Phi(x) \end_cuda_math_formula.
+ *
+ * \return 
+ * - normcdff(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 1.
+ * - normcdff(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - normcdff(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  normcdff(float x);
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the scaled complementary error function of the input argument.
+ *
+ * Calculate the scaled complementary error function of the input argument \p x,
+ * \cuda_math_formula e^{x^2}\cdot \operatorname{erfc}(x) \end_cuda_math_formula.
+ *
+ * \return 
+ * - erfcx(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - erfcx(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - erfcx(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 erfcx(double x);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the scaled complementary error function of the input argument.
+ *
+ * Calculate the scaled complementary error function of the input argument \p x,
+ * \cuda_math_formula e^{x^2}\cdot \operatorname{erfc}(x) \end_cuda_math_formula.
+ *
+ * \return 
+ * - erfcxf(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - erfcxf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - erfcxf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  erfcxf(float x);
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the natural logarithm of the absolute value of the gamma function of the input argument.
+ *
+ * Calculate the natural logarithm of the absolute value of the gamma function of the input argument \p x, namely the value of
+ * \cuda_math_formula \log_{e}\left|\Gamma(x)\right| \end_cuda_math_formula.
+ *
+ * \return 
+ * - lgammaf(1) returns +0.
+ * - lgammaf(2) returns +0.
+ * - lgammaf(\p x) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  if \p x
+ * \cuda_math_formula \leq \end_cuda_math_formula
+ *  0 and \p x is an integer.
+ * - lgammaf(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - lgammaf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - lgammaf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  lgammaf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl lgammaf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the gamma function of the input argument.
+ *
+ * Calculate the gamma function of the input argument \p x, namely the value of
+ * \cuda_math_formula \Gamma(x) \end_cuda_math_formula.
+ *
+ * \return 
+ * - tgamma(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - tgamma(\p x) returns NaN if \p x < 0 and \p x is an integer.
+ * - tgamma(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - tgamma(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - tgamma(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 tgamma(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl tgamma(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the gamma function of the input argument.
+ *
+ * Calculate the gamma function of the input argument \p x, namely the value of
+ * \cuda_math_formula \Gamma(x) \end_cuda_math_formula.
+ *
+ * \return 
+ * - tgammaf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - tgammaf(\p x) returns NaN if \p x < 0 and \p x is an integer.
+ * - tgammaf(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - tgammaf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - tgammaf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  tgammaf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl tgammaf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/** \ingroup CUDA_MATH_DOUBLE
+ * \brief Create value with given magnitude, copying sign of second value.
+ *
+ * Create a floating-point value with the magnitude \p x and the sign of \p y.
+ *
+ * \return
+ * - a value with the magnitude of \p x and the sign of \p y.
+ * - copysign(\p NaN, \p y) returns a \p NaN with the sign of \p y.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 copysign(double x, double y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl copysign(double x, double y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/** \ingroup CUDA_MATH_SINGLE
+ * \brief Create value with given magnitude, copying sign of second value.
+ *
+ * Create a floating-point value with the magnitude \p x and the sign of \p y.
+ *
+ * \return
+ * - a value with the magnitude of \p x and the sign of \p y.
+ * - copysignf(\p NaN, \p y) returns a \p NaN with the sign of \p y.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  copysignf(float x, float y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl copysignf(float x, float y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Return next representable double-precision floating-point value after argument \p x in the direction of \p y.
+ *
+ * Calculate the next representable double-precision floating-point value
+ * following \p x in the direction of \p y. For example, if \p y is greater than \p x, ::nextafter()
+ * returns the smallest representable number greater than \p x.
+ *
+ * \return 
+ * - nextafter(\p x, \p y) = \p y if \p x equals \p y.
+ * - nextafter(\p x, \p y) = \p NaN if either \p x or \p y is \p NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 nextafter(double x, double y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl nextafter(double x, double y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Return next representable single-precision floating-point value after argument \p x in the direction of \p y.
+ *
+ * Calculate the next representable single-precision floating-point value
+ * following \p x in the direction of \p y. For example, if \p y is greater than \p x, ::nextafterf()
+ * returns the smallest representable number greater than \p x.
+ *
+ * \return 
+ * - nextafterf(\p x, \p y) = \p y if \p x equals \p y.
+ * - nextafterf(\p x, \p y) = \p NaN if either \p x or \p y is \p NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  nextafterf(float x, float y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl nextafterf(float x, float y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Returns "Not a Number" value.
+ *
+ * Return a representation of a quiet NaN. Argument \p tagp selects one of the possible representations.
+ *
+ * \return 
+ * - nan(\p tagp) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 nan(const char *tagp) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl nan(const char *tagp);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Returns "Not a Number" value.
+ *
+ * Return a representation of a quiet NaN. Argument \p tagp selects one of the possible representations.
+ *
+ * \return 
+ * - nanf(\p tagp) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  nanf(const char *tagp) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl nanf(const char *tagp);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* namespace std */
+#endif
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isinff(float) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isnanf(float) __THROW;
+
+
+#if defined(__APPLE__)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isfinited(double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isfinitef(float) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __signbitd(double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isnand(double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isinfd(double) __THROW;
+#else /* __APPLE__ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __finite(double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __finitef(float) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __signbit(double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isnan(double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isinf(double) __THROW;
+#endif /* __APPLE__ */
+
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __signbitf(float) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Compute 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation.
+ *
+ * Compute the value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single ternary operation. After computing the value
+ * to infinite precision, the value is rounded once using round-to-nearest,
+ * ties-to-even rounding mode.
+ *
+ * \return
+ * Returns the rounded value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation.
+ * - fma(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - fma(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - fma(\p x, \p y, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - fma(\p x, \p y, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - fma(\p x, \p y, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - fma(\p x, \p y, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - fma(\p x, \p y, \p z) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y + z \end_cuda_math_formula is exactly zero and \cuda_math_formula z \neq 0 \end_cuda_math_formula.
+ * - If any argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 fma(double x, double y, double z) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl fma(double x, double y, double z);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation.
+ *
+ * Compute the value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single ternary operation. After computing the value
+ * to infinite precision, the value is rounded once using round-to-nearest,
+ * ties-to-even rounding mode.
+ *
+ * \return
+ * Returns the rounded value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation.
+ * - fmaf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - fmaf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - fmaf(\p x, \p y, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - fmaf(\p x, \p y, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - fmaf(\p x, \p y, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - fmaf(\p x, \p y, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - fmaf(\p x, \p y, \p z) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y + z \end_cuda_math_formula is exactly zero and \cuda_math_formula z \neq 0 \end_cuda_math_formula.
+ * - If any argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  fmaf(float x, float y, float z) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl fmaf(float x, float y, float z);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+
+/* These declarations are here to avoid warnings on the call graph.
+   Note that long double is not supported on the device. */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __signbitl(long double) __THROW;
+#if defined(__APPLE__)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isfinite(long double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isinf(long double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isnan(long double) __THROW;
+#else /* __APPLE__ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __finitel(long double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isinfl(long double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isnanl(long double) __THROW;
+#endif /* __APPLE__ */
+
+#if defined(_WIN32) && ( defined(_M_AMD64) || defined(_M_ARM64) )
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl acosf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl asinf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl atanf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl atan2f(float, float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl cosf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl sinf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl tanf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl coshf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl sinhf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl tanhf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl expf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl logf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl log10f(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl modff(float, float*) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl powf(float, float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl sqrtf(float) __THROW;         
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl ceilf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl floorf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl fmodf(float, float) __THROW;
+#else /* _WIN32 && (_M_AMD64 || _M_ARM64) */
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the arc cosine of the input argument.
+ *
+ * Calculate the principal value of the arc cosine of the input argument \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [0, 
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * ] for \p x inside [-1, +1].
+ * - acosf(1) returns +0.
+ * - acosf(\p x) returns NaN for \p x outside [-1, +1].
+ * - acosf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  acosf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the arc sine of the input argument.
+ *
+ * Calculate the principal value of the arc sine of the input argument \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [-
+ * \cuda_math_formula \pi/2 \end_cuda_math_formula
+ * , +
+ * \cuda_math_formula \pi/2 \end_cuda_math_formula
+ * ] for \p x inside [-1, +1].
+ * - asinf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - asinf(\p x) returns NaN for \p x outside [-1, +1].
+ * - asinf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  asinf(float x) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the arc tangent of the input argument.
+ *
+ * Calculate the principal value of the arc tangent of the input argument \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [-
+ * \cuda_math_formula \pi/2 \end_cuda_math_formula
+ * , +
+ * \cuda_math_formula \pi/2 \end_cuda_math_formula
+ * ].
+ * - atanf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - atanf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula
+ * /2.
+ * - atanf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  atanf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the arc tangent of the ratio of first and second input arguments.
+ *
+ * Calculate the principal value of the arc tangent of the ratio of first
+ * and second input arguments \p y / \p x. The quadrant of the result is 
+ * determined by the signs of inputs \p y and \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [-
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * , +
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * ].
+ * - atan2f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , -0) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula.
+ * - atan2f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , +0) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - atan2f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p x) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula
+ * for \p x < 0.
+ * - atan2f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p x) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * for \p x > 0.
+ * - atan2f(\p y,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula -\pi \end_cuda_math_formula
+ * /2 for \p y < 0.
+ * - atan2f(\p y,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * /2 for \p y > 0.
+ * - atan2f(
+ * \cuda_math_formula \pm y \end_cuda_math_formula
+ * ,
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula
+ * for finite \p y > 0.
+ * - atan2f(
+ * \cuda_math_formula \pm y \end_cuda_math_formula
+ * ,
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * for finite \p y > 0.
+ * - atan2f(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p x) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula
+ * /2 for finite \p x.
+ * - atan2f(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ,
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 3\pi \end_cuda_math_formula
+ * /4.
+ * - atan2f(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ,
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula
+ * /4.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  atan2f(float y, float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the cosine of the input argument.
+ *
+ * Calculate the cosine of the input argument \p x (measured in radians).
+ *
+ * \return 
+ * - cosf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - cosf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - cosf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  cosf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the sine of the input argument.
+ *
+ * Calculate the sine of the input argument \p x (measured in radians).
+ *
+ * \return 
+ * - sinf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - sinf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - sinf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  sinf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the tangent of the input argument.
+ *
+ * Calculate the tangent of the input argument \p x (measured in radians).
+ *
+ * \return 
+ * - tanf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - tanf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - tanf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  tanf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the hyperbolic cosine of the input argument.
+ *
+ * Calculate the hyperbolic cosine of the input argument \p x.
+ *
+ * \return 
+ * - coshf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - coshf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - coshf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  coshf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the hyperbolic sine of the input argument.
+ *
+ * Calculate the hyperbolic sine of the input argument \p x.
+ *
+ * \return 
+ * - sinhf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - sinhf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - sinhf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  sinhf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the hyperbolic tangent of the input argument.
+ *
+ * Calculate the hyperbolic tangent of the input argument \p x.
+ *
+ * \return 
+ * - tanhf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - tanhf( 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 1 \end_cuda_math_formula.
+ * - tanhf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  tanhf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the natural logarithm of the input argument.
+ *
+ * Calculate the natural logarithm of the input argument \p x.
+ *
+ * \return 
+ * - logf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - logf(1) returns +0.
+ * - logf(\p x) returns NaN for \p x < 0.
+ * - logf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - logf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  logf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  exponential of the input argument.
+ *
+ * Calculate
+ * \cuda_math_formula e^x \end_cuda_math_formula
+ * ,
+ * the base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  exponential of the input argument \p x.
+ *
+ * \return
+ * - expf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - expf(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - expf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - expf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  expf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the base 10 logarithm of the input argument.
+ *
+ * Calculate the base 10 logarithm of the input argument \p x.
+ *
+ * \return 
+ * - log10f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - log10f(1) returns +0.
+ * - log10f(\p x) returns NaN for \p x < 0.
+ * - log10f(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - log10f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  log10f(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Break down the input argument into fractional and integral parts.
+ *
+ * Break down the argument \p x into fractional and integral parts. The integral part is stored in the argument \p iptr.
+ * Fractional and integral parts are given the same sign as the argument \p x.
+ *
+ * \return 
+ * - modff(
+ * \cuda_math_formula \pm x \end_cuda_math_formula
+ * , \p iptr) returns a result with the same sign as \p x.
+ * - modff(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p iptr) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  and stores 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ *   in the object pointed to by \p iptr.
+ * - modff(NaN, \p iptr) stores a NaN in the object pointed to by \p iptr and returns a NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  modff(float x, float *iptr) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of first argument to the power of second argument.
+ *
+ * Calculate the value of \p x to the power of \p y.
+ *
+ * \return 
+ * - powf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ *  for \p y an odd integer less than 0.
+ * - powf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for \p y less than 0 and not an odd integer.
+ * - powf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  for \p y an odd integer greater than 0.
+ * - powf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns +0 for \p y > 0 and not an odd integer.
+ * - powf(-1, 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 1.
+ * - powf(+1, \p y) returns 1 for any \p y, even a NaN.
+ * - powf(\p x, 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1 for any \p x, even a NaN.
+ * - powf(\p x, \p y) returns a NaN for finite \p x < 0 and finite non-integer \p y.
+ * - powf(\p x, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for 
+ * \cuda_math_formula | x | < 1 \end_cuda_math_formula.
+ * - powf(\p x, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0 for 
+ * \cuda_math_formula | x | > 1 \end_cuda_math_formula.
+ * - powf(\p x, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0 for 
+ * \cuda_math_formula | x | < 1 \end_cuda_math_formula.
+ * - powf(\p x, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for 
+ * \cuda_math_formula | x | > 1 \end_cuda_math_formula.
+ * - powf(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns -0 for \p y an odd integer less than 0.
+ * - powf(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns +0 for \p y < 0 and not an odd integer.
+ * - powf(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ *  for \p y an odd integer greater than 0.
+ * - powf(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for \p y > 0 and not an odd integer.
+ * - powf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * , \p y) returns +0 for \p y < 0.
+ * - powf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for \p y > 0.
+ * - powf(\p x, \p y) returns NaN if either \p x or \p y or both are NaN and \p x \cuda_math_formula \neq \end_cuda_math_formula +1 and \p y \cuda_math_formula \neq\pm 0 \end_cuda_math_formula.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  powf(float x, float y) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the square root of the input argument.
+ *
+ * Calculate the nonnegative square root of \p x, 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula.
+ *
+ * \return 
+ * Returns 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula.
+ * - sqrtf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - sqrtf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - sqrtf(\p x) returns NaN if \p x is less than 0.
+ * - sqrtf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  sqrtf(float x) __THROW;         
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate ceiling of the input argument.
+ *
+ * Compute the smallest integer value not less than \p x.
+ *
+ * \return
+ * Returns 
+ * \cuda_math_formula \lceil x \rceil \end_cuda_math_formula
+ *  expressed as a floating-point number.
+ * - ceilf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - ceilf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - ceilf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  ceilf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the largest integer less than or equal to \p x.
+ * 
+ * Calculate the largest integer value which is less than or equal to \p x.
+ * 
+ * \return
+ * Returns 
+ * \cuda_math_formula \lfloor x \rfloor \end_cuda_math_formula
+ *  expressed as a floating-point number.
+ * - floorf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - floorf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - floorf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  floorf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the floating-point remainder of \p x / \p y.
+ *
+ * Calculate the floating-point remainder of \p x / \p y.
+ * The floating-point remainder of the division operation \p x / \p y calculated
+ * by this function is exactly the value <tt>x - n*y</tt>, where \p n is \p x / \p y with its fractional part truncated.
+ * The computed value will have the same sign as \p x, and its magnitude will be less than the magnitude of \p y.
+ * \return
+ * - Returns the floating-point remainder of \p x / \p y.
+ * - fmodf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  if \p y is not zero.
+ * - fmodf(\p x, 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns \p x if \p x is finite.
+ * - fmodf(\p x, \p y) returns NaN if \p x is 
+ * \cuda_math_formula \pm\infty \end_cuda_math_formula
+ *  or \p y is zero.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  fmodf(float x, float y) __THROW;
+#if defined(__QNX__)
+/* Redeclare, with the __DEVICE_FUNCTIONS_DECL__ and __device_builtin__
+ * qualifiers, some builtins that QNX uses. */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float _FLog(float, int);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float _FCosh(float, float);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float _FSinh(float, float);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float _FSinx(float, unsigned int, int);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int _FDsign(float);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int _Dsign(double);
+#endif /* __QNX__ */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+#endif /* _WIN32 && (_M_AMD64 || _M_ARM64) */
+
+}
+
+#if !defined(__CUDACC_RTC__)
+#include <math.h>
+#include <stdlib.h>
+
+#ifndef __CUDA_INTERNAL_SKIP_CPP_HEADERS__
+#include <cmath>
+#include <cstdlib>
+#endif /* __CUDA_INTERNAL_SKIP_CPP_HEADERS__ */
+#endif /* __CUDACC_RTC__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__CUDACC_RTC__)
+
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(float x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(long double x);
+
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(float x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(long double x);
+
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(float x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(long double x);
+
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(float x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(long double x);
+
+#elif defined(__GNUC__)
+
+#undef signbit
+#undef isfinite
+#undef isnan
+#undef isinf
+
+#if defined(__APPLE__)
+
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(float x);
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(double x);
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(long double x);
+
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(float x); 
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(double x);
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(long double x);
+
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(double x) throw();
+#if !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 7000
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(float x);
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(long double x);
+#else /* !(!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 7000) */
+template <typename T>
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool __libcpp_isnan(T) _NOEXCEPT;
+inline _LIBCPP_INLINE_VISIBILITY __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isnan(float x) _NOEXCEPT;
+inline _LIBCPP_INLINE_VISIBILITY  __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isnan(long double x) _NOEXCEPT;
+#endif /* !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 7000 */
+
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(double x) throw();
+#if !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 7000
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(float x);
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(long double x);
+#else /* !(!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 7000) */
+template <typename T>
+__cudart_builtin__ __DEVICE_FUNCTIONS_DECL__ bool __libcpp_isinf(T) _NOEXCEPT;
+inline _LIBCPP_INLINE_VISIBILITY __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isinf(float x) _NOEXCEPT;
+inline _LIBCPP_INLINE_VISIBILITY __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isinf(long double x) _NOEXCEPT;
+#endif /* !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 7000 */
+
+#else /* __APPLE__ */
+
+#if ((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L)
+#if !defined(_NVHPC_CUDA) /* these std:: prototypes are skipped when _NVHPC_CUDA is defined */
+namespace std {
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool signbit(float x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool signbit(double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool signbit(long double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isfinite(float x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isfinite(double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isfinite(long double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isnan(float x);
+/* GCC 6.1 uses ::isnan(double x) for isnan(double x) if the condition below
+ * is true; in that case the double overload is declared as `int ... throw()`
+ * instead of `constexpr bool`. */
+#if _GLIBCXX_HAVE_OBSOLETE_ISNAN && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(double x) throw();
+#else /* !(_GLIBCXX_HAVE_OBSOLETE_ISNAN && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC) */
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isnan(double x);
+#endif /* _GLIBCXX_HAVE_OBSOLETE_ISNAN && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC */
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isnan(long double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isinf(float x);
+/* GCC 6.1 uses ::isinf(double x) for isinf(double x) if the condition below
+ * is true; in that case the double overload is declared as `int ... throw()`
+ * instead of `constexpr bool`. */
+#if _GLIBCXX_HAVE_OBSOLETE_ISINF && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(double x) throw();
+#else /* !(_GLIBCXX_HAVE_OBSOLETE_ISINF && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC) */
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isinf(double x);
+#endif /* _GLIBCXX_HAVE_OBSOLETE_ISINF && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC */
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isinf(long double x);
+} /* namespace std */
+#endif /* !defined(_NVHPC_CUDA) */
+
+#else /* !(((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L)) */
+
+#if defined(__QNX__)
+#if (__QNX__) && !defined(_LIBCPP_VERSION)
+/* QNX (when _LIBCPP_VERSION is not defined) defines these functions in
+ * namespace std, so they need to be declared there. */
+namespace std {
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool signbit(float x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool signbit(double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool signbit(long double x);
+} /* namespace std */
+#else /* !((__QNX__) && !defined(_LIBCPP_VERSION)) */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool signbit(const float x);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool signbit(const double x);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool signbit(const long double x);
+#endif /* (__QNX__) && !defined(_LIBCPP_VERSION) */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isfinite(const float a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isfinite(const double a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isfinite(const long double a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isnan(const float a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isnan(const double a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isnan(const long double a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isinf(const float a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isinf(const double a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isinf(const long double a);
+#else /* ! __QNX__ */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(const float x);
+#if defined(__ICC)
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(const double x) throw();
+#else /* !__ICC */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(const double x);
+#endif /* __ICC */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(const long double x);
+
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(const float x);
+#if defined(__ICC)
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(const double x) throw();
+#else /* !__ICC */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(const double x);
+#endif /* __ICC */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(const long double x);
+
+#if (defined(__ANDROID__) || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000
+template <typename T>
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool __libcpp_isnan(T) _NOEXCEPT;
+inline _LIBCPP_INLINE_VISIBILITY __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isnan(float x) _NOEXCEPT;
+#else /* !((defined(__ANDROID__)  || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000) */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(float x);
+#endif /* (defined(__ANDROID__)  || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000 */
+#if defined(__ANDROID__) || defined(__HORIZON__)
+#if !defined(_LIBCPP_VERSION)
+__forceinline__ /* qualifies the `int isnan(double x)` declaration below: with _LIBCPP_VERSION undefined, the `_LIBCPP_VERSION >= 7000` test is false */
+#endif  /* !defined(_LIBCPP_VERSION) */
+#if _LIBCPP_VERSION >= 7000
+#ifdef _LIBCPP_PREFERRED_OVERLOAD
+_LIBCPP_INLINE_VISIBILITY _LIBCPP_PREFERRED_OVERLOAD __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isnan(double x) _NOEXCEPT;
+#endif /* _LIBCPP_PREFERRED_OVERLOAD */
+#else /* _LIBCPP_VERSION < 7000 */
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(double x);
+#endif /* _LIBCPP_VERSION >= 7000 */
+#else /* !(__ANDROID__ || __HORIZON__) */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(double x) throw();
+#endif /* __ANDROID__ || __HORIZON__ */
+#if (defined(__ANDROID__) || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000
+inline _LIBCPP_INLINE_VISIBILITY  __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isnan(long double x) _NOEXCEPT;
+#else /* !( (defined(__ANDROID__) || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000) */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(long double x);
+#endif /* (defined(__ANDROID__) || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000 */
+
+#if (defined(__ANDROID__) || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000
+static __inline__ __cudart_builtin__ __DEVICE_FUNCTIONS_DECL__ unsigned __FLOAT_BITS(float __f);
+static __inline__ __cudart_builtin__ __DEVICE_FUNCTIONS_DECL__ unsigned long long __DOUBLE_BITS(double __f);
+template <typename T>
+__cudart_builtin__ __DEVICE_FUNCTIONS_DECL__ bool __libcpp_isinf(T) _NOEXCEPT;
+inline _LIBCPP_INLINE_VISIBILITY __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isinf(float x) _NOEXCEPT;
+#else /* !( (defined(__ANDROID__)  || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000) */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(float x);
+#endif /* (defined(__ANDROID__) || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000 */
+
+#if defined(__ANDROID__) || defined(__HORIZON__)
+#if !defined(_LIBCPP_VERSION)
+__forceinline__
+#endif  /* !defined(_LIBCPP_VERSION) */
+#if _LIBCPP_VERSION >= 7000
+#ifdef _LIBCPP_PREFERRED_OVERLOAD
+_LIBCPP_INLINE_VISIBILITY _LIBCPP_PREFERRED_OVERLOAD __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isinf(double x) _NOEXCEPT;
+#endif /* _LIBCPP_PREFERRED_OVERLOAD */
+#else /* _LIBCPP_VERSION < 7000 */
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(double x);
+#endif /* _LIBCPP_VERSION >= 7000 */
+#else /* ! (__ANDROID__  || __HORIZON__) */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(double x) throw();
+#endif /* __ANDROID__ || __HORIZON__ */
+#if (defined(__ANDROID__)  || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000
+inline _LIBCPP_INLINE_VISIBILITY __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isinf(long double x) _NOEXCEPT;
+#else /* !( (defined(__ANDROID__)  || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000) */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(long double x);
+#endif  /* (defined(__ANDROID__)  || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000 */
+#endif /* __QNX__  */
+
+#endif /* ((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L) */
+#endif /* __APPLE__ */
+
+#if !defined(_LIBCPP_VERSION) /* __NV_GLIBCXX_VERSION is only defined when _LIBCPP_VERSION is undefined */
+#if defined(__clang__)
+#if __has_include(<ext/random>) /* NOTE(review): presumably <ext/random> marks a libstdc++ of at least 4.8 -- confirm */
+#define __NV_GLIBCXX_VERSION 40800
+#endif /* __has_include(<ext/random>) */
+#endif /* __clang__ */
+/* Otherwise encode the GCC version as major * 10000 + minor * 100 + patchlevel. */
+#if !defined(__NV_GLIBCXX_VERSION)
+#define __NV_GLIBCXX_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) 
+#endif /* !__NV_GLIBCXX_VERSION */
+#endif /* !defined(_LIBCPP_VERSION) */
+
+#if !defined(__HORIZON__) || !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 3800
+#if defined(__arm__) && !defined(_STLPORT_VERSION) && !_GLIBCXX_USE_C99
+#if !defined(__ANDROID__) || (defined(__NV_GLIBCXX_VERSION) && __NV_GLIBCXX_VERSION < 40800)
+
+#if defined(__QNX__)
+/* QNX defines functions in namespace std, so abs(long long int) needs to be
+ * declared there. */
+namespace std {
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int abs (long long int a);
+} /* namespace std */
+#elif defined(__HORIZON__)
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#pragma GCC system_header
+#endif /* !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) */
+_LIBCPP_BEGIN_NAMESPACE_STD
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int abs (long long int a) throw();
+_LIBCPP_END_NAMESPACE_STD
+#else /* !(__QNX__ || __HORIZON__) */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int abs(long long int a);
+#endif /* __QNX__ || __HORIZON__ */
+
+#endif /* !__ANDROID__ || (defined(__NV_GLIBCXX_VERSION) && __NV_GLIBCXX_VERSION < 40800) */
+#endif /* __arm__ && !_STLPORT_VERSION && !_GLIBCXX_USE_C99 */
+#endif /* !defined(__HORIZON__) || !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 3800 */
+
+#if defined(__NV_GLIBCXX_VERSION) && __NV_GLIBCXX_VERSION < 40800 && !defined(__ibmxl__)
+
+#if !defined(_STLPORT_VERSION)
+namespace __gnu_cxx
+{
+#endif /* !_STLPORT_VERSION */
+
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int abs(long long int a);
+
+#if !defined(_STLPORT_VERSION)
+}
+#endif /* !_STLPORT_VERSION */
+
+#endif /* defined(__NV_GLIBCXX_VERSION) && __NV_GLIBCXX_VERSION < 40800 && !__ibmxl__ */
+
+namespace std
+{
+  template<typename T> extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ T __pow_helper(T, int);
+  template<typename T> extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ T __cmath_power(T, unsigned int);
+}
+
+using std::abs;
+using std::fabs;
+using std::ceil;
+using std::floor;
+using std::sqrt;
+#if !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 3800
+using std::pow;
+#endif /* !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 3800 */
+using std::log;
+using std::log10;
+using std::fmod;
+using std::modf;
+using std::exp;
+using std::frexp;
+using std::ldexp;
+using std::asin;
+using std::sin;
+using std::sinh;
+using std::acos;
+using std::cos;
+using std::cosh;
+using std::atan;
+using std::atan2;
+using std::tan;
+using std::tanh;
+
+#elif defined(_WIN32)
+
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ __CUDA_MATH_CRTIMP double __cdecl _hypot(double x, double y);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ __CUDA_MATH_CRTIMP float  __cdecl _hypotf(float x, float y);
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __DEVICE_FUNCTIONS_DECL__ int signbit(long double a);
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if _MSC_VER >= 1900
+#define __SIGNBIT_THROW throw()
+#else /* _MSC_VER < 1900 */
+#define __SIGNBIT_THROW
+#endif /* _MSC_VER >= 1900 */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool signbit(long double) __SIGNBIT_THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ __device_builtin__ __CUDA_MATH_CRTIMP int _ldsign(long double);
+#undef __SIGNBIT_THROW
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * 
+ * \brief Return the sign bit of the input.
+ *
+ * Determine whether the floating-point value \p a is negative.
+ *
+ * \return
+ * Reports the sign bit of all values including infinities, zeros, and NaNs.
+ * - With a Visual Studio 2013 or later host compiler (_MSC_VER >= 1800):
+ * __RETURN_TYPE is 'bool'. Returns true if and only if \p a is negative.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a 
+ * nonzero value if and only if \p a is negative. 
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE signbit(double a);
+#undef __RETURN_TYPE 
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#define __RETURN_TYPE bool
+#if _MSC_VER >= 1900
+#define __SIGNBIT_THROW throw()
+#else /* _MSC_VER < 1900 */
+#define __SIGNBIT_THROW
+#endif /* _MSC_VER >= 1900 */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * 
+ * \brief Return the sign bit of the input.
+ *
+ * Determine whether the floating-point value \p a is negative.
+ *
+ * \return
+ * Reports the sign bit of all values including infinities, zeros, and NaNs.
+ * - With a Visual Studio 2013 or later host compiler (_MSC_VER >= 1800):
+ * __RETURN_TYPE is 'bool'. Returns true if and only if \p a is negative.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a 
+ * nonzero value if and only if \p a is negative. 
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ __RETURN_TYPE signbit(double) __SIGNBIT_THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ __device_builtin__ __CUDA_MATH_CRTIMP int _dsign(double);
+#undef __RETURN_TYPE 
+#undef __SIGNBIT_THROW
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * 
+ * \brief Return the sign bit of the input.
+ *
+ * Determine whether the floating-point value \p a is negative.
+ *
+ * \return
+ * Reports the sign bit of all values including infinities, zeros, and NaNs.
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns 
+ * true if and only if \p a is negative.
+ * - With other host compilers: __RETURN_TYPE is 'int'.  Returns a nonzero value 
+ * if and only if \p a is negative.  
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE signbit(float a);
+#undef __RETURN_TYPE
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#define __RETURN_TYPE bool
+#if _MSC_VER >= 1900
+#define __SIGNBIT_THROW throw()
+#else
+#define __SIGNBIT_THROW
+#endif
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * 
+ * \brief Return the sign bit of the input.
+ *
+ * Determine whether the floating-point value \p a is negative.
+ *
+ * \return
+ * Reports the sign bit of all values including infinities, zeros, and NaNs.
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns 
+ * true if and only if \p a is negative.
+ * - With other host compilers: __RETURN_TYPE is 'int'.  Returns a nonzero value 
+ * if and only if \p a is negative.  
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ __RETURN_TYPE signbit(float) __SIGNBIT_THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ __device_builtin__ __CUDA_MATH_CRTIMP int _fdsign(float);
+#undef __RETURN_TYPE
+#undef __SIGNBIT_THROW
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __DEVICE_FUNCTIONS_DECL__ int isinf(long double a);
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isinf(long double a);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * 
+ * \brief Determine whether argument is infinite.
+ *
+ * Determine whether the floating-point value \p a is an infinite value
+ * (positive or negative).
+ * \return
+ * - With Visual Studio 2013 host compiler: Returns true if and only 
+ * if \p a is an infinite value.
+ * - With other host compilers: Returns a nonzero value if and only 
+ * if \p a is an infinite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isinf(double a);
+#undef __RETURN_TYPE
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#define __RETURN_TYPE bool
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * 
+ * \brief Determine whether argument is infinite.
+ *
+ * Determine whether the floating-point value \p a is an infinite value
+ * (positive or negative).
+ * \return
+ * - With Visual Studio 2013 host compiler: Returns true if and only 
+ * if \p a is an infinite value.
+ * - With other host compilers: Returns a nonzero value if and only 
+ * if \p a is an infinite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isinf(double a);
+#undef __RETURN_TYPE
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * 
+ * \brief Determine whether argument is infinite.
+ *
+ * Determine whether the floating-point value \p a is an infinite value
+ * (positive or negative).
+ *
+ * \return
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns 
+ * true if and only if \p a is an infinite value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a nonzero 
+ * value if and only if \p a is an infinite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isinf(float a);
+#undef __RETURN_TYPE
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#define __RETURN_TYPE bool
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * 
+ * \brief Determine whether argument is infinite.
+ *
+ * Determine whether the floating-point value \p a is an infinite value
+ * (positive or negative).
+ *
+ * \return
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns 
+ * true if and only if \p a is an infinite value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a nonzero 
+ * value if and only if \p a is an infinite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isinf(float a);
+#undef __RETURN_TYPE
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __DEVICE_FUNCTIONS_DECL__ int isnan(long double a);
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isnan(long double a);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * 
+ * \brief Determine whether argument is a NaN.
+ *
+ * Determine whether the floating-point value \p a is a NaN.
+ * \return
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. 
+ * Returns true if and only if \p a is a NaN value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a 
+ * nonzero value if and only if \p a is a NaN value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isnan(double a);
+#undef __RETURN_TYPE
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#define __RETURN_TYPE bool
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * 
+ * \brief Determine whether argument is a NaN.
+ *
+ * Determine whether the floating-point value \p a is a NaN.
+ * \return
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. 
+ * Returns true if and only if \p a is a NaN value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a 
+ * nonzero value if and only if \p a is a NaN value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isnan(double a);
+#undef __RETURN_TYPE
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * 
+ * 
+ * \brief Determine whether argument is a NaN.
+ *
+ * Determine whether the floating-point value \p a is a NaN.
+ * \return
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. 
+ * Returns true if and only if \p a is a NaN value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a 
+ * nonzero value if and only if \p a is a NaN value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isnan(float a);
+#undef __RETURN_TYPE
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#define __RETURN_TYPE bool
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * 
+ * 
+ * \brief Determine whether argument is a NaN.
+ *
+ * Determine whether the floating-point value \p a is a NaN.
+ * \return
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. 
+ * Returns true if and only if \p a is a NaN value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a 
+ * nonzero value if and only if \p a is a NaN value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isnan(float a);
+#undef __RETURN_TYPE
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __DEVICE_FUNCTIONS_DECL__ int isfinite(long double a);
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isfinite(long double a);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * 
+ * \brief Determine whether argument is finite.
+ *
+ * Determine whether the floating-point value \p a is a finite value
+ * (zero, subnormal, or normal and not infinity or NaN).
+ *
+ * \return
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns
+ * true if and only if \p a is a finite value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns 
+ * a nonzero value if and only if \p a is a finite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isfinite(double a);
+#undef __RETURN_TYPE
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#define __RETURN_TYPE bool
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * 
+ * \brief Determine whether argument is finite.
+ *
+ * Determine whether the floating-point value \p a is a finite value
+ * (zero, subnormal, or normal and not infinity or NaN).
+ *
+ * \return
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns
+ * true if and only if \p a is a finite value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns 
+ * a nonzero value if and only if \p a is a finite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isfinite(double a);
+#undef __RETURN_TYPE
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Determine whether argument is finite.
+ *
+ * Determine whether the floating-point value \p a is a finite value
+ * (zero, subnormal, or normal and not infinity or NaN).
+ *
+ * \return
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns
+ * true if and only if \p a is a finite value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns 
+ * a nonzero value if and only if \p a is a finite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isfinite(float a);
+#undef __RETURN_TYPE
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#define __RETURN_TYPE bool
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Determine whether argument is finite.
+ *
+ * Determine whether the floating-point value \p a is a finite value
+ * (zero, subnormal, or normal and not infinity or NaN).
+ *
+ * \return
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns
+ * true if and only if \p a is a finite value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns 
+ * a nonzero value if and only if \p a is a finite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isfinite(float a);
+#undef __RETURN_TYPE
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+template<class T> extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ T _Pow_int(T, int);
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the absolute value of the input \p long \p long \p int argument.
+ *
+ * Calculate the absolute value of the input argument \p a.
+ *
+ * \return
+ * Returns the absolute value of the input argument.
+ * - abs(\p LLONG_MIN) is \p Undefined
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int abs(long long int a);
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+template<class T> extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ T _Pow_int(T, int) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int abs(long long int) throw();
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#endif /* __CUDACC_RTC__ */
+
+#if __cplusplus >= 201103L
+#define __NV_NOEXCEPT noexcept
+#else /* !(__cplusplus >= 201103L) */
+#define __NV_NOEXCEPT throw()
+#endif /* __cplusplus >= 201103L */
+
+#if defined(_LIBCPP_VERSION) && defined(_LIBCPP_BEGIN_NAMESPACE_STD) && !defined(_STLPORT_VERSION)
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++11-extensions"
+#endif /* __clang__ */
+#if _LIBCPP_VERSION < 3800
+_LIBCPP_BEGIN_NAMESPACE_STD
+#endif /* _LIBCPP_VERSION < 3800 */
+#elif defined(__GNUC__) && !defined(_STLPORT_VERSION)
+namespace std {
+#endif /* defined(_LIBCPP_VERSION) && defined(_LIBCPP_BEGIN_NAMESPACE_STD) && !defined(_STLPORT_VERSION) ||
+          __GNUC__ && !_STLPORT_VERSION */
+
+#if defined(__CUDACC_RTC__) || defined(__GNUC__)
+
+#if defined(__CUDACC_RTC__) || \
+    (defined(__NV_GLIBCXX_VERSION) && __NV_GLIBCXX_VERSION >= 40800) || \
+    defined(__ibmxl__)
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int abs(long long int);
+#endif /* __CUDACC_RTC__ ||
+          (defined(__NV_GLIBCXX_VERSION) && __NV_GLIBCXX_VERSION >= 40800) ||
+          __ibmxl__ */
+
+#endif /* __CUDACC_RTC__ || __GNUC__ */
+
+#if defined(__CUDACC_RTC__) || \
+    (!defined(_MSC_VER) || _MSC_VER < 1800) && \
+    (!defined(_LIBCPP_VERSION) || (_LIBCPP_VERSION < 1101))
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the absolute value of the input \p long \p int argument.
+ *
+ * Calculate the absolute value of the input argument \p a.
+ *
+ * \return
+ * Returns the absolute value of the input argument.
+ * - abs(\p LONG_MIN) is \p Undefined
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long int __cdecl abs(long int a);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl abs(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ double   __cdecl abs(double);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl fabs(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl ceil(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl floor(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl sqrt(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl pow(float, float);
+
+#if !defined(__QNX__)
+     
+#if defined(__GNUC__) && __cplusplus >= 201103L && !defined(_LIBCPP_VERSION)
+template<typename _Tp, typename _Up>
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__
+typename __gnu_cxx::__promote_2<_Tp, _Up>::__type pow(_Tp, _Up);
+#else  /* !(defined(__GNUC__) && __cplusplus >= 201103L && !defined(_LIBCPP_VERSION)) */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl pow(float, int);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ double   __cdecl pow(double, int);
+#endif  /* defined(__GNUC__) && __cplusplus >= 201103L && !defined(_LIBCPP_VERSION) */
+     
+#endif  /* !defined(__QNX__) */
+
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl log(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl log10(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl fmod(float, float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl modf(float, float*);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl exp(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl frexp(float, int*);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl ldexp(float, int);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl asin(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl sin(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl sinh(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl acos(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl cos(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl cosh(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl atan(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl atan2(float, float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl tan(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl tanh(float);
+#else /* __CUDACC_RTC__ ||
+         (!defined(_MSC_VER) || _MSC_VER < 1800) &&
+         (!defined(_LIBCPP_VERSION) || (_LIBCPP_VERSION < 1101)) */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long int __cdecl abs(long int) throw();
+#if defined(_LIBCPP_VERSION)
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int __cdecl abs(long long int) throw();
+#endif /* defined(_LIBCPP_VERSION) */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl abs(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ double   __cdecl abs(double) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl fabs(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl ceil(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl floor(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl sqrt(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl pow(float, float) throw();
+#if defined(_LIBCPP_VERSION)
+#if (defined (__ANDROID__) || defined(__HORIZON__)) && (_LIBCPP_VERSION >= 9000)
+template <class _A1, class _A2>
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__
+#if _LIBCPP_VERSION >= 14000
+typename std::__enable_if_t
+#else /* _LIBCPP_VERSION < 14000 */
+typename std::_EnableIf
+#endif /*  _LIBCPP_VERSION >= 14000 */
+<
+    std::is_arithmetic<_A1>::value &&
+    std::is_arithmetic<_A2>::value,
+    std::__promote<_A1, _A2>
+>::type pow(_A1 __lcpp_x, _A2 __lcpp_y) __NV_NOEXCEPT;
+#elif (defined(__APPLE__) && __clang_major__ >= 7) || _LIBCPP_VERSION >= 3800 || defined(__QNX__)
+template <class _Tp, class _Up>
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__
+#if defined(__QNX__) && (_LIBCPP_VERSION >= 160000)
+typename std::__enable_if_t <
+#elif _LIBCPP_VERSION >= 13000
+typename std::enable_if <
+#else /* _LIBCPP_VERSION < 13000 */
+typename std::__lazy_enable_if <
+#endif /* defined(__QNX__) && (_LIBCPP_VERSION >= 160000) */
+  std::is_arithmetic<_Tp>::value && std::is_arithmetic<_Up>::value,
+  std::__promote<_Tp, _Up>
+>::type pow(_Tp __x, _Up __y) __NV_NOEXCEPT;
+#else /* !((__APPLE__ && __clang_major__ >= 7) || _LIBCPP_VERSION >= 3800 || defined(__QNX__)) */
+template <class _Tp, class _Up>
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__
+typename enable_if <
+  std::is_arithmetic<_Tp>::value && std::is_arithmetic<_Up>::value,
+  typename std::__promote<_Tp, _Up>::type
+>::type pow(_Tp __x, _Up __y) __NV_NOEXCEPT;
+#endif /* (defined(__ANDROID__) || defined(__HORIZON__)) && (_LIBCPP_VERSION >= 9000) */
+#else /* !defined(_LIBCPP_VERSION) */
+#if !(defined(__GNUC__) && __cplusplus >= 201103L)
+#if (defined(_MSC_VER) && (_MSC_VER >= 1928)) && !(defined __CUDA_INTERNAL_SKIP_CPP_HEADERS__)
+template <class _Ty1, class _Ty2, ::std:: enable_if_t< ::std:: is_arithmetic_v<_Ty1> && ::std:: is_arithmetic_v<_Ty2>, int> > [[nodiscard]] __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ ::std:: _Common_float_type_t<_Ty1, _Ty2> __cdecl pow(_Ty1 _Left, _Ty2 _Right) noexcept;
+#else
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl pow(float, int) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ double   __cdecl pow(double, int) throw();
+#endif /* (defined(_MSC_VER) && (_MSC_VER >= 1928)) && !(defined __CUDA_INTERNAL_SKIP_CPP_HEADERS__) */
+#endif /* !(defined(__GNUC__) && __cplusplus >= 201103L) */
+#endif /* defined(_LIBCPP_VERSION) */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl log(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl log10(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl fmod(float, float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl modf(float, float*) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl exp(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl frexp(float, int*) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl ldexp(float, int) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl asin(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl sin(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl sinh(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl acos(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl cos(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl cosh(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl atan(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl atan2(float, float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl tan(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl tanh(float) throw();
+#endif /* __CUDACC_RTC__ ||
+          (!defined(_MSC_VER) || _MSC_VER < 1800) &&
+          (!defined(_LIBCPP_VERSION) || (_LIBCPP_VERSION < 1101)) */
+
+#if defined(_LIBCPP_VERSION) && defined(_LIBCPP_END_NAMESPACE_STD) && !defined(_STLPORT_VERSION)
+#if _LIBCPP_VERSION < 3800
+_LIBCPP_END_NAMESPACE_STD
+#endif /* _LIBCPP_VERSION < 3800 */
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif /* __clang__ */
+#elif defined(__GNUC__) && !defined(_STLPORT_VERSION)
+}
+#endif /* defined(_LIBCPP_VERSION) && defined(_LIBCPP_END_NAMESPACE_STD) && !defined(_STLPORT_VERSION) ||
+          __GNUC__ && !_STLPORT_VERSION */
+
+#undef __DEVICE_FUNCTIONS_DECL__
+#undef __NV_NOEXCEPT
+
+#if defined(__CUDACC_RTC__)
+#define __MATH_FUNCTIONS_DECL__ __host__ __device__
+#define __MATH_FUNCTIONS_DEVICE_DECL__ __device__
+#else /* __CUDACC_RTC__ */
+#define __MATH_FUNCTIONS_DECL__ static inline __host__ __device__ __cudart_builtin__
+#define __MATH_FUNCTIONS_DEVICE_DECL__ static inline __device__ __cudart_builtin__
+#endif /* __CUDACC_RTC__ */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#if defined(__QNX__) || (defined(_LIBCPP_VERSION) && _LIBCPP_VERSION >= 3800)
+#if defined(__QNX__) && (!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 8000)
+#if defined(_LIBCPP_VERSION)
+#define __NV_NOEXCEPT _NOEXCEPT
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+#define __NV_NOEXCEPT
+namespace std {
+__host__ __device__ __cudart_builtin__ int ilogbf(float a);
+#endif
+#else /* !(defined(__QNX__) && (!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 8000)) */
+#define __NV_NOEXCEPT _NOEXCEPT
+#endif /* defined(__QNX__) && (!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 8000) */
+__host__ __device__ __cudart_builtin__ float logb(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ int ilogb(float a) __NV_NOEXCEPT;
+
+__host__ __device__ __cudart_builtin__ float scalbn(float a, int b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float scalbln(float a, long int b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float exp2(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float expm1(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float log2(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float log1p(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float acosh(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float asinh(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float atanh(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float hypot(float a, float b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float cbrt(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float erf(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float erfc(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float lgamma(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float tgamma(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float copysign(float a, float b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float nextafter(float a, float b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float remainder(float a, float b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float remquo(float a, float b, int *quo) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float round(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ long int lround(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ long long int llround(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float trunc(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float rint(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ long int lrint(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ long long int llrint(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float nearbyint(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float fdim(float a, float b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float fma(float a, float b, float c) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float fmax(float a, float b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float fmin(float a, float b) __NV_NOEXCEPT;
+#if defined(__QNX__) && (!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 8000)
+#if defined(_LIBCPP_VERSION)
+_LIBCPP_END_NAMESPACE_STD
+using _VSTD::logb;
+using _VSTD::ilogb;
+using _VSTD::scalbn;
+using _VSTD::scalbln;
+using _VSTD::exp2;
+using _VSTD::expm1;
+using _VSTD::log2;
+using _VSTD::log1p;
+using _VSTD::acosh;
+using _VSTD::asinh;
+using _VSTD::atanh;
+using _VSTD::hypot;
+using _VSTD::cbrt;
+using _VSTD::erf;
+using _VSTD::erfc;
+using _VSTD::lgamma;
+using _VSTD::tgamma;
+using _VSTD::copysign;
+using _VSTD::nextafter;
+using _VSTD::remainder;
+using _VSTD::remquo;
+using _VSTD::round;
+using _VSTD::lround;
+using _VSTD::llround;
+using _VSTD::trunc;
+using _VSTD::rint;
+using _VSTD::lrint;
+using _VSTD::llrint;
+using _VSTD::nearbyint;
+using _VSTD::fdim;
+using _VSTD::fma;
+using _VSTD::fmax;
+using _VSTD::fmin;
+#else
+}
+#endif
+#endif /* defined(__QNX__) && (!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 8000) */
+#undef __NV_NOEXCEPT
+#else /* !(defined(__QNX__ ) || (defined(_LIBCPP_VERSION) && _LIBCPP_VERSION >= 3800)) */
+#if ((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L)
+namespace std {
+__host__ __device__ __cudart_builtin__ constexpr float logb(float a);
+__host__ __device__ __cudart_builtin__ constexpr int ilogb(float a);
+__host__ __device__ __cudart_builtin__ constexpr float scalbn(float a, int b);
+__host__ __device__ __cudart_builtin__ constexpr float scalbln(float a, long int b);
+__host__ __device__ __cudart_builtin__ constexpr float exp2(float a);
+__host__ __device__ __cudart_builtin__ constexpr float expm1(float a);
+__host__ __device__ __cudart_builtin__ constexpr float log2(float a);
+__host__ __device__ __cudart_builtin__ constexpr float log1p(float a);
+__host__ __device__ __cudart_builtin__ constexpr float acosh(float a);
+__host__ __device__ __cudart_builtin__ constexpr float asinh(float a);
+__host__ __device__ __cudart_builtin__ constexpr float atanh(float a);
+__host__ __device__ __cudart_builtin__ constexpr float hypot(float a, float b);
+__host__ __device__ __cudart_builtin__ constexpr float cbrt(float a);
+__host__ __device__ __cudart_builtin__ constexpr float erf(float a);
+__host__ __device__ __cudart_builtin__ constexpr float erfc(float a);
+__host__ __device__ __cudart_builtin__ constexpr float lgamma(float a);
+__host__ __device__ __cudart_builtin__ constexpr float tgamma(float a);
+__host__ __device__ __cudart_builtin__ constexpr float copysign(float a, float b);
+__host__ __device__ __cudart_builtin__ constexpr float nextafter(float a, float b);
+__host__ __device__ __cudart_builtin__ constexpr float remainder(float a, float b);
+__host__ __device__ __cudart_builtin__ float remquo(float a, float b, int *quo);
+__host__ __device__ __cudart_builtin__ constexpr float round(float a);
+__host__ __device__ __cudart_builtin__ constexpr long int lround(float a);
+__host__ __device__ __cudart_builtin__ constexpr long long int llround(float a);
+__host__ __device__ __cudart_builtin__ constexpr float trunc(float a);
+__host__ __device__ __cudart_builtin__ constexpr float rint(float a);
+__host__ __device__ __cudart_builtin__ constexpr long int lrint(float a);
+__host__ __device__ __cudart_builtin__ constexpr long long int llrint(float a);
+__host__ __device__ __cudart_builtin__ constexpr float nearbyint(float a);
+__host__ __device__ __cudart_builtin__ constexpr float fdim(float a, float b);
+__host__ __device__ __cudart_builtin__ constexpr float fma(float a, float b, float c);
+__host__ __device__ __cudart_builtin__ constexpr float fmax(float a, float b);
+__host__ __device__ __cudart_builtin__ constexpr float fmin(float a, float b);
+}
+#else /* !(((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L)) */
+__MATH_FUNCTIONS_DECL__ float logb(float a);
+
+__MATH_FUNCTIONS_DECL__ int ilogb(float a);
+
+__MATH_FUNCTIONS_DECL__ float scalbn(float a, int b);
+
+__MATH_FUNCTIONS_DECL__ float scalbln(float a, long int b);
+
+__MATH_FUNCTIONS_DECL__ float exp2(float a);
+
+__MATH_FUNCTIONS_DECL__ float expm1(float a);
+
+__MATH_FUNCTIONS_DECL__ float log2(float a);
+
+__MATH_FUNCTIONS_DECL__ float log1p(float a);
+
+__MATH_FUNCTIONS_DECL__ float acosh(float a);
+
+__MATH_FUNCTIONS_DECL__ float asinh(float a);
+
+__MATH_FUNCTIONS_DECL__ float atanh(float a);
+
+__MATH_FUNCTIONS_DECL__ float hypot(float a, float b);
+
+__MATH_FUNCTIONS_DECL__ float cbrt(float a);
+
+__MATH_FUNCTIONS_DECL__ float erf(float a);
+
+__MATH_FUNCTIONS_DECL__ float erfc(float a);
+
+__MATH_FUNCTIONS_DECL__ float lgamma(float a);
+
+__MATH_FUNCTIONS_DECL__ float tgamma(float a);
+
+__MATH_FUNCTIONS_DECL__ float copysign(float a, float b);
+
+__MATH_FUNCTIONS_DECL__ float nextafter(float a, float b);
+
+__MATH_FUNCTIONS_DECL__ float remainder(float a, float b);
+
+__MATH_FUNCTIONS_DECL__ float remquo(float a, float b, int *quo);
+
+__MATH_FUNCTIONS_DECL__ float round(float a);
+
+__MATH_FUNCTIONS_DECL__ long int lround(float a);
+
+__MATH_FUNCTIONS_DECL__ long long int llround(float a);
+
+__MATH_FUNCTIONS_DECL__ float trunc(float a);
+
+__MATH_FUNCTIONS_DECL__ float rint(float a);
+
+__MATH_FUNCTIONS_DECL__ long int lrint(float a);
+
+__MATH_FUNCTIONS_DECL__ long long int llrint(float a);
+
+__MATH_FUNCTIONS_DECL__ float nearbyint(float a);
+
+__MATH_FUNCTIONS_DECL__ float fdim(float a, float b);
+
+__MATH_FUNCTIONS_DECL__ float fma(float a, float b, float c);
+
+__MATH_FUNCTIONS_DECL__ float fmax(float a, float b);
+
+__MATH_FUNCTIONS_DECL__ float fmin(float a, float b);
+#endif /* ((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L) */
+#endif /* defined(__QNX__) || (defined(_LIBCPP_VERSION) && _LIBCPP_VERSION >= 3800) */
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __host__ __device__ __cudart_builtin__ float __cdecl logb(float) throw();
+extern __host__ __device__ __cudart_builtin__ int   __cdecl ilogb(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl scalbn(float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl scalbln(float, long int) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl exp2(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl expm1(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl log2(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl log1p(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl acosh(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl asinh(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl atanh(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl hypot(float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl cbrt(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl erf(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl erfc(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl lgamma(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl tgamma(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl copysign(float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl nextafter(float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl remainder(float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl remquo(float, float, int *) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl round(float) throw();
+extern __host__ __device__ __cudart_builtin__ long int      __cdecl lround(float) throw();
+extern __host__ __device__ __cudart_builtin__ long long int __cdecl llround(float) throw();
+extern __host__ __device__ __cudart_builtin__ float         __cdecl trunc(float) throw();
+extern __host__ __device__ __cudart_builtin__ float         __cdecl rint(float) throw();
+extern __host__ __device__ __cudart_builtin__ long int      __cdecl lrint(float) throw();
+extern __host__ __device__ __cudart_builtin__ long long int __cdecl llrint(float) throw();
+extern __host__ __device__ __cudart_builtin__ float         __cdecl nearbyint(float) throw();
+extern __host__ __device__ __cudart_builtin__ float         __cdecl fdim(float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float         __cdecl fma(float, float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float         __cdecl fmax(float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float         __cdecl fmin(float, float) throw();
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+__MATH_FUNCTIONS_DECL__ float exp10(const float a);
+
+__MATH_FUNCTIONS_DECL__ float rsqrt(const float a);
+
+__MATH_FUNCTIONS_DECL__ float rcbrt(const float a);
+
+__MATH_FUNCTIONS_DECL__ float sinpi(const float a);
+
+__MATH_FUNCTIONS_DECL__ float cospi(const float a);
+
+__MATH_FUNCTIONS_DECL__ void sincospi(const float a, float *const sptr, float *const cptr);
+
+__MATH_FUNCTIONS_DECL__ void sincos(const float a, float *const sptr, float *const cptr);
+
+__MATH_FUNCTIONS_DECL__ float j0(const float a);
+
+__MATH_FUNCTIONS_DECL__ float j1(const float a);
+
+__MATH_FUNCTIONS_DECL__ float jn(const int n, const float a);
+
+__MATH_FUNCTIONS_DECL__ float y0(const float a);
+
+__MATH_FUNCTIONS_DECL__ float y1(const float a);
+
+__MATH_FUNCTIONS_DECL__ float yn(const int n, const float a);
+
+__MATH_FUNCTIONS_DEVICE_DECL__ float cyl_bessel_i0(const float a);
+
+__MATH_FUNCTIONS_DEVICE_DECL__ float cyl_bessel_i1(const float a);
+
+__MATH_FUNCTIONS_DECL__ float erfinv(const float a);
+
+__MATH_FUNCTIONS_DECL__ float erfcinv(const float a);
+
+__MATH_FUNCTIONS_DECL__ float normcdfinv(const float a);
+
+__MATH_FUNCTIONS_DECL__ float normcdf(const float a);
+
+__MATH_FUNCTIONS_DECL__ float erfcx(const float a);
+
+__MATH_FUNCTIONS_DECL__ double copysign(const double a, const float b);
+
+__MATH_FUNCTIONS_DECL__ double copysign(const float a, const double b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned int min(const unsigned int a, const unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p int and \p unsigned \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned int min(const int a, const unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p int and \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned int min(const unsigned int a, const int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ long int min(const long int a, const long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int min(const unsigned long int a, const unsigned long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p long \p int and \p unsigned \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int min(const long int a, const unsigned long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p long \p int and \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int min(const unsigned long int a, const long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p long \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ long long int min(const long long int a, const long long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p long \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long long int min(const unsigned long long int a, const unsigned long long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p long \p long \p int and \p unsigned \p long \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long long int min(const long long int a, const unsigned long long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p long \p long \p int and \p long \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long long int min(const unsigned long long int a, const long long int b);
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the minimum value of the input \p float arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ * Behavior is equivalent to ::fminf() function.
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ float min(const float a, const float b);
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the minimum value of the input \p double arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ * Behavior is equivalent to ::fmin() function.
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ double min(const double a, const double b);
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the minimum value of the input \p float and \p double arguments.
+ *
+ * Convert \p float argument \p a to \p double, followed by ::fmin().
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ double min(const float a, const double b);
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the minimum value of the input \p double and \p float arguments.
+ *
+ * Convert \p float argument \p b to \p double, followed by ::fmin().
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ double min(const double a, const float b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned int max(const unsigned int a, const unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p int and \p unsigned \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned int max(const int a, const unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p int and \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned int max(const unsigned int a, const int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ long int max(const long int a, const long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int max(const unsigned long int a, const unsigned long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p long \p int and \p unsigned \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int max(const long int a, const unsigned long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p long \p int and \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int max(const unsigned long int a, const long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p long \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ long long int max(const long long int a, const long long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p long \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long long int max(const unsigned long long int a, const unsigned long long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p long \p long \p int and \p unsigned \p long \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long long int max(const long long int a, const unsigned long long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p long \p long \p int and \p long \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long long int max(const unsigned long long int a, const long long int b);
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the maximum value of the input \p float arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ * Behavior is equivalent to ::fmaxf() function.
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ float max(const float a, const float b);
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the maximum value of the input \p double arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ * Behavior is equivalent to ::fmax() function.
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ double max(const double a, const double b);
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the maximum value of the input \p float and \p double arguments.
+ *
+ * Convert \p float argument \p a to \p double, followed by ::fmax().
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ double max(const float a, const double b);
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the maximum value of the input \p double and \p float arguments.
+ *
+ * Convert \p float argument \p b to \p double, followed by ::fmax().
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ double max(const double a, const float b);
+
+#undef __MATH_FUNCTIONS_DECL__
+#undef __MATH_FUNCTIONS_DEVICE_DECL__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+#undef EXCLUDE_FROM_RTC
+
+extern "C"{ /* the wrapper below gets C linkage */
+inline __device__ void *__nv_aligned_device_malloc(size_t size, size_t align) /* device-only thin wrapper around __nv_aligned_device_malloc_impl */
+{
+  __device__ void *__nv_aligned_device_malloc_impl(size_t, size_t); /* block-scope declaration; the definition is not visible in this chunk */
+  return __nv_aligned_device_malloc_impl(size, align); /* size and align are forwarded unchanged; the result is returned as-is */
+}
+} /* extern "C" */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#define EXCLUDE_FROM_RTC
+
+#if !defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+* ONLY FOR HOST CODE! NOT FOR DEVICE EXECUTION                                 *
+*                                                                              *
+*******************************************************************************/
+
+#include <crt/func_macro.h>
+
+#if defined(_WIN32)
+#pragma warning (push)
+#pragma warning (disable : 4211)
+
+#endif /* _WIN32 */
+
+__func__(double rsqrt(double a));
+
+__func__(double rcbrt(double a));
+
+__func__(double sinpi(double a));
+
+__func__(double cospi(double a));
+
+__func__(void sincospi(double a, double *sptr, double *cptr));
+
+__func__(double erfinv(double a));
+
+__func__(double erfcinv(double a));
+
+__func__(double normcdfinv(double a));
+
+__func__(double normcdf(double a));
+
+__func__(double erfcx(double a));
+
+__func__(float rsqrtf(float a));
+
+__func__(float rcbrtf(float a));
+
+__func__(float sinpif(float a));
+
+__func__(float cospif(float a));
+
+__func__(void sincospif(float a, float *sptr, float *cptr));
+
+__func__(float erfinvf(float a));
+
+__func__(float erfcinvf(float a));
+
+__func__(float normcdfinvf(float a));
+
+__func__(float normcdff(float a));
+
+__func__(float erfcxf(float a));
+
+__func__(int min(int a, int b));
+
+__func__(unsigned int umin(unsigned int a, unsigned int b));
+
+__func__(long long int llmin(long long int a, long long int b));
+
+__func__(unsigned long long int ullmin(unsigned long long int a, unsigned long long int b));
+
+__func__(int max(int a, int b));
+
+__func__(unsigned int umax(unsigned int a, unsigned int b));
+
+__func__(long long int llmax(long long int a, long long int b));
+
+__func__(unsigned long long int ullmax(unsigned long long int a, unsigned long long int b));
+
+#if defined(_WIN32) || defined(__APPLE__) || defined (__ANDROID__)
+
+__func__(int __isnan(double a));
+
+#endif /* _WIN32 || __APPLE__ || __ANDROID__ */
+
+#if defined(_WIN32) || defined(__APPLE__) || defined (__QNX__)
+
+__func__(void sincos(double a, double *sptr, double *cptr));
+
+#endif /* _WIN32 || __APPLE__ || __QNX__ */
+
+#if defined(_WIN32) || defined(__APPLE__)
+
+__func__(double exp10(double a));
+
+__func__(float exp10f(float a));
+
+__func__(void sincosf(float a, float *sptr, float *cptr));
+
+__func__(int __isinf(double a));
+
+#endif /* _WIN32 || __APPLE__ */
+
+#if (defined(_WIN32) && (!defined(_MSC_VER) || _MSC_VER < 1800)) || defined (__ANDROID__)
+
+__func__(double log2(double a));
+
+#endif /* (_WIN32 && (!defined(_MSC_VER) || _MSC_VER < 1800)) || __ANDROID__ */
+
+#if defined(_WIN32)
+
+__func__(int __signbit(double a));
+
+__func__(int __finite(double a));
+
+__func__(int __signbitl(long double a));
+
+__func__(int __signbitf(float a));
+
+__func__(int __finitel(long double a));
+
+__func__(int __finitef(float a));
+
+__func__(int __isinfl(long double a));
+
+__func__(int __isinff(float a));
+
+__func__(int __isnanl(long double a));
+
+__func__(int __isnanf(float a));
+
+#endif /* _WIN32 */
+
+#if defined(_WIN32) && (!defined(_MSC_VER) || _MSC_VER < 1800)
+
+__func__(double copysign(double a, double b));
+
+__func__(double fmax(double a, double b));
+
+__func__(double fmin(double a, double b));
+
+__func__(double trunc(double a));
+
+__func__(double round(double a));
+
+__func__(long int lround(double a));
+
+__func__(long long int llround(double a));
+
+__func__(double rint(double a));
+
+__func__(double nearbyint(double a));
+
+__func__(long int lrint(double a));
+
+__func__(long long int llrint(double a));
+
+__func__(double fdim(double a, double b));
+
+__func__(double scalbn(double a, int b));
+
+__func__(double scalbln(double a, long int b));
+
+__func__(double exp2(double a));
+
+__func__(double log1p(double a));
+
+__func__(double expm1(double a));
+
+__func__(double cbrt(double a));
+
+__func__(double acosh(double a));
+
+__func__(double asinh(double a));
+
+__func__(double atanh(double a));
+
+__func__(int ilogb(double a));
+
+__func__(double logb(double a));
+
+__func__(double remquo(double a, double b, int *quo));
+
+__func__(double remainder(double a, double b));
+
+__func__(double fma (double a, double b, double c));
+
+__func__(double nextafter(double a, double b));
+
+__func__(double erf(double a));
+
+__func__(double erfc(double a));
+
+__func__(double lgamma(double a));
+
+__func__(unsigned long long int __internal_host_nan_kernel(const char *s));
+
+__func__(double nan(const char *tagp));
+
+__func__(double __host_tgamma_kernel(double a));
+
+__func__(double __host_stirling_poly(double a));
+
+__func__(double __host_tgamma_stirling(double a));
+
+__func__(double tgamma(double a));
+
+__func__(float fmaxf(float a, float b));
+
+__func__(float fminf(float a, float b));
+
+__func__(float roundf(float a));
+
+__func__(long int lroundf(float a));
+
+__func__(long long int llroundf(float a));
+
+__func__(float truncf(float a));
+
+__func__(float rintf(float a));
+
+__func__(float nearbyintf(float a));
+
+__func__(long int lrintf(float a));
+
+__func__(long long int llrintf(float a));
+
+__func__(float logbf(float a));
+
+__func__(float scalblnf(float a, long int b));
+
+__func__(float log2f(float a));
+
+__func__(float exp2f(float a));
+
+__func__(float acoshf(float a));
+
+__func__(float asinhf(float a));
+
+__func__(float atanhf(float a));
+
+__func__(float cbrtf(float a));
+
+__func__(float expm1f(float a));
+
+__func__(float fdimf(float a, float b));
+
+__func__(float log1pf(float a));
+
+__func__(float scalbnf(float a, int b));
+
+__func__(float fmaf(float a, float b, float c));
+
+__func__(int ilogbf(float a));
+
+__func__(float erff(float a));
+
+__func__(float erfcf(float a));
+
+__func__(float lgammaf(float a));
+
+__func__(float tgammaf(float a));
+
+__func__(float remquof(float a, float b, int *quo));
+
+__func__(float remainderf(float a, float b));
+
+__func__(float copysignf(float a, float b));
+
+__func__(float nextafterf(float a, float b));
+
+__func__(float nanf(const char *tagp));
+
+#endif /* _WIN32 && (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if defined(_WIN32)
+#pragma warning (pop)
+#endif /* _WIN32 */
+
+#endif /* !__CUDACC__ */
+
+#undef EXCLUDE_FROM_RTC
+
+#if !defined(__CUDACC_RTC__)
+
+#include "math_functions.hpp"
+
+#endif /* !__CUDACC_RTC__ */
+
+#endif /* !__MATH_FUNCTIONS_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/math_functions.hpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/math_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cc09b915ea07f8ef376f5c3640f963a09e86dbfd
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/math_functions.hpp
@@ -0,0 +1,3398 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/math_functions.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/math_functions.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_HPP__
+#endif
+
+#if !defined(__MATH_FUNCTIONS_HPP__)
+#define __MATH_FUNCTIONS_HPP__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "host_defines.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__CUDACC_RTC__)
+
+__host__ __device__ __cudart_builtin__ int signbit(const float x) { return __signbitf(x); } /* __CUDACC_RTC__ branch: float overload forwards to __signbitf */
+__host__ __device__ __cudart_builtin__ int signbit(const double x) { return __signbit(x); } /* double overload forwards to __signbit */
+__host__ __device__ __cudart_builtin__ int signbit(const long double x) { return __signbitl(static_cast<double>(x));} /* argument is explicitly cast to double before __signbitl is called */
+
+__host__ __device__ __cudart_builtin__ int isfinite(const float x) { return __finitef(x); } /* __CUDACC_RTC__ branch: float overload forwards to __finitef */
+__host__ __device__ __cudart_builtin__ int isfinite(const double x) { return __finite(x); } /* double overload forwards to __finite */
+__host__ __device__ __cudart_builtin__ int isfinite(const long double x) { return __finitel(static_cast<double>(x)); } /* argument is explicitly cast to double before __finitel is called */
+
+__host__ __device__ __cudart_builtin__ int isnan(const float x) { return __isnanf(x); } /* __CUDACC_RTC__ branch: float overload forwards to __isnanf */
+__host__ __device__ __cudart_builtin__ int isnan(const double x) { return __isnan(x); } /* double overload forwards to __isnan */
+__host__ __device__ __cudart_builtin__ int isnan(const long double x) { return __isnanl(static_cast<double>(x)); } /* argument is explicitly cast to double before __isnanl is called */
+
+__host__ __device__ __cudart_builtin__ int isinf(const float x) { return __isinff(x); } /* __CUDACC_RTC__ branch: float overload forwards to __isinff */
+__host__ __device__ __cudart_builtin__ int isinf(const double x) { return __isinf(x); } /* double overload forwards to __isinf */
+__host__ __device__ __cudart_builtin__ int isinf(const long double x) { return __isinfl(static_cast<double>(x)); } /* argument is explicitly cast to double before __isinfl is called */
+
+__host__ __device__ __cudart_builtin__ long long int abs(const long long int a) { return llabs(a); } /* __CUDACC_RTC__ branch: abs overload set; long long forwards to llabs */
+
+__host__ __device__ __cudart_builtin__ long int  abs(const long int in)        { return llabs(in); } /* also routed through llabs; the result is converted to the long int return type */
+__host__ __device__ __cudart_builtin__ float     abs(const float in)           { return fabsf(in); } /* float overload forwards to fabsf */
+__host__ __device__ __cudart_builtin__ double    abs(const double in)          { return fabs(in); } /* double overload forwards to fabs */
+__host__ __device__ __cudart_builtin__ float     fabs(const float in)          { return fabsf(in); } /* __CUDACC_RTC__ branch: each float overload below forwards to the 'f'-suffixed function of the same name */
+__host__ __device__ __cudart_builtin__ float     ceil(const float in)          { return ceilf(in); }
+__host__ __device__ __cudart_builtin__ float     floor(const float in)         { return floorf(in); }
+__host__ __device__ __cudart_builtin__ float     sqrt(const float in)          { return sqrtf(in); }
+__host__ __device__ __cudart_builtin__ float     pow(const float a, const float b)   { return powf(a, b); }
+extern "C" __device__ float powif(float, int); /* declared here with C linkage and __device__ only; used by pow(float, int) below */
+__host__ __device__ __cudart_builtin__ float     pow(const float a, const int b)     { return powif(a, b); } /* integer-exponent overload forwards to powif */
+extern "C" __device__ double powi(double, int); /* declared here with C linkage and __device__ only; used by pow(double, int) below */
+__host__ __device__ __cudart_builtin__ double    pow(const double a, const int b)    { return powi(a, b); } /* integer-exponent overload forwards to powi */
+__host__ __device__ __cudart_builtin__ float     log(const float in)           { return logf(in); }
+__host__ __device__ __cudart_builtin__ float     log10(const float in)         { return log10f(in); }
+__host__ __device__ __cudart_builtin__ float     fmod(const float a, const float b)  { return fmodf(a, b); }
+__host__ __device__ __cudart_builtin__ float     modf(const float a, float*b)  { return modff(a, b); } /* the pointer argument b is passed straight through to modff */
+__host__ __device__ __cudart_builtin__ float     exp(const float in)           { return expf(in); }
+__host__ __device__ __cudart_builtin__ float     frexp(const float a, int*b)   { return frexpf(a, b); } /* the pointer argument b is passed straight through to frexpf */
+__host__ __device__ __cudart_builtin__ float     ldexp(const float a, int b)   { return ldexpf(a, b); }
+__host__ __device__ __cudart_builtin__ float     asin(const float in)          { return asinf(in); }
+__host__ __device__ __cudart_builtin__ float     sin(const float in)           { return sinf(in); }
+__host__ __device__ __cudart_builtin__ float     sinh(const float in)          { return sinhf(in); }
+__host__ __device__ __cudart_builtin__ float     acos(const float in)          { return acosf(in); }
+__host__ __device__ __cudart_builtin__ float     cos(const float in)           { return cosf(in); }
+__host__ __device__ __cudart_builtin__ float     cosh(const float in)          { return coshf(in); }
+__host__ __device__ __cudart_builtin__ float     atan(const float in)          { return atanf(in); }
+__host__ __device__ __cudart_builtin__ float     atan2(const float a, const float b) { return atan2f(a, b); }
+__host__ __device__ __cudart_builtin__ float     tan(const float in)           { return tanf(in); }
+__host__ __device__ __cudart_builtin__ float     tanh(const float in)          { return tanhf(in); }
+
+#elif defined(__GNUC__)
+
+#undef signbit
+#undef isfinite
+#undef isnan
+#undef isinf
+
+#if defined(_LIBCPP_VERSION)
+extern "C" __device__ float powif(float, int);
+extern "C" __device__ double powi(double, int);
+#endif /* _LIBCPP_VERSION */
+
+#if defined(__APPLE__)
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const float x) { return __signbitf(x); } /* __APPLE__ branch: float overload forwards to __signbitf */
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const double x) { return __signbitd(x); } /* double overload forwards to __signbitd */
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const long double x) { return __signbitl(x);} /* long double overload forwards to __signbitl without a cast */
+
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const float x) { return __isfinitef(x); } /* __APPLE__ branch: float overload forwards to __isfinitef */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const double x) { return __isfinited(x); } /* double overload forwards to __isfinited */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const long double x) { return __isfinite(x); } /* long double overload forwards to the unsuffixed __isfinite */
+
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const double x) throw()  { return __isnand(x); } /* __APPLE__ branch: double overload is always defined and forwards to __isnand */
+#if defined(_LIBCPP_VERSION) && _LIBCPP_VERSION < 7000 /* float and long double overloads are defined only for libc++ versions below 7000 */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const float x) { return __isnanf(x); } /* forwards to __isnanf */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const long double x) { return __isnan(x); } /* forwards to the unsuffixed __isnan */
+#endif /* defined(_LIBCPP_VERSION) && _LIBCPP_VERSION < 7000 */
+
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const double x) throw()  { return __isinfd(x); } /* __APPLE__ branch: double overload is always defined and forwards to __isinfd */
+#if defined(_LIBCPP_VERSION) && _LIBCPP_VERSION < 7000 /* float and long double overloads are defined only for libc++ versions below 7000 */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const float x) { return __isinff(x); } /* forwards to __isinff */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const long double x) { return __isinf(x); } /* forwards to the unsuffixed __isinf */
+#endif /* defined(_LIBCPP_VERSION) && _LIBCPP_VERSION < 7000 */
+#else /* __APPLE__ */
+
+#if ((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L)
+#if defined(__CUDA_ARCH__)
+#define __NV_BUILTIN_FUNC_DECL__ __forceinline__ __host__ __device__ __cudart_builtin__
+#if _GLIBCXX_HAVE_OBSOLETE_ISNAN && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC /* double overloads are provided only under this libstdc++ configuration */
+__NV_BUILTIN_FUNC_DECL__ int  isnan(const double a) throw() { return __isnan(a); } /* forwards to __isnan; this code is only reached when __CUDA_ARCH__ is defined */
+__NV_BUILTIN_FUNC_DECL__ int  isinf(const double x) throw() { return __isinf(x); } /* forwards to __isinf */
+#endif /* _GLIBCXX_HAVE_OBSOLETE_ISNAN && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC */
+#undef __NV_BUILTIN_FUNC_DECL__ /* the helper macro is local to the two definitions above */
+#endif /* __CUDA_ARCH */
+#else /* !(((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L)) */
+
+#if defined(__QNX__)
+/* QNX with libc++: bool-returning signbit overloads.  In device code
+ * (__CUDA_ARCH__ defined) each calls the matching __signbit* helper and
+ * converts the int result to bool via "!= 0"; in host code each forwards to
+ * the signbit<T> template (declared outside this section). */
+#if defined(__QNX__) && defined(_LIBCPP_VERSION)
+static __inline__ __host__ __device__ __cudart_builtin__ bool signbit(const float x)
+{
+#if defined(__CUDA_ARCH__)
+  return (__signbitf(x) != 0);
+#else /* !__CUDA_ARCH__ */
+  return signbit<float>(x);
+#endif /* __CUDA_ARCH__ */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool signbit(const double x)
+{
+#if defined(__CUDA_ARCH__)
+  return (__signbit(x) != 0);
+#else /* !__CUDA_ARCH__ */
+  return signbit<double>(x);
+#endif /* __CUDA_ARCH__ */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool signbit(const long double x)
+{
+#if defined(__CUDA_ARCH__)
+  return (__signbitl(x) != 0);
+#else /* !__CUDA_ARCH__ */
+  return signbit<long double>(x);
+#endif /* __CUDA_ARCH__ */
+}
+#endif /* (__QNX__ && _LIBCPP_VERSION) */
+
+/* QNX: bool-returning isfinite overloads.  Device code calls __finitel /
+ * __finite / __finitef and converts the result to bool; host code forwards to
+ * the isfinite<T> template (declared outside this section). */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isfinite(const long double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__finitel(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isfinite<long double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool isfinite(const double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__finite(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isfinite<double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool isfinite(const float a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__finitef(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isfinite<float>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+
+/* QNX: bool-returning isnan overloads.  Device code calls __isnanl / __isnan /
+ * __isnanf and converts the result to bool; host code forwards to the
+ * isnan<T> template (declared outside this section). */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isnan(const long double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isnanl(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isnan<long double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool isnan(const double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isnan(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isnan<double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool isnan(const float a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isnanf(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isnan<float>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+
+/* QNX: bool-returning isinf overloads.  Device code calls __isinfl / __isinf /
+ * __isinff and converts the result to bool; host code forwards to the
+ * isinf<T> template (declared outside this section). */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isinf(const long double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isinfl(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isinf<long double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool isinf(const double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isinf(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isinf<double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool isinf(const float a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isinff(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isinf<float>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+
+#elif ( (defined(__ANDROID__) || defined(__HORIZON__)) && defined(_LIBCPP_VERSION))
+/* Android/Horizon with libc++: int-returning classification overloads.
+ * Device pass: forward to the __signbit*, __finite*, __isnan*, __isinf* helpers.
+ * Host pass:   forward to the signbit<T>/isfinite<T>/isnan<T>/isinf<T> templates.
+ * The float and long double isnan/isinf overloads are only declared for
+ * _LIBCPP_VERSION < 8000; on the host pass the double isnan/isinf overloads
+ * are not declared here at all (see the "provided by math.h" notes below). */
+#if defined(__CUDA_ARCH__)
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const float x) { return __signbitf(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const double x) { return __signbit(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const long double x) { return __signbitl(x);}
+
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const float x) { return __finitef(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const double x) { return __finite(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const long double x) { return __finitel(x); }
+
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const double x)  { return __isnan(x); }
+#if _LIBCPP_VERSION < 8000
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const float x) { return __isnanf(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const long double x) { return __isnanl(x); }
+#endif  /* _LIBCPP_VERSION < 8000 */
+
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const double x) { return __isinf(x); }
+#if _LIBCPP_VERSION < 8000
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const float x) { return __isinff(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const long double x) { return __isinfl(x); }
+#endif /* _LIBCPP_VERSION < 8000 */
+#else /* !defined(__CUDA_ARCH__) */
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const float x) { return signbit<float>(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const double x) { return signbit<double>(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const long double x) { return signbit<long double>(x);}
+
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const float x) { return isfinite<float>(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const double x) { return isfinite<double>(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const long double x) { return isfinite<long double>(x); }
+
+#if _LIBCPP_VERSION < 8000
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const float x) { return isnan<float>(x); }
+/* int isnan(double) provided by math.h */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const long double x) { return isnan<long double>(x); }
+
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const float x) { return isinf<float>(x); }
+/* int isinf(double) provided by math.h */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const long double x) { return isinf<long double>(x); }
+#endif /* _LIBCPP_VERSION < 8000 */
+
+#endif  /* defined(__CUDA_ARCH__) */
+
+#else /* !__QNX__ */
+/* Fallback branch (neither of the platform branches above was taken): signbit
+ * overloads forwarding to __signbitf / __signbit / __signbitl.  Under the Intel
+ * compiler (__ICC) the double overload additionally carries an empty throw()
+ * specification. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const float x) { return __signbitf(x); }
+#if defined(__ICC)
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const double x) throw() { return __signbit(x); }
+#else /* !__ICC */
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const double x) { return __signbit(x); }
+#endif /* __ICC */
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const long double x) { return __signbitl(x);}
+
+/* Fallback-branch isfinite overloads (float, double, long double).
+ * On Android the helper differs per compilation pass: device code calls
+ * __finitef/__finite/__finitel, host code calls __isfinitef/__isfinite/
+ * __isfinitel.  Elsewhere the __finite* helpers are used unconditionally, and
+ * under __ICC the double overload carries an empty throw() specification. */
+#if defined(__ANDROID__)
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const float x) {
+#if defined(__CUDA_ARCH__)
+  return __finitef(x);
+#else	/* !__CUDA_ARCH__ */
+  return __isfinitef(x);
+#endif /* __CUDA_ARCH__ */
+}
+#else /* !__ANDROID__ */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const float x) { return __finitef(x); }
+#endif  /* __ANDROID__ */
+
+#if defined(__ANDROID__)
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const double x)
+{
+#ifdef __CUDA_ARCH__
+  return __finite(x);
+#else  /* !__CUDA_ARCH__ */
+  return __isfinite(x);
+#endif  /* __CUDA_ARCH__ */
+}
+#elif defined(__ICC)
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const double x) throw() { return __finite(x); }
+#else
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const double x) { return __finite(x); }
+#endif /* __ANDROID__ */
+
+#if defined(__ANDROID__)
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const long double x)
+{
+#ifdef __CUDA_ARCH__
+   return __finitel(x);
+#else /* !__CUDA_ARCH__ */
+   return __isfinitel(x);
+#endif  /* __CUDA_ARCH__ */
+}
+#else /* !__ANDROID__ */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const long double x) { return __finitel(x); }
+#endif  /* __ANDROID__ */
+
+/* Fallback-branch isnan overloads forwarding to __isnanf / __isnan / __isnanl.
+ * The double overload carries an empty throw() specification except on
+ * Android, where it is declared without one. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const float x) { return __isnanf(x); }
+#if defined(__ANDROID__)
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const double x) { return __isnan(x); }
+#else /* !__ANDROID__ */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const double x) throw()  { return __isnan(x); }
+#endif /* __ANDROID__ */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const long double x) { return __isnanl(x); }
+
+/* Fallback-branch isinf overloads forwarding to __isinff / __isinf / __isinfl.
+ * The double overload carries an empty throw() specification except on
+ * Android, where it is declared without one. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const float x) { return __isinff(x); }
+#if defined(__ANDROID__)
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const double x) { return __isinf(x); }
+#else /* !__ANDROID__ */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const double x) throw()  { return __isinf(x); }
+#endif /* __ANDROID__ */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const long double x) { return __isinfl(x); }
+#endif /* __QNX__ || __HORIZON__ */
+
+#endif /* ((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L) */
+#endif /* __APPLE__ */
+
+#if defined(__arm__) && !defined(_STLPORT_VERSION) && !_GLIBCXX_USE_C99
+#if !defined(__ANDROID__) || (!defined(_LIBCPP_VERSION) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8)))
+
+#if !defined(__QNX__) && !defined(__HORIZON__)
+/* abs overload for long long int; forwards to llabs.  Only compiled under the
+ * enclosing __arm__ / non-QNX / non-Horizon preprocessor conditions. */
+static __inline__ __host__ __device__ __cudart_builtin__ long long int abs(const long long int a)
+{
+  return llabs(a);
+}
+#endif /* !__QNX__ && !__HORIZON__*/
+
+#endif /* !defined(__ANDROID__) || (!defined(_LIBCPP_VERSION) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8))) */
+#endif /* __arm__ && !_STLPORT_VERSION && !_GLIBCXX_USE_C99 */
+
+#elif defined(_WIN32)
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+/* Windows branch, only when _MSC_VER is undefined or below 1800 (see the
+ * enclosing #if): int-returning signbit overloads forwarding to
+ * __signbitl / __signbit / __signbitf. */
+static __inline__ __host__ __device__ __cudart_builtin__ int signbit(const long double a)
+{
+  return __signbitl(a);
+}
+
+static __inline__ __host__ __device__ __cudart_builtin__ int signbit(const double a)
+{
+  return __signbit(a);
+}
+
+static __inline__ __host__ __device__ __cudart_builtin__ int signbit(const float a)
+{
+  return __signbitf(a);
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+/* Windows branch isinf overloads (long double, double, float).
+ * When _MSC_VER is undefined or below 1800: int-returning wrappers around
+ * __isinfl / __isinf / __isinff.
+ * Otherwise: bool-returning overloads that use the same helpers in device code
+ * (__CUDA_ARCH__ defined) and forward to the isinf<T> template in host code. */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __host__ __device__ __cudart_builtin__ int isinf(const long double a)
+{
+  return __isinfl(a);
+}
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isinf(const long double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isinfl(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isinf<long double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __host__ __device__ __cudart_builtin__ int isinf(const double a)
+{
+  return __isinf(a);
+}
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isinf(const double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isinf(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isinf<double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __host__ __device__ __cudart_builtin__ int isinf(const float a)
+{
+  return __isinff(a);
+}
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/* NOTE(review): unlike every sibling overload in this section, this one is
+ * declared without __cudart_builtin__ - confirm whether that is intentional. */
+static __inline__ __host__ __device__ bool isinf(const float a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isinff(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isinf<float>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+/* Windows branch isnan overloads (long double, double, float).
+ * When _MSC_VER is undefined or below 1800: int-returning wrappers around
+ * __isnanl / __isnan / __isnanf.
+ * Otherwise: bool-returning overloads that use the same helpers in device code
+ * (__CUDA_ARCH__ defined) and forward to the isnan<T> template in host code. */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __host__ __device__ __cudart_builtin__ int isnan(const long double a)
+{
+  return __isnanl(a);
+}
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isnan(const long double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isnanl(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isnan<long double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __host__ __device__ __cudart_builtin__ int isnan(const double a)
+{
+  return __isnan(a);
+}
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isnan(const double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isnan(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isnan<double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __host__ __device__ __cudart_builtin__ int isnan(const float a)
+{
+  return __isnanf(a);
+}
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isnan(const float a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isnanf(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isnan<float>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+/* Windows branch isfinite overloads (long double, double, float).
+ * When _MSC_VER is undefined or below 1800: int-returning wrappers around
+ * __finitel / __finite / __finitef.
+ * Otherwise: bool-returning overloads that use the same helpers in device code
+ * (__CUDA_ARCH__ defined) and forward to the isfinite<T> template in host code. */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __host__ __device__ __cudart_builtin__ int isfinite(const long double a)
+{
+  return __finitel(a);
+}
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isfinite(const long double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__finitel(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isfinite<long double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __host__ __device__ __cudart_builtin__ int isfinite(const double a)
+{
+  return __finite(a);
+}
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isfinite(const double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__finite(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isfinite<double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __host__ __device__ __cudart_builtin__ int isfinite(const float a)
+{
+  return __finitef(a);
+}
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isfinite(const float a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__finitef(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isfinite<float>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#endif /* __CUDACC_RTC__ */
+
+/* Qualifier macros used by the overloads that follow (both are #undef'd after
+ * the last use).  Under __CUDACC_RTC__ they expand to the execution-space
+ * qualifiers only; otherwise they also add "static inline". */
+#if defined(__CUDACC_RTC__)
+#define __MATH_FUNCTIONS_DECL__ __host__ __device__
+#define __MATH_FUNCTIONS_DEVICE_DECL__ __device__
+#else /* __CUDACC_RTC__ */
+#define __MATH_FUNCTIONS_DECL__ static inline __host__ __device__
+#define __MATH_FUNCTIONS_DEVICE_DECL__ static inline __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__CUDACC_RTC__) || (!defined(_MSC_VER) || _MSC_VER < 1800)
+#if defined(__QNX__) && defined(_LIBCPP_VERSION)
+_LIBCPP_BEGIN_NAMESPACE_STD
+#endif /* __QNX__ && _LIBCPP_VERSION */
+#if !defined(__QNX__) && !(defined(_LIBCPP_VERSION) && _LIBCPP_VERSION >= 3800)
+#if !(((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L))
+/* float overloads of the unsuffixed names logb, ilogb, scalbn, scalbln, exp2,
+ * expm1, log2, log1p, acosh, asinh and atanh.  Each simply forwards to the
+ * corresponding f-suffixed function. */
+__MATH_FUNCTIONS_DECL__ float logb(const float a)
+{
+  return logbf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ int ilogb(const float a)
+{
+  return ilogbf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float scalbn(const float a, const int b)
+{
+  return scalbnf(a, b);
+}
+
+__MATH_FUNCTIONS_DECL__ float scalbln(const float a, const long int b)
+{
+  return scalblnf(a, b);
+}
+
+__MATH_FUNCTIONS_DECL__ float exp2(const float a)
+{
+  return exp2f(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float expm1(const float a)
+{
+  return expm1f(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float log2(const float a)
+{
+  return log2f(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float log1p(const float a)
+{
+  return log1pf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float acosh(const float a)
+{
+  return acoshf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float asinh(const float a)
+{
+  return asinhf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float atanh(const float a)
+{
+  return atanhf(a);
+}
+
+/* float overloads of hypot, cbrt, erf, erfc, lgamma, tgamma, copysign,
+ * nextafter, remainder and remquo.  Each forwards to the corresponding
+ * f-suffixed function; remquo passes the caller's quo pointer through. */
+__MATH_FUNCTIONS_DECL__ float hypot(const float a, const float b)
+{
+  return hypotf(a, b);
+}
+
+__MATH_FUNCTIONS_DECL__ float cbrt(const float a)
+{
+  return cbrtf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float erf(const float a)
+{
+  return erff(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float erfc(const float a)
+{
+  return erfcf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float lgamma(const float a)
+{
+  return lgammaf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float tgamma(const float a)
+{
+  return tgammaf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float copysign(const float a, const float b)
+{
+  return copysignf(a, b);
+}
+
+__MATH_FUNCTIONS_DECL__ float nextafter(const float a, const float b)
+{
+  return nextafterf(a, b);
+}
+
+__MATH_FUNCTIONS_DECL__ float remainder(const float a, const float b)
+{
+  return remainderf(a, b);
+}
+
+__MATH_FUNCTIONS_DECL__ float remquo(const float a, const float b, int *quo)
+{
+  return remquof(a, b, quo);
+}
+
+/* float overloads of the rounding functions round, lround, llround, trunc,
+ * rint, lrint, llrint and nearbyint.  Each forwards to the corresponding
+ * f-suffixed function and keeps its return type (float, long int or
+ * long long int). */
+__MATH_FUNCTIONS_DECL__ float round(const float a)
+{
+  return roundf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ long int lround(const float a)
+{
+  return lroundf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ long long int llround(const float a)
+{
+  return llroundf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float trunc(const float a)
+{
+  return truncf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float rint(const float a)
+{
+  return rintf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ long int lrint(const float a)
+{
+  return lrintf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ long long int llrint(const float a)
+{
+  return llrintf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float nearbyint(const float a)
+{
+  return nearbyintf(a);
+}
+
+/* float overloads of fdim, fma, fmax and fmin.  Each forwards to the
+ * corresponding f-suffixed function. */
+__MATH_FUNCTIONS_DECL__ float fdim(const float a, const float b)
+{
+  return fdimf(a, b);
+}
+
+__MATH_FUNCTIONS_DECL__ float fma(const float a, const float b, const float c)
+{
+  return fmaf(a, b, c);
+}
+
+__MATH_FUNCTIONS_DECL__ float fmax(const float a, const float b)
+{
+  return fmaxf(a, b);
+}
+
+__MATH_FUNCTIONS_DECL__ float fmin(const float a, const float b)
+{
+  return fminf(a, b);
+}
+#endif /* !(((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L)) */
+#endif /* !defined(__QNX__) && !(defined(_LIBCPP_VERSION) && _LIBCPP_VERSION >= 3800) */
+#if defined(__QNX__) && defined(_LIBCPP_VERSION)
+_LIBCPP_END_NAMESPACE_STD
+#endif
+#endif /* __CUDACC_RTC__ || (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+/* float overloads of exp10, rsqrt, rcbrt, sinpi, cospi, sincospi and sincos.
+ * Each forwards to the corresponding f-suffixed function; the two sincos*
+ * variants return nothing and write their results through sptr and cptr. */
+__MATH_FUNCTIONS_DECL__ float exp10(const float a)
+{
+  return exp10f(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float rsqrt(const float a)
+{
+  return rsqrtf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float rcbrt(const float a)
+{
+  return rcbrtf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float sinpi(const float a)
+{
+  return sinpif(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float cospi(const float a)
+{
+  return cospif(a);
+}
+
+__MATH_FUNCTIONS_DECL__ void sincospi(const float a, float *const sptr, float *const cptr)
+{
+  sincospif(a, sptr, cptr);
+}
+
+__MATH_FUNCTIONS_DECL__ void sincos(const float a, float *const sptr, float *const cptr)
+{
+  sincosf(a, sptr, cptr);
+}
+
+/* float overloads of j0, j1, jn, y0, y1, yn, cyl_bessel_i0 and cyl_bessel_i1.
+ * Each forwards to the corresponding f-suffixed function.  The two
+ * cyl_bessel_* overloads use __MATH_FUNCTIONS_DEVICE_DECL__, i.e. they are
+ * declared __device__ only rather than __host__ __device__. */
+__MATH_FUNCTIONS_DECL__ float j0(const float a)
+{
+  return j0f(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float j1(const float a)
+{
+  return j1f(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float jn(const int n, const float a)
+{
+  return jnf(n, a);
+}
+
+__MATH_FUNCTIONS_DECL__ float y0(const float a)
+{
+  return y0f(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float y1(const float a)
+{
+  return y1f(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float yn(const int n, const float a)
+{ 
+  return ynf(n, a);
+}
+
+__MATH_FUNCTIONS_DEVICE_DECL__ float cyl_bessel_i0(const float a)
+{
+  return cyl_bessel_i0f(a);
+}
+
+__MATH_FUNCTIONS_DEVICE_DECL__ float cyl_bessel_i1(const float a)
+{
+  return cyl_bessel_i1f(a);
+}
+
+/* float overloads of erfinv, erfcinv, normcdfinv, normcdf and erfcx.  Each
+ * forwards to the corresponding f-suffixed function. */
+__MATH_FUNCTIONS_DECL__ float erfinv(const float a)
+{
+  return erfinvf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float erfcinv(const float a)
+{
+  return erfcinvf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float normcdfinv(const float a)
+{
+  return normcdfinvf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float normcdf(const float a)
+{
+  return normcdff(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float erfcx(const float a)
+{
+  return erfcxf(a);
+}
+
+/* Mixed-precision copysign(double, float): widen the float sign source to
+ * double, then defer to the double/double copysign. */
+__MATH_FUNCTIONS_DECL__ double copysign(const double a, const float b)
+{
+  const double sign_source = static_cast<double>(b);
+  return copysign(a, sign_source);
+}
+
+/* Mixed-precision copysign(float, double): widen the float magnitude to
+ * double, then defer to the double/double copysign. */
+__MATH_FUNCTIONS_DECL__ double copysign(const float a, const double b)
+{
+  const double magnitude = static_cast<double>(a);
+  return copysign(magnitude, b);
+}
+
+/* min overloads yielding unsigned int.  A signed int operand is first
+ * converted to unsigned int, then the comparison is delegated to umin. */
+__MATH_FUNCTIONS_DECL__ unsigned int min(const unsigned int a, const unsigned int b)
+{
+  return umin(a, b);
+}
+
+__MATH_FUNCTIONS_DECL__ unsigned int min(const int a, const unsigned int b)
+{
+  const unsigned int lhs = static_cast<unsigned int>(a);
+  return umin(lhs, b);
+}
+
+__MATH_FUNCTIONS_DECL__ unsigned int min(const unsigned int a, const int b)
+{
+  const unsigned int rhs = static_cast<unsigned int>(b);
+  return umin(a, rhs);
+}
+
+/* min for two long int values.  Dispatches on the width of long int: the
+ * int-sized case goes through min(int, int), the other case through llmin. */
+__MATH_FUNCTIONS_DECL__ long int min(const long int a, const long int b)
+{
+  /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  /* long can be of 32-bit type on some systems. */
+  if (sizeof(long int) == sizeof(int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const int narrow_a = static_cast<int>(a);
+    const int narrow_b = static_cast<int>(b);
+    return static_cast<long int>(min(narrow_a, narrow_b));
+  }
+  const long long int wide_a = static_cast<long long int>(a);
+  const long long int wide_b = static_cast<long long int>(b);
+  return static_cast<long int>(llmin(wide_a, wide_b));
+}
+
+/* min for two unsigned long int values.  Dispatches on the width of
+ * unsigned long int: umin when it matches unsigned int, ullmin otherwise. */
+__MATH_FUNCTIONS_DECL__ unsigned long int min(const unsigned long int a, const unsigned long int b)
+{
+  /* The size test is a compile-time constant; silence MSVC C4127 around it. */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  if (sizeof(unsigned long int) == sizeof(unsigned int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const unsigned int narrow_a = static_cast<unsigned int>(a);
+    const unsigned int narrow_b = static_cast<unsigned int>(b);
+    return static_cast<unsigned long int>(umin(narrow_a, narrow_b));
+  }
+  const unsigned long long int wide_a = static_cast<unsigned long long int>(a);
+  const unsigned long long int wide_b = static_cast<unsigned long long int>(b);
+  return static_cast<unsigned long int>(ullmin(wide_a, wide_b));
+}
+
+/* min(long int, unsigned long int): both operands are converted to an
+ * unsigned type before comparing.  Dispatches on the width of unsigned long
+ * int: umin when it matches unsigned int, ullmin otherwise. */
+__MATH_FUNCTIONS_DECL__ unsigned long int min(const long int a, const unsigned long int b)
+{
+  /* The size test is a compile-time constant; silence MSVC C4127 around it. */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  if (sizeof(unsigned long int) == sizeof(unsigned int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const unsigned int narrow_a = static_cast<unsigned int>(a);
+    const unsigned int narrow_b = static_cast<unsigned int>(b);
+    return static_cast<unsigned long int>(umin(narrow_a, narrow_b));
+  }
+  const unsigned long long int wide_a = static_cast<unsigned long long int>(a);
+  const unsigned long long int wide_b = static_cast<unsigned long long int>(b);
+  return static_cast<unsigned long int>(ullmin(wide_a, wide_b));
+}
+
+/* min(unsigned long int, long int): both operands are converted to an
+ * unsigned type before comparing.  Dispatches on the width of unsigned long
+ * int: umin when it matches unsigned int, ullmin otherwise. */
+__MATH_FUNCTIONS_DECL__ unsigned long int min(const unsigned long int a, const long int b)
+{
+  /* The size test is a compile-time constant; silence MSVC C4127 around it. */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  if (sizeof(unsigned long int) == sizeof(unsigned int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const unsigned int narrow_a = static_cast<unsigned int>(a);
+    const unsigned int narrow_b = static_cast<unsigned int>(b);
+    return static_cast<unsigned long int>(umin(narrow_a, narrow_b));
+  }
+  const unsigned long long int wide_a = static_cast<unsigned long long int>(a);
+  const unsigned long long int wide_b = static_cast<unsigned long long int>(b);
+  return static_cast<unsigned long int>(ullmin(wide_a, wide_b));
+}
+
+/* min overloads for long long operands.  The all-signed overload uses llmin;
+ * every overload with an unsigned operand converts the signed one (if any) to
+ * unsigned long long int and uses ullmin. */
+__MATH_FUNCTIONS_DECL__ long long int min(const long long int a, const long long int b)
+{
+  return llmin(a, b);
+}
+
+__MATH_FUNCTIONS_DECL__ unsigned long long int min(const unsigned long long int a, const unsigned long long int b)
+{
+  return ullmin(a, b);
+}
+
+__MATH_FUNCTIONS_DECL__ unsigned long long int min(const long long int a, const unsigned long long int b)
+{
+  const unsigned long long int lhs = static_cast<unsigned long long int>(a);
+  return ullmin(lhs, b);
+}
+
+__MATH_FUNCTIONS_DECL__ unsigned long long int min(const unsigned long long int a, const long long int b)
+{
+  const unsigned long long int rhs = static_cast<unsigned long long int>(b);
+  return ullmin(a, rhs);
+}
+
+/* Floating-point min overloads.  float/float uses fminf; any combination
+ * involving a double widens the float operand (if any) and uses fmin. */
+__MATH_FUNCTIONS_DECL__ float min(const float a, const float b)
+{
+  return fminf(a, b);
+}
+
+__MATH_FUNCTIONS_DECL__ double min(const double a, const double b)
+{
+  return fmin(a, b);
+}
+
+__MATH_FUNCTIONS_DECL__ double min(const float a, const double b)
+{
+  const double lhs = static_cast<double>(a);
+  return fmin(lhs, b);
+}
+
+__MATH_FUNCTIONS_DECL__ double min(const double a, const float b)
+{
+  const double rhs = static_cast<double>(b);
+  return fmin(a, rhs);
+}
+
+/* max overloads yielding unsigned int.  A signed int operand is first
+ * converted to unsigned int, then the comparison is delegated to umax. */
+__MATH_FUNCTIONS_DECL__ unsigned int max(const unsigned int a, const unsigned int b)
+{
+  return umax(a, b);
+}
+
+__MATH_FUNCTIONS_DECL__ unsigned int max(const int a, const unsigned int b)
+{
+  const unsigned int lhs = static_cast<unsigned int>(a);
+  return umax(lhs, b);
+}
+
+__MATH_FUNCTIONS_DECL__ unsigned int max(const unsigned int a, const int b)
+{
+  const unsigned int rhs = static_cast<unsigned int>(b);
+  return umax(a, rhs);
+}
+
+/* max for two long int values.  Dispatches on the width of long int: the
+ * int-sized case goes through max(int, int), the other case through llmax. */
+__MATH_FUNCTIONS_DECL__ long int max(const long int a, const long int b)
+{
+  /* The size test is a compile-time constant; silence MSVC C4127 around it. */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  /* long can be of 32-bit type on some systems. */
+  if (sizeof(long int) == sizeof(int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const int narrow_a = static_cast<int>(a);
+    const int narrow_b = static_cast<int>(b);
+    return static_cast<long int>(max(narrow_a, narrow_b));
+  }
+  const long long int wide_a = static_cast<long long int>(a);
+  const long long int wide_b = static_cast<long long int>(b);
+  return static_cast<long int>(llmax(wide_a, wide_b));
+}
+
+/* max for two unsigned long int values.  Dispatches on the width of
+ * unsigned long int: umax when it matches unsigned int, ullmax otherwise. */
+__MATH_FUNCTIONS_DECL__ unsigned long int max(const unsigned long int a, const unsigned long int b)
+{
+  /* The size test is a compile-time constant; silence MSVC C4127 around it. */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  if (sizeof(unsigned long int) == sizeof(unsigned int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const unsigned int narrow_a = static_cast<unsigned int>(a);
+    const unsigned int narrow_b = static_cast<unsigned int>(b);
+    return static_cast<unsigned long int>(umax(narrow_a, narrow_b));
+  }
+  const unsigned long long int wide_a = static_cast<unsigned long long int>(a);
+  const unsigned long long int wide_b = static_cast<unsigned long long int>(b);
+  return static_cast<unsigned long int>(ullmax(wide_a, wide_b));
+}
+
+/* max(long int, unsigned long int): both operands are converted to an
+ * unsigned type before comparing.  Dispatches on the width of unsigned long
+ * int: umax when it matches unsigned int, ullmax otherwise. */
+__MATH_FUNCTIONS_DECL__ unsigned long int max(const long int a, const unsigned long int b)
+{
+  /* The size test is a compile-time constant; silence MSVC C4127 around it. */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  if (sizeof(unsigned long int) == sizeof(unsigned int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const unsigned int narrow_a = static_cast<unsigned int>(a);
+    const unsigned int narrow_b = static_cast<unsigned int>(b);
+    return static_cast<unsigned long int>(umax(narrow_a, narrow_b));
+  }
+  const unsigned long long int wide_a = static_cast<unsigned long long int>(a);
+  const unsigned long long int wide_b = static_cast<unsigned long long int>(b);
+  return static_cast<unsigned long int>(ullmax(wide_a, wide_b));
+}
+
+/* max(unsigned long int, long int): both operands are converted to an
+ * unsigned type before comparing.  Dispatches on the width of unsigned long
+ * int: umax when it matches unsigned int, ullmax otherwise. */
+__MATH_FUNCTIONS_DECL__ unsigned long int max(const unsigned long int a, const long int b)
+{
+  /* The size test is a compile-time constant; silence MSVC C4127 around it. */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  if (sizeof(unsigned long int) == sizeof(unsigned int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const unsigned int narrow_a = static_cast<unsigned int>(a);
+    const unsigned int narrow_b = static_cast<unsigned int>(b);
+    return static_cast<unsigned long int>(umax(narrow_a, narrow_b));
+  }
+  const unsigned long long int wide_a = static_cast<unsigned long long int>(a);
+  const unsigned long long int wide_b = static_cast<unsigned long long int>(b);
+  return static_cast<unsigned long int>(ullmax(wide_a, wide_b));
+}
+
+/* max overloads for long long operands.  The all-signed overload uses llmax;
+ * every overload with an unsigned operand converts the signed one (if any) to
+ * unsigned long long int and uses ullmax. */
+__MATH_FUNCTIONS_DECL__ long long int max(const long long int a, const long long int b)
+{
+  return llmax(a, b);
+}
+
+__MATH_FUNCTIONS_DECL__ unsigned long long int max(const unsigned long long int a, const unsigned long long int b)
+{
+  return ullmax(a, b);
+}
+
+__MATH_FUNCTIONS_DECL__ unsigned long long int max(const long long int a, const unsigned long long int b)
+{
+  const unsigned long long int lhs = static_cast<unsigned long long int>(a);
+  return ullmax(lhs, b);
+}
+
+__MATH_FUNCTIONS_DECL__ unsigned long long int max(const unsigned long long int a, const long long int b)
+{
+  const unsigned long long int rhs = static_cast<unsigned long long int>(b);
+  return ullmax(a, rhs);
+}
+
+/* Floating-point max overloads.  float/float uses fmaxf; any combination
+ * involving a double widens the float operand (if any) and uses fmax. */
+__MATH_FUNCTIONS_DECL__ float max(const float a, const float b)
+{
+  return fmaxf(a, b);
+}
+
+__MATH_FUNCTIONS_DECL__ double max(const double a, const double b)
+{
+  return fmax(a, b);
+}
+
+__MATH_FUNCTIONS_DECL__ double max(const float a, const double b)
+{
+  const double lhs = static_cast<double>(a);
+  return fmax(lhs, b);
+}
+
+__MATH_FUNCTIONS_DECL__ double max(const double a, const float b)
+{
+  const double rhs = static_cast<double>(b);
+  return fmax(a, rhs);
+}
+
+
+#if !defined(__CUDA_ARCH__)
+#if defined(_WIN32)
+#define __HELPER_FUNC_LINKAGE static inline __host__ __device__
+#pragma warning (push)
+#pragma warning (disable : 4211)
+#else  /* !defined(_WIN32) */
+#define __HELPER_FUNC_LINKAGE inline __host__ __device__
+#endif  /* defined(_WIN32) */
+
+/* Host-pass helper (compiled only when __CUDA_ARCH__ is not defined):
+ * returns a when a < b, otherwise b. */
+__HELPER_FUNC_LINKAGE int min(const int a, const int b)
+{
+  if (a < b) {
+    return a;
+  }
+  return b;
+}
+
+/* Host-pass helper (compiled only when __CUDA_ARCH__ is not defined):
+ * returns a when a < b, otherwise b. */
+__HELPER_FUNC_LINKAGE unsigned int umin(const unsigned int a, const unsigned int b)
+{
+  if (a < b) {
+    return a;
+  }
+  return b;
+}
+
+/* Host-pass helper (compiled only when __CUDA_ARCH__ is not defined):
+ * returns a when a < b, otherwise b. */
+__HELPER_FUNC_LINKAGE long long int llmin(const long long int a, const long long int b)
+{
+  if (a < b) {
+    return a;
+  }
+  return b;
+}
+
+/* Host-pass helper (compiled only when __CUDA_ARCH__ is not defined):
+ * returns a when a < b, otherwise b. */
+__HELPER_FUNC_LINKAGE unsigned long long int ullmin(const unsigned long long int a, const unsigned long long int b)
+{
+  if (a < b) {
+    return a;
+  }
+  return b;
+}
+
+/* Host-pass helper (compiled only when __CUDA_ARCH__ is not defined):
+ * returns a when a > b, otherwise b. */
+__HELPER_FUNC_LINKAGE int max(const int a, const int b)
+{
+  if (a > b) {
+    return a;
+  }
+  return b;
+}
+
+/* Host-pass helper (compiled only when __CUDA_ARCH__ is not defined):
+ * returns a when a > b, otherwise b. */
+__HELPER_FUNC_LINKAGE unsigned int umax(const unsigned int a, const unsigned int b)
+{
+  if (a > b) {
+    return a;
+  }
+  return b;
+}
+
+/* Host-pass helper (compiled only when __CUDA_ARCH__ is not defined):
+ * returns a when a > b, otherwise b. */
+__HELPER_FUNC_LINKAGE long long int llmax(const long long int a, const long long int b)
+{
+  if (a > b) {
+    return a;
+  }
+  return b;
+}
+
+/* Host-pass helper (compiled only when __CUDA_ARCH__ is not defined):
+ * returns a when a > b, otherwise b. */
+__HELPER_FUNC_LINKAGE unsigned long long int ullmax(const unsigned long long int a, const unsigned long long int b)
+{
+  if (a > b) {
+    return a;
+  }
+  return b;
+}
+
+#if defined(_WIN32)
+#pragma warning (pop)
+#endif /* defined(_WIN32) */
+
+#undef __HELPER_FUNC_LINKAGE
+
+#endif /* !defined(__CUDA_ARCH__) */
+
+#undef __MATH_FUNCTIONS_DECL__
+#undef __MATH_FUNCTIONS_DEVICE_DECL__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#endif /* __cplusplus && __CUDACC__ */
+#if !defined(__CUDACC__)
+
+#include "host_defines.h"
+#include "math_constants.h"
+
+#define __cuda_INT_MAX \
+        ((int)((unsigned int)-1 >> 1))
+
+/*******************************************************************************
+*                                                                              *
+* ONLY FOR HOST CODE! NOT FOR DEVICE EXECUTION                                 *
+*                                                                              *
+*******************************************************************************/
+
+#include <crt/func_macro.h>
+
+#if defined(_WIN32)
+#pragma warning (push)
+#pragma warning (disable : 4211)
+
+#endif /* _WIN32 */
+
+#if defined(_WIN32) || defined(__APPLE__) || defined (__ANDROID__) || defined(__QNX__)
+
+/* Returns nonzero when a is a NaN, decided from its raw 64-bit pattern. */
+__func__(int __isnan(const double a))
+{
+  unsigned long long int bits;
+  memcpy(&bits, &a, sizeof(double));
+  /* shift the sign bit out; anything above the all-ones-exponent pattern
+     has a nonzero mantissa and is therefore a NaN */
+  const unsigned long long int magnitude = bits << 1ULL;
+  return magnitude > 0xffe0000000000000ULL;
+}
+
+#endif /* _WIN32 || __APPLE__ || __ANDROID__ || __QNX__ */
+
+#if defined(_WIN32) || defined(__APPLE__) || defined(__QNX__)
+
+/*******************************************************************************
+*                                                                              *
+* HOST IMPLEMENTATION FOR DOUBLE ROUTINES FOR WINDOWS & APPLE PLATFORMS        *
+*                                                                              *
+*******************************************************************************/
+
+/* Computes 10 raised to the power a by delegating to pow(). */
+__func__(double exp10(const double a))
+{
+  const double result = pow(10.0, a);
+  return result;
+}
+
+/* Single-precision 10**a: evaluated in double via exp10(), then narrowed. */
+__func__(float exp10f(const float a))
+{
+    const double widened = static_cast<double>(a);
+    const double power = exp10(widened);
+    return static_cast<float>(power);
+}
+
+/* Stores sin(a) in *sptr and cos(a) in *cptr (sine is written first). */
+__func__(void sincos(const double a, double *sptr, double *cptr))
+{
+  const double sine = sin(a);
+  const double cosine = cos(a);
+
+  *sptr = sine;
+  *cptr = cosine;
+}
+
+/* Single-precision sincos: evaluates in double via sincos(), then narrows
+   each result into *sptr and *cptr. */
+__func__(void sincosf(const float a, float *sptr, float *cptr))
+{
+  double sine_wide;
+  double cosine_wide;
+  const double widened = static_cast<double>(a);
+
+  sincos(widened, &sine_wide, &cosine_wide);
+  *sptr = static_cast<float>(sine_wide);
+  *cptr = static_cast<float>(cosine_wide);
+}
+
+/* Returns nonzero when a is +infinity or -infinity, decided from its raw
+   64-bit pattern. */
+__func__(int __isinf(const double a))
+{
+  unsigned long long int bits;
+  memcpy(&bits, &a, sizeof(double));
+  /* with the sign bit shifted out, infinity is exactly all-ones exponent
+     and zero mantissa */
+  const unsigned long long int magnitude = bits << 1ULL;
+  return magnitude == 0xffe0000000000000ULL;
+}
+
+#endif /* _WIN32 || __APPLE__ || __QNX__ */
+
+#if defined(_WIN32) || defined (__ANDROID__)
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+/* Base-2 logarithm: natural log of a scaled by the constant 1/ln(2). */
+__func__(double log2(const double a))
+{
+  const double natural = log(a);
+  return natural * 1.44269504088896340;
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#endif /* _WIN32 || __ANDROID__ */
+
+#if defined(_WIN32)
+
+/*******************************************************************************
+*                                                                              *
+* HOST IMPLEMENTATION FOR DOUBLE ROUTINES FOR WINDOWS PLATFORM                 *
+*                                                                              *
+*******************************************************************************/
+
+/* Returns 1 when the sign bit of a is set (including -0.0 and negative
+   NaNs), otherwise 0. */
+__func__(int __signbit(double a))
+{
+  unsigned long long int bits;
+  memcpy(&bits, &a, sizeof(double));
+  /* the top bit of the 64-bit pattern is the sign */
+  return static_cast<int>(bits >> 63ULL);
+}
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+/* Returns a value with the magnitude bits of a and the sign bit of b. */
+__func__(double copysign(double a, double b))
+{
+  const unsigned long long int sign_mask = 0x8000000000000000ULL;
+  unsigned long long int magnitude_bits;
+  unsigned long long int sign_bits;
+  double combined;
+
+  memcpy(&magnitude_bits, &a, sizeof(double));
+  memcpy(&sign_bits, &b, sizeof(double));
+  magnitude_bits = (magnitude_bits & ~sign_mask) | (sign_bits & sign_mask);
+  memcpy(&combined, &magnitude_bits, sizeof(double));
+  return combined;
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+/* Returns nonzero when a is neither infinite nor NaN, decided from its raw
+   64-bit pattern. */
+__func__(int __finite(double a))
+{
+  unsigned long long int bits;
+  memcpy(&bits, &a, sizeof(double));
+  /* with the sign shifted out, every finite value sorts below the
+     all-ones-exponent pattern */
+  const unsigned long long int magnitude = bits << 1ULL;
+  return magnitude < 0xffe0000000000000ULL;
+}
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+/* Larger of a and b. A single NaN operand is ignored in favour of the other
+   operand; two NaNs yield a + b. For two zeros where b is negative, a is
+   returned. */
+__func__(double fmax(double a, double b))
+{
+  const int a_is_nan = __isnan(a);
+  const int b_is_nan = __isnan(b);
+
+  if (a_is_nan) {
+    return b_is_nan ? (a + b) : b;
+  }
+  if (b_is_nan) {
+    return a;
+  }
+  if ((a == 0.0) && (b == 0.0) && __signbit(b)) {
+    return a;
+  }
+  if (a > b) {
+    return a;
+  }
+  return b;
+}
+
+/* Smaller of a and b. A single NaN operand is ignored in favour of the other
+   operand; two NaNs yield a + b. For two zeros where a is negative, a is
+   returned. */
+__func__(double fmin(double a, double b))
+{
+  const int a_is_nan = __isnan(a);
+  const int b_is_nan = __isnan(b);
+
+  if (a_is_nan) {
+    return b_is_nan ? (a + b) : b;
+  }
+  if (b_is_nan) {
+    return a;
+  }
+  if ((a == 0.0) && (b == 0.0) && __signbit(a)) {
+    return a;
+  }
+  if (a < b) {
+    return a;
+  }
+  return b;
+}
+
+/* Rounds toward zero: ceil() for negative inputs, floor() otherwise. */
+__func__(double trunc(double a))
+{
+  if (a < 0.0) {
+    return ceil(a);
+  }
+  return floor(a);
+}
+
+/* Rounds to the nearest integer value, halfway cases away from zero.
+   Magnitudes above CUDART_TWO_TO_52 are returned unchanged. */
+__func__(double round(double a))
+{
+  const double magnitude = fabs(a);
+
+  if (magnitude > CUDART_TWO_TO_52) {
+    return a;
+  }
+
+  double rounded = floor(magnitude + 0.5);
+  /* magnitudes below one half are forced to zero regardless of what the
+     floor of the sum produced */
+  if (magnitude < 0.5) {
+    rounded = 0;
+  }
+  return copysign (rounded, a);
+}
+
+/* round(a) converted to long int. */
+__func__(long int lround(double a))
+{
+  const double rounded = round(a);
+  return static_cast<long int>(rounded);
+}
+
+/* round(a) converted to long long int. */
+__func__(long long int llround(double a))
+{
+  const double rounded = round(a);
+  return static_cast<long long int>(rounded);
+}
+
+/* Rounds to an integer value by adding and then subtracting
+   CUDART_TWO_TO_52 on the magnitude, and restores the sign of a.
+   Magnitudes at or above CUDART_TWO_TO_52 are returned unchanged. */
+__func__(double rint(double a))
+{
+  const double magnitude = fabs(a);
+  double shifted = CUDART_TWO_TO_52 + magnitude;
+
+  if (magnitude >= CUDART_TWO_TO_52) {
+    return a;
+  }
+  shifted = shifted - CUDART_TWO_TO_52;
+  return copysign (shifted, a);
+}
+
+/* Implemented as a plain forward to rint(). */
+__func__(double nearbyint(double a))
+{
+  const double rounded = rint(a);
+  return rounded;
+}
+
+/* rint(a) converted to long int. */
+__func__(long int lrint(double a))
+{
+  const double rounded = rint(a);
+  return static_cast<long int>(rounded);
+}
+
+/* rint(a) converted to long long int. */
+__func__(long long int llrint(double a))
+{
+  const double rounded = rint(a);
+  return static_cast<long long int>(rounded);
+}
+
+/* Positive difference: a - b when a > b, 0.0 when a <= b. When the operands
+   are unordered, returns a if a is a NaN and b otherwise. */
+__func__(double fdim(double a, double b))
+{
+  if (a > b) {
+    return (a - b);
+  }
+  if (a <= b) {
+    return 0.0;
+  }
+  /* neither comparison held, so the operands are unordered */
+  return __isnan(a) ? a : b;
+}
+
+/* Scales a by 2**b by delegating to ldexp(). */
+__func__(double scalbn(double a, int b))
+{
+  const double scaled = ldexp(a, b);
+  return scaled;
+}
+
+/* Like scalbn() but with a long exponent; b is clamped to the 32-bit int
+   range before the call. */
+__func__(double scalbln(double a, long int b))
+{
+  if (b > 2147483647L) {
+    return scalbn(a, 2147483647);
+  }
+  if (b < (-2147483647 - 1)) {
+    return scalbn(a, (-2147483647 - 1));
+  }
+  return scalbn(a, static_cast<int>(b));
+}
+
+/* Computes 2 raised to the power a by delegating to pow(). */
+__func__(double exp2(double a))
+{
+  const double result = pow(2.0, a);
+  return result;
+}
+
+/*
+ * log(1 + a) with a correction for small a. Based on: David Goldberg,
+ * "What every computer scientist should know about floating-point
+ * arithmetic", ACM Computing Surveys, Volume 23, Issue 1, March 1991.
+ * The intermediates are volatile, as in the reference implementation here.
+ */
+__func__(double log1p(double a))
+{
+  volatile double sum, excess;
+
+  sum = 1.0 + a;
+  if (sum == 1.0) {
+    /* a is so close to zero that 1 + a rounds to 1 */
+    sum = a;
+    return sum;
+  }
+  excess = sum - 1.0;
+  sum = log(sum);
+  if (a < 1.0) {
+    /* a is fairly close to zero: rescale by a / ((1 + a) - 1) */
+    sum = a * sum;
+    sum = sum / excess;
+  }
+  return sum;
+}
+
+/*
+ * exp(a) - 1 with a correction for small |a|.
+ * This code based on: http://www.cs.berkeley.edu/~wkahan/Math128/Sumnfp.pdf
+ * The intermediates are volatile, as in the reference implementation here.
+ */
+__func__(double expm1(double a))
+{
+  volatile double power, delta;
+
+  power = exp(a);
+  delta = power - 1.0;
+  if (delta == 0.0) {
+    /* a is so close to zero that exp(a) rounds to 1 */
+    delta = a;
+    return delta;
+  }
+  if (fabs(a) < 1.0) {
+    /* a is fairly close to zero: rescale by a / log(exp(a)) */
+    power = log(power);
+    delta = delta * a;
+    delta = delta / power;
+  }
+  return delta;
+}
+
+/* Cube root: an exp2/log2 estimate on |a|, one refinement step, then the
+   sign of a is restored. Zeros and infinities are returned unchanged. */
+__func__(double cbrt(double a))
+{
+  if (a == 0.0 || __isinf(a)) {
+    return a;
+  }
+
+  const double magnitude = fabs(a);
+  /* initial approximation */
+  const double guess = exp2(CUDART_THIRD * log2(magnitude));
+  /* refine approximation */
+  const double refined =
+      guess - (guess - (magnitude / (guess * guess))) * CUDART_THIRD;
+  return copysign(refined, a);
+}
+
+/* Inverse hyperbolic cosine. When subtracting 1 leaves a unchanged, uses
+   log(2) + log(a); otherwise evaluates through log1p(). */
+__func__(double acosh(double a))
+{
+  const double minus_one = a - 1.0;
+
+  if (minus_one == a) {
+    return log(2.0) + log(a);
+  }
+
+  const double plus_one = a + 1.0;
+  const double arg = minus_one + sqrt(plus_one * minus_one);
+  return log1p(arg);
+}
+
+/* Inverse hyperbolic sine, evaluated on |a| and given the sign of a.
+   Magnitudes above 1e18 use log(2) + log(|a|); the rest go through log1p(). */
+__func__(double asinh(double a))
+{
+  const double magnitude = fabs(a);
+  double result;
+
+  if (magnitude > 1e18) {
+    result = log(2.0) + log(magnitude);
+  } else {
+    const double reciprocal = 1.0 / magnitude;
+    const double arg =
+        magnitude + magnitude / (reciprocal + sqrt(1.0 + reciprocal * reciprocal));
+    result = log1p(arg);
+  }
+  return copysign(result, a);
+}
+
+/* Inverse hyperbolic tangent: 0.5 * log1p(2|a| / (1 - |a|)), negated when
+   the sign bit of a is set. NaN input returns a + a; a NaN intermediate is
+   returned without negation. */
+__func__(double atanh(double a))
+{
+  if (__isnan(a)) {
+    return a + a;
+  }
+
+  const double magnitude = fabs(a);
+  const double ratio = (2.0 * magnitude) / (1.0 - magnitude);
+  const double half_log = 0.5 * log1p(ratio);
+  const bool keep_sign = __isnan(half_log) || !__signbit(a);
+
+  return keep_sign ? half_log : -half_log;
+}
+
+/* Unbiased binary exponent of a as an int. NaN and zero return
+   -__cuda_INT_MAX-1; infinities return __cuda_INT_MAX. Subnormals are
+   normalized by shifting until the mantissa reaches the hidden-bit position. */
+__func__(int ilogb(double a))
+{
+  const unsigned long long int min_normal = 0x0010000000000000ULL;
+  unsigned long long int magnitude;
+  int expo;
+
+  if (__isnan(a)) {
+    return -__cuda_INT_MAX-1;
+  }
+  if (__isinf(a)) {
+    return __cuda_INT_MAX;
+  }
+  memcpy(&magnitude, &a, sizeof(double));
+  magnitude &= 0x7fffffffffffffffULL;   /* discard the sign bit */
+  if (magnitude == 0) {
+    return -__cuda_INT_MAX-1;
+  }
+  if (magnitude >= min_normal) {
+    const unsigned long long int biased = (magnitude >> 52ULL) & 0x7ffU;
+    return (int)(biased - 1023);
+  }
+  /* subnormal: each left shift needed lowers the exponent by one */
+  for (expo = -1022; magnitude < min_normal; magnitude <<= 1) {
+    expo--;
+  }
+  return expo;
+}
+
+/* Unbiased binary exponent of a as a double. NaN returns a + a, infinities
+   return +infinity, zero returns -1.0/fabs(a). Subnormals are normalized by
+   shifting until the mantissa reaches the hidden-bit position. */
+__func__(double logb(double a))
+{
+  const unsigned long long int min_normal = 0x0010000000000000ULL;
+  unsigned long long int magnitude;
+  int expo;
+
+  if (__isnan(a)) {
+    return a + a;
+  }
+  if (__isinf(a)) {
+    return fabs(a);
+  }
+  memcpy(&magnitude, &a, sizeof(double));
+  magnitude &= 0x7fffffffffffffffULL;   /* discard the sign bit */
+  if (magnitude == 0) {
+    return -1.0/fabs(a);
+  }
+  if (magnitude >= min_normal) {
+    return (double)((int)((magnitude >> 52ULL) & 0x7ffU) - 1023);
+  }
+  /* subnormal: each left shift needed lowers the exponent by one */
+  for (expo = -1022; magnitude < min_normal; magnitude <<= 1) {
+    expo--;
+  }
+  return static_cast<double>(expo);
+}
+
+/*
+ * Remainder of a / b computed with integer long division on the operands'
+ * bit patterns. Also stores in *quo the low three bits of the quotient,
+ * negated when the signs of a and b differ.
+ *
+ * Special cases handled up front (with *quo set to 0):
+ *   - either operand NaN      -> a + b
+ *   - a infinite or b == 0    -> the NaN pattern 0xfff8000000000000
+ *   - a == 0 or b infinite    -> a
+ */
+__func__(double remquo(double a, double b, int *quo))
+{
+  unsigned long long int aa, bb;
+  int rem1 = 1; /* do FPREM1, a.k.a IEEE remainder */
+  int expo_a;
+  int expo_b;
+  unsigned long long mant_a;
+  unsigned long long mant_b;
+  unsigned long long mant_c;
+  unsigned long long temp;
+  int sign_a;
+  int sign_b;
+  int sign_c;
+  int expo_c;
+  int expodiff;
+  int quot = 0;                 /* initialize quotient */
+  int l;
+  int iter;
+
+  /* unpack a: mantissa left-justified with the hidden bit forced on,
+     unbiased exponent, and sign */
+  memcpy(&aa, &a, sizeof(double));
+  mant_a = (aa << 11ULL) | 0x8000000000000000ULL;
+  expo_a = (int)((aa >> 52ULL) & 0x7ffU) - 1023;
+  sign_a = (int)(aa >> 63ULL);
+
+  /* unpack b the same way */
+  memcpy(&bb, &b, sizeof(double));
+  mant_b = (bb << 11ULL) | 0x8000000000000000ULL;
+  expo_b = (int)((bb >> 52ULL) & 0x7ffU) - 1023;
+  sign_b = (int)(bb >> 63ULL);
+
+  sign_c = sign_a;  /* remainder has sign of dividend */
+  expo_c = expo_a;  /* default */
+      
+  /* handled NaNs and infinities */
+  if (__isnan(a) || __isnan(b)) {
+    *quo = quot;
+    return a + b;
+  }
+  if (__isinf(a) || (b == 0.0)) {
+    *quo = quot;
+    aa = 0xfff8000000000000ULL;
+    memcpy(&a, &aa, sizeof(double));
+    return a;
+  }
+  if ((a == 0.0) || (__isinf(b))) {
+    *quo = quot;
+    return a;
+  }
+  /* normalize denormals */
+  if (expo_a < -1022) {
+    /* the first doubling drops the hidden bit that was forced on above */
+    mant_a = mant_a + mant_a;
+    while (mant_a < 0x8000000000000000ULL) {
+      mant_a = mant_a + mant_a;
+      expo_a--;
+    }
+  } 
+  if (expo_b < -1022) {
+    mant_b = mant_b + mant_b;
+    while (mant_b < 0x8000000000000000ULL) {
+      mant_b = mant_b + mant_b;
+      expo_b--;
+    }
+  }
+  expodiff = expo_a - expo_b;
+  /* clamp iterations if exponent difference negative */
+  if (expodiff < 0) {
+    iter = -1;
+  } else {
+    iter = expodiff;
+  }
+  /* Shift dividend and divisor right by one bit to prevent overflow
+     during the division algorithm.
+   */
+  mant_a = mant_a >> 1ULL;
+  mant_b = mant_b >> 1ULL;
+  expo_c = expo_a - iter; /* default exponent of result   */
+
+  /* Use binary longhand division (restoring) */
+  for (l = 0; l < (iter + 1); l++) {
+    mant_a = mant_a - mant_b;
+    if (mant_a & 0x8000000000000000ULL) {
+      /* subtraction went negative: restore and shift a 0 into the quotient */
+      mant_a = mant_a + mant_b;
+      quot = quot + quot;
+    } else {
+      quot = quot + quot + 1;
+    }
+    mant_a = mant_a + mant_a;
+  }
+
+  /* Save current remainder */
+  mant_c = mant_a;
+  /* If remainder's mantissa is all zeroes, final result is zero. */
+  if (mant_c == 0) {
+    quot = quot & 7;
+    *quo = (sign_a ^ sign_b) ? -quot : quot;
+    aa = static_cast<unsigned long long int>(sign_c) << 63ULL;
+    memcpy(&a, &aa, sizeof(double));
+    return a;
+  }
+  /* Normalize result */
+  while (!(mant_c & 0x8000000000000000ULL)) {
+    mant_c = mant_c + mant_c;
+    expo_c--;
+  }
+  /* For IEEE remainder (quotient rounded to nearest-even we might need to 
+     do a final subtraction of the divisor from the remainder.
+  */
+  if (rem1 && ((expodiff+1) >= 0)) {
+    temp = mant_a - mant_b;
+    /* round quotient to nearest even */
+    if (((temp != 0ULL) && (!(temp & 0x8000000000000000ULL))) ||
+        ((temp == 0ULL) && (quot & 1))) {
+      mant_a = mant_a >> 1ULL;
+      quot++;
+      /* Since the divisor is greater than the remainder, the result will
+         have opposite sign of the dividend. To avoid a negative mantissa
+         when subtracting the divisor from remainder, reverse subtraction
+      */
+      sign_c = 1 ^ sign_c;
+      expo_c = expo_a - iter + 1;
+      mant_c = mant_b - mant_a;
+      /* normalize result */
+      while (!(mant_c & 0x8000000000000000ULL)) {
+        mant_c = mant_c + mant_c;
+        expo_c--;
+      }
+    }
+  }
+  /* package up result */
+  if (expo_c >= -1022) { /* normal */
+    mant_c = ((mant_c >> 11ULL) +
+              (((static_cast<unsigned long long>(sign_c)) << 63ULL) +
+               (((unsigned long long)(expo_c + 1022)) << 52ULL)));
+  } else { /* denormal */
+    mant_c = (((static_cast<unsigned long long>(sign_c)) << 63ULL) +
+              (mant_c >> (unsigned long long)(11 - expo_c - 1022)));
+  }
+  quot = quot & 7; /* mask quotient down to least significant three bits */
+  *quo = (sign_a ^ sign_b) ? -quot : quot;
+  memcpy(&a, &mant_c, sizeof(double));
+  return a;
+}
+
+/* Remainder of a / b: forwards to remquo() and discards the quotient bits. */
+__func__(double remainder(double a, double b))
+{
+  int ignored_quotient;
+  const double rem = remquo (a, b, &ignored_quotient);
+  return rem;
+}
+
+/*
+ * Computes a * b + c using only 32-bit integer arithmetic on the operands'
+ * bit patterns: the product mantissa is built from 32x32-bit partial
+ * products, the addend is aligned and added or subtracted, and the result
+ * is rounded once when it is packed back into a double.
+ *
+ * Each double is memcpy'd into a { lo, hi } pair of 32-bit words; hi is then
+ * treated as the word holding sign and exponent.
+ * NOTE(review): this relies on the low word being stored first in memory
+ * (little-endian layout) - confirm for any new host platform.
+ */
+__func__(double fma (double a, double b, double c))
+{
+  struct {
+    unsigned int lo;
+    unsigned int hi;
+  } xx, yy, zz, ww;
+  double d;
+  unsigned int s, t, u, prod0, prod1, prod2, prod3, expo_x, expo_y, expo_z;
+
+  memcpy(&xx, &a, sizeof(double));
+  memcpy(&yy, &b, sizeof(double));
+  memcpy(&zz, &c, sizeof(double));
+
+  /* extract the biased exponents minus one; the unsigned subtraction wraps
+     a zero exponent field to 0xffffffff and maps the all-ones field to
+     0x7FE, so one "<= 0x7FD" test singles out the ordinary operands */
+  expo_z = 0x7FFU;
+  t =  xx.hi >> 20;
+  expo_x = expo_z & t;
+  expo_x = expo_x - 1;    /* expo(x) - 1 */
+  t =  yy.hi >> 20;
+  expo_y = expo_z & t;
+  expo_y = expo_y - 1;    /* expo(y) - 1 */
+  t =  zz.hi >> 20;
+  expo_z = expo_z & t;
+  expo_z = expo_z - 1;    /* expo(z) - 1 */
+
+  /* slow path: at least one operand has an all-zero or all-ones exponent */
+  if (!((expo_x <= 0x7FDU) &&
+        (expo_y <= 0x7FDU) &&
+        (expo_z <= 0x7FDU))) {
+    
+    /* fma (nan, y, z) --> nan
+       fma (x, nan, z) --> nan
+       fma (x, y, nan) --> nan 
+    */
+    if (((yy.hi << 1) | (yy.lo != 0)) > 0xffe00000U) {
+      yy.hi |= 0x00080000U;
+      memcpy(&d, &yy, sizeof(double));
+      return d;
+    }
+    if (((zz.hi << 1) | (zz.lo != 0)) > 0xffe00000U) {
+      zz.hi |= 0x00080000U;
+      memcpy(&d, &zz, sizeof(double));
+      return d;
+    }
+    if (((xx.hi << 1) | (xx.lo != 0)) > 0xffe00000U) {
+      xx.hi |= 0x00080000U;
+      memcpy(&d, &xx, sizeof(double));
+      return d;
+    }
+    
+    /* fma (0, inf, z) --> INDEFINITE
+       fma (inf, 0, z) --> INDEFINITE
+       fma (-inf,+y,+inf) --> INDEFINITE
+       fma (+x,-inf,+inf) --> INDEFINITE
+       fma (+inf,-y,+inf) --> INDEFINITE
+       fma (-x,+inf,+inf) --> INDEFINITE
+       fma (-inf,-y,-inf) --> INDEFINITE
+       fma (-x,-inf,-inf) --> INDEFINITE
+       fma (+inf,+y,-inf) --> INDEFINITE
+       fma (+x,+inf,-inf) --> INDEFINITE
+    */
+    if (((((xx.hi << 1) | xx.lo) == 0) && 
+         (((yy.hi << 1) | (yy.lo != 0)) == 0xffe00000U)) ||
+        ((((yy.hi << 1) | yy.lo) == 0) && 
+         (((xx.hi << 1) | (xx.lo != 0)) == 0xffe00000U))) {
+      xx.hi = 0xfff80000U;
+      xx.lo = 0x00000000U;
+      memcpy(&d, &xx, sizeof(double));
+      return d;
+    }
+    if (((zz.hi << 1) | (zz.lo != 0)) == 0xffe00000U) {
+      if ((((yy.hi << 1) | (yy.lo != 0)) == 0xffe00000U) ||
+          (((xx.hi << 1) | (xx.lo != 0)) == 0xffe00000U)) {
+        /* the XOR of the three sign bits is set when the infinite product
+           and the infinite addend have opposite signs */
+        if ((int)(xx.hi ^ yy.hi ^ zz.hi) < 0) {
+          xx.hi = 0xfff80000U;
+          xx.lo = 0x00000000U;
+          memcpy(&d, &xx, sizeof(double));
+          return d;
+        }
+      }
+    }
+    /* fma (inf, y, z) --> inf
+       fma (x, inf, z) --> inf
+       fma (x, y, inf) --> inf
+    */
+    if (((xx.hi << 1) | (xx.lo != 0)) == 0xffe00000U) {
+      xx.hi = xx.hi ^ (yy.hi & 0x80000000U);
+      memcpy(&d, &xx, sizeof(double));
+      return d;
+    }
+    if (((yy.hi << 1) | (yy.lo != 0)) == 0xffe00000U) {
+      yy.hi = yy.hi ^ (xx.hi & 0x80000000U);
+      memcpy(&d, &yy, sizeof(double));
+      return d;
+    }
+    if (((zz.hi << 1) | (zz.lo != 0)) == 0xffe00000U) {
+      memcpy(&d, &zz, sizeof(double));
+      return d;
+    }
+    /* fma (+0, -y, -0) --> -0
+       fma (-0, +y, -0) --> -0
+       fma (+x, -0, -0) --> -0
+       fma (-x, +0, -0) --> -0
+    */
+    if ((zz.hi == 0x80000000U) && (zz.lo == 0)) {
+      if ((((xx.hi << 1) | xx.lo) == 0) ||
+          (((yy.hi << 1) | yy.lo) == 0)) {
+        if ((int)(xx.hi ^ yy.hi) < 0) {
+          memcpy(&d, &zz, sizeof(double));
+          return d;
+        }
+      }
+    }
+    /* fma (0, y, 0) --> +0  (-0 if round down and signs of addend differ)
+       fma (x, 0, 0) --> +0  (-0 if round down and signs of addend differ)
+    */
+    if ((((zz.hi << 1) | zz.lo) == 0) &&
+        ((((xx.hi << 1) | xx.lo) == 0) ||
+         (((yy.hi << 1) | yy.lo) == 0))) {
+      zz.hi &= 0x7fffffffU;
+      memcpy(&d, &zz, sizeof(double));
+      return d;
+    }
+    
+    /* fma (0, y, z) --> z
+       fma (x, 0, z) --> z
+    */
+    if ((((xx.hi << 1) | xx.lo) == 0) ||
+        (((yy.hi << 1) | yy.lo) == 0)) {
+      memcpy(&d, &zz, sizeof(double));
+      return d;
+    }
+    
+    /* expo == 0xffffffff marks a zero exponent field (a subnormal, since
+       zeros were handled above): shift the mantissa left until its leading
+       bit is set, lowering the exponent to match, then repack it */
+    if (expo_x == 0xffffffffU) {
+      expo_x++;
+      t = xx.hi & 0x80000000U;
+      s = xx.lo >> 21;
+      xx.lo = xx.lo << 11;
+      xx.hi = xx.hi << 11;
+      xx.hi = xx.hi | s;
+      if (!xx.hi) {
+        xx.hi = xx.lo;
+        xx.lo = 0;
+        expo_x -= 32;
+      }
+      while (static_cast<int>(xx.hi) > 0) {
+        s = xx.lo >> 31;
+        xx.lo = xx.lo + xx.lo;
+        xx.hi = xx.hi + xx.hi;
+        xx.hi = xx.hi | s;
+        expo_x--;
+      }
+      xx.lo = (xx.lo >> 11);
+      xx.lo |= (xx.hi << 21);
+      xx.hi = (xx.hi >> 11) | t;
+    }
+    if (expo_y == 0xffffffffU) {
+      expo_y++;
+      t = yy.hi & 0x80000000U;
+      s = yy.lo >> 21;
+      yy.lo = yy.lo << 11;
+      yy.hi = yy.hi << 11;
+      yy.hi = yy.hi | s;
+      if (!yy.hi) {
+        yy.hi = yy.lo;
+        yy.lo = 0;
+        expo_y -= 32;
+      }
+      while (static_cast<int>(yy.hi) > 0) {
+        s = yy.lo >> 31;
+        yy.lo = yy.lo + yy.lo;
+        yy.hi = yy.hi + yy.hi;
+        yy.hi = yy.hi | s;
+        expo_y--;
+      }
+      yy.lo = (yy.lo >> 11);
+      yy.lo |= (yy.hi << 21);
+      yy.hi = (yy.hi >> 11) | t;
+    }
+    if (expo_z == 0xffffffffU) {
+      expo_z++;
+      t = zz.hi & 0x80000000U;
+      s = zz.lo >> 21;
+      zz.lo = zz.lo << 11;
+      zz.hi = zz.hi << 11;
+      zz.hi = zz.hi | s;
+      if (!zz.hi) {
+        zz.hi = zz.lo;
+        zz.lo = 0;
+        expo_z -= 32;
+      }
+      while (static_cast<int>(zz.hi) > 0) {
+        s = zz.lo >> 31;
+        zz.lo = zz.lo + zz.lo;
+        zz.hi = zz.hi + zz.hi;
+        zz.hi = zz.hi | s;
+        expo_z--;
+      }
+      zz.lo = (zz.lo >> 11);
+      zz.lo |= (zz.hi << 21);
+      zz.hi = (zz.hi >> 11) | t;
+    }
+  }
+  
+  /* product exponent is the sum of the operand exponents; expo_y is reused
+     to carry the product sign (XOR of the operand sign bits) */
+  expo_x = expo_x + expo_y;
+  expo_y = xx.hi ^ yy.hi;
+  t = xx.lo >> 21;
+  xx.lo = xx.lo << 11;
+  xx.hi = xx.hi << 11;
+  xx.hi = xx.hi | t;
+  yy.hi = yy.hi & 0x000fffffU;
+  xx.hi = xx.hi | 0x80000000U; /* set mantissa hidden bit */
+  yy.hi = yy.hi | 0x00100000U; /* set mantissa hidden bit */
+
+  /* 64x64 -> 128-bit mantissa product from 32x32 partial products, with
+     the carries between words tracked by hand in t and s */
+  prod0 = xx.lo * yy.lo;
+  prod1 =(unsigned)((static_cast<unsigned long long>(xx.lo)*static_cast<unsigned long long>(yy.lo))>>32ULL);
+  prod2 = xx.hi * yy.lo;
+  prod3 = xx.lo * yy.hi;
+  prod1 += prod2;
+  t = (unsigned)(prod1 < prod2);
+  prod1 += prod3;
+  t += prod1 < prod3;
+  prod2 =(unsigned)((static_cast<unsigned long long>(xx.hi)*static_cast<unsigned long long>(yy.lo))>>32ULL);
+  prod3 =(unsigned)((static_cast<unsigned long long>(xx.lo)*static_cast<unsigned long long>(yy.hi))>>32ULL);
+  prod2 += prod3;
+  s = (unsigned)(prod2 < prod3);
+  prod3 = xx.hi * yy.hi;
+  prod2 += prod3;
+  s += prod2 < prod3;
+  prod2 += t;
+  s += prod2 < t;
+  prod3 =(unsigned)((static_cast<unsigned long long>(xx.hi)*static_cast<unsigned long long>(yy.hi))>>32ULL);
+  prod3 = prod3 + s;
+  
+  yy.lo = prod0;                 /* mantissa */
+  yy.hi = prod1;                 /* mantissa */
+  xx.lo = prod2;                 /* mantissa */
+  xx.hi = prod3;                 /* mantissa */
+  expo_x = expo_x - (1023 - 2);  /* expo-1 */
+  expo_y = expo_y & 0x80000000U;  /* sign */
+
+  /* if the product's leading bit landed one position low, shift the whole
+     128-bit mantissa left by one and adjust the exponent */
+  if (xx.hi < 0x00100000U) {
+    s = xx.lo >> 31;
+    s = (xx.hi << 1) + s;
+    xx.hi = s;
+    s = yy.hi >> 31;
+    s = (xx.lo << 1) + s;
+    xx.lo = s;
+    s = yy.lo >> 31;
+    s = (yy.hi << 1) + s;
+    yy.hi = s;
+    s = yy.lo << 1;
+    yy.lo = s;
+    expo_x--;
+  }
+
+  /* t collects bits shifted out below the 128-bit mantissa (sticky bits) */
+  t = 0;
+  if (((zz.hi << 1) | zz.lo) != 0) { /* z is not zero */
+    
+    s = zz.hi & 0x80000000U;
+    
+    zz.hi &= 0x000fffffU;
+    zz.hi |= 0x00100000U;
+    ww.hi = 0;
+    ww.lo = 0;
+    
+    /* compare and swap. put augend into xx:yy */
+    if (static_cast<int>(expo_z) > static_cast<int>(expo_x)) {
+      t = expo_z;
+      expo_z = expo_x;
+      expo_x = t;
+      t = zz.hi;
+      zz.hi = xx.hi;
+      xx.hi = t;
+      t = zz.lo;
+      zz.lo = xx.lo;
+      xx.lo = t;
+      t = ww.hi;
+      ww.hi = yy.hi;
+      yy.hi = t;
+      t = ww.lo;
+      ww.lo = yy.lo;
+      yy.lo = t;
+      t = expo_y;
+      expo_y = s;
+      s = t;
+    }
+    
+    /* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */
+    /* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */
+    expo_z = expo_x - expo_z;
+    u = expo_y ^ s;
+    if (expo_z <= 107) {
+      /* denormalize addend */
+      t = 0;
+      while (expo_z >= 32) {
+        t     = ww.lo | (t != 0);
+        ww.lo = ww.hi;
+        ww.hi = zz.lo;
+        zz.lo = zz.hi;
+        zz.hi = 0;
+        expo_z -= 32;
+      }
+      if (expo_z) {
+        t     = (t     >> expo_z) | (ww.lo << (32 - expo_z)) | 
+                ((t << (32 - expo_z)) != 0);
+        ww.lo = (ww.lo >> expo_z) | (ww.hi << (32 - expo_z));
+        ww.hi = (ww.hi >> expo_z) | (zz.lo << (32 - expo_z));
+        zz.lo = (zz.lo >> expo_z) | (zz.hi << (32 - expo_z));
+        zz.hi = (zz.hi >> expo_z);
+      }
+    } else {
+      /* addend lies entirely below the augend: keep only a sticky bit */
+      t = 1;
+      ww.lo = 0;
+      ww.hi = 0;
+      zz.lo = 0;
+      zz.hi = 0;
+    }
+    if (static_cast<int>(u) < 0) {
+      /* signs differ, effective subtraction */
+      t = (unsigned)(-static_cast<int>(t));
+      s = (unsigned)(t != 0);
+      u = yy.lo - s;
+      s = (unsigned)(u > yy.lo);
+      yy.lo = u - ww.lo;
+      s += yy.lo > u;
+      u = yy.hi - s;
+      s = (unsigned)(u > yy.hi);
+      yy.hi = u - ww.hi;
+      s += yy.hi > u;
+      u = xx.lo - s;
+      s = (unsigned)(u > xx.lo);
+      xx.lo = u - zz.lo;
+      s += xx.lo > u;
+      xx.hi = (xx.hi - zz.hi) - s;
+      if (!(xx.hi | xx.lo | yy.hi | yy.lo | t)) {
+        /* complete cancelation, return 0 */
+        memcpy(&d, &xx, sizeof(double));
+        return d;
+      }
+      if (static_cast<int>(xx.hi) < 0) {
+        /* Oops, augend had smaller mantissa. Negate mantissa and flip
+           sign of result
+        */
+        t = ~t;
+        yy.lo = ~yy.lo;
+        yy.hi = ~yy.hi;
+        xx.lo = ~xx.lo;
+        xx.hi = ~xx.hi;
+        if (++t == 0) {
+          if (++yy.lo == 0) {
+            if (++yy.hi == 0) {
+              if (++xx.lo == 0) {
+              ++xx.hi;
+              }
+            }
+          }
+        }
+        expo_y ^= 0x80000000U;
+      }
+        
+      /* normalize mantissa, if necessary */
+      while (!(xx.hi & 0x00100000U)) {
+        xx.hi = (xx.hi << 1) | (xx.lo >> 31);
+        xx.lo = (xx.lo << 1) | (yy.hi >> 31);
+        yy.hi = (yy.hi << 1) | (yy.lo >> 31);
+        yy.lo = (yy.lo << 1);
+        expo_x--;
+      }
+    } else {
+      /* signs are the same, effective addition */
+      yy.lo = yy.lo + ww.lo;
+      s = (unsigned)(yy.lo < ww.lo);
+      yy.hi = yy.hi + s;
+      u = (unsigned)(yy.hi < s);
+      yy.hi = yy.hi + ww.hi;
+      u += yy.hi < ww.hi;
+      xx.lo = xx.lo + u;
+      s = (unsigned)(xx.lo < u);
+      xx.lo = xx.lo + zz.lo;
+      s += xx.lo < zz.lo;
+      xx.hi = xx.hi + zz.hi + s;
+      /* the sum carried into the bit above the hidden bit: shift right by
+         one and bump the exponent */
+      if (xx.hi & 0x00200000U) {
+        t = t | (yy.lo << 31);
+        yy.lo = (yy.lo >> 1) | (yy.hi << 31);
+        yy.hi = (yy.hi >> 1) | (xx.lo << 31);
+        xx.lo = (xx.lo >> 1) | (xx.hi << 31);
+        xx.hi = ((xx.hi & 0x80000000U) | (xx.hi >> 1)) & ~0x40000000U;
+        expo_x++;
+      }
+    }
+  }
+  /* fold the low 64 bits into t: yy.hi ends up as the word just below the
+     result mantissa, with everything beneath it reduced to a sticky bit */
+  t = yy.lo | (t != 0);
+  t = yy.hi | (t != 0);
+        
+  xx.hi |= expo_y; /* or in sign bit */
+  if (expo_x <= 0x7FDU) {
+    /* normal */
+    xx.hi = xx.hi & ~0x00100000U; /* lop off integer bit */
+    s = xx.lo & 1; /* mantissa lsb */
+    u = xx.lo;
+    /* round to nearest: an exact half (t == 0x80000000) rounds to even by
+       adding the lsb, otherwise add the top bit of t */
+    xx.lo += (t == 0x80000000U) ? s : (t >> 31);
+    xx.hi += (u > xx.lo);
+    xx.hi += ((expo_x + 1) << 20);
+    memcpy(&d, &xx, sizeof(double));
+    return d;
+  } else if (static_cast<int>(expo_x) >= 2046) {
+    /* overflow */
+    xx.hi = (xx.hi & 0x80000000U) | 0x7ff00000U;
+    xx.lo = 0;
+    memcpy(&d, &xx, sizeof(double));
+    return d;
+  }
+  /* subnormal */
+  expo_x = (unsigned)(-static_cast<int>(expo_x));
+  if (expo_x > 54) {
+    /* too small to leave any mantissa bits: return a signed zero */
+    xx.hi = xx.hi & 0x80000000U;
+    xx.lo = 0;
+    memcpy(&d, &xx, sizeof(double));
+    return d;
+  }  
+  yy.hi = xx.hi &  0x80000000U;   /* save sign bit */
+  xx.hi = xx.hi & ~0xffe00000U;
+  if (expo_x >= 32) {
+    t = xx.lo | (t != 0);
+    xx.lo = xx.hi;
+    xx.hi = 0;
+    expo_x -= 32;
+  }
+  if (expo_x) {
+    t     = (t     >> expo_x) | (xx.lo << (32 - expo_x)) | (t != 0);
+    xx.lo = (xx.lo >> expo_x) | (xx.hi << (32 - expo_x));
+    xx.hi = (xx.hi >> expo_x);
+  }
+  /* same round-to-nearest step as the normal case; expo_x is reused here
+     to hold the mantissa lsb */
+  expo_x = xx.lo & 1;
+  u = xx.lo;
+  xx.lo += (t == 0x80000000U) ? expo_x : (t >> 31);
+  xx.hi += (u > xx.lo);
+  xx.hi |= yy.hi;
+  memcpy(&d, &xx, sizeof(double));
+  return d;
+}
+
+/* Next representable double after a in the direction of b, found by
+   stepping the raw bit pattern of a up or down by one. NaN operands return
+   a + b; if both are zeros b is returned; from zero the result is the
+   smallest subnormal carrying the sign of b. */
+__func__(double nextafter(double a, double b))
+{
+  unsigned long long int a_bits;
+  unsigned long long int b_bits;
+
+  memcpy(&a_bits, &a, sizeof(double));
+  memcpy(&b_bits, &b, sizeof(double));
+  if (__isnan(a) || __isnan(b)) {
+    return a + b; /* NaN */
+  }
+  if (((a_bits | b_bits) << 1ULL) == 0ULL) {
+    return b;
+  }
+  if (a == 0.0) {
+    return copysign (4.9406564584124654e-324, b); /* crossover */
+  }
+  if (a != b) {
+    /* a is nonzero and ordered here. Moving away from zero increments the
+       bit pattern, moving toward zero decrements it. */
+    const bool moving_up = (a < b);
+    const bool positive = (a > 0.0);
+    if (moving_up == positive) {
+      a_bits++;
+    } else {
+      a_bits--;
+    }
+  }
+  memcpy(&a, &a_bits, sizeof(double));
+  return a;
+}
+
+__func__(double erf(double a))
+{
+  double t, r, q;
+
+  t = fabs(a);
+  if (t >= 1.0) {
+    r =        -1.28836351230756500E-019;
+    r = r * t + 1.30597472161093370E-017;
+    r = r * t - 6.33924401259620500E-016;
+    r = r * t + 1.96231865908940140E-014;
+    r = r * t - 4.35272243559990750E-013;
+    r = r * t + 7.37083927929352150E-012;
+    r = r * t - 9.91402142550461630E-011;
+    r = r * t + 1.08817017167760820E-009;
+    r = r * t - 9.93918713097634620E-009;
+    r = r * t + 7.66739923255145500E-008;
+    r = r * t - 5.05440278302806720E-007;
+    r = r * t + 2.87474157099000620E-006;
+    r = r * t - 1.42246725399722510E-005;
+    r = r * t + 6.16994555079419460E-005;
+    r = r * t - 2.36305221938908790E-004;
+    r = r * t + 8.05032844055371070E-004;
+    r = r * t - 2.45833366629108140E-003;
+    r = r * t + 6.78340988296706120E-003;
+    r = r * t - 1.70509103597554640E-002;
+    r = r * t + 3.93322852515666300E-002;
+    r = r * t - 8.37271292613764040E-002;
+    r = r * t + 1.64870423707623280E-001;
+    r = r * t - 2.99729521787681470E-001;
+    r = r * t + 4.99394435612628580E-001;
+    r = r * t - 7.52014596480123030E-001;
+    r = r * t + 9.99933138314926250E-001;
+    r = r * t - 1.12836725321102670E+000;
+    r = r * t + 9.99998988715182450E-001;
+    q = exp (-t * t);
+    r = 1.0 - r * q;
+    if (t >= 6.5) {
+      r = 1.0;
+    }    
+    a = copysign (r, a);
+  } else {
+    q = a * a;
+    r =        -7.77946848895991420E-010;
+    r = r * q + 1.37109803980285950E-008;
+    r = r * q - 1.62063137584932240E-007;
+    r = r * q + 1.64471315712790040E-006;
+    r = r * q - 1.49247123020098620E-005;
+    r = r * q + 1.20552935769006260E-004;
+    r = r * q - 8.54832592931448980E-004;
+    r = r * q + 5.22397760611847340E-003;
+    r = r * q - 2.68661706431114690E-002;
+    r = r * q + 1.12837916709441850E-001;
+    r = r * q - 3.76126389031835210E-001;
+    r = r * q + 1.12837916709551260E+000;
+    a = r * a;
+  }
+  return a;
+}
+
+/*
+ * Complementary error function for host code.
+ *
+ *   a is NaN      -> a + a (NaN)
+ *   a < 0.75      -> 1.0 - erf(a)
+ *   a > 27.3      -> 0.0
+ *   0.75 <= a < 5 -> rational approximation in 1/a times exp(-a*a)
+ *   otherwise     -> polynomial in 1/(a*a) times exp(-a*a) / a
+ *
+ * In both approximation branches exp(-a*a) is evaluated as
+ * exp(-h*h) * exp(-(a-h)*(a+h)), where h is a truncated to a multiple of
+ * 1/16.
+ */
+__func__(double erfc(double a))
+{
+  double p, q, h, l;
+
+  if (__isnan(a)) {
+    /* Return NaN up front: every comparison below is false for NaN, so it
+       would otherwise reach the (int) conversion, which is undefined for
+       a NaN operand. */
+    return a + a;
+  }
+  if (a < 0.75) {
+    return 1.0 - erf(a);
+  } 
+  if (a > 27.3) {
+    return 0.0;
+  }
+  if (a < 5.0) {
+    double t;
+    t = 1.0 / a;
+    /* numerator polynomial in t = 1/a */
+    p =         1.9759923722227928E-008;
+    p = p * t - 1.0000002670474897E+000;
+    p = p * t - 7.4935303236347828E-001;
+    p = p * t - 1.5648136328071860E-001;
+    p = p * t + 1.2871196242447239E-001;
+    p = p * t + 1.1126459974811195E-001;
+    p = p * t + 4.0678642255914332E-002;
+    p = p * t + 7.9915414156678296E-003;
+    p = p * t + 7.1458332107840234E-004;
+    /* denominator polynomial in t = 1/a */
+    q =     t + 2.7493547525030619E+000;
+    q = q * t + 3.3984254815725423E+000;
+    q = q * t + 2.4635304979947761E+000;
+    q = q * t + 1.1405284734691286E+000;
+    q = q * t + 3.4130157606195649E-001;
+    q = q * t + 6.2250967676044953E-002;
+    q = q * t + 5.5661370941268700E-003;
+    q = q * t + 1.0575248365468671E-009;
+    p = p / q;
+    p = p * t;
+    /* h is a truncated to a multiple of 1/16 */
+    h = ((int)(a * 16.0)) * 0.0625;
+    l = (a - h) * (a + h);
+    q = exp(-h * h) * exp(-l);
+    q = q * 0.5;
+    p = p * q + q;
+    p = p * t;
+  } else {
+    double ooa, ooasq;
+
+    ooa = 1.0 / a;
+    ooasq = ooa * ooa;
+    /* polynomial in 1/(a*a) */
+    p =            -4.0025406686930527E+005;
+    p = p * ooasq + 1.4420582543942123E+005;
+    p = p * ooasq - 2.7664185780951841E+004;
+    p = p * ooasq + 4.1144611644767283E+003;
+    p = p * ooasq - 5.8706000519209351E+002;
+    p = p * ooasq + 9.1490086446323375E+001;
+    p = p * ooasq - 1.6659491387740221E+001;
+    p = p * ooasq + 3.7024804085481784E+000;
+    p = p * ooasq - 1.0578553994424316E+000;
+    p = p * ooasq + 4.2314218745087778E-001;
+    p = p * ooasq - 2.8209479177354962E-001;
+    p = p * ooasq + 5.6418958354775606E-001;
+    /* h is a truncated to a multiple of 1/16 */
+    h = ((int)(a * 16.0)) * 0.0625;
+    l = (a - h) * (a + h);
+    q = exp(-h * h) * exp(-l);
+    p = p * ooa;
+    p = p * q;
+  }
+  return p;
+}
+
+/*
+ * Host fallback for lgamma(): natural logarithm of the absolute value of the
+ * gamma function of a.
+ *
+ * NaN and infinite inputs return a * a. Otherwise a value t is computed for
+ * fa = |a| with a piecewise approximation:
+ *   fa >= 8          Stirling series in 1/fa
+ *   3 <= fa < 8      rational function of (fa - 3)
+ *   1.5 <= fa < 3    polynomial in (fa - 2)
+ *   0.7 <= fa < 1.5  polynomial in (1 - fa)
+ *   fa < 0.7         -log() of a polynomial in fa
+ * Non-negative a returns t directly. Negative a is handled at the end:
+ * integral inputs return infinity, everything else is corrected with
+ * log(pi / (|sin(pi*fa)| * fa)) - t.
+ */
+__func__(double lgamma(double a))
+{
+  double s;
+  double t;
+  double i;
+  double fa;
+  double sum;
+  long long int quot;
+  if (__isnan(a) || __isinf(a)) {
+    return a * a;
+  }
+  fa = fabs(a);
+  if (fa >= 3.0) {
+    if (fa >= 8.0) {
+      /* Stirling approximation; coefficients from Hart et al, "Computer 
+       * Approximations", Wiley 1968. Approximation 5404. 
+       */
+      s = 1.0 / fa;
+      t = s * s;
+      sum =          -0.1633436431e-2;
+      sum = sum * t + 0.83645878922e-3;
+      sum = sum * t - 0.5951896861197e-3;
+      sum = sum * t + 0.793650576493454e-3;
+      sum = sum * t - 0.277777777735865004e-2;
+      sum = sum * t + 0.833333333333331018375e-1;
+      sum = sum * s + 0.918938533204672;
+      /* The next lines add up to (fa - 0.5) * log(fa) - fa + sum; the
+       * logarithmic term is split into two equal halves that are added
+       * separately.
+       */
+      s = 0.5 * log (fa);
+      t = fa - 0.5;
+      s = s * t;
+      t = s - fa;
+      s = s + sum;
+      t = t + s;
+    } else {
+      /* 3 <= fa < 8: quotient of two polynomials in i = fa - 3, plus i */
+      i = fa - 3.0;
+      s =        -4.02412642744125560E+003;
+      s = s * i - 2.97693796998962000E+005;
+      s = s * i - 6.38367087682528790E+006;
+      s = s * i - 5.57807214576539320E+007;
+      s = s * i - 2.24585140671479230E+008;
+      s = s * i - 4.70690608529125090E+008;
+      s = s * i - 7.62587065363263010E+008;
+      s = s * i - 9.71405112477113250E+008;
+      t =     i - 1.02277248359873170E+003;
+      t = t * i - 1.34815350617954480E+005;
+      t = t * i - 4.64321188814343610E+006;
+      t = t * i - 6.48011106025542540E+007;
+      t = t * i - 4.19763847787431360E+008;
+      t = t * i - 1.25629926018000720E+009;
+      t = t * i - 1.40144133846491690E+009;
+      t = s / t;
+      t = t + i;
+    }
+  } else if (fa >= 1.5) {
+    /* 1.5 <= fa < 3: polynomial in i = fa - 2 with a final factor of i */
+    i = fa - 2.0;
+    t =         9.84839283076310610E-009;
+    t = t * i - 6.69743850483466500E-008;
+    t = t * i + 2.16565148880011450E-007;
+    t = t * i - 4.86170275781575260E-007;
+    t = t * i + 9.77962097401114400E-007;
+    t = t * i - 2.03041287574791810E-006;
+    t = t * i + 4.36119725805364580E-006;
+    t = t * i - 9.43829310866446590E-006;
+    t = t * i + 2.05106878496644220E-005;
+    t = t * i - 4.49271383742108440E-005;
+    t = t * i + 9.94570466342226000E-005;
+    t = t * i - 2.23154589559238440E-004;
+    t = t * i + 5.09669559149637430E-004;
+    t = t * i - 1.19275392649162300E-003;
+    t = t * i + 2.89051032936815490E-003;
+    t = t * i - 7.38555102806811700E-003;
+    t = t * i + 2.05808084278121250E-002;
+    t = t * i - 6.73523010532073720E-002;
+    t = t * i + 3.22467033424113040E-001;
+    t = t * i + 4.22784335098467190E-001;
+    t = t * i;
+  } else if (fa >= 0.7) {
+    /* 0.7 <= fa < 1.5: polynomial in i = 1 - fa with a final factor of i */
+    i = 1.0 - fa;
+    t =         1.17786911519331130E-002;  
+    t = t * i + 3.89046747413522300E-002;
+    t = t * i + 5.90045711362049900E-002;
+    t = t * i + 6.02143305254344420E-002;
+    t = t * i + 5.61652708964839180E-002;
+    t = t * i + 5.75052755193461370E-002;
+    t = t * i + 6.21061973447320710E-002;
+    t = t * i + 6.67614724532521880E-002;
+    t = t * i + 7.14856037245421020E-002;
+    t = t * i + 7.69311251313347100E-002;
+    t = t * i + 8.33503129714946310E-002;
+    t = t * i + 9.09538288991182800E-002;
+    t = t * i + 1.00099591546322310E-001;
+    t = t * i + 1.11334278141734510E-001;
+    t = t * i + 1.25509666613462880E-001;
+    t = t * i + 1.44049896457704160E-001;
+    t = t * i + 1.69557177031481600E-001;
+    t = t * i + 2.07385551032182120E-001;
+    t = t * i + 2.70580808427600350E-001;
+    t = t * i + 4.00685634386517050E-001;
+    t = t * i + 8.22467033424113540E-001;
+    t = t * i + 5.77215664901532870E-001;
+    t = t * i;
+  } else {
+    /* fa < 0.7: evaluate a polynomial in fa and return minus its logarithm */
+    t =         -9.04051686831357990E-008;
+    t = t * fa + 7.06814224969349250E-007;
+    t = t * fa - 3.80702154637902830E-007;
+    t = t * fa - 2.12880892189316100E-005;
+    t = t * fa + 1.29108470307156190E-004;
+    t = t * fa - 2.15932815215386580E-004;
+    t = t * fa - 1.16484324388538480E-003;
+    t = t * fa + 7.21883433044470670E-003;
+    t = t * fa - 9.62194579514229560E-003;
+    t = t * fa - 4.21977386992884450E-002;
+    t = t * fa + 1.66538611813682460E-001;
+    t = t * fa - 4.20026350606819980E-002;
+    t = t * fa - 6.55878071519427450E-001;
+    t = t * fa + 5.77215664901523870E-001;
+    t = t * fa;
+    t = t * fa + fa;
+    t = -log (t);
+  }
+  /* t now holds the result for |a|; everything below handles a < 0 */
+  if (a >= 0.0) return t;
+  /* tiny negative inputs: answer is -log(fa) directly */
+  if (fa < 1e-19) return -log(fa);
+  i = floor(fa);       
+  if (fa == i) return 1.0 / (fa - i); /* a is an integer: return infinity */
+  /* Reduce fa by the nearest multiple of 0.5 so that |sin(pi*fa)| can be
+   * evaluated as |sin| or |cos| of a small argument, selected by the parity
+   * of that multiple.
+   */
+  i = rint (2.0 * fa);
+  quot = static_cast<long long int>(i);
+  i = fa - 0.5 * i;
+  i = i * CUDART_PI;
+  if (quot & 1) {
+    i = cos(i);
+  } else {
+    i = sin(i);
+  }
+  i = fabs(i);
+  t = log(CUDART_PI / (i * fa)) - t;
+  return t;
+}
+
+/*
+ * Turns the tag string passed to nan()/nanf() into the bit pattern of a
+ * double-precision quiet NaN.
+ *
+ * The tag is read as an unsigned integer: a "0x"/"0X" prefix selects
+ * hexadecimal, a leading "0" selects octal, anything else is decimal. A NULL
+ * or empty tag gives payload 0. If the number does not fit in 64 bits the
+ * payload becomes all ones; if any character is not a valid digit the payload
+ * becomes 0 (this takes precedence over overflow). The low 52 bits of the
+ * payload are merged with 0x7ff8000000000000.
+ */
+__func__(unsigned long long int __internal_host_nan_kernel(const char *s))
+{
+  unsigned long long payload = 0;
+  bool overflow = false;
+  bool invalid = false;
+
+  if (s) {
+    if (*s == '0') {
+      ++s;
+      if ((*s == 'x') || (*s == 'X')) {
+        /* hexadecimal tag */
+        ++s;
+        while (*s == '0') ++s;
+        for (; *s; ++s) {
+          int digit = *s;
+          /* another digit would shift bits out of the 64-bit accumulator */
+          if (payload > 0x0fffffffffffffffULL) {
+            overflow = true;
+          }
+          if ((digit >= 'A') && (digit <= 'F')) {
+            digit = digit + 'a' - 'A';
+          }
+          if ((digit >= 'a') && (digit <= 'f')) {
+            payload = payload * 16 + (digit - 'a' + 10);
+          } else if ((digit >= '0') && (digit <= '9')) {
+            payload = payload * 16 + (digit - '0');
+          } else {
+            invalid = true;
+          }
+        }
+      } else {
+        /* octal tag */
+        while (*s == '0') ++s;
+        for (; *s; ++s) {
+          const int digit = *s;
+          if (payload > 0x1fffffffffffffffULL) {
+            overflow = true;
+          }
+          if ((digit >= '0') && (digit <= '7')) {
+            payload = payload * 8 + (digit - '0');
+          } else {
+            invalid = true;
+          }
+        }
+      }
+    } else {
+      /* decimal tag */
+      for (; *s; ++s) {
+        const int digit = *s;
+        /* 1844674407370955161 * 10 + 5 is the largest 64-bit value */
+        if ((payload > 1844674407370955161ULL) ||
+            ((payload == 1844674407370955161ULL) && (digit > '5'))) {
+          overflow = true;
+        }
+        if ((digit >= '0') && (digit <= '9')) {
+          payload = payload * 10 + (digit - '0');
+        } else {
+          invalid = true;
+        }
+      }
+    }
+  }
+
+  if (overflow) {
+    payload = ~0ULL;
+  }
+  if (invalid) {
+    payload = 0ULL;
+  }
+  return (payload & 0x000fffffffffffffULL) | 0x7ff8000000000000ULL;
+}
+
+/*
+ * Host fallback for nan(): returns the double whose bit pattern is produced
+ * by __internal_host_nan_kernel() for the given tag string.
+ */
+__func__(double nan(const char *tagp))
+{
+  const unsigned long long bits = __internal_host_nan_kernel(tagp);
+  double result;
+  memcpy(&result, &bits, sizeof(double));
+  return result;
+}
+
+/*
+ * Polynomial kernel used by tgamma() for small arguments: evaluates a fixed
+ * degree-15 polynomial in a with Horner's scheme (highest-order coefficient
+ * first, constant term 1.0 last).
+ */
+__func__(double __host_tgamma_kernel(double a))
+{
+  const double coeff[] = {
+    -4.4268934071252475E-010,
+    -2.0266591846658954E-007,
+     1.1381211721119527E-006,
+    -1.2507734816630748E-006,
+    -2.0136501740408771E-005,
+     1.2805012607354486E-004,
+    -2.1524140811527418E-004,
+    -1.1651675459704604E-003,
+     7.2189432248466381E-003,
+    -9.6219715326862632E-003,
+    -4.2197734554722394E-002,
+     1.6653861138250356E-001,
+    -4.2002635034105444E-002,
+    -6.5587807152025712E-001,
+     5.7721566490153287E-001,
+     1.0000000000000000E+000
+  };
+  const int count = static_cast<int>(sizeof(coeff) / sizeof(coeff[0]));
+  double acc = coeff[0];
+  for (int k = 1; k < count; k++) {
+    acc = acc * a + coeff[k];
+  }
+  return acc;
+}
+
+/*
+ * Correction polynomial used by the Stirling-based tgamma() paths: evaluates
+ * a fixed degree-9 polynomial in 1/a with Horner's scheme (constant term 1.0).
+ */
+__func__(double __host_stirling_poly(double a))
+{
+  const double coeff[] = {
+     8.3949872067208726e-004,
+    -5.1717909082605919e-005,
+    -5.9216643735369393e-004,
+     6.9728137583658571e-005,
+     7.8403922172006662e-004,
+    -2.2947209362139917e-004,
+    -2.6813271604938273e-003,
+     3.4722222222222220e-003,
+     8.3333333333333329e-002,
+     1.0000000000000000e+000
+  };
+  const int count = static_cast<int>(sizeof(coeff) / sizeof(coeff[0]));
+  const double recip = 1.0 / a;
+  double acc = coeff[0];
+  for (int k = 1; k < count; k++) {
+    acc = acc * recip + coeff[k];
+  }
+  return acc;
+}
+
+/*
+ * Stirling-based evaluation used by tgamma() for large positive arguments.
+ *
+ * Computes sqrt(2*pi) * a^(a - 0.5) * exp(-a) * __host_stirling_poly(a).
+ * For 142 <= a < 172 the power is applied as two factors of
+ * a^(0.5*a - 0.25), one before and one after the other multiplications.
+ * Any argument that is not below 172 returns exp(1000.0), i.e. infinity.
+ */
+__func__(double __host_tgamma_stirling(double a))
+{
+  const double correction = __host_stirling_poly (a);
+
+  if (a < 142.0) {
+    const double power = pow (a, a - 0.5);
+    double result = power * exp (-a);
+    result = result * CUDART_SQRT_2PI;
+    return result * correction;
+  }
+  if (a < 172.0) {
+    const double half_power = pow (a, 0.5 * a - 0.25);
+    double result = half_power * exp (-a);
+    result = result * CUDART_SQRT_2PI;
+    result = result * correction;
+    return result * half_power;
+  }
+  return exp(1000.0); /* INF */
+}
+
+/*
+ * Host fallback for tgamma(): gamma function of a.
+ *
+ * NaN input returns a + a. For |a| < 20 the argument is stepped towards zero
+ * one unit at a time while the visited values are multiplied into s, and the
+ * remaining small argument is passed to __host_tgamma_kernel(). For
+ * a >= 20 the result comes from __host_tgamma_stirling(). For a <= -20 the
+ * result combines __host_stirling_poly() with a carefully reduced
+ * sin(pi*x); below -185 a signed zero is returned. Negative integral inputs
+ * return 0.0 / (x - floor(x)).
+ */
+__func__(double tgamma(double a))
+{
+  double s, xx, x = a;
+  if (__isnan(a)) {
+    return a + a;
+  }
+  if (fabs(x) < 20.0) {
+    if (x >= 0.0) {
+      /* reduce xx to at most 1.5, accumulating the product of the reduced
+       * values in s
+       */
+      s = 1.0;
+      xx = x;
+      while (xx > 1.5) {
+        xx = xx - 1.0;
+        s = s * xx;
+      }
+      if (x >= 0.5) {
+        xx = xx - 1.0;
+      }
+      /* NOTE(review): the kernel appears to approximate 1/gamma(1 + xx);
+       * confirm against the source of its coefficients.
+       */
+      xx = __host_tgamma_kernel (xx);
+      if (x < 0.5) {
+        xx = xx * x;
+      }
+      s = s / xx;
+    } else {
+      xx = x;
+      s = xx;
+      /* negative integer: 0.0 / 0.0 */
+      if (x == floor(x)) {
+        return 0.0 / (x - floor(x));
+      }
+      /* raise xx to at least -0.5, accumulating the product in s */
+      while (xx < -0.5) {
+        xx = xx + 1.0;
+        s = s * xx;
+      }
+      xx = __host_tgamma_kernel (xx);
+      s = s * xx;
+      s = 1.0 / s;
+    }
+    return s;
+  } else {
+    if (x >= 0.0) {
+      return __host_tgamma_stirling (x);
+    } else {
+      double t;
+      int quot;
+      if (x == floor(x)) {
+        return 0.0 / (x - floor(x));
+      }
+      if (x < -185.0) {
+        /* result is a zero; its sign depends on whether floor(x) is odd */
+        int negative;
+        x = floor(x);
+        negative = ((x - (2.0 * floor(0.5 * x))) == 1.0);
+        return negative ? (-1.0 / 1e308 / 1e308) : CUDART_ZERO;
+      }
+      /* compute sin(pi*x) accurately */
+      xx = rint (2.0 * x);
+      quot = static_cast<int>(xx);
+      xx = -0.5 * xx + x;
+      xx = xx * CUDART_PI;
+      if (quot & 1) {
+        xx = cos (xx);
+      } else {
+        xx = sin (xx);
+      }
+      if (quot & 2) {
+        xx = -xx;
+      }
+      x = fabs (x);
+      s = exp (-x);
+      t = x - 0.5;
+      /* for x > 140 the power x^(x - 0.5) is applied as two factors of
+       * x^((x - 0.5) / 2): one multiplied into s here, one divided out at
+       * the end
+       */
+      if (x > 140.0) t = 0.5 * t;
+      t = pow (x, t);
+      if (x > 140.0) s = s * t;
+      s = s * __host_stirling_poly (x);
+      s = s * x;
+      s = s * xx;
+      s = 1.0 / s;
+      s = s * CUDART_SQRT_PIO2;
+      s = s / t;
+      return s;
+    }
+  }
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+/*******************************************************************************
+*                                                                              *
+* HOST IMPLEMENTATION FOR FLOAT AND LONG DOUBLE ROUTINES FOR WINDOWS PLATFORM  *
+* MAP FLOAT AND LONG DOUBLE ROUTINES TO DOUBLE ROUTINES                        *
+*                                                                              *
+*******************************************************************************/
+
+/* long double sign-bit test: narrows to double and defers to __signbit(). */
+__func__(int __signbitl(const long double a))
+{
+  const double narrowed = static_cast<double>(a);
+  return __signbit(narrowed);
+}
+
+/* float sign-bit test: widens to double and defers to __signbit(). */
+__func__(int __signbitf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return __signbit(wide);
+}
+
+/* long double finiteness test: narrows to double and defers to __finite(). */
+__func__(int __finitel(const long double a))
+{
+  const double narrowed = static_cast<double>(a);
+  return __finite(narrowed);
+}
+
+/* float finiteness test: widens to double and defers to __finite(). */
+__func__(int __finitef(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return __finite(wide);
+}
+
+/* long double infinity test: narrows to double and defers to __isinf(). */
+__func__(int __isinfl(const long double a))
+{
+  const double narrowed = static_cast<double>(a);
+  return __isinf(narrowed);
+}
+
+/* float infinity test: widens to double and defers to __isinf(). */
+__func__(int __isinff(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return __isinf(wide);
+}
+
+/* long double NaN test: narrows to double and defers to __isnan(). */
+__func__(int __isnanl(const long double a))
+{
+  const double narrowed = static_cast<double>(a);
+  return __isnan(narrowed);
+}
+
+/* float NaN test: widens to double and defers to __isnan(). */
+__func__(int __isnanf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return __isnan(wide);
+}
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+/* Single-precision fmax: evaluated by the double routine, result narrowed. */
+__func__(float fmaxf(const float a, const float b))
+{
+  const double wide_a = static_cast<double>(a);
+  const double wide_b = static_cast<double>(b);
+  return static_cast<float>(fmax(wide_a, wide_b));
+}
+
+/* Single-precision fmin: evaluated by the double routine, result narrowed. */
+__func__(float fminf(const float a, const float b))
+{
+  const double wide_a = static_cast<double>(a);
+  const double wide_b = static_cast<double>(b);
+  return static_cast<float>(fmin(wide_a, wide_b));
+}
+
+/* Single-precision round: evaluated by the double routine, result narrowed. */
+__func__(float roundf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(round(wide));
+}
+
+/* Single-precision lround: forwards the widened argument to lround(). */
+__func__(long int lroundf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return lround(wide);
+}
+
+/* Single-precision llround: forwards the widened argument to llround(). */
+__func__(long long int llroundf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return llround(wide);
+}
+
+/* Single-precision trunc: evaluated by the double routine, result narrowed. */
+__func__(float truncf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(trunc(wide));
+}
+
+/* Single-precision rint: evaluated by the double routine, result narrowed. */
+__func__(float rintf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(rint(wide));
+}
+
+/* Single-precision nearbyint: evaluated by the double routine, result narrowed. */
+__func__(float nearbyintf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(nearbyint(wide));
+}
+
+/* Single-precision lrint: forwards the widened argument to lrint(). */
+__func__(long int lrintf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return lrint(wide);
+}
+
+/* Single-precision llrint: forwards the widened argument to llrint(). */
+__func__(long long int llrintf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return llrint(wide);
+}
+
+/* Single-precision logb: evaluated by the double routine, result narrowed. */
+__func__(float logbf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(logb(wide));
+}
+
+/* Single-precision scalbln: evaluated by the double routine, result narrowed. */
+__func__(float scalblnf(const float a, const long int b))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(scalbln(wide, b));
+}
+
+/* Single-precision log2: evaluated by the double routine, result narrowed. */
+__func__(float log2f(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(log2(wide));
+}
+
+/* Single-precision exp2: evaluated by the double routine, result narrowed. */
+__func__(float exp2f(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(exp2(wide));
+}
+
+/* Single-precision acosh: evaluated by the double routine, result narrowed. */
+__func__(float acoshf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(acosh(wide));
+}
+
+/* Single-precision asinh: evaluated by the double routine, result narrowed. */
+__func__(float asinhf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(asinh(wide));
+}
+
+/* Single-precision atanh: evaluated by the double routine, result narrowed. */
+__func__(float atanhf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(atanh(wide));
+}
+
+/* Single-precision cbrt: evaluated by the double routine, result narrowed. */
+__func__(float cbrtf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(cbrt(wide));
+}
+
+/* Single-precision expm1: evaluated by the double routine, result narrowed. */
+__func__(float expm1f(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(expm1(wide));
+}
+
+/* Single-precision fdim: evaluated by the double routine, result narrowed. */
+__func__(float fdimf(const float a, const float b))
+{
+  const double wide_a = static_cast<double>(a);
+  const double wide_b = static_cast<double>(b);
+  return static_cast<float>(fdim(wide_a, wide_b));
+}
+
+/* Single-precision log1p: evaluated by the double routine, result narrowed. */
+__func__(float log1pf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(log1p(wide));
+}
+
+/* Single-precision scalbn: evaluated by the double routine, result narrowed. */
+__func__(float scalbnf(const float a, const int b))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(scalbn(wide, b));
+}
+
+/* Single-precision fma: evaluated by the double routine, result narrowed. */
+__func__(float fmaf(const float a, const float b, const float c))
+{
+  const double wide_a = static_cast<double>(a);
+  const double wide_b = static_cast<double>(b);
+  const double wide_c = static_cast<double>(c);
+  return static_cast<float>(fma(wide_a, wide_b, wide_c));
+}
+
+/* Single-precision ilogb: forwards the widened argument to ilogb(). */
+__func__(int ilogbf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return ilogb(wide);
+}
+
+/* Single-precision erf: evaluated by the double routine, result narrowed. */
+__func__(float erff(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(erf(wide));
+}
+
+/* Single-precision erfc: evaluated by the double routine, result narrowed. */
+__func__(float erfcf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(erfc(wide));
+}
+
+/* Single-precision lgamma: evaluated by the double routine, result narrowed. */
+__func__(float lgammaf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(lgamma(wide));
+}
+
+/* Single-precision tgamma: evaluated by the double routine, result narrowed. */
+__func__(float tgammaf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(tgamma(wide));
+}
+
+/*
+ * Single-precision remquo: evaluated by the double routine, result narrowed.
+ * The quotient pointer quo is passed through unchanged.
+ */
+__func__(float remquof(const float a, const float b, int *quo))
+{
+  const double wide_a = static_cast<double>(a);
+  const double wide_b = static_cast<double>(b);
+  return static_cast<float>(remquo(wide_a, wide_b, quo));
+}
+
+/* Single-precision remainder: evaluated by the double routine, result narrowed. */
+__func__(float remainderf(const float a, const float b))
+{
+  const double wide_a = static_cast<double>(a);
+  const double wide_b = static_cast<double>(b);
+  return static_cast<float>(remainder(wide_a, wide_b));
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (defined _MSC_VER) && (_MSC_VER >= 1700)
+/* Single-precision j0: forwards the widened argument to _j0(), result narrowed. */
+__func__(float j0f(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(_j0(wide));
+}
+
+/* Single-precision j1: forwards the widened argument to _j1(), result narrowed. */
+__func__(float j1f(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(_j1(wide));
+}
+
+/* Single-precision jn: forwards n and the widened argument to _jn(), result narrowed. */
+__func__(float jnf(const int n, const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(_jn(n, wide));
+}
+
+/* Single-precision y0: forwards the widened argument to _y0(), result narrowed. */
+__func__(float y0f(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(_y0(wide));
+}
+
+/* Single-precision y1: forwards the widened argument to _y1(), result narrowed. */
+__func__(float y1f(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(_y1(wide));
+}
+
+/* Single-precision yn: forwards n and the widened argument to _yn(), result narrowed. */
+__func__(float ynf(const int n, const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(_yn(n, wide));
+}
+#endif /* (defined _MSC_VER) && (_MSC_VER >= 1700) */
+
+
+/*******************************************************************************
+*                                                                              *
+* HOST IMPLEMENTATION FOR FLOAT ROUTINES FOR WINDOWS PLATFORM                  *
+*                                                                              *
+*******************************************************************************/
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+/*
+ * Host fallback for copysignf(): returns a with its sign bit replaced by the
+ * sign bit of b. Works directly on the 32-bit patterns of both arguments.
+ */
+__func__(float copysignf(float a, const float b))
+{
+  const unsigned int sign_mask = 0x80000000U;
+  unsigned int magnitude_bits;
+  unsigned int sign_bits;
+
+  memcpy(&magnitude_bits, &a, sizeof(float));
+  memcpy(&sign_bits, &b, sizeof(float));
+  magnitude_bits = (magnitude_bits & ~sign_mask) | (sign_bits & sign_mask);
+  memcpy(&a, &magnitude_bits, sizeof(float));
+  return a;
+}
+
+/*
+ * Host fallback for nextafterf(): the representable float adjacent to a in
+ * the direction of b.
+ *
+ * A NaN in either argument returns a + b. If both arguments are zeros (of
+ * either sign) b is returned. From a zero the result is the smallest
+ * denormal carrying the sign of b. Otherwise the bit pattern of a is stepped
+ * by one, or left untouched when a == b.
+ */
+__func__(float nextafterf(float a, const float b))
+{
+  unsigned int a_bits;
+  unsigned int b_bits;
+
+  memcpy(&a_bits, &a, sizeof(float));
+  memcpy(&b_bits, &b, sizeof(float));
+  if (__isnanf(a) || __isnanf(b)) return a + b; /*NaN*/
+  /* shifting out the sign bit leaves zero only when both inputs are zeros */
+  if (((a_bits | b_bits) << 1U) == 0U) return b;
+  if (a == 0.0F) {
+    return copysignf(1.401298464e-045F, b); /*crossover*/
+  }
+  if (a != b) {
+    /* a is non-zero and not NaN here. Moving away from zero increments the
+     * pattern, moving towards zero decrements it.
+     */
+    const bool moving_up = (a < b);
+    const bool is_positive = (a > 0.0F);
+    if (moving_up == is_positive) {
+      a_bits++;
+    } else {
+      a_bits--;
+    }
+  }
+  memcpy(&a, &a_bits, sizeof(float));
+  return a;
+}
+
+/*
+ * Host fallback for nanf(): builds a single-precision quiet NaN whose low 23
+ * mantissa bits come from the low bits of __internal_host_nan_kernel(tagp).
+ */
+__func__(float nanf(const char *tagp))
+{
+  const unsigned long long wide_bits = __internal_host_nan_kernel(tagp);
+  unsigned int bits = static_cast<unsigned int>(wide_bits);
+  float result;
+
+  bits = (bits & 0x007fffffU) | 0x7fc00000U;
+  memcpy(&result, &bits, sizeof(float));
+  return result;
+}
+
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#endif /* _WIN32 */
+
+/*******************************************************************************
+*                                                                              *
+* HOST IMPLEMENTATION FOR DOUBLE AND FLOAT ROUTINES. ALL PLATFORMS             *
+*                                                                              *
+*******************************************************************************/
+
+/* Host implementation of rsqrt(): reciprocal of the square root of a. */
+__func__(double rsqrt(const double a))
+{
+  const double root = sqrt(a);
+  return 1.0 / root;
+}
+
+/*
+ * Host implementation of rcbrt(): reciprocal of the cube root of a.
+ *
+ * NaN returns a + a; zeros and infinities return 1.0 / a. Otherwise an
+ * initial estimate exp2(-log2(|a|) / 3) is refined with one correction step
+ * and the sign of a is applied to the result.
+ */
+__func__(double rcbrt(const double a))
+{
+  if (__isnan(a)) {
+    return a + a;
+  }
+  if (a == 0.0 || __isinf(a)) {
+    return 1.0 / a;
+  }
+
+  const double magnitude = fabs(a);
+  /* initial approximation */
+  double estimate = exp2(-CUDART_THIRD * log2(magnitude));
+  /* refine approximation */
+  estimate = ((estimate*estimate) * (-magnitude*estimate) + 1.0) * (CUDART_THIRD*estimate) + estimate;
+#if defined(__APPLE__)
+  const int is_negative = __signbitd(a);
+#else /* __APPLE__ */
+  const int is_negative = __signbit(a);
+#endif /* __APPLE__ */
+  return is_negative ? -estimate : estimate;
+}
+
+/*
+ * Host implementation of sinpi(): sine of (a * pi).
+ *
+ * NaN returns a + a; zeros and infinities are passed straight to sin().
+ * Integral a returns a zero carrying the sign of a. Otherwise a is reduced by
+ * the nearest multiple of 0.5 and the low two bits of that multiple select
+ * between sin/cos and the sign of the result.
+ */
+__func__(double sinpi(double a))
+{
+  if (__isnan(a)) {
+    return a + a;
+  }
+  if (a == 0.0 || __isinf(a)) {
+    return sin (a);
+  }
+  if (floor(a) == a) {
+    /* dividing three times by 1e308 underflows any finite value to zero */
+    return ((a / 1.0e308) / 1.0e308) / 1.0e308;
+  }
+
+  const double doubled = a + a;
+  const double nearest = round(doubled);
+  const long long int multiple = (long long int)nearest;
+  const int quadrant = (int)multiple;
+  double reduced = a - nearest * 0.5;
+  reduced = reduced * CUDART_PI;
+
+  double result;
+  if (quadrant & 1) {
+    result = cos (reduced);
+  } else {
+    result = sin (reduced);
+  }
+  return (quadrant & 2) ? -result : result;
+}
+
+/*
+ * Host implementation of cospi(): cosine of (a * pi).
+ *
+ * NaN returns a + a; infinities are passed straight to cos(). Inputs whose
+ * magnitude exceeds 2^53 are replaced by 0.0 before reduction. Otherwise a
+ * is reduced by the nearest multiple of 0.5; the multiple, shifted by one
+ * quadrant, selects between sin/cos and the sign of the result. A zero
+ * result is always returned as +0.0.
+ *
+ * The quadrant counter is unsigned: the original signed counter overflowed
+ * (undefined behaviour) on the increment whenever the truncated multiple was
+ * INT_MAX, e.g. for a = 1073741823.5. Only the low two bits are consumed, so
+ * modular unsigned arithmetic gives the same quadrant for every input.
+ */
+__func__(double cospi(double a))
+{
+  unsigned int n;
+
+  if (__isnan(a)) {
+    return a + a;
+  }
+  if (__isinf(a)) {
+    return cos (a);
+  } 
+  /* 9.0071992547409920e+015 == 2^53 */
+  if (fabs(a) > 9.0071992547409920e+015) {
+    a = 0.0;
+  }
+  double twoa = a + a;
+  double rtwoa = round(twoa);
+  long long int l = (long long int)rtwoa;
+  n = (unsigned int)l;
+  a -= rtwoa * 0.5;
+  a = a * CUDART_PI;
+  /* shift by one quadrant (wraps modulo 2^32, which is well defined) */
+  n++;
+  if (n & 1U) {
+    a = cos (a);
+  } else {
+    a = sin (a);
+  }
+  if (n & 2U) {
+    a = -a;
+  }
+  /* never return -0.0 */
+  if (a == 0.0) {
+    a = fabs(a);
+  }
+  return a;
+}
+
+/*
+ * Host implementation of sincospi(): stores sinpi(a) through sptr and
+ * cospi(a) through cptr, in that order.
+ */
+__func__(void sincospi(const double a, double *sptr, double *cptr))
+{
+  const double sine = sinpi(a);
+  *sptr = sine;
+  const double cosine = cospi(a);
+  *cptr = cosine;
+}
+
+/*
+ * Host implementation of erfinv(): inverse error function of a.
+ *
+ * With fa = |a|:
+ *   fa > 1             returns the INDEFINITE NaN pattern 0xfff8000000000000
+ *   fa == 1            returns a * exp(1000.0), i.e. an infinity with the
+ *                      sign of a
+ *   0.9375 <= fa < 1   rational approximation in t = 1/sqrt(-log1p(-fa))
+ *   0.75 <= fa < 0.9375  rational approximation in t = a*a - 0.87890625
+ *   fa < 0.75          rational approximation in t = a*a - 0.5625
+ * There is no explicit NaN test: a NaN input fails every comparison and
+ * propagates through the last branch.
+ */
+__func__(double erfinv(const double a))
+{
+  double p, q, t, fa;
+  unsigned long long int l;
+
+  fa = fabs(a);
+  if (fa >= 1.0) {
+    l = 0xfff8000000000000ULL;
+    memcpy(&t, &l, sizeof(double)); /* INDEFINITE */
+    if (fa == 1.0) {
+      t = a * exp(1000.0);          /* Infinity */
+    }
+  } else if (fa >= 0.9375) {
+    /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
+       Approximations for the Inverse of the Error Function. Mathematics of
+       Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59
+     */
+    t = log1p(-fa);
+    t = 1.0 / sqrt(-t);
+    p =         2.7834010353747001060e-3;
+    p = p * t + 8.6030097526280260580e-1;
+    p = p * t + 2.1371214997265515515e+0;
+    p = p * t + 3.1598519601132090206e+0;
+    p = p * t + 3.5780402569085996758e+0;
+    p = p * t + 1.5335297523989890804e+0;
+    p = p * t + 3.4839207139657522572e-1;
+    p = p * t + 5.3644861147153648366e-2;
+    p = p * t + 4.3836709877126095665e-3;
+    p = p * t + 1.3858518113496718808e-4;
+    p = p * t + 1.1738352509991666680e-6;
+    q =     t + 2.2859981272422905412e+0;
+    q = q * t + 4.3859045256449554654e+0;
+    q = q * t + 4.6632960348736635331e+0;
+    q = q * t + 3.9846608184671757296e+0;
+    q = q * t + 1.6068377709719017609e+0;
+    q = q * t + 3.5609087305900265560e-1;
+    q = q * t + 5.3963550303200816744e-2;
+    q = q * t + 4.3873424022706935023e-3;
+    q = q * t + 1.3858762165532246059e-4;
+    q = q * t + 1.1738313872397777529e-6;
+    t = p / (q * t);
+    /* this branch works on |a|, so restore the sign of a */
+    if (a < 0.0) t = -t;
+  } else if (fa >= 0.75) {
+    /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
+       Approximations for the Inverse of the Error Function. Mathematics of
+       Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 39
+    */
+    t = a * a - .87890625;
+    p =         .21489185007307062000e+0;
+    p = p * t - .64200071507209448655e+1;
+    p = p * t + .29631331505876308123e+2;
+    p = p * t - .47644367129787181803e+2;
+    p = p * t + .34810057749357500873e+2;
+    p = p * t - .12954198980646771502e+2;
+    p = p * t + .25349389220714893917e+1;
+    p = p * t - .24758242362823355486e+0;
+    p = p * t + .94897362808681080020e-2;
+    q =     t - .12831383833953226499e+2;
+    q = q * t + .41409991778428888716e+2;
+    q = q * t - .53715373448862143349e+2;
+    q = q * t + .33880176779595142685e+2;
+    q = q * t - .11315360624238054876e+2;
+    q = q * t + .20369295047216351160e+1;
+    q = q * t - .18611650627372178511e+0;
+    q = q * t + .67544512778850945940e-2;
+    p = p / q;
+    /* multiplying by a (not fa) carries the sign of the input */
+    t = a * p;
+  } else {
+    /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
+       Approximations for the Inverse of the Error Function. Mathematics of
+       Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 18
+    */
+    t = a * a - .5625;
+    p =       - .23886240104308755900e+2;
+    p = p * t + .45560204272689128170e+3;
+    p = p * t - .22977467176607144887e+4;
+    p = p * t + .46631433533434331287e+4;
+    p = p * t - .43799652308386926161e+4;
+    p = p * t + .19007153590528134753e+4;
+    p = p * t - .30786872642313695280e+3;
+    q =     t - .83288327901936570000e+2;
+    q = q * t + .92741319160935318800e+3;
+    q = q * t - .35088976383877264098e+4;
+    q = q * t + .59039348134843665626e+4;
+    q = q * t - .48481635430048872102e+4;
+    q = q * t + .18997769186453057810e+4;
+    q = q * t - .28386514725366621129e+3;
+    p = p / q;
+    t = a * p;
+  }
+  return t;
+}
+
+/*
+ * Host implementation of erfcinv(): inverse complementary error function.
+ *
+ *   NaN               returns a + a
+ *   a < 0             returns the INDEFINITE NaN pattern 0xfff8000000000000
+ *   a == 0            returns (1.0 - a) * exp(1000.0), i.e. +infinity
+ *   a >= 0.0625       delegates to erfinv(1.0 - a)
+ *   1e-100 <= a < 0.0625  rational approximation in t = 1/sqrt(-log(a))
+ *   0 < a < 1e-100    a second rational approximation in the same variable
+ */
+__func__(double erfcinv(const double a))
+{
+  double t;
+  unsigned long long int l;
+
+  if (__isnan(a)) {
+    return a + a;
+  }
+  if (a <= 0.0) {
+    l = 0xfff8000000000000ULL;
+    memcpy(&t, &l, sizeof(double));   /* INDEFINITE */
+    if (a == 0.0) {
+        t = (1.0 - a) * exp(1000.0);  /* Infinity */
+    }
+  } 
+  else if (a >= 0.0625) {
+    t = erfinv (1.0 - a);
+  }
+  else if (a >= 1e-100) {
+    /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
+       Approximations for the Inverse of the Error Function. Mathematics of
+       Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59
+    */
+    double p, q;
+    t = log(a);
+    t = 1.0 / sqrt(-t);
+    p =         2.7834010353747001060e-3;
+    p = p * t + 8.6030097526280260580e-1;
+    p = p * t + 2.1371214997265515515e+0;
+    p = p * t + 3.1598519601132090206e+0;
+    p = p * t + 3.5780402569085996758e+0;
+    p = p * t + 1.5335297523989890804e+0;
+    p = p * t + 3.4839207139657522572e-1;
+    p = p * t + 5.3644861147153648366e-2;
+    p = p * t + 4.3836709877126095665e-3;
+    p = p * t + 1.3858518113496718808e-4;
+    p = p * t + 1.1738352509991666680e-6;
+    q =     t + 2.2859981272422905412e+0;
+    q = q * t + 4.3859045256449554654e+0;
+    q = q * t + 4.6632960348736635331e+0;
+    q = q * t + 3.9846608184671757296e+0;
+    q = q * t + 1.6068377709719017609e+0;
+    q = q * t + 3.5609087305900265560e-1;
+    q = q * t + 5.3963550303200816744e-2;
+    q = q * t + 4.3873424022706935023e-3;
+    q = q * t + 1.3858762165532246059e-4;
+    q = q * t + 1.1738313872397777529e-6;
+    t = p / (q * t);
+  }
+  else {
+    /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
+       Approximations for the Inverse of the Error Function. Mathematics of
+       Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 82
+    */
+    double p, q;
+    t = log(a);
+    t = 1.0 / sqrt(-t);
+    p =         6.9952990607058154858e-1;
+    p = p * t + 1.9507620287580568829e+0;
+    p = p * t + 8.2810030904462690216e-1;
+    p = p * t + 1.1279046353630280005e-1;
+    p = p * t + 6.0537914739162189689e-3;
+    p = p * t + 1.3714329569665128933e-4;
+    p = p * t + 1.2964481560643197452e-6;
+    p = p * t + 4.6156006321345332510e-9;
+    p = p * t + 4.5344689563209398450e-12;
+    q =     t + 1.5771922386662040546e+0;
+    q = q * t + 2.1238242087454993542e+0;
+    q = q * t + 8.4001814918178042919e-1;
+    q = q * t + 1.1311889334355782065e-1;
+    q = q * t + 6.0574830550097140404e-3;
+    q = q * t + 1.3715891988350205065e-4;
+    q = q * t + 1.2964671850944981713e-6;
+    q = q * t + 4.6156017600933592558e-9;
+    q = q * t + 4.5344687377088206783e-12;
+    t = p / (q * t);
+  }
+  return t;
+}
+
+/*
+ * Host implementation of normcdfinv(): computed as
+ * -1.4142135623730951 * erfcinv(2 * a).
+ */
+__func__(double normcdfinv(const double a))
+{
+  const double doubled = a + a;
+  return -1.4142135623730951 * erfcinv(doubled);
+}
+
+/*
+ * Host implementation of normcdf(): computed as 0.5 * erfc(-a * sqrt(0.5)).
+ *
+ * The product -a * sqrt(0.5) is formed as a head t1 plus a tail t2 so that
+ * its rounding error is available; for a < -1 that error is used to apply a
+ * correction to the erfc() result.
+ */
+__func__(double normcdf(double a))
+{
+  double ah, al, t1, t2, u1, u2, v1, v2, z;
+  /* clamp the magnitude of the argument to 38.5, keeping its sign */
+  if (fabs (a) > 38.5) a = copysign (38.5, a);
+  /* split a into u1 + u2 using the multiplier 134217729.0 (2^27 + 1) */
+  ah = a * 134217729.0;
+  u1 = (a - ah) + ah;
+  u2 = a - u1;
+  /* NOTE(review): v1 + v2 looks like the matching split of
+   * -CUDART_SQRT_HALF_HI; confirm against that constant's definition.
+   */
+  v1 = -7.0710678398609161e-01;
+  v2 =  2.7995440410322203e-09;
+  t1 = a * -CUDART_SQRT_HALF_HI;
+  /* t2 accumulates the partial products minus t1, then the contribution of
+   * the low part of the constant
+   */
+  t2 = (((u1 * v1 - t1) + u1 * v2) + u2 * v1) + u2 * v2;
+  t2 = (a * -CUDART_SQRT_HALF_LO) + t2;
+  ah = t1 + t2;
+  z = erfc (ah);
+  if (a < -1.0) {
+    /* al is what was lost when t1 + t2 was rounded to ah */
+    al = (t1 - ah) + t2;
+    t1 = -2.0 * ah * z;
+    z = t1 * al + z;
+  }
+  return 0.5 * z;
+}
+
+/*
+ * Host implementation of erfcx(): scaled complementary error function,
+ * exp(a*a) * erfc(a).
+ *
+ * NaN returns a + a. With x = |a|, erfcx(x) comes from a polynomial
+ * approximation for x < 32 and from an asymptotic expansion otherwise.
+ * Negative a is then mapped through erfcx(a) = 2*exp(a*a) - erfcx(|a|).
+ *
+ * For a <= -27 the term 2*exp(a*a) already exceeds the double range, so
+ * +infinity is returned directly. Without this guard the conversion
+ * static_cast<int>(x * 16.0) below is undefined behaviour once x * 16.0 no
+ * longer fits in an int (including a == -infinity). Inputs with
+ * -27 < a < 0 take exactly the same path as before.
+ */
+__func__(double erfcx(const double a))
+{
+  double x, t1, t2, t3;
+
+  if (__isnan(a)) {
+    return a + a;
+  }
+  x = fabs(a); 
+  if ((a < 0.0) && (x >= 27.0)) {
+    /* 27 * 27 = 729 is above the overflow threshold of exp() (about 709.8) */
+    return exp(1000.0); /* INF */
+  }
+  if (x < 32.0) {
+    /*  
+     * This implementation of erfcx() is based on the algorithm in: M. M. 
+     * Shepherd and J. G. Laframboise, "Chebyshev Approximation of (1 + 2x)
+     * exp(x^2)erfc x in 0 <= x < INF", Mathematics of Computation, Vol. 
+     * 36, No. 153, January 1981, pp. 249-253. For the core approximation,
+     * the input domain [0,INF] is transformed via (x-k) / (x+k) where k is
+     * a precision-dependent constant. Here, we choose k = 4.0, so the input 
+     * domain [0, 27.3] is transformed into the core approximation domain 
+     * [-1, 0.744409].   
+     */
+    /*
+    // Compute (1+2*x)*exp(x*x)*erfc(x)
+    */
+    /* t2 = (x-4.0)/(x+4.0), transforming [0,INF] to [-1,+1] */ 
+    t1 = x - 4.0; 
+    t2 = x + 4.0; 
+    t2 = t1 / t2;
+    /* approximate on [-1, 0.744409] */   
+    t1 =         - 3.5602694826817400E-010; 
+    t1 = t1 * t2 - 9.7239122591447274E-009; 
+    t1 = t1 * t2 - 8.9350224851649119E-009; 
+    t1 = t1 * t2 + 1.0404430921625484E-007; 
+    t1 = t1 * t2 + 5.8806698585341259E-008; 
+    t1 = t1 * t2 - 8.2147414929116908E-007; 
+    t1 = t1 * t2 + 3.0956409853306241E-007; 
+    t1 = t1 * t2 + 5.7087871844325649E-006; 
+    t1 = t1 * t2 - 1.1231787437600085E-005; 
+    t1 = t1 * t2 - 2.4399558857200190E-005; 
+    t1 = t1 * t2 + 1.5062557169571788E-004; 
+    t1 = t1 * t2 - 1.9925637684786154E-004; 
+    t1 = t1 * t2 - 7.5777429182785833E-004; 
+    t1 = t1 * t2 + 5.0319698792599572E-003; 
+    t1 = t1 * t2 - 1.6197733895953217E-002; 
+    t1 = t1 * t2 + 3.7167515553018733E-002; 
+    t1 = t1 * t2 - 6.6330365827532434E-002; 
+    t1 = t1 * t2 + 9.3732834997115544E-002; 
+    t1 = t1 * t2 - 1.0103906603555676E-001; 
+    t1 = t1 * t2 + 6.8097054254735140E-002; 
+    t1 = t1 * t2 + 1.5379652102605428E-002; 
+    t1 = t1 * t2 - 1.3962111684056291E-001; 
+    t1 = t1 * t2 + 1.2329951186255526E+000; 
+    /*
+    // Note: (1+2*x)*exp(x*x)*erfc(x) / (1+2*x) = exp(x*x)*erfc(x)
+    */
+    t2 = 2.0 * x + 1.0; 
+    t1 = t1 / t2;
+  } else {
+    /* asymptotic expansion for large arguments */
+    t2 = 1.0 / x;
+    t3 = t2 * t2;
+    t1 =         -29.53125;
+    t1 = t1 * t3 + 6.5625;
+    t1 = t1 * t3 - 1.875;
+    t1 = t1 * t3 + 0.75;
+    t1 = t1 * t3 - 0.5;
+    t1 = t1 * t3 + 1.0;
+    t2 = t2 * 5.6418958354775628e-001;
+    t1 = t1 * t2;
+  }
+  if (a < 0.0) {
+    /*
+    // Note: erfcx(x) = 2*exp(x^2) - erfcx(|x|)
+    */
+    /* x < 27 here, so x * 16.0 < 432 and the int conversion is in range.
+     * exp(x*x) is formed as exp(t2*t2) * exp((x-t2)*(x+t2)), where t2 is x
+     * truncated to a multiple of 1/16.
+     */
+    t2 = (static_cast<int>(x * 16.0)) * 0.0625;
+    t3 = (x - t2) * (x + t2);
+    t3 = exp(t2 * t2) * exp(t3);
+    t3 = t3 + t3;
+    t1 = t3 - t1;
+  }
+  return t1;
+}
+
+/* Single-precision rsqrt: evaluated by the double routine, result narrowed. */
+__func__(float rsqrtf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(rsqrt(wide));
+}
+
+/* Single-precision rcbrt: evaluated by the double routine, result narrowed. */
+__func__(float rcbrtf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(rcbrt(wide));
+}
+
+/* Single-precision sinpi: evaluated by the double routine, result narrowed. */
+__func__(float sinpif(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(sinpi(wide));
+}
+
+/* Single-precision cospi: evaluated by the double routine, result narrowed. */
+__func__(float cospif(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(cospi(wide));
+}
+
+/*
+ * Single-precision sincospi: calls the double routine on the widened
+ * argument, then stores the narrowed sine through sptr and the narrowed
+ * cosine through cptr, in that order.
+ */
+__func__(void sincospif(const float a, float *sptr, float *cptr))
+{
+  double sine;
+  double cosine;
+  const double wide = static_cast<double>(a);
+
+  sincospi(wide, &sine, &cosine);
+  *sptr = static_cast<float>(sine);
+  *cptr = static_cast<float>(cosine);
+}
+
+/* Single-precision erfinv: evaluated by the double routine, result narrowed. */
+__func__(float erfinvf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(erfinv(wide));
+}
+
+/* Single-precision erfcinv: evaluated by the double routine, result narrowed. */
+__func__(float erfcinvf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(erfcinv(wide));
+}
+
+/* Single-precision normcdfinv: evaluated by the double routine, result narrowed. */
+__func__(float normcdfinvf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(normcdfinv(wide));
+}
+
+/* Single-precision normcdf: evaluated by the double routine, result narrowed. */
+__func__(float normcdff(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(normcdf(wide));
+}
+
+/* Single-precision erfcx: evaluated by the double routine, result narrowed. */
+__func__(float erfcxf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(erfcx(wide));
+}
+
+#if defined(_WIN32)
+#pragma warning (pop)
+#endif /* _WIN32 */
+
+#endif /* !__CUDACC__ */
+
+#endif /* !__MATH_FUNCTIONS_HPP__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_HPP__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_HPP__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/mma.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/mma.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d0b79141f65461e0384f34f9e30c482969041ca
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/mma.h
@@ -0,0 +1,761 @@
+/*
+ * Copyright 2017-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/mma.h is an internal header file and must not be used directly.  Please use mma.h instead.")
+#else
+#warning "crt/mma.h is an internal header file and must not be used directly.  Please use mma.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H__
+#endif
+
+#if !defined(__CUDA_MMA_H__)
+#define __CUDA_MMA_H__
+
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+
+#define __CUDA_MMA_DEVICE_DECL__ static __device__ __inline__
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+
+
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { } /* neither macro defined: prototypes ending in __DEF_IF_HOST become empty-bodied definitions */
+#else  /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
+#define __DEF_IF_HOST ; /* otherwise: the same prototypes are plain declarations with no body in this header */
+#endif /* __CUDA_ARCH__ || _NVHPC_CUDA */
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 720
+#define __CUDA_IMMA__ 1
+#endif  /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 720 */
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 730
+#define __CUDA_SUBBYTE_IMMA__ 1
+#endif  /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 730 */
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
+#define __CUDA_AMPERE_MMA__ 1
+#endif  /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 */
+
+namespace nvcuda {
+namespace wmma {
+  
+  // utility functions
+#ifdef __CUDA_AMPERE_MMA__
+  inline __device__ float __float_to_tf32(float in) 
+  { /* Runs PTX cvt.rna.tf32.f32 on `in` via a scoped temporary .b32 register and returns those 32 bits as a float. */
+    float ret; 
+    asm("{\n  .reg .b32 __$1;"
+        "\n   cvt.rna.tf32.f32 __$1, %1;"
+        "\n   mov.b32 %0, __$1;\n}\n" : "=f"(ret) : "f"(in) ); /* %0 is ret (output), %1 is in (input) */
+    return ret; 
+  }
+#endif  /* __CUDA_AMPERE_MMA__ */  
+  
+  // tags: forward-declared structs that are only named as template arguments below.
+  // row_major / col_major fill the Layout parameter of fragment<>;
+  // matrix_a / matrix_b / accumulator fill its Use parameter.
+  struct row_major;
+  struct col_major;
+  struct matrix_a;
+  struct matrix_b;
+  struct accumulator;
+
+#ifdef __CUDA_AMPERE_MMA__
+  namespace precision {
+    struct tf32;
+  }
+#endif  /* __CUDA_AMPERE_MMA__ */  
+#ifdef __CUDA_SUBBYTE_IMMA__
+  namespace experimental { // sub-byte element tags plus the bmma operation enums; only present under __CUDA_SUBBYTE_IMMA__
+    namespace precision {
+      struct u4; // 4-bit unsigned
+      struct s4; // 4-bit signed
+      struct b1; // 1-bit
+    }
+    enum bmmaBitOp { bmmaBitOpXOR = 1 // the AND enumerator below is added only when __CUDA_AMPERE_MMA__ is defined
+#ifdef __CUDA_AMPERE_MMA__
+                    , bmmaBitOpAND = 2
+#endif  /* __CUDA_AMPERE_MMA__ */
+    };
+    enum bmmaAccumulateOp { bmmaAccumulateOpPOPC = 1 }; // single enumerator; its consumers are not visible in this part of the header
+  }
+#endif  /* __CUDA_SUBBYTE_IMMA__ */
+
+  // layout: run-time layout selector taken as the `layout` argument by the
+  // accumulator overloads of load_matrix_sync / store_matrix_sync declared below.
+  // No explicit values are given, so mem_row_major == 0 and mem_col_major == 1.
+  enum layout_t {
+    mem_row_major, mem_col_major
+  };
+  
+  template <typename T> // default traits: element, storage and fill-argument types are all T itself
+  struct helper_traits {
+    typedef T element_type;           // type named in fragment<...>
+    typedef T storage_element_type;   // type of each slot of __frag_base::x
+    typedef T fill_argument_type;     // type of the value accepted by fill_fragment
+  };
+
+#ifdef __CUDA_SUBBYTE_IMMA__
+  template<> struct helper_traits<experimental::precision::u4> { // u4 tag: stored in and filled from unsigned int
+    typedef experimental::precision::u4 element_type;
+    typedef unsigned int storage_element_type;
+    typedef unsigned int fill_argument_type;
+  };
+
+  template<> struct helper_traits<experimental::precision::s4> { // s4 tag: stored in and filled from int
+    typedef experimental::precision::s4 element_type;
+    typedef int storage_element_type;
+    typedef int fill_argument_type;
+  };
+  
+  template<> struct helper_traits<experimental::precision::b1> { // b1 tag: stored in and filled from unsigned int
+    typedef experimental::precision::b1 element_type;
+    typedef unsigned int storage_element_type;
+    typedef unsigned int fill_argument_type;
+  };
+#endif /* __CUDA_SUBBYTE_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  template<> struct helper_traits<precision::tf32> { // tf32 tag: stored in and filled from float
+    typedef precision::tf32 element_type;
+    typedef float storage_element_type;
+    typedef float fill_argument_type;
+  };
+#endif  /* __CUDA_AMPERE_MMA__ */
+  
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable:4324)
+#endif
+  // The base fragment type: a plain struct whose only data member is the storage array x.
+  // T is the element (tag) type, size the logical element count, and packed_size the
+  // number of storage slots (defaults to size, i.e. one element per slot).
+  /* note: alignment required for compiler implementation */
+  template <typename T, int size, int packed_size = size> 
+  struct __align__(8) __frag_base {
+
+    /* Number of elements in the fragment */
+    enum {num_elements = size};
+    
+    /* Number of storage elements in the fragment. 
+
+       The elements of the fragment are packed together when the 
+       fragment element type is experimental::precision::u4, 
+       experimental::precision::s4 or experimental::precision::b1.
+       When elements are packed, num_storage_elements 
+       will be smaller than num_elements.
+    */
+    enum {num_storage_elements = packed_size};
+
+    /* element type of the fragment */
+    typedef T element_type;
+
+    /* element type of the storage representation, taken from helper_traits<T>. 
+    
+       The mapping from element_type to storage_element_type is as follows:
+       experimental::precision::u4 -> unsigned (8 elements in 1 storage element)
+       experimental::precision::s4 -> int (8 elements in 1 storage element)
+       experimental::precision::b1 -> unsigned (32 elements in 1 storage element)
+       precision::tf32             -> float (1 element in 1 storage element)       
+       all other types T           -> T
+    */
+    typedef typename helper_traits<T>::storage_element_type storage_element_type;
+
+    /* Storage for the (possibly packed) fragment elements; length is num_storage_elements, not num_elements. */
+    storage_element_type x[num_storage_elements];
+  };
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+  template <typename FragEleType, typename StorageType, typename ArgType> // generic case: no packing, FragEleType is unused
+  static inline __device__ StorageType __get_storage_value(ArgType in) { return in; } // implicit ArgType -> StorageType conversion
+
+#ifdef __CUDA_SUBBYTE_IMMA__
+  template<>
+  __device__ inline unsigned 
+  __get_storage_value<experimental::precision::u4, unsigned, unsigned>(unsigned in)
+  {
+    /* For experimental::precision::u4 fragment element type, pack 8 elements into a single 
+       32-bit unsigned int storage element: the low 4 bits of `in` are replicated into all 8 nibbles */
+    unsigned val = in & 0xf;
+    return (val | (val << 4) | (val << 8) | (val << 12) | (val << 16) |
+            (val << 20) | (val << 24) | (val << 28));
+  }
+
+  template<>
+  __device__ inline int
+  __get_storage_value<experimental::precision::s4, int, int>(int in)
+  {
+    /* For experimental::precision::s4 fragment element type, pack 8 elements into a single 
+       32-bit signed int storage element; replicate in unsigned so no shift reaches an int sign bit */
+    const unsigned val = static_cast<unsigned>(in) & 0xfU;
+    return static_cast<int>(val | (val << 4) | (val << 8) | (val << 12) | (val << 16) |
+                            (val << 20) | (val << 24) | (val << 28));
+  }
+  
+  template<>
+  __device__ inline unsigned 
+  __get_storage_value<experimental::precision::b1, unsigned, unsigned>(unsigned in)
+  {
+    /* For experimental::precision::b1 fragment element type, pack 32 elements into a 
+       single 32-bit unsigned int storage element: all bits set if bit 0 of `in` is 1, else 0 */
+    return (in & 0x1) ? 0xFFFFFFFFU : 0;
+  }
+#endif  /* __CUDA_SUBBYTE_IMMA__ */
+
+  template <typename FragEleType, int size, int packed_size> // fill_fragment: writes one value into every storage slot of f
+    __CUDA_MMA_DEVICE_DECL__ void fill_fragment(__frag_base<FragEleType, size, packed_size>& f, 
+       /*  The mapping from fragment element type (FragEleType) to fill_argument_type is:
+       experimental::precision::u4 -> unsigned (only lower 4 bits taken)
+       experimental::precision::s4 -> int (only lower 4 bits taken)
+       experimental::precision::b1 -> unsigned (only lowest 1 bit taken)
+       precision::tf32             -> float
+       all other types T           -> T
+       */        
+   const typename helper_traits<FragEleType>::fill_argument_type & in) {
+
+   /* get the (possibly packed) storage element value. See the specializations above for fragment
+      element types where the storage representation is packed; ArgType is deduced from `in` */
+   typedef typename helper_traits<FragEleType>::storage_element_type storage_type;
+   storage_type v = __get_storage_value<FragEleType, storage_type>(in);
+#pragma unroll
+    for (int i=0; i< f.num_storage_elements; i++) // iterates storage slots, not logical elements
+      f.x[i] = v; 
+  }
+  
+  // Fragment template: the primary template is only declared here, with no body.
+  // The fragments in this header are explicit specializations keyed on the Use tag,
+  // the m/n/k shape, the element type T and the Layout tag (void when omitted).
+  template<typename Use, int m, int n, int k, typename T, typename Layout=void> class fragment;
+
+  // Fragments for 16x16x16 (m, n, k). Each class is empty and only selects a __frag_base.
+  // Element counts: __half matrix_a/matrix_b 16; signed/unsigned char and __nv_bfloat16
+  // matrix_a/matrix_b 8; every accumulator (__half, float, int) 8.
+  template<> class fragment<matrix_a, 16, 16, 16, __half, row_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<matrix_a, 16, 16, 16, __half, col_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<matrix_b, 16, 16, 16, __half, row_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<matrix_b, 16, 16, 16, __half, col_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<accumulator, 16, 16, 16, __half> : public __frag_base<__half, 8> {};
+  template<> class fragment<accumulator, 16, 16, 16, float> : public __frag_base<float, 8> {};
+
+#ifdef __CUDA_IMMA__
+  template<> class fragment<matrix_a, 16, 16, 16, signed char, row_major> : public __frag_base<signed char, 8> {};
+  template<> class fragment<matrix_a, 16, 16, 16, signed char, col_major> : public __frag_base<signed char, 8> {};
+  template<> class fragment<matrix_a, 16, 16, 16, unsigned char, row_major> : public __frag_base<unsigned char, 8> {};
+  template<> class fragment<matrix_a, 16, 16, 16, unsigned char, col_major> : public __frag_base<unsigned char, 8> {};
+  template<> class fragment<matrix_b, 16, 16, 16, signed char, row_major> : public __frag_base<signed char, 8> {};
+  template<> class fragment<matrix_b, 16, 16, 16, signed char, col_major> : public __frag_base<signed char, 8> {};  
+  template<> class fragment<matrix_b, 16, 16, 16, unsigned char, row_major> : public __frag_base<unsigned char, 8> {};
+  template<> class fragment<matrix_b, 16, 16, 16, unsigned char, col_major> : public __frag_base<unsigned char, 8> {};  
+  template<> class fragment<accumulator, 16, 16, 16, int> : public __frag_base<int, 8> {};
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  template<> class fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 8> {};
+  template<> class fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 8> {};
+  template<> class fragment<matrix_b, 16, 16, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 8> {};
+  template<> class fragment<matrix_b, 16, 16, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 8> {};
+#endif  /* __CUDA_AMPERE_MMA__ */
+  
+  // Fragments for 32x8x16 (m, n, k). Each class is empty and only selects a __frag_base.
+  // Element counts: __half matrix_a/matrix_b 16; signed/unsigned char and __nv_bfloat16
+  // matrix_a 16 but matrix_b 4; every accumulator (__half, float, int) 8.
+  template<> class fragment<matrix_a, 32, 8, 16, __half, row_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<matrix_a, 32, 8, 16, __half, col_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<matrix_b, 32, 8, 16, __half, row_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<matrix_b, 32, 8, 16, __half, col_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<accumulator, 32, 8, 16, __half> : public __frag_base<__half, 8> {};
+  template<> class fragment<accumulator, 32, 8, 16, float> : public __frag_base<float, 8> {};
+
+#ifdef __CUDA_IMMA__
+  template<> class fragment<matrix_a, 32, 8, 16, signed char, row_major> : public __frag_base<signed char, 16> {};
+  template<> class fragment<matrix_a, 32, 8, 16, signed char, col_major> : public __frag_base<signed char, 16> {};
+  template<> class fragment<matrix_a, 32, 8, 16, unsigned char, row_major> : public __frag_base<unsigned char, 16> {};
+  template<> class fragment<matrix_a, 32, 8, 16, unsigned char, col_major> : public __frag_base<unsigned char, 16> {};
+  template<> class fragment<matrix_b, 32, 8, 16, signed char, row_major> : public __frag_base<signed char, 4> {};
+  template<> class fragment<matrix_b, 32, 8, 16, signed char, col_major> : public __frag_base<signed char, 4> {};
+  template<> class fragment<matrix_b, 32, 8, 16, unsigned char, row_major> : public __frag_base<unsigned char, 4> {};
+  template<> class fragment<matrix_b, 32, 8, 16, unsigned char, col_major> : public __frag_base<unsigned char, 4> {};
+  template<> class fragment<accumulator, 32, 8, 16, int> : public __frag_base<int, 8> {};
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  template<> class fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 16> {};
+  template<> class fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 16> {};
+  template<> class fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 4> {};
+  template<> class fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 4> {};
+#endif  /* __CUDA_AMPERE_MMA__ */
+  
+  // Fragments for 8x32x16 (m, n, k). Each class is empty and only selects a __frag_base.
+  // Element counts: __half matrix_a/matrix_b 16; signed/unsigned char and __nv_bfloat16
+  // matrix_a 4 but matrix_b 16; every accumulator (__half, float, int) 8.
+  template<> class fragment<matrix_a, 8, 32, 16, __half, row_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<matrix_a, 8, 32, 16, __half, col_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<matrix_b, 8, 32, 16, __half, row_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<matrix_b, 8, 32, 16, __half, col_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<accumulator, 8, 32, 16, __half> : public __frag_base<__half, 8> {};
+  template<> class fragment<accumulator, 8, 32, 16, float> : public __frag_base<float, 8> {};
+
+#ifdef __CUDA_IMMA__
+  template<> class fragment<matrix_a, 8, 32, 16, signed char, row_major> : public __frag_base<signed char, 4> {};
+  template<> class fragment<matrix_a, 8, 32, 16, signed char, col_major> : public __frag_base<signed char, 4> {};
+  template<> class fragment<matrix_a, 8, 32, 16, unsigned char, row_major> : public __frag_base<unsigned char, 4> {};
+  template<> class fragment<matrix_a, 8, 32, 16, unsigned char, col_major> : public __frag_base<unsigned char, 4> {};
+  template<> class fragment<matrix_b, 8, 32, 16, signed char, row_major> : public __frag_base<signed char, 16> {};
+  template<> class fragment<matrix_b, 8, 32, 16, signed char, col_major> : public __frag_base<signed char, 16> {};
+  template<> class fragment<matrix_b, 8, 32, 16, unsigned char, row_major> : public __frag_base<unsigned char, 16> {};
+  template<> class fragment<matrix_b, 8, 32, 16, unsigned char, col_major> : public __frag_base<unsigned char, 16> {};
+  template<> class fragment<accumulator, 8, 32, 16, int> : public __frag_base<int, 8> {};
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  template<> class fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 4> {};
+  template<> class fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 4> {};
+  template<> class fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 16> {};
+  template<> class fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 16> {};
+#endif  /* __CUDA_AMPERE_MMA__ */  
+  
+#ifdef __CUDA_SUBBYTE_IMMA__
+  // Fragments for 8x8x32 (4-bit u4/s4 operands, int accumulator).
+  // matrix_a exists only as row_major and matrix_b only as col_major in this section;
+  // operand fragments hold 8 elements packed into 1 storage element, the accumulator 2 ints.
+  template<> class fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major> : public __frag_base<experimental::precision::u4, 8, 1> {};
+  template<> class fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major> : public __frag_base<experimental::precision::s4, 8, 1> {};
+  template<> class fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major> : public __frag_base<experimental::precision::u4, 8, 1> {};
+  template<> class fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major> : public __frag_base<experimental::precision::s4, 8, 1> {};
+  template<> class fragment<accumulator, 8, 8, 32, int> : public __frag_base<int, 2> {};
+
+  // Fragments for 8x8x128 (1-bit b1 operands, int accumulator).
+  // matrix_a is row_major only and matrix_b col_major only; operand fragments hold
+  // 32 elements packed into 1 storage element, the accumulator 2 ints.
+  template<> class fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major> : public __frag_base<experimental::precision::b1, 32, 1> {};
+  template<> class fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major> : public __frag_base<experimental::precision::b1, 32, 1> {};
+  template<> class fragment<accumulator, 8, 8, 128, int> : public __frag_base<int, 2> {};
+#endif  /* __CUDA_SUBBYTE_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  // Fragments for 16x16x8 (precision::tf32 operands, float accumulator).
+  // Operand fragments hold 4 elements each (stored as float per helper_traits<precision::tf32>);
+  // the accumulator holds 8 floats.
+  template<> class fragment<matrix_a, 16, 16, 8, precision::tf32, row_major> : public __frag_base<precision::tf32, 4> {};
+  template<> class fragment<matrix_a, 16, 16, 8, precision::tf32, col_major> : public __frag_base<precision::tf32, 4> {};
+  template<> class fragment<matrix_b, 16, 16, 8, precision::tf32, row_major> : public __frag_base<precision::tf32, 4> {};
+  template<> class fragment<matrix_b, 16, 16, 8, precision::tf32, col_major> : public __frag_base<precision::tf32, 4> {};
+  template<> class fragment<accumulator, 16, 16, 8, float> : public __frag_base<float, 8> {};
+  
+  // Fragments for 8x8x4 (double operands and accumulator).
+  // Each matrix_a / matrix_b fragment holds a single double;
+  // the accumulator holds 2 doubles.
+  template<> class fragment<matrix_a, 8, 8, 4, double, row_major> : public __frag_base<double, 1> {};
+  template<> class fragment<matrix_a, 8, 8, 4, double, col_major> : public __frag_base<double, 1> {};
+  template<> class fragment<matrix_b, 8, 8, 4, double, row_major> : public __frag_base<double, 1> {};
+  template<> class fragment<matrix_b, 8, 8, 4, double, col_major> : public __frag_base<double, 1> {};
+  template<> class fragment<accumulator, 8, 8, 4, double> : public __frag_base<double, 2> {};
+#endif  /* __CUDA_AMPERE_MMA__ */  
+
+  
+  // 
+  // Load functions for frags of shape m16n16k16
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 16, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+
+#ifdef __CUDA_IMMA__
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 16, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+#endif  /* __CUDA_IMMA__ */
+  
+#ifdef __CUDA_AMPERE_MMA__
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+  //
+  // Load functions for frags of shape m32n8k16
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+
+#ifdef __CUDA_IMMA__
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+  //
+  // Load functions for frags of shape m8n32k16
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+  
+#ifdef __CUDA_IMMA__
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+#ifdef __CUDA_SUBBYTE_IMMA__
+  //
+  // Load functions for frags of shape m8n8k32
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 32, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+
+  //
+  // Load functions for frags of shape m8n8k128
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 128, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+
+#endif  /* __CUDA_SUBBYTE_IMMA__ */
+
+
+#ifdef __CUDA_AMPERE_MMA__
+  // Load functions for fragments of shape m16n16k8: precision::tf32 operands read from const float*.
+  // matrix_a and matrix_b each get a row_major and a col_major overload.
+  // The float accumulator overload takes the memory layout as an explicit layout_t argument.
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 8, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+  
+  // Load functions for fragments of shape m8n8k4: double operands and double accumulator.
+  // matrix_a and matrix_b each get a row_major and a col_major overload.
+  // The accumulator overload takes the memory layout as an explicit layout_t argument.
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 4, double, row_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 4, double, col_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 4, double, row_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 4, double, col_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 4, double>& a, const double* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+  // Store functions for accumulator fragments of shape m16n16k16.
+  // The __half and float overloads carry no extra feature guard; the int overload is declared only when __CUDA_IMMA__ is defined.
+  // Each overload takes the destination pointer p, the const fragment a, ldm, and an explicit layout_t.
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 16, 16, 16, __half>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 16, 16, 16, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+#ifdef __CUDA_IMMA__
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 16, 16, 16, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+#endif  /* __CUDA_IMMA__ */  
+
+  // Store functions for accumulator fragments of shape m32n8k16.
+  // The __half and float overloads carry no extra feature guard; the int overload is declared only when __CUDA_IMMA__ is defined.
+  // Each overload takes the destination pointer p, the const fragment a, ldm, and an explicit layout_t.
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 32, 8, 16, __half>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 32, 8, 16, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+#ifdef __CUDA_IMMA__
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 32, 8, 16, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+#endif  /* __CUDA_IMMA__ */
+
+  // Store functions for accumulator fragments of shape m8n32k16.
+  // The __half and float overloads carry no extra feature guard; the int overload is declared only when __CUDA_IMMA__ is defined.
+  // Each overload takes the destination pointer p, the const fragment a, ldm, and an explicit layout_t.
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 8, 32, 16, __half>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 8, 32, 16, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+#ifdef __CUDA_IMMA__
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 32, 16, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_SUBBYTE_IMMA__
+  // Store function for accumulator fragments of shape m8n8k32.
+  // Only an int accumulator overload is declared for this shape.
+  // It takes the destination pointer p, the const fragment a, ldm, and an explicit layout_t.
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 8, 32, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+
+  // Store function for accumulator fragments of shape m8n8k128.
+  // Only an int accumulator overload is declared for this shape.
+  // It takes the destination pointer p, the const fragment a, ldm, and an explicit layout_t.
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 8, 128, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+
+#endif  /* __CUDA_SUBBYTE_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  // Store function for accumulator fragments of shape m16n16k8.
+  // Only a float accumulator overload is declared for this shape.
+  // It takes the destination pointer p, the const fragment a, ldm, and an explicit layout_t.
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 16, 16, 8, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+
+  // Store function for accumulator fragments of shape m8n8k4.
+  // Only a double accumulator overload is declared for this shape.
+  // It takes the destination pointer p, the const fragment a, ldm, and an explicit layout_t.
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(double *p, const fragment<accumulator, 8, 8, 4, double>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+  // MMA functions for shape m16n16k16: d is the only non-const fragment reference; a, b and c are const references.
+  // __half operands: all four row/col layout pairings of a and b, for each d/c accumulator pairing (__half/__half, float/__half, float/float, __half/float).
+  // Guarded extras: signed/unsigned char operands with int accumulators and satf=false (__CUDA_IMMA__); __nv_bfloat16 operands with float accumulators (__CUDA_AMPERE_MMA__).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+
+#ifdef __CUDA_IMMA__  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const fragment<matrix_b,16, 16, 16, signed char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const fragment<matrix_b,16, 16, 16, signed char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const fragment<matrix_b,16, 16, 16, signed char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const fragment<matrix_b,16, 16, 16, signed char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+  // MMA functions for shape m32n8k16: d is the only non-const fragment reference; a, b and c are const references.
+  // __half operands: all four row/col layout pairings of a and b, for each d/c accumulator pairing (__half/__half, float/__half, float/float, __half/float).
+  // Guarded extras: signed/unsigned char operands with int accumulators and satf=false (__CUDA_IMMA__); __nv_bfloat16 operands with float accumulators (__CUDA_AMPERE_MMA__).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+
+#ifdef __CUDA_IMMA__  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+  // MMA functions for shape m8n32k16: d is the only non-const fragment reference; a, b and c are const references.
+  // __half operands: all four row/col layout pairings of a and b, for each d/c accumulator pairing (__half/__half, float/__half, float/float, __half/float).
+  // Guarded extras: signed/unsigned char operands with int accumulators and satf=false (__CUDA_IMMA__); __nv_bfloat16 operands with float accumulators (__CUDA_AMPERE_MMA__).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
+  
+#ifdef __CUDA_IMMA__  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+#ifdef __CUDA_SUBBYTE_IMMA__  
+  // MMA functions for shape m8n8k32: experimental::precision::s4 or u4 operands with int accumulators.
+  // Only the row_major a / col_major b pairing is declared, and a and b always share the same precision tag.
+  // The trailing satf parameter defaults to false.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 32, int>& d, const fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major>& a, const fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major>& b, const fragment<accumulator, 8, 8, 32, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 32, int>& d, const fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major>& a, const fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major>& b, const fragment<accumulator, 8, 8, 32, int>& c, bool satf=false) __DEF_IF_HOST
+  
+
+  // Bit-MMA function (bmma_sync, not mma_sync) for shape m8n8k128: experimental::precision::b1 operands, int accumulators.
+  // Only the row_major a / col_major b pairing is declared.
+  // The two trailing unnamed parameters default to experimental::bmmaBitOpXOR and experimental::bmmaAccumulateOpPOPC.
+  __CUDA_MMA_DEVICE_DECL__ void bmma_sync(fragment<accumulator, 8, 8, 128, int>& d, const fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major>& a, const fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major>& b, const fragment<accumulator, 8, 8, 128, int>& c,
+                                          experimental::bmmaBitOp = experimental::bmmaBitOpXOR, 
+                                          experimental::bmmaAccumulateOp = experimental::bmmaAccumulateOpPOPC) __DEF_IF_HOST
+
+#endif  /* __CUDA_SUBBYTE_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  // MMA functions for shape m16n16k8: precision::tf32 operands with float accumulators for both d and c.
+  // All four row/col layout pairings of a and b are declared.
+  // d is the only non-const fragment reference; a, b and c are const references.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
+
+  // MMA functions for shape m8n8k4: double operands with double accumulators for both d and c.
+  // All four row/col layout pairings of a and b are declared.
+  // d is the only non-const fragment reference; a, b and c are const references.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, row_major>& a, const fragment<matrix_b, 8, 8, 4, double, col_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, col_major>& a, const fragment<matrix_b, 8, 8, 4, double, col_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, row_major>& a, const fragment<matrix_b, 8, 8, 4, double, row_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, col_major>& a, const fragment<matrix_b, 8, 8, 4, double, row_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
+#endif  /* __CUDA_AMPERE_MMA__ */
+};
+};
+
+#undef __DEF_IF_HOST
+#undef __CUDA_IMMA__
+#undef __CUDA_SUBBYTE_IMMA__
+#undef __CUDA_AMPERE_MMA__
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __CUDA_MMA_DEVICE_DECL__
+
+#if defined(__CUDA_ARCH__)
+#include "mma.hpp"
+#endif /* defined(__CUDA_ARCH__) */
+
+
+#endif /* !__CUDA_MMA_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/mma.hpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/mma.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3e10f2a982bd2dcf9814a2fc05a3f200d5a1cb07
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/mma.hpp
@@ -0,0 +1,1128 @@
+/*
+ * Copyright 2017-2020 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)  // reached without the internal-include marker being set
+#if defined(_MSC_VER)  // MSVC branch: the notice is emitted through #pragma message
+#pragma message("crt/mma.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else  // every other compiler: same notice through #warning
+#warning "crt/mma.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__  // set the marker so the rest of this header proceeds
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_HPP__  // records that this file set the marker; presumably undone at end of file (outside this view) - confirm
+#endif
+
+#if !defined(__CUDA_MMA_HPP__)
+#define __CUDA_MMA_HPP__
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+
+#define __CUDA_MMA_DEVICE_DECL__ static __device__ __inline__  // qualifier prefix placed in front of the wrappers defined below
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 720  // also true when __CUDA_ARCH__ is not defined at all
+#define __CUDA_IMMA__ 1  // gates the signed/unsigned char operand and int accumulator overloads
+#endif  /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 720 */
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 730  // also true when __CUDA_ARCH__ is not defined at all
+#define __CUDA_SUBBYTE_IMMA__ 1  // gates the s4/u4 (m8n8k32) and b1 (m8n8k128) overloads
+#endif  /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 730 */
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800  // also true when __CUDA_ARCH__ is not defined at all
+#define __CUDA_AMPERE_MMA__ 1  // gates the __nv_bfloat16, tf32 and double overloads
+#endif  /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 */
+
+namespace nvcuda {
+namespace wmma {
+
+  // 
+  // Load functions for frags of shape m16n16k16
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const __half* p, unsigned ldm) {
+    __hmma_m16n16k16_ld_a((int*)&a, (const int*)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const __half* p, unsigned ldm) {
+    __hmma_m16n16k16_ld_a((int*)&a, (const int*)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b,16, 16, 16, __half, row_major>& a, const __half* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __hmma_m16n16k16_ld_b((int*)&a, (const int*)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b,16, 16, 16, __half, col_major>& a, const __half* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __hmma_m16n16k16_ld_b((int*)&a, (const int*)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator,16, 16, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // layout is a runtime value; each branch passes its flag as a literal (0 here)
+      __hmma_m16n16k16_ld_c_f16((int*)&a, (const int*)p, ldm, 0);
+    else  // anything but mem_row_major takes the flag-1 call
+      __hmma_m16n16k16_ld_c_f16((int*)&a, (const int*)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator,16, 16, 16, float>& a, const float* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // layout is a runtime value; each branch passes its flag as a literal (0 here)
+      __hmma_m16n16k16_ld_c_f32((float*)&a, (const float*)p, ldm, 0);
+    else  // anything but mem_row_major takes the flag-1 call
+      __hmma_m16n16k16_ld_c_f32((float*)&a, (const float*)p, ldm, 1);
+  }
+
+#ifdef __CUDA_IMMA__
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {
+    __imma_m16n16k16_ld_a_s8((int *)&a, (const int *)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {
+    __imma_m16n16k16_ld_a_s8((int *)&a, (const int *)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {
+    __imma_m16n16k16_ld_a_u8((int *)&a, (const int *)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {
+    __imma_m16n16k16_ld_a_u8((int *)&a, (const int *)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __imma_m16n16k16_ld_b_s8((int *)&a, (const int *)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __imma_m16n16k16_ld_b_s8((int *)&a, (const int *)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __imma_m16n16k16_ld_b_u8((int *)&a, (const int *)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __imma_m16n16k16_ld_b_u8((int *)&a, (const int *)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator,16, 16, 16, int>& a, const int* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // layout is a runtime value; each branch passes its flag as a literal (0 here)
+      __imma_m16n16k16_ld_c((int *)&a, (const int*)p, ldm, 0);
+    else  // anything but mem_row_major takes the flag-1 call
+      __imma_m16n16k16_ld_c((int *)&a, (const int*)p, ldm, 1);
+  }
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) {
+    __mma_bf16_m16n16k16_ld_a((int*)&a, (const int*)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) {
+    __mma_bf16_m16n16k16_ld_a((int*)&a, (const int*)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm)  {  // note: 'a' names the matrix_b fragment here
+    __mma_bf16_m16n16k16_ld_b((int*)&a, (const int*)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm)  {  // note: 'a' names the matrix_b fragment here
+    __mma_bf16_m16n16k16_ld_b((int*)&a, (const int*)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+
+  // 
+  // Load functions for frags of shape m32n8k16
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const __half* p, unsigned ldm) {
+    __hmma_m32n8k16_ld_a((int*)&a, (const int*)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const __half* p, unsigned ldm) {
+    __hmma_m32n8k16_ld_a((int*)&a, (const int*)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __half, row_major>& a, const __half* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __hmma_m32n8k16_ld_b((int*)&a, (const int*)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __half, col_major>& a, const __half* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __hmma_m32n8k16_ld_b((int*)&a, (const int*)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // layout is a runtime value; each branch passes its flag as a literal (0 here)
+      __hmma_m32n8k16_ld_c_f16((int*)&a, (const int*)p, ldm, 0);
+    else  // anything but mem_row_major takes the flag-1 call
+      __hmma_m32n8k16_ld_c_f16((int*)&a, (const int*)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, float>& a, const float* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // layout is a runtime value; each branch passes its flag as a literal (0 here)
+      __hmma_m32n8k16_ld_c_f32((float*)&a, (const float*)p, ldm, 0);
+    else  // anything but mem_row_major takes the flag-1 call
+      __hmma_m32n8k16_ld_c_f32((float*)&a, (const float*)p, ldm, 1);
+  }
+
+#ifdef __CUDA_IMMA__
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {
+    __imma_m32n8k16_ld_a_s8((int *)&a, (const int *)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {
+    __imma_m32n8k16_ld_a_s8((int *)&a, (const int *)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {
+    __imma_m32n8k16_ld_a_u8((int *)&a, (const int *)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {
+    __imma_m32n8k16_ld_a_u8((int *)&a, (const int *)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __imma_m32n8k16_ld_b_s8((int *)&a, (const int *)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __imma_m32n8k16_ld_b_s8((int *)&a, (const int *)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __imma_m32n8k16_ld_b_u8((int *)&a, (const int *)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __imma_m32n8k16_ld_b_u8((int *)&a, (const int *)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, int>& a, const int* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // layout is a runtime value; each branch passes its flag as a literal (0 here)
+      __imma_m32n8k16_ld_c((int *)&a, (const int*)p, ldm, 0);
+    else  // anything but mem_row_major takes the flag-1 call
+      __imma_m32n8k16_ld_c((int *)&a, (const int*)p, ldm, 1);
+  }
+#endif  /* __CUDA_IMMA__ */ 
+
+#ifdef __CUDA_AMPERE_MMA__
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) {
+    __mma_bf16_m32n8k16_ld_a((int*)&a, (const int*)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) {
+    __mma_bf16_m32n8k16_ld_a((int*)&a, (const int*)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __mma_bf16_m32n8k16_ld_b((int*)&a, (const int*)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __mma_bf16_m32n8k16_ld_b((int*)&a, (const int*)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+
+  // 
+  // Load functions for frags of shape m8n32k16
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const __half* p, unsigned ldm) {
+    __hmma_m8n32k16_ld_a((int*)&a, (const int*)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const __half* p, unsigned ldm) {
+    __hmma_m8n32k16_ld_a((int*)&a, (const int*)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __half, row_major>& a, const __half* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __hmma_m8n32k16_ld_b((int*)&a, (const int*)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __half, col_major>& a, const __half* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __hmma_m8n32k16_ld_b((int*)&a, (const int*)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // layout is a runtime value; each branch passes its flag as a literal (0 here)
+      __hmma_m8n32k16_ld_c_f16((int*)&a, (const int*)p, ldm, 0);
+    else  // anything but mem_row_major takes the flag-1 call
+      __hmma_m8n32k16_ld_c_f16((int*)&a, (const int*)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, float>& a, const float* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // layout is a runtime value; each branch passes its flag as a literal (0 here)
+      __hmma_m8n32k16_ld_c_f32((float*)&a, (const float*)p, ldm, 0);
+    else  // anything but mem_row_major takes the flag-1 call
+      __hmma_m8n32k16_ld_c_f32((float*)&a, (const float*)p, ldm, 1);
+  }
+  
+#ifdef __CUDA_IMMA__
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {
+    __imma_m8n32k16_ld_a_s8((int *)&a, (const int *)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {
+    __imma_m8n32k16_ld_a_s8((int *)&a, (const int *)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {
+    __imma_m8n32k16_ld_a_u8((int *)&a, (const int *)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {
+    __imma_m8n32k16_ld_a_u8((int *)&a, (const int *)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __imma_m8n32k16_ld_b_s8((int *)&a, (const int *)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __imma_m8n32k16_ld_b_s8((int *)&a, (const int *)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __imma_m8n32k16_ld_b_u8((int *)&a, (const int *)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __imma_m8n32k16_ld_b_u8((int *)&a, (const int *)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, int>& a, const int* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // layout is a runtime value; each branch passes its flag as a literal (0 here)
+      __imma_m8n32k16_ld_c((int *)&a, (const int*)p, ldm, 0);
+    else  // anything but mem_row_major takes the flag-1 call
+      __imma_m8n32k16_ld_c((int *)&a, (const int*)p, ldm, 1);
+  }
+#endif  /* __CUDA_IMMA__ */ 
+
+#ifdef __CUDA_AMPERE_MMA__
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) {
+    __mma_bf16_m8n32k16_ld_a((int*)&a, (const int*)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) {
+    __mma_bf16_m8n32k16_ld_a((int*)&a, (const int*)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __mma_bf16_m8n32k16_ld_b((int*)&a, (const int*)p, ldm, 0);  // trailing 0 <-> row_major tag (col_major overload passes 1)
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __mma_bf16_m8n32k16_ld_b((int*)&a, (const int*)p, ldm, 1);  // trailing 1 <-> col_major tag (row_major overload passes 0)
+  }
+#endif  /* __CUDA_AMPERE_MMA__ */
+  
+
+#ifdef __CUDA_SUBBYTE_IMMA__
+  //
+  // Load functions for frags of shape m8n8k32
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major>& a, const void* p, unsigned ldm) {
+      __imma_m8n8k32_ld_a_s4((int *)&a, (const int *)p, ldm, 0);  // untyped p is read through const int*; trailing 0 <-> row_major tag
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major>& a, const void* p, unsigned ldm) {
+      __imma_m8n8k32_ld_a_u4((int *)&a, (const int *)p, ldm, 0);  // untyped p is read through const int*; trailing 0 <-> row_major tag
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major>& a, const void* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+      __imma_m8n8k32_ld_b_s4((int *)&a, (const int *)p, ldm, 1);  // untyped p is read through const int*; trailing 1 <-> col_major tag
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major>& a, const void* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+      __imma_m8n8k32_ld_b_u4((int *)&a, (const int *)p, ldm, 1);  // untyped p is read through const int*; trailing 1 <-> col_major tag
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 32, int>& a, const int* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // layout is a runtime value; each branch passes its flag as a literal (0 here)
+      __imma_m8n8k32_ld_c((int *)&a, (const int*)p, ldm, 0);
+    else  // anything but mem_row_major takes the flag-1 call
+      __imma_m8n8k32_ld_c((int *)&a, (const int*)p, ldm, 1);
+  }
+
+  //
+  // Load functions for frags of shape m8n8k128
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major>& a, const void* p, unsigned ldm) {
+    __bmma_m8n8k128_ld_a_b1((int *)&a, (const int *)p, ldm, 0);  // untyped p is read through const int*; trailing 0 <-> row_major tag
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major>& a, const void* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __bmma_m8n8k128_ld_b_b1((int *)&a, (const int *)p, ldm, 1);  // untyped p is read through const int*; trailing 1 <-> col_major tag
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 128, int>& a, const int* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // layout is a runtime value; each branch passes its flag as a literal (0 here)
+      __bmma_m8n8k128_ld_c((int *)&a, (const int*)p, ldm, 0);
+    else  // anything but mem_row_major takes the flag-1 call
+      __bmma_m8n8k128_ld_c((int *)&a, (const int*)p, ldm, 1);
+  }
+#endif  /* __CUDA_SUBBYTE_IMMA__ */
+
+
+
+#ifdef __CUDA_AMPERE_MMA__
+  // load functions for frags of shape m16n16k8
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const float* p, unsigned ldm) {
+    __mma_tf32_m16n16k8_ld_a((int *)&a, (const int *)p, ldm, 0);  // float source is read through const int*; trailing 0 <-> row_major tag
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const float* p, unsigned ldm) {
+    __mma_tf32_m16n16k8_ld_a((int *)&a, (const int *)p, ldm, 1);  // float source is read through const int*; trailing 1 <-> col_major tag
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& a, const float* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __mma_tf32_m16n16k8_ld_b((int *)&a, (const int *)p, ldm, 0);  // float source is read through const int*; trailing 0 <-> row_major tag
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& a, const float* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __mma_tf32_m16n16k8_ld_b((int *)&a, (const int *)p, ldm, 1);  // float source is read through const int*; trailing 1 <-> col_major tag
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 8, float>& a, const float* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // layout is a runtime value; each branch passes its flag as a literal (0 here)
+      __mma_tf32_m16n16k8_ld_c((float *)&a, p, ldm, 0);
+    else  // anything but mem_row_major takes the flag-1 call
+      __mma_tf32_m16n16k8_ld_c((float *)&a, p, ldm, 1);      
+  }
+  
+  // load functions for frags of shape m8n8k4
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 4, double, row_major>& a, const double* p, unsigned ldm) {
+    __dmma_m8n8k4_ld_a((double *)&a, p, ldm, 0);  // p is passed through uncast; trailing 0 <-> row_major tag
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 4, double, col_major>& a, const double* p, unsigned ldm) {
+    __dmma_m8n8k4_ld_a((double *)&a, p, ldm, 1);  // p is passed through uncast; trailing 1 <-> col_major tag
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 4, double, row_major>& a, const double* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __dmma_m8n8k4_ld_b((double *)&a, p, ldm, 0);  // p is passed through uncast; trailing 0 <-> row_major tag
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 4, double, col_major>& a, const double* p, unsigned ldm) {  // note: 'a' names the matrix_b fragment here
+    __dmma_m8n8k4_ld_b((double *)&a, p, ldm, 1);  // p is passed through uncast; trailing 1 <-> col_major tag
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 4, double>& a, const double* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // layout is a runtime value; each branch passes its flag as a literal (0 here)
+      __dmma_m8n8k4_ld_c((double *)&a, p, ldm, 0);
+    else  // anything but mem_row_major takes the flag-1 call
+      __dmma_m8n8k4_ld_c((double *)&a, p, ldm, 1);      
+  }
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+  // 
+  // Store functions for frags of shape m16n16k16
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator,16, 16, 16, __half>& a, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // mem_row_major -> literal flag 0
+      __hmma_m16n16k16_st_c_f16((int*)p, (int*)&a, ldm, 0);  // (int*)&a also casts away the const on a
+    else  // any other layout value -> literal flag 1
+      __hmma_m16n16k16_st_c_f16((int*)p, (int*)&a, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator,16, 16, 16, float>& a, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // mem_row_major -> literal flag 0
+      __hmma_m16n16k16_st_c_f32((float*)p, (float*)&a, ldm, 0);  // (float*)&a also casts away the const on a
+    else  // any other layout value -> literal flag 1
+      __hmma_m16n16k16_st_c_f32((float*)p, (float*)&a, ldm, 1);
+  }
+  
+#ifdef __CUDA_IMMA__
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator,16, 16, 16, int>& a, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // mem_row_major -> literal flag 0
+      __imma_m16n16k16_st_c_i32(p, (const int*)&a, ldm, 0);  // fragment is read through const int*
+    else  // any other layout value -> literal flag 1
+      __imma_m16n16k16_st_c_i32(p, (const int*)&a, ldm, 1);
+  }
+#endif  /* __CUDA_IMMA__ */
+
+  // 
+  // Store functions for frags of shape m32n8k16
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 32, 8, 16, __half>& a, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // mem_row_major -> literal flag 0
+      __hmma_m32n8k16_st_c_f16((int*)p, (int*)&a, ldm, 0);  // (int*)&a also casts away the const on a
+    else  // any other layout value -> literal flag 1
+      __hmma_m32n8k16_st_c_f16((int*)p, (int*)&a, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 32, 8, 16, float>& a, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // mem_row_major -> literal flag 0
+      __hmma_m32n8k16_st_c_f32((float*)p, (float*)&a, ldm, 0);  // (float*)&a also casts away the const on a
+    else  // any other layout value -> literal flag 1
+      __hmma_m32n8k16_st_c_f32((float*)p, (float*)&a, ldm, 1);
+  }
+  
+#ifdef __CUDA_IMMA__
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 32, 8, 16, int>& a, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // mem_row_major -> literal flag 0
+      __imma_m32n8k16_st_c_i32(p, (const int*)&a, ldm, 0);  // fragment is read through const int*
+    else  // any other layout value -> literal flag 1
+      __imma_m32n8k16_st_c_i32(p, (const int*)&a, ldm, 1);
+  }
+#endif  /* __CUDA_IMMA__ */
+
+  // 
+  // Store functions for frags of shape m8n32k16
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 8, 32, 16, __half>& a, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // mem_row_major -> literal flag 0
+      __hmma_m8n32k16_st_c_f16((int*)p, (int*)&a, ldm, 0);  // (int*)&a also casts away the const on a
+    else  // any other layout value -> literal flag 1
+      __hmma_m8n32k16_st_c_f16((int*)p, (int*)&a, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 8, 32, 16, float>& a, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // mem_row_major -> literal flag 0
+      __hmma_m8n32k16_st_c_f32((float*)p, (float*)&a, ldm, 0);  // (float*)&a also casts away the const on a
+    else  // any other layout value -> literal flag 1
+      __hmma_m8n32k16_st_c_f32((float*)p, (float*)&a, ldm, 1);
+  }
+
+#ifdef __CUDA_IMMA__
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 32, 16, int>& a, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // mem_row_major -> literal flag 0
+      __imma_m8n32k16_st_c_i32(p, (const int*)&a, ldm, 0);  // fragment is read through const int*
+    else  // any other layout value -> literal flag 1
+      __imma_m8n32k16_st_c_i32(p, (const int*)&a, ldm, 1);
+  }
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_SUBBYTE_IMMA__
+  // 
+  // Store functions for frags of shape m8n8k32
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 8, 32, int>& a, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // mem_row_major -> literal flag 0
+      __imma_m8n8k32_st_c_i32(p, (const int*)&a, ldm, 0);  // fragment is read through const int*
+    else  // any other layout value -> literal flag 1
+      __imma_m8n8k32_st_c_i32(p, (const int*)&a, ldm, 1);
+  }
+
+  // 
+  // Store functions for frags of shape m8n8k128
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 8, 128, int>& a, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // mem_row_major -> literal flag 0
+      __bmma_m8n8k128_st_c_i32(p, (const int*)&a, ldm, 0);  // fragment is read through const int*
+    else  // any other layout value -> literal flag 1
+      __bmma_m8n8k128_st_c_i32(p, (const int*)&a, ldm, 1);
+  }
+#endif  /* __CUDA_SUBBYTE_IMMA__ */
+
+
+#ifdef __CUDA_AMPERE_MMA__
+
+  //
+  // Store functions for frags of shape m16n16k8
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 16, 16, 8, float>& a, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // mem_row_major -> literal flag 0
+      __mma_m16n16k8_st_c_f32(p, (const float*)&a, ldm, 0);  // fragment is read through const float*
+    else  // any other layout value -> literal flag 1
+      __mma_m16n16k8_st_c_f32(p, (const float*)&a, ldm, 1);
+  }
+
+  
+  // 
+  // Store functions for frags of shape m8n8k4
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(double *p, const fragment<accumulator, 8, 8, 4, double>& a, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // mem_row_major -> literal flag 0
+      __dmma_m8n8k4_st_c_f64(p, (const double*)&a, ldm, 0);  // fragment is read through const double*
+    else  // any other layout value -> literal flag 1
+      __dmma_m8n8k4_st_c_f64(p, (const double*)&a, ldm, 1);
+  }
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+  // 
+  // MMA functions for shape m16n16k16
+  // 
+  // D fp16, C fp16
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
+    __hmma_m16n16k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);  // layout code 1 = A row_major, B col_major; final 0: presumably saturation off (cf. satf overloads) - confirm
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
+    __hmma_m16n16k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);  // layout code 3 = A col_major, B col_major; final 0: presumably saturation off (cf. satf overloads) - confirm
+  }
+    
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
+    __hmma_m16n16k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);  // layout code 0 = A row_major, B row_major; final 0: presumably saturation off (cf. satf overloads) - confirm
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
+    __hmma_m16n16k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);  // layout code 2 = A col_major, B row_major; final 0: presumably saturation off (cf. satf overloads) - confirm
+  }
+
+  // D fp32, C fp16
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
+    __hmma_m16n16k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);  // layout code 1 = A row_major, B col_major; final 0: presumably saturation off (cf. satf overloads) - confirm
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
+    __hmma_m16n16k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);  // layout code 3 = A col_major, B col_major; final 0: presumably saturation off (cf. satf overloads) - confirm
+  }
+    
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
+      __hmma_m16n16k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);  // layout code 0 = A row_major, B row_major; final 0: presumably saturation off (cf. satf overloads) - confirm
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
+    __hmma_m16n16k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);  // layout code 2 = A col_major, B row_major; final 0: presumably saturation off (cf. satf overloads) - confirm
+  }
+
+  // D fp32, C fp32
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
+    __hmma_m16n16k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);  // layout code 1 = A row_major, B col_major; final 0: presumably saturation off (cf. satf overloads) - confirm
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
+    __hmma_m16n16k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);  // layout code 3 = A col_major, B col_major; final 0: presumably saturation off (cf. satf overloads) - confirm
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
+    __hmma_m16n16k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);  // layout code 0 = A row_major, B row_major; final 0: presumably saturation off (cf. satf overloads) - confirm
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
+    __hmma_m16n16k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);  // layout code 2 = A col_major, B row_major; final 0: presumably saturation off (cf. satf overloads) - confirm
+  }
+
+  // D fp16, C fp32
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
+    __hmma_m16n16k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);  // layout code 1 = A row_major, B col_major; final 0: presumably saturation off (cf. satf overloads) - confirm
+  }
+  
+  // mma_sync, m16n16k16: __half A (col_major), __half B (col_major), float C, __half D; builtin trailing args (3, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
+    __hmma_m16n16k16_mma_f16f32(reinterpret_cast<int*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 3, 0);
+  }
+  
+  // mma_sync, m16n16k16: __half A (row_major), __half B (row_major), float C, __half D; builtin trailing args (0, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
+    __hmma_m16n16k16_mma_f16f32(reinterpret_cast<int*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 0, 0);
+  }
+  
+  // mma_sync, m16n16k16: __half A (col_major), __half B (row_major), float C, __half D; builtin trailing args (2, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
+    __hmma_m16n16k16_mma_f16f32(reinterpret_cast<int*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 2, 0);
+  }
+
+#ifdef __CUDA_IMMA__  
+  // mma_sync, m16n16k16: signed char A (row_major), B (col_major), int C/D; layout arg 1.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const fragment<matrix_b,16, 16, 16, signed char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m16n16k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);
+    } else {
+      __imma_m16n16k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 1);
+    }
+  }
+  // mma_sync, m16n16k16: signed char A (col_major), B (col_major), int C/D; layout arg 3.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const fragment<matrix_b,16, 16, 16, signed char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m16n16k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);
+    } else {
+      __imma_m16n16k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 1);
+    }
+  }
+
+  // mma_sync, m16n16k16: signed char A (row_major), B (row_major), int C/D; layout arg 0.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const fragment<matrix_b,16, 16, 16, signed char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m16n16k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);
+    } else {
+      __imma_m16n16k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 1);
+    }
+  }
+
+  // mma_sync, m16n16k16: signed char A (col_major), B (row_major), int C/D; layout arg 2.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const fragment<matrix_b,16, 16, 16, signed char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m16n16k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);
+    } else {
+      __imma_m16n16k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 1);
+    }
+  }
+
+  // mma_sync, m16n16k16: unsigned char A (row_major), B (col_major), int C/D; layout arg 1.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m16n16k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);
+    } else {
+      __imma_m16n16k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 1);
+    }
+  }
+
+  // mma_sync, m16n16k16: unsigned char A (col_major), B (col_major), int C/D; layout arg 3.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m16n16k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);
+    } else {
+      __imma_m16n16k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 1);
+    }
+  }
+
+  // mma_sync, m16n16k16: unsigned char A (row_major), B (row_major), int C/D; layout arg 0.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m16n16k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);
+    } else {
+      __imma_m16n16k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 1);
+    }
+  }
+
+  // mma_sync, m16n16k16: unsigned char A (col_major), B (row_major), int C/D; layout arg 2.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m16n16k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);
+    } else {
+      __imma_m16n16k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 1);
+    }
+  }
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  // mma_sync, m16n16k16: __nv_bfloat16 A (row_major), B (col_major), float C, float D; builtin trailing args (1, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
+    __mma_bf16_m16n16k16_mma_f32(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 1, 0);
+  }
+    
+  // mma_sync, m16n16k16: __nv_bfloat16 A (col_major), B (col_major), float C, float D; builtin trailing args (3, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
+    __mma_bf16_m16n16k16_mma_f32(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 3, 0);
+  }
+  
+  // mma_sync, m16n16k16: __nv_bfloat16 A (row_major), B (row_major), float C, float D; builtin trailing args (0, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
+    __mma_bf16_m16n16k16_mma_f32(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 0, 0);
+  }
+  
+  // mma_sync, m16n16k16: __nv_bfloat16 A (col_major), B (row_major), float C, float D; builtin trailing args (2, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
+    __mma_bf16_m16n16k16_mma_f32(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 2, 0);
+  }
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+
+  // 
+  // MMA functions for shape m32n8k16
+  // 
+  // D fp16, C fp16
+  // mma_sync, m32n8k16: __half A (row_major), __half B (col_major), __half C, __half D; builtin trailing args (1, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
+    __hmma_m32n8k16_mma_f16f16(reinterpret_cast<int*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const int*>(&c), 1, 0);
+  }
+  
+  // mma_sync, m32n8k16: __half A (col_major), __half B (col_major), __half C, __half D; builtin trailing args (3, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
+    __hmma_m32n8k16_mma_f16f16(reinterpret_cast<int*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const int*>(&c), 3, 0);
+  }
+
+  // mma_sync, m32n8k16: __half A (row_major), __half B (row_major), __half C, __half D; builtin trailing args (0, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
+    __hmma_m32n8k16_mma_f16f16(reinterpret_cast<int*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const int*>(&c), 0, 0);
+  }
+  
+  // mma_sync, m32n8k16: __half A (col_major), __half B (row_major), __half C, __half D; builtin trailing args (2, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
+    __hmma_m32n8k16_mma_f16f16(reinterpret_cast<int*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const int*>(&c), 2, 0);
+  }
+
+  // D fp32, C fp16
+  // mma_sync, m32n8k16: __half A (row_major), __half B (col_major), __half C, float D; builtin trailing args (1, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
+    __hmma_m32n8k16_mma_f32f16(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const int*>(&c), 1, 0);
+  }
+  
+  // mma_sync, m32n8k16: __half A (col_major), __half B (col_major), __half C, float D; builtin trailing args (3, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
+    __hmma_m32n8k16_mma_f32f16(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const int*>(&c), 3, 0);
+  }
+  
+  // mma_sync, m32n8k16: __half A (row_major), __half B (row_major), __half C, float D; builtin trailing args (0, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
+    __hmma_m32n8k16_mma_f32f16(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const int*>(&c), 0, 0);
+  }
+  
+  // mma_sync, m32n8k16: __half A (col_major), __half B (row_major), __half C, float D; builtin trailing args (2, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
+    __hmma_m32n8k16_mma_f32f16(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const int*>(&c), 2, 0);
+  }
+
+  // D fp32, C fp32
+  // mma_sync, m32n8k16: __half A (row_major), __half B (col_major), float C, float D; builtin trailing args (1, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
+    __hmma_m32n8k16_mma_f32f32(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 1, 0);
+  }
+  
+  // mma_sync, m32n8k16: __half A (col_major), __half B (col_major), float C, float D; builtin trailing args (3, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
+    __hmma_m32n8k16_mma_f32f32(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 3, 0);
+  }
+  
+  // mma_sync, m32n8k16: __half A (row_major), __half B (row_major), float C, float D; builtin trailing args (0, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
+    __hmma_m32n8k16_mma_f32f32(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 0, 0);
+  }
+  
+  // mma_sync, m32n8k16: __half A (col_major), __half B (row_major), float C, float D; builtin trailing args (2, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
+    __hmma_m32n8k16_mma_f32f32(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 2, 0);
+  }
+
+  // D fp16, C fp32
+  // mma_sync, m32n8k16: __half A (row_major), __half B (col_major), float C, __half D; builtin trailing args (1, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
+    __hmma_m32n8k16_mma_f16f32(reinterpret_cast<int*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 1, 0);
+  }
+  
+  // mma_sync, m32n8k16: __half A (col_major), __half B (col_major), float C, __half D; builtin trailing args (3, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
+    __hmma_m32n8k16_mma_f16f32(reinterpret_cast<int*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 3, 0);
+  }
+  
+  // mma_sync, m32n8k16: __half A (row_major), __half B (row_major), float C, __half D; builtin trailing args (0, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
+    __hmma_m32n8k16_mma_f16f32(reinterpret_cast<int*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 0, 0);
+  }
+  
+  // mma_sync, m32n8k16: __half A (col_major), __half B (row_major), float C, __half D; builtin trailing args (2, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
+    __hmma_m32n8k16_mma_f16f32(reinterpret_cast<int*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 2, 0);
+  }
+
+#ifdef __CUDA_IMMA__  
+  // mma_sync, m32n8k16: signed char A (row_major), B (col_major), int C/D; layout arg 1.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m32n8k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);
+    } else {
+      __imma_m32n8k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 1);
+    }
+  }
+
+  // mma_sync, m32n8k16: signed char A (col_major), B (col_major), int C/D; layout arg 3.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m32n8k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);
+    } else {
+      __imma_m32n8k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 1);
+    }
+  }
+
+  // mma_sync, m32n8k16: signed char A (row_major), B (row_major), int C/D; layout arg 0.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m32n8k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);
+    } else {
+      __imma_m32n8k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 1);
+    }
+  }
+
+  // mma_sync, m32n8k16: signed char A (col_major), B (row_major), int C/D; layout arg 2.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m32n8k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);
+    } else {
+      __imma_m32n8k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 1);
+    }
+  }
+
+  // mma_sync, m32n8k16: unsigned char A (row_major), B (col_major), int C/D; layout arg 1.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m32n8k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);
+    } else {
+      __imma_m32n8k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 1);
+    }
+  }
+
+  // mma_sync, m32n8k16: unsigned char A (col_major), B (col_major), int C/D; layout arg 3.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m32n8k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);
+    } else {
+      __imma_m32n8k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 1);
+    }
+  }
+
+  // mma_sync, m32n8k16: unsigned char A (row_major), B (row_major), int C/D; layout arg 0.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m32n8k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);
+    } else {
+      __imma_m32n8k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 1);
+    }
+  }
+
+  // mma_sync, m32n8k16: unsigned char A (col_major), B (row_major), int C/D; layout arg 2.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m32n8k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);
+    } else {
+      __imma_m32n8k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 1);
+    }
+  }
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  // mma_sync, m32n8k16: __nv_bfloat16 A (row_major), B (col_major), float C, float D; builtin trailing args (1, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) {
+    __mma_bf16_m32n8k16_mma_f32(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 1, 0);
+  }
+  
+  // mma_sync, m32n8k16: __nv_bfloat16 A (col_major), B (col_major), float C, float D; builtin trailing args (3, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) {
+    __mma_bf16_m32n8k16_mma_f32(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 3, 0);
+  }
+  
+  // mma_sync, m32n8k16: __nv_bfloat16 A (row_major), B (row_major), float C, float D; builtin trailing args (0, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) {
+    __mma_bf16_m32n8k16_mma_f32(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 0, 0);
+  }
+  
+  // mma_sync, m32n8k16: __nv_bfloat16 A (col_major), B (row_major), float C, float D; builtin trailing args (2, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) {
+    __mma_bf16_m32n8k16_mma_f32(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 2, 0);
+  }
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+  // 
+  // MMA functions for shape m8n32k16
+  // 
+  // D fp16, C fp16
+  // mma_sync, m8n32k16: __half A (row_major), __half B (col_major), __half C, __half D; builtin trailing args (1, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
+    __hmma_m8n32k16_mma_f16f16(reinterpret_cast<int*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const int*>(&c), 1, 0);
+  }
+  
+  // mma_sync, m8n32k16: __half A (col_major), __half B (col_major), __half C, __half D; builtin trailing args (3, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
+    __hmma_m8n32k16_mma_f16f16(reinterpret_cast<int*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const int*>(&c), 3, 0);
+  }
+  
+  // mma_sync, m8n32k16: __half A (row_major), __half B (row_major), __half C, __half D; builtin trailing args (0, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
+    __hmma_m8n32k16_mma_f16f16(reinterpret_cast<int*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const int*>(&c), 0, 0);
+  }
+  
+  // mma_sync, m8n32k16: __half A (col_major), __half B (row_major), __half C, __half D; builtin trailing args (2, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
+    __hmma_m8n32k16_mma_f16f16(reinterpret_cast<int*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const int*>(&c), 2, 0);
+  }
+
+  // D fp32, C fp16
+  // mma_sync, m8n32k16: __half A (row_major), __half B (col_major), __half C, float D; builtin trailing args (1, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
+    __hmma_m8n32k16_mma_f32f16(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const int*>(&c), 1, 0);
+  }
+  
+  // mma_sync, m8n32k16: __half A (col_major), __half B (col_major), __half C, float D; builtin trailing args (3, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
+    __hmma_m8n32k16_mma_f32f16(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const int*>(&c), 3, 0);
+  }
+  
+  // mma_sync, m8n32k16: __half A (row_major), __half B (row_major), __half C, float D; builtin trailing args (0, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
+    __hmma_m8n32k16_mma_f32f16(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const int*>(&c), 0, 0);
+  }
+    
+  // mma_sync, m8n32k16: __half A (col_major), __half B (row_major), __half C, float D; builtin trailing args (2, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
+    __hmma_m8n32k16_mma_f32f16(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const int*>(&c), 2, 0);
+  }
+
+  // D fp32, C fp32
+  // mma_sync, m8n32k16: __half A (row_major), __half B (col_major), float C, float D; builtin trailing args (1, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
+    __hmma_m8n32k16_mma_f32f32(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 1, 0);
+  }
+  
+  // mma_sync, m8n32k16: __half A (col_major), __half B (col_major), float C, float D; builtin trailing args (3, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
+    __hmma_m8n32k16_mma_f32f32(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 3, 0);
+  }
+  
+  // mma_sync, m8n32k16: __half A (row_major), __half B (row_major), float C, float D; builtin trailing args (0, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
+    __hmma_m8n32k16_mma_f32f32(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 0, 0);
+  }
+    
+  // mma_sync, m8n32k16: __half A (col_major), __half B (row_major), float C, float D; builtin trailing args (2, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
+    __hmma_m8n32k16_mma_f32f32(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 2, 0);
+  }
+
+  // D fp16, C fp32
+  // mma_sync, m8n32k16: __half A (row_major), __half B (col_major), float C, __half D; builtin trailing args (1, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
+    __hmma_m8n32k16_mma_f16f32(reinterpret_cast<int*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 1, 0);
+  }
+    
+  // mma_sync, m8n32k16: __half A (col_major), __half B (col_major), float C, __half D; builtin trailing args (3, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
+    __hmma_m8n32k16_mma_f16f32(reinterpret_cast<int*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 3, 0);
+  }
+    
+  // mma_sync, m8n32k16: __half A (row_major), __half B (row_major), float C, __half D; builtin trailing args (0, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
+    __hmma_m8n32k16_mma_f16f32(reinterpret_cast<int*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 0, 0);
+  }
+    
+  // mma_sync, m8n32k16: __half A (col_major), __half B (row_major), float C, __half D; builtin trailing args (2, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
+    __hmma_m8n32k16_mma_f16f32(reinterpret_cast<int*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 2, 0);
+  }
+
+#ifdef __CUDA_IMMA__  
+  // mma_sync, m8n32k16: signed char A (row_major), B (col_major), int C/D; layout arg 1.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m8n32k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);
+    } else {
+      __imma_m8n32k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 1);
+    }
+  }
+  
+  // mma_sync, m8n32k16: signed char A (col_major), B (col_major), int C/D; layout arg 3.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m8n32k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);
+    } else {
+      __imma_m8n32k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 1);
+    }
+  }
+  
+  // mma_sync, m8n32k16: signed char A (row_major), B (row_major), int C/D; layout arg 0.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m8n32k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);
+    } else {
+      __imma_m8n32k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 1);
+    }
+  }
+  
+  // mma_sync, m8n32k16: signed char A (col_major), B (row_major), int C/D; layout arg 2.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m8n32k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);
+    } else {
+      __imma_m8n32k16_mma_s8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 1);
+    }
+  }
+  
+  // mma_sync, m8n32k16: unsigned char A (row_major), B (col_major), int C/D; layout arg 1.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m8n32k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);
+    } else {
+      __imma_m8n32k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 1);
+    }
+  }
+  
+  // mma_sync, m8n32k16: unsigned char A (col_major), B (col_major), int C/D; layout arg 3.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m8n32k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);
+    } else {
+      __imma_m8n32k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 1);
+    }
+  }
+  
+  // mma_sync, m8n32k16: unsigned char A (row_major), B (row_major), int C/D; layout arg 0.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m8n32k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);
+    } else {
+      __imma_m8n32k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 1);
+    }
+  }
+  
+  // mma_sync, m8n32k16: unsigned char A (col_major), B (row_major), int C/D; layout arg 2.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m8n32k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);
+    } else {
+      __imma_m8n32k16_mma_u8((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 1);
+    }
+  }
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  // mma_sync, m8n32k16: __nv_bfloat16 A (row_major), B (col_major), float C, float D; builtin trailing args (1, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) {
+    __mma_bf16_m8n32k16_mma_f32(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 1, 0);
+  }
+  
+  // mma_sync, m8n32k16: __nv_bfloat16 A (col_major), B (col_major), float C, float D; builtin trailing args (3, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) {
+    __mma_bf16_m8n32k16_mma_f32(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 3, 0);
+  }
+  
+  // mma_sync, m8n32k16: __nv_bfloat16 A (row_major), B (row_major), float C, float D; builtin trailing args (0, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) {
+    __mma_bf16_m8n32k16_mma_f32(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 0, 0);
+  }
+  
+  // mma_sync, m8n32k16: __nv_bfloat16 A (col_major), B (row_major), float C, float D; builtin trailing args (2, 0).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) {
+    __mma_bf16_m8n32k16_mma_f32(reinterpret_cast<float*>(&d), reinterpret_cast<const int*>(&a), reinterpret_cast<const int*>(&b), reinterpret_cast<const float*>(&c), 2, 0);
+  }
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+
+#ifdef __CUDA_SUBBYTE_IMMA__  
+  // 
+  // MMA functions for shape m8n8k32
+  // 
+  // mma_sync, m8n8k32: experimental::precision::s4 A (row_major), B (col_major), int C/D; layout arg 1.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 32, int>& d, const fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major>& a, const fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major>& b, const fragment<accumulator, 8, 8, 32, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m8n8k32_mma_s4((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);
+    } else {
+      __imma_m8n8k32_mma_s4((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 1);
+    }
+  }
+
+  // mma_sync, m8n8k32: experimental::precision::u4 A (row_major), B (col_major), int C/D; layout arg 1.
+  // satf only picks which literal (1 if set, else 0) is passed as the builtin's last argument.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 32, int>& d, const fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major>& a, const fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major>& b, const fragment<accumulator, 8, 8, 32, int>& c, bool satf) {
+    if (!satf) {
+      __imma_m8n8k32_mma_u4((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);
+    } else {
+      __imma_m8n8k32_mma_u4((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 1);
+    }
+  }
+
+  // 
+  // MMA functions for shape m8n8k128
+  // 
+  // bmma_sync, m8n8k128: experimental::precision::b1 A (row_major), B (col_major), int C/D.
+  // When __CUDA_AMPERE_MMA__ is defined and op is bmmaBitOpAND, the and_popc builtin is called;
+  // in every other case the xor_popc builtin is called. The unnamed bmmaAccumulateOp argument is not read.
+  __CUDA_MMA_DEVICE_DECL__  void bmma_sync(fragment<accumulator, 8, 8, 128, int>& d, const fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major>& a, const fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major>& b, const fragment<accumulator, 8, 8, 128, int>& c,
+                                           experimental::bmmaBitOp op, experimental::bmmaAccumulateOp)
+  {
+#ifdef __CUDA_AMPERE_MMA__
+    if (op == experimental::bmmaBitOpAND) {
+      __bmma_m8n8k128_mma_and_popc_b1((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1);
+      return;
+    }
+#endif  /* __CUDA_AMPERE_MMA__ */
+    __bmma_m8n8k128_mma_xor_popc_b1((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1);
+  }
+
+
+#endif  /* __CUDA_SUBBYTE_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  // 
+  // MMA functions for shape m16n16k8
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) {
+    __mma_tf32_m16n16k8_mma_f32((float *)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);    // fifth argument 1: A row_major, B col_major; across the four overloads it is 2*(A col_major) + (B col_major)
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) {
+    __mma_tf32_m16n16k8_mma_f32((float *)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);    // fifth argument 3: A col_major, B col_major
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& b, const fragment<accumulator, 16, 16, 8, float>& c)  {
+    __mma_tf32_m16n16k8_mma_f32((float *)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);    // fifth argument 0: A row_major, B row_major
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& b, const fragment<accumulator, 16, 16, 8, float>& c)  {
+    __mma_tf32_m16n16k8_mma_f32((float *)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);    // fifth argument 2: A col_major, B row_major; the last argument is 0 in all four tf32 overloads
+  }
+
+  
+  // 
+  // MMA functions for shape m8n8k4
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, row_major>& a, const fragment<matrix_b, 8, 8, 4, double, col_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) {
+    __dmma_m8n8k4_mma_f64((double *)&d, (const double*)&a, (const double*)&b, (const double*)&c, 1, 0); // fifth argument 1: A row_major, B col_major; across the four overloads it is 2*(A col_major) + (B col_major)
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, col_major>& a, const fragment<matrix_b, 8, 8, 4, double, col_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) {
+    __dmma_m8n8k4_mma_f64((double *)&d, (const double*)&a, (const double*)&b, (const double*)&c, 3, 0); // fifth argument 3: A col_major, B col_major
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, row_major>& a, const fragment<matrix_b, 8, 8, 4, double, row_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) {
+    __dmma_m8n8k4_mma_f64((double *)&d, (const double*)&a, (const double*)&b, (const double*)&c, 0, 0); // fifth argument 0: A row_major, B row_major
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, col_major>& a, const fragment<matrix_b, 8, 8, 4, double, row_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) {
+    __dmma_m8n8k4_mma_f64((double *)&d, (const double*)&a, (const double*)&b, (const double*)&c, 2, 0); // fifth argument 2: A col_major, B row_major; the last argument is 0 in all four double overloads
+  }
+  
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+};
+};
+
+#undef __CUDA_IMMA__
+#undef __CUDA_SUBBYTE_IMMA__
+#undef __CUDA_MMA_DEVICE_DECL__
+#undef __CUDA_AMPERE_MMA__
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+
+#endif   /* __CUDA_MMA_HPP__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_HPP__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_HPP__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/nvfunctional b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/nvfunctional
new file mode 100644
index 0000000000000000000000000000000000000000..5cb9ffeb9cb9f1d202cb1f5cb1d4d7e88a416475
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/nvfunctional
@@ -0,0 +1,621 @@
+/*
+ * NVIDIA_COPYRIGHT_BEGIN
+ *
+ * Copyright (c) 2014-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ * NVIDIA_COPYRIGHT_END
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/nvfunctional is an internal header file and must not be used directly.  Please use nvfunctional instead.")
+#else
+#warning "crt/nvfunctional is an internal header file and must not be used directly.  Please use nvfunctional instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H__
+#endif
+
+#ifndef __NV_LIBCXX_FUNCTIONAL_H__
+#define __NV_LIBCXX_FUNCTIONAL_H__
+
+#if __cplusplus < 201103L 
+  #if defined(_MSC_VER)
+    #if _MSC_VER < 1800
+      #error This library requires VS 2013 and above
+    #endif /* _MSC_VER < 1800 */
+  #else /* !_MSC_VER */
+    #error This library requires support for the ISO C++ 2011 standard
+  #endif /* _MSC_VER */
+#endif /* __cplusplus */
+
+#if defined(_MSC_VER)
+  #define __NV_ALIGNOF __alignof
+  #define __NV_NOEXCEPT
+  #define __NV_CONSTEXPR
+#else /* !_MSC_VER */
+  #define __NV_ALIGNOF alignof
+  #define __NV_NOEXCEPT noexcept
+  #define __NV_CONSTEXPR constexpr
+#endif /* _MSC_VER */
+
+#include <type_traits>
+#include <cstddef>
+#include <new>
+
+// n3290 20.8
+namespace nvstd
+{
+
+namespace internal {
+
+// D.8.1 base (deprecated) [depr.base]
+template <class _Arg, class _Result>
+struct unary_function
+{
+  using argument_type = _Arg;    // type of the single call argument
+  using result_type = _Result;   // type produced by the call
+};
+
+template <class _Arg1, class _Arg2, class _Result>
+struct binary_function
+{
+  using first_argument_type = _Arg1;    // type of the first call argument
+  using second_argument_type = _Arg2;   // type of the second call argument
+  using result_type = _Result;          // type produced by the call
+};
+
+// move: host/device helper that casts __t to an rvalue reference to std::remove_reference<_T>::type
+template <class _T>
+inline __device__ __host__
+typename std::remove_reference<_T>::type&& move(_T&& __t) __NV_NOEXCEPT
+{
+  return static_cast<typename std::remove_reference<_T>::type&&>(__t); // pure cast: nothing is copied or moved here
+}
+
+// 20.2.2 swap [utility.swap]
+// swap: exchanges __a and __b through one temporary; enabled only for move-constructible and move-assignable _T
+template<class _T, 
+         class = typename std::enable_if<
+                   std::is_move_constructible<_T>::value &&
+                   std::is_move_assignable<_T>::value>::type>
+inline __device__ __host__
+void swap(_T& __a, _T& __b) 
+#if !defined(_MSC_VER)
+noexcept(std::is_nothrow_move_constructible<_T>::value &&
+         std::is_nothrow_move_assignable<_T>::value)
+#endif /* !defined(_MSC_VER) */
+{
+  _T __t(internal::move(__a)); // park __a's value in a temporary
+  __a = internal::move(__b);   // __b's value goes into __a
+  __b = internal::move(__t);   // the parked value goes into __b
+}
+
+// 20.2.3 forward/move helpers [forward]
+// forward (lvalue overload): host/device helper that casts __t to _T&&
+template <class _T> 
+inline __device__ __host__
+_T&& forward(typename std::remove_reference<_T>::type& __t) __NV_NOEXCEPT
+{
+  return static_cast<_T&&>(__t); // _T&& collapses to an lvalue reference when _T is one, otherwise yields an rvalue reference
+}
+
+template <class _T> 
+inline __device__ __host__
+_T&& forward(typename std::remove_reference<_T>::type&& __t) __NV_NOEXCEPT // rvalue overload of forward
+{
+  static_assert(!std::is_lvalue_reference<_T>::value, // compile-time rejection of forwarding an rvalue as an lvalue
+                "Error: __t is instantiated with an lvalue reference type");
+  return static_cast<_T&&>(__t);
+}
+
+} // namespace internal
+
+namespace __functional_helpers
+{
+
+struct __dummy_class;
+
+// Store small functors locally (inside the function object itself).
+// A functor is eligible for local storage if it is one of the following types:
+// * member object pointer;
+// * member function pointer;
+// * closure type of size less than or equal to the largest size of 
+//   the above types;
+// * function pointer;
+// * any callable class whose size is less than or equal to
+//   the largest one of the above types.
+union _Small_functor_types // never instantiated in the visible code; only its sizeof/alignof are used
+{
+  void *__obj;                          // object pointer
+  void (*__func_ptr)();                 // plain function pointer
+  void (__dummy_class::*mem_fn_ptr)();  // pointer to member function of an incomplete class
+};
+
+struct _Small_functor_data { // raw byte buffer used as the in-object storage of nvstd::function
+  char __data[sizeof(_Small_functor_types)]; // exactly as many bytes as the _Small_functor_types union
+};
+
+template <class _RetType, class ..._ArgTypes>
+struct __maybe_base_function // primary template: no nested typedefs
+{ };
+
+template <class _RetType, class _T1>
+struct __maybe_base_function<_RetType(_T1)> // one-argument signature: inherits argument_type / result_type
+  : public internal::unary_function<_T1, _RetType>
+{ };
+
+template <class _RetType, class _T1, class _T2>
+struct __maybe_base_function<_RetType(_T1, _T2)> // two-argument signature: inherits first/second_argument_type / result_type
+  : public internal::binary_function<_T1, _T2, _RetType>
+{ };
+
+} // namespace __functional_helpers
+
+// 20.8.11 Polymorphic function wrappers [func.wrap]
+
+// 20.8.11.1 Class bad_function_call [func.wrap.badcall]
+// unimplemented because of exception
+// class bad_function_call : public std::exception
+
+// 20.8.11.2 Class template function [func.wrap.func]
+
+template<class> class function; // undefined
+
+// Simplified version of template class function, which
+//   * does not support allocator_arg_t;
+//   * does not support target and target_type that rely on RTTI
+//   * does not throw bad_function_call exception on invoking a NULL target
+template <class _RetType, class ..._ArgTypes>
+class function<_RetType(_ArgTypes...)> 
+  : public __functional_helpers::__maybe_base_function<_RetType(_ArgTypes...)>
+{
+  __functional_helpers::_Small_functor_data __small_functor_data;
+  void *__obj;
+  typedef _RetType(*__meta_fn_type)(void *, _ArgTypes...);
+  __meta_fn_type __meta_fn;
+  typedef void(*__cloner_type)(function &, const function &);
+  __cloner_type __cloner;
+  typedef void(*__destructor_type)(function *);
+  __destructor_type __destructor;
+
+  #pragma nv_exec_check_disable
+  template <class _F>
+  __device__ __host__
+  __NV_CONSTEXPR bool __use_small_functor_data() const // true when an _F may be stored in the in-object buffer
+  {
+    return (sizeof(_F) <= sizeof(__small_functor_data) && // _F fits in the buffer ...
+            __NV_ALIGNOF(_F) <= __NV_ALIGNOF( // ... and is no more strictly aligned than the union of small functor types
+                                  __functional_helpers::_Small_functor_types));
+  }
+
+  #pragma nv_exec_check_disable
+  __device__ __host__
+  void* __get_small_functor_data() const // address of the first byte of the in-object buffer
+  {
+    return (void*)(&__small_functor_data.__data[0]); // the C-style cast also drops the const added by this const member function
+  }
+
+  #pragma nv_exec_check_disable
+  __device__ __host__
+  bool __is_small_functor_data() const // true when the current target lives in this object's own buffer
+  {
+    return __obj == __get_small_functor_data(); // decided purely by pointer identity; false for an empty object (__obj == 0)
+  }
+
+  #pragma nv_exec_check_disable
+  template <class _F>
+  __device__ __host__
+  static _F& __get_functor(void *__p) // view the untyped storage pointer as a reference to _F
+  {
+    return *((_F*)__p); // unchecked cast: the caller is responsible for __p really addressing an _F
+  }
+
+  #pragma nv_exec_check_disable
+  template <class _F>
+  __device__ __host__
+  static bool __is_empty_functor(const _F& /*__p*/) // generic callable: never considered empty
+  {
+    return false;
+  }
+
+  #pragma nv_exec_check_disable
+  template <class _F>
+  __device__ __host__
+  static bool __is_empty_functor(const _F* __p) // pointer: empty when null
+  {
+    return !__p;
+  }
+  
+  #pragma nv_exec_check_disable
+  template <class _Res, class _C>
+  __device__ __host__
+  static bool __is_empty_functor(const _Res _C::* __p) // pointer to member: empty when null
+  {
+    return !__p;
+  }
+ 
+  #pragma nv_exec_check_disable
+  template <class _Res, class... _Args>
+  __device__ __host__
+  static bool __is_empty_functor(const function<_Res(_Args...)>& __p) // another nvstd::function: empty when it has no target
+  {
+    return !__p; // relies on that wrapper's explicit operator bool
+  }
+  
+  template <class _F>
+  struct __make_cloner // provides the copy routine whose address the converting constructor stores in __cloner
+  {
+    #pragma nv_exec_check_disable
+    __device__ __host__
+    static void __clone_data(function &__dest, const function &__src) // copy-constructs __src's _F into __dest; sets only __dest.__obj
+    {
+      if (__dest.__use_small_functor_data<_F>()) {
+        __dest.__obj = __dest.__get_small_functor_data();
+        new (__dest.__obj) _F(__src.__get_functor<_F>(__src.__obj)); // placement-new into __dest's in-object buffer
+      }
+      else {
+        __dest.__obj = new _F(__src.__get_functor<_F>(__src.__obj)); // does not qualify for the buffer: allocate with new
+      }
+    }
+  };
+
+  template <class _F>
+  struct __make_destructor // provides the teardown routine whose address the converting constructor stores in __destructor
+  {
+    #pragma nv_exec_check_disable
+    __device__ __host__
+    static void __destruct(function *__fn) // destroys the _F held by *__fn; does not reset __fn->__obj
+    {
+      if (__fn->__use_small_functor_data<_F>()) {
+        (__fn->__get_functor<_F>(__fn->__obj)).~_F(); // in-buffer target: run the destructor only
+      }
+      else {
+        delete (_F*)(__fn->__obj); // target created with new: destroy and release it
+      }
+    }
+  };
+
+  // We cannot simply define __make_functor in the following way:
+  // template <class _T, _F>
+  // __make_functor;
+  // template <class _RetType1, class _F, class... _ArgTypes1>
+  // struct __make_functor<_RetType1(_ArgTypes1...), _F> 
+  //
+  // because VS 2013 cannot unpack _RetType1(_ArgTypes1...)
+  template <class _RetType1, class _F, class... _ArgTypes1>
+  struct __make_functor // provides the call trampoline whose address the converting constructor stores in __meta_fn
+  {
+    typedef _RetType1 type;
+
+    #pragma nv_exec_check_disable
+    __device__ __host__
+    static _RetType1 __invoke(void *__d, _ArgTypes1... __args) // __d is the type-erased pointer to the stored _F
+    {
+      return __get_functor<_F>(__d)( // recover the _F and call it
+               internal::forward<_ArgTypes1>(__args)...); // each by-value parameter is forwarded as its declared type
+    }
+  };
+
+  template <class _RetType1, class _C, class _M, class... _ArgTypes1>
+  struct __make_functor<_RetType1, _M _C::*,_ArgTypes1...> // partial specialization chosen when the target type is a pointer to member
+  {
+    typedef _RetType1 type;
+    typedef _RetType1(*_Fn)(_ArgTypes1...); // plain function pointer type with the wrapper's signature
+
+    #pragma nv_exec_check_disable    
+    __device__ __host__
+    static _RetType1 __invoke(void *__d, _ArgTypes1... __args)
+    {
+      return __get_functor<_Fn>(__d)( // NOTE(review): the stored object is read back as _Fn, not as _M _C::* -- confirm this is intended
+               internal::forward<_ArgTypes1>(__args)...);
+    }
+  };
+
+// workaround for GCC version below 4.8
+#if (__GNUC__ == 4) && (__GNUC_MINOR__ < 8)
+  template <class _F>
+  struct __check_callability // GCC 4.x before 4.8: only std::nullptr_t is rejected
+    : public std::integral_constant<bool, 
+                                    !std::is_same<_F, std::nullptr_t>::value>
+  { };
+#elif defined(_MSC_VER)
+  // simulate VC 2013's behavior...
+  template <class _F>
+  struct __check_callability1 // MSVC helper: member pointers always pass, everything else goes through result_of
+    : public 
+        std::integral_constant<bool, 
+          // std::result_of does not handle member pointers well 
+          std::is_member_pointer<_F>::value ||
+          std::is_convertible<
+            _RetType, // NOTE(review): tests _RetType -> call result, the reverse direction of the generic branch below
+            typename std::result_of<_F(_ArgTypes...)>::type
+          >::value
+        >
+  { };
+
+  template <class _F>
+  struct __check_callability // MSVC: rejects _F == function, then applies the helper to the cv-stripped type
+    : public std::integral_constant<
+               bool,
+               !std::is_same<_F, function>::value && 
+               __check_callability1<typename std::remove_cv<_F>::type>::value>
+  { };
+#else /* !((__GNUC__ == 4) && (__GNUC_MINOR__ < 8)) && !_MSC_VER */
+  template <class _F,
+            class _T = typename std::result_of<_F(_ArgTypes...)>::type> // _T: result of calling _F with the wrapper's argument types
+  struct __check_callability // generic: _F must not be function itself and its call result must convert to _RetType
+    : public std::integral_constant<
+               bool,
+               !std::is_same<_F, function>::value && 
+                 std::is_convertible< _T, _RetType>::value>
+  { };
+#endif /* (__GNUC__ == 4) && (__GNUC_MINOR__ < 8), _MSC_VER */
+
+  #pragma nv_exec_check_disable
+  __device__ __host__
+  void __destroy() // tear down the current target, if any, and mark the object empty
+  {
+    if (__obj) { // a null __obj means "no target": nothing to do
+      __destructor(this); // type-erased teardown installed together with the target
+      __obj = 0; // only __obj is reset; __meta_fn, __cloner and __destructor keep their old values
+    }
+  }
+  
+  #pragma nv_exec_check_disable 
+  __device__ __host__
+  void __clear() // put the object into the empty state without destroying anything
+  {
+    __obj = 0; // no target
+    __meta_fn = 0; // no call trampoline
+    __cloner = 0; // no copy routine
+    __destructor = 0; // no teardown routine
+  }
+
+public:
+  typedef _RetType result_type;
+
+/* 
+ * These typedef(s) are derived from __maybe_base_function
+ * typedef T1 argument_type;        // only if sizeof...(ArgTypes) == 1 and
+ *                                  // the type in ArgTypes is T1
+ * typedef T1 first_argument_type;  // only if sizeof...(ArgTypes) == 2 and
+ *                                  // ArgTypes contains T1 and T2
+ * typedef T2 second_argument_type; // only if sizeof...(ArgTypes) == 2 and
+ *                                  // ArgTypes contains T1 and T2
+ */
+
+  // 20.8.11.2.1 construct/copy/destroy [func.wrap.con]
+  
+  #pragma nv_exec_check_disable 
+  __device__ __host__ 
+  function() __NV_NOEXCEPT // default constructor: creates an empty wrapper
+    : __obj(0), __meta_fn(0), __cloner(0), __destructor(0) {} // the byte buffer itself is left uninitialized
+
+  #pragma nv_exec_check_disable 
+  __device__ __host__ 
+  function(std::nullptr_t) __NV_NOEXCEPT // construction from nullptr: same empty state as the default constructor
+    : __obj(0), __meta_fn(0), __cloner(0), __destructor(0) {} // the byte buffer itself is left uninitialized
+
+  #pragma nv_exec_check_disable 
+  __device__ __host__ 
+  function(const function &__fn) // copy constructor: duplicates __fn's target, or becomes empty if __fn is empty
+  {
+    if (__fn.__obj == 0) {
+      __clear(); // members start uninitialized here, so the empty state is written explicitly
+    }
+    else {
+      __meta_fn = __fn.__meta_fn;
+      __destructor = __fn.__destructor;
+      __fn.__cloner(*this, __fn); // type-erased copy; this is what assigns __obj
+      __cloner = __fn.__cloner;
+    }
+  }
+
+  // Move constructor: adopts __fn's target and leaves __fn empty. Members are
+  // nulled first because swap() reads them and hands the old values to __fn.
+  #pragma nv_exec_check_disable 
+  __device__ __host__ 
+  function(function &&__fn)
+    : __obj(0), __meta_fn(0), __cloner(0), __destructor(0) { __fn.swap(*this); }
+
+  // VS 2013 cannot process __check_callability type trait.
+  // So, we check callability using static_assert instead of
+  // using SFINAE such as
+  // template<class _F, 
+  //          class = typename std::enable_if<
+  //                    __check_callability<_F>::value
+  //         >::type>
+  
+  #pragma nv_exec_check_disable   
+  template<class _F>
+  __device__ __host__ 
+  function(_F);
+
+  // Copy assignment (copy and swap): build a copy of __fn, then trade states with it.
+  #pragma nv_exec_check_disable   
+  __device__ __host__
+  function& operator=(const function& __fn)
+  {
+    function(__fn).swap(*this); // the temporary leaves holding the previous target and destroys it at the end of this statement
+    return *this;
+  }
+
+  #pragma nv_exec_check_disable 
+  __device__ __host__
+  function& operator=(function&& __fn) // move assignment, implemented through the move constructor plus swap
+  {
+    function(internal::move(__fn)).swap(*this); // the temporary takes __fn's state, then trades it for this object's previous state
+    return *this;
+  }
+
+  #pragma nv_exec_check_disable 
+  __device__ __host__
+  function& operator=(std::nullptr_t) // assigning nullptr drops the current target
+  {
+    __destroy(); // destroys the target if present and nulls __obj
+    return *this;
+  }
+
+  #pragma nv_exec_check_disable
+  template<class _F>
+  __device__ __host__
+  function&
+  operator=(_F&& __fn) // assignment from an arbitrary callable
+  {
+    static_assert(__check_callability<_F>::value, // compile-time rejection of unsuitable targets
+                  "Unable to create functor object!");
+    function(internal::forward<_F>(__fn)).swap(*this); // wrap __fn in a temporary, then trade states with it
+    return *this;
+  }
+
+  #pragma nv_exec_check_disable
+  __device__ __host__
+  ~function() // destructor: releases the target, if any
+  {
+    __destroy(); // no-op when __obj is null
+  }
+
+  // 20.8.11.2.2 function modifiers [func.wrap.func.mod]
+  #pragma nv_exec_check_disable 
+  __device__ __host__
+  void swap(function& __fn) __NV_NOEXCEPT // exchange the complete state (hooks and target) of *this and __fn
+  {
+    internal::swap(__meta_fn, __fn.__meta_fn);
+    internal::swap(__cloner, __fn.__cloner);
+    internal::swap(__destructor, __fn.__destructor);
+
+    if (__is_small_functor_data() && __fn.__is_small_functor_data()) { // both targets in-buffer: trade the raw bytes, each __obj already points at its own buffer
+      internal::swap(__small_functor_data, __fn.__small_functor_data);
+    }
+    else if (__is_small_functor_data()) { // only *this holds its target in-buffer
+      internal::swap(__small_functor_data, __fn.__small_functor_data); // buffers are exchanged as plain char arrays
+      internal::swap(__obj, __fn.__obj); // *this takes __fn's pointer (or null) ...
+      __fn.__obj = __fn.__get_small_functor_data(); // ... and __fn is re-aimed at its own buffer, which now holds the bytes
+    }
+    else if (__fn.__is_small_functor_data()) { // only __fn holds its target in-buffer: mirror image of the branch above
+      internal::swap(__small_functor_data, __fn.__small_functor_data);
+      internal::swap(__obj, __fn.__obj);
+      __obj = __get_small_functor_data();
+    }
+    else { // neither target is in-buffer: exchanging the pointers is enough
+      internal::swap(__obj, __fn.__obj);
+    }
+  }
+
+  // 20.8.11.2.3 function capacity [func.wrap.func.cap]
+  #pragma nv_exec_check_disable   
+  __device__ __host__
+  explicit operator bool() const __NV_NOEXCEPT // true when a target is stored
+  {
+    return __obj; // pointer-to-bool conversion: a null __obj is the empty state
+  }
+
+  // 20.8.11.2.4 function invocation [func.wrap.func.inv]
+  // function::operator() can only be called in device code
+  // to avoid cross-execution space calls
+  #pragma nv_exec_check_disable   
+  __device__ __host__
+  _RetType operator()(_ArgTypes...) const;
+
+};
+
+// Out-of-line definitions
+#pragma nv_exec_check_disable
+template<class _RetType, class... _ArgTypes>
+template<class _F>
+__device__ __host__
+function<_RetType(_ArgTypes...)>::function(_F __fn) // converting constructor: wraps a copy of the callable __fn
+  : __obj(0), __meta_fn(0), __cloner(0), __destructor(0)
+{
+  static_assert(__check_callability<_F>::value, // compile-time rejection of unsuitable targets
+                "Unable to construct functor object!");
+  if (__is_empty_functor(__fn)) // null pointers and empty wrappers produce an empty object
+    return;
+  __meta_fn = &__make_functor<_RetType, _F, _ArgTypes...>::__invoke; // call trampoline for _F
+  __cloner = &__make_cloner<_F>::__clone_data; // copy routine for _F
+  __destructor = &__make_destructor<_F>::__destruct; // teardown routine for _F
+
+  if (__use_small_functor_data<_F>()) {
+    __obj = __get_small_functor_data();
+    new ((void*)__obj) _F(internal::move(__fn)); // move-construct the target inside the in-object buffer
+  }
+  else {
+    __obj = new _F(internal::move(__fn)); // does not qualify for the buffer: allocate with new
+  }
+}
+
+#pragma nv_exec_check_disable 
+template <class _RetType, class..._ArgTypes>
+__device__ __host__
+_RetType
+function<_RetType(_ArgTypes...)>::operator()(_ArgTypes... __args) const // invoke the stored target
+{
+  return __meta_fn(__obj, internal::forward<_ArgTypes>(__args)...); // no emptiness check: __meta_fn is null when the object is empty
+}
+
+// 20.8.11.2.6, Null pointer comparisons:
+
+#pragma nv_exec_check_disable 
+template <class _R, class... _ArgTypes>
+__device__ __host__
+bool operator==(const function<_R(_ArgTypes...)>& __fn, std::nullptr_t) 
+__NV_NOEXCEPT
+{
+  return !__fn; // equal to nullptr exactly when __fn has no target
+}
+
+#pragma nv_exec_check_disable 
+template <class _R, class... _ArgTypes>
+__device__ __host__
+bool operator==(std::nullptr_t, const function<_R(_ArgTypes...)>& __fn)
+__NV_NOEXCEPT
+{
+  return !__fn; // same test with the operands reversed
+}
+
+#pragma nv_exec_check_disable 
+template <class _R, class... _ArgTypes>
+__device__ __host__
+bool operator!=(const function<_R(_ArgTypes...)>& __fn, std::nullptr_t)
+__NV_NOEXCEPT
+{
+  return static_cast<bool>(__fn); // differs from nullptr exactly when __fn has a target
+}
+
+#pragma nv_exec_check_disable 
+template <class _R, class... _ArgTypes>
+__device__ __host__
+bool operator!=(std::nullptr_t, const function<_R(_ArgTypes...)>& __fn)
+__NV_NOEXCEPT
+{
+  return static_cast<bool>(__fn); // same test with the operands reversed
+}
+
+// 20.8.11.2.7, specialized algorithms:
+#pragma nv_exec_check_disable 
+template <class _R, class... _ArgTypes>
+__device__ __host__
+void swap(function<_R(_ArgTypes...)>& __fn1, function<_R(_ArgTypes...)>& __fn2) // namespace-level swap for two wrappers of the same signature
+{
+  __fn1.swap(__fn2); // delegates to the member swap
+}
+
+} // namespace nvstd
+
+#undef __NV_NOEXCEPT
+#undef __NV_CONSTEXPR
+#undef __NV_ALIGNOF
+
+#endif // __NV_LIBCXX_FUNCTIONAL_H__
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_100_rt.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_100_rt.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d798a6e5392ed631ed3b546304b16c94d65a1c8
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_100_rt.h
@@ -0,0 +1,252 @@
+/*
+ * Copyright 2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/sm_100_rt.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/sm_100_rt.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_100_RT_H__
+#endif
+
+#if !defined(__SM_100_RT_H__)
+#define __SM_100_RT_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_100_RT_DECL__ __host__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_100_RT_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 1000
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ || _NVHPC_CUDA */
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector fused multiply-add operation
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ * in round-to-nearest-even mode.
+ *
+ * Numeric behavior per component is the same as ::__fmaf_rn().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __ffma2_rn(float2 x, float2 y, float2 z) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector fused multiply-add operation
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ * in round-towards-zero mode.
+ *
+ * Numeric behavior per component is the same as ::__fmaf_rz().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __ffma2_rz(float2 x, float2 y, float2 z) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector fused multiply-add operation
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ * in round-down mode.
+ *
+ * Numeric behavior per component is the same as ::__fmaf_rd().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __ffma2_rd(float2 x, float2 y, float2 z) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector fused multiply-add operation
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ * in round-up mode.
+ *
+ * Numeric behavior per component is the same as ::__fmaf_ru().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __ffma2_ru(float2 x, float2 y, float2 z) __DEF_IF_HOST
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector add operation
+ * \cuda_math_formula x + y \end_cuda_math_formula
+ * in round-to-nearest-even mode.
+ *
+ * Numeric behavior per component is the same as ::__fadd_rn().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __fadd2_rn(float2 x, float2 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector add operation
+ * \cuda_math_formula x + y \end_cuda_math_formula
+ * in round-towards-zero mode.
+ *
+ * Numeric behavior per component is the same as ::__fadd_rz().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __fadd2_rz(float2 x, float2 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector add operation
+ * \cuda_math_formula x + y \end_cuda_math_formula
+ * in round-down mode.
+ *
+ * Numeric behavior per component is the same as ::__fadd_rd().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __fadd2_rd(float2 x, float2 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector add operation
+ * \cuda_math_formula x + y \end_cuda_math_formula
+ * in round-up mode.
+ *
+ * Numeric behavior per component is the same as ::__fadd_ru().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __fadd2_ru(float2 x, float2 y) __DEF_IF_HOST
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector multiply operation
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ * in round-to-nearest-even mode.
+ *
+ * Numeric behavior per component is the same as ::__fmul_rn().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __fmul2_rn(float2 x, float2 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector multiply operation
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ * in round-towards-zero mode.
+ *
+ * Numeric behavior per component is the same as ::__fmul_rz().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __fmul2_rz(float2 x, float2 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector multiply operation
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ * in round-down mode.
+ *
+ * Numeric behavior per component is the same as ::__fmul_rd().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __fmul2_rd(float2 x, float2 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector multiply operation
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ * in round-up mode.
+ *
+ * Numeric behavior per component is the same as ::__fmul_ru().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __fmul2_ru(float2 x, float2 y) __DEF_IF_HOST
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 1000 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_100_RT_DECL__
+
+#if (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA)
+#include "sm_100_rt.hpp"
+#endif /* (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA) */
+
+#endif /* !__SM_100_RT_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_100_RT_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_100_RT_H__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_100_rt.hpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_100_rt.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a5d620bf0b8091e0ea6cd48da00e8689b92cdd88
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_100_rt.hpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright 2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/sm_100_rt.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/sm_100_rt.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_100_RT_HPP__
+#endif
+
+#if !defined(__SM_100_RT_HPP__)
+#define __SM_100_RT_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_100_RT_DECL__ __host__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_100_RT_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 1000
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+/*******************************************************************************
+*                                                                              *
+*  Below are implementations of SM-10.0 builtin functions which are included   *
+*  as source (instead of being built in to the compiler)                       *
+*                                                                              *
+*******************************************************************************/
+
+extern "C" {  // *_impl entry points that the wrappers defined after this block forward to
+  __device__ __device_builtin__ float2 __ffma2_rn_impl(float2 x, float2 y, float2 z);
+  __device__ __device_builtin__ float2 __ffma2_rz_impl(float2 x, float2 y, float2 z);
+  __device__ __device_builtin__ float2 __ffma2_rd_impl(float2 x, float2 y, float2 z);
+  __device__ __device_builtin__ float2 __ffma2_ru_impl(float2 x, float2 y, float2 z);
+  // __fadd2 implementations, one per rounding-mode suffix (rn/rz/rd/ru)
+  __device__ __device_builtin__ float2 __fadd2_rn_impl(float2 x, float2 y);
+  __device__ __device_builtin__ float2 __fadd2_rz_impl(float2 x, float2 y);
+  __device__ __device_builtin__ float2 __fadd2_rd_impl(float2 x, float2 y);
+  __device__ __device_builtin__ float2 __fadd2_ru_impl(float2 x, float2 y);
+  // __fmul2 implementations, one per rounding-mode suffix (rn/rz/rd/ru)
+  __device__ __device_builtin__ float2 __fmul2_rn_impl(float2 x, float2 y);
+  __device__ __device_builtin__ float2 __fmul2_rz_impl(float2 x, float2 y);
+  __device__ __device_builtin__ float2 __fmul2_rd_impl(float2 x, float2 y);
+  __device__ __device_builtin__ float2 __fmul2_ru_impl(float2 x, float2 y);
+} // extern "C"
+// __ffma2_*: each wrapper passes its arguments unchanged to the same-named *_impl function.
+__SM_100_RT_DECL__ float2 __ffma2_rn(float2 x, float2 y, float2 z) {
+  return __ffma2_rn_impl(x, y, z);
+}
+__SM_100_RT_DECL__ float2 __ffma2_rz(float2 x, float2 y, float2 z) {
+  return __ffma2_rz_impl(x, y, z);
+}
+__SM_100_RT_DECL__ float2 __ffma2_rd(float2 x, float2 y, float2 z) {
+  return __ffma2_rd_impl(x, y, z);
+}
+__SM_100_RT_DECL__ float2 __ffma2_ru(float2 x, float2 y, float2 z) {
+  return __ffma2_ru_impl(x, y, z);
+}
+// __fadd2_*: same forwarding pattern for the two-operand __fadd2 variants.
+__SM_100_RT_DECL__ float2 __fadd2_rn(float2 x, float2 y) {
+  return __fadd2_rn_impl(x, y);
+}
+__SM_100_RT_DECL__ float2 __fadd2_rz(float2 x, float2 y) {
+  return __fadd2_rz_impl(x, y);
+}
+__SM_100_RT_DECL__ float2 __fadd2_rd(float2 x, float2 y) {
+  return __fadd2_rd_impl(x, y);
+}
+__SM_100_RT_DECL__ float2 __fadd2_ru(float2 x, float2 y) {
+  return __fadd2_ru_impl(x, y);
+}
+// __fmul2_*: same forwarding pattern for the two-operand __fmul2 variants.
+__SM_100_RT_DECL__ float2 __fmul2_rn(float2 x, float2 y) {
+  return __fmul2_rn_impl(x, y);
+}
+__SM_100_RT_DECL__ float2 __fmul2_rz(float2 x, float2 y) {
+  return __fmul2_rz_impl(x, y);
+}
+__SM_100_RT_DECL__ float2 __fmul2_rd(float2 x, float2 y) {
+  return __fmul2_rd_impl(x, y);
+}
+__SM_100_RT_DECL__ float2 __fmul2_ru(float2 x, float2 y) {
+  return __fmul2_ru_impl(x, y);
+}
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 1000 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_100_RT_DECL__
+
+#endif /* !__SM_100_RT_HPP__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_100_RT_HPP__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_100_RT_HPP__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.h
new file mode 100644
index 0000000000000000000000000000000000000000..6046953afa8c5f71cf7058436de10397d6353e9e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright 2017-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+ //NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/sm_70_rt.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/sm_70_rt.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_H__
+#endif
+
+#if !defined(__SM_70_RT_H__)
+#define __SM_70_RT_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_70_RT_DECL__ __host__ __device__
+#elif defined(_NVHPC_CUDA)
+#define __SM_70_RT_DECL__ extern __device__ __cudart_builtin__
+#else /* !__CUDACC_RTC__ */
+#define __SM_70_RT_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) */
+#define __DEF_IF_HOST ;
+#endif /* !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA) */
+
+/* __DEF_IF_HOST ends each line: an empty body "{ }" when neither __CUDA_ARCH__ nor _NVHPC_CUDA is defined, else ";" */
+/******************************************************************************
+ *                                   match                                   *
+ ******************************************************************************/
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned value) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, int value) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long value) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long value) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long long value) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long long value) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, float value) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, double value) __DEF_IF_HOST
+/* __match_all_sync: same value-type overloads as above, each with an extra int *pred argument */
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned value, int *pred) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, int value, int *pred) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long value, int *pred) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long value, int *pred) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long long value, int *pred) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long long value, int *pred) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, float value, int *pred) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, double value, int *pred) __DEF_IF_HOST
+
+__SM_70_RT_DECL__ void __nanosleep(unsigned int ns) __DEF_IF_HOST
+
+__SM_70_RT_DECL__ unsigned short int atomicCAS(unsigned short int *address, unsigned short int compare, unsigned short int val) __DEF_IF_HOST
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_70_RT_DECL__
+
+#if (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA)
+#include "sm_70_rt.hpp"
+#endif /* (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA) */
+
+#endif /* !__SM_70_RT_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_H__
+#endif
+
+
+#undef EXCLUDE_FROM_RTC
\ No newline at end of file
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.hpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..322496587325a1387e4280a509455e3ccc7caa1b
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.hpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright 2017-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/sm_70_rt.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/sm_70_rt.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_HPP__
+#endif
+
+#if !defined(__SM_70_RT_HPP__)
+#define __SM_70_RT_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_70_RT_DECL__ __host__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_70_RT_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+/*******************************************************************************
+*                                                                              *
+*  Below are implementations of SM-7.0 builtin functions which are included as *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+
+// __match_any_sync: every overload forwards to __match32_any_sync or
+// __match64_any_sync; float/double values are first passed through
+// __float_as_uint / __double_as_longlong.
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned value) {
+  return __match32_any_sync(mask, value);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, int value) {
+  return __match32_any_sync(mask, value);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long value) {
+  return (sizeof(long) == sizeof(long long)) ?  // constant test: is long as wide as long long?
+    __match64_any_sync(mask, (unsigned long long)value):
+    __match32_any_sync(mask, (unsigned)value);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long value) {
+  return (sizeof(long) == sizeof(long long)) ?  // constant test: is long as wide as long long?
+    __match64_any_sync(mask, (unsigned long long)value):
+    __match32_any_sync(mask, (unsigned)value);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long long value) {
+  return __match64_any_sync(mask, value);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long long value) {
+  return __match64_any_sync(mask, value);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, float value) {
+  return __match32_any_sync(mask, __float_as_uint(value));
+}
+
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, double value) {
+  return __match64_any_sync(mask, __double_as_longlong(value));
+}
+
+// __match_all_sync: every overload forwards to __match32_all_sync or
+// __match64_all_sync with pred passed through unchanged; float/double values
+// are first passed through __float_as_uint / __double_as_longlong.
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned value, int *pred) {
+  return __match32_all_sync(mask, value, pred);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, int value, int *pred) {
+  return __match32_all_sync(mask, value, pred);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long value, int *pred) {
+  return (sizeof(long) == sizeof(long long)) ?  // constant test: is long as wide as long long?
+    __match64_all_sync(mask, (unsigned long long)value, pred):
+    __match32_all_sync(mask, (unsigned)value, pred);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long value, int *pred) {
+  return (sizeof(long) == sizeof(long long)) ?  // constant test: is long as wide as long long?
+    __match64_all_sync(mask, (unsigned long long)value, pred):
+    __match32_all_sync(mask, (unsigned)value, pred);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long long value, int *pred) {
+  return __match64_all_sync(mask, value, pred);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long long value, int *pred) {
+  return __match64_all_sync(mask, value, pred);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, float value, int *pred) {
+  return __match32_all_sync(mask, __float_as_uint(value), pred);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, double value, int *pred) {
+  return __match64_all_sync(mask, __double_as_longlong(value), pred);
+}
+
+__SM_70_RT_DECL__ void __nanosleep(unsigned int ns) {
+    asm volatile("nanosleep.u32 %0;" :: "r"(ns));  // emits PTX nanosleep.u32; ns is its only operand, bound via the "r" constraint
+}
+
+
+extern "C" __device__ __device_builtin__
+unsigned short __usAtomicCAS(unsigned short *, unsigned short, unsigned short);
+// unsigned short overload of atomicCAS: forwards (address, compare, val) to __usAtomicCAS.
+__SM_70_RT_DECL__ unsigned short int atomicCAS(unsigned short int *address, unsigned short int compare, unsigned short int val) {
+  return __usAtomicCAS(address, compare, val);
+}
+
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_70_RT_DECL__
+
+#endif /* !__SM_70_RT_HPP__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_HPP__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_HPP__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_80_rt.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_80_rt.h
new file mode 100644
index 0000000000000000000000000000000000000000..cc18290966875591b6a6efa1f8564eb76e5aa34b
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_80_rt.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright 2017-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/sm_80_rt.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/sm_80_rt.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_H__
+#endif
+
+#if !defined(__SM_80_RT_H__)
+#define __SM_80_RT_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_80_RT_DECL__ __host__ __device__
+#elif defined(_NVHPC_CUDA)
+#define __SM_80_RT_DECL__ extern __device__ __cudart_builtin__
+#else /* !__CUDACC_RTC__ */
+#define __SM_80_RT_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) */
+#define __DEF_IF_HOST ;
+#endif /* !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA) */
+
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+/******************************************************************************
+ *                                   reduce                                   *
+ ******************************************************************************/
+__SM_80_RT_DECL__ unsigned __reduce_add_sync(unsigned mask, unsigned value) __DEF_IF_HOST
+__SM_80_RT_DECL__ unsigned __reduce_min_sync(unsigned mask, unsigned value) __DEF_IF_HOST
+__SM_80_RT_DECL__ unsigned __reduce_max_sync(unsigned mask, unsigned value) __DEF_IF_HOST
+/* int overloads of the add/min/max declarations above */
+__SM_80_RT_DECL__ int __reduce_add_sync(unsigned mask, int value) __DEF_IF_HOST
+__SM_80_RT_DECL__ int __reduce_min_sync(unsigned mask, int value) __DEF_IF_HOST
+__SM_80_RT_DECL__ int __reduce_max_sync(unsigned mask, int value) __DEF_IF_HOST
+/* and/or/xor are declared for unsigned values only */
+__SM_80_RT_DECL__ unsigned __reduce_and_sync(unsigned mask, unsigned value) __DEF_IF_HOST
+__SM_80_RT_DECL__ unsigned __reduce_or_sync(unsigned mask, unsigned value) __DEF_IF_HOST
+__SM_80_RT_DECL__ unsigned __reduce_xor_sync(unsigned mask, unsigned value) __DEF_IF_HOST
+
+#undef EXCLUDE_FROM_RTC
+
+
+extern "C" {  // inline device wrappers; each declares its *_impl target at block scope, then calls it
+inline __device__ void *__nv_associate_access_property(const void *ptr, 
+                                                       unsigned long long property) {
+  extern __device__ void *__nv_associate_access_property_impl(const void *,
+                                                              unsigned long long);
+  return __nv_associate_access_property_impl(ptr, property);
+}
+// Forwards (dst, src, src_size) to __nv_memcpy_async_shared_global_4_impl.
+inline __device__  void __nv_memcpy_async_shared_global_4(void *dst, 
+                                                          const void *src, 
+                                                          unsigned src_size) {
+  extern __device__ void __nv_memcpy_async_shared_global_4_impl(void *, 
+                                                                const void *, 
+                                                                unsigned);
+  __nv_memcpy_async_shared_global_4_impl(dst, src, src_size);
+}
+// Forwards (dst, src, src_size) to __nv_memcpy_async_shared_global_8_impl.
+inline __device__  void __nv_memcpy_async_shared_global_8(void *dst, 
+                                                          const void *src, 
+                                                          unsigned src_size) {
+  extern __device__ void __nv_memcpy_async_shared_global_8_impl(void *, 
+                                                                const void *, 
+                                                                unsigned);
+  __nv_memcpy_async_shared_global_8_impl(dst, src, src_size);
+}
+// Forwards (dst, src, src_size) to __nv_memcpy_async_shared_global_16_impl.
+inline __device__  void __nv_memcpy_async_shared_global_16(void *dst, 
+                                                          const void *src, 
+                                                          unsigned src_size) {
+  extern __device__ void __nv_memcpy_async_shared_global_16_impl(void *, 
+                                                                const void *, 
+                                                                unsigned);
+  __nv_memcpy_async_shared_global_16_impl(dst, src, src_size);
+}
+
+}
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 800 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_80_RT_DECL__
+
+#if (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA)
+#include "sm_80_rt.hpp"
+#endif /* (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA) */
+
+#endif /* !__SM_80_RT_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_H__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_80_rt.hpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_80_rt.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..857bd44a3bb0d8480560047a85f9059bc370b52f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_80_rt.hpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2017-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/sm_80_rt.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/sm_80_rt.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_HPP__
+#endif
+
+#if !defined(__SM_80_RT_HPP__)
+#define __SM_80_RT_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_80_RT_DECL__ __host__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_80_RT_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+/*******************************************************************************
+*                                                                              *
+*  Below are implementations of SM-8.0 builtin functions which are included as *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+
+extern "C" { /* C-linkage prototypes of the *_impl device builtins that the __reduce_*_sync wrappers below forward to */
+  __device_builtin__ __device__ unsigned __reduce_add_sync_unsigned_impl(unsigned, unsigned);
+  __device_builtin__ __device__ unsigned __reduce_min_sync_unsigned_impl(unsigned, unsigned);
+  __device_builtin__ __device__ unsigned __reduce_max_sync_unsigned_impl(unsigned, unsigned);
+  __device_builtin__ __device__ int __reduce_add_sync_signed_impl(unsigned, int); /* signed variants: first argument stays unsigned, second and result are int */
+  __device_builtin__ __device__ int __reduce_min_sync_signed_impl(unsigned, int);
+  __device_builtin__ __device__ int __reduce_max_sync_signed_impl(unsigned, int);
+  __device_builtin__ __device__ unsigned __reduce_or_sync_unsigned_impl(unsigned, unsigned); /* bitwise variants are declared for unsigned only */
+  __device_builtin__ __device__ unsigned __reduce_and_sync_unsigned_impl(unsigned, unsigned);
+  __device_builtin__ __device__ unsigned __reduce_xor_sync_unsigned_impl(unsigned, unsigned);
+} /* extern "C" */
+
+__SM_80_RT_DECL__ unsigned __reduce_add_sync(unsigned mask, unsigned value) { /* unsigned overload: forwards (mask, value) to the unsigned add builtin */
+  return __reduce_add_sync_unsigned_impl(mask, value);
+}
+
+__SM_80_RT_DECL__ unsigned __reduce_min_sync(unsigned mask, unsigned value) { /* unsigned overload: forwards (mask, value) to the unsigned min builtin */
+  return __reduce_min_sync_unsigned_impl(mask, value);
+}
+
+__SM_80_RT_DECL__ unsigned __reduce_max_sync(unsigned mask, unsigned value) { /* unsigned overload: forwards (mask, value) to the unsigned max builtin */
+  return __reduce_max_sync_unsigned_impl(mask, value);
+}
+
+__SM_80_RT_DECL__ int __reduce_add_sync(unsigned mask, int value) { /* int overload: forwards (mask, value) to the signed add builtin */
+  return __reduce_add_sync_signed_impl(mask, value);
+}
+  
+__SM_80_RT_DECL__ int __reduce_min_sync(unsigned mask, int value) { /* int overload: forwards (mask, value) to the signed min builtin */
+  return __reduce_min_sync_signed_impl(mask, value);
+}
+
+__SM_80_RT_DECL__ int __reduce_max_sync(unsigned mask, int value) { /* int overload: forwards (mask, value) to the signed max builtin */
+  return __reduce_max_sync_signed_impl(mask, value);
+}
+
+__SM_80_RT_DECL__ unsigned __reduce_and_sync(unsigned mask, unsigned value) { /* forwards (mask, value) to the unsigned AND builtin; no int overload exists */
+  return __reduce_and_sync_unsigned_impl(mask, value);
+}
+
+__SM_80_RT_DECL__ unsigned __reduce_or_sync(unsigned mask, unsigned value) { /* forwards (mask, value) to the unsigned OR builtin; no int overload exists */
+  return __reduce_or_sync_unsigned_impl(mask, value);
+}
+
+__SM_80_RT_DECL__ unsigned __reduce_xor_sync(unsigned mask, unsigned value) { /* forwards (mask, value) to the unsigned XOR builtin; no int overload exists */
+  return __reduce_xor_sync_unsigned_impl(mask, value);
+}
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 800 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_80_RT_DECL__
+
+#endif /* !__SM_80_RT_HPP__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_HPP__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_HPP__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e250634fe76651c2a15b5b492378efec1d3e0c5
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.h
@@ -0,0 +1,282 @@
+/*
+ * Copyright 2022-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/sm_90_rt.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/sm_90_rt.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_H__
+#endif
+
+#if !defined(__SM_90_RT_H__)
+#define __SM_90_RT_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_90_RT_DECL__ __host__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_90_RT_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { } /* no __CUDA_ARCH__ and no _NVHPC_CUDA: the prototypes that end in this macro get an empty body */
+#else  /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
+#define __DEF_IF_HOST ; /* otherwise they stay pure declarations */
+#endif /* __CUDA_ARCH__ || _NVHPC_CUDA */
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+__SM_90_RT_DECL__ unsigned __isCtaShared(const void *ptr) __DEF_IF_HOST /* every prototype in this run ends in __DEF_IF_HOST, i.e. "{ }" or ";" */
+__SM_90_RT_DECL__ unsigned __isClusterShared(const void *ptr) __DEF_IF_HOST
+__SM_90_RT_DECL__ void *__cluster_map_shared_rank(const void *ptr, unsigned target_block_rank)  __DEF_IF_HOST
+__SM_90_RT_DECL__ unsigned __cluster_query_shared_rank(const void *ptr) __DEF_IF_HOST
+__SM_90_RT_DECL__ uint2 __cluster_map_shared_multicast(const void *ptr, unsigned cluster_cta_mask) __DEF_IF_HOST
+__SM_90_RT_DECL__ unsigned __clusterDimIsSpecified() __DEF_IF_HOST
+__SM_90_RT_DECL__ dim3 __clusterDim() __DEF_IF_HOST /* the four dim3-returning queries */
+__SM_90_RT_DECL__ dim3 __clusterRelativeBlockIdx() __DEF_IF_HOST
+__SM_90_RT_DECL__ dim3 __clusterGridDimInClusters() __DEF_IF_HOST
+__SM_90_RT_DECL__ dim3 __clusterIdx() __DEF_IF_HOST
+__SM_90_RT_DECL__ unsigned __clusterRelativeBlockRank() __DEF_IF_HOST
+__SM_90_RT_DECL__ unsigned __clusterSizeInBlocks() __DEF_IF_HOST
+__SM_90_RT_DECL__ void __cluster_barrier_arrive() __DEF_IF_HOST /* barrier and fence entry points: no arguments, no result */
+__SM_90_RT_DECL__ void __cluster_barrier_arrive_relaxed() __DEF_IF_HOST
+__SM_90_RT_DECL__ void __cluster_barrier_wait() __DEF_IF_HOST
+__SM_90_RT_DECL__ void __threadfence_cluster() __DEF_IF_HOST
+
+__SM_90_RT_DECL__ float2 atomicAdd(float2 *__address, float2 val) __DEF_IF_HOST /* float2 overloads: unsuffixed, _block and _system variants */
+__SM_90_RT_DECL__ float2 atomicAdd_block(float2 *__address, float2 val) __DEF_IF_HOST
+__SM_90_RT_DECL__ float2 atomicAdd_system(float2 *__address, float2 val) __DEF_IF_HOST
+__SM_90_RT_DECL__ float4 atomicAdd(float4 *__address, float4 val) __DEF_IF_HOST /* float4 overloads: the same three variants */
+__SM_90_RT_DECL__ float4 atomicAdd_block(float4 *__address, float4 val) __DEF_IF_HOST
+__SM_90_RT_DECL__ float4 atomicAdd_system(float4 *__address, float4 val) __DEF_IF_HOST
+
+#undef EXCLUDE_FROM_RTC
+
+//Note: below atomic functions are templates, so cannot be represented in NVRTC
+//builtins representation, so they have to be parsed on every NVRTC compilation.
+//(notice 'EXCLUDE_FROM_RTC' ends above)
+
+
+#ifndef __NV_DISABLE_128_ATOMICS
+// lgen definitions for 128b atomics
+extern "C" { /* C-linkage builtins behind the templated atomicCAS / atomicExch wrappers in this header; all operands are void pointers */
+  __device__ __device_builtin__ void __u128AtomicCAS(void *, void *, void *, void *); /* called as (address, &compare, &val, &result) */
+  __device__ __device_builtin__ void __u128AtomicCAS_block(void *, void *, void *, void *);
+  __device__ __device_builtin__ void __u128AtomicCAS_system(void *, void *, void *, void *);
+  __device__ __device_builtin__ void __u128AtomicExch(void *, void *, void *); /* called as (address, &val, &result) */
+  __device__ __device_builtin__ void __u128AtomicExch_block(void *, void *, void *);
+  __device__ __device_builtin__ void __u128AtomicExch_system(void *, void *, void *);
+} /* extern "C" */
+
+// macro to get the address of an object as void *, working around types that overload the unary "&" operator
+#define __NV_ATOMIC_ADDRESSOF(__val) \
+        (void *)(&(const_cast<char &>(reinterpret_cast<const volatile char &>(__val))))
+
+// enable_if: only the <true, _T> specialization defines __type, so naming __type with a false condition is a substitution failure
+template<bool __b, typename _T>
+struct __nv_atomic_enable_if { };
+
+template<typename _T>
+struct __nv_atomic_enable_if<true, _T> { typedef _T __type; };
+
+// alignof: spelled __alignof__ when compiling under NVRTC and __alignof otherwise
+#if defined(__CUDACC_RTC__)
+#define __NV_ATOMIC_ALIGNOF __alignof__
+#else
+#define __NV_ATOMIC_ALIGNOF __alignof
+#endif
+
+// trivially copyable: __val comes from the best intrinsic the host compiler offers (see the per-branch notes)
+template <typename _T>
+struct __nv_atomic_triv_cp_helper {
+#if defined(__GNUC__)
+#if  (__GNUC__ < 4) || (__GNUC__ == 4 && __GNUC_MINOR__ < 3)
+  static const bool __val = true; // GCC older than 4.3: no check is made, every type is accepted
+#elif (__GNUC__ < 5)
+  static const bool __val = __has_trivial_copy(_T); // GCC 4.3 through 4.x
+#else
+  static const bool __val = __is_trivially_copyable(_T); // GCC 5 and newer
+#endif
+#else
+  static const bool __val = __is_trivially_copyable(_T); // compilers that do not define __GNUC__
+#endif
+};
+#define __NV_ATOMIC_TRIVIALLY_COPYABLE(_T) \
+        __nv_atomic_triv_cp_helper<_T>::__val
+
+// return type: plain _T from C++20 on; before that an enable_if on size == 16, alignment >= 16 and trivial copyability
+#if __cplusplus >= 202002L // C++20 or greater
+#define __NV_ATOMIC_RET_TYPE(_T) _T
+#else
+#define __NV_ATOMIC_RET_TYPE(_T) typename \
+  __nv_atomic_enable_if<sizeof(_T) == 16 && \
+  __NV_ATOMIC_ALIGNOF(_T) >= 16 && \
+  __NV_ATOMIC_TRIVIALLY_COPYABLE(_T), _T>::__type
+#endif
+
+// requires: from C++20 on, a requires-clause on size == 16, alignment >= 16 and trivial copyability; empty before C++20
+#if __cplusplus >= 202002L // C++20 or greater
+#define __NV_ATOMIC_REQUIRES(_T) \
+  requires(sizeof(_T) == 16 && \
+  __NV_ATOMIC_ALIGNOF(_T) >= 16 && \
+  __NV_ATOMIC_TRIVIALLY_COPYABLE(_T))
+#else
+#define __NV_ATOMIC_REQUIRES(_T)
+#endif
+
+// temp value and return value: for C++11+/MSVC the result lives in a union whose empty constructor leaves __ret uninitialized; otherwise it is a plain local
+#if __cplusplus >= 201103L || defined(_MSC_VER) // C++11 or greater, or MSC
+#define __NV_ATOMIC_TEMP(_T) union _U \
+  {_T __ret; __device__ __inline__ _U() {}}; _U __u
+#define __NV_ATOMIC_RET(_T) __u.__ret
+#else
+#define __NV_ATOMIC_TEMP(_T) _T __ret
+#define __NV_ATOMIC_RET(_T) __ret
+#endif
+
+// templated 128-bit atomics
+template <typename _T> // compare-and-swap wrapper for 16-byte _T; forwards to the __u128AtomicCAS builtin
+__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
+atomicCAS(_T *__address, _T __compare, _T __val) __NV_ATOMIC_REQUIRES(_T) {
+  __NV_ATOMIC_TEMP(_T); // declares the storage that receives the builtin's result
+  __u128AtomicCAS((void *)(__address), // every operand is handed over by address as void *
+                  __NV_ATOMIC_ADDRESSOF(__compare),
+                  __NV_ATOMIC_ADDRESSOF(__val),
+                  __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
+  return __NV_ATOMIC_RET(_T); // whatever the builtin stored through its last pointer (presumably the prior value -- confirm)
+}
+
+template <typename _T> // as atomicCAS, but forwards to __u128AtomicCAS_block (NOTE(review): presumably block scope -- confirm)
+__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
+atomicCAS_block(_T *__address, _T __compare, _T __val) __NV_ATOMIC_REQUIRES(_T) {
+  __NV_ATOMIC_TEMP(_T); // declares the storage that receives the builtin's result
+  __u128AtomicCAS_block((void *)(__address), // every operand is handed over by address as void *
+                  __NV_ATOMIC_ADDRESSOF(__compare),
+                  __NV_ATOMIC_ADDRESSOF(__val),
+                  __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
+  return __NV_ATOMIC_RET(_T); // whatever the builtin stored through its last pointer
+}
+
+template <typename _T> // as atomicCAS, but forwards to __u128AtomicCAS_system (NOTE(review): presumably system scope -- confirm)
+__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
+atomicCAS_system(_T *__address, _T __compare, _T __val) __NV_ATOMIC_REQUIRES(_T) {
+  __NV_ATOMIC_TEMP(_T); // declares the storage that receives the builtin's result
+  __u128AtomicCAS_system((void *)(__address), // every operand is handed over by address as void *
+                  __NV_ATOMIC_ADDRESSOF(__compare),
+                  __NV_ATOMIC_ADDRESSOF(__val),
+                  __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
+  return __NV_ATOMIC_RET(_T); // whatever the builtin stored through its last pointer
+}
+
+template <typename _T> // exchange wrapper for 16-byte _T; forwards to the __u128AtomicExch builtin
+__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
+atomicExch(_T *__address, _T __val) __NV_ATOMIC_REQUIRES(_T) {
+  __NV_ATOMIC_TEMP(_T); // declares the storage that receives the builtin's result
+  __u128AtomicExch((void *)(__address), // every operand is handed over by address as void *
+                  __NV_ATOMIC_ADDRESSOF(__val),
+                  __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
+  return __NV_ATOMIC_RET(_T); // whatever the builtin stored through its last pointer (presumably the prior value -- confirm)
+}
+
+template <typename _T> // as atomicExch, but forwards to __u128AtomicExch_block (NOTE(review): presumably block scope -- confirm)
+__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
+atomicExch_block(_T *__address, _T __val) __NV_ATOMIC_REQUIRES(_T) {
+  __NV_ATOMIC_TEMP(_T); // declares the storage that receives the builtin's result
+  __u128AtomicExch_block((void *)(__address), // every operand is handed over by address as void *
+                  __NV_ATOMIC_ADDRESSOF(__val),
+                  __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
+  return __NV_ATOMIC_RET(_T); // whatever the builtin stored through its last pointer
+}
+
+template <typename _T> // as atomicExch, but forwards to __u128AtomicExch_system (NOTE(review): presumably system scope -- confirm)
+__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
+atomicExch_system(_T *__address, _T __val) __NV_ATOMIC_REQUIRES(_T) {
+  __NV_ATOMIC_TEMP(_T); // declares the storage that receives the builtin's result
+  __u128AtomicExch_system((void *)(__address), // every operand is handed over by address as void *
+                  __NV_ATOMIC_ADDRESSOF(__val),
+                  __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
+  return __NV_ATOMIC_RET(_T); // whatever the builtin stored through its last pointer
+}
+#endif /* !__NV_DISABLE_128_ATOMICS */
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 900 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_90_RT_DECL__
+
+#if (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA)
+#include "sm_90_rt.hpp"
+#endif /* (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA) */
+
+#endif /* !__SM_90_RT_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_H__
+#endif
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.hpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4e61ac78b996fa03cadf60208bbd58f2e781f3ec
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.hpp
@@ -0,0 +1,248 @@
+/*
+ * Copyright 2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/sm_90_rt.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/sm_90_rt.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_HPP__
+#endif
+
+#if !defined(__SM_90_RT_HPP__)
+#define __SM_90_RT_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_90_RT_DECL__ __host__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_90_RT_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+/*******************************************************************************
+*                                                                              *
+*  Below are implementations of SM-9.0 builtin functions which are included as *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+extern "C" { /* C-linkage prototypes of the routines that the wrappers in this header forward to */
+  __device__ unsigned  __nv_isClusterShared_impl(const void *);
+  __device__ void * __nv_cluster_map_shared_rank_impl(const void *, unsigned);
+  __device__ unsigned __nv_cluster_query_shared_rank_impl(const void *);
+  __device__ unsigned __nv_clusterDimIsSpecifed_impl(); /* NOTE(review): "Specifed" is spelled the same at the call site; presumably matches the compiler's symbol -- do not rename without confirming */
+  __device__ void __nv_clusterDim_impl(unsigned *, unsigned *, unsigned *); /* the three-pointer routines receive the addresses of the callers' x, y, z locals */
+  __device__ void __nv_clusterRelativeBlockIdx_impl(unsigned *, 
+                                                    unsigned *, unsigned *);
+  __device__ void __nv_clusterGridDimInClusters_impl(unsigned *, 
+                                                     unsigned *, unsigned *);
+  __device__ void __nv_clusterIdx_impl(unsigned *, unsigned *, unsigned *);
+  __device__ unsigned __nv_clusterRelativeBlockRank_impl();
+  __device__ unsigned __nv_clusterSizeInBlocks_impl();
+  __device__ void __nv_cluster_barrier_arrive_impl();
+  __device__ void __nv_cluster_barrier_arrive_relaxed_impl();
+  __device__ void __nv_cluster_barrier_wait_impl();
+  __device__ void __nv_threadfence_cluster_impl();
+
+  __device__ __device_builtin__ float2 __f2AtomicAdd(float2 *, float2); /* unlike the routines above, the atomic-add ones also carry __device_builtin__ */
+  __device__ __device_builtin__ float2 __f2AtomicAdd_block(float2 *, float2);
+  __device__ __device_builtin__ float2 __f2AtomicAdd_system(float2 *, float2);
+  __device__ __device_builtin__ float4 __f4AtomicAdd(float4 *, float4);
+  __device__ __device_builtin__ float4 __f4AtomicAdd_block(float4 *, float4);
+  __device__ __device_builtin__ float4 __f4AtomicAdd_system(float4 *, float4);
+} // extern "C"
+
+__SM_90_RT_DECL__  unsigned __isCtaShared(const void *ptr) /* alias: returns __isShared(ptr); __isShared is declared outside this chunk */
+{
+  return __isShared(ptr);
+}
+
+__SM_90_RT_DECL__ unsigned __isClusterShared(const void *ptr) /* forwards ptr to __nv_isClusterShared_impl and returns its result */
+{
+  return __nv_isClusterShared_impl(ptr);
+}
+
+__SM_90_RT_DECL__ void *__cluster_map_shared_rank(const void *ptr, /* forwards both arguments to __nv_cluster_map_shared_rank_impl */
+                                                  unsigned target_block_rank)
+{
+  return __nv_cluster_map_shared_rank_impl(ptr, target_block_rank); /* note: takes const void * but hands back a non-const void * */
+}
+
+__SM_90_RT_DECL__ unsigned __cluster_query_shared_rank(const void *ptr) /* forwards ptr to __nv_cluster_query_shared_rank_impl and returns its result */
+{
+  return __nv_cluster_query_shared_rank_impl(ptr);
+}
+
+__SM_90_RT_DECL__ uint2 __cluster_map_shared_multicast(const void *ptr, /* the only cluster wrapper here with no *_impl callee: it builds its result inline */
+                                                 unsigned int cluster_cta_mask)
+{
+  return make_uint2((unsigned)__cvta_generic_to_shared(ptr), cluster_cta_mask); /* first component: converted address cast to unsigned; second: the mask unchanged */
+}
+
+__SM_90_RT_DECL__ unsigned __clusterDimIsSpecified() /* returns the result of the (differently spelled) __nv_clusterDimIsSpecifed_impl */
+{
+  return __nv_clusterDimIsSpecifed_impl();
+}  
+
+__SM_90_RT_DECL__ dim3 __clusterDim() /* repackages the three values obtained from __nv_clusterDim_impl as a dim3 */
+{
+  unsigned x, y, z; /* out-parameters for the call below; not initialized here */
+  __nv_clusterDim_impl(&x, &y, &z);
+  return dim3(x,y,z);
+}
+
+__SM_90_RT_DECL__ dim3 __clusterRelativeBlockIdx() /* repackages the three values obtained from __nv_clusterRelativeBlockIdx_impl as a dim3 */
+{
+  unsigned x, y, z; /* out-parameters for the call below; not initialized here */
+  __nv_clusterRelativeBlockIdx_impl(&x, &y, &z);
+  return dim3(x,y,z);
+}
+
+__SM_90_RT_DECL__ dim3 __clusterGridDimInClusters() /* repackages the three values obtained from __nv_clusterGridDimInClusters_impl as a dim3 */
+{
+  unsigned x, y, z; /* out-parameters for the call below; not initialized here */
+  __nv_clusterGridDimInClusters_impl(&x, &y, &z);
+  return dim3(x,y,z);
+}
+
+__SM_90_RT_DECL__ dim3 __clusterIdx() /* repackages the three values obtained from __nv_clusterIdx_impl as a dim3 */
+{
+  unsigned x, y, z; /* out-parameters for the call below; not initialized here */
+  __nv_clusterIdx_impl(&x, &y, &z);
+  return dim3(x,y,z);
+}
+
+__SM_90_RT_DECL__ unsigned __clusterRelativeBlockRank() /* returns the result of __nv_clusterRelativeBlockRank_impl */
+{
+  return __nv_clusterRelativeBlockRank_impl();
+}
+
+__SM_90_RT_DECL__ unsigned __clusterSizeInBlocks() /* returns the result of __nv_clusterSizeInBlocks_impl */
+{
+  return __nv_clusterSizeInBlocks_impl();
+}
+
+__SM_90_RT_DECL__ void __cluster_barrier_arrive() /* calls __nv_cluster_barrier_arrive_impl; no arguments, no result */
+{
+  __nv_cluster_barrier_arrive_impl();
+}
+
+__SM_90_RT_DECL__ void __cluster_barrier_arrive_relaxed() /* calls __nv_cluster_barrier_arrive_relaxed_impl; NOTE(review): "relaxed" presumably refers to memory ordering -- confirm */
+{
+  __nv_cluster_barrier_arrive_relaxed_impl();
+}
+
+__SM_90_RT_DECL__ void __cluster_barrier_wait() /* calls __nv_cluster_barrier_wait_impl; no arguments, no result */
+{
+  __nv_cluster_barrier_wait_impl();
+}
+
+__SM_90_RT_DECL__ void __threadfence_cluster() /* calls __nv_threadfence_cluster_impl; no arguments, no result */
+{
+  __nv_threadfence_cluster_impl();
+}
+
+
+/* Define __PTR for atomicAdd prototypes below, undef after done. NOTE(review): the atomicAdd wrappers that follow never reference __PTR and no #undef is visible before the end of this header, so the macro appears to leak -- confirm against upstream */
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __PTR   "l"
+#else
+#define __PTR   "r"
+#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
+
+__SM_90_RT_DECL__ float2 atomicAdd(float2 *address, float2 val) { /* float2 overload: forwards to the __f2AtomicAdd builtin and returns its result */
+  return __f2AtomicAdd(address, val);
+}
+
+__SM_90_RT_DECL__ float2 atomicAdd_block(float2 *address, float2 val) { /* float2 overload: forwards to the __f2AtomicAdd_block builtin */
+  return __f2AtomicAdd_block(address, val);
+}
+
+__SM_90_RT_DECL__ float2 atomicAdd_system(float2 *address, float2 val) { /* float2 overload: forwards to the __f2AtomicAdd_system builtin */
+  return __f2AtomicAdd_system(address, val);
+}
+
+__SM_90_RT_DECL__ float4 atomicAdd(float4 *address, float4 val) { /* float4 overload: forwards to the __f4AtomicAdd builtin and returns its result */
+  return __f4AtomicAdd(address, val);
+}
+
+__SM_90_RT_DECL__ float4 atomicAdd_block(float4 *address, float4 val) { /* float4 overload: forwards to the __f4AtomicAdd_block builtin */
+  return __f4AtomicAdd_block(address, val);
+}
+
+__SM_90_RT_DECL__ float4 atomicAdd_system(float4 *address, float4 val) { /* float4 overload: forwards to the __f4AtomicAdd_system builtin */
+  return __f4AtomicAdd_system(address, val);
+}
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 900 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_90_RT_DECL__
+
+#endif /* !__SM_90_RT_HPP__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_HPP__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_HPP__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/storage_class.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/storage_class.h
new file mode 100644
index 0000000000000000000000000000000000000000..1fb19bd46ebde4a53dfad866050fad9fb0cbd222
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/storage_class.h
@@ -0,0 +1,142 @@
+/*
+ * NVIDIA_COPYRIGHT_BEGIN
+ *
+ * Copyright (c) 2008-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ * NVIDIA_COPYRIGHT_END
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/storage_class.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/storage_class.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_STORAGE_CLASS_H__
+#endif
+
+#if !defined(__STORAGE_CLASS_H__)
+#define __STORAGE_CLASS_H__
+
+#if !defined(__var_used__)
+
+#define __var_used__ /* expands to nothing unless the includer defined it beforehand */
+
+#endif /* __var_used__ */
+
+#if !defined(__loc_sc__)
+
+#define __loc_sc__(loc, size, sc) \
+        __storage##_##sc##size##loc loc /* pastes the macro name __storage_<sc><size><loc>, then emits loc itself */
+
+#endif /* !__loc_sc__ */
+
+#if !defined(__storage___device__) /* __device__ family: one overridable macro per storage class (none, extern, auto, static) */
+#define __storage___device__ static __var_used__
+#endif /* __storage___device__ */
+
+#if !defined(__storage_extern__device__)
+#define __storage_extern__device__ static __var_used__
+#endif /* __storage_extern__device__ */
+
+#if !defined(__storage_auto__device__)
+#define __storage_auto__device__ @@@ COMPILER @@@ ERROR @@@ /* not valid C/C++ tokens: expanding this combination fails to compile */
+#endif /* __storage_auto__device__ */
+
+#if !defined(__storage_static__device__)
+#define __storage_static__device__ static __var_used__
+#endif /* __storage_static__device__ */
+
+#if !defined(__storage___constant__) /* __constant__ family: one overridable macro per storage class (none, extern, auto, static) */
+#define __storage___constant__ static __var_used__
+#endif /* __storage___constant__ */
+
+#if !defined(__storage_extern__constant__)
+#define __storage_extern__constant__ static __var_used__
+#endif /* __storage_extern__constant__ */
+
+#if !defined(__storage_auto__constant__)
+#define __storage_auto__constant__ @@@ COMPILER @@@ ERROR @@@ /* not valid C/C++ tokens: expanding this combination fails to compile */
+#endif /* __storage_auto__constant__ */
+
+#if !defined(__storage_static__constant__)
+#define __storage_static__constant__ static __var_used__
+#endif /* __storage_static__constant__ */
+
+#if !defined(__storage___shared__) /* __shared__ family: four sized variants followed by four "_unsized" variants */
+#define __storage___shared__ static __var_used__
+#endif /* __storage___shared__ */
+
+#if !defined(__storage_extern__shared__)
+#define __storage_extern__shared__ static __var_used__
+#endif /* __storage_extern__shared__ */
+
+#if !defined(__storage_auto__shared__)
+#define __storage_auto__shared__ static /* the only "auto" entry in this file that is not an error; it also omits __var_used__ */
+#endif /* __storage_auto__shared__ */
+
+#if !defined(__storage_static__shared__)
+#define __storage_static__shared__ static __var_used__
+#endif /* __storage_static__shared__ */
+
+#if !defined(__storage__unsized__shared__)
+#define __storage__unsized__shared__ @@@ COMPILER @@@ ERROR @@@ /* not valid C/C++ tokens: expanding this combination fails to compile */
+#endif /* __storage__unsized__shared__ */
+
+#if !defined(__storage_extern_unsized__shared__)
+#define __storage_extern_unsized__shared__ static __var_used__ /* the only unsized combination that is not an error */
+#endif /* __storage_extern_unsized__shared__ */
+
+#if !defined(__storage_auto_unsized__shared__)
+#define __storage_auto_unsized__shared__ @@@ COMPILER @@@ ERROR @@@
+#endif /* __storage_auto_unsized__shared__ */
+
+#if !defined(__storage_static_unsized__shared__)
+#define __storage_static_unsized__shared__ @@@ COMPILER @@@ ERROR @@@
+#endif /* __storage_static_unsized__shared__ */
+
+#if !defined(__storage___text__) /* __text__ family: one overridable macro per storage class (none, extern, auto, static) */
+#define __storage___text__ static __var_used__
+#endif /* __storage___text__ */
+
+#if !defined(__storage_extern__text__)
+#define __storage_extern__text__ static __var_used__
+#endif /* __storage_extern__text__ */
+
+#if !defined(__storage_auto__text__)
+#define __storage_auto__text__ @@@ COMPILER @@@ ERROR @@@ /* not valid C/C++ tokens: expanding this combination fails to compile */
+#endif /* __storage_auto__text__ */
+
+#if !defined(__storage_static__text__)
+#define __storage_static__text__ static __var_used__
+#endif /* __storage_static__text__ */
+
+#if !defined(__storage___surf__) /* __surf__ family: one overridable macro per storage class (none, extern, auto, static) */
+#define __storage___surf__ static __var_used__
+#endif /* __storage___surf__ */
+
+#if !defined(__storage_extern__surf__)
+#define __storage_extern__surf__ static __var_used__
+#endif /* __storage_extern__surf__ */
+
+#if !defined(__storage_auto__surf__)
+#define __storage_auto__surf__ @@@ COMPILER @@@ ERROR @@@ /* not valid C/C++ tokens: expanding this combination fails to compile */
+#endif /* __storage_auto__surf__ */
+
+#if !defined(__storage_static__surf__)
+#define __storage_static__surf__ static __var_used__
+#endif /* __storage_static__surf__ */
+
+#endif /* !__STORAGE_CLASS_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_STORAGE_CLASS_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_STORAGE_CLASS_H__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuComplex.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuComplex.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b167111b0b387a5279da6749d946560e1c42c1b
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuComplex.h
@@ -0,0 +1,348 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(CU_COMPLEX_H_)
+#define CU_COMPLEX_H_
+/* This header defines only static functions: unless __CUDACC_RTC__ is defined, silence -Wunused-function on Clang and on non-PGI GCC >= 4.2. */
+#ifndef __CUDACC_RTC__
+#ifdef __GNUC__
+#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)))
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif /* Clang, or GCC >= 4.2 that is not PGI */
+#endif /* __GNUC__ */
+#endif /* !__CUDACC_RTC__ */
+
+/* A C header included from C++ code normally has to be wrapped in extern "C".
+ * However, the standard QNX headers already contain their own #ifdef'd
+ * extern "C" blocks for C++ compilation, and extern "C" cannot be nested.
+ * The header is therefore included outside of the extern "C" block.
+ */
+
+#if !defined(__CUDACC__)
+#include <math.h>       /* import fabsf, sqrt */
+#endif /* !defined(__CUDACC__) */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+#include "vector_types.h"
+
+typedef float2 cuFloatComplex;   /* .x holds the real part, .y the imaginary part */
+
+/* Return the real part of x (its .x component). */
+__host__ __device__
+static __inline__ float cuCrealf (cuFloatComplex x)
+{ return x.x; }
+
+/* Return the imaginary part of x (its .y component). */
+__host__ __device__
+static __inline__ float cuCimagf (cuFloatComplex x)
+{ return x.y; }
+
+/* Build a single-precision complex value with real part r and imaginary part i. */
+__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex (float r, float i)
+{
+    cuFloatComplex value;
+    value.x = r;    /* .x holds the real part */
+    value.y = i;    /* .y holds the imaginary part */
+    return value;
+}
+
+/* Complex conjugate of x: the real part is kept, the imaginary part is
+ * negated. */
+__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)
+{ return make_cuFloatComplex (x.x, -x.y); }
+/* Sum x + y: the real and imaginary parts are added independently. */
+__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x,
+                                                              cuFloatComplex y)
+{
+    return make_cuFloatComplex (x.x + y.x, x.y + y.y);
+}
+
+/* Difference x - y: the real and imaginary parts are subtracted independently. */
+__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x,
+                                                              cuFloatComplex y)
+{
+    return make_cuFloatComplex (x.x - y.x, x.y - y.y);
+}
+
+/* Product x * y of two single-precision complex values.
+ *
+ * No scaling is applied, so an intermediate product or sum may overflow
+ * even though the final result would be in range. The guard is omitted on
+ * purpose: other implementations skip it as well (presumably for speed),
+ * and this one does the same to stay competitive.
+ */
+__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,
+                                                              cuFloatComplex y)
+{
+    /* (a + bi) * (c + di) = (ac - bd) + (ad + bc)i */
+    return make_cuFloatComplex (
+        (x.x * y.x) - (x.y * y.y),    /* real part */
+        (x.x * y.y) + (x.y * y.x));   /* imaginary part */
+}
+
+/* Quotient x / y of two single-precision complex values.
+ *
+ * Both operands are first scaled by 1 / (|Re y| + |Im y|) to guard against
+ * intermediate underflow and overflow; the scale factor cancels in the ratio.
+ */
+__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,
+                                                              cuFloatComplex y)
+{
+    const float scale     = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));
+    const float inv_scale = 1.0f / scale;
+    const float xr = cuCrealf(x) * inv_scale;
+    const float xi = cuCimagf(x) * inv_scale;
+    const float yr = cuCrealf(y) * inv_scale;
+    const float yi = cuCimagf(y) * inv_scale;
+    /* reciprocal of the squared magnitude of the scaled divisor */
+    const float inv_norm = 1.0f / ((yr * yr) + (yi * yi));
+
+    /* x * conj(y) / |y|^2, evaluated on the scaled components */
+    return make_cuFloatComplex (((xr * yr) + (xi * yi)) * inv_norm,
+                                ((xi * yr) - (xr * yi)) * inv_norm);
+}
+
+/*
+ * Magnitude |x| of a single-precision complex value.
+ *
+ * hypotf() would be the natural choice but is not available on all
+ * platforms. Instead the smaller component is divided by the larger one
+ * before squaring, which guards against intermediate underflow and overflow
+ * (otherwise half the exponent range would be lost). The result may be
+ * inaccurate if sqrt and division are not IEEE compliant.
+ */
+__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
+{
+    const float re_mag = fabsf(cuCrealf(x));
+    const float im_mag = fabsf(cuCimagf(x));
+    /* Order the two magnitudes; when the comparison is false (ties, or an
+     * unordered compare) im_mag is treated as the larger one. */
+    const float larger  = (re_mag > im_mag) ? re_mag : im_mag;
+    const float smaller = (re_mag > im_mag) ? im_mag : re_mag;
+    const float ratio   = smaller / larger;
+    const float sum_sq  = 1.0f + ratio * ratio;
+    float magnitude     = larger * sqrtf(sum_sq);
+
+    /* The scaled formula is unusable when the larger magnitude is zero or
+     * when either magnitude exceeds 3.402823466e38f; use the plain sum of
+     * the magnitudes in those cases. */
+    if ((larger == 0.0f) || (larger > 3.402823466e38f) ||
+        (smaller > 3.402823466e38f)) {
+        magnitude = larger + smaller;
+    }
+    return magnitude;
+}
+
+/* Double precision */
+typedef double2 cuDoubleComplex;   /* .x holds the real part, .y the imaginary part */
+
+/* Return the real part of x (its .x component). */
+__host__ __device__
+static __inline__ double cuCreal (cuDoubleComplex x)
+{ return x.x; }
+
+/* Return the imaginary part of x (its .y component). */
+__host__ __device__
+static __inline__ double cuCimag (cuDoubleComplex x)
+{ return x.y; }
+
+/* Build a double-precision complex value with real part r and imaginary part i. */
+__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex (double r, double i)
+{
+    cuDoubleComplex value;
+    value.x = r;    /* .x holds the real part */
+    value.y = i;    /* .y holds the imaginary part */
+    return value;
+}
+
+/* Complex conjugate of x: the real part is kept, the imaginary part is
+ * negated. */
+__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x)
+{ return make_cuDoubleComplex (x.x, -x.y); }
+
+/* Sum x + y: the real and imaginary parts are added independently. */
+__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x,
+                                                             cuDoubleComplex y)
+{
+    return make_cuDoubleComplex (x.x + y.x, x.y + y.y);
+}
+
+/* Difference x - y: the real and imaginary parts are subtracted independently. */
+__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x,
+                                                             cuDoubleComplex y)
+{
+    return make_cuDoubleComplex (x.x - y.x, x.y - y.y);
+}
+
+/* Product x * y of two double-precision complex values.
+ *
+ * No scaling is applied, so an intermediate product or sum may overflow
+ * even though the final result would be in range. The guard is omitted on
+ * purpose: other implementations skip it as well (presumably for speed),
+ * and this one does the same to stay competitive.
+ */
+__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,
+                                                             cuDoubleComplex y)
+{
+    /* (a + bi) * (c + di) = (ac - bd) + (ad + bc)i */
+    return make_cuDoubleComplex (
+        (x.x * y.x) - (x.y * y.y),    /* real part */
+        (x.x * y.y) + (x.y * y.x));   /* imaginary part */
+}
+
+/* Quotient x / y of two double-precision complex values.
+ *
+ * Both operands are first scaled by 1 / (|Re y| + |Im y|) to guard against
+ * intermediate underflow and overflow; the scale factor cancels in the ratio.
+ */
+__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,
+                                                             cuDoubleComplex y)
+{
+    const double scale     = (fabs(cuCreal(y))) + (fabs(cuCimag(y)));
+    const double inv_scale = 1.0 / scale;
+    const double xr = cuCreal(x) * inv_scale;
+    const double xi = cuCimag(x) * inv_scale;
+    const double yr = cuCreal(y) * inv_scale;
+    const double yi = cuCimag(y) * inv_scale;
+    /* reciprocal of the squared magnitude of the scaled divisor */
+    const double inv_norm = 1.0 / ((yr * yr) + (yi * yi));
+
+    /* x * conj(y) / |y|^2, evaluated on the scaled components */
+    return make_cuDoubleComplex (((xr * yr) + (xi * yi)) * inv_norm,
+                                 ((xi * yr) - (xr * yi)) * inv_norm);
+}
+
+/* Magnitude |x| of a double-precision complex value.
+ *
+ * The smaller component is divided by the larger one before squaring, which
+ * guards against intermediate underflow and overflow (otherwise half the
+ * exponent range would be lost). The result may be inaccurate if sqrt and
+ * division are not IEEE compliant.
+ */
+__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)
+{
+    const double re_mag = fabs(cuCreal(x));
+    const double im_mag = fabs(cuCimag(x));
+    /* Order the two magnitudes; when the comparison is false (ties, or an
+     * unordered compare) im_mag is treated as the larger one. */
+    const double larger  = (re_mag > im_mag) ? re_mag : im_mag;
+    const double smaller = (re_mag > im_mag) ? im_mag : re_mag;
+    const double ratio   = smaller / larger;
+    const double sum_sq  = 1.0 + ratio * ratio;
+    double magnitude     = larger * sqrt(sum_sq);
+
+    /* The scaled formula is unusable when the larger magnitude is zero or
+     * when either magnitude exceeds 1.79769313486231570e+308; use the plain
+     * sum of the magnitudes in those cases. */
+    if ((larger == 0.0) ||
+        (larger > 1.79769313486231570e+308) ||
+        (smaller > 1.79769313486231570e+308)) {
+        magnitude = larger + smaller;
+    }
+    return magnitude;
+}
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+/* aliases: cuComplex is another name for the single-precision complex type */
+typedef cuFloatComplex cuComplex;
+/* Build a cuComplex with real part x and imaginary part y. */
+__host__ __device__ static __inline__ cuComplex make_cuComplex (float x, float y)
+{
+    return make_cuFloatComplex (x, y);
+}
+
+/* float-to-double promotion: both components of c are converted to double */
+__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble
+                                                      (cuFloatComplex c)
+{
+    return make_cuDoubleComplex ((double)c.x, (double)c.y);
+}
+
+/* double-to-float conversion: both components of c are converted to float */
+__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat (cuDoubleComplex c)
+{
+    return make_cuFloatComplex ((float)c.x, (float)c.y);
+}
+
+/* Single-precision complex multiply-add: returns x * y + d.
+ * The four products are folded into the accumulators one at a time. */
+__host__ __device__ static __inline__  cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d)
+{
+    /* start from d plus the products that involve the real part of x ... */
+    float acc_re = (cuCrealf(x) *  cuCrealf(y)) + cuCrealf(d);
+    float acc_im = (cuCrealf(x) *  cuCimagf(y)) + cuCimagf(d);
+
+    /* ... then fold in the products that involve the imaginary part of x */
+    acc_re = -(cuCimagf(x) * cuCimagf(y))  + acc_re;
+    acc_im =  (cuCimagf(x) *  cuCrealf(y)) + acc_im;
+
+    return make_cuComplex(acc_re, acc_im);
+}
+/* Double-precision complex multiply-add: returns x * y + d.
+ * The four products are folded into the accumulators one at a time. */
+__host__ __device__ static __inline__  cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d)
+{
+    /* start from d plus the products that involve the real part of x ... */
+    double acc_re = (cuCreal(x) *  cuCreal(y)) + cuCreal(d);
+    double acc_im = (cuCreal(x) *  cuCimag(y)) + cuCimag(d);
+
+    /* ... then fold in the products that involve the imaginary part of x */
+    acc_re = -(cuCimag(x) * cuCimag(y))  + acc_re;
+    acc_im =  (cuCimag(x) *  cuCreal(y)) + acc_im;
+
+    return make_cuDoubleComplex(acc_re, acc_im);
+}
+
+#endif /* !defined(CU_COMPLEX_H_) */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..66c74d8c48d2a80fdfbccb3dca0c992c59c1d0ff
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda.h
@@ -0,0 +1,26280 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef __cuda_cuda_h__
+#define __cuda_cuda_h__
+
+
+
+
+#include <stdlib.h>
+#ifdef _MSC_VER /* MSVC: use the compiler's sized integer keywords */
+typedef unsigned __int32 cuuint32_t; /* 32-bit unsigned integer */
+typedef unsigned __int64 cuuint64_t; /* 64-bit unsigned integer */
+#else /* all other compilers: take the fixed-width types from <stdint.h> */
+#include <stdint.h>
+typedef uint32_t cuuint32_t; /* 32-bit unsigned integer */
+typedef uint64_t cuuint64_t; /* 64-bit unsigned integer */
+#endif /* _MSC_VER */
+/* __CUDA_DEPRECATED: compiler-specific deprecation attribute; the first matching branch wins, so the opt-out macros take priority over the compiler checks. */
+#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED /* opt-out macro defined: expands to nothing */
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else
+#define __CUDA_DEPRECATED /* neither _MSC_VER nor __GNUC__: expands to nothing */
+#endif
+/* CUDA_FORCE_API_VERSION is rejected outright: defining it stops compilation. */
+#ifdef CUDA_FORCE_API_VERSION
+#error "CUDA_FORCE_API_VERSION is no longer supported."
+#endif /* CUDA_FORCE_API_VERSION */
+/* __CUDA_API_PTDS / __CUDA_API_PTSZ: paste a _ptds / _ptsz suffix onto an API name when either enabling macro is defined; otherwise leave the name unchanged. */
+#if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
+    #define __CUDA_API_PER_THREAD_DEFAULT_STREAM /* internal switch tested by later #if blocks */
+    #define __CUDA_API_PTDS(api) api ## _ptds
+    #define __CUDA_API_PTSZ(api) api ## _ptsz
+#else
+    #define __CUDA_API_PTDS(api) api
+    #define __CUDA_API_PTSZ(api) api
+#endif
+/* Redirect unversioned API names to versioned symbol names (_v2/_v3). Names wrapped in __CUDA_API_PTDS/__CUDA_API_PTSZ also receive the _ptds/_ptsz suffix when that mode is enabled. cuCtxCreate_v3, cuCtxCreate_v4 and cuDeviceGetUuid_v2 map to themselves. */
+#define cuDeviceTotalMem                    cuDeviceTotalMem_v2
+#define cuCtxCreate                         cuCtxCreate_v2
+#define cuCtxCreate_v3                      cuCtxCreate_v3
+#define cuCtxCreate_v4                      cuCtxCreate_v4
+#define cuModuleGetGlobal                   cuModuleGetGlobal_v2
+#define cuMemGetInfo                        cuMemGetInfo_v2
+#define cuMemAlloc                          cuMemAlloc_v2
+#define cuMemAllocPitch                     cuMemAllocPitch_v2
+#define cuMemFree                           cuMemFree_v2
+#define cuMemGetAddressRange                cuMemGetAddressRange_v2
+#define cuMemAllocHost                      cuMemAllocHost_v2
+#define cuMemHostGetDevicePointer           cuMemHostGetDevicePointer_v2
+#define cuMemcpyHtoD                        __CUDA_API_PTDS(cuMemcpyHtoD_v2)
+#define cuMemcpyDtoH                        __CUDA_API_PTDS(cuMemcpyDtoH_v2)
+#define cuMemcpyDtoD                        __CUDA_API_PTDS(cuMemcpyDtoD_v2)
+#define cuMemcpyDtoA                        __CUDA_API_PTDS(cuMemcpyDtoA_v2)
+#define cuMemcpyAtoD                        __CUDA_API_PTDS(cuMemcpyAtoD_v2)
+#define cuMemcpyHtoA                        __CUDA_API_PTDS(cuMemcpyHtoA_v2)
+#define cuMemcpyAtoH                        __CUDA_API_PTDS(cuMemcpyAtoH_v2)
+#define cuMemcpyAtoA                        __CUDA_API_PTDS(cuMemcpyAtoA_v2)
+#define cuMemcpyHtoAAsync                   __CUDA_API_PTSZ(cuMemcpyHtoAAsync_v2)
+#define cuMemcpyAtoHAsync                   __CUDA_API_PTSZ(cuMemcpyAtoHAsync_v2)
+#define cuMemcpy2D                          __CUDA_API_PTDS(cuMemcpy2D_v2)
+#define cuMemcpy2DUnaligned                 __CUDA_API_PTDS(cuMemcpy2DUnaligned_v2)
+#define cuMemcpy3D                          __CUDA_API_PTDS(cuMemcpy3D_v2)
+#define cuMemcpyHtoDAsync                   __CUDA_API_PTSZ(cuMemcpyHtoDAsync_v2)
+#define cuMemcpyDtoHAsync                   __CUDA_API_PTSZ(cuMemcpyDtoHAsync_v2)
+#define cuMemcpyDtoDAsync                   __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2)
+#define cuMemcpy2DAsync                     __CUDA_API_PTSZ(cuMemcpy2DAsync_v2)
+#define cuMemcpy3DAsync                     __CUDA_API_PTSZ(cuMemcpy3DAsync_v2)
+#define cuMemcpyBatchAsync                  __CUDA_API_PTSZ(cuMemcpyBatchAsync)
+#define cuMemcpy3DBatchAsync                __CUDA_API_PTSZ(cuMemcpy3DBatchAsync)
+#define cuMemsetD8                          __CUDA_API_PTDS(cuMemsetD8_v2)
+#define cuMemsetD16                         __CUDA_API_PTDS(cuMemsetD16_v2)
+#define cuMemsetD32                         __CUDA_API_PTDS(cuMemsetD32_v2)
+#define cuMemsetD2D8                        __CUDA_API_PTDS(cuMemsetD2D8_v2)
+#define cuMemsetD2D16                       __CUDA_API_PTDS(cuMemsetD2D16_v2)
+#define cuMemsetD2D32                       __CUDA_API_PTDS(cuMemsetD2D32_v2)
+#define cuArrayCreate                       cuArrayCreate_v2
+#define cuArrayGetDescriptor                cuArrayGetDescriptor_v2
+#define cuArray3DCreate                     cuArray3DCreate_v2
+#define cuArray3DGetDescriptor              cuArray3DGetDescriptor_v2
+#define cuTexRefSetAddress                  cuTexRefSetAddress_v2
+#define cuTexRefGetAddress                  cuTexRefGetAddress_v2
+#define cuGraphicsResourceGetMappedPointer  cuGraphicsResourceGetMappedPointer_v2
+#define cuCtxDestroy                        cuCtxDestroy_v2
+#define cuCtxPopCurrent                     cuCtxPopCurrent_v2
+#define cuCtxPushCurrent                    cuCtxPushCurrent_v2
+#define cuStreamDestroy                     cuStreamDestroy_v2
+#define cuEventDestroy                      cuEventDestroy_v2
+#define cuTexRefSetAddress2D                cuTexRefSetAddress2D_v3
+#define cuLinkCreate                        cuLinkCreate_v2
+#define cuLinkAddData                       cuLinkAddData_v2
+#define cuLinkAddFile                       cuLinkAddFile_v2
+#define cuMemHostRegister                   cuMemHostRegister_v2
+#define cuGraphicsResourceSetMapFlags       cuGraphicsResourceSetMapFlags_v2
+#define cuStreamBeginCapture                __CUDA_API_PTSZ(cuStreamBeginCapture_v2)
+#define cuDevicePrimaryCtxRelease           cuDevicePrimaryCtxRelease_v2
+#define cuDevicePrimaryCtxReset             cuDevicePrimaryCtxReset_v2
+#define cuDevicePrimaryCtxSetFlags          cuDevicePrimaryCtxSetFlags_v2
+#define cuDeviceGetUuid_v2                  cuDeviceGetUuid_v2
+#define cuIpcOpenMemHandle                  cuIpcOpenMemHandle_v2
+/* The unversioned graph-instantiate name resolves to the WithFlags variant. */
+#define cuGraphInstantiate                  cuGraphInstantiateWithFlags
+/* Further unversioned names redirected to their _v2 symbol names. */
+#define cuGraphExecUpdate                   cuGraphExecUpdate_v2 
+#define cuGetProcAddress                    cuGetProcAddress_v2
+#define cuGraphAddKernelNode                cuGraphAddKernelNode_v2
+#define cuGraphKernelNodeGetParams          cuGraphKernelNodeGetParams_v2
+#define cuGraphKernelNodeSetParams          cuGraphKernelNodeSetParams_v2
+#define cuGraphExecKernelNodeSetParams      cuGraphExecKernelNodeSetParams_v2
+/* Stream value/batch/capture-info names: _v2 symbols wrapped in __CUDA_API_PTSZ; cuStreamGetCaptureInfo and cuStreamGetCaptureInfo_v2 resolve to the same symbol. */
+#define cuStreamWriteValue32                __CUDA_API_PTSZ(cuStreamWriteValue32_v2)
+#define cuStreamWaitValue32                 __CUDA_API_PTSZ(cuStreamWaitValue32_v2)
+#define cuStreamWriteValue64                __CUDA_API_PTSZ(cuStreamWriteValue64_v2)
+#define cuStreamWaitValue64                 __CUDA_API_PTSZ(cuStreamWaitValue64_v2)
+#define cuStreamBatchMemOp                  __CUDA_API_PTSZ(cuStreamBatchMemOp_v2)
+#define cuStreamGetCaptureInfo              __CUDA_API_PTSZ(cuStreamGetCaptureInfo_v2)
+#define cuStreamGetCaptureInfo_v2           __CUDA_API_PTSZ(cuStreamGetCaptureInfo_v2)
+/* Only when __CUDA_API_PER_THREAD_DEFAULT_STREAM is defined: redirect these API names through __CUDA_API_PTDS / __CUDA_API_PTSZ. */
+#if defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM)
+    #define cuMemcpy                            __CUDA_API_PTDS(cuMemcpy)
+    #define cuMemcpyAsync                       __CUDA_API_PTSZ(cuMemcpyAsync)
+    #define cuMemcpyPeer                        __CUDA_API_PTDS(cuMemcpyPeer)
+    #define cuMemcpyPeerAsync                   __CUDA_API_PTSZ(cuMemcpyPeerAsync)
+    #define cuMemcpy3DPeer                      __CUDA_API_PTDS(cuMemcpy3DPeer)
+    #define cuMemcpy3DPeerAsync                 __CUDA_API_PTSZ(cuMemcpy3DPeerAsync)
+    #define cuMemPrefetchAsync                  __CUDA_API_PTSZ(cuMemPrefetchAsync)
+    #define cuMemPrefetchAsync_v2               __CUDA_API_PTSZ(cuMemPrefetchAsync_v2)
+
+    #define cuMemsetD8Async                     __CUDA_API_PTSZ(cuMemsetD8Async)
+    #define cuMemsetD16Async                    __CUDA_API_PTSZ(cuMemsetD16Async)
+    #define cuMemsetD32Async                    __CUDA_API_PTSZ(cuMemsetD32Async)
+    #define cuMemsetD2D8Async                   __CUDA_API_PTSZ(cuMemsetD2D8Async)
+    #define cuMemsetD2D16Async                  __CUDA_API_PTSZ(cuMemsetD2D16Async)
+    #define cuMemsetD2D32Async                  __CUDA_API_PTSZ(cuMemsetD2D32Async)
+
+    #define cuStreamGetPriority                 __CUDA_API_PTSZ(cuStreamGetPriority)
+    #define cuStreamGetId                       __CUDA_API_PTSZ(cuStreamGetId)
+    #define cuStreamGetFlags                    __CUDA_API_PTSZ(cuStreamGetFlags)
+    #define cuStreamGetDevice                   __CUDA_API_PTSZ(cuStreamGetDevice)
+    #define cuStreamGetCtx                      __CUDA_API_PTSZ(cuStreamGetCtx)
+    #define cuStreamGetCtx_v2                   __CUDA_API_PTSZ(cuStreamGetCtx_v2)
+    #define cuStreamWaitEvent                   __CUDA_API_PTSZ(cuStreamWaitEvent)
+    #define cuStreamEndCapture                  __CUDA_API_PTSZ(cuStreamEndCapture)
+    #define cuStreamIsCapturing                 __CUDA_API_PTSZ(cuStreamIsCapturing)
+    #define cuStreamGetCaptureInfo_v3           __CUDA_API_PTSZ(cuStreamGetCaptureInfo_v3)
+    #define cuStreamUpdateCaptureDependencies   __CUDA_API_PTSZ(cuStreamUpdateCaptureDependencies)
+    #define cuStreamUpdateCaptureDependencies_v2 __CUDA_API_PTSZ(cuStreamUpdateCaptureDependencies_v2)
+    #define cuStreamAddCallback                 __CUDA_API_PTSZ(cuStreamAddCallback)
+    #define cuStreamAttachMemAsync              __CUDA_API_PTSZ(cuStreamAttachMemAsync)
+    #define cuStreamQuery                       __CUDA_API_PTSZ(cuStreamQuery)
+    #define cuStreamSynchronize                 __CUDA_API_PTSZ(cuStreamSynchronize)
+    #define cuEventRecord                       __CUDA_API_PTSZ(cuEventRecord)
+    #define cuEventRecordWithFlags              __CUDA_API_PTSZ(cuEventRecordWithFlags)
+    #define cuLaunchKernel                      __CUDA_API_PTSZ(cuLaunchKernel)
+    #define cuLaunchKernelEx                    __CUDA_API_PTSZ(cuLaunchKernelEx)
+    #define cuLaunchHostFunc                    __CUDA_API_PTSZ(cuLaunchHostFunc)
+    #define cuGraphicsMapResources              __CUDA_API_PTSZ(cuGraphicsMapResources)
+    #define cuGraphicsUnmapResources            __CUDA_API_PTSZ(cuGraphicsUnmapResources)
+
+
+    #define cuLaunchCooperativeKernel           __CUDA_API_PTSZ(cuLaunchCooperativeKernel)
+
+    #define cuSignalExternalSemaphoresAsync     __CUDA_API_PTSZ(cuSignalExternalSemaphoresAsync)
+    #define cuWaitExternalSemaphoresAsync       __CUDA_API_PTSZ(cuWaitExternalSemaphoresAsync)
+
+    #define cuGraphInstantiateWithParams        __CUDA_API_PTSZ(cuGraphInstantiateWithParams)
+    #define cuGraphUpload                       __CUDA_API_PTSZ(cuGraphUpload)
+    #define cuGraphLaunch                       __CUDA_API_PTSZ(cuGraphLaunch)
+    #define cuStreamCopyAttributes              __CUDA_API_PTSZ(cuStreamCopyAttributes)
+    #define cuStreamGetAttribute                __CUDA_API_PTSZ(cuStreamGetAttribute)
+    #define cuStreamSetAttribute                __CUDA_API_PTSZ(cuStreamSetAttribute)
+    #define cuMemMapArrayAsync                  __CUDA_API_PTSZ(cuMemMapArrayAsync)
+
+    #define cuMemFreeAsync                      __CUDA_API_PTSZ(cuMemFreeAsync)
+    #define cuMemAllocAsync                     __CUDA_API_PTSZ(cuMemAllocAsync)
+    #define cuMemAllocFromPoolAsync             __CUDA_API_PTSZ(cuMemAllocFromPoolAsync)
+
+    #define cuStreamBeginCaptureToGraph         __CUDA_API_PTSZ(cuStreamBeginCaptureToGraph)
+
+#endif
+/* Defined unconditionally (outside the preceding #if block); the result depends on how __CUDA_API_PTSZ is currently defined. */
+#define cuMemBatchDecompressAsync               __CUDA_API_PTSZ(cuMemBatchDecompressAsync)
+
+/**
+ * \file cuda.h
+ * \brief Header file for the CUDA Toolkit application programming interface.
+ *
+ * \file cudaGL.h
+ * \brief Header file for the OpenGL interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * \file cudaD3D9.h
+ * \brief Header file for the Direct3D 9 interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ */
+
+/**
+ * \defgroup CUDA_TYPES Data types used by CUDA driver
+ * @{
+ */
+
+/**
+ * CUDA API version number (NOTE(review): 12080 presumably encodes 12.8 as 1000 * major + 10 * minor - confirm)
+ */
+#define CUDA_VERSION 12080
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * CUDA device pointer
+ * CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform.
+ */
+#if defined(_WIN64) || defined(__LP64__)
+typedef unsigned long long CUdeviceptr_v2; /* chosen when _WIN64 or __LP64__ is defined */
+#else
+typedef unsigned int CUdeviceptr_v2; /* chosen on all other targets */
+#endif
+typedef CUdeviceptr_v2 CUdeviceptr;                          /**< CUDA device pointer */
+/* Handle types. Most are pointers to structs whose definitions are not visible at this point; the texture/surface objects and the conditional handle are integer types. */
+typedef int CUdevice_v1;                                     /**< CUDA device */
+typedef CUdevice_v1 CUdevice;                                /**< CUDA device */
+typedef struct CUctx_st *CUcontext;                          /**< A regular context handle */
+typedef struct CUmod_st *CUmodule;                           /**< CUDA module */
+typedef struct CUfunc_st *CUfunction;                        /**< CUDA function */
+typedef struct CUlib_st *CUlibrary;                          /**< CUDA library */
+typedef struct CUkern_st *CUkernel;                          /**< CUDA kernel */
+typedef struct CUarray_st *CUarray;                          /**< CUDA array */
+typedef struct CUmipmappedArray_st *CUmipmappedArray;        /**< CUDA mipmapped array */
+typedef struct CUtexref_st *CUtexref;                        /**< CUDA texture reference */
+typedef struct CUsurfref_st *CUsurfref;                      /**< CUDA surface reference */
+typedef struct CUevent_st *CUevent;                          /**< CUDA event */
+typedef struct CUstream_st *CUstream;                        /**< CUDA stream */
+typedef struct CUgraphicsResource_st *CUgraphicsResource;    /**< CUDA graphics interop resource */
+typedef unsigned long long CUtexObject_v1;                   /**< An opaque value that represents a CUDA texture object */
+typedef CUtexObject_v1 CUtexObject;                          /**< An opaque value that represents a CUDA texture object */
+typedef unsigned long long CUsurfObject_v1;                  /**< An opaque value that represents a CUDA surface object */
+typedef CUsurfObject_v1 CUsurfObject;                        /**< An opaque value that represents a CUDA surface object */ 
+typedef struct CUextMemory_st *CUexternalMemory;             /**< CUDA external memory */
+typedef struct CUextSemaphore_st *CUexternalSemaphore;       /**< CUDA external semaphore */
+typedef struct CUgraph_st *CUgraph;                          /**< CUDA graph */
+typedef struct CUgraphNode_st *CUgraphNode;                  /**< CUDA graph node */
+typedef struct CUgraphExec_st *CUgraphExec;                  /**< CUDA executable graph */
+typedef struct CUmemPoolHandle_st *CUmemoryPool;             /**< CUDA memory pool */
+typedef struct CUuserObject_st *CUuserObject;                /**< CUDA user object for graphs */
+typedef cuuint64_t CUgraphConditionalHandle; /**< CUDA graph conditional handle */
+typedef struct CUgraphDeviceUpdatableNode_st *CUgraphDeviceNode; /**< CUDA graph device node handle */
+typedef struct CUasyncCallbackEntry_st *CUasyncCallbackHandle;            /**< CUDA async notification callback handle */
+/*!
+ * \typedef typedef struct CUgreenCtx_st* CUgreenCtx
+ * A green context handle. This handle can be used safely from only one CPU thread at a time.
+ * Created via ::cuGreenCtxCreate
+ */
+typedef struct CUgreenCtx_st *CUgreenCtx;
+/* CUuuid is declared at most once: CU_UUID_HAS_BEEN_DEFINED guards against a second definition. */
+#ifndef CU_UUID_HAS_BEEN_DEFINED
+#define CU_UUID_HAS_BEEN_DEFINED
+typedef struct CUuuid_st {                                /**< CUDA definition of UUID */
+    char bytes[16]; /* the 16 raw UUID bytes */
+} CUuuid;
+#endif
+
+/**
+ * CUDA IPC handle size: the array length, in bytes, of the handle structs declared after this macro
+ */
+#define CU_IPC_HANDLE_SIZE 64
+
+/**
+ * Fabric handle - An opaque handle representing a memory allocation
+ * that can be exported to processes in the same or different nodes. For IPC
+ * between processes on different nodes they must be connected via the
+ * NVSwitch fabric.
+ */
+typedef struct CUmemFabricHandle_st {
+    unsigned char data[CU_IPC_HANDLE_SIZE]; /* handle payload, CU_IPC_HANDLE_SIZE bytes */
+} CUmemFabricHandle_v1;
+typedef CUmemFabricHandle_v1 CUmemFabricHandle; /* unversioned alias of the _v1 type */
+
+/**
+ * CUDA IPC event handle
+ */
+typedef struct CUipcEventHandle_st {
+    char reserved[CU_IPC_HANDLE_SIZE]; /* CU_IPC_HANDLE_SIZE bytes; layout not described here */
+} CUipcEventHandle_v1;
+typedef CUipcEventHandle_v1 CUipcEventHandle; /* unversioned alias of the _v1 type */
+
+/**
+ * CUDA IPC memory handle
+ */
+typedef struct CUipcMemHandle_st {
+    char reserved[CU_IPC_HANDLE_SIZE]; /* CU_IPC_HANDLE_SIZE bytes; layout not described here */
+} CUipcMemHandle_v1;
+typedef CUipcMemHandle_v1 CUipcMemHandle; /* unversioned alias of the _v1 type */
+
+/**
+ * CUDA IPC memory flags (a single flag is defined)
+ */
+typedef enum CUipcMem_flags_enum {
+    CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */
+} CUipcMem_flags;
+
+
+/**
+ * CUDA memory attach flags (each value is a distinct single bit)
+ */
+typedef enum CUmemAttach_flags_enum {
+    CU_MEM_ATTACH_GLOBAL = 0x1, /**< Memory can be accessed by any stream on any device */
+    CU_MEM_ATTACH_HOST   = 0x2, /**< Memory cannot be accessed by any stream on any device */
+    CU_MEM_ATTACH_SINGLE = 0x4  /**< Memory can only be accessed by a single stream on the associated device */
+} CUmemAttach_flags;
+
+/**
+ * Context creation flags
+ */
+typedef enum CUctx_flags_enum {
+    CU_CTX_SCHED_AUTO          = 0x00, /**< Automatic scheduling */
+    CU_CTX_SCHED_SPIN          = 0x01, /**< Set spin as default scheduling */
+    CU_CTX_SCHED_YIELD         = 0x02, /**< Set yield as default scheduling */
+    CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
+    CU_CTX_BLOCKING_SYNC       = 0x04, /**< Set blocking synchronization as default scheduling
+                                         *  \deprecated This flag was deprecated as of CUDA 4.0
+                                         *  and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. */
+    CU_CTX_SCHED_MASK          = 0x07, /**< Mask covering the scheduling bits (0x01 | 0x02 | 0x04) */
+    CU_CTX_MAP_HOST            = 0x08, /**< \deprecated This flag was deprecated as of CUDA 11.0
+                                         *  and it no longer has any effect. All contexts
+                                         *  as of CUDA 3.2 behave as though the flag is enabled. */
+    CU_CTX_LMEM_RESIZE_TO_MAX  = 0x10, /**< Keep local memory allocation after launch */
+    CU_CTX_COREDUMP_ENABLE     = 0x20, /**< Trigger coredumps from exceptions in this context */
+    CU_CTX_USER_COREDUMP_ENABLE= 0x40, /**< Enable user pipe to trigger coredumps in this context */
+    CU_CTX_SYNC_MEMOPS         = 0x80, /**< Ensure synchronous memory operations on this context will synchronize */
+    CU_CTX_FLAGS_MASK          = 0xFF /**< Mask covering every flag bit defined in this enum (0x01 through 0x80) */
+} CUctx_flags;
+
+/**
+ * Event scheduling flags (same numeric values as the CU_CTX_SCHED_* context flags)
+ */
+typedef enum CUevent_sched_flags_enum {
+    CU_EVENT_SCHED_AUTO = 0x00, /**< Automatic scheduling */
+    CU_EVENT_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */
+    CU_EVENT_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */
+    CU_EVENT_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
+} CUevent_sched_flags;
+
+/**
+ * NVCL event scheduling flags (same numeric values as ::CUevent_sched_flags)
+ */
+typedef enum cl_event_flags_enum {
+    NVCL_EVENT_SCHED_AUTO = 0x00, /**< Automatic scheduling */
+    NVCL_EVENT_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */
+    NVCL_EVENT_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */
+    NVCL_EVENT_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
+} cl_event_flags;
+
+/**
+ * NVCL context scheduling flags (same numeric values as the CU_CTX_SCHED_* context flags)
+ */
+typedef enum cl_context_flags_enum {
+    NVCL_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */
+    NVCL_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */
+    NVCL_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */
+    NVCL_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
+} cl_context_flags;
+
+
+/**
+ * Stream creation flags
+ */
+typedef enum CUstream_flags_enum {
+    CU_STREAM_DEFAULT             = 0x0, /**< Default stream flag (no flag bits set) */
+    CU_STREAM_NON_BLOCKING        = 0x1  /**< Stream does not synchronize with stream 0 (the NULL stream) */
+} CUstream_flags;
+
+/**
+ * Legacy stream handle
+ *
+ * Stream handle that can be passed as a CUstream to use an implicit stream
+ * with legacy synchronization behavior. Defined as the constant 0x1 cast to ::CUstream.
+ *
+ * See details of the \link_sync_behavior
+ */
+#define CU_STREAM_LEGACY     ((CUstream)0x1)
+
+/**
+ * Per-thread stream handle
+ *
+ * Stream handle that can be passed as a CUstream to use an implicit stream
+ * with per-thread synchronization behavior. Defined as the constant 0x2 cast to ::CUstream.
+ *
+ * See details of the \link_sync_behavior
+ */
+#define CU_STREAM_PER_THREAD ((CUstream)0x2)
+
+/**
+ * Event creation flags
+ */
+typedef enum CUevent_flags_enum {
+    CU_EVENT_DEFAULT        = 0x0, /**< Default event flag (no flag bits set) */
+    CU_EVENT_BLOCKING_SYNC  = 0x1, /**< Event uses blocking synchronization */
+    CU_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */
+    CU_EVENT_INTERPROCESS   = 0x4  /**< Event is suitable for interprocess use. ::CU_EVENT_DISABLE_TIMING must also be set */
+} CUevent_flags;
+
+/**
+ * Event record flags
+ */
+typedef enum CUevent_record_flags_enum {
+    CU_EVENT_RECORD_DEFAULT  = 0x0, /**< Default event record flag */
+    CU_EVENT_RECORD_EXTERNAL = 0x1  /**< When using stream capture, create an event record node
+                                      *  instead of the default behavior. This flag is invalid
+                                      *  when used outside of capture. */
+} CUevent_record_flags;
+
+/**
+ * Event wait flags
+ */
+typedef enum CUevent_wait_flags_enum {
+    CU_EVENT_WAIT_DEFAULT  = 0x0, /**< Default event wait flag */
+    CU_EVENT_WAIT_EXTERNAL = 0x1  /**< When using stream capture, create an event wait node
+                                    *  instead of the default behavior. This flag is invalid
+                                    *  when used outside of capture. */
+} CUevent_wait_flags;
+
+/**
+ * Flags for ::cuStreamWaitValue32 and ::cuStreamWaitValue64
+ */
+typedef enum CUstreamWaitValue_flags_enum {
+    CU_STREAM_WAIT_VALUE_GEQ   = 0x0,   /**< Wait until (int32_t)(*addr - value) >= 0 (or int64_t for 64 bit
+                                             values). Note this is a cyclic comparison which ignores wraparound.
+                                             (Default behavior.) */
+    CU_STREAM_WAIT_VALUE_EQ    = 0x1,   /**< Wait until *addr == value. */
+    CU_STREAM_WAIT_VALUE_AND   = 0x2,   /**< Wait until (*addr & value) != 0. */
+    CU_STREAM_WAIT_VALUE_NOR   = 0x3,   /**< Wait until ~(*addr | value) != 0. Support for this operation can be
+                                             queried with ::cuDeviceGetAttribute() and
+                                             ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. */
+    CU_STREAM_WAIT_VALUE_FLUSH = 1<<30  /**< Follow the wait operation with a flush of outstanding remote writes. This
+                                             means that, if a remote write operation is guaranteed to have reached the
+                                             device before the wait can be satisfied, that write is guaranteed to be
+                                             visible to downstream device work. The device is permitted to reorder
+                                             remote writes internally. For example, this flag would be required if
+                                             two remote writes arrive in a defined order, the wait is satisfied by the
+                                             second write, and downstream work needs to observe the first write.
+                                             Support for this operation is restricted to selected platforms and can be
+                                             queried with ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES. */
+} CUstreamWaitValue_flags;
+
+/**
+ * Flags for ::cuStreamWriteValue32
+ */
+typedef enum CUstreamWriteValue_flags_enum {
+    CU_STREAM_WRITE_VALUE_DEFAULT           = 0x0, /**< Default behavior */
+    CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 0x1  /**< Permits the write to be reordered with writes which were issued
+                                                        before it, as a performance optimization. Normally,
+                                                        ::cuStreamWriteValue32 will provide a memory fence before the
+                                                        write, which has similar semantics to
+                                                        __threadfence_system() but is scoped to the stream
+                                                        rather than a CUDA thread.
+                                                        Note: this flag is not supported in the v2 API. */
+} CUstreamWriteValue_flags;
+
+/**
+ * Operations for ::cuStreamBatchMemOp
+ */
+typedef enum CUstreamBatchMemOpType_enum {
+    CU_STREAM_MEM_OP_WAIT_VALUE_32  = 1,     /**< Represents a ::cuStreamWaitValue32 operation */
+    CU_STREAM_MEM_OP_WRITE_VALUE_32 = 2,     /**< Represents a ::cuStreamWriteValue32 operation */
+    CU_STREAM_MEM_OP_WAIT_VALUE_64  = 4,     /**< Represents a ::cuStreamWaitValue64 operation */
+    CU_STREAM_MEM_OP_WRITE_VALUE_64 = 5,     /**< Represents a ::cuStreamWriteValue64 operation */
+    CU_STREAM_MEM_OP_BARRIER = 6,            /**< Insert a memory barrier of the specified type */ 
+    CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3 /**< This has the same effect as ::CU_STREAM_WAIT_VALUE_FLUSH, but as a
+                                                  standalone operation. (Value 3; listed out of numeric order.) */
+} CUstreamBatchMemOpType;
+
+/**
+ * Flags for ::cuStreamMemoryBarrier (selects the scope of the barrier)
+ */
+typedef enum CUstreamMemoryBarrier_flags_enum {
+    CU_STREAM_MEMORY_BARRIER_TYPE_SYS = 0x0, /**< System-wide memory barrier. */
+    CU_STREAM_MEMORY_BARRIER_TYPE_GPU = 0x1 /**< Limit memory barrier scope to the GPU. */
+} CUstreamMemoryBarrier_flags;
+
+/**
+ * Per-operation parameters for ::cuStreamBatchMemOp. Every struct variant begins with the same \p operation member.
+ */
+typedef union CUstreamBatchMemOpParams_union {
+    CUstreamBatchMemOpType operation; /**< Operation type; NOTE(review): presumably selects which variant below is valid - confirm */
+    struct CUstreamMemOpWaitValueParams_st {
+        CUstreamBatchMemOpType operation;
+        CUdeviceptr address;
+        union {
+            cuuint32_t value;
+            cuuint64_t value64;
+        };
+        unsigned int flags; /**< NOTE(review): presumably ::CUstreamWaitValue_flags bits - confirm */
+        CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */
+    } waitValue;
+    struct CUstreamMemOpWriteValueParams_st {
+        CUstreamBatchMemOpType operation;
+        CUdeviceptr address;
+        union {
+            cuuint32_t value;
+            cuuint64_t value64;
+        };
+        unsigned int flags; /**< NOTE(review): presumably ::CUstreamWriteValue_flags bits - confirm */
+        CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */
+    } writeValue;
+    struct CUstreamMemOpFlushRemoteWritesParams_st {
+        CUstreamBatchMemOpType operation;
+        unsigned int flags;
+    } flushRemoteWrites;
+    struct CUstreamMemOpMemoryBarrierParams_st { /**< Only supported in the _v2 API */
+        CUstreamBatchMemOpType operation;
+        unsigned int flags; /**< NOTE(review): presumably ::CUstreamMemoryBarrier_flags - confirm */
+    } memoryBarrier;
+    cuuint64_t pad[6]; /**< Padding member; keeps the union at least 6 * sizeof(cuuint64_t) bytes in size */
+} CUstreamBatchMemOpParams_v1;
+typedef CUstreamBatchMemOpParams_v1 CUstreamBatchMemOpParams; /**< Unversioned alias of ::CUstreamBatchMemOpParams_v1 */
+
+typedef struct CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st { /* v1 node parameters; declares the same four fields as the _v2 struct */
+    CUcontext ctx;
+    unsigned int count;
+    CUstreamBatchMemOpParams *paramArray;
+    unsigned int flags;
+} CUDA_BATCH_MEM_OP_NODE_PARAMS_v1;
+typedef CUDA_BATCH_MEM_OP_NODE_PARAMS_v1 CUDA_BATCH_MEM_OP_NODE_PARAMS; /* the unversioned name maps to the _v1 struct */
+
+/**
+ * Batch memory operation node parameters (v2)
+ */
+typedef struct CUDA_BATCH_MEM_OP_NODE_PARAMS_v2_st {
+    CUcontext ctx;                        /**< Context to use for the operations. */
+    unsigned int count;                   /**< Number of operations in paramArray. */
+    CUstreamBatchMemOpParams *paramArray; /**< Array of batch memory operations. */
+    unsigned int flags;                   /**< Flags to control the node. */
+} CUDA_BATCH_MEM_OP_NODE_PARAMS_v2;
+
+/**
+ * Occupancy calculator flags
+ */
+typedef enum CUoccupancy_flags_enum {
+    CU_OCCUPANCY_DEFAULT                  = 0x0, /**< Default behavior */
+    CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1  /**< Assume global caching is enabled and cannot be automatically turned off */
+} CUoccupancy_flags;
+
+/**
+ * Flags for ::cuStreamUpdateCaptureDependencies (add to, or replace, the dependency set)
+ */
+typedef enum CUstreamUpdateCaptureDependencies_flags_enum {
+    CU_STREAM_ADD_CAPTURE_DEPENDENCIES = 0x0, /**< Add new nodes to the dependency set */
+    CU_STREAM_SET_CAPTURE_DEPENDENCIES = 0x1  /**< Replace the dependency set with the new nodes */
+} CUstreamUpdateCaptureDependencies_flags;
+
+/**
+ * Types of async notification that can be sent
+ */
+typedef enum CUasyncNotificationType_enum {
+    CU_ASYNC_NOTIFICATION_TYPE_OVER_BUDGET = 0x1 /**< Over-budget notification; NOTE(review): presumably paired with CUasyncNotificationInfo::info.overBudget - confirm */
+} CUasyncNotificationType;
+
+/**
+ * Information passed to the user via the async notification callback
+ */
+typedef struct CUasyncNotificationInfo_st {
+    CUasyncNotificationType type; /**< Kind of notification being delivered */
+    union {
+        struct {
+            unsigned long long bytesOverBudget; /**< NOTE(review): presumably the number of bytes over budget (name-based) - confirm */
+        } overBudget;
+    } info; /**< Type-specific payload; overBudget is the only variant declared here */
+} CUasyncNotificationInfo;
+
+/**
+ * CUDA async notification callback (function pointer type; returns nothing)
+ * \param info Information describing what actions to take as a result of this trim notification.
+ * \param userData Pointer to user-defined data provided at registration.
+ * \param callback The callback handle associated with this specific callback.
+ */
+typedef void (*CUasyncCallback)(CUasyncNotificationInfo *info, void *userData, CUasyncCallbackHandle callback);
+
+/**
+ * Array formats (enumerator values are explicit and are not listed in numeric order)
+ */
+typedef enum CUarray_format_enum {
+    CU_AD_FORMAT_UNSIGNED_INT8            = 0x01, /**< Unsigned 8-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT16           = 0x02, /**< Unsigned 16-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT32           = 0x03, /**< Unsigned 32-bit integers */
+    CU_AD_FORMAT_SIGNED_INT8              = 0x08, /**< Signed 8-bit integers */
+    CU_AD_FORMAT_SIGNED_INT16             = 0x09, /**< Signed 16-bit integers */
+    CU_AD_FORMAT_SIGNED_INT32             = 0x0a, /**< Signed 32-bit integers */
+    CU_AD_FORMAT_HALF                     = 0x10, /**< 16-bit floating point */
+    CU_AD_FORMAT_FLOAT                    = 0x20, /**< 32-bit floating point */
+    CU_AD_FORMAT_NV12                     = 0xb0, /**< 8-bit YUV planar format, with 4:2:0 sampling */
+    CU_AD_FORMAT_UNORM_INT8X1             = 0xc0, /**< 1 channel unsigned 8-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT8X2             = 0xc1, /**< 2 channel unsigned 8-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT8X4             = 0xc2, /**< 4 channel unsigned 8-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT16X1            = 0xc3, /**< 1 channel unsigned 16-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT16X2            = 0xc4, /**< 2 channel unsigned 16-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT16X4            = 0xc5, /**< 4 channel unsigned 16-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT8X1             = 0xc6, /**< 1 channel signed 8-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT8X2             = 0xc7, /**< 2 channel signed 8-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT8X4             = 0xc8, /**< 4 channel signed 8-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT16X1            = 0xc9, /**< 1 channel signed 16-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT16X2            = 0xca, /**< 2 channel signed 16-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT16X4            = 0xcb, /**< 4 channel signed 16-bit normalized integer */
+    CU_AD_FORMAT_BC1_UNORM                = 0x91, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */
+    CU_AD_FORMAT_BC1_UNORM_SRGB           = 0x92, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding */
+    CU_AD_FORMAT_BC2_UNORM                = 0x93, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */
+    CU_AD_FORMAT_BC2_UNORM_SRGB           = 0x94, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding */
+    CU_AD_FORMAT_BC3_UNORM                = 0x95, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */
+    CU_AD_FORMAT_BC3_UNORM_SRGB           = 0x96, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding */
+    CU_AD_FORMAT_BC4_UNORM                = 0x97, /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */
+    CU_AD_FORMAT_BC4_SNORM                = 0x98, /**< 1 channel signed normalized block-compressed (BC4 compression) format */
+    CU_AD_FORMAT_BC5_UNORM                = 0x99, /**< 2 channel unsigned normalized block-compressed (BC5 compression) format */
+    CU_AD_FORMAT_BC5_SNORM                = 0x9a, /**< 2 channel signed normalized block-compressed (BC5 compression) format */
+    CU_AD_FORMAT_BC6H_UF16                = 0x9b, /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */
+    CU_AD_FORMAT_BC6H_SF16                = 0x9c, /**< 3 channel signed half-float block-compressed (BC6H compression) format */
+    CU_AD_FORMAT_BC7_UNORM                = 0x9d, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */
+    CU_AD_FORMAT_BC7_UNORM_SRGB           = 0x9e, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */
+    CU_AD_FORMAT_P010                     = 0x9f, /**< 10-bit YUV planar format, with 4:2:0 sampling */
+    CU_AD_FORMAT_P016                     = 0xa1, /**< 16-bit YUV planar format, with 4:2:0 sampling */
+    CU_AD_FORMAT_NV16                     = 0xa2, /**< 8-bit YUV planar format, with 4:2:2 sampling */
+    CU_AD_FORMAT_P210                     = 0xa3, /**< 10-bit YUV planar format, with 4:2:2 sampling */
+    CU_AD_FORMAT_P216                     = 0xa4, /**< 16-bit YUV planar format, with 4:2:2 sampling */
+    CU_AD_FORMAT_YUY2                     = 0xa5, /**< 2 channel, 8-bit YUV packed planar format, with 4:2:2 sampling */
+    CU_AD_FORMAT_Y210                     = 0xa6, /**< 2 channel, 10-bit YUV packed planar format, with 4:2:2 sampling */
+    CU_AD_FORMAT_Y216                     = 0xa7, /**< 2 channel, 16-bit YUV packed planar format, with 4:2:2 sampling */
+    CU_AD_FORMAT_AYUV                     = 0xa8, /**< 4 channel, 8-bit YUV packed planar format, with 4:4:4 sampling */
+    CU_AD_FORMAT_Y410                     = 0xa9, /**< 10-bit YUV packed planar format, with 4:4:4 sampling */
+    CU_AD_FORMAT_Y416                     = 0xb1, /**< 4 channel, 12-bit YUV packed planar format, with 4:4:4 sampling. NOTE(review): the Y416 name usually denotes 16 bits per channel - confirm the bit depth */
+    CU_AD_FORMAT_Y444_PLANAR8             = 0xb2, /**< 3 channel 8-bit YUV planar format, with 4:4:4 sampling */
+    CU_AD_FORMAT_Y444_PLANAR10            = 0xb3, /**< 3 channel 10-bit YUV planar format, with 4:4:4 sampling */
+    CU_AD_FORMAT_YUV444_8bit_SemiPlanar   = 0xb4, /**< 3 channel 8-bit YUV semi-planar format, with 4:4:4 sampling */
+    CU_AD_FORMAT_YUV444_16bit_SemiPlanar  = 0xb5, /**< 3 channel 16-bit YUV semi-planar format, with 4:4:4 sampling */
+    CU_AD_FORMAT_UNORM_INT_101010_2       = 0x50, /**< 4 channel unorm R10G10B10A2 RGB format */
+    CU_AD_FORMAT_MAX                      = 0x7FFFFFFF /**< Largest positive 32-bit signed value; NOTE(review): presumably an upper-bound sentinel rather than a real format - confirm */
+} CUarray_format;
+
+/**
+ * Texture reference addressing modes
+ */
+typedef enum CUaddress_mode_enum {
+    CU_TR_ADDRESS_MODE_WRAP   = 0, /**< Wrapping address mode */
+    CU_TR_ADDRESS_MODE_CLAMP  = 1, /**< Clamp-to-edge address mode */
+    CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */
+    CU_TR_ADDRESS_MODE_BORDER = 3  /**< Border address mode */
+} CUaddress_mode;
+
+/**
+ * Texture reference filtering modes
+ */
+typedef enum CUfilter_mode_enum {
+    CU_TR_FILTER_MODE_POINT  = 0, /**< Point (unfiltered) filter mode */
+    CU_TR_FILTER_MODE_LINEAR = 1  /**< Linear filter mode */
+} CUfilter_mode;
+
+/**
+ * Device properties
+ */
+typedef enum CUdevice_attribute_enum {
+    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,                          /**< Maximum number of threads per block */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,                                /**< Maximum block dimension X */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,                                /**< Maximum block dimension Y */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,                                /**< Maximum block dimension Z */
+    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,                                 /**< Maximum grid dimension X */
+    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,                                 /**< Maximum grid dimension Y */
+    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,                                 /**< Maximum grid dimension Z */
+    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,                    /**< Maximum shared memory available per block in bytes */
+    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,                        /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
+    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,                          /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
+    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,                                     /**< Warp size in threads */
+    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,                                     /**< Maximum pitch in bytes allowed by memory copies */
+    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,                       /**< Maximum number of 32-bit registers available per block */
+    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,                           /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
+    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,                                    /**< Typical clock frequency in kilohertz */
+    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,                             /**< Alignment requirement for textures */
+    CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,                                   /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */
+    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,                          /**< Number of multiprocessors on device */
+    CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,                           /**< Specifies whether there is a run time limit on kernels */
+    CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,                                    /**< Device is integrated with host memory */
+    CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,                           /**< Device can map host memory into CUDA address space */
+    CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,                                  /**< Compute mode (See ::CUcomputemode for details) */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21,                       /**< Maximum 1D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22,                       /**< Maximum 2D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,                      /**< Maximum 2D texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24,                       /**< Maximum 3D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,                      /**< Maximum 3D texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26,                       /**< Maximum 3D texture depth */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27,               /**< Maximum 2D layered texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28,              /**< Maximum 2D layered texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29,              /**< Maximum layers in a 2D layered texture */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27,                 /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,                /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29,             /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */
+    CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30,                             /**< Alignment requirement for surfaces */
+    CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,                            /**< Device can possibly execute multiple kernels concurrently */
+    CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32,                                   /**< Device has ECC support enabled */
+    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,                                    /**< PCI bus ID of the device */
+    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34,                                 /**< PCI device ID of the device */
+    CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35,                                    /**< Device is using TCC driver model */
+    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36,                             /**< Peak memory clock frequency in kilohertz */
+    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37,                       /**< Global memory bus width in bits */
+    CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38,                                 /**< Size of L2 cache in bytes */
+    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,                /**< Maximum resident threads per multiprocessor */
+    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,                            /**< Number of asynchronous engines */
+    CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,                            /**< Device shares a unified address space with the host */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42,               /**< Maximum 1D layered texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43,              /**< Maximum layers in a 1D layered texture */
+    CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44,                              /**< Deprecated, do not use. */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45,                /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46,               /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47,             /**< Alternate maximum 3D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,            /**< Alternate maximum 3D texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49,             /**< Alternate maximum 3D texture depth */
+    CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50,                                 /**< PCI domain ID of the device */
+    CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51,                       /**< Pitch alignment requirement for textures */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52,                  /**< Maximum cubemap texture width/height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53,          /**< Maximum cubemap layered texture width/height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54,         /**< Maximum layers in a cubemap layered texture */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55,                       /**< Maximum 1D surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56,                       /**< Maximum 2D surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57,                      /**< Maximum 2D surface height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58,                       /**< Maximum 3D surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59,                      /**< Maximum 3D surface height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60,                       /**< Maximum 3D surface depth */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61,               /**< Maximum 1D layered surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62,              /**< Maximum layers in a 1D layered surface */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63,               /**< Maximum 2D layered surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64,              /**< Maximum 2D layered surface height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65,              /**< Maximum layers in a 2D layered surface */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66,                  /**< Maximum cubemap surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67,          /**< Maximum cubemap layered surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68,         /**< Maximum layers in a cubemap layered surface */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69,                /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70,                /**< Maximum 2D linear texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71,               /**< Maximum 2D linear texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72,                /**< Maximum 2D linear texture pitch in bytes */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73,             /**< Maximum mipmapped 2D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,            /**< Maximum mipmapped 2D texture height */
+    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,                      /**< Major compute capability version number */
+    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,                      /**< Minor compute capability version number */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77,             /**< Maximum mipmapped 1D texture width */
+    CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78,                   /**< Device supports stream priorities */
+    CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79,                     /**< Device supports caching globals in L1 */
+    CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80,                      /**< Device supports caching locals in L1 */
+    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81,          /**< Maximum shared memory available per multiprocessor in bytes */
+    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82,              /**< Maximum number of 32-bit registers available per multiprocessor */
+    CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83,                                /**< Device can allocate managed memory on this system */
+    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84,                               /**< Device is on a multi-GPU board */
+    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85,                      /**< Unique id for a group of devices on the same multi-GPU board */
+    CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86,                  /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/
+    CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87,         /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
+    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88,                        /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
+    CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89,                     /**< Device can coherently access managed memory concurrently with the CPU */
+    CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90,                  /**< Device supports compute preemption. */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91,       /**< Device can access host registered memory at the same virtual address as the CPU */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS_V1 = 92,                     /**< Deprecated, along with v1 MemOps API, ::cuStreamBatchMemOp and related APIs are supported. */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V1 = 93,              /**< Deprecated, along with v1 MemOps API, 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V1 = 94,              /**< Deprecated, along with v1 MemOps API, ::CU_STREAM_WAIT_VALUE_NOR is supported. */
+    CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95,                            /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */
+    CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96,               /**< Deprecated, ::cuLaunchCooperativeKernelMultiDevice is deprecated. */
+    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97,             /**< Maximum optin shared memory per block */
+    CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98,                       /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. */
+    CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99,                       /**< Device supports host memory registration via ::cudaHostRegister. */
+    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, /**< Device accesses pageable memory via the host's page tables. */
+    CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101,          /**< The host can directly access managed memory on the device without migration. */
+    CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED = 102,         /**< Deprecated, Use CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED*/
+    CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102,         /**< Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs */
+    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103,  /**< Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
+    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104,           /**< Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
+    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105,       /**< Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106,                /**< Maximum number of blocks per multiprocessor */
+    CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107,                /**< Device supports compression of memory */
+    CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108,                 /**< Maximum L2 persisting lines capacity setting in bytes. */
+    CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109,                /**< Maximum value of CUaccessPolicyWindow::num_bytes. */
+    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110,      /**< Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate */
+    CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111,             /**< Shared memory reserved by CUDA driver per block in bytes */
+    CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112,                  /**< Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */
+    CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113,            /**< Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTREGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU */
+    CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114,         /**< External timeline semaphore interop is supported on the device */
+    CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115,                       /**< Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs */
+    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116,                    /**< Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */
+    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117,         /**< The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum */
+    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118,              /**< GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. */
+    CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119,               /**< Handle types supported with mempool based IPC */
+    CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH = 120,                               /**< Indicates device supports cluster launch */
+    CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED = 121,        /**< Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 122,                /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related MemOp APIs. */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 123,                /**< ::CU_STREAM_WAIT_VALUE_NOR is supported by MemOp APIs. */
+    CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED = 124,                            /**< Device supports buffer sharing with dma_buf mechanism. */ 
+    CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED = 125,                          /**< Device supports IPC Events. */ 
+    CU_DEVICE_ATTRIBUTE_MEM_SYNC_DOMAIN_COUNT = 126,                        /**< Number of memory domains the device supports. */
+    CU_DEVICE_ATTRIBUTE_TENSOR_MAP_ACCESS_SUPPORTED = 127,                  /**< Device supports accessing memory using Tensor Map. */
+    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED = 128,                 /**< Device supports exporting memory to a fabric handle with cuMemExportToShareableHandle() or requested with cuMemCreate() */
+    CU_DEVICE_ATTRIBUTE_UNIFIED_FUNCTION_POINTERS = 129,                    /**< Device supports unified function pointers. */
+    CU_DEVICE_ATTRIBUTE_NUMA_CONFIG = 130,                                  /**< NUMA configuration of a device: value is of type ::CUdeviceNumaConfig enum */
+    CU_DEVICE_ATTRIBUTE_NUMA_ID = 131,                                      /**< NUMA node ID of the GPU memory */
+    CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED = 132,                          /**< Device supports switch multicast and reduction operations. */
+    CU_DEVICE_ATTRIBUTE_MPS_ENABLED = 133,                                  /**< Indicates if contexts created on this device will be shared via MPS */
+    CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID = 134,                                 /**< NUMA ID of the host node closest to the device. Returns -1 when system does not support NUMA. */
+    CU_DEVICE_ATTRIBUTE_D3D12_CIG_SUPPORTED = 135,                          /**< Device supports CIG with D3D12. */
+    CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_ALGORITHM_MASK = 136,                /**< The returned value shall be interpreted as a bitmask, where the individual bits are described by the ::CUmemDecompressAlgorithm enum. */
+    CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_MAXIMUM_LENGTH = 137,                /**< The returned value is the maximum length in bytes of a single decompress operation that is allowed. */
+    CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID    = 139, /**< The combined 16-bit PCI device ID and 16-bit PCI vendor ID. */
+    CU_DEVICE_ATTRIBUTE_GPU_PCI_SUBSYSTEM_ID = 140, /**< The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem vendor ID. */
+    CU_DEVICE_ATTRIBUTE_HOST_NUMA_MULTINODE_IPC_SUPPORTED = 143,             /**< Device supports HOST_NUMA location IPC between nodes in a multi-node system. */
+    CU_DEVICE_ATTRIBUTE_MAX
+} CUdevice_attribute;
+
+/**
+ * Legacy device properties: a fixed record of int-valued device limits and characteristics
+ */
+typedef struct CUdevprop_st {
+    int maxThreadsPerBlock;     /**< Maximum number of threads per block */
+    int maxThreadsDim[3];       /**< Maximum size of each of the three dimensions of a block */
+    int maxGridSize[3];         /**< Maximum size of each of the three dimensions of a grid */
+    int sharedMemPerBlock;      /**< Shared memory available per block, in bytes */
+    int totalConstantMemory;    /**< Constant memory available on the device, in bytes */
+    int SIMDWidth;              /**< Warp size, in threads */
+    int memPitch;               /**< Maximum pitch allowed by memory copies, in bytes */
+    int regsPerBlock;           /**< Number of 32-bit registers available per block */
+    int clockRate;              /**< Clock frequency, in kilohertz */
+    int textureAlign;           /**< Alignment requirement for textures (unit not stated here; presumably bytes - TODO confirm) */
+} CUdevprop_v1;                 /**< Versioned name of struct CUdevprop_st */
+typedef CUdevprop_v1 CUdevprop; /**< Unversioned alias of ::CUdevprop_v1 */
+
+/**
+ * Attributes describing a pointer
+ */
+typedef enum CUpointer_attribute_enum {
+    CU_POINTER_ATTRIBUTE_CONTEXT                    = 1,  /**< ::CUcontext on which the pointer was allocated or registered */
+    CU_POINTER_ATTRIBUTE_MEMORY_TYPE                = 2,  /**< ::CUmemorytype giving the physical location of the pointer */
+    CU_POINTER_ATTRIBUTE_DEVICE_POINTER             = 3,  /**< Address at which the pointer's memory may be accessed on the device */
+    CU_POINTER_ATTRIBUTE_HOST_POINTER               = 4,  /**< Address at which the pointer's memory may be accessed on the host */
+    CU_POINTER_ATTRIBUTE_P2P_TOKENS                 = 5,  /**< Pair of tokens for use with the nv-p2p.h Linux kernel interface */
+    CU_POINTER_ATTRIBUTE_SYNC_MEMOPS                = 6,  /**< Synchronize every synchronous memory operation initiated on this region */
+    CU_POINTER_ATTRIBUTE_BUFFER_ID                  = 7,  /**< Process-wide unique ID for an allocated memory region */
+    CU_POINTER_ATTRIBUTE_IS_MANAGED                 = 8,  /**< Whether the pointer points to managed memory */
+    CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL             = 9,  /**< Ordinal of a device on which the pointer was allocated or registered */
+    CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE = 10, /**< 1 if the pointer maps to an allocation that is suitable for ::cudaIpcGetMemHandle, 0 otherwise */
+    CU_POINTER_ATTRIBUTE_RANGE_START_ADDR           = 11, /**< Starting address for the requested pointer */
+    CU_POINTER_ATTRIBUTE_RANGE_SIZE                 = 12, /**< Size of the address range for the requested pointer */
+    CU_POINTER_ATTRIBUTE_MAPPED                     = 13, /**< 1 if the pointer is in a valid address range that is mapped to a backing allocation, 0 otherwise */
+    CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES       = 14, /**< Bitmask of the ::CUmemAllocationHandleType values allowed for this allocation */
+    CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE = 15, /**< 1 if the memory referenced by the pointer can be used with the GPUDirect RDMA API */
+    CU_POINTER_ATTRIBUTE_ACCESS_FLAGS               = 16, /**< Access flags that the device associated with the current context has on the memory referenced by the given pointer */
+    CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE             = 17, /**< Mempool handle for the allocation if it was allocated from a mempool; NULL otherwise */
+    CU_POINTER_ATTRIBUTE_MAPPING_SIZE               = 18, /**< Size of the actual underlying mapping that the pointer belongs to */
+    CU_POINTER_ATTRIBUTE_MAPPING_BASE_ADDR          = 19, /**< Start address of the mapping that the pointer belongs to */
+    CU_POINTER_ATTRIBUTE_MEMORY_BLOCK_ID            = 20, /**< Process-wide unique ID of the physical allocation that the pointer belongs to */
+    CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE   = 21  /**< Returns in \p *data a boolean indicating whether the pointer points to memory that can be used for hardware accelerated decompression. */
+} CUpointer_attribute;
+
+/**
+ * Function properties
+ */
+typedef enum CUfunction_attribute_enum {
+    /**
+     * The maximum number of threads per block, beyond which a launch of the
+     * function would fail. This number depends on both the function and the
+     * device on which the function is currently loaded.
+     */
+    CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
+
+    /**
+     * The size in bytes of statically-allocated shared memory required by
+     * this function. This does not include dynamically-allocated shared
+     * memory requested by the user at runtime.
+     */
+    CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
+
+    /**
+     * The size in bytes of user-allocated constant memory required by this
+     * function.
+     */
+    CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,
+
+    /**
+     * The size in bytes of local memory used by each thread of this function.
+     */
+    CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,
+
+    /**
+     * The number of registers used by each thread of this function.
+     */
+    CU_FUNC_ATTRIBUTE_NUM_REGS = 4,
+
+    /**
+     * The PTX virtual architecture version for which the function was
+     * compiled. This value is the major PTX version * 10 + the minor PTX
+     * version, so a PTX version 1.3 function would return the value 13.
+     * Note that this may return the undefined value of 0 for cubins
+     * compiled prior to CUDA 3.0.
+     */
+    CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,
+
+    /**
+     * The binary architecture version for which the function was compiled.
+     * This value is the major binary version * 10 + the minor binary version,
+     * so a binary version 1.3 function would return the value 13. Note that
+     * this will return a value of 10 for legacy cubins that do not have a
+     * properly-encoded binary architecture version.
+     */
+    CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,
+
+    /**
+     * The attribute to indicate whether the function has been compiled with
+     * the user-specified option "-Xptxas --dlcm=ca" set.
+     */
+    CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7,
+
+    /**
+     * The maximum size in bytes of dynamically-allocated shared memory that can be used by
+     * this function. If the user-specified dynamic shared memory size is larger than this
+     * value, the launch will fail.
+     * See ::cuFuncSetAttribute, ::cuKernelSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8,
+
+    /**
+     * On devices where the L1 cache and shared memory use the same hardware resources,
+     * this sets the shared memory carveout preference, in percent of the total shared memory.
+     * Refer to ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR.
+     * This is only a hint, and the driver can choose a different ratio if required to execute the function.
+     * See ::cuFuncSetAttribute, ::cuKernelSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9,
+
+    /**
+     * If this attribute is set, the kernel must launch with a valid cluster
+     * size specified.
+     * See ::cuFuncSetAttribute, ::cuKernelSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET = 10,
+
+    /**
+     * The required cluster width in blocks. The values must either all be 0 or
+     * all be positive. The validity of the cluster dimensions is otherwise
+     * checked at launch time.
+     *
+     * If the value is set during compile time, it cannot be set at runtime.
+     * Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
+     * See ::cuFuncSetAttribute, ::cuKernelSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH = 11,
+
+    /**
+     * The required cluster height in blocks. The values must either all be 0 or
+     * all be positive. The validity of the cluster dimensions is otherwise
+     * checked at launch time.
+     *
+     * If the value is set during compile time, it cannot be set at runtime.
+     * Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED.
+     * See ::cuFuncSetAttribute, ::cuKernelSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT = 12,
+
+    /**
+     * The required cluster depth in blocks. The values must either all be 0 or
+     * all be positive. The validity of the cluster dimensions is otherwise
+     * checked at launch time.
+     *
+     * If the value is set during compile time, it cannot be set at runtime.
+     * Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED.
+     * See ::cuFuncSetAttribute, ::cuKernelSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH = 13,
+
+    /**
+     * Whether the function can be launched with non-portable cluster size. 1 is
+     * allowed, 0 is disallowed. A non-portable cluster size may only function
+     * on the specific SKUs the program is tested on. The launch might fail if
+     * the program is run on a different hardware platform.
+     *
+     * CUDA API provides cudaOccupancyMaxActiveClusters to assist with checking
+     * whether the desired size can be launched on the current device.
+     *
+     * Portable Cluster Size
+     *
+     * A portable cluster size is guaranteed to be functional on all compute
+     * capabilities higher than the target compute capability. The portable
+     * cluster size for sm_90 is 8 blocks per cluster. This value may increase
+     * for future compute capabilities.
+     *
+     * The specific hardware unit may support higher cluster sizes that are not
+     * guaranteed to be portable.
+     * See ::cuFuncSetAttribute, ::cuKernelSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED = 14,
+
+    /**
+     * The block scheduling policy of a function. The value type is
+     * CUclusterSchedulingPolicy / cudaClusterSchedulingPolicy.
+     * See ::cuFuncSetAttribute, ::cuKernelSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 15,
+
+    CU_FUNC_ATTRIBUTE_MAX /**< No explicit value: implicitly 16, one more than the last attribute above; presumably an end-of-list marker rather than a queryable attribute - TODO confirm */
+} CUfunction_attribute;
+
+/**
+ * Cache configuration preferences for a function (shared memory versus L1)
+ */
+typedef enum CUfunc_cache_enum {
+    CU_FUNC_CACHE_PREFER_NONE   = 0, /**< Default: no preference between shared memory and L1 */
+    CU_FUNC_CACHE_PREFER_SHARED = 1, /**< Prefer larger shared memory and a smaller L1 cache */
+    CU_FUNC_CACHE_PREFER_L1     = 2, /**< Prefer a larger L1 cache and smaller shared memory */
+    CU_FUNC_CACHE_PREFER_EQUAL  = 3  /**< Prefer an L1 cache and shared memory of equal size */
+} CUfunc_cache;
+
+/**
+ * \deprecated
+ *
+ * Shared memory bank size configurations
+ */
+typedef enum CUsharedconfig_enum {
+    CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE    = 0, /**< Use the default shared memory bank size */
+    CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE  = 1, /**< Use a shared memory bank width of four bytes */
+    CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 2  /**< Use a shared memory bank width of eight bytes */
+} CUsharedconfig;
+
+/**
+ * Shared memory carveout presets, usable as values for ::cuFuncSetAttribute or ::cuKernelSetAttribute
+ */
+typedef enum CUshared_carveout_enum {
+    CU_SHAREDMEM_CARVEOUT_DEFAULT    = -1, /**< Default: no preference between shared memory and L1 */
+    CU_SHAREDMEM_CARVEOUT_MAX_L1     = 0,  /**< Prefer the maximum available L1 cache and minimum shared memory */
+    CU_SHAREDMEM_CARVEOUT_MAX_SHARED = 100 /**< Prefer the maximum available shared memory and minimum L1 cache */
+} CUshared_carveout;
+
+/**
+ * Kinds of memory
+ */
+typedef enum CUmemorytype_enum {
+    CU_MEMORYTYPE_HOST    = 1, /**< Memory on the host */
+    CU_MEMORYTYPE_DEVICE  = 2, /**< Memory on the device */
+    CU_MEMORYTYPE_ARRAY   = 3, /**< Array memory */
+    CU_MEMORYTYPE_UNIFIED = 4  /**< Unified memory (device or host) */
+} CUmemorytype;
+
+/**
+ * Compute Modes (NOTE(review): the value 1 is not assigned to any enumerator below)
+ */
+typedef enum CUcomputemode_enum {
+    CU_COMPUTEMODE_DEFAULT           = 0, /**< Default compute mode (multiple contexts allowed per device) */
+    CU_COMPUTEMODE_PROHIBITED        = 2, /**< Compute-prohibited mode (no contexts can be created on this device at this time) */
+    CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3  /**< Compute-exclusive-process mode (only one context, used by a single process, can be present on this device at a time) */
+} CUcomputemode;
+
+/**
+ * Memory advise values; every SET_* value below has a matching UNSET_* value
+ */
+typedef enum CUmem_advise_enum {
+    CU_MEM_ADVISE_SET_READ_MOSTLY          = 1, /**< Data will mostly be read and only occasionally be written to */
+    CU_MEM_ADVISE_UNSET_READ_MOSTLY        = 2, /**< Undo the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY */
+    CU_MEM_ADVISE_SET_PREFERRED_LOCATION   = 3, /**< Set the preferred location for the data as the specified device */
+    CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4, /**< Clear the preferred location for the data (counterpart of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION) */
+    CU_MEM_ADVISE_SET_ACCESSED_BY          = 5, /**< Data will be accessed by the specified device, so prevent page faults as much as possible */
+    CU_MEM_ADVISE_UNSET_ACCESSED_BY        = 6  /**< Let the Unified Memory subsystem decide on the page faulting policy for the specified device (counterpart of ::CU_MEM_ADVISE_SET_ACCESSED_BY) */
+} CUmem_advise;
+
+typedef enum CUmem_range_attribute_enum {
+    CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY                 = 1, /**< Whether the range will mostly be read and only occasionally be written to */
+    CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION          = 2, /**< Preferred location of the range */
+    CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY                 = 3, /**< Memory range has ::CU_MEM_ADVISE_SET_ACCESSED_BY set for the specified device */
+    CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION      = 4, /**< Last location to which the range was prefetched */
+    CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE     = 5, /**< Preferred location type of the range */
+    CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID       = 6, /**< Preferred location id of the range */
+    CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE = 7, /**< Last location type to which the range was prefetched */
+    CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID   = 8  /**< Last location id to which the range was prefetched */
+} CUmem_range_attribute;
+
+/**
+ * Online compiler and linker options
+ */
+typedef enum CUjit_option_enum
+{
+    /**
+     * Max number of registers that a thread may use.\n
+     * Option type: unsigned int\n
+     * Applies to: compiler only
+     */
+    CU_JIT_MAX_REGISTERS = 0,
+
+    /**
+     * IN: Specifies minimum number of threads per block to target compilation
+     * for\n
+     * OUT: Returns the number of threads the compiler actually targeted.
+     * This restricts the resource utilization of the compiler (e.g. max
+     * registers) such that a block with the given number of threads should be
+     * able to launch based on register limitations. Note, this option does not
+     * currently take into account any other resource limitations, such as
+     * shared memory utilization.\n
+     * Cannot be combined with ::CU_JIT_TARGET.\n
+     * Option type: unsigned int\n
+     * Applies to: compiler only
+     */
+    CU_JIT_THREADS_PER_BLOCK = 1,
+
+    /**
+     * Overwrites the option value with the total wall clock time, in
+     * milliseconds, spent in the compiler and linker\n
+     * Option type: float\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_WALL_TIME = 2,
+
+    /**
+     * Pointer to a buffer in which to print any log messages
+     * that are informational in nature (the buffer size is specified via
+     * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n
+     * Option type: char *\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_INFO_LOG_BUFFER = 3,
+
+    /**
+     * IN: Log buffer size in bytes.  Log messages will be capped at this size
+     * (including null terminator)\n
+     * OUT: Amount of log buffer filled with messages\n
+     * Option type: unsigned int\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4,
+
+    /**
+     * Pointer to a buffer in which to print any log messages that
+     * reflect errors (the buffer size is specified via option
+     * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n
+     * Option type: char *\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_ERROR_LOG_BUFFER = 5,
+
+    /**
+     * IN: Log buffer size in bytes.  Log messages will be capped at this size
+     * (including null terminator)\n
+     * OUT: Amount of log buffer filled with messages\n
+     * Option type: unsigned int\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6,
+
+    /**
+     * Level of optimizations to apply to generated code (0 - 4), with 4
+     * being the default and highest level of optimizations.\n
+     * Option type: unsigned int\n
+     * Applies to: compiler only
+     */
+    CU_JIT_OPTIMIZATION_LEVEL = 7,
+
+    /**
+     * No option value required. Determines the target based on the current
+     * attached context (default)\n
+     * Option type: No option value needed\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_TARGET_FROM_CUCONTEXT = 8,
+
+    /**
+     * Target is chosen based on supplied ::CUjit_target.  Cannot be
+     * combined with ::CU_JIT_THREADS_PER_BLOCK.\n
+     * Option type: unsigned int for enumerated type ::CUjit_target\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_TARGET = 9,
+
+    /**
+     * Specifies choice of fallback strategy if matching cubin is not found.
+     * Choice is based on supplied ::CUjit_fallback.  This option cannot be
+     * used with cuLink* APIs as the linker requires exact matches.\n
+     * Option type: unsigned int for enumerated type ::CUjit_fallback\n
+     * Applies to: compiler only
+     */
+    CU_JIT_FALLBACK_STRATEGY = 10,
+
+    /**
+     * Specifies whether to create debug information in output (-g)
+     * (0: false, default)\n
+     * Option type: int\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_GENERATE_DEBUG_INFO = 11,
+
+    /**
+     * Generate verbose log messages (0: false, default)\n
+     * Option type: int\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_LOG_VERBOSE = 12,
+
+    /**
+     * Generate line number information (-lineinfo) (0: false, default)\n
+     * Option type: int\n
+     * Applies to: compiler only
+     */
+    CU_JIT_GENERATE_LINE_INFO = 13,
+
+    /**
+     * Specifies whether to enable caching explicitly (-dlcm) \n
+     * Choice is based on supplied ::CUjit_cacheMode_enum.\n
+     * Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n
+     * Applies to: compiler only
+     */
+    CU_JIT_CACHE_MODE = 14,
+
+    /**
+     * \deprecated
+     * This jit option is deprecated and should not be used.
+     */
+    CU_JIT_NEW_SM3X_OPT = 15,
+
+    /**
+     * This jit option is used for internal purpose only.
+     */
+    CU_JIT_FAST_COMPILE = 16,
+
+    /**
+     * Array of device symbol names that will be relocated to the corresponding
+     * host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES.\n
+     * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n
+     * When loading a device module, driver will relocate all encountered
+     * unresolved symbols to the host addresses.\n
+     * It is only allowed to register symbols that correspond to unresolved
+     * global variables.\n
+     * It is illegal to register the same device symbol at multiple addresses.\n
+     * Option type: const char **\n
+     * Applies to: dynamic linker only
+     */
+    CU_JIT_GLOBAL_SYMBOL_NAMES = 17,
+
+    /**
+     * Array of host addresses that will be used to relocate corresponding
+     * device symbols stored in ::CU_JIT_GLOBAL_SYMBOL_NAMES.\n
+     * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n
+     * Option type: void **\n
+     * Applies to: dynamic linker only
+     */
+    CU_JIT_GLOBAL_SYMBOL_ADDRESSES = 18,
+
+    /**
+     * Number of entries in ::CU_JIT_GLOBAL_SYMBOL_NAMES and
+     * ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.\n
+     * Option type: unsigned int\n
+     * Applies to: dynamic linker only
+     */
+    CU_JIT_GLOBAL_SYMBOL_COUNT = 19,
+
+    /**
+     * \deprecated
+     * Enable link-time optimization (-dlto) for device code (Disabled by default).\n
+     * This option is not supported on 32-bit platforms.\n
+     * Option type: int\n
+     * Applies to: compiler and linker
+     *
+     * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+     */
+    CU_JIT_LTO = 20,
+
+    /**
+     * \deprecated
+     * Control single-precision denormals (-ftz) support (0: false, default).
+     * 1 : flushes denormal values to zero
+     * 0 : preserves denormal values
+     * Option type: int\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     *
+     * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+     */
+    CU_JIT_FTZ = 21,
+
+    /**
+     * \deprecated
+     * Control single-precision floating-point division and reciprocals
+     * (-prec-div) support (1: true, default).
+     * 1 : Enables the IEEE round-to-nearest mode
+     * 0 : Enables the fast approximation mode
+     * Option type: int\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     *
+     * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+     */
+    CU_JIT_PREC_DIV = 22,
+
+    /**
+     * \deprecated
+     * Control single-precision floating-point square root
+     * (-prec-sqrt) support (1: true, default).
+     * 1 : Enables the IEEE round-to-nearest mode
+     * 0 : Enables the fast approximation mode
+     * Option type: int\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     *
+     * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+     */
+    CU_JIT_PREC_SQRT = 23,
+
+    /**
+     * \deprecated
+     * Enable/Disable the contraction of floating-point multiplies
+     * and adds/subtracts into floating-point multiply-add (-fma)
+     * operations (1: Enable, default; 0: Disable).
+     * Option type: int\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     *
+     * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+     */
+    CU_JIT_FMA = 24,
+
+    /**
+     * \deprecated
+     * Array of kernel names that should be preserved at link time while others
+     * can be removed.\n
+     * Must contain ::CU_JIT_REFERENCED_KERNEL_COUNT entries.\n
+     * Note that kernel names can be mangled by the compiler in which case the
+     * mangled name needs to be specified.\n
+     * Wildcard "*" can be used to represent zero or more characters instead of
+     * specifying the full or mangled name.\n
+     * It is important to note that the wildcard "*" is also added implicitly.
+     * For example, specifying "foo" will match "foobaz", "barfoo", "barfoobaz" and
+     * thus preserve all kernels with those names. This can be avoided by providing
+     * a more specific name like "barfoobaz".\n
+     * Option type: const char **\n
+     * Applies to: dynamic linker only
+     *
+     * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+     */
+    CU_JIT_REFERENCED_KERNEL_NAMES = 25,
+
+    /**
+     * \deprecated
+     * Number of entries in ::CU_JIT_REFERENCED_KERNEL_NAMES array.\n
+     * Option type: unsigned int\n
+     * Applies to: dynamic linker only
+     *
+     * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+     */
+    CU_JIT_REFERENCED_KERNEL_COUNT = 26,
+
+    /**
+     * \deprecated
+     * Array of variable names (__device__ and/or __constant__) that should be
+     * preserved at link time while others can be removed.\n
+     * Must contain ::CU_JIT_REFERENCED_VARIABLE_COUNT entries.\n
+     * Note that variable names can be mangled by the compiler in which case the
+     * mangled name needs to be specified.\n
+     * Wildcard "*" can be used to represent zero or more characters instead of
+     * specifying the full or mangled name.\n
+     * It is important to note that the wildcard "*" is also added implicitly.
+     * For example, specifying "foo" will match "foobaz", "barfoo", "barfoobaz" and
+     * thus preserve all variables with those names. This can be avoided by providing
+     * a more specific name like "barfoobaz".\n
+     * Option type: const char **\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     *
+     * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+     */
+    CU_JIT_REFERENCED_VARIABLE_NAMES = 27,
+
+    /**
+     * \deprecated
+     * Number of entries in ::CU_JIT_REFERENCED_VARIABLE_NAMES array.\n
+     * Option type: unsigned int\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     *
+     * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+     */
+    CU_JIT_REFERENCED_VARIABLE_COUNT = 28,
+
+    /**
+     * \deprecated
+     * This option serves as a hint to enable the JIT compiler/linker
+     * to remove constant (__constant__) and device (__device__) variables
+     * unreferenced in device code (Disabled by default).\n
+     * Note that host references to constant and device variables using APIs like
+     * ::cuModuleGetGlobal() with this option specified may result in undefined behavior unless
+     * the variables are explicitly specified using ::CU_JIT_REFERENCED_VARIABLE_NAMES.\n
+     * Option type: int\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     *
+     * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+     */
+    CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES = 29,
+
+    /**
+     * Generate position independent code (0: false)\n
+     * Option type: int\n
+     * Applies to: compiler only
+     */
+    CU_JIT_POSITION_INDEPENDENT_CODE = 30,
+
+    /**
+     * This option hints to the JIT compiler the minimum number of CTAs from the
+     * kernel’s grid to be mapped to a SM. This option is ignored when used together
+     * with ::CU_JIT_MAX_REGISTERS or ::CU_JIT_THREADS_PER_BLOCK.
+     * Optimizations based on this option need ::CU_JIT_MAX_THREADS_PER_BLOCK to
+     * be specified as well. For kernels already using PTX directive .minnctapersm,
+     * this option will be ignored by default. Use ::CU_JIT_OVERRIDE_DIRECTIVE_VALUES
+     * to let this option take precedence over the PTX directive.
+     * Option type: unsigned int\n
+     * Applies to: compiler only
+    */
+    CU_JIT_MIN_CTA_PER_SM = 31,
+
+     /**
+     * Maximum number of threads in a thread block, computed as the product of
+     * the maximum extent specified for each dimension of the block. This limit
+     * is guaranteed not to be exceeded in any invocation of the kernel. Exceeding
+     * the maximum number of threads results in runtime error or kernel launch
+     * failure. For kernels already using PTX directive .maxntid, this option will
+     * be ignored by default. Use ::CU_JIT_OVERRIDE_DIRECTIVE_VALUES to let this
+     * option take precedence over the PTX directive.
+     * Option type: int\n
+     * Applies to: compiler only
+    */
+    CU_JIT_MAX_THREADS_PER_BLOCK = 32,
+
+    /**
+     * This option lets the values specified using ::CU_JIT_MAX_REGISTERS,
+     * ::CU_JIT_THREADS_PER_BLOCK, ::CU_JIT_MAX_THREADS_PER_BLOCK and
+     * ::CU_JIT_MIN_CTA_PER_SM take precedence over any PTX directives.
+     * (0: Disable, default; 1: Enable)
+     * Option type: int\n
+     * Applies to: compiler only
+    */
+    CU_JIT_OVERRIDE_DIRECTIVE_VALUES = 33,
+    CU_JIT_NUM_OPTIONS
+
+} CUjit_option;
+
+/*
+ * Indicates that compute device class supports accelerated features.
+ */
+#define CU_COMPUTE_ACCELERATED_TARGET_BASE   0x10000
+
+/**
+ * Online compilation targets
+ */
+typedef enum CUjit_target_enum
+{
+    CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */
+    CU_TARGET_COMPUTE_32 = 32, /**< Compute device class 3.2 */
+    CU_TARGET_COMPUTE_35 = 35, /**< Compute device class 3.5 */
+    CU_TARGET_COMPUTE_37 = 37, /**< Compute device class 3.7 */
+    CU_TARGET_COMPUTE_50 = 50, /**< Compute device class 5.0 */
+    CU_TARGET_COMPUTE_52 = 52, /**< Compute device class 5.2 */
+    CU_TARGET_COMPUTE_53 = 53, /**< Compute device class 5.3 */
+    CU_TARGET_COMPUTE_60 = 60, /**< Compute device class 6.0.*/
+    CU_TARGET_COMPUTE_61 = 61, /**< Compute device class 6.1.*/
+    CU_TARGET_COMPUTE_62 = 62, /**< Compute device class 6.2.*/
+    CU_TARGET_COMPUTE_70 = 70, /**< Compute device class 7.0.*/
+    CU_TARGET_COMPUTE_72 = 72, /**< Compute device class 7.2.*/
+    CU_TARGET_COMPUTE_75 = 75, /**< Compute device class 7.5.*/
+    CU_TARGET_COMPUTE_80 = 80, /**< Compute device class 8.0.*/
+    CU_TARGET_COMPUTE_86 = 86, /**< Compute device class 8.6.*/
+    CU_TARGET_COMPUTE_87 = 87, /**< Compute device class 8.7.*/
+    CU_TARGET_COMPUTE_89 = 89, /**< Compute device class 8.9.*/
+    CU_TARGET_COMPUTE_90 = 90, /**< Compute device class 9.0.*/
+    CU_TARGET_COMPUTE_100 = 100, /**< Compute device class 10.0.*/
+    CU_TARGET_COMPUTE_101 = 101,       /**< Compute device class 10.1.*/
+    CU_TARGET_COMPUTE_120 = 120, /**< Compute device class 12.0.*/
+
+    /** Compute device class 9.0 with accelerated features. */
+    CU_TARGET_COMPUTE_90A = CU_COMPUTE_ACCELERATED_TARGET_BASE + CU_TARGET_COMPUTE_90,
+    /** Compute device class 10.0 with accelerated features. */
+    CU_TARGET_COMPUTE_100A = CU_COMPUTE_ACCELERATED_TARGET_BASE + CU_TARGET_COMPUTE_100,
+    /** Compute device class 10.1 with accelerated features. */
+    CU_TARGET_COMPUTE_101A = CU_COMPUTE_ACCELERATED_TARGET_BASE + CU_TARGET_COMPUTE_101,
+    /** Compute device class 12.0 with accelerated features. */
+    CU_TARGET_COMPUTE_120A = CU_COMPUTE_ACCELERATED_TARGET_BASE + CU_TARGET_COMPUTE_120,
+} CUjit_target;
+
+/**
+ * Cubin matching fallback strategies
+ */
+typedef enum CUjit_fallback_enum
+{
+    CU_PREFER_PTX = 0,  /**< Prefer to compile ptx if exact binary match not found */
+
+    CU_PREFER_BINARY    /**< Prefer to fall back to compatible binary code if exact match not found */
+
+} CUjit_fallback;
+
+/**
+ * Caching modes for dlcm
+ */
+typedef enum CUjit_cacheMode_enum
+{
+    CU_JIT_CACHE_OPTION_NONE = 0, /**< Compile with no -dlcm flag specified */
+    CU_JIT_CACHE_OPTION_CG,       /**< Compile with L1 cache disabled */
+    CU_JIT_CACHE_OPTION_CA        /**< Compile with L1 cache enabled */
+} CUjit_cacheMode;
+
+/**
+ * Device code formats
+ */
+typedef enum CUjitInputType_enum
+{
+    /**
+     * Compiled device-class-specific device code\n
+     * Applicable options: none
+     */
+    CU_JIT_INPUT_CUBIN = 0,
+
+    /**
+     * PTX source code\n
+     * Applicable options: PTX compiler options
+     */
+    CU_JIT_INPUT_PTX = 1,
+
+    /**
+     * Bundle of multiple cubins and/or PTX of some device code\n
+     * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
+     */
+    CU_JIT_INPUT_FATBINARY = 2,
+
+    /**
+     * Host object with embedded device code\n
+     * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
+     */
+    CU_JIT_INPUT_OBJECT = 3,
+
+    /**
+     * Archive of host objects with embedded device code\n
+     * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
+     */
+    CU_JIT_INPUT_LIBRARY = 4,
+
+    /**
+     * \deprecated
+     * High-level intermediate code for link-time optimization\n
+     * Applicable options: NVVM compiler options, PTX compiler options
+     *
+     * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+     */
+    CU_JIT_INPUT_NVVM = 5,
+
+    CU_JIT_NUM_INPUT_TYPES = 6
+} CUjitInputType;
+
+typedef struct CUlinkState_st *CUlinkState;
+
+/**
+ * Flags to register a graphics resource
+ */
+typedef enum CUgraphicsRegisterFlags_enum {
+    CU_GRAPHICS_REGISTER_FLAGS_NONE           = 0x00,
+    CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY      = 0x01,
+    CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD  = 0x02,
+    CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST   = 0x04,
+    CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08
+} CUgraphicsRegisterFlags;
+
+/**
+ * Flags for mapping and unmapping interop resources
+ */
+typedef enum CUgraphicsMapResourceFlags_enum {
+    CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE          = 0x00,
+    CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY     = 0x01,
+    CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02
+} CUgraphicsMapResourceFlags;
+
+/**
+ * Array indices for cube faces
+ */
+typedef enum CUarray_cubemap_face_enum {
+    CU_CUBEMAP_FACE_POSITIVE_X  = 0x00, /**< Positive X face of cubemap */
+    CU_CUBEMAP_FACE_NEGATIVE_X  = 0x01, /**< Negative X face of cubemap */
+    CU_CUBEMAP_FACE_POSITIVE_Y  = 0x02, /**< Positive Y face of cubemap */
+    CU_CUBEMAP_FACE_NEGATIVE_Y  = 0x03, /**< Negative Y face of cubemap */
+    CU_CUBEMAP_FACE_POSITIVE_Z  = 0x04, /**< Positive Z face of cubemap */
+    CU_CUBEMAP_FACE_NEGATIVE_Z  = 0x05  /**< Negative Z face of cubemap */
+} CUarray_cubemap_face;
+
+/**
+ * Limits
+ */
+typedef enum CUlimit_enum {
+    CU_LIMIT_STACK_SIZE                       = 0x00, /**< GPU thread stack size */
+    CU_LIMIT_PRINTF_FIFO_SIZE                 = 0x01, /**< GPU printf FIFO size */
+    CU_LIMIT_MALLOC_HEAP_SIZE                 = 0x02, /**< GPU malloc heap size */
+    CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH           = 0x03, /**< GPU device runtime launch synchronize depth */
+    CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, /**< GPU device runtime pending launch count */
+    CU_LIMIT_MAX_L2_FETCH_GRANULARITY         = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). This is a hint */
+    CU_LIMIT_PERSISTING_L2_CACHE_SIZE         = 0x06, /**< A size in bytes for L2 persisting lines cache size */
+    CU_LIMIT_SHMEM_SIZE                       = 0x07, /**< A maximum size in bytes of shared memory available to CUDA kernels on a CIG context. Can only be queried, cannot be set */
+    CU_LIMIT_CIG_ENABLED                      = 0x08, /**< A non-zero value indicates this CUDA context is a CIG-enabled context. Can only be queried, cannot be set */
+    CU_LIMIT_CIG_SHMEM_FALLBACK_ENABLED       = 0x09, /**< When set to zero, CUDA will fail to launch a kernel on a CIG context, instead of using the fallback path, if the kernel uses more shared memory than available */
+    CU_LIMIT_MAX
+} CUlimit;
+
+/**
+ * Resource types
+ */
+typedef enum CUresourcetype_enum {
+    CU_RESOURCE_TYPE_ARRAY           = 0x00, /**< Array resource */
+    CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */
+    CU_RESOURCE_TYPE_LINEAR          = 0x02, /**< Linear resource */
+    CU_RESOURCE_TYPE_PITCH2D         = 0x03  /**< Pitch 2D resource */
+} CUresourcetype;
+
+#ifdef _WIN32
+#define CUDA_CB __stdcall
+#else
+#define CUDA_CB
+#endif
+
+/**
+ * CUDA host function
+ * \param userData Argument value passed to the function
+ */
+typedef void (CUDA_CB *CUhostFn)(void *userData);
+
+/**
+ * Specifies performance hint with ::CUaccessPolicyWindow for hitProp and missProp members.
+ */
+typedef enum CUaccessProperty_enum {
+    CU_ACCESS_PROPERTY_NORMAL           = 0,    /**< Normal cache persistence. */
+    CU_ACCESS_PROPERTY_STREAMING        = 1,    /**< Streaming access is less likely to persist in cache. */
+    CU_ACCESS_PROPERTY_PERSISTING       = 2     /**< Persisting access is more likely to persist in cache. */
+} CUaccessProperty;
+
+/**
+ * Specifies an access policy for a window, a contiguous extent of memory
+ * beginning at base_ptr and ending at base_ptr + num_bytes.
+ * num_bytes is limited by CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE.
+ * Partition into many segments and assign segments such that:
+ * sum of "hit segments" / window == approx. ratio.
+ * sum of "miss segments" / window == approx 1-ratio.
+ * Segments and ratio specifications are fitted to the capabilities of
+ * the architecture.
+ * Accesses in a hit segment apply the hitProp access policy.
+ * Accesses in a miss segment apply the missProp access policy.
+ */
+typedef struct CUaccessPolicyWindow_st {
+    void *base_ptr;                     /**< Starting address of the access policy window. CUDA driver may align it. */
+    size_t num_bytes;                   /**< Size in bytes of the window policy. CUDA driver may restrict the maximum size and alignment. */
+    float hitRatio;                     /**< hitRatio specifies percentage of lines assigned hitProp, rest are assigned missProp. */
+    CUaccessProperty hitProp;           /**< ::CUaccessProperty set for hit. */
+    CUaccessProperty missProp;          /**< ::CUaccessProperty set for miss. Must be either NORMAL or STREAMING */
+} CUaccessPolicyWindow_v1;
+/**
+ * Access policy window
+ */
+typedef CUaccessPolicyWindow_v1 CUaccessPolicyWindow;
+
+/**
+ * GPU kernel node parameters
+ */
+typedef struct CUDA_KERNEL_NODE_PARAMS_st {
+    CUfunction func;             /**< Kernel to launch */
+    unsigned int gridDimX;       /**< Width of grid in blocks */
+    unsigned int gridDimY;       /**< Height of grid in blocks */
+    unsigned int gridDimZ;       /**< Depth of grid in blocks */
+    unsigned int blockDimX;      /**< X dimension of each thread block */
+    unsigned int blockDimY;      /**< Y dimension of each thread block */
+    unsigned int blockDimZ;      /**< Z dimension of each thread block */
+    unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */
+    void **kernelParams;         /**< Array of pointers to kernel parameters */
+    void **extra;                /**< Extra options */
+} CUDA_KERNEL_NODE_PARAMS_v1;
+
+/**
+ * GPU kernel node parameters
+ */
+typedef struct CUDA_KERNEL_NODE_PARAMS_v2_st {
+    CUfunction func;             /**< Kernel to launch */
+    unsigned int gridDimX;       /**< Width of grid in blocks */
+    unsigned int gridDimY;       /**< Height of grid in blocks */
+    unsigned int gridDimZ;       /**< Depth of grid in blocks */
+    unsigned int blockDimX;      /**< X dimension of each thread block */
+    unsigned int blockDimY;      /**< Y dimension of each thread block */
+    unsigned int blockDimZ;      /**< Z dimension of each thread block */
+    unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */
+    void **kernelParams;         /**< Array of pointers to kernel parameters */
+    void **extra;                /**< Extra options */
+    CUkernel kern;               /**< Kernel to launch, will only be referenced if func is NULL */
+    CUcontext ctx;               /**< Context for the kernel task to run in. The value NULL will indicate the current context should be used by the api. This field is ignored if func is set. */
+} CUDA_KERNEL_NODE_PARAMS_v2;
+typedef CUDA_KERNEL_NODE_PARAMS_v2 CUDA_KERNEL_NODE_PARAMS;
+
+/**
+ * GPU kernel node parameters
+ */
+typedef struct CUDA_KERNEL_NODE_PARAMS_v3_st {
+    CUfunction func;             /**< Kernel to launch */
+    unsigned int gridDimX;       /**< Width of grid in blocks */
+    unsigned int gridDimY;       /**< Height of grid in blocks */
+    unsigned int gridDimZ;       /**< Depth of grid in blocks */
+    unsigned int blockDimX;      /**< X dimension of each thread block */
+    unsigned int blockDimY;      /**< Y dimension of each thread block */
+    unsigned int blockDimZ;      /**< Z dimension of each thread block */
+    unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */
+    void **kernelParams;         /**< Array of pointers to kernel parameters */
+    void **extra;                /**< Extra options */
+    CUkernel kern;               /**< Kernel to launch, will only be referenced if func is NULL */
+    CUcontext ctx;               /**< Context for the kernel task to run in. The value NULL will indicate the current context should be used by the api. This field is ignored if func is set. */
+} CUDA_KERNEL_NODE_PARAMS_v3;
+
+/**
+ * Memset node parameters
+ */
+typedef struct CUDA_MEMSET_NODE_PARAMS_st {
+    CUdeviceptr dst;                        /**< Destination device pointer */
+    size_t pitch;                           /**< Pitch of destination device pointer. Unused if height is 1 */
+    unsigned int value;                     /**< Value to be set */
+    unsigned int elementSize;               /**< Size of each element in bytes. Must be 1, 2, or 4. */
+    size_t width;                           /**< Width of the row in elements */
+    size_t height;                          /**< Number of rows */
+} CUDA_MEMSET_NODE_PARAMS_v1;
+typedef CUDA_MEMSET_NODE_PARAMS_v1 CUDA_MEMSET_NODE_PARAMS;
+
+/**
+ * Memset node parameters
+ */
+typedef struct CUDA_MEMSET_NODE_PARAMS_v2_st {
+    CUdeviceptr dst;                        /**< Destination device pointer */
+    size_t pitch;                           /**< Pitch of destination device pointer. Unused if height is 1 */
+    unsigned int value;                     /**< Value to be set */
+    unsigned int elementSize;               /**< Size of each element in bytes. Must be 1, 2, or 4. */
+    size_t width;                           /**< Width of the row in elements */
+    size_t height;                          /**< Number of rows */
+    CUcontext ctx;                          /**< Context on which to run the node */
+} CUDA_MEMSET_NODE_PARAMS_v2;
+
+/**
+ * Host node parameters
+ */
+typedef struct CUDA_HOST_NODE_PARAMS_st {
+    CUhostFn fn;    /**< The function to call when the node executes */
+    void* userData; /**< Argument to pass to the function */
+} CUDA_HOST_NODE_PARAMS_v1;
+typedef CUDA_HOST_NODE_PARAMS_v1 CUDA_HOST_NODE_PARAMS;
+
+/**
+ * Host node parameters
+ */
+typedef struct CUDA_HOST_NODE_PARAMS_v2_st {
+    CUhostFn fn;    /**< The function to call when the node executes */
+    void* userData; /**< Argument to pass to the function */
+} CUDA_HOST_NODE_PARAMS_v2;
+
+/**
+ * Conditional node handle flags
+ */
+#define CU_GRAPH_COND_ASSIGN_DEFAULT   0x1 /**< Default value is applied when graph is launched. */
+
+/**
+ * Conditional node types
+ */
+typedef enum CUgraphConditionalNodeType_enum {
+     CU_GRAPH_COND_TYPE_IF = 0,     /**< Conditional 'if/else' Node. Body[0] executed if condition is non-zero.  If \p size == 2, an optional ELSE graph is created and this is executed if the condition is zero. */
+     CU_GRAPH_COND_TYPE_WHILE = 1,  /**< Conditional 'while' Node. Body executed repeatedly while condition value is non-zero. */
+     CU_GRAPH_COND_TYPE_SWITCH = 2, /**< Conditional 'switch' Node. Body[n] is executed once, where 'n' is the value of the condition. If the condition does not match a body index, no body is launched. */
+} CUgraphConditionalNodeType;
+
+/**
+ * Conditional node parameters
+ */
+typedef struct CUDA_CONDITIONAL_NODE_PARAMS {
+    CUgraphConditionalHandle handle;   /**< Conditional node handle.
+                                            Handles must be created in advance of creating the node
+                                            using ::cuGraphConditionalHandleCreate. */
+    CUgraphConditionalNodeType type;   /**< Type of conditional node. */
+    unsigned int size;                 /**< Size of graph output array.  Allowed values are 1 for CU_GRAPH_COND_TYPE_WHILE, 1 or 2
+                                            for CU_GRAPH_COND_TYPE_IF, or any value greater than zero for CU_GRAPH_COND_TYPE_SWITCH. */
+    CUgraph *phGraph_out;              /**< CUDA-owned array populated with conditional node child graphs during creation of the node.
+                                            Valid for the lifetime of the conditional node.
+                                            The contents of the graph(s) are subject to the following constraints:
+
+                                            - Allowed node types are kernel nodes, empty nodes, child graphs, memsets,
+                                              memcopies, and conditionals. This applies recursively to child graphs and conditional bodies.
+                                            - All kernels, including kernels in nested conditionals or child graphs at any level,
+                                              must belong to the same CUDA context.
+
+                                            These graphs may be populated using graph node creation APIs or ::cuStreamBeginCaptureToGraph.
+
+                                            CU_GRAPH_COND_TYPE_IF:
+                                            phGraph_out[0] is executed when the condition is non-zero.  If \p size == 2, phGraph_out[1] will
+                                            be executed when the condition is zero.
+                                            CU_GRAPH_COND_TYPE_WHILE:
+                                            phGraph_out[0] is executed as long as the condition is non-zero.
+                                            CU_GRAPH_COND_TYPE_SWITCH:
+                                            phGraph_out[n] is executed when the condition is equal to n.  If the condition >= \p size,
+                                            no body graph is executed.
+                                         */
+    CUcontext ctx;                     /**< Context on which to run the node.  Must match context used to create the handle and all body nodes. */
+} CUDA_CONDITIONAL_NODE_PARAMS;
+
+/**
+ * Graph node types
+ */
+typedef enum CUgraphNodeType_enum {
+    CU_GRAPH_NODE_TYPE_KERNEL           = 0, /**< GPU kernel node */
+    CU_GRAPH_NODE_TYPE_MEMCPY           = 1, /**< Memcpy node */
+    CU_GRAPH_NODE_TYPE_MEMSET           = 2, /**< Memset node */
+    CU_GRAPH_NODE_TYPE_HOST             = 3, /**< Host (executable) node */
+    CU_GRAPH_NODE_TYPE_GRAPH            = 4, /**< Node which executes an embedded graph */
+    CU_GRAPH_NODE_TYPE_EMPTY            = 5, /**< Empty (no-op) node */
+    CU_GRAPH_NODE_TYPE_WAIT_EVENT       = 6, /**< External event wait node */
+    CU_GRAPH_NODE_TYPE_EVENT_RECORD     = 7, /**< External event record node */
+    CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL = 8, /**< External semaphore signal node */
+    CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT   = 9, /**< External semaphore wait node */
+    CU_GRAPH_NODE_TYPE_MEM_ALLOC        = 10,/**< Memory Allocation Node */
+    CU_GRAPH_NODE_TYPE_MEM_FREE         = 11,/**< Memory Free Node */
+    CU_GRAPH_NODE_TYPE_BATCH_MEM_OP     = 12,/**< Batch MemOp Node */
+    CU_GRAPH_NODE_TYPE_CONDITIONAL      = 13 /**< Conditional Node
+
+                                                  May be used to implement a conditional execution path or loop
+                                                  inside of a graph. The graph(s) contained within the body of the conditional node
+                                                  can be selectively executed or iterated upon based on the value of a conditional
+                                                  variable.
+
+                                                  Handles must be created in advance of creating the node
+                                                  using ::cuGraphConditionalHandleCreate.
+
+                                                  The following restrictions apply to graphs which contain conditional nodes:
+                                                   The graph cannot be used in a child node.
+                                                   Only one instantiation of the graph may exist at any point in time.
+                                                   The graph cannot be cloned.
+
+                                                  To set the control value, supply a default value when creating the handle and/or
+                                                  call ::cudaGraphSetConditional from device code.*/
+} CUgraphNodeType;
+
+/**
+ * Type annotations that can be applied to graph edges as part of ::CUgraphEdgeData.
+ */
+typedef enum CUgraphDependencyType_enum {
+    CU_GRAPH_DEPENDENCY_TYPE_DEFAULT = 0, /**< This is an ordinary dependency. */
+    CU_GRAPH_DEPENDENCY_TYPE_PROGRAMMATIC = 1  /**< This dependency type allows the downstream node to
+                                                    use \c cudaGridDependencySynchronize(). It may only be used
+                                                    between kernel nodes, and must be used with either the
+                                                    ::CU_GRAPH_KERNEL_NODE_PORT_PROGRAMMATIC or
+                                                    ::CU_GRAPH_KERNEL_NODE_PORT_LAUNCH_ORDER outgoing port. */
+} CUgraphDependencyType;
+
+/**
+ * This port activates when the kernel has finished executing.
+ */
+#define CU_GRAPH_KERNEL_NODE_PORT_DEFAULT 0
+/**
+ * This port activates when all blocks of the kernel have performed cudaTriggerProgrammaticLaunchCompletion()
+ * or have terminated. It must be used with edge type ::CU_GRAPH_DEPENDENCY_TYPE_PROGRAMMATIC. See also
+ * ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT.
+ */
+#define CU_GRAPH_KERNEL_NODE_PORT_PROGRAMMATIC 1
+/**
+ * This port activates when all blocks of the kernel have begun execution. See also
+ * ::CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT.
+ */
+#define CU_GRAPH_KERNEL_NODE_PORT_LAUNCH_ORDER 2
+
+/**
+ * Optional annotation for edges in a CUDA graph. Note, all edges implicitly have annotations and
+ * default to a zero-initialized value if not specified. A zero-initialized struct indicates a
+ * standard full serialization of two nodes with memory visibility.
+ */
+typedef struct CUgraphEdgeData_st {
+    unsigned char from_port; /**< This indicates when the dependency is triggered from the upstream
+                                  node on the edge. The meaning is specific to the node type. A value
+                                  of 0 in all cases means full completion of the upstream node, with
+                                  memory visibility to the downstream node or portion thereof
+                                  (indicated by \c to_port).
+                                  <br>
+                                  Only kernel nodes define non-zero ports. A kernel node
+                                  can use the following output port types:
+                                  ::CU_GRAPH_KERNEL_NODE_PORT_DEFAULT, ::CU_GRAPH_KERNEL_NODE_PORT_PROGRAMMATIC,
+                                  or ::CU_GRAPH_KERNEL_NODE_PORT_LAUNCH_ORDER. */
+    unsigned char to_port; /**< This indicates what portion of the downstream node is dependent on
+                                the upstream node or portion thereof (indicated by \c from_port). The
+                                meaning is specific to the node type. A value of 0 in all cases means
+                                the entirety of the downstream node is dependent on the upstream work.
+                                <br>
+                                Currently no node types define non-zero ports. Accordingly, this field
+                                must be set to zero. */
+    unsigned char type; /**< This should be populated with a value from ::CUgraphDependencyType. (It
+                             is typed as char due to compiler-specific layout of bitfields.) See
+                             ::CUgraphDependencyType. */
+    unsigned char reserved[5]; /**< These bytes are unused and must be zeroed. This ensures
+                                    compatibility if additional fields are added in the future. */
+} CUgraphEdgeData;
+
+/**
+ * Graph instantiation results
+*/
+typedef enum CUgraphInstantiateResult_enum
+{
+    CUDA_GRAPH_INSTANTIATE_SUCCESS = 0,                          /**< Instantiation succeeded */
+    CUDA_GRAPH_INSTANTIATE_ERROR = 1,                            /**< Instantiation failed for an unexpected reason which is described in the return value of the function */
+    CUDA_GRAPH_INSTANTIATE_INVALID_STRUCTURE = 2,                /**< Instantiation failed due to invalid structure, such as cycles */
+    CUDA_GRAPH_INSTANTIATE_NODE_OPERATION_NOT_SUPPORTED = 3,     /**< Instantiation for device launch failed because the graph contained an unsupported operation */
+    CUDA_GRAPH_INSTANTIATE_MULTIPLE_CTXS_NOT_SUPPORTED = 4,      /**< Instantiation for device launch failed due to the nodes belonging to different contexts */
+    CUDA_GRAPH_INSTANTIATE_CONDITIONAL_HANDLE_UNUSED = 5,        /**< One or more conditional handles are not associated with conditional nodes */
+} CUgraphInstantiateResult;
+
+/**
+ * Graph instantiation parameters
+ */
+typedef struct CUDA_GRAPH_INSTANTIATE_PARAMS_st
+{
+	cuuint64_t flags;                    /**< Instantiation flags */
+	CUstream hUploadStream;              /**< Upload stream */
+	CUgraphNode hErrNode_out;            /**< The node which caused instantiation to fail, if any */
+	CUgraphInstantiateResult result_out; /**< Whether instantiation was successful.  If it failed, the reason why */
+} CUDA_GRAPH_INSTANTIATE_PARAMS;
+
+typedef enum CUsynchronizationPolicy_enum {
+    CU_SYNC_POLICY_AUTO = 1,
+    CU_SYNC_POLICY_SPIN = 2,
+    CU_SYNC_POLICY_YIELD = 3,
+    CU_SYNC_POLICY_BLOCKING_SYNC = 4
+} CUsynchronizationPolicy;
+
+/**
+ * Cluster scheduling policies. These may be passed to ::cuFuncSetAttribute or ::cuKernelSetAttribute
+ */
+typedef enum CUclusterSchedulingPolicy_enum {
+    CU_CLUSTER_SCHEDULING_POLICY_DEFAULT        = 0, /**< the default policy */
+    CU_CLUSTER_SCHEDULING_POLICY_SPREAD         = 1, /**< spread the blocks within a cluster to the SMs */
+    CU_CLUSTER_SCHEDULING_POLICY_LOAD_BALANCING = 2  /**< allow the hardware to load-balance the blocks in a cluster to the SMs */
+} CUclusterSchedulingPolicy;
+
+/**
+ * Memory Synchronization Domain
+ *
+ * A kernel can be launched in a specified memory synchronization domain that affects all memory operations issued by
+ * that kernel. A memory barrier issued in one domain will only order memory operations in that domain, thus eliminating
+ * latency increase from memory barriers ordering unrelated traffic.
+ *
+ * By default, kernels are launched in domain 0. Kernels launched with ::CU_LAUNCH_MEM_SYNC_DOMAIN_REMOTE will have a
+ * different domain ID. User may also alter the domain ID with ::CUlaunchMemSyncDomainMap for a specific stream /
+ * graph node / kernel launch. See ::CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN, ::cuStreamSetAttribute, ::cuLaunchKernelEx,
+ * ::cuGraphKernelNodeSetAttribute.
+ *
+ * Memory operations done in kernels launched in different domains are considered system-scope distanced. In other
+ * words, a GPU scoped memory synchronization is not sufficient for memory order to be observed by kernels in another
+ * memory synchronization domain even if they are on the same GPU.
+ */
+typedef enum CUlaunchMemSyncDomain_enum {
+    CU_LAUNCH_MEM_SYNC_DOMAIN_DEFAULT = 0,    /**< Launch kernels in the default domain */
+    CU_LAUNCH_MEM_SYNC_DOMAIN_REMOTE  = 1     /**< Launch kernels in the remote domain */
+} CUlaunchMemSyncDomain;
+
+/**
+ * Memory Synchronization Domain map
+ *
+ * See ::CUlaunchMemSyncDomain.
+ *
+ * By default, kernels are launched in domain 0. Kernels launched with ::CU_LAUNCH_MEM_SYNC_DOMAIN_REMOTE will have a
+ * different domain ID. User may also alter the domain ID with ::CUlaunchMemSyncDomainMap for a specific stream /
+ * graph node / kernel launch. See ::CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP.
+ *
+ * Domain ID range is available through ::CU_DEVICE_ATTRIBUTE_MEM_SYNC_DOMAIN_COUNT.
+ */
+typedef struct CUlaunchMemSyncDomainMap_st {
+    unsigned char default_;     /**< The default domain ID to use for designated kernels */
+    unsigned char remote;       /**< The remote domain ID to use for designated kernels */
+} CUlaunchMemSyncDomainMap;
+
+/**
+ * Launch attributes enum; used as id field of ::CUlaunchAttribute
+ */
+typedef enum CUlaunchAttributeID_enum {
+    CU_LAUNCH_ATTRIBUTE_IGNORE = 0 /**< Ignored entry, for convenient composition */
+  , CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW   = 1 /**< Valid for streams, graph nodes, launches. See
+                                                      ::CUlaunchAttributeValue::accessPolicyWindow. */
+  , CU_LAUNCH_ATTRIBUTE_COOPERATIVE            = 2 /**< Valid for graph nodes, launches. See
+                                                      ::CUlaunchAttributeValue::cooperative. */
+  , CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY = 3 /**< Valid for streams. See
+                                                      ::CUlaunchAttributeValue::syncPolicy. */
+  , CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION                    = 4 /**< Valid for graph nodes, launches. See ::CUlaunchAttributeValue::clusterDim. */
+  , CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 5 /**< Valid for graph nodes, launches. See ::CUlaunchAttributeValue::clusterSchedulingPolicyPreference. */
+  , CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION    = 6 /**< Valid for launches. Setting
+                                                                  ::CUlaunchAttributeValue::programmaticStreamSerializationAllowed
+                                                                  to non-0 signals that the kernel will use programmatic
+                                                                  means to resolve its stream dependency, so that the
+                                                                  CUDA runtime should opportunistically allow the grid's
+                                                                  execution to overlap with the previous kernel in the
+                                                                  stream, if that kernel requests the overlap. The
+                                                                  dependent launches can choose to wait on the
+                                                                  dependency using the programmatic sync
+                                                                  (cudaGridDependencySynchronize() or equivalent PTX
+                                                                  instructions). */
+  , CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT                   = 7 /**< Valid for launches. Set
+                                                                      ::CUlaunchAttributeValue::programmaticEvent to
+                                                                      record the event. Event recorded through this
+                                                                      launch attribute is guaranteed to only trigger
+                                                                      after all blocks in the associated kernel trigger
+                                                                      the event. A block can trigger the event through
+                                                                      PTX launchdep.release or CUDA builtin function
+                                                                      cudaTriggerProgrammaticLaunchCompletion(). A
+                                                                      trigger can also be inserted at the beginning of
+                                                                      each block's execution if triggerAtBlockStart is
+                                                                      set to non-0. The dependent launches can choose to
+                                                                      wait on the dependency using the programmatic sync
+                                                                      (cudaGridDependencySynchronize() or equivalent PTX
+                                                                      instructions). Note that dependents (including the
+                                                                      CPU thread calling cuEventSynchronize()) are not
+                                                                      guaranteed to observe the release precisely when
+                                                                      it is released.  For example, cuEventSynchronize()
+                                                                      may only observe the event trigger long after the
+                                                                      associated kernel has completed. This recording
+                                                                      type is primarily meant for establishing
+                                                                      programmatic dependency between device tasks. Note
+                                                                      also this type of dependency allows, but does not
+                                                                      guarantee, concurrent execution of tasks.
+                                                                      <br>
+                                                                      The event supplied must not be an interprocess or
+                                                                      interop event. The event must disable timing (i.e.
+                                                                      must be created with the ::CU_EVENT_DISABLE_TIMING
+                                                                      flag set).
+                                                                      */
+  , CU_LAUNCH_ATTRIBUTE_PRIORITY               = 8 /**< Valid for streams, graph nodes, launches. See
+                                                        ::CUlaunchAttributeValue::priority. */
+  , CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP    = 9 /**< Valid for streams, graph nodes, launches. See
+                                                      ::CUlaunchAttributeValue::memSyncDomainMap. */
+  , CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN        = 10 /**< Valid for streams, graph nodes, launches. See
+                                                       ::CUlaunchAttributeValue::memSyncDomain. */
+  , CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION = 11 /**< Valid for graph nodes, launches. Set
+                                                              ::CUlaunchAttributeValue::preferredClusterDim
+                                                              to allow the kernel launch to specify a preferred substitute
+                                                              cluster dimension. Blocks may be grouped according to either
+                                                              the dimensions specified with this attribute (grouped into a
+                                                              "preferred substitute cluster"), or the one specified with
+                                                              ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION attribute (grouped
+                                                              into a "regular cluster"). The cluster dimensions of a
+                                                              "preferred substitute cluster" shall be an integer multiple
+                                                              greater than zero of the regular cluster dimensions. The
+                                                              device will attempt - on a best-effort basis - to group
+                                                              thread blocks into preferred clusters over grouping them
+                                                              into regular clusters. When it deems necessary (primarily
+                                                              when the device temporarily runs out of physical resources
+                                                              to launch the larger preferred clusters), the device may
+                                                              switch to launch the regular clusters instead to attempt to
+                                                              utilize as much of the physical device resources as possible.
+                                                              <br>
+                                                              Each type of cluster will have its enumeration / coordinate
+                                                              setup as if the grid consists solely of its type of cluster.
+                                                              For example, if the preferred substitute cluster dimensions
+                                                              double the regular cluster dimensions, there might be
+                                                              simultaneously a regular cluster indexed at (1,0,0), and a
+                                                              preferred cluster indexed at (1,0,0). In this example, the
+                                                              preferred substitute cluster (1,0,0) replaces regular
+                                                              clusters (2,0,0) and (3,0,0) and groups their blocks.
+                                                              <br>
+                                                              This attribute will only take effect when a regular cluster
+                                                              dimension has been specified. The preferred substitute
+                                                              cluster dimension must be an integer multiple greater than
+                                                              zero of the regular cluster dimension and must divide the
+                                                              grid. It must also be no more than `maxBlocksPerCluster`, if
+                                                              it is set in the kernel's `__launch_bounds__`. Otherwise it
+                                                              must be less than the maximum value the driver can support.
+                                                              Otherwise, setting this attribute to a value physically
+                                                              unable to fit on any particular device is permitted. */
+  , CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT = 12 /**< Valid for launches. Set
+                                                          ::CUlaunchAttributeValue::launchCompletionEvent to record the
+                                                          event.
+                                                          <br>
+                                                          Nominally, the event is triggered once all blocks of the kernel
+                                                          have begun execution. Currently this is a best effort. If a kernel
+                                                          B has a launch completion dependency on a kernel A, B may wait
+                                                          until A is complete. Alternatively, blocks of B may begin before
+                                                          all blocks of A have begun, for example if B can claim execution
+                                                          resources unavailable to A (e.g. they run on different GPUs) or
+                                                          if B is a higher priority than A.
+                                                          Exercise caution if such an ordering inversion could lead
+                                                          to deadlock.
+                                                          <br>
+                                                          A launch completion event is nominally similar to a programmatic
+                                                          event with \c triggerAtBlockStart set except that it is not
+                                                          visible to \c cudaGridDependencySynchronize() and can be used with
+                                                          compute capability less than 9.0.
+                                                          <br>
+                                                          The event supplied must not be an interprocess or interop
+                                                          event. The event must disable timing (i.e. must be created
+                                                          with the ::CU_EVENT_DISABLE_TIMING flag set). */
+  , CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE = 13 /**< Valid for graph nodes, launches. This attribute is graphs-only,
+                                                               and passing it to a launch in a non-capturing stream will result
+                                                               in an error.
+                                                               <br>
+                                                               ::CUlaunchAttributeValue::deviceUpdatableKernelNode::deviceUpdatable can
+                                                               only be set to 0 or 1. Setting the field to 1 indicates that the
+                                                               corresponding kernel node should be device-updatable. On success, a handle
+                                                               will be returned via
+                                                               ::CUlaunchAttributeValue::deviceUpdatableKernelNode::devNode which can be
+                                                               passed to the various device-side update functions to update the node's
+                                                               kernel parameters from within another kernel. For more information on the
+                                                               types of device updates that can be made, as well as the relevant limitations
+                                                               thereof, see ::cudaGraphKernelNodeUpdatesApply.
+                                                               <br>
+                                                               Nodes which are device-updatable have additional restrictions compared to
+                                                               regular kernel nodes. Firstly, device-updatable nodes cannot be removed
+                                                               from their graph via ::cuGraphDestroyNode. Additionally, once opted-in
+                                                               to this functionality, a node cannot opt out, and any attempt to set the
+                                                               deviceUpdatable attribute to 0 will result in an error. Device-updatable
+                                                               kernel nodes also cannot have their attributes copied to/from another kernel
+                                                               node via ::cuGraphKernelNodeCopyAttributes. Graphs containing one or more
+                                                               device-updatable nodes also do not allow multiple instantiation, and neither
+                                                               the graph nor its instantiated version can be passed to ::cuGraphExecUpdate.
+                                                               <br>
+                                                               If a graph contains device-updatable nodes and updates those nodes from the device
+                                                               from within the graph, the graph must be uploaded with ::cuGraphUpload before it
+                                                               is launched. For such a graph, if host-side executable graph updates are made to the
+                                                               device-updatable nodes, the graph must be uploaded before it is launched again. */
+  , CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 14 /**< Valid for launches. On devices where the L1 cache and shared memory use the
+                                                                   same hardware resources, setting ::CUlaunchAttributeValue::sharedMemCarveout to a
+                                                                   percentage between 0-100 signals the CUDA driver to set the shared memory carveout
+                                                                   preference, in percent of the total shared memory for that kernel launch.
+                                                                   This attribute takes precedence over ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT.
+                                                                   This is only a hint, and the CUDA driver can choose a different configuration if
+                                                                   required for the launch. */
+#if defined(__CUDA_API_VERSION_INTERNAL) && !defined(__CUDA_API_VERSION_INTERNAL_ODR)
+  , CU_LAUNCH_ATTRIBUTE_MAX
+#endif
+} CUlaunchAttributeID;
+
+/**
+ * Launch attributes union; used as value field of ::CUlaunchAttribute
+ */
+typedef union CUlaunchAttributeValue_union {
+    char pad[64]; /* Pad to 64 bytes */
+    CUaccessPolicyWindow accessPolicyWindow; /**< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW. */
+    int cooperative; /**< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_COOPERATIVE. Nonzero indicates a cooperative
+                        kernel (see ::cuLaunchCooperativeKernel). */
+    CUsynchronizationPolicy syncPolicy; /**< Value of launch attribute
+                                           ::CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. ::CUsynchronizationPolicy for
+                                           work queued up in this stream */
+
+    /**
+     *  Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION that
+     *  represents the desired cluster dimensions for the kernel. Opaque type
+     *  with the following fields:
+     *      - \p x - The X dimension of the cluster, in blocks. Must be a divisor
+     *               of the grid X dimension.
+     *      - \p y - The Y dimension of the cluster, in blocks. Must be a divisor
+     *               of the grid Y dimension.
+     *      - \p z - The Z dimension of the cluster, in blocks. Must be a divisor
+     *               of the grid Z dimension.
+     */
+    struct {
+        unsigned int x;
+        unsigned int y;
+        unsigned int z;
+    } clusterDim;
+    CUclusterSchedulingPolicy clusterSchedulingPolicyPreference; /**< Value of launch attribute
+                                                                    ::CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE. Cluster
+                                                                    scheduling policy preference for the kernel. */
+    int programmaticStreamSerializationAllowed;  /**< Value of launch attribute
+                                                   ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION. */
+    /**
+     *  Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
+     *  with the following fields:
+     *      - \p CUevent event - Event to fire when all blocks trigger it.
+     *      - \p int flags - Event record flags, see ::cuEventRecordWithFlags. Does not accept ::CU_EVENT_RECORD_EXTERNAL.
+     *      - \p triggerAtBlockStart - If this is set to non-0, each block launch will automatically trigger the event.
+     */
+    struct {
+        CUevent event;
+        int flags;
+        int triggerAtBlockStart;
+    } programmaticEvent;
+    /**
+     * Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT
+     * with the following fields:
+     *     - \p CUevent event - Event to fire when the last block launches
+     *     - \p int flags - Event record flags, see ::cuEventRecordWithFlags. Does not accept ::CU_EVENT_RECORD_EXTERNAL.
+     */
+    struct {
+        CUevent event;
+        int flags;
+    } launchCompletionEvent;
+    int priority; /**< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PRIORITY. Execution priority of the kernel. */
+    CUlaunchMemSyncDomainMap memSyncDomainMap; /**< Value of launch attribute
+                                                  ::CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP. See
+                                                  ::CUlaunchMemSyncDomainMap. */
+    CUlaunchMemSyncDomain memSyncDomain;       /**< Value of launch attribute
+                                                  ::CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN. See ::CUlaunchMemSyncDomain. */
+    /**
+     *  Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION
+     *  that represents the desired preferred cluster dimensions for the kernel.
+     *  Opaque type with the following fields:
+     *      - \p x - The X dimension of the preferred cluster, in blocks. Must
+     *               be a divisor of the grid X dimension, and must be a
+     *               multiple of the \p x field of ::CUlaunchAttributeValue::clusterDim.
+     *      - \p y - The Y dimension of the preferred cluster, in blocks. Must
+     *               be a divisor of the grid Y dimension, and must be a
+     *               multiple of the \p y field of ::CUlaunchAttributeValue::clusterDim.
+     *      - \p z - The Z dimension of the preferred cluster, in blocks. Must be
+     *               equal to the \p z field of ::CUlaunchAttributeValue::clusterDim.
+     */
+    struct {
+        unsigned int x;
+        unsigned int y;
+        unsigned int z;
+    } preferredClusterDim;
+
+    /**
+     *  Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE
+     *  with the following fields:
+     *      - \p int deviceUpdatable - Whether or not the resulting kernel node should be device-updatable.
+     *      - \p CUgraphDeviceNode devNode - Returns a handle to pass to the various device-side update functions.
+     */
+    struct {
+        int deviceUpdatable;
+        CUgraphDeviceNode devNode;
+    } deviceUpdatableKernelNode;
+    unsigned int sharedMemCarveout; /**< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT. */
+} CUlaunchAttributeValue;
+
+/**
+ * Launch attribute: an ID (::CUlaunchAttributeID) paired with its value (::CUlaunchAttributeValue)
+ */
+typedef struct CUlaunchAttribute_st {
+    CUlaunchAttributeID id; /**< Attribute to set */
+    char pad[8 - sizeof(CUlaunchAttributeID)]; /**< Padding sized so that id and pad together occupy 8 bytes */
+    CUlaunchAttributeValue value; /**< Value of the attribute */
+} CUlaunchAttribute;
+
+/**
+ * CUDA extensible launch configuration: grid and block dimensions, dynamic shared memory, stream, and launch attributes
+ */
+typedef struct CUlaunchConfig_st {
+    unsigned int gridDimX;       /**< Width of grid in blocks */
+    unsigned int gridDimY;       /**< Height of grid in blocks */
+    unsigned int gridDimZ;       /**< Depth of grid in blocks */
+    unsigned int blockDimX;      /**< X dimension of each thread block */
+    unsigned int blockDimY;      /**< Y dimension of each thread block */
+    unsigned int blockDimZ;      /**< Z dimension of each thread block */
+    unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */
+    CUstream hStream;            /**< Stream identifier */
+    CUlaunchAttribute *attrs;    /**< List of attributes; nullable if ::CUlaunchConfig::numAttrs == 0 */
+    unsigned int numAttrs;       /**< Number of attributes populated in ::CUlaunchConfig::attrs */
+} CUlaunchConfig;
+
+typedef CUlaunchAttributeID CUkernelNodeAttrID; /**< Kernel node attribute IDs are an alias of ::CUlaunchAttributeID; the macros below map each name to its launch attribute */
+#define CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW
+#define CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE          CU_LAUNCH_ATTRIBUTE_COOPERATIVE
+#define CU_KERNEL_NODE_ATTRIBUTE_CLUSTER_DIMENSION                    CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
+#define CU_KERNEL_NODE_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE
+#define CU_KERNEL_NODE_ATTRIBUTE_PRIORITY             CU_LAUNCH_ATTRIBUTE_PRIORITY
+#define CU_KERNEL_NODE_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP  CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP
+#define CU_KERNEL_NODE_ATTRIBUTE_MEM_SYNC_DOMAIN      CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN
+#define CU_KERNEL_NODE_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION
+#define CU_KERNEL_NODE_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE
+#define CU_KERNEL_NODE_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT
+/* Kernel node attribute values use the same union as launch attributes (::CUlaunchAttributeValue). */
+typedef CUlaunchAttributeValue CUkernelNodeAttrValue_v1;
+typedef CUkernelNodeAttrValue_v1 CUkernelNodeAttrValue;
+
+/**
+ * Possible stream capture statuses returned by ::cuStreamIsCapturing
+ */
+typedef enum CUstreamCaptureStatus_enum {
+    CU_STREAM_CAPTURE_STATUS_NONE        = 0, /**< Stream is not capturing */
+    CU_STREAM_CAPTURE_STATUS_ACTIVE      = 1, /**< Stream is actively capturing */
+    CU_STREAM_CAPTURE_STATUS_INVALIDATED = 2  /**< Stream is part of a capture sequence that
+                                                   has been invalidated, but not yet terminated */
+} CUstreamCaptureStatus;
+
+/**
+ * Possible modes for stream capture thread interactions. For more details see
+ * ::cuStreamBeginCapture and ::cuThreadExchangeStreamCaptureMode
+ */
+typedef enum CUstreamCaptureMode_enum {
+    CU_STREAM_CAPTURE_MODE_GLOBAL       = 0, /**< Global mode; semantics are described under ::cuStreamBeginCapture */
+    CU_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1, /**< Thread-local mode; semantics are described under ::cuStreamBeginCapture */
+    CU_STREAM_CAPTURE_MODE_RELAXED      = 2  /**< Relaxed mode; semantics are described under ::cuStreamBeginCapture */
+} CUstreamCaptureMode;
+
+typedef CUlaunchAttributeID CUstreamAttrID; /**< Stream attribute IDs are an alias of ::CUlaunchAttributeID; the macros below map each name to its launch attribute */
+#define CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW   CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW
+#define CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY
+#define CU_STREAM_ATTRIBUTE_PRIORITY               CU_LAUNCH_ATTRIBUTE_PRIORITY
+#define CU_STREAM_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP    CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP
+#define CU_STREAM_ATTRIBUTE_MEM_SYNC_DOMAIN        CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN
+/* Stream attribute values use the same union as launch attributes (::CUlaunchAttributeValue). */
+typedef CUlaunchAttributeValue CUstreamAttrValue_v1;
+typedef CUstreamAttrValue_v1 CUstreamAttrValue;
+
+/**
+ * Flags to specify search options. For more details see ::cuGetProcAddress
+ */
+typedef enum CUdriverProcAddress_flags_enum {
+    CU_GET_PROC_ADDRESS_DEFAULT = 0,                        /**< Default search mode for driver symbols. */
+    CU_GET_PROC_ADDRESS_LEGACY_STREAM = 1 << 0,             /**< Search for legacy versions of driver symbols. */
+    CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM = 1 << 1  /**< Search for per-thread versions of driver symbols. */
+} CUdriverProcAddress_flags;
+
+/**
+ * Flags to indicate search status. For more details see ::cuGetProcAddress
+ */
+typedef enum CUdriverProcAddressQueryResult_enum {
+    CU_GET_PROC_ADDRESS_SUCCESS                = 0,  /**< Symbol was successfully found */
+    CU_GET_PROC_ADDRESS_SYMBOL_NOT_FOUND       = 1,  /**< Symbol was not found in search */
+    CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT = 2   /**< Symbol was found but version supplied was not sufficient */
+}  CUdriverProcAddressQueryResult;
+
+/**
+ * Execution Affinity Types
+ */
+typedef enum CUexecAffinityType_enum {
+    CU_EXEC_AFFINITY_TYPE_SM_COUNT = 0,  /**< Create a context with limited SMs. */
+    CU_EXEC_AFFINITY_TYPE_MAX            /**< One past the last affinity type defined above */
+} CUexecAffinityType;
+
+/**
+ * Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT
+ */
+typedef struct CUexecAffinitySmCount_st {
+    unsigned int val;    /**< The number of SMs the context is limited to use. */
+} CUexecAffinitySmCount_v1;
+typedef CUexecAffinitySmCount_v1 CUexecAffinitySmCount; /**< Unversioned alias of ::CUexecAffinitySmCount_v1 */
+
+/**
+ * Execution Affinity Parameters
+ */
+typedef struct CUexecAffinityParam_st {
+    CUexecAffinityType type;              /**< Affinity type; names the member of \p param that carries the value */
+    union {
+        CUexecAffinitySmCount smCount;    /**< Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT */
+    } param;
+} CUexecAffinityParam_v1;
+/**
+ * Execution Affinity Parameters
+ */
+typedef CUexecAffinityParam_v1 CUexecAffinityParam;
+/** Type tag for the shared data handle carried in ::CUctxCigParam::sharedData */
+typedef enum CUcigDataType_enum {
+    CIG_DATA_TYPE_D3D12_COMMAND_QUEUE = 0x1,    /**< D3D12 Command Queue Handle */
+} CUcigDataType;
+
+/**
+* CIG Context Create Params
+*/
+typedef struct CUctxCigParam_st {
+    CUcigDataType sharedDataType; /**< Type of the handle stored in \p sharedData */
+    void* sharedData;             /**< Handle of the type named by \p sharedDataType (e.g. a D3D12 command queue handle) */
+} CUctxCigParam;
+
+/**
+* Params for creating CUDA context
+* Exactly one of execAffinityParams and cigParams
+* must be non-NULL.
+*/
+typedef struct CUctxCreateParams_st {
+    CUexecAffinityParam *execAffinityParams;    /**< Execution affinity parameters; NULL when cigParams is used */
+    int                  numExecAffinityParams; /**< Presumably the number of entries in execAffinityParams - TODO confirm */
+    CUctxCigParam       *cigParams;             /**< CIG parameters; NULL when execAffinityParams is used */
+} CUctxCreateParams;
+
+/**
+ * Library options to be specified with ::cuLibraryLoadData() or ::cuLibraryLoadFromFile()
+ */
+typedef enum CUlibraryOption_enum
+{
+    CU_LIBRARY_HOST_UNIVERSAL_FUNCTION_AND_DATA_TABLE = 0,
+
+    /**
+     * Specifies that the argument \p code passed to ::cuLibraryLoadData() will be preserved.
+     * Specifying this option will let the driver know that \p code can be accessed at any point
+     * until ::cuLibraryUnload(). The default behavior is for the driver to allocate and
+     * maintain its own copy of \p code. Note that this is only a memory usage optimization
+     * hint and the driver can choose to ignore it if required.
+     * Specifying this option with ::cuLibraryLoadFromFile() is invalid and
+     * will return ::CUDA_ERROR_INVALID_VALUE.
+     */
+    CU_LIBRARY_BINARY_IS_PRESERVED = 1,
+
+    CU_LIBRARY_NUM_OPTIONS /**< One past the last library option defined above */
+} CUlibraryOption;
+/** Function and data tables with their window sizes; presumably the value for ::CU_LIBRARY_HOST_UNIVERSAL_FUNCTION_AND_DATA_TABLE - TODO confirm */
+typedef struct CUlibraryHostUniversalFunctionAndDataTable_st
+{
+    void *functionTable;
+    size_t functionWindowSize;
+    void *dataTable;
+    size_t dataWindowSize;
+} CUlibraryHostUniversalFunctionAndDataTable;
+
+/**
+ * Error codes
+ */
+typedef enum cudaError_enum {
+    /**
+     * The API call returned with no errors. In the case of query calls, this
+     * also means that the operation being queried is complete (see
+     * ::cuEventQuery() and ::cuStreamQuery()).
+     */
+    CUDA_SUCCESS                              = 0,
+
+    /**
+     * This indicates that one or more of the parameters passed to the API call
+     * is not within an acceptable range of values.
+     */
+    CUDA_ERROR_INVALID_VALUE                  = 1,
+
+    /**
+     * The API call failed because it was unable to allocate enough memory or
+     * other resources to perform the requested operation.
+     */
+    CUDA_ERROR_OUT_OF_MEMORY                  = 2,
+
+    /**
+     * This indicates that the CUDA driver has not been initialized with
+     * ::cuInit() or that initialization has failed.
+     */
+    CUDA_ERROR_NOT_INITIALIZED                = 3,
+
+    /**
+     * This indicates that the CUDA driver is in the process of shutting down.
+     */
+    CUDA_ERROR_DEINITIALIZED                  = 4,
+
+    /**
+     * This indicates profiler is not initialized for this run. This can
+     * happen when the application is running with external profiling tools
+     * like visual profiler.
+     */
+    CUDA_ERROR_PROFILER_DISABLED              = 5,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to attempt to enable/disable the profiling via ::cuProfilerStart or
+     * ::cuProfilerStop without initialization.
+     */
+    CUDA_ERROR_PROFILER_NOT_INITIALIZED       = 6,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to call cuProfilerStart() when profiling is already enabled.
+     */
+    CUDA_ERROR_PROFILER_ALREADY_STARTED       = 7,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to call cuProfilerStop() when profiling is already disabled.
+     */
+    CUDA_ERROR_PROFILER_ALREADY_STOPPED       = 8,
+
+    /**
+     * This indicates that the CUDA driver that the application has loaded is a
+     * stub library. Applications that run with the stub rather than a real
+     * driver loaded will result in CUDA API returning this error.
+     */
+    CUDA_ERROR_STUB_LIBRARY                   = 34,
+
+    /**  
+     * This indicates that requested CUDA device is unavailable at the current
+     * time. Devices are often unavailable due to use of
+     * ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS or ::CU_COMPUTEMODE_PROHIBITED.
+     */
+    CUDA_ERROR_DEVICE_UNAVAILABLE            = 46,
+
+    /**
+     * This indicates that no CUDA-capable devices were detected by the installed
+     * CUDA driver.
+     */
+    CUDA_ERROR_NO_DEVICE                      = 100,
+
+    /**
+     * This indicates that the device ordinal supplied by the user does not
+     * correspond to a valid CUDA device or that the action requested is
+     * invalid for the specified device.
+     */
+    CUDA_ERROR_INVALID_DEVICE                 = 101,
+
+    /**
+     * This error indicates that the Grid license is not applied.
+     */
+    CUDA_ERROR_DEVICE_NOT_LICENSED            = 102,
+
+    /**
+     * This indicates that the device kernel image is invalid. This can also
+     * indicate an invalid CUDA module.
+     */
+    CUDA_ERROR_INVALID_IMAGE                  = 200,
+
+    /**
+     * This most frequently indicates that there is no context bound to the
+     * current thread. This can also be returned if the context passed to an
+     * API call is not a valid handle (such as a context that has had
+     * ::cuCtxDestroy() invoked on it). This can also be returned if a user
+     * mixes different API versions (i.e. 3010 context with 3020 API calls).
+     * See ::cuCtxGetApiVersion() for more details.
+     * This can also be returned if the green context passed to an API call
+     * was not converted to a ::CUcontext using ::cuCtxFromGreenCtx API.
+     */
+    CUDA_ERROR_INVALID_CONTEXT                = 201,
+
+    /**
+     * This indicated that the context being supplied as a parameter to the
+     * API call was already the active context.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.2. It is no longer an
+     * error to attempt to push the active context via ::cuCtxPushCurrent().
+     */
+    CUDA_ERROR_CONTEXT_ALREADY_CURRENT        = 202,
+
+    /**
+     * This indicates that a map or register operation has failed.
+     */
+    CUDA_ERROR_MAP_FAILED                     = 205,
+
+    /**
+     * This indicates that an unmap or unregister operation has failed.
+     */
+    CUDA_ERROR_UNMAP_FAILED                   = 206,
+
+    /**
+     * This indicates that the specified array is currently mapped and thus
+     * cannot be destroyed.
+     */
+    CUDA_ERROR_ARRAY_IS_MAPPED                = 207,
+
+    /**
+     * This indicates that the resource is already mapped.
+     */
+    CUDA_ERROR_ALREADY_MAPPED                 = 208,
+
+    /**
+     * This indicates that there is no kernel image available that is suitable
+     * for the device. This can occur when a user specifies code generation
+     * options for a particular CUDA source file that do not include the
+     * corresponding device configuration.
+     */
+    CUDA_ERROR_NO_BINARY_FOR_GPU              = 209,
+
+    /**
+     * This indicates that a resource has already been acquired.
+     */
+    CUDA_ERROR_ALREADY_ACQUIRED               = 210,
+
+    /**
+     * This indicates that a resource is not mapped.
+     */
+    CUDA_ERROR_NOT_MAPPED                     = 211,
+
+    /**
+     * This indicates that a mapped resource is not available for access as an
+     * array.
+     */
+    CUDA_ERROR_NOT_MAPPED_AS_ARRAY            = 212,
+
+    /**
+     * This indicates that a mapped resource is not available for access as a
+     * pointer.
+     */
+    CUDA_ERROR_NOT_MAPPED_AS_POINTER          = 213,
+
+    /**
+     * This indicates that an uncorrectable ECC error was detected during
+     * execution.
+     */
+    CUDA_ERROR_ECC_UNCORRECTABLE              = 214,
+
+    /**
+     * This indicates that the ::CUlimit passed to the API call is not
+     * supported by the active device.
+     */
+    CUDA_ERROR_UNSUPPORTED_LIMIT              = 215,
+
+    /**
+     * This indicates that the ::CUcontext passed to the API call can
+     * only be bound to a single CPU thread at a time but is already
+     * bound to a CPU thread.
+     */
+    CUDA_ERROR_CONTEXT_ALREADY_IN_USE         = 216,
+
+    /**
+     * This indicates that peer access is not supported across the given
+     * devices.
+     */
+    CUDA_ERROR_PEER_ACCESS_UNSUPPORTED        = 217,
+
+    /**
+     * This indicates that a PTX JIT compilation failed.
+     */
+    CUDA_ERROR_INVALID_PTX                    = 218,
+
+    /**
+     * This indicates an error with OpenGL or DirectX context.
+     */
+    CUDA_ERROR_INVALID_GRAPHICS_CONTEXT       = 219,
+
+    /**
+    * This indicates that an uncorrectable NVLink error was detected during the
+    * execution.
+    */
+    CUDA_ERROR_NVLINK_UNCORRECTABLE           = 220,
+
+    /**
+    * This indicates that the PTX JIT compiler library was not found.
+    */
+    CUDA_ERROR_JIT_COMPILER_NOT_FOUND         = 221,
+
+    /**
+     * This indicates that the provided PTX was compiled with an unsupported toolchain.
+     */
+
+    CUDA_ERROR_UNSUPPORTED_PTX_VERSION        = 222,
+
+    /**
+     * This indicates that the PTX JIT compilation was disabled.
+     */
+    CUDA_ERROR_JIT_COMPILATION_DISABLED       = 223,
+
+    /**
+     * This indicates that the ::CUexecAffinityType passed to the API call is not
+     * supported by the active device.
+     */ 
+    CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY      = 224,
+
+    /**
+     * This indicates that the code to be compiled by the PTX JIT contains
+     * an unsupported call to cudaDeviceSynchronize.
+     */
+    CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC       = 225,
+
+    /**
+     * This indicates that an exception occurred on the device that is now
+     * contained by the GPU's error containment capability. Common causes are -
+     * a. Certain types of invalid accesses of peer GPU memory over nvlink
+     * b. Certain classes of hardware errors
+     * This leaves the process in an inconsistent state and any further CUDA
+     * work will return the same error. To continue using CUDA, the process must
+     * be terminated and relaunched.
+     */
+    CUDA_ERROR_CONTAINED                      = 226,
+
+    /**
+     * This indicates that the device kernel source is invalid. This includes
+     * compilation/linker errors encountered in device code or user error.
+     */
+    CUDA_ERROR_INVALID_SOURCE                 = 300,
+
+    /**
+     * This indicates that the file specified was not found.
+     */
+    CUDA_ERROR_FILE_NOT_FOUND                 = 301,
+
+    /**
+     * This indicates that a link to a shared object failed to resolve.
+     */
+    CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,
+
+    /**
+     * This indicates that initialization of a shared object failed.
+     */
+    CUDA_ERROR_SHARED_OBJECT_INIT_FAILED      = 303,
+
+    /**
+     * This indicates that an OS call failed.
+     */
+    CUDA_ERROR_OPERATING_SYSTEM               = 304,
+
+    /**
+     * This indicates that a resource handle passed to the API call was not
+     * valid. Resource handles are opaque types like ::CUstream and ::CUevent.
+     */
+    CUDA_ERROR_INVALID_HANDLE                 = 400,
+
+    /**
+     * This indicates that a resource required by the API call is not in a
+     * valid state to perform the requested operation.
+     */
+    CUDA_ERROR_ILLEGAL_STATE                  = 401,
+
+    /**
+     * This indicates an attempt was made to introspect an object in a way that
+     * would discard semantically important information. This is either due to
+     * the object using functionality newer than the API version used to
+     * introspect it or omission of optional return arguments.
+     */
+    CUDA_ERROR_LOSSY_QUERY                    = 402,
+
+    /**
+     * This indicates that a named symbol was not found. Examples of symbols
+     * are global/constant variable names, driver function names, texture names,
+     * and surface names.
+     */
+    CUDA_ERROR_NOT_FOUND                      = 500,
+
+    /**
+     * This indicates that asynchronous operations issued previously have not
+     * completed yet. This result is not actually an error, but must be indicated
+     * differently than ::CUDA_SUCCESS (which indicates completion). Calls that
+     * may return this value include ::cuEventQuery() and ::cuStreamQuery().
+     */
+    CUDA_ERROR_NOT_READY                      = 600,
+
+    /**
+     * While executing a kernel, the device encountered a
+     * load or store instruction on an invalid memory address.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_ILLEGAL_ADDRESS                = 700,
+
+    /**
+     * This indicates that a launch did not occur because it did not have
+     * appropriate resources. This error usually indicates that the user has
+     * attempted to pass too many arguments to the device kernel, or the
+     * kernel launch specifies too many threads for the kernel's register
+     * count. Passing arguments of the wrong size (i.e. a 64-bit pointer
+     * when a 32-bit int is expected) is equivalent to passing too many
+     * arguments and can also result in this error.
+     */
+    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES        = 701,
+
+    /**
+     * This indicates that the device kernel took too long to execute. This can
+     * only occur if timeouts are enabled - see the device attribute
+     * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_LAUNCH_TIMEOUT                 = 702,
+
+    /**
+     * This error indicates a kernel launch that uses an incompatible texturing
+     * mode.
+     */
+    CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  = 703,
+
+    /**
+     * This error indicates that a call to ::cuCtxEnablePeerAccess() is
+     * trying to re-enable peer access to a context which has already
+     * had peer access to it enabled.
+     */
+    CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED    = 704,
+
+    /**
+     * This error indicates that ::cuCtxDisablePeerAccess() is
+     * trying to disable peer access which has not been enabled yet
+     * via ::cuCtxEnablePeerAccess().
+     */
+    CUDA_ERROR_PEER_ACCESS_NOT_ENABLED        = 705,
+
+    /**
+     * This error indicates that the primary context for the specified device
+     * has already been initialized.
+     */
+    CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE         = 708,
+
+    /**
+     * This error indicates that the context current to the calling thread
+     * has been destroyed using ::cuCtxDestroy, or is a primary context which
+     * has not yet been initialized.
+     */
+    CUDA_ERROR_CONTEXT_IS_DESTROYED           = 709,
+
+    /**
+     * A device-side assert triggered during kernel execution. The context
+     * cannot be used anymore, and must be destroyed. All existing device
+     * memory allocations from this context are invalid and must be
+     * reconstructed if the program is to continue using CUDA.
+     */
+    CUDA_ERROR_ASSERT                         = 710,
+
+    /**
+     * This error indicates that the hardware resources required to enable
+     * peer access have been exhausted for one or more of the devices
+     * passed to ::cuCtxEnablePeerAccess().
+     */
+    CUDA_ERROR_TOO_MANY_PEERS                 = 711,
+
+    /**
+     * This error indicates that the memory range passed to ::cuMemHostRegister()
+     * has already been registered.
+     */
+    CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712,
+
+    /**
+     * This error indicates that the pointer passed to ::cuMemHostUnregister()
+     * does not correspond to any currently registered memory region.
+     */
+    CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED     = 713,
+
+    /**
+     * While executing a kernel, the device encountered a stack error.
+     * This can be due to stack corruption or exceeding the stack size limit.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_HARDWARE_STACK_ERROR           = 714,
+
+    /**
+     * While executing a kernel, the device encountered an illegal instruction.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_ILLEGAL_INSTRUCTION            = 715,
+
+    /**
+     * While executing a kernel, the device encountered a load or store instruction
+     * on a memory address which is not aligned.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_MISALIGNED_ADDRESS             = 716,
+
+    /**
+     * While executing a kernel, the device encountered an instruction
+     * which can only operate on memory locations in certain address spaces
+     * (global, shared, or local), but was supplied a memory address not
+     * belonging to an allowed address space.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_INVALID_ADDRESS_SPACE          = 717,
+
+    /**
+     * While executing a kernel, the device program counter wrapped its address space.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_INVALID_PC                     = 718,
+
+    /**
+     * An exception occurred on the device while executing a kernel. Common
+     * causes include dereferencing an invalid device pointer and accessing
+     * out of bounds shared memory. Less common cases can be system specific - more
+     * information about these cases can be found in the system specific user guide.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_LAUNCH_FAILED                  = 719,
+
+    /**
+     * This error indicates that the number of blocks launched per grid for a kernel that was
+     * launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice
+     * exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor
+     * or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors
+     * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT.
+     */
+    CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE   = 720,
+
+    /**
+     * An exception occurred on the device while exiting a kernel using tensor memory: the
+     * tensor memory was not completely deallocated. This leaves the process in an inconsistent
+     * state and any further CUDA work will return the same error. To continue using CUDA, the
+     * process must be terminated and relaunched.
+     */
+    CUDA_ERROR_TENSOR_MEMORY_LEAK             = 721,
+
+    /**
+     * This error indicates that the attempted operation is not permitted.
+     */
+    CUDA_ERROR_NOT_PERMITTED                  = 800,
+
+    /**
+     * This error indicates that the attempted operation is not supported
+     * on the current system or device.
+     */
+    CUDA_ERROR_NOT_SUPPORTED                  = 801,
+
+    /**
+     * This error indicates that the system is not yet ready to start any CUDA
+     * work.  To continue using CUDA, verify the system configuration is in a
+     * valid state and all required driver daemons are actively running.
+     * More information about this error can be found in the system specific
+     * user guide.
+     */
+    CUDA_ERROR_SYSTEM_NOT_READY               = 802,
+
+    /**
+     * This error indicates that there is a mismatch between the versions of
+     * the display driver and the CUDA driver. Refer to the compatibility documentation
+     * for supported versions.
+     */
+    CUDA_ERROR_SYSTEM_DRIVER_MISMATCH         = 803,
+
+    /**
+     * This error indicates that the system was upgraded to run with forward compatibility
+     * but the visible hardware detected by CUDA does not support this configuration.
+     * Refer to the compatibility documentation for the supported hardware matrix or ensure
+     * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES
+     * environment variable.
+     */
+    CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804,
+
+    /**
+     * This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server.
+     */
+    CUDA_ERROR_MPS_CONNECTION_FAILED          = 805,
+
+    /**
+     * This error indicates that the remote procedural call between the MPS server and the MPS client failed.
+     */
+    CUDA_ERROR_MPS_RPC_FAILURE                = 806,
+
+    /**
+     * This error indicates that the MPS server is not ready to accept new MPS client requests.
+     * This error can be returned when the MPS server is in the process of recovering from a fatal failure.
+     */
+    CUDA_ERROR_MPS_SERVER_NOT_READY           = 807,
+
+    /**
+     * This error indicates that the hardware resources required to create MPS client have been exhausted.
+     */
+    CUDA_ERROR_MPS_MAX_CLIENTS_REACHED        = 808,
+
+    /**
+     * This error indicates that the hardware resources required to support device connections have been exhausted.
+     */
+    CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED    = 809,
+
+    /**
+     * This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched.
+     */
+    CUDA_ERROR_MPS_CLIENT_TERMINATED          = 810,
+
+    /**
+     * This error indicates that the module is using CUDA Dynamic Parallelism, but the current configuration, like MPS, does not support it.
+     */
+    CUDA_ERROR_CDP_NOT_SUPPORTED              = 811,
+
+    /**
+     * This error indicates that a module contains an unsupported interaction between different versions of CUDA Dynamic Parallelism.
+     */
+    CUDA_ERROR_CDP_VERSION_MISMATCH           = 812,
+
+    /**
+     * This error indicates that the operation is not permitted when
+     * the stream is capturing.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED     = 900,
+
+    /**
+     * This error indicates that the current capture sequence on the stream
+     * has been invalidated due to a previous error.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_INVALIDATED     = 901,
+
+    /**
+     * This error indicates that the operation would have resulted in a merge
+     * of two independent capture sequences.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_MERGE           = 902,
+
+    /**
+     * This error indicates that the capture was not initiated in this stream.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_UNMATCHED       = 903,
+
+    /**
+     * This error indicates that the capture sequence contains a fork that was
+     * not joined to the primary stream.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_UNJOINED        = 904,
+
+    /**
+     * This error indicates that a dependency would have been created which
+     * crosses the capture sequence boundary. Only implicit in-stream ordering
+     * dependencies are allowed to cross the boundary.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_ISOLATION       = 905,
+
+    /**
+     * This error indicates a disallowed implicit dependency on a current capture
+     * sequence from cudaStreamLegacy.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_IMPLICIT        = 906,
+
+    /**
+     * This error indicates that the operation is not permitted on an event which
+     * was last recorded in a capturing stream.
+     */
+    CUDA_ERROR_CAPTURED_EVENT                 = 907,
+
+    /**
+     * A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED
+     * argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a
+     * different thread.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD    = 908,
+
+    /**
+     * This error indicates that the timeout specified for the wait operation has lapsed.
+     */
+    CUDA_ERROR_TIMEOUT                        = 909,
+
+    /**
+     * This error indicates that the graph update was not performed because it included 
+     * changes which violated constraints specific to instantiated graph update.
+     */
+    CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE      = 910,
+
+    /**
+     * This indicates that an async error has occurred in a device outside of CUDA.
+     * If CUDA was waiting for an external device's signal before consuming shared data,
+     * the external device signaled an error indicating that the data is not valid for
+     * consumption. This leaves the process in an inconsistent state and any further CUDA
+     * work will return the same error. To continue using CUDA, the process must be
+     * terminated and relaunched.
+     */
+    CUDA_ERROR_EXTERNAL_DEVICE               = 911,
+
+    /**
+     * Indicates a kernel launch error due to cluster misconfiguration.
+     */
+    CUDA_ERROR_INVALID_CLUSTER_SIZE           = 912,
+
+    /**
+     * Indicates a function handle is not loaded when calling an API that requires
+     * a loaded function.
+    */
+    CUDA_ERROR_FUNCTION_NOT_LOADED            = 913,
+
+    /**
+     * This error indicates one or more resources passed in are not valid resource
+     * types for the operation.
+    */
+    CUDA_ERROR_INVALID_RESOURCE_TYPE          = 914,
+
+    /**
+     * This error indicates one or more resources are insufficient or non-applicable for
+     * the operation.
+    */
+    CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION = 915,
+
+    /**
+     * This error indicates that an error happened during the key rotation
+     * sequence.
+    */
+    CUDA_ERROR_KEY_ROTATION                   = 916,
+
+    /**
+     * This indicates that an unknown internal error has occurred.
+     */
+    CUDA_ERROR_UNKNOWN                        = 999
+} CUresult;
+
+/**
+ * P2P Attributes
+ *
+ * Note: ::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED and
+ * ::CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED share the value 0x04;
+ * the former is marked deprecated in favor of the latter.
+ */
+typedef enum CUdevice_P2PAttribute_enum {
+    CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK                     = 0x01,  /**< A relative value indicating the performance of the link between two devices */
+    CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED                     = 0x02,  /**< P2P Access is enabled */
+    CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED              = 0x03,  /**< Atomic operation over the link supported */
+    CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED              = 0x04,  /**< \deprecated use CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED instead */
+    CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED          = 0x04   /**< Accessing CUDA arrays over the link supported */
+} CUdevice_P2PAttribute;
+
+/**
+ * CUDA stream callback
+ *
+ * Function-pointer type for callbacks registered on a stream. The callback
+ * returns nothing; it receives the stream, a status code, and the user pointer.
+ *
+ * \param hStream The stream the callback was added to, as passed to ::cuStreamAddCallback.  May be NULL.
+ * \param status ::CUDA_SUCCESS or any persistent error on the stream.
+ * \param userData User parameter provided at registration.
+ */
+typedef void (CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void *userData);
+
+/**
+ * Block size to per-block dynamic shared memory mapping for a certain
+ * kernel.
+ *
+ * \param blockSize Block size of the kernel.
+ *
+ * \return The dynamic shared memory needed by a block.
+ */
+typedef size_t (CUDA_CB *CUoccupancyB2DSize)(int blockSize);
+
+/*
+ * Flags for ::cuMemHostAlloc(). Each flag below has its own bit value
+ * (0x01, 0x02, 0x04).
+ */
+
+/**
+ * If set, host memory is portable between CUDA contexts.
+ * Flag for ::cuMemHostAlloc()
+ */
+#define CU_MEMHOSTALLOC_PORTABLE        0x01
+
+/**
+ * If set, host memory is mapped into CUDA address space and
+ * ::cuMemHostGetDevicePointer() may be called on the host pointer.
+ * Flag for ::cuMemHostAlloc()
+ */
+#define CU_MEMHOSTALLOC_DEVICEMAP       0x02
+
+/**
+ * If set, host memory is allocated as write-combined - fast to write,
+ * faster to DMA, slow to read except via SSE4 streaming load instruction
+ * (MOVNTDQA).
+ * Flag for ::cuMemHostAlloc()
+ */
+#define CU_MEMHOSTALLOC_WRITECOMBINED   0x04
+
+/*
+ * Flags for ::cuMemHostRegister(). Each flag below has its own bit value
+ * (0x01, 0x02, 0x04, 0x08).
+ */
+
+/**
+ * If set, host memory is portable between CUDA contexts.
+ * Flag for ::cuMemHostRegister()
+ */
+#define CU_MEMHOSTREGISTER_PORTABLE     0x01
+
+/**
+ * If set, host memory is mapped into CUDA address space and
+ * ::cuMemHostGetDevicePointer() may be called on the host pointer.
+ * Flag for ::cuMemHostRegister()
+ */
+#define CU_MEMHOSTREGISTER_DEVICEMAP    0x02
+
+/**
+ * If set, the passed memory pointer is treated as pointing to some
+ * memory-mapped I/O space, e.g. belonging to a third-party PCIe device.
+ * On Windows the flag is a no-op.
+ * On Linux that memory is marked as non cache-coherent for the GPU and
+ * is expected to be physically contiguous. It may return
+ * ::CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user, or
+ * ::CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions.
+ * On all other platforms, it is not supported and ::CUDA_ERROR_NOT_SUPPORTED
+ * is returned.
+ * Flag for ::cuMemHostRegister()
+ */
+#define CU_MEMHOSTREGISTER_IOMEMORY     0x04
+
+/**
+ * If set, the passed memory pointer is treated as pointing to memory that is
+ * considered read-only by the device.  On platforms without
+ * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is
+ * required in order to register memory mapped to the CPU as read-only.  Support
+ * for the use of this flag can be queried from the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED.  Using this flag with
+ * a current context associated with a device that does not have this attribute
+ * set will cause ::cuMemHostRegister to error with ::CUDA_ERROR_NOT_SUPPORTED.
+ * Flag for ::cuMemHostRegister()
+ */
+#define CU_MEMHOSTREGISTER_READ_ONLY    0x08
+
+/**
+ * 2D memory copy parameters
+ *
+ * Fields are laid out in groups: source offset, source location, destination
+ * offset, destination location, and the extent of the copy.
+ * NOTE(review): presumably srcMemoryType / dstMemoryType select which of the
+ * host / device / array members is used for each side - confirm against the
+ * ::cuMemcpy2D documentation.
+ */
+typedef struct CUDA_MEMCPY2D_st {
+    size_t srcXInBytes;         /**< Source X in bytes */
+    size_t srcY;                /**< Source Y */
+
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost;        /**< Source host pointer */
+    CUdeviceptr srcDevice;      /**< Source device pointer */
+    CUarray srcArray;           /**< Source array reference */
+    size_t srcPitch;            /**< Source pitch (ignored when src is array) */
+
+    size_t dstXInBytes;         /**< Destination X in bytes */
+    size_t dstY;                /**< Destination Y */
+
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;           /**< Destination array reference */
+    size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
+
+    size_t WidthInBytes;        /**< Width of 2D memory copy in bytes */
+    size_t Height;              /**< Height of 2D memory copy */
+} CUDA_MEMCPY2D_v2;
+/* Unversioned name is an alias for the _v2 layout. */
+typedef CUDA_MEMCPY2D_v2 CUDA_MEMCPY2D;
+
+/**
+ * 3D memory copy parameters
+ *
+ * Fields are laid out in groups: source description, destination description,
+ * and the extent of the copy. The reserved0 / reserved1 members must be NULL.
+ * NOTE(review): presumably srcMemoryType / dstMemoryType select which of the
+ * host / device / array members is used for each side - confirm against the
+ * ::cuMemcpy3D documentation.
+ */
+typedef struct CUDA_MEMCPY3D_st {
+    size_t srcXInBytes;         /**< Source X in bytes */
+    size_t srcY;                /**< Source Y */
+    size_t srcZ;                /**< Source Z */
+    size_t srcLOD;              /**< Source LOD */
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost;        /**< Source host pointer */
+    CUdeviceptr srcDevice;      /**< Source device pointer */
+    CUarray srcArray;           /**< Source array reference */
+    void *reserved0;            /**< Must be NULL */
+    size_t srcPitch;            /**< Source pitch (ignored when src is array) */
+    size_t srcHeight;           /**< Source height (ignored when src is array; may be 0 if Depth==1) */
+
+    size_t dstXInBytes;         /**< Destination X in bytes */
+    size_t dstY;                /**< Destination Y */
+    size_t dstZ;                /**< Destination Z */
+    size_t dstLOD;              /**< Destination LOD */
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;           /**< Destination array reference */
+    void *reserved1;            /**< Must be NULL */
+    size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
+    size_t dstHeight;           /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
+
+    size_t WidthInBytes;        /**< Width of 3D memory copy in bytes */
+    size_t Height;              /**< Height of 3D memory copy */
+    size_t Depth;               /**< Depth of 3D memory copy */
+} CUDA_MEMCPY3D_v2;
+/* Unversioned name is an alias for the _v2 layout. */
+typedef CUDA_MEMCPY3D_v2 CUDA_MEMCPY3D;
+
+/**
+ * 3D memory cross-context copy parameters
+ *
+ * Same field sequence as ::CUDA_MEMCPY3D, except that the positions holding
+ * reserved0 / reserved1 there carry the source and destination contexts here.
+ */
+typedef struct CUDA_MEMCPY3D_PEER_st {
+    size_t srcXInBytes;         /**< Source X in bytes */
+    size_t srcY;                /**< Source Y */
+    size_t srcZ;                /**< Source Z */
+    size_t srcLOD;              /**< Source LOD */
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost;        /**< Source host pointer */
+    CUdeviceptr srcDevice;      /**< Source device pointer */
+    CUarray srcArray;           /**< Source array reference */
+    CUcontext srcContext;       /**< Source context (ignored when srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */
+    size_t srcPitch;            /**< Source pitch (ignored when src is array) */
+    size_t srcHeight;           /**< Source height (ignored when src is array; may be 0 if Depth==1) */
+
+    size_t dstXInBytes;         /**< Destination X in bytes */
+    size_t dstY;                /**< Destination Y */
+    size_t dstZ;                /**< Destination Z */
+    size_t dstLOD;              /**< Destination LOD */
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;           /**< Destination array reference */
+    CUcontext dstContext;       /**< Destination context (ignored when dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */
+    size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
+    size_t dstHeight;           /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
+
+    size_t WidthInBytes;        /**< Width of 3D memory copy in bytes */
+    size_t Height;              /**< Height of 3D memory copy */
+    size_t Depth;               /**< Depth of 3D memory copy */
+} CUDA_MEMCPY3D_PEER_v1;
+/* Unversioned name is an alias for the _v1 layout. */
+typedef CUDA_MEMCPY3D_PEER_v1 CUDA_MEMCPY3D_PEER;
+
+/**
+ * Memcpy node parameters
+ *
+ * Bundles a ::CUDA_MEMCPY3D copy description with the context the node runs
+ * on. The flags and reserved members must both be zero.
+ */
+typedef struct CUDA_MEMCPY_NODE_PARAMS_st {
+    int flags;                 /**< Must be zero */
+    int reserved;              /**< Must be zero */
+    CUcontext copyCtx;         /**< Context on which to run the node */
+    CUDA_MEMCPY3D copyParams;  /**< Parameters for the memory copy */
+} CUDA_MEMCPY_NODE_PARAMS;
+
+/**
+ * Array descriptor
+ *
+ * Describes an array by its width, height, element format, and channel count.
+ */
+typedef struct CUDA_ARRAY_DESCRIPTOR_st
+{
+    size_t Width;             /**< Width of array */
+    size_t Height;            /**< Height of array */
+
+    CUarray_format Format;    /**< Array format */
+    unsigned int NumChannels; /**< Channels per array element */
+} CUDA_ARRAY_DESCRIPTOR_v2;
+/* Unversioned name is an alias for the _v2 layout. */
+typedef CUDA_ARRAY_DESCRIPTOR_v2 CUDA_ARRAY_DESCRIPTOR;
+
+/**
+ * 3D array descriptor: width, height, depth, element format, channel count and flags
+ */
+typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
+{
+    size_t Width;             /**< Width of 3D array */
+    size_t Height;            /**< Height of 3D array */
+    size_t Depth;             /**< Depth of 3D array */
+
+    CUarray_format Format;    /**< Array format */
+    unsigned int NumChannels; /**< Channels per array element */
+    unsigned int Flags;       /**< Flags */
+} CUDA_ARRAY3D_DESCRIPTOR_v2;
+typedef CUDA_ARRAY3D_DESCRIPTOR_v2 CUDA_ARRAY3D_DESCRIPTOR; /**< Unversioned name for the _v2 layout */
+
+/**
+ * Flag for ::CUDA_ARRAY_SPARSE_PROPERTIES::flags: indicates that the layered sparse CUDA array or CUDA mipmapped array has a single mip tail region for all layers
+ */
+#define CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL 0x1
+
+/**
+ * CUDA array sparse properties: sparse tile extent, mip tail start level and size, and flags
+ */
+typedef struct CUDA_ARRAY_SPARSE_PROPERTIES_st {
+    struct {
+        unsigned int width;     /**< Width of sparse tile in elements */
+        unsigned int height;    /**< Height of sparse tile in elements */
+        unsigned int depth;     /**< Depth of sparse tile in elements */
+    } tileExtent;               /**< Extent of one sparse tile, in elements */
+
+    /**
+     * First mip level at which the mip tail begins.
+     */
+    unsigned int miptailFirstLevel;
+    /**
+     * Total size of the mip tail.
+     */
+    unsigned long long miptailSize;
+    /**
+     * Flags will either be zero or ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL
+     */
+    unsigned int flags;
+    unsigned int reserved[4];   /**< Reserved */
+} CUDA_ARRAY_SPARSE_PROPERTIES_v1;
+typedef CUDA_ARRAY_SPARSE_PROPERTIES_v1 CUDA_ARRAY_SPARSE_PROPERTIES;
+
+/**
+ * CUDA array memory requirements: total size and alignment
+ */
+typedef struct CUDA_ARRAY_MEMORY_REQUIREMENTS_st {
+    size_t size;                /**< Total required memory size */
+    size_t alignment;           /**< Alignment requirement */
+    unsigned int reserved[4];   /**< Reserved */
+} CUDA_ARRAY_MEMORY_REQUIREMENTS_v1;
+typedef CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 CUDA_ARRAY_MEMORY_REQUIREMENTS;
+
+/**
+ * CUDA Resource descriptor: a resource type tag, a union of per-kind resource data, and flags
+ */
+typedef struct CUDA_RESOURCE_DESC_st
+{
+    CUresourcetype resType;                   /**< Resource type */
+
+    union {
+        struct {
+            CUarray hArray;                   /**< CUDA array */
+        } array;                              /**< CUDA array resource */
+        struct {
+            CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */
+        } mipmap;                             /**< CUDA mipmapped array resource */
+        struct {
+            CUdeviceptr devPtr;               /**< Device pointer */
+            CUarray_format format;            /**< Array format */
+            unsigned int numChannels;         /**< Channels per array element */
+            size_t sizeInBytes;               /**< Size in bytes */
+        } linear;                             /**< Device pointer with format, channel count and byte size */
+        struct {
+            CUdeviceptr devPtr;               /**< Device pointer */
+            CUarray_format format;            /**< Array format */
+            unsigned int numChannels;         /**< Channels per array element */
+            size_t width;                     /**< Width of the array in elements */
+            size_t height;                    /**< Height of the array in elements */
+            size_t pitchInBytes;              /**< Pitch between two rows in bytes */
+        } pitch2D;                            /**< Device pointer with format, channel count, 2D extent and row pitch */
+        struct {
+            int reserved[32];                 /**< Reserved; makes the union at least 32 ints wide */
+        } reserved;
+    } res;                                    /**< Resource data; presumably the member in use is selected by resType - confirm */
+
+    unsigned int flags;                       /**< Flags (must be zero) */
+} CUDA_RESOURCE_DESC_v1;
+typedef CUDA_RESOURCE_DESC_v1 CUDA_RESOURCE_DESC;
+
+/**
+ * Texture descriptor: addressing, filtering, anisotropy, mipmap and border-color settings
+ */
+typedef struct CUDA_TEXTURE_DESC_st {
+    CUaddress_mode addressMode[3];  /**< Address modes (three entries) */
+    CUfilter_mode filterMode;       /**< Filter mode */
+    unsigned int flags;             /**< Flags */
+    unsigned int maxAnisotropy;     /**< Maximum anisotropy ratio */
+    CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */
+    float mipmapLevelBias;          /**< Mipmap level bias */
+    float minMipmapLevelClamp;      /**< Mipmap minimum level clamp */
+    float maxMipmapLevelClamp;      /**< Mipmap maximum level clamp */
+    float borderColor[4];           /**< Border color (four components) */
+    int reserved[12];               /**< Reserved */
+} CUDA_TEXTURE_DESC_v1;
+typedef CUDA_TEXTURE_DESC_v1 CUDA_TEXTURE_DESC;
+
+/**
+ * Resource view format (enumerators are numbered consecutively from 0x00 to 0x22)
+ */
+typedef enum CUresourceViewFormat_enum
+{
+    CU_RES_VIEW_FORMAT_NONE          = 0x00, /**< No resource view format (use underlying resource format) */
+    CU_RES_VIEW_FORMAT_UINT_1X8      = 0x01, /**< 1 channel unsigned 8-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_2X8      = 0x02, /**< 2 channel unsigned 8-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_4X8      = 0x03, /**< 4 channel unsigned 8-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_1X8      = 0x04, /**< 1 channel signed 8-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_2X8      = 0x05, /**< 2 channel signed 8-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_4X8      = 0x06, /**< 4 channel signed 8-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_1X16     = 0x07, /**< 1 channel unsigned 16-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_2X16     = 0x08, /**< 2 channel unsigned 16-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_4X16     = 0x09, /**< 4 channel unsigned 16-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_1X16     = 0x0a, /**< 1 channel signed 16-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_2X16     = 0x0b, /**< 2 channel signed 16-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_4X16     = 0x0c, /**< 4 channel signed 16-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_1X32     = 0x0d, /**< 1 channel unsigned 32-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_2X32     = 0x0e, /**< 2 channel unsigned 32-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_4X32     = 0x0f, /**< 4 channel unsigned 32-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_1X32     = 0x10, /**< 1 channel signed 32-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_2X32     = 0x11, /**< 2 channel signed 32-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_4X32     = 0x12, /**< 4 channel signed 32-bit integers */
+    CU_RES_VIEW_FORMAT_FLOAT_1X16    = 0x13, /**< 1 channel 16-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_2X16    = 0x14, /**< 2 channel 16-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_4X16    = 0x15, /**< 4 channel 16-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_1X32    = 0x16, /**< 1 channel 32-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_2X32    = 0x17, /**< 2 channel 32-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_4X32    = 0x18, /**< 4 channel 32-bit floating point */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC1  = 0x19, /**< Block compressed 1 */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC2  = 0x1a, /**< Block compressed 2 */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC3  = 0x1b, /**< Block compressed 3 */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC4  = 0x1c, /**< Block compressed 4 unsigned */
+    CU_RES_VIEW_FORMAT_SIGNED_BC4    = 0x1d, /**< Block compressed 4 signed */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC5  = 0x1e, /**< Block compressed 5 unsigned */
+    CU_RES_VIEW_FORMAT_SIGNED_BC5    = 0x1f, /**< Block compressed 5 signed */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */
+    CU_RES_VIEW_FORMAT_SIGNED_BC6H   = 0x21, /**< Block compressed 6 signed half-float */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC7  = 0x22  /**< Block compressed 7 */
+} CUresourceViewFormat;
+
+/**
+ * Resource view descriptor: view format, extent, and the mipmap-level and layer ranges of the view
+ */
+typedef struct CUDA_RESOURCE_VIEW_DESC_st
+{
+    CUresourceViewFormat format;   /**< Resource view format */
+    size_t width;                  /**< Width of the resource view */
+    size_t height;                 /**< Height of the resource view */
+    size_t depth;                  /**< Depth of the resource view */
+    unsigned int firstMipmapLevel; /**< First defined mipmap level */
+    unsigned int lastMipmapLevel;  /**< Last defined mipmap level */
+    unsigned int firstLayer;       /**< First layer index */
+    unsigned int lastLayer;        /**< Last layer index */
+    unsigned int reserved[16];     /**< Reserved */
+} CUDA_RESOURCE_VIEW_DESC_v1;
+typedef CUDA_RESOURCE_VIEW_DESC_v1 CUDA_RESOURCE_VIEW_DESC;
+
+/**
+ * Size of the ::CUtensorMap descriptor, counted in cuuint64_t elements of its opaque array
+ */
+#define CU_TENSOR_MAP_NUM_QWORDS 16
+
+/**
+ * Tensor map descriptor: an opaque array of CU_TENSOR_MAP_NUM_QWORDS cuuint64_t words. Requires compiler support for aligning to 64 bytes (C++11 alignas or C11 _Alignas); otherwise no alignment specifier is emitted.
+ */
+typedef struct CUtensorMap_st {
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+    alignas(64)
+#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* defined() guard avoids -Wundef when __STDC_VERSION__ is not defined */
+    _Alignas(64)
+#endif
+    cuuint64_t opaque[CU_TENSOR_MAP_NUM_QWORDS];
+} CUtensorMap;
+
+/**
+ * Tensor map data type (enumerators after the first take consecutive implicit values)
+ */
+typedef enum CUtensorMapDataType_enum {
+    CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0,
+    CU_TENSOR_MAP_DATA_TYPE_UINT16,        /* = 1 */
+    CU_TENSOR_MAP_DATA_TYPE_UINT32,        /* = 2 */
+    CU_TENSOR_MAP_DATA_TYPE_INT32,         /* = 3 */
+    CU_TENSOR_MAP_DATA_TYPE_UINT64,        /* = 4 */
+    CU_TENSOR_MAP_DATA_TYPE_INT64,         /* = 5 */
+    CU_TENSOR_MAP_DATA_TYPE_FLOAT16,       /* = 6 */
+    CU_TENSOR_MAP_DATA_TYPE_FLOAT32,       /* = 7 */
+    CU_TENSOR_MAP_DATA_TYPE_FLOAT64,       /* = 8 */
+    CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,      /* = 9 */
+    CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ,   /* = 10 */
+    CU_TENSOR_MAP_DATA_TYPE_TFLOAT32,      /* = 11 */
+    CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ,  /* = 12 */
+    CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B,  /* = 13 */
+    CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, /* = 14 */
+    CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B  /* = 15 */
+} CUtensorMapDataType;
+
+/**
+ * Tensor map interleave layout type (enumerators after the first take consecutive implicit values)
+ */
+typedef enum CUtensorMapInterleave_enum {
+    CU_TENSOR_MAP_INTERLEAVE_NONE = 0,
+    CU_TENSOR_MAP_INTERLEAVE_16B,  /* = 1 */
+    CU_TENSOR_MAP_INTERLEAVE_32B   /* = 2 */
+} CUtensorMapInterleave;
+
+/**
+ * Tensor map swizzling mode of shared memory banks (enumerators after the first take consecutive implicit values)
+ */
+typedef enum CUtensorMapSwizzle_enum {
+    CU_TENSOR_MAP_SWIZZLE_NONE = 0,
+    CU_TENSOR_MAP_SWIZZLE_32B,                   /* = 1 */
+    CU_TENSOR_MAP_SWIZZLE_64B,                   /* = 2 */
+    CU_TENSOR_MAP_SWIZZLE_128B,                  /* = 3 */
+    CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B,         /* = 4 */
+    CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B_FLIP_8B, /* = 5 */
+    CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B          /* = 6 */
+} CUtensorMapSwizzle;
+
+/**
+ * Tensor map L2 promotion type (enumerators after the first take consecutive implicit values)
+ */
+typedef enum CUtensorMapL2promotion_enum {
+    CU_TENSOR_MAP_L2_PROMOTION_NONE = 0,
+    CU_TENSOR_MAP_L2_PROMOTION_L2_64B,  /* = 1 */
+    CU_TENSOR_MAP_L2_PROMOTION_L2_128B, /* = 2 */
+    CU_TENSOR_MAP_L2_PROMOTION_L2_256B  /* = 3 */
+} CUtensorMapL2promotion;
+
+/**
+ * Tensor map out-of-bounds fill type (the second enumerator takes the implicit value 1)
+ */
+typedef enum CUtensorMapFloatOOBfill_enum {
+    CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0,
+    CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA /* = 1 */
+} CUtensorMapFloatOOBfill;
+
+/**
+ * Tensor map Im2Col wide mode (the second enumerator takes the implicit value 1)
+ */
+typedef enum CUtensorMapIm2ColWideMode_enum {
+    CU_TENSOR_MAP_IM2COL_WIDE_MODE_W = 0,
+    CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128 /* = 1 */
+} CUtensorMapIm2ColWideMode;
+
+/**
+ * GPU Direct v3 tokens
+ */
+typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st {
+    unsigned long long p2pToken;  /**< P2P token */
+    unsigned int vaSpaceToken;    /**< VA space token (presumably "virtual address space" - confirm) */
+} CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1;
+typedef CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1 CUDA_POINTER_ATTRIBUTE_P2P_TOKENS;
+
+/**
+ * Access flags that specify the level of access the current context's device has
+ * on the memory referenced.
+ */
+typedef enum CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS_enum {
+    CU_POINTER_ATTRIBUTE_ACCESS_FLAG_NONE      = 0x0,   /**< No access, meaning the device cannot access this memory at all, thus must be staged through accessible memory in order to complete certain operations */
+    CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READ      = 0x1,   /**< Read-only access, meaning writes to this memory are considered invalid accesses and thus return error in that case. */
+    CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READWRITE = 0x3    /**< Read-write access, the device has full read-write access to the memory */
+} CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS;
+
+/**
+ * Kernel launch parameters: kernel, grid and block dimensions, dynamic shared memory size, stream and kernel arguments
+ */
+typedef struct CUDA_LAUNCH_PARAMS_st {
+    CUfunction function;         /**< Kernel to launch */
+    unsigned int gridDimX;       /**< Width of grid in blocks */
+    unsigned int gridDimY;       /**< Height of grid in blocks */
+    unsigned int gridDimZ;       /**< Depth of grid in blocks */
+    unsigned int blockDimX;      /**< X dimension of each thread block */
+    unsigned int blockDimY;      /**< Y dimension of each thread block */
+    unsigned int blockDimZ;      /**< Z dimension of each thread block */
+    unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */
+    CUstream hStream;            /**< Stream identifier */
+    void **kernelParams;         /**< Array of pointers to kernel parameters */
+} CUDA_LAUNCH_PARAMS_v1;
+typedef CUDA_LAUNCH_PARAMS_v1 CUDA_LAUNCH_PARAMS;
+
+/**
+ * External memory handle types (explicitly numbered 1 through 8)
+ */
+typedef enum CUexternalMemoryHandleType_enum {
+    /**
+     * Handle is an opaque file descriptor
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD          = 1,
+    /**
+     * Handle is an opaque shared NT handle
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32       = 2,
+    /**
+     * Handle is an opaque, globally shared handle
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT   = 3,
+    /**
+     * Handle is a D3D12 heap object
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP         = 4,
+    /**
+     * Handle is a D3D12 committed resource
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE     = 5,
+    /**
+     * Handle is a shared NT handle to a D3D11 resource
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE     = 6,
+    /**
+     * Handle is a globally shared handle to a D3D11 resource
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7,
+    /**
+     * Handle is an NvSciBuf object (last enumerator: no trailing comma, for C89/C++03 pedantic builds)
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF           = 8
+} CUexternalMemoryHandleType;
+
+/**
+ * Flag for ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::flags: indicates that the external memory object is a dedicated resource
+ */
+#define CUDA_EXTERNAL_MEMORY_DEDICATED   0x1
+
+/** When the \p flags member of ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS
+ * contains this flag, it indicates that signaling an external semaphore object
+ * should skip performing appropriate memory synchronization operations over all
+ * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF,
+ * which otherwise are performed by default to ensure data coherency with other
+ * importers of the same NvSciBuf memory objects.
+ */
+#define CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC 0x01
+
+/** When the \p flags member of ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS
+ * contains this flag, it indicates that waiting on an external semaphore object
+ * should skip performing appropriate memory synchronization operations over all
+ * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF,
+ * which otherwise are performed by default to ensure data coherency with other
+ * importers of the same NvSciBuf memory objects.
+ */
+#define CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC 0x02
+
+/**
+ * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this,
+ * it indicates that the application needs signaler-specific NvSciSyncAttr
+ * to be filled by ::cuDeviceGetNvSciSyncAttributes.
+ */
+#define CUDA_NVSCISYNC_ATTR_SIGNAL 0x1
+
+/**
+ * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this,
+ * it indicates that the application needs waiter-specific NvSciSyncAttr
+ * to be filled by ::cuDeviceGetNvSciSyncAttributes.
+ */
+#define CUDA_NVSCISYNC_ATTR_WAIT 0x2
+/**
+ * External memory handle descriptor
+ */
+typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st {
+    /**
+     * Type of the handle
+     */
+    CUexternalMemoryHandleType type;
+    union {
+        /**
+         * File descriptor referencing the memory object. Valid
+         * when type is
+         * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD
+         */
+        int fd;
+        /**
+         * Win32 handle referencing the memory object. Valid when
+         * type is one of the following:
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT
+         * Exactly one of 'handle' and 'name' must be non-NULL. If
+         * type is one of the following:
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT
+         * then 'name' must be NULL.
+         */
+        struct {
+            /**
+             * Valid NT handle. Must be NULL if 'name' is non-NULL
+             */
+            void *handle;
+            /**
+             * Name of a valid memory object.
+             * Must be NULL if 'handle' is non-NULL.
+             */
+            const void *name;
+        } win32;
+        /**
+         * A handle representing an NvSciBuf Object. Valid when type
+         * is ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF
+         */
+        const void *nvSciBufObject;
+    } handle;
+    /**
+     * Size of the memory allocation
+     */
+    unsigned long long size;
+    /**
+     * Flags must either be zero or ::CUDA_EXTERNAL_MEMORY_DEDICATED
+     */
+    unsigned int flags;
+    unsigned int reserved[16]; /**< Reserved */
+} CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1;
+typedef CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 CUDA_EXTERNAL_MEMORY_HANDLE_DESC;
+
+/**
+ * External memory buffer descriptor: offset and size of a buffer within a memory object
+ */
+typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st {
+    /**
+     * Offset into the memory object where the buffer's base is
+     */
+    unsigned long long offset;
+    /**
+     * Size of the buffer
+     */
+    unsigned long long size;
+    /**
+     * Flags reserved for future use. Must be zero.
+     */
+    unsigned int flags;
+    unsigned int reserved[16]; /**< Reserved */
+} CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1;
+typedef CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 CUDA_EXTERNAL_MEMORY_BUFFER_DESC;
+
+/**
+ * External memory mipmap descriptor: offset, base-level array description and level count of a mipmap chain
+ */
+typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st {
+    /**
+     * Offset into the memory object where the base level of the
+     * mipmap chain is.
+     */
+    unsigned long long offset;
+    /**
+     * Format, dimension and type of base level of the mipmap chain
+     */
+    CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+    /**
+     * Total number of levels in the mipmap chain
+     */
+    unsigned int numLevels;
+    unsigned int reserved[16]; /**< Reserved */
+} CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1;
+typedef CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC;
+
+/**
+ * External semaphore handle types (explicitly numbered 1 through 10)
+ */
+typedef enum CUexternalSemaphoreHandleType_enum {
+    /**
+     * Handle is an opaque file descriptor
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD             = 1,
+    /**
+     * Handle is an opaque shared NT handle
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32          = 2,
+    /**
+     * Handle is an opaque, globally shared handle
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT      = 3,
+    /**
+     * Handle is a shared NT handle referencing a D3D12 fence object
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE           = 4,
+    /**
+     * Handle is a shared NT handle referencing a D3D11 fence object
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE           = 5,
+    /**
+     * Handle is an opaque handle to an NvSciSync object
+     */
+	CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC             = 6,
+    /**
+     * Handle is a shared NT handle referencing a D3D11 keyed mutex object
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX     = 7,
+    /**
+     * Handle is a globally shared handle referencing a D3D11 keyed mutex object
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8,
+    /**
+     * Handle is an opaque file descriptor referencing a timeline semaphore
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD = 9,
+    /**
+     * Handle is an opaque shared NT handle referencing a timeline semaphore
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10
+} CUexternalSemaphoreHandleType;
+
+/**
+ * External semaphore handle descriptor
+ */
+typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st {
+    /**
+     * Type of the handle
+     */
+    CUexternalSemaphoreHandleType type;
+    union {
+        /**
+         * File descriptor referencing the semaphore object. Valid
+         * when type is one of the following:
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD
+         */
+        int fd;
+        /**
+         * Win32 handle referencing the semaphore object. Valid when
+         * type is one of the following:
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32
+         * Exactly one of 'handle' and 'name' must be non-NULL. If
+         * type is one of the following:
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT
+         * then 'name' must be NULL.
+         */
+        struct {
+            /**
+             * Valid NT handle. Must be NULL if 'name' is non-NULL
+             */
+            void *handle;
+            /**
+             * Name of a valid synchronization primitive.
+             * Must be NULL if 'handle' is non-NULL.
+             */
+            const void *name;
+        } win32;
+        /**
+         * Valid NvSciSyncObj. Must be non-NULL.
+         */
+        const void* nvSciSyncObj;
+    } handle;
+    /**
+     * Flags reserved for the future. Must be zero.
+     */
+    unsigned int flags;
+    unsigned int reserved[16]; /**< Reserved */
+} CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1;
+typedef CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC;
+
+/**
+ * External semaphore signal parameters
+ */
+typedef struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st {
+    struct {
+        /**
+         * Parameters for fence objects
+         */
+        struct {
+            /**
+             * Value of fence to be signaled
+             */
+            unsigned long long value;
+        } fence;
+        union {
+            /**
+             * Pointer to NvSciSyncFence. Valid if ::CUexternalSemaphoreHandleType
+             * is of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC.
+             */
+            void *fence;
+            unsigned long long reserved;
+        } nvSciSync;
+        /**
+         * Parameters for keyed mutex objects
+         */
+        struct {
+            /**
+             * Value of key to release the mutex with
+             */
+            unsigned long long key;
+        } keyedMutex;
+        unsigned int reserved[12]; /**< Reserved */
+    } params;
+    /**
+     * Only when ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to
+     * signal a ::CUexternalSemaphore of type
+     * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is
+     * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which indicates
+     * that while signaling the ::CUexternalSemaphore, no memory synchronization
+     * operations should be performed for any external memory object imported
+     * as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF.
+     * For all other types of ::CUexternalSemaphore, flags must be zero.
+     */
+    unsigned int flags;
+    unsigned int reserved[16]; /**< Reserved */
+} CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1;
+typedef CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS;
+
+/**
+ * External semaphore wait parameters
+ */
+typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st {
+    struct {
+        /**
+         * Parameters for fence objects
+         */
+        struct {
+            /**
+             * Value of fence to be waited on
+             */
+            unsigned long long value;
+        } fence;
+        /**
+         * Pointer to NvSciSyncFence. Valid if ::CUexternalSemaphoreHandleType
+         * is of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC.
+         */
+        union {
+            void *fence;
+            unsigned long long reserved;
+        } nvSciSync;
+        /**
+         * Parameters for keyed mutex objects
+         */
+        struct {
+            /**
+             * Value of key to acquire the mutex with
+             */
+            unsigned long long key;
+            /**
+             * Timeout in milliseconds to wait to acquire the mutex
+             */
+            unsigned int timeoutMs;
+        } keyedMutex;
+        unsigned int reserved[10]; /**< Reserved */
+    } params;
+    /**
+     * Only when ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on
+     * a ::CUexternalSemaphore of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC,
+     * the valid flag is ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC
+     * which indicates that while waiting for the ::CUexternalSemaphore, no memory
+     * synchronization operations should be performed for any external memory
+     * object imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF.
+     * For all other types of ::CUexternalSemaphore, flags must be zero.
+     */
+    unsigned int flags;
+    unsigned int reserved[16]; /**< Reserved */
+} CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1;
+typedef CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS;
+
+/**
+ * Semaphore signal node parameters (v1 struct; ::CUDA_EXT_SEM_SIGNAL_NODE_PARAMS is its unversioned name)
+ */
+typedef struct CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st {
+    CUexternalSemaphore* extSemArray;                         /**< Array of external semaphore handles. */
+    const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS* paramsArray; /**< Array of external semaphore signal parameters. */
+    unsigned int numExtSems;                                  /**< Number of handles and parameters supplied in extSemArray and paramsArray. */
+} CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1;
+typedef CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 CUDA_EXT_SEM_SIGNAL_NODE_PARAMS;
+
+/**
+ * Semaphore signal node parameters (v2 struct; declares the same three members as ::CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1)
+ */
+typedef struct CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2_st {
+    CUexternalSemaphore* extSemArray;                         /**< Array of external semaphore handles. */
+    const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS* paramsArray; /**< Array of external semaphore signal parameters. */
+    unsigned int numExtSems;                                  /**< Number of handles and parameters supplied in extSemArray and paramsArray. */
+} CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2;
+
+/**
+ * Semaphore wait node parameters (v1 struct; ::CUDA_EXT_SEM_WAIT_NODE_PARAMS is its unversioned name)
+ */
+typedef struct CUDA_EXT_SEM_WAIT_NODE_PARAMS_st {
+    CUexternalSemaphore* extSemArray;                       /**< Array of external semaphore handles. */
+    const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS* paramsArray; /**< Array of external semaphore wait parameters. */
+    unsigned int numExtSems;                                /**< Number of handles and parameters supplied in extSemArray and paramsArray. */
+} CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1;
+typedef CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 CUDA_EXT_SEM_WAIT_NODE_PARAMS;
+
+/**
+ * Semaphore wait node parameters (v2 struct; declares the same three members as ::CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1)
+ */
+typedef struct CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2_st {
+    CUexternalSemaphore* extSemArray;                       /**< Array of external semaphore handles. */
+    const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS* paramsArray; /**< Array of external semaphore wait parameters. */
+    unsigned int numExtSems;                                /**< Number of handles and parameters supplied in extSemArray and paramsArray. */
+} CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2;
+
+typedef unsigned long long CUmemGenericAllocationHandle_v1; /**< Generic memory allocation handle, stored as an unsigned long long */
+typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle; /**< Unversioned name for the _v1 handle type */
+
+/**
+ * Flags for specifying particular handle types
+ */
+typedef enum CUmemAllocationHandleType_enum {
+    CU_MEM_HANDLE_TYPE_NONE                  = 0x0,  /**< Does not allow any export mechanism. */
+    CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1,  /**< Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) */
+    CU_MEM_HANDLE_TYPE_WIN32                 = 0x2,  /**< Allows a Win32 NT handle to be used for exporting. (HANDLE) */
+    CU_MEM_HANDLE_TYPE_WIN32_KMT             = 0x4,  /**< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) */
+    CU_MEM_HANDLE_TYPE_FABRIC                = 0x8,  /**< Allows a fabric handle to be used for exporting. (CUmemFabricHandle) */
+    CU_MEM_HANDLE_TYPE_MAX                   = 0x7FFFFFFF
+} CUmemAllocationHandleType;
+
+/**
+ * Specifies the memory protection flags for mapping an address range.
+ */
+typedef enum CUmemAccess_flags_enum {
+    CU_MEM_ACCESS_FLAGS_PROT_NONE        = 0x0,  /**< Default, make the address range not accessible */
+    CU_MEM_ACCESS_FLAGS_PROT_READ        = 0x1,  /**< Make the address range read accessible */
+    CU_MEM_ACCESS_FLAGS_PROT_READWRITE   = 0x3,  /**< Make the address range read-write accessible */
+    CU_MEM_ACCESS_FLAGS_PROT_MAX         = 0x7FFFFFFF
+} CUmemAccess_flags;
+
+/**
+ * Specifies the type of location, which determines how the location's id is interpreted
+ */
+typedef enum CUmemLocationType_enum {
+    CU_MEM_LOCATION_TYPE_INVALID    = 0x0,
+    CU_MEM_LOCATION_TYPE_DEVICE     = 0x1,  /**< Location is a device location, thus id is a device ordinal */
+    CU_MEM_LOCATION_TYPE_HOST       = 0x2,  /**< Location is host, id is ignored */
+    CU_MEM_LOCATION_TYPE_HOST_NUMA  = 0x3,  /**< Location is a host NUMA node, thus id is a host NUMA node id */
+    CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT = 0x4,  /**< Location is a host NUMA node of the current thread, id is ignored */
+    CU_MEM_LOCATION_TYPE_MAX        = 0x7FFFFFFF
+} CUmemLocationType;
+
+/**
+ * Defines the allocation types available
+ */
+typedef enum CUmemAllocationType_enum {
+    CU_MEM_ALLOCATION_TYPE_INVALID = 0x0,
+
+    /** This allocation type is 'pinned', i.e. cannot migrate from its current
+     * location while the application is actively using it
+     */
+    CU_MEM_ALLOCATION_TYPE_PINNED  = 0x1,
+    CU_MEM_ALLOCATION_TYPE_MAX     = 0x7FFFFFFF
+} CUmemAllocationType;
+
+/**
+ * Flag for requesting different optimal and required granularities for an allocation.
+ */
+typedef enum CUmemAllocationGranularity_flags_enum {
+    CU_MEM_ALLOC_GRANULARITY_MINIMUM     = 0x0,     /**< Minimum required granularity for allocation */
+    CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1      /**< Recommended granularity for allocation for best performance */
+} CUmemAllocationGranularity_flags;
+
+/**
+ * Specifies the handle type for an address range
+ */
+typedef enum CUmemRangeHandleType_enum
+{
+    CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD = 0x1,
+    CU_MEM_RANGE_HANDLE_TYPE_MAX        = 0x7FFFFFFF
+} CUmemRangeHandleType;
+
+/**
+ * Flag for requesting handle type for an address range.
+ */
+typedef enum CUmemRangeFlags_enum {
+    CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE     = 0x1   /**< Indicates that DMA_BUF handle should be mapped via PCIe BAR1 */
+} CUmemRangeFlags;
+
+/**
+ * Sparse subresource types: a sparse level or the mip tail
+ */
+typedef enum CUarraySparseSubresourceType_enum {
+    CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0,
+    CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1
+} CUarraySparseSubresourceType;
+
+/**
+ * Memory operation types: map or unmap (numbering starts at 1; no enumerator has the value 0)
+ */
+typedef enum CUmemOperationType_enum {
+    CU_MEM_OPERATION_TYPE_MAP = 1,
+    CU_MEM_OPERATION_TYPE_UNMAP = 2
+} CUmemOperationType;
+
+/**
+ * Memory handle types (a single enumerator, the generic handle type)
+ */
+typedef enum CUmemHandleType_enum {
+    CU_MEM_HANDLE_TYPE_GENERIC = 0
+} CUmemHandleType;
+
+/**
+ * Specifies the CUDA array or CUDA mipmapped array memory mapping information
+ */
+typedef struct CUarrayMapInfo_st {    
+    CUresourcetype resourceType;                    /**< Resource type */
+
+    union {
+        CUmipmappedArray mipmap;                    /**< CUDA mipmapped array handle */
+        CUarray array;                              /**< CUDA array handle */
+    } resource;                                     /**< Resource to map; NOTE(review): the active member presumably follows resourceType - confirm */
+
+    CUarraySparseSubresourceType subresourceType;   /**< Sparse subresource type */
+
+    union {
+        struct {
+            unsigned int level;                     /**< For CUDA mipmapped arrays must be a valid mipmap level. For CUDA arrays must be zero */
+            unsigned int layer;                     /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */
+            unsigned int offsetX;                   /**< Starting X offset in elements */
+            unsigned int offsetY;                   /**< Starting Y offset in elements */
+            unsigned int offsetZ;                   /**< Starting Z offset in elements */
+            unsigned int extentWidth;               /**< Width in elements */
+            unsigned int extentHeight;              /**< Height in elements */
+            unsigned int extentDepth;               /**< Depth in elements */
+        } sparseLevel;
+        struct {
+            unsigned int layer;                     /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */
+            unsigned long long offset;              /**< Offset within mip tail */
+            unsigned long long size;                /**< Extent in bytes */
+        } miptail;
+    } subresource;                                  /**< Subresource description; NOTE(review): the active member presumably follows subresourceType - confirm */
+    
+    CUmemOperationType memOperationType;            /**< Memory operation type */
+    CUmemHandleType memHandleType;                  /**< Memory handle type */
+
+    union {
+        CUmemGenericAllocationHandle memHandle;     /**< Generic allocation handle (the only member of this union) */
+    } memHandle;
+    
+    unsigned long long offset;                      /**< Offset within the memory */
+    unsigned int deviceBitMask;                     /**< Device ordinal bit mask */
+    unsigned int flags;                             /**< flags for future use, must be zero now. */
+    unsigned int reserved[2];                       /**< Reserved for future use, must be zero now. */
+} CUarrayMapInfo_v1;
+typedef CUarrayMapInfo_v1 CUarrayMapInfo;
+
+/**
+ * Specifies a memory location.
+ */
+typedef struct CUmemLocation_st {
+    CUmemLocationType type; /**< Specifies the location type, which modifies the meaning of id. */
+    int id;                 /**< Identifier for this location, interpreted according to its ::CUmemLocationType. */
+} CUmemLocation_v1;
+typedef CUmemLocation_v1 CUmemLocation;
+
+/**
+ * Specifies compression attribute for an allocation.
+ */
+typedef enum CUmemAllocationCompType_enum {
+    CU_MEM_ALLOCATION_COMP_NONE = 0x0, /**< Allocating non-compressible memory */
+    CU_MEM_ALLOCATION_COMP_GENERIC = 0x1 /**< Allocating compressible memory */
+} CUmemAllocationCompType;
+
+/**
+ * This flag, if set, indicates that the memory will be used as a tile pool.
+ */
+#define CU_MEM_CREATE_USAGE_TILE_POOL    0x1
+/**
+ * This flag, if set, indicates that the memory will be used as a buffer for
+ * hardware accelerated decompression.
+ */
+#define CU_MEM_CREATE_USAGE_HW_DECOMPRESS 0x2
+
+/**
+* Specifies the allocation properties for an allocation.
+*/
+typedef struct CUmemAllocationProp_st {
+    /** Allocation type */
+    CUmemAllocationType type;
+    /** requested ::CUmemAllocationHandleType */
+    CUmemAllocationHandleType requestedHandleTypes;
+    /** Location of allocation */
+    CUmemLocation location;
+    /**
+     * Windows-specific POBJECT_ATTRIBUTES required when
+     * ::CU_MEM_HANDLE_TYPE_WIN32 is specified.  This object attributes structure
+     * includes security attributes that define
+     * the scope of which exported allocations may be transferred to other
+     * processes.  In all other cases, this field is required to be zero.
+     */
+    void *win32HandleMetaData;
+    struct {
+         /**
+         * Allocation hint for requesting compressible memory.
+         * On devices that support Compute Data Compression, compressible
+         * memory can be used to accelerate accesses to data with unstructured
+         * sparsity and other compressible data patterns. Applications are 
+         * expected to query allocation property of the handle obtained with 
+         * ::cuMemCreate using ::cuMemGetAllocationPropertiesFromHandle to 
+         * validate if the obtained allocation is compressible or not. Note that 
+         * compressed memory may not be mappable on all devices.
+         */
+         unsigned char compressionType;
+         unsigned char gpuDirectRDMACapable; /* NOTE(review): undocumented here; presumably non-zero requests memory usable with GPUDirect RDMA - confirm */
+         /** Bitmask indicating intended usage for this allocation */
+         unsigned short usage;
+         unsigned char reserved[4]; /* NOTE(review): undocumented here; presumably must be zero like other reserved fields in this header - confirm */
+    } allocFlags;
+} CUmemAllocationProp_v1;
+typedef CUmemAllocationProp_v1 CUmemAllocationProp;
+
+/**
+* Flags for querying different granularities for a multicast object
+*/
+typedef enum CUmulticastGranularity_flags_enum {
+    CU_MULTICAST_GRANULARITY_MINIMUM     = 0x0,     /**< Minimum required granularity */
+    CU_MULTICAST_GRANULARITY_RECOMMENDED = 0x1      /**< Recommended granularity for best performance */
+} CUmulticastGranularity_flags;
+
+/**
+* Specifies the properties for a multicast object.
+*/
+typedef struct CUmulticastObjectProp_st {
+    /**
+     * The number of devices in the multicast team that will bind memory to this
+     * object
+     */
+    unsigned int numDevices;
+    /** 
+     * The maximum amount of memory that can be bound to this multicast object
+     * per device
+     */
+    size_t size;
+    /**
+     * Bitmask of exportable handle types (see ::CUmemAllocationHandleType) for
+     * this object
+     */
+    unsigned long long handleTypes;
+    /** 
+     * Flags for future use, must be zero now
+     */
+    unsigned long long flags;
+} CUmulticastObjectProp_v1;
+typedef CUmulticastObjectProp_v1 CUmulticastObjectProp;
+
+/**
+ * Memory access descriptor
+ */
+typedef struct CUmemAccessDesc_st {
+    CUmemLocation location;        /**< Location on which the request is to change its accessibility */
+    CUmemAccess_flags flags;       /**< ::CUmemAccess_flags accessibility flags to set on the request */
+} CUmemAccessDesc_v1;
+typedef CUmemAccessDesc_v1 CUmemAccessDesc;
+
+/**
+ * CUDA Graph Update error types
+ */
+typedef enum CUgraphExecUpdateResult_enum {
+    CU_GRAPH_EXEC_UPDATE_SUCCESS                     = 0x0, /**< The update succeeded */
+    CU_GRAPH_EXEC_UPDATE_ERROR                       = 0x1, /**< The update failed for an unexpected reason which is described in the return value of the function */
+    CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED      = 0x2, /**< The update failed because the topology changed */
+    CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED     = 0x3, /**< The update failed because a node type changed */
+    CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED      = 0x4, /**< The update failed because the function of a kernel node changed (CUDA driver < 11.2) */
+    CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED    = 0x5, /**< The update failed because the parameters changed in a way that is not supported */
+    CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED         = 0x6, /**< The update failed because something about the node is not supported */
+    CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE = 0x7, /**< The update failed because the function of a kernel node changed in an unsupported way */
+    CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED    = 0x8  /**< The update failed because the node attributes changed in a way that is not supported */
+} CUgraphExecUpdateResult;
+
+/**
+ * Result information returned by cuGraphExecUpdate
+ */
+typedef struct CUgraphExecUpdateResultInfo_st {
+    /**
+     * Gives more specific detail when a cuda graph update fails.
+     */
+    CUgraphExecUpdateResult result;
+
+    /**
+     * The "to node" of the error edge when the topologies do not match.
+     * The error node when the error is associated with a specific node.
+     * NULL when the error is generic.
+     */
+    CUgraphNode errorNode;
+
+    /**
+     * The from node of error edge when the topologies do not match. Otherwise NULL.
+     */
+    CUgraphNode errorFromNode;
+} CUgraphExecUpdateResultInfo_v1; 
+typedef CUgraphExecUpdateResultInfo_v1 CUgraphExecUpdateResultInfo;
+
+/**
+ * CUDA memory pool attributes
+ */
+typedef enum CUmemPool_attribute_enum {
+    /**
+     * (value type = int)
+     * Allow cuMemAllocAsync to use memory asynchronously freed
+     * in other streams as long as a stream ordering dependency
+     * of the allocating stream on the free action exists.
+     * CUDA events and null stream interactions can create the required
+     * stream ordered dependencies. (default enabled)
+     */
+    CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES = 1,
+
+    /**
+     * (value type = int)
+     * Allow reuse of already completed frees when there is no dependency
+     * between the free and allocation. (default enabled)
+     */
+    CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC,
+
+    /**
+     * (value type = int)
+     * Allow cuMemAllocAsync to insert new stream dependencies
+     * in order to establish the stream ordering required to reuse
+     * a piece of memory released by ::cuMemFreeAsync (default enabled).
+     */
+    CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of reserved memory in bytes to hold onto before trying
+     * to release memory back to the OS. When more than the release
+     * threshold bytes of memory are held by the memory pool, the
+     * allocator will try to release memory back to the OS on the
+     * next call to stream, event or context synchronize. (default 0)
+     */
+    CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of backing memory currently allocated for the mempool.
+     */
+    CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of backing memory allocated for the mempool since the
+     * last time it was reset. High watermark can only be reset to zero.
+     */
+    CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of memory from the pool that is currently in use by the application.
+     */
+    CU_MEMPOOL_ATTR_USED_MEM_CURRENT,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of the amount of memory from the pool that was in use by the application since
+     * the last time it was reset. High watermark can only be reset to zero.
+     */
+    CU_MEMPOOL_ATTR_USED_MEM_HIGH
+} CUmemPool_attribute;
+
+/**
+ * This flag, if set, indicates that the memory will be used as a buffer for
+ * hardware accelerated decompression.
+ */
+#define CU_MEM_POOL_CREATE_USAGE_HW_DECOMPRESS 0x2
+
+/**
+ * Specifies the properties of allocations made from the pool.
+ */
+typedef struct CUmemPoolProps_st {
+    CUmemAllocationType allocType;         /**< Allocation type. Currently must be specified as CU_MEM_ALLOCATION_TYPE_PINNED */
+    CUmemAllocationHandleType handleTypes; /**< Handle types that will be supported by allocations from the pool. */
+    CUmemLocation location;                /**< Location where allocations should reside. */
+    /**
+     * Windows-specific LPSECURITYATTRIBUTES required when
+     * ::CU_MEM_HANDLE_TYPE_WIN32 is specified.  This security attribute defines
+     * the scope of which exported allocations may be transferred to other
+     * processes.  In all other cases, this field is required to be zero.
+     */
+    void *win32SecurityAttributes;
+    size_t maxSize;             /**< Maximum pool size. When set to 0, defaults to a system dependent value. */
+    unsigned short usage;       /**< Bitmask indicating intended usage for the pool. */
+    unsigned char reserved[54]; /**< reserved for future use, must be 0 */
+} CUmemPoolProps_v1;
+typedef CUmemPoolProps_v1 CUmemPoolProps;
+
+/**
+ * Opaque data for exporting a pool allocation
+ */
+typedef struct CUmemPoolPtrExportData_st {
+    unsigned char reserved[64];
+} CUmemPoolPtrExportData_v1;
+typedef CUmemPoolPtrExportData_v1 CUmemPoolPtrExportData;
+
+/**
+ * Memory allocation node parameters
+ */
+typedef struct CUDA_MEM_ALLOC_NODE_PARAMS_v1_st {
+    /**
+    * in: location where the allocation should reside (specified in ::location).
+    * ::handleTypes must be ::CU_MEM_HANDLE_TYPE_NONE. IPC is not supported.
+    */
+    CUmemPoolProps poolProps;
+    const CUmemAccessDesc *accessDescs; /**< in: array of memory access descriptors. Used to describe peer GPU access */
+    size_t accessDescCount; /**< in: number of memory access descriptors.  Must not exceed the number of GPUs. */
+    size_t bytesize; /**< in: size in bytes of the requested allocation */
+    CUdeviceptr dptr; /**< out: address of the allocation returned by CUDA */
+} CUDA_MEM_ALLOC_NODE_PARAMS_v1;
+typedef CUDA_MEM_ALLOC_NODE_PARAMS_v1 CUDA_MEM_ALLOC_NODE_PARAMS;
+
+/**
+ * Memory allocation node parameters, version 2 (declares the same fields as ::CUDA_MEM_ALLOC_NODE_PARAMS_v1)
+ */
+typedef struct CUDA_MEM_ALLOC_NODE_PARAMS_v2_st {
+    /**
+    * in: location where the allocation should reside (specified in ::location).
+    * ::handleTypes must be ::CU_MEM_HANDLE_TYPE_NONE. IPC is not supported.
+    */
+    CUmemPoolProps poolProps;
+    const CUmemAccessDesc *accessDescs; /**< in: array of memory access descriptors. Used to describe peer GPU access */
+    size_t accessDescCount; /**< in: number of memory access descriptors.  Must not exceed the number of GPUs. */
+    size_t bytesize; /**< in: size in bytes of the requested allocation */
+    CUdeviceptr dptr; /**< out: address of the allocation returned by CUDA */
+} CUDA_MEM_ALLOC_NODE_PARAMS_v2;
+
+/**
+ * Memory free node parameters
+ */
+typedef struct CUDA_MEM_FREE_NODE_PARAMS_st {
+    CUdeviceptr dptr; /**< in: the pointer to free */
+} CUDA_MEM_FREE_NODE_PARAMS;
+
+typedef enum CUgraphMem_attribute_enum { /* Memory attributes of CUDA graphs; every attribute below is a cuuint64_t byte count */
+    /**
+     * (value type = cuuint64_t)
+     * Amount of memory, in bytes, currently associated with graphs
+     */
+    CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of memory, in bytes, associated with graphs since the
+     * last time it was reset.  High watermark can only be reset to zero.
+     */
+    CU_GRAPH_MEM_ATTR_USED_MEM_HIGH,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of memory, in bytes, currently allocated for use by
+     * the CUDA graphs asynchronous allocator.
+     */
+    CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of memory, in bytes, currently allocated for use by
+     * the CUDA graphs asynchronous allocator.
+     */
+    CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH
+} CUgraphMem_attribute;
+
+/**
+ * Child graph node parameters
+ */
+typedef struct CUDA_CHILD_GRAPH_NODE_PARAMS_st {
+    CUgraph graph; /**< The child graph to clone into the node for node creation, or
+                        a handle to the graph owned by the node for node query */
+} CUDA_CHILD_GRAPH_NODE_PARAMS;
+
+/**
+ * Event record node parameters
+ */
+typedef struct CUDA_EVENT_RECORD_NODE_PARAMS_st {
+    CUevent event; /**< The event to record when the node executes */
+} CUDA_EVENT_RECORD_NODE_PARAMS;
+
+/**
+ * Event wait node parameters
+ */
+typedef struct CUDA_EVENT_WAIT_NODE_PARAMS_st {
+    CUevent event; /**< The event to wait on from the node */
+} CUDA_EVENT_WAIT_NODE_PARAMS;
+
+/**
+ * Graph node parameters.  See ::cuGraphAddNode.
+ */
+typedef struct CUgraphNodeParams_st {
+    CUgraphNodeType type; /**< Type of the node */
+    int reserved0[3]; /**< Reserved. Must be zero. */
+
+    union {
+        long long                             reserved1[29]; /**< Padding. Unused bytes must be zero. */
+        CUDA_KERNEL_NODE_PARAMS_v3            kernel;        /**< Kernel node parameters. */
+        CUDA_MEMCPY_NODE_PARAMS               memcpy;        /**< Memcpy node parameters. */
+        CUDA_MEMSET_NODE_PARAMS_v2            memset;        /**< Memset node parameters. */
+        CUDA_HOST_NODE_PARAMS_v2              host;          /**< Host node parameters. */
+        CUDA_CHILD_GRAPH_NODE_PARAMS          graph;         /**< Child graph node parameters. */
+        CUDA_EVENT_WAIT_NODE_PARAMS           eventWait;     /**< Event wait node parameters. */
+        CUDA_EVENT_RECORD_NODE_PARAMS         eventRecord;   /**< Event record node parameters. */
+        CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2    extSemSignal;  /**< External semaphore signal node parameters. */
+        CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2      extSemWait;    /**< External semaphore wait node parameters. */
+        CUDA_MEM_ALLOC_NODE_PARAMS_v2         alloc;         /**< Memory allocation node parameters. */
+        CUDA_MEM_FREE_NODE_PARAMS             free;          /**< Memory free node parameters. */
+        CUDA_BATCH_MEM_OP_NODE_PARAMS_v2      memOp;         /**< MemOp node parameters. */
+        CUDA_CONDITIONAL_NODE_PARAMS          conditional;   /**< Conditional node parameters. */
+    };
+
+    long long reserved2; /**< Reserved bytes. Must be zero. */
+} CUgraphNodeParams;
+
+/**
+ * If set, each kernel launched as part of ::cuLaunchCooperativeKernelMultiDevice only
+ * waits for prior work in the stream corresponding to that GPU to complete before the
+ * kernel begins execution.
+ */
+#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC   0x01
+
+/**
+ * If set, any subsequent work pushed in a stream that participated in a call to
+ * ::cuLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on
+ * the GPU corresponding to that stream to complete before it begins execution.
+ */
+#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC  0x02
+
+/**
+ * If set, the CUDA array is a collection of layers, where each layer is either a 1D
+ * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number
+ * of layers, not the depth of a 3D array.
+ */
+#define CUDA_ARRAY3D_LAYERED        0x01
+
+/**
+ * Deprecated, use CUDA_ARRAY3D_LAYERED
+ */
+#define CUDA_ARRAY3D_2DARRAY        0x01
+
+/**
+ * This flag must be set in order to bind a surface reference
+ * to the CUDA array
+ */
+#define CUDA_ARRAY3D_SURFACE_LDST   0x02
+
+/**
+ * If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The
+ * width of such a CUDA array must be equal to its height, and Depth must be six.
+ * If ::CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps
+ * and Depth must be a multiple of six.
+ */
+#define CUDA_ARRAY3D_CUBEMAP        0x04
+
+/**
+ * This flag must be set in order to perform texture gather operations
+ * on a CUDA array.
+ */
+#define CUDA_ARRAY3D_TEXTURE_GATHER 0x08
+
+/**
+ * This flag, if set, indicates that the CUDA
+ * array is a DEPTH_TEXTURE.
+ */
+#define CUDA_ARRAY3D_DEPTH_TEXTURE 0x10
+
+/**
+ * This flag indicates that the CUDA array may be bound as a color target
+ * in an external graphics API
+ */
+#define CUDA_ARRAY3D_COLOR_ATTACHMENT 0x20
+
+/**
+ * This flag if set indicates that the CUDA array or CUDA mipmapped array
+ * is a sparse CUDA array or CUDA mipmapped array respectively
+ */
+#define CUDA_ARRAY3D_SPARSE 0x40
+
+/**
+ * This flag if set indicates that the CUDA array or CUDA mipmapped array
+ * will allow deferred memory mapping
+ */
+#define CUDA_ARRAY3D_DEFERRED_MAPPING 0x80
+
+/**
+ * This flag indicates that the CUDA array will be used for hardware accelerated
+ * video encode/decode operations.
+ */
+#define CUDA_ARRAY3D_VIDEO_ENCODE_DECODE 0x100
+
+/**
+ * Override the texref format with a format inferred from the array.
+ * Flag for ::cuTexRefSetArray()
+ */
+#define CU_TRSA_OVERRIDE_FORMAT 0x01
+
+/**
+ * Read the texture as integers rather than promoting the values to floats
+ * in the range [0,1].
+ * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate()
+ */
+#define CU_TRSF_READ_AS_INTEGER         0x01
+
+/**
+ * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
+ * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate()
+ */
+#define CU_TRSF_NORMALIZED_COORDINATES  0x02
+
+/**
+ * Perform sRGB->linear conversion during texture read.
+ * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate()
+ */
+#define CU_TRSF_SRGB  0x10
+
+ /**
+  * Disable any trilinear filtering optimizations.
+  * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate()
+  */
+#define CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION  0x20
+
+/**
+ * Enable seamless cube map filtering.
+ * Flag for ::cuTexObjectCreate()
+ */
+#define CU_TRSF_SEAMLESS_CUBEMAP  0x40
+
+/**
+ * C++ compile time constant for CU_LAUNCH_PARAM_END
+ */
+#define CU_LAUNCH_PARAM_END_AS_INT     0x00
+
+/**
+ * End of array terminator for the \p extra parameter to
+ * ::cuLaunchKernel
+ */
+#define CU_LAUNCH_PARAM_END            ((void*)CU_LAUNCH_PARAM_END_AS_INT)
+
+/**
+ * C++ compile time constant for CU_LAUNCH_PARAM_BUFFER_POINTER
+ */
+#define CU_LAUNCH_PARAM_BUFFER_POINTER_AS_INT 0x01
+
+/**
+ * Indicator that the next value in the \p extra parameter to
+ * ::cuLaunchKernel will be a pointer to a buffer containing all kernel
+ * parameters used for launching kernel \p f.  This buffer needs to
+ * honor all alignment/padding requirements of the individual parameters.
+ * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the
+ * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no
+ * effect.
+ */
+#define CU_LAUNCH_PARAM_BUFFER_POINTER        ((void*)CU_LAUNCH_PARAM_BUFFER_POINTER_AS_INT)
+
+/**
+ * C++ compile time constant for CU_LAUNCH_PARAM_BUFFER_SIZE
+ */
+#define CU_LAUNCH_PARAM_BUFFER_SIZE_AS_INT 0x02
+
+/**
+ * Indicator that the next value in the \p extra parameter to
+ * ::cuLaunchKernel will be a pointer to a size_t which contains the
+ * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER.
+ * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified
+ * in the \p extra array if the value associated with
+ * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero.
+ */
+#define CU_LAUNCH_PARAM_BUFFER_SIZE        ((void*)CU_LAUNCH_PARAM_BUFFER_SIZE_AS_INT)
+
+/**
+ * For texture references loaded into the module, use default texunit from
+ * texture reference.
+ */
+#define CU_PARAM_TR_DEFAULT -1
+
+/**
+ * Device that represents the CPU
+ */
+#define CU_DEVICE_CPU               ((CUdevice)-1)
+
+/**
+ * Device that represents an invalid device
+ */
+#define CU_DEVICE_INVALID           ((CUdevice)-2)
+
+/**
+ * Bitmasks for ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS
+ */
+typedef enum CUflushGPUDirectRDMAWritesOptions_enum {
+    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST   = 1<<0, /**< ::cuFlushGPUDirectRDMAWrites() and its CUDA Runtime API counterpart are supported on the device. */
+    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_MEMOPS = 1<<1  /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. */
+} CUflushGPUDirectRDMAWritesOptions;
+
+/**
+ * Platform native ordering for GPUDirect RDMA writes
+ */
+typedef enum CUGPUDirectRDMAWritesOrdering_enum {
+    CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE        = 0,   /**< The device does not natively support ordering of remote writes. ::cuFlushGPUDirectRDMAWrites() can be leveraged if supported. */
+    CU_GPU_DIRECT_RDMA_WRITES_ORDERING_OWNER       = 100, /**< Natively, the device can consistently consume remote writes, although other CUDA devices may not. */
+    CU_GPU_DIRECT_RDMA_WRITES_ORDERING_ALL_DEVICES = 200  /**< Any CUDA device in the system can consistently consume remote writes to this device. */
+} CUGPUDirectRDMAWritesOrdering;
+
+/**
+ * The scopes for ::cuFlushGPUDirectRDMAWrites
+ */
+typedef enum CUflushGPUDirectRDMAWritesScope_enum {
+    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER       = 100, /**< Blocks until remote writes are visible to the CUDA device context owning the data. */
+    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES = 200  /**< Blocks until remote writes are visible to all CUDA device contexts. */
+} CUflushGPUDirectRDMAWritesScope;
+ 
+/**
+ * The targets for ::cuFlushGPUDirectRDMAWrites
+ */
+typedef enum CUflushGPUDirectRDMAWritesTarget_enum {
+    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX = 0 /**< Sets the target for ::cuFlushGPUDirectRDMAWrites() to the currently active CUDA device context. */
+} CUflushGPUDirectRDMAWritesTarget;
+
+/**
+ * The additional write options for ::cuGraphDebugDotPrint
+ */
+typedef enum CUgraphDebugDot_flags_enum {
+    CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE                        = 1<<0,  /**< Output all debug data as if every debug flag is enabled */
+    CU_GRAPH_DEBUG_DOT_FLAGS_RUNTIME_TYPES                  = 1<<1,  /**< Use CUDA Runtime structures for output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS             = 1<<2,  /**< Adds CUDA_KERNEL_NODE_PARAMS values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS             = 1<<3,  /**< Adds CUDA_MEMCPY3D values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS             = 1<<4,  /**< Adds CUDA_MEMSET_NODE_PARAMS values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS               = 1<<5,  /**< Adds CUDA_HOST_NODE_PARAMS values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS              = 1<<6,  /**< Adds CUevent handle from record and wait nodes to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS   = 1<<7,  /**< Adds CUDA_EXT_SEM_SIGNAL_NODE_PARAMS values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS     = 1<<8,  /**< Adds CUDA_EXT_SEM_WAIT_NODE_PARAMS values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES         = 1<<9,  /**< Adds CUkernelNodeAttrValue values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES                        = 1<<10, /**< Adds node handles and every kernel function handle to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS          = 1<<11, /**< Adds memory alloc node parameters to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS           = 1<<12, /**< Adds memory free node parameters to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_BATCH_MEM_OP_NODE_PARAMS       = 1<<13, /**< Adds batch mem op node parameters to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_EXTRA_TOPO_INFO                = 1<<14, /**< Adds edge numbering information */
+    CU_GRAPH_DEBUG_DOT_FLAGS_CONDITIONAL_NODE_PARAMS        = 1<<15  /**< Adds conditional node parameters to output */
+} CUgraphDebugDot_flags;
+
+/**
+ * Flags for user objects for graphs
+ */
+typedef enum CUuserObject_flags_enum {
+    CU_USER_OBJECT_NO_DESTRUCTOR_SYNC = 1  /**< Indicates the destructor execution is not synchronized by any CUDA handle. */
+} CUuserObject_flags;
+
+/**
+ * Flags for retaining user object references for graphs
+ */
+typedef enum CUuserObjectRetain_flags_enum {
+    CU_GRAPH_USER_OBJECT_MOVE = 1  /**< Transfer references from the caller rather than creating new references. */
+} CUuserObjectRetain_flags;
+
+/**
+ * Flags for instantiating a graph
+ */
+typedef enum CUgraphInstantiate_flags_enum {
+    CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH  = 1 /**< Automatically free memory allocated in a graph before relaunching. */
+  , CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD               = 2 /**< Automatically upload the graph after instantiation. Only supported by
+                                                              ::cuGraphInstantiateWithParams.  The upload will be performed using the
+                                                              stream provided in \p instantiateParams. */
+  , CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH        = 4 /**< Instantiate the graph to be launchable from the device. This flag can only
+                                                              be used on platforms which support unified addressing. This flag cannot be
+                                                              used in conjunction with CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH. */
+  , CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY    = 8 /**< Run the graph using the per-node priority attributes rather than the
+                                                              priority of the stream it is launched into. */
+} CUgraphInstantiate_flags;
+
+/**
+ * CUDA device NUMA configuration
+ */
+typedef enum CUdeviceNumaConfig_enum {
+    CU_DEVICE_NUMA_CONFIG_NONE = 0, /**< The GPU is not a NUMA node */
+    CU_DEVICE_NUMA_CONFIG_NUMA_NODE, /**< The GPU is a NUMA node, CU_DEVICE_ATTRIBUTE_NUMA_ID contains its NUMA ID */
+} CUdeviceNumaConfig;
+
+/**
+ * CUDA Process States
+ */
+typedef enum CUprocessState_enum {
+    CU_PROCESS_STATE_RUNNING = 0,  /**< Default process state */
+    CU_PROCESS_STATE_LOCKED,       /**< CUDA API locks are taken so further CUDA API calls will block */
+    CU_PROCESS_STATE_CHECKPOINTED, /**< Application memory contents have been checkpointed and underlying allocations and device handles have been released */
+    CU_PROCESS_STATE_FAILED,       /**< Application entered an uncorrectable error during the checkpoint/restore process */
+} CUprocessState;
+
+/**
+ * CUDA checkpoint optional lock arguments
+ */
+typedef struct CUcheckpointLockArgs_st {
+    unsigned int timeoutMs; /**< Timeout in milliseconds to attempt to lock the process, 0 indicates no timeout */
+    unsigned int reserved0; /**< Reserved for future use, must be zero */
+    cuuint64_t reserved1[7]; /**< Reserved for future use, must be zeroed */
+} CUcheckpointLockArgs;
+
+/**
+ * CUDA checkpoint optional checkpoint arguments
+ */
+typedef struct CUcheckpointCheckpointArgs_st {
+    cuuint64_t reserved[8]; /**< Reserved for future use, must be zeroed */
+} CUcheckpointCheckpointArgs;
+
+/**
+ * CUDA checkpoint optional restore arguments
+ */
+typedef struct CUcheckpointRestoreArgs_st {
+    cuuint64_t reserved[8]; /**< Reserved for future use, must be zeroed */
+} CUcheckpointRestoreArgs;
+
+/**
+ * CUDA checkpoint optional unlock arguments
+ */
+typedef struct CUcheckpointUnlockArgs_st {
+    cuuint64_t reserved[8]; /**< Reserved for future use, must be zeroed */
+} CUcheckpointUnlockArgs;
+
+/**
+ * Flags to specify for copies within a batch. For more details see ::cuMemcpyBatchAsync.
+ */
+typedef enum CUmemcpyFlags_enum {
+    CU_MEMCPY_FLAG_DEFAULT = 0x0,
+
+    /**
+     * Hint to the driver to try and overlap the copy with compute work on the SMs.
+     */
+    CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE = 0x1
+} CUmemcpyFlags;
+
+/**
+ * These flags allow applications to convey the source access ordering CUDA must maintain.
+ * The destination will always be accessed in stream order.
+ */
+typedef enum CUmemcpySrcAccessOrder_enum {
+    /**
+     * Default (zero) value; does not denote a valid source access order.
+     */
+    CU_MEMCPY_SRC_ACCESS_ORDER_INVALID = 0x0,
+
+    /**
+     * Indicates that access to the source pointer must be in stream order.
+     */
+    CU_MEMCPY_SRC_ACCESS_ORDER_STREAM = 0x1,
+
+    /**
+     * Indicates that access to the source pointer can be out of stream order and
+     * all accesses must be complete before the API call returns. This flag is suited for
+     * ephemeral sources (ex., stack variables) when it's known that no prior operations
+     * in the stream can be accessing the memory and also that the lifetime of the memory
+     * is limited to the scope that the source variable was declared in. Specifying
+     * this flag allows the driver to optimize the copy and removes the need for the user
+     * to synchronize the stream after the API call.
+     */
+    CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL = 0x2,
+
+    /**
+     * Indicates that access to the source pointer can be out of stream order and the accesses
+     * can happen even after the API call returns. This flag is suited for host pointers
+     * allocated outside CUDA (ex., via malloc) when it's known that no prior operations
+     * in the stream can be accessing the memory. Specifying this flag allows the driver
+     * to optimize the copy on certain platforms.
+     */
+    CU_MEMCPY_SRC_ACCESS_ORDER_ANY = 0x3,
+
+    CU_MEMCPY_SRC_ACCESS_ORDER_MAX = 0x7FFFFFFF /**< Largest enumerator value; not an access order. NOTE(review): presumably a sentinel that pins the enum's value range - confirm. */
+}  CUmemcpySrcAccessOrder;
+
+/**
+ * Attributes specific to copies within a batch. For more details on usage see ::cuMemcpyBatchAsync.
+ */
+typedef struct CUmemcpyAttributes_st {
+    CUmemcpySrcAccessOrder srcAccessOrder;  /**< Source access ordering to be observed for copies with this attribute. */
+    CUmemLocation srcLocHint;               /**< Hint location for the source operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. */
+    CUmemLocation dstLocHint;               /**< Hint location for the destination operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. */
+    unsigned int flags;                     /**< Additional flags for copies with this attribute. See ::CUmemcpyFlags */
+} CUmemcpyAttributes_v1;
+typedef CUmemcpyAttributes_v1 CUmemcpyAttributes;
+
+/**
+ * These flags allow applications to convey the operand type for individual copies specified in ::cuMemcpy3DBatchAsync.
+ */
+typedef enum CUmemcpy3DOperandType_enum {
+    CU_MEMCPY_OPERAND_TYPE_POINTER = 0x1,     /**< Memcpy operand is a valid pointer. */
+    CU_MEMCPY_OPERAND_TYPE_ARRAY = 0x2,       /**< Memcpy operand is a CUarray. */
+    CU_MEMCPY_OPERAND_TYPE_MAX = 0x7FFFFFFF   /**< Largest enumerator value; not an operand type. NOTE(review): presumably a sentinel that pins the enum's value range - confirm. */
+} CUmemcpy3DOperandType;
+
+/**
+ * Struct representing offset into a CUarray in elements
+ */
+typedef struct CUoffset3D_st {
+    size_t x; /**< X component of the offset, in elements. */
+    size_t y; /**< Y component of the offset, in elements. */
+    size_t z; /**< Z component of the offset, in elements. */
+} CUoffset3D_v1;
+typedef CUoffset3D_v1 CUoffset3D;
+
+/**
+ * Struct representing width/height/depth of a CUarray in elements
+ */
+typedef struct CUextent3D_st {
+    size_t width;  /**< Width, in elements. */
+    size_t height; /**< Height, in elements. */
+    size_t depth;  /**< Depth, in elements. */
+} CUextent3D_v1;
+typedef CUextent3D_v1 CUextent3D;
+
+/**
+ * Struct representing an operand for copy with ::cuMemcpy3DBatchAsync
+ */
+typedef struct CUmemcpy3DOperand_st {
+    CUmemcpy3DOperandType type; /**< Operand type; selects which member of ::CUmemcpy3DOperand::op applies. */
+    union {
+        /**
+         * Struct representing an operand when ::CUmemcpy3DOperand::type is ::CU_MEMCPY_OPERAND_TYPE_POINTER
+         */
+        struct {
+            CUdeviceptr ptr;         /**< Pointer operand of the copy. */
+            size_t rowLength;        /**< Length of each row in elements. */
+            size_t layerHeight;      /**< Height of each layer in elements. */
+            CUmemLocation locHint;   /**< Hint location for the operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. */
+        } ptr;
+
+        /**
+         * Struct representing an operand when ::CUmemcpy3DOperand::type is ::CU_MEMCPY_OPERAND_TYPE_ARRAY
+         */
+        struct {
+            CUarray array;           /**< CUarray operand of the copy. */
+            CUoffset3D offset;       /**< Offset into the CUarray, in elements. */
+        } array;
+    } op; /**< Operand payload; interpret the member that matches ::CUmemcpy3DOperand::type. */
+} CUmemcpy3DOperand_v1;
+typedef CUmemcpy3DOperand_v1 CUmemcpy3DOperand;
+/** Struct representing a single copy operation (source, destination, extent and ordering) within a batch, built from the operands used with ::cuMemcpy3DBatchAsync. */
+typedef struct CUDA_MEMCPY3D_BATCH_OP_st {
+    CUmemcpy3DOperand src;                    /**< Source memcpy operand. */
+    CUmemcpy3DOperand dst;                    /**< Destination memcpy operand. */
+    CUextent3D extent;                        /**< Extents of the memcpy between src and dst. The width, height and depth components must not be 0.*/
+    CUmemcpySrcAccessOrder srcAccessOrder;    /**< Source access ordering to be observed for copy from src to dst. */
+    unsigned int flags;                       /**< Additional flags for this copy. See ::CUmemcpyFlags */
+} CUDA_MEMCPY3D_BATCH_OP_v1;
+typedef CUDA_MEMCPY3D_BATCH_OP_v1 CUDA_MEMCPY3D_BATCH_OP;
+
+/** @} */ /* END CUDA_TYPES */
+
+#if defined(__GNUC__)
+  #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT)
+    #pragma GCC visibility push(default)
+  #endif
+#endif
+
+#ifdef _WIN32
+#define CUDAAPI __stdcall
+#else
+#define CUDAAPI
+#endif
+
+/**
+ * \defgroup CUDA_ERROR Error Handling
+ *
+ * ___MANBRIEF___ error handling functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the error handling functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Gets the string description of an error code
+ *
+ * Sets \p *pStr to the address of a NULL-terminated string description
+ * of the error code \p error.
+ * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE
+ * will be returned and \p *pStr will be set to the NULL address.
+ *
+ * \param error - Error code to convert to string
+ * \param pStr - Address of the string pointer.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::CUresult,
+ * ::cudaGetErrorString
+ */
+CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr);
+
+/**
+ * \brief Gets the string representation of an error code enum name
+ *
+ * Sets \p *pStr to the address of a NULL-terminated string representation
+ * of the name of the enum error code \p error.
+ * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE
+ * will be returned and \p *pStr will be set to the NULL address.
+ *
+ * \param error - Error code to convert to string
+ * \param pStr - Address of the string pointer.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::CUresult,
+ * ::cudaGetErrorName
+ */
+CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr);
+
+/** @} */ /* END CUDA_ERROR */
+
+/**
+ * \defgroup CUDA_INITIALIZE Initialization
+ *
+ * ___MANBRIEF___ initialization functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the initialization functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Initialize the CUDA driver API
+ *
+ * Initializes the driver API and must be called before any other function from
+ * the driver API in the current process. Currently, the \p Flags parameter must be 0. If ::cuInit()
+ * has not been called, any function from the driver API will return
+ * ::CUDA_ERROR_NOT_INITIALIZED.
+ *
+ * \param Flags - Initialization flag for CUDA.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_SYSTEM_DRIVER_MISMATCH,
+ * ::CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE
+ * \notefnerr
+ */
+CUresult CUDAAPI cuInit(unsigned int Flags);
+
+/** @} */ /* END CUDA_INITIALIZE */
+
+/**
+ * \defgroup CUDA_VERSION Version Management
+ *
+ * ___MANBRIEF___ version management functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the version management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the latest CUDA version supported by driver
+ *
+ * Returns in \p *driverVersion the version of CUDA supported by
+ * the driver.  The version is returned as
+ * (1000 &times; major + 10 &times; minor). For example, CUDA 9.2
+ * would be represented by 9020.
+ *
+ * This function automatically returns ::CUDA_ERROR_INVALID_VALUE if
+ * \p driverVersion is NULL.
+ *
+ * \param driverVersion - Returns the CUDA driver version
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaDriverGetVersion,
+ * ::cudaRuntimeGetVersion
+ */
+CUresult CUDAAPI cuDriverGetVersion(int *driverVersion);
+
+/** @} */ /* END CUDA_VERSION */
+
+/**
+ * \defgroup CUDA_DEVICE Device Management
+ *
+ * ___MANBRIEF___ device management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the device management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns a handle to a compute device
+ *
+ * Returns in \p *device a device handle given an ordinal in the range <b>[0,
+ * ::cuDeviceGetCount()-1]</b>.
+ *
+ * \param device  - Returned device handle
+ * \param ordinal - Device number to get handle for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGetLuid,
+ * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport
+ */
+CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal);
+
+/**
+ * \brief Returns the number of compute-capable devices
+ *
+ * Returns in \p *count the number of devices with compute capability greater
+ * than or equal to 2.0 that are available for execution. If there is no such
+ * device, ::cuDeviceGetCount() returns 0.
+ *
+ * \param count - Returned number of compute-capable devices
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGetLuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport,
+ * ::cudaGetDeviceCount
+ */
+CUresult CUDAAPI cuDeviceGetCount(int *count);
+
+/**
+ * \brief Returns an identifier string for the device
+ *
+ * Returns an ASCII string identifying the device \p dev in the NULL-terminated
+ * string pointed to by \p name. \p len specifies the maximum length of the
+ * string that may be returned.
+ *
+ * \param name - Returned identifier string for the device
+ * \param len  - Maximum length of string to store in \p name
+ * \param dev  - Device to get identifier string for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGetLuid,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport,
+ * ::cudaGetDeviceProperties
+ */
+CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);
+
+/**
+ * \brief Return a UUID for the device
+ *
+ * Note there is a later version of this API, ::cuDeviceGetUuid_v2. It will
+ * supplant this version in 12.0, which is retained for minor version compatibility.
+ *
+ * Returns 16-octets identifying the device \p dev in the structure
+ * pointed by the \p uuid.
+ *
+ * \param uuid - Returned UUID
+ * \param dev  - Device to get identifier string for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetUuid_v2,
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetLuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport,
+ * ::cudaGetDeviceProperties
+ */
+CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev);
+
+/**
+ * \brief Return a UUID for the device (11.4+)
+ *
+ * Returns 16-octets identifying the device \p dev in the structure
+ * pointed by the \p uuid. If the device is in MIG mode, returns its
+ * MIG UUID which uniquely identifies the subscribed MIG compute instance.
+ *
+ * \param uuid - Returned UUID
+ * \param dev  - Device to get identifier string for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetLuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cudaGetDeviceProperties
+ */
+CUresult CUDAAPI cuDeviceGetUuid_v2(CUuuid *uuid, CUdevice dev);
+
+/**
+ * \brief Return an LUID and device node mask for the device
+ *
+ * Return identifying information (\p luid and \p deviceNodeMask) to allow
+ * matching device with graphics APIs.
+ *
+ * \param luid - Returned LUID
+ * \param deviceNodeMask - Returned device node mask
+ * \param dev  - Device to get identifier string for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport,
+ * ::cudaGetDeviceProperties
+ */
+CUresult CUDAAPI cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask, CUdevice dev);
+
+/**
+ * \brief Returns the total amount of memory on the device
+ *
+ * Returns in \p *bytes the total amount of memory available on the device
+ * \p dev in bytes.
+ *
+ * \param bytes - Returned memory available on device in bytes
+ * \param dev   - Device handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceGetExecAffinitySupport,
+ * ::cudaMemGetInfo
+ */
+CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev);
+
+/**
+ * \brief Returns the maximum number of elements allocatable in a 1D linear texture for a given texture element size.
+ *
+ * Returns in \p maxWidthInElements the maximum number of texture elements allocatable in a 1D linear texture
+ * for given \p format and \p numChannels.
+ *
+ * \param maxWidthInElements    - Returned maximum number of texture elements allocatable for given \p format and \p numChannels.
+ * \param format                - Texture format.
+ * \param numChannels           - Number of channels per texture element.
+ * \param dev                   - Device handle.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGet,
+ * ::cudaMemGetInfo,
+ * ::cuDeviceTotalMem
+ */
+CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice dev);
+
+/**
+ * \brief Returns information about the device
+ *
+ * Returns in \p *pi the integer value of the attribute \p attrib on device
+ * \p dev. The supported attributes are:
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per
+ *   block;
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of
+ *   shared memory available to a thread block in bytes
+ * - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for
+ *   __constant__ variables in a CUDA C kernel in bytes
+ * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the
+ *   memory copy functions that involve memory regions allocated through
+ *   ::cuMemAllocPitch()
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D
+ *  texture width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width
+ *  for a 1D texture bound to linear memory
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum
+ *  mipmapped 1D texture width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D
+ *  texture width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D
+ *  texture height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width
+ *  for a 2D texture bound to linear memory
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height
+ *  for a 2D texture bound to linear memory
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch
+ *  in bytes for a 2D texture bound to linear memory
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum
+ *  mipmapped 2D texture width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum
+ *  mipmapped 2D texture height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D
+ *  texture width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D
+ *  texture height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D
+ *  texture depth
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE:
+ *  Alternate maximum 3D texture width, 0 if no alternate
+ *  maximum 3D texture size is supported
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE:
+ *  Alternate maximum 3D texture height, 0 if no alternate
+ *  maximum 3D texture size is supported
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE:
+ *  Alternate maximum 3D texture depth, 0 if no alternate
+ *  maximum 3D texture size is supported
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH:
+ *  Maximum cubemap texture width or height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH:
+ *  Maximum 1D layered texture width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS:
+ *   Maximum layers in a 1D layered texture
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH:
+ *  Maximum 2D layered texture width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT:
+ *   Maximum 2D layered texture height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS:
+ *   Maximum layers in a 2D layered texture
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH:
+ *   Maximum cubemap layered texture width or height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS:
+ *   Maximum layers in a cubemap layered texture
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH:
+ *   Maximum 1D surface width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH:
+ *   Maximum 2D surface width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT:
+ *   Maximum 2D surface height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH:
+ *   Maximum 3D surface width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT:
+ *   Maximum 3D surface height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH:
+ *   Maximum 3D surface depth
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH:
+ *   Maximum 1D layered surface width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS:
+ *   Maximum layers in a 1D layered surface
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH:
+ *   Maximum 2D layered surface width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT:
+ *   Maximum 2D layered surface height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS:
+ *   Maximum layers in a 2D layered surface
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH:
+ *   Maximum cubemap surface width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH:
+ *   Maximum cubemap layered surface width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS:
+ *   Maximum layers in a cubemap layered surface
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit
+ *   registers available to a thread block
+ * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz
+ * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture
+ *   base addresses aligned to ::textureAlign bytes do not need an offset
+ *   applied to texture fetches
+ * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement
+ *   for 2D texture references bound to pitched memory
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy
+ *   memory between host and device while executing a kernel, or 0 if not
+ * - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on
+ *   the device
+ * - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit
+ *   for kernels executed on the device, or 0 if not
+ * - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the
+ *   memory subsystem, or 0 if not
+ * - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host
+ *   memory into the CUDA address space, or 0 if not
+ * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently
+ *   in. Available modes are as follows:
+ *   - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and
+ *     can have multiple CUDA contexts present at a single time.
+ *   - ::CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is
+ *     prohibited from creating new CUDA contexts.
+ *   - ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS:  Compute-exclusive-process mode - Device
+ *     can have only one context used by a single process at a time.
+ * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: 1 if the device supports
+ *   executing multiple kernels within the same context simultaneously, or 0 if
+ *   not. It is not guaranteed that multiple kernels will be resident
+ *   on the device concurrently so this feature should not be relied upon for
+ *   correctness.
+ * - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the
+ *    device, 0 if error correction is disabled or not supported by the device
+ * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device
+ * - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier
+ *   of the device
+ * - ::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: PCI domain identifier of the device
+ * - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC
+ *    is only available on Tesla hardware running Windows Vista or later
+ * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz
+ * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits
+ * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor
+ * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with
+ *   the host, or 0 if not
+ * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number
+ * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number
+ * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals
+ *    in L1 cache, 0 if caching globals in L1 cache is not supported by the device
+ * - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals
+ *    in L1 cache, 0 if caching locals in L1 cache is not supported by the device
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount of
+ *   shared memory available to a multiprocessor in bytes; this amount is shared
+ *   by all thread blocks simultaneously resident on a multiprocessor
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: Maximum number of 32-bit
+ *   registers available to a multiprocessor; this number is shared by all thread
+ *   blocks simultaneously resident on a multiprocessor
+ * - ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: 1 if device supports allocating managed memory
+ *   on this system, 0 if allocating managed memory is not supported by the device on this system.
+ * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: 1 if device is on a multi-GPU board, 0 if not.
+ * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: Unique identifier for a group of devices
+ *   associated with the same board. Devices on the same multi-GPU board will share the same identifier.
+ * - ::CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED: 1 if Link between the device and the host
+ *   supports native atomic operations.
+ * - ::CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: Ratio of single precision performance
+ *   (in floating-point operations per second) to double precision performance.
+ * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: Device supports coherently accessing
+ *   pageable memory without calling cudaHostRegister on it.
+ * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: Device can coherently access managed memory
+ *   concurrently with the CPU.
+ * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED: Device supports Compute Preemption.
+ * - ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: Device can access host registered
+ *   memory at the same virtual address as the CPU.
+ * -  ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: The maximum per block shared memory size
+ *    supported on this device. This is the maximum value that can be opted into when using the cuFuncSetAttribute() or cuKernelSetAttribute() call.
+ *    For more details see ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES
+ * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES: Device accesses pageable memory via the host's
+ *   page tables.
+ * - ::CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: The host can directly access managed memory on the device without migration.
+ * - ::CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED:  Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs
+ * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
+ * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED:  Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
+ * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: Maximum number of thread blocks that can reside on a multiprocessor
+ * - ::CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED: Device supports compressible memory allocation via ::cuMemCreate
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE: Maximum L2 persisting lines capacity setting in bytes
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE: Maximum value of CUaccessPolicyWindow::num_bytes 
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED: Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate.
+ * - ::CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK: Amount of shared memory per block reserved by CUDA driver in bytes
+ * - ::CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED: Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays. 
+ * - ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED: Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTREGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU
+ * - ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED: Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED: Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information)
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS: The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING: GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here.
+ * - ::CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES: Bitmask of handle types supported with mempool based IPC
+ * - ::CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED: Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays.
+ * - ::CU_DEVICE_ATTRIBUTE_NUMA_CONFIG: NUMA configuration of a device: value is of type ::CUdeviceNumaConfig enum
+ * - ::CU_DEVICE_ATTRIBUTE_NUMA_ID: NUMA node ID of the GPU memory
+ * - ::CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED: Device supports switch multicast and reduction operations.
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID: The combined 16-bit PCI device ID and 16-bit PCI vendor ID.
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_PCI_SUBSYSTEM_ID: The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem vendor ID.
+ *
+ * \param pi     - Returned device attribute value
+ * \param attrib - Device attribute to query
+ * \param dev    - Device handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport,
+ * ::cudaDeviceGetAttribute,
+ * ::cudaGetDeviceProperties
+ */
+CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
+
+/**
+ * \brief Return NvSciSync attributes that this device can support.
+ *
+ * Returns in \p nvSciSyncAttrList, the properties of NvSciSync that
+ * this CUDA device, \p dev can support. The returned \p nvSciSyncAttrList
+ * can be used to create an NvSciSync object that matches this device's capabilities.
+ * 
+ * If NvSciSyncAttrKey_RequiredPerm field in \p nvSciSyncAttrList is
+ * already set this API will return ::CUDA_ERROR_INVALID_VALUE.
+ * 
+ * The applications should set \p nvSciSyncAttrList to a valid 
+ * NvSciSyncAttrList failing which this API will return
+ * ::CUDA_ERROR_INVALID_HANDLE.
+ * 
+ * The \p flags control how the application intends to use
+ * the NvSciSync created from the \p nvSciSyncAttrList. The valid flags are:
+ * - ::CUDA_NVSCISYNC_ATTR_SIGNAL, specifies that the application intends to
+ * signal an NvSciSync on this CUDA device.
+ * - ::CUDA_NVSCISYNC_ATTR_WAIT, specifies that the application intends to
+ * wait on an NvSciSync on this CUDA device.
+ *
+ * At least one of these flags must be set, failing which the API
+ * returns ::CUDA_ERROR_INVALID_VALUE. The two flags are orthogonal
+ * to one another: a developer may set both flags, which allows both
+ * wait- and signal-specific attributes to be set in the same \p nvSciSyncAttrList.
+ *
+ * Note that this API updates the input \p nvSciSyncAttrList with values equivalent
+ * to the following public attribute key-values:
+ * NvSciSyncAttrKey_RequiredPerm is set to
+ * - NvSciSyncAccessPerm_SignalOnly if ::CUDA_NVSCISYNC_ATTR_SIGNAL is set in \p flags.
+ * - NvSciSyncAccessPerm_WaitOnly if ::CUDA_NVSCISYNC_ATTR_WAIT is set in \p flags.
+ * - NvSciSyncAccessPerm_WaitSignal if both ::CUDA_NVSCISYNC_ATTR_WAIT and
+ * ::CUDA_NVSCISYNC_ATTR_SIGNAL are set in \p flags.
+ * NvSciSyncAttrKey_PrimitiveInfo is set to
+ * - NvSciSyncAttrValPrimitiveType_SysmemSemaphore on any valid \p dev.
+ * - NvSciSyncAttrValPrimitiveType_Syncpoint if \p dev is a Tegra device.
+ * - NvSciSyncAttrValPrimitiveType_SysmemSemaphorePayload64b if \p dev is GA10X+.
+ * NvSciSyncAttrKey_GpuId is set to the same UUID that is returned for this
+ * \p dev from ::cuDeviceGetUuid.
+ *
+ * \param nvSciSyncAttrList     - Return NvSciSync attributes supported.
+ * \param dev                   - Valid Cuda Device to get NvSciSync attributes for.
+ * \param flags                 - flags describing NvSciSync usage.
+ *
+ * \return
+ *
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa
+ * ::cuImportExternalSemaphore,
+ * ::cuDestroyExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, CUdevice dev, int flags);
+
+/**
+ * \brief Sets the current memory pool of a device
+ *
+ * The memory pool must be local to the specified device.
+ * ::cuMemAllocAsync allocates from the current mempool of the provided stream's device.
+ * By default, a device's current memory pool is its default memory pool.
+ *
+ * \note Use ::cuMemAllocFromPoolAsync to specify asynchronous allocations from a device different
+ * than the one the stream runs on.
+ *
+ * \param dev  - Device whose current memory pool is set
+ * \param pool - Memory pool to make current for \p dev; must be local to that device
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolDestroy, ::cuMemAllocFromPoolAsync
+ */
+CUresult CUDAAPI cuDeviceSetMemPool(CUdevice dev, CUmemoryPool pool);
+
+/**
+ * \brief Gets the current mempool for a device
+ *
+ * Returns the last pool provided to ::cuDeviceSetMemPool for this device
+ * or the device's default memory pool if ::cuDeviceSetMemPool has never been called.
+ * By default the current mempool is the default mempool for a device.
+ * Otherwise the returned pool must have been set with ::cuDeviceSetMemPool.
+ *
+ * \param pool - Returned current memory pool of \p dev
+ * \param dev  - Device whose current memory pool is queried
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate, ::cuDeviceSetMemPool
+ */
+CUresult CUDAAPI cuDeviceGetMemPool(CUmemoryPool *pool, CUdevice dev);
+
+/**
+ * \brief Returns the default mempool of a device
+ *
+ * The default mempool of a device contains device memory from that device.
+ *
+ * \param pool_out - Returned default memory pool of \p dev
+ * \param dev      - Device whose default memory pool is queried
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemPoolTrimTo, ::cuMemPoolGetAttribute, ::cuMemPoolSetAttribute, ::cuMemPoolSetAccess, ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuDeviceGetDefaultMemPool(CUmemoryPool *pool_out, CUdevice dev);
+
+/**
+ * \brief Returns information about the execution affinity support of the device.
+ *
+ * Returns in \p *pi whether execution affinity type \p type is supported by device \p dev.
+ * The supported types are:
+ * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: 1 if context with limited SMs is supported by the device,
+ *   or 0 if not;
+ *
+ * \param pi   - 1 if the execution affinity type \p type is supported by the device, or 0 if not
+ * \param type - Execution affinity type to query
+ * \param dev  - Device handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem
+ */
+CUresult CUDAAPI cuDeviceGetExecAffinitySupport(int *pi, CUexecAffinityType type, CUdevice dev);
+
+/**
+ * \brief Blocks until remote writes are visible to the specified scope
+ *
+ * Blocks until GPUDirect RDMA writes to the target context via mappings
+ * created through APIs like nvidia_p2p_get_pages (see
+ * https://docs.nvidia.com/cuda/gpudirect-rdma for more information), are
+ * visible to the specified scope.
+ *
+ * If the scope equals or lies within the scope indicated by
+ * ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING, the call
+ * will be a no-op and can be safely omitted for performance. This can be
+ * determined by comparing the numerical values between the two enums, with
+ * smaller scopes having smaller values.
+ *
+ * On platforms that support GPUDirect RDMA writes via more than one path in
+ * hardware (see ::CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE), the user should
+ * consider those paths as belonging to separate ordering domains. Note that in
+ * such cases CUDA driver will report both RDMA writes ordering and RDMA write
+ * scope as ALL_DEVICES and a call to cuFlushGPUDirectRDMA will be a no-op,
+ * but when these multiple paths are used simultaneously, it is the user's
+ * responsibility to ensure ordering by using mechanisms outside the scope of
+ * CUDA.
+ *
+ * Users may query support for this API via
+ * ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS.
+ *
+ * \param target - The target of the operation, see ::CUflushGPUDirectRDMAWritesTarget
+ * \param scope  - The scope of the operation, see ::CUflushGPUDirectRDMAWritesScope
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ */
+CUresult CUDAAPI cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope);
+
+/** @} */ /* END CUDA_DEVICE */
+
+/**
+ * \defgroup CUDA_DEVICE_DEPRECATED Device Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated device management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the device management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns properties for a selected device
+ *
+ * \deprecated
+ *
+ * This function was deprecated as of CUDA 5.0 and replaced by ::cuDeviceGetAttribute().
+ *
+ * Returns in \p *prop the properties of device \p dev. The ::CUdevprop
+ * structure is defined as:
+ *
+ * \code
+     typedef struct CUdevprop_st {
+     int maxThreadsPerBlock;
+     int maxThreadsDim[3];
+     int maxGridSize[3];
+     int sharedMemPerBlock;
+     int totalConstantMemory;
+     int SIMDWidth;
+     int memPitch;
+     int regsPerBlock;
+     int clockRate;
+     int textureAlign;
+  } CUdevprop;
+ * \endcode
+ * where:
+ *
+ * - ::maxThreadsPerBlock is the maximum number of threads per block;
+ * - ::maxThreadsDim[3] is the maximum sizes of each dimension of a block;
+ * - ::maxGridSize[3] is the maximum sizes of each dimension of a grid;
+ * - ::sharedMemPerBlock is the total amount of shared memory available per
+ *   block in bytes;
+ * - ::totalConstantMemory is the total amount of constant memory available on
+ *   the device in bytes;
+ * - ::SIMDWidth is the warp size;
+ * - ::memPitch is the maximum pitch allowed by the memory copy functions that
+ *   involve memory regions allocated through ::cuMemAllocPitch();
+ * - ::regsPerBlock is the total number of registers available per block;
+ * - ::clockRate is the clock frequency in kilohertz;
+ * - ::textureAlign is the alignment requirement; texture base addresses that
+ *   are aligned to ::textureAlign bytes do not need an offset applied to
+ *   texture fetches.
+ *
+ * \param prop - Returned properties of device
+ * \param dev  - Device to get properties for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev);
+
+/**
+ * \brief Returns the compute capability of the device
+ *
+ * \deprecated
+ *
+ * This function was deprecated as of CUDA 5.0 and its functionality superseded
+ * by ::cuDeviceGetAttribute().
+ *
+ * Returns in \p *major and \p *minor the major and minor revision numbers that
+ * define the compute capability of the device \p dev.
+ *
+ * \param major - Major revision number
+ * \param minor - Minor revision number
+ * \param dev   - Device handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
+
+/** @} */ /* END CUDA_DEVICE_DEPRECATED */
+
+/**
+ * \defgroup CUDA_PRIMARY_CTX Primary Context Management
+ *
+ * ___MANBRIEF___ primary context management functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the primary context management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * The primary context is unique per device and shared with the CUDA runtime API.
+ * These functions allow integration with other libraries using CUDA.
+ *
+ * @{
+ */
+
+/**
+ * \brief Retain the primary context on the GPU
+ *
+ * Retains the primary context on the device.
+ * Once the user successfully retains the primary context, the primary context
+ * will be active and available to the user until the user releases it
+ * with ::cuDevicePrimaryCtxRelease() or resets it with ::cuDevicePrimaryCtxReset().
+ * Unlike ::cuCtxCreate() the newly retained context is not pushed onto the stack.
+ *
+ * Retaining the primary context for the first time will fail with ::CUDA_ERROR_UNKNOWN
+ * if the compute mode of the device is ::CU_COMPUTEMODE_PROHIBITED. The function
+ * ::cuDeviceGetAttribute() can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to
+ * determine the compute mode  of the device.
+ * The <i>nvidia-smi</i> tool can be used to set the compute mode for
+ * devices. Documentation for <i>nvidia-smi</i> can be obtained by passing a
+ * -h option to it.
+ *
+ * Please note that the primary context always supports pinned allocations. Other
+ * flags can be specified by ::cuDevicePrimaryCtxSetFlags().
+ *
+ * \param pctx  - Returned context handle of the new context
+ * \param dev   - Device for which primary context is requested
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuDevicePrimaryCtxRelease,
+ * ::cuDevicePrimaryCtxSetFlags,
+ * ::cuCtxCreate,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev);
+
+/**
+ * \brief Release the primary context on the GPU
+ *
+ * Releases the primary context interop on the device.
+ * A retained context should always be released once the user is done using
+ * it. The context is automatically reset once the last reference to it is
+ * released. This behavior is different when the primary context was retained
+ * by the CUDA runtime from CUDA 4.0 and earlier. In this case, the primary
+ * context remains always active.
+ *
+ * Releasing a primary context that has not been previously retained will
+ * fail with ::CUDA_ERROR_INVALID_CONTEXT.
+ *
+ * Please note that unlike ::cuCtxDestroy() this method does not pop the context
+ * from stack in any circumstances.
+ *
+ * \param dev - Device which primary context is released
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuDevicePrimaryCtxRetain,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev);
+
+/**
+ * \brief Set flags for the primary context
+ *
+ * Sets the flags for the primary context on the device overwriting previously
+ * set ones.
+ *
+ * The three LSBs of the \p flags parameter can be used to control how the OS
+ * thread, which owns the CUDA context at the time of an API call, interacts
+ * with the OS scheduler when waiting for results from the GPU. Only one of
+ * the scheduling flags can be set when creating a context.
+ *
+ * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
+ * results from the GPU. This can decrease latency when waiting for the GPU,
+ * but may lower the performance of CPU threads if they are performing work in
+ * parallel with the CUDA thread.
+ *
+ * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
+ * results from the GPU. This can increase latency when waiting for the GPU,
+ * but can increase the performance of CPU threads performing work in parallel
+ * with the GPU.
+ *
+ * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work.
+ *
+ * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work. <br>
+ * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
+ * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
+ *
+ * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
+ * uses a heuristic based on the number of active CUDA contexts in the
+ * process \e C and the number of logical processors in the system \e P. If
+ * \e C > \e P, then CUDA will yield to other OS threads when waiting for
+ * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
+ * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
+ * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
+ * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
+ * for low-powered devices.
+ *
+ * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage. <br>
+ * <b>Deprecated:</b> This flag is deprecated and the behavior enabled
+ * by this flag is now the default and cannot be disabled.
+ *
+ * - ::CU_CTX_COREDUMP_ENABLE: If GPU coredumps have not been enabled globally
+ * with ::cuCoredumpSetAttributeGlobal or environment variables, this flag can
+ * be set during context creation to instruct CUDA to create a coredump if
+ * this context raises an exception during execution. These environment variables
+ * are described in the CUDA-GDB user guide under the "GPU core dump support"
+ * section.
+ * The initial settings will be taken from the global settings at the time of
+ * context creation. The other settings that control coredump output can be 
+ * modified by calling ::cuCoredumpSetAttribute from the created context after
+ * it becomes current.
+ *
+ * - ::CU_CTX_USER_COREDUMP_ENABLE: If user-triggered GPU coredumps have not
+ * been enabled globally with ::cuCoredumpSetAttributeGlobal or environment 
+ * variables, this flag can be set during context creation to instruct CUDA to
+ * create a coredump if data is written to a certain pipe that is present in the
+ * OS space. These environment variables are described in the CUDA-GDB user
+ * guide under the "GPU core dump support" section.
+ * It is important to note that the pipe name *must* be set with
+ * ::cuCoredumpSetAttributeGlobal before creating the context if this flag is
+ * used. Setting this flag implies that ::CU_CTX_COREDUMP_ENABLE is set.
+ * The initial settings will be taken from the global settings at the time of
+ * context creation. The other settings that control coredump output can be 
+ * modified by calling ::cuCoredumpSetAttribute from the created context after 
+ * it becomes current.
+ *
+ * - ::CU_CTX_SYNC_MEMOPS: Ensures that synchronous memory operations initiated
+ * on this context will always synchronize. See further documentation in the
+ * section titled "API Synchronization behavior" to learn more about cases when
+ * synchronous memory operations can exhibit asynchronous behavior.
+ *
+ * \param dev   - Device for which the primary context flags are set
+ * \param flags - New flags for the device
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuDevicePrimaryCtxRetain,
+ * ::cuDevicePrimaryCtxGetState,
+ * ::cuCtxCreate,
+ * ::cuCtxGetFlags,
+ * ::cuCtxSetFlags,
+ * ::cudaSetDeviceFlags
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags);
+
+/**
+ * \brief Get the state of the primary context
+ *
+ * Returns in \p *flags the flags for the primary context of \p dev, and in
+ * \p *active whether it is active.  See ::cuDevicePrimaryCtxSetFlags for flag
+ * values.
+ *
+ * \param dev    - Device to get primary context flags for
+ * \param flags  - Pointer to store flags
+ * \param active - Pointer to store context state; 0 = inactive, 1 = active
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDevicePrimaryCtxSetFlags,
+ * ::cuCtxGetFlags,
+ * ::cuCtxSetFlags,
+ * ::cudaGetDeviceFlags
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, int *active);
+
+/**
+ * \brief Destroy all allocations and reset all state on the primary context
+ *
+ * Explicitly destroys and cleans up all resources associated with the current
+ * device in the current process.
+ *
+ * Note that it is the responsibility of the calling function to ensure that no
+ * other module in the process is using the device any more. For that reason
+ * it is recommended to use ::cuDevicePrimaryCtxRelease() in most cases.
+ * However it is safe for other modules to call ::cuDevicePrimaryCtxRelease()
+ * even after resetting the device.
+ * Resetting the primary context does not release it, an application that has
+ * retained the primary context should explicitly release its usage.
+ *
+ * \param dev - Device for which primary context is destroyed
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE
+ * \notefnerr
+ *
+ * \sa ::cuDevicePrimaryCtxRetain,
+ * ::cuDevicePrimaryCtxRelease,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cudaDeviceReset
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev);
+
+/** @} */ /* END CUDA_PRIMARY_CTX */
+
+/**
+ * \defgroup CUDA_CTX Context Management
+ *
+ * ___MANBRIEF___ context management functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the context management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * Please note that some functions are described in
+ * \ref CUDA_PRIMARY_CTX "Primary Context Management" section.
+ *
+ * @{
+ */
+
+/**
+ * \brief Create a CUDA context
+ *
+ * \note In most cases it is recommended to use ::cuDevicePrimaryCtxRetain.
+ *
+ * Creates a new CUDA context and associates it with the calling thread. The
+ * \p flags parameter is described below. The context is created with a usage
+ * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy()
+ * when done using the context. If a context is already current to the thread,
+ * it is supplanted by the newly created context and may be restored by a subsequent
+ * call to ::cuCtxPopCurrent().
+ *
+ * The three LSBs of the \p flags parameter can be used to control how the OS
+ * thread, which owns the CUDA context at the time of an API call, interacts
+ * with the OS scheduler when waiting for results from the GPU. Only one of
+ * the scheduling flags can be set when creating a context.
+ *
+ * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
+ * results from the GPU. This can decrease latency when waiting for the GPU,
+ * but may lower the performance of CPU threads if they are performing work in
+ * parallel with the CUDA thread.
+ *
+ * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
+ * results from the GPU. This can increase latency when waiting for the GPU,
+ * but can increase the performance of CPU threads performing work in parallel
+ * with the GPU.
+ *
+ * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work.
+ *
+ * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work. <br>
+ * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
+ * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
+ *
+ * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
+ * uses a heuristic based on the number of active CUDA contexts in the
+ * process \e C and the number of logical processors in the system \e P. If
+ * \e C > \e P, then CUDA will yield to other OS threads when waiting for
+ * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
+ * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
+ * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
+ * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
+ * for low-powered devices.
+ *
+ * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
+ * This flag must be set in order to allocate pinned host memory that is
+ * accessible to the GPU.
+ *
+ * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage. <br>
+ * <b>Deprecated:</b> This flag is deprecated and the behavior enabled
+ * by this flag is now the default and cannot be disabled.
+ * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit().
+ *
+ * - ::CU_CTX_COREDUMP_ENABLE: If GPU coredumps have not been enabled globally
+ * with ::cuCoredumpSetAttributeGlobal or environment variables, this flag can
+ * be set during context creation to instruct CUDA to create a coredump if
+ * this context raises an exception during execution. These environment variables
+ * are described in the CUDA-GDB user guide under the "GPU core dump support"
+ * section.
+ * The initial attributes will be taken from the global attributes at the time of
+ * context creation. The other attributes that control coredump output can be 
+ * modified by calling ::cuCoredumpSetAttribute from the created context after
+ * it becomes current.
+ *
+ * - ::CU_CTX_USER_COREDUMP_ENABLE: If user-triggered GPU coredumps have not
+ * been enabled globally with ::cuCoredumpSetAttributeGlobal or environment 
+ * variables, this flag can be set during context creation to instruct CUDA to
+ * create a coredump if data is written to a certain pipe that is present in the
+ * OS space. These environment variables are described in the CUDA-GDB user
+ * guide under the "GPU core dump support" section.
+ * It is important to note that the pipe name *must* be set with
+ * ::cuCoredumpSetAttributeGlobal before creating the context if this flag is
+ * used. Setting this flag implies that ::CU_CTX_COREDUMP_ENABLE is set.
+ * The initial attributes will be taken from the global attributes at the time of
+ * context creation. The other attributes that control coredump output can be 
+ * modified by calling ::cuCoredumpSetAttribute from the created context after 
+ * it becomes current.
+ * Setting this flag on any context creation is equivalent to setting the 
+ * ::CU_COREDUMP_ENABLE_USER_TRIGGER attribute to \p true globally.
+ *
+ * - ::CU_CTX_SYNC_MEMOPS: Ensures that synchronous memory operations initiated
+ * on this context will always synchronize. See further documentation in the
+ * section titled "API Synchronization behavior" to learn more about cases when
+ * synchronous memory operations can exhibit asynchronous behavior.
+ *
+ * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
+ * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute()
+ * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the
+ * compute mode of the device. The <i>nvidia-smi</i> tool can be used to set
+ * the compute mode for devices.
+ * Documentation for <i>nvidia-smi</i> can be obtained by passing a
+ * -h option to it.
+ *
+ * \param pctx  - Returned context handle of the new context
+ * \param flags - Context creation flags
+ * \param dev   - Device to create context on
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCoredumpSetAttributeGlobal,
+ * ::cuCoredumpSetAttribute,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
+
+/**
+ * \brief Create a CUDA context with execution affinity
+ *
+ * Creates a new CUDA context with execution affinity and associates it with
+ * the calling thread. The \p paramsArray and \p flags parameter are described below.
+ * The context is created with a usage count of 1 and the caller of ::cuCtxCreate() must
+ * call ::cuCtxDestroy() when done using the context. If a context is already
+ * current to the thread, it is supplanted by the newly created context and may
+ * be restored by a subsequent call to ::cuCtxPopCurrent().
+ *
+ * The type and the amount of execution resource the context can use is limited by \p paramsArray
+ * and \p numParams. The \p paramsArray is an array of \p CUexecAffinityParam and the \p numParams
+ * describes the size of the array. If two \p CUexecAffinityParam in the array have the same type,
+ * the latter execution affinity parameter overrides the former execution affinity parameter.
+ * The supported execution affinity types are:
+ * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT limits the portion of SMs that the context can use. The portion
+ *   of SMs is specified as the number of SMs via \p CUexecAffinitySmCount. This limit will be internally
+ *   rounded up to the next hardware-supported amount. Hence, it is imperative to query the actual execution
+ *   affinity of the context via \p cuCtxGetExecAffinity after context creation. Currently, this attribute
+ *   is only supported under Volta+ MPS.
+ *
+ * The three LSBs of the \p flags parameter can be used to control how the OS
+ * thread, which owns the CUDA context at the time of an API call, interacts
+ * with the OS scheduler when waiting for results from the GPU. Only one of
+ * the scheduling flags can be set when creating a context.
+ *
+ * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
+ * results from the GPU. This can decrease latency when waiting for the GPU,
+ * but may lower the performance of CPU threads if they are performing work in
+ * parallel with the CUDA thread.
+ *
+ * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
+ * results from the GPU. This can increase latency when waiting for the GPU,
+ * but can increase the performance of CPU threads performing work in parallel
+ * with the GPU.
+ *
+ * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work.
+ *
+ * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work. <br>
+ * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
+ * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
+ *
+ * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
+ * uses a heuristic based on the number of active CUDA contexts in the
+ * process \e C and the number of logical processors in the system \e P. If
+ * \e C > \e P, then CUDA will yield to other OS threads when waiting for
+ * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
+ * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
+ * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
+ * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
+ * for low-powered devices.
+ *
+ * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
+ * This flag must be set in order to allocate pinned host memory that is
+ * accessible to the GPU.
+ *
+ * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage. <br>
+ * <b>Deprecated:</b> This flag is deprecated and the behavior enabled
+ * by this flag is now the default and cannot be disabled.
+ * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit().
+ *
+ * - ::CU_CTX_COREDUMP_ENABLE: If GPU coredumps have not been enabled globally
+ * with ::cuCoredumpSetAttributeGlobal or environment variables, this flag can
+ * be set during context creation to instruct CUDA to create a coredump if
+ * this context raises an exception during execution. These environment variables
+ * are described in the CUDA-GDB user guide under the "GPU core dump support"
+ * section.
+ * The initial attributes will be taken from the global attributes at the time of
+ * context creation. The other attributes that control coredump output can be 
+ * modified by calling ::cuCoredumpSetAttribute from the created context after
+ * it becomes current.
+ *
+ * - ::CU_CTX_USER_COREDUMP_ENABLE: If user-triggered GPU coredumps have not
+ * been enabled globally with ::cuCoredumpSetAttributeGlobal or environment 
+ * variables, this flag can be set during context creation to instruct CUDA to
+ * create a coredump if data is written to a certain pipe that is present in the
+ * OS space. These environment variables are described in the CUDA-GDB user
+ * guide under the "GPU core dump support" section.
+ * It is important to note that the pipe name *must* be set with
+ * ::cuCoredumpSetAttributeGlobal before creating the context if this flag is
+ * used. Setting this flag implies that ::CU_CTX_COREDUMP_ENABLE is set.
+ * The initial attributes will be taken from the global attributes at the time of
+ * context creation. The other attributes that control coredump output can be 
+ * modified by calling ::cuCoredumpSetAttribute from the created context after 
+ * it becomes current.
+ * Setting this flag on any context creation is equivalent to setting the 
+ * ::CU_COREDUMP_ENABLE_USER_TRIGGER attribute to \p true globally.
+ *
+ * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
+ * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute()
+ * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the
+ * compute mode of the device. The <i>nvidia-smi</i> tool can be used to set
+ * the compute mode for devices.
+ * Documentation for <i>nvidia-smi</i> can be obtained by passing a
+ * -h option to it.
+ *
+ * \param pctx        - Returned context handle of the new context
+ * \param paramsArray - Execution affinity parameters
+ * \param numParams   - Number of execution affinity parameters
+ * \param flags       - Context creation flags
+ * \param dev         - Device to create context on
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cuCoredumpSetAttributeGlobal,
+ * ::cuCoredumpSetAttribute,
+ * ::CUexecAffinityParam
+ */
+CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice dev);
+
+/**
+ * \brief Create a CUDA context
+ *
+ * Creates a new CUDA context and associates it with the calling thread. The
+ * \p flags parameter is described below. The context is created with a usage
+ * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy()
+ * when done using the context. If a context is already current to the thread,
+ * it is supplanted by the newly created context and may be restored by a subsequent
+ * call to ::cuCtxPopCurrent().
+ * 
+ * A CUDA context can be created with execution affinity. The type and the amount of
+ * execution resource the context can use are limited by \p paramsArray and \p numExecAffinityParams
+ * in \p execAffinity. The \p paramsArray is an array of \p CUexecAffinityParam and the \p numExecAffinityParams
+ * describes the size of the paramsArray. If two \p CUexecAffinityParam in the array have the same type,
+ * the latter execution affinity parameter overrides the former execution affinity parameter.
+ * The supported execution affinity types are:
+ * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT limits the portion of SMs that the context can use. The portion
+ *   of SMs is specified as the number of SMs via \p CUexecAffinitySmCount. This limit will be internally
+ *   rounded up to the next hardware-supported amount. Hence, it is imperative to query the actual execution
+ *   affinity of the context via \p cuCtxGetExecAffinity after context creation. Currently, this attribute
+ *   is only supported under Volta+ MPS.
+ * 
+ * A CUDA context can be created in CIG (CUDA in Graphics) mode by setting \p cigParams.
+ * Data from the graphics client is shared with CUDA via the \p sharedData in \p cigParams.
+ * Support for the D3D12 graphics client can be determined using ::cuDeviceGetAttribute() with
+ * ::CU_DEVICE_ATTRIBUTE_D3D12_CIG_SUPPORTED. \p sharedData is an ID3D12CommandQueue handle.
+ * Only one of \p execAffinityParams and \p cigParams may be set to a non-null value. Setting both to a
+ * non-null value results in undefined behavior.
+ * 
+ * The three LSBs of the \p flags parameter can be used to control how the OS
+ * thread, which owns the CUDA context at the time of an API call, interacts
+ * with the OS scheduler when waiting for results from the GPU. Only one of
+ * the scheduling flags can be set when creating a context.
+ *
+ * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
+ * results from the GPU. This can decrease latency when waiting for the GPU,
+ * but may lower the performance of CPU threads if they are performing work in
+ * parallel with the CUDA thread.
+ *
+ * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
+ * results from the GPU. This can increase latency when waiting for the GPU,
+ * but can increase the performance of CPU threads performing work in parallel
+ * with the GPU.
+ *
+ * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work.
+ *
+ * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work. <br>
+ * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
+ * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
+ *
+ * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
+ * uses a heuristic based on the number of active CUDA contexts in the
+ * process \e C and the number of logical processors in the system \e P. If
+ * \e C > \e P, then CUDA will yield to other OS threads when waiting for
+ * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
+ * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
+ * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
+ * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
+ * for low-powered devices.
+ *
+ * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
+ * This flag must be set in order to allocate pinned host memory that is
+ * accessible to the GPU.
+ *
+ * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage. <br>
+ * <b>Deprecated:</b> This flag is deprecated and the behavior enabled
+ * by this flag is now the default and cannot be disabled.
+ * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit().
+ *
+ * - ::CU_CTX_COREDUMP_ENABLE: If GPU coredumps have not been enabled globally
+ * with ::cuCoredumpSetAttributeGlobal or environment variables, this flag can
+ * be set during context creation to instruct CUDA to create a coredump if
+ * this context raises an exception during execution. These environment variables
+ * are described in the CUDA-GDB user guide under the "GPU core dump support"
+ * section.
+ * The initial attributes will be taken from the global attributes at the time of
+ * context creation. The other attributes that control coredump output can be
+ * modified by calling ::cuCoredumpSetAttribute from the created context after
+ * it becomes current. This flag is not supported when CUDA context is created in
+ * CIG(CUDA in Graphics) mode.
+ *
+ * - ::CU_CTX_USER_COREDUMP_ENABLE: If user-triggered GPU coredumps have not
+ * been enabled globally with ::cuCoredumpSetAttributeGlobal or environment
+ * variables, this flag can be set during context creation to instruct CUDA to
+ * create a coredump if data is written to a certain pipe that is present in the
+ * OS space. These environment variables are described in the CUDA-GDB user
+ * guide under the "GPU core dump support" section.
+ * It is important to note that the pipe name *must* be set with
+ * ::cuCoredumpSetAttributeGlobal before creating the context if this flag is
+ * used. Setting this flag implies that ::CU_CTX_COREDUMP_ENABLE is set.
+ * The initial attributes will be taken from the global attributes at the time of
+ * context creation. The other attributes that control coredump output can be
+ * modified by calling ::cuCoredumpSetAttribute from the created context after
+ * it becomes current.
+ * Setting this flag on any context creation is equivalent to setting the
+ * ::CU_COREDUMP_ENABLE_USER_TRIGGER attribute to \p true globally.
+ * This flag is not supported when CUDA context is created in
+ * CIG(CUDA in Graphics) mode.
+ *
+ * - ::CU_CTX_SYNC_MEMOPS: Ensures that synchronous memory operations initiated
+ * on this context will always synchronize. See further documentation in the
+ * section titled "API Synchronization behavior" to learn more about cases when
+ * synchronous memory operations can exhibit asynchronous behavior.
+ *
+ * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
+ * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute()
+ * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the
+ * compute mode of the device. The <i>nvidia-smi</i> tool can be used to set
+ * the compute mode for devices.
+ * Documentation for <i>nvidia-smi</i> can be obtained by passing a
+ * -h option to it.
+ *
+ * Context creation will fail with ::CUDA_ERROR_INVALID_VALUE if an invalid parameter was
+ * passed by the client to create the CUDA context.
+ *
+ * Context creation in CIG mode will fail with ::CUDA_ERROR_NOT_SUPPORTED if CIG is not supported
+ * by the device or the driver.
+ *
+ * \param pctx              - Returned context handle of the new context
+ * \param ctxCreateParams   - Context creation parameters
+ * \param flags             - Context creation flags
+ * \param dev               - Device to create context on
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCoredumpSetAttributeGlobal,
+ * ::cuCoredumpSetAttribute,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuCtxCreate_v4(CUcontext *pctx, CUctxCreateParams *ctxCreateParams, unsigned int flags, CUdevice dev);
+
+/**
+ * \brief Destroy a CUDA context
+ *
+ * Destroys the CUDA context specified by \p ctx.  The context \p ctx will be
+ * destroyed regardless of how many threads it is current to.
+ * It is the responsibility of the calling function to ensure that no API
+ * call is issued using \p ctx while ::cuCtxDestroy() is executing.
+ *
+ * Destroys and cleans up all resources associated with the context.
+ * It is the caller's responsibility to ensure that the context or its resources
+ * are not accessed or passed in subsequent API calls and doing so will result in undefined behavior.
+ * These resources include CUDA types ::CUmodule, ::CUfunction, ::CUstream, ::CUevent,
+ * ::CUarray, ::CUmipmappedArray, ::CUtexObject, ::CUsurfObject, ::CUtexref, ::CUsurfref,
+ * ::CUgraphicsResource, ::CUlinkState, ::CUexternalMemory and ::CUexternalSemaphore.
+ * These resources also include memory allocations by ::cuMemAlloc(), ::cuMemAllocHost(),
+ * ::cuMemAllocManaged() and ::cuMemAllocPitch().
+ *
+ * If \p ctx is current to the calling thread then \p ctx will also be
+ * popped from the current thread's context stack (as though ::cuCtxPopCurrent()
+ * were called).  If \p ctx is current to other threads, then \p ctx will
+ * remain current to those threads, and attempting to access \p ctx from
+ * those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED.
+ *
+ * \note ::cuCtxDestroy() will not destroy memory allocations by ::cuMemCreate(), ::cuMemAllocAsync() and
+ * ::cuMemAllocFromPoolAsync(). These memory allocations are not associated with any CUDA context and need to
+ * be destroyed explicitly.
+ *
+ * \param ctx - Context to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuCtxDestroy(CUcontext ctx);
+
+/**
+ * \brief Pushes a context on the current CPU thread
+ *
+ * Pushes the given context \p ctx onto the CPU thread's stack of current
+ * contexts. The specified context becomes the CPU thread's current context, so
+ * all CUDA functions that operate on the current context are affected.
+ *
+ * The previous current context may be made current again by calling
+ * ::cuCtxDestroy() or ::cuCtxPopCurrent().
+ *
+ * \param ctx - Context to push
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx);
+
+/**
+ * \brief Pops the current CUDA context from the current CPU thread.
+ *
+ * Pops the current CUDA context from the CPU thread and passes back the
+ * old context handle in \p *pctx. That context may then be made current
+ * to a different CPU thread by calling ::cuCtxPushCurrent().
+ *
+ * If a context was current to the CPU thread before ::cuCtxCreate() or
+ * ::cuCtxPushCurrent() was called, this function makes that context current to
+ * the CPU thread again.
+ *
+ * \param pctx - Returned popped context handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx);
+
+/**
+ * \brief Binds the specified CUDA context to the calling CPU thread
+ *
+ * Binds the specified CUDA context to the calling CPU thread.
+ * If \p ctx is NULL then the CUDA context previously bound to the
+ * calling CPU thread is unbound and ::CUDA_SUCCESS is returned.
+ *
+ * If there exists a CUDA context stack on the calling CPU thread, this
+ * will replace the top of that stack with \p ctx.
+ * If \p ctx is NULL then this will be equivalent to popping the top
+ * of the calling CPU thread's CUDA context stack (or a no-op if the
+ * calling CPU thread's CUDA context stack is empty).
+ *
+ * \param ctx - Context to bind to the calling CPU thread
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa
+ * ::cuCtxGetCurrent,
+ * ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cudaSetDevice
+ */
+CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx);
+
+/**
+ * \brief Returns the CUDA context bound to the calling CPU thread.
+ *
+ * Returns in \p *pctx the CUDA context bound to the calling CPU thread.
+ * If no context is bound to the calling CPU thread then \p *pctx is
+ * set to NULL and ::CUDA_SUCCESS is returned.
+ *
+ * \param pctx - Returned context handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * \notefnerr
+ *
+ * \sa
+ * ::cuCtxSetCurrent,
+ * ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cudaGetDevice
+ */
+CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx);
+
+/**
+ * \brief Returns the device handle for the current context
+ *
+ * Returns in \p *device the handle of the current context's device.
+ *
+ * \param device - Returned device handle for the current context
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cudaGetDevice
+ */
+CUresult CUDAAPI cuCtxGetDevice(CUdevice *device);
+
+/**
+ * \brief Returns the flags for the current context
+ *
+ * Returns in \p *flags the flags of the current context. See ::cuCtxCreate
+ * for flag values.
+ *
+ * \param flags - Pointer to store flags of current context
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetCurrent,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetLimit,
+ * ::cuCtxGetSharedMemConfig,
+ * ::cuCtxGetStreamPriorityRange,
+ * ::cuCtxSetFlags,
+ * ::cudaGetDeviceFlags
+ */
+CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags);
+
+/**
+ * \brief Sets the flags for the current context
+ *
+ * Sets the flags for the current context overwriting previously set ones. See
+ * ::cuDevicePrimaryCtxSetFlags for flag values.
+ *
+ * \param flags - Flags to set on the current context
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetCurrent,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetLimit,
+ * ::cuCtxGetSharedMemConfig,
+ * ::cuCtxGetStreamPriorityRange,
+ * ::cuCtxGetFlags,
+ * ::cudaGetDeviceFlags,
+ * ::cuDevicePrimaryCtxSetFlags
+ */
+CUresult CUDAAPI cuCtxSetFlags(unsigned int flags);
+
+/**
+ * \brief Returns the unique Id associated with the context supplied
+ *
+ * Returns in \p *ctxId the unique Id which is associated with a given context.
+ * The Id is unique for the life of the program for this instance of CUDA.
+ * If context is supplied as NULL and there is one current, the Id of the
+ * current context is returned.
+ *
+ * \param ctx - Context for which to obtain the Id
+ * \param ctxId - Pointer to store the Id of the context
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPushCurrent
+ */
+CUresult CUDAAPI cuCtxGetId(CUcontext ctx, unsigned long long *ctxId);
+
+/**
+ * \brief Block for the current context's tasks to complete
+ *
+ * Blocks until the current context has completed all preceding requested tasks.
+ * If the current context is the primary context, green contexts that have been
+ * created will also be synchronized.
+ * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed.
+ * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the
+ * CPU thread will block until the GPU context has finished its work.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cudaDeviceSynchronize
+ */
+CUresult CUDAAPI cuCtxSynchronize(void);
+
+/**
+ * \brief Set resource limits
+ *
+ * Setting \p limit to \p value is a request by the application to update
+ * the current limit maintained by the context. The driver is free to
+ * modify the requested value to meet h/w requirements (this could be
+ * clamping to minimum or maximum values, rounding up to nearest element
+ * size, etc). The application can use ::cuCtxGetLimit() to find out exactly
+ * what the limit has been set to.
+ *
+ * Setting each ::CUlimit has its own specific restrictions, so each is
+ * discussed here.
+ *
+ * - ::CU_LIMIT_STACK_SIZE controls the stack size in bytes of each GPU thread.
+ *   The driver automatically increases the per-thread stack size
+ *   for each kernel launch as needed. This size isn't reset back to the
+ *   original value after each launch. Setting this value will take effect 
+ *   immediately, and if necessary, the device will block until all preceding 
+ *   requested tasks are complete.
+ *
+ * - ::CU_LIMIT_PRINTF_FIFO_SIZE controls the size in bytes of the FIFO used
+ *   by the ::printf() device system call. Setting ::CU_LIMIT_PRINTF_FIFO_SIZE
+ *   must be performed before launching any kernel that uses the ::printf()
+ *   device system call, otherwise ::CUDA_ERROR_INVALID_VALUE will be returned.
+ *
+ * - ::CU_LIMIT_MALLOC_HEAP_SIZE controls the size in bytes of the heap used
+ *   by the ::malloc() and ::free() device system calls. Setting
+ *   ::CU_LIMIT_MALLOC_HEAP_SIZE must be performed before launching any kernel
+ *   that uses the ::malloc() or ::free() device system calls, otherwise
+ *   ::CUDA_ERROR_INVALID_VALUE will be returned.
+ *
+ * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH controls the maximum nesting depth of
+ *   a grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting
+ *   this limit must be performed before any launch of a kernel that uses the
+ *   device runtime and calls ::cudaDeviceSynchronize() above the default sync
+ *   depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail
+ *   with error code ::cudaErrorSyncDepthExceeded if the limitation is
+ *   violated. This limit can be set smaller than the default or up to the maximum
+ *   launch depth of 24. When setting this limit, keep in mind that additional
+ *   levels of sync depth require the driver to reserve large amounts of device
+ *   memory which can no longer be used for user allocations. If these
+ *   reservations of device memory fail, ::cuCtxSetLimit() will return
+ *   ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value.
+ *   This limit is only applicable to devices of compute capability < 9.0.
+ *   Attempting to set this limit on devices of other compute capability
+ *   versions will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being
+ *   returned.
+ *
+ * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT controls the maximum number of
+ *   outstanding device runtime launches that can be made from the current
+ *   context. A grid is outstanding from the point of launch up until the grid
+ *   is known to have been completed. Device runtime launches which violate
+ *   this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when
+ *   ::cudaGetLastError() is called after launch. If more pending launches than
+ *   the default (2048 launches) are needed for a module using the device
+ *   runtime, this limit can be increased. Keep in mind that being able to
+ *   sustain additional pending launches will require the driver to reserve
+ *   larger amounts of device memory upfront which can no longer be used for
+ *   allocations. If these reservations fail, ::cuCtxSetLimit() will return
+ *   ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value.
+ *   This limit is only applicable to devices of compute capability 3.5 and
+ *   higher. Attempting to set this limit on devices of compute capability less
+ *   than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being
+ *   returned.
+ *
+ * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY controls the L2 cache fetch granularity.
+ *   Values can range from 0B to 128B. This is purely a performance hint and
+ *   it can be ignored or clamped depending on the platform.
+ *
+ * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE controls size in bytes available for
+ *   persisting L2 cache. This is purely a performance hint and it can be
+ *   ignored or clamped depending on the platform.
+ *
+ * \param limit - Limit to set
+ * \param value - Size of limit
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNSUPPORTED_LIMIT,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSynchronize,
+ * ::cudaDeviceSetLimit
+ */
+CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value);
+
+/**
+ * \brief Returns resource limits
+ *
+ * Returns in \p *pvalue the current size of \p limit.  The supported
+ * ::CUlimit values are:
+ * - ::CU_LIMIT_STACK_SIZE: stack size in bytes of each GPU thread.
+ * - ::CU_LIMIT_PRINTF_FIFO_SIZE: size in bytes of the FIFO used by the
+ *   ::printf() device system call.
+ * - ::CU_LIMIT_MALLOC_HEAP_SIZE: size in bytes of the heap used by the
+ *   ::malloc() and ::free() device system calls.
+ * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: maximum grid depth at which a thread
+ *   can issue the device runtime call ::cudaDeviceSynchronize() to wait on
+ *   child grid launches to complete.
+ * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: maximum number of outstanding
+ *   device runtime launches that can be made from this context.
+ * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY: L2 cache fetch granularity.
+ * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE: Persisting L2 cache size in bytes
+ *
+ * \param limit  - Limit to query
+ * \param pvalue - Returned size of limit
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNSUPPORTED_LIMIT
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cudaDeviceGetLimit
+ */
+CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit);
+
+/**
+ * \brief Returns the preferred cache configuration for the current context.
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this function returns through \p pconfig the preferred cache configuration
+ * for the current context. This is only a preference. The driver will use
+ * the requested configuration if possible, but it is free to choose a different
+ * configuration if required to execute functions.
+ *
+ * This will return a \p pconfig of ::CU_FUNC_CACHE_PREFER_NONE on devices
+ * where the size of the L1 cache and shared memory are fixed.
+ *
+ * The supported cache configurations are:
+ * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
+ * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
+ * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
+ * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
+ *
+ * \param pconfig - Returned cache configuration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cuFuncSetCacheConfig,
+ * ::cudaDeviceGetCacheConfig
+ */
+CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig);
+
+/**
+ * \brief Sets the preferred cache configuration for the current context.
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p config the preferred cache configuration for
+ * the current context. This is only a preference. The driver will use
+ * the requested configuration if possible, but it is free to choose a different
+ * configuration if required to execute the function. Any function preference
+ * set via ::cuFuncSetCacheConfig() or ::cuKernelSetCacheConfig() will be preferred over this context-wide
+ * setting. Setting the context-wide cache configuration to
+ * ::CU_FUNC_CACHE_PREFER_NONE will cause subsequent kernel launches to prefer
+ * to not change the cache configuration unless required to launch the kernel.
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ * The supported cache configurations are:
+ * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
+ * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
+ * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
+ * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
+ *
+ * \param config - Requested cache configuration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cuFuncSetCacheConfig,
+ * ::cudaDeviceSetCacheConfig,
+ * ::cuKernelSetCacheConfig
+ */
+CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config);
+
+/**
+ * \brief Gets the context's API version.
+ *
+ * Returns a version number in \p version corresponding to the capabilities of
+ * the context (e.g. 3010 or 3020), which library developers can use to direct
+ * callers to a specific API version. If \p ctx is NULL, returns the API version
+ * used to create the currently bound context.
+ *
+ * Note that new API versions are only introduced when context capabilities are
+ * changed that break binary compatibility, so the API version and driver version
+ * may be different. For example, it is valid for the API version to be 3020 while
+ * the driver version is 4020.
+ *
+ * \param ctx     - Context to check
+ * \param version - Pointer to version
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version);
+
+/**
+ * \brief Returns numerical values that correspond to the least and
+ * greatest stream priorities.
+ *
+ * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond
+ * to the least and greatest stream priorities respectively. Stream priorities
+ * follow a convention where lower numbers imply greater priorities. The range of
+ * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority].
+ * If the user attempts to create a stream with a priority value that is
+ * outside the meaningful range as specified by this API, the priority is
+ * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority
+ * respectively. See ::cuStreamCreateWithPriority for details on creating a
+ * priority stream.
+ * NULL may be passed in for \p leastPriority or \p greatestPriority if the value
+ * is not desired.
+ *
+ * This function will return '0' in both \p *leastPriority and \p *greatestPriority if
+ * the current context's device does not support stream priorities
+ * (see ::cuDeviceGetAttribute).
+ *
+ * \param leastPriority    - Pointer to an int in which the numerical value for least
+ *                           stream priority is returned
+ * \param greatestPriority - Pointer to an int in which the numerical value for greatest
+ *                           stream priority is returned
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreateWithPriority,
+ * ::cuStreamGetPriority,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cudaDeviceGetStreamPriorityRange
+ */
+CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPriority);
+
+/**
+ * \brief Resets all persisting lines in cache to normal status.
+ *
+ * ::cuCtxResetPersistingL2Cache resets all persisting lines in cache to normal
+ * status. Takes effect on function return.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuCtxResetPersistingL2Cache(void);
+
+/**
+ * \brief Returns the execution affinity setting for the current context.
+ *
+ * Returns in \p *pExecAffinity the current value of \p type. The supported
+ * ::CUexecAffinityType values are:
+ * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: number of SMs the context is limited to use.
+ *
+ * \param type          - Execution affinity type to query
+ * \param pExecAffinity - Returned execution affinity
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY
+ * \notefnerr
+ *
+ * \sa
+ * ::CUexecAffinityParam
+ */
+CUresult CUDAAPI cuCtxGetExecAffinity(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type);
+
+/**
+ * \brief Records an event.
+ *
+ * Captures in \p hEvent all the activities of the context \p hCtx
+ * at the time of this call. \p hEvent and \p hCtx must be from the same
+ * CUDA context, otherwise ::CUDA_ERROR_INVALID_HANDLE will be returned.
+ * Calls such as ::cuEventQuery() or ::cuCtxWaitEvent() will then examine
+ * or wait for completion of the work that was captured.
+ * Uses of \p hCtx after this call do not modify \p hEvent.
+ * If the context passed to \p hCtx is the primary context, \p hEvent will
+ * capture all the activities of the primary context and its green contexts.
+ * If the context passed to \p hCtx is a context converted from green context
+ * via ::cuCtxFromGreenCtx(), \p hEvent will capture only the activities of the green context.
+ *
+ * \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED if the
+ * specified context \p hCtx has a stream in the capture mode. In such a case,
+ * the call will invalidate all the conflicting captures.
+ * 
+ * \param hCtx - Context to record event for
+ * \param hEvent - Event to record
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
+ *
+ * \sa
+ * ::cuCtxWaitEvent,
+ * ::cuGreenCtxRecordEvent,
+ * ::cuGreenCtxWaitEvent,
+ * ::cuEventRecord
+ */
+CUresult CUDAAPI cuCtxRecordEvent(CUcontext hCtx, CUevent hEvent);
+
+/**
+ * \brief Make a context wait on an event
+ *
+ * Makes all future work submitted to context \p hCtx wait for all work
+ * captured in \p hEvent. The synchronization will be performed on the device
+ * and will not block the calling CPU thread. See ::cuCtxRecordEvent()
+ * for details on what is captured by an event.
+ * If the context passed to \p hCtx is the primary context, the primary context
+ * and its green contexts will wait for \p hEvent.
+ * If the context passed to \p hCtx is a context converted from green context
+ * via ::cuCtxFromGreenCtx(), the green context will wait for \p hEvent.
+ *
+ * \note \p hEvent may be from a different context or device than \p hCtx.
+ *
+ * \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED and
+ * invalidate the capture if the specified event \p hEvent is part of an ongoing
+ * capture sequence or if the specified context \p hCtx has a stream in the capture mode.
+ *
+ * \param hCtx    - Context to wait
+ * \param hEvent  - Event to wait on
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
+ *
+ * \sa
+ * ::cuCtxRecordEvent,
+ * ::cuGreenCtxRecordEvent,
+ * ::cuGreenCtxWaitEvent,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuCtxWaitEvent(CUcontext hCtx, CUevent hEvent);
+
+/** @} */ /* END CUDA_CTX */
+
+/**
+ * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated context management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the deprecated context management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Increment a context's usage-count
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated and should not be used.
+ *
+ * Increments the usage count of the context and passes back a context handle
+ * in \p *pctx that must be passed to ::cuCtxDetach() when the application is
+ * done with the context. ::cuCtxAttach() fails if there is no context current
+ * to the thread.
+ *
+ * Currently, the \p flags parameter must be 0.
+ *
+ * \param pctx  - Returned context handle of the current context
+ * \param flags - Context attach flags (must be 0)
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxDetach,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags);
+
+/**
+ * \brief Decrement a context's usage-count
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated and should not be used.
+ *
+ * Decrements the usage count of the context \p ctx, and destroys the context
+ * if the usage count goes to 0. The context must be a handle that was passed
+ * back by ::cuCtxCreate() or ::cuCtxAttach(), and must be current to the
+ * calling thread.
+ *
+ * \param ctx - Context to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx);
+
+
+/**
+ * \brief Returns the current shared memory configuration for the current context.
+ *
+ * \deprecated
+ *
+ * This function will return in \p pConfig the current size of shared memory banks
+ * in the current context. On devices with configurable shared memory banks,
+ * ::cuCtxSetSharedMemConfig can be used to change this setting, so that all
+ * subsequent kernel launches will by default use the new bank size. When
+ * ::cuCtxGetSharedMemConfig is called on devices without configurable shared
+ * memory, it will return the fixed bank size of the hardware.
+ *
+ * The returned bank configurations can be either:
+ * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE:  shared memory bank width is
+ *   four bytes.
+ * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: shared memory bank width is
+ *   eight bytes.
+ *
+ * \param pConfig - returned shared memory configuration
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cuCtxSetSharedMemConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cudaDeviceGetSharedMemConfig
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig);
+
+/**
+ * \brief Sets the shared memory configuration for the current context.
+ *
+ * \deprecated
+ *
+ * On devices with configurable shared memory banks, this function will set
+ * the context's shared memory bank size which is used for subsequent kernel
+ * launches.
+ *
+ * Changing the shared memory configuration between launches may insert a device
+ * side synchronization point between those launches.
+ *
+ * Changing the shared memory bank size will not increase shared memory usage
+ * or affect occupancy of kernels, but may have major effects on performance.
+ * Larger bank sizes will allow for greater potential bandwidth to shared memory,
+ * but will change what kinds of accesses to shared memory will result in bank
+ * conflicts.
+ *
+ * This function will do nothing on devices with fixed shared memory bank size.
+ *
+ * The supported bank configurations are:
+ * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: set bank width to the default initial
+ *   setting (currently, four bytes).
+ * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to
+ *   be natively four bytes.
+ * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to
+ *   be natively eight bytes.
+ *
+ * \param config - requested shared memory configuration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cuCtxGetSharedMemConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cudaDeviceSetSharedMemConfig
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config);
+
+/** @} */ /* END CUDA_CTX_DEPRECATED */
+
+
+/**
+ * \defgroup CUDA_MODULE Module Management
+ *
+ * ___MANBRIEF___ module management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the module management functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Loads a compute module
+ *
+ * Takes a filename \p fname and loads the corresponding module \p module into
+ * the current context. The CUDA driver API does not attempt to lazily
+ * allocate the resources needed by a module; if the memory for functions and
+ * data (constant and global) needed by the module cannot be allocated,
+ * ::cuModuleLoad() fails. The file should be a \e cubin file as output by
+ * \b nvcc, or a \e PTX file either as output by \b nvcc or handwritten, or
+ * a \e fatbin file as output by \b nvcc from toolchain 4.0 or later.
+ *
+ * \param module - Returned module
+ * \param fname  - Filename of module to load
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ * ::CUDA_ERROR_NOT_FOUND,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_FILE_NOT_FOUND,
+ * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
+ * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa ::cuModuleGetFunction,
+ * ::cuModuleGetGlobal,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleLoadFatBinary,
+ * ::cuModuleUnload
+ */
+CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname);
+
+/**
+ * \brief Load a module's data
+ *
+ * Takes a pointer \p image and loads the corresponding module \p module into
+ * the current context. The \p image may be a \e cubin or \e fatbin
+ * as output by \b nvcc, or a NULL-terminated \e PTX, either as output by \b nvcc
+ * or hand-written.
+ *
+ * \param module - Returned module
+ * \param image  - Module data to load
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
+ * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa ::cuModuleGetFunction,
+ * ::cuModuleGetGlobal,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleLoadFatBinary,
+ * ::cuModuleUnload
+ */
+CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image);
+
+/**
+ * \brief Load a module's data with options
+ *
+ * Takes a pointer \p image and loads the corresponding module \p module into
+ * the current context. The \p image may be a \e cubin or \e fatbin
+ * as output by \b nvcc, or a NULL-terminated \e PTX, either as output by \b nvcc
+ * or hand-written.
+ *
+ * \param module       - Returned module
+ * \param image        - Module data to load
+ * \param numOptions   - Number of options
+ * \param options      - Options for JIT
+ * \param optionValues - Option values for JIT
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
+ * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa ::cuModuleGetFunction,
+ * ::cuModuleGetGlobal,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadFatBinary,
+ * ::cuModuleUnload
+ */
+CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
+
+/**
+ * \brief Load a module's data from a fat binary
+ *
+ * Takes a pointer \p fatCubin and loads the corresponding module \p module
+ * into the current context. The pointer represents a <i>fat binary</i> object,
+ * which is a collection of different \e cubin and/or \e PTX files, all
+ * representing the same device code, but compiled and optimized for different
+ * architectures.
+ *
+ * Prior to CUDA 4.0, there was no documented API for constructing and using
+ * fat binary objects by programmers.  Starting with CUDA 4.0, fat binary
+ * objects can be constructed by providing the <i>-fatbin option</i> to \b nvcc.
+ * More information can be found in the \b nvcc document.
+ *
+ * \param module   - Returned module
+ * \param fatCubin - Fat binary to load
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ * ::CUDA_ERROR_NOT_FOUND,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
+ * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa ::cuModuleGetFunction,
+ * ::cuModuleGetGlobal,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleUnload
+ */
+CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
+
+/**
+ * \brief Unloads a module
+ *
+ * Unloads a module \p hmod from the current context. Attempting to unload
+ * a module which was obtained from the Library Management API such as
+ * ::cuLibraryGetModule will return ::CUDA_ERROR_NOT_PERMITTED.
+ *
+ * \param hmod - Module to unload
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_PERMITTED
+ * \notefnerr
+ * \note_destroy_ub
+ *
+ * \sa ::cuModuleGetFunction,
+ * ::cuModuleGetGlobal,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleLoadFatBinary
+ */
+CUresult CUDAAPI cuModuleUnload(CUmodule hmod);
+
+/**
+ * CUDA Lazy Loading status: the module loading mode reported by
+ * ::cuModuleGetLoadingMode.
+ */
+typedef enum CUmoduleLoadingMode_enum {
+    CU_MODULE_EAGER_LOADING = 0x1, /**< Lazy Kernel Loading is not enabled */
+    CU_MODULE_LAZY_LOADING  = 0x2, /**< Lazy Kernel Loading is enabled */
+} CUmoduleLoadingMode;
+
+/**
+ * \brief Query lazy loading mode
+ *
+ * Returns the lazy loading mode.
+ * The module loading mode is controlled by the CUDA_MODULE_LOADING environment variable.
+ *
+ * \param mode      - Returns the lazy loading mode
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuModuleLoad
+ */
+CUresult CUDAAPI cuModuleGetLoadingMode(CUmoduleLoadingMode *mode);
+
+/**
+ * \brief Returns a function handle
+ *
+ * Returns in \p *hfunc the handle of the function of name \p name located in
+ * module \p hmod. If no function of that name exists, ::cuModuleGetFunction()
+ * returns ::CUDA_ERROR_NOT_FOUND.
+ *
+ * \param hfunc - Returned function handle
+ * \param hmod  - Module to retrieve function from
+ * \param name  - Name of function to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa ::cuModuleGetGlobal,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleLoadFatBinary,
+ * ::cuModuleUnload
+ */
+CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
+
+/**
+ * \brief Returns the number of functions within a module
+ *
+ * Returns in \p count the number of functions in \p mod.
+ *
+ * \param count - Number of functions found within the module
+ * \param mod - Module to query
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ */
+CUresult CUDAAPI cuModuleGetFunctionCount(unsigned int *count, CUmodule mod);
+
+/**
+ * \brief Returns the function handles within a module.
+ *
+ * Returns in \p functions a maximum number of \p numFunctions function handles within \p mod. When
+ * function loading mode is set to LAZY the function retrieved may be partially loaded. The loading
+ * state of a function can be queried using ::cuFuncIsLoaded. CUDA APIs may load the function
+ * automatically when called with a partially loaded function handle, which may incur additional
+ * latency. Alternatively, ::cuFuncLoad can be used to explicitly load a function. The returned
+ * function handles become invalid when the module is unloaded.
+ *
+ * \param functions - Buffer where the function handles are returned to
+ * \param numFunctions - Maximum number of function handles that may be returned to the buffer
+ * \param mod - Module to query from
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuModuleGetFunction,
+ * ::cuModuleGetFunctionCount,
+ * ::cuFuncIsLoaded,
+ * ::cuFuncLoad
+ */
+CUresult CUDAAPI cuModuleEnumerateFunctions(CUfunction *functions, unsigned int numFunctions, CUmodule mod);
+
+/**
+ * \brief Returns a global pointer from a module
+ *
+ * Returns in \p *dptr and \p *bytes the base pointer and size of the
+ * global of name \p name located in module \p hmod. If no variable of that name
+ * exists, ::cuModuleGetGlobal() returns ::CUDA_ERROR_NOT_FOUND.
+ * One of the parameters \p dptr or \p bytes (not both) can be NULL in which
+ * case it is ignored.
+ *
+ * \param dptr  - Returned global device pointer
+ * \param bytes - Returned global size in bytes
+ * \param hmod  - Module to retrieve global from
+ * \param name  - Name of global to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa ::cuModuleGetFunction,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleLoadFatBinary,
+ * ::cuModuleUnload,
+ * ::cudaGetSymbolAddress,
+ * ::cudaGetSymbolSize
+ */
+CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name);
+
+/**
+ * \brief Creates a pending JIT linker invocation.
+ *
+ * If the call is successful, the caller owns the returned CUlinkState, which
+ * should eventually be destroyed with ::cuLinkDestroy.  The
+ * device code machine size (32 or 64 bit) will match the calling application.
+ *
+ * Both linker and compiler options may be specified.  Compiler options will
+ * be applied to inputs to this linker action which must be compiled from PTX.
+ * The options ::CU_JIT_WALL_TIME,
+ * ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
+ * will accumulate data until the CUlinkState is destroyed.
+ *
+ * The data passed in via ::cuLinkAddData and ::cuLinkAddFile will be treated
+ * as relocatable (-rdc=true to nvcc) when linking the final cubin during 
+ * ::cuLinkComplete and will have similar consequences as offline relocatable 
+ * device code linking.
+ *
+ * \p optionValues must remain valid for the life of the CUlinkState if output
+ * options are used.  No other references to inputs are maintained after this
+ * call returns.
+ *
+ * \note For LTO-IR input, only LTO-IR compiled with toolkits prior to CUDA 12.0 will be accepted
+ *
+ * \param numOptions   Size of options arrays
+ * \param options      Array of linker and compiler options
+ * \param optionValues Array of option values, each cast to void *
+ * \param stateOut     On success, this will contain a CUlinkState to specify
+ *                     and complete this action
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa ::cuLinkAddData,
+ * ::cuLinkAddFile,
+ * ::cuLinkComplete,
+ * ::cuLinkDestroy
+ */
+CUresult CUDAAPI
+cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
+
+/**
+ * \brief Add an input to a pending linker invocation
+ *
+ * Ownership of \p data is retained by the caller.  No reference is retained to any
+ * inputs after this call returns.
+ *
+ * This method accepts only compiler options, which are used if the data must
+ * be compiled from PTX, and does not accept any of
+ * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER,
+ * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET.
+ *
+ * \note For LTO-IR input, only LTO-IR compiled with toolkits prior to CUDA 12.0 will be accepted
+ *
+ * \param state        A pending linker action.
+ * \param type         The type of the input data.
+ * \param data         The input data.  PTX must be NULL-terminated.
+ * \param size         The length of the input data.
+ * \param name         An optional name for this input in log messages.
+ * \param numOptions   Size of options.
+ * \param options      Options to be applied only for this input (overrides options from ::cuLinkCreate).
+ * \param optionValues Array of option values, each cast to void *.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_IMAGE,
+ * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NO_BINARY_FOR_GPU
+ *
+ * \sa ::cuLinkCreate,
+ * ::cuLinkAddFile,
+ * ::cuLinkComplete,
+ * ::cuLinkDestroy
+ */
+CUresult CUDAAPI
+cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name,
+    unsigned int numOptions, CUjit_option *options, void **optionValues);
+
+/**
+ * \brief Add a file input to a pending linker invocation
+ *
+ * No reference is retained to any inputs after this call returns.
+ *
+ * This method accepts only compiler options, which are used if the input
+ * must be compiled from PTX, and does not accept any of
+ * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER,
+ * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET.
+ *
+ * This method is equivalent to invoking ::cuLinkAddData on the contents
+ * of the file.
+ *
+ * \note For LTO-IR input, only LTO-IR compiled with toolkits prior to CUDA 12.0 will be accepted
+ *
+ * \param state        A pending linker action
+ * \param type         The type of the input data
+ * \param path         Path to the input file
+ * \param numOptions   Size of options
+ * \param options      Options to be applied only for this input (overrides options from ::cuLinkCreate)
+ * \param optionValues Array of option values, each cast to void *
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_FILE_NOT_FOUND,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_IMAGE,
+ * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NO_BINARY_FOR_GPU
+ *
+ * \sa ::cuLinkCreate,
+ * ::cuLinkAddData,
+ * ::cuLinkComplete,
+ * ::cuLinkDestroy
+ */
+CUresult CUDAAPI
+cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path,
+    unsigned int numOptions, CUjit_option *options, void **optionValues);
+
+/**
+ * \brief Complete a pending linker invocation
+ *
+ * Completes the pending linker action and returns the cubin image for the linked
+ * device code, which can be used with ::cuModuleLoadData.  The cubin is owned by
+ * \p state, so it should be loaded before \p state is destroyed via ::cuLinkDestroy.
+ * This call does not destroy \p state.
+ *
+ * \param state    A pending linker invocation
+ * \param cubinOut On success, this will point to the output image
+ * \param sizeOut  Optional parameter to receive the size of the generated image
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuLinkCreate,
+ * ::cuLinkAddData,
+ * ::cuLinkAddFile,
+ * ::cuLinkDestroy,
+ * ::cuModuleLoadData
+ */
+CUresult CUDAAPI
+cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut);
+
+/**
+ * \brief Destroys state for a JIT linker invocation.
+ *
+ * \param state State object for the linker invocation
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ *
+ * \sa ::cuLinkCreate
+ */
+CUresult CUDAAPI
+cuLinkDestroy(CUlinkState state);
+
+/** @} */ /* END CUDA_MODULE */
+
+/**
+ * \defgroup CUDA_MODULE_DEPRECATED Module Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated module management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the deprecated module management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns a handle to a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *pTexRef the handle of the texture reference of name \p name
+ * in the module \p hmod. If no texture reference of that name exists,
+ * ::cuModuleGetTexRef() returns ::CUDA_ERROR_NOT_FOUND. This texture reference
+ * handle should not be destroyed, since it will be destroyed when the module
+ * is unloaded.
+ *
+ * \param pTexRef  - Returned texture reference
+ * \param hmod     - Module to retrieve texture reference from
+ * \param name     - Name of texture reference to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa
+ * ::cuModuleGetFunction,
+ * ::cuModuleGetGlobal,
+ * ::cuModuleGetSurfRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleLoadFatBinary,
+ * ::cuModuleUnload
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name);
+
+/**
+ * \brief Returns a handle to a surface reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *pSurfRef the handle of the surface reference of name \p name
+ * in the module \p hmod. If no surface reference of that name exists,
+ * ::cuModuleGetSurfRef() returns ::CUDA_ERROR_NOT_FOUND.
+ *
+ * \param pSurfRef  - Returned surface reference
+ * \param hmod     - Module to retrieve surface reference from
+ * \param name     - Name of surface reference to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa
+ * ::cuModuleGetFunction,
+ * ::cuModuleGetGlobal,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleLoadFatBinary,
+ * ::cuModuleUnload
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
+
+/** @} */ /* END CUDA_MODULE_DEPRECATED */
+
+/**
+ * \defgroup CUDA_LIBRARY Library Management
+ *
+ * ___MANBRIEF___ library management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the library management functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Load a library with specified code and options
+ *
+ * Takes a pointer \p code and loads the corresponding library \p library based on
+ * the application defined library loading mode:
+ * - If module loading is set to EAGER, via the environment variables described in "Module loading",
+ *   \p library is loaded eagerly into all contexts at the time of the call and future contexts
+ *   at the time of creation until the library is unloaded with ::cuLibraryUnload().
+ * - If the environment variables are set to LAZY, \p library
+ *   is not immediately loaded onto all existent contexts and will only be
+ *   loaded when a function is needed for that context, such as a kernel launch.
+ *
+ * These environment variables are described in the CUDA programming guide under the 
+ * "CUDA environment variables" section.
+ *
+ * The \p code may be a \e cubin or \e fatbin as output by \b nvcc,
+ * or a NULL-terminated \e PTX, either as output by \b nvcc or hand-written.
+ * A fatbin should also contain relocatable code when doing separate compilation.
+ *
+ * Options are passed as an array via \p jitOptions and any corresponding parameters are passed in
+ * \p jitOptionsValues. The number of total JIT options is supplied via \p numJitOptions.
+ * Any outputs will be returned via \p jitOptionsValues.
+ *
+ * Library load options are passed as an array via \p libraryOptions and any corresponding parameters are passed in
+ * \p libraryOptionValues. The number of total library load options is supplied via \p numLibraryOptions.
+ *
+ * \note If the library contains managed variables and no device in the system
+ * supports managed variables this call is expected to return ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \param library             - Returned library
+ * \param code                - Code to load
+ * \param jitOptions          - Options for JIT
+ * \param jitOptionsValues    - Option values for JIT
+ * \param numJitOptions       - Number of options
+ * \param libraryOptions      - Options for loading
+ * \param libraryOptionValues - Option values for loading
+ * \param numLibraryOptions   - Number of options for loading
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
+ * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \sa ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx
+ */
+CUresult CUDAAPI cuLibraryLoadData(CUlibrary *library, const void *code,
+                                   CUjit_option *jitOptions, void **jitOptionsValues, unsigned int numJitOptions,
+                                   CUlibraryOption *libraryOptions, void** libraryOptionValues, unsigned int numLibraryOptions);
+
+/**
+ * \brief Load a library with specified file and options
+ *
+ * Takes a file name \p fileName and loads the corresponding library \p library based on
+ * the application defined library loading mode:
+ * - If module loading is set to EAGER, via the environment variables described in "Module loading",
+ *   \p library is loaded eagerly into all contexts at the time of the call and future contexts
+ *   at the time of creation until the library is unloaded with ::cuLibraryUnload().
+ * - If the environment variables are set to LAZY, \p library
+ *   is not immediately loaded onto all existent contexts and will only be
+ *   loaded when a function is needed for that context, such as a kernel launch.
+ *
+ * These environment variables are described in the CUDA programming guide under the 
+ * "CUDA environment variables" section.
+ *
+ * The file should be a \e cubin file as output by \b nvcc, or a \e PTX file either
+ * as output by \b nvcc or handwritten, or a \e fatbin file as output by \b nvcc.
+ * A fatbin should also contain relocatable code when doing separate compilation.
+ *
+ * Options are passed as an array via \p jitOptions and any corresponding parameters are
+ * passed in \p jitOptionsValues. The number of total options is supplied via \p numJitOptions.
+ * Any outputs will be returned via \p jitOptionsValues.
+ *
+ * Library load options are passed as an array via \p libraryOptions and any corresponding parameters are passed in
+ * \p libraryOptionValues. The number of total library load options is supplied via \p numLibraryOptions.
+ *
+ * \note If the library contains managed variables and no device in the system
+ * supports managed variables this call is expected to return ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \param library             - Returned library
+ * \param fileName            - File to load from
+ * \param jitOptions          - Options for JIT
+ * \param jitOptionsValues    - Option values for JIT
+ * \param numJitOptions       - Number of options
+ * \param libraryOptions      - Options for loading
+ * \param libraryOptionValues - Option values for loading
+ * \param numLibraryOptions   - Number of options for loading
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
+ * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryUnload,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx
+ */
+CUresult CUDAAPI cuLibraryLoadFromFile(CUlibrary *library, const char *fileName,
+                                       CUjit_option *jitOptions, void **jitOptionsValues, unsigned int numJitOptions,
+                                       CUlibraryOption *libraryOptions, void **libraryOptionValues, unsigned int numLibraryOptions);
+
+/**
+ * \brief Unloads a library
+ *
+ * Unloads the library specified with \p library
+ *
+ * \param library - Library to unload
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuModuleUnload
+ */
+CUresult CUDAAPI cuLibraryUnload(CUlibrary library);
+
+/**
+ * \brief Returns a kernel handle
+ *
+ * Returns in \p pKernel the handle of the kernel with name \p name located in library \p library.
+ * If kernel handle is not found, the call returns ::CUDA_ERROR_NOT_FOUND.
+ *
+ * \param pKernel - Returned kernel handle
+ * \param library - Library to retrieve kernel from
+ * \param name - Name of kernel to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_FOUND
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload,
+ * ::cuKernelGetFunction,
+ * ::cuLibraryGetModule,
+ * ::cuModuleGetFunction
+ */
+CUresult CUDAAPI cuLibraryGetKernel(CUkernel *pKernel, CUlibrary library, const char *name);
+
+/**
+ * \brief Returns the number of kernels within a library
+ *
+ * Returns in \p count the number of kernels in \p lib.
+ *
+ * \param count - Number of kernels found within the library
+ * \param lib - Library to query
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ */
+CUresult CUDAAPI cuLibraryGetKernelCount(unsigned int *count, CUlibrary lib);
+ 
+/**
+ * \brief Retrieve the kernel handles within a library.
+ *
+ * Returns in \p kernels at most \p numKernels kernel handles from \p lib.
+ * The returned kernel handles become invalid when the library is unloaded.
+ *
+ * \param kernels - Buffer in which the kernel handles are returned
+ * \param numKernels - Maximum number of kernel handles that may be returned in the buffer
+ * \param lib - Library to query from
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuLibraryGetKernelCount
+ */
+CUresult CUDAAPI cuLibraryEnumerateKernels(CUkernel *kernels, unsigned int numKernels, CUlibrary lib);
+
+/**
+ * \brief Returns a module handle
+ *
+ * Returns in \p pMod the module handle associated with the current context located in
+ * library \p library. If module handle is not found, the call returns ::CUDA_ERROR_NOT_FOUND.
+ *
+ * \param pMod - Returned module handle
+ * \param library - Library to retrieve module from
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_FOUND,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload,
+ * ::cuModuleGetFunction
+ */
+CUresult CUDAAPI cuLibraryGetModule(CUmodule *pMod, CUlibrary library);
+
+/**
+ * \brief Returns a function handle
+ *
+ * Returns in \p pFunc the handle of the function for the requested kernel \p kernel and
+ * the current context. If function handle is not found, the call returns ::CUDA_ERROR_NOT_FOUND.
+ *
+ * \param pFunc - Returned function handle
+ * \param kernel - Kernel to retrieve function for the requested context
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_FOUND,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload,
+ * ::cuLibraryGetKernel,
+ * ::cuLibraryGetModule,
+ * ::cuModuleGetFunction
+ */
+CUresult CUDAAPI cuKernelGetFunction(CUfunction *pFunc, CUkernel kernel);
+
+/**
+ * \brief Returns a library handle
+ *
+ * Returns in \p pLib the handle of the library for the requested kernel \p kernel
+ *
+ * \param pLib - Returned library handle
+ * \param kernel - Kernel to retrieve library handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_FOUND
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload,
+ * ::cuLibraryGetKernel
+ */
+CUresult CUDAAPI cuKernelGetLibrary(CUlibrary *pLib, CUkernel kernel);
+
+/**
+ * \brief Returns a global device pointer
+ *
+ * Returns in \p *dptr and \p *bytes the base pointer and size of the global with
+ * name \p name for the requested library \p library and the current context.
+ * If no global for the requested name \p name exists, the call returns ::CUDA_ERROR_NOT_FOUND.
+ * One of the parameters \p dptr or \p bytes (not both) can be NULL in which
+ * case it is ignored.
+ *
+ * \param dptr - Returned global device pointer for the requested context
+ * \param bytes - Returned global size in bytes
+ * \param library - Library to retrieve global from
+ * \param name - Name of global to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_FOUND,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload,
+ * ::cuLibraryGetModule,
+ * ::cuModuleGetGlobal
+ */
+CUresult CUDAAPI cuLibraryGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUlibrary library, const char *name);
+
+/**
+ * \brief Returns a pointer to managed memory
+ *
+ * Returns in \p *dptr and \p *bytes the base pointer and size of the managed memory with
+ * name \p name for the requested library \p library. If no managed memory with the
+ * requested name \p name exists, the call returns ::CUDA_ERROR_NOT_FOUND. One of the parameters
+ * \p dptr or \p bytes (not both) can be NULL in which case it is ignored.
+ * Note that managed memory for library \p library is shared across devices and is registered
+ * when the library is loaded into at least one context.
+ *
+ * \param dptr - Returned pointer to the managed memory
+ * \param bytes - Returned memory size in bytes
+ * \param library - Library to retrieve managed memory from
+ * \param name - Name of managed memory to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_FOUND
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload
+ */
+CUresult CUDAAPI cuLibraryGetManaged(CUdeviceptr *dptr, size_t *bytes, CUlibrary library, const char *name);
+
+/**
+ * \brief Returns a pointer to a unified function
+ *
+ * Returns in \p *fptr the function pointer to a unified function denoted by \p symbol.
+ * If no unified function with name \p symbol exists, the call returns ::CUDA_ERROR_NOT_FOUND.
+ * If there is no device with attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_FUNCTION_POINTERS present in the system,
+ * the call may return ::CUDA_ERROR_NOT_FOUND.
+ *
+ * \param fptr - Returned pointer to a unified function
+ * \param library - Library to retrieve function pointer memory from
+ * \param symbol - Name of function pointer to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_FOUND
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload
+ */
+CUresult CUDAAPI cuLibraryGetUnifiedFunction(void **fptr, CUlibrary library, const char *symbol);
+
+/**
+ * \brief Returns information about a kernel
+ *
+ * Returns in \p *pi the integer value of the attribute \p attrib for the kernel
+ * \p kernel for the requested device \p dev. The supported attributes are:
+ * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads
+ *   per block, beyond which a launch of the kernel would fail. This number
+ *   depends on both the kernel and the requested device.
+ * - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of
+ *   statically-allocated shared memory per block required by this kernel.
+ *   This does not include dynamically-allocated shared memory requested by
+ *   the user at runtime.
+ * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated
+ *   constant memory required by this kernel.
+ * - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory
+ *   used by each thread of this kernel.
+ * - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread
+ *   of this kernel.
+ * - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for
+ *   which the kernel was compiled. This value is the major PTX version * 10
+ *   + the minor PTX version, so a PTX version 1.3 function would return the
+ *   value 13. Note that this may return the undefined value of 0 for cubins
+ *   compiled prior to CUDA 3.0.
+ * - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for
+ *   which the kernel was compiled. This value is the major binary
+ *   version * 10 + the minor binary version, so a binary version 1.3 function
+ *   would return the value 13. Note that this will return a value of 10 for
+ *   legacy cubins that do not have a properly-encoded binary architecture
+ *   version.
+ * - ::CU_FUNC_ATTRIBUTE_CACHE_MODE_CA: The attribute to indicate whether the kernel has
+ *   been compiled with user specified option "-Xptxas --dlcm=ca" set.
+ * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of
+ *   dynamically-allocated shared memory.
+ * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1
+ *   cache split ratio in percent of total shared memory.
+ * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET: If this attribute is set, the
+ *   kernel must launch with a valid cluster size specified.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: The required cluster width in
+ *   blocks.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: The required cluster height in
+ *   blocks.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: The required cluster depth in
+ *   blocks.
+ * - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether
+ *   the function can be launched with non-portable cluster size. 1 is allowed,
+ *   0 is disallowed. A non-portable cluster size may only function on the
+ *   specific SKUs the program is tested on. The launch might fail if the
+ *   program is run on a different hardware platform. CUDA API provides
+ *   cudaOccupancyMaxActiveClusters to assist with checking whether the desired
+ *   size can be launched on the current device. A portable cluster size is
+ *   guaranteed to be functional on all compute capabilities higher than the
+ *   target compute capability. The portable cluster size for sm_90 is 8 blocks
+ *   per cluster. This value may increase for future compute capabilities. The
+ *   specific hardware unit may support higher cluster sizes that are not
+ *   guaranteed to be portable.
+ * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block
+ *   scheduling policy of a function. The value type is CUclusterSchedulingPolicy.
+ *
+ * \note If another thread is trying to set the same attribute on the same device using
+ * ::cuKernelSetAttribute() simultaneously, the attribute query will give the old or new
+ * value depending on the interleavings chosen by the OS scheduler and memory consistency.
+ *
+ * \param pi     - Returned attribute value
+ * \param attrib - Attribute requested
+ * \param kernel  - Kernel to query attribute of
+ * \param dev - Device to query attribute of
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload,
+ * ::cuKernelSetAttribute,
+ * ::cuLibraryGetKernel,
+ * ::cuLaunchKernel,
+ * ::cuKernelGetFunction,
+ * ::cuLibraryGetModule,
+ * ::cuModuleGetFunction,
+ * ::cuFuncGetAttribute
+ */
+CUresult CUDAAPI cuKernelGetAttribute(int *pi, CUfunction_attribute attrib, CUkernel kernel, CUdevice dev);
+
+/**
+ * \brief Sets information about a kernel
+ *
+ * This call sets the value of a specified attribute \p attrib on the kernel \p kernel
+ * for the requested device \p dev to an integer value specified by \p val.
+ * This function returns ::CUDA_SUCCESS if the new value of the attribute could be
+ * successfully set. If the set fails, this call will return an error.
+ * Not all attributes can have values set. Attempting to set a value on a read-only
+ * attribute will result in an error (::CUDA_ERROR_INVALID_VALUE).
+ *
+ * Note that attributes set using ::cuFuncSetAttribute() will override the attribute
+ * set by this API irrespective of whether the call to ::cuFuncSetAttribute() is made
+ * before or after this API call. However, ::cuKernelGetAttribute() will always
+ * return the attribute value set by this API.
+ *
+ * Supported attributes are:
+ * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: This is the maximum size in bytes of
+ *   dynamically-allocated shared memory. The value should contain the requested
+ *   maximum size of dynamically-allocated shared memory. The sum of this value and
+ *   the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the
+ *   device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN.
+ *   The maximal size of requestable dynamic shared memory may differ by GPU
+ *   architecture.
+ * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1
+ *   cache and shared memory use the same hardware resources, this sets the shared memory
+ *   carveout preference, in percent of the total shared memory.
+ *   See ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR
+ *   This is only a hint, and the driver can choose a different ratio if required to execute the function.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: The required cluster width in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: The required cluster height in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: The required cluster depth in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
+ * - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether
+ *   the function can be launched with non-portable cluster size. 1 is allowed,
+ *   0 is disallowed.
+ * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block
+ *   scheduling policy of a function. The value type is CUclusterSchedulingPolicy.
+ *
+ * \note The API has stricter locking requirements in comparison to its legacy counterpart
+ * ::cuFuncSetAttribute() due to device-wide semantics. If multiple threads are trying to
+ * set the same attribute on the same device simultaneously, the attribute setting will depend
+ * on the interleavings chosen by the OS scheduler and memory consistency.
+ *
+ * \param attrib - Attribute requested
+ * \param val - Value to set
+ * \param kernel  - Kernel to set attribute of
+ * \param dev - Device to set attribute of
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload,
+ * ::cuKernelGetAttribute,
+ * ::cuLibraryGetKernel,
+ * ::cuLaunchKernel,
+ * ::cuKernelGetFunction,
+ * ::cuLibraryGetModule,
+ * ::cuModuleGetFunction,
+ * ::cuFuncSetAttribute
+ */
+CUresult CUDAAPI cuKernelSetAttribute(CUfunction_attribute attrib, int val, CUkernel kernel, CUdevice dev);
+
+/**
+ * \brief Sets the preferred cache configuration for a device kernel.
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p config the preferred cache configuration for
+ * the device kernel \p kernel on the requested device \p dev. This is only a preference.
+ * The driver will use the requested configuration if possible, but it is free to choose a different
+ * configuration if required to execute \p kernel.  Any context-wide preference
+ * set via ::cuCtxSetCacheConfig() will be overridden by this per-kernel
+ * setting.
+ *
+ * Note that attributes set using ::cuFuncSetCacheConfig() will override the attribute
+ * set by this API irrespective of whether the call to ::cuFuncSetCacheConfig() is made
+ * before or after this API call.
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ *
+ * The supported cache configurations are:
+ * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
+ * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
+ * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
+ * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
+ *
+ * \note The API has stricter locking requirements in comparison to its legacy counterpart
+ * ::cuFuncSetCacheConfig() due to device-wide semantics. If multiple threads are trying to
+ * set a config on the same device simultaneously, the cache config setting will depend
+ * on the interleavings chosen by the OS scheduler and memory consistency.
+ *
+ * \param kernel  - Kernel to configure cache for
+ * \param config - Requested cache configuration
+ * \param dev - Device to set attribute of
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload,
+ * ::cuLibraryGetKernel,
+ * ::cuKernelGetFunction,
+ * ::cuLibraryGetModule,
+ * ::cuModuleGetFunction,
+ * ::cuFuncSetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuLaunchKernel
+ */
+CUresult CUDAAPI cuKernelSetCacheConfig(CUkernel kernel, CUfunc_cache config, CUdevice dev);
+
+/**
+ * \brief Returns the function name for a ::CUkernel handle
+ *
+ * Returns in \p **name the function name associated with the kernel handle \p hfunc .
+ * The function name is returned as a null-terminated string. The returned name is only 
+ * valid when the kernel handle is valid. If the library is unloaded or reloaded, one 
+ * must call the API again to get the updated name. This API may return a mangled name if
+ * the function is not declared as having C linkage. If either \p **name or \p hfunc 
+ * is NULL, ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * \param name - The returned name of the function
+ * \param hfunc - The function handle to retrieve the name for 
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ */
+CUresult CUDAAPI cuKernelGetName(const char **name, CUkernel hfunc);
+
+/**
+ * \brief Returns the offset and size of a kernel parameter in the device-side parameter layout
+ *
+ * Queries the kernel parameter at \p paramIndex into \p kernel's list of parameters, and returns
+ * in \p paramOffset and \p paramSize the offset and size, respectively, where the parameter
+ * will reside in the device-side parameter layout. This information can be used to update kernel
+ * node parameters from the device via ::cudaGraphKernelNodeSetParam() and
+ * ::cudaGraphKernelNodeUpdatesApply(). \p paramIndex must be less than the number of parameters
+ * that \p kernel takes. \p paramSize can be set to NULL if only the parameter offset is desired.
+ *
+ * \param kernel      - The kernel to query
+ * \param paramIndex  - The parameter index to query
+ * \param paramOffset - Returns the offset into the device-side parameter layout at which the parameter resides
+ * \param paramSize   - Optionally returns the size of the parameter in the device-side parameter layout
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuFuncGetParamInfo
+ */
+CUresult CUDAAPI cuKernelGetParamInfo(CUkernel kernel, size_t paramIndex, size_t *paramOffset, size_t *paramSize);
+/** @} */ /* END CUDA_LIBRARY */
+
+/**
+ * \defgroup CUDA_MEM Memory Management
+ *
+ * ___MANBRIEF___ memory management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the memory management functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Gets free and total memory
+ *
+ * Returns in \p *total the total amount of memory available to the current context.
+ * Returns in \p *free the amount of memory on the device that is free according to the OS.
+ * CUDA is not guaranteed to be able to allocate all of the memory that the OS reports as free.
+ * In a multi-tenant situation, the free estimate returned is prone to a race condition where
+ * a new allocation/free done by a different process or a different thread in the same
+ * process between the time when free memory was estimated and reported, will result in
+ * deviation in free value reported and actual free memory.
+ *
+ * The integrated GPU on Tegra shares memory with the CPU and other components
+ * of the SoC. The free and total values returned by the API exclude
+ * the SWAP memory space maintained by the OS on some platforms.
+ * The OS may move some of the memory pages into swap area as the GPU or
+ * CPU allocate or access memory. See Tegra app note on how to calculate
+ * total and free memory on Tegra.
+ *
+ * \param free  - Returned free memory in bytes
+ * \param total - Returned total memory in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemGetInfo
+ */
+CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total);
+
+/**
+ * \brief Allocates device memory
+ *
+ * Allocates \p bytesize bytes of linear memory on the device and returns in
+ * \p *dptr a pointer to the allocated memory. The allocated memory is suitably
+ * aligned for any kind of variable. The memory is not cleared. If \p bytesize
+ * is 0, ::cuMemAlloc() returns ::CUDA_ERROR_INVALID_VALUE.
+ *
+ * \param dptr     - Returned device pointer
+ * \param bytesize - Requested allocation size in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMalloc
+ */
+CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
+
+/**
+ * \brief Allocates pitched device memory
+ *
+ * Allocates at least \p WidthInBytes * \p Height bytes of linear memory on
+ * the device and returns in \p *dptr a pointer to the allocated memory. The
+ * function may pad the allocation to ensure that corresponding pointers in
+ * any given row will continue to meet the alignment requirements for
+ * coalescing as the address is updated from row to row. \p ElementSizeBytes
+ * specifies the size of the largest reads and writes that will be performed
+ * on the memory range. \p ElementSizeBytes may be 4, 8 or 16 (since coalesced
+ * memory transactions are not possible on other data sizes). If
+ * \p ElementSizeBytes is smaller than the actual read/write size of a kernel,
+ * the kernel will run correctly, but possibly at reduced speed. The pitch
+ * returned in \p *pPitch by ::cuMemAllocPitch() is the width in bytes of the
+ * allocation. The intended usage of pitch is as a separate parameter of the
+ * allocation, used to compute addresses within the 2D array. Given the row
+ * and column of an array element of type \b T, the address is computed as:
+ * \code
+   T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column;
+ * \endcode
+ *
+ * The pitch returned by ::cuMemAllocPitch() is guaranteed to work with
+ * ::cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it is
+ * recommended that programmers consider performing pitch allocations using
+ * ::cuMemAllocPitch(). Due to alignment restrictions in the hardware, this is
+ * especially true if the application will be performing 2D memory copies
+ * between different regions of device memory (whether linear memory or CUDA
+ * arrays).
+ *
+ * The byte alignment of the pitch returned by ::cuMemAllocPitch() is guaranteed
+ * to match or exceed the alignment requirement for texture binding with
+ * ::cuTexRefSetAddress2D().
+ *
+ * \param dptr             - Returned device pointer
+ * \param pPitch           - Returned pitch of allocation in bytes
+ * \param WidthInBytes     - Requested allocation width in bytes
+ * \param Height           - Requested allocation height in rows
+ * \param ElementSizeBytes - Size of largest reads/writes for range
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMallocPitch
+ */
+CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
+
+/**
+ * \brief Frees device memory
+ *
+ * Frees the memory space pointed to by \p dptr, which must have been returned
+ * by a previous call to one of the following memory allocation APIs - ::cuMemAlloc(), 
+ * ::cuMemAllocPitch(), ::cuMemAllocManaged(), ::cuMemAllocAsync(), ::cuMemAllocFromPoolAsync()
+ *
+ * Note - This API will not perform any implicit synchronization when the pointer was allocated with
+ * ::cuMemAllocAsync or ::cuMemAllocFromPoolAsync. Callers must ensure that all accesses to these
+ * pointers have completed before invoking ::cuMemFree. For best performance and memory reuse, users
+ * should use ::cuMemFreeAsync to free memory allocated via the stream ordered memory allocator.
+ * For all other pointers, this API may perform implicit synchronization.
+ * 
+ * \param dptr - Pointer to memory to free
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemAllocManaged, ::cuMemAllocAsync, ::cuMemAllocFromPoolAsync, 
+ * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, ::cuMemcpy3D, ::cuMemcpy3DAsync,
+ * ::cuMemcpyAtoA, ::cuMemcpyAtoD, ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
+ * ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA,
+ * ::cuMemcpyHtoAAsync, ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, ::cuMemFreeAsync,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaFree
+ */
+CUresult CUDAAPI cuMemFree(CUdeviceptr dptr);
+
+/**
+ * \brief Get information on memory allocations
+ *
+ * Returns the base address in \p *pbase and size in \p *psize of the
+ * allocation by ::cuMemAlloc() or ::cuMemAllocPitch() that contains the input
+ * pointer \p dptr. Both parameters \p pbase and \p psize are optional. If one
+ * of them is NULL, it is ignored.
+ *
+ * \param pbase - Returned base address
+ * \param psize - Returned size of device memory allocation
+ * \param dptr  - Device pointer to query
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_NOT_FOUND,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
+ */
+CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr);
+
+/**
+ * \brief Allocates page-locked host memory
+ *
+ * Allocates \p bytesize bytes of host memory that is page-locked and
+ * accessible to the device. The driver tracks the virtual memory ranges
+ * allocated with this function and automatically accelerates calls to
+ * functions such as ::cuMemcpy(). Since the memory can be accessed directly by
+ * the device, it can be read or written with much higher bandwidth than
+ * pageable memory obtained with functions such as ::malloc(). 
+ *
+ * On systems where ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES 
+ * is true, ::cuMemAllocHost may not page-lock the allocated memory.
+ *
+ * Page-locking excessive amounts of memory with ::cuMemAllocHost() may degrade system
+ * performance, since it reduces the amount of memory available to the system
+ * for paging. As a result, this function is best used sparingly to allocate
+ * staging areas for data exchange between host and device.
+ *
+ * Note all host memory allocated using ::cuMemAllocHost() will automatically
+ * be immediately accessible to all contexts on all devices which support unified
+ * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING).
+ * The device pointer that may be used to access this host memory from those
+ * contexts is always equal to the returned host pointer \p *pp.
+ * See \ref CUDA_UNIFIED for additional details.
+ *
+ * \param pp       - Returned pointer to host memory
+ * \param bytesize - Requested allocation size in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMallocHost
+ */
+CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize);
+
+/**
+ * \brief Frees page-locked host memory
+ *
+ * Frees the memory space pointed to by \p p, which must have been returned by
+ * a previous call to ::cuMemAllocHost().
+ *
+ * \param p - Pointer to memory to free
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaFreeHost
+ */
+CUresult CUDAAPI cuMemFreeHost(void *p);
+
+/**
+ * \brief Allocates page-locked host memory
+ *
+ * Allocates \p bytesize bytes of host memory that is page-locked and accessible
+ * to the device. The driver tracks the virtual memory ranges allocated with
+ * this function and automatically accelerates calls to functions such as
+ * ::cuMemcpyHtoD(). Since the memory can be accessed directly by the device,
+ * it can be read or written with much higher bandwidth than pageable memory
+ * obtained with functions such as ::malloc(). 
+ *
+ * On systems where ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES 
+ * is true, ::cuMemHostAlloc may not page-lock the allocated memory.
+ *
+ * Page-locking excessive amounts of memory may degrade system performance, 
+ * since it reduces the amount of memory available to the system for paging. 
+ * As a result, this function is best used sparingly to allocate staging areas 
+ * for data exchange between host and device.
+ *
+ * The \p Flags parameter enables different options to be specified that
+ * affect the allocation, as follows.
+ *
+ * - ::CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be
+ *   considered as pinned memory by all CUDA contexts, not just the one that
+ *   performed the allocation.
+ *
+ * - ::CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA address
+ *   space. The device pointer to the memory may be obtained by calling
+ *   ::cuMemHostGetDevicePointer().
+ *
+ * - ::CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined
+ *   (WC). WC memory can be transferred across the PCI Express bus more
+ *   quickly on some system configurations, but cannot be read efficiently by
+ *   most CPUs. WC memory is a good option for buffers that will be written by
+ *   the CPU and read by the GPU via mapped pinned memory or host->device
+ *   transfers.
+ *
+ * All of these flags are orthogonal to one another: a developer may allocate
+ * memory that is portable, mapped and/or write-combined with no restrictions.
+ *
+ * The ::CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for
+ * devices that do not support mapped pinned memory. The failure is deferred
+ * to ::cuMemHostGetDevicePointer() because the memory may be mapped into
+ * other CUDA contexts via the ::CU_MEMHOSTALLOC_PORTABLE flag.
+ *
+ * The memory allocated by this function must be freed with ::cuMemFreeHost().
+ *
+ * Note all host memory allocated using ::cuMemHostAlloc() will automatically
+ * be immediately accessible to all contexts on all devices which support unified
+ * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING).
+ * Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer
+ * that may be used to access this host memory from those contexts is always equal
+ * to the returned host pointer \p *pp.  If the flag ::CU_MEMHOSTALLOC_WRITECOMBINED
+ * is specified, then the function ::cuMemHostGetDevicePointer() must be used
+ * to query the device pointer, even if the context supports unified addressing.
+ * See \ref CUDA_UNIFIED for additional details.
+ *
+ * \param pp       - Returned pointer to host memory
+ * \param bytesize - Requested allocation size in bytes
+ * \param Flags    - Flags for allocation request
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaHostAlloc
+ */
+CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags);
+
+/**
+ * \brief Passes back device pointer of mapped pinned memory
+ *
+ * Passes back the device pointer \p pdptr corresponding to the mapped, pinned
+ * host buffer \p p allocated by ::cuMemHostAlloc.
+ *
+ * ::cuMemHostGetDevicePointer() will fail if the ::CU_MEMHOSTALLOC_DEVICEMAP
+ * flag was not specified at the time the memory was allocated, or if the
+ * function is called on a GPU that does not support mapped pinned memory.
+ *
+ * For devices that have a non-zero value for the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory
+ * can also be accessed from the device using the host pointer \p p.
+ * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not
+ * match the original host pointer \p p and depends on the devices visible to the
+ * application. If all devices visible to the application have a non-zero value for the
+ * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer()
+ * will match the original pointer \p p. If any device visible to the application
+ * has a zero value for the device attribute, the device pointer returned by
+ * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p,
+ * but it will be suitable for use on all devices provided Unified Virtual Addressing
+ * is enabled. In such systems, it is valid to access the memory using either pointer
+ * on devices that have a non-zero value for the device attribute. Note however that
+ * such devices should access the memory using only one of the two pointers and not both.
+ *
+ * \p Flags provides for future releases. For now, it must be set to 0.
+ *
+ * \param pdptr - Returned device pointer
+ * \param p     - Host pointer
+ * \param Flags - Options (must be 0)
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaHostGetDevicePointer
+ */
+CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags);
+
+/**
+ * \brief Passes back flags that were used for a pinned allocation
+ *
+ * Passes back the flags \p pFlags that were specified when allocating
+ * the pinned host buffer \p p allocated by ::cuMemHostAlloc.
+ *
+ * ::cuMemHostGetFlags() will fail if the pointer does not reside in
+ * an allocation performed by ::cuMemAllocHost() or ::cuMemHostAlloc().
+ *
+ * \param pFlags - Returned flags word
+ * \param p     - Host pointer
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemAllocHost,
+ * ::cuMemHostAlloc,
+ * ::cudaHostGetFlags
+ */
+CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p);
+
+/**
+ * \brief Allocates memory that will be automatically managed by the Unified Memory system
+ *
+ * Allocates \p bytesize bytes of managed memory on the device and returns in
+ * \p *dptr a pointer to the allocated memory. If the device doesn't support
+ * allocating managed memory, ::CUDA_ERROR_NOT_SUPPORTED is returned. Support
+ * for managed memory can be queried using the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY. The allocated memory is suitably
+ * aligned for any kind of variable. The memory is not cleared. If \p bytesize
+ * is 0, ::cuMemAllocManaged returns ::CUDA_ERROR_INVALID_VALUE. The pointer
+ * is valid on the CPU and on all GPUs in the system that support managed memory.
+ * All accesses to this pointer must obey the Unified Memory programming model.
+ *
+ * \p flags specifies the default stream association for this allocation.
+ * \p flags must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST. If
+ * ::CU_MEM_ATTACH_GLOBAL is specified, then this memory is accessible from
+ * any stream on any device. If ::CU_MEM_ATTACH_HOST is specified, then the
+ * allocation should not be accessed from devices that have a zero value for the
+ * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS; an explicit call to
+ * ::cuStreamAttachMemAsync will be required to enable access on such devices.
+ *
+ * If the association is later changed via ::cuStreamAttachMemAsync to
+ * a single stream, the default association as specified during ::cuMemAllocManaged
+ * is restored when that stream is destroyed. For __managed__ variables, the
+ * default association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a
+ * stream is an asynchronous operation, and as a result, the change to default
+ * association won't happen until all work in the stream has completed.
+ *
+ * Memory allocated with ::cuMemAllocManaged should be released with ::cuMemFree.
+ *
+ * Device memory oversubscription is possible for GPUs that have a non-zero value for the
+ * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Managed memory on
+ * such GPUs may be evicted from device memory to host memory at any time by the Unified
+ * Memory driver in order to make room for other allocations.
+ *
+ * In a system where all GPUs have a non-zero value for the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, managed memory may not be populated when this
+ * API returns and instead may be populated on access. In such systems, managed memory can
+ * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to
+ * maintain data locality and prevent excessive page faults to the extent possible. The application
+ * can also guide the driver about memory usage patterns via ::cuMemAdvise. The application
+ * can also explicitly migrate memory to a desired processor's memory via
+ * ::cuMemPrefetchAsync.
+ *
+ * In a multi-GPU system where all of the GPUs have a zero value for the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS and all the GPUs have peer-to-peer support
+ * with each other, the physical storage for managed memory is created on the GPU which is active
+ * at the time ::cuMemAllocManaged is called. All other GPUs will reference the data at reduced
+ * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate
+ * memory among such GPUs.
+ *
+ * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and
+ * where the value of the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
+ * is zero for at least one of those GPUs, the location chosen for physical storage of managed
+ * memory is system-dependent.
+ * - On Linux, the location chosen will be device memory as long as the current set of active
+ * contexts are on devices that either have peer-to-peer support with each other or have a
+ * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ * If there is an active context on a GPU that does not have a non-zero value for that device
+ * attribute and it does not have peer-to-peer support with the other devices that have active
+ * contexts on them, then the location for physical storage will be 'zero-copy' or host memory.
+ * Note that this means that managed memory that is located in device memory is migrated to
+ * host memory if a new context is created on a GPU that doesn't have a non-zero value for
+ * the device attribute and does not support peer-to-peer with at least one of the other devices
+ * that has an active context. This in turn implies that context creation may fail if there is
+ * insufficient host memory to migrate all managed allocations.
+ * - On Windows, the physical storage is always created in 'zero-copy' or host memory.
+ * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these
+ * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to
+ * restrict CUDA to only use those GPUs that have peer-to-peer support.
+ * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a
+ * non-zero value to force the driver to always use device memory for physical storage.
+ * When this environment variable is set to a non-zero value, all contexts created in
+ * that process on devices that support managed memory have to be peer-to-peer compatible
+ * with each other. Context creation will fail if a context is created on a device that
+ * supports managed memory and is not peer-to-peer compatible with any of the other
+ * managed memory supporting devices on which contexts were previously created, even if
+ * those contexts have been destroyed. These environment variables are described
+ * in the CUDA programming guide under the "CUDA environment variables" section.
+ * - On ARM, managed memory is not available on discrete GPU with Drive PX-2.
+ *
+ * \param dptr     - Returned device pointer
+ * \param bytesize - Requested allocation size in bytes
+ * \param flags    - Must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cuDeviceGetAttribute, ::cuStreamAttachMemAsync,
+ * ::cudaMallocManaged
+ */
+CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned int flags);
+
+/**
+* \brief Registers a callback function to receive async notifications
+* 
+* Registers \p callbackFunc to receive async notifications.
+* 
+* The \p userData parameter is passed to the callback function at async notification time.  
+* Likewise, \p callback is also passed to the callback function to distinguish between
+* multiple registered callbacks.
+* 
+* The callback function being registered should be designed to return quickly (~10ms).  
+* Any long running tasks should be queued for execution on an application thread.
+* 
+* Callbacks may not call cuDeviceRegisterAsyncNotification or cuDeviceUnregisterAsyncNotification.
+* Doing so will result in ::CUDA_ERROR_NOT_PERMITTED. Async notification callbacks execute
+* in an undefined order and may be serialized.
+* 
+* Returns in \p *callback a handle representing the registered callback instance.
+* 
+* \param device - The device on which to register the callback
+* \param callbackFunc - The function to register as a callback
+* \param userData - A generic pointer to user data. This is passed into the callback function.
+* \param callback - A handle representing the registered callback instance
+* 
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_NOT_SUPPORTED,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_UNKNOWN
+* \notefnerr
+* 
+* \sa
+* ::cuDeviceUnregisterAsyncNotification
+*/
+CUresult CUDAAPI cuDeviceRegisterAsyncNotification(CUdevice device, CUasyncCallback callbackFunc, void *userData, CUasyncCallbackHandle *callback);
+
+/**
+* \brief Unregisters an async notification callback
+* 
+* Unregisters \p callback so that the corresponding callback function will stop receiving
+* async notifications.
+* 
+* \param device - The device from which to remove \p callback.
+* \param callback - The callback instance to unregister from receiving async notifications.
+* 
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_NOT_SUPPORTED,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_UNKNOWN
+* \notefnerr
+* 
+* \sa
+* ::cuDeviceRegisterAsyncNotification
+*/
+CUresult CUDAAPI cuDeviceUnregisterAsyncNotification(CUdevice device, CUasyncCallbackHandle callback);
+
+/**
+ * \brief Returns a handle to a compute device
+ *
+ * Returns in \p *dev a device handle given a PCI bus ID string.
+ *
+ * \param dev      - Returned device handle
+ *
+ * \param pciBusId - String in one of the following forms:
+ * [domain]:[bus]:[device].[function]
+ * [domain]:[bus]:[device]
+ * [bus]:[device].[function]
+ * where \p domain, \p bus, \p device, and \p function are all hexadecimal values
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGet,
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetPCIBusId,
+ * ::cudaDeviceGetByPCIBusId
+ */
+CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId);
+
+/**
+ * \brief Returns a PCI Bus Id string for the device
+ *
+ * Returns an ASCII string identifying the device \p dev in the NULL-terminated
+ * string pointed to by \p pciBusId. \p len specifies the maximum length of the
+ * string that may be returned.
+ *
+ * \param pciBusId - Returned identifier string for the device in the following format
+ * [domain]:[bus]:[device].[function]
+ * where \p domain, \p bus, \p device, and \p function are all hexadecimal values.
+ * pciBusId should be large enough to store 13 characters including the NULL-terminator.
+ *
+ * \param len      - Maximum length of string to store in \p pciBusId
+ *
+ * \param dev      - Device to get identifier string for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGet,
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetByPCIBusId,
+ * ::cudaDeviceGetPCIBusId
+ */
+CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev);
+
+/**
+ * \brief Gets an interprocess handle for a previously allocated event
+ *
+ * Takes as input a previously allocated event. This event must have been
+ * created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING
+ * flags set. This opaque handle may be copied into other processes and
+ * opened with ::cuIpcOpenEventHandle to allow efficient hardware
+ * synchronization between GPU work in different processes.
+ *
+ * After the event has been opened in the importing process,
+ * ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and
+ * ::cuEventQuery may be used in either process. Performing operations
+ * on the imported event after the exported event has been freed
+ * with ::cuEventDestroy will result in undefined behavior.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is supported for compatibility purposes
+ * but not recommended as it comes with performance cost.
+ * Users can test their device for IPC functionality by calling
+ * ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
+ *
+ * \param pHandle - Pointer to a user allocated CUipcEventHandle
+ *                    in which to return the opaque event handle
+ * \param event   - Event allocated with ::CU_EVENT_INTERPROCESS and
+ *                    ::CU_EVENT_DISABLE_TIMING flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_MAP_FAILED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuEventCreate,
+ * ::cuEventDestroy,
+ * ::cuEventSynchronize,
+ * ::cuEventQuery,
+ * ::cuStreamWaitEvent,
+ * ::cuIpcOpenEventHandle,
+ * ::cuIpcGetMemHandle,
+ * ::cuIpcOpenMemHandle,
+ * ::cuIpcCloseMemHandle,
+ * ::cudaIpcGetEventHandle
+ */
+CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event);
+
+/**
+ * \brief Opens an interprocess event handle for use in the current process
+ *
+ * Opens an interprocess event handle exported from another process with
+ * ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like
+ * a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified.
+ * This event must be freed with ::cuEventDestroy.
+ *
+ * Performing operations on the imported event after the exported event has
+ * been freed with ::cuEventDestroy will result in undefined behavior.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is supported for compatibility purposes
+ * but not recommended as it comes with performance cost.
+ * Users can test their device for IPC functionality by calling
+ * ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
+ *
+ * \param phEvent - Returns the imported event
+ * \param handle  - Interprocess handle to open
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_MAP_FAILED,
+ * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuEventCreate,
+ * ::cuEventDestroy,
+ * ::cuEventSynchronize,
+ * ::cuEventQuery,
+ * ::cuStreamWaitEvent,
+ * ::cuIpcGetEventHandle,
+ * ::cuIpcGetMemHandle,
+ * ::cuIpcOpenMemHandle,
+ * ::cuIpcCloseMemHandle,
+ * ::cudaIpcOpenEventHandle
+ */
+CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle);
+
+/**
+ * \brief Gets an interprocess memory handle for an existing device memory
+ * allocation
+ *
+ * Takes a pointer to the base of an existing device memory allocation created
+ * with ::cuMemAlloc and exports it for use in another process. This is a
+ * lightweight operation and may be called multiple times on an allocation
+ * without adverse effects.
+ *
+ * If a region of memory is freed with ::cuMemFree and a subsequent call
+ * to ::cuMemAlloc returns memory with the same device address,
+ * ::cuIpcGetMemHandle will return a unique handle for the
+ * new memory.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is supported for compatibility purposes
+ * but not recommended as it comes with performance cost.
+ * Users can test their device for IPC functionality by calling
+ * ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
+ *
+ * \param pHandle - Pointer to user allocated ::CUipcMemHandle to return
+ *                    the handle in.
+ * \param dptr    - Base pointer to previously allocated device memory
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_MAP_FAILED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuMemAlloc,
+ * ::cuMemFree,
+ * ::cuIpcGetEventHandle,
+ * ::cuIpcOpenEventHandle,
+ * ::cuIpcOpenMemHandle,
+ * ::cuIpcCloseMemHandle,
+ * ::cudaIpcGetMemHandle
+ */
+CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr);
+
+/**
+ * \brief Opens an interprocess memory handle exported from another process
+ * and returns a device pointer usable in the local process.
+ *
+ * Maps memory exported from another process with ::cuIpcGetMemHandle into
+ * the current device address space. For contexts on different devices
+ * ::cuIpcOpenMemHandle can attempt to enable peer access between the
+ * devices as if the user called ::cuCtxEnablePeerAccess. This behavior is
+ * controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag.
+ * ::cuDeviceCanAccessPeer can determine if a mapping is possible.
+ *
+ * Contexts that may open ::CUipcMemHandles are restricted in the following way.
+ * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened
+ * by one ::CUcontext per ::CUdevice per other process.
+ *
+ * If the memory handle has already been opened by the current context, the
+ * reference count on the handle is incremented by 1 and the existing device pointer
+ * is returned.
+ *
+ * Memory returned from ::cuIpcOpenMemHandle must be freed with
+ * ::cuIpcCloseMemHandle.
+ *
+ * Calling ::cuMemFree on an exported memory region before calling
+ * ::cuIpcCloseMemHandle in the importing context will result in undefined
+ * behavior.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is supported for compatibility purposes
+ * but not recommended as it comes with performance cost.
+ * Users can test their device for IPC functionality by calling
+ * ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
+ *
+ * \param pdptr  - Returned device pointer
+ * \param handle - ::CUipcMemHandle to open
+ * \param Flags  - Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_MAP_FAILED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_TOO_MANY_PEERS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \note No guarantees are made about the address returned in \p *pdptr.
+ * In particular, multiple processes may not receive the same address for the same \p handle.
+ *
+ * \sa
+ * ::cuMemAlloc,
+ * ::cuMemFree,
+ * ::cuIpcGetEventHandle,
+ * ::cuIpcOpenEventHandle,
+ * ::cuIpcGetMemHandle,
+ * ::cuIpcCloseMemHandle,
+ * ::cuCtxEnablePeerAccess,
+ * ::cuDeviceCanAccessPeer,
+ * ::cudaIpcOpenMemHandle
+ */
+CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags);
+
+/**
+ * \brief Attempts to close memory mapped with ::cuIpcOpenMemHandle
+ *
+ * Decrements the reference count of the memory returned by ::cuIpcOpenMemHandle by 1.
+ * When the reference count reaches 0, this API unmaps the memory. The original allocation
+ * in the exporting process as well as imported mappings in other processes
+ * will be unaffected.
+ *
+ * Any resources used to enable peer access will be freed if this is the
+ * last mapping using them.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is supported for compatibility purposes
+ * but not recommended as it comes with performance cost.
+ * Users can test their device for IPC functionality by calling
+ * ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
+ *
+ * \param dptr - Device pointer returned by ::cuIpcOpenMemHandle
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_MAP_FAILED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \sa
+ * ::cuMemAlloc,
+ * ::cuMemFree,
+ * ::cuIpcGetEventHandle,
+ * ::cuIpcOpenEventHandle,
+ * ::cuIpcGetMemHandle,
+ * ::cuIpcOpenMemHandle,
+ * ::cudaIpcCloseMemHandle
+ */
+CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr);
+
+/**
+ * \brief Registers an existing host memory range for use by CUDA
+ *
+ * Page-locks the memory range specified by \p p and \p bytesize and maps it
+ * for the device(s) as specified by \p Flags. This memory range also is added
+ * to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate
+ * calls to functions such as ::cuMemcpyHtoD(). Since the memory can be accessed
+ * directly by the device, it can be read or written with much higher bandwidth
+ * than pageable memory that has not been registered.  Page-locking excessive
+ * amounts of memory may degrade system performance, since it reduces the amount
+ * of memory available to the system for paging. As a result, this function is
+ * best used sparingly to register staging areas for data exchange between
+ * host and device.
+ *
+ * On systems where ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES 
+ * is true, ::cuMemHostRegister will not page-lock the memory range specified 
+ * by \p p but only populate unpopulated pages.
+ *
+ * The \p Flags parameter enables different options to be specified that
+ * affect the allocation, as follows.
+ *
+ * - ::CU_MEMHOSTREGISTER_PORTABLE: The memory returned by this call will be
+ *   considered as pinned memory by all CUDA contexts, not just the one that
+ *   performed the allocation.
+ *
+ * - ::CU_MEMHOSTREGISTER_DEVICEMAP: Maps the allocation into the CUDA address
+ *   space. The device pointer to the memory may be obtained by calling
+ *   ::cuMemHostGetDevicePointer().
+ *
+ * - ::CU_MEMHOSTREGISTER_IOMEMORY: The pointer is treated as pointing to some
+ *   I/O memory space, e.g. the PCI Express resource of a 3rd party device.
+ *
+ * - ::CU_MEMHOSTREGISTER_READ_ONLY: The pointer is treated as pointing to memory
+ *   that is considered read-only by the device.  On platforms without
+ *   ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is
+ *   required in order to register memory mapped to the CPU as read-only.  Support
+ *   for the use of this flag can be queried from the device attribute
+ *   ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED.  Using this flag with
+ *   a current context associated with a device that does not have this attribute
+ *   set will cause ::cuMemHostRegister to error with CUDA_ERROR_NOT_SUPPORTED.
+ *
+ * All of these flags are orthogonal to one another: a developer may page-lock
+ * memory that is portable or mapped with no restrictions.
+ *
+ * The ::CU_MEMHOSTREGISTER_DEVICEMAP flag may be specified on CUDA contexts for
+ * devices that do not support mapped pinned memory. The failure is deferred
+ * to ::cuMemHostGetDevicePointer() because the memory may be mapped into
+ * other CUDA contexts via the ::CU_MEMHOSTREGISTER_PORTABLE flag.
+ *
+ * For devices that have a non-zero value for the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory
+ * can also be accessed from the device using the host pointer \p p.
+ * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not
+ * match the original host pointer \p p and depends on the devices visible to the
+ * application. If all devices visible to the application have a non-zero value for the
+ * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer()
+ * will match the original pointer \p p. If any device visible to the application
+ * has a zero value for the device attribute, the device pointer returned by
+ * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p,
+ * but it will be suitable for use on all devices provided Unified Virtual Addressing
+ * is enabled. In such systems, it is valid to access the memory using either pointer
+ * on devices that have a non-zero value for the device attribute. Note however that
+ * such devices should access the memory using only one of the two pointers and not both.
+ *
+ * The memory page-locked by this function must be unregistered with
+ * ::cuMemHostUnregister().
+ *
+ * \param p        - Host pointer to memory to page-lock
+ * \param bytesize - Size in bytes of the address range to page-lock
+ * \param Flags    - Flags for allocation request
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
+ * ::CUDA_ERROR_NOT_PERMITTED,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemHostUnregister,
+ * ::cuMemHostGetFlags,
+ * ::cuMemHostGetDevicePointer,
+ * ::cudaHostRegister
+ */
+CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
+
+/**
+ * \brief Unregisters a memory range that was registered with cuMemHostRegister.
+ *
+ * Unmaps the memory range whose base address is specified by \p p, and makes
+ * it pageable again.
+ *
+ * The base address must be the same one specified to ::cuMemHostRegister().
+ *
+ * \param p - Host pointer to memory to unregister
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemHostRegister,
+ * ::cudaHostUnregister
+ */
+CUresult CUDAAPI cuMemHostUnregister(void *p);
+
+/**
+ * \brief Copies memory
+ *
+ * Copies data between two pointers.
+ * \p dst and \p src are base pointers of the destination and source, respectively.
+ * \p ByteCount specifies the number of bytes to copy.
+ * Note that this function infers the type of the transfer (host to host, host to
+ *   device, device to device, or device to host) from the pointer values.  This
+ *   function is only allowed in contexts which support unified addressing.
+ *
+ * \param dst - Destination unified virtual address space pointer
+ * \param src - Source unified virtual address space pointer
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy,
+ * ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol
+ */
+CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
+
+/**
+ * \brief Copies device memory between two contexts
+ *
+ * Copies from device memory in one context to device memory in another
+ * context. \p dstDevice is the base device pointer of the destination memory
+ * and \p dstContext is the destination context.  \p srcDevice is the base
+ * device pointer of the source memory and \p srcContext is the source context.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstDevice  - Destination device pointer
+ * \param dstContext - Destination context
+ * \param srcDevice  - Source device pointer
+ * \param srcContext - Source context
+ * \param ByteCount  - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuMemcpyDtoD, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
+ * ::cuMemcpy3DPeerAsync,
+ * ::cudaMemcpyPeer
+ */
+CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Host to Device
+ *
+ * Copies from host memory to device memory. \p dstDevice and \p srcHost are
+ * the base addresses of the destination and source, respectively. \p ByteCount
+ * specifies the number of bytes to copy.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param srcHost   - Source host pointer
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy,
+ * ::cudaMemcpyToSymbol
+ */
+CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Device to Host
+ *
+ * Copies from device to host memory. \p dstHost and \p srcDevice specify the
+ * base pointers of the destination and source, respectively. \p ByteCount
+ * specifies the number of bytes to copy.
+ *
+ * \param dstHost   - Destination host pointer
+ * \param srcDevice - Source device pointer
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy,
+ * ::cudaMemcpyFromSymbol
+ */
+CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Device to Device
+ *
+ * Copies from device memory to device memory. \p dstDevice and \p srcDevice
+ * are the base pointers of the destination and source, respectively.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param srcDevice - Source device pointer
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy,
+ * ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol
+ */
+CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Device to Array
+ *
+ * Copies from device memory to a 1D CUDA array. \p dstArray and \p dstOffset
+ * specify the CUDA array handle and starting offset in bytes of the destination data.
+ * \p srcDevice specifies the base pointer of the source. \p ByteCount
+ * specifies the number of bytes to copy.
+ *
+ * \param dstArray  - Destination array
+ * \param dstOffset - Offset in bytes of destination array
+ * \param srcDevice - Source device pointer
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpyToArray
+ */
+CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Array to Device
+ *
+ * Copies from one 1D CUDA array to device memory. \p dstDevice specifies the
+ * base pointer of the destination and must be naturally aligned with the CUDA
+ * array elements. \p srcArray and \p srcOffset specify the CUDA array handle
+ * and the offset in bytes into the array where the copy is to begin.
+ * \p ByteCount specifies the number of bytes to copy and must be evenly
+ * divisible by the array element size.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param srcArray  - Source array
+ * \param srcOffset - Offset in bytes of source array
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpyFromArray
+ */
+CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Host to Array
+ *
+ * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset
+ * specify the CUDA array handle and starting offset in bytes of the destination
+ * data.  \p srcHost specifies the base address of the source. \p ByteCount specifies
+ * the number of bytes to copy.
+ *
+ * \param dstArray  - Destination array
+ * \param dstOffset - Offset in bytes of destination array
+ * \param srcHost   - Source host pointer
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpyToArray
+ */
+CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Array to Host
+ *
+ * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base
+ * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA
+ * array handle and starting offset in bytes of the source data.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstHost   - Destination host pointer
+ * \param srcArray  - Source array
+ * \param srcOffset - Offset in bytes of source array
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpyFromArray
+ */
+CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Array to Array
+ *
+ * Copies from one 1D CUDA array to another. \p dstArray and \p srcArray
+ * specify the handles of the destination and source CUDA arrays for the copy,
+ * respectively. \p dstOffset and \p srcOffset specify the destination and
+ * source offsets in bytes into the CUDA arrays. \p ByteCount is the number of
+ * bytes to be copied. The elements in the CUDA arrays need not be the same
+ * format, but the elements must be the same size; and \p ByteCount must be
+ * evenly divisible by that size.
+ *
+ * \param dstArray  - Destination array
+ * \param dstOffset - Offset in bytes of destination array
+ * \param srcArray  - Source array
+ * \param srcOffset - Offset in bytes of source array
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpyArrayToArray
+ */
+CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+
+/**
+ * \brief Copies memory for 2D arrays
+ *
+ * Perform a 2D memory copy according to the parameters specified in \p pCopy.
+ * The ::CUDA_MEMCPY2D structure is defined as:
+ *
+ * \code
+   typedef struct CUDA_MEMCPY2D_st {
+      unsigned int srcXInBytes, srcY;
+      CUmemorytype srcMemoryType;
+          const void *srcHost;
+          CUdeviceptr srcDevice;
+          CUarray srcArray;
+          unsigned int srcPitch;
+
+      unsigned int dstXInBytes, dstY;
+      CUmemorytype dstMemoryType;
+          void *dstHost;
+          CUdeviceptr dstDevice;
+          CUarray dstArray;
+          unsigned int dstPitch;
+
+      unsigned int WidthInBytes;
+      unsigned int Height;
+   } CUDA_MEMCPY2D;
+ * \endcode
+ * where:
+ * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+ *   source and destination, respectively; ::CUmemorytype_enum is defined as:
+ *
+ * \code
+   typedef enum CUmemorytype_enum {
+      CU_MEMORYTYPE_HOST = 0x01,
+      CU_MEMORYTYPE_DEVICE = 0x02,
+      CU_MEMORYTYPE_ARRAY = 0x03,
+      CU_MEMORYTYPE_UNIFIED = 0x04
+   } CUmemorytype;
+ * \endcode
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+ *   specify the (unified virtual address space) base address of the source data
+ *   and the bytes per row to apply.  ::srcArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
+ * specify the (host) base address of the source data and the bytes per row to
+ * apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
+ * specify the (device) base address of the source data and the bytes per row
+ * to apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
+ * ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
+ * specify the (host) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ *   specify the (unified virtual address space) base address of the destination data
+ *   and the bytes per row to apply.  ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
+ * specify the (device) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
+ * ignored.
+ *
+ * - ::srcXInBytes and ::srcY specify the base address of the source data for
+ *   the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+  void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes and ::dstY specify the base address of the destination data
+ *   for the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+  void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
+ *   the 2D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ *   ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ *   ::WidthInBytes + ::dstXInBytes.
+ *
+ * \par
+ * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
+ * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
+ * (device to device, CUDA array to device, CUDA array to CUDA array),
+ * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch().
+ * ::cuMemcpy2DUnaligned() does not have this restriction, but may run
+ * significantly slower in the cases where ::cuMemcpy2D() would have returned
+ * an error code.
+ *
+ * \param pCopy - Parameters for the memory copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray,
+ * ::cudaMemcpy2DFromArray
+ */
+CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy);
+
+/**
+ * \brief Copies memory for 2D arrays
+ *
+ * Perform a 2D memory copy according to the parameters specified in \p pCopy.
+ * The ::CUDA_MEMCPY2D structure is defined as:
+ *
+ * \code
+   typedef struct CUDA_MEMCPY2D_st {
+      unsigned int srcXInBytes, srcY;
+      CUmemorytype srcMemoryType;
+      const void *srcHost;
+      CUdeviceptr srcDevice;
+      CUarray srcArray;
+      unsigned int srcPitch;
+      unsigned int dstXInBytes, dstY;
+      CUmemorytype dstMemoryType;
+      void *dstHost;
+      CUdeviceptr dstDevice;
+      CUarray dstArray;
+      unsigned int dstPitch;
+      unsigned int WidthInBytes;
+      unsigned int Height;
+   } CUDA_MEMCPY2D;
+ * \endcode
+ * where:
+ * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+ *   source and destination, respectively; ::CUmemorytype_enum is defined as:
+ *
+ * \code
+   typedef enum CUmemorytype_enum {
+      CU_MEMORYTYPE_HOST = 0x01,
+      CU_MEMORYTYPE_DEVICE = 0x02,
+      CU_MEMORYTYPE_ARRAY = 0x03,
+      CU_MEMORYTYPE_UNIFIED = 0x04
+   } CUmemorytype;
+ * \endcode
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+ *   specify the (unified virtual address space) base address of the source data
+ *   and the bytes per row to apply.  ::srcArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
+ * specify the (host) base address of the source data and the bytes per row to
+ * apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
+ * specify the (device) base address of the source data and the bytes per row
+ * to apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
+ * ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ *   specify the (unified virtual address space) base address of the
+ *   destination data and the bytes per row to apply.  ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
+ * specify the (host) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
+ * specify the (device) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
+ * ignored.
+ *
+ * - ::srcXInBytes and ::srcY specify the base address of the source data for
+ *   the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+  void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes and ::dstY specify the base address of the destination data
+ *   for the copy.
+ *
+ * \par
+ * For host pointers, the base address is
+ * \code
+  void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
+ *   the 2D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ *   ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ *   ::WidthInBytes + ::dstXInBytes.
+ *
+ * \par
+ * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
+ * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
+ * (device to device, CUDA array to device, CUDA array to CUDA array),
+ * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch().
+ * ::cuMemcpy2DUnaligned() does not have this restriction, but may run
+ * significantly slower in the cases where ::cuMemcpy2D() would have returned
+ * an error code.
+ *
+ * \param pCopy - Parameters for the memory copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray,
+ * ::cudaMemcpy2DFromArray
+ */
+CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);
+
+/**
+ * \brief Copies memory for 3D arrays
+ *
+ * Perform a 3D memory copy according to the parameters specified in
+ * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as:
+ *
+ * \code
+        typedef struct CUDA_MEMCPY3D_st {
+
+            unsigned int srcXInBytes, srcY, srcZ;
+            unsigned int srcLOD;
+            CUmemorytype srcMemoryType;
+                const void *srcHost;
+                CUdeviceptr srcDevice;
+                CUarray srcArray;
+                unsigned int srcPitch;  // ignored when src is array
+                unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1
+
+            unsigned int dstXInBytes, dstY, dstZ;
+            unsigned int dstLOD;
+            CUmemorytype dstMemoryType;
+                void *dstHost;
+                CUdeviceptr dstDevice;
+                CUarray dstArray;
+                unsigned int dstPitch;  // ignored when dst is array
+                unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1
+
+            unsigned int WidthInBytes;
+            unsigned int Height;
+            unsigned int Depth;
+        } CUDA_MEMCPY3D;
+ * \endcode
+ * where:
+ * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+ *   source and destination, respectively; ::CUmemorytype_enum is defined as:
+ *
+ * \code
+   typedef enum CUmemorytype_enum {
+      CU_MEMORYTYPE_HOST = 0x01,
+      CU_MEMORYTYPE_DEVICE = 0x02,
+      CU_MEMORYTYPE_ARRAY = 0x03,
+      CU_MEMORYTYPE_UNIFIED = 0x04
+   } CUmemorytype;
+ * \endcode
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+ *   specify the (unified virtual address space) base address of the source data
+ *   and the bytes per row to apply.  ::srcArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and
+ * ::srcHeight specify the (host) base address of the source data, the bytes
+ * per row, and the height of each 2D slice of the 3D array. ::srcArray is
+ * ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and
+ * ::srcHeight specify the (device) base address of the source data, the bytes
+ * per row, and the height of each 2D slice of the 3D array. ::srcArray is
+ * ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and
+ * ::srcHeight are ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ *   specify the (unified virtual address space) base address of the
+ *   destination data and the bytes per row to apply.  ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost, ::dstPitch and
+ * ::dstHeight specify the (host) base address of the destination data, the
+ * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray
+ * is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice, ::dstPitch and
+ * ::dstHeight specify the (device) base address of the destination data, the
+ * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray
+ * is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and
+ * ::dstHeight are ignored.
+ *
+ * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source
+ *   data for the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+  void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes, ::dstY and ::dstZ specify the base address of the
+ *   destination data for the copy.
+ *
+ * \par
+ * For host pointers, the base address is
+ * \code
+  void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height
+ *   and depth of the 3D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ *   ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ *   ::WidthInBytes + ::dstXInBytes.
+ * - If specified, ::srcHeight must be greater than or equal to ::Height +
+ *   ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
+ *
+ * \par
+ * ::cuMemcpy3D() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH).
+ *
+ * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be
+ * set to 0.
+ *
+ * \param pCopy - Parameters for the memory copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy3D
+ */
+CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy);
+
+/**
+ * \brief Copies memory between contexts
+ *
+ * Perform a 3D memory copy according to the parameters specified in
+ * \p pCopy.  See the definition of the ::CUDA_MEMCPY3D_PEER structure
+ * for documentation of its parameters.
+ *
+ * \param pCopy - Parameters for the memory copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
+ * ::cuMemcpy3DPeerAsync,
+ * ::cudaMemcpy3DPeer
+ */
+CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy);
+
+/**
+ * \brief Copies memory asynchronously
+ *
+ * Copies data between two pointers.
+ * \p dst and \p src are base pointers of the destination and source, respectively.
+ * \p ByteCount specifies the number of bytes to copy.
+ * Note that this function infers the type of the transfer (host to host, host to
+ *   device, device to device, or device to host) from the pointer values.  This
+ *   function is only allowed in contexts which support unified addressing.
+ *
+ * \param dst       - Destination unified virtual address space pointer
+ * \param src       - Source unified virtual address space pointer
+ * \param ByteCount - Size of memory copy in bytes
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpyAsync,
+ * ::cudaMemcpyToSymbolAsync,
+ * ::cudaMemcpyFromSymbolAsync
+ */
+CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream);
+
+/**
+ * \brief Copies device memory between two contexts asynchronously.
+ *
+ * Copies from device memory in one context to device memory in another
+ * context. \p dstDevice is the base device pointer of the destination memory
+ * and \p dstContext is the destination context.  \p srcDevice is the base
+ * device pointer of the source memory and \p srcContext is the source context.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstDevice  - Destination device pointer
+ * \param dstContext - Destination context
+ * \param srcDevice  - Source device pointer
+ * \param srcContext - Source context
+ * \param ByteCount  - Size of memory copy in bytes
+ * \param hStream    - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpy3DPeerAsync,
+ * ::cudaMemcpyPeerAsync
+ */
+CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
+
+/**
+ * \brief Copies memory from Host to Device
+ *
+ * Copies from host memory to device memory. \p dstDevice and \p srcHost are
+ * the base addresses of the destination and source, respectively. \p ByteCount
+ * specifies the number of bytes to copy.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param srcHost   - Source host pointer
+ * \param ByteCount - Size of memory copy in bytes
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpyAsync,
+ * ::cudaMemcpyToSymbolAsync
+ */
+CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
+
+/**
+ * \brief Copies memory from Device to Host
+ *
+ * Copies from device to host memory. \p dstHost and \p srcDevice specify the
+ * base pointers of the destination and source, respectively. \p ByteCount
+ * specifies the number of bytes to copy.
+ *
+ * \param dstHost   - Destination host pointer
+ * \param srcDevice - Source device pointer
+ * \param ByteCount - Size of memory copy in bytes
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpyAsync,
+ * ::cudaMemcpyFromSymbolAsync
+ */
+CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+
+/**
+ * \brief Copies memory from Device to Device
+ *
+ * Copies from device memory to device memory. \p dstDevice and \p srcDevice
+ * are the base pointers of the destination and source, respectively.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param srcDevice - Source device pointer
+ * \param ByteCount - Size of memory copy in bytes
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpyAsync,
+ * ::cudaMemcpyToSymbolAsync,
+ * ::cudaMemcpyFromSymbolAsync
+ */
+CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+
+/**
+ * \brief Copies memory from Host to Array
+ *
+ * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset
+ * specify the CUDA array handle and starting offset in bytes of the
+ * destination data. \p srcHost specifies the base address of the source.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstArray  - Destination array
+ * \param dstOffset - Offset in bytes of destination array
+ * \param srcHost   - Source host pointer
+ * \param ByteCount - Size of memory copy in bytes
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpyToArrayAsync
+ */
+CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
+
+/**
+ * \brief Copies memory from Array to Host
+ *
+ * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base
+ * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA
+ * array handle and starting offset in bytes of the source data.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstHost   - Destination pointer
+ * \param srcArray  - Source array
+ * \param srcOffset - Offset in bytes of source array
+ * \param ByteCount - Size of memory copy in bytes
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpyFromArrayAsync
+ */
+CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
+
+/**
+ * \brief Copies memory for 2D arrays
+ *
+ * Perform a 2D memory copy according to the parameters specified in \p pCopy.
+ * The ::CUDA_MEMCPY2D structure is defined as:
+ *
+ * \code
+   typedef struct CUDA_MEMCPY2D_st {
+      unsigned int srcXInBytes, srcY;
+      CUmemorytype srcMemoryType;
+      const void *srcHost;
+      CUdeviceptr srcDevice;
+      CUarray srcArray;
+      unsigned int srcPitch;
+      unsigned int dstXInBytes, dstY;
+      CUmemorytype dstMemoryType;
+      void *dstHost;
+      CUdeviceptr dstDevice;
+      CUarray dstArray;
+      unsigned int dstPitch;
+      unsigned int WidthInBytes;
+      unsigned int Height;
+   } CUDA_MEMCPY2D;
+ * \endcode
+ * where:
+ * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+ *   source and destination, respectively; ::CUmemorytype_enum is defined as:
+ *
+ * \code
+   typedef enum CUmemorytype_enum {
+      CU_MEMORYTYPE_HOST = 0x01,
+      CU_MEMORYTYPE_DEVICE = 0x02,
+      CU_MEMORYTYPE_ARRAY = 0x03,
+      CU_MEMORYTYPE_UNIFIED = 0x04
+   } CUmemorytype;
+ * \endcode
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
+ * specify the (host) base address of the source data and the bytes per row to
+ * apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+ *   specify the (unified virtual address space) base address of the source data
+ *   and the bytes per row to apply.  ::srcArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
+ * specify the (device) base address of the source data and the bytes per row
+ * to apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
+ * ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ *   specify the (unified virtual address space) base address of the
+ *   destination data and the bytes per row to apply.  ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
+ * specify the (host) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
+ * specify the (device) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
+ * ignored.
+ *
+ * - ::srcXInBytes and ::srcY specify the base address of the source data for
+ *   the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+  void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes and ::dstY specify the base address of the destination data
+ *   for the copy.
+ *
+ * \par
+ * For host pointers, the base address is
+ * \code
+  void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
+ *   the 2D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ *   ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ *   ::WidthInBytes + ::dstXInBytes.
+ * - If specified, ::srcHeight must be greater than or equal to ::Height +
+ *   ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
+ *
+ * \par
+ * ::cuMemcpy2DAsync() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
+ * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
+ * (device to device, CUDA array to device, CUDA array to CUDA array),
+ * ::cuMemcpy2DAsync() may fail for pitches not computed by ::cuMemAllocPitch().
+ *
+ * \param pCopy   - Parameters for the memory copy
+ * \param hStream - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync
+ */
+CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
+
+/**
+ * \brief Copies memory for 3D arrays
+ *
+ * Perform a 3D memory copy according to the parameters specified in
+ * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as:
+ *
+ * \code
+        typedef struct CUDA_MEMCPY3D_st {
+
+            unsigned int srcXInBytes, srcY, srcZ;
+            unsigned int srcLOD;
+            CUmemorytype srcMemoryType;
+                const void *srcHost;
+                CUdeviceptr srcDevice;
+                CUarray srcArray;
+                unsigned int srcPitch;  // ignored when src is array
+                unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1
+
+            unsigned int dstXInBytes, dstY, dstZ;
+            unsigned int dstLOD;
+            CUmemorytype dstMemoryType;
+                void *dstHost;
+                CUdeviceptr dstDevice;
+                CUarray dstArray;
+                unsigned int dstPitch;  // ignored when dst is array
+                unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1
+
+            unsigned int WidthInBytes;
+            unsigned int Height;
+            unsigned int Depth;
+        } CUDA_MEMCPY3D;
+ * \endcode
+ * where:
+ * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+ *   source and destination, respectively; ::CUmemorytype_enum is defined as:
+ *
+ * \code
+   typedef enum CUmemorytype_enum {
+      CU_MEMORYTYPE_HOST = 0x01,
+      CU_MEMORYTYPE_DEVICE = 0x02,
+      CU_MEMORYTYPE_ARRAY = 0x03,
+      CU_MEMORYTYPE_UNIFIED = 0x04
+   } CUmemorytype;
+ * \endcode
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+ *   specify the (unified virtual address space) base address of the source data
+ *   and the bytes per row to apply.  ::srcArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and
+ * ::srcHeight specify the (host) base address of the source data, the bytes
+ * per row, and the height of each 2D slice of the 3D array. ::srcArray is
+ * ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and
+ * ::srcHeight specify the (device) base address of the source data, the bytes
+ * per row, and the height of each 2D slice of the 3D array. ::srcArray is
+ * ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and
+ * ::srcHeight are ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ *   specify the (unified virtual address space) base address of the destination
+ *   data and the bytes per row to apply.  ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost, ::dstPitch and ::dstHeight
+ * specify the (host) base address of the destination data, the bytes per row,
+ * and the height of each 2D slice of the 3D array. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice, ::dstPitch and ::dstHeight
+ * specify the (device) base address of the destination data, the bytes per
+ * row, and the height of each 2D slice of the 3D array. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and
+ * ::dstHeight are ignored.
+ *
+ * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source
+ *   data for the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+  void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes, ::dstY and ::dstZ specify the base address of the
+ *   destination data for the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+  void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height
+ *   and depth of the 3D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ *   ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ *   ::WidthInBytes + ::dstXInBytes.
+ * - If specified, ::srcHeight must be greater than or equal to ::Height +
+ *   ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
+ *
+ * \par
+ * ::cuMemcpy3DAsync() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH).
+ *
+ * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be
+ * set to 0.
+ *
+ * \param pCopy - Parameters for the memory copy
+ * \param hStream - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpy3DAsync
+ */
+CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
+
+/**
+ * \brief Copies memory between contexts asynchronously.
+ *
+ * Perform a 3D memory copy according to the parameters specified in
+ * \p pCopy.  See the definition of the ::CUDA_MEMCPY3D_PEER structure
+ * for documentation of its parameters.
+ *
+ * \param pCopy - Parameters for the memory copy
+ * \param hStream - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
+ * ::cuMemcpy3DPeer,
+ * ::cudaMemcpy3DPeerAsync
+ */
+CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream);
+
+/**
+ * \brief Performs a batch of memory copies asynchronously.
+ *
+ * Performs a batch of memory copies. The batch as a whole executes in stream order but copies within a
+ * batch are not guaranteed to execute in any specific order. This API only supports pointer-to-pointer copies.
+ * For copies involving CUDA arrays, please see ::cuMemcpy3DBatchAsync.
+ *
+ * Performs memory copies from source buffers specified in \p srcs to destination buffers specified in \p dsts.
+ * The size of each copy is specified in \p sizes. All three arrays must be of the same length as specified
+ * by \p count. Since there are no ordering guarantees for copies within a batch, specifying any dependent copies
+ * within a batch will result in undefined behavior.
+ *
+ * Every copy in the batch has to be associated with a set of attributes specified in the \p attrs array.
+ * Each entry in this array can apply to more than one copy. This can be done by specifying in the \p attrsIdxs array,
+ * the index of the first copy that the corresponding entry in the \p attrs array applies to. Both \p attrs and
+ * \p attrsIdxs must be of the same length as specified by \p numAttrs. For example, if a batch has 10 copies listed
+ * in dst/src/sizes, the first 6 of which have one set of attributes and the remaining 4 another, then \p numAttrs
+ * will be 2, \p attrsIdxs will be {0, 6} and \p attrs will contain the two sets of attributes. Note that the first entry
+ * in \p attrsIdxs must always be 0. Also, each entry must be greater than the previous entry and the last entry should be
+ * less than \p count. Furthermore, \p numAttrs must be less than or equal to \p count.
+ *
+ * The ::CUmemcpyAttributes::srcAccessOrder indicates the source access ordering to be observed for copies associated
+ * with the attribute. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_STREAM, then the source will
+ * be accessed in stream order. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL then
+ * it indicates that access to the source pointer can be out of stream order and all accesses must be complete before
+ * the API call returns. This flag is suited for ephemeral sources (ex., stack variables) when it's known that no prior
+ * operations in the stream can be accessing the memory and also that the lifetime of the memory is limited to the scope
+ * that the source variable was declared in. Specifying this flag allows the driver to optimize the copy and removes the
+ * need for the user to synchronize the stream after the API call. If the source access order is set to
+ * ::CU_MEMCPY_SRC_ACCESS_ORDER_ANY then it indicates that access to the source pointer can be out of stream order and the
+ * accesses can happen even after the API call returns. This flag is suited for host pointers allocated
+ * outside CUDA (ex., via malloc) when it's known that no prior operations in the stream can be accessing the memory.
+ * Specifying this flag allows the driver to optimize the copy on certain platforms. Each memcpy operation in the batch must
+ * have a valid ::CUmemcpyAttributes corresponding to it including the appropriate srcAccessOrder setting, otherwise the API
+ * will return ::CUDA_ERROR_INVALID_VALUE.
+ *
+ * The ::CUmemcpyAttributes::srcLocHint and ::CUmemcpyAttributes::dstLocHint allow applications to specify hint locations
+ * for operands of a copy when the operand doesn't have a fixed location. That is, these hints are
+ * only applicable for managed memory pointers on devices where ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS is true or
+ * system-allocated pageable memory on devices where ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS is true.
+ * For other cases, these hints are ignored.
+ *
+ * The ::CUmemcpyAttributes::flags field can be used to specify certain flags for copies. Setting the
+ * ::CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE flag indicates that the associated copies should preferably overlap with
+ * any compute work. Note that this flag is a hint and can be ignored depending on the platform and other parameters of the copy.
+ *
+ * If any error is encountered while parsing the batch, the index within the batch where the error was encountered
+ * will be returned in \p failIdx. 
+ *
+ * \param dsts          - Array of destination pointers.
+ * \param srcs          - Array of memcpy source pointers.
+ * \param sizes         - Array of sizes for memcpy operations.
+ * \param count         - Size of \p dsts, \p srcs and \p sizes arrays
+ * \param attrs         - Array of memcpy attributes.
+ * \param attrsIdxs     - Array of indices to specify which copies each entry in the \p attrs array applies to.
+ *                        The attributes specified in attrs[k] will be applied to copies starting from attrsIdxs[k]
+ *                        through attrsIdxs[k+1] - 1. Also attrs[numAttrs-1] will apply to copies starting from
+ *                        attrsIdxs[numAttrs-1] through count - 1.
+ * \param numAttrs      - Size of \p attrs and \p attrsIdxs arrays.
+ * \param failIdx       - Pointer to a location to return the index of the copy where a failure was encountered.
+ *                        The value will be SIZE_MAX if the error doesn't pertain to any specific copy.
+ * \param hStream       - The stream to enqueue the operations in. Must not be legacy NULL stream.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_async
+ * \note_memcpy
+ */
+CUresult CUDAAPI cuMemcpyBatchAsync(CUdeviceptr *dsts, CUdeviceptr *srcs, size_t *sizes, size_t count,
+                                    CUmemcpyAttributes *attrs, size_t *attrsIdxs, size_t numAttrs,
+                                    size_t *failIdx, CUstream hStream);
+
+/**
+ * \brief Performs a batch of 3D memory copies asynchronously.
+ *
+ * Performs a batch of memory copies. The batch as a whole executes in stream order but copies within a
+ * batch are not guaranteed to execute in any specific order. Note that this means specifying any dependent
+ * copies within a batch will result in undefined behavior.
+ *
+ * Performs memory copies as specified in the \p opList array. The length of this array is specified in \p numOps.
+ * Each entry in this array describes a copy operation. This includes among other things, the source and destination
+ * operands for the copy as specified in ::CUDA_MEMCPY3D_BATCH_OP::src and ::CUDA_MEMCPY3D_BATCH_OP::dst respectively.
+ * The source and destination operands of a copy can either be a pointer or a CUDA array. The width, height and depth
+ * of a copy are specified in ::CUDA_MEMCPY3D_BATCH_OP::extent. The width, height and depth of a copy are specified in
+ * elements and must not be zero. For pointer-to-pointer copies, the element size is considered to be 1. For pointer
+ * to CUDA array or vice versa copies, the element size is determined by the CUDA array. For CUDA array to CUDA array copies,
+ * the element size of the two CUDA arrays must match.
+ *
+ * For a given operand, if ::CUmemcpy3DOperand::type is specified as ::CU_MEMCPY_OPERAND_TYPE_POINTER, then
+ * ::CUmemcpy3DOperand::op::ptr will be used. The ::CUmemcpy3DOperand::op::ptr::ptr field must contain the pointer where
+ * the copy should begin. The ::CUmemcpy3DOperand::op::ptr::rowLength field specifies the length of each row in elements and
+ * must either be zero or be greater than or equal to the width of the copy specified in ::CUDA_MEMCPY3D_BATCH_OP::extent::width.
+ * The ::CUmemcpy3DOperand::op::ptr::layerHeight field specifies the height of each layer and must either be zero or be greater than
+ * or equal to the height of the copy specified in ::CUDA_MEMCPY3D_BATCH_OP::extent::height. When either of these values is zero,
+ * that aspect of the operand is considered to be tightly packed according to the copy extent. For managed memory pointers on devices where
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS is true or system-allocated pageable memory on devices where
+ * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS is true, the ::CUmemcpy3DOperand::op::ptr::locHint field can be used to hint
+ * the location of the operand.
+ *
+ * If an operand's type is specified as ::CU_MEMCPY_OPERAND_TYPE_ARRAY, then ::CUmemcpy3DOperand::op::array will be used.
+ * The ::CUmemcpy3DOperand::op::array::array field specifies the CUDA array and ::CUmemcpy3DOperand::op::array::offset specifies
+ * the 3D offset into that array where the copy begins.
+ *
+ * The ::CUmemcpyAttributes::srcAccessOrder indicates the source access ordering to be observed for copies associated
+ * with the attribute. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_STREAM, then the source will
+ * be accessed in stream order. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL then
+ * it indicates that access to the source pointer can be out of stream order and all accesses must be complete before
+ * the API call returns. This flag is suited for ephemeral sources (ex., stack variables) when it's known that no prior
+ * operations in the stream can be accessing the memory and also that the lifetime of the memory is limited to the scope
+ * that the source variable was declared in. Specifying this flag allows the driver to optimize the copy and removes the
+ * need for the user to synchronize the stream after the API call. If the source access order is set to
+ * ::CU_MEMCPY_SRC_ACCESS_ORDER_ANY then it indicates that access to the source pointer can be out of stream order and the
+ * accesses can happen even after the API call returns. This flag is suited for host pointers allocated
+ * outside CUDA (ex., via malloc) when it's known that no prior operations in the stream can be accessing the memory.
+ * Specifying this flag allows the driver to optimize the copy on certain platforms. Each memcpy operation in \p opList must
+ * have a valid srcAccessOrder setting, otherwise this API will return ::CUDA_ERROR_INVALID_VALUE.
+ *
+ * The ::CUmemcpyAttributes::flags field can be used to specify certain flags for copies. Setting the
+ * ::CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE flag indicates that the associated copies should preferably overlap with
+ * any compute work. Note that this flag is a hint and can be ignored depending on the platform and other parameters of the copy.
+ *
+ * If any error is encountered while parsing the batch, the index within the batch where the error was encountered
+ * will be returned in \p failIdx. 
+ *
+ * \param numOps     - Total number of memcpy operations. 
+ * \param opList     - Array of size \p numOps containing the actual memcpy operations. 
+ * \param failIdx    - Pointer to a location to return the index of the copy where a failure was encountered.
+ *                     The value will be SIZE_MAX if the error doesn't pertain to any specific copy.
+ * \param flags      - Flags for future use, must be zero now.
+ * \param hStream    - The stream to enqueue the operations in. Must not be default NULL stream.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_async
+ * \note_memcpy
+ */
+CUresult CUDAAPI cuMemcpy3DBatchAsync(size_t numOps, CUDA_MEMCPY3D_BATCH_OP *opList,
+                                      size_t *failIdx, unsigned long long flags, CUstream hStream);
+
+/**
+ * \brief Initializes device memory
+ *
+ * Sets the memory range of \p N 8-bit values to the specified value
+ * \p uc.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param uc        - Value to set
+ * \param N         - Number of elements
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset
+ */
+CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N);
+
+/**
+ * \brief Initializes device memory
+ *
+ * Sets the memory range of \p N 16-bit values to the specified value
+ * \p us. The \p dstDevice pointer must be two byte aligned.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param us        - Value to set
+ * \param N         - Number of elements
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset
+ */
+CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N);
+
+/**
+ * \brief Initializes device memory
+ *
+ * Sets the memory range of \p N 32-bit values to the specified value
+ * \p ui. The \p dstDevice pointer must be four byte aligned.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param ui        - Value to set
+ * \param N         - Number of elements
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32Async,
+ * ::cudaMemset
+ */
+CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N);
+
+/**
+ * \brief Initializes device memory
+ *
+ * Sets the 2D memory range of \p Width 8-bit values to the specified value
+ * \p uc. \p Height specifies the number of rows to set, and \p dstPitch
+ * specifies the number of bytes between each row. This function performs
+ * fastest when the pitch is one that has been passed back by
+ * ::cuMemAllocPitch().
+ *
+ * \param dstDevice - Destination device pointer
+ * \param dstPitch  - Pitch of destination device pointer (Unused if \p Height is 1)
+ * \param uc        - Value to set
+ * \param Width     - Width of row
+ * \param Height    - Number of rows
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset2D
+ */
+CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
+
+/**
+ * \brief Initializes device memory
+ *
+ * Sets the 2D memory range of \p Width 16-bit values to the specified value
+ * \p us. \p Height specifies the number of rows to set, and \p dstPitch
+ * specifies the number of bytes between each row. The \p dstDevice pointer
+ * and \p dstPitch offset must be two byte aligned. This function performs
+ * fastest when the pitch is one that has been passed back by
+ * ::cuMemAllocPitch().
+ *
+ * \param dstDevice - Destination device pointer
+ * \param dstPitch  - Pitch of destination device pointer (Unused if \p Height is 1)
+ * \param us        - Value to set
+ * \param Width     - Width of row
+ * \param Height    - Number of rows
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset2D
+ */
+CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
+
+/**
+ * \brief Initializes device memory
+ *
+ * Sets the 2D memory range of \p Width 32-bit values to the specified value
+ * \p ui. \p Height specifies the number of rows to set, and \p dstPitch
+ * specifies the number of bytes between each row. The \p dstDevice pointer
+ * and \p dstPitch offset must be four byte aligned. This function performs
+ * fastest when the pitch is one that has been passed back by
+ * ::cuMemAllocPitch().
+ *
+ * \param dstDevice - Destination device pointer
+ * \param dstPitch  - Pitch of destination device pointer (Unused if \p Height is 1)
+ * \param ui        - Value to set
+ * \param Width     - Width of row
+ * \param Height    - Number of rows
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset2D
+ */
+CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
+
+/**
+ * \brief Sets device memory
+ *
+ * Sets the memory range of \p N 8-bit values to the specified value
+ * \p uc.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param uc        - Value to set
+ * \param N         - Number of elements
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemsetAsync
+ */
+CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
+
+/**
+ * \brief Sets device memory
+ *
+ * Sets the memory range of \p N 16-bit values to the specified value
+ * \p us. The \p dstDevice pointer must be two byte aligned.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param us        - Value to set
+ * \param N         - Number of elements
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemsetAsync
+ */
+CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream);
+
+/**
+ * \brief Sets device memory
+ *
+ * Sets the memory range of \p N 32-bit values to the specified value
+ * \p ui. The \p dstDevice pointer must be four byte aligned.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param ui        - Value to set
+ * \param N         - Number of elements
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, ::cuMemsetD32,
+ * ::cudaMemsetAsync
+ */
+CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
+
+/**
+ * \brief Sets device memory
+ *
+ * Sets the 2D memory range of \p Width 8-bit values to the specified value
+ * \p uc. \p Height specifies the number of rows to set, and \p dstPitch
+ * specifies the number of bytes between each row. This function performs
+ * fastest when the pitch is one that has been passed back by
+ * ::cuMemAllocPitch().
+ *
+ * \param dstDevice - Destination device pointer
+ * \param dstPitch  - Pitch of destination device pointer (unused if \p Height is 1)
+ * \param uc        - Value to set
+ * \param Width     - Width of row
+ * \param Height    - Number of rows
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset2DAsync
+ */
+CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
+
+/**
+ * \brief Sets device memory
+ *
+ * Sets the 2D memory range of \p Width 16-bit values to the specified value
+ * \p us. \p Height specifies the number of rows to set, and \p dstPitch
+ * specifies the number of bytes between each row. The \p dstDevice pointer
+ * and \p dstPitch offset must be two byte aligned. This function performs
+ * fastest when the pitch is one that has been passed back by
+ * ::cuMemAllocPitch().
+ *
+ * \param dstDevice - Destination device pointer
+ * \param dstPitch  - Pitch of destination device pointer (unused if \p Height is 1)
+ * \param us        - Value to set
+ * \param Width     - Width of row
+ * \param Height    - Number of rows
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset2DAsync
+ */
+CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
+
+/**
+ * \brief Sets device memory
+ *
+ * Sets the 2D memory range of \p Width 32-bit values to the specified value
+ * \p ui. \p Height specifies the number of rows to set, and \p dstPitch
+ * specifies the number of bytes between each row. The \p dstDevice pointer
+ * and \p dstPitch offset must be four byte aligned. This function performs
+ * fastest when the pitch is one that has been passed back by
+ * ::cuMemAllocPitch().
+ *
+ * \param dstDevice - Destination device pointer
+ * \param dstPitch  - Pitch of destination device pointer (unused if \p Height is 1)
+ * \param ui        - Value to set
+ * \param Width     - Width of row
+ * \param Height    - Number of rows
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset2DAsync
+ */
+CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
+
+/**
+ * \brief Creates a 1D or 2D CUDA array
+ *
+ * Creates a CUDA array according to the ::CUDA_ARRAY_DESCRIPTOR structure
+ * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle.
+ * The ::CUDA_ARRAY_DESCRIPTOR is defined as:
+ *
+ * \code
+    typedef struct {
+        unsigned int Width;
+        unsigned int Height;
+        CUarray_format Format;
+        unsigned int NumChannels;
+    } CUDA_ARRAY_DESCRIPTOR;
+ * \endcode
+ * where:
+ *
+ * - \p Width, and \p Height are the width, and height of the CUDA array (in
+ * elements); the CUDA array is one-dimensional if height is 0, two-dimensional
+ * otherwise;
+ * - \p Format specifies the format of the elements; ::CUarray_format is
+ * defined as:
+ * \code
+    typedef enum CUarray_format_enum {
+        CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
+        CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
+        CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
+        CU_AD_FORMAT_SIGNED_INT8 = 0x08,
+        CU_AD_FORMAT_SIGNED_INT16 = 0x09,
+        CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
+        CU_AD_FORMAT_HALF = 0x10,
+        CU_AD_FORMAT_FLOAT = 0x20,
+        CU_AD_FORMAT_NV12 = 0xb0, 
+        CU_AD_FORMAT_UNORM_INT8X1 = 0xc0, 
+        CU_AD_FORMAT_UNORM_INT8X2 = 0xc1, 
+        CU_AD_FORMAT_UNORM_INT8X4 = 0xc2, 
+        CU_AD_FORMAT_UNORM_INT16X1 = 0xc3, 
+        CU_AD_FORMAT_UNORM_INT16X2 = 0xc4, 
+        CU_AD_FORMAT_UNORM_INT16X4 = 0xc5, 
+        CU_AD_FORMAT_SNORM_INT8X1 = 0xc6, 
+        CU_AD_FORMAT_SNORM_INT8X2 = 0xc7, 
+        CU_AD_FORMAT_SNORM_INT8X4 = 0xc8, 
+        CU_AD_FORMAT_SNORM_INT16X1 = 0xc9,
+        CU_AD_FORMAT_SNORM_INT16X2 = 0xca,
+        CU_AD_FORMAT_SNORM_INT16X4 = 0xcb,
+        CU_AD_FORMAT_BC1_UNORM = 0x91,
+        CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92,
+        CU_AD_FORMAT_BC2_UNORM = 0x93,
+        CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94,
+        CU_AD_FORMAT_BC3_UNORM = 0x95,
+        CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96,
+        CU_AD_FORMAT_BC4_UNORM = 0x97,
+        CU_AD_FORMAT_BC4_SNORM = 0x98,
+        CU_AD_FORMAT_BC5_UNORM = 0x99,
+        CU_AD_FORMAT_BC5_SNORM = 0x9a,
+        CU_AD_FORMAT_BC6H_UF16 = 0x9b,
+        CU_AD_FORMAT_BC6H_SF16 = 0x9c,
+        CU_AD_FORMAT_BC7_UNORM = 0x9d,
+        CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e,
+        CU_AD_FORMAT_P010 = 0x9f,
+        CU_AD_FORMAT_P016 = 0xa1,
+        CU_AD_FORMAT_NV16 = 0xa2,
+        CU_AD_FORMAT_P210 = 0xa3,
+        CU_AD_FORMAT_P216 = 0xa4,
+        CU_AD_FORMAT_YUY2 = 0xa5,
+        CU_AD_FORMAT_Y210 = 0xa6,
+        CU_AD_FORMAT_Y216 = 0xa7,
+        CU_AD_FORMAT_AYUV = 0xa8,
+        CU_AD_FORMAT_Y410 = 0xa9,
+        CU_AD_FORMAT_Y416 = 0xb1,
+        CU_AD_FORMAT_Y444_PLANAR8 = 0xb2,
+        CU_AD_FORMAT_Y444_PLANAR10 = 0xb3,
+        CU_AD_FORMAT_YUV444_8bit_SemiPlanar = 0xb4,
+        CU_AD_FORMAT_YUV444_16bit_SemiPlanar = 0xb5,
+        CU_AD_FORMAT_UNORM_INT_101010_2 = 0x50,
+   } CUarray_format;
+ *  \endcode
+ * - \p NumChannels specifies the number of packed components per CUDA array
+ * element; it may be 1, 2, or 4;
+ *
+ * Here are examples of CUDA array descriptions:
+ *
+ * Description for a CUDA array of 2048 floats:
+ * \code
+    CUDA_ARRAY_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_FLOAT;
+    desc.NumChannels = 1;
+    desc.Width = 2048;
+    desc.Height = 1;
+ * \endcode
+ *
+ * Description for a 64 x 64 CUDA array of floats:
+ * \code
+    CUDA_ARRAY_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_FLOAT;
+    desc.NumChannels = 1;
+    desc.Width = 64;
+    desc.Height = 64;
+ * \endcode
+ *
+ * Description for a \p width x \p height CUDA array of 64-bit, 4x16-bit
+ * float16's:
+ * \code
+    CUDA_ARRAY_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_HALF;
+    desc.NumChannels = 4;
+    desc.Width = width;
+    desc.Height = height;
+ * \endcode
+ *
+ * Description for a \p width x \p height CUDA array of 16-bit elements, each
+ * of which is two 8-bit unsigned chars:
+ * \code
+    CUDA_ARRAY_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
+    desc.NumChannels = 2;
+    desc.Width = width;
+    desc.Height = height;
+ * \endcode
+ *
+ * \param pHandle        - Returned array
+ * \param pAllocateArray - Array descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMallocArray
+ */
+CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
+
+/**
+ * \brief Get a 1D or 2D CUDA array descriptor
+ *
+ * Returns in \p *pArrayDescriptor a descriptor containing information on the
+ * format and dimensions of the CUDA array \p hArray. It is useful for
+ * subroutines that have been passed a CUDA array, but need to know the CUDA
+ * array parameters for validation or other purposes.
+ *
+ * \param pArrayDescriptor - Returned array descriptor
+ * \param hArray           - Array to get descriptor of
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaArrayGetInfo
+ */
+CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
+
+/**
+ * \brief Returns the layout properties of a sparse CUDA array
+ *
+ * Returns the layout properties of a sparse CUDA array in \p sparseProperties.
+ * If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_SPARSE,
+ * ::CUDA_ERROR_INVALID_VALUE will be returned.
+ *
+ * If the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL,
+ * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize represents the total size of the array. Otherwise, it will be zero.
+ * Also, the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is always zero.
+ * Note that the \p array must have been allocated using ::cuArrayCreate or ::cuArray3DCreate. For CUDA arrays obtained
+ * using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. Instead, ::cuMipmappedArrayGetSparseProperties
+ * must be used to obtain the sparse properties of the entire CUDA mipmapped array to which \p array belongs.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES
+ * \param[in] array - CUDA array to get the sparse properties of
+ * \sa ::cuMipmappedArrayGetSparseProperties, ::cuMemMapArrayAsync
+ */
+CUresult CUDAAPI cuArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUarray array);
+
+/**
+ * \brief Returns the layout properties of a sparse CUDA mipmapped array
+ *
+ * Returns the sparse array layout properties in \p sparseProperties.
+ * If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_SPARSE,
+ * ::CUDA_ERROR_INVALID_VALUE will be returned.
+ *
+ * For non-layered CUDA mipmapped arrays, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize returns the
+ * size of the mip tail region. The mip tail region includes all mip levels whose width, height or depth
+ * is less than that of the tile.
+ * For layered CUDA mipmapped arrays, if ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL,
+ * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies the size of the mip tail of all layers combined.
+ * Otherwise, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies the mip tail size per layer.
+ * The returned value of ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is valid only if ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize is non-zero.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES
+ * \param[in] mipmap - CUDA mipmapped array to get the sparse properties of
+ * \sa ::cuArrayGetSparseProperties, ::cuMemMapArrayAsync
+ */
+CUresult CUDAAPI cuMipmappedArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUmipmappedArray mipmap);
+
+/**
+ * \brief Returns the memory requirements of a CUDA array
+ *
+ * Returns the memory requirements of a CUDA array in \p memoryRequirements.
+ * If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_DEFERRED_MAPPING,
+ * ::CUDA_ERROR_INVALID_VALUE will be returned.
+ *
+ * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::size
+ * represents the total size of the CUDA array.
+ * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::alignment
+ * represents the alignment necessary for mapping the CUDA array.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \param[out] memoryRequirements - Pointer to ::CUDA_ARRAY_MEMORY_REQUIREMENTS
+ * \param[in] array - CUDA array to get the memory requirements of
+ * \param[in] device - Device to get the memory requirements for
+ * \sa ::cuMipmappedArrayGetMemoryRequirements, ::cuMemMapArrayAsync
+ */
+CUresult CUDAAPI cuArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements, CUarray array, CUdevice device);
+ 
+/**
+ * \brief Returns the memory requirements of a CUDA mipmapped array
+ *
+ * Returns the memory requirements of a CUDA mipmapped array in \p memoryRequirements.
+ * If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_DEFERRED_MAPPING,
+ * ::CUDA_ERROR_INVALID_VALUE will be returned.
+ *
+ * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::size
+ * represents the total size of the CUDA mipmapped array.
+ * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::alignment
+ * represents the alignment necessary for mapping the CUDA mipmapped
+ * array.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \param[out] memoryRequirements - Pointer to ::CUDA_ARRAY_MEMORY_REQUIREMENTS
+ * \param[in] mipmap - CUDA mipmapped array to get the memory requirements of
+ * \param[in] device - Device to get the memory requirements for
+ * \sa ::cuArrayGetMemoryRequirements, ::cuMemMapArrayAsync
+ */
+CUresult CUDAAPI cuMipmappedArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements, CUmipmappedArray mipmap, CUdevice device);
+
+/**
+ * \brief Gets a CUDA array plane from a CUDA array
+ *
+ * Returns in \p pPlaneArray a CUDA array that represents a single format plane
+ * of the CUDA array \p hArray.
+ *
+ * If \p planeIdx is greater than the maximum number of planes in this array or if the array does
+ * not have a multi-planar format (e.g. ::CU_AD_FORMAT_NV12), then ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * Note that if the \p hArray has format ::CU_AD_FORMAT_NV12, then passing in 0 for \p planeIdx returns
+ * a CUDA array of the same size as \p hArray but with one channel and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format.
+ * If 1 is passed for \p planeIdx, then the returned CUDA array has half the height and width
+ * of \p hArray with two channels and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format.
+ *
+ * \param pPlaneArray   - Returned CUDA array referenced by the \p planeIdx
+ * \param hArray        - Multiplanar CUDA array
+ * \param planeIdx      - Plane index
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuArrayCreate,
+ * ::cudaArrayGetPlane
+ */
+CUresult CUDAAPI cuArrayGetPlane(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx);
+
+/**
+ * \brief Destroys a CUDA array
+ *
+ * Destroys the CUDA array \p hArray.
+ *
+ * \param hArray - Array to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ARRAY_IS_MAPPED,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaFreeArray
+ */
+CUresult CUDAAPI cuArrayDestroy(CUarray hArray);
+
+/**
+ * \brief Creates a 3D CUDA array
+ *
+ * Creates a CUDA array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure
+ * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle.
+ * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as:
+ *
+ * \code
+    typedef struct {
+        unsigned int Width;
+        unsigned int Height;
+        unsigned int Depth;
+        CUarray_format Format;
+        unsigned int NumChannels;
+        unsigned int Flags;
+    } CUDA_ARRAY3D_DESCRIPTOR;
+ * \endcode
+ * where:
+ *
+ * - \p Width, \p Height, and \p Depth are the width, height, and depth of the
+ * CUDA array (in elements); the following types of CUDA arrays can be allocated:
+ *     - A 1D array is allocated if \p Height and \p Depth extents are both zero.
+ *     - A 2D array is allocated if only \p Depth extent is zero.
+ *     - A 3D array is allocated if all three extents are non-zero.
+ *     - A 1D layered CUDA array is allocated if only \p Height is zero and the
+ *       ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number
+ *       of layers is determined by the depth extent.
+ *     - A 2D layered CUDA array is allocated if all three extents are non-zero and
+ *       the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number
+ *       of layers is determined by the depth extent.
+ *     - A cubemap CUDA array is allocated if all three extents are non-zero and the
+ *       ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and
+ *       \p Depth must be six. A cubemap is a special type of 2D layered CUDA array,
+ *       where the six layers represent the six faces of a cube. The order of the six
+ *       layers in memory is the same as that listed in ::CUarray_cubemap_face.
+ *     - A cubemap layered CUDA array is allocated if all three extents are non-zero,
+ *       and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set.
+ *       \p Width must be equal to \p Height, and \p Depth must be a multiple of six.
+ *       A cubemap layered CUDA array is a special type of 2D layered CUDA array that
+ *       consists of a collection of cubemaps. The first six layers represent the first
+ *       cubemap, the next six layers form the second cubemap, and so on.
+ *
+ * - \p Format specifies the format of the elements; ::CUarray_format is
+ * defined as:
+ * \code
+    typedef enum CUarray_format_enum {
+        CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
+        CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
+        CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
+        CU_AD_FORMAT_SIGNED_INT8 = 0x08,
+        CU_AD_FORMAT_SIGNED_INT16 = 0x09,
+        CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
+        CU_AD_FORMAT_HALF = 0x10,
+        CU_AD_FORMAT_FLOAT = 0x20,
+        CU_AD_FORMAT_NV12 = 0xb0, 
+        CU_AD_FORMAT_UNORM_INT8X1 = 0xc0, 
+        CU_AD_FORMAT_UNORM_INT8X2 = 0xc1, 
+        CU_AD_FORMAT_UNORM_INT8X4 = 0xc2, 
+        CU_AD_FORMAT_UNORM_INT16X1 = 0xc3, 
+        CU_AD_FORMAT_UNORM_INT16X2 = 0xc4, 
+        CU_AD_FORMAT_UNORM_INT16X4 = 0xc5, 
+        CU_AD_FORMAT_SNORM_INT8X1 = 0xc6, 
+        CU_AD_FORMAT_SNORM_INT8X2 = 0xc7, 
+        CU_AD_FORMAT_SNORM_INT8X4 = 0xc8, 
+        CU_AD_FORMAT_SNORM_INT16X1 = 0xc9,
+        CU_AD_FORMAT_SNORM_INT16X2 = 0xca,
+        CU_AD_FORMAT_SNORM_INT16X4 = 0xcb,
+        CU_AD_FORMAT_BC1_UNORM = 0x91,
+        CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92,
+        CU_AD_FORMAT_BC2_UNORM = 0x93,
+        CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94,
+        CU_AD_FORMAT_BC3_UNORM = 0x95,
+        CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96,
+        CU_AD_FORMAT_BC4_UNORM = 0x97,
+        CU_AD_FORMAT_BC4_SNORM = 0x98,
+        CU_AD_FORMAT_BC5_UNORM = 0x99,
+        CU_AD_FORMAT_BC5_SNORM = 0x9a,
+        CU_AD_FORMAT_BC6H_UF16 = 0x9b,
+        CU_AD_FORMAT_BC6H_SF16 = 0x9c,
+        CU_AD_FORMAT_BC7_UNORM = 0x9d,
+        CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e,
+        CU_AD_FORMAT_P010 = 0x9f,
+        CU_AD_FORMAT_P016 = 0xa1,
+        CU_AD_FORMAT_NV16 = 0xa2,
+        CU_AD_FORMAT_P210 = 0xa3,
+        CU_AD_FORMAT_P216 = 0xa4,
+        CU_AD_FORMAT_YUY2 = 0xa5,
+        CU_AD_FORMAT_Y210 = 0xa6,
+        CU_AD_FORMAT_Y216 = 0xa7,
+        CU_AD_FORMAT_AYUV = 0xa8,
+        CU_AD_FORMAT_Y410 = 0xa9,
+        CU_AD_FORMAT_Y416 = 0xb1,
+        CU_AD_FORMAT_Y444_PLANAR8 = 0xb2,
+        CU_AD_FORMAT_Y444_PLANAR10 = 0xb3,
+        CU_AD_FORMAT_YUV444_8bit_SemiPlanar = 0xb4,
+        CU_AD_FORMAT_YUV444_16bit_SemiPlanar = 0xb5,
+        CU_AD_FORMAT_UNORM_INT_101010_2 = 0x50,
+    } CUarray_format;
+ *  \endcode
+ *
+ * - \p NumChannels specifies the number of packed components per CUDA array
+ * element; it may be 1, 2, or 4;
+ *
+ * - \p Flags may be set to
+ *   - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA arrays. If this flag is set,
+ *     \p Depth specifies the number of layers, not the depth of a 3D array.
+ *   - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to the CUDA array.
+ *     If this flag is not set, ::cuSurfRefSetArray will fail when attempting to bind the CUDA array
+ *     to a surface reference.
+ *   - ::CUDA_ARRAY3D_CUBEMAP to enable creation of cubemaps. If this flag is set, \p Width must be
+ *     equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set,
+ *     then \p Depth must be a multiple of six.
+ *   - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA array will be used for texture gather.
+ *     Texture gather can only be performed on 2D CUDA arrays.
+ *
+ * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table.
+ * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute
+ * is not specified. For example, TEXTURE1D_WIDTH refers to the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH.
+ *
+ * Note that 2D CUDA arrays have different size requirements if the ::CUDA_ARRAY3D_TEXTURE_GATHER flag
+ * is set. \p Width and \p Height must not be greater than ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH
+ * and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT respectively, in that case.
+ *
+ * <table>
+ * <tr><td><b>CUDA array type</b></td>
+ * <td><b>Valid extents that must always be met<br>{(width range in elements), (height range),
+ * (depth range)}</b></td>
+ * <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br>
+ * {(width range in elements), (height range), (depth range)}</b></td></tr>
+ * <tr><td>1D</td>
+ * <td><small>{ (1,TEXTURE1D_WIDTH), 0, 0 }</small></td>
+ * <td><small>{ (1,SURFACE1D_WIDTH), 0, 0 }</small></td></tr>
+ * <tr><td>2D</td>
+ * <td><small>{ (1,TEXTURE2D_WIDTH), (1,TEXTURE2D_HEIGHT), 0 }</small></td>
+ * <td><small>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</small></td></tr>
+ * <tr><td>3D</td>
+ * <td><small>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
+ * <br>OR<br>{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE),
+ * (1,TEXTURE3D_DEPTH_ALTERNATE) }</small></td>
+ * <td><small>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT),
+ * (1,SURFACE3D_DEPTH) }</small></td></tr>
+ * <tr><td>1D Layered</td>
+ * <td><small>{ (1,TEXTURE1D_LAYERED_WIDTH), 0,
+ * (1,TEXTURE1D_LAYERED_LAYERS) }</small></td>
+ * <td><small>{ (1,SURFACE1D_LAYERED_WIDTH), 0,
+ * (1,SURFACE1D_LAYERED_LAYERS) }</small></td></tr>
+ * <tr><td>2D Layered</td>
+ * <td><small>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT),
+ * (1,TEXTURE2D_LAYERED_LAYERS) }</small></td>
+ * <td><small>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT),
+ * (1,SURFACE2D_LAYERED_LAYERS) }</small></td></tr>
+ * <tr><td>Cubemap</td>
+ * <td><small>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</small></td>
+ * <td><small>{ (1,SURFACECUBEMAP_WIDTH),
+ * (1,SURFACECUBEMAP_WIDTH), 6 }</small></td></tr>
+ * <tr><td>Cubemap Layered</td>
+ * <td><small>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH),
+ * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</small></td>
+ * <td><small>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH),
+ * (1,SURFACECUBEMAP_LAYERED_LAYERS) }</small></td></tr>
+ * </table>
+ *
+ * Here are examples of CUDA array descriptions:
+ *
+ * Description for a CUDA array of 2048 floats:
+ * \code
+    CUDA_ARRAY3D_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_FLOAT;
+    desc.NumChannels = 1;
+    desc.Width = 2048;
+    desc.Height = 0;
+    desc.Depth = 0;
+ * \endcode
+ *
+ * Description for a 64 x 64 CUDA array of floats:
+ * \code
+    CUDA_ARRAY3D_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_FLOAT;
+    desc.NumChannels = 1;
+    desc.Width = 64;
+    desc.Height = 64;
+    desc.Depth = 0;
+ * \endcode
+ *
+ * Description for a \p width x \p height x \p depth CUDA array of 64-bit,
+ * 4x16-bit float16's:
+ * \code
+    CUDA_ARRAY3D_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_HALF;
+    desc.NumChannels = 4;
+    desc.Width = width;
+    desc.Height = height;
+    desc.Depth = depth;
+ * \endcode
+ *
+ * \param pHandle        - Returned array
+ * \param pAllocateArray - 3D array descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMalloc3DArray
+ */
+CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
+
+/**
+ * \brief Get a 3D CUDA array descriptor
+ *
+ * Returns in \p *pArrayDescriptor a descriptor containing information on the
+ * format and dimensions of the CUDA array \p hArray. It is useful for
+ * subroutines that have been passed a CUDA array, but need to know the CUDA
+ * array parameters for validation or other purposes.
+ *
+ * This function may be called on 1D and 2D arrays, in which case the \p Height
+ * and/or \p Depth members of the descriptor struct will be set to 0.
+ *
+ * \param pArrayDescriptor - Returned 3D array descriptor
+ * \param hArray           - 3D array to get descriptor of
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaArrayGetInfo
+ */
+CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
+
+/**
+ * \brief Creates a CUDA mipmapped array
+ *
+ * Creates a CUDA mipmapped array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure
+ * \p pMipmappedArrayDesc and returns a handle to the new CUDA mipmapped array in \p *pHandle.
+ * \p numMipmapLevels specifies the number of mipmap levels to be allocated. This value is
+ * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))].
+ *
+ * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as:
+ *
+ * \code
+    typedef struct {
+        unsigned int Width;
+        unsigned int Height;
+        unsigned int Depth;
+        CUarray_format Format;
+        unsigned int NumChannels;
+        unsigned int Flags;
+    } CUDA_ARRAY3D_DESCRIPTOR;
+ * \endcode
+ * where:
+ *
+ * - \p Width, \p Height, and \p Depth are the width, height, and depth of the
+ * CUDA array (in elements); the following types of CUDA arrays can be allocated:
+ *     - A 1D mipmapped array is allocated if \p Height and \p Depth extents are both zero.
+ *     - A 2D mipmapped array is allocated if only \p Depth extent is zero.
+ *     - A 3D mipmapped array is allocated if all three extents are non-zero.
+ *     - A 1D layered CUDA mipmapped array is allocated if only \p Height is zero and the
+ *       ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number
+ *       of layers is determined by the depth extent.
+ *     - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and
+ *       the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number
+ *       of layers is determined by the depth extent.
+ *     - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the
+ *       ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and
+ *       \p Depth must be six. A cubemap is a special type of 2D layered CUDA array,
+ *       where the six layers represent the six faces of a cube. The order of the six
+ *       layers in memory is the same as that listed in ::CUarray_cubemap_face.
+ *     - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero,
+ *       and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set.
+ *       \p Width must be equal to \p Height, and \p Depth must be a multiple of six.
+ *       A cubemap layered CUDA array is a special type of 2D layered CUDA array that
+ *       consists of a collection of cubemaps. The first six layers represent the first
+ *       cubemap, the next six layers form the second cubemap, and so on.
+ *
+ * - ::Format specifies the format of the elements; ::CUarray_format is
+ * defined as:
+ * \code
+    typedef enum CUarray_format_enum {
+        CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
+        CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
+        CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
+        CU_AD_FORMAT_SIGNED_INT8 = 0x08,
+        CU_AD_FORMAT_SIGNED_INT16 = 0x09,
+        CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
+        CU_AD_FORMAT_HALF = 0x10,
+        CU_AD_FORMAT_FLOAT = 0x20,
+        CU_AD_FORMAT_NV12 = 0xb0, 
+        CU_AD_FORMAT_UNORM_INT8X1 = 0xc0, 
+        CU_AD_FORMAT_UNORM_INT8X2 = 0xc1, 
+        CU_AD_FORMAT_UNORM_INT8X4 = 0xc2, 
+        CU_AD_FORMAT_UNORM_INT16X1 = 0xc3, 
+        CU_AD_FORMAT_UNORM_INT16X2 = 0xc4, 
+        CU_AD_FORMAT_UNORM_INT16X4 = 0xc5, 
+        CU_AD_FORMAT_SNORM_INT8X1 = 0xc6, 
+        CU_AD_FORMAT_SNORM_INT8X2 = 0xc7, 
+        CU_AD_FORMAT_SNORM_INT8X4 = 0xc8, 
+        CU_AD_FORMAT_SNORM_INT16X1 = 0xc9,
+        CU_AD_FORMAT_SNORM_INT16X2 = 0xca,
+        CU_AD_FORMAT_SNORM_INT16X4 = 0xcb,
+        CU_AD_FORMAT_BC1_UNORM = 0x91,
+        CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92,
+        CU_AD_FORMAT_BC2_UNORM = 0x93,
+        CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94,
+        CU_AD_FORMAT_BC3_UNORM = 0x95,
+        CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96,
+        CU_AD_FORMAT_BC4_UNORM = 0x97,
+        CU_AD_FORMAT_BC4_SNORM = 0x98,
+        CU_AD_FORMAT_BC5_UNORM = 0x99,
+        CU_AD_FORMAT_BC5_SNORM = 0x9a,
+        CU_AD_FORMAT_BC6H_UF16 = 0x9b,
+        CU_AD_FORMAT_BC6H_SF16 = 0x9c,
+        CU_AD_FORMAT_BC7_UNORM = 0x9d,
+        CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e,
+        CU_AD_FORMAT_P010 = 0x9f,
+        CU_AD_FORMAT_P016 = 0xa1,
+        CU_AD_FORMAT_NV16 = 0xa2,
+        CU_AD_FORMAT_P210 = 0xa3,
+        CU_AD_FORMAT_P216 = 0xa4,
+        CU_AD_FORMAT_YUY2 = 0xa5,
+        CU_AD_FORMAT_Y210 = 0xa6,
+        CU_AD_FORMAT_Y216 = 0xa7,
+        CU_AD_FORMAT_AYUV = 0xa8,
+        CU_AD_FORMAT_Y410 = 0xa9,
+        CU_AD_FORMAT_Y416 = 0xb1,
+        CU_AD_FORMAT_Y444_PLANAR8 = 0xb2,
+        CU_AD_FORMAT_Y444_PLANAR10 = 0xb3,
+        CU_AD_FORMAT_YUV444_8bit_SemiPlanar = 0xb4,
+        CU_AD_FORMAT_YUV444_16bit_SemiPlanar = 0xb5,
+        CU_AD_FORMAT_UNORM_INT_101010_2 = 0x50,
+    } CUarray_format;
+ *  \endcode
+ *
+ * - \p NumChannels specifies the number of packed components per CUDA array
+ * element; it may be 1, 2, or 4;
+ *
+ * - ::Flags may be set to
+ *   - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA mipmapped arrays. If this flag is set,
+ *     \p Depth specifies the number of layers, not the depth of a 3D array.
+ *   - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to individual mipmap levels of
+ *     the CUDA mipmapped array. If this flag is not set, ::cuSurfRefSetArray will fail when attempting to
+ *     bind a mipmap level of the CUDA mipmapped array to a surface reference.
+ *   - ::CUDA_ARRAY3D_CUBEMAP to enable creation of mipmapped cubemaps. If this flag is set, \p Width must be
+ *     equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set,
+ *     then \p Depth must be a multiple of six.
+ *   - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA mipmapped array will be used for texture gather.
+ *     Texture gather can only be performed on 2D CUDA mipmapped arrays.
+ *
+ * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table.
+ * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute
+ * is not specified. For ex., TEXTURE1D_MIPMAPPED_WIDTH refers to the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH.
+ *
+ * <table>
+ * <tr><td><b>CUDA array type</b></td>
+ * <td><b>Valid extents that must always be met<br>{(width range in elements), (height range),
+ * (depth range)}</b></td>
+ * <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br>
+ * {(width range in elements), (height range), (depth range)}</b></td></tr>
+ * <tr><td>1D</td>
+ * <td><small>{ (1,TEXTURE1D_MIPMAPPED_WIDTH), 0, 0 }</small></td>
+ * <td><small>{ (1,SURFACE1D_WIDTH), 0, 0 }</small></td></tr>
+ * <tr><td>2D</td>
+ * <td><small>{ (1,TEXTURE2D_MIPMAPPED_WIDTH), (1,TEXTURE2D_MIPMAPPED_HEIGHT), 0 }</small></td>
+ * <td><small>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</small></td></tr>
+ * <tr><td>3D</td>
+ * <td><small>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
+ * <br>OR<br>{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE),
+ * (1,TEXTURE3D_DEPTH_ALTERNATE) }</small></td>
+ * <td><small>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT),
+ * (1,SURFACE3D_DEPTH) }</small></td></tr>
+ * <tr><td>1D Layered</td>
+ * <td><small>{ (1,TEXTURE1D_LAYERED_WIDTH), 0,
+ * (1,TEXTURE1D_LAYERED_LAYERS) }</small></td>
+ * <td><small>{ (1,SURFACE1D_LAYERED_WIDTH), 0,
+ * (1,SURFACE1D_LAYERED_LAYERS) }</small></td></tr>
+ * <tr><td>2D Layered</td>
+ * <td><small>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT),
+ * (1,TEXTURE2D_LAYERED_LAYERS) }</small></td>
+ * <td><small>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT),
+ * (1,SURFACE2D_LAYERED_LAYERS) }</small></td></tr>
+ * <tr><td>Cubemap</td>
+ * <td><small>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</small></td>
+ * <td><small>{ (1,SURFACECUBEMAP_WIDTH),
+ * (1,SURFACECUBEMAP_WIDTH), 6 }</small></td></tr>
+ * <tr><td>Cubemap Layered</td>
+ * <td><small>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH),
+ * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</small></td>
+ * <td><small>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH),
+ * (1,SURFACECUBEMAP_LAYERED_LAYERS) }</small></td></tr>
+ * </table>
+ *
+ *
+ * \param pHandle             - Returned mipmapped array
+ * \param pMipmappedArrayDesc - mipmapped array descriptor
+ * \param numMipmapLevels     - Number of mipmap levels
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMipmappedArrayDestroy,
+ * ::cuMipmappedArrayGetLevel,
+ * ::cuArrayCreate,
+ * ::cudaMallocMipmappedArray
+ */
+CUresult CUDAAPI cuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels);
+
+/**
+ * \brief Gets a mipmap level of a CUDA mipmapped array
+ *
+ * Returns in \p *pLevelArray a CUDA array that represents a single mipmap level
+ * of the CUDA mipmapped array \p hMipmappedArray.
+ *
+ * If \p level is greater than the maximum number of levels in this mipmapped array,
+ * ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * \param pLevelArray     - Returned mipmap level CUDA array
+ * \param hMipmappedArray - CUDA mipmapped array
+ * \param level           - Mipmap level
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMipmappedArrayCreate,
+ * ::cuMipmappedArrayDestroy,
+ * ::cuArrayCreate,
+ * ::cudaGetMipmappedArrayLevel
+ */
+CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level);
+
+/**
+ * \brief Destroys a CUDA mipmapped array
+ *
+ * Destroys the CUDA mipmapped array \p hMipmappedArray.
+ *
+ * \param hMipmappedArray - Mipmapped array to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ARRAY_IS_MAPPED,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMipmappedArrayCreate,
+ * ::cuMipmappedArrayGetLevel,
+ * ::cuArrayCreate,
+ * ::cudaFreeMipmappedArray
+ */
+CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
+
+/** 
+* \brief Retrieve handle for an address range 
+* 
+* Get a handle of the specified type to an address range. The address range
+* must have been obtained by a prior call to either ::cuMemAlloc or ::cuMemAddressReserve.
+* If the address range was obtained via ::cuMemAddressReserve, it must also be fully mapped via ::cuMemMap.
+* On Tegra, the address range must have been obtained by a prior call to either
+* ::cuMemAllocHost or ::cuMemHostAlloc.
+* 
+* Users must ensure the \p dptr and \p size are aligned to the host page size.
+* 
+* When requesting CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
+* users are expected to query for dma_buf support for the platform
+* by using ::CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED device attribute before calling
+* this API. The \p handle will be interpreted as a pointer to an integer to store the dma_buf file descriptor.
+* Users must ensure the entire address range is backed and mapped when
+* the address range is allocated by ::cuMemAddressReserve. All the physical
+* allocations backing the address range must be resident on the same device and
+* have identical allocation properties. Users are also expected to retrieve a
+* new handle every time the underlying physical allocation(s) corresponding
+* to a previously queried VA range are changed.
+*
+* For CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, users may set
+* \p flags to ::CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE, which, when set on a
+* supported platform, will give a DMA_BUF handle mapped via PCIE BAR1 or will
+* return an error otherwise.
+* 
+* \param[out] handle     - Pointer to the location where the returned handle will be stored. 
+* \param[in] dptr        - Pointer to a valid CUDA device allocation. Must be aligned to host page size.
+* \param[in] size        - Length of the address range. Must be aligned to host page size.
+* \param[in] handleType  - Type of handle requested (defines type and size of the \p handle output parameter)
+* \param[in] flags       - When requesting CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD the value could be
+*                          ::CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE, otherwise 0.
+* 
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*/
+CUresult CUDAAPI cuMemGetHandleForAddressRange(void *handle, CUdeviceptr dptr, size_t size, CUmemRangeHandleType handleType, unsigned long long flags);
+
+/**
+ * \brief Bitmasks for ::CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_ALGORITHM_MASK.
+ */
+typedef enum CUmemDecompressAlgorithm_enum {
+    CU_MEM_DECOMPRESS_UNSUPPORTED       = 0,    /**< No bits set: decompression is unsupported. */
+    CU_MEM_DECOMPRESS_ALGORITHM_DEFLATE = 1<<0, /**< Bit 0: Deflate is supported. */
+    CU_MEM_DECOMPRESS_ALGORITHM_SNAPPY  = 1<<1  /**< Bit 1: Snappy is supported. */
+} CUmemDecompressAlgorithm;
+
+/**
+ * \brief Structure describing the parameters that compose a single
+ *        decompression operation.
+ */
+typedef struct CUmemDecompressParams_st {
+    /** The number of bytes to be read and decompressed from
+     *  ::CUmemDecompressParams_st.src. */
+    size_t srcNumBytes;
+    /** The number of bytes that the decompression operation will be expected to
+     *  write to ::CUmemDecompressParams_st.dst. This value is optional; if
+     *  present, it may be used by the CUDA driver as a heuristic for scheduling
+     *  the individual decompression operations. */
+    size_t dstNumBytes;
+    /** After the decompression operation has completed, the actual number of
+     *  bytes written to ::CUmemDecompressParams_st.dst will be recorded as a
+     *  32-bit unsigned integer in the memory at this address. */
+    cuuint32_t *dstActBytes;
+    /** Pointer to a buffer of at least ::CUmemDecompressParams_st.srcNumBytes
+     *  compressed bytes. */
+    const void *src;
+    /** Pointer to a buffer where the decompressed data will be written. The
+     *  number of bytes written to this location will be recorded in the memory
+     *  pointed to by ::CUmemDecompressParams_st.dstActBytes. */
+    void *dst;
+    /** The decompression algorithm to use. */
+    CUmemDecompressAlgorithm algo;
+    /*  These bytes are unused and must be zeroed. This ensures compatibility if
+     *  additional fields are added in the future. */
+    unsigned char padding[20];
+} CUmemDecompressParams;
+
+/**
+ * \brief   Submit a batch of \p count independent decompression operations.
+ *
+ * \details Each of the \p count decompression operations is described by a
+ *          single entry in the \p paramsArray array. Once the batch has been
+ *          submitted, the function will return, and decompression will happen
+ *          asynchronously w.r.t. the CPU. To the work completion tracking
+ *          mechanisms in the CUDA driver, the batch will be considered a single
+ *          unit of work and processed according to stream semantics, i.e., it
+ *          is not possible to query the completion of individual decompression
+ *          operations within a batch.
+ *
+ *          The memory pointed to by each of ::CUmemDecompressParams.src,
+ *          ::CUmemDecompressParams.dst, and ::CUmemDecompressParams.dstActBytes,
+ *          must be capable of usage with the hardware decompress feature. That
+ *          is, for each of said pointers, the pointer attribute
+ *          ::CU_POINTER_ATTRIBUTE_IS_MEM_DECOMPRESS_CAPABLE should give a
+ *          non-zero value. To ensure this, the memory backing the pointers
+ *          should have been allocated using one of the following CUDA memory
+ *          allocators:
+ *          * ::cuMemAlloc()
+ *          * ::cuMemCreate() with the usage flag ::CU_MEM_CREATE_USAGE_HW_DECOMPRESS
+ *          * ::cuMemAllocFromPoolAsync() from a pool that was created with
+ *            the usage flag ::CU_MEM_POOL_CREATE_USAGE_HW_DECOMPRESS
+ *          Additionally, ::CUmemDecompressParams.src, ::CUmemDecompressParams.dst,
+ *          and ::CUmemDecompressParams.dstActBytes, must all be accessible from
+ *          the device associated with the context where \p stream was created.
+ *          For information on how to ensure this, see the documentation for the
+ *          allocator of interest.
+ *
+ * \param[in]  paramsArray  The array of structures describing the independent
+ *                          decompression operations.
+ * \param[in]  count        The number of entries in \p paramsArray array.
+ * \param[in]  flags        Must be 0.
+ * \param[out] errorIndex   The index into \p paramsArray of the decompression
+ *                          operation to which the error returned by this
+ *                          function pertains. If \p *errorIndex is SIZE_MAX and
+ *                          the value returned is not ::CUDA_SUCCESS, then the
+ *                          error returned by this function should be considered
+ *                          a general error that does not pertain to a
+ *                          particular decompression operation. May be \p NULL,
+ *                          in which case, no index will be recorded in the
+ *                          event of error.
+ * \param[in]  stream       The stream where the work will be enqueued.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemAlloc, ::cuMemPoolCreate, ::cuMemAllocFromPoolAsync
+ */
+CUresult CUDAAPI cuMemBatchDecompressAsync(
+    CUmemDecompressParams *paramsArray,
+    size_t count,
+    unsigned int flags,
+    size_t *errorIndex,
+    CUstream stream
+);
+
+/** @} */ /* END CUDA_MEM */
+
+/**
+ * \defgroup CUDA_VA Virtual Memory Management
+ *
+ * ___MANBRIEF___ virtual memory management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the virtual memory management functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+* \brief Allocate an address range reservation. 
+* 
+* Reserves a virtual address range based on the given parameters, giving
+* the starting address of the range in \p ptr.  This API requires a system that
+* supports UVA.  The size and address parameters must be a multiple of the
+* host page size and the alignment must be a power of two or zero for default
+* alignment.
+*
+* \param[out] ptr       - Resulting pointer to start of virtual address range allocated
+* \param[in]  size      - Size of the reserved virtual address range requested
+* \param[in]  alignment - Alignment of the reserved virtual address range requested
+* \param[in]  addr      - Fixed starting address range requested
+* \param[in]  flags     - Currently unused, must be zero
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_OUT_OF_MEMORY,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemAddressFree
+*/
+CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags);
+
+/**
+* \brief Free an address range reservation.
+* 
+* Frees a virtual address range reserved by ::cuMemAddressReserve.  The size
+* must match what was given to ::cuMemAddressReserve and the ptr given must
+* match what was returned from ::cuMemAddressReserve.
+*
+* \param[in] ptr  - Starting address of the virtual address range to free
+* \param[in] size - Size of the virtual address region to free
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemAddressReserve
+*/
+CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size);
+
+/**
+* \brief Create a CUDA memory handle representing a memory allocation of a given size described by the given properties
+*
+* This creates a memory allocation on the target device specified through the
+* \p prop structure. The created allocation will not have any device or host
+* mappings. The generic memory \p handle for the allocation can be
+* mapped to the address space of calling process via ::cuMemMap. This handle
+* cannot be transmitted directly to other processes (see
+* ::cuMemExportToShareableHandle).  On Windows, the caller must also pass
+* an LPSECURITYATTRIBUTE in \p prop to be associated with this handle which
+* limits or allows access to this handle for a recipient process (see
+* ::CUmemAllocationProp::win32HandleMetaData for more).  The \p size of this
+* allocation must be a multiple of the value given via
+* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM
+* flag.
+* To create a CPU allocation targeting a specific host NUMA node, applications must
+* set ::CUmemAllocationProp::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+* ::CUmemAllocationProp::CUmemLocation::id must specify the NUMA ID of the CPU.
+* On systems where NUMA is not available ::CUmemAllocationProp::CUmemLocation::id must be set to 0.
+* Specifying ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT or ::CU_MEM_LOCATION_TYPE_HOST as the
+* ::CUmemLocation::type will result in ::CUDA_ERROR_INVALID_VALUE.
+*
+* Applications that intend to use ::CU_MEM_HANDLE_TYPE_FABRIC based memory sharing must ensure that:
+* (1) the `nvidia-caps-imex-channels` character device is created by the driver and is listed under /proc/devices, and
+* (2) at least one IMEX channel file is accessible by the user launching the application.
+*
+* When exporter and importer CUDA processes have been granted access to the same IMEX channel, they can securely
+* share memory.
+*
+* The IMEX channel security model works on a per-user basis, which means all processes under a user can share
+* memory if the user has access to a valid IMEX channel. When multi-user isolation is desired, a separate IMEX
+* channel is required for each user.
+*
+* These channel files exist in /dev/nvidia-caps-imex-channels/channel* and can be created using standard OS
+* native calls like mknod on Linux. For example: To create channel0 with the major number from /proc/devices
+* users can execute the following command: `mknod /dev/nvidia-caps-imex-channels/channel0 c <major number> 0`
+*
+* If ::CUmemAllocationProp::allocFlags::usage contains ::CU_MEM_CREATE_USAGE_TILE_POOL flag then
+* the memory allocation is intended only to be used as backing tile pool for sparse CUDA arrays
+* and sparse CUDA mipmapped arrays.
+* (see ::cuMemMapArrayAsync).
+*
+* \param[out] handle - Value of handle returned. All operations on this allocation are to be performed using this handle.
+* \param[in]  size   - Size of the allocation requested
+* \param[in]  prop   - Properties of the allocation to create.
+* \param[in]  flags  - flags for future use, must be zero now.
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_OUT_OF_MEMORY,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+*
+* \sa ::cuMemRelease, ::cuMemExportToShareableHandle, ::cuMemImportFromShareableHandle
+*/
+CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags);
+
+/**
+* \brief Release a memory handle representing a memory allocation which was previously allocated through cuMemCreate.
+* 
+* Frees the memory that was allocated on a device through cuMemCreate.
+*
+* The memory allocation will be freed when all outstanding mappings to the memory
+* are unmapped and when all outstanding references to the handle (including its
+* shareable counterparts) are also released. The generic memory handle can be
+* freed when there are still outstanding mappings made with this handle. Each
+* time a recipient process imports a shareable handle, it needs to pair it with
+* ::cuMemRelease for the handle to be freed.  If \p handle is not a valid handle
+* the behavior is undefined. 
+*
+* \param[in] handle Value of handle which was returned previously by cuMemCreate.
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+*
+* \sa ::cuMemCreate
+*/
+CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle);
+
+/**
+* \brief Maps an allocation handle to a reserved virtual address range.
+*
+* Maps bytes of memory represented by \p handle starting from byte \p offset to
+* \p size to address range [\p ptr, \p ptr + \p size]. This range must be an
+* address reservation previously reserved with ::cuMemAddressReserve, and
+* \p offset + \p size must be less than the size of the memory allocation.
+* \p ptr, \p size, and \p offset must each be a multiple of the value given via
+* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM flag.
+* If \p handle represents a multicast object, \p ptr, \p size and \p offset must
+* be aligned to the value returned by ::cuMulticastGetGranularity with the flag
+* ::CU_MULTICAST_MINIMUM_GRANULARITY. For best performance however, it is
+* recommended that \p ptr, \p size and \p offset be aligned to the value
+* returned by ::cuMulticastGetGranularity with the flag
+* ::CU_MULTICAST_RECOMMENDED_GRANULARITY.
+* 
+* Please note calling ::cuMemMap does not make the address accessible,
+* the caller needs to update accessibility of a contiguous mapped VA
+* range by calling ::cuMemSetAccess.
+* 
+* Once a recipient process obtains a shareable memory handle
+* from ::cuMemImportFromShareableHandle, the process must
+* use ::cuMemMap to map the memory into its address ranges before
+* setting accessibility with ::cuMemSetAccess.
+*  
+* ::cuMemMap can only create mappings on VA range reservations 
+* that are not currently mapped.
+* 
+* \param[in] ptr    - Address where memory will be mapped. 
+* \param[in] size   - Size of the memory mapping. 
+* \param[in] offset - Offset into the memory represented by 
+*                   - \p handle from which to start mapping
+*                   - Note: currently must be zero.
+* \param[in] handle - Handle to a shareable memory 
+* \param[in] flags  - flags for future use, must be zero now. 
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_OUT_OF_MEMORY,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+*
+* \sa ::cuMemUnmap, ::cuMemSetAccess, ::cuMemCreate, ::cuMemAddressReserve, ::cuMemImportFromShareableHandle
+*/
+CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags);
+
+/**
+ * \brief Maps or unmaps subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays
+ *
+ * Performs map or unmap operations on subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays.
+ * Each operation is specified by a ::CUarrayMapInfo entry in the \p mapInfoList array of size \p count.
+ * The structure ::CUarrayMapInfo is defined as follows:
+ \code
+     typedef struct CUarrayMapInfo_st {
+        CUresourcetype resourceType;                   
+        union {
+            CUmipmappedArray mipmap;
+            CUarray array;
+        } resource;
+
+        CUarraySparseSubresourceType subresourceType;   
+        union {
+            struct {
+                unsigned int level;                     
+                unsigned int layer;                     
+                unsigned int offsetX;                   
+                unsigned int offsetY;                   
+                unsigned int offsetZ;                   
+                unsigned int extentWidth;               
+                unsigned int extentHeight;              
+                unsigned int extentDepth;               
+            } sparseLevel;
+            struct {
+                unsigned int layer;
+                unsigned long long offset;              
+                unsigned long long size;                
+            } miptail;
+        } subresource;
+
+        CUmemOperationType memOperationType;
+        
+        CUmemHandleType memHandleType;                  
+        union {
+            CUmemGenericAllocationHandle memHandle;
+        } memHandle;
+
+        unsigned long long offset;                      
+        unsigned int deviceBitMask;                     
+        unsigned int flags;                             
+        unsigned int reserved[2];                       
+    } CUarrayMapInfo;
+ \endcode
+ *
+ * where ::CUarrayMapInfo::resourceType specifies the type of resource to be operated on.
+ * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_ARRAY then 
+ * ::CUarrayMapInfo::resource::array must be set to a valid sparse CUDA array handle.
+ * The CUDA array must be either a 2D, 2D layered or 3D CUDA array and must have been allocated using
+ * ::cuArrayCreate or ::cuArray3DCreate with the flag ::CUDA_ARRAY3D_SPARSE
+ * or ::CUDA_ARRAY3D_DEFERRED_MAPPING.
+ * For CUDA arrays obtained using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned.
+ * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY 
+ * then ::CUarrayMapInfo::resource::mipmap must be set to a valid sparse CUDA mipmapped array handle.
+ * The CUDA mipmapped array must be either a 2D, 2D layered or 3D CUDA mipmapped array and must have been
+ * allocated using ::cuMipmappedArrayCreate with the flag ::CUDA_ARRAY3D_SPARSE
+ * or ::CUDA_ARRAY3D_DEFERRED_MAPPING.
+ *
+ * ::CUarrayMapInfo::subresourceType specifies the type of subresource within the resource. 
+ * ::CUarraySparseSubresourceType_enum is defined as:
+ \code
+    typedef enum CUarraySparseSubresourceType_enum {
+        CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0,
+        CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1
+    } CUarraySparseSubresourceType;
+ \endcode
+ *
+ * where ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL indicates a
+ * sparse-miplevel which spans at least one tile in every dimension. The remaining miplevels which
+ * are too small to span at least one tile in any dimension constitute the mip tail region as indicated by 
+ * ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL subresource type.
+ *
+ * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL
+ * then ::CUarrayMapInfo::subresource::sparseLevel struct must contain valid array subregion offsets and extents.
+ * The ::CUarrayMapInfo::subresource::sparseLevel::offsetX, ::CUarrayMapInfo::subresource::sparseLevel::offsetY
+ * and ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must specify valid X, Y and Z offsets respectively.
+ * The ::CUarrayMapInfo::subresource::sparseLevel::extentWidth, ::CUarrayMapInfo::subresource::sparseLevel::extentHeight
+ * and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth must specify valid width, height and depth extents respectively.
+ * These offsets and extents must be aligned to the corresponding tile dimension.
+ * For CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::level must specify a valid mip level index. Otherwise,
+ * it must be zero.
+ * For layered CUDA arrays and layered CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::layer must specify a valid layer index. Otherwise,
+ * it must be zero.
+ * ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must be zero and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth
+ * must be set to 1 for 2D and 2D layered CUDA arrays and CUDA mipmapped arrays.
+ * Tile extents can be obtained by calling ::cuArrayGetSparseProperties and ::cuMipmappedArrayGetSparseProperties.
+ *
+ * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL
+ * then ::CUarrayMapInfo::subresource::miptail struct must contain valid mip tail offset in 
+ * ::CUarrayMapInfo::subresource::miptail::offset and size in ::CUarrayMapInfo::subresource::miptail::size.
+ * Both the mip tail offset and the mip tail size must be aligned to the tile size.
+ * For layered CUDA mipmapped arrays which don't have the flag ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL set in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags
+ * as returned by ::cuMipmappedArrayGetSparseProperties, ::CUarrayMapInfo::subresource::miptail::layer must specify a valid layer index.
+ * Otherwise, it must be zero.
+ *
+ * If ::CUarrayMapInfo::resource::array or ::CUarrayMapInfo::resource::mipmap was created with the ::CUDA_ARRAY3D_DEFERRED_MAPPING
+ * flag set, the ::CUarrayMapInfo::subresourceType and the contents of ::CUarrayMapInfo::subresource will be ignored.
+ *
+ * ::CUarrayMapInfo::memOperationType specifies the type of operation. ::CUmemOperationType is defined as:
+ \code
+    typedef enum CUmemOperationType_enum {
+        CU_MEM_OPERATION_TYPE_MAP = 1,
+        CU_MEM_OPERATION_TYPE_UNMAP = 2
+    } CUmemOperationType;
+ \endcode
+ * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP then the subresource 
+ * will be mapped onto the tile pool memory specified by ::CUarrayMapInfo::memHandle at offset ::CUarrayMapInfo::offset. 
+ * The tile pool allocation has to be created by specifying the ::CU_MEM_CREATE_USAGE_TILE_POOL flag when calling ::cuMemCreate. Also, 
+ * ::CUarrayMapInfo::memHandleType must be set to ::CUmemHandleType::CU_MEM_HANDLE_TYPE_GENERIC.
+ * 
+ * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_UNMAP then an unmapping operation
+ * is performed. ::CUarrayMapInfo::memHandle must be NULL.
+ *
+ * ::CUarrayMapInfo::deviceBitMask specifies the list of devices that must map or unmap physical memory. 
+ * Currently, this mask must have exactly one bit set, and the corresponding device must match the device associated with the stream. 
+ * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP, the device must also match 
+ * the device associated with the tile pool memory allocation as specified by ::CUarrayMapInfo::memHandle.
+ *
+ * ::CUarrayMapInfo::flags and ::CUarrayMapInfo::reserved[] are unused and must be set to zero.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ *
+ * \param[in] mapInfoList - List of ::CUarrayMapInfo
+ * \param[in] count       - Count of ::CUarrayMapInfo  in \p mapInfoList
+ * \param[in] hStream     - Stream identifier for the stream to use for map or unmap operations
+ *
+ * \sa ::cuMipmappedArrayCreate, ::cuArrayCreate, ::cuArray3DCreate, ::cuMemCreate, ::cuArrayGetSparseProperties, ::cuMipmappedArrayGetSparseProperties
+ */
+CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo  *mapInfoList, unsigned int count, CUstream hStream);
+
+/**
+* \brief Unmap the backing memory of a given address range.
+*
+* The range must be the entire contiguous address range that was mapped to.  In
+* other words, ::cuMemUnmap cannot unmap a sub-range of an address range mapped
+* by ::cuMemCreate / ::cuMemMap.  Any backing memory allocations will be freed
+* if there are no existing mappings and there are no unreleased memory handles.
+*
+* When ::cuMemUnmap returns successfully the address range is converted to an
+* address reservation and can be used for future calls to ::cuMemMap.  Any new
+* mapping to this virtual address will need to have access granted through
+* ::cuMemSetAccess, as all mappings start with no accessibility setup.
+*
+* \param[in] ptr  - Starting address for the virtual address range to unmap
+* \param[in] size - Size of the virtual address range to unmap
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+* \note_sync
+*
+* \sa ::cuMemCreate, ::cuMemAddressReserve
+*/
+CUresult CUDAAPI cuMemUnmap(CUdeviceptr ptr, size_t size);
+
+/**
+* \brief Set the access flags for each location specified in \p desc for the given virtual address range
+* 
+* Given the virtual address range via \p ptr and \p size, and the locations
+* in the array given by \p desc and \p count, set the access flags for the
+* target locations.  The range must be a fully mapped address range
+* containing all allocations created by ::cuMemMap / ::cuMemCreate.
+* Users cannot specify ::CU_MEM_LOCATION_TYPE_HOST_NUMA accessibility for allocations created with other location types.
+* Note: When ::CUmemAccessDesc::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST_NUMA, ::CUmemAccessDesc::CUmemLocation::id
+* is ignored.
+* When setting the access flags for a virtual address range mapping a multicast
+* object, \p ptr and \p size must be aligned to the value returned by
+* ::cuMulticastGetGranularity with the flag ::CU_MULTICAST_MINIMUM_GRANULARITY.
+* For best performance however, it is recommended that \p ptr and \p size be
+* aligned to the value returned by ::cuMulticastGetGranularity with the flag
+* ::CU_MULTICAST_RECOMMENDED_GRANULARITY.
+*
+* \param[in] ptr   - Starting address for the virtual address range
+* \param[in] size  - Length of the virtual address range
+* \param[in] desc  - Array of ::CUmemAccessDesc that describe how to change the
+*                  - mapping for each location specified
+* \param[in] count - Number of ::CUmemAccessDesc in \p desc
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+* \note_sync
+*
+* \sa ::cuMemSetAccess, ::cuMemCreate, ::cuMemMap
+*/
+CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count);
+
+/**
+* \brief Get the access \p flags set for the given \p location and \p ptr
+*
+* \param[out] flags   - Flags set for this location
+* \param[in] location - Location in which to check the flags for
+* \param[in] ptr      - Address in which to check the access flags for
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemSetAccess
+*/
+CUresult CUDAAPI cuMemGetAccess(unsigned long long *flags, const CUmemLocation *location, CUdeviceptr ptr);
+
+/**
+* \brief Exports an allocation to a requested shareable handle type
+*
+* Given a CUDA memory handle, create a shareable memory
+* allocation handle that can be used to share the memory with other
+* processes. The recipient process can convert the shareable handle back into a
+* CUDA memory handle using ::cuMemImportFromShareableHandle and map
+* it with ::cuMemMap. The implementation of what this handle is and how it
+* can be transferred is defined by the requested handle type in \p handleType
+*
+* Once all shareable handles are closed and the allocation is released, the allocated
+* memory referenced will be released back to the OS and uses of the CUDA handle afterward
+* will lead to undefined behavior.
+*
+* This API can also be used in conjunction with other APIs (e.g. Vulkan, OpenGL)
+* that support importing memory from the shareable type
+*
+* \param[out] shareableHandle - Pointer to the location in which to store the requested handle type
+* \param[in] handle           - CUDA handle for the memory allocation
+* \param[in] handleType       - Type of shareable handle requested (defines type and size of the \p shareableHandle output parameter)
+* \param[in] flags            - Reserved, must be zero
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemImportFromShareableHandle
+*/
+CUresult CUDAAPI cuMemExportToShareableHandle(void *shareableHandle, CUmemGenericAllocationHandle handle, CUmemAllocationHandleType handleType, unsigned long long flags);
+
+/**
+* \brief Imports an allocation from a requested shareable handle type.
+*
+* If the current process cannot support the memory described by this shareable
+* handle, this API will error as ::CUDA_ERROR_NOT_SUPPORTED.
+*
+* If \p shHandleType is ::CU_MEM_HANDLE_TYPE_FABRIC and the importer process has not been
+* granted access to the same IMEX channel as the exporter process, this API will error
+* as ::CUDA_ERROR_NOT_PERMITTED.
+*
+* \note Importing shareable handles exported from some graphics APIs (Vulkan, OpenGL, etc.)
+* created on devices under an SLI group may not be supported, and thus this API will
+* return ::CUDA_ERROR_NOT_SUPPORTED.
+* There is no guarantee that the contents of \p handle will be the same CUDA memory handle
+* for the same given OS shareable handle, or the same underlying allocation.
+*
+* \param[out] handle       - CUDA Memory handle for the memory allocation.
+* \param[in]  osHandle     - Shareable Handle representing the memory allocation that is to be imported. 
+* \param[in]  shHandleType - handle type of the exported handle ::CUmemAllocationHandleType.
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemExportToShareableHandle, ::cuMemMap, ::cuMemRelease
+*/
+CUresult CUDAAPI cuMemImportFromShareableHandle(CUmemGenericAllocationHandle *handle, void *osHandle, CUmemAllocationHandleType shHandleType);
+
+/**
+* \brief Calculates either the minimal or recommended granularity 
+*
+* Calculates either the minimal or recommended granularity
+* for a given allocation specification and returns it in \p granularity.  This
+* granularity can be used as a multiple for alignment, size, or address mapping.
+*
+* \param[out] granularity Returned granularity.
+* \param[in]  prop Property for which to determine the granularity for
+* \param[in]  option Determines which granularity to return
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemCreate, ::cuMemMap
+*/
+CUresult CUDAAPI cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option);
+
+/**
+* \brief Retrieve the contents of the property structure defining properties for this handle
+*
+* \param[out] prop  - Pointer to a properties structure which will hold the information about this handle
+* \param[in] handle - Handle which to perform the query on
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemCreate, ::cuMemImportFromShareableHandle
+*/
+CUresult CUDAAPI cuMemGetAllocationPropertiesFromHandle(CUmemAllocationProp *prop, CUmemGenericAllocationHandle handle);
+
+/**
+* \brief Given an address \p addr, returns the allocation handle of the backing memory allocation.
+*
+* The handle is guaranteed to be the same handle value used to map the memory. If the address
+* requested is not mapped, the function will fail. The returned handle must be released with
+* corresponding number of calls to ::cuMemRelease.
+*
+* \note The address \p addr, can be any address in a range previously mapped
+* by ::cuMemMap, and not necessarily the start address.
+*
+* \param[out] handle CUDA Memory handle for the backing memory allocation.
+* \param[in] addr Memory address to query, that has been mapped previously.
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemCreate, ::cuMemRelease, ::cuMemMap
+*/
+CUresult CUDAAPI cuMemRetainAllocationHandle(CUmemGenericAllocationHandle *handle, void *addr);
+
+/** @} */ /* END CUDA_VA */
+
+/**
+ * \defgroup CUDA_MALLOC_ASYNC Stream Ordered Memory Allocator
+ *
+ * ___MANBRIEF___ Functions for performing allocation and free operations in stream order.
+ *                Functions for controlling the behavior of the underlying allocator.
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the stream ordered memory allocator exposed by the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ *
+ * \section CUDA_MALLOC_ASYNC_overview overview
+ *
+ * The asynchronous allocator allows the user to allocate and free in stream order.
+ * All asynchronous accesses of the allocation must happen between
+ * the stream executions of the allocation and the free. If the memory is accessed
+ * outside of the promised stream order, a use before allocation / use after free error
+ * will cause undefined behavior.
+ *
+ * The allocator is free to reallocate the memory as long as it can guarantee
+ * that compliant memory accesses will not overlap temporally.
+ * The allocator may refer to internal stream ordering as well as inter-stream dependencies
+ * (such as CUDA events and null stream dependencies) when establishing the temporal guarantee.
+ * The allocator may also insert inter-stream dependencies to establish the temporal guarantee. 
+ *
+ * \section CUDA_MALLOC_ASYNC_support Supported Platforms
+ *
+ * Whether or not a device supports the integrated stream ordered memory allocator
+ * may be queried by calling ::cuDeviceGetAttribute() with the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED
+ */
+
+/**
+ * \brief Frees memory with stream ordered semantics
+ *
+ * Inserts a free operation into \p hStream.
+ * The allocation must not be accessed after stream execution reaches the free.
+ * After this API returns, accessing the memory from any subsequent work launched on the GPU
+ * or querying its pointer attributes results in undefined behavior.
+ *
+ * \note During stream capture, this function results in the creation of a free node and
+ *       must therefore be passed the address of a graph allocation.
+ * 
+ * \param[in] dptr    - memory to free
+ * \param[in] hStream - The stream establishing the stream ordering contract. 
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context),
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ */
+CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream);
+
+/**
+ * \brief Allocates memory with stream ordered semantics
+ *
+ * Inserts an allocation operation into \p hStream.
+ * A pointer to the allocated memory is returned immediately in *dptr.
+ * The allocation must not be accessed until the allocation operation completes.
+ * The allocation comes from the memory pool current to the stream's device.
+ *
+ * \note The default memory pool of a device contains device memory from that device.
+ * \note Basic stream ordering allows future work submitted into the same stream to use the allocation.
+ *       Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation
+ *       operation completes before work submitted in a separate stream runs. 
+ * \note During stream capture, this function results in the creation of an allocation node.  In this case,
+ *       the allocation is owned by the graph instead of the memory pool. The memory pool's properties
+ *       are used to set the node's creation parameters.
+ *
+ * \param[out] dptr    - Returned device pointer
+ * \param[in] bytesize - Number of bytes to allocate
+ * \param[in] hStream  - The stream establishing the stream ordering contract and the memory pool to allocate from
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context),
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemAllocFromPoolAsync, ::cuMemFreeAsync, ::cuDeviceSetMemPool,
+ *     ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate,
+ *     ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute
+ */
+CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream);
+
+/**
+ * \brief Tries to release memory back to the OS
+ *
+ * Releases memory back to the OS until the pool contains fewer than minBytesToKeep
+ * reserved bytes, or there is no more memory that the allocator can safely release.
+ * The allocator cannot release OS allocations that back outstanding asynchronous allocations.
+ * The OS allocations may happen at different granularity from the user allocations.
+ *
+ * \note: Allocations that have not been freed count as outstanding. 
+ * \note: Allocations that have been asynchronously freed but whose completion has
+ *        not been observed on the host (e.g. by a synchronize) can count as outstanding.
+ *
+ * \param[in] pool           - The memory pool to trim
+ * \param[in] minBytesToKeep - If the pool has less than minBytesToKeep reserved,
+ * the TrimTo operation is a no-op.  Otherwise the pool will be guaranteed to have
+ * at least minBytesToKeep bytes reserved after the operation.
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolTrimTo(CUmemoryPool pool, size_t minBytesToKeep);
+
+/**
+ * \brief Sets attributes of a memory pool
+ *
+ * Supported attributes are:
+ * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t)
+ *                    Amount of reserved memory in bytes to hold onto before trying
+ *                    to release memory back to the OS. When more than the release
+ *                    threshold bytes of memory are held by the memory pool, the
+ *                    allocator will try to release memory back to the OS on the
+ *                    next call to stream, event or context synchronize. (default 0)
+ * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int)
+ *                    Allow ::cuMemAllocAsync to use memory asynchronously freed
+ *                    in another stream as long as a stream ordering dependency
+ *                    of the allocating stream on the free action exists.
+ *                    CUDA events and null stream interactions can create the required
+ *                    stream ordered dependencies. (default enabled)
+ * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int)
+ *                    Allow reuse of already completed frees when there is no dependency
+ *                    between the free and allocation. (default enabled)
+ * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int)
+ *                    Allow ::cuMemAllocAsync to insert new stream dependencies
+ *                    in order to establish the stream ordering required to reuse
+ *                    a piece of memory released by ::cuMemFreeAsync (default enabled).
+ * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t)
+ *                    Reset the high watermark that tracks the amount of backing memory that was
+ *                    allocated for the memory pool. It is illegal to set this attribute to a non-zero value.
+ * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t)
+ *                    Reset the high watermark that tracks the amount of used memory that was
+ *                    allocated for the memory pool.
+ *
+ * \param[in] pool  - The memory pool to modify
+ * \param[in] attr  - The attribute to modify
+ * \param[in] value - Pointer to the value to assign
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolSetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
+
+/**
+ * \brief Gets attributes of a memory pool
+ *
+ * Supported attributes are:
+ * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t)
+ *                    Amount of reserved memory in bytes to hold onto before trying
+ *                    to release memory back to the OS. When more than the release
+ *                    threshold bytes of memory are held by the memory pool, the
+ *                    allocator will try to release memory back to the OS on the
+ *                    next call to stream, event or context synchronize. (default 0)
+ * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int)
+ *                    Allow ::cuMemAllocAsync to use memory asynchronously freed
+ *                    in another stream as long as a stream ordering dependency
+ *                    of the allocating stream on the free action exists.
+ *                    CUDA events and null stream interactions can create the required
+ *                    stream ordered dependencies. (default enabled)
+ * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int)
+ *                    Allow reuse of already completed frees when there is no dependency
+ *                    between the free and allocation. (default enabled)
+ * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int)
+ *                    Allow ::cuMemAllocAsync to insert new stream dependencies
+ *                    in order to establish the stream ordering required to reuse
+ *                    a piece of memory released by ::cuMemFreeAsync (default enabled).
+ * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: (value type = cuuint64_t)
+ *                    Amount of backing memory currently allocated for the mempool
+ * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t)
+ *                    High watermark of backing memory allocated for the mempool since the
+ *                    last time it was reset.
+ * - ::CU_MEMPOOL_ATTR_USED_MEM_CURRENT: (value type = cuuint64_t)
+ *                    Amount of memory from the pool that is currently in use by the application.
+ * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t)
+ *                    High watermark of the amount of memory from the pool that was in use by the application.
+ *
+ * \param[in] pool   - The memory pool to get attributes of
+ * \param[in] attr   - The attribute to get 
+ * \param[out] value - Retrieved value
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolGetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
+
+/**
+ * \brief Controls visibility of pools between devices
+ *
+ * \param[in] pool  - The pool being modified
+ * \param[in] map   - Array of access descriptors. Each descriptor instructs the access to enable for a single gpu.
+ * \param[in] count - Number of descriptors in the map array.
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolSetAccess(CUmemoryPool pool, const CUmemAccessDesc *map, size_t count);
+
+/**
+ * \brief Returns the accessibility of a pool from a device
+ *
+ * Returns the accessibility of the pool's memory from the specified location. 
+ *
+ * \param[out] flags   - the accessibility of the pool from the specified location
+ * \param[in] memPool  - the pool being queried
+ * \param[in] location - the location accessing the pool
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolGetAccess(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation *location);
+
+/**
+ * \brief Creates a memory pool
+ *
+ * Creates a CUDA memory pool and returns the handle in \p pool.  The \p poolProps determines
+ * the properties of the pool such as the backing device and IPC capabilities. 
+ *
+ * To create a memory pool targeting a specific host NUMA node, applications must
+ * set ::CUmemPoolProps::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ * ::CUmemPoolProps::CUmemLocation::id must specify the NUMA ID of the host memory node.
+ * Specifying ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT or ::CU_MEM_LOCATION_TYPE_HOST as the
+ * ::CUmemPoolProps::CUmemLocation::type will result in ::CUDA_ERROR_INVALID_VALUE.
+ * By default, the pool's memory will be accessible from the device it is allocated on.
+ * In the case of pools created with ::CU_MEM_LOCATION_TYPE_HOST_NUMA, their default accessibility
+ * will be from the host CPU.
+ * Applications can control the maximum size of the pool by specifying a non-zero value for ::CUmemPoolProps::maxSize.
+ * If set to 0, the maximum size of the pool will default to a system dependent value.
+ *
+ * Applications that intend to use ::CU_MEM_HANDLE_TYPE_FABRIC based memory sharing must ensure:
+ * (1) `nvidia-caps-imex-channels` character device is created by the driver and is listed under /proc/devices 
+ * (2) have at least one IMEX channel file accessible by the user launching the application.
+ *
+ * When exporter and importer CUDA processes have been granted access to the same IMEX channel, they can securely
+ * share memory.
+ *
+ * The IMEX channel security model works on a per user basis. Which means all processes under a user can share
+ * memory if the user has access to a valid IMEX channel. When multi-user isolation is desired, a separate IMEX
+ * channel is required for each user.
+ *
+ * These channel files exist in /dev/nvidia-caps-imex-channels/channel* and can be created using standard OS
+ * native calls like mknod on Linux. For example: To create channel0 with the major number from /proc/devices
+ * users can execute the following command: `mknod /dev/nvidia-caps-imex-channels/channel0 c <major number> 0`
+ *
+ * \note Specifying CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC.
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NOT_PERMITTED,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \sa ::cuDeviceSetMemPool, ::cuDeviceGetMemPool, ::cuDeviceGetDefaultMemPool,
+ *     ::cuMemAllocFromPoolAsync, ::cuMemPoolExportToShareableHandle
+ */
+CUresult CUDAAPI cuMemPoolCreate(CUmemoryPool *pool, const CUmemPoolProps *poolProps);
+
+/**
+ * \brief Destroys the specified memory pool
+ *
+ * If any pointers obtained from this pool haven't been freed or
+ * the pool has free operations that haven't completed
+ * when ::cuMemPoolDestroy is invoked, the function will return immediately and the
+ * resources associated with the pool will be released automatically
+ * once there are no more outstanding allocations. 
+ *
+ * Destroying the current mempool of a device sets the default mempool of
+ * that device as the current mempool for that device.
+ *
+ * \note A device's default memory pool cannot be destroyed.
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuMemFreeAsync, ::cuDeviceSetMemPool, ::cuDeviceGetMemPool,
+ *     ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolDestroy(CUmemoryPool pool);
+
+/**
+ * \brief Allocates memory from a specified pool with stream ordered semantics.
+ *
+ * Inserts an allocation operation into \p hStream.
+ * A pointer to the allocated memory is returned immediately in *dptr.
+ * The allocation must not be accessed until the allocation operation completes.
+ * The allocation comes from the specified memory pool.
+ *
+ * \note
+ *    -  The specified memory pool may be from a device different than that of the specified \p hStream. 
+ * 
+ *    -  Basic stream ordering allows future work submitted into the same stream to use the allocation.
+ *       Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation
+ *       operation completes before work submitted in a separate stream runs. 
+ *
+ * \note During stream capture, this function results in the creation of an allocation node.  In this case,
+ *       the allocation is owned by the graph instead of the memory pool. The memory pool's properties
+ *       are used to set the node's creation parameters.
+ *
+ * \param[out] dptr    - Returned device pointer
+ * \param[in] bytesize - Number of bytes to allocate
+ * \param[in] pool     - The pool to allocate from 
+ * \param[in] hStream  - The stream establishing the stream ordering semantic
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context),
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolSetAccess,
+ *     ::cuMemPoolSetAttribute
+ */
+CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
+
+/**
+ * \brief Exports a memory pool to the requested handle type.
+ *
+ * Given an IPC capable mempool, create an OS handle to share the pool with another process.
+ * A recipient process can convert the shareable handle into a mempool with ::cuMemPoolImportFromShareableHandle.
+ * Individual pointers can then be shared with the ::cuMemPoolExportPointer and ::cuMemPoolImportPointer APIs.
+ * The implementation of what the shareable handle is and how it can be transferred is defined by the requested
+ * handle type.
+ *
+ * \note: To create an IPC capable mempool, create a mempool with a CUmemAllocationHandleType other than CU_MEM_HANDLE_TYPE_NONE.
+ *
+ * \param[out] handle_out  - Returned OS handle 
+ * \param[in] pool         - pool to export 
+ * \param[in] handleType   - the type of handle to create 
+ * \param[in] flags        - must be 0 
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer,
+ *     ::cuMemPoolImportPointer, ::cuMemAllocAsync, ::cuMemFreeAsync,
+ *     ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate,
+ *     ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute
+ */
+CUresult CUDAAPI cuMemPoolExportToShareableHandle(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags);
+
+/**
+ * \brief Imports a memory pool from a shared handle.
+ *
+ * Specific allocations can be imported from the imported pool with cuMemPoolImportPointer.
+ *
+ * If \p handleType is ::CU_MEM_HANDLE_TYPE_FABRIC and the importer process has not been
+ * granted access to the same IMEX channel as the exporter process, this API will error
+ * as ::CUDA_ERROR_NOT_PERMITTED.
+ * 
+ *
+ * \note Imported memory pools do not support creating new allocations.
+ *       As such imported memory pools may not be used in cuDeviceSetMemPool
+ *       or ::cuMemAllocFromPoolAsync calls.
+ *
+ * \param[out] pool_out    - Returned memory pool
+ * \param[in] handle       - OS handle of the pool to open 
+ * \param[in] handleType   - The type of handle being imported 
+ * \param[in] flags        - must be 0 
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolExportPointer, ::cuMemPoolImportPointer
+ */
+CUresult CUDAAPI cuMemPoolImportFromShareableHandle(
+        CUmemoryPool *pool_out,
+        void *handle,
+        CUmemAllocationHandleType handleType,
+        unsigned long long flags);
+
+/**
+ * \brief Export data to share a memory pool allocation between processes.
+ *
+ * Constructs \p shareData_out for sharing a specific allocation from an already shared memory pool.
+ * The recipient process can import the allocation with the ::cuMemPoolImportPointer api.
+ * The data is not a handle and may be shared through any IPC mechanism.
+ *
+ * \param[out] shareData_out - Returned export data  
+ * \param[in] ptr            - pointer to memory being exported
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolImportPointer
+ */
+CUresult CUDAAPI cuMemPoolExportPointer(CUmemPoolPtrExportData *shareData_out, CUdeviceptr ptr);
+
+/**
+ * \brief Import a memory pool allocation from another process.
+ *
+ * Returns in \p ptr_out a pointer to the imported memory.
+ * The imported memory must not be accessed before the allocation operation completes
+ * in the exporting process. The imported memory must be freed from all importing processes before
+ * being freed in the exporting process. The pointer may be freed with cuMemFree
+ * or cuMemFreeAsync.  If cuMemFreeAsync is used, the free must be completed
+ * on the importing process before the free operation on the exporting process.
+ *
+ * \note The cuMemFreeAsync api may be used in the exporting process before
+ *       the cuMemFreeAsync operation completes in its stream as long as the
+ *       cuMemFreeAsync in the exporting process specifies a stream with
+ *       a stream dependency on the importing process's cuMemFreeAsync.
+ *
+ * \param[out] ptr_out  - pointer to imported memory
+ * \param[in] pool      - pool from which to import
+ * \param[in] shareData - data specifying the memory to import
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer
+ */
+CUresult CUDAAPI cuMemPoolImportPointer(CUdeviceptr *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData *shareData);
+
+/** @} */ /* END CUDA_MALLOC_ASYNC */
+
+/**
+ * \defgroup CUDA_MULTICAST Multicast Object Management
+ *
+ * ___MANBRIEF___ Functions for creating multicast objects, adding devices to them and binding/unbinding memory
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the CUDA multicast object operations exposed by the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ *
+ * \section CUDA_MULTICAST_overview overview
+ *
+ * A multicast object created via ::cuMulticastCreate enables certain memory
+ * operations to be broadcast to a team of devices. Devices can be added to a
+ * multicast object via ::cuMulticastAddDevice. Memory can be bound on each
+ * participating device via either ::cuMulticastBindMem or ::cuMulticastBindAddr.
+ * Multicast objects can be mapped into a device's virtual address space using
+ * the virtual memory management APIs (see ::cuMemMap and ::cuMemSetAccess).
+ *
+ * \section CUDA_MULTICAST_support Supported Platforms
+ *
+ * Support for multicast on a specific device can be queried using the device
+ * attribute ::CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED
+ */
+
+/**
+ * \brief Create a generic allocation handle representing a multicast object described by the given properties.
+ *
+ * This creates a multicast object as described by \p prop. The number of
+ * participating devices is specified by ::CUmulticastObjectProp::numDevices.
+ * Devices can be added to the multicast object via ::cuMulticastAddDevice.
+ * All participating devices must be added to the multicast object before memory
+ * can be bound to it. Memory is bound to the multicast object via either
+ * ::cuMulticastBindMem or ::cuMulticastBindAddr, and can be unbound via
+ * ::cuMulticastUnbind. The total amount of memory that can be bound per device
+ * is specified by ::CUmulticastObjectProp::size. This size must be a multiple of
+ * the value returned by ::cuMulticastGetGranularity with the flag
+ * ::CU_MULTICAST_GRANULARITY_MINIMUM. For best performance however, the size
+ * should be aligned to the value returned by ::cuMulticastGetGranularity with
+ * the flag ::CU_MULTICAST_GRANULARITY_RECOMMENDED.
+ *
+ * After all participating devices have been added, multicast objects can also
+ * be mapped to a device's virtual address space using the virtual memory
+ * management APIs (see ::cuMemMap and ::cuMemSetAccess). Multicast objects can
+ * also be shared with other processes by requesting a shareable handle via
+ * ::cuMemExportToShareableHandle. Note that the desired types of shareable
+ * handles must be specified in the bitmask ::CUmulticastObjectProp::handleTypes.
+ * Multicast objects can be released using the virtual memory management API
+ * ::cuMemRelease.
+ *
+ * \param[out] mcHandle     Value of handle returned.
+ * \param[in]  prop         Properties of the multicast object to create.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_PERMITTED,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \sa ::cuMulticastAddDevice, ::cuMulticastBindMem, ::cuMulticastBindAddr, ::cuMulticastUnbind
+ * \sa ::cuMemCreate, ::cuMemRelease, ::cuMemExportToShareableHandle, ::cuMemImportFromShareableHandle
+ */
+CUresult CUDAAPI cuMulticastCreate(CUmemGenericAllocationHandle *mcHandle, const CUmulticastObjectProp *prop);
+
+/**
+ * \brief Associate a device to a multicast object.
+ *
+ * Associates a device to a multicast object. The added device will be a part of
+ * the multicast team of size specified by CUmulticastObjectProp::numDevices
+ * during ::cuMulticastCreate.
+ * The association of the device to the multicast object is permanent during
+ * the lifetime of the multicast object.
+ * All devices must be added to the multicast team before any memory can be
+ * bound to any device in the team. Any calls to ::cuMulticastBindMem or
+ * ::cuMulticastBindAddr will block until all devices have been added.
+ * Similarly all devices must be added to the multicast team before a virtual
+ * address range can be mapped to the multicast object. A call to ::cuMemMap
+ * will block until all devices have been added.
+ *
+ * \param[in] mcHandle     Handle representing a multicast object.
+ * \param[in] dev          Device that will be associated to the multicast
+ *                         object.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_PERMITTED,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \sa ::cuMulticastCreate, ::cuMulticastBindMem, ::cuMulticastBindAddr
+ */
+CUresult CUDAAPI cuMulticastAddDevice(CUmemGenericAllocationHandle mcHandle, CUdevice dev);
+
+/**
+ * \brief Bind a memory allocation represented by a handle to a multicast object.
+ *
+ * Binds a memory allocation specified by \p memHandle and created via
+ * ::cuMemCreate to a multicast object represented by \p mcHandle and created
+ * via ::cuMulticastCreate. The intended \p size of the bind, the offset in the
+ * multicast range \p mcOffset as well as the offset in the memory \p memOffset
+ * must be a multiple of the value returned by ::cuMulticastGetGranularity with
+ * the flag ::CU_MULTICAST_GRANULARITY_MINIMUM. For best performance however,
+ * \p size, \p mcOffset and \p memOffset should be aligned to the granularity of
+ * the memory allocation(see ::cuMemGetAllocationGranularity) or to the value
+ * returned by ::cuMulticastGetGranularity with the flag
+ * ::CU_MULTICAST_GRANULARITY_RECOMMENDED.
+ *
+ * The \p size + \p memOffset cannot be larger than the size of the allocated
+ * memory. Similarly the \p size + \p mcOffset cannot be larger than the size
+ * of the multicast object.
+ * The memory allocation must have been created on one of the devices
+ * that was added to the multicast team via ::cuMulticastAddDevice.
+ * Externally shareable as well as imported multicast objects can be bound only
+ * to externally shareable memory.
+ * Note that this call will return CUDA_ERROR_OUT_OF_MEMORY if there are
+ * insufficient resources required to perform the bind. This call may also
+ * return CUDA_ERROR_SYSTEM_NOT_READY if the necessary system software is not
+ * initialized or running.
+ *
+ * \param[in]  mcHandle     Handle representing a multicast object.
+ * \param[in]  mcOffset     Offset into the multicast object for attachment.
+ * \param[in]  memHandle    Handle representing a memory allocation.
+ * \param[in]  memOffset    Offset into the memory for attachment.
+ * \param[in]  size         Size of the memory that will be bound to the
+ *                          multicast object.
+ * \param[in]  flags        Flags for future use, must be zero for now.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_PERMITTED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_SYSTEM_NOT_READY
+ *
+ * \sa ::cuMulticastCreate, ::cuMulticastAddDevice, ::cuMemCreate
+ */
+CUresult CUDAAPI cuMulticastBindMem(CUmemGenericAllocationHandle mcHandle, size_t mcOffset, CUmemGenericAllocationHandle memHandle, size_t memOffset, size_t size, unsigned long long flags);
+
+/**
+ * \brief Bind a memory allocation represented by a virtual address to a multicast object.
+ *
+ * Binds a memory allocation specified by its mapped address \p memptr to a
+ * multicast object represented by \p mcHandle.
+ * The memory must have been allocated via ::cuMemCreate or ::cudaMallocAsync.
+ * The intended \p size of the bind, the offset in the multicast range
+ * \p mcOffset and \p memptr must be a multiple of the value returned by
+ * ::cuMulticastGetGranularity with the flag ::CU_MULTICAST_GRANULARITY_MINIMUM.
+ * For best performance however, \p size, \p mcOffset and \p memptr should be
+ * aligned to the value returned by ::cuMulticastGetGranularity with the flag
+ * ::CU_MULTICAST_GRANULARITY_RECOMMENDED.
+ *
+ * The \p size cannot be larger than the size of the allocated memory.
+ * Similarly the \p size + \p mcOffset cannot be larger than the total size
+ * of the multicast object.
+ * The memory allocation must have been created on one of the devices
+ * that was added to the multicast team via ::cuMulticastAddDevice.
+ * Externally shareable as well as imported multicast objects can be bound only
+ * to externally shareable memory.
+ * Note that this call will return CUDA_ERROR_OUT_OF_MEMORY if there are
+ * insufficient resources required to perform the bind. This call may also
+ * return CUDA_ERROR_SYSTEM_NOT_READY if the necessary system software is not
+ * initialized or running.
+ *
+ * \param[in]  mcHandle     Handle representing a multicast object.
+ * \param[in]  mcOffset     Offset into multicast va range for attachment.
+ * \param[in]  memptr       Virtual address of the memory allocation.
+ * \param[in]  size         Size of memory that will be bound to the
+ *                          multicast object.
+ * \param[in]  flags        Flags for future use, must be zero for now.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_PERMITTED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_SYSTEM_NOT_READY
+ *
+ * \sa ::cuMulticastCreate, ::cuMulticastAddDevice, ::cuMemCreate
+ */
+CUresult CUDAAPI cuMulticastBindAddr(CUmemGenericAllocationHandle mcHandle, size_t mcOffset, CUdeviceptr memptr, size_t size, unsigned long long flags);
+
+/**
+ * \brief Unbind any memory allocations bound to a multicast object at a given offset and up to a given size.
+ *
+ * Unbinds any memory allocations hosted on \p dev and bound to a multicast
+ * object at \p mcOffset and up to a given \p size.
+ * The intended \p size of the unbind and the offset in the multicast range
+ * ( \p mcOffset ) must be a multiple of the value returned by
+ * ::cuMulticastGetGranularity with the flag ::CU_MULTICAST_GRANULARITY_MINIMUM.
+ * The \p size + \p mcOffset cannot be larger than the total size of the
+ * multicast object.
+ *
+ * \note 
+ * Warning:
+ * The \p mcOffset and the \p size must match the corresponding values specified
+ * during the bind call. Any other values may result in undefined behavior.
+ *
+ * \param[in]  mcHandle     Handle representing a multicast object.
+ * \param[in]  dev          Device that hosts the memory allocation.
+ * \param[in]  mcOffset     Offset into the multicast object.
+ * \param[in]  size         Desired size to unbind.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_PERMITTED,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \sa ::cuMulticastBindMem, ::cuMulticastBindAddr
+ */
+CUresult CUDAAPI cuMulticastUnbind(CUmemGenericAllocationHandle mcHandle, CUdevice dev, size_t mcOffset, size_t size);
+
+/**
+* \brief Calculates either the minimal or recommended granularity for multicast object
+*
+* Calculates either the minimal or recommended granularity for a given set of
+* multicast object properties and returns it in granularity.  This granularity
+* can be used as a multiple for size, bind offsets and address mappings of the
+* multicast object.
+*
+* \param[out] granularity Returned granularity.
+* \param[in]  prop        Properties of the multicast object.
+* \param[in]  option      Determines which granularity to return.
+*
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMulticastCreate, ::cuMulticastBindMem, ::cuMulticastBindAddr, ::cuMulticastUnbind
+*/
+CUresult CUDAAPI cuMulticastGetGranularity(size_t *granularity, const CUmulticastObjectProp *prop, CUmulticastGranularity_flags option);
+
+/** @} */ /* END CUDA_MULTICAST */
+
+/**
+ * \defgroup CUDA_UNIFIED Unified Addressing
+ *
+ * ___MANBRIEF___ unified addressing functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the unified addressing functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ *
+ * \section CUDA_UNIFIED_overview Overview
+ *
+ * CUDA devices can share a unified address space with the host.
+ * For these devices there is no distinction between a device
+ * pointer and a host pointer -- the same pointer value may be
+ * used to access memory from the host program and from a kernel
+ * running on the device (with exceptions enumerated below).
+ *
+ * \section CUDA_UNIFIED_support Supported Platforms
+ *
+ * Whether or not a device supports unified addressing may be
+ * queried by calling ::cuDeviceGetAttribute() with the device
+ * attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.
+ *
+ * Unified addressing is automatically enabled in 64-bit processes
+ *
+ * \section CUDA_UNIFIED_lookup Looking Up Information from Pointer Values
+ *
+ * It is possible to look up information about the memory which backs a
+ * pointer value.  For instance, one may want to know if a pointer points
+ * to host or device memory.  As another example, in the case of device
+ * memory, one may want to know on which CUDA device the memory
+ * resides.  These properties may be queried using the function
+ * ::cuPointerGetAttribute()
+ *
+ * Since pointers are unique, it is not necessary to specify information
+ * about the pointers specified to the various copy functions in the
+ * CUDA API.  The function ::cuMemcpy() may be used to perform a copy
+ * between two pointers, ignoring whether they point to host or device
+ * memory (making ::cuMemcpyHtoD(), ::cuMemcpyDtoD(), and ::cuMemcpyDtoH()
+ * unnecessary for devices supporting unified addressing).  For
+ * multidimensional copies, the memory type ::CU_MEMORYTYPE_UNIFIED may be
+ * used to specify that the CUDA driver should infer the location of the
+ * pointer from its value.
+ *
+ * \section CUDA_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory
+ *
+ * All host memory allocated in all contexts using ::cuMemAllocHost() and
+ * ::cuMemHostAlloc() is always directly accessible from all contexts on
+ * all devices that support unified addressing.  This is the case regardless
+ * of whether or not the flags ::CU_MEMHOSTALLOC_PORTABLE and
+ * ::CU_MEMHOSTALLOC_DEVICEMAP are specified.
+ *
+ * The pointer value through which allocated host memory may be accessed
+ * in kernels on all devices that support unified addressing is the same
+ * as the pointer value through which that memory is accessed on the host,
+ * so it is not necessary to call ::cuMemHostGetDevicePointer() to get the device
+ * pointer for these allocations.
+ *
+ * Note that this is not the case for memory allocated using the flag
+ * ::CU_MEMHOSTALLOC_WRITECOMBINED, as discussed below.
+ *
+ * \section CUDA_UNIFIED_autopeerregister Automatic Registration of Peer Memory
+ *
+ * Upon enabling direct access from a context that supports unified addressing
+ * to another peer context that supports unified addressing using
+ * ::cuCtxEnablePeerAccess() all memory allocated in the peer context using
+ * ::cuMemAlloc() and ::cuMemAllocPitch() will immediately be accessible
+ * by the current context.  The device pointer value through
+ * which any peer memory may be accessed in the current context
+ * is the same pointer value through which that memory may be
+ * accessed in the peer context.
+ *
+ * \section CUDA_UNIFIED_exceptions Exceptions, Disjoint Addressing
+ *
+ * Not all memory may be accessed on devices through the same pointer
+ * value through which they are accessed on the host.  These exceptions
+ * are host memory registered using ::cuMemHostRegister() and host memory
+ * allocated using the flag ::CU_MEMHOSTALLOC_WRITECOMBINED.  For these
+ * exceptions, there exists a distinct host and device address for the
+ * memory.  The device address is guaranteed to not overlap any valid host
+ * pointer range and is guaranteed to have the same value across all
+ * contexts that support unified addressing.
+ *
+ * This device address may be queried using ::cuMemHostGetDevicePointer()
+ * when a context using unified addressing is current.  Either the host
+ * or the unified device pointer value may be used to refer to this memory
+ * through ::cuMemcpy() and similar functions using the
+ * ::CU_MEMORYTYPE_UNIFIED memory type.
+ *
+ */
+
+/**
+ * \brief Returns information about a pointer
+ *
+ * The supported attributes are:
+ *
+ * - ::CU_POINTER_ATTRIBUTE_CONTEXT:
+ *
+ *      Returns in \p *data the ::CUcontext in which \p ptr was allocated or
+ *      registered.
+ *      The type of \p data must be ::CUcontext *.
+ *
+ *      If \p ptr was not allocated by, mapped by, or registered with
+ *      a ::CUcontext which uses unified virtual addressing then
+ *      ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE:
+ *
+ *      Returns in \p *data the physical memory type of the memory that
+ *      \p ptr addresses as a ::CUmemorytype enumerated value.
+ *      The type of \p data must be unsigned int.
+ *
+ *      If \p ptr addresses device memory then \p *data is set to
+ *      ::CU_MEMORYTYPE_DEVICE.  The particular ::CUdevice on which the
+ *      memory resides is the ::CUdevice of the ::CUcontext returned by the
+ *      ::CU_POINTER_ATTRIBUTE_CONTEXT attribute of \p ptr.
+ *
+ *      If \p ptr addresses host memory then \p *data is set to
+ *      ::CU_MEMORYTYPE_HOST.
+ *
+ *      If \p ptr was not allocated by, mapped by, or registered with
+ *      a ::CUcontext which uses unified virtual addressing then
+ *      ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ *      If the current ::CUcontext does not support unified virtual
+ *      addressing then ::CUDA_ERROR_INVALID_CONTEXT is returned.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER:
+ *
+ *      Returns in \p *data the device pointer value through which
+ *      \p ptr may be accessed by kernels running in the current
+ *      ::CUcontext.
+ *      The type of \p data must be CUdeviceptr *.
+ *
+ *      If there exists no device pointer value through which
+ *      kernels running in the current ::CUcontext may access
+ *      \p ptr then ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ *      If there is no current ::CUcontext then
+ *      ::CUDA_ERROR_INVALID_CONTEXT is returned.
+ *
+ *      Except in the exceptional disjoint addressing cases discussed
+ *      below, the value returned in \p *data will equal the input
+ *      value \p ptr.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER:
+ *
+ *      Returns in \p *data the host pointer value through which
+ *      \p ptr may be accessed by the host program.
+ *      The type of \p data must be void **.
+ *      If there exists no host pointer value through which
+ *      the host program may directly access \p ptr then
+ *      ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ *      Except in the exceptional disjoint addressing cases discussed
+ *      below, the value returned in \p *data will equal the input
+ *      value \p ptr.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_P2P_TOKENS:
+ *
+ *      Returns in \p *data two tokens for use with the nv-p2p.h Linux
+ *      kernel interface. \p data must be a struct of type
+ *      CUDA_POINTER_ATTRIBUTE_P2P_TOKENS.
+ *
+ *      \p ptr must be a pointer to memory obtained from ::cuMemAlloc().
+ *      Note that p2pToken and vaSpaceToken are only valid for the
+ *      lifetime of the source allocation. A subsequent allocation at
+ *      the same address may return completely different tokens.
+ *      Querying this attribute has a side effect of setting the attribute
+ *      ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS for the region of memory that
+ *      \p ptr points to.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS:
+ *
+ *      A boolean attribute which when set, ensures that synchronous memory operations
+ *      initiated on the region of memory that \p ptr points to will always synchronize.
+ *      See further documentation in the section titled "API synchronization behavior"
+ *      to learn more about cases when synchronous memory operations can
+ *      exhibit asynchronous behavior.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID:
+ *
+ *      Returns in \p *data a buffer ID which is guaranteed to be unique within the process.
+ *      \p data must point to an unsigned long long.
+ *
+ *      \p ptr must be a pointer to memory obtained from a CUDA memory allocation API.
+ *      Every memory allocation from any of the CUDA memory allocation APIs will
+ *      have a unique ID over a process lifetime. Subsequent allocations do not reuse IDs
+ *      from previous freed allocations. IDs are only unique within a single process.
+ *
+ *
+ * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED:
+ *
+ *      Returns in \p *data a boolean that indicates whether the pointer points to
+ *      managed memory or not.
+ *
+ *      If \p ptr is not a valid CUDA pointer then ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL:
+ *
+ *      Returns in \p *data an integer representing a device ordinal of a device against
+ *      which the memory was allocated or registered.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE:
+ *
+ *      Returns in \p *data a boolean that indicates if this pointer maps to
+ *      an allocation that is suitable for ::cudaIpcGetMemHandle.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR:
+ *
+ *      Returns in \p *data the starting address for the allocation referenced
+ *      by the device pointer \p ptr.  Note that this is not necessarily the
+ *      address of the mapped region, but the address of the mappable address
+ *      range \p ptr references (e.g. from ::cuMemAddressReserve).
+ *
+ * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE:
+ *
+ *      Returns in \p *data the size for the allocation referenced by the device
+ *      pointer \p ptr.  Note that this is not necessarily the size of the mapped
+ *      region, but the size of the mappable address range \p ptr references
+ *      (e.g. from ::cuMemAddressReserve).  To retrieve the size of the mapped
+ *      region, see ::cuMemGetAddressRange
+ *
+ * - ::CU_POINTER_ATTRIBUTE_MAPPED:
+ *
+ *      Returns in \p *data a boolean that indicates if this pointer is in a
+ *      valid address range that is mapped to a backing allocation.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES:
+ *
+ *      Returns a bitmask of the allowed handle types for an allocation that may
+ *      be passed to ::cuMemExportToShareableHandle.
+ * 
+ * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE:
+ * 
+ *      Returns in \p *data the handle to the mempool that the allocation was obtained from.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE:
+ *
+ *      Returns in \p *data a boolean that indicates whether the pointer points
+ *      to memory that is capable of being used for hardware accelerated
+ *      decompression.
+ *
+ * \par
+ *
+ * Note that for most allocations in the unified virtual address space
+ * the host and device pointer for accessing the allocation will be the
+ * same.  The exceptions to this are
+ *  - user memory registered using ::cuMemHostRegister
+ *  - host memory allocated using ::cuMemHostAlloc with the
+ *    ::CU_MEMHOSTALLOC_WRITECOMBINED flag
+ * For these types of allocation there will exist separate, disjoint host
+ * and device addresses for accessing the allocation.  In particular
+ *  - The host address will correspond to an invalid unmapped device address
+ *    (which will result in an exception if accessed from the device)
+ *  - The device address will correspond to an invalid unmapped host address
+ *    (which will result in an exception if accessed from the host).
+ * For these types of allocations, querying ::CU_POINTER_ATTRIBUTE_HOST_POINTER
+ * and ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER may be used to retrieve the host
+ * and device addresses from either address.
+ *
+ * \param data      - Returned pointer attribute value
+ * \param attribute - Pointer attribute to query
+ * \param ptr       - Pointer
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuPointerSetAttribute,
+ * ::cuMemAlloc,
+ * ::cuMemFree,
+ * ::cuMemAllocHost,
+ * ::cuMemFreeHost,
+ * ::cuMemHostAlloc,
+ * ::cuMemHostRegister,
+ * ::cuMemHostUnregister,
+ * ::cudaPointerGetAttributes
+ */
+CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr);
+
+/**
+ * \brief Prefetches memory to the specified destination device
+ * 
+ * Note there is a later version of this API, ::cuMemPrefetchAsync_v2. It will
+ * supplant this version in 13.0, which is retained for minor version compatibility.
+ *
+ * Prefetches memory to the specified destination device.  \p devPtr is the
+ * base device pointer of the memory to be prefetched and \p dstDevice is the
+ * destination device. \p count specifies the number of bytes to copy. \p hStream
+ * is the stream in which the operation is enqueued. The memory range must refer
+ * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables
+ * or it may also refer to system-allocated memory on systems with non-zero 
+ * CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ *
+ * Passing in CU_DEVICE_CPU for \p dstDevice will prefetch the data to host memory. If
+ * \p dstDevice is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
+ * must be non-zero. Additionally, \p hStream must be associated with a device that has a
+ * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ *
+ * The start address and end address of the memory range will be rounded down and rounded up
+ * respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ * in the stream.
+ *
+ * If no physical memory has been allocated for this region, then this memory region
+ * will be populated and mapped on the destination device. If there's insufficient
+ * memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ * ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ * allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+ *
+ * By default, any mappings to the previous location of the migrated pages are removed and
+ * mappings for the new location are only set up on \p dstDevice. The exact behavior however
+ * also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ * below:
+ *
+ * If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ * then that subset will create a read-only copy of the pages on \p dstDevice.
+ *
+ * If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the
+ * preferred location of any pages in the memory range.
+ *
+ * If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ * then mappings to those pages from all the appropriate processors are updated to
+ * refer to the new location if establishing such a mapping is possible. Otherwise,
+ * those mappings are cleared.
+ *
+ * Note that this API is not required for functionality and only serves to improve performance
+ * by allowing the application to migrate data to a suitable location before it is accessed.
+ * Memory accesses to this range are always coherent and are allowed even when the data is
+ * actively being migrated.
+ *
+ * Note that this function is asynchronous with respect to the host and all work
+ * on other devices.
+ *
+ * \param devPtr    - Pointer to be prefetched
+ * \param count     - Size in bytes
+ * \param dstDevice - Destination device to prefetch to
+ * \param hStream    - Stream to enqueue prefetch operation
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync,
+ * ::cuMemcpy3DPeerAsync, ::cuMemAdvise, ::cuMemPrefetchAsync_v2,
+ * ::cudaMemPrefetchAsync_v2
+ */
+CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream);
+
+/**
+ * \brief Prefetches memory to the specified destination location
+ *
+ * Prefetches memory to the specified destination location.  \p devPtr is the
+ * base device pointer of the memory to be prefetched and \p location specifies the
+ * destination location. \p count specifies the number of bytes to copy. \p hStream
+ * is the stream in which the operation is enqueued. The memory range must refer
+ * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+ *
+ * Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to GPU
+ * specified by device ordinal ::CUmemLocation::id which must have non-zero value for the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ * that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ * Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ * Applications can request prefetching memory to a specific host NUMA node by specifying
+ * ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ * Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ * ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either
+ * ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ *
+ * The start address and end address of the memory range will be rounded down and rounded up
+ * respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ * in the stream.
+ *
+ * If no physical memory has been allocated for this region, then this memory region
+ * will be populated and mapped on the destination device. If there's insufficient
+ * memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ * ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ * allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+ *
+ * By default, any mappings to the previous location of the migrated pages are removed and
+ * mappings for the new location are only set up on the destination location. The exact behavior however
+ * also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ * below:
+ *
+ * If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ * then that subset will create a read-only copy of the pages on the destination location.
+ * If however the destination location is a host NUMA node, then any pages of that subset
+ * that are already in another host NUMA node will be transferred to the destination.
+ *
+ * If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ * range, then the pages will be migrated to \p location even if \p location is not the
+ * preferred location of any pages in the memory range.
+ *
+ * If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ * then mappings to those pages from all the appropriate processors are updated to
+ * refer to the new location if establishing such a mapping is possible. Otherwise,
+ * those mappings are cleared.
+ *
+ * Note that this API is not required for functionality and only serves to improve performance
+ * by allowing the application to migrate data to a suitable location before it is accessed.
+ * Memory accesses to this range are always coherent and are allowed even when the data is
+ * actively being migrated.
+ *
+ * Note that this function is asynchronous with respect to the host and all work
+ * on other devices.
+ *
+ * \param devPtr    - Pointer to be prefetched
+ * \param count     - Size in bytes
+ * \param location  - Destination location to prefetch to
+ * \param flags     - Flags for future use, must be zero now.
+ * \param hStream   - Stream to enqueue prefetch operation
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync,
+ * ::cuMemcpy3DPeerAsync, ::cuMemAdvise, ::cuMemPrefetchAsync,
+ * ::cudaMemPrefetchAsync_v2
+ */
+CUresult CUDAAPI cuMemPrefetchAsync_v2(CUdeviceptr devPtr, size_t count, CUmemLocation location, unsigned int flags, CUstream hStream);
+
+/**
+ * \brief Advise about the usage of a given memory range
+ *
+ * Note there is a later version of this API, ::cuMemAdvise_v2. It will
+ * supplant this version in 13.0, which is retained for minor version compatibility.
+ * 
+ * Advise the Unified Memory subsystem about the usage pattern for the memory range
+ * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ * range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ * advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ * or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ * memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ * imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ * memory range results in an error being returned.
+ *
+ * The \p advice parameter can take the following values:
+ * - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ * from and only occasionally written to. Any read accesses from any processor to this region will create a
+ * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ * is called on this region, it will create a read-only copy of the data on the destination processor.
+ * If any processor writes to this region, all copies of the corresponding page will be invalidated
+ * except for the one where the write occurred. The \p device argument is ignored for this advice.
+ * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ * that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ * Also, if a context is created on a device that does not have the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ * all such contexts are destroyed.
+ * If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ * have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ * device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ * will not create a read-only copy when that device accesses this memory region.
+ *
+ * - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY:  Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ * copies of the data will be collapsed into a single copy. The location for the collapsed
+ * copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ * copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ *
+ * - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ * data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the
+ * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the
+ * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Setting the preferred location
+ * does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ * when a fault occurs on that memory region. If the data is already in its preferred location and the
+ * faulting processor can establish a mapping without requiring the data to be migrated, then
+ * data migration will be avoided. On the other hand, if the data is not in its preferred location
+ * or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ * it. It is important to note that setting the preferred location does not prevent data prefetching
+ * done using ::cuMemPrefetchAsync.
+ * Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ * Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ * if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice, unless read accesses from
+ * \p device will not result in a read-only copy being created on that device as outlined in description for
+ * the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
+ * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ *
+ * - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ * and changes the preferred location to none.
+ *
+ * - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device.
+ * Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. If \p device is a GPU, then
+ * the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ * This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ * it causes the data to always be mapped in the specified processor's page tables, as long as the
+ * location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ * the mappings are updated accordingly.
+ * This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ * over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ * migration may be too high. But preventing faults can still help improve performance, and so having
+ * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ * ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ * page in host memory.
+ * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice. Additionally, if the
+ * preferred location of this memory region or any subset of it is also \p device, then the policies
+ * associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
+ * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has
+ * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ * then this call has no effect.
+ *
+ * - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ * the data from \p device may be removed at any time causing accesses to result in non-fatal page faults.
+ * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
+ * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has
+ * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ * then this call has no effect.
+ *
+ * \param devPtr - Pointer to memory to set the advice for
+ * \param count  - Size in bytes of the memory range
+ * \param advice - Advice to be applied for the specified memory range
+ * \param device - Device to apply the advice for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync,
+ * ::cuMemcpy3DPeerAsync, ::cuMemPrefetchAsync, ::cuMemAdvise_v2,
+ * ::cudaMemAdvise
+ */
+CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device);
+
+/**
+ * \brief Advise about the usage of a given memory range
+ *
+ * Advise the Unified Memory subsystem about the usage pattern for the memory range
+ * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ * range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ * advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ * or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ * memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ * imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ * memory range results in an error being returned.
+ *
+ * The \p advice parameter can take the following values:
+ * - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ * from and only occasionally written to. Any read accesses from any processor to this region will create a
+ * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ * or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ * If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ * another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ * If any processor writes to this region, all copies of the corresponding page will be invalidated
+ * except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ * the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ * that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ * Also, if a context is created on a device that does not have the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ * all such contexts are destroyed.
+ * If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ * have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ * device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ * will not create a read-only copy when that device accesses this memory region.
+ *
+ * - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY:  Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ * copies of the data will be collapsed into a single copy. The location for the collapsed
+ * copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ * copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ * Note: The \p location argument is ignored for this advice.
+ *
+ * - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ * data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ * ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ * to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ * ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ * ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ * If ::CUmemLocation::type is a ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ * and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ * Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ * when a fault occurs on that memory region. If the data is already in its preferred location and the
+ * faulting processor can establish a mapping without requiring the data to be migrated, then
+ * data migration will be avoided. On the other hand, if the data is not in its preferred location
+ * or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ * it. It is important to note that setting the preferred location does not prevent data prefetching
+ * done using ::cuMemPrefetchAsync.
+ * Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ * Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ * if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice, unless read accesses from
+ * \p location will not result in a read-only copy being created on that processor as outlined in the description for
+ * the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ * If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE
+ * then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ *
+ * - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ * and changes the preferred location to none. The \p location argument is ignored for this advice.
+ *
+ * - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ * The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ * ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid.
+ * If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ * This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ * it causes the data to always be mapped in the specified processor's page tables, as long as the
+ * location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ * the mappings are updated accordingly.
+ * This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ * over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ * migration may be too high. But preventing faults can still help improve performance, and so having
+ * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ * ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ * page in host memory.
+ * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice. Additionally, if the
+ * preferred location of this memory region or any subset of it is also \p location, then the policies
+ * associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ * If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE
+ * then device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ * Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ * then this call has no effect.
+ *
+ * - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ * the data from \p location may be removed at any time causing accesses to result in non-fatal page faults.
+ * If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE
+ * then device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ * Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ * then this call has no effect.
+ *
+ * \param devPtr   - Pointer to memory to set the advice for
+ * \param count    - Size in bytes of the memory range
+ * \param advice   - Advice to be applied for the specified memory range
+ * \param location - location to apply the advice for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync,
+ * ::cuMemcpy3DPeerAsync, ::cuMemPrefetchAsync, ::cuMemAdvise,
+ * ::cudaMemAdvise
+ */
+CUresult CUDAAPI cuMemAdvise_v2(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUmemLocation location);
+
+/**
+ * \brief Query an attribute of a given memory range
+ *
+ * Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The
+ * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via
+ * __managed__ variables.
+ *
+ * The \p attribute parameter can take the following values:
+ * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: If this attribute is specified, \p data will be interpreted
+ * as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given
+ * memory range have read-duplication enabled, or 0 otherwise.
+ * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: If this attribute is specified, \p data will be
+ * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be a GPU device
+ * id if all pages in the memory range have that GPU as their preferred location, or it will be CU_DEVICE_CPU
+ * if all pages in the memory range have the CPU as their preferred location, or it will be CU_DEVICE_INVALID
+ * if either all the pages don't have the same preferred location or some of the pages don't have a
+ * preferred location at all. Note that the actual location of the pages in the memory range at the time of
+ * the query may be different from the preferred location.
+ * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: If this attribute is specified, \p data will be interpreted
+ * as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned
+ * will be a list of device ids that had ::CU_MEM_ADVISE_SET_ACCESSED_BY set for that entire memory range.
+ * If any device does not have that advice set for the entire memory range, that device will not be included.
+ * If \p data is larger than the number of devices that have that advice set for that memory range,
+ * CU_DEVICE_INVALID will be returned in all the extra space provided. For ex., if \p dataSize is 12
+ * (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be
+ * { 0, CU_DEVICE_INVALID, CU_DEVICE_INVALID }. If \p data is smaller than the number of devices that have
+ * that advice set, then only as many devices will be returned as can fit in the array. There is no
+ * guarantee on which specific devices will be returned, however.
+ * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION: If this attribute is specified, \p data will be
+ * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location
+ * to which all pages in the memory range were prefetched explicitly via ::cuMemPrefetchAsync. This will either be
+ * a GPU id or CU_DEVICE_CPU depending on whether the last location for prefetch was a GPU or the CPU
+ * respectively. If any page in the memory range was never explicitly prefetched or if all pages were not
+ * prefetched to the same location, CU_DEVICE_INVALID will be returned. Note that this simply returns the
+ * last location that the application requested to prefetch the memory range to. It gives no indication as to
+ * whether the prefetch operation to that location has completed or even begun.
+ * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE: If this attribute is specified, \p data will be
+ * interpreted as a ::CUmemLocationType, and \p dataSize must be sizeof(CUmemLocationType). The ::CUmemLocationType returned will be
+ * ::CU_MEM_LOCATION_TYPE_DEVICE if all pages in the memory range have the same GPU as their preferred location, or ::CUmemLocationType
+ * will be ::CU_MEM_LOCATION_TYPE_HOST if all pages in the memory range have the CPU as their preferred location, or it will be ::CU_MEM_LOCATION_TYPE_HOST_NUMA
+ * if all the pages in the memory range have the same host NUMA node ID as their preferred location or it will be ::CU_MEM_LOCATION_TYPE_INVALID
+ * if either all the pages don't have the same preferred location or some of the pages don't have a preferred location at all.
+ * Note that the actual location type of the pages in the memory range at the time of the query may be different from the preferred location type.
+ * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID: If this attribute is specified, \p data will be
+ * interpreted as a 32-bit integer, and \p dataSize must be 4. If the ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE query for the same address range
+ * returns ::CU_MEM_LOCATION_TYPE_DEVICE, it will be a valid device ordinal or if it returns ::CU_MEM_LOCATION_TYPE_HOST_NUMA, it will be a valid host NUMA node ID
+ * or if it returns any other location type, the id should be ignored.
+ * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE: If this attribute is specified, \p data will be
+ * interpreted as a ::CUmemLocationType, and \p dataSize must be sizeof(CUmemLocationType). The result returned will be the last location
+ * to which all pages in the memory range were prefetched explicitly via ::cuMemPrefetchAsync. The ::CUmemLocationType returned
+ * will be ::CU_MEM_LOCATION_TYPE_DEVICE if the last prefetch location was a GPU or ::CU_MEM_LOCATION_TYPE_HOST if it was the CPU or ::CU_MEM_LOCATION_TYPE_HOST_NUMA if
+ * the last prefetch location was a specific host NUMA node. If any page in the memory range was never explicitly prefetched or if all pages were not
+ * prefetched to the same location, ::CUmemLocationType will be ::CU_MEM_LOCATION_TYPE_INVALID.
+ * Note that this simply returns the last location type that the application requested to prefetch the memory range to. It gives no indication as to
+ * whether the prefetch operation to that location has completed or even begun.
+ *  - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID: If this attribute is specified, \p data will be
+ * interpreted as a 32-bit integer, and \p dataSize must be 4. If the ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE query for the same address range
+ * returns ::CU_MEM_LOCATION_TYPE_DEVICE, it will be a valid device ordinal or if it returns ::CU_MEM_LOCATION_TYPE_HOST_NUMA, it will be a valid host NUMA node ID
+ * or if it returns any other location type, the id should be ignored.
+ *
+ * \param data      - A pointer to a memory location where the result
+ *                    of the attribute query will be written to.
+ * \param dataSize  - The size of \p data in bytes
+ * \param attribute - The attribute to query
+ * \param devPtr    - Start of the range to query
+ * \param count     - Size of the range to query
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemRangeGetAttributes, ::cuMemPrefetchAsync,
+ * ::cuMemAdvise,
+ * ::cudaMemRangeGetAttribute
+ */
+CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr devPtr, size_t count);
+
+/**
+ * \brief Query attributes of a given memory range.
+ *
+ * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The
+ * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via
+ * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes
+ * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries.
+ * The results of the query will be stored in \p data.
+ *
+ * The list of supported attributes is given below. Please refer to ::cuMemRangeGetAttribute for
+ * attribute descriptions and restrictions.
+ *
+ * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY
+ * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION
+ * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY
+ * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION
+ * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE
+ * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID
+ * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE
+ * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID
+ *
+ * \param data          - A two-dimensional array containing pointers to memory
+ *                        locations where the result of each attribute query will be written to.
+ * \param dataSizes     - Array containing the sizes of each result
+ * \param attributes    - An array of attributes to query
+ *                        (numAttributes and the number of attributes in this array should match)
+ * \param numAttributes - Number of attributes to query
+ * \param devPtr        - Start of the range to query
+ * \param count         - Size of the range to query
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa ::cuMemRangeGetAttribute, ::cuMemAdvise,
+ * ::cuMemPrefetchAsync,
+ * ::cudaMemRangeGetAttributes
+ */
+CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count);
+
+/**
+ * \brief Set attributes on a previously allocated memory region
+ *
+ * The supported attributes are:
+ *
+ * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS:
+ *
+ *      A boolean attribute that can either be set (1) or unset (0). When set,
+ *      the region of memory that \p ptr points to is guaranteed to always synchronize
+ *      memory operations that are synchronous. If there are some previously initiated
+ *      synchronous memory operations that are pending when this attribute is set, the
+ *      function does not return until those memory operations are complete.
+ *      See further documentation in the section titled "API synchronization behavior"
+ *      to learn more about cases when synchronous memory operations can
+ *      exhibit asynchronous behavior.
+ *      \p value will be considered as a pointer to an unsigned integer to which this attribute is to be set.
+ *
+ * \param value     - Pointer to memory containing the value to be set
+ * \param attribute - Pointer attribute to set
+ * \param ptr       - Pointer to a memory region allocated using CUDA memory allocation APIs
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa ::cuPointerGetAttribute,
+ * ::cuPointerGetAttributes,
+ * ::cuMemAlloc,
+ * ::cuMemFree,
+ * ::cuMemAllocHost,
+ * ::cuMemFreeHost,
+ * ::cuMemHostAlloc,
+ * ::cuMemHostRegister,
+ * ::cuMemHostUnregister
+ */
+CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute attribute, CUdeviceptr ptr);
+
+/**
+ * \brief Returns information about a pointer.
+ *
+ * The supported attributes are (refer to ::cuPointerGetAttribute for attribute descriptions and restrictions):
+ *
+ * - ::CU_POINTER_ATTRIBUTE_CONTEXT
+ * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE
+ * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER
+ * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER
+ * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS
+ * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID
+ * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED
+ * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL
+ * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR
+ * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE
+ * - ::CU_POINTER_ATTRIBUTE_MAPPED
+ * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE
+ * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES
+ * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE
+ * - ::CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE
+ *
+ * \param numAttributes - Number of attributes to query
+ * \param attributes    - An array of attributes to query
+ *                      (numAttributes and the number of attributes in this array should match)
+ * \param data          - A two-dimensional array containing pointers to memory
+ *                      locations where the result of each attribute query will be written to.
+ * \param ptr           - Pointer to query
+ *
+ * Unlike ::cuPointerGetAttribute, this function will not return an error when the \p ptr
+ * encountered is not a valid CUDA pointer. Instead, the attributes are assigned default NULL values
+ * and ::CUDA_SUCCESS is returned.
+ *
+ * If \p ptr was not allocated by, mapped by, or registered with a ::CUcontext which uses UVA
+ * (Unified Virtual Addressing), ::CUDA_ERROR_INVALID_CONTEXT is returned.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuPointerGetAttribute,
+ * ::cuPointerSetAttribute,
+ * ::cudaPointerGetAttributes
+ */
+CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr);
+
+/** @} */ /* END CUDA_UNIFIED */
+
+/**
+ * \defgroup CUDA_STREAM Stream Management
+ *
+ * ___MANBRIEF___ stream management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the stream management functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Create a stream
+ *
+ * Creates a stream and returns a handle in \p phStream.  The \p Flags argument
+ * determines behaviors of the stream.
+ *
+ * Valid values for \p Flags are:
+ * - ::CU_STREAM_DEFAULT: Default stream creation flag.
+ * - ::CU_STREAM_NON_BLOCKING: Specifies that work running in the created
+ *   stream may run concurrently with work in stream 0 (the NULL stream), and that
+ *   the created stream should perform no implicit synchronization with stream 0.
+ *
+ * \param phStream - Returned newly created stream
+ * \param Flags    - Parameters for stream creation
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuStreamCreateWithPriority,
+ * ::cuGreenCtxStreamCreate,
+ * ::cuStreamGetPriority,
+ * ::cuStreamGetFlags,
+ * ::cuStreamGetDevice,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamCreate,
+ * ::cudaStreamCreateWithFlags
+ */
+CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags);
+
+/**
+ * \brief Create a stream with the given priority
+ *
+ * Creates a stream with the specified priority and returns a handle in \p phStream.
+ * This affects the scheduling priority of work in the stream. Priorities provide a
+ * hint to preferentially run work with higher priority when possible, but do
+ * not preempt already-running work or provide any other functional guarantee on
+ * execution order.
+ *
+ * \p priority follows a convention where lower numbers represent higher priorities.
+ * '0' represents default priority. The range of meaningful numerical priorities can
+ * be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is
+ * outside the numerical range returned by ::cuCtxGetStreamPriorityRange,
+ * it will automatically be clamped to the lowest or the highest number in the range.
+ *
+ * \param phStream    - Returned newly created stream
+ * \param flags       - Flags for stream creation. See ::cuStreamCreate for a list of
+ *                      valid flags
+ * \param priority    - Stream priority. Lower numbers represent higher priorities.
+ *                      See ::cuCtxGetStreamPriorityRange for more information about
+ *                      meaningful stream priorities that can be passed.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \note Stream priorities are supported only on GPUs
+ * with compute capability 3.5 or higher.
+ *
+ * \note In the current implementation, only compute kernels launched in
+ * priority streams are affected by the stream's priority. Stream priorities have
+ * no effect on host-to-device and device-to-host memory operations.
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuStreamCreate,
+ * ::cuGreenCtxStreamCreate,
+ * ::cuStreamGetPriority,
+ * ::cuCtxGetStreamPriorityRange,
+ * ::cuStreamGetFlags,
+ * ::cuStreamGetDevice,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamCreateWithPriority
+ */
+CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int flags, int priority);
+
+
+/**
+ * \brief Query the priority of a given stream
+ *
+ * Query the priority of a stream created using ::cuStreamCreate, ::cuStreamCreateWithPriority or ::cuGreenCtxStreamCreate
+ * and return the priority in \p priority. Note that if the stream was created with a
+ * priority outside the numerical range returned by ::cuCtxGetStreamPriorityRange,
+ * this function returns the clamped priority.
+ * See ::cuStreamCreateWithPriority for details about priority clamping.
+ *
+ * \param hStream    - Handle to the stream to be queried
+ * \param priority   - Pointer to a signed integer in which the stream's priority is returned
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuStreamCreate,
+ * ::cuStreamCreateWithPriority,
+ * ::cuGreenCtxStreamCreate,
+ * ::cuCtxGetStreamPriorityRange,
+ * ::cuStreamGetFlags,
+ * ::cuStreamGetDevice,
+ * ::cudaStreamGetPriority
+ */
+CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority);
+
+/**
+ * \brief Returns the device handle of the stream
+ *
+ * Returns in \p *device the device handle of the stream.
+ *
+ * \param hStream - Handle to the stream to be queried
+ * \param device  - Returns the device to which the stream belongs
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamDestroy,
+ * ::cuStreamCreate,
+ * ::cuGreenCtxStreamCreate,
+ * ::cuStreamGetFlags
+ */
+CUresult CUDAAPI cuStreamGetDevice(CUstream hStream, CUdevice *device);
+
+/**
+ * \brief Query the flags of a given stream
+ *
+ * Query the flags of a stream created using ::cuStreamCreate, ::cuStreamCreateWithPriority or ::cuGreenCtxStreamCreate
+ * and return the flags in \p flags.
+ *
+ * \param hStream    - Handle to the stream to be queried
+ * \param flags      - Pointer to an unsigned integer in which the stream's flags are returned.
+ *                     The value returned in \p flags is a logical 'OR' of all flags that
+ *                     were used while creating this stream. See ::cuStreamCreate for the list
+ *                     of valid flags
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuStreamCreate,
+ * ::cuGreenCtxStreamCreate,
+ * ::cuStreamGetPriority,
+ * ::cudaStreamGetFlags,
+ * ::cuStreamGetDevice
+ */
+CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags);
+
+/**
+ * \brief Returns the unique Id associated with the stream handle supplied
+ *
+ * Returns in \p streamId the unique Id which is associated with the given stream handle.
+ * The Id is unique for the life of the program.
+ * 
+ * The stream handle \p hStream can refer to any of the following:
+ * <ul>
+ *   <li>a stream created via any of the CUDA driver APIs such as ::cuStreamCreate
+ *   and ::cuStreamCreateWithPriority, or their runtime API equivalents such as
+ *   ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
+ *   Passing an invalid handle will result in undefined behavior.</li>
+ *   <li>any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and
+ *   ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted,
+ *   which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively.</li>
+ * </ul>
+ *
+ * \param hStream    - Handle to the stream to be queried
+ * \param streamId   - Pointer to store the Id of the stream
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuStreamCreate,
+ * ::cuStreamGetPriority,
+ * ::cudaStreamGetId
+ */
+CUresult CUDAAPI cuStreamGetId(CUstream hStream, unsigned long long *streamId);
+
+/**
+ * \brief Query the context associated with a stream
+ *
+ * Returns the CUDA context that the stream is associated with.
+ *
+ * Note there is a later version of this API, ::cuStreamGetCtx_v2. It will
+ * supplant this version in CUDA 13.0. It is recommended to use ::cuStreamGetCtx_v2
+ * till then as this version will return ::CUDA_ERROR_NOT_SUPPORTED for streams created via the API ::cuGreenCtxStreamCreate.
+ *
+ * The stream handle \p hStream can refer to any of the following:
+ * <ul>
+ *   <li>a stream created via any of the CUDA driver APIs such as ::cuStreamCreate
+ *   and ::cuStreamCreateWithPriority, or their runtime API equivalents such as
+ *   ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
+ *   The returned context is the context that was active in the calling thread when the
+ *   stream was created. Passing an invalid handle will result in undefined behavior.</li>
+ *   <li>any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and
+ *   ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted,
+ *   which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively.
+ *   Specifying any of the special handles will return the context current to the
+ *   calling thread. If no context is current to the calling thread,
+ *   ::CUDA_ERROR_INVALID_CONTEXT is returned.</li>
+ * </ul>
+ *
+ * \param hStream - Handle to the stream to be queried
+ * \param pctx    - Returned context associated with the stream
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuStreamCreateWithPriority,
+ * ::cuStreamGetPriority,
+ * ::cuStreamGetFlags,
+ * ::cuStreamGetDevice,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamCreate,
+ * ::cuStreamGetCtx_v2,
+ * ::cudaStreamCreateWithFlags
+ */
+CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx);
+
+/**
+ * \brief Query the contexts associated with a stream
+ *
+ * Returns the contexts that the stream is associated with.
+ *
+ * If the stream is associated with a green context, the API returns the green context in \p pGreenCtx
+ * and the primary context of the associated device in \p pCtx.
+ *
+ * If the stream is associated with a regular context, the API returns the regular context in \p pCtx
+ * and NULL in \p pGreenCtx.
+ *
+ * The stream handle \p hStream can refer to any of the following:
+ * <ul>
+ *   <li>a stream created via any of the CUDA driver APIs such as ::cuStreamCreate,
+ *   ::cuStreamCreateWithPriority and ::cuGreenCtxStreamCreate, or their runtime API equivalents such as
+ *   ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
+ *   Passing an invalid handle will result in undefined behavior.</li>
+ *   <li>any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and
+ *   ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted,
+ *   which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively.
+ *   If any of the special handles are specified, the API will operate on the context current to the
+ *   calling thread. If a green context (that was converted via ::cuCtxFromGreenCtx() before setting it current)
+ *   is current to the calling thread, the API will return the green context in \p pGreenCtx
+ *   and the primary context of the associated device in \p pCtx. If a regular context is current,
+ *   the API returns the regular context in \p pCtx and NULL in \p pGreenCtx.
+ *   Note that specifying ::CU_STREAM_PER_THREAD or ::cudaStreamPerThread will return ::CUDA_ERROR_INVALID_HANDLE
+ *   if a green context is current to the calling thread.
+ *   If no context is current to the calling thread, ::CUDA_ERROR_INVALID_CONTEXT is returned.</li>
+ * </ul>
+ *
+ * \param hStream   - Handle to the stream to be queried
+ * \param pCtx      - Returned regular context associated with the stream
+ * \param pGreenCtx - Returned green context if the stream is associated with a green context or NULL if not
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuStreamCreate,
+ * ::cuStreamCreateWithPriority,
+ * ::cuGreenCtxStreamCreate,
+ * ::cuStreamGetPriority,
+ * ::cuStreamGetFlags,
+ * ::cuStreamGetDevice,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamCreate,
+ * ::cudaStreamCreateWithFlags
+ */
+CUresult CUDAAPI cuStreamGetCtx_v2(CUstream hStream, CUcontext *pCtx, CUgreenCtx *pGreenCtx);
+
+/**
+ * \brief Make a compute stream wait on an event
+ *
+ * Makes all future work submitted to \p hStream wait for all work captured in
+ * \p hEvent.  See ::cuEventRecord() for details on what is captured by an event.
+ * The synchronization will be performed efficiently on the device when applicable.
+ * \p hEvent may be from a different context or device than \p hStream.
+ *
+ * Valid values for \p Flags are:
+ * - ::CU_EVENT_WAIT_DEFAULT: Default event wait flag.
+ * - ::CU_EVENT_WAIT_EXTERNAL: Event is captured in the graph as an external
+ *   event node when performing stream capture. This flag is invalid outside
+ *   of stream capture.
+ *
+ * \param hStream - Stream to wait
+ * \param hEvent  - Event to wait on (may not be NULL)
+ * \param Flags   - See ::CUevent_wait_flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreate,
+ * ::cuEventRecord,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cuStreamDestroy,
+ * ::cudaStreamWaitEvent
+ */
+CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
+
+/**
+ * \brief Add a callback to a compute stream
+ *
+ * \note This function is slated for eventual deprecation and removal. If
+ * you do not require the callback to execute in case of a device error,
+ * consider using ::cuLaunchHostFunc. Additionally, this function is not
+ * supported with ::cuStreamBeginCapture and ::cuStreamEndCapture, unlike
+ * ::cuLaunchHostFunc.
+ *
+ * Adds a callback to be called on the host after all currently enqueued
+ * items in the stream have completed.  For each
+ * cuStreamAddCallback call, the callback will be executed exactly once.
+ * The callback will block later work in the stream until it is finished.
+ *
+ * The callback may be passed ::CUDA_SUCCESS or an error code.  In the event
+ * of a device error, all subsequently executed callbacks will receive an
+ * appropriate ::CUresult.
+ *
+ * Callbacks must not make any CUDA API calls.  Attempting to use a CUDA API
+ * will result in ::CUDA_ERROR_NOT_PERMITTED.  Callbacks must not perform any
+ * synchronization that may depend on outstanding device work or other callbacks
+ * that are not mandated to run earlier.  Callbacks without a mandated order
+ * (in independent streams) execute in undefined order and may be serialized.
+ *
+ * For the purposes of Unified Memory, callback execution makes a number of
+ * guarantees:
+ * <ul>
+ *   <li>The callback stream is considered idle for the duration of the
+ *   callback.  Thus, for example, a callback may always use memory attached
+ *   to the callback stream.</li>
+ *   <li>The start of execution of a callback has the same effect as
+ *   synchronizing an event recorded in the same stream immediately prior to
+ *   the callback.  It thus synchronizes streams which have been "joined"
+ *   prior to the callback.</li>
+ *   <li>Adding device work to any stream does not have the effect of making
+ *   the stream active until all preceding host functions and stream callbacks
+ *   have executed.  Thus, for
+ *   example, a callback might use global attached memory even if work has
+ *   been added to another stream, if the work has been ordered behind the
+ *   callback with an event.</li>
+ *   <li>Completion of a callback does not cause a stream to become
+ *   active except as described above.  The callback stream will remain idle
+ *   if no device work follows the callback, and will remain idle across
+ *   consecutive callbacks without device work in between.  Thus, for example,
+ *   stream synchronization can be done by signaling from a callback at the
+ *   end of the stream.</li>
+ * </ul>
+ *
+ * \param hStream  - Stream to add callback to
+ * \param callback - The function to call once preceding stream operations are complete
+ * \param userData - User specified data to be passed to the callback function
+ * \param flags    - Reserved for future use, must be 0
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreate,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamDestroy,
+ * ::cuMemAllocManaged,
+ * ::cuStreamAttachMemAsync,
+ * ::cuLaunchHostFunc,
+ * ::cudaStreamAddCallback
+ */
+CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
+
+/**
+ * \brief Begins graph capture on a stream
+ *
+ * Begin graph capture on \p hStream. When a stream is in capture mode, all operations
+ * pushed into the stream will not be executed, but will instead be captured into
+ * a graph, which will be returned via ::cuStreamEndCapture. Capture may not be initiated
+ * if \p hStream is ::CU_STREAM_LEGACY. Capture must be ended on the same stream in which
+ * it was initiated, and it may only be initiated if the stream is not already in capture
+ * mode. The capture mode may be queried via ::cuStreamIsCapturing. A unique id
+ * representing the capture sequence may be queried via ::cuStreamGetCaptureInfo.
+ *
+ * If \p mode is not ::CU_STREAM_CAPTURE_MODE_RELAXED, ::cuStreamEndCapture must be
+ * called on this stream from the same thread.
+ *
+ * \param hStream - Stream in which to initiate capture
+ * \param mode    - Controls the interaction of this capture sequence with other API
+ *                  calls that are potentially unsafe. For more details see
+ *                  ::cuThreadExchangeStreamCaptureMode.
+ *
+ * \note Kernels captured using this API must not use texture and surface references.
+ *       Reading or writing through any texture or surface reference is undefined
+ *       behavior. This restriction does not apply to texture and surface objects.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamCreate,
+ * ::cuStreamIsCapturing,
+ * ::cuStreamEndCapture,
+ * ::cuThreadExchangeStreamCaptureMode
+ */
+CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream, CUstreamCaptureMode mode);
+
+/**
+ * \brief Begins graph capture on a stream to an existing graph
+ *
+ * Begin graph capture on \p hStream, placing new nodes into an existing graph. When a stream is 
+ * in capture mode, all operations pushed into the stream will not be executed, but will instead 
+ * be captured into \p hGraph. The graph will not be instantiable until the user calls 
+ * ::cuStreamEndCapture. 
+ *  
+ * Capture may not be initiated if \p hStream is ::CU_STREAM_LEGACY. Capture must be ended on the
+ * same stream in which it was initiated, and it may only be initiated if the stream is not 
+ * already in capture mode. The capture mode may be queried via ::cuStreamIsCapturing. A unique id
+ * representing the capture sequence may be queried via ::cuStreamGetCaptureInfo.
+ *
+ * If \p mode is not ::CU_STREAM_CAPTURE_MODE_RELAXED, ::cuStreamEndCapture must be
+ * called on this stream from the same thread.
+ *
+ * \param hStream         - Stream in which to initiate capture.
+ * \param hGraph          - Graph to capture into.
+ * \param dependencies    - Dependencies of the first node captured in the stream.  Can be NULL if numDependencies is 0.
+ * \param dependencyData  - Optional array of data associated with each dependency.
+ * \param numDependencies - Number of dependencies.
+ * \param mode            - Controls the interaction of this capture sequence with other API
+ *                          calls that are potentially unsafe. For more details see
+ *                          ::cuThreadExchangeStreamCaptureMode.
+ *
+ * \note Kernels captured using this API must not use texture and surface references.
+ *       Reading or writing through any texture or surface reference is undefined
+ *       behavior. This restriction does not apply to texture and surface objects.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamBeginCapture,
+ * ::cuStreamCreate,
+ * ::cuStreamIsCapturing,
+ * ::cuStreamEndCapture,
+ * ::cuThreadExchangeStreamCaptureMode,
+ * ::cuGraphAddNode
+ */
+CUresult CUDAAPI cuStreamBeginCaptureToGraph(CUstream hStream, CUgraph hGraph, const CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, CUstreamCaptureMode mode);
+
+/**
+ * \brief Swaps the stream capture interaction mode for a thread
+ *
+ * Sets the calling thread's stream capture interaction mode to the value contained
+ * in \p *mode, and overwrites \p *mode with the previous mode for the thread. To
+ * facilitate deterministic behavior across function or module boundaries, callers
+ * are encouraged to use this API in a push-pop fashion: \code
+     CUstreamCaptureMode mode = desiredMode;
+     cuThreadExchangeStreamCaptureMode(&mode);
+     ...
+     cuThreadExchangeStreamCaptureMode(&mode); // restore previous mode
+ * \endcode
+ *
+ * During stream capture (see ::cuStreamBeginCapture), some actions, such as a call
+ * to ::cudaMalloc, may be unsafe. In the case of ::cudaMalloc, the operation is
+ * not enqueued asynchronously to a stream, and is not observed by stream capture.
+ * Therefore, if the sequence of operations captured via ::cuStreamBeginCapture
+ * depended on the allocation being replayed whenever the graph is launched, the
+ * captured graph would be invalid.
+ *
+ * Therefore, stream capture places restrictions on API calls that can be made within
+ * or concurrently to a ::cuStreamBeginCapture-::cuStreamEndCapture sequence. This
+ * behavior can be controlled via this API and flags to ::cuStreamBeginCapture.
+ *
+ * A thread's mode is one of the following:
+ * - \p CU_STREAM_CAPTURE_MODE_GLOBAL: This is the default mode. If the local thread has
+ *   an ongoing capture sequence that was not initiated with
+ *   \p CU_STREAM_CAPTURE_MODE_RELAXED at \p cuStreamBeginCapture, or if any other thread
+ *   has a concurrent capture sequence initiated with \p CU_STREAM_CAPTURE_MODE_GLOBAL,
+ *   this thread is prohibited from potentially unsafe API calls.
+ * - \p CU_STREAM_CAPTURE_MODE_THREAD_LOCAL: If the local thread has an ongoing capture
+ *   sequence not initiated with \p CU_STREAM_CAPTURE_MODE_RELAXED, it is prohibited
+ *   from potentially unsafe API calls. Concurrent capture sequences in other threads
+ *   are ignored.
+ * - \p CU_STREAM_CAPTURE_MODE_RELAXED: The local thread is not prohibited from potentially
+ *   unsafe API calls. Note that the thread is still prohibited from API calls which
+ *   necessarily conflict with stream capture, for example, attempting ::cuEventQuery
+ *   on an event that was last recorded inside a capture sequence.
+ *
+ * \param mode - Pointer to mode value to swap with the current mode
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamBeginCapture
+ */
+CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode);
+
+/**
+ * \brief Ends capture on a stream, returning the captured graph
+ *
+ * End capture on \p hStream, returning the captured graph via \p phGraph.
+ * Capture must have been initiated on \p hStream via a call to ::cuStreamBeginCapture.
+ * If capture was invalidated, due to a violation of the rules of stream capture, then
+ * a NULL graph will be returned.
+ *
+ * If the \p mode argument to ::cuStreamBeginCapture was not
+ * ::CU_STREAM_CAPTURE_MODE_RELAXED, this call must be from the same thread as
+ * ::cuStreamBeginCapture.
+ *
+ * \param hStream - Stream on which to end capture
+ * \param phGraph - The captured graph
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamCreate,
+ * ::cuStreamBeginCapture,
+ * ::cuStreamIsCapturing,
+ * ::cuGraphDestroy
+ */
+CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph);
+
+/**
+ * \brief Returns a stream's capture status
+ *
+ * Return the capture status of \p hStream via \p captureStatus. After a successful
+ * call, \p *captureStatus will contain one of the following:
+ * - ::CU_STREAM_CAPTURE_STATUS_NONE: The stream is not capturing.
+ * - ::CU_STREAM_CAPTURE_STATUS_ACTIVE: The stream is capturing.
+ * - ::CU_STREAM_CAPTURE_STATUS_INVALIDATED: The stream was capturing but an error
+ *   has invalidated the capture sequence. The capture sequence must be terminated
+ *   with ::cuStreamEndCapture on the stream where it was initiated in order to
+ *   continue using \p hStream.
+ *
+ * Note that, if this is called on ::CU_STREAM_LEGACY (the "null stream") while
+ * a blocking stream in the same context is capturing, it will return
+ * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT and \p *captureStatus is unspecified
+ * after the call. The blocking stream capture is not invalidated.
+ *
+ * When a blocking stream is capturing, the legacy stream is in an
+ * unusable state until the blocking stream capture is terminated. The legacy
+ * stream is not supported for stream capture, but attempted use would have an
+ * implicit dependency on the capturing stream(s).
+ *
+ * \param hStream       - Stream to query
+ * \param captureStatus - Returns the stream's capture status
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamCreate,
+ * ::cuStreamBeginCapture,
+ * ::cuStreamEndCapture
+ */
+CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus);
+
+
+/**
+ * \brief Query a stream's capture state
+ *
+ * Query stream state related to stream capture.
+ *
+ * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created 
+ * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT.
+ *
+ * Valid data (other than capture status) is returned only if both of the following are true:
+ * - the call returns CUDA_SUCCESS
+ * - the returned capture status is ::CU_STREAM_CAPTURE_STATUS_ACTIVE
+ *
+ * \param hStream - The stream to query
+ * \param captureStatus_out - Location to return the capture status of the stream; required
+ * \param id_out - Optional location to return an id for the capture sequence, which is
+ *           unique over the lifetime of the process
+ * \param graph_out - Optional location to return the graph being captured into. All
+ *           operations other than destroy and node removal are permitted on the graph
+ *           while the capture sequence is in progress. This API does not transfer
+ *           ownership of the graph, which is transferred or destroyed at
+ *           ::cuStreamEndCapture. Note that the graph handle may be invalidated before
+ *           end of capture for certain errors. Nodes that are or become
+ *           unreachable from the original stream at ::cuStreamEndCapture due to direct
+ *           actions on the graph do not trigger ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED.
+ * \param dependencies_out - Optional location to store a pointer to an array of nodes.
+ *           The next node to be captured in the stream will depend on this set of nodes,
+ *           absent operations such as event wait which modify this set. The array pointer
+ *           is valid until the next API call which operates on the stream or until the
+ *           capture is terminated. The node handles may be copied out and are valid until
+ *           they or the graph is destroyed. The driver-owned array may also be passed
+ *           directly to APIs that operate on the graph (not the stream) without copying.
+ * \param numDependencies_out - Optional location to store the size of the array
+ *           returned in dependencies_out.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamGetCaptureInfo_v3,
+ * ::cuStreamBeginCapture,
+ * ::cuStreamIsCapturing,
+ * ::cuStreamUpdateCaptureDependencies
+ */
+CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out,
+        cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
+
+/**
+ * \brief Query a stream's capture state (12.3+)
+ *
+ * Query stream state related to stream capture.
+ *
+ * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created 
+ * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT.
+ *
+ * Valid data (other than capture status) is returned only if both of the following are true:
+ * - the call returns CUDA_SUCCESS
+ * - the returned capture status is ::CU_STREAM_CAPTURE_STATUS_ACTIVE
+ *
+ * If \p edgeData_out is non-NULL then \p dependencies_out must be as well. If
+ * \p dependencies_out is non-NULL and \p edgeData_out is NULL, but there is non-zero edge
+ * data for one or more of the current stream dependencies, the call will return
+ * ::CUDA_ERROR_LOSSY_QUERY.
+ *
+ * \param hStream - The stream to query
+ * \param captureStatus_out - Location to return the capture status of the stream; required
+ * \param id_out - Optional location to return an id for the capture sequence, which is
+ *           unique over the lifetime of the process
+ * \param graph_out - Optional location to return the graph being captured into. All
+ *           operations other than destroy and node removal are permitted on the graph
+ *           while the capture sequence is in progress. This API does not transfer
+ *           ownership of the graph, which is transferred or destroyed at
+ *           ::cuStreamEndCapture. Note that the graph handle may be invalidated before
+ *           end of capture for certain errors. Nodes that are or become
+ *           unreachable from the original stream at ::cuStreamEndCapture due to direct
+ *           actions on the graph do not trigger ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED.
+ * \param dependencies_out - Optional location to store a pointer to an array of nodes.
+ *           The next node to be captured in the stream will depend on this set of nodes,
+ *           absent operations such as event wait which modify this set. The array pointer
+ *           is valid until the next API call which operates on the stream or until the
+ *           capture is terminated. The node handles may be copied out and are valid until
+ *           they or the graph is destroyed. The driver-owned array may also be passed
+ *           directly to APIs that operate on the graph (not the stream) without copying.
+ * \param edgeData_out - Optional location to store a pointer to an array of graph edge
+ *           data. This array parallels \c dependencies_out; the next node to be added
+ *           has an edge to \c dependencies_out[i] with annotation \c edgeData_out[i] for
+ *           each \c i. The array pointer is valid until the next API call which operates
+ *           on the stream or until the capture is terminated.
+ * \param numDependencies_out - Optional location to store the size of the array
+ *           returned in dependencies_out.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT,
+ * ::CUDA_ERROR_LOSSY_QUERY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamGetCaptureInfo,
+ * ::cuStreamBeginCapture,
+ * ::cuStreamIsCapturing,
+ * ::cuStreamUpdateCaptureDependencies
+ */
+CUresult CUDAAPI cuStreamGetCaptureInfo_v3(CUstream hStream, CUstreamCaptureStatus *captureStatus_out,
+        cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out,
+        const CUgraphEdgeData **edgeData_out, size_t *numDependencies_out);
+
+/**
+ * \brief Update the set of dependencies in a capturing stream (11.3+)
+ *
+ * Modifies the dependency set of a capturing stream. The dependency set is the set
+ * of nodes that the next captured node in the stream will depend on.
+ *
+ * Valid flags are ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES and
+ * ::CU_STREAM_SET_CAPTURE_DEPENDENCIES. These control whether the set passed to
+ * the API is added to the existing set or replaces it. A flags value of 0 defaults
+ * to ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES.
+ *
+ * Nodes that are removed from the dependency set via this API do not result in
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED if they are unreachable from the stream at
+ * ::cuStreamEndCapture.
+ *
+ * Returns ::CUDA_ERROR_ILLEGAL_STATE if the stream is not capturing.
+ *
+ * This API is new in CUDA 11.3. Developers requiring compatibility across minor
+ * versions to CUDA 11.0 should either avoid this API or provide a fallback.
+ *
+ * \param hStream - The stream to update
+ * \param dependencies - The set of dependencies to add, or to replace the existing set with, depending on \p flags
+ * \param numDependencies - The size of the dependencies array
+ * \param flags - See above
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_ILLEGAL_STATE
+ *
+ * \sa
+ * ::cuStreamBeginCapture,
+ * ::cuStreamGetCaptureInfo
+ */
+CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
+
+/**
+ * \brief Update the set of dependencies in a capturing stream (12.3+)
+ *
+ * Modifies the dependency set of a capturing stream. The dependency set is the set
+ * of nodes that the next captured node in the stream will depend on along with the
+ * edge data for those dependencies.
+ *
+ * Valid flags are ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES and
+ * ::CU_STREAM_SET_CAPTURE_DEPENDENCIES. These control whether the set passed to
+ * the API is added to the existing set or replaces it. A flags value of 0 defaults
+ * to ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES.
+ *
+ * Nodes that are removed from the dependency set via this API do not result in
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED if they are unreachable from the stream at
+ * ::cuStreamEndCapture.
+ *
+ * Returns ::CUDA_ERROR_ILLEGAL_STATE if the stream is not capturing.
+ *
+ * \param hStream - The stream to update
+ * \param dependencies - The set of dependencies to add, or to replace the existing set with, depending on \p flags
+ * \param dependencyData - Optional array of data associated with each dependency.
+ * \param numDependencies - The size of the dependencies array
+ * \param flags - See above
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_ILLEGAL_STATE
+ *
+ * \sa
+ * ::cuStreamBeginCapture,
+ * ::cuStreamGetCaptureInfo
+ */
+CUresult CUDAAPI cuStreamUpdateCaptureDependencies_v2(CUstream hStream, CUgraphNode *dependencies,
+    const CUgraphEdgeData *dependencyData, size_t numDependencies, unsigned int flags);
+
+/**
+ * \brief Attach memory to a stream asynchronously
+ *
+ * Enqueues an operation in \p hStream to specify stream association of
+ * \p length bytes of memory starting from \p dptr. This function is a
+ * stream-ordered operation, meaning that it is dependent on, and will
+ * only take effect when, previous work in stream has completed. Any
+ * previous association is automatically replaced.
+ *
+ * \p dptr must point to one of the following types of memories:
+ * - managed memory declared using the __managed__ keyword or allocated with
+ *   ::cuMemAllocManaged.
+ * - a valid host-accessible region of system-allocated pageable memory. This
+ *   type of memory may only be specified if the device associated with the
+ *   stream reports a non-zero value for the device attribute
+ *   ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ *
+ * For managed allocations, \p length must be either zero or the entire
+ * allocation's size. Both indicate that the entire allocation's stream
+ * association is being changed. Currently, it is not possible to change stream
+ * association for a portion of a managed allocation.
+ *
+ * For pageable host allocations, \p length must be non-zero.
+ *
+ * The stream association is specified using \p flags which must be
+ * one of ::CUmemAttach_flags.
+ * If the ::CU_MEM_ATTACH_GLOBAL flag is specified, the memory can be accessed
+ * by any stream on any device.
+ * If the ::CU_MEM_ATTACH_HOST flag is specified, the program makes a guarantee
+ * that it won't access the memory on the device from any stream on a device that
+ * has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ * If the ::CU_MEM_ATTACH_SINGLE flag is specified and \p hStream is associated with
+ * a device that has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS,
+ * the program makes a guarantee that it will only access the memory on the device
+ * from \p hStream. It is illegal to attach singly to the NULL stream, because the
+ * NULL stream is a virtual global stream and not a specific stream. An error will
+ * be returned in this case.
+ *
+ * When memory is associated with a single stream, the Unified Memory system will
+ * allow CPU access to this memory region so long as all operations in \p hStream
+ * have completed, regardless of whether other streams are active. In effect,
+ * this constrains exclusive ownership of the managed memory region by
+ * an active GPU to per-stream activity instead of whole-GPU activity.
+ *
+ * Accessing memory on the device from streams that are not associated with
+ * it will produce undefined results. No error checking is performed by the
+ * Unified Memory system to ensure that kernels launched into other streams
+ * do not access this region.
+ *
+ * It is a program's responsibility to order calls to ::cuStreamAttachMemAsync
+ * via events, synchronization or other means to ensure legal access to memory
+ * at all times. Data visibility and coherency will be changed appropriately
+ * for all kernels which follow a stream-association change.
+ *
+ * If \p hStream is destroyed while data is associated with it, the association is
+ * removed and the association reverts to the default visibility of the allocation
+ * as specified at ::cuMemAllocManaged. For __managed__ variables, the default
+ * association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an
+ * asynchronous operation, and as a result, the change to default association won't
+ * happen until all work in the stream has completed.
+ *
+ * \param hStream - Stream in which to enqueue the attach operation
+ * \param dptr    - Pointer to memory (must be a pointer to managed memory or
+ *                  to a valid host-accessible region of system-allocated
+ *                  pageable memory)
+ * \param length  - Length of memory
+ * \param flags   - Must be one of ::CUmemAttach_flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreate,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamDestroy,
+ * ::cuMemAllocManaged,
+ * ::cudaStreamAttachMemAsync
+ */
+CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags);
+
+/**
+ * \brief Determine status of a compute stream
+ *
+ * Returns ::CUDA_SUCCESS if all operations in the stream specified by
+ * \p hStream have completed, or ::CUDA_ERROR_NOT_READY if not.
+ *
+ * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS
+ * is equivalent to having called ::cuStreamSynchronize().
+ *
+ * \param hStream - Stream to query status of
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_READY
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreate,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamDestroy,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamQuery
+ */
+CUresult CUDAAPI cuStreamQuery(CUstream hStream);
+
+/**
+ * \brief Wait until a stream's tasks are completed
+ *
+ * Waits until the device has completed all operations in the stream specified
+ * by \p hStream. If the context was created with the
+ * ::CU_CTX_SCHED_BLOCKING_SYNC flag, the CPU thread will block until the
+ * stream is finished with all of its tasks.
+ *
+ * \param hStream - Stream to wait for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ *
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreate,
+ * ::cuStreamDestroy,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamSynchronize
+ */
+CUresult CUDAAPI cuStreamSynchronize(CUstream hStream);
+
+/**
+ * \brief Destroys a stream
+ *
+ * Destroys the stream specified by \p hStream.
+ *
+ * In case the device is still doing work in the stream \p hStream
+ * when ::cuStreamDestroy() is called, the function will return immediately
+ * and the resources associated with \p hStream will be released automatically
+ * once the device has completed all work in \p hStream.
+ *
+ * \param hStream - Stream to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreate,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamDestroy
+ */
+CUresult CUDAAPI cuStreamDestroy(CUstream hStream);
+
+/**
+ * \brief Copies attributes from source stream to destination stream.
+ *
+ * Copies attributes from source stream \p src to destination stream \p dst.
+ * Both streams must have the same context.
+ *
+ * \param[out] dst Destination stream
+ * \param[in] src Source stream
+ *
+ * For the list of attributes, see ::CUstreamAttrID.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuStreamCopyAttributes(CUstream dst, CUstream src);
+
+/**
+ * \brief Queries stream attribute.
+ *
+ * Queries attribute \p attr from \p hStream and stores it in corresponding
+ * member of \p value_out.
+ *
+ * \param[in] hStream - Stream to query
+ * \param[in] attr - Attribute to query
+ * \param[out] value_out - Location to store the queried attribute value
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr,
+                                      CUstreamAttrValue *value_out);
+
+/**
+ * \brief Sets stream attribute.
+ *
+ * Sets attribute \p attr on \p hStream from corresponding attribute of
+ * \p value. The updated attribute will be applied to subsequent work
+ * submitted to the stream. It will not affect previously submitted work.
+ *
+ * \param[out] hStream - Stream on which to set the attribute
+ * \param[in] attr - Attribute to set
+ * \param[in] value - Value from which the attribute is set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr,
+                                      const CUstreamAttrValue *value);
+
+/** @} */ /* END CUDA_STREAM */
+
+
+/**
+ * \defgroup CUDA_EVENT Event Management
+ *
+ * ___MANBRIEF___ event management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the event management functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates an event
+ *
+ * Creates an event *phEvent for the current context with the flags specified via
+ * \p Flags. Valid flags include:
+ * - ::CU_EVENT_DEFAULT: Default event creation flag.
+ * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking
+ *   synchronization.  A CPU thread that uses ::cuEventSynchronize() to wait on
+ *   an event created with this flag will block until the event has actually
+ *   been recorded.
+ * - ::CU_EVENT_DISABLE_TIMING: Specifies that the created event does not need
+ *   to record timing data.  Events created with this flag specified and
+ *   the ::CU_EVENT_BLOCKING_SYNC flag not specified will provide the best
+ *   performance when used with ::cuStreamWaitEvent() and ::cuEventQuery().
+ * - ::CU_EVENT_INTERPROCESS: Specifies that the created event may be used as an
+ *   interprocess event by ::cuIpcGetEventHandle(). ::CU_EVENT_INTERPROCESS must
+ *   be specified along with ::CU_EVENT_DISABLE_TIMING.
+ *
+ * \param phEvent - Returns newly created event
+ * \param Flags   - Event creation flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa
+ * ::cuEventRecord,
+ * ::cuEventQuery,
+ * ::cuEventSynchronize,
+ * ::cuEventDestroy,
+ * ::cuEventElapsedTime,
+ * ::cudaEventCreate,
+ * ::cudaEventCreateWithFlags
+ */
+CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags);
+
+/**
+ * \brief Records an event
+ *
+ * Captures in \p hEvent the contents of \p hStream at the time of this call.
+ * \p hEvent and \p hStream must be from the same context otherwise
+ * ::CUDA_ERROR_INVALID_HANDLE is returned.
+ * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then
+ * examine or wait for completion of the work that was captured. Uses of
+ * \p hStream after this call do not modify \p hEvent. See note on default
+ * stream behavior for what is captured in the default case.
+ *
+ * ::cuEventRecord() can be called multiple times on the same event and
+ * will overwrite the previously captured state. Other APIs such as
+ * ::cuStreamWaitEvent() use the most recently captured state at the time
+ * of the API call, and are not affected by later calls to
+ * ::cuEventRecord(). Before the first call to ::cuEventRecord(), an
+ * event represents an empty set of work, so for example ::cuEventQuery()
+ * would return ::CUDA_SUCCESS.
+ *
+ * \param hEvent  - Event to record
+ * \param hStream - Stream to record event for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuEventCreate,
+ * ::cuEventQuery,
+ * ::cuEventSynchronize,
+ * ::cuStreamWaitEvent,
+ * ::cuEventDestroy,
+ * ::cuEventElapsedTime,
+ * ::cudaEventRecord,
+ * ::cuEventRecordWithFlags
+ */
+CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream);
+
+/**
+ * \brief Records an event
+ *
+ * Captures in \p hEvent the contents of \p hStream at the time of this call.
+ * \p hEvent and \p hStream must be from the same context otherwise
+ * ::CUDA_ERROR_INVALID_HANDLE is returned.
+ * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then
+ * examine or wait for completion of the work that was captured. Uses of
+ * \p hStream after this call do not modify \p hEvent. See note on default
+ * stream behavior for what is captured in the default case.
+ *
+ * ::cuEventRecordWithFlags() can be called multiple times on the same event and
+ * will overwrite the previously captured state. Other APIs such as
+ * ::cuStreamWaitEvent() use the most recently captured state at the time
+ * of the API call, and are not affected by later calls to
+ * ::cuEventRecordWithFlags(). Before the first call to ::cuEventRecordWithFlags(), an
+ * event represents an empty set of work, so for example ::cuEventQuery()
+ * would return ::CUDA_SUCCESS.
+ *
+ * flags include:
+ * - ::CU_EVENT_RECORD_DEFAULT: Default event record flag.
+ * - ::CU_EVENT_RECORD_EXTERNAL: Event is captured in the graph as an external
+ *   event node when performing stream capture. This flag is invalid outside
+ *   of stream capture.
+ *
+ * \param hEvent  - Event to record
+ * \param hStream - Stream to record event for
+ * \param flags   - See ::CUevent_record_flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuEventCreate,
+ * ::cuEventQuery,
+ * ::cuEventSynchronize,
+ * ::cuStreamWaitEvent,
+ * ::cuEventDestroy,
+ * ::cuEventElapsedTime,
+ * ::cuEventRecord,
+ * ::cudaEventRecord
+ */
+CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags);
+
+/**
+ * \brief Queries an event's status
+ *
+ * Queries the status of all work currently captured by \p hEvent. See
+ * ::cuEventRecord() for details on what is captured by an event.
+ *
+ * Returns ::CUDA_SUCCESS if all captured work has been completed, or
+ * ::CUDA_ERROR_NOT_READY if any captured work is incomplete.
+ *
+ * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS
+ * is equivalent to having called ::cuEventSynchronize().
+ *
+ * \param hEvent - Event to query
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_READY
+ * \notefnerr
+ *
+ * \sa ::cuEventCreate,
+ * ::cuEventRecord,
+ * ::cuEventSynchronize,
+ * ::cuEventDestroy,
+ * ::cuEventElapsedTime,
+ * ::cudaEventQuery
+ */
+CUresult CUDAAPI cuEventQuery(CUevent hEvent);
+
+/**
+ * \brief Waits for an event to complete
+ *
+ * Waits until the completion of all work currently captured in \p hEvent.
+ * See ::cuEventRecord() for details on what is captured by an event.
+ *
+ * Waiting for an event that was created with the ::CU_EVENT_BLOCKING_SYNC
+ * flag will cause the calling CPU thread to block until the event has
+ * been completed by the device.  If the ::CU_EVENT_BLOCKING_SYNC flag has
+ * not been set, then the CPU thread will busy-wait until the event has
+ * been completed by the device.
+ *
+ * \param hEvent - Event to wait for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuEventCreate,
+ * ::cuEventRecord,
+ * ::cuEventQuery,
+ * ::cuEventDestroy,
+ * ::cuEventElapsedTime,
+ * ::cudaEventSynchronize
+ */
+CUresult CUDAAPI cuEventSynchronize(CUevent hEvent);
+
+/**
+ * \brief Destroys an event
+ *
+ * Destroys the event specified by \p hEvent.
+ *
+ * An event may be destroyed before it is complete (i.e., while
+ * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY). In this case, the
+ * call does not block on completion of the event, and any associated
+ * resources will automatically be released asynchronously at completion.
+ *
+ * \param hEvent - Event to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuEventCreate,
+ * ::cuEventRecord,
+ * ::cuEventQuery,
+ * ::cuEventSynchronize,
+ * ::cuEventElapsedTime,
+ * ::cudaEventDestroy
+ */
+CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
+
+/**
+ * \brief Computes the elapsed time between two events
+ *
+ * Computes the elapsed time between two events (in milliseconds with a
+ * resolution of around 0.5 microseconds).
+ *
+ * If either event was last recorded in a non-NULL stream, the resulting time
+ * may be greater than expected (even if both used the same stream handle). This
+ * happens because the ::cuEventRecord() operation takes place asynchronously
+ * and there is no guarantee that the measured latency is actually just between
+ * the two events. Any number of other different stream operations could execute
+ * in between the two measured events, thus altering the timing in a significant
+ * way.
+ *
+ * If ::cuEventRecord() has not been called on either event then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called
+ * on both events but one or both of them has not yet been completed (that is,
+ * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the
+ * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with
+ * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return
+ * ::CUDA_ERROR_INVALID_HANDLE.
+ * 
+ * Note there is a later version of this API, ::cuEventElapsedTime_v2. It will
+ * supplant this version in CUDA 13.0; this version is retained for minor version compatibility.
+ *
+ * \param pMilliseconds - Time between \p hStart and \p hEnd in ms
+ * \param hStart        - Starting event
+ * \param hEnd          - Ending event
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_READY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuEventCreate,
+ * ::cuEventRecord,
+ * ::cuEventQuery,
+ * ::cuEventSynchronize,
+ * ::cuEventDestroy,
+ * ::cudaEventElapsedTime
+ */
+CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
+
+/**
+ * \brief Computes the elapsed time between two events
+ *
+ * Computes the elapsed time between two events (in milliseconds with a
+ * resolution of around 0.5 microseconds). Note this API is not guaranteed
+ * to return the latest errors for pending work. As such this API is intended to
+ * serve as an elapsed time calculation only and any polling for completion on the
+ * events to be compared should be done with ::cuEventQuery instead.
+ *
+ * If either event was last recorded in a non-NULL stream, the resulting time
+ * may be greater than expected (even if both used the same stream handle). This
+ * happens because the ::cuEventRecord() operation takes place asynchronously
+ * and there is no guarantee that the measured latency is actually just between
+ * the two events. Any number of other different stream operations could execute
+ * in between the two measured events, thus altering the timing in a significant
+ * way.
+ *
+ * If ::cuEventRecord() has not been called on either event then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called
+ * on both events but one or both of them has not yet been completed (that is,
+ * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the
+ * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with
+ * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return
+ * ::CUDA_ERROR_INVALID_HANDLE.
+ *
+ * \param pMilliseconds - Time between \p hStart and \p hEnd in ms
+ * \param hStart        - Starting event
+ * \param hEnd          - Ending event
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_READY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuEventCreate,
+ * ::cuEventRecord,
+ * ::cuEventQuery,
+ * ::cuEventSynchronize,
+ * ::cuEventDestroy,
+ * ::cudaEventElapsedTime
+ */
+CUresult CUDAAPI cuEventElapsedTime_v2(float *pMilliseconds, CUevent hStart, CUevent hEnd);
+
+/** @} */ /* END CUDA_EVENT */
+
+/**
+ * \defgroup CUDA_EXTRES_INTEROP External Resource Interoperability
+ *
+ * ___MANBRIEF___ External resource interoperability functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the external resource interoperability functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+ /**
+ * \brief Imports an external memory object
+ *
+ * Imports an externally allocated memory object and returns
+ * a handle to that in \p extMem_out.
+ *
+ * The properties of the handle being imported must be described in
+ * \p memHandleDesc. The ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC structure
+ * is defined as follows:
+ *
+ * \code
+        typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st {
+            CUexternalMemoryHandleType type;
+            union {
+                int fd;
+                struct {
+                    void *handle;
+                    const void *name;
+                } win32;
+                const void *nvSciBufObject;
+            } handle;
+            unsigned long long size;
+            unsigned int flags;
+        } CUDA_EXTERNAL_MEMORY_HANDLE_DESC;
+ * \endcode
+ *
+ * where ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type specifies the type
+ * of handle being imported. ::CUexternalMemoryHandleType is
+ * defined as:
+ *
+ * \code
+        typedef enum CUexternalMemoryHandleType_enum {
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD          = 1,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32       = 2,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT   = 3,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP         = 4,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE     = 5,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE     = 6,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF           = 8,
+        } CUexternalMemoryHandleType;
+ * \endcode
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, then
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::fd must be a valid
+ * file descriptor referencing a memory object. Ownership of
+ * the file descriptor is transferred to the CUDA driver when the
+ * handle is imported successfully. Performing any operations on the
+ * file descriptor after it is imported results in undefined behavior.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, then exactly one
+ * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be
+ * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * references a memory object. Ownership of this handle is
+ * not transferred to CUDA after the import operation, so the
+ * application must release the handle using the appropriate system
+ * call. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to a memory object.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT, then
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must
+ * be non-NULL and
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ * must be NULL. The handle specified must be a globally shared KMT
+ * handle. This handle does not hold a reference to the underlying
+ * object, and thus will be invalid when all references to the
+ * memory object are destroyed.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP, then exactly one
+ * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be
+ * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by ID3D12Device::CreateSharedHandle when referring to a
+ * ID3D12Heap object. This handle holds a reference to the underlying
+ * object. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to a ID3D12Heap object.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE, then exactly one
+ * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be
+ * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by ID3D12Device::CreateSharedHandle when referring to a
+ * ID3D12Resource object. This handle holds a reference to the
+ * underlying object. If
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to a ID3D12Resource object.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE, then
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must
+ * represent a valid shared NT handle that is returned by
+ * IDXGIResource1::CreateSharedHandle when referring to a
+ * ID3D11Resource object. If
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to a ID3D11Resource object.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT, then
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must
+ * represent a valid shared KMT handle that is returned by
+ * IDXGIResource::GetSharedHandle when referring to a
+ * ID3D11Resource object and
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ * must be NULL.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::nvSciBufObject must be non-NULL
+ * and reference a valid NvSciBuf object.
+ * If the NvSciBuf object imported into CUDA is also mapped by other drivers, then the
+ * application must use ::cuWaitExternalSemaphoresAsync or ::cuSignalExternalSemaphoresAsync
+ * as appropriate barriers to maintain coherence between CUDA and the other drivers.
+ * See ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC and ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC
+ * for memory synchronization.
+ *
+ *
+ * The size of the memory object must be specified in
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::size.
+ *
+ * Specifying the flag ::CUDA_EXTERNAL_MEMORY_DEDICATED in
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::flags indicates that the
+ * resource is a dedicated resource. The definition of what constitutes
+ * a dedicated resource is outside the scope of this extension.
+ * This flag must be set if ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type
+ * is one of the following:
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT
+ *
+ * \param extMem_out    - Returned handle to an external memory object
+ * \param memHandleDesc - Memory import handle descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OPERATING_SYSTEM
+ * \notefnerr
+ *
+ * \note If the Vulkan memory imported into CUDA is mapped on the CPU then the
+ * application must use vkInvalidateMappedMemoryRanges/vkFlushMappedMemoryRanges
+ * as well as appropriate Vulkan pipeline barriers to maintain coherence between
+ * CPU and GPU. For more information on these APIs, please refer to "Synchronization
+ * and Cache Control" chapter from Vulkan specification.
+ *
+ * \sa ::cuDestroyExternalMemory,
+ * ::cuExternalMemoryGetMappedBuffer,
+ * ::cuExternalMemoryGetMappedMipmappedArray
+ */
+CUresult CUDAAPI cuImportExternalMemory(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc);
+
+/**
+ * \brief Maps a buffer onto an imported memory object
+ *
+ * Maps a buffer onto an imported memory object and returns a device
+ * pointer in \p devPtr.
+ *
+ * The properties of the buffer being mapped must be described in
+ * \p bufferDesc. The ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC structure is
+ * defined as follows:
+ *
+ * \code
+        typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st {
+            unsigned long long offset;
+            unsigned long long size;
+            unsigned int flags;
+        } CUDA_EXTERNAL_MEMORY_BUFFER_DESC;
+ * \endcode
+ *
+ * where ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::offset is the offset in
+ * the memory object where the buffer's base address is.
+ * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::size is the size of the buffer.
+ * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::flags must be zero.
+ *
+ * The offset and size have to be suitably aligned to match the
+ * requirements of the external API. Mapping two buffers whose ranges
+ * overlap may or may not result in the same virtual address being
+ * returned for the overlapped portion. In such cases, the application
+ * must ensure that all accesses to that region from the GPU are
+ * volatile. Otherwise writes made via one address are not guaranteed
+ * to be visible via the other address, even if they're issued by the
+ * same thread. It is recommended that applications map the combined
+ * range instead of mapping separate buffers and then apply the
+ * appropriate offsets to the returned pointer to derive the
+ * individual buffers.
+ *
+ * The returned pointer \p devPtr must be freed using ::cuMemFree.
+ *
+ * \param devPtr     - Returned device pointer to buffer
+ * \param extMem     - Handle to external memory object
+ * \param bufferDesc - Buffer descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuImportExternalMemory,
+ * ::cuDestroyExternalMemory,
+ * ::cuExternalMemoryGetMappedMipmappedArray
+ */
+CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc);
+
+/**
+ * \brief Maps a CUDA mipmapped array onto an external memory object
+ *
+ * Maps a CUDA mipmapped array onto an external object and returns a
+ * handle to it in \p mipmap.
+ *
+ * The properties of the CUDA mipmapped array being mapped must be
+ * described in \p mipmapDesc. The structure
+ * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC is defined as follows:
+ *
+ * \code
+        typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st {
+            unsigned long long offset;
+            CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+            unsigned int numLevels;
+        } CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC;
+ * \endcode
+ *
+ * where ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::offset is the
+ * offset in the memory object where the base level of the mipmap
+ * chain is.
+ * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc describes
+ * the format, dimensions and type of the base level of the mipmap
+ * chain. For further details on these parameters, please refer to the
+ * documentation for ::cuMipmappedArrayCreate. Note that if the mipmapped
+ * array is bound as a color target in the graphics API, then the flag
+ * ::CUDA_ARRAY3D_COLOR_ATTACHMENT must be specified in
+ * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc::Flags.
+ * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels specifies
+ * the total number of levels in the mipmap chain.
+ *
+ * If \p extMem was imported from a handle of type ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then
+ * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels must be equal to 1.
+ *
+ *
+ * The returned CUDA mipmapped array must be freed using ::cuMipmappedArrayDestroy.
+ *
+ * \param mipmap     - Returned CUDA mipmapped array
+ * \param extMem     - Handle to external memory object
+ * \param mipmapDesc - CUDA array descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuImportExternalMemory,
+ * ::cuDestroyExternalMemory,
+ * ::cuExternalMemoryGetMappedBuffer
+ */
+CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc);
+
+/**
+ * \brief Destroys an external memory object.
+ *
+ * Destroys the specified external memory object. Any existing buffers
+ * and CUDA mipmapped arrays mapped onto this object must no longer be
+ * used and must be explicitly freed using ::cuMemFree and
+ * ::cuMipmappedArrayDestroy respectively.
+ *
+ * \param extMem - External memory object to be destroyed
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuImportExternalMemory,
+ * ::cuExternalMemoryGetMappedBuffer,
+ * ::cuExternalMemoryGetMappedMipmappedArray
+ */
+CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem);
+
+/**
+ * \brief Imports an external semaphore
+ *
+ * Imports an externally allocated synchronization object and returns
+ * a handle to that in \p extSem_out.
+ *
+ * The properties of the handle being imported must be described in
+ * \p semHandleDesc. The ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC is
+ * defined as follows:
+ *
+ * \code
+        typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st {
+            CUexternalSemaphoreHandleType type;
+            union {
+                int fd;
+                struct {
+                    void *handle;
+                    const void *name;
+                } win32;
+                const void* nvSciSyncObj;
+            } handle;
+            unsigned int flags;
+        } CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC;
+ * \endcode
+ *
+ * where ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type specifies the type of
+ * handle being imported. ::CUexternalSemaphoreHandleType is defined
+ * as:
+ *
+ * \code
+        typedef enum CUexternalSemaphoreHandleType_enum {
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD                = 1,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32             = 2,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT         = 3,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE              = 4,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE              = 5,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC                = 6,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX        = 7,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT    = 8,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD    = 9,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10
+        } CUexternalSemaphoreHandleType;
+ * \endcode
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid
+ * file descriptor referencing a synchronization object. Ownership of
+ * the file descriptor is transferred to the CUDA driver when the
+ * handle is imported successfully. Performing any operations on the
+ * file descriptor after it is imported results in undefined behavior.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, then exactly one
+ * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be
+ * NULL. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * references a synchronization object. Ownership of this handle is
+ * not transferred to CUDA after the import operation, so the
+ * application must release the handle using the appropriate system
+ * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle must
+ * be non-NULL and
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * must be NULL. The handle specified must be a globally shared KMT
+ * handle. This handle does not hold a reference to the underlying
+ * object, and thus will be invalid when all references to the
+ * synchronization object are destroyed.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then exactly one
+ * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be
+ * NULL. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by ID3D12Device::CreateSharedHandle when referring to a
+ * ID3D12Fence object. This handle holds a reference to the underlying
+ * object. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object that
+ * refers to a valid ID3D12Fence object.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * represents a valid shared NT handle that is returned by
+ * ID3D11Fence::CreateSharedHandle. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object that
+ * refers to a valid ID3D11Fence object.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::nvSciSyncObj
+ * represents a valid NvSciSyncObj.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * represents a valid shared NT handle that
+ * is returned by IDXGIResource1::CreateSharedHandle when referring to
+ * a IDXGIKeyedMutex object. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object that
+ * refers to a valid IDXGIKeyedMutex object.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * represents a valid shared KMT handle that
+ * is returned by IDXGIResource::GetSharedHandle when referring to
+ * a IDXGIKeyedMutex object and
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must be NULL.
+ * 
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid
+ * file descriptor referencing a synchronization object. Ownership of
+ * the file descriptor is transferred to the CUDA driver when the
+ * handle is imported successfully. Performing any operations on the
+ * file descriptor after it is imported results in undefined behavior.
+ * 
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, then exactly one
+ * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be
+ * NULL. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * references a synchronization object. Ownership of this handle is
+ * not transferred to CUDA after the import operation, so the
+ * application must release the handle using the appropriate system
+ * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object.
+ *
+ * \param extSem_out    - Returned handle to an external semaphore
+ * \param semHandleDesc - Semaphore import handle descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OPERATING_SYSTEM
+ * \notefnerr
+ *
+ * \sa ::cuDestroyExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuImportExternalSemaphore(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc);
+
+/**
+ * \brief Signals a set of external semaphore objects
+ *
+ * Enqueues a signal operation on a set of externally allocated
+ * semaphore objects in the specified stream. The operations will be
+ * executed when all prior operations in the stream complete.
+ *
+ * The exact semantics of signaling a semaphore depends on the type of
+ * the object.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
+ * then signaling the semaphore will set it to the signaled state.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32
+ * then the semaphore will be set to the value specified in
+ * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::fence::value.
+ *
+ * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC
+ * this API sets ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence
+ * to a value that can be used by subsequent waiters of the same NvSciSync object
+ * to order operations with those currently submitted in \p stream. Such an update
+ * will overwrite previous contents of
+ * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence. By default,
+ * signaling such an external semaphore object causes appropriate memory synchronization
+ * operations to be performed over all external memory objects that are imported as
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that any subsequent accesses
+ * made by other importers of the same set of NvSciBuf memory object(s) are coherent.
+ * These operations can be skipped by specifying the flag
+ * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC, which can be used as a
+ * performance optimization when data coherency is not required. But specifying this
+ * flag in scenarios where data coherency is required results in undefined behavior.
+ * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC,
+ * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in
+ * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_SIGNAL, this API will return
+ * CUDA_ERROR_NOT_SUPPORTED.
+ * NvSciSyncFence associated with semaphore object of the type 
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC can be deterministic. For this the 
+ * NvSciSyncAttrList used to create the semaphore object must have value of 
+ * NvSciSyncAttrKey_RequireDeterministicFences key set to true. Deterministic fences 
+ * allow users to enqueue a wait over the semaphore object even before corresponding
+ * signal is enqueued. For such a semaphore object, CUDA guarantees that each signal 
+ * operation will increment the fence value by '1'. Users are expected to track count 
+ * of signals enqueued on the semaphore object and insert waits accordingly. When such 
+ * a semaphore object is signaled from multiple streams, due to concurrent stream 
+ * execution, it is possible that the order in which the semaphore gets signaled is 
+ * non-deterministic. This could lead to waiters of the semaphore getting unblocked
+ * incorrectly. Users are expected to handle such situations, either by not using the 
+ * same semaphore object with deterministic fence support enabled in different streams 
+ * or by adding explicit dependency amongst such streams so that the semaphore is 
+ * signaled in order.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT
+ * then the keyed mutex will be released with the key specified in
+ * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::keyedmutex::key.
+ *
+ * \param extSemArray - Set of external semaphores to be signaled
+ * \param paramsArray - Array of semaphore parameters
+ * \param numExtSems  - Number of semaphores to signal
+ * \param stream      - Stream to enqueue the signal operations in
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuImportExternalSemaphore,
+ * ::cuDestroyExternalSemaphore,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream);
+
+/**
+ * \brief Waits on a set of external semaphore objects
+ *
+ * Enqueues a wait operation on a set of externally allocated
+ * semaphore objects in the specified stream. The operations will be
+ * executed when all prior operations in the stream complete.
+ *
+ * The exact semantics of waiting on a semaphore depends on the type
+ * of the object.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
+ * then waiting on the semaphore will wait until the semaphore reaches
+ * the signaled state. The semaphore will then be reset to the
+ * unsignaled state. Therefore for every signal operation, there can
+ * only be one wait operation.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32
+ * then waiting on the semaphore will wait until the value of the
+ * semaphore is greater than or equal to
+ * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::fence::value.
+ *
+ * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC
+ * then, waiting on the semaphore will wait until the
+ * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence is signaled by the
+ * signaler of the NvSciSyncObj that was associated with this semaphore object.
+ * By default, waiting on such an external semaphore object causes appropriate
+ * memory synchronization operations to be performed over all external memory objects
+ * that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that
+ * any subsequent accesses made by other importers of the same set of NvSciBuf memory
+ * object(s) are coherent. These operations can be skipped by specifying the flag
+ * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC, which can be used as a
+ * performance optimization when data coherency is not required. But specifying this
+ * flag in scenarios where data coherency is required results in undefined behavior.
+ * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC,
+ * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in
+ * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_WAIT, this API will return
+ * CUDA_ERROR_NOT_SUPPORTED.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT
+ * then the keyed mutex will be acquired when it is released with the key 
+ * specified in ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::key 
+ * or until the timeout specified by
+ * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::timeoutMs
+ * has lapsed. The timeout interval can either be a finite value
+ * specified in milliseconds or an infinite value. In case an infinite
+ * value is specified the timeout never elapses. The Windows INFINITE
+ * macro must be used to specify infinite timeout.
+ *
+ * \param extSemArray - External semaphores to be waited on
+ * \param paramsArray - Array of semaphore parameters
+ * \param numExtSems  - Number of semaphores to wait on
+ * \param stream      - Stream to enqueue the wait operations in
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_TIMEOUT
+ * \notefnerr
+ *
+ * \sa ::cuImportExternalSemaphore,
+ * ::cuDestroyExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream);
+
+/**
+ * \brief Destroys an external semaphore
+ *
+ * Destroys an external semaphore object and releases any references
+ * to the underlying resource. Any outstanding signals or waits must
+ * have completed before the semaphore is destroyed.
+ *
+ * \param extSem - External semaphore to be destroyed
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuImportExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem);
+
+/** @} */ /* END CUDA_EXTRES_INTEROP */
+
+/**
+ * \defgroup CUDA_MEMOP Stream Memory Operations
+ *
+ * ___MANBRIEF___ Stream memory operations of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the stream memory operations of the low-level CUDA
+ * driver application programming interface.
+ *
+ * Support for the ::CU_STREAM_WAIT_VALUE_NOR flag can be queried with
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2.
+ *
+ * Support for the ::cuStreamWriteValue64() and ::cuStreamWaitValue64()
+ * functions, as well as for the ::CU_STREAM_MEM_OP_WAIT_VALUE_64 and
+ * ::CU_STREAM_MEM_OP_WRITE_VALUE_64 flags, can be queried with
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS.
+ *
+ * Support for both ::CU_STREAM_WAIT_VALUE_FLUSH and
+ * ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES requires dedicated platform
+ * hardware features and can be queried with ::cuDeviceGetAttribute() and
+ * ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES.
+ *
+ * Note that all memory pointers passed as parameters to these operations
+ * are device pointers. Where necessary a device pointer should be
+ * obtained, for example with ::cuMemHostGetDevicePointer().
+ *
+ * None of the operations accepts pointers to managed memory buffers
+ * (::cuMemAllocManaged).
+ *
+ * \note
+ * Warning:
+ * Improper use of these APIs may deadlock the application. Synchronization 
+ * ordering established through these APIs is not visible to CUDA. CUDA tasks 
+ * that are (even indirectly) ordered by these APIs should also have that order
+ * expressed with CUDA-visible dependencies such as events. This ensures that 
+ * the scheduler does not serialize them in an improper order.
+ *
+ * @{
+ */
+
+/**
+ * \brief Wait on a memory location
+ *
+ * Enqueues a synchronization of the stream on the given memory location. Work
+ * ordered after the operation will block until the given condition on the
+ * memory is satisfied. By default, the condition is to wait for
+ * (int32_t)(*addr - value) >= 0, a cyclic greater-or-equal.
+ * Other condition types can be specified via \p flags.
+ *
+ * If the memory was registered via ::cuMemHostRegister(), the device pointer
+ * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot
+ * be used with managed memory (::cuMemAllocManaged).
+ *
+ * Support for ::CU_STREAM_WAIT_VALUE_NOR can be queried with ::cuDeviceGetAttribute() and
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2.
+ *
+ * \note
+ * Warning:
+ * Improper use of this API may deadlock the application. Synchronization 
+ * ordering established through this API is not visible to CUDA. CUDA tasks 
+ * that are (even indirectly) ordered by this API should also have that order
+ * expressed with CUDA-visible dependencies such as events. This ensures that 
+ * the scheduler does not serialize them in an improper order.
+ *
+ * \param stream The stream to synchronize on the memory location.
+ * \param addr The memory location to wait on.
+ * \param value The value to compare with the memory location.
+ * \param flags See ::CUstreamWaitValue_flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWaitValue64,
+ * ::cuStreamWriteValue32,
+ * ::cuStreamWriteValue64,
+ * ::cuStreamBatchMemOp,
+ * ::cuMemHostRegister,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+
+/**
+ * \brief Wait on a memory location
+ *
+ * Enqueues a synchronization of the stream on the given memory location. Work
+ * ordered after the operation will block until the given condition on the
+ * memory is satisfied. By default, the condition is to wait for
+ * (int64_t)(*addr - value) >= 0, a cyclic greater-or-equal.
+ * Other condition types can be specified via \p flags.
+ *
+ * If the memory was registered via ::cuMemHostRegister(), the device pointer
+ * should be obtained with ::cuMemHostGetDevicePointer().
+ *
+ * Support for this can be queried with ::cuDeviceGetAttribute() and
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS.
+ *
+ * \note
+ * Warning:
+ * Improper use of this API may deadlock the application. Synchronization 
+ * ordering established through this API is not visible to CUDA. CUDA tasks 
+ * that are (even indirectly) ordered by this API should also have that order
+ * expressed with CUDA-visible dependencies such as events. This ensures that 
+ * the scheduler does not serialize them in an improper order.
+ *
+ * \param stream The stream to synchronize on the memory location.
+ * \param addr The memory location to wait on.
+ * \param value The value to compare with the memory location.
+ * \param flags See ::CUstreamWaitValue_flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWaitValue32,
+ * ::cuStreamWriteValue32,
+ * ::cuStreamWriteValue64,
+ * ::cuStreamBatchMemOp,
+ * ::cuMemHostRegister,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+
+/**
+ * \brief Write a value to memory
+ *
+ * Write a value to memory.
+ *
+ * If the memory was registered via ::cuMemHostRegister(), the device pointer
+ * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot
+ * be used with managed memory (::cuMemAllocManaged).
+ *
+ * \param stream The stream to do the write in.
+ * \param addr The device address to write to.
+ * \param value The value to write.
+ * \param flags See ::CUstreamWriteValue_flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWriteValue64,
+ * ::cuStreamWaitValue32,
+ * ::cuStreamWaitValue64,
+ * ::cuStreamBatchMemOp,
+ * ::cuMemHostRegister,
+ * ::cuEventRecord
+ */
+CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+
+/**
+ * \brief Write a value to memory
+ *
+ * Write a value to memory.
+ *
+ * If the memory was registered via ::cuMemHostRegister(), the device pointer
+ * should be obtained with ::cuMemHostGetDevicePointer().
+ *
+ * Support for this can be queried with ::cuDeviceGetAttribute() and
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS.
+ *
+ * \param stream The stream to do the write in.
+ * \param addr The device address to write to.
+ * \param value The value to write.
+ * \param flags See ::CUstreamWriteValue_flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWriteValue32,
+ * ::cuStreamWaitValue32,
+ * ::cuStreamWaitValue64,
+ * ::cuStreamBatchMemOp,
+ * ::cuMemHostRegister,
+ * ::cuEventRecord
+ */
+CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+
+/**
+ * \brief Batch operations to synchronize the stream via memory operations
+ *
+ * This is a batch version of ::cuStreamWaitValue32() and ::cuStreamWriteValue32().
+ * Batching operations may avoid some performance overhead in both the API call
+ * and the device execution versus adding them to the stream in separate API
+ * calls. The operations are enqueued in the order they appear in the array.
+ *
+ * See ::CUstreamBatchMemOpType for the full set of supported operations, and
+ * ::cuStreamWaitValue32(), ::cuStreamWaitValue64(), ::cuStreamWriteValue32(),
+ * and ::cuStreamWriteValue64() for details of specific operations.
+ *
+ * See related APIs for details on querying support for specific operations.
+ *
+ * \note
+ * Warning:
+ * Improper use of this API may deadlock the application. Synchronization 
+ * ordering established through this API is not visible to CUDA. CUDA tasks 
+ * that are (even indirectly) ordered by this API should also have that order
+ * expressed with CUDA-visible dependencies such as events. This ensures that 
+ * the scheduler does not serialize them in an improper order. For more 
+ * information, see the Stream Memory Operations section in the programming 
+ * guide (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html).
+ *
+ * \param stream The stream to enqueue the operations in.
+ * \param count The number of operations in the array. Must be less than 256.
+ * \param paramArray The types and parameters of the individual operations.
+ * \param flags Reserved for future expansion; must be 0.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWaitValue32,
+ * ::cuStreamWaitValue64,
+ * ::cuStreamWriteValue32,
+ * ::cuStreamWriteValue64,
+ * ::cuMemHostRegister
+ */
+CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+
+/** @} */ /* END CUDA_MEMOP */
+
+/**
+ * \defgroup CUDA_EXEC Execution Control
+ *
+ * ___MANBRIEF___ execution control functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the execution control functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns information about a function
+ *
+ * Returns in \p *pi the integer value of the attribute \p attrib on the kernel
+ * given by \p hfunc. The supported attributes are:
+ * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads
+ *   per block, beyond which a launch of the function would fail. This number
+ *   depends on both the function and the device on which the function is
+ *   currently loaded.
+ * - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of
+ *   statically-allocated shared memory per block required by this function.
+ *   This does not include dynamically-allocated shared memory requested by
+ *   the user at runtime.
+ * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated
+ *   constant memory required by this function.
+ * - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory
+ *   used by each thread of this function.
+ * - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread
+ *   of this function.
+ * - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for
+ *   which the function was compiled. This value is the major PTX version * 10
+ *   + the minor PTX version, so a PTX version 1.3 function would return the
+ *   value 13. Note that this may return the undefined value of 0 for cubins
+ *   compiled prior to CUDA 3.0.
+ * - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for
+ *   which the function was compiled. This value is the major binary
+ *   version * 10 + the minor binary version, so a binary version 1.3 function
+ *   would return the value 13. Note that this will return a value of 10 for
+ *   legacy cubins that do not have a properly-encoded binary architecture
+ *   version.
+ * - ::CU_FUNC_ATTRIBUTE_CACHE_MODE_CA: The attribute to indicate whether the function has
+ *   been compiled with user specified option "-Xptxas --dlcm=ca" set.
+ * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of
+ *   dynamically-allocated shared memory.
+ * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1
+ *   cache split ratio in percent of total shared memory.
+ * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET: If this attribute is set, the
+ *   kernel must launch with a valid cluster size specified.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: The required cluster width in
+ *   blocks.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: The required cluster height in
+ *   blocks.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: The required cluster depth in
+ *   blocks.
+ * - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether
+ *   the function can be launched with non-portable cluster size. 1 is allowed,
+ *   0 is disallowed. A non-portable cluster size may only function on the
+ *   specific SKUs the program is tested on. The launch might fail if the
+ *   program is run on a different hardware platform. CUDA API provides
+ *   cudaOccupancyMaxActiveClusters to assist with checking whether the desired
+ *   size can be launched on the current device. A portable cluster size is
+ *   guaranteed to be functional on all compute capabilities higher than the
+ *   target compute capability. The portable cluster size for sm_90 is 8 blocks
+ *   per cluster. This value may increase for future compute capabilities. The
+ *   specific hardware unit may support higher cluster sizes that are not
+ *   guaranteed to be portable.
+ * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block
+ *   scheduling policy of a function. The value type is CUclusterSchedulingPolicy.
+ *
+ * With a few exceptions, function attributes may also be queried on unloaded
+ * function handles returned from ::cuModuleEnumerateFunctions.
+ * ::CUDA_ERROR_FUNCTION_NOT_LOADED is returned if the attribute requires a fully
+ * loaded function but the function is not loaded. The loading state of a function
+ * may be queried using ::cuFuncIsLoaded. ::cuFuncLoad may be called to explicitly
+ * load a function before querying the following attributes that require the function
+ * to be loaded:
+ * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
+ * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES
+ * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES
+ *
+ * \param pi     - Returned attribute value
+ * \param attrib - Attribute requested
+ * \param hfunc  - Function to query attribute of
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_FUNCTION_NOT_LOADED
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cuLaunchKernel,
+ * ::cudaFuncGetAttributes,
+ * ::cudaFuncSetAttribute,
+ * ::cuFuncIsLoaded,
+ * ::cuFuncLoad,
+ * ::cuKernelGetAttribute
+ */
+CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
+
+/**
+ * \brief Sets information about a function
+ *
+ * This call sets the value of a specified attribute \p attrib on the kernel given
+ * by \p hfunc to an integer value specified by \p value.
+ * This function returns CUDA_SUCCESS if the new value of the attribute could be
+ * successfully set. If the set fails, this call will return an error.
+ * Not all attributes can have values set. Attempting to set a value on a read-only
+ * attribute will result in an error (CUDA_ERROR_INVALID_VALUE).
+ *
+ * Supported attributes for the cuFuncSetAttribute call are:
+ * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of
+ *   dynamically-allocated shared memory. The value should contain the requested
+ *   maximum size of dynamically-allocated shared memory. The sum of this value and
+ *   the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the
+ *   device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN.
+ *   The maximal size of requestable dynamic shared memory may differ by GPU
+ *   architecture.
+ * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1
+ *   cache and shared memory use the same hardware resources, this sets the shared memory
+ *   carveout preference, in percent of the total shared memory. 
+ *   See ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR
+ *   This is only a hint, and the driver can choose a different ratio if required to execute the function.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: The required cluster width in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: The required cluster height in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: The required cluster depth in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
+ * - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether
+ *   the function can be launched with non-portable cluster size. 1 is allowed,
+ *   0 is disallowed.
+ * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block
+ *   scheduling policy of a function. The value type is CUclusterSchedulingPolicy.
+ *
+ * \param hfunc  - Function to set attribute of
+ * \param attrib - Attribute to set
+ * \param value  - The value to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cuLaunchKernel,
+ * ::cudaFuncGetAttributes,
+ * ::cudaFuncSetAttribute,
+ * ::cuKernelSetAttribute
+ */
+CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value);
+
+/**
+ * \brief Sets the preferred cache configuration for a device function
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p config the preferred cache configuration for
+ * the device function \p hfunc. This is only a preference. The driver will use
+ * the requested configuration if possible, but it is free to choose a different
+ * configuration if required to execute \p hfunc.  Any context-wide preference
+ * set via ::cuCtxSetCacheConfig() will be overridden by this per-function
+ * setting unless the per-function setting is ::CU_FUNC_CACHE_PREFER_NONE. In
+ * that case, the current context-wide setting will be used.
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ *
+ * The supported cache configurations are:
+ * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
+ * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
+ * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
+ * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
+ *
+ * \param hfunc  - Kernel to configure cache for
+ * \param config - Requested cache configuration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cuLaunchKernel,
+ * ::cudaFuncSetCacheConfig,
+ * ::cuKernelSetCacheConfig
+ */
+CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
+
+
+/**
+ * \brief Returns a module handle
+ *
+ * Returns in \p *hmod the handle of the module that function \p hfunc
+ * is located in. The lifetime of the module corresponds to the lifetime of
+ * the context it was loaded in or until the module is explicitly unloaded.
+ *
+ * The CUDA runtime manages its own modules loaded into the primary context.
+ * If the handle returned by this API refers to a module loaded by the CUDA runtime,
+ * calling ::cuModuleUnload() on that module will result in undefined behavior.
+ *
+ * \param hmod - Returned module handle
+ * \param hfunc   - Function to retrieve module for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \notefnerr
+ *
+ */
+CUresult CUDAAPI cuFuncGetModule(CUmodule *hmod, CUfunction hfunc);
+
+/**
+ * \brief Returns the function name for a ::CUfunction handle
+ *
+ * Returns in \p **name the function name associated with the function handle \p hfunc .
+ * The function name is returned as a null-terminated string. The returned name is only 
+ * valid when the function handle is valid. If the module is unloaded or reloaded, one 
+ * must call the API again to get the updated name. This API may return a mangled name if
+ * the function is not declared as having C linkage. If either \p **name or \p hfunc 
+ * is NULL, ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * \param name - The returned name of the function
+ * \param hfunc - The function handle to retrieve the name for 
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ */
+CUresult CUDAAPI cuFuncGetName(const char **name, CUfunction hfunc);
+
+/**
+ * \brief Returns the offset and size of a kernel parameter in the device-side parameter layout
+ *
+ * Queries the kernel parameter at \p paramIndex into \p func's list of parameters, and returns
+ * in \p paramOffset and \p paramSize the offset and size, respectively, where the parameter
+ * will reside in the device-side parameter layout. This information can be used to update kernel
+ * node parameters from the device via ::cudaGraphKernelNodeSetParam() and
+ * ::cudaGraphKernelNodeUpdatesApply(). \p paramIndex must be less than the number of parameters
+ * that \p func takes. \p paramSize can be set to NULL if only the parameter offset is desired.
+ *
+ * \param func        - The function to query
+ * \param paramIndex  - The parameter index to query
+ * \param paramOffset - Returns the offset into the device-side parameter layout at which the parameter resides
+ * \param paramSize   - Optionally returns the size of the parameter in the device-side parameter layout
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuKernelGetParamInfo
+ */
+CUresult CUDAAPI cuFuncGetParamInfo(CUfunction func, size_t paramIndex, size_t *paramOffset, size_t *paramSize);
+
+typedef enum CUfunctionLoadingState_enum {
+    CU_FUNCTION_LOADING_STATE_UNLOADED = 0, /**< Function is not loaded */
+    CU_FUNCTION_LOADING_STATE_LOADED = 1,   /**< Function is fully loaded */
+    CU_FUNCTION_LOADING_STATE_MAX           /**< Implicitly 2; presumably a count/sentinel rather than a real state - confirm */
+} CUfunctionLoadingState;                   /* Loading state of a ::CUfunction, as returned by ::cuFuncIsLoaded */
+
+/**
+ * \brief Returns if the function is loaded
+ *
+ * Returns in \p state the loading state of \p function.
+ *
+ * \param state - returned loading state
+ * \param function - the function to check
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuFuncLoad,
+ * ::cuModuleEnumerateFunctions
+ */
+CUresult CUDAAPI cuFuncIsLoaded(CUfunctionLoadingState *state, CUfunction function);
+
+/**
+ * \brief Loads a function
+ *
+ * Finalizes function loading for \p function. Calling this API with a
+ * fully loaded function has no effect.
+ *
+ * \param function - the function to load
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuModuleEnumerateFunctions,
+ * ::cuFuncIsLoaded
+ */
+CUresult CUDAAPI cuFuncLoad(CUfunction function);
+
+/**
+ * \brief Launches a CUDA function ::CUfunction or a CUDA kernel ::CUkernel
+ *
+ * Invokes the function ::CUfunction or the kernel ::CUkernel \p f
+ * on a \p gridDimX x \p gridDimY x \p gridDimZ grid of blocks.
+ * Each block contains \p blockDimX x \p blockDimY x
+ * \p blockDimZ threads.
+ *
+ * \p sharedMemBytes sets the amount of dynamic shared memory that will be
+ * available to each thread block.
+ *
+ * Kernel parameters to \p f can be specified in one of two ways:
+ *
+ * 1) Kernel parameters can be specified via \p kernelParams.  If \p f
+ * has N parameters, then \p kernelParams needs to be an array of N
+ * pointers.  Each of \p kernelParams[0] through \p kernelParams[N-1]
+ * must point to a region of memory from which the actual kernel
+ * parameter will be copied.  The number of kernel parameters and their
+ * offsets and sizes do not need to be specified as that information is
+ * retrieved directly from the kernel's image.
+ *
+ * 2) Kernel parameters can also be packaged by the application into
+ * a single buffer that is passed in via the \p extra parameter.
+ * This places the burden on the application of knowing each kernel
+ * parameter's size and alignment/padding within the buffer.  Here is
+ * an example of using the \p extra parameter in this manner:
+ * \code
+    size_t argBufferSize;
+    char argBuffer[256];
+
+    // populate argBuffer and argBufferSize
+
+    void *config[] = {
+        CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
+        CU_LAUNCH_PARAM_BUFFER_SIZE,    &argBufferSize,
+        CU_LAUNCH_PARAM_END
+    };
+    status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config);
+ * \endcode
+ *
+ * The \p extra parameter exists to allow ::cuLaunchKernel to take
+ * additional less commonly used arguments.  \p extra specifies a list of
+ * names of extra settings and their corresponding values.  Each extra
+ * setting name is immediately followed by the corresponding value.  The
+ * list must be terminated with either NULL or ::CU_LAUNCH_PARAM_END.
+ *
+ * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra
+ *   array;
+ * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next
+ *   value in \p extra will be a pointer to a buffer containing all
+ *   the kernel parameters for launching kernel \p f;
+ * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next
+ *   value in \p extra will be a pointer to a size_t containing the
+ *   size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER;
+ *
+ * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel
+ * parameters are specified with both \p kernelParams and \p extra
+ * (i.e. both \p kernelParams and \p extra are non-NULL).
+ *
+ * Calling ::cuLaunchKernel() invalidates the persistent function state
+ * set through the following deprecated APIs:
+ *  ::cuFuncSetBlockShape(),
+ *  ::cuFuncSetSharedSize(),
+ *  ::cuParamSetSize(),
+ *  ::cuParamSeti(),
+ *  ::cuParamSetf(),
+ *  ::cuParamSetv().
+ *
+ * Note that to use ::cuLaunchKernel(), the kernel \p f must either have
+ * been compiled with toolchain version 3.2 or later so that it will
+ * contain kernel parameter information, or have no kernel parameters.
+ * If either of these conditions is not met, then ::cuLaunchKernel() will
+ * return ::CUDA_ERROR_INVALID_IMAGE.
+ *
+ * Note that the API can also be used to launch context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to launch
+ * the kernel on will either be taken from the specified stream \p hStream
+ * or the current context in case of NULL stream.
+ *
+ * \param f              - Function ::CUfunction or Kernel ::CUkernel to launch
+ * \param gridDimX       - Width of grid in blocks
+ * \param gridDimY       - Height of grid in blocks
+ * \param gridDimZ       - Depth of grid in blocks
+ * \param blockDimX      - X dimension of each thread block
+ * \param blockDimY      - Y dimension of each thread block
+ * \param blockDimZ      - Z dimension of each thread block
+ * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes
+ * \param hStream        - Stream identifier
+ * \param kernelParams   - Array of pointers to kernel parameters
+ * \param extra          - Extra options
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_IMAGE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_LAUNCH_FAILED,
+ * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cudaLaunchKernel,
+ * ::cuLibraryGetKernel,
+ * ::cuKernelSetCacheConfig,
+ * ::cuKernelGetAttribute,
+ * ::cuKernelSetAttribute
+ */
+CUresult CUDAAPI cuLaunchKernel(CUfunction f,
+                                unsigned int gridDimX,
+                                unsigned int gridDimY,
+                                unsigned int gridDimZ,
+                                unsigned int blockDimX,
+                                unsigned int blockDimY,
+                                unsigned int blockDimZ,
+                                unsigned int sharedMemBytes,
+                                CUstream hStream,
+                                void **kernelParams,
+                                void **extra);
+
+/**
+ * \brief Launches a CUDA function ::CUfunction or a CUDA kernel ::CUkernel with launch-time configuration
+ *
+ * Invokes the function ::CUfunction or the kernel ::CUkernel \p f with the specified launch-time configuration
+ * \p config.
+ *
+ * The ::CUlaunchConfig structure is defined as:
+ *
+ * \code
+ *       typedef struct CUlaunchConfig_st {
+ *     unsigned int gridDimX;
+ *     unsigned int gridDimY;
+ *     unsigned int gridDimZ;
+ *     unsigned int blockDimX;
+ *     unsigned int blockDimY;
+ *     unsigned int blockDimZ;
+ *     unsigned int sharedMemBytes;
+ *     CUstream hStream;
+ *     CUlaunchAttribute *attrs;
+ *     unsigned int numAttrs;
+ * } CUlaunchConfig;
+ * \endcode
+ *
+ * where:
+ * - ::CUlaunchConfig::gridDimX is the width of the grid in blocks.
+ * - ::CUlaunchConfig::gridDimY is the height of the grid in blocks.
+ * - ::CUlaunchConfig::gridDimZ is the depth of the grid in blocks.
+ * - ::CUlaunchConfig::blockDimX is the X dimension of each thread block.
+ * - ::CUlaunchConfig::blockDimY is the Y dimension of each thread block.
+ * - ::CUlaunchConfig::blockDimZ is the Z dimension of each thread block.
+ * - ::CUlaunchConfig::sharedMemBytes is the dynamic shared-memory size per
+ *   thread block in bytes.
+ * - ::CUlaunchConfig::hStream is the handle to the stream to perform the launch
+ *   in. The CUDA context associated with this stream must match that associated
+ *   with function f.
+ * - ::CUlaunchConfig::attrs is an array of ::CUlaunchConfig::numAttrs
+ *   contiguous ::CUlaunchAttribute elements. The value of this pointer is not
+ *   considered if ::CUlaunchConfig::numAttrs is zero. However, in that case, it
+ *   is recommended to set the pointer to NULL.
+ * - ::CUlaunchConfig::numAttrs is the number of attributes populating the
+ *   first ::CUlaunchConfig::numAttrs positions of the ::CUlaunchConfig::attrs
+ *   array.
+ *
+ * Launch-time configuration is specified by adding entries to
+ * ::CUlaunchConfig::attrs. Each entry is an attribute ID and a corresponding
+ * attribute value.
+ *
+ * The ::CUlaunchAttribute structure is defined as:
+ * \code
+ *       typedef struct CUlaunchAttribute_st {
+ *     CUlaunchAttributeID id;
+ *     CUlaunchAttributeValue value;
+ * } CUlaunchAttribute;
+ * \endcode
+ * where:
+ * - ::CUlaunchAttribute::id is a unique enum identifying the attribute.
+ * - ::CUlaunchAttribute::value is a union that holds the attribute value.
+ *
+ * An example of using the \p config parameter:
+ * \code
+ *       CUlaunchAttribute coopAttr = {.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE,
+ *                               .value = 1};
+ * CUlaunchConfig config = {... // set block and grid dimensions
+ *                        .attrs = &coopAttr,
+ *                        .numAttrs = 1};
+ *
+ * cuLaunchKernelEx(&config, kernel, NULL, NULL);
+ * \endcode
+ *
+ * The ::CUlaunchAttributeID enum is defined as:
+ * \code
+ *       typedef enum CUlaunchAttributeID_enum {
+ *     CU_LAUNCH_ATTRIBUTE_IGNORE = 0,
+ *     CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW   = 1,
+ *     CU_LAUNCH_ATTRIBUTE_COOPERATIVE            = 2,
+ *     CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY = 3,
+ *     CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION                    = 4,
+ *     CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 5,
+ *     CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION    = 6,
+ *     CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT                   = 7,
+ *     CU_LAUNCH_ATTRIBUTE_PRIORITY               = 8,
+ *     CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP    = 9,
+ *     CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN        = 10,
+ *     CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION = 11,
+ *     CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT = 12,
+ *     CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE = 13,
+ * } CUlaunchAttributeID;
+ * \endcode
+ *
+ * and the corresponding ::CUlaunchAttributeValue union as :
+ * \code
+ *       typedef union CUlaunchAttributeValue_union {
+ *     CUaccessPolicyWindow accessPolicyWindow;
+ *     int cooperative;
+ *     CUsynchronizationPolicy syncPolicy;
+ *     struct {
+ *         unsigned int x;
+ *         unsigned int y;
+ *         unsigned int z;
+ *     } clusterDim;
+ *     CUclusterSchedulingPolicy clusterSchedulingPolicyPreference;
+ *     int programmaticStreamSerializationAllowed;
+ *     struct {
+ *         CUevent event;
+ *         int flags;
+ *         int triggerAtBlockStart;
+ *     } programmaticEvent;
+ *     int priority;
+ *     CUlaunchMemSyncDomainMap memSyncDomainMap;
+ *     CUlaunchMemSyncDomain memSyncDomain;
+ *     struct {
+ *         unsigned int x;
+ *         unsigned int y;
+ *         unsigned int z;
+ *     } preferredClusterDim;
+ *     struct {
+ *         CUevent event;
+ *         int flags;
+ *     } launchCompletionEvent;
+ *     struct {
+ *         int deviceUpdatable;
+ *         CUgraphDeviceNode devNode;
+ *     } deviceUpdatableKernelNode;
+ * } CUlaunchAttributeValue;
+ * \endcode
+ *
+ * Setting ::CU_LAUNCH_ATTRIBUTE_COOPERATIVE to a non-zero value causes the
+ * kernel launch to be a cooperative launch, with exactly the same usage and
+ * semantics of ::cuLaunchCooperativeKernel.
+ *
+ * Setting ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION to a non-zero
+ * value causes the kernel to use programmatic means to resolve its stream
+ * dependency -- enabling the CUDA runtime to opportunistically allow the grid's
+ * execution to overlap with the previous kernel in the stream, if that kernel
+ * requests the overlap.
+ *
+ * ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT records an event along with the
+ * kernel launch. An event recorded through this launch attribute is guaranteed to
+ * only trigger after all blocks in the associated kernel trigger the event. A
+ * block can trigger the event through PTX launchdep.release or CUDA builtin
+ * function cudaTriggerProgrammaticLaunchCompletion(). A trigger can also be
+ * inserted at the beginning of each block's execution if triggerAtBlockStart is
+ * set to non-0. Note that dependents (including the CPU thread calling
+ * cuEventSynchronize()) are not guaranteed to observe the release precisely
+ * when it is released. For example, cuEventSynchronize() may only observe the
+ * event trigger long after the associated kernel has completed. This recording
+ * type is primarily meant for establishing programmatic dependency between
+ * device tasks. The event supplied must not be an interprocess or interop
+ * event. The event must disable timing (i.e. created with
+ * ::CU_EVENT_DISABLE_TIMING flag set).
+ *
+ * ::CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT records an event along with
+ * the kernel launch. Nominally, the event is triggered once all blocks of the
+ * kernel have begun execution. Currently this is a best effort. If a kernel B
+ * has a launch completion dependency on a kernel A, B may wait until A is
+ * complete. Alternatively, blocks of B may begin before all blocks of A have
+ * begun, for example:
+ *
+ *  - If B can claim execution resources unavailable to A, for example if they
+ *    run on different GPUs.
+ *  - If B is a higher priority than A.
+ *
+ * Exercise caution if such an ordering inversion could lead to deadlock. The
+ * event supplied must not be an interprocess or interop event. The event must
+ * disable timing (i.e. must be created with the ::CU_EVENT_DISABLE_TIMING flag
+ * set).
+ *
+ * Setting ::CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE to 1
+ * on a captured launch causes the resulting kernel node to be device-updatable.
+ * This attribute is specific to graphs, and passing it to a launch in a
+ * non-capturing stream results in an error. Passing a value other than 0 or 1 is
+ * not allowed.
+ *
+ * On success, a handle will be returned via
+ * ::CUlaunchAttributeValue::deviceUpdatableKernelNode::devNode which can be passed
+ * to the various device-side update functions to update the node's kernel parameters
+ * from within another kernel. For more information on the types of device updates
+ * that can be made, as well as the relevant limitations thereof, see
+ * ::cudaGraphKernelNodeUpdatesApply.
+ *
+ * Kernel nodes which are device-updatable have additional restrictions compared to regular
+ * kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via
+ * ::cuGraphDestroyNode. Additionally, once opted-in to this functionality, a node cannot
+ * opt out, and any attempt to set the attribute to 0 will result in an error. Graphs
+ * containing one or more device-updatable nodes also do not allow multiple instantiation.
+ *
+ * ::CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION allows the kernel launch to
+ * specify a preferred substitute cluster dimension. Blocks may be grouped
+ * according to either the dimensions specified with this attribute (grouped
+ * into a "preferred substitute cluster"), or the one specified with
+ * ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION attribute (grouped into a "regular
+ * cluster"). The cluster dimensions of a "preferred substitute cluster" shall
+ * be an integer multiple greater than zero of the regular cluster dimensions.
+ * The device will attempt - on a best-effort basis - to group thread blocks
+ * into preferred clusters over grouping them into regular clusters. When it
+ * deems necessary (primarily when the device temporarily runs out of physical
+ * resources to launch the larger preferred clusters), the device may switch to
+ * launch the regular clusters instead to attempt to utilize as much of the
+ * physical device resources as possible.
+ *
+ * Each type of cluster will have its enumeration / coordinate setup as if the
+ * grid consists solely of its type of cluster. For example, if the preferred
+ * substitute cluster dimensions double the regular cluster dimensions, there
+ * might be simultaneously a regular cluster indexed at (1,0,0), and a preferred
+ * cluster indexed at (1,0,0). In this example, the preferred substitute cluster
+ * (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their
+ * blocks.
+ *
+ * This attribute will only take effect when a regular cluster dimension has
+ * been specified. The preferred substitute cluster
+ * dimension must be an integer multiple greater than zero of the regular
+ * cluster dimension and must divide the grid. It must also be no more than
+ * `maxBlocksPerCluster`, if it is set in the kernel's `__launch_bounds__`.
+ * Otherwise it must be less than the maximum value the driver can support.
+ * Otherwise, setting this attribute to a value physically unable to fit on any
+ * particular device is permitted.
+ *
+ * The effect of other attributes is consistent with their effect when set via
+ * persistent APIs.
+ *
+ * See ::cuStreamSetAttribute for
+ * - ::CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW
+ * - ::CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY
+ *
+ * See ::cuFuncSetAttribute for
+ * - ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
+ * - ::CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE
+ *
+ * Kernel parameters to \p f can be specified in the same ways that they can be
+ * using ::cuLaunchKernel.
+ *
+ * Note that the API can also be used to launch context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to launch
+ * the kernel on will either be taken from the specified stream ::CUlaunchConfig::hStream
+ * or the current context in case of NULL stream.
+ *
+ * \param config         - Config to launch
+ * \param f              - Function ::CUfunction or Kernel ::CUkernel to launch
+ * \param kernelParams   - Array of pointers to kernel parameters
+ * \param extra          - Extra options
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_IMAGE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_LAUNCH_FAILED,
+ * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cudaLaunchKernel,
+ * ::cudaLaunchKernelEx,
+ * ::cuLibraryGetKernel,
+ * ::cuKernelSetCacheConfig,
+ * ::cuKernelGetAttribute,
+ * ::cuKernelSetAttribute
+ */
+CUresult CUDAAPI cuLaunchKernelEx(const CUlaunchConfig *config,
+                                  CUfunction f,
+                                  void **kernelParams,
+                                  void **extra);
+
+/**
+ * \brief Launches a CUDA function ::CUfunction or a CUDA kernel ::CUkernel where thread blocks
+ * can cooperate and synchronize as they execute
+ *
+ * Invokes the function ::CUfunction or the kernel ::CUkernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ
+ * grid of blocks. Each block contains \p blockDimX x \p blockDimY x
+ * \p blockDimZ threads.
+ *
+ * \p sharedMemBytes sets the amount of dynamic shared memory that will be
+ * available to each thread block.
+ *
+ * The device on which this kernel is invoked must have a non-zero value for
+ * the device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH.
+ *
+ * The total number of blocks launched cannot exceed the maximum number of blocks per
+ * multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or
+ * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
+ * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT.
+ *
+ * The kernel cannot make use of CUDA dynamic parallelism.
+ *
+ * Kernel parameters must be specified via \p kernelParams.  If \p f
+ * has N parameters, then \p kernelParams needs to be an array of N
+ * pointers.  Each of \p kernelParams[0] through \p kernelParams[N-1]
+ * must point to a region of memory from which the actual kernel
+ * parameter will be copied.  The number of kernel parameters and their
+ * offsets and sizes do not need to be specified as that information is
+ * retrieved directly from the kernel's image.
+ *
+ * Calling ::cuLaunchCooperativeKernel() sets persistent function state that is
+ * the same as function state set through ::cuLaunchKernel API.
+ *
+ * When the kernel \p f is launched via ::cuLaunchCooperativeKernel(), the previous
+ * block shape, shared size and parameter info associated with \p f
+ * is overwritten.
+ *
+ * Note that to use ::cuLaunchCooperativeKernel(), the kernel \p f must either have
+ * been compiled with toolchain version 3.2 or later so that it will
+ * contain kernel parameter information, or have no kernel parameters.
+ * If either of these conditions is not met, then ::cuLaunchCooperativeKernel() will
+ * return ::CUDA_ERROR_INVALID_IMAGE.
+ *
+ * Note that the API can also be used to launch context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to launch
+ * the kernel on will either be taken from the specified stream \p hStream
+ * or the current context in case of NULL stream.
+ *
+ * \param f              - Function ::CUfunction or Kernel ::CUkernel to launch
+ * \param gridDimX       - Width of grid in blocks
+ * \param gridDimY       - Height of grid in blocks
+ * \param gridDimZ       - Depth of grid in blocks
+ * \param blockDimX      - X dimension of each thread block
+ * \param blockDimY      - Y dimension of each thread block
+ * \param blockDimZ      - Z dimension of each thread block
+ * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes
+ * \param hStream        - Stream identifier
+ * \param kernelParams   - Array of pointers to kernel parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_IMAGE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_LAUNCH_FAILED,
+ * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cuLaunchCooperativeKernelMultiDevice,
+ * ::cudaLaunchCooperativeKernel,
+ * ::cuLibraryGetKernel,
+ * ::cuKernelSetCacheConfig,
+ * ::cuKernelGetAttribute,
+ * ::cuKernelSetAttribute
+ */
+CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f,
+                                unsigned int gridDimX,
+                                unsigned int gridDimY,
+                                unsigned int gridDimZ,
+                                unsigned int blockDimX,
+                                unsigned int blockDimY,
+                                unsigned int blockDimZ,
+                                unsigned int sharedMemBytes,
+                                CUstream hStream,
+                                void **kernelParams);
+
+/**
+ * \brief Launches CUDA functions on multiple devices where thread blocks can cooperate and synchronize as they execute
+ *
+ * \deprecated This function is deprecated as of CUDA 11.3.
+ *
+ * Invokes kernels as specified in the \p launchParamsList array where each element
+ * of the array specifies all the parameters required to perform a single kernel launch.
+ * These kernels can cooperate and synchronize as they execute. The size of the array is
+ * specified by \p numDevices.
+ *
+ * No two kernels can be launched on the same device. All the devices targeted by this
+ * multi-device launch must be identical. All devices must have a non-zero value for the
+ * device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH.
+ *
+ * All kernels launched must be identical with respect to the compiled code. Note that
+ * any __device__, __constant__ or __managed__ variables present in the module that owns
+ * the kernel launched on each device, are independently instantiated on every device.
+ * It is the application's responsibility to ensure these variables are initialized and
+ * used appropriately.
+ *
+ * The size of the grids as specified in blocks, the size of the blocks themselves
+ * and the amount of shared memory used by each thread block must also match across
+ * all launched kernels.
+ *
+ * The streams used to launch these kernels must have been created via either ::cuStreamCreate
+ * or ::cuStreamCreateWithPriority. The NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD
+ * cannot be used.
+ *
+ * The total number of blocks launched per kernel cannot exceed the maximum number of blocks
+ * per multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or
+ * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
+ * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. Since the
+ * total number of blocks launched per device has to match across all devices, the maximum
+ * number of blocks that can be launched per device will be limited by the device with the
+ * least number of multiprocessors.
+ *
+ * The kernels cannot make use of CUDA dynamic parallelism.
+ *
+ * The ::CUDA_LAUNCH_PARAMS structure is defined as:
+ * \code
+        typedef struct CUDA_LAUNCH_PARAMS_st
+        {
+            CUfunction function;
+            unsigned int gridDimX;
+            unsigned int gridDimY;
+            unsigned int gridDimZ;
+            unsigned int blockDimX;
+            unsigned int blockDimY;
+            unsigned int blockDimZ;
+            unsigned int sharedMemBytes;
+            CUstream hStream;
+            void **kernelParams;
+        } CUDA_LAUNCH_PARAMS;
+ * \endcode
+ * where:
+ * - ::CUDA_LAUNCH_PARAMS::function specifies the kernel to be launched. All functions must
+ *   be identical with respect to the compiled code.
+ *   Note that you can also specify context-less kernel ::CUkernel by querying the handle
+ *   using ::cuLibraryGetKernel() and then casting to ::CUfunction. In this case, the context to
+ *   launch the kernel on will be taken from the specified stream ::CUDA_LAUNCH_PARAMS::hStream.
+ * - ::CUDA_LAUNCH_PARAMS::gridDimX is the width of the grid in blocks. This must match across
+ *   all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::gridDimY is the height of the grid in blocks. This must match across
+ *   all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::gridDimZ is the depth of the grid in blocks. This must match across
+ *   all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::blockDimX is the X dimension of each thread block. This must match across
+ *   all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::blockDimY is the Y dimension of each thread block. This must match across
+ *   all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::blockDimZ is the Z dimension of each thread block. This must match across
+ *   all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::sharedMemBytes is the dynamic shared-memory size per thread block in bytes.
+ *   This must match across all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::hStream is the handle to the stream to perform the launch in. This cannot
+ *   be the NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD. The CUDA context associated
+ *   with this stream must match that associated with ::CUDA_LAUNCH_PARAMS::function.
+ * - ::CUDA_LAUNCH_PARAMS::kernelParams is an array of pointers to kernel parameters. If
+ *   ::CUDA_LAUNCH_PARAMS::function has N parameters, then ::CUDA_LAUNCH_PARAMS::kernelParams
+ *   needs to be an array of N pointers. Each of ::CUDA_LAUNCH_PARAMS::kernelParams[0] through
+ *   ::CUDA_LAUNCH_PARAMS::kernelParams[N-1] must point to a region of memory from which the actual
+ *   kernel parameter will be copied. The number of kernel parameters and their offsets and sizes
+ *   do not need to be specified as that information is retrieved directly from the kernel's image.
+ *
+ * By default, the kernel won't begin execution on any GPU until all prior work in all the specified
+ * streams has completed. This behavior can be overridden by specifying the flag
+ * ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC. When this flag is specified, each kernel
+ * will only wait for prior work in the stream corresponding to that GPU to complete before it begins
+ * execution.
+ *
+ * Similarly, by default, any subsequent work pushed in any of the specified streams will not begin
+ * execution until the kernels on all GPUs have completed. This behavior can be overridden by specifying
+ * the flag ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC. When this flag is specified,
+ * any subsequent work pushed in any of the specified streams will only wait for the kernel launched
+ * on the GPU corresponding to that stream to complete before it begins execution.
+ *
+ * Calling ::cuLaunchCooperativeKernelMultiDevice() sets persistent function state that is
+ * the same as function state set through ::cuLaunchKernel API when called individually for each
+ * element in \p launchParamsList.
+ *
+ * When kernels are launched via ::cuLaunchCooperativeKernelMultiDevice(), the previous
+ * block shape, shared size and parameter info associated with each ::CUDA_LAUNCH_PARAMS::function
+ * in \p launchParamsList is overwritten.
+ *
+ * Note that to use ::cuLaunchCooperativeKernelMultiDevice(), the kernels must either have
+ * been compiled with toolchain version 3.2 or later so that they will
+ * contain kernel parameter information, or have no kernel parameters.
+ * If either of these conditions is not met, then ::cuLaunchCooperativeKernelMultiDevice() will
+ * return ::CUDA_ERROR_INVALID_IMAGE.
+ *
+ * \param launchParamsList - List of launch parameters, one per device
+ * \param numDevices       - Size of the \p launchParamsList array
+ * \param flags            - Flags to control launch behavior
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_IMAGE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_LAUNCH_FAILED,
+ * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cuLaunchCooperativeKernel,
+ * ::cudaLaunchCooperativeKernelMultiDevice
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices, unsigned int flags);
+
+/**
+ * \brief Enqueues a host function call in a stream
+ *
+ * Enqueues a host function to run in a stream.  The function will be called
+ * after currently enqueued work and will block work added after it.
+ *
+ * The host function must not make any CUDA API calls.  Attempting to use a
+ * CUDA API may result in ::CUDA_ERROR_NOT_PERMITTED, but this is not required.
+ * The host function must not perform any synchronization that may depend on
+ * outstanding CUDA work not mandated to run earlier.  Host functions without a
+ * mandated order (such as in independent streams) execute in undefined order
+ * and may be serialized.
+ *
+ * For the purposes of Unified Memory, execution makes a number of guarantees:
+ * <ul>
+ *   <li>The stream is considered idle for the duration of the function's
+ *   execution.  Thus, for example, the function may always use memory attached
+ *   to the stream it was enqueued in.</li>
+ *   <li>The start of execution of the function has the same effect as
+ *   synchronizing an event recorded in the same stream immediately prior to
+ *   the function.  It thus synchronizes streams which have been "joined"
+ *   prior to the function.</li>
+ *   <li>Adding device work to any stream does not have the effect of making
+ *   the stream active until all preceding host functions and stream callbacks
+ *   have executed.  Thus, for
+ *   example, a function might use global attached memory even if work has
+ *   been added to another stream, if the work has been ordered behind the
+ *   function call with an event.</li>
+ *   <li>Completion of the function does not cause a stream to become
+ *   active except as described above.  The stream will remain idle
+ *   if no device work follows the function, and will remain idle across
+ *   consecutive host functions or stream callbacks without device work in
+ *   between.  Thus, for example,
+ *   stream synchronization can be done by signaling from a host function at the
+ *   end of the stream.</li>
+ * </ul>
+ *
+ * Note that, in contrast to ::cuStreamAddCallback, the function will not be
+ * called in the event of an error in the CUDA context.
+ *
+ * \param hStream  - Stream to enqueue function call in
+ * \param fn       - The function to call once preceding stream operations are complete
+ * \param userData - User-specified data to be passed to the function
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreate,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamDestroy,
+ * ::cuMemAllocManaged,
+ * ::cuStreamAttachMemAsync,
+ * ::cuStreamAddCallback
+ */
+CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData);
+
+/** @} */ /* END CUDA_EXEC */
+
+/**
+ * \defgroup CUDA_EXEC_DEPRECATED Execution Control [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated execution control functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the deprecated execution control functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Sets the block-dimensions for the function
+ *
+ * \deprecated
+ *
+ * Specifies the \p x, \p y, and \p z dimensions of the thread blocks that are
+ * created when the kernel given by \p hfunc is launched.
+ *
+ * \param hfunc - Kernel to specify dimensions of
+ * \param x     - X dimension
+ * \param y     - Y dimension
+ * \param z     - Z dimension
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetSharedSize,
+ * ::cuFuncSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSeti,
+ * ::cuParamSetf,
+ * ::cuParamSetv,
+ * ::cuLaunch,
+ * ::cuLaunchGrid,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z);
+
+/**
+ * \brief Sets the dynamic shared-memory size for the function
+ *
+ * \deprecated
+ *
+ * Sets through \p bytes the amount of dynamic shared memory that will be
+ * available to each thread block when the kernel given by \p hfunc is launched.
+ *
+ * \param hfunc - Kernel to specify dynamic shared-memory size for
+ * \param bytes - Dynamic shared-memory size per thread block in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSeti,
+ * ::cuParamSetf,
+ * ::cuParamSetv,
+ * ::cuLaunch,
+ * ::cuLaunchGrid,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes);
+
+/**
+ * \brief Sets the parameter size for the function
+ *
+ * \deprecated
+ *
+ * Sets through \p numbytes the total size in bytes needed by the function
+ * parameters of the kernel corresponding to \p hfunc.
+ *
+ * \param hfunc    - Kernel to set parameter size for
+ * \param numbytes - Size of parameter list in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetSharedSize,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetf,
+ * ::cuParamSeti,
+ * ::cuParamSetv,
+ * ::cuLaunch,
+ * ::cuLaunchGrid,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes);
+
+/**
+ * \brief Adds an integer parameter to the function's argument list
+ *
+ * \deprecated
+ *
+ * Sets an integer parameter that will be specified the next time the
+ * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset.
+ *
+ * \param hfunc  - Kernel to add parameter to
+ * \param offset - Offset to add parameter to argument list
+ * \param value  - Value of parameter
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetSharedSize,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSetf,
+ * ::cuParamSetv,
+ * ::cuLaunch,
+ * ::cuLaunchGrid,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value);
+
+/**
+ * \brief Adds a floating-point parameter to the function's argument list
+ *
+ * \deprecated
+ *
+ * Sets a floating-point parameter that will be specified the next time the
+ * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset.
+ *
+ * \param hfunc  - Kernel to add parameter to
+ * \param offset - Offset to add parameter to argument list
+ * \param value  - Value of parameter
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetSharedSize,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSeti,
+ * ::cuParamSetv,
+ * ::cuLaunch,
+ * ::cuLaunchGrid,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value);
+
+/**
+ * \brief Adds arbitrary data to the function's argument list
+ *
+ * \deprecated
+ *
+ * Copies an arbitrary amount of data (specified in \p numbytes) from \p ptr
+ * into the parameter space of the kernel corresponding to \p hfunc. \p offset
+ * is a byte offset.
+ *
+ * \param hfunc    - Kernel to add data to
+ * \param offset   - Offset to add data to argument list
+ * \param ptr      - Pointer to arbitrary data
+ * \param numbytes - Size of data to copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetSharedSize,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSetf,
+ * ::cuParamSeti,
+ * ::cuLaunch,
+ * ::cuLaunchGrid,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
+
+/**
+ * \brief Launches a CUDA function
+ *
+ * \deprecated
+ *
+ * Invokes the kernel \p f on a 1 x 1 x 1 grid of blocks. The block
+ * contains the number of threads specified by a previous call to
+ * ::cuFuncSetBlockShape().
+ *
+ * The block shape, dynamic shared memory size, and parameter information
+ * must be set using
+ *  ::cuFuncSetBlockShape(),
+ *  ::cuFuncSetSharedSize(),
+ *  ::cuParamSetSize(),
+ *  ::cuParamSeti(),
+ *  ::cuParamSetf(), and
+ *  ::cuParamSetv()
+ * prior to calling this function.
+ *
+ * Launching a function via ::cuLaunchKernel() invalidates the function's
+ * block shape, dynamic shared memory size, and parameter information. After
+ * launching via cuLaunchKernel, this state must be re-initialized prior to
+ * calling this function. Failure to do so results in undefined behavior.
+ *
+ * \param f - Kernel to launch
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_LAUNCH_FAILED,
+ * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetSharedSize,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSetf,
+ * ::cuParamSeti,
+ * ::cuParamSetv,
+ * ::cuLaunchGrid,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f);
+
+/**
+ * \brief Launches a CUDA function
+ *
+ * \deprecated
+ *
+ * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of
+ * blocks. Each block contains the number of threads specified by a previous
+ * call to ::cuFuncSetBlockShape().
+ *
+ * The block shape, dynamic shared memory size, and parameter information
+ * must be set using
+ *  ::cuFuncSetBlockShape(),
+ *  ::cuFuncSetSharedSize(),
+ *  ::cuParamSetSize(),
+ *  ::cuParamSeti(),
+ *  ::cuParamSetf(), and
+ *  ::cuParamSetv()
+ * prior to calling this function.
+ *
+ * Launching a function via ::cuLaunchKernel() invalidates the function's
+ * block shape, dynamic shared memory size, and parameter information. After
+ * launching via cuLaunchKernel, this state must be re-initialized prior to
+ * calling this function. Failure to do so results in undefined behavior.
+ *
+ * \param f           - Kernel to launch
+ * \param grid_width  - Width of grid in blocks
+ * \param grid_height - Height of grid in blocks
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_LAUNCH_FAILED,
+ * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetSharedSize,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSetf,
+ * ::cuParamSeti,
+ * ::cuParamSetv,
+ * ::cuLaunch,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height);
+
+/**
+ * \brief Launches a CUDA function
+ *
+ * \deprecated
+ *
+ * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of
+ * blocks. Each block contains the number of threads specified by a previous
+ * call to ::cuFuncSetBlockShape().
+ *
+ * The block shape, dynamic shared memory size, and parameter information
+ * must be set using
+ *  ::cuFuncSetBlockShape(),
+ *  ::cuFuncSetSharedSize(),
+ *  ::cuParamSetSize(),
+ *  ::cuParamSeti(),
+ *  ::cuParamSetf(), and
+ *  ::cuParamSetv()
+ * prior to calling this function.
+ *
+ * Launching a function via ::cuLaunchKernel() invalidates the function's
+ * block shape, dynamic shared memory size, and parameter information. After
+ * launching via cuLaunchKernel, this state must be re-initialized prior to
+ * calling this function. Failure to do so results in undefined behavior.
+ *
+ * \param f           - Kernel to launch
+ * \param grid_width  - Width of grid in blocks
+ * \param grid_height - Height of grid in blocks
+ * \param hStream     - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_LAUNCH_FAILED,
+ * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
+ *
+ * \note In certain cases where cubins are created with no ABI (i.e., using \p ptxas \p --abi-compile \p no),
+ *       this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by
+ *       growing the per-thread stack as needed per launch and not shrinking it afterwards.
+ *
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetSharedSize,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSetf,
+ * ::cuParamSeti,
+ * ::cuParamSetv,
+ * ::cuLaunch,
+ * ::cuLaunchGrid,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream);
+
+
+/**
+ * \brief Adds a texture-reference to the function's argument list
+ *
+ * \deprecated
+ *
+ * Makes the CUDA array or linear memory bound to the texture reference
+ * \p hTexRef available to a device program as a texture. In this version of
+ * CUDA, the texture-reference must be obtained via ::cuModuleGetTexRef() and
+ * the \p texunit parameter must be set to ::CU_PARAM_TR_DEFAULT.
+ *
+ * \param hfunc   - Kernel to add texture-reference to
+ * \param texunit - Texture unit (must be ::CU_PARAM_TR_DEFAULT)
+ * \param hTexRef - Texture-reference to add to argument list
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);
+
+/**
+ * \brief Sets the shared memory configuration for a device function.
+ *
+ * \deprecated
+ *
+ * On devices with configurable shared memory banks, this function will
+ * force all subsequent launches of the specified device function to have
+ * the given shared memory bank size configuration. On any given launch of the
+ * function, the shared memory configuration of the device will be temporarily
+ * changed if needed to suit the function's preferred configuration. Changes in
+ * shared memory configuration between subsequent launches of functions
+ * may introduce a device-side synchronization point.
+ *
+ * Any per-function setting of shared memory bank size set via
+ * ::cuFuncSetSharedMemConfig will override the context wide setting set with
+ * ::cuCtxSetSharedMemConfig.
+ *
+ * Changing the shared memory bank size will not increase shared memory usage
+ * or affect occupancy of kernels, but may have major effects on performance.
+ * Larger bank sizes will allow for greater potential bandwidth to shared memory,
+ * but will change what kinds of accesses to shared memory will result in bank
+ * conflicts.
+ *
+ * This function will do nothing on devices with fixed shared memory bank size.
+ *
+ * The supported bank configurations are:
+ * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: use the context's shared memory
+ *   configuration when launching this function.
+ * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to
+ *   be natively four bytes when launching this function.
+ * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to
+ *   be natively eight bytes when launching this function.
+ *
+ * \param hfunc  - kernel to be given a shared memory config
+ * \param config - requested shared memory configuration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxGetSharedMemConfig,
+ * ::cuCtxSetSharedMemConfig,
+ * ::cuFuncGetAttribute,
+ * ::cuLaunchKernel,
+ * ::cudaFuncSetSharedMemConfig
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config);
+
+/** @} */ /* END CUDA_EXEC_DEPRECATED */
+
+/**
+ * \defgroup CUDA_GRAPH Graph Management
+ *
+ * ___MANBRIEF___ graph management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the graph management functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates a graph
+ *
+ * Creates an empty graph, which is returned via \p phGraph.
+ *
+ * \param phGraph - Returns newly created graph
+ * \param flags   - Graph creation flags, must be 0
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode,
+ * ::cuGraphInstantiate,
+ * ::cuGraphDestroy,
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphGetEdges,
+ * ::cuGraphClone
+ */
+CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags);
+
+/**
+ * \brief Creates a kernel execution node and adds it to a graph
+ *
+ * Creates a new kernel execution node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * The CUDA_KERNEL_NODE_PARAMS structure is defined as:
+ *
+ * \code
+ *  typedef struct CUDA_KERNEL_NODE_PARAMS_st {
+ *      CUfunction func;
+ *      unsigned int gridDimX;
+ *      unsigned int gridDimY;
+ *      unsigned int gridDimZ;
+ *      unsigned int blockDimX;
+ *      unsigned int blockDimY;
+ *      unsigned int blockDimZ;
+ *      unsigned int sharedMemBytes;
+ *      void **kernelParams;
+ *      void **extra;
+ *      CUkernel kern;
+ *      CUcontext ctx;
+ *  } CUDA_KERNEL_NODE_PARAMS;
+ * \endcode
+ *
+ * When the graph is launched, the node will invoke kernel \p func on a (\p gridDimX x
+ * \p gridDimY x \p gridDimZ) grid of blocks. Each block contains
+ * (\p blockDimX x \p blockDimY x \p blockDimZ) threads.
+ *
+ * \p sharedMemBytes sets the amount of dynamic shared memory that will be
+ * available to each thread block.
+ *
+ * Kernel parameters to \p func can be specified in one of two ways:
+ *
+ * 1) Kernel parameters can be specified via \p kernelParams. If the kernel has N
+ * parameters, then \p kernelParams needs to be an array of N pointers. Each pointer,
+ * from \p kernelParams[0] to \p kernelParams[N-1], points to the region of memory from which the actual
+ * parameter will be copied. The number of kernel parameters and their offsets and sizes do not need
+ * to be specified as that information is retrieved directly from the kernel's image.
+ *
+ * 2) Kernel parameters for non-cooperative kernels can also be packaged by the application into a single
+ * buffer that is passed in via \p extra. This places the burden on the application of knowing each
+ * kernel parameter's size and alignment/padding within the buffer. The \p extra parameter exists
+ * to allow this function to take additional less commonly used arguments. \p extra specifies
+ * a list of names of extra settings and their corresponding values. Each extra setting name is
+ * immediately followed by the corresponding value. The list must be terminated with either NULL or
+ * CU_LAUNCH_PARAM_END.
+ *
+ * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra
+ *   array;
+ * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next
+ *   value in \p extra will be a pointer to a buffer
+ *   containing all the kernel parameters for launching kernel
+ *   \p func;
+ * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next
+ *   value in \p extra will be a pointer to a size_t
+ *   containing the size of the buffer specified with
+ *   ::CU_LAUNCH_PARAM_BUFFER_POINTER;
+ *
+ * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel parameters are specified with both
+ * \p kernelParams and \p extra (i.e. both \p kernelParams and \p extra are non-NULL).
+ * ::CUDA_ERROR_INVALID_VALUE will be returned if \p extra is used for a cooperative kernel.
+ *
+ * The \p kernelParams or \p extra array, as well as the argument values it points to,
+ * are copied during this call.
+ *
+ * \note Kernels launched using graphs must not use texture and surface references. Reading or
+ *       writing through any texture or surface reference is undefined behavior.
+ *       This restriction does not apply to texture and surface objects.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the GPU execution node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuLaunchKernel,
+ * ::cuLaunchCooperativeKernel,
+ * ::cuGraphKernelNodeGetParams,
+ * ::cuGraphKernelNodeSetParams,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Returns a kernel node's parameters
+ *
+ * Returns the parameters of kernel node \p hNode in \p nodeParams.
+ * The \p kernelParams or \p extra array returned in \p nodeParams,
+ * as well as the argument values it points to, are owned by the node.
+ * This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cuGraphKernelNodeSetParams to update the
+ * parameters of this node.
+ *
+ * The params will contain either \p kernelParams or \p extra,
+ * according to which of these was most recently set on the node.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param nodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuLaunchKernel,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphKernelNodeSetParams
+ */
+CUresult CUDAAPI cuGraphKernelNodeGetParams(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Sets a kernel node's parameters
+ *
+ * Sets the parameters of kernel node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeSetParams,
+ * ::cuLaunchKernel,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphKernelNodeGetParams
+ */
+CUresult CUDAAPI cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Creates a memcpy node and adds it to a graph
+ *
+ * Creates a new memcpy node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * When the graph is launched, the node will perform the memcpy described by \p copyParams.
+ * See ::cuMemcpy3D() for a description of the structure and its restrictions.
+ *
+ * Memcpy nodes have some additional restrictions with regards to managed memory, if the
+ * system contains at least one device which has a zero value for the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. If one or more of the operands refer
+ * to managed memory, then using the memory type ::CU_MEMORYTYPE_UNIFIED is disallowed
+ * for those operand(s). The managed memory will be treated as residing on either the
+ * host or the device, depending on which memory type is specified.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param copyParams      - Parameters for the memory copy
+ * \param ctx             - Context on which to run the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuMemcpy3D,
+ * ::cuGraphMemcpyNodeGetParams,
+ * ::cuGraphMemcpyNodeSetParams,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D *copyParams, CUcontext ctx);
+
+/**
+ * \brief Returns a memcpy node's parameters
+ *
+ * Returns the parameters of memcpy node \p hNode in \p nodeParams.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param nodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemcpy3D,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphMemcpyNodeSetParams
+ */
+CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D *nodeParams);
+
+/**
+ * \brief Sets a memcpy node's parameters
+ *
+ * Sets the parameters of memcpy node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeSetParams,
+ * ::cuMemcpy3D,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphMemcpyNodeGetParams
+ */
+CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY3D *nodeParams);
+
+/**
+ * \brief Creates a memset node and adds it to a graph
+ *
+ * Creates a new memset node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * The element size must be 1, 2, or 4 bytes.
+ * When the graph is launched, the node will perform the memset described by \p memsetParams.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param memsetParams    - Parameters for the memory set
+ * \param ctx             - Context on which to run the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuMemsetD2D32,
+ * ::cuGraphMemsetNodeGetParams,
+ * ::cuGraphMemsetNodeSetParams,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphAddMemcpyNode
+ */
+CUresult CUDAAPI cuGraphAddMemsetNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx);
+
+/**
+ * \brief Returns a memset node's parameters
+ *
+ * Returns the parameters of memset node \p hNode in \p nodeParams.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param nodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemsetD2D32,
+ * ::cuGraphAddMemsetNode,
+ * ::cuGraphMemsetNodeSetParams
+ */
+CUresult CUDAAPI cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Sets a memset node's parameters
+ *
+ * Sets the parameters of memset node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeSetParams,
+ * ::cuMemsetD2D32,
+ * ::cuGraphAddMemsetNode,
+ * ::cuGraphMemsetNodeGetParams
+ */
+CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Creates a host execution node and adds it to a graph
+ *
+ * Creates a new CPU execution node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * When the graph is launched, the node will invoke the specified CPU function.
+ * Host nodes are not supported under MPS with pre-Volta GPUs.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the host node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuLaunchHostFunc,
+ * ::cuGraphHostNodeGetParams,
+ * ::cuGraphHostNodeSetParams,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Returns a host node's parameters
+ *
+ * Returns the parameters of host node \p hNode in \p nodeParams.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param nodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuLaunchHostFunc,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphHostNodeSetParams
+ */
+CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Sets a host node's parameters
+ *
+ * Sets the parameters of host node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeSetParams,
+ * ::cuLaunchHostFunc,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphHostNodeGetParams
+ */
+CUresult CUDAAPI cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Creates a child graph node and adds it to a graph
+ *
+ * Creates a new node which executes an embedded graph, and adds it to \p hGraph with
+ * \p numDependencies dependencies specified via \p dependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * If \p childGraph contains allocation or free nodes, this call will return an error.
+ *
+ * The node executes an embedded child graph. The child graph is cloned in this call.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param childGraph      - The graph to clone into this node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuGraphChildGraphNodeGetGraph,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode,
+ * ::cuGraphClone
+ */
+CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph);
+
+/**
+ * \brief Gets a handle to the embedded graph of a child graph node
+ *
+ * Gets a handle to the embedded graph in a child graph node. This call
+ * does not clone the graph. Changes to the graph will be reflected in
+ * the node, and the node retains ownership of the graph.
+ *
+ * Allocation and free nodes cannot be added to the returned graph.
+ * Attempting to do so will return an error.
+ *
+ * \param hNode   - Node to get the embedded graph for
+ * \param phGraph - Location to store a handle to the graph
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphNodeFindInClone
+ */
+CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph *phGraph);
+
+/**
+ * \brief Creates an empty node and adds it to a graph
+ *
+ * Creates a new node which performs no operation, and adds it to \p hGraph with
+ * \p numDependencies dependencies specified via \p dependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * An empty node performs no operation during execution, but can be used for
+ * transitive ordering. For example, a phased execution graph with 2 groups of n
+ * nodes with a barrier between them can be represented using an empty node and
+ * 2*n dependency edges, rather than no empty node and n^2 dependency edges.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies);
+
+/**
+ * \brief Creates an event record node and adds it to a graph
+ *
+ * Creates a new event record node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and event specified in \p event.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * Each launch of the graph will record \p event to capture execution of the
+ * node's dependencies.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param event           - Event for the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddEventRecordNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
+
+/**
+ * \brief Returns the event associated with an event record node
+ *
+ * Returns the event of event record node \p hNode in \p event_out.
+ *
+ * \param hNode     - Node to get the event for
+ * \param event_out - Pointer to return the event
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphEventRecordNodeSetEvent,
+ * ::cuGraphEventWaitNodeGetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuGraphEventRecordNodeGetEvent(CUgraphNode hNode, CUevent *event_out);
+
+/**
+ * \brief Sets an event record node's event
+ *
+ * Sets the event of event record node \p hNode to \p event.
+ *
+ * \param hNode - Node to set the event for
+ * \param event - Event to use
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeSetParams,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphEventRecordNodeGetEvent,
+ * ::cuGraphEventWaitNodeSetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuGraphEventRecordNodeSetEvent(CUgraphNode hNode, CUevent event);
+
+/**
+ * \brief Creates an event wait node and adds it to a graph
+ *
+ * Creates a new event wait node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and event specified in \p event.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * The graph node will wait for all work captured in \p event.  See ::cuEventRecord()
+ * for details on what is captured by an event. \p event may be from a different context
+ * or device than the launch stream.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param event           - Event for the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddEventWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
+
+/**
+ * \brief Returns the event associated with an event wait node
+ *
+ * Returns the event of event wait node \p hNode in \p event_out.
+ *
+ * \param hNode     - Node to get the event for
+ * \param event_out - Pointer to return the event
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphEventWaitNodeSetEvent,
+ * ::cuGraphEventRecordNodeGetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuGraphEventWaitNodeGetEvent(CUgraphNode hNode, CUevent *event_out);
+
+/**
+ * \brief Sets an event wait node's event
+ *
+ * Sets the event of event wait node \p hNode to \p event.
+ *
+ * \param hNode - Node to set the event for
+ * \param event - Event to use
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeSetParams,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphEventWaitNodeGetEvent,
+ * ::cuGraphEventRecordNodeSetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuGraphEventWaitNodeSetEvent(CUgraphNode hNode, CUevent event);
+
+/**
+ * \brief Creates an external semaphore signal node and adds it to a graph
+ *
+ * Creates a new external semaphore signal node and adds it to \p hGraph with \p
+ * numDependencies dependencies specified via \p dependencies and arguments specified
+ * in \p nodeParams. It is possible for \p numDependencies to be 0, in which case the
+ * node will be placed at the root of the graph. \p dependencies may not have any
+ * duplicate entries. A handle to the new node will be returned in \p phGraphNode.
+ *
+ * Performs a signal operation on a set of externally allocated semaphore objects
+ * when the node is launched.  The operation(s) will occur after all of the node's
+ * dependencies have completed.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuGraphExternalSemaphoresSignalNodeGetParams,
+ * ::cuGraphExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuImportExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddExternalSemaphoresSignalNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Returns an external semaphore signal node's parameters
+ *
+ * Returns the parameters of an external semaphore signal node \p hNode in \p params_out.
+ * The \p extSemArray and \p paramsArray returned in \p params_out,
+ * are owned by the node.  This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cuGraphExternalSemaphoresSignalNodeSetParams to update the
+ * parameters of this node.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuLaunchKernel,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuGraphExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *params_out);
+
+/**
+ * \brief Sets an external semaphore signal node's parameters
+ *
+ * Sets the parameters of an external semaphore signal node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuGraphExternalSemaphoresSignalNodeGetParams,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Creates an external semaphore wait node and adds it to a graph
+ *
+ * Creates a new external semaphore wait node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries. A handle
+ * to the new node will be returned in \p phGraphNode.
+ *
+ * Performs a wait operation on a set of externally allocated semaphore objects
+ * when the node is launched.  The node's dependents will not be launched until
+ * the wait operation has completed.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuGraphExternalSemaphoresWaitNodeGetParams,
+ * ::cuGraphExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuImportExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddExternalSemaphoresWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Returns an external semaphore wait node's parameters
+ *
+ * Returns the parameters of an external semaphore wait node \p hNode in \p params_out.
+ * The \p extSemArray and \p paramsArray returned in \p params_out,
+ * are owned by the node.  This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cuGraphExternalSemaphoresWaitNodeSetParams to update the
+ * parameters of this node.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuLaunchKernel,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuGraphExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS *params_out);
+
+/**
+ * \brief Sets an external semaphore wait node's parameters
+ *
+ * Sets the parameters of an external semaphore wait node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuGraphExternalSemaphoresWaitNodeGetParams,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Creates a batch memory operation node and adds it to a graph
+ *
+ * Creates a new batch memory operation node and adds it to \p hGraph with \p
+ * numDependencies dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * When the node is added, the paramArray inside \p nodeParams is copied and therefore it can be
+ * freed after the call returns.
+ *
+ * \note
+ * Warning:
+ * Improper use of this API may deadlock the application. Synchronization 
+ * ordering established through this API is not visible to CUDA. CUDA tasks 
+ * that are (even indirectly) ordered by this API should also have that order
+ * expressed with CUDA-visible dependencies such as events. This ensures that 
+ * the scheduler does not serialize them in an improper order. For more 
+ * information, see the Stream Memory Operations section in the programming 
+ * guide (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html).
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuStreamBatchMemOp,
+ * ::cuStreamWaitValue32,
+ * ::cuStreamWriteValue32,
+ * ::cuStreamWaitValue64,
+ * ::cuStreamWriteValue64,
+ * ::cuGraphBatchMemOpNodeGetParams,
+ * ::cuGraphBatchMemOpNodeSetParams,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddBatchMemOpNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Returns a batch mem op node's parameters
+ *
+ * Returns the parameters of batch mem op node \p hNode in \p nodeParams_out.
+ * The \p paramArray returned in \p nodeParams_out is owned by the node.
+ * This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cuGraphBatchMemOpNodeSetParams to update the
+ * parameters of this node.
+ *
+ * \param hNode          - Node to get the parameters for
+ * \param nodeParams_out - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamBatchMemOp,
+ * ::cuGraphAddBatchMemOpNode,
+ * ::cuGraphBatchMemOpNodeSetParams
+ */
+CUresult CUDAAPI cuGraphBatchMemOpNodeGetParams(CUgraphNode hNode, CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams_out);
+
+/**
+ * \brief Sets a batch mem op node's parameters
+ *
+ * Sets the parameters of batch mem op node \p hNode to \p nodeParams.
+ *
+ * The paramArray inside \p nodeParams is copied and therefore it can be
+ * freed after the call returns.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeSetParams,
+ * ::cuStreamBatchMemOp,
+ * ::cuGraphAddBatchMemOpNode,
+ * ::cuGraphBatchMemOpNodeGetParams
+ */
+CUresult CUDAAPI cuGraphBatchMemOpNodeSetParams(CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Sets the parameters for a batch mem op node in the given graphExec
+ *
+ * Sets the parameters of a batch mem op node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * The following fields on operations may be modified on an executable graph:
+ *
+ *  op.waitValue.address
+ *  op.waitValue.value[64]
+ *  op.waitValue.flags bits corresponding to wait type (i.e. CU_STREAM_WAIT_VALUE_FLUSH bit cannot be modified)
+ *  op.writeValue.address
+ *  op.writeValue.value[64]
+ *
+ * Other fields, such as the context, count or type of operations, and other types of operations such as membars, 
+ * may not be modified.
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * The paramArray inside \p nodeParams is copied and therefore it can be
+ * freed after the call returns.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Batch mem op node from the graph from which graphExec was instantiated
+ * \param nodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExecNodeSetParams,
+ * ::cuStreamBatchMemOp,
+ * ::cuGraphAddBatchMemOpNode,
+ * ::cuGraphBatchMemOpNodeGetParams,
+ * ::cuGraphBatchMemOpNodeSetParams,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecBatchMemOpNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Creates an allocation node and adds it to a graph
+ *
+ * Creates a new allocation node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries. A handle
+ * to the new node will be returned in \p phGraphNode.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * When ::cuGraphAddMemAllocNode creates an allocation node, it returns the address of the allocation in
+ * \p nodeParams.dptr.  The allocation's address remains fixed across instantiations and launches.
+ *
+ * If the allocation is freed in the same graph, by creating a free node using ::cuGraphAddMemFreeNode,
+ * the allocation can be accessed by nodes ordered after the allocation node but before the free node.
+ * These allocations cannot be freed outside the owning graph, and they can only be freed once in the
+ * owning graph.
+ *
+ * If the allocation is not freed in the same graph, then it can be accessed not only by nodes in the
+ * graph which are ordered after the allocation node, but also by stream operations ordered after the
+ * graph's execution but before the allocation is freed.
+ *
+ * Allocations which are not freed in the same graph can be freed by:
+ * - passing the allocation to ::cuMemFreeAsync or ::cuMemFree;
+ * - launching a graph with a free node for that allocation; or
+ * - specifying ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH during instantiation, which makes
+ * each launch behave as though it called ::cuMemFreeAsync for every unfreed allocation.
+ * 
+ * It is not possible to free an allocation in both the owning graph and another graph.  If the allocation
+ * is freed in the same graph, a free node cannot be added to another graph.  If the allocation is freed
+ * in another graph, a free node can no longer be added to the owning graph.
+ *
+ * The following restrictions apply to graphs which contain allocation and/or memory free nodes:
+ * - Nodes and edges of the graph cannot be deleted.
+ * - The graph cannot be used in a child node.
+ * - Only one instantiation of the graph may exist at any point in time.
+ * - The graph cannot be cloned.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuGraphAddMemFreeNode,
+ * ::cuGraphMemAllocNodeGetParams,
+ * ::cuDeviceGraphMemTrim,
+ * ::cuDeviceGetGraphMemAttribute,
+ * ::cuDeviceSetGraphMemAttribute,
+ * ::cuMemAllocAsync,
+ * ::cuMemFreeAsync,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddMemAllocNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Returns a memory alloc node's parameters
+ *
+ * Returns the parameters of a memory alloc node \p hNode in \p params_out.
+ * The \p poolProps and \p accessDescs returned in \p params_out, are owned by the
+ * node.  This memory remains valid until the node is destroyed.  The returned
+ * parameters must not be modified.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddMemAllocNode,
+ * ::cuGraphMemFreeNodeGetParams
+ */
+CUresult CUDAAPI cuGraphMemAllocNodeGetParams(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out);
+
+/**
+ * \brief Creates a memory free node and adds it to a graph
+ *
+ * Creates a new memory free node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries. A handle
+ * to the new node will be returned in \p phGraphNode.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param dptr            - Address of memory to free
+ *
+ * ::cuGraphAddMemFreeNode will return ::CUDA_ERROR_INVALID_VALUE if the user attempts to free:
+ * - an allocation twice in the same graph.
+ * - an address that was not returned by an allocation node.
+ * - an invalid address.
+ *
+ * The following restrictions apply to graphs which contain allocation and/or memory free nodes:
+ * - Nodes and edges of the graph cannot be deleted.
+ * - The graph cannot be used in a child node.
+ * - Only one instantiation of the graph may exist at any point in time.
+ * - The graph cannot be cloned.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuGraphAddMemAllocNode,
+ * ::cuGraphMemFreeNodeGetParams,
+ * ::cuDeviceGraphMemTrim,
+ * ::cuDeviceGetGraphMemAttribute,
+ * ::cuDeviceSetGraphMemAttribute,
+ * ::cuMemAllocAsync,
+ * ::cuMemFreeAsync,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddMemFreeNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr);
+
+/**
+ * \brief Returns a memory free node's parameters
+ *
+ * Returns the address of a memory free node \p hNode in \p dptr_out.
+ *
+ * \param hNode    - Node to get the parameters for
+ * \param dptr_out - Pointer to return the device address
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddMemFreeNode,
+ * ::cuGraphMemAllocNodeGetParams
+ */
+CUresult CUDAAPI cuGraphMemFreeNodeGetParams(CUgraphNode hNode, CUdeviceptr *dptr_out);
+
+/**
+ * \brief Free unused memory that was cached on the specified device for use with graphs back to the OS.
+ *
+ * Blocks which are not in use by a graph that is either currently executing or scheduled to execute are
+ * freed back to the operating system.
+ *
+ * \param device - The device for which cached memory should be freed.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ *
+ * \sa
+ * ::cuGraphAddMemAllocNode,
+ * ::cuGraphAddMemFreeNode,
+ * ::cuDeviceSetGraphMemAttribute,
+ * ::cuDeviceGetGraphMemAttribute
+ */
+CUresult CUDAAPI cuDeviceGraphMemTrim(CUdevice device);
+
+/**
+ * \brief Query asynchronous allocation attributes related to graphs
+ *
+ * Valid attributes are:
+ *
+ * - ::CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT: Amount of memory, in bytes, currently associated with graphs
+ * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the
+ *   last time it was reset.  High watermark can only be reset to zero.
+ * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT: Amount of memory, in bytes, currently allocated for use by
+ *   the CUDA graphs asynchronous allocator.
+ * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by
+ *   the CUDA graphs asynchronous allocator.
+ *
+ * \param device - Specifies the scope of the query
+ * \param attr - attribute to get
+ * \param value - retrieved value
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ *
+ * \sa
+ * ::cuDeviceSetGraphMemAttribute,
+ * ::cuGraphAddMemAllocNode,
+ * ::cuGraphAddMemFreeNode
+ */
+CUresult CUDAAPI cuDeviceGetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value);
+
+/**
+ * \brief Set asynchronous allocation attributes related to graphs
+ *
+ * Valid attributes are:
+ *
+ * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the
+ *   last time it was reset.  High watermark can only be reset to zero.
+ * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by
+ *   the CUDA graphs asynchronous allocator.
+ *
+ * \param device - Specifies the scope of the query
+ * \param attr - attribute to set
+ * \param value - pointer to value to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ *
+ * \sa
+ * ::cuDeviceGetGraphMemAttribute,
+ * ::cuGraphAddMemAllocNode,
+ * ::cuGraphAddMemFreeNode
+ */
+CUresult CUDAAPI cuDeviceSetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value);
+
+/**
+ * \brief Clones a graph
+ *
+ * This function creates a copy of \p originalGraph and returns it in \p phGraphClone.
+ * All parameters are copied into the cloned graph. The original graph may be modified
+ * after this call without affecting the clone.
+ *
+ * Child graph nodes in the original graph are recursively copied into the clone.
+ *
+ * \param phGraphClone  - Returns newly created cloned graph
+ * \param originalGraph - Graph to clone
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphCreate,
+ * ::cuGraphNodeFindInClone
+ */
+CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph);
+
+/**
+ * \brief Finds a cloned version of a node
+ *
+ * This function returns the node in \p hClonedGraph corresponding to \p hOriginalNode
+ * in the original graph.
+ *
+ * \p hClonedGraph must have been cloned from the original graph via ::cuGraphClone.
+ * \p hOriginalNode must have been in the original graph at the time of the call to
+ * ::cuGraphClone, and the corresponding cloned node in \p hClonedGraph must not have
+ * been removed. The cloned node is then returned via \p phNode.
+ *
+ * \param phNode  - Returns handle to the cloned node
+ * \param hOriginalNode - Handle to the original node
+ * \param hClonedGraph - Cloned graph to query
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphClone
+ */
+CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph);
+
+/**
+ * \brief Returns a node's type
+ *
+ * Returns the node type of \p hNode in \p type.
+ *
+ * \param hNode - Node to query
+ * \param type  - Pointer to return the node type
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphChildGraphNodeGetGraph,
+ * ::cuGraphKernelNodeGetParams,
+ * ::cuGraphKernelNodeSetParams,
+ * ::cuGraphHostNodeGetParams,
+ * ::cuGraphHostNodeSetParams,
+ * ::cuGraphMemcpyNodeGetParams,
+ * ::cuGraphMemcpyNodeSetParams,
+ * ::cuGraphMemsetNodeGetParams,
+ * ::cuGraphMemsetNodeSetParams
+ */
+CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type);
+
+/**
+ * \brief Returns a graph's nodes
+ *
+ * Returns a list of \p hGraph's nodes. \p nodes may be NULL, in which case this
+ * function will return the number of nodes in \p numNodes. Otherwise,
+ * \p numNodes entries will be filled in. If \p numNodes is higher than the actual
+ * number of nodes, the remaining entries in \p nodes will be set to NULL, and the
+ * number of nodes actually obtained will be returned in \p numNodes.
+ *
+ * \param hGraph   - Graph to query
+ * \param nodes    - Pointer to return the nodes
+ * \param numNodes - See description
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphCreate,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphGetEdges,
+ * ::cuGraphNodeGetType,
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphNodeGetDependentNodes
+ */
+CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes);
+
+/**
+ * \brief Returns a graph's root nodes
+ *
+ * Returns a list of \p hGraph's root nodes. \p rootNodes may be NULL, in which case this
+ * function will return the number of root nodes in \p numRootNodes. Otherwise,
+ * \p numRootNodes entries will be filled in. If \p numRootNodes is higher than the actual
+ * number of root nodes, the remaining entries in \p rootNodes will be set to NULL, and the
+ * number of nodes actually obtained will be returned in \p numRootNodes.
+ *
+ * \param hGraph       - Graph to query
+ * \param rootNodes    - Pointer to return the root nodes
+ * \param numRootNodes - See description
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphCreate,
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetEdges,
+ * ::cuGraphNodeGetType,
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphNodeGetDependentNodes
+ */
+CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes);
+
+/**
+ * \brief Returns a graph's dependency edges
+ *
+ * Returns a list of \p hGraph's dependency edges. Edges are returned via corresponding
+ * indices in \p from and \p to; that is, the node in \p to[i] has a dependency on the
+ * node in \p from[i]. \p from and \p to may both be NULL, in which
+ * case this function only returns the number of edges in \p numEdges. Otherwise,
+ * \p numEdges entries will be filled in. If \p numEdges is higher than the actual
+ * number of edges, the remaining entries in \p from and \p to will be set to NULL, and
+ * the number of edges actually returned will be written to \p numEdges.
+ *
+ * \param hGraph   - Graph to get the edges from
+ * \param from     - Location to return edge endpoints
+ * \param to       - Location to return edge endpoints
+ * \param numEdges - See description
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphAddDependencies,
+ * ::cuGraphRemoveDependencies,
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphNodeGetDependentNodes
+ */
+CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges);
+
+/**
+ * \brief Returns a graph's dependency edges (12.3+)
+ *
+ * Returns a list of \p hGraph's dependency edges. Edges are returned via corresponding
+ * indices in \p from, \p to and \p edgeData; that is, the node in \p to[i] has a
+ * dependency on the node in \p from[i] with data \p edgeData[i]. \p from and \p to may
+ * both be NULL, in which case this function only returns the number of edges in
+ * \p numEdges. Otherwise, \p numEdges entries will be filled in. If \p numEdges is higher
+ * than the actual number of edges, the remaining entries in \p from and \p to will be
+ * set to NULL, and the number of edges actually returned will be written to \p numEdges.
+ * \p edgeData alone may be NULL, in which case the edges must all have default (zeroed)
+ * edge data. Attempting a lossy query via NULL \p edgeData will result in
+ * ::CUDA_ERROR_LOSSY_QUERY. If \p edgeData is non-NULL then \p from and \p to must be
+ * as well.
+ *
+ * \param hGraph   - Graph to get the edges from
+ * \param from     - Location to return edge endpoints
+ * \param to       - Location to return edge endpoints
+ * \param edgeData - Optional location to return edge data
+ * \param numEdges - See description
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_LOSSY_QUERY,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphAddDependencies,
+ * ::cuGraphRemoveDependencies,
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphNodeGetDependentNodes
+ */
+CUresult CUDAAPI cuGraphGetEdges_v2(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, CUgraphEdgeData *edgeData, size_t *numEdges);
+
+/**
+ * \brief Returns a node's dependencies
+ *
+ * Returns a list of \p hNode's dependencies. \p dependencies may be NULL, in which case this
+ * function will return the number of dependencies in \p numDependencies. Otherwise,
+ * \p numDependencies entries will be filled in. If \p numDependencies is higher than the actual
+ * number of dependencies, the remaining entries in \p dependencies will be set to NULL, and the
+ * number of nodes actually obtained will be returned in \p numDependencies.
+ *
+ * \param hNode           - Node to query
+ * \param dependencies    - Pointer to return the dependencies
+ * \param numDependencies - See description
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeGetDependentNodes,
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphGetEdges,
+ * ::cuGraphAddDependencies,
+ * ::cuGraphRemoveDependencies
+ */
+CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies);
+
+/**
+ * \brief Returns a node's dependencies (12.3+)
+ *
+ * Returns a list of \p hNode's dependencies. \p dependencies may be NULL, in which case this
+ * function will return the number of dependencies in \p numDependencies. Otherwise,
+ * \p numDependencies entries will be filled in. If \p numDependencies is higher than the actual
+ * number of dependencies, the remaining entries in \p dependencies will be set to NULL, and the
+ * number of nodes actually obtained will be returned in \p numDependencies.
+ *
+ * Note that if an edge has non-zero (non-default) edge data and \p edgeData is NULL,
+ * this API will return ::CUDA_ERROR_LOSSY_QUERY. If \p edgeData is non-NULL, then
+ * \p dependencies must be as well.
+ *
+ * \param hNode           - Node to query
+ * \param dependencies    - Pointer to return the dependencies
+ * \param edgeData        - Optional array to return edge data for each dependency
+ * \param numDependencies - See description
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_LOSSY_QUERY,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeGetDependentNodes,
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphGetEdges,
+ * ::cuGraphAddDependencies,
+ * ::cuGraphRemoveDependencies
+ */
+CUresult CUDAAPI cuGraphNodeGetDependencies_v2(CUgraphNode hNode, CUgraphNode *dependencies, CUgraphEdgeData *edgeData, size_t *numDependencies);
+
+/**
+ * \brief Returns a node's dependent nodes
+ *
+ * Returns a list of \p hNode's dependent nodes. \p dependentNodes may be NULL, in which
+ * case this function will return the number of dependent nodes in \p numDependentNodes.
+ * Otherwise, \p numDependentNodes entries will be filled in. If \p numDependentNodes is
+ * higher than the actual number of dependent nodes, the remaining entries in
+ * \p dependentNodes will be set to NULL, and the number of nodes actually obtained will
+ * be returned in \p numDependentNodes.
+ *
+ * \param hNode             - Node to query
+ * \param dependentNodes    - Pointer to return the dependent nodes
+ * \param numDependentNodes - See description
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphGetEdges,
+ * ::cuGraphAddDependencies,
+ * ::cuGraphRemoveDependencies
+ */
+CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes);
+
+/**
+ * \brief Returns a node's dependent nodes (12.3+)
+ *
+ * Returns a list of \p hNode's dependent nodes. \p dependentNodes may be NULL, in which
+ * case this function will return the number of dependent nodes in \p numDependentNodes.
+ * Otherwise, \p numDependentNodes entries will be filled in. If \p numDependentNodes is
+ * higher than the actual number of dependent nodes, the remaining entries in
+ * \p dependentNodes will be set to NULL, and the number of nodes actually obtained will
+ * be returned in \p numDependentNodes.
+ *
+ * Note that if an edge has non-zero (non-default) edge data and \p edgeData is NULL,
+ * this API will return ::CUDA_ERROR_LOSSY_QUERY.  If \p edgeData is non-NULL, then
+ * \p dependentNodes must be as well.
+ *
+ * \param hNode             - Node to query
+ * \param dependentNodes    - Pointer to return the dependent nodes
+ * \param edgeData          - Optional pointer to return edge data for dependent nodes
+ * \param numDependentNodes - See description
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_LOSSY_QUERY,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphGetEdges,
+ * ::cuGraphAddDependencies,
+ * ::cuGraphRemoveDependencies
+ */
+CUresult CUDAAPI cuGraphNodeGetDependentNodes_v2(CUgraphNode hNode, CUgraphNode *dependentNodes, CUgraphEdgeData *edgeData, size_t *numDependentNodes);
+
+/**
+ * \brief Adds dependency edges to a graph
+ *
+ * The number of dependencies to be added is defined by \p numDependencies.
+ * Elements in \p from and \p to at corresponding indices define a dependency.
+ * Each node in \p from and \p to must belong to \p hGraph.
+ *
+ * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ * Specifying an existing dependency will return an error.
+ *
+ * \param hGraph - Graph to which dependencies are added
+ * \param from - Array of nodes that provide the dependencies
+ * \param to - Array of dependent nodes
+ * \param numDependencies - Number of dependencies to be added
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphRemoveDependencies,
+ * ::cuGraphGetEdges,
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphNodeGetDependentNodes
+ */
+CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
+
+/**
+ * \brief Adds dependency edges to a graph (12.3+)
+ *
+ * The number of dependencies to be added is defined by \p numDependencies.
+ * Elements in \p from and \p to at corresponding indices define a dependency.
+ * Each node in \p from and \p to must belong to \p hGraph.
+ *
+ * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ * Specifying an existing dependency will return an error.
+ *
+ * \param hGraph - Graph to which dependencies are added
+ * \param from - Array of nodes that provide the dependencies
+ * \param to - Array of dependent nodes
+ * \param edgeData - Optional array of edge data. If NULL, default (zeroed) edge data is assumed.
+ * \param numDependencies - Number of dependencies to be added
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphRemoveDependencies,
+ * ::cuGraphGetEdges,
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphNodeGetDependentNodes
+ */
+CUresult CUDAAPI cuGraphAddDependencies_v2(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, const CUgraphEdgeData *edgeData, size_t numDependencies);
+
+/**
+ * \brief Removes dependency edges from a graph
+ *
+ * The number of dependencies to be removed is defined by \p numDependencies.
+ * Elements in \p from and \p to at corresponding indices define a dependency.
+ * Each node in \p from and \p to must belong to \p hGraph.
+ *
+ * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ * Specifying a non-existing dependency will return an error.
+ *
+ * Dependencies cannot be removed from graphs which contain allocation or free nodes.
+ * Any attempt to do so will return an error.
+ *
+ * \param hGraph - Graph from which to remove dependencies
+ * \param from - Array of nodes that provide the dependencies
+ * \param to - Array of dependent nodes
+ * \param numDependencies - Number of dependencies to be removed
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddDependencies,
+ * ::cuGraphGetEdges,
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphNodeGetDependentNodes
+ */
+CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
+
+/**
+ * \brief Removes dependency edges from a graph (12.3+)
+ *
+ * The number of dependencies to be removed is defined by \p numDependencies.
+ * Elements in \p from and \p to at corresponding indices define a dependency.
+ * Each node in \p from and \p to must belong to \p hGraph.
+ *
+ * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ * Specifying an edge that does not exist in the graph, with data matching
+ * \p edgeData, results in an error. \p edgeData is nullable, which is equivalent
+ * to passing default (zeroed) data for each edge.
+ *
+ * Dependencies cannot be removed from graphs which contain allocation or free nodes.
+ * Any attempt to do so will return an error.
+ *
+ * \param hGraph - Graph from which to remove dependencies
+ * \param from - Array of nodes that provide the dependencies
+ * \param to - Array of dependent nodes
+ * \param edgeData - Optional array of edge data. If NULL, edge data is assumed to
+ *                   be default (zeroed).
+ * \param numDependencies - Number of dependencies to be removed
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddDependencies,
+ * ::cuGraphGetEdges,
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphNodeGetDependentNodes
+ */
+CUresult CUDAAPI cuGraphRemoveDependencies_v2(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, const CUgraphEdgeData *edgeData, size_t numDependencies);
+
+/**
+ * \brief Remove a node from the graph
+ *
+ * Removes \p hNode from its graph. This operation also severs any dependencies of other nodes
+ * on \p hNode and vice versa.
+ *
+ * Nodes which belong to a graph which contains allocation or free nodes cannot be destroyed.
+ * Any attempt to do so will return an error.
+ *
+ * \param hNode  - Node to remove
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode);
+
+/**
+ * \brief Creates an executable graph from a graph
+ *
+ * Instantiates \p hGraph as an executable graph. The graph is validated for any
+ * structural constraints or intra-node constraints which were not previously
+ * validated. If instantiation is successful, a handle to the instantiated graph
+ * is returned in \p phGraphExec.
+ *
+ * The \p flags parameter controls the behavior of instantiation and subsequent
+ * graph launches.  Valid flags are:
+ *
+ * - ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, which configures a
+ * graph containing memory allocation nodes to automatically free any
+ * unfreed memory allocations before the graph is relaunched.
+ *
+ * - ::CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH, which configures the graph for launch
+ * from the device. If this flag is passed, the executable graph handle returned can be
+ * used to launch the graph from both the host and device. This flag can only be used
+ * on platforms which support unified addressing. This flag cannot be used in
+ * conjunction with ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH.
+ *
+ * - ::CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY, which causes the graph
+ * to use the priorities from the per-node attributes rather than the priority
+ * of the launch stream during execution. Note that priorities are only available
+ * on kernel nodes, and are copied from stream priority during stream capture.
+ *
+ * If \p hGraph contains any allocation or free nodes, there can be at most one
+ * executable graph in existence for that graph at a time. An attempt to instantiate
+ * a second executable graph before destroying the first with ::cuGraphExecDestroy
+ * will result in an error.
+ * The same also applies if \p hGraph contains any device-updatable kernel nodes.
+ *
+ * If \p hGraph contains kernels which call device-side cudaGraphLaunch() from multiple
+ * contexts, this will result in an error.
+ *
+ * Graphs instantiated for launch on the device have additional restrictions which do not
+ * apply to host graphs:
+ *
+ * - The graph's nodes must reside on a single context.
+ * - The graph can only contain kernel nodes, memcpy nodes, memset nodes, and child graph nodes.
+ * - The graph cannot be empty and must contain at least one kernel, memcpy, or memset node.
+ *   Operation-specific restrictions are outlined below.
+ * - Kernel nodes:
+ *   - Use of CUDA Dynamic Parallelism is not permitted.
+ *   - Cooperative launches are permitted as long as MPS is not in use.
+ * - Memcpy nodes:
+ *   - Only copies involving device memory and/or pinned device-mapped host memory are permitted.
+ *   - Copies involving CUDA arrays are not permitted.
+ *   - Both operands must be accessible from the current context, and the current context must
+ *     match the context of other nodes in the graph.
+ *
+ * \param phGraphExec - Returns instantiated graph
+ * \param hGraph      - Graph to instantiate
+ * \param flags       - Flags to control instantiation.  See ::CUgraphInstantiate_flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphInstantiateWithParams,
+ * ::cuGraphCreate,
+ * ::cuGraphUpload,
+ * ::cuGraphLaunch,
+ * ::cuGraphExecDestroy
+ */
+CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags);
+
+/**
+ * \brief Creates an executable graph from a graph
+ *
+ * Instantiates \p hGraph as an executable graph according to the \p instantiateParams structure.
+ * The graph is validated for any structural constraints or intra-node constraints
+ * which were not previously validated. If instantiation is successful, a handle to
+ * the instantiated graph is returned in \p phGraphExec.
+ *
+ * \p instantiateParams controls the behavior of instantiation and subsequent
+ * graph launches, as well as returning more detailed information in the event of an error.
+ * ::CUDA_GRAPH_INSTANTIATE_PARAMS is defined as:
+ *
+ * \code
+    typedef struct {
+        cuuint64_t flags;
+        CUstream hUploadStream;
+        CUgraphNode hErrNode_out;
+        CUgraphInstantiateResult result_out;
+    } CUDA_GRAPH_INSTANTIATE_PARAMS;
+ * \endcode
+ *
+ * The \p flags field controls the behavior of instantiation and subsequent
+ * graph launches. Valid flags are:
+ *
+ * - ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, which configures a
+ * graph containing memory allocation nodes to automatically free any
+ * unfreed memory allocations before the graph is relaunched.
+ *
+ * - ::CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD, which will perform an upload of the graph
+ * into \p hUploadStream once the graph has been instantiated.
+ *
+ * - ::CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH, which configures the graph for launch
+ * from the device. If this flag is passed, the executable graph handle returned can be
+ * used to launch the graph from both the host and device. This flag can only be used
+ * on platforms which support unified addressing. This flag cannot be used in
+ * conjunction with ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH.
+ *
+ * - ::CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY, which causes the graph
+ * to use the priorities from the per-node attributes rather than the priority
+ * of the launch stream during execution. Note that priorities are only available
+ * on kernel nodes, and are copied from stream priority during stream capture.
+ *
+ * If \p hGraph contains any allocation or free nodes, there can be at most one
+ * executable graph in existence for that graph at a time. An attempt to instantiate a
+ * second executable graph before destroying the first with ::cuGraphExecDestroy will
+ * result in an error.
+ * The same also applies if \p hGraph contains any device-updatable kernel nodes.
+ *
+ * If \p hGraph contains kernels which call device-side cudaGraphLaunch() from multiple
+ * contexts, this will result in an error.
+ *
+ * Graphs instantiated for launch on the device have additional restrictions which do not
+ * apply to host graphs:
+ *
+ * - The graph's nodes must reside on a single context.
+ * - The graph can only contain kernel nodes, memcpy nodes, memset nodes, and child graph nodes.
+ * - The graph cannot be empty and must contain at least one kernel, memcpy, or memset node.
+ *   Operation-specific restrictions are outlined below.
+ * - Kernel nodes:
+ *   - Use of CUDA Dynamic Parallelism is not permitted.
+ *   - Cooperative launches are permitted as long as MPS is not in use.
+ * - Memcpy nodes:
+ *   - Only copies involving device memory and/or pinned device-mapped host memory are permitted.
+ *   - Copies involving CUDA arrays are not permitted.
+ *   - Both operands must be accessible from the current context, and the current context must
+ *     match the context of other nodes in the graph.
+ *
+ * In the event of an error, the \p result_out and \p hErrNode_out fields will contain more
+ * information about the nature of the error. Possible error reporting includes:
+ *
+ * - ::CUDA_GRAPH_INSTANTIATE_ERROR, if passed an invalid value or if an unexpected error occurred
+ *   which is described by the return value of the function. \p hErrNode_out will be set to NULL.
+ * - ::CUDA_GRAPH_INSTANTIATE_INVALID_STRUCTURE, if the graph structure is invalid. \p hErrNode_out
+ *   will be set to one of the offending nodes.
+ * - ::CUDA_GRAPH_INSTANTIATE_NODE_OPERATION_NOT_SUPPORTED, if the graph is instantiated for device
+ *   launch but contains a node of an unsupported node type, or a node which performs unsupported
+ *   operations, such as use of CUDA dynamic parallelism within a kernel node. \p hErrNode_out will
+ *   be set to this node.
+ * - ::CUDA_GRAPH_INSTANTIATE_MULTIPLE_CTXS_NOT_SUPPORTED, if the graph is instantiated for device
+ *   launch but a node's context differs from that of another node. This error can also be returned
+ *   if a graph is not instantiated for device launch and it contains kernels which call device-side
+ *   cudaGraphLaunch() from multiple contexts. \p hErrNode_out will be set to this node.
+ *
+ * If instantiation is successful, \p result_out will be set to ::CUDA_GRAPH_INSTANTIATE_SUCCESS,
+ * and \p hErrNode_out will be set to NULL.
+ *
+ * \param phGraphExec       - Returns instantiated graph
+ * \param hGraph            - Graph to instantiate
+ * \param instantiateParams - Instantiation parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphCreate,
+ * ::cuGraphInstantiate,
+ * ::cuGraphExecDestroy
+ */
+CUresult CUDAAPI cuGraphInstantiateWithParams(CUgraphExec *phGraphExec, CUgraph hGraph, CUDA_GRAPH_INSTANTIATE_PARAMS *instantiateParams);
+
+/**
+ * \brief Query the instantiation flags of an executable graph
+ *
+ * Returns the flags that were passed to instantiation for the given executable graph.
+ * ::CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD will not be returned by this API as it does
+ * not affect the resulting executable graph.
+ *
+ * \param hGraphExec - The executable graph to query
+ * \param flags      - Returns the instantiation flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphInstantiate,
+ * ::cuGraphInstantiateWithParams
+ */
+CUresult CUDAAPI cuGraphExecGetFlags(CUgraphExec hGraphExec, cuuint64_t *flags);
+
+/**
+ * \brief Sets the parameters for a kernel node in the given graphExec
+ *
+ * Sets the parameters of a kernel node in an executable graph \p hGraphExec. 
+ * The node is identified by the corresponding node \p hNode in the 
+ * non-executable graph, from which the executable graph was instantiated. 
+ *
+ * \p hNode must not have been removed from the original graph. All \p nodeParams 
+ * fields may change, but the following restrictions apply to \p func updates: 
+ *
+ *   - The owning context of the function cannot change.
+ *   - A node whose function originally did not use CUDA dynamic parallelism cannot be updated
+ *     to a function which uses CDP.
+ *   - A node whose function originally did not make device-side update calls cannot be updated
+ *     to a function which makes device-side update calls.
+ *   - If \p hGraphExec was not instantiated for device launch, a node whose function originally
+ *     did not use device-side cudaGraphLaunch() cannot be updated to a function which uses
+ *     device-side cudaGraphLaunch() unless the node resides on the same context as nodes which
+ *     contained such calls at instantiate-time. If no such calls were present at instantiation,
+ *     these updates cannot be performed at all.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already 
+ * enqueued or running launches of \p hGraphExec are not affected by this call. 
+ * \p hNode is also not modified by this call.
+ *
+ * If \p hNode is a device-updatable kernel node, the next upload/launch of \p hGraphExec
+ * will overwrite any previous device-side updates. Additionally, applying host updates to a
+ * device-updatable kernel node while it is being updated from the device will result in
+ * undefined behavior.
+ * 
+ * \param hGraphExec  - The executable graph in which to set the specified node
+ * \param hNode       - kernel node from the graph from which graphExec was instantiated
+ * \param nodeParams  - Updated Parameters to set
+ * 
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExecNodeSetParams,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Sets the parameters for a memcpy node in the given graphExec.
+ *
+ * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had 
+ * contained \p copyParams at instantiation.  hNode must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from hNode are ignored.
+ *
+ * The source and destination memory in \p copyParams must be allocated from the same 
+ * contexts as the original source and destination memory.  Both the instantiation-time 
+ * memory operands and the memory operands in \p copyParams must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  hNode is also 
+ * not modified by this call.
+ *
+ * Returns CUDA_ERROR_INVALID_VALUE if the memory operands' mappings changed or
+ * either the original or new memory operands are multidimensional.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Memcpy node from the graph which was used to instantiate graphExec
+ * \param copyParams - The updated parameters to set
+ * \param ctx        - Context on which to run the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExecNodeSetParams,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphMemcpyNodeSetParams,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D *copyParams, CUcontext ctx);
+
+/**
+ * \brief Sets the parameters for a memset node in the given graphExec.
+ *
+ * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had 
+ * contained \p memsetParams at instantiation.  hNode must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from hNode are ignored.
+ *
+ * Zero sized operations are not supported.
+ *
+ * The new destination pointer in memsetParams must be to the same kind of allocation
+ * as the original destination pointer and have the same context association and device mapping
+ * as the original destination pointer.
+ *
+ * Both the value and pointer address may be updated.  
+ * Changing other aspects of the memset (width, height, element size or pitch) may cause the update to be rejected.
+ * Specifically, for 2d memsets, all dimension changes are rejected.
+ * For 1d memsets, changes in height are explicitly rejected and other changes are opportunistically allowed
+ * if the resulting work maps onto the work resources already allocated for the node.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  hNode is also 
+ * not modified by this call.
+ *
+ * \param hGraphExec   - The executable graph in which to set the specified node
+ * \param hNode        - Memset node from the graph which was used to instantiate graphExec
+ * \param memsetParams - The updated parameters to set
+ * \param ctx          - Context on which to run the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExecNodeSetParams,
+ * ::cuGraphAddMemsetNode,
+ * ::cuGraphMemsetNodeSetParams,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecMemsetNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx);
+
+/**
+ * \brief Sets the parameters for a host node in the given graphExec.
+ *
+ * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had 
+ * contained \p nodeParams at instantiation.  hNode must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from hNode are ignored.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  hNode is also 
+ * not modified by this call.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Host node from the graph which was used to instantiate graphExec
+ * \param nodeParams - The updated parameters to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExecNodeSetParams,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphHostNodeSetParams,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Updates node parameters in the child graph node in the given graphExec.
+ *
+ * Updates the work represented by \p hNode in \p hGraphExec as though the nodes contained
+ * in \p hNode's graph had the parameters contained in \p childGraph's nodes at instantiation.
+ * \p hNode must remain in the graph which was used to instantiate \p hGraphExec.
+ * Changed edges to and from \p hNode are ignored.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p hNode is also 
+ * not modified by this call.
+ *
+ * The topology of \p childGraph, as well as the node insertion order,  must match that
+ * of the graph contained in \p hNode.  See ::cuGraphExecUpdate() for a list of restrictions
+ * on what can be updated in an instantiated graph.  The update is recursive, so child graph
+ * nodes contained within the top level child graph will also be updated.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Child graph node from the graph which was used to instantiate graphExec
+ * \param childGraph - The graph supplying the updated parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExecNodeSetParams,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphChildGraphNodeGetGraph,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecChildGraphNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph);
+
+/**
+ * \brief Sets the event for an event record node in the given graphExec
+ *
+ * Sets the event of an event record node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - event record node from the graph from which graphExec was instantiated
+ * \param event      - Updated event to use
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExecNodeSetParams,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphEventRecordNodeGetEvent,
+ * ::cuGraphEventWaitNodeSetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecEventRecordNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
+
+/**
+ * \brief Sets the event for an event wait node in the given graphExec
+ *
+ * Sets the event of an event wait node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - event wait node from the graph from which graphExec was instantiated
+ * \param event      - Updated event to use
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExecNodeSetParams,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphEventWaitNodeGetEvent,
+ * ::cuGraphEventRecordNodeSetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecEventWaitNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
+
+/**
+ * \brief Sets the parameters for an external semaphore signal node in the given graphExec
+ *
+ * Sets the parameters of an external semaphore signal node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * Changing \p nodeParams->numExtSems is not supported.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - semaphore signal node from the graph from which graphExec was instantiated
+ * \param nodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExecNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuImportExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecExternalSemaphoresSignalNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Sets the parameters for an external semaphore wait node in the given graphExec
+ *
+ * Sets the parameters of an external semaphore wait node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * Changing \p nodeParams->numExtSems is not supported.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - semaphore wait node from the graph from which graphExec was instantiated
+ * \param nodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExecNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuImportExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecExternalSemaphoresWaitNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Enables or disables the specified node in the given graphExec
+ *
+ * Sets \p hNode to be either enabled or disabled. Disabled nodes are functionally equivalent 
+ * to empty nodes until they are reenabled. Existing node parameters are not affected by 
+ * disabling/enabling the node.
+ *  
+ * The node is identified by the corresponding node \p hNode in the non-executable 
+ * graph, from which the executable graph was instantiated.   
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * If \p hNode is a device-updatable kernel node, the next upload/launch of \p hGraphExec
+ * will overwrite any previous device-side updates. Additionally, applying host updates to a
+ * device-updatable kernel node while it is being updated from the device will result in
+ * undefined behavior.
+ *
+ * \note Currently only kernel, memset and memcpy nodes are supported. 
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Node from the graph from which graphExec was instantiated
+ * \param isEnabled  - Node is enabled if != 0, otherwise the node is disabled
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeGetEnabled,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate,
+ * ::cuGraphLaunch
+ */
+CUresult CUDAAPI cuGraphNodeSetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int isEnabled);
+
+/**
+ * \brief Query whether a node in the given graphExec is enabled
+ *
+ * Sets isEnabled to 1 if \p hNode is enabled, or 0 if \p hNode is disabled.
+ *
+ * The node is identified by the corresponding node \p hNode in the non-executable 
+ * graph, from which the executable graph was instantiated.   
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * \note Currently only kernel, memset and memcpy nodes are supported. 
+ * \note This function will not reflect device-side updates for device-updatable kernel nodes.
+ *
+ * \param hGraphExec - The executable graph in which to query the specified node
+ * \param hNode      - Node from the graph from which graphExec was instantiated
+ * \param isEnabled  - Location to return the enabled status of the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeSetEnabled,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate,
+ * ::cuGraphLaunch
+ */
+CUresult CUDAAPI cuGraphNodeGetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int *isEnabled);
+
+/**
+ * \brief Uploads an executable graph in a stream
+ *
+ * Uploads \p hGraphExec to the device in \p hStream without executing it. Uploads of
+ * the same \p hGraphExec will be serialized. Each upload is ordered behind both any
+ * previous work in \p hStream and any previous launches of \p hGraphExec.
+ * Uses memory cached by \p hStream to back the allocations owned by \p hGraphExec.
+ *
+ * \param hGraphExec - Executable graph to upload
+ * \param hStream    - Stream in which to upload the graph
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphInstantiate,
+ * ::cuGraphLaunch,
+ * ::cuGraphExecDestroy
+ */
+CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraphExec, CUstream hStream);
+
+/**
+ * \brief Launches an executable graph in a stream
+ *
+ * Executes \p hGraphExec in \p hStream. Only one instance of \p hGraphExec may be executing
+ * at a time. Each launch is ordered behind both any previous work in \p hStream
+ * and any previous launches of \p hGraphExec. To execute a graph concurrently, it must be
+ * instantiated multiple times into multiple executable graphs.
+ *
+ * If any allocations created by \p hGraphExec remain unfreed (from a previous launch) and
+ * \p hGraphExec was not instantiated with ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH,
+ * the launch will fail with ::CUDA_ERROR_INVALID_VALUE.
+ *
+ * \param hGraphExec - Executable graph to launch
+ * \param hStream    - Stream in which to launch the graph
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphInstantiate,
+ * ::cuGraphUpload,
+ * ::cuGraphExecDestroy
+ */
+CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream);
+
+/**
+ * \brief Destroys an executable graph
+ *
+ * Destroys the executable graph specified by \p hGraphExec, as well
+ * as all of its executable nodes. If the executable graph is
+ * in-flight, it will not be terminated, but rather freed
+ * asynchronously on completion.
+ *
+ * \param hGraphExec - Executable graph to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphInstantiate,
+ * ::cuGraphUpload,
+ * ::cuGraphLaunch
+ */
+CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec);
+
+/**
+ * \brief Destroys a graph
+ *
+ * Destroys the graph specified by \p hGraph, as well as all of its nodes.
+ *
+ * \param hGraph - Graph to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph);
+
+/**
+ * \brief Check whether an executable graph can be updated with a graph and perform the update if possible
+ *
+ * Updates the node parameters in the instantiated graph specified by \p hGraphExec with the
+ * node parameters in a topologically identical graph specified by \p hGraph.
+ *
+ * Limitations:
+ *
+ * - Kernel nodes:
+ *   - The owning context of the function cannot change.
+ *   - A node whose function originally did not use CUDA dynamic parallelism cannot be updated
+ *     to a function which uses CDP.
+ *   - A node whose function originally did not make device-side update calls cannot be updated
+ *     to a function which makes device-side update calls.
+ *   - A cooperative node cannot be updated to a non-cooperative node, and vice-versa.
+ *   - If the graph was instantiated with CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY, the
+ *     priority attribute cannot change. Equality is checked on the originally requested
+ *     priority values, before they are clamped to the device's supported range.
+ *   - If \p hGraphExec was not instantiated for device launch, a node whose function originally
+ *     did not use device-side cudaGraphLaunch() cannot be updated to a function which uses
+ *     device-side cudaGraphLaunch() unless the node resides on the same context as nodes which
+ *     contained such calls at instantiate-time. If no such calls were present at instantiation,
+ *     these updates cannot be performed at all.
+ *   - Neither \p hGraph nor \p hGraphExec may contain device-updatable kernel nodes.
+ * - Memset and memcpy nodes:
+ *   - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change.
+ *   - The source/destination memory must be allocated from the same contexts as the original
+ *     source/destination memory.
+ *   - For 2d memsets, only address and assigned value may be updated.
+ *   - For 1d memsets, updating dimensions is also allowed, but may fail if the resulting operation doesn't
+ *     map onto the work resources already allocated for the node. 
+ * - Additional memcpy node restrictions:
+ *   - Changing either the source or destination memory type(i.e. CU_MEMORYTYPE_DEVICE,
+ *     CU_MEMORYTYPE_ARRAY, etc.) is not supported.
+ * - External semaphore wait nodes and signal nodes:
+ *   - Changing the number of semaphores is not supported.
+ * - Conditional nodes:
+ *   - Changing node parameters is not supported.
+ *   - Changing parameters of nodes within the conditional body graph is subject to the rules above.
+ *   - Conditional handle flags and default values are updated as part of the graph update.
+ *
+ * Note:  The API may add further restrictions in future releases.  The return code should always be checked.
+ *
+ * cuGraphExecUpdate sets the result member of \p resultInfo to CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED
+ * under the following conditions:
+ * - The count of nodes directly in \p hGraphExec and \p hGraph differ, in which case resultInfo->errorNode
+ *   is set to NULL.
+ * - \p hGraph has more exit nodes than \p hGraphExec, in which case resultInfo->errorNode is set to one of
+ *   the exit nodes in \p hGraph.
+ * - A node in \p hGraph has a different number of dependencies than the node from \p hGraphExec it is paired with,
+ *   in which case resultInfo->errorNode is set to the node from \p hGraph.
+ * - A node in \p hGraph has a dependency that does not match with the corresponding dependency of the paired node
+ *   from \p hGraphExec. resultInfo->errorNode will be set to the node from \p hGraph. resultInfo->errorFromNode
+ *   will be set to the mismatched dependency. The dependencies are paired based on edge order and a dependency
+ *   does not match when the nodes are already paired based on other edges examined in the graph.
+ *
+ * cuGraphExecUpdate sets the result member of \p resultInfo to: 
+ * - CU_GRAPH_EXEC_UPDATE_ERROR if passed an invalid value.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED if the graph topology changed
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED if the type of a node changed, in which case
+ *   resultInfo->errorNode is set to the node from \p hGraph.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE if the function changed in an unsupported
+ *   way (see note above), in which case resultInfo->errorNode is set to the node from \p hGraph.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED if any parameters to a node changed in a way
+ *   that is not supported, in which case resultInfo->errorNode is set to the node from \p hGraph.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED if any attributes of a node changed in a way
+ *   that is not supported, in which case resultInfo->errorNode is set to the node from \p hGraph.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED if something about a node is unsupported, like
+ *   the node's type or configuration, in which case resultInfo->errorNode is set to the node from \p hGraph.
+ *
+ * If the update fails for a reason not listed above, the result member of \p resultInfo will be set
+ * to CU_GRAPH_EXEC_UPDATE_ERROR. If the update succeeds, the result member will be set to CU_GRAPH_EXEC_UPDATE_SUCCESS.
+ *
+ * cuGraphExecUpdate returns CUDA_SUCCESS when the update was performed successfully.  It returns
+ * CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE if the graph update was not performed because it included 
+ * changes which violated constraints specific to instantiated graph update.
+ *
+ * \param hGraphExec The instantiated graph to be updated
+ * \param hGraph The graph containing the updated parameters
+ * \param resultInfo the error info structure 
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphExecUpdateResultInfo *resultInfo);
+
+/**
+ * \brief Copies attributes from source node to destination node.
+ *
+ * Copies attributes from source node \p src to destination node \p dst.
+ * Both nodes must have the same context.
+ *
+ * \param[out] dst Destination node
+ * \param[in] src Source node
+ * For list of attributes see ::CUkernelNodeAttrID
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuGraphKernelNodeCopyAttributes(CUgraphNode dst, CUgraphNode src);
+
+/**
+ * \brief Queries node attribute.
+ * 
+ * Queries attribute \p attr from node \p hNode and stores it in corresponding
+ * member of \p value_out.
+ *
+ * \param[in] hNode
+ * \param[in] attr
+ * \param[out] value_out 
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *  
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr,
+                                      CUkernelNodeAttrValue *value_out);
+ 
+/**
+ * \brief Sets node attribute.
+ * 
+ * Sets attribute \p attr on node \p hNode from corresponding attribute of
+ * \p value.
+ *
+ * \param[out] hNode
+ * \param[in] attr
+ * \param[in] value
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr,
+                                      const CUkernelNodeAttrValue *value);
+
+/**
+ * \brief Write a DOT file describing graph structure
+ *
+ * Using the provided \p hGraph, write to \p path a DOT formatted description of the graph.
+ * By default this includes the graph topology, node types, node id, kernel names and memcpy direction.
+ * \p flags can be specified to write more detailed information about each node type such as
+ * parameter values, kernel attributes, node and function handles.
+ *
+ * \param hGraph - The graph to create a DOT file from
+ * \param path   - The path to write the DOT file to
+ * \param flags  - Flags from CUgraphDebugDot_flags for specifying which additional node information to write
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OPERATING_SYSTEM
+ */
+CUresult CUDAAPI cuGraphDebugDotPrint(CUgraph hGraph, const char *path, unsigned int flags);
+
+/**
+ * \brief Create a user object
+ *
+ * Create a user object with the specified destructor callback and initial reference count. The
+ * initial references are owned by the caller.
+ *
+ * Destructor callbacks cannot make CUDA API calls and should avoid blocking behavior, as they
+ * are executed by a shared internal thread. Another thread may be signaled to perform such
+ * actions, if it does not block forward progress of tasks scheduled through CUDA.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param object_out      - Location to return the user object handle
+ * \param ptr             - The pointer to pass to the destroy function
+ * \param destroy         - Callback to free the user object when it is no longer in use
+ * \param initialRefcount - The initial refcount to create the object with, typically 1. The
+ *                          initial references are owned by the calling thread.
+ * \param flags           - Currently it is required to pass ::CU_USER_OBJECT_NO_DESTRUCTOR_SYNC,
+ *                          which is the only defined flag. This indicates that the destroy
+ *                          callback cannot be waited on by any CUDA API. Users requiring
+ *                          synchronization of the callback should signal its completion
+ *                          manually.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuUserObjectRetain,
+ * ::cuUserObjectRelease,
+ * ::cuGraphRetainUserObject,
+ * ::cuGraphReleaseUserObject,
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuUserObjectCreate(CUuserObject *object_out, void *ptr, CUhostFn destroy,
+                                    unsigned int initialRefcount, unsigned int flags);
+
+/**
+ * \brief Retain a reference to a user object
+ *
+ * Retains new references to a user object. The new references are owned by the caller.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param object - The object to retain
+ * \param count  - The number of references to retain, typically 1. Must be nonzero
+ *                 and not larger than INT_MAX.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuUserObjectCreate,
+ * ::cuUserObjectRelease,
+ * ::cuGraphRetainUserObject,
+ * ::cuGraphReleaseUserObject,
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuUserObjectRetain(CUuserObject object, unsigned int count);
+
+/**
+ * \brief Release a reference to a user object
+ *
+ * Releases user object references owned by the caller. The object's destructor is invoked if
+ * the reference count reaches zero.
+ *
+ * It is undefined behavior to release references not owned by the caller, or to use a user
+ * object handle after all references are released.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param object - The object to release
+ * \param count  - The number of references to release, typically 1. Must be nonzero
+ *                 and not larger than INT_MAX.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuUserObjectCreate,
+ * ::cuUserObjectRetain,
+ * ::cuGraphRetainUserObject,
+ * ::cuGraphReleaseUserObject,
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuUserObjectRelease(CUuserObject object, unsigned int count);
+
+/**
+ * \brief Retain a reference to a user object from a graph
+ *
+ * Creates or moves user object references that will be owned by a CUDA graph.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param graph  - The graph to associate the reference with
+ * \param object - The user object to retain a reference for
+ * \param count  - The number of references to add to the graph, typically 1. Must be
+ *                 nonzero and not larger than INT_MAX.
+ * \param flags  - The optional flag ::CU_GRAPH_USER_OBJECT_MOVE transfers references
+ *                 from the calling thread, rather than create new references. Pass 0
+ *                 to create new references.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuUserObjectCreate,
+ * ::cuUserObjectRetain,
+ * ::cuUserObjectRelease,
+ * ::cuGraphReleaseUserObject,
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuGraphRetainUserObject(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags);
+
+/**
+ * \brief Release a user object reference from a graph
+ *
+ * Releases user object references owned by a graph.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param graph  - The graph that will release the reference
+ * \param object - The user object to release a reference for
+ * \param count  - The number of references to release, typically 1. Must be nonzero
+ *                 and not larger than INT_MAX.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuUserObjectCreate,
+ * ::cuUserObjectRetain,
+ * ::cuUserObjectRelease,
+ * ::cuGraphRetainUserObject,
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuGraphReleaseUserObject(CUgraph graph, CUuserObject object, unsigned int count);
+
+/**
+ * \brief Adds a node of arbitrary type to a graph
+ *
+ * Creates a new node in \p hGraph described by \p nodeParams with \p numDependencies
+ * dependencies specified via \p dependencies. \p numDependencies may be 0.
+ * \p dependencies may be null if \p numDependencies is 0. \p dependencies may not have
+ * any duplicate entries.
+ *
+ * \p nodeParams is a tagged union. The node type should be specified in the \p type field,
+ * and type-specific parameters in the corresponding union member. All unused bytes - that
+ * is, \p reserved0 and all bytes past the utilized union member - must be set to zero.
+ * It is recommended to use brace initialization or memset to ensure all bytes are
+ * initialized.
+ *
+ * Note that for some node types, \p nodeParams may contain "out parameters" which are
+ * modified during the call, such as \p nodeParams->alloc.dptr.
+ *
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Specification of the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphCreate,
+ * ::cuGraphNodeSetParams,
+ * ::cuGraphExecNodeSetParams
+ */
+CUresult CUDAAPI cuGraphAddNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraphNodeParams *nodeParams);
+
+/**
+ * \brief Adds a node of arbitrary type to a graph (12.3+)
+ *
+ * Creates a new node in \p hGraph described by \p nodeParams with \p numDependencies
+ * dependencies specified via \p dependencies. \p numDependencies may be 0.
+ * \p dependencies may be null if \p numDependencies is 0. \p dependencies may not have
+ * any duplicate entries.
+ *
+ * \p nodeParams is a tagged union. The node type should be specified in the \p type field,
+ * and type-specific parameters in the corresponding union member. All unused bytes - that
+ * is, \p reserved0 and all bytes past the utilized union member - must be set to zero.
+ * It is recommended to use brace initialization or memset to ensure all bytes are
+ * initialized.
+ *
+ * Note that for some node types, \p nodeParams may contain "out parameters" which are
+ * modified during the call, such as \p nodeParams->alloc.dptr.
+ *
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param dependencyData  - Optional edge data for the dependencies. If NULL, the data is
+ *                          assumed to be default (zeroed) for all dependencies.
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Specification of the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphCreate,
+ * ::cuGraphNodeSetParams,
+ * ::cuGraphExecNodeSetParams
+ */
+CUresult CUDAAPI cuGraphAddNode_v2(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, CUgraphNodeParams *nodeParams);
+
+/**
+ * \brief Updates a graph node's parameters
+ *
+ * Sets the parameters of graph node \p hNode to \p nodeParams. The node type specified by
+ * \p nodeParams->type must match the type of \p hNode. \p nodeParams must be fully
+ * initialized and all unused bytes (reserved, padding) zeroed.
+ *
+ * Modifying parameters is not supported for node types CU_GRAPH_NODE_TYPE_MEM_ALLOC and
+ * CU_GRAPH_NODE_TYPE_MEM_FREE.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuGraphExecNodeSetParams
+ */
+CUresult CUDAAPI cuGraphNodeSetParams(CUgraphNode hNode, CUgraphNodeParams *nodeParams);
+
+/**
+ * \brief Updates a graph node's parameters in an instantiated graph
+ *
+ * Sets the parameters of a node in an executable graph \p hGraphExec. The node is identified
+ * by the corresponding node \p hNode in the non-executable graph from which the executable
+ * graph was instantiated. \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * Allowed changes to parameters on executable graphs are as follows:
+ * <table>
+ *   <tr><th>Node type<th>Allowed changes
+ *   <tr><td>kernel<td>See ::cuGraphExecKernelNodeSetParams
+ *   <tr><td>memcpy<td>Addresses for 1-dimensional copies if allocated in same context; see ::cuGraphExecMemcpyNodeSetParams
+ *   <tr><td>memset<td>Addresses for 1-dimensional memsets if allocated in same context; see ::cuGraphExecMemsetNodeSetParams
+ *   <tr><td>host<td>Unrestricted
+ *   <tr><td>child graph<td>Topology must match and restrictions apply recursively; see ::cuGraphExecUpdate
+ *   <tr><td>event wait<td>Unrestricted
+ *   <tr><td>event record<td>Unrestricted
+ *   <tr><td>external semaphore signal<td>Number of semaphore operations cannot change
+ *   <tr><td>external semaphore wait<td>Number of semaphore operations cannot change
+ *   <tr><td>memory allocation<td>API unsupported
+ *   <tr><td>memory free<td>API unsupported
+ *   <tr><td>batch memops<td>Addresses, values, and operation type for wait operations; see ::cuGraphExecBatchMemOpNodeSetParams
+ * </table>
+ *
+ * \param hGraphExec  - The executable graph in which to update the specified node
+ * \param hNode       - Corresponding node from the graph from which \p hGraphExec was instantiated
+ * \param nodeParams  - Updated Parameters to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuGraphNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraphNodeParams *nodeParams);
+
+/**
+ * \brief Create a conditional handle
+ *
+ * Creates a conditional handle associated with \p hGraph. 
+ *  
+ * The conditional handle must be associated with a conditional node in this graph or one of its children.
+ *  
+ * Handles not associated with a conditional node may cause graph instantiation to fail. 
+ *  
+ * Handles can only be set from the context with which they are associated. 
+ *
+ * \param pHandle_out        - Pointer used to return the handle to the caller.
+ * \param hGraph             - Graph which will contain the conditional node using this handle.
+ * \param ctx                - Context for the handle and associated conditional node.
+ * \param defaultLaunchValue - Optional initial value for the conditional variable.
+ *                             Applied at the beginning of each graph execution if CU_GRAPH_COND_ASSIGN_DEFAULT is set in \p flags.
+ * \param flags              - Currently must be CU_GRAPH_COND_ASSIGN_DEFAULT or 0.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode
+ */
+CUresult CUDAAPI cuGraphConditionalHandleCreate(CUgraphConditionalHandle *pHandle_out, CUgraph hGraph, CUcontext ctx, unsigned int defaultLaunchValue, unsigned int flags);
+
+/** @} */ /* END CUDA_GRAPH */
+
+/**
+ * \defgroup CUDA_OCCUPANCY Occupancy
+ *
+ * ___MANBRIEF___ occupancy calculation functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the occupancy calculation functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns occupancy of a function
+ *
+ * Returns in \p *numBlocks the number of the maximum active blocks per
+ * streaming multiprocessor.
+ *
+ * Note that the API can also be used with context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
+ * will be the current context.
+ *
+ * \param numBlocks       - Returned occupancy
+ * \param func            - Kernel for which occupancy is calculated
+ * \param blockSize       - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ */
+CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize);
+
+/**
+ * \brief Returns occupancy of a function
+ *
+ * Returns in \p *numBlocks the number of the maximum active blocks per
+ * streaming multiprocessor.
+ *
+ * The \p flags parameter controls how special cases are handled. The
+ * valid flags are:
+ *
+ * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as
+ *   ::cuOccupancyMaxActiveBlocksPerMultiprocessor;
+ *
+ * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the
+ *   default behavior on platforms where global caching affects
+ *   occupancy. On such platforms, if caching is enabled, but
+ *   per-block SM resource usage would result in zero occupancy, the
+ *   occupancy calculator will calculate the occupancy as if caching
+ *   is disabled. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE makes
+ *   the occupancy calculator return 0 in such cases. More information
+ *   can be found about this feature in the "Unified L1/Texture Cache"
+ *   section of the Maxwell tuning guide.
+ *
+ * Note that the API can also be used with context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
+ * will be the current context.
+ *
+ * \param numBlocks       - Returned occupancy
+ * \param func            - Kernel for which occupancy is calculated
+ * \param blockSize       - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ * \param flags           - Requested behavior for the occupancy calculator
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ */
+CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
+
+/**
+ * \brief Suggest a launch configuration with reasonable occupancy
+ *
+ * Returns in \p *blockSize a reasonable block size that can achieve
+ * the maximum occupancy (or, the maximum number of active warps with
+ * the fewest blocks per multiprocessor), and in \p *minGridSize the
+ * minimum grid size to achieve the maximum occupancy.
+ *
+ * If \p blockSizeLimit is 0, the configurator will use the maximum
+ * block size permitted by the device / function instead.
+ *
+ * If per-block dynamic shared memory allocation is not needed, the
+ * user should leave both \p blockSizeToDynamicSMemSize and \p
+ * dynamicSMemSize as 0.
+ *
+ * If per-block dynamic shared memory allocation is needed, then if
+ * the dynamic shared memory size is constant regardless of block
+ * size, the size should be passed through \p dynamicSMemSize, and \p
+ * blockSizeToDynamicSMemSize should be NULL.
+ *
+ * Otherwise, if the per-block dynamic shared memory size varies with
+ * different block sizes, the user needs to provide a unary function
+ * through \p blockSizeToDynamicSMemSize that computes the dynamic
+ * shared memory needed by \p func for any given block size. \p
+ * dynamicSMemSize is ignored. An example signature is:
+ *
+ * \code
+ *    // Take block size, returns dynamic shared memory needed
+ *    size_t blockToSmem(int blockSize);
+ * \endcode
+ *
+ * Note that the API can also be used with context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
+ * will be the current context.
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
+ * \param blockSize   - Returned maximum block size that can achieve the maximum occupancy
+ * \param func        - Kernel for which launch configuration is calculated
+ * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size
+ * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes
+ * \param blockSizeLimit  - The maximum block size \p func is designed to handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaOccupancyMaxPotentialBlockSize
+ */
+CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit);
+
+/**
+ * \brief Suggest a launch configuration with reasonable occupancy
+ *
+ * An extended version of ::cuOccupancyMaxPotentialBlockSize. In
+ * addition to arguments passed to ::cuOccupancyMaxPotentialBlockSize,
+ * ::cuOccupancyMaxPotentialBlockSizeWithFlags also takes a \p flags
+ * parameter.
+ *
+ * The \p flags parameter controls how special cases are handled. The
+ * valid flags are:
+ *
+ * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as
+ *   ::cuOccupancyMaxPotentialBlockSize;
+ *
+ * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the
+ *   default behavior on platforms where global caching affects
+ *   occupancy. On such platforms, the launch configurations that
+ *   produce maximal occupancy might not support global
+ *   caching. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE
+ *   guarantees that the produced launch configuration is global
+ *   caching compatible at a potential cost of occupancy. More information
+ *   can be found about this feature in the "Unified L1/Texture Cache"
+ *   section of the Maxwell tuning guide.
+ *
+ * Note that the API can also be used with context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
+ * will be the current context.
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
+ * \param blockSize   - Returned maximum block size that can achieve the maximum occupancy
+ * \param func        - Kernel for which launch configuration is calculated
+ * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size
+ * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes
+ * \param blockSizeLimit  - The maximum block size \p func is designed to handle
+ * \param flags       - Options
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ */
+CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags);
+
+/**
+ * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on SM 
+ *
+ * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. 
+ *
+ * Note that the API can also be used with context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
+ * will be the current context.
+ *
+ * \param dynamicSmemSize - Returned maximum dynamic shared memory 
+ * \param func            - Kernel function for which occupancy is calculated
+ * \param numBlocks       - Number of blocks to fit on SM 
+ * \param blockSize       - Size of the blocks
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ */
+CUresult CUDAAPI cuOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize);
+
+/**
+ * \brief Given the kernel function (\p func) and launch configuration
+ * (\p config), return the maximum cluster size in \p *clusterSize.
+ *
+ * The cluster dimensions in \p config are ignored. If \p func has a required
+ * cluster size set (see ::cudaFuncGetAttributes / ::cuFuncGetAttribute),
+ * \p *clusterSize will reflect the required cluster size.
+ *
+ * By default this function will always return a value that's portable on
+ * future hardware. A higher value may be returned if the kernel function
+ * allows non-portable cluster sizes.
+ *
+ * This function will respect the compile time launch bounds.
+ *
+ * Note that the API can also be used with context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
+ * will either be taken from the specified stream \p config->hStream
+ * or the current context in case of NULL stream.
+ *
+ * \param clusterSize - Returned maximum cluster size that can be launched
+ *                      for the given kernel function and launch configuration
+ * \param func        - Kernel function for which maximum cluster
+ *                      size is calculated
+ * \param config      - Launch configuration for the given kernel function
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaFuncGetAttributes,
+ * ::cuFuncGetAttribute
+ */
+CUresult CUDAAPI cuOccupancyMaxPotentialClusterSize(int *clusterSize, CUfunction func, const CUlaunchConfig *config);
+
+/**
+ * \brief Given the kernel function (\p func) and launch configuration
+ * (\p config), return the maximum number of clusters that could co-exist
+ * on the target device in \p *numClusters.
+ *
+ * If the function has required cluster size already set (see
+ * ::cudaFuncGetAttributes / ::cuFuncGetAttribute), the cluster size
+ * from config must either be unspecified or match the required size.
+ * Without required sizes, the cluster size must be specified in config,
+ * else the function will return an error.
+ *
+ * Note that various attributes of the kernel function may affect occupancy
+ * calculation. Runtime environment may affect how the hardware schedules
+ * the clusters, so the calculated occupancy is not guaranteed to be achievable.
+ *
+ * Note that the API can also be used with context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
+ * will either be taken from the specified stream \p config->hStream
+ * or the current context in case of NULL stream.
+ *
+ * \param numClusters - Returned maximum number of clusters that
+ *                      could co-exist on the target device
+ * \param func        - Kernel function for which maximum number
+ *                      of clusters is calculated
+ * \param config      - Launch configuration for the given kernel function
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_CLUSTER_SIZE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaFuncGetAttributes,
+ * ::cuFuncGetAttribute
+ */
+CUresult CUDAAPI cuOccupancyMaxActiveClusters(int *numClusters, CUfunction func, const CUlaunchConfig *config);
+/** @} */ /* END CUDA_OCCUPANCY */
+
+/**
+ * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated texture reference management functions of the
+ * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the deprecated texture reference management
+ * functions of the low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Binds an array as a texture reference
+ *
+ * \deprecated
+ *
+ * Binds the CUDA array \p hArray to the texture reference \p hTexRef. Any
+ * previous address or CUDA array state associated with the texture reference
+ * is superseded by this function. \p Flags must be set to
+ * ::CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to \p hTexRef is
+ * unbound.
+ *
+ * \param hTexRef - Texture reference to bind
+ * \param hArray  - Array to bind
+ * \param Flags   - Options (must be ::CU_TRSA_OVERRIDE_FORMAT)
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
+
+/**
+ * \brief Binds a mipmapped array to a texture reference
+ *
+ * \deprecated
+ *
+ * Binds the CUDA mipmapped array \p hMipmappedArray to the texture reference \p hTexRef.
+ * Any previous address or CUDA array state associated with the texture reference
+ * is superseded by this function. \p Flags must be set to ::CU_TRSA_OVERRIDE_FORMAT.
+ * Any CUDA array previously bound to \p hTexRef is unbound.
+ *
+ * \param hTexRef         - Texture reference to bind
+ * \param hMipmappedArray - Mipmapped array to bind
+ * \param Flags           - Options (must be ::CU_TRSA_OVERRIDE_FORMAT)
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags);
+
+/**
+ * \brief Binds an address as a texture reference
+ *
+ * \deprecated
+ *
+ * Binds a linear address range to the texture reference \p hTexRef. Any
+ * previous address or CUDA array state associated with the texture reference
+ * is superseded by this function. Any memory previously bound to \p hTexRef
+ * is unbound.
+ *
+ * Since the hardware enforces an alignment requirement on texture base
+ * addresses, ::cuTexRefSetAddress() passes back a byte offset in
+ * \p *ByteOffset that must be applied to texture fetches in order to read from
+ * the desired memory. This offset must be divided by the texel size and
+ * passed to kernels that read from the texture so they can be applied to the
+ * ::tex1Dfetch() function.
+ *
+ * If the device memory pointer was returned from ::cuMemAlloc(), the offset
+ * is guaranteed to be 0 and NULL may be passed as the \p ByteOffset parameter.
+ *
+ * The total number of elements (or texels) in the linear address range
+ * cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH.
+ * The number of elements is computed as (\p bytes / bytesPerElement),
+ * where bytesPerElement is determined from the data format and number of
+ * components set using ::cuTexRefSetFormat().
+ *
+ * \param ByteOffset - Returned byte offset
+ * \param hTexRef    - Texture reference to bind
+ * \param dptr       - Device pointer to bind
+ * \param bytes      - Size of memory to bind in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
+
+/**
+ * \brief Binds an address as a 2D texture reference
+ *
+ * \deprecated
+ *
+ * Binds a linear address range to the texture reference \p hTexRef. Any
+ * previous address or CUDA array state associated with the texture reference
+ * is superseded by this function. Any memory previously bound to \p hTexRef
+ * is unbound.
+ *
+ * Using a ::tex2D() function inside a kernel requires a call to either
+ * ::cuTexRefSetArray() to bind the corresponding texture reference to an
+ * array, or ::cuTexRefSetAddress2D() to bind the texture reference to linear
+ * memory.
+ *
+ * Function calls to ::cuTexRefSetFormat() cannot follow calls to
+ * ::cuTexRefSetAddress2D() for the same texture reference.
+ *
+ * It is required that \p dptr be aligned to the appropriate hardware-specific
+ * texture alignment. You can query this value using the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned \p dptr is
+ * supplied, ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * \p Pitch has to be aligned to the hardware-specific texture pitch alignment.
+ * This value can be queried using the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. If an unaligned \p Pitch is
+ * supplied, ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * Width and Height, which are specified in elements (or texels), cannot exceed
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively.
+ * \p Pitch, which is specified in bytes, cannot exceed
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH.
+ *
+ * \param hTexRef - Texture reference to bind
+ * \param desc    - Descriptor of CUDA array
+ * \param dptr    - Device pointer to bind
+ * \param Pitch   - Line pitch in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
+
+/**
+ * \brief Sets the format for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the format of the data to be read by the texture reference
+ * \p hTexRef. \p fmt and \p NumPackedComponents are exactly analogous to the
+ * ::Format and ::NumChannels members of the ::CUDA_ARRAY_DESCRIPTOR structure:
+ * They specify the format of each component and the number of components per
+ * array element.
+ *
+ * \param hTexRef             - Texture reference
+ * \param fmt                 - Format to set
+ * \param NumPackedComponents - Number of components per array element
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
+ * ::cudaCreateChannelDesc
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
+
+/**
+ * \brief Sets the addressing mode for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the addressing mode \p am for the given dimension \p dim of the
+ * texture reference \p hTexRef. If \p dim is zero, the addressing mode is
+ * applied to the first parameter of the functions used to fetch from the
+ * texture; if \p dim is 1, the second, and so on. ::CUaddress_mode is defined
+ * as:
+ * \code
+   typedef enum CUaddress_mode_enum {
+      CU_TR_ADDRESS_MODE_WRAP = 0,
+      CU_TR_ADDRESS_MODE_CLAMP = 1,
+      CU_TR_ADDRESS_MODE_MIRROR = 2,
+      CU_TR_ADDRESS_MODE_BORDER = 3
+   } CUaddress_mode;
+ * \endcode
+ *
+ * Note that this call has no effect if \p hTexRef is bound to linear memory.
+ * Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES, is not set, the only
+ * supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP.
+ *
+ * \param hTexRef - Texture reference
+ * \param dim     - Dimension
+ * \param am      - Addressing mode to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am);
+
+/**
+ * \brief Sets the filtering mode for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the filtering mode \p fm to be used when reading memory through
+ * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as:
+ *
+ * \code
+   typedef enum CUfilter_mode_enum {
+      CU_TR_FILTER_MODE_POINT = 0,
+      CU_TR_FILTER_MODE_LINEAR = 1
+   } CUfilter_mode;
+ * \endcode
+ *
+ * Note that this call has no effect if \p hTexRef is bound to linear memory.
+ *
+ * \param hTexRef - Texture reference
+ * \param fm      - Filtering mode to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm);
+
+/**
+ * \brief Sets the mipmap filtering mode for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the mipmap filtering mode \p fm to be used when reading memory through
+ * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as:
+ *
+ * \code
+   typedef enum CUfilter_mode_enum {
+      CU_TR_FILTER_MODE_POINT = 0,
+      CU_TR_FILTER_MODE_LINEAR = 1
+   } CUfilter_mode;
+ * \endcode
+ *
+ * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array.
+ *
+ * \param hTexRef - Texture reference
+ * \param fm      - Filtering mode to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm);
+
+/**
+ * \brief Sets the mipmap level bias for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the mipmap level bias \p bias to be added to the specified mipmap level when
+ * reading memory through the texture reference \p hTexRef.
+ *
+ * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array.
+ *
+ * \param hTexRef - Texture reference
+ * \param bias    - Mipmap level bias
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias);
+
+/**
+ * \brief Sets the min/max mipmap level clamps for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the min/max mipmap level clamps, \p minMipmapLevelClamp and \p maxMipmapLevelClamp
+ * respectively, to be used when reading memory through the texture reference
+ * \p hTexRef.
+ *
+ * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array.
+ *
+ * \param hTexRef        - Texture reference
+ * \param minMipmapLevelClamp - Mipmap min level clamp
+ * \param maxMipmapLevelClamp - Mipmap max level clamp
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp);
+
+/**
+ * \brief Sets the maximum anisotropy for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the maximum anisotropy \p maxAniso to be used when reading memory through
+ * the texture reference \p hTexRef.
+ *
+ * Note that this call has no effect if \p hTexRef is bound to linear memory.
+ *
+ * \param hTexRef  - Texture reference
+ * \param maxAniso - Maximum anisotropy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso);
+
+/**
+ * \brief Sets the border color for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the value of the RGBA color via the \p pBorderColor to the texture reference
+ * \p hTexRef. The color value supports only float type and holds color components in
+ * the following sequence:
+ * pBorderColor[0] holds 'R' component
+ * pBorderColor[1] holds 'G' component
+ * pBorderColor[2] holds 'B' component
+ * pBorderColor[3] holds 'A' component
+ *
+ * Note that the color values can be set only when the Address mode is set to
+ * CU_TR_ADDRESS_MODE_BORDER using ::cuTexRefSetAddressMode.
+ * Applications using integer border color values have to "reinterpret_cast" their values to float.
+ *
+ * \param hTexRef       - Texture reference
+ * \param pBorderColor  - RGBA color
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddressMode,
+ * ::cuTexRefGetAddressMode, ::cuTexRefGetBorderColor
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor);
+
+/**
+ * \brief Sets the flags for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies optional flags via \p Flags to specify the behavior of data
+ * returned through the texture reference \p hTexRef. The valid flags are:
+ *
+ * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of
+ *   having the texture promote integer data to floating point data in the
+ *   range [0, 1]. Note that texture with 32-bit integer format
+ *   would not be promoted, regardless of whether or not this
+ *   flag is specified;
+ * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the
+ *   default behavior of having the texture coordinates range
+ *   from [0, Dim) where Dim is the width or height of the CUDA
+ *   array. Instead, the texture coordinates [0, 1.0) reference
+ *   the entire breadth of the array dimension;
+ * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear
+ *   filtering optimizations. Trilinear optimizations improve texture filtering
+ *   performance by allowing bilinear filtering on textures in scenarios where
+ *   it can closely approximate the expected results.
+ *
+ * \param hTexRef - Texture reference
+ * \param Flags   - Optional flags to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags);
+
+/**
+ * \brief Gets the address associated with a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *pdptr the base address bound to the texture reference
+ * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
+ * is not bound to any device memory range.
+ *
+ * \param pdptr   - Returned device address
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
+
+/**
+ * \brief Gets the array bound to a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *phArray the CUDA array bound to the texture reference
+ * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
+ * is not bound to any CUDA array.
+ *
+ * \param phArray - Returned array
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef);
+
+/**
+ * \brief Gets the mipmapped array bound to a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *phMipmappedArray the CUDA mipmapped array bound to the texture
+ * reference \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
+ * is not bound to any CUDA mipmapped array.
+ *
+ * \param phMipmappedArray - Returned mipmapped array
+ * \param hTexRef          - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef);
+
+/**
+ * \brief Gets the addressing mode used by a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *pam the addressing mode corresponding to the
+ * dimension \p dim of the texture reference \p hTexRef. Currently, the only
+ * valid values for \p dim are 0 and 1.
+ *
+ * \param pam     - Returned addressing mode
+ * \param hTexRef - Texture reference
+ * \param dim     - Dimension
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
+
+/**
+ * \brief Gets the filter-mode used by a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *pfm the filtering mode of the texture reference
+ * \p hTexRef.
+ *
+ * \param pfm     - Returned filtering mode
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
+
+/**
+ * \brief Gets the format used by a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *pFormat and \p *pNumChannels the format and number
+ * of components of the CUDA array bound to the texture reference \p hTexRef.
+ * If \p pFormat or \p pNumChannels is NULL, it will be ignored.
+ *
+ * \param pFormat      - Returned format
+ * \param pNumChannels - Returned number of components
+ * \param hTexRef      - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
+
+/**
+ * \brief Gets the mipmap filtering mode for a texture reference
+ *
+ * \deprecated
+ *
+ * Returns the mipmap filtering mode in \p pfm that's used when reading memory through
+ * the texture reference \p hTexRef.
+ *
+ * \param pfm     - Returned mipmap filtering mode
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
+
+/**
+ * \brief Gets the mipmap level bias for a texture reference
+ *
+ * \deprecated
+ *
+ * Returns the mipmap level bias in \p pbias that's added to the specified mipmap
+ * level when reading memory through the texture reference \p hTexRef.
+ *
+ * \param pbias   - Returned mipmap level bias
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef);
+
+/**
+ * \brief Gets the min/max mipmap level clamps for a texture reference
+ *
+ * \deprecated
+ *
+ * Returns the min/max mipmap level clamps in \p pminMipmapLevelClamp and \p pmaxMipmapLevelClamp
+ * that are used when reading memory through the texture reference \p hTexRef.
+ *
+ * \param pminMipmapLevelClamp - Returned mipmap min level clamp
+ * \param pmaxMipmapLevelClamp - Returned mipmap max level clamp
+ * \param hTexRef              - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef);
+
+/**
+ * \brief Gets the maximum anisotropy for a texture reference
+ *
+ * \deprecated
+ *
+ * Returns the maximum anisotropy in \p pmaxAniso that's used when reading memory through
+ * the texture reference \p hTexRef.
+ *
+ * \param pmaxAniso - Returned maximum anisotropy
+ * \param hTexRef   - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef);
+
+/**
+ * \brief Gets the border color used by a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p pBorderColor, values of the RGBA color used by
+ * the texture reference \p hTexRef.
+ * The color value is of type float and holds color components in
+ * the following sequence:
+ * pBorderColor[0] holds 'R' component
+ * pBorderColor[1] holds 'G' component
+ * pBorderColor[2] holds 'B' component
+ * pBorderColor[3] holds 'A' component
+ *
+ * \param hTexRef  - Texture reference
+ * \param pBorderColor   - Returned Type and Value of RGBA color
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddressMode,
+ * ::cuTexRefGetAddressMode, ::cuTexRefSetBorderColor
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef);
+
+/**
+ * \brief Gets the flags used by a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *pFlags the flags of the texture reference \p hTexRef.
+ *
+ * \param pFlags  - Returned flags
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef);
+
+/**
+ * \brief Creates a texture reference
+ *
+ * \deprecated
+ *
+ * Creates a texture reference and returns its handle in \p *pTexRef. Once
+ * created, the application must call ::cuTexRefSetArray() or
+ * ::cuTexRefSetAddress() to associate the reference with allocated memory.
+ * Other texture reference functions are used to specify the format and
+ * interpretation (addressing, filtering, etc.) to be used when the memory is
+ * read through this texture reference.
+ *
+ * \param pTexRef - Returned texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefDestroy
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef);
+
+/**
+ * \brief Destroys a texture reference
+ *
+ * \deprecated
+ *
+ * Destroys the texture reference specified by \p hTexRef.
+ *
+ * \param hTexRef - Texture reference to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefCreate
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef);
+
+/** @} */ /* END CUDA_TEXREF_DEPRECATED */
+
+
+/**
+ * \defgroup CUDA_SURFREF_DEPRECATED Surface Reference Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ surface reference management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the surface reference management functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Sets the CUDA array for a surface reference.
+ *
+ * \deprecated
+ *
+ * Sets the CUDA array \p hArray to be read and written by the surface reference
+ * \p hSurfRef.  Any previous CUDA array state associated with the surface
+ * reference is superseded by this function.  \p Flags must be set to 0.
+ * The ::CUDA_ARRAY3D_SURFACE_LDST flag must have been set for the CUDA array.
+ * Any CUDA array previously bound to \p hSurfRef is unbound.
+ *
+ * \param hSurfRef - Surface reference handle
+ * \param hArray - CUDA array handle
+ * \param Flags - set to 0
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuModuleGetSurfRef,
+ * ::cuSurfRefGetArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
+
+/**
+ * \brief Passes back the CUDA array bound to a surface reference.
+ *
+ * \deprecated
+ *
+ * Returns in \p *phArray the CUDA array bound to the surface reference
+ * \p hSurfRef, or returns ::CUDA_ERROR_INVALID_VALUE if the surface reference
+ * is not bound to any CUDA array.
+ *
+ * \param phArray - Returned CUDA array handle
+ * \param hSurfRef - Surface reference handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuModuleGetSurfRef, ::cuSurfRefSetArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);
+
+/** @} */ /* END CUDA_SURFREF_DEPRECATED */
+
+/**
+ * \defgroup CUDA_TEXOBJECT Texture Object Management
+ *
+ * ___MANBRIEF___ texture object management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the texture object management functions of the
+ * low-level CUDA driver application programming interface. The texture
+ * object API is only supported on devices of compute capability 3.0 or higher.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates a texture object
+ *
+ * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes
+ * the data to texture from. \p pTexDesc describes how the data should be sampled.
+ * \p pResViewDesc is an optional argument that specifies an alternate format for
+ * the data described by \p pResDesc, and also describes the subresource region
+ * to restrict access to when texturing. \p pResViewDesc can only be specified if
+ * the type of resource is a CUDA array or a CUDA mipmapped array not in a block
+ * compressed format.
+ *
+ * Texture objects are only supported on devices of compute capability 3.0 or higher.
+ * Additionally, a texture object is an opaque value, and, as such, should only be
+ * accessed through CUDA API calls.
+ *
+ * The ::CUDA_RESOURCE_DESC structure is defined as:
+ * \code
+        typedef struct CUDA_RESOURCE_DESC_st
+        {
+            CUresourcetype resType;
+
+            union {
+                struct {
+                    CUarray hArray;
+                } array;
+                struct {
+                    CUmipmappedArray hMipmappedArray;
+                } mipmap;
+                struct {
+                    CUdeviceptr devPtr;
+                    CUarray_format format;
+                    unsigned int numChannels;
+                    size_t sizeInBytes;
+                } linear;
+                struct {
+                    CUdeviceptr devPtr;
+                    CUarray_format format;
+                    unsigned int numChannels;
+                    size_t width;
+                    size_t height;
+                    size_t pitchInBytes;
+                } pitch2D;
+            } res;
+
+            unsigned int flags;
+        } CUDA_RESOURCE_DESC;
+
+ * \endcode
+ * where:
+ * - ::CUDA_RESOURCE_DESC::resType specifies the type of resource to texture from.
+ * ::CUresourcetype is defined as:
+ * \code
+        typedef enum CUresourcetype_enum {
+            CU_RESOURCE_TYPE_ARRAY           = 0x00,
+            CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01,
+            CU_RESOURCE_TYPE_LINEAR          = 0x02,
+            CU_RESOURCE_TYPE_PITCH2D         = 0x03
+        } CUresourcetype;
+ * \endcode
+ *
+ * \par
+ * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_ARRAY, ::CUDA_RESOURCE_DESC::res::array::hArray
+ * must be set to a valid CUDA array handle.
+ *
+ * \par
+ * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY, ::CUDA_RESOURCE_DESC::res::mipmap::hMipmappedArray
+ * must be set to a valid CUDA mipmapped array handle.
+ *
+ * \par
+ * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_LINEAR, ::CUDA_RESOURCE_DESC::res::linear::devPtr
+ * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT.
+ * ::CUDA_RESOURCE_DESC::res::linear::format and ::CUDA_RESOURCE_DESC::res::linear::numChannels
+ * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::linear::sizeInBytes
+ * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. The number of elements is computed as (sizeInBytes / (sizeof(format) * numChannels)).
+ *
+ * \par
+ * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_PITCH2D, ::CUDA_RESOURCE_DESC::res::pitch2D::devPtr
+ * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT.
+ * ::CUDA_RESOURCE_DESC::res::pitch2D::format and ::CUDA_RESOURCE_DESC::res::pitch2D::numChannels
+ * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::pitch2D::width
+ * and ::CUDA_RESOURCE_DESC::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively.
+ * ::CUDA_RESOURCE_DESC::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to
+ * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. Pitch cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH.
+ *
+ * - ::CUDA_RESOURCE_DESC::flags must be set to zero.
+ *
+ *
+ * The ::CUDA_TEXTURE_DESC struct is defined as
+ * \code
+        typedef struct CUDA_TEXTURE_DESC_st {
+            CUaddress_mode addressMode[3];
+            CUfilter_mode filterMode;
+            unsigned int flags;
+            unsigned int maxAnisotropy;
+            CUfilter_mode mipmapFilterMode;
+            float mipmapLevelBias;
+            float minMipmapLevelClamp;
+            float maxMipmapLevelClamp;
+        } CUDA_TEXTURE_DESC;
+ * \endcode
+ * where
+ * - ::CUDA_TEXTURE_DESC::addressMode specifies the addressing mode for each dimension of the texture data. ::CUaddress_mode is defined as:
+ *   \code
+        typedef enum CUaddress_mode_enum {
+            CU_TR_ADDRESS_MODE_WRAP = 0,
+            CU_TR_ADDRESS_MODE_CLAMP = 1,
+            CU_TR_ADDRESS_MODE_MIRROR = 2,
+            CU_TR_ADDRESS_MODE_BORDER = 3
+        } CUaddress_mode;
+ *   \endcode
+ *   This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES
+ *   is not set, the only supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP.
+ *
+ * - ::CUDA_TEXTURE_DESC::filterMode specifies the filtering mode to be used when fetching from the texture. CUfilter_mode is defined as:
+ *   \code
+        typedef enum CUfilter_mode_enum {
+            CU_TR_FILTER_MODE_POINT = 0,
+            CU_TR_FILTER_MODE_LINEAR = 1
+        } CUfilter_mode;
+ *   \endcode
+ *   This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR.
+ *
+ * - ::CUDA_TEXTURE_DESC::flags can be any combination of the following:
+ *   - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of
+ *   having the texture promote integer data to floating point data in the
+ *   range [0, 1]. Note that texture with 32-bit integer format would not be 
+ *   promoted, regardless of whether or not this flag is specified.
+ *   - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior
+ *   of having the texture coordinates range from [0, Dim) where Dim is the 
+ *   width or height of the CUDA array. Instead, the texture coordinates 
+ *   [0, 1.0) reference the entire breadth of the array dimension; Note that
+ *   for CUDA mipmapped arrays, this flag has to be set.
+ *   - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear
+ *   filtering optimizations. Trilinear optimizations improve texture filtering
+ *   performance by allowing bilinear filtering on textures in scenarios where
+ *   it can closely approximate the expected results.
+ *   - ::CU_TRSF_SEAMLESS_CUBEMAP, which enables seamless cube map filtering. 
+ *   This flag can only be specified if the underlying resource is a CUDA array 
+ *   or a CUDA mipmapped array that was created with the flag ::CUDA_ARRAY3D_CUBEMAP.
+ *   When seamless cube map filtering is enabled, texture address modes specified 
+ *   by ::CUDA_TEXTURE_DESC::addressMode are ignored. Instead, if the ::CUDA_TEXTURE_DESC::filterMode 
+ *   is set to ::CU_TR_FILTER_MODE_POINT the address mode ::CU_TR_ADDRESS_MODE_CLAMP 
+ *   will be applied for all dimensions. If the ::CUDA_TEXTURE_DESC::filterMode is 
+ *   set to ::CU_TR_FILTER_MODE_LINEAR seamless cube map filtering will be performed
+ *   when sampling along the cube face borders.
+ *
+ * - ::CUDA_TEXTURE_DESC::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be
+ *   clamped to the range [1,16].
+ *
+ * - ::CUDA_TEXTURE_DESC::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels.
+ *
+ * - ::CUDA_TEXTURE_DESC::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level.
+ *
+ * - ::CUDA_TEXTURE_DESC::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to.
+ *
+ * - ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to.
+ *
+ *
+ * The ::CUDA_RESOURCE_VIEW_DESC struct is defined as
+ * \code
+        typedef struct CUDA_RESOURCE_VIEW_DESC_st
+        {
+            CUresourceViewFormat format;
+            size_t width;
+            size_t height;
+            size_t depth;
+            unsigned int firstMipmapLevel;
+            unsigned int lastMipmapLevel;
+            unsigned int firstLayer;
+            unsigned int lastLayer;
+        } CUDA_RESOURCE_VIEW_DESC;
+ * \endcode
+ * where:
+ * - ::CUDA_RESOURCE_VIEW_DESC::format specifies how the data contained in the CUDA array or CUDA mipmapped array should
+ *   be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block
+ *   compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a base format of ::CU_AD_FORMAT_UNSIGNED_INT32
+ *   with 2 or 4 channels, depending on the block compressed format. For ex., BC1 and BC4 require the underlying CUDA array to have
+ *   a format of ::CU_AD_FORMAT_UNSIGNED_INT32 with 2 channels. The other BC formats require the underlying resource to have the same base
+ *   format but with 4 channels.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::width specifies the new width of the texture data. If the resource view format is a block
+ *   compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats,
+ *   this value has to be equal to that of the original resource.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::height specifies the new height of the texture data. If the resource view format is a block
+ *   compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats,
+ *   this value has to be equal to that of the original resource.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::depth specifies the new depth of the texture data. This value has to be equal to that of the
+ *   original resource.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero.
+ *   For non-mipmapped resources, this value has to be zero. ::CUDA_TEXTURE_DESC::minMipmapLevelClamp and ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp
+ *   will be relative to this value. For ex., if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified,
+ *   then the actual minimum mipmap level clamp will be 3.2.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value
+ *   has to be zero.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::firstLayer specifies the first layer index for layered textures. This will be the new layer zero.
+ *   For non-layered resources, this value has to be zero.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::lastLayer specifies the last layer index for layered textures. For non-layered resources,
+ *   this value has to be zero.
+ *
+ *
+ * \param pTexObject   - Texture object to create
+ * \param pResDesc     - Resource descriptor
+ * \param pTexDesc     - Texture descriptor
+ * \param pResViewDesc - Resource view descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexObjectDestroy,
+ * ::cudaCreateTextureObject
+ */
+CUresult CUDAAPI cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc);
+
+/**
+ * \brief Destroys a texture object
+ *
+ * Destroys the texture object specified by \p texObject.
+ *
+ * \param texObject - Texture object to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexObjectCreate,
+ * ::cudaDestroyTextureObject
+ */
+CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject);
+
+/**
+ * \brief Returns a texture object's resource descriptor
+ *
+ * Returns the resource descriptor for the texture object specified by \p texObject.
+ *
+ * \param pResDesc  - Resource descriptor
+ * \param texObject - Texture object
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexObjectCreate,
+ * ::cudaGetTextureObjectResourceDesc
+ */
+CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUtexObject texObject);
+
+/**
+ * \brief Returns a texture object's texture descriptor
+ *
+ * Returns the texture descriptor for the texture object specified by \p texObject.
+ *
+ * \param pTexDesc  - Texture descriptor
+ * \param texObject - Texture object
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexObjectCreate,
+ * ::cudaGetTextureObjectTextureDesc
+ */
+CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc, CUtexObject texObject);
+
+/**
+ * \brief Returns a texture object's resource view descriptor
+ *
+ * Returns the resource view descriptor for the texture object specified by \p texObject.
+ * If no resource view was set for \p texObject, ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * \param pResViewDesc - Resource view descriptor
+ * \param texObject    - Texture object
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexObjectCreate,
+ * ::cudaGetTextureObjectResourceViewDesc
+ */
+CUresult CUDAAPI cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject);
+
+/** @} */ /* END CUDA_TEXOBJECT */
+
+/**
+ * \defgroup CUDA_SURFOBJECT Surface Object Management
+ *
+ * ___MANBRIEF___ surface object management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the surface object management functions of the
+ * low-level CUDA driver application programming interface. The surface
+ * object API is only supported on devices of compute capability 3.0 or higher.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates a surface object
+ *
+ * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes
+ * the data to perform surface load/stores on. ::CUDA_RESOURCE_DESC::resType must be
+ * ::CU_RESOURCE_TYPE_ARRAY and ::CUDA_RESOURCE_DESC::res::array::hArray
+ * must be set to a valid CUDA array handle. ::CUDA_RESOURCE_DESC::flags must be set to zero.
+ *
+ * Surface objects are only supported on devices of compute capability 3.0 or higher.
+ * Additionally, a surface object is an opaque value, and, as such, should only be
+ * accessed through CUDA API calls.
+ *
+ * \param pSurfObject - Surface object to create
+ * \param pResDesc    - Resource descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuSurfObjectDestroy,
+ * ::cudaCreateSurfaceObject
+ */
+CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject, const CUDA_RESOURCE_DESC *pResDesc);
+
+/**
+ * \brief Destroys a surface object
+ *
+ * Destroys the surface object specified by \p surfObject.
+ *
+ * \param surfObject - Surface object to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuSurfObjectCreate,
+ * ::cudaDestroySurfaceObject
+ */
+CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject);
+
+/**
+ * \brief Returns a surface object's resource descriptor
+ *
+ * Returns the resource descriptor for the surface object specified by \p surfObject.
+ *
+ * \param pResDesc   - Resource descriptor
+ * \param surfObject - Surface object
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuSurfObjectCreate,
+ * ::cudaGetSurfaceObjectResourceDesc
+ */
+CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsurfObject surfObject);
+
+/** @} */ /* END CUDA_SURFOBJECT */
+
+/**
+ * \defgroup CUDA_TENSOR_MEMORY Tensor Map Object Management
+ *
+ * ___MANBRIEF___ tensor map object management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the tensor map object management functions of the
+ * low-level CUDA driver application programming interface. The tensor
+ * core API is only supported on devices of compute capability 9.0 or higher.
+ *
+ * @{
+ */
+
+/**
+ * \brief Create a tensor map descriptor object representing tiled memory region
+ *
+ * Creates a descriptor for Tensor Memory Access (TMA) object specified
+ * by the parameters describing a tiled region and returns it in \p tensorMap.
+ *
+ * Tensor map objects are only supported on devices of compute capability 9.0 or higher.
+ * Additionally, a tensor map object is an opaque value, and, as such, should only be
+ * accessed through CUDA APIs and PTX.
+ *
+ * The parameters passed are bound to the following requirements:
+ *
+ * - \p tensorMap address must be aligned to 64 bytes.
+ *
+ * - \p tensorDataType has to be an enum from ::CUtensorMapDataType which is defined as:
+ * \code
+    typedef enum CUtensorMapDataType_enum {
+        CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0,       // 1 byte
+        CU_TENSOR_MAP_DATA_TYPE_UINT16,          // 2 bytes
+        CU_TENSOR_MAP_DATA_TYPE_UINT32,          // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_INT32,           // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_UINT64,          // 8 bytes
+        CU_TENSOR_MAP_DATA_TYPE_INT64,           // 8 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT16,         // 2 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT32,         // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT64,         // 8 bytes
+        CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,        // 2 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ,     // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_TFLOAT32,        // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ,    // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B,    // 4 bits
+        CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B,   // 4 bits
+        CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B    // 6 bits
+    } CUtensorMapDataType;
+ * \endcode
+ *  ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B copies '16 x U4' packed values to memory aligned as 8 bytes. There are no gaps between packed values.
+ *  ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B copies '16 x U4' packed values to memory aligned as 16 bytes. There are 8 byte gaps between every 8 byte chunk of packed values.
+ *  ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B copies '16 x U6' packed values to memory aligned as 16 bytes. There are 4 byte gaps between every 12 byte chunk of packed values.
+ *
+ * - \p tensorRank must be non-zero and less than or equal to the maximum supported dimensionality of 5. If \p interleave is not
+ * ::CU_TENSOR_MAP_INTERLEAVE_NONE, then \p tensorRank must additionally be greater than or equal to 3.
+ *
+ * - \p globalAddress, which specifies the starting address of the memory region described, must be 16 byte aligned. The following requirements need to also be met:
+ *    - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p globalAddress must be 32 byte aligned.
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p globalAddress must be 32 byte aligned.
+ *
+ * - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or
+ * equal to 2^32. Additionally, the following requirements need to be met for the packed data types:
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, globalDim[0] must be a multiple of 128.
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, \p globalDim[0] must be a multiple of 2.
+ *    - Dimension for the packed data types must reflect the number of individual U# values.
+ *
+ * - \p globalStrides array, which specifies tensor stride of each of the lower \p tensorRank - 1 dimensions in bytes, must be a
+ * multiple of 16 and less than 2^40. Additionally, the following requirements need to be met:
+ *    - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, the strides must be a multiple of 32.
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, the strides must be a multiple of 32.
+ * Each following dimension specified includes previous dimension stride:
+ * \code
+    globalStrides[0] = globalDim[0] * elementSizeInBytes(tensorDataType) + padding[0];
+    for (i = 1; i < tensorRank - 1; i++) {
+        globalStrides[i] = globalStrides[i - 1] * (globalDim[i] + padding[i]);
+        assert(globalStrides[i] >= globalDim[i]);
+    }
+ * \endcode
+ *
+ * - \p boxDim array, which specifies number of elements to be traversed along each of the \p tensorRank dimensions, must be non-zero
+ * and less than or equal to 256. Additionally, the following requirements need to be met:
+ *    - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, { \p boxDim[0] * elementSizeInBytes( \p tensorDataType ) } must be a multiple of 16 bytes.
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, boxDim[0] must be 128.
+ *
+ * - \p elementStrides array, which specifies the iteration step along each of the \p tensorRank dimensions, must be non-zero and less
+ * than or equal to 8. Note that when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the first element of this array is ignored since
+ * TMA doesn’t support the stride for dimension zero.
+ * When all elements of the \p elementStrides array are one, \p boxDim specifies the number of elements to load. However, if \p elementStrides[i]
+ * is not equal to one for some \p i, then TMA loads ceil( \p boxDim[i] / \p elementStrides[i]) number of elements along i-th dimension. To load N elements along
+ * i-th dimension, \p boxDim[i] must be set to N * \p elementStrides[i].
+ *
+ * - \p interleave specifies the interleaved layout of type ::CUtensorMapInterleave, which is defined as:
+ * \code
+    typedef enum CUtensorMapInterleave_enum {
+        CU_TENSOR_MAP_INTERLEAVE_NONE = 0,
+        CU_TENSOR_MAP_INTERLEAVE_16B,
+        CU_TENSOR_MAP_INTERLEAVE_32B
+    } CUtensorMapInterleave;
+ * \endcode
+ * TMA supports interleaved layouts like NC/8HWC8 where C8 utilizes 16 bytes in memory assuming 2 byte per channel or NC/16HWC16 where C16
+ * uses 32 bytes.
+ * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE and \p swizzle is not ::CU_TENSOR_MAP_SWIZZLE_NONE, the bounding box inner dimension
+ * (computed as \p boxDim[0] multiplied by element size derived from \p tensorDataType) must be less than or equal to the swizzle size.
+ *    - CU_TENSOR_MAP_SWIZZLE_32B requires the bounding box inner dimension to be <= 32.
+ *    - CU_TENSOR_MAP_SWIZZLE_64B requires the bounding box inner dimension to be <= 64.
+ *    - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner dimension to be <= 128.
+ * Additionally, \p tensorDataType of ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B requires \p interleave to be ::CU_TENSOR_MAP_INTERLEAVE_NONE.
+ *
+ * - \p swizzle, which specifies the shared memory bank swizzling pattern, has to be of type ::CUtensorMapSwizzle which is defined as:
+ * \code
+    typedef enum CUtensorMapSwizzle_enum {
+        CU_TENSOR_MAP_SWIZZLE_NONE = 0,
+        CU_TENSOR_MAP_SWIZZLE_32B,                   // Swizzle 16B chunks within 32B  span
+        CU_TENSOR_MAP_SWIZZLE_64B,                   // Swizzle 16B chunks within 64B  span
+        CU_TENSOR_MAP_SWIZZLE_128B,                  // Swizzle 16B chunks within 128B span
+        CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B,         // Swizzle 32B chunks within 128B span
+        CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B_FLIP_8B, // Swizzle 32B chunks within 128B span, additionally swap lower 8B with upper 8B within each 16B for every alternate row
+        CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B          // Swizzle 64B chunks within 128B span
+    } CUtensorMapSwizzle;
+ * \endcode
+ * Data are organized in a specific order in global memory; however, this may not match the order in which the application accesses data
+ * in shared memory. This difference in data organization may cause bank conflicts when shared memory is accessed. In order to avoid this
+ * problem, data can be loaded to shared memory with shuffling across shared memory banks.
+ * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p swizzle must be ::CU_TENSOR_MAP_SWIZZLE_32B.
+ * Other interleave modes can have any swizzling pattern.
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B, only the following swizzle modes are supported:
+ *    - CU_TENSOR_MAP_SWIZZLE_NONE (Load & Store)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B (Load & Store)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B (Store only)
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, only the following swizzle modes are supported:
+ *    - CU_TENSOR_MAP_SWIZZLE_NONE (Load only)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B (Load only)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only)
+ *
+ * - \p l2Promotion specifies L2 fetch size which indicates the byte granularity at which L2 requests are filled from DRAM. It must be of
+ * type ::CUtensorMapL2promotion, which is defined as:
+ * \code
+    typedef enum CUtensorMapL2promotion_enum {
+        CU_TENSOR_MAP_L2_PROMOTION_NONE = 0,
+        CU_TENSOR_MAP_L2_PROMOTION_L2_64B,
+        CU_TENSOR_MAP_L2_PROMOTION_L2_128B,
+        CU_TENSOR_MAP_L2_PROMOTION_L2_256B
+    } CUtensorMapL2promotion;
+ * \endcode
+ *
+ * - \p oobFill, which indicates whether zero or a special NaN constant should be used to fill out-of-bound elements, must be of type
+ * ::CUtensorMapFloatOOBfill which is defined as:
+ * \code
+    typedef enum CUtensorMapFloatOOBfill_enum {
+        CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0,
+        CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
+    } CUtensorMapFloatOOBfill;
+ * \endcode
+ * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type,
+ * and when \p tensorDataType is not ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, and ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B.
+ *
+ * \param tensorMap         - Tensor map object to create
+ * \param tensorDataType    - Tensor data type
+ * \param tensorRank        - Dimensionality of tensor
+ * \param globalAddress     - Starting address of memory region described by tensor
+ * \param globalDim         - Array containing tensor size (number of elements) along each of the \p tensorRank dimensions
+ * \param globalStrides     - Array containing stride size (in bytes) along each of the \p tensorRank - 1 dimensions
+ * \param boxDim            - Array containing traversal box size (number of elements) along each of the \p tensorRank dimensions. Specifies how many elements to be traversed along each tensor dimension.
+ * \param elementStrides    - Array containing traversal stride in each of the \p tensorRank dimensions
+ * \param interleave        - Type of interleaved layout the tensor addresses
+ * \param swizzle           - Bank swizzling pattern inside shared memory
+ * \param l2Promotion       - L2 promotion size
+ * \param oobFill           - Indicate whether zero or special NaN constant must be used to fill out-of-bound elements
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTensorMapEncodeIm2col,
+ * ::cuTensorMapEncodeIm2colWide,
+ * ::cuTensorMapReplaceAddress
+ */
+CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, const cuuint32_t *boxDim, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);
+
+/**
+ * \brief Create a tensor map descriptor object representing im2col memory region
+ *
+ * Creates a descriptor for Tensor Memory Access (TMA) object specified
+ * by the parameters describing a im2col memory layout and returns it in \p tensorMap.
+ *
+ * Tensor map objects are only supported on devices of compute capability 9.0 or higher.
+ * Additionally, a tensor map object is an opaque value, and, as such, should only be
+ * accessed through CUDA APIs and PTX.
+ *
+ * The parameters passed are bound to the following requirements:
+ *
+ * - \p tensorMap address must be aligned to 64 bytes.
+ *
+ * - \p tensorDataType has to be an enum from ::CUtensorMapDataType which is defined as:
+ * \code
+    typedef enum CUtensorMapDataType_enum {
+        CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0,       // 1 byte
+        CU_TENSOR_MAP_DATA_TYPE_UINT16,          // 2 bytes
+        CU_TENSOR_MAP_DATA_TYPE_UINT32,          // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_INT32,           // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_UINT64,          // 8 bytes
+        CU_TENSOR_MAP_DATA_TYPE_INT64,           // 8 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT16,         // 2 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT32,         // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT64,         // 8 bytes
+        CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,        // 2 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ,     // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_TFLOAT32,        // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ,    // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B,    // 4 bits
+        CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B,   // 4 bits
+        CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B    // 6 bits
+    } CUtensorMapDataType;
+ * \endcode
+ *  ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B copies '16 x U4' packed values to memory aligned as 8 bytes. There are no gaps between packed values.
+ *  ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B copies '16 x U4' packed values to memory aligned as 16 bytes. There are 8 byte gaps between every 8 byte chunk of packed values.
+ *  ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B copies '16 x U6' packed values to memory aligned as 16 bytes. There are 4 byte gaps between every 12 byte chunk of packed values.
+ *
+ * - \p tensorRank, which specifies the number of tensor dimensions, must be 3, 4, or 5.
+ *
+ * - \p globalAddress, which specifies the starting address of the memory region described, must be 16 byte aligned. The following requirements need to also be met:
+ *    - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p globalAddress must be 32 byte aligned.
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p globalAddress must be 32 byte aligned.
+ *
+ * - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or
+ * equal to 2^32. Additionally, the following requirements need to be met for the packed data types:
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, globalDim[0] must be a multiple of 128.
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, \p globalDim[0] must be a multiple of 2.
+ *    - Dimension for the packed data types must reflect the number of individual U# values.
+ *
+ * - \p globalStrides array, which specifies tensor stride of each of the lower \p tensorRank - 1 dimensions in bytes, must be a
+ * multiple of 16 and less than 2^40. Additionally, the following requirements need to be met:
+ *    - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, the strides must be a multiple of 32.
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, the strides must be a multiple of 32.
+ * Each following dimension specified includes previous dimension stride:
+ * \code
+    globalStrides[0] = globalDim[0] * elementSizeInBytes(tensorDataType) + padding[0];
+    for (i = 1; i < tensorRank - 1; i++) {
+        globalStrides[i] = globalStrides[i - 1] * (globalDim[i] + padding[i]);
+        assert(globalStrides[i] >= globalDim[i]);
+    }
+ * \endcode
+ *
+ * - \p pixelBoxLowerCorner array specifies the coordinate offsets {D, H, W} of the bounding box from top/left/front corner. The number of
+ * offsets and their precision depend on the tensor dimensionality:
+ *    - When \p tensorRank is 3, one signed offset within range [-32768, 32767] is supported.
+ *    - When \p tensorRank is 4, two signed offsets each within range [-128, 127] are supported.
+ *    - When \p tensorRank is 5, three signed offsets each within range [-16, 15] are supported.
+ *
+ * - \p pixelBoxUpperCorner array specifies the coordinate offsets {D, H, W} of the bounding box from bottom/right/back corner. The number of
+ * offsets and their precision depend on the tensor dimensionality:
+ *    - When \p tensorRank is 3, one signed offset within range [-32768, 32767] is supported.
+ *    - When \p tensorRank is 4, two signed offsets each within range [-128, 127] are supported.
+ *    - When \p tensorRank is 5, three signed offsets each within range [-16, 15] are supported.
+ * The bounding box specified by \p pixelBoxLowerCorner and \p pixelBoxUpperCorner must have non-zero area.
+ *
+ * - \p channelsPerPixel, which specifies the number of elements which must be accessed along C dimension, must be less than or equal to 256.
+ * Additionally, when \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p channelsPerPixel must be 128.
+ *
+ * - \p pixelsPerColumn, which specifies the number of elements that must be accessed along the {N, D, H, W} dimensions, must be less than or
+ * equal to 1024.
+ *
+ * - \p elementStrides array, which specifies the iteration step along each of the \p tensorRank dimensions, must be non-zero and less
+ * than or equal to 8. Note that when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the first element of this array is ignored since
+ * TMA doesn’t support the stride for dimension zero.
+ * When all elements of the \p elementStrides array are one, \p boxDim specifies the number of elements to load. However, if \p elementStrides[i]
+ * is not equal to one for some \p i, then TMA loads ceil( \p boxDim[i] / \p elementStrides[i]) number of elements along i-th dimension.
+ * To load N elements along i-th dimension, \p boxDim[i] must be set to N * \p elementStrides[i].
+ *
+ * - \p interleave specifies the interleaved layout of type ::CUtensorMapInterleave, which is defined as:
+ * \code
+    typedef enum CUtensorMapInterleave_enum {
+        CU_TENSOR_MAP_INTERLEAVE_NONE = 0,
+        CU_TENSOR_MAP_INTERLEAVE_16B,
+        CU_TENSOR_MAP_INTERLEAVE_32B
+    } CUtensorMapInterleave;
+ * \endcode
+ * TMA supports interleaved layouts like NC/8HWC8 where C8 utilizes 16 bytes in memory assuming 2 byte per channel or NC/16HWC16 where C16
+ * uses 32 bytes.
+ * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE and \p swizzle is not ::CU_TENSOR_MAP_SWIZZLE_NONE, the bounding box inner dimension
+ * (computed as \p channelsPerPixel multiplied by element size in bytes derived from \p tensorDataType) must be less than or equal to the swizzle size.
+ *    - CU_TENSOR_MAP_SWIZZLE_32B requires the bounding box inner dimension to be <= 32.
+ *    - CU_TENSOR_MAP_SWIZZLE_64B requires the bounding box inner dimension to be <= 64.
+ *    - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner dimension to be <= 128.
+ * Additionally, \p tensorDataType of ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B requires \p interleave to be ::CU_TENSOR_MAP_INTERLEAVE_NONE.
+ *
+ * - \p swizzle, which specifies the shared memory bank swizzling pattern, has to be of type ::CUtensorMapSwizzle which is defined as:
+ * \code
+    typedef enum CUtensorMapSwizzle_enum {
+        CU_TENSOR_MAP_SWIZZLE_NONE = 0,
+        CU_TENSOR_MAP_SWIZZLE_32B,                   // Swizzle 16B chunks within 32B  span
+        CU_TENSOR_MAP_SWIZZLE_64B,                   // Swizzle 16B chunks within 64B  span
+        CU_TENSOR_MAP_SWIZZLE_128B,                  // Swizzle 16B chunks within 128B span
+        CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B,         // Swizzle 32B chunks within 128B span
+        CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B_FLIP_8B, // Swizzle 32B chunks within 128B span, additionally swap lower 8B with upper 8B within each 16B for every alternate row
+        CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B          // Swizzle 64B chunks within 128B span
+    } CUtensorMapSwizzle;
+ * \endcode
+ * Data are organized in a specific order in global memory; however, this may not match the order in which the application accesses data
+ * in shared memory. This difference in data organization may cause bank conflicts when shared memory is accessed. In order to avoid this
+ * problem, data can be loaded to shared memory with shuffling across shared memory banks.
+ * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p swizzle must be ::CU_TENSOR_MAP_SWIZZLE_32B.
+ * Other interleave modes can have any swizzling pattern.
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B, only the following swizzle modes are supported:
+ *    - CU_TENSOR_MAP_SWIZZLE_NONE (Load & Store)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B (Load & Store)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B (Store only)
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, only the following swizzle modes are supported:
+ *    - CU_TENSOR_MAP_SWIZZLE_NONE (Load only)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B (Load only)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only)
+ *
+ * - \p l2Promotion specifies L2 fetch size which indicates the byte granularity at which L2 requests are filled from DRAM. It must be of
+ * type ::CUtensorMapL2promotion, which is defined as:
+ * \code
+    typedef enum CUtensorMapL2promotion_enum {
+        CU_TENSOR_MAP_L2_PROMOTION_NONE = 0,
+        CU_TENSOR_MAP_L2_PROMOTION_L2_64B,
+        CU_TENSOR_MAP_L2_PROMOTION_L2_128B,
+        CU_TENSOR_MAP_L2_PROMOTION_L2_256B
+    } CUtensorMapL2promotion;
+ * \endcode
+ *
+ * - \p oobFill, which indicates whether zero or a special NaN constant should be used to fill out-of-bound elements, must be of type
+ * ::CUtensorMapFloatOOBfill which is defined as:
+ * \code
+    typedef enum CUtensorMapFloatOOBfill_enum {
+        CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0,
+        CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
+    } CUtensorMapFloatOOBfill;
+ * \endcode
+ * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type,
+ * and when \p tensorDataType is not ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, and ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B.
+ *
+ * \param tensorMap             - Tensor map object to create
+ * \param tensorDataType        - Tensor data type
+ * \param tensorRank            - Dimensionality of tensor; must be at least 3
+ * \param globalAddress         - Starting address of memory region described by tensor
+ * \param globalDim             - Array containing tensor size (number of elements) along each of the \p tensorRank dimensions
+ * \param globalStrides         - Array containing stride size (in bytes) along each of the \p tensorRank - 1 dimensions
+ * \param pixelBoxLowerCorner   - Array containing DHW dimensions of lower box corner
+ * \param pixelBoxUpperCorner   - Array containing DHW dimensions of upper box corner
+ * \param channelsPerPixel      - Number of channels per pixel
+ * \param pixelsPerColumn       - Number of pixels per column
+ * \param elementStrides        - Array containing traversal stride in each of the \p tensorRank dimensions
+ * \param interleave            - Type of interleaved layout the tensor addresses
+ * \param swizzle               - Bank swizzling pattern inside shared memory
+ * \param l2Promotion           - L2 promotion size
+ * \param oobFill               - Indicate whether zero or special NaN constant will be used to fill out-of-bound elements
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTensorMapEncodeTiled,
+ * ::cuTensorMapEncodeIm2colWide,
+ * ::cuTensorMapReplaceAddress
+ */
+CUresult CUDAAPI cuTensorMapEncodeIm2col(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, const int *pixelBoxLowerCorner, const int *pixelBoxUpperCorner, cuuint32_t channelsPerPixel, cuuint32_t pixelsPerColumn, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);
+
+/**
+ * \brief Create a tensor map descriptor object representing im2col memory region, but where
+ * the elements are exclusively loaded along the W dimension.
+ *
+ * Creates a descriptor for Tensor Memory Access (TMA) object specified by the parameters
+ * describing a im2col memory layout and where the row is always loaded along the W dimension
+ * and returns it in \p tensorMap. This assumes the tensor layout in memory is either NDHWC,
+ * NHWC, or NWC.
+ *
+ * This API is only supported on devices of compute capability 10.0 or higher.
+ * Additionally, a tensor map object is an opaque value, and, as such, should only be
+ * accessed through CUDA APIs and PTX.
+ *
+ * The parameters passed are bound to the following requirements:
+ *
+ * - \p tensorMap address must be aligned to 64 bytes.
+ *
+ * - \p tensorDataType has to be an enum from ::CUtensorMapDataType which is defined as:
+ * \code
+    typedef enum CUtensorMapDataType_enum {
+        CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0,       // 1 byte
+        CU_TENSOR_MAP_DATA_TYPE_UINT16,          // 2 bytes
+        CU_TENSOR_MAP_DATA_TYPE_UINT32,          // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_INT32,           // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_UINT64,          // 8 bytes
+        CU_TENSOR_MAP_DATA_TYPE_INT64,           // 8 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT16,         // 2 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT32,         // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT64,         // 8 bytes
+        CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,        // 2 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ,     // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_TFLOAT32,        // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ,    // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B,    // 4 bits
+        CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B,   // 4 bits
+        CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B    // 6 bits
+    } CUtensorMapDataType;
+ * \endcode
+ *  ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B copies '16 x U4' packed values to memory aligned as 8 bytes. There are no gaps between packed values.
+ *  ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B copies '16 x U4' packed values to memory aligned as 16 bytes. There are 8 byte gaps between every 8 byte chunk of packed values.
+ *  ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B copies '16 x U6' packed values to memory aligned as 16 bytes. There are 4 byte gaps between every 12 byte chunk of packed values.
+ *
+ * - \p tensorRank, which specifies the number of tensor dimensions, must be 3, 4, or 5.
+ *
+ * - \p globalAddress, which specifies the starting address of the memory region described, must be 16 byte aligned. The following requirements need to also be met:
+ *    - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p globalAddress must be 32 byte aligned.
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p globalAddress must be 32 byte aligned.
+ *
+ * - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or
+ * equal to 2^32. Additionally, the following requirements need to be met for the packed data types:
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, globalDim[0] must be a multiple of 128.
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, \p globalDim[0] must be a multiple of 2.
+ *    - Dimension for the packed data types must reflect the number of individual U# values.
+ *
+ * - \p globalStrides array, which specifies tensor stride of each of the lower \p tensorRank - 1 dimensions in bytes, must be a
+ * multiple of 16 and less than 2^40. Additionally, the following requirements need to be met:
+ *    - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, the strides must be a multiple of 32.
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, the strides must be a multiple of 32.
+ * Each following dimension specified includes previous dimension stride:
+ * \code
+    globalStrides[0] = globalDim[0] * elementSizeInBytes(tensorDataType) + padding[0];
+    for (i = 1; i < tensorRank - 1; i++) {
+        globalStrides[i] = globalStrides[i - 1] * (globalDim[i] + padding[i]);
+        assert(globalStrides[i] >= globalDim[i]);
+    }
+ * \endcode
+ *
+ * - \p pixelBoxLowerCornerWidth specifies the coordinate offset W of the bounding box from left corner. The offset must be
+ * within range [-32768, 32767].
+ *
+ * - \p pixelBoxUpperCornerWidth specifies the coordinate offset W of the bounding box from right corner. The offset must be
+ * within range [-32768, 32767].
+ *
+ * The bounding box specified by \p pixelBoxLowerCornerWidth and \p pixelBoxUpperCornerWidth must have non-zero area. Note
+ * that the size of the box along D and H dimensions is always equal to one.
+ *
+ * - \p channelsPerPixel, which specifies the number of elements which must be accessed along C dimension, must be less than or equal to 256.
+ * Additionally, when \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p channelsPerPixel must be 128.
+ *
+ * - \p pixelsPerColumn, which specifies the number of elements that must be accessed along the W dimension, must be less than or
+ * equal to 1024. This field is ignored when \p mode is ::CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128.
+ *
+ * - \p elementStrides array, which specifies the iteration step along each of the \p tensorRank dimensions, must be non-zero and less
+ * than or equal to 8. Note that when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the first element of this array is ignored since
+ * TMA doesn't support the stride for dimension zero.
+ * When all elements of the \p elementStrides array are one, \p boxDim specifies the number of elements to load. However, if \p elementStrides[i]
+ * is not equal to one for some \p i, then TMA loads ceil( \p boxDim[i] / \p elementStrides[i]) number of elements along i-th dimension.
+ * To load N elements along i-th dimension, \p boxDim[i] must be set to N * \p elementStrides[i].
+ *
+ * - \p interleave specifies the interleaved layout of type ::CUtensorMapInterleave, which is defined as:
+ * \code
+    typedef enum CUtensorMapInterleave_enum {
+        CU_TENSOR_MAP_INTERLEAVE_NONE = 0,
+        CU_TENSOR_MAP_INTERLEAVE_16B,
+        CU_TENSOR_MAP_INTERLEAVE_32B
+    } CUtensorMapInterleave;
+ * \endcode
+ * TMA supports interleaved layouts like NC/8HWC8 where C8 utilizes 16 bytes in memory assuming 2 byte per channel or NC/16HWC16 where C16
+ * uses 32 bytes.
+ * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the bounding box inner dimension (computed as \p channelsPerPixel multiplied by
+ * element size in bytes derived from \p tensorDataType) must be less than or equal to the swizzle size.
+ *    - CU_TENSOR_MAP_SWIZZLE_64B requires the bounding box inner dimension to be <= 64.
+ *    - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner dimension to be <= 128.
+ * Additionally, \p tensorDataType of ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B requires \p interleave to be ::CU_TENSOR_MAP_INTERLEAVE_NONE.
+ *
+ * - \p mode, which describes how elements are loaded along the W dimension, must be one of the following ::CUtensorMapIm2ColWideMode types:
+ * \code
+ *          CU_TENSOR_MAP_IM2COL_WIDE_MODE_W,
+ *          CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128
+ * \endcode
+ * ::CU_TENSOR_MAP_IM2COL_WIDE_MODE_W allows the number of elements loaded along the W dimension to be specified
+ * via the \p pixelsPerColumn field.
+ *
+ * - \p swizzle, which specifies the shared memory bank swizzling pattern, must be one of the following
+ * ::CUtensorMapSwizzle modes (other swizzle modes are not supported):
+ * \code
+    typedef enum CUtensorMapSwizzle_enum {
+        CU_TENSOR_MAP_SWIZZLE_64B,                   // Swizzle 16B chunks within 64B  span
+        CU_TENSOR_MAP_SWIZZLE_128B,                  // Swizzle 16B chunks within 128B span
+        CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B,         // Swizzle 32B chunks within 128B span
+    } CUtensorMapSwizzle;
+ * \endcode
+ * Data are organized in a specific order in global memory; however, this may not match the order in which the application accesses data
+ * in shared memory. This difference in data organization may cause bank conflicts when shared memory is accessed. In order to avoid this
+ * problem, data can be loaded to shared memory with shuffling across shared memory banks.
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B, only the following swizzle modes are supported:
+ *    - CU_TENSOR_MAP_SWIZZLE_128B (Load & Store)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store)
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, only the following swizzle modes are supported:
+ *    - CU_TENSOR_MAP_SWIZZLE_128B (Load only)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only)
+ *
+ * - \p l2Promotion specifies L2 fetch size which indicates the byte granularity at which L2 requests are filled from DRAM. It must be of
+ * type ::CUtensorMapL2promotion, which is defined as:
+ * \code
+    typedef enum CUtensorMapL2promotion_enum {
+        CU_TENSOR_MAP_L2_PROMOTION_NONE = 0,
+        CU_TENSOR_MAP_L2_PROMOTION_L2_64B,
+        CU_TENSOR_MAP_L2_PROMOTION_L2_128B,
+        CU_TENSOR_MAP_L2_PROMOTION_L2_256B
+    } CUtensorMapL2promotion;
+ * \endcode
+ *
+ * - \p oobFill, which indicates whether zero or a special NaN constant should be used to fill out-of-bound elements, must be of type
+ * ::CUtensorMapFloatOOBfill which is defined as:
+ * \code
+    typedef enum CUtensorMapFloatOOBfill_enum {
+        CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0,
+        CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
+    } CUtensorMapFloatOOBfill;
+ * \endcode
+ * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type,
+ * and when \p tensorDataType is not ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, or ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B.
+ *
+ * \param tensorMap                - Tensor map object to create
+ * \param tensorDataType           - Tensor data type
+ * \param tensorRank               - Dimensionality of tensor; must be at least 3
+ * \param globalAddress            - Starting address of memory region described by tensor
+ * \param globalDim                - Array containing tensor size (number of elements) along each of the \p tensorRank dimensions
+ * \param globalStrides            - Array containing stride size (in bytes) along each of the \p tensorRank - 1 dimensions
+ * \param pixelBoxLowerCornerWidth - Width offset of left box corner
+ * \param pixelBoxUpperCornerWidth - Width offset of right box corner
+ * \param channelsPerPixel         - Number of channels per pixel
+ * \param pixelsPerColumn          - Number of pixels per column
+ * \param elementStrides           - Array containing traversal stride in each of the \p tensorRank dimensions
+ * \param interleave               - Type of interleaved layout the tensor addresses
+ * \param mode                     - W or W128 mode
+ * \param swizzle                  - Bank swizzling pattern inside shared memory
+ * \param l2Promotion              - L2 promotion size
+ * \param oobFill                  - Indicate whether zero or special NaN constant will be used to fill out-of-bound elements
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTensorMapEncodeTiled,
+ * ::cuTensorMapEncodeIm2col,
+ * ::cuTensorMapReplaceAddress
+ */
+CUresult CUDAAPI cuTensorMapEncodeIm2colWide(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, int pixelBoxLowerCornerWidth, int pixelBoxUpperCornerWidth, cuuint32_t channelsPerPixel, cuuint32_t pixelsPerColumn, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapIm2ColWideMode mode, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);
+
+/**
+ * \brief Modify an existing tensor map descriptor with an updated global address
+ *
+ * Modifies the descriptor for Tensor Memory Access (TMA) object passed in \p tensorMap with
+ * an updated \p globalAddress.
+ *
+ * Tensor map objects are only supported on devices of compute capability 9.0 or higher.
+ * Additionally, a tensor map object is an opaque value, and, as such, should only be
+ * accessed through CUDA API calls.
+ *
+ * \param tensorMap             - Tensor map object to modify
+ * \param globalAddress         - Starting address of memory region described by tensor, must follow previous alignment requirements
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTensorMapEncodeTiled,
+ * ::cuTensorMapEncodeIm2col,
+ * ::cuTensorMapEncodeIm2colWide
+ */
+CUresult CUDAAPI cuTensorMapReplaceAddress(CUtensorMap *tensorMap, void *globalAddress);
+
+/** @} */
+/* END CUDA_TENSOR_MEMORY */
+
+/**
+ * \defgroup CUDA_PEER_ACCESS Peer Context Memory Access
+ *
+ * ___MANBRIEF___ direct peer context memory access functions of the low-level
+ * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the direct peer context memory access functions
+ * of the low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Queries if a device may directly access a peer device's memory.
+ *
+ * Returns in \p *canAccessPeer a value of 1 if contexts on \p dev are capable of
+ * directly accessing memory from contexts on \p peerDev and 0 otherwise.
+ * If direct access of \p peerDev from \p dev is possible, then access may be
+ * enabled on two specific contexts by calling ::cuCtxEnablePeerAccess().
+ *
+ * \param canAccessPeer - Returned access capability
+ * \param dev           - Device from which allocations on \p peerDev are to
+ *                        be directly accessed.
+ * \param peerDev       - Device on which the allocations to be directly accessed
+ *                        by \p dev reside.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuCtxEnablePeerAccess,
+ * ::cuCtxDisablePeerAccess,
+ * ::cudaDeviceCanAccessPeer
+ */
+CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev);
+
+/**
+ * \brief Enables direct access to memory allocations in a peer context.
+ *
+ * If both the current context and \p peerContext are on devices which support unified
+ * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) and same
+ * major compute capability, then on success all allocations from \p peerContext will
+ * immediately be accessible by the current context.  See \ref CUDA_UNIFIED for additional
+ * details.
+ *
+ * Note that access granted by this call is unidirectional and that in order to access
+ * memory from the current context in \p peerContext, a separate symmetric call
+ * to ::cuCtxEnablePeerAccess() is required.
+ *
+ * Note that there are both device-wide and system-wide limitations per system
+ * configuration, as noted in the CUDA Programming Guide under the section
+ * "Peer-to-Peer Memory Access".
+ *
+ * Returns ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED if ::cuDeviceCanAccessPeer() indicates
+ * that the ::CUdevice of the current context cannot directly access memory
+ * from the ::CUdevice of \p peerContext.
+ *
+ * Returns ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED if direct access of
+ * \p peerContext from the current context has already been enabled.
+ *
+ * Returns ::CUDA_ERROR_TOO_MANY_PEERS if direct peer access is not possible
+ * because hardware resources required for peer access have been exhausted.
+ *
+ * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, \p peerContext
+ * is not a valid context, or if the current context is \p peerContext.
+ *
+ * Returns ::CUDA_ERROR_INVALID_VALUE if \p Flags is not 0.
+ *
+ * \param peerContext - Peer context to enable direct access to from the current context
+ * \param Flags       - Reserved for future use and must be set to 0
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED,
+ * ::CUDA_ERROR_TOO_MANY_PEERS,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceCanAccessPeer,
+ * ::cuCtxDisablePeerAccess,
+ * ::cudaDeviceEnablePeerAccess
+ */
+CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags);
+
+/**
+ * \brief Disables direct access to memory allocations in a peer context and
+ * unregisters any registered allocations.
+ *
+ * Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has
+ * not yet been enabled from \p peerContext to the current context.
+ *
+ * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, or if
+ * \p peerContext is not a valid context.
+ *
+ * \param peerContext - Peer context to disable direct access to
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceCanAccessPeer,
+ * ::cuCtxEnablePeerAccess,
+ * ::cudaDeviceDisablePeerAccess
+ */
+CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext);
+
+/**
+ * \brief Queries attributes of the link between two devices.
+ *
+ * Returns in \p *value the value of the requested attribute \p attrib of the
+ * link between \p srcDevice and \p dstDevice. The supported attributes are:
+ * - ::CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK: A relative value indicating the
+ *   performance of the link between two devices.
+ * - ::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED: 1 if P2P access is enabled.
+ * - ::CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED: 1 if Atomic operations over
+ *   the link are supported.
+ * - ::CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED: 1 if cudaArray can
+ *   be accessed over the link.
+ *
+ * Returns ::CUDA_ERROR_INVALID_DEVICE if \p srcDevice or \p dstDevice are not valid
+ * or if they represent the same device.
+ *
+ * Returns ::CUDA_ERROR_INVALID_VALUE if \p attrib is not valid or if \p value is
+ * a null pointer.
+ *
+ * \param value         - Returned value of the requested attribute
+ * \param attrib        - The requested attribute of the link between \p srcDevice and \p dstDevice.
+ * \param srcDevice     - The source device of the target link.
+ * \param dstDevice     - The destination device of the target link.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuCtxEnablePeerAccess,
+ * ::cuCtxDisablePeerAccess,
+ * ::cuDeviceCanAccessPeer,
+ * ::cudaDeviceGetP2PAttribute
+ */
+CUresult CUDAAPI cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice);
+
+/** @} */ /* END CUDA_PEER_ACCESS */
+
+/**
+ * \defgroup CUDA_GRAPHICS Graphics Interoperability
+ *
+ * ___MANBRIEF___ graphics interoperability functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the graphics interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Unregisters a graphics resource for access by CUDA
+ *
+ * Unregisters the graphics resource \p resource so it is not accessible by
+ * CUDA unless registered again.
+ *
+ * If \p resource is invalid then ::CUDA_ERROR_INVALID_HANDLE is
+ * returned.
+ *
+ * \param resource - Resource to unregister
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsD3D9RegisterResource,
+ * ::cuGraphicsD3D10RegisterResource,
+ * ::cuGraphicsD3D11RegisterResource,
+ * ::cuGraphicsGLRegisterBuffer,
+ * ::cuGraphicsGLRegisterImage,
+ * ::cudaGraphicsUnregisterResource
+ */
+CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource);
+
+/**
+ * \brief Get an array through which to access a subresource of a mapped graphics resource.
+ *
+ * Returns in \p *pArray an array through which the subresource of the mapped
+ * graphics resource \p resource which corresponds to array index \p arrayIndex
+ * and mipmap level \p mipLevel may be accessed.  The value set in \p *pArray may
+ * change every time that \p resource is mapped.
+ *
+ * If \p resource is not a texture then it cannot be accessed via an array and
+ * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned.
+ * If \p arrayIndex is not a valid array index for \p resource then
+ * ::CUDA_ERROR_INVALID_VALUE is returned.
+ * If \p mipLevel is not a valid mipmap level for \p resource then
+ * ::CUDA_ERROR_INVALID_VALUE is returned.
+ * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * \param pArray      - Returned array through which a subresource of \p resource may be accessed
+ * \param resource    - Mapped resource to access
+ * \param arrayIndex  - Array index for array textures or cubemap face
+ *                      index as defined by ::CUarray_cubemap_face for
+ *                      cubemap textures for the subresource to access
+ * \param mipLevel    - Mipmap level for the subresource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED,
+ * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsResourceGetMappedPointer,
+ * ::cudaGraphicsSubResourceGetMappedArray
+ */
+CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
+
+/**
+ * \brief Get a mipmapped array through which to access a mapped graphics resource.
+ *
+ * Returns in \p *pMipmappedArray a mipmapped array through which the mapped graphics
+ * resource \p resource may be accessed. The value set in \p *pMipmappedArray may change
+ * every time that \p resource is mapped.
+ *
+ * If \p resource is not a texture then it cannot be accessed via a mipmapped array and
+ * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned.
+ * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * \param pMipmappedArray - Returned mipmapped array through which \p resource may be accessed
+ * \param resource        - Mapped resource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED,
+ * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsResourceGetMappedPointer,
+ * ::cudaGraphicsResourceGetMappedMipmappedArray
+ */
+CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource);
+
+/**
+ * \brief Get a device pointer through which to access a mapped graphics resource.
+ *
+ * Returns in \p *pDevPtr a pointer through which the mapped graphics resource
+ * \p resource may be accessed.
+ * Returns in \p *pSize the size of the memory in bytes which may be accessed from that pointer.
+ * The value set in \p *pDevPtr may change every time that \p resource is mapped.
+ *
+ * If \p resource is not a buffer then it cannot be accessed via a pointer and
+ * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER is returned.
+ * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * \param pDevPtr    - Returned pointer through which \p resource may be accessed
+ * \param pSize      - Returned size of the buffer accessible starting at \p *pDevPtr
+ * \param resource   - Mapped resource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED,
+ * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsMapResources,
+ * ::cuGraphicsSubResourceGetMappedArray,
+ * ::cudaGraphicsResourceGetMappedPointer
+ */
+CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource);
+
+/**
+ * \brief Set usage flags for mapping a graphics resource
+ *
+ * Set \p flags for mapping the graphics resource \p resource.
+ *
+ * Changes to \p flags will take effect the next time \p resource is mapped.
+ * The \p flags argument may be any of the following:
+ *
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA kernels.  This is the default value.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which
+ *   access this resource will not write to this resource.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels
+ *   which access this resource will not read from this resource and will
+ *   write over the entire contents of the resource, so none of the data
+ *   previously stored in the resource will be preserved.
+ *
+ * If \p resource is presently mapped for access by CUDA then
+ * ::CUDA_ERROR_ALREADY_MAPPED is returned.
+ * If \p flags is not one of the above values then ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * \param resource - Registered resource to set flags for
+ * \param flags    - Parameters for resource mapping
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsMapResources,
+ * ::cudaGraphicsResourceSetMapFlags
+ */
+CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
+
+/**
+ * \brief Map graphics resources for access by CUDA
+ *
+ * Maps the \p count graphics resources in \p resources for access by CUDA.
+ *
+ * The resources in \p resources may be accessed by CUDA until they
+ * are unmapped. The graphics API from which \p resources were registered
+ * should not access any resources while they are mapped by CUDA. If an
+ * application does so, the results are undefined.
+ *
+ * This function provides the synchronization guarantee that any graphics calls
+ * issued before ::cuGraphicsMapResources() will complete before any subsequent CUDA
+ * work issued in \p hStream begins.
+ *
+ * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned.
+ * If any of \p resources are presently mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned.
+ *
+ * \param count      - Number of resources to map
+ * \param resources  - Resources to map for CUDA usage
+ * \param hStream    - Stream with which to synchronize
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_UNKNOWN
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsResourceGetMappedPointer,
+ * ::cuGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsUnmapResources,
+ * ::cudaGraphicsMapResources
+ */
+CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+
+/**
+ * \brief Unmap graphics resources.
+ *
+ * Unmaps the \p count graphics resources in \p resources.
+ *
+ * Once unmapped, the resources in \p resources may not be accessed by CUDA
+ * until they are mapped again.
+ *
+ * This function provides the synchronization guarantee that any CUDA work issued
+ * in \p hStream before ::cuGraphicsUnmapResources() will complete before any
+ * subsequently issued graphics work begins.
+ *
+ *
+ * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned.
+ * If any of \p resources are not presently mapped for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * \param count      - Number of resources to unmap
+ * \param resources  - Resources to unmap
+ * \param hStream    - Stream with which to synchronize
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED,
+ * ::CUDA_ERROR_UNKNOWN
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsMapResources,
+ * ::cudaGraphicsUnmapResources
+ */
+CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+
+/** @} */ /* END CUDA_GRAPHICS */
+
+/**
+ * \defgroup CUDA_DRIVER_ENTRY_POINT Driver Entry Point Access 
+ *
+ * ___MANBRIEF___ driver entry point access functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the driver entry point access functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the requested driver API function pointer
+ *
+ * Returns in \p *pfn the address of the CUDA driver function for the requested
+ * CUDA version and flags.
+ *
+ * The CUDA version is specified as (1000 * major + 10 * minor), so CUDA 11.2
+ * should be specified as 11020. For a requested driver symbol, if the specified
+ * CUDA version is greater than or equal to the CUDA version in which the driver symbol
+ * was introduced, this API will return the function pointer to the corresponding
+ * versioned function.
+ *
+ * The pointer returned by the API should be cast to a function pointer matching the
+ * requested driver function's definition in the API header file. The function pointer
+ * typedef can be picked up from the corresponding typedefs header file. For example,
+ * cudaTypedefs.h consists of function pointer typedefs for driver APIs defined in cuda.h.
+ *
+ * The API will return ::CUDA_SUCCESS and set the returned \p pfn to NULL if the 
+ * requested driver function is not supported on the platform, no ABI 
+ * compatible driver function exists for the specified \p cudaVersion or if the 
+ * driver symbol is invalid.
+ *
+ * It will also set the optional \p symbolStatus to one of the values in
+ * ::CUdriverProcAddressQueryResult with the following meanings:
+ * - ::CU_GET_PROC_ADDRESS_SUCCESS - The requested symbol was successfully found based
+ *   on input arguments and \p pfn is valid
+ * - ::CU_GET_PROC_ADDRESS_SYMBOL_NOT_FOUND - The requested symbol was not found
+ * - ::CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT - The requested symbol was found but is
+ *   not supported by cudaVersion specified
+ *
+ * The requested flags can be:
+ * - ::CU_GET_PROC_ADDRESS_DEFAULT: This is the default mode. This is equivalent to
+ *   ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM if the code is compiled with
+ *   --default-stream per-thread compilation flag or the macro CUDA_API_PER_THREAD_DEFAULT_STREAM
+ *   is defined; ::CU_GET_PROC_ADDRESS_LEGACY_STREAM otherwise.
+ * - ::CU_GET_PROC_ADDRESS_LEGACY_STREAM: This will enable the search for all driver symbols
+ *   that match the requested driver symbol name except the corresponding per-thread versions.
+ * - ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM: This will enable the search for all
+ *   driver symbols that match the requested driver symbol name including the per-thread
+ *   versions. If a per-thread version is not found, the API will return the legacy version
+ *   of the driver function.
+ *
+ * \param symbol - The base name of the driver API function to look for. As an example,
+ *                 for the driver API ::cuMemAlloc_v2, \p symbol would be cuMemAlloc and
+ *                 \p cudaVersion would be the ABI compatible CUDA version for the _v2 variant. 
+ * \param pfn - Location to return the function pointer to the requested driver function
+ * \param cudaVersion - The CUDA version to look for the requested driver symbol 
+ * \param flags -  Flags to specify search options.
+ * \param symbolStatus - Optional location to store the status of the search for
+ *                       \p symbol based on \p cudaVersion. See ::CUdriverProcAddressQueryResult
+ *                       for possible values.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \note_version_mixing
+ *
+ * \sa
+ * ::cudaGetDriverEntryPoint
+ */
+CUresult CUDAAPI cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags, CUdriverProcAddressQueryResult *symbolStatus);
+
+/** @} */ /* END CUDA_DRIVER_ENTRY_POINT */
+
+/**
+ * \defgroup CUDA_COREDUMP Coredump Attributes Control API
+ *
+ * ___MANBRIEF___ coredump attribute control functions for the low-level CUDA API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the coredump attribute control functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * Flags for choosing a coredump attribute to get/set (enumerators are numbered consecutively from 1)
+ */
+typedef enum CUcoredumpSettings_enum {
+    CU_COREDUMP_ENABLE_ON_EXCEPTION = 1, /**< Bool: GPU exceptions from this context create a coredump */
+    CU_COREDUMP_TRIGGER_HOST,            /**< Bool: host CPU also creates a coredump (deprecated as of CUDA 12.5) */
+    CU_COREDUMP_LIGHTWEIGHT,             /**< Bool: coredumps omit GPU memory and non-reloc ELF images (deprecated as of CUDA 12.5) */
+    CU_COREDUMP_ENABLE_USER_TRIGGER,     /**< Bool: a coredump can be created by writing to the pipe named by ::CU_COREDUMP_PIPE */
+    CU_COREDUMP_FILE,                    /**< String: location where coredumps generated by this context are written */
+    CU_COREDUMP_PIPE,                    /**< String: name of the pipe monitored when user-triggered coredumps are enabled */
+    CU_COREDUMP_GENERATION_FLAGS,        /**< Replacement for ::CU_COREDUMP_LIGHTWEIGHT; presumably holds ::CUCoredumpGenerationFlags bits - TODO confirm */
+    CU_COREDUMP_MAX                      /**< Implicit value 8; presumably an upper-bound marker, not a real attribute - TODO confirm */
+} CUcoredumpSettings;
+
+/**
+ * Flags for controlling coredump contents (bit flags; individual values may be OR-ed together)
+ */
+typedef enum CUCoredumpGenerationFlags {
+    CU_COREDUMP_DEFAULT_FLAGS                = 0,        /**< No skip bits set */
+    CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES = (1 << 0), /**< Bit 0 (0x01) */
+    CU_COREDUMP_SKIP_GLOBAL_MEMORY           = (1 << 1), /**< Bit 1 (0x02) */
+    CU_COREDUMP_SKIP_SHARED_MEMORY           = (1 << 2), /**< Bit 2 (0x04) */
+    CU_COREDUMP_SKIP_LOCAL_MEMORY            = (1 << 3), /**< Bit 3 (0x08) */
+    CU_COREDUMP_SKIP_ABORT                   = (1 << 4), /**< Bit 4 (0x10); raise to disable the host abort() */
+    CU_COREDUMP_SKIP_CONSTBANK_MEMORY        = (1 << 5), /**< Bit 5 (0x20) */
+
+    /* Composite mask 0x2F: every SKIP_* bit above except CU_COREDUMP_SKIP_ABORT. */
+    CU_COREDUMP_LIGHTWEIGHT_FLAGS = CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES
+                                     | CU_COREDUMP_SKIP_GLOBAL_MEMORY
+                                     | CU_COREDUMP_SKIP_SHARED_MEMORY
+                                     | CU_COREDUMP_SKIP_LOCAL_MEMORY
+                                     | CU_COREDUMP_SKIP_CONSTBANK_MEMORY
+} CUCoredumpGenerationFlags;
+
+/**
+ * \brief Allows caller to fetch a coredump attribute value for the current context
+ *
+ * Returns in \p *value the requested value specified by \p attrib. It is up to the caller
+ * to ensure that the data type and size of \p *value matches the request.
+ *
+ * If the caller calls this function with \p *value equal to NULL, the size of the memory
+ * region (in bytes) expected for \p attrib will be placed in \p size.
+ *
+ * The supported attributes are:
+ * - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from
+ *      this context will create a coredump at the location specified by ::CU_COREDUMP_FILE.
+ *      The default value is ::false unless set to ::true globally or locally, or the
+ *      CU_CTX_USER_COREDUMP_ENABLE flag was set during context creation.
+ * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
+ *      also create a coredump. The default value is ::true unless set to ::false globally
+ *      or locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT
+ *      flag to disable host device abort() if needed.
+ * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
+ *      will not have a dump of GPU memory or non-reloc ELF images. The default value is
+ *      ::false unless set to ::true globally or locally. This attribute is deprecated as
+ *      of CUDA 12.5, please use ::CU_COREDUMP_GENERATION_FLAGS instead.
+ * - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be
+ *      created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default
+ *      value is ::false unless set to ::true globally or locally.
+ * - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where
+ *      any coredumps generated by this context will be written. The default value is
+ *      ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
+ *      the CUDA applications and ::PID is the process ID of the CUDA application.
+ * - ::CU_COREDUMP_PIPE: String of up to 1023 characters that defines the name of the pipe
+ *      that will be monitored if user-triggered coredumps are enabled. The default value is
+ *      ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
+ *      the CUDA application and ::PID is the process ID of the CUDA application.
+ * - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data
+ *      contained in a coredump specified as a bitwise OR combination of the following values:
+ *      + ::CU_COREDUMP_DEFAULT_FLAGS - if set by itself, coredump generation returns to its
+ *          default settings of including all memory regions that it is able to access
+ *      + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from
+ *          CUDA source modules that are not relocated at runtime.
+ *      + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data
+ *          that does not belong to any context.
+ *      + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory
+ *          for the warp that the dumped kernel belonged to.
+ *      + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel.
+ *      + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting
+ *          the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true.
+ *      + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU
+ *          process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default
+ *          behavior.
+ *
+ * \param attrib - The enum defining which value to fetch.
+ * \param value - void* containing the requested data.
+ * \param size - The size of the memory region \p value points to.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_PERMITTED,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ *
+ * \sa
+ * ::cuCoredumpGetAttributeGlobal,
+ * ::cuCoredumpSetAttribute,
+ * ::cuCoredumpSetAttributeGlobal
+ */
+CUresult CUDAAPI cuCoredumpGetAttribute(CUcoredumpSettings attrib, void* value, size_t *size);
+
+/**
+ * \brief Allows caller to fetch a coredump attribute value for the entire application
+ *
+ * Returns in \p *value the requested value specified by \p attrib. It is up to the caller
+ * to ensure that the data type and size of \p *value matches the request.
+ *
+ * If the caller calls this function with \p *value equal to NULL, the size of the memory
+ * region (in bytes) expected for \p attrib will be placed in \p size.
+ *
+ * The supported attributes are:
+ * - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from
+ *      this context will create a coredump at the location specified by ::CU_COREDUMP_FILE.
+ *      The default value is ::false.
+ * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
+ *      also create a coredump. The default value is ::true unless set to ::false globally
+ *      or locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT
+ *      flag to disable host device abort() if needed.
+ * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
+ *      will not have a dump of GPU memory or non-reloc ELF images. The default value is
+ *      ::false. This attribute is deprecated as of CUDA 12.5, please use ::CU_COREDUMP_GENERATION_FLAGS
+ *      instead.
+ * - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be
+ *      created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default
+ *      value is ::false.
+ * - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where
+ *      any coredumps generated by this context will be written. The default value is
+ *      ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
+ *      the CUDA applications and ::PID is the process ID of the CUDA application.
+ * - ::CU_COREDUMP_PIPE: String of up to 1023 characters that defines the name of the pipe
+ *      that will be monitored if user-triggered coredumps are enabled. The default value is
+ *      ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
+ *      the CUDA application and ::PID is the process ID of the CUDA application.
+ * - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data
+ *      contained in a coredump specified as a bitwise OR combination of the following values:
+ *      + ::CU_COREDUMP_DEFAULT_FLAGS - if set by itself, coredump generation returns to its
+ *          default settings of including all memory regions that it is able to access
+ *      + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from
+ *          CUDA source modules that are not relocated at runtime.
+ *      + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data
+ *          that does not belong to any context.
+ *      + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory
+ *          for the warp that the dumped kernel belonged to.
+ *      + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel.
+ *      + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting
+ *          the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true.
+ *      + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU
+ *          process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default
+ *          behavior.
+ *
+ * \param attrib - The enum defining which value to fetch.
+ * \param value - void* containing the requested data.
+ * \param size - The size of the memory region \p value points to.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuCoredumpGetAttribute,
+ * ::cuCoredumpSetAttribute,
+ * ::cuCoredumpSetAttributeGlobal
+ */
+CUresult CUDAAPI cuCoredumpGetAttributeGlobal(CUcoredumpSettings attrib, void *value, size_t *size);
+
+/**
+ * \brief Allows caller to set a coredump attribute value for the current context
+ *
+ * This function should be considered an alternate interface to the CUDA-GDB environment
+ * variables defined in this document: https://docs.nvidia.com/cuda/cuda-gdb/index.html#gpu-coredump
+ *
+ * An important design decision to note is that any coredump environment variable values
+ * set before CUDA initializes will take permanent precedence over any values set with this
+ * function. This decision was made to ensure no change in behavior for any users that
+ * may be currently using these variables to get coredumps.
+ *
+ * \p *value shall contain the requested value specified by \p attrib. It is up to the caller
+ * to ensure that the data type and size of \p *value matches the request.
+ *
+ * If the caller calls this function with \p *value equal to NULL, the size of the memory
+ * region (in bytes) expected for \p attrib will be placed in \p size.
+ *
+ * \note This function will return ::CUDA_ERROR_NOT_SUPPORTED if the caller attempts to set
+ * ::CU_COREDUMP_ENABLE_ON_EXCEPTION on a GPU with Compute Capability < 6.0. ::cuCoredumpSetAttributeGlobal
+ * works on those platforms as an alternative.
+ *
+ * \note ::CU_COREDUMP_ENABLE_USER_TRIGGER and ::CU_COREDUMP_PIPE cannot be set on a per-context basis.
+ *
+ * The supported attributes are:
+ * - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from
+ *      this context will create a coredump at the location specified by ::CU_COREDUMP_FILE.
+ *      The default value is ::false.
+ * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
+ *      also create a coredump. The default value is ::true unless set to ::false globally
+ *      or locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT
+ *      flag to disable host device abort() if needed.
+ * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
+ *      will not have a dump of GPU memory or non-reloc ELF images. The default value is
+ *      ::false. This attribute is deprecated as of CUDA 12.5, please use ::CU_COREDUMP_GENERATION_FLAGS
+ *      instead.
+ * - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where
+ *      any coredumps generated by this context will be written. The default value is
+ *      ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
+ *      the CUDA applications and ::PID is the process ID of the CUDA application.
+ * - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data
+ *      contained in a coredump specified as a bitwise OR combination of the following values:
+ *      + ::CU_COREDUMP_DEFAULT_FLAGS - if set by itself, coredump generation returns to its
+ *          default settings of including all memory regions that it is able to access
+ *      + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from
+ *          CUDA source modules that are not relocated at runtime.
+ *      + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data
+ *          that does not belong to any context.
+ *      + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory
+ *          for the warp that the dumped kernel belonged to.
+ *      + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel.
+ *      + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting
+ *          the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true.
+ *      + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU
+ *          process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default
+ *          behavior.
+ *
+ * \param attrib - The enum defining which value to set.
+ * \param value - void* containing the requested data.
+ * \param size - The size of the memory region \p value points to.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_PERMITTED,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \sa
+ * ::cuCoredumpGetAttributeGlobal,
+ * ::cuCoredumpGetAttribute,
+ * ::cuCoredumpSetAttributeGlobal
+ */
+CUresult CUDAAPI cuCoredumpSetAttribute(CUcoredumpSettings attrib, void* value, size_t *size);
+
+/**
+ * \brief Allows caller to set a coredump attribute value globally
+ *
+ * This function should be considered an alternate interface to the CUDA-GDB environment
+ * variables defined in this document: https://docs.nvidia.com/cuda/cuda-gdb/index.html#gpu-coredump
+ *
+ * An important design decision to note is that any coredump environment variable values
+ * set before CUDA initializes will take permanent precedence over any values set with this
+ * function. This decision was made to ensure no change in behavior for any users that
+ * may be currently using these variables to get coredumps.
+ *
+ * \p *value shall contain the requested value specified by \p attrib. It is up to the caller
+ * to ensure that the data type and size of \p *value matches the request.
+ *
+ * If the caller calls this function with \p *value equal to NULL, the size of the memory
+ * region (in bytes) expected for \p attrib will be placed in \p size.
+ *
+ * The supported attributes are:
+ * - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from
+ *      this context will create a coredump at the location specified by ::CU_COREDUMP_FILE.
+ *      The default value is ::false.
+ * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
+ *      also create a coredump. The default value is ::true unless set to ::false globally
+ *      or locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT
+ *      flag to disable host device abort() if needed.
+ * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
+ *      will not have a dump of GPU memory or non-reloc ELF images. The default value is
+ *      ::false. This attribute is deprecated as of CUDA 12.5, please use ::CU_COREDUMP_GENERATION_FLAGS
+ *      instead.
+ * - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be
+ *      created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default
+ *      value is ::false.
+ * - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where
+ *      any coredumps generated by this context will be written. The default value is
+ *      ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
+ *      the CUDA applications and ::PID is the process ID of the CUDA application.
+ * - ::CU_COREDUMP_PIPE: String of up to 1023 characters that defines the name of the pipe
+ *      that will be monitored if user-triggered coredumps are enabled. This value may not be
+ *      changed after ::CU_COREDUMP_ENABLE_USER_TRIGGER is set to ::true. The default
+ *      value is ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine
+ *      running the CUDA application and ::PID is the process ID of the CUDA application.
+ * - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data
+ *      contained in a coredump specified as a bitwise OR combination of the following values:
+ *      + ::CU_COREDUMP_DEFAULT_FLAGS - if set by itself, coredump generation returns to its
+ *          default settings of including all memory regions that it is able to access
+ *      + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from
+ *          CUDA source modules that are not relocated at runtime.
+ *      + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data
+ *          that does not belong to any context.
+ *      + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory
+ *          for the warp that the dumped kernel belonged to.
+ *      + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel.
+ *      + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting
+ *          the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true.
+ *      + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU
+ *          process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default
+ *          behavior.
+ *
+ * \param attrib - The enum defining which value to set.
+ * \param value - void* containing the requested data.
+ * \param size - The size of the memory region \p value points to.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_PERMITTED
+ *
+ * \sa
+ * ::cuCoredumpGetAttribute,
+ * ::cuCoredumpGetAttributeGlobal,
+ * ::cuCoredumpSetAttribute
+ */
+CUresult CUDAAPI cuCoredumpSetAttributeGlobal(CUcoredumpSettings attrib, void *value, size_t *size);
+
+/** @} */ /* END CUDA_COREDUMP */
+
+CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId);
+
+/*
+** ******************* GREEN CONTEXTS **********************
+*/
+
+/**
+ * \defgroup CUDA_GREEN_CONTEXTS Green Contexts
+ *
+ * ___MANBRIEF___ Driver level API for creation and manipulation of green contexts
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the APIs for creation and manipulation of green contexts in the CUDA
+ * driver. Green contexts are a lightweight alternative to traditional contexts, with the ability
+ * to pass in a set of resources that they should be initialized with. This allows the developer to
+ * represent distinct spatial partitions of the GPU, provision resources for them, and target them
+ * via the same programming model that CUDA exposes (streams, kernel launches, etc.).
+ *
+ * There are 4 main steps to using this new set of APIs.
+ * - (1) Start with an initial set of resources, for example via ::cuDeviceGetDevResource. Only SM type is supported today.
+ * - (2) Partition this set of resources by providing them as input to a partition API, for example: ::cuDevSmResourceSplitByCount.
+ * - (3) Finalize the specification of resources by creating a descriptor via ::cuDevResourceGenerateDesc.
+ * - (4) Provision the resources and create a green context via ::cuGreenCtxCreate.
+ *
+ * For \p CU_DEV_RESOURCE_TYPE_SM, the partitions created have minimum SM count requirements, often rounding up and aligning the
+ * minCount provided to ::cuDevSmResourceSplitByCount. The following is a guideline for each architecture
+ * and may be subject to change:
+ * - On Compute Architecture 6.X: The minimum count is 1 SM.
+ * - On Compute Architecture 7.X: The minimum count is 2 SMs and must be a multiple of 2.
+ * - On Compute Architecture 8.X: The minimum count is 4 SMs and must be a multiple of 2.
+ * - On Compute Architecture 9.0+: The minimum count is 8 SMs and must be a multiple of 8.
+ *
+ * In the future, flags can be provided to tradeoff functional and performance characteristics versus finer grained SM partitions.
+ *
+ * Even if the green contexts have disjoint SM partitions, it is not guaranteed that the kernels launched
+ * in them will run concurrently or have forward progress guarantees. This is due to other resources (like HW connections,
+ * see ::CUDA_DEVICE_MAX_CONNECTIONS) that could cause a dependency. Additionally, in certain scenarios,
+ * it is possible for the workload to run on more SMs than was provisioned (but never less).
+ * The following are two scenarios which can exhibit this behavior:
+ * - On Volta+ MPS: When \p CUDA_MPS_ACTIVE_THREAD_PERCENTAGE is used,
+ * the set of SMs that are used for running kernels can be scaled up to the value of SMs used for the MPS client.
+ * - On Compute Architecture 9.x: When a module with dynamic parallelism (CDP) is loaded, all future
+ * kernels running under green contexts may use and share an additional set of 2 SMs.
+ *
+ * @{
+ */
+
+/*!
+ * \typedef struct CUdevResourceDesc_st* CUdevResourceDesc;
+ * An opaque descriptor handle. The descriptor encapsulates multiple created and configured resources.
+ * Created via ::cuDevResourceGenerateDesc
+ */
+typedef struct CUdevResourceDesc_st *CUdevResourceDesc;
+
+typedef enum {
+    CU_GREEN_CTX_DEFAULT_STREAM = 0x1, /**< Required. Creates a default stream to use inside the green context */
+} CUgreenCtxCreate_flags;
+
+typedef enum {
+    CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING = 0x1, /**< Lower the minimum SM count and alignment, and treat each SM independent of its hierarchy */
+    CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE = 0x2, /**< Compute Capability 9.0+ only: attempt to create groups that may allow for maximally sized thread clusters */
+} CUdevSmResourceSplit_flags;
+
+#define RESOURCE_ABI_VERSION 1 /* suffix pasted onto CUdevResource_v to form the versioned struct type name */
+#define RESOURCE_ABI_EXTERNAL_BYTES 48 /* byte size of the _oversize member in the CUdevResource union */
+
+#define _CONCAT_INNER(x, y) x ## y /* raw token paste of x and y */
+#define _CONCAT_OUTER(x, y) _CONCAT_INNER(x, y) /* extra indirection so macro arguments are expanded before pasting */
+
+/*!
+ * \typedef enum CUdevResourceType
+ * Type of resource
+ */
+typedef enum {
+    CU_DEV_RESOURCE_TYPE_INVALID = 0, /**< Resource is not valid and cannot be further accessed */
+    CU_DEV_RESOURCE_TYPE_SM = 1, /**< Streaming multiprocessors related information */
+#if defined(__CUDA_API_VERSION_INTERNAL) && !defined(__CUDA_API_VERSION_INTERNAL_ODR)
+    CU_DEV_RESOURCE_TYPE_MAX, /* only declared when the preprocessor guard above holds */
+#endif
+} CUdevResourceType;
+
+/*!
+ * \struct CUdevSmResource
+ * Data for SM-related resources
+ */
+typedef struct CUdevSmResource_st {
+    unsigned int smCount; /**< The amount of streaming multiprocessors available in this resource. This is an output parameter only, do not write to this field. */
+} CUdevSmResource;
+
+/*!
+ * \struct CUdevResource
+ * A tagged union describing different resources identified by the type field. This structure should not be directly modified outside of the API that created it.
+ * \code
+ * struct {
+ *     CUdevResourceType type;
+ *     union {
+ *         CUdevSmResource sm;
+ *     };
+ * };
+ * \endcode
+ * - If \p type is \p CU_DEV_RESOURCE_TYPE_INVALID, this resource is not valid and cannot be further accessed.
+ * - If \p type is \p CU_DEV_RESOURCE_TYPE_SM, the ::CUdevSmResource structure \p sm is filled in. For example,
+ * \p sm.smCount will reflect the amount of streaming multiprocessors available in this resource.
+ */
+typedef struct CUdevResource_st {
+    CUdevResourceType type; /**< Type of resource, dictates which union field was last set */
+    unsigned char _internal_padding[92]; /* NOTE(review): purpose not documented in this header; presumably reserved for the driver - confirm */
+    union {
+        CUdevSmResource sm; /**< Resource corresponding to \p CU_DEV_RESOURCE_TYPE_SM type. */
+        unsigned char _oversize[RESOURCE_ABI_EXTERNAL_BYTES]; /* keeps the union at least RESOURCE_ABI_EXTERNAL_BYTES bytes large */
+    };
+} _CONCAT_OUTER(CUdevResource_v, RESOURCE_ABI_VERSION); /* type name is CUdevResource_v followed by RESOURCE_ABI_VERSION */
+typedef _CONCAT_OUTER(CUdevResource_v, RESOURCE_ABI_VERSION) CUdevResource;
+
+#undef _CONCAT_INNER
+#undef _CONCAT_OUTER
+
+#undef ABI_PER_RESOURCE_EXTERNAL_BYTES /* NOTE(review): no macro of this name is defined in the visible part of this header; RESOURCE_ABI_EXTERNAL_BYTES is not undefined here - confirm intent */
+#undef ABI_RESOURCE_VERSION /* NOTE(review): no macro of this name is defined in the visible part of this header; RESOURCE_ABI_VERSION is not undefined here - confirm intent */
+
+/**
+ * \brief Creates a green context with a specified set of resources.
+ *
+ * This API creates a green context with the resources specified in the descriptor \p desc and
+ * returns it in the handle represented by \p phCtx. This API will retain the primary context on device \p dev,
+ * which is released when the green context is destroyed. It is advised to have the primary context active
+ * before calling this API to avoid the heavy cost of triggering primary context initialization and
+ * deinitialization multiple times.
+ *
+ * The API does not set the green context current. In order to set it current, you need to explicitly set it current
+ * by first converting the green context to a CUcontext using ::cuCtxFromGreenCtx and subsequently calling
+ * ::cuCtxSetCurrent / ::cuCtxPushCurrent. It should be noted that a green context can be current to only one
+ * thread at a time. There is no internal synchronization to make API calls accessing the same green context
+ * from multiple threads work.
+ *
+ * Note: The API is not supported on 32-bit platforms.
+ *
+ * \param phCtx - Pointer for the output handle to the green context
+ * \param desc - Descriptor generated via ::cuDevResourceGenerateDesc which contains the set of resources to be used
+ * \param dev - Device on which to create the green context.
+ * \param flags - One of the supported green context creation flags. \p CU_GREEN_CTX_DEFAULT_STREAM is required.
+ *
+ * The supported flags are:
+ * - \p CU_GREEN_CTX_DEFAULT_STREAM : Creates a default stream to use inside the green context. Required.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa
+ * ::cuGreenCtxDestroy,
+ * ::cuCtxFromGreenCtx,
+ * ::cuCtxSetCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuDevResourceGenerateDesc,
+ * ::cuDevicePrimaryCtxRetain,
+ * ::cuCtxCreate,
+ * ::cuCtxCreate_v3
+ */
+CUresult CUDAAPI cuGreenCtxCreate(CUgreenCtx* phCtx, CUdevResourceDesc desc, CUdevice dev, unsigned int flags);
+
+/**
+ * \brief Destroys a green context
+ *
+ * Destroys the green context, releasing the primary context of the device that this green context was created for.
+ * Any resources provisioned for this green context (that were initially available via the resource descriptor)
+ * are released as well.
+ * \param hCtx - Green context to be destroyed
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ *
+ * \sa
+ * ::cuGreenCtxCreate,
+ * ::cuCtxDestroy
+ */
+CUresult CUDAAPI cuGreenCtxDestroy(CUgreenCtx hCtx);
+
+/**
+ * \brief Converts a green context into the primary context
+ *
+ * The API converts a green context into the primary context returned in \p pContext. It is important
+ * to note that the converted context \p pContext is a normal primary context but with
+ * the resources of the specified green context \p hCtx. Once converted, it can then
+ * be used to set the context current with ::cuCtxSetCurrent or with any of the CUDA APIs
+ * that accept a CUcontext parameter.
+ *
+ * Users are expected to call this API before calling any CUDA APIs that accept a
+ * CUcontext. Failing to do so will result in the APIs returning ::CUDA_ERROR_INVALID_CONTEXT.
+ *
+ * \param pContext Returned primary context with green context resources
+ * \param hCtx Green context to convert
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuGreenCtxCreate
+ */
+CUresult CUDAAPI cuCtxFromGreenCtx(CUcontext *pContext, CUgreenCtx hCtx);
+
+/**
+ * \brief Get device resources
+ *
+ * Get the \p type resources available to the \p device.
+ * This may often be the starting point for further partitioning or configuring of resources.
+ *
+ * Note: The API is not supported on 32-bit platforms.
+ *
+ * \param device - Device to get resource for
+ * \param resource - Output pointer to a CUdevResource structure
+ * \param type - Type of resource to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_RESOURCE_TYPE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ *
+ * \sa
+ * ::cuDevResourceGenerateDesc
+ */
+CUresult CUDAAPI cuDeviceGetDevResource(CUdevice device, CUdevResource* resource, CUdevResourceType type);
+
+/**
+ * \brief Get context resources
+ *
+ * Get the \p type resources available to the context represented by \p hCtx
+ * \param hCtx - Context to get resource for
+ *
+ * Note: The API is not supported on 32-bit platforms.
+ *
+ * \param resource - Output pointer to a CUdevResource structure
+ * \param type - Type of resource to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_RESOURCE_TYPE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ *
+ * \sa
+ * ::cuDevResourceGenerateDesc
+ */
+CUresult CUDAAPI cuCtxGetDevResource(CUcontext hCtx, CUdevResource* resource, CUdevResourceType type);
+
+/**
+ * \brief Get green context resources
+ *
+ * Get the \p type resources available to the green context represented by \p hCtx
+ * \param hCtx - Green context to get resource for
+ * \param resource - Output pointer to a CUdevResource structure
+ * \param type - Type of resource to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_RESOURCE_TYPE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuDevResourceGenerateDesc
+ */
+CUresult CUDAAPI cuGreenCtxGetDevResource(CUgreenCtx hCtx, CUdevResource* resource, CUdevResourceType type);
+
+/**
+ * \brief Splits \p CU_DEV_RESOURCE_TYPE_SM resources.
+ *
+ * Splits \p CU_DEV_RESOURCE_TYPE_SM resources into \p nbGroups, adhering to the minimum SM count specified in \p minCount
+ * and the usage flags in \p useFlags. If \p result is NULL, the API simulates a split and provides the amount of groups that
+ * would be created in \p nbGroups. Otherwise, \p nbGroups must point to the amount of elements in \p result and on return,
+ * the API will overwrite \p nbGroups with the amount actually created. The groups are written to the array in \p result.
+ * \p nbGroups can be less than the total amount if a smaller number of groups is needed.
+ *
+ * This API is used to spatially partition the input resource. The input resource needs to come from one of
+ * ::cuDeviceGetDevResource, ::cuCtxGetDevResource, or ::cuGreenCtxGetDevResource.
+ * A limitation of the API is that the output results cannot be split again without
+ * first creating a descriptor and a green context with that descriptor.
+ *
+ * When creating the groups, the API will take into account the performance and functional characteristics of the
+ * input resource, and guarantee a split that will create a disjoint set of symmetrical partitions. This may lead to fewer groups created
+ * than purely dividing the total SM count by the \p minCount due to cluster requirements or
+ * alignment and granularity requirements for the minCount.
+ *
+ * The \p remaining set does not have the same functional or performance guarantees as the groups in \p result.
+ * Its use should be carefully planned and future partitions of the \p remaining set are discouraged.
+ *
+ * The following flags are supported:
+ * - \p CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING : Lower the minimum SM count and alignment, and treat each SM independent of its hierarchy.
+ *  This allows more fine grained partitions but at the cost of advanced features (such as large clusters on compute capability 9.0+).
+ * - \p CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE : Compute Capability 9.0+ only. Attempt to create groups that may allow
+ *  for maximally sized thread clusters. This can be queried post green context creation using ::cuOccupancyMaxPotentialClusterSize.
+ *
+ * A successful API call must either have:
+ * - A valid array of \p result pointers of size passed in \p nbGroups, with \p input of type \p CU_DEV_RESOURCE_TYPE_SM.
+ * Value of \p minCount must be between 0 and the SM count specified in \p input. \p remaining may be NULL.
+ * - NULL passed in for \p result, with a valid integer pointer in \p nbGroups and \p input of type \p CU_DEV_RESOURCE_TYPE_SM.
+ * Value of \p minCount must be between 0 and the SM count specified in \p input. \p remaining may be NULL.
+ * This queries the number of groups that would be created by the API.
+ *
+ * Note: The API is not supported on 32-bit platforms.
+ *
+ * \param result - Output array of \p CUdevResource resources. Can be NULL to query the number of groups.
+ * \param nbGroups - This is a pointer, specifying the number of groups that would be or should be created as described below.
+ * \param input - Input SM resource to be split. Must be a valid \p CU_DEV_RESOURCE_TYPE_SM resource.
+ * \param remaining - If the input resource cannot be cleanly split among \p nbGroups, the remainder is placed here.
+ * Can be omitted (NULL) if the user does not need the remaining set.
+ * \param useFlags - Flags specifying how these partitions are used or which constraints to abide by when splitting the input. Zero is valid for default behavior.
+ * \param minCount - Minimum number of SMs required
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_RESOURCE_TYPE,
+ * ::CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION
+ *
+ * \sa
+ * ::cuGreenCtxGetDevResource,
+ * ::cuCtxGetDevResource,
+ * ::cuDeviceGetDevResource
+ */
+CUresult CUDAAPI cuDevSmResourceSplitByCount(
+    CUdevResource* result, unsigned int* nbGroups, const CUdevResource* input, CUdevResource* remaining, unsigned int useFlags, unsigned int minCount);
+
+/**
+ * \brief Generate a resource descriptor
+ *
+ * Generates a single resource descriptor with the set of resources specified in \p resources.
+ * The generated resource descriptor is necessary for the creation of green contexts via the ::cuGreenCtxCreate API.
+ * Resources of the same type can be passed in, provided they meet the requirements as noted below.
+ *
+ * A successful API call must have:
+ * - A valid output pointer for the \p phDesc descriptor as well as a valid array of \p resources pointers,
+ * with the array size passed in \p nbResources.
+ * If multiple resources are provided in \p resources, the device they came from must be the same,
+ * otherwise ::CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION is returned.
+ * If multiple resources are provided in \p resources and they are of type ::CU_DEV_RESOURCE_TYPE_SM,
+ * they must be outputs (the \p result or \p remaining of ::cuDevSmResourceSplitByCount) from the same split API instance,
+ * otherwise ::CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION is returned.
+ *
+ * Note: The API is not supported on 32-bit platforms.
+ *
+ * \param phDesc - Output descriptor
+ * \param resources - Array of resources to be included in the descriptor
+ * \param nbResources - Number of resources passed in \p resources
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_RESOURCE_TYPE,
+ * ::CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION
+ *
+ * \sa
+ * ::cuDevSmResourceSplitByCount
+ */
+CUresult CUDAAPI cuDevResourceGenerateDesc(CUdevResourceDesc *phDesc, CUdevResource *resources, unsigned int nbResources);
+
+/**
+ * \brief Records an event.
+ *
+ * Captures in \p hEvent all the activities of the green context \p hCtx
+ * at the time of this call. \p hEvent and \p hCtx must be from the same
+ * primary context, otherwise ::CUDA_ERROR_INVALID_HANDLE is returned.
+ * Calls such as ::cuEventQuery() or ::cuGreenCtxWaitEvent() will
+ * then examine or wait for completion of the work that was captured. Uses of
+ * \p hCtx after this call do not modify \p hEvent.
+ *
+ * \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED if the
+ * specified green context \p hCtx has a stream in the capture mode. In such
+ * a case, the call will invalidate all the conflicting captures.
+ *
+ * \param hCtx    - Green context to record event for
+ * \param hEvent  - Event to record
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
+ *
+ * \sa
+ * ::cuGreenCtxWaitEvent,
+ * ::cuEventRecord,
+ * ::cuCtxRecordEvent,
+ * ::cuCtxWaitEvent
+ */
+CUresult CUDAAPI cuGreenCtxRecordEvent(CUgreenCtx hCtx, CUevent hEvent);
+
+/**
+ * \brief Make a green context wait on an event
+ *
+ * Makes all future work submitted to green context \p hCtx wait for all work
+ * captured in \p hEvent. The synchronization will be performed on the device
+ * and will not block the calling CPU thread. See ::cuGreenCtxRecordEvent()
+ * or ::cuEventRecord(), for details on what is captured by an event.
+ *
+ * \note \p hEvent may be from a different context or device than \p hCtx.
+ *
+ * \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED and
+ * invalidate the capture if the specified event \p hEvent is part of an
+ * ongoing capture sequence or if the specified green context \p hCtx has
+ * a stream in the capture mode.
+ *
+ * \param hCtx    - Green context to wait
+ * \param hEvent  - Event to wait on
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
+ *
+ * \sa
+ * ::cuGreenCtxRecordEvent,
+ * ::cuStreamWaitEvent,
+ * ::cuCtxRecordEvent,
+ * ::cuCtxWaitEvent
+ */
+CUresult CUDAAPI cuGreenCtxWaitEvent(CUgreenCtx hCtx, CUevent hEvent);
+
+/**
+ * \brief Query the green context associated with a stream
+ *
+ * Returns the CUDA green context that the stream is associated with, or NULL if the stream
+ * is not associated with any green context.
+ *
+ * The stream handle \p hStream can refer to any of the following:
+ * <ul>
+ *   <li>
+ *   a stream created via any of the CUDA driver APIs such as ::cuStreamCreate, ::cuStreamCreateWithPriority
+ *   and ::cuGreenCtxStreamCreate, or their runtime API equivalents such as
+ *   ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
+ *   If during stream creation the context that was active in the calling thread was obtained
+ *   with ::cuCtxFromGreenCtx, that green context is returned in \p phCtx.
+ *   Otherwise, \p *phCtx is set to NULL instead.
+ *   </li>
+ *   <li>
+ *   a special stream such as the NULL stream or ::CU_STREAM_LEGACY.
+ *   In that case, if the context that is active in the calling thread was obtained
+ *   with ::cuCtxFromGreenCtx, that green context is returned in \p phCtx.
+ *   Otherwise, \p *phCtx is set to NULL instead.
+ *   </li>
+ * </ul>
+ * Passing an invalid handle will result in undefined behavior.
+ *
+ * \param hStream - Handle to the stream to be queried
+ * \param phCtx   - Returned green context associated with the stream
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuStreamCreate,
+ * ::cuStreamCreateWithPriority,
+ * ::cuStreamGetCtx_v2,
+ * ::cuGreenCtxStreamCreate,
+ * ::cuStreamGetPriority,
+ * ::cuStreamGetFlags,
+ * ::cuStreamGetDevice,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamCreate,
+ * ::cudaStreamCreateWithFlags
+ */
+CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
+
+/**
+ * \brief Create a stream for use in the green context
+ *
+ * Creates a stream for use in the specified green context \p greenCtx and returns a handle in \p phStream.
+ * The stream can be destroyed by calling ::cuStreamDestroy(). Note that the API ignores the context that
+ * is current to the calling thread and creates a stream in the specified green context \p greenCtx.
+ *
+ * The supported values for \p flags are:
+ * - ::CU_STREAM_NON_BLOCKING: This must be specified. It indicates that work running in the created
+ *   stream may run concurrently with work in the default stream, and that
+ *   the created stream should perform no implicit synchronization with the default stream.
+ *
+ * Specifying \p priority affects the scheduling priority of work in the stream. Priorities provide a
+ * hint to preferentially run work with higher priority when possible, but do not preempt
+ * already-running work or provide any other functional guarantee on execution order.
+ * \p priority follows a convention where lower numbers represent higher priorities.
+ * '0' represents default priority. The range of meaningful numerical priorities can
+ * be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is
+ * outside the numerical range returned by ::cuCtxGetStreamPriorityRange,
+ * it will automatically be clamped to the lowest or the highest number in the range.
+ *
+ * \param phStream - Returned newly created stream
+ * \param greenCtx - Green context in which to create the stream
+ * \param flags    - Flags for stream creation. \p CU_STREAM_NON_BLOCKING must be specified.
+ * \param priority - Stream priority. Lower numbers represent higher priorities.
+ *                   See ::cuCtxGetStreamPriorityRange for more information about
+ *                   meaningful stream priorities that can be passed.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \note In the current implementation, only compute kernels launched in
+ * priority streams are affected by the stream's priority. Stream priorities have
+ * no effect on host-to-device and device-to-host memory operations.
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuGreenCtxCreate,
+ * ::cuStreamCreate,
+ * ::cuStreamGetPriority,
+ * ::cuCtxGetStreamPriorityRange,
+ * ::cuStreamGetFlags,
+ * ::cuStreamGetDevice,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamCreateWithPriority
+ */
+CUresult CUDAAPI cuGreenCtxStreamCreate(CUstream* phStream, CUgreenCtx greenCtx, unsigned int flags, int priority);
+
+/** @} */
+
+/*
+** *************** END CUDA_GREEN_CONTEXTS *****************
+*/
+
+/**
+ * CUDA API versioning support
+ */
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    #undef cuMemHostRegister
+    #undef cuGraphicsResourceSetMapFlags
+    #undef cuLinkCreate
+    #undef cuLinkAddData
+    #undef cuLinkAddFile
+    #undef cuDeviceTotalMem
+    #undef cuCtxCreate
+    #undef cuModuleGetGlobal
+    #undef cuMemGetInfo
+    #undef cuMemAlloc
+    #undef cuMemAllocPitch
+    #undef cuMemFree
+    #undef cuMemGetAddressRange
+    #undef cuMemAllocHost
+    #undef cuMemHostGetDevicePointer
+    #undef cuMemcpyHtoD
+    #undef cuMemcpyDtoH
+    #undef cuMemcpyDtoD
+    #undef cuMemcpyDtoA
+    #undef cuMemcpyAtoD
+    #undef cuMemcpyHtoA
+    #undef cuMemcpyAtoH
+    #undef cuMemcpyAtoA
+    #undef cuMemcpyHtoAAsync
+    #undef cuMemcpyAtoHAsync
+    #undef cuMemcpy2D
+    #undef cuMemcpy2DUnaligned
+    #undef cuMemcpy3D
+    #undef cuMemcpyHtoDAsync
+    #undef cuMemcpyDtoHAsync
+    #undef cuMemcpyDtoDAsync
+    #undef cuMemcpy2DAsync
+    #undef cuMemcpy3DAsync
+    #undef cuMemcpyBatchAsync
+    #undef cuMemcpy3DBatchAsync
+    #undef cuMemsetD8
+    #undef cuMemsetD16
+    #undef cuMemsetD32
+    #undef cuMemsetD2D8
+    #undef cuMemsetD2D16
+    #undef cuMemsetD2D32
+    #undef cuArrayCreate
+    #undef cuArrayGetDescriptor
+    #undef cuArray3DCreate
+    #undef cuArray3DGetDescriptor
+    #undef cuTexRefSetAddress
+    #undef cuTexRefSetAddress2D
+    #undef cuTexRefGetAddress
+    #undef cuGraphicsResourceGetMappedPointer
+    #undef cuCtxDestroy
+    #undef cuCtxPopCurrent
+    #undef cuCtxPushCurrent
+    #undef cuStreamDestroy
+    #undef cuEventDestroy
+    #undef cuMemcpy
+    #undef cuMemcpyAsync
+    #undef cuMemcpyPeer
+    #undef cuMemcpyPeerAsync
+    #undef cuMemcpy3DPeer
+    #undef cuMemcpy3DPeerAsync
+    #undef cuMemsetD8Async
+    #undef cuMemsetD16Async
+    #undef cuMemsetD32Async
+    #undef cuMemsetD2D8Async
+    #undef cuMemsetD2D16Async
+    #undef cuMemsetD2D32Async
+    #undef cuStreamGetPriority
+    #undef cuStreamGetId
+    #undef cuStreamGetFlags
+    #undef cuStreamGetDevice
+    #undef cuStreamGetCtx
+    #undef cuStreamWaitEvent
+    #undef cuStreamAddCallback
+    #undef cuStreamAttachMemAsync
+    #undef cuStreamQuery
+    #undef cuStreamSynchronize
+    #undef cuEventRecord
+    #undef cuEventRecordWithFlags
+    #undef cuLaunchKernel
+    #undef cuLaunchKernelEx
+    #undef cuLaunchHostFunc
+    #undef cuGraphicsMapResources
+    #undef cuGraphicsUnmapResources
+    #undef cuStreamWriteValue32
+    #undef cuStreamWaitValue32
+    #undef cuStreamWriteValue64
+    #undef cuStreamWaitValue64
+    #undef cuStreamBatchMemOp
+    #undef cuStreamWriteValue32_v2
+    #undef cuStreamWaitValue32_v2
+    #undef cuStreamWriteValue64_v2
+    #undef cuStreamWaitValue64_v2
+    #undef cuStreamBatchMemOp_v2
+    #undef cuMemPrefetchAsync
+    #undef cuMemPrefetchAsync_v2
+    #undef cuLaunchCooperativeKernel
+    #undef cuSignalExternalSemaphoresAsync
+    #undef cuWaitExternalSemaphoresAsync
+    #undef cuStreamBeginCapture
+    #undef cuStreamBeginCaptureToGraph
+    #undef cuStreamEndCapture
+    #undef cuStreamIsCapturing
+    #undef cuStreamGetCaptureInfo
+    #undef cuStreamGetCaptureInfo_v2
+    #undef cuStreamGetCaptureInfo_v3
+    #undef cuGraphInstantiateWithParams
+    #undef cuGraphExecUpdate
+    #undef cuGraphUpload
+    #undef cuGraphLaunch
+    #undef cuDevicePrimaryCtxRelease
+    #undef cuDevicePrimaryCtxReset
+    #undef cuDevicePrimaryCtxSetFlags
+    #undef cuIpcOpenMemHandle
+    #undef cuStreamCopyAttributes
+    #undef cuStreamSetAttribute
+    #undef cuStreamGetAttribute
+    #undef cuGraphInstantiate
+    #undef cuGraphAddKernelNode
+    #undef cuGraphKernelNodeGetParams
+    #undef cuGraphKernelNodeSetParams
+    #undef cuGraphExecKernelNodeSetParams
+    #undef cuMemMapArrayAsync
+    #undef cuMemFreeAsync 
+    #undef cuMemAllocAsync 
+    #undef cuMemAllocFromPoolAsync 
+    #undef cuStreamUpdateCaptureDependencies
+    #undef cuStreamUpdateCaptureDependencies_v2
+    #undef cuGetProcAddress
+    #undef cuStreamGetCtx_v2
+    #undef cuMemBatchDecompressAsync
+
+    CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
+    CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
+    CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
+    CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name,
+        unsigned int numOptions, CUjit_option *options, void **optionValues);
+    CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path,
+        unsigned int numOptions, CUjit_option *options, void **optionValues);
+    CUresult CUDAAPI cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
+
+    typedef unsigned int CUdeviceptr_v1; /**< Device pointer type (an unsigned int) used by the _v1 structures and the unsuffixed prototypes declared in this section */
+
+    typedef struct CUDA_MEMCPY2D_v1_st
+    {
+        unsigned int srcXInBytes;   /**< Source X in bytes */
+        unsigned int srcY;          /**< Source Y */
+        CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+        const void *srcHost;        /**< Source host pointer */
+        CUdeviceptr_v1 srcDevice;   /**< Source device pointer */
+        CUarray srcArray;           /**< Source array reference */
+        unsigned int srcPitch;      /**< Source pitch (ignored when src is array) */
+
+        unsigned int dstXInBytes;   /**< Destination X in bytes */
+        unsigned int dstY;          /**< Destination Y */
+        CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+        void *dstHost;              /**< Destination host pointer */
+        CUdeviceptr_v1 dstDevice;   /**< Destination device pointer */
+        CUarray dstArray;           /**< Destination array reference */
+        unsigned int dstPitch;      /**< Destination pitch (ignored when dst is array) */
+
+        unsigned int WidthInBytes;  /**< Width of 2D memory copy in bytes */
+        unsigned int Height;        /**< Height of 2D memory copy */
+    } CUDA_MEMCPY2D_v1; /**< 2D memory copy parameters with unsigned int offsets, pitches and extents and ::CUdeviceptr_v1 device pointers; taken by the cuMemcpy2D, cuMemcpy2DUnaligned and cuMemcpy2DAsync prototypes in this section */
+
+    typedef struct CUDA_MEMCPY3D_v1_st
+    {
+        unsigned int srcXInBytes;   /**< Source X in bytes */
+        unsigned int srcY;          /**< Source Y */
+        unsigned int srcZ;          /**< Source Z */
+        unsigned int srcLOD;        /**< Source LOD */
+        CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+        const void *srcHost;        /**< Source host pointer */
+        CUdeviceptr_v1 srcDevice;   /**< Source device pointer */
+        CUarray srcArray;           /**< Source array reference */
+        void *reserved0;            /**< Must be NULL */
+        unsigned int srcPitch;      /**< Source pitch (ignored when src is array) */
+        unsigned int srcHeight;     /**< Source height (ignored when src is array; may be 0 if Depth==1) */
+
+        unsigned int dstXInBytes;   /**< Destination X in bytes */
+        unsigned int dstY;          /**< Destination Y */
+        unsigned int dstZ;          /**< Destination Z */
+        unsigned int dstLOD;        /**< Destination LOD */
+        CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+        void *dstHost;              /**< Destination host pointer */
+        CUdeviceptr_v1 dstDevice;   /**< Destination device pointer */
+        CUarray dstArray;           /**< Destination array reference */
+        void *reserved1;            /**< Must be NULL */
+        unsigned int dstPitch;      /**< Destination pitch (ignored when dst is array) */
+        unsigned int dstHeight;     /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
+
+        unsigned int WidthInBytes;  /**< Width of 3D memory copy in bytes */
+        unsigned int Height;        /**< Height of 3D memory copy */
+        unsigned int Depth;         /**< Depth of 3D memory copy */
+    } CUDA_MEMCPY3D_v1; /**< 3D memory copy parameters with unsigned int offsets, pitches and extents and ::CUdeviceptr_v1 device pointers; taken by the cuMemcpy3D and cuMemcpy3DAsync prototypes in this section */
+
+    typedef struct CUDA_ARRAY_DESCRIPTOR_v1_st
+    {
+        unsigned int Width;         /**< Width of array */
+        unsigned int Height;        /**< Height of array */
+
+        CUarray_format Format;      /**< Array format */
+        unsigned int NumChannels;   /**< Channels per array element */
+    } CUDA_ARRAY_DESCRIPTOR_v1; /**< Array descriptor with unsigned int dimensions; taken by the cuArrayCreate, cuArrayGetDescriptor and cuTexRefSetAddress2D prototypes in this section */
+
+    typedef struct CUDA_ARRAY3D_DESCRIPTOR_v1_st
+    {
+        unsigned int Width;         /**< Width of 3D array */
+        unsigned int Height;        /**< Height of 3D array */
+        unsigned int Depth;         /**< Depth of 3D array */
+
+        CUarray_format Format;      /**< Array format */
+        unsigned int NumChannels;   /**< Channels per array element */
+        unsigned int Flags;         /**< Flags */
+    } CUDA_ARRAY3D_DESCRIPTOR_v1; /**< 3D array descriptor with unsigned int dimensions; taken by the cuArray3DCreate and cuArray3DGetDescriptor prototypes in this section */
+
+    CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev);
+    CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
+    CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
+    CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total);
+    CUresult CUDAAPI cuMemAlloc(CUdeviceptr_v1 *dptr, unsigned int bytesize);
+    CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes);
+    CUresult CUDAAPI cuMemFree(CUdeviceptr_v1 dptr);
+    CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr);
+    CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize);
+    CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags);
+    CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D_v1 *pCopy);
+    CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D_v1 *pCopy);
+    CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D_v1 *pCopy);
+    CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD8(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N);
+    CUresult CUDAAPI cuMemsetD16(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N);
+    CUresult CUDAAPI cuMemsetD32(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N);
+    CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
+    CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
+    CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
+    CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray);
+    CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
+    CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray);
+    CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
+    CUresult CUDAAPI cuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes);
+    CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch);
+    CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr_v1 *pdptr, CUtexref hTexRef);
+    CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
+
+    CUresult CUDAAPI cuCtxDestroy(CUcontext ctx);
+    CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx);
+    CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx);
+    CUresult CUDAAPI cuStreamDestroy(CUstream hStream);
+    CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
+    CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev);
+    CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev);
+    CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags);
+
+    CUresult CUDAAPI cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyDtoA_v2(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyAtoD_v2(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyHtoA_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyAtoH_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyAtoA_v2(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyHtoAAsync_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyAtoHAsync_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy2D_v2(const CUDA_MEMCPY2D *pCopy);
+    CUresult CUDAAPI cuMemcpy2DUnaligned_v2(const CUDA_MEMCPY2D *pCopy);
+    CUresult CUDAAPI cuMemcpy3D_v2(const CUDA_MEMCPY3D *pCopy);
+    CUresult CUDAAPI cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyDtoDAsync_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy2DAsync_v2(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy3DAsync_v2(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD8_v2(CUdeviceptr dstDevice, unsigned char uc, size_t N);
+    CUresult CUDAAPI cuMemsetD16_v2(CUdeviceptr dstDevice, unsigned short us, size_t N);
+    CUresult CUDAAPI cuMemsetD32_v2(CUdeviceptr dstDevice, unsigned int ui, size_t N);
+    CUresult CUDAAPI cuMemsetD2D8_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
+    CUresult CUDAAPI cuMemsetD2D16_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
+    CUresult CUDAAPI cuMemsetD2D32_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
+    CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy);
+    CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyBatchAsync(CUdeviceptr *dsts, CUdeviceptr *srcs, size_t *sizes, size_t count,
+                                        CUmemcpyAttributes *attrs, size_t *attrsIdxs, size_t numAttrs,
+                                        size_t *failIdx, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy3DBatchAsync(size_t numOps, CUDA_MEMCPY3D_BATCH_OP *opList,
+                                          size_t *failIdx, unsigned long long flags, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
+
+    CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority);
+    CUresult CUDAAPI cuStreamGetId(CUstream hStream, unsigned long long *streamId);
+    CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags);
+    CUresult CUDAAPI cuStreamGetDevice(CUstream hStream, CUdevice *device);
+    CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx);
+    CUresult CUDAAPI cuStreamGetCtx_v2(CUstream hStream, CUcontext *pCtx, CUgreenCtx *pGreenCtx);
+    CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
+    CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
+    CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags);
+    CUresult CUDAAPI cuStreamQuery(CUstream hStream);
+    CUresult CUDAAPI cuStreamSynchronize(CUstream hStream);
+    CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream);
+    CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags);
+    CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
+    CUresult CUDAAPI cuLaunchKernelEx(const CUlaunchConfig *config, CUfunction f, void **kernelParams, void **extra);
+    CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData);
+    CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+    CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+    CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+
+    CUresult CUDAAPI cuStreamWriteValue32_ptsz(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamWaitValue32_ptsz(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamWriteValue64_ptsz(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamWaitValue64_ptsz(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamBatchMemOp_ptsz(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+
+    CUresult CUDAAPI cuStreamWriteValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamWaitValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamWriteValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamWaitValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamBatchMemOp_v2(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+    CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream);
+    CUresult CUDAAPI cuMemPrefetchAsync_v2(CUdeviceptr devPtr, size_t count, CUmemLocation location, unsigned int flags, CUstream hStream);
+    CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams);
+    CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream);
+    CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream);
+    CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream);
+    CUresult CUDAAPI cuStreamBeginCapture_ptsz(CUstream hStream);
+    CUresult CUDAAPI cuStreamBeginCapture_v2(CUstream hStream, CUstreamCaptureMode mode);
+    CUresult CUDAAPI cuStreamBeginCaptureToGraph(CUstream hStream, CUgraph hGraph, const CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, CUstreamCaptureMode mode);
+    CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph);
+    CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus);
+    CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
+    CUresult CUDAAPI cuStreamGetCaptureInfo_ptsz(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
+    CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
+    CUresult CUDAAPI cuStreamGetCaptureInfo_v3(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, const CUgraphEdgeData **edgeData_out, size_t *numDependencies_out);
+    CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
+    CUresult CUDAAPI cuGraphKernelNodeGetParams(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
+    CUresult CUDAAPI cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
+    CUresult CUDAAPI cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
+    CUresult CUDAAPI cuGraphInstantiateWithParams(CUgraphExec *phGraphExec, CUgraph hGraph, CUDA_GRAPH_INSTANTIATE_PARAMS *instantiateParams);
+    CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out);
+    CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraph, CUstream hStream);
+    CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraph, CUstream hStream);
+    CUresult CUDAAPI cuStreamCopyAttributes(CUstream dstStream, CUstream srcStream);
+    CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue *value);
+    CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue *param);
+
+    CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags);
+    CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
+    CUresult CUDAAPI cuGraphInstantiate_v2(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
+
+    CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList, unsigned int count, CUstream hStream);
+
+    CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream);
+    CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream);
+    CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
+
+    CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
+    CUresult CUDAAPI cuStreamUpdateCaptureDependencies_v2(CUstream hStream, CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, unsigned int flags);
+
+    CUresult CUDAAPI cuMemBatchDecompressAsync(
+        CUmemDecompressParams *paramsArray,
+        size_t count,
+        unsigned int flags,
+        size_t *errorIndex,
+        CUstream stream
+    );
+
+    CUresult CUDAAPI cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags);
+
+#elif defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM)
+static inline CUresult cuGetProcAddress_v2_ptsz(const char *symbol, void **funcPtr, int driverVersion, cuuint64_t flags, CUdriverProcAddressQueryResult *symbolStatus) {
+    /* Flag bits by which a caller explicitly selects the legacy or the per-thread default stream variant. */
+    const int streamModeBits = CU_GET_PROC_ADDRESS_LEGACY_STREAM | CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM;
+    /* Neither bit set: add PER_THREAD_DEFAULT_STREAM before forwarding; otherwise flags pass through unchanged. */
+    if (!(flags & streamModeBits))
+        flags |= CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM;
+    return cuGetProcAddress_v2(symbol, funcPtr, driverVersion, flags, symbolStatus);
+}
+#define cuGetProcAddress_v2 cuGetProcAddress_v2_ptsz
+#endif
+
+/**
+ * \defgroup CUDA_CHECKPOINT CUDA Checkpointing
+ *
+ * ___MANBRIEF___ CUDA checkpoint and restore functionality of the low-level
+ * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the checkpoint and restore functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * The CUDA checkpoint and restore APIs provide a way to save and restore GPU
+ * state for full process checkpoints when used with CPU side process
+ * checkpointing solutions. They can also be used to pause GPU work and suspend
+ * a CUDA process to allow other applications to make use of GPU resources.
+ *
+ * Checkpoint and restore capabilities are currently restricted to Linux.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the restore thread ID for a CUDA process
+ *
+ * Returns in \p *tid the thread ID of the CUDA restore thread for the process
+ * specified by \p pid.
+ *
+ * \param pid - The process ID of the CUDA process
+ * \param tid - Returned restore thread ID
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ */
+CUresult CUDAAPI cuCheckpointProcessGetRestoreThreadId(int pid, int *tid);
+
+/**
+ * \brief Returns the process state of a CUDA process
+ *
+ * Returns in \p *state the current state of the CUDA process specified by \p pid.
+ *
+ * \param pid - The process ID of the CUDA process
+ * \param state - Returned CUDA process state
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ */
+CUresult CUDAAPI cuCheckpointProcessGetState(int pid, CUprocessState *state);
+
+/**
+ * \brief Lock a running CUDA process
+ *
+ * Lock the CUDA process specified by \p pid which will block further CUDA API
+ * calls. Process must be in the RUNNING state in order to lock.
+ *
+ * Upon successful return the process will be in the LOCKED state.
+ *
+ * If timeoutMs is specified and the timeout is reached the process will be left
+ * in the RUNNING state upon return.
+ *
+ * \param pid - The process ID of the CUDA process
+ * \param args - Optional lock operation arguments
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * ::CUDA_ERROR_ILLEGAL_STATE
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * ::CUDA_ERROR_NOT_READY
+ */
+CUresult CUDAAPI cuCheckpointProcessLock(int pid, CUcheckpointLockArgs *args);
+
+/**
+ * \brief Checkpoint a CUDA process's GPU memory contents
+ *
+ * Checkpoints a CUDA process specified by \p pid that is in the LOCKED
+ * state. The GPU memory contents will be brought into host memory and all
+ * underlying references will be released. Process must be in the LOCKED state
+ * to checkpoint.
+ *
+ * Upon successful return the process will be in the CHECKPOINTED state.
+ *
+ * \param pid - The process ID of the CUDA process
+ * \param args - Optional checkpoint operation arguments
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * ::CUDA_ERROR_ILLEGAL_STATE
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ */
+CUresult CUDAAPI cuCheckpointProcessCheckpoint(int pid, CUcheckpointCheckpointArgs *args);
+
+/**
+ * \brief Restore a CUDA process's GPU memory contents from its last checkpoint
+ *
+ * Restores a CUDA process specified by \p pid from its last checkpoint. Process
+ * must be in the CHECKPOINTED state to restore.
+ *
+ * Upon successful return the process will be in the LOCKED state.
+ *
+ * CUDA process restore requires persistence mode to be enabled or ::cuInit to
+ * have been called before execution.
+ *
+ * \param pid - The process ID of the CUDA process
+ * \param args - Optional restore operation arguments
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * ::CUDA_ERROR_ILLEGAL_STATE
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \sa
+ * ::cuInit
+ */
+CUresult CUDAAPI cuCheckpointProcessRestore(int pid, CUcheckpointRestoreArgs *args);
+
+/**
+ * \brief Unlock a CUDA process to allow CUDA API calls
+ *
+ * Unlocks a process specified by \p pid allowing it to resume making CUDA API
+ * calls. Process must be in the LOCKED state.
+ *
+ * Upon successful return the process will be in the RUNNING state.
+ *
+ * \param pid - The process ID of the CUDA process
+ * \param args - Optional unlock operation arguments
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * ::CUDA_ERROR_ILLEGAL_STATE
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ */
+CUresult CUDAAPI cuCheckpointProcessUnlock(int pid, CUcheckpointUnlockArgs *args);
+
+/** @} */ /* End CUDA_CHECKPOINT */
+
+#ifdef __cplusplus
+}
+#endif
+
+#if defined(__GNUC__)
+  #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT)
+    #pragma GCC visibility pop
+  #endif
+#endif
+
+#undef __CUDA_DEPRECATED
+
+#endif /* __cuda_cuda_h__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaEGL.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaEGL.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3578faa0304289cdef811af509eded71691352a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaEGL.h
@@ -0,0 +1,662 @@
+/*
+ * Copyright 2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAEGL_H
+#define CUDAEGL_H
+
+#include "cuda.h"
+#include "EGL/egl.h"
+#include "EGL/eglext.h"
+
+
+#ifdef CUDA_FORCE_API_VERSION
+#error "CUDA_FORCE_API_VERSION is no longer supported."
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+  * \addtogroup CUDA_TYPES
+  * @{
+  */
+
+/**
+ * Maximum number of planes per frame
+ */
+#define MAX_PLANES 3
+
+/**
+  * CUDA EglFrame type - array or pointer
+  */
+typedef enum CUeglFrameType_enum {
+    CU_EGL_FRAME_TYPE_ARRAY = 0,  /**< Frame is provided as a CUDA array */
+    CU_EGL_FRAME_TYPE_PITCH = 1   /**< Frame is provided as a pointer */
+} CUeglFrameType;
+
+/**
+ * Indicates that timeout for ::cuEGLStreamConsumerAcquireFrame is infinite.
+ */
+#define CUDA_EGL_INFINITE_TIMEOUT 0xFFFFFFFF
+
+/**
+ * Resource location flags - sysmem or vidmem
+ *
+ * For CUDA context on iGPU, since video and system memory are equivalent -
+ * these flags will not have an effect on the execution.
+ *
+ * For CUDA context on dGPU, applications can use the flag ::CUeglResourceLocationFlags
+ * to give a hint about the desired location.
+ *
+ * ::CU_EGL_RESOURCE_LOCATION_SYSMEM - the frame data is made resident on the system memory
+ * to be accessed by CUDA.
+ *
+ * ::CU_EGL_RESOURCE_LOCATION_VIDMEM - the frame data is made resident on the dedicated
+ * video memory to be accessed by CUDA.
+ *
+ * There may be an additional latency due to new allocation and data migration,
+ * if the frame is produced on a different memory.
+
+  */
+typedef enum CUeglResourceLocationFlags_enum {
+    CU_EGL_RESOURCE_LOCATION_SYSMEM = 0x00,  /**< Frame data is made resident in system memory */
+    CU_EGL_RESOURCE_LOCATION_VIDMEM = 0x01   /**< Frame data is made resident in dedicated video memory */
+} CUeglResourceLocationFlags;
+
+/**
+  * CUDA EGL Color Format - The different planar and multiplanar formats currently supported for CUDA_EGL interops.
+  * Three channel formats are currently not supported for ::CU_EGL_FRAME_TYPE_ARRAY
+  */
+typedef enum CUeglColorFormat_enum {
+    CU_EGL_COLOR_FORMAT_YUV420_PLANAR              = 0x00,  /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR          = 0x01,  /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV420Planar. */
+    CU_EGL_COLOR_FORMAT_YUV422_PLANAR              = 0x02,  /**< Y, U, V  each in a separate  surface, U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YUV422_SEMIPLANAR          = 0x03,  /**< Y, UV in two surfaces with VU byte ordering, width, height ratio same as YUV422Planar. */
+    CU_EGL_COLOR_FORMAT_RGB                        = 0x04,  /**< R/G/B three channels in one surface with BGR byte ordering. Only pitch linear format supported. */
+    CU_EGL_COLOR_FORMAT_BGR                        = 0x05,  /**< R/G/B three channels in one surface with RGB byte ordering. Only pitch linear format supported. */
+    CU_EGL_COLOR_FORMAT_ARGB                       = 0x06,  /**< R/G/B/A four channels in one surface with BGRA byte ordering. */
+    CU_EGL_COLOR_FORMAT_RGBA                       = 0x07,  /**< R/G/B/A four channels in one surface with ABGR byte ordering. */
+    CU_EGL_COLOR_FORMAT_L                          = 0x08,  /**< single luminance channel in one surface. */
+    CU_EGL_COLOR_FORMAT_R                          = 0x09,  /**< single color channel in one surface. */
+    CU_EGL_COLOR_FORMAT_YUV444_PLANAR              = 0x0A,  /**< Y, U, V in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YUV444_SEMIPLANAR          = 0x0B,  /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV444Planar. */
+    CU_EGL_COLOR_FORMAT_YUYV_422                   = 0x0C,  /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
+    CU_EGL_COLOR_FORMAT_UYVY_422                   = 0x0D,  /**< Y, U, V in one surface, interleaved as YUYV in one channel. */
+    CU_EGL_COLOR_FORMAT_ABGR                       = 0x0E,  /**< R/G/B/A four channels in one surface with RGBA byte ordering. */
+    CU_EGL_COLOR_FORMAT_BGRA                       = 0x0F,  /**< R/G/B/A four channels in one surface with ARGB byte ordering. */
+    CU_EGL_COLOR_FORMAT_A                          = 0x10,  /**< Alpha color format - one channel in one surface. */
+    CU_EGL_COLOR_FORMAT_RG                         = 0x11,  /**< R/G color format - two channels in one surface with GR byte ordering */
+    CU_EGL_COLOR_FORMAT_AYUV                       = 0x12,  /**< Y, U, V, A four channels in one surface, interleaved as VUYA. */
+    CU_EGL_COLOR_FORMAT_YVU444_SEMIPLANAR          = 0x13,  /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU422_SEMIPLANAR          = 0x14,  /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR          = 0x15,  /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_444_SEMIPLANAR   = 0x16,  /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR   = 0x17,  /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y12V12U12_444_SEMIPLANAR   = 0x18,  /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_Y12V12U12_420_SEMIPLANAR   = 0x19,  /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_VYUY_ER                    = 0x1A,  /**< Extended Range Y, U, V in one surface, interleaved as YVYU in one channel. */
+    CU_EGL_COLOR_FORMAT_UYVY_ER                    = 0x1B,  /**< Extended Range Y, U, V in one surface, interleaved as YUYV in one channel. */
+    CU_EGL_COLOR_FORMAT_YUYV_ER                    = 0x1C,  /**< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. */
+    CU_EGL_COLOR_FORMAT_YVYU_ER                    = 0x1D,  /**< Extended Range Y, U, V in one surface, interleaved as VYUY in one channel. */
+    CU_EGL_COLOR_FORMAT_YUV_ER                     = 0x1E,  /**< Extended Range Y, U, V three channels in one surface, interleaved as VUY. Only pitch linear format supported. */
+    CU_EGL_COLOR_FORMAT_YUVA_ER                    = 0x1F,  /**< Extended Range Y, U, V, A four channels in one surface, interleaved as AVUY. */
+    CU_EGL_COLOR_FORMAT_AYUV_ER                    = 0x20,  /**< Extended Range Y, U, V, A four channels in one surface, interleaved as VUYA. */
+    CU_EGL_COLOR_FORMAT_YUV444_PLANAR_ER           = 0x21,  /**< Extended Range Y, U, V in three surfaces, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YUV422_PLANAR_ER           = 0x22,  /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YUV420_PLANAR_ER           = 0x23,  /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YUV444_SEMIPLANAR_ER       = 0x24,  /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YUV422_SEMIPLANAR_ER       = 0x25,  /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR_ER       = 0x26,  /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YVU444_PLANAR_ER           = 0x27,  /**< Extended Range Y, V, U in three surfaces, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU422_PLANAR_ER           = 0x28,  /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_PLANAR_ER           = 0x29,  /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YVU444_SEMIPLANAR_ER       = 0x2A,  /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU422_SEMIPLANAR_ER       = 0x2B,  /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_ER       = 0x2C,  /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_BAYER_RGGB                 = 0x2D,  /**< Bayer format - one channel in one surface with interleaved RGGB ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER_BGGR                 = 0x2E,  /**< Bayer format - one channel in one surface with interleaved BGGR ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER_GRBG                 = 0x2F,  /**< Bayer format - one channel in one surface with interleaved GRBG ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER_GBRG                 = 0x30,  /**< Bayer format - one channel in one surface with interleaved GBRG ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER10_RGGB               = 0x31,  /**< Bayer10 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER10_BGGR               = 0x32,  /**< Bayer10 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER10_GRBG               = 0x33,  /**< Bayer10 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER10_GBRG               = 0x34,  /**< Bayer10 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_RGGB               = 0x35,  /**< Bayer12 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_BGGR               = 0x36,  /**< Bayer12 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_GRBG               = 0x37,  /**< Bayer12 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_GBRG               = 0x38,  /**< Bayer12 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER14_RGGB               = 0x39,  /**< Bayer14 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER14_BGGR               = 0x3A,  /**< Bayer14 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER14_GRBG               = 0x3B,  /**< Bayer14 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER14_GBRG               = 0x3C,  /**< Bayer14 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER20_RGGB               = 0x3D,  /**< Bayer20 format - one channel in one surface with interleaved RGGB ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER20_BGGR               = 0x3E,  /**< Bayer20 format - one channel in one surface with interleaved BGGR ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER20_GRBG               = 0x3F,  /**< Bayer20 format - one channel in one surface with interleaved GRBG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER20_GBRG               = 0x40,  /**< Bayer20 format - one channel in one surface with interleaved GBRG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    CU_EGL_COLOR_FORMAT_YVU444_PLANAR              = 0x41,  /**< Y, V, U in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU422_PLANAR              = 0x42,  /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_PLANAR              = 0x43,  /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_BAYER_ISP_RGGB             = 0x44,  /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved RGGB ordering and mapped to opaque integer datatype. */
+    CU_EGL_COLOR_FORMAT_BAYER_ISP_BGGR             = 0x45,  /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved BGGR ordering and mapped to opaque integer datatype. */
+    CU_EGL_COLOR_FORMAT_BAYER_ISP_GRBG             = 0x46,  /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GRBG ordering and mapped to opaque integer datatype. */
+    CU_EGL_COLOR_FORMAT_BAYER_ISP_GBRG             = 0x47,  /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GBRG ordering and mapped to opaque integer datatype. */
+    CU_EGL_COLOR_FORMAT_BAYER_BCCR                 = 0x48,  /**< Bayer format - one channel in one surface with interleaved BCCR ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER_RCCB                 = 0x49,  /**< Bayer format - one channel in one surface with interleaved RCCB ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER_CRBC                 = 0x4A,  /**< Bayer format - one channel in one surface with interleaved CRBC ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER_CBRC                 = 0x4B,  /**< Bayer format - one channel in one surface with interleaved CBRC ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER10_CCCC               = 0x4C,  /**< Bayer10 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_BCCR               = 0x4D,  /**< Bayer12 format - one channel in one surface with interleaved BCCR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_RCCB               = 0x4E,  /**< Bayer12 format - one channel in one surface with interleaved RCCB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_CRBC               = 0x4F,  /**< Bayer12 format - one channel in one surface with interleaved CRBC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_CBRC               = 0x50,  /**< Bayer12 format - one channel in one surface with interleaved CBRC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_CCCC               = 0x51,  /**< Bayer12 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_Y                          = 0x52, /**< Color format for single Y plane. */
+    CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR_2020     = 0x53, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_2020     = 0x54, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YUV420_PLANAR_2020         = 0x55, /**< Y, U, V  each in a separate  surface, U/V width = 1/2 Y width, U/V height= 1/2 Y height. */             
+    CU_EGL_COLOR_FORMAT_YVU420_PLANAR_2020         = 0x56, /**< Y, V, U each in a separate surface, U/V width = 1/2 Y width, U/V height
+= 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR_709      = 0x57, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_709      = 0x58, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YUV420_PLANAR_709          = 0x59, /**< Y, U, V  each in a separate  surface, U/V width = 1/2 Y width, U/V height
+= 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_PLANAR_709          = 0x5A,  /**< Y, V, U each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_709  = 0x5B, /**< Y10, V10U10 in two surfaces (VU as one surface), U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_2020 = 0x5C, /**< Y10, V10U10 in two surfaces (VU as one surface), U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_422_SEMIPLANAR_2020 = 0x5D, /**< Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height  = Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_422_SEMIPLANAR      = 0x5E, /**< Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height  = Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_422_SEMIPLANAR_709  = 0x5F, /**< Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height  = Y height. */
+    CU_EGL_COLOR_FORMAT_Y_ER                          = 0x60, /**< Extended Range Color format for single Y plane. */
+    CU_EGL_COLOR_FORMAT_Y_709_ER                      = 0x61, /**< Extended Range Color format for single Y plane. */
+    CU_EGL_COLOR_FORMAT_Y10_ER                        = 0x62, /**< Extended Range Color format for single Y10 plane. */
+    CU_EGL_COLOR_FORMAT_Y10_709_ER                    = 0x63, /**< Extended Range Color format for single Y10 plane. */
+    CU_EGL_COLOR_FORMAT_Y12_ER                        = 0x64, /**< Extended Range Color format for single Y12 plane. */
+    CU_EGL_COLOR_FORMAT_Y12_709_ER                    = 0x65, /**< Extended Range Color format for single Y12 plane. */
+    CU_EGL_COLOR_FORMAT_YUVA                          = 0x66, /**< Y, U, V, A four channels in one surface, interleaved as AVUY. */
+    CU_EGL_COLOR_FORMAT_YUV                           = 0x67, /**< Y, U, V three channels in one surface, interleaved as VUY. Only pitch linear format supported. */
+    CU_EGL_COLOR_FORMAT_YVYU                          = 0x68, /**< Y, U, V in one surface, interleaved as YVYU in one channel. */
+    CU_EGL_COLOR_FORMAT_VYUY                          = 0x69, /**< Y, U, V in one surface, interleaved as VYUY in one channel. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_ER     = 0x6A, /**< Extended Range Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_709_ER = 0x6B, /**< Extended Range Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_444_SEMIPLANAR_ER     = 0x6C, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */ 
+    CU_EGL_COLOR_FORMAT_Y10V10U10_444_SEMIPLANAR_709_ER = 0x6D, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface)  U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_Y12V12U12_420_SEMIPLANAR_ER     = 0x6E, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */ 
+    CU_EGL_COLOR_FORMAT_Y12V12U12_420_SEMIPLANAR_709_ER = 0x6F, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y12V12U12_444_SEMIPLANAR_ER     = 0x70, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */ 
+    CU_EGL_COLOR_FORMAT_Y12V12U12_444_SEMIPLANAR_709_ER = 0x71, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_UYVY_709                        = 0x72, /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
+    CU_EGL_COLOR_FORMAT_UYVY_709_ER                     = 0x73, /**< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. */
+    CU_EGL_COLOR_FORMAT_UYVY_2020                       = 0x74, /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
+    CU_EGL_COLOR_FORMAT_MAX
+} CUeglColorFormat;
+
+/**
+ * CUDA EGLFrame structure Descriptor - structure defining one frame of EGL.
+ *
+ * Each frame may contain one or more planes depending on whether the surface is Multiplanar or not.
+ */
+typedef struct CUeglFrame_st {
+    union {
+        CUarray pArray[MAX_PLANES];     /**< Array of CUarray corresponding to each plane */
+        void*   pPitch[MAX_PLANES];     /**< Array of Pointers corresponding to each plane */
+    } frame;
+    unsigned int width;                 /**< Width of first plane */
+    unsigned int height;                /**< Height of first plane */
+    unsigned int depth;                 /**< Depth of first plane */
+    unsigned int pitch;                 /**< Pitch of first plane */
+    unsigned int planeCount;            /**< Number of planes */
+    unsigned int numChannels;           /**< Number of channels for the plane */
+    CUeglFrameType frameType;           /**< Array or Pitch */
+    CUeglColorFormat eglColorFormat;    /**< CUDA EGL Color Format */
+    CUarray_format cuFormat;            /**< CUDA Array Format */
+} CUeglFrame_v1;
+typedef CUeglFrame_v1 CUeglFrame;
+
+/**
+  * CUDA EGLStream connection handle
+  */
+typedef struct CUeglStreamConnection_st* CUeglStreamConnection;
+
+/** @} */ /* END CUDA_TYPES */
+
+/**
+ * \file cudaEGL.h
+ * \brief Header file for the EGL interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ */
+
+/**
+ * \defgroup CUDA_EGL EGL Interoperability
+ * \ingroup CUDA_DRIVER
+ *
+ * ___MANBRIEF___ EGL interoperability functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the EGL interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Registers an EGL image
+ *
+ * Registers the EGLImageKHR specified by \p image for access by
+ * CUDA. A handle to the registered object is returned as \p pCudaResource.
+ * Additional Mapping/Unmapping is not required for the registered resource and
+ * ::cuGraphicsResourceGetMappedEglFrame can be directly called on the \p pCudaResource.
+ *
+ * The application will be responsible for synchronizing access to shared objects.
+ * The application must ensure that any pending operations which access the objects have completed
+ * before passing control to CUDA. This may be accomplished by issuing and waiting for
+ * glFinish command on all GLcontexts (for OpenGL and likewise for other APIs).
+ * The application will also be responsible for ensuring that any pending operation on the
+ * registered CUDA resource has completed prior to executing subsequent commands in other APIs
+ * accessing the same memory objects.
+ * This can be accomplished by calling cuCtxSynchronize or cuEventSynchronize (preferably).
+ *
+ * The surface's intended usage is specified using \p flags, as follows:
+ *
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * The EGLImageKHR is an object which can be used to create EGLImage target resource. It is defined as a void pointer.
+ * typedef void* EGLImageKHR
+ *
+ * \param pCudaResource   - Pointer to the returned object handle
+ * \param image           - An EGLImageKHR image which can be used to create target resource.
+ * \param flags           - Map flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ *
+ * \sa ::cuGraphicsEGLRegisterImage, ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
+ * ::cuGraphicsUnmapResources,
+ * ::cudaGraphicsEGLRegisterImage
+ */
+CUresult CUDAAPI cuGraphicsEGLRegisterImage(CUgraphicsResource *pCudaResource, EGLImageKHR image, unsigned int flags);
+
+/**
+ * \brief Connect CUDA to EGLStream as a consumer.
+ *
+ * Connect CUDA as a consumer to the EGLStreamKHR specified by \p stream.
+ *
+ * The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
+ * API to another.
+ *
+ * \param conn            - Pointer to the returned connection handle
+ * \param stream          - EGLStreamKHR handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ *
+ * \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
+ * ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
+ * ::cudaEGLStreamConsumerConnect
+ */
+CUresult CUDAAPI cuEGLStreamConsumerConnect(CUeglStreamConnection *conn, EGLStreamKHR stream);
+
+/**
+ * \brief Connect CUDA to EGLStream as a consumer with given flags.
+ *
+ * Connect CUDA as a consumer to the EGLStreamKHR specified by \p stream with the specified \p flags defined by CUeglResourceLocationFlags.
+ *
+ * The flags specify whether the consumer wants to access frames from system memory or video memory.
+ * Default is ::CU_EGL_RESOURCE_LOCATION_VIDMEM.
+ *
+ * \param conn              - Pointer to the returned connection handle
+ * \param stream            - EGLStreamKHR handle
+ * \param flags             - Flags denoting the intended frame location - system or video memory.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ *
+ * \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
+ * ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
+ * ::cudaEGLStreamConsumerConnectWithFlags
+ */
+
+CUresult CUDAAPI cuEGLStreamConsumerConnectWithFlags(CUeglStreamConnection *conn, EGLStreamKHR stream, unsigned int flags);
+
+/**
+ * \brief Disconnect CUDA as a consumer to EGLStream.
+ *
+ * Disconnect CUDA as a consumer to EGLStreamKHR.
+ *
+ * \param conn            - Connection to disconnect.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ *
+ * \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
+ * ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
+ * ::cudaEGLStreamConsumerDisconnect
+ */
+CUresult CUDAAPI cuEGLStreamConsumerDisconnect(CUeglStreamConnection *conn);
+
+/**
+ * \brief Acquire an image frame from the EGLStream with CUDA as a consumer.
+ *
+ * Acquire an image frame from EGLStreamKHR. This API can also acquire an old frame presented
+ * by the producer unless explicitly disabled by setting EGL_SUPPORT_REUSE_NV flag to EGL_FALSE
+ * during stream initialization. By default, EGLStream is created with this flag set to EGL_TRUE.
+ * ::cuGraphicsResourceGetMappedEglFrame can be called on \p pCudaResource to get
+ * ::CUeglFrame.
+ *
+ * \param conn            - Connection on which to acquire
+ * \param pCudaResource   - CUDA resource on which the stream frame will be mapped for use.
+ * \param pStream         - CUDA stream for synchronization and any data migrations
+ *                          implied by ::CUeglResourceLocationFlags.
+ * \param timeout         - Desired timeout in usec for a new frame to be acquired.
+ *                          If set as ::CUDA_EGL_INFINITE_TIMEOUT, acquire waits indefinitely.
+ *                          After the timeout occurs, the CUDA consumer tries to acquire an old frame
+ *                          if available and EGL_SUPPORT_REUSE_NV flag is set.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ *
+ * \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
+ * ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
+ * ::cudaEGLStreamConsumerAcquireFrame
+ */
+CUresult CUDAAPI cuEGLStreamConsumerAcquireFrame(CUeglStreamConnection *conn,
+                                                  CUgraphicsResource *pCudaResource, CUstream *pStream, unsigned int timeout);
+/**
+ * \brief Releases the last frame acquired from the EGLStream.
+ *
+ * Release the acquired image frame specified by \p pCudaResource to EGLStreamKHR.
+ * If EGL_SUPPORT_REUSE_NV flag is set to EGL_TRUE at the time of EGLStream creation,
+ * this API doesn't release the last frame acquired on the EGLStream.
+ * By default, EGLStream is created with this flag set to EGL_TRUE.
+ *
+ * \param conn            - Connection on which to release
+ * \param pCudaResource   - CUDA resource whose corresponding frame is to be released
+ * \param pStream         - CUDA stream on which release will be done.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ *
+ * \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
+ * ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
+ * ::cudaEGLStreamConsumerReleaseFrame
+ */
+CUresult CUDAAPI cuEGLStreamConsumerReleaseFrame(CUeglStreamConnection *conn,
+                                                  CUgraphicsResource pCudaResource, CUstream *pStream);
+
+/**
+ * \brief Connect CUDA to EGLStream as a producer.
+ *
+ * Connect CUDA as a producer to the EGLStreamKHR specified by \p stream.
+ *
+ * The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
+ * API to another.
+ *
+ * \param conn   - Pointer to the returned connection handle
+ * \param stream - EGLStreamKHR handle
+ * \param width  - Width of the image to be submitted to the stream
+ * \param height - Height of the image to be submitted to the stream
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ *
+ * \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
+ * ::cuEGLStreamProducerPresentFrame,
+ * ::cudaEGLStreamProducerConnect
+ */
+CUresult CUDAAPI cuEGLStreamProducerConnect(CUeglStreamConnection *conn, EGLStreamKHR stream,
+                                             EGLint width, EGLint height);
+
+/**
+ * \brief Disconnect CUDA as a producer to EGLStream.
+ *
+ * Disconnect CUDA as a producer to EGLStreamKHR.
+ *
+ * \param conn            - Connection to disconnect.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ *
+ * \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
+ * ::cuEGLStreamProducerPresentFrame,
+ * ::cudaEGLStreamProducerDisconnect
+ */
+CUresult CUDAAPI cuEGLStreamProducerDisconnect(CUeglStreamConnection *conn);
+
+/**
+ * \brief Present a CUDA eglFrame to the EGLStream with CUDA as a producer.
+ *
+ * When a frame is presented by the producer, it gets associated with the EGLStream
+ * and thus it is illegal to free the frame before the producer is disconnected.
+ * If a frame is freed and reused it may lead to undefined behavior.
+ *
+ * If producer and consumer are on different GPUs (iGPU and dGPU) then frametype
+ * ::CU_EGL_FRAME_TYPE_ARRAY is not supported. ::CU_EGL_FRAME_TYPE_PITCH can be used for
+ * such cross-device applications.
+ *
+ * The ::CUeglFrame is defined as:
+ * \code
+ * typedef struct CUeglFrame_st {
+ *     union {
+ *         CUarray pArray[MAX_PLANES];
+ *         void*   pPitch[MAX_PLANES];
+ *     } frame;
+ *     unsigned int width;
+ *     unsigned int height;
+ *     unsigned int depth;
+ *     unsigned int pitch;
+ *     unsigned int planeCount;
+ *     unsigned int numChannels;
+ *     CUeglFrameType frameType;
+ *     CUeglColorFormat eglColorFormat;
+ *     CUarray_format cuFormat;
+ * } CUeglFrame;
+ * \endcode
+ *
+ * For ::CUeglFrame of type ::CU_EGL_FRAME_TYPE_PITCH, the application may present a sub-region of a memory
+ * allocation. In that case, the pitched pointer will specify the start address of the sub-region in
+ * the allocation and the corresponding ::CUeglFrame fields will specify the dimensions of the sub-region.
+ *
+ * \param conn            - Connection on which to present the CUDA array
+ * \param eglframe        - CUDA EGLStream Producer Frame handle to be sent to the consumer over EGLStream.
+ * \param pStream         - CUDA stream on which to present the frame.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ *
+ * \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
+ * ::cuEGLStreamProducerReturnFrame,
+ * ::cudaEGLStreamProducerPresentFrame
+ */
+CUresult CUDAAPI cuEGLStreamProducerPresentFrame(CUeglStreamConnection *conn,
+                                                 CUeglFrame eglframe, CUstream *pStream);
+
+/**
+ * \brief Return the CUDA eglFrame to the EGLStream released by the consumer.
+ *
+ * This API can potentially return CUDA_ERROR_LAUNCH_TIMEOUT if the consumer has not
+ * returned a frame to the EGL stream. If a timeout is returned, the application can retry.
+ *
+ * \param conn            - Connection on which to return
+ * \param eglframe        - CUDA EGLStream Producer Frame handle returned from the consumer over EGLStream.
+ * \param pStream         - CUDA stream on which to return the frame.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT
+ *
+ * \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
+ * ::cuEGLStreamProducerPresentFrame,
+ * ::cudaEGLStreamProducerReturnFrame
+ */
+CUresult CUDAAPI cuEGLStreamProducerReturnFrame(CUeglStreamConnection *conn,
+                                                CUeglFrame *eglframe, CUstream *pStream);
+
+/**
+ * \brief Get an eglFrame through which to access a registered EGL graphics resource.
+ *
+ * Returns in \p *eglFrame an eglFrame pointer through which the registered graphics resource
+ * \p resource may be accessed.
+ * This API can only be called for registered EGL graphics resources.
+ *
+ * The ::CUeglFrame is defined as:
+ * \code
+ * typedef struct CUeglFrame_st {
+ *     union {
+ *         CUarray pArray[MAX_PLANES];
+ *         void*   pPitch[MAX_PLANES];
+ *     } frame;
+ *     unsigned int width;
+ *     unsigned int height;
+ *     unsigned int depth;
+ *     unsigned int pitch;
+ *     unsigned int planeCount;
+ *     unsigned int numChannels;
+ *     CUeglFrameType frameType;
+ *     CUeglColorFormat eglColorFormat;
+ *     CUarray_format cuFormat;
+ * } CUeglFrame;
+ * \endcode
+ *
+ * If \p resource is not registered then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * \param eglFrame   - Returned eglFrame.
+ * \param resource   - Registered resource to access.
+ * \param index      - Index for cubemap surfaces.
+ * \param mipLevel   - Mipmap level for the subresource to access.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED
+ *
+ * \sa
+ * ::cuGraphicsMapResources,
+ * ::cuGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsResourceGetMappedPointer,
+ * ::cudaGraphicsResourceGetMappedEglFrame
+ */
+CUresult CUDAAPI cuGraphicsResourceGetMappedEglFrame(CUeglFrame* eglFrame, CUgraphicsResource resource, unsigned int index, unsigned int mipLevel);
+
+/**
+ * \brief Creates an event from EGLSync object
+ *
+ * Creates an event \p *phEvent from an EGLSyncKHR \p eglSync with the flags specified
+ * via \p flags. Valid flags include:
+ * - ::CU_EVENT_DEFAULT: Default event creation flag.
+ * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking
+ * synchronization.  A CPU thread that uses ::cuEventSynchronize() to wait on
+ * an event created with this flag will block until the event has actually
+ * been completed.
+ *
+ * Once the \p eglSync gets destroyed, ::cuEventDestroy is the only API
+ * that can be invoked on the event.
+ *
+ * ::cuEventRecord and TimingData are not supported for events created from EGLSync.
+ *
+ * The EGLSyncKHR is an opaque handle to an EGL sync object.
+ * typedef void* EGLSyncKHR
+ *
+ * \param phEvent - Returns newly created event
+ * \param eglSync - Opaque handle to EGLSync object
+ * \param flags   - Event creation flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa
+ * ::cuEventQuery,
+ * ::cuEventSynchronize,
+ * ::cuEventDestroy
+ */
+CUresult CUDAAPI cuEventCreateFromEGLSync(CUevent *phEvent, EGLSyncKHR eglSync, unsigned int flags);
+
+/** @} */ /* END CUDA_EGL */
+
+#ifdef __cplusplus
+};
+#endif
+
+#endif
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaEGLTypedefs.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaEGLTypedefs.h
new file mode 100644
index 0000000000000000000000000000000000000000..61b82337dc4bb280869934b11c2105db62ae20c3
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaEGLTypedefs.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2020-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAEGLTYPEDEFS_H
+#define CUDAEGLTYPEDEFS_H
+
+#include <cudaEGL.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/*
+ * Macros aliasing each unversioned PFN_* name to the latest versioned function-pointer type for the driver functions in cudaEGL.h
+ */
+#define PFN_cuGraphicsEGLRegisterImage  PFN_cuGraphicsEGLRegisterImage_v7000
+#define PFN_cuEGLStreamConsumerConnect  PFN_cuEGLStreamConsumerConnect_v7000
+#define PFN_cuEGLStreamConsumerConnectWithFlags  PFN_cuEGLStreamConsumerConnectWithFlags_v8000
+#define PFN_cuEGLStreamConsumerDisconnect  PFN_cuEGLStreamConsumerDisconnect_v7000
+#define PFN_cuEGLStreamConsumerAcquireFrame  PFN_cuEGLStreamConsumerAcquireFrame_v7000
+#define PFN_cuEGLStreamConsumerReleaseFrame  PFN_cuEGLStreamConsumerReleaseFrame_v7000
+#define PFN_cuEGLStreamProducerConnect  PFN_cuEGLStreamProducerConnect_v7000
+#define PFN_cuEGLStreamProducerDisconnect  PFN_cuEGLStreamProducerDisconnect_v7000
+#define PFN_cuEGLStreamProducerPresentFrame  PFN_cuEGLStreamProducerPresentFrame_v7000
+#define PFN_cuEGLStreamProducerReturnFrame  PFN_cuEGLStreamProducerReturnFrame_v7000
+#define PFN_cuGraphicsResourceGetMappedEglFrame  PFN_cuGraphicsResourceGetMappedEglFrame_v7000
+#define PFN_cuEventCreateFromEGLSync  PFN_cuEventCreateFromEGLSync_v9000
+
+
+/**
+ * Function-pointer type definitions for the functions declared in cudaEGL.h
+ */
+typedef CUresult (CUDAAPI *PFN_cuGraphicsEGLRegisterImage_v7000)(CUgraphicsResource CUDAAPI *pCudaResource, EGLImageKHR image, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerConnect_v7000)(CUeglStreamConnection CUDAAPI *conn, EGLStreamKHR stream);
+typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerConnectWithFlags_v8000)(CUeglStreamConnection CUDAAPI *conn, EGLStreamKHR stream, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerDisconnect_v7000)(CUeglStreamConnection CUDAAPI *conn);
+typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerAcquireFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUgraphicsResource CUDAAPI *pCudaResource, CUstream CUDAAPI *pStream, unsigned int timeout);
+typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerReleaseFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUgraphicsResource pCudaResource, CUstream CUDAAPI *pStream);
+typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerConnect_v7000)(CUeglStreamConnection CUDAAPI *conn, EGLStreamKHR stream, EGLint width, EGLint height);
+typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerDisconnect_v7000)(CUeglStreamConnection CUDAAPI *conn);
+typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerPresentFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUeglFrame_v1 eglframe, CUstream CUDAAPI *pStream);
+typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerReturnFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUeglFrame_v1 CUDAAPI *eglframe, CUstream CUDAAPI *pStream);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedEglFrame_v7000)(CUeglFrame_v1 CUDAAPI *eglFrame, CUgraphicsResource resource, unsigned int index, unsigned int mipLevel);
+typedef CUresult (CUDAAPI *PFN_cuEventCreateFromEGLSync_v9000)(CUevent CUDAAPI *phEvent, EGLSyncKHR eglSync, unsigned int flags);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // file guard
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaGL.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaGL.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a9c70e881774c8f3cf8b6430e7aa53a98d74669
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaGL.h
@@ -0,0 +1,608 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAGL_H
+#define CUDAGL_H
+
+#include <cuda.h>
+#include <GL/gl.h>
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED /* expands to nothing: deprecated prototypes carry no attribute in these builds */
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated) /* MSVC spelling of the deprecation marker */
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated)) /* GNU-style spelling of the deprecation marker */
+#else
+#define __CUDA_DEPRECATED /* compiler not recognized above: expands to nothing */
+#endif
+
+#ifdef CUDA_FORCE_API_VERSION
+#error "CUDA_FORCE_API_VERSION is no longer supported."
+#endif
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
+    #define __CUDA_API_PER_THREAD_DEFAULT_STREAM
+    #define __CUDA_API_PTDS(api) api ## _ptds /* token-paste: <api>_ptds */
+    #define __CUDA_API_PTSZ(api) api ## _ptsz /* token-paste: <api>_ptsz */
+#else
+    #define __CUDA_API_PTDS(api) api /* identity: name is left unsuffixed */
+    #define __CUDA_API_PTSZ(api) api /* identity: name is left unsuffixed */
+#endif
+
+#define cuGLCtxCreate            cuGLCtxCreate_v2 /* public name resolves to the _v2 symbol */
+#define cuGLMapBufferObject      __CUDA_API_PTDS(cuGLMapBufferObject_v2) /* _v2, plus _ptds when __CUDA_API_PTDS appends it */
+#define cuGLMapBufferObjectAsync __CUDA_API_PTSZ(cuGLMapBufferObjectAsync_v2) /* _v2, plus _ptsz when __CUDA_API_PTSZ appends it */
+#define cuGLGetDevices           cuGLGetDevices_v2 /* public name resolves to the _v2 symbol */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \file cudaGL.h
+ * \brief Header file for the OpenGL interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ */
+
+/**
+ * \defgroup CUDA_GL OpenGL Interoperability
+ * \ingroup CUDA_DRIVER
+ *
+ * ___MANBRIEF___ OpenGL interoperability functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the OpenGL interoperability functions of the
+ * low-level CUDA driver application programming interface. Note that mapping 
+ * of OpenGL resources is performed with the graphics API agnostic, resource 
+ * mapping interface described in \ref CUDA_GRAPHICS "Graphics Interoperability".
+ *
+ * @{
+ */
+
+#if defined(_WIN32)
+#if !defined(WGL_NV_gpu_affinity)
+typedef void* HGPUNV; /* fallback opaque handle type, used by cuWGLGetDevice, when WGL_NV_gpu_affinity is not defined */
+#endif /* !WGL_NV_gpu_affinity */
+#endif /* _WIN32 */
+
+/**
+ * \brief Registers an OpenGL buffer object
+ *
+ * Registers the buffer object specified by \p buffer for access by
+ * CUDA.  A handle to the registered object is returned as \p
+ * pCudaResource.  The register flags \p Flags specify the intended usage,
+ * as follows:
+ *
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * \param pCudaResource - Pointer to the returned object handle
+ * \param buffer - name of buffer object to be registered
+ * \param Flags - Register flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_OPERATING_SYSTEM
+ * \notefnerr
+ *
+ * \sa 
+ * ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsMapResources,
+ * ::cuGraphicsResourceGetMappedPointer,
+ * ::cudaGraphicsGLRegisterBuffer
+ */
+CUresult CUDAAPI cuGraphicsGLRegisterBuffer(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
+
+/**
+ * \brief Register an OpenGL texture or renderbuffer object
+ *
+ * Registers the texture or renderbuffer object specified by \p image for access by CUDA.  
+ * A handle to the registered object is returned as \p pCudaResource.  
+ *
+ * \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D, 
+ * ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY, 
+ * or ::GL_RENDERBUFFER.
+ *
+ * The register flags \p Flags specify the intended usage, as follows:
+ *
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will
+ *   bind this resource to a surface reference.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform
+ *   texture gather operations on this resource.
+ *
+ * The following image formats are supported. For brevity's sake, the list is abbreviated.
+ * For ex., {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats 
+ * {GL_R8, GL_R16, GL_RG8, GL_RG16} :
+ * - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY
+ * - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I}
+ * - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X
+ * {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT}
+ *
+ * The following image classes are currently disallowed:
+ * - Textures with borders
+ * - Multisampled renderbuffers
+ *
+ * \param pCudaResource - Pointer to the returned object handle
+ * \param image - name of texture or renderbuffer object to be registered
+ * \param target - Identifies the type of object specified by \p image
+ * \param Flags - Register flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_OPERATING_SYSTEM
+ * \notefnerr
+ *
+ * \sa 
+ * ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsMapResources,
+ * ::cuGraphicsSubResourceGetMappedArray,
+ * ::cudaGraphicsGLRegisterImage
+ */
+CUresult CUDAAPI cuGraphicsGLRegisterImage(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
+
+#ifdef _WIN32
+/**
+ * \brief Gets the CUDA device associated with hGpu
+ *
+ * Returns in \p *pDevice the CUDA device associated with a \p hGpu, if
+ * applicable.
+ *
+ * \param pDevice - Device associated with hGpu
+ * \param hGpu    - Handle to a GPU, as queried via ::WGL_NV_gpu_affinity()
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuGLMapBufferObject,
+ * ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
+ * ::cuGLUnregisterBufferObject, ::cuGLUnmapBufferObjectAsync,
+ * ::cuGLSetBufferObjectMapFlags,
+ * ::cudaWGLGetDevice
+ */
+CUresult CUDAAPI cuWGLGetDevice(CUdevice *pDevice, HGPUNV hGpu);
+#endif /* _WIN32 */
+
+/**
+ * CUDA devices corresponding to an OpenGL device
+ */
+typedef enum CUGLDeviceList_enum {
+    CU_GL_DEVICE_LIST_ALL            = 0x01, /**< The CUDA devices for all GPUs used by the current OpenGL context */
+    CU_GL_DEVICE_LIST_CURRENT_FRAME  = 0x02, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */
+    CU_GL_DEVICE_LIST_NEXT_FRAME     = 0x03, /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame */
+} CUGLDeviceList;
+
+/**
+ * \brief Gets the CUDA devices associated with the current OpenGL context
+ *
+ * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices 
+ * corresponding to the current OpenGL context. Also returns in \p *pCudaDevices 
+ * at most \p cudaDeviceCount of the CUDA-compatible devices corresponding to
+ * the current OpenGL context. If any of the GPUs being used by the current OpenGL
+ * context are not CUDA capable then the call will return ::CUDA_ERROR_NO_DEVICE.
+ *
+ * The \p deviceList argument may be any of the following:
+ * - ::CU_GL_DEVICE_LIST_ALL: Query all devices used by the current OpenGL context.
+ * - ::CU_GL_DEVICE_LIST_CURRENT_FRAME: Query the devices used by the current OpenGL context to
+ *   render the current frame (in SLI).
+ * - ::CU_GL_DEVICE_LIST_NEXT_FRAME: Query the devices used by the current OpenGL context to
+ *   render the next frame (in SLI). Note that this is a prediction, it can't be guaranteed that
+ *   this is correct in all cases.
+ *
+ * \param pCudaDeviceCount - Returned number of CUDA devices.
+ * \param pCudaDevices     - Returned CUDA devices.
+ * \param cudaDeviceCount  - The size of the output device array pCudaDevices.
+ * \param deviceList       - The set of devices to return.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NO_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_GRAPHICS_CONTEXT,
+ * ::CUDA_ERROR_OPERATING_SYSTEM
+ *
+ * \notefnerr
+ *
+ * \sa
+ * ::cuWGLGetDevice,
+ * ::cudaGLGetDevices
+ */
+CUresult CUDAAPI cuGLGetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
+
+/**
+ * \defgroup CUDA_GL_DEPRECATED OpenGL Interoperability [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated OpenGL interoperability functions of the low-level
+ * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes deprecated OpenGL interoperability functionality.
+ *
+ * @{
+ */
+
+/** Flags to map or unmap a resource */
+typedef enum CUGLmap_flags_enum {
+    CU_GL_MAP_RESOURCE_FLAGS_NONE          = 0x00, /**< No usage hints: resource is assumed to be read and written by CUDA kernels (default) */
+    CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY     = 0x01, /**< CUDA kernels accessing the resource will not write to it */
+    CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02,    /**< CUDA kernels will not read the resource and will overwrite its entire contents */
+} CUGLmap_flags;
+
+/**
+ * \brief Create a CUDA context for interoperability with OpenGL
+ *
+ * \deprecated This function is deprecated as of Cuda 5.0. 
+ *
+ * This function is deprecated and should no longer be used.  It is
+ * no longer necessary to associate a CUDA context with an OpenGL
+ * context in order to achieve maximum interoperability performance.
+ *
+ * \param pCtx   - Returned CUDA context
+ * \param Flags  - Options for CUDA context creation
+ * \param device - Device on which to create the context
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate, ::cuGLInit, ::cuGLMapBufferObject,
+ * ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
+ * ::cuGLUnregisterBufferObject, ::cuGLMapBufferObjectAsync,
+ * ::cuGLUnmapBufferObjectAsync, ::cuGLSetBufferObjectMapFlags,
+ * ::cuWGLGetDevice
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUdevice device );
+
+/**
+ * \brief Initializes OpenGL interoperability
+ *
+ * \deprecated This function is deprecated as of Cuda 3.0. 
+ *
+ * Initializes OpenGL interoperability. This function is deprecated
+ * and calling it is no longer required. It may fail if the needed
+ * OpenGL driver facilities are not available.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuGLMapBufferObject,
+ * ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
+ * ::cuGLUnregisterBufferObject, ::cuGLMapBufferObjectAsync,
+ * ::cuGLUnmapBufferObjectAsync, ::cuGLSetBufferObjectMapFlags,
+ * ::cuWGLGetDevice
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLInit(void);
+
+/**
+ * \brief Registers an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of Cuda 3.0. 
+ *
+ * Registers the buffer object specified by \p buffer for access by
+ * CUDA. This function must be called before CUDA can map the buffer
+ * object.  There must be a valid OpenGL context bound to the current
+ * thread when this function is called, and the buffer name is
+ * resolved by that context.
+ *
+ * \param buffer - The name of the buffer object to register.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_ALREADY_MAPPED
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsGLRegisterBuffer
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLRegisterBufferObject(GLuint buffer);
+
+/**
+ * \brief Maps an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of Cuda 3.0. 
+ *
+ * Maps the buffer object specified by \p buffer into the address space of the
+ * current CUDA context and returns in \p *dptr and \p *size the base pointer
+ * and size of the resulting mapping.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * All streams in the current CUDA context are synchronized with the
+ * current GL context.
+ *
+ * \param dptr   - Returned mapped base pointer
+ * \param size   - Returned size of mapping
+ * \param buffer - The name of the buffer object to map
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_MAP_FAILED
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsMapResources
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLMapBufferObject(CUdeviceptr *dptr, size_t *size,  GLuint buffer);  
+
+/**
+ * \brief Unmaps an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of Cuda 3.0. 
+ *
+ * Unmaps the buffer object specified by \p buffer for access by CUDA.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * All streams in the current CUDA context are synchronized with the
+ * current GL context.
+ *
+ * \param buffer - Buffer object to unmap
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsUnmapResources
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnmapBufferObject(GLuint buffer);
+
+/**
+ * \brief Unregister an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of Cuda 3.0. 
+ *
+ * Unregisters the buffer object specified by \p buffer.  This
+ * releases any resources associated with the registered buffer.
+ * After this call, the buffer may no longer be mapped for access by
+ * CUDA.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * \param buffer - Name of the buffer object to unregister
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsUnregisterResource
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnregisterBufferObject(GLuint buffer);
+
+/**
+ * \brief Set the map flags for an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of Cuda 3.0. 
+ *
+ * Sets the map flags for the buffer object specified by \p buffer.
+ *
+ * Changes to \p Flags will take effect the next time \p buffer is mapped.
+ * The \p Flags argument may be any of the following:
+ * - ::CU_GL_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA kernels. This is the default value.
+ * - ::CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA kernels which
+ *   access this resource will not write to this resource.
+ * - ::CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that CUDA kernels
+ *   which access this resource will not read from this resource and will
+ *   write over the entire contents of the resource, so none of the data
+ *   previously stored in the resource will be preserved.
+ *
+ * If \p buffer has not been registered for use with CUDA, then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p buffer is presently
+ * mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is returned.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * \param buffer - Buffer object whose map flags are to be set
+ * \param Flags  - Map flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsResourceSetMapFlags
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLSetBufferObjectMapFlags(GLuint buffer, unsigned int Flags);
+
+/**
+ * \brief Maps an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of Cuda 3.0. 
+ *
+ * Maps the buffer object specified by \p buffer into the address space of the
+ * current CUDA context and returns in \p *dptr and \p *size the base pointer
+ * and size of the resulting mapping.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * Stream \p hStream in the current CUDA context is synchronized with
+ * the current GL context.
+ *
+ * \param dptr    - Returned mapped base pointer
+ * \param size    - Returned size of mapping
+ * \param buffer  - The name of the buffer object to map
+ * \param hStream - Stream to synchronize
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_MAP_FAILED
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsMapResources
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLMapBufferObjectAsync(CUdeviceptr *dptr, size_t *size,  GLuint buffer, CUstream hStream);
+
+/**
+ * \brief Unmaps an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of Cuda 3.0. 
+ *
+ * Unmaps the buffer object specified by \p buffer for access by CUDA.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * Stream \p hStream in the current CUDA context is synchronized with
+ * the current GL context.
+ *
+ * \param buffer  - Name of the buffer object to unmap
+ * \param hStream - Stream to synchronize
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsUnmapResources
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnmapBufferObjectAsync(GLuint buffer, CUstream hStream);
+
+/** @} */ /* END CUDA_GL_DEPRECATED */
+/** @} */ /* END CUDA_GL */
+
+
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    #undef cuGLCtxCreate
+    #undef cuGLMapBufferObject
+    #undef cuGLMapBufferObjectAsync
+    #undef cuGLGetDevices
+    /* Aliases removed above, so each name below is declared exactly as spelled (no _v2/_ptds/_ptsz rewriting). */
+    CUresult CUDAAPI cuGLGetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
+    CUresult CUDAAPI cuGLMapBufferObject_v2(CUdeviceptr *dptr, size_t *size,  GLuint buffer);
+    CUresult CUDAAPI cuGLMapBufferObjectAsync_v2(CUdeviceptr *dptr, size_t *size,  GLuint buffer, CUstream hStream);
+    CUresult CUDAAPI cuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUdevice device );
+    CUresult CUDAAPI cuGLMapBufferObject(CUdeviceptr_v1 *dptr, unsigned int *size,  GLuint buffer); /* unsuffixed form: CUdeviceptr_v1 / unsigned int size */
+    CUresult CUDAAPI cuGLMapBufferObjectAsync(CUdeviceptr_v1 *dptr, unsigned int *size,  GLuint buffer, CUstream hStream); /* unsuffixed form: CUdeviceptr_v1 / unsigned int size */
+#endif /* __CUDA_API_VERSION_INTERNAL */
+
+#ifdef __cplusplus
+} /* extern "C" -- no trailing ';': a linkage-specification block is not terminated by a semicolon */
+#endif
+
+#undef __CUDA_DEPRECATED
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaGLTypedefs.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaGLTypedefs.h
new file mode 100644
index 0000000000000000000000000000000000000000..81f0d5349e435159647af9af379d1e8e8441221c
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaGLTypedefs.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2020-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAGLTYPEDEFS_H
+#define CUDAGLTYPEDEFS_H
+
+// Dependent includes for cudaGL.h
+#include <GL/gl.h>
+
+#include <cudaGL.h>
+
+#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
+    #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds /* -> <api>_v<ptds_version>_ptds */
+    #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz /* -> <api>_v<ptds_version>_ptsz */
+#else
+    #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version /* -> <api>_v<default_version> */
+    #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version /* -> <api>_v<default_version> */
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/*
+ * Macros for the latest version for each driver function in cudaGL.h
+ */
+#define PFN_cuGraphicsGLRegisterBuffer  PFN_cuGraphicsGLRegisterBuffer_v3000
+#define PFN_cuGraphicsGLRegisterImage  PFN_cuGraphicsGLRegisterImage_v3000
+#define PFN_cuWGLGetDevice  PFN_cuWGLGetDevice_v2020 /* NOTE: the target typedef is only declared under _WIN32 */
+#define PFN_cuGLGetDevices  PFN_cuGLGetDevices_v6050
+#define PFN_cuGLCtxCreate  PFN_cuGLCtxCreate_v3020
+#define PFN_cuGLInit  PFN_cuGLInit_v2000
+#define PFN_cuGLRegisterBufferObject  PFN_cuGLRegisterBufferObject_v2000
+#define PFN_cuGLMapBufferObject  __API_TYPEDEF_PTDS(PFN_cuGLMapBufferObject, 3020, 7000) /* _v3020, or _v7000_ptds with CUDA_API_PER_THREAD_DEFAULT_STREAM */
+#define PFN_cuGLUnmapBufferObject  PFN_cuGLUnmapBufferObject_v2000
+#define PFN_cuGLUnregisterBufferObject  PFN_cuGLUnregisterBufferObject_v2000
+#define PFN_cuGLSetBufferObjectMapFlags  PFN_cuGLSetBufferObjectMapFlags_v2030
+#define PFN_cuGLMapBufferObjectAsync  __API_TYPEDEF_PTSZ(PFN_cuGLMapBufferObjectAsync, 3020, 7000) /* _v3020, or _v7000_ptsz with CUDA_API_PER_THREAD_DEFAULT_STREAM */
+#define PFN_cuGLUnmapBufferObjectAsync  PFN_cuGLUnmapBufferObjectAsync_v2030
+
+
+/**
+ * Type definitions for functions defined in cudaGL.h
+ */
+typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterBuffer_v3000)(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterImage_v3000)(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
+#ifdef _WIN32
+typedef CUresult (CUDAAPI *PFN_cuWGLGetDevice_v2020)(CUdevice_v1 *pDevice, HGPUNV hGpu);
+#endif
+typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v6050)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
+typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v3020)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
+typedef CUresult (CUDAAPI *PFN_cuGLInit_v2000)(void);
+typedef CUresult (CUDAAPI *PFN_cuGLRegisterBufferObject_v2000)(GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v7000_ptds)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObject_v2000)(GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLUnregisterBufferObject_v2000)(GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLSetBufferObjectMapFlags_v2030)(GLuint buffer, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v7000_ptsz)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObjectAsync_v2030)(GLuint buffer, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
+
+/*
+ * Type definitions for older versioned functions in cudaGL.h
+ */
+#if defined(__CUDA_API_VERSION_INTERNAL)
+typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v4010)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v2000)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v2030)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v2000)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
+#endif
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // file guard
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaProfilerTypedefs.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaProfilerTypedefs.h
new file mode 100644
index 0000000000000000000000000000000000000000..bea7df4573aff2fa5b0d0029ce9d40a7ebe2de46
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaProfilerTypedefs.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2020-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAPROFILERTYPEDEFS_H
+#define CUDAPROFILERTYPEDEFS_H
+
+#include <cudaProfiler.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/*
+ * Macros selecting the latest version of each driver function in cudaProfiler.h: every PFN_<name> expands to a PFN_<name>_v<version> typedef declared below.
+ */
+#define PFN_cuProfilerInitialize  PFN_cuProfilerInitialize_v4000
+#define PFN_cuProfilerStart  PFN_cuProfilerStart_v4000
+#define PFN_cuProfilerStop  PFN_cuProfilerStop_v4000
+
+
+/**
+ * Versioned function-pointer type definitions (each returning CUresult) for the functions defined in cudaProfiler.h
+ */
+typedef CUresult (CUDAAPI *PFN_cuProfilerInitialize_v4000)(const char *configFile, const char *outputFile, CUoutput_mode outputMode);
+typedef CUresult (CUDAAPI *PFN_cuProfilerStart_v4000)(void);
+typedef CUresult (CUDAAPI *PFN_cuProfilerStop_v4000)(void);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // CUDAPROFILERTYPEDEFS_H
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaTypedefs.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaTypedefs.h
new file mode 100644
index 0000000000000000000000000000000000000000..4957e9a07cdb10141586620f9e0f4d36ede345be
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaTypedefs.h
@@ -0,0 +1,1144 @@
+/*
+ * Copyright 2020-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDATYPEDEFS_H
+#define CUDATYPEDEFS_H
+
+#include <cuda.h>
+
+#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) /* when defined: helpers paste api ## _v<ptds_version> plus a _ptds / _ptsz suffix */
+    #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds
+    #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz
+#else /* otherwise: both helpers paste api ## _v<default_version>; ptds_version is ignored */
+    #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version
+    #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version
+#endif /* CUDA_API_PER_THREAD_DEFAULT_STREAM */
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/*
+ * Macros for the latest version for each driver function in cuda.h
+ */
+#define PFN_cuGetErrorString  PFN_cuGetErrorString_v6000
+#define PFN_cuGetErrorName  PFN_cuGetErrorName_v6000
+#define PFN_cuInit  PFN_cuInit_v2000
+#define PFN_cuDriverGetVersion  PFN_cuDriverGetVersion_v2020
+#define PFN_cuDeviceGet  PFN_cuDeviceGet_v2000
+#define PFN_cuDeviceGetCount  PFN_cuDeviceGetCount_v2000
+#define PFN_cuDeviceGetName  PFN_cuDeviceGetName_v2000
+#define PFN_cuDeviceGetUuid  PFN_cuDeviceGetUuid_v11040
+#define PFN_cuDeviceGetLuid  PFN_cuDeviceGetLuid_v10000
+#define PFN_cuDeviceTotalMem  PFN_cuDeviceTotalMem_v3020
+#define PFN_cuDeviceGetTexture1DLinearMaxWidth  PFN_cuDeviceGetTexture1DLinearMaxWidth_v11010
+#define PFN_cuDeviceGetAttribute  PFN_cuDeviceGetAttribute_v2000
+#define PFN_cuDeviceGetNvSciSyncAttributes  PFN_cuDeviceGetNvSciSyncAttributes_v10020
+#define PFN_cuDeviceSetMemPool  PFN_cuDeviceSetMemPool_v11020
+#define PFN_cuDeviceGetMemPool  PFN_cuDeviceGetMemPool_v11020
+#define PFN_cuDeviceGetDefaultMemPool  PFN_cuDeviceGetDefaultMemPool_v11020
+#define PFN_cuDeviceGetProperties  PFN_cuDeviceGetProperties_v2000
+#define PFN_cuDeviceComputeCapability  PFN_cuDeviceComputeCapability_v2000
+#define PFN_cuDevicePrimaryCtxRetain  PFN_cuDevicePrimaryCtxRetain_v7000
+#define PFN_cuDevicePrimaryCtxRelease  PFN_cuDevicePrimaryCtxRelease_v11000
+#define PFN_cuDevicePrimaryCtxSetFlags  PFN_cuDevicePrimaryCtxSetFlags_v11000
+#define PFN_cuDevicePrimaryCtxGetState  PFN_cuDevicePrimaryCtxGetState_v7000
+#define PFN_cuDevicePrimaryCtxReset  PFN_cuDevicePrimaryCtxReset_v11000
+#define PFN_cuDeviceGetExecAffinitySupport  PFN_cuDeviceGetExecAffinitySupport_v11040
+#define PFN_cuCtxCreate  PFN_cuCtxCreate_v11040
+#define PFN_cuCtxGetId  PFN_cuCtxGetId_v12000
+#define PFN_cuCtxDestroy  PFN_cuCtxDestroy_v4000
+#define PFN_cuCtxPushCurrent  PFN_cuCtxPushCurrent_v4000
+#define PFN_cuCtxPopCurrent  PFN_cuCtxPopCurrent_v4000
+#define PFN_cuCtxSetCurrent  PFN_cuCtxSetCurrent_v4000
+#define PFN_cuCtxGetCurrent  PFN_cuCtxGetCurrent_v4000
+#define PFN_cuCtxGetDevice  PFN_cuCtxGetDevice_v2000
+#define PFN_cuCtxGetFlags  PFN_cuCtxGetFlags_v7000
+#define PFN_cuCtxSetFlags  PFN_cuCtxSetFlags_v12010
+#define PFN_cuCtxSynchronize  PFN_cuCtxSynchronize_v2000
+#define PFN_cuCtxSetLimit  PFN_cuCtxSetLimit_v3010
+#define PFN_cuCtxGetLimit  PFN_cuCtxGetLimit_v3010
+#define PFN_cuCtxGetCacheConfig  PFN_cuCtxGetCacheConfig_v3020
+#define PFN_cuCtxSetCacheConfig  PFN_cuCtxSetCacheConfig_v3020
+#define PFN_cuCtxGetSharedMemConfig  PFN_cuCtxGetSharedMemConfig_v4020
+#define PFN_cuCtxSetSharedMemConfig  PFN_cuCtxSetSharedMemConfig_v4020
+#define PFN_cuCtxGetApiVersion  PFN_cuCtxGetApiVersion_v3020
+#define PFN_cuCtxGetStreamPriorityRange  PFN_cuCtxGetStreamPriorityRange_v5050
+#define PFN_cuCtxResetPersistingL2Cache  PFN_cuCtxResetPersistingL2Cache_v11000
+#define PFN_cuCtxAttach  PFN_cuCtxAttach_v2000
+#define PFN_cuCtxDetach  PFN_cuCtxDetach_v2000
+#define PFN_cuCtxGetExecAffinity  PFN_cuCtxGetExecAffinity_v11040
+#define PFN_cuModuleLoad  PFN_cuModuleLoad_v2000
+#define PFN_cuModuleLoadData  PFN_cuModuleLoadData_v2000
+#define PFN_cuModuleLoadDataEx  PFN_cuModuleLoadDataEx_v2010
+#define PFN_cuModuleLoadFatBinary  PFN_cuModuleLoadFatBinary_v2000
+#define PFN_cuModuleUnload  PFN_cuModuleUnload_v2000
+#define PFN_cuModuleGetFunction  PFN_cuModuleGetFunction_v2000
+#define PFN_cuModuleGetGlobal  PFN_cuModuleGetGlobal_v3020
+#define PFN_cuModuleGetTexRef  PFN_cuModuleGetTexRef_v2000
+#define PFN_cuModuleGetSurfRef  PFN_cuModuleGetSurfRef_v3000
+#define PFN_cuModuleGetFunctionCount PFN_cuModuleGetFunctionCount_v12040
+#define PFN_cuModuleEnumerateFunctions PFN_cuModuleEnumerateFunctions_v12040
+#define PFN_cuLinkCreate  PFN_cuLinkCreate_v6050
+#define PFN_cuLinkAddData  PFN_cuLinkAddData_v6050
+#define PFN_cuLinkAddFile  PFN_cuLinkAddFile_v6050
+#define PFN_cuLinkComplete  PFN_cuLinkComplete_v5050
+#define PFN_cuLinkDestroy  PFN_cuLinkDestroy_v5050
+#define PFN_cuMemGetInfo  PFN_cuMemGetInfo_v3020
+#define PFN_cuMemAlloc  PFN_cuMemAlloc_v3020
+#define PFN_cuMemAllocPitch  PFN_cuMemAllocPitch_v3020
+#define PFN_cuMemFree  PFN_cuMemFree_v3020
+#define PFN_cuMemGetAddressRange  PFN_cuMemGetAddressRange_v3020
+#define PFN_cuMemAllocHost  PFN_cuMemAllocHost_v3020
+#define PFN_cuMemFreeHost  PFN_cuMemFreeHost_v2000
+#define PFN_cuMemHostAlloc  PFN_cuMemHostAlloc_v2020
+#define PFN_cuMemHostGetDevicePointer  PFN_cuMemHostGetDevicePointer_v3020
+#define PFN_cuMemHostGetFlags  PFN_cuMemHostGetFlags_v2030
+#define PFN_cuMemAllocManaged  PFN_cuMemAllocManaged_v6000
+#define PFN_cuDeviceGetByPCIBusId  PFN_cuDeviceGetByPCIBusId_v4010
+#define PFN_cuDeviceGetPCIBusId  PFN_cuDeviceGetPCIBusId_v4010
+#define PFN_cuIpcGetEventHandle  PFN_cuIpcGetEventHandle_v4010
+#define PFN_cuIpcOpenEventHandle  PFN_cuIpcOpenEventHandle_v4010
+#define PFN_cuIpcGetMemHandle  PFN_cuIpcGetMemHandle_v4010
+#define PFN_cuIpcOpenMemHandle  PFN_cuIpcOpenMemHandle_v11000
+#define PFN_cuIpcCloseMemHandle  PFN_cuIpcCloseMemHandle_v4010
+#define PFN_cuMemHostRegister  PFN_cuMemHostRegister_v6050
+#define PFN_cuMemHostUnregister  PFN_cuMemHostUnregister_v4000
+#define PFN_cuMemcpy  __API_TYPEDEF_PTDS(PFN_cuMemcpy, 4000, 7000)
+#define PFN_cuMemcpyPeer  __API_TYPEDEF_PTDS(PFN_cuMemcpyPeer, 4000, 7000)
+#define PFN_cuMemcpyHtoD  __API_TYPEDEF_PTDS(PFN_cuMemcpyHtoD, 3020, 7000)
+#define PFN_cuMemcpyDtoH  __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoH, 3020, 7000)
+#define PFN_cuMemcpyDtoD  __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoD, 3020, 7000)
+#define PFN_cuMemcpyDtoA  __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoA, 3020, 7000)
+#define PFN_cuMemcpyAtoD  __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoD, 3020, 7000)
+#define PFN_cuMemcpyHtoA  __API_TYPEDEF_PTDS(PFN_cuMemcpyHtoA, 3020, 7000)
+#define PFN_cuMemcpyAtoH  __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoH, 3020, 7000)
+#define PFN_cuMemcpyAtoA  __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoA, 3020, 7000)
+#define PFN_cuMemcpy2D  __API_TYPEDEF_PTDS(PFN_cuMemcpy2D, 3020, 7000)
+#define PFN_cuMemcpy2DUnaligned  __API_TYPEDEF_PTDS(PFN_cuMemcpy2DUnaligned, 3020, 7000)
+#define PFN_cuMemcpy3D  __API_TYPEDEF_PTDS(PFN_cuMemcpy3D, 3020, 7000)
+#define PFN_cuMemcpy3DPeer  __API_TYPEDEF_PTDS(PFN_cuMemcpy3DPeer, 4000, 7000)
+#define PFN_cuMemcpyAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyAsync, 4000, 7000)
+#define PFN_cuMemcpyPeerAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyPeerAsync, 4000, 7000)
+#define PFN_cuMemcpyHtoDAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyHtoDAsync, 3020, 7000)
+#define PFN_cuMemcpyDtoHAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyDtoHAsync, 3020, 7000)
+#define PFN_cuMemcpyDtoDAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyDtoDAsync, 3020, 7000)
+#define PFN_cuMemcpyHtoAAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyHtoAAsync, 3020, 7000)
+#define PFN_cuMemcpyAtoHAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyAtoHAsync, 3020, 7000)
+#define PFN_cuMemcpy2DAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpy2DAsync, 3020, 7000)
+#define PFN_cuMemcpy3DAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpy3DAsync, 3020, 7000)
+#define PFN_cuMemcpy3DPeerAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpy3DPeerAsync, 4000, 7000)
+#define PFN_cuMemcpyBatchAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyBatchAsync, 12080, 12080)
+#define PFN_cuMemcpy3DBatchAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpy3DBatchAsync, 12080, 12080)
+#define PFN_cuMemsetD8  __API_TYPEDEF_PTDS(PFN_cuMemsetD8, 3020, 7000)
+#define PFN_cuMemsetD16  __API_TYPEDEF_PTDS(PFN_cuMemsetD16, 3020, 7000)
+#define PFN_cuMemsetD32  __API_TYPEDEF_PTDS(PFN_cuMemsetD32, 3020, 7000)
+#define PFN_cuMemsetD2D8  __API_TYPEDEF_PTDS(PFN_cuMemsetD2D8, 3020, 7000)
+#define PFN_cuMemsetD2D16  __API_TYPEDEF_PTDS(PFN_cuMemsetD2D16, 3020, 7000)
+#define PFN_cuMemsetD2D32  __API_TYPEDEF_PTDS(PFN_cuMemsetD2D32, 3020, 7000)
+#define PFN_cuMemsetD8Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD8Async, 3020, 7000)
+#define PFN_cuMemsetD16Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD16Async, 3020, 7000)
+#define PFN_cuMemsetD32Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD32Async, 3020, 7000)
+#define PFN_cuMemsetD2D8Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D8Async, 3020, 7000)
+#define PFN_cuMemsetD2D16Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D16Async, 3020, 7000)
+#define PFN_cuMemsetD2D32Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D32Async, 3020, 7000)
+#define PFN_cuArrayCreate  PFN_cuArrayCreate_v3020
+#define PFN_cuArrayGetDescriptor  PFN_cuArrayGetDescriptor_v3020
+#define PFN_cuArrayGetSparseProperties  PFN_cuArrayGetSparseProperties_v11010
+#define PFN_cuMipmappedArrayGetSparseProperties  PFN_cuMipmappedArrayGetSparseProperties_v11010
+#define PFN_cuArrayGetMemoryRequirements  PFN_cuArrayGetMemoryRequirements_v11060
+#define PFN_cuMipmappedArrayGetMemoryRequirements  PFN_cuMipmappedArrayGetMemoryRequirements_v11060
+#define PFN_cuArrayGetPlane  PFN_cuArrayGetPlane_v11020
+#define PFN_cuArrayDestroy  PFN_cuArrayDestroy_v2000
+#define PFN_cuArray3DCreate  PFN_cuArray3DCreate_v3020
+#define PFN_cuArray3DGetDescriptor  PFN_cuArray3DGetDescriptor_v3020
+#define PFN_cuMipmappedArrayCreate  PFN_cuMipmappedArrayCreate_v5000
+#define PFN_cuMipmappedArrayGetLevel  PFN_cuMipmappedArrayGetLevel_v5000
+#define PFN_cuMipmappedArrayDestroy  PFN_cuMipmappedArrayDestroy_v5000
+#define PFN_cuMemAddressReserve  PFN_cuMemAddressReserve_v10020
+#define PFN_cuMemAddressFree  PFN_cuMemAddressFree_v10020
+#define PFN_cuMemCreate  PFN_cuMemCreate_v10020
+#define PFN_cuMemRelease  PFN_cuMemRelease_v10020
+#define PFN_cuMemMap  PFN_cuMemMap_v10020
+#define PFN_cuMemMapArrayAsync  __API_TYPEDEF_PTSZ(PFN_cuMemMapArrayAsync, 11010, 11010)
+#define PFN_cuMemUnmap  PFN_cuMemUnmap_v10020
+#define PFN_cuMemSetAccess  PFN_cuMemSetAccess_v10020
+#define PFN_cuMemGetAccess  PFN_cuMemGetAccess_v10020
+#define PFN_cuMemExportToShareableHandle  PFN_cuMemExportToShareableHandle_v10020
+#define PFN_cuMemImportFromShareableHandle  PFN_cuMemImportFromShareableHandle_v10020
+#define PFN_cuMemGetAllocationGranularity  PFN_cuMemGetAllocationGranularity_v10020
+#define PFN_cuMemGetAllocationPropertiesFromHandle  PFN_cuMemGetAllocationPropertiesFromHandle_v10020
+#define PFN_cuMemRetainAllocationHandle  PFN_cuMemRetainAllocationHandle_v11000
+#define PFN_cuMemFreeAsync  __API_TYPEDEF_PTSZ(PFN_cuMemFreeAsync, 11020, 11020)
+#define PFN_cuMemAllocAsync  __API_TYPEDEF_PTSZ(PFN_cuMemAllocAsync, 11020, 11020)
+#define PFN_cuMemPoolTrimTo  PFN_cuMemPoolTrimTo_v11020
+#define PFN_cuMemPoolSetAttribute  PFN_cuMemPoolSetAttribute_v11020
+#define PFN_cuMemPoolGetAttribute  PFN_cuMemPoolGetAttribute_v11020
+#define PFN_cuMemPoolSetAccess  PFN_cuMemPoolSetAccess_v11020
+#define PFN_cuMemPoolGetAccess  PFN_cuMemPoolGetAccess_v11020
+#define PFN_cuMemPoolCreate  PFN_cuMemPoolCreate_v11020
+#define PFN_cuMemPoolDestroy  PFN_cuMemPoolDestroy_v11020
+#define PFN_cuMemAllocFromPoolAsync  __API_TYPEDEF_PTSZ(PFN_cuMemAllocFromPoolAsync, 11020, 11020)
+#define PFN_cuMemPoolExportToShareableHandle  PFN_cuMemPoolExportToShareableHandle_v11020
+#define PFN_cuMemPoolImportFromShareableHandle  PFN_cuMemPoolImportFromShareableHandle_v11020
+#define PFN_cuMemPoolExportPointer  PFN_cuMemPoolExportPointer_v11020
+#define PFN_cuMemPoolImportPointer  PFN_cuMemPoolImportPointer_v11020
+#define PFN_cuPointerGetAttribute  PFN_cuPointerGetAttribute_v4000
+#define PFN_cuMemPrefetchAsync  __API_TYPEDEF_PTSZ(PFN_cuMemPrefetchAsync, 8000, 8000)
+#define PFN_cuMemAdvise  PFN_cuMemAdvise_v8000
+#define PFN_cuMemAdvise_v2  PFN_cuMemAdvise_v12020
+#define PFN_cuMemPrefetchAsync_v2  __API_TYPEDEF_PTSZ(PFN_cuMemPrefetchAsync, 12020, 12020)
+#define PFN_cuMemRangeGetAttribute  PFN_cuMemRangeGetAttribute_v8000
+#define PFN_cuMemRangeGetAttributes  PFN_cuMemRangeGetAttributes_v8000
+#define PFN_cuMulticastCreate  PFN_cuMulticastCreate_v12010
+#define PFN_cuMulticastAddDevice  PFN_cuMulticastAddDevice_v12010
+#define PFN_cuMulticastBindMem  PFN_cuMulticastBindMem_v12010
+#define PFN_cuMulticastBindAddr  PFN_cuMulticastBindAddr_v12010
+#define PFN_cuMulticastUnbind  PFN_cuMulticastUnbind_v12010
+#define PFN_cuMulticastGetGranularity  PFN_cuMulticastGetGranularity_v12010
+#define PFN_cuPointerSetAttribute  PFN_cuPointerSetAttribute_v6000
+#define PFN_cuPointerGetAttributes  PFN_cuPointerGetAttributes_v7000
+#define PFN_cuStreamCreate  PFN_cuStreamCreate_v2000
+#define PFN_cuStreamCreateWithPriority  PFN_cuStreamCreateWithPriority_v5050
+#define PFN_cuStreamGetId  __API_TYPEDEF_PTSZ(PFN_cuStreamGetId, 12000, 12000) /* pass the unversioned name: the helper appends _v<version> itself */
+#define PFN_cuStreamGetPriority  __API_TYPEDEF_PTSZ(PFN_cuStreamGetPriority, 5050, 7000)
+#define PFN_cuStreamGetFlags  __API_TYPEDEF_PTSZ(PFN_cuStreamGetFlags, 5050, 7000)
+#define PFN_cuStreamGetDevice __API_TYPEDEF_PTSZ(PFN_cuStreamGetDevice, 12080, 12080)
+#define PFN_cuStreamGetCtx  __API_TYPEDEF_PTSZ(PFN_cuStreamGetCtx, 9020, 9020)
+#define PFN_cuStreamWaitEvent  __API_TYPEDEF_PTSZ(PFN_cuStreamWaitEvent, 3020, 7000)
+#define PFN_cuStreamAddCallback  __API_TYPEDEF_PTSZ(PFN_cuStreamAddCallback, 5000, 7000)
+#define PFN_cuStreamBeginCapture  __API_TYPEDEF_PTSZ(PFN_cuStreamBeginCapture, 10010, 10010)
+#define PFN_cuStreamBeginCaptureToGraph  __API_TYPEDEF_PTSZ(PFN_cuStreamBeginCaptureToGraph, 12030, 12030)
+#define PFN_cuThreadExchangeStreamCaptureMode  PFN_cuThreadExchangeStreamCaptureMode_v10010
+#define PFN_cuStreamEndCapture  __API_TYPEDEF_PTSZ(PFN_cuStreamEndCapture, 10000, 10000)
+#define PFN_cuStreamIsCapturing  __API_TYPEDEF_PTSZ(PFN_cuStreamIsCapturing, 10000, 10000)
+#define PFN_cuStreamGetCaptureInfo  __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 10010, 10010)
+#define PFN_cuStreamGetCaptureInfo_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 11030, 11030)
+#define PFN_cuStreamGetCaptureInfo_v3  __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 12030, 12030)
+#define PFN_cuStreamUpdateCaptureDependencies  __API_TYPEDEF_PTSZ(PFN_cuStreamUpdateCaptureDependencies, 11030, 11030)
+#define PFN_cuStreamUpdateCaptureDependencies_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamUpdateCaptureDependencies, 12030, 12030)
+#define PFN_cuStreamAttachMemAsync  __API_TYPEDEF_PTSZ(PFN_cuStreamAttachMemAsync, 6000, 7000)
+#define PFN_cuStreamQuery  __API_TYPEDEF_PTSZ(PFN_cuStreamQuery, 2000, 7000)
+#define PFN_cuStreamSynchronize  __API_TYPEDEF_PTSZ(PFN_cuStreamSynchronize, 2000, 7000)
+#define PFN_cuStreamDestroy  PFN_cuStreamDestroy_v4000
+#define PFN_cuStreamCopyAttributes  __API_TYPEDEF_PTSZ(PFN_cuStreamCopyAttributes, 11000, 11000)
+#define PFN_cuStreamGetAttribute  __API_TYPEDEF_PTSZ(PFN_cuStreamGetAttribute, 11000, 11000)
+#define PFN_cuStreamSetAttribute  __API_TYPEDEF_PTSZ(PFN_cuStreamSetAttribute, 11000, 11000)
+#define PFN_cuEventCreate  PFN_cuEventCreate_v2000
+#define PFN_cuEventRecord  __API_TYPEDEF_PTSZ(PFN_cuEventRecord, 2000, 7000)
+#define PFN_cuEventRecordWithFlags  __API_TYPEDEF_PTSZ(PFN_cuEventRecordWithFlags, 11010, 11010)
+#define PFN_cuEventQuery  PFN_cuEventQuery_v2000
+#define PFN_cuEventSynchronize  PFN_cuEventSynchronize_v2000
+#define PFN_cuEventDestroy  PFN_cuEventDestroy_v4000
+#define PFN_cuEventElapsedTime  PFN_cuEventElapsedTime_v2000
+#define PFN_cuEventElapsedTime_v2  PFN_cuEventElapsedTime_v12080
+#define PFN_cuImportExternalMemory  PFN_cuImportExternalMemory_v10000
+#define PFN_cuExternalMemoryGetMappedBuffer  PFN_cuExternalMemoryGetMappedBuffer_v10000
+#define PFN_cuExternalMemoryGetMappedMipmappedArray  PFN_cuExternalMemoryGetMappedMipmappedArray_v10000
+#define PFN_cuDestroyExternalMemory  PFN_cuDestroyExternalMemory_v10000
+#define PFN_cuImportExternalSemaphore  PFN_cuImportExternalSemaphore_v10000
+#define PFN_cuSignalExternalSemaphoresAsync  __API_TYPEDEF_PTSZ(PFN_cuSignalExternalSemaphoresAsync, 10000, 10000)
+#define PFN_cuWaitExternalSemaphoresAsync  __API_TYPEDEF_PTSZ(PFN_cuWaitExternalSemaphoresAsync, 10000, 10000)
+#define PFN_cuDestroyExternalSemaphore  PFN_cuDestroyExternalSemaphore_v10000
+#define PFN_cuStreamWaitValue32  __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue32, 8000, 8000)
+#define PFN_cuStreamWaitValue64  __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue64, 9000, 9000)
+#define PFN_cuStreamWriteValue32  __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue32, 8000, 8000)
+#define PFN_cuStreamWriteValue64  __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue64, 9000, 9000)
+#define PFN_cuStreamBatchMemOp  __API_TYPEDEF_PTSZ(PFN_cuStreamBatchMemOp, 8000, 8000)
+#define PFN_cuStreamWaitValue32_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue32, 11070, 11070)
+#define PFN_cuStreamWaitValue64_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue64, 11070, 11070)
+#define PFN_cuStreamWriteValue32_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue32, 11070, 11070)
+#define PFN_cuStreamWriteValue64_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue64, 11070, 11070)
+#define PFN_cuStreamBatchMemOp_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamBatchMemOp, 11070, 11070)
+#define PFN_cuFuncGetAttribute  PFN_cuFuncGetAttribute_v2020
+#define PFN_cuFuncSetAttribute  PFN_cuFuncSetAttribute_v9000
+#define PFN_cuFuncSetCacheConfig  PFN_cuFuncSetCacheConfig_v3000
+#define PFN_cuFuncSetSharedMemConfig  PFN_cuFuncSetSharedMemConfig_v4020
+#define PFN_cuFuncGetName  PFN_cuFuncGetName_v12030
+#define PFN_cuFuncGetParamInfo  PFN_cuFuncGetParamInfo_v12040
+#define PFN_cuFuncIsLoaded PFN_cuFuncIsLoaded_v12040
+#define PFN_cuFuncLoad PFN_cuFuncLoad_v12040
+#define PFN_cuLaunchKernel  __API_TYPEDEF_PTSZ(PFN_cuLaunchKernel, 4000, 7000)
+#define PFN_cuLaunchKernelEx __API_TYPEDEF_PTSZ(PFN_cuLaunchKernelEx, 11060, 11060)
+#define PFN_cuLaunchCooperativeKernel  __API_TYPEDEF_PTSZ(PFN_cuLaunchCooperativeKernel, 9000, 9000)
+#define PFN_cuLaunchCooperativeKernelMultiDevice  PFN_cuLaunchCooperativeKernelMultiDevice_v9000
+#define PFN_cuLaunchHostFunc  __API_TYPEDEF_PTSZ(PFN_cuLaunchHostFunc, 10000, 10000)
+#define PFN_cuFuncSetBlockShape  PFN_cuFuncSetBlockShape_v2000
+#define PFN_cuFuncSetSharedSize  PFN_cuFuncSetSharedSize_v2000
+#define PFN_cuParamSetSize  PFN_cuParamSetSize_v2000
+#define PFN_cuParamSeti  PFN_cuParamSeti_v2000
+#define PFN_cuParamSetf  PFN_cuParamSetf_v2000
+#define PFN_cuParamSetv  PFN_cuParamSetv_v2000
+#define PFN_cuLaunch  PFN_cuLaunch_v2000
+#define PFN_cuLaunchGrid  PFN_cuLaunchGrid_v2000
+#define PFN_cuLaunchGridAsync  PFN_cuLaunchGridAsync_v2000
+#define PFN_cuParamSetTexRef  PFN_cuParamSetTexRef_v2000
+#define PFN_cuGraphCreate  PFN_cuGraphCreate_v10000
+#define PFN_cuGraphAddKernelNode  PFN_cuGraphAddKernelNode_v12000
+#define PFN_cuGraphKernelNodeGetParams  PFN_cuGraphKernelNodeGetParams_v12000
+#define PFN_cuGraphKernelNodeSetParams  PFN_cuGraphKernelNodeSetParams_v12000
+#define PFN_cuGraphAddMemcpyNode  PFN_cuGraphAddMemcpyNode_v10000
+#define PFN_cuGraphMemcpyNodeGetParams  PFN_cuGraphMemcpyNodeGetParams_v10000
+#define PFN_cuGraphMemcpyNodeSetParams  PFN_cuGraphMemcpyNodeSetParams_v10000
+#define PFN_cuGraphAddMemsetNode  PFN_cuGraphAddMemsetNode_v10000
+#define PFN_cuGraphMemsetNodeGetParams  PFN_cuGraphMemsetNodeGetParams_v10000
+#define PFN_cuGraphMemsetNodeSetParams  PFN_cuGraphMemsetNodeSetParams_v10000
+#define PFN_cuGraphAddHostNode  PFN_cuGraphAddHostNode_v10000
+#define PFN_cuGraphHostNodeGetParams  PFN_cuGraphHostNodeGetParams_v10000
+#define PFN_cuGraphHostNodeSetParams  PFN_cuGraphHostNodeSetParams_v10000
+#define PFN_cuGraphAddChildGraphNode  PFN_cuGraphAddChildGraphNode_v10000
+#define PFN_cuGraphChildGraphNodeGetGraph  PFN_cuGraphChildGraphNodeGetGraph_v10000
+#define PFN_cuGraphAddEmptyNode  PFN_cuGraphAddEmptyNode_v10000
+#define PFN_cuGraphAddEventRecordNode  PFN_cuGraphAddEventRecordNode_v11010
+#define PFN_cuGraphEventRecordNodeGetEvent  PFN_cuGraphEventRecordNodeGetEvent_v11010
+#define PFN_cuGraphEventRecordNodeSetEvent  PFN_cuGraphEventRecordNodeSetEvent_v11010
+#define PFN_cuGraphAddEventWaitNode  PFN_cuGraphAddEventWaitNode_v11010
+#define PFN_cuGraphEventWaitNodeGetEvent  PFN_cuGraphEventWaitNodeGetEvent_v11010
+#define PFN_cuGraphEventWaitNodeSetEvent  PFN_cuGraphEventWaitNodeSetEvent_v11010
+#define PFN_cuGraphAddExternalSemaphoresSignalNode  PFN_cuGraphAddExternalSemaphoresSignalNode_v11020
+#define PFN_cuGraphExternalSemaphoresSignalNodeGetParams  PFN_cuGraphExternalSemaphoresSignalNodeGetParams_v11020
+#define PFN_cuGraphExternalSemaphoresSignalNodeSetParams  PFN_cuGraphExternalSemaphoresSignalNodeSetParams_v11020
+#define PFN_cuGraphAddExternalSemaphoresWaitNode  PFN_cuGraphAddExternalSemaphoresWaitNode_v11020
+#define PFN_cuGraphExternalSemaphoresWaitNodeGetParams  PFN_cuGraphExternalSemaphoresWaitNodeGetParams_v11020
+#define PFN_cuGraphExternalSemaphoresWaitNodeSetParams  PFN_cuGraphExternalSemaphoresWaitNodeSetParams_v11020
+#define PFN_cuGraphAddBatchMemOpNode PFN_cuGraphAddBatchMemOpNode_v11070
+#define PFN_cuGraphBatchMemOpNodeGetParams PFN_cuGraphBatchMemOpNodeGetParams_v11070
+#define PFN_cuGraphBatchMemOpNodeSetParams PFN_cuGraphBatchMemOpNodeSetParams_v11070
+#define PFN_cuGraphExecBatchMemOpNodeSetParams PFN_cuGraphExecBatchMemOpNodeSetParams_v11070
+#define PFN_cuGraphClone  PFN_cuGraphClone_v10000
+#define PFN_cuGraphNodeFindInClone  PFN_cuGraphNodeFindInClone_v10000
+#define PFN_cuGraphNodeGetType  PFN_cuGraphNodeGetType_v10000
+#define PFN_cuGraphGetNodes  PFN_cuGraphGetNodes_v10000
+#define PFN_cuGraphGetRootNodes  PFN_cuGraphGetRootNodes_v10000
+#define PFN_cuGraphGetEdges  PFN_cuGraphGetEdges_v12030
+#define PFN_cuGraphNodeGetDependencies  PFN_cuGraphNodeGetDependencies_v12030
+#define PFN_cuGraphNodeGetDependentNodes  PFN_cuGraphNodeGetDependentNodes_v12030
+#define PFN_cuGraphAddDependencies  PFN_cuGraphAddDependencies_v12030
+#define PFN_cuGraphRemoveDependencies  PFN_cuGraphRemoveDependencies_v12030
+#define PFN_cuGraphDestroyNode  PFN_cuGraphDestroyNode_v10000
+
+#define PFN_cuGraphInstantiate  PFN_cuGraphInstantiateWithFlags_v11040
+
+#define PFN_cuGraphInstantiateWithFlags  PFN_cuGraphInstantiateWithFlags_v11040
+#define PFN_cuGraphInstantiateWithParams  __API_TYPEDEF_PTSZ(PFN_cuGraphInstantiateWithParams, 12000, 12000)
+#define PFN_cuGraphExecGetFlags  PFN_cuGraphExecGetFlags_v12000
+#define PFN_cuGraphExecKernelNodeSetParams  PFN_cuGraphExecKernelNodeSetParams_v12000
+#define PFN_cuGraphExecMemcpyNodeSetParams  PFN_cuGraphExecMemcpyNodeSetParams_v10020
+#define PFN_cuGraphExecMemsetNodeSetParams  PFN_cuGraphExecMemsetNodeSetParams_v10020
+#define PFN_cuGraphExecHostNodeSetParams  PFN_cuGraphExecHostNodeSetParams_v10020
+#define PFN_cuGraphExecChildGraphNodeSetParams  PFN_cuGraphExecChildGraphNodeSetParams_v11010
+#define PFN_cuGraphExecEventRecordNodeSetEvent  PFN_cuGraphExecEventRecordNodeSetEvent_v11010
+#define PFN_cuGraphExecEventWaitNodeSetEvent  PFN_cuGraphExecEventWaitNodeSetEvent_v11010
+#define PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams  PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams_v11020
+#define PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams  PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams_v11020
+#define PFN_cuGraphUpload  __API_TYPEDEF_PTSZ(PFN_cuGraphUpload, 11010, 11010)
+#define PFN_cuGraphLaunch  __API_TYPEDEF_PTSZ(PFN_cuGraphLaunch, 10000, 10000)
+#define PFN_cuGraphExecDestroy  PFN_cuGraphExecDestroy_v10000
+#define PFN_cuGraphDestroy  PFN_cuGraphDestroy_v10000
+#define PFN_cuGraphExecUpdate  PFN_cuGraphExecUpdate_v12000
+#define PFN_cuGraphKernelNodeCopyAttributes  PFN_cuGraphKernelNodeCopyAttributes_v11000
+#define PFN_cuGraphKernelNodeGetAttribute  PFN_cuGraphKernelNodeGetAttribute_v11000
+#define PFN_cuGraphKernelNodeSetAttribute  PFN_cuGraphKernelNodeSetAttribute_v11000
+#define PFN_cuGraphDebugDotPrint  PFN_cuGraphDebugDotPrint_v11030
+#define PFN_cuGraphAddMemAllocNode  PFN_cuGraphAddMemAllocNode_v11040
+#define PFN_cuGraphMemAllocNodeGetParams PFN_cuGraphMemAllocNodeGetParams_v11040
+#define PFN_cuGraphAddMemFreeNode  PFN_cuGraphAddMemFreeNode_v11040
+#define PFN_cuGraphMemFreeNodeGetParams PFN_cuGraphMemFreeNodeGetParams_v11040
+#define PFN_cuGraphNodeSetEnabled PFN_cuGraphNodeSetEnabled_v11060
+#define PFN_cuGraphNodeGetEnabled PFN_cuGraphNodeGetEnabled_v11060
+#define PFN_cuGraphAddNode PFN_cuGraphAddNode_v12030
+#define PFN_cuGraphNodeSetParams PFN_cuGraphNodeSetParams_v12020
+#define PFN_cuGraphExecNodeSetParams PFN_cuGraphExecNodeSetParams_v12020
+#define PFN_GraphConditionalHandleCreate PFN_cuGraphConditionalHandleCreate_v12030 /* NOTE(review): alias name lacks the "cu" prefix every sibling alias has (expected PFN_cuGraphConditionalHandleCreate) - confirm upstream before renaming */
+#define PFN_cuDeviceGraphMemTrim  PFN_cuDeviceGraphMemTrim_v11040
+#define PFN_cuDeviceGetGraphMemAttribute  PFN_cuDeviceGetGraphMemAttribute_v11040
+#define PFN_cuDeviceSetGraphMemAttribute  PFN_cuDeviceSetGraphMemAttribute_v11040
+#define PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor  PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor_v6050 /* cuOccupancy* aliases (through PFN_cuOccupancyMaxActiveClusters): unversioned name -> versioned name declared outside this excerpt. */
+#define PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags  PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000
+#define PFN_cuOccupancyMaxPotentialBlockSize  PFN_cuOccupancyMaxPotentialBlockSize_v6050
+#define PFN_cuOccupancyMaxPotentialBlockSizeWithFlags  PFN_cuOccupancyMaxPotentialBlockSizeWithFlags_v7000
+#define PFN_cuOccupancyAvailableDynamicSMemPerBlock  PFN_cuOccupancyAvailableDynamicSMemPerBlock_v10020
+#define PFN_cuOccupancyMaxPotentialClusterSize  PFN_cuOccupancyMaxPotentialClusterSize_v11070
+#define PFN_cuOccupancyMaxActiveClusters  PFN_cuOccupancyMaxActiveClusters_v11070
+#define PFN_cuTexRefSetArray  PFN_cuTexRefSetArray_v2000 /* cuTexRef* / cuSurfRef* aliases (through PFN_cuSurfRefGetArray): unversioned name -> versioned name declared outside this excerpt. */
+#define PFN_cuTexRefSetMipmappedArray  PFN_cuTexRefSetMipmappedArray_v5000
+#define PFN_cuTexRefSetAddress  PFN_cuTexRefSetAddress_v3020
+#define PFN_cuTexRefSetAddress2D  PFN_cuTexRefSetAddress2D_v4010
+#define PFN_cuTexRefSetFormat  PFN_cuTexRefSetFormat_v2000
+#define PFN_cuTexRefSetAddressMode  PFN_cuTexRefSetAddressMode_v2000
+#define PFN_cuTexRefSetFilterMode  PFN_cuTexRefSetFilterMode_v2000
+#define PFN_cuTexRefSetMipmapFilterMode  PFN_cuTexRefSetMipmapFilterMode_v5000
+#define PFN_cuTexRefSetMipmapLevelBias  PFN_cuTexRefSetMipmapLevelBias_v5000
+#define PFN_cuTexRefSetMipmapLevelClamp  PFN_cuTexRefSetMipmapLevelClamp_v5000
+#define PFN_cuTexRefSetMaxAnisotropy  PFN_cuTexRefSetMaxAnisotropy_v5000
+#define PFN_cuTexRefSetBorderColor  PFN_cuTexRefSetBorderColor_v8000
+#define PFN_cuTexRefSetFlags  PFN_cuTexRefSetFlags_v2000
+#define PFN_cuTexRefGetAddress  PFN_cuTexRefGetAddress_v3020
+#define PFN_cuTexRefGetArray  PFN_cuTexRefGetArray_v2000
+#define PFN_cuTexRefGetMipmappedArray  PFN_cuTexRefGetMipmappedArray_v5000
+#define PFN_cuTexRefGetAddressMode  PFN_cuTexRefGetAddressMode_v2000
+#define PFN_cuTexRefGetFilterMode  PFN_cuTexRefGetFilterMode_v2000
+#define PFN_cuTexRefGetFormat  PFN_cuTexRefGetFormat_v2000
+#define PFN_cuTexRefGetMipmapFilterMode  PFN_cuTexRefGetMipmapFilterMode_v5000
+#define PFN_cuTexRefGetMipmapLevelBias  PFN_cuTexRefGetMipmapLevelBias_v5000
+#define PFN_cuTexRefGetMipmapLevelClamp  PFN_cuTexRefGetMipmapLevelClamp_v5000
+#define PFN_cuTexRefGetMaxAnisotropy  PFN_cuTexRefGetMaxAnisotropy_v5000
+#define PFN_cuTexRefGetBorderColor  PFN_cuTexRefGetBorderColor_v8000
+#define PFN_cuTexRefGetFlags  PFN_cuTexRefGetFlags_v2000
+#define PFN_cuTexRefCreate  PFN_cuTexRefCreate_v2000
+#define PFN_cuTexRefDestroy  PFN_cuTexRefDestroy_v2000
+#define PFN_cuSurfRefSetArray  PFN_cuSurfRefSetArray_v3000
+#define PFN_cuSurfRefGetArray  PFN_cuSurfRefGetArray_v3000
+#define PFN_cuTexObjectCreate  PFN_cuTexObjectCreate_v5000 /* cuTexObject* / cuSurfObject* / cuTensorMap* aliases (through PFN_cuTensorMapEncodeIm2colWide): unversioned name -> versioned name declared outside this excerpt. */
+#define PFN_cuTexObjectDestroy  PFN_cuTexObjectDestroy_v5000
+#define PFN_cuTexObjectGetResourceDesc  PFN_cuTexObjectGetResourceDesc_v5000
+#define PFN_cuTexObjectGetTextureDesc  PFN_cuTexObjectGetTextureDesc_v5000
+#define PFN_cuTexObjectGetResourceViewDesc  PFN_cuTexObjectGetResourceViewDesc_v5000
+#define PFN_cuSurfObjectCreate  PFN_cuSurfObjectCreate_v5000
+#define PFN_cuSurfObjectDestroy  PFN_cuSurfObjectDestroy_v5000
+#define PFN_cuSurfObjectGetResourceDesc  PFN_cuSurfObjectGetResourceDesc_v5000
+#define PFN_cuTensorMapEncodeTiled  PFN_cuTensorMapEncodeTiled_v12000
+#define PFN_cuTensorMapEncodeIm2col  PFN_cuTensorMapEncodeIm2col_v12000
+#define PFN_cuTensorMapReplaceAddress  PFN_cuTensorMapReplaceAddress_v12000
+#define PFN_cuTensorMapEncodeIm2colWide  PFN_cuTensorMapEncodeIm2colWide_v12080
+#define PFN_cuDeviceCanAccessPeer  PFN_cuDeviceCanAccessPeer_v4000 /* Peer-access and cuGraphics* aliases (through PFN_cuGraphicsUnmapResources): unversioned name -> versioned name declared outside this excerpt. */
+#define PFN_cuCtxEnablePeerAccess  PFN_cuCtxEnablePeerAccess_v4000
+#define PFN_cuCtxDisablePeerAccess  PFN_cuCtxDisablePeerAccess_v4000
+#define PFN_cuDeviceGetP2PAttribute  PFN_cuDeviceGetP2PAttribute_v8000
+#define PFN_cuGraphicsUnregisterResource  PFN_cuGraphicsUnregisterResource_v3000
+#define PFN_cuGraphicsSubResourceGetMappedArray  PFN_cuGraphicsSubResourceGetMappedArray_v3000
+#define PFN_cuGraphicsResourceGetMappedMipmappedArray  PFN_cuGraphicsResourceGetMappedMipmappedArray_v5000
+#define PFN_cuGraphicsResourceGetMappedPointer  PFN_cuGraphicsResourceGetMappedPointer_v3020
+#define PFN_cuGraphicsResourceSetMapFlags  PFN_cuGraphicsResourceSetMapFlags_v6050
+#define PFN_cuGraphicsMapResources  __API_TYPEDEF_PTSZ(PFN_cuGraphicsMapResources, 3000, 7000) /* __API_TYPEDEF_PTSZ is defined outside this excerpt; presumably it picks between the two listed versions -- TODO confirm. */
+#define PFN_cuGraphicsUnmapResources  __API_TYPEDEF_PTSZ(PFN_cuGraphicsUnmapResources, 3000, 7000)
+#define PFN_cuGetExportTable  PFN_cuGetExportTable_v3000 /* Miscellaneous aliases (through PFN_cuMemGetHandleForAddressRange): unversioned name -> versioned name declared outside this excerpt. */
+#define PFN_cuFuncGetModule  PFN_cuFuncGetModule_v11000
+#define PFN_cuFlushGPUDirectRDMAWrites PFN_cuFlushGPUDirectRDMAWrites_v11030
+#define PFN_cuGetProcAddress  PFN_cuGetProcAddress_v12000
+#define PFN_cuUserObjectCreate  PFN_cuUserObjectCreate_v11030
+#define PFN_cuUserObjectRetain  PFN_cuUserObjectRetain_v11030
+#define PFN_cuUserObjectRelease  PFN_cuUserObjectRelease_v11030
+#define PFN_cuGraphRetainUserObject  PFN_cuGraphRetainUserObject_v11030
+#define PFN_cuGraphReleaseUserObject  PFN_cuGraphReleaseUserObject_v11030
+#define PFN_cuModuleGetLoadingMode  PFN_cuModuleGetLoadingMode_v11070
+#define PFN_cuMemGetHandleForAddressRange  PFN_cuMemGetHandleForAddressRange_v11070
+#define PFN_cuLibraryLoadData PFN_cuLibraryLoadData_v12000 /* cuLibrary* / cuKernel* aliases (through PFN_cuLibraryGetUnifiedFunction): unversioned name -> versioned name declared outside this excerpt. */
+#define PFN_cuLibraryLoadFromFile PFN_cuLibraryLoadFromFile_v12000
+#define PFN_cuLibraryUnload PFN_cuLibraryUnload_v12000
+#define PFN_cuLibraryGetKernel PFN_cuLibraryGetKernel_v12000
+#define PFN_cuLibraryGetModule PFN_cuLibraryGetModule_v12000
+#define PFN_cuKernelGetFunction PFN_cuKernelGetFunction_v12000
+#define PFN_cuKernelGetLibrary PFN_cuKernelGetLibrary_v12050
+#define PFN_cuLibraryGetGlobal PFN_cuLibraryGetGlobal_v12000
+#define PFN_cuLibraryGetManaged PFN_cuLibraryGetManaged_v12000
+#define PFN_cuLibraryGetKernelCount PFN_cuLibraryGetKernelCount_v12040
+#define PFN_cuLibraryEnumerateKernels PFN_cuLibraryEnumerateKernels_v12040
+#define PFN_cuKernelGetAttribute PFN_cuKernelGetAttribute_v12000
+#define PFN_cuKernelSetAttribute PFN_cuKernelSetAttribute_v12000
+#define PFN_cuKernelSetCacheConfig PFN_cuKernelSetCacheConfig_v12000
+#define PFN_cuKernelGetName  PFN_cuKernelGetName_v12030
+#define PFN_cuKernelGetParamInfo  PFN_cuKernelGetParamInfo_v12040
+#define PFN_cuLibraryGetUnifiedFunction PFN_cuLibraryGetUnifiedFunction_v12000
+#define PFN_cuCoredumpGetAttribute PFN_cuCoredumpGetAttribute_v12010 /* cuCoredump* and async-notification aliases (6 lines): unversioned name -> versioned name declared outside this excerpt. */
+#define PFN_cuCoredumpGetAttributeGlobal PFN_cuCoredumpGetAttributeGlobal_v12010
+#define PFN_cuCoredumpSetAttribute PFN_cuCoredumpSetAttribute_v12010
+#define PFN_cuCoredumpSetAttributeGlobal PFN_cuCoredumpSetAttributeGlobal_v12010
+#define PFN_cuDeviceRegisterAsyncNotification PFN_cuDeviceRegisterAsyncNotification_v12040
+#define PFN_cuDeviceUnregisterAsyncNotification PFN_cuDeviceUnregisterAsyncNotification_v12040
+#define PFN_cuGreenCtxCreate PFN_cuGreenCtxCreate_v12040 /* cuGreenCtx*, device-resource and related aliases (through PFN_cuMemBatchDecompressAsync): unversioned name -> versioned name declared outside this excerpt. */
+#define PFN_cuGreenCtxDestroy PFN_cuGreenCtxDestroy_v12040
+#define PFN_cuDeviceGetDevResource PFN_cuDeviceGetDevResource_v12040
+#define PFN_cuCtxGetDevResource PFN_cuCtxGetDevResource_v12040
+#define PFN_cuGreenCtxGetDevResource PFN_cuGreenCtxGetDevResource_v12040
+#define PFN_cuGreenCtxRecordEvent PFN_cuGreenCtxRecordEvent_v12040
+#define PFN_cuGreenCtxWaitEvent PFN_cuGreenCtxWaitEvent_v12040
+#define PFN_cuDevResourceGenerateDesc PFN_cuDevResourceGenerateDesc_v12040
+#define PFN_cuDevSmResourceSplitByCount PFN_cuDevSmResourceSplitByCount_v12040
+#define PFN_cuStreamGetGreenCtx PFN_cuStreamGetGreenCtx_v12040
+#define PFN_cuCtxFromGreenCtx PFN_cuCtxFromGreenCtx_v12040
+#define PFN_cuCtxRecordEvent PFN_cuCtxRecordEvent_v12050
+#define PFN_cuCtxWaitEvent PFN_cuCtxWaitEvent_v12050
+#define PFN_cuGreenCtxStreamCreate PFN_cuGreenCtxStreamCreate_v12050
+#define PFN_cuStreamGetCtx_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamGetCtx, 12050, 12050) /* Alias name carries a _v2 suffix while the macro argument is the base name PFN_cuStreamGetCtx; __API_TYPEDEF_PTSZ is defined outside this excerpt. */
+#define PFN_cuMemBatchDecompressAsync __API_TYPEDEF_PTSZ(PFN_cuMemBatchDecompressAsync, 12060, 12060)
+
+#define PFN_cuCheckpointProcessGetRestoreThreadId PFN_cuCheckpointProcessGetRestoreThreadId_v12080 /* cuCheckpointProcess* aliases (6 lines): all resolve to _v12080 names declared outside this excerpt. */
+#define PFN_cuCheckpointProcessGetState PFN_cuCheckpointProcessGetState_v12080
+#define PFN_cuCheckpointProcessLock PFN_cuCheckpointProcessLock_v12080
+#define PFN_cuCheckpointProcessCheckpoint PFN_cuCheckpointProcessCheckpoint_v12080
+#define PFN_cuCheckpointProcessRestore PFN_cuCheckpointProcessRestore_v12080
+#define PFN_cuCheckpointProcessUnlock PFN_cuCheckpointProcessUnlock_v12080
+
+/*
+ * Type definitions for functions defined in cuda.h
+ */
+typedef CUresult (CUDAAPI *PFN_cuGetErrorString_v6000)(CUresult error, const char **pStr); /* Error, init and device typedefs (through PFN_cuDeviceGetExecAffinitySupport_v11040): each is a pointer to a function returning CUresult, declared with the CUDAAPI macro. */
+typedef CUresult (CUDAAPI *PFN_cuGetErrorName_v6000)(CUresult error, const char **pStr);
+typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGet_v2000)(CUdevice_v1 *device, int ordinal);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetCount_v2000)(int *count);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetName_v2000)(char *name, int len, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetUuid_v9020)(CUuuid *uuid, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetUuid_v11040)(CUuuid *uuid, CUdevice_v1 dev); /* Same parameter list as the _v9020 typedef above. */
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetLuid_v10000)(char *luid, unsigned int *deviceNodeMask, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceTotalMem_v3020)(size_t *bytes, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetTexture1DLinearMaxWidth_v11010)(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetAttribute_v2000)(int *pi, CUdevice_attribute attrib, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetNvSciSyncAttributes_v10020)(void *nvSciSyncAttrList, CUdevice_v1 dev, int flags);
+typedef CUresult (CUDAAPI *PFN_cuDeviceSetMemPool_v11020)(CUdevice_v1 dev, CUmemoryPool pool);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetMemPool_v11020)(CUmemoryPool *pool, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetDefaultMemPool_v11020)(CUmemoryPool *pool_out, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetProperties_v2000)(CUdevprop_v1 *prop, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceComputeCapability_v2000)(int *major, int *minor, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRetain_v7000)(CUcontext *pctx, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRelease_v11000)(CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxSetFlags_v11000)(CUdevice_v1 dev, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxGetState_v7000)(CUdevice_v1 dev, unsigned int *flags, int *active);
+typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxReset_v11000)(CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetExecAffinitySupport_v11040)(int *pi, CUexecAffinityType type, CUdevice dev);
+typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v3020)(CUcontext *pctx, unsigned int flags, CUdevice_v1 dev); /* cuCtx* typedefs (through PFN_cuCtxGetExecAffinity_v11040); cuCtxCreate appears in three versions with three different parameter lists. */
+typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v11040)(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v12050)(CUcontext *pctx, CUctxCreateParams *ctxCreateParams, unsigned int flags, CUdevice dev);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetId_v12000)(CUcontext ctx, unsigned long long *ctxId);
+typedef CUresult (CUDAAPI *PFN_cuCtxDestroy_v4000)(CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxPushCurrent_v4000)(CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxPopCurrent_v4000)(CUcontext *pctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxSetCurrent_v4000)(CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetCurrent_v4000)(CUcontext *pctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetDevice_v2000)(CUdevice_v1 *device);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetFlags_v7000)(unsigned int *flags);
+typedef CUresult (CUDAAPI *PFN_cuCtxSetFlags_v12010)(unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuCtxSynchronize_v2000)(void);
+typedef CUresult (CUDAAPI *PFN_cuCtxSetLimit_v3010)(CUlimit limit, size_t value);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetLimit_v3010)(size_t *pvalue, CUlimit limit);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetCacheConfig_v3020)(CUfunc_cache *pconfig);
+typedef CUresult (CUDAAPI *PFN_cuCtxSetCacheConfig_v3020)(CUfunc_cache config);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetSharedMemConfig_v4020)(CUsharedconfig *pConfig);
+typedef CUresult (CUDAAPI *PFN_cuCtxSetSharedMemConfig_v4020)(CUsharedconfig config);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetApiVersion_v3020)(CUcontext ctx, unsigned int *version);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetStreamPriorityRange_v5050)(int *leastPriority, int *greatestPriority);
+typedef CUresult (CUDAAPI *PFN_cuCtxResetPersistingL2Cache_v11000)(void);
+typedef CUresult (CUDAAPI *PFN_cuCtxAttach_v2000)(CUcontext *pctx, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuCtxDetach_v2000)(CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetExecAffinity_v11040)(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type);
+typedef CUresult (CUDAAPI *PFN_cuModuleLoad_v2000)(CUmodule *module, const char *fname); /* cuModule* and cuLink* typedefs (through PFN_cuLinkDestroy_v5050): pointers to functions returning CUresult. */
+typedef CUresult (CUDAAPI *PFN_cuModuleLoadData_v2000)(CUmodule *module, const void *image);
+typedef CUresult (CUDAAPI *PFN_cuModuleLoadDataEx_v2010)(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
+typedef CUresult (CUDAAPI *PFN_cuModuleLoadFatBinary_v2000)(CUmodule *module, const void *fatCubin);
+typedef CUresult (CUDAAPI *PFN_cuModuleUnload_v2000)(CUmodule hmod);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetFunction_v2000)(CUfunction *hfunc, CUmodule hmod, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetGlobal_v3020)(CUdeviceptr_v2 *dptr, size_t *bytes, CUmodule hmod, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetTexRef_v2000)(CUtexref *pTexRef, CUmodule hmod, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetSurfRef_v3000)(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetFunctionCount)(unsigned int *count, CUmodule hmod); /* NOTE(review): no _vNNNNN suffix, unlike the surrounding typedefs -- confirm this is intended and that no alias macro redefines this name. */
+typedef CUresult (CUDAAPI *PFN_cuModuleEnumerateFunctions)(CUfunction *functions, unsigned int numFunctions, CUmodule mod); /* NOTE(review): also unversioned -- see previous line. */
+typedef CUresult (CUDAAPI *PFN_cuLinkCreate_v6050)(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
+typedef CUresult (CUDAAPI *PFN_cuLinkAddData_v6050)(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
+typedef CUresult (CUDAAPI *PFN_cuLinkAddFile_v6050)(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues);
+typedef CUresult (CUDAAPI *PFN_cuLinkComplete_v5050)(CUlinkState state, void **cubinOut, size_t *sizeOut);
+typedef CUresult (CUDAAPI *PFN_cuLinkDestroy_v5050)(CUlinkState state);
+typedef CUresult (CUDAAPI *PFN_cuMemGetInfo_v3020)(size_t *free, size_t *total); /* Memory allocation, PCI bus id, cuIpc* and host-register typedefs (through PFN_cuMemHostUnregister_v4000): pointers to functions returning CUresult. */
+typedef CUresult (CUDAAPI *PFN_cuMemAlloc_v3020)(CUdeviceptr_v2 *dptr, size_t bytesize);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocPitch_v3020)(CUdeviceptr_v2 *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
+typedef CUresult (CUDAAPI *PFN_cuMemFree_v3020)(CUdeviceptr_v2 dptr);
+typedef CUresult (CUDAAPI *PFN_cuMemGetAddressRange_v3020)(CUdeviceptr_v2 *pbase, size_t *psize, CUdeviceptr_v2 dptr);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocHost_v3020)(void **pp, size_t bytesize);
+typedef CUresult (CUDAAPI *PFN_cuMemFreeHost_v2000)(void *p);
+typedef CUresult (CUDAAPI *PFN_cuMemHostAlloc_v2020)(void **pp, size_t bytesize, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuMemHostGetDevicePointer_v3020)(CUdeviceptr_v2 *pdptr, void *p, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuMemHostGetFlags_v2030)(unsigned int *pFlags, void *p);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocManaged_v6000)(CUdeviceptr_v2 *dptr, size_t bytesize, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetByPCIBusId_v4010)(CUdevice_v1 *dev, const char *pciBusId);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetPCIBusId_v4010)(char *pciBusId, int len, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuIpcGetEventHandle_v4010)(CUipcEventHandle_v1 *pHandle, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuIpcOpenEventHandle_v4010)(CUevent *phEvent, CUipcEventHandle_v1 handle);
+typedef CUresult (CUDAAPI *PFN_cuIpcGetMemHandle_v4010)(CUipcMemHandle_v1 *pHandle, CUdeviceptr_v2 dptr);
+typedef CUresult (CUDAAPI *PFN_cuIpcOpenMemHandle_v11000)(CUdeviceptr_v2 *pdptr, CUipcMemHandle_v1 handle, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuIpcCloseMemHandle_v4010)(CUdeviceptr_v2 dptr);
+typedef CUresult (CUDAAPI *PFN_cuMemHostRegister_v6050)(void *p, size_t bytesize, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuMemHostUnregister_v4000)(void *p);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy_v7000_ptds)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount); /* cuMemcpy* typedefs (through PFN_cuMemcpy3DBatchAsync_v12080_ptsz): the forms without a CUstream parameter are suffixed _ptds, the forms taking a CUstream are suffixed _ptsz. */
+typedef CUresult (CUDAAPI *PFN_cuMemcpyPeer_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v7000_ptds)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v7000_ptds)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v7000_ptds)(const CUDA_MEMCPY2D_v2 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v7000_ptds)(const CUDA_MEMCPY2D_v2 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v7000_ptds)(const CUDA_MEMCPY3D_v2 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeer_v7000_ptds)(const CUDA_MEMCPY3D_PEER_v1 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAsync_v7000_ptsz)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyPeerAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v7000_ptsz)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v7000_ptsz)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v7000_ptsz)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v7000_ptsz)(const CUDA_MEMCPY2D_v2 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v7000_ptsz)(const CUDA_MEMCPY3D_v2 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeerAsync_v7000_ptsz)(const CUDA_MEMCPY3D_PEER_v1 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyBatchAsync_v12080_ptsz)(CUdeviceptr_v2 *dsts, CUdeviceptr_v2 *srcs, size_t *sizes, size_t count, CUmemcpyAttributes_v1 *attrs, size_t *attrIdxs, size_t numAttrs, size_t *failIdx, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DBatchAsync_v12080_ptsz)(size_t numParams, CUDA_MEMCPY3D_BATCH_OP_v1 *opList, size_t *failIdx, unsigned long long flags, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N); /* cuMemset* typedefs (12 lines): D8/D16/D32 forms take unsigned char/short/int values; the *Async forms add a trailing CUstream and use the _ptsz suffix. */
+typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD8Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD16Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD32Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuArrayCreate_v3020)(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v2 *pAllocateArray); /* cuArray* / cuMipmappedArray* typedefs (through PFN_cuMipmappedArrayDestroy_v5000): pointers to functions returning CUresult. */
+typedef CUresult (CUDAAPI *PFN_cuArrayGetDescriptor_v3020)(CUDA_ARRAY_DESCRIPTOR_v2 *pArrayDescriptor, CUarray hArray);
+typedef CUresult (CUDAAPI *PFN_cuArrayGetSparseProperties_v11010)(CUDA_ARRAY_SPARSE_PROPERTIES_v1 *sparseProperties, CUarray array);
+typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetSparseProperties_v11010)(CUDA_ARRAY_SPARSE_PROPERTIES_v1 *sparseProperties, CUmipmappedArray mipmap);
+typedef CUresult (CUDAAPI *PFN_cuArrayGetMemoryRequirements_v11060)(CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 *memoryRequirements, CUarray array, CUdevice device);
+typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetMemoryRequirements_v11060)(CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 *memoryRequirements, CUmipmappedArray mipmap, CUdevice device);
+typedef CUresult (CUDAAPI *PFN_cuArrayGetPlane_v11020)(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx);
+typedef CUresult (CUDAAPI *PFN_cuArrayDestroy_v2000)(CUarray hArray);
+typedef CUresult (CUDAAPI *PFN_cuArray3DCreate_v3020)(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v2 *pAllocateArray);
+typedef CUresult (CUDAAPI *PFN_cuArray3DGetDescriptor_v3020)(CUDA_ARRAY3D_DESCRIPTOR_v2 *pArrayDescriptor, CUarray hArray);
+typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayCreate_v5000)(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v2 *pMipmappedArrayDesc, unsigned int numMipmapLevels);
+typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetLevel_v5000)(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level);
+typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayDestroy_v5000)(CUmipmappedArray hMipmappedArray);
+typedef CUresult (CUDAAPI *PFN_cuMemAddressReserve_v10020)(CUdeviceptr_v2 *ptr, size_t size, size_t alignment, CUdeviceptr_v2 addr, unsigned long long flags); /* Address-reservation, allocation-handle, map and access typedefs (through PFN_cuMemRetainAllocationHandle_v11000): pointers to functions returning CUresult. */
+typedef CUresult (CUDAAPI *PFN_cuMemAddressFree_v10020)(CUdeviceptr_v2 ptr, size_t size);
+typedef CUresult (CUDAAPI *PFN_cuMemCreate_v10020)(CUmemGenericAllocationHandle_v1 *handle, size_t size, const CUmemAllocationProp_v1 *prop, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMemRelease_v10020)(CUmemGenericAllocationHandle_v1 handle);
+typedef CUresult (CUDAAPI *PFN_cuMemMap_v10020)(CUdeviceptr_v2 ptr, size_t size, size_t offset, CUmemGenericAllocationHandle_v1 handle, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMemMapArrayAsync_v11010_ptsz)(CUarrayMapInfo_v1 *mapInfoList, unsigned int count, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemUnmap_v10020)(CUdeviceptr_v2 ptr, size_t size);
+typedef CUresult (CUDAAPI *PFN_cuMemSetAccess_v10020)(CUdeviceptr_v2 ptr, size_t size, const CUmemAccessDesc_v1 *desc, size_t count);
+typedef CUresult (CUDAAPI *PFN_cuMemGetAccess_v10020)(unsigned long long *flags, const CUmemLocation_v1 *location, CUdeviceptr_v2 ptr);
+typedef CUresult (CUDAAPI *PFN_cuMemExportToShareableHandle_v10020)(void *shareableHandle, CUmemGenericAllocationHandle_v1 handle, CUmemAllocationHandleType handleType, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMemImportFromShareableHandle_v10020)(CUmemGenericAllocationHandle_v1 *handle, void *osHandle, CUmemAllocationHandleType shHandleType);
+typedef CUresult (CUDAAPI *PFN_cuMemGetAllocationGranularity_v10020)(size_t *granularity, const CUmemAllocationProp_v1 *prop, CUmemAllocationGranularity_flags option);
+typedef CUresult (CUDAAPI *PFN_cuMemGetAllocationPropertiesFromHandle_v10020)(CUmemAllocationProp_v1 *prop, CUmemGenericAllocationHandle_v1 handle);
+typedef CUresult (CUDAAPI *PFN_cuMemRetainAllocationHandle_v11000)(CUmemGenericAllocationHandle_v1 *handle, void *addr);
+typedef CUresult (CUDAAPI *PFN_cuMemFreeAsync_v11020_ptsz)(CUdeviceptr_v2 dptr, CUstream hStream); /* Async alloc/free and cuMemPool* typedefs (through PFN_cuMemPoolImportPointer_v11020): all versioned _v11020. */
+typedef CUresult (CUDAAPI *PFN_cuMemAllocAsync_v11020_ptsz)(CUdeviceptr_v2 *dptr, size_t bytesize, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolTrimTo_v11020)(CUmemoryPool pool, size_t minBytesToKeep);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolSetAttribute_v11020)(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolGetAttribute_v11020)(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolSetAccess_v11020)(CUmemoryPool pool, const CUmemAccessDesc_v1 *map, size_t count);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolGetAccess_v11020)(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation_v1 *location);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolCreate_v11020)(CUmemoryPool *pool, const CUmemPoolProps_v1 *poolProps);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolDestroy_v11020)(CUmemoryPool pool);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocFromPoolAsync_v11020_ptsz)(CUdeviceptr_v2 *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolExportToShareableHandle_v11020)(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolImportFromShareableHandle_v11020)(CUmemoryPool *pool_out, void *handle, CUmemAllocationHandleType handleType, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolExportPointer_v11020)(CUmemPoolPtrExportData_v1 *shareData_out, CUdeviceptr_v2 ptr);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolImportPointer_v11020)(CUdeviceptr_v2 *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData_v1 *shareData);
+typedef CUresult (CUDAAPI *PFN_cuPointerGetAttribute_v4000)(void *data, CUpointer_attribute attribute, CUdeviceptr_v2 ptr); /* Pointer-attribute, prefetch/advise, range-attribute and cuMulticast* typedefs (through PFN_cuPointerGetAttributes_v7000). */
+typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v8000_ptsz)(CUdeviceptr_v2 devPtr, size_t count, CUdevice_v1 dstDevice, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemAdvise_v8000)(CUdeviceptr_v2 devPtr, size_t count, CUmem_advise advice, CUdevice_v1 device);
+typedef CUresult (CUDAAPI *PFN_cuMemAdvise_v12020)(CUdeviceptr_v2 devPtr, size_t count, CUmem_advise advice, CUmemLocation_v1 location); /* _v12020 takes a CUmemLocation_v1 where _v8000 takes a CUdevice_v1. */
+typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v12020_ptsz)(CUdeviceptr_v2 devPtr, size_t count, CUmemLocation_v1 location, unsigned int flags, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemRangeGetAttribute_v8000)(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr_v2 devPtr, size_t count);
+typedef CUresult (CUDAAPI *PFN_cuMemRangeGetAttributes_v8000)(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr_v2 devPtr, size_t count);
+typedef CUresult (CUDAAPI *PFN_cuMulticastCreate_v12010)(CUmemGenericAllocationHandle *mcHandle, const CUmulticastObjectProp *prop);
+typedef CUresult (CUDAAPI *PFN_cuMulticastAddDevice_v12010)(CUmemGenericAllocationHandle mcHandle, CUdevice dev);
+typedef CUresult (CUDAAPI *PFN_cuMulticastBindMem_v12010)(CUmemGenericAllocationHandle mcHandle, size_t mcOffset, CUmemGenericAllocationHandle memHandle, size_t memOffset, size_t size, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMulticastBindAddr_v12010)(CUmemGenericAllocationHandle mcHandle, size_t mcOffset, CUdeviceptr memptr, size_t size, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMulticastUnbind_v12010)(CUmemGenericAllocationHandle mcHandle, CUdevice dev, size_t mcOffset, size_t size);
+typedef CUresult (CUDAAPI *PFN_cuMulticastGetGranularity_v12010)(size_t *granularity, const CUmulticastObjectProp *prop, CUmulticastGranularity_flags option);
+typedef CUresult (CUDAAPI *PFN_cuPointerSetAttribute_v6000)(const void *value, CUpointer_attribute attribute, CUdeviceptr_v2 ptr);
+typedef CUresult (CUDAAPI *PFN_cuPointerGetAttributes_v7000)(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr_v2 ptr);
+typedef CUresult (CUDAAPI *PFN_cuStreamCreate_v2000)(CUstream *phStream, unsigned int Flags); /* cuStream* typedefs, including stream capture (through PFN_cuStreamSetAttribute_v11000_ptsz): most carry the _ptsz suffix. */
+typedef CUresult (CUDAAPI *PFN_cuStreamCreateWithPriority_v5050)(CUstream *phStream, unsigned int flags, int priority);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetId_v12000)(CUstream hStream, unsigned long long *streamId);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetId_v12000_ptsz)(CUstream hStream, unsigned long long *streamId); /* Same parameter list as the unsuffixed _v12000 typedef above. */
+typedef CUresult (CUDAAPI *PFN_cuStreamGetPriority_v7000_ptsz)(CUstream hStream, int *priority);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetFlags_v7000_ptsz)(CUstream hStream, unsigned int *flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v9020_ptsz)(CUstream hStream, CUcontext *pctx);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitEvent_v7000_ptsz)(CUstream hStream, CUevent hEvent, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamAddCallback_v7000_ptsz)(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10010_ptsz)(CUstream hStream, CUstreamCaptureMode mode);
+typedef CUresult (CUDAAPI *PFN_cuStreamBeginCaptureToGraph_v12030_ptsz)(CUstream hStream, CUgraph hGraph, const CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, CUstreamCaptureMode mode);
+typedef CUresult (CUDAAPI *PFN_cuThreadExchangeStreamCaptureMode_v10010)(CUstreamCaptureMode *mode);
+typedef CUresult (CUDAAPI *PFN_cuStreamEndCapture_v10000_ptsz)(CUstream hStream, CUgraph *phGraph);
+typedef CUresult (CUDAAPI *PFN_cuStreamIsCapturing_v10000_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v10010_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out); /* Three versions follow; each later one appends output parameters (graph/dependencies, then edge data). */
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v11030_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v12030_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, const CUgraphEdgeData **edgeData_out, size_t *numDependencies_out);
+typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v11030_ptsz)(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v12030_ptsz)(CUstream hStream, CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamAttachMemAsync_v7000_ptsz)(CUstream hStream, CUdeviceptr_v2 dptr, size_t length, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamQuery_v7000_ptsz)(CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamSynchronize_v7000_ptsz)(CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamDestroy_v4000)(CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamCopyAttributes_v11000_ptsz)(CUstream dst, CUstream src);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetAttribute_v11000_ptsz)(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue_v1 *value_out);
+typedef CUresult (CUDAAPI *PFN_cuStreamSetAttribute_v11000_ptsz)(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue_v1 *value);
+typedef CUresult (CUDAAPI *PFN_cuEventCreate_v2000)(CUevent *phEvent, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuEventRecord_v7000_ptsz)(CUevent hEvent, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuEventRecordWithFlags_v11010_ptsz)(CUevent hEvent, CUstream hStream, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuEventQuery_v2000)(CUevent hEvent);
+typedef CUresult (CUDAAPI *PFN_cuEventSynchronize_v2000)(CUevent hEvent);
+typedef CUresult (CUDAAPI *PFN_cuEventDestroy_v4000)(CUevent hEvent);
+typedef CUresult (CUDAAPI *PFN_cuEventElapsedTime_v2000)(float *pMilliseconds, CUevent hStart, CUevent hEnd);
+typedef CUresult (CUDAAPI *PFN_cuEventElapsedTime_v12080)(float *pMilliseconds, CUevent hStart, CUevent hEnd);
+typedef CUresult (CUDAAPI *PFN_cuImportExternalMemory_v10000)(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 *memHandleDesc);
+typedef CUresult (CUDAAPI *PFN_cuExternalMemoryGetMappedBuffer_v10000)(CUdeviceptr_v2 *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 *bufferDesc);
+typedef CUresult (CUDAAPI *PFN_cuExternalMemoryGetMappedMipmappedArray_v10000)(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 *mipmapDesc);
+typedef CUresult (CUDAAPI *PFN_cuDestroyExternalMemory_v10000)(CUexternalMemory extMem);
+typedef CUresult (CUDAAPI *PFN_cuImportExternalSemaphore_v10000)(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 *semHandleDesc);
+typedef CUresult (CUDAAPI *PFN_cuSignalExternalSemaphoresAsync_v10000_ptsz)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
+typedef CUresult (CUDAAPI *PFN_cuWaitExternalSemaphoresAsync_v10000_ptsz)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
+typedef CUresult (CUDAAPI *PFN_cuDestroyExternalSemaphore_v10000)(CUexternalSemaphore extSem);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v8000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v9000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v8000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v9000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v8000_ptsz)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams_v1 *paramArray, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v11070_ptsz)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuFuncGetAttribute_v2020)(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
+typedef CUresult (CUDAAPI *PFN_cuFuncSetAttribute_v9000)(CUfunction hfunc, CUfunction_attribute attrib, int value);
+typedef CUresult (CUDAAPI *PFN_cuFuncSetCacheConfig_v3000)(CUfunction hfunc, CUfunc_cache config);
+typedef CUresult (CUDAAPI *PFN_cuFuncSetSharedMemConfig_v4020)(CUfunction hfunc, CUsharedconfig config);
+typedef CUresult (CUDAAPI *PFN_cuFuncGetName_v12030)(const char **name, CUfunction hfunc);
+typedef CUresult (CUDAAPI *PFN_cuFuncGetParamInfo_v12040)(CUfunction func, size_t paramIndex, size_t *paramOffset, size_t *paramSize);
+typedef CUresult (CUDAAPI *PFN_cuFuncIsLoaded_v12040)(CUfunctionLoadingState *state, CUfunction hfunc);
+typedef CUresult (CUDAAPI *PFN_cuFuncLoad_v12040)(CUfunction hfunc);
+typedef CUresult (CUDAAPI *PFN_cuLaunchKernel_v7000_ptsz)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
+typedef CUresult (CUDAAPI *PFN_cuLaunchKernelEx_v11060_ptsz)(const CUlaunchConfig *config, CUfunction f, void **kernelParams, void **extra);
+typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernel_v9000_ptsz)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams);
+typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernelMultiDevice_v9000)(CUDA_LAUNCH_PARAMS_v1 *launchParamsList, unsigned int numDevices, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuLaunchHostFunc_v10000_ptsz)(CUstream hStream, CUhostFn fn, void *userData);
+typedef CUresult (CUDAAPI *PFN_cuFuncSetBlockShape_v2000)(CUfunction hfunc, int x, int y, int z);
+typedef CUresult (CUDAAPI *PFN_cuFuncSetSharedSize_v2000)(CUfunction hfunc, unsigned int bytes);
+typedef CUresult (CUDAAPI *PFN_cuParamSetSize_v2000)(CUfunction hfunc, unsigned int numbytes);
+typedef CUresult (CUDAAPI *PFN_cuParamSeti_v2000)(CUfunction hfunc, int offset, unsigned int value);
+typedef CUresult (CUDAAPI *PFN_cuParamSetf_v2000)(CUfunction hfunc, int offset, float value);
+typedef CUresult (CUDAAPI *PFN_cuParamSetv_v2000)(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
+typedef CUresult (CUDAAPI *PFN_cuLaunch_v2000)(CUfunction f);
+typedef CUresult (CUDAAPI *PFN_cuLaunchGrid_v2000)(CUfunction f, int grid_width, int grid_height);
+typedef CUresult (CUDAAPI *PFN_cuLaunchGridAsync_v2000)(CUfunction f, int grid_width, int grid_height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuParamSetTexRef_v2000)(CUfunction hfunc, int texunit, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuGraphCreate_v10000)(CUgraph *phGraph, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddKernelNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetParams_v10000)(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddKernelNode_v12000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS_v2 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetParams_v12000)(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS_v2 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetParams_v12000)(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v2 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddMemcpyNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D_v2 *copyParams, CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuGraphMemcpyNodeGetParams_v10000)(CUgraphNode hNode, CUDA_MEMCPY3D_v2 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphMemcpyNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_MEMCPY3D_v2 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddMemsetNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS_v1 *memsetParams, CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuGraphMemsetNodeGetParams_v10000)(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphMemsetNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddHostNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphHostNodeGetParams_v10000)(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphHostNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddChildGraphNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph);
+typedef CUresult (CUDAAPI *PFN_cuGraphChildGraphNodeGetGraph_v10000)(CUgraphNode hNode, CUgraph *phGraph);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddEmptyNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddEventRecordNode_v11010)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphEventRecordNodeGetEvent_v11010)(CUgraphNode hNode, CUevent *event_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphEventRecordNodeSetEvent_v11010)(CUgraphNode hNode, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddEventWaitNode_v11010)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphEventWaitNodeGetEvent_v11010)(CUgraphNode hNode, CUevent *event_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphEventWaitNodeSetEvent_v11010)(CUgraphNode hNode, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddExternalSemaphoresSignalNode_v11020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresSignalNodeGetParams_v11020)(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *params_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresSignalNodeSetParams_v11020)(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddExternalSemaphoresWaitNode_v11020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresWaitNodeGetParams_v11020)(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *params_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresWaitNodeSetParams_v11020)(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddBatchMemOpNode_v11070)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphBatchMemOpNodeGetParams_v11070)(CUgraphNode hNode, CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphBatchMemOpNodeSetParams_v11070)(CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecBatchMemOpNodeSetParams_v11070)(CUgraphExec graphExec, CUgraphNode node, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphClone_v10000)(CUgraph *phGraphClone, CUgraph originalGraph);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeFindInClone_v10000)(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetType_v10000)(CUgraphNode hNode, CUgraphNodeType *type);
+typedef CUresult (CUDAAPI *PFN_cuGraphGetNodes_v10000)(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes);
+typedef CUresult (CUDAAPI *PFN_cuGraphGetRootNodes_v10000)(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes);
+typedef CUresult (CUDAAPI *PFN_cuGraphGetEdges_v10000)(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges);
+typedef CUresult (CUDAAPI *PFN_cuGraphGetEdges_v12030)(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, CUgraphEdgeData *edgeData, size_t *numEdges);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependencies_v10000)(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependencies_v12030)(CUgraphNode hNode, CUgraphNode *dependencies, CUgraphEdgeData *edgeData, size_t *numDependencies);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependentNodes_v10000)(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependentNodes_v12030)(CUgraphNode hNode, CUgraphNode *dependentNodes, CUgraphEdgeData *edgeData, size_t *numDependentNodes);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddDependencies_v10000)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddDependencies_v12030)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, const CUgraphEdgeData *edgeData, size_t numDependencies);
+typedef CUresult (CUDAAPI *PFN_cuGraphRemoveDependencies_v10000)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
+typedef CUresult (CUDAAPI *PFN_cuGraphRemoveDependencies_v12030)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, const CUgraphEdgeData *edgeData, size_t numDependencies);
+typedef CUresult (CUDAAPI *PFN_cuGraphDestroyNode_v10000)(CUgraphNode hNode);
+typedef CUresult (CUDAAPI *PFN_cuGraphInstantiateWithFlags_v11040)(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphInstantiateWithParams_v12000_ptsz)(CUgraphExec *phGraphExec, CUgraph hGraph, CUDA_GRAPH_INSTANTIATE_PARAMS *instantiateParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecGetFlags_v12000)(CUgraphExec hGraphExec, cuuint64_t *flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecKernelNodeSetParams_v10010)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecKernelNodeSetParams_v12000)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v2 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecMemcpyNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D_v2 *copyParams, CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecMemsetNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS_v1 *memsetParams, CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecHostNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecChildGraphNodeSetParams_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecEventRecordNodeSetEvent_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecEventWaitNodeSetEvent_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams_v11020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams_v11020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphUpload_v11010_ptsz)(CUgraphExec hGraphExec, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGraphLaunch_v10000_ptsz)(CUgraphExec hGraphExec, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecDestroy_v10000)(CUgraphExec hGraphExec);
+typedef CUresult (CUDAAPI *PFN_cuGraphDestroy_v10000)(CUgraph hGraph);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecUpdate_v10020)(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecUpdate_v12000)(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphExecUpdateResultInfo *resultInfo);
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeCopyAttributes_v11000)(CUgraphNode dst, CUgraphNode src);
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetAttribute_v11000)(CUgraphNode hNode, CUkernelNodeAttrID attr, CUkernelNodeAttrValue_v1 *value_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetAttribute_v11000)(CUgraphNode hNode, CUkernelNodeAttrID attr, const CUkernelNodeAttrValue_v1 *value);
+typedef CUresult (CUDAAPI *PFN_cuGraphDebugDotPrint_v11030)(CUgraph hGraph, const char *path, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddMemAllocNode_v11040)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphMemAllocNodeGetParams_v11040)(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddMemFreeNode_v11040)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr);
+typedef CUresult (CUDAAPI *PFN_cuGraphMemFreeNodeGetParams_v11040)(CUgraphNode hNode, CUdeviceptr *dptr_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeSetEnabled_v11060)(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int isEnabled);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetEnabled_v11060)(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int *isEnabled);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddNode_v12020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraphNodeParams *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddNode_v12030)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, CUgraphNodeParams *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeSetParams_v12020)(CUgraphNode hNode, CUgraphNodeParams *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecNodeSetParams_v12020)(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraphNodeParams *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphConditionalHandleCreate_v12030)(CUgraphConditionalHandle *pHandle_out, CUgraph hGraph, CUcontext ctx, unsigned int defaultLaunchValue, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGraphMemTrim_v11040)(CUdevice device);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetGraphMemAttribute_v11040)(CUdevice device, CUgraphMem_attribute attr, void* value);
+typedef CUresult (CUDAAPI *PFN_cuDeviceSetGraphMemAttribute_v11040)(CUdevice device, CUgraphMem_attribute attr, void* value);
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor_v6050)(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize);
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000)(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialBlockSize_v6050)(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit);
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialBlockSizeWithFlags_v7000)(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuOccupancyAvailableDynamicSMemPerBlock_v10020)(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize);
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialClusterSize_v11070)(int *clusterSize, CUfunction func, const CUlaunchConfig *config);
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveClusters_v11070)(int *numClusters, CUfunction func, const CUlaunchConfig *config);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetArray_v2000)(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmappedArray_v5000)(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress_v3020)(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr_v2 dptr, size_t bytes);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v4010)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v2 *desc, CUdeviceptr_v2 dptr, size_t Pitch);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetFormat_v2000)(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddressMode_v2000)(CUtexref hTexRef, int dim, CUaddress_mode am);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetFilterMode_v2000)(CUtexref hTexRef, CUfilter_mode fm);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapFilterMode_v5000)(CUtexref hTexRef, CUfilter_mode fm);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapLevelBias_v5000)(CUtexref hTexRef, float bias);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapLevelClamp_v5000)(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetMaxAnisotropy_v5000)(CUtexref hTexRef, unsigned int maxAniso);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetBorderColor_v8000)(CUtexref hTexRef, float *pBorderColor);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetFlags_v2000)(CUtexref hTexRef, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddress_v3020)(CUdeviceptr_v2 *pdptr, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetArray_v2000)(CUarray *phArray, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmappedArray_v5000)(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddressMode_v2000)(CUaddress_mode *pam, CUtexref hTexRef, int dim);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetFilterMode_v2000)(CUfilter_mode *pfm, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetFormat_v2000)(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapFilterMode_v5000)(CUfilter_mode *pfm, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapLevelBias_v5000)(float *pbias, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapLevelClamp_v5000)(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetMaxAnisotropy_v5000)(int *pmaxAniso, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetBorderColor_v8000)(float *pBorderColor, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetFlags_v2000)(unsigned int *pFlags, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefCreate_v2000)(CUtexref *pTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefDestroy_v2000)(CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuSurfRefSetArray_v3000)(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuSurfRefGetArray_v3000)(CUarray *phArray, CUsurfref hSurfRef);
+typedef CUresult (CUDAAPI *PFN_cuTexObjectCreate_v5000)(CUtexObject_v1 *pTexObject, const CUDA_RESOURCE_DESC_v1 *pResDesc, const CUDA_TEXTURE_DESC_v1 *pTexDesc, const CUDA_RESOURCE_VIEW_DESC_v1 *pResViewDesc);
+typedef CUresult (CUDAAPI *PFN_cuTexObjectDestroy_v5000)(CUtexObject_v1 texObject);
+typedef CUresult (CUDAAPI *PFN_cuTexObjectGetResourceDesc_v5000)(CUDA_RESOURCE_DESC_v1 *pResDesc, CUtexObject_v1 texObject);
+typedef CUresult (CUDAAPI *PFN_cuTexObjectGetTextureDesc_v5000)(CUDA_TEXTURE_DESC_v1 *pTexDesc, CUtexObject_v1 texObject);
+typedef CUresult (CUDAAPI *PFN_cuTexObjectGetResourceViewDesc_v5000)(CUDA_RESOURCE_VIEW_DESC_v1 *pResViewDesc, CUtexObject_v1 texObject);
+typedef CUresult (CUDAAPI *PFN_cuSurfObjectCreate_v5000)(CUsurfObject_v1 *pSurfObject, const CUDA_RESOURCE_DESC_v1 *pResDesc);
+typedef CUresult (CUDAAPI *PFN_cuSurfObjectDestroy_v5000)(CUsurfObject_v1 surfObject);
+typedef CUresult (CUDAAPI *PFN_cuSurfObjectGetResourceDesc_v5000)(CUDA_RESOURCE_DESC_v1 *pResDesc, CUsurfObject_v1 surfObject);
+typedef CUresult (CUDAAPI *PFN_cuTensorMapEncodeTiled_v12000)(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, const cuuint32_t *boxDim, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);
+typedef CUresult (CUDAAPI *PFN_cuTensorMapEncodeIm2col_v12000)(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, const int *pixelBoxLowerCorner, const int *pixelBoxUpperCorner, cuuint32_t channelsPerPixel, cuuint32_t pixelsPerColumn, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);
+typedef CUresult (CUDAAPI *PFN_cuTensorMapReplaceAddress_v12000)(CUtensorMap *tensorMap, void *globalAddress);
+typedef CUresult (CUDAAPI *PFN_cuTensorMapEncodeIm2colWide_v12080)(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, int pixelBoxLowerCornerWidth, int pixelBoxUpperCornerWidth, cuuint32_t channelsPerPixel, cuuint32_t pixelsPerColumn, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapIm2ColWideMode mode, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);
+typedef CUresult (CUDAAPI *PFN_cuDeviceCanAccessPeer_v4000)(int *canAccessPeer, CUdevice_v1 dev, CUdevice_v1 peerDev);
+typedef CUresult (CUDAAPI *PFN_cuCtxEnablePeerAccess_v4000)(CUcontext peerContext, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuCtxDisablePeerAccess_v4000)(CUcontext peerContext);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetP2PAttribute_v8000)(int *value, CUdevice_P2PAttribute attrib, CUdevice_v1 srcDevice, CUdevice_v1 dstDevice);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsUnregisterResource_v3000)(CUgraphicsResource resource);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsSubResourceGetMappedArray_v3000)(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedMipmappedArray_v5000)(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedPointer_v3020)(CUdeviceptr_v2 *pDevPtr, size_t *pSize, CUgraphicsResource resource);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceSetMapFlags_v6050)(CUgraphicsResource resource, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsMapResources_v7000_ptsz)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsUnmapResources_v7000_ptsz)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGetExportTable_v3000)(const void **ppExportTable, const CUuuid *pExportTableId);
+typedef CUresult (CUDAAPI *PFN_cuFuncGetModule_v11000)(CUmodule *hmod, CUfunction hfunc);
+typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags);
+typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v12000)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags, CUdriverProcAddressQueryResult *symbolFound);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v3020)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v3020)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v3020)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v3020)(CUarray dstArray, size_t dstOffset, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v3020)(CUdeviceptr_v2 dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v3020)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v3020)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v3020)(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v3020)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v3020)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v3020)(const CUDA_MEMCPY2D_v2 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v3020)(const CUDA_MEMCPY2D_v2 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v3020)(const CUDA_MEMCPY3D_v2 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v3020)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v3020)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v3020)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v3020)(const CUDA_MEMCPY2D_v2 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v3020)(const CUDA_MEMCPY3D_v2 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v3020)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v3020)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v3020)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy_v4000)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAsync_v4000)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyPeer_v4000)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyPeerAsync_v4000)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeer_v4000)(const CUDA_MEMCPY3D_PEER_v1 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeerAsync_v4000)(const CUDA_MEMCPY3D_PEER_v1 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyBatchAsync_v12080)(CUdeviceptr_v2 *dsts, CUdeviceptr_v2 *srcs, size_t *sizes, size_t count, CUmemcpyAttributes_v1 *attrs, size_t *attrIdxs, size_t numAttrs, size_t *failIdx, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DBatchAsync_v12080)(size_t numParams, CUDA_MEMCPY3D_BATCH_OP_v1 *opList, size_t *failIdx, unsigned long long flags, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD8Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD16Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD32Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetPriority_v5050)(CUstream hStream, int *priority);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetFlags_v5050)(CUstream hStream, unsigned int *flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v9020)(CUstream hStream, CUcontext *pctx);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetDevice_v12080)(CUstream hStream, CUdevice *device);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetDevice_v12080_ptsz)(CUstream hStream, CUdevice *device);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitEvent_v3020)(CUstream hStream, CUevent hEvent, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamAddCallback_v5000)(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamAttachMemAsync_v6000)(CUstream hStream, CUdeviceptr_v2 dptr, size_t length, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamQuery_v2000)(CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamSynchronize_v2000)(CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuEventRecord_v2000)(CUevent hEvent, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuEventRecordWithFlags_v11010)(CUevent hEvent, CUstream hStream, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuLaunchKernel_v4000)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
+typedef CUresult (CUDAAPI *PFN_cuLaunchKernelEx_v11060)(const CUlaunchConfig *config, CUfunction f, void **kernelParams, void **extra);
+typedef CUresult (CUDAAPI *PFN_cuLaunchHostFunc_v10000)(CUstream hStream, CUhostFn fn, void *userData);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsMapResources_v3000)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsUnmapResources_v3000)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v8000)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v8000)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v9000)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v9000)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v8000)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags); /* this and the next four _v11070 typedefs repeat the parameter lists of the _v8000/_v9000 typedefs above */
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v11070)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v8000)(CUdeviceptr_v2 devPtr, size_t count, CUdevice_v1 dstDevice, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v12020)(CUdeviceptr_v2 devPtr, size_t count, CUmemLocation_v1 location, unsigned int flags, CUstream hStream); /* differs from _v8000: takes a CUmemLocation_v1 plus flags where _v8000 takes a CUdevice_v1 */
+typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernel_v9000)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams);
+typedef CUresult (CUDAAPI *PFN_cuSignalExternalSemaphoresAsync_v10000)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
+typedef CUresult (CUDAAPI *PFN_cuWaitExternalSemaphoresAsync_v10000)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
+typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10010)(CUstream hStream, CUstreamCaptureMode mode);
+typedef CUresult (CUDAAPI *PFN_cuStreamBeginCaptureToGraph_v12030)(CUstream hStream, CUgraph hGraph, const CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, CUstreamCaptureMode mode);
+typedef CUresult (CUDAAPI *PFN_cuStreamEndCapture_v10000)(CUstream hStream, CUgraph *phGraph);
+typedef CUresult (CUDAAPI *PFN_cuStreamIsCapturing_v10000)(CUstream hStream, CUstreamCaptureStatus *captureStatus);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v10010)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v11030)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out); /* adds graph_out, dependencies_out and numDependencies_out to the _v10010 parameter list */
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v12030)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, const CUgraphEdgeData **edgeData_out, size_t *numDependencies_out); /* adds edgeData_out to the _v11030 parameter list */
+typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v11030)(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v12030)(CUstream hStream, CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, unsigned int flags); /* adds dependencyData to the _v11030 parameter list */
+typedef CUresult (CUDAAPI *PFN_cuGraphInstantiateWithParams_v12000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUDA_GRAPH_INSTANTIATE_PARAMS *instantiateParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphUpload_v11010)(CUgraphExec hGraph, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGraphLaunch_v10000)(CUgraphExec hGraph, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamCopyAttributes_v11000)(CUstream dstStream, CUstream srcStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetAttribute_v11000)(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue_v1 *value);
+typedef CUresult (CUDAAPI *PFN_cuStreamSetAttribute_v11000)(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue_v1 *param);
+typedef CUresult (CUDAAPI *PFN_cuMemMapArrayAsync_v11010)(CUarrayMapInfo_v1 *mapInfoList, unsigned int count, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemFreeAsync_v11020)(CUdeviceptr_v2 dptr, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocAsync_v11020)(CUdeviceptr_v2 *dptr, size_t bytesize, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocFromPoolAsync_v11020)(CUdeviceptr_v2 *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuFlushGPUDirectRDMAWrites_v11030)(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope);
+typedef CUresult (CUDAAPI *PFN_cuUserObjectCreate_v11030)(CUuserObject *object_out, void *ptr, CUhostFn destroy, unsigned int initialRefcount, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuUserObjectRetain_v11030)(CUuserObject object, unsigned int count);
+typedef CUresult (CUDAAPI *PFN_cuUserObjectRelease_v11030)(CUuserObject object, unsigned int count);
+typedef CUresult (CUDAAPI *PFN_cuGraphRetainUserObject_v11030)(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphReleaseUserObject_v11030)(CUgraph graph, CUuserObject object, unsigned int count);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetLoadingMode_v11070)(CUmoduleLoadingMode *mode);
+typedef CUresult (CUDAAPI *PFN_cuMemGetHandleForAddressRange_v11070)(void *handle, CUdeviceptr dptr, size_t size, CUmemRangeHandleType handleType, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuLibraryLoadData_v12000)(CUlibrary *library, const void *code, CUjit_option *jitOptions, void **jitOptionsValues, unsigned int numJitOptions, CUlibraryOption *libraryOptions, void** libraryOptionValues, unsigned int numLibraryOptions);
+typedef CUresult (CUDAAPI *PFN_cuLibraryLoadFromFile_v12000)(CUlibrary *library, const char *fileName, CUjit_option *jitOptions, void **jitOptionsValues, unsigned int numJitOptions, CUlibraryOption *libraryOptions, void **libraryOptionValues, unsigned int numLibraryOptions);
+typedef CUresult (CUDAAPI *PFN_cuLibraryUnload_v12000)(CUlibrary library);
+typedef CUresult (CUDAAPI *PFN_cuLibraryGetKernel_v12000)(CUkernel *pKernel, CUlibrary library, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuLibraryGetModule_v12000)(CUmodule *pMod, CUlibrary library);
+typedef CUresult (CUDAAPI *PFN_cuLibraryGetKernelCount)(unsigned int *count, CUlibrary lib);
+typedef CUresult (CUDAAPI *PFN_cuLibraryEnumerateKernels)(CUkernel *kernels, unsigned int numKernels, CUlibrary lib);
+typedef CUresult (CUDAAPI *PFN_cuKernelGetFunction_v12000)(CUfunction *pFunc, CUkernel kernel);
+typedef CUresult (CUDAAPI *PFN_cuKernelGetLibrary_v12050)(CUlibrary *pLib, CUkernel kernel);
+typedef CUresult (CUDAAPI *PFN_cuLibraryGetGlobal_v12000)(CUdeviceptr *dptr, size_t *bytes, CUlibrary library, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuLibraryGetManaged_v12000)(CUdeviceptr *dptr, size_t *bytes, CUlibrary library, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuKernelGetAttribute_v12000)(int *pi, CUfunction_attribute attrib, CUkernel kernel, CUdevice dev);
+typedef CUresult (CUDAAPI *PFN_cuKernelSetAttribute_v12000)(CUfunction_attribute attrib, int val, CUkernel kernel, CUdevice dev);
+typedef CUresult (CUDAAPI *PFN_cuKernelSetCacheConfig_v12000)(CUkernel kernel, CUfunc_cache config, CUdevice dev);
+typedef CUresult (CUDAAPI *PFN_cuKernelGetName_v12030)(const char **name, CUkernel hfunc);
+typedef CUresult (CUDAAPI *PFN_cuKernelGetParamInfo_v12040)(CUkernel kernel, size_t paramIndex, size_t *paramOffset, size_t *paramSize);
+typedef CUresult (CUDAAPI *PFN_cuLibraryGetUnifiedFunction_v12000)(void **fptr, CUlibrary library, const char *symbol);
+typedef CUresult(CUDAAPI *PFN_cuCoredumpGetAttribute_v12010)(CUcoredumpSettings get, void *value, size_t *size);
+typedef CUresult(CUDAAPI *PFN_cuCoredumpGetAttributeGlobal_v12010)(CUcoredumpSettings get, void *value, size_t *size);
+typedef CUresult(CUDAAPI *PFN_cuCoredumpSetAttribute_v12010)(CUcoredumpSettings set, void *value, size_t *size);
+typedef CUresult(CUDAAPI *PFN_cuCoredumpSetAttributeGlobal_v12010)(CUcoredumpSettings set, void *value, size_t *size);
+typedef CUresult(CUDAAPI *PFN_cuDeviceRegisterAsyncNotification_v12040)(CUdevice device, CUasyncCallback callbackFunc, void *userData, CUasyncCallbackHandle *callback);
+typedef CUresult(CUDAAPI *PFN_cuDeviceUnregisterAsyncNotification_v12040)(CUdevice device, CUasyncCallbackHandle callback);
+typedef CUresult(CUDAAPI *PFN_cuGreenCtxCreate_v12040)(CUgreenCtx* phCtx, CUdevResourceDesc desc, CUdevice dev, unsigned int flags);
+typedef CUresult(CUDAAPI *PFN_cuGreenCtxDestroy_v12040)(CUgreenCtx hCtx);
+typedef CUresult(CUDAAPI *PFN_cuDeviceGetDevResource_v12040)(CUdevice dev, CUdevResource* result, CUdevResourceType type);
+typedef CUresult(CUDAAPI *PFN_cuCtxGetDevResource_v12040)(CUcontext hCtx, CUdevResource* result, CUdevResourceType type);
+typedef CUresult(CUDAAPI *PFN_cuGreenCtxGetDevResource_v12040)(CUgreenCtx hCtx, CUdevResource* result, CUdevResourceType type);
+typedef CUresult(CUDAAPI *PFN_cuGreenCtxRecordEvent_v12040)(CUgreenCtx hCtx, CUevent hEvent);
+typedef CUresult(CUDAAPI *PFN_cuGreenCtxWaitEvent_v12040)(CUgreenCtx hCtx, CUevent hEvent);
+typedef CUresult(CUDAAPI *PFN_cuDevResourceGenerateDesc_v12040)(CUdevResourceDesc* phDesc, CUdevResource* resources, unsigned int nbResources);
+typedef CUresult(CUDAAPI *PFN_cuDevSmResourceSplitByCount_v12040)(CUdevResource* result, unsigned int* nbGroups, const CUdevResource* input, CUdevResource* remaining, unsigned int useFlags, unsigned int minCount);
+typedef CUresult(CUDAAPI *PFN_cuStreamGetGreenCtx_v12040)(CUstream hStream, CUgreenCtx *phCtx);
+typedef CUresult(CUDAAPI *PFN_cuCtxFromGreenCtx_v12040)(CUcontext *pContext, CUgreenCtx hCtx);
+typedef CUresult(CUDAAPI *PFN_cuCtxRecordEvent_v12050)(CUcontext hCtx, CUevent hEvent);
+typedef CUresult(CUDAAPI *PFN_cuCtxWaitEvent_v12050)(CUcontext hCtx, CUevent hEvent);
+typedef CUresult (CUDAAPI *PFN_cuGreenCtxStreamCreate_v12050)(CUstream* phStream, CUgreenCtx greenCtx, unsigned int flags, int priority);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v12050)(CUstream hStream, CUcontext *pctx, CUgreenCtx *pGreenCtx); /* adds pGreenCtx to the PFN_cuStreamGetCtx_v9020 parameter list */
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v12050_ptsz)(CUstream hStream, CUcontext *pctx, CUgreenCtx *pGreenCtx); /* same signature as the non-_ptsz typedef above */
+    typedef CUresult (CUDAAPI *PFN_cuMemBatchDecompressAsync_v12060)(CUmemDecompressParams *paramsArray, size_t count, unsigned int flags, size_t *errorIndex, CUstream stream);
+    typedef CUresult (CUDAAPI *PFN_cuMemBatchDecompressAsync_v12060_ptsz)(CUmemDecompressParams *paramsArray, size_t count, unsigned int flags, size_t *errorIndex, CUstream stream); /* same signature as the non-_ptsz typedef above */
+/*
+ * Type definitions for older versioned functions in cuda.h; they are declared only when __CUDA_API_VERSION_INTERNAL is defined.
+ */
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    typedef CUresult (CUDAAPI *PFN_cuMemHostRegister_v4000)(void *p, size_t bytesize, unsigned int Flags);
+    typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceSetMapFlags_v3000)(CUgraphicsResource resource, unsigned int flags);
+    typedef CUresult (CUDAAPI *PFN_cuLinkCreate_v5050)(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
+    typedef CUresult (CUDAAPI *PFN_cuLinkAddData_v5050)(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
+    typedef CUresult (CUDAAPI *PFN_cuLinkAddFile_v5050)(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues);
+    typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v3020)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v2 *desc, CUdeviceptr_v2 dptr, size_t Pitch);
+    typedef CUresult (CUDAAPI *PFN_cuDeviceTotalMem_v2000)(unsigned int *bytes, CUdevice_v1 dev);
+    typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v2000)(CUcontext *pctx, unsigned int flags, CUdevice_v1 dev);
+    typedef CUresult (CUDAAPI *PFN_cuModuleGetGlobal_v2000)(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
+    typedef CUresult (CUDAAPI *PFN_cuMemGetInfo_v2000)(unsigned int *free, unsigned int *total);
+    typedef CUresult (CUDAAPI *PFN_cuMemAlloc_v2000)(CUdeviceptr_v1 *dptr, unsigned int bytesize);
+    typedef CUresult (CUDAAPI *PFN_cuMemAllocPitch_v2000)(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes);
+    typedef CUresult (CUDAAPI *PFN_cuMemFree_v2000)(CUdeviceptr_v1 dptr);
+    typedef CUresult (CUDAAPI *PFN_cuMemGetAddressRange_v2000)(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr);
+    typedef CUresult (CUDAAPI *PFN_cuMemAllocHost_v2000)(void **pp, unsigned int bytesize);
+    typedef CUresult (CUDAAPI *PFN_cuMemHostGetDevicePointer_v2020)(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v2000)(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v2000)(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v2000)(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v2000)(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v2000)(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v2000)(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v2000)(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v2000)(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v2000)(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v2000)(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v2000)(const CUDA_MEMCPY2D_v1 *pCopy);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v2000)(const CUDA_MEMCPY2D_v1 *pCopy);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v2000)(const CUDA_MEMCPY3D_v1 *pCopy);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v2000)(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v2000)(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v3000)(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v2000)(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v2000)(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v2000)(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v2000)(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v2000)(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
+    typedef CUresult (CUDAAPI *PFN_cuArrayCreate_v2000)(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray);
+    typedef CUresult (CUDAAPI *PFN_cuArrayGetDescriptor_v2000)(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
+    typedef CUresult (CUDAAPI *PFN_cuArray3DCreate_v2000)(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray);
+    typedef CUresult (CUDAAPI *PFN_cuArray3DGetDescriptor_v2000)(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
+    typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress_v2000)(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes);
+    typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v2020)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch);
+    typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddress_v2000)(CUdeviceptr_v1 *pdptr, CUtexref hTexRef);
+    typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedPointer_v3000)(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
+    typedef CUresult (CUDAAPI *PFN_cuCtxDestroy_v2000)(CUcontext ctx);
+    typedef CUresult (CUDAAPI *PFN_cuCtxPopCurrent_v2000)(CUcontext *pctx);
+    typedef CUresult (CUDAAPI *PFN_cuCtxPushCurrent_v2000)(CUcontext ctx);
+    typedef CUresult (CUDAAPI *PFN_cuStreamDestroy_v2000)(CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuEventDestroy_v2000)(CUevent hEvent);
+    typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRelease_v7000)(CUdevice_v1 dev);
+    typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxReset_v7000)(CUdevice_v1 dev);
+    typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxSetFlags_v7000)(CUdevice_v1 dev, unsigned int flags);
+    typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10000)(CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10000_ptsz)(CUstream hStream); /* same signature as the non-_ptsz typedef above */
+    typedef CUresult (CUDAAPI *PFN_cuIpcOpenMemHandle_v4010)(CUdeviceptr_v2 *pdptr, CUipcMemHandle_v1 handle, unsigned int Flags);
+    typedef CUresult (CUDAAPI *PFN_cuGraphInstantiate_v10000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
+    typedef CUresult (CUDAAPI *PFN_cuGraphInstantiate_v11000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); /* parameter list is identical to the _v10000 typedef above */
+#endif
+
+    typedef CUresult (CUDAAPI *PFN_cuCheckpointProcessGetRestoreThreadId_v12080)(int pid, int *tid);
+    typedef CUresult (CUDAAPI *PFN_cuCheckpointProcessGetState_v12080)(int pid, CUprocessState *state);
+    typedef CUresult (CUDAAPI *PFN_cuCheckpointProcessLock_v12080)(int pid, CUcheckpointLockArgs *args);
+    typedef CUresult (CUDAAPI *PFN_cuCheckpointProcessCheckpoint_v12080)(int pid, CUcheckpointCheckpointArgs *args);
+    typedef CUresult (CUDAAPI *PFN_cuCheckpointProcessRestore_v12080)(int pid, CUcheckpointRestoreArgs *args);
+    typedef CUresult (CUDAAPI *PFN_cuCheckpointProcessUnlock_v12080)(int pid, CUcheckpointUnlockArgs *args);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // file guard
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaVDPAU.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaVDPAU.h
new file mode 100644
index 0000000000000000000000000000000000000000..97de57ae494d62ae176fc02ad3c0c3f4d43e1526
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaVDPAU.h
@@ -0,0 +1,282 @@
+/*
+ * Copyright 2010-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAVDPAU_H
+#define CUDAVDPAU_H
+
+#ifdef CUDA_FORCE_API_VERSION
+#error "CUDA_FORCE_API_VERSION is no longer supported."
+#endif /* CUDA_FORCE_API_VERSION */
+
+#define cuVDPAUCtxCreate cuVDPAUCtxCreate_v2 /* every later use of the name cuVDPAUCtxCreate expands to cuVDPAUCtxCreate_v2 */
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * \defgroup CUDA_VDPAU VDPAU Interoperability
+ * \ingroup CUDA_DRIVER
+ *
+ * ___MANBRIEF___ VDPAU interoperability functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the VDPAU interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Gets the CUDA device associated with a VDPAU device
+ *
+ * Returns in \p *pDevice the CUDA device associated with \p vdpDevice, if
+ * applicable.
+ *
+ * \param pDevice           - Returned CUDA device associated with vdpDevice
+ * \param vdpDevice         - A VdpDevice handle
+ * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate, ::cuVDPAUCtxCreate, ::cuGraphicsVDPAURegisterVideoSurface,
+ * ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
+ * ::cudaVDPAUGetDevice
+ */
+CUresult CUDAAPI cuVDPAUGetDevice(CUdevice *pDevice, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
+
+/**
+ * \brief Create a CUDA context for interoperability with VDPAU
+ *
+ * Creates a new CUDA context, initializes VDPAU interoperability, and
+ * associates the CUDA context with the calling thread. It must be called
+ * before performing any other VDPAU interoperability operations. It may fail
+ * if the needed VDPAU driver facilities are not available. For usage of the
+ * \p flags parameter, see ::cuCtxCreate().
+ *
+ * \param pCtx              - Returned CUDA context
+ * \param flags             - Options for CUDA context creation (see ::cuCtxCreate)
+ * \param device            - Device on which to create the context
+ * \param vdpDevice         - The VdpDevice to interoperate with
+ * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate, ::cuGraphicsVDPAURegisterVideoSurface,
+ * ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
+ * ::cuVDPAUGetDevice
+ */
+CUresult CUDAAPI cuVDPAUCtxCreate(CUcontext *pCtx, unsigned int flags, CUdevice device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress); /* the name is #defined to cuVDPAUCtxCreate_v2 earlier in this header */
+
+/**
+ * \brief Registers a VDPAU VdpVideoSurface object
+ *
+ * Registers the VdpVideoSurface specified by \p vdpSurface for access by
+ * CUDA. A handle to the registered object is returned as \p pCudaResource.
+ * The surface's intended usage is specified using \p flags, as follows:
+ *
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * The VdpVideoSurface is presented as an array of subresources that may be
+ * accessed using pointers returned by ::cuGraphicsSubResourceGetMappedArray.
+ * The exact number of valid \p arrayIndex values depends on the VDPAU surface
+ * format. The mapping is shown in the table below. \p mipLevel must be 0.
+ *
+ * \htmlonly
+ * <table>
+ * <tr><th>VdpChromaType                               </th><th>arrayIndex</th><th>Size     </th><th>Format</th><th>Content            </th></tr>
+ * <tr><td rowspan="4" valign="top">VDP_CHROMA_TYPE_420</td><td>0         </td><td>w   x h/2</td><td>R8    </td><td>Top-field luma     </td></tr>
+ * <tr>                                                     <td>1         </td><td>w   x h/2</td><td>R8    </td><td>Bottom-field luma  </td></tr>
+ * <tr>                                                     <td>2         </td><td>w/2 x h/4</td><td>R8G8  </td><td>Top-field chroma   </td></tr>
+ * <tr>                                                     <td>3         </td><td>w/2 x h/4</td><td>R8G8  </td><td>Bottom-field chroma</td></tr>
+ * <tr><td rowspan="4" valign="top">VDP_CHROMA_TYPE_422</td><td>0         </td><td>w   x h/2</td><td>R8    </td><td>Top-field luma     </td></tr>
+ * <tr>                                                     <td>1         </td><td>w   x h/2</td><td>R8    </td><td>Bottom-field luma  </td></tr>
+ * <tr>                                                     <td>2         </td><td>w/2 x h/2</td><td>R8G8  </td><td>Top-field chroma   </td></tr>
+ * <tr>                                                     <td>3         </td><td>w/2 x h/2</td><td>R8G8  </td><td>Bottom-field chroma</td></tr>
+ * </table>
+ * \endhtmlonly
+ *
+ * \latexonly
+ * \begin{tabular}{|l|l|l|l|l|}
+ * \hline
+ * VdpChromaType          & arrayIndex & Size      & Format & Content             \\
+ * \hline
+ * VDP\_CHROMA\_TYPE\_420 & 0          & w x h/2   & R8     & Top-field luma      \\
+ *                        & 1          & w x h/2   & R8     & Bottom-field luma   \\
+ *                        & 2          & w/2 x h/4 & R8G8   & Top-field chroma    \\
+ *                        & 3          & w/2 x h/4 & R8G8   & Bottom-field chroma \\
+ * \hline
+ * VDP\_CHROMA\_TYPE\_422 & 0          & w x h/2   & R8     & Top-field luma      \\
+ *                        & 1          & w x h/2   & R8     & Bottom-field luma   \\
+ *                        & 2          & w/2 x h/2 & R8G8   & Top-field chroma    \\
+ *                        & 3          & w/2 x h/2 & R8G8   & Bottom-field chroma \\
+ * \hline
+ * \end{tabular}
+ * \endlatexonly
+ *
+ * \param pCudaResource - Pointer to the returned object handle
+ * \param vdpSurface    - The VdpVideoSurface to be registered
+ * \param flags         - Map flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate, ::cuVDPAUCtxCreate,
+ * ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
+ * ::cuVDPAUGetDevice,
+ * ::cudaGraphicsVDPAURegisterVideoSurface
+ */
+CUresult CUDAAPI cuGraphicsVDPAURegisterVideoSurface(CUgraphicsResource *pCudaResource, VdpVideoSurface vdpSurface, unsigned int flags);
+
+/**
+ * \brief Registers a VDPAU VdpOutputSurface object
+ *
+ * Registers the VdpOutputSurface specified by \p vdpSurface for access by
+ * CUDA. A handle to the registered object is returned as \p pCudaResource.
+ * The surface's intended usage is specified using \p flags, as follows:
+ *
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * The VdpOutputSurface is presented as an array of subresources that may be
+ * accessed using pointers returned by ::cuGraphicsSubResourceGetMappedArray.
+ * The exact number of valid \p arrayIndex values depends on the VDPAU surface
+ * format. The mapping is shown in the table below. \p mipLevel must be 0.
+ * For both formats listed in the table, the only \p arrayIndex is 0 and it
+ * covers the entire surface.
+ *
+ * \htmlonly
+ * <table>
+ * <tr><th>VdpRGBAFormat              </th><th>arrayIndex</th><th>Size </th><th>Format </th><th>Content       </th></tr>
+ * <tr><td>VDP_RGBA_FORMAT_B8G8R8A8   </td><td>0         </td><td>w x h</td><td>ARGB8  </td><td>Entire surface</td></tr>
+ * <tr><td>VDP_RGBA_FORMAT_R10G10B10A2</td><td>0         </td><td>w x h</td><td>A2BGR10</td><td>Entire surface</td></tr>
+ * </table>
+ * \endhtmlonly
+ *
+ * \latexonly
+ * \begin{tabular}{|l|l|l|l|l|}
+ * \hline
+ * VdpRGBAFormat                  & arrayIndex & Size  & Format  & Content        \\
+ * \hline
+ * VDP\_RGBA\_FORMAT\_B8G8R8A8    & 0          & w x h & ARGB8   & Entire surface \\
+ * VDP\_RGBA\_FORMAT\_R10G10B10A2 & 0          & w x h & A2BGR10 & Entire surface \\
+ * \hline
+ * \end{tabular}
+ * \endlatexonly
+ *
+ * \param pCudaResource - Pointer to the returned object handle
+ * \param vdpSurface    - The VdpOutputSurface to be registered
+ * \param flags         - Map flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate, ::cuVDPAUCtxCreate,
+ * ::cuGraphicsVDPAURegisterVideoSurface, ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
+ * ::cuVDPAUGetDevice,
+ * ::cudaGraphicsVDPAURegisterOutputSurface
+ */
+CUresult CUDAAPI cuGraphicsVDPAURegisterOutputSurface(CUgraphicsResource *pCudaResource, VdpOutputSurface vdpSurface, unsigned int flags);
+
+/** @} */ /* END CUDA_VDPAU */
+
+
+/*
+ * Internal builds only: remove any cuVDPAUCtxCreate macro definition that is
+ * in effect at this point (presumably a versioned-name redirect defined
+ * earlier in this header -- confirm) and declare the function under its
+ * plain, unsuffixed name.
+ */
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    #undef cuVDPAUCtxCreate
+
+    CUresult CUDAAPI cuVDPAUCtxCreate(CUcontext *pCtx, unsigned int flags, CUdevice device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
+#endif /* __CUDA_API_VERSION_INTERNAL */
+
+/*
+ * Close the C++-only brace block (presumably the extern "C" linkage block
+ * opened under the matching __cplusplus guard earlier in this header).
+ * A brace-enclosed block at this scope ends with '}' alone; the former
+ * trailing ';' was a stray empty declaration that pedantic C++03 builds warn
+ * about.
+ */
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaVDPAUTypedefs.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaVDPAUTypedefs.h
new file mode 100644
index 0000000000000000000000000000000000000000..2bfd148632827d222548be49b3a2ffb7caa1c4dc
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaVDPAUTypedefs.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2020-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAVDPAUTYPEDEFS_H
+#define CUDAVDPAUTYPEDEFS_H
+
+// Dependent includes for cudaVDPAU.h
+#include <vdpau/vdpau.h>
+
+#include <cudaVDPAU.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/*
+ * Macros for the latest version for each driver function in cudaVDPAU.h
+ *
+ * Each unversioned PFN_<function> name expands to one of the versioned
+ * function-pointer typedefs (PFN_<function>_v<NNNN>) declared further down in
+ * this header, so code written against the unversioned name uses whichever
+ * version is selected here.
+ */
+#define PFN_cuVDPAUGetDevice  PFN_cuVDPAUGetDevice_v3010
+#define PFN_cuVDPAUCtxCreate  PFN_cuVDPAUCtxCreate_v3020
+#define PFN_cuGraphicsVDPAURegisterVideoSurface  PFN_cuGraphicsVDPAURegisterVideoSurface_v3010
+#define PFN_cuGraphicsVDPAURegisterOutputSurface  PFN_cuGraphicsVDPAURegisterOutputSurface_v3010
+
+
+
+/**
+ * Type definitions for functions defined in cudaVDPAU.h
+ *
+ * One function-pointer type per driver entry point. Every type returns
+ * CUresult and is declared with the CUDAAPI calling-convention macro. The
+ * _v<NNNN> suffix is the version tag that the PFN_* macros in this header
+ * select between.
+ */
+typedef CUresult (CUDAAPI *PFN_cuVDPAUGetDevice_v3010)(CUdevice_v1 *pDevice, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
+typedef CUresult (CUDAAPI *PFN_cuVDPAUCtxCreate_v3020)(CUcontext *pCtx, unsigned int flags, CUdevice_v1 device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsVDPAURegisterVideoSurface_v3010)(CUgraphicsResource *pCudaResource, VdpVideoSurface vdpSurface, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsVDPAURegisterOutputSurface_v3010)(CUgraphicsResource *pCudaResource, VdpOutputSurface vdpSurface, unsigned int flags);
+
+/*
+ * Type definitions for older versioned functions in cudaVDPAU.h
+ *
+ * Only compiled when __CUDA_API_VERSION_INTERNAL is defined. The _v3010
+ * pointer type has the same parameter list as PFN_cuVDPAUCtxCreate_v3020.
+ */
+#if defined(__CUDA_API_VERSION_INTERNAL)
+typedef CUresult (CUDAAPI *PFN_cuVDPAUCtxCreate_v3010)(CUcontext *pCtx, unsigned int flags, CUdevice_v1 device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
+#endif
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // file guard
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_awbarrier.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_awbarrier.h
new file mode 100644
index 0000000000000000000000000000000000000000..12fd878dd10d9f18ad944a0d62ae1caba123fd06
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_awbarrier.h
@@ -0,0 +1,280 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CUDA_AWBARRIER_H_
+# define _CUDA_AWBARRIER_H_
+
+# include "cuda_awbarrier_primitives.h"
+
+# if !defined(_CUDA_AWBARRIER_SM_TARGET)
+#  error This file requires compute capability 7.0 or greater.
+# endif
+
+# if !defined(_CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER)
+#  error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
+             -std=c++11 compiler option.
+# endif
+
+_CUDA_AWBARRIER_BEGIN_NAMESPACE
+
+/*
+ * Arrive/wait barrier object.
+ *
+ * Holds a single 64-bit barrier word and forwards every operation to the
+ * helper functions in _CUDA_AWBARRIER_INTERNAL_NAMESPACE. The object is
+ * default-constructible but not copyable; the barrier word is set up by the
+ * friend function init() and invalidated by the friend function inval().
+ */
+class awbarrier {
+public:
+    /*
+     * Opaque 64-bit token produced by arrive() / arrive_and_drop() and
+     * consumed by wait(), timed_wait() and try_wait(). Only awbarrier (a
+     * friend) can construct one from a raw value.
+     */
+    class arrival_token {
+    public:
+        arrival_token() = default;
+        ~arrival_token() = default;
+        // Pending count decoded from the token by the internal helper
+        // (shifted right by 15 when __CUDA_ARCH__ is below 900).
+        _CUDA_AWBARRIER_QUALIFIER uint32_t pending_count() const;
+    private:
+        _CUDA_AWBARRIER_QUALIFIER arrival_token(uint64_t token);
+        uint64_t token;
+        friend awbarrier;
+    };
+    awbarrier() = default;
+    awbarrier(const awbarrier&) = delete;
+    awbarrier& operator=(const awbarrier&) = delete;
+    ~awbarrier() = default;
+
+    // Arrive on the barrier and return the token for the arrival.
+    _CUDA_AWBARRIER_QUALIFIER arrival_token arrive();
+    // Same as arrive(), but uses the "drop" variants of the internal helpers.
+    _CUDA_AWBARRIER_QUALIFIER arrival_token arrive_and_drop();
+    // Poll the barrier until the test succeeds (returns true) or the elapsed
+    // clock64() cycles reach hint_cycles (returns false).
+    _CUDA_AWBARRIER_QUALIFIER bool timed_wait(arrival_token token, uint32_t hint_cycles);
+    // As timed_wait(), but tests against a phase parity instead of a token.
+    _CUDA_AWBARRIER_QUALIFIER bool timed_wait_parity(bool phase, uint32_t hint_cycles);
+    // Repeat timed_wait(token, ~0u) until it returns true.
+    _CUDA_AWBARRIER_QUALIFIER void wait(arrival_token token);
+    // Equivalent to wait(arrive()).
+    _CUDA_AWBARRIER_QUALIFIER void arrive_and_wait();
+    // Forward to the internal awbarrier_try_wait helper.
+    _CUDA_AWBARRIER_QUALIFIER bool try_wait(arrival_token token, uint32_t maxSleepNanosec);
+    // Forward to the internal awbarrier_try_wait_parity helper.
+    _CUDA_AWBARRIER_QUALIFIER bool try_wait_parity(bool phase, uint32_t maxSleepNanosec);
+    // Returns _CUDA_AWBARRIER_MAX_COUNT.
+    _CUDA_AWBARRIER_STATIC_QUALIFIER __host__ constexpr uint32_t max();
+
+private:
+    // Raw 64-bit barrier word handed to the internal helpers.
+    uint64_t barrier;
+    friend _CUDA_AWBARRIER_QUALIFIER void init(awbarrier* barrier, uint32_t expected_count);
+    friend _CUDA_AWBARRIER_QUALIFIER void inval(awbarrier* barrier);
+    friend class pipeline;
+};
+
+/*
+ * Return the pending count carried by this token.
+ *
+ * The raw value comes from the internal awbarrier_token_pending_count helper.
+ * When __CUDA_ARCH__ is 900 or higher it is returned as-is; otherwise it is
+ * shifted right by 15 bits first.
+ */
+_CUDA_AWBARRIER_QUALIFIER
+uint32_t awbarrier::arrival_token::pending_count() const
+{
+    const uint32_t raw_count = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(this->token);
+#if (__CUDA_ARCH__ >= 900)
+    return raw_count;
+#else
+    return raw_count >> 15;
+#endif
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+awbarrier::arrival_token::arrival_token(uint64_t token)
+    : token(token)
+{
+}
+
+/*
+ * Initialise the barrier word of *barrier for expected_count arrivals.
+ *
+ * expected_count must be in [1, _CUDA_AWBARRIER_MAX_COUNT] and barrier must
+ * satisfy __isShared(); both are checked only through _CUDA_AWBARRIER_ASSERT.
+ * When __CUDA_ARCH__ is 900 or higher the count is passed to the internal
+ * helper unchanged; otherwise it is passed as
+ * (expected_count << 15) + expected_count.
+ */
+_CUDA_AWBARRIER_QUALIFIER
+void init(awbarrier* barrier, uint32_t expected_count)
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+    _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count <= _CUDA_AWBARRIER_MAX_COUNT);
+
+    uint64_t* const raw_word = &barrier->barrier;
+
+#if (__CUDA_ARCH__ >= 900)
+    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(raw_word, expected_count);
+#else
+    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(raw_word, (expected_count << 15) + expected_count);
+#endif
+}
+
+/*
+ * Invalidate the barrier word of *barrier by forwarding it to the internal
+ * awbarrier_inval helper. barrier must satisfy __isShared() (checked only
+ * through _CUDA_AWBARRIER_ASSERT).
+ */
+_CUDA_AWBARRIER_QUALIFIER
+void inval(awbarrier* barrier)
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+
+    uint64_t* const raw_word = &barrier->barrier;
+    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(raw_word);
+}
+
+/*
+ * Arrive on the barrier and return the token for this arrival.
+ *
+ * When __CUDA_ARCH__ is below 900 the token comes from a no-complete arrival
+ * with a count of 1 << 15, and a regular arrival is then issued whose result
+ * is discarded. Otherwise the token is the result of the regular arrival.
+ */
+_CUDA_AWBARRIER_QUALIFIER
+awbarrier::arrival_token awbarrier::arrive()
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+
+#if (__CUDA_ARCH__ < 900)
+    const uint32_t no_complete_count = 1 << 15;
+    const uint64_t raw_token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete<false>(&this->barrier, no_complete_count);
+    (void)_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(&this->barrier);
+#else
+    const uint64_t raw_token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(&this->barrier);
+#endif
+
+    return arrival_token(raw_token);
+}
+
+/*
+ * Arrive on the barrier using the "drop" variants of the internal helpers
+ * and return the token for this arrival.
+ *
+ * When __CUDA_ARCH__ is below 900 the token comes from a no-complete
+ * arrive-and-drop with a count of 1 << 15, and a regular arrive-and-drop is
+ * then issued whose result is discarded. Otherwise the token is the result
+ * of the regular arrive-and-drop.
+ */
+_CUDA_AWBARRIER_QUALIFIER
+awbarrier::arrival_token awbarrier::arrive_and_drop()
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+
+#if (__CUDA_ARCH__ < 900)
+    const uint32_t no_complete_count = 1 << 15;
+    const uint64_t raw_token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete<true>(&this->barrier, no_complete_count);
+    (void)_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(&this->barrier);
+#else
+    const uint64_t raw_token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(&this->barrier);
+#endif
+
+    return arrival_token(raw_token);
+}
+
+/*
+ * Poll the barrier for the phase identified by token.
+ *
+ * Returns true as soon as awbarrier_test_wait reports success, and false once
+ * the clock64() cycles elapsed since the start of polling reach hint_cycles.
+ * After more than 1024 elapsed cycles each failed poll is followed by a
+ * __nanosleep() that starts at 32 ns and doubles while it is below 1 << 20 ns.
+ */
+_CUDA_AWBARRIER_QUALIFIER
+bool awbarrier::timed_wait(arrival_token token, uint32_t hint_cycles)
+{
+    constexpr uint64_t busy_poll_limit_cycles = 1024;
+    constexpr uint32_t backoff_cap_ns = 1 << 20;
+
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+
+    // One test up front, before any clock is read.
+    if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) {
+        return true;
+    }
+
+    const uint64_t begin_cycles = clock64();
+    uint32_t backoff_ns = 32;
+    for (uint64_t elapsed = 0; elapsed < hint_cycles; elapsed = clock64() - begin_cycles) {
+        if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) {
+            return true;
+        }
+
+        // Past the busy-poll window: sleep between polls with a doubling delay.
+        if (elapsed > busy_poll_limit_cycles) {
+            __nanosleep(backoff_ns);
+            if (backoff_ns < backoff_cap_ns) {
+                backoff_ns *= 2;
+            }
+        }
+    }
+
+    return false;
+}
+
+/*
+ * Poll the barrier for the given phase parity.
+ *
+ * Returns true as soon as awbarrier_test_wait_parity reports success, and
+ * false once the clock64() cycles elapsed since the start of polling reach
+ * hint_cycles. After more than 1024 elapsed cycles each failed poll is
+ * followed by a __nanosleep() that starts at 32 ns and doubles while it is
+ * below 1 << 20 ns.
+ */
+_CUDA_AWBARRIER_QUALIFIER
+bool awbarrier::timed_wait_parity(bool phase, uint32_t hint_cycles)
+{
+    constexpr uint64_t busy_poll_limit_cycles = 1024;
+    constexpr uint32_t backoff_cap_ns = 1 << 20;
+
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+
+    // One test up front, before any clock is read.
+    if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait_parity(&this->barrier, phase)) {
+        return true;
+    }
+
+    const uint64_t begin_cycles = clock64();
+    uint32_t backoff_ns = 32;
+    for (uint64_t elapsed = 0; elapsed < hint_cycles; elapsed = clock64() - begin_cycles) {
+        if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait_parity(&this->barrier, phase)) {
+            return true;
+        }
+
+        // Past the busy-poll window: sleep between polls with a doubling delay.
+        if (elapsed > busy_poll_limit_cycles) {
+            __nanosleep(backoff_ns);
+            if (backoff_ns < backoff_cap_ns) {
+                backoff_ns *= 2;
+            }
+        }
+    }
+
+    return false;
+}
+
+/*
+ * Forward token and maxSleepNanosec to the internal awbarrier_try_wait
+ * helper for this barrier and return its result.
+ */
+_CUDA_AWBARRIER_QUALIFIER
+bool awbarrier::try_wait(arrival_token token, uint32_t maxSleepNanosec)
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+
+    const bool completed = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_try_wait(&this->barrier, token.token, maxSleepNanosec);
+    return completed;
+}
+
+/*
+ * Forward phase and maxSleepNanosec to the internal awbarrier_try_wait_parity
+ * helper for this barrier and return its result.
+ */
+_CUDA_AWBARRIER_QUALIFIER
+bool awbarrier::try_wait_parity(bool phase, uint32_t maxSleepNanosec)
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+
+    const bool completed = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_try_wait_parity(&this->barrier, phase, maxSleepNanosec);
+    return completed;
+}
+
+/*
+ * Block until the phase identified by token completes, by calling
+ * timed_wait() with the largest 32-bit cycle hint until it returns true.
+ */
+_CUDA_AWBARRIER_QUALIFIER
+void awbarrier::wait(arrival_token token)
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+
+    bool completed;
+    do {
+        completed = timed_wait(token, ~0u);
+    } while (!completed);
+}
+
+/* Arrive on the barrier, then wait on the token that arrival returned. */
+_CUDA_AWBARRIER_QUALIFIER
+void awbarrier::arrive_and_wait()
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+
+    const arrival_token own_arrival = this->arrive();
+    this->wait(own_arrival);
+}
+
+/* Compile-time constant: the value of _CUDA_AWBARRIER_MAX_COUNT. */
+_CUDA_AWBARRIER_QUALIFIER __host__
+constexpr uint32_t awbarrier::max()
+{
+    return static_cast<uint32_t>(_CUDA_AWBARRIER_MAX_COUNT);
+}
+
+_CUDA_AWBARRIER_END_NAMESPACE
+
+#endif /* !_CUDA_AWBARRIER_H_ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_awbarrier_helpers.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_awbarrier_helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c58346fe78c59329aca138ebc92add9015c005c
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_awbarrier_helpers.h
@@ -0,0 +1,365 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CUDA_AWBARRIER_HELPERS_H_
+#define _CUDA_AWBARRIER_HELPERS_H_
+
+/*
+ * Namespace plumbing.
+ *
+ * _CUDA_AWBARRIER_NAMESPACE names the public namespace
+ * (nvcuda::experimental); the BEGIN/END pair opens and closes it.
+ * The *_INTERNAL_* variants do the same for the nested __awbarrier_internal
+ * namespace: BEGIN opens all three levels and END closes all three.
+ */
+#define _CUDA_AWBARRIER_NAMESPACE       nvcuda::experimental
+#define _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace nvcuda { namespace experimental {
+#define _CUDA_AWBARRIER_END_NAMESPACE   } }
+
+#define _CUDA_AWBARRIER_INTERNAL_NAMESPACE       _CUDA_AWBARRIER_NAMESPACE::__awbarrier_internal
+#define _CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace __awbarrier_internal {
+#define _CUDA_AWBARRIER_END_INTERNAL_NAMESPACE   } _CUDA_AWBARRIER_END_NAMESPACE
+
+/*
+ * Default function qualifiers. Both macros may be pre-defined by the
+ * including code; the defaults below apply only when they are not.
+ */
+# if !defined(_CUDA_AWBARRIER_QUALIFIER)
+#  define _CUDA_AWBARRIER_QUALIFIER inline __device__
+# endif
+# if !defined(_CUDA_AWBARRIER_STATIC_QUALIFIER)
+#  define _CUDA_AWBARRIER_STATIC_QUALIFIER static inline __device__
+#endif
+
+/*
+ * Select _CUDA_AWBARRIER_SM_TARGET from __CUDA_ARCH__:
+ *   >= 900 -> _CUDA_AWBARRIER_SM_90
+ *   >= 800 -> _CUDA_AWBARRIER_SM_80
+ *   >= 700 -> _CUDA_AWBARRIER_SM_70
+ * If __CUDA_ARCH__ is defined but below 700, the macro is left undefined.
+ * If __CUDA_ARCH__ is not defined at all, _CUDA_AWBARRIER_SM_70 is used.
+ */
+#if defined(__CUDA_ARCH__)
+#if (__CUDA_ARCH__ >= 900)
+# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_90
+#elif  (__CUDA_ARCH__ >= 800)
+# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_80
+#elif (__CUDA_ARCH__ >= 700)
+# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_70
+#endif
+#else
+# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_70
+#endif
+
+/* Upper bound for barrier counts: 2^14 - 1 = 16383. */
+#define _CUDA_AWBARRIER_MAX_COUNT ((1 << 14) - 1)
+
+/*
+ * Defined when compiling as C++11 or later, or with an MSVC whose _MSC_VER
+ * is at least 1900.
+ */
+#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900)))
+# define _CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER
+#endif
+
+/*
+ * Debug switch: unless the includer sets _CUDA_AWBARRIER_DEBUG, it defaults
+ * to 1 when __CUDACC_DEBUG__ is defined and to 0 otherwise.
+ */
+#if !defined(_CUDA_AWBARRIER_DEBUG)
+# if defined(__CUDACC_DEBUG__)
+#  define _CUDA_AWBARRIER_DEBUG 1
+# else
+#  define _CUDA_AWBARRIER_DEBUG 0
+# endif
+#endif
+
+/*
+ * Assertion macros.
+ *
+ * With _CUDA_AWBARRIER_DEBUG == 1 and NDEBUG not defined, both macros expand
+ * to assert() calls (<cassert> is included unless compiling under NVRTC).
+ * Otherwise _CUDA_AWBARRIER_ASSERT expands to nothing and
+ * _CUDA_AWBARRIER_ABORT expands to __trap().
+ *
+ * Note that the non-empty expansions already end in ';'.
+ */
+#if defined(_CUDA_AWBARRIER_DEBUG) && (_CUDA_AWBARRIER_DEBUG == 1) && !defined(NDEBUG)
+# if !defined(__CUDACC_RTC__)
+#  include <cassert>
+# endif
+# define _CUDA_AWBARRIER_ASSERT(x) assert((x));
+# define _CUDA_AWBARRIER_ABORT() assert(0);
+#else
+# define _CUDA_AWBARRIER_ASSERT(x)
+# define _CUDA_AWBARRIER_ABORT() __trap();
+#endif
+
+/*
+ * Fixed-width integer names. Under NVRTC (__CUDACC_RTC__) they are declared
+ * by hand instead of including <stdint.h> (presumably because system headers
+ * are not available there -- confirm); otherwise <stdint.h> provides them.
+ */
+#if defined(__CUDACC_RTC__)
+typedef unsigned short     uint16_t;
+typedef unsigned int       uint32_t;
+typedef unsigned long long uint64_t;
+typedef uint64_t           uintptr_t;
+#else
+# include <stdint.h>
+#endif
+
+// implicitly provided by NVRTC
+#ifndef __CUDACC_RTC__
+#include <nv/target>
+#endif /* !defined(__CUDACC_RTC__) */
+
+/*
+ * 64-bit aliases for a barrier word and an arrival token. Neither name is
+ * used again in this header; presumably they are consumed by headers that
+ * include this one -- confirm.
+ */
+typedef uint64_t __mbarrier_t;
+typedef uint64_t __mbarrier_token_t;
+
+_CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE
+
+/*
+ * Declared here, defined elsewhere. The helpers below pass its 32-bit result
+ * as the address operand of the mbarrier.*.shared PTX instructions.
+ */
+extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void *);
+
+/*
+ * View of the 64-bit barrier word as two 32-bit counters. Used only by the
+ * NV_PROVIDES_SM_70 code paths in this header, which update `expected` and
+ * `pending` with block-scope atomics instead of mbarrier instructions.
+ */
+union AWBarrier {
+    struct {
+        uint32_t expected;
+        uint32_t pending;
+    } split;
+    uint64_t raw;
+};
+
+/*
+ * Initialise the barrier word at `barrier` for `expected_count` arrivals.
+ *
+ * Asserts (when assertions are enabled) that `barrier` satisfies __isShared()
+ * and that expected_count is in (0, 1 << 29).
+ *
+ * NV_PROVIDES_SM_80 path: issues mbarrier.init.shared.b64 with the count.
+ * NV_PROVIDES_SM_70 path: stores 0x40000000 - expected_count into
+ * split.expected and 0x80000000 - expected_count into split.pending.
+ */
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+void awbarrier_init(uint64_t* barrier, uint32_t expected_count) {
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+    _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29));
+
+    NV_IF_TARGET(NV_PROVIDES_SM_80,
+        asm volatile ("mbarrier.init.shared.b64 [%0], %1;"
+                :
+                : "r"(__nvvm_get_smem_pointer(barrier)), "r"(expected_count)
+                : "memory");
+        return;
+    )
+    NV_IF_TARGET(NV_PROVIDES_SM_70,
+        AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
+
+        awbarrier->split.expected = 0x40000000 - expected_count;
+        awbarrier->split.pending = 0x80000000 - expected_count;
+        return;
+    )
+}
+
+/*
+ * Invalidate the barrier word at `barrier`.
+ *
+ * NV_PROVIDES_SM_80 path: issues mbarrier.inval.shared.b64.
+ * On any other target the function returns without touching the word.
+ */
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+void awbarrier_inval(uint64_t* barrier) {
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+
+    NV_IF_TARGET(NV_PROVIDES_SM_80,
+        asm volatile ("mbarrier.inval.shared.b64 [%0];"
+                :
+                : "r"(__nvvm_get_smem_pointer(barrier))
+                : "memory");
+        return;
+    )
+    return;
+}
+
+/*
+ * Extract the pending count encoded in an arrival token.
+ *
+ * NV_PROVIDES_SM_80 path: result of mbarrier.pending_count.b64 on the token.
+ * NV_PROVIDES_SM_70 path: takes the upper 32 bits of the token, masks off
+ * bit 31 and returns 0x80000000 minus that value.
+ */
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+uint32_t awbarrier_token_pending_count(uint64_t token) {
+    NV_IF_TARGET(NV_PROVIDES_SM_80,
+        uint32_t __pending_count;
+
+        asm ("mbarrier.pending_count.b64 %0, %1;"
+                : "=r"(__pending_count)
+                : "l"(token));
+        return __pending_count;
+    )
+    NV_IF_TARGET(NV_PROVIDES_SM_70,
+        const uint32_t pending = token >> 32;
+        return 0x80000000 - (pending & 0x7fffffff);
+    )
+}
+
+/*
+ * Arrive on the barrier (and, when _Drop is true, also drop out of it) and
+ * return the arrival token.
+ *
+ * NV_PROVIDES_SM_80 path: issues mbarrier.arrive_drop.shared.b64 when _Drop
+ * is true, mbarrier.arrive.shared.b64 otherwise, and returns the token the
+ * instruction produced.
+ *
+ * NV_PROVIDES_SM_70 path (software emulation on the AWBarrier view):
+ *   1. spin while the low 31 bits of split.pending are zero;
+ *   2. when _Drop is true, atomically add 1 to split.expected;
+ *   3. fence, then atomically add 1 to split.pending;
+ *   4. if that increment flipped bit 31 of pending ("reset"), fence, read
+ *      split.expected, clear its bit 30, set bit 30 again if bit 29 is set,
+ *      and atomically add the result to split.pending;
+ *   5. return the pre-increment pending value in the upper 32 bits.
+ */
+template<bool _Drop>
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+uint64_t awbarrier_arrive_drop(uint64_t* barrier) {
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+
+    NV_IF_TARGET(NV_PROVIDES_SM_80,
+        uint64_t token;
+
+        if (_Drop) {
+            asm volatile ("mbarrier.arrive_drop.shared.b64 %0, [%1];"
+                    : "=l"(token)
+                    : "r"(__nvvm_get_smem_pointer(barrier))
+                    : "memory");
+        } else {
+            asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];"
+                    : "=l"(token)
+                    : "r"(__nvvm_get_smem_pointer(barrier))
+                    : "memory");
+        }
+
+        return token;
+    )
+    NV_IF_TARGET(NV_PROVIDES_SM_70,
+        AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
+
+        while ((*reinterpret_cast<volatile uint32_t*>(&awbarrier->split.pending) & 0x7fffffff) == 0);
+
+        if (_Drop) {
+            (void)atomicAdd_block(&awbarrier->split.expected, 1);
+        }
+
+        __threadfence_block();
+
+        const uint32_t old_pending = atomicAdd_block(&awbarrier->split.pending, 1);
+        const uint32_t new_pending = old_pending + 1;
+        const bool reset = (old_pending ^ new_pending) & 0x80000000;
+
+        if (reset) {
+            __threadfence_block();
+
+            uint32_t new_expected = *reinterpret_cast<volatile uint32_t*>(&awbarrier->split.expected);
+            new_expected &= ~0x40000000;
+            if (new_expected & 0x20000000) {
+                new_expected |= 0x40000000;
+            }
+            atomicAdd_block(&awbarrier->split.pending, new_expected);
+        }
+
+        return static_cast<uint64_t>(old_pending) << 32;
+    )
+}
+
+/*
+ * Arrive on the barrier `count` times using the noComplete variants (and,
+ * when _Drop is true, also drop) and return the arrival token.
+ *
+ * Asserts (when assertions are enabled) that count is in (0, 1 << 29).
+ *
+ * NV_PROVIDES_SM_80 path: issues mbarrier.arrive_drop.noComplete.shared.b64
+ * when _Drop is true, mbarrier.arrive.noComplete.shared.b64 otherwise.
+ *
+ * NV_PROVIDES_SM_70 path: spins while the low 31 bits of split.pending are
+ * zero, adds `count` to split.expected when _Drop is true, then atomically
+ * adds `count` to split.pending and returns the pre-add value in the upper
+ * 32 bits. Unlike awbarrier_arrive_drop, this path has no reset handling.
+ */
+template<bool _Drop>
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+uint64_t awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count) {
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+    _CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 29));
+
+    NV_IF_TARGET(NV_PROVIDES_SM_80,
+        uint64_t token;
+
+        if (_Drop) {
+            asm volatile ("mbarrier.arrive_drop.noComplete.shared.b64 %0, [%1], %2;"
+                    : "=l"(token)
+                    : "r"(__nvvm_get_smem_pointer(barrier)), "r"(count)
+                    : "memory");
+        } else {
+            asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;"
+                    : "=l"(token)
+                    : "r"(__nvvm_get_smem_pointer(barrier)), "r"(count)
+                    : "memory");
+        }
+
+        return token;
+    )
+    NV_IF_TARGET(NV_PROVIDES_SM_70,
+        AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
+
+        while ((*reinterpret_cast<volatile uint32_t*>(&awbarrier->split.pending) & 0x7fffffff) == 0);
+
+        if (_Drop) {
+            (void)atomicAdd_block(&awbarrier->split.expected, count);
+        }
+
+        return static_cast<uint64_t>(atomicAdd_block(&awbarrier->split.pending, count)) << 32;
+    )
+}
+
+/*
+ * Non-blocking test for the phase identified by `token`.
+ *
+ * NV_PROVIDES_SM_80 path: runs mbarrier.test_wait.shared.b64 and converts
+ * the resulting predicate to bool.
+ * NV_PROVIDES_SM_70 path: returns true when bit 31 of the current
+ * split.pending differs from bit 31 of the token's upper 32 bits.
+ */
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+bool awbarrier_test_wait(uint64_t* barrier, uint64_t token) {
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+
+    NV_IF_TARGET(NV_PROVIDES_SM_80,
+        uint32_t __wait_complete;
+
+        asm volatile ("{"
+                "    .reg .pred %%p;"
+                "    mbarrier.test_wait.shared.b64 %%p, [%1], %2;"
+                "    selp.b32 %0, 1, 0, %%p;"
+                "}"
+                : "=r"(__wait_complete)
+                : "r"(__nvvm_get_smem_pointer(barrier)), "l"(token)
+                : "memory");
+        return bool(__wait_complete);
+    )
+    NV_IF_TARGET(NV_PROVIDES_SM_70,
+        volatile AWBarrier* awbarrier = reinterpret_cast<volatile AWBarrier*>(barrier);
+
+        return ((token >> 32) ^ awbarrier->split.pending) & 0x80000000;
+    )
+}
+
+/*
+ * Non-blocking test against a phase parity.
+ *
+ * NV_PROVIDES_SM_90 path: runs mbarrier.test_wait.parity.shared.b64 with
+ * phase_parity converted to a 32-bit operand and returns the predicate.
+ * On any other target there is no implementation: the function executes
+ * _CUDA_AWBARRIER_ABORT() and then returns false.
+ */
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+bool awbarrier_test_wait_parity(uint64_t* barrier, bool phase_parity) {
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+    
+    NV_IF_TARGET(NV_PROVIDES_SM_90,
+        uint32_t __wait_complete = 0;
+
+        asm volatile ("{"
+                    ".reg .pred %%p;"
+                    "mbarrier.test_wait.parity.shared.b64 %%p, [%1], %2;"
+                    "selp.b32 %0, 1, 0, %%p;"
+                    "}"
+                : "=r"(__wait_complete)
+                : "r"(__nvvm_get_smem_pointer(barrier)), "r"(static_cast<uint32_t>(phase_parity))
+                : "memory");
+
+        return __wait_complete;
+    )
+    _CUDA_AWBARRIER_ABORT()
+    return false;
+}
+
+/*
+ * Try-wait for the phase identified by `token`.
+ *
+ * NV_PROVIDES_SM_90 path: runs mbarrier.try_wait.shared.b64 with the token
+ * and max_sleep_nanosec as operands and returns the resulting predicate.
+ * On any other target there is no implementation: the function executes
+ * _CUDA_AWBARRIER_ABORT() and then returns false.
+ */
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+bool awbarrier_try_wait(uint64_t* barrier, uint64_t token, uint32_t max_sleep_nanosec) {
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+    
+    NV_IF_TARGET(NV_PROVIDES_SM_90,
+        uint32_t __wait_complete = 0;
+
+        asm volatile ("{\n\t"
+                    ".reg .pred p;\n\t"
+                    "mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n\t"
+                    "selp.b32 %0, 1, 0, p;\n\t"
+                    "}"
+                : "=r"(__wait_complete)
+                : "r"(__nvvm_get_smem_pointer(barrier)), "l"(token), "r"(max_sleep_nanosec)
+                : "memory");
+
+        return __wait_complete;
+    )
+    _CUDA_AWBARRIER_ABORT()
+    return false;
+}
+
+/*
+ * Try-wait against a phase parity.
+ *
+ * NV_PROVIDES_SM_90 path: runs mbarrier.try_wait.parity.shared.b64 with
+ * phase_parity (converted to a 32-bit operand) and max_sleep_nanosec, and
+ * returns the resulting predicate.
+ * On any other target there is no implementation: the function executes
+ * _CUDA_AWBARRIER_ABORT() and then returns false.
+ */
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+bool awbarrier_try_wait_parity(uint64_t* barrier, bool phase_parity, uint32_t max_sleep_nanosec) {
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+    
+    NV_IF_TARGET(NV_PROVIDES_SM_90,
+        uint32_t __wait_complete = 0;
+
+        asm volatile ("{\n\t"
+                    ".reg .pred p;\n\t"
+                    "mbarrier.try_wait.parity.shared.b64 p, [%1], %2, %3;\n\t"
+                    "selp.b32 %0, 1, 0, p;\n\t"
+                    "}"
+                : "=r"(__wait_complete)
+                : "r"(__nvvm_get_smem_pointer(barrier)), "r"(static_cast<uint32_t>(phase_parity)), "r"(max_sleep_nanosec)
+                : "memory");
+
+        return __wait_complete;
+    )
+    _CUDA_AWBARRIER_ABORT()
+    return false;
+}
+
+_CUDA_AWBARRIER_END_INTERNAL_NAMESPACE
+
+#endif /* !_CUDA_AWBARRIER_HELPERS_H_ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_awbarrier_primitives.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_awbarrier_primitives.h
new file mode 100644
index 0000000000000000000000000000000000000000..5562ef3f6afeb7fce4bad4cb8067b3cb1b9a690f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_awbarrier_primitives.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CUDA_AWBARRIER_PRIMITIVES_H_
+#define _CUDA_AWBARRIER_PRIMITIVES_H_
+
+#include "cuda_awbarrier_helpers.h"
+
+#if !defined(_CUDA_AWBARRIER_SM_TARGET)
+# error This file requires compute capability 7.0 or greater.
+#endif
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER __host__ /* Returns _CUDA_AWBARRIER_MAX_COUNT; additionally marked __host__, unlike the other wrappers in this header. */
+uint32_t __mbarrier_maximum_count() {
+    return _CUDA_AWBARRIER_MAX_COUNT; /* presumably the largest count an mbarrier supports - confirm against cuda_awbarrier_helpers.h */
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER /* Public wrapper: passes barrier and expected_count unchanged to the internal awbarrier_init helper. */
+void __mbarrier_init(__mbarrier_t* barrier, uint32_t expected_count) {
+    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(barrier, expected_count); /* helper is declared in cuda_awbarrier_helpers.h, included above */
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER /* Public wrapper: forwards barrier to the internal awbarrier_inval helper. */
+void __mbarrier_inval(__mbarrier_t* barrier) {
+    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(barrier); /* presumably invalidates the barrier object - helper body is not in this header */
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER /* Public wrapper: arrives on barrier and returns the token produced by the internal helper. */
+__mbarrier_token_t __mbarrier_arrive(__mbarrier_t* barrier) {
+    return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(barrier); /* template argument false; __mbarrier_arrive_and_drop passes true to the same helper */
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER /* Public wrapper: arrive-and-drop on barrier; returns the token produced by the internal helper. */
+__mbarrier_token_t __mbarrier_arrive_and_drop(__mbarrier_t* barrier) {
+    return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(barrier); /* template argument true; __mbarrier_arrive passes false to the same helper */
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER /* Public wrapper: forwards barrier and token to the internal awbarrier_test_wait helper and returns its bool result. */
+bool __mbarrier_test_wait(__mbarrier_t* barrier, __mbarrier_token_t token) {
+    return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(barrier, token); /* token is presumably one returned by an earlier arrive call - confirm with the helper */
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER /* Public wrapper: forwards token to the internal awbarrier_token_pending_count helper and returns its uint32_t result. */
+uint32_t __mbarrier_token_pending_count(__mbarrier_token_t token) {
+    return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(token); /* takes only the token; no barrier pointer is involved */
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER /* Public wrapper: parity-based variant of __mbarrier_test_wait; takes a phase parity bit instead of a token. */
+bool __mbarrier_test_wait_parity(__mbarrier_t* barrier, bool phase_parity) {
+   return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait_parity(barrier, phase_parity); /* arguments are forwarded unchanged */
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER /* Public wrapper: forwards barrier, token and max_sleep_nanosec to the internal awbarrier_try_wait helper. */
+bool __mbarrier_try_wait(__mbarrier_t* barrier, __mbarrier_token_t token, uint32_t max_sleep_nanosec) {
+   return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_try_wait(barrier, token, max_sleep_nanosec); /* the helper issues mbarrier.try_wait only for SM 9.0 targets; otherwise it aborts / returns false */
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER /* Public wrapper: forwards barrier, phase_parity and max_sleep_nanosec to the internal awbarrier_try_wait_parity helper. */
+bool __mbarrier_try_wait_parity(__mbarrier_t* barrier, bool phase_parity, uint32_t max_sleep_nanosec) {
+   return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_try_wait_parity(barrier, phase_parity, max_sleep_nanosec); /* the helper issues mbarrier.try_wait.parity only for SM 9.0 targets; otherwise it aborts / returns false */
+}
+
+#endif /* !_CUDA_AWBARRIER_PRIMITIVES_H_ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_bf16.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_bf16.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd049cafd252175a2c3107d86fcdbb6f8a7d0250
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_bf16.h
@@ -0,0 +1,5118 @@
+/*
+* Copyright 1993-2024 NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO LICENSEE:
+*
+* This source code and/or documentation ("Licensed Deliverables") are
+* subject to NVIDIA intellectual property rights under U.S. and
+* international Copyright laws.
+*
+* These Licensed Deliverables contained herein is PROPRIETARY and
+* CONFIDENTIAL to NVIDIA and is being provided under the terms and
+* conditions of a form of NVIDIA software license agreement by and
+* between NVIDIA and Licensee ("License Agreement") or electronically
+* accepted by Licensee.  Notwithstanding any terms or conditions to
+* the contrary in the License Agreement, reproduction or disclosure
+* of the Licensed Deliverables to any third party without the express
+* written consent of NVIDIA is prohibited.
+*
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+* OF THESE LICENSED DELIVERABLES.
+*
+* U.S. Government End Users.  These Licensed Deliverables are a
+* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+* 1995), consisting of "commercial computer software" and "commercial
+* computer software documentation" as such terms are used in 48
+* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+* only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+* U.S. Government End Users acquire the Licensed Deliverables with
+* only those rights set forth herein.
+*
+* Any use of the Licensed Deliverables in individual and commercial
+* software must include, in the user documentation and internal
+* comments to the code, the above Disclaimer and U.S. Government End
+* Users Notice.
+*/
+
+/**
+* \defgroup CUDA_MATH_INTRINSIC_BFLOAT16 Bfloat16 Precision Intrinsics
+* This section describes nv_bfloat16 precision intrinsic functions.
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+* All of the functions defined here are available in device code.
+* Some of the functions are also available to host compilers, please
+* refer to respective functions' documentation for details.
+*
+* NOTE: Aggressive floating-point optimizations performed by host or device
+* compilers may affect numeric behavior of the functions implemented in this
+* header. Specific examples are:
+* - hsin(__nv_bfloat16);
+* - hcos(__nv_bfloat16);
+* - h2sin(__nv_bfloat162);
+* - h2cos(__nv_bfloat162);
+*
+* The following macros are available to help users selectively enable/disable
+* various definitions present in the header file:
+* - \p CUDA_NO_BFLOAT16 - If defined, this macro will prevent the definition of
+* additional type aliases in the global namespace, helping to avoid potential
+* conflicts with symbols defined in the user program.
+* - \p __CUDA_NO_BFLOAT16_CONVERSIONS__ - If defined, this macro will prevent
+* the use of the C++ type conversions (converting constructors and conversion
+* operators) that are common for built-in floating-point types, but may be
+* undesirable for \p __nv_bfloat16 which is essentially a user-defined type.
+* - \p __CUDA_NO_BFLOAT16_OPERATORS__ and \p __CUDA_NO_BFLOAT162_OPERATORS__ -
+* If defined, these macros will prevent the inadvertent use of usual arithmetic
+* and comparison operators. This enforces the storage-only type semantics and
+* prevents C++ style computations on \p __nv_bfloat16 and \p __nv_bfloat162 types.
+*/
+
+/**
+* \defgroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS Bfloat16 Arithmetic Constants
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these constants, include the header file \p cuda_bf16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__BFLOAT16_ARITHMETIC Bfloat16 Arithmetic Functions
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__BFLOAT162_ARITHMETIC Bfloat162 Arithmetic Functions
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__BFLOAT16_COMPARISON Bfloat16 Comparison Functions
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__BFLOAT162_COMPARISON Bfloat162 Comparison Functions
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__BFLOAT16_MISC Bfloat16 Precision Conversion and Data Movement
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__BFLOAT16_FUNCTIONS Bfloat16 Math Functions
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__BFLOAT162_FUNCTIONS Bfloat162 Math Functions
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+*/
+
+#ifndef __CUDA_BF16_H__
+#define __CUDA_BF16_H__
+
+/* bring in __half data type and operations, for use in converting constructors */
+#include "cuda_fp16.h"
+
+// implicitly provided by NVRTC
+#if !defined(__CUDACC_RTC__)
+/* bring in float2, double4, etc vector types */
+#include "vector_types.h"
+/* bring in operations on vector types like: make_float2 */
+#include "vector_functions.h"
+#endif  /* !defined(__CUDACC_RTC__) */
+
+#define ___CUDA_BF16_STRINGIFY_INNERMOST(x) #x /* stringizes its argument exactly as spelled */
+#define __CUDA_BF16_STRINGIFY(x) ___CUDA_BF16_STRINGIFY_INNERMOST(x) /* extra indirection so that x is macro-expanded before it is stringized */
+
+#if defined(__cplusplus)
+
+/* Set up function decorations */
+#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) /* NVRTC with CUDA 12.3 or newer: all decorations are plain __device__ (no static / inline / __host__) */
+#define __CUDA_BF16_DECL__ __device__
+#define __CUDA_HOSTDEVICE_BF16_DECL__ __device__
+#define __CUDA_HOSTDEVICE__ __device__
+#elif defined(__CUDACC__) || defined(_NVHPC_CUDA) /* other CUDA-aware compilation: static inline functions, host+device for the HOSTDEVICE variants */
+#define __CUDA_BF16_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_BF16_DECL__ static __host__ __device__ __inline__
+#define __CUDA_HOSTDEVICE__ __host__ __device__
+#else /* !(defined(__CUDACC__) || defined(_NVHPC_CUDA)): no CUDA qualifiers are used; __CUDA_BF16_DECL__ is intentionally left undefined here */
+#if defined(__GNUC__) /* the unused attribute keeps GNU-compatible compilers from warning about unreferenced static functions */
+#define __CUDA_HOSTDEVICE_BF16_DECL__ static __attribute__ ((unused))
+#else
+#define __CUDA_HOSTDEVICE_BF16_DECL__ static
+#endif /* defined(__GNUC__) */
+#define __CUDA_HOSTDEVICE__ /* expands to nothing in this branch */
+#endif /* (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) */
+
+#define __CUDA_BF16_TYPES_EXIST__
+
+/* Macros to allow nv_bfloat16 & nv_bfloat162 to be used by inline assembly */
+#define __BFLOAT16_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var))) /* writable unsigned short lvalue aliasing var's storage */
+#define __BFLOAT16_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var))) /* read-only unsigned short view of var */
+#define __BFLOAT162_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var))) /* writable unsigned int lvalue aliasing var's storage */
+#define __BFLOAT162_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var))) /* read-only unsigned int view of var */
+
+/* Forward-declaration of structures defined in "cuda_bf16.hpp" */
+struct __nv_bfloat16;
+struct __nv_bfloat162;
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts double number to nv_bfloat16 precision in round-to-nearest-even mode
+* and returns \p nv_bfloat16 with converted value.
+*
+* \details Converts double number \p a to nv_bfloat16 precision in round-to-nearest-even mode.
+* \param[in] a - double. Is only being read.
+* \returns nv_bfloat16
+* - \p a converted to \p nv_bfloat16 using round-to-nearest-even mode.
+* - __double2bfloat16 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __double2bfloat16 \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __double2bfloat16(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode
+* and returns \p nv_bfloat16 with converted value. 
+* 
+* \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode. 
+* \param[in] a - float. Is only being read. 
+* \returns nv_bfloat16
+* - \p a converted to nv_bfloat16 using round-to-nearest-even mode.
+* 
+* \see __float2bfloat16_rn(float) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode
+* and returns \p nv_bfloat16 with converted value.
+*
+* \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode.
+* \param[in] a - float. Is only being read. 
+* \returns nv_bfloat16
+* - \p a converted to nv_bfloat16 using round-to-nearest-even mode.
+* - __float2bfloat16_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __float2bfloat16_rn \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __float2bfloat16_rn(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts float number to nv_bfloat16 precision in round-towards-zero mode
+* and returns \p nv_bfloat16 with converted value.
+* 
+* \details Converts float number \p a to nv_bfloat16 precision in round-towards-zero mode.
+* \param[in] a - float. Is only being read. 
+* \returns nv_bfloat16
+* - \p a converted to nv_bfloat16 using round-towards-zero mode.
+* - __float2bfloat16_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __float2bfloat16_rz \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __float2bfloat16_rz(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts float number to nv_bfloat16 precision in round-down mode
+* and returns \p nv_bfloat16 with converted value.
+* 
+* \details Converts float number \p a to nv_bfloat16 precision in round-down mode.
+* \param[in] a - float. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p a converted to nv_bfloat16 using round-down mode.
+* - __float2bfloat16_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __float2bfloat16_rd \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __float2bfloat16_rd(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts float number to nv_bfloat16 precision in round-up mode
+* and returns \p nv_bfloat16 with converted value.
+* 
+* \details Converts float number \p a to nv_bfloat16 precision in round-up mode.
+* \param[in] a - float. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p a converted to nv_bfloat16 using round-up mode.
+* - __float2bfloat16_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __float2bfloat16_ru \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __float2bfloat16_ru(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts \p nv_bfloat16 number to float.
+* 
+* \details Converts nv_bfloat16 number \p a to float.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* 
+* \returns float
+* - \p a converted to float. 
+* - __bfloat162float \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __bfloat162float \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __bfloat162float(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts input to nv_bfloat16 precision in round-to-nearest-even mode and
+* populates both halves of \p nv_bfloat162 with converted value.
+*
+* \details Converts input \p a to nv_bfloat16 precision in round-to-nearest-even mode and
+* populates both halves of \p nv_bfloat162 with converted value.
+* \param[in] a - float. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The \p nv_bfloat162 value with both halves equal to the converted nv_bfloat16
+* precision number.
+* 
+* \see __float2bfloat16_rn(float) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts both input floats to nv_bfloat16 precision in round-to-nearest-even
+* mode and returns \p nv_bfloat162 with converted values.
+*
+* \details Converts both input floats to nv_bfloat16 precision in round-to-nearest-even mode
+* and combines the results into one \p nv_bfloat162 number. Low 16 bits of the return
+* value correspond to the input \p a, high 16 bits correspond to the input \p
+* b.
+* \param[in] a - float. Is only being read. 
+* \param[in] b - float. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The \p nv_bfloat162 value with corresponding halves equal to the
+* converted input floats.
+* 
+* \see __float2bfloat16_rn(float) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __floats2bfloat162_rn(const float a, const float b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts low 16 bits of \p nv_bfloat162 to float and returns the result
+* 
+* \details Converts low 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number
+* and returns the result.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* 
+* \returns float
+* - The low 16 bits of \p a converted to float.
+* 
+* \see __bfloat162float(__nv_bfloat16) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts high 16 bits of \p nv_bfloat162 to float and returns the result
+* 
+* \details Converts high 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number
+* and returns the result.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* 
+* \returns float
+* - The high 16 bits of \p a converted to float.
+* 
+* \see __bfloat162float(__nv_bfloat16) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts both components of float2 number to nv_bfloat16 precision in
+* round-to-nearest-even mode and returns \p nv_bfloat162 with converted values.
+* 
+* \details Converts both components of float2 to nv_bfloat16 precision in round-to-nearest-even
+* mode and combines the results into one \p nv_bfloat162 number. Low 16 bits of the
+* return value correspond to \p a.x and high 16 bits of the return value
+* correspond to \p a.y.
+* \param[in] a - float2. Is only being read. 
+*  
+* \returns nv_bfloat162
+* - The \p nv_bfloat162 which has corresponding halves equal to the
+* converted float2 components.
+* 
+* \see __float2bfloat16_rn(float) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts both halves of \p nv_bfloat162 to float2 and returns the result.
+* 
+* \details Converts both halves of \p nv_bfloat162 input \p a to float and returns the
+* result as a \p float2 packed value.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* 
+* \returns float2
+* - \p a converted to float2.
+* 
+* \see __bfloat162float(__nv_bfloat16) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed char in round-towards-zero mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed
+* char in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns signed char
+* - \p h converted to a signed char using round-towards-zero mode.
+* - __bfloat162char_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __bfloat162char_rz \cuda_math_formula (x), x > 127\end_cuda_math_formula returns SCHAR_MAX = \p 0x7F.
+* - __bfloat162char_rz \cuda_math_formula (x), x < -128\end_cuda_math_formula returns SCHAR_MIN = \p 0x80.
+* - __bfloat162char_rz(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ signed char __bfloat162char_rz(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned char in round-towards-zero mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned
+* char in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned char
+* - \p h converted to an unsigned char using round-towards-zero mode.
+* - __bfloat162uchar_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __bfloat162uchar_rz \cuda_math_formula (x), x > 255\end_cuda_math_formula returns UCHAR_MAX = \p 0xFF.
+* - __bfloat162uchar_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __bfloat162uchar_rz(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned char __bfloat162uchar_rz(const __nv_bfloat16 h);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed integer in round-to-nearest-even mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in
+* round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns int
+* - \p h converted to a signed integer using round-to-nearest-even mode.
+* - __bfloat162int_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __bfloat162int_rn \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF.
+* - __bfloat162int_rn \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000.
+* - __bfloat162int_rn(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed integer in round-towards-zero mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in
+* round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns int
+* - \p h converted to a signed integer using round-towards-zero mode.
+* - __bfloat162int_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __bfloat162int_rz \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF.
+* - __bfloat162int_rz \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000.
+* - __bfloat162int_rz(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed integer in round-down mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in
+* round-down mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns int
+* - \p h converted to a signed integer using round-down mode.
+* - __bfloat162int_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __bfloat162int_rd \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF.
+* - __bfloat162int_rd \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000.
+* - __bfloat162int_rd(NaN) returns 0. \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed integer in round-up mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in
+* round-up mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns int
+* - \p h converted to a signed integer using round-up mode.
+* - __bfloat162int_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __bfloat162int_ru \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF.
+* - __bfloat162int_ru \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000.
+* - __bfloat162int_ru(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed integer to a nv_bfloat16 in round-to-nearest-even mode.
+* 
+* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed integer to a nv_bfloat16 in round-towards-zero mode.
+* 
+* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point
+* value in round-towards-zero mode.
+* \param[in] i - int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed integer to a nv_bfloat16 in round-down mode.
+* 
+* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point
+* value in round-down mode.
+* \param[in] i - int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed integer to a nv_bfloat16 in round-up mode.
+* 
+* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point
+* value in round-up mode.
+* \param[in] i - int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed short integer in round-to-nearest-even
+* mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed short
+* integer in round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns short int
+* - \p h converted to a signed short integer using round-to-nearest-even mode.
+* - __bfloat162short_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __bfloat162short_rn \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF.
+* - __bfloat162short_rn \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000.
+* - __bfloat162short_rn(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed short integer in round-towards-zero mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed short
+* integer in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns short int
+* - \p h converted to a signed short integer using round-towards-zero mode.
+* - __bfloat162short_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __bfloat162short_rz \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF.
+* - __bfloat162short_rz \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000.
+* - __bfloat162short_rz(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed short integer in round-down mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed short
+* integer in round-down mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns short int
+* - \p h converted to a signed short integer using round-down mode.
+* - __bfloat162short_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __bfloat162short_rd \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF.
+* - __bfloat162short_rd \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000.
+* - __bfloat162short_rd(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed short integer in round-up mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed short
+* integer in round-up mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns short int
+* - \p h converted to a signed short integer using round-up mode.
+* - __bfloat162short_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __bfloat162short_ru \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF.
+* - __bfloat162short_ru \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000.
+* - __bfloat162short_ru(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed short integer to a nv_bfloat16 in round-to-nearest-even
+* mode.
+* 
+* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - short int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed short integer to a nv_bfloat16 in round-towards-zero mode.
+* 
+* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point
+* value in round-towards-zero mode.
+* \param[in] i - short int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed short integer to a nv_bfloat16 in round-down mode.
+* 
+* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point
+* value in round-down mode.
+* \param[in] i - short int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed short integer to a nv_bfloat16 in round-up mode.
+* 
+* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point
+* value in round-up mode.
+* \param[in] i - short int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned integer in round-to-nearest-even mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer
+* in round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned int
+* - \p h converted to an unsigned integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned integer in round-towards-zero mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer
+* in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned int
+* - \p h converted to an unsigned integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned integer in round-down mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer
+* in round-down mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+*
+* \returns unsigned int
+* - \p h converted to an unsigned integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned integer in round-up mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer
+* in round-up mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+*
+* \returns unsigned int
+* - \p h converted to an unsigned integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned integer to a nv_bfloat16 in round-to-nearest-even mode.
+* 
+* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - unsigned int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned integer to a nv_bfloat16 in round-towards-zero mode.
+* 
+* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point
+* value in round-towards-zero mode.
+* \param[in] i - unsigned int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.  
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned integer to a nv_bfloat16 in round-down mode.
+* 
+* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point
+* value in round-down mode.
+* \param[in] i - unsigned int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned integer to a nv_bfloat16 in round-up mode.
+* 
+* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point
+* value in round-up mode.
+* \param[in] i - unsigned int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned short integer in round-to-nearest-even
+* mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short
+* integer in round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rn(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned short integer in round-towards-zero
+* mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short
+* integer in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned short integer in round-down mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short
+* integer in round-down mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer. 
+*/
+__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned short integer in round-up mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short
+* integer in round-up mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer. 
+*/
+__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned short integer to a nv_bfloat16 in round-to-nearest-even
+* mode.
+* 
+* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - unsigned short int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned short integer to a nv_bfloat16 in round-towards-zero
+* mode.
+* 
+* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point
+* value in round-towards-zero mode.
+* \param[in] i - unsigned short int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned short integer to a nv_bfloat16 in round-down mode.
+* 
+* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point
+* value in round-down mode.
+* \param[in] i - unsigned short int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned short integer to a nv_bfloat16 in round-up mode.
+* 
+* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point
+* value in round-up mode.
+* \param[in] i - unsigned short int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-to-nearest-even
+* mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit
+* integer in round-to-nearest-even mode. NaN inputs return 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-towards-zero
+* mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit
+* integer in round-towards-zero mode. NaN inputs return 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Vector function, combines two \p nv_bfloat16 numbers into one \p nv_bfloat162 number.
+* 
+* \details Combines two input \p nv_bfloat16 numbers \p x and \p y into one \p nv_bfloat162 number.
+* Input \p x is stored in low 16 bits of the return value, input \p y is stored
+* in high 16 bits of the return value.
+* \param[in] x - nv_bfloat16. Is only being read. 
+* \param[in] y - nv_bfloat16. Is only being read. 
+* 
+* \returns __nv_bfloat162
+* - The \p __nv_bfloat162 vector with its low half equal to \p x and its high half equal to \p y. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-down mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit
+* integer in round-down mode. NaN inputs return 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-up mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit
+* integer in round-up mode. NaN inputs return 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-to-nearest-even
+* mode.
+* 
+* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - unsigned long long int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-towards-zero
+* mode.
+* 
+* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-towards-zero mode.
+* \param[in] i - unsigned long long int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-down mode.
+* 
+* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-down mode.
+* \param[in] i - unsigned long long int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.  
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-up mode.
+* 
+* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-up mode.
+* \param[in] i - unsigned long long int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-to-nearest-even
+* mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit
+* integer in round-to-nearest-even mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns long long int
+* - \p h converted to a signed 64-bit integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-towards-zero mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit
+* integer in round-towards-zero mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns long long int
+* - \p h converted to a signed 64-bit integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-down mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit
+* integer in round-down mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns long long int
+* - \p h converted to a signed 64-bit integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-up mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit
+* integer in round-up mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns long long int
+* - \p h converted to a signed 64-bit integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-to-nearest-even
+* mode.
+* 
+* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - long long int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-towards-zero mode.
+* 
+* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-towards-zero mode.
+* \param[in] i - long long int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-down mode.
+* 
+* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-down mode.
+* \param[in] i - long long int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-up mode.
+* 
+* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-up mode.
+* \param[in] i - long long int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Truncate input argument to the integral part.
+* 
+* \details Round \p h to the nearest integer value that does not exceed \p h in
+* magnitude.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - The truncated integer value. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculate ceiling of the input argument.
+* 
+* \details Compute the smallest integer value not less than \p h.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - The smallest integer value not less than \p h. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculate the largest integer less than or equal to \p h.
+* 
+* \details Calculate the largest integer value which is less than or equal to \p h.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - The largest integer value which is less than or equal to \p h. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Round input to nearest integer value in nv_bfloat16 floating-point
+* number.
+* 
+* \details Round \p h to the nearest integer value in nv_bfloat16 floating-point
+* format, with halfway cases rounded to the nearest even integer value.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - The nearest integer to \p h. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Truncate \p nv_bfloat162 vector input argument to the integral part.
+* 
+* \details Round each component of vector \p h to the nearest integer value that does
+* not exceed \p h in magnitude.
+* \param[in] h - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The truncated \p h. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculate \p nv_bfloat162 vector ceiling of the input argument.
+* 
+* \details For each component of vector \p h compute the smallest integer value not less
+* than \p h.
+* \param[in] h - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The vector of smallest integers not less than \p h. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculate the largest integer less than or equal to \p h.
+* 
+* \details For each component of vector \p h calculate the largest integer value which
+* is less than or equal to \p h.
+* \param[in] h - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The vector of largest integers which is less than or equal to \p h. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Round input to nearest integer value in nv_bfloat16 floating-point
+* number.
+* 
+* \details Round each component of \p nv_bfloat162 vector \p h to the nearest integer value in
+* nv_bfloat16 floating-point format, with halfway cases rounded to the
+* nearest even integer value.
+* \param[in] h - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The vector of rounded integer values. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Returns \p nv_bfloat162 with both halves equal to the input value.
+* 
+* \details Returns \p nv_bfloat162 number with both halves equal to the input \p a \p nv_bfloat16
+* number.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The vector which has both its halves equal to the input \p a. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Swaps both halves of the \p nv_bfloat162 input.
+* 
+* \details Swaps both halves of the \p nv_bfloat162 input and returns a new \p nv_bfloat162 number
+* with swapped halves.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - \p a with its halves being swapped. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines
+* into one \p nv_bfloat162 number. 
+* 
+* \details Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines into
+* one \p nv_bfloat162 number. The low 16 bits of input \p a are stored in the low 16 bits of
+* the return value, and the low 16 bits of input \p b are stored in the high 16 bits of
+* the return value. 
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The low 16 bits of \p a and of \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and
+* combines into one \p nv_bfloat162 number.
+* 
+* \details Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and combines into
+* one \p nv_bfloat162 number. The high 16 bits of input \p a are stored in the low 16 bits of
+* the return value, and the high 16 bits of input \p b are stored in the high 16 bits of
+* the return value.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The high 16 bits of \p a and of \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Returns high 16 bits of \p nv_bfloat162 input.
+*
+* \details Returns high 16 bits of \p nv_bfloat162 input \p a.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The high 16 bits of the input. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Returns low 16 bits of \p nv_bfloat162 input.
+*
+* \details Returns low 16 bits of \p nv_bfloat162 input \p a.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat16
+* - Returns \p nv_bfloat16 which contains low 16 bits of the input \p a. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Checks if the input \p nv_bfloat16 number is infinite.
+* 
+* \details Checks if the input \p nv_bfloat16 number \p a is infinite. 
+* \param[in] a - nv_bfloat16. Is only being read. 
+* 
+* \returns int 
+* - -1 if \p a is equal to negative infinity, 
+* - 1 if \p a is equal to positive infinity, 
+* - 0 otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ int __hisinf(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Combines two \p nv_bfloat16 numbers into one \p nv_bfloat162 number.
+* 
+* \details Combines two input \p nv_bfloat16 numbers \p a and \p b into one \p nv_bfloat162 number.
+* Input \p a is stored in low 16 bits of the return value, input \p b is stored
+* in high 16 bits of the return value.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The nv_bfloat162 with one nv_bfloat16 equal to \p a and the other to \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Extracts low 16 bits from \p nv_bfloat162 input.
+* 
+* \details Extracts low 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162
+* number which has both halves equal to the extracted bits.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The nv_bfloat162 with both halves equal to the low 16 bits of the input. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Extracts high 16 bits from \p nv_bfloat162 input.
+* 
+* \details Extracts high 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162
+* number which has both halves equal to the extracted bits.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The nv_bfloat162 with both halves equal to the high 16 bits of the input. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Reinterprets bits in a \p nv_bfloat16 as a signed short integer.
+* 
+* \details Reinterprets the bits in the nv_bfloat16 floating-point number \p h
+* as a signed short integer. 
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns short int
+* - The reinterpreted value. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Reinterprets bits in a \p nv_bfloat16 as an unsigned short integer.
+* 
+* \details Reinterprets the bits in the nv_bfloat16 floating-point \p h
+* as an unsigned short number.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned short int
+* - The reinterpreted value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Reinterprets bits in a signed short integer as a \p nv_bfloat16.
+* 
+* \details Reinterprets the bits in the signed short integer \p i as a
+* nv_bfloat16 floating-point number.
+* \param[in] i - short int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - The reinterpreted value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Reinterprets bits in an unsigned short integer as a \p nv_bfloat16.
+* 
+* \details Reinterprets the bits in the unsigned short integer \p i as a
+* nv_bfloat16 floating-point number.
+* \param[in] i - unsigned short int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - The reinterpreted value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i);
+
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA)
+
+#if !defined warpSize && !defined __local_warpSize
+#define warpSize    32
+#define __local_warpSize
+#endif
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. 
+* 
+* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane. 
+* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate 
+* entity with a starting logical thread ID of 0. If \p srcLane is outside the range \p [0:width-1], 
+* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e. 
+* within the same subsection). \p width must have a value which is a power of 2; 
+* results are undefined if \p width is not a power of 2, or is a number greater than 
+* \p warpSize. 
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read. 
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - nv_bfloat162. Is only being read. 
+* \param[in] srcLane - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned int mask, const __nv_bfloat162 var, const int srcLane, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. 
+* 
+* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID. 
+* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up 
+* the warp by \p delta threads. If the \p width is less than \p warpSize, then each subsection of the warp 
+* behaves as a separate entity with a starting logical thread ID of 0. The source thread index 
+* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged. 
+* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2, 
+* or is a number greater than \p warpSize. 
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read. 
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - nv_bfloat162. Is only being read. 
+* \param[in] delta - unsigned int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_up_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. 
+* 
+* \details Calculates a source thread ID by adding \p delta to the caller's thread ID. 
+* The value of \p var held by the resulting thread ID is returned: this has the effect 
+* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then 
+* each subsection of the warp behaves as a separate entity with a starting logical 
+* thread ID of 0. Similarly to the __shfl_up_sync(), the ID number of the source thread 
+* will not wrap around the value of \p width and the upper \p delta threads 
+* will remain unchanged. 
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read. 
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - nv_bfloat162. Is only being read. 
+* \param[in] delta - unsigned int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. 
+* 
+* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask: 
+* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each 
+* group of \p width consecutive threads are able to access elements from earlier groups of threads, 
+* however if they attempt to access elements from later groups of threads their own value of \p var 
+* will be returned. This mode implements a butterfly addressing pattern such as is used in tree 
+* reduction and broadcast. 
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read. 
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - nv_bfloat162. Is only being read. 
+* \param[in] laneMask - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat162 var, const int laneMask, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. 
+* 
+* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane. 
+* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate 
+* entity with a starting logical thread ID of 0. If \p srcLane is outside the range \p [0:width-1], 
+* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e. 
+* within the same subsection). \p width must have a value which is a power of 2; 
+* results are undefined if \p width is not a power of 2, or is a number greater than 
+* \p warpSize. 
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read. 
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - nv_bfloat16. Is only being read. 
+* \param[in] srcLane - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned int mask, const __nv_bfloat16 var, const int srcLane, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. 
+* 
+* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID. 
+* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up 
+* the warp by \p delta threads. If the \p width is less than \p warpSize, then each subsection of the warp 
+* behaves as a separate entity with a starting logical thread ID of 0. The source thread index 
+* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged. 
+* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2, 
+* or is a number greater than \p warpSize. 
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read. 
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - nv_bfloat16. Is only being read. 
+* \param[in] delta - unsigned int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. 
+* 
+* \details Calculates a source thread ID by adding \p delta to the caller's thread ID. 
+* The value of \p var held by the resulting thread ID is returned: this has the effect 
+* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then 
+* each subsection of the warp behaves as a separate entity with a starting logical 
+* thread ID of 0. Similarly to the __shfl_up_sync(), the ID number of the source thread 
+* will not wrap around the value of \p width and the upper \p delta threads 
+* will remain unchanged. 
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read. 
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - nv_bfloat16. Is only being read. 
+* \param[in] delta - unsigned int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. 
+* 
+* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask: 
+* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each 
+* group of \p width consecutive threads are able to access elements from earlier groups of threads, 
+* however if they attempt to access elements from later groups of threads their own value of \p var 
+* will be returned. This mode implements a butterfly addressing pattern such as is used in tree 
+* reduction and broadcast. 
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read. 
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - nv_bfloat16. Is only being read. 
+* \param[in] laneMask - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat16 var, const int laneMask, const int width = warpSize);
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA) */
+
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))) || defined(_NVHPC_CUDA)
+#if defined(__local_warpSize)
+#undef warpSize
+#undef __local_warpSize
+#endif
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.nc` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const  __nv_bfloat162 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.nc` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.cg` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const  __nv_bfloat162 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.cg` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.ca` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const  __nv_bfloat162 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.ca` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.cs` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const  __nv_bfloat162 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.cs` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.lu` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const  __nv_bfloat162 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.lu` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.cv` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const  __nv_bfloat162 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.cv` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.wb` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.wb` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.cg` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.cg` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.cs` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.cs` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.wt` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.wt` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value);
+
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))) || defined(_NVHPC_CUDA) */
+
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector if-equal comparison.
+* 
+* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The vector result of if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector not-equal comparison.
+* 
+* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The vector result of not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector less-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The \p nv_bfloat162 result of less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector greater-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The vector result of greater-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector less-than comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The nv_bfloat162 vector result of less-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector greater-than comparison.
+* 
+* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The vector result of greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison.
+* 
+* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The vector result of unordered if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The vector result of unordered not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The vector result of unordered less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The \p nv_bfloat162 vector result of unordered greater-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered less-than comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The vector result of unordered less-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The \p nv_bfloat162 vector result of unordered greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector if-equal comparison.
+* 
+* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+* 
+* \returns unsigned int
+* - The vector mask result of if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __heq2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector not-equal comparison.
+* 
+* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+* 
+* \returns unsigned int
+* - The vector mask result of not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hne2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector less-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns unsigned int
+* - The vector mask result of less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hle2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector greater-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns unsigned int
+* - The vector mask result of greater-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hge2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector less-than comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns unsigned int
+* - The vector mask result of less-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hlt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector greater-than comparison.
+* 
+* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+* 
+* \returns unsigned int
+* - The vector mask result of greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison.
+* 
+* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+* 
+* \returns unsigned int
+* - The vector mask result of unordered if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hequ2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns unsigned int
+* - The vector mask result of unordered not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hneu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns unsigned int
+* - The vector mask result of unordered less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hleu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns unsigned int
+* - The vector mask result of unordered greater-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgeu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered less-than comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns unsigned int
+* - The vector mask result of unordered less-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hltu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns unsigned int
+* - The vector mask result of unordered greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgtu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Determine whether \p nv_bfloat162 argument is a NaN.
+*
+* \details Determine whether each \p nv_bfloat16 half of the input \p nv_bfloat162 number \p a is a NaN.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The nv_bfloat162 with the corresponding \p nv_bfloat16 results set to
+* 1.0 for NaN, 0.0 otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-95
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The sum of vectors \p a and \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in
+* round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-104
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The subtraction of vector \p b from \p a. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-102
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The result of elementwise multiplying the vectors \p a and \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even
+* mode. Prevents floating-point contractions of mul+add into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-95
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The sum of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in
+* round-to-nearest-even mode. Prevents floating-point contractions of mul+sub into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-104
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The subtraction of vector \p b from \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode. Prevents floating-point contractions of mul+add
+* or sub into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-102
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of elementwise multiplying the vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector division in round-to-nearest-even mode.
+*
+* \details Divides \p nv_bfloat162 input vector \p a by input vector \p b in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-103
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The elementwise division of \p a with \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Calculates the absolute value of both halves of the input \p nv_bfloat162 number and
+* returns the result.
+*
+* \details Calculates the absolute value of both halves of the input \p nv_bfloat162 number and
+* returns the result.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - Returns \p a with the absolute value of both halves. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even
+* mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to
+* +0.0.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The sum of \p a and \p b, with respect to saturation. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in
+* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN
+* results are flushed to +0.0.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The subtraction of vector \p b from \p a, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN
+* results are flushed to +0.0.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The result of elementwise multiplication of vectors \p a and \p b, 
+* with respect to saturation. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b);
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even
+* mode.
+*
+* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b,
+* then performs a \p nv_bfloat162 vector add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-105
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+* \param[in] c - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even
+* mode, with saturation to [0.0, 1.0].
+*
+* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b,
+* then performs a \p nv_bfloat162 vector add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode, and clamps the
+* results to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+* \param[in] c - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, 
+* with respect to saturation. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c);
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Negates both halves of the input \p nv_bfloat162 number and returns the
+* result.
+*
+* \details Negates both halves of the input \p nv_bfloat162 number \p a and returns the result.
+* \internal
+* \req DEEPLEARN-SRM_REQ-101
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - Returns \p a with both halves negated. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Calculates the absolute value of input \p nv_bfloat16 number and returns the result.
+*
+* \details Calculates the absolute value of input \p nv_bfloat16 number and returns the result.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The absolute value of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-94
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The sum of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-97
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The result of subtracting \p b from \p a. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-99
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The result of multiplying \p a and \p b. 
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even
+* mode. Prevents floating-point contractions of mul+add into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-94
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The sum of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even
+* mode. Prevents floating-point contractions of mul+sub into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-97
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The result of subtracting \p b from \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even
+* mode. Prevents floating-point contractions of mul+add or sub into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-99
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The result of multiplying \p a and \p b.
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 division in round-to-nearest-even mode.
+* 
+* \details Divides \p nv_bfloat16 input \p a by input \p b in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-98
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - The result of dividing \p a by \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__  __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p nv_bfloat16 add of inputs \p a and \p b, in round-to-nearest-even mode,
+* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The sum of \p a and \p b, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even
+* mode,
+* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The result of subtraction of \p b from \p a, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even
+* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to
+* +0.0.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The result of multiplying \p a and \p b, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b);
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b,
+* then performs a \p nv_bfloat16 add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-96
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+* \param[in] c - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The result of fused multiply-add operation on \p
+* a, \p b, and \p c. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b,
+* then performs a \p nv_bfloat16 add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode, and clamps the result
+* to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+* \param[in] c - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The result of fused multiply-add operation on \p
+* a, \p b, and \p c, with respect to saturation. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c);
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Negates input \p nv_bfloat16 number and returns the result.
+*
+* \details Negates input \p nv_bfloat16 number \p a and returns the result.
+* \internal
+* \req DEEPLEARN-SRM_REQ-100
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The negated value of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector if-equal comparison and returns boolean true
+* if both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of if-equal comparison
+* of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector not-equal comparison and returns boolean
+* true if both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of not-equal comparison
+* of vectors \p a and \p b are true;
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector less-equal comparison and returns boolean
+* true if both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of less-equal comparison
+* of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hble2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector greater-equal comparison and returns boolean
+* true if both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of greater-equal
+* comparison of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector less-than comparison and returns boolean
+* true if both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 less-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of less-than comparison
+* of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector greater-than comparison and returns boolean
+* true if both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+* 
+* \returns bool 
+* - true if both \p nv_bfloat16 results of greater-than
+* comparison of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison and returns
+* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector unordered if-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of unordered if-equal
+* comparison of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison and returns
+* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector unordered not-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of unordered not-equal
+* comparison of vectors \p a and \p b are true;
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison and returns
+* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector unordered less-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of unordered less-equal
+* comparison of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison and
+* returns boolean true if both \p nv_bfloat16 results are true, boolean false
+* otherwise.
+*
+* \details Performs \p nv_bfloat162 vector unordered greater-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of unordered
+* greater-equal comparison of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered less-than comparison and returns
+* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector unordered less-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 less-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of unordered less-than comparison of 
+* vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison and
+* returns boolean true if both \p nv_bfloat16 results are true, boolean false
+* otherwise.
+*
+* \details Performs \p nv_bfloat162 vector unordered greater-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of unordered
+* greater-than comparison of vectors \p a and \p b are true;
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 if-equal comparison.
+*
+* \details Performs \p nv_bfloat16 if-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of if-equal comparison of \p a and \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 not-equal comparison.
+*
+* \details Performs \p nv_bfloat16 not-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of not-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 less-equal comparison.
+*
+* \details Performs \p nv_bfloat16 less-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of less-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 greater-equal comparison.
+*
+* \details Performs \p nv_bfloat16 greater-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of greater-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 less-than comparison.
+*
+* \details Performs \p nv_bfloat16 less-than comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of less-than comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 greater-than comparison.
+*
+* \details Performs \p nv_bfloat16 greater-than comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of greater-than comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 unordered if-equal comparison.
+*
+* \details Performs \p nv_bfloat16 unordered if-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered if-equal comparison of \p a and
+* \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 unordered not-equal comparison.
+*
+* \details Performs \p nv_bfloat16 unordered not-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered not-equal comparison of \p a and
+* \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 unordered less-equal comparison.
+*
+* \details Performs \p nv_bfloat16 unordered less-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered less-equal comparison of \p a and
+* \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 unordered greater-equal comparison.
+*
+* \details Performs \p nv_bfloat16 unordered greater-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered greater-equal comparison of \p a
+* and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 unordered less-than comparison.
+*
+* \details Performs \p nv_bfloat16 unordered less-than comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered less-than comparison of \p a and
+* \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 unordered greater-than comparison.
+*
+* \details Performs \p nv_bfloat16 unordered greater-than comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered greater-than comparison of \p a
+* and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Determine whether \p nv_bfloat16 argument is a NaN.
+*
+* \details Determine whether \p nv_bfloat16 value \p a is a NaN.
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns bool
+* - true if \p a is NaN, false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Calculates \p nv_bfloat16 maximum of two input values.
+*
+* \details Calculates \p nv_bfloat16 max(\p a, \p b)
+* defined as (\p a > \p b) ? \p a : \p b. 
+* - If either of inputs is NaN, the other input is returned.
+* - If both inputs are NaNs, then canonical NaN is returned.
+* - If both inputs are zeros, +0.0 is treated as greater than -0.0.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Calculates \p nv_bfloat16 minimum of two input values.
+*
+* \details Calculates \p nv_bfloat16 min(\p a, \p b)
+* defined as (\p a < \p b) ? \p a : \p b.
+* - If either of inputs is NaN, the other input is returned.
+* - If both inputs are NaNs, then canonical NaN is returned.
+* - If both inputs are zeros, +0.0 is treated as greater than -0.0.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Calculates \p nv_bfloat16 maximum of two input values, NaNs pass through.
+*
+* \details Calculates \p nv_bfloat16 max(\p a, \p b)
+* defined as (\p a > \p b) ? \p a : \p b.
+* - If either of inputs is NaN, then canonical NaN is returned.
+* - If both inputs are zeros, +0.0 is treated as greater than -0.0.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Calculates \p nv_bfloat16 minimum of two input values, NaNs pass through.
+*
+* \details Calculates \p nv_bfloat16 min(\p a, \p b)
+* defined as (\p a < \p b) ? \p a : \p b.
+* - If either of inputs is NaN, then canonical NaN is returned.
+* - If both inputs are zeros, +0.0 is treated as greater than -0.0.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b);
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode with relu saturation.
+*
+* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b,
+* then performs a \p nv_bfloat16 add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* A negative result is then clamped to 0.
+* A NaN result is converted to canonical NaN.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+* \param[in] c - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The result of fused multiply-add operation on \p
+* a, \p b, and \p c with relu saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c);
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Calculates \p nv_bfloat162 vector maximum of two inputs.
+*
+* \details Calculates \p nv_bfloat162 vector max(\p a, \p b).
+* Elementwise \p nv_bfloat16 operation is defined as
+* (\p a > \p b) ? \p a : \p b.
+* - If either input is NaN, the other input is returned.
+* - If both inputs are NaNs, then canonical NaN is returned.
+* - If both inputs are zeros, +0.0 is treated as greater than -0.0.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of elementwise maximum of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Calculates \p nv_bfloat162 vector minimum of two inputs.
+*
+* \details Calculates \p nv_bfloat162 vector min(\p a, \p b).
+* Elementwise \p nv_bfloat16 operation is defined as
+* (\p a < \p b) ? \p a : \p b.
+* - If either input is NaN, the other input is returned.
+* - If both inputs are NaNs, then canonical NaN is returned.
+* - If both inputs are zeros, +0.0 is treated as greater than -0.0.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of elementwise minimum of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Calculates \p nv_bfloat162 vector maximum of two inputs, NaNs pass through.
+*
+* \details Calculates \p nv_bfloat162 vector max(\p a, \p b).
+* Elementwise \p nv_bfloat16 operation is defined as
+* (\p a > \p b) ? \p a : \p b.
+* - If either input is NaN, then canonical NaN is returned.
+* - If both inputs are zeros, +0.0 is treated as greater than -0.0.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of elementwise maximum of vectors \p a and \p b, with NaNs passed through.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Calculates \p nv_bfloat162 vector minimum of two inputs, NaNs pass through.
+*
+* \details Calculates \p nv_bfloat162 vector min(\p a, \p b).
+* Elementwise \p nv_bfloat16 operation is defined as
+* (\p a < \p b) ? \p a : \p b.
+* - If either input is NaN, then canonical NaN is returned.
+* - If both inputs are zeros, +0.0 is treated as greater than -0.0.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of elementwise minimum of vectors \p a and \p b, with NaNs passed through.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b);
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even
+* mode with relu saturation.
+*
+* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b,
+* then performs a \p nv_bfloat162 vector add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* Then negative result is clamped to 0.
+* NaN result is converted to canonical NaN.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+* \param[in] c - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs fast complex multiply-accumulate
+*
+* \details Interprets vector \p nv_bfloat162 input pairs \p a, \p b, and \p c as
+* complex numbers in \p nv_bfloat16 precision and performs
+* complex multiply-accumulate operation: a*b + c
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+* \param[in] c - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c);
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 square root in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 square root of input \p a in round-to-nearest-even mode.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The square root of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 reciprocal square root in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat16 reciprocal square root of input \p a in round-to-nearest-even
+* mode.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The reciprocal square root of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 reciprocal in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 reciprocal of input \p a in round-to-nearest-even mode.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The reciprocal of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 natural logarithm in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 natural logarithm of input \p a in round-to-nearest-even
+* mode.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The natural logarithm of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 binary logarithm in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 binary logarithm of input \p a in round-to-nearest-even
+* mode.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The binary logarithm of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 decimal logarithm in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 decimal logarithm of input \p a in round-to-nearest-even
+* mode.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The decimal logarithm of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 natural exponential function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat16 natural exponential function of input \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The natural exponential function on \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates approximate \p nv_bfloat16 hyperbolic tangent function.
+*
+* \details Calculates approximate \p nv_bfloat16 hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula.
+* This operation uses HW acceleration on devices of compute capability 9.x and higher.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The approximate hyperbolic tangent function of \p a.
+* - htanh_approx \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula.
+* - htanh_approx \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula.
+* - htanh_approx(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 htanh_approx(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector approximate hyperbolic tangent function.
+*
+* \details Calculates \p nv_bfloat162 approximate hyperbolic tangent function of input vector \p a.
+* This operation uses HW acceleration on devices of compute capability 9.x and higher.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The elementwise approximate hyperbolic tangent function on vector \p a.
+* 
+* \see htanh_approx(__nv_bfloat16) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh_approx(const __nv_bfloat162 a);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 hyperbolic tangent function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The hyperbolic tangent function of \p a.
+* - htanh \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula.
+* - htanh \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula.
+* - htanh(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 htanh(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector hyperbolic tangent function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat162 hyperbolic tangent function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The elementwise hyperbolic tangent function on vector \p a.
+* 
+* \see htanh(__nv_bfloat16) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh(const __nv_bfloat162 a);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 binary exponential function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat16 binary exponential function of input \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The binary exponential function on \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 decimal exponential function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat16 decimal exponential function of input \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The decimal exponential function on \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 cosine in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 cosine of input \p a in round-to-nearest-even mode.
+*
+* NOTE: this function's implementation calls cosf(float) function and is exposed
+* to compiler optimizations. Specifically, \p --use_fast_math flag changes cosf(float)
+* into an intrinsic __cosf(float), which has less accurate numeric behavior.
+*
+* \param[in] a - nv_bfloat16. Is only being read.
+* \returns nv_bfloat16
+* - The cosine of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 sine in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 sine of input \p a in round-to-nearest-even mode.
+*
+* NOTE: this function's implementation calls sinf(float) function and is exposed
+* to compiler optimizations. Specifically, \p --use_fast_math flag changes sinf(float)
+* into an intrinsic __sinf(float), which has less accurate numeric behavior.
+*
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The sine of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector square root in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat162 square root of input vector \p a in round-to-nearest-even
+* mode.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The elementwise square root on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector reciprocal square root in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat162 reciprocal square root of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The elementwise reciprocal square root on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector reciprocal in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat162 reciprocal of input vector \p a in round-to-nearest-even
+* mode.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The elementwise reciprocal on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector natural logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat162 natural logarithm of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The elementwise natural logarithm on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector binary logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat162 binary logarithm of input vector \p a in round-to-nearest-even
+* mode.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The elementwise binary logarithm on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2log2(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector decimal logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat162 decimal logarithm of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The elementwise decimal logarithm on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector exponential function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat162 exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The elementwise exponential function on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector binary exponential function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat162 binary exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The elementwise binary exponential function on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector decimal exponential function in
+* round-to-nearest-even mode.
+* 
+* \details Calculates \p nv_bfloat162 decimal exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The elementwise decimal exponential function on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector cosine in round-to-nearest-even mode.
+* 
+* \details Calculates \p nv_bfloat162 cosine of input vector \p a in round-to-nearest-even
+* mode.
+*
+* NOTE: this function's implementation calls cosf(float) function and is exposed
+* to compiler optimizations. Specifically, \p --use_fast_math flag changes cosf(float)
+* into an intrinsic __cosf(float), which has less accurate numeric behavior.
+*
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \returns nv_bfloat162
+* - The elementwise cosine on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector sine in round-to-nearest-even mode.
+* 
+* \details Calculates \p nv_bfloat162 sine of input vector \p a in round-to-nearest-even mode.
+*
+* NOTE: this function's implementation calls sinf(float) function and is exposed
+* to compiler optimizations. Specifically, \p --use_fast_math flag changes sinf(float)
+* into an intrinsic __sinf(float), which has less accurate numeric behavior.
+*
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \returns nv_bfloat162
+* - The elementwise sine on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Vector add \p val to the value stored at \p address in global or shared memory, and writes this
+* value back to \p address. The atomicity of the add operation is guaranteed separately for each of the
+* two nv_bfloat16 elements; the entire __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access.
+* 
+* \details The location of \p address must be in global or shared memory. This operation has undefined
+* behavior otherwise. This operation is natively supported by devices of compute capability 9.x and higher,
+* older devices use emulation path.
+* 
+* \param[in] address - __nv_bfloat162*. An address in global or shared memory.
+* \param[in] val - __nv_bfloat162. The value to be added.
+* 
+* \returns __nv_bfloat162
+* - The old value read from \p address.
+* 
+* \note_ref_guide_atomic
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val);
+
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Adds \p val to the value stored at \p address in global or shared memory, and writes this value
+* back to \p address. This operation is performed in one atomic operation.
+* 
+* \details The location of \p address must be in global or shared memory. This operation has undefined
+* behavior otherwise. This operation is natively supported by devices of compute capability 9.x and higher,
+* older devices of compute capability 7.x and 8.x use emulation path.
+* 
+* \param[in] address - __nv_bfloat16*. An address in global or shared memory.
+* \param[in] val - __nv_bfloat16. The value to be added.
+* 
+* \returns __nv_bfloat16
+* - The old value read from \p address.
+* 
+* \note_ref_guide_atomic
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val);
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) */
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+
+#endif /* defined(__cplusplus) */
+
+#if !defined(_MSC_VER) && __cplusplus >= 201103L
+#   define __CPP_VERSION_AT_LEAST_11_BF16
+#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
+#   define __CPP_VERSION_AT_LEAST_11_BF16
+#endif
+
+/* C++11 header for std::move. 
+ * In RTC mode, std::move is provided implicitly; don't include the header
+ */
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__)
+#include <utility>
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__) */
+
+/* C++ header for std::memcpy (used for type punning in host-side implementations).
+ * When compiling as a CUDA source file memcpy is provided implicitly.
+ * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
+ */
+#if defined(__cplusplus) && !defined(__CUDACC__)
+#include <cstring>
+#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
+
+// implicitly provided by NVRTC
+#if !defined(__CUDACC_RTC__)
+#include <nv/target>
+#endif  /* !defined(__CUDACC_RTC__) */
+
+#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))
+#define __CUDA_BF16_INLINE__
+#define __CUDA_BF16_FORCEINLINE__
+#else
+#define __CUDA_BF16_INLINE__ inline
+#define __CUDA_BF16_FORCEINLINE__ __forceinline__
+#endif /* #if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) */
+
+/* Set up structure-alignment attribute */
+#if defined(__CUDACC__)
+#define __CUDA_ALIGN__(align) __align__(align)
+#else
+/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16)
+#define __CUDA_ALIGN__(n) alignas(n)    /* C++11 kindly gives us a keyword for this */
+#else /* defined(__CPP_VERSION_AT_LEAST_11_BF16)*/
+#if defined(__GNUC__)
+#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#define __CUDA_ALIGN__(n) __declspec(align(n))
+#else
+#define __CUDA_ALIGN__(n)
+#endif /* defined(__GNUC__) */
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */
+#endif /* defined(__CUDACC__) */
+
+// define __CUDA_BF16_CONSTEXPR__ in order to
+// use constexpr where possible, with supporting C++ dialects
+// undef after use
+#if (defined __CPP_VERSION_AT_LEAST_11_BF16)
+#define __CUDA_BF16_CONSTEXPR__   constexpr
+#else
+#define __CUDA_BF16_CONSTEXPR__
+#endif
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+ * \brief __nv_bfloat16_raw data type
+ * \details Type allows static initialization of \p nv_bfloat16 until it becomes
+ * a built-in type.
+ * 
+ * - Note: this initialization is as a bit-field representation of \p nv_bfloat16,
+ * and not a conversion from \p short to \p nv_bfloat16.
+ * Such representation will be deprecated in a future version of CUDA.
+ * 
+ * - Note: this is visible to non-nvcc compilers, including C-only compilations
+ * 
+ * - Note: the structure requests 2-byte alignment through \p __CUDA_ALIGN__(2);
+ * on compilers for which no alignment syntax is known that macro expands to
+ * nothing and the natural alignment of the single \p unsigned \p short member applies.
+ */
+typedef struct __CUDA_ALIGN__(2) {
+    /**
+     * Storage field contains the bit representation of the \p nv_bfloat16 floating-point number.
+     */
+    unsigned short x;
+} __nv_bfloat16_raw;
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+ * \brief __nv_bfloat162_raw data type
+ * \details Type allows static initialization of \p nv_bfloat162 until it becomes
+ * a built-in type.
+ * 
+ * - Note: this initialization is as a bit-field representation of \p nv_bfloat162,
+ * and not a conversion from \p short2 to \p nv_bfloat162.
+ * Such representation will be deprecated in a future version of CUDA.
+ * 
+ * - Note: this is visible to non-nvcc compilers, including C-only compilations
+ * 
+ * - Note: the structure requests 4-byte alignment through \p __CUDA_ALIGN__(4);
+ * on compilers for which no alignment syntax is known that macro expands to
+ * nothing. The two \p unsigned \p short members are declared in the order
+ * lower part (\p x), then upper part (\p y).
+ */
+typedef struct __CUDA_ALIGN__(4) {
+    /**
+     * Storage field contains bits of the lower \p nv_bfloat16 part.
+     */
+    unsigned short x;
+    /**
+     * Storage field contains bits of the upper \p nv_bfloat16 part.
+     */
+    unsigned short y;
+} __nv_bfloat162_raw;
+
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+
+/* Hide GCC member initialization list warnings because of host/device in-function init requirement */
+#if defined(__GNUC__)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Weffc++"
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
+#endif /* defined(__GNUC__) */
+
+/* MSVC warning C4522: 'class' : multiple assignment operators specified
+   The class has multiple assignment operators of a single type. This warning is informational */
+#if defined(_MSC_VER) && _MSC_VER >= 1500
+#pragma warning( push )
+#pragma warning( disable:4522 )
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+ * \brief nv_bfloat16 datatype 
+ * 
+ * \details This structure implements the datatype for storing 
+ * nv_bfloat16 floating-point numbers. The structure implements 
+ * assignment operators and type conversions. 16 bits are being 
+ * used in total: 1 sign bit, 8 bits for the exponent, and 
+ * the significand is being stored in 7 bits. The total 
+ * precision is 8 bits.
+ * 
+ */
+struct __CUDA_ALIGN__(2) __nv_bfloat16 {
+protected:
+    /**
+     * Protected storage variable contains the bits of floating-point data.
+     */
+    unsigned short __x;
+
+public:
+
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * \brief Constructor by default.
+     * \details Empty default constructor, result is uninitialized.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16)
+    __nv_bfloat16() = default;
+#else
+    __CUDA_HOSTDEVICE__ __nv_bfloat16() { }
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */
+
+    /* Convert to/from __nv_bfloat16_raw */
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Constructor from \p __nv_bfloat16_raw.
+     */
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_CONSTEXPR__ __nv_bfloat16(const __nv_bfloat16_raw &hr) : __x(hr.x) { }
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Assignment operator from \p __nv_bfloat16_raw.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr);
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Assignment operator from \p __nv_bfloat16_raw to \p volatile \p __nv_bfloat16.
+     */
+    __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr) volatile;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Assignment operator from \p volatile \p __nv_bfloat16_raw to \p volatile \p __nv_bfloat16.
+     */
+    __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const volatile __nv_bfloat16_raw &hr) volatile;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Type cast to \p __nv_bfloat16_raw operator.
+     */
+    __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Type cast to \p __nv_bfloat16_raw operator with \p volatile input.
+     */
+    __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const volatile;
+
+#if !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__)
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16)
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Construct \p __nv_bfloat16 from \p __half input using default round-to-nearest-even rounding mode.
+     */
+    explicit __CUDA_HOSTDEVICE__ __nv_bfloat16(const __half f)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("{  cvt.rn.bf16.f16 %0, %1;}\n" : "=h"(__x) : "h"(__BFLOAT16_TO_CUS(f)));
+,
+    __x = __float2bfloat16(__half2float(f)).__x;
+)
+}
+#endif /* #if defined(__CPP_VERSION_AT_LEAST_11_BF16) */
+
+    /* Construct from float/double */
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Construct \p __nv_bfloat16 from \p float input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(const float f) { __x = __float2bfloat16(f).__x; }
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Construct \p __nv_bfloat16 from \p double input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(const double f) { __x = __double2bfloat16(f).__x; }
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Type cast to \p float operator.
+     */
+    __CUDA_HOSTDEVICE__ operator float() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Assignment operator to \p __nv_bfloat16 from \p float input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const float f);
+
+    /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Assignment operator to \p __nv_bfloat16 from \p double input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const double f);
+
+/*
+ * Implicit type conversions to/from integer types were only available to nvcc compilation.
+ * Introducing them for all compilers is a potentially breaking change that may affect
+ * overloads resolution and will require users to update their code.
+ * Define __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__ to opt-out.
+ */
+#if !(defined __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__)
+    /* Allow automatic construction from types supported natively in hardware */
+    /* Note we do avoid constructor init-list because of special host/device compilation rules */
+
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Construct \p __nv_bfloat16 from \p short integer input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(short val) { __x = __short2bfloat16_rn(val).__x; }
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Construct \p __nv_bfloat16 from \p unsigned \p short integer input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; }
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Construct \p __nv_bfloat16 from \p int input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(int val) { __x = __int2bfloat16_rn(val).__x; }
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Construct \p __nv_bfloat16 from \p unsigned \p int input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; }
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Construct \p __nv_bfloat16 from \p long input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(const long val) {
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(long) == sizeof(long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            __x = __ll2bfloat16_rn(static_cast<long long>(val)).__x;
+        } else {
+            __x = __int2bfloat16_rn(static_cast<int>(val)).__x;
+        }
+    }
+
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Construct \p __nv_bfloat16 from \p unsigned \p long input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(const unsigned long val) {
+        /* The width test below is a compile-time constant; silence VS warning C4127 (conditional expression is constant) around it */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(unsigned long) != sizeof(unsigned long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            __x = __uint2bfloat16_rn(static_cast<unsigned int>(val)).__x; /* unsigned long differs in size from unsigned long long: convert as unsigned int */
+        } else {
+            __x = __ull2bfloat16_rn(static_cast<unsigned long long>(val)).__x; /* unsigned long is as wide as unsigned long long */
+        }
+    }
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Construct \p __nv_bfloat16 from \p long \p long input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(long long val) : __x(__ll2bfloat16_rn(val).__x) { } /* adopts the storage bits of the round-to-nearest-even conversion */
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Construct \p __nv_bfloat16 from \p unsigned \p long \p long input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned long long val) : __x(__ull2bfloat16_rn(val).__x) { } /* adopts the storage bits of the round-to-nearest-even conversion */
+
+    /* Allow automatic casts to supported built-in types, matching all that are permitted with float */
+
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p signed \p char data type.
+     * Using round-toward-zero rounding mode.
+     * 
+     * See __bfloat162char_rz(__nv_bfloat16) for further details
+     */
+    __CUDA_HOSTDEVICE__ operator signed char() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p unsigned \p char data type.
+     * Using round-toward-zero rounding mode.
+     * 
+     * See __bfloat162uchar_rz(__nv_bfloat16) for further details
+     */
+    __CUDA_HOSTDEVICE__ operator unsigned char() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to an implementation defined \p char data type.
+     * Using round-toward-zero rounding mode.
+     * 
+     * Detects signedness of the \p char type and proceeds accordingly, see
+     * further details in signed and unsigned char operators.
+     */
+    __CUDA_HOSTDEVICE__ operator char() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p short data type.
+     * Using round-toward-zero rounding mode.
+     * 
+     * See __bfloat162short_rz(__nv_bfloat16) for further details
+     */
+    __CUDA_HOSTDEVICE__ operator short() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p unsigned \p short data type.
+     * Using round-toward-zero rounding mode.
+     * 
+     * See __bfloat162ushort_rz(__nv_bfloat16) for further details
+     */
+    __CUDA_HOSTDEVICE__ operator unsigned short() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p int data type.
+     * Using round-toward-zero rounding mode.
+     * 
+     * See __bfloat162int_rz(__nv_bfloat16) for further details
+     */
+    __CUDA_HOSTDEVICE__ operator int() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p unsigned \p int data type.
+     * Using round-toward-zero rounding mode.
+     * 
+     * See __bfloat162uint_rz(__nv_bfloat16) for further details
+     */
+    __CUDA_HOSTDEVICE__ operator unsigned int() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p long data type.
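+     * Using round-toward-zero rounding mode; see __bfloat162ll_rz(__nv_bfloat16) when \p long is as wide as \p long \p long, __bfloat162int_rz(__nv_bfloat16) otherwise.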
+     */
+    __CUDA_HOSTDEVICE__ operator long() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p unsigned \p long data type.
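+     * Using round-toward-zero rounding mode; see __bfloat162ull_rz(__nv_bfloat16) when \p unsigned \p long is as wide as \p unsigned \p long \p long, __bfloat162uint_rz(__nv_bfloat16) otherwise.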
+     */
+    __CUDA_HOSTDEVICE__ operator unsigned long() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p long \p long data type.
+     * Using round-toward-zero rounding mode.
+     * 
+     * See __bfloat162ll_rz(__nv_bfloat16) for further details
+     */
+    __CUDA_HOSTDEVICE__ operator long long() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p unsigned \p long \p long data type.
+     * Using round-toward-zero rounding mode.
+     * 
+     * See __bfloat162ull_rz(__nv_bfloat16) for further details
+     */
+    __CUDA_HOSTDEVICE__ operator unsigned long long() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Type cast from \p short assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(short val);
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Type cast from \p unsigned \p short assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned short val);
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Type cast from \p int assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(int val);
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Type cast from \p unsigned \p int assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned int val);
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Type cast from \p long \p long assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(long long val);
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Type cast from \p unsigned \p long \p long assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned long long val);
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p bool data type.
+     * The sign bit is masked off before the test, so +0 and -0 inputs convert to \p false.
+     * Every other input (NaN and infinities included, as their remaining bits are non-zero) converts to \p true.
+     */
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_CONSTEXPR__ operator bool() const { return (__x & 0x7FFFU) != 0U; }
+#endif /* !(defined __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) */
+#endif /* !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) */
+};
+
+#if !defined(__CUDA_NO_BFLOAT16_OPERATORS__)
+/* Some basic arithmetic operations expected of a built-in */
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 addition operation.
+ * See also __hadd(__nv_bfloat16, __nv_bfloat16)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 subtraction operation.
+ * See also __hsub(__nv_bfloat16, __nv_bfloat16)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 multiplication operation.
+ * See also __hmul(__nv_bfloat16, __nv_bfloat16)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator*(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 division operation.
+ * See also __hdiv(__nv_bfloat16, __nv_bfloat16)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator/(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 compound assignment with addition operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator+=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 compound assignment with subtraction operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator-=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 compound assignment with multiplication operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator*=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 compound assignment with division operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator/=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+
+/* Note for increment and decrement we use the raw value 0x3F80U equating to nv_bfloat16(1.0F), to avoid the extra conversion */
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 prefix increment operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator++(__nv_bfloat16 &h);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 prefix decrement operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator--(__nv_bfloat16 &h);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 postfix increment operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16  operator++(__nv_bfloat16 &h, const int ignored);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 postfix decrement operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16  operator--(__nv_bfloat16 &h, const int ignored);
+/* Unary plus and unary minus (negation) operators */
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Implements \p nv_bfloat16 unary plus operator, returns input value.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &h);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Implements \p nv_bfloat16 unary minus operator.
+ * See also __hneg(__nv_bfloat16)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &h);
+
+/* Some basic comparison operations to make it look like a built-in */
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+ * Performs \p nv_bfloat16 ordered compare equal operation.
+ * See also __heq(__nv_bfloat16, __nv_bfloat16)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+ * Performs \p nv_bfloat16 unordered compare not-equal operation.
+ * See also __hneu(__nv_bfloat16, __nv_bfloat16)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+ * Performs \p nv_bfloat16 ordered greater-than compare operation.
+ * See also __hgt(__nv_bfloat16, __nv_bfloat16)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator> (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+ * Performs \p nv_bfloat16 ordered less-than compare operation.
+ * See also __hlt(__nv_bfloat16, __nv_bfloat16)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator< (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+ * Performs \p nv_bfloat16 ordered greater-or-equal compare operation.
+ * See also __hge(__nv_bfloat16, __nv_bfloat16)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+ * Performs \p nv_bfloat16 ordered less-or-equal compare operation.
+ * See also __hle(__nv_bfloat16, __nv_bfloat16)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+#endif /* !defined(__CUDA_NO_BFLOAT16_OPERATORS__) */
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+ * \brief nv_bfloat162 datatype
+ * \details This structure implements the datatype for storing two
+ * nv_bfloat16 floating-point numbers.
+ * The structure implements assignment, arithmetic and comparison
+ * operators, and type conversions.
+ *
+ * - NOTE: __nv_bfloat162 is visible to non-nvcc host compilers
+ */
+struct __CUDA_ALIGN__(4) __nv_bfloat162 {
+    /**
+     * Storage field holding lower \p __nv_bfloat16 part.
+     */
+    __nv_bfloat16 x;
+    /**
+     * Storage field holding upper \p __nv_bfloat16 part.
+     */
+    __nv_bfloat16 y;
+
+    // Construction, copy, move and assignment
+public:
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * \brief Constructor by default.
+     * \details Empty default constructor, result is uninitialized.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16)
+    __nv_bfloat162() = default;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Move constructor, available for \p C++11 and later dialects
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat162(__nv_bfloat162 &&src);
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Move assignment operator, available for \p C++11 and later dialects
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(__nv_bfloat162 &&src);
+#else
+    __CUDA_HOSTDEVICE__ __nv_bfloat162(); /* dialects before C++11: declared here instead of being defaulted, no move operations */
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */
+
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Constructor from two \p __nv_bfloat16 variables: \p a initializes the lower part \p x, \p b the upper part \p y
+     */
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_CONSTEXPR__ __nv_bfloat162(const __nv_bfloat16 &a, const __nv_bfloat16 &b) : x(a), y(b) { }
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Copy constructor
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162 &src);
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Copy assignment operator
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162 &src);
+
+    /* Convert to/from __nv_bfloat162_raw */
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Constructor from \p __nv_bfloat162_raw
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162_raw &h2r );
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Assignment operator from \p __nv_bfloat162_raw
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162_raw &h2r);
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p __nv_bfloat162_raw
+     */
+    __CUDA_HOSTDEVICE__ operator __nv_bfloat162_raw() const;
+};
+
+#if !defined(__CUDA_NO_BFLOAT162_OPERATORS__)
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 addition operation.
+ * See also __hadd2(__nv_bfloat162, __nv_bfloat162)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator+(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 subtraction operation.
+ * See also __hsub2(__nv_bfloat162, __nv_bfloat162)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 multiplication operation.
+ * See also __hmul2(__nv_bfloat162, __nv_bfloat162)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator*(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 division operation.
+ * See also __h2div(__nv_bfloat162, __nv_bfloat162)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator/(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 compound assignment with addition operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator+=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 compound assignment with subtraction operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator-=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 compound assignment with multiplication operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator*=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 compound assignment with division operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator/=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 prefix increment operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator++(__nv_bfloat162 &h);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 prefix decrement operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator--(__nv_bfloat162 &h);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 postfix increment operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162  operator++(__nv_bfloat162 &h, const int ignored);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 postfix decrement operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162  operator--(__nv_bfloat162 &h, const int ignored);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Implements packed \p nv_bfloat16 unary plus operator, returns input value.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator+(const __nv_bfloat162 &h);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Implements packed \p nv_bfloat16 unary minus operator.
+ * See also __hneg2(__nv_bfloat162)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &h);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+ * Performs packed \p nv_bfloat16 ordered compare equal operation.
+ * See also __hbeq2(__nv_bfloat162, __nv_bfloat162)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+ * Performs packed \p nv_bfloat16 unordered compare not-equal operation.
+ * See also __hbneu2(__nv_bfloat162, __nv_bfloat162)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+ * Performs packed \p nv_bfloat16 ordered greater-than compare operation.
+ * See also __hbgt2(__nv_bfloat162, __nv_bfloat162)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+ * Performs packed \p nv_bfloat16 ordered less-than compare operation.
+ * See also __hblt2(__nv_bfloat162, __nv_bfloat162)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+ * Performs packed \p nv_bfloat16 ordered greater-or-equal compare operation.
+ * See also __hbge2(__nv_bfloat162, __nv_bfloat162)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+ * Performs packed \p nv_bfloat16 ordered less-or-equal compare operation.
+ * See also __hble2(__nv_bfloat162, __nv_bfloat162)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+
+#endif /* !defined(__CUDA_NO_BFLOAT162_OPERATORS__) */
+
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16)
+#if !defined(__CUDA_NO_HALF_CONVERSIONS__) /* guarded by the __half conversion opt-out macro: the constructor defined below belongs to __half */
+__CUDA_HOSTDEVICE__ 
+#ifdef __CUDACC_RTC__ /* with __CUDACC_RTC__ defined plain inline is used instead of the force-inline macro */
+inline
+#else
+__CUDA_BF16_FORCEINLINE__ 
+#endif /* __CUDACC_RTC__ */
+__half::__half(const __nv_bfloat16 f)
+{ /* When NV_PROVIDES_SM_90 holds, convert with one cvt.rn.f16.bf16 instruction; otherwise go through float via __bfloat162float and __float2half_rn */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("{  cvt.rn.f16.bf16 %0, %1;}\n" : "=h"(__x) : "h"(__BFLOAT16_TO_CUS(f)));
+,
+    __x = __float2half_rn(__bfloat162float(f)).__x;
+)
+}
+#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */
+#endif /* #if defined(__CPP_VERSION_AT_LEAST_11_BF16) */
+
+#endif /* defined(__cplusplus) */
+
+#if (defined(__FORCE_INCLUDE_CUDA_BF16_HPP_FROM_BF16_H__) || \
+    !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))))
+/* Note the .hpp file is included to capture the "nv_bfloat16" & "nv_bfloat162" built-in function definitions. For NVRTC, the built-in
+   function definitions are compiled at NVRTC library build-time and are available through the NVRTC built-ins library at
+   link time.
+*/
+#include "cuda_bf16.hpp"
+#endif /* (defined(__FORCE_INCLUDE_CUDA_BF16_HPP_FROM_BF16_H__) || \
+          !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) */
+
+/* Define first-class types "nv_bfloat16" and "nv_bfloat162", unless user specifies otherwise via "#define CUDA_NO_BFLOAT16" */
+/* C cannot ever have these types defined here, because __nv_bfloat16 and __nv_bfloat162 are C++ classes */
+#if defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+ * \brief This datatype is meant to be the first-class or fundamental
+ * implementation of the bfloat16 numbers format.
+ * 
+ * \details Should be implemented in the compiler in the future.
+ * Current implementation is a simple typedef to a respective
+ * user-level type with underscores.
+ */
+typedef __nv_bfloat16  nv_bfloat16;
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+ * \brief This datatype is meant to be the first-class or fundamental
+ * implementation of type for pairs of bfloat16 numbers.
+ * 
+ * \details Should be implemented in the compiler in the future.
+ * Current implementation is a simple typedef to a respective
+ * user-level type with underscores.
+ */
+typedef __nv_bfloat162 nv_bfloat162;
+
+#endif /* defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16) */
+
+#undef __CUDA_BF16_DECL__
+#undef __CUDA_HOSTDEVICE_BF16_DECL__
+#undef __CUDA_HOSTDEVICE__
+#undef __CUDA_BF16_INLINE__
+#undef __CUDA_BF16_FORCEINLINE__
+#undef ___CUDA_BF16_STRINGIFY_INNERMOST
+#undef __CUDA_BF16_STRINGIFY
+
+#endif /* end of include guard: __CUDA_BF16_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_bf16.hpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_bf16.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..55568d4e4dd07a42b616593c621637f56a030f69
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_bf16.hpp
@@ -0,0 +1,3865 @@
+/*
+* Copyright 1993-2024 NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO LICENSEE:
+*
+* This source code and/or documentation ("Licensed Deliverables") are
+* subject to NVIDIA intellectual property rights under U.S. and
+* international Copyright laws.
+*
+* These Licensed Deliverables contained herein is PROPRIETARY and
+* CONFIDENTIAL to NVIDIA and is being provided under the terms and
+* conditions of a form of NVIDIA software license agreement by and
+* between NVIDIA and Licensee ("License Agreement") or electronically
+* accepted by Licensee.  Notwithstanding any terms or conditions to
+* the contrary in the License Agreement, reproduction or disclosure
+* of the Licensed Deliverables to any third party without the express
+* written consent of NVIDIA is prohibited.
+*
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+* OF THESE LICENSED DELIVERABLES.
+*
+* U.S. Government End Users.  These Licensed Deliverables are a
+* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+* 1995), consisting of "commercial computer software" and "commercial
+* computer software documentation" as such terms are used in 48
+* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+* only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+* U.S. Government End Users acquire the Licensed Deliverables with
+* only those rights set forth herein.
+*
+* Any use of the Licensed Deliverables in individual and commercial
+* software must include, in the user documentation and internal
+* comments to the code, the above Disclaimer and U.S. Government End
+* Users Notice.
+*/
+
+#if !defined(__CUDA_BF16_HPP__)
+#define __CUDA_BF16_HPP__
+
+#if !defined(__CUDA_BF16_H__)
+#error "Do not include this file directly. Instead, include cuda_bf16.h."
+#endif
+
+#if !defined(IF_DEVICE_OR_CUDACC) /* IF_DEVICE_OR_CUDACC(d, c, f) forwards to NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, x), where x is c when __CUDACC__ is defined and f otherwise */
+#if defined(__CUDACC__)
+    #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, c)
+#else
+    #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, f)
+#endif /* defined(__CUDACC__) */
+#endif /* !defined(IF_DEVICE_OR_CUDACC) */
+
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS
+ * \brief Defines floating-point positive infinity value for the \p nv_bfloat16 data type
+ */
+#define CUDART_INF_BF16            __ushort_as_bfloat16((unsigned short)0x7F80U)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS
+ * \brief Defines canonical NaN value for the \p nv_bfloat16 data type
+ */
+#define CUDART_NAN_BF16            __ushort_as_bfloat16((unsigned short)0x7FFFU)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS
+ * \brief Defines the smallest positive (denormalized) value representable in the \p nv_bfloat16 data type
+ */
+#define CUDART_MIN_DENORM_BF16     __ushort_as_bfloat16((unsigned short)0x0001U)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS
+ * \brief Defines the largest finite (normal) value representable in the \p nv_bfloat16 data type
+ */
+#define CUDART_MAX_NORMAL_BF16     __ushort_as_bfloat16((unsigned short)0x7F7FU)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS
+ * \brief Defines a negative zero value for the \p nv_bfloat16 data type
+ */
+#define CUDART_NEG_ZERO_BF16       __ushort_as_bfloat16((unsigned short)0x8000U)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS
+ * \brief Defines a positive zero value for the \p nv_bfloat16 data type
+ */
+#define CUDART_ZERO_BF16           __ushort_as_bfloat16((unsigned short)0x0000U)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS
+ * \brief Defines a value of 1.0 for the \p nv_bfloat16 data type
+ */
+#define CUDART_ONE_BF16            __ushort_as_bfloat16((unsigned short)0x3F80U)
+
+#if !(defined __DOXYGEN_ONLY__)
+
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(const __nv_bfloat16_raw &hr) { __x = hr.x; return *this; } /* takes over the raw storage value as is, no numeric conversion */
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ volatile __nv_bfloat16 &__nv_bfloat16::operator=(const __nv_bfloat16_raw &hr) volatile { __x = hr.x; return *this; } /* raw storage assignment for volatile destination objects */
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ volatile __nv_bfloat16 &__nv_bfloat16::operator=(const volatile __nv_bfloat16_raw &hr) volatile { __x = hr.x; return *this; } /* raw storage assignment where both source and destination are volatile */
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator __nv_bfloat16_raw() const { __nv_bfloat16_raw raw; raw.x = __x; return raw; } /* hands out a copy of the stored value */
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator __nv_bfloat16_raw() const volatile { __nv_bfloat16_raw raw; raw.x = __x; return raw; } /* hands out a copy of the stored value, callable on volatile objects */
+
+#if !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__)
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator float() const { return __bfloat162float(*this); } /* delegates to __bfloat162float */
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(const float f) { __x = __float2bfloat16(f).__x; return *this; } /* converts with __float2bfloat16 and keeps its storage bits */
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(const double f) { __x = __double2bfloat16(f).__x; return *this; } /* converts with __double2bfloat16 and keeps its storage bits */
+
+/*
+ * Implicit type conversions to/from integer types were only available to nvcc compilation.
+ * Introducing them for all compilers is a potentially breaking change that may affect
+ * overloads resolution and will require users to update their code.
+ * Define __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__ to opt-out.
+ */
+#if !(defined __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__)
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator signed char() const { return __bfloat162char_rz(*this); } /* round-toward-zero, delegates to __bfloat162char_rz */
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned char() const { return __bfloat162uchar_rz(*this); } /* round-toward-zero, delegates to __bfloat162uchar_rz */
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator char() const {
+        /* Plain char may be signed or unsigned: route through the signed or the unsigned intrinsic accordingly. */
+        /* The signedness probe is a compile-time constant; silence VS warning C4127 (conditional expression is constant) around it. */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (((char)-1) < (char)0)
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            /* char is signed */
+            return static_cast<char>(__bfloat162char_rz(*this));
+        }
+        else
+        {
+            return static_cast<char>(__bfloat162uchar_rz(*this));
+        }
+    }
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator short() const { return __bfloat162short_rz(*this); } /* round-toward-zero, delegates to __bfloat162short_rz */
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned short() const { return __bfloat162ushort_rz(*this); } /* round-toward-zero, delegates to __bfloat162ushort_rz */
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator int() const { return __bfloat162int_rz(*this); } /* round-toward-zero, delegates to __bfloat162int_rz */
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned int() const { return __bfloat162uint_rz(*this); } /* round-toward-zero, delegates to __bfloat162uint_rz */
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator long() const {
+        /* Round-toward-zero conversion through the long long or the int intrinsic, chosen by the size of long. */
+        /* The size test is a compile-time constant; silence VS warning C4127 (conditional expression is constant) around it. */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(long) == sizeof(long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            /* long is as wide as long long */
+            return static_cast<long>(__bfloat162ll_rz(*this));
+        }
+        else
+        {
+            return static_cast<long>(__bfloat162int_rz(*this));
+        }
+    }
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned long() const {
+        /* Round-toward-zero conversion through the unsigned long long or the unsigned int intrinsic, chosen by the size of unsigned long. */
+        /* The size test is a compile-time constant; silence VS warning C4127 (conditional expression is constant) around it. */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(unsigned long) == sizeof(unsigned long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            /* unsigned long is as wide as unsigned long long */
+            return static_cast<unsigned long>(__bfloat162ull_rz(*this));
+        }
+        else
+        {
+            return static_cast<unsigned long>(__bfloat162uint_rz(*this));
+        }
+    }
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator long long() const { return __bfloat162ll_rz(*this); } // conversion to long long: forwards to the _rz helper
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned long long() const { return __bfloat162ull_rz(*this); } // conversion to unsigned long long: forwards to the _rz helper
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(short val) { __x = __short2bfloat16_rn(val).__x; return *this; } // assign from short: convert with the _rn helper and store its raw bits
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; return *this; } // assign from unsigned short via the _rn helper
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(int val) { __x = __int2bfloat16_rn(val).__x; return *this; } // assign from int via the _rn helper
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; return *this; } // assign from unsigned int via the _rn helper
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(long long val) { __x = __ll2bfloat16_rn(val).__x; return *this; } // assign from long long via the _rn helper
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; return *this; } // assign from unsigned long long via the _rn helper
+#endif /* !(defined __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) */
+#endif /* !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) */
+
+
+#if !defined(__CUDA_NO_BFLOAT16_OPERATORS__)
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hadd(lh, rh); } // binary +: forwards to __hadd
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hsub(lh, rh); } // binary -: forwards to __hsub
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator*(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hmul(lh, rh); } // binary *: forwards to __hmul
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator/(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hdiv(lh, rh); } // binary /: forwards to __hdiv
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator+=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hadd(lh, rh); return lh; } // compound +=: stores __hadd result in lh and returns lh
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator-=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hsub(lh, rh); return lh; } // compound -=: stores __hsub result in lh and returns lh
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator*=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hmul(lh, rh); return lh; } // compound *=: stores __hmul result in lh and returns lh
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator/=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hdiv(lh, rh); return lh; } // compound /=: stores __hdiv result in lh and returns lh
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator++(__nv_bfloat16 &h)      { __nv_bfloat16_raw one; one.x = 0x3F80U; h += one; return h; } // prefix ++: adds the raw pattern 0x3F80 (upper half of float 1.0f, 0x3F800000) and returns the updated h
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator--(__nv_bfloat16 &h)      { __nv_bfloat16_raw one; one.x = 0x3F80U; h -= one; return h; } // prefix --: subtracts the same 0x3F80 constant and returns the updated h
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16  operator++(__nv_bfloat16 &h, const int ignored) // postfix ++: increments h and returns its previous value
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __nv_bfloat16 ret = h; // snapshot taken before the increment; this is what the caller receives
+    __nv_bfloat16_raw one;
+    one.x = 0x3F80U; // raw pattern 0x3F80 is the upper half of float 1.0f (0x3F800000)
+    h += one;
+    return ret;
+}
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16  operator--(__nv_bfloat16 &h, const int ignored) // postfix --: decrements h and returns its previous value
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __nv_bfloat16 ret = h; // snapshot taken before the decrement; this is what the caller receives
+    __nv_bfloat16_raw one;
+    one.x = 0x3F80U; // raw pattern 0x3F80 is the upper half of float 1.0f (0x3F800000)
+    h -= one;
+    return ret;
+}
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &h) { return h; } // unary +: returns a copy of h unchanged
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &h) { return __hneg(h); } // unary -: forwards to __hneg
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __heq(lh, rh); } // ==: forwards to __heq
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hneu(lh, rh); } // !=: forwards to __hneu
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator> (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hgt(lh, rh); } // >: forwards to __hgt
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator< (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hlt(lh, rh); } // <: forwards to __hlt
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hge(lh, rh); } // >=: forwards to __hge
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hle(lh, rh); } // <=: forwards to __hle
+#endif /* !defined(__CUDA_NO_BFLOAT16_OPERATORS__) */
+
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16)
+__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::__nv_bfloat162(__nv_bfloat162 &&src) { // move constructor (only built when __CPP_VERSION_AT_LEAST_11_BF16 is defined)
+NV_IF_ELSE_TARGET(NV_IS_DEVICE, // device path: single assignment through the __BFLOAT162_TO_UI / __BFLOAT162_TO_CUI views (macros defined outside this view; presumably a 32-bit reinterpretation)
+    __BFLOAT162_TO_UI(*this) = std::move(__BFLOAT162_TO_CUI(src));
+, // host path: copy the two halves member by member
+    this->x = src.x;
+    this->y = src.y;
+)
+}
+__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162 &__nv_bfloat162::operator=(__nv_bfloat162 &&src) { // move assignment; returns *this
+NV_IF_ELSE_TARGET(NV_IS_DEVICE, // device path: single assignment through the __BFLOAT162_TO_UI / __BFLOAT162_TO_CUI views (macros defined outside this view)
+    __BFLOAT162_TO_UI(*this) = std::move(__BFLOAT162_TO_CUI(src));
+, // host path: copy the two halves member by member
+    this->x = src.x;
+    this->y = src.y;
+)
+    return *this;
+}
+#else
+__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::__nv_bfloat162() { } // empty-bodied default constructor, compiled only when __CPP_VERSION_AT_LEAST_11_BF16 is not defined
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::__nv_bfloat162(const __nv_bfloat162 &src) { // copy constructor
+NV_IF_ELSE_TARGET(NV_IS_DEVICE, // device path: single assignment through the __BFLOAT162_TO_UI / __BFLOAT162_TO_CUI views (macros defined outside this view)
+   __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(src);
+, // host path: copy the two halves member by member
+    this->x = src.x;
+    this->y = src.y;
+)
+}
+__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162 &__nv_bfloat162::operator=(const __nv_bfloat162 &src) { // copy assignment; returns *this
+NV_IF_ELSE_TARGET(NV_IS_DEVICE, // device path: single assignment through the __BFLOAT162_TO_UI / __BFLOAT162_TO_CUI views (macros defined outside this view)
+   __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(src);
+, // host path: copy the two halves member by member
+    this->x = src.x;
+    this->y = src.y;
+)
+    return *this;
+}
+__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::__nv_bfloat162(const __nv_bfloat162_raw &h2r ) { // construct from the raw two-element representation
+NV_IF_ELSE_TARGET(NV_IS_DEVICE, // device path: single assignment through the __BFLOAT162_TO_UI / __BFLOAT162_TO_CUI views (macros defined outside this view)
+    __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(h2r);
+, // host path: rebuild each half from its raw bits through a scalar raw temporary
+    __nv_bfloat16_raw tr; // reused for both halves
+    tr.x = h2r.x;
+    this->x = static_cast<__nv_bfloat16>(tr);
+    tr.x = h2r.y; // tr.x now carries the raw bits of the second (y) half
+    this->y = static_cast<__nv_bfloat16>(tr);
+)
+}
+__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162 &__nv_bfloat162::operator=(const __nv_bfloat162_raw &h2r) { // assign from the raw two-element representation; returns *this
+NV_IF_ELSE_TARGET(NV_IS_DEVICE, // device path: single assignment through the __BFLOAT162_TO_UI / __BFLOAT162_TO_CUI views (macros defined outside this view)
+    __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(h2r);
+, // host path: rebuild each half from its raw bits through a scalar raw temporary
+    __nv_bfloat16_raw tr; // reused for both halves
+    tr.x = h2r.x;
+    this->x = static_cast<__nv_bfloat16>(tr);
+    tr.x = h2r.y; // tr.x now carries the raw bits of the second (y) half
+    this->y = static_cast<__nv_bfloat16>(tr);
+)
+    return *this;
+}
+__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::operator __nv_bfloat162_raw() const { // conversion to the raw two-element representation
+    __nv_bfloat162_raw ret;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE, // device path: zero both fields, then overwrite through the __BFLOAT162_TO_UI view
+    ret.x = 0U;
+    ret.y = 0U;
+    __BFLOAT162_TO_UI(ret) = __BFLOAT162_TO_CUI(*this);
+, // host path: take the raw bits of each half separately
+    ret.x = static_cast<__nv_bfloat16_raw>(this->x).x;
+    ret.y = static_cast<__nv_bfloat16_raw>(this->y).x;
+)
+    return ret;
+}
+
+#if !defined(__CUDA_NO_BFLOAT162_OPERATORS__)
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator+(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hadd2(lh, rh); } // binary +: forwards to __hadd2
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hsub2(lh, rh); } // binary -: forwards to __hsub2
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator*(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hmul2(lh, rh); } // binary *: forwards to __hmul2
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator/(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __h2div(lh, rh); } // binary /: forwards to __h2div
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator+=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hadd2(lh, rh); return lh; } // compound +=: stores __hadd2 result in lh and returns lh
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator-=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hsub2(lh, rh); return lh; } // compound -=: stores __hsub2 result in lh and returns lh
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator*=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hmul2(lh, rh); return lh; } // compound *=: stores __hmul2 result in lh and returns lh
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator/=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __h2div(lh, rh); return lh; } // compound /=: stores __h2div result in lh and returns lh
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator++(__nv_bfloat162 &h)      { __nv_bfloat162_raw one; one.x = 0x3F80U; one.y = 0x3F80U; h = __hadd2(h, one); return h; } // prefix ++: adds raw 0x3F80 (upper half of float 1.0f) to both halves and returns the updated h
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator--(__nv_bfloat162 &h)      { __nv_bfloat162_raw one; one.x = 0x3F80U; one.y = 0x3F80U; h = __hsub2(h, one); return h; } // prefix --: subtracts the same constant from both halves and returns the updated h
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162  operator++(__nv_bfloat162 &h, const int ignored) // postfix ++: increments both halves of h and returns the previous value
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __nv_bfloat162 ret = h; // snapshot taken before the increment; this is what the caller receives
+    __nv_bfloat162_raw one;
+    one.x = 0x3F80U; // raw pattern 0x3F80 is the upper half of float 1.0f (0x3F800000)
+    one.y = 0x3F80U;
+    h = __hadd2(h, one);
+    return ret;
+}
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162  operator--(__nv_bfloat162 &h, const int ignored) // postfix --: decrements both halves of h and returns the previous value
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __nv_bfloat162 ret = h; // snapshot taken before the decrement; this is what the caller receives
+    __nv_bfloat162_raw one;
+    one.x = 0x3F80U; // raw pattern 0x3F80 is the upper half of float 1.0f (0x3F800000)
+    one.y = 0x3F80U;
+    h = __hsub2(h, one);
+    return ret;
+}
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator+(const __nv_bfloat162 &h) { return h; } // unary +: returns a copy of h unchanged
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &h) { return __hneg2(h); } // unary -: forwards to __hneg2
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbeq2(lh, rh); } // ==: single bool result from __hbeq2
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbneu2(lh, rh); } // !=: single bool result from __hbneu2
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbgt2(lh, rh); } // >: single bool result from __hbgt2
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hblt2(lh, rh); } // <: single bool result from __hblt2
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbge2(lh, rh); } // >=: single bool result from __hbge2
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hble2(lh, rh); } // <=: single bool result from __hble2
+#endif /* !defined(__CUDA_NO_BFLOAT162_OPERATORS__) */
+
+/* Restore warning for multiple assignment operators */
+#if defined(_MSC_VER) && _MSC_VER >= 1500
+#pragma warning( pop )
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
+
+/* Restore -Weffc++ warnings from here on */
+#if defined(__GNUC__)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic pop
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
+#endif /* defined(__GNUC__) */
+
+#undef __CUDA_HOSTDEVICE__
+#undef __CUDA_ALIGN__
+
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __internal_float_as_uint(const float f) // returns the object representation of f as an unsigned int (bit copy, no numeric conversion)
+{
+    unsigned int u;
+IF_DEVICE_OR_CUDACC( // three-way dispatch; which argument is selected is decided by the macro, defined outside this view
+    u = __float_as_uint(f);
+, // second alternative: unqualified memcpy
+    memcpy(&u, &f, sizeof(f));
+, // third alternative: std::memcpy
+    std::memcpy(&u, &f, sizeof(f));
+)
+    return u;
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ float __internal_uint_as_float(const unsigned int u) // returns the float whose object representation equals u (bit copy, no numeric conversion)
+{
+    float f;
+IF_DEVICE_OR_CUDACC( // three-way dispatch; which argument is selected is decided by the macro, defined outside this view
+    f = __uint_as_float(u);
+, // second alternative: unqualified memcpy
+    memcpy(&f, &u, sizeof(u));
+, // third alternative: std::memcpy
+    std::memcpy(&f, &u, sizeof(u));
+)
+    return f;
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short __internal_float2bfloat16(const float f, unsigned int &sign, unsigned int &remainder) // truncates f to its upper 16 bits; reports the sign bit and the discarded low bits so callers can apply a rounding mode
+{
+    unsigned int x;
+
+    x = __internal_float_as_uint(f);
+
+    if ((x & 0x7fffffffU) > 0x7f800000U) { // magnitude bits above the infinity pattern: f is NaN
+        sign = 0U;
+        remainder = 0U; // zero remainder, so callers' rounding steps leave the NaN pattern untouched
+        return static_cast<unsigned short>(0x7fffU); // every NaN input maps to the single pattern 0x7FFF
+    }
+    sign = x >> 31U; // 1 for negative inputs, 0 otherwise
+    remainder = x << 16U; // the 16 dropped low bits, moved to the top of the word (0x80000000 marks exactly half)
+    return static_cast<unsigned short>(x >> 16U); // upper 16 bits of the float
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ float __internal_double2float_rn(const double x) // narrows a double to float
+{
+    float r;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE, // device path: explicit PTX cvt.rn.f32.f64
+    asm("cvt.rn.f32.f64 %0, %1;" : "=f"(r) : "d"(x));
+, // host path: ordinary C++ narrowing conversion
+    r = static_cast<float>(x);
+)
+    return r;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ double __internal_float2double(const float x) // widens a float to double
+{
+    double r;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE, // device path: explicit PTX cvt.f64.f32
+    asm("cvt.f64.f32 %0, %1;" : "=d"(r) : "f"(x));
+, // host path: ordinary C++ widening conversion
+    r = static_cast<double>(x);
+)
+    return r;
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double x) // converts a double to bfloat16
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: single PTX cvt.rn.bf16.f64 instruction
+    __nv_bfloat16 val;
+    asm("{  cvt.rn.bf16.f64 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "d"(x));
+    return val;
+, // fallback: go through float, adjusting the float bits before the final float -> bfloat16 rounding
+    float f = __internal_double2float_rn(x);
+    const double d = __internal_float2double(f); // f widened back, to compare against x exactly
+    unsigned int u = __internal_float_as_uint(f);
+
+    bool x_is_not_nan = ((u << (unsigned)1U) <= (unsigned)0xFF000000U); // sign dropped; at or below the infinity pattern means f is not NaN
+
+
+    if ((x > 0.0) && (d > x)) { // positive x was rounded up: step the float one ulp back toward zero
+        u--;
+    }
+    if ((x < 0.0) && (d < x)) { // negative x was rounded away from zero: step one ulp back toward zero
+        u--;
+    }
+    if ((d != x) && x_is_not_nan) { // float is inexact: force the lowest bit on as a marker (presumably to avoid double rounding -- confirm)
+        u |= 1U;
+    }
+
+    f = __internal_uint_as_float(u);
+
+    return __float2bfloat16(f);
+)
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a) // converts float to bfloat16; body is the same as __float2bfloat16_rn
+{
+    __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // SM 8.0 path: PTX cvt.rn.bf16.f32
+    asm("{  cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a));
+, // fallback: truncate, then round on the discarded bits
+    __nv_bfloat16_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2bfloat16(a, sign, remainder);
+    if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { // round up when above half, or exactly half with an odd truncated value (ties go to even)
+        r.x++;
+    }
+    val = r;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a) // converts float to bfloat16, rounding to nearest with ties to even
+{
+    __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // SM 8.0 path: PTX cvt.rn.bf16.f32
+    asm("{  cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a));
+, // fallback: truncate, then round on the discarded bits
+    __nv_bfloat16_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2bfloat16(a, sign, remainder);
+    if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { // round up when above half, or exactly half with an odd truncated value (ties go to even)
+        r.x++;
+    }
+    val = r;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a) // converts float to bfloat16, rounding toward zero
+{
+    __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // SM 8.0 path: PTX cvt.rz.bf16.f32
+    asm("{  cvt.rz.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a));
+, // fallback: plain truncation; sign and remainder are computed but not used
+    __nv_bfloat16_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2bfloat16(a, sign, remainder);
+    val = r;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a) // converts float to bfloat16, rounding down (toward negative infinity)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: PTX cvt.rm.bf16.f32
+    __nv_bfloat16 val;
+    asm("{  cvt.rm.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a));
+    return val;
+, // fallback: truncate, then adjust negative inexact values
+    __nv_bfloat16 val;
+    __nv_bfloat16_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2bfloat16(a, sign, remainder);
+    if ((remainder != 0U) && (sign != 0U)) { // negative and inexact: truncation moved toward zero, so bump the magnitude by one step
+        r.x++;
+    }
+    val = r;
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a) // converts float to bfloat16, rounding up (toward positive infinity)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: PTX cvt.rp.bf16.f32
+    __nv_bfloat16 val;
+    asm("{  cvt.rp.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a));
+    return val;
+, // fallback: truncate, then adjust non-negative inexact values
+    __nv_bfloat16 val;
+    __nv_bfloat16_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2bfloat16(a, sign, remainder);
+    if ((remainder != 0U) && (sign == 0U)) { // non-negative and inexact: truncation moved toward zero, so bump the magnitude by one step
+        r.x++;
+    }
+    val = r;
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a) // converts a to bfloat16 with the rn conversion and places the result in both halves of the pair
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // SM 8.0 path: convert once, then pack {low,low} into the 32-bit result
+    asm("{.reg .b16 low;\n"
+        "  cvt.rn.bf16.f32 low, %1;\n"
+        "  mov.b32 %0, {low,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "f"(a));
+, // fallback: call the scalar conversion for each half
+    val = __nv_bfloat162(__float2bfloat16_rn(a), __float2bfloat16_rn(a));
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __floats2bfloat162_rn(const float a, const float b) // converts two floats with the rn conversion and packs them into one pair (a first, b second in the fallback constructor call)
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // SM 8.0 path: packed PTX conversion; note the source operands are given as %2 (b) then %1 (a)
+    asm("{ cvt.rn.bf16x2.f32 %0, %2, %1;}\n"
+        : "=r"(__BFLOAT162_TO_UI(val)) : "f"(a), "f"(b));
+, // fallback: two scalar conversions
+    val = __nv_bfloat162(__float2bfloat16_rn(a), __float2bfloat16_rn(b));
+)
+    return val;
+}
+
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ float __internal_device_bfloat162float(const unsigned short h) // device-only: widens raw bfloat16 bits h to float
+{
+    float f;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: PTX cvt.f32.bf16
+    asm("{ cvt.f32.bf16 %0, %1;}\n" : "=f"(f) : "h"(h));
+, // fallback: build the 32-bit value from the 16-bit pair {0, h}
+    asm("{ mov.b32 %0, {0,%1};}\n" : "=f"(f) : "h"(h));
+)
+    return f;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+__CUDA_HOSTDEVICE_BF16_DECL__ float __internal_bfloat162float(const unsigned short h) // widens raw bfloat16 bits h to float
+{
+    float f;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE, // device path: use the device helper
+    f = __internal_device_bfloat162float(h);
+, // host path: the 16 bits become the upper half of the float's bit pattern, lower half zero
+    unsigned int u = static_cast<unsigned int>(h) << 16;
+    f = __internal_uint_as_float(u);
+)
+    return f;
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a) // converts a bfloat16 value to float
+{
+    return __internal_bfloat162float(static_cast<__nv_bfloat16_raw>(a).x); // extract the raw 16 bits, then widen
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a) // converts the .x element of the pair to float
+{
+    return __internal_bfloat162float(static_cast<__nv_bfloat162_raw>(a).x); // raw bits of the x element, then widen
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a) // converts the .y element of the pair to float
+{
+    return __internal_bfloat162float(static_cast<__nv_bfloat162_raw>(a).y); // raw bits of the y element, then widen
+}
+
+/* CUDA vector-types compatible vector creation function (note returns __nv_bfloat162, not nv_bfloat162) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y) // builds a pair whose .x is x and .y is y
+{
+    __nv_bfloat162 t; t.x = x; t.y = y; return t; // default-construct, then set each member
+}
+
+/* Definitions of intrinsics */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a) // converts a float2 to a bfloat16 pair by forwarding a.x and a.y to __floats2bfloat162_rn
+{
+    __nv_bfloat162 val = __floats2bfloat162_rn(a.x, a.y);
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a) // widens both elements of a bfloat16 pair into a float2
+{
+    float hi_float;
+    float lo_float;
+    lo_float = __internal_bfloat162float(((__nv_bfloat162_raw)a).x); // .x element
+    hi_float = __internal_bfloat162float(((__nv_bfloat162_raw)a).y); // .y element
+    return make_float2(lo_float, hi_float); // result.x comes from a.x, result.y from a.y
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h) // device-only: bfloat16 -> int using the rn (round-to-nearest) integer conversion
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: PTX cvt.rni.s32.bf16
+    int val;
+    asm("{  cvt.rni.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+    return val;
+, // fallback: widen to float, then use the float intrinsic
+    return __float2int_rn(__bfloat162float(h));
+)
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+__CUDA_HOSTDEVICE_BF16_DECL__ int __internal_bfloat162int_rz(const __nv_bfloat16 h) // bfloat16 -> int, truncating; the host path returns 0 for NaN and saturates out-of-range values
+{
+    const float f = __bfloat162float(h);
+    int   i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE, // device path: float intrinsic
+    i = __float2int_rz(f);
+, // host path: explicit NaN and range handling before the C++ cast
+    const int max_val = (int)0x7fffffffU;
+    const int min_val = (int)0x80000000U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U); // raw bits with the sign shifted out
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) { // above the shifted infinity pattern
+        // NaN
+        i = 0;
+    } else if (f >= static_cast<float>(max_val)) { // >= because max_val rounds up to 2^31 when converted to float
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        i = static_cast<int>(f); // in range: the cast truncates toward zero
+    }
+)
+    return i;
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h) // bfloat16 -> int, rounding toward zero
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: PTX cvt.rzi.s32.bf16
+    int val;
+    asm("{  cvt.rzi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+    return val;
+, // everything else: shared host/device helper
+    return __internal_bfloat162int_rz(h);
+)
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h) // device-only: bfloat16 -> int using the PTX rmi (round toward negative infinity) conversion
+{
+    int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: convert directly from bf16
+    asm("{  cvt.rmi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+, // fallback: widen to float, then the same rounding mode on f32
+    const float f = __bfloat162float(h);
+    asm("cvt.rmi.s32.f32 %0, %1;" : "=r"(val) : "f"(f));
+)
+    return val;
+}
+__CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h) // device-only: bfloat16 -> int using the PTX rpi (round toward positive infinity) conversion
+{
+    int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: convert directly from bf16
+    asm("{  cvt.rpi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+, // fallback: widen to float, then the same rounding mode on f32
+    const float f = __bfloat162float(h);
+    asm("cvt.rpi.s32.f32 %0, %1;" : "=r"(val) : "f"(f));
+)
+    return val;
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_int2bfloat16_rn(const int i) // device-only: int -> bfloat16 with the rn conversion
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: PTX cvt.rn.bf16.s32
+        __nv_bfloat16 val;
+       asm("cvt.rn.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+       return val;
+, // fallback: go through float, marking inexact int -> float conversions before the final rounding
+        const float ru = __int2float_ru(i);
+        const float rd = __int2float_rd(i);
+        float rz = __int2float_rz(i);
+        if (ru != rd) { // the two directed roundings differ, so i is not exactly representable as float
+            rz = __uint_as_float(__float_as_uint(rz) | 1U); // force the lowest bit on so the lost precision stays visible to the next rounding step
+        }
+        return __float2bfloat16_rn(rz);
+)
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i) // int -> bfloat16 with the rn conversion
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE, // device path: device helper
+    return __internal_device_int2bfloat16_rn(i);
+, // host path: widen to double first, then reuse the double conversion
+    const double d = static_cast<double>(i);
+    return __double2bfloat16(d);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ signed char __bfloat162char_rz(const __nv_bfloat16 h) // bfloat16 -> signed char, truncating; the fallback returns 0 for NaN and saturates out-of-range values
+{
+    signed char i;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: PTX cvt.rzi.s8.bf16 into an 8-bit register, returned inside a 16-bit temporary
+    unsigned short tmp = 0;
+    asm("{ .reg.b8 myreg;\n"
+        "  cvt.rzi.s8.bf16 myreg, %1;\n"
+        "  mov.b16 %0, {myreg, 0};\n}"
+         :"=h"(tmp) : "h"(__BFLOAT16_TO_CUS(h)));
+    const unsigned char u = static_cast<unsigned char>(tmp); // keep the 8-bit result packed by the mov above
+    i = static_cast<signed char>(u);
+, // fallback: explicit NaN and range handling before the C++ cast
+    const float f = __bfloat162float(h);
+    const signed char max_val = (signed char)0x7fU;
+    const signed char min_val = (signed char)0x80U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U); // raw bits with the sign shifted out
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) { // above the shifted infinity pattern
+        // NaN
+        i = 0;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<signed char>(f);
+    }
+)
+    return i;
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned char __bfloat162uchar_rz(const __nv_bfloat16 h) // bfloat16 -> unsigned char, truncating; the fallback returns 0 for NaN and saturates out-of-range values
+{
+    unsigned char i;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: PTX cvt.rzi.u8.bf16 into an 8-bit register, returned inside a 16-bit temporary
+    unsigned short tmp = 0;
+    asm("{ .reg.b8 myreg;\n"
+        "  cvt.rzi.u8.bf16 myreg, %1;\n"
+        "  mov.b16 %0, {myreg, 0};\n}"
+         :"=h"(tmp) : "h"(__BFLOAT16_TO_CUS(h)));
+    i = static_cast<unsigned char>(tmp); // keep the 8-bit result packed by the mov above
+, // fallback: explicit NaN and range handling before the C++ cast
+    const float f = __bfloat162float(h);
+    const unsigned char max_val = 0xffU;
+    const unsigned char min_val = 0U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U); // raw bits with the sign shifted out
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) { // above the shifted infinity pattern
+        // NaN
+        i = 0U;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) { // any negative value clamps to 0
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<unsigned char>(f);
+    }
+)
+    return i;
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i) // device-only: int -> bfloat16, rounding toward zero
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: PTX cvt.rz.bf16.s32
+     __nv_bfloat16 val;
+    asm("cvt.rz.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+    return val;
+, // fallback: int -> float -> bfloat16, both steps with the rz mode
+    return __float2bfloat16_rz(__int2float_rz(i));
+)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i) // device-only: int -> bfloat16, rounding down
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: PTX cvt.rm.bf16.s32
+     __nv_bfloat16 val;
+    asm("cvt.rm.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+    return val;
+, // fallback: int -> float -> bfloat16, both steps with the rd mode
+    return __float2bfloat16_rd(__int2float_rd(i));
+)
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i) // device-only: int -> bfloat16, rounding up
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: PTX cvt.rp.bf16.s32
+     __nv_bfloat16 val;
+    asm("cvt.rp.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+    return val;
+, // fallback: int -> float -> bfloat16, both steps with the ru mode
+    return __float2bfloat16_ru(__int2float_ru(i));
+)
+}
+
+__CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h) // device-only: bfloat16 -> short using the PTX rni conversion
+{
+   short int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: convert directly from bf16
+   asm("cvt.rni.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+, // fallback: build an f32 from the pair {0, h}, then convert that
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rni.s16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+)
+   return val;
+}
+
+__CUDA_BF16_DECL__ short int __internal_device_bfloat162short_rz(const __nv_bfloat16 h) // device-only: bfloat16 -> short using the PTX rzi conversion
+{
+    short int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: convert directly from bf16
+    asm("cvt.rzi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+, // fallback: build an f32 from the pair {0, h}, then convert that
+    asm("{ .reg.f32 f;\n"
+        "  mov.b32 f, {0,%1};\n"
+        "  cvt.rzi.s16.f32 %0,f;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+)
+    return val;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h) // bfloat16 -> short, truncating; the host path returns 0 for NaN and saturates out-of-range values
+{
+    short int val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE, // device path: device helper
+    val = __internal_device_bfloat162short_rz(h);
+, // host path: explicit NaN and range handling before the C++ cast
+    const float f = __bfloat162float(h);
+    const short int max_val = (short int)0x7fffU;
+    const short int min_val = (short int)0x8000U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U); // raw bits with the sign shifted out
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) { // above the shifted infinity pattern
+        // NaN
+        val = 0;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        val = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        val = min_val;
+    } else {
+        val = static_cast<short int>(f); // in range: the cast truncates toward zero
+    }
+)
+   return val;
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h) // device-only: bfloat16 -> short using the PTX rmi conversion
+{
+   short int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: convert directly from bf16
+   asm("cvt.rmi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+, // fallback: build an f32 from the pair {0, h}, then convert that
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rmi.s16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+)
+   return val;
+}
+__CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h) // device-only: bfloat16 -> short using the PTX rpi conversion
+{
+   short int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: convert directly from bf16
+   asm("cvt.rpi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+, // fallback: build an f32 from the pair {0, h}, then convert that
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rpi.s16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+)
+   return val;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i) // short -> bfloat16 with the rn conversion
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: PTX cvt.rn.bf16.s16
+    __nv_bfloat16 val;
+    asm("cvt.rn.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+, // everything else: cast to float, then the float rn conversion
+    const float f = static_cast<float>(i);
+    return __float2bfloat16_rn(f);
+)
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i) // device-only: short -> bfloat16, rounding toward zero
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: PTX cvt.rz.bf16.s16
+    __nv_bfloat16 val;
+    asm("cvt.rz.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+, // fallback: widen to int, then int -> float -> bfloat16 with the rz mode
+    return __float2bfloat16_rz(__int2float_rz(static_cast<int>(i)));
+)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i) // device-only: short -> bfloat16, rounding down
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: PTX cvt.rm.bf16.s16
+    __nv_bfloat16 val;
+    asm("cvt.rm.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+, // fallback: widen to int, then int -> float -> bfloat16 with the rd mode
+    return __float2bfloat16_rd(__int2float_rd(static_cast<int>(i)));
+)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i) // device-only: short -> bfloat16, rounding up
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: PTX cvt.rp.bf16.s16
+    __nv_bfloat16 val;
+    asm("cvt.rp.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+, // fallback: widen to int, then int -> float -> bfloat16 with the ru mode
+    return __float2bfloat16_ru(__int2float_ru(static_cast<int>(i)));
+)
+}
+
+__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h) // device-only: bfloat16 -> unsigned int using the rn integer conversion
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: PTX cvt.rni.u32.bf16
+    unsigned int val;
+    asm("{  cvt.rni.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+    return val;
+, // fallback: widen to float, then use the float intrinsic
+    return __float2uint_rn(__bfloat162float(h));
+)
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __internal_bfloat162uint_rz(const __nv_bfloat16 h) // bfloat16 -> unsigned int, truncating; the host path returns 0 for NaN and saturates out-of-range values
+{
+    const float f = __bfloat162float(h);
+    unsigned int i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE, // device path: float intrinsic
+    i = __float2uint_rz(f);
+, // host path: explicit NaN and range handling before the C++ cast
+    const unsigned int max_val = 0xffffffffU;
+    const unsigned int min_val = 0U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U); // raw bits with the sign shifted out
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) { // above the shifted infinity pattern
+        // NaN
+        i = 0U;
+    } else if (f >= static_cast<float>(max_val)) { // >= because max_val rounds up to 2^32 when converted to float
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) { // any negative value clamps to 0
+        // saturate minimum
+        i = min_val;
+    } else {
+        i = static_cast<unsigned int>(f); // in range: the cast truncates toward zero
+    }
+)
+    return i;
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h) // bfloat16 -> unsigned int, rounding toward zero
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: PTX cvt.rzi.u32.bf16
+    unsigned int val;
+    asm("{  cvt.rzi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+    return val;
+, // everything else: shared host/device helper
+    return __internal_bfloat162uint_rz(h);
+)
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h) // device-only: bfloat16 -> unsigned int using the rd (round-down) conversion
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: PTX cvt.rmi.u32.bf16
+    unsigned int val;
+    asm("{  cvt.rmi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+    return val;
+, // fallback: widen to float, then use the float intrinsic
+    return __float2uint_rd(__bfloat162float(h));
+)
+}
+__CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const __nv_bfloat16 h) // device-only: bfloat16 -> unsigned int using the PTX rpi (round-up) conversion
+{
+    unsigned int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: convert directly from bf16
+    asm("{  cvt.rpi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+, // fallback: widen to float, then the same rounding mode on f32
+    const float f = __bfloat162float(h);
+    asm("cvt.rpi.u32.f32 %0, %1;" : "=r"(val) : "f"(f));
+)
+    return val;
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_uint2bfloat16_rn(const unsigned int i) // device-only: unsigned int -> bfloat16 with the rn conversion
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM 9.0 path: PTX cvt.rn.bf16.u32
+    __nv_bfloat16 val;
+    asm("cvt.rn.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+    return val;
+, // fallback: go through float, marking inexact uint -> float conversions before the final rounding
+    const float ru = __uint2float_ru(i);
+    const float rd = __uint2float_rd(i);
+    float rz = __uint2float_rz(i);
+    if (ru != rd) { // the two directed roundings differ, so i is not exactly representable as float
+        rz = __uint_as_float(__float_as_uint(rz) | 1U); // force the lowest bit on so the lost precision stays visible to the next rounding step
+    }
+    return __float2bfloat16_rn(rz);
+)
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+// Unsigned int -> bfloat16 with round-to-nearest, usable from host and device.
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    // Device code: hand over to the device-only implementation.
+    return __internal_device_uint2bfloat16_rn(i);
+,
+    // Host code: widen to double and let __double2bfloat16 do the rounding.
+    return __double2bfloat16(static_cast<double>(i));
+)
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+// Unsigned int -> bfloat16, rounding toward zero. Uses cvt.rz.bf16.u32 when
+// NV_PROVIDES_SM_90 holds; otherwise goes through float with the matching
+// _rz conversions.
+__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+     __nv_bfloat16 val;
+    asm("cvt.rz.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+    return val;
+,
+    return __float2bfloat16_rz(__uint2float_rz(i));
+)
+}
+// Unsigned int -> bfloat16, rounding toward negative infinity (PTX .rm).
+// Uses cvt.rm.bf16.u32 when NV_PROVIDES_SM_90 holds; otherwise goes through
+// float with the matching _rd conversions.
+__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+     __nv_bfloat16 val;
+    asm("cvt.rm.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+    return val;
+,
+    return __float2bfloat16_rd(__uint2float_rd(i));
+)
+}
+// Unsigned int -> bfloat16, rounding toward positive infinity (PTX .rp).
+// Uses cvt.rp.bf16.u32 when NV_PROVIDES_SM_90 holds; otherwise goes through
+// float with the matching _ru conversions.
+__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+     __nv_bfloat16 val;
+    asm("cvt.rp.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+    return val;
+,
+    return __float2bfloat16_ru(__uint2float_ru(i));
+)
+}
+
+// bfloat16 -> unsigned short, rounding to the nearest integer (PTX .rni).
+// Without NV_PROVIDES_SM_90 the 16 input bits are placed in the upper half of
+// a 32-bit register (lower half zero) to form an f32, which is then converted.
+__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rn(const __nv_bfloat16 h)
+{
+   unsigned short int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm("cvt.rni.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+,
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rni.u16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+)
+   return val;
+}
+
+// Device-only helper: bfloat16 -> unsigned short, rounding toward zero
+// (PTX .rzi). Without NV_PROVIDES_SM_90 the input bits are widened to f32 by
+// packing them into the upper half of a 32-bit register before converting.
+__CUDA_BF16_DECL__ unsigned short int __internal_device_bfloat162ushort_rz(const __nv_bfloat16 h)
+{
+   unsigned short int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm("cvt.rzi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+,
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rzi.u16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+)
+   return val;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+// bfloat16 -> unsigned short with truncation, usable from host and device.
+// The host path saturates: NaN gives 0, values above 0xffff give 0xffff and
+// negative values give 0.
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h)
+{
+   unsigned short int val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+   val = __internal_device_bfloat162ushort_rz(h);
+,
+    const unsigned short int upper = 0xffffU;
+    const unsigned short int lower = 0U;
+    // Dropping the sign bit leaves exponent and mantissa; anything above
+    // 0xFF00 has an all-ones exponent with a non-zero mantissa (NaN).
+    const unsigned short magnitude = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U);
+    const float as_float = __bfloat162float(h);
+    if (magnitude > (unsigned short)0xFF00U) {
+        val = 0U;
+    } else if (as_float < static_cast<float>(lower)) {
+        // below range: clamp to the minimum
+        val = lower;
+    } else if (as_float > static_cast<float>(upper)) {
+        // above range: clamp to the maximum
+        val = upper;
+    } else {
+        val = static_cast<unsigned short int>(as_float);
+    }
+)
+   return val;
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+// bfloat16 -> unsigned short, rounding toward negative infinity (PTX .rmi).
+// Without NV_PROVIDES_SM_90 the input bits are widened to f32 by packing them
+// into the upper half of a 32-bit register before converting.
+__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h)
+{
+   unsigned short int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm("cvt.rmi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+,
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rmi.u16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+)
+   return val;
+}
+// bfloat16 -> unsigned short, rounding toward positive infinity (PTX .rpi).
+// Without NV_PROVIDES_SM_90 the input bits are widened to f32 by packing them
+// into the upper half of a 32-bit register before converting.
+__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h)
+{
+   unsigned short int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm("cvt.rpi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+,
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rpi.u16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+)
+   return val;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+// Unsigned short -> bfloat16 with round-to-nearest. Uses cvt.rn.bf16.u16 when
+// NV_PROVIDES_SM_90 holds; otherwise converts to float and rounds that with
+// __float2bfloat16_rn.
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 val;
+    asm("cvt.rn.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+,
+    const float f = static_cast<float>(i);
+    return __float2bfloat16_rn(f);
+)
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+// Unsigned short -> bfloat16, rounding toward zero. Uses cvt.rz.bf16.u16 when
+// NV_PROVIDES_SM_90 holds; otherwise widens to unsigned int and goes through
+// float with the _rz conversions.
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 val;
+    asm("cvt.rz.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+,
+    return __float2bfloat16_rz(__uint2float_rz(static_cast<unsigned int>(i)));
+)
+}
+// Unsigned short -> bfloat16, rounding toward negative infinity (PTX .rm).
+// Uses cvt.rm.bf16.u16 when NV_PROVIDES_SM_90 holds; otherwise widens to
+// unsigned int and goes through float with the _rd conversions.
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 val;
+    asm("cvt.rm.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+,
+    return __float2bfloat16_rd(__uint2float_rd(static_cast<unsigned int>(i)));
+)
+}
+// Unsigned short -> bfloat16, rounding toward positive infinity (PTX .rp).
+// Uses cvt.rp.bf16.u16 when NV_PROVIDES_SM_90 holds; otherwise widens to
+// unsigned int and goes through float with the _ru conversions.
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 val;
+    asm("cvt.rp.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+,
+    return __float2bfloat16_ru(__uint2float_ru(static_cast<unsigned int>(i)));
+)
+}
+
+// bfloat16 -> unsigned long long, rounding to the nearest integer (PTX .rni).
+// Without NV_PROVIDES_SM_90 the value is widened to float and converted with
+// __float2ull_rn.
+__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    unsigned long long int i;
+    asm("cvt.rni.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+    return i;
+,
+    return __float2ull_rn(__bfloat162float(h));
+)
+}
+
+// Device-only helper: bfloat16 -> unsigned long long, rounding toward zero
+// (PTX .rzi). Without NV_PROVIDES_SM_90 the value is widened to float and
+// converted with __float2ull_rz.
+__CUDA_BF16_DECL__ unsigned long long int __internal_device_bfloat162ull_rz(const __nv_bfloat16 h)
+{
+    unsigned long long int i;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("cvt.rzi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+,
+    const float f = __bfloat162float(h);
+    i = __float2ull_rz(f);
+)
+    return i;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+// bfloat16 -> unsigned long long with truncation, usable from host and device.
+// The host path saturates: NaN gives 0x8000000000000000, values at or above
+// the converted maximum give the maximum and negative values give 0.
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_bfloat162ull_rz(h);
+,
+    const unsigned long long int upper = 0xffffffffffffffffULL;
+    const unsigned long long int lower = 0ULL;
+    // Dropping the sign bit leaves exponent and mantissa; anything above
+    // 0xFF00 has an all-ones exponent with a non-zero mantissa (NaN).
+    const unsigned short magnitude = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U);
+    if (magnitude > (unsigned short)0xFF00U) {
+        return 0x8000000000000000ULL;
+    }
+    const float as_float = __bfloat162float(h);
+    if (as_float >= static_cast<float>(upper)) {
+        // too large: clamp to the maximum
+        return upper;
+    }
+    if (as_float < static_cast<float>(lower)) {
+        // negative: clamp to the minimum
+        return lower;
+    }
+    return static_cast<unsigned long long int>(as_float);
+)
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+// bfloat16 -> unsigned long long, rounding toward negative infinity
+// (PTX .rmi). Without NV_PROVIDES_SM_90 the value is widened to float and
+// converted with __float2ull_rd.
+__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    unsigned long long int i;
+    asm("cvt.rmi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+    return i;
+,
+    return __float2ull_rd(__bfloat162float(h));
+)
+}
+// bfloat16 -> unsigned long long, rounding toward positive infinity
+// (PTX .rpi). Without NV_PROVIDES_SM_90 the value is widened to float and the
+// f32 form of the same cvt instruction is used.
+__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h)
+{
+    unsigned long long int i;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("cvt.rpi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+,
+    const float f = __bfloat162float(h);
+    asm("cvt.rpi.u64.f32 %0, %1;" : "=l"(i) : "f"(f));
+)
+    return i;
+}
+
+// Device-only helper: unsigned long long -> bfloat16 with round-to-nearest.
+// When NV_PROVIDES_SM_90 holds a single cvt.rn.bf16.u64 is issued. Otherwise the
+// integer is converted to float three ways (up, down, toward zero) and the
+// truncated float is rounded to bfloat16.
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_ull2bfloat16_rn(const unsigned long long int i)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 h;
+    asm("cvt.rn.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+,
+    const float ru = __ull2float_ru(i);
+    const float rd = __ull2float_rd(i);
+    float rz = __ull2float_rz(i);
+    // ru != rd means the integer is not exactly representable as a float;
+    // the lowest mantissa bit is then forced on (it looks like a sticky bit
+    // that keeps the final rounding from treating the value as exact).
+    if (ru != rd) {
+        rz = __uint_as_float(__float_as_uint(rz) | 1U);
+    }
+    return __float2bfloat16_rn(rz);
+)
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+// Unsigned long long -> bfloat16 with round-to-nearest, usable from host and
+// device. Device code delegates to __internal_device_ull2bfloat16_rn. Host code
+// converts to float, adjusts the float bit pattern when that conversion was
+// inexact, and then rounds the adjusted float to bfloat16.
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_ull2bfloat16_rn(i);
+,
+    float f = static_cast<float>(i);
+    // Convert back to detect whether (and in which direction) f differs from i.
+    const unsigned long long int uf = static_cast<unsigned long long int>(f);
+    unsigned int u = __internal_float_as_uint(f);
+    // round up happened here
+    // note: no need to handle round up to f == 0x1.p64 specially
+    if (uf > i) {
+        // step the bit pattern back by one unit
+        u--;
+    }
+    if (uf != i) {
+        // inexact conversion: force the lowest mantissa bit on
+        u |= 1U;
+    }
+    f = __internal_uint_as_float(u);
+    return __float2bfloat16_rn(f);
+)
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+// Unsigned long long -> bfloat16, rounding toward zero. Uses cvt.rz.bf16.u64
+// when NV_PROVIDES_SM_90 holds; otherwise goes through float with the _rz
+// conversions.
+__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 h;
+    asm("cvt.rz.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+,
+    return __float2bfloat16_rz(__ull2float_rz(i));
+)
+}
+// Unsigned long long -> bfloat16, rounding toward negative infinity (PTX .rm).
+// Uses cvt.rm.bf16.u64 when NV_PROVIDES_SM_90 holds; otherwise goes through
+// float with the _rd conversions.
+__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 h;
+    asm("cvt.rm.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+,
+    return __float2bfloat16_rd(__ull2float_rd(i));
+)
+}
+// Unsigned long long -> bfloat16, rounding toward positive infinity (PTX .rp).
+// Uses cvt.rp.bf16.u64 when NV_PROVIDES_SM_90 holds; otherwise goes through
+// float with the _ru conversions.
+__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 h;
+    asm("cvt.rp.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+,
+    return __float2bfloat16_ru(__ull2float_ru(i));
+)
+}
+// bfloat16 -> long long, rounding to the nearest integer (PTX .rni).
+// Without NV_PROVIDES_SM_90 the value is widened to float and converted with
+// __float2ll_rn.
+__CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    long long int i;
+    asm("cvt.rni.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+    return i;
+,
+    return __float2ll_rn(__bfloat162float(h));
+)
+}
+
+// Device-only helper: bfloat16 -> long long, rounding toward zero (PTX .rzi).
+// Without NV_PROVIDES_SM_90 the value is widened to float and converted with
+// __float2ll_rz.
+__CUDA_BF16_DECL__ long long int __internal_device_bfloat162ll_rz(const __nv_bfloat16 h)
+{
+    long long int i;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("cvt.rzi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+,
+    const float f = __bfloat162float(h);
+    i = __float2ll_rz(f);
+)
+    return i;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+// bfloat16 -> long long with truncation, usable from host and device.
+// The host path saturates: NaN gives the minimum value, and inputs outside
+// the representable range are clamped to the maximum or minimum.
+__CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_bfloat162ll_rz(h);
+,
+    const long long int upper = (long long int)0x7fffffffffffffffULL;
+    const long long int lower = (long long int)0x8000000000000000ULL;
+    // Dropping the sign bit leaves exponent and mantissa; anything above
+    // 0xFF00 has an all-ones exponent with a non-zero mantissa (NaN).
+    const unsigned short magnitude = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U);
+    if (magnitude > (unsigned short)0xFF00U) {
+        return lower;
+    }
+    const float as_float = __bfloat162float(h);
+    if (as_float >= static_cast<float>(upper)) {
+        // too large: clamp to the maximum
+        return upper;
+    }
+    if (as_float < static_cast<float>(lower)) {
+        // too small: clamp to the minimum
+        return lower;
+    }
+    return static_cast<long long int>(as_float);
+)
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+// bfloat16 -> long long, rounding toward negative infinity (PTX .rmi).
+// Without NV_PROVIDES_SM_90 the value is widened to float and the f32 form of
+// the same cvt instruction is used.
+__CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h)
+{
+    long long int i;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("cvt.rmi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+,
+    const float f = __bfloat162float(h);
+    asm("cvt.rmi.s64.f32 %0, %1;" : "=l"(i) : "f"(f));
+)
+    return i;
+}
+// bfloat16 -> long long, rounding toward positive infinity (PTX .rpi).
+// Without NV_PROVIDES_SM_90 the value is widened to float and the f32 form of
+// the same cvt instruction is used.
+__CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h)
+{
+    long long int i;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("cvt.rpi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+,
+    const float f = __bfloat162float(h);
+    asm("cvt.rpi.s64.f32 %0, %1;" : "=l"(i) : "f"(f));
+)
+    return i;
+}
+
+// Device-only helper: long long -> bfloat16 with round-to-nearest.
+// When NV_PROVIDES_SM_90 holds a single cvt.rn.bf16.s64 is issued. Otherwise the
+// integer is converted to float three ways (up, down, toward zero) and the
+// truncated float is rounded to bfloat16.
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_ll2bfloat16_rn(const long long int i)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 h;
+    asm("cvt.rn.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+,
+    const float ru = __ll2float_ru(i);
+    const float rd = __ll2float_rd(i);
+    float rz = __ll2float_rz(i);
+    // ru != rd means the integer is not exactly representable as a float;
+    // the lowest mantissa bit is then forced on (it looks like a sticky bit
+    // that keeps the final rounding from treating the value as exact).
+    if (ru != rd) {
+        rz = __uint_as_float(__float_as_uint(rz) | 1U);
+    }
+    return __float2bfloat16_rn(rz);
+)
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+// Long long -> bfloat16 with round-to-nearest, usable from host and device.
+// Device code delegates to __internal_device_ll2bfloat16_rn. Host code converts
+// to float, adjusts the float bit pattern when that conversion was inexact,
+// and then rounds the adjusted float to bfloat16.
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_ll2bfloat16_rn(i);
+,
+    float f = static_cast<float>(i);
+    // Convert back to detect whether (and in which direction) f differs from i.
+    const long long int lf = static_cast<long long int>(f);
+    unsigned int u = __internal_float_as_uint(f);
+
+    // Positive value whose float conversion grew in magnitude: step back one unit.
+    if ((f > 0.0f) && (lf > i)) {
+        u--;
+    }
+    // Negative value whose float conversion grew in magnitude: step back one unit.
+    if ((f < 0.0f) && (lf < i)) {
+        u--;
+    }
+    // Inexact conversion: force the lowest mantissa bit on.
+    if (lf != i) {
+        u |= 1U;
+    }
+
+    f = __internal_uint_as_float(u);
+    return __float2bfloat16_rn(f);
+)
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+// Long long -> bfloat16, rounding toward zero. Uses cvt.rz.bf16.s64 when
+// NV_PROVIDES_SM_90 holds; otherwise goes through float with the _rz
+// conversions.
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 h;
+    asm("cvt.rz.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+,
+    return __float2bfloat16_rz(__ll2float_rz(i));
+)
+}
+// Long long -> bfloat16, rounding toward negative infinity (PTX .rm).
+// Uses cvt.rm.bf16.s64 when NV_PROVIDES_SM_90 holds; otherwise goes through
+// float with the _rd conversions.
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 h;
+    asm("cvt.rm.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+,
+    return __float2bfloat16_rd(__ll2float_rd(i));
+)
+}
+// Long long -> bfloat16, rounding toward positive infinity (PTX .rp).
+// Uses cvt.rp.bf16.s64 when NV_PROVIDES_SM_90 holds; otherwise goes through
+// float with the _ru conversions.
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 h;
+    asm("cvt.rp.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+,
+    return __float2bfloat16_ru(__ll2float_ru(i));
+)
+}
+
+// Rounds a bfloat16 to an integral value toward zero, returning bfloat16.
+// Uses cvt.rzi.bf16.bf16 when NV_PROVIDES_SM_90 holds; otherwise applies
+// truncf to the float-widened value and converts back.
+__CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 r;
+    asm("cvt.rzi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h)));
+    return r;
+,
+    return __float2bfloat16_rz(truncf(__bfloat162float(h)));
+)
+}
+// Rounds a bfloat16 to an integral value toward positive infinity, returning
+// bfloat16. Uses cvt.rpi.bf16.bf16 when NV_PROVIDES_SM_90 holds; otherwise
+// the float-widened value is rounded in place with cvt.rpi.f32.f32 and
+// converted back.
+__CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 r;
+    asm("cvt.rpi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h)));
+    return r;
+,
+    float fh = __bfloat162float(h);
+    // "+f" makes fh both the input and the output of the instruction.
+    asm( "{ cvt.rpi.f32.f32 %0, %0; }\n"
+        :"+f"(fh));
+    return __float2bfloat16_rz(fh);
+)
+}
+// Rounds a bfloat16 to an integral value toward negative infinity, returning
+// bfloat16. Uses cvt.rmi.bf16.bf16 when NV_PROVIDES_SM_90 holds; otherwise
+// the float-widened value is rounded in place with cvt.rmi.f32.f32 and
+// converted back.
+__CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 r;
+    asm("cvt.rmi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h)));
+    return r;
+,
+    float fh = __bfloat162float(h);
+    // "+f" makes fh both the input and the output of the instruction.
+    asm( "{ cvt.rmi.f32.f32 %0, %0; }\n"
+        :"+f"(fh));
+    return __float2bfloat16_rz(fh);
+)
+}
+// Rounds a bfloat16 to the nearest integral value (PTX .rni), returning
+// bfloat16. Uses cvt.rni.bf16.bf16 when NV_PROVIDES_SM_90 holds; otherwise
+// applies rintf to the float-widened value and converts back.
+__CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 r;
+    asm("cvt.rni.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h)));
+    return r;
+,
+    return __float2bfloat16_rz(rintf(__bfloat162float(h)));
+)
+}
+
+// Applies htrunc to each half of a __nv_bfloat162 and repacks the results.
+__CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h)
+{
+    return __nv_bfloat162(htrunc(h.x), htrunc(h.y));
+}
+// Applies hceil to each half of a __nv_bfloat162 and repacks the results.
+__CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h)
+{
+    return __nv_bfloat162(hceil(h.x), hceil(h.y));
+}
+// Applies hfloor to each half of a __nv_bfloat162 and repacks the results.
+__CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h)
+{
+    return __nv_bfloat162(hfloor(h.x), hfloor(h.y));
+}
+
+// Applies hrint to the low and high halves of a __nv_bfloat162 and repacks
+// the two results with __halves2bfloat162.
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h)
+{
+    const __nv_bfloat16 rounded_low  = hrint(__low2bfloat16(h));
+    const __nv_bfloat16 rounded_high = hrint(__high2bfloat16(h));
+    return __halves2bfloat162(rounded_low, rounded_high);
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+// Builds a __nv_bfloat162 from the low halves of a and b: result.x = a.x and
+// result.y = b.x. Device code does this with register unpack/pack moves; host
+// code assigns the members directly.
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .b16 alow,ahigh,blow,bhigh;\n"
+        "  mov.b32 {alow,ahigh}, %1;\n"
+        "  mov.b32 {blow,bhigh}, %2;\n"
+        "  mov.b32 %0, {alow,blow};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)), "r"(__BFLOAT162_TO_CUI(b)));
+,
+    val.x = a.x;
+    val.y = b.x;
+)
+    return val;
+}
+// Builds a __nv_bfloat162 from the high halves of a and b: result.x = a.y and
+// result.y = b.y. Device code does this with register unpack/pack moves; host
+// code assigns the members directly.
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .b16 alow,ahigh,blow,bhigh;\n"
+        "  mov.b32 {alow,ahigh}, %1;\n"
+        "  mov.b32 {blow,bhigh}, %2;\n"
+        "  mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)), "r"(__BFLOAT162_TO_CUI(b)));
+,
+    val.x = a.y;
+    val.y = b.y;
+)
+    return val;
+}
+// Returns the low half (member x) of a __nv_bfloat162.
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a)
+{
+    __nv_bfloat16 ret;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    // Unpack the 32-bit pair into two 16-bit registers and keep the low one.
+    asm("{.reg .b16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n"
+        " mov.b16 %0, low;}" : "=h"(__BFLOAT16_TO_US(ret)) : "r"(__BFLOAT162_TO_CUI(a)));
+,
+    ret = a.x;
+)
+    return ret;
+}
+// Classifies a bfloat16 by its raw bit pattern: returns 1 for 0x7F80
+// (positive infinity), -1 for 0xFF80 (negative infinity) and 0 otherwise.
+__CUDA_HOSTDEVICE_BF16_DECL__ int __hisinf(const __nv_bfloat16 a)
+{
+    const __nv_bfloat16_raw bits = __nv_bfloat16_raw(a);
+    if (bits.x == 0x7F80U) {
+        return 1;
+    }
+    if (bits.x == 0xFF80U) {
+        return -1;
+    }
+    return 0;
+}
+// Returns a __nv_bfloat162 whose two halves both equal the low half of a.
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a)
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    // Unpack the pair and repack it as {low,low}.
+    asm("{.reg .b16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  mov.b32 %0, {low,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+,
+    val.x = a.x;
+    val.y = a.x;
+)
+    return val;
+}
+// Returns a __nv_bfloat162 whose two halves both equal the high half of a.
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a)
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    // Unpack the pair and repack it as {high,high}.
+    asm("{.reg .b16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  mov.b32 %0, {high,high};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+,
+    val.x = a.y;
+    val.y = a.y;
+)
+    return val;
+}
+// Returns the high half (member y) of a __nv_bfloat162.
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a)
+{
+    __nv_bfloat16 ret;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    // Unpack the 32-bit pair into two 16-bit registers and keep the high one.
+    asm("{.reg .b16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n"
+        " mov.b16 %0, high;}" : "=h"(__BFLOAT16_TO_US(ret)) : "r"(__BFLOAT162_TO_CUI(a)));
+,
+    ret = a.y;
+)
+    return ret;
+}
+// Packs two bfloat16 values into a __nv_bfloat162: a becomes the low half (x)
+// and b the high half (y).
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  mov.b32 %0, {%1,%2};}\n"
+        : "=r"(__BFLOAT162_TO_UI(val)) : "h"(__BFLOAT16_TO_CUS(a)), "h"(__BFLOAT16_TO_CUS(b)));
+,
+    val.x = a;
+    val.y = b;
+)
+    return val;
+}
+// Duplicates one bfloat16 value into both halves of a __nv_bfloat162.
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a)
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  mov.b32 %0, {%1,%1};}\n"
+        : "=r"(__BFLOAT162_TO_UI(val)) : "h"(__BFLOAT16_TO_CUS(a)));
+,
+    val.x = a;
+    val.y = a;
+)
+    return val;
+}
+// Swaps the two halves of a __nv_bfloat162: result.x = a.y and result.y = a.x.
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a)
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    // Unpack the pair and repack it in the opposite order.
+    asm("{.reg .b16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  mov.b32 %0, {high,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+,
+    val.x = a.y;
+    val.y = a.x;
+)
+    return val;
+}
+// Reinterprets the 16 bits of a bfloat16 as a signed short (no numeric
+// conversion is performed).
+__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    const unsigned short int bits = __BFLOAT16_TO_CUS(h);
+    return static_cast<short int>(bits);
+,
+    const __nv_bfloat16_raw raw = __nv_bfloat16_raw(h);
+    return static_cast<short int>(raw.x);
+)
+}
+// Returns the 16 bits of a bfloat16 as an unsigned short (no numeric
+// conversion is performed).
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    const unsigned short int bits = __BFLOAT16_TO_CUS(h);
+    return bits;
+,
+    const __nv_bfloat16_raw raw = __nv_bfloat16_raw(h);
+    return raw.x;
+)
+}
+// Reinterprets the 16 bits of a signed short as a bfloat16 (no numeric
+// conversion is performed).
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i)
+{
+    const unsigned short int bits = static_cast<unsigned short int>(i);
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    // Device code writes the bits straight into the object storage.
+    __nv_bfloat16 result;
+    __BFLOAT16_TO_US(result) = bits;
+    return result;
+,
+    // Host code goes through the raw representation type.
+    __nv_bfloat16_raw raw;
+    raw.x = bits;
+    return __nv_bfloat16(raw);
+)
+}
+// Reinterprets the 16 bits of an unsigned short as a bfloat16 (no numeric
+// conversion is performed).
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    // Device code writes the bits straight into the object storage.
+    __nv_bfloat16 result;
+    __BFLOAT16_TO_US(result) = i;
+    return result;
+,
+    // Host code goes through the raw representation type.
+    __nv_bfloat16_raw raw;
+    raw.x = i;
+    return __nv_bfloat16(raw);
+)
+}
+
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA)
+/******************************************************************************
+*                           __nv_bfloat16, __nv_bfloat162 warp shuffle        *
+******************************************************************************/
+// Function-body helper for the __nv_bfloat162 shuffles below. It emits the PTX
+// instruction given as `name` on the 32-bit pattern of `var` with the operands
+// delta, c and mask, and returns the shuffled pair. The expansion ends with a
+// `return`, so it has to be the last statement of the enclosing function.
+#define __SHUFFLE_SYNC_BFLOAT162_MACRO(name, var, delta, c, mask) /* do */ {\
+   __nv_bfloat162 r; \
+   asm volatile ("{" __CUDA_BF16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \
+       :"=r"(__BFLOAT162_TO_UI(r)): "r"(__BFLOAT162_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \
+   return r; \
+} /* while(0) */
+
+// Warp shuffle of a __nv_bfloat162 by source lane index (shfl.sync.idx.b32).
+// The control operand c packs (WARP_SZ - width) shifted left by 8 together
+// with 0x1f in the low bits.
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned int mask, const __nv_bfloat162 var, const int srcLane, const int width)
+{
+    unsigned int warp_size;
+    // Read the warp size from the PTX WARP_SZ constant.
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
+    __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.idx.b32, var, srcLane, c, mask)
+}
+// Warp shuffle of a __nv_bfloat162 from a lane `delta` below the caller
+// (shfl.sync.up.b32). Unlike the other shuffles here, the control operand c
+// is only (WARP_SZ - width) shifted left by 8, with no 0x1f in the low bits.
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_up_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width)
+{
+    unsigned int warp_size;
+    // Read the warp size from the PTX WARP_SZ constant.
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U;
+    __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.up.b32, var, delta, c, mask)
+}
+// Warp shuffle of a __nv_bfloat162 from a lane `delta` above the caller
+// (shfl.sync.down.b32). The control operand c packs (WARP_SZ - width) shifted
+// left by 8 together with 0x1f in the low bits.
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width)
+{
+    unsigned int warp_size;
+    // Read the warp size from the PTX WARP_SZ constant.
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
+    __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.down.b32, var, delta, c, mask)
+}
+// Butterfly warp shuffle of a __nv_bfloat162 (shfl.sync.bfly.b32) using
+// laneMask. The control operand c packs (WARP_SZ - width) shifted left by 8
+// together with 0x1f in the low bits.
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat162 var, const int laneMask, const int width)
+{
+    unsigned int warp_size;
+    // Read the warp size from the PTX WARP_SZ constant.
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
+    __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.bfly.b32, var, laneMask, c, mask)
+}
+
+#undef __SHUFFLE_SYNC_BFLOAT162_MACRO
+
+// Scalar bfloat16 overload of __shfl_sync: the value is duplicated into a
+// __nv_bfloat162, shuffled with the packed overload, and the low half of the
+// shuffled pair is returned.
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned int mask, const __nv_bfloat16 var, const int srcLane, const int width)
+{
+    const __nv_bfloat162 packed = __halves2bfloat162(var, var);
+    return __low2bfloat16(__shfl_sync(mask, packed, srcLane, width));
+}
+// Scalar bfloat16 overload of __shfl_up_sync: the value is duplicated into a
+// __nv_bfloat162, shuffled with the packed overload, and the low half of the
+// shuffled pair is returned.
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width)
+{
+    const __nv_bfloat162 packed = __halves2bfloat162(var, var);
+    return __low2bfloat16(__shfl_up_sync(mask, packed, delta, width));
+}
+// Scalar bfloat16 overload of __shfl_down_sync: the value is duplicated into
+// a __nv_bfloat162, shuffled with the packed overload, and the low half of
+// the shuffled pair is returned.
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width)
+{
+    const __nv_bfloat162 packed = __halves2bfloat162(var, var);
+    return __low2bfloat16(__shfl_down_sync(mask, packed, delta, width));
+}
+// Scalar bfloat16 overload of __shfl_xor_sync: the value is duplicated into a
+// __nv_bfloat162, shuffled with the packed overload, and the low half of the
+// shuffled pair is returned.
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat16 var, const int laneMask, const int width)
+{
+    const __nv_bfloat162 packed = __halves2bfloat162(var, var);
+    return __low2bfloat16(__shfl_xor_sync(mask, packed, laneMask, width));
+}
+
+/******************************************************************************
+*               __nv_bfloat16 and __nv_bfloat162 __ldg,__ldcg,__ldca,__ldcs   *
+******************************************************************************/
+
+// Inline-asm constraint letter used for pointer operands in the loads and
+// stores below: "l" (64-bit register) for 64-bit MSVC, LP64 and NVRTC builds,
+// "r" (32-bit register) otherwise.
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __LDG_PTR   "l"
+#else
+#define __LDG_PTR   "r"
+#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
+// Loads a __nv_bfloat162 (32 bits) from global memory with ld.global.nc
+// (the PTX non-coherent load path).
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.nc.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+// Loads a __nv_bfloat16 (16 bits) from global memory with ld.global.nc
+// (the PTX non-coherent load path).
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.nc.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+// Loads a __nv_bfloat162 (32 bits) from global memory with the .cg cache
+// operator (ld.global.cg).
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.cg.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+// Loads a __nv_bfloat16 (16 bits) from global memory with the .cg cache
+// operator (ld.global.cg).
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.cg.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+// Loads a __nv_bfloat162 (32 bits) from global memory with the .ca cache
+// operator (ld.global.ca).
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.ca.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+// Loads a __nv_bfloat16 (16 bits) from global memory with the .ca cache
+// operator (ld.global.ca).
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.ca.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+// Loads a __nv_bfloat162 (32 bits) from global memory with the .cs cache
+// operator (ld.global.cs).
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.cs.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+// Loads a __nv_bfloat16 (16 bits) from global memory with the .cs cache
+// operator (ld.global.cs).
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.cs.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+// Loads a __nv_bfloat162 (32 bits) from global memory with the .lu cache
+// operator (ld.global.lu). The asm statement declares a "memory" clobber.
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.lu.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+// Loads a __nv_bfloat16 (16 bits) from global memory with the .lu cache
+// operator (ld.global.lu). The asm statement declares a "memory" clobber.
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.lu.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+// Loads a __nv_bfloat162 (32 bits) from global memory with the .cv cache
+// operator (ld.global.cv). The asm statement declares a "memory" clobber.
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.cv.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+// Loads a __nv_bfloat16 (16 bits) from global memory with the .cv cache
+// operator (ld.global.cv). The asm statement declares a "memory" clobber.
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.cv.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+
+// Stores a __nv_bfloat162 (32 bits) to global memory with the .wb cache
+// operator (st.global.wb).
+__CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value)
+{
+    asm ("st.global.wb.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory");
+}
+// Stores a __nv_bfloat16 (16 bits) to global memory with the .wb cache
+// operator (st.global.wb).
+__CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value)
+{
+    asm ("st.global.wb.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__BFLOAT16_TO_CUS(value)) : "memory");
+}
+// Stores a __nv_bfloat162 (32 bits) to global memory with the .cg cache
+// operator (st.global.cg).
+__CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value)
+{
+    asm ("st.global.cg.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory");
+}
+// Stores a __nv_bfloat16 (16 bits) to global memory with the .cg cache
+// operator (st.global.cg).
+__CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value)
+{
+    asm ("st.global.cg.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__BFLOAT16_TO_CUS(value)) : "memory");
+}
+// Stores a __nv_bfloat162 (32 bits) to global memory with the .cs cache
+// operator (st.global.cs).
+__CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value)
+{
+    asm ("st.global.cs.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory");
+}
+// Stores a __nv_bfloat16 (16 bits) to global memory with the .cs cache
+// operator (st.global.cs).
+__CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value)
+{
+    asm ("st.global.cs.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__BFLOAT16_TO_CUS(value)) : "memory");
+}
+// Stores a __nv_bfloat162 (32 bits) to global memory with the .wt cache
+// operator (st.global.wt).
+__CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value)
+{
+    asm ("st.global.wt.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory");
+}
+// Stores a __nv_bfloat16 (16 bits) to global memory with the .wt cache
+// operator (st.global.wt).
+__CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value)
+{
+    asm ("st.global.wt.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__BFLOAT16_TO_CUS(value)) : "memory");
+}
+
+#undef __LDG_PTR
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA) */
+/******************************************************************************
+*                             __nv_bfloat162 comparison                       *
+******************************************************************************/
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+// Function body shared by the packed device comparisons below; it expects the
+// enclosing function to have __nv_bfloat162 parameters named a and b.
+// When NV_PROVIDES_SM_90 holds, one `name`.bf16x2.bf16x2 instruction compares
+// both halves. Otherwise each half is widened to f32 (the high half by masking
+// with 0xffff0000, the low half by shifting left 16), compared with
+// `name`.f32.f32, and the two results are merged: the high result stays in
+// the upper 16 bits and the low result is shifted right by 16.
+// The expansion ends with `return val;`.
+#define __COMPARISON_OP_BFLOAT162_MACRO(name) {\
+   __nv_bfloat162 val; \
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,\
+   asm( "{ " __CUDA_BF16_STRINGIFY(name) ".bf16x2.bf16x2 %0,%1,%2;\n}" \
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+,\
+   asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\
+        "  and.b32 high_a, %1, 0xffff0000U;\n"\
+        "  and.b32 high_b, %2, 0xffff0000U;\n"\
+        "  shl.b32 low_a, %1, 16;\n"\
+        "  shl.b32 low_b, %2, 16;\n"\
+        "  " __CUDA_BF16_STRINGIFY(name) ".f32.f32 low_res, low_a, low_b;\n"\
+        "  " __CUDA_BF16_STRINGIFY(name) ".f32.f32 high_res, high_a, high_b;\n"\
+        "  shr.u32 low_res, low_res, 16;\n"\
+        "  or.b32  %0, high_res, low_res;}\n"\
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+)\
+   return val; \
+}
+// Device-only packed "equal" comparison: expands the shared comparison body
+// with the PTX set.eq operation.
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_heq2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.eq)
+}
+// Device-only packed "not equal" comparison: expands the shared comparison
+// body with the PTX set.ne operation.
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hne2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.ne)
+}
+// Device-only packed "less or equal" comparison: expands the shared
+// comparison body with the PTX set.le operation.
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hle2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.le)
+}
+// Device-only packed "greater or equal" comparison: expands the shared
+// comparison body with the PTX set.ge operation.
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hge2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.ge)
+}
+// Device-only packed "less than" comparison: expands the shared comparison
+// body with the PTX set.lt operation.
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.lt)
+}
+// Device-only packed "greater than" comparison: expands the shared comparison
+// body with the PTX set.gt operation.
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.gt)
+}
+// Device-only packed "equal" comparison in its unordered PTX form: expands
+// the shared comparison body with set.equ.
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.equ)
+}
+// Device-only packed "not equal" comparison in its unordered PTX form:
+// expands the shared comparison body with set.neu.
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.neu)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Device helper: unordered a <= b on both bf16 lanes (PTX set.leu; NaN operands presumably compare true, as in the host path of __hleu).
+    __COMPARISON_OP_BFLOAT162_MACRO(set.leu) // the expansion ends in "return val;", hence no return statement here
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Device helper: unordered a >= b on both bf16 lanes (PTX set.geu; NaN operands presumably compare true, as in the host path of __hgeu).
+    __COMPARISON_OP_BFLOAT162_MACRO(set.geu) // the expansion ends in "return val;", hence no return statement here
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Device helper: unordered a < b on both bf16 lanes (PTX set.ltu; NaN operands presumably compare true, as in the host path of __hltu).
+    __COMPARISON_OP_BFLOAT162_MACRO(set.ltu) // the expansion ends in "return val;", hence no return statement here
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Device helper: unordered a > b on both bf16 lanes (PTX set.gtu; NaN operands presumably compare true, as in the host path of __hgtu).
+    __COMPARISON_OP_BFLOAT162_MACRO(set.gtu) // the expansion ends in "return val;", hence no return statement here
+}
+#undef __COMPARISON_OP_BFLOAT162_MACRO
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise ordered a == b; each result lane is bf16 1.0 (0x3F80) when true and 0 when false.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_heq2(a, b); // device: packed PTX compare
+,
+    __nv_bfloat162_raw val; // host: build the raw bit pattern one lane at a time
+    val.x = __heq(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    val.y = __heq(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    return __nv_bfloat162(val);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise ordered a != b; each result lane is bf16 1.0 (0x3F80) when true and 0 when false.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hne2(a, b); // device: packed PTX compare
+,
+    __nv_bfloat162_raw val; // host: build the raw bit pattern one lane at a time
+    val.x = __hne(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    val.y = __hne(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    return __nv_bfloat162(val);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise ordered a <= b; each result lane is bf16 1.0 (0x3F80) when true and 0 when false.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hle2(a, b); // device: packed PTX compare
+,
+    __nv_bfloat162_raw val; // host: build the raw bit pattern one lane at a time
+    val.x = __hle(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    val.y = __hle(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    return __nv_bfloat162(val);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise ordered a >= b; each result lane is bf16 1.0 (0x3F80) when true and 0 when false.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hge2(a, b); // device: packed PTX compare
+,
+    __nv_bfloat162_raw val; // host: build the raw bit pattern one lane at a time
+    val.x = __hge(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    val.y = __hge(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    return __nv_bfloat162(val);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise ordered a < b; each result lane is bf16 1.0 (0x3F80) when true and 0 when false.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hlt2(a, b); // device: packed PTX compare
+,
+    __nv_bfloat162_raw val; // host: build the raw bit pattern one lane at a time
+    val.x = __hlt(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    val.y = __hlt(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    return __nv_bfloat162(val);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise ordered a > b; each result lane is bf16 1.0 (0x3F80) when true and 0 when false.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hgt2(a, b); // device: packed PTX compare
+,
+    __nv_bfloat162_raw val; // host: build the raw bit pattern one lane at a time
+    val.x = __hgt(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    val.y = __hgt(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    return __nv_bfloat162(val);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise unordered a == b; each result lane is bf16 1.0 (0x3F80) when true and 0 when false.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hequ2(a, b); // device: packed PTX compare
+,
+    __nv_bfloat162_raw val; // host: build the raw bit pattern one lane at a time
+    val.x = __hequ(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    val.y = __hequ(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    return __nv_bfloat162(val);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise unordered a != b; each result lane is bf16 1.0 (0x3F80) when true and 0 when false.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hneu2(a, b); // device: packed PTX compare
+,
+    __nv_bfloat162_raw val; // host: build the raw bit pattern one lane at a time
+    val.x = __hneu(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    val.y = __hneu(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    return __nv_bfloat162(val);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise unordered a <= b; each result lane is bf16 1.0 (0x3F80) when true and 0 when false.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hleu2(a, b); // device: packed PTX compare
+,
+    __nv_bfloat162_raw val; // host: build the raw bit pattern one lane at a time
+    val.x = __hleu(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    val.y = __hleu(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    return __nv_bfloat162(val);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise unordered a >= b; each result lane is bf16 1.0 (0x3F80) when true and 0 when false.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hgeu2(a, b); // device: packed PTX compare
+,
+    __nv_bfloat162_raw val; // host: build the raw bit pattern one lane at a time
+    val.x = __hgeu(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    val.y = __hgeu(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    return __nv_bfloat162(val);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise unordered a < b; each result lane is bf16 1.0 (0x3F80) when true and 0 when false.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hltu2(a, b); // device: packed PTX compare
+,
+    __nv_bfloat162_raw val; // host: build the raw bit pattern one lane at a time
+    val.x = __hltu(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    val.y = __hltu(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    return __nv_bfloat162(val);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise unordered a > b; each result lane is bf16 1.0 (0x3F80) when true and 0 when false.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hgtu2(a, b); // device: packed PTX compare
+,
+    __nv_bfloat162_raw val; // host: build the raw bit pattern one lane at a time
+    val.x = __hgtu(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    val.y = __hgtu(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    return __nv_bfloat162(val);
+)
+}
+
+/******************************************************************************
+*                __nv_bfloat162 comparison with mask output                   *
+******************************************************************************/
+#define __COMPARISON_OP_BFLOAT162_MACRO_MASK(name) {/* Statement block: packed bf16x2 compare `name` (e.g. set.eq) producing a 32-bit mask; reads the enclosing function's a and b. */\
+   unsigned val; \
+   asm( "{ " __CUDA_BF16_STRINGIFY(name) ".u32.bf16x2 %0,%1,%2;\n}" \
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val; /* returns from the function that expands this macro */\
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __heq2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise ordered a == b as a bit mask; the fallback path yields 0xFFFF per true lane and 0 per false lane.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.eq) // SM 9.0+: one packed PTX set with u32 output; the macro returns
+,
+    const unsigned short px = __heq(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __heq(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // fallback: y mask goes to bits 31..16, x mask to bits 15..0
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hne2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise ordered a != b as a bit mask; the fallback path yields 0xFFFF per true lane and 0 per false lane.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.ne) // SM 9.0+: one packed PTX set with u32 output; the macro returns
+,
+    const unsigned short px = __hne(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __hne(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // fallback: y mask goes to bits 31..16, x mask to bits 15..0
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hle2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise ordered a <= b as a bit mask; the fallback path yields 0xFFFF per true lane and 0 per false lane.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.le) // SM 9.0+: one packed PTX set with u32 output; the macro returns
+,
+    const unsigned short px = __hle(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __hle(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // fallback: y mask goes to bits 31..16, x mask to bits 15..0
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hge2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise ordered a >= b as a bit mask; the fallback path yields 0xFFFF per true lane and 0 per false lane.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.ge) // SM 9.0+: one packed PTX set with u32 output; the macro returns
+,
+    const unsigned short px = __hge(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __hge(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // fallback: y mask goes to bits 31..16, x mask to bits 15..0
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hlt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise ordered a < b as a bit mask; the fallback path yields 0xFFFF per true lane and 0 per false lane.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.lt) // SM 9.0+: one packed PTX set with u32 output; the macro returns
+,
+    const unsigned short px = __hlt(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __hlt(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // fallback: y mask goes to bits 31..16, x mask to bits 15..0
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise ordered a > b as a bit mask; the fallback path yields 0xFFFF per true lane and 0 per false lane.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.gt) // SM 9.0+: one packed PTX set with u32 output; the macro returns
+,
+    const unsigned short px = __hgt(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __hgt(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // fallback: y mask goes to bits 31..16, x mask to bits 15..0
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hequ2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise unordered a == b as a bit mask; the fallback path yields 0xFFFF per true lane and 0 per false lane.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.equ) // SM 9.0+: one packed PTX set with u32 output; the macro returns
+,
+    const unsigned short px = __hequ(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __hequ(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // fallback: y mask goes to bits 31..16, x mask to bits 15..0
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hneu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise unordered a != b as a bit mask; the fallback path yields 0xFFFF per true lane and 0 per false lane.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.neu) // SM 9.0+: one packed PTX set with u32 output; the macro returns
+,
+    const unsigned short px = __hneu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __hneu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // fallback: y mask goes to bits 31..16, x mask to bits 15..0
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hleu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise unordered a <= b as a bit mask; the fallback path yields 0xFFFF per true lane and 0 per false lane.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.leu) // SM 9.0+: one packed PTX set with u32 output; the macro returns
+,
+    const unsigned short px = __hleu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __hleu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // fallback: y mask goes to bits 31..16, x mask to bits 15..0
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgeu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise unordered a >= b as a bit mask; the fallback path yields 0xFFFF per true lane and 0 per false lane.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.geu) // SM 9.0+: one packed PTX set with u32 output; the macro returns
+,
+    const unsigned short px = __hgeu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __hgeu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // fallback: y mask goes to bits 31..16, x mask to bits 15..0
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hltu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise unordered a < b as a bit mask; the fallback path yields 0xFFFF per true lane and 0 per false lane.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.ltu) // SM 9.0+: one packed PTX set with u32 output; the macro returns
+,
+    const unsigned short px = __hltu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __hltu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // fallback: y mask goes to bits 31..16, x mask to bits 15..0
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgtu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise unordered a > b as a bit mask; the fallback path yields 0xFFFF per true lane and 0 per false lane.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.gtu) // SM 9.0+: one packed PTX set with u32 output; the macro returns
+,
+    const unsigned short px = __hgtu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __hgtu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // fallback: y mask goes to bits 31..16, x mask to bits 15..0
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+#undef __COMPARISON_OP_BFLOAT162_MACRO_MASK
+
+#define __BOOL_COMPARISON_OP_BFLOAT162_MACRO(name) {/* Statement block: packed bf16x2 compare `name`, reduced to one bool; reads the enclosing function's a and b. */\
+   unsigned int val; \
+   bool retval; \
+   asm( "{ " __CUDA_BF16_STRINGIFY(name) ".bf16x2.bf16x2 %0,%1,%2;\n}" \
+        :"=r"(val) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   if (val == 0x3F803F80U) {/* 0x3F80 in both 16-bit halves, i.e. both lanes reported true */\
+      retval = true; \
+   } else { \
+      retval = false; \
+   }\
+   return retval;\
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // True only when the ordered a == b comparison holds for both lanes.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.eq) // SM 9.0+: packed compare, then test that both lanes are set; the macro returns
+,
+    return (__heq(a.x, b.x) && __heq(a.y, b.y)); // fallback: two scalar compares
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // True only when the ordered a != b comparison holds for both lanes.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ne) // SM 9.0+: packed compare, then test that both lanes are set; the macro returns
+,
+    return (__hne(a.x, b.x) && __hne(a.y, b.y)); // fallback: two scalar compares
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hble2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // True only when the ordered a <= b comparison holds for both lanes.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.le) // SM 9.0+: packed compare, then test that both lanes are set; the macro returns
+,
+    return (__hle(a.x, b.x) && __hle(a.y, b.y)); // fallback: two scalar compares
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // True only when the ordered a >= b comparison holds for both lanes.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ge) // SM 9.0+: packed compare, then test that both lanes are set; the macro returns
+,
+    return (__hge(a.x, b.x) && __hge(a.y, b.y)); // fallback: two scalar compares
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // True only when the ordered a < b comparison holds for both lanes.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.lt) // SM 9.0+: packed compare, then test that both lanes are set; the macro returns
+,
+    return (__hlt(a.x, b.x) && __hlt(a.y, b.y)); // fallback: two scalar compares
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // True only when the ordered a > b comparison holds for both lanes.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.gt) // SM 9.0+: packed compare, then test that both lanes are set; the macro returns
+,
+    return (__hgt(a.x, b.x) && __hgt(a.y, b.y)); // fallback: two scalar compares
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // True only when the unordered a == b comparison holds for both lanes.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.equ) // SM 9.0+: packed compare, then test that both lanes are set; the macro returns
+,
+    return (__hequ(a.x, b.x) && __hequ(a.y, b.y)); // fallback: two scalar compares
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // True only when the unordered a != b comparison holds for both lanes.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.neu) // SM 9.0+: packed compare, then test that both lanes are set; the macro returns
+,
+    return (__hneu(a.x, b.x) && __hneu(a.y, b.y)); // fallback: two scalar compares
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // True only when the unordered a <= b comparison holds for both lanes.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.leu) // SM 9.0+: packed compare, then test that both lanes are set; the macro returns
+,
+    return (__hleu(a.x, b.x) && __hleu(a.y, b.y)); // fallback: two scalar compares
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // True only when the unordered a >= b comparison holds for both lanes.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.geu) // SM 9.0+: packed compare, then test that both lanes are set; the macro returns
+,
+    return (__hgeu(a.x, b.x) && __hgeu(a.y, b.y)); // fallback: two scalar compares
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // True only when the unordered a < b comparison holds for both lanes.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ltu) // SM 9.0+: packed compare, then test that both lanes are set; the macro returns
+,
+    return (__hltu(a.x, b.x) && __hltu(a.y, b.y)); // fallback: two scalar compares
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // True only when the unordered a > b comparison holds for both lanes.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.gtu) // SM 9.0+: packed compare, then test that both lanes are set; the macro returns
+,
+    return (__hgtu(a.x, b.x) && __hgtu(a.y, b.y)); // fallback: two scalar compares
+)
+}
+#undef __BOOL_COMPARISON_OP_BFLOAT162_MACRO
+/******************************************************************************
+*                             __nv_bfloat16 comparison                        *
+******************************************************************************/
+#define __COMPARISON_OP_BFLOAT16_MACRO(name) {/* Statement block: scalar bf16 compare; `name` is the PTX comparison operator (eq, ltu, ...); reads the enclosing function's a and b. */\
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,\
+   unsigned short val; /* SM 9.0+: setp on bf16 operands, predicate materialised as 1 or 0 */\
+   asm( "{ .reg .pred __$temp3;\n" \
+        "  setp." __CUDA_BF16_STRINGIFY(name) ".bf16  __$temp3, %1, %2;\n" \
+        "  selp.u16 %0, 1, 0, __$temp3;}" \
+        : "=h"(val) : "h"(__BFLOAT16_TO_CUS(a)), "h"(__BFLOAT16_TO_CUS(b))); \
+   return (val != 0U) ? true : false; \
+,\
+   unsigned int val; /* otherwise: put each bf16 in the upper half of a 32-bit register and compare as f32 */\
+   asm( "{.reg .b32 a,b;\n"\
+        "  mov.b32 a, {0, %1};\n"\
+        "  mov.b32 b, {0, %2};\n"\
+        "  set." __CUDA_BF16_STRINGIFY(name) ".f32.f32 %0, a, b;}\n"\
+        :"=r"(val) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \
+   return (val != 0U) ? true : false; /* any non-zero set result means the comparison held */\
+)\
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Ordered a == b.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(eq) // device: PTX compare; the macro expansion returns the bool
+,
+    const float fa = __bfloat162float(a); // host: compare after widening to float
+    const float fb = __bfloat162float(b);
+    return (fa == fb);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Ordered a != b: false whenever either operand is NaN.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(ne) // device: PTX compare; the macro expansion returns the bool
+,
+    const float fa = __bfloat162float(a); // host: compare after widening to float
+    const float fb = __bfloat162float(b);
+    return (fa != fb) && (!__hisnan(a)) && (!__hisnan(b)); // float != alone is true for NaN, so NaN is excluded explicitly
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Ordered a <= b.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(le) // device: PTX compare; the macro expansion returns the bool
+,
+    const float fa = __bfloat162float(a); // host: compare after widening to float
+    const float fb = __bfloat162float(b);
+    return (fa <= fb);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Ordered a >= b.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(ge) // device: PTX compare; the macro expansion returns the bool
+,
+    const float fa = __bfloat162float(a); // host: compare after widening to float
+    const float fb = __bfloat162float(b);
+    return (fa >= fb);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Ordered a < b.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(lt) // device: PTX compare; the macro expansion returns the bool
+,
+    const float fa = __bfloat162float(a); // host: compare after widening to float
+    const float fb = __bfloat162float(b);
+    return (fa < fb);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Ordered a > b.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(gt) // device: PTX compare; the macro expansion returns the bool
+,
+    const float fa = __bfloat162float(a); // host: compare after widening to float
+    const float fb = __bfloat162float(b);
+    return (fa > fb);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Unordered a == b: also true when either operand is NaN.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(equ) // device: PTX compare; the macro expansion returns the bool
+,
+    const float fa = __bfloat162float(a); // host: compare after widening to float
+    const float fb = __bfloat162float(b);
+    return (fa == fb) || (__hisnan(a)) || (__hisnan(b)); // NaN operands make the unordered compare true
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Unordered a != b: also true when either operand is NaN.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(neu) // device: PTX compare; the macro expansion returns the bool
+,
+    const float fa = __bfloat162float(a); // host: compare after widening to float
+    const float fb = __bfloat162float(b);
+    return (fa != fb); // float != is already true for NaN operands, so no extra NaN test is needed
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Unordered a <= b: also true when either operand is NaN.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(leu) // device: PTX compare; the macro expansion returns the bool
+,
+    const float fa = __bfloat162float(a); // host: compare after widening to float
+    const float fb = __bfloat162float(b);
+    return (fa <= fb) || (__hisnan(a)) || (__hisnan(b)); // NaN operands make the unordered compare true
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Unordered a >= b: also true when either operand is NaN.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(geu) // device: PTX compare; the macro expansion returns the bool
+,
+    const float fa = __bfloat162float(a); // host: compare after widening to float
+    const float fb = __bfloat162float(b);
+    return (fa >= fb) || (__hisnan(a)) || (__hisnan(b)); // NaN operands make the unordered compare true
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Unordered a < b: also true when either operand is NaN.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(ltu) // device: PTX compare; the macro expansion returns the bool
+,
+    const float fa = __bfloat162float(a); // host: compare after widening to float
+    const float fb = __bfloat162float(b);
+    return (fa < fb) || (__hisnan(a)) || (__hisnan(b)); // NaN operands make the unordered compare true
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Unordered a > b: also true when either operand is NaN.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(gtu) // device: PTX compare; the macro expansion returns the bool
+,
+    const float fa = __bfloat162float(a); // host: compare after widening to float
+    const float fb = __bfloat162float(b);
+    return (fa > fb) || (__hisnan(a)) || (__hisnan(b)); // NaN operands make the unordered compare true
+)
+}
+#undef __COMPARISON_OP_BFLOAT16_MACRO
+/******************************************************************************
+*                            __nv_bfloat162 arithmetic                        *
+******************************************************************************/
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Device helper: packed a + b on both bf16 lanes.
+   __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm( "{ add.bf16x2 %0,%1,%2; }\n" // SM 9.0+: native packed add
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+,
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0x3f803f80U;\n" // c = bf16 1.0 in both lanes
+        "  fma.rn.bf16x2 %0,%1,c,%2;}\n" // otherwise: a * 1.0 + b through the packed fma
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+)
+   return val;
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Device helper: packed a - b on both bf16 lanes.
+   __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm( "{ sub.bf16x2 %0,%1,%2; }\n" // SM 9.0+: native packed subtract
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+,
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0xbf80bf80U;\n" // c = bf16 -1.0 in both lanes
+        "  fma.rn.bf16x2 %0,%2,c,%1;}\n" // otherwise: b * -1.0 + a through the packed fma
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+)
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Device helper: packed a * b on both bf16 lanes.
+   __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm( "{ mul.bf16x2 %0,%1,%2; }\n" // SM 9.0+: native packed multiply
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+,
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0x80008000U;\n" // c = bf16 -0.0 in both lanes, so a zero product keeps its own sign
+        "  fma.rn.bf16x2 %0,%1,%2,c;}\n" // otherwise: a * b + (-0.0) through the packed fma
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+)
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Device helper: packed a + b; differs from __internal_device_hadd2 only by the explicit .rn on the SM 9.0 instruction.
+   __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm( "{ add.rn.bf16x2 %0,%1,%2; }\n" // NOTE(review): .rn presumably blocks contraction into an fma - confirm against the PTX ISA
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+,
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0x3f803f80U;\n" // c = bf16 1.0 in both lanes
+        "  fma.rn.bf16x2 %0,%1,c,%2;}\n" // otherwise: a * 1.0 + b through the packed fma
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+)
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Device helper: packed a - b; differs from __internal_device_hsub2 only by the explicit .rn on the SM 9.0 instruction.
+   __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm( "{ sub.rn.bf16x2 %0,%1,%2; }\n" // NOTE(review): .rn presumably blocks contraction into an fma - confirm against the PTX ISA
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+,
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0xbf80bf80U;\n" // c = bf16 -1.0 in both lanes
+        "  fma.rn.bf16x2 %0,%2,c,%1;}\n" // otherwise: b * -1.0 + a through the packed fma
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+)
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Device helper: packed a * b; differs from __internal_device_hmul2 only by the explicit .rn on the SM 9.0 instruction.
+   __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm( "{ mul.rn.bf16x2 %0,%1,%2; }\n" // NOTE(review): .rn presumably blocks contraction into an fma - confirm against the PTX ISA
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+,
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0x80008000U;\n" // c = bf16 -0.0 in both lanes, so a zero product keeps its own sign
+        "  fma.rn.bf16x2 %0,%1,%2,c;}\n" // otherwise: a * b + (-0.0) through the packed fma
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+)
+   return val;
+}
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise a + b.
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    val = __internal_device_hadd2(a, b); // SM 8.0+: packed device implementation
+,
+    val.x = __hadd(a.x, b.x); // otherwise: scalar operation on each lane
+    val.y = __hadd(a.y, b.y);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise a - b.
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    val = __internal_device_hsub2(a, b); // SM 8.0+: packed device implementation
+,
+    val.x = __hsub(a.x, b.x); // otherwise: scalar operation on each lane
+    val.y = __hsub(a.y, b.y);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise a * b.
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    val = __internal_device_hmul2(a, b); // SM 8.0+: packed device implementation
+,
+    val.x = __hmul(a.x, b.x); // otherwise: scalar operation on each lane
+    val.y = __hmul(a.y, b.y);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise a + b using the _rn variants of the helpers.
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    val = __internal_device_hadd2_rn(a, b); // SM 8.0+: packed device implementation
+,
+    val.x = __hadd_rn(a.x, b.x); // otherwise: scalar operation on each lane
+    val.y = __hadd_rn(a.y, b.y);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise a - b using the _rn variants of the helpers.
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    val = __internal_device_hsub2_rn(a, b); // SM 8.0+: packed device implementation
+,
+    val.x = __hsub_rn(a.x, b.x); // otherwise: scalar operation on each lane
+    val.y = __hsub_rn(a.y, b.y);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise a * b using the _rn variants of the helpers.
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    val = __internal_device_hmul2_rn(a, b); // SM 8.0+: packed device implementation
+,
+    val.x = __hmul_rn(a.x, b.x); // otherwise: scalar operation on each lane
+    val.y = __hmul_rn(a.y, b.y);
+)
+    return val;
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise a + b, clamped by the SM 8.0+ path to the range [0.0, 1.0].
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    asm( "{.reg .b32 f, one, zero;\n"
+        "  mov.b32 one, 0x3f803f80U;\n" // bf16 1.0 in both lanes
+        "  mov.b32 zero, 0;\n"
+        "  fma.rn.bf16x2 f,%1,one,%2;\n" // f = a * 1.0 + b
+        "  max.bf16x2 f, f, zero;\n" // clamp below at 0.0
+        "  min.bf16x2 %0, f, one;\n}" // clamp above at 1.0
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+,
+    val.x = __hadd_sat(a.x, b.x); // otherwise: scalar saturating operation on each lane
+    val.y = __hadd_sat(a.y, b.y);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise a - b, clamped by the SM 8.0+ path to the range [0.0, 1.0].
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    asm( "{.reg .b32 f, one, zero, mone;\n"
+        "  mov.b32 one, 0x3f803f80U;\n" // bf16 1.0 in both lanes
+        "  mov.b32 zero, 0;\n"
+        "  mov.b32 mone, 0xbf80bf80U;\n" // bf16 -1.0 in both lanes
+        "  fma.rn.bf16x2 f,%2,mone,%1;\n" // f = b * -1.0 + a
+        "  max.bf16x2 f, f, zero;\n" // clamp below at 0.0
+        "  min.bf16x2 %0, f, one;\n}" // clamp above at 1.0
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+,
+    val.x = __hsub_sat(a.x, b.x); // otherwise: scalar saturating operation on each lane
+    val.y = __hsub_sat(a.y, b.y);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Lane-wise a * b, clamped by the SM 8.0+ path to the range [0.0, 1.0].
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    asm( "{.reg .b32 f, one, zero, mzero;\n"
+        "  mov.b32 one, 0x3f803f80U;\n" // bf16 1.0 in both lanes
+        "  mov.b32 zero, 0;\n"
+        "  mov.b32 mzero, 0x80008000U;\n" // bf16 -0.0 in both lanes
+        "  fma.rn.bf16x2 f,%1,%2,mzero;\n" // f = a * b + (-0.0)
+        "  max.bf16x2 f, f, zero;\n" // clamp below at 0.0
+        "  min.bf16x2 %0, f, one;\n}" // clamp above at 1.0
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+,
+    val.x = __hmul_sat(a.x, b.x); // otherwise: scalar saturating operation on each lane
+    val.y = __hmul_sat(a.y, b.y);
+)
+    return val;
+}
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c)
+{ // Lane-wise fused multiply-add a * b + c as a single packed PTX fma.rn.
+    __nv_bfloat162 val;
+    asm( "{fma.rn.bf16x2 %0,%1,%2,%3;\n}"
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c)
+{ // Lane-wise fused multiply-add a * b + c, then clamped to the range [0.0, 1.0].
+    __nv_bfloat162 val;
+    asm( "{ .reg .b32 f, one, zero;\n"
+         "  mov.b32 one, 0x3f803f80U;\n" // bf16 1.0 in both lanes
+         "  mov.b32 zero, 0;\n"
+         "  fma.rn.bf16x2 f, %1, %2, %3;\n"
+         "  max.bf16x2 f, f, zero;\n" // clamp below at 0.0
+         "  min.bf16x2 %0, f, one;\n}" // clamp above at 1.0
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c)));
+    return val;
+}
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b) { // Lane-wise a / b, computed as two scalar __hdiv calls.
+    __nv_bfloat16 ha, hb; // scratch operands, reused for the low and then the high lane
+
+    ha = __low2bfloat16(a);
+    hb = __low2bfloat16(b);
+
+    const __nv_bfloat16 v1 = __hdiv(ha, hb); // quotient of the low halves
+
+    ha = __high2bfloat16(a);
+    hb = __high2bfloat16(b);
+
+    const __nv_bfloat16 v2 = __hdiv(ha, hb); // quotient of the high halves
+
+    return __halves2bfloat162(v1, v2); // repack: low-half quotient first, high-half quotient second
+}
+/******************************************************************************
+*                             __nv_bfloat16 arithmetic                        *
+******************************************************************************/
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hadd(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Device helper: scalar bf16 a + b using bf16 PTX instructions.
+    __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm( "{ add.bf16 %0,%1,%2; }\n" // SM 9.0+: native add
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+,
+    asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0x3f80U;\n" // c = bf16 1.0
+        "  fma.rn.bf16 %0,%1,c,%2;}\n" // otherwise: a * 1.0 + b through the bf16 fma
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+)
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hsub(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Device helper: scalar bf16 a - b using bf16 PTX instructions.
+   __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm( "{ sub.bf16 %0,%1,%2; }\n" // SM 9.0+: native subtract
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+,
+   asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0xbf80U;\n" // c = bf16 -1.0
+        "  fma.rn.bf16 %0,%2,c,%1;}\n" // otherwise: b * -1.0 + a through the bf16 fma
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+)
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hmul(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Device helper: scalar bf16 a * b using bf16 PTX instructions.
+   __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm( "{ mul.bf16 %0,%1,%2; }\n" // SM 9.0+: native multiply
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+,
+   asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0x8000U;\n" // c = bf16 -0.0, so a zero product keeps its own sign
+        "  fma.rn.bf16 %0,%1,%2,c;}\n" // otherwise: a * b + (-0.0) through the bf16 fma
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+)
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Device helper: scalar bf16 a + b; differs from __internal_sm80_device_hadd only by the explicit .rn on the SM 9.0 instruction.
+   __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm( "{ add.rn.bf16 %0,%1,%2; }\n" // NOTE(review): .rn presumably blocks contraction into an fma - confirm against the PTX ISA
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+,
+   asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0x3f80U;\n" // c = bf16 1.0
+        "  fma.rn.bf16 %0,%1,c,%2;}\n" // otherwise: a * 1.0 + b through the bf16 fma
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+)
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Device helper: scalar bf16 a - b; differs from __internal_sm80_device_hsub only by the explicit .rn on the SM 9.0 instruction.
+   __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm( "{ sub.rn.bf16 %0,%1,%2; }\n" // NOTE(review): .rn presumably blocks contraction into an fma - confirm against the PTX ISA
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+,
+   asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0xbf80U;\n" // c = bf16 -1.0
+        "  fma.rn.bf16 %0,%2,c,%1;}\n" // otherwise: b * -1.0 + a through the bf16 fma
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+)
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Device helper: returns a * b as a bf16 value using the explicit .rn instruction form.
+   __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // first branch: direct mul.rn.bf16 instruction
+   asm( "{ mul.rn.bf16 %0,%1,%2; }\n"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+, // second branch: express the product as a fused multiply-add
+   asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0x8000U;\n" // c = 0x8000, the bf16 bit pattern of -0.0
+        "  fma.rn.bf16 %0,%1,%2,c;}\n" // val = a * b + c
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+)
+   return val;
+}
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hadd(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Device-side a + b: bf16 PTX helper when SM 8.0 is provided, float arithmetic otherwise.
+    __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    val = __internal_sm80_device_hadd(a, b);
+, // fallback: widen both operands to float, add, convert back to bf16
+    const float fa = __bfloat162float(a);
+    const float fb = __bfloat162float(b);
+    // avoid ftz in device code
+    val = __float2bfloat16(__fmaf_ieee_rn(fa, 1.0f, fb)); // fa * 1.0f + fb
+)
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hsub(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Device-side a - b: bf16 PTX helper when SM 8.0 is provided, float arithmetic otherwise.
+    __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    val = __internal_sm80_device_hsub(a, b);
+, // fallback: widen both operands to float, subtract, convert back to bf16
+    const float fa = __bfloat162float(a);
+    const float fb = __bfloat162float(b);
+    // avoid ftz in device code
+    val = __float2bfloat16(__fmaf_ieee_rn(fb, -1.0f, fa)); // fb * -1.0f + fa
+)
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hmul(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Device-side a * b: bf16 PTX helper when SM 8.0 is provided, float arithmetic otherwise.
+    __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    val = __internal_sm80_device_hmul(a, b);
+, // fallback: widen both operands to float, multiply, convert back to bf16
+    const float fa = __bfloat162float(a);
+    const float fb = __bfloat162float(b);
+    // avoid ftz in device code
+    val = __float2bfloat16(__fmaf_ieee_rn(fa, fb, -0.0f)); // fa * fb + (-0.0f)
+)
+    return val;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Returns a + b as bf16; usable from host and device code.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hadd(a, b);
+, // host path: compute in float, then convert the sum back to bf16
+    const float fa = __bfloat162float(a);
+    const float fb = __bfloat162float(b);
+    return __float2bfloat16(fa + fb);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Returns a - b as bf16; usable from host and device code.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hsub(a, b);
+, // host path: compute in float, then convert the difference back to bf16
+    const float fa = __bfloat162float(a);
+    const float fb = __bfloat162float(b);
+    return __float2bfloat16(fa - fb);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Returns a * b as bf16; usable from host and device code.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hmul(a, b);
+, // host path: compute in float, then convert the product back to bf16
+    const float fa = __bfloat162float(a);
+    const float fb = __bfloat162float(b);
+    return __float2bfloat16(fa * fb);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Returns a + b as bf16 through the explicit round-to-nearest (.rn) helper where available.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    return __internal_sm80_device_hadd_rn(a, b);
+, // otherwise delegate to the generic __hadd
+    return __hadd(a, b);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Returns a - b as bf16 through the explicit round-to-nearest (.rn) helper where available.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    return __internal_sm80_device_hsub_rn(a, b);
+, // otherwise delegate to the generic __hsub
+    return __hsub(a, b);
+
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Returns a * b as bf16 through the explicit round-to-nearest (.rn) helper where available.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    return __internal_sm80_device_hmul_rn(a, b);
+, // otherwise delegate to the generic __hmul
+    return __hmul(a, b);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Returns a + b limited from below by 0 and from above by 1 (saturating add).
+    __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    asm( "{ .reg .b16 f, one, zero;\n"
+         "  mov.b16 one, 0x3f80U;\n" // bf16 bit pattern of 1.0
+         "  mov.b16 zero, 0;\n"
+         "  fma.rn.bf16 f, %1, one, %2;\n" // f = a * 1 + b
+         "  max.bf16 f, f, zero;\n" // lower bound 0
+         "  min.bf16 %0, f, one;\n}" // upper bound 1
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+, // fallback: same max-then-min sequence built from the scalar intrinsics
+    val = __hmin(__hmax(__hadd(a, b), CUDART_ZERO_BF16), CUDART_ONE_BF16);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Returns a - b limited from below by 0 and from above by 1 (saturating subtract).
+    __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    asm( "{ .reg .b16 f, one, zero, mone;\n"
+         "  mov.b16 one, 0x3f80U;\n" // bf16 bit pattern of 1.0
+         "  mov.b16 zero, 0;\n"
+         "  mov.b16 mone, 0xbf80U;\n" // bf16 bit pattern of -1.0
+         "  fma.rn.bf16 f, %2, mone, %1;\n" // f = b * (-1) + a
+         "  max.bf16 f, f, zero;\n" // lower bound 0
+         "  min.bf16 %0, f, one;\n}" // upper bound 1
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+, // fallback: same max-then-min sequence built from the scalar intrinsics
+    val = __hmin(__hmax(__hsub(a, b), CUDART_ZERO_BF16), CUDART_ONE_BF16);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Returns a * b limited from below by 0 and from above by 1 (saturating multiply).
+    __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    asm( "{ .reg .b16 f, one, zero, mzero;\n"
+         "  mov.b16 one, 0x3f80U;\n" // bf16 bit pattern of 1.0
+         "  mov.b16 zero, 0;\n"
+         "  mov.b16 mzero, 0x8000U;\n" // bf16 bit pattern of -0.0
+         "  fma.rn.bf16 f, %1, %2, mzero;\n" // f = a * b + (-0.0)
+         "  max.bf16 f, f, zero;\n" // lower bound 0
+         "  min.bf16 %0, f, one;\n}" // upper bound 1
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+, // fallback: same max-then-min sequence built from the scalar intrinsics
+    val = __hmin(__hmax(__hmul(a, b), CUDART_ZERO_BF16), CUDART_ONE_BF16);
+)
+    return val;
+}
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c)
+{ // Fused multiply-add a * b + c on bf16 operands via one fma.rn.bf16 instruction.
+    __nv_bfloat16 val;
+    asm( "{fma.rn.bf16 %0,%1,%2,%3;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c)
+{ // Fused multiply-add a * b + c, then limited from below by 0 and from above by 1.
+    __nv_bfloat16 val;
+    asm( "{ .reg .b16 f, one, zero;\n"
+         "  mov.b16 one, 0x3f80U;\n" // bf16 bit pattern of 1.0
+         "  mov.b16 zero, 0;\n"
+         "  fma.rn.bf16 f, %1, %2, %3;\n" // f = a * b + c
+         "  max.bf16 f, f, zero;\n" // lower bound 0
+         "  min.bf16 %0, f, one;\n}" // upper bound 1
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c)));
+    return val;
+}
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+#define __BINARY_OP_BFLOAT16_MACRO(name) /* do */ /* function-body template: expects bf16 parameters named a and b in scope; not expanded anywhere between this definition and its #undef further down */ {\
+   __nv_bfloat16 val; \
+   asm( "{.reg .b32 a,b,res;\n"\
+        "  mov.b32 a, {0,%1};\n" /* pack the 16-bit operand with a zero half into a 32-bit register */ \
+        "  mov.b32 b, {0,%2};\n"\
+        "  " __CUDA_BF16_STRINGIFY(name) ".f32 res, a, b;\n" /* apply the named f32 instruction */ \
+        "  cvt.rn.bf16.f32 %0, res;}\n" /* convert the f32 result back to bf16 */ \
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \
+   return val; \
+} /* while(0) */
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b) { // Device-side a / b using an approximate f32 division.
+    const float two_126 =  __uint_as_float(0x7E800000U) ; //2^126
+    const float a_f = __bfloat162float(a);
+    float b_f = __bfloat162float(b);
+    float ans;
+    bool b_big = (fabsf(b_f) >= two_126); // very large divisor: pre-scale it, then rescale the quotient below
+    if(b_big){b_f *= 0.25f;}
+
+    // f32 div approximation. Good enough for c-r bfloat div.
+    asm("{ div.approx.f32 %0, %1, %2; }" : "=f"(ans) : "f"(a_f), "f"(b_f));
+
+    // Prevent ftz:
+    if(b_big){ans = __fmaf_ieee_rn(ans, 0.25f, -0.0f);} // undo the divisor pre-scaling: ans * 0.25f + (-0.0f)
+    return __float2bfloat16(ans);
+}
+
+#undef __BINARY_OP_BFLOAT16_MACRO
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b) { // Returns a / b as bf16; usable from host and device code.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hdiv(a, b);
+, // host path: divide in float, then convert the quotient back to bf16
+    const float fa = __bfloat162float(a);
+    const float fb = __bfloat162float(b);
+    return __float2bfloat16(fa / fb);
+)
+}
+
+/******************************************************************************
+*                             __nv_bfloat162 functions                        *
+******************************************************************************/
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat16 __hsin_internal(const __nv_bfloat16 a) { // Shared implementation of hsin/h2sin: sinf() on the widened value, rounded back to bf16.
+    float f = __bfloat162float(a);
+    float r = sinf(f);
+    // Detect compile-time FTZ setting:
+    // if subnormal constant is not flushed to zero at compile-time, then
+    // ftz=off, and it is safe to return result of sinf()
+    // Otherwise, ftz=on, then sinf() result is valid for non-flushed
+    // values, and subnormal input is returned unchanged via else
+    // branch.
+    if ((__uint_as_float(0x00000001U) > 0.0f) || (f != 0.0f))
+    {
+        f = r;
+    } // when the condition is false, f still holds the converted input
+    return __float2bfloat16_rn(f);
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a) { // Sine of a bf16 value; forwards to __hsin_internal.
+    return __hsin_internal(a);
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a) { // Element-wise sine of a bf16 pair.
+    const __nv_bfloat16 l = __low2bfloat16(a);
+    const __nv_bfloat16 h = __high2bfloat16(a);
+    return __halves2bfloat162(__hsin_internal(l), __hsin_internal(h)); // recombine: low result first, high result second
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hcos_internal(const __nv_bfloat16 a) { // Shared implementation of hcos/h2cos: cosf() on the widened value, rounded back to bf16.
+    float f = __bfloat162float(a);
+    f = cosf(f);
+    return __float2bfloat16_rn(f);
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a) { // Cosine of a bf16 value; forwards to __hcos_internal.
+    return __hcos_internal(a);
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a) { // Element-wise cosine of a bf16 pair.
+    const __nv_bfloat16 l = __low2bfloat16(a);
+    const __nv_bfloat16 h = __high2bfloat16(a);
+    return __halves2bfloat162(__hcos_internal(l), __hcos_internal(h)); // recombine: low result first, high result second
+}
+
+__CUDA_BF16_DECL__ float __internal_device_fast_bf16exp(const float x)
+{ // Approximate e^x in float, evaluated as 2^(x * log2(e)) with ex2.approx.f32.
+    const float log2e_up = __uint_as_float(0x3FB8AA3CU); // 0x3FB8AA3C is ~1.442695f, i.e. log2(e)
+    float fa = x * log2e_up;
+    asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fa)); // fa = 2^fa, in place
+    return fa;
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a) { // e^a for a bf16 value: widen to float, fast exp helper, round back to bf16.
+    float fa = __bfloat162float(a);
+    fa = __internal_device_fast_bf16exp(fa);
+    return __float2bfloat16_rn(fa);
+}
+
+#define __APPROX_FCAST2(fun) /* do */ /* function-body template: expects a bf16x2 parameter named a in scope; applies fun.approx.f32 to each half and returns the repacked pair */ {\
+   __nv_bfloat162 val;\
+   asm("{.reg.b16         hl, hu;         \n"\
+                " .reg.b32         fl, fu;         \n"\
+                "  mov.b32         {hl, hu}, %1;   \n" /* split the 32-bit input into its two 16-bit halves */ \
+                "  mov.b32         fl, {0,hl};     \n" /* pack each half with a zero half into a 32-bit register */ \
+                "  mov.b32         fu, {0,hu};     \n"\
+                "  " __CUDA_BF16_STRINGIFY(fun) ".approx.f32   fl, fl;     \n"\
+                "  " __CUDA_BF16_STRINGIFY(fun) ".approx.f32   fu, fu;     \n"\
+                "  cvt.rn.bf16.f32    hl, fl;     \n" /* convert each f32 result back to bf16 */ \
+                "  cvt.rn.bf16.f32    hu, fu;     \n"\
+                "  mov.b32         %0, {hl, hu};   \n" /* repack the two halves into the 32-bit output */ \
+                "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));       \
+   return val;\
+} /* while(0) */
+#define __BF16_SPEC_CASE2(i,r, spc, ulp) /* PTX snippet: p = (i == spc) per 16-bit half, then r = p * ulp + r */ \
+   "{.reg.b32 spc, ulp, p;\n" /* inside the string literals spc and ulp are literal register names, not macro parameters */ \
+   "  mov.b32 spc," __CUDA_BF16_STRINGIFY(spc) ";\n"\
+   "  mov.b32 ulp," __CUDA_BF16_STRINGIFY(ulp) ";\n"\
+   "  set.eq.f16x2.f16x2 p," __CUDA_BF16_STRINGIFY(i) ", spc;\n"\
+   "  fma.rn.bf16x2 " __CUDA_BF16_STRINGIFY(r) ",p,ulp," __CUDA_BF16_STRINGIFY(r) ";\n}\n"
+
+__CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a) { // Element-wise e^x of a bf16 pair, evaluated as 2^(x * log2(e)).
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat162 val;
+    asm("{.reg.b16         hl, hu;         \n"
+        " .reg.b32         h,r,fl,fu, C;   \n"
+        "  mov.b32         {hl, hu}, %1;   \n" // split the input into its two 16-bit halves
+        "  mov.b32         h, %1;          \n"
+        "  mov.b32         fl, {0,hl};     \n" // pack each half with a zero half into a 32-bit register
+        "  mov.b32         fu, {0,hu};     \n"
+        "  mov.b32         C, 0x3FB8AA3CU;  \n" // same constant as log2e_up in __internal_device_fast_bf16exp
+        "  mul.f32         fl,fl,C;        \n"
+        "  mul.f32         fu,fu,C;        \n"
+        "  ex2.approx.f32      fl, fl;     \n"
+        "  ex2.approx.f32      fu, fu;     \n"
+        "  cvt.rn.bf16.f32    hl, fl;     \n" // convert each f32 result back to bf16
+        "  cvt.rn.bf16.f32    hu, fu;     \n"
+        "  mov.b32         r, {hl, hu};    \n"
+        "  mov.b32         %0, r;  \n"
+        "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return val;
+, // fallback: run the scalar float helper on each half and repack
+    return __floats2bfloat162_rn( __internal_device_fast_bf16exp(__low2float(a)), __internal_device_fast_bf16exp(__high2float(a)) );
+)
+}
+
+__CUDA_BF16_DECL__ float __internal_device_tanhf_noftz(const float x)
+{ // tanhf(x) with a guard that hands the input back when it compares equal to zero under FTZ.
+    float f = x;
+    float r = tanhf(x);
+    // Detect compile-time FTZ setting:
+    // if subnormal constant is not flushed to zero at compile-time, then
+    // ftz=off, and it is safe to return result of tanhf()
+    // Otherwise, ftz=on, then tanhf() result is valid for non-flushed
+    // values, and subnormal input is returned unchanged via else
+    // branch.
+    if ((__uint_as_float(0x00000001U) > 0.0f) || (f != 0.0f))
+    {
+        f = r;
+    } // when the condition is false, f still holds x
+    return f;
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 htanh(const __nv_bfloat16 a) { // Hyperbolic tangent of a bf16 value, computed in float and rounded back.
+    float f = __bfloat162float(a);
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_75,
+    asm("{ tanh.approx.f32 %0, %0; }" : "+f"(f)); // in-place approximate tanh instruction
+, // otherwise use the tanhf()-based helper
+    f = __internal_device_tanhf_noftz(f);
+)
+    __nv_bfloat16 h = __float2bfloat16_rn(f);
+    return h;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh(const __nv_bfloat162 a) { // Element-wise hyperbolic tangent of a bf16 pair, computed in float and rounded back.
+    float2 f = __bfloat1622float2(a);
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_75,
+    asm("{ tanh.approx.f32 %0, %0; }" : "+f"(f.x)); // in-place approximate tanh on each component
+    asm("{ tanh.approx.f32 %0, %0; }" : "+f"(f.y));
+, // otherwise use the tanhf()-based helper per component
+    f.x = __internal_device_tanhf_noftz(f.x);
+    f.y = __internal_device_tanhf_noftz(f.y);
+)
+    __nv_bfloat162 h = __float22bfloat162_rn(f);
+    return h;
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 htanh_approx(const __nv_bfloat16 a) { // Approximate tanh of a bf16 value; native bf16 instruction on SM 9.0, htanh otherwise.
+    __nv_bfloat16 r;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16_raw hr = (__nv_bfloat16_raw)a; // work on the raw 16-bit representation
+    asm("tanh.approx.bf16 %0, %0;" : "+h"(hr.x));
+    r = (__nv_bfloat16)hr;
+, // fallback: float-based htanh
+    r = htanh(a);
+)
+    return r;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh_approx(const __nv_bfloat162 a) { // Approximate element-wise tanh of a bf16 pair; native bf16x2 instruction on SM 9.0, h2tanh otherwise.
+    __nv_bfloat162 res;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("tanh.approx.bf16x2 %0, %1;" : "=r"(__BFLOAT162_TO_UI(res)) : "r"(__BFLOAT162_TO_CUI(a)));
+, // fallback: float-based h2tanh
+    res = h2tanh(a);
+)
+    return res;
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a) { // 2^a for a bf16 value via ex2.approx.f32 on the widened float.
+    float fa = __bfloat162float(a);
+    asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fa)); // fa = 2^fa, in place
+    return __float2bfloat16_rn(fa);
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a) { // Element-wise 2^x of a bf16 pair.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __APPROX_FCAST2(ex2) // expands to a complete body that returns the result
+, // fallback: process each half as float and repack
+    float fl = __low2float(a);
+    asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fl));
+    float fh = __high2float(a);
+    asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fh));
+    return __floats2bfloat162_rn( fl, fh );
+)
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a) { // 10^a for a bf16 value, evaluated as 2^(a * log2(10)).
+    const float log10_2 = __uint_as_float(0x40549A78U); // NOTE: despite the name, 0x40549A78 is ~3.321928f, i.e. log2(10)
+    float fa = __bfloat162float(a) * log10_2;
+    asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fa)); // fa = 2^fa, in place
+    __nv_bfloat16 r = __float2bfloat16_rn(fa);
+    __nv_bfloat16_raw araw = static_cast<__nv_bfloat16_raw>(a);
+    if (araw.x == (unsigned short)0xBC95U) // single input bit pattern whose result is overridden
+    {
+        araw.x = 0x3f75U; // fixed result bit pattern (presumably a rounding correction for this input; reason not stated in source)
+        r = static_cast<__nv_bfloat16>(araw);
+    }
+    return r;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a) { // Element-wise 10^x of a bf16 pair, evaluated as 2^(x * log2(10)).
+    __nv_bfloat162 r;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    asm("{.reg.b16         hl, hu;         \n"
+        " .reg.b32         h,r,fl,fu, C;   \n"
+        "  mov.b32         {hl, hu}, %1;   \n" // split the input into its two 16-bit halves
+        "  mov.b32         fl, {0,hl};     \n" // pack each half with a zero half into a 32-bit register
+        "  mov.b32         fu, {0,hu};     \n"
+        "  mov.b32         C, 0x40549A78U;  \n" // ~3.321928f, i.e. log2(10)
+        "  mul.f32         fl,fl,C;        \n"
+        "  mul.f32         fu,fu,C;        \n"
+        "  ex2.approx.f32      fl, fl;     \n"
+        "  ex2.approx.f32      fu, fu;     \n"
+        "  cvt.rn.bf16.f32    hl, fl;     \n" // convert each f32 result back to bf16
+        "  cvt.rn.bf16.f32    hu, fu;     \n"
+        "  mov.b32         r, {hl, hu};    \n"
+        __BF16_SPEC_CASE2(%1, r, 0xBC95BC95U,0xBF00BF00U) // adjust r in halves where the input equals 0xBC95
+        "  mov.b32         %0, r;  \n"
+        "}":"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a)));
+, // fallback: process each half as float, repack, then patch the special input
+    const float log10_2 = __uint_as_float(0x40549A78U); // NOTE: despite the name, this value is log2(10)
+    float fl = __low2float(a) * log10_2;
+    asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fl));
+
+    float fh = __high2float(a) * log10_2;
+    asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fh));
+
+    r = __floats2bfloat162_rn( fl, fh );
+
+    const __nv_bfloat162_raw araw = static_cast<__nv_bfloat162_raw>(a);
+    if (araw.x == (unsigned short)0xBC95U) // same special input as in hexp10, checked for the x half
+    {
+        __nv_bfloat16_raw raw_fix;
+        raw_fix.x = (unsigned short)0x3f75U; // fixed result bit pattern
+        r.x = static_cast<__nv_bfloat16>(raw_fix);
+    }
+    if (araw.y == (unsigned short)0xBC95U) // same check for the y half
+    {
+        __nv_bfloat16_raw raw_fix;
+        raw_fix.x = (unsigned short)0x3f75U;
+        r.y = static_cast<__nv_bfloat16>(raw_fix);
+    }
+)
+    return r;
+}
+
+__CUDA_BF16_DECL__ float __internal_device_fast_bf16log2(float x)
+{ // Approximate base-2 logarithm of a float via lg2.approx.f32.
+    asm("{ lg2.approx.f32 %0, %0; }" : "+f"(x)); // x = log2(x), in place on the by-value parameter
+    return x;
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a) { // Base-2 logarithm of a bf16 value: widen, fast log2 helper, round back.
+    float fa = __bfloat162float(a);
+    fa = __internal_device_fast_bf16log2(fa);
+    return __float2bfloat16_rn(fa);
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat162 h2log2(const __nv_bfloat162 a) { // Element-wise base-2 logarithm of a bf16 pair.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __APPROX_FCAST2(lg2) // expands to a complete body that returns the result
+, // fallback: process each half as float and repack
+    float fl = __low2float(a);
+    fl = __internal_device_fast_bf16log2(fl);
+    float fh = __high2float(a);
+    fh = __internal_device_fast_bf16log2(fh);
+    return __floats2bfloat162_rn( fl, fh );
+)
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a) { // Natural logarithm of a bf16 value, evaluated as log2(a) * ln(2).
+    const float flt_ln2 = __uint_as_float(0x3f317218U); // ~0.693147f, i.e. ln(2)
+    float fa = __bfloat162float(a);
+    fa = __internal_device_fast_bf16log2(fa);
+    fa = fa * flt_ln2;
+    return __float2bfloat16_rn(fa);
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a) { // Element-wise natural logarithm of a bf16 pair, evaluated as log2(x) * ln(2).
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat162 val;
+    asm("{.reg.b16         hl, hu;             \n"
+        " .reg.b32         r, fl, fu, C, h;    \n"
+        "  mov.b32         {hl, hu}, %1;       \n" // split the input into its two 16-bit halves
+        "  mov.b32         h, %1;              \n"
+        "  mov.b32         fl, {0,hl};         \n" // pack each half with a zero half into a 32-bit register
+        "  mov.b32         fu, {0,hu};         \n"
+        "  lg2.approx.f32      fl, fl;         \n"
+        "  lg2.approx.f32      fu, fu;         \n"
+        "  mov.b32         C, 0x3f317218U;     \n" // ~0.693147f, i.e. ln(2)
+        "  mul.f32         fl,fl,C;            \n"
+        "  mul.f32         fu,fu,C;            \n"
+        "  cvt.rn.bf16.f32    hl, fl;         \n" // convert each f32 result back to bf16
+        "  cvt.rn.bf16.f32    hu, fu;         \n"
+        "  mov.b32         r, {hl, hu};        \n"
+        "  mov.b32         %0, r;              \n"
+        "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return val;
+, // fallback: process each half as float and repack
+    const float flt_ln2 = __uint_as_float(0x3f317218U);
+
+    float fl = __low2float(a);
+    fl = __internal_device_fast_bf16log2(fl);
+    fl = fl * flt_ln2;
+
+    float fh = __high2float(a);
+    fh = __internal_device_fast_bf16log2(fh);
+    fh = fh * flt_ln2;
+
+    return __floats2bfloat162_rn( fl, fh );
+)
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a) { // Base-10 logarithm of a bf16 value, evaluated as log2(a) * log10(2).
+    const float flt_log10_2 = __uint_as_float(0x3E9A209BU); // ~0.30103f, i.e. log10(2)
+    float fa = __bfloat162float(a);
+    fa = __internal_device_fast_bf16log2(fa);
+    fa = fa * flt_log10_2;
+    return __float2bfloat16_rn(fa);
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a) { // Element-wise base-10 logarithm of a bf16 pair, evaluated as log2(x) * log10(2).
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat162 val;
+    asm("{.reg.b16         hl, hu;             \n"
+        " .reg.b32         r, fl, fu, C, h;    \n"
+        "  mov.b32         {hl, hu}, %1;       \n" // split the input into its two 16-bit halves
+        "  mov.b32         h, %1;              \n"
+        "  mov.b32         fl, {0,hl};         \n" // pack each half with a zero half into a 32-bit register
+        "  mov.b32         fu, {0,hu};         \n"
+        "  lg2.approx.f32      fl, fl;         \n"
+        "  lg2.approx.f32      fu, fu;         \n"
+        "  mov.b32         C, 0x3E9A209BU;      \n" // ~0.30103f, i.e. log10(2)
+        "  mul.f32         fl,fl,C;            \n"
+        "  mul.f32         fu,fu,C;            \n"
+        "  cvt.rn.bf16.f32    hl, fl;         \n" // convert each f32 result back to bf16
+        "  cvt.rn.bf16.f32    hu, fu;         \n"
+        "  mov.b32         r, {hl, hu};        \n"
+        "  mov.b32         %0, r;              \n"
+        "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return val;
+, // fallback: process each half as float and repack
+    const float flt_log10_2 = __uint_as_float(0x3E9A209BU);
+
+    float fl = __low2float(a);
+    fl = __internal_device_fast_bf16log2(fl);
+    fl = fl * flt_log10_2;
+
+    float fh = __high2float(a);
+    fh = __internal_device_fast_bf16log2(fh);
+    fh = fh * flt_log10_2;
+
+    return __floats2bfloat162_rn( fl, fh );
+)
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a) { // Element-wise approximate reciprocal of a bf16 pair.
+    float fl = __low2float(a);
+    asm("{ rcp.approx.f32 %0, %0; }" : "+f"(fl)); // in-place reciprocal of the low half
+    float fh = __high2float(a);
+    asm("{ rcp.approx.f32 %0, %0; }" : "+f"(fh)); // in-place reciprocal of the high half
+    return __floats2bfloat162_rn( fl, fh );
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a) { // Approximate reciprocal of a bf16 value via rcp.approx.f32 on the widened float.
+    float fa = __bfloat162float(a);
+    asm("{ rcp.approx.f32 %0, %0; }" : "+f"(fa));
+    return __float2bfloat16_rn(fa);
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a) { // Element-wise approximate reciprocal square root of a bf16 pair.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __APPROX_FCAST2(rsqrt) // expands to a complete body that returns the result
+, // fallback: process each half as float and repack
+    float fl = __low2float(a);
+    asm("{ rsqrt.approx.f32 %0, %0; }" : "+f"(fl));
+    float fh = __high2float(a);
+    asm("{ rsqrt.approx.f32 %0, %0; }" : "+f"(fh));
+    return __floats2bfloat162_rn( fl, fh );
+)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a) { // Approximate reciprocal square root of a bf16 value via rsqrt.approx.f32.
+    float fa = __bfloat162float(a);
+    asm("{ rsqrt.approx.f32 %0, %0; }" : "+f"(fa));
+    return __float2bfloat16_rn(fa);
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a) { // Element-wise approximate square root of a bf16 pair.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __APPROX_FCAST2(sqrt) // expands to a complete body that returns the result
+, // fallback: process each half as float and repack
+    float fl = __low2float(a);
+    asm("{ sqrt.approx.f32 %0, %0; }" : "+f"(fl));
+    float fh = __high2float(a);
+    asm("{ sqrt.approx.f32 %0, %0; }" : "+f"(fh));
+    return __floats2bfloat162_rn( fl, fh );
+)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a) { // Approximate square root of a bf16 value via sqrt.approx.f32.
+    float fa = __bfloat162float(a);
+    asm("{ sqrt.approx.f32 %0, %0; }" : "+f"(fa));
+    return __float2bfloat16_rn(fa);
+}
+#undef __APPROX_FCAST2
+#undef __BF16_SPEC_CASE2
+
+__CUDA_BF16_DECL__ bool __internal_device_hisnan(const __nv_bfloat16 a)
+{ // Device-side NaN test: true when the set.nan comparison of a with itself yields a non-zero result.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 r;
+    asm("{set.nan.bf16.bf16 %0,%1,%1;\n}"
+        :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a)));
+    return __BFLOAT16_TO_CUS(r) != 0U; // inspect the raw bits of the comparison result
+, // otherwise: run the same comparison on the value packed into a 32-bit register
+    unsigned int r;
+    asm( "{.reg .b32 a;\n"
+         "  mov.b32 a, {0,%1};\n"
+         "  set.nan.f32.f32 %0, a, a;}\n"
+         :"=r"(r) : "h"(__BFLOAT16_TO_CUS(a)));
+    return r != 0U;
+)
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a)
+{ // Element-wise NaN test on a bf16 pair; the result is itself a bf16 pair.
+    __nv_bfloat162 r;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("{set.nan.bf16x2.bf16x2 %0,%1,%1;\n}"
+        :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a)));
+, // fallback: test each half with __hisnan and encode the answer as 1.0 or 0.0
+    __nv_bfloat162_raw val;
+    val.x = __hisnan(a.x) ? (unsigned short)0x3F80U : (unsigned short)0U; // 0x3F80 is the bf16 bit pattern of 1.0
+    val.y = __hisnan(a.y) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    r = __nv_bfloat162(val);
+)
+    return r;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a)
+{ // Returns true when a is NaN; usable from host and device code.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hisnan(a);
+, // host path: inspect the raw bits directly
+    const __nv_bfloat16_raw hr = static_cast<__nv_bfloat16_raw>(a);
+    return ((hr.x & 0x7FFFU) > 0x7F80U); // drop the sign bit; anything above the infinity pattern 0x7F80 is NaN
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a)
+{ // Element-wise negation of a bf16 pair.
+    __nv_bfloat162 r;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    asm("{neg.bf16x2 %0,%1;\n}"
+        :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a)));
+, // fallback: negate each half with the scalar __hneg
+    r.x = __hneg(a.x);
+    r.y = __hneg(a.y);
+)
+    return r;
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hneg(const __nv_bfloat16 a)
+{ // Device-side negation of a bf16 value.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat16 r;
+    asm("{neg.bf16 %0,%1;\n}"
+        :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a)));
+    return r;
+, // fallback: negate in float through an fma, then convert back to bf16
+    const float fa = __bfloat162float(a);
+    return __float2bfloat16(__fmaf_ieee_rn(fa, -1.0f, -0.0f)); // fa * -1.0f + (-0.0f)
+)
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a)
+{ // Returns -a as bf16; usable from host and device code.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hneg(a);
+, // host path: negate in float, then convert back to bf16
+    const float fa = __bfloat162float(a);
+    return __float2bfloat16(-fa);
+)
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a)
+{ // Element-wise absolute value of a bf16 pair.
+    __nv_bfloat162 r;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    asm("{abs.bf16x2 %0,%1;\n}"
+        :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a)));
+, // fallback: take the absolute value of each half with the scalar __habs
+    r.x = __habs(a.x);
+    r.y = __habs(a.y);
+)
+    return r;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a)
+{ // Absolute value of a bf16 value.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat16 r;
+    asm("{abs.bf16 %0,%1;\n}"
+        :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a)));
+    return r;
+, // fallback: manipulate the raw bits
+    __nv_bfloat16_raw abs_a_raw = static_cast<__nv_bfloat16_raw>(a);
+    abs_a_raw.x &= (unsigned short)0x7FFFU; // clear the sign bit
+    if (abs_a_raw.x > (unsigned short)0x7F80U) // above the infinity pattern: the input was NaN
+    {
+        // return canonical NaN
+        abs_a_raw.x = (unsigned short)0x7FFFU;
+    }
+    return static_cast<__nv_bfloat16>(abs_a_raw);
+)
+}
+
+/******************************************************************************
+*                             __nv_bfloat16 arithmetic                             *
+******************************************************************************/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Maximum of two bf16 values.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat16 val;
+    asm( "{ max.bf16 %0,%1,%2;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+    return val;
+, // fallback: explicit comparison with NaN and signed-zero handling
+    __nv_bfloat16 maxval;
+
+    maxval = (__hge(a, b) || __hisnan(b)) ? a : b; // prefer a when a >= b or when b is NaN
+
+    if (__hisnan(maxval))
+    {
+        // if both inputs are NaN, return canonical NaN
+        maxval = CUDART_NAN_BF16;
+    }
+    else if (__heq(a, b))
+    {
+        // hmax(+0.0, -0.0) = +0.0
+        // unsigned compare 0x8000U > 0x0000U
+        __nv_bfloat16_raw ra = __nv_bfloat16_raw(a);
+        __nv_bfloat16_raw rb = __nv_bfloat16_raw(b);
+        maxval = (ra.x > rb.x) ? b : a; // pick the operand with the smaller raw bit pattern
+    }
+
+    return maxval;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Minimum of two bf16 values.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat16 val;
+    asm( "{ min.bf16 %0,%1,%2;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+    return val;
+, // fallback: explicit comparison with NaN and signed-zero handling
+    __nv_bfloat16 minval;
+
+    minval = (__hle(a, b) || __hisnan(b)) ? a : b; // prefer a when a <= b or when b is NaN
+
+    if (__hisnan(minval))
+    {
+        // if both inputs are NaN, return canonical NaN
+        minval = CUDART_NAN_BF16;
+    }
+    else if (__heq(a, b))
+    {
+        // hmin(+0.0, -0.0) = -0.0
+        // unsigned compare 0x8000U > 0x0000U
+        __nv_bfloat16_raw ra = __nv_bfloat16_raw(a);
+        __nv_bfloat16_raw rb = __nv_bfloat16_raw(b);
+        minval = (ra.x > rb.x) ? a : b; // pick the operand with the larger raw bit pattern
+    }
+
+    return minval;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Maximum of two bf16 values, NaN-propagating variant.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat16 val;
+    asm( "{ max.NaN.bf16 %0,%1,%2;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+    return val;
+, // fallback: explicit NaN check followed by an ordinary comparison
+    __nv_bfloat16 maxval;
+
+    if (__hisnan(a) || __hisnan(b))
+    {
+        // if either input is NaN, return canonical NaN
+        maxval = CUDART_NAN_BF16;
+    }
+    else
+    {
+        maxval = __hge(a, b) ? a : b; // a wins ties
+    }
+
+    return maxval;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{ // Minimum of two bf16 values, NaN-propagating variant.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat16 val;
+    asm( "{ min.NaN.bf16 %0,%1,%2;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+    return val;
+, // fallback: explicit NaN check followed by an ordinary comparison
+    __nv_bfloat16 minval;
+
+    if (__hisnan(a) || __hisnan(b))
+    {
+        // if either input is NaN, return canonical NaN
+        minval = CUDART_NAN_BF16;
+    }
+    else
+    {
+        minval = __hle(a, b) ? a : b; // a wins ties
+    }
+
+    return minval;
+)
+}
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c)
+{ // Fused multiply-add a * b + c using the .relu form of the bf16 fma instruction.
+    __nv_bfloat16 val;
+    asm( "{ fma.rn.relu.bf16 %0,%1,%2,%3;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c)));
+    return val;
+}
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */
+/******************************************************************************
+*                            __nv_bfloat162 arithmetic                             *
+******************************************************************************/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Element-wise maximum of two bf16 pairs.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat162 val;
+    asm( "{ max.bf16x2 %0,%1,%2;\n}"
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+    return val;
+, // fallback: apply the scalar __hmax to each half
+    __nv_bfloat162 val;
+    val.x = __hmax(a.x, b.x);
+    val.y = __hmax(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Element-wise minimum of two bf16 pairs.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat162 val;
+    asm( "{ min.bf16x2 %0,%1,%2;\n}"
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+    return val;
+, // fallback: apply the scalar __hmin to each half
+    __nv_bfloat162 val;
+    val.x = __hmin(a.x, b.x);
+    val.y = __hmin(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Element-wise maximum of two bf16 pairs, NaN-propagating variant.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat162 val;
+    asm( "{ max.NaN.bf16x2 %0,%1,%2;\n}"
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+    return val;
+, // fallback: apply the scalar __hmax_nan to each half
+    __nv_bfloat162 val;
+    val.x = __hmax_nan(a.x, b.x);
+    val.y = __hmax_nan(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ // Element-wise minimum of two bf16 pairs, NaN-propagating variant.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat162 val;
+    asm( "{ min.NaN.bf16x2 %0,%1,%2;\n}"
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+    return val;
+, // fallback: apply the scalar __hmin_nan to each half
+    __nv_bfloat162 val;
+    val.x = __hmin_nan(a.x, b.x);
+    val.y = __hmin_nan(a.y, b.y);
+    return val;
+)
+}
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c)
+{ // Element-wise fused multiply-add a * b + c on bf16 pairs using the .relu form of the fma instruction.
+    __nv_bfloat162 val;
+    asm( "{ fma.rn.relu.bf16x2 %0,%1,%2,%3;\n}"
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c)));
+    return val;
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c)
+{ // Complex multiply-accumulate a * b + c, with .x used as the real part and .y as the imaginary part.
+    // fast version of complex multiply-accumulate
+    // (a.re, a.im) * (b.re, b.im) + (c.re, c.im)
+    // acc.re = (c.re + a.re*b.re) - a.im*b.im
+    // acc.im = (c.im + a.re*b.im) + a.im*b.re
+    __nv_bfloat16 real_tmp = __hfma(a.x, b.x, c.x); // c.re + a.re*b.re
+    __nv_bfloat16 img_tmp  = __hfma(a.x, b.y, c.y); // c.im + a.re*b.im
+    real_tmp = __hfma(__hneg(a.y), b.y, real_tmp); // subtract a.im*b.im by negating a.im
+    img_tmp  = __hfma(a.y,         b.x, img_tmp); // add a.im*b.re
+    return make_bfloat162(real_tmp, img_tmp);
+}
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */
+
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/* Define __PTR for atomicAdd prototypes below, undef after done */
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __PTR   "l" /* inline-asm constraint string for the address operand on _WIN64 / __LP64__ / NVRTC builds */
+#else
+#define __PTR   "r" /* inline-asm constraint string for the address operand on all other builds */
+#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
+
+__CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val)
+{ // Atomically adds val to the bf16 pair stored at address.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat162 r;
+    asm volatile ("{ atom.add.noftz.bf16x2 %0,[%1],%2; }\n"
+                  : "=r"(__BFLOAT162_TO_UI(r)) : __PTR(address), "r"(__BFLOAT162_TO_CUI(val))
+                  : "memory");
+    return r; // value written to %0 by the atom instruction
+, // fallback: compare-and-swap loop on the 32-bit word holding the pair
+    unsigned int* address_as_uint = (unsigned int*)address;
+    unsigned int old = *address_as_uint;
+    unsigned int assumed;
+    do {
+        assumed = old;
+        __nv_bfloat162 new_val = __hadd2(val, *(__nv_bfloat162*)&assumed); // sum computed from the last observed contents
+        old = atomicCAS(address_as_uint, assumed, *(unsigned int*)&new_val);
+    } while (assumed != old); // retry when atomicCAS reports different contents than assumed
+    return *(__nv_bfloat162*)&old; // contents observed just before the successful swap
+)
+}
+
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val)
+{ // Atomically adds val to the bf16 value stored at address.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 r;
+    asm volatile ("{ atom.add.noftz.bf16 %0,[%1],%2; }\n"
+                  : "=h"(__BFLOAT16_TO_US(r))
+                  : __PTR(address), "h"(__BFLOAT16_TO_CUS(val))
+                  : "memory");
+    return r; // value written to %0 by the atom instruction
+, // fallback: compare-and-swap loop on the 16-bit word holding the value
+    unsigned short int* address_as_us = (unsigned short int*)address;
+    unsigned short int old = *address_as_us;
+    unsigned short int assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_us, assumed,
+            __bfloat16_as_ushort(__hadd(val, __ushort_as_bfloat16(assumed)))); // proposed contents: val + last observed value
+    } while (assumed != old); // retry when atomicCAS reports different contents than assumed
+    return __ushort_as_bfloat16(old); // contents observed just before the successful swap
+)
+}
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) */
+
+#undef __PTR
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+#endif /* !(defined __DOXYGEN_ONLY__) */
+
+#endif /* defined(__cplusplus) */
+
+#undef __CUDA_HOSTDEVICE_BF16_DECL__
+#undef __CUDA_BF16_DECL__
+#undef __CUDA_BF16_CONSTEXPR__
+ 
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16)
+#undef __CPP_VERSION_AT_LEAST_11_BF16
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */
+
+#undef ___CUDA_BF16_STRINGIFY_INNERMOST
+#undef __CUDA_BF16_STRINGIFY
+
+#endif /* end of include guard: __CUDA_BF16_HPP__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_device_runtime_api.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_device_runtime_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..07b7ec75b43325ea76b50f718822b1caf82cc9da
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_device_runtime_api.h
@@ -0,0 +1,914 @@
+/*
+ * Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_DEVICE_RUNTIME_API_H__)
+#define __CUDA_DEVICE_RUNTIME_API_H__
+
+#if defined(__CUDACC__) && !defined(__CUDACC_RTC__)
+#include <stdlib.h>
+#endif
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined(CUDA_FORCE_CDP1_IF_SUPPORTED) && !defined(__CUDADEVRT_INTERNAL__) && !defined(_NVHPC_CUDA) && !(defined(_WIN32) && !defined(_WIN64))
+#define __CUDA_INTERNAL_USE_CDP2
+#endif
+
+#if !defined(__CUDACC_RTC__)
+
+#if !defined(__CUDACC_INTERNAL_NO_STUBS__) && !defined(__CUDACC_RDC__) && !defined(__CUDACC_EWP__) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__)
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+struct cudaFuncAttributes;
+
+// Bug 4398304
+// WAR for doxygen processing duplicate entries causing warnings to be listed in the documentation
+/** \cond impl_private */
+
+#ifndef __CUDA_INTERNAL_USE_CDP2 /* CDP1 naming: the stubs carry the public cuda* names. The whole group is compiled only under the enclosing guard (no RDC, no EWP, __CUDA_ARCH__ >= 350, stubs not disabled). */
+inline __device__  cudaError_t CUDARTAPI cudaMalloc(void **p, size_t s)
+{
+  return cudaErrorUnknown; /* every stub in this #ifndef/#else group ignores its arguments and returns this code */
+}
+
+inline __device__  cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c)
+{
+  return cudaErrorUnknown;
+}
+
+inline __device__  cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)
+{
+  return cudaErrorUnknown;
+}
+
+inline __device__  cudaError_t CUDARTAPI cudaGetDevice(int *device)
+{
+  return cudaErrorUnknown;
+}
+
+inline __device__  cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
+{
+  return cudaErrorUnknown;
+}
+
+inline __device__  cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
+{
+  return cudaErrorUnknown;
+}
+#else // __CUDA_INTERNAL_USE_CDP2 is defined: the same six stubs under the internal __cudaCDP2* names
+inline __device__  cudaError_t CUDARTAPI __cudaCDP2Malloc(void **p, size_t s)
+{
+  return cudaErrorUnknown; /* same unconditional failure code as the CDP1-named stubs */
+}
+
+inline __device__  cudaError_t CUDARTAPI __cudaCDP2FuncGetAttributes(struct cudaFuncAttributes *p, const void *c)
+{
+  return cudaErrorUnknown;
+}
+
+inline __device__  cudaError_t CUDARTAPI __cudaCDP2DeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)
+{
+  return cudaErrorUnknown;
+}
+
+inline __device__  cudaError_t CUDARTAPI __cudaCDP2GetDevice(int *device)
+{
+  return cudaErrorUnknown;
+}
+
+inline __device__  cudaError_t CUDARTAPI __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
+{
+  return cudaErrorUnknown;
+}
+
+inline __device__  cudaError_t CUDARTAPI __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
+{
+  return cudaErrorUnknown;
+}
+#endif // __CUDA_INTERNAL_USE_CDP2
+
+/** \endcond  */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* !defined(__CUDACC_INTERNAL_NO_STUBS__) && !defined(__CUDACC_RDC__) &&  !defined(__CUDACC_EWP__) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__) */
+
+#endif /* !defined(__CUDACC_RTC__) */
+
+#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) /* documentation build or explicit opt-in: the marker expands to nothing */
+# define __DEPRECATED__(msg)
+#elif defined(_WIN32) /* Windows: __declspec spelling, message forwarded */
+# define __DEPRECATED__(msg) __declspec(deprecated(msg))
+#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__)))) /* GNUC below 4.5: attribute is emitted without the message argument, so msg is dropped */
+# define __DEPRECATED__(msg) __attribute__((deprecated))
+#else /* every other compiler: attribute with the message attached */
+# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
+#endif
+
+#if defined(__CUDA_ARCH__) && !defined(__CDPRT_SUPPRESS_SYNC_DEPRECATION_WARNING) /* __CUDA_ARCH__ is defined and the warning has not been suppressed: wrap __DEPRECATED__ with a message naming func_name */
+# define __CDPRT_DEPRECATED(func_name) __DEPRECATED__("Use of "#func_name" from device code is deprecated. Moreover, such use will cause this module to fail to load on sm_90+ devices. If calls to "#func_name" from device code cannot be removed for older devices at this time, you may guard them with __CUDA_ARCH__ macros to remove them only for sm_90+ devices, making sure to generate code for compute_90 for the macros to take effect. Note that this mitigation will no longer work when support for "#func_name" from device code is eventually dropped for all devices. Disable this warning with -D__CDPRT_SUPPRESS_SYNC_DEPRECATION_WARNING.")
+#else /* __CUDA_ARCH__ undefined or warning suppressed: the marker expands to nothing */
+# define __CDPRT_DEPRECATED(func_name)
+#endif
+
+#if defined(__cplusplus) && defined(__CUDACC__)         /* Visible to nvcc front-end only */
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)   // Visible to SM>=3.5 and "__host__ __device__" only
+
+#include "driver_types.h"
+#include "crt/host_defines.h"
+
+#define cudaStreamGraphTailLaunch             (cudaStream_t)0x0100000000000000 /* constant cast to cudaStream_t (expansion is not parenthesized). NOTE(review): presumably a stream argument for device-side cudaGraphLaunch requesting tail launch - confirm */
+#define cudaStreamGraphFireAndForget          (cudaStream_t)0x0200000000000000 /* NOTE(review): presumably the fire-and-forget counterpart of the value above - confirm */
+#define cudaStreamGraphFireAndForgetAsSibling (cudaStream_t)0x0300000000000000 /* NOTE(review): presumably fire-and-forget launched as a sibling graph - confirm */
+
+#ifdef __CUDA_INTERNAL_USE_CDP2
+#define cudaStreamTailLaunch                ((cudaStream_t)0x3) /**< Per-grid stream with tail-launch semantics. Only applicable when used with CUDA Dynamic Parallelism. */
+#define cudaStreamFireAndForget             ((cudaStream_t)0x4) /**< Per-grid stream with fire-and-forget synchronization behavior. Only applicable when used with CUDA Dynamic Parallelism. */
+#endif
+
+extern "C"
+{
+
+// Symbols beginning with __cudaCDP* should not be used outside
+// this header file. Instead, compile with -DCUDA_FORCE_CDP1_IF_SUPPORTED if
+// CDP1 support is required.
+
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaDeviceSynchronizeDeprecationAvoidance(void);
+
+// Bug 4398304
+// WAR for doxygen processing duplicate entries causing warnings to be listed in the documentation
+/** \cond impl_private */
+
+#ifndef __CUDA_INTERNAL_USE_CDP2
+//// CDP1 endpoints
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
+extern __DEPRECATED__("cudaDeviceGetSharedMemConfig deprecated") __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
+#if (__CUDA_ARCH__ < 900) && (defined(CUDA_FORCE_CDP1_IF_SUPPORTED) || (defined(_WIN32) && !defined(_WIN64)))
+// cudaDeviceSynchronize is removed on sm_90+
+extern __device__ __cudart_builtin__ __CDPRT_DEPRECATED(cudaDeviceSynchronize) cudaError_t CUDARTAPI cudaDeviceSynchronize(void);
+#endif
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void);
+extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
+extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags_ptsz(cudaEvent_t event, cudaStream_t stream, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
+#endif // __CUDA_INTERNAL_USE_CDP2
+
+//// CDP2 endpoints
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2DeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2DeviceGetLimit(size_t *pValue, enum cudaLimit limit);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2DeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2DeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2GetLastError(void);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2PeekAtLastError(void);
+extern __device__ __cudart_builtin__ const char* CUDARTAPI __cudaCDP2GetErrorString(cudaError_t error);
+extern __device__ __cudart_builtin__ const char* CUDARTAPI __cudaCDP2GetErrorName(cudaError_t error);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2GetDeviceCount(int *count);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2GetDevice(int *device);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2StreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2StreamDestroy(cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2StreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2StreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2EventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2EventRecord(cudaEvent_t event, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2EventRecord_ptsz(cudaEvent_t event, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2EventRecordWithFlags(cudaEvent_t event, cudaStream_t stream, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2EventRecordWithFlags_ptsz(cudaEvent_t event, cudaStream_t stream, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2EventDestroy(cudaEvent_t event);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2FuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Free(void *devPtr);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Malloc(void **devPtr, size_t size);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2MemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2MemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2MemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2MemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2RuntimeGetVersion(int *runtimeVersion);
+extern __device__ __cudart_builtin__ void * CUDARTAPI __cudaCDP2GetParameterBuffer(size_t alignment, size_t size);
+extern __device__ __cudart_builtin__ void * CUDARTAPI __cudaCDP2GetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2LaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2LaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2LaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2LaunchDeviceV2(void *parameterBuffer, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
+
+
+extern  __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t stream);
+#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) 
+static inline  __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGraphLaunch_ptsz(cudaGraphExec_t graphExec, cudaStream_t stream)
+{
+    /* Per-thread-default-stream variant of cudaGraphLaunch: a stream handle equal to 0
+     * is replaced by cudaStreamPerThread; any other handle is forwarded unchanged. */
+    cudaStream_t const launchStream = (stream == 0) ? cudaStreamPerThread : stream;
+    return cudaGraphLaunch(graphExec, launchStream);
+}
+#endif
+
+/** \endcond */
+
+/**
+  * \ingroup CUDART_GRAPH
+  * \brief Get the currently running device graph id.
+  *
+  * Get the currently running device graph id.
+  * \return Returns the current device graph id, 0 if the call is outside of a device graph.
+  * \sa cudaGraphLaunch
+  */
+static inline __device__ __cudart_builtin__ cudaGraphExec_t CUDARTAPI cudaGetCurrentGraphExec(void)
+{
+    unsigned long long current_graph_exec; /* raw 64-bit value, written only by the asm below */
+    asm ("mov.u64 %0, %%current_graph_exec;" : "=l"(current_graph_exec)); /* copy the PTX special register %current_graph_exec into the local ("=l" = 64-bit output operand) */
+    return (cudaGraphExec_t)current_graph_exec; /* hand the 64-bit value back as a graph-exec handle */
+}
+
+/**
+ * \ingroup CUDART_GRAPH
+ * \brief Updates the kernel parameters of the given kernel node
+ *
+ * Updates \p size bytes in the kernel parameters of \p node at \p offset to
+ * the contents of \p value. \p node must be device-updatable, and must reside upon the same
+ * device as the calling kernel.
+ *
+ * If this function is called for the node's immediate dependent and that dependent is configured
+ * for programmatic dependent launch, then a memory fence must be invoked via __threadfence() before
+ * kickoff of the dependent is triggered via ::cudaTriggerProgrammaticLaunchCompletion() to ensure
+ * that the update is visible to that dependent node before it is launched.
+ *
+ * \param node      - The node to update
+ * \param offset    - The offset into the params at which to make the update
+ * \param value     - Buffer containing the params to write
+ * \param size      - Size in bytes to update
+ *
+ * \return
+ * cudaSuccess,
+ * cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphKernelNodeSetEnabled,
+ * ::cudaGraphKernelNodeSetGridDim,
+ * ::cudaGraphKernelNodeUpdatesApply
+ */
+extern  __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetParam(cudaGraphDeviceNode_t node, size_t offset, const void *value , size_t size);
+
+/**
+ * \ingroup CUDART_GRAPH
+ * \brief Enables or disables the given kernel node
+ *
+ * Enables or disables \p node based upon \p enable. If \p enable is true, the node will be enabled;
+ * if it is false, the node will be disabled. Disabled nodes will act as a NOP during execution.
+ * \p node must be device-updatable, and must reside upon the same device as the calling kernel.
+ *
+ * If this function is called for the node's immediate dependent and that dependent is configured
+ * for programmatic dependent launch, then a memory fence must be invoked via __threadfence() before
+ * kickoff of the dependent is triggered via ::cudaTriggerProgrammaticLaunchCompletion() to ensure
+ * that the update is visible to that dependent node before it is launched.
+ *
+ * \param node      - The node to update
+ * \param enable    - Whether to enable or disable the node
+ *
+ * \return
+ * cudaSuccess,
+ * cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphKernelNodeSetParam,
+ * ::cudaGraphKernelNodeSetGridDim,
+ * ::cudaGraphKernelNodeUpdatesApply
+ */
+extern  __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetEnabled(cudaGraphDeviceNode_t node, bool enable);
+
+/**
+ * \ingroup CUDART_GRAPH
+ * \brief Updates the grid dimensions of the given kernel node
+ *
+ * Sets the grid dimensions of \p node to \p gridDim. \p node must be device-updatable,
+ * and must reside upon the same device as the calling kernel.
+ *
+ * If this function is called for the node's immediate dependent and that dependent is configured
+ * for programmatic dependent launch, then a memory fence must be invoked via __threadfence() before
+ * kickoff of the dependent is triggered via ::cudaTriggerProgrammaticLaunchCompletion() to ensure
+ * that the update is visible to that dependent node before it is launched.
+ *
+ * \param node      - The node to update
+ * \param gridDim   - The grid dimensions to set
+ *
+ * \return
+ * cudaSuccess,
+ * cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphKernelNodeSetParam,
+ * ::cudaGraphKernelNodeSetEnabled,
+ * ::cudaGraphKernelNodeUpdatesApply
+ */
+extern  __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetGridDim(cudaGraphDeviceNode_t node, dim3 gridDim);
+
+/**
+ * \ingroup CUDART_GRAPH
+ * \brief Batch applies multiple kernel node updates
+ *
+ * Batch applies one or more kernel node updates based on the information provided in \p updates.
+ * \p updateCount specifies the number of updates to apply. Each entry in \p updates must specify
+ * a node to update, the type of update to apply, and the parameters for that type of update. See
+ * the documentation for ::cudaGraphKernelNodeUpdate for more detail.
+ *
+ * If this function is called for the node's immediate dependent and that dependent is configured
+ * for programmatic dependent launch, then a memory fence must be invoked via __threadfence() before
+ * kickoff of the dependent is triggered via ::cudaTriggerProgrammaticLaunchCompletion() to ensure
+ * that the update is visible to that dependent node before it is launched.
+ *
+ * \param updates     - The updates to apply
+ * \param updateCount - The number of updates to apply
+ *
+ * \return
+ * cudaSuccess,
+ * cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphKernelNodeSetParam,
+ * ::cudaGraphKernelNodeSetEnabled,
+ * ::cudaGraphKernelNodeSetGridDim
+ */
+extern  __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGraphKernelNodeUpdatesApply(const cudaGraphKernelNodeUpdate *updates, size_t updateCount);
+
+/**
+  * \ingroup CUDART_EXECUTION
+  * \brief Programmatic dependency trigger
+  *
+  * This device function ensures the programmatic launch completion edges /
+  * events are fulfilled. See
+  * ::cudaLaunchAttributeID::cudaLaunchAttributeProgrammaticStreamSerialization
+  * and ::cudaLaunchAttributeID::cudaLaunchAttributeProgrammaticEvent for more
+  * information. The event / edge kick off only happens when every CTA
+  * in the grid has either exited or called this function at least once;
+  * otherwise the kick off happens automatically after all warps finish
+  * execution but before the grid completes. The kick off only enables
+  * scheduling of the secondary kernel. It provides no memory visibility
+  * guarantee itself. The user could enforce memory visibility by inserting a
+  * memory fence of the correct scope.
+  */
+static inline __device__ __cudart_builtin__ void CUDARTAPI cudaTriggerProgrammaticLaunchCompletion(void)
+{
+    asm volatile("griddepcontrol.launch_dependents;":::); /* single PTX instruction with no operands and no clobber list; volatile keeps the compiler from dropping it even though it has no outputs */
+}
+
+/**
+  * \ingroup CUDART_EXECUTION
+  * \brief Programmatic grid dependency synchronization
+  *
+  * This device function will block the thread until all direct grid
+  * dependencies have completed. This API is intended to be used in conjunction with
+  * programmatic / launch event / dependency. See
+  * ::cudaLaunchAttributeID::cudaLaunchAttributeProgrammaticStreamSerialization
+  * and ::cudaLaunchAttributeID::cudaLaunchAttributeProgrammaticEvent for more
+  * information.
+  */
+static inline __device__ __cudart_builtin__ void CUDARTAPI cudaGridDependencySynchronize(void)
+{
+    asm volatile("griddepcontrol.wait;":::"memory"); /* single PTX instruction with no operands; the "memory" clobber stops the compiler from moving memory accesses across the wait */
+}
+
+/**
+  * \ingroup CUDART_GRAPH
+  * \brief Sets the condition value associated with a conditional node.
+  *
+  * Sets the condition value associated with a conditional node.
+  *
+  * Note: \p handle must be associated with the same context as the kernel calling this function.
+  *
+  * \sa cudaGraphConditionalHandleCreate
+  */
+extern __device__ __cudart_builtin__ void CUDARTAPI cudaGraphSetConditional(cudaGraphConditionalHandle handle, unsigned int value);
+
+//// CG API
+extern __device__ __cudart_builtin__ unsigned long long CUDARTAPI cudaCGGetIntrinsicHandle(enum cudaCGScope scope);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGSynchronize(unsigned long long handle, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGSynchronizeGrid(unsigned long long handle, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetSize(unsigned int *numThreads, unsigned int *numGrids, unsigned long long handle);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetRank(unsigned int *threadRank, unsigned int *gridRank, unsigned long long handle);
+
+
+//// CDP API
+
+#ifdef __CUDA_ARCH__
+
+// Bug 4398304
+// WAR for doxygen processing duplicate entries causing warnings to be listed in the documentation
+/** \cond impl_private */
+
+#ifdef __CUDA_INTERNAL_USE_CDP2
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)
+{
+    return __cudaCDP2DeviceGetAttribute(value, attr, device);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit)
+{
+    return __cudaCDP2DeviceGetLimit(pValue, limit);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig)
+{
+    return __cudaCDP2DeviceGetCacheConfig(pCacheConfig);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig)
+{
+    return __cudaCDP2DeviceGetSharedMemConfig(pConfig);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void)
+{
+    return __cudaCDP2GetLastError();
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void)
+{
+    return __cudaCDP2PeekAtLastError();
+}
+
+static __inline__ __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error)
+{
+    return __cudaCDP2GetErrorString(error);
+}
+
+static __inline__ __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error)
+{
+    return __cudaCDP2GetErrorName(error);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count)
+{
+    return __cudaCDP2GetDeviceCount(count);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device)
+{
+    return __cudaCDP2GetDevice(device);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags)
+{
+    return __cudaCDP2StreamCreateWithFlags(pStream, flags);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream)
+{
+    return __cudaCDP2StreamDestroy(stream);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags)
+{
+    return __cudaCDP2StreamWaitEvent(stream, event, flags);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags)
+{
+    return __cudaCDP2StreamWaitEvent_ptsz(stream, event, flags); // Thin forwarder: returns the result of the __cudaCDP2-prefixed _ptsz counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags)
+{
+    return __cudaCDP2EventCreateWithFlags(event, flags); // Thin forwarder: returns the result of the __cudaCDP2-prefixed counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream)
+{
+    return __cudaCDP2EventRecord(event, stream); // Thin forwarder: returns the result of the __cudaCDP2-prefixed counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream)
+{
+    return __cudaCDP2EventRecord_ptsz(event, stream); // Thin forwarder: returns the result of the __cudaCDP2-prefixed _ptsz counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream, unsigned int flags)
+{
+    return __cudaCDP2EventRecordWithFlags(event, stream, flags); // Thin forwarder: returns the result of the __cudaCDP2-prefixed counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags_ptsz(cudaEvent_t event, cudaStream_t stream, unsigned int flags)
+{
+    return __cudaCDP2EventRecordWithFlags_ptsz(event, stream, flags); // Thin forwarder: returns the result of the __cudaCDP2-prefixed _ptsz counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event)
+{
+    return __cudaCDP2EventDestroy(event); // Thin forwarder: returns the result of the __cudaCDP2-prefixed counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func)
+{
+    return __cudaCDP2FuncGetAttributes(attr, func); // Thin forwarder: returns the result of the __cudaCDP2-prefixed counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr)
+{
+    return __cudaCDP2Free(devPtr); // Thin forwarder: returns the result of the __cudaCDP2-prefixed counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size)
+{
+    return __cudaCDP2Malloc(devPtr, size); // Thin forwarder: returns the result of the __cudaCDP2-prefixed counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream)
+{
+    return __cudaCDP2MemcpyAsync(dst, src, count, kind, stream); // Thin forwarder: returns the result of the __cudaCDP2-prefixed counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream)
+{
+    return __cudaCDP2MemcpyAsync_ptsz(dst, src, count, kind, stream); // Thin forwarder: returns the result of the __cudaCDP2-prefixed _ptsz counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream)
+{
+    return __cudaCDP2Memcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream); // Thin forwarder: returns the result of the __cudaCDP2-prefixed counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream)
+{
+    return __cudaCDP2Memcpy2DAsync_ptsz(dst, dpitch, src, spitch, width, height, kind, stream); // Thin forwarder: returns the result of the __cudaCDP2-prefixed _ptsz counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream)
+{
+    return __cudaCDP2Memcpy3DAsync(p, stream); // Thin forwarder: returns the result of the __cudaCDP2-prefixed counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream)
+{
+    return __cudaCDP2Memcpy3DAsync_ptsz(p, stream); // Thin forwarder: returns the result of the __cudaCDP2-prefixed _ptsz counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream)
+{
+    return __cudaCDP2MemsetAsync(devPtr, value, count, stream); // Thin forwarder: returns the result of the __cudaCDP2-prefixed counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream)
+{
+    return __cudaCDP2MemsetAsync_ptsz(devPtr, value, count, stream); // Thin forwarder: returns the result of the __cudaCDP2-prefixed _ptsz counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream)
+{
+    return __cudaCDP2Memset2DAsync(devPtr, pitch, value, width, height, stream); // Thin forwarder: returns the result of the __cudaCDP2-prefixed counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream)
+{
+    return __cudaCDP2Memset2DAsync_ptsz(devPtr, pitch, value, width, height, stream); // Thin forwarder: returns the result of the __cudaCDP2-prefixed _ptsz counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream)
+{
+    return __cudaCDP2Memset3DAsync(pitchedDevPtr, value, extent, stream); // Thin forwarder: returns the result of the __cudaCDP2-prefixed counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream)
+{
+    return __cudaCDP2Memset3DAsync_ptsz(pitchedDevPtr, value, extent, stream); // Thin forwarder: returns the result of the __cudaCDP2-prefixed _ptsz counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion)
+{
+    return __cudaCDP2RuntimeGetVersion(runtimeVersion); // Thin forwarder: returns the result of the __cudaCDP2-prefixed counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
+{
+    return __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize, dynamicSmemSize); // Thin forwarder: returns the result of the __cudaCDP2-prefixed counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
+{
+    return __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, func, blockSize, dynamicSmemSize, flags); // Thin forwarder: returns the result of the __cudaCDP2-prefixed counterpart unchanged.
+}
+#endif // __CUDA_INTERNAL_USE_CDP2
+
+/** \endcond */
+
+#endif // __CUDA_ARCH__
+
+
+/**
+ * \ingroup CUDART_EXECUTION
+ * \brief Obtains a parameter buffer
+ *
+ * Obtains a parameter buffer which can be filled with parameters for a kernel launch.
+ * Parameters passed to ::cudaLaunchDevice must be allocated via this function.
+ *
+ * This is a low-level API and can only be accessed from Parallel Thread Execution (PTX).
+ * CUDA user code should use <<< >>> to launch kernels.
+ *
+ * \param alignment - Specifies the alignment requirement of the parameter buffer
+ * \param size      - Specifies the size requirement in bytes
+ *
+ * \return
+ * Returns a pointer to the allocated parameter buffer
+ * \notefnerr
+ *
+ * \sa cudaLaunchDevice
+ */
+#ifdef __CUDA_INTERNAL_USE_CDP2
+static __inline__ __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBuffer(size_t alignment, size_t size)
+{
+    return __cudaCDP2GetParameterBuffer(alignment, size); // Thin forwarder: returns the pointer produced by the __cudaCDP2-prefixed counterpart.
+}
+#else // !__CUDA_INTERNAL_USE_CDP2: only declared here, no inline body
+extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBuffer(size_t alignment, size_t size);
+#endif // __CUDA_INTERNAL_USE_CDP2
+
+
+#ifdef __CUDA_INTERNAL_USE_CDP2
+static __inline__ __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize)
+{
+    return __cudaCDP2GetParameterBufferV2(func, gridDimension, blockDimension, sharedMemSize); // Thin forwarder: returns the pointer produced by the __cudaCDP2-prefixed counterpart.
+}
+#else // !__CUDA_INTERNAL_USE_CDP2: only declared here, no inline body
+extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize);
+#endif // __CUDA_INTERNAL_USE_CDP2
+
+
+#ifdef __CUDA_INTERNAL_USE_CDP2
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream)
+{
+    return __cudaCDP2LaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream); // Thin forwarder: returns the result of the __cudaCDP2-prefixed _ptsz counterpart unchanged.
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream)
+{
+    return __cudaCDP2LaunchDeviceV2_ptsz(parameterBuffer, stream); // Thin forwarder: returns the result of the __cudaCDP2-prefixed _ptsz counterpart unchanged.
+}
+#else // !__CUDA_INTERNAL_USE_CDP2: both _ptsz entry points are only declared here
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream);
+#endif // __CUDA_INTERNAL_USE_CDP2
+
+
+/**
+ * \ingroup CUDART_EXECUTION
+ * \brief Launches a specified kernel
+ *
+ * Launches a specified kernel with the specified parameter buffer. A parameter buffer can be obtained
+ * by calling ::cudaGetParameterBuffer().
+ *
+ * This is a low-level API and can only be accessed from Parallel Thread Execution (PTX).
+ * CUDA user code should use <<< >>> to launch the kernels.
+ *
+ * \param func            - Pointer to the kernel to be launched
+ * \param parameterBuffer - Holds the parameters to the launched kernel. parameterBuffer can be NULL. (Optional)
+ * \param gridDimension   - Specifies grid dimensions
+ * \param blockDimension  - Specifies block dimensions
+ * \param sharedMemSize   - Specifies size of shared memory
+ * \param stream          - Specifies the stream to be used
+ *
+ * \return
+ * ::cudaSuccess, ::cudaErrorInvalidDevice, ::cudaErrorLaunchMaxDepthExceeded, ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorStartupFailure, ::cudaErrorLaunchPendingCountExceeded, ::cudaErrorLaunchOutOfResources
+ * \notefnerr
+ * \n Please refer to Execution Configuration and Parameter Buffer Layout from the CUDA Programming
+ * Guide for the detailed descriptions of launch configuration and parameter layout respectively.
+ *
+ * \sa cudaGetParameterBuffer
+ */
+#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
+    // When compiling for the device with the per-thread default stream enabled, add
+    // static inline redirects to the per-thread stream (_ptsz) entry points.
+
+    static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
+    cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream)
+    {
+#ifdef __CUDA_INTERNAL_USE_CDP2
+        return __cudaCDP2LaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream); // CDP2 build: call the __cudaCDP2-prefixed _ptsz symbol directly
+#else // !__CUDA_INTERNAL_USE_CDP2
+        return cudaLaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream); // otherwise: call the plain _ptsz entry point
+#endif // __CUDA_INTERNAL_USE_CDP2
+    }
+
+    static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
+    cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream)
+    {
+#ifdef __CUDA_INTERNAL_USE_CDP2
+        return __cudaCDP2LaunchDeviceV2_ptsz(parameterBuffer, stream); // CDP2 build: call the __cudaCDP2-prefixed _ptsz symbol directly
+#else // !__CUDA_INTERNAL_USE_CDP2
+        return cudaLaunchDeviceV2_ptsz(parameterBuffer, stream); // otherwise: call the plain _ptsz entry point
+#endif // __CUDA_INTERNAL_USE_CDP2
+    }
+#else // defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
+#ifdef __CUDA_INTERNAL_USE_CDP2
+    static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream)
+    {
+        return __cudaCDP2LaunchDevice(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream); // Thin forwarder to the non-_ptsz __cudaCDP2-prefixed symbol
+    }
+
+    static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream)
+    {
+        return __cudaCDP2LaunchDeviceV2(parameterBuffer, stream); // Thin forwarder to the non-_ptsz __cudaCDP2-prefixed symbol
+    }
+#else // __CUDA_INTERNAL_USE_CDP2
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream);
+#endif // __CUDA_INTERNAL_USE_CDP2
+#endif // defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
+
+
+// These symbols should not be used outside of this header file: each is defined below as a macro with an empty replacement list, so any later occurrence of the identifier expands to nothing.
+#define __cudaCDP2DeviceGetAttribute
+#define __cudaCDP2DeviceGetLimit
+#define __cudaCDP2DeviceGetCacheConfig
+#define __cudaCDP2DeviceGetSharedMemConfig
+#define __cudaCDP2GetLastError
+#define __cudaCDP2PeekAtLastError
+#define __cudaCDP2GetErrorString
+#define __cudaCDP2GetErrorName
+#define __cudaCDP2GetDeviceCount
+#define __cudaCDP2GetDevice
+#define __cudaCDP2StreamCreateWithFlags
+#define __cudaCDP2StreamDestroy
+#define __cudaCDP2StreamWaitEvent
+#define __cudaCDP2StreamWaitEvent_ptsz
+#define __cudaCDP2EventCreateWithFlags
+#define __cudaCDP2EventRecord
+#define __cudaCDP2EventRecord_ptsz
+#define __cudaCDP2EventRecordWithFlags
+#define __cudaCDP2EventRecordWithFlags_ptsz
+#define __cudaCDP2EventDestroy
+#define __cudaCDP2FuncGetAttributes
+#define __cudaCDP2Free
+#define __cudaCDP2Malloc
+#define __cudaCDP2MemcpyAsync
+#define __cudaCDP2MemcpyAsync_ptsz
+#define __cudaCDP2Memcpy2DAsync
+#define __cudaCDP2Memcpy2DAsync_ptsz
+#define __cudaCDP2Memcpy3DAsync
+#define __cudaCDP2Memcpy3DAsync_ptsz
+#define __cudaCDP2MemsetAsync
+#define __cudaCDP2MemsetAsync_ptsz
+#define __cudaCDP2Memset2DAsync
+#define __cudaCDP2Memset2DAsync_ptsz
+#define __cudaCDP2Memset3DAsync
+#define __cudaCDP2Memset3DAsync_ptsz
+#define __cudaCDP2RuntimeGetVersion
+#define __cudaCDP2GetParameterBuffer
+#define __cudaCDP2GetParameterBufferV2
+#define __cudaCDP2LaunchDevice_ptsz
+#define __cudaCDP2LaunchDeviceV2_ptsz
+#define __cudaCDP2LaunchDevice
+#define __cudaCDP2LaunchDeviceV2
+#define __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessor
+#define __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+
+}
+
+// Bug 4398304
+// Workaround (WAR): doxygen processes these duplicate entries and lists the resulting warnings in the documentation, so hide them from it.
+/** \cond impl_private */
+
+template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaMalloc(T **devPtr, size_t size);
+template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, T *entry);
+template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize);
+template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
+
+/** \endcond */
+
+/**
+ * \ingroup CUDART_GRAPH
+ * \brief Updates the kernel parameters of the given kernel node
+ *
+ * Updates the kernel parameters of \p node at \p offset to \p value. \p node must be
+ * device-updatable, and must reside upon the same device as the calling kernel.
+ *
+ * If this function is called for the node's immediate dependent and that dependent is configured
+ * for programmatic dependent launch, then a memory fence must be invoked via __threadfence() before
+ * kickoff of the dependent is triggered via ::cudaTriggerProgrammaticLaunchCompletion() to ensure
+ * that the update is visible to that dependent node before it is launched.
+ *
+ * \param node      - The node to update
+ * \param offset    - The offset into the params at which to make the update
+ * \param value     - Parameter value to write
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphKernelNodeSetEnabled,
+ * ::cudaGraphKernelNodeSetGridDim,
+ * ::cudaGraphKernelNodeUpdatesApply
+ */
+template <typename T>
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetParam(cudaGraphDeviceNode_t node, size_t offset, const T &value)
+{
+    return cudaGraphKernelNodeSetParam(node, offset, &value, sizeof(T)); // Delegates to the four-argument overload, passing the address of value and sizeof(T).
+}
+
+#endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)
+#endif /* defined(__cplusplus) && defined(__CUDACC__) */
+
+#undef __DEPRECATED__
+#undef __CDPRT_DEPRECATED
+#undef __CUDA_INTERNAL_USE_CDP2
+
+#endif /* !__CUDA_DEVICE_RUNTIME_API_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_egl_interop.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_egl_interop.h
new file mode 100644
index 0000000000000000000000000000000000000000..a87d7dc81e34291f526b2b02baf3a47066c3730e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_egl_interop.h
@@ -0,0 +1,645 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_EGL_INTEROP_H__)
+#define __CUDA_EGL_INTEROP_H__
+
+#include "cuda_runtime_api.h"
+#include "cuda_runtime.h"
+#include "cudart_platform.h"
+#include "EGL/egl.h"
+#include "EGL/eglext.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * \addtogroup CUDART_TYPES
+ * @{
+ */
+
+/**
+ * Maximum number of planes per frame
+ */
+#define CUDA_EGL_MAX_PLANES 3
+
+/**
+ * CUDA EglFrame type - whether the frame is a CUDA array or a CUDA pointer
+ */
+typedef enum cudaEglFrameType_enum
+{
+    cudaEglFrameTypeArray = 0,  /**< Frame type is a CUDA array */
+    cudaEglFrameTypePitch = 1,  /**< Frame type is a CUDA pointer */
+} cudaEglFrameType;
+
+/**
+ * Resource location flags - sysmem or vidmem
+ *
+ * For a CUDA context on an iGPU, video and system memory are equivalent, so
+ * these flags will not have an effect on the execution.
+ *
+ * For a CUDA context on a dGPU, applications can use the flag ::cudaEglResourceLocationFlags
+ * to give a hint about the desired location.
+ *
+ * ::cudaEglResourceLocationSysmem - the frame data is made resident in system memory
+ * to be accessed by CUDA.
+ *
+ * ::cudaEglResourceLocationVidmem - the frame data is made resident in dedicated
+ * video memory to be accessed by CUDA.
+ *
+ * There may be additional latency due to a new allocation and data migration
+ * if the frame is produced in a different memory.
+ */
+typedef enum cudaEglResourceLocationFlags_enum {
+    cudaEglResourceLocationSysmem   = 0x00,       /**< Resource location sysmem */
+    cudaEglResourceLocationVidmem   = 0x01,       /**< Resource location vidmem */
+} cudaEglResourceLocationFlags;
+
+/**
+ * CUDA EGL Color Format - The different planar and multiplanar formats currently supported for CUDA_EGL interops.
+ */
+typedef enum cudaEglColorFormat_enum {
+    cudaEglColorFormatYUV420Planar            = 0,  /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYUV420SemiPlanar        = 1,  /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV420Planar. */
+    cudaEglColorFormatYUV422Planar            = 2,  /**< Y, U, V  each in a separate  surface, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV422SemiPlanar        = 3,  /**< Y, UV in two surfaces with VU byte ordering, width, height ratio same as YUV422Planar. */
+    cudaEglColorFormatARGB                    = 6,  /**< R/G/B/A four channels in one surface with BGRA byte ordering. */
+    cudaEglColorFormatRGBA                    = 7,  /**< R/G/B/A four channels in one surface with ABGR byte ordering. */
+    cudaEglColorFormatL                       = 8,  /**< single luminance channel in one surface. */
+    cudaEglColorFormatR                       = 9,  /**< single color channel in one surface. */
+    cudaEglColorFormatYUV444Planar            = 10, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV444SemiPlanar        = 11, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV444Planar. */
+    cudaEglColorFormatYUYV422                 = 12, /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
+    cudaEglColorFormatUYVY422                 = 13, /**< Y, U, V in one surface, interleaved as YUYV in one channel. */
+    cudaEglColorFormatABGR                    = 14, /**< R/G/B/A four channels in one surface with RGBA byte ordering. */
+    cudaEglColorFormatBGRA                    = 15, /**< R/G/B/A four channels in one surface with ARGB byte ordering. */
+    cudaEglColorFormatA                       = 16, /**< Alpha color format - one channel in one surface. */
+    cudaEglColorFormatRG                      = 17, /**< R/G color format - two channels in one surface with GR byte ordering */
+    cudaEglColorFormatAYUV                    = 18, /**< Y, U, V, A four channels in one surface, interleaved as VUYA. */
+    cudaEglColorFormatYVU444SemiPlanar        = 19, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU422SemiPlanar        = 20, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU420SemiPlanar        = 21, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_444SemiPlanar = 22, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatY10V10U10_420SemiPlanar = 23, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY12V12U12_444SemiPlanar = 24, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatY12V12U12_420SemiPlanar = 25, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatVYUY_ER                 = 26, /**< Extended Range Y, U, V in one surface, interleaved as YVYU in one channel. */
+    cudaEglColorFormatUYVY_ER                 = 27, /**< Extended Range Y, U, V in one surface, interleaved as YUYV in one channel. */
+    cudaEglColorFormatYUYV_ER                 = 28, /**< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. */
+    cudaEglColorFormatYVYU_ER                 = 29, /**< Extended Range Y, U, V in one surface, interleaved as VYUY in one channel. */
+    cudaEglColorFormatYUVA_ER                 = 31, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as AVUY. */
+    cudaEglColorFormatAYUV_ER                 = 32, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as VUYA. */
+    cudaEglColorFormatYUV444Planar_ER         = 33, /**< Extended Range Y, U, V in three surfaces, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV422Planar_ER         = 34, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV420Planar_ER         = 35, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYUV444SemiPlanar_ER     = 36, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV422SemiPlanar_ER     = 37, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV420SemiPlanar_ER     = 38, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU444Planar_ER         = 39, /**< Extended Range Y, V, U in three surfaces, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU422Planar_ER         = 40, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU420Planar_ER         = 41, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU444SemiPlanar_ER     = 42, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU422SemiPlanar_ER     = 43, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU420SemiPlanar_ER     = 44, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatBayerRGGB               = 45, /**< Bayer format - one channel in one surface with interleaved RGGB ordering. */
+    cudaEglColorFormatBayerBGGR               = 46, /**< Bayer format - one channel in one surface with interleaved BGGR ordering. */
+    cudaEglColorFormatBayerGRBG               = 47, /**< Bayer format - one channel in one surface with interleaved GRBG ordering. */
+    cudaEglColorFormatBayerGBRG               = 48, /**< Bayer format - one channel in one surface with interleaved GBRG ordering. */
+    cudaEglColorFormatBayer10RGGB             = 49, /**< Bayer10 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    cudaEglColorFormatBayer10BGGR             = 50, /**< Bayer10 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    cudaEglColorFormatBayer10GRBG             = 51, /**< Bayer10 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    cudaEglColorFormatBayer10GBRG             = 52, /**< Bayer10 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    cudaEglColorFormatBayer12RGGB             = 53, /**< Bayer12 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12BGGR             = 54, /**< Bayer12 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12GRBG             = 55, /**< Bayer12 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12GBRG             = 56, /**< Bayer12 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer14RGGB             = 57, /**< Bayer14 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    cudaEglColorFormatBayer14BGGR             = 58, /**< Bayer14 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    cudaEglColorFormatBayer14GRBG             = 59, /**< Bayer14 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    cudaEglColorFormatBayer14GBRG             = 60, /**< Bayer14 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    cudaEglColorFormatBayer20RGGB             = 61, /**< Bayer20 format - one channel in one surface with interleaved RGGB ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    cudaEglColorFormatBayer20BGGR             = 62, /**< Bayer20 format - one channel in one surface with interleaved BGGR ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    cudaEglColorFormatBayer20GRBG             = 63, /**< Bayer20 format - one channel in one surface with interleaved GRBG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    cudaEglColorFormatBayer20GBRG             = 64, /**< Bayer20 format - one channel in one surface with interleaved GBRG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    cudaEglColorFormatYVU444Planar            = 65, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU422Planar            = 66, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU420Planar            = 67, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatBayerIspRGGB            = 68, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved RGGB ordering and mapped to opaque integer datatype. */
+    cudaEglColorFormatBayerIspBGGR            = 69, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved BGGR ordering and mapped to opaque integer datatype. */
+    cudaEglColorFormatBayerIspGRBG            = 70, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GRBG ordering and mapped to opaque integer datatype. */
+    cudaEglColorFormatBayerIspGBRG            = 71, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GBRG ordering and mapped to opaque integer datatype. */
+    cudaEglColorFormatBayerBCCR               = 72, /**< Bayer format - one channel in one surface with interleaved BCCR ordering. */
+    cudaEglColorFormatBayerRCCB               = 73, /**< Bayer format - one channel in one surface with interleaved RCCB ordering. */
+    cudaEglColorFormatBayerCRBC               = 74, /**< Bayer format - one channel in one surface with interleaved CRBC ordering. */
+    cudaEglColorFormatBayerCBRC               = 75, /**< Bayer format - one channel in one surface with interleaved CBRC ordering. */
+    cudaEglColorFormatBayer10CCCC             = 76, /**< Bayer10 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    cudaEglColorFormatBayer12BCCR             = 77, /**< Bayer12 format - one channel in one surface with interleaved BCCR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12RCCB             = 78, /**< Bayer12 format - one channel in one surface with interleaved RCCB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12CRBC             = 79, /**< Bayer12 format - one channel in one surface with interleaved CRBC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12CBRC             = 80, /**< Bayer12 format - one channel in one surface with interleaved CBRC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12CCCC             = 81, /**< Bayer12 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatY                       = 82, /**< Color format for single Y plane. */
+    cudaEglColorFormatYUV420SemiPlanar_2020   = 83, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU420SemiPlanar_2020   = 84, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYUV420Planar_2020       = 85, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU420Planar_2020       = 86, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYUV420SemiPlanar_709    = 87, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU420SemiPlanar_709    = 88, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYUV420Planar_709        = 89, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU420Planar_709        = 90, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_420SemiPlanar_709  = 91, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_420SemiPlanar_2020 = 92, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_422SemiPlanar_2020 = 93, /**< Y10, V10U10  in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height =  Y height. */
+    cudaEglColorFormatY10V10U10_422SemiPlanar      = 94, /**< Y10, V10U10  in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height =  Y height. */
+    cudaEglColorFormatY10V10U10_422SemiPlanar_709  = 95, /**< Y10, V10U10  in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height =  Y height. */
+    cudaEglColorFormatY_ER                         = 96, /**< Extended Range Color format for single Y plane. */
+    cudaEglColorFormatY_709_ER                     = 97, /**< Extended Range Color format for single Y plane. */
+    cudaEglColorFormatY10_ER                       = 98, /**< Extended Range Color format for single Y10 plane. */
+    cudaEglColorFormatY10_709_ER                   = 99, /**< Extended Range Color format for single Y10 plane. */
+    cudaEglColorFormatY12_ER                       = 100, /**< Extended Range Color format for single Y12 plane. */
+    cudaEglColorFormatY12_709_ER                   = 101, /**< Extended Range Color format for single Y12 plane. */
+    cudaEglColorFormatYUVA                         = 102, /**< Y, U, V, A four channels in one surface, interleaved as AVUY. */
+    cudaEglColorFormatYVYU                         = 104, /**< Y, U, V in one surface, interleaved as YVYU in one channel. */
+    cudaEglColorFormatVYUY                         = 105, /**< Y, U, V in one surface, interleaved as VYUY in one channel. */
+    cudaEglColorFormatY10V10U10_420SemiPlanar_ER     = 106, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_420SemiPlanar_709_ER = 107, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_444SemiPlanar_ER     = 108, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */ 
+    cudaEglColorFormatY10V10U10_444SemiPlanar_709_ER = 109, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatY12V12U12_420SemiPlanar_ER     = 110, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY12V12U12_420SemiPlanar_709_ER = 111, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY12V12U12_444SemiPlanar_ER     = 112, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatY12V12U12_444SemiPlanar_709_ER = 113, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatUYVY709                        = 114, /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
+    cudaEglColorFormatUYVY709_ER                     = 115, /**< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. */
+    cudaEglColorFormatUYVY2020                       = 116,  /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
+} cudaEglColorFormat;
+
+/**
+ * CUDA EGL Plane Descriptor - structure defining each plane of a CUDA EGLFrame
+ */
+typedef struct cudaEglPlaneDesc_st {
+    unsigned int width;                         /**< Width of plane */
+    unsigned int height;                        /**< Height of plane */
+    unsigned int depth;                         /**< Depth of plane */
+    unsigned int pitch;                         /**< Pitch of plane */
+    unsigned int numChannels;                   /**< Number of channels for the plane */
+    struct cudaChannelFormatDesc channelDesc;   /**< Channel format descriptor (::cudaChannelFormatDesc) */
+    unsigned int reserved[4];                   /**< Reserved for future use */
+} cudaEglPlaneDesc;
+
+/**
+ * CUDA EGLFrame Descriptor - structure defining one frame of EGL.
+ *
+ * Each frame may contain one or more planes depending on whether the surface is Multiplanar or not.
+ * Each plane of EGLFrame is represented by ::cudaEglPlaneDesc which is defined as:
+ * \code
+ * typedef struct cudaEglPlaneDesc_st {
+ *     unsigned int width;
+ *     unsigned int height;
+ *     unsigned int depth;
+ *     unsigned int pitch;
+ *     unsigned int numChannels;
+ *     struct cudaChannelFormatDesc channelDesc;
+ *     unsigned int reserved[4];
+ * } cudaEglPlaneDesc;
+ * \endcode
+
+*/
+typedef struct cudaEglFrame_st {
+   union {
+       cudaArray_t            pArray[CUDA_EGL_MAX_PLANES];     /**< Array of CUDA arrays corresponding to each plane */
+       struct cudaPitchedPtr  pPitch[CUDA_EGL_MAX_PLANES];     /**< Array of pitched pointers corresponding to each plane */
+   } frame;
+   cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];     /**< CUDA EGL Plane Descriptor ::cudaEglPlaneDesc */
+   unsigned int planeCount;                             /**< Number of planes */
+   cudaEglFrameType frameType;                          /**< Frame type: array or pitch */
+   cudaEglColorFormat eglColorFormat;                   /**< CUDA EGL Color Format */
+} cudaEglFrame;
+
+/**
+ * CUDA EGLStream Connection
+ */
+typedef struct  CUeglStreamConnection_st *cudaEglStreamConnection;
+
+/** @} */ /* END CUDART_TYPES */
+
+/**
+ * \addtogroup CUDART_EGL EGL Interoperability
+ * This section describes the EGL interoperability functions of the CUDA
+ * runtime application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Registers an EGL image
+ *
+ * Registers the EGLImageKHR specified by \p image for access by
+ * CUDA. A handle to the registered object is returned as \p pCudaResource.
+ * Additional Mapping/Unmapping is not required for the registered resource and
+ * ::cudaGraphicsResourceGetMappedEglFrame can be directly called on the \p pCudaResource.
+ *
+ * The application will be responsible for synchronizing access to shared objects.
+ * The application must ensure that any pending operations which access the objects have completed
+ * before passing control to CUDA. This may be accomplished by issuing and waiting for
+ * glFinish command on all GLcontexts (for OpenGL and likewise for other APIs).
+ * The application will also be responsible for ensuring that any pending operation on the
+ * registered CUDA resource has completed prior to executing subsequent commands in other APIs
+ * accessing the same memory objects.
+ * This can be accomplished by calling cuCtxSynchronize or cuEventSynchronize (preferably).
+ *
+ * The surface's intended usage is specified using \p flags, as follows:
+ *
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * The EGLImageKHR is an object which can be used to create EGLImage target resource. It is defined as a void pointer.
+ * typedef void* EGLImageKHR
+ *
+ * \param pCudaResource   - Pointer to the returned object handle
+ * \param image           - An EGLImageKHR image which can be used to create target resource.
+ * \param flags           - Map flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsResourceGetMappedEglFrame,
+ * ::cuGraphicsEGLRegisterImage
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsEGLRegisterImage(struct cudaGraphicsResource **pCudaResource, EGLImageKHR image, unsigned int flags);
+
+/**
+ * \brief Connect CUDA to EGLStream as a consumer.
+ *
+ * Connect CUDA as a consumer to EGLStreamKHR specified by \p eglStream.
+ *
+ * The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
+ * API to another.
+ *
+ * \param conn              - Pointer to the returned connection handle
+ * \param eglStream         - EGLStreamKHR handle
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamConsumerDisconnect,
+ * ::cudaEGLStreamConsumerAcquireFrame,
+ * ::cudaEGLStreamConsumerReleaseFrame,
+ * ::cuEGLStreamConsumerConnect
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerConnect(cudaEglStreamConnection *conn, EGLStreamKHR eglStream);
+
+/**
+ * \brief Connect CUDA to EGLStream as a consumer with given flags.
+ *
+ * Connect CUDA as a consumer to EGLStreamKHR specified by \p eglStream with specified \p flags defined by
+ * ::cudaEglResourceLocationFlags.
+ *
+ * The flags specify whether the consumer wants to access frames from system memory or video memory.
+ * Default is ::cudaEglResourceLocationVidmem.
+ *
+ * \param conn              - Pointer to the returned connection handle
+ * \param eglStream         - EGLStreamKHR handle
+ * \param flags             - Flags denote intended location - system or video.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamConsumerDisconnect,
+ * ::cudaEGLStreamConsumerAcquireFrame,
+ * ::cudaEGLStreamConsumerReleaseFrame,
+ * ::cuEGLStreamConsumerConnectWithFlags
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerConnectWithFlags(cudaEglStreamConnection *conn, EGLStreamKHR eglStream, unsigned int flags);
+
+/**
+ * \brief Disconnect CUDA as a consumer from EGLStream.
+ *
+ * Disconnect CUDA as a consumer from EGLStreamKHR.
+ *
+ * \param conn            - Connection to disconnect.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamConsumerConnect,
+ * ::cudaEGLStreamConsumerAcquireFrame,
+ * ::cudaEGLStreamConsumerReleaseFrame,
+ * ::cuEGLStreamConsumerDisconnect
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerDisconnect(cudaEglStreamConnection *conn);
+
+/**
+ * \brief Acquire an image frame from the EGLStream with CUDA as a consumer.
+ *
+ * Acquire an image frame from EGLStreamKHR.
+ * ::cudaGraphicsResourceGetMappedEglFrame can be called on \p pCudaResource to get
+ * ::cudaEglFrame.
+ *
+ * \param conn            - Connection on which to acquire
+ * \param pCudaResource   - CUDA resource on which the EGLStream frame will be mapped for use.
+ * \param pStream         - CUDA stream for synchronization and any data migrations
+ * implied by ::cudaEglResourceLocationFlags.
+ * \param timeout         - Desired timeout in usec.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * ::cudaErrorLaunchTimeout
+ *
+ * \sa
+ * ::cudaEGLStreamConsumerConnect,
+ * ::cudaEGLStreamConsumerDisconnect,
+ * ::cudaEGLStreamConsumerReleaseFrame,
+ * ::cuEGLStreamConsumerAcquireFrame
+ */
+
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerAcquireFrame(cudaEglStreamConnection *conn,
+        cudaGraphicsResource_t *pCudaResource, cudaStream_t *pStream, unsigned int timeout);
+/**
+ * \brief Releases the last frame acquired from the EGLStream.
+ *
+ * Release the acquired image frame specified by \p pCudaResource to EGLStreamKHR.
+ *
+ * \param conn            - Connection on which to release
+ * \param pCudaResource   - CUDA resource whose corresponding frame is to be released
+ * \param pStream         - CUDA stream on which release will be done.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamConsumerConnect,
+ * ::cudaEGLStreamConsumerDisconnect,
+ * ::cudaEGLStreamConsumerAcquireFrame,
+ * ::cuEGLStreamConsumerReleaseFrame
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerReleaseFrame(cudaEglStreamConnection *conn,
+                                                  cudaGraphicsResource_t pCudaResource, cudaStream_t *pStream);
+
+/**
+ * \brief Connect CUDA to EGLStream as a producer.
+ *
+ * Connect CUDA as a producer to EGLStreamKHR specified by \p eglStream.
+ *
+ * The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
+ * API to another.
+ *
+ * \param conn   - Pointer to the returned connection handle
+ * \param eglStream - EGLStreamKHR handle
+ * \param width  - width of the image to be submitted to the stream
+ * \param height - height of the image to be submitted to the stream
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamProducerDisconnect,
+ * ::cudaEGLStreamProducerPresentFrame,
+ * ::cudaEGLStreamProducerReturnFrame,
+ * ::cuEGLStreamProducerConnect
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerConnect(cudaEglStreamConnection *conn,
+                                                EGLStreamKHR eglStream, EGLint width, EGLint height);
+
+/**
+ * \brief Disconnect CUDA as a producer from EGLStream.
+ *
+ * Disconnect CUDA as a producer from EGLStreamKHR.
+ *
+ * \param conn            - Connection to disconnect.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamProducerConnect,
+ * ::cudaEGLStreamProducerPresentFrame,
+ * ::cudaEGLStreamProducerReturnFrame,
+ * ::cuEGLStreamProducerDisconnect
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerDisconnect(cudaEglStreamConnection *conn);
+
+/**
+ * \brief Present a CUDA eglFrame to the EGLStream with CUDA as a producer.
+ *
+ * The ::cudaEglFrame is defined as:
+ * \code
+ * typedef struct cudaEglFrame_st {
+ *     union {
+ *         cudaArray_t            pArray[CUDA_EGL_MAX_PLANES];
+ *         struct cudaPitchedPtr  pPitch[CUDA_EGL_MAX_PLANES];
+ *     } frame;
+ *     cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];
+ *     unsigned int planeCount;
+ *     cudaEglFrameType frameType;
+ *     cudaEglColorFormat eglColorFormat;
+ * } cudaEglFrame;
+ * \endcode
+ *
+ * For ::cudaEglFrame of type ::cudaEglFrameTypePitch, the application may present a sub-region of a memory
+ * allocation. In that case, ::cudaPitchedPtr::ptr will specify the start address of the sub-region in
+ * the allocation and ::cudaEglPlaneDesc will specify the dimensions of the sub-region.
+ *
+ * \param conn            - Connection on which to present the CUDA array
+ * \param eglframe        - CUDA EGLStream Producer Frame handle to be sent to the consumer over EGLStream.
+ * \param pStream         - CUDA stream on which to present the frame.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamProducerConnect,
+ * ::cudaEGLStreamProducerDisconnect,
+ * ::cudaEGLStreamProducerReturnFrame,
+ * ::cuEGLStreamProducerPresentFrame
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerPresentFrame(cudaEglStreamConnection *conn,
+                                                 cudaEglFrame eglframe, cudaStream_t *pStream);
+
+/**
+ * \brief Return the CUDA eglFrame to the EGLStream last released by the consumer.
+ *
+ * This API can potentially return cudaErrorLaunchTimeout if the consumer has not
+ * returned a frame to EGL stream. If timeout is returned the application can retry.
+ *
+ * \param conn            - Connection on which to return the frame
+ * \param eglframe        - CUDA EGLStream Producer Frame handle returned from the consumer over EGLStream.
+ * \param pStream         - CUDA stream on which to return the frame.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamProducerConnect,
+ * ::cudaEGLStreamProducerDisconnect,
+ * ::cudaEGLStreamProducerPresentFrame,
+ * ::cuEGLStreamProducerReturnFrame
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerReturnFrame(cudaEglStreamConnection *conn,
+                                                cudaEglFrame *eglframe, cudaStream_t *pStream);
+
+/**
+ * \brief Get an eglFrame through which to access a registered EGL graphics resource.
+ *
+ * Returns in \p *eglFrame an eglFrame pointer through which the registered graphics resource
+ * \p resource may be accessed.
+ * This API can only be called for EGL graphics resources.
+ *
+ * The ::cudaEglFrame is defined as
+ * \code
+ * typedef struct cudaEglFrame_st {
+ *     union {
+ *         cudaArray_t             pArray[CUDA_EGL_MAX_PLANES];
+ *         struct cudaPitchedPtr   pPitch[CUDA_EGL_MAX_PLANES];
+ *     } frame;
+ *     cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];
+ *     unsigned int planeCount;
+ *     cudaEglFrameType frameType;
+ *     cudaEglColorFormat eglColorFormat;
+ * } cudaEglFrame;
+ * \endcode
+ *
+ *
+ * \param eglFrame   - Returned eglFrame.
+ * \param resource   - Registered resource to access.
+ * \param index      - Index for cubemap surfaces.
+ * \param mipLevel   - Mipmap level for the subresource to access.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \note Note that in case of multiplanar \p *eglFrame, pitch of only first plane (unsigned int cudaEglPlaneDesc::pitch) is to be considered by the application.
+ *
+ * \sa
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuGraphicsResourceGetMappedEglFrame
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedEglFrame(cudaEglFrame* eglFrame,
+                                        cudaGraphicsResource_t resource, unsigned int index, unsigned int mipLevel);
+
+/**
+ * \brief Creates an event from EGLSync object
+ *
+ * Creates an event \p *phEvent from an EGLSyncKHR \p eglSync with the flags specified
+ * via \p flags. Valid flags include:
+ * - ::cudaEventDefault: Default event creation flag.
+ * - ::cudaEventBlockingSync: Specifies that the created event should use blocking
+ * synchronization.  A CPU thread that uses ::cudaEventSynchronize() to wait on
+ * an event created with this flag will block until the event has actually
+ * been completed.
+ *
+ * ::cudaEventRecord and TimingData are not supported for events created from EGLSync.
+ *
+ * The EGLSyncKHR is an opaque handle to an EGL sync object.
+ * typedef void* EGLSyncKHR
+ *
+ * \param phEvent - Returns newly created event
+ * \param eglSync - Opaque handle to EGLSync object
+ * \param flags   - Event creation flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorMemoryAllocation
+ *
+ * \sa
+ * ::cudaEventQuery,
+ * ::cudaEventSynchronize,
+ * ::cudaEventDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEventCreateFromEGLSync(cudaEvent_t *phEvent, EGLSyncKHR eglSync, unsigned int flags);
+
+/** @} */ /* END CUDART_EGL */
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#endif /* __CUDA_EGL_INTEROP_H__ */
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp16.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp16.h
new file mode 100644
index 0000000000000000000000000000000000000000..ecfa4584a4cb31d3a75f418cf2e60034b51806b3
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp16.h
@@ -0,0 +1,5363 @@
+/*
+* Copyright 1993-2024 NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO LICENSEE:
+*
+* This source code and/or documentation ("Licensed Deliverables") are
+* subject to NVIDIA intellectual property rights under U.S. and
+* international Copyright laws.
+*
+* These Licensed Deliverables contained herein is PROPRIETARY and
+* CONFIDENTIAL to NVIDIA and is being provided under the terms and
+* conditions of a form of NVIDIA software license agreement by and
+* between NVIDIA and Licensee ("License Agreement") or electronically
+* accepted by Licensee.  Notwithstanding any terms or conditions to
+* the contrary in the License Agreement, reproduction or disclosure
+* of the Licensed Deliverables to any third party without the express
+* written consent of NVIDIA is prohibited.
+*
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+* OF THESE LICENSED DELIVERABLES.
+*
+* U.S. Government End Users.  These Licensed Deliverables are a
+* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+* 1995), consisting of "commercial computer software" and "commercial
+* computer software documentation" as such terms are used in 48
+* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+* only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+* U.S. Government End Users acquire the Licensed Deliverables with
+* only those rights set forth herein.
+*
+* Any use of the Licensed Deliverables in individual and commercial
+* software must include, in the user documentation and internal
+* comments to the code, the above Disclaimer and U.S. Government End
+* Users Notice.
+*/
+
+/**
+* \defgroup CUDA_MATH_INTRINSIC_HALF Half Precision Intrinsics
+* This section describes half precision intrinsic functions.
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+* All of the functions defined here are available in device code.
+* Some of the functions are also available to host compilers, please
+* refer to respective functions' documentation for details.
+*
+* NOTE: Aggressive floating-point optimizations performed by host or device
+* compilers may affect numeric behavior of the functions implemented in this
+* header.
+*
+* The following macros are available to help users selectively enable/disable
+* various definitions present in the header file:
+* - \p CUDA_NO_HALF - If defined, this macro will prevent the definition of
+* additional type aliases in the global namespace, helping to avoid potential
+* conflicts with symbols defined in the user program.
+* - \p __CUDA_NO_HALF_CONVERSIONS__ - If defined, this macro will prevent the
+* use of the C++ type conversions (converting constructors and conversion
+* operators) that are common for built-in floating-point types, but may be
+* undesirable for \p half which is essentially a user-defined type.
+* - \p __CUDA_NO_HALF_OPERATORS__ and \p __CUDA_NO_HALF2_OPERATORS__ - If
+* defined, these macros will prevent the inadvertent use of usual arithmetic
+* and comparison operators. This enforces the storage-only type semantics and
+* prevents C++ style computations on \p half and \p half2 types.
+*/
+
+/**
+* \defgroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS Half Arithmetic Constants
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these constants, include the header file \p cuda_fp16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__HALF_ARITHMETIC Half Arithmetic Functions
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__HALF2_ARITHMETIC Half2 Arithmetic Functions
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__HALF_COMPARISON Half Comparison Functions
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__HALF2_COMPARISON Half2 Comparison Functions
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__HALF_MISC Half Precision Conversion and Data Movement
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__HALF_FUNCTIONS Half Math Functions
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__HALF2_FUNCTIONS Half2 Math Functions
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+*/
+
+#ifndef __CUDA_FP16_H__
+#define __CUDA_FP16_H__
+
+// implicitly provided by NVRTC
+#if !defined(__CUDACC_RTC__)
+/* bring in float2, double4, etc vector types */
+#include "vector_types.h"
+/* bring in operations on vector types like: make_float2 */
+#include "vector_functions.h"
+#endif  /* !defined(__CUDACC_RTC__) */
+
+#define ___CUDA_FP16_STRINGIFY_INNERMOST(x) #x
+#define __CUDA_FP16_STRINGIFY(x) ___CUDA_FP16_STRINGIFY_INNERMOST(x)
+
+#if defined(__cplusplus)
+
+/* Set up function decorations */
+#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))
+#define __CUDA_FP16_DECL__ __device__
+#define __CUDA_HOSTDEVICE_FP16_DECL__ __device__
+#define __CUDA_HOSTDEVICE__ __device__
+#elif defined(__CUDACC__) || defined(_NVHPC_CUDA)
+#define __CUDA_FP16_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__
+#define __CUDA_HOSTDEVICE__ __host__ __device__
+#else /* !defined(__CUDACC__) */
+#if defined(__GNUC__)
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static __attribute__ ((unused))
+#else
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static
+#endif /* defined(__GNUC__) */
+#define __CUDA_HOSTDEVICE__
+#endif /* (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) */
+
+#define __CUDA_FP16_TYPES_EXIST__
+
+/* Macros to allow half & half2 to be used by inline assembly */
+#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
+#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
+#define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
+#define __HALF2_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var)))
+
+/* Forward-declaration of structures defined in "cuda_fp16.hpp" */
+struct __half;
+struct __half2;
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts double number to half precision in round-to-nearest-even mode
+* and returns \p half with converted value.
+*
+* \details Converts double number \p a to half precision in round-to-nearest-even mode.
+* \param[in] a - double. Is only being read.
+* \returns half
+* - \p a converted to half precision using round-to-nearest-even mode.
+* - __double2half \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __double2half \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __double2half(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts float number to half precision in round-to-nearest-even mode
+* and returns \p half with converted value. 
+* 
+* \details Converts float number \p a to half precision in round-to-nearest-even mode. 
+* \param[in] a - float. Is only being read. 
+* \returns half
+* - \p a converted to half precision using round-to-nearest-even mode.
+* 
+* \see __float2half_rn(float) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts float number to half precision in round-to-nearest-even mode
+* and returns \p half with converted value.
+*
+* \details Converts float number \p a to half precision in round-to-nearest-even mode.
+* \param[in] a - float. Is only being read. 
+* \returns half
+* - \p a converted to half precision using round-to-nearest-even mode.
+* - __float2half_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __float2half_rn \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __float2half_rn(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts float number to half precision in round-towards-zero mode
+* and returns \p half with converted value.
+* 
+* \details Converts float number \p a to half precision in round-towards-zero mode.
+* \param[in] a - float. Is only being read. 
+* \returns half
+* - \p a converted to half precision using round-towards-zero mode.
+* - __float2half_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __float2half_rz \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __float2half_rz(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts float number to half precision in round-down mode
+* and returns \p half with converted value.
+* 
+* \details Converts float number \p a to half precision in round-down mode.
+* \param[in] a - float. Is only being read. 
+* 
+* \returns half
+* - \p a converted to half precision using round-down mode.
+* - __float2half_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __float2half_rd \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __float2half_rd(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts float number to half precision in round-up mode
+* and returns \p half with converted value.
+* 
+* \details Converts float number \p a to half precision in round-up mode.
+* \param[in] a - float. Is only being read. 
+* 
+* \returns half
+* - \p a converted to half precision using round-up mode.
+* - __float2half_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __float2half_ru \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __float2half_ru(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts \p half number to float.
+* 
+* \details Converts half number \p a to float.
+* \param[in] a - half. Is only being read.
+* 
+* \returns float
+* - \p a converted to float. 
+* - __half2float \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __half2float \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __half2float(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts input to half precision in round-to-nearest-even mode and
+* populates both halves of \p half2 with converted value.
+*
+* \details Converts input \p a to half precision in round-to-nearest-even mode and
+* populates both halves of \p half2 with converted value.
+* \param[in] a - float. Is only being read. 
+*
+* \returns half2
+* - The \p half2 value with both halves equal to the converted half
+* precision number.
+* 
+* \see __float2half_rn(float) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts both input floats to half precision in round-to-nearest-even
+* mode and returns \p half2 with converted values.
+*
+* \details Converts both input floats to half precision in round-to-nearest-even mode
+* and combines the results into one \p half2 number. Low 16 bits of the return
+* value correspond to the input \p a, high 16 bits correspond to the input \p
+* b.
+* \param[in] a - float. Is only being read. 
+* \param[in] b - float. Is only being read. 
+* 
+* \returns half2
+* - The \p half2 value with corresponding halves equal to the
+* converted input floats.
+* 
+* \see __float2half_rn(float) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts low 16 bits of \p half2 to float and returns the result.
+* 
+* \details Converts low 16 bits of \p half2 input \p a to 32-bit floating-point number
+* and returns the result.
+* \param[in] a - half2. Is only being read. 
+* 
+* \returns float
+* - The low 16 bits of \p a converted to float.
+* 
+* \see __half2float(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts high 16 bits of \p half2 to float and returns the result.
+* 
+* \details Converts high 16 bits of \p half2 input \p a to 32-bit floating-point number
+* and returns the result.
+* \param[in] a - half2. Is only being read. 
+* 
+* \returns float
+* - The high 16 bits of \p a converted to float.
+* 
+* \see __half2float(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed char in round-towards-zero mode.
+*
+* \details Convert the half-precision floating-point value \p h to a signed char
+* integer in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns signed char
+* - \p h converted to a signed char using round-towards-zero mode.
+* - __half2char_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2char_rz \cuda_math_formula (x), x > 127\end_cuda_math_formula returns SCHAR_MAX = \p 0x7F.
+* - __half2char_rz \cuda_math_formula (x), x < -128\end_cuda_math_formula returns SCHAR_MIN = \p 0x80.
+* - __half2char_rz(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ signed char __half2char_rz(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned char in round-towards-zero
+* mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned
+* char in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned char
+* - \p h converted to an unsigned char using round-towards-zero mode.
+* - __half2uchar_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2uchar_rz \cuda_math_formula (x), x > 255\end_cuda_math_formula returns UCHAR_MAX = \p 0xFF.
+* - __half2uchar_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2uchar_rz(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned char __half2uchar_rz(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed short integer in round-towards-zero mode.
+*
+* \details Convert the half-precision floating-point value \p h to a signed short
+* integer in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns short int
+* - \p h converted to a signed short integer using round-towards-zero mode.
+* - __half2short_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2short_rz \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF.
+* - __half2short_rz \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000.
+* - __half2short_rz(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-towards-zero
+* mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned short
+* integer in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer using round-towards-zero mode.
+* - __half2ushort_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ushort_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns USHRT_MAX = \p 0xFFFF.
+* - __half2ushort_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2ushort_rz(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed integer in round-towards-zero mode.
+*
+* \details Convert the half-precision floating-point value \p h to a signed integer in
+* round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns int
+* - \p h converted to a signed integer using round-towards-zero mode.
+* - __half2int_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2int_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF.
+* - __half2int_rz \cuda_math_formula (-\infty)\end_cuda_math_formula returns INT_MIN = \p 0x80000000.
+* - __half2int_rz(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned integer in round-towards-zero mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned integer
+* in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned int
+* - \p h converted to an unsigned integer using round-towards-zero mode.
+* - __half2uint_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2uint_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns UINT_MAX = \p 0xFFFFFFFF.
+* - __half2uint_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2uint_rz(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed 64-bit integer in round-towards-zero mode.
+*
+* \details Convert the half-precision floating-point value \p h to a signed 64-bit
+* integer in round-towards-zero mode. NaN inputs return a long long int with hex value of \p 0x8000000000000000.
+* \param[in] h - half. Is only being read.
+*
+* \returns long long int
+* - \p h converted to a signed 64-bit integer using round-towards-zero mode.
+* - __half2ll_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ll_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns LLONG_MAX = \p 0x7FFFFFFFFFFFFFFF.
+* - __half2ll_rz \cuda_math_formula (-\infty)\end_cuda_math_formula returns LLONG_MIN = \p 0x8000000000000000.
+* - __half2ll_rz(NaN) returns \p 0x8000000000000000.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned 64-bit integer in round-towards-zero
+* mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit
+* integer in round-towards-zero mode. NaN inputs return \p 0x8000000000000000.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer using round-towards-zero mode.
+* - __half2ull_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ull_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns ULLONG_MAX = \p 0xFFFFFFFFFFFFFFFF.
+* - __half2ull_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2ull_rz(NaN) returns \p 0x8000000000000000.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Vector function, combines two \p __half numbers into one \p __half2 number.
+* 
+* \details Combines two input \p __half numbers \p x and \p y into one \p __half2 number.
+* Input \p x is stored in low 16 bits of the return value, input \p y is stored
+* in high 16 bits of the return value.
+* \param[in] x - half. Is only being read. 
+* \param[in] y - half. Is only being read. 
+* 
+* \returns __half2
+* - The \p __half2 vector with low 16 bits equal to \p x and high 16 bits equal to \p y.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 make_half2(const __half x, const __half y);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts both components of \p float2 number to half precision in
+* round-to-nearest-even mode and returns \p half2 with converted values.
+* 
+* \details Converts both components of \p float2 to half precision in round-to-nearest-even
+* mode and combines the results into one \p half2 number. Low 16 bits of the
+* return value correspond to \p a.x and high 16 bits of the return value
+* correspond to \p a.y.
+* \param[in] a - float2. Is only being read. 
+*  
+* \returns half2
+* - The \p half2 which has corresponding halves equal to the
+* converted \p float2 components.
+* 
+* \see __float2half_rn(float) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts both halves of \p half2 to \p float2 and returns the result.
+* 
+* \details Converts both halves of \p half2 input \p a to \p float2 and returns the
+* result.
+* \param[in] a - half2. Is only being read. 
+* 
+* \returns float2
+* - \p a converted to \p float2.
+* 
+* \see __half2float(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed integer in round-to-nearest-even mode.
+* 
+* \details Convert the half-precision floating-point value \p h to a signed integer in
+* round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns int
+* - \p h converted to a signed integer using round-to-nearest-even mode.
+* - __half2int_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2int_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF.
+* - __half2int_rn \cuda_math_formula (-\infty)\end_cuda_math_formula returns INT_MIN = \p 0x80000000.
+* - __half2int_rn(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ int __half2int_rn(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed integer in round-down mode.
+* 
+* \details Convert the half-precision floating-point value \p h to a signed integer in
+* round-down mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns int
+* - \p h converted to a signed integer using round-down mode.
+* - __half2int_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2int_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF.
+* - __half2int_rd \cuda_math_formula (-\infty)\end_cuda_math_formula returns INT_MIN = \p 0x80000000.
+* - __half2int_rd(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ int __half2int_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed integer in round-up mode.
+* 
+* \details Convert the half-precision floating-point value \p h to a signed integer in
+* round-up mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns int
+* - \p h converted to a signed integer using round-up mode.
+* - __half2int_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2int_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF.
+* - __half2int_ru \cuda_math_formula (-\infty)\end_cuda_math_formula returns INT_MIN = \p 0x80000000.
+* - __half2int_ru(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ int __half2int_ru(const __half h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed integer to a half in round-to-nearest-even mode.
+* 
+* \details Convert the signed integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed integer to a half in round-towards-zero mode.
+* 
+* \details Convert the signed integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rz(const int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed integer to a half in round-down mode.
+* 
+* \details Convert the signed integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rd(const int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed integer to a half in round-up mode.
+* 
+* \details Convert the signed integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_ru(const int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed short integer in round-to-nearest-even
+* mode.
+* 
+* \details Convert the half-precision floating-point value \p h to a signed short
+* integer in round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns short int
+* - \p h converted to a signed short integer using round-to-nearest-even mode.
+* - __half2short_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2short_rn \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF.
+* - __half2short_rn \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000.
+* - __half2short_rn(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ short int __half2short_rn(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed short integer in round-down mode.
+* 
+* \details Convert the half-precision floating-point value \p h to a signed short
+* integer in round-down mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns short int
+* - \p h converted to a signed short integer using round-down mode.
+* - __half2short_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2short_rd \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF.
+* - __half2short_rd \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000.
+* - __half2short_rd(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ short int __half2short_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed short integer in round-up mode.
+* 
+* \details Convert the half-precision floating-point value \p h to a signed short
+* integer in round-up mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns short int
+* - \p h converted to a signed short integer using round-up mode.
+* - __half2short_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2short_ru \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF.
+* - __half2short_ru \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000.
+* - __half2short_ru(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ short int __half2short_ru(const __half h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed short integer to a half in round-to-nearest-even
+* mode.
+* 
+* \details Convert the signed short integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - short int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed short integer to a half in round-towards-zero mode.
+* 
+* \details Convert the signed short integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - short int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rz(const short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed short integer to a half in round-down mode.
+* 
+* \details Convert the signed short integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - short int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rd(const short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed short integer to a half in round-up mode.
+* 
+* \details Convert the signed short integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - short int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_ru(const short int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned integer in round-to-nearest-even mode.
+* 
+* \details Convert the half-precision floating-point value \p h to an unsigned integer
+* in round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns unsigned int
+* - \p h converted to an unsigned integer using round-to-nearest-even mode.
+* - __half2uint_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2uint_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns UINT_MAX = \p 0xFFFFFFFF.
+* - __half2uint_rn \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2uint_rn(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned integer in round-down mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned integer
+* in round-down mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+*
+* \returns unsigned int
+* - \p h converted to an unsigned integer using round-down mode.
+* - __half2uint_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2uint_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns UINT_MAX = \p 0xFFFFFFFF.
+* - __half2uint_rd \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2uint_rd(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned integer in round-up mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned integer
+* in round-up mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+*
+* \returns unsigned int
+* - \p h converted to an unsigned integer using round-up mode.
+* - __half2uint_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2uint_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns UINT_MAX = \p 0xFFFFFFFF.
+* - __half2uint_ru \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2uint_ru(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned integer to a half in round-to-nearest-even mode.
+* 
+* \details Convert the unsigned integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - unsigned int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned integer to a half in round-towards-zero mode.
+* 
+* \details Convert the unsigned integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - unsigned int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half.  
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rz(const unsigned int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned integer to a half in round-down mode.
+* 
+* \details Convert the unsigned integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - unsigned int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rd(const unsigned int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned integer to a half in round-up mode.
+* 
+* \details Convert the unsigned integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - unsigned int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_ru(const unsigned int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-to-nearest-even
+* mode.
+* 
+* \details Convert the half-precision floating-point value \p h to an unsigned short
+* integer in round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer using round-to-nearest-even mode.
+* - __half2ushort_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ushort_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns USHRT_MAX = \p 0xFFFF.
+* - __half2ushort_rn \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2ushort_rn(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-down mode.
+* 
+* \details Convert the half-precision floating-point value \p h to an unsigned short
+* integer in round-down mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer using round-down mode.
+* - __half2ushort_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ushort_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns USHRT_MAX = \p 0xFFFF.
+* - __half2ushort_rd \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2ushort_rd(NaN) returns 0.
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-up mode.
+* 
+* \details Convert the half-precision floating-point value \p h to an unsigned short
+* integer in round-up mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer using round-up mode.
+* - __half2ushort_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ushort_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns USHRT_MAX = \p 0xFFFF.
+* - __half2ushort_ru \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2ushort_ru(NaN) returns 0.
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-to-nearest-even
+* mode.
+* 
+* \details Convert the unsigned short integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - unsigned short int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-towards-zero
+* mode.
+* 
+* \details Convert the unsigned short integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - unsigned short int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-down mode.
+* 
+* \details Convert the unsigned short integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - unsigned short int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-up mode.
+* 
+* \details Convert the unsigned short integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - unsigned short int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned 64-bit integer in round-to-nearest-even
+* mode.
+* 
+* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit
+* integer in round-to-nearest-even mode. NaN inputs return \p 0x8000000000000000.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer using round-to-nearest-even mode.
+* - __half2ull_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ull_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns ULLONG_MAX = \p 0xFFFFFFFFFFFFFFFF.
+* - __half2ull_rn \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2ull_rn(NaN) returns \p 0x8000000000000000.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned 64-bit integer in round-down mode.
+* 
+* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit
+* integer in round-down mode. NaN inputs return \p 0x8000000000000000.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer using round-down mode.
+* - __half2ull_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ull_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns ULLONG_MAX = \p 0xFFFFFFFFFFFFFFFF.
+* - __half2ull_rd \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2ull_rd(NaN) returns \p 0x8000000000000000.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned 64-bit integer in round-up mode.
+* 
+* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit
+* integer in round-up mode. NaN inputs return \p 0x8000000000000000.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer using round-up mode.
+* - __half2ull_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ull_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns ULLONG_MAX = \p 0xFFFFFFFFFFFFFFFF.
+* - __half2ull_ru \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2ull_ru(NaN) returns \p 0x8000000000000000.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned 64-bit integer to a half in round-to-nearest-even
+* mode.
+* 
+* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - unsigned long long int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned 64-bit integer to a half in round-towards-zero
+* mode.
+* 
+* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - unsigned long long int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned 64-bit integer to a half in round-down mode.
+* 
+* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - unsigned long long int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half.  
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned 64-bit integer to a half in round-up mode.
+* 
+* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - unsigned long long int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed 64-bit integer in round-to-nearest-even
+* mode.
+* 
+* \details Convert the half-precision floating-point value \p h to a signed 64-bit
+* integer in round-to-nearest-even mode. NaN inputs return a long long int with hex value of \p 0x8000000000000000.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns long long int
+* - \p h converted to a signed 64-bit integer using round-to-nearest-even mode.
+* - __half2ll_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ll_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns LLONG_MAX = \p 0x7FFFFFFFFFFFFFFF.
+* - __half2ll_rn \cuda_math_formula (-\infty)\end_cuda_math_formula returns LLONG_MIN = \p 0x8000000000000000.
+* - __half2ll_rn(NaN) returns \p 0x8000000000000000.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed 64-bit integer in round-down mode.
+* 
+* \details Convert the half-precision floating-point value \p h to a signed 64-bit
+* integer in round-down mode. NaN inputs return a long long int with hex value of \p 0x8000000000000000.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns long long int
+* - \p h converted to a signed 64-bit integer using round-down mode.
+* - __half2ll_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ll_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns LLONG_MAX = \p 0x7FFFFFFFFFFFFFFF.
+* - __half2ll_rd \cuda_math_formula (-\infty)\end_cuda_math_formula returns LLONG_MIN = \p 0x8000000000000000.
+* - __half2ll_rd(NaN) returns \p 0x8000000000000000.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed 64-bit integer in round-up mode.
+* 
+* \details Convert the half-precision floating-point value \p h to a signed 64-bit
+* integer in round-up mode. NaN inputs return a long long int with hex value of \p 0x8000000000000000.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns long long int
+* - \p h converted to a signed 64-bit integer using round-up mode.
+* - __half2ll_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ll_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns LLONG_MAX = \p 0x7FFFFFFFFFFFFFFF.
+* - __half2ll_ru \cuda_math_formula (-\infty)\end_cuda_math_formula returns LLONG_MIN = \p 0x8000000000000000.
+* - __half2ll_ru(NaN) returns \p 0x8000000000000000.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-to-nearest-even
+* mode.
+* 
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - long long int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-towards-zero mode.
+* 
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - long long int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rz(const long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-down mode.
+* 
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - long long int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rd(const long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-up mode.
+* 
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - long long int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_ru(const long long int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Truncate input argument to the integral part.
+* 
+* \details Round \p h towards zero, i.e. to the integer value nearest to \p h
+* whose magnitude does not exceed the magnitude of \p h.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns half
+* - The truncated value. 
+* - htrunc(
+* \cuda_math_formula \pm 0 \end_cuda_math_formula
+* ) returns 
+* \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - htrunc(
+* \cuda_math_formula \pm \infty \end_cuda_math_formula
+* ) returns 
+* \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - htrunc(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half htrunc(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculate ceiling of the input argument.
+* 
+* \details Compute the smallest integer value not less than \p h.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns half
+* - The smallest integer value not less than \p h. 
+* - hceil(
+* \cuda_math_formula \pm 0 \end_cuda_math_formula
+* ) returns 
+* \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - hceil(
+* \cuda_math_formula \pm \infty \end_cuda_math_formula
+* ) returns 
+* \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - hceil(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hceil(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculate the largest integer less than or equal to \p h.
+* 
+* \details Calculate the largest integer value which is less than or equal to \p h.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns half
+* - The largest integer value which is less than or equal to \p h. 
+* - hfloor(
+* \cuda_math_formula \pm 0 \end_cuda_math_formula
+* ) returns 
+* \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - hfloor(
+* \cuda_math_formula \pm \infty \end_cuda_math_formula
+* ) returns 
+* \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - hfloor(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hfloor(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Round input to nearest integer value in half-precision floating-point
+* format.
+* 
+* \details Round \p h to the nearest integer value in half-precision floating-point
+* format, with halfway cases rounded to the nearest even integer value.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns half
+* - The nearest integer to \p h. 
+* - hrint(
+* \cuda_math_formula \pm 0 \end_cuda_math_formula
+* ) returns 
+* \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - hrint(
+* \cuda_math_formula \pm \infty \end_cuda_math_formula
+* ) returns 
+* \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - hrint(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hrint(const __half h);
+
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Truncate \p half2 vector input argument to the integral part.
+* 
+* \details Round each component of vector \p h towards zero, i.e. to the integer
+* value nearest to that component whose magnitude does not exceed the component's.
+* \param[in] h - half2. Is only being read. 
+* 
+* \returns half2
+* - The truncated \p h. 
+*
+* \see htrunc(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculate \p half2 vector ceiling of the input argument.
+* 
+* \details For each component of vector \p h compute the smallest integer value not less
+* than \p h.
+* \param[in] h - half2. Is only being read. 
+* 
+* \returns half2
+* - The vector of smallest integers not less than \p h. 
+*
+* \see hceil(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculate \p half2 vector floor of the input argument.
+* 
+* \details For each component of vector \p h calculate the largest integer value which
+* is less than or equal to \p h.
+* \param[in] h - half2. Is only being read. 
+* 
+* \returns half2
+* - The vector of largest integers which are less than or equal to the components of \p h.
+*
+* \see hfloor(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Round input to nearest integer value in half-precision floating-point
+* format.
+* 
+* \details Round each component of \p half2 vector \p h to the nearest integer value in
+* half-precision floating-point format, with halfway cases rounded to the
+* nearest even integer value.
+* \param[in] h - half2. Is only being read. 
+* 
+* \returns half2
+* - The vector of rounded integer values. 
+*
+* \see hrint(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Returns \p half2 with both halves equal to the input value.
+* 
+* \details Returns a \p half2 number with both halves equal to the input \p half
+* number \p a.
+* \param[in] a - half. Is only being read. 
+* 
+* \returns half2
+* - The vector which has both its halves equal to the input \p a. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __half2half2(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Swaps both halves of the \p half2 input.
+* 
+* \details Swaps both halves of the \p half2 input and returns a new \p half2 number
+* with swapped halves.
+* \param[in] a - half2. Is only being read. 
+* 
+* \returns half2
+* - \p a with its halves being swapped. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts low 16 bits from each of the two \p half2 inputs and combines
+* into one \p half2 number. 
+* 
+* \details Extracts low 16 bits from each of the two \p half2 inputs and combines into
+* one \p half2 number. Low 16 bits from input \p a are stored in low 16 bits of
+* the return value, low 16 bits from input \p b are stored in high 16 bits of
+* the return value. 
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+* 
+* \returns half2
+* - The low 16 bits of \p a and of \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts high 16 bits from each of the two \p half2 inputs and
+* combines into one \p half2 number.
+* 
+* \details Extracts high 16 bits from each of the two \p half2 inputs and combines into
+* one \p half2 number. High 16 bits from input \p a are stored in low 16 bits of
+* the return value, high 16 bits from input \p b are stored in high 16 bits of
+* the return value.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+* 
+* \returns half2
+* - The high 16 bits of \p a and of \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Returns high 16 bits of \p half2 input.
+*
+* \details Returns high 16 bits of \p half2 input \p a.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half
+* - The high 16 bits of the input. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __high2half(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Returns low 16 bits of \p half2 input.
+*
+* \details Returns low 16 bits of \p half2 input \p a.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half
+* - Returns \p half which contains low 16 bits of the input \p a. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __low2half(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Checks if the input \p half number is infinite.
+* 
+* \details Checks if the input \p half number \p a is infinite. 
+* \param[in] a - half. Is only being read. 
+* 
+* \returns int 
+* - -1 if \p a is equal to negative infinity, 
+* - 1 if \p a is equal to positive infinity, 
+* - 0 otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ int __hisinf(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Combines two \p half numbers into one \p half2 number.
+* 
+* \details Combines two input \p half numbers \p a and \p b into one \p half2 number.
+* Input \p a is stored in low 16 bits of the return value, input \p b is stored
+* in high 16 bits of the return value.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+* 
+* \returns half2
+* - The half2 with one half equal to \p a and the other to \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts low 16 bits from \p half2 input.
+* 
+* \details Extracts low 16 bits from \p half2 input \p a and returns a new \p half2
+* number which has both halves equal to the extracted bits.
+* \param[in] a - half2. Is only being read. 
+* 
+* \returns half2
+* - The half2 with both halves equal to the low 16 bits of the input. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __low2half2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts high 16 bits from \p half2 input.
+* 
+* \details Extracts high 16 bits from \p half2 input \p a and returns a new \p half2
+* number which has both halves equal to the extracted bits.
+* \param[in] a - half2. Is only being read. 
+* 
+* \returns half2
+* - The half2 with both halves equal to the high 16 bits of the input. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __high2half2(const __half2 a);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Reinterprets bits in a \p half as a signed short integer.
+* 
+* \details Reinterprets the bits in the half-precision floating-point number \p h
+* as a signed short integer. 
+* \param[in] h - half. Is only being read. 
+* 
+* \returns short int
+* - The reinterpreted value. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ short int __half_as_short(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Reinterprets bits in a \p half as an unsigned short integer.
+* 
+* \details Reinterprets the bits in the half-precision floating-point number \p h
+* as an unsigned short integer.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns unsigned short int
+* - The reinterpreted value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half_as_ushort(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Reinterprets bits in a signed short integer as a \p half.
+* 
+* \details Reinterprets the bits in the signed short integer \p i as a
+* half-precision floating-point number.
+* \param[in] i - short int. Is only being read. 
+* 
+* \returns half
+* - The reinterpreted value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short_as_half(const short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Reinterprets bits in an unsigned short integer as a \p half.
+* 
+* \details Reinterprets the bits in the unsigned short integer \p i as a
+* half-precision floating-point number.
+* \param[in] i - unsigned short int. Is only being read. 
+* 
+* \returns half
+* - The reinterpreted value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort_as_half(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Calculates \p half maximum of two input values.
+*
+* \details Calculates \p half max(\p a, \p b)
+* defined as (\p a > \p b) ? \p a : \p b.
+* - If either of inputs is NaN, the other input is returned.
+* - If both inputs are NaNs, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+* \returns half
+* - The maximum of \p a and \p b, following the NaN and signed-zero rules listed in the details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmax(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Calculates \p half minimum of two input values.
+*
+* \details Calculates \p half min(\p a, \p b)
+* defined as (\p a < \p b) ? \p a : \p b.
+* - If either of inputs is NaN, the other input is returned.
+* - If both inputs are NaNs, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+* \returns half
+* - The minimum of \p a and \p b, following the NaN and signed-zero rules listed in the details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmin(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Calculates \p half2 vector maximum of two inputs.
+*
+* \details Calculates \p half2 vector max(\p a, \p b).
+* Elementwise \p half operation is defined as
+* (\p a > \p b) ? \p a : \p b.
+* - If either of inputs is NaN, the other input is returned.
+* - If both inputs are NaNs, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise maximum of vectors \p a  and \p b
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Calculates \p half2 vector minimum of two inputs.
+*
+* \details Calculates \p half2 vector min(\p a, \p b).
+* Elementwise \p half operation is defined as
+* (\p a < \p b) ? \p a : \p b.
+* - If either of inputs is NaN, the other input is returned.
+* - If both inputs are NaNs, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise minimum of vectors \p a  and \p b
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b);
+
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300)
+#if !defined warpSize && !defined __local_warpSize
+#define warpSize    32 /* Fallback value; it feeds the default width argument of the shuffle declarations that follow when no warpSize macro exists. */
+#define __local_warpSize /* Marker set only together with the fallback above (presumably so it can be undone later; TODO confirm outside this chunk). */
+#endif
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700)
+
+#if defined(_WIN32)
+# define __CUDA_FP16_DEPRECATED__(msg) __declspec(deprecated(msg))
+#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
+# define __CUDA_FP16_DEPRECATED__(msg) __attribute__((deprecated))
+#else
+# define __CUDA_FP16_DEPRECATED__(msg) __attribute__((deprecated(msg)))
+#endif
+
+#if defined(_NVHPC_CUDA)
+#define __CUDA_FP16_WSB_DEPRECATION_MESSAGE(x) __CUDA_FP16_STRINGIFY(x) "() is deprecated in favor of " __CUDA_FP16_STRINGIFY(x) "_sync() and may be removed in a future release."
+#else
+#define __CUDA_FP16_WSB_DEPRECATION_MESSAGE(x) __CUDA_FP16_STRINGIFY(x) "() is deprecated in favor of " __CUDA_FP16_STRINGIFY(x) "_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
+#endif
+
+__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl)) __half2 __shfl(const __half2 var, const int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_up)) __half2 __shfl_up(const __half2 var, const unsigned int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_down)) __half2 __shfl_down(const __half2 var, const unsigned int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half2 __shfl_xor(const __half2 var, const int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl)) __half __shfl(const __half var, const int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_up)) __half __shfl_up(const __half var, const unsigned int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_down)) __half __shfl_down(const __half var, const unsigned int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half __shfl_xor(const __half var, const int delta, const int width = warpSize);
+
+#undef __CUDA_FP16_WSB_DEPRECATION_MESSAGE
+#undef __CUDA_FP16_DEPRECATED__
+#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. 
+* 
+* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane. 
+* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate 
+* entity with a starting logical thread ID of 0. If \p srcLane is outside the range \p [0:width-1], 
+* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e. 
+* within the same subsection). \p width must have a value which is a power of 2; 
+* results are undefined if \p width is not a power of 2, or is a number greater than 
+* \p warpSize.
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read. 
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - half2. Is only being read. 
+* \param[in] srcLane - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p half2. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned int mask, const __half2 var, const int srcLane, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. 
+* 
+* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID. 
+* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up 
+* the warp by \p delta threads. If the \p width is less than \p warpSize, then each subsection of the warp 
+* behaves as a separate entity with a starting logical thread ID of 0. The source thread index 
+* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged. 
+* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2, 
+* or is a number greater than \p warpSize. 
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read. 
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - half2. Is only being read. 
+* \param[in] delta - unsigned int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p half2. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned int mask, const __half2 var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. 
+* 
+* \details Calculates a source thread ID by adding \p delta to the caller's thread ID. 
+* The value of \p var held by the resulting thread ID is returned: this has the effect 
+* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then 
+* each subsection of the warp behaves as a separate entity with a starting logical 
+* thread ID of 0. Similarly to the __shfl_up_sync(), the ID number of the source thread 
+* will not wrap around the value of \p width and the upper \p delta threads 
+* will remain unchanged. 
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read.
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - half2. Is only being read. 
+* \param[in] delta - unsigned int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p half2. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned int mask, const __half2 var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. 
+* 
+* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask: 
+* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each 
+* group of \p width consecutive threads is able to access elements from earlier groups of threads, 
+* however if they attempt to access elements from later groups of threads their own value of \p var 
+* will be returned. This mode implements a butterfly addressing pattern such as is used in tree 
+* reduction and broadcast. 
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read.
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - half2. Is only being read. 
+* \param[in] laneMask - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p half2. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned int mask, const __half2 var, const int laneMask, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. 
+* 
+* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane. 
+* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate 
+* entity with a starting logical thread ID of 0. If \p srcLane is outside the range \p [0:width-1], 
+* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e. 
+* within the same subsection). \p width must have a value which is a power of 2; 
+* results are undefined if \p width is not a power of 2, or is a number greater than 
+* \p warpSize.
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read.
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - half. Is only being read. 
+* \param[in] srcLane - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p half. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __shfl_sync(const unsigned int mask, const __half var, const int srcLane, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. 
+* 
+* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID. 
+* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up 
+* the warp by \p delta threads. If the \p width is less than \p warpSize, then each subsection of the warp 
+* behaves as a separate entity with a starting logical thread ID of 0. The source thread index 
+* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged. 
+* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2, 
+* or is a number greater than \p warpSize. 
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read.
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - half. Is only being read. 
+* \param[in] delta - unsigned int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p half. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned int mask, const __half var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. 
+* 
+* \details Calculates a source thread ID by adding \p delta to the caller's thread ID. 
+* The value of \p var held by the resulting thread ID is returned: this has the effect 
+* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then 
+* each subsection of the warp behaves as a separate entity with a starting logical 
+* thread ID of 0. Similarly to the __shfl_up_sync(), the ID number of the source thread 
+* will not wrap around the value of \p width and the upper \p delta threads 
+* will remain unchanged. 
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read.
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - half. Is only being read. 
+* \param[in] delta - unsigned int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p half. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned int mask, const __half var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. 
+* 
+* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask: 
+* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each 
+* group of \p width consecutive threads is able to access elements from earlier groups of threads, 
+* however if they attempt to access elements from later groups of threads their own value of \p var 
+* will be returned. This mode implements a butterfly addressing pattern such as is used in tree 
+* reduction and broadcast. 
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read.
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - half. Is only being read. 
+* \param[in] laneMask - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p half. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned int mask, const __half var, const int laneMask, const int width = warpSize);
+
+#if defined(__local_warpSize)
+#undef warpSize
+#undef __local_warpSize
+#endif
+#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) */
+
+#if defined(__cplusplus) && ( !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) )
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.nc` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldg(const  __half2 *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.nc` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldg(const __half *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cg` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldcg(const  __half2 *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cg` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.ca` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldca(const  __half2 *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.ca` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldca(const __half *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cs` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldcs(const  __half2 *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cs` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.lu` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldlu(const  __half2 *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.lu` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cv` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldcv(const  __half2 *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cv` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.wb` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.wb` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.cg` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stcg(__half2 *const ptr, const __half2 value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.cg` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stcg(__half *const ptr, const __half value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.cs` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.cs` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.wt` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.wt` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value);
+#endif /*defined(__cplusplus) && ( !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) )*/
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector if-equal comparison.
+* 
+* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+* 
+* \returns half2
+* - The vector result of if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector not-equal comparison.
+* 
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+* 
+* \returns half2
+* - The vector result of not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector less-equal comparison.
+*
+* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The \p half2 result of less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector greater-equal comparison.
+*
+* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The vector result of greater-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector less-than comparison.
+*
+* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The half2 vector result of less-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector greater-than comparison.
+* 
+* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+* 
+* \returns half2
+* - The vector result of greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered if-equal comparison.
+* 
+* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+* 
+* \returns half2
+* - The vector result of unordered if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered not-equal comparison.
+*
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The vector result of unordered not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-equal comparison.
+*
+* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The vector result of unordered less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-equal comparison.
+*
+* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The \p half2 vector result of unordered greater-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-than comparison.
+*
+* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The vector result of unordered less-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-than comparison.
+*
+* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The \p half2 vector result of unordered greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector if-equal comparison.
+* 
+* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+* 
+* \returns unsigned int
+* - The vector mask result of if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __heq2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector not-equal comparison.
+* 
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+* 
+* \returns unsigned int
+* - The vector mask result of not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hne2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector less-equal comparison.
+*
+* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns unsigned int
+* - The vector mask result of less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hle2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector greater-equal comparison.
+*
+* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns unsigned int
+* - The vector mask result of greater-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hge2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector less-than comparison.
+*
+* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns unsigned int
+* - The vector mask result of less-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hlt2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector greater-than comparison.
+* 
+* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+* 
+* \returns unsigned int
+* - The vector mask result of greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgt2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered if-equal comparison.
+* 
+* \details Performs \p half2 vector unordered if-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+* 
+* \returns unsigned int
+* - The vector mask result of unordered if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hequ2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered not-equal comparison.
+*
+* \details Performs \p half2 vector unordered not-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns unsigned int
+* - The vector mask result of unordered not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hneu2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-equal comparison.
+*
+* \details Performs \p half2 vector unordered less-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns unsigned int
+* - The vector mask result of unordered less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hleu2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-equal comparison.
+*
+* \details Performs \p half2 vector unordered greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns unsigned int
+* - The vector mask result of unordered greater-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgeu2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-than comparison.
+*
+* \details Performs \p half2 vector unordered less-than comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns unsigned int
+* - The vector mask result of unordered less-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hltu2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-than comparison.
+*
+* \details Performs \p half2 vector unordered greater-than comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns unsigned int
+* - The vector mask result of unordered greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgtu2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Determine whether \p half2 argument is a NaN.
+*
+* \details Determine whether each half of input \p half2 number \p a is a NaN.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - The \p half2 with the corresponding \p half results set to
+* 1.0 for NaN, 0.0 otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hisnan2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector addition in round-to-nearest-even mode.
+*
+* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-95
+* \endinternal
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The sum of vectors \p a and \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p half2 input vector \p b from input vector \p a in
+* round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-104
+* \endinternal
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The subtraction of vector \p b from \p a. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-102
+* \endinternal
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The result of elementwise multiplying the vectors \p a and \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector addition in round-to-nearest-even mode.
+*
+* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest-even
+* mode. Prevents floating-point contractions of mul+add into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-95
+* \endinternal
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The sum of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p half2 input vector \p b from input vector \p a in
+* round-to-nearest-even mode. Prevents floating-point contractions of mul+sub
+* into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-104
+* \endinternal
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The subtraction of vector \p b from \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode. Prevents floating-point contractions of
+* mul+add or sub into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-102
+* \endinternal
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise multiplying the vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector division in round-to-nearest-even mode.
+*
+* \details Divides \p half2 input vector \p a by input vector \p b in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-103
+* \endinternal
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The elementwise division of \p a with \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Calculates the absolute value of both halves of the input \p half2 number and
+* returns the result.
+*
+* \details Calculates the absolute value of both halves of the input \p half2 number and
+* returns the result.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - Returns \p a with the absolute value of both halves. 
+*
+* \see __habs(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __habs2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector addition in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest-even
+* mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to
+* +0.0.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The sum of \p a and \p b, with respect to saturation. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Subtracts \p half2 input vector \p b from input vector \p a in
+* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN
+* results are flushed to +0.0.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The subtraction of vector \p b from \p a, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN
+* results are flushed to +0.0.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The result of elementwise multiplication of vectors \p a and \p b, 
+* with respect to saturation. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b);
+
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even
+* mode.
+*
+* \details Performs \p half2 vector multiply on inputs \p a and \p b,
+* then performs a \p half2 vector add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-105
+* \endinternal
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+* \param[in] c - half2. Is only being read. 
+*
+* \returns half2
+* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even
+* mode, with saturation to [0.0, 1.0].
+*
+* \details Performs \p half2 vector multiply on inputs \p a and \p b,
+* then performs a \p half2 vector add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode, and clamps the
+* results to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+* \param[in] c - half2. Is only being read. 
+*
+* \returns half2
+* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, 
+* with respect to saturation. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c);
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Negates both halves of the input \p half2 number and returns the
+* result.
+*
+* \details Negates both halves of the input \p half2 number \p a and returns the result.
+* \internal
+* \req DEEPLEARN-SRM_REQ-101
+* \endinternal
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - Returns \p a with both halves negated. 
+* 
+* \see __hneg(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hneg2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Calculates the absolute value of input \p half number and returns the result.
+*
+* \details Calculates the absolute value of input \p half number and returns the result.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The absolute value of \p a.
+* - __habs \cuda_math_formula (\pm 0)\end_cuda_math_formula returns +0.
+* - __habs \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula.
+* - __habs(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __habs(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half addition in round-to-nearest-even mode.
+*
+* \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-94
+* \endinternal
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns half
+* - The sum of \p a and \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p half input \p b from input \p a in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-97
+* \endinternal
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns half
+* - The result of subtracting \p b from \p a. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-99
+* \endinternal
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns half
+* - The result of multiplying \p a and \p b. 
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half addition in round-to-nearest-even mode.
+*
+* \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even
+* mode. Prevents floating-point contractions of mul+add into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-94
+* \endinternal
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* - The sum of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd_rn(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p half input \p b from input \p a in round-to-nearest-even
+* mode. Prevents floating-point contractions of mul+sub into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-97
+* \endinternal
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* - The result of subtracting \p b from \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub_rn(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest-even
+* mode. Prevents floating-point contractions of mul+add or sub into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-99
+* \endinternal
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* - The result of multiplying \p a and \p b.
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul_rn(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half division in round-to-nearest-even mode.
+* 
+* \details Divides \p half input \p a by input \p b in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-98
+* \endinternal
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+* 
+* \returns half
+* - The result of dividing \p a by \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__  __half __hdiv(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half addition in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p half add of inputs \p a and \p b, in round-to-nearest-even mode,
+* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns half
+* - The sum of \p a and \p b, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd_sat(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half subtraction in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Subtracts \p half input \p b from input \p a in round-to-nearest-even
+* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to
+* +0.0.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns half
+* - The result of subtraction of \p b from \p a, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub_sat(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half multiplication in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest-even
+* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to
+* +0.0.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns half
+* - The result of multiplying \p a and \p b, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul_sat(const __half a, const __half b);
+
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half fused multiply-add in round-to-nearest-even mode.
+*
+* \details Performs \p half multiply on inputs \p a and \p b,
+* then performs a \p half add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-96
+* \endinternal
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+* \param[in] c - half. Is only being read. 
+*
+* \returns half
+* - The result of fused multiply-add operation on
+* \p a, \p b, and \p c.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half fused multiply-add in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Performs \p half multiply on inputs \p a and \p b,
+* then performs a \p half add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode, and clamps the result
+* to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+* \param[in] c - half. Is only being read. 
+*
+* \returns half
+* - The result of fused multiply-add operation on
+* \p a, \p b, and \p c, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c);
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Negates input \p half number and returns the result.
+*
+* \details Negates input \p half number and returns the result.
+* \internal
+* \req DEEPLEARN-SRM_REQ-100
+* \endinternal
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - Negated input \p a.
+* - __hneg \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \mp 0 \end_cuda_math_formula.
+* - __hneg \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \mp \infty \end_cuda_math_formula.
+* - __hneg(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hneg(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector if-equal comparison and returns boolean true
+* if both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half if-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns bool
+* - true if both \p half results of if-equal comparison
+* of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector not-equal comparison and returns boolean
+* true if both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half not-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns bool
+* - true if both \p half results of not-equal comparison
+* of vectors \p a and \p b are true;
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector less-equal comparison and returns boolean
+* true if both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half less-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns bool
+* - true if both \p half results of less-equal comparison
+* of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector greater-equal comparison and returns boolean
+* true if both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half greater-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns bool
+* - true if both \p half results of greater-equal
+* comparison of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector less-than comparison and returns boolean
+* true if both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half less-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns bool
+* - true if both \p half results of less-than comparison
+* of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector greater-than comparison and returns boolean
+* true if both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half greater-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+* 
+* \returns bool 
+* - true if both \p half results of greater-than
+* comparison of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered if-equal comparison and returns
+* boolean true if both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector unordered if-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half if-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns bool
+* - true if both \p half results of unordered if-equal
+* comparison of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered not-equal comparison and returns
+* boolean true if both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector unordered not-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half not-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns bool
+* - true if both \p half results of unordered not-equal
+* comparison of vectors \p a and \p b are true;
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-equal comparison and returns
+* boolean true if both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector unordered less-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half less-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns bool
+* - true if both \p half results of unordered less-equal
+* comparison of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-equal comparison and
+* returns boolean true if both \p half results are true, boolean false
+* otherwise.
+*
+* \details Performs \p half2 vector unordered greater-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half greater-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns bool
+* - true if both \p half results of unordered
+* greater-equal comparison of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-than comparison and returns
+* boolean true if both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector unordered less-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half less-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns bool
+* - true if both \p half results of unordered less-than comparison of 
+* vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-than comparison and
+* returns boolean true if both \p half results are true, boolean false
+* otherwise.
+*
+* \details Performs \p half2 vector unordered greater-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half greater-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns bool
+* - true if both \p half results of unordered
+* greater-than comparison of vectors \p a and \p b are true;
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half if-equal comparison.
+*
+* \details Performs \p half if-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of if-equal comparison of \p a and \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __heq(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half not-equal comparison.
+*
+* \details Performs \p half not-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of not-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hne(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half less-equal comparison.
+*
+* \details Performs \p half less-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of less-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hle(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half greater-equal comparison.
+*
+* \details Performs \p half greater-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of greater-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hge(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half less-than comparison.
+*
+* \details Performs \p half less-than comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of less-than comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hlt(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half greater-than comparison.
+*
+* \details Performs \p half greater-than comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of greater-than comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgt(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half unordered if-equal comparison.
+*
+* \details Performs \p half unordered if-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered if-equal comparison of \p a and
+* \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hequ(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half unordered not-equal comparison.
+*
+* \details Performs \p half unordered not-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered not-equal comparison of \p a and
+* \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hneu(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half unordered less-equal comparison.
+*
+* \details Performs \p half unordered less-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered less-equal comparison of \p a and
+* \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hleu(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half unordered greater-equal comparison.
+*
+* \details Performs \p half unordered greater-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered greater-equal comparison of \p a
+* and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgeu(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half unordered less-than comparison.
+*
+* \details Performs \p half unordered less-than comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered less-than comparison of \p a and
+* \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hltu(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half unordered greater-than comparison.
+*
+* \details Performs \p half unordered greater-than comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered greater-than comparison of \p a
+* and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgtu(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Determine whether \p half argument is a NaN.
+*
+* \details Determine whether \p half value \p a is a NaN.
+* \param[in] a - half. Is only being read. 
+*
+* \returns bool
+* - true if \p a is NaN, false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hisnan(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Calculates \p half maximum of two input values, NaNs pass through.
+*
+* \details Calculates \p half max(\p a, \p b)
+* defined as (\p a > \p b) ? \p a : \p b.
+* - If either of the inputs is NaN, then canonical NaN is returned.
+* - If both inputs are zeros of opposite sign, +0.0 is treated as greater than -0.0.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmax_nan(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Calculates \p half minimum of two input values, NaNs pass through.
+*
+* \details Calculates \p half min(\p a, \p b)
+* defined as (\p a < \p b) ? \p a : \p b.
+* - If either of the inputs is NaN, then canonical NaN is returned.
+* - If both inputs are zeros of opposite sign, +0.0 is treated as greater than -0.0.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmin_nan(const __half a, const __half b);
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half fused multiply-add in round-to-nearest-even mode with relu saturation.
+*
+* \details Performs \p half multiply on inputs \p a and \p b,
+* then performs a \p half add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* A negative result is then clamped to 0.
+* A NaN result is converted to canonical NaN.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+* \param[in] c - half. Is only being read.
+*
+* \returns half
+* - The result of fused multiply-add operation on \p
+* a, \p b, and \p c with relu saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c);
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Calculates \p half2 vector maximum of two inputs, NaNs pass through.
+*
+* \details Calculates \p half2 vector max(\p a, \p b).
+* Elementwise \p half operation is defined as
+* (\p a > \p b) ? \p a : \p b.
+* - If either of the inputs is NaN, then canonical NaN is returned.
+* - If both inputs are zeros of opposite sign, +0.0 is treated as greater than -0.0.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise maximum of vectors \p a and \p b, with NaNs passing through.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Calculates \p half2 vector minimum of two inputs, NaNs pass through.
+*
+* \details Calculates \p half2 vector min(\p a, \p b).
+* Elementwise \p half operation is defined as
+* (\p a < \p b) ? \p a : \p b.
+* - If either of the inputs is NaN, then canonical NaN is returned.
+* - If both inputs are zeros of opposite sign, +0.0 is treated as greater than -0.0.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise minimum of vectors \p a and \p b, with NaNs passing through.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b);
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even
+* mode with relu saturation.
+*
+* \details Performs \p half2 vector multiply on inputs \p a and \p b,
+* then performs a \p half2 vector add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* A negative result is then clamped to 0.
+* A NaN result is converted to canonical NaN.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+* \param[in] c - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c);
+
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs fast complex multiply-accumulate.
+*
+* \details Interprets vector \p half2 input pairs \p a, \p b, and \p c as
+* complex numbers in \p half precision: (a.x + I*a.y), (b.x + I*b.y), (c.x + I*c.y)
+* and performs complex multiply-accumulate operation: a*b + c in a simple way:
+* ((a.x*b.x + c.x) - a.y*b.y) + I*((a.x*b.y + c.y) + a.y*b.x)
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+* \param[in] c - half2. Is only being read.
+*
+* \returns half2
+* - The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c
+* - __half2 result = __hcmadd(a, b, c) is numerically in agreement with:
+* - result.x = __hfma(-a.y, b.y, __hfma(a.x, b.x, c.x))
+* - result.y = __hfma( a.y, b.x, __hfma(a.x, b.y, c.y))
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c);
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half square root in round-to-nearest-even mode.
+*
+* \details Calculates \p half square root of input: \cuda_math_formula \sqrt{a} \end_cuda_math_formula in round-to-nearest-even mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The square root of \p a.
+* - hsqrt \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula.
+* - hsqrt \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - hsqrt \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns NaN.
+* - hsqrt(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hsqrt(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half reciprocal square root in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half reciprocal square root of input: \cuda_math_formula \frac{1}{\sqrt{a}}\end_cuda_math_formula in round-to-nearest-even
+* mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The reciprocal square root of \p a.
+* - hrsqrt \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - hrsqrt \cuda_math_formula (+\infty)\end_cuda_math_formula returns +0.
+* - hrsqrt \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns NaN.
+* - hrsqrt(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hrsqrt(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half reciprocal in round-to-nearest-even mode.
+*
+* \details Calculates \p half reciprocal of input: \cuda_math_formula \frac{1}{a}\end_cuda_math_formula in round-to-nearest-even mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The reciprocal of \p a.
+* - hrcp \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - hrcp \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - hrcp(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hrcp(const __half a);
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half natural logarithm in round-to-nearest-even mode.
+*
+* \details Calculates \p half natural logarithm of input: \cuda_math_formula \ln(a)\end_cuda_math_formula in round-to-nearest-even
+* mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The natural logarithm of \p a.
+* - hlog \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula -\infty \end_cuda_math_formula.
+* - hlog(1) returns +0.
+* - hlog(x), x < 0 returns NaN.
+* - hlog \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula.
+* - hlog(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hlog(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half binary logarithm in round-to-nearest-even mode.
+*
+* \details Calculates \p half binary logarithm of input: \cuda_math_formula \log_{2}(a)\end_cuda_math_formula in round-to-nearest-even
+* mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The binary logarithm of \p a.
+* - hlog2 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula -\infty \end_cuda_math_formula.
+* - hlog2(1) returns +0.
+* - hlog2(x), x < 0 returns NaN.
+* - hlog2 \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula.
+* - hlog2(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hlog2(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half decimal logarithm in round-to-nearest-even mode.
+*
+* \details Calculates \p half decimal logarithm of input: \cuda_math_formula \log_{10}(a)\end_cuda_math_formula in round-to-nearest-even
+* mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The decimal logarithm of \p a.
+* - hlog10 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula -\infty \end_cuda_math_formula.
+* - hlog10(1) returns +0.
+* - hlog10(x), x < 0 returns NaN.
+* - hlog10 \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula.
+* - hlog10(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hlog10(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half natural exponential function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half natural exponential function of input: \cuda_math_formula e^{a}\end_cuda_math_formula in
+* round-to-nearest-even mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The natural exponential function on \p a.
+* - hexp \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 1.
+* - hexp \cuda_math_formula (-\infty)\end_cuda_math_formula returns +0.
+* - hexp \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula.
+* - hexp(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hexp(const __half a);
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates approximate \p half hyperbolic tangent function.
+*
+* \details Calculates approximate \p half hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula.
+* This operation uses HW acceleration on devices of compute capability 7.5 and higher.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The approximate hyperbolic tangent function of \p a.
+* - htanh_approx \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula.
+* - htanh_approx \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula.
+* - htanh_approx(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half htanh_approx(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector approximate hyperbolic tangent function.
+*
+* \details Calculates \p half2 approximate hyperbolic tangent function of input vector \p a.
+* This operation uses HW acceleration on devices of compute capability 7.5 and higher.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - The elementwise approximate hyperbolic tangent function on vector \p a.
+* 
+* \see htanh_approx(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2tanh_approx(const __half2 a);
+
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half hyperbolic tangent function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p half hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula in
+* round-to-nearest-even mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The hyperbolic tangent function of \p a.
+* - htanh \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula.
+* - htanh \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula.
+* - htanh(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half htanh(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector hyperbolic tangent function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 hyperbolic tangent function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - The elementwise hyperbolic tangent function on vector \p a.
+* 
+* \see htanh(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2tanh(const __half2 a);
+
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half binary exponential function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half binary exponential function of input: \cuda_math_formula 2^{a}\end_cuda_math_formula in
+* round-to-nearest-even mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The binary exponential function on \p a.
+* - hexp2 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 1.
+* - hexp2 \cuda_math_formula (-\infty)\end_cuda_math_formula returns +0.
+* - hexp2 \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula.
+* - hexp2(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hexp2(const __half a);
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half decimal exponential function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half decimal exponential function of input: \cuda_math_formula 10^{a}\end_cuda_math_formula in
+* round-to-nearest-even mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The decimal exponential function on \p a.
+* - hexp10 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 1.
+* - hexp10 \cuda_math_formula (-\infty)\end_cuda_math_formula returns +0.
+* - hexp10 \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula.
+* - hexp10(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hexp10(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half cosine in round-to-nearest-even mode.
+*
+* \details Calculates \p half cosine of input \p a in round-to-nearest-even mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The cosine of \p a.
+* - hcos \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 1.
+* - hcos \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns NaN.
+* - hcos(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hcos(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half sine in round-to-nearest-even mode.
+*
+* \details Calculates \p half sine of input \p a in round-to-nearest-even mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The sine of \p a.
+* - hsin \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula.
+* - hsin \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns NaN.
+* - hsin(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hsin(const __half a);
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector square root in round-to-nearest-even mode.
+*
+* \details Calculates \p half2 square root of input vector \p a in round-to-nearest-even
+* mode.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - The elementwise square root on vector \p a.
+* 
+* \see hsqrt(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector reciprocal square root in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 reciprocal square root of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - The elementwise reciprocal square root on vector \p a.
+* 
+* \see hrsqrt(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector reciprocal in round-to-nearest-even mode.
+*
+* \details Calculates \p half2 reciprocal of input vector \p a in round-to-nearest-even
+* mode.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - The elementwise reciprocal on vector \p a.
+* 
+* \see hrcp(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a);
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector natural logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 natural logarithm of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - The elementwise natural logarithm on vector \p a.
+* 
+* \see hlog(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2log(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector binary logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 binary logarithm of input vector \p a in round-to-nearest-even
+* mode.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - The elementwise binary logarithm on vector \p a.
+* 
+* \see hlog2(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector decimal logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 decimal logarithm of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - The elementwise decimal logarithm on vector \p a.
+* 
+* \see hlog10(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector exponential function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - The elementwise exponential function on vector \p a.
+* 
+* \see hexp(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a);
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector binary exponential function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p half2 binary exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - The elementwise binary exponential function on vector \p a.
+* 
+* \see hexp2(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a);
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector decimal exponential function in
+* round-to-nearest-even mode.
+* 
+* \details Calculates \p half2 decimal exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read. 
+* 
+* \returns half2
+* - The elementwise decimal exponential function on vector \p a.
+* 
+* \see hexp10(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector cosine in round-to-nearest-even mode.
+* 
+* \details Calculates \p half2 cosine of input vector \p a in round-to-nearest-even
+* mode.
+* \param[in] a - half2. Is only being read. 
+* 
+* \returns half2
+* - The elementwise cosine on vector \p a.
+* 
+* \see hcos(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector sine in round-to-nearest-even mode.
+* 
+* \details Calculates \p half2 sine of input vector \p a in round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read. 
+* 
+* \returns half2
+* - The elementwise sine on vector \p a.
+* 
+* \see hsin(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a);
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Vector add \p val to the value stored at \p address in global or shared memory, and writes this
+* value back to \p address. The atomicity of the add operation is guaranteed separately for each of the
+* two \p __half elements; the entire \p __half2 is not guaranteed to be atomic as a single 32-bit access.
+* 
+* \details The location of \p address must be in global or shared memory. This operation has undefined
+* behavior otherwise. This operation is natively supported by devices of compute capability 6.x and higher,
+* older devices use emulation path.
+* 
+* \param[in] address - half2*. An address in global or shared memory.
+* \param[in] val - half2. The value to be added.
+* 
+* \returns half2
+* - The old value read from \p address.
+*
+* \note_ref_guide_atomic
+*/
+__CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *const address, const __half2 val);
+
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Adds \p val to the value stored at \p address in global or shared memory, and writes this value
+* back to \p address. This operation is performed in one atomic operation.
+* 
+* \details The location of \p address must be in global or shared memory. This operation has undefined
+* behavior otherwise. This operation is only supported by devices of compute capability 7.x and higher.
+* 
+* \param[in] address - half*. An address in global or shared memory.
+* \param[in] val - half. The value to be added.
+* 
+* \returns half
+* - The old value read from \p address.
+* 
+* \note_ref_guide_atomic
+*/
+__CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val);
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) */
+#endif /*defined(__CUDACC__) || defined(_NVHPC_CUDA)*/
+
+
+#endif /* defined(__cplusplus) */
+
+#if !defined(_MSC_VER) && __cplusplus >= 201103L /* non-MSVC compilers: the __cplusplus value is tested directly */
+#   define __CPP_VERSION_AT_LEAST_11_FP16
+#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L /* MSVC: compiler build number and _MSVC_LANG are tested instead of __cplusplus */
+#   define __CPP_VERSION_AT_LEAST_11_FP16
+#endif /* __CPP_VERSION_AT_LEAST_11_FP16 stays undefined when neither test passes */
+
+// implicitly provided by NVRTC
+#if !defined(__CUDACC_RTC__)
+#include <nv/target>
+#endif  /* !defined(__CUDACC_RTC__) */
+
+/* C++11 header for std::move.
+ * In RTC mode, std::move is provided implicitly; don't include the header
+ */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16) && !defined(__CUDACC_RTC__)
+#include <utility>
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) && !defined(__CUDACC_RTC__) */
+
+/* C++ header for std::memcpy (used for type punning in host-side implementations).
+ * When compiling as a CUDA source file memcpy is provided implicitly.
+ * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
+ */
+#if defined(__cplusplus) && !defined(__CUDACC__)
+#include <cstring>
+#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
+
+#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))
+#define __CUDA_FP16_INLINE__ /* expands to nothing when __CUDACC_RTC__ is defined and the compiler version is 12.3 or newer */
+#define __CUDA_FP16_FORCEINLINE__ /* expands to nothing when __CUDACC_RTC__ is defined and the compiler version is 12.3 or newer */
+#else
+#define __CUDA_FP16_INLINE__ inline
+#define __CUDA_FP16_FORCEINLINE__ __forceinline__
+#endif /* (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) */
+
+/* Set up structure-alignment attribute.
+ * __CUDA_ALIGN__(n) expands to __align__(n) when __CUDACC__ is defined; otherwise to
+ * alignas(n) when __cplusplus >= 201103L, else to the GCC or MSVC alignment attribute,
+ * and to nothing when none of those tests matches.
+ * NOTE(review): the C++11 test below uses __cplusplus directly rather than
+ * __CPP_VERSION_AT_LEAST_11_FP16 - confirm that this is intended.
+ */
+#if defined(__CUDACC__)
+#define __CUDA_ALIGN__(align) __align__(align)
+#else
+/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */
+#if __cplusplus >= 201103L
+#define __CUDA_ALIGN__(n) alignas(n)    /* C++11 kindly gives us a keyword for this */
+#else /* !(__cplusplus >= 201103L) */
+#if defined(__GNUC__)
+#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#define __CUDA_ALIGN__(n) __declspec(align(n))
+#else
+#define __CUDA_ALIGN__(n)
+#endif /* defined(__GNUC__) */
+#endif /* __cplusplus >= 201103L */
+#endif /* defined(__CUDACC__) */
+
+// Define __CUDA_FP16_CONSTEXPR__ so that declarations can use constexpr
+// where the C++ dialect supports it, and expand to nothing otherwise.
+// The macro is meant to be #undef'd after use.
+#if (defined __CPP_VERSION_AT_LEAST_11_FP16)
+#define __CUDA_FP16_CONSTEXPR__   constexpr
+#else
+#define __CUDA_FP16_CONSTEXPR__
+#endif
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief __half_raw data type
+ * \details Type allows static initialization of \p half until it becomes
+ * a built-in type. It is a plain aggregate with a single \p unsigned \p short
+ * member, declared with \p __CUDA_ALIGN__(2).
+ *
+ * - Note: the initializer is taken as the bit-field representation of \p half,
+ * and is not a conversion from \p short to \p half.
+ * Such representation will be deprecated in a future version of CUDA.
+ *
+ * - Note: this is visible to non-nvcc compilers, including C-only compilations
+ */
+typedef struct __CUDA_ALIGN__(2) {
+    /**
+     * Storage field containing the bit representation of the \p half floating-point number.
+     */
+    unsigned short x;
+} __half_raw;
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief __half2_raw data type
+ * \details Type allows static initialization of \p half2 until it becomes
+ * a built-in type. It is a plain aggregate with two \p unsigned \p short
+ * members, declared with \p __CUDA_ALIGN__(4).
+ *
+ * - Note: the initializer is taken as the bit-field representation of \p half2,
+ * and is not a conversion from \p short2 to \p half2.
+ * Such representation will be deprecated in a future version of CUDA.
+ *
+ * - Note: this is visible to non-nvcc compilers, including C-only compilations
+ */
+typedef struct __CUDA_ALIGN__(4) {
+    /**
+     * Storage field containing the bits of the lower \p half part.
+     */
+    unsigned short x;
+    /**
+     * Storage field containing the bits of the upper \p half part.
+     */
+    unsigned short y;
+} __half2_raw;
+
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+
+/* Hide GCC member initialization list warnings because of host/device in-function init requirement */
+#if defined(__GNUC__)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Weffc++"
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
+#endif /* defined(__GNUC__) */
+
+/* class' : multiple assignment operators specified
+   The class has multiple assignment operators of a single type. This warning is informational */
+#if defined(_MSC_VER) && _MSC_VER >= 1500
+#pragma warning( push )
+#pragma warning( disable:4522 )
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
+
+// forward-declaration of bfloat type to be used in converting constructor
+struct __nv_bfloat16;
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief __half data type
+ * \details This structure implements the datatype for storing
+ * half-precision floating-point numbers. The structure implements
+ * assignment, arithmetic and comparison operators, and type conversions.
+ * 16 bits are being used in total: 1 sign bit, 5 bits for the exponent,
+ * and the significand is being stored in 10 bits.
+ * The total precision is 11 bits. There are 15361 representable
+ * numbers within the interval [0.0, 1.0], endpoints included.
+ * On average we have log10(2**11) ~ 3.311 decimal digits.
+ *
+ * The objective here is to provide IEEE754-compliant implementation
+ * of \p binary16 type and arithmetic with limitations due to
+ * device HW not supporting floating-point exceptions.
+ */
+struct __CUDA_ALIGN__(2) __half {
+protected:
+    /**
+     * Protected storage variable contains the bits of floating-point data.
+     */
+    unsigned short __x;
+
+public:
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * \brief Constructor by default.
+     * \details Empty default constructor, result is uninitialized.
+     * Defaulted for C++11 and later dialects, user-provided with an empty body otherwise.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16)
+    __half() = default;
+#else
+    __CUDA_HOSTDEVICE__ __half() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+
+    /* Convert to/from __half_raw */
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Constructor from \p __half_raw.
+     * Copies the raw bits held in \p hr.x into the storage; no numeric conversion is performed.
+     */
+    __CUDA_HOSTDEVICE__ __CUDA_FP16_CONSTEXPR__ __half(const __half_raw &hr) : __x(hr.x) { }
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Assignment operator from \p __half_raw.
+     */
+    __CUDA_HOSTDEVICE__ __half &operator=(const __half_raw &hr);
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Assignment operator from \p __half_raw to \p volatile \p __half.
+     */
+    __CUDA_HOSTDEVICE__ volatile __half &operator=(const __half_raw &hr) volatile;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Assignment operator from \p volatile \p __half_raw to \p volatile \p __half.
+     */
+    __CUDA_HOSTDEVICE__ volatile __half &operator=(const volatile __half_raw &hr) volatile;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Type cast to \p __half_raw operator.
+     */
+    __CUDA_HOSTDEVICE__ operator __half_raw() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Type cast to \p __half_raw operator with \p volatile input.
+     */
+    __CUDA_HOSTDEVICE__ operator __half_raw() const volatile;
+#if !defined(__CUDA_NO_HALF_CONVERSIONS__)
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16)
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Construct \p __half from \p __nv_bfloat16 input using default round-to-nearest-even rounding mode.
+     * Need to include the header file \p cuda_bf16.h
+     */
+    explicit __CUDA_HOSTDEVICE__ __half(const __nv_bfloat16 f); //forward declaration only, implemented in cuda_bf16.hpp
+#endif /* #if defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+    /* Construct from float/double */
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Construct \p __half from \p float input using default round-to-nearest-even rounding mode.
+     *
+     * \see __float2half(float) for further details.
+     */
+    __CUDA_HOSTDEVICE__ __half(const float f) { __x = __float2half(f).__x; }
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Construct \p __half from \p double input using default round-to-nearest-even rounding mode.
+     *
+     * \see __double2half(double) for further details.
+     */
+    __CUDA_HOSTDEVICE__ __half(const double f) { __x = __double2half(f).__x; }
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Type cast to \p float operator.
+     */
+    __CUDA_HOSTDEVICE__ operator float() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Assignment operator to \p __half from \p float input using default round-to-nearest-even rounding mode.
+     *
+     * \see __float2half(float) for further details.
+     */
+    __CUDA_HOSTDEVICE__ __half &operator=(const float f);
+
+    /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Assignment operator to \p __half from \p double input using default round-to-nearest-even rounding mode.
+     *
+     * \see __double2half(double) for further details.
+     */
+    __CUDA_HOSTDEVICE__ __half &operator=(const double f);
+
+/*
+ * Implicit type conversions to/from integer types were only available to nvcc compilation.
+ * Introducing them for all compilers is a potentially breaking change that may affect
+ * overload resolution and will require users to update their code.
+ * Define __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__ to opt-out.
+ * The opt-out has no effect when __CUDACC__ is defined.
+ */
+#if !(defined __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__)
+    /* Allow automatic construction from types supported natively in hardware */
+    /* Note we do avoid constructor init-list because of special host/device compilation rules */
+
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Construct \p __half from \p short integer input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half(const short val) { __x = __short2half_rn(val).__x; }
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Construct \p __half from \p unsigned \p short integer input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half(const unsigned short val) { __x = __ushort2half_rn(val).__x; }
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Construct \p __half from \p int input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half(const int val) { __x = __int2half_rn(val).__x; }
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Construct \p __half from \p unsigned \p int input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half(const unsigned int val) { __x = __uint2half_rn(val).__x; }
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Construct \p __half from \p long input using default round-to-nearest-even rounding mode.
+     *
+     * Dispatches on the size of \p long: converts through __ll2half_rn when \p long is as
+     * wide as \p long \p long, and through __int2half_rn otherwise.
+     */
+    __CUDA_HOSTDEVICE__ __half(const long val) {
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(long) == sizeof(long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (default: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            __x = __ll2half_rn(static_cast<long long>(val)).__x;
+        } else {
+            __x = __int2half_rn(static_cast<int>(val)).__x;
+        }
+    }
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Construct \p __half from \p unsigned \p long input using default round-to-nearest-even rounding mode.
+     *
+     * Dispatches on the size of \p unsigned \p long: converts through __ull2half_rn when it is as
+     * wide as \p unsigned \p long \p long, and through __uint2half_rn otherwise.
+     */
+    __CUDA_HOSTDEVICE__ __half(const unsigned long val) {
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(unsigned long) == sizeof(unsigned long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (default: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            __x = __ull2half_rn(static_cast<unsigned long long>(val)).__x;
+        } else {
+            __x = __uint2half_rn(static_cast<unsigned int>(val)).__x;
+        }
+    }
+
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Construct \p __half from \p long \p long input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half(const long long val) { __x = __ll2half_rn(val).__x; }
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Construct \p __half from \p unsigned \p long \p long input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half(const unsigned long long val) { __x = __ull2half_rn(val).__x; }
+
+    /* Allow automatic casts to supported built-in types, matching all that are permitted with float */
+
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p signed \p char data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * \see __half2char_rz(__half) for further details.
+     */
+    __CUDA_HOSTDEVICE__ operator signed char() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p unsigned \p char data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * \see __half2uchar_rz(__half) for further details.
+     */
+    __CUDA_HOSTDEVICE__ operator unsigned char() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to an implementation defined \p char data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * Detects signedness of the \p char type and proceeds accordingly, see
+     * further details in __half2char_rz(__half) and __half2uchar_rz(__half).
+     */
+    __CUDA_HOSTDEVICE__ operator char() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p short data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * \see __half2short_rz(__half) for further details.
+     */
+    __CUDA_HOSTDEVICE__ operator short() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p unsigned \p short data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * \see __half2ushort_rz(__half) for further details.
+     */
+    __CUDA_HOSTDEVICE__ operator unsigned short() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p int data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * \see __half2int_rz(__half) for further details.
+     */
+    __CUDA_HOSTDEVICE__ operator int() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p unsigned \p int data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * \see __half2uint_rz(__half) for further details.
+     */
+    __CUDA_HOSTDEVICE__ operator unsigned int() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p long data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * Detects size of the \p long type and proceeds accordingly, see
+     * further details in __half2int_rz(__half) and __half2ll_rz(__half).
+     */
+    __CUDA_HOSTDEVICE__ operator long() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p unsigned \p long data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * Detects size of the \p unsigned \p long type and proceeds
+     * accordingly, see further details in __half2uint_rz(__half) and __half2ull_rz(__half).
+     */
+    __CUDA_HOSTDEVICE__ operator unsigned long() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p long \p long data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * \see __half2ll_rz(__half) for further details.
+     */
+    __CUDA_HOSTDEVICE__ operator long long() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p unsigned \p long \p long data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * \see __half2ull_rz(__half) for further details.
+     */
+    __CUDA_HOSTDEVICE__ operator unsigned long long() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Type cast from \p short assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half &operator=(const short val);
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Type cast from \p unsigned \p short assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half &operator=(const unsigned short val);
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Type cast from \p int assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half &operator=(const int val);
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Type cast from \p unsigned \p int assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half &operator=(const unsigned int val);
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Type cast from \p long \p long assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half &operator=(const long long val);
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Type cast from \p unsigned \p long \p long assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half &operator=(const unsigned long long val);
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p bool data type.
+     * +0 and -0 inputs convert to \p false.
+     * Non-zero inputs convert to \p true.
+     *
+     * The sign bit is masked off before the test, so every bit pattern with any
+     * of the lower 15 bits set converts to \p true.
+     */
+    __CUDA_HOSTDEVICE__ __CUDA_FP16_CONSTEXPR__ operator bool() const { return (__x & 0x7FFFU) != 0U; }
+#endif /* #if !(defined __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) */
+#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */
+};
+
+#if !defined(__CUDA_NO_HALF_OPERATORS__)
+/* Some basic arithmetic operations expected of a built-in */
+
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half addition operation.
+ * \see __hadd(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator+(const __half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half subtraction operation.
+ * \see __hsub(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator-(const __half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half multiplication operation.
+ * \see __hmul(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator*(const __half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half division operation.
+ * \see __hdiv(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator/(const __half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half compound assignment with addition operation.
+ * \see __hadd(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator+=(__half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half compound assignment with subtraction operation.
+ * \see __hsub(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator-=(__half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half compound assignment with multiplication operation.
+ * \see __hmul(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator*=(__half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half compound assignment with division operation.
+ * \see __hdiv(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator/=(__half &lh, const __half &rh);
+/* Note for increment and decrement we use the raw value 0x3C00U equating to half(1.0F), to avoid the extra conversion */
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half prefix increment operation.
+ * \see __hadd(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator++(__half &h);
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half prefix decrement operation.
+ * \see __hsub(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator--(__half &h);
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half postfix increment operation.
+ * \see __hadd(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half  operator++(__half &h, const int ignored);
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half postfix decrement operation.
+ * \see __hsub(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half  operator--(__half &h, const int ignored);
+
+/* Unary plus and unary minus (negation) operators */
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Implements \p half unary plus operator, returns input value.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator+(const __half &h);
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Implements \p half unary minus operator.
+ * \see __hneg(__half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator-(const __half &h);
+/* Some basic comparison operations to make it look like a built-in */
+/**
+ * \ingroup CUDA_MATH__HALF_COMPARISON
+ * Performs \p half ordered compare equal operation.
+ * \see __heq(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator==(const __half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_COMPARISON
+ * Performs \p half unordered compare not-equal operation.
+ * \see __hneu(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator!=(const __half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_COMPARISON
+ * Performs \p half ordered greater-than compare operation.
+ * \see __hgt(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator> (const __half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_COMPARISON
+ * Performs \p half ordered less-than compare operation.
+ * \see __hlt(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator< (const __half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_COMPARISON
+ * Performs \p half ordered greater-or-equal compare operation.
+ * \see __hge(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>=(const __half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_COMPARISON
+ * Performs \p half ordered less-or-equal compare operation.
+ * \see __hle(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<=(const __half &lh, const __half &rh);
+#endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief __half2 data type
+ * \details This structure implements the datatype for storing two 
+ * half-precision floating-point numbers. 
+ * The structure implements assignment, arithmetic and comparison
+ * operators, and type conversions. 
+ * 
+ * - NOTE: __half2 is visible to non-nvcc host compilers
+ */
+struct __CUDA_ALIGN__(4) __half2 {
+    /**
+     * Storage field holding the lower \p __half part (first argument of the two-value constructor).
+     */
+    __half x;
+    /**
+     * Storage field holding the upper \p __half part (second argument of the two-value constructor).
+     */
+    __half y;
+
+    // Constructors, copy/move operations and assignment operators
+public:
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * \brief Constructor by default.
+     * \details Empty default constructor, result is uninitialized.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16)
+    __half2() = default;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Move constructor (\p C++11 and later); device path goes through __HALF2_TO_UI, host path copies x and y.
+     */
+    __CUDA_HOSTDEVICE__ __half2(const __half2 &&src) {
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src));
+,
+    this->x = src.x;
+    this->y = src.y;
+)
+}
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Move assignment operator, available for \p C++11 and later dialects (defined out of line)
+     */
+    __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &&src);
+#else
+    __CUDA_HOSTDEVICE__ __half2() { }
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Constructor from two \p __half variables: \p a initializes \p x, \p b initializes \p y
+     */
+    __CUDA_HOSTDEVICE__ __CUDA_FP16_CONSTEXPR__ __half2(const __half &a, const __half &b) : x(a), y(b) { }
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Copy constructor; device path goes through __HALF2_TO_UI, host path copies x and y
+     */
+    __CUDA_HOSTDEVICE__ __half2(const __half2 &src) {
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src);
+,
+    this->x = src.x;
+    this->y = src.y;
+)
+}    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Copy assignment operator (defined out of line)
+     */
+    __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &src);
+
+    /* Conversions to and from the plain-data __half2_raw representation */
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Constructor from \p __half2_raw; the host path rebuilds each half through a temporary \p __half_raw
+     */
+    __CUDA_HOSTDEVICE__ __half2(const __half2_raw &h2r ) {
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r);
+,
+    __half_raw tr;
+    tr.x = h2r.x;
+    this->x = static_cast<__half>(tr);
+    tr.x = h2r.y;
+    this->y = static_cast<__half>(tr);
+)
+}
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Assignment operator from \p __half2_raw (defined out of line)
+     */
+    __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2_raw &h2r);
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p __half2_raw (defined out of line)
+     */
+    __CUDA_HOSTDEVICE__ operator __half2_raw() const;
+};
+
+#if !defined(__CUDA_NO_HALF2_OPERATORS__)
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half addition operation.
+ * \see __hadd2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator+(const __half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half subtraction operation.
+ * \see __hsub2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator-(const __half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half multiplication operation.
+ * \see __hmul2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator*(const __half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half division operation.
+ * \see __h2div(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator/(const __half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half compound assignment with addition operation.
+ * \see __hadd2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator+=(__half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half compound assignment with subtraction operation.
+ * \see __hsub2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator-=(__half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half compound assignment with multiplication operation.
+ * \see __hmul2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator*=(__half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half compound assignment with division operation.
+ * \see __h2div(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator/=(__half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half prefix increment operation.
+ * \see __hadd2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 &operator++(__half2 &h);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half prefix decrement operation.
+ * \see __hsub2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 &operator--(__half2 &h);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half postfix increment operation.
+ * \see __hadd2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2  operator++(__half2 &h, const int ignored);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half postfix decrement operation.
+ * \see __hsub2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2  operator--(__half2 &h, const int ignored);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Implements packed \p half unary plus operator, returns input value.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator+(const __half2 &h);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Implements packed \p half unary minus operator.
+ * \see __hneg2(__half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator-(const __half2 &h);
+/**
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
+ * Performs packed \p half ordered compare equal operation.
+ * \see __hbeq2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator==(const __half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
+ * Performs packed \p half unordered compare not-equal operation.
+ * \see __hbneu2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator!=(const __half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
+ * Performs packed \p half ordered greater-than compare operation.
+ * \see __hbgt2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>(const __half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
+ * Performs packed \p half ordered less-than compare operation.
+ * \see __hblt2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<(const __half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
+ * Performs packed \p half ordered greater-or-equal compare operation.
+ * \see __hbge2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>=(const __half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
+ * Performs packed \p half ordered less-or-equal compare operation.
+ * \see __hble2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<=(const __half2 &lh, const __half2 &rh);
+
+#endif /* !defined(__CUDA_NO_HALF2_OPERATORS__) */
+#endif /* defined(__cplusplus) */
+
+#if (defined(__FORCE_INCLUDE_CUDA_FP16_HPP_FROM_FP16_H__) || \
+    !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))))
+
+/* Note the .hpp file is included to capture the "half" & "half2" built-in function definitions. For NVRTC, the built-in
+   function definitions are compiled at NVRTC library build-time and are available through the NVRTC built-ins library at
+   link time.
+*/
+#include "cuda_fp16.hpp"
+#endif /* (defined(__FORCE_INCLUDE_CUDA_FP16_HPP_FROM_FP16_H__) || \
+          !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) */
+
+/* Define first-class types "half" and "half2", unless user specifies otherwise via "#define CUDA_NO_HALF" */
+/* C cannot ever have these types defined here, because __half and __half2 are C++ classes */
+#if defined(__cplusplus) && !defined(CUDA_NO_HALF)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief This datatype is meant to be the first-class or fundamental
+ * implementation of the half-precision numbers format.
+ * 
+ * \details Should be implemented in the compiler in the future.
+ * Current implementation is a simple typedef to a respective
+ * user-level type with underscores.
+ */
+typedef __half half;
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief This datatype is meant to be the first-class or fundamental
+ * implementation of type for pairs of half-precision numbers.
+ * 
+ * \details Should be implemented in the compiler in the future.
+ * Current implementation is a simple typedef to a respective
+ * user-level type with underscores.
+ */
+typedef __half2 half2;
+// for consistency with __nv_bfloat16
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief This datatype is an \p __nv_ prefixed alias
+ */
+typedef __half      __nv_half;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief This datatype is an \p __nv_ prefixed alias
+ */
+typedef __half2     __nv_half2;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief This datatype is an \p __nv_ prefixed alias
+ */
+typedef __half_raw  __nv_half_raw;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief This datatype is an \p __nv_ prefixed alias
+ */
+typedef __half2_raw __nv_half2_raw;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief This datatype is an \p nv_ prefixed alias
+ */
+typedef __half        nv_half;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief This datatype is an \p nv_ prefixed alias
+ */
+typedef __half2       nv_half2;
+#endif /* defined(__cplusplus) && !defined(CUDA_NO_HALF) */
+
+#undef __CUDA_FP16_DECL__
+#undef __CUDA_HOSTDEVICE_FP16_DECL__
+#undef __CUDA_HOSTDEVICE__
+#undef __CUDA_FP16_INLINE__
+#undef __CUDA_FP16_FORCEINLINE__
+#undef ___CUDA_FP16_STRINGIFY_INNERMOST
+#undef __CUDA_FP16_STRINGIFY
+
+#endif /* end of include guard: __CUDA_FP16_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp16.hpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp16.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f0eb872db456a661133a1a90bfed36864ff47ebf
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp16.hpp
@@ -0,0 +1,3483 @@
+/*
+* Copyright 1993-2024 NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO LICENSEE:
+*
+* This source code and/or documentation ("Licensed Deliverables") are
+* subject to NVIDIA intellectual property rights under U.S. and
+* international Copyright laws.
+*
+* These Licensed Deliverables contained herein is PROPRIETARY and
+* CONFIDENTIAL to NVIDIA and is being provided under the terms and
+* conditions of a form of NVIDIA software license agreement by and
+* between NVIDIA and Licensee ("License Agreement") or electronically
+* accepted by Licensee.  Notwithstanding any terms or conditions to
+* the contrary in the License Agreement, reproduction or disclosure
+* of the Licensed Deliverables to any third party without the express
+* written consent of NVIDIA is prohibited.
+*
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+* OF THESE LICENSED DELIVERABLES.
+*
+* U.S. Government End Users.  These Licensed Deliverables are a
+* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+* 1995), consisting of "commercial computer software" and "commercial
+* computer software documentation" as such terms are used in 48
+* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+* only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+* U.S. Government End Users acquire the Licensed Deliverables with
+* only those rights set forth herein.
+*
+* Any use of the Licensed Deliverables in individual and commercial
+* software must include, in the user documentation and internal
+* comments to the code, the above Disclaimer and U.S. Government End
+* Users Notice.
+*/
+
+#if !defined(__CUDA_FP16_HPP__)
+#define __CUDA_FP16_HPP__
+
+#if !defined(__CUDA_FP16_H__)
+#error "Do not include this file directly. Instead, include cuda_fp16.h."
+#endif
+
+#if !defined(IF_DEVICE_OR_CUDACC)
+#if defined(__CUDACC__)
+    #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, c)
+#else
+    #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, f)
+#endif
+#endif
+
+/* Macros for half & half2 binary and ternary arithmetic (single inline-asm instruction each) */
+#define __BINARY_OP_HALF_MACRO(name) /* do */ {\
+   __half val; \
+   asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2;\n}" \
+        :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \
+   return val; \
+} /* while(0) */
+#define __BINARY_OP_HALF2_MACRO(name) /* do */ {\
+   __half2 val; \
+   asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2;\n}" \
+        :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
+   return val; \
+} /* while(0) */
+#define __TERNARY_OP_HALF_MACRO(name) /* do */ {\
+   __half val; \
+   asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2,%3;\n}" \
+        :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \
+   return val; \
+} /* while(0) */
+#define __TERNARY_OP_HALF2_MACRO(name) /* do */ {\
+   __half2 val; \
+   asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2,%3;\n}" \
+        :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b)),"r"(__HALF2_TO_CUI(c))); \
+   return val; \
+} /* while(0) */
+
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS
+ * \brief Defines floating-point positive infinity value for the \p half data type
+ */
+#define CUDART_INF_FP16            __ushort_as_half((unsigned short)0x7C00U)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS
+ * \brief Defines canonical NaN value for the \p half data type
+ */
+#define CUDART_NAN_FP16            __ushort_as_half((unsigned short)0x7FFFU)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS
+ * \brief Defines the smallest positive representable (denormalized) value for the \p half data type
+ */
+#define CUDART_MIN_DENORM_FP16     __ushort_as_half((unsigned short)0x0001U)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS
+ * \brief Defines a maximum representable value for the \p half data type
+ */
+#define CUDART_MAX_NORMAL_FP16     __ushort_as_half((unsigned short)0x7BFFU)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS
+ * \brief Defines a negative zero value for the \p half data type
+ */
+#define CUDART_NEG_ZERO_FP16       __ushort_as_half((unsigned short)0x8000U)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS
+ * \brief Defines a positive zero value for the \p half data type
+ */
+#define CUDART_ZERO_FP16           __ushort_as_half((unsigned short)0x0000U)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS
+ * \brief Defines a value of 1.0 for the \p half data type
+ */
+#define CUDART_ONE_FP16            __ushort_as_half((unsigned short)0x3C00U)
+
+#if !(defined __DOXYGEN_ONLY__)
+
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const __half_raw &hr) { __x = hr.x; return *this; } /* copies the raw storage field as-is, no numeric conversion */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ volatile __half &__half::operator=(const __half_raw &hr) volatile { __x = hr.x; return *this; } /* same copy, volatile destination */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ volatile __half &__half::operator=(const volatile __half_raw &hr) volatile { __x = hr.x; return *this; } /* same copy, volatile source and destination */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator __half_raw() const { __half_raw ret; ret.x = __x; return ret; } /* hands the storage field back inside a __half_raw */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator __half_raw() const volatile { __half_raw ret; ret.x = __x; return ret; } /* same, callable on volatile objects */
+#if !defined(__CUDA_NO_HALF_CONVERSIONS__)
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator float() const { return __half2float(*this); } /* conversion delegated to __half2float */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const float f) { __x = __float2half(f).__x; return *this; } /* converts with __float2half and keeps only the storage field of the temporary */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const double f) { __x = __double2half(f).__x; return *this; } /* same pattern, using __double2half */
+#if !(defined __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__)
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator signed char() const { return __half2char_rz(*this); } /* _rz helper: round-toward-zero per the CUDA Math API */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned char() const { return __half2uchar_rz(*this); } /* unsigned counterpart of the conversion above */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator char() const {
+    /* Plain char may be signed or unsigned; the constant test below trips MSVC C4127, hence the pragmas. */
+    char converted;
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    if (((char)0) <= ((char)-1))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    {   /* char is unsigned for this compiler: use the unsigned helper */
+        converted = static_cast<char>(__half2uchar_rz(*this));
+    }
+    else
+    {   /* char is signed for this compiler: use the signed helper */
+        converted = static_cast<char>(__half2char_rz(*this));
+    }
+    return converted;
+}
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator short() const { return __half2short_rz(*this); } /* each integer conversion forwards to the matching __half2*_rz helper */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned short() const { return __half2ushort_rz(*this); }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator int() const { return __half2int_rz(*this); }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned int() const { return __half2uint_rz(*this); }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator long() const {
+    /* long may or may not be as wide as long long; the constant size test trips MSVC C4127, hence the pragmas. */
+    long converted;
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    if (sizeof(long) != sizeof(long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    {   /* long differs in size from long long: convert through int */
+        converted = static_cast<long>(__half2int_rz(*this));
+    }
+    else
+    {   /* long has the size of long long: convert through long long */
+        converted = static_cast<long>(__half2ll_rz(*this));
+    }
+    return converted;
+}
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned long() const {
+    /* unsigned long may or may not match unsigned long long in size; the constant test trips MSVC C4127. */
+    unsigned long converted;
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    if (sizeof(unsigned long) != sizeof(unsigned long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    {   /* sizes differ: convert through unsigned int */
+        converted = static_cast<unsigned long>(__half2uint_rz(*this));
+    }
+    else
+    {   /* same size as unsigned long long: convert through it */
+        converted = static_cast<unsigned long>(__half2ull_rz(*this));
+    }
+    return converted;
+}
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator long long() const { return __half2ll_rz(*this); } /* forwards to the _rz conversion helper */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned long long() const { return __half2ull_rz(*this); }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const short val) { __x = __short2half_rn(val).__x; return *this; } /* integer assignments use the _rn helpers and keep only the storage field */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const unsigned short val) { __x = __ushort2half_rn(val).__x; return *this; }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const int val) { __x = __int2half_rn(val).__x; return *this; }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const unsigned int val) { __x = __uint2half_rn(val).__x; return *this; }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const long long val) { __x = __ll2half_rn(val).__x; return *this; }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const unsigned long long val) { __x = __ull2half_rn(val).__x; return *this; }
+
+#endif /* #if !(defined __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) */
+#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */
+#if !defined(__CUDA_NO_HALF_OPERATORS__)
+/* Basic arithmetic operators expected of a built-in type; each one forwards to the matching __h* intrinsic */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator-(const __half &lh, const __half &rh) { return __hsub(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator+=(__half &lh, const __half &rh) { lh = __hadd(lh, rh); return lh; } /* compound forms store the result back into lh */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator-=(__half &lh, const __half &rh) { lh = __hsub(lh, rh); return lh; }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator*=(__half &lh, const __half &rh) { lh = __hmul(lh, rh); return lh; }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator/=(__half &lh, const __half &rh) { lh = __hdiv(lh, rh); return lh; }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator++(__half &h)      { __half_raw one; one.x = 0x3C00U; h += one; return h; } /* 0x3C00 is the half encoding of 1.0 (see CUDART_ONE_FP16) */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator--(__half &h)      { __half_raw one; one.x = 0x3C00U; h -= one; return h; } /* same constant, subtracted */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half  operator++(__half &h, const int ignored)
+{
+    // The int parameter only selects the postfix overload; the cast silences unused-parameter warnings.
+    static_cast<void>(ignored);
+
+    __half_raw unit;
+    unit.x = 0x3C00U; // half-precision encoding of 1.0
+    const __half previous = h;
+    h += unit;
+    return previous;
+}
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half  operator--(__half &h, const int ignored)
+{
+    // The int parameter only selects the postfix overload; the cast silences unused-parameter warnings.
+    static_cast<void>(ignored);
+
+    __half_raw unit;
+    unit.x = 0x3C00U; // half-precision encoding of 1.0
+    const __half previous = h;
+    h -= unit;
+    return previous;
+}
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator+(const __half &h) { return h; } /* unary plus: returns the operand unchanged */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator-(const __half &h) { return __hneg(h); }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); } /* ordered compare: per the CUDA Math API, NaN operands give false */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator!=(const __half &lh, const __half &rh) { return __hneu(lh, rh); } /* unordered compare: per the CUDA Math API, NaN operands give true */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator> (const __half &lh, const __half &rh) { return __hgt(lh, rh); } /* remaining relational operators use the ordered intrinsics */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator< (const __half &lh, const __half &rh) { return __hlt(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); }
+#endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16)
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half2 &__half2::operator=(const __half2 &&src) { /* move assignment; src is const, so this copies */
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); /* device branch: presumably one packed copy of both halves - macros defined outside this chunk */
+,
+    this->x = src.x; /* host branch: member-wise copy */
+    this->y = src.y;
+)
+    return *this;
+}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half2 &__half2::operator=(const __half2 &src) { /* copy assignment */
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); /* device branch: presumably one packed copy of both halves - macros defined outside this chunk */
+,
+    this->x = src.x; /* host branch: member-wise copy */
+    this->y = src.y;
+)
+    return *this;
+}
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half2 &__half2::operator=(const __half2_raw &h2r) { /* assignment from the raw pair representation */
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); /* device branch: presumably one packed copy - macros defined outside this chunk */
+,
+    __half_raw tr; /* host branch: rebuild each half through a temporary __half_raw */
+    tr.x = h2r.x;
+    this->x = static_cast<__half>(tr);
+    tr.x = h2r.y; /* the same temporary is reused for the upper half */
+    this->y = static_cast<__half>(tr);
+)
+    return *this;
+}
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half2::operator __half2_raw() const { /* conversion to the raw pair representation */
+    __half2_raw ret;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    ret.x = 0U; /* NOTE(review): fields are zeroed before the packed write, presumably to avoid uninitialized-use diagnostics - confirm */
+    ret.y = 0U;
+    __HALF2_TO_UI(ret) = __HALF2_TO_CUI(*this); /* device branch: copy through the packed view */
+,
+    ret.x = static_cast<__half_raw>(this->x).x; /* host branch: extract each storage field via __half_raw */
+    ret.y = static_cast<__half_raw>(this->y).x;
+)
+    return ret;
+}
+#if !defined(__CUDA_NO_HALF2_OPERATORS__)
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator+(const __half2 &lh, const __half2 &rh) { return __hadd2(lh, rh); } /* packed operators forward to the matching *2 intrinsic */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator-(const __half2 &lh, const __half2 &rh) { return __hsub2(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator*(const __half2 &lh, const __half2 &rh) { return __hmul2(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator/(const __half2 &lh, const __half2 &rh) { return __h2div(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator+=(__half2 &lh, const __half2 &rh) { lh = __hadd2(lh, rh); return lh; } /* compound forms store the result back into lh */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator-=(__half2 &lh, const __half2 &rh) { lh = __hsub2(lh, rh); return lh; }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator*=(__half2 &lh, const __half2 &rh) { lh = __hmul2(lh, rh); return lh; }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator/=(__half2 &lh, const __half2 &rh) { lh = __h2div(lh, rh); return lh; }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 &operator++(__half2 &h)      { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hadd2(h, one); return h; } /* both halves set to 0x3C00, the encoding of 1.0 */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 &operator--(__half2 &h)      { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hsub2(h, one); return h; } /* same constant pair, subtracted */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2  operator++(__half2 &h, const int ignored)
+{
+    // The int parameter only selects the postfix overload; the cast silences unused-parameter warnings.
+    static_cast<void>(ignored);
+
+    __half2_raw ones;
+    ones.x = 0x3C00U; // half-precision encoding of 1.0, lower half
+    ones.y = 0x3C00U; // and upper half
+    const __half2 before = h;
+    h = __hadd2(h, ones);
+    return before;
+}
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2  operator--(__half2 &h, const int ignored)
+{
+    // The int parameter only selects the postfix overload; the cast silences unused-parameter warnings.
+    static_cast<void>(ignored);
+
+    __half2_raw ones;
+    ones.x = 0x3C00U; // half-precision encoding of 1.0, lower half
+    ones.y = 0x3C00U; // and upper half
+    const __half2 before = h;
+    h = __hsub2(h, ones);
+    return before;
+}
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator+(const __half2 &h) { return h; } /* unary plus: returns the operand unchanged */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator-(const __half2 &h) { return __hneg2(h); }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator==(const __half2 &lh, const __half2 &rh) { return __hbeq2(lh, rh); } /* per the CUDA Math API, __hb*2 helpers return true only if the test holds for both halves */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator!=(const __half2 &lh, const __half2 &rh) { return __hbneu2(lh, rh); } /* NOTE(review): by the same rule this is not the logical negation of operator== - confirm against callers */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>(const __half2 &lh, const __half2 &rh) { return __hbgt2(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<(const __half2 &lh, const __half2 &rh) { return __hblt2(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>=(const __half2 &lh, const __half2 &rh) { return __hbge2(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<=(const __half2 &lh, const __half2 &rh) { return __hble2(lh, rh); }
+#endif /* !defined(__CUDA_NO_HALF2_OPERATORS__) */
+
+/* Restore warning for multiple assignment operators */
+#if defined(_MSC_VER) && _MSC_VER >= 1500
+#pragma warning( pop )
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
+
+/* Restore -Weffc++ warnings from here on */
+#if defined(__GNUC__)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic pop
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
+#endif /* defined(__GNUC__) */
+
+#undef __CUDA_HOSTDEVICE__
+#undef __CUDA_ALIGN__
+
+#ifndef __CUDACC_RTC__  /* no host functions in NVRTC mode */
+static inline unsigned short __internal_float2half(const float f, unsigned int &sign, unsigned int &remainder)
+{ // Returns truncated half bits for f; writes the half sign bit to `sign` and the discarded low bits to `remainder` (presumably used by callers to round)
+    unsigned int x; // raw bit pattern of f
+    unsigned int u; // bit pattern of |f| (sign bit cleared)
+    unsigned int result;
+#if defined(__CUDACC__)
+    (void)memcpy(&x, &f, sizeof(f));
+#else
+    (void)std::memcpy(&x, &f, sizeof(f));
+#endif
+    u = (x & 0x7fffffffU);
+    sign = ((x >> 16U) & 0x8000U); // float sign bit (31) moved to the half sign position (15)
+    // NaN/+Inf/-Inf: infinities keep their sign (0x7c00), every NaN collapses to 0x7fff
+    if (u >= 0x7f800000U) {
+        remainder = 0U;
+        result = ((u == 0x7f800000U) ? (sign | 0x7c00U) : 0x7fffU);
+    } else if (u > 0x477fefffU) { // Overflows: return the largest finite half (0x7bff) and flag the remainder's top bit
+        remainder = 0x80000000U;
+        result = (sign | 0x7bffU);
+    } else if (u >= 0x38800000U) { // Normal numbers (|f| >= 2^-14)
+        remainder = u << 19U; // the 13 mantissa bits that do not fit, left-aligned
+        u -= 0x38000000U; // rebias the exponent: 127 - 15 = 112 = 0x70, placed at bit 23
+        result = (sign | (u >> 13U));
+    } else if (u < 0x33000001U) { // +0/-0, and any |f| <= 2^-25, which truncates to a signed zero
+        remainder = u;
+        result = sign;
+    } else { // Denormal numbers (2^-25 < |f| < 2^-14)
+        const unsigned int exponent = u >> 23U;
+        const unsigned int shift = 0x7eU - exponent; // right shift that aligns the 24-bit significand to the half denormal scale
+        unsigned int mantissa = (u & 0x7fffffU);
+        mantissa |= 0x800000U; // restore the implicit leading 1
+        remainder = mantissa << (32U - shift); // bits shifted out below, left-aligned
+        result = (sign | (mantissa >> shift));
+        result &= 0x0000FFFFU; // keep only the low 16 bits
+    }
+    return static_cast<unsigned short>(result);
+}
+#endif  /* #if !defined(__CUDACC_RTC__) */
+
+/*
+ * Converts the double `a` to __half.
+ * The body is split by IF_DEVICE_OR_CUDACC into three alternatives:
+ *   1. a single PTX cvt.rn.f16.f64 instruction;
+ *   2. a software path using unqualified memcpy;
+ *   3. the same software path using std::memcpy and block comments.
+ * NOTE(review): which alternative is selected for which compiler/target is
+ * decided by the IF_DEVICE_OR_CUDACC macro, defined outside this section.
+ * The software path pre-rounds `a` to 11 bits of precision in double
+ * arithmetic and then goes through __float2half, so the float-to-half step
+ * does not introduce a second rounding.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a)
+{
+IF_DEVICE_OR_CUDACC(
+    __half val;
+    asm("{  cvt.rn.f16.f64 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "d"(a));
+    return val;
+,
+    __half result;
+    // Perform rounding to 11 bits of precision, convert value
+    // to float and call existing float to half conversion.
+    // By pre-rounding to 11 bits we avoid additional rounding
+    // in float to half conversion.
+    unsigned long long int absa;
+    unsigned long long int ua;
+    (void)memcpy(&ua, &a, sizeof(a));
+    absa = (ua & 0x7fffffffffffffffULL);
+    if ((absa >= 0x40f0000000000000ULL) || (absa <= 0x3e60000000000000ULL))
+    {
+        // |a| >= 2^16 or NaN or |a| <= 2^(-25)
+        // double-rounding is not a problem
+        result = __float2half(static_cast<float>(a));
+    }
+    else
+    {
+        // here 2^(-25) < |a| < 2^16
+        // prepare shifter value such that a + shifter
+        // done in double precision performs round-to-nearest-even
+        // and (a + shifter) - shifter results in a rounded to
+        // 11 bits of precision. Shifter needs to have exponent of
+        // a plus 53 - 11 = 42 and a leading bit in mantissa to guard
+        // against negative values.
+        // So need to have |a| capped to avoid overflow in exponent.
+        // For inputs that are smaller than half precision minnorm
+        // we prepare fixed shifter exponent.
+        unsigned long long shifterBits;
+        if (absa >= 0x3f10000000000000ULL)
+        {   // Here if |a| >= 2^(-14)
+            // add 42 to exponent bits
+            shifterBits  = (ua & 0x7ff0000000000000ULL) + 0x02A0000000000000ULL;
+        }
+        else
+        {   // 2^(-25) < |a| < 2^(-14), potentially results in denormal
+            // set exponent bits to 42 - 14 + bias
+            shifterBits = 0x41B0000000000000ULL;
+        }
+        // set leading mantissa bit to protect against negative inputs
+        shifterBits |= 0x0008000000000000ULL;
+        double shifter;
+        (void)memcpy(&shifter, &shifterBits, sizeof(shifterBits));
+        double aShiftRound = a + shifter;
+
+        // Prevent the compiler from optimizing away a + shifter - shifter
+        // by doing an intermediate memcpy and a harmless bitwise operation
+        unsigned long long int aShiftRoundBits;
+        (void)memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound));
+
+        // the value is positive, so this operation doesn't change anything
+        aShiftRoundBits &= 0x7fffffffffffffffULL;
+
+        (void)memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound));
+
+        result = __float2half(static_cast<float>(aShiftRound - shifter));
+    }
+
+    return result;
+,
+    __half result;
+    /*
+    // Perform rounding to 11 bits of precision, convert value
+    // to float and call existing float to half conversion.
+    // By pre-rounding to 11 bits we avoid additional rounding
+    // in float to half conversion.
+    */
+    unsigned long long int absa;
+    unsigned long long int ua;
+    (void)std::memcpy(&ua, &a, sizeof(a));
+    absa = (ua & 0x7fffffffffffffffULL);
+    if ((absa >= 0x40f0000000000000ULL) || (absa <= 0x3e60000000000000ULL))
+    {
+        /*
+        // |a| >= 2^16 or NaN or |a| <= 2^(-25)
+        // double-rounding is not a problem
+        */
+        result = __float2half(static_cast<float>(a));
+    }
+    else
+    {
+        /*
+        // here 2^(-25) < |a| < 2^16
+        // prepare shifter value such that a + shifter
+        // done in double precision performs round-to-nearest-even
+        // and (a + shifter) - shifter results in a rounded to
+        // 11 bits of precision. Shifter needs to have exponent of
+        // a plus 53 - 11 = 42 and a leading bit in mantissa to guard
+        // against negative values.
+        // So need to have |a| capped to avoid overflow in exponent.
+        // For inputs that are smaller than half precision minnorm
+        // we prepare fixed shifter exponent.
+        */
+        unsigned long long shifterBits;
+        if (absa >= 0x3f10000000000000ULL)
+        {
+            /*
+            // Here if |a| >= 2^(-14)
+            // add 42 to exponent bits
+            */
+            shifterBits  = (ua & 0x7ff0000000000000ULL) + 0x02A0000000000000ULL;
+        }
+        else
+        {
+            /*
+            // 2^(-25) < |a| < 2^(-14), potentially results in denormal
+            // set exponent bits to 42 - 14 + bias
+            */
+            shifterBits = 0x41B0000000000000ULL;
+        }
+        // set leading mantissa bit to protect against negative inputs
+        shifterBits |= 0x0008000000000000ULL;
+        double shifter;
+        (void)std::memcpy(&shifter, &shifterBits, sizeof(shifterBits));
+        double aShiftRound = a + shifter;
+
+        /*
+        // Prevent the compiler from optimizing away a + shifter - shifter
+        // by doing an intermediate memcpy and a harmless bitwise operation
+        */
+        unsigned long long int aShiftRoundBits;
+        (void)std::memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound));
+
+        // the value is positive, so this operation doesn't change anything
+        aShiftRoundBits &= 0x7fffffffffffffffULL;
+
+        (void)std::memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound));
+
+        result = __float2half(static_cast<float>(aShiftRound - shifter));
+    }
+
+    return result;
+)
+}
+/*
+ * Converts float `a` to __half with round-to-nearest-even.
+ * Device: PTX cvt.rn.f16.f32.
+ * Host: truncating conversion via __internal_float2half, then the result is
+ * incremented when the discarded part is above one half, or exactly one half
+ * with an odd low bit.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a)
+{
+    __half val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+,
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    // Round up on "more than half", or on an exact tie when the result is odd.
+    if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
+        r.x++;
+    }
+    val = r;
+)
+    return val;
+}
+/*
+ * Converts float `a` to __half with round-to-nearest-even.
+ * The body is the same as __float2half: PTX cvt.rn.f16.f32 on device, and on
+ * host a truncating conversion followed by a nearest-even adjustment.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a)
+{
+    __half val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+,
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    // Round up on "more than half", or on an exact tie when the result is odd.
+    if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
+        r.x++;
+    }
+    val = r;
+)
+    return val;
+}
+/*
+ * Converts float `a` to __half with round-toward-zero.
+ * Device: PTX cvt.rz.f16.f32.
+ * Host: uses the truncated result of __internal_float2half unchanged; the
+ * sign and remainder outputs are ignored.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a)
+{
+    __half val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  cvt.rz.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+,
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    val = r;
+)
+    return val;
+}
+/*
+ * Converts float `a` to __half with round-down (toward negative infinity).
+ * Device: PTX cvt.rm.f16.f32.
+ * Host: truncating conversion, then the magnitude is incremented only for
+ * negative inputs that lost non-zero bits.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a)
+{
+    __half val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  cvt.rm.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+,
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    // Inexact and negative: step one representable value away from zero.
+    if ((remainder != 0U) && (sign != 0U)) {
+        r.x++;
+    }
+    val = r;
+)
+    return val;
+}
+/*
+ * Converts float `a` to __half with round-up (toward positive infinity).
+ * Device: PTX cvt.rp.f16.f32.
+ * Host: truncating conversion, then the magnitude is incremented only for
+ * non-negative inputs that lost non-zero bits.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a)
+{
+    __half val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  cvt.rp.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+,
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    // Inexact and sign bit clear: step one representable value away from zero.
+    if ((remainder != 0U) && (sign == 0U)) {
+        r.x++;
+    }
+    val = r;
+)
+    return val;
+}
+/*
+ * Converts float `a` to half (round-to-nearest-even) and returns a __half2
+ * holding that value in both halves.
+ * Device: one cvt.rn.f16.f32 followed by a packing mov of {low,low}.
+ * Host: builds the pair from two __float2half_rn(a) calls.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a)
+{
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 low;\n"
+        "  cvt.rn.f16.f32 low, %1;\n"
+        "  mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a));
+,
+    val = __half2(__float2half_rn(a), __float2half_rn(a));
+)
+    return val;
+}
+
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/*
+ * Device helper: converts floats `a` and `b` to a packed __half2 with
+ * round-to-nearest-even.
+ * When the target provides SM_80, one cvt.rn.f16x2.f32 is issued; note that
+ * its source operands are passed as %2, %1 (b first, then a).
+ * Otherwise each float is converted separately and packed as
+ * {low = a, high = b}.
+ */
+__CUDA_FP16_DECL__ __half2 __internal_device_float2_to_half2_rn(const float a, const float b) {
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    asm("{ cvt.rn.f16x2.f32 %0, %2, %1; }\n"
+        : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
+,
+    asm("{.reg .f16 low,high;\n"
+        "  cvt.rn.f16.f32 low, %1;\n"
+        "  cvt.rn.f16.f32 high, %2;\n"
+        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
+)
+    return val;
+}
+
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+/*
+ * Converts floats `a` and `b` to a __half2 with round-to-nearest-even.
+ * Device: delegates to __internal_device_float2_to_half2_rn.
+ * Host: constructs the pair from __float2half_rn(a) and __float2half_rn(b),
+ * in that argument order.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b)
+{
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    val = __internal_device_float2_to_half2_rn(a,b);
+,
+    val = __half2(__float2half_rn(a), __float2half_rn(b));
+)
+    return val;
+}
+
+#ifndef __CUDACC_RTC__  /* no host functions in NVRTC mode */
+/*
+ * Host-side helper: expands the binary16 bit pattern `h` into a float.
+ * The sign, exponent and mantissa fields are extracted, adjusted for the
+ * float layout, reassembled into a 32-bit word and copied into a float.
+ * Any NaN input yields the pattern 0x7fffffff (sign dropped); infinities,
+ * zeros, denormals and normal values keep their sign.
+ */
+static inline float __internal_half2float(const unsigned short h)
+{
+    unsigned int sign = ((static_cast<unsigned int>(h) >> 15U) & 1U);
+    unsigned int exponent = ((static_cast<unsigned int>(h) >> 10U) & 0x1fU);
+    // 10-bit half mantissa moved to the top of the 23-bit float mantissa.
+    unsigned int mantissa = ((static_cast<unsigned int>(h) & 0x3ffU) << 13U);
+    float f;
+    if (exponent == 0x1fU) { /* NaN or Inf */
+        /* discard sign of a NaN */
+        sign = ((mantissa != 0U) ? (sign >> 1U) : sign);
+        mantissa = ((mantissa != 0U) ? 0x7fffffU : 0U);
+        exponent = 0xffU;
+    } else if (exponent == 0U) { /* Denorm or Zero */
+        if (mantissa != 0U) {
+            unsigned int msb;
+            exponent = 0x71U;
+            // Shift left until the leading one has moved out of bit 22,
+            // lowering the exponent once per shift.
+            do {
+                msb = (mantissa & 0x400000U);
+                mantissa <<= 1U; /* normalize */
+                --exponent;
+            } while (msb == 0U);
+            mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
+        }
+    } else {
+        // Normal value: move from the half exponent bias to the float bias.
+        exponent += 0x70U;
+    }
+    const unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa);
+#if defined(__CUDACC__)
+    (void)memcpy(&f, &u, sizeof(u));
+#else
+    (void)std::memcpy(&f, &u, sizeof(u));
+#endif
+    return f;
+}
+#endif  /* !defined(__CUDACC_RTC__) */
+
+/*
+ * Converts __half `a` to float.
+ * Device: PTX cvt.f32.f16. Host: __internal_half2float on the raw bits.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a)
+{
+    float val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(a)));
+,
+    val = __internal_half2float(static_cast<__half_raw>(a).x);
+)
+    return val;
+}
+/*
+ * Converts the low half of `a` to float.
+ * Device: unpacks the 32-bit pair and converts the `low` register.
+ * Host: __internal_half2float on the raw `.x` member.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a)
+{
+    float val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high},%1;\n"
+        "  cvt.f32.f16 %0, low;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a)));
+,
+    val = __internal_half2float(static_cast<__half2_raw>(a).x);
+)
+    return val;
+}
+/*
+ * Converts the high half of `a` to float.
+ * Device: unpacks the 32-bit pair and converts the `high` register.
+ * Host: __internal_half2float on the raw `.y` member.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a)
+{
+    float val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high},%1;\n"
+        "  cvt.f32.f16 %0, high;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a)));
+,
+    val = __internal_half2float(static_cast<__half2_raw>(a).y);
+)
+    return val;
+}
+
+/*
+ * Converts __half `h` to signed char, rounding toward zero.
+ * Device: PTX cvt.rzi.s8.f16 into a 32-bit temporary, narrowed through
+ * unsigned char.
+ * Host: goes through float; NaN gives 0, values beyond the signed char range
+ * saturate to its max/min, everything else is truncated by static_cast.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ signed char __half2char_rz(const __half h)
+{
+    signed char i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    unsigned int tmp;
+    asm("cvt.rzi.s8.f16 %0, %1;" : "=r"(tmp) : "h"(__HALF_TO_CUS(h)));
+    const unsigned char u = static_cast<unsigned char>(tmp);
+    i = static_cast<signed char>(u);
+,
+    const float f = __half2float(h);
+    const signed char max_val = (signed char)0x7fU;
+    const signed char min_val = (signed char)0x80U;
+    // Raw bits shifted left by one: the sign bit is dropped, so anything above
+    // 0xF800 (infinity shifted) is a NaN.
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<signed char>(f);
+    }
+)
+    return i;
+}
+
+/*
+ * Converts __half `h` to unsigned char, rounding toward zero.
+ * Device: PTX cvt.rzi.u8.f16 into a 32-bit temporary, then narrowed.
+ * Host: goes through float; NaN gives 0, values above 0xff saturate to 0xff,
+ * negative values saturate to 0, everything else is truncated by static_cast.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned char __half2uchar_rz(const __half h)
+{
+    unsigned char i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    unsigned int tmp;
+    asm("cvt.rzi.u8.f16 %0, %1;" : "=r"(tmp) : "h"(__HALF_TO_CUS(h)));
+    i = static_cast<unsigned char>(tmp);
+,
+    const float f = __half2float(h);
+    const unsigned char max_val = 0xffU;
+    const unsigned char min_val = 0U;
+    // Raw bits shifted left by one: the sign bit is dropped, so anything above
+    // 0xF800 (infinity shifted) is a NaN.
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0U;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<unsigned char>(f);
+    }
+)
+    return i;
+}
+
+/*
+ * Converts __half `h` to short int, rounding toward zero.
+ * Device: PTX cvt.rzi.s16.f16.
+ * Host: goes through float; NaN gives 0, out-of-range values saturate to the
+ * short int max/min, everything else is truncated by static_cast.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h)
+{
+    short int i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rzi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+,
+    const float f = __half2float(h);
+    const short int max_val = (short int)0x7fffU;
+    const short int min_val = (short int)0x8000U;
+    // Raw bits shifted left by one: the sign bit is dropped, so anything above
+    // 0xF800 (infinity shifted) is a NaN.
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<short int>(f);
+    }
+)
+    return i;
+}
+/*
+ * Converts __half `h` to unsigned short int, rounding toward zero.
+ * Device: PTX cvt.rzi.u16.f16.
+ * Host: goes through float; NaN gives 0, values above 0xffff saturate to
+ * 0xffff, negative values saturate to 0, everything else is truncated.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h)
+{
+    unsigned short int i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rzi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+,
+    const float f = __half2float(h);
+    const unsigned short int max_val = 0xffffU;
+    const unsigned short int min_val = 0U;
+    // Raw bits shifted left by one: the sign bit is dropped, so anything above
+    // 0xF800 (infinity shifted) is a NaN.
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0U;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<unsigned short int>(f);
+    }
+)
+    return i;
+}
+/*
+ * Converts __half `h` to int, rounding toward zero.
+ * Device: PTX cvt.rzi.s32.f16.
+ * Host: goes through float; NaN gives 0, out-of-range values saturate to the
+ * int max/min, everything else is truncated by static_cast.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h)
+{
+    int i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rzi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+,
+    const float f = __half2float(h);
+    const int max_val = (int)0x7fffffffU;
+    const int min_val = (int)0x80000000U;
+    // Raw bits shifted left by one: the sign bit is dropped, so anything above
+    // 0xF800 (infinity shifted) is a NaN.
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<int>(f);
+    }
+)
+    return i;
+}
+/*
+ * Converts __half `h` to unsigned int, rounding toward zero.
+ * Device: PTX cvt.rzi.u32.f16.
+ * Host: goes through float; NaN gives 0, values above the unsigned int
+ * maximum saturate to it, negative values saturate to 0, everything else is
+ * truncated by static_cast.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h)
+{
+    unsigned int i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rzi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+,
+    const float f = __half2float(h);
+    const unsigned int max_val = 0xffffffffU;
+    const unsigned int min_val = 0U;
+    // Raw bits shifted left by one: the sign bit is dropped, so anything above
+    // 0xF800 (infinity shifted) is a NaN.
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0U;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<unsigned int>(f);
+    }
+)
+    return i;
+}
+/*
+ * Converts __half `h` to long long int, rounding toward zero.
+ * Device: PTX cvt.rzi.s64.f16.
+ * Host: goes through float; unlike the narrower variants, NaN yields min_val
+ * (0x8000000000000000) rather than 0. Out-of-range values saturate to
+ * max/min, everything else is truncated by static_cast.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h)
+{
+    long long int i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rzi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+,
+    const float f = __half2float(h);
+    const long long int max_val = (long long int)0x7fffffffffffffffULL;
+    const long long int min_val = (long long int)0x8000000000000000ULL;
+    // Raw bits shifted left by one: the sign bit is dropped, so anything above
+    // 0xF800 (infinity shifted) is a NaN.
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = min_val;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<long long int>(f);
+    }
+)
+    return i;
+}
+/*
+ * Converts __half `h` to unsigned long long int, rounding toward zero.
+ * Device: PTX cvt.rzi.u64.f16.
+ * Host: goes through float; NaN yields 0x8000000000000000 (not 0), values
+ * above the maximum saturate to it, negative values saturate to 0, everything
+ * else is truncated by static_cast.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h)
+{
+    unsigned long long int i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rzi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+,
+    const float f = __half2float(h);
+    const unsigned long long int max_val = 0xffffffffffffffffULL;
+    const unsigned long long int min_val = 0ULL;
+    // Raw bits shifted left by one: the sign bit is dropped, so anything above
+    // 0xF800 (infinity shifted) is a NaN.
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0x8000000000000000ULL;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<unsigned long long int>(f);
+    }
+)
+    return i;
+}
+/*
+ * Vector creation helper in the style of the CUDA vector-type make_* functions.
+ * Packs `x` into the .x member and `y` into the .y member of the result.
+ * Note: the return type is __half2, not half2.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 make_half2(const __half x, const __half y)
+{
+    __half2 packed;
+    packed.x = x;
+    packed.y = y;
+    return packed;
+}
+
+
+/* Definitions of intrinsics */
+/*
+ * Converts both components of the float2 `a` to half precision and returns
+ * them as a __half2; simply forwards a.x and a.y to __floats2half2_rn.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a)
+{
+    return __floats2half2_rn(a.x, a.y);
+}
+/*
+ * Converts both halves of `a` to float and returns them through
+ * make_float2(low, high).
+ * Device: two asm blocks, each unpacking the 32-bit pair and converting one
+ * of the `low` / `high` registers with cvt.f32.f16.
+ * Host: __internal_half2float on the raw .x and .y members, accessed with
+ * static_cast<__half2_raw> exactly as __low2float / __high2float do.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a)
+{
+    float hi_float;
+    float lo_float;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high},%1;\n"
+        "  cvt.f32.f16 %0, low;}\n" : "=f"(lo_float) : "r"(__HALF2_TO_CUI(a)));
+
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high},%1;\n"
+        "  cvt.f32.f16 %0, high;}\n" : "=f"(hi_float) : "r"(__HALF2_TO_CUI(a)));
+,
+    lo_float = __internal_half2float(static_cast<__half2_raw>(a).x);
+    hi_float = __internal_half2float(static_cast<__half2_raw>(a).y);
+)
+    return make_float2(lo_float, hi_float);
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/* Device only: converts __half `h` to int via PTX cvt.rni.s32.f16 (round to nearest integer). */
+__CUDA_FP16_DECL__ int __half2int_rn(const __half h)
+{
+    int i;
+    asm("cvt.rni.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+/* Device only: converts __half `h` to int via PTX cvt.rmi.s32.f16 (round toward negative infinity). */
+__CUDA_FP16_DECL__ int __half2int_rd(const __half h)
+{
+    int i;
+    asm("cvt.rmi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+/* Device only: converts __half `h` to int via PTX cvt.rpi.s32.f16 (round toward positive infinity). */
+__CUDA_FP16_DECL__ int __half2int_ru(const __half h)
+{
+    int i;
+    asm("cvt.rpi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/*
+ * Converts int `i` to __half with round-to-nearest-even.
+ * Device: PTX cvt.rn.f16.s32.
+ * Host: converts to float first, then calls __float2half_rn.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rn.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+,
+    // double-rounding is not a problem here: if integer
+    // has more than 24 bits, it is already too large to
+    // be represented in half precision, and result will
+    // be infinity.
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rn(f);
+)
+    return h;
+}
+/*
+ * Converts int `i` to __half with round-toward-zero.
+ * Device: PTX cvt.rz.f16.s32.
+ * Host: converts to float first, then calls __float2half_rz.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rz(const int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rz.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+,
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rz(f);
+)
+    return h;
+}
+/*
+ * Converts int `i` to __half with round-down (toward negative infinity).
+ * Device: PTX cvt.rm.f16.s32.
+ * Host: converts to float first, then calls __float2half_rd.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rd(const int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rm.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+,
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rd(f);
+)
+    return h;
+}
+/*
+ * Converts int `i` to __half with round-up (toward positive infinity).
+ * Device: PTX cvt.rp.f16.s32.
+ * Host: converts to float first, then calls __float2half_ru.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_ru(const int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rp.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+,
+    const float  f = static_cast<float>(i);
+                 h = __float2half_ru(f);
+)
+    return h;
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/* Device only: converts __half `h` to short int via PTX cvt.rni.s16.f16 (round to nearest integer). */
+__CUDA_FP16_DECL__ short int __half2short_rn(const __half h)
+{
+    short int i;
+    asm("cvt.rni.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+/* Device only: converts __half `h` to short int via PTX cvt.rmi.s16.f16 (round toward negative infinity). */
+__CUDA_FP16_DECL__ short int __half2short_rd(const __half h)
+{
+    short int i;
+    asm("cvt.rmi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+/* Device only: converts __half `h` to short int via PTX cvt.rpi.s16.f16 (round toward positive infinity). */
+__CUDA_FP16_DECL__ short int __half2short_ru(const __half h)
+{
+    short int i;
+    asm("cvt.rpi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/*
+ * Converts short int `i` to __half with round-to-nearest-even.
+ * Device: PTX cvt.rn.f16.s16.
+ * Host: converts to float first, then calls __float2half_rn.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rn.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+,
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rn(f);
+)
+    return h;
+}
+/*
+ * Converts short int `i` to __half with round-toward-zero.
+ * Device: PTX cvt.rz.f16.s16.
+ * Host: converts to float first, then calls __float2half_rz.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rz(const short int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rz.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+,
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rz(f);
+)
+    return h;
+}
+/*
+ * Converts short int `i` to __half with round-down (toward negative infinity).
+ * Device: PTX cvt.rm.f16.s16.
+ * Host: converts to float first, then calls __float2half_rd.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rd(const short int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rm.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+,
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rd(f);
+)
+    return h;
+}
+/*
+ * Converts short int `i` to __half with round-up (toward positive infinity).
+ * Device: PTX cvt.rp.f16.s16.
+ * Host: converts to float first, then calls __float2half_ru.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_ru(const short int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rp.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+,
+    const float  f = static_cast<float>(i);
+                 h = __float2half_ru(f);
+)
+    return h;
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/* Device only: converts __half `h` to unsigned int via PTX cvt.rni.u32.f16 (round to nearest integer). */
+__CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h)
+{
+    unsigned int i;
+    asm("cvt.rni.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+/* Device only: converts __half `h` to unsigned int via PTX cvt.rmi.u32.f16 (round toward negative infinity). */
+__CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h)
+{
+    unsigned int i;
+    asm("cvt.rmi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+/* Device only: converts __half `h` to unsigned int via PTX cvt.rpi.u32.f16 (round toward positive infinity). */
+__CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h)
+{
+    unsigned int i;
+    asm("cvt.rpi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/*
+ * Converts unsigned int `i` to __half with round-to-nearest-even.
+ * Device: PTX cvt.rn.f16.u32.
+ * Host: converts to float first, then calls __float2half_rn.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rn.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+,
+    // double-rounding is not a problem here: if integer
+    // has more than 24 bits, it is already too large to
+    // be represented in half precision, and result will
+    // be infinity.
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rn(f);
+)
+    return h;
+}
+/*
+ * Converts unsigned int `i` to __half with round-toward-zero.
+ * Device: PTX cvt.rz.f16.u32.
+ * Host: converts to float first, then calls __float2half_rz.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rz(const unsigned int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rz.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+,
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rz(f);
+)
+    return h;
+}
+/*
+ * Converts unsigned int `i` to __half with round-down (toward negative infinity).
+ * Device: PTX cvt.rm.f16.u32.
+ * Host: converts to float first, then calls __float2half_rd.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rd(const unsigned int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rm.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+,
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rd(f);
+)
+    return h;
+}
+/*
+ * Converts unsigned int `i` to __half with round-up (toward positive infinity).
+ * Device: PTX cvt.rp.f16.u32.
+ * Host: converts to float first, then calls __float2half_ru.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_ru(const unsigned int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rp.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+,
+    const float  f = static_cast<float>(i);
+                 h = __float2half_ru(f);
+)
+    return h;
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/* Device only: converts __half `h` to unsigned short int via PTX cvt.rni.u16.f16 (round to nearest integer). */
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h)
+{
+    unsigned short int i;
+    asm("cvt.rni.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+/* Device only: converts __half `h` to unsigned short int via PTX cvt.rmi.u16.f16 (round toward negative infinity). */
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h)
+{
+    unsigned short int i;
+    asm("cvt.rmi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+/* Device only: converts __half `h` to unsigned short int via PTX cvt.rpi.u16.f16 (round toward positive infinity). */
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h)
+{
+    unsigned short int i;
+    asm("cvt.rpi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/*
+ * Converts unsigned short int `i` to __half with round-to-nearest-even.
+ * Device: PTX cvt.rn.f16.u16.
+ * Host: converts to float first, then calls __float2half_rn.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rn.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+,
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rn(f);
+)
+    return h;
+}
+/*
+ * Converts unsigned short int `i` to __half with round-toward-zero.
+ * Device: PTX cvt.rz.f16.u16.
+ * Host: converts to float first, then calls __float2half_rz.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rz.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+,
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rz(f);
+)
+    return h;
+}
+/*
+ * Converts unsigned short int `i` to __half with round-down (toward negative infinity).
+ * Device: PTX cvt.rm.f16.u16.
+ * Host: converts to float first, then calls __float2half_rd.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rm.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+,
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rd(f);
+)
+    return h;
+}
+/*
+ * Converts unsigned short int `i` to __half with round-up (toward positive infinity).
+ * Device: PTX cvt.rp.f16.u16.
+ * Host: converts to float first, then calls __float2half_ru.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rp.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+,
+    const float  f = static_cast<float>(i);
+                 h = __float2half_ru(f);
+)
+    return h;
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/* Device only: converts __half `h` to unsigned long long int via PTX cvt.rni.u64.f16 (round to nearest integer). */
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h)
+{
+    unsigned long long int i;
+    asm("cvt.rni.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+/* Device only: converts __half `h` to unsigned long long int via PTX cvt.rmi.u64.f16 (round toward negative infinity). */
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h)
+{
+    unsigned long long int i;
+    asm("cvt.rmi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+/* Device only: converts __half `h` to unsigned long long int via PTX cvt.rpi.u64.f16 (round toward positive infinity). */
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h)
+{
+    unsigned long long int i;
+    asm("cvt.rpi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/*
+ * Converts unsigned long long int `i` to __half with round-to-nearest-even.
+ * Device: PTX cvt.rn.f16.u64.
+ * Host: converts to float first, then calls __float2half_rn.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rn.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+,
+    // double-rounding is not a problem here: if integer
+    // has more than 24 bits, it is already too large to
+    // be represented in half precision, and result will
+    // be infinity.
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rn(f);
+)
+    return h;
+}
+/*
+ * Converts unsigned long long int `i` to __half with round-toward-zero.
+ * Device: PTX cvt.rz.f16.u64.
+ * Host: converts to float first, then calls __float2half_rz.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rz.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+,
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rz(f);
+)
+    return h;
+}
+/*
+ * Converts unsigned long long int `i` to __half with round-down (toward negative infinity).
+ * Device: PTX cvt.rm.f16.u64.
+ * Host: converts to float first, then calls __float2half_rd.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rm.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+,
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rd(f);
+)
+    return h;
+}
+/*
+ * Converts unsigned long long int `i` to __half with round-up (toward positive infinity).
+ * Device: PTX cvt.rp.f16.u64.
+ * Host: converts to float first, then calls __float2half_ru.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rp.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+,
+    const float  f = static_cast<float>(i);
+                 h = __float2half_ru(f);
+)
+    return h;
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/* Device only: converts __half `h` to long long int via PTX cvt.rni.s64.f16 (round to nearest integer). */
+__CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h)
+{
+    long long int i;
+    asm("cvt.rni.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+/* Device only: converts __half `h` to long long int via PTX cvt.rmi.s64.f16 (round toward negative infinity). */
+__CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h)
+{
+    long long int i;
+    asm("cvt.rmi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+/* Device only: converts __half `h` to long long int via PTX cvt.rpi.s64.f16 (round toward positive infinity). */
+__CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h)
+{
+    long long int i;
+    asm("cvt.rpi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/*
+ * Converts long long int `i` to __half with round-to-nearest-even.
+ * Device: PTX cvt.rn.f16.s64.
+ * Host: converts to float first, then calls __float2half_rn.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rn.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+,
+    // double-rounding is not a problem here: if integer
+    // has more than 24 bits, it is already too large to
+    // be represented in half precision, and result will
+    // be infinity.
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rn(f);
+)
+    return h;
+}
+/*
+ * Converts long long int `i` to __half with round-toward-zero.
+ * Device: PTX cvt.rz.f16.s64.
+ * Host: converts to float first, then calls __float2half_rz.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rz(const long long int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rz.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+,
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rz(f);
+)
+    return h;
+}
+/*
+ * Converts long long int `i` to __half with round-down (toward negative infinity).
+ * Device: PTX cvt.rm.f16.s64.
+ * Host: converts to float first, then calls __float2half_rd.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rd(const long long int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rm.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+,
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rd(f);
+)
+    return h;
+}
+/*
+ * Converts long long int `i` to __half with round-up (toward positive infinity).
+ * Device: PTX cvt.rp.f16.s64.
+ * Host: converts to float first, then calls __float2half_ru.
+ */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_ru(const long long int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rp.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+,
+    const float  f = static_cast<float>(i);
+                 h = __float2half_ru(f);
+)
+    return h;
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/* Device only: rounds `h` to an integral value toward zero, staying in half precision (PTX cvt.rzi.f16.f16). */
+__CUDA_FP16_DECL__ __half htrunc(const __half h)
+{
+    __half r;
+    asm("cvt.rzi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+/* Device only: rounds `h` to an integral value toward positive infinity, staying in half precision (PTX cvt.rpi.f16.f16). */
+__CUDA_FP16_DECL__ __half hceil(const __half h)
+{
+    __half r;
+    asm("cvt.rpi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+/* Device only: rounds `h` to an integral value toward negative infinity, staying in half precision (PTX cvt.rmi.f16.f16). */
+__CUDA_FP16_DECL__ __half hfloor(const __half h)
+{
+    __half r;
+    asm("cvt.rmi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+/* Device only: rounds `h` to the nearest integral value, staying in half precision (PTX cvt.rni.f16.f16). */
+__CUDA_FP16_DECL__ __half hrint(const __half h)
+{
+    __half r;
+    asm("cvt.rni.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+
+__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h)
+{ // Apply the .rzi (towards zero) integral rounding to both halves of h independently.
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n" // unpack the 32-bit input into two f16 registers
+        "  cvt.rzi.f16.f16 low, low;\n"
+        "  cvt.rzi.f16.f16 high, high;\n"
+        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); // repack the rounded pair
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h)
+{ // Apply the .rpi (towards +infinity) integral rounding to both halves of h independently.
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n" // unpack the 32-bit input into two f16 registers
+        "  cvt.rpi.f16.f16 low, low;\n"
+        "  cvt.rpi.f16.f16 high, high;\n"
+        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); // repack the rounded pair
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h)
+{ // Apply the .rmi (towards -infinity) integral rounding to both halves of h independently.
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n" // unpack the 32-bit input into two f16 registers
+        "  cvt.rmi.f16.f16 low, low;\n"
+        "  cvt.rmi.f16.f16 high, high;\n"
+        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); // repack the rounded pair
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h)
+{ // Apply the .rni (nearest-even) integral rounding to both halves of h independently.
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n" // unpack the 32-bit input into two f16 registers
+        "  cvt.rni.f16.f16 low, low;\n"
+        "  cvt.rni.f16.f16 high, high;\n"
+        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); // repack the rounded pair
+    return val;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b)
+{ // Build a __half2 whose low half is a's low half and whose high half is b's low half.
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 alow,ahigh,blow,bhigh;\n"
+        "  mov.b32 {alow,ahigh}, %1;\n" // split a
+        "  mov.b32 {blow,bhigh}, %2;\n" // split b
+        "  mov.b32 %0, {alow,blow};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b))); // pack the two low halves
+, // host path: plain member copies
+    val.x = a.x;
+    val.y = b.x;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b)
+{ // Build a __half2 whose low half is a's high half and whose high half is b's high half.
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 alow,ahigh,blow,bhigh;\n"
+        "  mov.b32 {alow,ahigh}, %1;\n" // split a
+        "  mov.b32 {blow,bhigh}, %2;\n" // split b
+        "  mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b))); // pack the two high halves
+, // host path: plain member copies
+    val.x = a.y;
+    val.y = b.y;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __low2half(const __half2 a)
+{ // Extract the low half (member x) of a.
+    __half ret;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n" // split the packed pair
+        " mov.b16 %0, low;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a))); // keep only the low 16 bits
+, // host path: plain member copy
+    ret = a.x;
+)
+    return ret;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ int __hisinf(const __half a)
+{
+    // Classify by raw bit pattern: 0xFC00 yields -1, 0x7C00 yields 1.
+    const __half_raw bits = __half_raw(a);
+    if (bits.x == 0xFC00U) {
+        return -1;
+    }
+    if (bits.x == 0x7C00U) {
+        return 1;
+    }
+    // Any other encoding is reported as "not an infinity".
+    return 0;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __low2half2(const __half2 a)
+{ // Broadcast the low half of a into both halves of the result.
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n" // split the packed pair
+        "  mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); // repack low twice
+, // host path: plain member copies
+    val.x = a.x;
+    val.y = a.x;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __high2half2(const __half2 a)
+{ // Broadcast the high half of a into both halves of the result.
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n" // split the packed pair
+        "  mov.b32 %0, {high,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); // repack high twice
+, // host path: plain member copies
+    val.x = a.y;
+    val.y = a.y;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __high2half(const __half2 a)
+{ // Extract the high half (member y) of a.
+    __half ret;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n" // split the packed pair
+        " mov.b16 %0, high;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a))); // keep only the high 16 bits
+, // host path: plain member copy
+    ret = a.y;
+)
+    return ret;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b)
+{ // Pack two scalars into a __half2: a becomes the low half (x), b the high half (y).
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  mov.b32 %0, {%1,%2};}\n" // device: a single packing move
+        : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b)));
+, // host path: plain member copies
+    val.x = a;
+    val.y = b;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __half2half2(const __half a)
+{ // Duplicate the scalar a into both halves of a __half2.
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  mov.b32 %0, {%1,%1};}\n" // device: pack the same 16-bit operand twice
+        : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a)));
+, // host path: plain member copies
+    val.x = a;
+    val.y = a;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a)
+{ // Swap the two halves of a.
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n" // split the packed pair
+        "  mov.b32 %0, {high,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); // repack in reversed order
+, // host path: plain member copies
+    val.x = a.y;
+    val.y = a.x;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ short int __half_as_short(const __half h)
+{ // Return the 16-bit storage of h cast to short int; no numeric conversion is performed.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return static_cast<short int>(__HALF_TO_CUS(h)); // device: read through the __HALF_TO_CUS accessor
+,
+    return static_cast<short int>(__half_raw(h).x); // host: read the raw representation's x member
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half_as_ushort(const __half h)
+{ // Return the 16-bit storage of h as an unsigned short; no numeric conversion is performed.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __HALF_TO_CUS(h); // device: read through the __HALF_TO_CUS accessor
+,
+    return __half_raw(h).x; // host: read the raw representation's x member
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short_as_half(const short int i)
+{ // Build a __half whose 16-bit storage is the bit pattern of i; no numeric conversion is performed.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __half h;
+    __HALF_TO_US(h) = static_cast<unsigned short int>(i); // device: store through the __HALF_TO_US accessor
+    return h;
+,
+    __half_raw hr; // host: go through the raw representation
+    hr.x = static_cast<unsigned short int>(i);
+    return __half(hr);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort_as_half(const unsigned short int i)
+{ // Build a __half whose 16-bit storage is i; no numeric conversion is performed.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __half h;
+    __HALF_TO_US(h) = i; // device: store through the __HALF_TO_US accessor
+    return h;
+,
+    __half_raw hr; // host: go through the raw representation
+    hr.x = i;
+    return __half(hr);) // this parenthesis closes NV_IF_ELSE_TARGET
+}
+
+/******************************************************************************
+*                             __half arithmetic                             *
+******************************************************************************/
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ __half __internal_device_hmax(const __half a, const __half b)
+{ // Maximum of two halves for device code; called by the device branch of __hmax.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __BINARY_OP_HALF_MACRO(max) // SM 8.0+: macro defined elsewhere in this file; presumably emits a native f16 max
+,
+    const float fa = __half2float(a); // other targets: widen both operands to float
+    const float fb = __half2float(b);
+    float fr;
+    asm("{max.f32 %0,%1,%2;\n}"
+        :"=f"(fr) : "f"(fa), "f"(fb)); // take the maximum in single precision
+    const __half hr = __float2half(fr); // narrow the result back to half
+    return hr;
+)
+}
+__CUDA_FP16_DECL__ __half __internal_device_hmin(const __half a, const __half b)
+{ // Minimum of two halves for device code; called by the device branch of __hmin.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __BINARY_OP_HALF_MACRO(min) // SM 8.0+: macro defined elsewhere in this file; presumably emits a native f16 min
+,
+    const float fa = __half2float(a); // other targets: widen both operands to float
+    const float fb = __half2float(b);
+    float fr;
+    asm("{min.f32 %0,%1,%2;\n}"
+        :"=f"(fr) : "f"(fa), "f"(fb)); // take the minimum in single precision
+    const __half hr = __float2half(fr); // narrow the result back to half
+    return hr;
+)
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmax(const __half a, const __half b)
+{ // Maximum of a and b; the host branch spells out the NaN and signed-zero handling.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hmax(a, b);
+,
+    __half maxval;
+
+    maxval = (__hge(a, b) || __hisnan(b)) ? a : b; // take a when a >= b or when b is NaN, otherwise b
+
+    if (__hisnan(maxval)) // only reachable when the selected operand is itself NaN
+    {
+        // if both inputs are NaN, return canonical NaN
+        maxval = CUDART_NAN_FP16;
+    }
+    else if (__heq(a, b))
+    {
+        // hmax(+0.0, -0.0) = +0.0
+        // unsigned compare 0x8000U > 0x0000U
+        __half_raw ra = __half_raw(a);
+        __half_raw rb = __half_raw(b);
+        maxval = (ra.x > rb.x) ? b : a; // the larger raw pattern carries the sign bit, so prefer the other operand
+    }
+    return maxval;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmin(const __half a, const __half b)
+{ // Minimum of a and b; the host branch spells out the NaN and signed-zero handling.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hmin(a, b);
+,
+    __half minval;
+
+    minval = (__hle(a, b) || __hisnan(b)) ? a : b; // take a when a <= b or when b is NaN, otherwise b
+
+    if (__hisnan(minval)) // only reachable when the selected operand is itself NaN
+    {
+        // if both inputs are NaN, return canonical NaN
+        minval = CUDART_NAN_FP16;
+    }
+    else if (__heq(a, b))
+    {
+        // hmin(+0.0, -0.0) = -0.0
+        // unsigned compare 0x8000U > 0x0000U
+        __half_raw ra = __half_raw(a);
+        __half_raw rb = __half_raw(b);
+        minval = (ra.x > rb.x) ? a : b; // the larger raw pattern carries the sign bit, so prefer that operand
+    }
+
+    return minval;
+)
+}
+
+
+/******************************************************************************
+*                            __half2 arithmetic                             *
+******************************************************************************/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b)
+{ // Component-wise maximum of two __half2 values.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __BINARY_OP_HALF2_MACRO(max) // SM 8.0+: macro defined elsewhere in this file; presumably a packed f16x2 max
+,
+    __half2 val; // other targets and host: apply the scalar __hmax to each half
+    val.x = __hmax(a.x, b.x);
+    val.y = __hmax(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b)
+{ // Component-wise minimum of two __half2 values.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __BINARY_OP_HALF2_MACRO(min) // SM 8.0+: macro defined elsewhere in this file; presumably a packed f16x2 min
+,
+    __half2 val; // other targets and host: apply the scalar __hmin to each half
+    val.x = __hmin(a.x, b.x);
+    val.y = __hmin(a.y, b.y);
+    return val;
+)
+}
+
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) || defined(_NVHPC_CUDA)
+/******************************************************************************
+*                           __half, __half2 warp shuffle                     *
+******************************************************************************/
+#define __SHUFFLE_HALF2_MACRO(name) /* do */ { /* full shuffle body; reads var, delta and c from the enclosing function */ \
+   __half2 r; /* shuffled result */ \
+   asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3;\n}" \
+       :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c)); \
+   return r; /* the macro returns from the enclosing function */ \
+} /* while(0) */
+
+#define __SHUFFLE_SYNC_HALF2_MACRO(name, var, delta, c, mask) /* do */ { /* full shuffle body; operands are passed explicitly */ \
+   __half2 r; /* shuffled result */ \
+   asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \
+       :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \
+   return r; /* the macro returns from the enclosing function */ \
+} /* while(0) */
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700)
+
+__CUDA_FP16_DECL__ __half2 __shfl(const __half2 var, const int delta, const int width)
+{ // Non-sync indexed warp shuffle of a packed __half2 via PTX shfl.idx.b32.
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); // read the PTX WARP_SZ constant
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; // control operand: width-derived bits from bit 8 up, 0x1f in the low bits
+    __SHUFFLE_HALF2_MACRO(shfl.idx.b32) // expands to the asm statement and the return; uses var, delta and c
+}
+__CUDA_FP16_DECL__ __half2 __shfl_up(const __half2 var, const unsigned int delta, const int width)
+{ // Non-sync "up" warp shuffle of a packed __half2 via PTX shfl.up.b32.
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); // read the PTX WARP_SZ constant
+    const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U; // control operand: unlike the other modes, the low bits stay 0
+    __SHUFFLE_HALF2_MACRO(shfl.up.b32) // expands to the asm statement and the return; uses var, delta and c
+}
+__CUDA_FP16_DECL__ __half2 __shfl_down(const __half2 var, const unsigned int delta, const int width)
+{ // Non-sync "down" warp shuffle of a packed __half2 via PTX shfl.down.b32.
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); // read the PTX WARP_SZ constant
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; // control operand: width-derived bits from bit 8 up, 0x1f in the low bits
+    __SHUFFLE_HALF2_MACRO(shfl.down.b32) // expands to the asm statement and the return; uses var, delta and c
+}
+__CUDA_FP16_DECL__ __half2 __shfl_xor(const __half2 var, const int delta, const int width)
+{ // Non-sync butterfly (xor) warp shuffle of a packed __half2 via PTX shfl.bfly.b32.
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); // read the PTX WARP_SZ constant
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; // control operand: width-derived bits from bit 8 up, 0x1f in the low bits
+    __SHUFFLE_HALF2_MACRO(shfl.bfly.b32) // expands to the asm statement and the return; uses var, delta and c
+}
+
+#endif /* defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700) */
+
+__CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned int mask, const __half2 var, const int srcLane, const int width)
+{ // Indexed warp shuffle of a packed __half2 via PTX shfl.sync.idx.b32; mask is passed as the last asm operand.
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); // read the PTX WARP_SZ constant
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; // control operand: width-derived bits from bit 8 up, 0x1f in the low bits
+    __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.idx.b32, var, srcLane, c, mask) // expands to the asm statement and the return
+}
+__CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned int mask, const __half2 var, const unsigned int delta, const int width)
+{ // "Up" warp shuffle of a packed __half2 via PTX shfl.sync.up.b32; mask is passed as the last asm operand.
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); // read the PTX WARP_SZ constant
+    const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U; // control operand: unlike the other modes, the low bits stay 0
+    __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.up.b32, var, delta, c, mask) // expands to the asm statement and the return
+}
+__CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned int mask, const __half2 var, const unsigned int delta, const int width)
+{ // "Down" warp shuffle of a packed __half2 via PTX shfl.sync.down.b32; mask is passed as the last asm operand.
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); // read the PTX WARP_SZ constant
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; // control operand: width-derived bits from bit 8 up, 0x1f in the low bits
+    __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.down.b32, var, delta, c, mask) // expands to the asm statement and the return
+}
+__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned int mask, const __half2 var, const int laneMask, const int width)
+{ // Butterfly (xor) warp shuffle of a packed __half2 via PTX shfl.sync.bfly.b32; mask is passed as the last asm operand.
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); // read the PTX WARP_SZ constant
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; // control operand: width-derived bits from bit 8 up, 0x1f in the low bits
+    __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.bfly.b32, var, laneMask, c, mask) // expands to the asm statement and the return
+}
+
+#undef __SHUFFLE_HALF2_MACRO
+#undef __SHUFFLE_SYNC_HALF2_MACRO
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700)
+
+__CUDA_FP16_DECL__ __half __shfl(const __half var, const int delta, const int width)
+{ // Scalar __half shuffle implemented on top of the __half2 overload.
+    const __half2 temp1 = __halves2half2(var, var); // duplicate var into both halves of a 32-bit pair
+    const __half2 temp2 = __shfl(temp1, delta, width); // resolves to the __half2 overload
+    return __low2half(temp2); // both halves are identical, so return the low one
+}
+__CUDA_FP16_DECL__ __half __shfl_up(const __half var, const unsigned int delta, const int width)
+{ // Scalar __half "up" shuffle implemented on top of the __half2 overload.
+    const __half2 temp1 = __halves2half2(var, var); // duplicate var into both halves of a 32-bit pair
+    const __half2 temp2 = __shfl_up(temp1, delta, width); // resolves to the __half2 overload
+    return __low2half(temp2); // both halves are identical, so return the low one
+}
+__CUDA_FP16_DECL__ __half __shfl_down(const __half var, const unsigned int delta, const int width)
+{ // Scalar __half "down" shuffle implemented on top of the __half2 overload.
+    const __half2 temp1 = __halves2half2(var, var); // duplicate var into both halves of a 32-bit pair
+    const __half2 temp2 = __shfl_down(temp1, delta, width); // resolves to the __half2 overload
+    return __low2half(temp2); // both halves are identical, so return the low one
+}
+__CUDA_FP16_DECL__ __half __shfl_xor(const __half var, const int delta, const int width)
+{ // Scalar __half butterfly (xor) shuffle implemented on top of the __half2 overload.
+    const __half2 temp1 = __halves2half2(var, var); // duplicate var into both halves of a 32-bit pair
+    const __half2 temp2 = __shfl_xor(temp1, delta, width); // resolves to the __half2 overload
+    return __low2half(temp2); // both halves are identical, so return the low one
+}
+
+#endif /* defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700) */
+
+__CUDA_FP16_DECL__ __half __shfl_sync(const unsigned int mask, const __half var, const int srcLane, const int width)
+{ // Scalar __half indexed sync shuffle implemented on top of the __half2 overload.
+    const __half2 temp1 = __halves2half2(var, var); // duplicate var into both halves of a 32-bit pair
+    const __half2 temp2 = __shfl_sync(mask, temp1, srcLane, width); // resolves to the __half2 overload
+    return __low2half(temp2); // both halves are identical, so return the low one
+}
+__CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned int mask, const __half var, const unsigned int delta, const int width)
+{ // Scalar __half "up" sync shuffle implemented on top of the __half2 overload.
+    const __half2 temp1 = __halves2half2(var, var); // duplicate var into both halves of a 32-bit pair
+    const __half2 temp2 = __shfl_up_sync(mask, temp1, delta, width); // resolves to the __half2 overload
+    return __low2half(temp2); // both halves are identical, so return the low one
+}
+__CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned int mask, const __half var, const unsigned int delta, const int width)
+{ // Scalar __half "down" sync shuffle implemented on top of the __half2 overload.
+    const __half2 temp1 = __halves2half2(var, var); // duplicate var into both halves of a 32-bit pair
+    const __half2 temp2 = __shfl_down_sync(mask, temp1, delta, width); // resolves to the __half2 overload
+    return __low2half(temp2); // both halves are identical, so return the low one
+}
+__CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned int mask, const __half var, const int laneMask, const int width)
+{ // Scalar __half butterfly (xor) sync shuffle implemented on top of the __half2 overload.
+    const __half2 temp1 = __halves2half2(var, var); // duplicate var into both halves of a 32-bit pair
+    const __half2 temp2 = __shfl_xor_sync(mask, temp1, laneMask, width); // resolves to the __half2 overload
+    return __low2half(temp2); // both halves are identical, so return the low one
+}
+
+#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) || defined(_NVHPC_CUDA) */
+/******************************************************************************
+*              __half and __half2 __ld* loads and __st* stores               *
+******************************************************************************/
+
+#if defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) || defined(_NVHPC_CUDA))
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __LDG_PTR   "l" /* asm constraint used for pointer operands on 64-bit builds and under NVRTC */
+#else
+#define __LDG_PTR   "r" /* asm constraint used for pointer operands otherwise */
+#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
+__CUDA_FP16_DECL__ __half2 __ldg(const  __half2 *const ptr)
+{ // Load a __half2 (32 bits) from global memory with the PTX .nc qualifier.
+    __half2 ret;
+    asm ("ld.global.nc.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); // no "memory" clobber on this load
+    return ret;
+}
+__CUDA_FP16_DECL__ __half __ldg(const __half *const ptr)
+{ // Load a __half (16 bits) from global memory with the PTX .nc qualifier.
+    __half ret;
+    asm ("ld.global.nc.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); // no "memory" clobber on this load
+    return ret;
+}
+__CUDA_FP16_DECL__ __half2 __ldcg(const  __half2 *const ptr)
+{ // Load a __half2 (32 bits) from global memory with the PTX .cg cache operator.
+    __half2 ret;
+    asm ("ld.global.cg.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); // no "memory" clobber on this load
+    return ret;
+}
+__CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr)
+{ // Load a __half (16 bits) from global memory with the PTX .cg cache operator.
+    __half ret;
+    asm ("ld.global.cg.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); // no "memory" clobber on this load
+    return ret;
+}
+__CUDA_FP16_DECL__ __half2 __ldca(const  __half2 *const ptr)
+{ // Load a __half2 (32 bits) from global memory with the PTX .ca cache operator.
+    __half2 ret;
+    asm ("ld.global.ca.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); // no "memory" clobber on this load
+    return ret;
+}
+__CUDA_FP16_DECL__ __half __ldca(const __half *const ptr)
+{ // Load a __half (16 bits) from global memory with the PTX .ca cache operator.
+    __half ret;
+    asm ("ld.global.ca.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); // no "memory" clobber on this load
+    return ret;
+}
+__CUDA_FP16_DECL__ __half2 __ldcs(const  __half2 *const ptr)
+{ // Load a __half2 (32 bits) from global memory with the PTX .cs cache operator.
+    __half2 ret;
+    asm ("ld.global.cs.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); // no "memory" clobber on this load
+    return ret;
+}
+__CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr)
+{ // Load a __half (16 bits) from global memory with the PTX .cs cache operator.
+    __half ret;
+    asm ("ld.global.cs.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); // no "memory" clobber on this load
+    return ret;
+}
+__CUDA_FP16_DECL__ __half2 __ldlu(const  __half2 *const ptr)
+{ // Load a __half2 (32 bits) from global memory with the PTX .lu cache operator.
+    __half2 ret;
+    asm ("ld.global.lu.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); // declares a "memory" clobber, unlike __ldg/__ldcg/__ldca/__ldcs
+    return ret;
+}
+__CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr)
+{ // Load a __half (16 bits) from global memory with the PTX .lu cache operator.
+    __half ret;
+    asm ("ld.global.lu.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); // declares a "memory" clobber, unlike __ldg/__ldcg/__ldca/__ldcs
+    return ret;
+}
+__CUDA_FP16_DECL__ __half2 __ldcv(const  __half2 *const ptr)
+{ // Load a __half2 (32 bits) from global memory with the PTX .cv cache operator.
+    __half2 ret;
+    asm ("ld.global.cv.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); // declares a "memory" clobber, unlike __ldg/__ldcg/__ldca/__ldcs
+    return ret;
+}
+__CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr)
+{ // Load a __half (16 bits) from global memory with the PTX .cv cache operator.
+    __half ret;
+    asm ("ld.global.cv.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); // declares a "memory" clobber, unlike __ldg/__ldcg/__ldca/__ldcs
+    return ret;
+}
+__CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value)
+{ // Store a __half2 (32 bits) to global memory with the PTX .wb cache operator.
+    asm ("st.global.wb.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); // no outputs; "memory" clobber declared
+}
+__CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value)
+{ // Store a __half (16 bits) to global memory with the PTX .wb cache operator.
+    asm ("st.global.wb.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__HALF_TO_CUS(value)) : "memory"); // no outputs; "memory" clobber declared
+}
+__CUDA_FP16_DECL__ void __stcg(__half2 *const ptr, const __half2 value)
+{ // Store a __half2 (32 bits) to global memory with the PTX .cg cache operator.
+    asm ("st.global.cg.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); // no outputs; "memory" clobber declared
+}
+__CUDA_FP16_DECL__ void __stcg(__half *const ptr, const __half value)
+{ // Store a __half (16 bits) to global memory with the PTX .cg cache operator.
+    asm ("st.global.cg.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__HALF_TO_CUS(value)) : "memory"); // no outputs; "memory" clobber declared
+}
+__CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value)
+{ // Store a __half2 (32 bits) to global memory with the PTX .cs cache operator.
+    asm ("st.global.cs.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); // no outputs; "memory" clobber declared
+}
+__CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value)
+{ // Store a __half (16 bits) to global memory with the PTX .cs cache operator.
+    asm ("st.global.cs.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__HALF_TO_CUS(value)) : "memory"); // no outputs; "memory" clobber declared
+}
+__CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value)
+{ // Store a __half2 (32 bits) to global memory with the PTX .wt cache operator.
+    asm ("st.global.wt.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); // no outputs; "memory" clobber declared
+}
+__CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value)
+{ // Store a __half (16 bits) to global memory with the PTX .wt cache operator.
+    asm ("st.global.wt.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__HALF_TO_CUS(value)) : "memory"); // no outputs; "memory" clobber declared
+}
+#undef __LDG_PTR
+#endif /* defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) || defined(_NVHPC_CUDA)) */
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+/******************************************************************************
+*                             __half2 comparison                             *
+******************************************************************************/
+#define __COMPARISON_OP_HALF2_MACRO(name) /* do */ { /* full function body; reads the enclosing function's a and b */ \
+   __half2 val; /* packed comparison result */ \
+   asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \
+        :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
+   return val; /* the macro returns from the enclosing function */ \
+} /* while(0) */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b)
+{ // Per-component a == b, returned as a __half2 of per-component flags.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.eq) // SM 5.3+: one PTX set.eq.f16x2.f16x2
+,
+    __half2_raw val; // fallback: each component becomes 0x3C00 (binary16 1.0) when true, 0 when false
+    val.x = __heq(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __heq(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b)
+{ // Per-component a != b, returned as a __half2 of per-component flags.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.ne) // SM 5.3+: one PTX set.ne.f16x2.f16x2
+,
+    __half2_raw val; // fallback: each component becomes 0x3C00 (binary16 1.0) when true, 0 when false
+    val.x = __hne(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __hne(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b)
+{ // Per-component a <= b, returned as a __half2 of per-component flags.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.le) // SM 5.3+: one PTX set.le.f16x2.f16x2
+,
+    __half2_raw val; // fallback: each component becomes 0x3C00 (binary16 1.0) when true, 0 when false
+    val.x = __hle(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __hle(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b)
+{ // Per-component a >= b, returned as a __half2 of per-component flags.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.ge) // SM 5.3+: one PTX set.ge.f16x2.f16x2
+,
+    __half2_raw val; // fallback: each component becomes 0x3C00 (binary16 1.0) when true, 0 when false
+    val.x = __hge(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __hge(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b)
+{ // Per-component a < b, returned as a __half2 of per-component flags.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.lt) // SM 5.3+: one PTX set.lt.f16x2.f16x2
+,
+    __half2_raw val; // fallback: each component becomes 0x3C00 (binary16 1.0) when true, 0 when false
+    val.x = __hlt(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __hlt(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b)
+{ // Per-component a > b, returned as a __half2 of per-component flags.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.gt) // SM 5.3+: one PTX set.gt.f16x2.f16x2
+,
+    __half2_raw val; // fallback: each component becomes 0x3C00 (binary16 1.0) when true, 0 when false
+    val.x = __hgt(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __hgt(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b)
+{ // Per-component "equ" comparison (NaN handling is whatever set.equ / __hequ define), as a __half2 of flags.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.equ) // SM 5.3+: one PTX set.equ.f16x2.f16x2
+,
+    __half2_raw val; // fallback: each component becomes 0x3C00 (binary16 1.0) when true, 0 when false
+    val.x = __hequ(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __hequ(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b)
+{ // Per-component "neu" comparison (NaN handling is whatever set.neu / __hneu define), as a __half2 of flags.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.neu) // SM 5.3+: one PTX set.neu.f16x2.f16x2
+,
+    __half2_raw val; // fallback: each component becomes 0x3C00 (binary16 1.0) when true, 0 when false
+    val.x = __hneu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __hneu(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b)
+{ // Per-component "leu" comparison (NaN handling is whatever set.leu / __hleu define), as a __half2 of flags.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.leu) // SM 5.3+: one PTX set.leu.f16x2.f16x2
+,
+    __half2_raw val; // fallback: each component becomes 0x3C00 (binary16 1.0) when true, 0 when false
+    val.x = __hleu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __hleu(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b)
+{ // Per-component "geu" comparison (NaN handling is whatever set.geu / __hgeu define), as a __half2 of flags.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.geu) // SM 5.3+: one PTX set.geu.f16x2.f16x2
+,
+    __half2_raw val; // fallback: each component becomes 0x3C00 (binary16 1.0) when true, 0 when false
+    val.x = __hgeu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __hgeu(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b)
+{ // Per-component "ltu" comparison (NaN handling is whatever set.ltu / __hltu define), as a __half2 of flags.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.ltu) // SM 5.3+: one PTX set.ltu.f16x2.f16x2
+,
+    __half2_raw val; // fallback: each component becomes 0x3C00 (binary16 1.0) when true, 0 when false
+    val.x = __hltu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __hltu(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b)
+{ // Per-component "gtu" comparison (NaN handling is whatever set.gtu / __hgtu define), as a __half2 of flags.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.gtu) // SM 5.3+: one PTX set.gtu.f16x2.f16x2
+,
+    __half2_raw val; // fallback: each component becomes 0x3C00 (binary16 1.0) when true, 0 when false
+    val.x = __hgtu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __hgtu(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+#undef __COMPARISON_OP_HALF2_MACRO
+/******************************************************************************
+*                 __half2 comparison with mask output                        *
+******************************************************************************/
+#define __COMPARISON_OP_HALF2_MACRO_MASK(name) /* do */ { /* full function body; reads the enclosing function's a and b */ \
+   unsigned val; /* 32-bit mask result */ \
+   asm( "{ " __CUDA_FP16_STRINGIFY(name) ".u32.f16x2 %0,%1,%2;\n}" \
+        :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
+   return val; /* the macro returns from the enclosing function */ \
+} /* while(0) */
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __heq2_mask(const __half2 a, const __half2 b)
+{ // Per-component a == b, returned as a 32-bit mask.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.eq) // SM 5.3+: one PTX set.eq.u32.f16x2
+,
+    const unsigned short px = __heq(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; // fallback: 0xFFFF per true component
+    const unsigned short py = __heq(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // y's flags end up in bits 31..16
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px; // x's flags occupy bits 15..0
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hne2_mask(const __half2 a, const __half2 b)
+{ // Per-component a != b, returned as a 32-bit mask.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.ne) // SM 5.3+: one PTX set.ne.u32.f16x2
+,
+    const unsigned short px = __hne(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; // fallback: 0xFFFF per true component
+    const unsigned short py = __hne(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // y's flags end up in bits 31..16
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px; // x's flags occupy bits 15..0
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hle2_mask(const __half2 a, const __half2 b)
+{ // Per-component a <= b, returned as a 32-bit mask.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.le) // SM 5.3+: one PTX set.le.u32.f16x2
+,
+    const unsigned short px = __hle(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; // fallback: 0xFFFF per true component
+    const unsigned short py = __hle(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // y's flags end up in bits 31..16
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px; // x's flags occupy bits 15..0
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hge2_mask(const __half2 a, const __half2 b)
+{ // Per-component a >= b, returned as a 32-bit mask.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.ge) // SM 5.3+: one PTX set.ge.u32.f16x2
+,
+    const unsigned short px = __hge(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; // fallback: 0xFFFF per true component
+    const unsigned short py = __hge(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // y's flags end up in bits 31..16
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px; // x's flags occupy bits 15..0
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hlt2_mask(const __half2 a, const __half2 b)
+{ // Per-component a < b, returned as a 32-bit mask.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.lt) // SM 5.3+: one PTX set.lt.u32.f16x2
+,
+    const unsigned short px = __hlt(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; // fallback: 0xFFFF per true component
+    const unsigned short py = __hlt(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // y's flags end up in bits 31..16
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px; // x's flags occupy bits 15..0
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgt2_mask(const __half2 a, const __half2 b)
+{ // Per-component a > b, returned as a 32-bit mask.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.gt) // SM 5.3+: one PTX set.gt.u32.f16x2
+,
+    const unsigned short px = __hgt(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; // fallback: 0xFFFF per true component
+    const unsigned short py = __hgt(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // y's flags end up in bits 31..16
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px; // x's flags occupy bits 15..0
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hequ2_mask(const __half2 a, const __half2 b)
+{ // Per-component "equ" comparison (NaN handling per set.equ / __hequ), returned as a 32-bit mask.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.equ) // SM 5.3+: one PTX set.equ.u32.f16x2
+,
+    const unsigned short px = __hequ(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; // fallback: 0xFFFF per true component
+    const unsigned short py = __hequ(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // y's flags end up in bits 31..16
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px; // x's flags occupy bits 15..0
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hneu2_mask(const __half2 a, const __half2 b)
+{ // Per-component "neu" comparison (NaN handling per set.neu / __hneu), returned as a 32-bit mask.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.neu) // SM 5.3+: one PTX set.neu.u32.f16x2
+,
+    const unsigned short px = __hneu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; // fallback: 0xFFFF per true component
+    const unsigned short py = __hneu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // y's flags end up in bits 31..16
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px; // x's flags occupy bits 15..0
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hleu2_mask(const __half2 a, const __half2 b)
+{ // Per-component "leu" comparison (NaN handling per set.leu / __hleu), returned as a 32-bit mask.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.leu) // SM 5.3+: one PTX set.leu.u32.f16x2
+,
+    const unsigned short px = __hleu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; // fallback: 0xFFFF per true component
+    const unsigned short py = __hleu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // y's flags end up in bits 31..16
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px; // x's flags occupy bits 15..0
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgeu2_mask(const __half2 a, const __half2 b)
+{ // Per-component "geu" comparison (NaN handling per set.geu / __hgeu), returned as a 32-bit mask.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.geu) // SM 5.3+: one PTX set.geu.u32.f16x2
+,
+    const unsigned short px = __hgeu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; // fallback: 0xFFFF per true component
+    const unsigned short py = __hgeu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // y's flags end up in bits 31..16
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px; // x's flags occupy bits 15..0
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hltu2_mask(const __half2 a, const __half2 b)
+{ // Per-component "ltu" comparison (NaN handling per set.ltu / __hltu), returned as a 32-bit mask.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.ltu) // SM 5.3+: one PTX set.ltu.u32.f16x2
+,
+    const unsigned short px = __hltu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; // fallback: 0xFFFF per true component
+    const unsigned short py = __hltu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // y's flags end up in bits 31..16
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px; // x's flags occupy bits 15..0
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgtu2_mask(const __half2 a, const __half2 b)
+{ // Per-component "gtu" comparison (NaN handling per set.gtu / __hgtu), returned as a 32-bit mask.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.gtu) // SM 5.3+: one PTX set.gtu.u32.f16x2
+,
+    const unsigned short px = __hgtu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; // fallback: 0xFFFF per true component
+    const unsigned short py = __hgtu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py; // y's flags end up in bits 31..16
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px; // x's flags occupy bits 15..0
+    return ur;
+)
+}
+#undef __COMPARISON_OP_HALF2_MACRO_MASK
+
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b)
+{
+    // True only when __heq2_mask reports all 32 mask bits set, i.e. both halves matched.
+    return (__heq2_mask(a, b) == 0xFFFFFFFFU);
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b)
+{
+    // True only when __hne2_mask reports all 32 mask bits set, i.e. both halves matched.
+    return (__hne2_mask(a, b) == 0xFFFFFFFFU);
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b)
+{
+    // True only when __hle2_mask reports all 32 mask bits set, i.e. both halves matched.
+    return (__hle2_mask(a, b) == 0xFFFFFFFFU);
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b)
+{
+    // True only when __hge2_mask reports all 32 mask bits set, i.e. both halves matched.
+    return (__hge2_mask(a, b) == 0xFFFFFFFFU);
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b)
+{
+    // True only when the less-than mask has every bit of both 16-bit lanes set.
+    const bool both_lanes_pass = (0xFFFFFFFFU == __hlt2_mask(a, b));
+    return both_lanes_pass;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b)
+{
+    // True only when the greater-than mask has every bit of both 16-bit lanes set.
+    const bool both_lanes_pass = (0xFFFFFFFFU == __hgt2_mask(a, b));
+    return both_lanes_pass;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b)
+{
+    // True only when the unordered-equal mask has every bit of both 16-bit lanes set.
+    const bool both_lanes_pass = (0xFFFFFFFFU == __hequ2_mask(a, b));
+    return both_lanes_pass;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b)
+{
+    // True only when the unordered-not-equal mask has every bit of both 16-bit lanes set.
+    const bool both_lanes_pass = (0xFFFFFFFFU == __hneu2_mask(a, b));
+    return both_lanes_pass;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b)
+{
+    // True only when the unordered less-or-equal mask has every bit of both 16-bit lanes set.
+    const bool both_lanes_pass = (0xFFFFFFFFU == __hleu2_mask(a, b));
+    return both_lanes_pass;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b)
+{
+    // True only when the unordered greater-or-equal mask has every bit of both 16-bit lanes set.
+    const bool both_lanes_pass = (0xFFFFFFFFU == __hgeu2_mask(a, b));
+    return both_lanes_pass;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b)
+{
+    // True only when the unordered less-than mask has every bit of both 16-bit lanes set.
+    const bool both_lanes_pass = (0xFFFFFFFFU == __hltu2_mask(a, b));
+    return both_lanes_pass;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b)
+{
+    // True only when the unordered greater-than mask has every bit of both 16-bit lanes set.
+    const bool both_lanes_pass = (0xFFFFFFFFU == __hgtu2_mask(a, b));
+    return both_lanes_pass;
+}
+/******************************************************************************
+*                             __half comparison                              *
+******************************************************************************/
+#define __COMPARISON_OP_HALF_MACRO(name) /* do */ {\
+   unsigned short val; /* receives 1 when the predicate holds, 0 otherwise */ \
+   asm( "{ .reg .pred __$temp3;\n" \
+        "  setp." __CUDA_FP16_STRINGIFY(name) ".f16  __$temp3, %1, %2;\n" /* compare a (%1) with b (%2) using condition `name` */ \
+        "  selp.u16 %0, 1, 0, __$temp3;}" /* convert the predicate into 1 or 0 */ \
+        : "=h"(val) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); \
+   return (val != 0U) ? true : false; \
+} /* while(0) */ /* Expands to a braced body that returns bool; relies on in-scope operands named a and b. */
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __heq(const __half a, const __half b) // Ordered equality: a == b.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(eq) // first branch: PTX setp.eq.f16
+,
+    const float fa = __half2float(a); // second branch: widen to float and compare
+    const float fb = __half2float(b);
+    return (fa == fb); // a float == is false whenever an operand is NaN
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hne(const __half a, const __half b) // Ordered inequality: a != b with neither operand NaN.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(ne) // first branch: PTX setp.ne.f16
+,
+    const float fa = __half2float(a); // second branch: widen to float and compare
+    const float fb = __half2float(b);
+    return (fa != fb) && (!__hisnan(a)) && (!__hisnan(b)); // explicit NaN checks force false for NaN inputs
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hle(const __half a, const __half b) // Ordered less-or-equal: a <= b.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(le) // first branch: PTX setp.le.f16
+,
+    const float fa = __half2float(a); // second branch: widen to float and compare
+    const float fb = __half2float(b);
+    return (fa <= fb); // false whenever an operand is NaN
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hge(const __half a, const __half b) // Ordered greater-or-equal: a >= b.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(ge) // first branch: PTX setp.ge.f16
+,
+    const float fa = __half2float(a); // second branch: widen to float and compare
+    const float fb = __half2float(b);
+    return (fa >= fb); // false whenever an operand is NaN
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hlt(const __half a, const __half b) // Ordered less-than: a < b.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(lt) // first branch: PTX setp.lt.f16
+,
+    const float fa = __half2float(a); // second branch: widen to float and compare
+    const float fb = __half2float(b);
+    return (fa < fb); // false whenever an operand is NaN
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgt(const __half a, const __half b) // Ordered greater-than: a > b.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(gt) // first branch: PTX setp.gt.f16
+,
+    const float fa = __half2float(a); // second branch: widen to float and compare
+    const float fb = __half2float(b);
+    return (fa > fb); // false whenever an operand is NaN
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hequ(const __half a, const __half b) // Unordered equality: true if a == b or either operand is NaN.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(equ) // first branch: PTX setp.equ.f16
+,
+    const float fa = __half2float(a); // second branch: widen to float and compare
+    const float fb = __half2float(b);
+    return (fa == fb) || (__hisnan(a)) || (__hisnan(b)); // NaN on either side makes the result true
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hneu(const __half a, const __half b) // Unordered inequality: true if a != b or either operand is NaN.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(neu) // first branch: PTX setp.neu.f16
+,
+    const float fa = __half2float(a); // second branch: widen to float and compare
+    const float fb = __half2float(b);
+    return (fa != fb); // a float != is already true when an operand is NaN, so no extra check is needed
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hleu(const __half a, const __half b) // Unordered less-or-equal: true if a <= b or either operand is NaN.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(leu) // first branch: PTX setp.leu.f16
+,
+    const float fa = __half2float(a); // second branch: widen to float and compare
+    const float fb = __half2float(b);
+    return (fa <= fb) || (__hisnan(a)) || (__hisnan(b)); // NaN on either side makes the result true
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgeu(const __half a, const __half b) // Unordered greater-or-equal: true if a >= b or either operand is NaN.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(geu) // first branch: PTX setp.geu.f16
+,
+    const float fa = __half2float(a); // second branch: widen to float and compare
+    const float fb = __half2float(b);
+    return (fa >= fb) || (__hisnan(a)) || (__hisnan(b)); // NaN on either side makes the result true
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hltu(const __half a, const __half b) // Unordered less-than: true if a < b or either operand is NaN.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(ltu) // first branch: PTX setp.ltu.f16
+,
+    const float fa = __half2float(a); // second branch: widen to float and compare
+    const float fb = __half2float(b);
+    return (fa < fb) || (__hisnan(a)) || (__hisnan(b)); // NaN on either side makes the result true
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgtu(const __half a, const __half b) // Unordered greater-than: true if a > b or either operand is NaN.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(gtu) // first branch: PTX setp.gtu.f16
+,
+    const float fa = __half2float(a); // second branch: widen to float and compare
+    const float fb = __half2float(b);
+    return (fa > fb) || (__hisnan(a)) || (__hisnan(b)); // NaN on either side makes the result true
+)
+}
+#undef __COMPARISON_OP_HALF_MACRO
+/******************************************************************************
+*                            __half2 arithmetic                             *
+******************************************************************************/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b) // Lane-wise sum of two half2 values.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF2_MACRO(add) // macro defined outside this view; presumably a single packed PTX add -- TODO confirm
+,
+    __half2 val; // second branch: apply the scalar operation to each lane
+    val.x = __hadd(a.x, b.x);
+    val.y = __hadd(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b) // Lane-wise difference a - b of two half2 values.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF2_MACRO(sub) // macro defined outside this view; presumably a single packed PTX sub -- TODO confirm
+,
+    __half2 val; // second branch: apply the scalar operation to each lane
+    val.x = __hsub(a.x, b.x);
+    val.y = __hsub(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b) // Lane-wise product of two half2 values.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF2_MACRO(mul) // macro defined outside this view; presumably a single packed PTX mul -- TODO confirm
+,
+    __half2 val; // second branch: apply the scalar operation to each lane
+    val.x = __hmul(a.x, b.x);
+    val.y = __hmul(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b) // Lane-wise saturating sum; each lane goes through __hadd_sat in the second branch.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF2_MACRO(add.sat) // macro defined outside this view; presumably a packed PTX add with .sat -- TODO confirm
+,
+    __half2 val; // second branch: apply the scalar operation to each lane
+    val.x = __hadd_sat(a.x, b.x);
+    val.y = __hadd_sat(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b) // Lane-wise saturating difference; each lane goes through __hsub_sat in the second branch.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF2_MACRO(sub.sat) // macro defined outside this view; presumably a packed PTX sub with .sat -- TODO confirm
+,
+    __half2 val; // second branch: apply the scalar operation to each lane
+    val.x = __hsub_sat(a.x, b.x);
+    val.y = __hsub_sat(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b) // Lane-wise saturating product; each lane goes through __hmul_sat in the second branch.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF2_MACRO(mul.sat) // macro defined outside this view; presumably a packed PTX mul with .sat -- TODO confirm
+,
+    __half2 val; // second branch: apply the scalar operation to each lane
+    val.x = __hmul_sat(a.x, b.x);
+    val.y = __hmul_sat(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b) // Lane-wise sum using the explicit .rn form; each lane goes through __hadd_rn in the second branch.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF2_MACRO(add.rn) // macro defined outside this view; presumably a packed PTX add with .rn -- TODO confirm
+,
+    __half2 val; // second branch: apply the scalar operation to each lane
+    val.x = __hadd_rn(a.x, b.x);
+    val.y = __hadd_rn(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b) // Lane-wise difference using the explicit .rn form; each lane goes through __hsub_rn in the second branch.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF2_MACRO(sub.rn) // macro defined outside this view; presumably a packed PTX sub with .rn -- TODO confirm
+,
+    __half2 val; // second branch: apply the scalar operation to each lane
+    val.x = __hsub_rn(a.x, b.x);
+    val.y = __hsub_rn(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b) // Lane-wise product using the explicit .rn form; each lane goes through __hmul_rn in the second branch.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF2_MACRO(mul.rn) // macro defined outside this view; presumably a packed PTX mul with .rn -- TODO confirm
+,
+    __half2 val; // second branch: apply the scalar operation to each lane
+    val.x = __hmul_rn(a.x, b.x);
+    val.y = __hmul_rn(a.y, b.y);
+    return val;
+)
+}
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c) // Packed fused multiply-add over a, b and c.
+{
+    __TERNARY_OP_HALF2_MACRO(fma.rn) // macro defined outside this view; presumably emits PTX fma.rn on the packed operands and returns -- TODO confirm
+}
+__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c) // Packed fused multiply-add with the .sat modifier.
+{
+    __TERNARY_OP_HALF2_MACRO(fma.rn.sat) // macro defined outside this view; presumably emits PTX fma.rn.sat on the packed operands and returns -- TODO confirm
+}
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b) {
+    // Lane-wise division: each half of a is divided by the matching half of b.
+    // The low lane is handled first, then the high lane.
+    const __half num_lo = __low2half(a);
+    const __half den_lo = __low2half(b);
+    const __half quot_lo = __hdiv(num_lo, den_lo);
+
+    const __half num_hi = __high2half(a);
+    const __half den_hi = __high2half(b);
+    const __half quot_hi = __hdiv(num_hi, den_hi);
+
+    return __halves2half2(quot_lo, quot_hi);
+}
+
+/******************************************************************************
+*                             __half arithmetic                             *
+******************************************************************************/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd(const __half a, const __half b) // Sum of two halves.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF_MACRO(add) // macro defined outside this view; presumably a PTX f16 add -- TODO confirm
+,
+    const float fa = __half2float(a); // second branch: compute in float, then convert back
+    const float fb = __half2float(b);
+    return __float2half(fa + fb);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub(const __half a, const __half b) // Difference a - b of two halves.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF_MACRO(sub) // macro defined outside this view; presumably a PTX f16 sub -- TODO confirm
+,
+    const float fa = __half2float(a); // second branch: compute in float, then convert back
+    const float fb = __half2float(b);
+    return __float2half(fa - fb);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul(const __half a, const __half b) // Product of two halves.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF_MACRO(mul) // macro defined outside this view; presumably a PTX f16 mul -- TODO confirm
+,
+    const float fa = __half2float(a); // second branch: compute in float, then convert back
+    const float fb = __half2float(b);
+    return __float2half(fa * fb);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd_sat(const __half a, const __half b) // Saturating sum of two halves.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF_MACRO(add.sat) // macro defined outside this view; presumably a PTX f16 add with .sat -- TODO confirm
+,
+    return __hmin(__hmax(__hadd(a, b), CUDART_ZERO_FP16), CUDART_ONE_FP16); // clamp between the two constants (defined elsewhere; by name 0.0 and 1.0)
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub_sat(const __half a, const __half b) // Saturating difference a - b of two halves.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF_MACRO(sub.sat) // macro defined outside this view; presumably a PTX f16 sub with .sat -- TODO confirm
+,
+    return __hmin(__hmax(__hsub(a, b), CUDART_ZERO_FP16), CUDART_ONE_FP16); // clamp between the two constants (defined elsewhere; by name 0.0 and 1.0)
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul_sat(const __half a, const __half b) // Saturating product of two halves.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF_MACRO(mul.sat) // macro defined outside this view; presumably a PTX f16 mul with .sat -- TODO confirm
+,
+    return __hmin(__hmax(__hmul(a, b), CUDART_ZERO_FP16), CUDART_ONE_FP16); // clamp between the two constants (defined elsewhere; by name 0.0 and 1.0)
+)
+}
+
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd_rn(const __half a, const __half b) // Sum of two halves using the explicit .rn form.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF_MACRO(add.rn) // macro defined outside this view; presumably a PTX f16 add with .rn -- TODO confirm
+,
+    const float fa = __half2float(a); // second branch: same float computation as __hadd
+    const float fb = __half2float(b);
+    return __float2half(fa + fb);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub_rn(const __half a, const __half b) // Difference a - b of two halves using the explicit .rn form.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF_MACRO(sub.rn) // macro defined outside this view; presumably a PTX f16 sub with .rn -- TODO confirm
+,
+    const float fa = __half2float(a); // second branch: same float computation as __hsub
+    const float fb = __half2float(b);
+    return __float2half(fa - fb);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul_rn(const __half a, const __half b) // Product of two halves using the explicit .rn form.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF_MACRO(mul.rn) // macro defined outside this view; presumably a PTX f16 mul with .rn -- TODO confirm
+,
+    const float fa = __half2float(a); // second branch: same float computation as __hmul
+    const float fb = __half2float(b);
+    return __float2half(fa * fb);
+)
+}
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c) // Fused multiply-add over a, b and c.
+{
+    __TERNARY_OP_HALF_MACRO(fma.rn) // macro defined outside this view; presumably emits PTX fma.rn.f16 and returns -- TODO confirm
+}
+__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c) // Fused multiply-add with the .sat modifier.
+{
+    __TERNARY_OP_HALF_MACRO(fma.rn.sat) // macro defined outside this view; presumably emits PTX fma.rn.sat.f16 and returns -- TODO confirm
+}
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hdiv(const __half a, const __half b) // Half division a / b, computed in float and converted back to half.
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __half v;
+    __half abs;
+    __half den;
+    __HALF_TO_US(den) = 0x008FU; // raw bit pattern of a small subnormal half, used below as a magnitude threshold
+
+    float rcp;
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+
+    asm("{rcp.approx.ftz.f32 %0, %1;\n}" :"=f"(rcp) : "f"(fb)); // approximate reciprocal of b
+
+    float fv = rcp * fa; // first quotient estimate
+
+    v = __float2half(fv);
+    abs = __habs(v);
+    if (__hlt(abs, den) && __hlt(__float2half(0.0f), abs))  { // refine only nonzero results whose magnitude is below the threshold
+        const float err = __fmaf_rn(-fb, fv, fa); // residual a - b * fv
+        fv = __fmaf_rn(rcp, err, fv); // one correction step driven by the residual
+        v = __float2half(fv);
+    }
+    return v;
+,
+    const float fa = __half2float(a); // non-device branch: ordinary float division
+    const float fb = __half2float(b);
+    return __float2half(fa / fb);
+)
+}
+
+/******************************************************************************
+*                     __half and __half2 math functions                      *
+******************************************************************************/
+#if defined(_NVHPC_CUDA) || defined(__CUDACC__)
+#define __APPROX_FCAST(fun) /* do */ {\
+   __half val;\
+   asm("{.reg.b32         f;        \n"\
+                " .reg.b16         r;        \n"\
+                "  mov.b16         r,%1;     \n"\
+                "  cvt.f32.f16     f,r;      \n" /* widen the input to f32 */\
+                "  " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32   f,f;  \n" /* apply the approximate f32 instruction named by fun */\
+                "  cvt.rn.f16.f32      r,f;  \n" /* round the result back to f16 */\
+                "  mov.b16         %0,r;     \n"\
+                "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\
+   return val;\
+} /* while(0) */ /* Expands to a braced body returning __half; relies on an in-scope operand named a. */
+#define __APPROX_FCAST2(fun) /* do */ {\
+   __half2 val;\
+   asm("{.reg.b16         hl, hu;         \n"\
+                " .reg.b32         fl, fu;         \n"\
+                "  mov.b32         {hl, hu}, %1;   \n" /* split the packed input into its two halves */\
+                "  cvt.f32.f16     fl, hl;         \n"\
+                "  cvt.f32.f16     fu, hu;         \n"\
+                "  " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32   fl, fl;     \n" /* apply the approximate f32 instruction to each lane */\
+                "  " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32   fu, fu;     \n"\
+                "  cvt.rn.f16.f32      hl, fl;     \n"\
+                "  cvt.rn.f16.f32      hu, fu;     \n"\
+                "  mov.b32         %0, {hl, hu};   \n" /* repack the two rounded halves */\
+                "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));       \
+   return val;\
+} /* while(0) */ /* Expands to a braced body returning __half2; relies on an in-scope operand named a. */
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA)
+#define __SPEC_CASE2(i,r, spc, ulp) \
+   "{.reg.b32 spc, ulp, p;\n" /* scoped PTX registers, so repeated expansions do not clash */\
+   "  mov.b32 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\
+   "  mov.b32 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\
+   "  set.eq.f16x2.f16x2 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n" /* per lane: p is nonzero where i equals spc (1.0 per PTX set semantics -- confirm) */\
+   "  fma.rn.f16x2 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n" /* r = p * ulp + r: PTX text fragment that adjusts r only in lanes that matched */
+#define __SPEC_CASE(i,r, spc, ulp) \
+   "{.reg.b16 spc, ulp, p;\n" /* scoped PTX registers, so repeated expansions do not clash */\
+   "  mov.b16 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\
+   "  mov.b16 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\
+   "  set.eq.f16.f16 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n" /* p is nonzero when i equals spc (1.0 per PTX set semantics -- confirm) */\
+   "  fma.rn.f16 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n" /* r = p * ulp + r: PTX text fragment that adjusts r only when the input matched */
+static __device__ __forceinline__ float __float_simpl_sinf(float a);
+static __device__ __forceinline__ float __float_simpl_cosf(float a);
+__CUDA_FP16_DECL__ __half hsin(const __half a) { // Sine of a half value: float evaluation followed by fixed adjustments for two input bit patterns.
+    const float sl = __float_simpl_sinf(__half2float(a)); // evaluate in float
+    __half r = __float2half_rn(sl);
+    asm("{\n\t"
+        "  .reg.b16 i,r,t;     \n\t"
+        "  mov.b16 r, %0;      \n\t"
+        "  mov.b16 i, %1;      \n\t"
+        "  and.b16 t, r, 0x8000U; \n\t" // remember the sign bit of the result
+        "  abs.f16 r, r;   \n\t"
+        "  abs.f16 i, i;   \n\t"
+        __SPEC_CASE(i, r, 0X32B3U, 0x0800U) // when |a| has exactly this bit pattern, add the given amount to |r|
+        __SPEC_CASE(i, r, 0X5CB0U, 0x9000U)
+        "  or.b16  r,r,t;      \n\t" // restore the saved sign
+        "  mov.b16 %0, r;      \n"
+        "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a) { // Lane-wise sine of a half2: float evaluation followed by fixed adjustments for two input bit patterns.
+    const float sl = __float_simpl_sinf(__half2float(a.x)); // .x lane evaluated in float
+    const float sh = __float_simpl_sinf(__half2float(a.y)); // .y lane evaluated in float
+    __half2 r = __floats2half2_rn(sl, sh);
+    asm("{\n\t"
+        "  .reg.b32 i,r,t;             \n\t"
+        "  mov.b32 r, %0;              \n\t"
+        "  mov.b32 i, %1;              \n\t"
+        "  and.b32 t, r, 0x80008000U;   \n\t" // remember both lanes' sign bits
+        "  abs.f16x2 r, r;   \n\t"
+        "  abs.f16x2 i, i;   \n\t"
+        __SPEC_CASE2(i, r, 0X32B332B3U, 0x08000800U) // per lane: when |a| has exactly this bit pattern, add the given amount to |r|
+        __SPEC_CASE2(i, r, 0X5CB05CB0U, 0x90009000U)
+        "  or.b32  r, r, t;            \n\t" // restore the saved signs
+        "  mov.b32 %0, r;              \n"
+        "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half hcos(const __half a) { // Cosine of a half value: float evaluation followed by a fixed adjustment for one input bit pattern.
+    const float cl = __float_simpl_cosf(__half2float(a)); // evaluate in float
+    __half r = __float2half_rn(cl);
+    asm("{\n\t"
+        "  .reg.b16 i,r;        \n\t"
+        "  mov.b16 r, %0;       \n\t"
+        "  mov.b16 i, %1;       \n\t"
+        "  abs.f16 i, i;        \n\t" // only the input magnitude is compared
+        __SPEC_CASE(i, r, 0X2B7CU, 0x1000U) // when |a| has exactly this bit pattern, add the given amount to r
+        "  mov.b16 %0, r;       \n"
+        "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a) { // Lane-wise cosine of a half2: float evaluation followed by a fixed adjustment for one input bit pattern.
+    const float cl = __float_simpl_cosf(__half2float(a.x)); // .x lane evaluated in float
+    const float ch = __float_simpl_cosf(__half2float(a.y)); // .y lane evaluated in float
+    __half2 r = __floats2half2_rn(cl, ch);
+    asm("{\n\t"
+        "  .reg.b32 i,r;   \n\t"
+        "  mov.b32 r, %0;  \n\t"
+        "  mov.b32 i, %1;  \n\t"
+        "  abs.f16x2 i, i; \n\t" // only the input magnitudes are compared
+        __SPEC_CASE2(i, r, 0X2B7C2B7CU, 0x10001000U) // per lane: when |a| has exactly this bit pattern, add the given amount to r
+        "  mov.b32 %0, r;  \n"
+        "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
+    return r;
+}
+static __device__ __forceinline__ float __internal_trig_reduction_kernel(const float a, unsigned int *const quadrant)
+{
+    // Scale a by 0.636619772 (about 2/pi) and add 12582912 (1.5 * 2^23); the raw bits go to *quadrant.
+    const float biased = __fmaf_rn(a, 0.636619772F, 12582912.0F);
+    *quadrant = __float_as_uint(biased);
+    const float multiple = __fsub_rn(biased, 12582912.0F);
+    // Subtract multiple * (about pi/2) from a in two fused steps: main term, then a small correction term.
+    const float coarse = __fmaf_rn(multiple, -1.5707962512969971e+000F, a);
+    return __fmaf_rn(multiple, -7.5497894158615964e-008F, coarse);
+}
+static __device__ __forceinline__ float __internal_sin_cos_kernel(const float x, const unsigned int i)
+{
+    // Bit 0 of i selects the polynomial: set means the cosine series,
+    // clear means the sine series. Bit 1 of i asks for a negated result.
+    const bool use_cos = ((i & 1U) != 0U);
+    const bool negate = ((i & 2U) != 0U);
+    const float xx = x*x;
+
+    // Coefficients of the selected polynomial, highest order first.
+    float c8;
+    float c6;
+    float c4;
+    float c2;
+    // Final multiplier and addend of the Horner chain.
+    float scale;
+    float base;
+
+    if (use_cos) {
+        c8    =  2.44331571e-5F;
+        c6    = -1.38873163e-3F;
+        c4    =  4.16666457e-2F;
+        c2    = -5.00000000e-1F;
+        scale = xx;
+        base  = 1.0F;
+    } else {
+        c8    = -1.95152959e-4F;
+        c6    =  8.33216087e-3F;
+        c4    = -1.66666546e-1F;
+        c2    = 0.0F;
+        scale = x;
+        base  = x;
+    }
+
+    // Horner evaluation with the same four fused multiply-adds, in the same order.
+    float acc = __fmaf_rn(c8, xx, c6);
+    acc = __fmaf_rn(acc, xx, c4);
+    acc = __fmaf_rn(acc, xx, c2);
+    acc = __fmaf_rn(acc, scale, base);
+    return negate ? -acc : acc;
+}
+static __device__ __forceinline__ float __float_simpl_sinf(float a)
+{
+    // Reduce the argument, then evaluate the polynomial selected by the quadrant bits.
+    unsigned quadrant;
+    const float reduced = __internal_trig_reduction_kernel(a, &quadrant);
+    const float result = __internal_sin_cos_kernel(reduced, quadrant);
+    return result;
+}
+static __device__ __forceinline__ float __float_simpl_cosf(float a)
+{
+    // Same reduction as the sine path; the selector is the low two quadrant bits plus one.
+    unsigned quadrant;
+    const float reduced = __internal_trig_reduction_kernel(a, &quadrant);
+    const float result = __internal_sin_cos_kernel(reduced, (quadrant & 0x3U) + 1U);
+    return result;
+}
+
+__CUDA_FP16_DECL__ __half hexp(const __half a) { // e^a for a half value, via ex2 of a scaled f32 argument plus fixed adjustments for four inputs.
+    __half val;
+    asm("{.reg.b32         f, C, nZ;       \n"
+        " .reg.b16         h,r;            \n"
+        "  mov.b16         h,%1;           \n"
+        "  cvt.f32.f16     f,h;            \n"
+        "  mov.b32         C, 0x3fb8aa3bU; \n" // f32 bit pattern of about 1.442695 (log2 of e)
+        "  mov.b32         nZ, 0x80000000U;\n" // f32 bit pattern of -0.0
+        "  fma.rn.f32      f,f,C,nZ;       \n" // f = a * C + (-0.0)
+        "  ex2.approx.ftz.f32  f,f;        \n" // 2^f
+        "  cvt.rn.f16.f32      r,f;        \n"
+        __SPEC_CASE(h, r, 0X1F79U, 0x9400U) // when the input has exactly this bit pattern, add the given amount to r
+        __SPEC_CASE(h, r, 0X25CFU, 0x9400U)
+        __SPEC_CASE(h, r, 0XC13BU, 0x0400U)
+        __SPEC_CASE(h, r, 0XC1EFU, 0x0200U)
+        "  mov.b16         %0,r;           \n"
+        "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a) { // Lane-wise e^a for a half2, via ex2 of scaled f32 arguments plus fixed adjustments for four inputs.
+    __half2 val;
+    asm("{.reg.b16         hl, hu;         \n"
+        " .reg.b32         h,r,fl,fu,C,nZ; \n"
+        "  mov.b32         {hl, hu}, %1;   \n" // split the packed input into its two halves
+        "  mov.b32         h, %1;          \n" // keep the packed input for the special-case compares
+        "  cvt.f32.f16     fl, hl;         \n"
+        "  cvt.f32.f16     fu, hu;         \n"
+        "  mov.b32         C, 0x3fb8aa3bU; \n" // f32 bit pattern of about 1.442695 (log2 of e)
+        "  mov.b32         nZ, 0x80000000U;\n" // f32 bit pattern of -0.0
+        "  fma.rn.f32      fl,fl,C,nZ;     \n"
+        "  fma.rn.f32      fu,fu,C,nZ;     \n"
+        "  ex2.approx.ftz.f32  fl, fl;     \n"
+        "  ex2.approx.ftz.f32  fu, fu;     \n"
+        "  cvt.rn.f16.f32      hl, fl;     \n"
+        "  cvt.rn.f16.f32      hu, fu;     \n"
+        "  mov.b32         r, {hl, hu};    \n" // repack the rounded lanes
+        __SPEC_CASE2(h, r, 0X1F791F79U, 0x94009400U) // per lane: when the input has exactly this bit pattern, add the given amount to r
+        __SPEC_CASE2(h, r, 0X25CF25CFU, 0x94009400U)
+        __SPEC_CASE2(h, r, 0XC13BC13BU, 0x04000400U)
+        __SPEC_CASE2(h, r, 0XC1EFC1EFU, 0x02000200U)
+        "  mov.b32         %0, r;  \n"
+        "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
+    return val;
+}
+#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA) */
+
+__CUDA_FP16_DECL__ __half htanh(const __half a) {
+    // Evaluate tanhf in single precision, then round the result back to half.
+    const float widened = __half2float(a);
+    const float result = tanhf(widened);
+    return __float2half_rn(result);
+}
+__CUDA_FP16_DECL__ __half2 h2tanh(const __half2 a) {
+    // Widen both lanes to float, apply tanhf per lane, round back to half2.
+    float2 lanes = __half22float2(a);
+    lanes.x = tanhf(lanes.x);
+    lanes.y = tanhf(lanes.y);
+    return __float22half2_rn(lanes);
+}
+
+__CUDA_FP16_DECL__ __half htanh_approx(const __half a) { // Approximate tanh of a half; uses the tanh.approx.f16 instruction on the NV_PROVIDES_SM_75 branch.
+    __half r;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_75,
+    __half_raw hr = (__half_raw)a; // raw view so the asm can update the 16-bit payload in place
+    asm("tanh.approx.f16 %0, %0;" : "+h"(hr.x));
+    r = (__half)hr;
+,
+    r = htanh(a); // other targets: fall back to the float-based htanh
+)
+    return r;
+}
+__CUDA_FP16_DECL__ __half2 h2tanh_approx(const __half2 a) { // Approximate lane-wise tanh of a half2; uses tanh.approx.f16x2 on the NV_PROVIDES_SM_75 branch.
+    __half2 res;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_75,
+    asm("tanh.approx.f16x2 %0, %1;" : "=r"(__HALF2_TO_UI(res)) : "r"(__HALF2_TO_CUI(a)));
+,
+    res = h2tanh(a); // other targets: fall back to the float-based h2tanh
+)
+    return res;
+}
+
+__CUDA_FP16_DECL__ __half hexp2(const __half a) { // 2^a for a half value, computed in f32 and rounded back to half.
+    __half val;
+    asm("{.reg.b32         f, ULP;         \n"
+        " .reg.b16         r;              \n"
+        "  mov.b16         r,%1;           \n"
+        "  cvt.f32.f16     f,r;            \n"
+        "  ex2.approx.ftz.f32      f,f;    \n" // 2^a in f32
+        "  mov.b32         ULP, 0x33800000U;\n" // f32 bit pattern of 2^-24
+        "  fma.rn.f32      f,f,ULP,f;      \n" // f = f * ULP + f, scaling the result up by a tiny relative amount before rounding
+        "  cvt.rn.f16.f32      r,f;        \n"
+        "  mov.b16         %0,r;           \n"
+        "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a) { // Lane-wise 2^a for a half2, computed in f32 and rounded back to half.
+    __half2 val;
+    asm("{.reg.b16         hl, hu;         \n"
+        " .reg.b32         fl, fu, ULP;    \n"
+        "  mov.b32         {hl, hu}, %1;   \n" // split the packed input into its two halves
+        "  cvt.f32.f16     fl, hl;         \n"
+        "  cvt.f32.f16     fu, hu;         \n"
+        "  ex2.approx.ftz.f32  fl, fl;     \n"
+        "  ex2.approx.ftz.f32  fu, fu;     \n"
+        "  mov.b32         ULP, 0x33800000U;\n" // f32 bit pattern of 2^-24
+        "  fma.rn.f32      fl,fl,ULP,fl;   \n" // scale each lane up by a tiny relative amount before rounding
+        "  fma.rn.f32      fu,fu,ULP,fu;   \n"
+        "  cvt.rn.f16.f32      hl, fl;     \n"
+        "  cvt.rn.f16.f32      hu, fu;     \n"
+        "  mov.b32         %0, {hl, hu};   \n" // repack the rounded lanes
+        "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
+    return val;
+}
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ __half hexp10(const __half a) { // 10^a for a half value, via ex2 of a scaled f32 argument plus fixed adjustments for five inputs.
+    __half val;
+    asm("{.reg.b16         h,r;            \n"
+        " .reg.b32         f, C, nZ;       \n"
+        "  mov.b16         h, %1;          \n"
+        "  cvt.f32.f16     f, h;           \n"
+        "  mov.b32         C, 0x40549A78U; \n" // f32 bit pattern of about 3.321928 (log2 of 10)
+        "  mov.b32         nZ, 0x80000000U;\n" // f32 bit pattern of -0.0
+        "  fma.rn.f32      f,f,C,nZ;       \n" // f = a * C + (-0.0)
+        "  ex2.approx.ftz.f32  f, f;       \n" // 2^f
+        "  cvt.rn.f16.f32      r, f;       \n"
+        __SPEC_CASE(h, r, 0x34DEU, 0x9800U) // when the input has exactly this bit pattern, add the given amount to r
+        __SPEC_CASE(h, r, 0x9766U, 0x9000U)
+        __SPEC_CASE(h, r, 0x9972U, 0x1000U)
+        __SPEC_CASE(h, r, 0xA5C4U, 0x1000U)
+        __SPEC_CASE(h, r, 0xBF0AU, 0x8100U)
+        "  mov.b16         %0, r;          \n"
+        "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a) { // Lane-wise 10^a for a half2, via ex2 of scaled f32 arguments plus fixed adjustments for five inputs.
+    __half2 val;
+    asm("{.reg.b16         hl, hu;         \n"
+        " .reg.b32         h,r,fl,fu,C,nZ; \n"
+        "  mov.b32         {hl, hu}, %1;   \n" // split the packed input into its two halves
+        "  mov.b32         h, %1;          \n" // keep the packed input for the special-case compares
+        "  cvt.f32.f16     fl, hl;         \n"
+        "  cvt.f32.f16     fu, hu;         \n"
+        "  mov.b32         C, 0x40549A78U; \n" // f32 bit pattern of about 3.321928 (log2 of 10)
+        "  mov.b32         nZ, 0x80000000U;\n" // f32 bit pattern of -0.0
+        "  fma.rn.f32      fl,fl,C,nZ;     \n"
+        "  fma.rn.f32      fu,fu,C,nZ;     \n"
+        "  ex2.approx.ftz.f32  fl, fl;     \n"
+        "  ex2.approx.ftz.f32  fu, fu;     \n"
+        "  cvt.rn.f16.f32      hl, fl;     \n"
+        "  cvt.rn.f16.f32      hu, fu;     \n"
+        "  mov.b32         r, {hl, hu};    \n" // repack the rounded lanes
+        __SPEC_CASE2(h, r, 0x34DE34DEU, 0x98009800U) // per lane: when the input has exactly this bit pattern, add the given amount to r
+        __SPEC_CASE2(h, r, 0x97669766U, 0x90009000U)
+        __SPEC_CASE2(h, r, 0x99729972U, 0x10001000U)
+        __SPEC_CASE2(h, r, 0xA5C4A5C4U, 0x10001000U)
+        __SPEC_CASE2(h, r, 0xBF0ABF0AU, 0x81008100U)
+        "  mov.b32         %0, r;  \n"
+        "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half hlog2(const __half a) { // Base-2 logarithm of a half value, computed in f32 with fixed adjustments for two result patterns.
+    __half val;
+    asm("{.reg.b16         h, r;           \n"
+        " .reg.b32         f;              \n"
+        "  mov.b16         h, %1;          \n"
+        "  cvt.f32.f16     f, h;           \n"
+        "  lg2.approx.ftz.f32  f, f;       \n" // log2 in f32
+        "  cvt.rn.f16.f32      r, f;       \n"
+        __SPEC_CASE(r, r, 0xA2E2U, 0x8080U) // note: compares the rounded result r, not the input h, against the pattern
+        __SPEC_CASE(r, r, 0xBF46U, 0x9400U)
+        "  mov.b16         %0, r;          \n"
+        "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a) { // Lane-wise base-2 logarithm of a half2, computed in f32 with fixed adjustments for two result patterns.
+    __half2 val;
+    asm("{.reg.b16         hl, hu;         \n"
+        " .reg.b32         fl, fu, r, p;   \n" // p is not referenced at this scope; each __SPEC_CASE2 expansion declares its own p in a nested block
+        "  mov.b32         {hl, hu}, %1;   \n" // split the packed input into its two halves
+        "  cvt.f32.f16     fl, hl;         \n"
+        "  cvt.f32.f16     fu, hu;         \n"
+        "  lg2.approx.ftz.f32  fl, fl;     \n"
+        "  lg2.approx.ftz.f32  fu, fu;     \n"
+        "  cvt.rn.f16.f32      hl, fl;     \n"
+        "  cvt.rn.f16.f32      hu, fu;     \n"
+        "  mov.b32         r, {hl, hu};    \n" // repack the rounded lanes
+        __SPEC_CASE2(r, r, 0xA2E2A2E2U, 0x80808080U) // note: compares the rounded result r, not the input, against the pattern
+        __SPEC_CASE2(r, r, 0xBF46BF46U, 0x94009400U)
+        "  mov.b32         %0, r;          \n"
+        "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half hlog(const __half a) { // Natural logarithm of a half value: lg2 in f32, scaled, with fixed adjustments for four inputs.
+    __half val;
+    asm("{.reg.b32         f, C;           \n"
+        " .reg.b16         r,h;            \n"
+        "  mov.b16         h,%1;           \n"
+        "  cvt.f32.f16     f,h;            \n"
+        "  lg2.approx.ftz.f32  f,f;        \n" // log2 in f32
+        "  mov.b32         C, 0x3f317218U;  \n" // f32 bit pattern of about 0.693147 (ln 2)
+        "  mul.f32         f,f,C;          \n" // convert log2 to natural log
+        "  cvt.rn.f16.f32      r,f;        \n"
+        __SPEC_CASE(h, r, 0X160DU, 0x9C00U) // when the input has exactly this bit pattern, add the given amount to r
+        __SPEC_CASE(h, r, 0X3BFEU, 0x8010U)
+        __SPEC_CASE(h, r, 0X3C0BU, 0x8080U)
+        __SPEC_CASE(h, r, 0X6051U, 0x1C00U)
+        "  mov.b16         %0,r;           \n"
+        "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2log(const __half2 a) { // Lane-wise natural logarithm of a half2: lg2 in f32, scaled, with fixed adjustments for four inputs.
+    __half2 val;
+    asm("{.reg.b16         hl, hu;             \n"
+        " .reg.b32         r, fl, fu, C, h;    \n"
+        "  mov.b32         {hl, hu}, %1;       \n" // split the packed input into its two halves
+        "  mov.b32         h, %1;              \n" // keep the packed input for the special-case compares
+        "  cvt.f32.f16     fl, hl;             \n"
+        "  cvt.f32.f16     fu, hu;             \n"
+        "  lg2.approx.ftz.f32  fl, fl;         \n"
+        "  lg2.approx.ftz.f32  fu, fu;         \n"
+        "  mov.b32         C, 0x3f317218U;     \n" // f32 bit pattern of about 0.693147 (ln 2)
+        "  mul.f32         fl,fl,C;            \n"
+        "  mul.f32         fu,fu,C;            \n"
+        "  cvt.rn.f16.f32      hl, fl;         \n"
+        "  cvt.rn.f16.f32      hu, fu;         \n"
+        "  mov.b32         r, {hl, hu};        \n" // repack the rounded lanes
+        __SPEC_CASE2(h, r, 0X160D160DU, 0x9C009C00U) // per lane: when the input has exactly this bit pattern, add the given amount to r
+        __SPEC_CASE2(h, r, 0X3BFE3BFEU, 0x80108010U)
+        __SPEC_CASE2(h, r, 0X3C0B3C0BU, 0x80808080U)
+        __SPEC_CASE2(h, r, 0X60516051U, 0x1C001C00U)
+        "  mov.b32         %0, r;              \n"
+        "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half hlog10(const __half a) { // Base-10 logarithm of a half value: lg2 in f32, scaled, with fixed adjustments for four inputs.
+    __half val;
+    asm("{.reg.b16         h, r;           \n"
+        " .reg.b32         f, C;           \n"
+        "  mov.b16         h, %1;          \n"
+        "  cvt.f32.f16     f, h;           \n"
+        "  lg2.approx.ftz.f32  f, f;       \n" // log2 in f32
+        "  mov.b32         C, 0x3E9A209BU; \n" // f32 bit pattern of about 0.30103 (log10 of 2)
+        "  mul.f32         f,f,C;          \n" // convert log2 to log10
+        "  cvt.rn.f16.f32      r, f;       \n"
+        __SPEC_CASE(h, r, 0x338FU, 0x1000U) // when the input has exactly this bit pattern, add the given amount to r
+        __SPEC_CASE(h, r, 0x33F8U, 0x9000U)
+        __SPEC_CASE(h, r, 0x57E1U, 0x9800U)
+        __SPEC_CASE(h, r, 0x719DU, 0x9C00U)
+        "  mov.b16         %0, r;          \n"
+        "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a) { // Lane-wise base-10 logarithm of a half2: lg2 in f32, scaled, with fixed adjustments for four inputs.
+    __half2 val;
+    asm("{.reg.b16         hl, hu;             \n"
+        " .reg.b32         r, fl, fu, C, h;    \n"
+        "  mov.b32         {hl, hu}, %1;       \n" // split the packed input into its two halves
+        "  mov.b32         h, %1;              \n" // keep the packed input for the special-case compares
+        "  cvt.f32.f16     fl, hl;             \n"
+        "  cvt.f32.f16     fu, hu;             \n"
+        "  lg2.approx.ftz.f32  fl, fl;         \n"
+        "  lg2.approx.ftz.f32  fu, fu;         \n"
+        "  mov.b32         C, 0x3E9A209BU;     \n" // f32 bit pattern of about 0.30103 (log10 of 2)
+        "  mul.f32         fl,fl,C;            \n"
+        "  mul.f32         fu,fu,C;            \n"
+        "  cvt.rn.f16.f32      hl, fl;         \n"
+        "  cvt.rn.f16.f32      hu, fu;         \n"
+        "  mov.b32         r, {hl, hu};        \n" // repack the rounded lanes
+        __SPEC_CASE2(h, r, 0x338F338FU, 0x10001000U) // per lane: when the input has exactly this bit pattern, add the given amount to r
+        __SPEC_CASE2(h, r, 0x33F833F8U, 0x90009000U)
+        __SPEC_CASE2(h, r, 0x57E157E1U, 0x98009800U)
+        __SPEC_CASE2(h, r, 0x719D719DU, 0x9C009C00U)
+        "  mov.b32         %0, r;              \n"
+        "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
+    return val;
+}
+#undef __SPEC_CASE2
+#undef __SPEC_CASE
+#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA) */
+__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a) { // Lane-wise approximate reciprocal of a half2.
+    __APPROX_FCAST2(rcp) // widen each lane to f32, apply rcp.approx.ftz.f32, round back to f16 and return
+}
+__CUDA_FP16_DECL__ __half hrcp(const __half a) { // Approximate reciprocal of a half.
+    __APPROX_FCAST(rcp) // widen to f32, apply rcp.approx.ftz.f32, round back to f16 and return
+}
+__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a) { // Lane-wise approximate reciprocal square root of a half2.
+    __APPROX_FCAST2(rsqrt) // widen each lane to f32, apply rsqrt.approx.ftz.f32, round back to f16 and return
+}
+__CUDA_FP16_DECL__ __half hrsqrt(const __half a) { // Approximate reciprocal square root of a half.
+    __APPROX_FCAST(rsqrt) // widen to f32, apply rsqrt.approx.ftz.f32, round back to f16 and return
+}
+__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a) { // Lane-wise approximate square root of a half2.
+    __APPROX_FCAST2(sqrt) // widen each lane to f32, apply sqrt.approx.ftz.f32, round back to f16 and return
+}
+__CUDA_FP16_DECL__ __half hsqrt(const __half a) { // Approximate square root of a half.
+    __APPROX_FCAST(sqrt) // widen to f32, apply sqrt.approx.ftz.f32, round back to f16 and return
+}
+#undef __APPROX_FCAST
+#undef __APPROX_FCAST2
+#endif /* defined(_NVHPC_CUDA) || defined(__CUDACC__) */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hisnan2(const __half2 a) // Lane-wise NaN test; the result is itself a half2 with a per-lane flag.
+{
+    __half2 r;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    asm("{set.nan.f16x2.f16x2 %0,%1,%2;\n}" // a is compared against itself with the nan condition
+        :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(a)));
+,
+    __half2_raw val;
+    val.x = __hisnan(a.x) ? (unsigned short)0x3C00U : (unsigned short)0U; // 0x3C00 is the half bit pattern of 1.0
+    val.y = __hisnan(a.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    r = __half2(val);
+)
+    return r;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hisnan(const __half a)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, /* first branch: inline PTX set.nan comparing a against itself */
+    __half r;
+    asm("{set.nan.f16.f16 %0,%1,%2;\n}"
+        :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(a)));
+    return __HALF_TO_CUS(r) != 0U; /* any nonzero result bits are reported as true */
+, /* second branch: bit test on the raw 16-bit encoding */
+    const __half_raw hr = static_cast<__half_raw>(a);
+    return ((hr.x & (unsigned short)0x7FFFU) > (unsigned short)0x7C00U); /* sign bit masked off; a magnitude above 0x7C00 (the infinity pattern) is a NaN */
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hneg2(const __half2 a)
+{
+    __half2 r; /* written by exactly one of the two branches below */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, /* first branch: one packed neg.f16x2 instruction */
+    asm("{neg.f16x2 %0,%1;\n}"
+        :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
+, /* second branch: negate each half with the scalar __hneg */
+    r.x = __hneg(a.x);
+    r.y = __hneg(a.y);
+)
+    return r;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hneg(const __half a)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, /* first branch: one neg.f16 instruction */
+    __half r;
+    asm("{neg.f16 %0,%1;\n}"
+        :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
+    return r;
+, /* second branch: widen to float then negate then narrow back to half */
+    const float fa = __half2float(a);
+    return __float2half(-fa);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __habs2(const __half2 a)
+{
+    __half2 r; /* written by exactly one of the two branches below */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, /* first branch: one packed abs.f16x2 instruction */
+    asm("{abs.f16x2 %0,%1;\n}"
+        :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
+, /* second branch: take the absolute value of each half with the scalar __habs */
+    r.x = __habs(a.x);
+    r.y = __habs(a.y);
+)
+    return r;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __habs(const __half a)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, /* first branch: one abs.f16 instruction */
+    __half r;
+    asm("{abs.f16 %0,%1;\n}"
+        :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
+    return r;
+, /* second branch: work directly on the raw 16-bit encoding */
+    __half_raw abs_a_raw = static_cast<__half_raw>(a);
+    abs_a_raw.x &= (unsigned short)0x7FFFU; /* clear the sign bit (bit 15) */
+    if (abs_a_raw.x > (unsigned short)0x7C00U) /* magnitude above the infinity pattern: input was a NaN */
+    {
+        // return canonical NaN (any NaN payload is replaced by 0x7FFF)
+        abs_a_raw.x = (unsigned short)0x7FFFU;
+    }
+    return static_cast<__half>(abs_a_raw);
+)
+}
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c)
+{
+    /* Complex multiply-accumulate a*b + c, with .x as the real part and .y as
+     * the imaginary part. Each component is two chained fused multiply-adds:
+     *   re = fma(-a.im, b.im, fma(a.re, b.re, c.re))
+     *   im = fma( a.im, b.re, fma(a.re, b.im, c.im)) */
+    const __half re_first = __hfma(a.x, b.x, c.x);
+    const __half im_first = __hfma(a.x, b.y, c.y);
+    const __half re_acc = __hfma(__hneg(a.y), b.y, re_first);
+    const __half im_acc = __hfma(a.y, b.x, im_first);
+    return make_half2(re_acc, im_acc);
+}
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmax_nan(const __half a, const __half b)
+{
+    /* NaN-propagating maximum: a NaN in either operand gives CUDART_NAN_FP16. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    /* Targets providing SM 8.0: emitted through the max.NaN binary-op macro. */
+    __BINARY_OP_HALF_MACRO(max.NaN)
+,
+    /* Other targets: test for NaN explicitly and otherwise defer to the
+     * plain __hmax. */
+    const bool any_nan = __hisnan(a) || __hisnan(b);
+    if (any_nan)
+    {
+        return CUDART_NAN_FP16;
+    }
+    return __hmax(a, b);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmin_nan(const __half a, const __half b)
+{
+    /* NaN-propagating minimum: a NaN in either operand gives CUDART_NAN_FP16. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    /* Targets providing SM 8.0: emitted through the min.NaN binary-op macro. */
+    __BINARY_OP_HALF_MACRO(min.NaN)
+,
+    /* Other targets: test for NaN explicitly and otherwise defer to the
+     * plain __hmin. */
+    const bool any_nan = __hisnan(a) || __hisnan(b);
+    if (any_nan)
+    {
+        return CUDART_NAN_FP16;
+    }
+    return __hmin(a, b);
+)
+}
+
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, /* first branch: fma.rn.relu through the ternary-op macro (macro defined outside this excerpt) */
+    __TERNARY_OP_HALF_MACRO(fma.rn.relu)
+, /* second branch: plain __hfma followed by a NaN-propagating max against zero */
+    return __hmax_nan(__hfma(a, b, c), CUDART_ZERO_FP16);
+)
+}
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, /* first branch: max.NaN through the packed binary-op macro (macro defined outside this excerpt) */
+    __BINARY_OP_HALF2_MACRO(max.NaN)
+, /* second branch: plain __hmax2 and then overwrite each half that had a NaN input */
+    __half2 result = __hmax2(a, b);
+    if (__hisnan(a.x) || __hisnan(b.x))
+    {
+        result.x = CUDART_NAN_FP16; /* .x half: a NaN in either input forces NaN */
+    }
+    if (__hisnan(a.y) || __hisnan(b.y))
+    {
+        result.y = CUDART_NAN_FP16; /* .y half is checked independently of .x */
+    }
+    return result;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, /* first branch: min.NaN through the packed binary-op macro (macro defined outside this excerpt) */
+    __BINARY_OP_HALF2_MACRO(min.NaN)
+, /* second branch: plain __hmin2 and then overwrite each half that had a NaN input */
+    __half2 result = __hmin2(a, b);
+    if (__hisnan(a.x) || __hisnan(b.x))
+    {
+        result.x = CUDART_NAN_FP16; /* .x half: a NaN in either input forces NaN */
+    }
+    if (__hisnan(a.y) || __hisnan(b.y))
+    {
+        result.y = CUDART_NAN_FP16; /* .y half is checked independently of .x */
+    }
+    return result;
+)
+}
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, /* first branch: fma.rn.relu through the packed ternary-op macro (macro defined outside this excerpt) */
+    __TERNARY_OP_HALF2_MACRO(fma.rn.relu)
+, /* second branch: plain __hfma2 followed by a NaN-propagating max against zero */
+    __half2_raw hzero; /* raw pair with both 16-bit encodings set to 0 */
+    hzero.x = (unsigned short)0U;
+    hzero.y = (unsigned short)0U;
+    return __hmax2_nan(__hfma2(a, b, c), __half2(hzero));
+)
+}
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/* Define __PTR for atomicAdd prototypes below, undef after done */
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __PTR   "l"
+#else
+#define __PTR   "r"
+#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
+
+__CUDA_FP16_DECL__  __half2 atomicAdd(__half2 *const address, const __half2 val) {
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_60, /* first branch: one atom.add.noftz.f16x2 instruction */
+    __half2 r;
+    asm volatile ("{ atom.add.noftz.f16x2 %0,[%1],%2; }\n"
+                  : "=r"(__HALF2_TO_UI(r)) : __PTR(address), "r"(__HALF2_TO_CUI(val))
+                  : "memory");
+    return r; /* output operand %0 of the atom instruction */
+, /* second branch: compare-and-swap retry loop on the unsigned int that overlays *address */
+    unsigned int* address_as_uint = (unsigned int*)address;
+    unsigned int old = *address_as_uint;
+    unsigned int assumed;
+    do {
+        assumed = old; /* snapshot this attempt is based on */
+        __half2 new_val = __hadd2(val, *(__half2*)&assumed); /* snapshot reinterpreted as __half2 and added to val */
+        old = atomicCAS(address_as_uint, assumed, *(unsigned int*)&new_val); /* old receives what atomicCAS returns */
+    } while (assumed != old); /* returned value differs from the snapshot: retry from the returned value */
+    return *(__half2*)&old; /* last value returned by atomicCAS reinterpreted as __half2 */
+)
+}
+
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__  __half atomicAdd(__half *const address, const __half val) {
+    __half r; /* receives output operand %0 of the atom instruction */
+    asm volatile ("{ atom.add.noftz.f16 %0,[%1],%2; }\n"
+                  : "=h"(__HALF_TO_US(r))
+                  : __PTR(address), "h"(__HALF_TO_CUS(val)) /* __PTR is the pointer constraint string chosen by the #if just above these overloads */
+                  : "memory"); /* memory clobber on the asm statement */
+    return r; /* no fallback path here: the enclosing #if only admits __CUDA_ARCH__ >= 700 for device code */
+}
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) */
+
+#undef __PTR
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+#endif /* !(defined __DOXYGEN_ONLY__) */
+#endif /* defined(__cplusplus) */
+
+#undef __TERNARY_OP_HALF2_MACRO
+#undef __TERNARY_OP_HALF_MACRO
+#undef __BINARY_OP_HALF2_MACRO
+#undef __BINARY_OP_HALF_MACRO
+
+#undef __CUDA_HOSTDEVICE_FP16_DECL__
+#undef __CUDA_FP16_DECL__
+
+#undef __HALF_TO_US
+#undef __HALF_TO_CUS
+#undef __HALF2_TO_UI
+#undef __HALF2_TO_CUI
+#undef __CUDA_FP16_CONSTEXPR__
+
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16)
+#undef __CPP_VERSION_AT_LEAST_11_FP16
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+
+#undef ___CUDA_FP16_STRINGIFY_INNERMOST
+#undef __CUDA_FP16_STRINGIFY
+
+#endif /* end of include guard: __CUDA_FP16_HPP__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp4.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp4.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b291e7b78610171e9068b39ed5c503b35f45ad4
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp4.h
@@ -0,0 +1,357 @@
+/*
+ * Copyright 2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef __CUDA_FP4_H__
+#define __CUDA_FP4_H__
+
+/* Set up function decorations */
+#if defined(__CUDACC__)
+#define __CUDA_FP4_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_FP4__ __host__ __device__
+#define __CUDA_HOSTDEVICE_FP4_DECL__ static __host__ __device__ __inline__
+#else /* !defined(__CUDACC__) */
+#if defined(__GNUC__)
+#define __CUDA_HOSTDEVICE_FP4_DECL__ static __attribute__((unused))
+#else
+#define __CUDA_HOSTDEVICE_FP4_DECL__ static
+#endif /* defined(__GNUC__) */
+#define __CUDA_HOSTDEVICE_FP4__
+#endif /* defined(__CUDACC__) */
+
+#if !defined(_MSC_VER) && __cplusplus >= 201103L
+#define __CPP_VERSION_AT_LEAST_11_FP4
+#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
+#define __CPP_VERSION_AT_LEAST_11_FP4
+#endif
+
+/* bring in fp6 types infrastructure and dependencies */
+#include "cuda_fp6.h"
+
+/**
+ * \defgroup CUDA_MATH_INTRINSIC_FP4 FP4 Intrinsics
+ * This section describes fp4 intrinsic functions.
+ * To use these functions, include the header file \p cuda_fp4.h in your
+ * program.
+ *
+ * \note Most of the operations defined here benefit from native HW support
+ * when compiled for specific GPU targets (e.g. devices of compute capability 10.0a);
+ * other targets use an emulation path.
+ *
+ * The following macros are available to help users selectively enable/disable
+ * various definitions present in the header file:
+ * - \p __CUDA_NO_FP4_CONVERSIONS__ - If defined, this macro will prevent any
+ * use of the C++ type conversions (converting constructors and conversion
+ * operators) defined in the header.
+ * - \p __CUDA_NO_FP4_CONVERSION_OPERATORS__ - If defined, this macro will
+ * prevent any use of the  C++ conversion operators from \p fp4 to other types.
+ */
+
+/**
+ * \defgroup CUDA_MATH_FP4_MISC FP4 Conversion and Data Movement
+ * \ingroup CUDA_MATH_INTRINSIC_FP4
+ * To use these functions, include the header file \p cuda_fp4.h in your
+ * program.
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief 8-bit \p unsigned \p integer
+ * type abstraction used for \p fp4 floating-point
+ * numbers storage.
+ */
+typedef __nv_fp8_storage_t __nv_fp4_storage_t;
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief 8-bit \p unsigned \p integer
+ * type abstraction used for storage of pairs of
+ * \p fp4 floating-point numbers.
+ */
+typedef __nv_fp8_storage_t __nv_fp4x2_storage_t;
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief 16-bit \p unsigned \p integer
+ * type abstraction used for storage of tetrads of
+ * \p fp4 floating-point numbers.
+ */
+typedef __nv_fp8x2_storage_t __nv_fp4x4_storage_t;
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief Enumerates the possible
+ * interpretations of the 4-bit values when referring to them as
+ * \p fp4 types.
+ */
+typedef enum __nv_fp4_interpretation_t {
+    __NV_E2M1, /**< Stands for \p fp4 numbers of \p e2m1 kind. */
+} __nv_fp4_interpretation_t;
+
+/* Forward-declaration of C-style APIs */
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief Converts input \p double precision \p x to \p fp4 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input \p x to \p fp4 type of the kind specified by
+ * \p fp4_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp4_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4_storage_t
+__nv_cvt_double_to_fp4(const double x,
+                       const __nv_fp4_interpretation_t fp4_interpretation,
+                       const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief Converts input vector of two \p double precision numbers packed
+ * in \p double2 \p x into a vector of two values of \p fp4 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp4 values of the
+ * kind specified by \p fp4_interpretation parameter, using
+ * rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp4x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4x2_storage_t
+__nv_cvt_double2_to_fp4x2(const double2 x,
+                          const __nv_fp4_interpretation_t fp4_interpretation,
+                          const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief Converts input \p single precision \p x to \p fp4 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input \p x to \p fp4 type of the kind specified by
+ * \p fp4_interpretation parameter, using
+ * rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp4_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4_storage_t
+__nv_cvt_float_to_fp4(const float x,
+                      const __nv_fp4_interpretation_t fp4_interpretation,
+                      const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief Converts input vector of two \p single precision numbers packed
+ * in \p float2 \p x into a vector of two values of \p fp4 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp4 values of the
+ * kind specified by \p fp4_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp4x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4x2_storage_t
+__nv_cvt_float2_to_fp4x2(const float2 x,
+                         const __nv_fp4_interpretation_t fp4_interpretation,
+                         const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief Converts input \p half precision \p x to \p fp4 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input \p x to \p fp4 type of the kind specified by
+ * \p fp4_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp4_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4_storage_t
+__nv_cvt_halfraw_to_fp4(const __half_raw x,
+                        const __nv_fp4_interpretation_t fp4_interpretation,
+                        const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief Converts input vector of two \p half precision numbers packed
+ * in \p __half2_raw \p x into a vector of two values of \p fp4 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp4 values of the
+ * kind specified by \p fp4_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp4x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4x2_storage_t __nv_cvt_halfraw2_to_fp4x2(
+    const __half2_raw x,
+    const __nv_fp4_interpretation_t fp4_interpretation,
+    const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief Converts input \p nv_bfloat16 precision \p x to \p fp4 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input \p x to \p fp4 type of the kind specified by
+ * \p fp4_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp4_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4_storage_t __nv_cvt_bfloat16raw_to_fp4(
+    const __nv_bfloat16_raw x,
+    const __nv_fp4_interpretation_t fp4_interpretation,
+    const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief Converts input vector of two \p nv_bfloat16 precision numbers packed
+ * in \p __nv_bfloat162_raw \p x into a vector of two values of \p fp4 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp4 values of the
+ * kind specified by \p fp4_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp4x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4x2_storage_t
+__nv_cvt_bfloat16raw2_to_fp4x2(
+    const __nv_bfloat162_raw x,
+    const __nv_fp4_interpretation_t fp4_interpretation,
+    const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief Converts input \p fp4 \p x of the specified kind
+ * to \p half precision.
+ *
+ * \details Converts input \p x of \p fp4 type of the kind specified by
+ * \p fp4_interpretation parameter
+ * to \p half precision.
+ *
+ * \returns
+ * - The \p __half_raw value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __half_raw
+__nv_cvt_fp4_to_halfraw(const __nv_fp4_storage_t x,
+                        const __nv_fp4_interpretation_t fp4_interpretation);
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief Converts input vector of two \p fp4 values of the specified kind
+ * to a vector of two \p half precision values packed in \p __half2_raw
+ * structure.
+ *
+ * \details Converts input vector \p x of \p fp4 type of the kind specified by
+ * \p fp4_interpretation parameter
+ * to a vector of two \p half precision values and returns as \p __half2_raw
+ * structure.
+ *
+ * \returns
+ * - The \p __half2_raw value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __half2_raw
+__nv_cvt_fp4x2_to_halfraw2(const __nv_fp4x2_storage_t x,
+                           const __nv_fp4_interpretation_t fp4_interpretation);
+
+#if defined(__cplusplus)
+
+#define __CUDA_FP4_TYPES_EXIST__
+
+/* Forward-declaration of structures defined in "cuda_fp4.hpp" */
+struct __nv_fp4_e2m1;
+struct __nv_fp4x2_e2m1;
+struct __nv_fp4x4_e2m1;
+
+#endif /* defined(__cplusplus) */
+
+#include "cuda_fp4.hpp"
+
+#undef __CUDA_FP4_DECL__
+#undef __CUDA_HOSTDEVICE_FP4__
+#undef __CUDA_HOSTDEVICE_FP4_DECL__
+
+#if defined(__CPP_VERSION_AT_LEAST_11_FP4)
+#undef __CPP_VERSION_AT_LEAST_11_FP4
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP4) */
+
+#endif /* end of include guard: __CUDA_FP4_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp4.hpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp4.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..97716a433d797c49961ee0ba33b4fb84e9b3bb1d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp4.hpp
@@ -0,0 +1,953 @@
+/*
+ * Copyright 2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_FP4_HPP__)
+#define __CUDA_FP4_HPP__
+
+#if !defined(__CUDA_FP4_H__)
+#error "Do not include this file directly. Instead, include cuda_fp4.h."
+#endif
+
+/* C++ header for std::memcpy (used for type punning in host-side
+ * implementations). When compiling as a CUDA source file memcpy is provided
+ * implicitly. !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
+ */
+#if defined(__cplusplus) && !defined(__CUDACC__)
+#include <cstring>
+#elif !defined(__cplusplus) && !defined(__CUDACC__)
+#include <string.h>
+#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
+
+/*
+ * Bring in the standard assertions header to enforce the subset
+ * of rounding modes supported by the APIs defined here.
+ * NOTE: NVRTC defines its own assert
+ */
+#if !defined (__CUDACC_RTC__)
+#include <assert.h>
+#endif
+
+/* Set up structure-alignment attribute */
+#if !(defined __CUDA_ALIGN__)
+#if defined(__CUDACC__)
+#define __CUDA_ALIGN__(align) __align__(align)
+#else
+/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas"
+ * is available) */
+#if __cplusplus >= 201103L
+#define __CUDA_ALIGN__(n)                                                      \
+    alignas(n) /* C++11 kindly gives us a keyword for this */
+#else          /* !(__cplusplus >= 201103L) */
+#if defined(__GNUC__)
+#define __CUDA_ALIGN__(n) __attribute__((aligned(n)))
+#elif defined(_MSC_VER)
+#define __CUDA_ALIGN__(n) __declspec(align(n))
+#else
+#define __CUDA_ALIGN__(n)
+#endif /* defined(__GNUC__) */
+#endif /* __cplusplus >= 201103L */
+#endif /* defined(__CUDACC__) */
+#endif /* !(defined __CUDA_ALIGN__) */
+
+#if !(defined __CPP_VERSION_AT_LEAST_11_FP4)
+/* need c++11 for explicit operators */
+#define __CUDA_NO_FP4_CONVERSION_OPERATORS__
+#endif
+
+#if !(defined __DOXYGEN_ONLY__)
+
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4_storage_t
+__nv_cvt_double_to_fp4(const double x,
+                       const __nv_fp4_interpretation_t fp4_interpretation,
+                       const enum cudaRoundMode rounding) {
+    unsigned char res; /* magnitude bits of the result; the sign is OR'ed in just before returning */
+    unsigned long long int xbits; /* raw bit pattern of x, filled by the memcpy below */
+
+#if defined(__CUDACC__) || (!defined __cplusplus)
+    (void)memcpy(&xbits, &x, sizeof(x));
+#else
+    (void)std::memcpy(&xbits, &x, sizeof(x));
+#endif
+    unsigned char FP4_MAXNORM;
+    unsigned char FP4_MANTISSA_MASK;
+    unsigned short int FP4_EXP_BIAS;
+    unsigned long long int FP4_SIGNIFICAND_BITS;
+    unsigned long long int FP4_MINDENORM_O2;
+    unsigned long long int FP4_OVERFLOW_THRESHOLD;
+    unsigned long long int FP4_MINNORM;
+    const unsigned long long int DP_INF_BITS = 0x7FF0000000000000ULL; /* magnitudes above this pattern are treated as NaN below */
+
+    // fp4_interpretation == __NV_E2M1 (the parameter itself is never read in this function)
+    FP4_EXP_BIAS = 1U;
+    FP4_SIGNIFICAND_BITS = 2ULL; /* implicit leading bit plus the one stored mantissa bit */
+    FP4_MANTISSA_MASK = 0x1U;
+    FP4_MINDENORM_O2 = 0x3FD0000000000000ULL; // mindenorm/2 = 2^-2
+    FP4_OVERFLOW_THRESHOLD =
+        0x4018000000000000ULL; // maxnorm = 6.0
+    FP4_MAXNORM = 0x7U; /* returned (before the sign is applied) for overflow and NaN */
+    FP4_MINNORM = 0x3FF0000000000000ULL; // minnorm = 2^0
+
+    // 1/2 LSB of the target format, positioned in double precision mantissa
+    // helpful in midpoints detection during round-to-nearest-even step
+    const unsigned long long int FP4_DP_HALF_ULP =
+        (unsigned long long int)1ULL << (53ULL - FP4_SIGNIFICAND_BITS - 1ULL); /* evaluates to 1 << 50 */
+    // prepare sign bit in target format (bit 63 of xbits moved to bit 3)
+    unsigned char sign = (unsigned char)((xbits >> 63ULL) << 3U);
+    // prepare exponent field in target format (wraps modulo 256 when |x| < 0.5; the denormal branch relies on that)
+    unsigned char exp =
+        (unsigned char)((((unsigned short int)(xbits >> 52ULL)) & 0x7FFU) -
+                        1023U + FP4_EXP_BIAS);
+    // round mantissa to target format width, rounding towards zero
+    unsigned char mantissa =
+        (unsigned char)(xbits >> (53ULL - FP4_SIGNIFICAND_BITS)) &
+        FP4_MANTISSA_MASK;
+    unsigned long long int absx = xbits & 0x7FFFFFFFFFFFFFFFULL; /* xbits with the sign bit cleared */
+
+    /* NOTE(review): the two branches below never inspect rounding, so an unsupported mode is not asserted on for these inputs */
+    if (absx <= FP4_MINDENORM_O2) {
+        // zero or underflow
+        res = 0U;
+    } else if (absx > FP4_OVERFLOW_THRESHOLD) {
+        // overflow or NaN
+        if (absx > DP_INF_BITS)
+        {
+            // NaN converts to positive FP4_MAXNORM
+            sign = 0U;
+        }
+        res = FP4_MAXNORM;
+    } else if (absx >= FP4_MINNORM) {
+        res = (unsigned char)((exp << (FP4_SIGNIFICAND_BITS - 1U)) | mantissa); /* truncated result: exponent field above the mantissa bit */
+        // rounded-off bits
+        unsigned long long int round =
+            xbits & ((FP4_DP_HALF_ULP << 1ULL) - 1ULL);
+        if (rounding == cudaRoundNearest)
+        {
+            // round-to-nearest-even adjustment
+            if ((round > FP4_DP_HALF_ULP) ||
+                ((round == FP4_DP_HALF_ULP) && (mantissa & 1U))) {
+                res = (unsigned char)(res + 1U); /* a carry out of the mantissa bit moves into the exponent field */
+            }
+        } else {
+            assert(rounding == cudaRoundZero);
+        }
+    } else // Denormal range
+    {
+        unsigned char shift = (unsigned char)(1U - exp); /* 1 or 2 for the magnitudes that reach this branch (between 2^-2 and 2^0) */
+        // add implicit leading bit
+        mantissa |= (unsigned char)(1U << (FP4_SIGNIFICAND_BITS - 1U));
+        // additional round-off due to denormalization
+        res = (unsigned char)(mantissa >> shift);
+
+        if (rounding == cudaRoundNearest)
+        {
+            // rounded-off bits, including implicit leading bit
+            unsigned long long int round =
+                (xbits | ((unsigned long long int)1ULL << (53ULL - 1ULL))) &
+                ((FP4_DP_HALF_ULP << (shift + 1ULL)) - 1ULL);
+            // round-to-nearest-even adjustment
+            if ((round > (FP4_DP_HALF_ULP << shift)) ||
+                ((round == (FP4_DP_HALF_ULP << shift)) && (res & 1U))) {
+                res = (unsigned char)(res + 1U);
+            }
+        } else {
+            assert(rounding == cudaRoundZero);
+        }
+    }
+
+    res |= sign; /* applied on every path, so a negative input that flushed to zero still carries bit 3 */
+
+    return (__nv_fp4_storage_t)res;
+}
+
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4x2_storage_t
+__nv_cvt_double2_to_fp4x2(const double2 x,
+                          const __nv_fp4_interpretation_t fp4_interpretation,
+                          const enum cudaRoundMode rounding) {
+    /* Convert x.y first, then x.x; x.y is shifted left by four bits and x.x
+     * is OR'ed into the low bits of the packed storage value. */
+    const __nv_fp4_storage_t upper =
+        __nv_cvt_double_to_fp4(x.y, fp4_interpretation, rounding);
+    const __nv_fp4_storage_t lower =
+        __nv_cvt_double_to_fp4(x.x, fp4_interpretation, rounding);
+    return (__nv_fp4x2_storage_t)((__nv_fp4x2_storage_t)(upper << 4U) | lower);
+}
+
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4_storage_t
+__nv_cvt_float_to_fp4(const float x,
+                      const __nv_fp4_interpretation_t fp4_interpretation,
+                      const enum cudaRoundMode rounding) {
+    __nv_fp4_storage_t res = 0U;
+    assert((rounding == cudaRoundNearest) || (rounding == cudaRoundZero)); /* only these two rounding modes are accepted */
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    if (rounding == cudaRoundNearest) /* inline PTX path: taken only for round-to-nearest on the targets selected by the #if */
+    {
+        unsigned short storage; /* 16-bit register that receives the packed byte plus a zero byte */
+        // fp4_interpretation == __NV_E2M1
+        asm("{ .reg .b8 __$temp1;                           \n"
+            " cvt.rn.satfinite.e2m1x2.f32 __$temp1, %2, %1; \n"
+            " mov.b16 %0, {__$temp1, 0};                   }\n"
+            : "=h"(storage)
+            : "f"(x), "f"(0.0f)); /* %1 is x and %2 is a 0.0f filler for the other half of the two-value conversion */
+        res = (__nv_fp4_storage_t)storage;
+    } else
+#endif /* when the #if is false the block below is unconditional */
+    {
+        res = __nv_cvt_double_to_fp4((double)x, fp4_interpretation, rounding); /* software path: widen to double and reuse the double converter */
+    }
+    return res;
+}
+
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4x2_storage_t
+__nv_cvt_float2_to_fp4x2(const float2 x,
+                         const __nv_fp4_interpretation_t fp4_interpretation,
+                         const enum cudaRoundMode rounding) {
+    assert((rounding == cudaRoundNearest) || (rounding == cudaRoundZero));
+    unsigned short storage; // packed pair; assigned by exactly one of the two branches below
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    if (rounding == cudaRoundNearest) {
+        // fp4_interpretation == __NV_E2M1; one cvt handles both elements: x.x is %1, x.y is %2
+        asm("{ .reg .b8 __$temp1;                           \n"
+            " cvt.rn.satfinite.e2m1x2.f32 __$temp1, %2, %1; \n"
+            " mov.b16 %0, {__$temp1, 0};                   }\n"
+            : "=h"(storage)
+            : "f"(x.x), "f"(x.y));
+    } else
+#endif /* when the condition above is false, the braces below run unconditionally */
+    {
+        storage = (__nv_fp4x2_storage_t)__nv_cvt_float_to_fp4(
+            x.y, fp4_interpretation, rounding);
+        storage = (__nv_fp4x2_storage_t)(storage << 4U); // move x.y's encoding up by one nibble
+        storage = (__nv_fp4x2_storage_t)(storage | __nv_cvt_float_to_fp4(
+                                                       x.x,
+                                                       fp4_interpretation, rounding)); // OR x.x's encoding into the low bits
+    }
+    return (__nv_fp4x2_storage_t)storage;
+}
+
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4_storage_t
+__nv_cvt_halfraw_to_fp4(const __half_raw x,
+                        const __nv_fp4_interpretation_t fp4_interpretation,
+                        const enum cudaRoundMode rounding) {
+    // Only round-to-nearest and round-toward-zero are accepted.
+    assert((rounding == cudaRoundNearest) || (rounding == cudaRoundZero));
+    // Widen the half to float, then reuse the float -> fp4 converter.
+    const float widened = __internal_halfraw_to_float(x);
+    return __nv_cvt_float_to_fp4(widened, fp4_interpretation, rounding);
+}
+
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4x2_storage_t __nv_cvt_halfraw2_to_fp4x2(
+    const __half2_raw x,
+    const __nv_fp4_interpretation_t fp4_interpretation,
+    const enum cudaRoundMode rounding) {
+    assert((rounding == cudaRoundNearest) || (rounding == cudaRoundZero));
+    // Convert each element through a scalar __half_raw: x.x first (low
+    // bits of the result), then x.y (shifted up by one nibble).
+    __half_raw elem;
+    elem.x = x.x;
+    const __nv_fp4_storage_t nibble_lo =
+        __nv_cvt_halfraw_to_fp4(elem, fp4_interpretation, rounding);
+    elem.x = x.y;
+    const __nv_fp4_storage_t nibble_hi =
+        __nv_cvt_halfraw_to_fp4(elem, fp4_interpretation, rounding);
+    const unsigned int packed =
+        ((unsigned int)nibble_hi << 4U) | (unsigned int)nibble_lo;
+    return (__nv_fp4x2_storage_t)packed;
+}
+
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4_storage_t __nv_cvt_bfloat16raw_to_fp4(
+    const __nv_bfloat16_raw x,
+    const __nv_fp4_interpretation_t fp4_interpretation,
+    const enum cudaRoundMode rounding) {
+    // Widen the bfloat16 bits to float and delegate; the interpretation and
+    // rounding mode are forwarded unchanged to __nv_cvt_float_to_fp4.
+    return __nv_cvt_float_to_fp4(__internal_bf16raw_to_float(x),
+                                 fp4_interpretation, rounding);
+}
+
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4x2_storage_t
+__nv_cvt_bfloat16raw2_to_fp4x2(
+    const __nv_bfloat162_raw x,
+    const __nv_fp4_interpretation_t fp4_interpretation,
+    const enum cudaRoundMode rounding) {
+    // x.y is converted first and shifted up a nibble; x.x fills the low bits.
+    __nv_bfloat16_raw elem;
+    elem.x = x.y;
+    const __nv_fp4_storage_t nibble_hi =
+        __nv_cvt_bfloat16raw_to_fp4(elem, fp4_interpretation, rounding);
+    elem.x = x.x;
+    const __nv_fp4_storage_t nibble_lo =
+        __nv_cvt_bfloat16raw_to_fp4(elem, fp4_interpretation, rounding);
+    const unsigned int packed =
+        ((unsigned int)nibble_hi << 4U) | (unsigned int)nibble_lo;
+    return (__nv_fp4x2_storage_t)packed;
+}
+
+__CUDA_HOSTDEVICE_FP4_DECL__ __half2_raw
+__nv_cvt_fp4x2_to_halfraw2(const __nv_fp4x2_storage_t x,
+                           const __nv_fp4_interpretation_t fp4_interpretation); // forward declaration: called below, defined later
+__CUDA_HOSTDEVICE_FP4_DECL__ __half_raw
+__nv_cvt_fp4_to_halfraw(const __nv_fp4_storage_t x,
+                        const __nv_fp4_interpretation_t fp4_interpretation) {
+    __half_raw res; // half-precision bit pattern returned to the caller
+    res.x = 0U;
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    res.x = // arch-specific path: run the paired conversion on x and keep only its .x element
+        __nv_cvt_fp4x2_to_halfraw2((__nv_fp4x2_storage_t)x, fp4_interpretation)
+            .x;
+#else
+    {
+        // fp4_interpretation == __NV_E2M1
+        // convert to e2m3 first: keep the low 4 bits of x and shift them left by 2 to form the fp6 pattern
+        __nv_fp6_storage_t fp6e2m3 = (x & 0xFU) << 2U;
+        res = __nv_cvt_fp6_to_halfraw(fp6e2m3, __NV_E2M3); // then reuse the fp6 -> half converter
+    }
+#endif
+    return res;
+}
+
+__CUDA_HOSTDEVICE_FP4_DECL__ __half2_raw
+__nv_cvt_fp4x2_to_halfraw2(const __nv_fp4x2_storage_t x,
+                           const __nv_fp4_interpretation_t fp4_interpretation) {
+    __half2_raw res; // pair of half-precision bit patterns (.x and .y) returned to the caller
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    unsigned int half2_storage; // 32-bit asm output ("=r") receiving the f16x2 result
+    unsigned short tmp = (unsigned short)x; // the asm input uses the 16-bit "h" constraint
+    asm("{ .reg .b8 __$temp1, __$tempz;                 \n"
+        " mov.b16 {__$temp1, __$tempz}, %1;             \n"
+        " cvt.rn.f16x2.e2m1x2 %0, __$temp1;            }\n"
+        : "=r"(half2_storage)
+        : "h"(tmp));
+    (void)memcpy(&res, &half2_storage, sizeof(half2_storage)); // bit-copy the 32-bit result into res
+#else
+    res.x = // .x comes from x as passed; on this path the scalar converter masks it to its low 4 bits
+        __nv_cvt_fp4_to_halfraw((__nv_fp4_storage_t)x, fp4_interpretation).x;
+    res.y = __nv_cvt_fp4_to_halfraw((__nv_fp4_storage_t)(x >> 4U), // .y comes from x shifted right by one nibble
+                                    fp4_interpretation)
+                .x;
+#endif
+    return res;
+}
+
+__CUDA_HOSTDEVICE_FP4_DECL__ unsigned short int
+__internal_pack_u8x2_to_u16(const unsigned char src_lo,
+                            const unsigned char src_hi) {
+    // src_hi goes to bits [15:8] of the result, src_lo to bits [7:0]
+    return (unsigned short int)(((unsigned int)src_hi << 8U) |
+                                (unsigned int)src_lo);
+}
+
+#endif /* !(defined __DOXYGEN_ONLY__) */
+
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+
+/**
+ * \defgroup CUDA_MATH_FP4_E2M1_STRUCT C++ struct for handling fp4 data type of e2m1 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP4
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP4_E2M1_STRUCT
+ * \brief __nv_fp4_e2m1 datatype
+ *
+ * \details This structure implements the datatype for handling
+ * \p fp4 floating-point numbers of \p e2m1 kind:
+ * with 1 sign, 2 exponent, 1 implicit and 1 explicit mantissa bits.
+ * This encoding does not support Inf/NaN.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(1) __nv_fp4_e2m1 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP4_E2M1_STRUCT
+     * Storage variable contains the \p fp4 floating-point data.
+     */
+    __nv_fp4_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Constructor by default (contains no explicit initialization of \p __x).
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP4)
+    __nv_fp4_e2m1() = default;
+#else
+    __CUDA_HOSTDEVICE_FP4__ __nv_fp4_e2m1() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP4) */
+
+#if !defined(__CUDA_NO_FP4_CONVERSIONS__)
+
+    /* Construct from wider FP types */
+    /* Note: constructor init-lists are deliberately avoided below because of
+     * special host/device compilation rules; __x is assigned in the body */
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Constructor from \p __half data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values and \p cudaRoundNearest rounding mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4_e2m1(const __half f) {
+        __x = __nv_cvt_halfraw_to_fp4(static_cast<__half_raw>(f),
+                                      __NV_E2M1, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Constructor from \p __nv_bfloat16 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values and \p cudaRoundNearest rounding mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4_e2m1(const __nv_bfloat16 f) {
+        __x = __nv_cvt_bfloat16raw_to_fp4(static_cast<__nv_bfloat16_raw>(f),
+                                          __NV_E2M1, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Constructor from \p float data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values and \p cudaRoundNearest rounding mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4_e2m1(const float f) {
+        __x = __nv_cvt_float_to_fp4(f, __NV_E2M1, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Constructor from \p double data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values and \p cudaRoundNearest rounding mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4_e2m1(const double f) {
+        __x = __nv_cvt_double_to_fp4(f, __NV_E2M1, cudaRoundNearest);
+    }
+
+    /* Converts from integral: each constructor below casts val to float and reuses the float constructor */
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Constructor from \p unsigned \p short \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__
+    __nv_fp4_e2m1(const unsigned short int val) {
+        __x = static_cast<__nv_fp4_e2m1>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Constructor from \p unsigned \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4_e2m1(const unsigned int val) {
+        __x = static_cast<__nv_fp4_e2m1>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Constructor from \p unsigned \p long \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4_e2m1(const unsigned long int val) {
+        __x = static_cast<__nv_fp4_e2m1>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Constructor from \p unsigned \p long \p long \p int data type, relies on
+     * \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__
+    __nv_fp4_e2m1(const unsigned long long int val) {
+        __x = static_cast<__nv_fp4_e2m1>(static_cast<float>(val)).__x;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Constructor from \p short \p int data type; implemented exactly like the other integral constructors.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4_e2m1(const short int val) {
+        __x = static_cast<__nv_fp4_e2m1>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Constructor from \p int data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4_e2m1(const int val) {
+        __x = static_cast<__nv_fp4_e2m1>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Constructor from \p long \p int data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4_e2m1(const long int val) {
+        __x = static_cast<__nv_fp4_e2m1>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Constructor from \p long \p long \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4_e2m1(const long long int val) {
+        __x = static_cast<__nv_fp4_e2m1>(static_cast<float>(val)).__x;
+    }
+
+#if !defined(__CUDA_NO_FP4_CONVERSION_OPERATORS__)
+    /* Widening FP converts */
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Conversion operator to \p __half data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator __half() const {
+        return static_cast<__half>(__nv_cvt_fp4_to_halfraw(__x, __NV_E2M1));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Conversion operator to \p float data type (goes through the half-precision bit pattern).
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator float() const {
+        return __internal_halfraw_to_float(
+            __nv_cvt_fp4_to_halfraw(__x, __NV_E2M1));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Conversion operator to \p __nv_bfloat16 data type (goes through the \p float operator).
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator __nv_bfloat16() const {
+        return __float2bfloat16_rz(float(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Conversion operator to \p double data type (goes through the \p float operator).
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator double() const {
+        return static_cast<double>(float(*this));
+    }
+
+    /* Convert to integral */
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Conversion operator to \p unsigned \p char data type.
+     * Clamps negative inputs to zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator unsigned char() const {
+        unsigned char i;
+        const float f = float(*this);
+
+        if (f < 0.0f) {
+            // saturate minimum
+            i = 0U;
+        } else {
+            // normal value
+            i = static_cast<unsigned char>(f);
+        }
+        return i;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Conversion operator to \p unsigned \p short \p int data type.
+     * Clamps negative inputs to zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator unsigned short int() const {
+        return __half2ushort_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Conversion operator to \p unsigned \p int data type.
+     * Clamps negative inputs to zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator unsigned int() const {
+        return __half2uint_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Conversion operator to \p unsigned \p long \p int data type.
+     * Clamps negative inputs to zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator unsigned long int() const {
+        unsigned long retval;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(unsigned long) == sizeof(unsigned long long)) // choose the intrinsic by the width of unsigned long
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            retval = static_cast<unsigned long>(__half2ull_rz(__half(*this)));
+        }
+        else
+        {
+            retval = static_cast<unsigned long>(__half2uint_rz(__half(*this)));
+        }
+        return retval;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Conversion operator to \p unsigned \p long \p long \p int data type.
+     * Clamps negative inputs to zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator unsigned long long int() const {
+        return __half2ull_rz(__half(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Conversion operator to \p signed \p char data type (a cast of the \p float value).
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator signed char() const {
+        const float f = float(*this);
+        return static_cast<signed char>(f);
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Conversion operator to an implementation defined \p char data type.
+     *
+     * Detects signedness of the \p char type and proceeds accordingly, see
+     * further details in signed and unsigned char operators.
+     *
+     * Clamps inputs to the output range.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator char() const {
+        char value;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (((char)-1) < (char)0) // true when plain char is a signed type
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            value = static_cast<char>(static_cast<signed char>(*this));
+        }
+        else
+        {
+            value = static_cast<char>(static_cast<unsigned char>(*this));
+        }
+        return value;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Conversion operator to \p short \p int data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator short int() const {
+        return __half2short_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Conversion operator to \p int data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator int() const {
+        return __half2int_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Conversion operator to \p long \p int data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator long int() const {
+        long retval;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(long) == sizeof(long long)) // choose the intrinsic by the width of long
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            retval = static_cast<long>(__half2ll_rz(__half(*this)));
+        }
+        else
+        {
+            retval = static_cast<long>(__half2int_rz(__half(*this)));
+        }
+        return retval;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Conversion operator to \p long \p long \p int data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator long long int() const {
+        return __half2ll_rz(__half(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Conversion operator to \p bool data type.
+     * +0 and -0 inputs convert to \p false.
+     * Non-zero inputs convert to \p true.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator bool() const {
+        return (__x & 0x7U) != 0U; // only the low three bits are tested, so -0 is false as well
+    }
+#endif /* !defined(__CUDA_NO_FP4_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP4_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP4X2_E2M1_STRUCT C++ struct for handling vector type of two fp4 values of e2m1 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP4
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP4X2_E2M1_STRUCT
+ * \brief __nv_fp4x2_e2m1 datatype
+ *
+ * \details This structure implements the datatype for handling two
+ * \p fp4 floating-point numbers of \p e2m1 kind each.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(1) __nv_fp4x2_e2m1 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP4X2_E2M1_STRUCT
+     * Storage variable holding the two packed \p fp4 floating-point values.
+     */
+    __nv_fp4x2_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Default constructor.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP4)
+    __nv_fp4x2_e2m1() = default;
+#else
+    __CUDA_HOSTDEVICE_FP4__ __nv_fp4x2_e2m1() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP4) */
+
+#if !defined(__CUDA_NO_FP4_CONVERSIONS__)
+
+    /* Constructors from wider vector types; all use cudaRoundNearest */
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the pair from a \p __half2 value; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4x2_e2m1(const __half2 f) {
+        const __half2_raw bits = static_cast<__half2_raw>(f);
+        __x = __nv_cvt_halfraw2_to_fp4x2(bits, __NV_E2M1, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the pair from a \p __nv_bfloat162 value; out-of-range inputs
+     * follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4x2_e2m1(const __nv_bfloat162 f) {
+        const __nv_bfloat162_raw bits = static_cast<__nv_bfloat162_raw>(f);
+        __x = __nv_cvt_bfloat16raw2_to_fp4x2(bits, __NV_E2M1, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the pair from a \p float2 value; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4x2_e2m1(const float2 f) {
+        this->__x = __nv_cvt_float2_to_fp4x2(f, __NV_E2M1, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the pair from a \p double2 value; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4x2_e2m1(const double2 f) {
+        this->__x = __nv_cvt_double2_to_fp4x2(f, __NV_E2M1, cudaRoundNearest);
+    }
+
+#if !defined(__CUDA_NO_FP4_CONVERSION_OPERATORS__)
+    /* Widening conversion operators */
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Converts both stored values to a \p __half2.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator __half2() const {
+        const __half2_raw widened = __nv_cvt_fp4x2_to_halfraw2(__x, __NV_E2M1);
+        return static_cast<__half2>(widened);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Converts both stored values to a \p float2, via half precision.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator float2() const {
+        const __half2_raw widened = __nv_cvt_fp4x2_to_halfraw2(__x, __NV_E2M1);
+        return __internal_halfraw2_to_float2(widened);
+    }
+#endif /* !defined(__CUDA_NO_FP4_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP4_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP4X4_E2M1_STRUCT C++ struct for handling vector type of four fp4 values of e2m1 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP4
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP4X4_E2M1_STRUCT
+ * \brief __nv_fp4x4_e2m1 datatype
+ *
+ * \details This structure implements the datatype for handling four
+ * \p fp4 floating-point numbers of \p e2m1 kind each.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(2) __nv_fp4x4_e2m1 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP4X4_E2M1_STRUCT
+     * Storage variable holding the four packed \p fp4 floating-point values:
+     * two fp4x2 bytes, the first pair in the low byte.
+     */
+    __nv_fp4x4_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Default constructor.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP4)
+    __nv_fp4x4_e2m1() = default;
+#else
+    __CUDA_HOSTDEVICE_FP4__ __nv_fp4x4_e2m1() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP4) */
+
+#if !defined(__CUDA_NO_FP4_CONVERSIONS__)
+
+    /* Constructors from wider vector types; all use cudaRoundNearest */
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the quad from two \p __half2 values: \p flo fills the low byte,
+     * \p fhi the high byte. Out-of-range inputs follow \p __NV_SATFINITE.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4x4_e2m1(const __half2 flo,
+                                                     const __half2 fhi) {
+        const __nv_fp4x2_storage_t byte_lo = __nv_cvt_halfraw2_to_fp4x2(
+            static_cast<__half2_raw>(flo), __NV_E2M1, cudaRoundNearest);
+        const __nv_fp4x2_storage_t byte_hi = __nv_cvt_halfraw2_to_fp4x2(
+            static_cast<__half2_raw>(fhi), __NV_E2M1, cudaRoundNearest);
+        this->__x = __internal_pack_u8x2_to_u16(byte_lo, byte_hi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the quad from two \p __nv_bfloat162 values: \p flo fills the low
+     * byte, \p fhi the high byte. Out-of-range inputs follow \p __NV_SATFINITE.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4x4_e2m1(const __nv_bfloat162 flo,
+                                                     const __nv_bfloat162 fhi) {
+        const __nv_fp4x2_storage_t byte_lo = __nv_cvt_bfloat16raw2_to_fp4x2(
+            static_cast<__nv_bfloat162_raw>(flo), __NV_E2M1, cudaRoundNearest);
+        const __nv_fp4x2_storage_t byte_hi = __nv_cvt_bfloat16raw2_to_fp4x2(
+            static_cast<__nv_bfloat162_raw>(fhi), __NV_E2M1, cudaRoundNearest);
+        this->__x = __internal_pack_u8x2_to_u16(byte_lo, byte_hi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the quad from a \p float4: (x, y) fill the low byte, (z, w) the
+     * high byte. Out-of-range inputs follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4x4_e2m1(const float4 f) {
+        const float2 first_pair = {f.x, f.y};
+        const __nv_fp4x2_storage_t byte_lo =
+            __nv_cvt_float2_to_fp4x2(first_pair, __NV_E2M1, cudaRoundNearest);
+        const float2 second_pair = {f.z, f.w};
+        const __nv_fp4x2_storage_t byte_hi =
+            __nv_cvt_float2_to_fp4x2(second_pair, __NV_E2M1, cudaRoundNearest);
+        this->__x = __internal_pack_u8x2_to_u16(byte_lo, byte_hi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the quad from a \p double4: (x, y) fill the low byte, (z, w) the
+     * high byte. Out-of-range inputs follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4x4_e2m1(const double4 f) {
+        const double2 first_pair = {f.x, f.y};
+        const __nv_fp4x2_storage_t byte_lo =
+            __nv_cvt_double2_to_fp4x2(first_pair, __NV_E2M1, cudaRoundNearest);
+        const double2 second_pair = {f.z, f.w};
+        const __nv_fp4x2_storage_t byte_hi =
+            __nv_cvt_double2_to_fp4x2(second_pair, __NV_E2M1, cudaRoundNearest);
+        this->__x = __internal_pack_u8x2_to_u16(byte_lo, byte_hi);
+    }
+
+#if !defined(__CUDA_NO_FP4_CONVERSION_OPERATORS__)
+    /* Widening conversion operators */
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Converts the four stored values to a \p float4 vector, via half precision.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator float4() const {
+        // low byte of __x yields (x, y); high byte yields (z, w)
+        const __nv_fp4x2_storage_t lo = static_cast<__nv_fp4x2_storage_t>(__x);
+        const float2 first_pair = __internal_halfraw2_to_float2(
+            __nv_cvt_fp4x2_to_halfraw2(lo, __NV_E2M1));
+        const __nv_fp4x2_storage_t hi = static_cast<__nv_fp4x2_storage_t>(__x >> 8U);
+        const float2 second_pair = __internal_halfraw2_to_float2(
+            __nv_cvt_fp4x2_to_halfraw2(hi, __NV_E2M1));
+        float4 unpacked = {first_pair.x, first_pair.y, second_pair.x, second_pair.y};
+        return unpacked;
+    }
+#endif /* !defined(__CUDA_NO_FP4_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP4_CONVERSIONS__) */
+};
+
+#endif /* defined(__cplusplus) */
+
+#endif /* end of include guard: __CUDA_FP4_HPP__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp6.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp6.h
new file mode 100644
index 0000000000000000000000000000000000000000..6b0d6d5abf0781f4e378877d7395818898fa9ce0
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp6.h
@@ -0,0 +1,362 @@
+/*
+ * Copyright 2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef __CUDA_FP6_H__
+#define __CUDA_FP6_H__
+
+/* Set up function decorations */
+#if defined(__CUDACC__)
+#define __CUDA_FP6_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_FP6__ __host__ __device__
+#define __CUDA_HOSTDEVICE_FP6_DECL__ static __host__ __device__ __inline__
+#else /* !defined(__CUDACC__) */
+#if defined(__GNUC__)
+#define __CUDA_HOSTDEVICE_FP6_DECL__ static __attribute__((unused))
+#else
+#define __CUDA_HOSTDEVICE_FP6_DECL__ static
+#endif /* defined(__GNUC__) */
+#define __CUDA_HOSTDEVICE_FP6__
+#endif /* defined(__CUDACC__) */
+
+#if !defined(_MSC_VER) && __cplusplus >= 201103L
+#define __CPP_VERSION_AT_LEAST_11_FP6
+#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
+#define __CPP_VERSION_AT_LEAST_11_FP6
+#endif
+
+/* bring in fp8 types infrastructure and dependencies */
+#include "cuda_fp8.h"
+
+/**
+ * \defgroup CUDA_MATH_INTRINSIC_FP6 FP6 Intrinsics
+ * This section describes fp6 intrinsic functions.
+ * To use these functions, include the header file \p cuda_fp6.h in your
+ * program.
+ *
+ * \note Most of the operations defined here benefit from native HW support
+ * when compiled for specific GPU targets (e.g. devices of compute capability 10.0a),
+ * other targets use emulation path.
+ *
+ * The following macros are available to help users selectively enable/disable
+ * various definitions present in the header file:
+ * - \p __CUDA_NO_FP6_CONVERSIONS__ - If defined, this macro will prevent any
+ * use of the C++ type conversions (converting constructors and conversion
+ * operators) defined in the header.
+ * - \p __CUDA_NO_FP6_CONVERSION_OPERATORS__ - If defined, this macro will
+ * prevent any use of the  C++ conversion operators from \p fp6 to other types.
+ */
+
+/**
+ * \defgroup CUDA_MATH_FP6_MISC FP6 Conversion and Data Movement
+ * \ingroup CUDA_MATH_INTRINSIC_FP6
+ * To use these functions, include the header file \p cuda_fp6.h in your
+ * program.
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief 8-bit \p unsigned \p integer
+ * type abstraction used for \p fp6 floating-point
+ * numbers storage.
+ */
+typedef __nv_fp8_storage_t __nv_fp6_storage_t;
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief 16-bit \p unsigned \p integer
+ * type abstraction used for storage of pairs of
+ * \p fp6 floating-point numbers.
+ */
+typedef __nv_fp8x2_storage_t __nv_fp6x2_storage_t;
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief 32-bit \p unsigned \p integer
+ * type abstraction used for storage of tetrads of
+ * \p fp6 floating-point numbers.
+ */
+typedef __nv_fp8x4_storage_t __nv_fp6x4_storage_t;
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief Enumerates the possible
+ * interpretations of the 8-bit values when referring to them as
+ * \p fp6 types.
+ */
+typedef enum __nv_fp6_interpretation_t {
+    __NV_E2M3, /**< Stands for \p fp6 numbers of \p e2m3 kind. */
+    __NV_E3M2, /**< Stands for \p fp6 numbers of \p e3m2 kind. */
+} __nv_fp6_interpretation_t;
+
+/* Forward-declaration of C-style APIs */
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief Converts input \p double precision \p x to \p fp6 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input \p x to \p fp6 type of the kind specified by
+ * \p fp6_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp6_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6_storage_t
+__nv_cvt_double_to_fp6(const double x,
+                       const __nv_fp6_interpretation_t fp6_interpretation,
+                       const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief Converts input vector of two \p double precision numbers packed
+ * in \p double2 \p x into a vector of two values of \p fp6 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp6 values of the
+ * kind specified by \p fp6_interpretation parameter, using
+ * rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp6x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6x2_storage_t
+__nv_cvt_double2_to_fp6x2(const double2 x,
+                          const __nv_fp6_interpretation_t fp6_interpretation,
+                          const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief Converts input \p single precision \p x to \p fp6 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input \p x to \p fp6 type of the kind specified by
+ * \p fp6_interpretation parameter, using
+ * rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp6_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6_storage_t
+__nv_cvt_float_to_fp6(const float x,
+                      const __nv_fp6_interpretation_t fp6_interpretation,
+                      const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief Converts input vector of two \p single precision numbers packed
+ * in \p float2 \p x into a vector of two values of \p fp6 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp6 values of the
+ * kind specified by \p fp6_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp6x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6x2_storage_t
+__nv_cvt_float2_to_fp6x2(const float2 x,
+                         const __nv_fp6_interpretation_t fp6_interpretation,
+                         const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief Converts input \p half precision \p x to \p fp6 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input \p x to \p fp6 type of the kind specified by
+ * \p fp6_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp6_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6_storage_t
+__nv_cvt_halfraw_to_fp6(const __half_raw x,
+                        const __nv_fp6_interpretation_t fp6_interpretation,
+                        const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief Converts input vector of two \p half precision numbers packed
+ * in \p __half2_raw \p x into a vector of two values of \p fp6 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp6 values of the
+ * kind specified by \p fp6_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp6x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6x2_storage_t __nv_cvt_halfraw2_to_fp6x2(
+    const __half2_raw x,
+    const __nv_fp6_interpretation_t fp6_interpretation,
+    const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief Converts input \p nv_bfloat16 precision \p x to \p fp6 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input \p x to \p fp6 type of the kind specified by
+ * \p fp6_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp6_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6_storage_t __nv_cvt_bfloat16raw_to_fp6(
+    const __nv_bfloat16_raw x,
+    const __nv_fp6_interpretation_t fp6_interpretation,
+    const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief Converts input vector of two \p nv_bfloat16 precision numbers packed
+ * in \p __nv_bfloat162_raw \p x into a vector of two values of \p fp6 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp6 values of the
+ * kind specified by \p fp6_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp6x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6x2_storage_t
+__nv_cvt_bfloat16raw2_to_fp6x2(
+    const __nv_bfloat162_raw x,
+    const __nv_fp6_interpretation_t fp6_interpretation,
+    const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief Converts input \p fp6 \p x of the specified kind
+ * to \p half precision.
+ *
+ * \details Converts input \p x of \p fp6 type of the kind specified by
+ * \p fp6_interpretation parameter
+ * to \p half precision.
+ *
+ * \returns
+ * - The \p __half_raw value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __half_raw
+__nv_cvt_fp6_to_halfraw(const __nv_fp6_storage_t x,
+                        const __nv_fp6_interpretation_t fp6_interpretation);
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief Converts input vector of two \p fp6 values of the specified kind
+ * to a vector of two \p half precision values packed in \p __half2_raw
+ * structure.
+ *
+ * \details Converts input vector \p x of \p fp6 type of the kind specified by
+ * \p fp6_interpretation parameter
+ * to a vector of two \p half precision values and returns as \p __half2_raw
+ * structure.
+ *
+ * \returns
+ * - The \p __half2_raw value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __half2_raw
+__nv_cvt_fp6x2_to_halfraw2(const __nv_fp6x2_storage_t x,
+                           const __nv_fp6_interpretation_t fp6_interpretation);
+
+#if defined(__cplusplus)
+
+#define __CUDA_FP6_TYPES_EXIST__
+
+/* Forward-declaration of structures defined in "cuda_fp6.hpp" */
+struct __nv_fp6_e3m2;
+struct __nv_fp6x2_e3m2;
+struct __nv_fp6x4_e3m2;
+
+struct __nv_fp6_e2m3;
+struct __nv_fp6x2_e2m3;
+struct __nv_fp6x4_e2m3;
+
+#endif /* defined(__cplusplus) */
+
+#include "cuda_fp6.hpp"
+
+#undef __CUDA_FP6_DECL__
+#undef __CUDA_HOSTDEVICE_FP6__
+#undef __CUDA_HOSTDEVICE_FP6_DECL__
+
+#if defined(__CPP_VERSION_AT_LEAST_11_FP6)
+#undef __CPP_VERSION_AT_LEAST_11_FP6
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP6) */
+
+#endif /* end of include guard: __CUDA_FP6_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp6.hpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp6.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f3848cebcc7629972a58c8a330a25195c5c565d4
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp6.hpp
@@ -0,0 +1,1549 @@
+/*
+ * Copyright 2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_FP6_HPP__)
+#define __CUDA_FP6_HPP__
+
+#if !defined(__CUDA_FP6_H__)
+#error "Do not include this file directly. Instead, include cuda_fp6.h."
+#endif
+
+/* C++ header for std::memcpy (used for type punning in host-side
+ * implementations). When compiling as a CUDA source file memcpy is provided
+ * implicitly. !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
+ */
+#if defined(__cplusplus) && !defined(__CUDACC__)
+#include <cstring>
+#elif !defined(__cplusplus) && !defined(__CUDACC__)
+#include <string.h>
+#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
+
+/*
+ * Bring in the standard assertions header to enforce the subset
+ * of rounding modes supported by the APIs defined here.
+ * NOTE: NVRTC defines its own assert
+ */
+#if !defined (__CUDACC_RTC__)
+#include <assert.h>
+#endif
+
+/* Set up structure-alignment attribute */
+#if !(defined __CUDA_ALIGN__)
+#if defined(__CUDACC__)
+#define __CUDA_ALIGN__(align) __align__(align)
+#else
+/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas"
+ * is available) */
+#if __cplusplus >= 201103L
+#define __CUDA_ALIGN__(n)                                                      \
+    alignas(n) /* C++11 kindly gives us a keyword for this */
+#else          /* !(__cplusplus >= 201103L) */
+#if defined(__GNUC__)
+#define __CUDA_ALIGN__(n) __attribute__((aligned(n)))
+#elif defined(_MSC_VER)
+#define __CUDA_ALIGN__(n) __declspec(align(n))
+#else
+#define __CUDA_ALIGN__(n)
+#endif /* defined(__GNUC__) */
+#endif /* __cplusplus >= 201103L */
+#endif /* defined(__CUDACC__) */
+#endif /* !(defined __CUDA_ALIGN__) */
+
+#if !(defined __CPP_VERSION_AT_LEAST_11_FP6)
+/* need c++11 for explicit operators */
+#define __CUDA_NO_FP6_CONVERSION_OPERATORS__
+#endif
+
+#if !(defined __DOXYGEN_ONLY__)
+
+/*
+ * Software conversion of a double to a 6-bit floating-point code
+ * (1 sign bit in bit 5, exponent and mantissa in bits 4..0).
+ *
+ * x                  - value to convert.
+ * fp6_interpretation - __NV_E2M3 selects the e2m3 layout; any other value
+ *                      (including __NV_E3M2) selects the e3m2 layout.
+ * rounding           - cudaRoundNearest applies round-to-nearest-even;
+ *                      otherwise the result is truncated towards zero and an
+ *                      assert checks that cudaRoundZero was requested (only
+ *                      on the normal/denormal paths).
+ *
+ * Returns the encoded value. Magnitudes not above half of the smallest
+ * denormal become (signed) zero, magnitudes above the largest normal saturate
+ * to MAXNORM with the input sign, and NaN becomes positive MAXNORM.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6_storage_t
+__nv_cvt_double_to_fp6(const double x,
+                       const __nv_fp6_interpretation_t fp6_interpretation,
+                       const enum cudaRoundMode rounding) {
+    unsigned char res;
+    unsigned long long int xbits;
+
+    // copy the bit pattern of x into an integer so it can be inspected
+#if defined(__CUDACC__) || (!defined __cplusplus)
+    (void)memcpy(&xbits, &x, sizeof(x));
+#else
+    (void)std::memcpy(&xbits, &x, sizeof(x));
+#endif
+    // Per-format parameters, assigned by the switch below. The 64-bit
+    // thresholds are double-precision bit patterns that are compared
+    // against the magnitude bits of x.
+    unsigned char FP6_MAXNORM;
+    unsigned char FP6_MANTISSA_MASK;
+    unsigned short int FP6_EXP_BIAS;
+    unsigned long long int FP6_SIGNIFICAND_BITS;
+    unsigned long long int FP6_MINDENORM_O2;
+    unsigned long long int FP6_OVERFLOW_THRESHOLD;
+    unsigned long long int FP6_MINNORM;
+    const unsigned long long int DP_INF_BITS = 0x7FF0000000000000ULL;
+
+    switch (fp6_interpretation)
+    {
+        case __NV_E2M3:
+            FP6_EXP_BIAS = 1U;
+            // significand width counts the implicit leading bit
+            FP6_SIGNIFICAND_BITS = 4ULL;
+            FP6_MANTISSA_MASK = 0x7U;
+            FP6_MINDENORM_O2 = 0x3FB0000000000000ULL; // mindenorm/2 = 2^-4
+            FP6_OVERFLOW_THRESHOLD =
+                0x401E000000000000ULL; // maxnorm = 7.5
+            FP6_MAXNORM = 0x1FU;
+            FP6_MINNORM = 0x3FF0000000000000ULL; // minnorm = 2^0
+            break;
+        case __NV_E3M2:
+        default:
+            // unrecognized interpretation values are treated as e3m2
+            FP6_EXP_BIAS = 3U;
+            FP6_SIGNIFICAND_BITS = 3ULL;
+            FP6_MANTISSA_MASK = 0x3U;
+            FP6_MINDENORM_O2 = 0x3FA0000000000000ULL; // mindenorm/2 = 2^-5
+            FP6_OVERFLOW_THRESHOLD =
+                0x403C000000000000ULL; // maxnorm = 28
+            FP6_MAXNORM = 0x1FU;
+            FP6_MINNORM = 0x3FD0000000000000ULL; // minnorm = 2^-2
+            break;
+    }
+
+    // 1/2 LSB of the target format, positioned in double precision mantissa
+    // helpful in midpoints detection during round-to-nearest-even step
+    const unsigned long long int FP6_DP_HALF_ULP =
+        (unsigned long long int)1ULL << (53ULL - FP6_SIGNIFICAND_BITS - 1ULL);
+    // prepare sign bit in target format
+    // (bit 63 of the double is moved to bit 5 of the result)
+    unsigned char sign = (unsigned char)((xbits >> 63ULL) << 5U);
+    // prepare exponent field in target format
+    // (re-bias from 1023 to FP6_EXP_BIAS; the value wraps modulo 256 for
+    // inputs below the normal range, which the denormal branch relies on)
+    unsigned char exp =
+        (unsigned char)((((unsigned short int)(xbits >> 52ULL)) & 0x7FFU) -
+                        1023U + FP6_EXP_BIAS);
+    // round mantissa to target format width, rounding towards zero
+    unsigned char mantissa =
+        (unsigned char)(xbits >> (53ULL - FP6_SIGNIFICAND_BITS)) &
+        FP6_MANTISSA_MASK;
+    // magnitude bits of x (sign bit cleared), used for range classification
+    unsigned long long int absx = xbits & 0x7FFFFFFFFFFFFFFFULL;
+
+    if (absx <= FP6_MINDENORM_O2) {
+        // zero or underflow
+        res = 0U;
+    } else if (absx > FP6_OVERFLOW_THRESHOLD) {
+        // overflow or NaN
+        // (bit patterns above the infinity pattern are NaNs; infinity itself
+        // keeps its sign and saturates like any other large value)
+        if (absx > DP_INF_BITS)
+        {
+            // NaN converts to positive FP6_MAXNORM
+            sign = 0U;
+        }
+        res = FP6_MAXNORM;
+    } else if (absx >= FP6_MINNORM) {
+        // normal range: assemble the truncated exponent|mantissa code first
+        res = (unsigned char)((exp << (FP6_SIGNIFICAND_BITS - 1U)) | mantissa);
+        // rounded-off bits
+        unsigned long long int round =
+            xbits & ((FP6_DP_HALF_ULP << 1ULL) - 1ULL);
+        if (rounding == cudaRoundNearest)
+        {
+            // round-to-nearest-even adjustment
+            // (the increment may carry from the mantissa into the exponent)
+            if ((round > FP6_DP_HALF_ULP) ||
+                ((round == FP6_DP_HALF_ULP) && (mantissa & 1U))) {
+                res = (unsigned char)(res + 1U);
+            }
+        } else {
+            assert(rounding == cudaRoundZero);
+        }
+    } else // Denormal range
+    {
+        // number of extra bit positions lost because the target exponent
+        // field is pinned at its denormal value
+        unsigned char shift = (unsigned char)(1U - exp);
+        // add implicit leading bit
+        mantissa |= (unsigned char)(1U << (FP6_SIGNIFICAND_BITS - 1U));
+        // additional round-off due to denormalization
+        res = (unsigned char)(mantissa >> shift);
+
+        if (rounding == cudaRoundNearest)
+        {
+            // rounded-off bits, including implicit leading bit
+            unsigned long long int round =
+                (xbits | ((unsigned long long int)1ULL << (53ULL - 1ULL))) &
+                ((FP6_DP_HALF_ULP << (shift + 1ULL)) - 1ULL);
+            // round-to-nearest-even adjustment
+            // (midpoint and parity are evaluated at the shifted position)
+            if ((round > (FP6_DP_HALF_ULP << shift)) ||
+                ((round == (FP6_DP_HALF_ULP << shift)) && (res & 1U))) {
+                res = (unsigned char)(res + 1U);
+            }
+        } else {
+            assert(rounding == cudaRoundZero);
+        }
+    }
+
+    // merge the sign bit prepared above into the final code
+    res |= sign;
+
+    return (__nv_fp6_storage_t)res;
+}
+
+/*
+ * Converts both components of a double2 to fp6 and packs them into one
+ * 16-bit value: the code for x.y is shifted into the upper byte and the
+ * code for x.x is OR-ed into the lower byte.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6x2_storage_t
+__nv_cvt_double2_to_fp6x2(const double2 x,
+                          const __nv_fp6_interpretation_t fp6_interpretation,
+                          const enum cudaRoundMode rounding) {
+    // second component first, matching the original evaluation order
+    const __nv_fp6_storage_t y_code =
+        __nv_cvt_double_to_fp6(x.y, fp6_interpretation, rounding);
+    __nv_fp6x2_storage_t packed = (__nv_fp6x2_storage_t)y_code;
+    packed = (__nv_fp6x2_storage_t)(packed << 8U);
+
+    const __nv_fp6_storage_t x_code =
+        __nv_cvt_double_to_fp6(x.x, fp6_interpretation, rounding);
+    packed = (__nv_fp6x2_storage_t)(packed | x_code);
+    return packed;
+}
+
+/*
+ * Converts a float to an fp6 code of the requested kind.
+ *
+ * Only cudaRoundNearest and cudaRoundZero are accepted (checked by assert).
+ * When compiled for device code with __CUDA_ARCH__ >= 1000 and one of the
+ * SM100_ALL / SM101_ALL / SM120_ALL features, round-to-nearest requests use
+ * the PTX cvt instruction; every other case widens x to double and calls
+ * __nv_cvt_double_to_fp6.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6_storage_t
+__nv_cvt_float_to_fp6(const float x,
+                      const __nv_fp6_interpretation_t fp6_interpretation,
+                      const enum cudaRoundMode rounding) {
+    __nv_fp6_storage_t res = 0U;
+    assert((rounding == cudaRoundNearest) || (rounding == cudaRoundZero));
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    if (rounding == cudaRoundNearest)
+    {
+        // The instruction converts two floats at once; 0.0f is supplied as
+        // the partner value (operand order in the template is %0, %2, %1).
+        __nv_fp6x2_storage_t storage;
+        if (fp6_interpretation == __NV_E3M2) {
+            asm("{cvt.rn.satfinite.e3m2x2.f32 %0, %2, %1;}\n"
+                : "=h"(storage)
+                : "f"(x), "f"(0.0f));
+        } else {
+            asm("{cvt.rn.satfinite.e2m3x2.f32 %0, %2, %1;}\n"
+                : "=h"(storage)
+                : "f"(x), "f"(0.0f));
+        }
+        // narrowing cast keeps only the low part of the pair, i.e. the
+        // code produced for x
+        res = (__nv_fp6_storage_t)storage;
+    } else
+#endif
+    // NOTE: on the hardware path the "else" above binds to this block; on
+    // all other targets the block runs unconditionally.
+    {
+        res = __nv_cvt_double_to_fp6((double)x, fp6_interpretation, rounding);
+    }
+    return res;
+}
+
+/*
+ * Converts both components of a float2 to fp6 codes of the requested kind
+ * and returns them packed in one 16-bit value.
+ *
+ * Only cudaRoundNearest and cudaRoundZero are accepted (checked by assert).
+ * When compiled for device code with __CUDA_ARCH__ >= 1000 and one of the
+ * SM100_ALL / SM101_ALL / SM120_ALL features, round-to-nearest requests use
+ * a single PTX cvt instruction; otherwise each component is converted with
+ * __nv_cvt_float_to_fp6 and the two codes are packed manually.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6x2_storage_t
+__nv_cvt_float2_to_fp6x2(const float2 x,
+                         const __nv_fp6_interpretation_t fp6_interpretation,
+                         const enum cudaRoundMode rounding) {
+    assert((rounding == cudaRoundNearest) || (rounding == cudaRoundZero));
+    __nv_fp6x2_storage_t storage;
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    if (rounding == cudaRoundNearest) {
+        // operand order in the template is %0, %2, %1: x.y is passed before
+        // x.x to the instruction
+        if (fp6_interpretation == __NV_E3M2) {
+            asm("{cvt.rn.satfinite.e3m2x2.f32 %0, %2, %1;}\n"
+                : "=h"(storage)
+                : "f"(x.x), "f"(x.y));
+        } else {
+            asm("{cvt.rn.satfinite.e2m3x2.f32 %0, %2, %1;}\n"
+                : "=h"(storage)
+                : "f"(x.x), "f"(x.y));
+        }
+    } else
+#endif
+    // NOTE: on the hardware path the "else" above binds to this block; on
+    // all other targets the block runs unconditionally.
+    {
+        // code for x.y goes to the upper byte, code for x.x to the lower byte
+        storage = (__nv_fp6x2_storage_t)__nv_cvt_float_to_fp6(
+            x.y, fp6_interpretation, rounding);
+        storage = (__nv_fp6x2_storage_t)(storage << 8U);
+        storage = (__nv_fp6x2_storage_t)(storage | __nv_cvt_float_to_fp6(
+                                                       x.x,
+                                                       fp6_interpretation, rounding));
+    }
+    return storage;
+}
+
+/*
+ * Converts a raw half-precision value to an fp6 code by widening it to
+ * float and delegating to __nv_cvt_float_to_fp6.
+ * Only cudaRoundNearest and cudaRoundZero are accepted (checked by assert).
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6_storage_t
+__nv_cvt_halfraw_to_fp6(const __half_raw x,
+                        const __nv_fp6_interpretation_t fp6_interpretation,
+                        const enum cudaRoundMode rounding) {
+    assert((rounding == cudaRoundNearest) || (rounding == cudaRoundZero));
+    const float widened = __internal_halfraw_to_float(x);
+    return __nv_cvt_float_to_fp6(widened, fp6_interpretation, rounding);
+}
+
+/*
+ * Converts the two half-precision values of a __half2_raw to fp6 codes and
+ * packs them into one 16-bit value: the code for x.y occupies the upper
+ * byte, the code for x.x the lower byte.
+ * Only cudaRoundNearest and cudaRoundZero are accepted (checked by assert).
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6x2_storage_t __nv_cvt_halfraw2_to_fp6x2(
+    const __half2_raw x,
+    const __nv_fp6_interpretation_t fp6_interpretation,
+    const enum cudaRoundMode rounding) {
+    assert((rounding == cudaRoundNearest) || (rounding == cudaRoundZero));
+
+    // first component
+    __half_raw first_raw;
+    first_raw.x = x.x;
+    const __nv_fp6_storage_t low_code =
+        __nv_cvt_halfraw_to_fp6(first_raw, fp6_interpretation, rounding);
+
+    // second component
+    __half_raw second_raw;
+    second_raw.x = x.y;
+    const __nv_fp6_storage_t high_code =
+        __nv_cvt_halfraw_to_fp6(second_raw, fp6_interpretation, rounding);
+
+    // pack: second component in the upper byte, first in the lower byte
+    const __nv_fp6x2_storage_t upper =
+        (__nv_fp6x2_storage_t)((__nv_fp6x2_storage_t)high_code << 8U);
+    return (__nv_fp6x2_storage_t)(upper | low_code);
+}
+
+/*
+ * Converts a raw bfloat16 value to an fp6 code by widening it to float and
+ * delegating to __nv_cvt_float_to_fp6 (which validates the rounding mode).
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6_storage_t __nv_cvt_bfloat16raw_to_fp6(
+    const __nv_bfloat16_raw x,
+    const __nv_fp6_interpretation_t fp6_interpretation,
+    const enum cudaRoundMode rounding) {
+    const float widened = __internal_bf16raw_to_float(x);
+    return __nv_cvt_float_to_fp6(widened, fp6_interpretation, rounding);
+}
+
+/*
+ * Converts the two bfloat16 values of a __nv_bfloat162_raw to fp6 codes and
+ * packs them into one 16-bit value: the code for x.y occupies the upper
+ * byte, the code for x.x the lower byte.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6x2_storage_t
+__nv_cvt_bfloat16raw2_to_fp6x2(
+    const __nv_bfloat162_raw x,
+    const __nv_fp6_interpretation_t fp6_interpretation,
+    const enum cudaRoundMode rounding) {
+    // second component first, matching the original evaluation order
+    __nv_bfloat16_raw second_raw;
+    second_raw.x = x.y;
+    const __nv_fp6_storage_t high_code =
+        __nv_cvt_bfloat16raw_to_fp6(second_raw, fp6_interpretation, rounding);
+
+    __nv_bfloat16_raw first_raw;
+    first_raw.x = x.x;
+    const __nv_fp6_storage_t low_code =
+        __nv_cvt_bfloat16raw_to_fp6(first_raw, fp6_interpretation, rounding);
+
+    const __nv_fp6x2_storage_t upper =
+        (__nv_fp6x2_storage_t)((__nv_fp6x2_storage_t)high_code << 8U);
+    return (__nv_fp6x2_storage_t)(upper | low_code);
+}
+
+/* Declared ahead of its definition because the hardware path of
+ * __nv_cvt_fp6_to_halfraw below calls it. */
+__CUDA_HOSTDEVICE_FP6_DECL__ __half2_raw
+__nv_cvt_fp6x2_to_halfraw2(const __nv_fp6x2_storage_t x,
+                           const __nv_fp6_interpretation_t fp6_interpretation);
+/*
+ * Converts one fp6 code of the given kind to raw half-precision bits.
+ *
+ * When compiled for device code with __CUDA_ARCH__ >= 1000 and one of the
+ * SM100_ALL / SM101_ALL / SM120_ALL features, the value is converted through
+ * the paired routine and the first element is taken. Otherwise the half
+ * bit pattern is assembled by hand from the sign, exponent and mantissa
+ * fields, normalizing fp6 denormals along the way.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __half_raw
+__nv_cvt_fp6_to_halfraw(const __nv_fp6_storage_t x,
+                        const __nv_fp6_interpretation_t fp6_interpretation) {
+    __half_raw res;
+    res.x = 0U;
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    // hardware path: widen to a pair and keep the .x element of the result
+    res.x =
+        __nv_cvt_fp6x2_to_halfraw2((__nv_fp6x2_storage_t)x, fp6_interpretation)
+            .x;
+#else
+    {
+        // move the 6-bit code to the top of a 16-bit word: the sign (bit 5)
+        // lands on bit 15; any input bits above bit 5 are shifted out
+        unsigned short int ur = (unsigned short int)x;
+        ur = (unsigned short int)(ur << 10U);
+
+        unsigned short int sign = ur & 0x8000U;
+        unsigned short int exponent;
+        unsigned short int bias_difference;
+        unsigned short int mantissa;
+
+        // Align the exponent field so its LSB sits at bit 10 and put the
+        // mantissa directly below it; then re-bias the exponent by the
+        // difference of the two biases, pre-shifted to bit 10.
+        if (fp6_interpretation == __NV_E3M2) {
+            bias_difference = (unsigned short int)(15 - 3) << 10U;
+            exponent = (unsigned short int)(((ur & 0x7000U) >> 2U) + bias_difference);
+            mantissa = (ur & 0x0C00U) >> 2U;
+        } else {
+            //__NV_E2M3
+            bias_difference = (unsigned short int)(15 - 1) << 10U;
+            exponent = (unsigned short int)(((ur & 0x6000U) >> 3U) + bias_difference);
+            mantissa = (ur & 0x1C00U) >> 3U;
+        }
+
+        // equality here means the fp6 exponent field was zero
+        if (exponent == bias_difference) {
+            // zero or denormal
+            if (mantissa != 0U) {
+                // normalize
+                // (shift until a set bit reaches bit 10, lowering the
+                // exponent field by one step for every extra shift)
+                mantissa = (unsigned short int)(mantissa << 1U);
+                while ((mantissa & 0x0400U) == 0U) {
+                    mantissa = (unsigned short int)(mantissa << 1U);
+                    exponent = (unsigned short int)(exponent - 0x0400U);
+                }
+                // discard implicit leading bit
+                mantissa &= 0x03FFU;
+            } else { // Zero
+                exponent = 0U;
+            }
+        }
+
+        // combine the three fields into the final 16-bit pattern
+        res.x = (sign | exponent) | mantissa;
+    }
+#endif
+    return res;
+}
+
+/*
+ * Converts a packed pair of fp6 codes of the given kind to two raw
+ * half-precision values.
+ *
+ * When compiled for device code with __CUDA_ARCH__ >= 1000 and one of the
+ * SM100_ALL / SM101_ALL / SM120_ALL features, a single PTX cvt instruction
+ * produces both halves. Otherwise each byte is converted separately with
+ * __nv_cvt_fp6_to_halfraw: the low byte yields .x, the high byte yields .y.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __half2_raw
+__nv_cvt_fp6x2_to_halfraw2(const __nv_fp6x2_storage_t x,
+                           const __nv_fp6_interpretation_t fp6_interpretation) {
+    __half2_raw res;
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    // the instruction writes a 32-bit register holding both half values
+    unsigned int half2_storage;
+    if (fp6_interpretation == __NV_E3M2) {
+        asm("{cvt.rn.f16x2.e3m2x2 %0, %1;}\n" : "=r"(half2_storage) : "h"(x));
+    } else {
+        asm("{cvt.rn.f16x2.e2m3x2 %0, %1;}\n" : "=r"(half2_storage) : "h"(x));
+    }
+    // copy the register bits into the result structure without conversion
+    (void)memcpy(&res, &half2_storage, sizeof(half2_storage));
+#else
+    // narrowing cast selects the low byte of the pair
+    res.x =
+        __nv_cvt_fp6_to_halfraw((__nv_fp6_storage_t)x, fp6_interpretation).x;
+    // shifting by 8 selects the high byte of the pair
+    res.y = __nv_cvt_fp6_to_halfraw((__nv_fp6_storage_t)(x >> 8U),
+                                    fp6_interpretation)
+                .x;
+#endif
+    return res;
+}
+
+#endif /* !(defined __DOXYGEN_ONLY__) */
+
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+
+/**
+ * \defgroup CUDA_MATH_FP6_E3M2_STRUCT C++ struct for handling fp6 data type of e3m2 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP6
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP6_E3M2_STRUCT
+ * \brief __nv_fp6_e3m2 datatype
+ *
+ * \details This structure implements the datatype for handling
+ * \p fp6 floating-point numbers of \p e3m2 kind:
+ * with 1 sign, 3 exponent, 1 implicit and 2 explicit mantissa bits.
+ * This encoding does not support Inf/NaN.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(1) __nv_fp6_e3m2 { // single e3m2 fp6 value with its converting constructors/operators
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP6_E3M2_STRUCT
+     * Storage variable contains the \p fp6 floating-point data.
+     * Every converting constructor below assigns it and every conversion
+     * operator reads it.
+     */
+    __nv_fp6_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor by default.
+     * The fallback used when \p __CPP_VERSION_AT_LEAST_11_FP6 is not defined
+     * has an empty body and does not assign \p __x.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP6)
+    __nv_fp6_e3m2() = default;
+#else
+    __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e3m2() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP6) */
+
+#if !defined(__CUDA_NO_FP6_CONVERSIONS__)
+
+    /* Construct from wider FP types */
+    /* Note we do avoid constructor init-list because of special host/device
+     * compilation rules */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p __half data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values and \p cudaRoundNearest rounding mode.
+     * Forwards the raw half bits to \p __nv_cvt_halfraw_to_fp6 with the
+     * \p __NV_E3M2 interpretation.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e3m2(const __half f) {
+        __x = __nv_cvt_halfraw_to_fp6(static_cast<__half_raw>(f),
+                                      __NV_E3M2, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p __nv_bfloat16 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values and \p cudaRoundNearest rounding mode.
+     * Forwards the raw bfloat16 bits to \p __nv_cvt_bfloat16raw_to_fp6 with
+     * the \p __NV_E3M2 interpretation.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e3m2(const __nv_bfloat16 f) {
+        __x = __nv_cvt_bfloat16raw_to_fp6(static_cast<__nv_bfloat16_raw>(f),
+                                          __NV_E3M2, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p float data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values and \p cudaRoundNearest rounding mode.
+     * All integral constructors below delegate to this one.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e3m2(const float f) {
+        __x = __nv_cvt_float_to_fp6(f, __NV_E3M2, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p double data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values and \p cudaRoundNearest rounding mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e3m2(const double f) {
+        __x = __nv_cvt_double_to_fp6(f, __NV_E3M2, cudaRoundNearest);
+    }
+
+    /* Converts from integral */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p unsigned \p short \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     * Converts \p val to \p float and reuses the \p float constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__
+    __nv_fp6_e3m2(const unsigned short int val) {
+        __x = static_cast<__nv_fp6_e3m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p unsigned \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     * Converts \p val to \p float and reuses the \p float constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e3m2(const unsigned int val) {
+        __x = static_cast<__nv_fp6_e3m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p unsigned \p long \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     * Converts \p val to \p float and reuses the \p float constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e3m2(const unsigned long int val) {
+        __x = static_cast<__nv_fp6_e3m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p unsigned \p long \p long \p int data type, relies on
+     * \p __NV_SATFINITE behavior for out-of-range values.
+     * Converts \p val to \p float and reuses the \p float constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__
+    __nv_fp6_e3m2(const unsigned long long int val) {
+        __x = static_cast<__nv_fp6_e3m2>(static_cast<float>(val)).__x;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p short \p int data type.
+     * Converts \p val to \p float and reuses the \p float constructor, so
+     * out-of-range values are handled exactly as they are for \p float input.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e3m2(const short int val) {
+        __x = static_cast<__nv_fp6_e3m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p int data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     * Converts \p val to \p float and reuses the \p float constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e3m2(const int val) {
+        __x = static_cast<__nv_fp6_e3m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p long \p int data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     * Converts \p val to \p float and reuses the \p float constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e3m2(const long int val) {
+        __x = static_cast<__nv_fp6_e3m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p long \p long \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     * Converts \p val to \p float and reuses the \p float constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e3m2(const long long int val) {
+        __x = static_cast<__nv_fp6_e3m2>(static_cast<float>(val)).__x;
+    }
+
+#if !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__)
+    /* Widening FP converts */
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p __half data type.
+     * Decodes \p __x with \p __nv_cvt_fp6_to_halfraw using the \p __NV_E3M2
+     * interpretation. The integral operators below build on this one.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator __half() const {
+        return static_cast<__half>(__nv_cvt_fp6_to_halfraw(__x, __NV_E3M2));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p float data type.
+     * Decodes to the raw half representation first, then widens that with
+     * \p __internal_halfraw_to_float.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator float() const {
+        return __internal_halfraw_to_float(
+            __nv_cvt_fp6_to_halfraw(__x, __NV_E3M2));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p __nv_bfloat16 data type.
+     * Goes through the \p float operator and \p __float2bfloat16_rz.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator __nv_bfloat16() const {
+        return __float2bfloat16_rz(float(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p double data type.
+     * Goes through the \p float operator and widens the result.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator double() const {
+        return static_cast<double>(float(*this));
+    }
+
+    /* Convert to integral */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p unsigned \p char data type.
+     * Clamps negative inputs to zero.
+     * Non-negative inputs are converted with a plain \p static_cast from the
+     * \p float value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator unsigned char() const {
+        unsigned char i;
+        const float f = float(*this);
+
+        if (f < 0.0f) {
+            // saturate minimum: negative values map to 0
+            i = 0U;
+        } else {
+            // normal value: plain float-to-integer cast
+            i = static_cast<unsigned char>(f);
+        }
+        return i;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p unsigned \p short \p int data type.
+     * Clamps negative inputs to zero.
+     * Delegates to \p __half2ushort_rz on the \p __half form of this value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator unsigned short int() const {
+        return __half2ushort_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p unsigned \p int data type.
+     * Clamps negative inputs to zero.
+     * Delegates to \p __half2uint_rz on the \p __half form of this value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator unsigned int() const {
+        return __half2uint_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p unsigned \p long \p int data type.
+     * Clamps negative inputs to zero.
+     * Uses the 64-bit helper when \p unsigned \p long has the same size as
+     * \p unsigned \p long \p long, and the 32-bit helper otherwise. The size
+     * comparison is a constant expression, which is why the MSVC C4127
+     * warning is suppressed around it.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator unsigned long int() const {
+        unsigned long retval;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(unsigned long) == sizeof(unsigned long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            retval = static_cast<unsigned long>(__half2ull_rz(__half(*this)));
+        }
+        else
+        {
+            retval = static_cast<unsigned long>(__half2uint_rz(__half(*this)));
+        }
+        return retval;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p unsigned \p long \p long \p int data type.
+     * Clamps negative inputs to zero.
+     * Delegates to \p __half2ull_rz on the \p __half form of this value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator unsigned long long int() const {
+        return __half2ull_rz(__half(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p signed \p char data type.
+     * Plain \p static_cast of the \p float value; no explicit clamping is
+     * performed in this operator.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator signed char() const {
+        const float f = float(*this);
+        return static_cast<signed char>(f);
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to an implementation defined \p char data type.
+     *
+     * Detects signedness of the \p char type and proceeds accordingly, see
+     * further details in signed and unsigned char operators.
+     *
+     * Clamps inputs to the output range.
+     *
+     * The signedness test is a constant expression, which is why the MSVC
+     * C4127 warning is suppressed around it.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator char() const {
+        char value;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (((char)-1) < (char)0) // true only when plain char is a signed type
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            value = static_cast<char>(static_cast<signed char>(*this));
+        }
+        else
+        {
+            value = static_cast<char>(static_cast<unsigned char>(*this));
+        }
+        return value;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p short \p int data type.
+     * Delegates to \p __half2short_rz on the \p __half form of this value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator short int() const {
+        return __half2short_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p int data type.
+     * Delegates to \p __half2int_rz on the \p __half form of this value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator int() const {
+        return __half2int_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p long \p int data type.
+     * Uses the 64-bit helper when \p long has the same size as \p long
+     * \p long, and the 32-bit helper otherwise. The size comparison is a
+     * constant expression, which is why the MSVC C4127 warning is suppressed
+     * around it.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator long int() const {
+        long retval;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(long) == sizeof(long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            retval = static_cast<long>(__half2ll_rz(__half(*this)));
+        }
+        else
+        {
+            retval = static_cast<long>(__half2int_rz(__half(*this)));
+        }
+        return retval;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p long \p long \p int data type.
+     * Delegates to \p __half2ll_rz on the \p __half form of this value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator long long int() const {
+        return __half2ll_rz(__half(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p bool data type.
+     * +0 and -0 inputs convert to \p false.
+     * Non-zero inputs convert to \p true.
+     * Only the five low storage bits are tested (the 3 exponent and 2
+     * mantissa bits of this format), so the sign does not affect the result.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator bool() const {
+        return (__x & 0x1FU) != 0U; // 0x1F drops the sign bit and anything above it
+    }
+#endif /* !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP6_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP6X2_E3M2_STRUCT C++ struct for handling vector type of two fp6 values of e3m2 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP6
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP6X2_E3M2_STRUCT
+ * \brief __nv_fp6x2_e3m2 datatype
+ *
+ * \details This structure implements the datatype for handling two
+ * \p fp6 floating-point numbers of \p e3m2 kind each.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(2) __nv_fp6x2_e3m2 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP6X2_E3M2_STRUCT
+     * Packed storage for the pair of \p fp6 values. Written by every
+     * converting constructor and read by every conversion operator.
+     */
+    __nv_fp6x2_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Default constructor; performs no conversion.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP6)
+    __nv_fp6x2_e3m2() = default;
+#else
+    __CUDA_HOSTDEVICE_FP6__ __nv_fp6x2_e3m2() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP6) */
+
+#if !defined(__CUDA_NO_FP6_CONVERSIONS__)
+
+    /* Narrowing constructors from wider vector types. Members are assigned in
+     * the body rather than in an init-list. */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Builds the pair from a \p __half2 value. Conversion uses the
+     * \p __NV_E3M2 interpretation with \p cudaRoundNearest and relies on
+     * \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x2_e3m2(const __half2 f) {
+        const __half2_raw raw_pair = static_cast<__half2_raw>(f);
+        __x = __nv_cvt_halfraw2_to_fp6x2(raw_pair, __NV_E3M2,
+                                         cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Builds the pair from a \p __nv_bfloat162 value. Conversion uses the
+     * \p __NV_E3M2 interpretation with \p cudaRoundNearest and relies on
+     * \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x2_e3m2(const __nv_bfloat162 f) {
+        const __nv_bfloat162_raw raw_pair =
+            static_cast<__nv_bfloat162_raw>(f);
+        __x = __nv_cvt_bfloat16raw2_to_fp6x2(raw_pair, __NV_E3M2,
+                                             cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Builds the pair from a \p float2 value. Conversion uses the
+     * \p __NV_E3M2 interpretation with \p cudaRoundNearest and relies on
+     * \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x2_e3m2(const float2 f) {
+        const __nv_fp6x2_storage_t packed =
+            __nv_cvt_float2_to_fp6x2(f, __NV_E3M2, cudaRoundNearest);
+        __x = packed;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Builds the pair from a \p double2 value. Conversion uses the
+     * \p __NV_E3M2 interpretation with \p cudaRoundNearest and relies on
+     * \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x2_e3m2(const double2 f) {
+        const __nv_fp6x2_storage_t packed =
+            __nv_cvt_double2_to_fp6x2(f, __NV_E3M2, cudaRoundNearest);
+        __x = packed;
+    }
+
+#if !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__)
+    /* Widening conversion operators */
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Widens both elements to a \p __half2 value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator __half2() const {
+        const __half2_raw widened = __nv_cvt_fp6x2_to_halfraw2(__x, __NV_E3M2);
+        return static_cast<__half2>(widened);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Widens both elements to a \p float2 value, going through the raw
+     * half2 representation.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator float2() const {
+        const __half2_raw widened = __nv_cvt_fp6x2_to_halfraw2(__x, __NV_E3M2);
+        return __internal_halfraw2_to_float2(widened);
+    }
+#endif /* !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP6_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP6X4_E3M2_STRUCT C++ struct for handling vector type of four fp6 values of e3m2 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP6
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP6X4_E3M2_STRUCT
+ * \brief __nv_fp6x4_e3m2 datatype
+ *
+ * \details This structure implements the datatype for handling four
+ * \p fp6 floating-point numbers of \p e3m2 kind each.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(4) __nv_fp6x4_e3m2 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP6X4_E3M2_STRUCT
+     * Packed storage for the four \p fp6 values. The \p float4 operator
+     * reads elements x and y from the low 16 bits and elements z and w from
+     * the high 16 bits.
+     */
+    __nv_fp6x4_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Default constructor; performs no conversion.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP6)
+    __nv_fp6x4_e3m2() = default;
+#else
+    __CUDA_HOSTDEVICE_FP6__ __nv_fp6x4_e3m2() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP6) */
+
+#if !defined(__CUDA_NO_FP6_CONVERSIONS__)
+
+    /* Narrowing constructors from wider vector types. Each one converts two
+     * pairs separately and packs the two 16-bit results into \p __x. */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Builds the four values from two \p __half2 pairs: \p flo is converted
+     * into the first packed pair and \p fhi into the second. Relies on
+     * \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x4_e3m2(const __half2 flo,
+                                                     const __half2 fhi) {
+        const __half2_raw raw_lo = static_cast<__half2_raw>(flo);
+        const __half2_raw raw_hi = static_cast<__half2_raw>(fhi);
+        const __nv_fp6x2_storage_t packed_lo =
+            __nv_cvt_halfraw2_to_fp6x2(raw_lo, __NV_E3M2, cudaRoundNearest);
+        const __nv_fp6x2_storage_t packed_hi =
+            __nv_cvt_halfraw2_to_fp6x2(raw_hi, __NV_E3M2, cudaRoundNearest);
+        __x = __internal_pack_u16x2_to_u32(packed_lo, packed_hi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Builds the four values from two \p __nv_bfloat162 pairs: \p flo is
+     * converted into the first packed pair and \p fhi into the second.
+     * Relies on \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x4_e3m2(const __nv_bfloat162 flo,
+                                                     const __nv_bfloat162 fhi) {
+        const __nv_bfloat162_raw raw_lo = static_cast<__nv_bfloat162_raw>(flo);
+        const __nv_bfloat162_raw raw_hi = static_cast<__nv_bfloat162_raw>(fhi);
+        const __nv_fp6x2_storage_t packed_lo =
+            __nv_cvt_bfloat16raw2_to_fp6x2(raw_lo, __NV_E3M2, cudaRoundNearest);
+        const __nv_fp6x2_storage_t packed_hi =
+            __nv_cvt_bfloat16raw2_to_fp6x2(raw_hi, __NV_E3M2, cudaRoundNearest);
+        __x = __internal_pack_u16x2_to_u32(packed_lo, packed_hi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Builds the four values from a \p float4: (x, y) become the first
+     * packed pair and (z, w) the second. Relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x4_e3m2(const float4 f) {
+        const float2 first_pair = {f.x, f.y};
+        const __nv_fp6x2_storage_t packed_lo =
+            __nv_cvt_float2_to_fp6x2(first_pair, __NV_E3M2, cudaRoundNearest);
+        const float2 second_pair = {f.z, f.w};
+        const __nv_fp6x2_storage_t packed_hi =
+            __nv_cvt_float2_to_fp6x2(second_pair, __NV_E3M2, cudaRoundNearest);
+        __x = __internal_pack_u16x2_to_u32(packed_lo, packed_hi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Builds the four values from a \p double4: (x, y) become the first
+     * packed pair and (z, w) the second. Relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x4_e3m2(const double4 f) {
+        const double2 first_pair = {f.x, f.y};
+        const __nv_fp6x2_storage_t packed_lo =
+            __nv_cvt_double2_to_fp6x2(first_pair, __NV_E3M2, cudaRoundNearest);
+        const double2 second_pair = {f.z, f.w};
+        const __nv_fp6x2_storage_t packed_hi =
+            __nv_cvt_double2_to_fp6x2(second_pair, __NV_E3M2, cudaRoundNearest);
+        __x = __internal_pack_u16x2_to_u32(packed_lo, packed_hi);
+    }
+
+#if !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__)
+    /* Widening conversion operators */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Widens all four elements to a \p float4. The low and high 16-bit
+     * halves of \p __x are each decoded as a pair; the low half supplies
+     * (x, y) and the high half supplies (z, w).
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator float4() const {
+        const __nv_fp6x2_storage_t lo_bits =
+            static_cast<__nv_fp6x2_storage_t>(__x);
+        const __nv_fp6x2_storage_t hi_bits =
+            static_cast<__nv_fp6x2_storage_t>(__x >> 16U);
+        const __half2_raw lo_half = __nv_cvt_fp6x2_to_halfraw2(lo_bits, __NV_E3M2);
+        const __half2_raw hi_half = __nv_cvt_fp6x2_to_halfraw2(hi_bits, __NV_E3M2);
+        const float2 lo_pair = __internal_halfraw2_to_float2(lo_half);
+        const float2 hi_pair = __internal_halfraw2_to_float2(hi_half);
+        float4 res = {lo_pair.x, lo_pair.y, hi_pair.x, hi_pair.y};
+        return res;
+    }
+#endif /* !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP6_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP6_E2M3_STRUCT C++ struct for handling fp6 data type of e2m3 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP6
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP6_E2M3_STRUCT
+ * \brief __nv_fp6_e2m3 datatype
+ *
+ * \details This structure implements the datatype for storing
+ * \p fp6 floating-point numbers of \p e2m3 kind:
+ * with 1 sign, 2 exponent, 1 implicit and 3 explicit mantissa bits.
+ * This encoding does not support Inf/NaN.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(1) __nv_fp6_e2m3 { // single e2m3 fp6 value with its converting constructors/operators
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP6_E2M3_STRUCT
+     * Storage variable contains the \p fp6 floating-point data.
+     * Every converting constructor below assigns it and every conversion
+     * operator reads it.
+     */
+    __nv_fp6_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor by default.
+     * The fallback used when \p __CPP_VERSION_AT_LEAST_11_FP6 is not defined
+     * has an empty body and does not assign \p __x.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP6)
+    __nv_fp6_e2m3() = default;
+#else
+    __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e2m3() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP6) */
+
+#if !defined(__CUDA_NO_FP6_CONVERSIONS__)
+
+    /* Construct from wider FP types */
+    /* Note we do avoid constructor init-list because of special host/device
+     * compilation rules */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p __half data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values and \p cudaRoundNearest rounding mode.
+     * Forwards the raw half bits to \p __nv_cvt_halfraw_to_fp6 with the
+     * \p __NV_E2M3 interpretation.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e2m3(const __half f) {
+        __x = __nv_cvt_halfraw_to_fp6(static_cast<__half_raw>(f),
+                                      __NV_E2M3, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p __nv_bfloat16 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values and \p cudaRoundNearest rounding mode.
+     * Forwards the raw bfloat16 bits to \p __nv_cvt_bfloat16raw_to_fp6 with
+     * the \p __NV_E2M3 interpretation.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e2m3(const __nv_bfloat16 f) {
+        __x = __nv_cvt_bfloat16raw_to_fp6(static_cast<__nv_bfloat16_raw>(f),
+                                          __NV_E2M3, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p float data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values and \p cudaRoundNearest rounding mode.
+     * All integral constructors below delegate to this one.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e2m3(const float f) {
+        __x = __nv_cvt_float_to_fp6(f, __NV_E2M3, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p double data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values and \p cudaRoundNearest rounding mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e2m3(const double f) {
+        __x = __nv_cvt_double_to_fp6(f, __NV_E2M3, cudaRoundNearest);
+    }
+
+    /* Converts from integral */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p unsigned \p short \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     * Converts \p val to \p float and reuses the \p float constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__
+    __nv_fp6_e2m3(const unsigned short int val) {
+        __x = static_cast<__nv_fp6_e2m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p unsigned \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     * Converts \p val to \p float and reuses the \p float constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e2m3(const unsigned int val) {
+        __x = static_cast<__nv_fp6_e2m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p unsigned \p long \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     * Converts \p val to \p float and reuses the \p float constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e2m3(const unsigned long int val) {
+        __x = static_cast<__nv_fp6_e2m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p unsigned \p long \p long \p int data type, relies on
+     * \p __NV_SATFINITE behavior for out-of-range values.
+     * Converts \p val to \p float and reuses the \p float constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__
+    __nv_fp6_e2m3(const unsigned long long int val) {
+        __x = static_cast<__nv_fp6_e2m3>(static_cast<float>(val)).__x;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p short \p int data type.
+     * Converts \p val to \p float and reuses the \p float constructor, so
+     * out-of-range values are handled exactly as they are for \p float input.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e2m3(const short int val) {
+        __x = static_cast<__nv_fp6_e2m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p int data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     * Converts \p val to \p float and reuses the \p float constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e2m3(const int val) {
+        __x = static_cast<__nv_fp6_e2m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p long \p int data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     * Converts \p val to \p float and reuses the \p float constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e2m3(const long int val) {
+        __x = static_cast<__nv_fp6_e2m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p long \p long \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     * Converts \p val to \p float and reuses the \p float constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e2m3(const long long int val) {
+        __x = static_cast<__nv_fp6_e2m3>(static_cast<float>(val)).__x;
+    }
+
+#if !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__)
+    /* Widening FP converts */
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p __half data type.
+     * Decodes \p __x with \p __nv_cvt_fp6_to_halfraw using the \p __NV_E2M3
+     * interpretation. The integral operators below build on this one.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator __half() const {
+        return static_cast<__half>(__nv_cvt_fp6_to_halfraw(__x, __NV_E2M3));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p float data type.
+     * Decodes to the raw half representation first, then widens that with
+     * \p __internal_halfraw_to_float.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator float() const {
+        return __internal_halfraw_to_float(
+            __nv_cvt_fp6_to_halfraw(__x, __NV_E2M3));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p __nv_bfloat16 data type.
+     * Goes through the \p float operator and \p __float2bfloat16_rz.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator __nv_bfloat16() const {
+        return __float2bfloat16_rz(float(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p double data type.
+     * Goes through the \p float operator and widens the result.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator double() const {
+        return static_cast<double>(float(*this));
+    }
+
+    /* Convert to integral */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p unsigned \p char data type.
+     * Clamps negative inputs to zero.
+     * Non-negative inputs are converted with a plain \p static_cast from the
+     * \p float value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator unsigned char() const {
+        unsigned char i;
+        const float f = float(*this);
+
+        if (f < 0.0f) {
+            // saturate minimum: negative values map to 0
+            i = 0U;
+        } else {
+            // normal value: plain float-to-integer cast
+            i = static_cast<unsigned char>(f);
+        }
+        return i;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p unsigned \p short \p int data type.
+     * Clamps negative inputs to zero.
+     * Delegates to \p __half2ushort_rz on the \p __half form of this value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator unsigned short int() const {
+        return __half2ushort_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p unsigned \p int data type.
+     * Clamps negative inputs to zero.
+     * Delegates to \p __half2uint_rz on the \p __half form of this value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator unsigned int() const {
+        return __half2uint_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p unsigned \p long \p int data type.
+     * Clamps negative inputs to zero.
+     * Uses the 64-bit helper when \p unsigned \p long has the same size as
+     * \p unsigned \p long \p long, and the 32-bit helper otherwise. The size
+     * comparison is a constant expression, which is why the MSVC C4127
+     * warning is suppressed around it.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator unsigned long int() const {
+        unsigned long retval;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(unsigned long) == sizeof(unsigned long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            retval = static_cast<unsigned long>(__half2ull_rz(__half(*this)));
+        }
+        else
+        {
+            retval = static_cast<unsigned long>(__half2uint_rz(__half(*this)));
+        }
+        return retval;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p unsigned \p long \p long \p int data type.
+     * Clamps negative inputs to zero.
+     * Delegates to \p __half2ull_rz on the \p __half form of this value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator unsigned long long int() const {
+        return __half2ull_rz(__half(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p signed \p char data type.
+     * Plain \p static_cast of the \p float value; no explicit clamping is
+     * performed in this operator.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator signed char() const {
+        const float f = float(*this);
+        return static_cast<signed char>(f);
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to an implementation defined \p char data type.
+     *
+     * Detects signedness of the \p char type and proceeds accordingly, see
+     * further details in signed and unsigned char operators.
+     *
+     * Clamps inputs to the output range.
+     *
+     * The signedness test is a constant expression, which is why the MSVC
+     * C4127 warning is suppressed around it.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator char() const {
+        char value;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (((char)-1) < (char)0) // true only when plain char is a signed type
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            value = static_cast<char>(static_cast<signed char>(*this));
+        }
+        else
+        {
+            value = static_cast<char>(static_cast<unsigned char>(*this));
+        }
+        return value;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p short \p int data type.
+     * Delegates to \p __half2short_rz on the \p __half form of this value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator short int() const {
+        return __half2short_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p int data type.
+     * Delegates to \p __half2int_rz on the \p __half form of this value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator int() const {
+        return __half2int_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p long \p int data type.
+     * Uses the 64-bit helper when \p long has the same size as \p long
+     * \p long, and the 32-bit helper otherwise. The size comparison is a
+     * constant expression, which is why the MSVC C4127 warning is suppressed
+     * around it.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator long int() const {
+        long retval;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(long) == sizeof(long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            retval = static_cast<long>(__half2ll_rz(__half(*this)));
+        }
+        else
+        {
+            retval = static_cast<long>(__half2int_rz(__half(*this)));
+        }
+        return retval;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p long \p long \p int data type.
+     * Delegates to \p __half2ll_rz on the \p __half form of this value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator long long int() const {
+        return __half2ll_rz(__half(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p bool data type.
+     * +0 and -0 inputs convert to \p false.
+     * Non-zero inputs convert to \p true.
+     * Only the five low storage bits are tested (the 2 exponent and 3
+     * mantissa bits of this format), so the sign does not affect the result.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator bool() const {
+        return (__x & 0x1FU) != 0U; // 0x1F drops the sign bit and anything above it
+    }
+#endif /* !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP6_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP6X2_E2M3_STRUCT C++ struct for handling vector type of two fp6 values of e2m3 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP6
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP6X2_E2M3_STRUCT
+ * \brief __nv_fp6x2_e2m3 datatype
+ *
+ * \details This structure implements the datatype for handling two
+ * \p fp6 floating-point numbers of \p e2m3 kind each.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(2) __nv_fp6x2_e2m3 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP6X2_E2M3_STRUCT
+     * Storage variable contains the vector of two \p fp6 floating-point data
+     * values.
+     */
+    __nv_fp6x2_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor by default.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP6)
+    __nv_fp6x2_e2m3() = default;
+#else
+    __CUDA_HOSTDEVICE_FP6__ __nv_fp6x2_e2m3() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP6) */
+
+#if !defined(__CUDA_NO_FP6_CONVERSIONS__)
+
+    /* Construct from wider types */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p __half2 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x2_e2m3(const __half2 f) {
+        __x = __nv_cvt_halfraw2_to_fp6x2(static_cast<__half2_raw>(f),
+                                         __NV_E2M3, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p __nv_bfloat162 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x2_e2m3(const __nv_bfloat162 f) {
+        __x = __nv_cvt_bfloat16raw2_to_fp6x2(static_cast<__nv_bfloat162_raw>(f),
+                                             __NV_E2M3, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p float2 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x2_e2m3(const float2 f) {
+        __x = __nv_cvt_float2_to_fp6x2(f, __NV_E2M3, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p double2 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x2_e2m3(const double2 f) {
+        __x = __nv_cvt_double2_to_fp6x2(f, __NV_E2M3, cudaRoundNearest);
+    }
+
+#if !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__)
+    /* Widening converts */
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p __half2 data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator __half2() const {
+        return static_cast<__half2>(__nv_cvt_fp6x2_to_halfraw2(__x, __NV_E2M3));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p float2 data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator float2() const {
+        return __internal_halfraw2_to_float2(
+            __nv_cvt_fp6x2_to_halfraw2(__x, __NV_E2M3));
+    }
+#endif /* !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP6_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP6X4_E2M3_STRUCT C++ struct for handling vector type of four fp6 values of e2m3 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP6
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP6X4_E2M3_STRUCT
+ * \brief __nv_fp6x4_e2m3 datatype
+ *
+ * \details This structure implements the datatype for handling four
+ * \p fp6 floating-point numbers of \p e2m3 kind each.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(4) __nv_fp6x4_e2m3 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP6X4_E2M3_STRUCT
+     * Storage variable contains the vector of four \p fp6 floating-point data
+     * values.
+     */
+    __nv_fp6x4_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor by default.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP6)
+    __nv_fp6x4_e2m3() = default;
+#else
+    __CUDA_HOSTDEVICE_FP6__ __nv_fp6x4_e2m3() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP6) */
+
+#if !defined(__CUDA_NO_FP6_CONVERSIONS__)
+
+    /* Construct from wider types */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from a pair of \p __half2 data type values,
+     * relies on \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x4_e2m3(const __half2 flo,
+                                                     const __half2 fhi) {
+        const __nv_fp6x2_storage_t rlo = __nv_cvt_halfraw2_to_fp6x2(
+            static_cast<__half2_raw>(flo), __NV_E2M3, cudaRoundNearest);
+        const __nv_fp6x2_storage_t rhi = __nv_cvt_halfraw2_to_fp6x2(
+            static_cast<__half2_raw>(fhi), __NV_E2M3, cudaRoundNearest);
+        __x = __internal_pack_u16x2_to_u32(rlo, rhi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from a pair of \p __nv_bfloat162 data type values,
+     * relies on \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x4_e2m3(const __nv_bfloat162 flo,
+                                                     const __nv_bfloat162 fhi) {
+        const __nv_fp6x2_storage_t rlo = __nv_cvt_bfloat16raw2_to_fp6x2(
+            static_cast<__nv_bfloat162_raw>(flo), __NV_E2M3, cudaRoundNearest);
+        const __nv_fp6x2_storage_t rhi = __nv_cvt_bfloat16raw2_to_fp6x2(
+            static_cast<__nv_bfloat162_raw>(fhi), __NV_E2M3, cudaRoundNearest);
+        __x = __internal_pack_u16x2_to_u32(rlo, rhi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p float4 vector data type,
+     * relies on \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x4_e2m3(const float4 f) {
+        const float2 flo = {f.x, f.y};
+        const float2 fhi = {f.z, f.w};
+        const __nv_fp6x2_storage_t rlo =
+            __nv_cvt_float2_to_fp6x2(flo, __NV_E2M3, cudaRoundNearest);
+        const __nv_fp6x2_storage_t rhi =
+            __nv_cvt_float2_to_fp6x2(fhi, __NV_E2M3, cudaRoundNearest);
+        __x = __internal_pack_u16x2_to_u32(rlo, rhi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p double4 vector data type,
+     * relies on \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x4_e2m3(const double4 f) {
+        const double2 flo = {f.x, f.y};
+        const double2 fhi = {f.z, f.w};
+        const __nv_fp6x2_storage_t rlo =
+            __nv_cvt_double2_to_fp6x2(flo, __NV_E2M3, cudaRoundNearest);
+        const __nv_fp6x2_storage_t rhi =
+            __nv_cvt_double2_to_fp6x2(fhi, __NV_E2M3, cudaRoundNearest);
+        __x = __internal_pack_u16x2_to_u32(rlo, rhi);
+    }
+
+#if !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__)
+    /* Widening converts */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p float4 vector data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator float4() const {
+        const __nv_fp6x2_storage_t slo = static_cast<__nv_fp6x2_storage_t>(__x);
+        const __nv_fp6x2_storage_t shi =
+            static_cast<__nv_fp6x2_storage_t>(__x >> 16U);
+        float2 rlo = __internal_halfraw2_to_float2(
+            __nv_cvt_fp6x2_to_halfraw2(slo, __NV_E2M3));
+        float2 rhi = __internal_halfraw2_to_float2(
+            __nv_cvt_fp6x2_to_halfraw2(shi, __NV_E2M3));
+        float4 res = {rlo.x, rlo.y, rhi.x, rhi.y};
+        return res;
+    }
+#endif /* !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP6_CONVERSIONS__) */
+};
+
+#endif /* defined(__cplusplus) */
+
+#endif /* end of include guard: __CUDA_FP6_HPP__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp8.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp8.h
new file mode 100644
index 0000000000000000000000000000000000000000..6761a4a353140f1b96ae0b0f7f7c439be3aeedae
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp8.h
@@ -0,0 +1,475 @@
+/*
+ * Copyright 2022-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef __CUDA_FP8_H__
+#define __CUDA_FP8_H__
+
+/* Set up function decorations */
+#if defined(__CUDACC__)
+#define __CUDA_FP8_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_FP8__ __host__ __device__
+#define __CUDA_HOSTDEVICE_FP8_DECL__ static __host__ __device__ __inline__
+#else /* !defined(__CUDACC__) */
+#if defined(__GNUC__)
+#define __CUDA_HOSTDEVICE_FP8_DECL__ static __attribute__((unused))
+#else
+#define __CUDA_HOSTDEVICE_FP8_DECL__ static
+#endif /* defined(__GNUC__) */
+#define __CUDA_HOSTDEVICE_FP8__
+#endif /* defined(__CUDACC__) */
+
+#if !defined(_MSC_VER) && __cplusplus >= 201103L
+#define __CPP_VERSION_AT_LEAST_11_FP8
+#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
+#define __CPP_VERSION_AT_LEAST_11_FP8
+#endif
+
+// implicitly provided by NVRTC
+#if !defined(__CUDACC_RTC__)
+/* bring in enum cudaRoundMode */
+#include "device_types.h"
+#endif  /* !defined(__CUDACC_RTC__) */
+
+/* bring in __half_raw data type */
+#include "cuda_fp16.h"
+/* bring in __nv_bfloat16_raw data type */
+#include "cuda_bf16.h"
+
+// implicitly provided by NVRTC
+#if !defined(__CUDACC_RTC__)
+/* bring in float2, double4, etc vector types */
+#include "vector_types.h"
+#endif  /* !defined(__CUDACC_RTC__) */
+
+/**
+ * \defgroup CUDA_MATH_INTRINSIC_FP8 FP8 Intrinsics
+ * This section describes fp8 intrinsic functions.
+ * To use these functions, include the header file \p cuda_fp8.h in your
+ * program.
+ * The following macros are available to help users selectively enable/disable
+ * various definitions present in the header file:
+ * - \p __CUDA_NO_FP8_CONVERSIONS__ - If defined, this macro will prevent any
+ * use of the C++ type conversions (converting constructors and conversion
+ * operators) defined in the header.
+ * - \p __CUDA_NO_FP8_CONVERSION_OPERATORS__ - If defined, this macro will
+ * prevent any use of the C++ conversion operators from \p fp8 to other types.
+ */
+
+/**
+ * \defgroup CUDA_MATH_FP8_MISC FP8 Conversion and Data Movement
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ * To use these functions, include the header file \p cuda_fp8.h in your
+ * program.
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief 8-bit \p unsigned \p integer
+ * type abstraction used for \p fp8 floating-point
+ * numbers storage.
+ */
+typedef unsigned char __nv_fp8_storage_t;
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief 16-bit \p unsigned \p integer
+ * type abstraction used for storage of pairs of
+ * \p fp8 floating-point numbers.
+ */
+typedef unsigned short int __nv_fp8x2_storage_t;
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief 32-bit \p unsigned \p integer
+ * type abstraction used for storage of tetrads of
+ * \p fp8 floating-point numbers.
+ */
+typedef unsigned int __nv_fp8x4_storage_t;
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Enumerates the modes applicable when
+ * performing a narrowing conversion to \p fp8 destination types.
+ */
+typedef enum __nv_saturation_t {
+    /**
+     * Means no saturation to finite is performed when conversion
+     * results in rounding values outside the range of destination
+     * type.
+     * NOTE: for fp8 type of e4m3 kind, the results that are larger
+     * than the maximum representable finite number of the target
+     * format become NaN.
+     */
+    __NV_NOSAT,
+    /**
+     * Means inputs larger than the maximum representable
+     * finite number MAXNORM of the target format round to the
+     * MAXNORM of the same sign as input.
+     */
+    __NV_SATFINITE,
+} __nv_saturation_t;
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Enumerates the possible
+ * interpretations of the 8-bit values when referring to them as
+ * \p fp8 types.
+ */
+typedef enum __nv_fp8_interpretation_t {
+    __NV_E4M3, /**< Stands for \p fp8 numbers of \p e4m3 kind. */
+    __NV_E5M2, /**< Stands for \p fp8 numbers of \p e5m2 kind. */
+} __nv_fp8_interpretation_t;
+
+/* Forward-declaration of C-style APIs */
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input \p double precision \p x to \p fp8 type of the
+ * requested kind using round-to-nearest-even rounding and requested saturation
+ * mode.
+ *
+ * \details Converts input \p x to \p fp8 type of the kind specified by
+ * \p fp8_interpretation parameter,
+ * using round-to-nearest-even rounding and
+ * saturation mode specified by \p saturate parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
+__nv_cvt_double_to_fp8(const double x, const __nv_saturation_t saturate,
+                       const __nv_fp8_interpretation_t fp8_interpretation);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input vector of two \p double precision numbers packed
+ * in \p double2 \p x into a vector of two values of \p fp8 type of
+ * the requested kind using round-to-nearest-even rounding and requested
+ * saturation mode.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
+ * kind specified by \p fp8_interpretation parameter, using
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
+ * parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
+__nv_cvt_double2_to_fp8x2(const double2 x, const __nv_saturation_t saturate,
+                          const __nv_fp8_interpretation_t fp8_interpretation);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input \p single precision \p x to \p fp8 type of the
+ * requested kind using round-to-nearest-even rounding and requested saturation
+ * mode.
+ *
+ * \details Converts input \p x to \p fp8 type of the kind specified by
+ * \p fp8_interpretation parameter,
+ * using round-to-nearest-even rounding and
+ * saturation mode specified by \p saturate parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
+__nv_cvt_float_to_fp8(const float x, const __nv_saturation_t saturate,
+                      const __nv_fp8_interpretation_t fp8_interpretation);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input vector of two \p single precision numbers packed
+ * in \p float2 \p x into a vector of two values of \p fp8 type of
+ * the requested kind using round-to-nearest-even rounding and requested
+ * saturation mode.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
+ * kind specified by \p fp8_interpretation parameter, using
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
+ * parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
+__nv_cvt_float2_to_fp8x2(const float2 x, const __nv_saturation_t saturate,
+                         const __nv_fp8_interpretation_t fp8_interpretation);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input \p half precision \p x to \p fp8 type of the requested
+ * kind using round-to-nearest-even rounding and requested saturation mode.
+ *
+ * \details Converts input \p x to \p fp8 type of the kind specified by
+ * \p fp8_interpretation parameter,
+ * using round-to-nearest-even rounding and
+ * saturation mode specified by \p saturate parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
+__nv_cvt_halfraw_to_fp8(const __half_raw x, const __nv_saturation_t saturate,
+                        const __nv_fp8_interpretation_t fp8_interpretation);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input vector of two \p half precision numbers packed
+ * in \p __half2_raw \p x into a vector of two values of \p fp8 type of
+ * the requested kind using round-to-nearest-even rounding and requested
+ * saturation mode.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
+ * kind specified by \p fp8_interpretation parameter, using
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
+ * parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_halfraw2_to_fp8x2(
+    const __half2_raw x, const __nv_saturation_t saturate,
+    const __nv_fp8_interpretation_t fp8_interpretation);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input \p nv_bfloat16 precision \p x to \p fp8 type of the
+ * requested kind using round-to-nearest-even rounding and requested saturation
+ * mode.
+ *
+ * \details Converts input \p x to \p fp8 type of the kind specified by
+ * \p fp8_interpretation parameter,
+ * using round-to-nearest-even rounding and
+ * saturation mode specified by \p saturate parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_bfloat16raw_to_fp8(
+    const __nv_bfloat16_raw x, const __nv_saturation_t saturate,
+    const __nv_fp8_interpretation_t fp8_interpretation);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input vector of two \p nv_bfloat16 precision numbers packed
+ * in \p __nv_bfloat162_raw \p x into a vector of two values of \p fp8 type of
+ * the requested kind using round-to-nearest-even rounding and requested
+ * saturation mode.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
+ * kind specified by \p fp8_interpretation parameter, using
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
+ * parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
+__nv_cvt_bfloat16raw2_to_fp8x2(
+    const __nv_bfloat162_raw x, const __nv_saturation_t saturate,
+    const __nv_fp8_interpretation_t fp8_interpretation);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input \p fp8 \p x of the specified kind
+ * to \p half precision.
+ *
+ * \details Converts input \p x of \p fp8 type of the kind specified by
+ * \p fp8_interpretation parameter
+ * to \p half precision.
+ *
+ * \returns
+ * - The \p __half_raw value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __half_raw
+__nv_cvt_fp8_to_halfraw(const __nv_fp8_storage_t x,
+                        const __nv_fp8_interpretation_t fp8_interpretation);
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input vector of two \p fp8 values of the specified kind
+ * to a vector of two \p half precision values packed in \p __half2_raw
+ * structure.
+ *
+ * \details Converts input vector \p x of \p fp8 type of the kind specified by
+ * \p fp8_interpretation parameter
+ * to a vector of two \p half precision values and returns as \p __half2_raw
+ * structure.
+ *
+ * \returns
+ * - The \p __half2_raw value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw
+__nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x,
+                           const __nv_fp8_interpretation_t fp8_interpretation);
+
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input \p bfloat16 value into a scaling factor of \p e8m0 kind.
+ *
+ * \details Input number's absolute value is rounded to the closest power of two in the
+ * direction specified via \p rounding parameter. Rounded results that are
+ * smaller than the smallest representable target format number 2^-127 are then
+ * clipped to 2^-127. Results that are larger than the largest representable
+ * target format number 2^127 are either clipped to 2^127 if \p saturate is equal
+ * to \p __NV_SATFINITE, or convert to \p NaN otherwise. \p NaN inputs convert
+ * into \p NaN output, encoded as \p 0xFF in the target format.
+ *
+ * \returns
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_bfloat16raw_to_e8m0(const __nv_bfloat16_raw x, const __nv_saturation_t saturate, const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts a pair of \p bfloat16 values into a pair of scaling factors of \p e8m0 kind.
+ *
+ * \see __nv_cvt_bfloat16raw_to_e8m0() for details of conversion.
+ *
+ * \returns
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_bfloat162raw_to_e8m0x2(const __nv_bfloat162_raw x, const __nv_saturation_t saturate, const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input \p float value into a scaling factor of \p e8m0 kind.
+ *
+ * \see __nv_cvt_bfloat16raw_to_e8m0() for details of conversion.
+ *
+ * \returns
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_float_to_e8m0(const float x, const __nv_saturation_t saturate, const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts a pair of \p float values into a pair of scaling factors of \p e8m0 kind.
+ *
+ * \see __nv_cvt_bfloat16raw_to_e8m0() for details of conversion.
+ *
+ * \returns
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_float2_to_e8m0x2(const float2 x, const __nv_saturation_t saturate, const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input \p double value into a scaling factor of \p e8m0 kind.
+ *
+ * \see __nv_cvt_bfloat16raw_to_e8m0() for details of conversion.
+ *
+ * \returns
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_double_to_e8m0(const double x, const __nv_saturation_t saturate, const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts a pair of \p double values into a pair of scaling factors of \p e8m0 kind.
+ *
+ * \see __nv_cvt_bfloat16raw_to_e8m0() for details of conversion.
+ *
+ * \returns
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_double2_to_e8m0x2(const double2 x, const __nv_saturation_t saturate, const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input scaling factor value of \p e8m0 kind into \p bfloat16.
+ *
+ * \details Input scales are exact powers of two or a \p NaN value,
+ * also representable in the target format.
+ *
+ * \returns
+ * - The \p __nv_bfloat16_raw value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_bfloat16_raw __nv_cvt_e8m0_to_bf16raw(const __nv_fp8_storage_t x);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input pair of scaling factors of \p e8m0 kind into a pair of \p bfloat16 values.
+ *
+ * \returns
+ * - The \p __nv_bfloat162_raw value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_bfloat162_raw __nv_cvt_e8m0x2_to_bf162raw(const __nv_fp8x2_storage_t x);
+
+#if defined(__cplusplus)
+
+#define __CUDA_FP8_TYPES_EXIST__
+
+/* Forward-declaration of structures defined in "cuda_fp8.hpp" */
+struct __nv_fp8_e5m2;
+struct __nv_fp8x2_e5m2;
+struct __nv_fp8x4_e5m2;
+
+struct __nv_fp8_e4m3;
+struct __nv_fp8x2_e4m3;
+struct __nv_fp8x4_e4m3;
+
+struct __nv_fp8_e8m0;
+struct __nv_fp8x2_e8m0;
+struct __nv_fp8x4_e8m0;
+
+#endif /* defined(__cplusplus) */
+
+#include "cuda_fp8.hpp"
+
+#undef __CUDA_FP8_DECL__
+#undef __CUDA_HOSTDEVICE_FP8__
+#undef __CUDA_HOSTDEVICE_FP8_DECL__
+
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+#undef __CPP_VERSION_AT_LEAST_11_FP8
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#endif /* end of include guard: __CUDA_FP8_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp8.hpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp8.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..05d673704c5309d45361855e8425ab18566d38c5
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp8.hpp
@@ -0,0 +1,2728 @@
+/*
+ * Copyright 2022-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_FP8_HPP__)
+#define __CUDA_FP8_HPP__
+
+#if !defined(__CUDA_FP8_H__)
+#error "Do not include this file directly. Instead, include cuda_fp8.h."
+#endif
+
+/* C++ header for std::memcpy (used for type punning in host-side
+ * implementations). When compiling as a CUDA source file memcpy is provided
+ * implicitly. !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
+ */
+#if defined(__cplusplus) && !defined(__CUDACC__)
+#include <cstring>
+#elif !defined(__cplusplus) && !defined(__CUDACC__)
+#include <string.h>
+#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
+
+// implicitly provided by NVRTC
+#if !defined(__CUDACC_RTC__)
+#include <nv/target>
+#endif  /* !defined(__CUDACC_RTC__) */
+
+/*
+ * IF_DEVICE_OR_CUDACC(d, c, f) forwards to NV_IF_ELSE_TARGET(NV_IS_DEVICE, ...)
+ * with 'd' as the first alternative. The second alternative depends on how
+ * this header is being compiled:
+ *   - 'c' when __CUDACC__ is defined ('f' is then discarded);
+ *   - 'f' when __CUDACC__ is not defined ('c' is then discarded).
+ * The macro is only defined here when no earlier header already defined it.
+ */
+#if !defined(IF_DEVICE_OR_CUDACC)
+#if defined(__CUDACC__)
+    #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, c)
+#else
+    #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, f)
+#endif
+#endif
+
+/*
+ * Bring in the standard assertions header to enforce the subset
+ * of rounding modes supported by the APIs defined here.
+ * NOTE: NVRTC defines its own assert
+ */
+#if !defined (__CUDACC_RTC__)
+#include <assert.h>
+#endif
+
+/* Set up structure-alignment attribute.
+ * __CUDA_ALIGN__(n) is only defined here when no earlier header defined it:
+ *   - with __CUDACC__:           __align__(n)
+ *   - __cplusplus >= 201103L:    alignas(n)
+ *   - otherwise, by compiler:    GCC-style attribute, MSVC __declspec, or
+ *                                nothing at all for unknown compilers.
+ * NOTE(review): the C++11 test below looks at __cplusplus directly, while the
+ * rest of this file keys C++11 features off __CPP_VERSION_AT_LEAST_11_FP8.
+ * The two can disagree on compilers that do not report the real language
+ * level in __cplusplus - confirm this is intended. */
+#if !(defined __CUDA_ALIGN__)
+#if defined(__CUDACC__)
+#define __CUDA_ALIGN__(align) __align__(align)
+#else
+/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas"
+ * is available) */
+#if __cplusplus >= 201103L
+#define __CUDA_ALIGN__(n)                                                      \
+    alignas(n) /* C++11 kindly gives us a keyword for this */
+#else          /* !(__cplusplus >= 201103L) */
+#if defined(__GNUC__)
+#define __CUDA_ALIGN__(n) __attribute__((aligned(n)))
+#elif defined(_MSC_VER)
+#define __CUDA_ALIGN__(n) __declspec(align(n))
+#else
+#define __CUDA_ALIGN__(n)
+#endif /* defined(__GNUC__) */
+#endif /* __cplusplus >= 201103L */
+#endif /* defined(__CUDACC__) */
+#endif /* !(defined __CUDA_ALIGN__) */
+
+#if !(defined __CPP_VERSION_AT_LEAST_11_FP8)
+/* need c++11 for explicit operators */
+#define __CUDA_NO_FP8_CONVERSION_OPERATORS__
+#endif
+
+#if !(defined __DOXYGEN_ONLY__)
+
+/*
+ * Converts a double to an fp8 byte using only integer operations on the
+ * bit pattern of the input (no conversion instruction is involved).
+ *
+ *   x                  - value to convert.
+ *   saturate           - with __NV_SATFINITE an overflowing finite or infinite
+ *                        input becomes the largest finite magnitude; otherwise
+ *                        it becomes NaN (e4m3) or Inf (e5m2).
+ *   fp8_interpretation - __NV_E4M3 selects the e4m3 parameters; every other
+ *                        value is handled with the e5m2 parameters.
+ *
+ * Rounding is round-to-nearest-even. Magnitudes not larger than half of the
+ * smallest denormal become zero. The sign bit of the input is always copied
+ * into bit 7 of the result, including for zero and NaN results.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
+__nv_cvt_double_to_fp8(const double x, const __nv_saturation_t saturate,
+                       const __nv_fp8_interpretation_t fp8_interpretation) {
+    unsigned char res;
+    unsigned long long int xbits;
+
+    // reinterpret the double as a 64-bit integer
+#if defined(__CUDACC__) || (!defined __cplusplus)
+    (void)memcpy(&xbits, &x, sizeof(x));
+#else
+    (void)std::memcpy(&xbits, &x, sizeof(x));
+#endif
+    // Format parameters, filled in below for the selected interpretation.
+    // The three thresholds are double-precision bit patterns so that they can
+    // be compared directly against absx.
+    unsigned char FP8_MAXNORM;
+    unsigned char FP8_MANTISSA_MASK;
+    unsigned short int FP8_EXP_BIAS;
+    unsigned long long int FP8_SIGNIFICAND_BITS;
+    const unsigned long long int DP_INF_BITS = 0x7FF0000000000000ULL;
+    unsigned long long int FP8_MINDENORM_O2;
+    unsigned long long int FP8_OVERFLOW_THRESHOLD;
+    unsigned long long int FP8_MINNORM;
+
+    if (fp8_interpretation == __NV_E4M3) {
+        FP8_EXP_BIAS = 7U;
+        FP8_SIGNIFICAND_BITS = 4ULL;
+        FP8_MANTISSA_MASK = 0x7U;
+        FP8_MINDENORM_O2 = 0x3F50000000000000ULL; // mindenorm/2 = 2^-10
+        FP8_OVERFLOW_THRESHOLD =
+            0x407D000000000000ULL; // maxnorm + 1/2ulp = 0x1.Cp+8 + 0x1p+4
+        FP8_MAXNORM = 0x7EU;
+        FP8_MINNORM = 0x3F90000000000000ULL; // minnorm = 2^-6
+    } else {                                 //__NV_E5M2
+        FP8_EXP_BIAS = 15U;
+        FP8_SIGNIFICAND_BITS = 3ULL;
+        FP8_MANTISSA_MASK = 0x3U;
+        FP8_MINDENORM_O2 = 0x3EE0000000000000ULL; // mindenorm/2 = 2^-17
+        // The overflow test below is 'absx > threshold'. Subtracting one makes
+        // the exact midpoint itself count as overflow for e5m2 (its maxnorm
+        // encoding 0x7B has an odd mantissa, so a tie does not round to it).
+        FP8_OVERFLOW_THRESHOLD =
+            0x40EE000000000000ULL -
+            1ULL; // maxnorm + 1/2ulp = 0x1.Ep+15, and -1 to have common code
+        FP8_MAXNORM = 0x7BU;
+        FP8_MINNORM = 0x3F10000000000000ULL; // minnorm = 2^-14
+    }
+
+    // 1/2 LSB of the target format, positioned in double precision mantissa
+    // helpful in midpoints detection during round-to-nearest-even step
+    const unsigned long long int FP8_DP_HALF_ULP =
+        (unsigned long long int)1ULL << (53ULL - FP8_SIGNIFICAND_BITS - 1ULL);
+    // prepare sign bit in target format
+    unsigned char sign = (unsigned char)((xbits >> 63ULL) << 7U);
+    // prepare exponent field in target format
+    // (re-biased from 1023 to the fp8 bias; only meaningful in the normal and
+    // denormal branches below, where it is actually used)
+    unsigned char exp =
+        (unsigned char)((((unsigned short int)(xbits >> 52ULL)) & 0x7FFU) -
+                        1023U + FP8_EXP_BIAS);
+    // round mantissa to target format width, rounding towards zero
+    unsigned char mantissa =
+        (unsigned char)(xbits >> (53ULL - FP8_SIGNIFICAND_BITS)) &
+        FP8_MANTISSA_MASK;
+    // magnitude of the input as a bit pattern (sign bit cleared)
+    unsigned long long int absx = xbits & 0x7FFFFFFFFFFFFFFFULL;
+
+    if (absx <= FP8_MINDENORM_O2) {
+        // zero or underflow
+        res = 0U;
+    } else if (absx > DP_INF_BITS) {
+        // NaN
+        if (fp8_interpretation == __NV_E4M3) {
+            res = 0x7FU;
+        } else {
+            // NaN --> QNaN
+            res = 0x7EU | mantissa;
+        }
+    } else if (absx > FP8_OVERFLOW_THRESHOLD) {
+        // too large for the target format (this branch also takes +/-Inf)
+        if (saturate == __NV_SATFINITE) {
+            res = FP8_MAXNORM;
+        } else {
+            // __NV_NOSAT
+            if (fp8_interpretation == __NV_E4M3) {
+                // no Inf in E4M3
+                res = 0x7FU; // NaN
+            } else {
+                res = 0x7CU; // Inf in E5M2
+            }
+        }
+    } else if (absx >= FP8_MINNORM) {
+        // normal range: assemble the truncated result, then round
+        res = (unsigned char)((exp << (FP8_SIGNIFICAND_BITS - 1U)) | mantissa);
+        // rounded-off bits
+        unsigned long long int round =
+            xbits & ((FP8_DP_HALF_ULP << 1ULL) - 1ULL);
+        // round-to-nearest-even adjustment
+        // (the increment may carry from the mantissa into the exponent field)
+        if ((round > FP8_DP_HALF_ULP) ||
+            ((round == FP8_DP_HALF_ULP) && (mantissa & 1U))) {
+            res = (unsigned char)(res + 1U);
+        }
+    } else // Denormal range
+    {
+        // number of extra mantissa bits lost because the exponent is below
+        // the smallest normal exponent of the target format
+        unsigned char shift = (unsigned char)(1U - exp);
+        // add implicit leading bit
+        mantissa |= (unsigned char)(1U << (FP8_SIGNIFICAND_BITS - 1U));
+        // additional round-off due to denormalization
+        res = (unsigned char)(mantissa >> shift);
+
+        // rounded-off bits, including implicit leading bit
+        unsigned long long int round =
+            (xbits | ((unsigned long long int)1ULL << (53ULL - 1ULL))) &
+            ((FP8_DP_HALF_ULP << (shift + 1ULL)) - 1ULL);
+        // round-to-nearest-even adjustment
+        if ((round > (FP8_DP_HALF_ULP << shift)) ||
+            ((round == (FP8_DP_HALF_ULP << shift)) && (res & 1U))) {
+            res = (unsigned char)(res + 1U);
+        }
+    }
+
+    // the sign is applied last, to every kind of result
+    res |= sign;
+
+    return (__nv_fp8_storage_t)res;
+}
+
+/*
+ * Converts both members of a double2 to fp8 and packs the two bytes into one
+ * 16-bit value: the conversion of x.y occupies bits 15..8 and the conversion
+ * of x.x occupies bits 7..0. 'saturate' and 'fp8_interpretation' are passed
+ * unchanged to __nv_cvt_double_to_fp8 for each member.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
+__nv_cvt_double2_to_fp8x2(const double2 x, const __nv_saturation_t saturate,
+                          const __nv_fp8_interpretation_t fp8_interpretation) {
+    const __nv_fp8_storage_t hi =
+        __nv_cvt_double_to_fp8(x.y, saturate, fp8_interpretation);
+    const __nv_fp8_storage_t lo =
+        __nv_cvt_double_to_fp8(x.x, saturate, fp8_interpretation);
+
+    __nv_fp8x2_storage_t packed = (__nv_fp8x2_storage_t)hi;
+    packed = (__nv_fp8x2_storage_t)(packed << 8U);
+    packed = (__nv_fp8x2_storage_t)(packed | lo);
+    return packed;
+}
+
+/*
+ * Converts a float to an fp8 byte.
+ *
+ * Two implementations are selected between:
+ *   - When compiled for __CUDA_ARCH__ >= 890 and 'saturate' is
+ *     __NV_SATFINITE, a packed cvt.rn.satfinite PTX instruction is issued on
+ *     the pair (0.0f, x) and the 16-bit result is narrowed to one byte by the
+ *     final cast.
+ *   - In every other case any NaN input is first replaced by the pattern
+ *     0x7FFFFFFF, the value is widened to double, and the work is delegated
+ *     to __nv_cvt_double_to_fp8.
+ *
+ * Note the preprocessor structure: the 'else' at the end of the guarded
+ * region binds to the braces that follow the #endif, so outside of the
+ * guarded build those braces are an unconditional block.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
+__nv_cvt_float_to_fp8(const float x, const __nv_saturation_t saturate,
+                      const __nv_fp8_interpretation_t fp8_interpretation) {
+    __nv_fp8_storage_t res = 0U;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
+    if (saturate == __NV_SATFINITE) {
+        __nv_fp8x2_storage_t storage;
+        // operands are listed as "%2, %1": the constant 0.0f comes first and
+        // x second in the instruction
+        if (fp8_interpretation == __NV_E5M2) {
+            asm("{cvt.rn.satfinite.e5m2x2.f32 %0, %2, %1;}\n"
+                : "=h"(storage)
+                : "f"(x), "f"(0.0f));
+        } else {
+            asm("{cvt.rn.satfinite.e4m3x2.f32 %0, %2, %1;}\n"
+                : "=h"(storage)
+                : "f"(x), "f"(0.0f));
+        }
+        res = (__nv_fp8_storage_t)storage;
+    } else
+#endif
+    {
+        unsigned int xbits;
+#if defined(__CUDACC__) || (!defined __cplusplus)
+        (void)memcpy(&xbits, &x, sizeof(x));
+#else
+        (void)std::memcpy(&xbits, &x, sizeof(x));
+#endif
+
+        // isnan
+        if ((xbits & 0x7FFFFFFFU) > 0x7F800000U) {
+            // Canonical NaN
+            xbits = 0x7FFFFFFFU;
+        }
+
+        // turn the (possibly canonicalized) pattern back into a float
+        float fx;
+#if defined(__CUDACC__) || (!defined __cplusplus)
+        (void)memcpy(&fx, &xbits, sizeof(xbits));
+#else
+        (void)std::memcpy(&fx, &xbits, sizeof(xbits));
+#endif
+
+        const double dx = (double)fx;
+        res = __nv_cvt_double_to_fp8(dx, saturate, fp8_interpretation);
+    }
+    return res;
+}
+
+/*
+ * Converts both members of a float2 to fp8 and returns them packed in one
+ * 16-bit value.
+ *
+ *   - When compiled for __CUDA_ARCH__ >= 890 and 'saturate' is
+ *     __NV_SATFINITE, one packed cvt.rn.satfinite PTX instruction converts
+ *     the pair (x.y, x.x).
+ *   - Otherwise each member is converted by __nv_cvt_float_to_fp8; the result
+ *     for x.y is placed in bits 15..8 and the result for x.x in bits 7..0.
+ *
+ * The 'else' before the #endif binds to the braces after it, so in other
+ * builds those braces form an unconditional block.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
+__nv_cvt_float2_to_fp8x2(const float2 x, const __nv_saturation_t saturate,
+                         const __nv_fp8_interpretation_t fp8_interpretation) {
+    __nv_fp8x2_storage_t storage;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
+    if (saturate == __NV_SATFINITE) {
+        // "%2, %1": x.y is listed first and x.x second in the instruction
+        if (fp8_interpretation == __NV_E5M2) {
+            asm("{cvt.rn.satfinite.e5m2x2.f32 %0, %2, %1;}\n"
+                : "=h"(storage)
+                : "f"(x.x), "f"(x.y));
+        } else {
+            asm("{cvt.rn.satfinite.e4m3x2.f32 %0, %2, %1;}\n"
+                : "=h"(storage)
+                : "f"(x.x), "f"(x.y));
+        }
+    } else
+#endif
+    {
+        // start with x.y, move it to the upper byte, then merge x.x below it
+        storage = (__nv_fp8x2_storage_t)__nv_cvt_float_to_fp8(
+            x.y, saturate, fp8_interpretation);
+        storage = (__nv_fp8x2_storage_t)(storage << 8U);
+        storage = (__nv_fp8x2_storage_t)(storage | __nv_cvt_float_to_fp8(
+                                                       x.x, saturate,
+                                                       fp8_interpretation));
+    }
+    return storage;
+}
+
+/*
+ * Widens a half-precision bit pattern (x.x) to float.
+ *
+ * When compiled for __CUDA_ARCH__ >= 530 a single cvt.f32.f16 PTX instruction
+ * is used. Otherwise the float is assembled by hand from the sign, exponent
+ * and mantissa fields of the input:
+ *   - exponent field 0x1f (Inf/NaN): Inf keeps its sign; any NaN becomes the
+ *     positive pattern with all mantissa bits set;
+ *   - exponent field 0 with non-zero mantissa (denormal): the mantissa is
+ *     shifted left until its leading one is found, lowering the exponent by
+ *     one per shift;
+ *   - exponent field 0 with zero mantissa: signed zero;
+ *   - anything else: the exponent is re-biased by adding 0x70.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ float
+__internal_halfraw_to_float(const __half_raw x) {
+    float f;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    asm("{cvt.f32.f16 %0, %1;}\n" : "=f"(f) : "h"(x.x));
+#else
+    const unsigned int ux = (unsigned int)x.x;
+    unsigned int sign = (ux >> 15U) & 1U;
+    unsigned int exponent = (ux >> 10U) & 0x1fU;
+    // 10 mantissa bits moved to the top of the 23-bit float mantissa field
+    unsigned int mantissa = (ux & 0x3ffU) << 13U;
+    if (exponent == 0x1fU) { /* NaN or Inf */
+        /* discard sign of a NaN */
+        /* (sign is 0 or 1, so shifting it right by one always gives 0) */
+        sign = ((mantissa != 0U) ? (sign >> 1U) : sign);
+        mantissa = ((mantissa != 0U) ? 0x7fffffU : 0U);
+        exponent = 0xffU;
+    } else if (exponent == 0U) { /* Denorm or Zero */
+        if (mantissa != 0U) {
+            unsigned int msb;
+            exponent = 0x71U;
+            // the loop always shifts once more after seeing the leading one,
+            // which moves that bit out of the 23-bit field (masked below)
+            do {
+                msb = (mantissa & 0x400000U);
+                mantissa <<= 1U; /* normalize */
+                --exponent;
+            } while (msb == 0U);
+            mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
+        }
+    } else {
+        exponent += 0x70U;
+    }
+    const unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa);
+#if defined(__CUDACC__) || (!defined __cplusplus)
+    (void)memcpy(&f, &u, sizeof(u));
+#else
+    (void)std::memcpy(&f, &u, sizeof(u));
+#endif
+#endif /* (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) */
+    return f;
+}
+
+/*
+ * Widens the two half-precision bit patterns held in x.x and x.y to floats,
+ * member by member, using __internal_halfraw_to_float.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ float2
+__internal_halfraw2_to_float2(const __half2_raw x) {
+    __half_raw first;
+    first.x = x.x;
+    __half_raw second;
+    second.x = x.y;
+
+    float2 widened;
+    widened.x = __internal_halfraw_to_float(first);
+    widened.y = __internal_halfraw_to_float(second);
+    return widened;
+}
+
+/*
+ * Converts a half-precision bit pattern to an fp8 byte.
+ *
+ *   - When compiled for __CUDA_ARCH__ >= 890 and 'saturate' is
+ *     __NV_SATFINITE, the 16 input bits are zero-extended to 32 bits, a
+ *     packed cvt.rn.satfinite ...f16x2 PTX instruction is issued on that
+ *     word, and the 16-bit result is narrowed by the final cast.
+ *   - Otherwise the input is widened to float and passed to
+ *     __nv_cvt_float_to_fp8.
+ *
+ * The 'else' before the #endif binds to the braces after it.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
+__nv_cvt_halfraw_to_fp8(const __half_raw x, const __nv_saturation_t saturate,
+                        const __nv_fp8_interpretation_t fp8_interpretation) {
+    __nv_fp8_storage_t res = 0U;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
+    if (saturate == __NV_SATFINITE) {
+        // upper 16 bits of the packed operand are zero
+        unsigned int half2_storage = (unsigned int)(x.x);
+        __nv_fp8x2_storage_t tmp;
+        if (fp8_interpretation == __NV_E5M2) {
+            asm("{cvt.rn.satfinite.e5m2x2.f16x2 %0, %1;}\n"
+                : "=h"(tmp)
+                : "r"(half2_storage));
+        } else {
+            asm("{cvt.rn.satfinite.e4m3x2.f16x2 %0, %1;}\n"
+                : "=h"(tmp)
+                : "r"(half2_storage));
+        }
+        res = (__nv_fp8_storage_t)tmp;
+    } else
+#endif
+    {
+        float fx = __internal_halfraw_to_float(x);
+        res = __nv_cvt_float_to_fp8(fx, saturate, fp8_interpretation);
+    }
+    return res;
+}
+
+/*
+ * Converts a pair of half-precision bit patterns to a packed pair of fp8
+ * bytes.
+ *
+ *   - When compiled for __CUDA_ARCH__ >= 890 and 'saturate' is
+ *     __NV_SATFINITE, the object representation of 'x' is copied into a
+ *     32-bit word and converted by one packed cvt.rn.satfinite ...f16x2 PTX
+ *     instruction.
+ *   - Otherwise x.x and x.y are converted one at a time by
+ *     __nv_cvt_halfraw_to_fp8; the result for x.y is placed in bits 15..8 and
+ *     the result for x.x in bits 7..0.
+ *
+ * The 'else' before the #endif binds to the braces after it.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_halfraw2_to_fp8x2(
+    const __half2_raw x, const __nv_saturation_t saturate,
+    const __nv_fp8_interpretation_t fp8_interpretation) {
+    __nv_fp8x2_storage_t tmp;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
+    if (saturate == __NV_SATFINITE) {
+        unsigned int half2_storage;
+        // assumes sizeof(__half2_raw) == sizeof(unsigned int) - TODO confirm
+        // against the type's declaration
+        (void)memcpy(&half2_storage, &x, sizeof(x));
+
+        if (fp8_interpretation == __NV_E5M2) {
+            asm("{cvt.rn.satfinite.e5m2x2.f16x2 %0, %1;}\n"
+                : "=h"(tmp)
+                : "r"(half2_storage));
+        } else {
+            asm("{cvt.rn.satfinite.e4m3x2.f16x2 %0, %1;}\n"
+                : "=h"(tmp)
+                : "r"(half2_storage));
+        }
+    } else
+#endif
+    {
+        __half_raw raw;
+        raw.x = x.x;
+        __nv_fp8_storage_t lo =
+            __nv_cvt_halfraw_to_fp8(raw, saturate, fp8_interpretation);
+        raw.x = x.y;
+        __nv_fp8_storage_t hi =
+            __nv_cvt_halfraw_to_fp8(raw, saturate, fp8_interpretation);
+        // pack: hi byte first, shifted up, then lo byte merged underneath
+        tmp = hi;
+        tmp = (__nv_fp8x2_storage_t)(tmp << 8U);
+        tmp = (__nv_fp8x2_storage_t)(tmp | lo);
+    }
+    return tmp;
+}
+
+/*
+ * Widens a bfloat16 bit pattern to float: the 16 input bits are placed in the
+ * upper half of a 32-bit word (lower half zero) and that word is
+ * reinterpreted as a float.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ float
+__internal_bf16raw_to_float(const __nv_bfloat16_raw x) {
+    const unsigned int widened_bits = ((unsigned int)x.x) << 16U;
+    float result;
+#if defined(__cplusplus) && !defined(__CUDACC__)
+    (void)std::memcpy(&result, &widened_bits, sizeof(widened_bits));
+#else
+    (void)memcpy(&result, &widened_bits, sizeof(widened_bits));
+#endif
+    return result;
+}
+
+/*
+ * Converts a bfloat16 bit pattern to an fp8 byte by widening it to float and
+ * delegating to __nv_cvt_float_to_fp8 with the same 'saturate' and
+ * 'fp8_interpretation'.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_bfloat16raw_to_fp8(
+    const __nv_bfloat16_raw x, const __nv_saturation_t saturate,
+    const __nv_fp8_interpretation_t fp8_interpretation) {
+    return __nv_cvt_float_to_fp8(__internal_bf16raw_to_float(x), saturate,
+                                 fp8_interpretation);
+}
+
+/*
+ * Converts a pair of bfloat16 bit patterns to a packed pair of fp8 bytes.
+ * Each member goes through __nv_cvt_bfloat16raw_to_fp8; the result for x.y is
+ * placed in bits 15..8 and the result for x.x in bits 7..0.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
+__nv_cvt_bfloat16raw2_to_fp8x2(
+    const __nv_bfloat162_raw x, const __nv_saturation_t saturate,
+    const __nv_fp8_interpretation_t fp8_interpretation) {
+    __nv_bfloat16_raw upper_in;
+    upper_in.x = x.y;
+    const __nv_fp8_storage_t upper =
+        __nv_cvt_bfloat16raw_to_fp8(upper_in, saturate, fp8_interpretation);
+
+    __nv_bfloat16_raw lower_in;
+    lower_in.x = x.x;
+    const __nv_fp8_storage_t lower =
+        __nv_cvt_bfloat16raw_to_fp8(lower_in, saturate, fp8_interpretation);
+
+    __nv_fp8x2_storage_t packed = (__nv_fp8x2_storage_t)upper;
+    packed = (__nv_fp8x2_storage_t)(packed << 8U);
+    packed = (__nv_fp8x2_storage_t)(packed | lower);
+    return packed;
+}
+
+/* Declared ahead of its definition because __nv_cvt_fp8_to_halfraw below calls
+ * it in the __CUDA_ARCH__ >= 890 build. */
+__CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw
+__nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x,
+                           const __nv_fp8_interpretation_t fp8_interpretation);
+
+/*
+ * Converts one fp8 byte to a half-precision bit pattern.
+ *
+ *   - When compiled for __CUDA_ARCH__ >= 890 the byte is widened to the
+ *     packed storage type, converted by __nv_cvt_fp8x2_to_halfraw2, and
+ *     member .x of that result is returned.
+ *   - Otherwise the conversion is done with integer operations:
+ *       e5m2 (__NV_E5M2): the byte is moved to the upper 8 bits of the
+ *         16-bit result; any NaN pattern is replaced by 0x7FFF.
+ *       e4m3 (any other interpretation): NaN (low 7 bits all ones) becomes
+ *         0x7FFF; denormals are normalized; zero keeps its sign; normal
+ *         values get their exponent re-biased.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __half_raw
+__nv_cvt_fp8_to_halfraw(const __nv_fp8_storage_t x,
+                        const __nv_fp8_interpretation_t fp8_interpretation) {
+    __half_raw res;
+    res.x = 0U;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
+    res.x =
+        __nv_cvt_fp8x2_to_halfraw2((__nv_fp8x2_storage_t)x, fp8_interpretation)
+            .x;
+#else
+    // place the fp8 byte in the upper half of a 16-bit word
+    unsigned short int ur = (unsigned short int)x;
+    ur = (unsigned short int)(ur << 8U);
+
+    if (fp8_interpretation == __NV_E5M2) {
+        if ((ur & 0x7FFFU) > 0x7C00U) {
+            /* If NaN, return canonical NaN */
+            ur = 0x7FFFU;
+        }
+    } else { // __NV_E4M3
+        unsigned short int sign = ur & 0x8000U;
+        // 4-bit exponent moved down by one position into the 5-bit half
+        // exponent field, then re-biased by adding 8 (0x2000 == 8 << 10)
+        unsigned short int exponent =
+            (unsigned short int)(((ur & 0x7800U) >> 1U) + 0x2000U);
+        // 3-bit mantissa aligned to the top of the 10-bit half mantissa
+        unsigned short int mantissa = (ur & 0x0700U) >> 1U;
+        unsigned char absx = 0x7FU & (unsigned char)x;
+
+        if (absx == 0x7FU) // NaN
+        {
+            ur = 0x7FFFU; // fp16 canonical NaN, discard sign
+        } else if (exponent == 0x2000U) {
+            // zero or denormal
+            if (mantissa != 0U) {
+                // normalize
+                mantissa = (unsigned short int)(mantissa << 1U);
+                // shift until the leading one reaches bit 10, lowering the
+                // exponent field by one (0x0400) per extra shift
+                while ((mantissa & 0x0400U) == 0U) {
+                    mantissa = (unsigned short int)(mantissa << 1U);
+                    exponent = (unsigned short int)(exponent - 0x0400U);
+                }
+                // discard implicit leading bit
+                mantissa &= 0x03FFU;
+            } else { // Zero
+                exponent = 0U;
+            }
+
+            ur = (sign | exponent) | mantissa;
+        } else {
+            ur = (sign | exponent) | mantissa;
+        }
+    }
+    res.x = ur;
+#endif
+    return res;
+}
+
+/*
+ * Converts a packed pair of fp8 bytes to a pair of half-precision bit
+ * patterns.
+ *
+ *   - When compiled for __CUDA_ARCH__ >= 890 one packed cvt.rn.f16x2 PTX
+ *     instruction is issued and its 32-bit result is copied over 'res'.
+ *   - Otherwise each byte is converted by __nv_cvt_fp8_to_halfraw: the low
+ *     byte of 'x' produces res.x and the high byte produces res.y.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw
+__nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x,
+                           const __nv_fp8_interpretation_t fp8_interpretation) {
+    __half2_raw res;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
+    unsigned int half2_storage;
+    if (fp8_interpretation == __NV_E5M2) {
+        asm("{cvt.rn.f16x2.e5m2x2 %0, %1;}\n" : "=r"(half2_storage) : "h"(x));
+    } else {
+        asm("{cvt.rn.f16x2.e4m3x2 %0, %1;}\n" : "=r"(half2_storage) : "h"(x));
+    }
+    // assumes sizeof(__half2_raw) == sizeof(unsigned int) - TODO confirm
+    // against the type's declaration
+    (void)memcpy(&res, &half2_storage, sizeof(half2_storage));
+#else
+    // the cast to the one-byte storage type keeps the low byte of x
+    res.x =
+        __nv_cvt_fp8_to_halfraw((__nv_fp8_storage_t)x, fp8_interpretation).x;
+    res.y = __nv_cvt_fp8_to_halfraw((__nv_fp8_storage_t)(x >> 8U),
+                                    fp8_interpretation)
+                .x;
+#endif
+    return res;
+}
+
+/*
+ * Converts a bfloat16 bit pattern to an e8m0 scale byte.
+ *
+ *   x        - bfloat16 bit pattern to convert.
+ *   saturate - with __NV_SATFINITE, a non-NaN input whose result would be
+ *              0xFF is returned as 0xFE instead (fallback implementation).
+ *   rounding - must be cudaRoundZero or cudaRoundPosInf; this is asserted.
+ *
+ *   - In the guarded __CUDA_ARCH__ >= 1000 build a packed
+ *     cvt.{rz|rp}[.satfinite].ue8m0x2.bf16x2 PTX instruction is issued on the
+ *     zero-extended input and the 16-bit result is narrowed by the final
+ *     cast.
+ *   - Otherwise the result starts as the 8 exponent bits of the input (the
+ *     sign is dropped). With cudaRoundPosInf it is incremented when the
+ *     mantissa is non-zero and the magnitude is finite and above 2^-127.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__
+__nv_fp8_storage_t __nv_cvt_bfloat16raw_to_e8m0(const __nv_bfloat16_raw x,
+    const __nv_saturation_t saturate, const enum cudaRoundMode rounding)
+{
+    // Only these two rounding modes are implemented. Without this check an
+    // unsupported mode would silently produce 0 in the intrinsic path and be
+    // treated as round-to-zero in the fallback path. The other *_to_e8m0
+    // conversions in this file perform the same check.
+    assert((rounding == cudaRoundZero) || (rounding == cudaRoundPosInf));
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    unsigned short ures = 0U;
+    // upper 16 bits of the packed operand are zero
+    unsigned in = (unsigned)(x.x);
+    if ((rounding == cudaRoundZero) && (saturate == __NV_SATFINITE)) {
+        asm("{cvt.rz.satfinite.ue8m0x2.bf16x2 %0, %1;}\n"
+                : "=h"(ures)
+                : "r"(in));
+    } else if ((rounding == cudaRoundZero) && (saturate == __NV_NOSAT)) {
+        asm("{cvt.rz.ue8m0x2.bf16x2 %0, %1;}\n"
+                : "=h"(ures)
+                : "r"(in));
+    } else if ((rounding == cudaRoundPosInf) && (saturate == __NV_SATFINITE)) {
+        asm("{cvt.rp.satfinite.ue8m0x2.bf16x2 %0, %1;}\n"
+                : "=h"(ures)
+                : "r"(in));
+    } else if ((rounding == cudaRoundPosInf) && (saturate == __NV_NOSAT)) {
+        asm("{cvt.rp.ue8m0x2.bf16x2 %0, %1;}\n"
+                : "=h"(ures)
+                : "r"(in));
+    }
+    return (__nv_fp8_storage_t)ures;
+#else
+    // extract exponent bits, provides non-saturated result in RZ
+    // (the cast to unsigned char discards the sign bit)
+    __nv_fp8_storage_t res = (unsigned char)(x.x >> 7U);
+
+    if (rounding == cudaRoundPosInf) {
+        // round-up if mantissa non-zero and |x| > 2^-127 and finite
+        if ((x.x & 0x007FU) && ((x.x & 0x7FFFU) > 0x0040U) && ((x.x & 0x7FFFU) < 0x7F80U)) res++;
+    }
+
+    // Handle saturation of non-NaN large inputs to finite
+    if (saturate == __NV_SATFINITE) {
+        // non-NaN, Overflow --> Max
+        if (((x.x & 0x7FFFU) <= 0x7F80U) && (res == 0xFFU))
+        {
+            res--;
+        }
+    }
+    return res;
+#endif
+}
+
+/*
+ * Converts a pair of bfloat16 bit patterns to a packed pair of e8m0 scale
+ * bytes. 'rounding' must be cudaRoundZero or cudaRoundPosInf (asserted).
+ *
+ *   - In the guarded __CUDA_ARCH__ >= 1000 build the pair is packed into one
+ *     32-bit word (x.x in the low half, x.y in the high half) and converted
+ *     by one cvt.{rz|rp}[.satfinite].ue8m0x2.bf16x2 PTX instruction.
+ *   - Otherwise each member is converted by __nv_cvt_bfloat16raw_to_e8m0; the
+ *     result for x.y is placed in bits 15..8 and the result for x.x in bits
+ *     7..0.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__
+__nv_fp8x2_storage_t __nv_cvt_bfloat162raw_to_e8m0x2(const __nv_bfloat162_raw x,
+    const __nv_saturation_t saturate, const enum cudaRoundMode rounding)
+{
+    assert((rounding == cudaRoundZero) || (rounding == cudaRoundPosInf));
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    // stays 0 if none of the rounding/saturation combinations below matches
+    __nv_fp8x2_storage_t ures = 0U;
+    unsigned in = (unsigned)(x.x) | ((unsigned)(x.y) << (unsigned)16U);
+    if ((rounding == cudaRoundZero) && (saturate == __NV_SATFINITE)) {
+        asm("{cvt.rz.satfinite.ue8m0x2.bf16x2 %0, %1;}\n"
+                : "=h"(ures)
+                : "r"(in));
+    } else if ((rounding == cudaRoundZero) && (saturate == __NV_NOSAT)) {
+        asm("{cvt.rz.ue8m0x2.bf16x2 %0, %1;}\n"
+                : "=h"(ures)
+                : "r"(in));
+    } else if ((rounding == cudaRoundPosInf) && (saturate == __NV_SATFINITE)) {
+        asm("{cvt.rp.satfinite.ue8m0x2.bf16x2 %0, %1;}\n"
+                : "=h"(ures)
+                : "r"(in));
+    } else if ((rounding == cudaRoundPosInf) && (saturate == __NV_NOSAT)) {
+        asm("{cvt.rp.ue8m0x2.bf16x2 %0, %1;}\n"
+                : "=h"(ures)
+                : "r"(in));
+    }
+    return ures;
+#else
+    __nv_bfloat16_raw lo, hi;
+    lo.x = x.x;
+    hi.x = x.y;
+    // pack: hi byte first, shifted up, then lo byte merged underneath
+    __nv_fp8x2_storage_t ures = __nv_cvt_bfloat16raw_to_e8m0(hi, saturate, rounding);
+    ures <<= (unsigned short)8U;
+    ures |= __nv_cvt_bfloat16raw_to_e8m0(lo, saturate, rounding);
+    return ures;
+#endif
+}
+
+/*
+ * Returns the object representation of a float as an unsigned int, copied
+ * byte for byte (no numeric conversion takes place).
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ unsigned int __internal_fp8_float_as_uint(const float f)
+{
+    unsigned int bits;
+#if defined(__cplusplus) && !defined(__CUDACC__)
+    (void)::std::memcpy(&bits, &f, sizeof(f));
+#else
+    (void)memcpy(&bits, &f, sizeof(f));
+#endif
+    return bits;
+}
+
+/*
+ * Returns the float whose object representation equals the given unsigned
+ * int, copied byte for byte (no numeric conversion takes place).
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ float __internal_fp8_uint_as_float(const unsigned int u)
+{
+    float value;
+#if defined(__cplusplus) && !defined(__CUDACC__)
+    (void)::std::memcpy(&value, &u, sizeof(u));
+#else
+    (void)memcpy(&value, &u, sizeof(u));
+#endif
+    return value;
+}
+
+/*
+ * Converts a float to a bfloat16 bit pattern, rounding toward zero.
+ *
+ * The two alternatives are passed as arguments to NV_IF_ELSE_TARGET:
+ *   - where NV_PROVIDES_SM_80 holds, a cvt.rz.bf16.f32 PTX instruction;
+ *   - otherwise the upper 16 bits of the float's bit pattern are kept, except
+ *     that every NaN input yields 0x7FFF.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_bfloat16_raw
+__internal_float_to_bf16raw_rz(const float x) {
+    __nv_bfloat16_raw r;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    asm("{  cvt.rz.bf16.f32 %0, %1;}\n" : "=h"(r.x) : "f"(x));
+,
+    unsigned int ux = __internal_fp8_float_as_uint(x);
+    if ((ux & 0x7FFFFFFFU) > 0x7f800000U)
+    {
+        // NaN
+        r.x = (unsigned short int)0x7FFFU;
+    }
+    else
+    {
+        r.x = (unsigned short int)(ux >> 16U);
+    }
+)
+    return r;
+}
+
+/*
+ * Converts a float to a bfloat16 bit pattern, rounding toward +infinity.
+ *
+ * The two alternatives are passed as arguments to NV_IF_ELSE_TARGET:
+ *   - where NV_PROVIDES_SM_90 holds, a cvt.rp.bf16.f32 PTX instruction;
+ *   - otherwise, working on the float's bit pattern:
+ *       NaN                                     -> 0x7FFF
+ *       non-negative, below +Inf, and with any
+ *       of the low 16 bits set                  -> upper 16 bits plus one
+ *       everything else (negative values, +Inf,
+ *       values whose low 16 bits are all zero)  -> upper 16 bits unchanged
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_bfloat16_raw
+__internal_float_to_bf16raw_ru(const float x) {
+    __nv_bfloat16_raw r;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("{  cvt.rp.bf16.f32 %0, %1;}\n" : "=h"(r.x) : "f"(x));
+,
+    unsigned int ux = __internal_fp8_float_as_uint(x);
+    if ((ux & 0x7FFFFFFFU) > 0x7f800000U)
+    {
+        // NaN
+        r.x = (unsigned short int)0x7FFFU;
+    }
+    else if ((ux < 0x7f800000U) && ((ux & 0x0000FFFFU) != 0))
+    {
+        // 0 <= x < +inf, round-up
+        r.x = (unsigned short int)((ux >> 16U) + 1U);
+    }
+    else {
+        // truncate others
+        r.x = (unsigned short int)(ux >> 16U);
+    }
+)
+    return r;
+}
+
+/*
+ * Converts a float to an e8m0 scale byte. 'rounding' must be cudaRoundZero or
+ * cudaRoundPosInf (asserted).
+ *
+ *   - In the guarded __CUDA_ARCH__ >= 1000 build a packed
+ *     cvt.{rz|rp}[.satfinite].ue8m0x2.f32 PTX instruction is issued on the
+ *     pair (0.0f, x) and the 16-bit result is narrowed by the cast.
+ *   - Otherwise the float is first narrowed to bfloat16 and then converted by
+ *     __nv_cvt_bfloat16raw_to_e8m0:
+ *       cudaRoundZero   - narrowing rounds toward zero;
+ *       cudaRoundPosInf - the sign bit is cleared first, then narrowing
+ *                         rounds toward +infinity.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__
+__nv_fp8_storage_t __nv_cvt_float_to_e8m0(const float x,
+    const __nv_saturation_t saturate, const enum cudaRoundMode rounding)
+{
+    assert((rounding == cudaRoundZero) || (rounding == cudaRoundPosInf));
+    __nv_fp8_storage_t res = 0U;
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    // stays 0 if none of the rounding/saturation combinations below matches
+    unsigned short ures = 0U;
+    // "%2, %1": the constant 0.0f is listed first and x second
+    if ((rounding == cudaRoundZero) && (saturate == __NV_SATFINITE)) {
+        asm("{cvt.rz.satfinite.ue8m0x2.f32 %0, %2, %1;}\n"
+                : "=h"(ures)
+                : "f"(x), "f"(0.0f));
+    } else if ((rounding == cudaRoundZero) && (saturate == __NV_NOSAT)) {
+        asm("{cvt.rz.ue8m0x2.f32 %0, %2, %1;}\n"
+                : "=h"(ures)
+                : "f"(x), "f"(0.0f));
+    } else if ((rounding == cudaRoundPosInf) && (saturate == __NV_SATFINITE)) {
+        asm("{cvt.rp.satfinite.ue8m0x2.f32 %0, %2, %1;}\n"
+                : "=h"(ures)
+                : "f"(x), "f"(0.0f));
+    } else if ((rounding == cudaRoundPosInf) && (saturate == __NV_NOSAT)) {
+        asm("{cvt.rp.ue8m0x2.f32 %0, %2, %1;}\n"
+                : "=h"(ures)
+                : "f"(x), "f"(0.0f));
+    }
+    res = (__nv_fp8_storage_t)ures;
+#else
+    if (rounding == cudaRoundZero)
+    {
+        res = __nv_cvt_bfloat16raw_to_e8m0(__internal_float_to_bf16raw_rz(x), saturate, rounding);
+    }
+    else
+    {   //cudaRoundPosInf
+        // shifting the bit pattern left then right by one clears the sign bit
+        float absx = __internal_fp8_uint_as_float((__internal_fp8_float_as_uint(x) << (unsigned)1U) >> (unsigned)1U);
+        res = __nv_cvt_bfloat16raw_to_e8m0(__internal_float_to_bf16raw_ru(absx), saturate, rounding);
+    }
+#endif
+    return res;
+}
+
+/*
+ * Converts both members of a float2 to e8m0 scale bytes and returns them
+ * packed in one 16-bit value. 'rounding' must be cudaRoundZero or
+ * cudaRoundPosInf (asserted).
+ *
+ *   - In the guarded __CUDA_ARCH__ >= 1000 build one packed
+ *     cvt.{rz|rp}[.satfinite].ue8m0x2.f32 PTX instruction converts the pair
+ *     (x.y, x.x).
+ *   - Otherwise each member is converted by __nv_cvt_float_to_e8m0; the
+ *     result for x.y is placed in bits 15..8 and the result for x.x in bits
+ *     7..0.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__
+__nv_fp8x2_storage_t __nv_cvt_float2_to_e8m0x2(const float2 x,
+    const __nv_saturation_t saturate, const enum cudaRoundMode rounding)
+{
+    assert((rounding == cudaRoundZero) || (rounding == cudaRoundPosInf));
+    // stays 0 in the intrinsic build if no combination below matches
+    __nv_fp8x2_storage_t res = 0U;
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    // "%2, %1": x.y is listed first and x.x second in the instruction
+    if ((rounding == cudaRoundZero) && (saturate == __NV_SATFINITE)) {
+        asm("{cvt.rz.satfinite.ue8m0x2.f32 %0, %2, %1;}\n"
+                : "=h"(res)
+                : "f"(x.x), "f"(x.y));
+    } else if ((rounding == cudaRoundZero) && (saturate == __NV_NOSAT)) {
+        asm("{cvt.rz.ue8m0x2.f32 %0, %2, %1;}\n"
+                : "=h"(res)
+                : "f"(x.x), "f"(x.y));
+    } else if ((rounding == cudaRoundPosInf) && (saturate == __NV_SATFINITE)) {
+        asm("{cvt.rp.satfinite.ue8m0x2.f32 %0, %2, %1;}\n"
+                : "=h"(res)
+                : "f"(x.x), "f"(x.y));
+    } else if ((rounding == cudaRoundPosInf) && (saturate == __NV_NOSAT)) {
+        asm("{cvt.rp.ue8m0x2.f32 %0, %2, %1;}\n"
+                : "=h"(res)
+                : "f"(x.x), "f"(x.y));
+    }
+#else
+    // pack: x.y byte first, shifted up, then x.x byte merged underneath
+    res = __nv_cvt_float_to_e8m0(x.y, saturate, rounding);
+    res <<= (unsigned short)8U;
+    res |= __nv_cvt_float_to_e8m0(x.x, saturate, rounding);
+#endif
+    return res;
+}
+
+/*
+ * Narrows a double to a float in a way that keeps track of inexactness.
+ *
+ * The double is first converted with the default (or, in device code,
+ * explicit round-to-nearest) conversion. The float's bit pattern is then
+ * adjusted:
+ *   1. if the conversion moved the value away from zero, the pattern is
+ *      decremented by one, i.e. the magnitude is stepped back toward zero;
+ *   2. if the conversion was inexact and the converted value is not NaN, the
+ *      least significant bit of the pattern is set ("sticky" bit).
+ * Exactly representable inputs and NaN pass through unchanged.
+ * Presumably this lets a later directed rounding of the float see that
+ * low-order bits were lost - confirm against the callers' intent.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__
+float __internal_double_to_float_with_sticky(double x)
+{
+#if (defined __CUDA_ARCH__)
+    // protect from ftz in device code
+    float f;
+    double d;
+    asm("{  cvt.rn.f32.f64 %0, %1;}\n" : "=f"(f) : "d"(x));
+    asm("{  cvt.f64.f32 %0, %1;}\n" : "=d"(d) : "f"(f));
+#else
+    const float f = (float)x;
+    const double d = (double)f;
+#endif
+    unsigned int u = __internal_fp8_float_as_uint(f);
+    // evaluated on the converted float, before any adjustment of u:
+    // true when the magnitude bits do not exceed those of infinity
+    int x_is_not_nan = ((u << (unsigned)1U) <= (unsigned)0xFF000000U) ? 1 : 0;
+
+    // rounded up in magnitude: step one pattern back toward zero
+    // (for negative values the sign bit is set, so decrementing the pattern
+    // also reduces the magnitude)
+    if ((x > 0.0) && (d > x)) {
+        u--;
+    }
+    if ((x < 0.0) && (d < x)) {
+        u--;
+    }
+    // inexact conversion: record it in the lowest bit
+    // (d != x is also true for NaN, hence the extra guard)
+    if ((d != x) && (x_is_not_nan == 1)) {
+        u |= 1U;
+    }
+    return __internal_fp8_uint_as_float(u);
+}
+
+/*
+ * Converts a double to an e8m0 scale byte: the input is narrowed to float by
+ * __internal_double_to_float_with_sticky and the result is converted by
+ * __nv_cvt_float_to_e8m0 with the same 'saturate' and 'rounding'.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__
+__nv_fp8_storage_t __nv_cvt_double_to_e8m0(const double x,
+    const __nv_saturation_t saturate, const enum cudaRoundMode rounding)
+{
+    return __nv_cvt_float_to_e8m0(__internal_double_to_float_with_sticky(x),
+                                  saturate, rounding);
+}
+
+/*
+ * Converts both members of a double2 to e8m0 scale bytes: each member is
+ * narrowed to float by __internal_double_to_float_with_sticky and the pair is
+ * then converted by __nv_cvt_float2_to_e8m0x2 with the same 'saturate' and
+ * 'rounding'.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__
+__nv_fp8x2_storage_t __nv_cvt_double2_to_e8m0x2(const double2 x,
+    const __nv_saturation_t saturate, const enum cudaRoundMode rounding)
+{
+    float2 narrowed;
+    narrowed.x = __internal_double_to_float_with_sticky(x.x);
+    narrowed.y = __internal_double_to_float_with_sticky(x.y);
+
+    return __nv_cvt_float2_to_e8m0x2(narrowed, saturate, rounding);
+}
+
+/*
+ * Converts an e8m0 scale byte to a bfloat16 bit pattern.
+ *   0xFF  -> 0x7FFF (NaN)
+ *   0x00  -> 0x0040 (2^-127)
+ *   other -> the byte placed in bits 14..7 of the result
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__
+unsigned short __internal_e8m0_to_bf16(const __nv_fp8_storage_t x)
+{
+    if (x == 0xFFU) {
+        // NaN --> Canonical QNaN
+        return 0x7FFFU;
+    }
+    if (x == 0U) {
+        // 2^-127
+        return 0x0040U;
+    }
+    // shift bias exponent bits into place
+    const unsigned short shifted = ((unsigned short)x) << 7U;
+    return shifted;
+}
+
+/*
+ * Converts one e8m0 scale byte to a bfloat16 bit pattern.
+ *
+ *   - In the guarded __CUDA_ARCH__ >= 1000 build the byte is zero-extended to
+ *     16 bits, a packed cvt.rn.bf16x2.ue8m0x2 PTX instruction is issued, and
+ *     the low 16 bits of its 32-bit result are returned.
+ *   - Otherwise the conversion is done by __internal_e8m0_to_bf16.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__
+__nv_bfloat16_raw __nv_cvt_e8m0_to_bf16raw(const __nv_fp8_storage_t x)
+{
+    __nv_bfloat16_raw res;
+
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    unsigned short in = (unsigned short)x;
+    unsigned hr = 0U;
+    asm("{cvt.rn.bf16x2.ue8m0x2 %0, %1;}\n"
+                : "=r"(hr)
+                : "h"(in));
+
+    // keep only the low half of the packed result
+    res.x = (unsigned short)hr;
+#else
+    res.x = __internal_e8m0_to_bf16(x);
+#endif
+
+    return res;
+}
+
+/*
+ * Converts a packed pair of e8m0 scale bytes to a pair of bfloat16 bit
+ * patterns.
+ *
+ *   - In the guarded __CUDA_ARCH__ >= 1000 build one packed
+ *     cvt.rn.bf16x2.ue8m0x2 PTX instruction is issued; the low 16 bits of its
+ *     result become res.x and the high 16 bits become res.y.
+ *   - Otherwise each byte is converted by __internal_e8m0_to_bf16: the low
+ *     byte of 'x' produces res.x and the high byte produces res.y.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__
+__nv_bfloat162_raw __nv_cvt_e8m0x2_to_bf162raw(const __nv_fp8x2_storage_t x)
+{
+    __nv_bfloat162_raw res;
+
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    unsigned short in = (unsigned short)x;
+    unsigned hr = 0U;
+    asm("{cvt.rn.bf16x2.ue8m0x2 %0, %1;}\n"
+                : "=r"(hr)
+                : "h"(in));
+
+    res.x = (unsigned short)hr;
+    res.y = (unsigned short)(hr >> (unsigned)16U);
+#else
+
+    // the cast to the one-byte storage type keeps the low byte of x
+    res.x = __internal_e8m0_to_bf16((__nv_fp8_storage_t)x);
+    res.y = __internal_e8m0_to_bf16((__nv_fp8_storage_t)(x >> (unsigned short)8U));
+
+#endif
+
+    return res;
+}
+
+#endif /* !(defined __DOXYGEN_ONLY__) */
+
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+
+/**
+ * \defgroup CUDA_MATH_FP8_E5M2_STRUCT C++ struct for handling fp8 data type of e5m2 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+ * \brief __nv_fp8_e5m2 datatype
+ *
+ * \details This structure implements the datatype for handling
+ * \p fp8 floating-point numbers of \p e5m2 kind:
+ * with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(1) __nv_fp8_e5m2 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Storage variable contains the \p fp8 floating-point data.
+     */
+    __nv_fp8_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor by default.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8_e5m2() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Construct from wider FP types */
+    /* Note we do avoid constructor init-list because of special host/device
+     * compilation rules */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p __half data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const __half f) {
+        __x = __nv_cvt_halfraw_to_fp8(static_cast<__half_raw>(f),
+                                      __NV_SATFINITE, __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p __nv_bfloat16 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const __nv_bfloat16 f) {
+        __x = __nv_cvt_bfloat16raw_to_fp8(static_cast<__nv_bfloat16_raw>(f),
+                                          __NV_SATFINITE, __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p float data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const float f) {
+        __x = __nv_cvt_float_to_fp8(f, __NV_SATFINITE, __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p double data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const double f) {
+        __x = __nv_cvt_double_to_fp8(f, __NV_SATFINITE, __NV_E5M2);
+    }
+
+    /* Converts from integral */
+    /* Every integral constructor below first casts the value to float and
+     * then reuses the float constructor, copying out its storage byte. */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p short \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__
+    __nv_fp8_e5m2(const unsigned short int val) {
+        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const unsigned int val) {
+        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p long \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const unsigned long int val) {
+        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p long \p long \p int data type, relies on
+     * \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__
+    __nv_fp8_e5m2(const unsigned long long int val) {
+        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p short \p int data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const short int val) {
+        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p int data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const int val) {
+        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p long \p int data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const long int val) {
+        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p long \p long \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const long long int val) {
+        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening FP converts */
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p __half data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __half() const {
+        return static_cast<__half>(__nv_cvt_fp8_to_halfraw(__x, __NV_E5M2));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p float data type.
+     * Goes through the raw half representation of the stored value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float() const {
+        return __internal_halfraw_to_float(
+            __nv_cvt_fp8_to_halfraw(__x, __NV_E5M2));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p __nv_bfloat16 data type.
+     * Implemented on top of the \p float conversion operator.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __nv_bfloat16() const {
+        return __float2bfloat16_rz(float(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p double data type.
+     * Implemented on top of the \p float conversion operator.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator double() const {
+        return static_cast<double>(float(*this));
+    }
+
+    /* Convert to integral */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p char data type.
+     * Clamps negative and too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned char() const {
+        unsigned char i;
+        const float f = float(*this);
+        const unsigned char max_val = 0xFFU;
+        const unsigned char min_val = 0U;
+        const unsigned char bits = (*this).__x;
+        // saturation fixup
+        // The 0x7F mask drops the sign bit; the remaining magnitude bits are
+        // compared against 0x7C to detect the NaN encodings.
+        if ((bits & 0x7FU) > 0x7CU) {
+            // NaN
+            i = 0;
+        } else if (f > static_cast<float>(max_val)) {
+            // saturate maximum
+            i = max_val;
+        } else if (f < static_cast<float>(min_val)) {
+            // saturate minimum
+            i = min_val;
+        } else {
+            // normal value
+            i = static_cast<unsigned char>(f);
+        }
+        return i;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p short \p int data type.
+     * Clamps negative and too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned short int() const {
+        return __half2ushort_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p int data type.
+     * Clamps negative and too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned int() const {
+        return __half2uint_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p long \p int data type.
+     * Clamps negative and too large inputs to the output range.
+     * \p NaN inputs convert to \p zero if output type is 32-bit.
+     * \p NaN inputs convert to \p 0x8000000000000000ULL if output type is 64-bit.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long int() const {
+        unsigned long retval;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        /* Pick the 64-bit or the 32-bit conversion depending on how wide
+         * unsigned long is on the target platform. */
+        if (sizeof(unsigned long) == sizeof(unsigned long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            retval = static_cast<unsigned long>(__half2ull_rz(__half(*this)));
+        }
+        else
+        {
+            retval = static_cast<unsigned long>(__half2uint_rz(__half(*this)));
+        }
+        return retval;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p long \p long \p int data type.
+     * Clamps negative and too large inputs to the output range.
+     * \p NaN inputs convert to \p 0x8000000000000000ULL.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long long int() const {
+        return __half2ull_rz(__half(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p signed \p char data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator signed char() const {
+        signed char i;
+        const float f = float(*this);
+        const signed char max_val = (signed char)0x7FU;
+        const signed char min_val = (signed char)0x80U;
+        const unsigned char bits = (*this).__x;
+        // saturation fixup
+        // The 0x7F mask drops the sign bit; the remaining magnitude bits are
+        // compared against 0x7C to detect the NaN encodings.
+        if ((bits & 0x7FU) > 0x7CU) {
+            // NaN
+            i = 0;
+        } else if (f > static_cast<float>(max_val)) {
+            // saturate maximum
+            i = max_val;
+        } else if (f < static_cast<float>(min_val)) {
+            // saturate minimum
+            i = min_val;
+        } else {
+            // normal value
+            i = static_cast<signed char>(f);
+        }
+        return i;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to an implementation defined \p char data type.
+     *
+     * Detects signedness of the \p char type and proceeds accordingly, see
+     * further details in signed and unsigned char operators.
+     *
+     * Clamps inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator char() const {
+        char value;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        /* (char)-1 compares below zero only when plain char is signed. */
+        if (((char)-1) < (char)0)
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            value = static_cast<char>(static_cast<signed char>(*this));
+        }
+        else
+        {
+            value = static_cast<char>(static_cast<unsigned char>(*this));
+        }
+        return value;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p short \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator short int() const {
+        return __half2short_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator int() const {
+        return __half2int_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p long \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero if output type is 32-bit.
+     * \p NaN inputs convert to \p 0x8000000000000000LL if output type is 64-bit.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator long int() const {
+        long retval;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        /* Pick the 64-bit or the 32-bit conversion depending on how wide
+         * long is on the target platform. */
+        if (sizeof(long) == sizeof(long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            retval = static_cast<long>(__half2ll_rz(__half(*this)));
+        }
+        else
+        {
+            retval = static_cast<long>(__half2int_rz(__half(*this)));
+        }
+        return retval;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p long \p long \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p 0x8000000000000000LL.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator long long int() const {
+        return __half2ll_rz(__half(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p bool data type.
+     * +0 and -0 inputs convert to \p false.
+     * Non-zero inputs convert to \p true.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator bool() const {
+        /* The 0x7F mask ignores the sign bit, so -0 tests as zero too. */
+        return (__x & 0x7FU) != 0U;
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP8X2_E5M2_STRUCT C++ struct for handling vector type of two fp8 values of e5m2 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
+ * \brief __nv_fp8x2_e5m2 datatype
+ *
+ * \details This structure implements the datatype for handling two
+ * \p fp8 floating-point numbers of \p e5m2 kind each:
+ * with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(2) __nv_fp8x2_e5m2 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
+     * Packed storage for the two \p fp8 floating-point values held by this
+     * vector.
+     */
+    __nv_fp8x2_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Default constructor.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8x2_e5m2() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Converting constructors from wider two-element vector types */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the pair from a \p __half2 value; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const __half2 f) {
+        const __half2_raw raw_halves = static_cast<__half2_raw>(f);
+        __x = __nv_cvt_halfraw2_to_fp8x2(raw_halves, __NV_SATFINITE,
+                                         __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the pair from a \p __nv_bfloat162 value; out-of-range inputs
+     * follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const __nv_bfloat162 f) {
+        const __nv_bfloat162_raw raw_bfloats =
+            static_cast<__nv_bfloat162_raw>(f);
+        __x = __nv_cvt_bfloat16raw2_to_fp8x2(raw_bfloats, __NV_SATFINITE,
+                                             __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the pair from a \p float2 value; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const float2 f) {
+        const __nv_fp8x2_storage_t packed =
+            __nv_cvt_float2_to_fp8x2(f, __NV_SATFINITE, __NV_E5M2);
+        __x = packed;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the pair from a \p double2 value; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const double2 f) {
+        const __nv_fp8x2_storage_t packed =
+            __nv_cvt_double2_to_fp8x2(f, __NV_SATFINITE, __NV_E5M2);
+        __x = packed;
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Conversion operators to wider two-element vector types */
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Converts the stored pair to a \p __half2 value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __half2() const {
+        return static_cast<__half2>(
+            __nv_cvt_fp8x2_to_halfraw2(__x, __NV_E5M2));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Converts the stored pair to a \p float2 value, going through the raw
+     * half2 representation.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float2() const {
+        const float2 widened = __internal_halfraw2_to_float2(
+            __nv_cvt_fp8x2_to_halfraw2(__x, __NV_E5M2));
+        return widened;
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+
+__CUDA_HOSTDEVICE_FP8_DECL__ unsigned int
+__internal_pack_u16x2_to_u32(const unsigned short int src_lo,
+                             const unsigned short int src_hi) {
+    // Combines two 16-bit values into a single 32-bit word.
+    unsigned int packed;
+#if (defined __CUDACC__) && (defined __CUDA_ARCH__)
+    // Device compilation: a single PTX move packs the pair.
+    asm("{  mov.b32 %0, {%1,%2};}\n" : "=r"(packed) : "h"(src_lo), "h"(src_hi));
+#else
+    // Host compilation: src_hi fills bits 16..31, src_lo fills bits 0..15.
+    const unsigned int upper = static_cast<unsigned int>(src_hi) << 16U;
+    const unsigned int lower = static_cast<unsigned int>(src_lo);
+    packed = upper | lower;
+#endif
+    return packed;
+}
+
+/**
+ * \defgroup CUDA_MATH_FP8X4_E5M2_STRUCT C++ struct for handling vector type of four fp8 values of e5m2 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
+ * \brief __nv_fp8x4_e5m2 datatype
+ *
+ * \details This structure implements the datatype for handling four
+ * \p fp8 floating-point numbers of \p e5m2 kind each:
+ * with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(4) __nv_fp8x4_e5m2 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
+     * Packed storage for the four \p fp8 floating-point values held by this
+     * vector.
+     */
+    __nv_fp8x4_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Default constructor.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8x4_e5m2() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Converting constructors from wider types. Each one converts the input
+     * as two pairs and packs the two 16-bit results into the 32-bit storage,
+     * first pair in the low half. */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the vector from two \p __half2 values; out-of-range inputs
+     * follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const __half2 flo,
+                                                     const __half2 fhi) {
+        const __half2_raw raw_lo = static_cast<__half2_raw>(flo);
+        const __half2_raw raw_hi = static_cast<__half2_raw>(fhi);
+        const __nv_fp8x2_storage_t pair_lo =
+            __nv_cvt_halfraw2_to_fp8x2(raw_lo, __NV_SATFINITE, __NV_E5M2);
+        const __nv_fp8x2_storage_t pair_hi =
+            __nv_cvt_halfraw2_to_fp8x2(raw_hi, __NV_SATFINITE, __NV_E5M2);
+        __x = __internal_pack_u16x2_to_u32(pair_lo, pair_hi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the vector from two \p __nv_bfloat162 values; out-of-range
+     * inputs follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const __nv_bfloat162 flo,
+                                                     const __nv_bfloat162 fhi) {
+        const __nv_bfloat162_raw raw_lo = static_cast<__nv_bfloat162_raw>(flo);
+        const __nv_bfloat162_raw raw_hi = static_cast<__nv_bfloat162_raw>(fhi);
+        const __nv_fp8x2_storage_t pair_lo =
+            __nv_cvt_bfloat16raw2_to_fp8x2(raw_lo, __NV_SATFINITE, __NV_E5M2);
+        const __nv_fp8x2_storage_t pair_hi =
+            __nv_cvt_bfloat16raw2_to_fp8x2(raw_hi, __NV_SATFINITE, __NV_E5M2);
+        __x = __internal_pack_u16x2_to_u32(pair_lo, pair_hi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the vector from a \p float4 value; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const float4 f) {
+        // (x, y) form the low pair, (z, w) the high pair.
+        const float2 first_two = {f.x, f.y};
+        const float2 last_two = {f.z, f.w};
+        const __nv_fp8x2_storage_t pair_lo =
+            __nv_cvt_float2_to_fp8x2(first_two, __NV_SATFINITE, __NV_E5M2);
+        const __nv_fp8x2_storage_t pair_hi =
+            __nv_cvt_float2_to_fp8x2(last_two, __NV_SATFINITE, __NV_E5M2);
+        __x = __internal_pack_u16x2_to_u32(pair_lo, pair_hi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the vector from a \p double4 value; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const double4 f) {
+        // (x, y) form the low pair, (z, w) the high pair.
+        const double2 first_two = {f.x, f.y};
+        const double2 last_two = {f.z, f.w};
+        const __nv_fp8x2_storage_t pair_lo =
+            __nv_cvt_double2_to_fp8x2(first_two, __NV_SATFINITE, __NV_E5M2);
+        const __nv_fp8x2_storage_t pair_hi =
+            __nv_cvt_double2_to_fp8x2(last_two, __NV_SATFINITE, __NV_E5M2);
+        __x = __internal_pack_u16x2_to_u32(pair_lo, pair_hi);
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Conversion operators to wider types */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Converts the stored values to a \p float4 vector.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float4() const {
+        // Split the 32-bit storage into its low and high 16-bit pairs.
+        const __nv_fp8x2_storage_t pair_lo =
+            static_cast<__nv_fp8x2_storage_t>(__x);
+        const __nv_fp8x2_storage_t pair_hi =
+            static_cast<__nv_fp8x2_storage_t>(__x >> 16U);
+
+        // Widen each pair through the raw half2 representation.
+        const float2 wide_lo = __internal_halfraw2_to_float2(
+            __nv_cvt_fp8x2_to_halfraw2(pair_lo, __NV_E5M2));
+        const float2 wide_hi = __internal_halfraw2_to_float2(
+            __nv_cvt_fp8x2_to_halfraw2(pair_hi, __NV_E5M2));
+
+        float4 widened;
+        widened.x = wide_lo.x;
+        widened.y = wide_lo.y;
+        widened.z = wide_hi.x;
+        widened.w = wide_hi.y;
+        return widened;
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP8_E4M3_STRUCT C++ struct for handling fp8 data type of e4m3 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+ * \brief __nv_fp8_e4m3 datatype
+ *
+ * \details This structure implements the datatype for storing
+ * \p fp8 floating-point numbers of \p e4m3 kind:
+ * with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits.
+ * The encoding doesn't support Infinity.
+ * NaNs are limited to 0x7F and 0xFF values.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(1) __nv_fp8_e4m3 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Storage variable contains the \p fp8 floating-point data.
+     */
+    __nv_fp8_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor by default.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8_e4m3() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Construct from wider FP types */
+    /* Note we do avoid constructor init-list because of special host/device
+     * compilation rules */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p __half data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const __half f) {
+        __x = __nv_cvt_halfraw_to_fp8(static_cast<__half_raw>(f),
+                                      __NV_SATFINITE, __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p __nv_bfloat16 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const __nv_bfloat16 f) {
+        __x = __nv_cvt_bfloat16raw_to_fp8(static_cast<__nv_bfloat16_raw>(f),
+                                          __NV_SATFINITE, __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p float data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const float f) {
+        __x = __nv_cvt_float_to_fp8(f, __NV_SATFINITE, __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p double data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const double f) {
+        __x = __nv_cvt_double_to_fp8(f, __NV_SATFINITE, __NV_E4M3);
+    }
+
+    /* Converts from integral */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p short \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__
+    __nv_fp8_e4m3(const unsigned short int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const unsigned int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p long \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const unsigned long int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p long \p long \p int data type, relies on
+     * \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__
+    __nv_fp8_e4m3(const unsigned long long int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p short \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const short int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p int data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p long \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const long int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p long \p long \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const long long int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening FP converts */
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p __half data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __half() const {
+        return static_cast<__half>(__nv_cvt_fp8_to_halfraw(__x, __NV_E4M3));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p float data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float() const {
+        return __internal_halfraw_to_float(
+            __nv_cvt_fp8_to_halfraw(__x, __NV_E4M3));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p __nv_bfloat16 data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __nv_bfloat16() const {
+        return __float2bfloat16_rz(float(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p double data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator double() const {
+        return static_cast<double>(float(*this));
+    }
+
+    /* Convert to integral */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p char data type.
+     * Clamps negative and too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned char() const {
+        unsigned char i;
+        const float f = float(*this);
+        const unsigned char max_val = 0xFFU;
+        const unsigned char min_val = 0U;
+        const unsigned char bits = (*this).__x;
+        // saturation fixup
+        if ((bits & 0x7FU) == 0x7FU) {
+            // NaN
+            i = 0;
+        } else if (f > static_cast<float>(max_val)) {
+            // saturate maximum
+            i = max_val;
+        } else if (f < static_cast<float>(min_val)) {
+            // saturate minimum
+            i = min_val;
+        } else {
+            // normal value
+            i = static_cast<unsigned char>(f);
+        }
+        return i;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p short \p int data type.
+     * Clamps negative inputs to zero.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned short int() const {
+        return __half2ushort_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p int data type.
+     * Clamps negative inputs to zero.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned int() const {
+        return __half2uint_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p long \p int data type.
+     * Clamps negative and too large inputs to the output range.
+     * \p NaN inputs convert to \p zero if output type is 32-bit.
+     * \p NaN inputs convert to \p 0x8000000000000000ULL if output type is 64-bit.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long int() const {
+        unsigned long retval;
+        /* Suppress VS warning: warning C4127: conditional expression is constant.
+         * The sizeof comparison below is a constant expression, which is what
+         * the warning reports; the pragmas are emitted only when _MSC_VER is
+         * defined and __CUDA_ARCH__ is not. */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(unsigned long) == sizeof(unsigned long long)) // picks the conversion matching the width of unsigned long
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            retval = static_cast<unsigned long>(__half2ull_rz(__half(*this))); // unsigned long as wide as unsigned long long
+        }
+        else
+        {
+            retval = static_cast<unsigned long>(__half2uint_rz(__half(*this))); // narrower unsigned long: use the unsigned int conversion
+        }
+        return retval;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p long \p long \p int data type.
+     * Clamps negative inputs to zero.
+     * \p NaN inputs convert to \p 0x8000000000000000ULL.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long long int() const {
+        // widen to __half, then let the half intrinsic do the integer conversion
+        const __half widened = __half(*this);
+        return __half2ull_rz(widened);
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p signed \p char data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator signed char() const {
+        // NaN: every bit below the sign bit is set
+        const unsigned char raw = (*this).__x;
+        if ((raw & 0x7FU) == 0x7FU) {
+            return 0;
+        }
+        const signed char upper = (signed char)0x7FU;
+        const signed char lower = (signed char)0x80U;
+        const float widened = float(*this);
+        // clamp anything above the output range
+        if (widened > static_cast<float>(upper)) {
+            return upper;
+        }
+        // clamp anything below the output range
+        if (widened < static_cast<float>(lower)) {
+            return lower;
+        }
+        // value fits: plain float-to-integer conversion
+        return static_cast<signed char>(widened);
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to an implementation defined \p char data type.
+     * 
+     * Detects signedness of the \p char type and proceeds accordingly, see
+     * further details in signed and unsigned char operators.
+     *
+     * Clamps inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator char() const {
+        char value;
+        /* Suppress VS warning: warning C4127: conditional expression is constant.
+         * The signedness probe below is a constant expression, which is what
+         * the warning reports; the pragmas are emitted only when _MSC_VER is
+         * defined and __CUDA_ARCH__ is not. */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (((char)-1) < (char)0) // true only when plain char is a signed type
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            value = static_cast<char>(static_cast<signed char>(*this)); // reuse the signed char conversion operator
+        }
+        else
+        {
+            value = static_cast<char>(static_cast<unsigned char>(*this)); // reuse the unsigned char conversion operator
+        }
+        return value;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p short \p int data type.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator short int() const {
+        // widen to __half, then let the half intrinsic do the integer conversion
+        const __half widened = __half(*this);
+        return __half2short_rz(widened);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p int data type.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator int() const {
+        // widen to __half, then let the half intrinsic do the integer conversion
+        const __half widened = __half(*this);
+        return __half2int_rz(widened);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p long \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero if output type is 32-bit.
+     * \p NaN inputs convert to \p 0x8000000000000000LL if output type is 64-bit.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator long int() const {
+        long retval;
+        /* Suppress VS warning: warning C4127: conditional expression is constant.
+         * The sizeof comparison below is a constant expression, which is what
+         * the warning reports; the pragmas are emitted only when _MSC_VER is
+         * defined and __CUDA_ARCH__ is not. */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(long) == sizeof(long long)) // picks the conversion matching the width of long
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            retval = static_cast<long>(__half2ll_rz(__half(*this))); // long as wide as long long
+        }
+        else
+        {
+            retval = static_cast<long>(__half2int_rz(__half(*this))); // narrower long: use the int conversion
+        }
+        return retval;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p long \p long \p int data type.
+     * \p NaN inputs convert to \p 0x8000000000000000LL.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator long long int() const {
+        // widen to __half, then let the half intrinsic do the integer conversion
+        const __half widened = __half(*this);
+        return __half2ll_rz(widened);
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p bool data type.
+     * +0 and -0 inputs convert to \p false.
+     * Non-zero inputs convert to \p true.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator bool() const {
+        // mask off the sign bit so that +0 and -0 are treated alike
+        const bool is_zero = ((__x & 0x7FU) == 0U);
+        return !is_zero;
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP8X2_E4M3_STRUCT C++ struct for handling vector type of two fp8 values of e4m3 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
+ * \brief __nv_fp8x2_e4m3 datatype
+ *
+ * \details This structure implements the datatype for storage
+ * and operations on the vector of two \p fp8 values of \p e4m3 kind each:
+ * with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits.
+ * The encoding doesn't support Infinity.
+ * NaNs are limited to 0x7F and 0xFF values.
+ */
+struct __CUDA_ALIGN__(2) __nv_fp8x2_e4m3 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
+     * Packed storage for the pair of \p fp8 floating-point values held by
+     * this vector.
+     */
+    __nv_fp8x2_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Default constructor. Neither variant assigns a value to \p __x.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8x2_e4m3() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Constructors from wider vector types */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the pair from a \p __half2 value; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const __half2 f) {
+        const __half2_raw raw_pair = static_cast<__half2_raw>(f);
+        __x = __nv_cvt_halfraw2_to_fp8x2(raw_pair, __NV_SATFINITE, __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the pair from a \p __nv_bfloat162 value; out-of-range inputs
+     * follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const __nv_bfloat162 f) {
+        const __nv_bfloat162_raw raw_pair = static_cast<__nv_bfloat162_raw>(f);
+        __x = __nv_cvt_bfloat16raw2_to_fp8x2(raw_pair, __NV_SATFINITE,
+                                             __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the pair from a \p float2 value; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const float2 f) {
+        this->__x = __nv_cvt_float2_to_fp8x2(f, __NV_SATFINITE, __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the pair from a \p double2 value; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const double2 f) {
+        this->__x = __nv_cvt_double2_to_fp8x2(f, __NV_SATFINITE, __NV_E4M3);
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Conversions back to wider vector types */
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Converts the stored pair to a \p __half2 value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __half2() const {
+        const __half2_raw widened = __nv_cvt_fp8x2_to_halfraw2(__x, __NV_E4M3);
+        return static_cast<__half2>(widened);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Converts the stored pair to a \p float2 value, going through the raw
+     * half-precision pair.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float2() const {
+        const __half2_raw widened = __nv_cvt_fp8x2_to_halfraw2(__x, __NV_E4M3);
+        return __internal_halfraw2_to_float2(widened);
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP8X4_E4M3_STRUCT C++ struct for handling vector type of four fp8 values of e4m3 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
+ * \brief __nv_fp8x4_e4m3 datatype
+ *
+ * \details This structure implements the datatype for storage
+ * and operations on the vector of four \p fp8 values of \p e4m3 kind each:
+ * with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits.
+ * The encoding doesn't support Infinity.
+ * NaNs are limited to 0x7F and 0xFF values.
+ */
+struct __CUDA_ALIGN__(4) __nv_fp8x4_e4m3 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
+     * Packed storage for the four \p fp8 floating-point values held by
+     * this vector.
+     */
+    __nv_fp8x4_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Default constructor. Neither variant assigns a value to \p __x.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8x4_e4m3() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Constructors from wider vector types */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the vector from two \p __half2 values: \p flo is converted
+     * into the first packing argument and \p fhi into the second.
+     * Out-of-range inputs follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const __half2 flo,
+                                                     const __half2 fhi) {
+        const __half2_raw raw_lo = static_cast<__half2_raw>(flo);
+        const __half2_raw raw_hi = static_cast<__half2_raw>(fhi);
+        const __nv_fp8x2_storage_t lower_pair =
+            __nv_cvt_halfraw2_to_fp8x2(raw_lo, __NV_SATFINITE, __NV_E4M3);
+        const __nv_fp8x2_storage_t upper_pair =
+            __nv_cvt_halfraw2_to_fp8x2(raw_hi, __NV_SATFINITE, __NV_E4M3);
+        __x = __internal_pack_u16x2_to_u32(lower_pair, upper_pair);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the vector from two \p __nv_bfloat162 values: \p flo is
+     * converted into the first packing argument and \p fhi into the second.
+     * Out-of-range inputs follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const __nv_bfloat162 flo,
+                                                     const __nv_bfloat162 fhi) {
+        const __nv_bfloat162_raw raw_lo = static_cast<__nv_bfloat162_raw>(flo);
+        const __nv_bfloat162_raw raw_hi = static_cast<__nv_bfloat162_raw>(fhi);
+        const __nv_fp8x2_storage_t lower_pair =
+            __nv_cvt_bfloat16raw2_to_fp8x2(raw_lo, __NV_SATFINITE, __NV_E4M3);
+        const __nv_fp8x2_storage_t upper_pair =
+            __nv_cvt_bfloat16raw2_to_fp8x2(raw_hi, __NV_SATFINITE, __NV_E4M3);
+        __x = __internal_pack_u16x2_to_u32(lower_pair, upper_pair);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the vector from a \p float4 value: components \p x, \p y are
+     * converted as one pair and \p z, \p w as the other.
+     * Out-of-range inputs follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const float4 f) {
+        const float2 lower_in = {f.x, f.y};
+        const float2 upper_in = {f.z, f.w};
+        const __nv_fp8x2_storage_t lower_pair =
+            __nv_cvt_float2_to_fp8x2(lower_in, __NV_SATFINITE, __NV_E4M3);
+        const __nv_fp8x2_storage_t upper_pair =
+            __nv_cvt_float2_to_fp8x2(upper_in, __NV_SATFINITE, __NV_E4M3);
+        __x = __internal_pack_u16x2_to_u32(lower_pair, upper_pair);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the vector from a \p double4 value: components \p x, \p y are
+     * converted as one pair and \p z, \p w as the other.
+     * Out-of-range inputs follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const double4 f) {
+        const double2 lower_in = {f.x, f.y};
+        const double2 upper_in = {f.z, f.w};
+        const __nv_fp8x2_storage_t lower_pair =
+            __nv_cvt_double2_to_fp8x2(lower_in, __NV_SATFINITE, __NV_E4M3);
+        const __nv_fp8x2_storage_t upper_pair =
+            __nv_cvt_double2_to_fp8x2(upper_in, __NV_SATFINITE, __NV_E4M3);
+        __x = __internal_pack_u16x2_to_u32(lower_pair, upper_pair);
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Conversions back to wider vector types */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Converts the stored values to a \p float4 value. The low 16 bits of
+     * the storage produce components \p x, \p y; the bits shifted down by
+     * 16 produce components \p z, \p w.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float4() const {
+        const __nv_fp8x2_storage_t lower_bits =
+            static_cast<__nv_fp8x2_storage_t>(__x);
+        const __nv_fp8x2_storage_t upper_bits =
+            static_cast<__nv_fp8x2_storage_t>(__x >> 16U);
+        const float2 lower_out = __internal_halfraw2_to_float2(
+            __nv_cvt_fp8x2_to_halfraw2(lower_bits, __NV_E4M3));
+        const float2 upper_out = __internal_halfraw2_to_float2(
+            __nv_cvt_fp8x2_to_halfraw2(upper_bits, __NV_E4M3));
+        float4 unpacked = {lower_out.x, lower_out.y, upper_out.x, upper_out.y};
+        return unpacked;
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+
+/**
+ * \ingroup CUDA_MATH_FP8_E8M0_STRUCT
+ * \brief __nv_fp8_e8m0 datatype
+ *
+ * \details This structure implements the datatype for handling
+ * 8-bit scale factors of \p e8m0 kind: interpreted as powers of two
+ * with biased exponent. Bias equals to 127, so numbers 0 through 254
+ * represent 2^-127 through 2^127. Number \p 0xFF = 255 is reserved
+ * for NaN.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(1) __nv_fp8_e8m0 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8_E8M0_STRUCT
+     * Storage variable contains the 8-bit scale data.
+     */
+    __nv_fp8_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor by default. Neither variant assigns a value to \p __x.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8_e8m0() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e8m0() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Construct from wider FP types */
+    /* Note we do avoid constructor init-list because of special host/device
+     * compilation rules */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p __half data type, relies on \p __NV_SATFINITE
+     * behavior for large input values and \p cudaRoundZero for
+     * rounding. The input is first widened to \p float.
+     * \see __nv_cvt_float_to_e8m0 for further details
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e8m0(const __half f) {
+        __x = __nv_cvt_float_to_e8m0(__internal_halfraw_to_float(static_cast<__half_raw>(f)),
+                                     __NV_SATFINITE, cudaRoundZero);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p __nv_bfloat16 data type, relies on \p __NV_SATFINITE
+     * behavior for large input values and \p cudaRoundZero for
+     * rounding.
+     * \see __nv_cvt_bfloat16raw_to_e8m0 for further details
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e8m0(const __nv_bfloat16 f) {
+        __x = __nv_cvt_bfloat16raw_to_e8m0(static_cast<__nv_bfloat16_raw>(f),
+                                           __NV_SATFINITE, cudaRoundZero);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p float data type, relies on \p __NV_SATFINITE
+     * behavior for large input values and \p cudaRoundZero for
+     * rounding.
+     * \see __nv_cvt_float_to_e8m0 for further details
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e8m0(const float f) {
+        __x = __nv_cvt_float_to_e8m0(f, __NV_SATFINITE, cudaRoundZero);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p double data type, relies on \p __NV_SATFINITE
+     * behavior for large input values and \p cudaRoundZero for
+     * rounding.
+     * \see __nv_cvt_double_to_e8m0 for further details
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e8m0(const double f) {
+        __x = __nv_cvt_double_to_e8m0(f, __NV_SATFINITE, cudaRoundZero);
+    }
+
+    /* Converts from integral */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p short \p int data type, relies on
+     * \p cudaRoundZero rounding.
+     * Widens \p val to \p float and delegates to the \p float constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__
+    __nv_fp8_e8m0(const unsigned short int val) {
+        __x = static_cast<__nv_fp8_e8m0>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p int data type, relies on
+     * \p cudaRoundZero rounding.
+     * Widens \p val to \p double and delegates to the \p double constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e8m0(const unsigned int val) {
+        __x = static_cast<__nv_fp8_e8m0>(static_cast<double>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p long \p long \p int data type, relies on
+     * \p cudaRoundZero rounding.
+     * The value is converted to \p __nv_bfloat16 with round-to-nearest and
+     * converted back to an integer; if the round trip exceeds \p val, the raw
+     * bfloat16 bits are decremented by one before the final e8m0 conversion.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__
+    __nv_fp8_e8m0(const unsigned long long int val) {
+        __nv_bfloat16 rn = __ull2bfloat16_rn(val);
+        __nv_bfloat16_raw rn_raw = static_cast<__nv_bfloat16_raw>(rn);
+        unsigned long long int back_int = __bfloat162ull_rz(rn);
+        if (back_int > val) // NOTE(review): if the back-conversion saturates, val == max unsigned value compares equal here although rn rounded up - confirm
+        {
+            rn_raw.x--; // undo the round-up: presumably the adjacent encoding of smaller magnitude
+        }
+        __x = __nv_cvt_bfloat16raw_to_e8m0(rn_raw, __NV_SATFINITE, cudaRoundZero);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p long \p int data type, relies on
+     * \p cudaRoundZero rounding.
+     * Widens \p val to \p unsigned \p long \p long \p int and delegates to
+     * that constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e8m0(const unsigned long int val) {
+        __x = static_cast<__nv_fp8_e8m0>(static_cast<unsigned long long int>(val)).__x;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p short \p int data type, relies on
+     * \p cudaRoundZero rounding.
+     * Widens \p val to \p float and delegates to the \p float constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e8m0(const short int val) {
+        __x = static_cast<__nv_fp8_e8m0>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p int data type, relies on
+     * \p cudaRoundZero rounding.
+     * Widens \p val to \p double and delegates to the \p double constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e8m0(const int val) {
+        __x = static_cast<__nv_fp8_e8m0>(static_cast<double>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p long \p long \p int data type, relies on
+     * \p cudaRoundZero rounding.
+     * The value is converted to \p __nv_bfloat16 with round-to-nearest and
+     * converted back to an integer; if the round trip moved away from zero
+     * relative to \p val, the raw bfloat16 bits are decremented by one before
+     * the final e8m0 conversion.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e8m0(const long long int val) {
+        __nv_bfloat16 rn = __ll2bfloat16_rn(val);
+        __nv_bfloat16_raw rn_raw = static_cast<__nv_bfloat16_raw>(rn);
+        long long int back_int = __bfloat162ll_rz(rn);
+        if (((val > 0) && (back_int > val)) || ((val < 0) && (back_int < val))) // round trip ended further from zero than val; NOTE(review): same saturation caveat at the extreme values - confirm
+        {
+            rn_raw.x--; // undo the rounding away from zero: presumably the adjacent encoding of smaller magnitude
+        }
+        __x = __nv_cvt_bfloat16raw_to_e8m0(rn_raw, __NV_SATFINITE, cudaRoundZero);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p long \p int data type, relies on
+     * \p cudaRoundZero rounding.
+     * Widens \p val to \p long \p long \p int and delegates to that
+     * constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e8m0(const long int val) {
+        __x = static_cast<__nv_fp8_e8m0>(static_cast<long long int>(val)).__x;
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening FP converts */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p float data type.
+     * Goes through the raw bfloat16 representation of the scale.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float() const {
+        return __internal_bf16raw_to_float(__nv_cvt_e8m0_to_bf16raw((*this).__x));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p __nv_bfloat16 data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __nv_bfloat16() const {
+        return static_cast<__nv_bfloat16>(
+            __nv_cvt_e8m0_to_bf16raw((*this).__x));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p double data type.
+     * Converts to \p float first, then widens to \p double.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator double() const {
+        const float f = float(*this);
+        double d;
+#if (defined __CUDA_ARCH__)
+        // protect from ftz in device code: the widening is issued as an explicit PTX cvt.f64.f32
+        asm("{  cvt.f64.f32 %0, %1;}\n" : "=d"(d) : "f"(f));
+#else
+        d = static_cast<double>(f);
+#endif
+        return d;
+    }
+
+    /* rounding conversion to half */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p __half data type.
+     * Converts to \p float first, then to \p __half via \p __float2half_rn.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __half() const {
+        return __float2half_rn(float(*this));
+    }
+
+    /* Convert to integral */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p char data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned char() const {
+        unsigned char i;
+        const float f = float(*this);
+        const unsigned char max_val = 0xFFU;
+        const unsigned char bits = (*this).__x;
+        // saturation fixup
+        if (bits == 0xFFU) {
+            // NaN
+            i = 0;
+        } else if (f > static_cast<float>(max_val)) {
+            // saturate maximum
+            i = max_val;
+        } else {
+            // normal value
+            i = static_cast<unsigned char>(f);
+        }
+        return i;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p short \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned short int() const {
+        return __bfloat162ushort_rz(__nv_bfloat16(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned int() const {
+        return __bfloat162uint_rz(__nv_bfloat16(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p long \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero if output type is 32-bit.
+     * \p NaN inputs convert to \p 0x8000000000000000ULL if output type is 64-bit.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long int() const {
+        unsigned long retval;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(unsigned long) == sizeof(unsigned long long)) // picks the conversion matching the width of unsigned long
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            retval = static_cast<unsigned long>(__bfloat162ull_rz(__nv_bfloat16(*this)));
+        }
+        else
+        {
+            retval = static_cast<unsigned long>(__bfloat162uint_rz(__nv_bfloat16(*this)));
+        }
+        return retval;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p long \p long \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p 0x8000000000000000ULL.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long long int() const {
+        return __bfloat162ull_rz(__nv_bfloat16(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p signed \p char data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator signed char() const {
+        signed char i;
+        const float f = float(*this);
+        const signed char max_val = (signed char)0x7FU;
+        const unsigned char bits = (*this).__x;
+        // saturation fixup
+        if (bits == 0xFFU) {
+            // NaN
+            i = 0;
+        } else if (f > static_cast<float>(max_val)) {
+            // saturate maximum
+            i = max_val;
+        } else {
+            // normal value
+            i = static_cast<signed char>(f);
+        }
+        return i;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to an implementation defined \p char data type.
+     *
+     * Detects signedness of the \p char type and proceeds accordingly, see
+     * further details in signed and unsigned char operators.
+     *
+     * Clamps inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator char() const {
+        char value;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (((char)-1) < (char)0) // true only when plain char is a signed type
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            value = static_cast<char>(static_cast<signed char>(*this));
+        }
+        else
+        {
+            value = static_cast<char>(static_cast<unsigned char>(*this));
+        }
+        return value;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p short \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator short int() const {
+        return __bfloat162short_rz(__nv_bfloat16(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator int() const {
+        return __bfloat162int_rz(__nv_bfloat16(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p long \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero if output type is 32-bit.
+     * \p NaN inputs convert to \p 0x8000000000000000LL if output type is 64-bit.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator long int() const {
+        long retval;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(long) == sizeof(long long)) // picks the conversion matching the width of long
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            retval = static_cast<long>(__bfloat162ll_rz(__nv_bfloat16(*this)));
+        }
+        else
+        {
+            retval = static_cast<long>(__bfloat162int_rz(__nv_bfloat16(*this)));
+        }
+        return retval;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p long \p long \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p 0x8000000000000000LL.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator long long int() const {
+        return __bfloat162ll_rz(__nv_bfloat16(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p bool data type.
+     * All values in input range are non-zero, so result is always \p true.
+     * The stored bits are not inspected, so the \p NaN encoding also yields
+     * \p true.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator bool() const {
+        return true;
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP8X2_E8M0_STRUCT C++ struct for handling vector type of two scale factors of e8m0 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8X2_E8M0_STRUCT
+ * \brief __nv_fp8x2_e8m0 datatype
+ *
+ * \details This structure implements the datatype for storage
+ * and operations on the vector of two scale factors of \p e8m0 kind each.
+ */
+struct __CUDA_ALIGN__(2) __nv_fp8x2_e8m0 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E8M0_STRUCT
+     * Packed storage for the pair of scale factor values held by this
+     * vector.
+     */
+    __nv_fp8x2_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Default constructor. Neither variant assigns a value to \p __x.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8x2_e8m0() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e8m0() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Constructors from wider vector types */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the pair from a \p __half2 value, widening it to \p float2
+     * first. Out-of-range inputs follow \p __NV_SATFINITE behavior and the
+     * conversion is requested with \p cudaRoundZero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e8m0(const __half2 f) {
+        const float2 widened = __half22float2(f);
+        __x = __nv_cvt_float2_to_e8m0x2(widened, __NV_SATFINITE, cudaRoundZero);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the pair from a \p __nv_bfloat162 value. Out-of-range inputs
+     * follow \p __NV_SATFINITE behavior and the conversion is requested with
+     * \p cudaRoundZero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e8m0(const __nv_bfloat162 f) {
+        const __nv_bfloat162_raw raw_pair = static_cast<__nv_bfloat162_raw>(f);
+        __x = __nv_cvt_bfloat162raw_to_e8m0x2(raw_pair, __NV_SATFINITE,
+                                              cudaRoundZero);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the pair from a \p float2 value. Out-of-range inputs follow
+     * \p __NV_SATFINITE behavior and the conversion is requested with
+     * \p cudaRoundZero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e8m0(const float2 f) {
+        this->__x = __nv_cvt_float2_to_e8m0x2(f, __NV_SATFINITE, cudaRoundZero);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the pair from a \p double2 value. Out-of-range inputs follow
+     * \p __NV_SATFINITE behavior and the conversion is requested with
+     * \p cudaRoundZero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e8m0(const double2 f) {
+        this->__x = __nv_cvt_double2_to_e8m0x2(f, __NV_SATFINITE, cudaRoundZero);
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Conversions back to wider vector types */
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Converts the stored pair to a \p __nv_bfloat162 value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __nv_bfloat162() const {
+        return static_cast<__nv_bfloat162>(__nv_cvt_e8m0x2_to_bf162raw(__x));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Converts the stored pair to a \p float2 value, going through the
+     * \p __nv_bfloat162 conversion operator.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float2() const {
+        const __nv_bfloat162 as_bf16 = static_cast<__nv_bfloat162>(*this);
+        return __bfloat1622float2(as_bf16);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Converts the stored pair to a \p __half2 value, going through the
+     * \p float2 conversion operator and \p __float22half2_rn.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __half2() const {
+        const float2 as_float = static_cast<float2>(*this);
+        return __float22half2_rn(as_float);
+    }
+
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP8X4_E8M0_STRUCT C++ struct for handling vector type of four scale factors of e8m0 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8X4_E8M0_STRUCT
+ * \brief __nv_fp8x4_e8m0 datatype
+ *
+ * \details This structure implements the datatype for storage
+ * and operations on a vector of four scale factors, each of \p e8m0 kind.
+ */
+struct __CUDA_ALIGN__(4) __nv_fp8x4_e8m0 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E8M0_STRUCT
+     * Packed storage holding the four \p e8m0 scale factor values
+     * of this vector.
+     */
+    __nv_fp8x4_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Default constructor; does not explicitly initialize \p __x.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8x4_e8m0() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e8m0() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Constructors from wider source types */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from two \p __half2 values: \p flo supplies the first packed
+     * pair, \p fhi the second. Relies on \p __NV_SATFINITE for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e8m0(const __half2 flo,
+                                                     const __half2 fhi) {
+        const __nv_fp8x2_storage_t lo_bits = __nv_fp8x2_e8m0(flo).__x;
+        const __nv_fp8x2_storage_t hi_bits = __nv_fp8x2_e8m0(fhi).__x;
+        __x = __internal_pack_u16x2_to_u32(lo_bits, hi_bits);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from two \p __nv_bfloat162 values: \p flo supplies the first packed
+     * pair, \p fhi the second. Relies on \p __NV_SATFINITE for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e8m0(const __nv_bfloat162 flo,
+                                                     const __nv_bfloat162 fhi) {
+        const __nv_fp8x2_storage_t lo_bits = __nv_fp8x2_e8m0(flo).__x;
+        const __nv_fp8x2_storage_t hi_bits = __nv_fp8x2_e8m0(fhi).__x;
+        __x = __internal_pack_u16x2_to_u32(lo_bits, hi_bits);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from a \p float4 vector: (x, y) form the first packed pair,
+     * (z, w) the second. Relies on \p __NV_SATFINITE for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e8m0(const float4 f) {
+        const float2 xy = {f.x, f.y};
+        const __nv_fp8x2_storage_t lo_bits =
+            __nv_cvt_float2_to_e8m0x2(xy, __NV_SATFINITE, cudaRoundZero);
+        const float2 zw = {f.z, f.w};
+        const __nv_fp8x2_storage_t hi_bits =
+            __nv_cvt_float2_to_e8m0x2(zw, __NV_SATFINITE, cudaRoundZero);
+        __x = __internal_pack_u16x2_to_u32(lo_bits, hi_bits);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from a \p double4 vector: (x, y) form the first packed pair,
+     * (z, w) the second. Relies on \p __NV_SATFINITE for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e8m0(const double4 f) {
+        const double2 xy = {f.x, f.y};
+        const __nv_fp8x2_storage_t lo_bits =
+            __nv_cvt_double2_to_e8m0x2(xy, __NV_SATFINITE, cudaRoundZero);
+        const double2 zw = {f.z, f.w};
+        const __nv_fp8x2_storage_t hi_bits =
+            __nv_cvt_double2_to_e8m0x2(zw, __NV_SATFINITE, cudaRoundZero);
+        __x = __internal_pack_u16x2_to_u32(lo_bits, hi_bits);
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening conversions */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p float4 vector data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float4() const {
+        __nv_fp8x2_e8m0 pair_lo;
+        __nv_fp8x2_e8m0 pair_hi;
+        pair_lo.__x = static_cast<__nv_fp8x2_storage_t>(__x);
+        pair_hi.__x = static_cast<__nv_fp8x2_storage_t>(__x >> 16U);
+
+        const float2 xy = static_cast<float2>(pair_lo);
+        const float2 zw = static_cast<float2>(pair_hi);
+        float4 out = {xy.x, xy.y, zw.x, zw.y};
+        return out;
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+
+#endif /* defined(__cplusplus) */
+
+#endif /* end of include guard: __CUDA_FP8_HPP__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_gl_interop.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_gl_interop.h
new file mode 100644
index 0000000000000000000000000000000000000000..df64a8afa14f695bb05810266ac40b227c078cc5
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_gl_interop.h
@@ -0,0 +1,514 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_GL_INTEROP_H__)
+#define __CUDA_GL_INTEROP_H__
+
+#include "cuda_runtime_api.h"
+
+#if defined(__APPLE__)
+
+#include <OpenGL/gl.h>
+
+#else /* __APPLE__ */
+
+#if defined(__arm__) || defined(__aarch64__)
+#ifndef GL_VERSION
+#error Please include the appropriate gl headers before including cuda_gl_interop.h
+#endif
+#else
+#include <GL/gl.h>
+#endif
+
+#endif /* __APPLE__ */
+
+/** \cond impl_private */
+#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED /* empty: declarations carry no deprecation marker */
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else
+#define __CUDA_DEPRECATED /* empty: neither _MSC_VER nor __GNUC__ is defined */
+#endif
+/** \endcond impl_private */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * \addtogroup CUDART_OPENGL OpenGL Interoperability
+ * This section describes the OpenGL interoperability functions of the CUDA
+ * runtime application programming interface. Note that mapping of OpenGL
+ * resources is performed with the graphics API agnostic, resource mapping 
+ * interface described in \ref CUDART_INTEROP "Graphics Interoperability".
+ *
+ * @{
+ */
+
+/**
+ * Sets of CUDA devices for the current OpenGL context, as selected in ::cudaGLGetDevices
+ */
+enum cudaGLDeviceList
+{
+  cudaGLDeviceListAll           = 1, /**< The CUDA devices for all GPUs used by the current OpenGL context */
+  cudaGLDeviceListCurrentFrame  = 2, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */
+  cudaGLDeviceListNextFrame     = 3  /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame  */
+};
+
+/**
+ * \brief Gets the CUDA devices associated with the current OpenGL context
+ *
+ * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices 
+ * corresponding to the current OpenGL context. Also returns in \p *pCudaDevices 
+ * at most \p cudaDeviceCount of the CUDA-compatible devices corresponding to 
+ * the current OpenGL context. If any of the GPUs being used by the current OpenGL
+ * context are not CUDA capable then the call will return ::cudaErrorNoDevice.
+ *
+ * \param pCudaDeviceCount - Returned number of CUDA devices corresponding to the 
+ *                           current OpenGL context
+ * \param pCudaDevices     - Returned CUDA devices corresponding to the current 
+ *                           OpenGL context
+ * \param cudaDeviceCount  - The size of the output device array \p pCudaDevices
+ * \param deviceList       - The set of devices to return.  This set may be
+ *                           ::cudaGLDeviceListAll for all devices, 
+ *                           ::cudaGLDeviceListCurrentFrame for the devices used to
+ *                           render the current frame (in SLI), or
+ *                           ::cudaGLDeviceListNextFrame for the devices used to
+ *                           render the next frame (in SLI).
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNoDevice,
+ * ::cudaErrorInvalidGraphicsContext,
+ * ::cudaErrorOperatingSystem,
+ * ::cudaErrorUnknown
+ *
+ * \note This function is not supported on Mac OS X.
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsMapResources, 
+ * ::cudaGraphicsSubResourceGetMappedArray, 
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuGLGetDevices 
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGLGetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, enum cudaGLDeviceList deviceList);
+
+/**
+ * \brief Register an OpenGL texture or renderbuffer object
+ *
+ * Registers the texture or renderbuffer object specified by \p image for access by CUDA.
+ * A handle to the registered object is returned as \p resource.
+ *
+ * \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D, 
+ * ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY, 
+ * or ::GL_RENDERBUFFER.
+ *
+ * The register flags \p flags specify the intended usage, as follows: 
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ * - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will
+ *   bind this resource to a surface reference.
+ * - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform
+ *   texture gather operations on this resource.
+ *
+ * The following image formats are supported. For brevity's sake, the list is abbreviated.
+ * For ex., {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats 
+ * {GL_R8, GL_R16, GL_RG8, GL_RG16} :
+ * - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY
+ * - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I}
+ * - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X
+ * {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT}
+ *
+ * The following image classes are currently disallowed:
+ * - Textures with borders
+ * - Multisampled renderbuffers
+ *
+ * \param resource - Pointer to the returned object handle
+ * \param image    - name of texture or renderbuffer object to be registered
+ * \param target   - Identifies the type of object specified by \p image 
+ * \param flags    - Register flags
+ * 
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorOperatingSystem,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsMapResources, 
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsGLRegisterImage
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterImage(struct cudaGraphicsResource **resource, GLuint image, GLenum target, unsigned int flags);
+
+/**
+ * \brief Registers an OpenGL buffer object
+ *
+ * Registers the buffer object specified by \p buffer for access by
+ * CUDA.  A handle to the registered object is returned as \p
+ * resource.  The register flags \p flags specify the intended usage,
+ * as follows:
+ *
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * \param resource - Pointer to the returned object handle
+ * \param buffer   - name of buffer object to be registered
+ * \param flags    - Register flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorOperatingSystem,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsMapResources,
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuGraphicsGLRegisterBuffer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterBuffer(struct cudaGraphicsResource **resource, GLuint buffer, unsigned int flags);
+
+#ifdef _WIN32
+#ifndef WGL_NV_gpu_affinity
+typedef void* HGPUNV;
+#endif
+
+/**
+ * \brief Gets the CUDA device associated with hGpu
+ *
+ * Returns the CUDA device associated with a hGpu, if applicable.
+ *
+ * \param device - Returns the device associated with hGpu, or -1 if hGpu is
+ * not a compute device.
+ * \param hGpu   - Handle to a GPU, as queried via WGL_NV_gpu_affinity
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ *
+ * \sa
+ * ::WGL_NV_gpu_affinity,
+ * ::cuWGLGetDevice
+ */
+extern __host__ cudaError_t CUDARTAPI cudaWGLGetDevice(int *device, HGPUNV hGpu);
+#endif
+
+/** @} */ /* END CUDART_OPENGL */
+
+/**
+ * \addtogroup CUDART_OPENGL_DEPRECATED OpenGL Interoperability [DEPRECATED]
+ * This section describes deprecated OpenGL interoperability functionality.
+ *
+ * @{
+ */
+
+/**
+ * CUDA GL map flags, as accepted by the deprecated ::cudaGLSetBufferObjectMapFlags
+ */
+enum cudaGLMapFlags
+{
+  cudaGLMapFlagsNone         = 0,  /**< Default; Assume resource can be read/written */
+  cudaGLMapFlagsReadOnly     = 1,  /**< CUDA kernels will not write to this resource */
+  cudaGLMapFlagsWriteDiscard = 2   /**< CUDA kernels will only write to and will not read from this resource */
+};
+
+/**
+ * \brief Sets a CUDA device to use OpenGL interoperability
+ *
+ * \deprecated This function is deprecated as of CUDA 5.0. 
+ *
+ * This function is deprecated and should no longer be used.  It is
+ * no longer necessary to associate a CUDA device with an OpenGL
+ * context in order to achieve maximum interoperability performance.
+ *
+ * This function will immediately initialize the primary context on 
+ * \p device if needed.
+ *
+ * \param device - Device to use for OpenGL interoperability
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorSetOnActiveProcess
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsGLRegisterBuffer, ::cudaGraphicsGLRegisterImage
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device);
+
+/**
+ * \brief Registers a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Registers the buffer object of ID \p bufObj for access by
+ * CUDA. This function must be called before CUDA can map the buffer
+ * object.  The OpenGL context used to create the buffer, or another
+ * context from the same share group, must be bound to the current
+ * thread when this is called.
+ *
+ * \param bufObj - Buffer object ID to register
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInitializationError
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsGLRegisterBuffer
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint bufObj);
+
+/**
+ * \brief Maps a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Maps the buffer object of ID \p bufObj into the address space of
+ * CUDA and returns in \p *devPtr the base pointer of the resulting
+ * mapping.  The buffer must have previously been registered by
+ * calling ::cudaGLRegisterBufferObject().  While a buffer is mapped
+ * by CUDA, any OpenGL operation which references the buffer will
+ * result in undefined behavior.  The OpenGL context used to create
+ * the buffer, or another context from the same share group, must be
+ * bound to the current thread when this is called.
+ *
+ * All streams in the current thread are synchronized with the current
+ * GL context.
+ *
+ * \param devPtr - Returned device pointer to CUDA object
+ * \param bufObj - Buffer object ID to map
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMapBufferObjectFailed
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsMapResources
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj);
+
+/**
+ * \brief Unmaps a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Unmaps the buffer object of ID \p bufObj for access by CUDA.  When
+ * a buffer is unmapped, the base address returned by
+ * ::cudaGLMapBufferObject() is invalid and subsequent references to
+ * the address result in undefined behavior.  The OpenGL context used
+ * to create the buffer, or another context from the same share group,
+ * must be bound to the current thread when this is called.
+ *
+ * All streams in the current thread are synchronized with the current
+ * GL context.
+ *
+ * \param bufObj - Buffer object to unmap
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnmapBufferObjectFailed
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsUnmapResources
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj);
+
+/**
+ * \brief Unregisters a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Unregisters the buffer object of ID \p bufObj for access by CUDA
+ * and releases any CUDA resources associated with the buffer.  Once a
+ * buffer is unregistered, it may no longer be mapped by CUDA.  The GL
+ * context used to create the buffer, or another context from the
+ * same share group, must be bound to the current thread when this is
+ * called.
+ *
+ * \param bufObj - Buffer object to unregister
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsUnregisterResource
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint bufObj);
+
+/**
+ * \brief Set usage flags for mapping an OpenGL buffer
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Set flags for mapping the OpenGL buffer \p bufObj
+ *
+ * Changes to flags will take effect the next time \p bufObj is mapped.
+ * The \p flags argument may be any of the following:
+ *
+ * - ::cudaGLMapFlagsNone: Specifies no hints about how this buffer will
+ * be used. It is therefore assumed that this buffer will be read from and
+ * written to by CUDA kernels. This is the default value.
+ * - ::cudaGLMapFlagsReadOnly: Specifies that CUDA kernels which access this
+ * buffer will not write to the buffer.
+ * - ::cudaGLMapFlagsWriteDiscard: Specifies that CUDA kernels which access
+ * this buffer will not read from the buffer and will write over the
+ * entire contents of the buffer, so none of the data previously stored in
+ * the buffer will be preserved.
+ *
+ * If \p bufObj has not been registered for use with CUDA, then
+ * ::cudaErrorInvalidResourceHandle is returned. If \p bufObj is presently
+ * mapped for access by CUDA, then ::cudaErrorUnknown is returned.
+ *
+ * \param bufObj    - Registered buffer object to set flags for
+ * \param flags     - Parameters for buffer mapping
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsResourceSetMapFlags
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetBufferObjectMapFlags(GLuint bufObj, unsigned int flags); 
+
+/**
+ * \brief Maps a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Maps the buffer object of ID \p bufObj into the address space of
+ * CUDA and returns in \p *devPtr the base pointer of the resulting
+ * mapping.  The buffer must have previously been registered by
+ * calling ::cudaGLRegisterBufferObject().  While a buffer is mapped
+ * by CUDA, any OpenGL operation which references the buffer will
+ * result in undefined behavior.  The OpenGL context used to create
+ * the buffer, or another context from the same share group, must be
+ * bound to the current thread when this is called.
+ *
+ * Stream \p stream is synchronized with the current GL context.
+ *
+ * \param devPtr - Returned device pointer to CUDA object
+ * \param bufObj - Buffer object ID to map
+ * \param stream - Stream to synchronize
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMapBufferObjectFailed
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsMapResources
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObjectAsync(void **devPtr, GLuint bufObj, cudaStream_t stream);
+
+/**
+ * \brief Unmaps a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Unmaps the buffer object of ID \p bufObj for access by CUDA.  When
+ * a buffer is unmapped, the base address returned by
+ * ::cudaGLMapBufferObject() is invalid and subsequent references to
+ * the address result in undefined behavior.  The OpenGL context used
+ * to create the buffer, or another context from the same share group,
+ * must be bound to the current thread when this is called.
+ *
+ * Stream \p stream is synchronized with the current GL context.
+ *
+ * \param bufObj - Buffer object to unmap
+ * \param stream - Stream to synchronize
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnmapBufferObjectFailed
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsUnmapResources
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObjectAsync(GLuint bufObj, cudaStream_t stream);
+
+/** @} */ /* END CUDART_OPENGL_DEPRECATED */
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#undef __CUDA_DEPRECATED
+
+#endif /* __CUDA_GL_INTEROP_H__ */
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_occupancy.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_occupancy.h
new file mode 100644
index 0000000000000000000000000000000000000000..bb543adff62f0212ef5423f1e710dcb0d70108a6
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_occupancy.h
@@ -0,0 +1,2094 @@
+/*
+ * Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/**
+ * CUDA Occupancy Calculator
+ *
+ * NAME
+ *
+ *   cudaOccMaxActiveBlocksPerMultiprocessor,
+ *   cudaOccMaxPotentialOccupancyBlockSize,
+ *   cudaOccMaxPotentialOccupancyBlockSizeVariableSMem,
+ *   cudaOccAvailableDynamicSMemPerBlock
+ *
+ * DESCRIPTION
+ *
+ *   The CUDA occupancy calculator provides a standalone, programmatic
+ *   interface to compute the occupancy of a function on a device. It can also
+ *   provide occupancy-oriented launch configuration suggestions.
+ *
+ *   The function and device are defined by the user through
+ *   cudaOccFuncAttributes, cudaOccDeviceProp, and cudaOccDeviceState
+ *   structures. All APIs require all 3 of them.
+ *
+ *   See the structure definition for more details about the device / function
+ *   descriptors.
+ *
+ *   See each API's prototype for API usage.
+ *
+ * COMPATIBILITY
+ *
+ *   The occupancy calculator will be updated on each major CUDA toolkit
+ *   release. It does not provide forward compatibility, i.e. new hardware
+ *   released after this implementation's release will not be supported.
+ *
+ * NOTE
+ *
+ *   If there is access to CUDA runtime, and the sole intent is to calculate
+ *   occupancy related values on one of the accessible CUDA devices, using CUDA
+ *   runtime's occupancy calculation APIs is recommended.
+ *
+ */
+
+#ifndef __cuda_occupancy_h__
+#define __cuda_occupancy_h__
+
+#include <stddef.h>
+#include <limits.h>
+#include <string.h>
+
+
+// __OCC_INLINE will be undefined at the end of this header
+// Inline qualifier for the cudaOcc* functions; adds __host__ __device__ under __CUDACC__.
+#ifdef __CUDACC__
+#define __OCC_INLINE inline __host__ __device__
+#elif defined _MSC_VER
+#define __OCC_INLINE __inline
+#else // GNUCC assumed
+#define __OCC_INLINE inline
+#endif
+
+enum cudaOccError_enum {
+    CUDA_OCC_SUCCESS              = 0,  // no error encountered
+    CUDA_OCC_ERROR_INVALID_INPUT  = 1,  // input parameter is invalid
+    CUDA_OCC_ERROR_UNKNOWN_DEVICE = 2,  // requested device is not supported in
+                                        // the current implementation, or the
+                                        // device is invalid
+};
+typedef enum cudaOccError_enum       cudaOccError;
+
+typedef struct cudaOccResult         cudaOccResult;
+typedef struct cudaOccDeviceProp     cudaOccDeviceProp;
+typedef struct cudaOccFuncAttributes cudaOccFuncAttributes;
+typedef struct cudaOccDeviceState    cudaOccDeviceState;
+
+/**
+ * The CUDA occupancy calculator computes the occupancy of the function
+ * described by attributes with the given block size (blockSize), static device
+ * properties (properties), dynamic device state (state) and per-block dynamic
+ * shared memory allocation (dynamicSMemSize) in bytes, and outputs it through
+ * result along with other useful information. The occupancy is computed in
+ * terms of the maximum number of active blocks per multiprocessor. The user can
+ * then convert it to other metrics, such as number of active warps.
+ *
+ * RETURN VALUE
+ *
+ * The occupancy and related information is returned through result.
+ *
+ * If result->activeBlocksPerMultiprocessor is 0, then the given parameter
+ * combination cannot run on the device.
+ *
+ * ERRORS
+ *
+ *     CUDA_OCC_ERROR_INVALID_INPUT   input parameter is invalid.
+ *     CUDA_OCC_ERROR_UNKNOWN_DEVICE  requested device is not supported in
+ *     current implementation or device is invalid
+ */
+static __OCC_INLINE
+cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
+    cudaOccResult               *result,           // out
+    const cudaOccDeviceProp     *properties,       // in
+    const cudaOccFuncAttributes *attributes,       // in
+    const cudaOccDeviceState    *state,            // in
+    int                          blockSize,        // in
+    size_t                       dynamicSmemSize); // in
+
+/**
+ * The CUDA launch configurator C API suggests a grid / block size pair (in
+ * minGridSize and blockSize) that achieves the best potential occupancy
+ * (i.e. maximum number of active warps with the smallest number of blocks) for
+ * the given function described by attributes, on a device described by
+ * properties with settings in state.
+ *
+ * If per-block dynamic shared memory allocation is not needed, the user should
+ * leave both blockSizeToDynamicSMemSize and dynamicSMemSize as 0.
+ *
+ * If per-block dynamic shared memory allocation is needed, then if the dynamic
+ * shared memory size is constant regardless of block size, the size should be
+ * passed through dynamicSMemSize, and blockSizeToDynamicSMemSize should be
+ * NULL.
+ *
+ * Otherwise, if the per-block dynamic shared memory size varies with different
+ * block sizes, the user needs to provide a pointer to a unary function through
+ * blockSizeToDynamicSMemSize that computes the dynamic shared memory needed by
+ * a block of the function for any given block size. dynamicSMemSize is
+ * ignored. An example signature is:
+ *
+ *    // Take block size, returns dynamic shared memory needed
+ *    size_t blockToSmem(int blockSize);
+ *
+ * RETURN VALUE
+ *
+ * The suggested block size and the minimum number of blocks needed to achieve
+ * the maximum occupancy are returned through blockSize and minGridSize.
+ *
+ * If *blockSize is 0, then the given combination cannot run on the device.
+ *
+ * ERRORS
+ *
+ *     CUDA_OCC_ERROR_INVALID_INPUT   input parameter is invalid.
+ *     CUDA_OCC_ERROR_UNKNOWN_DEVICE  requested device is not supported in
+ *     current implementation or device is invalid
+ *
+ */
+static __OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
+    int                         *minGridSize,      // out
+    int                         *blockSize,        // out
+    const cudaOccDeviceProp     *properties,       // in
+    const cudaOccFuncAttributes *attributes,       // in
+    const cudaOccDeviceState    *state,            // in
+    size_t                     (*blockSizeToDynamicSMemSize)(int), // in
+    size_t                       dynamicSMemSize); // in
+
+/**
+ * The CUDA launch configurator C++ API suggests a grid / block size pair (in
+ * minGridSize and blockSize) that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps with the smallest number of blocks)
+ * for the given function described by attributes, on a device described by
+ * properties with settings in state.
+ *
+ * If per-block dynamic shared memory allocation is 0 or constant regardless of
+ * block size, the user can use cudaOccMaxPotentialOccupancyBlockSize to
+ * configure the launch. A constant dynamic shared memory allocation size in
+ * bytes can be passed through dynamicSMemSize.
+ *
+ * Otherwise, if the per-block dynamic shared memory size varies with different
+ * block sizes, the user needs to use
+ * cudaOccMaxPotentialOccupancyBlockSizeVariableSMem instead, and provide a
+ * functor / pointer to a unary function (blockSizeToDynamicSMemSize) that
+ * computes the dynamic shared memory needed by func for any given block
+ * size. An example signature is:
+ *
+ *  // Take block size, returns per-block dynamic shared memory needed
+ *  size_t blockToSmem(int blockSize);
+ *
+ * RETURN VALUE
+ *
+ * The suggested block size and the minimum number of blocks needed to achieve
+ * the maximum occupancy are returned through blockSize and minGridSize.
+ *
+ * If *blockSize is 0, then the given combination cannot run on the device.
+ *
+ * ERRORS
+ *
+ *     CUDA_OCC_ERROR_INVALID_INPUT   input parameter is invalid.
+ *     CUDA_OCC_ERROR_UNKNOWN_DEVICE  requested device is not supported in
+ *     current implementation or device is invalid
+ *
+ */
+
+#if defined(__cplusplus)
+namespace {
+
+// Overload for zero or constant per-block dynamic shared memory;
+// dynamicSMemSize defaults to 0 when omitted.
+__OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
+    int                         *minGridSize,          // out
+    int                         *blockSize,            // out
+    const cudaOccDeviceProp     *properties,           // in
+    const cudaOccFuncAttributes *attributes,           // in
+    const cudaOccDeviceState    *state,                // in
+    size_t                       dynamicSMemSize = 0); // in
+
+// Variant for dynamic shared memory that depends on the block size.
+// UnaryFunction is a template parameter, so either a function pointer or a
+// functor can be supplied as blockSizeToDynamicSMemSize.
+template <typename UnaryFunction>
+__OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
+    int                         *minGridSize,          // out
+    int                         *blockSize,            // out
+    const cudaOccDeviceProp     *properties,           // in
+    const cudaOccFuncAttributes *attributes,           // in
+    const cudaOccDeviceState    *state,                // in
+    UnaryFunction                blockSizeToDynamicSMemSize); // in
+
+} // namespace anonymous
+#endif // defined(__cplusplus)
+
+/**
+ *
+ * The CUDA dynamic shared memory calculator computes the maximum size of
+ * per-block dynamic shared memory if we want to place numBlocks blocks
+ * on an SM.
+ *
+ * RETURN VALUE
+ *
+ * Returns in *dynamicSmemSize the maximum size of dynamic shared memory to allow
+ * numBlocks blocks per SM.
+ *
+ * ERRORS
+ *
+ *     CUDA_OCC_ERROR_INVALID_INPUT   input parameter is invalid.
+ *     CUDA_OCC_ERROR_UNKNOWN_DEVICE  requested device is not supported in
+ *     current implementation or device is invalid
+ *
+ */
+static __OCC_INLINE
+cudaOccError cudaOccAvailableDynamicSMemPerBlock(
+    size_t                      *dynamicSmemSize,  // out
+    const cudaOccDeviceProp     *properties,       // in
+    const cudaOccFuncAttributes *attributes,       // in
+    const cudaOccDeviceState    *state,            // in
+    int                         numBlocks,         // in
+    int                         blockSize);        // in
+
+/**
+ * Data structures
+ *
+ * These structures are subject to change for future architecture and CUDA
+ * releases. C users should initialize the structure as {0}.
+ *
+ */
+
+/**
+ * Device descriptor
+ *
+ * This structure describes a device. Under C++ it can be default-constructed
+ * (every field zero) or converted from any structure exposing the member
+ * names read by the templated constructor below.
+ */
+struct cudaOccDeviceProp {
+    int    computeMajor;                // Compute capability major version
+    int    computeMinor;                // Compute capability minor
+                                        // version. Unsupported minor versions
+                                        // may cause an error
+    int    maxThreadsPerBlock;          // Maximum number of threads per block
+    int    maxThreadsPerMultiprocessor; // Maximum number of threads per SM
+                                        // i.e. (Max. number of warps) x (warp
+                                        // size)
+    int    regsPerBlock;                // Maximum number of registers per block
+    int    regsPerMultiprocessor;       // Maximum number of registers per SM
+    int    warpSize;                    // Warp size
+    size_t sharedMemPerBlock;           // Maximum shared memory size per block
+    size_t sharedMemPerMultiprocessor;  // Maximum shared memory size per SM
+    int    numSms;                      // Number of SMs available
+    size_t sharedMemPerBlockOptin;      // Maximum optin shared memory size per block
+    size_t reservedSharedMemPerBlock;   // Shared memory per block reserved by driver
+
+#ifdef __cplusplus
+    // This structure can be converted from a cudaDeviceProp structure for users
+    // that use this header in their CUDA applications.
+    //
+    // If the application has access to the CUDA Runtime API, the application
+    // can obtain the device properties of a CUDA device through
+    // cudaGetDeviceProperties, and initialize a cudaOccDeviceProp with the
+    // cudaDeviceProp structure.
+    //
+    // Example:
+    /*
+     {
+         cudaDeviceProp prop;
+
+         cudaGetDeviceProperties(&prop, ...);
+
+         cudaOccDeviceProp occProp = prop;
+
+         ...
+
+         cudaOccMaxPotentialOccupancyBlockSize(..., &occProp, ...);
+     }
+     */
+    //
+    // The constructor is a template, so DeviceProp only has to provide the
+    // members read here. Note the renames: props.major / props.minor feed
+    // computeMajor / computeMinor, props.maxThreadsPerMultiProcessor feeds
+    // maxThreadsPerMultiprocessor, and props.multiProcessorCount feeds numSms.
+    template<typename DeviceProp>
+    __OCC_INLINE
+    cudaOccDeviceProp(const DeviceProp &props)
+    :   computeMajor                (props.major),
+        computeMinor                (props.minor),
+        maxThreadsPerBlock          (props.maxThreadsPerBlock),
+        maxThreadsPerMultiprocessor (props.maxThreadsPerMultiProcessor),
+        regsPerBlock                (props.regsPerBlock),
+        regsPerMultiprocessor       (props.regsPerMultiprocessor),
+        warpSize                    (props.warpSize),
+        sharedMemPerBlock           (props.sharedMemPerBlock),
+        sharedMemPerMultiprocessor  (props.sharedMemPerMultiprocessor),
+        numSms                      (props.multiProcessorCount),
+        sharedMemPerBlockOptin      (props.sharedMemPerBlockOptin),
+        reservedSharedMemPerBlock   (props.reservedSharedMemPerBlock)
+    {}
+
+    // Default constructor: zero-initializes every field.
+    __OCC_INLINE
+    cudaOccDeviceProp()
+    :   computeMajor                (0),
+        computeMinor                (0),
+        maxThreadsPerBlock          (0),
+        maxThreadsPerMultiprocessor (0),
+        regsPerBlock                (0),
+        regsPerMultiprocessor       (0),
+        warpSize                    (0),
+        sharedMemPerBlock           (0),
+        sharedMemPerMultiprocessor  (0),
+        numSms                      (0),
+        sharedMemPerBlockOptin      (0),
+        reservedSharedMemPerBlock   (0)
+    {}
+#endif // __cplusplus
+};
+
+/**
+ * Partitioned global caching option
+ *
+ * Used as the requested preference in
+ * cudaOccFuncAttributes::partitionedGCConfig and as the reported outcome in
+ * cudaOccResult::partitionedGCConfig.
+ */
+typedef enum cudaOccPartitionedGCConfig_enum {
+    PARTITIONED_GC_OFF,        // Disable partitioned global caching
+    PARTITIONED_GC_ON,         // Prefer partitioned global caching
+    PARTITIONED_GC_ON_STRICT   // Force partitioned global caching
+} cudaOccPartitionedGCConfig;
+
+/**
+ * Per function opt in maximum dynamic shared memory limit
+ *
+ * Stored in cudaOccFuncAttributes::shmemLimitConfig.
+ */
+typedef enum cudaOccFuncShmemConfig_enum {
+    FUNC_SHMEM_LIMIT_DEFAULT,   // Default shmem limit
+    FUNC_SHMEM_LIMIT_OPTIN,     // Use the optin shmem limit
+} cudaOccFuncShmemConfig;
+
+/**
+ * Function descriptor
+ *
+ * This structure describes a CUDA function. Under C++ it can be
+ * default-constructed or converted from any structure exposing the member
+ * names read by the templated constructor below.
+ */
+struct cudaOccFuncAttributes {
+    int maxThreadsPerBlock; // Maximum block size the function can work with. If
+                            // unlimited, use INT_MAX or any value greater than
+                            // or equal to maxThreadsPerBlock of the device
+    int numRegs;            // Number of registers used. When the function is
+                            // launched on device, the register count may change
+                            // due to internal tools requirements.
+    size_t sharedSizeBytes; // Number of static shared memory used
+
+    cudaOccPartitionedGCConfig partitionedGCConfig; 
+                            // Partitioned global caching is required to enable
+                            // caching on certain chips, such as sm_52
+                            // devices. Partitioned global caching can be
+                            // automatically disabled if the occupancy
+                            // requirement of the launch cannot support caching.
+                            //
+                            // To override this behavior with caching on and
+                            // calculate occupancy strictly according to the
+                            // preference, set partitionedGCConfig to
+                            // PARTITIONED_GC_ON_STRICT. This is especially
+                            // useful for experimenting and finding launch
+                            // configurations (MaxPotentialOccupancyBlockSize)
+                            // that allow global caching to take effect.
+                            //
+                            // This flag only affects the occupancy calculation.
+
+    cudaOccFuncShmemConfig shmemLimitConfig;
+                            // Certain chips like sm_70 allow a user to opt into
+                            // a higher per block limit of dynamic shared memory
+                            // This optin is performed on a per function basis
+                            // using the cuFuncSetAttribute function
+
+    size_t maxDynamicSharedSizeBytes;
+                            // User set limit on maximum dynamic shared memory
+                            // usable by the kernel
+                            // This limit is set using the cuFuncSetAttribute
+                            // function.
+
+    int numBlockBarriers;   // Number of block barriers used (default to 1)
+                            // NOTE(review): the converting constructor below
+                            // sets 1, but the default constructor sets 0 --
+                            // confirm which default is intended.
+#ifdef __cplusplus
+    // This structure can be converted from a cudaFuncAttributes structure for
+    // users that use this header in their CUDA applications.
+    //
+    // If the application has access to the CUDA Runtime API, the application
+    // can obtain the function attributes of a CUDA kernel function through
+    // cudaFuncGetAttributes, and initialize a cudaOccFuncAttributes with the
+    // cudaFuncAttributes structure.
+    //
+    // Example:
+    /*
+      __global__ void foo() {...}
+
+      ...
+
+      {
+          cudaFuncAttributes attr;
+
+          cudaFuncGetAttributes(&attr, foo);
+
+          cudaOccFuncAttributes occAttr = attr;
+
+          ...
+
+          cudaOccMaxPotentialOccupancyBlockSize(..., &occAttr, ...);
+      }
+     */
+    //
+    // Only maxThreadsPerBlock, numRegs, sharedSizeBytes and
+    // maxDynamicSharedSizeBytes are copied from attr. The remaining fields get
+    // fixed values: partitioned global caching off, the opt-in shared memory
+    // limit, and one block barrier.
+    template<typename FuncAttributes>
+    __OCC_INLINE
+    cudaOccFuncAttributes(const FuncAttributes &attr)
+    :   maxThreadsPerBlock  (attr.maxThreadsPerBlock),
+        numRegs             (attr.numRegs),
+        sharedSizeBytes     (attr.sharedSizeBytes),
+        partitionedGCConfig (PARTITIONED_GC_OFF),
+        shmemLimitConfig    (FUNC_SHMEM_LIMIT_OPTIN),
+        maxDynamicSharedSizeBytes (attr.maxDynamicSharedSizeBytes),
+        numBlockBarriers    (1)
+    {}
+
+    // Default constructor: numeric fields are zero, partitioned global caching
+    // is off and the default shared memory limit is selected. Unlike the
+    // converting constructor, numBlockBarriers starts at 0 here.
+    __OCC_INLINE
+    cudaOccFuncAttributes()
+    :   maxThreadsPerBlock  (0),
+        numRegs             (0),
+        sharedSizeBytes     (0),
+        partitionedGCConfig (PARTITIONED_GC_OFF),
+        shmemLimitConfig    (FUNC_SHMEM_LIMIT_DEFAULT),
+        maxDynamicSharedSizeBytes (0),
+        numBlockBarriers    (0)
+    {}
+#endif
+};
+
+/**
+ * L1 cache / shared memory split preference
+ *
+ * Stored in cudaOccDeviceState::cacheConfig.
+ */
+typedef enum cudaOccCacheConfig_enum {
+    CACHE_PREFER_NONE   = 0x00, // no preference for shared memory or L1 (default)
+    CACHE_PREFER_SHARED = 0x01, // prefer larger shared memory and smaller L1 cache
+    CACHE_PREFER_L1     = 0x02, // prefer larger L1 cache and smaller shared memory
+    CACHE_PREFER_EQUAL  = 0x03  // prefer equal sized L1 cache and shared memory
+} cudaOccCacheConfig;
+
+/**
+ * Named values for cudaOccDeviceState::carveoutConfig
+ *
+ * carveoutConfig is a plain int: cudaOccSMemPreferenceVoltaPlus accepts any
+ * value from SHAREDMEM_CARVEOUT_DEFAULT (-1) to SHAREDMEM_CARVEOUT_MAX_SHARED
+ * (100) and treats non-default values as a percentage of
+ * sharedMemPerMultiprocessor. The enumerators below name the common choices.
+ */
+typedef enum cudaOccCarveoutConfig_enum {
+    SHAREDMEM_CARVEOUT_DEFAULT       = -1,  // no preference for shared memory or L1 (default)
+    SHAREDMEM_CARVEOUT_MAX_SHARED    = 100, // prefer maximum available shared memory, minimum L1 cache
+    SHAREDMEM_CARVEOUT_MAX_L1        = 0,    // prefer maximum available L1 cache, minimum shared memory
+    SHAREDMEM_CARVEOUT_HALF          = 50   // prefer half of maximum available shared memory, with the rest as L1 cache
+} cudaOccCarveoutConfig;
+
+/**
+ * Device state descriptor
+ *
+ * This structure describes device settings that affect occupancy calculation.
+ */
+struct cudaOccDeviceState
+{
+    // Cache / shared memory split preference. Deprecated on Volta: for
+    // computeMajor >= 7 it is only consulted as a fallback when carveoutConfig
+    // is SHAREDMEM_CARVEOUT_DEFAULT (see cudaOccSMemPreferenceVoltaPlus).
+    cudaOccCacheConfig cacheConfig; 
+    // Shared memory / L1 split preference. Only consulted for
+    // computeMajor >= 7 (see cudaOccSMemPerMultiprocessor). Valid values are
+    // SHAREDMEM_CARVEOUT_DEFAULT (-1) or a percentage from 0 to 100.
+    int carveoutConfig;
+
+#ifdef __cplusplus
+    // Default constructor: no cache preference and the default carveout.
+    __OCC_INLINE
+    cudaOccDeviceState()
+    :   cacheConfig     (CACHE_PREFER_NONE),
+        carveoutConfig  (SHAREDMEM_CARVEOUT_DEFAULT)
+    {}
+#endif
+};
+
+/**
+ * Reasons occupancy was limited
+ *
+ * Each enumerator is a distinct bit, so several can be combined in
+ * cudaOccResult::limitingFactors.
+ */
+typedef enum cudaOccLimitingFactor_enum {
+                                    // Occupancy limited due to:
+    OCC_LIMIT_WARPS         = 0x01, // - warps available
+    OCC_LIMIT_REGISTERS     = 0x02, // - registers available
+    OCC_LIMIT_SHARED_MEMORY = 0x04, // - shared memory available
+    OCC_LIMIT_BLOCKS        = 0x08, // - blocks available
+    OCC_LIMIT_BARRIERS      = 0x10  // - barrier available
+} cudaOccLimitingFactor;
+
+/**
+ * Occupancy output
+ *
+ * This structure contains occupancy calculator's output.
+ */
+struct cudaOccResult {
+    int activeBlocksPerMultiprocessor; // Occupancy
+    unsigned int limitingFactors;      // Factors that limited occupancy. A bit
+                                       // field that records the limiting
+                                       // factors, see cudaOccLimitingFactor
+    int blockLimitRegs;                // Occupancy due to register
+                                       // usage, INT_MAX if the kernel does not
+                                       // use any register.
+    int blockLimitSharedMem;           // Occupancy due to shared memory
+                                       // usage, INT_MAX if the kernel does not
+                                       // use shared memory.
+    int blockLimitWarps;               // Occupancy due to block size limit
+    int blockLimitBlocks;              // Occupancy due to maximum number of blocks
+                                       // manageable per SM
+    int blockLimitBarriers;            // Occupancy due to block barrier usage
+    int allocatedRegistersPerBlock;    // Actual number of registers allocated per
+                                       // block
+    size_t allocatedSharedMemPerBlock; // Actual size of shared memory allocated
+                                       // per block
+    cudaOccPartitionedGCConfig partitionedGCConfig;
+                                       // Report if partitioned global caching
+                                       // is actually enabled.
+};
+
+/**
+ * Partitioned global caching support
+ *
+ * Two-state answer to whether partitioned global caching is supported.
+ *
+ * See cudaOccPartitionedGlobalCachingModeSupport
+ */
+typedef enum cudaOccPartitionedGCSupport_enum {
+    PARTITIONED_GC_NOT_SUPPORTED,  // Partitioned global caching is not supported
+    PARTITIONED_GC_SUPPORTED,      // Partitioned global caching is supported
+} cudaOccPartitionedGCSupport;
+
+/**
+ * Implementation
+ */
+
+/**
+ * Max compute capability supported
+ */
+
+#define __CUDA_OCC_MAJOR__ 12
+#define __CUDA_OCC_MINOR__ 0
+
+//////////////////////////////////////////
+//    Mathematical Helper Functions     //
+//////////////////////////////////////////
+
+// Returns the smaller of the two operands; lhs is returned when they are equal.
+static __OCC_INLINE int __occMin(int lhs, int rhs)
+{
+    if (rhs < lhs) {
+        return rhs;
+    }
+    return lhs;
+}
+
+// Integer division of x by y after biasing x by (y - 1), which rounds the
+// quotient up for non-negative x and positive y. y must not be zero.
+static __OCC_INLINE int __occDivideRoundUp(int x, int y)
+{
+    const int biased = x + (y - 1);
+
+    return biased / y;
+}
+
+// Rounds x up to a multiple of y by scaling the rounded-up quotient
+// (see __occDivideRoundUp) back by y.
+static __OCC_INLINE int __occRoundUp(int x, int y)
+{
+    const int multiples = __occDivideRoundUp(x, y);
+
+    return y * multiples;
+}
+
+//////////////////////////////////////////
+//      Architectural Properties        //
+//////////////////////////////////////////
+
+/**
+ * Granularity of shared memory allocation
+ *
+ * Writes 256 to *limit for compute capability major versions 3, 5, 6 and 7,
+ * and 128 for major versions 8, 9, 10 and 12. Any other major version returns
+ * CUDA_OCC_ERROR_UNKNOWN_DEVICE and leaves *limit untouched.
+ */
+static __OCC_INLINE cudaOccError cudaOccSMemAllocationGranularity(int *limit, const cudaOccDeviceProp *properties)
+{
+    const int major = properties->computeMajor;
+
+    if (major == 3 || major == 5 || major == 6 || major == 7) {
+        *limit = 256;
+        return CUDA_OCC_SUCCESS;
+    }
+
+    if (major == 8 || major == 9 || major == 10 || major == 12) {
+        *limit = 128;
+        return CUDA_OCC_SUCCESS;
+    }
+
+    return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+}
+
+/**
+ * Maximum number of registers per thread
+ *
+ * Writes 255 to *limit for compute capability major versions 3, 5 and 6, and
+ * 256 for major versions 7, 8, 9, 10 and 12. Any other major version returns
+ * CUDA_OCC_ERROR_UNKNOWN_DEVICE and leaves *limit untouched.
+ */
+static __OCC_INLINE cudaOccError cudaOccRegAllocationMaxPerThread(int *limit, const cudaOccDeviceProp *properties)
+{
+    const int major = properties->computeMajor;
+
+    if (major == 3 || major == 5 || major == 6) {
+        *limit = 255;
+        return CUDA_OCC_SUCCESS;
+    }
+
+    if ((major >= 7 && major <= 10) || major == 12) {
+        *limit = 256;
+        return CUDA_OCC_SUCCESS;
+    }
+
+    return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+}
+
+/**
+ * Granularity of register allocation
+ *
+ * Writes 256 to *limit for every compute capability major version handled
+ * here (3, 5, 6, 7, 8, 9, 10 and 12). Any other major version returns
+ * CUDA_OCC_ERROR_UNKNOWN_DEVICE and leaves *limit untouched.
+ */
+static __OCC_INLINE cudaOccError cudaOccRegAllocationGranularity(int *limit, const cudaOccDeviceProp *properties)
+{
+    const int major = properties->computeMajor;
+    const int known = (major == 3) || (major >= 5 && major <= 10) || (major == 12);
+
+    if (!known) {
+        return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    *limit = 256;
+    return CUDA_OCC_SUCCESS;
+}
+
+/**
+ * Number of sub-partitions
+ *
+ * Writes 4 to *limit for compute capability major versions 3, 5, 7, 8, 9, 10
+ * and 12. Major version 6 yields 2 when computeMinor is 0 and 4 otherwise.
+ * Any other major version returns CUDA_OCC_ERROR_UNKNOWN_DEVICE and leaves
+ * *limit untouched.
+ */
+static __OCC_INLINE cudaOccError cudaOccSubPartitionsPerMultiprocessor(int *limit, const cudaOccDeviceProp *properties)
+{
+    const int major = properties->computeMajor;
+    int partitions;
+
+    if (major == 6) {
+        // Minor version 0 is the only major-6 variant with two sub-partitions.
+        partitions = (properties->computeMinor != 0) ? 4 : 2;
+    }
+    else if (major == 3 || major == 5 || (major >= 7 && major <= 10) || major == 12) {
+        partitions = 4;
+    }
+    else {
+        return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    *limit = partitions;
+    return CUDA_OCC_SUCCESS;
+}
+
+
+/**
+ * Maximum number of blocks that can run simultaneously on a multiprocessor
+ *
+ * Values written to *limit, keyed by compute capability:
+ *   3.x          16
+ *   5.x, 6.x     32
+ *   7.5          16, other 7.x  32
+ *   8.0          32, 8.9  24, other 8.x  16
+ *   9.x          32
+ *   10.1         24, other 10.x 32
+ *   12.x         24
+ * Any other major version returns CUDA_OCC_ERROR_UNKNOWN_DEVICE and leaves
+ * *limit untouched.
+ */
+static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerMultiprocessor(int* limit, const cudaOccDeviceProp *properties)
+{
+    const int minor = properties->computeMinor;
+    int blocks;
+
+    switch (properties->computeMajor) {
+        case 3:
+            blocks = 16;
+            break;
+        case 5:
+        case 6:
+        case 9:
+            blocks = 32;
+            break;
+        case 7:
+            // Minor version 5 (Turing) has the lower limit.
+            blocks = (minor == 5) ? 16 : 32;
+            break;
+        case 8:
+            if (minor == 0) {
+                blocks = 32;
+            }
+            else {
+                blocks = (minor == 9) ? 24 : 16;
+            }
+            break;
+        case 10:
+            blocks = (minor == 1) ? 24 : 32;
+            break;
+        case 12:
+            blocks = 24;
+            break;
+        default:
+            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    *limit = blocks;
+    return CUDA_OCC_SUCCESS;
+}
+
+/**
+ * Align up shared memory based on compute major configurations
+ *
+ * Rounds *shMemSize up to the next shared memory size available on the
+ * architecture described by properties. Sizes per compute capability, in KB:
+ *   7.5 (Turing)       32, 64
+ *   other 7.x (Volta)  0, 8, 16, 32, 64, 96
+ *   8.0, 8.7           0, 8, 16, 32, 64, 100, 132, 164
+ *   other 8.x          0, 8, 16, 32, 64, 100
+ *   9.x                0, 8, 16, 32, 64, 100, 132, 164, 196, 228
+ *   10.0, 10.1         0, 8, 16, 32, 64, 100, 132, 164, 196, 228
+ *   12.0               0, 8, 16, 32, 64, 100
+ *
+ * Returns CUDA_OCC_ERROR_INVALID_INPUT when the request exceeds the largest
+ * size, and CUDA_OCC_ERROR_UNKNOWN_DEVICE for any other major version or for
+ * unlisted minor versions of major 10 and 12. *shMemSize is left untouched on
+ * error.
+ */
+static __OCC_INLINE cudaOccError cudaOccAlignUpShmemSizeVoltaPlus(size_t *shMemSize, const cudaOccDeviceProp *properties)
+{
+    // These architectures share L1 cache and shared memory, and support cache
+    // configuration to trade one for the other. The tables list the available
+    // shared memory sizes in KB, in ascending order. Architectures from major
+    // version 8 onwards use a prefix of the same table.
+    const size_t turingKB[] = { 32, 64 };
+    const size_t voltaKB[]  = { 0, 8, 16, 32, 64, 96 };
+    const size_t laterKB[]  = { 0, 8, 16, 32, 64, 100, 132, 164, 196, 228 };
+
+    const size_t  requested = *shMemSize;
+    const int     minor     = properties->computeMinor;
+    const size_t *steps;
+    int           numSteps;
+    int           i;
+
+    switch (properties->computeMajor) {
+    case 7:
+        if (minor == 5) {
+            steps    = turingKB;
+            numSteps = 2;
+        }
+        else {
+            steps    = voltaKB;
+            numSteps = 6;
+        }
+        break;
+    case 8:
+        steps    = laterKB;
+        numSteps = (minor == 0 || minor == 7) ? 8 : 6;
+        break;
+    case 9:
+        steps    = laterKB;
+        numSteps = 10;
+        break;
+    case 10:
+        // GB10x GPUs in Blackwell family: only minor versions 0 and 1 are known.
+        if (minor != 0 && minor != 1) {
+            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+        }
+        steps    = laterKB;
+        numSteps = 10;
+        break;
+    case 12:
+        if (minor != 0) {
+            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+        }
+        steps    = laterKB;
+        numSteps = 6;
+        break;
+    default:
+        return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    // Pick the first size that can hold the request. A leading 0 entry means
+    // a zero request stays zero.
+    for (i = 0; i < numSteps; ++i) {
+        const size_t bytes = steps[i] * 1024;
+
+        if (requested <= bytes) {
+            *shMemSize = bytes;
+            return CUDA_OCC_SUCCESS;
+        }
+    }
+
+    return CUDA_OCC_ERROR_INVALID_INPUT;
+}
+
+/**
+ * Shared memory based on the new carveoutConfig API introduced with Volta
+ *
+ * Resolves the effective carveout from state and writes the resulting shared
+ * memory size to *limit. A carveoutConfig outside
+ * [SHAREDMEM_CARVEOUT_DEFAULT, SHAREDMEM_CARVEOUT_MAX_SHARED] returns
+ * CUDA_OCC_ERROR_INVALID_INPUT without touching *limit. Otherwise the status
+ * of cudaOccAlignUpShmemSizeVoltaPlus is returned, and *limit is written even
+ * when that status is an error.
+ */
+static __OCC_INLINE cudaOccError cudaOccSMemPreferenceVoltaPlus(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
+{
+    int          carveout = state->carveoutConfig;
+    size_t       requestedBytes;
+    cudaOccError rc;
+
+    if ((carveout < SHAREDMEM_CARVEOUT_DEFAULT) || (carveout > SHAREDMEM_CARVEOUT_MAX_SHARED)) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+
+    // The carveout preference (CUDA 9.0 and later) takes precedence over the
+    // older cacheConfig setting. cacheConfig is only mapped to a carveout
+    // when no explicit carveout was given.
+    if (carveout == SHAREDMEM_CARVEOUT_DEFAULT) {
+        if (state->cacheConfig == CACHE_PREFER_L1) {
+            carveout = SHAREDMEM_CARVEOUT_MAX_L1;
+        }
+        else if (state->cacheConfig == CACHE_PREFER_SHARED) {
+            carveout = SHAREDMEM_CARVEOUT_MAX_SHARED;
+        }
+        else if (state->cacheConfig == CACHE_PREFER_EQUAL) {
+            carveout = SHAREDMEM_CARVEOUT_HALF;
+        }
+        // Any other cacheConfig keeps the default carveout.
+    }
+
+    // With no preference at all, ask for the full per-SM shared memory.
+    // Otherwise treat the carveout as a percentage of it.
+    requestedBytes = (carveout == SHAREDMEM_CARVEOUT_DEFAULT)
+        ? properties->sharedMemPerMultiprocessor
+        : (size_t) (carveout * properties->sharedMemPerMultiprocessor) / 100;
+
+    rc = cudaOccAlignUpShmemSizeVoltaPlus(&requestedBytes, properties);
+    *limit = requestedBytes;
+
+    return rc;
+}
+
+/**
+ * Shared memory based on the cacheConfig
+ *
+ * Handles compute capability major versions 3, 5 and 6. Any other major
+ * version returns CUDA_OCC_ERROR_UNKNOWN_DEVICE and leaves *limit untouched.
+ */
+static __OCC_INLINE cudaOccError cudaOccSMemPreference(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
+{
+    const int    major    = properties->computeMajor;
+    const size_t smemHigh = properties->sharedMemPerMultiprocessor;
+    size_t       result;
+
+    if (major == 5 || major == 6) {
+        // Maxwell and Pascal have dedicated shared memory.
+        result = smemHigh;
+    }
+    else if (major == 3) {
+        // Kepler shares L1 cache and shared memory, with 16KB, 32KB, or 48KB
+        // partitions for L1. The rest is shared memory. smemHigh corresponds
+        // to the smallest cache partition and smemLow to the largest.
+        const size_t minCacheSize = 16384;
+        const size_t maxCacheSize = 49152;
+        const size_t smemLow      = (smemHigh + minCacheSize) - maxCacheSize;
+
+        if (state->cacheConfig == CACHE_PREFER_L1) {
+            result = smemLow;
+        }
+        else if (state->cacheConfig == CACHE_PREFER_EQUAL) {
+            // Mid-point between high and low. It should be equivalent to
+            // low + 16KB.
+            result = (smemHigh + smemLow) / 2;
+        }
+        else {
+            // CACHE_PREFER_NONE, CACHE_PREFER_SHARED and unknown values.
+            result = smemHigh;
+        }
+    }
+    else {
+        return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    *limit = result;
+    return CUDA_OCC_SUCCESS;
+}
+
+/**
+ * Per-SM shared memory size implied by the configuration the user requested.
+ *
+ * Devices older than Volta derive it from the cache configuration. Volta and
+ * newer expose a dedicated shared memory carveout preference, which is a
+ * separate setting and is therefore resolved by its own helper.
+ */
+static __OCC_INLINE cudaOccError cudaOccSMemPerMultiprocessor(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
+{
+    if (properties->computeMajor < 7) {
+        return cudaOccSMemPreference(limit, properties, state);
+    }
+    return cudaOccSMemPreferenceVoltaPlus(limit, properties, state);
+}
+
+/**
+ * Per-block shared memory limit for a function, in bytes.
+ *
+ * For compute capability 2.x-6.x the limit is always sharedMemPerBlock. For
+ * 7.x, 8.x, 9.x, 10.x and 12.x the opt-in limit (sharedMemPerBlockOptin) is
+ * used instead when the function is configured with FUNC_SHMEM_LIMIT_OPTIN
+ * and smemPerCta is larger than sharedMemPerBlock. From compute capability
+ * 8.x onwards reservedSharedMemPerBlock is added on top.
+ *
+ * Returns CUDA_OCC_ERROR_UNKNOWN_DEVICE, without writing *limit, for any
+ * other major version.
+ */
+static __OCC_INLINE cudaOccError cudaOccSMemPerBlock(size_t *limit, const cudaOccDeviceProp *properties, cudaOccFuncShmemConfig shmemLimitConfig, size_t smemPerCta)
+{
+    // Start from the default limit; only the opt-in path replaces it.
+    size_t perBlockBytes = properties->sharedMemPerBlock;
+
+    switch (properties->computeMajor) {
+        case 2:
+        case 3:
+        case 4:
+        case 5:
+        case 6:
+            break;
+        case 7:
+        case 8:
+        case 9:
+        case 10:
+        case 12:
+            // Every configuration other than opt-in behaves like the default.
+            if (shmemLimitConfig == FUNC_SHMEM_LIMIT_OPTIN &&
+                smemPerCta > properties->sharedMemPerBlock) {
+                perBlockBytes = properties->sharedMemPerBlockOptin;
+            }
+            break;
+        default:
+            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    // Starting Ampere, CUDA driver reserves additional shared memory per block
+    if (properties->computeMajor >= 8) {
+        perBlockBytes += properties->reservedSharedMemPerBlock;
+    }
+
+    *limit = perBlockBytes;
+
+    return CUDA_OCC_SUCCESS;
+}
+
+/**
+ * Reports whether the device supports partitioned global caching.
+ *
+ * Support is reported for compute capability 5.2 and 5.3, and for every 6.x
+ * device except 6.0. Always returns CUDA_OCC_SUCCESS.
+ */
+static __OCC_INLINE cudaOccError cudaOccPartitionedGlobalCachingModeSupport(cudaOccPartitionedGCSupport *limit, const cudaOccDeviceProp *properties)
+{
+    const int isSm52Or53 = properties->computeMajor == 5 &&
+                           (properties->computeMinor == 2 || properties->computeMinor == 3);
+    const int isSm6xExcept60 = properties->computeMajor == 6 &&
+                               properties->computeMinor != 0;
+
+    if (isSm52Or53 || isSm6xExcept60) {
+        *limit = PARTITIONED_GC_SUPPORTED;
+    }
+    else {
+        *limit = PARTITIONED_GC_NOT_SUPPORTED;
+    }
+
+    return CUDA_OCC_SUCCESS;
+}
+
+///////////////////////////////////////////////
+//            User Input Sanity              //
+///////////////////////////////////////////////
+
+static __OCC_INLINE cudaOccError cudaOccDevicePropCheck(const cudaOccDeviceProp *properties)
+{
+    // Sanity-check the device description: every limit listed below has to
+    // be strictly positive for the occupancy arithmetic to make sense.
+    //
+    // The compute capability itself is not validated here; unsupported
+    // versions are rejected during the occupancy calculation.
+    //
+    if (properties->maxThreadsPerBlock          > 0 &&
+        properties->maxThreadsPerMultiprocessor > 0 &&
+        properties->regsPerBlock                > 0 &&
+        properties->regsPerMultiprocessor       > 0 &&
+        properties->warpSize                    > 0 &&
+        properties->sharedMemPerBlock           > 0 &&
+        properties->sharedMemPerMultiprocessor  > 0 &&
+        properties->numSms                      > 0) {
+        return CUDA_OCC_SUCCESS;
+    }
+
+    return CUDA_OCC_ERROR_INVALID_INPUT;
+}
+
+static __OCC_INLINE cudaOccError cudaOccFuncAttributesCheck(const cudaOccFuncAttributes *attributes)
+{
+    // Sanity-check the function attributes. The thread limit must be
+    // positive. A register count of zero is accepted because the compiler
+    // may choose not to use any register (empty kernels, etc.); only a
+    // negative count is rejected.
+    //
+    if (attributes->maxThreadsPerBlock > 0 &&
+        attributes->numRegs >= 0) {
+        return CUDA_OCC_SUCCESS;
+    }
+
+    return CUDA_OCC_ERROR_INVALID_INPUT;
+}
+
+static __OCC_INLINE cudaOccError cudaOccDeviceStateCheck(const cudaOccDeviceState *state)
+{
+    // Placeholder: no constraint on the device state is enforced, so every
+    // state is accepted. The cast only silences the unused-variable warning.
+    //
+    (void)state;
+
+    return CUDA_OCC_SUCCESS;
+}
+
+static __OCC_INLINE cudaOccError cudaOccInputCheck(
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state)
+{
+    // Run the three validators in order (device properties, function
+    // attributes, device state) and report the first failure. Later
+    // validators are skipped once one of them has failed.
+    //
+    cudaOccError status = cudaOccDevicePropCheck(properties);
+
+    if (status == CUDA_OCC_SUCCESS) {
+        status = cudaOccFuncAttributesCheck(attributes);
+    }
+
+    if (status == CUDA_OCC_SUCCESS) {
+        status = cudaOccDeviceStateCheck(state);
+    }
+
+    return status;
+}
+
+///////////////////////////////////////////////
+//    Occupancy calculation Functions        //
+///////////////////////////////////////////////
+
+static __OCC_INLINE cudaOccPartitionedGCConfig cudaOccPartitionedGCExpected(
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes)
+{
+    // The function's requested partitioned global caching mode only applies
+    // on devices that support the feature; everywhere else it is forced off.
+    //
+    cudaOccPartitionedGCSupport deviceSupport;
+
+    cudaOccPartitionedGlobalCachingModeSupport(&deviceSupport, properties);
+
+    if (deviceSupport == PARTITIONED_GC_NOT_SUPPORTED) {
+        return PARTITIONED_GC_OFF;
+    }
+
+    return attributes->partitionedGCConfig;
+}
+
+// Warp limit
+//
+// Writes to *limit the number of CTAs of blockSize threads that fit in the
+// warp slots of one SM. A block larger than the device's maxThreadsPerBlock
+// yields 0. Always returns CUDA_OCC_SUCCESS.
+//
+static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMWarpsLimit(
+    int                         *limit,
+    cudaOccPartitionedGCConfig   gcConfig,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    int                          blockSize)
+{
+    int warpSlotsPerSm;
+    int warpsPerCta;
+    (void)attributes;   // silence unused-variable warning
+
+    // A block that exceeds the device limit can never be launched.
+    //
+    if (blockSize > properties->maxThreadsPerBlock) {
+        *limit = 0;
+        return CUDA_OCC_SUCCESS;
+    }
+
+    warpSlotsPerSm = properties->maxThreadsPerMultiprocessor / properties->warpSize;
+    warpsPerCta    = __occDivideRoundUp(blockSize, properties->warpSize);
+
+    if (gcConfig == PARTITIONED_GC_OFF) {
+        *limit = warpSlotsPerSm / warpsPerCta;
+    }
+    else {
+        // If partitioned global caching is on, then a CTA can only use a SM
+        // partition (a half SM), and thus a half of the warp slots
+        // available per SM. Count the CTAs that fit in one half and double it.
+        //
+        // On hardware that supports partitioned global caching, each half SM
+        // is guaranteed to support at least 32 warps (maximum number of warps
+        // of a CTA), so caching will not cause 0 occupancy due to
+        // insufficient warp allocation slots.
+        //
+        *limit = ((warpSlotsPerSm / 2) / warpsPerCta) * 2;
+    }
+
+    return CUDA_OCC_SUCCESS;
+}
+
+// Shared memory limit
+//
+// Writes to *limit the number of CTAs that fit on one SM given the shared
+// memory each CTA needs (static + driver-reserved + dynamicSmemSize, rounded
+// up to the allocation granularity), and records that rounded per-CTA
+// allocation in result->allocatedSharedMemPerBlock.
+//
+// *limit is 0 when the launch exceeds the function's dynamic shared memory
+// limit or the per-block shared memory limit, and INT_MAX when the CTA uses
+// no shared memory at all. Errors from the helper queries are returned
+// unchanged, in which case neither *limit nor *result is written.
+//
+static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMSmemLimit(
+    int                         *limit,
+    cudaOccResult               *result,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    int                          blockSize,
+    size_t                       dynamicSmemSize)
+{
+    cudaOccError status = CUDA_OCC_SUCCESS;
+    int allocationGranularity;
+    size_t granularity;
+    size_t userSmemPreference = 0;
+    size_t totalSmemUsagePerCTA;
+    size_t maxSmemUsagePerCTA;
+    size_t smemAllocatedPerCTA;
+    size_t staticSmemSize;
+    size_t sharedMemPerMultiprocessor;
+    size_t smemLimitPerCTA;
+    int maxBlocks;
+    int dynamicSmemSizeExceeded = 0;
+    int totalSmemSizeExceeded = 0;
+    (void)blockSize;   // silence unused-variable warning
+
+    status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    // Obtain the user preferred shared memory size. This setting is ignored if
+    // user requests more shared memory than preferred.
+    //
+    status = cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    staticSmemSize = attributes->sharedSizeBytes + properties->reservedSharedMemPerBlock;
+    totalSmemUsagePerCTA = staticSmemSize + dynamicSmemSize;
+
+    // Round the usage up to the allocation granularity in size_t arithmetic.
+    // Narrowing to int first would truncate requests above INT_MAX to a small
+    // value and make an impossible launch look like it fits.
+    //
+    granularity = (size_t)allocationGranularity;
+    if (totalSmemUsagePerCTA < dynamicSmemSize ||
+        totalSmemUsagePerCTA > (size_t)-1 - (granularity - 1)) {
+        // The sum wrapped around, or cannot be rounded up without wrapping.
+        // No device can satisfy such a request; report a saturated size.
+        //
+        totalSmemSizeExceeded = 1;
+        smemAllocatedPerCTA   = (size_t)-1;
+    }
+    else {
+        smemAllocatedPerCTA = ((totalSmemUsagePerCTA + (granularity - 1)) / granularity) * granularity;
+    }
+
+    maxSmemUsagePerCTA = staticSmemSize + attributes->maxDynamicSharedSizeBytes;
+
+    // Obtain the user set maximum dynamic size if it exists
+    // If so, the current launch dynamic shared memory must not
+    // exceed the set limit
+    if (attributes->shmemLimitConfig != FUNC_SHMEM_LIMIT_DEFAULT &&
+        dynamicSmemSize > attributes->maxDynamicSharedSizeBytes) {
+        dynamicSmemSizeExceeded = 1;
+    }
+
+    status = cudaOccSMemPerBlock(&smemLimitPerCTA, properties, attributes->shmemLimitConfig, maxSmemUsagePerCTA);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    if (smemAllocatedPerCTA > smemLimitPerCTA) {
+        totalSmemSizeExceeded = 1;
+    }
+
+    if (dynamicSmemSizeExceeded || totalSmemSizeExceeded) {
+        maxBlocks = 0;
+    }
+    else {
+        // User requested shared memory limit is used as long as it is greater
+        // than the total shared memory used per CTA, i.e. as long as at least
+        // one CTA can be launched.
+        if (userSmemPreference >= smemAllocatedPerCTA) {
+            sharedMemPerMultiprocessor = userSmemPreference;
+        }
+        else {
+            // On Volta+, user requested shared memory will limit occupancy
+            // if it's less than shared memory per CTA. Otherwise, the
+            // maximum shared memory limit is used.
+            if (properties->computeMajor >= 7) {
+                sharedMemPerMultiprocessor = smemAllocatedPerCTA;
+                status = cudaOccAlignUpShmemSizeVoltaPlus(&sharedMemPerMultiprocessor, properties);
+                if (status != CUDA_OCC_SUCCESS) {
+                    return status;
+                }
+            }
+            else {
+                sharedMemPerMultiprocessor = properties->sharedMemPerMultiprocessor;
+            }
+        }
+
+        if (smemAllocatedPerCTA > 0) {
+            maxBlocks = (int)(sharedMemPerMultiprocessor / smemAllocatedPerCTA);
+        }
+        else {
+            // A CTA that uses no shared memory is not limited by it.
+            maxBlocks = INT_MAX;
+        }
+    }
+
+    result->allocatedSharedMemPerBlock = smemAllocatedPerCTA;
+
+    *limit = maxBlocks;
+
+    return status;
+}
+
+// Register limit
+//
+// Writes to *limit the number of CTAs of blockSize threads that fit in the
+// register file of one SM, and records the registers allocated per CTA in
+// result->allocatedRegistersPerBlock.
+//
+// *gcConfig is both input and output: it is reset to PARTITIONED_GC_OFF when
+// no CTA fits with partitioned global caching on and the mode is not
+// PARTITIONED_GC_ON_STRICT.
+//
+// *limit is 0 when the CTA exceeds the per-block register limit or the
+// per-thread register limit, and INT_MAX when the function uses no
+// registers. Errors from the three helper queries are returned unchanged, in
+// which case none of the outputs is written.
+//
+static __OCC_INLINE
+cudaOccError cudaOccMaxBlocksPerSMRegsLimit(
+    int                         *limit,
+    cudaOccPartitionedGCConfig  *gcConfig,
+    cudaOccResult               *result,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    int                          blockSize)
+{
+    cudaOccError status = CUDA_OCC_SUCCESS;
+    int allocationGranularity;
+    int warpsAllocatedPerCTA;
+    int regsAllocatedPerCTA;
+    int regsAssumedPerCTA;
+    int regsPerWarp;
+    int regsAllocatedPerWarp;
+    int numSubPartitions;
+    int numRegsPerSubPartition;
+    int numWarpsPerSubPartition;
+    int numWarpsPerSM;
+    int maxBlocks;
+    int maxRegsPerThread;
+
+    // Query the device-dependent parameters first; any failure aborts the
+    // calculation before an output is touched.
+    //
+    status = cudaOccRegAllocationGranularity(
+        &allocationGranularity,
+        properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    status = cudaOccRegAllocationMaxPerThread(
+        &maxRegsPerThread,
+        properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    status = cudaOccSubPartitionsPerMultiprocessor(&numSubPartitions, properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize);
+
+    // GPUs of compute capability 2.x and higher allocate registers to warps
+    //
+    // Number of regs per warp is regs per thread x warp size, rounded up to
+    // register allocation granularity
+    //
+    regsPerWarp          = attributes->numRegs * properties->warpSize;
+    regsAllocatedPerWarp = __occRoundUp(regsPerWarp, allocationGranularity);
+    regsAllocatedPerCTA  = regsAllocatedPerWarp * warpsAllocatedPerCTA;
+
+    // Hardware verifies if a launch fits the per-CTA register limit. For
+    // historical reasons, the verification logic assumes register
+    // allocations are made to all partitions simultaneously. Therefore, to
+    // simulate the hardware check, the warp allocation needs to be rounded
+    // up to the number of partitions.
+    //
+    regsAssumedPerCTA = regsAllocatedPerWarp * __occRoundUp(warpsAllocatedPerCTA, numSubPartitions);
+
+    if (properties->regsPerBlock < regsAssumedPerCTA ||   // Hardware check
+        properties->regsPerBlock < regsAllocatedPerCTA || // Software check
+        attributes->numRegs > maxRegsPerThread) {         // Per thread limit check
+        maxBlocks = 0;
+    }
+    else {
+        if (regsAllocatedPerWarp > 0) {
+            // Registers are allocated in each sub-partition. The max number
+            // of warps that can fit on an SM is equal to the max number of
+            // warps per sub-partition x number of sub-partitions.
+            //
+            numRegsPerSubPartition  = properties->regsPerMultiprocessor / numSubPartitions;
+            numWarpsPerSubPartition = numRegsPerSubPartition / regsAllocatedPerWarp;
+
+            // Stays 0 unless the partitioned-caching branch below finds room,
+            // which is what triggers the fallback further down.
+            //
+            maxBlocks = 0;
+
+            if (*gcConfig != PARTITIONED_GC_OFF) {
+                int numSubPartitionsPerSmPartition;
+                int numWarpsPerSmPartition;
+                int maxBlocksPerSmPartition;
+
+                // If partitioned global caching is on, then a CTA can only
+                // use a half SM, and thus a half of the registers available
+                // per SM
+                //
+                numSubPartitionsPerSmPartition = numSubPartitions / 2;
+                numWarpsPerSmPartition         = numWarpsPerSubPartition * numSubPartitionsPerSmPartition;
+                maxBlocksPerSmPartition        = numWarpsPerSmPartition / warpsAllocatedPerCTA;
+                maxBlocks                      = maxBlocksPerSmPartition * 2;
+            }
+
+            // Try again if partitioned global caching is not enabled, or if
+            // the CTA cannot fit on the SM with caching on (maxBlocks == 0).  In the latter
+            // case, the device will automatically turn off caching, except
+            // if the user forces enablement via PARTITIONED_GC_ON_STRICT to calculate
+            // occupancy and launch configuration.
+            //
+            if (maxBlocks == 0 && *gcConfig != PARTITIONED_GC_ON_STRICT) {
+               // In case *gcConfig was PARTITIONED_GC_ON flip it OFF since
+               // this is what it will be if we spread CTA across partitions.
+               //
+               *gcConfig = PARTITIONED_GC_OFF;
+               numWarpsPerSM = numWarpsPerSubPartition * numSubPartitions;
+               maxBlocks     = numWarpsPerSM / warpsAllocatedPerCTA;
+            }
+        }
+        else {
+            // A function that uses no registers is not limited by them.
+            maxBlocks = INT_MAX;
+        }
+    }
+
+
+    // Recorded even when maxBlocks is 0, so the caller can see the
+    // allocation that did not fit.
+    //
+    result->allocatedRegistersPerBlock = regsAllocatedPerCTA;
+
+    *limit = maxBlocks;
+
+    return status;
+}
+
+// Barrier limit
+//
+// Writes to *limit the number of CTAs that fit on one SM given the block
+// barriers each CTA uses. The barriers available per SM are ctaLimitBlocks
+// times a per-architecture factor of 1 or 2. A function that uses no block
+// barriers is not limited (INT_MAX). Returns CUDA_OCC_ERROR_UNKNOWN_DEVICE,
+// without writing *limit, for an unrecognized major version.
+//
+static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMBlockBarrierLimit(
+    int                         *limit,
+    int                          ctaLimitBlocks,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes)
+{
+    int barriersPerBlockSlot;
+    int barriersPerSm;
+    int barriersPerCta = attributes->numBlockBarriers;
+    int blocksThatFit  = INT_MAX;
+
+    // Select how many barriers the SM provides per resident-block slot.
+    //
+    switch (properties->computeMajor) {
+        case 5:
+        case 6:
+        case 7:
+        case 9:
+            barriersPerBlockSlot = 2;
+            break;
+        case 8:
+            // Only 8.0 provides two per slot; the other 8.x provide one.
+            barriersPerBlockSlot = (properties->computeMinor == 0) ? 2 : 1;
+            break;
+        case 10:
+            // Only 10.1 provides one per slot; 10.0 and the rest provide two.
+            barriersPerBlockSlot = (properties->computeMinor == 1) ? 1 : 2;
+            break;
+        case 12:
+            barriersPerBlockSlot = 1;
+            break;
+        default:
+            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    barriersPerSm = ctaLimitBlocks * barriersPerBlockSlot;
+
+    if (barriersPerCta != 0) {
+        blocksThatFit = barriersPerSm / barriersPerCta;
+    }
+
+    *limit = blocksThatFit;
+
+    return CUDA_OCC_SUCCESS;
+}
+
+///////////////////////////////////
+//      API Implementations      //
+///////////////////////////////////
+
+// Computes the occupancy, in resident CTAs per SM, of a launch with
+// blockSize threads per block and dynamicSmemSize bytes of dynamic shared
+// memory.
+//
+// On success *result holds the overall limit
+// (activeBlocksPerMultiprocessor), the individual limits due to registers,
+// shared memory, warps, blocks and barriers, a bit mask of the limiting
+// factors, and the partitioned global caching mode the calculation settled on.
+//
+// Returns CUDA_OCC_ERROR_INVALID_INPUT when any pointer is NULL or blockSize
+// is not positive; otherwise the first error reported by input validation or
+// by one of the per-resource calculations. On those later failures *result
+// may already be partially written.
+//
+static __OCC_INLINE
+cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
+    cudaOccResult               *result,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    int                          blockSize,
+    size_t                       dynamicSmemSize)
+{
+    cudaOccError status          = CUDA_OCC_SUCCESS;
+    int          ctaLimitWarps   = 0;
+    int          ctaLimitBlocks  = 0;
+    int          ctaLimitSMem    = 0;
+    int          ctaLimitRegs    = 0;
+    int          ctaLimitBars    = 0;
+    int          ctaLimit        = 0;
+    unsigned int limitingFactors = 0;
+    
+    cudaOccPartitionedGCConfig gcConfig = PARTITIONED_GC_OFF;
+
+    if (!result || !properties || !attributes || !state || blockSize <= 0) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+
+    ///////////////////////////
+    // Check user input
+    ///////////////////////////
+
+    status = cudaOccInputCheck(properties, attributes, state);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    ///////////////////////////
+    // Initialization
+    ///////////////////////////
+
+    // Requested caching mode, forced off on devices without support.
+    gcConfig = cudaOccPartitionedGCExpected(properties, attributes);
+
+    ///////////////////////////
+    // Compute occupancy
+    ///////////////////////////
+
+    // Limits due to registers/SM
+    // Also compute if partitioned global caching has to be turned off
+    //
+    status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegs, &gcConfig, result, properties, attributes, blockSize);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    // SMs on GP100 (6.0) have 2 subpartitions, while those on GP10x have 4.
+    // As a result, an SM on GP100 may be able to run more CTAs than the one on GP10x.
+    // For forward compatibility within Pascal family, if a function cannot run on GP10x (maxBlock == 0),
+    // we do not let it run on any Pascal processor, even though it may be able to run on GP100.
+    // Therefore, we check the occupancy on GP10x when it can run on GP100
+    //
+    if (properties->computeMajor == 6 && properties->computeMinor == 0 && ctaLimitRegs) {
+        cudaOccDeviceProp propertiesGP10x;
+        cudaOccPartitionedGCConfig gcConfigGP10x = gcConfig;
+        int ctaLimitRegsGP10x = 0;
+
+        // Set up properties for GP10x
+        memcpy(&propertiesGP10x, properties, sizeof(propertiesGP10x));
+        propertiesGP10x.computeMinor = 1;
+
+        // The same result struct is passed again, so its
+        // allocatedRegistersPerBlock is rewritten by this second call. The
+        // caching mode is evaluated on a copy and gcConfig is left as is.
+        //
+        status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegsGP10x, &gcConfigGP10x, result, &propertiesGP10x, attributes, blockSize);
+        if (status != CUDA_OCC_SUCCESS) {
+            return status;
+        }
+
+        if (ctaLimitRegsGP10x == 0) {
+            ctaLimitRegs = 0;
+        }
+    }
+
+    // Limits due to warps/SM
+    //
+    status = cudaOccMaxBlocksPerSMWarpsLimit(&ctaLimitWarps, gcConfig, properties, attributes, blockSize);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    // Limits due to blocks/SM
+    //
+    status = cudaOccMaxBlocksPerMultiprocessor(&ctaLimitBlocks, properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    // Limits due to shared memory/SM
+    //
+    status = cudaOccMaxBlocksPerSMSmemLimit(&ctaLimitSMem, result, properties, attributes, state, blockSize, dynamicSmemSize);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    ///////////////////////////
+    // Overall occupancy
+    ///////////////////////////
+
+    // Overall limit is min() of limits due to above reasons
+    //
+    ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks)));
+
+    // Determine occupancy limiting factors
+    //
+    // Every resource whose limit equals the overall limit is flagged, so
+    // several bits may be set at once.
+    //
+    if (ctaLimit == ctaLimitWarps) {
+        limitingFactors |= OCC_LIMIT_WARPS;
+    }
+    if (ctaLimit == ctaLimitRegs) {
+        limitingFactors |= OCC_LIMIT_REGISTERS;
+    }
+    if (ctaLimit == ctaLimitSMem) {
+        limitingFactors |= OCC_LIMIT_SHARED_MEMORY;
+    }
+    if (ctaLimit == ctaLimitBlocks) {
+        limitingFactors |= OCC_LIMIT_BLOCKS;
+    }
+
+    // For Hopper onwards compute the limits to occupancy based on block barrier count
+    //
+    if (properties->computeMajor >= 9 && attributes->numBlockBarriers > 0) {
+        // Limits due to barrier/SM
+        //
+        status = cudaOccMaxBlocksPerSMBlockBarrierLimit(&ctaLimitBars, ctaLimitBlocks, properties, attributes);
+        if (status != CUDA_OCC_SUCCESS) {
+            return status;
+        }
+
+        // Recompute overall limit based on barrier/SM
+        //
+        ctaLimit = __occMin(ctaLimitBars, ctaLimit);
+
+        // Determine if this is occupancy limiting factor
+        //
+        // NOTE(review): the bits set above are not cleared when the barrier
+        // limit lowers ctaLimit, so they can remain set next to
+        // OCC_LIMIT_BARRIERS - confirm whether this is intended.
+        //
+        if (ctaLimit == ctaLimitBars) {
+            limitingFactors |= OCC_LIMIT_BARRIERS;
+        }
+    }
+    else {
+        // Barriers do not constrain this launch.
+        ctaLimitBars = INT_MAX;
+    }
+
+    // Fill in the return values
+    //
+    result->limitingFactors = limitingFactors;
+
+    result->blockLimitRegs      = ctaLimitRegs;
+    result->blockLimitSharedMem = ctaLimitSMem;
+    result->blockLimitWarps     = ctaLimitWarps;
+    result->blockLimitBlocks    = ctaLimitBlocks;
+    result->blockLimitBarriers  = ctaLimitBars;
+    result->partitionedGCConfig = gcConfig;
+
+    // Final occupancy
+    result->activeBlocksPerMultiprocessor = ctaLimit;
+
+    return CUDA_OCC_SUCCESS;
+}
+
+// Computes how many bytes of dynamic shared memory each block can use while
+// numBlocks blocks of blockSize threads stay resident on one SM.
+//
+// Returns CUDA_OCC_ERROR_INVALID_INPUT when bytesAvailable is NULL, when
+// numBlocks is not positive, or when numBlocks blocks cannot be resident
+// even without any dynamic shared memory. Errors from the occupancy
+// calculation and the helper queries are returned unchanged.
+// *bytesAvailable is written only on success.
+//
+static __OCC_INLINE
+cudaOccError cudaOccAvailableDynamicSMemPerBlock(
+    size_t                      *bytesAvailable,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    int                         numBlocks,
+    int                         blockSize)
+{
+    int allocationGranularity;
+    size_t smemLimitPerBlock;
+    size_t smemAvailableForDynamic;
+    size_t userSmemPreference = 0;
+    size_t sharedMemPerMultiprocessor;
+    cudaOccResult result;
+    cudaOccError status = CUDA_OCC_SUCCESS;
+
+    if (!bytesAvailable || numBlocks <= 0) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+
+    // First compute occupancy of potential kernel launch.
+    //
+    // This call also rejects NULL properties/attributes/state, so they are
+    // safe to dereference below.
+    //
+    status = cudaOccMaxActiveBlocksPerMultiprocessor(&result, properties, attributes, state, blockSize, 0);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+    // Check if occupancy is achievable given user requested number of blocks.
+    //
+    if (result.activeBlocksPerMultiprocessor < numBlocks) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+
+    status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    // Return the per block shared memory limit based on function config.
+    //
+    status = cudaOccSMemPerBlock(&smemLimitPerBlock, properties, attributes->shmemLimitConfig, properties->sharedMemPerMultiprocessor);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    // If there is only a single block needed per SM, then the user preference can be ignored and the fully SW
+    // limit is allowed to be used as shared memory otherwise if more than one block is needed, then the user
+    // preference sets the total limit of available shared memory.
+    //
+    status = cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+    if (numBlocks == 1) {
+        sharedMemPerMultiprocessor = smemLimitPerBlock;
+    }
+    else {
+        if (!userSmemPreference) {
+            // A zero preference is replaced by the smallest size the
+            // align-up helper produces for a 1-byte request.
+            //
+            userSmemPreference = 1 ;
+            status = cudaOccAlignUpShmemSizeVoltaPlus(&userSmemPreference, properties);
+            if (status != CUDA_OCC_SUCCESS) {
+                return status;
+            }
+        }
+        sharedMemPerMultiprocessor = userSmemPreference;
+    }
+
+    // Compute total shared memory available per SM
+    //
+    smemAvailableForDynamic =  sharedMemPerMultiprocessor / numBlocks;
+    smemAvailableForDynamic = (smemAvailableForDynamic / allocationGranularity) * allocationGranularity;
+
+    // Cap shared memory
+    //
+    if (smemAvailableForDynamic > smemLimitPerBlock) {
+        smemAvailableForDynamic = smemLimitPerBlock;
+    }
+
+    // Now compute dynamic shared memory size
+    //
+    // The per-block share can be smaller than the static usage: the
+    // occupancy calculation may fall back to a larger per-SM size than the
+    // user preference used here. Clamp to zero instead of letting the
+    // unsigned subtraction wrap around to a huge value.
+    //
+    if (smemAvailableForDynamic > attributes->sharedSizeBytes) {
+        smemAvailableForDynamic = smemAvailableForDynamic - attributes->sharedSizeBytes;
+    }
+    else {
+        smemAvailableForDynamic = 0;
+    }
+
+    // Cap computed dynamic SM by user requested limit specified via cuFuncSetAttribute()
+    //
+    if (smemAvailableForDynamic > attributes->maxDynamicSharedSizeBytes)
+        smemAvailableForDynamic = attributes->maxDynamicSharedSizeBytes;
+
+    *bytesAvailable = smemAvailableForDynamic;
+    return CUDA_OCC_SUCCESS;
+}
+
+// Searches for the block size that gives the highest occupancy (resident
+// threads per SM) and the grid size needed to fill the whole device with it.
+//
+// Candidate block sizes run from the largest one allowed by both the device
+// and the function down to one warp, in warp-size steps. When
+// blockSizeToDynamicSMemSize is not NULL it supplies the dynamic shared
+// memory size for each candidate and dynamicSMemSize is ignored.
+//
+// Returns CUDA_OCC_ERROR_INVALID_INPUT for NULL pointers, or the first error
+// from input validation or the occupancy calculation; the outputs are only
+// written on success.
+//
+static __OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
+    int                         *minGridSize,
+    int                         *blockSize,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    size_t                     (*blockSizeToDynamicSMemSize)(int),
+    size_t                       dynamicSMemSize)
+{
+    cudaOccError  status;
+    cudaOccResult occupancy;
+
+    // Search bounds
+    int threadsPerSmCap;
+    int step;
+    int largestBlockSize;
+
+    // Best candidate seen so far
+    int bestBlockSize    = 0;
+    int bestBlocksPerSm  = 0;
+    int bestThreadsPerSm = 0;
+
+    // Per-iteration scratch
+    int candidateAligned;
+    int candidate;
+    int blocksPerSm;
+    int threadsPerSm;
+
+    ///////////////////////////
+    // Check user input
+    ///////////////////////////
+
+    if (!minGridSize || !blockSize || !properties || !attributes || !state) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+
+    status = cudaOccInputCheck(properties, attributes, state);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    /////////////////////////////////////////////////////////////////////////////////
+    // Try each block size, and pick the block size with maximum occupancy
+    /////////////////////////////////////////////////////////////////////////////////
+
+    threadsPerSmCap  = properties->maxThreadsPerMultiprocessor;
+    step             = properties->warpSize;
+    largestBlockSize = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock);
+
+    candidateAligned = __occRoundUp(largestBlockSize, step);
+    while (candidateAligned > 0) {
+        // The first candidate may be rounded above the real limit; clamp it.
+        //
+        candidate = __occMin(largestBlockSize, candidateAligned);
+
+        // Ignore dynamicSMemSize if the user provides a mapping
+        //
+        if (blockSizeToDynamicSMemSize) {
+            dynamicSMemSize = (*blockSizeToDynamicSMemSize)(candidate);
+        }
+
+        status = cudaOccMaxActiveBlocksPerMultiprocessor(
+            &occupancy,
+            properties,
+            attributes,
+            state,
+            candidate,
+            dynamicSMemSize);
+
+        if (status != CUDA_OCC_SUCCESS) {
+            return status;
+        }
+
+        blocksPerSm  = occupancy.activeBlocksPerMultiprocessor;
+        threadsPerSm = candidate * blocksPerSm;
+
+        // Strictly greater, so the largest block size wins ties.
+        //
+        if (threadsPerSm > bestThreadsPerSm) {
+            bestBlockSize    = candidate;
+            bestBlocksPerSm  = blocksPerSm;
+            bestThreadsPerSm = threadsPerSm;
+        }
+
+        // Early out if we have reached the maximum
+        //
+        if (bestThreadsPerSm == threadsPerSmCap) {
+            break;
+        }
+
+        candidateAligned -= step;
+    }
+
+    ///////////////////////////
+    // Return best available
+    ///////////////////////////
+
+    // Suggested min grid size to achieve a full machine launch
+    //
+    *minGridSize = bestBlocksPerSm * properties->numSms;
+    *blockSize   = bestBlockSize;
+
+    return status;
+}
+
+
+#if defined(__cplusplus)
+
+namespace {
+
+// C++ overload for kernels whose dynamic shared memory size does not depend
+// on the block size: forwards to the C entry point with no size mapping.
+//
+__OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
+    int                         *minGridSize,
+    int                         *blockSize,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    size_t                       dynamicSMemSize)
+{
+    return cudaOccMaxPotentialOccupancyBlockSize(
+        minGridSize,
+        blockSize,
+        properties,
+        attributes,
+        state,
+        NULL,
+        dynamicSMemSize);
+}
+
+// Same search as cudaOccMaxPotentialOccupancyBlockSize, but the dynamic
+// shared memory size of each candidate block size comes from an arbitrary
+// callable (blockSizeToDynamicSMemSize) instead of a plain function pointer.
+//
+// Returns CUDA_OCC_ERROR_INVALID_INPUT for NULL pointers, or the first error
+// from input validation or the occupancy calculation; the outputs are only
+// written on success.
+//
+template <typename UnaryFunction>
+__OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
+    int                         *minGridSize,
+    int                         *blockSize,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    UnaryFunction                blockSizeToDynamicSMemSize)
+{
+    cudaOccError  status;
+    cudaOccResult occupancy;
+
+    // Search bounds
+    int threadsPerSmCap;
+    int step;
+    int largestBlockSize;
+
+    // Best candidate seen so far
+    int bestBlockSize    = 0;
+    int bestBlocksPerSm  = 0;
+    int bestThreadsPerSm = 0;
+
+    // Per-iteration scratch
+    int candidateAligned;
+    int candidate;
+    int blocksPerSm;
+    int threadsPerSm;
+    size_t candidateSMemSize;
+
+    ///////////////////////////
+    // Check user input
+    ///////////////////////////
+
+    if (!minGridSize || !blockSize || !properties || !attributes || !state) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+
+    status = cudaOccInputCheck(properties, attributes, state);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    /////////////////////////////////////////////////////////////////////////////////
+    // Try each block size, and pick the block size with maximum occupancy
+    /////////////////////////////////////////////////////////////////////////////////
+
+    threadsPerSmCap  = properties->maxThreadsPerMultiprocessor;
+    step             = properties->warpSize;
+    largestBlockSize = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock);
+
+    candidateAligned = __occRoundUp(largestBlockSize, step);
+    while (candidateAligned > 0) {
+        // The first candidate may be rounded above the real limit; clamp it.
+        //
+        candidate = __occMin(largestBlockSize, candidateAligned);
+
+        candidateSMemSize = blockSizeToDynamicSMemSize(candidate);
+
+        status = cudaOccMaxActiveBlocksPerMultiprocessor(
+            &occupancy,
+            properties,
+            attributes,
+            state,
+            candidate,
+            candidateSMemSize);
+
+        if (status != CUDA_OCC_SUCCESS) {
+            return status;
+        }
+
+        blocksPerSm  = occupancy.activeBlocksPerMultiprocessor;
+        threadsPerSm = candidate * blocksPerSm;
+
+        // Strictly greater, so the largest block size wins ties.
+        //
+        if (threadsPerSm > bestThreadsPerSm) {
+            bestBlockSize    = candidate;
+            bestBlocksPerSm  = blocksPerSm;
+            bestThreadsPerSm = threadsPerSm;
+        }
+
+        // Early out if we have reached the maximum
+        //
+        if (bestThreadsPerSm == threadsPerSmCap) {
+            break;
+        }
+
+        candidateAligned -= step;
+    }
+
+    ///////////////////////////
+    // Return best available
+    ///////////////////////////
+
+    // Suggested min grid size to achieve a full machine launch
+    //
+    *minGridSize = bestBlocksPerSm * properties->numSms;
+    *blockSize   = bestBlockSize;
+
+    return status;
+}
+
+} // namespace anonymous
+
+#endif /*__cplusplus */
+
+#undef __OCC_INLINE
+
+#endif /*__cuda_occupancy_h__*/
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_pipeline.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_pipeline.h
new file mode 100644
index 0000000000000000000000000000000000000000..46bc89e4499576f1ae58848cd8684ba3e32420cf
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_pipeline.h
@@ -0,0 +1,224 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CUDA_PIPELINE_H_
+# define _CUDA_PIPELINE_H_
+
+# include "cuda_pipeline_primitives.h"
+
+# if !defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER)
+#  error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
+         -std=c++11 compiler option.
+# endif
+
+# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
+#  include "cuda_awbarrier.h"
+# endif
+
+// Integration with libcu++'s cuda::barrier<cuda::thread_scope_block>.
+
+# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
+// Pick the ABI version used to spell the inline namespace below: reuse
+// _LIBCUDACXX_CUDA_ABI_VERSION when it is already defined, otherwise assume 4.
+#  if defined(_LIBCUDACXX_CUDA_ABI_VERSION)
+#   define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION _LIBCUDACXX_CUDA_ABI_VERSION
+#  else
+#   define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION 4
+#  endif
+
+// Two-level concatenation so the ABI version macro is expanded before it is
+// pasted onto the "__" prefix (e.g. "__4" for the default above).
+#  define _LIBCUDACXX_PIPELINE_CONCAT(X, Y) X ## Y
+#  define _LIBCUDACXX_PIPELINE_CONCAT2(X, Y) _LIBCUDACXX_PIPELINE_CONCAT(X, Y)
+#  define _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE _LIBCUDACXX_PIPELINE_CONCAT2(__, _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION)
+
+// Forward declaration only; it lets pipeline::arrive_on take a reference to
+// this type without this header including the header that defines it.
+namespace cuda { inline namespace _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE {
+    struct __block_scope_barrier_base;
+}}
+
+# endif
+
+_CUDA_PIPELINE_BEGIN_NAMESPACE
+
+// Declaration of segment<N>(ptr): takes a T* and returns a pointer to an
+// array of N elements of T. The definition appears further down this header.
+template<size_t N, typename T>
+_CUDA_PIPELINE_QUALIFIER
+auto segment(T* ptr) -> T(*)[N];
+
+// Handle used to commit batches and wait on them. Its only state is a
+// counter of commit() calls; every operation forwards to the helpers in the
+// internal pipeline namespace.
+class pipeline {
+public:
+    // Neither copyable nor movable.
+    pipeline(const pipeline&) = delete;
+    pipeline(pipeline&&) = delete;
+    pipeline& operator=(const pipeline&) = delete;
+    pipeline& operator=(pipeline&&) = delete;
+
+    // Starts with the batch counter at 0.
+    _CUDA_PIPELINE_QUALIFIER pipeline();
+    // Issues the internal pipeline_commit() and returns the value the batch
+    // counter had before this call incremented it.
+    _CUDA_PIPELINE_QUALIFIER size_t commit();
+    // commit() followed by wait_prior<0>().
+    _CUDA_PIPELINE_QUALIFIER void commit_and_wait();
+    // Waits with a depth derived from the batch counter minus `batch`
+    // (0 when `batch` is not below the counter, capped at 8).
+    _CUDA_PIPELINE_QUALIFIER void wait(size_t batch);
+    // Forwards to the internal pipeline_wait_prior<N>().
+    template<unsigned N>
+    _CUDA_PIPELINE_QUALIFIER void wait_prior();
+
+# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
+    // Both overloads forward the barrier's address to the internal
+    // pipeline_arrive_on().
+    _CUDA_PIPELINE_QUALIFIER void arrive_on(awbarrier& barrier);
+    _CUDA_PIPELINE_QUALIFIER void arrive_on(cuda::__block_scope_barrier_base& barrier);
+# endif
+
+private:
+    // Number of commit() calls made on this object so far.
+    size_t current_batch;
+};
+
+// Single-object overload: copies one T from src to dst (defined below).
+template<class T>
+_CUDA_PIPELINE_QUALIFIER
+void memcpy_async(T& dst, const T& src, pipeline& pipe);
+
+// Array-segment overload: copies from an array of SrcN elements into an array
+// of DstN elements; the definition statically requires the destination to be
+// 4, 8 or 16 bytes and the source to be no larger than the destination.
+template<class T, size_t DstN, size_t SrcN>
+_CUDA_PIPELINE_QUALIFIER
+void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe);
+
+// Reinterprets ptr as a pointer to an array of N elements of T. No bounds or
+// alignment checking is performed here.
+template<size_t N, typename T>
+_CUDA_PIPELINE_QUALIFIER
+auto segment(T* ptr) -> T(*)[N]
+{
+    using segment_ptr = T(*)[N];
+    return reinterpret_cast<segment_ptr>(ptr);
+}
+
+// Default constructor: no batch has been committed yet, so the counter
+// starts at zero.
+_CUDA_PIPELINE_QUALIFIER
+pipeline::pipeline()
+    : current_batch(0)
+{
+}
+
+// Issues the internal pipeline_commit() and advances the batch counter.
+// Returns the counter value from before the increment, i.e. the index of the
+// batch that was just committed.
+_CUDA_PIPELINE_QUALIFIER
+size_t pipeline::commit()
+{
+    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit();
+
+    const size_t committed_batch = this->current_batch;
+    ++this->current_batch;
+    return committed_batch;
+}
+
+// Commits the current batch (discarding the returned index) and then waits
+// with depth 0 via wait_prior<0>().
+_CUDA_PIPELINE_QUALIFIER
+void pipeline::commit_and_wait()
+{
+    static_cast<void>(this->commit());
+    this->wait_prior<0>();
+}
+
+// Waits with a depth computed from the batch counter: current_batch - batch
+// when batch is below the counter, otherwise 0. Depths above 7 all map to
+// pipeline_wait_prior<8>().
+//
+// NOTE(review): commit() returns the pre-increment counter, so immediately
+// after committing batch b this computes a depth of 1 rather than 0 -- confirm
+// against the cp.async.wait_group semantics that this is the intended depth.
+_CUDA_PIPELINE_QUALIFIER
+void pipeline::wait(size_t batch)
+{
+    namespace impl = _CUDA_PIPELINE_INTERNAL_NAMESPACE;
+
+    size_t depth = 0;
+    if (this->current_batch > batch) {
+        depth = this->current_batch - batch;
+    }
+
+    // The depth must be a compile-time template argument, hence the dispatch.
+    switch (depth) {
+    case 0:  impl::pipeline_wait_prior<0>(); break;
+    case 1:  impl::pipeline_wait_prior<1>(); break;
+    case 2:  impl::pipeline_wait_prior<2>(); break;
+    case 3:  impl::pipeline_wait_prior<3>(); break;
+    case 4:  impl::pipeline_wait_prior<4>(); break;
+    case 5:  impl::pipeline_wait_prior<5>(); break;
+    case 6:  impl::pipeline_wait_prior<6>(); break;
+    case 7:  impl::pipeline_wait_prior<7>(); break;
+    default: impl::pipeline_wait_prior<8>(); break;
+    }
+}
+
+// Compile-time-depth wait: forwards N unchanged to the internal
+// pipeline_wait_prior<N>().
+template<unsigned N>
+_CUDA_PIPELINE_QUALIFIER
+void pipeline::wait_prior()
+{
+    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<N>();
+}
+
+# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
+// awbarrier overload: passes the address of the wrapper's `barrier` member to
+// the internal pipeline_arrive_on().
+_CUDA_PIPELINE_QUALIFIER
+void pipeline::arrive_on(awbarrier& barrier)
+{
+    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(&barrier.barrier);
+}
+
+// libcu++ barrier overload: the type is only forward-declared in this header,
+// so the object's address is reinterpreted as a uint64_t*.
+// NOTE(review): this presumes the barrier object begins with its 64-bit
+// barrier word -- confirm against the libcu++ definition.
+_CUDA_PIPELINE_QUALIFIER
+void pipeline::arrive_on(cuda::__block_scope_barrier_base & barrier)
+{
+    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(reinterpret_cast<uint64_t *>(&barrier));
+}
+# endif
+
+// Copies one object of type T from src to dst. Trivially copyable types are
+// handed to the internal pipeline_copy_relaxed helper with sizeof(T) and
+// alignof(T); any other type is copied with plain assignment. The pipe
+// argument is not referenced in the body.
+template<class T>
+_CUDA_PIPELINE_QUALIFIER
+void memcpy_async(T& dst, const T& src, pipeline& pipe)
+{
+    // Both addresses are expected to be aligned to alignof(T).
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&src) & (alignof(T) - 1)));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&dst) & (alignof(T) - 1)));
+
+    if (!__is_trivially_copyable(T)) {
+        dst = src;
+        return;
+    }
+
+    void* const raw_dst = reinterpret_cast<void*>(&dst);
+    const void* const raw_src = reinterpret_cast<const void*>(&src);
+    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_relaxed<sizeof(T), alignof(T)>(raw_dst, raw_src);
+}
+
+// Copies an array segment of SrcN elements into one of DstN elements. The
+// destination must be exactly 4, 8 or 16 bytes and at least as large as the
+// source (both enforced at compile time). Trivially copyable element types go
+// through the internal pipeline_copy_strict helper; other types are assigned
+// element by element, with destination elements past SrcN set to T().
+// The pipe argument is not referenced in the body.
+template<class T, size_t DstN, size_t SrcN>
+_CUDA_PIPELINE_QUALIFIER
+void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe)
+{
+    constexpr size_t dst_size = sizeof(*dst);
+    constexpr size_t src_size = sizeof(*src);
+    static_assert(dst_size == 4 || dst_size == 8 || dst_size == 16, "Unsupported copy size.");
+    static_assert(src_size <= dst_size, "Source size must be less than or equal to destination size.");
+    // Both pointers are checked against the destination size.
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (dst_size - 1)));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (dst_size - 1)));
+
+    if (!__is_trivially_copyable(T)) {
+        for (size_t idx = 0; idx < DstN; ++idx) {
+            (*dst)[idx] = (idx < SrcN) ? (*src)[idx] : T();
+        }
+        return;
+    }
+
+    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_strict<dst_size, src_size>(
+            reinterpret_cast<void*>(*dst), reinterpret_cast<const void*>(*src));
+}
+
+_CUDA_PIPELINE_END_NAMESPACE
+
+#endif /* !_CUDA_PIPELINE_H_ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_pipeline_helpers.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_pipeline_helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..01882b6b976347b9bfdf276c3d0adcec1d8a55fa
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_pipeline_helpers.h
@@ -0,0 +1,373 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CUDA_PIPELINE_HELPERS_H_
+# define _CUDA_PIPELINE_HELPERS_H_
+
+// Public namespace (nvcuda::experimental) and the macros that open/close it.
+# define _CUDA_PIPELINE_NAMESPACE       nvcuda::experimental
+# define _CUDA_PIPELINE_BEGIN_NAMESPACE namespace nvcuda { namespace experimental {
+# define _CUDA_PIPELINE_END_NAMESPACE   } }
+
+// Nested __pipeline_internal namespace holding the implementation helpers.
+# define _CUDA_PIPELINE_INTERNAL_NAMESPACE       _CUDA_PIPELINE_NAMESPACE::__pipeline_internal
+# define _CUDA_PIPELINE_BEGIN_INTERNAL_NAMESPACE _CUDA_PIPELINE_BEGIN_NAMESPACE namespace __pipeline_internal {
+# define _CUDA_PIPELINE_END_INTERNAL_NAMESPACE   } _CUDA_PIPELINE_END_NAMESPACE
+
+// Function qualifiers; either may be pre-defined by the includer to override
+// the defaults below.
+# if !defined(_CUDA_PIPELINE_QUALIFIER)
+#  define _CUDA_PIPELINE_QUALIFIER inline __device__
+# endif
+# if !defined(_CUDA_PIPELINE_STATIC_QUALIFIER)
+#  define _CUDA_PIPELINE_STATIC_QUALIFIER static inline __device__
+# endif
+
+// Set when building with _NVHPC_CUDA, when __CUDA_ARCH__ is not defined at
+// all, or when __CUDA_ARCH__ is at least 700.
+# if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
+#  define _CUDA_PIPELINE_ARCH_700_OR_LATER
+# endif
+
+// 1 only when __CUDA_ARCH__ >= 800; an undefined __CUDA_ARCH__ evaluates to 0
+// in this #if, which selects the 0 branch.
+# if (__CUDA_ARCH__ >= 800)
+#  define _CUDA_PIPELINE_HAS_ASYNC_COPY 1
+# else
+#  define _CUDA_PIPELINE_HAS_ASYNC_COPY 0
+# endif
+
+// Upper bound applied to the wait depth in pipeline_wait_prior (default 8).
+# if !defined(_CUDA_PIPELINE_MAX_STAGES)
+#  define _CUDA_PIPELINE_MAX_STAGES 8
+# endif
+
+// C++11 detection: __cplusplus >= 201103L, or an MSVC version of 1900 or later.
+# if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900)))
+#  define _CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER
+# endif
+
+// Debug mode defaults to on when __CUDACC_DEBUG__ is defined, unless the
+// includer already set _CUDA_PIPELINE_DEBUG.
+# if !defined(_CUDA_PIPELINE_DEBUG)
+#  if defined(__CUDACC_DEBUG__)
+#   define _CUDA_PIPELINE_DEBUG 1
+#  else
+#   define _CUDA_PIPELINE_DEBUG 0
+#  endif
+# endif
+
+// Runtime checks. With debug on and NDEBUG unset both macros map to assert();
+// otherwise the assertion compiles away and the abort becomes __trap().
+// Note that the non-empty expansions already end in a semicolon.
+# if defined(_CUDA_PIPELINE_DEBUG) && (_CUDA_PIPELINE_DEBUG == 1) && !defined(NDEBUG)
+#  if !defined(__CUDACC_RTC__)
+#   include <cassert>
+#  endif
+#  define _CUDA_PIPELINE_ASSERT(x) assert((x));
+#  define _CUDA_PIPELINE_ABORT() assert(0);
+# else
+#  define _CUDA_PIPELINE_ASSERT(x)
+#  define _CUDA_PIPELINE_ABORT() __trap();
+# endif
+
+// static_assert when C++11 is available, otherwise a no-op.
+# if defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER)
+#  define _CUDA_PIPELINE_STATIC_ASSERT(c, m) static_assert(c, m)
+# else
+#  define _CUDA_PIPELINE_STATIC_ASSERT(c, m)
+# endif
+
+// Inline-asm constraint letter used for generic pointer operands: "r" on
+// 32-bit MSVC and on __arm__, "l" everywhere else.
+// NOTE(review): presumably "r"/"l" select 32-bit/64-bit registers to match
+// the target's pointer width -- confirm against the inline PTX documentation.
+# if (defined(_MSC_VER) && !defined(_WIN64)) || defined(__arm__)
+#  define _CUDA_PIPELINE_ASM_PTR_CONSTRAINT "r"
+# else
+#  define _CUDA_PIPELINE_ASM_PTR_CONSTRAINT "l"
+# endif
+
+// Under __CUDACC_RTC__ the fixed-width types are declared by hand instead of
+// including <stdint.h>.
+# if defined(__CUDACC_RTC__)
+typedef unsigned int       uint32_t;
+typedef unsigned long long uint64_t;
+typedef uint64_t           uintptr_t;
+# else
+#  include <stdint.h>
+# endif
+
+_CUDA_PIPELINE_BEGIN_INTERNAL_NAMESPACE
+
+// Compile-time checks on the sizes of the types pipeline_memcpy_sync uses for
+// its 2-, 4-, 8- and 16-byte loads and stores.
+_CUDA_PIPELINE_STATIC_ASSERT(sizeof(short) ==  2, "Size mismatch for type 'short'");
+_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int)   ==  4, "Size mismatch for type 'int'");
+_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int2)  ==  8, "Size mismatch for type 'int2'");
+_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int4)  == 16, "Size mismatch for type 'int4'");
+
+// Declared here and defined elsewhere. Its 32-bit result is used below as the
+// "r" address operand of the .shared cp.async instructions.
+// NOTE(review): presumably converts a generic pointer into a shared-memory
+// address -- confirm against the compiler documentation.
+extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void *);
+
+// Synchronous fallback copy. Writes CopySize bytes to dst: the leading
+// SourceSize bytes come from src and the rest are written as zero
+// (value-initialized). The transfer is done in steps whose width is chosen
+// from SourceSize:
+//   - SourceSize == 0           -> one CopySize-wide store of zeros
+//   - SourceSize in {2,4,8,16}  -> SourceSize-wide loads/stores
+//   - anything else             -> byte by byte
+template<size_t CopySize, size_t SourceSize>
+_CUDA_PIPELINE_QUALIFIER
+void pipeline_memcpy_sync(void* __restrict__ dst, const void* __restrict__ src)
+{
+    _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
+    _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
+
+    char* const out = reinterpret_cast<char*>(dst);
+    const char* const in = reinterpret_cast<const char*>(src);
+
+    const bool source_is_vector_width =
+        (SourceSize == 2 || SourceSize == 4 || SourceSize == 8 || SourceSize == 16);
+    const size_t step = (SourceSize == 0) ? CopySize
+                      : source_is_vector_width ? SourceSize
+                      : 1;
+
+    for (size_t offset = 0; offset < CopySize; offset += step) {
+        // Inside the source range we copy; past it we store zeros.
+        const bool from_source = (SourceSize != 0) && (offset < SourceSize);
+
+        if (step == 1) {
+            out[offset] = from_source ? in[offset] : char();
+        } else if (step == 2) {
+            *reinterpret_cast<short*>(out + offset) =
+                from_source ? *reinterpret_cast<const short*>(in + offset) : short();
+        } else if (step == 4) {
+            *reinterpret_cast<int*>(out + offset) =
+                from_source ? *reinterpret_cast<const int*>(in + offset) : int();
+        } else if (step == 8) {
+            *reinterpret_cast<int2*>(out + offset) =
+                from_source ? *reinterpret_cast<const int2*>(in + offset) : int2();
+        } else if (step == 16) {
+            *reinterpret_cast<int4*>(out + offset) =
+                from_source ? *reinterpret_cast<const int4*>(in + offset) : int4();
+        }
+    }
+}
+
+// Selects between the inline-PTX implementation (true) and the synchronous
+// fallback (false). The callers below instantiate it with
+// _CUDA_PIPELINE_HAS_ASYNC_COPY.
+template<bool UseHwAsyncCopy>
+struct ImplementationChooser;
+
+// Implementation that emits the cp.async family of PTX instructions through
+// inline assembly.
+template<>
+struct ImplementationChooser<true> {
+    // Generic case: emits cp.async.ca.shared.global with CopySize and
+    // SourceSize passed as immediate ("n") operands.
+    template<size_t CopySize, size_t SourceSize>
+    struct CpAsyncChooser {
+        _CUDA_PIPELINE_STATIC_QUALIFIER
+        void cp_async(void* __restrict__ dst, const void* __restrict__ src)
+        {
+            asm volatile ("cp.async.ca.shared.global [%0], [%1], %2, %3;"
+                :
+                : "r"(__nvvm_get_smem_pointer(dst)), _CUDA_PIPELINE_ASM_PTR_CONSTRAINT(src), "n"(CopySize),
+                  "n"(SourceSize)
+                : "memory");
+        }
+    };
+
+    // 16-byte copies use the .cg variant of the instruction instead of .ca
+    // (see the PTX ISA cp.async entry for the cache-operator semantics).
+    template<size_t SourceSize>
+    struct CpAsyncChooser<16, SourceSize> {
+        _CUDA_PIPELINE_STATIC_QUALIFIER
+        void cp_async(void* __restrict__ dst, const void* __restrict__ src)
+        {
+            asm volatile ("cp.async.cg.shared.global [%0], [%1], %2, %3;"
+                :
+                : "r"(__nvvm_get_smem_pointer(dst)), _CUDA_PIPELINE_ASM_PTR_CONSTRAINT(src), "n"(16), "n"(SourceSize)
+                : "memory");
+        }
+    };
+
+    // Validates sizes, address spaces and alignment, then issues the copy via
+    // the CpAsyncChooser matching CopySize.
+    template<size_t CopySize, size_t SourceSize>
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
+    {
+        _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
+        _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
+        _CUDA_PIPELINE_ASSERT(__isShared(dst));
+        _CUDA_PIPELINE_ASSERT(__isGlobal(src));
+        _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
+        _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
+
+        CpAsyncChooser<CopySize, SourceSize>::cp_async(dst, src);
+    }
+
+    // Emits cp.async.commit_group.
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_commit()
+    {
+        asm volatile ("cp.async.commit_group;");
+    }
+
+    // Emits cp.async.wait_group with N clamped to _CUDA_PIPELINE_MAX_STAGES.
+    template<unsigned N>
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_wait_prior()
+    {
+        asm volatile ("cp.async.wait_group %0;"
+            :
+            : "n"(N < _CUDA_PIPELINE_MAX_STAGES ? N : _CUDA_PIPELINE_MAX_STAGES));
+    }
+
+    // Emits cp.async.mbarrier.arrive.shared.b64 on the given barrier, which is
+    // asserted (in debug builds) to live in shared memory.
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_arrive_on(uint64_t* barrier)
+    {
+        _CUDA_PIPELINE_ASSERT(__isShared(barrier));
+
+        asm volatile ("cp.async.mbarrier.arrive.shared.b64 [%0];"
+            :
+            : "r"(__nvvm_get_smem_pointer(barrier)));
+    }
+};
+
+// Fallback implementation: the copy is performed immediately through
+// pipeline_memcpy_sync, so commit, wait and arrive have nothing to do and are
+// empty.
+template<>
+struct ImplementationChooser<false> {
+    // Same static/debug checks as the inline-PTX variant, followed by a
+    // synchronous copy.
+    template<size_t CopySize, size_t SourceSize>
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
+    {
+        _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
+        _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
+        _CUDA_PIPELINE_ASSERT(__isShared(dst));
+        _CUDA_PIPELINE_ASSERT(__isGlobal(src));
+        _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
+        _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
+
+        pipeline_memcpy_sync<CopySize, SourceSize>(dst, src);
+    }
+
+    // Intentionally empty.
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_commit()
+    {
+    }
+
+    // Intentionally empty.
+    template<unsigned N>
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_wait_prior()
+    {
+    }
+
+    // Intentionally empty; the barrier argument is ignored.
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_arrive_on(uint64_t* barrier)
+    {
+    }
+};
+
+// Entry point for a single 4/8/16-byte copy. Re-checks sizes, address spaces
+// and alignment, then dispatches to the ImplementationChooser selected by
+// _CUDA_PIPELINE_HAS_ASYNC_COPY.
+template<size_t CopySize, size_t SourceSize>
+_CUDA_PIPELINE_QUALIFIER
+void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
+{
+    _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
+    _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
+    _CUDA_PIPELINE_ASSERT(__isShared(dst));
+    _CUDA_PIPELINE_ASSERT(__isGlobal(src));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
+
+    ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_memcpy_async<CopySize, SourceSize>(dst, src);
+}
+
+// Forwards to the pipeline_commit() of the implementation selected by
+// _CUDA_PIPELINE_HAS_ASYNC_COPY.
+_CUDA_PIPELINE_QUALIFIER
+void pipeline_commit()
+{
+    ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_commit();
+}
+
+// Forwards N to the pipeline_wait_prior<N>() of the implementation selected
+// by _CUDA_PIPELINE_HAS_ASYNC_COPY.
+template<unsigned N>
+_CUDA_PIPELINE_QUALIFIER
+void pipeline_wait_prior()
+{
+    ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_wait_prior<N>();
+}
+
+// Forwards the barrier pointer to the pipeline_arrive_on() of the
+// implementation selected by _CUDA_PIPELINE_HAS_ASYNC_COPY.
+_CUDA_PIPELINE_QUALIFIER
+void pipeline_arrive_on(uint64_t* barrier)
+{
+    ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_arrive_on(barrier);
+}
+
+// Copies one 4/8/16-byte unit, choosing the path at run time: when src is in
+// global memory and dst is in shared memory the copy goes through
+// pipeline_memcpy_async, otherwise through pipeline_memcpy_sync.
+template<size_t CopySize, size_t SourceSize>
+_CUDA_PIPELINE_QUALIFIER
+void pipeline_copy_strict(void* __restrict__ dst, const void* __restrict__ src)
+{
+    _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
+    _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size.");
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
+
+    const bool global_to_shared = __isGlobal(src) && __isShared(dst);
+    if (!global_to_shared) {
+        pipeline_memcpy_sync<CopySize, SourceSize>(dst, src);
+        return;
+    }
+
+    pipeline_memcpy_async<CopySize, SourceSize>(dst, src);
+}
+
+// Copies CopySize bytes from src to dst for an object whose addresses are
+// aligned to Align. The range is walked front to back in the widest chunk that
+// both the alignment and the remaining byte count allow: 16, 8 or 4 bytes via
+// pipeline_copy_strict, then 2 bytes and single bytes with plain loads/stores.
+//
+// Fix: the 16/8/4-byte branches previously passed the base pointers dst/src to
+// pipeline_copy_strict while advancing the cursors d/s, so an object larger
+// than one chunk had its first chunk copied repeatedly and its remaining bytes
+// never written. Each chunk is now copied at the current cursor position.
+template<size_t CopySize, size_t Align>
+_CUDA_PIPELINE_QUALIFIER
+void pipeline_copy_relaxed(void* __restrict__ dst, const void* __restrict__ src)
+{
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (Align - 1)));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (Align - 1)));
+
+    // Cursors into the source and destination byte ranges.
+    const char* s = reinterpret_cast<const char*>(src);
+    char* d = reinterpret_cast<char*>(dst);
+    size_t remaining = CopySize;
+
+    while (remaining) {
+        if ((Align >= 16) && (remaining >= 16)) {
+            pipeline_copy_strict<16, 16>(d, s);
+            d += 16;
+            s += 16;
+            remaining -= 16;
+        } else if ((Align >= 8) && (remaining >= 8)) {
+            pipeline_copy_strict<8, 8>(d, s);
+            d += 8;
+            s += 8;
+            remaining -= 8;
+        } else if ((Align >= 4) && (remaining >= 4)) {
+            pipeline_copy_strict<4, 4>(d, s);
+            d += 4;
+            s += 4;
+            remaining -= 4;
+        } else if ((Align >= 2) && (remaining >= 2)) {
+            // Below 4 bytes there is no strict helper; copy directly.
+            *reinterpret_cast<short*>(d) = *reinterpret_cast<const short*>(s);
+            d += 2;
+            s += 2;
+            remaining -= 2;
+        } else {
+            *d = *s;
+            d += 1;
+            s += 1;
+            remaining -= 1;
+        }
+    }
+}
+
+_CUDA_PIPELINE_END_INTERNAL_NAMESPACE
+
+#endif /* !_CUDA_PIPELINE_HELPERS_H_ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_pipeline_primitives.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_pipeline_primitives.h
new file mode 100644
index 0000000000000000000000000000000000000000..eaba0cfb5ac9184bec5e837d2ec2f9db11d873ae
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_pipeline_primitives.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CUDA_PIPELINE_PRIMITIVES_H_
+# define _CUDA_PIPELINE_PRIMITIVES_H_
+
+# include "cuda_pipeline_helpers.h"
+
+// Primitive copy of one 4-, 8- or 16-byte unit from src_global to dst_shared.
+//
+//   size_and_align  number of bytes written to dst_shared; must be 4, 8 or 16.
+//                   Both pointers are asserted to be aligned to it.
+//   zfill           subtracted from size_and_align to form the SourceSize
+//                   template argument, so only size_and_align - zfill bytes
+//                   are taken from src_global. Must not exceed size_and_align.
+//
+// The internal helper takes both sizes as template arguments, so the runtime
+// values are mapped onto instantiations through the nested switch below. Any
+// unsupported combination reaches _CUDA_PIPELINE_ABORT().
+_CUDA_PIPELINE_STATIC_QUALIFIER
+void __pipeline_memcpy_async(void* __restrict__ dst_shared, const void* __restrict__ src_global, size_t size_and_align,
+                             size_t zfill = 0)
+{
+    _CUDA_PIPELINE_ASSERT(size_and_align == 4 || size_and_align == 8 || size_and_align == 16);
+    _CUDA_PIPELINE_ASSERT(zfill <= size_and_align);
+    _CUDA_PIPELINE_ASSERT(__isShared(dst_shared));
+    _CUDA_PIPELINE_ASSERT(__isGlobal(src_global));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst_shared) & (size_and_align - 1)));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src_global) & (size_and_align - 1)));
+
+    // Every inner case returns, so there is no fall-through between the
+    // outer cases.
+    switch (size_and_align) {
+    case 16:
+        switch (zfill) {
+        case  0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 16>(dst_shared, src_global); return;
+        case  1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 15>(dst_shared, src_global); return;
+        case  2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 14>(dst_shared, src_global); return;
+        case  3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 13>(dst_shared, src_global); return;
+        case  4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 12>(dst_shared, src_global); return;
+        case  5: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 11>(dst_shared, src_global); return;
+        case  6: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 10>(dst_shared, src_global); return;
+        case  7: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  9>(dst_shared, src_global); return;
+        case  8: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  8>(dst_shared, src_global); return;
+        case  9: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  7>(dst_shared, src_global); return;
+        case 10: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  6>(dst_shared, src_global); return;
+        case 11: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  5>(dst_shared, src_global); return;
+        case 12: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  4>(dst_shared, src_global); return;
+        case 13: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  3>(dst_shared, src_global); return;
+        case 14: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  2>(dst_shared, src_global); return;
+        case 15: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  1>(dst_shared, src_global); return;
+        case 16: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  0>(dst_shared, src_global); return;
+        default: _CUDA_PIPELINE_ABORT();                                                                   return;
+        }
+    case 8:
+        switch (zfill) {
+        case  0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  8>(dst_shared, src_global); return;
+        case  1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  7>(dst_shared, src_global); return;
+        case  2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  6>(dst_shared, src_global); return;
+        case  3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  5>(dst_shared, src_global); return;
+        case  4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  4>(dst_shared, src_global); return;
+        case  5: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  3>(dst_shared, src_global); return;
+        case  6: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  2>(dst_shared, src_global); return;
+        case  7: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  1>(dst_shared, src_global); return;
+        case  8: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  0>(dst_shared, src_global); return;
+        default: _CUDA_PIPELINE_ABORT();                                                                   return;
+        }
+    case 4:
+        switch (zfill) {
+        case  0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4,  4>(dst_shared, src_global); return;
+        case  1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4,  3>(dst_shared, src_global); return;
+        case  2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4,  2>(dst_shared, src_global); return;
+        case  3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4,  1>(dst_shared, src_global); return;
+        case  4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4,  0>(dst_shared, src_global); return;
+        default: _CUDA_PIPELINE_ABORT();                                                                   return;
+        }
+    default:
+        _CUDA_PIPELINE_ABORT();
+        return;
+    }
+}
+
+// Primitive commit: forwards to the internal pipeline_commit().
+_CUDA_PIPELINE_STATIC_QUALIFIER
+void __pipeline_commit()
+{
+    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit();
+}
+
+// Primitive wait taking the depth as a runtime value. The internal helper
+// needs the depth as a template argument, so values 0..7 are dispatched one
+// by one and anything larger is treated as 8.
+_CUDA_PIPELINE_STATIC_QUALIFIER
+void __pipeline_wait_prior(size_t prior)
+{
+    namespace impl = _CUDA_PIPELINE_INTERNAL_NAMESPACE;
+
+    switch (prior) {
+    case 0:  impl::pipeline_wait_prior<0>(); break;
+    case 1:  impl::pipeline_wait_prior<1>(); break;
+    case 2:  impl::pipeline_wait_prior<2>(); break;
+    case 3:  impl::pipeline_wait_prior<3>(); break;
+    case 4:  impl::pipeline_wait_prior<4>(); break;
+    case 5:  impl::pipeline_wait_prior<5>(); break;
+    case 6:  impl::pipeline_wait_prior<6>(); break;
+    case 7:  impl::pipeline_wait_prior<7>(); break;
+    default: impl::pipeline_wait_prior<8>(); break;
+    }
+}
+
+# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
+#  include "cuda_awbarrier_primitives.h"
+
+// Primitive arrive: forwards the barrier pointer to the internal
+// pipeline_arrive_on(), which takes a uint64_t*.
+// NOTE(review): __mbarrier_t comes from cuda_awbarrier_primitives.h and is
+// presumably an alias for uint64_t -- confirm against that header.
+_CUDA_PIPELINE_STATIC_QUALIFIER
+void __pipeline_arrive_on(__mbarrier_t* barrier)
+{
+    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(barrier);
+}
+# endif
+
+#endif /* !_CUDA_PIPELINE_PRIMITIVES_H_ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_runtime.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_runtime.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc36a6079c30f86fa9df3a011a5a2670861c0fe6
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_runtime.h
@@ -0,0 +1,2591 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_RUNTIME_H__)
+#define __CUDA_RUNTIME_H__
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__
+#endif
+
+#define EXCLUDE_FROM_RTC
+#if defined(__GNUC__)
+#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
+#pragma GCC diagnostic push
+#endif
+#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)))
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+#elif defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable: 4820)
+#endif
+#ifdef __QNX__
+#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)
+typedef unsigned size_t;
+#endif
+#endif
+#undef EXCLUDE_FROM_RTC
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "crt/host_config.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "library_types.h"
+#if !defined(__CUDACC_RTC__)
+#define EXCLUDE_FROM_RTC
+#include "channel_descriptor.h"
+#include "cuda_runtime_api.h"
+#include "driver_functions.h"
+#undef EXCLUDE_FROM_RTC
+#endif /* !__CUDACC_RTC__ */
+#include "crt/host_defines.h"
+#ifdef __CUDACC_RTC__
+#include "target"
+#endif  /* defined(__CUDACC_RTC__) */
+
+
+#include "vector_functions.h"
+
+#if defined(__CUDACC__)
+
+#if defined(__CUDACC_RTC__)
+#include "nvrtc_device_runtime.h"
+#include "crt/device_functions.h"
+#include "crt/common_functions.h"
+#include "device_launch_parameters.h"
+
+#else /* !__CUDACC_RTC__ */
+#define EXCLUDE_FROM_RTC
+#include "crt/common_functions.h"
+#include "crt/device_functions.h"
+#include "device_launch_parameters.h"
+
+#if defined(__CUDACC_EXTENDED_LAMBDA__)
+#include <functional>
+#include <utility>
+struct  __device_builtin__ __nv_lambda_preheader_injection { };
+#endif /* defined(__CUDACC_EXTENDED_LAMBDA__) */
+
+#undef EXCLUDE_FROM_RTC
+#endif /* __CUDACC_RTC__ */
+
+#endif /* __CUDACC__ */
+
+/** \cond impl_private */
+#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) /* Doc builds, or explicit opt-in: the marker expands to nothing. */
+#define __CUDA_DEPRECATED
+#elif defined(_MSC_VER) /* MSVC spelling of the deprecation attribute. */
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__) /* GNU-style spelling of the deprecation attribute. */
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else /* Any other compiler: the marker expands to nothing. */
+#define __CUDA_DEPRECATED
+#endif
+/** \endcond impl_private */
+
+#define EXCLUDE_FROM_RTC
+#if defined(__cplusplus) && !defined(__CUDACC_RTC__)
+
+#if __cplusplus >= 201103L || (defined(_MSC_VER) && (_MSC_VER >= 1900))
+#include <utility>
+#endif
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+/**
+ * \addtogroup CUDART_HIGHLEVEL
+ * @{
+ */
+
+/**
+ *\brief Launches a device function
+ *
+ * The function invokes kernel \p func on \p gridDim (\p gridDim.x &times; \p gridDim.y
+ * &times; \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x &times;
+ * \p blockDim.y &times; \p blockDim.z) threads.
+ *
+ * If the kernel has N parameters, \p args should point to an array of N pointers.
+ * Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, points to the region
+ * of memory from which the actual parameter will be copied.
+ *
+ * \p sharedMem sets the amount of dynamic shared memory that will be available to
+ * each thread block.
+ *
+ * \p stream specifies a stream the invocation is associated to.
+ *
+ * \param func        - Device function symbol
+ * \param gridDim     - Grid dimensions
+ * \param blockDim    - Block dimensions
+ * \param args        - Arguments
+ * \param sharedMem   - Shared memory (defaults to 0)
+ * \param stream      - Stream identifier (defaults to NULL)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorSharedObjectInitFailed,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorUnsupportedPtxVersion,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorJitCompilerNotFound,
+ * ::cudaErrorJitCompilationDisabled
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)"
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaLaunchKernel(
+  T           *func, /* Typed kernel pointer; T is deduced from the argument. */
+  dim3         gridDim,
+  dim3         blockDim,
+  void       **args,
+  size_t       sharedMem = 0,
+  cudaStream_t stream = 0
+)
+{
+    return ::cudaLaunchKernel((const void *)func, gridDim, blockDim, args, sharedMem, stream); /* Only `func` is cast (to const void*); every other argument is forwarded unchanged to the global-scope function. */
+}
+
+
+#if __cplusplus >= 201103L || (defined(_MSC_VER) && (_MSC_VER >= 1900)) || defined(__DOXYGEN_ONLY__)
+/**
+ * \brief Launches a CUDA function with launch-time configuration
+ *
+ * Invokes the kernel \p kernel on \p config->gridDim (\p config->gridDim.x
+ * &times; \p config->gridDim.y &times; \p config->gridDim.z) grid of blocks.
+ * Each block contains \p config->blockDim (\p config->blockDim.x &times;
+ * \p config->blockDim.y &times; \p config->blockDim.z) threads.
+ *
+ * \p config->dynamicSmemBytes sets the amount of dynamic shared memory that
+ * will be available to each thread block.
+ *
+ * \p config->stream specifies a stream the invocation is associated to.
+ *
+ * Configuration beyond grid and block dimensions, dynamic shared memory size,
+ * and stream can be provided with the following two fields of \p config:
+ *
+ * \p config->attrs is an array of \p config->numAttrs contiguous
+ * ::cudaLaunchAttribute elements. The value of this pointer is not considered
+ * if \p config->numAttrs is zero. However, in that case, it is recommended to
+ * set the pointer to NULL.
+ * \p config->numAttrs is the number of attributes populating the first
+ * \p config->numAttrs positions of the \p config->attrs array.
+ *
+ * The kernel arguments should be passed as arguments to this function via the
+ * \p args parameter pack.
+ *
+ * The C API version of this function, \p cudaLaunchKernelExC, is also available
+ * for pre-C++11 compilers and for use cases where the ability to pass kernel
+ * parameters via void* array is preferable.
+ *
+ * \param config - Launch configuration
+ * \param kernel - Kernel to launch
+ * \param args   - Parameter pack of kernel parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorSharedObjectInitFailed,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorUnsupportedPtxVersion,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorJitCompilerNotFound,
+ * ::cudaErrorJitCompilationDisabled
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \sa
+ * \ref ::cudaLaunchKernelExC(const cudaLaunchConfig_t *config, const void *func, void **args) "cudaLaunchKernelEx (C API)",
+ * ::cuLaunchKernelEx
+ */
+template<typename... ExpTypes, typename... ActTypes>
+static __inline__ __host__ cudaError_t cudaLaunchKernelEx(
+  const cudaLaunchConfig_t *config,
+  void (*kernel)(ExpTypes...), /* ExpTypes: the kernel's declared parameter types, deduced from its signature. */
+  ActTypes &&... args /* ActTypes: the types of the arguments the caller actually supplied. */
+)
+{
+    return [&](ExpTypes... coercedArgs){ /* By-value ExpTypes parameters convert each supplied argument to the kernel's parameter type. */
+        void *pArgs[] = { &coercedArgs... }; /* One pointer per converted argument, in parameter order. */
+        return ::cudaLaunchKernelExC(config, (const void *)kernel, pArgs); /* pArgs points at the lambda's parameters, which live until the lambda returns. */
+    }(std::forward<ActTypes>(args)...); /* The lambda is invoked immediately with the caller's arguments. */
+}
+
+/**
+ * \brief Launches a CUDA function with launch-time configuration
+ *
+ * Invokes the kernel \p kernel on \p config->gridDim (\p config->gridDim.x
+ * &times; \p config->gridDim.y &times; \p config->gridDim.z) grid of blocks.
+ * Each block contains \p config->blockDim (\p config->blockDim.x &times;
+ * \p config->blockDim.y &times; \p config->blockDim.z) threads.
+ *
+ * \p config->dynamicSmemBytes sets the amount of dynamic shared memory that
+ * will be available to each thread block.
+ *
+ * \p config->stream specifies a stream the invocation is associated to.
+ *
+ * Configuration beyond grid and block dimensions, dynamic shared memory size,
+ * and stream can be provided with the following two fields of \p config:
+ *
+ * \p config->attrs is an array of \p config->numAttrs contiguous
+ * ::cudaLaunchAttribute elements. The value of this pointer is not considered
+ * if \p config->numAttrs is zero. However, in that case, it is recommended to
+ * set the pointer to NULL.
+ * \p config->numAttrs is the number of attributes populating the first
+ * \p config->numAttrs positions of the \p config->attrs array.
+ *
+ * The kernel arguments should be passed as arguments to this function via the
+ * \p args parameter pack.
+ *
+ * The C API version of this function, \p cudaLaunchKernelExC, is also available
+ * for pre-C++11 compilers and for use cases where the ability to pass kernel
+ * parameters via void* array is preferable.
+ *
+ * \param config - Launch configuration
+ * \param kernel - Kernel to launch
+ * \param args   - Parameter pack of kernel parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorSharedObjectInitFailed,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorUnsupportedPtxVersion,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorJitCompilerNotFound,
+ * ::cudaErrorJitCompilationDisabled
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaLaunchKernelExC(const cudaLaunchConfig_t *config, const void *func, void **args) "cudaLaunchKernelEx (C API)",
+ * ::cuLaunchKernelEx
+ */
+template<typename... ActTypes>
+static __inline__ __host__ cudaError_t cudaLaunchKernelEx(
+  const cudaLaunchConfig_t *config,
+  const cudaKernel_t kernel, /* This overload's signature carries no kernel parameter types. */
+  ActTypes &&... args
+)
+{
+    void *pArgs[] = { &args... }; /* Addresses of the caller's arguments are used directly; no conversion to the kernel's parameter types happens here. */
+    return ::cudaLaunchKernelExC(config, (const void *)kernel, pArgs); /* NOTE(review): presumably callers must pass arguments whose types already match the kernel's parameters - confirm. */
+}
+#endif
+
+/**
+ *\brief Launches a device function
+ *
+ * The function invokes kernel \p func on \p gridDim (\p gridDim.x &times; \p gridDim.y
+ * &times; \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x &times;
+ * \p blockDim.y &times; \p blockDim.z) threads.
+ *
+ * The device on which this kernel is invoked must have a non-zero value for
+ * the device attribute ::cudaDevAttrCooperativeLaunch.
+ *
+ * The total number of blocks launched cannot exceed the maximum number of blocks per
+ * multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or
+ * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
+ * as specified by the device attribute ::cudaDevAttrMultiProcessorCount.
+ *
+ * The kernel cannot make use of CUDA dynamic parallelism.
+ *
+ * If the kernel has N parameters, \p args should point to an array of N pointers.
+ * Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, points to the region
+ * of memory from which the actual parameter will be copied.
+ *
+ * \p sharedMem sets the amount of dynamic shared memory that will be available to
+ * each thread block.
+ *
+ * \p stream specifies a stream the invocation is associated to.
+ *
+ * \param func        - Device function symbol
+ * \param gridDim     - Grid dimensions
+ * \param blockDim    - Block dimensions
+ * \param args        - Arguments
+ * \param sharedMem   - Shared memory (defaults to 0)
+ * \param stream      - Stream identifier (defaults to NULL)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorSharedObjectInitFailed
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \ref ::cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchCooperativeKernel (C API)"
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaLaunchCooperativeKernel(
+            T *func, /* Typed kernel pointer; T is deduced from the argument. */
+  dim3         gridDim,
+  dim3         blockDim,
+  void       **args,
+  size_t       sharedMem = 0,
+  cudaStream_t stream = 0
+)
+{
+    return ::cudaLaunchCooperativeKernel((const void *)func, gridDim, blockDim, args, sharedMem, stream); /* Only `func` is cast (to const void*); every other argument is forwarded unchanged to the global-scope function. */
+}
+
+/**
+ * \brief \hl Creates an event object with the specified flags
+ *
+ * Creates an event object with the specified flags. Valid flags include:
+ * - ::cudaEventDefault: Default event creation flag.
+ * - ::cudaEventBlockingSync: Specifies that event should use blocking
+ *   synchronization. A host thread that uses ::cudaEventSynchronize() to wait
+ *   on an event created with this flag will block until the event actually
+ *   completes.
+ * - ::cudaEventDisableTiming: Specifies that the created event does not need
+ *   to record timing data.  Events created with this flag specified and
+ *   the ::cudaEventBlockingSync flag not specified will provide the best
+ *   performance when used with ::cudaStreamWaitEvent() and ::cudaEventQuery().
+ *
+ * \param event - Newly created event
+ * \param flags - Flags for new event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventRecord, ::cudaEventQuery,
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cudaStreamWaitEvent
+ */
+static __inline__ __host__ cudaError_t cudaEventCreate(
+  cudaEvent_t  *event,
+  unsigned int  flags /* No default value: callers of this overload must pass flags explicitly. */
+)
+{
+  return ::cudaEventCreateWithFlags(event, flags); /* Delegates to ::cudaEventCreateWithFlags with the same event pointer and flags. */
+}
+
+/**
+ * \brief Creates an executable graph from a graph
+ *
+ * Instantiates \p graph as an executable graph. The graph is validated for any
+ * structural constraints or intra-node constraints which were not previously
+ * validated. If instantiation is successful, a handle to the instantiated graph
+ * is returned in \p pGraphExec.
+ *
+ * If there are any errors, diagnostic information may be returned in \p pErrorNode and
+ * \p pLogBuffer. This is the primary way to inspect instantiation errors. The output
+ * will be null terminated unless the diagnostics overflow
+ * the buffer. In this case, they will be truncated, and the last byte can be
+ * inspected to determine if truncation occurred.
+ *
+ * \param pGraphExec - Returns instantiated graph
+ * \param graph      - Graph to instantiate
+ * \param pErrorNode - In case of an instantiation error, this may be modified to
+ *                      indicate a node contributing to the error
+ * \param pLogBuffer   - A character buffer to store diagnostic messages
+ * \param bufferSize  - Size of the log buffer in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphInstantiateWithFlags,
+ * ::cudaGraphCreate,
+ * ::cudaGraphUpload,
+ * ::cudaGraphLaunch,
+ * ::cudaGraphExecDestroy
+ */
+static __inline__ __host__ cudaError_t cudaGraphInstantiate(
+  cudaGraphExec_t *pGraphExec,
+  cudaGraph_t graph,
+  cudaGraphNode_t *pErrorNode, /* Accepted but not used by this implementation. */
+  char *pLogBuffer, /* Accepted but not used by this implementation. */
+  size_t bufferSize /* Accepted but not used by this implementation. */
+)
+{
+  (void)pErrorNode; /* The casts to void only mark the three diagnostic parameters as intentionally unused. */
+  (void)pLogBuffer;
+  (void)bufferSize;
+  return ::cudaGraphInstantiate(pGraphExec, graph, 0); /* Calls the three-argument form with a literal 0 as the third argument (presumably "no flags" - confirm); nothing is written to pErrorNode or pLogBuffer here. */
+}
+
+/**
+ * \brief \hl Allocates page-locked memory on the host
+ *
+ * Allocates \p size bytes of host memory that is page-locked and accessible
+ * to the device. The driver tracks the virtual memory ranges allocated with
+ * this function and automatically accelerates calls to functions such as
+ * ::cudaMemcpy(). Since the memory can be accessed directly by the device, it
+ * can be read or written with much higher bandwidth than pageable memory
+ * obtained with functions such as ::malloc(). Allocating excessive amounts of
+ * pinned memory may degrade system performance, since it reduces the amount
+ * of memory available to the system for paging. As a result, this function is
+ * best used sparingly to allocate staging areas for data exchange between host
+ * and device.
+ *
+ * The \p flags parameter enables different options to be specified that affect
+ * the allocation, as follows.
+ * - ::cudaHostAllocDefault: This flag's value is defined to be 0.
+ * - ::cudaHostAllocPortable: The memory returned by this call will be
+ * considered as pinned memory by all CUDA contexts, not just the one that
+ * performed the allocation.
+ * - ::cudaHostAllocMapped: Maps the allocation into the CUDA address space.
+ * The device pointer to the memory may be obtained by calling
+ * ::cudaHostGetDevicePointer().
+ * - ::cudaHostAllocWriteCombined: Allocates the memory as write-combined (WC).
+ * WC memory can be transferred across the PCI Express bus more quickly on some
+ * system configurations, but cannot be read efficiently by most CPUs.  WC
+ * memory is a good option for buffers that will be written by the CPU and read
+ * by the device via mapped pinned memory or host->device transfers.
+ *
+ * All of these flags are orthogonal to one another: a developer may allocate
+ * memory that is portable, mapped and/or write-combined with no restrictions.
+ *
+ * ::cudaSetDeviceFlags() must have been called with the ::cudaDeviceMapHost
+ * flag in order for the ::cudaHostAllocMapped flag to have any effect.
+ *
+ * The ::cudaHostAllocMapped flag may be specified on CUDA contexts for devices
+ * that do not support mapped pinned memory. The failure is deferred to
+ * ::cudaHostGetDevicePointer() because the memory may be mapped into other
+ * CUDA contexts via the ::cudaHostAllocPortable flag.
+ *
+ * Memory allocated by this function must be freed with ::cudaFreeHost().
+ *
+ * \param ptr   - Pointer to allocated host memory
+ * \param size  - Requested allocation size in bytes
+ * \param flags - Requested properties of allocated memory
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaSetDeviceFlags,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc
+ */
+static __inline__ __host__ cudaError_t cudaMallocHost(
+  void         **ptr,
+  size_t         size,
+  unsigned int   flags /* No default value: callers of this overload must pass flags explicitly. */
+)
+{
+  return ::cudaHostAlloc(ptr, size, flags); /* Delegates to ::cudaHostAlloc, passing ptr, size and flags through unchanged. */
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaHostAlloc(
+  T            **ptr, /* Typed pointer-to-pointer, so callers need not cast to void** themselves. */
+  size_t         size,
+  unsigned int   flags
+)
+{
+  return ::cudaHostAlloc((void**)(void*)ptr, size, flags); /* T** is converted to void** by way of void*; size and flags are forwarded unchanged. */
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaHostGetDevicePointer(
+  T            **pDevice, /* Typed pointer-to-pointer, so callers need not cast to void** themselves. */
+  void          *pHost,
+  unsigned int   flags
+)
+{
+  return ::cudaHostGetDevicePointer((void**)(void*)pDevice, pHost, flags); /* T** is converted to void** by way of void*; pHost and flags are forwarded unchanged. */
+}
+
+/**
+ * \brief Allocates memory that will be automatically managed by the Unified Memory system
+ *
+ * Allocates \p size bytes of managed memory on the device and returns in
+ * \p *devPtr a pointer to the allocated memory. If the device doesn't support
+ * allocating managed memory, ::cudaErrorNotSupported is returned. Support
+ * for managed memory can be queried using the device attribute
+ * ::cudaDevAttrManagedMemory. The allocated memory is suitably
+ * aligned for any kind of variable. The memory is not cleared. If \p size
+ * is 0, ::cudaMallocManaged returns ::cudaErrorInvalidValue. The pointer
+ * is valid on the CPU and on all GPUs in the system that support managed memory.
+ * All accesses to this pointer must obey the Unified Memory programming model.
+ *
+ * \p flags specifies the default stream association for this allocation.
+ * \p flags must be one of ::cudaMemAttachGlobal or ::cudaMemAttachHost. The
+ * default value for \p flags is ::cudaMemAttachGlobal.
+ * If ::cudaMemAttachGlobal is specified, then this memory is accessible from
+ * any stream on any device. If ::cudaMemAttachHost is specified, then the
+ * allocation should not be accessed from devices that have a zero value for the
+ * device attribute ::cudaDevAttrConcurrentManagedAccess; an explicit call to
+ * ::cudaStreamAttachMemAsync will be required to enable access on such devices.
+ *
+ * If the association is later changed via ::cudaStreamAttachMemAsync to
+ * a single stream, the default association, as specified during ::cudaMallocManaged,
+ * is restored when that stream is destroyed. For __managed__ variables, the
+ * default association is always ::cudaMemAttachGlobal. Note that destroying a
+ * stream is an asynchronous operation, and as a result, the change to default
+ * association won't happen until all work in the stream has completed.
+ *
+ * Memory allocated with ::cudaMallocManaged should be released with ::cudaFree.
+ *
+ * Device memory oversubscription is possible for GPUs that have a non-zero value for the
+ * device attribute ::cudaDevAttrConcurrentManagedAccess. Managed memory on
+ * such GPUs may be evicted from device memory to host memory at any time by the Unified
+ * Memory driver in order to make room for other allocations.
+ *
+ * In a multi-GPU system where all GPUs have a non-zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess, managed memory may not be populated when this
+ * API returns and instead may be populated on access. In such systems, managed memory can
+ * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to
+ * maintain data locality and prevent excessive page faults to the extent possible. The application
+ * can also guide the driver about memory usage patterns via ::cudaMemAdvise. The application
+ * can also explicitly migrate memory to a desired processor's memory via
+ * ::cudaMemPrefetchAsync.
+ *
+ * In a multi-GPU system where all of the GPUs have a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess and all the GPUs have peer-to-peer support
+ * with each other, the physical storage for managed memory is created on the GPU which is active
+ * at the time ::cudaMallocManaged is called. All other GPUs will reference the data at reduced
+ * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate
+ * memory among such GPUs.
+ *
+ * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and
+ * where the value of the device attribute ::cudaDevAttrConcurrentManagedAccess
+ * is zero for at least one of those GPUs, the location chosen for physical storage of managed
+ * memory is system-dependent.
+ * - On Linux, the location chosen will be device memory as long as the current set of active
+ * contexts are on devices that either have peer-to-peer support with each other or have a
+ * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * If there is an active context on a GPU that does not have a non-zero value for that device
+ * attribute and it does not have peer-to-peer support with the other devices that have active
+ * contexts on them, then the location for physical storage will be 'zero-copy' or host memory.
+ * Note that this means that managed memory that is located in device memory is migrated to
+ * host memory if a new context is created on a GPU that doesn't have a non-zero value for
+ * the device attribute and does not support peer-to-peer with at least one of the other devices
+ * that has an active context. This in turn implies that context creation may fail if there is
+ * insufficient host memory to migrate all managed allocations.
+ * - On Windows, the physical storage is always created in 'zero-copy' or host memory.
+ * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these
+ * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to
+ * restrict CUDA to only use those GPUs that have peer-to-peer support.
+ * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a non-zero
+ * value to force the driver to always use device memory for physical storage.
+ * When this environment variable is set to a non-zero value, all devices used in
+ * that process that support managed memory have to be peer-to-peer compatible
+ * with each other. The error ::cudaErrorInvalidDevice will be returned if a device
+ * that supports managed memory is used and it is not peer-to-peer compatible with
+ * any of the other managed memory supporting devices that were previously used in
+ * that process, even if ::cudaDeviceReset has been called on those devices. These
+ * environment variables are described in the CUDA programming guide under the
+ * "CUDA environment variables" section.
+ * - On ARM, managed memory is not available on discrete gpu with Drive PX-2.
+ *
+ * \param devPtr - Pointer to allocated device memory
+ * \param size   - Requested allocation size in bytes
+ * \param flags  - Must be either ::cudaMemAttachGlobal or ::cudaMemAttachHost (defaults to ::cudaMemAttachGlobal)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray,
+ * ::cudaMalloc3D, ::cudaMalloc3DArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc, ::cudaDeviceGetAttribute, ::cudaStreamAttachMemAsync
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaMallocManaged(
+  T            **devPtr, /* Typed pointer-to-pointer, so callers need not cast to void** themselves. */
+  size_t         size,
+  unsigned int   flags = cudaMemAttachGlobal /* Used when the caller omits flags. */
+)
+{
+  return ::cudaMallocManaged((void**)(void*)devPtr, size, flags); /* T** is converted to void** by way of void*; size and flags are forwarded unchanged. */
+}
+
+/**
+ * \brief Advise about the usage of a given memory range.
+ *
+ * This is an alternate spelling for cudaMemAdvise made available through function overloading.
+ *
+ * \sa ::cudaMemAdvise,
+ * \ref ::cudaMemAdvise(const void* devPtr, size_t count, enum cudaMemoryAdvise advice, struct cudaMemLocation location)  "cudaMemAdvise (C API)"
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaMemAdvise( /* Declared static __inline__ __host__ like the other C++ wrapper overloads in this header. */
+  T                      *devPtr, /* Typed pointer; T is deduced from the argument. */
+  size_t                 count,
+  enum cudaMemoryAdvise  advice,
+  struct cudaMemLocation location
+)
+{
+  return ::cudaMemAdvise_v2((const void *)devPtr, count, advice, location); /* Only devPtr is cast (to const void*); count, advice and location are forwarded unchanged to ::cudaMemAdvise_v2. */
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaMemPrefetchAsync(
+  T                       *devPtr, /* Typed pointer; T is deduced from the argument. */
+  size_t                  count,
+  struct cudaMemLocation  location,
+  unsigned int            flags, /* No default value: must be passed explicitly. */
+  cudaStream_t            stream = 0 /* Used when the caller omits the stream. */
+)
+{
+  return ::cudaMemPrefetchAsync_v2((const void *)devPtr, count, location, flags, stream); /* Only devPtr is cast (to const void*); the remaining arguments are forwarded unchanged to ::cudaMemPrefetchAsync_v2. */
+}
+
+/**
+ * \brief Attach memory to a stream asynchronously
+ *
+ * Enqueues an operation in \p stream to specify stream association of
+ * \p length bytes of memory starting from \p devPtr. This function is a
+ * stream-ordered operation, meaning that it is dependent on, and will
+ * only take effect when, previous work in stream has completed. Any
+ * previous association is automatically replaced.
+ *
+ * \p devPtr must point to one of the following types of memory:
+ * - managed memory declared using the __managed__ keyword or allocated with
+ *   ::cudaMallocManaged.
+ * - a valid host-accessible region of system-allocated pageable memory. This
+ *   type of memory may only be specified if the device associated with the
+ *   stream reports a non-zero value for the device attribute
+ *   ::cudaDevAttrPageableMemoryAccess.
+ *
+ * For managed allocations, \p length must be either zero or the entire
+ * allocation's size. Both indicate that the entire allocation's stream
+ * association is being changed. Currently, it is not possible to change stream
+ * association for a portion of a managed allocation.
+ *
+ * For pageable allocations, \p length must be non-zero.
+ *
+ * The stream association is specified using \p flags which must be
+ * one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle.
+ * The default value for \p flags is ::cudaMemAttachSingle.
+ * If the ::cudaMemAttachGlobal flag is specified, the memory can be accessed
+ * by any stream on any device.
+ * If the ::cudaMemAttachHost flag is specified, the program makes a guarantee
+ * that it won't access the memory on the device from any stream on a device that
+ * has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * If the ::cudaMemAttachSingle flag is specified and \p stream is associated with
+ * a device that has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess,
+ * the program makes a guarantee that it will only access the memory on the device
+ * from \p stream. It is illegal to attach singly to the NULL stream, because the
+ * NULL stream is a virtual global stream and not a specific stream. An error will
+ * be returned in this case.
+ *
+ * When memory is associated with a single stream, the Unified Memory system will
+ * allow CPU access to this memory region so long as all operations in \p stream
+ * have completed, regardless of whether other streams are active. In effect,
+ * this constrains exclusive ownership of the managed memory region by
+ * an active GPU to per-stream activity instead of whole-GPU activity.
+ *
+ * Accessing memory on the device from streams that are not associated with
+ * it will produce undefined results. No error checking is performed by the
+ * Unified Memory system to ensure that kernels launched into other streams
+ * do not access this region.
+ *
+ * It is a program's responsibility to order calls to ::cudaStreamAttachMemAsync
+ * via events, synchronization or other means to ensure legal access to memory
+ * at all times. Data visibility and coherency will be changed appropriately
+ * for all kernels which follow a stream-association change.
+ *
+ * If \p stream is destroyed while data is associated with it, the association is
+ * removed and the association reverts to the default visibility of the allocation
+ * as specified at ::cudaMallocManaged. For __managed__ variables, the default
+ * association is always ::cudaMemAttachGlobal. Note that destroying a stream is an
+ * asynchronous operation, and as a result, the change to default association won't
+ * happen until all work in the stream has completed.
+ *
+ * \param stream  - Stream in which to enqueue the attach operation
+ * \param devPtr  - Pointer to memory (must be a pointer to managed memory or
+ *                  to a valid host-accessible region of system-allocated
+ *                  memory)
+ * \param length  - Length of memory (defaults to zero)
+ * \param flags   - Must be one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle (defaults to ::cudaMemAttachSingle)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotReady,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, ::cudaMallocManaged
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaStreamAttachMemAsync(
+  cudaStream_t   stream,
+  T              *devPtr,
+  size_t         length = 0,
+  unsigned int   flags  = cudaMemAttachSingle
+)
+{
+  // The forwarded call is made with a plain void*, so drop the pointee type
+  // first and pass the remaining arguments through untouched.
+  void *untypedPtr = (void*)devPtr;
+  return ::cudaStreamAttachMemAsync(stream, untypedPtr, length, flags);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaMalloc(
+  T      **devPtr,
+  size_t   size
+)
+{
+  // Typed overload of cudaMalloc. Two-step cast: T** -> void* -> void**,
+  // which is the form handed to the untyped ::cudaMalloc.
+  void *erasedSlot = (void*)devPtr;
+  return ::cudaMalloc((void**)erasedSlot, size);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaMallocHost(
+  T            **ptr,
+  size_t         size,
+  unsigned int   flags = 0
+)
+{
+  // Typed overload: erase T** to void** (via void*) and forward.
+  // NOTE(review): the call is intentionally left unqualified; it presumably
+  // resolves to a non-template (void**, size_t, unsigned int) overload
+  // declared elsewhere in this header -- confirm before qualifying it.
+  void *erasedSlot = (void*)ptr;
+  return cudaMallocHost((void**)erasedSlot, size, flags);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaMallocPitch(
+  T      **devPtr,
+  size_t  *pitch,
+  size_t   width,
+  size_t   height
+)
+{
+  // Typed overload: only the output pointer needs its type erased
+  // (T** -> void* -> void**); pitch, width and height pass straight through.
+  void *erasedSlot = (void*)devPtr;
+  return ::cudaMallocPitch((void**)erasedSlot, pitch, width, height);
+}
+
+/**
+ * \brief Allocate from a pool
+ *
+ * This is an alternate spelling for cudaMallocFromPoolAsync
+ * made available through function overloading.
+ *
+ * \sa ::cudaMallocFromPoolAsync,
+ * \ref ::cudaMallocAsync(void** ptr, size_t size, cudaStream_t hStream)  "cudaMallocAsync (C API)"
+ */
+static __inline__ __host__ cudaError_t cudaMallocAsync(
+  void        **ptr,
+  size_t        size,
+  cudaMemPool_t memPool,
+  cudaStream_t  stream
+)
+{
+  // Pool-taking spelling of cudaMallocAsync: hand all four arguments to
+  // ::cudaMallocFromPoolAsync and return its status as-is.
+  cudaError_t status = ::cudaMallocFromPoolAsync(ptr, size, memPool, stream);
+  return status;
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaMallocAsync(
+  T           **ptr,
+  size_t        size,
+  cudaMemPool_t memPool,
+  cudaStream_t  stream
+)
+{
+  // Typed, pool-taking overload: erase T** to void** (via void*) and
+  // forward to ::cudaMallocFromPoolAsync.
+  void *erasedSlot = (void*)ptr;
+  return ::cudaMallocFromPoolAsync((void**)erasedSlot, size, memPool, stream);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaMallocAsync(
+  T           **ptr,
+  size_t        size,
+  cudaStream_t  stream
+)
+{
+  // Typed overload without an explicit pool: erase T** to void** (via void*)
+  // and forward to the untyped ::cudaMallocAsync.
+  void *erasedSlot = (void*)ptr;
+  return ::cudaMallocAsync((void**)erasedSlot, size, stream);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaMallocFromPoolAsync(
+  T           **ptr,
+  size_t        size,
+  cudaMemPool_t memPool,
+  cudaStream_t  stream
+)
+{
+  // Typed overload: erase T** to void** (via void*) and forward to the
+  // untyped ::cudaMallocFromPoolAsync.
+  void *erasedSlot = (void*)ptr;
+  return ::cudaMallocFromPoolAsync((void**)erasedSlot, size, memPool, stream);
+}
+
+/**
+ * \brief Performs a batch of memory copies asynchronously.
+ *
+ * This is an alternate spelling for cudaMemcpyBatchAsync
+ * made available through function overloading.
+ *
+ * \sa ::cudaMemcpyBatchAsync
+ */
+template<typename T, typename U>
+static __inline__ __host__ cudaError_t cudaMemcpyBatchAsync(
+      T **dsts, U **srcs, size_t *sizes, size_t count, struct cudaMemcpyAttributes *attrs,
+      size_t *attrsIdxs, size_t numAttrs, size_t *failIdx, cudaStream_t hStream
+)
+{
+  // Only the two pointer arrays are typed; view both as void** and pass the
+  // remaining arguments through unchanged.
+  void **untypedDsts = (void **)dsts;
+  void **untypedSrcs = (void **)srcs;
+  return ::cudaMemcpyBatchAsync(untypedDsts, untypedSrcs, sizes, count, attrs, attrsIdxs, numAttrs, failIdx, hStream);
+}
+
+/**
+ * \brief Performs a batch of memory copies asynchronously.
+ *
+ * This is an alternate spelling for cudaMemcpyBatchAsync
+ * made available through function overloading.
+ *
+ * The ::cudaMemcpyAttributes specified by \p attr are applicable for all the copies specified in the batch.
+ *
+ * \sa ::cudaMemcpyBatchAsync
+ */
+template<typename T, typename U>
+static __inline__ __host__ cudaError_t cudaMemcpyBatchAsync(
+      T **dsts, U **srcs, size_t *sizes, size_t count, struct cudaMemcpyAttributes attr, size_t *failIdx, cudaStream_t hStream
+)
+{
+  // Present the single by-value attribute as a one-element attribute array
+  // whose matching index entry is 0.
+  size_t soleAttrIdx = 0;
+  void **untypedDsts = (void **)dsts;
+  void **untypedSrcs = (void **)srcs;
+  return ::cudaMemcpyBatchAsync(untypedDsts, untypedSrcs, sizes, count, &attr, &soleAttrIdx, 1, failIdx, hStream);
+}
+
+#if defined(__CUDACC__)
+
+/**
+ * \brief \hl Copies data to the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src
+ * to the memory area \p offset bytes from the start of symbol
+ * \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice.
+ *
+ * \param symbol - Device symbol reference
+ * \param src    - Source memory address
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_sync
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaMemcpyToSymbol(
+  const T                   &symbol,
+  const void                *src,
+        size_t               count,
+        size_t               offset = 0,
+        enum cudaMemcpyKind  kind   = cudaMemcpyHostToDevice
+)
+{
+  // The untyped entry point identifies the symbol by address, so take the
+  // address of the referenced object and forward it as const void*.
+  const void *symbolAddr = (const void*)&symbol;
+  return ::cudaMemcpyToSymbol(symbolAddr, src, count, offset, kind);
+}
+
+/**
+ * \brief \hl Copies data to the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src
+ * to the memory area \p offset bytes from the start of symbol
+ * \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice.
+ *
+ * ::cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally
+ * be associated to a stream by passing a non-zero \p stream argument. If
+ * \p kind is ::cudaMemcpyHostToDevice and \p stream is non-zero, the copy
+ * may overlap with operations in other streams.
+ *
+ * \param symbol - Device symbol reference
+ * \param src    - Source memory address
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_async
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyFromSymbolAsync
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaMemcpyToSymbolAsync(
+  const T                   &symbol,
+  const void                *src,
+        size_t               count,
+        size_t               offset = 0,
+        enum cudaMemcpyKind  kind   = cudaMemcpyHostToDevice,
+        cudaStream_t         stream = 0
+)
+{
+  // Forward the symbol by address (as const void*); all other arguments,
+  // including the stream, are passed through unchanged.
+  const void *symbolAddr = (const void*)&symbol;
+  return ::cudaMemcpyToSymbolAsync(symbolAddr, src, count, offset, kind, stream);
+}
+
+/**
+ * \brief \hl Copies data from the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area \p offset bytes
+ * from the start of symbol \p symbol to the memory area pointed to by \p dst.
+ * The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice.
+ *
+ * \param dst    - Destination memory address
+ * \param symbol - Device symbol reference
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_sync
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaMemcpyFromSymbol(
+        void                *dst,
+  const T                   &symbol,
+        size_t               count,
+        size_t               offset = 0,
+        enum cudaMemcpyKind  kind   = cudaMemcpyDeviceToHost
+)
+{
+  // The untyped entry point identifies the symbol by address, so take the
+  // address of the referenced object and forward it as const void*.
+  const void *symbolAddr = (const void*)&symbol;
+  return ::cudaMemcpyFromSymbol(dst, symbolAddr, count, offset, kind);
+}
+
+/**
+ * \brief \hl Copies data from the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area \p offset bytes
+ * from the start of symbol \p symbol to the memory area pointed to by \p dst.
+ * The memory areas may not overlap. \p symbol is a variable that resides in
+ * global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice.
+ *
+ * ::cudaMemcpyFromSymbolAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally be
+ * associated to a stream by passing a non-zero \p stream argument. If \p kind
+ * is ::cudaMemcpyDeviceToHost and \p stream is non-zero, the copy may overlap
+ * with operations in other streams.
+ *
+ * \param dst    - Destination memory address
+ * \param symbol - Device symbol reference
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_async
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaMemcpyFromSymbolAsync(
+        void                *dst,
+  const T                   &symbol,
+        size_t               count,
+        size_t               offset = 0,
+        enum cudaMemcpyKind  kind   = cudaMemcpyDeviceToHost,
+        cudaStream_t         stream = 0
+)
+{
+  // Forward the symbol by address (as const void*); all other arguments,
+  // including the stream, are passed through unchanged.
+  const void *symbolAddr = (const void*)&symbol;
+  return ::cudaMemcpyFromSymbolAsync(dst, symbolAddr, count, offset, kind, stream);
+}
+
+/**
+ * \brief Creates a memcpy node to copy to a symbol on the device and adds it to a graph
+ *
+ * Creates a new memcpy node to copy to \p symbol and adds it to \p graph with
+ * \p numDependencies dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory area
+ * pointed to by \p src to the memory area pointed to by \p offset bytes from the start
+ * of symbol \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of
+ * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault
+ * is only allowed on systems that support unified virtual addressing.
+ *
+ * Memcpy nodes have some additional restrictions with regards to managed memory, if the
+ * system contains at least one device which has a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param symbol          - Device symbol address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyToSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeFromSymbol,
+ * ::cudaGraphMemcpyNodeGetParams,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemsetNode
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGraphAddMemcpyNodeToSymbol(
+    cudaGraphNode_t *pGraphNode,
+    cudaGraph_t graph,
+    const cudaGraphNode_t *pDependencies,
+    size_t numDependencies,
+    const T &symbol,
+    const void* src,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind)
+{
+  // Reference-taking overload: pass the symbol on by address as const void*
+  // and leave every other argument as supplied.
+  const void *symbolAddr = (const void*)&symbol;
+  return ::cudaGraphAddMemcpyNodeToSymbol(pGraphNode, graph, pDependencies, numDependencies, symbolAddr, src, count, offset, kind);
+}
+
+/**
+ * \brief Creates a memcpy node to copy from a symbol on the device and adds it to a graph
+ *
+ * Creates a new memcpy node to copy from \p symbol and adds it to \p graph with
+ * \p numDependencies dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory area
+ * pointed to by \p offset bytes from the start of symbol \p symbol to the memory area
+ *  pointed to by \p dst. The memory areas may not overlap. \p symbol is a variable
+ *  that resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer
+ * is inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * Memcpy nodes have some additional restrictions with regards to managed memory, if the
+ * system contains at least one device which has a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param dst             - Destination memory address
+ * \param symbol          - Device symbol address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyFromSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeToSymbol,
+ * ::cudaGraphMemcpyNodeGetParams,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemsetNode
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGraphAddMemcpyNodeFromSymbol(
+    cudaGraphNode_t* pGraphNode,
+    cudaGraph_t graph,
+    const cudaGraphNode_t* pDependencies,
+    size_t numDependencies,
+    void* dst,
+    const T &symbol,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind)
+{
+  // Reference-taking overload: pass the symbol on by address as const void*
+  // and leave every other argument as supplied.
+  const void *symbolAddr = (const void*)&symbol;
+  return ::cudaGraphAddMemcpyNodeFromSymbol(pGraphNode, graph, pDependencies, numDependencies, dst, symbolAddr, count, offset, kind);
+}
+
+/**
+ * \brief Sets a memcpy node's parameters to copy to a symbol on the device
+ *
+ * Sets the parameters of memcpy node \p node to the copy described by the provided parameters.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory area
+ * pointed to by \p src to the memory area pointed to by \p offset bytes from the start
+ * of symbol \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of
+ * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault
+ * is only allowed on systems that support unified virtual addressing.
+ *
+ * \param node            - Node to set the parameters for
+ * \param symbol          - Device symbol address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyToSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeGetParams
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGraphMemcpyNodeSetParamsToSymbol(
+    cudaGraphNode_t node,
+    const T &symbol,
+    const void* src,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind)
+{
+  // Reference-taking overload: pass the symbol on by address as const void*
+  // and leave every other argument as supplied.
+  const void *symbolAddr = (const void*)&symbol;
+  return ::cudaGraphMemcpyNodeSetParamsToSymbol(node, symbolAddr, src, count, offset, kind);
+}
+
+/**
+ * \brief Sets a memcpy node's parameters to copy from a symbol on the device
+ *
+ * Sets the parameters of memcpy node \p node to the copy described by the provided parameters.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory area
+ * pointed to by \p offset bytes from the start of symbol \p symbol to the memory area
+ *  pointed to by \p dst. The memory areas may not overlap. \p symbol is a variable
+ *  that resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer
+ * is inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * \param node            - Node to set the parameters for
+ * \param dst             - Destination memory address
+ * \param symbol          - Device symbol address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyFromSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeGetParams
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGraphMemcpyNodeSetParamsFromSymbol(
+    cudaGraphNode_t node,
+    void* dst,
+    const T &symbol,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind)
+{
+  // Reference-taking overload: pass the symbol on by address as const void*
+  // and leave every other argument as supplied.
+  const void *symbolAddr = (const void*)&symbol;
+  return ::cudaGraphMemcpyNodeSetParamsFromSymbol(node, dst, symbolAddr, count, offset, kind);
+}
+
+/**
+ * \brief Sets the parameters for a memcpy node in the given graphExec to copy to a symbol on the device
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained the given params at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * \p src and \p symbol must be allocated from the same contexts as the original source and
+ * destination memory.  The instantiation-time memory operands must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or
+ * the original memory operands are multidimensional.
+ *
+ * \param hGraphExec      - The executable graph in which to set the specified node
+ * \param node            - Memcpy node from the graph which was used to instantiate graphExec
+ * \param symbol          - Device symbol address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeToSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGraphExecMemcpyNodeSetParamsToSymbol(
+    cudaGraphExec_t hGraphExec,
+    cudaGraphNode_t node,
+    const T &symbol,
+    const void* src,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind)
+{
+    // Reference-taking overload: pass the symbol on by address as const void*
+    // and leave every other argument as supplied.
+    const void *symbolAddr = (const void*)&symbol;
+    return ::cudaGraphExecMemcpyNodeSetParamsToSymbol(hGraphExec, node, symbolAddr, src, count, offset, kind);
+}
+
+/**
+ * \brief Sets the parameters for a memcpy node in the given graphExec to copy from a symbol on the device
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained the given params at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * \p symbol and \p dst must be allocated from the same contexts as the original source and
+ * destination memory.  The instantiation-time memory operands must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or
+ * the original memory operands are multidimensional.
+ *
+ * \param hGraphExec      - The executable graph in which to set the specified node
+ * \param node            - Memcpy node from the graph which was used to instantiate graphExec
+ * \param dst             - Destination memory address
+ * \param symbol          - Device symbol address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeFromSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGraphExecMemcpyNodeSetParamsFromSymbol(
+    cudaGraphExec_t hGraphExec,
+    cudaGraphNode_t node,
+    void* dst,
+    const T &symbol,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind)
+{
+  // Reference-taking overload: pass the symbol on by address as const void*
+  // and leave every other argument as supplied.
+  const void *symbolAddr = (const void*)&symbol;
+  return ::cudaGraphExecMemcpyNodeSetParamsFromSymbol(hGraphExec, node, dst, symbolAddr, count, offset, kind);
+}
+
+// Convenience overload that keeps the four-argument call form compiling in
+// existing C++ code (avoids source breakage).
+static __inline__ __host__ cudaError_t CUDARTAPI cudaGraphExecUpdate(cudaGraphExec_t hGraphExec, cudaGraph_t hGraph, cudaGraphNode_t *hErrorNode_out, enum cudaGraphExecUpdateResult *updateResult_out)
+{
+    // Run the update through the overload that reports via a result-info
+    // struct, then unpack the two fields this signature exposes.
+    cudaGraphExecUpdateResultInfo info;
+    const cudaError_t rc = cudaGraphExecUpdate(hGraphExec, hGraph, &info);
+    // Both out-parameters are optional: a null pointer is simply skipped.
+    if (updateResult_out != 0) {
+        *updateResult_out = info.result;
+    }
+    if (hErrorNode_out != 0) {
+        *hErrorNode_out = info.errorNode;
+    }
+    return rc;
+}
+
+#if __cplusplus >= 201103L || (defined(_MSC_VER) && (_MSC_VER >= 1900))
+
+/**
+ * \brief Creates a user object by wrapping a C++ object
+ *
+ * TODO detail
+ *
+ * \param object_out      - Location to return the user object handle
+ * \param objectToWrap    - This becomes the \p ptr argument to ::cudaUserObjectCreate. A
+ *                          lambda will be passed for the \p destroy argument, which calls
+ *                          delete on this object pointer.
+ * \param initialRefcount - The initial refcount to create the object with, typically 1. The
+ *                          initial references are owned by the calling thread.
+ * \param flags           - Currently it is required to pass cudaUserObjectNoDestructorSync,
+ *                          which is the only defined flag. This indicates that the destroy
+ *                          callback cannot be waited on by any CUDA API. Users requiring
+ *                          synchronization of the callback should signal its completion
+ *                          manually.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaUserObjectCreate
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaUserObjectCreate(
+    cudaUserObject_t *object_out,
+    T *objectToWrap,
+    unsigned int initialRefcount,
+    unsigned int flags)
+{
+    // Destroy callback handed to the untyped API: it receives the wrapped
+    // pointer as void* and deletes it as the original T. The lambda is
+    // capture-less, so it converts to a plain function pointer.
+    auto deleteWrapped = [](void *vpObj) { delete reinterpret_cast<T *>(vpObj); };
+    return ::cudaUserObjectCreate(object_out, objectToWrap, deleteWrapped, initialRefcount, flags);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaUserObjectCreate(
+    cudaUserObject_t *object_out,
+    T *objectToWrap,
+    unsigned int initialRefcount,
+    cudaUserObjectFlags flags)
+{
+    // Enum-typed spelling: convert the flag to unsigned int and defer to the
+    // unsigned-int overload of cudaUserObjectCreate.
+    const unsigned int rawFlags = (unsigned int)flags;
+    return cudaUserObjectCreate(object_out, objectToWrap, initialRefcount, rawFlags);
+}
+
+#endif
+
+/**
+ * \brief \hl Finds the address associated with a CUDA symbol
+ *
+ * Returns in \p *devPtr the address of symbol \p symbol on the device.
+ * \p symbol can be a variable that resides in either global or constant memory space.
+ * If \p symbol cannot be found, or if \p symbol is not declared
+ * in the global or constant memory space, \p *devPtr is unchanged and the error
+ * ::cudaErrorInvalidSymbol is returned.
+ *
+ * \param devPtr - Return device pointer associated with symbol
+ * \param symbol - Device symbol reference
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaGetSymbolAddress(void**, const void*) "cudaGetSymbolAddress (C API)",
+ * \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API)"
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGetSymbolAddress(
+        void **devPtr,
+  const T     &symbol
+)
+{
+  // Reference-taking overload: hand the symbol to the untyped lookup by
+  // address, as const void*.
+  const void *symbolAddr = (const void*)&symbol;
+  return ::cudaGetSymbolAddress(devPtr, symbolAddr);
+}
+
+/**
+ * \brief \hl Finds the size of the object associated with a CUDA symbol
+ *
+ * Returns in \p *size the size of symbol \p symbol. \p symbol must be a
+ * variable that resides in global or constant memory space.
+ * If \p symbol cannot be found, or if \p symbol is not declared
+ * in global or constant memory space, \p *size is unchanged and the error
+ * ::cudaErrorInvalidSymbol is returned.
+ *
+ * \param size   - Size of object associated with symbol
+ * \param symbol - Device symbol reference
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress (C++ API)",
+ * \ref ::cudaGetSymbolSize(size_t*, const void*) "cudaGetSymbolSize (C API)"
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGetSymbolSize(
+        size_t *size,
+  const T      &symbol
+)
+{
+  // The C entry point takes the symbol as an untyped address.
+  const void *symbolAddr = (const void*)&symbol;
+  return ::cudaGetSymbolSize(size, symbolAddr);
+}
+
+/**
+ * \brief \hl Sets the preferred cache configuration for a device function
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p cacheConfig the preferred cache configuration
+ * for the function specified via \p func. This is only a preference. The
+ * runtime will use the requested configuration if possible, but it is free to
+ * choose a different configuration if required to execute \p func.
+ *
+ * \p func must be a pointer to a function that executes on the device.
+ * The parameter specified by \p func must be declared as a \p __global__
+ * function. If the specified function does not exist,
+ * then ::cudaErrorInvalidDeviceFunction is returned.
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ * The supported cache configurations are:
+ * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
+ * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
+ * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
+ *
+ * \param func        - device function pointer
+ * \param cacheConfig - Requested cache configuration
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \ref ::cudaLaunchKernel(T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)",
+ * ::cudaSetDoubleForDevice,
+ * ::cudaSetDoubleForHost,
+ * ::cudaThreadGetCacheConfig,
+ * ::cudaThreadSetCacheConfig
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaFuncSetCacheConfig(
+  T                  *func,
+  enum cudaFuncCache  cacheConfig
+)
+{
+  // Erase the kernel's type; the C entry point takes a const void*.
+  const void *kernelAddr = (const void*)func;
+  return ::cudaFuncSetCacheConfig(kernelAddr, cacheConfig);
+}
+
+template<class T>
+static __inline__
+__CUDA_DEPRECATED
+__host__ cudaError_t cudaFuncSetSharedMemConfig(
+  T                        *func,
+  enum cudaSharedMemConfig  config
+)
+{
+  // The wrapped C function is deprecated as well; silence the compiler's
+  // deprecation diagnostic for just the forwarding call below.
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(_MSC_VER)
+#pragma warning(suppress: 4996)
+#endif
+  const cudaError_t result = ::cudaFuncSetSharedMemConfig((const void*)func, config);
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
+  return result;
+}
+
+#endif // __CUDACC__
+
+/**
+ * \brief Returns occupancy for a device function
+ *
+ * Returns in \p *numBlocks the maximum number of active blocks per
+ * streaming multiprocessor for the device function.
+ *
+ * \param numBlocks       - Returned occupancy
+ * \param func            - Kernel function for which occupancy is calculated
+ * \param blockSize       - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSize
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    int   *numBlocks,
+    T      func,
+    int    blockSize,
+    size_t dynamicSMemSize)
+{
+    // Forward to the flag-taking C entry point with the default flag value.
+    const void *kernelAddr = (const void*)func;
+    return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+        numBlocks,
+        kernelAddr,
+        blockSize,
+        dynamicSMemSize,
+        cudaOccupancyDefault);
+}
+
+/**
+ * \brief Returns occupancy for a device function with the specified flags
+ *
+ * Returns in \p *numBlocks the maximum number of active blocks per
+ * streaming multiprocessor for the device function.
+ *
+ * The \p flags parameter controls how special cases are handled. Valid flags include:
+ *
+ * - ::cudaOccupancyDefault: keeps the default behavior as
+ *   ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ *
+ * - ::cudaOccupancyDisableCachingOverride: suppresses the default behavior
+ *   on platforms where global caching affects occupancy. On such platforms, if caching
+ *   is enabled, but per-block SM resource usage would result in zero occupancy, the
+ *   occupancy calculator will calculate the occupancy as if caching is disabled.
+ *   Setting this flag makes the occupancy calculator return 0 in such cases.
+ *   More information can be found about this feature in the "Unified L1/Texture Cache"
+ *   section of the Maxwell tuning guide.
+ *
+ * \param numBlocks       - Returned occupancy
+ * \param func            - Kernel function for which occupancy is calculated
+ * \param blockSize       - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ * \param flags           - Requested behavior for the occupancy calculator
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ * \sa ::cudaOccupancyMaxPotentialBlockSize
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+    int         *numBlocks,
+    T            func,
+    int          blockSize,
+    size_t       dynamicSMemSize,
+    unsigned int flags)
+{
+    // Erase the kernel's type and hand everything to the C entry point.
+    const void *kernelAddr = (const void*)func;
+    return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+        numBlocks,
+        kernelAddr,
+        blockSize,
+        dynamicSMemSize,
+        flags);
+}
+
+/**
+ * Constant block-size-to-dynamic-shared-memory functor.
+ *
+ * Stores one byte count at construction and returns it for every block size
+ * it is asked about. Used by the fixed-size occupancy helpers to call the
+ * variable-shared-memory search with a constant requirement.
+ */
+class __cudaOccupancyB2DHelper {
+public:
+  // Remember the byte count that will be reported for every block size.
+  inline __host__ CUDART_DEVICE __cudaOccupancyB2DHelper(size_t bytes) : smemBytes(bytes) {}
+
+  // The block size argument is ignored: the answer does not depend on it.
+  inline __host__ CUDART_DEVICE size_t operator()(int)
+  {
+      return smemBytes;
+  }
+
+private:
+  size_t smemBytes;
+};
+
+/**
+ * \brief Returns grid and block size that achieves maximum potential occupancy for a device function
+ *
+ * Returns in \p *minGridSize and \p *blockSize a suggested grid /
+ * block size pair that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps with the smallest number
+ * of blocks).
+ *
+ * The search proceeds as follows:
+ * - The effective block size limit is the smallest of \p blockSizeLimit
+ *   (0 is replaced by the device's maximum threads per block), the device's
+ *   maximum threads per block, and the function's \p maxThreadsPerBlock attribute.
+ * - Candidate block sizes start at that limit rounded up to a multiple of the
+ *   warp size and decrease in warp-size steps; a candidate larger than the
+ *   limit is clamped to the limit.
+ * - For each candidate, \p blockSizeToDynamicSMemSize is called to obtain the
+ *   dynamic shared memory size, and the number of active blocks per
+ *   multiprocessor is queried with \p flags.
+ * - The candidate with the strictly greatest (block size * active blocks) is
+ *   kept, so on ties the larger block size wins. The search stops early once
+ *   that product equals the device's maximum threads per multiprocessor.
+ * - \p *minGridSize is the kept active-block count multiplied by the number of
+ *   multiprocessors. If no candidate yields a non-zero occupancy, both outputs
+ *   are set to 0.
+ *
+ * ::cudaErrorInvalidValue is returned without any further work if
+ * \p minGridSize, \p blockSize or \p func is null. Any error reported by a
+ * runtime call made during the search is returned immediately, leaving the
+ * outputs unwritten.
+ *
+ * The \p flags parameter controls how special cases are handled. Valid flags include:
+ *
+ * - ::cudaOccupancyDefault: keeps the default behavior as
+ *   ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
+ *
+ * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior
+ *   on platforms where global caching affects occupancy. On such platforms, if caching
+ *   is enabled, but per-block SM resource usage would result in zero occupancy, the
+ *   occupancy calculator will calculate the occupancy as if caching is disabled.
+ *   Setting this flag makes the occupancy calculator return 0 in such cases.
+ *   More information can be found about this feature in the "Unified L1/Texture Cache"
+ *   section of the Maxwell tuning guide.
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
+ * \param blockSize   - Returned block size
+ * \param func        - Device function symbol
+ * \param blockSizeToDynamicSMemSize - A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block
+ * \param blockSizeLimit  - The maximum block size \p func is designed to work with. 0 means no limit.
+ * \param flags       - Requested behavior for the occupancy calculator
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSize
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
+ */
+
+template<typename UnaryFunction, class T>
+static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(
+    int           *minGridSize,
+    int           *blockSize,
+    T              func,
+    UnaryFunction  blockSizeToDynamicSMemSize,
+    int            blockSizeLimit = 0,
+    unsigned int   flags = 0)
+{
+    cudaError_t status;
+
+    // Device and function properties (current device index, kernel attributes)
+    int                       device;
+    struct cudaFuncAttributes attr;
+
+    // Limits (device attributes, plus values derived from them below)
+    int maxThreadsPerMultiProcessor;
+    int warpSize;
+    int devMaxThreadsPerBlock;
+    int multiProcessorCount;
+    int funcMaxThreadsPerBlock;
+    int occupancyLimit;  // set below to maxThreadsPerMultiProcessor
+    int granularity;     // set below to warpSize; step between candidate block sizes
+
+    // Recorded maximum (best candidate seen so far; all zero until one is found)
+    int maxBlockSize = 0;
+    int numBlocks    = 0;
+    int maxOccupancy = 0;
+
+    // Temporary (per-candidate working values)
+    int blockSizeToTryAligned;
+    int blockSizeToTry;
+    int blockSizeLimitAligned;
+    int occupancyInBlocks;
+    int occupancyInThreads;
+    size_t dynamicSMemSize;
+
+    ///////////////////////////
+    // Check user input: both output pointers and the function must be non-null
+    ///////////////////////////
+
+    if (!minGridSize || !blockSize || !func) {
+        return cudaErrorInvalidValue;
+    }
+
+    //////////////////////////////////////////////
+    // Obtain device and function properties; any failure is returned as-is
+    //////////////////////////////////////////////
+
+    status = ::cudaGetDevice(&device);
+    if (status != cudaSuccess) {
+        return status;
+    }
+
+    status = cudaDeviceGetAttribute(
+        &maxThreadsPerMultiProcessor,
+        cudaDevAttrMaxThreadsPerMultiProcessor,
+        device);
+    if (status != cudaSuccess) {
+        return status;
+    }
+
+    status = cudaDeviceGetAttribute(
+        &warpSize,
+        cudaDevAttrWarpSize,
+        device);
+    if (status != cudaSuccess) {
+        return status;
+    }
+
+    status = cudaDeviceGetAttribute(
+        &devMaxThreadsPerBlock,
+        cudaDevAttrMaxThreadsPerBlock,
+        device);
+    if (status != cudaSuccess) {
+        return status;
+    }
+
+    status = cudaDeviceGetAttribute(
+        &multiProcessorCount,
+        cudaDevAttrMultiProcessorCount,
+        device);
+    if (status != cudaSuccess) {
+        return status;
+    }
+
+    status = cudaFuncGetAttributes(&attr, func);
+    if (status != cudaSuccess) {
+        return status;
+    }
+    
+    funcMaxThreadsPerBlock = attr.maxThreadsPerBlock;
+
+    /////////////////////////////////////////////////////////////////////////////////
+    // Try each block size, and pick the block size with maximum occupancy
+    /////////////////////////////////////////////////////////////////////////////////
+
+    occupancyLimit = maxThreadsPerMultiProcessor;
+    granularity    = warpSize;
+
+    if (blockSizeLimit == 0) {  // 0 means "no caller-imposed limit"
+        blockSizeLimit = devMaxThreadsPerBlock;
+    }
+
+    if (devMaxThreadsPerBlock < blockSizeLimit) {  // never exceed the device limit
+        blockSizeLimit = devMaxThreadsPerBlock;
+    }
+
+    if (funcMaxThreadsPerBlock < blockSizeLimit) {  // never exceed the function's own limit
+        blockSizeLimit = funcMaxThreadsPerBlock;
+    }
+
+    blockSizeLimitAligned = ((blockSizeLimit + (granularity - 1)) / granularity) * granularity;  // round up to a multiple of granularity
+
+    for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
+        // Clamp the candidate to blockSizeLimit. This is needed for the first
+        // iteration, because blockSizeLimitAligned (rounded up above) could be
+        // greater than blockSizeLimit.
+        if (blockSizeLimit < blockSizeToTryAligned) {
+            blockSizeToTry = blockSizeLimit;
+        } else {
+            blockSizeToTry = blockSizeToTryAligned;
+        }
+        
+        dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);  // caller-supplied size for this candidate
+
+        status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+            &occupancyInBlocks,
+            func,
+            blockSizeToTry,
+            dynamicSMemSize,
+            flags);
+
+        if (status != cudaSuccess) {
+            return status;
+        }
+
+        occupancyInThreads = blockSizeToTry * occupancyInBlocks;
+
+        if (occupancyInThreads > maxOccupancy) {  // strict '>': on ties the earlier, larger block size is kept
+            maxBlockSize = blockSizeToTry;
+            numBlocks    = occupancyInBlocks;
+            maxOccupancy = occupancyInThreads;
+        }
+
+        // Early out if we have reached the maximum, i.e. the best occupancy
+        // found equals occupancyLimit (maxThreadsPerMultiProcessor).
+        if (occupancyLimit == maxOccupancy) {
+            break;
+        }
+    }
+
+    ///////////////////////////
+    // Return best available
+    ///////////////////////////
+
+    // Suggested min grid size to achieve a full machine launch:
+    // best blocks-per-multiprocessor count times the number of multiprocessors.
+    *minGridSize = numBlocks * multiProcessorCount;
+    *blockSize = maxBlockSize;
+
+    return status;  // status of the last runtime call made above
+}
+
+/**
+ * \brief Returns grid and block size that achieves maximum potential occupancy for a device function
+ *
+ * Returns in \p *minGridSize and \p *blockSize a suggested grid /
+ * block size pair that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps with the smallest number
+ * of blocks).
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
+ * \param blockSize   - Returned block size
+ * \param func        - Device function symbol
+ * \param blockSizeToDynamicSMemSize - A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block
+ * \param blockSizeLimit  - The maximum block size \p func is designed to work with. 0 means no limit.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSize
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
+ */
+
+template<typename UnaryFunction, class T>
+static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMem(
+    int           *minGridSize,
+    int           *blockSize,
+    T              func,
+    UnaryFunction  blockSizeToDynamicSMemSize,
+    int            blockSizeLimit = 0)
+{
+    // Same search as the WithFlags variant, run with the default flag value.
+    const unsigned int defaultFlags = cudaOccupancyDefault;
+    return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(
+        minGridSize,
+        blockSize,
+        func,
+        blockSizeToDynamicSMemSize,
+        blockSizeLimit,
+        defaultFlags);
+}
+
+/**
+ * \brief Returns grid and block size that achieves maximum potential occupancy for a device function
+ *
+ * Returns in \p *minGridSize and \p *blockSize a suggested grid /
+ * block size pair that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps with the smallest number
+ * of blocks).
+ *
+ * Use \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem if the
+ * amount of per-block dynamic shared memory changes with different
+ * block sizes.
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
+ * \param blockSize   - Returned block size
+ * \param func        - Device function symbol
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ * \param blockSizeLimit  - The maximum block size \p func is designed to work with. 0 means no limit.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
+ */
+template<class T>
+static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSize(
+    int    *minGridSize,
+    int    *blockSize,
+    T       func,
+    size_t  dynamicSMemSize = 0,
+    int     blockSizeLimit = 0)
+{
+  // A constant functor reports the same dynamic shared memory size for
+  // every candidate block size the search tries.
+  __cudaOccupancyB2DHelper fixedSMem(dynamicSMemSize);
+  return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(
+      minGridSize,
+      blockSize,
+      func,
+      fixedSMem,
+      blockSizeLimit,
+      cudaOccupancyDefault);
+}
+
+/**
+ * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on SM.
+ *
+ * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. 
+ *
+ * \param dynamicSmemSize - Returned maximum dynamic shared memory 
+ * \param func            - Kernel function for which occupancy is calculated
+ * \param numBlocks       - Number of blocks to fit on SM 
+ * \param blockSize       - Size of the block
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \sa ::cudaOccupancyMaxPotentialBlockSize
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaOccupancyAvailableDynamicSMemPerBlock(
+    size_t *dynamicSmemSize,
+    T      *func,
+    int     numBlocks,
+    int     blockSize)
+{
+    // Erase the kernel's type; the C entry point takes a const void*.
+    const void *kernelAddr = (const void*)func;
+    return ::cudaOccupancyAvailableDynamicSMemPerBlock(dynamicSmemSize, kernelAddr, numBlocks, blockSize);
+}
+
+/**
+ * \brief Returns grid and block size that achieves maximum potential occupancy for a device function with the specified flags
+ *
+ * Returns in \p *minGridSize and \p *blockSize a suggested grid /
+ * block size pair that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps with the smallest number
+ * of blocks).
+ *
+ * The \p flags parameter controls how special cases are handled. Valid flags include:
+ *
+ * - ::cudaOccupancyDefault: keeps the default behavior as
+ *   ::cudaOccupancyMaxPotentialBlockSize
+ *
+ * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior
+ *   on platforms where global caching affects occupancy. On such platforms, if caching
+ *   is enabled, but per-block SM resource usage would result in zero occupancy, the
+ *   occupancy calculator will calculate the occupancy as if caching is disabled.
+ *   Setting this flag makes the occupancy calculator return 0 in such cases.
+ *   More information can be found about this feature in the "Unified L1/Texture Cache"
+ *   section of the Maxwell tuning guide.
+ *
+ * Use \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem if the
+ * amount of per-block dynamic shared memory changes with different
+ * block sizes.
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
+ * \param blockSize   - Returned block size
+ * \param func        - Device function symbol
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ * \param blockSizeLimit  - The maximum block size \p func is designed to work with. 0 means no limit.
+ * \param flags       - Requested behavior for the occupancy calculator
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxPotentialBlockSize
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
+ */
+template<class T>
+static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeWithFlags(
+    int    *minGridSize,
+    int    *blockSize,
+    T      func,
+    size_t dynamicSMemSize = 0,
+    int    blockSizeLimit = 0,
+    unsigned int flags = 0)
+{
+    // A constant functor reports the same dynamic shared memory size for
+    // every candidate block size the search tries.
+    __cudaOccupancyB2DHelper fixedSMem(dynamicSMemSize);
+    return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(
+        minGridSize,
+        blockSize,
+        func,
+        fixedSMem,
+        blockSizeLimit,
+        flags);
+}
+
+/**
+ * \brief Given the kernel function (\p func) and launch configuration
+ * (\p config), return the maximum cluster size in \p *clusterSize.
+ *
+ * The cluster dimensions in \p config are ignored. If \p func has a required
+ * cluster size set (see ::cudaFuncGetAttributes), \p *clusterSize will reflect
+ * the required cluster size.
+ *
+ * By default this function will always return a value that's portable on
+ * future hardware. A higher value may be returned if the kernel function
+ * allows non-portable cluster sizes.
+ *
+ * This function will respect the compile time launch bounds.
+ *
+ * \param clusterSize - Returned maximum cluster size that can be launched
+ *                      for the given kernel function and launch configuration
+ * \param func        - Kernel function for which maximum cluster
+ *                      size is calculated
+ * \param config      - Launch configuration for the given kernel function
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \sa
+ * ::cudaFuncGetAttributes
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaOccupancyMaxPotentialClusterSize(
+    int *clusterSize,
+    T *func,
+    const cudaLaunchConfig_t *config)
+{
+    // Erase the kernel's type; the C entry point takes a const void*.
+    const void *kernelAddr = (const void*)func;
+    return ::cudaOccupancyMaxPotentialClusterSize(clusterSize, kernelAddr, config);
+}
+
+/**
+ * \brief Given the kernel function (\p func) and launch configuration
+ * (\p config), return the maximum number of clusters that could co-exist
+ * on the target device in \p *numClusters.
+ *
+ * If the function has required cluster size already set (see
+ * ::cudaFuncGetAttributes), the cluster size from config must either be
+ * unspecified or match the required size.
+ * Without required sizes, the cluster size must be specified in config,
+ * else the function will return an error.
+ *
+ * Note that various attributes of the kernel function may affect occupancy
+ * calculation. Runtime environment may affect how the hardware schedules
+ * the clusters, so the calculated occupancy is not guaranteed to be achievable.
+ *
+ * \param numClusters - Returned maximum number of clusters that
+ *                      could co-exist on the target device
+ * \param func        - Kernel function for which maximum number
+ *                      of clusters are calculated
+ * \param config      - Launch configuration for the given kernel function
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidClusterSize,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \sa
+ * ::cudaFuncGetAttributes
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveClusters(
+    int *numClusters,
+    T *func,
+    const cudaLaunchConfig_t *config)
+{
+    // Erase the kernel's type; the C entry point takes a const void*.
+    const void *kernelAddr = (const void*)func;
+    return ::cudaOccupancyMaxActiveClusters(numClusters, kernelAddr, config);
+}
+
+#if defined __CUDACC__
+
+/**
+ * \brief \hl Find out attributes for a given function
+ *
+ * This function obtains the attributes of a function specified via \p entry.
+ * The parameter \p entry must be a pointer to a function that executes
+ * on the device. The parameter specified by \p entry must be declared as a \p __global__
+ * function. The fetched attributes are placed in \p attr. If the specified
+ * function does not exist, then ::cudaErrorInvalidDeviceFunction is returned.
+ *
+ * Note that some function attributes such as
+ * \ref ::cudaFuncAttributes::maxThreadsPerBlock "maxThreadsPerBlock"
+ * may vary based on the device that is currently being used.
+ *
+ * \param attr  - Return pointer to function's attributes
+ * \param entry - Function to get attributes of
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \ref ::cudaLaunchKernel(T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
+ * ::cudaSetDoubleForDevice,
+ * ::cudaSetDoubleForHost
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaFuncGetAttributes(
+  struct cudaFuncAttributes *attr,
+  T                         *entry
+)
+{
+  // Erase the kernel's type; the C entry point takes a const void*.
+  const void *kernelAddr = (const void*)entry;
+  return ::cudaFuncGetAttributes(attr, kernelAddr);
+}
+
+/**
+ * \brief \hl Set attributes for a given function
+ *
+ * This function sets the attributes of a function specified via \p func.
+ * The parameter \p func must be a pointer to a function that executes
+ * on the device. The parameter specified by \p func must be declared as a \p __global__
+ * function. The enumeration defined by \p attr is set to the value defined by \p value.
+ * If the specified function does not exist, then ::cudaErrorInvalidDeviceFunction is returned.
+ * If the specified attribute cannot be written, or if the value is incorrect,
+ * then ::cudaErrorInvalidValue is returned.
+ *
+ * Valid values for \p attr are:
+ * - ::cudaFuncAttributeMaxDynamicSharedMemorySize - The requested maximum size in bytes of dynamically-allocated shared memory. The sum of this value and the function attribute ::sharedSizeBytes
+ *   cannot exceed the device attribute ::cudaDevAttrMaxSharedMemoryPerBlockOptin. The maximal size of requestable dynamic shared memory may differ by GPU architecture.
+ * - ::cudaFuncAttributePreferredSharedMemoryCarveout - On devices where the L1 cache and shared memory use the same hardware resources,
+ *   this sets the shared memory carveout preference, in percent of the total shared memory. See ::cudaDevAttrMaxSharedMemoryPerMultiprocessor.
+ *   This is only a hint, and the driver can choose a different ratio if required to execute the function.
+ * - ::cudaFuncAttributeRequiredClusterWidth: The required cluster width in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return cudaErrorNotPermitted.
+ * - ::cudaFuncAttributeRequiredClusterHeight: The required cluster height in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return cudaErrorNotPermitted.
+ * - ::cudaFuncAttributeRequiredClusterDepth: The required cluster depth in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return cudaErrorNotPermitted.
+ * - ::cudaFuncAttributeNonPortableClusterSizeAllowed: Indicates whether the
+ *   function can be launched with non-portable cluster size. 1 is allowed, 0 is
+ *   disallowed.
+ * - ::cudaFuncAttributeClusterSchedulingPolicyPreference: The block
+ *   scheduling policy of a function. The value type is cudaClusterSchedulingPolicy.
+ *
+ * \param func  - Function to set attributes of
+ * \param attr  - Attribute to set
+ * \param value - Value to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \ref ::cudaLaunchKernel(T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
+ * ::cudaSetDoubleForDevice,
+ * ::cudaSetDoubleForHost
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaFuncSetAttribute(
+  T                        *func,
+  enum cudaFuncAttribute    attr,
+  int                       value
+)
+{
+  // Erase the kernel's type; the C entry point takes a const void*.
+  const void *kernelAddr = (const void*)func;
+  return ::cudaFuncSetAttribute(kernelAddr, attr, value);
+}
+
+/**
+ * \brief Returns the function name for a device entry function pointer.
+ *
+ * Returns in \p **name the function name associated with the symbol \p func .
+ * The function name is returned as a null-terminated string. This API may
+ * return a mangled name if the function is not declared as having C linkage.
+ * If \p **name is NULL, ::cudaErrorInvalidValue is returned. If \p func is
+ * not a device entry function, ::cudaErrorInvalidDeviceFunction is returned.
+ *
+ * \param name - The returned name of the function
+ * \param func - The function pointer to retrieve name for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDeviceFunction
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \ref ::cudaFuncGetName(const char **name, const void *func) "cudaFuncGetName (C API)"
+ */
+template<class T>
+static __inline__ __host__ cudaError_t CUDARTAPI cudaFuncGetName(
+  const char **name,
+            T *func
+)
+{
+  // Erase the kernel's type; the C entry point takes a const void*.
+  const void *kernelAddr = (const void *)func;
+  return ::cudaFuncGetName(name, kernelAddr);
+}
+
+/**
+ * \brief Get pointer to device kernel that matches entry function \p func
+ *
+ * Returns in \p kernelPtr the device kernel corresponding to the entry function \p func.
+ *
+ * \param kernelPtr - Returns the device kernel
+ * \param func      - Address of device entry function to search kernel for
+ *
+ * \return
+ * ::cudaSuccess
+ *
+ * \sa
+ * \ref ::cudaGetKernel(cudaKernel_t *kernelPtr, const void *entryFuncAddr) "cudaGetKernel (C API)"
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGetKernel(
+  cudaKernel_t *kernelPtr, T *func
+)
+{
+  const void *entry = (const void *)func; /* untyped pointer taken by the C API overload */
+  return ::cudaGetKernel(kernelPtr, entry);
+}
+
+/**
+ * \brief Returns a global device pointer
+ *
+ * Returns in \p *dptr and \p *bytes the base pointer and size of the global with
+ * name \p name for the requested library \p library and the current device.
+ * If no global for the requested name \p name exists, the call returns ::cudaErrorSymbolNotFound.
+ * One of the parameters \p dptr or \p bytes (not both) can be NULL in which
+ * case it is ignored.
+ *
+ * \param dptr - Returned global device pointer for the requested library
+ * \param bytes - Returned global size in bytes
+ * \param library - Library to retrieve global from
+ * \param name - Name of global to retrieve
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorCudartUnloading,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorSymbolNotFound,
+ * ::cudaErrorDeviceUninitialized,
+ * ::cudaErrorContextIsDestroyed
+ *
+ * \sa ::cudaLibraryLoadData,
+ * ::cudaLibraryLoadFromFile,
+ * ::cudaLibraryUnload,
+ * ::cudaLibraryGetManaged
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaLibraryGetGlobal(T **dptr, size_t *bytes, cudaLibrary_t library, const char *name) {
+    void **rawDptr = (void**)(void*)dptr; /* T** -> void* -> void**, as the untyped C API overload expects */
+    return ::cudaLibraryGetGlobal(rawDptr, bytes, library, name);
+}
+
+/**
+ * \brief Returns a pointer to managed memory
+ *
+ * Returns in \p *dptr and \p *bytes the base pointer and size of the managed memory with
+ * name \p name for the requested library \p library. If no managed memory with the
+ * requested name \p name exists, the call returns ::cudaErrorSymbolNotFound. One of the parameters
+ * \p dptr or \p bytes (not both) can be NULL in which case it is ignored.
+ * Note that managed memory for library \p library is shared across devices and is registered
+ * when the library is loaded.
+ *
+ * \param dptr - Returned pointer to the managed memory
+ * \param bytes - Returned memory size in bytes
+ * \param library - Library to retrieve managed memory from
+ * \param name - Name of managed memory to retrieve
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorCudartUnloading,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorSymbolNotFound
+ *
+ * \sa ::cudaLibraryLoadData,
+ * ::cudaLibraryLoadFromFile,
+ * ::cudaLibraryUnload,
+ * ::cudaLibraryGetGlobal
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaLibraryGetManaged(T **dptr, size_t *bytes, cudaLibrary_t library, const char *name) {
+    void **rawDptr = (void**)(void*)dptr; /* T** -> void* -> void**, as the untyped C API overload expects */
+    return ::cudaLibraryGetManaged(rawDptr, bytes, library, name);
+}
+
+/**
+ * \brief Returns a pointer to a unified function
+ *
+ * Returns in \p *fptr the function pointer to a unified function denoted by \p symbol.
+ * If no unified function with name \p symbol exists, the call returns ::cudaErrorSymbolNotFound.
+ * If there is no device with attribute ::cudaDeviceProp::unifiedFunctionPointers present in the system,
+ * the call may return ::cudaErrorSymbolNotFound.
+ *
+ * \param fptr - Returned pointer to a unified function
+ * \param library - Library to retrieve function pointer memory from
+ * \param symbol - Name of function pointer to retrieve
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorCudartUnloading,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorSymbolNotFound
+ *
+ * \sa ::cudaLibraryLoadData,
+ * ::cudaLibraryLoadFromFile,
+ * ::cudaLibraryUnload
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaLibraryGetUnifiedFunction(T **fptr, cudaLibrary_t library, const char *symbol) {
+    void **rawFptr = (void**)(void*)fptr; /* T** -> void* -> void**, as the untyped C API overload expects */
+    return ::cudaLibraryGetUnifiedFunction(rawFptr, library, symbol);
+}
+
+/** @} */ /* END CUDART_LIBRARY */
+#endif /* __CUDACC__ */
+
+/** @} */ /* END CUDART_HIGHLEVEL */
+
+#endif /* __cplusplus && !__CUDACC_RTC__ */
+
+#if !defined(__CUDACC_RTC__) /* restore compiler diagnostic state; the matching push is presumably earlier in this header (not visible here) */
+#if defined(__GNUC__)
+#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
+#pragma GCC diagnostic pop
+#endif /* clang, or non-PGI GCC >= 4.6 */
+#elif defined(_MSC_VER)
+#pragma warning(pop)
+#endif /* __GNUC__ / _MSC_VER */
+#endif /* !__CUDACC_RTC__ */
+
+#undef EXCLUDE_FROM_RTC /* helper macros are removed here so they are not left defined after this header */
+#undef __CUDA_DEPRECATED
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__) /* marker set: clear the internal-headers flag together with the marker */
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__
+#endif /* __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__ */
+
+#endif /* !__CUDA_RUNTIME_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_runtime_api.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_runtime_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..3328e2cb3f78f2f13eb2018f49c8f7ac0d4a06a2
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_runtime_api.h
@@ -0,0 +1,14933 @@
+/*
+ * Copyright 1993-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+
+
+#if !defined(__CUDA_RUNTIME_API_H__)
+#define __CUDA_RUNTIME_API_H__
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) /* set the flag only if no enclosing header has set it already */
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_API_H__ /* marks that this header set the flag; presumably used to undo it at the end of the header - TODO confirm */
+#endif /* !__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ */
+
+/**
+ * \latexonly
+ * \page sync_async API synchronization behavior
+ *
+ * \section memcpy_sync_async_behavior Memcpy
+ * The API provides memcpy/memset functions in both synchronous and asynchronous forms,
+ * the latter having an \e "Async" suffix. This is a misnomer as each function
+ * may exhibit synchronous or asynchronous behavior depending on the arguments
+ * passed to the function. In the reference documentation, each memcpy function is
+ * categorized as \e synchronous or \e asynchronous, corresponding to the definitions
+ * below.
+ * 
+ * \subsection MemcpySynchronousBehavior Synchronous
+ * 
+ * <ol>
+ * <li> For transfers from pageable host memory to device memory, a stream sync is performed
+ * before the copy is initiated. The function will return once the pageable
+ * buffer has been copied to the staging memory for DMA transfer to device memory,
+ * but the DMA to final destination may not have completed.
+ * 
+ * <li> For transfers from pinned host memory to device memory, the function is synchronous
+ * with respect to the host.
+ *
+ * <li> For transfers from device to either pageable or pinned host memory, the function returns
+ * only once the copy has completed.
+ * 
+ * <li> For transfers from device memory to device memory, no host-side synchronization is
+ * performed.
+ *
+ * <li> For transfers from any host memory to any host memory, the function is fully
+ * synchronous with respect to the host.
+ * </ol>
+ * 
+ * \subsection MemcpyAsynchronousBehavior Asynchronous
+ *
+ * <ol>
+ * <li> For transfers between device memory and pageable host memory, the function might 
+ * be synchronous with respect to host.
+ *
+ * <li> For transfers from any host memory to any host memory, the function is fully
+ * synchronous with respect to the host.
+ * 
+ * <li> If pageable memory must first be staged to pinned memory, the driver may
+ * synchronize with the stream and stage the copy into pinned memory.
+ *
+ * <li> For all other transfers, the function should be fully asynchronous.
+ * </ol>
+ *
+ * \section memset_sync_async_behavior Memset
+ * The cudaMemset functions are asynchronous with respect to the host
+ * except when the target memory is pinned host memory. The \e Async
+ * versions are always asynchronous with respect to the host.
+ *
+ * \section kernel_launch_details Kernel Launches
+ * Kernel launches are asynchronous with respect to the host. Details of
+ * concurrent kernel execution and data transfers can be found in the CUDA
+ * Programmers Guide.
+ *
+ * \endlatexonly
+ */
+
+/**
+ * There are two levels for the runtime API.
+ *
+ * The C API (<i>cuda_runtime_api.h</i>) is
+ * a C-style interface that does not require compiling with \p nvcc.
+ *
+ * The \ref CUDART_HIGHLEVEL "C++ API" (<i>cuda_runtime.h</i>) is a
+ * C++-style interface built on top of the C API. It wraps some of the
+ * C API routines, using overloading, references and default arguments.
+ * These wrappers can be used from C++ code and can be compiled with any C++
+ * compiler. The C++ API also has some CUDA-specific wrappers that wrap
+ * C API routines that deal with symbols, textures, and device functions.
+ * These wrappers require the use of \p nvcc because they depend on code being
+ * generated by the compiler. For example, the execution configuration syntax
+ * to invoke kernels is only available in source code compiled with \p nvcc.
+ */
+
+/** CUDA Runtime API Version (presumably encoded as major * 1000 + minor * 10, like __CUDART_API_VERSION below) */
+#define CUDART_VERSION  12080
+
+#if defined(__CUDA_API_VER_MAJOR__) && defined(__CUDA_API_VER_MINOR__) /* an explicitly requested API version takes precedence */
+# define __CUDART_API_VERSION ((__CUDA_API_VER_MAJOR__ * 1000) + (__CUDA_API_VER_MINOR__ * 10))
+#else /* no explicit request: fall back to the version of this header */
+# define __CUDART_API_VERSION CUDART_VERSION
+#endif /* __CUDA_API_VER_MAJOR__ && __CUDA_API_VER_MINOR__ */
+
+#ifndef __DOXYGEN_ONLY__
+#include "crt/host_defines.h"
+#endif
+#include "builtin_types.h"
+
+#if !defined(__CUDACC_RTC_MINIMAL__) && ((defined(__CUDACC_RDC__)  || defined(__CUDACC_EWP__) || !defined(__CUDACC_RTC__)))
+#include "cuda_device_runtime_api.h"
+#endif /* !defined(__CUDACC_RTC_MINIMAL__) && (defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__) || !defined(__CUDACC_RTC__)) */
+
+
+#ifndef __CUDACC_RTC_MINIMAL__
+#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) || defined(__CUDA_API_VERSION_INTERNAL)
+    #define __CUDART_API_PER_THREAD_DEFAULT_STREAM /* tested further down to enable the entry-point rename table */
+    #define __CUDART_API_PTDS(api) api ## _ptds /* pastes the _ptds suffix onto the API name */
+    #define __CUDART_API_PTSZ(api) api ## _ptsz /* pastes the _ptsz suffix onto the API name */
+#else /* neither macro defined: both helpers leave the API name unchanged */
+    #define __CUDART_API_PTDS(api) api
+    #define __CUDART_API_PTSZ(api) api
+#endif /* CUDA_API_PER_THREAD_DEFAULT_STREAM || __CUDA_API_VERSION_INTERNAL */
+
+#define cudaSignalExternalSemaphoresAsync  __CUDART_API_PTSZ(cudaSignalExternalSemaphoresAsync_v2) /* unversioned name maps to the _v2 symbol, plus whatever suffix __CUDART_API_PTSZ adds */
+#define cudaWaitExternalSemaphoresAsync    __CUDART_API_PTSZ(cudaWaitExternalSemaphoresAsync_v2)
+
+    #define cudaStreamGetCaptureInfo       __CUDART_API_PTSZ(cudaStreamGetCaptureInfo_v2)
+
+#define cudaGetDeviceProperties cudaGetDeviceProperties_v2 /* plain rename: no PTDS/PTSZ wrapper is applied to this one */
+
+#if defined(__CUDART_API_PER_THREAD_DEFAULT_STREAM) /* rename each listed entry point to its _ptds / _ptsz suffixed symbol */
+    #define cudaMemcpy                     __CUDART_API_PTDS(cudaMemcpy)
+    #define cudaMemcpyToSymbol             __CUDART_API_PTDS(cudaMemcpyToSymbol)
+    #define cudaMemcpyFromSymbol           __CUDART_API_PTDS(cudaMemcpyFromSymbol)
+    #define cudaMemcpy2D                   __CUDART_API_PTDS(cudaMemcpy2D)
+    #define cudaMemcpyToArray              __CUDART_API_PTDS(cudaMemcpyToArray)
+    #define cudaMemcpy2DToArray            __CUDART_API_PTDS(cudaMemcpy2DToArray)
+    #define cudaMemcpyFromArray            __CUDART_API_PTDS(cudaMemcpyFromArray)
+    #define cudaMemcpy2DFromArray          __CUDART_API_PTDS(cudaMemcpy2DFromArray)
+    #define cudaMemcpyArrayToArray         __CUDART_API_PTDS(cudaMemcpyArrayToArray)
+    #define cudaMemcpy2DArrayToArray       __CUDART_API_PTDS(cudaMemcpy2DArrayToArray)
+    #define cudaMemcpy3D                   __CUDART_API_PTDS(cudaMemcpy3D)
+    #define cudaMemcpy3DPeer               __CUDART_API_PTDS(cudaMemcpy3DPeer)
+    #define cudaMemset                     __CUDART_API_PTDS(cudaMemset)
+    #define cudaMemset2D                   __CUDART_API_PTDS(cudaMemset2D)
+    #define cudaMemset3D                   __CUDART_API_PTDS(cudaMemset3D)
+    #define cudaGraphInstantiateWithParams __CUDART_API_PTSZ(cudaGraphInstantiateWithParams)
+    #define cudaGraphUpload                __CUDART_API_PTSZ(cudaGraphUpload)
+    #define cudaGraphLaunch                __CUDART_API_PTSZ(cudaGraphLaunch)
+    #define cudaStreamBeginCapture         __CUDART_API_PTSZ(cudaStreamBeginCapture)
+    #define cudaStreamBeginCaptureToGraph  __CUDART_API_PTSZ(cudaStreamBeginCaptureToGraph)
+    #define cudaStreamEndCapture           __CUDART_API_PTSZ(cudaStreamEndCapture)
+    #define cudaStreamGetCaptureInfo_v3    __CUDART_API_PTSZ(cudaStreamGetCaptureInfo_v3)
+    #define cudaStreamUpdateCaptureDependencies  __CUDART_API_PTSZ(cudaStreamUpdateCaptureDependencies)
+    #define cudaStreamUpdateCaptureDependencies_v2  __CUDART_API_PTSZ(cudaStreamUpdateCaptureDependencies_v2)
+    #define cudaStreamIsCapturing          __CUDART_API_PTSZ(cudaStreamIsCapturing)
+    #define cudaMemcpyAsync                __CUDART_API_PTSZ(cudaMemcpyAsync)
+    #define cudaMemcpyToSymbolAsync        __CUDART_API_PTSZ(cudaMemcpyToSymbolAsync)
+    #define cudaMemcpyFromSymbolAsync      __CUDART_API_PTSZ(cudaMemcpyFromSymbolAsync)
+    #define cudaMemcpy2DAsync              __CUDART_API_PTSZ(cudaMemcpy2DAsync)
+    #define cudaMemcpyToArrayAsync         __CUDART_API_PTSZ(cudaMemcpyToArrayAsync)
+    #define cudaMemcpy2DToArrayAsync       __CUDART_API_PTSZ(cudaMemcpy2DToArrayAsync)
+    #define cudaMemcpyFromArrayAsync       __CUDART_API_PTSZ(cudaMemcpyFromArrayAsync)
+    #define cudaMemcpy2DFromArrayAsync     __CUDART_API_PTSZ(cudaMemcpy2DFromArrayAsync)
+    #define cudaMemcpy3DAsync              __CUDART_API_PTSZ(cudaMemcpy3DAsync)
+    #define cudaMemcpy3DPeerAsync          __CUDART_API_PTSZ(cudaMemcpy3DPeerAsync)
+    #define cudaMemcpyBatchAsync          __CUDART_API_PTSZ(cudaMemcpyBatchAsync)
+    #define cudaMemcpy3DBatchAsync        __CUDART_API_PTSZ(cudaMemcpy3DBatchAsync)
+    #define cudaMemsetAsync                __CUDART_API_PTSZ(cudaMemsetAsync)
+    #define cudaMemset2DAsync              __CUDART_API_PTSZ(cudaMemset2DAsync)
+    #define cudaMemset3DAsync              __CUDART_API_PTSZ(cudaMemset3DAsync)
+    #define cudaStreamQuery                __CUDART_API_PTSZ(cudaStreamQuery)
+    #define cudaStreamGetDevice            __CUDART_API_PTSZ(cudaStreamGetDevice)
+    #define cudaStreamGetFlags             __CUDART_API_PTSZ(cudaStreamGetFlags)
+    #define cudaStreamGetId                __CUDART_API_PTSZ(cudaStreamGetId)
+    #define cudaStreamGetPriority          __CUDART_API_PTSZ(cudaStreamGetPriority)
+    #define cudaEventRecord                __CUDART_API_PTSZ(cudaEventRecord)
+    #define cudaEventRecordWithFlags       __CUDART_API_PTSZ(cudaEventRecordWithFlags)
+    #define cudaStreamWaitEvent            __CUDART_API_PTSZ(cudaStreamWaitEvent)
+    #define cudaStreamAddCallback          __CUDART_API_PTSZ(cudaStreamAddCallback)
+    #define cudaStreamAttachMemAsync       __CUDART_API_PTSZ(cudaStreamAttachMemAsync)
+    #define cudaStreamSynchronize          __CUDART_API_PTSZ(cudaStreamSynchronize)
+    #define cudaLaunchKernel               __CUDART_API_PTSZ(cudaLaunchKernel)
+    #define cudaLaunchKernelExC            __CUDART_API_PTSZ(cudaLaunchKernelExC)
+    #define cudaLaunchHostFunc             __CUDART_API_PTSZ(cudaLaunchHostFunc)
+    #define cudaMemPrefetchAsync           __CUDART_API_PTSZ(cudaMemPrefetchAsync)
+    #define cudaMemPrefetchAsync_v2        __CUDART_API_PTSZ(cudaMemPrefetchAsync_v2)
+    #define cudaLaunchCooperativeKernel    __CUDART_API_PTSZ(cudaLaunchCooperativeKernel)
+    #define cudaStreamCopyAttributes       __CUDART_API_PTSZ(cudaStreamCopyAttributes)
+    #define cudaStreamGetAttribute         __CUDART_API_PTSZ(cudaStreamGetAttribute)
+    #define cudaStreamSetAttribute         __CUDART_API_PTSZ(cudaStreamSetAttribute)
+    #define cudaMallocAsync                __CUDART_API_PTSZ(cudaMallocAsync)
+    #define cudaFreeAsync                  __CUDART_API_PTSZ(cudaFreeAsync)
+    #define cudaMallocFromPoolAsync        __CUDART_API_PTSZ(cudaMallocFromPoolAsync)
+    #define cudaGetDriverEntryPoint        __CUDART_API_PTSZ(cudaGetDriverEntryPoint)
+    #define cudaGetDriverEntryPointByVersion  __CUDART_API_PTSZ(cudaGetDriverEntryPointByVersion)
+#endif /* __CUDART_API_PER_THREAD_DEFAULT_STREAM */
+
+#endif  /* __CUDACC_RTC_MINIMAL__ */
+
+/** \cond impl_private */
+#if !defined(__dv) /* keep any definition of __dv that is already in effect */
+
+#if defined(__cplusplus) /* C++: __dv(v) expands to "= v"; presumably supplies default argument values - TODO confirm at use sites */
+
+#define __dv(v) \
+        = v
+
+#else /* __cplusplus */
+
+#define __dv(v) /* C: expands to nothing */
+
+#endif /* __cplusplus */
+
+#endif /* !__dv */
+/** \endcond impl_private */
+
+#if (defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350))   /** Visible to SM>=3.5 and "__host__ __device__" only **/
+
+#define CUDART_DEVICE __device__ /* NVHPC CUDA, no __CUDA_ARCH__ defined, or __CUDA_ARCH__ >= 350 */
+
+#else /* __CUDA_ARCH__ defined and below 350: the qualifier is dropped */
+
+#define CUDART_DEVICE
+
+#endif /** CUDART_DEVICE */
+
+#if !defined(__CUDACC_RTC__)
+#define EXCLUDE_FROM_RTC
+
+/** \cond impl_private */
+#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) /* doc builds and CUDA_ENABLE_DEPRECATED builds get an empty marker */
+#define __CUDA_DEPRECATED
+#elif defined(_MSC_VER) /* MSVC spelling of the deprecation attribute */
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__) /* GNU-style spelling of the deprecation attribute */
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else /* any other compiler: the marker expands to nothing */
+#define __CUDA_DEPRECATED
+#endif /* __CUDA_DEPRECATED selection */
+/** \endcond impl_private */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * \defgroup CUDART_DEVICE Device Management
+ *
+ * ___MANBRIEF___ device management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the device management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Destroy all allocations and reset all state on the current device
+ * in the current process.
+ *
+ * Explicitly destroys and cleans up all resources associated with the current
+ * device in the current process. It is the caller's responsibility to ensure
+ * that the resources are not accessed or passed in subsequent API calls and
+ * doing so will result in undefined behavior. These resources include CUDA types
+ * ::cudaStream_t, ::cudaEvent_t, ::cudaArray_t, ::cudaMipmappedArray_t, ::cudaPitchedPtr,
+ * ::cudaTextureObject_t, ::cudaSurfaceObject_t, ::textureReference, ::surfaceReference,
+ * ::cudaExternalMemory_t, ::cudaExternalSemaphore_t and ::cudaGraphicsResource_t.
+ * These resources also include memory allocations by ::cudaMalloc, ::cudaMallocHost,
+ * ::cudaMallocManaged and ::cudaMallocPitch.
+ * Any subsequent API call to this device will reinitialize the device.
+ *
+ * Note that this function will reset the device immediately.  It is the caller's
+ * responsibility to ensure that the device is not being accessed by any 
+ * other host threads from the process when this function is called.
+ *
+ * \note ::cudaDeviceReset() will not destroy memory allocations by ::cudaMallocAsync() and
+ * ::cudaMallocFromPoolAsync(). These memory allocations need to be destroyed explicitly.
+ * \note If a non-primary ::CUcontext is current to the thread, ::cudaDeviceReset()
+ * will destroy only the internal CUDA RT state for that ::CUcontext.
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSynchronize
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceReset(void);
+
+/**
+ * \brief Wait for compute device to finish
+ *
+ * Blocks until the device has completed all preceding requested tasks.
+ * ::cudaDeviceSynchronize() returns an error if one of the preceding tasks
+ * has failed. If the ::cudaDeviceScheduleBlockingSync flag was set for 
+ * this device, the host thread will block until the device has finished 
+ * its work.
+ *
+ * \return
+ * ::cudaSuccess
+ * \note_device_sync_deprecated
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDeviceReset,
+ * ::cuCtxSynchronize
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceSynchronize(void);
+
+/**
+ * \brief Set resource limits
+ *
+ * Setting \p limit to \p value is a request by the application to update
+ * the current limit maintained by the device.  The driver is free to
+ * modify the requested value to meet h/w requirements (this could be
+ * clamping to minimum or maximum values, rounding up to nearest element
+ * size, etc).  The application can use ::cudaDeviceGetLimit() to find out
+ * exactly what the limit has been set to.
+ *
+ * Setting each ::cudaLimit has its own specific restrictions, so each is
+ * discussed here.
+ *
+ * - ::cudaLimitStackSize controls the stack size in bytes of each GPU thread.
+ *
+ * - ::cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO
+ *   used by the ::printf() device system call. Setting
+ *   ::cudaLimitPrintfFifoSize must not be performed after launching any kernel
+ *   that uses the ::printf() device system call - in such case
+ *   ::cudaErrorInvalidValue will be returned.
+ *
+ * - ::cudaLimitMallocHeapSize controls the size in bytes of the heap used by
+ *   the ::malloc() and ::free() device system calls. Setting
+ *   ::cudaLimitMallocHeapSize must not be performed after launching any kernel
+ *   that uses the ::malloc() or ::free() device system calls - in such case
+ *   ::cudaErrorInvalidValue will be returned.
+ *
+ * - ::cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a
+ *   grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting
+ *   this limit must be performed before any launch of a kernel that uses the
+ *   device runtime and calls ::cudaDeviceSynchronize() above the default sync
+ *   depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail
+ *   with error code ::cudaErrorSyncDepthExceeded if the limitation is
+ *   violated. This limit can be set smaller than the default or up to the maximum
+ *   launch depth of 24. When setting this limit, keep in mind that additional
+ *   levels of sync depth require the runtime to reserve large amounts of
+ *   device memory which can no longer be used for user allocations. If these
+ *   reservations of device memory fail, ::cudaDeviceSetLimit will return
+ *   ::cudaErrorMemoryAllocation, and the limit can be reset to a lower value.
+ *   This limit is only applicable to devices of compute capability < 9.0.
+ *   Attempting to set this limit on devices of other compute capability will
+ *   result in error ::cudaErrorUnsupportedLimit being returned.
+ *
+ * - ::cudaLimitDevRuntimePendingLaunchCount controls the maximum number of
+ *   outstanding device runtime launches that can be made from the current
+ *   device. A grid is outstanding from the point of launch up until the grid
+ *   is known to have been completed. Device runtime launches which violate
+ *   this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when
+ *   ::cudaGetLastError() is called after launch. If more pending launches than
+ *   the default (2048 launches) are needed for a module using the device
+ *   runtime, this limit can be increased. Keep in mind that being able to
+ *   sustain additional pending launches will require the runtime to reserve
+ *   larger amounts of device memory upfront which can no longer be used for
+ *   allocations. If these reservations fail, ::cudaDeviceSetLimit will return
+ *   ::cudaErrorMemoryAllocation, and the limit can be reset to a lower value.
+ *   This limit is only applicable to devices of compute capability 3.5 and
+ *   higher. Attempting to set this limit on devices of compute capability less
+ *   than 3.5 will result in the error ::cudaErrorUnsupportedLimit being
+ *   returned.
+ *
+ * - ::cudaLimitMaxL2FetchGranularity controls the L2 cache fetch granularity.
+ *   Values can range from 0B to 128B. This is purely a performance hint and
+ *   it can be ignored or clamped depending on the platform.
+ *
+ * - ::cudaLimitPersistingL2CacheSize controls size in bytes available
+ *   for persisting L2 cache. This is purely a performance hint and it
+ *   can be ignored or clamped depending on the platform.
+ *
+ * \param limit - Limit to set
+ * \param value - Size of limit
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnsupportedLimit,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDeviceGetLimit,
+ * ::cuCtxSetLimit
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit, size_t value);
+
+/**
+ * \brief Return resource limits
+ *
+ * Returns in \p *pValue the current size of \p limit. The following ::cudaLimit values are supported.
+ * - ::cudaLimitStackSize is the stack size in bytes of each GPU thread.
+ * - ::cudaLimitPrintfFifoSize is the size in bytes of the shared FIFO used by the
+ *   ::printf() device system call.
+ * - ::cudaLimitMallocHeapSize is the size in bytes of the heap used by the
+ *   ::malloc() and ::free() device system calls.
+ * - ::cudaLimitDevRuntimeSyncDepth is the maximum grid depth at which a
+ *   thread can issue the device runtime call ::cudaDeviceSynchronize()
+ *   to wait on child grid launches to complete. This functionality is removed
+ *   for devices of compute capability >= 9.0, and hence will return error
+ *   ::cudaErrorUnsupportedLimit on such devices.
+ * - ::cudaLimitDevRuntimePendingLaunchCount is the maximum number of outstanding
+ *   device runtime launches.
+ * - ::cudaLimitMaxL2FetchGranularity is the L2 cache fetch granularity.
+ * - ::cudaLimitPersistingL2CacheSize is the persisting L2 cache size in bytes.
+ *
+ * \param limit  - Limit to query
+ * \param pValue - Returned size of the limit
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnsupportedLimit,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDeviceSetLimit,
+ * ::cuCtxGetLimit
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
+
+/**
+ * \brief Returns the maximum number of elements allocatable in a 1D linear texture for a given element size.
+ *
+ * Returns in \p maxWidthInElements the maximum number of elements allocatable in a 1D linear texture
+ * for given format descriptor \p fmtDesc.
+ *
+ * \param maxWidthInElements    - Returns maximum number of texture elements allocatable for given \p fmtDesc.
+ * \param fmtDesc               - Texture format description.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnsupportedLimit,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cuDeviceGetTexture1DLinearMaxWidth
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements, const struct cudaChannelFormatDesc *fmtDesc, int device);
+#endif
+
+/**
+ * \brief Returns the preferred cache configuration for the current device.
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this returns through \p pCacheConfig the preferred cache
+ * configuration for the current device. This is only a preference. The
+ * runtime will use the requested configuration if possible, but it is free to
+ * choose a different configuration if required to execute functions.
+ *
+ * This will return a \p pCacheConfig of ::cudaFuncCachePreferNone on devices
+ * where the size of the L1 cache and shared memory are fixed.
+ *
+ * The supported cache configurations are:
+ * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
+ * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
+ * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
+ * - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory
+ *
+ * \param pCacheConfig - Returned cache configuration
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSetCacheConfig,
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
+ * ::cuCtxGetCacheConfig
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
+
+/**
+ * \brief Returns numerical values that correspond to the least and
+ * greatest stream priorities.
+ *
+ * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond
+ * to the least and greatest stream priorities respectively. Stream priorities
+ * follow a convention where lower numbers imply greater priorities. The range of
+ * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority].
+ * If the user attempts to create a stream with a priority value that is
+ * outside the meaningful range as specified by this API, the priority is
+ * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority
+ * respectively. See ::cudaStreamCreateWithPriority for details on creating a
+ * priority stream.
+ * A NULL may be passed in for \p *leastPriority or \p *greatestPriority if the value
+ * is not desired.
+ *
+ * This function will return '0' in both \p *leastPriority and \p *greatestPriority if
+ * the current context's device does not support stream priorities
+ * (see ::cudaDeviceGetAttribute).
+ *
+ * \param leastPriority    - Pointer to an int in which the numerical value for least
+ *                           stream priority is returned
+ * \param greatestPriority - Pointer to an int in which the numerical value for greatest
+ *                           stream priority is returned
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreateWithPriority,
+ * ::cudaStreamGetPriority,
+ * ::cuCtxGetStreamPriorityRange
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority);
+
+/**
+ * \brief Sets the preferred cache configuration for the current device.
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p cacheConfig the preferred cache
+ * configuration for the current device. This is only a preference. The
+ * runtime will use the requested configuration if possible, but it is free to
+ * choose a different configuration if required to execute the function. Any
+ * function preference set via
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)"
+ * or
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)"
+ * will be preferred over this device-wide setting. Setting the device-wide
+ * cache configuration to ::cudaFuncCachePreferNone will cause subsequent
+ * kernel launches to prefer to not change the cache configuration unless
+ * required to launch the kernel.
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ * The supported cache configurations are:
+ * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
+ * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
+ * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
+ * - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory
+ *
+ * \param cacheConfig - Requested cache configuration
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceGetCacheConfig,
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
+ * ::cuCtxSetCacheConfig
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig);
+
+/**
+ * \brief Returns a handle to a compute device
+ *
+ * Returns in \p *device a device ordinal given a PCI bus ID string.
+ *
+ * \param device   - Returned device ordinal
+ *
+ * \param pciBusId - String in one of the following forms: 
+ * [domain]:[bus]:[device].[function]
+ * [domain]:[bus]:[device]
+ * [bus]:[device].[function]
+ * where \p domain, \p bus, \p device, and \p function are all hexadecimal values
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDeviceGetPCIBusId,
+ * ::cuDeviceGetByPCIBusId
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetByPCIBusId(int *device, const char *pciBusId);
+
+/**
+ * \brief Returns a PCI Bus Id string for the device
+ *
+ * Returns an ASCII string identifying the device \p device in the NULL-terminated
+ * string pointed to by \p pciBusId. \p len specifies the maximum length of the
+ * string that may be returned.
+ *
+ * \param pciBusId - Returned identifier string for the device in the following format
+ * [domain]:[bus]:[device].[function]
+ * where \p domain, \p bus, \p device, and \p function are all hexadecimal values.
+ * pciBusId should be large enough to store 13 characters including the NULL-terminator.
+ *
+ * \param len      - Maximum length of string to store in \p pciBusId
+ *
+ * \param device   - Device to get identifier string for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDeviceGetByPCIBusId,
+ * ::cuDeviceGetPCIBusId
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetPCIBusId(char *pciBusId, int len, int device);
+
+/**
+ * \brief Gets an interprocess handle for a previously allocated event
+ *
+ * Takes as input a previously allocated event. This event must have been 
+ * created with the ::cudaEventInterprocess and ::cudaEventDisableTiming
+ * flags set. This opaque handle may be copied into other processes and
+ * opened with ::cudaIpcOpenEventHandle to allow efficient hardware
+ * synchronization between GPU work in different processes.
+ *
+ * After the event has been opened in the importing process,
+ * ::cudaEventRecord, ::cudaEventSynchronize, ::cudaStreamWaitEvent and 
+ * ::cudaEventQuery may be used in either process. Performing operations 
+ * on the imported event after the exported event has been freed 
+ * with ::cudaEventDestroy will result in undefined behavior.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is supported for compatibility purposes
+ * but not recommended as it comes with performance cost.
+ * Users can test their device for IPC functionality by calling
+ * ::cudaDeviceGetAttribute with ::cudaDevAttrIpcEventSupport
+ *
+ * \param handle - Pointer to a user allocated cudaIpcEventHandle
+ *                    in which to return the opaque event handle
+ * \param event   - Event allocated with ::cudaEventInterprocess and 
+ *                    ::cudaEventDisableTiming flags.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorMapBufferObjectFailed,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaEventCreate,
+ * ::cudaEventDestroy,
+ * ::cudaEventSynchronize,
+ * ::cudaEventQuery,
+ * ::cudaStreamWaitEvent,
+ * ::cudaIpcOpenEventHandle,
+ * ::cudaIpcGetMemHandle,
+ * ::cudaIpcOpenMemHandle,
+ * ::cudaIpcCloseMemHandle,
+ * ::cuIpcGetEventHandle
+ */
+extern __host__ cudaError_t CUDARTAPI cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event);
+
+/**
+ * \brief Opens an interprocess event handle for use in the current process
+ *
+ * Opens an interprocess event handle exported from another process with 
+ * ::cudaIpcGetEventHandle. This function returns a ::cudaEvent_t that behaves like 
+ * a locally created event with the ::cudaEventDisableTiming flag specified. 
+ * This event must be freed with ::cudaEventDestroy.
+ *
+ * Performing operations on the imported event after the exported event has 
+ * been freed with ::cudaEventDestroy will result in undefined behavior.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is supported for compatibility purposes
+ * but not recommended as it comes with performance cost.
+ * Users can test their device for IPC functionality by calling
+ * ::cudaDeviceGetAttribute with ::cudaDevAttrIpcEventSupport
+ *
+ * \param event - Returns the imported event
+ * \param handle  - Interprocess handle to open
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorMapBufferObjectFailed,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorDeviceUninitialized
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaEventCreate,
+ * ::cudaEventDestroy,
+ * ::cudaEventSynchronize,
+ * ::cudaEventQuery,
+ * ::cudaStreamWaitEvent,
+ * ::cudaIpcGetEventHandle,
+ * ::cudaIpcGetMemHandle,
+ * ::cudaIpcOpenMemHandle,
+ * ::cudaIpcCloseMemHandle,
+ * ::cuIpcOpenEventHandle
+ */
+extern __host__ cudaError_t CUDARTAPI cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle);
+
+/**
+ * \brief Gets an interprocess memory handle for an existing device memory
+ *          allocation
+ *
+ * Takes a pointer to the base of an existing device memory allocation created 
+ * with ::cudaMalloc and exports it for use in another process. This is a 
+ * lightweight operation and may be called multiple times on an allocation
+ * without adverse effects. 
+ *
+ * If a region of memory is freed with ::cudaFree and a subsequent call
+ * to ::cudaMalloc returns memory with the same device address,
+ * ::cudaIpcGetMemHandle will return a unique handle for the
+ * new memory. 
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is supported for compatibility purposes
+ * but not recommended as it comes with performance cost.
+ * Users can test their device for IPC functionality by calling
+ * ::cudaDeviceGetAttribute with ::cudaDevAttrIpcEventSupport
+ *
+ * \param handle - Pointer to user allocated ::cudaIpcMemHandle to return
+ *                    the handle in.
+ * \param devPtr - Base pointer to previously allocated device memory 
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorMapBufferObjectFailed,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMalloc,
+ * ::cudaFree,
+ * ::cudaIpcGetEventHandle,
+ * ::cudaIpcOpenEventHandle,
+ * ::cudaIpcOpenMemHandle,
+ * ::cudaIpcCloseMemHandle,
+ * ::cuIpcGetMemHandle
+ */
+extern __host__ cudaError_t CUDARTAPI cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr);
+
+/**
+ * \brief Opens an interprocess memory handle exported from another process
+ *          and returns a device pointer usable in the local process.
+ *
+ * Maps memory exported from another process with ::cudaIpcGetMemHandle into
+ * the current device address space. For contexts on different devices 
+ * ::cudaIpcOpenMemHandle can attempt to enable peer access between the
+ * devices as if the user called ::cudaDeviceEnablePeerAccess. This behavior is 
+ * controlled by the ::cudaIpcMemLazyEnablePeerAccess flag. 
+ * ::cudaDeviceCanAccessPeer can determine if a mapping is possible.
+ *
+ * ::cudaIpcOpenMemHandle can open handles to devices that may not be visible
+ * in the process calling the API.
+ *
+ * Contexts that may open ::cudaIpcMemHandles are restricted in the following way.
+ * ::cudaIpcMemHandles from each device in a given process may only be opened 
+ * by one context per device per other process.
+ *
+ * If the memory handle has already been opened by the current context, the
+ * reference count on the handle is incremented by 1 and the existing device pointer
+ * is returned.
+ *
+ * Memory returned from ::cudaIpcOpenMemHandle must be freed with
+ * ::cudaIpcCloseMemHandle.
+ *
+ * Calling ::cudaFree on an exported memory region before calling
+ * ::cudaIpcCloseMemHandle in the importing context will result in undefined
+ * behavior.
+ * 
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is supported for compatibility purposes
+ * but not recommended as it comes with performance cost.
+ * Users can test their device for IPC functionality by calling
+ * ::cudaDeviceGetAttribute with ::cudaDevAttrIpcEventSupport
+ *
+ * \param devPtr - Returned device pointer
+ * \param handle - ::cudaIpcMemHandle to open
+ * \param flags  - Flags for this operation. Must be specified as ::cudaIpcMemLazyEnablePeerAccess
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorMapBufferObjectFailed,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorDeviceUninitialized,
+ * ::cudaErrorTooManyPeers,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \note No guarantees are made about the address returned in \p *devPtr.  
+ * In particular, multiple processes may not receive the same address for the same \p handle.
+ *
+ * \sa
+ * ::cudaMalloc,
+ * ::cudaFree,
+ * ::cudaIpcGetEventHandle,
+ * ::cudaIpcOpenEventHandle,
+ * ::cudaIpcGetMemHandle,
+ * ::cudaIpcCloseMemHandle,
+ * ::cudaDeviceEnablePeerAccess,
+ * ::cudaDeviceCanAccessPeer,
+ * ::cuIpcOpenMemHandle
+ */
+extern __host__ cudaError_t CUDARTAPI cudaIpcOpenMemHandle(void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags);
+
+/**
+ * \brief Attempts to close memory mapped with cudaIpcOpenMemHandle
+ * 
+ * Decrements the reference count of the memory returned by ::cudaIpcOpenMemHandle by 1.
+ * When the reference count reaches 0, this API unmaps the memory. The original allocation
+ * in the exporting process as well as imported mappings in other processes
+ * will be unaffected.
+ *
+ * Any resources used to enable peer access will be freed if this is the
+ * last mapping using them.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is supported for compatibility purposes
+ * but not recommended as it comes with performance cost.
+ * Users can test their device for IPC functionality by calling
+ * ::cudaDeviceGetAttribute with ::cudaDevAttrIpcEventSupport
+ *
+ * \param devPtr - Device pointer returned by ::cudaIpcOpenMemHandle
+ * 
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorMapBufferObjectFailed,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMalloc,
+ * ::cudaFree,
+ * ::cudaIpcGetEventHandle,
+ * ::cudaIpcOpenEventHandle,
+ * ::cudaIpcGetMemHandle,
+ * ::cudaIpcOpenMemHandle,
+ * ::cuIpcCloseMemHandle
+ */
+extern __host__ cudaError_t CUDARTAPI cudaIpcCloseMemHandle(void *devPtr);
+
+/**
+ * \brief Blocks until remote writes are visible to the specified scope
+ *
+ * Blocks until remote writes to the target context via mappings created
+ * through GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see
+ * https://docs.nvidia.com/cuda/gpudirect-rdma for more information), are
+ * visible to the specified scope.
+ *
+ * If the scope equals or lies within the scope indicated by
+ * ::cudaDevAttrGPUDirectRDMAWritesOrdering, the call will be a no-op and
+ * can be safely omitted for performance. This can be determined by
+ * comparing the numerical values between the two enums, with smaller
+ * scopes having smaller values.
+ *
+ * Users may query support for this API via ::cudaDevAttrGPUDirectRDMAFlushWritesOptions.
+ *
+ * \param target - The target of the operation, see cudaFlushGPUDirectRDMAWritesTarget
+ * \param scope  - The scope of the operation, see cudaFlushGPUDirectRDMAWritesScope
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotSupported
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cuFlushGPUDirectRDMAWrites
+ */
+#if __CUDART_API_VERSION >= 11030
+extern __host__ cudaError_t CUDARTAPI cudaDeviceFlushGPUDirectRDMAWrites(enum cudaFlushGPUDirectRDMAWritesTarget target, enum cudaFlushGPUDirectRDMAWritesScope scope);
+#endif
+
+/**
+* \brief Registers a callback function to receive async notifications
+*
+* Registers \p callbackFunc to receive async notifications.
+*
+* The \p userData parameter is passed to the callback function at async notification time.
+* Likewise, \p callback is also passed to the callback function to distinguish between
+* multiple registered callbacks.
+*
+* The callback function being registered should be designed to return quickly (~10ms).
+* Any long running tasks should be queued for execution on an application thread.
+*
+* Callbacks may not call cudaDeviceRegisterAsyncNotification or cudaDeviceUnregisterAsyncNotification.
+* Doing so will result in ::cudaErrorNotPermitted. Async notification callbacks execute
+* in an undefined order and may be serialized.
+*
+* Returns in \p *callback a handle representing the registered callback instance.
+*
+* \param device - The device on which to register the callback
+* \param callbackFunc - The function to register as a callback
+* \param userData - A generic pointer to user data. This is passed into the callback function.
+* \param callback - A handle representing the registered callback instance
+*
+* \return
+* ::cudaSuccess,
+* ::cudaErrorNotSupported,
+* ::cudaErrorInvalidDevice,
+* ::cudaErrorInvalidValue,
+* ::cudaErrorNotPermitted,
+* ::cudaErrorUnknown
+* \notefnerr
+*
+* \sa
+* ::cudaDeviceUnregisterAsyncNotification
+*/
+extern __host__ cudaError_t CUDARTAPI cudaDeviceRegisterAsyncNotification(int device, cudaAsyncCallback callbackFunc, void* userData, cudaAsyncCallbackHandle_t* callback);
+
+/**
+* \brief Unregisters an async notification callback
+*
+* Unregisters \p callback so that the corresponding callback function will stop receiving
+* async notifications.
+*
+* \param device - The device from which to remove \p callback.
+* \param callback - The callback instance to unregister from receiving async notifications.
+*
+* \return
+* ::cudaSuccess,
+* ::cudaErrorNotSupported,
+* ::cudaErrorInvalidDevice,
+* ::cudaErrorInvalidValue,
+* ::cudaErrorNotPermitted,
+* ::cudaErrorUnknown
+* \notefnerr
+*
+* \sa
+* ::cudaDeviceRegisterAsyncNotification
+*/
+extern __host__ cudaError_t CUDARTAPI cudaDeviceUnregisterAsyncNotification(int device, cudaAsyncCallbackHandle_t callback);
+
+/** @} */ /* END CUDART_DEVICE */
+
+/**
+ * \defgroup CUDART_DEVICE_DEPRECATED Device Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated device management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the deprecated device management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the shared memory configuration for the current device.
+ *
+ * \deprecated
+ *
+ * This function will return in \p pConfig the current size of shared memory banks
+ * on the current device. On devices with configurable shared memory banks, 
+ * ::cudaDeviceSetSharedMemConfig can be used to change this setting, so that all 
+ * subsequent kernel launches will by default use the new bank size. When 
+ * ::cudaDeviceGetSharedMemConfig is called on devices without configurable shared 
+ * memory, it will return the fixed bank size of the hardware.
+ *
+ * The returned bank configurations can be either:
+ * - ::cudaSharedMemBankSizeFourByte - shared memory bank width is four bytes.
+ * - ::cudaSharedMemBankSizeEightByte - shared memory bank width is eight bytes.
+ *
+ * \param pConfig - Returned shared memory configuration
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSetCacheConfig,
+ * ::cudaDeviceGetCacheConfig,
+ * ::cudaDeviceSetSharedMemConfig,
+ * ::cudaFuncSetCacheConfig,
+ * ::cuCtxGetSharedMemConfig
+ */
+extern __CUDA_DEPRECATED __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
+
+/**
+ * \brief Sets the shared memory configuration for the current device.
+ *
+ * \deprecated
+ *
+ * On devices with configurable shared memory banks, this function will set
+ * the shared memory bank size which is used for all subsequent kernel launches.
+ * Any per-function setting of shared memory set via ::cudaFuncSetSharedMemConfig
+ * will override the device wide setting.
+ *
+ * Changing the shared memory configuration between launches may introduce
+ * a device side synchronization point.
+ *
+ * Changing the shared memory bank size will not increase shared memory usage
+ * or affect occupancy of kernels, but may have major effects on performance. 
+ * Larger bank sizes will allow for greater potential bandwidth to shared memory,
+ * but will change what kinds of accesses to shared memory will result in bank 
+ * conflicts.
+ *
+ * This function will do nothing on devices with fixed shared memory bank size.
+ *
+ * The supported bank configurations are:
+ * - ::cudaSharedMemBankSizeDefault: set bank width to the device default (currently,
+ *   four bytes)
+ * - ::cudaSharedMemBankSizeFourByte: set shared memory bank width to be four bytes
+ *   natively.
+ * - ::cudaSharedMemBankSizeEightByte: set shared memory bank width to be eight 
+ *   bytes natively.
+ *
+ * \param config - Requested shared memory configuration
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSetCacheConfig,
+ * ::cudaDeviceGetCacheConfig,
+ * ::cudaDeviceGetSharedMemConfig,
+ * ::cudaFuncSetCacheConfig,
+ * ::cuCtxSetSharedMemConfig
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config);
+/** @} */ /* END CUDART_DEVICE_DEPRECATED */
+
+/**
+ * \defgroup CUDART_THREAD_DEPRECATED Thread Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated thread management functions of the CUDA runtime
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes deprecated thread management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Exit and clean up from CUDA launches
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated because its name does not 
+ * reflect its behavior.  Its functionality is identical to the 
+ * non-deprecated function ::cudaDeviceReset(), which should be used
+ * instead.
+ *
+ * Explicitly destroys and cleans up all resources associated with the current
+ * device in the current process.  Any subsequent API call to this device will 
+ * reinitialize the device.  
+ *
+ * Note that this function will reset the device immediately.  It is the caller's
+ * responsibility to ensure that the device is not being accessed by any 
+ * other host threads from the process when this function is called.
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceReset
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadExit(void);
+
+/**
+ * \brief Wait for compute device to finish
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated because its name does not 
+ * reflect its behavior.  Its functionality is similar to the 
+ * non-deprecated function ::cudaDeviceSynchronize(), which should be used
+ * instead.
+ *
+ * Blocks until the device has completed all preceding requested tasks.
+ * ::cudaThreadSynchronize() returns an error if one of the preceding tasks
+ * has failed. If the ::cudaDeviceScheduleBlockingSync flag was set for 
+ * this device, the host thread will block until the device has finished 
+ * its work.
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSynchronize
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void);
+
+/**
+ * \brief Set resource limits
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated because its name does not 
+ * reflect its behavior.  Its functionality is identical to the 
+ * non-deprecated function ::cudaDeviceSetLimit(), which should be used
+ * instead.
+ *
+ * Setting \p limit to \p value is a request by the application to update
+ * the current limit maintained by the device.  The driver is free to
+ * modify the requested value to meet h/w requirements (this could be
+ * clamping to minimum or maximum values, rounding up to nearest element
+ * size, etc).  The application can use ::cudaThreadGetLimit() to find out
+ * exactly what the limit has been set to.
+ *
+ * Setting each ::cudaLimit has its own specific restrictions, so each is
+ * discussed here.
+ *
+ * - ::cudaLimitStackSize controls the stack size of each GPU thread.
+ *
+ * - ::cudaLimitPrintfFifoSize controls the size of the shared FIFO
+ *   used by the ::printf() device system call.
+ *   Setting ::cudaLimitPrintfFifoSize must be performed before
+ *   launching any kernel that uses the ::printf() device
+ *   system call, otherwise ::cudaErrorInvalidValue will be returned.
+ *
+ * - ::cudaLimitMallocHeapSize controls the size of the heap used
+ *   by the ::malloc() and ::free() device system calls.  Setting
+ *   ::cudaLimitMallocHeapSize must be performed before launching
+ *   any kernel that uses the ::malloc() or ::free() device system calls,
+ *   otherwise ::cudaErrorInvalidValue will be returned.
+ *
+ * \param limit - Limit to set
+ * \param value - Size in bytes of limit
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnsupportedLimit,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSetLimit
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadSetLimit(enum cudaLimit limit, size_t value);
+
+/**
+ * \brief Returns resource limits
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated because its name does not 
+ * reflect its behavior.  Its functionality is identical to the 
+ * non-deprecated function ::cudaDeviceGetLimit(), which should be used
+ * instead.
+ *
+ * Returns in \p *pValue the current size of \p limit.  The supported
+ * ::cudaLimit values are:
+ * - ::cudaLimitStackSize: stack size of each GPU thread;
+ * - ::cudaLimitPrintfFifoSize: size of the shared FIFO used by the
+ *   ::printf() device system call;
+ * - ::cudaLimitMallocHeapSize: size of the heap used by the
+ *   ::malloc() and ::free() device system calls.
+ *
+ * \param limit  - Limit to query
+ * \param pValue - Returned size in bytes of limit
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnsupportedLimit,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceGetLimit
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit);
+
+/**
+ * \brief Returns the preferred cache configuration for the current device.
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated because its name does not 
+ * reflect its behavior.  Its functionality is identical to the 
+ * non-deprecated function ::cudaDeviceGetCacheConfig(), which should be 
+ * used instead.
+ * 
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this returns through \p pCacheConfig the preferred cache
+ * configuration for the current device. This is only a preference. The
+ * runtime will use the requested configuration if possible, but it is free to
+ * choose a different configuration if required to execute functions.
+ *
+ * This will return a \p pCacheConfig of ::cudaFuncCachePreferNone on devices
+ * where the size of the L1 cache and shared memory are fixed.
+ *
+ * The supported cache configurations are:
+ * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
+ * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
+ * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
+ *
+ * \param pCacheConfig - Returned cache configuration
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceGetCacheConfig
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig);
+
+/**
+ * \brief Sets the preferred cache configuration for the current device.
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated because its name does not 
+ * reflect its behavior.  Its functionality is identical to the 
+ * non-deprecated function ::cudaDeviceSetCacheConfig(), which should be 
+ * used instead.
+ * 
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p cacheConfig the preferred cache
+ * configuration for the current device. This is only a preference. The
+ * runtime will use the requested configuration if possible, but it is free to
+ * choose a different configuration if required to execute the function. Any
+ * function preference set via
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)"
+ * or
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)"
+ * will be preferred over this device-wide setting. Setting the device-wide
+ * cache configuration to ::cudaFuncCachePreferNone will cause subsequent
+ * kernel launches to prefer to not change the cache configuration unless
+ * required to launch the kernel.
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ * The supported cache configurations are:
+ * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
+ * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
+ * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
+ *
+ * \param cacheConfig - Requested cache configuration
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSetCacheConfig
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig);
+
+/** @} */ /* END CUDART_THREAD_DEPRECATED */
+
+/**
+ * \defgroup CUDART_ERROR Error Handling
+ *
+ * ___MANBRIEF___ error handling functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the error handling functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the last error from a runtime call
+ *
+ * Returns the last error that has been produced by any of the runtime calls
+ * in the same instance of the CUDA Runtime library in the host thread and
+ * resets it to ::cudaSuccess.
+ *
+ * Note: Multiple instances of the CUDA Runtime library can be present in an
+ * application when using a library that statically links the CUDA Runtime.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMissingConfiguration,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorUnmapBufferObjectFailed,
+ * ::cudaErrorInvalidDevicePointer,
+ * ::cudaErrorInvalidTexture,
+ * ::cudaErrorInvalidTextureBinding,
+ * ::cudaErrorInvalidChannelDescriptor,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorInvalidFilterSetting,
+ * ::cudaErrorInvalidNormSetting,
+ * ::cudaErrorUnknown,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorInsufficientDriver,
+ * ::cudaErrorNoDevice,
+ * ::cudaErrorSetOnActiveProcess,
+ * ::cudaErrorStartupFailure,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorUnsupportedPtxVersion,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorJitCompilerNotFound,
+ * ::cudaErrorJitCompilationDisabled
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaPeekAtLastError, ::cudaGetErrorName, ::cudaGetErrorString, ::cudaError
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void);
+
+/**
+ * \brief Returns the last error from a runtime call
+ *
+ * Returns the last error that has been produced by any of the runtime calls
+ * in the same instance of the CUDA Runtime library in the host thread. This
+ * call does not reset the error to ::cudaSuccess like ::cudaGetLastError().
+ *
+ * Note: Multiple instances of the CUDA Runtime library can be present in an
+ * application when using a library that statically links the CUDA Runtime.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMissingConfiguration,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorUnmapBufferObjectFailed,
+ * ::cudaErrorInvalidDevicePointer,
+ * ::cudaErrorInvalidTexture,
+ * ::cudaErrorInvalidTextureBinding,
+ * ::cudaErrorInvalidChannelDescriptor,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorInvalidFilterSetting,
+ * ::cudaErrorInvalidNormSetting,
+ * ::cudaErrorUnknown,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorInsufficientDriver,
+ * ::cudaErrorNoDevice,
+ * ::cudaErrorSetOnActiveProcess,
+ * ::cudaErrorStartupFailure,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorUnsupportedPtxVersion,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorJitCompilerNotFound,
+ * ::cudaErrorJitCompilationDisabled
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetLastError, ::cudaGetErrorName, ::cudaGetErrorString, ::cudaError
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void);
+
+/**
+ * \brief Returns the string representation of an error code enum name
+ *
+ * Returns a string containing the name of an error code in the enum.  If the error
+ * code is not recognized, "unrecognized error code" is returned.
+ *
+ * \param error - Error code to convert to string
+ *
+ * \return
+ * \p char* pointer to a NULL-terminated string
+ *
+ * \sa ::cudaGetErrorString, ::cudaGetLastError, ::cudaPeekAtLastError, ::cudaError,
+ * ::cuGetErrorName
+ */
+extern __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error);
+
+/**
+ * \brief Returns the description string for an error code
+ *
+ * Returns the description string for an error code.  If the error
+ * code is not recognized, "unrecognized error code" is returned.
+ *
+ * \param error - Error code to convert to string
+ *
+ * \return
+ * \p char* pointer to a NULL-terminated string
+ *
+ * \sa ::cudaGetErrorName, ::cudaGetLastError, ::cudaPeekAtLastError, ::cudaError,
+ * ::cuGetErrorString
+ */
+extern __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
+/** @} */ /* END CUDART_ERROR */
+
+/**
+ * \addtogroup CUDART_DEVICE 
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the number of compute-capable devices
+ *
+ * Returns in \p *count the number of devices with compute capability greater
+ * than or equal to 2.0 that are available for execution.
+ *
+ * \param count - Returns the number of devices with compute capability
+ * greater than or equal to 2.0
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDevice, ::cudaSetDevice, ::cudaGetDeviceProperties,
+ * ::cudaChooseDevice, 
+ * ::cudaInitDevice,
+ * ::cuDeviceGetCount
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count);
+
+/**
+ * \brief Returns information about the compute-device
+ *
+ * Returns in \p *prop the properties of device \p device. The ::cudaDeviceProp
+ * structure is defined as:
+ * \code
+    struct cudaDeviceProp {
+        char name[256];
+        cudaUUID_t uuid;
+        size_t totalGlobalMem;
+        size_t sharedMemPerBlock;
+        int regsPerBlock;
+        int warpSize;
+        size_t memPitch;
+        int maxThreadsPerBlock;
+        int maxThreadsDim[3];
+        int maxGridSize[3];
+        int clockRate;
+        size_t totalConstMem;
+        int major;
+        int minor;
+        size_t textureAlignment;
+        size_t texturePitchAlignment;
+        int deviceOverlap;
+        int multiProcessorCount;
+        int kernelExecTimeoutEnabled;
+        int integrated;
+        int canMapHostMemory;
+        int computeMode;
+        int maxTexture1D;
+        int maxTexture1DMipmap;
+        int maxTexture1DLinear;
+        int maxTexture2D[2];
+        int maxTexture2DMipmap[2];
+        int maxTexture2DLinear[3];
+        int maxTexture2DGather[2];
+        int maxTexture3D[3];
+        int maxTexture3DAlt[3];
+        int maxTextureCubemap;
+        int maxTexture1DLayered[2];
+        int maxTexture2DLayered[3];
+        int maxTextureCubemapLayered[2];
+        int maxSurface1D;
+        int maxSurface2D[2];
+        int maxSurface3D[3];
+        int maxSurface1DLayered[2];
+        int maxSurface2DLayered[3];
+        int maxSurfaceCubemap;
+        int maxSurfaceCubemapLayered[2];
+        size_t surfaceAlignment;
+        int concurrentKernels;
+        int ECCEnabled;
+        int pciBusID;
+        int pciDeviceID;
+        int pciDomainID;
+        int tccDriver;
+        int asyncEngineCount;
+        int unifiedAddressing;
+        int memoryClockRate;
+        int memoryBusWidth;
+        int l2CacheSize;
+        int persistingL2CacheMaxSize;
+        int maxThreadsPerMultiProcessor;
+        int streamPrioritiesSupported;
+        int globalL1CacheSupported;
+        int localL1CacheSupported;
+        size_t sharedMemPerMultiprocessor;
+        int regsPerMultiprocessor;
+        int managedMemory;
+        int isMultiGpuBoard;
+        int multiGpuBoardGroupID;
+        int singleToDoublePrecisionPerfRatio;
+        int pageableMemoryAccess;
+        int concurrentManagedAccess;
+        int computePreemptionSupported;
+        int canUseHostPointerForRegisteredMem;
+        int cooperativeLaunch;
+        int cooperativeMultiDeviceLaunch;
+        int pageableMemoryAccessUsesHostPageTables;
+        int directManagedMemAccessFromHost;
+        int accessPolicyMaxWindowSize;
+    }
+ \endcode
+ * where:
+ * - \ref ::cudaDeviceProp::name "name[256]" is an ASCII string identifying
+ *   the device.
+ * - \ref ::cudaDeviceProp::uuid "uuid" is a 16-byte unique identifier.
+ * - \ref ::cudaDeviceProp::totalGlobalMem "totalGlobalMem" is the total
+ *   amount of global memory available on the device in bytes.
+ * - \ref ::cudaDeviceProp::sharedMemPerBlock "sharedMemPerBlock" is the
+ *   maximum amount of shared memory available to a thread block in bytes.
+ * - \ref ::cudaDeviceProp::regsPerBlock "regsPerBlock" is the maximum number
+ *   of 32-bit registers available to a thread block.
+ * - \ref ::cudaDeviceProp::warpSize "warpSize" is the warp size in threads.
+ * - \ref ::cudaDeviceProp::memPitch "memPitch" is the maximum pitch in
+ *   bytes allowed by the memory copy functions that involve memory regions
+ *   allocated through ::cudaMallocPitch().
+ * - \ref ::cudaDeviceProp::maxThreadsPerBlock "maxThreadsPerBlock" is the
+ *   maximum number of threads per block.
+ * - \ref ::cudaDeviceProp::maxThreadsDim "maxThreadsDim[3]" contains the
+ *   maximum size of each dimension of a block.
+ * - \ref ::cudaDeviceProp::maxGridSize "maxGridSize[3]" contains the
+ *   maximum size of each dimension of a grid.
+ * - \ref ::cudaDeviceProp::clockRate "clockRate" is the clock frequency in
+ *   kilohertz.
+ * - \ref ::cudaDeviceProp::totalConstMem "totalConstMem" is the total amount
+ *   of constant memory available on the device in bytes.
+ * - \ref ::cudaDeviceProp::major "major",
+ *   \ref ::cudaDeviceProp::minor "minor" are the major and minor revision
+ *   numbers defining the device's compute capability.
+ * - \ref ::cudaDeviceProp::textureAlignment "textureAlignment" is the
+ *   alignment requirement; texture base addresses that are aligned to
+ *   \ref ::cudaDeviceProp::textureAlignment "textureAlignment" bytes do not
+ *   need an offset applied to texture fetches.
+ * - \ref ::cudaDeviceProp::texturePitchAlignment "texturePitchAlignment" is the
+ *   pitch alignment requirement for 2D texture references that are bound to 
+ *   pitched memory.
+ * - \ref ::cudaDeviceProp::deviceOverlap "deviceOverlap" is 1 if the device
+ *   can concurrently copy memory between host and device while executing a
+ *   kernel, or 0 if not.  Deprecated; use asyncEngineCount instead.
+ * - \ref ::cudaDeviceProp::multiProcessorCount "multiProcessorCount" is the
+ *   number of multiprocessors on the device.
+ * - \ref ::cudaDeviceProp::kernelExecTimeoutEnabled "kernelExecTimeoutEnabled"
+ *   is 1 if there is a run time limit for kernels executed on the device, or
+ *   0 if not.
+ * - \ref ::cudaDeviceProp::integrated "integrated" is 1 if the device is an
+ *   integrated (motherboard) GPU and 0 if it is a discrete (card) component.
+ * - \ref ::cudaDeviceProp::canMapHostMemory "canMapHostMemory" is 1 if the
+ *   device can map host memory into the CUDA address space for use with
+ *   ::cudaHostAlloc()/::cudaHostGetDevicePointer(), or 0 if not.
+ * - \ref ::cudaDeviceProp::computeMode "computeMode" is the compute mode
+ *   that the device is currently in. Available modes are as follows:
+ *   - cudaComputeModeDefault: Default mode - Device is not restricted and
+ *     multiple threads can use ::cudaSetDevice() with this device.
+ *   - cudaComputeModeProhibited: Compute-prohibited mode - No threads can use
+ *     ::cudaSetDevice() with this device.
+ *   - cudaComputeModeExclusiveProcess: Compute-exclusive-process mode - Many 
+ *     threads in one process will be able to use ::cudaSetDevice() with this device.
+ *   <br> When an occupied exclusive mode device is chosen with ::cudaSetDevice,
+ *   all subsequent non-device management runtime functions will return
+ *   ::cudaErrorDevicesUnavailable.
+ * - \ref ::cudaDeviceProp::maxTexture1D "maxTexture1D" is the maximum 1D
+ *   texture size.
+ * - \ref ::cudaDeviceProp::maxTexture1DMipmap "maxTexture1DMipmap" is the maximum
+ *   1D mipmapped texture size.
+ * - \ref ::cudaDeviceProp::maxTexture1DLinear "maxTexture1DLinear" is the maximum
+ *   1D texture size for textures bound to linear memory.
+ * - \ref ::cudaDeviceProp::maxTexture2D "maxTexture2D[2]" contains the maximum
+ *   2D texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTexture2DMipmap "maxTexture2DMipmap[2]" contains the
+ *   maximum 2D mipmapped texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTexture2DLinear "maxTexture2DLinear[3]" contains the 
+ *   maximum 2D texture dimensions for 2D textures bound to pitch linear memory.
+ * - \ref ::cudaDeviceProp::maxTexture2DGather "maxTexture2DGather[2]" contains the 
+ *   maximum 2D texture dimensions if texture gather operations have to be performed.
+ * - \ref ::cudaDeviceProp::maxTexture3D "maxTexture3D[3]" contains the maximum
+ *   3D texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTexture3DAlt "maxTexture3DAlt[3]"
+ *   contains the maximum alternate 3D texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTextureCubemap "maxTextureCubemap" is the 
+ *   maximum cubemap texture width or height.
+ * - \ref ::cudaDeviceProp::maxTexture1DLayered "maxTexture1DLayered[2]" contains
+ *   the maximum 1D layered texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTexture2DLayered "maxTexture2DLayered[3]" contains
+ *   the maximum 2D layered texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTextureCubemapLayered "maxTextureCubemapLayered[2]"
+ *   contains the maximum cubemap layered texture dimensions.
+ * - \ref ::cudaDeviceProp::maxSurface1D "maxSurface1D" is the maximum 1D
+ *   surface size.
+ * - \ref ::cudaDeviceProp::maxSurface2D "maxSurface2D[2]" contains the maximum
+ *   2D surface dimensions.
+ * - \ref ::cudaDeviceProp::maxSurface3D "maxSurface3D[3]" contains the maximum
+ *   3D surface dimensions.
+ * - \ref ::cudaDeviceProp::maxSurface1DLayered "maxSurface1DLayered[2]" contains
+ *   the maximum 1D layered surface dimensions.
+ * - \ref ::cudaDeviceProp::maxSurface2DLayered "maxSurface2DLayered[3]" contains
+ *   the maximum 2D layered surface dimensions.
+ * - \ref ::cudaDeviceProp::maxSurfaceCubemap "maxSurfaceCubemap" is the maximum 
+ *   cubemap surface width or height.
+ * - \ref ::cudaDeviceProp::maxSurfaceCubemapLayered "maxSurfaceCubemapLayered[2]"
+ *   contains the maximum cubemap layered surface dimensions.
+ * - \ref ::cudaDeviceProp::surfaceAlignment "surfaceAlignment" specifies the
+ *   alignment requirements for surfaces.
+ * - \ref ::cudaDeviceProp::concurrentKernels "concurrentKernels" is 1 if the
+ *   device supports executing multiple kernels within the same context
+ *   simultaneously, or 0 if not. It is not guaranteed that multiple kernels
+ *   will be resident on the device concurrently so this feature should not be
+ *   relied upon for correctness.
+ * - \ref ::cudaDeviceProp::ECCEnabled "ECCEnabled" is 1 if the device has ECC
+ *   support turned on, or 0 if not.
+ * - \ref ::cudaDeviceProp::pciBusID "pciBusID" is the PCI bus identifier of
+ *   the device.
+ * - \ref ::cudaDeviceProp::pciDeviceID "pciDeviceID" is the PCI device
+ *   (sometimes called slot) identifier of the device.
+ * - \ref ::cudaDeviceProp::pciDomainID "pciDomainID" is the PCI domain identifier
+ *   of the device.
+ * - \ref ::cudaDeviceProp::tccDriver "tccDriver" is 1 if the device is using a
+ *   TCC driver or 0 if not.
+ * - \ref ::cudaDeviceProp::asyncEngineCount "asyncEngineCount" is 1 when the
+ *   device can concurrently copy memory between host and device while executing
+ *   a kernel. It is 2 when the device can concurrently copy memory between host
+ *   and device in both directions and execute a kernel at the same time. It is
+ *   0 if neither of these is supported.
+ * - \ref ::cudaDeviceProp::unifiedAddressing "unifiedAddressing" is 1 if the device 
+ *   shares a unified address space with the host and 0 otherwise.
+ * - \ref ::cudaDeviceProp::memoryClockRate "memoryClockRate" is the peak memory 
+ *   clock frequency in kilohertz.
+ * - \ref ::cudaDeviceProp::memoryBusWidth "memoryBusWidth" is the memory bus width  
+ *   in bits.
+ * - \ref ::cudaDeviceProp::l2CacheSize "l2CacheSize" is the L2 cache size in bytes.
+ * - \ref ::cudaDeviceProp::persistingL2CacheMaxSize "persistingL2CacheMaxSize" is the
+ *   maximum L2 persisting lines capacity setting in bytes.
+ * - \ref ::cudaDeviceProp::maxThreadsPerMultiProcessor "maxThreadsPerMultiProcessor"  
+ *   is the number of maximum resident threads per multiprocessor.
+ * - \ref ::cudaDeviceProp::streamPrioritiesSupported "streamPrioritiesSupported"
+ *   is 1 if the device supports stream priorities, or 0 if it is not supported.
+ * - \ref ::cudaDeviceProp::globalL1CacheSupported "globalL1CacheSupported"
+ *   is 1 if the device supports caching of globals in L1 cache, or 0 if it is not supported.
+ * - \ref ::cudaDeviceProp::localL1CacheSupported "localL1CacheSupported"
+ *   is 1 if the device supports caching of locals in L1 cache, or 0 if it is not supported.
+ * - \ref ::cudaDeviceProp::sharedMemPerMultiprocessor "sharedMemPerMultiprocessor" is the
+ *   maximum amount of shared memory available to a multiprocessor in bytes; this amount is
+ *   shared by all thread blocks simultaneously resident on a multiprocessor.
+ * - \ref ::cudaDeviceProp::regsPerMultiprocessor "regsPerMultiprocessor" is the maximum number
+ *   of 32-bit registers available to a multiprocessor; this number is shared
+ *   by all thread blocks simultaneously resident on a multiprocessor.
+ * - \ref ::cudaDeviceProp::managedMemory "managedMemory"
+ *   is 1 if the device supports allocating managed memory on this system, or 0 if it is not supported.
+ * - \ref ::cudaDeviceProp::isMultiGpuBoard "isMultiGpuBoard"
+ *   is 1 if the device is on a multi-GPU board (e.g. Gemini cards), and 0 if not.
+ * - \ref ::cudaDeviceProp::multiGpuBoardGroupID "multiGpuBoardGroupID" is a unique identifier
+ *   for a group of devices associated with the same board.
+ *   Devices on the same multi-GPU board will share the same identifier.
+ * - \ref ::cudaDeviceProp::hostNativeAtomicSupported "hostNativeAtomicSupported"
+ *   is 1 if the link between the device and the host supports native atomic operations, or 0 if it is not supported.
+ * - \ref ::cudaDeviceProp::singleToDoublePrecisionPerfRatio "singleToDoublePrecisionPerfRatio"  
+ *   is the ratio of single precision performance (in floating-point operations per second)
+ *   to double precision performance.
+ * - \ref ::cudaDeviceProp::pageableMemoryAccess "pageableMemoryAccess" is 1 if the device supports
+ *   coherently accessing pageable memory without calling cudaHostRegister on it, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::concurrentManagedAccess "concurrentManagedAccess" is 1 if the device can
+ *   coherently access managed memory concurrently with the CPU, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::computePreemptionSupported "computePreemptionSupported" is 1 if the device
+ *   supports Compute Preemption, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::canUseHostPointerForRegisteredMem "canUseHostPointerForRegisteredMem" is 1 if
+ *   the device can access host registered memory at the same virtual address as the CPU, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::cooperativeLaunch "cooperativeLaunch" is 1 if the device supports launching
+ *   cooperative kernels via ::cudaLaunchCooperativeKernel, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::cooperativeMultiDeviceLaunch "cooperativeMultiDeviceLaunch" is 1 if the device
+ *   supports launching cooperative kernels via ::cudaLaunchCooperativeKernelMultiDevice, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::sharedMemPerBlockOptin "sharedMemPerBlockOptin"
+ *   is the per-device maximum shared memory per block usable by special opt-in.
+ * - \ref ::cudaDeviceProp::pageableMemoryAccessUsesHostPageTables "pageableMemoryAccessUsesHostPageTables" is 1 if the device accesses
+ *   pageable memory via the host's page tables, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::directManagedMemAccessFromHost "directManagedMemAccessFromHost" is 1 if the host can directly access managed
+ *   memory on the device without migration, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::maxBlocksPerMultiProcessor "maxBlocksPerMultiProcessor" is the maximum number of thread blocks
+ *   that can reside on a multiprocessor.
+ * - \ref ::cudaDeviceProp::accessPolicyMaxWindowSize "accessPolicyMaxWindowSize" is
+ *   the maximum value of ::cudaAccessPolicyWindow::num_bytes.
+ * - \ref ::cudaDeviceProp::reservedSharedMemPerBlock "reservedSharedMemPerBlock"
+ *   is the shared memory reserved by CUDA driver per block in bytes
+ * - \ref ::cudaDeviceProp::hostRegisterSupported "hostRegisterSupported"
+ *  is 1 if the device supports host memory registration via ::cudaHostRegister, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::sparseCudaArraySupported "sparseCudaArraySupported"
+ *  is 1 if the device supports sparse CUDA arrays and sparse CUDA mipmapped arrays, 0 otherwise
+ * - \ref ::cudaDeviceProp::hostRegisterReadOnlySupported "hostRegisterReadOnlySupported"
+ *  is 1 if the device supports using the ::cudaHostRegister flag cudaHostRegisterReadOnly to register memory that must be mapped as
+ *  read-only to the GPU
+ * - \ref ::cudaDeviceProp::timelineSemaphoreInteropSupported "timelineSemaphoreInteropSupported"
+ *  is 1 if external timeline semaphore interop is supported on the device, 0 otherwise
+ * - \ref ::cudaDeviceProp::memoryPoolsSupported "memoryPoolsSupported"
+ *  is 1 if the device supports using the cudaMallocAsync and cudaMemPool family of APIs, 0 otherwise
+ * - \ref ::cudaDeviceProp::gpuDirectRDMASupported "gpuDirectRDMASupported"
+ *  is 1 if the device supports GPUDirect RDMA APIs, 0 otherwise
+ * - \ref ::cudaDeviceProp::gpuDirectRDMAFlushWritesOptions "gpuDirectRDMAFlushWritesOptions"
+ *  is a bitmask to be interpreted according to the ::cudaFlushGPUDirectRDMAWritesOptions enum
+ * - \ref ::cudaDeviceProp::gpuDirectRDMAWritesOrdering "gpuDirectRDMAWritesOrdering"
+ *  is a value to be interpreted according to the ::cudaGPUDirectRDMAWritesOrdering enum
+ * - \ref ::cudaDeviceProp::memoryPoolSupportedHandleTypes "memoryPoolSupportedHandleTypes"
+ *  is a bitmask of handle types supported with mempool-based IPC
+ * - \ref ::cudaDeviceProp::deferredMappingCudaArraySupported "deferredMappingCudaArraySupported"
+ *  is 1 if the device supports deferred mapping CUDA arrays and CUDA mipmapped arrays
+ * - \ref ::cudaDeviceProp::ipcEventSupported "ipcEventSupported"
+ *  is 1 if the device supports IPC Events, and 0 otherwise
+ * - \ref ::cudaDeviceProp::unifiedFunctionPointers "unifiedFunctionPointers"
+ *  is 1 if the device supports unified function pointers, and 0 otherwise
+ * \param prop   - Properties for the specified device
+ * \param device - Device number to get properties for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, ::cudaChooseDevice,
+ * ::cudaDeviceGetAttribute, 
+ * ::cudaInitDevice,
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetName
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device);
+
+/**
+ * \brief Returns information about the device
+ *
+ * Returns in \p *value the integer value of the attribute \p attr on device
+ * \p device. The supported attributes are:
+ * - ::cudaDevAttrMaxThreadsPerBlock: Maximum number of threads per block
+ * - ::cudaDevAttrMaxBlockDimX: Maximum x-dimension of a block
+ * - ::cudaDevAttrMaxBlockDimY: Maximum y-dimension of a block
+ * - ::cudaDevAttrMaxBlockDimZ: Maximum z-dimension of a block
+ * - ::cudaDevAttrMaxGridDimX: Maximum x-dimension of a grid
+ * - ::cudaDevAttrMaxGridDimY: Maximum y-dimension of a grid
+ * - ::cudaDevAttrMaxGridDimZ: Maximum z-dimension of a grid
+ * - ::cudaDevAttrMaxSharedMemoryPerBlock: Maximum amount of shared memory
+ *   available to a thread block in bytes
+ * - ::cudaDevAttrTotalConstantMemory: Memory available on device for
+ *   __constant__ variables in a CUDA C kernel in bytes
+ * - ::cudaDevAttrWarpSize: Warp size in threads
+ * - ::cudaDevAttrMaxPitch: Maximum pitch in bytes allowed by the memory copy
+ *   functions that involve memory regions allocated through ::cudaMallocPitch()
+ * - ::cudaDevAttrMaxTexture1DWidth: Maximum 1D texture width
+ * - ::cudaDevAttrMaxTexture1DLinearWidth: Maximum width for a 1D texture bound
+ *   to linear memory
+ * - ::cudaDevAttrMaxTexture1DMipmappedWidth: Maximum mipmapped 1D texture width
+ * - ::cudaDevAttrMaxTexture2DWidth: Maximum 2D texture width
+ * - ::cudaDevAttrMaxTexture2DHeight: Maximum 2D texture height
+ * - ::cudaDevAttrMaxTexture2DLinearWidth: Maximum width for a 2D texture
+ *   bound to linear memory
+ * - ::cudaDevAttrMaxTexture2DLinearHeight: Maximum height for a 2D texture
+ *   bound to linear memory
+ * - ::cudaDevAttrMaxTexture2DLinearPitch: Maximum pitch in bytes for a 2D
+ *   texture bound to linear memory
+ * - ::cudaDevAttrMaxTexture2DMipmappedWidth: Maximum mipmapped 2D texture
+ *   width
+ * - ::cudaDevAttrMaxTexture2DMipmappedHeight: Maximum mipmapped 2D texture
+ *   height
+ * - ::cudaDevAttrMaxTexture3DWidth: Maximum 3D texture width
+ * - ::cudaDevAttrMaxTexture3DHeight: Maximum 3D texture height
+ * - ::cudaDevAttrMaxTexture3DDepth: Maximum 3D texture depth
+ * - ::cudaDevAttrMaxTexture3DWidthAlt: Alternate maximum 3D texture width,
+ *   0 if no alternate maximum 3D texture size is supported
+ * - ::cudaDevAttrMaxTexture3DHeightAlt: Alternate maximum 3D texture height,
+ *   0 if no alternate maximum 3D texture size is supported
+ * - ::cudaDevAttrMaxTexture3DDepthAlt: Alternate maximum 3D texture depth,
+ *   0 if no alternate maximum 3D texture size is supported
+ * - ::cudaDevAttrMaxTextureCubemapWidth: Maximum cubemap texture width or
+ *   height
+ * - ::cudaDevAttrMaxTexture1DLayeredWidth: Maximum 1D layered texture width
+ * - ::cudaDevAttrMaxTexture1DLayeredLayers: Maximum layers in a 1D layered
+ *   texture
+ * - ::cudaDevAttrMaxTexture2DLayeredWidth: Maximum 2D layered texture width
+ * - ::cudaDevAttrMaxTexture2DLayeredHeight: Maximum 2D layered texture height
+ * - ::cudaDevAttrMaxTexture2DLayeredLayers: Maximum layers in a 2D layered
+ *   texture
+ * - ::cudaDevAttrMaxTextureCubemapLayeredWidth: Maximum cubemap layered
+ *   texture width or height
+ * - ::cudaDevAttrMaxTextureCubemapLayeredLayers: Maximum layers in a cubemap
+ *   layered texture
+ * - ::cudaDevAttrMaxSurface1DWidth: Maximum 1D surface width
+ * - ::cudaDevAttrMaxSurface2DWidth: Maximum 2D surface width
+ * - ::cudaDevAttrMaxSurface2DHeight: Maximum 2D surface height
+ * - ::cudaDevAttrMaxSurface3DWidth: Maximum 3D surface width
+ * - ::cudaDevAttrMaxSurface3DHeight: Maximum 3D surface height
+ * - ::cudaDevAttrMaxSurface3DDepth: Maximum 3D surface depth
+ * - ::cudaDevAttrMaxSurface1DLayeredWidth: Maximum 1D layered surface width
+ * - ::cudaDevAttrMaxSurface1DLayeredLayers: Maximum layers in a 1D layered
+ *   surface
+ * - ::cudaDevAttrMaxSurface2DLayeredWidth: Maximum 2D layered surface width
+ * - ::cudaDevAttrMaxSurface2DLayeredHeight: Maximum 2D layered surface height
+ * - ::cudaDevAttrMaxSurface2DLayeredLayers: Maximum layers in a 2D layered
+ *   surface
+ * - ::cudaDevAttrMaxSurfaceCubemapWidth: Maximum cubemap surface width
+ * - ::cudaDevAttrMaxSurfaceCubemapLayeredWidth: Maximum cubemap layered
+ *   surface width
+ * - ::cudaDevAttrMaxSurfaceCubemapLayeredLayers: Maximum layers in a cubemap
+ *   layered surface
+ * - ::cudaDevAttrMaxRegistersPerBlock: Maximum number of 32-bit registers 
+ *   available to a thread block
+ * - ::cudaDevAttrClockRate: Peak clock frequency in kilohertz
+ * - ::cudaDevAttrTextureAlignment: Alignment requirement; texture base
+ *   addresses aligned to this number of bytes do not need an offset applied
+ *   to texture fetches
+ * - ::cudaDevAttrTexturePitchAlignment: Pitch alignment requirement for 2D
+ *   texture references bound to pitched memory
+ * - ::cudaDevAttrGpuOverlap: 1 if the device can concurrently copy memory
+ *   between host and device while executing a kernel, or 0 if not
+ * - ::cudaDevAttrMultiProcessorCount: Number of multiprocessors on the device
+ * - ::cudaDevAttrKernelExecTimeout: 1 if there is a run time limit for kernels
+ *   executed on the device, or 0 if not
+ * - ::cudaDevAttrIntegrated: 1 if the device is integrated with the memory
+ *   subsystem, or 0 if not
+ * - ::cudaDevAttrCanMapHostMemory: 1 if the device can map host memory into
+ *   the CUDA address space, or 0 if not
+ * - ::cudaDevAttrComputeMode: Compute mode that the device is currently in.
+ *   Available modes are as follows:
+ *   - ::cudaComputeModeDefault: Default mode - Device is not restricted and
+ *     multiple threads can use ::cudaSetDevice() with this device.
+ *   - ::cudaComputeModeProhibited: Compute-prohibited mode - No threads can use
+ *     ::cudaSetDevice() with this device.
+ *   - ::cudaComputeModeExclusiveProcess: Compute-exclusive-process mode - Many 
+ *     threads in one process will be able to use ::cudaSetDevice() with this
+ *     device.
+ * - ::cudaDevAttrConcurrentKernels: 1 if the device supports executing
+ *   multiple kernels within the same context simultaneously, or 0 if
+ *   not. It is not guaranteed that multiple kernels will be resident on the
+ *   device concurrently so this feature should not be relied upon for
+ *   correctness.
+ * - ::cudaDevAttrEccEnabled: 1 if error correction is enabled on the device,
+ *   0 if error correction is disabled or not supported by the device
+ * - ::cudaDevAttrPciBusId: PCI bus identifier of the device
+ * - ::cudaDevAttrPciDeviceId: PCI device (also known as slot) identifier of
+ *   the device
+ * - ::cudaDevAttrTccDriver: 1 if the device is using a TCC driver. TCC is only
+ *   available on Tesla hardware running Windows Vista or later.
+ * - ::cudaDevAttrMemoryClockRate: Peak memory clock frequency in kilohertz
+ * - ::cudaDevAttrGlobalMemoryBusWidth: Global memory bus width in bits
+ * - ::cudaDevAttrL2CacheSize: Size of L2 cache in bytes. 0 if the device
+ *   doesn't have L2 cache.
+ * - ::cudaDevAttrMaxThreadsPerMultiProcessor: Maximum resident threads per 
+ *   multiprocessor
+ * - ::cudaDevAttrUnifiedAddressing: 1 if the device shares a unified address
+ *   space with the host, or 0 if not
+ * - ::cudaDevAttrComputeCapabilityMajor: Major compute capability version
+ *   number
+ * - ::cudaDevAttrComputeCapabilityMinor: Minor compute capability version
+ *   number
+ * - ::cudaDevAttrStreamPrioritiesSupported: 1 if the device supports stream
+ *   priorities, or 0 if not
+ * - ::cudaDevAttrGlobalL1CacheSupported: 1 if device supports caching globals 
+ *    in L1 cache, 0 if not
+ * - ::cudaDevAttrLocalL1CacheSupported: 1 if device supports caching locals 
+ *    in L1 cache, 0 if not
+ * - ::cudaDevAttrMaxSharedMemoryPerMultiprocessor: Maximum amount of shared memory
+ *   available to a multiprocessor in bytes; this amount is shared by all 
+ *   thread blocks simultaneously resident on a multiprocessor
+ * - ::cudaDevAttrMaxRegistersPerMultiprocessor: Maximum number of 32-bit registers 
+ *   available to a multiprocessor; this number is shared by all thread blocks
+ *   simultaneously resident on a multiprocessor
+ * - ::cudaDevAttrManagedMemory: 1 if device supports allocating
+ *   managed memory, 0 if not
+ * - ::cudaDevAttrIsMultiGpuBoard: 1 if device is on a multi-GPU board, 0 if not
+ * - ::cudaDevAttrMultiGpuBoardGroupID: Unique identifier for a group of devices on the
+ *   same multi-GPU board
+ * - ::cudaDevAttrHostNativeAtomicSupported: 1 if the link between the device and the
+ *   host supports native atomic operations
+ * - ::cudaDevAttrSingleToDoublePrecisionPerfRatio: Ratio of single precision performance
+ *   (in floating-point operations per second) to double precision performance
+ * - ::cudaDevAttrPageableMemoryAccess: 1 if the device supports coherently accessing
+ *   pageable memory without calling cudaHostRegister on it, and 0 otherwise
+ * - ::cudaDevAttrConcurrentManagedAccess: 1 if the device can coherently access managed
+ *   memory concurrently with the CPU, and 0 otherwise
+ * - ::cudaDevAttrComputePreemptionSupported: 1 if the device supports
+ *   Compute Preemption, 0 if not
+ * - ::cudaDevAttrCanUseHostPointerForRegisteredMem: 1 if the device can access host
+ *   registered memory at the same virtual address as the CPU, and 0 otherwise
+ * - ::cudaDevAttrCooperativeLaunch: 1 if the device supports launching cooperative kernels
+ *   via ::cudaLaunchCooperativeKernel, and 0 otherwise
+ * - ::cudaDevAttrCooperativeMultiDeviceLaunch: 1 if the device supports launching cooperative
+ *   kernels via ::cudaLaunchCooperativeKernelMultiDevice, and 0 otherwise
+ * - ::cudaDevAttrCanFlushRemoteWrites: 1 if the device supports flushing of outstanding 
+ *   remote writes, and 0 otherwise
+ * - ::cudaDevAttrHostRegisterSupported: 1 if the device supports host memory registration
+ *   via ::cudaHostRegister, and 0 otherwise
+ * - ::cudaDevAttrPageableMemoryAccessUsesHostPageTables: 1 if the device accesses pageable memory via the
+ *   host's page tables, and 0 otherwise
+ * - ::cudaDevAttrDirectManagedMemAccessFromHost: 1 if the host can directly access managed memory on the device
+ *   without migration, and 0 otherwise
+ * - ::cudaDevAttrMaxSharedMemoryPerBlockOptin: Maximum per block shared memory size on the device. This value can
+ *   be opted into when using ::cudaFuncSetAttribute
+ * - ::cudaDevAttrMaxBlocksPerMultiprocessor: Maximum number of thread blocks that can reside on a multiprocessor
+ * - ::cudaDevAttrMaxPersistingL2CacheSize: Maximum L2 persisting lines capacity setting in bytes
+ * - ::cudaDevAttrMaxAccessPolicyWindowSize: Maximum value of cudaAccessPolicyWindow::num_bytes
+ * - ::cudaDevAttrReservedSharedMemoryPerBlock: Shared memory reserved by CUDA driver per block in bytes
+ * - ::cudaDevAttrSparseCudaArraySupported: 1 if the device supports sparse CUDA arrays and sparse CUDA mipmapped arrays.
+ * - ::cudaDevAttrHostRegisterReadOnlySupported: Device supports using the ::cudaHostRegister flag cudaHostRegisterReadOnly
+ *   to register memory that must be mapped as read-only to the GPU
+ * - ::cudaDevAttrMemoryPoolsSupported: 1 if the device supports using the cudaMallocAsync and cudaMemPool family of APIs, and 0 otherwise
+ * - ::cudaDevAttrGPUDirectRDMASupported: 1 if the device supports GPUDirect RDMA APIs, and 0 otherwise
+ * - ::cudaDevAttrGPUDirectRDMAFlushWritesOptions: bitmask to be interpreted according to the ::cudaFlushGPUDirectRDMAWritesOptions enum 
+ * - ::cudaDevAttrGPUDirectRDMAWritesOrdering: see the ::cudaGPUDirectRDMAWritesOrdering enum for numerical values
+ * - ::cudaDevAttrMemoryPoolSupportedHandleTypes: Bitmask of handle types supported with mempool based IPC
+ * - ::cudaDevAttrDeferredMappingCudaArraySupported : 1 if the device supports deferred mapping CUDA arrays and CUDA mipmapped arrays.
+ * - ::cudaDevAttrIpcEventSupport: 1 if the device supports IPC Events.
+ * - ::cudaDevAttrNumaConfig: NUMA configuration of a device: value is of type ::cudaDeviceNumaConfig enum
+ * - ::cudaDevAttrNumaId: NUMA node ID of the GPU memory
+ * - ::cudaDevAttrGpuPciDeviceId: The combined 16-bit PCI device ID and 16-bit PCI vendor ID.
+ * - ::cudaDevAttrGpuPciSubsystemId: The combined 16-bit PCI subsystem ID and 16-bit PCI vendor subsystem ID.
+ *
+ * \param value  - Returned device attribute value
+ * \param attr   - Device attribute to query
+ * \param device - Device number to query 
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, ::cudaChooseDevice,
+ * ::cudaGetDeviceProperties, 
+ * ::cudaInitDevice,
+ * ::cuDeviceGetAttribute
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
+
+/**
+ * \brief Returns the default mempool of a device
+ *
+ * The default mempool of a device contains device memory from that device.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cuDeviceGetDefaultMemPool, ::cudaMallocAsync, ::cudaMemPoolTrimTo, ::cudaMemPoolGetAttribute, ::cudaDeviceSetMemPool, ::cudaMemPoolSetAttribute, ::cudaMemPoolSetAccess
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetDefaultMemPool(cudaMemPool_t *memPool, int device);
+
+
+/**
+ * \brief Sets the current memory pool of a device
+ *
+ * The memory pool must be local to the specified device.
+ * Unless a mempool is specified in the ::cudaMallocAsync call,
+ * ::cudaMallocAsync allocates from the current mempool of the provided stream's device.
+ * By default, a device's current memory pool is its default memory pool.
+ *
+ * \note Use ::cudaMallocFromPoolAsync to specify asynchronous allocations from a device different
+ * than the one the stream runs on.
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorNotSupported
+ * \notefnerr
+ * \note_callback
+ *
+ * \sa ::cuDeviceSetMemPool, ::cudaDeviceGetMemPool, ::cudaDeviceGetDefaultMemPool, ::cudaMemPoolCreate, ::cudaMemPoolDestroy, ::cudaMallocFromPoolAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceSetMemPool(int device, cudaMemPool_t memPool);
+
+/**
+ * \brief Gets the current mempool for a device
+ *
+ * Returns the last pool provided to ::cudaDeviceSetMemPool for this device
+ * or the device's default memory pool if ::cudaDeviceSetMemPool has never been called.
+ * By default the current mempool is the default mempool for a device,
+ * otherwise the returned pool must have been set with ::cuDeviceSetMemPool or ::cudaDeviceSetMemPool.
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cuDeviceGetMemPool, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceSetMemPool
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetMemPool(cudaMemPool_t *memPool, int device);
+
+/**
+ * \brief Return NvSciSync attributes that this device can support.
+ *
+ * Returns in \p nvSciSyncAttrList the properties of NvSciSync that
+ * this CUDA device, \p device, can support. The returned \p nvSciSyncAttrList
+ * can be used to create an NvSciSync that matches this device's capabilities.
+ *
+ * If the NvSciSyncAttrKey_RequiredPerm field in \p nvSciSyncAttrList is
+ * already set, this API will return ::cudaErrorInvalidValue.
+ *
+ * The application should set \p nvSciSyncAttrList to a valid
+ * NvSciSyncAttrList, failing which this API will return
+ * ::cudaErrorInvalidHandle.
+ *
+ * The \p flags parameter controls how the application intends to use
+ * the NvSciSync created from the \p nvSciSyncAttrList. The valid flags are:
+ * - ::cudaNvSciSyncAttrSignal, specifies that the application intends to
+ * signal an NvSciSync on this CUDA device.
+ * - ::cudaNvSciSyncAttrWait, specifies that the application intends to
+ * wait on an NvSciSync on this CUDA device.
+ *
+ * At least one of these flags must be set, failing which the API
+ * returns ::cudaErrorInvalidValue. The two flags are orthogonal
+ * to one another: a developer may set both of them, which allows setting
+ * both wait- and signal-specific attributes in the same \p nvSciSyncAttrList.
+ *
+ * Note that this API updates the input \p nvSciSyncAttrList with values equivalent
+ * to the following public attribute key-values:
+ * NvSciSyncAttrKey_RequiredPerm is set to
+ * - NvSciSyncAccessPerm_SignalOnly if ::cudaNvSciSyncAttrSignal is set in \p flags.
+ * - NvSciSyncAccessPerm_WaitOnly if ::cudaNvSciSyncAttrWait is set in \p flags.
+ * - NvSciSyncAccessPerm_WaitSignal if both ::cudaNvSciSyncAttrWait and
+ * ::cudaNvSciSyncAttrSignal are set in \p flags.
+ * NvSciSyncAttrKey_PrimitiveInfo is set to
+ * - NvSciSyncAttrValPrimitiveType_SysmemSemaphore on any valid \p device.
+ * - NvSciSyncAttrValPrimitiveType_Syncpoint if \p device is a Tegra device.
+ * - NvSciSyncAttrValPrimitiveType_SysmemSemaphorePayload64b if \p device is GA10X+.
+ * NvSciSyncAttrKey_GpuId is set to the same UUID that is returned in
+ * \p cudaDeviceProp.uuid from ::cudaGetDeviceProperties for this \p device.
+ *
+ * \param nvSciSyncAttrList     - Return NvSciSync attributes supported.
+ * \param device                - Valid CUDA device to get NvSciSync attributes for.
+ * \param flags                 - Flags describing NvSciSync usage.
+ *
+ * \return
+ *
+ * ::cudaSuccess,
+ * ::cudaErrorDeviceUninitialized,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidHandle,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorMemoryAllocation
+ *
+ * \sa
+ * ::cudaImportExternalSemaphore,
+ * ::cudaDestroyExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, int device, int flags);
+
+/**
+ * \brief Queries attributes of the link between two devices.
+ *
+ * Returns in \p *value the value of the requested attribute \p attr of the
+ * link between \p srcDevice and \p dstDevice. The supported attributes are:
+ * - ::cudaDevP2PAttrPerformanceRank: A relative value indicating the
+ *   performance of the link between two devices. Lower value means better
+ *   performance (0 being the value used for most performant link).
+ * - ::cudaDevP2PAttrAccessSupported: 1 if peer access is enabled.
+ * - ::cudaDevP2PAttrNativeAtomicSupported: 1 if native atomic operations over
+ *   the link are supported.
+ * - ::cudaDevP2PAttrCudaArrayAccessSupported: 1 if accessing CUDA arrays over
+ *   the link is supported.
+ *
+ * Returns ::cudaErrorInvalidDevice if \p srcDevice or \p dstDevice are not valid
+ * or if they represent the same device.
+ *
+ * Returns ::cudaErrorInvalidValue if \p attr is not valid or if \p value is
+ * a null pointer.
+ *
+ * \param value         - Returned value of the requested attribute
+ * \param attr          - The requested attribute of the link between \p srcDevice and \p dstDevice.
+ * \param srcDevice     - The source device of the target link.
+ * \param dstDevice     - The destination device of the target link.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceEnablePeerAccess,
+ * ::cudaDeviceDisablePeerAccess,
+ * ::cudaDeviceCanAccessPeer,
+ * ::cuDeviceGetP2PAttribute
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr, int srcDevice, int dstDevice);
+
+/**
+ * \brief Select compute-device which best matches criteria
+ *
+ * Returns in \p *device the device which has properties that best match
+ * \p *prop.
+ *
+ * \param device - Device with best match
+ * \param prop   - Desired device properties
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice,
+ * ::cudaGetDeviceProperties, 
+ * ::cudaInitDevice
+ */
+extern __host__ cudaError_t CUDARTAPI cudaChooseDevice(int *device, const struct cudaDeviceProp *prop);
+/**
+ * \brief Initialize device to be used for GPU executions
+ *
+ * This function will initialize the CUDA Runtime structures and primary context on \p device when called,
+ * but the context will not be made current to \p device.
+ *
+ * When ::cudaInitDeviceFlagsAreValid is set in \p flags, deviceFlags are applied to the requested device.
+ * The values of deviceFlags match those of the flags parameters in ::cudaSetDeviceFlags. 
+ * The effect may be verified by ::cudaGetDeviceFlags.
+ *
+ * This function will return an error if the device is in ::cudaComputeModeExclusiveProcess
+ * and is occupied by another process or if the device is in ::cudaComputeModeProhibited.
+ *
+ * \param device - Device on which the runtime will initialize itself.
+ * \param deviceFlags - Parameters for device operation.
+ * \param flags - Flags for controlling the device initialization.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaGetDeviceProperties,
+ * ::cudaChooseDevice, ::cudaSetDevice,
+ * ::cuCtxSetCurrent
+ */
+extern __host__ cudaError_t CUDARTAPI cudaInitDevice(int device, unsigned int deviceFlags, unsigned int flags);
+/**
+ * \brief Set device to be used for GPU executions
+ *
+ * Sets \p device as the current device for the calling host thread.
+ * Valid device IDs are 0 to (::cudaGetDeviceCount() - 1).
+ *
+ * Any device memory subsequently allocated from this host thread
+ * using ::cudaMalloc(), ::cudaMallocPitch() or ::cudaMallocArray()
+ * will be physically resident on \p device.  Any host memory allocated
+ * from this host thread using ::cudaMallocHost() or ::cudaHostAlloc() 
+ * or ::cudaHostRegister() will have its lifetime associated  with
+ * \p device.  Any streams or events created from this host thread will 
+ * be associated with \p device.  Any kernels launched from this host
+ * thread using the <<<>>> operator or ::cudaLaunchKernel() will be executed
+ * on \p device.
+ *
+ * This call may be made from any host thread, to any device, and at 
+ * any time.  This function will do no synchronization with the previous 
+ * or new device, 
+ * and should only take significant time when it initializes the runtime's context state.
+ * This call will bind the primary context of the specified device to the calling thread and all the
+ * subsequent memory allocations, stream and event creations, and kernel launches
+ * will be associated with the primary context. 
+ * This function will also immediately initialize the runtime state on the primary context, 
+ * and the context will be current on \p device immediately. This function will return an 
+ * error if the device is in ::cudaComputeModeExclusiveProcess and is occupied by another 
+ * process or if the device is in ::cudaComputeModeProhibited.
+ * 
+ * It is not required to call ::cudaInitDevice before using this function.
+ * \param device - Device on which the active host thread should execute the
+ * device code.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorDeviceUnavailable
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaGetDeviceProperties,
+ * ::cudaChooseDevice,
+ * ::cudaInitDevice,
+ * ::cuCtxSetCurrent
+ */
+extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device);
+
+/**
+ * \brief Returns which device is currently being used
+ *
+ * Returns in \p *device the current device for the calling host thread.
+ *
+ * \param device - Returns the device on which the active host thread
+ * executes the device code.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorDeviceUnavailable
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaSetDevice, ::cudaGetDeviceProperties,
+ * ::cudaChooseDevice,
+ * ::cuCtxGetCurrent
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device);
+
+/**
+ * \brief Set a list of devices that can be used for CUDA
+ *
+ * Sets a list of devices for CUDA execution in priority order using
+ * \p device_arr. The parameter \p len specifies the number of elements in the
+ * list.  CUDA will try devices from the list sequentially until it finds one
+ * that works.  If this function is not called, or if it is called with a \p len
+ * of 0, then CUDA will go back to its default behavior of trying devices
+ * sequentially from a default list containing all of the available CUDA
+ * devices in the system. If a specified device ID in the list does not exist,
+ * this function will return ::cudaErrorInvalidDevice. If \p len is not 0 and
+ * \p device_arr is NULL or if \p len exceeds the number of devices in
+ * the system, then ::cudaErrorInvalidValue is returned.
+ *
+ * \param device_arr - List of devices to try
+ * \param len        - Number of devices in specified list
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaSetDevice, ::cudaGetDeviceProperties,
+ * ::cudaSetDeviceFlags,
+ * ::cudaChooseDevice
+ */
+extern __host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr, int len);
+
+/**
+ * \brief Sets flags to be used for device executions
+ * 
+ * Records \p flags as the flags for the current device. If the current device
+ * has been set and that device has already been initialized, the previous flags
+ * are overwritten. If the current device has not been initialized, it is
+ * initialized with the provided flags. If no device has been made current to
+ * the calling thread, a default device is selected and initialized with the
+ * provided flags.
+ * 
+ * The three LSBs of the \p flags parameter can be used to control how the CPU
+ * thread interacts with the OS scheduler when waiting for results from the
+ * device.
+ *
+ * - ::cudaDeviceScheduleAuto: The default value if the \p flags parameter is
+ * zero, uses a heuristic based on the number of active CUDA contexts in the
+ * process \p C and the number of logical processors in the system \p P. If
+ * \p C \> \p P, then CUDA will yield to other OS threads when waiting for the
+ * device, otherwise CUDA will not yield while waiting for results and
+ * actively spin on the processor. Additionally, on Tegra devices,
+ * ::cudaDeviceScheduleAuto uses a heuristic based on the power profile of
+ * the platform and may choose ::cudaDeviceScheduleBlockingSync for low-powered
+ * devices.
+ * - ::cudaDeviceScheduleSpin: Instruct CUDA to actively spin when waiting for
+ * results from the device. This can decrease latency when waiting for the
+ * device, but may lower the performance of CPU threads if they are performing
+ * work in parallel with the CUDA thread.
+ * - ::cudaDeviceScheduleYield: Instruct CUDA to yield its thread when waiting
+ * for results from the device. This can increase latency when waiting for the
+ * device, but can increase the performance of CPU threads performing work in
+ * parallel with the device.
+ * - ::cudaDeviceScheduleBlockingSync: Instruct CUDA to block the CPU thread 
+ * on a synchronization primitive when waiting for the device to finish work.
+ * - ::cudaDeviceBlockingSync: Instruct CUDA to block the CPU thread on a 
+ * synchronization primitive when waiting for the device to finish work. <br>
+ * \ref deprecated "Deprecated:" This flag was deprecated as of CUDA 4.0 and
+ * replaced with ::cudaDeviceScheduleBlockingSync.
+ * - ::cudaDeviceMapHost: This flag enables allocating pinned
+ * host memory that is accessible to the device. It is implicit for the
+ * runtime but may be absent if a context is created using the driver API.
+ * If this flag is not set, ::cudaHostGetDevicePointer() will always return
+ * a failure code.
+ * - ::cudaDeviceLmemResizeToMax: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage. <br>
+ * \ref deprecated "Deprecated:" This flag is deprecated and the behavior enabled
+ * by this flag is now the default and cannot be disabled.
+ * - ::cudaDeviceSyncMemops: Ensures that synchronous memory operations initiated
+ * on this context will always synchronize. See further documentation in the
+ * section titled "API Synchronization behavior" to learn more about cases when
+ * synchronous memory operations can exhibit asynchronous behavior.
+ *
+ * \param flags - Parameters for device operation
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceFlags, ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaGetDeviceProperties,
+ * ::cudaSetDevice, ::cudaSetValidDevices,
+ * ::cudaInitDevice,
+ * ::cudaChooseDevice,
+ * ::cuDevicePrimaryCtxSetFlags
+ */
+extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags( unsigned int flags );
+
+/**
+ * \brief Gets the flags for the current device
+ *
+ * 
+ * Returns in \p flags the flags for the current device. If there is a current
+ * device for the calling thread, the flags for the device are returned. If
+ * there is no current device, the flags for the first device are returned,
+ * which may be the default flags.  Compare to the behavior of
+ * ::cudaSetDeviceFlags.
+ *
+ * Typically, the flags returned should match the behavior that will be seen
+ * if the calling thread uses a device after this call, without any change to
+ * the flags or current device in between by this or another thread.  Note that
+ * if the device is not initialized, it is possible for another thread to
+ * change the flags for the current device before it is initialized.
+ * Additionally, when using exclusive mode, if this thread has not requested a
+ * specific device, it may use a device other than the first device, contrary
+ * to the assumption made by this function.
+ *
+ * If a context has been created via the driver API and is current to the
+ * calling thread, the flags for that context are always returned.
+ *
+ * Flags returned by this function may specifically include ::cudaDeviceMapHost
+ * even though it is not accepted by ::cudaSetDeviceFlags because it is
+ * implicit in runtime API flags.  The reason for this is that the current
+ * context may have been created via the driver API in which case the flag is
+ * not implicit and may be unset.
+ *
+ * \param flags - Pointer to store the device flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDevice, ::cudaGetDeviceProperties,
+ * ::cudaSetDevice, ::cudaSetDeviceFlags,
+ * ::cudaInitDevice,
+ * ::cuCtxGetFlags,
+ * ::cuDevicePrimaryCtxGetState
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetDeviceFlags( unsigned int *flags );
+/** @} */ /* END CUDART_DEVICE */
+
+/**
+ * \defgroup CUDART_STREAM Stream Management
+ *
+ * ___MANBRIEF___ stream management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the stream management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Create an asynchronous stream
+ *
+ * Creates a new asynchronous stream on the context that is current to the calling host thread.
+ * If no context is current to the calling host thread, then the primary context for a device
+ * is selected, made current to the calling thread, and initialized before creating a stream on it.
+ *
+ * \param pStream - Pointer to new stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreateWithPriority,
+ * ::cudaStreamCreateWithFlags,
+ * ::cudaStreamGetPriority,
+ * ::cudaStreamGetFlags,
+ * ::cudaStreamGetDevice,
+ * ::cudaStreamQuery,
+ * ::cudaStreamSynchronize,
+ * ::cudaStreamWaitEvent,
+ * ::cudaStreamAddCallback,
+ * ::cudaSetDevice,
+ * ::cudaStreamDestroy,
+ * ::cuStreamCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream);
+
+/**
+ * \brief Create an asynchronous stream
+ *
+ * Creates a new asynchronous stream on the context that is current to the calling host thread.
+ * If no context is current to the calling host thread, then the primary context for a device
+ * is selected, made current to the calling thread, and initialized before creating a stream on it.
+ * The \p flags argument determines the behaviors of the stream.  Valid values for \p flags are
+ * - ::cudaStreamDefault: Default stream creation flag.
+ * - ::cudaStreamNonBlocking: Specifies that work running in the created 
+ *   stream may run concurrently with work in stream 0 (the NULL stream), and that
+ *   the created stream should perform no implicit synchronization with stream 0.
+ *
+ * \param pStream - Pointer to new stream identifier
+ * \param flags   - Parameters for stream creation
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate,
+ * ::cudaStreamCreateWithPriority,
+ * ::cudaStreamGetFlags,
+ * ::cudaStreamGetDevice,
+ * ::cudaStreamQuery,
+ * ::cudaStreamSynchronize,
+ * ::cudaStreamWaitEvent,
+ * ::cudaStreamAddCallback,
+ * ::cudaSetDevice,
+ * ::cudaStreamDestroy,
+ * ::cuStreamCreate
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
+
+/**
+ * \brief Create an asynchronous stream with the specified priority
+ *
+ * Creates a stream with the specified priority and returns a handle in \p pStream.
+ * The stream is created on the context that is current to the calling host thread.
+ * If no context is current to the calling host thread, then the primary context for a device
+ * is selected, made current to the calling thread, and initialized before creating a stream on it.
+ * This affects the scheduling priority of work in the stream. Priorities provide a
+ * hint to preferentially run work with higher priority when possible, but do
+ * not preempt already-running work or provide any other functional guarantee on
+ * execution order.
+ *
+ * \p priority follows a convention where lower numbers represent higher priorities.
+ * '0' represents default priority. The range of meaningful numerical priorities can
+ * be queried using ::cudaDeviceGetStreamPriorityRange. If the specified priority is
+ * outside the numerical range returned by ::cudaDeviceGetStreamPriorityRange,
+ * it will automatically be clamped to the lowest or the highest number in the range.
+ *
+ * \param pStream  - Pointer to new stream identifier
+ * \param flags    - Flags for stream creation. See ::cudaStreamCreateWithFlags for a list of valid flags that can be passed
+ * \param priority - Priority of the stream. Lower numbers represent higher priorities.
+ *                   See ::cudaDeviceGetStreamPriorityRange for more information about
+ *                   the meaningful stream priorities that can be passed.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \note Stream priorities are supported only on GPUs
+ * with compute capability 3.5 or higher.
+ *
+ * \note In the current implementation, only compute kernels launched in
+ * priority streams are affected by the stream's priority. Stream priorities have
+ * no effect on host-to-device and device-to-host memory operations.
+ *
+ * \sa ::cudaStreamCreate,
+ * ::cudaStreamCreateWithFlags,
+ * ::cudaDeviceGetStreamPriorityRange,
+ * ::cudaStreamGetPriority,
+ * ::cudaStreamQuery,
+ * ::cudaStreamWaitEvent,
+ * ::cudaStreamAddCallback,
+ * ::cudaStreamSynchronize,
+ * ::cudaSetDevice,
+ * ::cudaStreamDestroy,
+ * ::cuStreamCreateWithPriority
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags, int priority);
+
+/**
+ * \brief Query the priority of a stream
+ *
+ * Query the priority of a stream. The priority is returned in \p priority.
+ * Note that if the stream was created with a priority outside the meaningful
+ * numerical range returned by ::cudaDeviceGetStreamPriorityRange,
+ * this function returns the clamped priority.
+ * See ::cudaStreamCreateWithPriority for details about priority clamping.
+ *
+ * \param hStream    - Handle to the stream to be queried
+ * \param priority   - Pointer to a signed integer in which the stream's priority is returned
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreateWithPriority,
+ * ::cudaDeviceGetStreamPriorityRange,
+ * ::cudaStreamGetFlags,
+ * ::cudaStreamGetDevice,
+ * ::cuStreamGetPriority
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetPriority(cudaStream_t hStream, int *priority);
+
+/**
+ * \brief Query the flags of a stream
+ *
+ * Query the flags of a stream. The flags are returned in \p flags.
+ * See ::cudaStreamCreateWithFlags for a list of valid flags.
+ *
+ * \param hStream - Handle to the stream to be queried
+ * \param flags   - Pointer to an unsigned integer in which the stream's flags are returned
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreateWithPriority,
+ * ::cudaStreamCreateWithFlags,
+ * ::cudaStreamGetPriority,
+ * ::cudaStreamGetDevice,
+ * ::cuStreamGetFlags
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags);
+
+/**
+ * \brief Query the Id of a stream
+ *
+ * Query the Id of a stream. The Id is returned in \p streamId.
+ * The Id is unique for the life of the program.
+ *
+ * The stream handle \p hStream can refer to any of the following:
+ * <ul>
+ *   <li>a stream created via any of the CUDA runtime APIs such as ::cudaStreamCreate, 
+ *   ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority, or their driver 
+ *   API equivalents such as ::cuStreamCreate or ::cuStreamCreateWithPriority.
+ *   Passing an invalid handle will result in undefined behavior.</li>
+ *   <li>any of the special streams such as the NULL stream, ::cudaStreamLegacy 
+ *   and ::cudaStreamPerThread respectively.  The driver API equivalents of these 
+ *   are also accepted which are NULL, ::CU_STREAM_LEGACY and ::CU_STREAM_PER_THREAD.</li>
+ * </ul>
+ * 
+ * \param hStream    - Handle to the stream to be queried
+ * \param streamId   - Pointer to an unsigned long long in which the stream Id is returned
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreateWithPriority,
+ * ::cudaStreamCreateWithFlags,
+ * ::cudaStreamGetPriority,
+ * ::cudaStreamGetFlags,
+ * ::cuStreamGetId
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetId(cudaStream_t hStream, unsigned long long *streamId);
+
+/**
+ * \brief Query the device of a stream
+ *
+ * Returns in \p *device the device of the stream.
+ *
+ * \param hStream - Handle to the stream to be queried
+ * \param device - Returns the device to which the stream belongs
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorDeviceUnavailable
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaSetDevice,
+ * ::cudaGetDevice,
+ * ::cudaStreamCreate,
+ * ::cudaStreamGetPriority,
+ * ::cudaStreamGetFlags,
+ * ::cuStreamGetId
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetDevice(cudaStream_t hStream, int *device);
+
+/**
+ * \brief Resets all persisting lines in cache to normal status.
+ *
+ * Resets all persisting lines in cache to normal status.
+ * Takes effect on function return.
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaAccessPolicyWindow
+ */
+extern __host__ cudaError_t CUDARTAPI cudaCtxResetPersistingL2Cache(void);
+
+/**
+ * \brief Copies attributes from source stream to destination stream.
+ *
+ * Copies attributes from source stream \p src to destination stream \p dst.
+ * Both streams must have the same context.
+ *
+ * \param[out] dst Destination stream
+ * \param[in] src Source stream
+ * For attributes see ::cudaStreamAttrID
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotSupported
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaAccessPolicyWindow
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src);
+
+ /**
+ * \brief Queries stream attribute.
+ *
+ * Queries attribute \p attr from \p hStream and stores it in corresponding
+ * member of \p value_out.
+ *
+ * \param[in] hStream
+ * \param[in] attr
+ * \param[out] value_out
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaAccessPolicyWindow
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetAttribute(
+        cudaStream_t hStream, cudaStreamAttrID attr,
+        cudaStreamAttrValue *value_out);
+
+ /**
+ * \brief Sets stream attribute.
+ *
+ * Sets attribute \p attr on \p hStream from corresponding attribute of
+ * \p value. The updated attribute will be applied to subsequent work
+ * submitted to the stream. It will not affect previously submitted work.
+ *
+ * \param[out] hStream
+ * \param[in] attr
+ * \param[in] value
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaAccessPolicyWindow
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamSetAttribute(
+        cudaStream_t hStream, cudaStreamAttrID attr,
+        const cudaStreamAttrValue *value);
+
+ /**
+ * \brief Destroys and cleans up an asynchronous stream
+ *
+ * Destroys and cleans up the asynchronous stream specified by \p stream.
+ *
+ * In case the device is still doing work in the stream \p stream
+ * when ::cudaStreamDestroy() is called, the function will return immediately 
+ * and the resources associated with \p stream will be released automatically 
+ * once the device has completed all work in \p stream.
+ *
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa ::cudaStreamCreate,
+ * ::cudaStreamCreateWithFlags,
+ * ::cudaStreamQuery,
+ * ::cudaStreamWaitEvent,
+ * ::cudaStreamSynchronize,
+ * ::cudaStreamAddCallback,
+ * ::cuStreamDestroy
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream);
+
+/**
+ * \brief Make a compute stream wait on an event
+ *
+ * Makes all future work submitted to \p stream wait for all work captured in
+ * \p event.  See ::cudaEventRecord() for details on what is captured by an event.
+ * The synchronization will be performed efficiently on the device when applicable.
+ * \p event may be from a different device than \p stream.
+ *
+ * flags include:
+ * - ::cudaEventWaitDefault: Default event wait flag.
+ * - ::cudaEventWaitExternal: Event is captured in the graph as an external
+ *   event node when performing stream capture.
+ *
+ * \param stream - Stream to wait
+ * \param event  - Event to wait on
+ * \param flags  - Parameters for the operation (see above)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy,
+ * ::cuStreamWaitEvent
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags __dv(0));
+
+/**
+ * Type of stream callback functions.
+ * \param stream The stream as passed to ::cudaStreamAddCallback, may be NULL.
+ * \param status ::cudaSuccess or any persistent error on the stream.
+ * \param userData User parameter provided at registration.
+ */
+typedef void (CUDART_CB *cudaStreamCallback_t)(cudaStream_t stream, cudaError_t status, void *userData);
+
+/**
+ * \brief Add a callback to a compute stream
+ *
+ * \note This function is slated for eventual deprecation and removal. If
+ * you do not require the callback to execute in case of a device error,
+ * consider using ::cudaLaunchHostFunc. Additionally, this function is not
+ * supported with ::cudaStreamBeginCapture and ::cudaStreamEndCapture, unlike
+ * ::cudaLaunchHostFunc.
+ *
+ * Adds a callback to be called on the host after all currently enqueued
+ * items in the stream have completed.  For each 
+ * cudaStreamAddCallback call, a callback will be executed exactly once.
+ * The callback will block later work in the stream until it is finished.
+ *
+ * The callback may be passed ::cudaSuccess or an error code.  In the event
+ * of a device error, all subsequently executed callbacks will receive an
+ * appropriate ::cudaError_t.
+ *
+ * Callbacks must not make any CUDA API calls.  Attempting to use CUDA APIs
+ * may result in ::cudaErrorNotPermitted.  Callbacks must not perform any
+ * synchronization that may depend on outstanding device work or other callbacks
+ * that are not mandated to run earlier.  Callbacks without a mandated order
+ * (in independent streams) execute in undefined order and may be serialized.
+ *
+ * For the purposes of Unified Memory, callback execution makes a number of
+ * guarantees:
+ * <ul>
+ *   <li>The callback stream is considered idle for the duration of the
+ *   callback.  Thus, for example, a callback may always use memory attached
+ *   to the callback stream.</li>
+ *   <li>The start of execution of a callback has the same effect as
+ *   synchronizing an event recorded in the same stream immediately prior to
+ *   the callback.  It thus synchronizes streams which have been "joined"
+ *   prior to the callback.</li>
+ *   <li>Adding device work to any stream does not have the effect of making
+ *   the stream active until all preceding callbacks have executed.  Thus, for
+ *   example, a callback might use global attached memory even if work has
+ *   been added to another stream, if it has been properly ordered with an
+ *   event.</li>
+ *   <li>Completion of a callback does not cause a stream to become
+ *   active except as described above.  The callback stream will remain idle
+ *   if no device work follows the callback, and will remain idle across
+ *   consecutive callbacks without device work in between.  Thus, for example,
+ *   stream synchronization can be done by signaling from a callback at the
+ *   end of the stream.</li>
+ * </ul>
+ *
+ * \param stream   - Stream to add callback to
+ * \param callback - The function to call once preceding stream operations are complete
+ * \param userData - User specified data to be passed to the callback function
+ * \param flags    - Reserved for future use, must be 0
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamSynchronize, ::cudaStreamWaitEvent, ::cudaStreamDestroy, ::cudaMallocManaged, ::cudaStreamAttachMemAsync,
+ * ::cudaLaunchHostFunc, ::cuStreamAddCallback
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamAddCallback(cudaStream_t stream,
+        cudaStreamCallback_t callback, void *userData, unsigned int flags);
+
+/**
+ * \brief Waits for stream tasks to complete
+ *
+ * Blocks until \p stream has completed all operations. If the
+ * ::cudaDeviceScheduleBlockingSync flag was set for this device, 
+ * the host thread will block until the stream is finished with 
+ * all of its tasks.
+ *
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamWaitEvent, ::cudaStreamAddCallback, ::cudaStreamDestroy,
+ * ::cuStreamSynchronize
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream);
+
+/**
+ * \brief Queries an asynchronous stream for completion status
+ *
+ * Returns ::cudaSuccess if all operations in \p stream have
+ * completed, or ::cudaErrorNotReady if not.
+ *
+ * For the purposes of Unified Memory, a return value of ::cudaSuccess
+ * is equivalent to having called ::cudaStreamSynchronize().
+ *
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotReady,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy,
+ * ::cuStreamQuery
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream);
+
+/**
+ * \brief Attach memory to a stream asynchronously
+ *
+ * Enqueues an operation in \p stream to specify stream association of
+ * \p length bytes of memory starting from \p devPtr. This function is a
+ * stream-ordered operation, meaning that it is dependent on, and will
+ * only take effect when, previous work in stream has completed. Any
+ * previous association is automatically replaced.
+ *
+ * \p devPtr must point to one of the following types of memory:
+ * - managed memory declared using the __managed__ keyword or allocated with
+ *   ::cudaMallocManaged.
+ * - a valid host-accessible region of system-allocated pageable memory. This
+ *   type of memory may only be specified if the device associated with the
+ *   stream reports a non-zero value for the device attribute
+ *   ::cudaDevAttrPageableMemoryAccess.
+ *
+ * For managed allocations, \p length must be either zero or the entire
+ * allocation's size. Both indicate that the entire allocation's stream
+ * association is being changed. Currently, it is not possible to change stream
+ * association for a portion of a managed allocation.
+ *
+ * For pageable allocations, \p length must be non-zero.
+ *
+ * The stream association is specified using \p flags which must be
+ * one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle.
+ * In C++, the default value for \p flags is ::cudaMemAttachSingle.
+ * If the ::cudaMemAttachGlobal flag is specified, the memory can be accessed
+ * by any stream on any device.
+ * If the ::cudaMemAttachHost flag is specified, the program makes a guarantee
+ * that it won't access the memory on the device from any stream on a device that
+ * has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * If the ::cudaMemAttachSingle flag is specified and \p stream is associated with
+ * a device that has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess,
+ * the program makes a guarantee that it will only access the memory on the device
+ * from \p stream. It is illegal to attach singly to the NULL stream, because the
+ * NULL stream is a virtual global stream and not a specific stream. An error will
+ * be returned in this case.
+ *
+ * When memory is associated with a single stream, the Unified Memory system will
+ * allow CPU access to this memory region so long as all operations in \p stream
+ * have completed, regardless of whether other streams are active. In effect,
+ * this constrains exclusive ownership of the managed memory region by
+ * an active GPU to per-stream activity instead of whole-GPU activity.
+ *
+ * Accessing memory on the device from streams that are not associated with
+ * it will produce undefined results. No error checking is performed by the
+ * Unified Memory system to ensure that kernels launched into other streams
+ * do not access this region. 
+ *
+ * It is a program's responsibility to order calls to ::cudaStreamAttachMemAsync
+ * via events, synchronization or other means to ensure legal access to memory
+ * at all times. Data visibility and coherency will be changed appropriately
+ * for all kernels which follow a stream-association change.
+ *
+ * If \p stream is destroyed while data is associated with it, the association is
+ * removed and the association reverts to the default visibility of the allocation
+ * as specified at ::cudaMallocManaged. For __managed__ variables, the default
+ * association is always ::cudaMemAttachGlobal. Note that destroying a stream is an
+ * asynchronous operation, and as a result, the change to default association won't
+ * happen until all work in the stream has completed.
+ *
+ * \param stream  - Stream in which to enqueue the attach operation
+ * \param devPtr  - Pointer to memory (must be a pointer to managed memory or
+ *                  to a valid host-accessible region of system-allocated
+ *                  memory)
+ * \param length  - Length of memory (defaults to zero)
+ * \param flags   - Must be one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle (defaults to ::cudaMemAttachSingle)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotReady,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, ::cudaMallocManaged,
+ * ::cuStreamAttachMemAsync
+ */
+#if defined(__cplusplus)
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(0), unsigned int flags = cudaMemAttachSingle);
+#else
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(0), unsigned int flags);
+#endif
+
+/**
+ * \brief Begins graph capture on a stream
+ *
+ * Begin graph capture on \p stream. When a stream is in capture mode, all operations
+ * pushed into the stream will not be executed, but will instead be captured into
+ * a graph, which will be returned via ::cudaStreamEndCapture. Capture may not be initiated
+ * if \p stream is ::cudaStreamLegacy. Capture must be ended on the same stream in which
+ * it was initiated, and it may only be initiated if the stream is not already in capture
+ * mode. The capture mode may be queried via ::cudaStreamIsCapturing. A unique id
+ * representing the capture sequence may be queried via ::cudaStreamGetCaptureInfo.
+ *
+ * If \p mode is not ::cudaStreamCaptureModeRelaxed, ::cudaStreamEndCapture must be
+ * called on this stream from the same thread.
+ *
+ * \note Kernels captured using this API must not use texture and surface references.
+ *       Reading or writing through any texture or surface reference is undefined
+ *       behavior. This restriction does not apply to texture and surface objects.
+ *
+ * \param stream - Stream in which to initiate capture
+ * \param mode    - Controls the interaction of this capture sequence with other API
+ *                  calls that are potentially unsafe. For more details see
+ *                  ::cudaThreadExchangeStreamCaptureMode.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamCreate,
+ * ::cudaStreamIsCapturing,
+ * ::cudaStreamEndCapture,
+ * ::cudaThreadExchangeStreamCaptureMode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamBeginCapture(cudaStream_t stream, enum cudaStreamCaptureMode mode);
+
+/**
+ * \brief Begins graph capture on a stream to an existing graph
+ *
+ * Begin graph capture on \p stream. When a stream is in capture mode, all operations
+ * pushed into the stream will not be executed, but will instead be captured into
+ * \p graph, which will be returned via ::cudaStreamEndCapture.
+ *
+ * Capture may not be initiated if \p stream is ::cudaStreamLegacy. Capture must be ended on the
+ * same stream in which it was initiated, and it may only be initiated if the stream is not
+ * already in capture mode. The capture mode may be queried via ::cudaStreamIsCapturing. A unique id
+ * representing the capture sequence may be queried via ::cudaStreamGetCaptureInfo.
+ *
+ * If \p mode is not ::cudaStreamCaptureModeRelaxed, ::cudaStreamEndCapture must be
+ * called on this stream from the same thread.
+ *
+ * \note Kernels captured using this API must not use texture and surface references.
+ *       Reading or writing through any texture or surface reference is undefined
+ *       behavior. This restriction does not apply to texture and surface objects.
+ *
+ * \param stream          - Stream in which to initiate capture.
+ * \param graph           - Graph to capture into.
+ * \param dependencies    - Dependencies of the first node captured in the stream.  Can be NULL if numDependencies is 0.
+ * \param dependencyData  - Optional array of data associated with each dependency.
+ * \param numDependencies - Number of dependencies.
+ * \param mode            - Controls the interaction of this capture sequence with other API
+ *                          calls that are potentially unsafe. For more details see
+ *                          ::cudaThreadExchangeStreamCaptureMode.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamCreate,
+ * ::cudaStreamIsCapturing,
+ * ::cudaStreamEndCapture,
+ * ::cudaThreadExchangeStreamCaptureMode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamBeginCaptureToGraph(cudaStream_t stream, cudaGraph_t graph, const cudaGraphNode_t *dependencies, const cudaGraphEdgeData *dependencyData, size_t numDependencies, enum cudaStreamCaptureMode mode);
+
+/**
+ * \brief Swaps the stream capture interaction mode for a thread
+ *
+ * Sets the calling thread's stream capture interaction mode to the value contained
+ * in \p *mode, and overwrites \p *mode with the previous mode for the thread. To
+ * facilitate deterministic behavior across function or module boundaries, callers
+ * are encouraged to use this API in a push-pop fashion: \code
+     cudaStreamCaptureMode mode = desiredMode;
+     cudaThreadExchangeStreamCaptureMode(&mode);
+     ...
+     cudaThreadExchangeStreamCaptureMode(&mode); // restore previous mode
+ * \endcode
+ *
+ * During stream capture (see ::cudaStreamBeginCapture), some actions, such as a call
+ * to ::cudaMalloc, may be unsafe. In the case of ::cudaMalloc, the operation is
+ * not enqueued asynchronously to a stream, and is not observed by stream capture.
+ * Therefore, if the sequence of operations captured via ::cudaStreamBeginCapture
+ * depended on the allocation being replayed whenever the graph is launched, the
+ * captured graph would be invalid.
+ *
+ * Therefore, stream capture places restrictions on API calls that can be made within
+ * or concurrently to a ::cudaStreamBeginCapture-::cudaStreamEndCapture sequence. This
+ * behavior can be controlled via this API and flags to ::cudaStreamBeginCapture.
+ *
+ * A thread's mode is one of the following:
+ * - \p cudaStreamCaptureModeGlobal: This is the default mode. If the local thread has
+ *   an ongoing capture sequence that was not initiated with
+ *   \p cudaStreamCaptureModeRelaxed at ::cudaStreamBeginCapture, or if any other thread
+ *   has a concurrent capture sequence initiated with \p cudaStreamCaptureModeGlobal,
+ *   this thread is prohibited from potentially unsafe API calls.
+ * - \p cudaStreamCaptureModeThreadLocal: If the local thread has an ongoing capture
+ *   sequence not initiated with \p cudaStreamCaptureModeRelaxed, it is prohibited
+ *   from potentially unsafe API calls. Concurrent capture sequences in other threads
+ *   are ignored.
+ * - \p cudaStreamCaptureModeRelaxed: The local thread is not prohibited from potentially
+ *   unsafe API calls. Note that the thread is still prohibited from API calls which
+ *   necessarily conflict with stream capture, for example, attempting ::cudaEventQuery
+ *   on an event that was last recorded inside a capture sequence.
+ *
+ * \param mode - Pointer to mode value to swap with the current mode
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamBeginCapture
+ */
+extern __host__ cudaError_t CUDARTAPI cudaThreadExchangeStreamCaptureMode(enum cudaStreamCaptureMode *mode);
+
+/**
+ * \brief Ends capture on a stream, returning the captured graph
+ *
+ * End capture on \p stream, returning the captured graph via \p pGraph.
+ * Capture must have been initiated on \p stream via a call to ::cudaStreamBeginCapture.
+ * If capture was invalidated, due to a violation of the rules of stream capture, then
+ * a NULL graph will be returned.
+ *
+ * If the \p mode argument to ::cudaStreamBeginCapture was not
+ * ::cudaStreamCaptureModeRelaxed, this call must be from the same thread as
+ * ::cudaStreamBeginCapture.
+ *
+ * \param stream - Stream on which to end capture
+ * \param pGraph - The captured graph
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorStreamCaptureWrongThread
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamCreate,
+ * ::cudaStreamBeginCapture,
+ * ::cudaStreamIsCapturing,
+ * ::cudaGraphDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t *pGraph);
+
+/**
+ * \brief Returns a stream's capture status
+ *
+ * Return the capture status of \p stream via \p pCaptureStatus. After a successful
+ * call, \p *pCaptureStatus will contain one of the following:
+ * - ::cudaStreamCaptureStatusNone: The stream is not capturing.
+ * - ::cudaStreamCaptureStatusActive: The stream is capturing.
+ * - ::cudaStreamCaptureStatusInvalidated: The stream was capturing but an error
+ *   has invalidated the capture sequence. The capture sequence must be terminated
+ *   with ::cudaStreamEndCapture on the stream where it was initiated in order to
+ *   continue using \p stream.
+ *
+ * Note that, if this is called on ::cudaStreamLegacy (the "null stream") while
+ * a blocking stream on the same device is capturing, it will return
+ * ::cudaErrorStreamCaptureImplicit and \p *pCaptureStatus is unspecified
+ * after the call. The blocking stream capture is not invalidated.
+ *
+ * When a blocking stream is capturing, the legacy stream is in an
+ * unusable state until the blocking stream capture is terminated. The legacy
+ * stream is not supported for stream capture, but attempted use would have an
+ * implicit dependency on the capturing stream(s).
+ *
+ * \param stream         - Stream to query
+ * \param pCaptureStatus - Returns the stream's capture status
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorStreamCaptureImplicit
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamCreate,
+ * ::cudaStreamBeginCapture,
+ * ::cudaStreamEndCapture
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamIsCapturing(cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus);
+
+
+/**
+ * \brief Query a stream's capture state
+ *
+ * Query stream state related to stream capture.
+ *
+ * If called on ::cudaStreamLegacy (the "null stream") while a stream not created 
+ * with ::cudaStreamNonBlocking is capturing, returns ::cudaErrorStreamCaptureImplicit.
+ *
+ * Valid data (other than capture status) is returned only if both of the following are true:
+ * - the call returns ::cudaSuccess
+ * - the returned capture status is ::cudaStreamCaptureStatusActive
+ *
+ * \param stream - The stream to query
+ * \param captureStatus_out - Location to return the capture status of the stream; required
+ * \param id_out - Optional location to return an id for the capture sequence, which is
+ *           unique over the lifetime of the process
+ * \param graph_out - Optional location to return the graph being captured into. All
+ *           operations other than destroy and node removal are permitted on the graph
+ *           while the capture sequence is in progress. This API does not transfer
+ *           ownership of the graph, which is transferred or destroyed at
+ *           ::cudaStreamEndCapture. Note that the graph handle may be invalidated before
+ *           end of capture for certain errors. Nodes that are or become
+ *           unreachable from the original stream at ::cudaStreamEndCapture due to direct
+ *           actions on the graph do not trigger ::cudaErrorStreamCaptureUnjoined.
+ * \param dependencies_out - Optional location to store a pointer to an array of nodes.
+ *           The next node to be captured in the stream will depend on this set of nodes,
+ *           absent operations such as event wait which modify this set. The array pointer
+ *           is valid until the next API call which operates on the stream or until the
+ *           capture is terminated. The node handles may be copied out and are valid until
+ *           they or the graph is destroyed. The driver-owned array may also be passed
+ *           directly to APIs that operate on the graph (not the stream) without copying.
+ * \param numDependencies_out - Optional location to store the size of the array
+ *           returned in dependencies_out.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorStreamCaptureImplicit
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamGetCaptureInfo_v3,
+ * ::cudaStreamBeginCapture,
+ * ::cudaStreamIsCapturing,
+ * ::cudaStreamUpdateCaptureDependencies
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo(cudaStream_t stream, enum cudaStreamCaptureStatus *captureStatus_out, unsigned long long *id_out __dv(0), cudaGraph_t *graph_out __dv(0), const cudaGraphNode_t **dependencies_out __dv(0), size_t *numDependencies_out __dv(0));
+
+/**
+ * \brief Query a stream's capture state (12.3+)
+ *
+ * Query stream state related to stream capture.
+ *
+ * If called on ::cudaStreamLegacy (the "null stream") while a stream not created 
+ * with ::cudaStreamNonBlocking is capturing, returns ::cudaErrorStreamCaptureImplicit.
+ *
+ * Valid data (other than capture status) is returned only if both of the following are true:
+ * - the call returns ::cudaSuccess
+ * - the returned capture status is ::cudaStreamCaptureStatusActive
+ *
+ * If \p edgeData_out is non-NULL then \p dependencies_out must be as well. If
+ * \p dependencies_out is non-NULL and \p edgeData_out is NULL, but there is non-zero edge
+ * data for one or more of the current stream dependencies, the call will return
+ * ::cudaErrorLossyQuery.
+ *
+ * \param stream - The stream to query
+ * \param captureStatus_out - Location to return the capture status of the stream; required
+ * \param id_out - Optional location to return an id for the capture sequence, which is
+ *           unique over the lifetime of the process
+ * \param graph_out - Optional location to return the graph being captured into. All
+ *           operations other than destroy and node removal are permitted on the graph
+ *           while the capture sequence is in progress. This API does not transfer
+ *           ownership of the graph, which is transferred or destroyed at
+ *           ::cudaStreamEndCapture. Note that the graph handle may be invalidated before
+ *           end of capture for certain errors. Nodes that are or become
+ *           unreachable from the original stream at ::cudaStreamEndCapture due to direct
+ *           actions on the graph do not trigger ::cudaErrorStreamCaptureUnjoined.
+ * \param dependencies_out - Optional location to store a pointer to an array of nodes.
+ *           The next node to be captured in the stream will depend on this set of nodes,
+ *           absent operations such as event wait which modify this set. The array pointer
+ *           is valid until the next API call which operates on the stream or until the
+ *           capture is terminated. The node handles may be copied out and are valid until
+ *           they or the graph is destroyed. The driver-owned array may also be passed
+ *           directly to APIs that operate on the graph (not the stream) without copying.
+ * \param edgeData_out - Optional location to store a pointer to an array of graph edge
+ *           data. This array parallels \c dependencies_out; the next node to be added
+ *           has an edge to \c dependencies_out[i] with annotation \c edgeData_out[i] for
+ *           each \c i. The array pointer is valid until the next API call which operates
+ *           on the stream or until the capture is terminated.
+ * \param numDependencies_out - Optional location to store the size of the array
+ *           returned in dependencies_out.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorStreamCaptureImplicit,
+ * ::cudaErrorLossyQuery
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamBeginCapture,
+ * ::cudaStreamIsCapturing,
+ * ::cudaStreamUpdateCaptureDependencies
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo_v3(cudaStream_t stream,
+    enum cudaStreamCaptureStatus *captureStatus_out, unsigned long long *id_out __dv(0),
+    cudaGraph_t *graph_out __dv(0), const cudaGraphNode_t **dependencies_out __dv(0),
+    const cudaGraphEdgeData **edgeData_out __dv(0), size_t *numDependencies_out __dv(0));
+
+/**
+ * \brief Update the set of dependencies in a capturing stream (11.3+)
+ *
+ * Modifies the dependency set of a capturing stream. The dependency set is the set
+ * of nodes that the next captured node in the stream will depend on.
+ *
+ * Valid flags are ::cudaStreamAddCaptureDependencies and
+ * ::cudaStreamSetCaptureDependencies. These control whether the set passed to
+ * the API is added to the existing set or replaces it. A flags value of 0 defaults
+ * to ::cudaStreamAddCaptureDependencies.
+ *
+ * Nodes that are removed from the dependency set via this API do not result in
+ * ::cudaErrorStreamCaptureUnjoined if they are unreachable from the stream at
+ * ::cudaStreamEndCapture.
+ *
+ * Returns ::cudaErrorIllegalState if the stream is not capturing.
+ *
+ * This API is new in CUDA 11.3. Developers requiring compatibility across minor
+ * versions of the CUDA driver back to 11.0 should either avoid this API or provide a fallback.
+ *
+ * \param stream - The stream to update
+ * \param dependencies - The set of dependencies to add to, or replace, the existing set (see \p flags)
+ * \param numDependencies - The size of the dependencies array
+ * \param flags - See above
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorIllegalState
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamBeginCapture,
+ * ::cudaStreamGetCaptureInfo
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t *dependencies, size_t numDependencies, unsigned int flags __dv(0));
+
+/**
+ * \brief Update the set of dependencies in a capturing stream (12.3+)
+ *
+ * Modifies the dependency set of a capturing stream. The dependency set is the set
+ * of nodes that the next captured node in the stream will depend on.
+ *
+ * Valid flags are ::cudaStreamAddCaptureDependencies and
+ * ::cudaStreamSetCaptureDependencies. These control whether the set passed to
+ * the API is added to the existing set or replaces it. A flags value of 0 defaults
+ * to ::cudaStreamAddCaptureDependencies.
+ *
+ * Nodes that are removed from the dependency set via this API do not result in
+ * ::cudaErrorStreamCaptureUnjoined if they are unreachable from the stream at
+ * ::cudaStreamEndCapture.
+ *
+ * Returns ::cudaErrorIllegalState if the stream is not capturing.
+ *
+ * \param stream - The stream to update
+ * \param dependencies - The set of dependencies to add to, or replace, the existing set (see \p flags)
+ * \param dependencyData - Optional array of data associated with each dependency.
+ * \param numDependencies - The size of the dependencies array
+ * \param flags - See above
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorIllegalState
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamBeginCapture,
+ * ::cudaStreamGetCaptureInfo
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamUpdateCaptureDependencies_v2(cudaStream_t stream, cudaGraphNode_t *dependencies, const cudaGraphEdgeData *dependencyData, size_t numDependencies, unsigned int flags __dv(0));
+/** @} */ /* END CUDART_STREAM */
+
+/**
+ * \defgroup CUDART_EVENT Event Management
+ *
+ * ___MANBRIEF___ event management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the event management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates an event object
+ *
+ * Creates an event object for the current device using ::cudaEventDefault.
+ *
+ * \param event - Newly created event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*, unsigned int) "cudaEventCreate (C++ API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventRecord, ::cudaEventQuery,
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cudaStreamWaitEvent,
+ * ::cuEventCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event);
+
+/**
+ * \brief Creates an event object with the specified flags
+ *
+ * Creates an event object for the current device with the specified flags. Valid
+ * flags include:
+ * - ::cudaEventDefault: Default event creation flag.
+ * - ::cudaEventBlockingSync: Specifies that event should use blocking
+ *   synchronization. A host thread that uses ::cudaEventSynchronize() to wait
+ *   on an event created with this flag will block until the event actually
+ *   completes.
+ * - ::cudaEventDisableTiming: Specifies that the created event does not need
+ *   to record timing data.  Events created with this flag specified and
+ *   the ::cudaEventBlockingSync flag not specified will provide the best
+ *   performance when used with ::cudaStreamWaitEvent() and ::cudaEventQuery().
+ * - ::cudaEventInterprocess: Specifies that the created event may be used as an
+ *   interprocess event by ::cudaIpcGetEventHandle(). ::cudaEventInterprocess must
+ *   be specified along with ::cudaEventDisableTiming.
+ *
+ * \param event - Newly created event
+ * \param flags - Flags for new event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cudaStreamWaitEvent,
+ * ::cuEventCreate
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
+
+/**
+ * \brief Records an event
+ *
+ * Captures in \p event the contents of \p stream at the time of this call.
+ * \p event and \p stream must be on the same CUDA context.
+ * Calls such as ::cudaEventQuery() or ::cudaStreamWaitEvent() will then
+ * examine or wait for completion of the work that was captured. Uses of
+ * \p stream after this call do not modify \p event. See note on default
+ * stream behavior for what is captured in the default case.
+ *
+ * ::cudaEventRecord() can be called multiple times on the same event and
+ * will overwrite the previously captured state. Other APIs such as
+ * ::cudaStreamWaitEvent() use the most recently captured state at the time
+ * of the API call, and are not affected by later calls to
+ * ::cudaEventRecord(). Before the first call to ::cudaEventRecord(), an
+ * event represents an empty set of work, so for example ::cudaEventQuery()
+ * would return ::cudaSuccess.
+ *
+ * \param event  - Event to record
+ * \param stream - Stream in which to record event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorLaunchFailure
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_null_event
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventQuery,
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cudaStreamWaitEvent,
+ * ::cudaEventRecordWithFlags,
+ * ::cuEventRecord
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Records an event
+ *
+ * Captures in \p event the contents of \p stream at the time of this call.
+ * \p event and \p stream must be on the same CUDA context.
+ * Calls such as ::cudaEventQuery() or ::cudaStreamWaitEvent() will then
+ * examine or wait for completion of the work that was captured. Uses of
+ * \p stream after this call do not modify \p event. See note on default
+ * stream behavior for what is captured in the default case.
+ *
+ * ::cudaEventRecordWithFlags() can be called multiple times on the same event and
+ * will overwrite the previously captured state. Other APIs such as
+ * ::cudaStreamWaitEvent() use the most recently captured state at the time
+ * of the API call, and are not affected by later calls to
+ * ::cudaEventRecordWithFlags(). Before the first call to ::cudaEventRecordWithFlags(), an
+ * event represents an empty set of work, so for example ::cudaEventQuery()
+ * would return ::cudaSuccess.
+ *
+ * flags include:
+ * - ::cudaEventRecordDefault: Default event record flag.
+ * - ::cudaEventRecordExternal: Event is captured in the graph as an external
+ *   event node when performing stream capture.
+ *
+ * \param event  - Event to record
+ * \param stream - Stream in which to record event
+ * \param flags  - Parameters for the operation (see above)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorLaunchFailure
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_null_event
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventQuery,
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cudaStreamWaitEvent,
+ * ::cudaEventRecord,
+ * ::cuEventRecord
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream __dv(0), unsigned int flags __dv(0));
+#endif
+
+/**
+ * \brief Queries an event's status
+ *
+ * Queries the status of all work currently captured by \p event. See
+ * ::cudaEventRecord() for details on what is captured by an event.
+ *
+ * Returns ::cudaSuccess if all captured work has been completed, or
+ * ::cudaErrorNotReady if any captured work is incomplete.
+ *
+ * For the purposes of Unified Memory, a return value of ::cudaSuccess
+ * is equivalent to having called ::cudaEventSynchronize().
+ *
+ * \param event - Event to query
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotReady,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorLaunchFailure
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_null_event
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventRecord,
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cuEventQuery
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event);
+
+/**
+ * \brief Waits for an event to complete
+ *
+ * Waits until the completion of all work currently captured in \p event.
+ * See ::cudaEventRecord() for details on what is captured by an event.
+ *
+ * Waiting for an event that was created with the ::cudaEventBlockingSync
+ * flag will cause the calling CPU thread to block until the event has
+ * been completed by the device.  If the ::cudaEventBlockingSync flag has
+ * not been set, then the CPU thread will busy-wait until the event has
+ * been completed by the device.
+ *
+ * \param event - Event to wait for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorLaunchFailure
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_null_event
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventRecord,
+ * ::cudaEventQuery, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cuEventSynchronize
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event);
+
+/**
+ * \brief Destroys an event object
+ *
+ * Destroys the event specified by \p event.
+ *
+ * An event may be destroyed before it is complete (i.e., while
+ * ::cudaEventQuery() would return ::cudaErrorNotReady). In this case, the
+ * call does not block on completion of the event, and any associated
+ * resources will automatically be released asynchronously at completion.
+ *
+ * \param event - Event to destroy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorLaunchFailure
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ * \note_null_event
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventQuery,
+ * ::cudaEventSynchronize, ::cudaEventRecord, ::cudaEventElapsedTime,
+ * ::cuEventDestroy
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
+
+/**
+ * \brief Computes the elapsed time between events
+ *
+ * Computes the elapsed time between two events (in milliseconds with a
+ * resolution of around 0.5 microseconds).
+ *
+ * If either event was last recorded in a non-NULL stream, the resulting time
+ * may be greater than expected (even if both used the same stream handle). This
+ * happens because the ::cudaEventRecord() operation takes place asynchronously
+ * and there is no guarantee that the measured latency is actually just between
+ * the two events. Any number of other different stream operations could execute
+ * in between the two measured events, thus altering the timing in a significant
+ * way.
+ *
+ * If ::cudaEventRecord() has not been called on either event, then
+ * ::cudaErrorInvalidResourceHandle is returned. If ::cudaEventRecord() has been
+ * called on both events but one or both of them has not yet been completed
+ * (that is, ::cudaEventQuery() would return ::cudaErrorNotReady on at least one
+ * of the events), ::cudaErrorNotReady is returned. If either event was created
+ * with the ::cudaEventDisableTiming flag, then this function will return
+ * ::cudaErrorInvalidResourceHandle.
+ *
+ * \param ms    - Time between \p start and \p end in ms
+ * \param start - Starting event
+ * \param end   - Ending event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotReady,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_null_event
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventQuery,
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventRecord,
+ * ::cuEventElapsedTime
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms, cudaEvent_t start, cudaEvent_t end);
+
+/**
+ * \brief Computes the elapsed time between events
+ *
+ * Computes the elapsed time between two events (in milliseconds with a
+ * resolution of around 0.5 microseconds). Note this API is not guaranteed
+ * to return the latest errors for pending work. As such this API is intended to
+ * serve as an elapsed time calculation only and polling for completion on the
+ * events to be compared should be done with ::cudaEventQuery instead.
+ *
+ * If either event was last recorded in a non-NULL stream, the resulting time
+ * may be greater than expected (even if both used the same stream handle). This
+ * happens because the ::cudaEventRecord() operation takes place asynchronously
+ * and there is no guarantee that the measured latency is actually just between
+ * the two events. Any number of other different stream operations could execute
+ * in between the two measured events, thus altering the timing in a significant
+ * way.
+ *
+ * If ::cudaEventRecord() has not been called on either event, then
+ * ::cudaErrorInvalidResourceHandle is returned. If ::cudaEventRecord() has been
+ * called on both events but one or both of them has not yet been completed
+ * (that is, ::cudaEventQuery() would return ::cudaErrorNotReady on at least one
+ * of the events), ::cudaErrorNotReady is returned. If either event was created
+ * with the ::cudaEventDisableTiming flag, then this function will return
+ * ::cudaErrorInvalidResourceHandle.
+ *
+ * \param ms    - Time between \p start and \p end in ms
+ * \param start - Starting event
+ * \param end   - Ending event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotReady,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_null_event
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventQuery,
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventRecord,
+ * ::cuEventElapsedTime
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime_v2(float *ms, cudaEvent_t start, cudaEvent_t end);
+
+/** @} */ /* END CUDART_EVENT */
+
+/**
+ * \defgroup CUDART_EXTRES_INTEROP External Resource Interoperability
+ *
+ * ___MANBRIEF___ External resource interoperability functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the external resource interoperability functions of the CUDA
+ * runtime application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Imports an external memory object
+ *
+ * Imports an externally allocated memory object and returns
+ * a handle to that in \p extMem_out.
+ *
+ * The properties of the handle being imported must be described in
+ * \p memHandleDesc. The ::cudaExternalMemoryHandleDesc structure
+ * is defined as follows:
+ *
+ * \code
+        typedef struct cudaExternalMemoryHandleDesc_st {
+            cudaExternalMemoryHandleType type;
+            union {
+                int fd;
+                struct {
+                    void *handle;
+                    const void *name;
+                } win32;
+                const void *nvSciBufObject;
+            } handle;
+            unsigned long long size;
+            unsigned int flags;
+        } cudaExternalMemoryHandleDesc;
+ * \endcode
+ *
+ * where ::cudaExternalMemoryHandleDesc::type specifies the type
+ * of handle being imported. ::cudaExternalMemoryHandleType is
+ * defined as:
+ *
+ * \code
+        typedef enum cudaExternalMemoryHandleType_enum {
+            cudaExternalMemoryHandleTypeOpaqueFd         = 1,
+            cudaExternalMemoryHandleTypeOpaqueWin32      = 2,
+            cudaExternalMemoryHandleTypeOpaqueWin32Kmt   = 3,
+            cudaExternalMemoryHandleTypeD3D12Heap        = 4,
+            cudaExternalMemoryHandleTypeD3D12Resource    = 5,
+            cudaExternalMemoryHandleTypeD3D11Resource    = 6,
+            cudaExternalMemoryHandleTypeD3D11ResourceKmt = 7,
+            cudaExternalMemoryHandleTypeNvSciBuf         = 8
+        } cudaExternalMemoryHandleType;
+ * \endcode
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeOpaqueFd, then
+ * ::cudaExternalMemoryHandleDesc::handle::fd must be a valid
+ * file descriptor referencing a memory object. Ownership of
+ * the file descriptor is transferred to the CUDA driver when the
+ * handle is imported successfully. Performing any operations on the
+ * file descriptor after it is imported results in undefined behavior.
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeOpaqueWin32, then exactly one
+ * of ::cudaExternalMemoryHandleDesc::handle::win32::handle and
+ * ::cudaExternalMemoryHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalMemoryHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * references a memory object. Ownership of this handle is
+ * not transferred to CUDA after the import operation, so the
+ * application must release the handle using the appropriate system
+ * call. If ::cudaExternalMemoryHandleDesc::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to a memory object.
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeOpaqueWin32Kmt, then
+ * ::cudaExternalMemoryHandleDesc::handle::win32::handle must
+ * be non-NULL and
+ * ::cudaExternalMemoryHandleDesc::handle::win32::name
+ * must be NULL. The handle specified must be a globally shared KMT
+ * handle. This handle does not hold a reference to the underlying
+ * object, and thus will be invalid when all references to the
+ * memory object are destroyed.
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeD3D12Heap, then exactly one
+ * of ::cudaExternalMemoryHandleDesc::handle::win32::handle and
+ * ::cudaExternalMemoryHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalMemoryHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by ID3D12Device::CreateSharedHandle when referring to a
+ * ID3D12Heap object. This handle holds a reference to the underlying
+ * object. If ::cudaExternalMemoryHandleDesc::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to a ID3D12Heap object.
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeD3D12Resource, then exactly one
+ * of ::cudaExternalMemoryHandleDesc::handle::win32::handle and
+ * ::cudaExternalMemoryHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalMemoryHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by ID3D12Device::CreateSharedHandle when referring to a
+ * ID3D12Resource object. This handle holds a reference to the
+ * underlying object. If
+ * ::cudaExternalMemoryHandleDesc::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to a ID3D12Resource object.
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeD3D11Resource, then exactly one
+ * of ::cudaExternalMemoryHandleDesc::handle::win32::handle and
+ * ::cudaExternalMemoryHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalMemoryHandleDesc::handle::win32::handle is
+ * not NULL, then it must represent a valid shared NT handle that is
+ * returned by IDXGIResource1::CreateSharedHandle when referring to a
+ * ID3D11Resource object. If
+ * ::cudaExternalMemoryHandleDesc::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to a ID3D11Resource object.
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeD3D11ResourceKmt, then
+ * ::cudaExternalMemoryHandleDesc::handle::win32::handle must
+ * be non-NULL and ::cudaExternalMemoryHandleDesc::handle::win32::name
+ * must be NULL. The handle specified must be a valid shared KMT
+ * handle that is returned by IDXGIResource::GetSharedHandle when
+ * referring to a ID3D11Resource object.
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeNvSciBuf, then
+ * ::cudaExternalMemoryHandleDesc::handle::nvSciBufObject must be NON-NULL
+ * and reference a valid NvSciBuf object.
+ * If the NvSciBuf object imported into CUDA is also mapped by other drivers, then the
+ * application must use ::cudaWaitExternalSemaphoresAsync or ::cudaSignalExternalSemaphoresAsync
+ * as appropriate barriers to maintain coherence between CUDA and the other drivers.
+ * See ::cudaExternalSemaphoreWaitSkipNvSciBufMemSync and ::cudaExternalSemaphoreSignalSkipNvSciBufMemSync
+ * for memory synchronization.
+ *
+ * The size of the memory object must be specified in
+ * ::cudaExternalMemoryHandleDesc::size.
+ *
+ * Specifying the flag ::cudaExternalMemoryDedicated in
+ * ::cudaExternalMemoryHandleDesc::flags indicates that the
+ * resource is a dedicated resource. The definition of what constitutes a
+ * dedicated resource is outside the scope of this extension.
+ * This flag must be set if ::cudaExternalMemoryHandleDesc::type
+ * is one of the following:
+ * ::cudaExternalMemoryHandleTypeD3D12Resource,
+ * ::cudaExternalMemoryHandleTypeD3D11Resource,
+ * ::cudaExternalMemoryHandleTypeD3D11ResourceKmt
+ *
+ * \param extMem_out    - Returned handle to an external memory object
+ * \param memHandleDesc - Memory import handle descriptor
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorOperatingSystem
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \note If the Vulkan memory imported into CUDA is mapped on the CPU then the
+ * application must use vkInvalidateMappedMemoryRanges/vkFlushMappedMemoryRanges
+ * as well as appropriate Vulkan pipeline barriers to maintain coherence between
+ * CPU and GPU. For more information on these APIs, please refer to the "Synchronization
+ * and Cache Control" chapter of the Vulkan specification.
+ *
+ *
+ * \sa ::cudaDestroyExternalMemory,
+ * ::cudaExternalMemoryGetMappedBuffer,
+ * ::cudaExternalMemoryGetMappedMipmappedArray
+ */
+extern __host__ cudaError_t CUDARTAPI cudaImportExternalMemory(cudaExternalMemory_t *extMem_out, const struct cudaExternalMemoryHandleDesc *memHandleDesc);
+
+/**
+ * \brief Maps a buffer onto an imported memory object
+ *
+ * Maps a buffer onto an imported memory object and returns a device
+ * pointer in \p devPtr.
+ *
+ * The properties of the buffer being mapped must be described in
+ * \p bufferDesc. The ::cudaExternalMemoryBufferDesc structure is
+ * defined as follows:
+ *
+ * \code
+        typedef struct cudaExternalMemoryBufferDesc_st {
+            unsigned long long offset;
+            unsigned long long size;
+            unsigned int flags;
+        } cudaExternalMemoryBufferDesc;
+ * \endcode
+ *
+ * where ::cudaExternalMemoryBufferDesc::offset is the offset in
+ * the memory object where the buffer's base address is.
+ * ::cudaExternalMemoryBufferDesc::size is the size of the buffer.
+ * ::cudaExternalMemoryBufferDesc::flags must be zero.
+ *
+ * The offset and size have to be suitably aligned to match the
+ * requirements of the external API. Mapping two buffers whose ranges
+ * overlap may or may not result in the same virtual address being
+ * returned for the overlapped portion. In such cases, the application
+ * must ensure that all accesses to that region from the GPU are
+ * volatile. Otherwise writes made via one address are not guaranteed
+ * to be visible via the other address, even if they're issued by the
+ * same thread. It is recommended that applications map the combined
+ * range instead of mapping separate buffers and then apply the
+ * appropriate offsets to the returned pointer to derive the
+ * individual buffers.
+ *
+ * The returned pointer \p devPtr must be freed using ::cudaFree.
+ *
+ * \param devPtr     - Returned device pointer to buffer
+ * \param extMem     - Handle to external memory object
+ * \param bufferDesc - Buffer descriptor
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaImportExternalMemory,
+ * ::cudaDestroyExternalMemory,
+ * ::cudaExternalMemoryGetMappedMipmappedArray
+ */
+extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedBuffer(void **devPtr, cudaExternalMemory_t extMem, const struct cudaExternalMemoryBufferDesc *bufferDesc);
+
+/**
+ * \brief Maps a CUDA mipmapped array onto an external memory object
+ *
+ * Maps a CUDA mipmapped array onto an external object and returns a
+ * handle to it in \p mipmap.
+ *
+ * The properties of the CUDA mipmapped array being mapped must be
+ * described in \p mipmapDesc. The structure
+ * ::cudaExternalMemoryMipmappedArrayDesc is defined as follows:
+ *
+ * \code
+        typedef struct cudaExternalMemoryMipmappedArrayDesc_st {
+            unsigned long long offset;
+            cudaChannelFormatDesc formatDesc;
+            cudaExtent extent;
+            unsigned int flags;
+            unsigned int numLevels;
+        } cudaExternalMemoryMipmappedArrayDesc;
+ * \endcode
+ *
+ * where ::cudaExternalMemoryMipmappedArrayDesc::offset is the
+ * offset in the memory object where the base level of the mipmap
+ * chain is.
+ * ::cudaExternalMemoryMipmappedArrayDesc::formatDesc describes the
+ * format of the data.
+ * ::cudaExternalMemoryMipmappedArrayDesc::extent specifies the
+ * dimensions of the base level of the mipmap chain.
+ * ::cudaExternalMemoryMipmappedArrayDesc::flags are flags associated
+ * with CUDA mipmapped arrays. For further details, please refer to
+ * the documentation for ::cudaMalloc3DArray. Note that if the mipmapped
+ * array is bound as a color target in the graphics API, then the flag
+ * ::cudaArrayColorAttachment must be specified in 
+ * ::cudaExternalMemoryMipmappedArrayDesc::flags.
+ * ::cudaExternalMemoryMipmappedArrayDesc::numLevels specifies
+ * the total number of levels in the mipmap chain.
+ *
+ * The returned CUDA mipmapped array must be freed using ::cudaFreeMipmappedArray.
+ *
+ * \param mipmap     - Returned CUDA mipmapped array
+ * \param extMem     - Handle to external memory object
+ * \param mipmapDesc - CUDA array descriptor
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaImportExternalMemory,
+ * ::cudaDestroyExternalMemory,
+ * ::cudaExternalMemoryGetMappedBuffer
+ *
+ * \note If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeNvSciBuf, then
+ * ::cudaExternalMemoryMipmappedArrayDesc::numLevels must not be greater than 1.
+ */
+extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedMipmappedArray(cudaMipmappedArray_t *mipmap, cudaExternalMemory_t extMem, const struct cudaExternalMemoryMipmappedArrayDesc *mipmapDesc);
+
+/**
+ * \brief Destroys an external memory object.
+ *
+ * Destroys the specified external memory object. Any existing buffers
+ * and CUDA mipmapped arrays mapped onto this object must no longer be
+ * used and must be explicitly freed using ::cudaFree and
+ * ::cudaFreeMipmappedArray respectively.
+ *
+ * \param extMem - External memory object to be destroyed
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa ::cudaImportExternalMemory,
+ * ::cudaExternalMemoryGetMappedBuffer,
+ * ::cudaExternalMemoryGetMappedMipmappedArray
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDestroyExternalMemory(cudaExternalMemory_t extMem);
+
+/**
+ * \brief Imports an external semaphore
+ *
+ * Imports an externally allocated synchronization object and returns
+ * a handle to that in \p extSem_out.
+ *
+ * The properties of the handle being imported must be described in
+ * \p semHandleDesc. The ::cudaExternalSemaphoreHandleDesc is defined
+ * as follows:
+ *
+ * \code
+        typedef struct cudaExternalSemaphoreHandleDesc_st {
+            cudaExternalSemaphoreHandleType type;
+            union {
+                int fd;
+                struct {
+                    void *handle;
+                    const void *name;
+                } win32;
+                const void* nvSciSyncObj;
+            } handle;
+            unsigned int flags;
+        } cudaExternalSemaphoreHandleDesc;
+ * \endcode
+ *
+ * where ::cudaExternalSemaphoreHandleDesc::type specifies the type of
+ * handle being imported. ::cudaExternalSemaphoreHandleType is defined
+ * as:
+ *
+ * \code
+        typedef enum cudaExternalSemaphoreHandleType_enum {
+            cudaExternalSemaphoreHandleTypeOpaqueFd                = 1,
+            cudaExternalSemaphoreHandleTypeOpaqueWin32             = 2,
+            cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt          = 3,
+            cudaExternalSemaphoreHandleTypeD3D12Fence              = 4,
+            cudaExternalSemaphoreHandleTypeD3D11Fence              = 5,
+            cudaExternalSemaphoreHandleTypeNvSciSync               = 6,
+            cudaExternalSemaphoreHandleTypeKeyedMutex              = 7,
+            cudaExternalSemaphoreHandleTypeKeyedMutexKmt           = 8,
+            cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd     = 9,
+            cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32  = 10
+        } cudaExternalSemaphoreHandleType;
+ * \endcode
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeOpaqueFd, then
+ * ::cudaExternalSemaphoreHandleDesc::handle::fd must be a valid file
+ * descriptor referencing a synchronization object. Ownership of the
+ * file descriptor is transferred to the CUDA driver when the handle
+ * is imported successfully. Performing any operations on the file
+ * descriptor after it is imported results in undefined behavior.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeOpaqueWin32, then exactly one of
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle and
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalSemaphoreHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * references a synchronization object. Ownership of this handle is
+ * not transferred to CUDA after the import operation, so the
+ * application must release the handle using the appropriate system
+ * call. If ::cudaExternalSemaphoreHandleDesc::handle::win32::name is
+ * not NULL, then it must name a valid synchronization object.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt, then
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle must be
+ * non-NULL and ::cudaExternalSemaphoreHandleDesc::handle::win32::name
+ * must be NULL. The handle specified must be a globally shared KMT
+ * handle. This handle does not hold a reference to the underlying
+ * object, and thus will be invalid when all references to the
+ * synchronization object are destroyed.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeD3D12Fence, then exactly one of
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle and
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalSemaphoreHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by ID3D12Device::CreateSharedHandle when referring to a
+ * ID3D12Fence object. This handle holds a reference to the underlying
+ * object. If ::cudaExternalSemaphoreHandleDesc::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object that
+ * refers to a valid ID3D12Fence object.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeD3D11Fence, then exactly one of
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle and
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalSemaphoreHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by ID3D11Fence::CreateSharedHandle. If 
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object that
+ * refers to a valid ID3D11Fence object.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeNvSciSync, then
+ * ::cudaExternalSemaphoreHandleDesc::handle::nvSciSyncObj
+ * represents a valid NvSciSyncObj.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeKeyedMutex, then exactly one of
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle and
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalSemaphoreHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that is returned
+ * by IDXGIResource1::CreateSharedHandle when referring to a IDXGIKeyedMutex object.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeKeyedMutexKmt, then
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle must be
+ * non-NULL and ::cudaExternalSemaphoreHandleDesc::handle::win32::name
+ * must be NULL. The handle specified must represent a valid KMT
+ * handle that is returned by IDXGIResource::GetSharedHandle when
+ * referring to a IDXGIKeyedMutex object.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd, then
+ * ::cudaExternalSemaphoreHandleDesc::handle::fd must be a valid file
+ * descriptor referencing a synchronization object. Ownership of the
+ * file descriptor is transferred to the CUDA driver when the handle
+ * is imported successfully. Performing any operations on the file
+ * descriptor after it is imported results in undefined behavior.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32, then exactly one of
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle and
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalSemaphoreHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * references a synchronization object. Ownership of this handle is
+ * not transferred to CUDA after the import operation, so the
+ * application must release the handle using the appropriate system
+ * call. If ::cudaExternalSemaphoreHandleDesc::handle::win32::name is
+ * not NULL, then it must name a valid synchronization object.
+ *
+ * \param extSem_out    - Returned handle to an external semaphore
+ * \param semHandleDesc - Semaphore import handle descriptor
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorOperatingSystem
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDestroyExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaImportExternalSemaphore(cudaExternalSemaphore_t *extSem_out, const struct cudaExternalSemaphoreHandleDesc *semHandleDesc);
+
+/**
+ * \brief Signals a set of external semaphore objects
+ *
+ * Enqueues a signal operation on a set of externally allocated
+ * semaphore objects in the specified stream. The operations will be
+ * executed when all prior operations in the stream complete.
+ *
+ * The exact semantics of signaling a semaphore depends on the type of
+ * the object.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::cudaExternalSemaphoreHandleTypeOpaqueFd,
+ * ::cudaExternalSemaphoreHandleTypeOpaqueWin32,
+ * ::cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt
+ * then signaling the semaphore will set it to the signaled state.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::cudaExternalSemaphoreHandleTypeD3D12Fence,
+ * ::cudaExternalSemaphoreHandleTypeD3D11Fence,
+ * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd,
+ * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32
+ * then the semaphore will be set to the value specified in
+ * ::cudaExternalSemaphoreSignalParams::params::fence::value.
+ *
+ * If the semaphore object is of the type ::cudaExternalSemaphoreHandleTypeNvSciSync
+ * this API sets ::cudaExternalSemaphoreSignalParams::params::nvSciSync::fence to a
+ * value that can be used by subsequent waiters of the same NvSciSync object to
+ * order operations with those currently submitted in \p stream. Such an update
+ * will overwrite previous contents of
+ * ::cudaExternalSemaphoreSignalParams::params::nvSciSync::fence. By default,
+ * signaling such an external semaphore object causes appropriate memory synchronization
+ * operations to be performed over all the external memory objects that are imported as
+ * ::cudaExternalMemoryHandleTypeNvSciBuf. This ensures that any subsequent accesses
+ * made by other importers of the same set of NvSciBuf memory object(s) are coherent.
+ * These operations can be skipped by specifying the flag
+ * ::cudaExternalSemaphoreSignalSkipNvSciBufMemSync, which can be used as a
+ * performance optimization when data coherency is not required. But specifying this
+ * flag in scenarios where data coherency is required results in undefined behavior.
+ * Also, for semaphore object of the type ::cudaExternalSemaphoreHandleTypeNvSciSync,
+ * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in
+ * ::cudaDeviceGetNvSciSyncAttributes to cudaNvSciSyncAttrSignal, this API will return
+ * cudaErrorNotSupported.
+ * 
+ * ::cudaExternalSemaphoreSignalParams::params::nvSciSync::fence associated with 
+ * semaphore object of the type ::cudaExternalSemaphoreHandleTypeNvSciSync can be 
+ * deterministic. For this the NvSciSyncAttrList used to create the semaphore object 
+ * must have value of NvSciSyncAttrKey_RequireDeterministicFences key set to true. 
+ * Deterministic fences allow users to enqueue a wait over the semaphore object even 
+ * before corresponding signal is enqueued. For such a semaphore object, CUDA guarantees 
+ * that each signal operation will increment the fence value by '1'. Users are expected 
+ * to track count of signals enqueued on the semaphore object and insert waits accordingly. 
+ * When such a semaphore object is signaled from multiple streams, due to concurrent 
+ * stream execution, it is possible that the order in which the semaphore gets signaled 
+ * is indeterministic. This could lead to waiters of the semaphore getting unblocked 
+ * incorrectly. Users are expected to handle such situations, either by not using the 
+ * same semaphore object with deterministic fence support enabled in different streams 
+ * or by adding explicit dependency amongst such streams so that the semaphore is 
+ * signaled in order.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::cudaExternalSemaphoreHandleTypeKeyedMutex,
+ * ::cudaExternalSemaphoreHandleTypeKeyedMutexKmt,
+ * then the keyed mutex will be released with the key specified in
+ * ::cudaExternalSemaphoreSignalParams::params::keyedmutex::key.
+ *
+ * \param extSemArray - Set of external semaphores to be signaled
+ * \param paramsArray - Array of semaphore parameters
+ * \param numExtSems  - Number of semaphores to signal
+ * \param stream     - Stream to enqueue the signal operations in
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaImportExternalSemaphore,
+ * ::cudaDestroyExternalSemaphore,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreSignalParams *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Waits on a set of external semaphore objects
+ *
+ * Enqueues a wait operation on a set of externally allocated
+ * semaphore objects in the specified stream. The operations will be
+ * executed when all prior operations in the stream complete.
+ *
+ * The exact semantics of waiting on a semaphore depends on the type
+ * of the object.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::cudaExternalSemaphoreHandleTypeOpaqueFd,
+ * ::cudaExternalSemaphoreHandleTypeOpaqueWin32,
+ * ::cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt
+ * then waiting on the semaphore will wait until the semaphore reaches
+ * the signaled state. The semaphore will then be reset to the
+ * unsignaled state. Therefore for every signal operation, there can
+ * only be one wait operation.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::cudaExternalSemaphoreHandleTypeD3D12Fence,
+ * ::cudaExternalSemaphoreHandleTypeD3D11Fence,
+ * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd,
+ * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32
+ * then waiting on the semaphore will wait until the value of the
+ * semaphore is greater than or equal to
+ * ::cudaExternalSemaphoreWaitParams::params::fence::value.
+ *
+ * If the semaphore object is of the type ::cudaExternalSemaphoreHandleTypeNvSciSync
+ * then, waiting on the semaphore will wait until the
+ * ::cudaExternalSemaphoreWaitParams::params::nvSciSync::fence is signaled by the
+ * signaler of the NvSciSyncObj that was associated with this semaphore object.
+ * By default, waiting on such an external semaphore object causes appropriate
+ * memory synchronization operations to be performed over all external memory objects
+ * that are imported as ::cudaExternalMemoryHandleTypeNvSciBuf. This ensures that
+ * any subsequent accesses made by other importers of the same set of NvSciBuf memory
+ * object(s) are coherent. These operations can be skipped by specifying the flag
+ * ::cudaExternalSemaphoreWaitSkipNvSciBufMemSync, which can be used as a
+ * performance optimization when data coherency is not required. But specifying this
+ * flag in scenarios where data coherency is required results in undefined behavior.
+ * Also, for semaphore object of the type ::cudaExternalSemaphoreHandleTypeNvSciSync,
+ * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in
+ * ::cudaDeviceGetNvSciSyncAttributes to cudaNvSciSyncAttrWait, this API will return
+ * cudaErrorNotSupported.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::cudaExternalSemaphoreHandleTypeKeyedMutex,
+ * ::cudaExternalSemaphoreHandleTypeKeyedMutexKmt,
+ * then the keyed mutex will be acquired when it is released with the key specified
+ * in ::cudaExternalSemaphoreWaitParams::params::keyedmutex::key or
+ * until the timeout specified by
+ * ::cudaExternalSemaphoreWaitParams::params::keyedmutex::timeoutMs
+ * has elapsed. The timeout interval can either be a finite value
+ * specified in milliseconds or an infinite value. In case an infinite
+ * value is specified the timeout never elapses. The Windows INFINITE
+ * macro must be used to specify an infinite timeout.
+ *
+ * \param extSemArray - External semaphores to be waited on
+ * \param paramsArray - Array of semaphore parameters
+ * \param numExtSems  - Number of semaphores to wait on
+ * \param stream      - Stream to enqueue the wait operations in
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorTimeout
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaImportExternalSemaphore,
+ * ::cudaDestroyExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreWaitParams *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Destroys an external semaphore
+ *
+ * Destroys an external semaphore object and releases any references
+ * to the underlying resource. Any outstanding signals or waits must
+ * have completed before the semaphore is destroyed.
+ *
+ * \param extSem - External semaphore to be destroyed
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa ::cudaImportExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDestroyExternalSemaphore(cudaExternalSemaphore_t extSem);
+
+/** @} */ /* END CUDART_EXTRES_INTEROP */
+
+/**
+ * \defgroup CUDART_EXECUTION Execution Control
+ *
+ * ___MANBRIEF___ execution control functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the execution control functions of the CUDA runtime
+ * application programming interface.
+ *
+ * Some functions have overloaded C++ API template versions documented separately in the
+ * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
+ *
+ * @{
+ */
+
+/**
+ * \brief Launches a device function
+ *
+ * The function invokes kernel \p func on \p gridDim (\p gridDim.x &times; \p gridDim.y
+ * &times; \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x &times;
+ * \p blockDim.y &times; \p blockDim.z) threads.
+ *
+ * If the kernel has N parameters the \p args should point to an array of N pointers.
+ * Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, points to the region
+ * of memory from which the actual parameter will be copied.
+ *
+ * For templated functions, pass the function symbol as follows:
+ * func_name<template_arg_0,...,template_arg_N>
+ *
+ * \p sharedMem sets the amount of dynamic shared memory that will be available to
+ * each thread block.
+ *
+ * \p stream specifies a stream the invocation is associated to.
+ *
+ * \param func        - Device function symbol
+ * \param gridDim     - Grid dimensions
+ * \param blockDim    - Block dimensions
+ * \param args        - Arguments
+ * \param sharedMem   - Shared memory
+ * \param stream      - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorSharedObjectInitFailed,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorUnsupportedPtxVersion,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorJitCompilerNotFound,
+ * ::cudaErrorJitCompilationDisabled
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \sa
+ * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
+ * ::cuLaunchKernel
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
+
+/**
+ * \brief Launches a CUDA function with launch-time configuration
+ *
+ * Note that the functionally equivalent variadic template ::cudaLaunchKernelEx
+ * is available for C++11 and newer.
+ *
+ * Invokes the kernel \p func on \p config->gridDim (\p config->gridDim.x
+ * &times; \p config->gridDim.y &times; \p config->gridDim.z) grid of blocks.
+ * Each block contains \p config->blockDim (\p config->blockDim.x &times;
+ * \p config->blockDim.y &times; \p config->blockDim.z) threads.
+ *
+ * \p config->dynamicSmemBytes sets the amount of dynamic shared memory that
+ * will be available to each thread block.
+ *
+ * \p config->stream specifies a stream the invocation is associated to.
+ *
+ * Configuration beyond grid and block dimensions, dynamic shared memory size,
+ * and stream can be provided with the following two fields of \p config:
+ *
+ * \p config->attrs is an array of \p config->numAttrs contiguous
+ * ::cudaLaunchAttribute elements. The value of this pointer is not considered
+ * if \p config->numAttrs is zero. However, in that case, it is recommended to
+ * set the pointer to NULL.                                  
+ * \p config->numAttrs is the number of attributes populating the first
+ * \p config->numAttrs positions of the \p config->attrs array.
+ *
+ * If the kernel has N parameters the \p args should point to an array of N
+ * pointers. Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, points
+ * to the region of memory from which the actual parameter will be copied.
+ *
+ * N.B. This function is so named to avoid unintentionally invoking the
+ *      templated version, \p cudaLaunchKernelEx, for kernels taking a single
+ *      void** or void* parameter.
+ *
+ * \param config - Launch configuration
+ * \param func   - Kernel to launch
+ * \param args   - Array of pointers to kernel parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorSharedObjectInitFailed,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorUnsupportedPtxVersion,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorJitCompilerNotFound,
+ * ::cudaErrorJitCompilationDisabled
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \sa
+ * \ref ::cudaLaunchKernelEx(const cudaLaunchConfig_t *config, void (*kernel)(ExpTypes...), ActTypes &&... args) "cudaLaunchKernelEx (C++ API)",
+ * ::cuLaunchKernelEx
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLaunchKernelExC(const cudaLaunchConfig_t *config, const void *func, void **args);
+
+/**
+ * \brief Launches a device function where thread blocks can cooperate and synchronize as they execute
+ *
+ * The function invokes kernel \p func on \p gridDim (\p gridDim.x &times; \p gridDim.y
+ * &times; \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x &times;
+ * \p blockDim.y &times; \p blockDim.z) threads.
+ *
+ * The device on which this kernel is invoked must have a non-zero value for
+ * the device attribute ::cudaDevAttrCooperativeLaunch.
+ *
+ * The total number of blocks launched cannot exceed the maximum number of blocks per
+ * multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or
+ * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
+ * as specified by the device attribute ::cudaDevAttrMultiProcessorCount.
+ *
+ * The kernel cannot make use of CUDA dynamic parallelism.
+ *
+ * If the kernel has N parameters the \p args should point to an array of N pointers.
+ * Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, points to the region
+ * of memory from which the actual parameter will be copied.
+ *
+ * For templated functions, pass the function symbol as follows:
+ * func_name<template_arg_0,...,template_arg_N>
+ *
+ * \p sharedMem sets the amount of dynamic shared memory that will be available to
+ * each thread block.
+ *
+ * \p stream specifies a stream the invocation is associated to.
+ *
+ * \param func        - Device function symbol
+ * \param gridDim     - Grid dimensions
+ * \param blockDim    - Block dimensions
+ * \param args        - Arguments
+ * \param sharedMem   - Shared memory
+ * \param stream      - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorCooperativeLaunchTooLarge,
+ * ::cudaErrorSharedObjectInitFailed
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \sa
+ * \ref ::cudaLaunchCooperativeKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchCooperativeKernel (C++ API)",
+ * ::cudaLaunchCooperativeKernelMultiDevice,
+ * ::cuLaunchCooperativeKernel
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
+
+/**
+ * \brief Launches device functions on multiple devices where thread blocks can cooperate and synchronize as they execute
+ *
+ * \deprecated This function is deprecated as of CUDA 11.3.
+ *
+ * Invokes kernels as specified in the \p launchParamsList array where each element
+ * of the array specifies all the parameters required to perform a single kernel launch.
+ * These kernels can cooperate and synchronize as they execute. The size of the array is
+ * specified by \p numDevices.
+ *
+ * No two kernels can be launched on the same device. All the devices targeted by this
+ * multi-device launch must be identical. All devices must have a non-zero value for the
+ * device attribute ::cudaDevAttrCooperativeMultiDeviceLaunch.
+ *
+ * The same kernel must be launched on all devices. Note that any __device__ or __constant__
+ * variables are independently instantiated on every device. It is the application's
+ * responsibility to ensure these variables are initialized and used appropriately.
+ *
+ * The size of the grids as specified in blocks, the size of the blocks themselves and the
+ * amount of shared memory used by each thread block must also match across all launched kernels.
+ *
+ * The streams used to launch these kernels must have been created via either ::cudaStreamCreate
+ * or ::cudaStreamCreateWithFlags or ::cudaStreamCreateWithPriority. The NULL stream or
+ * ::cudaStreamLegacy or ::cudaStreamPerThread cannot be used.
+ *
+ * The total number of blocks launched per kernel cannot exceed the maximum number of blocks
+ * per multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or
+ * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
+ * as specified by the device attribute ::cudaDevAttrMultiProcessorCount. Since the
+ * total number of blocks launched per device has to match across all devices, the maximum
+ * number of blocks that can be launched per device will be limited by the device with the
+ * least number of multiprocessors.
+ *
+ * The kernel cannot make use of CUDA dynamic parallelism.
+ *
+ * The ::cudaLaunchParams structure is defined as:
+ * \code
+        struct cudaLaunchParams
+        {
+            void *func;
+            dim3 gridDim;
+            dim3 blockDim;
+            void **args;
+            size_t sharedMem;
+            cudaStream_t stream;
+        };
+ * \endcode
+ * where:
+ * - ::cudaLaunchParams::func specifies the kernel to be launched. The same function must
+ *   be launched on all devices. For templated functions, pass the function symbol as follows:
+ *   func_name<template_arg_0,...,template_arg_N>
+ * - ::cudaLaunchParams::gridDim specifies the width, height and depth of the grid in blocks.
+ *   This must match across all kernels launched.
+ * - ::cudaLaunchParams::blockDim is the width, height and depth of each thread block. This
+ *   must match across all kernels launched.
+ * - ::cudaLaunchParams::args specifies the arguments to the kernel. If the kernel has
+ *   N parameters then ::cudaLaunchParams::args should point to an array of N pointers. Each
+ *   pointer, from <tt>::cudaLaunchParams::args[0]</tt> to <tt>::cudaLaunchParams::args[N - 1]</tt>,
+ *   points to the region of memory from which the actual parameter will be copied.
+ * - ::cudaLaunchParams::sharedMem is the dynamic shared-memory size per thread block in bytes.
+ *   This must match across all kernels launched.
+ * - ::cudaLaunchParams::stream is the handle to the stream to perform the launch in. This cannot
+ *   be the NULL stream or ::cudaStreamLegacy or ::cudaStreamPerThread.
+ *
+ * By default, the kernel won't begin execution on any GPU until all prior work in all the specified
+ * streams has completed. This behavior can be overridden by specifying the flag
+ * ::cudaCooperativeLaunchMultiDeviceNoPreSync. When this flag is specified, each kernel
+ * will only wait for prior work in the stream corresponding to that GPU to complete before it begins
+ * execution.
+ *
+ * Similarly, by default, any subsequent work pushed in any of the specified streams will not begin
+ * execution until the kernels on all GPUs have completed. This behavior can be overridden by specifying
+ * the flag ::cudaCooperativeLaunchMultiDeviceNoPostSync. When this flag is specified,
+ * any subsequent work pushed in any of the specified streams will only wait for the kernel launched
+ * on the GPU corresponding to that stream to complete before it begins execution.
+ *
+ * \param launchParamsList - List of launch parameters, one per device
+ * \param numDevices       - Size of the \p launchParamsList array
+ * \param flags            - Flags to control launch behavior
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorCooperativeLaunchTooLarge,
+ * ::cudaErrorSharedObjectInitFailed
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaLaunchCooperativeKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchCooperativeKernel (C++ API)",
+ * ::cudaLaunchCooperativeKernel,
+ * ::cuLaunchCooperativeKernelMultiDevice
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *launchParamsList, unsigned int numDevices, unsigned int flags  __dv(0));
+
+/**
+ * \brief Sets the preferred cache configuration for a device function
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p cacheConfig the preferred cache configuration
+ * for the function specified via \p func. This is only a preference. The
+ * runtime will use the requested configuration if possible, but it is free to
+ * choose a different configuration if required to execute \p func.
+ *
+ * \p func is a device function symbol and must be declared as a
+ * \c __global__ function. If the specified function does not exist,
+ * then ::cudaErrorInvalidDeviceFunction is returned. For templated functions,
+ * pass the function symbol as follows: func_name<template_arg_0,...,template_arg_N>
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ * The supported cache configurations are:
+ * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
+ * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
+ * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
+ * - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory
+ *
+ * \param func        - Device function symbol
+ * \param cacheConfig - Requested cache configuration
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction
+ * \notefnerr
+ * \note_string_api_deprecation2
+ * \note_init_rt
+ * \note_callback
+ *
+ * \note This API does not accept a ::cudaKernel_t cast to void*. If cache config modification
+ * is required for a ::cudaKernel_t (or a __global__ function), it can be replaced with a call to 
+ * ::cudaFuncSetAttributes with the attribute ::cudaFuncAttributePreferredSharedMemoryCarveout 
+ * to specify a more granular L1 cache and shared memory split configuration.
+ *
+ * \sa 
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
+ * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)",
+ * ::cuFuncSetCacheConfig
+ */
+extern __host__ cudaError_t CUDARTAPI cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig);
+
+/**
+ * \brief Find out attributes for a given function
+ *
+ * This function obtains the attributes of a function specified via \p func.
+ * \p func is a device function symbol and must be declared as a
+ * \c __global__ function. The fetched attributes are placed in \p attr.
+ * If the specified function does not exist, then it is assumed to 
+ * be a ::cudaKernel_t and used as is.
+ * For templated functions, pass the function symbol as follows: 
+ * func_name<template_arg_0,...,template_arg_N>
+ *
+ * Note that some function attributes such as
+ * \ref ::cudaFuncAttributes::maxThreadsPerBlock "maxThreadsPerBlock"
+ * may vary based on the device that is currently being used.
+ *
+ * \param attr - Return pointer to function's attributes
+ * \param func - Device function symbol
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction
+ * \notefnerr
+ * \note_string_api_deprecation2
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \sa 
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)",
+ * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)",
+ * ::cuFuncGetAttribute
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
+
+
+/**
+ * \brief Set attributes for a given function
+ *
+ * This function sets the attributes of a function specified via \p func.
+ * The parameter \p func must be a pointer to a function that executes
+ * on the device. The parameter specified by \p func must be declared as a \p __global__
+ * function. The enumeration defined by \p attr is set to the value defined by \p value.
+ * If the specified function does not exist, then it is assumed to 
+ * be a ::cudaKernel_t and used as is.
+ * If the specified attribute cannot be written, or if the value is incorrect, 
+ * then ::cudaErrorInvalidValue is returned.
+ *
+ * Valid values for \p attr are:
+ * - ::cudaFuncAttributeMaxDynamicSharedMemorySize - The requested maximum size in bytes of dynamically-allocated shared memory. The sum of this value and the function attribute ::sharedSizeBytes
+ *   cannot exceed the device attribute ::cudaDevAttrMaxSharedMemoryPerBlockOptin. The maximal size of requestable dynamic shared memory may differ by GPU architecture.
+ * - ::cudaFuncAttributePreferredSharedMemoryCarveout - On devices where the L1 cache and shared memory use the same hardware resources, 
+ *   this sets the shared memory carveout preference, in percent of the total shared memory. See ::cudaDevAttrMaxSharedMemoryPerMultiprocessor.
+ *   This is only a hint, and the driver can choose a different ratio if required to execute the function.
+ * - ::cudaFuncAttributeRequiredClusterWidth: The required cluster width in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return cudaErrorNotPermitted.
+ * - ::cudaFuncAttributeRequiredClusterHeight: The required cluster height in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return cudaErrorNotPermitted.
+ * - ::cudaFuncAttributeRequiredClusterDepth: The required cluster depth in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return cudaErrorNotPermitted.
+ * - ::cudaFuncAttributeNonPortableClusterSizeAllowed: Indicates whether the
+ *   function can be launched with non-portable cluster size. 1 is allowed, 0 is
+ *   disallowed.
+ * - ::cudaFuncAttributeClusterSchedulingPolicyPreference: The block
+ *   scheduling policy of a function. The value type is cudaClusterSchedulingPolicy.
+ *
+ * \param func  - Function to set the attribute of
+ * \param attr  - Attribute to set
+ * \param value - Value to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \sa \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)"
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value);
+
+/**
+ * \brief Returns the function name for a device entry function pointer.
+ *
+ * Returns in \p **name the function name associated with the symbol \p func .
+ * The function name is returned as a null-terminated string. This API may
+ * return a mangled name if the function is not declared as having C linkage.
+ * If \p **name is NULL, ::cudaErrorInvalidValue is returned.
+ * If \p func is not a device entry function, then it is assumed to
+ * be a ::cudaKernel_t and used as is.
+ *
+ * \param name - The returned name of the function
+ * \param func - The function pointer to retrieve name for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDeviceFunction
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \sa \ref ::cudaFuncGetName(const char **name, const T *func) "cudaFuncGetName (C++ API)"
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetName(const char **name, const void *func);
+
+/**
+ * \brief Returns the offset and size of a kernel parameter in the device-side parameter layout.
+ *
+ * Queries the kernel parameter at \p paramIndex in \p func's list of parameters and returns
+ * parameter information via \p paramOffset and \p paramSize. \p paramOffset returns the
+ * offset of the parameter in the device-side parameter layout. \p paramSize returns the size
+ * in bytes of the parameter. This information can be used to update kernel node parameters
+ * from the device via ::cudaGraphKernelNodeSetParam() and ::cudaGraphKernelNodeUpdatesApply().
+ * \p paramIndex must be less than the number of parameters that \p func takes.
+ *
+ * \param func        - The function to query
+ * \param paramIndex  - The parameter index to query
+ * \param paramOffset - The offset into the device-side parameter layout at which the parameter resides
+ * \param paramSize   - The size of the parameter in the device-side parameter layout
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_cudaKernel_t
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetParamInfo(const void *func, size_t paramIndex, size_t *paramOffset, size_t *paramSize);
+
+/**
+ * \brief Converts a double argument to be executed on a device
+ *
+ * \param d - Double to convert
+ *
+ * \deprecated This function is deprecated as of CUDA 7.5
+ *
+ * Converts the double value of \p d to an internal float representation if
+ * the device does not support double arithmetic. If the device does natively
+ * support doubles, then this function does nothing.
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
+ * ::cudaSetDoubleForHost
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaSetDoubleForDevice(double *d);
+
+/**
+ * \brief Converts a double argument after execution on a device
+ *
+ * \deprecated This function is deprecated as of CUDA 7.5
+ *
+ * Converts the double value of \p d from a potentially internal float
+ * representation if the device does not support double arithmetic. If the
+ * device does natively support doubles, then this function does nothing.
+ *
+ * \param d - Double to convert
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
+ * ::cudaSetDoubleForDevice
+ */
+extern __CUDA_DEPRECATED  __host__ cudaError_t CUDARTAPI cudaSetDoubleForHost(double *d);
+
+/**
+ * \brief Enqueues a host function call in a stream
+ *
+ * Enqueues a host function to run in a stream.  The function will be called
+ * after currently enqueued work and will block work added after it.
+ *
+ * The host function must not make any CUDA API calls.  Attempting to use a
+ * CUDA API may result in ::cudaErrorNotPermitted, but this is not required.
+ * The host function must not perform any synchronization that may depend on
+ * outstanding CUDA work not mandated to run earlier.  Host functions without a
+ * mandated order (such as in independent streams) execute in undefined order
+ * and may be serialized.
+ *
+ * For the purposes of Unified Memory, execution makes a number of guarantees:
+ * <ul>
+ *   <li>The stream is considered idle for the duration of the function's
+ *   execution.  Thus, for example, the function may always use memory attached
+ *   to the stream it was enqueued in.</li>
+ *   <li>The start of execution of the function has the same effect as
+ *   synchronizing an event recorded in the same stream immediately prior to
+ *   the function.  It thus synchronizes streams which have been "joined"
+ *   prior to the function.</li>
+ *   <li>Adding device work to any stream does not have the effect of making
+ *   the stream active until all preceding host functions and stream callbacks
+ *   have executed.  Thus, for
+ *   example, a function might use global attached memory even if work has
+ *   been added to another stream, if the work has been ordered behind the
+ *   function call with an event.</li>
+ *   <li>Completion of the function does not cause a stream to become
+ *   active except as described above.  The stream will remain idle
+ *   if no device work follows the function, and will remain idle across
+ *   consecutive host functions or stream callbacks without device work in
+ *   between.  Thus, for example,
+ *   stream synchronization can be done by signaling from a host function at the
+ *   end of the stream.</li>
+ * </ul>
+ *
+ * Note that, in contrast to ::cuStreamAddCallback, the function will not be
+ * called in the event of an error in the CUDA context.
+ *
+ * \param stream   - Stream to enqueue function call in
+ * \param fn       - The function to call once preceding stream operations are complete
+ * \param userData - User-specified data to be passed to the function
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate,
+ * ::cudaStreamQuery,
+ * ::cudaStreamSynchronize,
+ * ::cudaStreamWaitEvent,
+ * ::cudaStreamDestroy,
+ * ::cudaMallocManaged,
+ * ::cudaStreamAttachMemAsync,
+ * ::cudaStreamAddCallback,
+ * ::cuLaunchHostFunc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLaunchHostFunc(cudaStream_t stream, cudaHostFn_t fn, void *userData);
+
+/** @} */ /* END CUDART_EXECUTION */
+
+/**
+ * \defgroup CUDART_EXECUTION_DEPRECATED Execution Control [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated execution control functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the deprecated execution control functions of the CUDA runtime
+ * application programming interface.
+ *
+ * Some functions have overloaded C++ API template versions documented separately in the
+ * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
+ *
+ * @{
+ */
+
+/**
+ * \brief Sets the shared memory configuration for a device function
+ *
+ * \deprecated
+ *
+ * On devices with configurable shared memory banks, this function will 
+ * force all subsequent launches of the specified device function to have
+ * the given shared memory bank size configuration. On any given launch of the
+ * function, the shared memory configuration of the device will be temporarily
+ * changed if needed to suit the function's preferred configuration. Changes in
+ * shared memory configuration between subsequent launches of functions, 
+ * may introduce a device side synchronization point.
+ *
+ * Any per-function setting of shared memory bank size set via 
+ * ::cudaFuncSetSharedMemConfig will override the device wide setting set by
+ * ::cudaDeviceSetSharedMemConfig.
+ *
+ * Changing the shared memory bank size will not increase shared memory usage
+ * or affect occupancy of kernels, but may have major effects on performance. 
+ * Larger bank sizes will allow for greater potential bandwidth to shared memory,
+ * but will change what kinds of accesses to shared memory will result in bank 
+ * conflicts.
+ *
+ * This function will do nothing on devices with fixed shared memory bank size.
+ *
+ * For templated functions, pass the function symbol as follows:
+ * func_name<template_arg_0,...,template_arg_N>
+ *
+ * The supported bank configurations are:
+ * - ::cudaSharedMemBankSizeDefault: use the device's shared memory configuration
+ *   when launching this function.
+ * - ::cudaSharedMemBankSizeFourByte: set shared memory bank width to be 
+ *   four bytes natively when launching this function.
+ * - ::cudaSharedMemBankSizeEightByte: set shared memory bank width to be eight 
+ *   bytes natively when launching this function.
+ *
+ * \param func   - Device function symbol
+ * \param config - Requested shared memory configuration
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_string_api_deprecation2
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSetSharedMemConfig,
+ * ::cudaDeviceGetSharedMemConfig,
+ * ::cudaDeviceSetCacheConfig,
+ * ::cudaDeviceGetCacheConfig,
+ * ::cudaFuncSetCacheConfig,
+ * ::cuFuncSetSharedMemConfig
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config);
+/** @} */ /* END CUDART_EXECUTION_DEPRECATED */
+
+/**
+ * \defgroup CUDART_OCCUPANCY Occupancy
+ *
+ * ___MANBRIEF___ occupancy calculation functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the occupancy calculation functions of the CUDA runtime
+ * application programming interface.
+ *
+ * Besides the occupancy calculator functions
+ * (\ref ::cudaOccupancyMaxActiveBlocksPerMultiprocessor and \ref ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags),
+ * there are also C++ only occupancy-based launch configuration functions documented in
+ * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
+ *
+ * See
+ * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)",
+ * \ref ::cudaOccupancyAvailableDynamicSMemPerBlock(size_t*, T, int, int) "cudaOccupancyAvailableDynamicSMemPerBlock (C++ API)"
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns occupancy for a device function
+ *
+ * Returns in \p *numBlocks the maximum number of active blocks per
+ * streaming multiprocessor for the device function.
+ *
+ * \param numBlocks       - Returned occupancy
+ * \param func            - Kernel function for which occupancy is calculated
+ * \param blockSize       - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags,
+ * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)",
+ * \ref ::cudaOccupancyAvailableDynamicSMemPerBlock(size_t*, T, int, int) "cudaOccupancyAvailableDynamicSMemPerBlock (C++ API)",
+ * ::cuOccupancyMaxActiveBlocksPerMultiprocessor
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize);
+
+/**
+ * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on SM.
+ *
+ * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. 
+ *
+ * \param dynamicSmemSize - Returned maximum dynamic shared memory 
+ * \param func            - Kernel function for which occupancy is calculated
+ * \param numBlocks       - Number of blocks to fit on SM 
+ * \param blockSize       - Size of the block
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags,
+ * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)",
+ * ::cudaOccupancyAvailableDynamicSMemPerBlock
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, const void *func, int numBlocks, int blockSize);
+
+/**
+ * \brief Returns occupancy for a device function with the specified flags
+ *
+ * Returns in \p *numBlocks the maximum number of active blocks per
+ * streaming multiprocessor for the device function.
+ *
+ * The \p flags parameter controls how special cases are handled. Valid flags include:
+ *
+ * - ::cudaOccupancyDefault: keeps the default behavior as
+ *   ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ *
+ * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior
+ *   on platforms where global caching affects occupancy. On such platforms, if caching
+ *   is enabled, but per-block SM resource usage would result in zero occupancy, the
+ *   occupancy calculator will calculate the occupancy as if caching is disabled.
+ *   Setting this flag makes the occupancy calculator return 0 in such cases.
+ *   More information can be found about this feature in the "Unified L1/Texture Cache"
+ *   section of the Maxwell tuning guide.
+ *
+ * \param numBlocks       - Returned occupancy
+ * \param func            - Kernel function for which occupancy is calculated
+ * \param blockSize       - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ * \param flags           - Requested behavior for the occupancy calculator
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor,
+ * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)",
+ * \ref ::cudaOccupancyAvailableDynamicSMemPerBlock(size_t*, T, int, int) "cudaOccupancyAvailableDynamicSMemPerBlock (C++ API)",
+ * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
+
+/**
+ * \brief Given the kernel function (\p func) and launch configuration
+ * (\p launchConfig), return the maximum cluster size in \p *clusterSize.
+ *
+ * The cluster dimensions in \p launchConfig are ignored. If \p func has a required
+ * cluster size set (see ::cudaFuncGetAttributes), \p *clusterSize will reflect
+ * the required cluster size.
+ *
+ * By default this function will always return a value that's portable on
+ * future hardware. A higher value may be returned if the kernel function
+ * allows non-portable cluster sizes.
+ *
+ * This function will respect the compile time launch bounds.
+ *
+ * \param clusterSize  - Returned maximum cluster size that can be launched
+ *                       for the given kernel function and launch configuration
+ * \param func         - Kernel function for which maximum cluster
+ *                       size is calculated
+ * \param launchConfig - Launch configuration for the given kernel function
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \sa ::cudaFuncGetAttributes,
+ * \ref ::cudaOccupancyMaxPotentialClusterSize(int*, T, const cudaLaunchConfig_t*) "cudaOccupancyMaxPotentialClusterSize (C++ API)",
+ * ::cuOccupancyMaxPotentialClusterSize
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxPotentialClusterSize(int *clusterSize, const void *func, const cudaLaunchConfig_t *launchConfig);
+
+
+/**
+ * \brief Given the kernel function (\p func) and launch configuration
+ * (\p launchConfig), return the maximum number of clusters that could co-exist
+ * on the target device in \p *numClusters.
+ *
+ * If the function has a required cluster size already set (see
+ * ::cudaFuncGetAttributes), the cluster size from \p launchConfig must either be
+ * unspecified or match the required size.
+ * Without required sizes, the cluster size must be specified in \p launchConfig,
+ * else the function will return an error.
+ *
+ * Note that various attributes of the kernel function may affect occupancy
+ * calculation. Runtime environment may affect how the hardware schedules
+ * the clusters, so the calculated occupancy is not guaranteed to be achievable.
+ *
+ * \param numClusters  - Returned maximum number of clusters that
+ *                       could co-exist on the target device
+ * \param func         - Kernel function for which maximum number
+ *                       of clusters are calculated
+ * \param launchConfig - Launch configuration for the given kernel function
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidClusterSize,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \sa
+ * ::cudaFuncGetAttributes,
+ * \ref ::cudaOccupancyMaxActiveClusters(int*, T, const cudaLaunchConfig_t*) "cudaOccupancyMaxActiveClusters (C++ API)",
+ * ::cuOccupancyMaxActiveClusters
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveClusters(int *numClusters, const void *func, const cudaLaunchConfig_t *launchConfig);
+/** @} */ /* END CUDART_OCCUPANCY */
+
+/**
+ * \defgroup CUDART_MEMORY Memory Management
+ *
+ * ___MANBRIEF___ memory management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the memory management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * Some functions have overloaded C++ API template versions documented separately in the
+ * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
+ *
+ * @{
+ */
+
+/**
+ * \brief Allocates memory that will be automatically managed by the Unified Memory system
+ *
+ * Allocates \p size bytes of managed memory on the device and returns in
+ * \p *devPtr a pointer to the allocated memory. If the device doesn't support
+ * allocating managed memory, ::cudaErrorNotSupported is returned. Support
+ * for managed memory can be queried using the device attribute
+ * ::cudaDevAttrManagedMemory. The allocated memory is suitably
+ * aligned for any kind of variable. The memory is not cleared. If \p size
+ * is 0, ::cudaMallocManaged returns ::cudaErrorInvalidValue. The pointer
+ * is valid on the CPU and on all GPUs in the system that support managed memory.
+ * All accesses to this pointer must obey the Unified Memory programming model.
+ *
+ * \p flags specifies the default stream association for this allocation.
+ * \p flags must be one of ::cudaMemAttachGlobal or ::cudaMemAttachHost. The
+ * default value for \p flags is ::cudaMemAttachGlobal.
+ * If ::cudaMemAttachGlobal is specified, then this memory is accessible from
+ * any stream on any device. If ::cudaMemAttachHost is specified, then the
+ * allocation should not be accessed from devices that have a zero value for the
+ * device attribute ::cudaDevAttrConcurrentManagedAccess; an explicit call to
+ * ::cudaStreamAttachMemAsync will be required to enable access on such devices.
+ *
+ * If the association is later changed via ::cudaStreamAttachMemAsync to
+ * a single stream, the default association, as specified during ::cudaMallocManaged,
+ * is restored when that stream is destroyed. For __managed__ variables, the
+ * default association is always ::cudaMemAttachGlobal. Note that destroying a
+ * stream is an asynchronous operation, and as a result, the change to default
+ * association won't happen until all work in the stream has completed.
+ *
+ * Memory allocated with ::cudaMallocManaged should be released with ::cudaFree.
+ *
+ * Device memory oversubscription is possible for GPUs that have a non-zero value for the
+ * device attribute ::cudaDevAttrConcurrentManagedAccess. Managed memory on
+ * such GPUs may be evicted from device memory to host memory at any time by the Unified
+ * Memory driver in order to make room for other allocations.
+ *
+ * In a system where all GPUs have a non-zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess, managed memory may not be populated when this
+ * API returns and instead may be populated on access. In such systems, managed memory can
+ * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to
+ * maintain data locality and prevent excessive page faults to the extent possible. The application
+ * can also guide the driver about memory usage patterns via ::cudaMemAdvise. The application
+ * can also explicitly migrate memory to a desired processor's memory via
+ * ::cudaMemPrefetchAsync.
+ *
+ * In a multi-GPU system where all of the GPUs have a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess and all the GPUs have peer-to-peer support
+ * with each other, the physical storage for managed memory is created on the GPU which is active
+ * at the time ::cudaMallocManaged is called. All other GPUs will reference the data at reduced
+ * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate
+ * memory among such GPUs.
+ *
+ * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and
+ * where the value of the device attribute ::cudaDevAttrConcurrentManagedAccess
+ * is zero for at least one of those GPUs, the location chosen for physical storage of managed
+ * memory is system-dependent.
+ * - On Linux, the location chosen will be device memory as long as the current set of active
+ * contexts are on devices that either have peer-to-peer support with each other or have a
+ * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * If there is an active context on a GPU that does not have a non-zero value for that device
+ * attribute and it does not have peer-to-peer support with the other devices that have active
+ * contexts on them, then the location for physical storage will be 'zero-copy' or host memory.
+ * Note that this means that managed memory that is located in device memory is migrated to
+ * host memory if a new context is created on a GPU that doesn't have a non-zero value for
+ * the device attribute and does not support peer-to-peer with at least one of the other devices
+ * that has an active context. This in turn implies that context creation may fail if there is
+ * insufficient host memory to migrate all managed allocations.
+ * - On Windows, the physical storage is always created in 'zero-copy' or host memory.
+ * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these
+ * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to
+ * restrict CUDA to only use those GPUs that have peer-to-peer support.
+ * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a non-zero
+ * value to force the driver to always use device memory for physical storage.
+ * When this environment variable is set to a non-zero value, all devices used in
+ * that process that support managed memory have to be peer-to-peer compatible
+ * with each other. The error ::cudaErrorInvalidDevice will be returned if a device
+ * that supports managed memory is used and it is not peer-to-peer compatible with
+ * any of the other managed memory supporting devices that were previously used in
+ * that process, even if ::cudaDeviceReset has been called on those devices. These
+ * environment variables are described in the CUDA programming guide under the
+ * "CUDA environment variables" section.
+ *
+ * \param devPtr - Pointer to allocated device memory
+ * \param size   - Requested allocation size in bytes
+ * \param flags  - Must be either ::cudaMemAttachGlobal or ::cudaMemAttachHost (defaults to ::cudaMemAttachGlobal)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray,
+ * ::cudaMalloc3D, ::cudaMalloc3DArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc, ::cudaDeviceGetAttribute, ::cudaStreamAttachMemAsync,
+ * ::cuMemAllocManaged
+ */
+#if defined(__cplusplus)
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMallocManaged(void **devPtr, size_t size, unsigned int flags = cudaMemAttachGlobal);
+#else /* C has no default arguments, so flags must be passed explicitly */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMallocManaged(void **devPtr, size_t size, unsigned int flags);
+#endif /* defined(__cplusplus) */
+
+/**
+ * \brief Allocate memory on the device
+ *
+ * Allocates \p size bytes of linear memory on the device and returns in
+ * \p *devPtr a pointer to the allocated memory. The allocated memory is
+ * suitably aligned for any kind of variable. The memory is not cleared.
+ * ::cudaMalloc() returns ::cudaErrorMemoryAllocation in case of failure.
+ *
+ * The device version of ::cudaFree cannot be used with a \p *devPtr
+ * allocated using the host API, and vice versa.
+ *
+ * \param devPtr - Pointer to allocated device memory
+ * \param size   - Requested allocation size in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray,
+ * ::cudaMalloc3D, ::cudaMalloc3DArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::cuMemAlloc
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size);
+
+/**
+ * \brief Allocates page-locked memory on the host
+ *
+ * Allocates \p size bytes of host memory that is page-locked and accessible
+ * to the device. The driver tracks the virtual memory ranges allocated with
+ * this function and automatically accelerates calls to functions such as
+ * ::cudaMemcpy*(). Since the memory can be accessed directly by the device,
+ * it can be read or written with much higher bandwidth than pageable memory
+ * obtained with functions such as ::malloc(). 
+ *
+ * On systems where ::pageableMemoryAccessUsesHostPageTables
+ * is true, ::cudaMallocHost may not page-lock the allocated memory.
+ *
+ * Page-locking excessive amounts of memory with ::cudaMallocHost() may degrade 
+ * system performance, since it reduces the amount of memory available to the 
+ * system for paging. As a result, this function is best used sparingly to allocate 
+ * staging areas for data exchange between host and device.
+ *
+ * \param ptr  - Pointer to allocated host memory
+ * \param size - Requested allocation size in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaMallocArray, ::cudaMalloc3D,
+ * ::cudaMalloc3DArray, ::cudaHostAlloc, ::cudaFree, ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t, unsigned int) "cudaMallocHost (C++ API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::cuMemAllocHost
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size);
+
+/**
+ * \brief Allocates pitched memory on the device
+ *
+ * Allocates at least \p width (in bytes) * \p height bytes of linear memory
+ * on the device and returns in \p *devPtr a pointer to the allocated memory.
+ * The function may pad the allocation to ensure that corresponding pointers
+ * in any given row will continue to meet the alignment requirements for
+ * coalescing as the address is updated from row to row. The pitch returned in
+ * \p *pitch by ::cudaMallocPitch() is the width in bytes of the allocation.
+ * The intended usage of \p pitch is as a separate parameter of the allocation,
+ * used to compute addresses within the 2D array. Given the row and column of
+ * an array element of type \p T, the address is computed as:
+ * \code
+    T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column;
+   \endcode
+ *
+ * For allocations of 2D arrays, it is recommended that programmers consider
+ * performing pitch allocations using ::cudaMallocPitch(). Due to pitch
+ * alignment restrictions in the hardware, this is especially true if the
+ * application will be performing 2D memory copies between different regions
+ * of device memory (whether linear memory or CUDA arrays).
+ *
+ * \param devPtr - Pointer to allocated pitched device memory
+ * \param pitch  - Pitch for allocation
+ * \param width  - Requested pitched allocation width (in bytes)
+ * \param height - Requested pitched allocation height
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray,
+ * ::cudaHostAlloc,
+ * ::cuMemAllocPitch
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height);
+
+/**
+ * \brief Allocate an array on the device
+ *
+ * Allocates a CUDA array according to the ::cudaChannelFormatDesc structure
+ * \p desc and returns a handle to the new CUDA array in \p *array.
+ *
+ * The ::cudaChannelFormatDesc is defined as:
+ * \code
+    struct cudaChannelFormatDesc {
+        int x, y, z, w;
+        enum cudaChannelFormatKind f;
+    };
+    \endcode
+ * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
+ * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
+ *
+ * The \p flags parameter enables different options to be specified that affect
+ * the allocation, as follows.
+ * - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default array allocation
+ * - ::cudaArraySurfaceLoadStore: Allocates an array that can be read from or written to using a surface reference
+ * - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the array.
+ * - ::cudaArraySparse: Allocates a CUDA array without physical backing memory. The subregions within this sparse array
+ *   can later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. 
+ *   The physical backing memory must be allocated via ::cuMemCreate.
+ * - ::cudaArrayDeferredMapping: Allocates a CUDA array without physical backing memory. The entire array can 
+ *   later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. 
+ *   The physical backing memory must be allocated via ::cuMemCreate.
+ *
+ * \p width and \p height must meet certain size requirements. See ::cudaMalloc3DArray() for more details.
+ *
+ * \param array  - Pointer to allocated array in device memory
+ * \param desc   - Requested channel format
+ * \param width  - Requested array allocation width
+ * \param height - Requested array allocation height
+ * \param flags  - Requested properties of allocated array
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray,
+ * ::cudaHostAlloc,
+ * ::cuArrayCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height __dv(0), unsigned int flags __dv(0));
+
+/**
+ * \brief Frees memory on the device
+ *
+ * Frees the memory space pointed to by \p devPtr, which must have been
+ * returned by a previous call to one of the following memory allocation APIs -
+ * ::cudaMalloc(), ::cudaMallocPitch(), ::cudaMallocManaged(), ::cudaMallocAsync(),
+ * ::cudaMallocFromPoolAsync().
+ * 
+ * Note - This API will not perform any implicit synchronization when the pointer was
+ * allocated with ::cudaMallocAsync or ::cudaMallocFromPoolAsync. Callers must ensure
+ * that all accesses to the pointer have completed before invoking ::cudaFree. For
+ * best performance and memory reuse, users should use ::cudaFreeAsync to free memory
+ * allocated via the stream ordered memory allocator.
+ * For all other pointers, this API may perform implicit synchronization.
+ * 
+ * If ::cudaFree(\p devPtr) has already been called before,
+ * an error is returned. If \p devPtr is 0, no operation is performed.
+ * ::cudaFree() returns ::cudaErrorInvalidValue in case of failure.
+ *
+ * The device version of ::cudaFree cannot be used with a \p *devPtr
+ * allocated using the host API, and vice versa.
+ *
+ * \param devPtr - Device pointer to memory to free
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaMallocManaged, ::cudaMallocArray, ::cudaFreeArray, ::cudaMallocAsync, ::cudaMallocFromPoolAsync,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaFreeAsync,
+ * ::cudaHostAlloc,
+ * ::cuMemFree
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
+
+/**
+ * \brief Frees page-locked memory
+ *
+ * Frees the memory space pointed to by \p ptr, which must have been
+ * returned by a previous call to ::cudaMallocHost() or ::cudaHostAlloc().
+ *
+ * \param ptr - Pointer to memory to free
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray,
+ * ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaHostAlloc,
+ * ::cuMemFreeHost
+ */
+extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr);
+
+/**
+ * \brief Frees an array on the device
+ *
+ * Frees the CUDA array \p array, which must have been returned by a
+ * previous call to ::cudaMallocArray(). If \p array is 0,
+ * no operation is performed.
+ *
+ * \param array - Pointer to array to free
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::cuArrayDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaFreeArray(cudaArray_t array);
+
+/**
+ * \brief Frees a mipmapped array on the device
+ *
+ * Frees the CUDA mipmapped array \p mipmappedArray, which must have been 
+ * returned by a previous call to ::cudaMallocMipmappedArray(). If \p mipmappedArray
+ * is 0, no operation is performed.
+ *
+ * \param mipmappedArray - Pointer to mipmapped array to free
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::cuMipmappedArrayDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray);
+
+
+/**
+ * \brief Allocates page-locked memory on the host
+ *
+ * Allocates \p size bytes of host memory that is page-locked and accessible
+ * to the device. The driver tracks the virtual memory ranges allocated with
+ * this function and automatically accelerates calls to functions such as
+ * ::cudaMemcpy(). Since the memory can be accessed directly by the device, it
+ * can be read or written with much higher bandwidth than pageable memory
+ * obtained with functions such as ::malloc(). Allocating excessive amounts of
+ * pinned memory may degrade system performance, since it reduces the amount
+ * of memory available to the system for paging. As a result, this function is
+ * best used sparingly to allocate staging areas for data exchange between host
+ * and device.
+ *
+ * The \p flags parameter enables different options to be specified that affect
+ * the allocation, as follows.
+ * - ::cudaHostAllocDefault: This flag's value is defined to be 0 and causes
+ * ::cudaHostAlloc() to emulate ::cudaMallocHost().
+ * - ::cudaHostAllocPortable: The memory returned by this call will be
+ * considered as pinned memory by all CUDA contexts, not just the one that
+ * performed the allocation.
+ * - ::cudaHostAllocMapped: Maps the allocation into the CUDA address space.
+ * The device pointer to the memory may be obtained by calling
+ * ::cudaHostGetDevicePointer().
+ * - ::cudaHostAllocWriteCombined: Allocates the memory as write-combined (WC).
+ * WC memory can be transferred across the PCI Express bus more quickly on some
+ * system configurations, but cannot be read efficiently by most CPUs.  WC
+ * memory is a good option for buffers that will be written by the CPU and read
+ * by the device via mapped pinned memory or host->device transfers.
+ *
+ * All of these flags are orthogonal to one another: a developer may allocate
+ * memory that is portable, mapped and/or write-combined with no restrictions.
+ *
+ * In order for the ::cudaHostAllocMapped flag to have any effect, the CUDA context
+ * must support the ::cudaDeviceMapHost flag, which can be checked via
+ * ::cudaGetDeviceFlags(). The ::cudaDeviceMapHost flag is implicitly set for
+ * contexts created via the runtime API.
+ *
+ * The ::cudaHostAllocMapped flag may be specified on CUDA contexts for devices
+ * that do not support mapped pinned memory. The failure is deferred to
+ * ::cudaHostGetDevicePointer() because the memory may be mapped into other
+ * CUDA contexts via the ::cudaHostAllocPortable flag.
+ *
+ * Memory allocated by this function must be freed with ::cudaFreeHost().
+ *
+ * \param pHost - Pointer to allocated page-locked host memory
+ * \param size  - Requested allocation size in bytes
+ * \param flags - Requested properties of allocated memory
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaSetDeviceFlags,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost,
+ * ::cudaGetDeviceFlags,
+ * ::cuMemHostAlloc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size, unsigned int flags);
+
+/**
+ * \brief Registers an existing host memory range for use by CUDA
+ *
+ * Page-locks the memory range specified by \p ptr and \p size and maps it
+ * for the device(s) as specified by \p flags. This memory range also is added
+ * to the same tracking mechanism as ::cudaHostAlloc() to automatically accelerate
+ * calls to functions such as ::cudaMemcpy(). Since the memory can be accessed 
+ * directly by the device, it can be read or written with much higher bandwidth 
+ * than pageable memory that has not been registered.  Page-locking excessive
+ * amounts of memory may degrade system performance, since it reduces the amount
+ * of memory available to the system for paging. As a result, this function is
+ * best used sparingly to register staging areas for data exchange between
+ * host and device.
+ * 
+ * On systems where ::pageableMemoryAccessUsesHostPageTables is true, ::cudaHostRegister 
+ * will not page-lock the memory range specified by \p ptr but only populate 
+ * unpopulated pages.
+ *
+ * ::cudaHostRegister is supported only on I/O coherent devices that have a non-zero
+ * value for the device attribute ::cudaDevAttrHostRegisterSupported.
+ *
+ * The \p flags parameter enables different options to be specified that
+ * affect the allocation, as follows.
+ *
+ * - ::cudaHostRegisterDefault: On a system with unified virtual addressing,
+ *   the memory will be both mapped and portable.  On a system with no unified
+ *   virtual addressing, the memory will be neither mapped nor portable.
+ *
+ * - ::cudaHostRegisterPortable: The memory returned by this call will be
+ *   considered as pinned memory by all CUDA contexts, not just the one that
+ *   performed the allocation.
+ *
+ * - ::cudaHostRegisterMapped: Maps the allocation into the CUDA address
+ *   space. The device pointer to the memory may be obtained by calling
+ *   ::cudaHostGetDevicePointer().
+ *
+ * - ::cudaHostRegisterIoMemory: The passed memory pointer is treated as
+ *   pointing to some memory-mapped I/O space, e.g. belonging to a
+ *   third-party PCIe device, and it will be marked as non cache-coherent and
+ *   contiguous.
+ *
+ * - ::cudaHostRegisterReadOnly: The passed memory pointer is treated as
+ *   pointing to memory that is considered read-only by the device.  On
+ *   platforms without ::cudaDevAttrPageableMemoryAccessUsesHostPageTables, this
+ *   flag is required in order to register memory mapped to the CPU as
+ *   read-only.  Support for the use of this flag can be queried from the device
+ *   attribute ::cudaDevAttrHostRegisterReadOnlySupported.  Using this flag with
+ *   a current context associated with a device that does not have this attribute
+ *   set will cause ::cudaHostRegister to error with cudaErrorNotSupported.
+ *
+ * All of these flags are orthogonal to one another: a developer may page-lock
+ * memory that is portable or mapped with no restrictions.
+ *
+ * The CUDA context must have been created with the ::cudaDeviceMapHost flag in
+ * order for the ::cudaHostRegisterMapped flag to have any effect.
+ *
+ * The ::cudaHostRegisterMapped flag may be specified on CUDA contexts for
+ * devices that do not support mapped pinned memory. The failure is deferred
+ * to ::cudaHostGetDevicePointer() because the memory may be mapped into
+ * other CUDA contexts via the ::cudaHostRegisterPortable flag.
+ *
+ * For devices that have a non-zero value for the device attribute
+ * ::cudaDevAttrCanUseHostPointerForRegisteredMem, the memory
+ * can also be accessed from the device using the host pointer \p ptr.
+ * The device pointer returned by ::cudaHostGetDevicePointer() may or may not
+ * match the original host pointer \p ptr and depends on the devices visible to the
+ * application. If all devices visible to the application have a non-zero value for the
+ * device attribute, the device pointer returned by ::cudaHostGetDevicePointer()
+ * will match the original pointer \p ptr. If any device visible to the application
+ * has a zero value for the device attribute, the device pointer returned by
+ * ::cudaHostGetDevicePointer() will not match the original host pointer \p ptr,
+ * but it will be suitable for use on all devices provided Unified Virtual Addressing
+ * is enabled. In such systems, it is valid to access the memory using either pointer
+ * on devices that have a non-zero value for the device attribute. Note however that
+ * such devices should access the memory using only one of the two pointers and not both.
+ *
+ * The memory page-locked by this function must be unregistered with ::cudaHostUnregister().
+ *
+ * \param ptr   - Host pointer to memory to page-lock
+ * \param size  - Size in bytes of the address range to page-lock
+ * \param flags - Flags for allocation request
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorHostMemoryAlreadyRegistered,
+ * ::cudaErrorNotSupported
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaHostUnregister, ::cudaHostGetFlags, ::cudaHostGetDevicePointer,
+ * ::cuMemHostRegister
+ */
+extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size, unsigned int flags);
+
+/**
+ * \brief Unregisters a memory range that was registered with cudaHostRegister
+ *
+ * Unmaps the memory range whose base address is specified by \p ptr, and makes
+ * it pageable again.
+ *
+ * The base address must be the same one specified to ::cudaHostRegister().
+ *
+ * \param ptr - Host pointer to memory to unregister
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorHostMemoryNotRegistered
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaHostRegister,
+ * ::cuMemHostUnregister
+ */
+extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr);
+
+/**
+ * \brief Passes back device pointer of mapped host memory allocated by
+ * cudaHostAlloc or registered by cudaHostRegister
+ *
+ * Passes back the device pointer corresponding to the mapped, pinned host
+ * buffer allocated by ::cudaHostAlloc() or registered by ::cudaHostRegister().
+ *
+ * ::cudaHostGetDevicePointer() will fail if the ::cudaDeviceMapHost flag was
+ * not specified before deferred context creation occurred, or if called on a
+ * device that does not support mapped, pinned memory.
+ *
+ * For devices that have a non-zero value for the device attribute
+ * ::cudaDevAttrCanUseHostPointerForRegisteredMem, the memory
+ * can also be accessed from the device using the host pointer \p pHost.
+ * The device pointer returned by ::cudaHostGetDevicePointer() may or may not
+ * match the original host pointer \p pHost and depends on the devices visible to the
+ * application. If all devices visible to the application have a non-zero value for the
+ * device attribute, the device pointer returned by ::cudaHostGetDevicePointer()
+ * will match the original pointer \p pHost. If any device visible to the application
+ * has a zero value for the device attribute, the device pointer returned by
+ * ::cudaHostGetDevicePointer() will not match the original host pointer \p pHost,
+ * but it will be suitable for use on all devices provided Unified Virtual Addressing
+ * is enabled. In such systems, it is valid to access the memory using either pointer
+ * on devices that have a non-zero value for the device attribute. Note however that
+ * such devices should access the memory using only one of the two pointers and not both.
+ *
+ * \p flags is reserved for future releases.  For now, it must be set to 0.
+ *
+ * \param pDevice - Returned device pointer for mapped memory
+ * \param pHost   - Requested host pointer mapping
+ * \param flags   - Flags for extensions (must be 0 for now)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaSetDeviceFlags, ::cudaHostAlloc,
+ * ::cuMemHostGetDevicePointer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags);
+
+/**
+ * \brief Passes back flags used to allocate pinned host memory allocated by
+ * cudaHostAlloc
+ *
+ * ::cudaHostGetFlags() will fail if the input pointer does not
+ * reside in an address range allocated by ::cudaHostAlloc().
+ *
+ * \param pFlags - Returned flags word
+ * \param pHost - Host pointer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaHostAlloc,
+ * ::cuMemHostGetFlags
+ */
+extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags, void *pHost);
+
+/**
+ * \brief Allocates logical 1D, 2D, or 3D memory objects on the device
+ *
+ * Allocates at least \p width * \p height * \p depth bytes of linear memory
+ * on the device and returns a ::cudaPitchedPtr in which \p ptr is a pointer
+ * to the allocated memory. The function may pad the allocation to ensure
+ * hardware alignment requirements are met. The pitch returned in the \p pitch
+ * field of \p pitchedDevPtr is the width in bytes of the allocation.
+ *
+ * The returned ::cudaPitchedPtr contains additional fields \p xsize and
+ * \p ysize, the logical width and height of the allocation, which are
+ * equivalent to the \p width and \p height \p extent parameters provided by
+ * the programmer during allocation.
+ *
+ * For allocations of 2D and 3D objects, it is highly recommended that
+ * programmers perform allocations using ::cudaMalloc3D() or
+ * ::cudaMallocPitch(). Due to alignment restrictions in the hardware, this is
+ * especially true if the application will be performing memory copies
+ * involving 2D or 3D objects (whether linear memory or CUDA arrays).
+ *
+ * \param pitchedDevPtr  - Pointer to allocated pitched device memory
+ * \param extent         - Requested allocation size (\p width field in bytes)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMemcpy3D, ::cudaMemset3D,
+ * ::cudaMalloc3DArray, ::cudaMallocArray, ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc, ::make_cudaPitchedPtr, ::make_cudaExtent,
+ * ::cuMemAllocPitch
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMalloc3D(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent);
+
+/**
+ * \brief Allocate an array on the device
+ *
+ * Allocates a CUDA array according to the ::cudaChannelFormatDesc structure
+ * \p desc and returns a handle to the new CUDA array in \p *array.
+ *
+ * The ::cudaChannelFormatDesc is defined as:
+ * \code
+    struct cudaChannelFormatDesc {
+        int x, y, z, w;
+        enum cudaChannelFormatKind f;
+    };
+    \endcode
+ * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
+ * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
+ *
+ * ::cudaMalloc3DArray() can allocate the following:
+ *
+ * - A 1D array is allocated if the height and depth extents are both zero.
+ * - A 2D array is allocated if only the depth extent is zero.
+ * - A 3D array is allocated if all three extents are non-zero.
+ * - A 1D layered CUDA array is allocated if only the height extent is zero and
+ * the cudaArrayLayered flag is set. Each layer is a 1D array. The number of layers is 
+ * determined by the depth extent.
+ * - A 2D layered CUDA array is allocated if all three extents are non-zero and 
+ * the cudaArrayLayered flag is set. Each layer is a 2D array. The number of layers is 
+ * determined by the depth extent.
+ * - A cubemap CUDA array is allocated if all three extents are non-zero and the
+ * cudaArrayCubemap flag is set. Width must be equal to height, and depth must be six. A cubemap is
+ * a special type of 2D layered CUDA array, where the six layers represent the six faces of a cube. 
+ * The order of the six layers in memory is the same as that listed in ::cudaGraphicsCubeFace.
+ * - A cubemap layered CUDA array is allocated if all three extents are non-zero, and both,
+ * cudaArrayCubemap and cudaArrayLayered flags are set. Width must be equal to height, and depth must be 
+ * a multiple of six. A cubemap layered CUDA array is a special type of 2D layered CUDA array that consists 
+ * of a collection of cubemaps. The first six layers represent the first cubemap, the next six layers form 
+ * the second cubemap, and so on.
+ *
+ *
+ * The \p flags parameter enables different options to be specified that affect
+ * the allocation, as follows.
+ * - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default array allocation
+ * - ::cudaArrayLayered: Allocates a layered CUDA array, with the depth extent indicating the number of layers
+ * - ::cudaArrayCubemap: Allocates a cubemap CUDA array. Width must be equal to height, and depth must be six.
+ *   If the cudaArrayLayered flag is also set, depth must be a multiple of six.
+ * - ::cudaArraySurfaceLoadStore: Allocates a CUDA array that could be read from or written to using a surface
+ *   reference.
+ * - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the CUDA 
+ *   array. Texture gather can only be performed on 2D CUDA arrays.
+ * - ::cudaArraySparse: Allocates a CUDA array without physical backing memory. The subregions within this sparse array 
+ *   can later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. This flag can only be used for 
+ *   creating 2D, 3D or 2D layered sparse CUDA arrays. The physical backing memory must be allocated via ::cuMemCreate.
+ * - ::cudaArrayDeferredMapping: Allocates a CUDA array without physical backing memory. The entire array can
+ *   later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. The physical backing memory must be allocated
+ *   via ::cuMemCreate.
+ *
+ * The width, height and depth extents must meet certain size requirements as listed in the following table.
+ * All values are specified in elements.
+ *
+ * Note that 2D CUDA arrays have different size requirements if the ::cudaArrayTextureGather flag is set. In that
+ * case, the valid range for (width, height, depth) is ((1,maxTexture2DGather[0]), (1,maxTexture2DGather[1]), 0).
+ *
+ * \xmlonly
+ * <table outputclass="xmlonly">
+ * <tgroup cols="3" colsep="1" rowsep="1">
+ * <colspec colname="c1" colwidth="1.0*"/>
+ * <colspec colname="c2" colwidth="3.0*"/>
+ * <colspec colname="c3" colwidth="3.0*"/>
+ * <thead>
+ * <row>
+ * <entry>CUDA array type</entry>
+ * <entry>Valid extents that must always be met {(width range in elements),
+ * (height range), (depth range)}</entry>
+ * <entry>Valid extents with cudaArraySurfaceLoadStore set {(width range in
+ * elements), (height range), (depth range)}</entry>
+ * </row>
+ * </thead>
+ * <tbody>
+ * <row>
+ * <entry>1D</entry>
+ * <entry>{ (1,maxTexture1D), 0, 0 }</entry>
+ * <entry>{ (1,maxSurface1D), 0, 0 }</entry>
+ * </row>
+ * <row>
+ * <entry>2D</entry>
+ * <entry>{ (1,maxTexture2D[0]), (1,maxTexture2D[1]), 0 }</entry>
+ * <entry>{ (1,maxSurface2D[0]), (1,maxSurface2D[1]), 0 }</entry>
+ * </row>
+ * <row>
+ * <entry>3D</entry>
+ * <entry>{ (1,maxTexture3D[0]), (1,maxTexture3D[1]), (1,maxTexture3D[2]) }
+ * OR { (1,maxTexture3DAlt[0]), (1,maxTexture3DAlt[1]),
+ * (1,maxTexture3DAlt[2]) }</entry>
+ * <entry>{ (1,maxSurface3D[0]), (1,maxSurface3D[1]), (1,maxSurface3D[2]) }</entry>
+ * </row>
+ * <row>
+ * <entry>1D Layered</entry>
+ * <entry>{ (1,maxTexture1DLayered[0]), 0, (1,maxTexture1DLayered[1]) }</entry>
+ * <entry>{ (1,maxSurface1DLayered[0]), 0, (1,maxSurface1DLayered[1]) }</entry>
+ * </row>
+ * <row>
+ * <entry>2D Layered</entry>
+ * <entry>{ (1,maxTexture2DLayered[0]), (1,maxTexture2DLayered[1]),
+ * (1,maxTexture2DLayered[2]) }</entry>
+ * <entry>{ (1,maxSurface2DLayered[0]), (1,maxSurface2DLayered[1]),
+ * (1,maxSurface2DLayered[2]) }</entry>
+ * </row>
+ * <row>
+ * <entry>Cubemap</entry>
+ * <entry>{ (1,maxTextureCubemap), (1,maxTextureCubemap), 6 }</entry>
+ * <entry>{ (1,maxSurfaceCubemap), (1,maxSurfaceCubemap), 6 }</entry>
+ * </row>
+ * <row>
+ * <entry>Cubemap Layered</entry>
+ * <entry>{ (1,maxTextureCubemapLayered[0]), (1,maxTextureCubemapLayered[0]),
+ * (1,maxTextureCubemapLayered[1]) }</entry>
+ * <entry>{ (1,maxSurfaceCubemapLayered[0]), (1,maxSurfaceCubemapLayered[0]),
+ * (1,maxSurfaceCubemapLayered[1]) }</entry>
+ * </row>
+ * </tbody>
+ * </tgroup>
+ * </table>
+ * \endxmlonly
+ *
+ * \param array  - Pointer to allocated array in device memory
+ * \param desc   - Requested channel format
+ * \param extent - Requested allocation size (\p width field in elements)
+ * \param flags  - Flags for extensions
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree,
+ * ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::make_cudaExtent,
+ * ::cuArray3DCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int flags __dv(0));
+
+/**
+ * \brief Allocate a mipmapped array on the device
+ *
+ * Allocates a CUDA mipmapped array according to the ::cudaChannelFormatDesc structure
+ * \p desc and returns a handle to the new CUDA mipmapped array in \p *mipmappedArray.
+ * \p numLevels specifies the number of mipmap levels to be allocated. This value is
+ * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))].
+ *
+ * The ::cudaChannelFormatDesc is defined as:
+ * \code
+    struct cudaChannelFormatDesc {
+        int x, y, z, w;
+        enum cudaChannelFormatKind f;
+    };
+    \endcode
+ * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
+ * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
+ *
+ * ::cudaMallocMipmappedArray() can allocate the following:
+ *
+ * - A 1D mipmapped array is allocated if the height and depth extents are both zero.
+ * - A 2D mipmapped array is allocated if only the depth extent is zero.
+ * - A 3D mipmapped array is allocated if all three extents are non-zero.
+ * - A 1D layered CUDA mipmapped array is allocated if only the height extent is zero and
+ * the cudaArrayLayered flag is set. Each layer is a 1D mipmapped array. The number of layers is 
+ * determined by the depth extent.
+ * - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and 
+ * the cudaArrayLayered flag is set. Each layer is a 2D mipmapped array. The number of layers is 
+ * determined by the depth extent.
+ * - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the
+ * cudaArrayCubemap flag is set. Width must be equal to height, and depth must be six.
+ * The order of the six layers in memory is the same as that listed in ::cudaGraphicsCubeFace.
+ * - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, and both,
+ * cudaArrayCubemap and cudaArrayLayered flags are set. Width must be equal to height, and depth must be 
+ * a multiple of six. A cubemap layered CUDA mipmapped array is a special type of 2D layered CUDA mipmapped
+ * array that consists of a collection of cubemap mipmapped arrays. The first six layers represent the 
+ * first cubemap mipmapped array, the next six layers form the second cubemap mipmapped array, and so on.
+ *
+ *
+ * The \p flags parameter enables different options to be specified that affect
+ * the allocation, as follows.
+ * - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default mipmapped array allocation
+ * - ::cudaArrayLayered: Allocates a layered CUDA mipmapped array, with the depth extent indicating the number of layers
+ * - ::cudaArrayCubemap: Allocates a cubemap CUDA mipmapped array. Width must be equal to height, and depth must be six.
+ *   If the cudaArrayLayered flag is also set, depth must be a multiple of six.
+ * - ::cudaArraySurfaceLoadStore: This flag indicates that individual mipmap levels of the CUDA mipmapped array 
+ *   will be read from or written to using a surface reference.
+ * - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the CUDA 
+ *   array. Texture gather can only be performed on 2D CUDA mipmapped arrays, and the gather operations are
+ *   performed only on the most detailed mipmap level.
+ * - ::cudaArraySparse: Allocates a CUDA mipmapped array without physical backing memory. The subregions within this sparse array
+ *   can later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. This flag can only be used for creating 
+ *   2D, 3D or 2D layered sparse CUDA mipmapped arrays. The physical backing memory must be allocated via ::cuMemCreate.
+ * - ::cudaArrayDeferredMapping: Allocates a CUDA mipmapped array without physical backing memory. The entire array can
+ *   later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. The physical backing memory must be allocated
+ *   via ::cuMemCreate.
+ *
+ * The width, height and depth extents must meet certain size requirements as listed in the following table.
+ * All values are specified in elements.
+ *
+ * \xmlonly
+ * <table outputclass="xmlonly">
+ * <tgroup cols="3" colsep="1" rowsep="1">
+ * <colspec colname="c1" colwidth="1.0*"/>
+ * <colspec colname="c2" colwidth="3.0*"/>
+ * <colspec colname="c3" colwidth="3.0*"/>
+ * <thead>
+ * <row>
+ * <entry>CUDA array type</entry>
+ * <entry>Valid extents that must always be met {(width range in elements),
+ * (height range), (depth range)}</entry>
+ * <entry>Valid extents with cudaArraySurfaceLoadStore set {(width range in
+ * elements), (height range), (depth range)}</entry>
+ * </row>
+ * </thead>
+ * <tbody>
+ * <row>
+ * <entry>1D</entry>
+ * <entry>{ (1,maxTexture1DMipmap), 0, 0 }</entry>
+ * <entry>{ (1,maxSurface1D), 0, 0 }</entry>
+ * </row>
+ * <row>
+ * <entry>2D</entry>
+ * <entry>{ (1,maxTexture2DMipmap[0]), (1,maxTexture2DMipmap[1]), 0 }</entry>
+ * <entry>{ (1,maxSurface2D[0]), (1,maxSurface2D[1]), 0 }</entry>
+ * </row>
+ * <row>
+ * <entry>3D</entry>
+ * <entry>{ (1,maxTexture3D[0]), (1,maxTexture3D[1]), (1,maxTexture3D[2]) }
+ * OR { (1,maxTexture3DAlt[0]), (1,maxTexture3DAlt[1]),
+ * (1,maxTexture3DAlt[2]) }</entry>
+ * <entry>{ (1,maxSurface3D[0]), (1,maxSurface3D[1]), (1,maxSurface3D[2]) }</entry>
+ * </row>
+ * <row>
+ * <entry>1D Layered</entry>
+ * <entry>{ (1,maxTexture1DLayered[0]), 0, (1,maxTexture1DLayered[1]) }</entry>
+ * <entry>{ (1,maxSurface1DLayered[0]), 0, (1,maxSurface1DLayered[1]) }</entry>
+ * </row>
+ * <row>
+ * <entry>2D Layered</entry>
+ * <entry>{ (1,maxTexture2DLayered[0]), (1,maxTexture2DLayered[1]),
+ * (1,maxTexture2DLayered[2]) }</entry>
+ * <entry>{ (1,maxSurface2DLayered[0]), (1,maxSurface2DLayered[1]),
+ * (1,maxSurface2DLayered[2]) }</entry>
+ * </row>
+ * <row>
+ * <entry>Cubemap</entry>
+ * <entry>{ (1,maxTextureCubemap), (1,maxTextureCubemap), 6 }</entry>
+ * <entry>{ (1,maxSurfaceCubemap), (1,maxSurfaceCubemap), 6 }</entry>
+ * </row>
+ * <row>
+ * <entry>Cubemap Layered</entry>
+ * <entry>{ (1,maxTextureCubemapLayered[0]), (1,maxTextureCubemapLayered[0]),
+ * (1,maxTextureCubemapLayered[1]) }</entry>
+ * <entry>{ (1,maxSurfaceCubemapLayered[0]), (1,maxSurfaceCubemapLayered[0]),
+ * (1,maxSurfaceCubemapLayered[1]) }</entry>
+ * </row>
+ * </tbody>
+ * </tgroup>
+ * </table>
+ * \endxmlonly
+ *
+ * \param mipmappedArray  - Pointer to allocated mipmapped array in device memory
+ * \param desc            - Requested channel format
+ * \param extent          - Requested allocation size (\p width field in elements)
+ * \param numLevels       - Number of mipmap levels to allocate
+ * \param flags           - Flags for extensions
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree,
+ * ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::make_cudaExtent,
+ * ::cuMipmappedArrayCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMallocMipmappedArray(cudaMipmappedArray_t *mipmappedArray, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int numLevels, unsigned int flags __dv(0));
+
+/**
+ * \brief Gets a mipmap level of a CUDA mipmapped array
+ *
+ * Returns in \p *levelArray a CUDA array that represents a single mipmap level
+ * of the CUDA mipmapped array \p mipmappedArray.
+ *
+ * If \p level is greater than the maximum number of levels in this mipmapped array,
+ * ::cudaErrorInvalidValue is returned.
+ *
+ * If \p mipmappedArray is NULL,
+ * ::cudaErrorInvalidResourceHandle is returned.
+ *
+ * \param levelArray     - Returned mipmap level CUDA array
+ * \param mipmappedArray - CUDA mipmapped array
+ * \param level          - Mipmap level
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree,
+ * ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::make_cudaExtent,
+ * ::cuMipmappedArrayGetLevel
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetMipmappedArrayLevel(cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray, unsigned int level);
+
+/**
+ * \brief Copies data between 3D objects
+ *
+\code
+struct cudaExtent {
+  size_t width;
+  size_t height;
+  size_t depth;
+};
+struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d);
+
+struct cudaPos {
+  size_t x;
+  size_t y;
+  size_t z;
+};
+struct cudaPos make_cudaPos(size_t x, size_t y, size_t z);
+
+struct cudaMemcpy3DParms {
+  cudaArray_t           srcArray;
+  struct cudaPos        srcPos;
+  struct cudaPitchedPtr srcPtr;
+  cudaArray_t           dstArray;
+  struct cudaPos        dstPos;
+  struct cudaPitchedPtr dstPtr;
+  struct cudaExtent     extent;
+  enum cudaMemcpyKind   kind;
+};
+\endcode
+ *
+ * ::cudaMemcpy3D() copies data between two 3D objects. The source and
+ * destination objects may be in either host memory, device memory, or a CUDA
+ * array. The source, destination, extent, and kind of copy performed are
+ * specified by the ::cudaMemcpy3DParms struct which should be initialized to
+ * zero before use:
+\code
+cudaMemcpy3DParms myParms = {0};
+\endcode
+ *
+ * The struct passed to ::cudaMemcpy3D() must specify one of \p srcArray or
+ * \p srcPtr and one of \p dstArray or \p dstPtr. Passing more than one
+ * non-zero source or destination will cause ::cudaMemcpy3D() to return an
+ * error.
+ *
+ * The \p srcPos and \p dstPos fields are optional offsets into the source and
+ * destination objects and are defined in units of each object's elements. The
+ * element for a host or device pointer is assumed to be <b>unsigned char</b>.
+ *
+ * The \p extent field defines the dimensions of the transferred area in
+ * elements. If a CUDA array is participating in the copy, the extent is
+ * defined in terms of that array's elements. If no CUDA array is
+ * participating in the copy then the extents are defined in elements of
+ * <b>unsigned char</b>.
+ *
+ * The \p kind field defines the direction of the copy. It must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * For ::cudaMemcpyHostToHost or ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost
+ * passed as kind and cudaArray type passed as source or destination, if the kind
+ * implies cudaArray type to be present on the host, ::cudaMemcpy3D() will
+ * disregard that implication and silently correct the kind based on the fact that
+ * cudaArray type can only be present on the device.
+ *
+ * If the source and destination are both arrays, ::cudaMemcpy3D() will return
+ * an error if they do not have the same element size.
+ *
+ * The source and destination object may not overlap. If overlapping source
+ * and destination objects are specified, undefined behavior will result.
+ *
+ * The source object must entirely contain the region defined by \p srcPos
+ * and \p extent. The destination object must entirely contain the region
+ * defined by \p dstPos and \p extent.
+ *
+ * ::cudaMemcpy3D() returns an error if the pitch of \p srcPtr or \p dstPtr
+ * exceeds the maximum allowed. The pitch of a ::cudaPitchedPtr allocated
+ * with ::cudaMalloc3D() will always be valid.
+ *
+ * \param p - 3D memory copy parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaMemset3D, ::cudaMemcpy3DAsync,
+ * ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::make_cudaExtent, ::make_cudaPos,
+ * ::cuMemcpy3D
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p);
+
+/**
+ * \brief Copies memory between devices
+ *
+ * Perform a 3D memory copy according to the parameters specified in
+ * \p p.  See the definition of the ::cudaMemcpy3DPeerParms structure
+ * for documentation of its parameters.
+ *
+ * Note that this function is synchronous with respect to the host only if
+ * the source or destination of the transfer is host memory.  Note also 
+ * that this copy is serialized with respect to all pending and future 
+ * asynchronous work in the current device, the copy's source device,
+ * and the copy's destination device (use ::cudaMemcpy3DPeerAsync to avoid 
+ * this synchronization).
+ *
+ * \param p - Parameters for the memory copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidPitchValue
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync,
+ * ::cudaMemcpy3DPeerAsync,
+ * ::cuMemcpy3DPeer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p);
+
+/**
+ * \brief Copies data between 3D objects
+ *
+\code
+struct cudaExtent {
+  size_t width;
+  size_t height;
+  size_t depth;
+};
+struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d);
+
+struct cudaPos {
+  size_t x;
+  size_t y;
+  size_t z;
+};
+struct cudaPos make_cudaPos(size_t x, size_t y, size_t z);
+
+struct cudaMemcpy3DParms {
+  cudaArray_t           srcArray;
+  struct cudaPos        srcPos;
+  struct cudaPitchedPtr srcPtr;
+  cudaArray_t           dstArray;
+  struct cudaPos        dstPos;
+  struct cudaPitchedPtr dstPtr;
+  struct cudaExtent     extent;
+  enum cudaMemcpyKind   kind;
+};
+\endcode
+ *
+ * ::cudaMemcpy3DAsync() copies data between two 3D objects. The source and
+ * destination objects may be in either host memory, device memory, or a CUDA
+ * array. The source, destination, extent, and kind of copy performed are
+ * specified by the ::cudaMemcpy3DParms struct which should be initialized to
+ * zero before use:
+\code
+cudaMemcpy3DParms myParms = {0};
+\endcode
+ *
+ * The struct passed to ::cudaMemcpy3DAsync() must specify one of \p srcArray
+ * or \p srcPtr and one of \p dstArray or \p dstPtr. Passing more than one
+ * non-zero source or destination will cause ::cudaMemcpy3DAsync() to return an
+ * error.
+ *
+ * The \p srcPos and \p dstPos fields are optional offsets into the source and
+ * destination objects and are defined in units of each object's elements. The
+ * element for a host or device pointer is assumed to be <b>unsigned char</b>.
+ * For CUDA arrays, positions must be in the range [0, 2048) for any
+ * dimension.
+ *
+ * The \p extent field defines the dimensions of the transferred area in
+ * elements. If a CUDA array is participating in the copy, the extent is
+ * defined in terms of that array's elements. If no CUDA array is
+ * participating in the copy then the extents are defined in elements of
+ * <b>unsigned char</b>.
+ *
+ * The \p kind field defines the direction of the copy. It must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * For ::cudaMemcpyHostToHost or ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost
+ * passed as kind and cudaArray type passed as source or destination, if the kind
+ * implies cudaArray type to be present on the host, ::cudaMemcpy3DAsync() will
+ * disregard that implication and silently correct the kind based on the fact that
+ * cudaArray type can only be present on the device.
+ *
+ * If the source and destination are both arrays, ::cudaMemcpy3DAsync() will
+ * return an error if they do not have the same element size.
+ *
+ * The source and destination object may not overlap. If overlapping source
+ * and destination objects are specified, undefined behavior will result.
+ *
+ * The source object must entirely contain the region defined by \p srcPos
+ * and \p extent. The destination object must entirely contain the region
+ * defined by \p dstPos and \p extent.
+ *
+ * ::cudaMemcpy3DAsync() returns an error if the pitch of \p srcPtr or
+ * \p dstPtr exceeds the maximum allowed. The pitch of a
+ * ::cudaPitchedPtr allocated with ::cudaMalloc3D() will always be valid.
+ *
+ * ::cudaMemcpy3DAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally
+ * be associated to a stream by passing a non-zero \p stream argument. If
+ * \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream
+ * is non-zero, the copy may overlap with operations in other streams.
+ *
+ * The device version of this function only handles device to device copies and
+ * cannot be given local or shared pointers.
+ *
+ * \param p      - 3D memory copy parameters
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaMemset3D, ::cudaMemcpy3D,
+ * ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::make_cudaExtent, ::make_cudaPos,
+ * ::cuMemcpy3DAsync
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies memory between devices asynchronously.
+ *
+ * Perform a 3D memory copy according to the parameters specified in
+ * \p p.  See the definition of the ::cudaMemcpy3DPeerParms structure
+ * for documentation of its parameters.
+ *
+ * \param p      - Parameters for the memory copy
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidPitchValue
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync,
+ * ::cudaMemcpy3DPeerAsync,
+ * ::cuMemcpy3DPeerAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Gets free and total device memory
+ *
+ * Returns in \p *total the total amount of memory available to the current context.
+ * Returns in \p *free the amount of memory on the device that is free according to the OS.
+ * CUDA is not guaranteed to be able to allocate all of the memory that the OS reports as free.
+ * In a multi-tenant situation, the free estimate returned is prone to a race condition: an
+ * allocation or free performed by a different process, or by a different thread in the same
+ * process, between the time the free memory was estimated and the time it was reported will
+ * cause the reported free value to deviate from the actual free memory.
+ *
+ * The integrated GPU on Tegra shares memory with the CPU and other components
+ * of the SoC. The free and total values returned by the API exclude
+ * the SWAP memory space maintained by the OS on some platforms.
+ * The OS may move some of the memory pages into swap area as the GPU or
+ * CPU allocate or access memory. See Tegra app note on how to calculate
+ * total and free memory on Tegra.
+ *
+ * \param free  - Returned free memory in bytes
+ * \param total - Returned total memory in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorLaunchFailure
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cuMemGetInfo
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free, size_t *total);
+
+/**
+ * \brief Gets info about the specified cudaArray
+ * 
+ * Returns in \p *desc, \p *extent and \p *flags respectively, the type, shape 
+ * and flags of \p array.
+ *
+ * Any of \p *desc, \p *extent and \p *flags may be specified as NULL.
+ *
+ * \param desc   - Returned array type
+ * \param extent - Returned array shape. 2D arrays will have depth of zero
+ * \param flags  - Returned array flags
+ * \param array  - The ::cudaArray to get info for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cuArrayGetDescriptor,
+ * ::cuArray3DGetDescriptor
+ */
+extern __host__ cudaError_t CUDARTAPI cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent, unsigned int *flags, cudaArray_t array);
+
+/**
+ * \brief Gets a CUDA array plane from a CUDA array
+ *
+ * Returns in \p pPlaneArray a CUDA array that represents a single format plane
+ * of the CUDA array \p hArray.
+ *
+ * If \p planeIdx is greater than the maximum number of planes in this array or if the array does
+ * not have a multi-planar format (e.g. ::cudaChannelFormatKindNV12), then ::cudaErrorInvalidValue is returned.
+ *
+ * Note that if the \p hArray has format ::cudaChannelFormatKindNV12, then passing in 0 for \p planeIdx returns
+ * a CUDA array of the same size as \p hArray but with one 8-bit channel and ::cudaChannelFormatKindUnsigned as its format kind.
+ * If 1 is passed for \p planeIdx, then the returned CUDA array has half the height and width
+ * of \p hArray with two 8-bit channels and ::cudaChannelFormatKindUnsigned as its format kind.
+ *
+ * \param pPlaneArray   - Returned CUDA array referenced by the \p planeIdx
+ * \param hArray        - CUDA array
+ * \param planeIdx      - Plane index
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ *
+ * \sa
+ * ::cuArrayGetPlane
+ */
+extern __host__ cudaError_t CUDARTAPI cudaArrayGetPlane(cudaArray_t *pPlaneArray, cudaArray_t hArray, unsigned int planeIdx);
+
+/**
+ * \brief Returns the memory requirements of a CUDA array
+ *
+ * Returns the memory requirements of a CUDA array in \p memoryRequirements.
+ * If the CUDA array is not allocated with flag ::cudaArrayDeferredMapping,
+ * ::cudaErrorInvalidValue will be returned.
+ *
+ * The returned value in ::cudaArrayMemoryRequirements::size
+ * represents the total size of the CUDA array.
+ * The returned value in ::cudaArrayMemoryRequirements::alignment
+ * represents the alignment necessary for mapping the CUDA array.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \param[out] memoryRequirements - Pointer to ::cudaArrayMemoryRequirements
+ * \param[in] array - CUDA array to get the memory requirements of
+ * \param[in] device - Device to get the memory requirements for
+ * \sa ::cudaMipmappedArrayGetMemoryRequirements
+ */
+extern __host__ cudaError_t CUDARTAPI cudaArrayGetMemoryRequirements(struct cudaArrayMemoryRequirements  *memoryRequirements, cudaArray_t array, int device);
+
+/**
+ * \brief Returns the memory requirements of a CUDA mipmapped array
+ *
+ * Returns the memory requirements of a CUDA mipmapped array in \p memoryRequirements.
+ * If the CUDA mipmapped array is not allocated with flag ::cudaArrayDeferredMapping,
+ * ::cudaErrorInvalidValue will be returned.
+ *
+ * The returned value in ::cudaArrayMemoryRequirements::size
+ * represents the total size of the CUDA mipmapped array.
+ * The returned value in ::cudaArrayMemoryRequirements::alignment
+ * represents the alignment necessary for mapping the CUDA mipmapped
+ * array.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \param[out] memoryRequirements - Pointer to ::cudaArrayMemoryRequirements
+ * \param[in] mipmap - CUDA mipmapped array to get the memory requirements of
+ * \param[in] device - Device to get the memory requirements for
+ * \sa ::cudaArrayGetMemoryRequirements
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMipmappedArrayGetMemoryRequirements(struct cudaArrayMemoryRequirements *memoryRequirements, cudaMipmappedArray_t mipmap, int device);
+
+/**
+ * \brief Returns the layout properties of a sparse CUDA array
+ *
+ * Returns the layout properties of a sparse CUDA array in \p sparseProperties.
+ * If the CUDA array is not allocated with flag ::cudaArraySparse
+ * ::cudaErrorInvalidValue will be returned.
+ *
+ * If the returned value in ::cudaArraySparseProperties::flags contains ::cudaArraySparsePropertiesSingleMipTail,
+ * then ::cudaArraySparseProperties::miptailSize represents the total size of the array. Otherwise, it will be zero.
+ * Also, the returned value in ::cudaArraySparseProperties::miptailFirstLevel is always zero.
+ * Note that the \p array must have been allocated using ::cudaMallocArray or ::cudaMalloc3DArray. For CUDA arrays obtained
+ * using ::cudaGetMipmappedArrayLevel, ::cudaErrorInvalidValue will be returned. Instead, ::cudaMipmappedArrayGetSparseProperties
+ * must be used to obtain the sparse properties of the entire CUDA mipmapped array to which \p array belongs.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \param[out] sparseProperties - Pointer to return the ::cudaArraySparseProperties
+ * \param[in] array             - The CUDA array to get the sparse properties of 
+ *
+ * \sa
+ * ::cudaMipmappedArrayGetSparseProperties,
+ * ::cuMemMapArrayAsync
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaArrayGetSparseProperties(struct cudaArraySparseProperties *sparseProperties, cudaArray_t array);
+#endif
+
+/**
+ * \brief Returns the layout properties of a sparse CUDA mipmapped array
+ *
+ * Returns the sparse array layout properties in \p sparseProperties.
+ * If the CUDA mipmapped array is not allocated with flag ::cudaArraySparse
+ * ::cudaErrorInvalidValue will be returned.
+ *
+ * For non-layered CUDA mipmapped arrays, ::cudaArraySparseProperties::miptailSize returns the
+ * size of the mip tail region. The mip tail region includes all mip levels whose width, height or depth
+ * is less than that of the tile.
+ * For layered CUDA mipmapped arrays, if ::cudaArraySparseProperties::flags contains ::cudaArraySparsePropertiesSingleMipTail,
+ * then ::cudaArraySparseProperties::miptailSize specifies the size of the mip tail of all layers combined.
+ * Otherwise, ::cudaArraySparseProperties::miptailSize specifies mip tail size per layer.
+ * The returned value of ::cudaArraySparseProperties::miptailFirstLevel is valid only if ::cudaArraySparseProperties::miptailSize is non-zero.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \param[out] sparseProperties - Pointer to return ::cudaArraySparseProperties
+ * \param[in] mipmap            - The CUDA mipmapped array to get the sparse properties of
+ *
+ * \sa
+ * ::cudaArrayGetSparseProperties,
+ * ::cuMemMapArrayAsync
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaMipmappedArrayGetSparseProperties(struct cudaArraySparseProperties *sparseProperties, cudaMipmappedArray_t mipmap);
+#endif
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src to the
+ * memory area pointed to by \p dst, where \p kind specifies the direction
+ * of the copy, and must be one of ::cudaMemcpyHostToHost,
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing. Calling
+ * ::cudaMemcpy() with dst and src pointers that do not match the direction of
+ * the copy results in undefined behavior.
+ *
+ * \param dst   - Destination memory address
+ * \param src   - Source memory address
+ * \param count - Size in bytes to copy
+ * \param kind  - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \note_sync
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyDtoH,
+ * ::cuMemcpyHtoD,
+ * ::cuMemcpyDtoD,
+ * ::cuMemcpy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);
+
+/**
+ * \brief Copies memory between two devices
+ *
+ * Copies memory from one device to memory on another device.  \p dst is the 
+ * base device pointer of the destination memory and \p dstDevice is the 
+ * destination device.  \p src is the base device pointer of the source memory 
+ * and \p srcDevice is the source device.  \p count specifies the number of bytes 
+ * to copy.
+ *
+ * Note that this function is asynchronous with respect to the host, but 
+ * serialized with respect to all pending and future asynchronous work in the
+ * current device, \p srcDevice, and \p dstDevice (use ::cudaMemcpyPeerAsync 
+ * to avoid this synchronization).
+ *
+ * \param dst       - Destination device pointer
+ * \param dstDevice - Destination device
+ * \param src       - Source device pointer
+ * \param srcDevice - Source device
+ * \param count     - Size of memory copy in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync,
+ * ::cudaMemcpy3DPeerAsync,
+ * ::cuMemcpyPeer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeer(void *dst, int dstDevice, const void *src, int srcDevice, size_t count);
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies a matrix (\p height rows of \p width bytes each) from the memory
+ * area pointed to by \p src to the memory area pointed to by \p dst, where
+ * \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing. \p dpitch and
+ * \p spitch are the widths in memory in bytes of the 2D arrays pointed to by
+ * \p dst and \p src, including any padding added to the end of each row. The
+ * memory areas may not overlap. \p width must not exceed either \p dpitch or
+ * \p spitch. Calling ::cudaMemcpy2D() with \p dst and \p src pointers that do
+ * not match the direction of the copy results in undefined behavior.
+ * ::cudaMemcpy2D() returns an error if \p dpitch or \p spitch exceeds
+ * the maximum allowed.
+ *
+ * \param dst    - Destination memory address
+ * \param dpitch - Pitch of destination memory
+ * \param src    - Source memory address
+ * \param spitch - Pitch of source memory
+ * \param width  - Width of matrix transfer (columns in bytes)
+ * \param height - Height of matrix transfer (rows)
+ * \param kind   - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2D,
+ * ::cuMemcpy2DUnaligned
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies a matrix (\p height rows of \p width bytes each) from the memory
+ * area pointed to by \p src to the CUDA array \p dst starting at
+ * \p hOffset rows and \p wOffset bytes from the upper left corner,
+ * where \p kind specifies the direction of the copy, and must be one
+ * of ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * \p spitch is the width in memory in bytes of the 2D array pointed to by
+ * \p src, including any padding added to the end of each row. \p wOffset +
+ * \p width must not exceed the width of the CUDA array \p dst. \p width must
+ * not exceed \p spitch. ::cudaMemcpy2DToArray() returns an error if \p spitch
+ * exceeds the maximum allowed.
+ *
+ * \param dst     - Destination memory address
+ * \param wOffset - Destination starting X offset (columns in bytes)
+ * \param hOffset - Destination starting Y offset (rows)
+ * \param src     - Source memory address
+ * \param spitch  - Pitch of source memory
+ * \param width   - Width of matrix transfer (columns in bytes)
+ * \param height  - Height of matrix transfer (rows)
+ * \param kind    - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2D,
+ * ::cuMemcpy2DUnaligned
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies a matrix (\p height rows of \p width bytes each) from the CUDA
+ * array \p src starting at \p hOffset rows and \p wOffset bytes from the
+ * upper left corner to the memory area pointed to by \p dst, where
+ * \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing. \p dpitch is the
+ * width in memory in bytes of the 2D array pointed to by \p dst, including any
+ * padding added to the end of each row. \p wOffset + \p width must not exceed
+ * the width of the CUDA array \p src. \p width must not exceed \p dpitch.
+ * ::cudaMemcpy2DFromArray() returns an error if \p dpitch exceeds the maximum
+ * allowed.
+ *
+ * \param dst     - Destination memory address
+ * \param dpitch  - Pitch of destination memory
+ * \param src     - Source memory address
+ * \param wOffset - Source starting X offset (columns in bytes)
+ * \param hOffset - Source starting Y offset (rows)
+ * \param width   - Width of matrix transfer (columns in bytes)
+ * \param height  - Height of matrix transfer (rows)
+ * \param kind    - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2D,
+ * ::cuMemcpy2DUnaligned
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind);
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies a matrix (\p height rows of \p width bytes each) from the CUDA
+ * array \p src starting at \p hOffsetSrc rows and \p wOffsetSrc bytes from the
+ * upper left corner to the CUDA array \p dst starting at \p hOffsetDst rows
+ * and \p wOffsetDst bytes from the upper left corner, where \p kind
+ * specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * \p wOffsetDst + \p width must not exceed the width of the CUDA array \p dst.
+ * \p wOffsetSrc + \p width must not exceed the width of the CUDA array \p src.
+ *
+ * \param dst        - Destination memory address
+ * \param wOffsetDst - Destination starting X offset (columns in bytes)
+ * \param hOffsetDst - Destination starting Y offset (rows)
+ * \param src        - Source memory address
+ * \param wOffsetSrc - Source starting X offset (columns in bytes)
+ * \param hOffsetSrc - Source starting Y offset (rows)
+ * \param width      - Width of matrix transfer (columns in bytes)
+ * \param height     - Height of matrix transfer (rows)
+ * \param kind       - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2D,
+ * ::cuMemcpy2DUnaligned
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));
+
+/**
+ * \brief Copies data to the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src
+ * to the memory area located \p offset bytes from the start of symbol
+ * \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of
+ * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault
+ * is only allowed on systems that support unified virtual addressing.
+ *
+ * \param symbol - Device symbol address
+ * \param src    - Source memory address
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_sync
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray,  ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy,
+ * ::cuMemcpyHtoD,
+ * ::cuMemcpyDtoD
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice));
+
+
+/**
+ * \brief Copies data from the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area located \p offset bytes
+ * from the start of symbol \p symbol to the memory area pointed to by \p dst.
+ * The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of
+ * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault
+ * is only allowed on systems that support unified virtual addressing.
+ *
+ * \param dst    - Destination memory address
+ * \param symbol - Device symbol address
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_sync
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy,
+ * ::cuMemcpyDtoH,
+ * ::cuMemcpyDtoD
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const void *symbol, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost));
+
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src to the
+ * memory area pointed to by \p dst, where \p kind specifies the
+ * direction of the copy, and must be one of ::cudaMemcpyHostToHost,
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * 
+ * The memory areas may not overlap. Calling ::cudaMemcpyAsync() with \p dst and
+ * \p src pointers that do not match the direction of the copy results in
+ * undefined behavior.
+ *
+ * ::cudaMemcpyAsync() is asynchronous with respect to the host, so the call
+ * may return before the copy is complete. The copy can optionally be
+ * associated to a stream by passing a non-zero \p stream argument. If \p kind
+ * is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and the \p stream is
+ * non-zero, the copy may overlap with operations in other streams.
+ *
+ * The device version of this function only handles device to device copies and
+ * cannot be given local or shared pointers.
+ *
+ * \param dst    - Destination memory address
+ * \param src    - Source memory address
+ * \param count  - Size in bytes to copy
+ * \param kind   - Type of transfer
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyAsync,
+ * ::cuMemcpyDtoHAsync,
+ * ::cuMemcpyHtoDAsync,
+ * ::cuMemcpyDtoDAsync
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies memory between two devices asynchronously.
+ *
+ * Copies memory from one device to memory on another device.  \p dst is the 
+ * base device pointer of the destination memory and \p dstDevice is the 
+ * destination device.  \p src is the base device pointer of the source memory 
+ * and \p srcDevice is the source device.  \p count specifies the number of bytes 
+ * to copy.
+ *
+ * Note that this function is asynchronous with respect to the host and all work
+ * on other devices.
+ *
+ * \param dst       - Destination device pointer
+ * \param dstDevice - Destination device
+ * \param src       - Source device pointer
+ * \param srcDevice - Source device
+ * \param count     - Size of memory copy in bytes
+ * \param stream    - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync,
+ * ::cudaMemcpy3DPeerAsync,
+ * ::cuMemcpyPeerAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice, size_t count, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Performs a batch of memory copies asynchronously.
+ *
+ * Performs a batch of memory copies. The batch as a whole executes in stream order but copies within a
+ * batch are not guaranteed to execute in any specific order. This API only supports pointer-to-pointer copies.
+ * For copies involving CUDA arrays, please see ::cudaMemcpy3DBatchAsync.
+ *
+ * Performs memory copies from source buffers specified in \p srcs to destination buffers specified in \p dsts.
+ * The size of each copy is specified in \p sizes. All three arrays must be of the same length as specified
+ * by \p count. Since there are no ordering guarantees for copies within a batch, specifying any dependent copies
+ * within a batch will result in undefined behavior.
+ *
+ * Every copy in the batch has to be associated with a set of attributes specified in the \p attrs array.
+ * Each entry in this array can apply to more than one copy. This can be done by specifying in the \p attrsIdxs array,
+ * the index of the first copy that the corresponding entry in the \p attrs array applies to. Both \p attrs and
+ * \p attrsIdxs must be of the same length as specified by \p numAttrs. For example, if a batch has 10 copies listed
+ * in dst/src/sizes, the first 6 of which have one set of attributes and the remaining 4 another, then \p numAttrs
+ * will be 2, \p attrsIdxs will be {0, 6} and \p attrs will contain the two sets of attributes. Note that the first entry
+ * in \p attrsIdxs must always be 0. Also, each entry must be greater than the previous entry and the last entry should be
+ * less than \p count. Furthermore, \p numAttrs must be less than or equal to \p count.
+ *
+ * The ::cudaMemcpyAttributes::srcAccessOrder indicates the source access ordering to be observed for copies associated
+ * with the attribute. If the source access order is set to ::cudaMemcpySrcAccessOrderStream, then the source will
+ * be accessed in stream order. If the source access order is set to ::cudaMemcpySrcAccessOrderDuringApiCall then
+ * it indicates that access to the source pointer can be out of stream order and all accesses must be complete before the
+ * API call returns. This flag is suited for ephemeral sources (ex., stack variables) when it's known that no prior
+ * operations in the stream can be accessing the memory and also that the lifetime of the memory is limited to the scope
+ * that the source variable was declared in. Specifying this flag allows the driver to optimize the copy and removes the
+ * need for the user to synchronize the stream after the API call. If the source access order is set to
+ * ::cudaMemcpySrcAccessOrderAny then it indicates that access to the source pointer can be out of stream order and the
+ * accesses can happen even after the API call returns. This flag is suited for host pointers allocated
+ * outside CUDA (ex., via malloc) when it's known that no prior operations in the stream can be accessing the memory.
+ * Specifying this flag allows the driver to optimize the copy on certain platforms. Each memcpy operation in the batch
+ * must have a valid ::cudaMemcpyAttributes corresponding to it including the appropriate srcAccessOrder setting,
+ * otherwise the API will return ::cudaErrorInvalidValue.
+ *
+ * The ::cudaMemcpyAttributes::srcLocHint and ::cudaMemcpyAttributes::dstLocHint allow applications to specify hint locations
+ * for operands of a copy when the operand doesn't have a fixed location. That is, these hints are
+ * only applicable for managed memory pointers on devices where ::cudaDevAttrConcurrentManagedAccess is true or
+ * system-allocated pageable memory on devices where ::cudaDevAttrPageableMemoryAccess is true.
+ * For other cases, these hints are ignored.
+ *
+ * The ::cudaMemcpyAttributes::flags field can be used to specify certain flags for copies. Setting the
+ * ::cudaMemcpyFlagPreferOverlapWithCompute flag indicates that the associated copies should preferably overlap with
+ * any compute work. Note that this flag is a hint and can be ignored depending on the platform and other parameters of the copy.
+ *
+ * If any error is encountered while parsing the batch, the index within the batch where the error was encountered
+ * will be returned in \p failIdx. 
+ *
+ * \param dsts          - Array of destination pointers.
+ * \param srcs          - Array of memcpy source pointers.
+ * \param sizes         - Array of sizes for memcpy operations.
+ * \param count         - Size of \p dsts, \p srcs and \p sizes arrays
+ * \param attrs         - Array of memcpy attributes. 
+ * \param attrsIdxs     - Array of indices to specify which copies each entry in the \p attrs array applies to.
+ *                        The attributes specified in attrs[k] will be applied to copies starting from attrsIdxs[k]
+ *                        through attrsIdxs[k+1] - 1. Also attrs[numAttrs-1] will apply to copies starting from
+ *                        attrsIdxs[numAttrs-1] through count - 1.
+ * \param numAttrs      - Size of \p attrs and \p attrsIdxs arrays.
+ * \param failIdx       - Pointer to a location to return the index of the copy where a failure was encountered.
+ *                        The value will be SIZE_MAX if the error doesn't pertain to any specific copy.
+ * \param stream        - The stream to enqueue the operations in. Must not be the legacy NULL stream.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_async
+ * \note_memcpy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyBatchAsync(void **dsts, void **srcs, size_t *sizes, size_t count, struct cudaMemcpyAttributes *attrs, size_t *attrsIdxs, size_t numAttrs, size_t *failIdx, cudaStream_t stream);
+
+/**
+ * \brief Performs a batch of 3D memory copies asynchronously.
+ *
+ * Performs a batch of memory copies. The batch as a whole executes in stream order but copies within a
+ * batch are not guaranteed to execute in any specific order. Note that this means specifying any dependent
+ * copies within a batch will result in undefined behavior.
+ *
+ * Performs memory copies as specified in the \p opList array. The length of this array is specified in \p numOps.
+ * Each entry in this array describes a copy operation. This includes among other things, the source and destination
+ * operands for the copy as specified in ::cudaMemcpy3DBatchOp::src and ::cudaMemcpy3DBatchOp::dst respectively.
+ * The source and destination operands of a copy can either be a pointer or a CUDA array. The width, height and depth
+ * of a copy is specified in ::cudaMemcpy3DBatchOp::extent. The width, height and depth of a copy are specified in
+ * elements and must not be zero. For pointer-to-pointer copies, the element size is considered to be 1. For pointer
+ * to CUDA array or vice versa copies, the element size is determined by the CUDA array. For CUDA array to CUDA array copies,
+ * the element size of the two CUDA arrays must match.
+ *
+ * For a given operand, if ::cudaMemcpy3DOperand::type is specified as ::cudaMemcpyOperandTypePointer, then
+ * ::cudaMemcpy3DOperand::op::ptr will be used. The ::cudaMemcpy3DOperand::op::ptr::ptr field must contain the pointer where
+ * the copy should begin. The ::cudaMemcpy3DOperand::op::ptr::rowLength field specifies the length of each row in elements and
+ * must either be zero or be greater than or equal to the width of the copy specified in ::cudaMemcpy3DBatchOp::extent::width.
+ * The ::cudaMemcpy3DOperand::op::ptr::layerHeight field specifies the height of each layer and must either be
+ * zero or be greater than or equal to the height of the copy specified in ::cudaMemcpy3DBatchOp::extent::height.
+ * When either of these values is zero, that aspect of the operand is considered to be tightly packed according to the copy extent.
+ * For managed memory pointers on devices where ::cudaDevAttrConcurrentManagedAccess is true or system-allocated pageable memory
+ * on devices where ::cudaDevAttrPageableMemoryAccess is true, the ::cudaMemcpy3DOperand::op::ptr::locHint field can be used to hint
+ * the location of the operand.
+ *
+ * If an operand's type is specified as ::cudaMemcpyOperandTypeArray, then ::cudaMemcpy3DOperand::op::array will be used.
+ * The ::cudaMemcpy3DOperand::op::array::array field specifies the CUDA array and ::cudaMemcpy3DOperand::op::array::offset specifies
+ * the 3D offset into that array where the copy begins.
+ *
+ * The ::cudaMemcpyAttributes::srcAccessOrder indicates the source access ordering to be observed for copies associated
+ * with the attribute. If the source access order is set to ::cudaMemcpySrcAccessOrderStream, then the source will
+ * be accessed in stream order. If the source access order is set to ::cudaMemcpySrcAccessOrderDuringApiCall then
+ * it indicates that access to the source pointer can be out of stream order and all accesses must be complete before the
+ * API call returns. This flag is suited for ephemeral sources (ex., stack variables) when it's known that no prior
+ * operations in the stream can be accessing the memory and also that the lifetime of the memory is limited to the scope
+ * that the source variable was declared in. Specifying this flag allows the driver to optimize the copy and removes the
+ * need for the user to synchronize the stream after the API call. If the source access order is set to
+ * ::cudaMemcpySrcAccessOrderAny then it indicates that access to the source pointer can be out of stream order and the
+ * accesses can happen even after the API call returns. This flag is suited for host pointers allocated
+ * outside CUDA (ex., via malloc) when it's known that no prior operations in the stream can be accessing the memory.
+ * Specifying this flag allows the driver to optimize the copy on certain platforms. Each memcpy operation in \p opList
+ * must have a valid srcAccessOrder setting, otherwise this API will return ::cudaErrorInvalidValue.
+ *
+ * The ::cudaMemcpyAttributes::flags field can be used to specify certain flags for copies. Setting the
+ * ::cudaMemcpyFlagPreferOverlapWithCompute flag indicates that the associated copies should preferably overlap with
+ * any compute work. Note that this flag is a hint and can be ignored depending on the platform and other parameters of the copy.
+ *
+ * If any error is encountered while parsing the batch, the index within the batch where the error was encountered
+ * will be returned in \p failIdx. 
+ *
+ * \param numOps     - Total number of memcpy operations. 
+ * \param opList     - Array of size \p numOps containing the actual memcpy operations. 
+ * \param failIdx    - Pointer to a location to return the index of the copy where a failure was encountered.
+ *                     The value will be SIZE_MAX if the error doesn't pertain to any specific copy.
+ * \param flags      - Flags for future use, must be zero now.
+ * \param stream     - The stream to enqueue the operations in. Must not be the default NULL stream.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_async
+ * \note_memcpy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DBatchAsync(size_t numOps, struct cudaMemcpy3DBatchOp *opList, size_t *failIdx, unsigned long long flags, cudaStream_t stream);
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies a matrix (\p height rows of \p width bytes each) from the memory
+ * area pointed to by \p src to the memory area pointed to by \p dst, where
+ * \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * \p dpitch and \p spitch are the widths in memory in bytes of the 2D arrays
+ * pointed to by \p dst and \p src, including any padding added to the end of
+ * each row. The memory areas may not overlap. \p width must not exceed either
+ * \p dpitch or \p spitch.
+ *
+ * Calling ::cudaMemcpy2DAsync() with \p dst and \p src pointers that do not
+ * match the direction of the copy results in undefined behavior.
+ * ::cudaMemcpy2DAsync() returns an error if \p dpitch or \p spitch is greater
+ * than the maximum allowed.
+ *
+ * ::cudaMemcpy2DAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally
+ * be associated to a stream by passing a non-zero \p stream argument. If
+ * \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and
+ * \p stream is non-zero, the copy may overlap with operations in other
+ * streams.
+ *
+ * The device version of this function only handles device to device copies and
+ * cannot be given local or shared pointers.
+ *
+ * \param dst    - Destination memory address
+ * \param dpitch - Pitch of destination memory
+ * \param src    - Source memory address
+ * \param spitch - Pitch of source memory
+ * \param width  - Width of matrix transfer (columns in bytes)
+ * \param height - Height of matrix transfer (rows)
+ * \param kind   - Type of transfer
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2DAsync
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies a matrix (\p height rows of \p width bytes each) from the memory
+ * area pointed to by \p src to the CUDA array \p dst starting at \p hOffset
+ * rows and \p wOffset bytes from the upper left corner, where \p kind specifies
+ * the direction of the copy, and must be one of ::cudaMemcpyHostToHost,
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * \p spitch is the width in memory in bytes of the 2D array pointed to by
+ * \p src, including any padding added to the end of each row. \p wOffset +
+ * \p width must not exceed the width of the CUDA array \p dst. \p width must
+ * not exceed \p spitch. ::cudaMemcpy2DToArrayAsync() returns an error if
+ * \p spitch exceeds the maximum allowed.
+ *
+ * ::cudaMemcpy2DToArrayAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally
+ * be associated to a stream by passing a non-zero \p stream argument. If
+ * \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and
+ * \p stream is non-zero, the copy may overlap with operations in other
+ * streams.
+ *
+ * \param dst     - Destination memory address
+ * \param wOffset - Destination starting X offset (columns in bytes)
+ * \param hOffset - Destination starting Y offset (rows)
+ * \param src     - Source memory address
+ * \param spitch  - Pitch of source memory
+ * \param width   - Width of matrix transfer (columns in bytes)
+ * \param height  - Height of matrix transfer (rows)
+ * \param kind    - Type of transfer
+ * \param stream  - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync,
+ * ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2DAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies a matrix (\p height rows of \p width bytes each) from the CUDA
+ * array \p src starting at \p hOffset rows and \p wOffset bytes from the
+ * upper left corner to the memory area pointed to by \p dst,
+ * where \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * \p dpitch is the width in memory in bytes of the 2D
+ * array pointed to by \p dst, including any padding added to the end of each
+ * row. \p wOffset + \p width must not exceed the width of the CUDA array
+ * \p src. \p width must not exceed \p dpitch. ::cudaMemcpy2DFromArrayAsync()
+ * returns an error if \p dpitch exceeds the maximum allowed.
+ *
+ * ::cudaMemcpy2DFromArrayAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally be
+ * associated to a stream by passing a non-zero \p stream argument. If \p kind
+ * is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream is
+ * non-zero, the copy may overlap with operations in other streams.
+ *
+ * \param dst     - Destination memory address
+ * \param dpitch  - Pitch of destination memory
+ * \param src     - Source memory address
+ * \param wOffset - Source starting X offset (columns in bytes)
+ * \param hOffset - Source starting Y offset (rows)
+ * \param width   - Width of matrix transfer (columns in bytes)
+ * \param height  - Height of matrix transfer (rows)
+ * \param kind    - Type of transfer
+ * \param stream  - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync,
+ * ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2DAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies data to the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src
+ * to the memory area located \p offset bytes from the start of symbol
+ * \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer
+ * is inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * ::cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally
+ * be associated to a stream by passing a non-zero \p stream argument. If
+ * \p kind is ::cudaMemcpyHostToDevice and \p stream is non-zero, the copy
+ * may overlap with operations in other streams.
+ *
+ * \param symbol - Device symbol address
+ * \param src    - Source memory address
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyAsync,
+ * ::cuMemcpyHtoDAsync,
+ * ::cuMemcpyDtoDAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies data from the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area located \p offset bytes
+ * from the start of symbol \p symbol to the memory area pointed to by \p dst.
+ * The memory areas may not overlap. \p symbol is a variable that resides in
+ * global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer
+ * is inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * ::cudaMemcpyFromSymbolAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally be
+ * associated to a stream by passing a non-zero \p stream argument. If \p kind
+ * is ::cudaMemcpyDeviceToHost and \p stream is non-zero, the copy may overlap
+ * with operations in other streams.
+ *
+ * \param dst    - Destination memory address
+ * \param symbol - Device symbol address
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync,
+ * ::cuMemcpyAsync,
+ * ::cuMemcpyDtoHAsync,
+ * ::cuMemcpyDtoDAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Fills the first \p count bytes of the memory area pointed to by \p devPtr
+ * with the constant byte value \p value.
+ *
+ * Note that this function is asynchronous with respect to the host unless
+ * \p devPtr refers to pinned host memory.
+ *
+ * \param devPtr - Pointer to device memory
+ * \param value  - Value to set for each byte of specified memory
+ * \param count  - Size in bytes to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_memset
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cuMemsetD8,
+ * ::cuMemsetD16,
+ * ::cuMemsetD32
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, size_t count);
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Sets to the specified value \p value a matrix (\p height rows of \p width
+ * bytes each) pointed to by \p devPtr. \p pitch is the width in bytes of the
+ * 2D array pointed to by \p devPtr, including any padding added to the end
+ * of each row. This function performs fastest when the pitch is one that has
+ * been passed back by ::cudaMallocPitch().
+ *
+ * Note that this function is asynchronous with respect to the host unless
+ * \p devPtr refers to pinned host memory.
+ *
+ * \param devPtr - Pointer to 2D device memory
+ * \param pitch  - Pitch in bytes of 2D device memory (Unused if \p height is 1)
+ * \param value  - Value to set for each byte of specified memory
+ * \param width  - Width of matrix set (columns in bytes)
+ * \param height - Height of matrix set (rows)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_memset
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemset, ::cudaMemset3D, ::cudaMemsetAsync,
+ * ::cudaMemset2DAsync, ::cudaMemset3DAsync,
+ * ::cuMemsetD2D8,
+ * ::cuMemsetD2D16,
+ * ::cuMemsetD2D32
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch, int value, size_t width, size_t height);
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Initializes each element of a 3D array to the specified value \p value.
+ * The object to initialize is defined by \p pitchedDevPtr. The \p pitch field
+ * of \p pitchedDevPtr is the width in memory in bytes of the 3D array pointed
+ * to by \p pitchedDevPtr, including any padding added to the end of each row.
+ * The \p xsize field specifies the logical width of each row in bytes, while
+ * the \p ysize field specifies the height of each 2D slice in rows.
+ * The \p pitch field of \p pitchedDevPtr is ignored when \p height and \p depth 
+ * are both equal to 1. 
+ *
+ * The extents of the initialized region are specified as a \p width in bytes,
+ * a \p height in rows, and a \p depth in slices.
+ *
+ * Extents with \p width greater than or equal to the \p xsize of
+ * \p pitchedDevPtr may perform significantly faster than extents narrower
+ * than the \p xsize. Secondarily, extents with \p height equal to the
+ * \p ysize of \p pitchedDevPtr will perform faster than when the \p height is
+ * shorter than the \p ysize.
+ *
+ * This function performs fastest when the \p pitchedDevPtr has been allocated
+ * by ::cudaMalloc3D().
+ *
+ * Note that this function is asynchronous with respect to the host unless
+ * \p pitchedDevPtr refers to pinned host memory.
+ *
+ * \param pitchedDevPtr - Pointer to pitched device memory
+ * \param value         - Value to set for each byte of specified memory
+ * \param extent        - Size parameters for where to set device memory (\p width field in bytes)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_memset
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemset, ::cudaMemset2D,
+ * ::cudaMemsetAsync, ::cudaMemset2DAsync, ::cudaMemset3DAsync,
+ * ::cudaMalloc3D, ::make_cudaPitchedPtr,
+ * ::make_cudaExtent
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent);
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Fills the first \p count bytes of the memory area pointed to by \p devPtr
+ * with the constant byte value \p value.
+ *
+ * ::cudaMemsetAsync() is asynchronous with respect to the host, so
+ * the call may return before the memset is complete. The operation can optionally
+ * be associated to a stream by passing a non-zero \p stream argument.
+ * If \p stream is non-zero, the operation may overlap with operations in other streams.
+ *
+ * The device version of this function only handles device to device copies and
+ * cannot be given local or shared pointers.
+ *
+ * \param devPtr - Pointer to device memory
+ * \param value  - Value to set for each byte of specified memory
+ * \param count  - Size in bytes to set
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D,
+ * ::cudaMemset2DAsync, ::cudaMemset3DAsync,
+ * ::cuMemsetD8Async,
+ * ::cuMemsetD16Async,
+ * ::cuMemsetD32Async
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Sets to the specified value \p value a matrix (\p height rows of \p width
+ * bytes each) pointed to by \p devPtr. \p pitch is the width in bytes of the
+ * 2D array pointed to by \p devPtr, including any padding added to the end
+ * of each row. This function performs fastest when the pitch is one that has
+ * been passed back by ::cudaMallocPitch().
+ *
+ * ::cudaMemset2DAsync() is asynchronous with respect to the host, so
+ * the call may return before the memset is complete. The operation can optionally
+ * be associated to a stream by passing a non-zero \p stream argument.
+ * If \p stream is non-zero, the operation may overlap with operations in other streams.
+ *
+ * The device version of this function only handles device to device copies and
+ * cannot be given local or shared pointers.
+ *
+ * \param devPtr - Pointer to 2D device memory
+ * \param pitch  - Pitch in bytes of 2D device memory (Unused if \p height is 1)
+ * \param value  - Value to set for each byte of specified memory
+ * \param width  - Width of matrix set (columns in bytes)
+ * \param height - Height of matrix set (rows)
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D,
+ * ::cudaMemsetAsync, ::cudaMemset3DAsync,
+ * ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16Async,
+ * ::cuMemsetD2D32Async
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Initializes each element of a 3D array to the specified value \p value.
+ * The object to initialize is defined by \p pitchedDevPtr. The \p pitch field
+ * of \p pitchedDevPtr is the width in memory in bytes of the 3D array pointed
+ * to by \p pitchedDevPtr, including any padding added to the end of each row.
+ * The \p xsize field specifies the logical width of each row in bytes, while
+ * the \p ysize field specifies the height of each 2D slice in rows.
+ * The \p pitch field of \p pitchedDevPtr is ignored when \p height and \p depth 
+ * are both equal to 1. 
+ *
+ * The extents of the initialized region are specified as a \p width in bytes,
+ * a \p height in rows, and a \p depth in slices.
+ *
+ * Extents with \p width greater than or equal to the \p xsize of
+ * \p pitchedDevPtr may perform significantly faster than extents narrower
+ * than the \p xsize. Secondarily, extents with \p height equal to the
+ * \p ysize of \p pitchedDevPtr will perform faster than when the \p height is
+ * shorter than the \p ysize.
+ *
+ * This function performs fastest when the \p pitchedDevPtr has been allocated
+ * by ::cudaMalloc3D().
+ *
+ * ::cudaMemset3DAsync() is asynchronous with respect to the host, so
+ * the call may return before the memset is complete. The operation can optionally
+ * be associated to a stream by passing a non-zero \p stream argument.
+ * If \p stream is non-zero, the operation may overlap with operations in other streams.
+ *
+ * The device version of this function only handles device to device copies and
+ * cannot be given local or shared pointers.
+ *
+ * \param pitchedDevPtr - Pointer to pitched device memory
+ * \param value         - Value to set for each byte of specified memory
+ * \param extent        - Size parameters for where to set device memory (\p width field in bytes)
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D,
+ * ::cudaMemsetAsync, ::cudaMemset2DAsync,
+ * ::cudaMalloc3D, ::make_cudaPitchedPtr,
+ * ::make_cudaExtent
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Finds the address associated with a CUDA symbol
+ *
+ * Returns in \p *devPtr the address of symbol \p symbol on the device.
+ * \p symbol is a variable that resides in global or constant memory space.
+ * If \p symbol cannot be found, or if \p symbol is not declared in the
+ * global or constant memory space, \p *devPtr is unchanged and the error
+ * ::cudaErrorInvalidSymbol is returned.
+ *
+ * \param devPtr - Return device pointer associated with symbol
+ * \param symbol - Device symbol address
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress (C++ API)",
+ * \ref ::cudaGetSymbolSize(size_t*, const void*) "cudaGetSymbolSize (C API)",
+ * ::cuModuleGetGlobal
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr, const void *symbol);
+
+/**
+ * \brief Finds the size of the object associated with a CUDA symbol
+ *
+ * Returns in \p *size the size of symbol \p symbol. \p symbol is a variable that
+ * resides in global or constant memory space. If \p symbol cannot be found, or
+ * if \p symbol is not declared in global or constant memory space, \p *size is
+ * unchanged and the error ::cudaErrorInvalidSymbol is returned.
+ *
+ * \param size   - Size of object associated with symbol
+ * \param symbol - Device symbol address
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaGetSymbolAddress(void**, const void*) "cudaGetSymbolAddress (C API)",
+ * \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API)",
+ * ::cuModuleGetGlobal
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size, const void *symbol);
+
+/**
+ * \brief Prefetches memory to the specified destination device
+ *
+ * Prefetches memory to the specified destination device.  \p devPtr is the 
+ * base device pointer of the memory to be prefetched and \p dstDevice is the 
+ * destination device. \p count specifies the number of bytes to copy. \p stream
+ * is the stream in which the operation is enqueued. The memory range must refer
+ * to managed memory allocated via ::cudaMallocManaged or declared via __managed__ variables,
+ * or it may also refer to system-allocated memory on systems with non-zero
+ * cudaDevAttrPageableMemoryAccess.
+ *
+ * Passing in cudaCpuDeviceId for \p dstDevice will prefetch the data to host memory. If
+ * \p dstDevice is a GPU, then the device attribute ::cudaDevAttrConcurrentManagedAccess
+ * must be non-zero. Additionally, \p stream must be associated with a device that has a
+ * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ *
+ * The start address and end address of the memory range will be rounded down and rounded up
+ * respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ * in the stream.
+ *
+ * If no physical memory has been allocated for this region, then this memory region
+ * will be populated and mapped on the destination device. If there's insufficient
+ * memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ * ::cudaMallocManaged allocations to host memory in order to make room. Device memory
+ * allocated using ::cudaMalloc or ::cudaMallocArray will not be evicted.
+ *
+ * By default, any mappings to the previous location of the migrated pages are removed and
+ * mappings for the new location are only setup on \p dstDevice. The exact behavior however
+ * also depends on the settings applied to this memory range via ::cudaMemAdvise as described
+ * below:
+ *
+ * If ::cudaMemAdviseSetReadMostly was set on any subset of this memory range,
+ * then that subset will create a read-only copy of the pages on \p dstDevice.
+ *
+ * If ::cudaMemAdviseSetPreferredLocation was called on any subset of this memory
+ * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the
+ * preferred location of any pages in the memory range.
+ *
+ * If ::cudaMemAdviseSetAccessedBy was called on any subset of this memory range,
+ * then mappings to those pages from all the appropriate processors are updated to
+ * refer to the new location if establishing such a mapping is possible. Otherwise,
+ * those mappings are cleared.
+ *
+ * Note that this API is not required for functionality and only serves to improve performance
+ * by allowing the application to migrate data to a suitable location before it is accessed.
+ * Memory accesses to this range are always coherent and are allowed even when the data is
+ * actively being migrated.
+ *
+ * Note that this function is asynchronous with respect to the host and all work
+ * on other devices.
+ *
+ * \param devPtr    - Pointer to be prefetched
+ * \param count     - Size in bytes
+ * \param dstDevice - Destination device to prefetch to
+ * \param stream    - Stream to enqueue prefetch operation
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync,
+ * ::cudaMemcpy3DPeerAsync, ::cudaMemAdvise, ::cudaMemAdvise_v2,
+ * ::cuMemPrefetchAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Prefetches memory to the specified destination location
+ *
+ * Prefetches memory to the specified destination location.  \p devPtr is the
+ * base device pointer of the memory to be prefetched and \p location specifies the
+ * destination location. \p count specifies the number of bytes to copy. \p stream
+ * is the stream in which the operation is enqueued. The memory range must refer
+ * to managed memory allocated via ::cudaMallocManaged or declared via __managed__ variables, 
+ * or it may also refer to system-allocated memory on systems with non-zero
+ * cudaDevAttrPageableMemoryAccess.
+ * 
+ * Specifying ::cudaMemLocationTypeDevice for ::cudaMemLocation::type will prefetch memory to GPU
+ * specified by device ordinal ::cudaMemLocation::id which must have non-zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess. Additionally, \p stream must be associated with a device
+ * that has a non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * Specifying ::cudaMemLocationTypeHost as ::cudaMemLocation::type will prefetch data to host memory.
+ * Applications can request prefetching memory to a specific host NUMA node by specifying
+ * ::cudaMemLocationTypeHostNuma for ::cudaMemLocation::type and a valid host NUMA node id in ::cudaMemLocation::id.
+ * Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ * ::cudaMemLocationTypeHostNumaCurrent for ::cudaMemLocation::type. Note that when ::cudaMemLocation::type is either
+ * ::cudaMemLocationTypeHost or ::cudaMemLocationTypeHostNumaCurrent, ::cudaMemLocation::id will be ignored.
+ *
+ * The start address and end address of the memory range will be rounded down and rounded up
+ * respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ * in the stream.
+ *
+ * If no physical memory has been allocated for this region, then this memory region
+ * will be populated and mapped on the destination device. If there's insufficient
+ * memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ * ::cudaMallocManaged allocations to host memory in order to make room. Device memory
+ * allocated using ::cudaMalloc or ::cudaMallocArray will not be evicted.
+ *
+ * By default, any mappings to the previous location of the migrated pages are removed and
+ * mappings for the new location are only setup on the destination location. The exact behavior however
+ * also depends on the settings applied to this memory range via ::cudaMemAdvise as described
+ * below:
+ *
+ * If ::cudaMemAdviseSetReadMostly was set on any subset of this memory range,
+ * then that subset will create a read-only copy of the pages on destination location.
+ * If however the destination location is a host NUMA node, then any pages of that subset
+ * that are already in another host NUMA node will be transferred to the destination.
+ *
+ * If ::cudaMemAdviseSetPreferredLocation was called on any subset of this memory
+ * range, then the pages will be migrated to \p location even if \p location is not the
+ * preferred location of any pages in the memory range.
+ *
+ * If ::cudaMemAdviseSetAccessedBy was called on any subset of this memory range,
+ * then mappings to those pages from all the appropriate processors are updated to
+ * refer to the new location if establishing such a mapping is possible. Otherwise,
+ * those mappings are cleared.
+ *
+ * Note that this API is not required for functionality and only serves to improve performance
+ * by allowing the application to migrate data to a suitable location before it is accessed.
+ * Memory accesses to this range are always coherent and are allowed even when the data is
+ * actively being migrated.
+ *
+ * Note that this function is asynchronous with respect to the host and all work
+ * on other devices.
+ *
+ * \param devPtr    - Pointer to be prefetched
+ * \param count     - Size in bytes
+ * \param location  - location to prefetch to
+ * \param flags     - flags for future use, must be zero now. 
+ * \param stream    - Stream to enqueue prefetch operation
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync,
+ * ::cudaMemcpy3DPeerAsync, ::cudaMemAdvise, ::cudaMemAdvise_v2,
+ * ::cuMemPrefetchAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPrefetchAsync_v2(const void *devPtr, size_t count, struct cudaMemLocation location, unsigned int flags, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Advise about the usage of a given memory range
+ *
+ * Advise the Unified Memory subsystem about the usage pattern for the memory range
+ * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ * range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ * advice is applied. The memory range must refer to managed memory allocated via ::cudaMallocManaged
+ * or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ * memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ * imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ * memory range results in an error being returned.
+ *
+ * The \p advice parameter can take the following values:
+ * - ::cudaMemAdviseSetReadMostly: This implies that the data is mostly going to be read
+ * from and only occasionally written to. Any read accesses from any processor to this region will create a
+ * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cudaMemPrefetchAsync
+ * is called on this region, it will create a read-only copy of the data on the destination processor.
+ * If any processor writes to this region, all copies of the corresponding page will be invalidated
+ * except for the one where the write occurred. The \p device argument is ignored for this advice.
+ * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ * that has a non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * Also, if a context is created on a device that does not have the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess set, then read-duplication will not occur until
+ * all such contexts are destroyed.
+ * If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ * have a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccess for a read-only
+ * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ * device attribute ::cudaDevAttrPageableMemoryAccessUsesHostPageTables, then setting this advice
+ * will not create a read-only copy when that device accesses this memory region.
+ *
+ * - ::cudaMemAdviseUnsetReadMostly: Undoes the effect of ::cudaMemAdviseSetReadMostly and also prevents the
+ * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ * copies of the data will be collapsed into a single copy. The location for the collapsed
+ * copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ * copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ *
+ * - ::cudaMemAdviseSetPreferredLocation: This advice sets the preferred location for the
+ * data to be the memory belonging to \p device. Passing in cudaCpuDeviceId for \p device sets the
+ * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the
+ * device attribute ::cudaDevAttrConcurrentManagedAccess. Setting the preferred location
+ * does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ * when a fault occurs on that memory region. If the data is already in its preferred location and the
+ * faulting processor can establish a mapping without requiring the data to be migrated, then
+ * data migration will be avoided. On the other hand, if the data is not in its preferred location
+ * or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ * it. It is important to note that setting the preferred location does not prevent data prefetching
+ * done using ::cudaMemPrefetchAsync.
+ * Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ * Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ * if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ * If ::cudaMemAdviseSetReadMostly is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice, unless read accesses from
+ * \p device will not result in a read-only copy being created on that device as outlined in description for
+ * the advice ::cudaMemAdviseSetReadMostly.
+ * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
+ * value for the device attribute ::cudaDevAttrPageableMemoryAccess.
+ *
+ * - ::cudaMemAdviseUnsetPreferredLocation: Undoes the effect of ::cudaMemAdviseSetPreferredLocation
+ * and changes the preferred location to none.
+ *
+ * - ::cudaMemAdviseSetAccessedBy: This advice implies that the data will be accessed by \p device.
+ * Passing in ::cudaCpuDeviceId for \p device will set the advice for the CPU. If \p device is a GPU, then
+ * the device attribute ::cudaDevAttrConcurrentManagedAccess must be non-zero.
+ * This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ * it causes the data to always be mapped in the specified processor's page tables, as long as the
+ * location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ * the mappings are updated accordingly.
+ * This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ * over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ * migration may be too high. But preventing faults can still help improve performance, and so having
+ * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ * ::cudaMemAdviseSetAccessedBy flag set for this data will now have its mapping updated to point to the
+ * page in host memory.
+ * If ::cudaMemAdviseSetReadMostly is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice. Additionally, if the
+ * preferred location of this memory region or any subset of it is also \p device, then the policies
+ * associated with ::cudaMemAdviseSetPreferredLocation will override the policies of this advice.
+ * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
+ * value for the device attribute ::cudaDevAttrPageableMemoryAccess. Additionally, if \p device has
+ * a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccessUsesHostPageTables,
+ * then this call has no effect.
+ *
+ * - ::cudaMemAdviseUnsetAccessedBy: Undoes the effect of ::cudaMemAdviseSetAccessedBy. Any mappings to
+ * the data from \p device may be removed at any time causing accesses to result in non-fatal page faults.
+ * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
+ * value for the device attribute ::cudaDevAttrPageableMemoryAccess. Additionally, if \p device has
+ * a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccessUsesHostPageTables,
+ * then this call has no effect.
+ *
+ * \param devPtr - Pointer to memory to set the advice for
+ * \param count  - Size in bytes of the memory range
+ * \param advice - Advice to be applied for the specified memory range
+ * \param device - Device to apply the advice for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync,
+ * ::cudaMemcpy3DPeerAsync, ::cudaMemPrefetchAsync,
+ * ::cuMemAdvise
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice, int device);
+
+/**
+ * \brief Advise about the usage of a given memory range
+ *
+ * Advise the Unified Memory subsystem about the usage pattern for the memory range
+ * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ * range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ * advice is applied. The memory range must refer to managed memory allocated via ::cudaMallocManaged
+ * or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ * memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ * imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ * memory range results in an error being returned.
+ *
+ * The \p advice parameter can take the following values:
+ * - ::cudaMemAdviseSetReadMostly: This implies that the data is mostly going to be read
+ * from and only occasionally written to. Any read accesses from any processor to this region will create a
+ * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cudaMemPrefetchAsync
+ * or ::cudaMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ * If the target location for ::cudaMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ * another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ * If any processor writes to this region, all copies of the corresponding page will be invalidated
+ * except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ * the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ * that has a non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * Also, if a context is created on a device that does not have the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess set, then read-duplication will not occur until
+ * all such contexts are destroyed.
+ * If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ * have a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccess for a read-only
+ * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ * device attribute ::cudaDevAttrPageableMemoryAccessUsesHostPageTables, then setting this advice
+ * will not create a read-only copy when that device accesses this memory region.
+ *
+ * - ::cudaMemAdviseUnsetReadMostly: Undoes the effect of ::cudaMemAdviseSetReadMostly and also prevents the
+ * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ * copies of the data will be collapsed into a single copy. The location for the collapsed
+ * copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ * copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ * Note: The \p location argument is ignored for this advice.
+ *
+ * - ::cudaMemAdviseSetPreferredLocation: This advice sets the preferred location for the
+ * data to be the memory belonging to \p location. When ::cudaMemLocation::type is ::cudaMemLocationTypeHost,
+ * ::cudaMemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ * to a specific host NUMA node, applications must set ::cudaMemLocation::type to ::cudaMemLocationTypeHostNuma and
+ * ::cudaMemLocation::id must specify the NUMA ID of the host NUMA node. If ::cudaMemLocation::type is set to ::cudaMemLocationTypeHostNumaCurrent,
+ * ::cudaMemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ * If ::cudaMemLocation::type is a ::cudaMemLocationTypeDevice, then ::cudaMemLocation::id must be a valid device ordinal
+ * and the device must have a non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ * when a fault occurs on that memory region. If the data is already in its preferred location and the
+ * faulting processor can establish a mapping without requiring the data to be migrated, then
+ * data migration will be avoided. On the other hand, if the data is not in its preferred location
+ * or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ * it. It is important to note that setting the preferred location does not prevent data prefetching
+ * done using ::cudaMemPrefetchAsync.
+ * Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ * Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ * if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ * If ::cudaMemAdviseSetReadMostly is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice, unless read accesses from
+ * \p location will not result in a read-only copy being created on that processor as outlined in the description for
+ * the advice ::cudaMemAdviseSetReadMostly.
+ * If the memory region refers to valid system-allocated pageable memory, and ::cudaMemLocation::type is ::cudaMemLocationTypeDevice
+ * then ::cudaMemLocation::id must be a valid device that has a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccess.
+ *
+ * - ::cudaMemAdviseUnsetPreferredLocation: Undoes the effect of ::cudaMemAdviseSetPreferredLocation
+ * and changes the preferred location to none. The \p location argument is ignored for this advice.
+ *
+ * - ::cudaMemAdviseSetAccessedBy: This advice implies that the data will be accessed by processor \p location.
+ * The ::cudaMemLocation::type must be either ::cudaMemLocationTypeDevice with ::cudaMemLocation::id representing a valid device
+ * ordinal or ::cudaMemLocationTypeHost and ::cudaMemLocation::id will be ignored. All other location types are invalid.
+ * If ::cudaMemLocation::id is a GPU, then the device attribute ::cudaDevAttrConcurrentManagedAccess must be non-zero.
+ * This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ * it causes the data to always be mapped in the specified processor's page tables, as long as the
+ * location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ * the mappings are updated accordingly.
+ * This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ * over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ * migration may be too high. But preventing faults can still help improve performance, and so having
+ * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ * ::cudaMemAdviseSetAccessedBy flag set for this data will now have its mapping updated to point to the
+ * page in host memory.
+ * If ::cudaMemAdviseSetReadMostly is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice. Additionally, if the
+ * preferred location of this memory region or any subset of it is also \p location, then the policies
+ * associated with ::cudaMemAdviseSetPreferredLocation will override the policies of this advice.
+ * If the memory region refers to valid system-allocated pageable memory, and ::cudaMemLocation::type is ::cudaMemLocationTypeDevice
+ * then device in ::cudaMemLocation::id must have a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccess.
+ * Additionally, if ::cudaMemLocation::id has a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccessUsesHostPageTables,
+ * then this call has no effect.
+ *
+ * - ::cudaMemAdviseUnsetAccessedBy: Undoes the effect of ::cudaMemAdviseSetAccessedBy. Any mappings to
+ * the data from \p location may be removed at any time causing accesses to result in non-fatal page faults.
+ * If the memory region refers to valid system-allocated pageable memory, and ::cudaMemLocation::type is ::cudaMemLocationTypeDevice
+ * then device in ::cudaMemLocation::id must have a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccess.
+ * Additionally, if ::cudaMemLocation::id has a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccessUsesHostPageTables,
+ * then this call has no effect.
+ *
+ * \param devPtr   - Pointer to memory to set the advice for
+ * \param count    - Size in bytes of the memory range
+ * \param advice   - Advice to be applied for the specified memory range
+ * \param location - location to apply the advice for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync,
+ * ::cudaMemcpy3DPeerAsync, ::cudaMemPrefetchAsync,
+ * ::cuMemAdvise, ::cuMemAdvise_v2
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemAdvise_v2(const void *devPtr, size_t count, enum cudaMemoryAdvise advice, struct cudaMemLocation location);
+
+/**
+* \brief Query an attribute of a given memory range
+*
+* Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The
+* memory range must refer to managed memory allocated via ::cudaMallocManaged or declared via
+* __managed__ variables.
+*
+* The \p attribute parameter can take the following values:
+* - ::cudaMemRangeAttributeReadMostly: If this attribute is specified, \p data will be interpreted
+* as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given
+* memory range have read-duplication enabled, or 0 otherwise.
+* - ::cudaMemRangeAttributePreferredLocation: If this attribute is specified, \p data will be
+* interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be a GPU device
+* id if all pages in the memory range have that GPU as their preferred location, or it will be cudaCpuDeviceId
+* if all pages in the memory range have the CPU as their preferred location, or it will be cudaInvalidDeviceId
+* if either all the pages don't have the same preferred location or some of the pages don't have a
+* preferred location at all. Note that the actual location of the pages in the memory range at the time of
+* the query may be different from the preferred location.
+* - ::cudaMemRangeAttributeAccessedBy: If this attribute is specified, \p data will be interpreted
+* as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned
+* will be a list of device ids that had ::cudaMemAdviseSetAccessedBy set for that entire memory range.
+* If any device does not have that advice set for the entire memory range, that device will not be included.
+* If \p data is larger than the number of devices that have that advice set for that memory range,
+* cudaInvalidDeviceId will be returned in all the extra space provided. For ex., if \p dataSize is 12
+* (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be
+* { 0, cudaInvalidDeviceId, cudaInvalidDeviceId }. If \p data is smaller than the number of devices that have
+* that advice set, then only as many devices will be returned as can fit in the array. There is no
+* guarantee on which specific devices will be returned, however.
+* - ::cudaMemRangeAttributeLastPrefetchLocation: If this attribute is specified, \p data will be
+* interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location
+* to which all pages in the memory range were prefetched explicitly via ::cudaMemPrefetchAsync. This will either be
+* a GPU id or cudaCpuDeviceId depending on whether the last location for prefetch was a GPU or the CPU
+* respectively. If any page in the memory range was never explicitly prefetched or if all pages were not
+* prefetched to the same location, cudaInvalidDeviceId will be returned. Note that this simply returns the
+* last location that the application requested to prefetch the memory range to. It gives no indication as to
+* whether the prefetch operation to that location has completed or even begun.
+ * - ::cudaMemRangeAttributePreferredLocationType: If this attribute is specified, \p data will be
+ * interpreted as a ::cudaMemLocationType, and \p dataSize must be sizeof(cudaMemLocationType). The ::cudaMemLocationType returned will be
+ * ::cudaMemLocationTypeDevice if all pages in the memory range have the same GPU as their preferred location, or ::cudaMemLocationType
+ * will be ::cudaMemLocationTypeHost if all pages in the memory range have the CPU as their preferred location, or it will be ::cudaMemLocationTypeHostNuma
+ * if all the pages in the memory range have the same host NUMA node ID as their preferred location or it will be ::cudaMemLocationTypeInvalid
+ * if either all the pages don't have the same preferred location or some of the pages don't have a preferred location at all.
+ * Note that the actual location type of the pages in the memory range at the time of the query may be different from the preferred location type.
+ *  - ::cudaMemRangeAttributePreferredLocationId: If this attribute is specified, \p data will be
+ * interpreted as a 32-bit integer, and \p dataSize must be 4. If the ::cudaMemRangeAttributePreferredLocationType query for the same address range
+ * returns ::cudaMemLocationTypeDevice, it will be a valid device ordinal or if it returns ::cudaMemLocationTypeHostNuma, it will be a valid host NUMA node ID
+ * or if it returns any other location type, the id should be ignored.
+ * - ::cudaMemRangeAttributeLastPrefetchLocationType: If this attribute is specified, \p data will be
+ * interpreted as a ::cudaMemLocationType, and \p dataSize must be sizeof(cudaMemLocationType). The result returned will be the last location type
+ * to which all pages in the memory range were prefetched explicitly via ::cudaMemPrefetchAsync. The ::cudaMemLocationType returned
+ * will be ::cudaMemLocationTypeDevice if the last prefetch location was the GPU or ::cudaMemLocationTypeHost if it was the CPU or ::cudaMemLocationTypeHostNuma if
+ * the last prefetch location was a specific host NUMA node. If any page in the memory range was never explicitly prefetched or if all pages were not
+ * prefetched to the same location, ::cudaMemLocationType will be ::cudaMemLocationTypeInvalid.
+ * Note that this simply returns the last location type that the application requested to prefetch the memory range to. It gives no indication as to
+ * whether the prefetch operation to that location has completed or even begun.
+ *  - ::cudaMemRangeAttributeLastPrefetchLocationId: If this attribute is specified, \p data will be
+ * interpreted as a 32-bit integer, and \p dataSize must be 4. If the ::cudaMemRangeAttributeLastPrefetchLocationType query for the same address range
+ * returns ::cudaMemLocationTypeDevice, it will be a valid device ordinal or if it returns ::cudaMemLocationTypeHostNuma, it will be a valid host NUMA node ID
+ * or if it returns any other location type, the id should be ignored.
+*
+* \param data      - A pointer to a memory location where the result
+*                    of the attribute query will be written to.
+* \param dataSize  - Size in bytes of \p data
+* \param attribute - The attribute to query
+* \param devPtr    - Start of the range to query
+* \param count     - Size of the range to query
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemRangeGetAttributes, ::cudaMemPrefetchAsync,
+ * ::cudaMemAdvise,
+ * ::cuMemRangeGetAttribute
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttribute(void *data, size_t dataSize, enum cudaMemRangeAttribute attribute, const void *devPtr, size_t count);
+
+/**
+ * \brief Query attributes of a given memory range.
+ *
+ * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The
+ * memory range must refer to managed memory allocated via ::cudaMallocManaged or declared via
+ * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes
+ * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries.
+ * The results of the query will be stored in \p data.
+ *
+ * The list of supported attributes is given below. Please refer to ::cudaMemRangeGetAttribute for
+ * attribute descriptions and restrictions.
+ *
+ * - ::cudaMemRangeAttributeReadMostly
+ * - ::cudaMemRangeAttributePreferredLocation
+ * - ::cudaMemRangeAttributeAccessedBy
+ * - ::cudaMemRangeAttributeLastPrefetchLocation
+ * - ::cudaMemRangeAttributePreferredLocationType
+ * - ::cudaMemRangeAttributePreferredLocationId
+ * - ::cudaMemRangeAttributeLastPrefetchLocationType
+ * - ::cudaMemRangeAttributeLastPrefetchLocationId
+ *
+ * \param data          - A two-dimensional array containing pointers to memory
+ *                        locations where the result of each attribute query will be written to.
+ * \param dataSizes     - Array containing the sizes of each result
+ * \param attributes    - An array of attributes to query
+ *                        (numAttributes and the number of attributes in this array should match)
+ * \param numAttributes - Number of attributes to query
+ * \param devPtr        - Start of the range to query
+ * \param count         - Size of the range to query
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemRangeGetAttribute, ::cudaMemAdvise,
+ * ::cudaMemPrefetchAsync,
+ * ::cuMemRangeGetAttributes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttributes(void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes, size_t numAttributes, const void *devPtr, size_t count);
+
+/** @} */ /* END CUDART_MEMORY */
+
+/**
+ * \defgroup CUDART_MEMORY_DEPRECATED Memory Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated memory management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes deprecated memory management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * Some functions have overloaded C++ API template versions documented separately in the
+ * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
+ *
+ * @{
+ */
+
+/**
+ * \brief Copies data between host and device
+ *
+ * \deprecated
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src to the
+ * CUDA array \p dst starting at \p hOffset rows and \p wOffset bytes from
+ * the upper left corner, where \p kind specifies the direction
+ * of the copy, and must be one of ::cudaMemcpyHostToHost,
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * \param dst     - Destination memory address
+ * \param wOffset - Destination starting X offset (columns in bytes)
+ * \param hOffset - Destination starting Y offset (rows)
+ * \param src     - Source memory address
+ * \param count   - Size in bytes to copy
+ * \param kind    - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyHtoA,
+ * ::cuMemcpyDtoA
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind);
+
+/**
+ * \brief Copies data between host and device
+ *
+ * \deprecated
+ *
+ * Copies \p count bytes from the CUDA array \p src starting at \p hOffset rows
+ * and \p wOffset bytes from the upper left corner to the memory area pointed to
+ * by \p dst, where \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * \param dst     - Destination memory address
+ * \param src     - Source memory address
+ * \param wOffset - Source starting X offset (columns in bytes)
+ * \param hOffset - Source starting Y offset (rows)
+ * \param count   - Size in bytes to copy
+ * \param kind    - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyAtoH,
+ * ::cuMemcpyAtoD
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind);
+
+/**
+ * \brief Copies data between host and device
+ *
+ * \deprecated
+ *
+ * Copies \p count bytes from the CUDA array \p src starting at \p hOffsetSrc
+ * rows and \p wOffsetSrc bytes from the upper left corner to the CUDA array
+ * \p dst starting at \p hOffsetDst rows and \p wOffsetDst bytes from the upper
+ * left corner, where \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * \param dst        - Destination memory address
+ * \param wOffsetDst - Destination starting X offset (columns in bytes)
+ * \param hOffsetDst - Destination starting Y offset (rows)
+ * \param src        - Source memory address
+ * \param wOffsetSrc - Source starting X offset (columns in bytes)
+ * \param hOffsetSrc - Source starting Y offset (rows)
+ * \param count      - Size in bytes to copy
+ * \param kind       - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyAtoA
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));
+
+/**
+ * \brief Copies data between host and device
+ *
+ * \deprecated
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src to the
+ * CUDA array \p dst starting at \p hOffset rows and \p wOffset bytes from
+ * the upper left corner, where \p kind specifies the
+ * direction of the copy, and must be one of ::cudaMemcpyHostToHost,
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * ::cudaMemcpyToArrayAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally
+ * be associated to a stream by passing a non-zero \p stream argument. If \p
+ * kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream
+ * is non-zero, the copy may overlap with operations in other streams.
+ *
+ * \param dst     - Destination memory address
+ * \param wOffset - Destination starting X offset (columns in bytes)
+ * \param hOffset - Destination starting Y offset (rows)
+ * \param src     - Source memory address
+ * \param count   - Size in bytes to copy
+ * \param kind    - Type of transfer
+ * \param stream  - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyHtoAAsync,
+ * ::cuMemcpy2DAsync
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies data between host and device
+ *
+ * \deprecated
+ *
+ * Copies \p count bytes from the CUDA array \p src starting at \p hOffset rows
+ * and \p wOffset bytes from the upper left corner to the memory area pointed to
+ * by \p dst, where \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * ::cudaMemcpyFromArrayAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally
+ * be associated to a stream by passing a non-zero \p stream argument. If \p
+ * kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream
+ * is non-zero, the copy may overlap with operations in other streams.
+ *
+ * \param dst     - Destination memory address
+ * \param src     - Source memory address
+ * \param wOffset - Source starting X offset (columns in bytes)
+ * \param hOffset - Source starting Y offset (rows)
+ * \param count   - Size in bytes to copy
+ * \param kind    - Type of transfer
+ * \param stream  - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyAtoHAsync,
+ * ::cuMemcpy2DAsync
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/** @} */ /* END CUDART_MEMORY_DEPRECATED */
+
+/**
+ * \defgroup CUDART_MEMORY_POOLS Stream Ordered Memory Allocator 
+ *
+ * ___MANBRIEF___ Functions for performing allocation and free operations in stream order.
+ *                Functions for controlling the behavior of the underlying allocator.
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ * 
+ *
+ * @{
+ *
+ * \section CUDART_MEMORY_POOLS_overview overview
+ *
+ * The asynchronous allocator allows the user to allocate and free in stream order.
+ * All asynchronous accesses of the allocation must happen between
+ * the stream executions of the allocation and the free. If the memory is accessed
+ * outside of the promised stream order, a use before allocation / use after free error
+ * will cause undefined behavior.
+ *
+ * The allocator is free to reallocate the memory as long as it can guarantee
+ * that compliant memory accesses will not overlap temporally.
+ * The allocator may refer to internal stream ordering as well as inter-stream dependencies
+ * (such as CUDA events and null stream dependencies) when establishing the temporal guarantee.
+ * The allocator may also insert inter-stream dependencies to establish the temporal guarantee.
+ *
+ * \section CUDART_MEMORY_POOLS_support Supported Platforms
+ *
+ * Whether or not a device supports the integrated stream ordered memory allocator
+ * may be queried by calling ::cudaDeviceGetAttribute() with the device attribute
+ * ::cudaDevAttrMemoryPoolsSupported.
+ */
+
+/**
+ * \brief Allocates memory with stream ordered semantics
+ *
+ * Inserts an allocation operation into \p hStream.
+ * A pointer to the allocated memory is returned immediately in \p *devPtr.
+ * The allocation must not be accessed until the allocation operation completes.
+ * The allocation comes from the memory pool associated with the stream's device.
+ *
+ * \note The default memory pool of a device contains device memory from that device.
+ * \note Basic stream ordering allows future work submitted into the same stream to use the allocation.
+ *       Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation
+ *       operation completes before work submitted in a separate stream runs.
+ * \note During stream capture, this function results in the creation of an allocation node.  In this case,
+ *       the allocation is owned by the graph instead of the memory pool. The memory pool's properties
+ *       are used to set the node's creation parameters.
+ *
+ * \param[out] devPtr  - Returned device pointer
+ * \param[in] size     - Number of bytes to allocate
+ * \param[in] hStream  - The stream establishing the stream ordering contract and the memory pool to allocate from
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorOutOfMemory,
+ * \notefnerr
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cuMemAllocAsync,
+ * \ref ::cudaMallocAsync(void** ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream)  "cudaMallocAsync (C++ API)", 
+ * ::cudaMallocFromPoolAsync, ::cudaFreeAsync, ::cudaDeviceSetMemPool, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaMemPoolSetAccess, ::cudaMemPoolSetAttribute, ::cudaMemPoolGetAttribute
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMallocAsync(void **devPtr, size_t size, cudaStream_t hStream);
+
+/**
+ * \brief Frees memory with stream ordered semantics
+ *
+ * Inserts a free operation into \p hStream.
+ * The allocation must not be accessed after stream execution reaches the free.
+ * After this API returns, accessing the memory from any subsequent work launched on the GPU
+ * or querying its pointer attributes results in undefined behavior.
+ *
+ * \note During stream capture, this function results in the creation of a free node and
+ *       must therefore be passed the address of a graph allocation.
+ *
+ * \param[in] devPtr  - Memory to free
+ * \param[in] hStream - The stream establishing the stream ordering promise
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ * \notefnerr
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cuMemFreeAsync, ::cudaMallocAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaFreeAsync(void *devPtr, cudaStream_t hStream);
+
+/**
+ * \brief Tries to release memory back to the OS
+ *
+ * Releases memory back to the OS until the pool contains fewer than \p minBytesToKeep
+ * reserved bytes, or there is no more memory that the allocator can safely release.
+ * The allocator cannot release OS allocations that back outstanding asynchronous allocations.
+ * The OS allocations may happen at different granularity from the user allocations.
+ *
+ * \note Allocations that have not been freed count as outstanding.
+ * \note Allocations that have been asynchronously freed but whose completion has
+ *       not been observed on the host (e.g. by a synchronize) can count as outstanding.
+ *
+ * \param[in] memPool        - The memory pool to trim
+ * \param[in] minBytesToKeep - If the pool has less than minBytesToKeep reserved,
+ * the TrimTo operation is a no-op.  Otherwise the pool will be guaranteed to have
+ * at least minBytesToKeep bytes reserved after the operation.
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_callback
+ *
+ * \sa ::cuMemPoolTrimTo, ::cudaMallocAsync, ::cudaFreeAsync, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaMemPoolCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolTrimTo(cudaMemPool_t memPool, size_t minBytesToKeep);
+
+/**
+ * \brief Sets attributes of a memory pool
+ *
+ * Supported attributes are:
+ * - ::cudaMemPoolAttrReleaseThreshold: (value type = cuuint64_t)
+ *                    Amount of reserved memory in bytes to hold onto before trying
+ *                    to release memory back to the OS. When more than the release
+ *                    threshold bytes of memory are held by the memory pool, the
+ *                    allocator will try to release memory back to the OS on the
+ *                    next call to stream, event or context synchronize. (default 0)
+ * - ::cudaMemPoolReuseFollowEventDependencies: (value type = int)
+ *                    Allow ::cudaMallocAsync to use memory asynchronously freed
+ *                    in another stream as long as a stream ordering dependency
+ *                    of the allocating stream on the free action exists.
+ *                    CUDA events and null stream interactions can create the required
+ *                    stream ordered dependencies. (default enabled)
+ * - ::cudaMemPoolReuseAllowOpportunistic: (value type = int)
+ *                    Allow reuse of already completed frees when there is no dependency
+ *                    between the free and allocation. (default enabled)
+ * - ::cudaMemPoolReuseAllowInternalDependencies: (value type = int)
+ *                    Allow ::cudaMallocAsync to insert new stream dependencies
+ *                    in order to establish the stream ordering required to reuse
+ *                    a piece of memory released by ::cudaFreeAsync (default enabled).
+ * - ::cudaMemPoolAttrReservedMemHigh: (value type = cuuint64_t)
+ *                    Reset the high watermark that tracks the amount of backing memory that was
+ *                    allocated for the memory pool. It is illegal to set this attribute to a non-zero value.
+ * - ::cudaMemPoolAttrUsedMemHigh: (value type = cuuint64_t)
+ *                    Reset the high watermark that tracks the amount of used memory that was
+ *                    allocated for the memory pool. It is illegal to set this attribute to a non-zero value.
+ *
+ * \param[in] memPool - The memory pool to modify
+ * \param[in] attr    - The attribute to modify
+ * \param[in] value   - Pointer to the value to assign
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_callback
+ *
+ * \sa ::cuMemPoolSetAttribute, ::cudaMallocAsync, ::cudaFreeAsync, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaMemPoolCreate
+ *
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolSetAttribute(cudaMemPool_t memPool, enum cudaMemPoolAttr attr, void *value );
+
+/**
+ * \brief Gets attributes of a memory pool
+ *
+ * Supported attributes are:
+ * - ::cudaMemPoolAttrReleaseThreshold: (value type = cuuint64_t)
+ *                    Amount of reserved memory in bytes to hold onto before trying
+ *                    to release memory back to the OS. When more than the release
+ *                    threshold bytes of memory are held by the memory pool, the
+ *                    allocator will try to release memory back to the OS on the
+ *                    next call to stream, event or context synchronize. (default 0)
+ * - ::cudaMemPoolReuseFollowEventDependencies: (value type = int)
+ *                    Allow ::cudaMallocAsync to use memory asynchronously freed
+ *                    in another stream as long as a stream ordering dependency
+ *                    of the allocating stream on the free action exists.
+ *                    CUDA events and null stream interactions can create the required
+ *                    stream ordered dependencies. (default enabled)
+ * - ::cudaMemPoolReuseAllowOpportunistic: (value type = int)
+ *                    Allow reuse of already completed frees when there is no dependency
+ *                    between the free and allocation. (default enabled)
+ * - ::cudaMemPoolReuseAllowInternalDependencies: (value type = int)
+ *                    Allow ::cudaMallocAsync to insert new stream dependencies
+ *                    in order to establish the stream ordering required to reuse
+ *                    a piece of memory released by ::cudaFreeAsync (default enabled).
+ * - ::cudaMemPoolAttrReservedMemCurrent: (value type = cuuint64_t)
+ *                    Amount of backing memory currently allocated for the mempool.
+ * - ::cudaMemPoolAttrReservedMemHigh: (value type = cuuint64_t)
+ *                    High watermark of backing memory allocated for the mempool since
+ *                    the last time it was reset.
+ * - ::cudaMemPoolAttrUsedMemCurrent: (value type = cuuint64_t)
+ *                    Amount of memory from the pool that is currently in use by the application.
+ * - ::cudaMemPoolAttrUsedMemHigh: (value type = cuuint64_t)
+ *                    High watermark of the amount of memory from the pool that was in use by the
+ *                    application since the last time it was reset.
+ *
+ * \param[in] memPool - The memory pool to get attributes of
+ * \param[in] attr    - The attribute to get
+ * \param[out] value  - Retrieved value
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_callback
+ *
+ * \sa ::cuMemPoolGetAttribute, ::cudaMallocAsync, ::cudaFreeAsync, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaMemPoolCreate
+ *
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolGetAttribute(cudaMemPool_t memPool, enum cudaMemPoolAttr attr, void *value );
+
+/**
+ * \brief Controls visibility of pools between devices
+ *
+ * \param[in] memPool  - The pool being modified
+ * \param[in] descList - Array of access descriptors. Each descriptor instructs the access to enable for a single GPU
+ * \param[in] count    - Number of descriptors in the \p descList array.
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa ::cuMemPoolSetAccess, ::cudaMemPoolGetAccess, ::cudaMallocAsync, ::cudaFreeAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolSetAccess(cudaMemPool_t memPool, const struct cudaMemAccessDesc *descList, size_t count);
+
+/**
+ * \brief Returns the accessibility of a pool from a device
+ *
+ * Returns in \p *flags the accessibility of the pool's memory from the specified location.
+ *
+ * \param[out] flags   - The accessibility of the pool from the specified location
+ * \param[in] memPool  - The pool being queried
+ * \param[in] location - The location accessing the pool
+ *
+ * \sa ::cuMemPoolGetAccess, ::cudaMemPoolSetAccess
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolGetAccess(enum cudaMemAccessFlags *flags, cudaMemPool_t memPool, struct cudaMemLocation *location);
+
+/**
+ * \brief Creates a memory pool
+ *
+ * Creates a CUDA memory pool and returns the handle in \p memPool.  The \p poolProps determines
+ * the properties of the pool such as the backing device and IPC capabilities.
+ *
+ * To create a memory pool targeting a specific host NUMA node, applications must
+ * set ::cudaMemPoolProps::cudaMemLocation::type to ::cudaMemLocationTypeHostNuma and
+ * ::cudaMemPoolProps::cudaMemLocation::id must specify the NUMA ID of the host memory node.
+ * Specifying ::cudaMemLocationTypeHostNumaCurrent or ::cudaMemLocationTypeHost as the
+ * ::cudaMemPoolProps::cudaMemLocation::type will result in ::cudaErrorInvalidValue.
+ * By default, the pool's memory will be accessible from the device it is allocated on.
+ * In the case of pools created with ::cudaMemLocationTypeHostNuma, their default accessibility
+ * will be from the host CPU.
+ * Applications can control the maximum size of the pool by specifying a non-zero value for ::cudaMemPoolProps::maxSize.
+ * If set to 0, the maximum size of the pool will default to a system dependent value.
+ *
+ * Applications that intend to use ::CU_MEM_HANDLE_TYPE_FABRIC based memory sharing must ensure that:
+ * (1) the `nvidia-caps-imex-channels` character device is created by the driver and is listed under /proc/devices, and
+ * (2) at least one IMEX channel file is accessible by the user launching the application.
+ *
+ * When exporter and importer CUDA processes have been granted access to the same IMEX channel, they can securely
+ * share memory.
+ *
+ * The IMEX channel security model works on a per-user basis, which means all processes under a user can share
+ * memory if the user has access to a valid IMEX channel. When multi-user isolation is desired, a separate IMEX
+ * channel is required for each user.
+ *
+ * These channel files exist in /dev/nvidia-caps-imex-channels/channel* and can be created using standard OS
+ * native calls like mknod on Linux. For example, to create channel0 with the major number from /proc/devices,
+ * users can execute the following command: `mknod /dev/nvidia-caps-imex-channels/channel0 c <major number> 0`
+ *
+ * \note Specifying cudaMemHandleTypeNone creates a memory pool that will not support IPC.
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ *
+ * \sa ::cuMemPoolCreate, ::cudaDeviceSetMemPool, ::cudaMallocFromPoolAsync, ::cudaMemPoolExportToShareableHandle, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool
+ *
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolCreate(cudaMemPool_t *memPool, const struct cudaMemPoolProps *poolProps);
+
+/**
+ * \brief Destroys the specified memory pool
+ *
+ * If any pointers obtained from this pool haven't been freed or
+ * the pool has free operations that haven't completed
+ * when ::cudaMemPoolDestroy is invoked, the function will return immediately and the
+ * resources associated with the pool will be released automatically
+ * once there are no more outstanding allocations.
+ *
+ * Destroying the current mempool of a device sets the default mempool of
+ * that device as the current mempool for that device.
+ *
+ * \note A device's default memory pool cannot be destroyed.
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa ::cuMemPoolDestroy, ::cudaFreeAsync, ::cudaDeviceSetMemPool, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaMemPoolCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolDestroy(cudaMemPool_t memPool);
+
+/**
+ * \brief Allocates memory from a specified pool with stream ordered semantics.
+ *
+ * Inserts an allocation operation into \p stream.
+ * A pointer to the allocated memory is returned immediately in \p *ptr.
+ * The allocation must not be accessed until the allocation operation completes.
+ * The allocation comes from the specified memory pool.
+ *
+ * \note
+ *    -  The specified memory pool may be from a device different than that of the specified \p stream.
+ *
+ *    -  Basic stream ordering allows future work submitted into the same stream to use the allocation.
+ *       Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation
+ *       operation completes before work submitted in a separate stream runs.
+ *
+ * \note During stream capture, this function results in the creation of an allocation node.  In this case,
+ *       the allocation is owned by the graph instead of the memory pool. The memory pool's properties
+ *       are used to set the node's creation parameters.
+ *
+ * \param[out] ptr     - Returned device pointer
+ * \param[in] size     - Number of bytes to allocate
+ * \param[in] memPool  - The pool to allocate from
+ * \param[in] stream   - The stream establishing the stream ordering semantic
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorOutOfMemory
+ *
+ * \sa ::cuMemAllocFromPoolAsync,
+ * \ref ::cudaMallocAsync(void** ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream)  "cudaMallocAsync (C++ API)",
+ * ::cudaMallocAsync, ::cudaFreeAsync, ::cudaDeviceGetDefaultMemPool, ::cudaMemPoolCreate, ::cudaMemPoolSetAccess, ::cudaMemPoolSetAttribute
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMallocFromPoolAsync(void **ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream);
+
+/**
+ * \brief Exports a memory pool to the requested handle type.
+ *
+ * Given an IPC capable mempool, create an OS handle to share the pool with another process.
+ * A recipient process can convert the shareable handle into a mempool with ::cudaMemPoolImportFromShareableHandle.
+ * Individual pointers can then be shared with the ::cudaMemPoolExportPointer and ::cudaMemPoolImportPointer APIs.
+ * The implementation of what the shareable handle is and how it can be transferred is defined by the requested
+ * handle type.
+ *
+ * \note To create an IPC capable mempool, create a mempool with a ::cudaMemAllocationHandleType other than cudaMemHandleTypeNone.
+ *
+ * \param[out] shareableHandle - Pointer to the location in which to store the requested handle
+ * \param[in] memPool          - Pool to export
+ * \param[in] handleType       - The type of handle to create
+ * \param[in] flags            - Must be 0
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorOutOfMemory
+ *
+ * \sa ::cuMemPoolExportToShareableHandle, ::cudaMemPoolImportFromShareableHandle, ::cudaMemPoolExportPointer, ::cudaMemPoolImportPointer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolExportToShareableHandle(
+    void                            *shareableHandle,
+    cudaMemPool_t                    memPool,
+    enum cudaMemAllocationHandleType handleType,
+    unsigned int                     flags);
+
+/**
+ * \brief Imports a memory pool from a shared handle.
+ *
+ * Specific allocations can be imported from the imported pool with ::cudaMemPoolImportPointer.
+ *
+ * \note Imported memory pools do not support creating new allocations.
+ *       As such imported memory pools may not be used in ::cudaDeviceSetMemPool
+ *       or ::cudaMallocFromPoolAsync calls.
+ *
+ * \param[out] memPool        - Returned memory pool
+ * \param[in] shareableHandle - OS handle of the pool to open
+ * \param[in] handleType      - The type of handle being imported
+ * \param[in] flags           - Must be 0
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorOutOfMemory
+ *
+ * \sa ::cuMemPoolImportFromShareableHandle, ::cudaMemPoolExportToShareableHandle, ::cudaMemPoolExportPointer, ::cudaMemPoolImportPointer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolImportFromShareableHandle(
+    cudaMemPool_t                   *memPool,
+    void                            *shareableHandle,
+    enum cudaMemAllocationHandleType handleType,
+    unsigned int                     flags);
+
+/**
+ * \brief Export data to share a memory pool allocation between processes.
+ *
+ * Constructs \p exportData for sharing a specific allocation from an already shared memory pool.
+ * The recipient process can import the allocation with the ::cudaMemPoolImportPointer API.
+ * The data is not a handle and may be shared through any IPC mechanism.
+ *
+ * \param[out] exportData - Returned export data
+ * \param[in] ptr         - Pointer to memory being exported
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorOutOfMemory
+ *
+ * \sa ::cuMemPoolExportPointer, ::cudaMemPoolExportToShareableHandle, ::cudaMemPoolImportFromShareableHandle, ::cudaMemPoolImportPointer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolExportPointer(struct cudaMemPoolPtrExportData *exportData, void *ptr);
+
+/**
+ * \brief Import a memory pool allocation from another process.
+ *
+ * Returns in \p ptr a pointer to the imported memory.
+ * The imported memory must not be accessed before the allocation operation completes
+ * in the exporting process. The imported memory must be freed from all importing processes before
+ * being freed in the exporting process. The pointer may be freed with ::cudaFree
+ * or ::cudaFreeAsync.  If ::cudaFreeAsync is used, the free must be completed
+ * on the importing process before the free operation on the exporting process.
+ *
+ * \note The ::cudaFreeAsync API may be used in the exporting process before
+ *       the ::cudaFreeAsync operation completes in its stream as long as the
+ *       ::cudaFreeAsync in the exporting process specifies a stream with
+ *       a stream dependency on the importing process's ::cudaFreeAsync.
+ *
+ * \param[out] ptr       - Pointer to imported memory
+ * \param[in] memPool    - Pool from which to import
+ * \param[in] exportData - Data specifying the memory to import
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorOutOfMemory
+ *
+ * \sa ::cuMemPoolImportPointer, ::cudaMemPoolExportToShareableHandle, ::cudaMemPoolImportFromShareableHandle, ::cudaMemPoolExportPointer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolImportPointer(void **ptr, cudaMemPool_t memPool, struct cudaMemPoolPtrExportData *exportData);
+
+/** @} */ /* END CUDART_MEMORY_POOLS */
+
+/**
+ * \defgroup CUDART_UNIFIED Unified Addressing
+ *
+ * ___MANBRIEF___ unified addressing functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the unified addressing functions of the CUDA 
+ * runtime application programming interface.
+ *
+ * @{
+ *
+ * \section CUDART_UNIFIED_overview Overview
+ *
+ * CUDA devices can share a unified address space with the host.  
+ * For these devices there is no distinction between a device
+ * pointer and a host pointer -- the same pointer value may be 
+ * used to access memory from the host program and from a kernel 
+ * running on the device (with exceptions enumerated below).
+ *
+ * \section CUDART_UNIFIED_support Supported Platforms
+ * 
+ * Whether or not a device supports unified addressing may be 
+ * queried by calling ::cudaGetDeviceProperties() with the device 
+ * property ::cudaDeviceProp::unifiedAddressing.
+ *
+ * Unified addressing is automatically enabled in 64-bit processes.
+ *
+ * \section CUDART_UNIFIED_lookup Looking Up Information from Pointer Values
+ *
+ * It is possible to look up information about the memory which backs a 
+ * pointer value.  For instance, one may want to know if a pointer points
+ * to host or device memory.  As another example, in the case of device 
+ * memory, one may want to know on which CUDA device the memory 
+ * resides.  These properties may be queried using the function 
+ * ::cudaPointerGetAttributes().
+ *
+ * Since pointers are unique, it is not necessary to specify information
+ * about the pointers specified to ::cudaMemcpy() and other copy functions.  
+ * The copy direction ::cudaMemcpyDefault may be used to specify that the 
+ * CUDA runtime should infer the location of the pointer from its value.
+ *
+ * \section CUDART_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory
+ *
+ * All host memory allocated through all devices using ::cudaMallocHost() and
+ * ::cudaHostAlloc() is always directly accessible from all devices that 
+ * support unified addressing.  This is the case regardless of whether or 
+ * not the flags ::cudaHostAllocPortable and ::cudaHostAllocMapped are 
+ * specified.
+ *
+ * The pointer value through which allocated host memory may be accessed 
+ * in kernels on all devices that support unified addressing is the same 
+ * as the pointer value through which that memory is accessed on the host.
+ * It is not necessary to call ::cudaHostGetDevicePointer() to get the device 
+ * pointer for these allocations.  
+ *
+ * Note that this is not the case for memory allocated using the flag
+ * ::cudaHostAllocWriteCombined, as discussed below.
+ *
+ * \section CUDART_UNIFIED_autopeerregister Direct Access of Peer Memory
+ *
+ * Upon enabling direct access from a device that supports unified addressing 
+ * to another peer device that supports unified addressing using 
+ * ::cudaDeviceEnablePeerAccess() all memory allocated in the peer device using 
+ * ::cudaMalloc() and ::cudaMallocPitch() will immediately be accessible 
+ * by the current device.  The device pointer value through 
+ * which any peer's memory may be accessed in the current device 
+ * is the same pointer value through which that memory may be 
+ * accessed from the peer device. 
+ *
+ * \section CUDART_UNIFIED_exceptions Exceptions, Disjoint Addressing
+ * 
+ * Not all memory may be accessed on devices through the same pointer
+ * value through which they are accessed on the host.  These exceptions
+ * are host memory registered using ::cudaHostRegister() and host memory
+ * allocated using the flag ::cudaHostAllocWriteCombined.  For these 
+ * exceptions, there exists a distinct host and device address for the
+ * memory.  The device address is guaranteed to not overlap any valid host
+ * pointer range and is guaranteed to have the same value across all devices
+ * that support unified addressing.  
+ * 
+ * This device address may be queried using ::cudaHostGetDevicePointer() 
+ * when a device using unified addressing is current.  Either the host 
+ * or the unified device pointer value may be used to refer to this memory 
+ * in ::cudaMemcpy() and similar functions using the ::cudaMemcpyDefault 
+ * memory direction.
+ *
+ */
+
+/**
+ * \brief Returns attributes about a specified pointer
+ *
+ * Returns in \p *attributes the attributes of the pointer \p ptr.
+ * If the pointer was not allocated in, mapped by, or registered with a context
+ * supporting unified addressing, ::cudaErrorInvalidValue is returned.
+ *
+ * \note From CUDA 11.0 onward, passing an unregistered host pointer returns ::cudaMemoryTypeUnregistered
+ * in ::cudaPointerAttributes::type and the call returns ::cudaSuccess.
+ *
+ * The ::cudaPointerAttributes structure is defined as:
+ * \code
+    struct cudaPointerAttributes {
+        enum cudaMemoryType type;
+        int device;
+        void *devicePointer;
+        void *hostPointer;
+    };
+    \endcode
+ * In this structure, the individual fields mean
+ *
+ * - \ref ::cudaPointerAttributes::type "type" identifies the type of memory. It can be
+ *    ::cudaMemoryTypeUnregistered for unregistered host memory,
+ *    ::cudaMemoryTypeHost for registered host memory, ::cudaMemoryTypeDevice for device
+ *    memory or ::cudaMemoryTypeManaged for managed memory.
+ *
+ * - \ref ::cudaPointerAttributes::device "device" is the device against which
+ *   \p ptr was allocated.  If \p ptr has memory type ::cudaMemoryTypeDevice
+ *   then this identifies the device on which the memory referred to by \p ptr
+ *   physically resides.  If \p ptr has memory type ::cudaMemoryTypeHost then this
+ *   identifies the device which was current when the allocation was made
+ *   (and if that device is deinitialized then this allocation will vanish
+ *   with that device's state).
+ *
+ * - \ref ::cudaPointerAttributes::devicePointer "devicePointer" is
+ *   the device pointer alias through which the memory referred to by \p ptr
+ *   may be accessed on the current device.
+ *   If the memory referred to by \p ptr cannot be accessed directly by the
+ *   current device then this is NULL.
+ *
+ * - \ref ::cudaPointerAttributes::hostPointer "hostPointer" is
+ *   the host pointer alias through which the memory referred to by \p ptr
+ *   may be accessed on the host.
+ *   If the memory referred to by \p ptr cannot be accessed directly by the
+ *   host then this is NULL.
+ *
+ * \param[out] attributes - Attributes for the specified pointer
+ * \param[in] ptr         - Pointer to get attributes for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice,
+ * ::cudaChooseDevice,
+ * ::cudaInitDevice,
+ * ::cuPointerGetAttributes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaPointerGetAttributes(struct cudaPointerAttributes *attributes, const void *ptr);
+
+/** @} */ /* END CUDART_UNIFIED */
+
+/**
+ * \defgroup CUDART_PEER Peer Device Memory Access
+ *
+ * ___MANBRIEF___ peer device memory access functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the peer device memory access functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Queries if a device may directly access a peer device's memory.
+ *
+ * Returns in \p *canAccessPeer a value of 1 if device \p device is capable of
+ * directly accessing memory from \p peerDevice and 0 otherwise.  If direct
+ * access of \p peerDevice from \p device is possible, then access may be
+ * enabled by calling ::cudaDeviceEnablePeerAccess().
+ *
+ * \param[out] canAccessPeer - Returned access capability
+ * \param[in] device         - Device from which allocations on \p peerDevice are to
+ *                             be directly accessed.
+ * \param[in] peerDevice     - Device on which the allocations to be directly accessed
+ *                             by \p device reside.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceEnablePeerAccess,
+ * ::cudaDeviceDisablePeerAccess,
+ * ::cuDeviceCanAccessPeer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice);
+
+/**
+ * \brief Enables direct access to memory allocations on a peer device.
+ *
+ * On success, all allocations from \p peerDevice will immediately be accessible by
+ * the current device.  They will remain accessible until access is explicitly
+ * disabled using ::cudaDeviceDisablePeerAccess() or either device is reset using
+ * ::cudaDeviceReset().
+ *
+ * Note that access granted by this call is unidirectional and that in order to access
+ * memory on the current device from \p peerDevice, a separate symmetric call
+ * to ::cudaDeviceEnablePeerAccess() is required.
+ *
+ * Note that there are both device-wide and system-wide limitations per system
+ * configuration, as noted in the CUDA Programming Guide under the section
+ * "Peer-to-Peer Memory Access".
+ *
+ * Returns ::cudaErrorInvalidDevice if ::cudaDeviceCanAccessPeer() indicates
+ * that the current device cannot directly access memory from \p peerDevice.
+ *
+ * Returns ::cudaErrorPeerAccessAlreadyEnabled if direct access of
+ * \p peerDevice from the current device has already been enabled.
+ *
+ * Returns ::cudaErrorInvalidValue if \p flags is not 0.
+ *
+ * \param[in] peerDevice  - Peer device to enable direct access to from the current device
+ * \param[in] flags       - Reserved for future use and must be set to 0
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorPeerAccessAlreadyEnabled,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceCanAccessPeer,
+ * ::cudaDeviceDisablePeerAccess,
+ * ::cuCtxEnablePeerAccess
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags);
+
+/**
+ * \brief Disables direct access to memory allocations on a peer device.
+ *
+ * Returns ::cudaErrorPeerAccessNotEnabled if direct access to memory on
+ * \p peerDevice has not yet been enabled from the current device.
+ *
+ * \param[in] peerDevice - Peer device to disable direct access to
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorPeerAccessNotEnabled,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceCanAccessPeer,
+ * ::cudaDeviceEnablePeerAccess,
+ * ::cuCtxDisablePeerAccess
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceDisablePeerAccess(int peerDevice);
+
+/** @} */ /* END CUDART_PEER */
+
+/** \defgroup CUDART_OPENGL OpenGL Interoperability */
+
+/** \defgroup CUDART_OPENGL_DEPRECATED OpenGL Interoperability [DEPRECATED] */
+
+/** \defgroup CUDART_D3D9 Direct3D 9 Interoperability */
+
+/** \defgroup CUDART_D3D9_DEPRECATED Direct3D 9 Interoperability [DEPRECATED] */
+
+/** \defgroup CUDART_D3D10 Direct3D 10 Interoperability */
+
+/** \defgroup CUDART_D3D10_DEPRECATED Direct3D 10 Interoperability [DEPRECATED] */
+
+/** \defgroup CUDART_D3D11 Direct3D 11 Interoperability */
+
+/** \defgroup CUDART_D3D11_DEPRECATED Direct3D 11 Interoperability [DEPRECATED] */
+
+/** \defgroup CUDART_VDPAU VDPAU Interoperability */
+
+/** \defgroup CUDART_EGL EGL Interoperability */
+
+/**
+ * \defgroup CUDART_INTEROP Graphics Interoperability
+ *
+ * ___MANBRIEF___ graphics interoperability functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the graphics interoperability functions of the CUDA
+ * runtime application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Unregisters a graphics resource for access by CUDA
+ *
+ * Unregisters the graphics resource \p resource so it is not accessible by
+ * CUDA unless registered again.
+ *
+ * If \p resource is invalid then ::cudaErrorInvalidResourceHandle is
+ * returned.
+ *
+ * \param[in] resource - Resource to unregister
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa
+ * ::cudaGraphicsD3D9RegisterResource,
+ * ::cudaGraphicsD3D10RegisterResource,
+ * ::cudaGraphicsD3D11RegisterResource,
+ * ::cudaGraphicsGLRegisterBuffer,
+ * ::cudaGraphicsGLRegisterImage,
+ * ::cuGraphicsUnregisterResource
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource);
+
+/**
+ * \brief Set usage flags for mapping a graphics resource
+ *
+ * Set \p flags for mapping the graphics resource \p resource.
+ *
+ * Changes to \p flags will take effect the next time \p resource is mapped.
+ * The \p flags argument may be any of the following:
+ * - ::cudaGraphicsMapFlagsNone: Specifies no hints about how \p resource will
+ *     be used. It is therefore assumed that CUDA may read from or write to \p resource.
+ * - ::cudaGraphicsMapFlagsReadOnly: Specifies that CUDA will not write to \p resource.
+ * - ::cudaGraphicsMapFlagsWriteDiscard: Specifies CUDA will not read from \p resource and will
+ *   write over the entire contents of \p resource, so none of the data
+ *   previously stored in \p resource will be preserved.
+ *
+ * If \p resource is presently mapped for access by CUDA then ::cudaErrorUnknown is returned.
+ * If \p flags is not one of the above values then ::cudaErrorInvalidValue is returned.
+ *
+ * \param resource - Registered resource to set flags for
+ * \param flags    - Parameters for resource mapping
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphicsMapResources,
+ * ::cuGraphicsResourceSetMapFlags
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(cudaGraphicsResource_t resource, unsigned int flags);
+
+/**
+ * \brief Map graphics resources for access by CUDA
+ *
+ * Maps the \p count graphics resources in \p resources for access by CUDA.
+ *
+ * The resources in \p resources may be accessed by CUDA until they
+ * are unmapped. The graphics API from which \p resources were registered
+ * should not access any resources while they are mapped by CUDA. If an
+ * application does so, the results are undefined.
+ *
+ * This function provides the synchronization guarantee that any graphics calls
+ * issued before ::cudaGraphicsMapResources() will complete before any subsequent CUDA
+ * work issued in \p stream begins.
+ *
+ * If \p resources contains any duplicate entries then ::cudaErrorInvalidResourceHandle
+ * is returned. If any of \p resources are presently mapped for access by
+ * CUDA then ::cudaErrorUnknown is returned.
+ *
+ * \param count     - Number of resources to map
+ * \param resources - Resources to map for CUDA
+ * \param stream    - Stream for synchronization
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cudaGraphicsUnmapResources,
+ * ::cuGraphicsMapResources
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Unmap graphics resources.
+ *
+ * Unmaps the \p count graphics resources in \p resources.
+ *
+ * Once unmapped, the resources in \p resources may not be accessed by CUDA
+ * until they are mapped again.
+ *
+ * This function provides the synchronization guarantee that any CUDA work issued
+ * in \p stream before ::cudaGraphicsUnmapResources() will complete before any
+ * subsequently issued graphics work begins.
+ *
+ * If \p resources contains any duplicate entries then ::cudaErrorInvalidResourceHandle
+ * is returned. If any of \p resources are not presently mapped for access by
+ * CUDA then ::cudaErrorUnknown is returned.
+ *
+ * \param count     - Number of resources to unmap
+ * \param resources - Resources to unmap
+ * \param stream    - Stream for synchronization
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphicsMapResources,
+ * ::cuGraphicsUnmapResources
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Get a device pointer through which to access a mapped graphics resource.
+ *
+ * Returns in \p *devPtr a pointer through which the mapped graphics resource
+ * \p resource may be accessed.
+ * Returns in \p *size the size of the memory in bytes which may be accessed from that pointer.
+ * The value set in \p devPtr may change every time that \p resource is mapped.
+ *
+ * If \p resource is not a buffer then it cannot be accessed via a pointer and
+ * ::cudaErrorUnknown is returned.
+ * If \p resource is not mapped then ::cudaErrorUnknown is returned.
+ *
+ * \param devPtr     - Returned pointer through which \p resource may be accessed
+ * \param size       - Returned size of the buffer accessible starting at \p *devPtr
+ * \param resource   - Mapped resource to access
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphicsMapResources,
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsResourceGetMappedPointer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(void **devPtr, size_t *size, cudaGraphicsResource_t resource);
+
+/**
+ * \brief Get an array through which to access a subresource of a mapped graphics resource.
+ *
+ * Returns in \p *array an array through which the subresource of the mapped
+ * graphics resource \p resource which corresponds to array index \p arrayIndex
+ * and mipmap level \p mipLevel may be accessed.  The value set in \p array may
+ * change every time that \p resource is mapped.
+ *
+ * If \p resource is not a texture then it cannot be accessed via an array and
+ * ::cudaErrorUnknown is returned.
+ * If \p arrayIndex is not a valid array index for \p resource then
+ * ::cudaErrorInvalidValue is returned.
+ * If \p mipLevel is not a valid mipmap level for \p resource then
+ * ::cudaErrorInvalidValue is returned.
+ * If \p resource is not mapped then ::cudaErrorUnknown is returned.
+ *
+ * \param array       - Returned array through which a subresource of \p resource may be accessed
+ * \param resource    - Mapped resource to access
+ * \param arrayIndex  - Array index for array textures or cubemap face
+ *                      index as defined by ::cudaGraphicsCubeFace for
+ *                      cubemap textures for the subresource to access
+ * \param mipLevel    - Mipmap level for the subresource to access
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuGraphicsSubResourceGetMappedArray
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray(cudaArray_t *array, cudaGraphicsResource_t resource, unsigned int arrayIndex, unsigned int mipLevel);
+
+/**
+ * \brief Get a mipmapped array through which to access a mapped graphics resource.
+ *
+ * Returns in \p *mipmappedArray a mipmapped array through which the mapped
+ * graphics resource \p resource may be accessed. The value set in \p mipmappedArray may
+ * change every time that \p resource is mapped.
+ *
+ * If \p resource is not a texture then it cannot be accessed via an array and
+ * ::cudaErrorUnknown is returned.
+ * If \p resource is not mapped then ::cudaErrorUnknown is returned.
+ *
+ * \param mipmappedArray - Returned mipmapped array through which \p resource may be accessed
+ * \param resource       - Mapped resource to access
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuGraphicsResourceGetMappedMipmappedArray
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedMipmappedArray(cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource);
+
+/** @} */ /* END CUDART_INTEROP */
+
+/**
+ * \defgroup CUDART_TEXTURE_OBJECT Texture Object Management
+ *
+ * ___MANBRIEF___ texture object management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the low level texture object management functions
+ * of the CUDA runtime application programming interface. The texture
+ * object API is only supported on devices of compute capability 3.0 or higher.
+ *
+ * @{
+ */
+
+/**
+ * \brief Get the channel descriptor of an array
+ *
+ * Returns in \p *desc the channel descriptor of the CUDA array \p array.
+ *
+ * \param desc  - Channel format
+ * \param array - Memory array on device
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
+ * ::cudaCreateTextureObject, ::cudaCreateSurfaceObject
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetChannelDesc(struct cudaChannelFormatDesc *desc, cudaArray_const_t array);
+
+/**
+ * \brief Returns a channel descriptor using the specified format
+ *
+ * Returns a channel descriptor with format \p f and number of bits of each
+ * component \p x, \p y, \p z, and \p w.  The ::cudaChannelFormatDesc is
+ * defined as:
+ * \code
+  struct cudaChannelFormatDesc {
+    int x, y, z, w;
+    enum cudaChannelFormatKind f;
+  };
+ * \endcode
+ *
+ * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
+ * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
+ *
+ * \param x - X component
+ * \param y - Y component
+ * \param z - Z component
+ * \param w - W component
+ * \param f - Channel format
+ *
+ * \return
+ * Channel descriptor with format \p f
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaCreateTextureObject, ::cudaCreateSurfaceObject
+ */
+extern __host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(int x, int y, int z, int w, enum cudaChannelFormatKind f);
+
+/**
+ * \brief Creates a texture object
+ *
+ * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes
+ * the data to texture from. \p pTexDesc describes how the data should be sampled.
+ * \p pResViewDesc is an optional argument that specifies an alternate format for
+ * the data described by \p pResDesc, and also describes the subresource region
+ * to restrict access to when texturing. \p pResViewDesc can only be specified if
+ * the type of resource is a CUDA array or a CUDA mipmapped array not in a block
+ * compressed format.
+ *
+ * Texture objects are only supported on devices of compute capability 3.0 or higher.
+ * Additionally, a texture object is an opaque value, and, as such, should only be
+ * accessed through CUDA API calls.
+ *
+ * The ::cudaResourceDesc structure is defined as:
+ * \code
+        struct cudaResourceDesc {
+            enum cudaResourceType resType;
+            
+            union {
+                struct {
+                    cudaArray_t array;
+                } array;
+                struct {
+                    cudaMipmappedArray_t mipmap;
+                } mipmap;
+                struct {
+                    void *devPtr;
+                    struct cudaChannelFormatDesc desc;
+                    size_t sizeInBytes;
+                } linear;
+                struct {
+                    void *devPtr;
+                    struct cudaChannelFormatDesc desc;
+                    size_t width;
+                    size_t height;
+                    size_t pitchInBytes;
+                } pitch2D;
+            } res;
+        };
+ * \endcode
+ * where:
+ * - ::cudaResourceDesc::resType specifies the type of resource to texture from.
+ * ::cudaResourceType is defined as:
+ * \code
+        enum cudaResourceType {
+            cudaResourceTypeArray          = 0x00,
+            cudaResourceTypeMipmappedArray = 0x01,
+            cudaResourceTypeLinear         = 0x02,
+            cudaResourceTypePitch2D        = 0x03
+        };
+ * \endcode
+ *
+ * \par
+ * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeArray, ::cudaResourceDesc::res::array::array
+ * must be set to a valid CUDA array handle.
+ *
+ * \par
+ * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeMipmappedArray, ::cudaResourceDesc::res::mipmap::mipmap
+ * must be set to a valid CUDA mipmapped array handle and ::cudaTextureDesc::normalizedCoords must be set to true.
+ *
+ * \par
+ * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeLinear, ::cudaResourceDesc::res::linear::devPtr
+ * must be set to a valid device pointer, that is aligned to ::cudaDeviceProp::textureAlignment.
+ * ::cudaResourceDesc::res::linear::desc describes the format and the number of components per array element. ::cudaResourceDesc::res::linear::sizeInBytes
+ * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed 
+ * ::cudaDeviceProp::maxTexture1DLinear. The number of elements is computed as (sizeInBytes / sizeof(desc)).
+ *
+ * \par
+ * If ::cudaResourceDesc::resType is set to ::cudaResourceTypePitch2D, ::cudaResourceDesc::res::pitch2D::devPtr
+ * must be set to a valid device pointer, that is aligned to ::cudaDeviceProp::textureAlignment.
+ * ::cudaResourceDesc::res::pitch2D::desc describes the format and the number of components per array element. ::cudaResourceDesc::res::pitch2D::width
+ * and ::cudaResourceDesc::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed
+ * ::cudaDeviceProp::maxTexture2DLinear[0] and ::cudaDeviceProp::maxTexture2DLinear[1] respectively.
+ * ::cudaResourceDesc::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to 
+ * ::cudaDeviceProp::texturePitchAlignment. Pitch cannot exceed ::cudaDeviceProp::maxTexture2DLinear[2].
+ *
+ *
+ * The ::cudaTextureDesc struct is defined as
+ * \code
+        struct cudaTextureDesc {
+            enum cudaTextureAddressMode addressMode[3];
+            enum cudaTextureFilterMode  filterMode;
+            enum cudaTextureReadMode    readMode;
+            int                         sRGB;
+            float                       borderColor[4];
+            int                         normalizedCoords;
+            unsigned int                maxAnisotropy;
+            enum cudaTextureFilterMode  mipmapFilterMode;
+            float                       mipmapLevelBias;
+            float                       minMipmapLevelClamp;
+            float                       maxMipmapLevelClamp;
+            int                         disableTrilinearOptimization;
+            int                         seamlessCubemap;
+        };
+ * \endcode
+ * where
+ * - ::cudaTextureDesc::addressMode specifies the addressing mode for each dimension of the texture data. ::cudaTextureAddressMode is defined as:
+ *   \code
+        enum cudaTextureAddressMode {
+            cudaAddressModeWrap   = 0,
+            cudaAddressModeClamp  = 1,
+            cudaAddressModeMirror = 2,
+            cudaAddressModeBorder = 3
+        };
+ *   \endcode
+ *   This is ignored if ::cudaResourceDesc::resType is ::cudaResourceTypeLinear. Also, if ::cudaTextureDesc::normalizedCoords
+ *   is set to zero, ::cudaAddressModeWrap and ::cudaAddressModeMirror won't be supported and will be switched to ::cudaAddressModeClamp.
+ *
+ * - ::cudaTextureDesc::filterMode specifies the filtering mode to be used when fetching from the texture. ::cudaTextureFilterMode is defined as:
+ *   \code
+        enum cudaTextureFilterMode {
+            cudaFilterModePoint  = 0,
+            cudaFilterModeLinear = 1
+        };
+ *   \endcode
+ *   This is ignored if ::cudaResourceDesc::resType is ::cudaResourceTypeLinear.
+ *
+ * - ::cudaTextureDesc::readMode specifies whether integer data should be converted to floating point or not. ::cudaTextureReadMode is defined as:
+ *   \code
+        enum cudaTextureReadMode {
+            cudaReadModeElementType     = 0,
+            cudaReadModeNormalizedFloat = 1
+        };
+ *   \endcode
+ *   Note that this applies only to 8-bit and 16-bit integer formats. 32-bit integer formats are not promoted, regardless of
+ *   whether ::cudaTextureDesc::readMode is set to ::cudaReadModeNormalizedFloat.
+ *
+ * - ::cudaTextureDesc::sRGB specifies whether sRGB to linear conversion should be performed during texture fetch.
+ *
+ * - ::cudaTextureDesc::borderColor specifies the float values of the border color, where:
+ *   ::cudaTextureDesc::borderColor[0] contains value of 'R', 
+ *   ::cudaTextureDesc::borderColor[1] contains value of 'G',
+ *   ::cudaTextureDesc::borderColor[2] contains value of 'B', 
+ *   ::cudaTextureDesc::borderColor[3] contains value of 'A'
+ *   Note that applications using integer border color values will need to reinterpret_cast these values to float.
+ *   The values are set only when the addressing mode specified by ::cudaTextureDesc::addressMode is cudaAddressModeBorder.
+ *
+ * - ::cudaTextureDesc::normalizedCoords specifies whether the texture coordinates will be normalized or not.
+ *
+ * - ::cudaTextureDesc::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be
+ *   clamped to the range [1,16].
+ *
+ * - ::cudaTextureDesc::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels.
+ *
+ * - ::cudaTextureDesc::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level.
+ *
+ * - ::cudaTextureDesc::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to.
+ *
+ * - ::cudaTextureDesc::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to.
+ *
+ * - ::cudaTextureDesc::disableTrilinearOptimization specifies whether the trilinear filtering optimizations will be disabled.
+ *
+ * - ::cudaTextureDesc::seamlessCubemap specifies whether seamless cube map filtering is enabled. This flag can only be specified if the 
+ *   underlying resource is a CUDA array or a CUDA mipmapped array that was created with the flag ::cudaArrayCubemap.
+ *   When seamless cube map filtering is enabled, texture address modes specified by ::cudaTextureDesc::addressMode are ignored.
+ *   Instead, if the ::cudaTextureDesc::filterMode is set to ::cudaFilterModePoint the address mode ::cudaAddressModeClamp will be applied for all dimensions.
+ *   If the ::cudaTextureDesc::filterMode is set to ::cudaFilterModeLinear seamless cube map filtering will be performed when sampling along the cube face borders.
+ *
+ * The ::cudaResourceViewDesc struct is defined as
+ * \code
+        struct cudaResourceViewDesc {
+            enum cudaResourceViewFormat format;
+            size_t                      width;
+            size_t                      height;
+            size_t                      depth;
+            unsigned int                firstMipmapLevel;
+            unsigned int                lastMipmapLevel;
+            unsigned int                firstLayer;
+            unsigned int                lastLayer;
+        };
+ * \endcode
+ * where:
+ * - ::cudaResourceViewDesc::format specifies how the data contained in the CUDA array or CUDA mipmapped array should
+ *   be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block
+ *   compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a 32-bit unsigned integer format
+ *   with 2 or 4 channels, depending on the block compressed format. For ex., BC1 and BC4 require the underlying CUDA array to have
+ *   a 32-bit unsigned int with 2 channels. The other BC formats require the underlying resource to have the same 32-bit unsigned int
+ *   format but with 4 channels.
+ *
+ * - ::cudaResourceViewDesc::width specifies the new width of the texture data. If the resource view format is a block
+ *   compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats,
+ *   this value has to be equal to that of the original resource.
+ *
+ * - ::cudaResourceViewDesc::height specifies the new height of the texture data. If the resource view format is a block
+ *   compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats,
+ *   this value has to be equal to that of the original resource.
+ *
+ * - ::cudaResourceViewDesc::depth specifies the new depth of the texture data. This value has to be equal to that of the
+ *   original resource.
+ *
+ * - ::cudaResourceViewDesc::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero.
+ *   For non-mipmapped resources, this value has to be zero. ::cudaTextureDesc::minMipmapLevelClamp and ::cudaTextureDesc::maxMipmapLevelClamp
+ *   will be relative to this value. For ex., if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified,
+ *   then the actual minimum mipmap level clamp will be 3.2.
+ *
+ * - ::cudaResourceViewDesc::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value
+ *   has to be zero.
+ *
+ * - ::cudaResourceViewDesc::firstLayer specifies the first layer index for layered textures. This will be the new layer zero.
+ *   For non-layered resources, this value has to be zero.
+ *
+ * - ::cudaResourceViewDesc::lastLayer specifies the last layer index for layered textures. For non-layered resources, 
+ *   this value has to be zero.
+ *
+ *
+ * \param pTexObject   - Texture object to create
+ * \param pResDesc     - Resource descriptor
+ * \param pTexDesc     - Texture descriptor
+ * \param pResViewDesc - Resource view descriptor
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDestroyTextureObject,
+ * ::cuTexObjectCreate
+ */
+
+extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject(cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc);
+
+/**
+ * \brief Destroys a texture object
+ *
+ * Destroys the texture object specified by \p texObject.
+ *
+ * \param texObject - Texture object to destroy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa
+ * ::cudaCreateTextureObject,
+ * ::cuTexObjectDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDestroyTextureObject(cudaTextureObject_t texObject);
+
+/**
+ * \brief Returns a texture object's resource descriptor
+ *
+ * Returns the resource descriptor for the texture object specified by \p texObject.
+ *
+ * \param pResDesc  - Resource descriptor
+ * \param texObject - Texture object
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaCreateTextureObject,
+ * ::cuTexObjectGetResourceDesc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject);
+
+/**
+ * \brief Returns a texture object's texture descriptor
+ *
+ * Returns the texture descriptor for the texture object specified by \p texObject.
+ *
+ * \param pTexDesc  - Texture descriptor
+ * \param texObject - Texture object
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaCreateTextureObject,
+ * ::cuTexObjectGetTextureDesc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc(struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject);
+
+/**
+ * \brief Returns a texture object's resource view descriptor
+ *
+ * Returns the resource view descriptor for the texture object specified by \p texObject.
+ * If no resource view was specified, ::cudaErrorInvalidValue is returned.
+ *
+ * \param pResViewDesc - Resource view descriptor
+ * \param texObject    - Texture object
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaCreateTextureObject,
+ * ::cuTexObjectGetResourceViewDesc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc(struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject);
+
+/** @} */ /* END CUDART_TEXTURE_OBJECT */
+
+/**
+ * \defgroup CUDART_SURFACE_OBJECT Surface Object Management
+ *
+ * ___MANBRIEF___ surface object management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the low level surface object management functions
+ * of the CUDA runtime application programming interface. The surface object 
+ * API is only supported on devices of compute capability 3.0 or higher.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates a surface object
+ *
+ * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes
+ * the data to perform surface load/stores on. ::cudaResourceDesc::resType must be 
+ * ::cudaResourceTypeArray and  ::cudaResourceDesc::res::array::array
+ * must be set to a valid CUDA array handle.
+ *
+ * Surface objects are only supported on devices of compute capability 3.0 or higher.
+ * Additionally, a surface object is an opaque value, and, as such, should only be
+ * accessed through CUDA API calls.
+ *
+ * \param pSurfObject - Surface object to create
+ * \param pResDesc    - Resource descriptor
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidChannelDescriptor,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDestroySurfaceObject,
+ * ::cuSurfObjectCreate
+ */
+
+extern __host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject(cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc);
+
+/**
+ * \brief Destroys a surface object
+ *
+ * Destroys the surface object specified by \p surfObject.
+ *
+ * \param surfObject - Surface object to destroy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa
+ * ::cudaCreateSurfaceObject,
+ * ::cuSurfObjectDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject);
+
+/**
+ * \brief Returns a surface object's resource descriptor
+ *
+ * Returns the resource descriptor for the surface object specified by \p surfObject.
+ *
+ * \param pResDesc   - Resource descriptor
+ * \param surfObject - Surface object
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaCreateSurfaceObject,
+ * ::cuSurfObjectGetResourceDesc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject);
+
+/** @} */ /* END CUDART_SURFACE_OBJECT */
+
+/**
+ * \defgroup CUDART__VERSION Version Management
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the latest version of CUDA supported by the driver
+ *
+ * Returns in \p *driverVersion the latest version of CUDA supported by
+ * the driver. The version is returned as (1000 &times; major + 10 &times; minor).
+ * For example, CUDA 9.2 would be represented by 9020. If no driver is installed,
+ * then 0 is returned as the driver version.
+ *
+ * This function automatically returns ::cudaErrorInvalidValue
+ * if \p driverVersion is NULL.
+ *
+ * \param driverVersion - Returns the CUDA driver version.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaRuntimeGetVersion,
+ * ::cuDriverGetVersion
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion);
+
+/**
+ * \brief Returns the CUDA Runtime version
+ *
+ * Returns in \p *runtimeVersion the version number of the current CUDA
+ * Runtime instance. The version is returned as
+ * (1000 &times; major + 10 &times; minor). For example,
+ * CUDA 9.2 would be represented by 9020.
+ *
+ * As of CUDA 12.0, this function no longer initializes CUDA. The purpose
+ * of this API is solely to return a compile-time constant stating the
+ * CUDA Toolkit version in the above format.
+ *
+ * This function automatically returns ::cudaErrorInvalidValue if
+ * the \p runtimeVersion argument is NULL.
+ *
+ * \param runtimeVersion - Returns the CUDA Runtime version.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDriverGetVersion,
+ * ::cuDriverGetVersion
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
+
+/** @} */ /* END CUDART__VERSION */
+
+/**
+ * \defgroup CUDART_GRAPH Graph Management
+ *
+ * ___MANBRIEF___ graph management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the graph management functions of the CUDA
+ * runtime application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates a graph
+ *
+ * Creates an empty graph, which is returned via \p pGraph.
+ *
+ * \param pGraph - Returns newly created graph
+ * \param flags   - Graph creation flags, must be 0
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode,
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphDestroy,
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphClone
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphCreate(cudaGraph_t *pGraph, unsigned int flags);
+
+/**
+ * \brief Creates a kernel execution node and adds it to a graph
+ *
+ * Creates a new kernel execution node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies and arguments specified in \p pNodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * The cudaKernelNodeParams structure is defined as:
+ *
+ * \code
+ *  struct cudaKernelNodeParams
+ *  {
+ *      void* func;
+ *      dim3 gridDim;
+ *      dim3 blockDim;
+ *      unsigned int sharedMemBytes;
+ *      void **kernelParams;
+ *      void **extra;
+ *  };
+ * \endcode
+ *
+ * When the graph is launched, the node will invoke kernel \p func on a (\p gridDim.x x
+ * \p gridDim.y x \p gridDim.z) grid of blocks. Each block contains
+ * (\p blockDim.x x \p blockDim.y x \p blockDim.z) threads.
+ *
+ * \p sharedMemBytes sets the amount of dynamic shared memory that will be
+ * available to each thread block.
+ *
+ * Kernel parameters to \p func can be specified in one of two ways:
+ *
+ * 1) Kernel parameters can be specified via \p kernelParams. If the kernel has N
+ * parameters, then \p kernelParams needs to be an array of N pointers. Each pointer,
+ * from \p kernelParams[0] to \p kernelParams[N-1], points to the region of memory from which the actual
+ * parameter will be copied. The number of kernel parameters and their offsets and sizes do not need
+ * to be specified as that information is retrieved directly from the kernel's image.
+ *
+ * 2) Kernel parameters can also be packaged by the application into a single buffer that is passed in
+ * via \p extra. This places the burden on the application of knowing each kernel
+ * parameter's size and alignment/padding within the buffer. The \p extra parameter exists
+ * to allow this function to take additional less commonly used arguments. \p extra specifies
+ * a list of names of extra settings and their corresponding values. Each extra setting name is
+ * immediately followed by the corresponding value. The list must be terminated with either NULL or
+ * CU_LAUNCH_PARAM_END.
+ *
+ * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra
+ *   array;
+ * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next
+ *   value in \p extra will be a pointer to a buffer
+ *   containing all the kernel parameters for launching kernel
+ *   \p func;
+ * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next
+ *   value in \p extra will be a pointer to a size_t
+ *   containing the size of the buffer specified with
+ *   ::CU_LAUNCH_PARAM_BUFFER_POINTER;
+ *
+ * The error ::cudaErrorInvalidValue will be returned if kernel parameters are specified with both
+ * \p kernelParams and \p extra (i.e. both \p kernelParams and
+ * \p extra are non-NULL).
+ *
+ * The \p kernelParams or \p extra array, as well as the argument values it points to,
+ * are copied during this call.
+ *
+ * \note Kernels launched using graphs must not use texture and surface references. Reading or
+ *       writing through any texture or surface reference is undefined behavior.
+ *       This restriction does not apply to texture and surface objects.
+ *
+ * \param pGraphNode     - Returns newly created node
+ * \param graph          - Graph to which to add the node
+ * \param pDependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param pNodeParams      - Parameters for the GPU execution node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDeviceFunction
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaLaunchKernel,
+ * ::cudaGraphKernelNodeGetParams,
+ * ::cudaGraphKernelNodeSetParams,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddKernelNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaKernelNodeParams *pNodeParams);
+
+/**
+ * \brief Returns a kernel node's parameters
+ *
+ * Returns the parameters of kernel node \p node in \p pNodeParams.
+ * The \p kernelParams or \p extra array returned in \p pNodeParams,
+ * as well as the argument values it points to, are owned by the node.
+ * This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cudaGraphKernelNodeSetParams to update the
+ * parameters of this node.
+ *
+ * The params will contain either \p kernelParams or \p extra,
+ * according to which of these was most recently set on the node.
+ *
+ * \param node        - Node to get the parameters for
+ * \param pNodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDeviceFunction
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaLaunchKernel,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphKernelNodeSetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetParams(cudaGraphNode_t node, struct cudaKernelNodeParams *pNodeParams);
+
+/**
+ * \brief Sets a kernel node's parameters
+ *
+ * Sets the parameters of kernel node \p node to \p pNodeParams.
+ *
+ * \param node        - Node to set the parameters for
+ * \param pNodeParams - Parameters to copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorMemoryAllocation
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \sa
+ * ::cudaGraphNodeSetParams,
+ * ::cudaLaunchKernel,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphKernelNodeGetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetParams(cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams);
+
+/**
+ * \brief Copies attributes from source node to destination node.
+ *
+ * Copies attributes from source node \p hSrc to destination node \p hDst.
+ * Both nodes must have the same context.
+ *
+ * \param[out] hDst Destination node
+ * \param[in] hSrc Source node
+ *
+ * For the list of attributes see ::cudaKernelNodeAttrID.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidContext
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaAccessPolicyWindow
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeCopyAttributes(
+        cudaGraphNode_t hSrc,
+        cudaGraphNode_t hDst);
+
+/**
+ * \brief Queries node attribute.
+ *
+ * Queries attribute \p attr from node \p hNode and stores it in corresponding
+ * member of \p value_out.
+ *
+ * \param[in] hNode
+ * \param[in] attr
+ * \param[out] value_out
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaAccessPolicyWindow
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetAttribute(
+    cudaGraphNode_t hNode,
+    cudaKernelNodeAttrID attr,
+    cudaKernelNodeAttrValue *value_out);
+
+/**
+ * \brief Sets node attribute.
+ *
+ * Sets attribute \p attr on node \p hNode from corresponding attribute of
+ * \p value.
+ *
+ * \param[out] hNode
+ * \param[in] attr
+ * \param[in] value
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaAccessPolicyWindow
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetAttribute(
+    cudaGraphNode_t hNode,
+    cudaKernelNodeAttrID attr,
+    const cudaKernelNodeAttrValue *value);
+
+/**
+ * \brief Creates a memcpy node and adds it to a graph
+ *
+ * Creates a new memcpy node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * When the graph is launched, the node will perform the memcpy described by \p pCopyParams.
+ * See ::cudaMemcpy3D() for a description of the structure and its restrictions.
+ *
+ * Memcpy nodes have some additional restrictions with regards to managed memory, if the
+ * system contains at least one device which has a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess.
+ *
+ * \param pGraphNode     - Returns newly created node
+ * \param graph          - Graph to which to add the node
+ * \param pDependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param pCopyParams      - Parameters for the memory copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaMemcpy3D,
+ * ::cudaGraphAddMemcpyNodeToSymbol,
+ * ::cudaGraphAddMemcpyNodeFromSymbol,
+ * ::cudaGraphAddMemcpyNode1D,
+ * ::cudaGraphMemcpyNodeGetParams,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemsetNode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaMemcpy3DParms *pCopyParams);
+
+/**
+ * \brief Creates a memcpy node to copy to a symbol on the device and adds it to a graph
+ *
+ * Creates a new memcpy node to copy to \p symbol and adds it to \p graph with
+ * \p numDependencies dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory area
+ * pointed to by \p src to the memory area pointed to by \p offset bytes from the start
+ * of symbol \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of
+ * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault
+ * is only allowed on systems that support unified virtual addressing.
+ *
+ * Memcpy nodes have some additional restrictions with regards to managed memory, if the
+ * system contains at least one device which has a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param symbol          - Device symbol address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyToSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeFromSymbol,
+ * ::cudaGraphMemcpyNodeGetParams,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNodeToSymbol(
+    cudaGraphNode_t *pGraphNode,
+    cudaGraph_t graph,
+    const cudaGraphNode_t *pDependencies,
+    size_t numDependencies,
+    const void* symbol,
+    const void* src,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Creates a memcpy node to copy from a symbol on the device and adds it to a graph
+ *
+ * Creates a new memcpy node to copy from \p symbol and adds it to \p graph with
+ * \p numDependencies dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory area
+ * pointed to by \p offset bytes from the start of symbol \p symbol to the memory area
+ *  pointed to by \p dst. The memory areas may not overlap. \p symbol is a variable
+ *  that resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer
+ * is inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * Memcpy nodes have some additional restrictions with regards to managed memory, if the
+ * system contains at least one device which has a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param dst             - Destination memory address
+ * \param symbol          - Device symbol address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyFromSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeToSymbol,
+ * ::cudaGraphMemcpyNodeGetParams,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNodeFromSymbol(
+    cudaGraphNode_t* pGraphNode,
+    cudaGraph_t graph,
+    const cudaGraphNode_t* pDependencies,
+    size_t numDependencies,
+    void* dst,
+    const void* symbol,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Creates a 1D memcpy node and adds it to a graph
+ *
+ * Creates a new 1D memcpy node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory
+ * area pointed to by \p src to the memory area pointed to by \p dst, where
+ * \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing. Launching a
+ * memcpy node with dst and src pointers that do not match the direction of
+ * the copy results in undefined behavior.
+ *
+ * Memcpy nodes have some additional restrictions with regards to managed memory, if the
+ * system contains at least one device which has a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param dst             - Destination memory address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpy,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeGetParams,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParams1D,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNode1D(
+    cudaGraphNode_t *pGraphNode,
+    cudaGraph_t graph,
+    const cudaGraphNode_t *pDependencies,
+    size_t numDependencies,
+    void* dst,
+    const void* src,
+    size_t count,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Returns a memcpy node's parameters
+ *
+ * Returns the parameters of memcpy node \p node in \p pNodeParams.
+ *
+ * \param node        - Node to get the parameters for
+ * \param pNodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpy3D,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeSetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeGetParams(cudaGraphNode_t node, struct cudaMemcpy3DParms *pNodeParams);
+
+/**
+ * \brief Sets a memcpy node's parameters
+ *
+ * Sets the parameters of memcpy node \p node to \p pNodeParams.
+ *
+ * \param node        - Node to set the parameters for
+ * \param pNodeParams - Parameters to copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeSetParams,
+ * ::cudaMemcpy3D,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphMemcpyNodeSetParams1D,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeGetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParams(cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams);
+
+/**
+ * \brief Sets a memcpy node's parameters to copy to a symbol on the device
+ *
+ * Sets the parameters of memcpy node \p node to the copy described by the provided parameters.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory area
+ * pointed to by \p src to the memory area pointed to by \p offset bytes from the start
+ * of symbol \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of
+ * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault
+ * is only allowed on systems that support unified virtual addressing.
+ *
+ * \param node            - Node to set the parameters for
+ * \param symbol          - Device symbol address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyToSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeGetParams
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParamsToSymbol(
+    cudaGraphNode_t node,
+    const void* symbol,
+    const void* src,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Sets a memcpy node's parameters to copy from a symbol on the device
+ *
+ * Sets the parameters of memcpy node \p node to the copy described by the provided parameters.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory area
+ * pointed to by \p offset bytes from the start of symbol \p symbol to the memory area
+ *  pointed to by \p dst. The memory areas may not overlap. \p symbol is a variable
+ *  that resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer
+ * is inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * \param node            - Node to set the parameters for
+ * \param dst             - Destination memory address
+ * \param symbol          - Device symbol address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyFromSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeGetParams
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParamsFromSymbol(
+    cudaGraphNode_t node,
+    void* dst,
+    const void* symbol,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Sets a memcpy node's parameters to perform a 1-dimensional copy
+ *
+ * Sets the parameters of memcpy node \p node to the copy described by the provided parameters.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory
+ * area pointed to by \p src to the memory area pointed to by \p dst, where
+ * \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing. Launching a
+ * memcpy node with dst and src pointers that do not match the direction of
+ * the copy results in undefined behavior.
+ *
+ * \param node            - Node to set the parameters for
+ * \param dst             - Destination memory address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpy,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeGetParams
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParams1D(
+    cudaGraphNode_t node,
+    void* dst,
+    const void* src,
+    size_t count,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Creates a memset node and adds it to a graph
+ *
+ * Creates a new memset node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * The element size must be 1, 2, or 4 bytes.
+ * When the graph is launched, the node will perform the memset described by \p pMemsetParams.
+ *
+ * \param pGraphNode     - Returns newly created node
+ * \param graph          - Graph to which to add the node
+ * \param pDependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param pMemsetParams    - Parameters for the memory set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaMemset2D,
+ * ::cudaGraphMemsetNodeGetParams,
+ * ::cudaGraphMemsetNodeSetParams,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemcpyNode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemsetNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaMemsetParams *pMemsetParams);
+
+/**
+ * \brief Returns a memset node's parameters
+ *
+ * Returns the parameters of memset node \p node in \p pNodeParams.
+ *
+ * \param node        - Node to get the parameters for
+ * \param pNodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemset2D,
+ * ::cudaGraphAddMemsetNode,
+ * ::cudaGraphMemsetNodeSetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeGetParams(cudaGraphNode_t node, struct cudaMemsetParams *pNodeParams);
+
+/**
+ * \brief Sets a memset node's parameters
+ *
+ * Sets the parameters of memset node \p node to \p pNodeParams.
+ *
+ * \param node        - Node to set the parameters for
+ * \param pNodeParams - Parameters to copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeSetParams,
+ * ::cudaMemset2D,
+ * ::cudaGraphAddMemsetNode,
+ * ::cudaGraphMemsetNodeGetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeSetParams(cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams);
+
+/**
+ * \brief Creates a host execution node and adds it to a graph
+ *
+ * Creates a new CPU execution node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies and arguments specified in \p pNodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * When the graph is launched, the node will invoke the specified CPU function.
+ * Host nodes are not supported under MPS with pre-Volta GPUs.
+ *
+ * \param pGraphNode     - Returns newly created node
+ * \param graph          - Graph to which to add the node
+ * \param pDependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param pNodeParams      - Parameters for the host node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaLaunchHostFunc,
+ * ::cudaGraphHostNodeGetParams,
+ * ::cudaGraphHostNodeSetParams,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddHostNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaHostNodeParams *pNodeParams);
+
+/**
+ * \brief Returns a host node's parameters
+ *
+ * Returns the parameters of host node \p node in \p pNodeParams.
+ *
+ * \param node        - Node to get the parameters for
+ * \param pNodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaLaunchHostFunc,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphHostNodeSetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeGetParams(cudaGraphNode_t node, struct cudaHostNodeParams *pNodeParams);
+
+/**
+ * \brief Sets a host node's parameters
+ *
+ * Sets the parameters of host node \p node to \p pNodeParams.
+ *
+ * \param node        - Node to set the parameters for
+ * \param pNodeParams - Parameters to copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeSetParams,
+ * ::cudaLaunchHostFunc,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphHostNodeGetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeSetParams(cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams);
+
+/**
+ * \brief Creates a child graph node and adds it to a graph
+ *
+ * Creates a new node which executes an embedded graph, and adds it to \p graph with
+ * \p numDependencies dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * If \p childGraph contains allocation or free nodes, this call will return an error.
+ *
+ * The node executes an embedded child graph. The child graph is cloned in this call.
+ *
+ * \param pGraphNode     - Returns newly created node
+ * \param graph          - Graph to which to add the node
+ * \param pDependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param childGraph      - The graph to clone into this node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaGraphChildGraphNodeGetGraph,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode,
+ * ::cudaGraphClone
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddChildGraphNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, cudaGraph_t childGraph);
+
+/**
+ * \brief Gets a handle to the embedded graph of a child graph node
+ *
+ * Gets a handle to the embedded graph in a child graph node. This call
+ * does not clone the graph. Changes to the graph will be reflected in
+ * the node, and the node retains ownership of the graph.
+ *
+ * Allocation and free nodes cannot be added to the returned graph.
+ * Attempting to do so will return an error.
+ *
+ * \param node   - Node to get the embedded graph for
+ * \param pGraph - Location to store a handle to the graph
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphNodeFindInClone
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphChildGraphNodeGetGraph(cudaGraphNode_t node, cudaGraph_t *pGraph);
+
+/**
+ * \brief Creates an empty node and adds it to a graph
+ *
+ * Creates a new node which performs no operation, and adds it to \p graph with
+ * \p numDependencies dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * An empty node performs no operation during execution, but can be used for
+ * transitive ordering. For example, a phased execution graph with 2 groups of n
+ * nodes with a barrier between them can be represented using an empty node and
+ * 2*n dependency edges, rather than no empty node and n^2 dependency edges.
+ *
+ * \param pGraphNode     - Returns newly created node
+ * \param graph          - Graph to which to add the node
+ * \param pDependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddEmptyNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies);
+
+/**
+ * \brief Creates an event record node and adds it to a graph
+ *
+ * Creates a new event record node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies and event specified in \p event.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * Each launch of the graph will record \p event to capture execution of the
+ * node's dependencies.
+ *
+ * These nodes may not be used in loops or conditionals.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param event           - Event for the node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphAddEventRecordNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, cudaEvent_t event);
+#endif
+
+/**
+ * \brief Returns the event associated with an event record node
+ *
+ * Returns the event of event record node \p node in \p event_out.
+ *
+ * \param node      - Node to get the event for
+ * \param event_out - Pointer to return the event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphEventRecordNodeSetEvent,
+ * ::cudaGraphEventWaitNodeGetEvent,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphEventRecordNodeGetEvent(cudaGraphNode_t node, cudaEvent_t *event_out);
+#endif
+
+/**
+ * \brief Sets an event record node's event
+ *
+ * Sets the event of event record node \p node to \p event.
+ *
+ * \param node  - Node to set the event for
+ * \param event - Event to use
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeSetParams,
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphEventRecordNodeGetEvent,
+ * ::cudaGraphEventWaitNodeSetEvent,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphEventRecordNodeSetEvent(cudaGraphNode_t node, cudaEvent_t event);
+#endif
+
+/**
+ * \brief Creates an event wait node and adds it to a graph
+ *
+ * Creates a new event wait node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies and event specified in \p event.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * The graph node will wait for all work captured in \p event.  See ::cudaEventRecord()
+ * for details on what is captured by an event.  The synchronization will be performed
+ * efficiently on the device when applicable.  \p event may be from a different context
+ * or device than the launch stream.
+ *
+ * These nodes may not be used in loops or conditionals.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param event           - Event for the node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphAddEventWaitNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, cudaEvent_t event);
+#endif
+
+/**
+ * \brief Returns the event associated with an event wait node
+ *
+ * Returns the event of event wait node \p node in \p event_out.
+ *
+ * \param node      - Node to get the event for
+ * \param event_out - Pointer to return the event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaGraphEventWaitNodeSetEvent,
+ * ::cudaGraphEventRecordNodeGetEvent,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphEventWaitNodeGetEvent(cudaGraphNode_t node, cudaEvent_t *event_out);
+#endif
+
+/**
+ * \brief Sets an event wait node's event
+ *
+ * Sets the event of event wait node \p node to \p event.
+ *
+ * \param node  - Node to set the event for
+ * \param event - Event to use
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeSetParams,
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaGraphEventWaitNodeGetEvent,
+ * ::cudaGraphEventRecordNodeSetEvent,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphEventWaitNodeSetEvent(cudaGraphNode_t node, cudaEvent_t event);
+#endif
+
+/**
+ * \brief Creates an external semaphore signal node and adds it to a graph
+ *
+ * Creates a new external semaphore signal node and adds it to \p graph with \p
+ * numDependencies dependencies specified via \p pDependencies and arguments specified
+ * in \p nodeParams. It is possible for \p numDependencies to be 0, in which case the
+ * node will be placed at the root of the graph. \p pDependencies may not have any
+ * duplicate entries. A handle to the new node will be returned in \p pGraphNode.
+ *
+ * Performs a signal operation on a set of externally allocated semaphore objects
+ * when the node is launched.  The operation(s) will occur after all of the node's
+ * dependencies have completed.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaGraphExternalSemaphoresSignalNodeGetParams,
+ * ::cudaGraphExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaImportExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddExternalSemaphoresSignalNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaExternalSemaphoreSignalNodeParams *nodeParams);
+#endif
+
+/**
+ * \brief Returns an external semaphore signal node's parameters
+ *
+ * Returns the parameters of an external semaphore signal node \p hNode in \p params_out.
+ * The \p extSemArray and \p paramsArray returned in \p params_out,
+ * are owned by the node.  This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cudaGraphExternalSemaphoresSignalNodeSetParams to update the
+ * parameters of this node.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaLaunchKernel,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaGraphExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphExternalSemaphoresSignalNodeGetParams(cudaGraphNode_t hNode, struct cudaExternalSemaphoreSignalNodeParams *params_out);
+#endif
+
+/**
+ * \brief Sets an external semaphore signal node's parameters
+ *
+ * Sets the parameters of an external semaphore signal node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaGraphExternalSemaphoresSignalNodeGetParams,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphExternalSemaphoresSignalNodeSetParams(cudaGraphNode_t hNode, const struct cudaExternalSemaphoreSignalNodeParams *nodeParams);
+#endif
+
+/**
+ * \brief Creates an external semaphore wait node and adds it to a graph
+ *
+ * Creates a new external semaphore wait node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries. A handle
+ * to the new node will be returned in \p pGraphNode.
+ *
+ * Performs a wait operation on a set of externally allocated semaphore objects
+ * when the node is launched.  The node's dependents will not be launched until
+ * the wait operation has completed.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaGraphExternalSemaphoresWaitNodeGetParams,
+ * ::cudaGraphExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaImportExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddExternalSemaphoresWaitNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaExternalSemaphoreWaitNodeParams *nodeParams);
+#endif
+
+/**
+ * \brief Returns an external semaphore wait node's parameters
+ *
+ * Returns the parameters of an external semaphore wait node \p hNode in \p params_out.
+ * The \p extSemArray and \p paramsArray returned in \p params_out,
+ * are owned by the node.  This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cudaGraphExternalSemaphoresWaitNodeSetParams to update the
+ * parameters of this node.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaLaunchKernel,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaGraphExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphExternalSemaphoresWaitNodeGetParams(cudaGraphNode_t hNode, struct cudaExternalSemaphoreWaitNodeParams *params_out);
+#endif
+
+/**
+ * \brief Sets an external semaphore wait node's parameters
+ *
+ * Sets the parameters of an external semaphore wait node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaGraphExternalSemaphoresWaitNodeGetParams,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphExternalSemaphoresWaitNodeSetParams(cudaGraphNode_t hNode, const struct cudaExternalSemaphoreWaitNodeParams *nodeParams);
+#endif
+
+/**
+ * \brief Creates an allocation node and adds it to a graph
+ *
+ * Creates a new allocation node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries. A handle
+ * to the new node will be returned in \p pGraphNode.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * When ::cudaGraphAddMemAllocNode creates an allocation node, it returns the address of the allocation in
+ * \p nodeParams.dptr.  The allocation's address remains fixed across instantiations and launches.
+ *
+ * If the allocation is freed in the same graph, by creating a free node using ::cudaGraphAddMemFreeNode,
+ * the allocation can be accessed by nodes ordered after the allocation node but before the free node.
+ * These allocations cannot be freed outside the owning graph, and they can only be freed once in the
+ * owning graph.
+ *
+ * If the allocation is not freed in the same graph, then it can be accessed not only by nodes in the
+ * graph which are ordered after the allocation node, but also by stream operations ordered after the
+ * graph's execution but before the allocation is freed.
+ *
+ * Allocations which are not freed in the same graph can be freed by:
+ * - passing the allocation to ::cudaFreeAsync or ::cudaFree;
+ * - launching a graph with a free node for that allocation; or
+ * - specifying ::cudaGraphInstantiateFlagAutoFreeOnLaunch during instantiation, which makes
+ *   each launch behave as though it called ::cudaFreeAsync for every unfreed allocation.
+ *
+ * It is not possible to free an allocation in both the owning graph and another graph.  If the allocation
+ * is freed in the same graph, a free node cannot be added to another graph.  If the allocation is freed
+ * in another graph, a free node can no longer be added to the owning graph.
+ *
+ * The following restrictions apply to graphs which contain allocation and/or memory free nodes:
+ * - Nodes and edges of the graph cannot be deleted.
+ * - The graph cannot be used in a child node.
+ * - Only one instantiation of the graph may exist at any point in time.
+ * - The graph cannot be cloned.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorCudartUnloading,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorOutOfMemory
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaGraphAddMemFreeNode,
+ * ::cudaGraphMemAllocNodeGetParams,
+ * ::cudaDeviceGraphMemTrim,
+ * ::cudaDeviceGetGraphMemAttribute,
+ * ::cudaDeviceSetGraphMemAttribute,
+ * ::cudaMallocAsync,
+ * ::cudaFreeAsync,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemAllocNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, struct cudaMemAllocNodeParams *nodeParams);
+#endif
+
+/**
+ * \brief Returns a memory alloc node's parameters
+ *
+ * Returns the parameters of a memory alloc node \p node in \p params_out.
+ * The \p poolProps and \p accessDescs returned in \p params_out, are owned by the
+ * node.  This memory remains valid until the node is destroyed.  The returned
+ * parameters must not be modified.
+ *
+ * \param node       - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemAllocNode,
+ * ::cudaGraphMemFreeNodeGetParams
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaGraphMemAllocNodeGetParams(cudaGraphNode_t node, struct cudaMemAllocNodeParams *params_out);
+#endif
+
+/**
+ * \brief Creates a memory free node and adds it to a graph
+ *
+ * Creates a new memory free node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies and address specified in \p dptr.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries. A handle
+ * to the new node will be returned in \p pGraphNode.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param dptr            - Address of memory to free
+ *
+ * ::cudaGraphAddMemFreeNode will return ::cudaErrorInvalidValue if the user attempts to free:
+ * - an allocation twice in the same graph.
+ * - an address that was not returned by an allocation node.
+ * - an invalid address.
+ *
+ * The following restrictions apply to graphs which contain allocation and/or memory free nodes:
+ * - Nodes and edges of the graph cannot be deleted.
+ * - The graph cannot be used in a child node.
+ * - Only one instantiation of the graph may exist at any point in time.
+ * - The graph cannot be cloned.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorCudartUnloading,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorOutOfMemory
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaGraphAddMemAllocNode,
+ * ::cudaGraphMemFreeNodeGetParams,
+ * ::cudaDeviceGraphMemTrim,
+ * ::cudaDeviceGetGraphMemAttribute,
+ * ::cudaDeviceSetGraphMemAttribute,
+ * ::cudaMallocAsync,
+ * ::cudaFreeAsync,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemFreeNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, void *dptr);
+#endif
+
+/**
+ * \brief Returns a memory free node's parameters
+ *
+ * Returns the address of a memory free node \p node in \p dptr_out.
+ *
+ * \param node     - Node to get the parameters for
+ * \param dptr_out - Pointer to return the device address
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemFreeNode,
+ * ::cudaGraphMemAllocNodeGetParams
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaGraphMemFreeNodeGetParams(cudaGraphNode_t node, void *dptr_out);
+#endif
+
+/**
+ * \brief Free unused memory that was cached on the specified device for use with graphs back to the OS.
+ *
+ * Blocks which are not in use by a graph that is either currently executing or scheduled to execute are
+ * freed back to the operating system.
+ *
+ * \param device - The device for which cached memory should be freed.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemAllocNode,
+ * ::cudaGraphAddMemFreeNode,
+ * ::cudaDeviceGetGraphMemAttribute,
+ * ::cudaDeviceSetGraphMemAttribute,
+ * ::cudaMallocAsync,
+ * ::cudaFreeAsync
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGraphMemTrim(int device);
+#endif
+
+/**
+ * \brief Query asynchronous allocation attributes related to graphs
+ *
+ * Valid attributes are:
+ *
+ * - ::cudaGraphMemAttrUsedMemCurrent: Amount of memory, in bytes, currently associated with graphs
+ * - ::cudaGraphMemAttrUsedMemHigh: High watermark of memory, in bytes, associated with graphs since the
+ *   last time it was reset.  High watermark can only be reset to zero.
+ * - ::cudaGraphMemAttrReservedMemCurrent: Amount of memory, in bytes, currently allocated for use by
+ *   the CUDA graphs asynchronous allocator.
+ * - ::cudaGraphMemAttrReservedMemHigh: High watermark of memory, in bytes, currently allocated for use by
+ *   the CUDA graphs asynchronous allocator.
+ *
+ * \param device - Specifies the scope of the query
+ * \param attr - attribute to get
+ * \param value - retrieved value
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDeviceSetGraphMemAttribute,
+ * ::cudaGraphAddMemAllocNode,
+ * ::cudaGraphAddMemFreeNode,
+ * ::cudaDeviceGraphMemTrim,
+ * ::cudaMallocAsync,
+ * ::cudaFreeAsync
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetGraphMemAttribute(int device, enum cudaGraphMemAttributeType attr, void* value);
+#endif
+
+/**
+ * \brief Set asynchronous allocation attributes related to graphs
+ *
+ * Valid attributes are:
+ *
+ * - ::cudaGraphMemAttrUsedMemHigh: High watermark of memory, in bytes, associated with graphs since the
+ *   last time it was reset.  High watermark can only be reset to zero.
+ * - ::cudaGraphMemAttrReservedMemHigh: High watermark of memory, in bytes, currently allocated for use by
+ *   the CUDA graphs asynchronous allocator.
+ *
+ * \param device - Specifies the scope of the query
+ * \param attr - attribute to set
+ * \param value - pointer to value to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDeviceGetGraphMemAttribute,
+ * ::cudaGraphAddMemAllocNode,
+ * ::cudaGraphAddMemFreeNode,
+ * ::cudaDeviceGraphMemTrim,
+ * ::cudaMallocAsync,
+ * ::cudaFreeAsync
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaDeviceSetGraphMemAttribute(int device, enum cudaGraphMemAttributeType attr, void* value);
+#endif
+
+/**
+ * \brief Clones a graph
+ *
+ * This function creates a copy of \p originalGraph and returns it in \p pGraphClone.
+ * All parameters are copied into the cloned graph. The original graph may be modified 
+ * after this call without affecting the clone.
+ *
+ * Child graph nodes in the original graph are recursively copied into the clone.
+ *
+ * \param pGraphClone  - Returns newly created cloned graph
+ * \param originalGraph - Graph to clone
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphCreate,
+ * ::cudaGraphNodeFindInClone
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphClone(cudaGraph_t *pGraphClone, cudaGraph_t originalGraph);
+
+/**
+ * \brief Finds a cloned version of a node
+ *
+ * This function returns the node in \p clonedGraph corresponding to \p originalNode 
+ * in the original graph.
+ *
+ * \p clonedGraph must have been cloned from the original graph via ::cudaGraphClone.
+ * \p originalNode must have been in the original graph at the time of the call to
+ * ::cudaGraphClone, and the corresponding cloned node in \p clonedGraph must not have
+ * been removed. The cloned node is then returned via \p pNode.
+ *
+ * \param pNode  - Returns handle to the cloned node
+ * \param originalNode - Handle to the original node
+ * \param clonedGraph - Cloned graph to query
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphClone
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeFindInClone(cudaGraphNode_t *pNode, cudaGraphNode_t originalNode, cudaGraph_t clonedGraph);
+
+/**
+ * \brief Returns a node's type
+ *
+ * Returns the node type of \p node in \p pType.
+ *
+ * \param node - Node to query
+ * \param pType  - Pointer to return the node type
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphChildGraphNodeGetGraph,
+ * ::cudaGraphKernelNodeGetParams,
+ * ::cudaGraphKernelNodeSetParams,
+ * ::cudaGraphHostNodeGetParams,
+ * ::cudaGraphHostNodeSetParams,
+ * ::cudaGraphMemcpyNodeGetParams,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemsetNodeGetParams,
+ * ::cudaGraphMemsetNodeSetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetType(cudaGraphNode_t node, enum cudaGraphNodeType *pType);
+
+/**
+ * \brief Returns a graph's nodes
+ *
+ * Returns a list of \p graph's nodes. \p nodes may be NULL, in which case this
+ * function will return the number of nodes in \p numNodes. Otherwise,
+ * \p numNodes entries will be filled in. If \p numNodes is higher than the actual
+ * number of nodes, the remaining entries in \p nodes will be set to NULL, and the
+ * number of nodes actually obtained will be returned in \p numNodes.
+ *
+ * \param graph    - Graph to query
+ * \param nodes    - Pointer to return the nodes
+ * \param numNodes - See description
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphCreate,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphNodeGetType,
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphNodeGetDependentNodes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphGetNodes(cudaGraph_t graph, cudaGraphNode_t *nodes, size_t *numNodes);
+
+/**
+ * \brief Returns a graph's root nodes
+ *
+ * Returns a list of \p graph's root nodes. \p pRootNodes may be NULL, in which case this
+ * function will return the number of root nodes in \p pNumRootNodes. Otherwise,
+ * \p pNumRootNodes entries will be filled in. If \p pNumRootNodes is higher than the actual
+ * number of root nodes, the remaining entries in \p pRootNodes will be set to NULL, and the
+ * number of nodes actually obtained will be returned in \p pNumRootNodes.
+ *
+ * \param graph       - Graph to query
+ * \param pRootNodes    - Pointer to return the root nodes
+ * \param pNumRootNodes - See description
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphCreate,
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphNodeGetType,
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphNodeGetDependentNodes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphGetRootNodes(cudaGraph_t graph, cudaGraphNode_t *pRootNodes, size_t *pNumRootNodes);
+
+/**
+ * \brief Returns a graph's dependency edges
+ *
+ * Returns a list of \p graph's dependency edges. Edges are returned via corresponding
+ * indices in \p from and \p to; that is, the node in \p to[i] has a dependency on the
+ * node in \p from[i]. \p from and \p to may both be NULL, in which
+ * case this function only returns the number of edges in \p numEdges. Otherwise,
+ * \p numEdges entries will be filled in. If \p numEdges is higher than the actual
+ * number of edges, the remaining entries in \p from and \p to will be set to NULL, and
+ * the number of edges actually returned will be written to \p numEdges.
+ *
+ * \param graph    - Graph to get the edges from
+ * \param from     - Location to return edge endpoints
+ * \param to       - Location to return edge endpoints
+ * \param numEdges - See description
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphAddDependencies,
+ * ::cudaGraphRemoveDependencies,
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphNodeGetDependentNodes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t *from, cudaGraphNode_t *to, size_t *numEdges);
+
+/**
+ * \brief Returns a graph's dependency edges (12.3+)
+ *
+ * Returns a list of \p graph's dependency edges. Edges are returned via corresponding
+ * indices in \p from, \p to and \p edgeData; that is, the node in \p to[i] has a
+ * dependency on the node in \p from[i] with data \p edgeData[i]. \p from and \p to may
+ * both be NULL, in which case this function only returns the number of edges in
+ * \p numEdges. Otherwise, \p numEdges entries will be filled in. If \p numEdges is higher
+ * than the actual number of edges, the remaining entries in \p from and \p to will be
+ * set to NULL, and the number of edges actually returned will be written to \p numEdges.
+ * \p edgeData alone may be NULL, in which case the edges must all have default (zeroed)
+ * edge data. Attempting a lossy query via NULL \p edgeData will result in
+ * ::cudaErrorLossyQuery. If \p edgeData is non-NULL then \p from and \p to must be as
+ * well.
+ *
+ * \param graph    - Graph to get the edges from
+ * \param from     - Location to return edge endpoints
+ * \param to       - Location to return edge endpoints
+ * \param edgeData - Optional location to return edge data
+ * \param numEdges - See description
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorLossyQuery,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphAddDependencies,
+ * ::cudaGraphRemoveDependencies,
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphNodeGetDependentNodes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphGetEdges_v2(cudaGraph_t graph, cudaGraphNode_t *from, cudaGraphNode_t *to, cudaGraphEdgeData *edgeData, size_t *numEdges);
+
+/**
+ * \brief Returns a node's dependencies
+ *
+ * Returns a list of \p node's dependencies. \p pDependencies may be NULL, in which case this
+ * function will return the number of dependencies in \p pNumDependencies. Otherwise,
+ * \p pNumDependencies entries will be filled in. If \p pNumDependencies is higher than the actual
+ * number of dependencies, the remaining entries in \p pDependencies will be set to NULL, and the
+ * number of nodes actually obtained will be returned in \p pNumDependencies.
+ *
+ * \param node           - Node to query
+ * \param pDependencies    - Pointer to return the dependencies
+ * \param pNumDependencies - See description
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeGetDependentNodes,
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphAddDependencies,
+ * ::cudaGraphRemoveDependencies
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t *pDependencies, size_t *pNumDependencies);
+
+/**
+ * \brief Returns a node's dependencies (12.3+)
+ *
+ * Returns a list of \p node's dependencies. \p pDependencies may be NULL, in which case this
+ * function will return the number of dependencies in \p pNumDependencies. Otherwise,
+ * \p pNumDependencies entries will be filled in. If \p pNumDependencies is higher than the actual
+ * number of dependencies, the remaining entries in \p pDependencies will be set to NULL, and the
+ * number of nodes actually obtained will be returned in \p pNumDependencies.
+ *
+ * Note that if an edge has non-zero (non-default) edge data and \p edgeData is NULL,
+ * this API will return ::cudaErrorLossyQuery. If \p edgeData is non-NULL, then
+ * \p pDependencies must be as well.
+ *
+ * \param node             - Node to query
+ * \param pDependencies    - Pointer to return the dependencies
+ * \param edgeData         - Optional array to return edge data for each dependency
+ * \param pNumDependencies - See description
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorLossyQuery,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeGetDependentNodes,
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphAddDependencies,
+ * ::cudaGraphRemoveDependencies
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependencies_v2(cudaGraphNode_t node, cudaGraphNode_t *pDependencies, cudaGraphEdgeData *edgeData, size_t *pNumDependencies);
+
+/**
+ * \brief Returns a node's dependent nodes
+ *
+ * Returns a list of \p node's dependent nodes. \p pDependentNodes may be NULL, in which
+ * case this function will return the number of dependent nodes in \p pNumDependentNodes.
+ * Otherwise, \p pNumDependentNodes entries will be filled in. If \p pNumDependentNodes is
+ * higher than the actual number of dependent nodes, the remaining entries in
+ * \p pDependentNodes will be set to NULL, and the number of nodes actually obtained will
+ * be returned in \p pNumDependentNodes.
+ *
+ * \param node             - Node to query
+ * \param pDependentNodes    - Pointer to return the dependent nodes
+ * \param pNumDependentNodes - See description
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphAddDependencies,
+ * ::cudaGraphRemoveDependencies
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t *pDependentNodes, size_t *pNumDependentNodes);
+
+/**
+ * \brief Returns a node's dependent nodes (12.3+)
+ *
+ * Returns a list of \p node's dependent nodes. \p pDependentNodes may be NULL, in which
+ * case this function will return the number of dependent nodes in \p pNumDependentNodes.
+ * Otherwise, \p pNumDependentNodes entries will be filled in. If \p pNumDependentNodes is
+ * higher than the actual number of dependent nodes, the remaining entries in
+ * \p pDependentNodes will be set to NULL, and the number of nodes actually obtained will
+ * be returned in \p pNumDependentNodes.
+ *
+ * Note that if an edge has non-zero (non-default) edge data and \p edgeData is NULL,
+ * this API will return ::cudaErrorLossyQuery. If \p edgeData is non-NULL, then
+ * \p pDependentNodes must be as well.
+ *
+ * \param node               - Node to query
+ * \param pDependentNodes    - Pointer to return the dependent nodes
+ * \param edgeData           - Optional pointer to return edge data for dependent nodes
+ * \param pNumDependentNodes - See description
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorLossyQuery,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphAddDependencies,
+ * ::cudaGraphRemoveDependencies
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependentNodes_v2(cudaGraphNode_t node, cudaGraphNode_t *pDependentNodes, cudaGraphEdgeData *edgeData, size_t *pNumDependentNodes);
+
+/**
+ * \brief Adds dependency edges to a graph.
+ *
+ * The number of dependencies to be added is defined by \p numDependencies.
+ * Elements in \p from and \p to at corresponding indices define a dependency.
+ * Each node in \p from and \p to must belong to \p graph.
+ *
+ * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ * Specifying an existing dependency will return an error.
+ *
+ * \param graph - Graph to which dependencies are added
+ * \param from - Array of nodes that provide the dependencies
+ * \param to - Array of dependent nodes
+ * \param numDependencies - Number of dependencies to be added
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphRemoveDependencies,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphNodeGetDependentNodes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, size_t numDependencies);
+
+/**
+ * \brief Adds dependency edges to a graph. (12.3+)
+ *
+ * The number of dependencies to be added is defined by \p numDependencies.
+ * Elements in \p from and \p to at corresponding indices define a dependency.
+ * Each node in \p from and \p to must belong to \p graph.
+ *
+ * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ * Specifying an existing dependency will return an error.
+ *
+ * \param graph - Graph to which dependencies are added
+ * \param from - Array of nodes that provide the dependencies
+ * \param to - Array of dependent nodes
+ * \param edgeData - Optional array of edge data. If NULL, default (zeroed) edge data is assumed.
+ * \param numDependencies - Number of dependencies to be added
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphRemoveDependencies,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphNodeGetDependentNodes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddDependencies_v2(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, const cudaGraphEdgeData *edgeData, size_t numDependencies);
+
+/**
+ * \brief Removes dependency edges from a graph.
+ *
+ * The number of dependencies to be removed is defined by \p numDependencies.
+ * Elements in \p from and \p to at corresponding indices define a dependency.
+ * Each node in \p from and \p to must belong to \p graph.
+ *
+ * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ * Specifying a non-existing dependency will return an error.
+ *
+ * \param graph - Graph from which to remove dependencies
+ * \param from - Array of nodes that provide the dependencies
+ * \param to - Array of dependent nodes
+ * \param numDependencies - Number of dependencies to be removed
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddDependencies,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphNodeGetDependentNodes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, size_t numDependencies);
+
+/**
+ * \brief Removes dependency edges from a graph. (12.3+)
+ *
+ * The number of dependencies to be removed is defined by \p numDependencies.
+ * Elements in \p from and \p to at corresponding indices define a dependency.
+ * Each node in \p from and \p to must belong to \p graph.
+ *
+ * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ * Specifying an edge that does not exist in the graph, with data matching
+ * \p edgeData, results in an error. \p edgeData is nullable, which is equivalent
+ * to passing default (zeroed) data for each edge.
+ *
+ * \param graph - Graph from which to remove dependencies
+ * \param from - Array of nodes that provide the dependencies
+ * \param to - Array of dependent nodes
+ * \param edgeData - Optional array of edge data. If NULL, edge data is assumed to
+ *                   be default (zeroed).
+ * \param numDependencies - Number of dependencies to be removed
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddDependencies,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphNodeGetDependentNodes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphRemoveDependencies_v2(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, const cudaGraphEdgeData *edgeData, size_t numDependencies);
+
+/**
+ * \brief Remove a node from the graph
+ *
+ * Removes \p node from its graph. This operation also severs any dependencies of other nodes 
+ * on \p node and vice versa.
+ *
+ * Dependencies cannot be removed from graphs which contain allocation or free nodes.
+ * Any attempt to do so will return an error.
+ *
+ * \param node  - Node to remove
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphDestroyNode(cudaGraphNode_t node);
+
+/**
+ * \brief Creates an executable graph from a graph
+ *
+ * Instantiates \p graph as an executable graph. The graph is validated for any
+ * structural constraints or intra-node constraints which were not previously
+ * validated. If instantiation is successful, a handle to the instantiated graph
+ * is returned in \p pGraphExec.
+ *
+ * The \p flags parameter controls the behavior of instantiation and subsequent
+ * graph launches.  Valid flags are:
+ *
+ * - ::cudaGraphInstantiateFlagAutoFreeOnLaunch, which configures a
+ * graph containing memory allocation nodes to automatically free any
+ * unfreed memory allocations before the graph is relaunched.
+ *
+ * - ::cudaGraphInstantiateFlagDeviceLaunch, which configures the graph for launch
+ * from the device. If this flag is passed, the executable graph handle returned can be
+ * used to launch the graph from both the host and device. This flag cannot be used in
+ * conjunction with ::cudaGraphInstantiateFlagAutoFreeOnLaunch.
+ *
+ * - ::cudaGraphInstantiateFlagUseNodePriority, which causes the graph
+ * to use the priorities from the per-node attributes rather than the priority
+ * of the launch stream during execution. Note that priorities are only available
+ * on kernel nodes, and are copied from stream priority during stream capture.
+ *
+ * If \p graph contains any allocation or free nodes, there can be at most one
+ * executable graph in existence for that graph at a time. An attempt to
+ * instantiate a second executable graph before destroying the first with
+ * ::cudaGraphExecDestroy will result in an error.
+ * The same also applies if \p graph contains any device-updatable kernel nodes.
+ * 
+ * Graphs instantiated for launch on the device have additional restrictions which do not
+ * apply to host graphs:
+ *
+ * - The graph's nodes must reside on a single device.
+ * - The graph can only contain kernel nodes, memcpy nodes, memset nodes, and child graph nodes.
+ * - The graph cannot be empty and must contain at least one kernel, memcpy, or memset node.
+ *   Operation-specific restrictions are outlined below.
+ * - Kernel nodes:
+ *   - Use of CUDA Dynamic Parallelism is not permitted.
+ *   - Cooperative launches are permitted as long as MPS is not in use.
+ * - Memcpy nodes:
+ *   - Only copies involving device memory and/or pinned device-mapped host memory are permitted.
+ *   - Copies involving CUDA arrays are not permitted.
+ *   - Both operands must be accessible from the current device, and the current device must
+ *     match the device of other nodes in the graph.
+ *
+ * If \p graph is not instantiated for launch on the device but contains kernels which
+ * call device-side cudaGraphLaunch() from multiple devices, this will result in an error.
+ *
+ * \param pGraphExec - Returns instantiated graph
+ * \param graph      - Graph to instantiate
+ * \param flags      - Flags to control instantiation.  See ::cudaGraphInstantiateFlags.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphInstantiateWithFlags,
+ * ::cudaGraphCreate,
+ * ::cudaGraphUpload,
+ * ::cudaGraphLaunch,
+ * ::cudaGraphExecDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiate(cudaGraphExec_t *pGraphExec, cudaGraph_t graph, unsigned long long flags __dv(0));
+
+/**
+ * \brief Creates an executable graph from a graph
+ *
+ * Instantiates \p graph as an executable graph. The graph is validated for any
+ * structural constraints or intra-node constraints which were not previously
+ * validated. If instantiation is successful, a handle to the instantiated graph
+ * is returned in \p pGraphExec.
+ *
+ * The \p flags parameter controls the behavior of instantiation and subsequent
+ * graph launches.  Valid flags are:
+ *
+ * - ::cudaGraphInstantiateFlagAutoFreeOnLaunch, which configures a
+ * graph containing memory allocation nodes to automatically free any
+ * unfreed memory allocations before the graph is relaunched.
+ *
+ * - ::cudaGraphInstantiateFlagDeviceLaunch, which configures the graph for launch
+ * from the device. If this flag is passed, the executable graph handle returned can be
+ * used to launch the graph from both the host and device. This flag can only be used
+ * on platforms which support unified addressing. This flag cannot be used in
+ * conjunction with ::cudaGraphInstantiateFlagAutoFreeOnLaunch.
+ *
+ * - ::cudaGraphInstantiateFlagUseNodePriority, which causes the graph
+ * to use the priorities from the per-node attributes rather than the priority
+ * of the launch stream during execution. Note that priorities are only available
+ * on kernel nodes, and are copied from stream priority during stream capture.
+ *
+ * If \p graph contains any allocation or free nodes, there can be at most one
+ * executable graph in existence for that graph at a time. An attempt to
+ * instantiate a second executable graph before destroying the first with
+ * ::cudaGraphExecDestroy will result in an error.
+ * The same also applies if \p graph contains any device-updatable kernel nodes.
+ *
+ * If \p graph contains kernels which call device-side cudaGraphLaunch() from multiple
+ * devices, this will result in an error.
+ *
+ * Graphs instantiated for launch on the device have additional restrictions which do not
+ * apply to host graphs:
+ *
+ * - The graph's nodes must reside on a single device.
+ * - The graph can only contain kernel nodes, memcpy nodes, memset nodes, and child graph nodes.
+ * - The graph cannot be empty and must contain at least one kernel, memcpy, or memset node.
+ *   Operation-specific restrictions are outlined below.
+ * - Kernel nodes:
+ *   - Use of CUDA Dynamic Parallelism is not permitted.
+ *   - Cooperative launches are permitted as long as MPS is not in use.
+ * - Memcpy nodes:
+ *   - Only copies involving device memory and/or pinned device-mapped host memory are permitted.
+ *   - Copies involving CUDA arrays are not permitted.
+ *   - Both operands must be accessible from the current device, and the current device must
+ *     match the device of other nodes in the graph.
+ *
+ * \param pGraphExec - Returns instantiated graph
+ * \param graph      - Graph to instantiate
+ * \param flags      - Flags to control instantiation.  See ::cudaGraphInstantiateFlags.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphCreate,
+ * ::cudaGraphUpload,
+ * ::cudaGraphLaunch,
+ * ::cudaGraphExecDestroy
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiateWithFlags(cudaGraphExec_t *pGraphExec, cudaGraph_t graph, unsigned long long flags __dv(0));
+#endif
+
+/**
+ * \brief Creates an executable graph from a graph
+ *
+ * Instantiates \p graph as an executable graph according to the \p instantiateParams structure.
+ * The graph is validated for any structural constraints or intra-node constraints
+ * which were not previously validated. If instantiation is successful, a handle to
+ * the instantiated graph is returned in \p pGraphExec.
+ *
+ * \p instantiateParams controls the behavior of instantiation and subsequent
+ * graph launches, as well as returning more detailed information in the event of an error.
+ * ::cudaGraphInstantiateParams is defined as:
+ *
+ * \code
+    typedef struct {
+        unsigned long long flags;
+        cudaStream_t uploadStream;
+        cudaGraphNode_t errNode_out;
+        cudaGraphInstantiateResult result_out;
+    } cudaGraphInstantiateParams;
+ * \endcode
+ *
+ * The \p flags field controls the behavior of instantiation and subsequent
+ * graph launches. Valid flags are:
+ *
+ * - ::cudaGraphInstantiateFlagAutoFreeOnLaunch, which configures a
+ * graph containing memory allocation nodes to automatically free any
+ * unfreed memory allocations before the graph is relaunched.
+ *
+ * - ::cudaGraphInstantiateFlagUpload, which will perform an upload of the graph
+ * into \p uploadStream once the graph has been instantiated.
+ *
+ * - ::cudaGraphInstantiateFlagDeviceLaunch, which configures the graph for launch
+ * from the device. If this flag is passed, the executable graph handle returned can be
+ * used to launch the graph from both the host and device. This flag can only be used
+ * on platforms which support unified addressing. This flag cannot be used in
+ * conjunction with ::cudaGraphInstantiateFlagAutoFreeOnLaunch.
+ *
+ * - ::cudaGraphInstantiateFlagUseNodePriority, which causes the graph
+ * to use the priorities from the per-node attributes rather than the priority
+ * of the launch stream during execution. Note that priorities are only available
+ * on kernel nodes, and are copied from stream priority during stream capture.
+ *
+ * If \p graph contains any allocation or free nodes, there can be at most one
+ * executable graph in existence for that graph at a time. An attempt to instantiate a
+ * second executable graph before destroying the first with ::cudaGraphExecDestroy will
+ * result in an error.
+ * The same also applies if \p graph contains any device-updatable kernel nodes.
+ *
+ * If \p graph contains kernels which call device-side cudaGraphLaunch() from multiple
+ * devices, this will result in an error.
+ *
+ * Graphs instantiated for launch on the device have additional restrictions which do not
+ * apply to host graphs:
+ *
+ * - The graph's nodes must reside on a single device.
+ * - The graph can only contain kernel nodes, memcpy nodes, memset nodes, and child graph nodes.
+ * - The graph cannot be empty and must contain at least one kernel, memcpy, or memset node.
+ *   Operation-specific restrictions are outlined below.
+ * - Kernel nodes:
+ *   - Use of CUDA Dynamic Parallelism is not permitted.
+ *   - Cooperative launches are permitted as long as MPS is not in use.
+ * - Memcpy nodes:
+ *   - Only copies involving device memory and/or pinned device-mapped host memory are permitted.
+ *   - Copies involving CUDA arrays are not permitted.
+ *   - Both operands must be accessible from the current device, and the current device must
+ *     match the device of other nodes in the graph.
+ *
+ * In the event of an error, the \p result_out and \p errNode_out fields will contain more
+ * information about the nature of the error. Possible error reporting includes:
+ *
+ * - ::cudaGraphInstantiateError, if passed an invalid value or if an unexpected error occurred
+ *   which is described by the return value of the function. \p errNode_out will be set to NULL.
+ * - ::cudaGraphInstantiateInvalidStructure, if the graph structure is invalid. \p errNode_out
+ *   will be set to one of the offending nodes.
+ * - ::cudaGraphInstantiateNodeOperationNotSupported, if the graph is instantiated for device
+ *   launch but contains a node of an unsupported node type, or a node which performs unsupported
+ *   operations, such as use of CUDA dynamic parallelism within a kernel node. \p errNode_out will
+ *   be set to this node.
+ * - ::cudaGraphInstantiateMultipleDevicesNotSupported, if the graph is instantiated for device
+ *   launch but a node's device differs from that of another node. This error can also be returned
+ *   if a graph is not instantiated for device launch and it contains kernels which call device-side
+ *   cudaGraphLaunch() from multiple devices. \p errNode_out will be set to this node.
+ *
+ * If instantiation is successful, \p result_out will be set to ::cudaGraphInstantiateSuccess,
+ * and \p errNode_out will be set to NULL.
+ *
+ * \param pGraphExec       - Returns instantiated graph
+ * \param graph            - Graph to instantiate
+ * \param instantiateParams - Instantiation parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphCreate,
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphInstantiateWithFlags,
+ * ::cudaGraphExecDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiateWithParams(cudaGraphExec_t *pGraphExec, cudaGraph_t graph, cudaGraphInstantiateParams *instantiateParams);
+
+/**
+ * \brief Query the instantiation flags of an executable graph
+ *
+ * Returns the flags that were passed to instantiation for the given executable graph.
+ * ::cudaGraphInstantiateFlagUpload will not be returned by this API as it does
+ * not affect the resulting executable graph.
+ *
+ * \param graphExec - The executable graph to query
+ * \param flags     - Returns the instantiation flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphInstantiateWithFlags,
+ * ::cudaGraphInstantiateWithParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecGetFlags(cudaGraphExec_t graphExec, unsigned long long *flags);
+
+/**
+ * \brief Sets the parameters for a kernel node in the given graphExec
+ *
+ * Sets the parameters of a kernel node in an executable graph \p hGraphExec. 
+ * The node is identified by the corresponding node \p node in the 
+ * non-executable graph, from which the executable graph was instantiated. 
+ *
+ * \p node must not have been removed from the original graph. All \p nodeParams 
+ * fields may change, but the following restrictions apply to \p func updates: 
+ *
+ *   - The owning device of the function cannot change.
+ *   - A node whose function originally did not use CUDA dynamic parallelism cannot be updated
+ *     to a function which uses CDP.
+ *   - A node whose function originally did not make device-side update calls cannot be updated
+ *     to a function which makes device-side update calls.
+ *   - If \p hGraphExec was not instantiated for device launch, a node whose function originally
+ *     did not use device-side cudaGraphLaunch() cannot be updated to a function which uses
+ *     device-side cudaGraphLaunch() unless the node resides on the same device as nodes which
+ *     contained such calls at instantiate-time. If no such calls were present at instantiation,
+ *     these updates cannot be performed at all.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already 
+ * enqueued or running launches of \p hGraphExec are not affected by this call. 
+ * \p node is also not modified by this call.
+ *
+ * If \p node is a device-updatable kernel node, the next upload/launch of \p hGraphExec
+ * will overwrite any previous device-side updates. Additionally, applying host updates to a
+ * device-updatable kernel node while it is being updated from the device will result in
+ * undefined behavior.
+ *
+ * \param hGraphExec  - The executable graph in which to set the specified node
+ * \param node        - kernel node from the graph from which graphExec was instantiated
+ * \param pNodeParams - Updated Parameters to set
+ * 
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_cudaKernel_t
+ *
+ * \sa
+ * ::cudaGraphExecNodeSetParams,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecKernelNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams);
+
+/**
+ * \brief Sets the parameters for a memcpy node in the given graphExec.
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained \p pNodeParams at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * The source and destination memory in \p pNodeParams must be allocated from the same 
+ * contexts as the original source and destination memory.  Both the instantiation-time 
+ * memory operands and the memory operands in \p pNodeParams must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or
+ * either the original or new memory operands are multidimensional.
+ *
+ * \param hGraphExec  - The executable graph in which to set the specified node
+ * \param node        - Memcpy node from the graph which was used to instantiate graphExec
+ * \param pNodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphExecNodeSetParams,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphExecMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphExecMemcpyNodeSetParams1D,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams);
+
+/**
+ * \brief Sets the parameters for a memcpy node in the given graphExec to copy to a symbol on the device
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained the given params at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * \p src and \p symbol must be allocated from the same contexts as the original source and
+ * destination memory.  The instantiation-time memory operands must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or
+ * the original memory operands are multidimensional.
+ *
+ * \param hGraphExec      - The executable graph in which to set the specified node
+ * \param node            - Memcpy node from the graph which was used to instantiate graphExec
+ * \param symbol          - Device symbol address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeToSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParamsToSymbol(
+    cudaGraphExec_t hGraphExec,
+    cudaGraphNode_t node,
+    const void* symbol,
+    const void* src,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Sets the parameters for a memcpy node in the given graphExec to copy from a symbol on the device
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained the given params at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * \p symbol and \p dst must be allocated from the same contexts as the original source and
+ * destination memory.  The instantiation-time memory operands must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or
+ * the original memory operands are multidimensional.
+ *
+ * \param hGraphExec      - The executable graph in which to set the specified node
+ * \param node            - Memcpy node from the graph which was used to instantiate graphExec
+ * \param dst             - Destination memory address
+ * \param symbol          - Device symbol address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeFromSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParamsFromSymbol(
+    cudaGraphExec_t hGraphExec,
+    cudaGraphNode_t node,
+    void* dst,
+    const void* symbol,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Sets the parameters for a memcpy node in the given graphExec to perform a 1-dimensional copy
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained the given params at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * \p src and \p dst must be allocated from the same contexts as the original source
+ * and destination memory.  The instantiation-time memory operands must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or
+ * the original memory operands are multidimensional.
+ *
+ * \param hGraphExec      - The executable graph in which to set the specified node
+ * \param node            - Memcpy node from the graph which was used to instantiate graphExec
+ * \param dst             - Destination memory address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNode1D,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParams1D,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParams1D(
+    cudaGraphExec_t hGraphExec,
+    cudaGraphNode_t node,
+    void* dst,
+    const void* src,
+    size_t count,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Sets the parameters for a memset node in the given graphExec.
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained \p pNodeParams at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * Zero sized operations are not supported.
+ *
+ * The new destination pointer in \p pNodeParams must be to the same kind of allocation
+ * as the original destination pointer and have the same context association and device mapping
+ * as the original destination pointer.
+ *
+ * Both the value and pointer address may be updated.  
+ * Changing other aspects of the memset (width, height, element size or pitch) may cause the update to be rejected.
+ * Specifically, for 2d memsets, all dimension changes are rejected.
+ * For 1d memsets, changes in height are explicitly rejected and other changes are opportunistically allowed
+ * if the resulting work maps onto the work resources already allocated for the node.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * \param hGraphExec  - The executable graph in which to set the specified node
+ * \param node        - Memset node from the graph which was used to instantiate graphExec
+ * \param pNodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphExecNodeSetParams,
+ * ::cudaGraphAddMemsetNode,
+ * ::cudaGraphMemsetNodeSetParams,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemsetNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams);
+
+/**
+ * \brief Sets the parameters for a host node in the given graphExec.
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained \p pNodeParams at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * \param hGraphExec  - The executable graph in which to set the specified node
+ * \param node        - Host node from the graph which was used to instantiate graphExec
+ * \param pNodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphExecNodeSetParams,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphHostNodeSetParams,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecHostNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams);
+
+/**
+ * \brief Updates node parameters in the child graph node in the given graphExec.
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though the nodes contained
+ * in \p node's graph had the parameters contained in \p childGraph's nodes at instantiation.
+ * \p node must remain in the graph which was used to instantiate \p hGraphExec.
+ * Changed edges to and from \p node are ignored.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also
+ * not modified by this call.
+ *
+ * The topology of \p childGraph, as well as the node insertion order,  must match that
+ * of the graph contained in \p node.  See ::cudaGraphExecUpdate() for a list of restrictions
+ * on what can be updated in an instantiated graph.  The update is recursive, so child graph
+ * nodes contained within the top level child graph will also be updated.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param node       - Child graph node from the graph which was used to instantiate graphExec
+ * \param childGraph - The graph supplying the updated parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphExecNodeSetParams,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphChildGraphNodeGetGraph,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphExecChildGraphNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, cudaGraph_t childGraph);
+#endif
+
+/**
+ * \brief Sets the event for an event record node in the given graphExec
+ *
+ * Sets the event of an event record node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Event record node from the graph from which graphExec was instantiated
+ * \param event      - Updated event to use
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphExecNodeSetParams,
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphEventRecordNodeGetEvent,
+ * ::cudaGraphEventWaitNodeSetEvent,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphExecEventRecordNodeSetEvent(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, cudaEvent_t event);
+#endif
+
+/**
+ * \brief Sets the event for an event wait node in the given graphExec
+ *
+ * Sets the event of an event wait node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Event wait node from the graph from which graphExec was instantiated
+ * \param event      - Updated event to use
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphExecNodeSetParams,
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaGraphEventWaitNodeGetEvent,
+ * ::cudaGraphEventRecordNodeSetEvent,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphExecEventWaitNodeSetEvent(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, cudaEvent_t event);
+#endif
+
+/**
+ * \brief Sets the parameters for an external semaphore signal node in the given graphExec
+ *
+ * Sets the parameters of an external semaphore signal node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * Changing \p nodeParams->numExtSems is not supported.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - semaphore signal node from the graph from which graphExec was instantiated
+ * \param nodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphExecNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaImportExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecExternalSemaphoresSignalNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, const struct cudaExternalSemaphoreSignalNodeParams *nodeParams);
+#endif
+
+/**
+ * \brief Sets the parameters for an external semaphore wait node in the given graphExec
+ *
+ * Sets the parameters of an external semaphore wait node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * Changing \p nodeParams->numExtSems is not supported.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - semaphore wait node from the graph from which graphExec was instantiated
+ * \param nodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphExecNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaImportExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecExternalSemaphoresWaitNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, const struct cudaExternalSemaphoreWaitNodeParams *nodeParams);
+#endif
+
+/**
+ * \brief Enables or disables the specified node in the given graphExec
+ *
+ * Sets \p hNode to be either enabled or disabled. Disabled nodes are functionally equivalent 
+ * to empty nodes until they are reenabled. Existing node parameters are not affected by 
+ * disabling/enabling the node.
+ *  
+ * The node is identified by the corresponding node \p hNode in the non-executable 
+ * graph, from which the executable graph was instantiated.   
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * \note Currently only kernel, memset and memcpy nodes are supported. 
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Node from the graph from which graphExec was instantiated
+ * \param isEnabled  - Node is enabled if != 0, otherwise the node is disabled
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeGetEnabled,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphLaunch
+ */
+#if __CUDART_API_VERSION >= 11060
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeSetEnabled(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, unsigned int isEnabled);
+#endif
+
+/**
+ * \brief Query whether a node in the given graphExec is enabled
+ *
+ * Sets isEnabled to 1 if \p hNode is enabled, or 0 if \p hNode is disabled.
+ *
+ * The node is identified by the corresponding node \p hNode in the non-executable 
+ * graph, from which the executable graph was instantiated.   
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * \note Currently only kernel, memset and memcpy nodes are supported. 
+ *
+ * \param hGraphExec - The executable graph in which to query the specified node
+ * \param hNode      - Node from the graph from which graphExec was instantiated
+ * \param isEnabled  - Location to return the enabled status of the node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeSetEnabled,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphLaunch
+ */
+#if __CUDART_API_VERSION >= 11060
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetEnabled(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, unsigned int *isEnabled);
+#endif
+
+/**
+ * \brief Check whether an executable graph can be updated with a graph and perform the update if possible
+ *
+ * Updates the node parameters in the instantiated graph specified by \p hGraphExec with the
+ * node parameters in a topologically identical graph specified by \p hGraph.
+ *
+ * Limitations:
+ *
+ * - Kernel nodes:
+ *   - The owning context of the function cannot change.
+ *   - A node whose function originally did not use CUDA dynamic parallelism cannot be updated
+ *     to a function which uses CDP.
+ *   - A node whose function originally did not make device-side update calls cannot be updated
+ *     to a function which makes device-side update calls.
+ *   - A cooperative node cannot be updated to a non-cooperative node, and vice-versa.
+ *   - If the graph was instantiated with cudaGraphInstantiateFlagUseNodePriority, the
+ *     priority attribute cannot change. Equality is checked on the originally requested
+ *     priority values, before they are clamped to the device's supported range.
+ *   - If \p hGraphExec was not instantiated for device launch, a node whose function originally
+ *     did not use device-side cudaGraphLaunch() cannot be updated to a function which uses
+ *     device-side cudaGraphLaunch() unless the node resides on the same device as nodes which
+ *     contained such calls at instantiate-time. If no such calls were present at instantiation,
+ *     these updates cannot be performed at all.
+ *   - Neither \p hGraph nor \p hGraphExec may contain device-updatable kernel nodes.
+ * - Memset and memcpy nodes:
+ *   - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change.
+ *   - The source/destination memory must be allocated from the same contexts as the original
+ *     source/destination memory.
+ *   - For 2d memsets, only address and assigned value may be updated.
+ *   - For 1d memsets, updating dimensions is also allowed, but may fail if the resulting operation doesn't
+ *     map onto the work resources already allocated for the node. 
+ * - Additional memcpy node restrictions:
+ *   - Changing either the source or destination memory type(i.e. CU_MEMORYTYPE_DEVICE,
+ *     CU_MEMORYTYPE_ARRAY, etc.) is not supported.
+ * - Conditional nodes:
+ *   - Changing node parameters is not supported.
+ *   - Changing parameters of nodes within the conditional body graph is subject to the rules above.
+ *   - Conditional handle flags and default values are updated as part of the graph update.
+ *
+ * Note:  The API may add further restrictions in future releases.  The return code should always be checked.
+ *
+ * cudaGraphExecUpdate sets the result member of \p resultInfo to cudaGraphExecUpdateErrorTopologyChanged
+ * under the following conditions:
+ * - The count of nodes directly in \p hGraphExec and \p hGraph differ, in which case resultInfo->errorNode
+ *   is set to NULL.
+ * - \p hGraph has more exit nodes than \p hGraphExec, in which case resultInfo->errorNode is set to one of
+ *   the exit nodes in \p hGraph.
+ * - A node in \p hGraph has a different number of dependencies than the node from \p hGraphExec it is paired with,
+ *   in which case resultInfo->errorNode is set to the node from \p hGraph.
+ * - A node in \p hGraph has a dependency that does not match with the corresponding dependency of the paired node
+ *   from \p hGraphExec. resultInfo->errorNode will be set to the node from \p hGraph. resultInfo->errorFromNode
+ *   will be set to the mismatched dependency. The dependencies are paired based on edge order and a dependency
+ *   does not match when the nodes are already paired based on other edges examined in the graph.
+ *
+ * cudaGraphExecUpdate sets the result member of \p resultInfo to:
+ * - cudaGraphExecUpdateError if passed an invalid value.
+ * - cudaGraphExecUpdateErrorTopologyChanged if the graph topology changed
+ * - cudaGraphExecUpdateErrorNodeTypeChanged if the type of a node changed, in which case
+ *   resultInfo->errorNode is set to the node from \p hGraph.
+ * - cudaGraphExecUpdateErrorFunctionChanged if the function of a kernel node changed (CUDA driver < 11.2)
+ * - cudaGraphExecUpdateErrorUnsupportedFunctionChange if the func field of a kernel changed in an
+ *   unsupported way (see note above), in which case resultInfo->errorNode is set to the node from \p hGraph
+ * - cudaGraphExecUpdateErrorParametersChanged if any parameters to a node changed in a way
+ *   that is not supported, in which case resultInfo->errorNode is set to the node from \p hGraph
+ * - cudaGraphExecUpdateErrorAttributesChanged if any attributes of a node changed in a way
+ *   that is not supported, in which case resultInfo->errorNode is set to the node from \p hGraph
+ * - cudaGraphExecUpdateErrorNotSupported if something about a node is unsupported, like
+ *   the node's type or configuration, in which case resultInfo->errorNode is set to the node from \p hGraph
+ *
+ * If the update fails for a reason not listed above, the result member of \p resultInfo will be set
+ * to cudaGraphExecUpdateError. If the update succeeds, the result member will be set to cudaGraphExecUpdateSuccess.
+ *
+ * cudaGraphExecUpdate returns cudaSuccess when the update was performed successfully.  It returns
+ * cudaErrorGraphExecUpdateFailure if the graph update was not performed because it included 
+ * changes which violated constraints specific to instantiated graph update.
+ *
+ * \param hGraphExec The instantiated graph to be updated
+ * \param hGraph The graph containing the updated parameters
+ * \param resultInfo the error info structure
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorGraphExecUpdateFailure,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphInstantiate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecUpdate(cudaGraphExec_t hGraphExec, cudaGraph_t hGraph, cudaGraphExecUpdateResultInfo *resultInfo);
+
+/**
+ * \brief Uploads an executable graph in a stream
+ *
+ * Uploads \p graphExec to the device in \p stream without executing it. Uploads of
+ * the same \p graphExec will be serialized. Each upload is ordered behind both any
+ * previous work in \p stream and any previous launches of \p graphExec.
+ * Uses memory cached by \p stream to back the allocations owned by \p graphExec.
+ *
+ * \param graphExec - Executable graph to upload
+ * \param stream    - Stream in which to upload the graph
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_init_rt
+ *
+ * \sa
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphLaunch,
+ * ::cudaGraphExecDestroy
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphUpload(cudaGraphExec_t graphExec, cudaStream_t stream);
+#endif
+
+/**
+ * \brief Launches an executable graph in a stream
+ *
+ * Executes \p graphExec in \p stream. Only one instance of \p graphExec may be executing
+ * at a time. Each launch is ordered behind both any previous work in \p stream
+ * and any previous launches of \p graphExec. To execute a graph concurrently, it must be
+ * instantiated multiple times into multiple executable graphs.
+ *
+ * If any allocations created by \p graphExec remain unfreed (from a previous launch) and
+ * \p graphExec was not instantiated with ::cudaGraphInstantiateFlagAutoFreeOnLaunch,
+ * the launch will fail with ::cudaErrorInvalidValue.
+ *
+ * \param graphExec - Executable graph to launch
+ * \param stream    - Stream in which to launch the graph
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphUpload,
+ * ::cudaGraphExecDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t stream);
+
+/**
+ * \brief Destroys an executable graph
+ *
+ * Destroys the executable graph specified by \p graphExec.
+ *
+ * \param graphExec - Executable graph to destroy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphUpload,
+ * ::cudaGraphLaunch
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecDestroy(cudaGraphExec_t graphExec);
+
+/**
+ * \brief Destroys a graph
+ *
+ * Destroys the graph specified by \p graph, as well as all of its nodes.
+ *
+ * \param graph - Graph to destroy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa
+ * ::cudaGraphCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphDestroy(cudaGraph_t graph);
+
+/**
+ * \brief Write a DOT file describing graph structure
+ *
+ * Using the provided \p graph, write to \p path a DOT formatted description of the graph.
+ * By default this includes the graph topology, node types, node id, kernel names and memcpy direction.
+ * \p flags can be specified to write more detailed information about each node type such as
+ * parameter values, kernel attributes, node and function handles.
+ *
+ * \param graph - The graph to create a DOT file from
+ * \param path  - The path to write the DOT file to
+ * \param flags - Flags from cudaGraphDebugDotFlags for specifying which additional node information to write
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorOperatingSystem
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphDebugDotPrint(cudaGraph_t graph, const char *path, unsigned int flags);
+
+/**
+ * \brief Create a user object
+ *
+ * Create a user object with the specified destructor callback and initial reference count. The
+ * initial references are owned by the caller.
+ *
+ * Destructor callbacks cannot make CUDA API calls and should avoid blocking behavior, as they
+ * are executed by a shared internal thread. Another thread may be signaled to perform such
+ * actions, if it does not block forward progress of tasks scheduled through CUDA.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param object_out      - Location to return the user object handle
+ * \param ptr             - The pointer to pass to the destroy function
+ * \param destroy         - Callback to free the user object when it is no longer in use
+ * \param initialRefcount - The initial refcount to create the object with, typically 1. The
+ *                          initial references are owned by the calling thread.
+ * \param flags           - Currently it is required to pass ::cudaUserObjectNoDestructorSync,
+ *                          which is the only defined flag. This indicates that the destroy
+ *                          callback cannot be waited on by any CUDA API. Users requiring
+ *                          synchronization of the callback should signal its completion
+ *                          manually.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaUserObjectRetain,
+ * ::cudaUserObjectRelease,
+ * ::cudaGraphRetainUserObject,
+ * ::cudaGraphReleaseUserObject,
+ * ::cudaGraphCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaUserObjectCreate(cudaUserObject_t *object_out, void *ptr, cudaHostFn_t destroy, unsigned int initialRefcount, unsigned int flags);
+
+/**
+ * \brief Retain a reference to a user object
+ *
+ * Retains new references to a user object. The new references are owned by the caller.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param object - The object to retain
+ * \param count  - The number of references to retain, typically 1. Must be nonzero
+ *                 and not larger than INT_MAX.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaUserObjectCreate,
+ * ::cudaUserObjectRelease,
+ * ::cudaGraphRetainUserObject,
+ * ::cudaGraphReleaseUserObject,
+ * ::cudaGraphCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaUserObjectRetain(cudaUserObject_t object, unsigned int count __dv(1));
+
+/**
+ * \brief Release a reference to a user object
+ *
+ * Releases user object references owned by the caller. The object's destructor is invoked if
+ * the reference count reaches zero.
+ *
+ * It is undefined behavior to release references not owned by the caller, or to use a user
+ * object handle after all references are released.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param object - The object to release
+ * \param count  - The number of references to release, typically 1. Must be nonzero
+ *                 and not larger than INT_MAX.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaUserObjectCreate,
+ * ::cudaUserObjectRetain,
+ * ::cudaGraphRetainUserObject,
+ * ::cudaGraphReleaseUserObject,
+ * ::cudaGraphCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaUserObjectRelease(cudaUserObject_t object, unsigned int count __dv(1));
+
+/**
+ * \brief Retain a reference to a user object from a graph
+ *
+ * Creates or moves user object references that will be owned by a CUDA graph.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param graph  - The graph to associate the reference with
+ * \param object - The user object to retain a reference for
+ * \param count  - The number of references to add to the graph, typically 1. Must be
+ *                 nonzero and not larger than INT_MAX.
+ * \param flags  - The optional flag ::cudaGraphUserObjectMove transfers references
+ *                 from the calling thread, rather than create new references. Pass 0
+ *                 to create new references.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaUserObjectCreate,
+ * ::cudaUserObjectRetain,
+ * ::cudaUserObjectRelease,
+ * ::cudaGraphReleaseUserObject,
+ * ::cudaGraphCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphRetainUserObject(cudaGraph_t graph, cudaUserObject_t object, unsigned int count __dv(1), unsigned int flags __dv(0));
+
+/**
+ * \brief Release a user object reference from a graph
+ *
+ * Releases user object references owned by a graph.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param graph  - The graph that will release the reference
+ * \param object - The user object to release a reference for
+ * \param count  - The number of references to release, typically 1. Must be nonzero
+ *                 and not larger than INT_MAX.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaUserObjectCreate,
+ * ::cudaUserObjectRetain,
+ * ::cudaUserObjectRelease,
+ * ::cudaGraphRetainUserObject,
+ * ::cudaGraphCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphReleaseUserObject(cudaGraph_t graph, cudaUserObject_t object, unsigned int count __dv(1));
+
+/**
+ * \brief Adds a node of arbitrary type to a graph
+ *
+ * Creates a new node in \p graph described by \p nodeParams with \p numDependencies
+ * dependencies specified via \p pDependencies. \p numDependencies may be 0.
+ * \p pDependencies may be null if \p numDependencies is 0. \p pDependencies may not have
+ * any duplicate entries.
+ *
+ * \p nodeParams is a tagged union. The node type should be specified in the \p type field,
+ * and type-specific parameters in the corresponding union member. All unused bytes - that
+ * is, \p reserved0 and all bytes past the utilized union member - must be set to zero.
+ * It is recommended to use brace initialization or memset to ensure all bytes are
+ * initialized.
+ *
+ * Note that for some node types, \p nodeParams may contain "out parameters" which are
+ * modified during the call, such as \p nodeParams->alloc.dptr.
+ *
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Specification of the node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorNotSupported
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphCreate,
+ * ::cudaGraphNodeSetParams,
+ * ::cudaGraphExecNodeSetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, struct cudaGraphNodeParams *nodeParams);
+
+/**
+ * \brief Adds a node of arbitrary type to a graph (12.3+)
+ *
+ * Creates a new node in \p graph described by \p nodeParams with \p numDependencies
+ * dependencies specified via \p pDependencies. \p numDependencies may be 0.
+ * \p pDependencies may be null if \p numDependencies is 0. \p pDependencies may not have
+ * any duplicate entries.
+ *
+ * \p nodeParams is a tagged union. The node type should be specified in the \p type field,
+ * and type-specific parameters in the corresponding union member. All unused bytes - that
+ * is, \p reserved0 and all bytes past the utilized union member - must be set to zero.
+ * It is recommended to use brace initialization or memset to ensure all bytes are
+ * initialized.
+ *
+ * Note that for some node types, \p nodeParams may contain "out parameters" which are
+ * modified during the call, such as \p nodeParams->alloc.dptr.
+ *
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param dependencyData  - Optional edge data for the dependencies. If NULL, the data is
+ *                          assumed to be default (zeroed) for all dependencies.
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Specification of the node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorNotSupported
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphCreate,
+ * ::cudaGraphNodeSetParams,
+ * ::cudaGraphExecNodeSetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddNode_v2(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, const cudaGraphEdgeData *dependencyData, size_t numDependencies, struct cudaGraphNodeParams *nodeParams);
+
+/**
+ * \brief Updates a graph node's parameters
+ *
+ * Sets the parameters of graph node \p node to \p nodeParams. The node type specified by
+ * \p nodeParams->type must match the type of \p node. \p nodeParams must be fully
+ * initialized and all unused bytes (reserved, padding) zeroed.
+ *
+ * Modifying parameters is not supported for node types cudaGraphNodeTypeMemAlloc and
+ * cudaGraphNodeTypeMemFree.
+ *
+ * \param node       - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorNotSupported
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaGraphExecNodeSetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeSetParams(cudaGraphNode_t node, struct cudaGraphNodeParams *nodeParams);
+
+/**
+ * \brief Updates a graph node's parameters in an instantiated graph
+ *
+ * Sets the parameters of a node in an executable graph \p graphExec. The node is identified
+ * by the corresponding node \p node in the non-executable graph from which the executable
+ * graph was instantiated. \p node must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p graphExec. Already
+ * enqueued or running launches of \p graphExec are not affected by this call.
+ * \p node is also not modified by this call.
+ *
+ * Allowed changes to parameters on executable graphs are as follows:
+ * <table>
+ *   <tr><th>Node type<th>Allowed changes
+ *   <tr><td>kernel<td>See ::cudaGraphExecKernelNodeSetParams
+ *   <tr><td>memcpy<td>Addresses for 1-dimensional copies if allocated in same context; see ::cudaGraphExecMemcpyNodeSetParams
+ *   <tr><td>memset<td>Addresses for 1-dimensional memsets if allocated in same context; see ::cudaGraphExecMemsetNodeSetParams
+ *   <tr><td>host<td>Unrestricted
+ *   <tr><td>child graph<td>Topology must match and restrictions apply recursively; see ::cudaGraphExecUpdate
+ *   <tr><td>event wait<td>Unrestricted
+ *   <tr><td>event record<td>Unrestricted
+ *   <tr><td>external semaphore signal<td>Number of semaphore operations cannot change
+ *   <tr><td>external semaphore wait<td>Number of semaphore operations cannot change
+ *   <tr><td>memory allocation<td>API unsupported
+ *   <tr><td>memory free<td>API unsupported
+ * </table>
+ *
+ * \param graphExec  - The executable graph in which to update the specified node
+ * \param node       - Corresponding node from the graph from which graphExec was instantiated
+ * \param nodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorNotSupported
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaGraphNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecNodeSetParams(cudaGraphExec_t graphExec, cudaGraphNode_t node, struct cudaGraphNodeParams *nodeParams);
+
+/**
+ * \brief Create a conditional handle
+ *
+ * Creates a conditional handle associated with \p graph.
+ *
+ * The conditional handle must be associated with a conditional node in this graph or one of its children.
+ *
+ * Handles not associated with a conditional node may cause graph instantiation to fail.
+ *
+ * \param pHandle_out        - Pointer used to return the handle to the caller.
+ * \param graph              - Graph which will contain the conditional node using this handle.
+ * \param defaultLaunchValue - Optional initial value for the conditional variable.
+ *                             Applied at the beginning of each graph execution if cudaGraphCondAssignDefault is set in \p flags.
+ * \param flags              - Currently must be cudaGraphCondAssignDefault or 0.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphAddNode, ::cuGraphAddNode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphConditionalHandleCreate(cudaGraphConditionalHandle *pHandle_out, cudaGraph_t graph, unsigned int defaultLaunchValue __dv(0), unsigned int flags __dv(0));
+
+/** @} */ /* END CUDART_GRAPH */
+
+/**
+ * \defgroup CUDART_DRIVER_ENTRY_POINT Driver Entry Point Access
+ *
+ * ___MANBRIEF___ driver entry point access functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the driver entry point access functions of CUDA
+ * runtime application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the requested driver API function pointer
+ *
+ * Returns in \p **funcPtr the address of the CUDA driver function for the requested flags.
+ *
+ * For a requested driver symbol, if the CUDA version in which the driver symbol was
+ * introduced is less than or equal to the CUDA runtime version, the API will return
+ * the function pointer to the corresponding versioned driver function.
+ *
+ * The pointer returned by the API should be cast to a function pointer matching the
+ * requested driver function's definition in the API header file. The function pointer
+ * typedef can be picked up from the corresponding typedefs header file. For example,
+ * cudaTypedefs.h consists of function pointer typedefs for driver APIs defined in cuda.h.
+ *
+ * The API will return ::cudaSuccess and set the returned \p funcPtr if the
+ * requested driver function is valid and supported on the platform.
+ *
+ * The API will return ::cudaSuccess and set the returned \p funcPtr to NULL if the
+ * requested driver function is not supported on the platform, no ABI
+ * compatible driver function exists for the CUDA runtime version or if the
+ * driver symbol is invalid.
+ *
+ * It will also set the optional \p driverStatus to one of the values in 
+ * ::cudaDriverEntryPointQueryResult with the following meanings:
+ * - ::cudaDriverEntryPointSuccess - The requested symbol was successfully found based
+ *   on input arguments and \p funcPtr is valid
+ * - ::cudaDriverEntryPointSymbolNotFound - The requested symbol was not found
+ * - ::cudaDriverEntryPointVersionNotSufficent - The requested symbol was found but is
+ *   not supported by the current runtime version (CUDART_VERSION)
+ *
+ * The requested flags can be:
+ * - ::cudaEnableDefault: This is the default mode. This is equivalent to
+ *   ::cudaEnablePerThreadDefaultStream if the code is compiled with
+ *   --default-stream per-thread compilation flag or the macro CUDA_API_PER_THREAD_DEFAULT_STREAM
+ *   is defined; ::cudaEnableLegacyStream otherwise.
+ * - ::cudaEnableLegacyStream: This will enable the search for all driver symbols
+ *   that match the requested driver symbol name except the corresponding per-thread versions.
+ * - ::cudaEnablePerThreadDefaultStream: This will enable the search for all
+ *   driver symbols that match the requested driver symbol name including the per-thread
+ *   versions. If a per-thread version is not found, the API will return the legacy version
+ *   of the driver function.
+ *
+ * \param symbol - The base name of the driver API function to look for. As an example,
+ *                 for the driver API ::cuMemAlloc_v2, \p symbol would be cuMemAlloc.
+ *                 Note that the API will use the CUDA runtime version to return the
+ *                 address to the most recent ABI compatible driver symbol, ::cuMemAlloc
+ *                 or ::cuMemAlloc_v2.
+ * \param funcPtr - Location to return the function pointer to the requested driver function
+ * \param flags -  Flags to specify search options.
+ * \param driverStatus - Optional location to store the status of finding the symbol from
+ *                       the driver. See ::cudaDriverEntryPointQueryResult for 
+ *                       possible values.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ * \note_version_mixing
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cuGetProcAddress
+ */
+#if defined(__cplusplus)
+extern __host__ cudaError_t CUDARTAPI cudaGetDriverEntryPoint(const char *symbol, void **funcPtr, unsigned long long flags, enum cudaDriverEntryPointQueryResult *driverStatus = NULL);
+#else
+extern __host__ cudaError_t CUDARTAPI cudaGetDriverEntryPoint(const char *symbol, void **funcPtr, unsigned long long flags, enum cudaDriverEntryPointQueryResult *driverStatus);
+#endif
+
+/**
+ * \brief Returns the requested driver API function pointer by CUDA version
+ *
+ * Returns in \p **funcPtr the address of the CUDA driver function for the requested flags and CUDA driver version.
+ *
+ * The CUDA version is specified as (1000 * major + 10 * minor), so CUDA 11.2
+ * should be specified as 11020. For a requested driver symbol, if the specified
+ * CUDA version is greater than or equal to the CUDA version in which the driver symbol
+ * was introduced, this API will return the function pointer to the corresponding
+ * versioned function.
+ *
+ * The pointer returned by the API should be cast to a function pointer matching the
+ * requested driver function's definition in the API header file. The function pointer
+ * typedef can be picked up from the corresponding typedefs header file. For example,
+ * cudaTypedefs.h consists of function pointer typedefs for driver APIs defined in cuda.h.
+ *
+ * For the case where the CUDA version requested is greater than the CUDA Toolkit 
+ * installed, there may not be an appropriate function pointer typedef in the
+ * corresponding header file and may need a custom typedef to match the driver
+ * function signature returned. This can be done by getting the typedefs from a later
+ * toolkit or creating appropriately matching custom function typedefs.
+ *
+ * The API will return ::cudaSuccess and set the returned \p funcPtr if the
+ * requested driver function is valid and supported on the platform.
+ *
+ * The API will return ::cudaSuccess and set the returned \p funcPtr to NULL if the
+ * requested driver function is not supported on the platform, no ABI
+ * compatible driver function exists for the requested version or if the
+ * driver symbol is invalid.
+ *
+ * It will also set the optional \p driverStatus to one of the values in 
+ * ::cudaDriverEntryPointQueryResult with the following meanings:
+ * - ::cudaDriverEntryPointSuccess - The requested symbol was successfully found based
+ *   on input arguments and \p funcPtr is valid
+ * - ::cudaDriverEntryPointSymbolNotFound - The requested symbol was not found
+ * - ::cudaDriverEntryPointVersionNotSufficent - The requested symbol was found but is
+ *   not supported by the specified version \p cudaVersion
+ *
+ * The requested flags can be:
+ * - ::cudaEnableDefault: This is the default mode. This is equivalent to
+ *   ::cudaEnablePerThreadDefaultStream if the code is compiled with
+ *   --default-stream per-thread compilation flag or the macro CUDA_API_PER_THREAD_DEFAULT_STREAM
+ *   is defined; ::cudaEnableLegacyStream otherwise.
+ * - ::cudaEnableLegacyStream: This will enable the search for all driver symbols
+ *   that match the requested driver symbol name except the corresponding per-thread versions.
+ * - ::cudaEnablePerThreadDefaultStream: This will enable the search for all
+ *   driver symbols that match the requested driver symbol name including the per-thread
+ *   versions. If a per-thread version is not found, the API will return the legacy version
+ *   of the driver function.
+ *
+ * \param symbol - The base name of the driver API function to look for. As an example,
+ *                 for the driver API ::cuMemAlloc_v2, \p symbol would be cuMemAlloc.
+ * \param funcPtr - Location to return the function pointer to the requested driver function
+ * \param cudaVersion - The CUDA version to look for the requested driver symbol 
+ * \param flags -  Flags to specify search options.
+ * \param driverStatus - Optional location to store the status of finding the symbol from
+ *                       the driver. See ::cudaDriverEntryPointQueryResult for 
+ *                       possible values.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ * \note_version_mixing
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cuGetProcAddress
+ */
+#if defined(__cplusplus)
+extern __host__ cudaError_t CUDARTAPI cudaGetDriverEntryPointByVersion(const char *symbol, void **funcPtr, unsigned int cudaVersion, unsigned long long flags, enum cudaDriverEntryPointQueryResult *driverStatus = NULL);
+#else
+extern __host__ cudaError_t CUDARTAPI cudaGetDriverEntryPointByVersion(const char *symbol, void **funcPtr, unsigned int cudaVersion, unsigned long long flags, enum cudaDriverEntryPointQueryResult *driverStatus);
+#endif
+
+/** @} */ /* END CUDART_DRIVER_ENTRY_POINT */
+
+/**
+ * \defgroup CUDART_LIBRARY Library Management
+ *
+ * ___MANBRIEF___ library management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the library management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Load a library with specified code and options
+ *
+ * Takes a pointer \p code and loads the corresponding library \p library based on
+ * the application defined library loading mode:
+ * - If module loading is set to EAGER, via the environment variables described in "Module loading",
+ *   \p library is loaded eagerly into all contexts at the time of the call and future contexts
+ *   at the time of creation until the library is unloaded with ::cudaLibraryUnload().
+ * - If the environment variables are set to LAZY, \p library
+ *   is not immediately loaded onto all existent contexts and will only be
+ *   loaded when a function is needed for that context, such as a kernel launch.
+ *
+ * These environment variables are described in the CUDA programming guide under the 
+ * "CUDA environment variables" section.
+ *
+ * The \p code may be a \e cubin or \e fatbin as output by \b nvcc,
+ * or a NULL-terminated \e PTX, either as output by \b nvcc or hand-written.
+ * A fatbin should also contain relocatable code when doing separate compilation.
+ * Please also see the documentation for nvrtc (https://docs.nvidia.com/cuda/nvrtc/index.html), 
+ * nvjitlink (https://docs.nvidia.com/cuda/nvjitlink/index.html), and nvfatbin
+ * (https://docs.nvidia.com/cuda/nvfatbin/index.html) for more information on generating
+ * loadable code at runtime.
+ *
+ * Options are passed as an array via \p jitOptions and any corresponding parameters are passed in
+ * \p jitOptionsValues. The number of total JIT options is supplied via \p numJitOptions.
+ * Any outputs will be returned via \p jitOptionsValues.
+ *
+ * Library load options are passed as an array via \p libraryOptions and any corresponding parameters are passed in
+ * \p libraryOptionValues. The number of total library load options is supplied via \p numLibraryOptions.
+ *
+ * \param library             - Returned library
+ * \param code                - Code to load
+ * \param jitOptions          - Options for JIT
+ * \param jitOptionsValues    - Option values for JIT
+ * \param numJitOptions       - Number of options
+ * \param libraryOptions      - Options for loading
+ * \param libraryOptionValues - Option values for loading
+ * \param numLibraryOptions   - Number of options for loading
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorCudartUnloading,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorUnsupportedPtxVersion,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorSharedObjectSymbolNotFound,
+ * ::cudaErrorSharedObjectInitFailed,
+ * ::cudaErrorJitCompilerNotFound
+ *
+ * \sa ::cudaLibraryLoadFromFile,
+ * ::cudaLibraryUnload,
+ * ::cuLibraryLoadData
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLibraryLoadData(cudaLibrary_t *library, const void *code,
+                                   enum cudaJitOption *jitOptions, void **jitOptionsValues, unsigned int numJitOptions,
+                                   enum cudaLibraryOption *libraryOptions, void** libraryOptionValues, unsigned int numLibraryOptions);
+
+/**
+ * \brief Load a library with specified file and options
+ *
+ * Takes a file name \p fileName and loads the corresponding library \p library based on
+ * the application defined library loading mode:
+ * - If module loading is set to EAGER, via the environment variables described in "Module loading",
+ *   \p library is loaded eagerly into all contexts at the time of the call and future contexts
+ *   at the time of creation until the library is unloaded with ::cudaLibraryUnload().
+ * - If the environment variables are set to LAZY, \p library
+ *   is not immediately loaded onto all existent contexts and will only be
+ *   loaded when a function is needed for that context, such as a kernel launch.
+ *
+ * These environment variables are described in the CUDA programming guide under the 
+ * "CUDA environment variables" section.
+ *
+ * The file should be a \e cubin file as output by \b nvcc, or a \e PTX file either
+ * as output by \b nvcc or handwritten, or a \e fatbin file as output by \b nvcc.
+ * A fatbin should also contain relocatable code when doing separate compilation.
+ * Please also see the documentation for nvrtc (https://docs.nvidia.com/cuda/nvrtc/index.html), 
+ * nvjitlink (https://docs.nvidia.com/cuda/nvjitlink/index.html), and nvfatbin
+ * (https://docs.nvidia.com/cuda/nvfatbin/index.html) for more information on generating
+ * loadable code at runtime.
+ *
+ * Options are passed as an array via \p jitOptions and any corresponding parameters are
+ * passed in \p jitOptionsValues. The number of total options is supplied via \p numJitOptions.
+ * Any outputs will be returned via \p jitOptionsValues.
+ *
+ * Library load options are passed as an array via \p libraryOptions and any corresponding parameters are passed in
+ * \p libraryOptionValues. The number of total library load options is supplied via \p numLibraryOptions.
+ *
+ * \param library             - Returned library
+ * \param fileName            - File to load from
+ * \param jitOptions          - Options for JIT
+ * \param jitOptionsValues    - Option values for JIT
+ * \param numJitOptions       - Number of options
+ * \param libraryOptions      - Options for loading
+ * \param libraryOptionValues - Option values for loading
+ * \param numLibraryOptions   - Number of options for loading
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorCudartUnloading,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorUnsupportedPtxVersion,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorSharedObjectSymbolNotFound,
+ * ::cudaErrorSharedObjectInitFailed,
+ * ::cudaErrorJitCompilerNotFound
+ *
+ * \sa ::cudaLibraryLoadData,
+ * ::cudaLibraryUnload,
+ * ::cuLibraryLoadFromFile
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLibraryLoadFromFile(cudaLibrary_t *library, const char *fileName,
+                                       enum cudaJitOption *jitOptions, void **jitOptionsValues, unsigned int numJitOptions,
+                                       enum cudaLibraryOption *libraryOptions, void **libraryOptionValues, unsigned int numLibraryOptions);
+
+/**
+ * \brief Unloads a library
+ *
+ * Unloads the library specified with \p library
+ *
+ * \param library - Library to unload
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorCudartUnloading,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa ::cudaLibraryLoadData,
+ * ::cudaLibraryLoadFromFile,
+ * ::cuLibraryUnload
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLibraryUnload(cudaLibrary_t library);
+
+/**
+ * \brief Returns a kernel handle
+ *
+ * Returns in \p pKernel the handle of the kernel with name \p name located in library \p library.
+ * If kernel handle is not found, the call returns ::cudaErrorSymbolNotFound.
+ *
+ * \param pKernel - Returned kernel handle
+ * \param library - Library to retrieve kernel from
+ * \param name - Name of kernel to retrieve
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorCudartUnloading,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorSymbolNotFound
+ *
+ * \sa ::cudaLibraryLoadData,
+ * ::cudaLibraryLoadFromFile,
+ * ::cudaLibraryUnload,
+ * ::cuLibraryGetKernel
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLibraryGetKernel(cudaKernel_t *pKernel, cudaLibrary_t library, const char *name);
+
+/**
+ * \brief Returns a global device pointer
+ *
+ * Returns in \p *dptr and \p *bytes the base pointer and size of the global with
+ * name \p name for the requested library \p library and the current device.
+ * If no global for the requested name \p name exists, the call returns ::cudaErrorSymbolNotFound.
+ * One of the parameters \p dptr or \p bytes (not both) can be NULL in which
+ * case it is ignored. The returned \p dptr cannot be passed to the Symbol APIs
+ * such as ::cudaMemcpyToSymbol, ::cudaMemcpyFromSymbol, ::cudaGetSymbolAddress, or
+ * ::cudaGetSymbolSize.
+ *
+ * \param dptr - Returned global device pointer for the requested library
+ * \param bytes - Returned global size in bytes
+ * \param library - Library to retrieve global from
+ * \param name - Name of global to retrieve
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorCudartUnloading,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorSymbolNotFound,
+ * ::cudaErrorDeviceUninitialized,
+ * ::cudaErrorContextIsDestroyed
+ *
+ * \sa ::cudaLibraryLoadData,
+ * ::cudaLibraryLoadFromFile,
+ * ::cudaLibraryUnload,
+ * ::cudaLibraryGetManaged,
+ * ::cuLibraryGetGlobal
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLibraryGetGlobal(void **dptr, size_t *bytes, cudaLibrary_t library, const char *name);
+
+/**
+ * \brief Returns a pointer to managed memory
+ *
+ * Returns in \p *dptr and \p *bytes the base pointer and size of the managed memory with
+ * name \p name for the requested library \p library. If no managed memory with the
+ * requested name \p name exists, the call returns ::cudaErrorSymbolNotFound. One of the parameters
+ * \p dptr or \p bytes (not both) can be NULL in which case it is ignored.
+ * Note that managed memory for library \p library is shared across devices and is registered
+ * when the library is loaded. The returned \p dptr cannot be passed to the Symbol APIs
+ * such as ::cudaMemcpyToSymbol, ::cudaMemcpyFromSymbol, ::cudaGetSymbolAddress, or
+ * ::cudaGetSymbolSize.
+ *
+ * \param dptr - Returned pointer to the managed memory
+ * \param bytes - Returned memory size in bytes
+ * \param library - Library to retrieve managed memory from
+ * \param name - Name of managed memory to retrieve
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorCudartUnloading,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorSymbolNotFound
+ *
+ * \sa ::cudaLibraryLoadData,
+ * ::cudaLibraryLoadFromFile,
+ * ::cudaLibraryUnload,
+ * ::cudaLibraryGetGlobal,
+ * ::cuLibraryGetManaged
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLibraryGetManaged(void **dptr, size_t *bytes, cudaLibrary_t library, const char *name);
+
+/**
+ * \brief Returns a pointer to a unified function
+ *
+ * Returns in \p *fptr the function pointer to a unified function denoted by \p symbol.
+ * If no unified function with name \p symbol exists, the call returns ::cudaErrorSymbolNotFound.
+ * If there is no device with attribute ::cudaDeviceProp::unifiedFunctionPointers present in the system,
+ * the call may return ::cudaErrorSymbolNotFound.
+ *
+ * \param fptr - Returned pointer to a unified function
+ * \param library - Library to retrieve the function pointer from
+ * \param symbol - Name of function pointer to retrieve
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorCudartUnloading,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorSymbolNotFound
+ *
+ * \sa ::cudaLibraryLoadData,
+ * ::cudaLibraryLoadFromFile,
+ * ::cudaLibraryUnload,
+ * ::cuLibraryGetUnifiedFunction
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLibraryGetUnifiedFunction(void **fptr, cudaLibrary_t library, const char *symbol);
+
+/**
+ * \brief Returns the number of kernels within a library
+ *
+ * Returns in \p count the number of kernels in \p lib.
+ *
+ * \param count - Number of kernels found within the library
+ * \param lib - Library to query
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorCudartUnloading,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ *
+ * \sa ::cudaLibraryEnumerateKernels,
+ * ::cudaLibraryLoadFromFile,
+ * ::cudaLibraryLoadData,
+ * ::cuLibraryGetKernelCount
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLibraryGetKernelCount(unsigned int *count, cudaLibrary_t lib);
+
+/**
+ * \brief Retrieve the kernel handles within a library.
+ *
+ * Returns in \p kernels a maximum number of \p numKernels kernel handles within \p lib.
+ * The returned kernel handle becomes invalid when the library is unloaded.
+ *
+ * \param kernels - Buffer where the kernel handles are returned to
+ * \param numKernels - Maximum number of kernel handles that may be returned to the buffer
+ * \param lib - Library to query from
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorCudartUnloading,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ *
+ * \sa ::cudaLibraryGetKernelCount,
+ * ::cuLibraryEnumerateKernels
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLibraryEnumerateKernels(cudaKernel_t *kernels, unsigned int numKernels, cudaLibrary_t lib);
+
+/**
+ * \brief Sets information about a kernel
+ *
+ * This call sets the value of a specified attribute \p attr on the kernel \p kernel
+ * for the requested device \p device to an integer value specified by \p value.
+ * This function returns ::cudaSuccess if the new value of the attribute could be
+ * successfully set. If the set fails, this call will return an error.
+ * Not all attributes can have values set. Attempting to set a value on a read-only
+ * attribute will result in an error (::cudaErrorInvalidValue).
+ *
+ * Note that attributes set using ::cudaFuncSetAttribute() will override the attribute
+ * set by this API irrespective of whether the call to ::cudaFuncSetAttribute() is made
+ * before or after this API call. Because of this and the stricter locking requirements
+ * mentioned below it is suggested that this call be used during the initialization path
+ * and not on each thread accessing \p kernel such as on kernel launches or on the
+ * critical path.
+ *
+ * Valid values for \p attr are:
+ * - ::cudaFuncAttributeMaxDynamicSharedMemorySize - The requested maximum size in bytes of dynamically-allocated shared memory. The sum of this value and the function attribute ::sharedSizeBytes
+ *   cannot exceed the device attribute ::cudaDevAttrMaxSharedMemoryPerBlockOptin. The maximal size of requestable dynamic shared memory may differ by GPU architecture.
+ * - ::cudaFuncAttributePreferredSharedMemoryCarveout - On devices where the L1 cache and shared memory use the same hardware resources, 
+ *   this sets the shared memory carveout preference, in percent of the total shared memory. See ::cudaDevAttrMaxSharedMemoryPerMultiprocessor.
+ *   This is only a hint, and the driver can choose a different ratio if required to execute the function.
+ * - ::cudaFuncAttributeRequiredClusterWidth: The required cluster width in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return cudaErrorNotPermitted.
+ * - ::cudaFuncAttributeRequiredClusterHeight: The required cluster height in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return cudaErrorNotPermitted.
+ * - ::cudaFuncAttributeRequiredClusterDepth: The required cluster depth in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return cudaErrorNotPermitted.
+ * - ::cudaFuncAttributeNonPortableClusterSizeAllowed: Indicates whether the
+ *   function can be launched with non-portable cluster size. 1 is allowed, 0 is
+ *   disallowed.
+ * - ::cudaFuncAttributeClusterSchedulingPolicyPreference: The block
+ *   scheduling policy of a function. The value type is cudaClusterSchedulingPolicy.
+ *
+ * \note The API has stricter locking requirements in comparison to its legacy counterpart
+ * ::cudaFuncSetAttribute() due to device-wide semantics. If multiple threads are trying to
+ * set the same attribute on the same device simultaneously, the attribute setting will depend
+ * on the interleavings chosen by the OS scheduler and memory consistency.
+ *
+ * \param kernel  - Kernel to set attribute of
+ * \param attr - Attribute requested
+ * \param value - Value to set
+ * \param device - Device to set attribute of
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa ::cudaLibraryLoadData,
+ * ::cudaLibraryLoadFromFile,
+ * ::cudaLibraryUnload,
+ * ::cudaLibraryGetKernel,
+ * ::cudaLaunchKernel,
+ * ::cudaFuncSetAttribute,
+ * ::cuKernelSetAttribute
+ */
+extern __host__ cudaError_t CUDARTAPI cudaKernelSetAttributeForDevice(cudaKernel_t kernel, enum cudaFuncAttribute attr, int value, int device);
+
+/** @} */ /* END CUDART_LIBRARY */
+
+/** \cond impl_private */
+extern __host__ cudaError_t CUDARTAPI cudaGetExportTable(const void **ppExportTable, const cudaUUID_t *pExportTableId);
+/** \endcond impl_private */
+
+/**
+ * \defgroup CUDART_HIGHLEVEL C++ API Routines
+ *
+ * ___MANBRIEF___ C++ high level API functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the C++ high level API functions of the CUDA runtime
+ * application programming interface. To use these functions, your
+ * application needs to be compiled with the \p nvcc compiler.
+ *
+ * \brief C++-style interface built on top of CUDA runtime API
+ */
+
+/**
+ * \defgroup CUDART_DRIVER Interactions with the CUDA Driver API
+ *
+ * ___MANBRIEF___ interactions between CUDA Driver API and CUDA Runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the interactions between the CUDA Driver API and the CUDA Runtime API
+ *
+ * @{
+ *
+ * \section CUDART_CUDA_primary Primary Contexts
+ *
+ * There exists a one to one relationship between CUDA devices in the CUDA Runtime
+ * API and ::CUcontext s in the CUDA Driver API within a process.  The specific
+ * context which the CUDA Runtime API uses for a device is called the device's
+ * primary context.  From the perspective of the CUDA Runtime API, a device and 
+ * its primary context are synonymous.
+ *
+ * \section CUDART_CUDA_init Initialization and Tear-Down
+ *
+ * CUDA Runtime API calls operate on the CUDA Driver API ::CUcontext which is current to
+ * the calling host thread.
+ * 
+ * The function ::cudaInitDevice() ensures that the primary context is initialized
+ * for the requested device but does not make it current to the calling thread. 
+ *
+ * The function ::cudaSetDevice() initializes the primary context for the
+ * specified device and makes it current to the calling thread by calling ::cuCtxSetCurrent().
+ *
+ * The CUDA Runtime API will automatically initialize the primary context for
+ * a device at the first CUDA Runtime API call which requires an active context.
+ * If no ::CUcontext is current to the calling thread when a CUDA Runtime API call 
+ * which requires an active context is made, then the primary context for a device 
+ * will be selected, made current to the calling thread, and initialized.
+ *
+ * The context which the CUDA Runtime API initializes will be initialized using 
+ * the parameters specified by the CUDA Runtime API functions
+ * ::cudaSetDeviceFlags(), 
+ * ::cudaD3D9SetDirect3DDevice(), 
+ * ::cudaD3D10SetDirect3DDevice(), 
+ * ::cudaD3D11SetDirect3DDevice(), 
+ * ::cudaGLSetGLDevice(), and
+ * ::cudaVDPAUSetVDPAUDevice().
+ * Note that these functions will fail with ::cudaErrorSetOnActiveProcess if they are 
+ * called when the primary context for the specified device has already been initialized,
+ * except for ::cudaSetDeviceFlags() which will simply overwrite the previous settings.
+ *
+ * Primary contexts will remain active until they are explicitly deinitialized 
+ * using ::cudaDeviceReset().  The function ::cudaDeviceReset() will deinitialize the 
+ * primary context for the calling thread's current device immediately.  The context 
+ * will remain current to all of the threads that it was current to.  The next CUDA 
+ * Runtime API call on any thread which requires an active context will trigger the 
+ * reinitialization of that device's primary context.
+ *
+ * Note that primary contexts are shared resources. It is recommended that
+ * the primary context not be reset except just before exit or to recover from an
+ * unspecified launch failure.
+ * 
+ * \section CUDART_CUDA_context Context Interoperability
+ *
+ * Note that the use of multiple ::CUcontext s per device within a single process 
+ * will substantially degrade performance and is strongly discouraged.  Instead,
+ * it is highly recommended that the implicit one-to-one device-to-context mapping
+ * for the process provided by the CUDA Runtime API be used.
+ *
+ * If a non-primary ::CUcontext created by the CUDA Driver API is current to a
+ * thread then the CUDA Runtime API calls to that thread will operate on that 
+ * ::CUcontext, with some exceptions listed below.  Interoperability between data
+ * types is discussed in the following sections.
+ *
+ * The function ::cudaPointerGetAttributes() will return the error 
+ * ::cudaErrorIncompatibleDriverContext if the pointer being queried was allocated by a 
+ * non-primary context.  The function ::cudaDeviceEnablePeerAccess() and the rest of 
+ * the peer access API may not be called when a non-primary ::CUcontext is current.  
+ * To use the pointer query and peer access APIs with a context created using the 
+ * CUDA Driver API, it is necessary that the CUDA Driver API be used to access
+ * these features.
+ *
+ * All CUDA Runtime API state (e.g., global variables' addresses and values) travels
+ * with its underlying ::CUcontext.  In particular, if a ::CUcontext is moved from one 
+ * thread to another then all CUDA Runtime API state will move to that thread as well.
+ *
+ * Please note that attaching to legacy contexts (those with a version of 3010 as returned
+ * by ::cuCtxGetApiVersion()) is not possible. The CUDA Runtime will return
+ * ::cudaErrorIncompatibleDriverContext in such cases.
+ *
+ * \section CUDART_CUDA_stream Interactions between CUstream and cudaStream_t
+ *
+ * The types ::CUstream and ::cudaStream_t are identical and may be used interchangeably.
+ *
+ * \section CUDART_CUDA_event Interactions between CUevent and cudaEvent_t
+ *
+ * The types ::CUevent and ::cudaEvent_t are identical and may be used interchangeably.
+ *
+ * \section CUDART_CUDA_array Interactions between CUarray and cudaArray_t 
+ *
+ * The types ::CUarray and struct ::cudaArray * represent the same data type and may be used
+ * interchangeably by casting the two types between each other.
+ *
+ * In order to use a ::CUarray in a CUDA Runtime API function which takes a struct ::cudaArray *,
+ * it is necessary to explicitly cast the ::CUarray to a struct ::cudaArray *.
+ *
+ * In order to use a struct ::cudaArray * in a CUDA Driver API function which takes a ::CUarray,
+ * it is necessary to explicitly cast the struct ::cudaArray * to a ::CUarray .
+ *
+ * \section CUDART_CUDA_graphicsResource Interactions between CUgraphicsResource and cudaGraphicsResource_t
+ *
+ * The types ::CUgraphicsResource and ::cudaGraphicsResource_t represent the same data type and may be used
+ * interchangeably by casting the two types between each other.
+ *
+ * In order to use a ::CUgraphicsResource in a CUDA Runtime API function which takes a 
+ * ::cudaGraphicsResource_t, it is necessary to explicitly cast the ::CUgraphicsResource 
+ * to a ::cudaGraphicsResource_t.
+ *
+ * In order to use a ::cudaGraphicsResource_t in a CUDA Driver API function which takes a
+ * ::CUgraphicsResource, it is necessary to explicitly cast the ::cudaGraphicsResource_t 
+ * to a ::CUgraphicsResource.
+ *
+ * \section CUDART_CUDA_texture_objects Interactions between CUtexObject and cudaTextureObject_t
+ *
+ * The types ::CUtexObject and ::cudaTextureObject_t represent the same data type and may be used
+ * interchangeably by casting the two types between each other.
+ *
+ * In order to use a ::CUtexObject in a CUDA Runtime API function which takes a ::cudaTextureObject_t,
+ * it is necessary to explicitly cast the ::CUtexObject to a ::cudaTextureObject_t.
+ *
+ * In order to use a ::cudaTextureObject_t in a CUDA Driver API function which takes a ::CUtexObject,
+ * it is necessary to explicitly cast the ::cudaTextureObject_t to a ::CUtexObject.
+ *
+ * \section CUDART_CUDA_surface_objects Interactions between CUsurfObject and cudaSurfaceObject_t
+ *
+ * The types ::CUsurfObject and ::cudaSurfaceObject_t represent the same data type and may be used
+ * interchangeably by casting the two types between each other.
+ *
+ * In order to use a ::CUsurfObject in a CUDA Runtime API function which takes a ::cudaSurfaceObject_t,
+ * it is necessary to explicitly cast the ::CUsurfObject to a ::cudaSurfaceObject_t.
+ *
+ * In order to use a ::cudaSurfaceObject_t in a CUDA Driver API function which takes a ::CUsurfObject,
+ * it is necessary to explicitly cast the ::cudaSurfaceObject_t to a ::CUsurfObject.
+ *
+ * \section CUDART_CUDA_module Interactions between CUfunction and cudaFunction_t
+ *
+ * The types ::CUfunction and ::cudaFunction_t represent the same data type and may be used
+ * interchangeably by casting the two types between each other.
+ *
+ * In order to use a ::cudaFunction_t in a CUDA Driver API function which takes a ::CUfunction,
+ * it is necessary to explicitly cast the ::cudaFunction_t to a ::CUfunction.
+ *
+ * \section CUDART_CUDA_library Interactions between CUkernel and cudaKernel_t
+ *
+ * The types ::CUkernel and ::cudaKernel_t represent the same data type and may be used
+ * interchangeably by casting the two types between each other.
+ *
+ * In order to use a ::cudaKernel_t in a CUDA Driver API function which takes a ::CUkernel,
+ * it is necessary to explicitly cast the ::cudaKernel_t to a ::CUkernel.
+ *
+ */
+
+ /**
+  * \brief Get pointer to device entry function that matches entry function \p symbolPtr
+  *
+  * Returns in \p functionPtr the device entry function corresponding to the symbol \p symbolPtr.
+  *
+  * \param functionPtr     - Returns the device entry function
+  * \param symbolPtr       - Pointer to device entry function to search for
+  *
+  * \return
+  * ::cudaSuccess
+  *
+  */
+extern __host__ cudaError_t cudaGetFuncBySymbol(cudaFunction_t* functionPtr, const void* symbolPtr);
+
+/**
+ * \brief Get pointer to device kernel that matches entry function \p entryFuncAddr
+  *
+  * Returns in \p kernelPtr the device kernel corresponding to the entry function \p entryFuncAddr.
+  *
+  * Note that it is possible that there are multiple symbols belonging to different
+  * translation units with the same \p entryFuncAddr registered with this CUDA Runtime
+  * and so the order in which the translation units are loaded and registered with the
+  * CUDA Runtime can lead to differing return pointers in \p kernelPtr .
+  * Suggested methods of ensuring uniqueness are to limit visibility of __global__
+  * device functions by using static or hidden visibility attribute in the
+  * respective translation units.
+  *
+  * \param kernelPtr          - Returns the device kernel
+  * \param entryFuncAddr      - Address of device entry function to search kernel for
+  *
+  * \return
+  * ::cudaSuccess
+  *
+  * \sa
+  * \ref ::cudaGetKernel(cudaKernel_t *kernelPtr, const T *entryFuncAddr) "cudaGetKernel (C++ API)"
+  */
+extern __host__ cudaError_t CUDARTAPI cudaGetKernel(cudaKernel_t *kernelPtr, const void *entryFuncAddr);
+
+/** @} */ /* END CUDART_DRIVER */
+
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    #undef cudaMemcpy
+    #undef cudaMemcpyToSymbol
+    #undef cudaMemcpyFromSymbol
+    #undef cudaMemcpy2D
+    #undef cudaMemcpyToArray
+    #undef cudaMemcpy2DToArray
+    #undef cudaMemcpyFromArray
+    #undef cudaMemcpy2DFromArray
+    #undef cudaMemcpyArrayToArray
+    #undef cudaMemcpy2DArrayToArray
+    #undef cudaMemcpy3D
+    #undef cudaMemcpy3DPeer
+    #undef cudaMemset
+    #undef cudaMemset2D
+    #undef cudaMemset3D
+    #undef cudaMemcpyAsync
+    #undef cudaMemcpyToSymbolAsync
+    #undef cudaMemcpyFromSymbolAsync
+    #undef cudaMemcpy2DAsync
+    #undef cudaMemcpyToArrayAsync
+    #undef cudaMemcpy2DToArrayAsync
+    #undef cudaMemcpyFromArrayAsync
+    #undef cudaMemcpy2DFromArrayAsync
+    #undef cudaMemcpy3DAsync
+    #undef cudaMemcpy3DPeerAsync
+    #undef cudaMemcpyBatchAsync
+    #undef cudaMemcpy3DBatchAsync
+    #undef cudaMemsetAsync
+    #undef cudaMemset2DAsync
+    #undef cudaMemset3DAsync
+    #undef cudaStreamQuery
+    #undef cudaStreamGetDevice
+    #undef cudaStreamGetFlags
+    #undef cudaStreamGetId
+    #undef cudaStreamGetPriority
+    #undef cudaEventRecord
+    #undef cudaEventRecordWithFlags
+    #undef cudaStreamWaitEvent
+    #undef cudaStreamAddCallback
+    #undef cudaStreamAttachMemAsync
+    #undef cudaStreamSynchronize
+    #undef cudaLaunchKernel
+    #undef cudaLaunchKernelExC
+    #undef cudaLaunchHostFunc
+    #undef cudaMemPrefetchAsync
+    #undef cudaMemPrefetchAsync_v2
+    #undef cudaLaunchCooperativeKernel
+    #undef cudaSignalExternalSemaphoresAsync
+    #undef cudaWaitExternalSemaphoresAsync
+    #undef cudaGraphInstantiateWithParams
+    #undef cudaGraphUpload
+    #undef cudaGraphLaunch
+    #undef cudaStreamBeginCapture
+    #undef cudaStreamBeginCaptureToGraph
+    #undef cudaStreamEndCapture
+    #undef cudaStreamIsCapturing
+    #undef cudaStreamGetCaptureInfo
+    #undef cudaStreamGetCaptureInfo_v2
+    #undef cudaStreamGetCaptureInfo_v3
+    #undef cudaStreamUpdateCaptureDependencies
+    #undef cudaStreamUpdateCaptureDependencies_v2
+    #undef cudaStreamCopyAttributes
+    #undef cudaStreamGetAttribute
+    #undef cudaStreamSetAttribute
+    #undef cudaMallocAsync
+    #undef cudaFreeAsync
+    #undef cudaMallocFromPoolAsync
+    #undef cudaGetDriverEntryPoint
+    #undef cudaGetDriverEntryPointByVersion
+
+    #undef cudaGetDeviceProperties
+
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const void *symbol, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyBatchAsync(void **dsts, void **srcs, size_t *sizes, size_t count, struct cudaMemcpyAttributes *attrs, size_t *attrsIdxs, size_t numAttrs, size_t *failIdx, cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DBatchAsync(size_t numOps, struct cudaMemcpy3DBatchOp *opList, size_t *failIdx, unsigned long long flags, cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, size_t count);
+    extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch, int value, size_t width, size_t height);
+    extern __host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent);
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0));
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream __dv(0));
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream __dv(0));
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream);
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetDevice(cudaStream_t hStream, int *device);
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags);
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetId(cudaStream_t hStream, unsigned long long *streamId);
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetPriority(cudaStream_t hStream, int *priority);
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0));
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream __dv(0), unsigned int flags __dv(0));
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamAddCallback(cudaStream_t stream, cudaStreamCallback_t callback, void *userData, unsigned int flags);
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length, unsigned int flags);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaLaunchKernelExC(const cudaLaunchConfig_t *config, const void *func, void **args);
+    extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaLaunchHostFunc(cudaStream_t stream, cudaHostFn_t fn, void *userData);
+    extern __host__ cudaError_t CUDARTAPI cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaMemPrefetchAsync_v2(const void *devPtr, size_t count, struct cudaMemLocation location, unsigned int flags, cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreSignalParams_v1 *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync_ptsz(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreSignalParams_v1 *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreSignalParams *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreWaitParams_v1 *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync_ptsz(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreWaitParams_v1 *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreWaitParams *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiateWithParams(cudaGraphExec_t *pGraphExec, cudaGraph_t graph, cudaGraphInstantiateParams *instantiateParams);
+    extern __host__ cudaError_t CUDARTAPI cudaGraphUpload(cudaGraphExec_t graphExec, cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamBeginCapture(cudaStream_t stream, enum cudaStreamCaptureMode mode);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamBeginCaptureToGraph(cudaStream_t stream, cudaGraph_t graph, const cudaGraphNode_t *dependencies, const cudaGraphEdgeData *dependencyData, size_t numDependencies, enum cudaStreamCaptureMode mode);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t *pGraph);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamIsCapturing(cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo(cudaStream_t stream, enum cudaStreamCaptureStatus *captureStatus_out, unsigned long long *id_out);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo_ptsz(cudaStream_t stream, enum cudaStreamCaptureStatus *captureStatus_out, unsigned long long *id_out);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo_v2(cudaStream_t stream, enum cudaStreamCaptureStatus *captureStatus_out, unsigned long long *id_out __dv(0), cudaGraph_t *graph_out __dv(0), const cudaGraphNode_t **dependencies_out __dv(0), size_t *numDependencies_out __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo_v3(cudaStream_t stream, enum cudaStreamCaptureStatus *captureStatus_out, unsigned long long *id_out __dv(0), cudaGraph_t *graph_out __dv(0), const cudaGraphNode_t **dependencies_out __dv(0), const cudaGraphEdgeData **edgeData_out __dv(0), size_t *numDependencies_out __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t *dependencies, size_t numDependencies, unsigned int flags __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaStreamUpdateCaptureDependencies_v2(cudaStream_t stream, cudaGraphNode_t *dependencies, const cudaGraphEdgeData *dependencyData, size_t numDependencies, unsigned int flags __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaStreamCopyAttributes(cudaStream_t dstStream, cudaStream_t srcStream);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamGetAttribute(cudaStream_t stream, cudaStreamAttrID attr, cudaStreamAttrValue *value);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamSetAttribute(cudaStream_t stream, cudaStreamAttrID attr, const cudaStreamAttrValue *param);
+
+    extern __host__ cudaError_t CUDARTAPI cudaMallocAsync(void **devPtr, size_t size, cudaStream_t hStream);
+    extern __host__ cudaError_t CUDARTAPI cudaFreeAsync(void *devPtr, cudaStream_t hStream);
+    extern __host__ cudaError_t CUDARTAPI cudaMallocFromPoolAsync(void **ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaGetDriverEntryPoint(const char *symbol, void **funcPtr, unsigned long long flags, enum cudaDriverEntryPointQueryResult *driverStatus);
+    extern __host__ cudaError_t CUDARTAPI cudaGetDriverEntryPointByVersion(const char *symbol, void **funcPtr, unsigned int cudaVersion, unsigned long long flags, enum cudaDriverEntryPointQueryResult *driverStatus);
+
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device);
+
+#elif defined(__CUDART_API_PER_THREAD_DEFAULT_STREAM)
+    // nvcc stubs reference the 'cudaLaunch'/'cudaLaunchKernel' identifier even if it was defined
+    // to 'cudaLaunch_ptsz'/'cudaLaunchKernel_ptsz'. Redirect through a static inline function.
+    #undef cudaLaunchKernel
+    static __inline__ __host__ cudaError_t cudaLaunchKernel(const void *func, 
+                                                            dim3 gridDim, dim3 blockDim, 
+                                                            void **args, size_t sharedMem, 
+                                                            cudaStream_t stream)
+    { // Thin redirect: forwards all six arguments unchanged and returns the callee's cudaError_t.
+        return cudaLaunchKernel_ptsz(func, gridDim, blockDim, args, sharedMem, stream);
+    }
+    #define cudaLaunchKernel __CUDART_API_PTSZ(cudaLaunchKernel)
+    #undef cudaLaunchKernelExC
+    static __inline__ __host__ cudaError_t cudaLaunchKernelExC(const cudaLaunchConfig_t *config,
+                                                               const void *func,
+                                                                  void **args)
+    { // Thin redirect: forwards config, func and args unchanged and returns the callee's cudaError_t.
+        return cudaLaunchKernelExC_ptsz(config, func, args);
+    }
+    #define cudaLaunchKernelExC __CUDART_API_PTSZ(cudaLaunchKernelExC)
+#endif
+
+#if defined(__cplusplus)
+}
+
+#endif /* __cplusplus */
+
+#undef EXCLUDE_FROM_RTC
+#endif /* !__CUDACC_RTC__ */
+
+#undef __dv
+#undef __CUDA_DEPRECATED
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_API_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_API_H__
+#endif
+
+#endif /* !__CUDA_RUNTIME_API_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_stdint.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_stdint.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a9814410e4b6fb4f07ad9edc8394e956b77dbcd
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_stdint.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright 2009-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __cuda_stdint_h__
+#define __cuda_stdint_h__
+
+// Compiler-specific treatment for C99's stdint.h
+//
+// By default, this header will use the standard headers (so it
+// is your responsibility to make sure they are available), except
+// on MSVC before Visual Studio 2010, when they were not provided.
+// To support old MSVC, a few of the commonly-used definitions are
+// provided here.  If more definitions are needed, add them here,
+// or replace these definitions with a complete implementation,
+// such as the ones available from Google, Boost, or MSVC10.  You
+// can prevent the definition of any of these types (in order to
+// use your own) by #defining CU_STDINT_TYPES_ALREADY_DEFINED.
+
+#if !defined(CU_STDINT_TYPES_ALREADY_DEFINED)
+
+// In VS including stdint.h forces the C++ runtime dep - provide an opt-out
+// (CU_STDINT_VS_FORCE_NO_STDINT_H) for users that care (notably static
+// cudart).
+#if defined(_MSC_VER) && ((_MSC_VER < 1600) || defined(CU_STDINT_VS_FORCE_NO_STDINT_H))
+
+// These definitions can be used with MSVC 8 and 9,
+// which don't ship with stdint.h:
+
+typedef unsigned   char   uint8_t;
+
+typedef            short  int16_t;
+typedef unsigned   short uint16_t;
+
+// To keep these consistent across all MSVC builds, define the types exactly as the MSVC
+// headers do. (The #else branch below is unreachable: the enclosing #if requires _MSC_VER.)
+#if defined(_MSC_VER)
+typedef signed     char    int8_t;
+
+typedef            int     int32_t;
+typedef unsigned   int     uint32_t;
+
+typedef long long          int64_t;
+typedef unsigned long long uint64_t;
+#else
+typedef            char    int8_t;
+
+typedef            long   int32_t;
+typedef unsigned   long  uint32_t;
+
+typedef          __int64  int64_t;
+typedef unsigned __int64 uint64_t;
+#endif
+
+#elif defined(__DJGPP__)
+
+// These definitions can be used when compiling
+// C code with DJGPP, which only provides stdint.h
+// when compiling C++ code with TR1 enabled.
+
+typedef               char    int8_t;
+typedef unsigned      char   uint8_t;
+
+typedef               short  int16_t;
+typedef unsigned      short uint16_t;
+
+typedef               long   int32_t;
+typedef unsigned      long  uint32_t;
+
+typedef          long long   int64_t;
+typedef unsigned long long  uint64_t;
+
+#else
+
+// Use standard headers, as specified by C99 and C++ TR1.
+// Known to be provided by:
+// - gcc/glibc, supported by all versions of glibc
+// - djgpp, supported since 2001
+// - MSVC, supported by Visual Studio 2010 and later
+
+#include <stdint.h>
+
+#endif
+
+#endif // !defined(CU_STDINT_TYPES_ALREADY_DEFINED)
+
+
+#endif // file guard
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_surface_types.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_surface_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a35c215668e98006c3eaa286deb70461eb1fa62
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_surface_types.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_SURFACE_TYPES_H__)
+#define __CUDA_SURFACE_TYPES_H__
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined(__CUDACC_RTC__)
+#define EXCLUDE_FROM_RTC
+#include "channel_descriptor.h"
+#undef EXCLUDE_FROM_RTC
+#endif /* !__CUDACC_RTC__ */
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#endif /* !__CUDA_SURFACE_TYPES_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_texture_types.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_texture_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..4f723db5c682a7b4b05491219df8993f0f6ebd59
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_texture_types.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_TEXTURE_TYPES_H__)
+#define __CUDA_TEXTURE_TYPES_H__
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined(__CUDACC_RTC__)
+#define EXCLUDE_FROM_RTC
+#include "channel_descriptor.h"
+#undef EXCLUDE_FROM_RTC
+#endif /* !__CUDACC_RTC__ */
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#endif /* !__CUDA_TEXTURE_TYPES_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_vdpau_interop.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_vdpau_interop.h
new file mode 100644
index 0000000000000000000000000000000000000000..2cf1ba357eb02ed82afc2f1812627a8a2d88c6f7
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_vdpau_interop.h
@@ -0,0 +1,201 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_VDPAU_INTEROP_H__)
+#define __CUDA_VDPAU_INTEROP_H__
+
+#include "cuda_runtime_api.h"
+
+#include <vdpau/vdpau.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * \addtogroup CUDART_VDPAU VDPAU Interoperability
+ * This section describes the VDPAU interoperability functions of the CUDA
+ * runtime application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Gets the CUDA device associated with a VdpDevice.
+ *
+ * Returns the CUDA device associated with a VdpDevice, if applicable.
+ *
+ * \param device - Returns the device associated with vdpDevice, or -1 if
+ * the device associated with vdpDevice is not a compute device.
+ * \param vdpDevice - A VdpDevice handle
+ * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaVDPAUSetVDPAUDevice,
+ * ::cuVDPAUGetDevice
+ */
+extern __host__ cudaError_t CUDARTAPI cudaVDPAUGetDevice(int *device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
+
+/**
+ * \brief Sets a CUDA device to use VDPAU interoperability
+ *
+ * Records \p vdpDevice as the VdpDevice for VDPAU interoperability 
+ * with the CUDA device \p device and sets \p device as the current 
+ * device for the calling host thread.
+ *
+ * This function will immediately initialize the primary context on 
+ * \p device if needed.
+ *
+ * If \p device has already been initialized then this call will fail 
+ * with the error ::cudaErrorSetOnActiveProcess.  In this case it is 
+ * necessary to reset \p device using ::cudaDeviceReset() before 
+ * VDPAU interoperability on \p device may be enabled.
+ *
+ * \param device - Device to use for VDPAU interoperability
+ * \param vdpDevice - The VdpDevice to interoperate with
+ * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorSetOnActiveProcess
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsVDPAURegisterVideoSurface,
+ * ::cudaGraphicsVDPAURegisterOutputSurface,
+ * ::cudaDeviceReset
+ */
+extern __host__ cudaError_t CUDARTAPI cudaVDPAUSetVDPAUDevice(int device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
+
+/**
+ * \brief Register a VdpVideoSurface object
+ *
+ * Registers the VdpVideoSurface specified by \p vdpSurface for access by CUDA.
+ * A handle to the registered object is returned as \p resource.
+ * The surface's intended usage is specified using \p flags, as follows:
+ *
+ * - ::cudaGraphicsMapFlagsNone: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::cudaGraphicsMapFlagsReadOnly: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::cudaGraphicsMapFlagsWriteDiscard: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * \param resource - Pointer to the returned object handle
+ * \param vdpSurface - VDPAU object to be registered
+ * \param flags - Map flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaVDPAUSetVDPAUDevice,
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsVDPAURegisterVideoSurface
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsVDPAURegisterVideoSurface(struct cudaGraphicsResource **resource, VdpVideoSurface vdpSurface, unsigned int flags);
+
+/**
+ * \brief Register a VdpOutputSurface object
+ *
+ * Registers the VdpOutputSurface specified by \p vdpSurface for access by CUDA.
+ * A handle to the registered object is returned as \p resource.
+ * The surface's intended usage is specified using \p flags, as follows:
+ *
+ * - ::cudaGraphicsMapFlagsNone: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::cudaGraphicsMapFlagsReadOnly: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::cudaGraphicsMapFlagsWriteDiscard: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * \param resource - Pointer to the returned object handle
+ * \param vdpSurface - VDPAU object to be registered
+ * \param flags - Map flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaVDPAUSetVDPAUDevice,
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsVDPAURegisterOutputSurface
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsVDPAURegisterOutputSurface(struct cudaGraphicsResource **resource, VdpOutputSurface vdpSurface, unsigned int flags);
+
+/** @} */ /* END CUDART_VDPAU */
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#endif /* __CUDA_VDPAU_INTEROP_H__ */
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudart_platform.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudart_platform.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f022bbe349eba2219a6b74f1ea315c1ce8551b7
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cudart_platform.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2016 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef __CUDART_PLATFORM_H__
+#define __CUDART_PLATFORM_H__
+
+#if ((defined(__linux__) || defined(__QNX__)) && (defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)))
+#define isEglSupported 1
+#endif
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti.h
new file mode 100644
index 0000000000000000000000000000000000000000..be316531dcfd846bcea8feadf3604437ce2447a1
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2010-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_H_)
+#define _CUPTI_H_
+
+#ifdef _WIN32
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#ifdef NOMINMAX
+#include <windows.h>
+#else
+#define NOMINMAX
+#include <windows.h>
+#undef NOMINMAX
+#endif
+#endif
+
+#include <cuda.h>
+#include <cupti_result.h>
+#include <cupti_version.h>
+
+/* Activity, callback, event and metric APIs */
+#include <cupti_activity.h>
+#include <cupti_callbacks.h>
+#include <cupti_events.h>
+#include <cupti_metrics.h>
+
+/* Runtime, driver, and nvtx function identifiers */
+#include <cupti_driver_cbid.h>
+#include <cupti_runtime_cbid.h>
+#include <cupti_nvtx_cbid.h>
+
+/* These types support function parameter structures for obsolete APIs.
+   See cuda.h for the actual definitions of these structures. */
+typedef unsigned int CUdeviceptr_v1;
+typedef struct CUDA_MEMCPY2D_v1_st { int dummy; } CUDA_MEMCPY2D_v1;
+typedef struct CUDA_MEMCPY3D_v1_st { int dummy; } CUDA_MEMCPY3D_v1;
+typedef struct CUDA_ARRAY_DESCRIPTOR_v1_st { int dummy; } CUDA_ARRAY_DESCRIPTOR_v1;
+typedef struct CUDA_ARRAY3D_DESCRIPTOR_v1_st { int dummy; } CUDA_ARRAY3D_DESCRIPTOR_v1;
+
+/* Function parameter structures */
+#include <generated_cuda_runtime_api_meta.h>
+#include <generated_cuda_meta.h>
+
+/* The following parameter structures cannot be included unless a
+   header that defines GL_VERSION is included before including them.
+   If these are needed then make sure such a header is included
+   already. */
+#ifdef GL_VERSION
+#include <generated_cuda_gl_interop_meta.h>
+#include <generated_cudaGL_meta.h>
+#endif
+
+//#include <generated_nvtx_meta.h>
+
+/* The following parameter structures cannot be included by default as
+   they are not guaranteed to be available on all systems. Uncomment
+   the includes that are available, or use the include explicitly. */
+#if defined(__linux__)
+//#include <generated_cuda_vdpau_interop_meta.h>
+//#include <generated_cudaVDPAU_meta.h>
+#endif
+
+#ifdef _WIN32
+//#include <generated_cuda_d3d9_interop_meta.h>
+//#include <generated_cuda_d3d10_interop_meta.h>
+//#include <generated_cuda_d3d11_interop_meta.h>
+//#include <generated_cudaD3D9_meta.h>
+//#include <generated_cudaD3D10_meta.h>
+//#include <generated_cudaD3D11_meta.h>
+#endif
+
+#endif /*_CUPTI_H_*/
+
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_activity.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_activity.h
new file mode 100644
index 0000000000000000000000000000000000000000..cdb6b76f8d66e986b20bd481fbeb0a12a791e5a5
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_activity.h
@@ -0,0 +1,8065 @@
+/*
+ * Copyright 2011-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_ACTIVITY_H_)
+#define _CUPTI_ACTIVITY_H_
+
+/**
+ * Deprecated APIs and structures have been moved to the
+ * header :doc: `cupti_activity_deprecated.h`, which is included at
+ * the bottom of this file. Header cupti_activity.h contains
+ * only the latest version of APIs and structures.
+ */
+
+#include <cuda.h>
+#include <cupti_callbacks.h>
+#include <cupti_events.h>
+#include <cupti_metrics.h>
+#include <cupti_result.h>
+
+#if defined(CUPTI_DIRECTIVE_SUPPORT)
+#include <Openacc/cupti_openacc.h>
+#include <Openmp/cupti_openmp.h>
+#endif
+
+#include <cupti_common.h>
+
+#define CUPTI_UNIFIED_MEMORY_CPU_DEVICE_ID ((uint32_t) 0xFFFFFFFFU)
+#define CUPTI_INVALID_CONTEXT_ID ((uint32_t) 0xFFFFFFFFU)
+#define CUPTI_INVALID_STREAM_ID ((uint32_t) 0xFFFFFFFFU)
+#define CUPTI_INVALID_CHANNEL_ID ((uint32_t) 0xFFFFFFFFU)
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+#define invalidNumaId ((uint32_t) 0xFFFFFFFF)
+
+/**
+ * \defgroup CUPTI_ACTIVITY_API CUPTI Activity API
+ * Functions, types, and enums that implement the CUPTI Activity API.
+ * @{
+ */
+
+/**
+ * \brief The kinds of activity records.
+ *
+ * Each activity record kind represents information about a GPU or an
+ * activity occurring on a CPU or GPU. Each kind is associated with an
+ * activity record structure that holds the information associated
+ * with the kind.
+ * \see CUpti_Activity
+ * \see CUpti_ActivityAPI
+ * \see CUpti_ActivityContext
+ * \see CUpti_ActivityContext2
+ * \see CUpti_ActivityContext3
+ * \see CUpti_ActivityDevice
+ * \see CUpti_ActivityDevice2
+ * \see CUpti_ActivityDevice3
+ * \see CUpti_ActivityDevice4
+ * \see CUpti_ActivityDeviceAttribute
+ * \see CUpti_ActivityEvent
+ * \see CUpti_ActivityEventInstance
+ * \see CUpti_ActivityKernel
+ * \see CUpti_ActivityKernel2
+ * \see CUpti_ActivityKernel3
+ * \see CUpti_ActivityKernel4
+ * \see CUpti_ActivityKernel5
+ * \see CUpti_ActivityKernel6
+ * \see CUpti_ActivityKernel7
+ * \see CUpti_ActivityKernel8
+ * \see CUpti_ActivityKernel9
+ * \see CUpti_ActivityCdpKernel
+ * \see CUpti_ActivityPreemption
+ * \see CUpti_ActivityMemcpy
+ * \see CUpti_ActivityMemcpy3
+ * \see CUpti_ActivityMemcpy4
+ * \see CUpti_ActivityMemcpy5
+ * \see CUpti_ActivityMemcpy6
+ * \see CUpti_ActivityMemcpyPtoP
+ * \see CUpti_ActivityMemcpyPtoP2
+ * \see CUpti_ActivityMemcpyPtoP3
+ * \see CUpti_ActivityMemcpyPtoP4
+ * \see CUpti_ActivityMemset
+ * \see CUpti_ActivityMemset2
+ * \see CUpti_ActivityMemset3
+ * \see CUpti_ActivityMemset4
+ * \see CUpti_ActivityMemory
+ * \see CUpti_ActivityMemory2
+ * \see CUpti_ActivityMemory3
+ * \see CUpti_ActivityMemory4
+ * \see CUpti_ActivityMemoryPool
+ * \see CUpti_ActivityMemoryPool2
+ * \see CUpti_ActivityMetric
+ * \see CUpti_ActivityMetricInstance
+ * \see CUpti_ActivityName
+ * \see CUpti_ActivityMarker
+ * \see CUpti_ActivityMarker2
+ * \see CUpti_ActivityMarkerData
+ * \see CUpti_ActivitySourceLocator
+ * \see CUpti_ActivityGlobalAccess
+ * \see CUpti_ActivityGlobalAccess2
+ * \see CUpti_ActivityGlobalAccess3
+ * \see CUpti_ActivityBranch
+ * \see CUpti_ActivityBranch2
+ * \see CUpti_ActivityOverhead3
+ * \see CUpti_ActivityEnvironment
+ * \see CUpti_ActivityInstructionExecution
+ * \see CUpti_ActivityUnifiedMemoryCounter
+ * \see CUpti_ActivityFunction
+ * \see CUpti_ActivityModule
+ * \see CUpti_ActivitySharedAccess
+ * \see CUpti_ActivityPCSampling
+ * \see CUpti_ActivityPCSampling2
+ * \see CUpti_ActivityPCSampling3
+ * \see CUpti_ActivityPCSamplingRecordInfo
+ * \see CUpti_ActivityCudaEvent2
+ * \see CUpti_ActivityStream
+ * \see CUpti_ActivitySynchronization2
+ * \see CUpti_ActivityInstructionCorrelation
+ * \see CUpti_ActivityExternalCorrelation
+ * \see CUpti_ActivityUnifiedMemoryCounter3
+ * \see CUpti_ActivityOpenAccData
+ * \see CUpti_ActivityOpenAccLaunch
+ * \see CUpti_ActivityOpenAccOther
+ * \see CUpti_ActivityOpenMp
+ * \see CUpti_ActivityNvLink
+ * \see CUpti_ActivityNvLink2
+ * \see CUpti_ActivityNvLink3
+ * \see CUpti_ActivityNvLink4
+ * \see CUpti_ActivityPcie
+ * \see CUpti_ActivityConfidentialComputeRotation
+ */
+
+typedef enum {
+  /**
+   * The activity record is invalid.
+   */
+  CUPTI_ACTIVITY_KIND_INVALID  = 0,
+
+  /**
+   * A host<->host, host<->device, or device<->device memory copy.
+   * For peer to peer memory copy, use the kind CUPTI_ACTIVITY_KIND_MEMCPY2.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityMemcpy6.
+   */
+  CUPTI_ACTIVITY_KIND_MEMCPY   = 1,
+
+  /**
+   * A memory set executing on the GPU. The corresponding activity
+   * record structure is \ref CUpti_ActivityMemset4.
+   */
+  CUPTI_ACTIVITY_KIND_MEMSET   = 2,
+
+  /**
+   * A kernel executing on the GPU. This activity kind may significantly change
+   * the overall performance characteristics of the application because all
+   * kernel executions are serialized on the GPU. The other kernel activity kind,
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL, doesn't break kernel concurrency.
+   * The corresponding activity record structure is \ref CUpti_ActivityKernel9.
+   */
+  CUPTI_ACTIVITY_KIND_KERNEL   = 3,
+
+  /**
+   * A CUDA driver API function execution. The corresponding activity
+   * record structure is \ref CUpti_ActivityAPI.
+   */
+  CUPTI_ACTIVITY_KIND_DRIVER   = 4,
+
+  /**
+   * A CUDA runtime API function execution. The corresponding activity
+   * record structure is \ref CUpti_ActivityAPI.
+   */
+  CUPTI_ACTIVITY_KIND_RUNTIME  = 5,
+
+  /**
+   * A performance counter (aka event) value. The corresponding activity record
+   * structure is \ref CUpti_ActivityEvent. This activity cannot be directly
+   * enabled or disabled. Information collected using the Event API
+   * can be stored in the corresponding activity record.
+   */
+  CUPTI_ACTIVITY_KIND_EVENT    = 6,
+
+  /**
+   * A performance metric value. The corresponding activity record structure is
+   * \ref CUpti_ActivityMetric. This activity cannot be directly
+   * enabled or disabled. Information collected using the Metric API
+   * can be stored in the corresponding activity record.
+   */
+  CUPTI_ACTIVITY_KIND_METRIC   = 7,
+
+  /**
+   * Information about a CUDA device. The corresponding activity record
+   * structure is \ref CUpti_ActivityDevice5.
+   */
+  CUPTI_ACTIVITY_KIND_DEVICE   = 8,
+
+  /**
+   * Information about a CUDA context. The corresponding activity record
+   * structure is \ref CUpti_ActivityContext3.
+   */
+  CUPTI_ACTIVITY_KIND_CONTEXT  = 9,
+
+  /**
+   * A kernel executing on the GPU. This activity kind doesn't break
+   * kernel concurrency. The corresponding activity record structure
+   * is \ref CUpti_ActivityKernel9.
+   */
+  CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL = 10,
+
+  /**
+   * Resource naming done via NVTX APIs for thread, device, context, etc.
+   * The corresponding activity record structure is \ref CUpti_ActivityName.
+   */
+  CUPTI_ACTIVITY_KIND_NAME     = 11,
+
+  /**
+   * Instantaneous, start, or end NVTX marker. The corresponding activity
+   * record structure is \ref CUpti_ActivityMarker2.
+   */
+  CUPTI_ACTIVITY_KIND_MARKER = 12,
+
+  /**
+   * Extended, optional, data about a NVTX marker. User must enable
+   * CUPTI_ACTIVITY_KIND_MARKER as well to get records for marker data.
+   * The corresponding activity record structure is \ref CUpti_ActivityMarkerData.
+   */
+  CUPTI_ACTIVITY_KIND_MARKER_DATA = 13,
+
+  /**
+   * Source information about source level result. The corresponding
+   * activity record structure is \ref CUpti_ActivitySourceLocator.
+   * In CUDA 12.6, this kind is deprecated for Volta and later GPU architectures
+   * in favor of SASS Metric APIs from the header cupti_sass_metrics.h.
+   */
+  CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR = 14,
+
+  /**
+   * Results for source-level global access. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityGlobalAccess3.
+   * In CUDA 12.6, this kind is deprecated for Volta and later GPU architectures
+   * in favor of SASS Metric APIs from the header cupti_sass_metrics.h.
+   */
+  CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS = 15,
+
+  /**
+   * Results for source-level branch. The corresponding
+   * activity record structure is \ref CUpti_ActivityBranch2.
+   * In CUDA 12.6, this kind is deprecated for Volta and later GPU architectures
+   * in favor of SASS Metric APIs from the header cupti_sass_metrics.h.
+   */
+  CUPTI_ACTIVITY_KIND_BRANCH = 16,
+
+  /**
+   * Overhead added by CUPTI, Compiler, CUDA driver etc. The
+   * corresponding activity record structure is
+   * \ref CUpti_ActivityOverhead3.
+   */
+  CUPTI_ACTIVITY_KIND_OVERHEAD = 17,
+
+  /**
+   * A CDP (CUDA Dynamic Parallel) kernel executing on the GPU. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityCdpKernel. This activity cannot be directly
+   * enabled or disabled. It is enabled and disabled through
+   * concurrent kernel activity, i.e. CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUPTI_ACTIVITY_KIND_CDP_KERNEL = 18,
+  /**
+   * Preemption activity record indicating a preemption of a CDP (CUDA
+   * Dynamic Parallel) kernel executing on the GPU. The corresponding
+   * activity record structure is \ref CUpti_ActivityPreemption.
+   */
+  CUPTI_ACTIVITY_KIND_PREEMPTION = 19,
+
+  /**
+   * Environment activity records indicating power, clock, thermal,
+   * etc. levels of the GPU. The corresponding activity record
+   * structure is \ref CUpti_ActivityEnvironment.
+   */
+  CUPTI_ACTIVITY_KIND_ENVIRONMENT = 20,
+
+  /**
+   * A performance counter value associated with a specific event domain
+   * instance. The corresponding activity record structure is \ref
+   * CUpti_ActivityEventInstance. This activity cannot be directly
+   * enabled or disabled. Information collected using the Event API
+   * can be stored in the corresponding activity record.
+   */
+  CUPTI_ACTIVITY_KIND_EVENT_INSTANCE = 21,
+
+  /**
+   * A peer to peer memory copy. The corresponding activity record
+   * structure is \ref CUpti_ActivityMemcpyPtoP4.
+   */
+  CUPTI_ACTIVITY_KIND_MEMCPY2 = 22,
+
+  /**
+   * A performance metric value associated with a specific metric domain
+   * instance. The corresponding activity record structure is \ref
+   * CUpti_ActivityMetricInstance. This activity cannot be directly
+   * enabled or disabled. Information collected using the Metric API
+   * can be stored in the corresponding activity record.
+   */
+  CUPTI_ACTIVITY_KIND_METRIC_INSTANCE = 23,
+
+  /**
+   * Results for source-level instruction execution.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstructionExecution.
+   * In CUDA 12.6, this kind is deprecated for Volta and later GPU architectures
+   * in favor of SASS Metric APIs from the header cupti_sass_metrics.h.
+   */
+  CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION = 24,
+
+  /**
+   * Unified Memory counter record. The corresponding activity
+   * record structure is \ref CUpti_ActivityUnifiedMemoryCounter3.
+   */
+  CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER = 25,
+
+  /**
+   * Device global/function record. The corresponding activity
+   * record structure is \ref CUpti_ActivityFunction.
+   */
+  CUPTI_ACTIVITY_KIND_FUNCTION = 26,
+
+  /**
+   * CUDA Module record. The corresponding activity
+   * record structure is \ref CUpti_ActivityModule.
+   * This activity cannot be directly enabled or disabled.
+   * Information collected using the module callback can
+   * be stored in the corresponding activity record.
+   */
+  CUPTI_ACTIVITY_KIND_MODULE = 27,
+
+  /**
+   * A device attribute value. The corresponding activity record
+   * structure is \ref CUpti_ActivityDeviceAttribute.
+   * This activity cannot be directly enabled or disabled.
+   * Information collected using attributes CUpti_DeviceAttribute
+   * or CUdevice_attribute can be stored in the corresponding activity record.
+   */
+  CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE   = 28,
+
+  /**
+   * Results for source-level shared access. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivitySharedAccess.
+   * In CUDA 12.6, this kind is deprecated for Volta and later GPU architectures
+   * in favor of SASS Metric APIs from the header cupti_sass_metrics.h.
+   */
+  CUPTI_ACTIVITY_KIND_SHARED_ACCESS = 29,
+
+  /**
+   * PC sampling information for kernels. This will serialize
+   * kernels. The corresponding activity record structure
+   * is \ref CUpti_ActivityPCSampling3. In CUDA 12.5, this kind
+   * is deprecated for Volta and later GPU architectures in favor
+   * of PC Sampling APIs from the header cupti_pcsampling.h which
+   * allows concurrent kernel execution.
+   */
+  CUPTI_ACTIVITY_KIND_PC_SAMPLING = 30,
+
+  /**
+   * Summary information about PC sampling records. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityPCSamplingRecordInfo. In CUDA 12.5, this kind
+   * is deprecated for Volta and later GPU architectures in favor
+   * of PC Sampling APIs from the header cupti_pcsampling.h.
+   */
+  CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO = 31,
+
+  /**
+   * SASS/Source line-by-line correlation record.
+   * This will generate sass/source correlation for functions that have source
+   * level analysis or pc sampling results. The records will be generated only
+   * when either of source level analysis or pc sampling activity is enabled.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstructionCorrelation.
+   * In CUDA 12.6, this kind is deprecated for Volta and later GPU architectures
+   * in favor of SASS Metric APIs from the header cupti_sass_metrics.h.
+   */
+  CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION = 32,
+
+  /**
+   * OpenACC data events.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityOpenAccData.
+   */
+  CUPTI_ACTIVITY_KIND_OPENACC_DATA = 33,
+
+  /**
+   * OpenACC launch events.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityOpenAccLaunch.
+   */
+  CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH = 34,
+
+  /**
+   * OpenACC other events.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityOpenAccOther.
+   */
+  CUPTI_ACTIVITY_KIND_OPENACC_OTHER = 35,
+
+  /**
+   * Information about a CUDA event (cudaEvent). This activity cannot be
+   * directly enabled or disabled. It is enabled and disabled through
+   * the activity CUPTI_ACTIVITY_KIND_SYNCHRONIZATION.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityCudaEvent2.
+   */
+  CUPTI_ACTIVITY_KIND_CUDA_EVENT = 36,
+
+  /**
+   * Information about a CUDA stream. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityStream.
+   */
+  CUPTI_ACTIVITY_KIND_STREAM = 37,
+
+  /**
+   * Records for CUDA synchronization primitives. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivitySynchronization2.
+   */
+  CUPTI_ACTIVITY_KIND_SYNCHRONIZATION = 38,
+
+  /**
+   * Records for correlation of different programming APIs. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityExternalCorrelation.
+   */
+  CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION = 39,
+
+  /**
+   * NVLink topology information.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityNvLink4.
+   */
+  CUPTI_ACTIVITY_KIND_NVLINK = 40,
+
+  /**
+   * Instantaneous Event information.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstantaneousEvent.
+   * This activity cannot be directly enabled or disabled.
+   * Information collected using the Event API can be stored
+   * in the corresponding activity record.
+   */
+  CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT = 41,
+
+  /**
+   * Instantaneous Event information for a specific event
+   * domain instance.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstantaneousEventInstance.
+   * This activity cannot be directly enabled or disabled.
+   * Information collected using the Event API can be stored
+   * in the corresponding activity record.
+   */
+  CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE = 42,
+
+  /**
+   * Instantaneous Metric information.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstantaneousMetric.
+   * This activity cannot be directly enabled or disabled.
+   * Information collected using the Metric API can be stored
+   * in the corresponding activity record.
+   */
+  CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC = 43,
+
+  /**
+   * Instantaneous Metric information for a specific metric
+   * domain instance.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstantaneousMetricInstance.
+   * This activity cannot be directly enabled or disabled.
+   * Information collected using the Metric API can be stored
+   * in the corresponding activity record.
+   */
+  CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE = 44,
+
+  /**
+   * Memory activity tracking allocation and freeing of the memory.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityMemory.
+   */
+  CUPTI_ACTIVITY_KIND_MEMORY = 45,
+
+  /**
+   * PCI devices information used for PCI topology.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityPcie.
+   */
+  CUPTI_ACTIVITY_KIND_PCIE = 46,
+
+  /**
+   * OpenMP parallel events.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityOpenMp.
+   */
+  CUPTI_ACTIVITY_KIND_OPENMP = 47,
+
+  /**
+   * A CUDA driver kernel launch occurring outside of any
+   * public API function execution. Tools can handle these
+   * like records for driver API launch functions, although
+   * the cbid field is not used here.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityAPI.
+   */
+  CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API = 48,
+
+  /**
+   * Memory activity tracking allocation and freeing of the memory.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityMemory4.
+   */
+  CUPTI_ACTIVITY_KIND_MEMORY2 = 49,
+
+  /**
+   * Memory pool activity tracking creation, destruction and
+   * trimming of the memory pool.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityMemoryPool2.
+   */
+  CUPTI_ACTIVITY_KIND_MEMORY_POOL = 50,
+
+  /**
+   * Activity record for graph-level information.
+   * The corresponding activity record structure is
+   * \ref CUpti_ActivityGraphTrace2.
+   */
+  CUPTI_ACTIVITY_KIND_GRAPH_TRACE = 51,
+
+  /**
+   * JIT (Just-in-time) operation tracking.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityJit.
+   */
+  CUPTI_ACTIVITY_KIND_JIT = 52,
+
+  /**
+   * This activity cannot be directly enabled or disabled.
+   * It is enabled when CUPTI_ACTIVITY_KIND_GRAPH_TRACE is enabled
+   * and device graph trace is enabled through API cuptiActivityEnableDeviceGraph().
+   * The corresponding activity record structure is
+   * \ref CUpti_ActivityDeviceGraphTrace.
+   */
+  CUPTI_ACTIVITY_KIND_DEVICE_GRAPH_TRACE = 53,
+
+  /**
+   * Tracing batches of copies that are to be decompressed.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityMemDecompress.
+   */
+  CUPTI_ACTIVITY_KIND_MEM_DECOMPRESS = 54,
+
+
+
+  /**
+   * Count of supported activity kinds; implicitly one past the last kind above.
+   */
+  CUPTI_ACTIVITY_KIND_COUNT,
+
+  CUPTI_ACTIVITY_KIND_FORCE_INT     = 0x7fffffff
+} CUpti_ActivityKind;
+
+/**
+ * \brief The kinds of activity objects (process, thread, device, context, stream).
+ * \see CUpti_ActivityObjectKindId
+ */
+typedef enum {
+  /**
+   * The object kind is not known.
+   */
+  CUPTI_ACTIVITY_OBJECT_UNKNOWN  = 0,
+
+  /**
+   * A process.
+   */
+  CUPTI_ACTIVITY_OBJECT_PROCESS  = 1,
+
+  /**
+   * A thread.
+   */
+  CUPTI_ACTIVITY_OBJECT_THREAD   = 2,
+
+  /**
+   * A device.
+   */
+  CUPTI_ACTIVITY_OBJECT_DEVICE   = 3,
+
+  /**
+   * A context.
+   */
+  CUPTI_ACTIVITY_OBJECT_CONTEXT  = 4,
+
+  /**
+   * A stream.
+   */
+  CUPTI_ACTIVITY_OBJECT_STREAM   = 5,
+
+  CUPTI_ACTIVITY_OBJECT_FORCE_INT = 0x7fffffff
+} CUpti_ActivityObjectKind;
+
+/**
+ * \brief Identifiers for object kinds as specified by
+ * CUpti_ActivityObjectKind.
+ * \see CUpti_ActivityObjectKind
+ */
+typedef union {
+  /**
+   * A process object requires that we identify the process ID. A
+   * thread object requires that we identify both the process and
+   * thread ID.
+   */
+  struct {
+    uint32_t processId;
+    uint32_t threadId;
+  } pt; /* process/thread identifiers */
+
+  /**
+   * A device object requires that we identify the device ID. A
+   * context object requires that we identify both the device and
+   * context ID. A stream object requires that we identify device,
+   * context, and stream ID.
+   */
+  struct {
+    uint32_t deviceId;
+    uint32_t contextId;
+    uint32_t streamId;
+  } dcs; /* device/context/stream identifiers */
+} CUpti_ActivityObjectKindId;
+
+/**
+ * \brief Additional data provided for CUPTI_ACTIVITY_OVERHEAD_COMMAND_BUFFER_FULL.
+ */
+typedef struct {
+  /**
+   * The remaining space in the command buffer. This field will always be zero
+   * when the command buffer is full, so it carries no information in such cases.
+   * NOTE(review): the unit of this length is not stated here - confirm.
+   */
+  uint32_t commandBufferLength;
+  /**
+   * The channel ID of the command buffer.
+   *
+   */
+  uint32_t channelID;
+  /**
+   * The channel type of the command buffer.
+   *
+   */
+  uint32_t channelType;
+} CUpti_ActivityOverheadCommandBufferFullData;
+
+/**
+ * \brief The kinds of activity overhead.
+ */
+typedef enum {
+  /**
+   * The overhead kind is not known.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_UNKNOWN               = 0,
+
+  /**
+   * Compiler overhead.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_DRIVER_COMPILER       = 1,
+
+  /**
+   * Activity buffer flush overhead.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH               = 1<<16,
+
+  /**
+   * CUPTI instrumentation overhead.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION            = 2<<16,
+
+  /**
+   * CUPTI resource creation and destruction overhead.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_CUPTI_RESOURCE                   = 3<<16,
+
+  /**
+   * CUDA Runtime triggered module loading overhead.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_RUNTIME_TRIGGERED_MODULE_LOADING = 4<<16,
+
+  /**
+   * Lazy function loading overhead.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_LAZY_FUNCTION_LOADING            = 5<<16,
+
+  /**
+   * Overhead due to lack of command buffer space.
+   * Refer to CUpti_ActivityOverheadCommandBufferFullData for more details.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_COMMAND_BUFFER_FULL              = 6<<16,
+
+  /**
+   * Overhead due to activity buffer request.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_ACTIVITY_BUFFER_REQUEST          = 7<<16,
+
+  /**
+   * Overhead due to UVM activity initialization.
+   */
+   CUPTI_ACTIVITY_OVERHEAD_UVM_ACTIVITY_INIT                = 8<<16,
+
+  CUPTI_ACTIVITY_OVERHEAD_FORCE_INT             = 0x7fffffff
+} CUpti_ActivityOverheadKind;
+
+/**
+ * \brief The kind of a compute API.
+ */
+typedef enum {
+  /**
+   * The compute API is not known.
+   */
+  CUPTI_ACTIVITY_COMPUTE_API_UNKNOWN    = 0,
+
+  /**
+   * The compute APIs are for CUDA.
+   */
+  CUPTI_ACTIVITY_COMPUTE_API_CUDA       = 1,
+
+  /**
+   * The compute APIs are for CUDA running
+   * in an MPS (Multi-Process Service) environment.
+   */
+  CUPTI_ACTIVITY_COMPUTE_API_CUDA_MPS   = 2,
+
+  CUPTI_ACTIVITY_COMPUTE_API_FORCE_INT  = 0x7fffffff
+} CUpti_ActivityComputeApiKind;
+
+/**
+ * \brief Flags associated with activity records.
+ *
+ * Activity record flags. Flags can be combined by bitwise OR to associate
+ * multiple flags with an activity record. Each flag is specific to a certain
+ * activity kind, as noted below; flags for different kinds reuse bit values.
+ */
+typedef enum {
+  /**
+   * Indicates the activity record has no flags.
+   */
+  CUPTI_ACTIVITY_FLAG_NONE          = 0,
+
+  /**
+   * Indicates the activity represents a device that supports
+   * concurrent kernel execution. Valid for
+   * CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUPTI_ACTIVITY_FLAG_DEVICE_CONCURRENT_KERNELS  = 1 << 0,
+
+  /**
+   * Indicates if the activity represents a CUdevice_attribute value
+   * or a CUpti_DeviceAttribute value. Valid for
+   * CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE.
+   */
+  CUPTI_ACTIVITY_FLAG_DEVICE_ATTRIBUTE_CUDEVICE  = 1 << 0,
+
+  /**
+   * Indicates the activity represents an asynchronous memcpy
+   * operation. Valid for CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC  = 1 << 0,
+
+  /**
+   * Indicates the activity represents an instantaneous marker. Valid
+   * for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_INSTANTANEOUS  = 1 << 0,
+
+  /**
+   * Indicates the activity represents a region start marker. Valid
+   * for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_START  = 1 << 1,
+
+  /**
+   * Indicates the activity represents a region end marker. Valid for
+   * CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_END  = 1 << 2,
+
+  /**
+   * Indicates the activity represents an attempt to acquire a user
+   * defined synchronization object.
+   * Valid for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE = 1 << 3,
+
+  /**
+   * Indicates the activity represents success in acquiring the
+   * user defined synchronization object.
+   * Valid for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE_SUCCESS = 1 << 4,
+
+  /**
+   * Indicates the activity represents failure in acquiring the
+   * user defined synchronization object.
+   * Valid for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE_FAILED = 1 << 5,
+
+  /**
+   * Indicates the activity represents releasing a reservation on
+   * user defined synchronization object.
+   * Valid for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_SYNC_RELEASE = 1 << 6,
+
+  /**
+   * Indicates the activity represents a marker that does not specify
+   * a color. Valid for CUPTI_ACTIVITY_KIND_MARKER_DATA.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_COLOR_NONE  = 1 << 0,
+
+  /**
+   * Indicates the activity represents a marker that specifies a color
+   * in alpha-red-green-blue format. Valid for
+   * CUPTI_ACTIVITY_KIND_MARKER_DATA.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_COLOR_ARGB  = 1 << 1,
+
+  /**
+   * The number of bytes requested by each thread.
+   * Valid for CUpti_ActivityGlobalAccess3.
+   */
+  CUPTI_ACTIVITY_FLAG_GLOBAL_ACCESS_KIND_SIZE_MASK  = 0xFF << 0,
+
+  /**
+   * If this bit in flag is set, the access was a load, else it is a
+   * store access. Valid for CUpti_ActivityGlobalAccess3.
+   */
+  CUPTI_ACTIVITY_FLAG_GLOBAL_ACCESS_KIND_LOAD       = 1 << 8,
+
+  /**
+   * If this bit in flag is set, the load access was cached else it is
+   * uncached. Valid for CUpti_ActivityGlobalAccess3.
+   */
+  CUPTI_ACTIVITY_FLAG_GLOBAL_ACCESS_KIND_CACHED     = 1 << 9,
+
+  /**
+   * If this bit in flag is set, the metric value overflowed. Valid
+   * for CUpti_ActivityMetric and CUpti_ActivityMetricInstance.
+   */
+  CUPTI_ACTIVITY_FLAG_METRIC_OVERFLOWED     = 1 << 0,
+
+  /**
+   * If this bit in flag is set, the metric value couldn't be
+   * calculated. This occurs when a value(s) required to calculate the
+   * metric is missing.  Valid for CUpti_ActivityMetric and
+   * CUpti_ActivityMetricInstance.
+   */
+  CUPTI_ACTIVITY_FLAG_METRIC_VALUE_INVALID  = 1 << 1,
+
+  /**
+   * If this bit in flag is set, the source level metric value couldn't be
+   * calculated. This occurs when a value(s) required to calculate the
+   * source level metric cannot be evaluated.
+   * Valid for CUpti_ActivityInstructionExecution.
+   */
+  CUPTI_ACTIVITY_FLAG_INSTRUCTION_VALUE_INVALID  = 1 << 0,
+
+  /**
+   * The mask for the instruction class, \ref CUpti_ActivityInstructionClass.
+   * Valid for CUpti_ActivityInstructionExecution and
+   * CUpti_ActivityInstructionCorrelation.
+   */
+  CUPTI_ACTIVITY_FLAG_INSTRUCTION_CLASS_MASK    = 0xFF << 1,
+
+  /**
+   * When calling cuptiActivityFlushAll, this flag
+   * can be set to force CUPTI to flush all records in the buffer, whether
+   * finished or not.
+   */
+  CUPTI_ACTIVITY_FLAG_FLUSH_FORCED = 1 << 0,
+
+  /**
+   * The number of bytes requested by each thread.
+   * Valid for CUpti_ActivitySharedAccess.
+   */
+  CUPTI_ACTIVITY_FLAG_SHARED_ACCESS_KIND_SIZE_MASK  = 0xFF << 0,
+
+  /**
+   * If this bit in flag is set, the access was a load, else it is a
+   * store access.  Valid for CUpti_ActivitySharedAccess.
+   */
+  CUPTI_ACTIVITY_FLAG_SHARED_ACCESS_KIND_LOAD       = 1 << 8,
+
+  /**
+   * Indicates the activity represents an asynchronous memset
+   * operation. Valid for CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUPTI_ACTIVITY_FLAG_MEMSET_ASYNC  = 1 << 0,
+
+  /**
+   * Indicates the activity represents thrashing in CPU.
+   * Valid for counter of kind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING in
+   * CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER.
+   */
+  CUPTI_ACTIVITY_FLAG_THRASHING_IN_CPU = 1 << 0,
+
+  /**
+   * Indicates the activity represents page throttling in CPU.
+   * Valid for counter of kind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING in
+   * CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER.
+   */
+  CUPTI_ACTIVITY_FLAG_THROTTLING_IN_CPU = 1 << 0,
+
+  CUPTI_ACTIVITY_FLAG_FORCE_INT = 0x7fffffff
+} CUpti_ActivityFlag;
+
+/**
+ * \brief The stall reason for PC sampling activity.
+ */
+typedef enum {
+  /**
+   * Invalid reason.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_INVALID      = 0,
+
+  /**
+   * No stall; the instruction is selected for issue.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_NONE         = 1,
+
+  /**
+   * Warp is blocked because the next instruction is not yet available,
+   * because of an instruction cache miss, or because of branching effects.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_INST_FETCH   = 2,
+
+  /**
+   * Instruction is waiting on an arithmetic dependency.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_EXEC_DEPENDENCY   = 3,
+
+  /**
+   * Warp is blocked because it is waiting for a memory access to complete.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_MEMORY_DEPENDENCY   = 4,
+
+  /**
+   * Texture sub-system is fully utilized or has too many outstanding requests.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_TEXTURE   = 5,
+
+  /**
+   * Warp is blocked as it is waiting at __syncthreads() or at a memory barrier.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_SYNC   = 6,
+
+  /**
+   * Warp is blocked waiting for __constant__ memory and immediate memory access to complete.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_CONSTANT_MEMORY_DEPENDENCY   = 7,
+
+  /**
+   * Compute operation cannot be performed due to the required resources not
+   * being available.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_PIPE_BUSY   = 8,
+
+  /**
+   * Warp is blocked because there are too many pending memory operations.
+   * On the Kepler architecture this often indicates a high number of memory replays.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_MEMORY_THROTTLE   = 9,
+
+  /**
+   * Warp was ready to issue, but some other warp issued instead.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_NOT_SELECTED   = 10,
+
+  /**
+   * Miscellaneous reasons.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_OTHER   = 11,
+
+  /**
+   * Sleeping.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_SLEEPING   = 12,
+
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_FORCE_INT  = 0x7fffffff
+} CUpti_ActivityPCSamplingStallReason;
+
+/**
+ * \brief Sampling period for the PC sampling method.
+ *
+ * The sampling period can be set using \ref cuptiActivityConfigurePCSampling.
+ */
+typedef enum {
+  /**
+   * The PC sampling period is not set.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_INVALID = 0,
+
+  /**
+   * Minimum sampling period available on the device.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_MIN = 1,
+
+  /**
+   * Sampling period in the lower range.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_LOW = 2,
+
+  /**
+   * Medium sampling period.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_MID = 3,
+
+  /**
+   * Sampling period in the higher range.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_HIGH = 4,
+
+  /**
+   * Maximum sampling period available on the device.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_MAX = 5,
+
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_FORCE_INT = 0x7fffffff
+} CUpti_ActivityPCSamplingPeriod;
+
+/**
+ * \brief The kind of a memory copy, indicating the source and
+ * destination targets of the copy.
+ *
+ * Each kind represents the source and destination targets of a memory
+ * copy. Targets are host, device, and array.
+ */
+typedef enum {
+  /**
+   * The memory copy kind is not known.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN = 0,
+
+  /**
+   * A host-to-device memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_HTOD    = 1,
+
+  /**
+   * A device-to-host memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_DTOH    = 2,
+
+  /**
+   * A host-to-device-array memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_HTOA    = 3,
+
+  /**
+   * A device-array-to-host memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_ATOH    = 4,
+
+  /**
+   * A device-array-to-device-array memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_ATOA    = 5,
+
+  /**
+   * A device-array-to-device memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_ATOD    = 6,
+
+  /**
+   * A device-to-device-array memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_DTOA    = 7,
+
+  /**
+   * A device-to-device memory copy on the same device.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_DTOD    = 8,
+
+  /**
+   * A host-to-host memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_HTOH    = 9,
+
+  /**
+   * A peer-to-peer memory copy across different devices.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_PTOP    = 10,
+
+  CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT = 0x7fffffff
+} CUpti_ActivityMemcpyKind;
+
+/**
+ * \brief The kinds of memory accessed by a memory operation/copy.
+ *
+ * Each kind represents the type of the memory
+ * accessed by a memory operation/copy.
+ */
+typedef enum {
+  /**
+   * The memory kind is unknown.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN            = 0,
+
+  /**
+   * The memory is pageable.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE           = 1,
+
+  /**
+   * The memory is pinned.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_PINNED             = 2,
+
+  /**
+   * The memory is on the device.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_DEVICE             = 3,
+
+  /**
+   * The memory is an array.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_ARRAY              = 4,
+
+  /**
+   * The memory is managed.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_MANAGED            = 5,
+
+  /**
+   * The memory is device static.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_DEVICE_STATIC      = 6,
+
+  /**
+   * The memory is managed static.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_MANAGED_STATIC     = 7,
+
+  CUPTI_ACTIVITY_MEMORY_KIND_FORCE_INT          = 0x7fffffff
+} CUpti_ActivityMemoryKind;
+
+/**
+ * \brief The kind of a preemption activity.
+ */
+typedef enum {
+  /**
+   * The preemption kind is not known.
+   */
+  CUPTI_ACTIVITY_PREEMPTION_KIND_UNKNOWN    = 0,
+
+  /**
+   * Preemption to save a CDP block.
+   */
+  CUPTI_ACTIVITY_PREEMPTION_KIND_SAVE       = 1,
+
+  /**
+   * Preemption to restore a CDP block.
+   */
+  CUPTI_ACTIVITY_PREEMPTION_KIND_RESTORE    = 2,
+
+  CUPTI_ACTIVITY_PREEMPTION_KIND_FORCE_INT  = 0x7fffffff
+} CUpti_ActivityPreemptionKind;
+
+/**
+ * \brief The kind of environment data. Used to indicate what type of
+ * data is being reported by an environment activity record.
+ */
+typedef enum {
+  /**
+   * Unknown data.
+   */
+  CUPTI_ACTIVITY_ENVIRONMENT_UNKNOWN = 0,
+
+  /**
+   * The environment data is related to speed.
+   */
+  CUPTI_ACTIVITY_ENVIRONMENT_SPEED = 1,
+
+  /**
+   * The environment data is related to temperature.
+   */
+  CUPTI_ACTIVITY_ENVIRONMENT_TEMPERATURE = 2,
+
+  /**
+   * The environment data is related to power.
+   */
+  CUPTI_ACTIVITY_ENVIRONMENT_POWER = 3,
+
+  /**
+   * The environment data is related to cooling.
+   */
+  CUPTI_ACTIVITY_ENVIRONMENT_COOLING = 4,
+
+  CUPTI_ACTIVITY_ENVIRONMENT_COUNT, /* no explicit value: implicitly 5, one more than COOLING */
+
+  CUPTI_ACTIVITY_ENVIRONMENT_KIND_FORCE_INT    = 0x7fffffff
+} CUpti_ActivityEnvironmentKind;
+
+/**
+ * \brief Reasons for clock throttling.
+ *
+ * The possible reasons that a clock can be throttled. There can be
+ * more than one reason that a clock is being throttled, so these values
+ * can be combined by bitwise OR. These are used in the
+ * clocksThrottleReason field in the Environment Activity Record.
+ */
+typedef enum {
+  /**
+   * Nothing is running on the GPU and the clocks are dropping to idle
+   * state.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_GPU_IDLE              = 0x00000001,
+
+  /**
+   * The GPU clocks are limited by a user specified limit.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_USER_DEFINED_CLOCKS   = 0x00000002,
+
+  /**
+   * A software power scaling algorithm is reducing the clocks below
+   * requested clocks.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_SW_POWER_CAP          = 0x00000004,
+
+  /**
+   * Hardware slowdown to reduce the clock by a factor of two or more
+   * is engaged.  This is an indicator of one of the following: 1)
+   * Temperature is too high, 2) External power brake assertion is
+   * being triggered (e.g. by the system power supply), 3) Change in
+   * power state.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN           = 0x00000008,
+
+  /**
+   * Some unspecified factor is reducing the clocks.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_UNKNOWN               = 0x80000000,
+
+  /**
+   * Throttle reason is not supported for this GPU.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_UNSUPPORTED           = 0x40000000,
+
+  /**
+   * No clock throttling.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_NONE                  = 0x00000000,
+
+  CUPTI_CLOCKS_THROTTLE_REASON_FORCE_INT             = 0x7fffffff
+} CUpti_EnvironmentClocksThrottleReason;
+
+/**
+ * \brief Scope of the unified memory counter (deprecated in CUDA 7.0).
+ */
+typedef enum {
+  /**
+   * The unified memory counter scope is not known.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_UNKNOWN = 0,
+
+  /**
+   * Collect the unified memory counter for a single process on one device.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE = 1,
+
+  /**
+   * Collect the unified memory counter for a single process across all devices.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_ALL_DEVICES = 2,
+
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_COUNT, /* no explicit value: implicitly 3 */
+
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_FORCE_INT = 0x7fffffff
+} CUpti_ActivityUnifiedMemoryCounterScope;
+
+/**
+ * \brief Kind of the Unified Memory counter.
+ *
+ * Many activities are associated with Unified Memory mechanism; among them
+ * are transfers from host to device, device to host, page fault at
+ * host side.
+ */
+typedef enum {
+  /**
+   * The unified memory counter kind is not known.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_UNKNOWN = 0,
+
+  /**
+   * Number of bytes transferred from host to device.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD = 1,
+
+  /**
+   * Number of bytes transferred from device to host.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH = 2,
+
+  /**
+   * Number of CPU page faults. This is only supported on 64-bit
+   * Linux and Mac platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT = 3,
+
+  /**
+   * Number of GPU page faults. This is only supported on devices with
+   * compute capability 6.0 and higher and 64-bit Linux platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT = 4,
+
+  /**
+   * Thrashing occurs when data is frequently accessed by
+   * multiple processors and has to be constantly migrated around
+   * to achieve data locality. In this case the overhead of migration
+   * may exceed the benefits of locality.
+   * This is only supported on 64-bit Linux platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING = 5,
+
+  /**
+   * Throttling is a prevention technique used by the driver to avoid
+   * further thrashing. Here, the driver doesn't service the fault for
+   * one of the contending processors for a specific period of time,
+   * so that the other processor can run at full-speed.
+   * This is only supported on 64-bit Linux platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING = 6,
+
+  /**
+   * In case throttling does not help, the driver tries to pin the memory
+   * to a processor for a specific period of time. One of the contending
+   * processors will have slow access to the memory, while the other will
+   * have fast access.
+   * This is only supported on 64-bit Linux platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP = 7,
+
+  /**
+   * Number of bytes transferred from one device to another device.
+   * This is only supported on 64-bit Linux platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOD = 8,
+
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_COUNT, /* no explicit value: implicitly 9 */
+
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_FORCE_INT = 0x7fffffff
+} CUpti_ActivityUnifiedMemoryCounterKind;
+
+/**
+ * \brief Memory access type for unified memory page faults.
+ *
+ * This is valid for \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT
+ * and \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT
+ */
+typedef enum {
+  /**
+   * The unified memory access type is not known.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_UNKNOWN = 0,
+
+  /**
+   * The page fault was triggered by a read memory instruction.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_READ = 1,
+
+  /**
+   * The page fault was triggered by a write memory instruction.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_WRITE = 2,
+
+  /**
+   * The page fault was triggered by an atomic memory instruction.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_ATOMIC = 3,
+
+  /**
+   * The page fault was triggered by a memory prefetch operation.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_PREFETCH = 4
+} CUpti_ActivityUnifiedMemoryAccessType;
+
+/**
+ * \brief Migration cause of the Unified Memory counter.
+ *
+ * This is valid for \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD and
+ * \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH
+ */
+typedef enum {
+  /**
+   * The unified memory migration cause is not known.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_UNKNOWN = 0,
+
+  /**
+   * The unified memory migrated due to an explicit call from
+   * the user, e.g. cudaMemPrefetchAsync.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_USER = 1,
+
+  /**
+   * The unified memory migrated to guarantee data coherence,
+   * e.g. CPU/GPU faults on Pascal+ and kernel launch on pre-Pascal GPUs.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_COHERENCE = 2,
+
+  /**
+   * The unified memory was speculatively migrated by the UVM driver
+   * before being accessed by the destination processor to improve
+   * performance.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_PREFETCH = 3,
+
+  /**
+   * The unified memory migrated to the CPU because it was evicted to make
+   * room for another block of memory on the GPU.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_EVICTION = 4,
+
+  /**
+   * The unified memory migrated to another processor because of access counter
+   * notifications. Only frequently accessed pages are migrated between CPU and GPU, or
+   * between peer GPUs.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_ACCESS_COUNTERS = 5,
+} CUpti_ActivityUnifiedMemoryMigrationCause;
+
+/**
+ * \brief Remote memory map cause of the Unified Memory counter.
+ *
+ * This is valid for \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP
+ */
+typedef enum {
+  /**
+   * The cause of mapping to remote memory was unknown.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_UNKNOWN = 0,
+
+  /**
+   * Mapping to remote memory was added to maintain data coherence.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_COHERENCE = 1,
+
+  /**
+   * Mapping to remote memory was added to prevent further thrashing.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_THRASHING = 2,
+
+  /**
+   * Mapping to remote memory was added to enforce the hints
+   * specified by the programmer or by performance heuristics of the
+   * UVM driver.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_POLICY = 3,
+
+  /**
+   * Mapping to remote memory was added because there is no more
+   * memory available on the processor and eviction was not
+   * possible.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_OUT_OF_MEMORY = 4,
+
+  /**
+   * Mapping to remote memory was added after the memory was
+   * evicted to make room for another block of memory on the GPU.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_EVICTION = 5,
+} CUpti_ActivityUnifiedMemoryRemoteMapCause;
+
+/**
+ * \brief SASS instruction classification.
+ *
+ * SASS instructions are broadly divided into different classes. Each enumerator represents one class.
+ */
+typedef enum {
+  /**
+   * The instruction class is not known.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_UNKNOWN = 0,
+
+  /**
+   * Represents a 32-bit floating point operation.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_FP_32 = 1,
+
+  /**
+   * Represents a 64-bit floating point operation.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_FP_64 = 2,
+
+  /**
+   * Represents an integer operation.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_INTEGER = 3,
+
+  /**
+   * Represents a bit conversion operation.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_BIT_CONVERSION = 4,
+
+  /**
+   * Represents a control flow instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_CONTROL_FLOW = 5,
+
+  /**
+   * Represents a global load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_GLOBAL = 6,
+
+  /**
+   * Represents a shared load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_SHARED = 7,
+
+  /**
+   * Represents a local load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_LOCAL = 8,
+
+  /**
+   * Represents a generic load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_GENERIC = 9,
+
+  /**
+   * Represents a surface load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_SURFACE = 10,
+
+  /**
+   * Represents a constant load instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_CONSTANT = 11,
+
+  /**
+   * Represents a texture load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_TEXTURE = 12,
+
+  /**
+   * Represents a global atomic instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_GLOBAL_ATOMIC = 13,
+
+  /**
+   * Represents a shared atomic instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_SHARED_ATOMIC = 14,
+
+  /**
+   * Represents a surface atomic instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_SURFACE_ATOMIC = 15,
+
+  /**
+   * Represents an inter-thread communication instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_INTER_THREAD_COMMUNICATION = 16,
+
+  /**
+   * Represents a barrier instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_BARRIER = 17,
+
+  /**
+   * Represents some miscellaneous instructions which do not fit in the above classification.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_MISCELLANEOUS = 18,
+
+  /**
+   * Represents a 16-bit floating point operation.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_FP_16 = 19,
+
+  /**
+   * Represents a uniform instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_UNIFORM = 20,
+
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_KIND_FORCE_INT     = 0x7fffffff
+} CUpti_ActivityInstructionClass;
+
+/**
+ * \brief Partitioned global caching option.
+ */
+typedef enum {
+  /**
+   * Partitioned global cache config is unknown.
+   */
+  CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_UNKNOWN       = 0,
+
+  /**
+   * Partitioned global cache is not supported.
+   */
+  CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_NOT_SUPPORTED = 1,
+
+  /**
+   * Partitioned global cache config is off.
+   */
+  CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_OFF           = 2,
+
+  /**
+   * Partitioned global cache config is on.
+   */
+  CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_ON            = 3,
+
+  CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_FORCE_INT     = 0x7fffffff
+} CUpti_ActivityPartitionedGlobalCacheConfig;
+
+/**
+ * \brief Synchronization type.
+ *
+ * The types of synchronization to be used with
+ * CUpti_ActivitySynchronization2.
+ */
+
+typedef enum {
+  /**
+   * Unknown synchronization type.
+   */
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_UNKNOWN             = 0,
+
+  /**
+   * Event synchronize API.
+   */
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_EVENT_SYNCHRONIZE   = 1,
+
+  /**
+   * Stream wait event API.
+   */
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT   = 2,
+
+  /**
+   * Stream synchronize API.
+   */
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_SYNCHRONIZE  = 3,
+
+  /**
+   * Context synchronize API.
+   */
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_CONTEXT_SYNCHRONIZE = 4,
+
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_FORCE_INT           = 0x7fffffff
+} CUpti_ActivitySynchronizationType;
+
+/**
+ * \brief Stream type.
+ *
+ * The types of stream to be used with CUpti_ActivityStream.
+ */
+
+typedef enum {
+  /**
+   * Unknown stream create flag.
+   */
+  CUPTI_ACTIVITY_STREAM_CREATE_FLAG_UNKNOWN      = 0,
+
+  /**
+   * Default stream.
+   */
+  CUPTI_ACTIVITY_STREAM_CREATE_FLAG_DEFAULT      = 1,
+
+  /**
+   * Non-blocking stream.
+   */
+  CUPTI_ACTIVITY_STREAM_CREATE_FLAG_NON_BLOCKING = 2,
+
+  /**
+   * Null stream.
+   */
+  CUPTI_ACTIVITY_STREAM_CREATE_FLAG_NULL         = 3,
+
+  /**
+   * Stream create mask.
+   */
+  CUPTI_ACTIVITY_STREAM_CREATE_MASK              = 0xFFFF,
+
+  CUPTI_ACTIVITY_STREAM_CREATE_FLAG_FORCE_INT    = 0x7fffffff
+} CUpti_ActivityStreamFlag;
+
+/**
+ * \brief Link flags.
+ *
+ * Describes link properties, to be used with CUpti_ActivityNvLink.
+ */
+
+typedef enum {
+  /**
+   * The flag is invalid.
+   */
+  CUPTI_LINK_FLAG_INVALID        = 0,
+
+  /**
+   * Peer-to-peer access is supported by this link.
+   */
+  CUPTI_LINK_FLAG_PEER_ACCESS    = (1 << 1),
+
+  /**
+   * System memory access is supported by this link.
+   */
+  CUPTI_LINK_FLAG_SYSMEM_ACCESS  = (1 << 2),
+
+  /**
+   * Peer atomic access is supported by this link.
+   */
+  CUPTI_LINK_FLAG_PEER_ATOMICS   = (1 << 3),
+
+  /**
+   * System memory atomic access is supported by this link.
+   */
+  CUPTI_LINK_FLAG_SYSMEM_ATOMICS = (1 << 4),
+
+  CUPTI_LINK_FLAG_FORCE_INT = 0x7fffffff
+} CUpti_LinkFlag;
+
+/**
+ * \brief Memory operation types.
+ *
+ * Describes the type of memory operation, to be used with CUpti_ActivityMemory4.
+ */
+
+typedef enum {
+  /**
+   * The operation is invalid.
+   */
+  CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_INVALID   = 0,
+
+  /**
+   * Memory is allocated.
+   */
+  CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_ALLOCATION = 1,
+
+  /**
+   * Memory is released.
+   */
+  CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_RELEASE    = 2,
+
+  CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_FORCE_INT  = 0x7fffffff
+} CUpti_ActivityMemoryOperationType;
+
+/**
+ * \brief Memory pool types.
+ *
+ * Describes the type of memory pool, to be used with CUpti_ActivityMemory4.
+ */
+
+typedef enum {
+  /**
+   * The memory pool type is invalid.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_TYPE_INVALID   = 0,
+
+  /**
+   * Memory pool is local to the process.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL     = 1,
+
+  /**
+   * Memory pool is imported by the process.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED  = 2,
+
+  CUPTI_ACTIVITY_MEMORY_POOL_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_ActivityMemoryPoolType;
+
+/**
+ * \brief Memory pool operation types.
+ *
+ * Describes the type of memory pool operation, to be used with CUpti_ActivityMemoryPool2.
+ */
+
+typedef enum {
+  /**
+   * The operation is invalid.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_INVALID   = 0,
+
+  /**
+   * Memory pool is created.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_CREATED   = 1,
+
+  /**
+   * Memory pool is destroyed.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_DESTROYED = 2,
+
+  /**
+   * Memory pool is trimmed.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_TRIMMED   = 3,
+
+  CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_ActivityMemoryPoolOperationType;
+
+typedef enum {
+  CUPTI_CHANNEL_TYPE_INVALID      = 0,
+
+  /**
+   * Channel is used for standard work launch and tracking.
+   */
+  CUPTI_CHANNEL_TYPE_COMPUTE      = 1,
+
+  /**
+   * Channel is used by an asynchronous copy engine.
+   * For confidential compute configurations, work launch and
+   * completion are done using the copy engines.
+   */
+  CUPTI_CHANNEL_TYPE_ASYNC_MEMCPY = 2,
+
+
+  /**
+   * Channel is used for memory decompression operations.
+   */
+    CUPTI_CHANNEL_TYPE_DECOMP , /* no explicit value: implicitly 3, following ASYNC_MEMCPY = 2 */
+
+  CUPTI_CHANNEL_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_ChannelType;
+
+/**
+ * \brief CIG (CUDA in Graphics) Modes.
+ *
+ * Describes the CIG modes associated with the CUDA context.
+ */
+
+typedef enum
+{
+  /**
+   * Regular (non-CIG) mode.
+   */
+  CUPTI_CONTEXT_CIG_MODE_NONE         = 0,
+  /**
+   * CIG mode.
+   */
+  CUPTI_CONTEXT_CIG_MODE_CIG          = 1,
+  /**
+   * CIG fallback mode.
+   */
+  CUPTI_CONTEXT_CIG_MODE_CIG_FALLBACK = 2,
+
+  CUPTI_CONTEXT_CIG_MODE_FORCE_INT    = 0x7fffffff
+} CUpti_ContextCigMode;
+
+/**
+ * The source-locator ID that indicates an unknown source
+ * location. There is not an actual CUpti_ActivitySourceLocator object
+ * corresponding to this value.
+ */
+#define CUPTI_SOURCE_LOCATOR_ID_UNKNOWN 0
+
+/**
+ * An invalid function index ID.
+ */
+#define CUPTI_FUNCTION_INDEX_ID_INVALID 0
+
+/**
+ * An invalid/unknown correlation ID. A correlation ID of this value
+ * indicates that there is no correlation for the activity record.
+ */
+#define CUPTI_CORRELATION_ID_UNKNOWN 0
+
+/**
+ * An invalid/unknown grid ID.
+ */
+#define CUPTI_GRID_ID_UNKNOWN 0LL
+
+/**
+ * An invalid/unknown timestamp for a start, end, queued, submitted,
+ * or completed time.
+ */
+#define CUPTI_TIMESTAMP_UNKNOWN 0LL
+
+/**
+ * An invalid/unknown value.
+ */
+#define CUPTI_SYNCHRONIZATION_INVALID_VALUE ((uint32_t) 0xFFFFFFFFU)
+
+/**
+ * An invalid/unknown process ID.
+ */
+#define CUPTI_AUTO_BOOST_INVALID_CLIENT_PID 0
+
+/**
+ * Invalid/unknown NVLink port number.
+ */
+#define CUPTI_NVLINK_INVALID_PORT -1
+
+/**
+ * Maximum number of NVLink ports.
+ */
+#define CUPTI_MAX_NVLINK_PORTS 32
+
+/**
+ * An invalid/unknown value for decompressed bytes.
+ */
+#define CUPTI_DECOMPRESSED_BYTES_UNKNOWN 0LL
+
+START_PACKED_ALIGNMENT
+/**
+ * \brief Unified Memory counters configuration structure
+ *
+ * This structure controls the enable/disable of the various
+ * Unified Memory counters consisting of scope, kind and other parameters.
+ * See function \ref cuptiActivityConfigureUnifiedMemoryCounter
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Unified Memory counter scope. (deprecated in CUDA 7.0)
+   */
+  CUpti_ActivityUnifiedMemoryCounterScope scope;
+
+  /**
+   * Unified Memory counter kind.
+   */
+  CUpti_ActivityUnifiedMemoryCounterKind kind;
+
+  /**
+   * Device id of the target device. This is relevant only
+   * for single device scopes. (deprecated in CUDA 7.0)
+   */
+  uint32_t deviceId;
+
+  /**
+   * Control to enable/disable the counter. To enable the counter,
+   * set it to a non-zero value; zero disables the counter.
+   */
+  uint32_t enable;
+} CUpti_ActivityUnifiedMemoryCounterConfig;
+
+/**
+ * \brief Device auto boost state structure
+ *
+ * This structure defines auto boost state for a device.
+ * See function \ref cuptiGetAutoBoostState
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Returned auto boost state. 1 is returned if auto boost is enabled, 0
+   * otherwise.
+   */
+  uint32_t enabled;
+
+  /**
+   * ID of the process that has set the current boost state. The value will be
+   * CUPTI_AUTO_BOOST_INVALID_CLIENT_PID if the user does not have the
+   * permission to query process IDs or there is an error in querying the
+   * process ID.
+   */
+  uint32_t pid;
+
+} CUpti_ActivityAutoBoostState;
+
+/**
+ * \brief PC sampling configuration structure
+ *
+ * This structure defines the PC sampling configuration.
+ *
+ * See function \ref cuptiActivityConfigurePCSampling
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Size of configuration structure.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  uint32_t size;
+
+  /**
+   * There are 5 levels provided for the sampling period. The level
+   * internally maps to a period in terms of cycles. The same level can
+   * map to a different number of cycles on different GPUs. The number of
+   * cycles will be chosen to minimize information loss. The period
+   * chosen will be given by samplingPeriodInCycles in
+   * \ref CUpti_ActivityPCSamplingRecordInfo for each kernel instance.
+   */
+  CUpti_ActivityPCSamplingPeriod samplingPeriod;
+
+  /**
+   * This will override the period set by samplingPeriod. A value of 0 in samplingPeriod2
+   * means that samplingPeriod2 should not be used and samplingPeriod should be used instead.
+   * Valid values for samplingPeriod2 are between 5 and 31, both inclusive.
+   * This will set the sampling period to (2^samplingPeriod2) cycles.
+   */
+  uint32_t samplingPeriod2;
+} CUpti_ActivityPCSamplingConfig;
+
+/**
+ * \brief The base activity record.
+ *
+ * The activity API uses a CUpti_Activity as a generic representation
+ * for any activity. The 'kind' field is used to determine the
+ * specific activity kind, and from that the CUpti_Activity object can
+ * be cast to the specific activity record type appropriate for that kind.
+ *
+ * Note that all activity record types are padded and aligned to
+ * ensure that each member of the record is naturally aligned.
+ *
+ * \see CUpti_ActivityKind
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The kind of this activity; determines which record type to cast to.
+   */
+  CUpti_ActivityKind kind;
+} CUpti_Activity;
+
+/**
+ * \brief The activity record for memory copies.
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the HW channel on which the memory copy is occurring.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel. \see CUpti_ChannelType
+   */
+  CUpti_ChannelType channelType;
+
+  /**
+   * Reserved for internal use.
+   */
+  uint32_t pad2;
+
+  /**
+   * The total number of memcpy operations traced in this record.
+   * This field is valid for memcpy operations happening using
+   * MemcpyBatchAsync APIs in CUDA.
+   * In MemcpyBatchAsync APIs, multiple memcpy operations are batched
+   * together for optimization purposes based on certain heuristics.
+   * For other memcpy operations, this field will be 1.
+   */
+   uint64_t copyCount;
+} CUpti_ActivityMemcpy6;
+
+/**
+ * \brief The activity record for peer-to-peer memory copies.
+ *
+ * This activity record represents a peer-to-peer memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY2).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size.  \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see
+   * CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the device where memory is being copied from.
+   */
+  uint32_t srcDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied from.
+   */
+  uint32_t srcContextId;
+
+  /**
+   * The ID of the device where memory is being copied to.
+   */
+  uint32_t dstDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied to.
+   */
+  uint32_t dstContextId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t correlationId;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the HW channel on which the memory copy is occurring.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel. \see CUpti_ChannelType
+   */
+  CUpti_ChannelType channelType;
+} CUpti_ActivityMemcpyPtoP4;
+
+/**
+ * \brief The activity record for memset.
+ *
+ * This activity record represents a memory set operation
+ * (CUPTI_ACTIVITY_KIND_MEMSET).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The value being assigned to memory by the memory set.
+   */
+  uint32_t value;
+
+  /**
+   * The number of bytes being set by the memory set.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory set is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory set is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory set is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory set. Each memory set is assigned
+   * a unique correlation ID that is identical to the correlation ID
+   * in the driver API activity record that launched the memory set.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The flags associated with the memset. \see CUpti_ActivityFlag
+   */
+  uint16_t flags;
+
+  /**
+   * The memory kind of the memory set. \see CUpti_ActivityMemoryKind
+   */
+  uint16_t memoryKind;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the HW channel on which the memory set is occurring.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel. \see CUpti_ChannelType
+   */
+  CUpti_ChannelType channelType;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad2;
+} CUpti_ActivityMemset4;
+
+/**
+ * \brief The activity record for memory.
+ *
+ * This activity record represents a memory allocation and free operation
+ * (CUPTI_ACTIVITY_KIND_MEMORY).
+ * This activity record provides a single record for the memory
+ * allocation and memory release operations.
+ *
+ * Note: It is recommended to move to the new activity record \ref CUpti_ActivityMemory4
+ * enabled using the kind \ref CUPTI_ACTIVITY_KIND_MEMORY2.
+ * \ref CUpti_ActivityMemory4 provides separate records for memory
+ * allocation and memory release operations. This allows to correlate the
+ * corresponding driver and runtime API activity record with the memory operation.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory kind requested by the user, \ref CUpti_ActivityMemoryKind.
+   */
+  CUpti_ActivityMemoryKind memoryKind;
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The number of bytes of memory allocated.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory operation, i.e.
+   * the time when memory was allocated, in ns.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory operation, i.e.
+   * the time when memory was freed, in ns.
+   * This will be 0 if memory is not freed in the application.
+   */
+  uint64_t end;
+
+  /**
+   * The program counter of the allocation of memory.
+   */
+  uint64_t allocPC;
+
+  /**
+   * The program counter of the freeing of memory. This will
+   * be 0 if memory is not freed in the application.
+   */
+  uint64_t freePC;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory allocation is taking place.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context. If context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID.
+   */
+  uint32_t contextId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Variable name. This name is shared across all activity
+   * records representing the same symbol, and so should not be
+   * modified.
+   */
+  const char* name;
+} CUpti_ActivityMemory;
+
+/**
+ * \brief The activity record for memory.
+ *
+ * This activity record represents a memory allocation and free operation
+ * (CUPTI_ACTIVITY_KIND_MEMORY2).
+ * This activity record provides separate records for memory allocation and
+ * memory release operations.
+ * This allows to correlate the corresponding driver and runtime API
+ * activity record with the memory operation.
+ *
+ * Note: This activity record is an upgrade over \ref CUpti_ActivityMemory
+ * enabled using the kind \ref CUPTI_ACTIVITY_KIND_MEMORY.
+ * \ref CUpti_ActivityMemory provides a single record for the memory
+ * allocation and memory release operations.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory operation requested by the user, \ref CUpti_ActivityMemoryOperationType.
+   */
+  CUpti_ActivityMemoryOperationType memoryOperationType;
+
+  /**
+   * The memory kind requested by the user, \ref CUpti_ActivityMemoryKind.
+   */
+  CUpti_ActivityMemoryKind memoryKind;
+
+  /**
+   * The correlation ID of the memory operation. Each memory operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The number of bytes of memory allocated.
+   */
+  uint64_t bytes;
+
+  /**
+   * The timestamp for the memory operation, in ns.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The program counter of the memory operation.
+   */
+  uint64_t PC;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory operation is taking place.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context. If context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream. If memory operation is not async, \p streamId is set to CUPTI_INVALID_STREAM_ID.
+   */
+  uint32_t streamId;
+
+  /**
+   * Variable name. This name is shared across all activity
+   * records representing the same symbol, and so should not be
+   * modified.
+   */
+  const char* name;
+
+  /**
+   * \p isAsync is set if memory operation happens through async memory APIs.
+   */
+  uint32_t isAsync;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad1;
+#endif
+
+  /**
+   * The memory pool configuration used for the memory operations.
+   */
+  struct PACKED_ALIGNMENT {
+    /**
+     * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType.
+     */
+    CUpti_ActivityMemoryPoolType memoryPoolType;
+
+#ifdef CUPTILP64
+    /**
+     * Undefined. Reserved for internal use.
+     */
+    uint32_t pad2;
+#endif
+
+    /**
+     * The base address of the memory pool.
+     */
+    uint64_t address;
+
+    /**
+     * The release threshold of the memory pool in bytes. \p releaseThreshold is
+     * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     */
+    uint64_t releaseThreshold;
+
+    /**
+     * The size of the memory pool in bytes, or the processId of the memory pool.
+     * \p size is valid if \p memoryPoolType is
+     * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     * \p processId is valid if \p memoryPoolType is
+     * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED, \ref CUpti_ActivityMemoryPoolType.
+     */
+    union {
+      uint64_t size;
+      uint64_t processId;
+    } pool;
+
+    /**
+     * The utilized size of the memory pool. \p utilizedSize is
+     * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     */
+    uint64_t utilizedSize;
+  } memoryPoolConfig;
+
+    /**
+     * The shared object or binary that the memory allocation request comes from.
+     */
+    const char* source;
+} CUpti_ActivityMemory4;
+
+/**
+ * \brief The activity record for memory pool.
+ *
+ * This activity record represents a memory pool creation, destruction and
+ * trimming (CUPTI_ACTIVITY_KIND_MEMORY_POOL).
+ * This activity record provides separate records for memory pool creation,
+ * destruction and trimming operations.
+ * This allows to correlate the corresponding driver and runtime API
+ * activity record with the memory pool operation.
+ *
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY_POOL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory pool operation requested by the user, \ref CUpti_ActivityMemoryPoolOperationType.
+   */
+  CUpti_ActivityMemoryPoolOperationType memoryPoolOperationType;
+
+  /**
+   * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType.
+   */
+  CUpti_ActivityMemoryPoolType memoryPoolType;
+
+  /**
+   * The correlation ID of the memory pool operation. Each memory pool
+   * operation is assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory pool operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory pool is created.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The minimum bytes to keep of the memory pool. \p minBytesToKeep is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_TRIMMED,
+   * \ref CUpti_ActivityMemoryPoolOperationType.
+   */
+  size_t minBytesToKeep;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The size of the memory pool operation in bytes. \p size is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t size;
+
+  /**
+   * The release threshold of the memory pool. \p releaseThreshold is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t releaseThreshold;
+
+  /**
+   * The timestamp for the memory pool operation, in ns.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The utilized size of the memory pool. \p utilizedSize is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t utilizedSize;
+} CUpti_ActivityMemoryPool2;
+
+/**
+ * \brief The type of the CUDA kernel launch.
+ */
+typedef enum {
+  /**
+   * The kernel was launched via a regular kernel call.
+   */
+  CUPTI_ACTIVITY_LAUNCH_TYPE_REGULAR = 0,
+
+  /**
+   * The kernel was launched via API \ref cudaLaunchCooperativeKernel() or
+   * \ref cuLaunchCooperativeKernel().
+   */
+  CUPTI_ACTIVITY_LAUNCH_TYPE_COOPERATIVE_SINGLE_DEVICE = 1,
+
+  /**
+   * The kernel was launched via API \ref cudaLaunchCooperativeKernelMultiDevice() or
+   * \ref cuLaunchCooperativeKernelMultiDevice().
+   */
+  CUPTI_ACTIVITY_LAUNCH_TYPE_COOPERATIVE_MULTI_DEVICE = 2,
+
+  /**
+   * The kernel was launched as a CBL commandlist.
+   */
+  CUPTI_ACTIVITY_LAUNCH_TYPE_CBL_COMMANDLIST = 3,
+} CUpti_ActivityLaunchType;
+
+/**
+ * \brief The shared memory limit per block config for a kernel
+ * This should be used to set 'cudaOccFuncShmemConfig' field in occupancy calculator API
+ */
+typedef enum  {
+    /** The shared memory limit config is default.
+     */
+    CUPTI_FUNC_SHMEM_LIMIT_DEFAULT              = 0x00,
+
+    /** User has opted for a higher dynamic shared memory limit using function attribute
+     * 'cudaFuncAttributeMaxDynamicSharedMemorySize' for runtime API or
+     * CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES for driver API.
+     */
+    CUPTI_FUNC_SHMEM_LIMIT_OPTIN                = 0x01,
+
+    CUPTI_FUNC_SHMEM_LIMIT_FORCE_INT            = 0x7fffffff /* NOTE(review): presumably a sentinel forcing int-sized storage, not a real config -- confirm */
+} CUpti_FuncShmemLimitConfig;
+
+/**
+ * \brief The activity record for kernel.
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns.  It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes (deprecated in CUDA 11.8).
+   * Refer to field localMemoryTotal_v2.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host, the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+  /**
+   * Shared memory size set by the driver.
+   */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether user has opted for a
+   * higher per block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+
+  /**
+   * The pointer to the access policy window. The structure CUaccessPolicyWindow is
+   * defined in cuda.h.
+   */
+  CUaccessPolicyWindow *pAccessPolicyWindow;
+
+  /**
+   * The ID of the HW channel on which the kernel is launched.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel. \see CUpti_ChannelType
+   */
+  CUpti_ChannelType channelType;
+
+  /**
+   * The X-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterX;
+
+  /**
+   * The Y-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterY;
+
+  /**
+   * The Z-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterZ;
+
+  /**
+   * The cluster scheduling policy for the kernel. Refer to CUclusterSchedulingPolicy.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterSchedulingPolicy;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint64_t localMemoryTotal_v2;
+
+  /**
+   * The maximum cluster size for the kernel.
+   */
+  uint32_t maxPotentialClusterSize;
+
+  /**
+   * The maximum clusters that could co-exist on the target device for the kernel.
+   */
+  uint32_t maxActiveClusters;
+} CUpti_ActivityKernel9;
+
+/**
+ * \brief The activity record for CDP (CUDA Dynamic Parallelism)
+ * kernel.
+ *
+ * This activity record represents a CDP kernel execution.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_CDP_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel execution
+   * is assigned a unique grid ID.
+   */
+  int64_t gridId;
+
+  /**
+   * The grid ID of the parent kernel.
+   */
+  int64_t parentGridId;
+
+  /**
+   * The timestamp when kernel is queued up, in ns. A value of
+   * CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time is
+   * unknown.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when kernel is submitted to the gpu, in ns. A value
+   * of CUPTI_TIMESTAMP_UNKNOWN indicates that the submission time is
+   * unknown.
+   */
+  uint64_t submitted;
+
+  /**
+   * The timestamp when kernel is marked as completed, in ns. A value
+   * of CUPTI_TIMESTAMP_UNKNOWN indicates that the completion time is
+   * unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The X-dimension of the parent block.
+   */
+  uint32_t parentBlockX;
+
+  /**
+   * The Y-dimension of the parent block.
+   */
+  uint32_t parentBlockY;
+
+  /**
+   * The Z-dimension of the parent block.
+   */
+  uint32_t parentBlockZ;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+} CUpti_ActivityCdpKernel;
+
+/**
+ * \brief The activity record for a preemption of a CDP kernel.
+ *
+ * This activity record represents a preemption of a CDP kernel.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PREEMPTION.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the preemption. \see CUpti_ActivityPreemptionKind
+   */
+  CUpti_ActivityPreemptionKind preemptionKind;
+
+  /**
+   * The timestamp of the preemption, in ns. A value of 0 indicates
+   * that timestamp information could not be collected for the
+   * preemption.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The grid ID of the block that is preempted.
+   */
+  int64_t gridId;
+
+  /**
+   * The X-dimension of the block that is preempted.
+   */
+  uint32_t blockX;
+
+  /**
+   * The Y-dimension of the block that is preempted.
+   */
+  uint32_t blockY;
+
+  /**
+   * The Z-dimension of the block that is preempted.
+   */
+  uint32_t blockZ;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityPreemption;
+
+/**
+ * \brief The activity record for a driver or runtime API invocation.
+ *
+ * This activity record represents an invocation of a driver or
+ * runtime API (CUPTI_ACTIVITY_KIND_DRIVER and
+ * CUPTI_ACTIVITY_KIND_RUNTIME).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DRIVER,
+   * CUPTI_ACTIVITY_KIND_RUNTIME, or CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID of the driver or runtime function.
+   */
+  CUpti_CallbackId cbid;
+
+  /**
+   * The start timestamp for the function, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the function.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the function, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the function.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the process where the driver or runtime CUDA function
+   * is executing.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the thread where the driver or runtime CUDA function is
+   * executing.
+   */
+  uint32_t threadId;
+
+  /**
+   * The correlation ID of the driver or runtime CUDA function. Each
+   * function invocation is assigned a unique correlation ID that is
+   * identical to the correlation ID in the memcpy, memset, or kernel
+   * activity record that is associated with this function.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The return value for the function. For a CUDA driver function
+   * this will be a CUresult value, and for a CUDA runtime function
+   * this will be a cudaError_t value.
+   */
+  uint32_t returnValue;
+} CUpti_ActivityAPI;
+
+/**
+ * \brief The activity record for a CUPTI event.
+ *
+ * This activity record represents a CUPTI event value
+ * (CUPTI_ACTIVITY_KIND_EVENT). This activity record kind is not
+ * produced by the activity API but is included for completeness and
+ * ease-of-use. Profiling frameworks built on top of CUPTI that collect
+ * event data may choose to use this type to store the collected event
+ * data.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_EVENT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The event ID.
+   */
+  CUpti_EventID id;
+
+  /**
+   * The event value.
+   */
+  uint64_t value;
+
+  /**
+   * The event domain ID.
+   */
+  CUpti_EventDomainID domain;
+
+  /**
+   * The correlation ID of the event. Use of this ID is user-defined,
+   * but typically this ID value will equal the correlation ID of the
+   * kernel for which the event was gathered.
+   */
+  uint32_t correlationId;
+} CUpti_ActivityEvent;
+
+/**
+ * \brief The activity record for a CUPTI event with instance
+ * information.
+ *
+ * This activity record represents a CUPTI event value for a
+ * specific event domain instance
+ * (CUPTI_ACTIVITY_KIND_EVENT_INSTANCE). This activity record kind is
+ * not produced by the activity API but is included for completeness
+ * and ease-of-use. Profiling frameworks built on top of CUPTI that
+ * collect event data may choose to use this type to store the
+ * collected event data. This activity record should be used when
+ * event domain instance information needs to be associated with the
+ * event.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be
+   * CUPTI_ACTIVITY_KIND_EVENT_INSTANCE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The event ID.
+   */
+  CUpti_EventID id;
+
+  /**
+   * The event domain ID.
+   */
+  CUpti_EventDomainID domain;
+
+  /**
+   * The event domain instance.
+   */
+  uint32_t instance;
+
+  /**
+   * The event value.
+   */
+  uint64_t value;
+
+  /**
+   * The correlation ID of the event. Use of this ID is user-defined,
+   * but typically this ID value will equal the correlation ID of the
+   * kernel for which the event was gathered.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityEventInstance;
+
+/**
+ * \brief The activity record for a CUPTI metric.
+ *
+ * This activity record represents the collection of a CUPTI metric
+ * value (CUPTI_ACTIVITY_KIND_METRIC). This activity record kind is not
+ * produced by the activity API but is included for completeness and
+ * ease-of-use. Profiling frameworks built on top of CUPTI that collect
+ * metric data may choose to use this type to store the collected metric
+ * data.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_METRIC.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The metric ID.
+   */
+  CUpti_MetricID id;
+
+  /**
+   * The metric value.
+   */
+  CUpti_MetricValue value;
+
+  /**
+   * The correlation ID of the metric. Use of this ID is user-defined,
+   * but typically this ID value will equal the correlation ID of the
+   * kernel for which the metric was gathered.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The properties of this metric. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[3];
+} CUpti_ActivityMetric;
+
+/**
+ * \brief The activity record for a CUPTI metric with instance
+ * information.
+ *
+ * This activity record represents a CUPTI metric value
+ * for a specific metric domain instance
+ * (CUPTI_ACTIVITY_KIND_METRIC_INSTANCE). This activity record kind
+ * is not produced by the activity API but is included for
+ * completeness and ease-of-use. Profiling frameworks built on top of
+ * CUPTI that collect metric data may choose to use this type to store
+ * the collected metric data. This activity record should be used when
+ * metric domain instance information needs to be associated with the
+ * metric.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be
+   * CUPTI_ACTIVITY_KIND_METRIC_INSTANCE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The metric ID.
+   */
+  CUpti_MetricID id;
+
+  /**
+   * The metric value.
+   */
+  CUpti_MetricValue value;
+
+  /**
+   * The metric domain instance.
+   */
+  uint32_t instance;
+
+  /**
+   * The correlation ID of the metric. Use of this ID is user-defined,
+   * but typically this ID value will equal the correlation ID of the
+   * kernel for which the metric was gathered.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The properties of this metric. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[7];
+} CUpti_ActivityMetricInstance;
+
+/**
+ * \brief The activity record for a source locator.
+ *
+ * This activity record represents a source locator
+ * (CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID for the source path, will be used in all the source level
+   * results.
+   */
+  uint32_t id;
+
+  /**
+   * The line number in the source.
+   */
+  uint32_t lineNumber;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The path for the file.
+   */
+  const char *fileName;
+} CUpti_ActivitySourceLocator;
+
+/**
+ * \brief The activity record for source-level global
+ * access.
+ *
+ * This activity record holds the locations of the global
+ * accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this global access.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The number of times this instruction was executed per warp. It will be incremented
+   * when at least one thread in the warp is active with predicate and condition code
+   * evaluating to true.
+   */
+  uint32_t executed;
+
+  /**
+   * The pc offset for the access.
+   */
+  uint64_t pcOffset;
+
+  /**
+   * This increments each time when this instruction is executed by number of
+   * threads that executed this instruction with predicate and condition code
+   * evaluating to true.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The total number of 32 bytes transactions to L2 cache generated by this
+   * access.
+   */
+  uint64_t l2_transactions;
+
+  /**
+   * The minimum number of L2 transactions possible based on the access pattern.
+   */
+  uint64_t theoreticalL2Transactions;
+} CUpti_ActivityGlobalAccess3;
+
+/**
+ * \brief The activity record for source level result
+ * branch.
+ *
+ * This activity record holds the locations of the branches in the
+ * source (CUPTI_ACTIVITY_KIND_BRANCH).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_BRANCH.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the branch.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Number of times this branch diverged.
+   */
+  uint32_t diverged;
+
+  /**
+   * This increments each time when this instruction is executed by number
+   * of threads that executed this instruction.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The number of times this instruction was executed per warp. It will be incremented
+   * regardless of predicate or condition code.
+   */
+  uint32_t executed;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityBranch2;
+
+/**
+ * \brief The activity record for a device. (CUDA 11.6 onwards)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * Number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * Number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum amount of shared memory available per multiprocessor, in bytes.
+   */
+  uint32_t maxSharedMemoryPerMultiprocessor;
+
+  /**
+   * Maximum number of 32-bit registers available per multiprocessor.
+   */
+  uint32_t maxRegistersPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+  /**
+   * ECC enabled flag for the device.
+   */
+  uint32_t eccEnabled;
+
+  /**
+   * The device UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid uuid;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Flag to indicate whether the device is visible to CUDA. Users can
+   * set the device visibility using the CUDA_VISIBLE_DEVICES environment variable.
+   */
+  uint8_t isCudaVisible;
+
+  /**
+   * MIG enabled flag for the device.
+   */
+  uint8_t isMigEnabled;
+
+  uint8_t reserved[6];
+
+  /**
+   * GPU Instance id for MIG enabled devices.
+   * If MIG mode is disabled, the value is set to UINT32_MAX.
+   */
+  uint32_t gpuInstanceId;
+
+  /**
+   * Compute Instance id for MIG enabled devices.
+   * If MIG mode is disabled, the value is set to UINT32_MAX.
+   */
+  uint32_t computeInstanceId;
+
+  /**
+   * The MIG UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid migUuid;
+
+  /**
+   * NUMA (non-uniform memory access) information for the device:
+   * whether the GPU is a NUMA node or not.
+   */
+  uint32_t isNumaNode;
+
+  /**
+   * NUMA (non-uniform memory access) information for the device:
+   * the NUMA node ID of the GPU memory.
+   * If the GPU is not a NUMA node, it returns invalidNumaId.
+   */
+  uint32_t numaId;
+} CUpti_ActivityDevice5;
+
+/**
+ * \brief The activity record for a device attribute.
+ *
+ * This activity record represents information about a GPU device:
+ * either a CUpti_DeviceAttribute or CUdevice_attribute value
+ * (CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be
+   * CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID of the device that this attribute applies to.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The attribute, either a CUpti_DeviceAttribute or
+   * CUdevice_attribute. Flag
+   * CUPTI_ACTIVITY_FLAG_DEVICE_ATTRIBUTE_CUDEVICE is used to indicate
+   * what kind of attribute this is. If
+   * CUPTI_ACTIVITY_FLAG_DEVICE_ATTRIBUTE_CUDEVICE is 1 then the
+   * CUdevice_attribute field is valid, otherwise the
+   * CUpti_DeviceAttribute field is valid.
+   */
+  union {
+    CUdevice_attribute cu;
+    CUpti_DeviceAttribute cupti;
+  } attribute;
+
+  /**
+   * The value for the attribute. See CUpti_DeviceAttribute and
+   * CUdevice_attribute for the type of the value for a given
+   * attribute.
+   */
+  union {
+    double vDouble;
+    uint32_t vUint32;
+    uint64_t vUint64;
+    int32_t vInt32;
+    int64_t vInt64;
+  } value;
+} CUpti_ActivityDeviceAttribute;
+
+/**
+ * \brief The activity record for a context.
+ *
+ * This activity record represents information about a context
+ * (CUPTI_ACTIVITY_KIND_CONTEXT).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_CONTEXT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The context ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The compute API kind. \see CUpti_ActivityComputeApiKind
+   */
+  uint16_t computeApiKind;
+
+  /**
+   * The ID for the NULL stream in this context.
+   */
+  uint16_t nullStreamId;
+
+  /**
+   * The ID of the parent context. It is 0 if the
+   * context does not have a parent.
+   */
+  uint32_t parentContextId;
+
+  /**
+   * This field indicates whether the context is a green context.
+   */
+  uint8_t isGreenContext;
+
+  uint8_t padding;
+
+  /**
+   * Number of multiprocessors assigned to the green context.
+   * Invalid if the field 'isGreenContext' is 0.
+   */
+  uint16_t numMultiprocessors;
+
+  /**
+   * This field indicates the CIG mode.
+   */
+  CUpti_ContextCigMode cigMode;
+
+  uint32_t padding2;
+
+} CUpti_ActivityContext3;
+
+/**
+ * \brief The activity record providing a name.
+ *
+ * This activity record provides a name for a device, context, thread,
+ * etc., as well as other resource naming done via NVTX APIs
+ * (CUPTI_ACTIVITY_KIND_NAME).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NAME.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of activity object being named.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object. 'objectKind' indicates
+   * which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The name.
+   */
+  const char *name;
+
+} CUpti_ActivityName;
+
+/**
+ * \brief The activity record providing a marker which is an
+ * instantaneous point in time.
+ *
+ * The marker is specified with a descriptive name and unique id
+ * (CUPTI_ACTIVITY_KIND_MARKER).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the marker. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The timestamp for the marker, in ns. A value of 0 indicates that
+   * timestamp information could not be collected for the marker.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The marker ID.
+   */
+  uint32_t id;
+
+  /**
+   * The kind of activity object associated with this marker.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object associated with this
+   * marker. 'objectKind' indicates which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+
+
+  /**
+   * The marker name for an instantaneous or start marker. This will
+   * be NULL for an end marker.
+   */
+  const char *name;
+
+  /**
+   * The name of the domain to which this marker belongs.
+   * This will be NULL for the default domain.
+   */
+  const char *domain;
+
+} CUpti_ActivityMarker2;
+
+/**
+ * \brief The activity record providing detailed information for a marker.
+ *
+ * The user must enable CUPTI_ACTIVITY_KIND_MARKER as well
+ * to get records for marker data.
+ * The marker data contains color, payload, and category
+ * (CUPTI_ACTIVITY_KIND_MARKER_DATA).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be
+   * CUPTI_ACTIVITY_KIND_MARKER_DATA.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the marker. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The marker ID.
+   */
+  uint32_t id;
+
+  /**
+   * Defines the payload format for the value associated with the marker.
+   */
+  CUpti_MetricValueKind payloadKind;
+
+  /**
+   * The payload value.
+   */
+  CUpti_MetricValue payload;
+
+  /**
+   * The color for the marker.
+   */
+  uint32_t color;
+
+  /**
+   * The category for the marker.
+   */
+  uint32_t category;
+
+} CUpti_ActivityMarkerData;
+
+/**
+ * \brief The activity record for CUPTI and driver overheads.
+ *
+ * This activity record provides CUPTI and driver overhead information
+ * (CUPTI_ACTIVITY_KIND_OVERHEAD).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OVERHEAD.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of overhead, CUPTI, DRIVER, COMPILER etc.
+   */
+  CUpti_ActivityOverheadKind overheadKind;
+
+  /**
+   * The kind of activity object that the overhead is associated with.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object. 'objectKind' indicates
+   * which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+  /**
+   * The start timestamp for the overhead, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the overhead.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the overhead, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the overhead.
+   */
+  uint64_t end;
+
+  /**
+   * The correlation ID of the overhead operation to which the
+   * records belong. This ID is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the overhead operation.
+   * In some cases, it can be zero, such as for CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH records.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Reserved for internal use.
+   */
+  uint32_t reserved0;
+
+  /**
+   * Pointer to the struct with additional details about the overhead.
+   * Refer to the CUpti_ActivityOverheadKind enum and the corresponding structure to typecast and access additional overhead data.
+   * The client is responsible for freeing this memory using the free function when done.
+   */
+  void *overheadData;
+
+} CUpti_ActivityOverhead3;
+
+/**
+ * \brief The activity record for CUPTI environmental data.
+ *
+ * This activity record provides CUPTI environmental data, including
+ * power, clocks, and thermals. This information is sampled at
+ * various rates and returned in this activity record. The consumer
+ * of the record needs to check the environmentKind field to figure
+ * out what kind of environmental record this is.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_ENVIRONMENT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID of the device.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The timestamp when this sample was retrieved, in ns. A value of 0
+   * indicates that timestamp information could not be collected for
+   * the sample.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The kind of data reported in this record.
+   */
+  CUpti_ActivityEnvironmentKind environmentKind;
+
+  union {
+    /**
+     * Data returned for CUPTI_ACTIVITY_ENVIRONMENT_SPEED environment
+     * kind.
+     */
+    struct {
+      /**
+       * The SM frequency in MHz.
+       */
+      uint32_t smClock;
+
+      /**
+       * The memory frequency in MHz.
+       */
+      uint32_t memoryClock;
+
+      /**
+       * The PCIe link generation.
+       */
+      uint32_t pcieLinkGen;
+
+      /**
+       * The PCIe link width.
+       */
+      uint32_t pcieLinkWidth;
+
+      /**
+       * The clocks throttle reasons.
+       */
+      CUpti_EnvironmentClocksThrottleReason clocksThrottleReasons;
+    } speed;
+
+    /**
+     * Data returned for CUPTI_ACTIVITY_ENVIRONMENT_TEMPERATURE
+     * environment kind.
+     */
+    struct {
+      /**
+       * The GPU temperature in degrees C.
+       */
+      uint32_t gpuTemperature;
+    } temperature;
+
+    /**
+     * Data returned for CUPTI_ACTIVITY_ENVIRONMENT_POWER environment kind.
+     * 'power': the power in milliwatts consumed by GPU and associated circuitry.
+     * 'powerLimit': the power in milliwatts that will trigger power management algorithm.
+     */
+    struct {
+
+      uint32_t power;
+      uint32_t powerLimit;
+    } power;
+
+    /**
+     * Data returned for CUPTI_ACTIVITY_ENVIRONMENT_COOLING
+     * environment kind.
+     */
+    struct {
+      /**
+       * The fan speed as percentage of maximum.
+       */
+      uint32_t fanSpeed;
+    } cooling;
+  } data;
+} CUpti_ActivityEnvironment;
+
+/**
+ * \brief The activity record for source-level instruction execution.
+ *
+ * This activity record holds the result for source-level instruction execution
+ * (CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction execution.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * This increments each time when this instruction is executed by number
+   * of threads that executed this instruction, regardless of predicate or condition code.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * This increments each time when this instruction is executed by number
+   * of threads that executed this instruction with predicate and condition code evaluating to true.
+   */
+  uint64_t notPredOffThreadsExecuted;
+
+  /**
+   * The number of times this instruction was executed per warp. It will be incremented
+   * regardless of predicate or condition code.
+   */
+  uint32_t executed;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityInstructionExecution;
+
+/**
+ * \brief The activity record for PC sampling.
+ *
+ * This activity record holds information obtained by sampling the PC
+ * (CUPTI_ACTIVITY_KIND_PC_SAMPLING).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * These samples indicate that no instruction was issued in that cycle from
+   * the warp scheduler from where the warp was sampled.
+   * Field is valid for devices with compute capability 6.0 and higher.
+   */
+  uint32_t latencySamples;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * The same PC can be sampled with different stall reasons. The count includes
+   * latencySamples.
+   */
+  uint32_t samples;
+
+  /**
+   * Current stall reason. Includes one of the reasons from
+   * \ref CUpti_ActivityPCSamplingStallReason
+   */
+  CUpti_ActivityPCSamplingStallReason stallReason;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint64_t pcOffset;
+} CUpti_ActivityPCSampling3;
+
+/**
+ * \brief The activity record for record status for PC sampling.
+ *
+ * This activity record holds information obtained by sampling the PC
+ * (CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Number of times the PC was sampled for this kernel instance including all
+   * dropped samples.
+   */
+  uint64_t totalSamples;
+
+  /**
+   * Number of samples that were dropped by hardware due to backpressure/overflow.
+   */
+  uint64_t droppedSamples;
+  /**
+   * Sampling period in terms of number of cycles.
+   */
+  uint64_t samplingPeriodInCycles;
+} CUpti_ActivityPCSamplingRecordInfo;
+
+/**
+ * \brief The activity record for Unified Memory counters (CUDA 7.0 and beyond)
+ *
+ * This activity record represents a Unified Memory counter
+ * (CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The Unified Memory counter kind
+   */
+  CUpti_ActivityUnifiedMemoryCounterKind counterKind;
+
+  /**
+   * Value of the counter
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD,
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH,
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THREASHING and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP, it is the size of the
+   * memory region in bytes.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT, it
+   * is the number of page fault groups for the same page.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT,
+   * it is the program counter for the instruction that caused fault.
+   */
+  uint64_t value;
+
+  /**
+   * The start timestamp of the counter, in ns.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH, timestamp is
+   * captured when activity starts on GPU.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT, timestamp is
+   * captured when CUDA driver started processing the fault.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING, timestamp
+   * is captured when CUDA driver detected thrashing of memory region.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING,
+   * timestamp is captured when throttling operation was started by CUDA driver.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP,
+   * timestamp is captured when CUDA driver has pushed all required operations
+   * to the processor specified by dstId.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp of the counter, in ns.
+   * Ignore this field if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH, timestamp is
+   * captured when activity finishes on GPU.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT, timestamp is
+   * captured when CUDA driver queues the replay of faulting memory accesses on the GPU
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING, timestamp
+   * is captured when throttling operation was finished by CUDA driver
+   */
+  uint64_t end;
+
+  /**
+   * This is the virtual base address of the page/s being transferred. For cpu and
+   * gpu faults, the virtual address for the page that faulted.
+   */
+  uint64_t address;
+
+  /**
+   * The ID of the source CPU/device involved in the memory transfer, page fault, thrashing,
+   * throttling or remote map operation. For counterKind
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING, it is a bitwise ORing of the
+   * device IDs fighting for the memory region, ONLY if there are less than 32 devices. Ignore this field if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT
+   */
+  uint32_t srcId;
+
+  /**
+   * The ID of the destination CPU/device involved in the memory transfer or remote map
+   * operation. Ignore this field if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING
+   */
+  uint32_t dstId;
+
+  /**
+   * The ID of the stream causing the transfer.
+   * This value of this field is invalid.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the process to which this record belongs to.
+   */
+  uint32_t processId;
+
+  /**
+   * The flags associated with this record. See enums \ref CUpti_ActivityUnifiedMemoryAccessType
+   * if counterKind is CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT
+   * and \ref CUpti_ActivityUnifiedMemoryMigrationCause if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD
+   * and \ref CUpti_ActivityUnifiedMemoryRemoteMapCause if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP and \ref CUpti_ActivityFlag
+   * if counterKind is CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING
+   */
+  uint32_t flags;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+
+  /**
+   * \brief The bitmask of devices involved in the operation.
+   *
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING, it is a bitwise ORing of the
+   * device IDs fighting for the memory region. processors[0] represents the device ID of the device 0 to device 63,
+   * processors[1] represents device ID of device 64 to device 127 and so on.
+   * Ignore this field if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_DTOD or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_FAULT_REPLAY
+   */
+  uint64_t processors[5];
+} CUpti_ActivityUnifiedMemoryCounter3;
+
+/**
+ * \brief The activity record for global/device functions.
+ *
+ * This activity records function name and corresponding module
+ * information.
+ * (CUPTI_ACTIVITY_KIND_FUNCTION).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_FUNCTION.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * ID to uniquely identify the record.
+   */
+  uint32_t id;
+
+  /**
+   * The ID of the context where the function is launched.
+   */
+  uint32_t contextId;
+
+  /**
+   * The module ID in which this global/device function is present.
+   */
+  uint32_t moduleId;
+
+  /**
+   * The function's unique symbol index in the module.
+   */
+  uint32_t functionIndex;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The name of the function. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+} CUpti_ActivityFunction;
+
+/**
+ * \brief The activity record for a CUDA module.
+ *
+ * This activity record represents a CUDA module
+ * (CUPTI_ACTIVITY_KIND_MODULE). This activity record kind is not
+ * produced by the activity API but is included for completeness and
+ * ease-of-use. Profile frameworks built on top of CUPTI that collect
+ * module data from the module callback may choose to use this type to
+ * store the collected module data.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MODULE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID of the context where the module is loaded.
+   */
+  uint32_t contextId;
+
+  /**
+   * The module ID.
+   */
+  uint32_t id;
+
+  /**
+   * The cubin size.
+   */
+  uint32_t cubinSize;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The pointer to the cubin.
+   */
+  const void *cubin;
+} CUpti_ActivityModule;
+
+/**
+ * \brief The activity record for source-level shared
+ * access.
+ *
+ * This activity records the locations of the shared
+ * accesses in the source
+ * (CUPTI_ACTIVITY_KIND_SHARED_ACCESS).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_SHARED_ACCESS.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this shared access.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the access.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Each time this instruction is executed, this is incremented by the number
+   * of threads that executed it with predicate and condition code evaluating to true.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The total number of shared memory transactions generated by this access.
+   */
+  uint64_t sharedTransactions;
+
+  /**
+   * The minimum number of shared memory transactions possible based on the access pattern.
+   */
+  uint64_t theoreticalSharedTransactions;
+
+  /**
+   * The number of times this instruction was executed per warp. It is incremented
+   * when at least one thread in the warp is active with predicate and condition code
+   * evaluating to true.
+   */
+  uint32_t executed;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivitySharedAccess;
+
+/**
+ * \brief The activity record for CUDA event.
+ *
+ * This activity is used to track recorded events.
+ * (CUPTI_ACTIVITY_KIND_CUDA_EVENT).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_CUDA_EVENT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The correlation ID of the API to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID of the context where the event was recorded.
+   */
+  uint32_t contextId;
+
+  /**
+   * The compute stream where the event was recorded.
+   */
+  uint32_t streamId;
+
+  /**
+   * A unique event ID to identify the event record.
+   */
+  uint32_t eventId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+
+  /**
+   * The ID of the device where the event was recorded.
+   */
+  uint32_t deviceId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad2;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The device-side timestamp of the CUDA event record.
+   * Timestamp is in nanoseconds.
+   */
+  uint64_t deviceTimestamp;
+  /**
+   * A unique ID to associate event synchronization records
+   * with the latest CUDA Event record. A similar field exists
+   * in CUpti_ActivitySynchronization2 to associate the CUDA Event
+   * record with the synchronization record.
+   *
+   * The same CUDA event can be used multiple times, so the
+   * event ID alone is not unique enough to correlate the synchronization
+   * record with the latest CUDA Event record.
+   * This field is unique and can be used to do the required
+   * correlation.
+   */
+  uint64_t cudaEventSyncId;
+} CUpti_ActivityCudaEvent2;
+
+/**
+ * \brief The activity record for CUDA stream.
+ *
+ * This activity is used to track created streams.
+ * (CUPTI_ACTIVITY_KIND_STREAM).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_STREAM.
+   */
+  CUpti_ActivityKind kind;
+  /**
+   * The ID of the context where the stream was created.
+   */
+  uint32_t contextId;
+
+  /**
+   * A unique stream ID to identify the stream.
+   */
+  uint32_t streamId;
+
+  /**
+   * The clamped priority for the stream.
+   */
+  uint32_t priority;
+
+  /**
+   * Flags associated with the stream (\see CUpti_ActivityStreamFlag).
+   */
+  CUpti_ActivityStreamFlag flag;
+
+  /**
+   * The correlation ID of the API to which this result is associated.
+   */
+  uint32_t correlationId;
+} CUpti_ActivityStream;
+
+/**
+ * \brief The activity record for synchronization management.
+ *
+ * This activity is used to track various CUDA synchronization APIs.
+ * (CUPTI_ACTIVITY_KIND_SYNCHRONIZATION).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_SYNCHRONIZATION.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The type of record (\see CUpti_ActivitySynchronizationType).
+   */
+  CUpti_ActivitySynchronizationType type;
+
+  /**
+   * The start timestamp for the function, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the function.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the function, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the function.
+   */
+  uint64_t end;
+
+  /**
+   * The correlation ID of the API to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID of the context for which the synchronization API is called.
+   * In case of context synchronization API it is the context ID for which the API is called.
+   * In case of stream/event synchronization it is the ID of the context where the stream/event was created.
+   */
+  uint32_t contextId;
+
+  /**
+   * The compute stream for which the synchronization API is called.
+   * A CUPTI_SYNCHRONIZATION_INVALID_VALUE value indicates the field is not applicable for this record.
+   * Not valid for cuCtxSynchronize, cuEventSynchronize.
+   */
+  uint32_t streamId;
+
+  /**
+   * The event ID for which the synchronization API is called.
+   * A CUPTI_SYNCHRONIZATION_INVALID_VALUE value indicates the field is not applicable for this record.
+   * Not valid for cuCtxSynchronize, cuStreamSynchronize.
+   */
+  uint32_t cudaEventId;
+
+  /**
+   * A unique ID to associate event synchronization records
+   * with the latest CUDA Event record. A similar field exists
+   * in CUpti_ActivityCudaEvent2 to associate the synchronization
+   * record with the CUDA Event record.
+   *
+   * The same CUDA event can be used multiple times, so the
+   * event ID alone is not unique enough to correlate the synchronization
+   * record with the latest CUDA Event record.
+   * This field is unique and can be used to do the required
+   * correlation.
+   *
+   * A CUPTI_SYNCHRONIZATION_INVALID_VALUE value indicates that
+   * the field is not applicable for this record.
+   * Valid only for synchronization records related to CUDA Events.
+   */
+  uint64_t cudaEventSyncId;
+
+  /**
+   * The return value for the synchronization record.
+   * Use cuptiActivityEnableAllSyncRecords API to enable/disable
+   * collection of synchronization records with return value being
+   * non-zero. This will be a CUresult value.
+   */
+  uint32_t returnValue;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivitySynchronization2;
+
+/**
+ * \brief The activity record for source-level sass/source
+ * line-by-line correlation.
+ *
+ * This activity records source level sass/source correlation
+ * information.
+ * (CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityInstructionCorrelation;
+
+/**
+ * \brief The OpenACC event kind for OpenACC activity records.
+ *
+ * \see CUpti_ActivityKindOpenAcc
+ */
+typedef enum {
+  CUPTI_OPENACC_EVENT_KIND_INVALID              = 0,
+  CUPTI_OPENACC_EVENT_KIND_DEVICE_INIT          = 1,
+  CUPTI_OPENACC_EVENT_KIND_DEVICE_SHUTDOWN      = 2,
+  CUPTI_OPENACC_EVENT_KIND_RUNTIME_SHUTDOWN     = 3,
+  CUPTI_OPENACC_EVENT_KIND_ENQUEUE_LAUNCH       = 4,
+  CUPTI_OPENACC_EVENT_KIND_ENQUEUE_UPLOAD       = 5,
+  CUPTI_OPENACC_EVENT_KIND_ENQUEUE_DOWNLOAD     = 6,
+  CUPTI_OPENACC_EVENT_KIND_WAIT                 = 7,
+  CUPTI_OPENACC_EVENT_KIND_IMPLICIT_WAIT        = 8,
+  CUPTI_OPENACC_EVENT_KIND_COMPUTE_CONSTRUCT    = 9,
+  CUPTI_OPENACC_EVENT_KIND_UPDATE               = 10,
+  CUPTI_OPENACC_EVENT_KIND_ENTER_DATA           = 11,
+  CUPTI_OPENACC_EVENT_KIND_EXIT_DATA            = 12,
+  CUPTI_OPENACC_EVENT_KIND_CREATE               = 13,
+  CUPTI_OPENACC_EVENT_KIND_DELETE               = 14,
+  CUPTI_OPENACC_EVENT_KIND_ALLOC                = 15,
+  CUPTI_OPENACC_EVENT_KIND_FREE                 = 16,
+  CUPTI_OPENACC_EVENT_KIND_FORCE_INT            = 0x7fffffff
+} CUpti_OpenAccEventKind;
+
+/**
+ * \brief The OpenACC parent construct kind for OpenACC activity records.
+ */
+typedef enum {
+  CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN          = 0,
+  CUPTI_OPENACC_CONSTRUCT_KIND_PARALLEL         = 1,
+  CUPTI_OPENACC_CONSTRUCT_KIND_KERNELS          = 2,
+  CUPTI_OPENACC_CONSTRUCT_KIND_LOOP             = 3,
+  CUPTI_OPENACC_CONSTRUCT_KIND_DATA             = 4,
+  CUPTI_OPENACC_CONSTRUCT_KIND_ENTER_DATA       = 5,
+  CUPTI_OPENACC_CONSTRUCT_KIND_EXIT_DATA        = 6,
+  CUPTI_OPENACC_CONSTRUCT_KIND_HOST_DATA        = 7,
+  CUPTI_OPENACC_CONSTRUCT_KIND_ATOMIC           = 8,
+  CUPTI_OPENACC_CONSTRUCT_KIND_DECLARE          = 9,
+  CUPTI_OPENACC_CONSTRUCT_KIND_INIT             = 10,
+  CUPTI_OPENACC_CONSTRUCT_KIND_SHUTDOWN         = 11,
+  CUPTI_OPENACC_CONSTRUCT_KIND_SET              = 12,
+  CUPTI_OPENACC_CONSTRUCT_KIND_UPDATE           = 13,
+  CUPTI_OPENACC_CONSTRUCT_KIND_ROUTINE          = 14,
+  CUPTI_OPENACC_CONSTRUCT_KIND_WAIT             = 15,
+  CUPTI_OPENACC_CONSTRUCT_KIND_RUNTIME_API      = 16,
+  CUPTI_OPENACC_CONSTRUCT_KIND_FORCE_INT        = 0x7fffffff
+
+} CUpti_OpenAccConstructKind;
+/** \brief The OpenMP event kind for OpenMP activity records (\see CUpti_ActivityOpenMp). */
+typedef enum {
+  CUPTI_OPENMP_EVENT_KIND_INVALID               = 0,
+  CUPTI_OPENMP_EVENT_KIND_PARALLEL              = 1,
+  CUPTI_OPENMP_EVENT_KIND_TASK                  = 2,
+  CUPTI_OPENMP_EVENT_KIND_THREAD                = 3,
+  CUPTI_OPENMP_EVENT_KIND_IDLE                  = 4,
+  CUPTI_OPENMP_EVENT_KIND_WAIT_BARRIER          = 5,
+  CUPTI_OPENMP_EVENT_KIND_WAIT_TASKWAIT         = 6,
+  CUPTI_OPENMP_EVENT_KIND_FORCE_INT             = 0x7fffffff
+} CUpti_OpenMpEventKind;
+
+/**
+ * \brief The base activity record for OpenAcc records.
+ *
+ * The OpenACC activity API part uses a CUpti_ActivityOpenAcc as a generic
+ * representation for any OpenACC activity. The 'kind' field is used to determine the
+ * specific activity kind, and from that the CUpti_ActivityOpenAcc object can
+ * be cast to the specific OpenACC activity record type appropriate for that kind.
+ *
+ * Note that all OpenACC activity record types are padded and aligned to
+ * ensure that each member of the record is naturally aligned.
+ *
+ * \see CUpti_ActivityKind
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The kind of this activity.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind)
+   */
+  CUpti_OpenAccEventKind eventKind;
+
+  /**
+   * CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind)
+   *
+   * Note that for applications using PGI OpenACC runtime < 16.1, this
+   * will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN.
+   */
+  CUpti_OpenAccConstructKind parentConstruct;
+
+  /**
+   * Version number
+   */
+  uint32_t version;
+
+  /**
+   * 1 for any implicit event, such as an implicit wait at a synchronous data construct
+   * 0 otherwise
+   */
+  uint32_t implicit;
+
+  /**
+   * Device type
+   */
+  uint32_t deviceType;
+
+  /**
+   * Device number
+   */
+  uint32_t deviceNumber;
+
+  /**
+   * ThreadId
+   */
+  uint32_t threadId;
+
+  /**
+   * Value of async() clause of the corresponding directive
+   */
+  uint64_t async;
+
+  /**
+   * Internal asynchronous queue number used
+   */
+  uint64_t asyncMap;
+
+  /**
+   * The line number of the directive or program construct or the starting line
+   * number of the OpenACC construct corresponding to the event.
+   * A zero value means the line number is not known.
+   */
+  uint32_t lineNo;
+
+  /**
+   * For an OpenACC construct, this contains the line number of the end
+   * of the construct. A zero value means the line number is not known.
+   */
+  uint32_t endLineNo;
+
+  /**
+   * The line number of the first line of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcLineNo;
+
+  /**
+   * The last line number of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcEndLineNo;
+
+  /**
+   * CUPTI start timestamp
+   */
+  uint64_t start;
+
+  /**
+   * CUPTI end timestamp
+   */
+  uint64_t end;
+
+  /**
+   * CUDA device id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuDeviceId;
+
+  /**
+   * CUDA context id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuContextId;
+
+  /**
+   * CUDA stream id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuStreamId;
+
+  /**
+   * The ID of the process where the OpenACC activity is executing.
+   */
+  uint32_t cuProcessId;
+
+  /**
+   * The ID of the thread where the OpenACC activity is executing.
+   */
+  uint32_t cuThreadId;
+
+  /**
+   * The OpenACC correlation ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   * If not 0, it uniquely identifies this record. It is identical to the
+   * externalId in the preceding external correlation record of type
+   * CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC.
+   */
+  uint32_t externalId;
+
+  /**
+   * A pointer to a null-terminated string containing the name of or path to
+   * the source file, if known, or a null pointer if not.
+   */
+  const char *srcFile;
+
+  /**
+   * A pointer to a null-terminated string containing the name of the
+   * function in which the event occurred.
+   */
+  const char *funcName;
+} CUpti_ActivityOpenAcc;
+
+/**
+ * \brief The activity record for OpenACC data.
+ *
+ * (CUPTI_ACTIVITY_KIND_OPENACC_DATA).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OPENACC_DATA.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind)
+   */
+  CUpti_OpenAccEventKind eventKind;
+
+  /**
+   * CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind)
+   *
+   * Note that for applications using PGI OpenACC runtime < 16.1, this
+   * will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN.
+   */
+  CUpti_OpenAccConstructKind parentConstruct;
+
+  /**
+   * Version number
+   */
+  uint32_t version;
+
+  /**
+   * 1 for any implicit event, such as an implicit wait at a synchronous data construct
+   * 0 otherwise
+   */
+  uint32_t implicit;
+
+  /**
+   * Device type
+   */
+  uint32_t deviceType;
+
+  /**
+   * Device number
+   */
+  uint32_t deviceNumber;
+
+  /**
+   * ThreadId
+   */
+  uint32_t threadId;
+
+  /**
+   * Value of async() clause of the corresponding directive
+   */
+  uint64_t async;
+
+  /**
+   * Internal asynchronous queue number used
+   */
+  uint64_t asyncMap;
+
+  /**
+   * The line number of the directive or program construct or the starting line
+   * number of the OpenACC construct corresponding to the event.
+   * A zero value means the line number is not known.
+   */
+  uint32_t lineNo;
+
+  /**
+   * For an OpenACC construct, this contains the line number of the end
+   * of the construct. A zero value means the line number is not known.
+   */
+  uint32_t endLineNo;
+
+  /**
+   * The line number of the first line of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcLineNo;
+
+  /**
+   * The last line number of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcEndLineNo;
+
+  /**
+   * CUPTI start timestamp
+   */
+  uint64_t start;
+
+  /**
+   * CUPTI end timestamp
+   */
+  uint64_t end;
+
+  /**
+   * CUDA device id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuDeviceId;
+
+  /**
+   * CUDA context id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuContextId;
+
+  /**
+   * CUDA stream id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuStreamId;
+
+  /**
+   * The ID of the process where the OpenACC activity is executing.
+   */
+  uint32_t cuProcessId;
+
+  /**
+   * The ID of the thread where the OpenACC activity is executing.
+   */
+  uint32_t cuThreadId;
+
+  /**
+   * The OpenACC correlation ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   * If not 0, it uniquely identifies this record. It is identical to the
+   * externalId in the preceding external correlation record of type
+   * CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC.
+   */
+  uint32_t externalId;
+
+  /**
+   * A pointer to a null-terminated string containing the name of or path to
+   * the source file, if known, or a null pointer if not.
+   */
+  const char *srcFile;
+
+  /**
+   * A pointer to a null-terminated string containing the name of the
+   * function in which the event occurred.
+   */
+  const char *funcName;
+
+  /* --- end of common CUpti_ActivityOpenAcc part --- */
+
+  /**
+   * Number of bytes
+   */
+  uint64_t bytes;
+
+  /**
+   * Host pointer if available
+   */
+  uint64_t hostPtr;
+
+  /**
+   * Device pointer if available
+   */
+  uint64_t devicePtr;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is not defined.
+   */
+  uint32_t pad1;
+#endif
+
+  /**
+   * A pointer to a null-terminated string containing the name of the variable
+   * for which this event is triggered, if known, or a null pointer if not.
+   */
+  const char *varName;
+
+} CUpti_ActivityOpenAccData;
+
+/**
+ * \brief The activity record for OpenACC launch.
+ *
+ * (CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind)
+   */
+  CUpti_OpenAccEventKind eventKind;
+
+  /**
+   * CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind)
+   *
+   * Note that for applications using PGI OpenACC runtime < 16.1, this
+   * will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN.
+   */
+  CUpti_OpenAccConstructKind parentConstruct;
+
+  /**
+   * Version number
+   */
+  uint32_t version;
+
+  /**
+   * 1 for any implicit event, such as an implicit wait at a synchronous data construct
+   * 0 otherwise
+   */
+  uint32_t implicit;
+
+  /**
+   * Device type
+   */
+  uint32_t deviceType;
+
+  /**
+   * Device number
+   */
+  uint32_t deviceNumber;
+
+  /**
+   * ThreadId
+   */
+  uint32_t threadId;
+
+  /**
+   * Value of async() clause of the corresponding directive
+   */
+  uint64_t async;
+
+  /**
+   * Internal asynchronous queue number used
+   */
+  uint64_t asyncMap;
+
+  /**
+   * The line number of the directive or program construct or the starting line
+   * number of the OpenACC construct corresponding to the event.
+   * A zero value means the line number is not known.
+   */
+  uint32_t lineNo;
+
+  /**
+   * For an OpenACC construct, this contains the line number of the end
+   * of the construct. A zero value means the line number is not known.
+   */
+  uint32_t endLineNo;
+
+  /**
+   * The line number of the first line of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcLineNo;
+
+  /**
+   * The last line number of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcEndLineNo;
+
+  /**
+   * CUPTI start timestamp
+   */
+  uint64_t start;
+
+  /**
+   * CUPTI end timestamp
+   */
+  uint64_t end;
+
+  /**
+   * CUDA device id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuDeviceId;
+
+  /**
+   * CUDA context id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuContextId;
+
+  /**
+   * CUDA stream id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuStreamId;
+
+  /**
+   * The ID of the process where the OpenACC activity is executing.
+   */
+  uint32_t cuProcessId;
+
+  /**
+   * The ID of the thread where the OpenACC activity is executing.
+   */
+  uint32_t cuThreadId;
+
+  /**
+   * The OpenACC correlation ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   * If not 0, it uniquely identifies this record. It is identical to the
+   * externalId in the preceding external correlation record of type
+   * CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC.
+   */
+  uint32_t externalId;
+
+  /**
+   * A pointer to a null-terminated string containing the name of or path to
+   * the source file, if known, or a null pointer if not.
+   */
+  const char *srcFile;
+
+  /**
+   * A pointer to a null-terminated string containing the name of the
+   * function in which the event occurred.
+   */
+  const char *funcName;
+
+  /* --- end of common CUpti_ActivityOpenAcc part --- */
+
+  /**
+   * The number of gangs created for this kernel launch
+   */
+  uint64_t numGangs;
+
+  /**
+   * The number of workers created for this kernel launch
+   */
+  uint64_t numWorkers;
+
+  /**
+   * The number of vector lanes created for this kernel launch
+   */
+  uint64_t vectorLength;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is not defined.
+   */
+  uint32_t pad1;
+#endif
+
+  /**
+   * A pointer to a null-terminated string containing the name of the
+   * kernel being launched, if known, or a null pointer if not.
+   */
+  const char *kernelName;
+
+} CUpti_ActivityOpenAccLaunch;
+
+/**
+ * \brief The activity record for OpenACC other.
+ *
+ * (CUPTI_ACTIVITY_KIND_OPENACC_OTHER).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OPENACC_OTHER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind)
+   */
+  CUpti_OpenAccEventKind eventKind;
+
+  /**
+   * CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind)
+   *
+   * Note that for applications using PGI OpenACC runtime < 16.1, this
+   * will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN.
+   */
+  CUpti_OpenAccConstructKind parentConstruct;
+
+  /**
+   * Version number
+   */
+  uint32_t version;
+
+  /**
+   * 1 for any implicit event, such as an implicit wait at a synchronous data construct
+   * 0 otherwise
+   */
+  uint32_t implicit;
+
+  /**
+   * Device type
+   */
+  uint32_t deviceType;
+
+  /**
+   * Device number
+   */
+  uint32_t deviceNumber;
+
+  /**
+   * ThreadId
+   */
+  uint32_t threadId;
+
+  /**
+   * Value of async() clause of the corresponding directive
+   */
+  uint64_t async;
+
+  /**
+   * Internal asynchronous queue number used
+   */
+  uint64_t asyncMap;
+
+  /**
+   * The line number of the directive or program construct or the starting line
+   * number of the OpenACC construct corresponding to the event.
+   * A zero value means the line number is not known.
+   */
+  uint32_t lineNo;
+
+  /**
+   * For an OpenACC construct, this contains the line number of the end
+   * of the construct. A zero value means the line number is not known.
+   */
+  uint32_t endLineNo;
+
+  /**
+   * The line number of the first line of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcLineNo;
+
+  /**
+   * The last line number of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcEndLineNo;
+
+  /**
+   * CUPTI start timestamp
+   */
+  uint64_t start;
+
+  /**
+   * CUPTI end timestamp
+   */
+  uint64_t end;
+
+  /**
+   * CUDA device id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuDeviceId;
+
+  /**
+   * CUDA context id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuContextId;
+
+  /**
+   * CUDA stream id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuStreamId;
+
+  /**
+   * The ID of the process where the OpenACC activity is executing.
+   */
+  uint32_t cuProcessId;
+
+  /**
+   * The ID of the thread where the OpenACC activity is executing.
+   */
+  uint32_t cuThreadId;
+
+  /**
+   * The OpenACC correlation ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   * If not 0, it uniquely identifies this record. It is identical to the
+   * externalId in the preceding external correlation record of type
+   * CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC.
+   */
+  uint32_t externalId;
+
+  /**
+   * A pointer to a null-terminated string containing the name of or path to
+   * the source file, if known, or a null pointer if not.
+   */
+  const char *srcFile;
+
+  /**
+   * A pointer to a null-terminated string containing the name of the
+   * function in which the event occurred.
+   */
+  const char *funcName;
+
+  /* --- end of common CUpti_ActivityOpenAcc part --- */
+} CUpti_ActivityOpenAccOther;
+
+/**
+ * \brief The base activity record for OpenMP records.
+ *
+ * \see CUpti_ActivityKind
+ */
+typedef struct PACKED_ALIGNMENT {
+
+  /**
+   * The kind of this activity.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * CUPTI OpenMP event kind (\see CUpti_OpenMpEventKind)
+   */
+  CUpti_OpenMpEventKind eventKind;
+
+  /**
+   * Version number
+   */
+  uint32_t version;
+
+  /**
+   * ThreadId
+   */
+  uint32_t threadId;
+
+  /**
+   * CUPTI start timestamp
+   */
+  uint64_t start;
+
+  /**
+   * CUPTI end timestamp
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the process where the OpenMP activity is executing.
+   */
+  uint32_t cuProcessId;
+
+  /**
+   * The ID of the thread where the OpenMP activity is executing.
+   */
+  uint32_t cuThreadId;
+} CUpti_ActivityOpenMp;
+
+/**
+ * \brief The kind of external APIs supported for correlation.
+ *
+ * Custom correlation kinds are reserved for usage in external tools.
+ *
+ * \see CUpti_ActivityExternalCorrelation
+ */
+typedef enum {
+    CUPTI_EXTERNAL_CORRELATION_KIND_INVALID              = 0,
+
+    /**
+     * The external API is unknown to CUPTI.
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN              = 1,
+
+    /**
+     * The external API is OpenACC.
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC              = 2,
+
+    /**
+     * The external API is custom0.
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM0              = 3,
+
+    /**
+     * The external API is custom1.
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1              = 4,
+
+    /**
+     * The external API is custom2.
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM2              = 5,
+
+    /**
+     * Not a correlation kind; add new kinds before this line.
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_SIZE,
+
+    CUPTI_EXTERNAL_CORRELATION_KIND_FORCE_INT            = 0x7fffffff
+} CUpti_ExternalCorrelationKind;
+
+/**
+ * \brief The activity record for correlation with external records.
+ *
+ * This activity record correlates native CUDA records (e.g. CUDA Driver API,
+ * kernels, memcpys, ...) with records from external APIs such as OpenACC
+ * (CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION).
+ *
+ * \see CUpti_ActivityKind
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The kind of this activity (CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION).
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of external API this record is correlated to.
+   */
+  CUpti_ExternalCorrelationKind externalKind;
+
+  /**
+   * The correlation ID of the associated non-CUDA API record.
+   * The exact field in the associated external record depends
+   * on that record's activity kind (\see externalKind).
+   */
+  uint64_t externalId;
+
+  /**
+   * The correlation ID of the associated CUDA driver or runtime API record.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t reserved;
+} CUpti_ActivityExternalCorrelation;
+
+/**
+* \brief The type of a device connected to NVLink.
+*/
+typedef enum {
+    CUPTI_DEV_TYPE_INVALID = 0,
+
+    /**
+    * The device type is GPU.
+    */
+    CUPTI_DEV_TYPE_GPU = 1,
+
+    /**
+    * The device type is an NVLink processing unit (NPU) in the CPU.
+    */
+    CUPTI_DEV_TYPE_NPU = 2,
+
+    CUPTI_DEV_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_DevType;
+
+/**
+* \brief NVLink information.
+*
+* This structure gives the capabilities of each logical NVLink connection between two devices,
+* GPU<->GPU or GPU<->CPU, which can be used to understand the topology.
+*/
+
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * NVLink version.
+   */
+  uint32_t nvlinkVersion;
+
+  /**
+   * Type of device 0 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev0;
+
+  /**
+   * Type of device 1 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev1;
+
+  /**
+  * If typeDev0 is CUPTI_DEV_TYPE_GPU, use uuidDev: UUID for device 0. \ref CUpti_ActivityDevice5.
+  * If typeDev0 is CUPTI_DEV_TYPE_NPU, use struct npu for the NPU.
+  */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev0;
+
+  /**
+  * If typeDev1 is CUPTI_DEV_TYPE_GPU, use uuidDev: UUID for device 1. \ref CUpti_ActivityDevice5.
+  * If typeDev1 is CUPTI_DEV_TYPE_NPU, use struct npu for the NPU.
+  */
+  union {
+    CUuuid uuidDev;
+    struct {
+
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev1;
+
+  /**
+   * Flag giving the capabilities of the link \see CUpti_LinkFlag
+   */
+  uint32_t flag;
+
+  /**
+   * Number of physical NVLinks present between the two devices.
+   */
+  uint32_t  physicalNvLinkCount;
+
+  /**
+   * Port numbers for a maximum of 32 NVLinks connected to device 0.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of an invalid/unknown port number, this field will be set
+   * to the value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical links and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t  portDev0[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Port numbers for a maximum of 32 NVLinks connected to device 1.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of an invalid/unknown port number, this field will be set
+   * to the value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical links and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t  portDev1[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Bandwidth of NVLink in kbytes/sec.
+   */
+  uint64_t  bandwidth;
+
+  /**
+   * NVSwitch is connected as an intermediate node.
+   */
+  uint8_t nvswitchConnected;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[7];
+} CUpti_ActivityNvLink4;
+
+#define CUPTI_MAX_GPUS 32
+/**
+ * \brief Field to differentiate whether a PCIE activity record
+ * is for a GPU or a PCI bridge.
+ */
+typedef enum {
+    /**
+     * PCIE GPU record
+     */
+    CUPTI_PCIE_DEVICE_TYPE_GPU       = 0,
+
+    /**
+     * PCIE Bridge record
+     */
+    CUPTI_PCIE_DEVICE_TYPE_BRIDGE    = 1,
+
+    CUPTI_PCIE_DEVICE_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_PcieDeviceType;
+
+/**
+ * \brief PCI devices information required to construct topology
+ *
+ * This structure gives the capabilities of a GPU or PCI bridge connected to the PCIE bus,
+ * which can be used to understand the topology.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PCIE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * Type of device in topology, \ref CUpti_PcieDeviceType. If type is
+   * CUPTI_PCIE_DEVICE_TYPE_GPU use id.devId and attr.gpuAttr; if type is
+   * CUPTI_PCIE_DEVICE_TYPE_BRIDGE use id.bridgeId and attr.bridgeAttr.
+   */
+  CUpti_PcieDeviceType type;
+
+  /**
+   * A unique identifier for GPU or Bridge in Topology
+   */
+  union {
+    /**
+     * GPU device ID
+     */
+    CUdevice devId;
+
+    /**
+     * A unique identifier for Bridge in the Topology
+     */
+    uint32_t bridgeId;
+  } id;
+
+  /**
+   * Domain for the GPU or Bridge, required to identify which PCIE bus it belongs to in
+   * multiple NUMA systems.
+   */
+  uint32_t domain;
+
+  /**
+   * PCIE Generation of GPU or Bridge.
+   */
+  uint16_t pcieGeneration;
+
+  /**
+   * Link rate of the GPU or bridge in gigatransfers per second (GT/s)
+   */
+  uint16_t linkRate;
+
+  /**
+   * Link width of the GPU or bridge
+   */
+  uint16_t linkWidth;
+
+  /**
+   * Upstream bus ID for the GPU or PCI bridge. Required to identify which bus it is
+   * connected to in the topology.
+   */
+  uint16_t upstreamBus;
+
+  /**
+   * Attributes for more information about GPU (gpuAttr) or PCI Bridge (bridgeAttr)
+   */
+  union {
+    struct {
+      /**
+       * UUID for the device. \ref CUpti_ActivityDevice5.
+       */
+      CUuuid uuidDev;
+
+      /**
+       * CUdevices with which this device has P2P capability.
+       * This can also be obtained by querying the cuDeviceCanAccessPeer or
+       * cudaDeviceCanAccessPeer APIs.
+       */
+      CUdevice peerDev[CUPTI_MAX_GPUS];
+    } gpuAttr;
+
+    struct {
+      /**
+       * The downstream bus number, used to search downstream devices/bridges connected
+       * to this bridge.
+       */
+      uint16_t secondaryBus;
+
+      /**
+       * Device ID of the bridge
+       */
+      uint16_t deviceId;
+
+      /**
+       * Vendor ID of the bridge
+       */
+      uint16_t vendorId;
+
+      /**
+       * Padding for alignment
+       */
+      uint16_t pad0;
+    } bridgeAttr;
+  } attr;
+} CUpti_ActivityPcie;
+
+/**
+ * \brief PCIE Generation.
+ *
+ * Enumeration of PCIE generations for the
+ * PCIE activity record field pcieGeneration.
+ */
+typedef enum {
+  /**
+  * PCIE Generation 1
+  */
+  CUPTI_PCIE_GEN_GEN1       = 1,
+
+  /**
+  * PCIE Generation 2
+  */
+  CUPTI_PCIE_GEN_GEN2       = 2,
+
+  /**
+  * PCIE Generation 3
+  */
+  CUPTI_PCIE_GEN_GEN3       = 3,
+
+  /**
+  * PCIE Generation 4
+  */
+  CUPTI_PCIE_GEN_GEN4       = 4,
+
+  /**
+  * PCIE Generation 5
+  */
+  CUPTI_PCIE_GEN_GEN5       = 5,
+
+  /**
+  * PCIE Generation 6
+  */
+  CUPTI_PCIE_GEN_GEN6       = 6,
+
+  CUPTI_PCIE_GEN_FORCE_INT  = 0x7fffffff
+} CUpti_PcieGen;
+
+
+/**
+ * \brief The activity record for an instantaneous CUPTI event.
+ *
+ * This activity record represents a CUPTI event value
+ * (CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT) sampled at a particular instant.
+ * This activity record kind is not produced by the activity API but is
+ * included for completeness and ease-of-use. Profiler frameworks built on
+ * top of CUPTI that collect event data at a particular time may choose to
+ * use this type to store the collected event data.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The event ID.
+   */
+  CUpti_EventID id;
+
+  /**
+   * The event value.
+   */
+  uint64_t value;
+
+  /**
+   * The timestamp at which the event is sampled.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t reserved;
+} CUpti_ActivityInstantaneousEvent;
+
+/**
+ * \brief The activity record for an instantaneous CUPTI event
+ * with event domain instance information.
+ *
+ * This activity record represents a CUPTI event value for a
+ * specific event domain instance
+ * (CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE) sampled at a particular instant.
+ * This activity record kind is not produced by the activity API but is
+ * included for completeness and ease-of-use. Profiler frameworks built on
+ * top of CUPTI that collect event data may choose to use this type to store the
+ * collected event data. This activity record should be used when
+ * event domain instance information needs to be associated with the
+ * event.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The event ID.
+   */
+  CUpti_EventID id;
+
+  /**
+   * The event value.
+   */
+  uint64_t value;
+
+  /**
+   * The timestamp at which the event is sampled.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The event domain instance.
+   */
+  uint8_t instance;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[3];
+} CUpti_ActivityInstantaneousEventInstance;
+
+/**
+ * \brief The activity record for an instantaneous CUPTI metric.
+ *
+ * This activity record represents the collection of a CUPTI metric
+ * value (CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC) at a particular instant.
+ * This activity record kind is not produced by the activity API but
+ * is included for completeness and ease-of-use. Profiler frameworks built
+ * on top of CUPTI that collect metric data may choose to use this type to
+ * store the collected metric data.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The metric ID.
+   */
+  CUpti_MetricID id;
+
+  /**
+   * The metric value.
+   */
+  CUpti_MetricValue value;
+
+  /**
+   * The timestamp at which the metric is sampled.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The properties of this metric. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[3];
+} CUpti_ActivityInstantaneousMetric;
+
+/**
+ * \brief The instantaneous activity record for a CUPTI metric with instance
+ * information.
+ *
+ * This activity record represents a CUPTI metric value
+ * for a specific metric domain instance
+ * (CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE) sampled at a particular time. This
+ * activity record kind is not produced by the activity API but is included for
+ * completeness and ease-of-use. Profiler frameworks built on top of
+ * CUPTI that collect metric data may choose to use this type to store
+ * the collected metric data. This activity record should be used when
+ * metric domain instance information needs to be associated with the
+ * metric.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The metric ID.
+   */
+  CUpti_MetricID id;
+
+  /**
+   * The metric value.
+   */
+  CUpti_MetricValue value;
+
+  /**
+   * The timestamp at which the metric is sampled.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The properties of this metric. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The metric domain instance.
+   */
+  uint8_t instance;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[2];
+} CUpti_ActivityInstantaneousMetricInstance;
+
+/**
+ * \brief The types of JIT entry.
+ *
+ * To be used in the jitEntryType field of the JIT activity record.
+ */
+typedef enum {
+  CUPTI_ACTIVITY_JIT_ENTRY_INVALID= 0,
+
+  /**
+  * PTX to CUBIN.
+  */
+  CUPTI_ACTIVITY_JIT_ENTRY_PTX_TO_CUBIN = 1,
+
+  /**
+  * NVVM-IR to PTX.
+  */
+  CUPTI_ACTIVITY_JIT_ENTRY_NVVM_IR_TO_PTX = 2,
+
+  CUPTI_ACTIVITY_JIT_ENTRY_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_ActivityJitEntryType;
+
+/**
+ * \brief The types of JIT compilation operations.
+ *
+ * To be used in the jitOperationType field of the JIT activity record.
+ */
+
+typedef enum {
+  CUPTI_ACTIVITY_JIT_OPERATION_INVALID = 0,
+  /**
+  * Loaded from the compute cache.
+  */
+  CUPTI_ACTIVITY_JIT_OPERATION_CACHE_LOAD = 1,
+
+  /**
+  * Stored in the compute cache.
+  */
+  CUPTI_ACTIVITY_JIT_OPERATION_CACHE_STORE = 2,
+
+  /**
+  * JIT compilation.
+  */
+  CUPTI_ACTIVITY_JIT_OPERATION_COMPILE = 3,
+
+  CUPTI_ACTIVITY_JIT_OPERATION_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_ActivityJitOperationType;
+
+/**
+ * \brief The activity record for JIT operations.
+ * This activity represents the JIT operations (compile, load, store) of a CUmodule
+ * from the Compute Cache.
+ * Gives the exact hashed path of where the cached module is loaded from,
+ * or where the module will be stored after Just-In-Time (JIT) compilation.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_JIT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The JIT entry type.
+   */
+  CUpti_ActivityJitEntryType jitEntryType;
+
+  /**
+   * The JIT operation type.
+   */
+  CUpti_ActivityJitOperationType jitOperationType;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The start timestamp for the JIT operation, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the JIT operation.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the JIT operation, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the JIT operation.
+   */
+  uint64_t end;
+
+  /**
+   * The correlation ID of the JIT operation to which
+   * this record belongs. Each JIT operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the JIT operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Internal use.
+   */
+  uint32_t padding;
+
+  /**
+   * The correlation ID to correlate JIT compilation, load and store operations.
+   * Each JIT compilation unit is assigned a unique correlation ID
+   * at the time of the JIT compilation. This correlation ID can be used
+   * to find the matching JIT cache load/store records.
+   */
+  uint64_t jitOperationCorrelationId;
+
+  /**
+   * The size of the compute cache.
+   */
+  uint64_t cacheSize;
+
+  /**
+   * The path where the fat binary is cached.
+   */
+  const char* cachePath;
+
+  /**
+   * The ID of the process where the JIT operation is executing.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the thread where the JIT operation is executing.
+   */
+  uint32_t threadId;
+} CUpti_ActivityJit2;
+
+
+/**
+ * \brief The activity record for trace of graph execution.
+ *
+ * This activity record represents execution for a graph without giving visibility
+ * about the execution of its nodes. This is intended to reduce overheads in tracing
+ * each node. The activity kind is CUPTI_ACTIVITY_KIND_GRAPH_TRACE.
+ */
+typedef struct {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GRAPH_TRACE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The correlation ID of the graph launch. Each graph launch is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the graph.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The start timestamp for the graph execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the graph.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the graph execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the graph.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the first node of the graph is executed.
+   * If this is INT_MAX, then the start is on the host.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The unique ID of the graph that is launched.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the context where the first node of the graph is executed.
+   * If this is INT_MAX, then the start is on the host.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the graph is being launched.
+   */
+  uint32_t streamId;
+
+  /**
+   * This field is reserved for internal use.
+   */
+  void *reserved;
+
+  /**
+   * The ID of the device where the last node of the graph is executed.
+   */
+  uint32_t endDeviceId;
+
+  /**
+   * The ID of the context where the last node of the graph is executed.
+   */
+  uint32_t endContextId;
+} CUpti_ActivityGraphTrace2;
+
+/**
+ * \brief The launch mode for device graph execution (\see CUpti_ActivityDeviceGraphTrace).
+ */
+typedef enum {
+    CUPTI_DEVICE_GRAPH_LAUNCH_MODE_INVALID = 0,
+    CUPTI_DEVICE_GRAPH_LAUNCH_MODE_FIRE_AND_FORGET = 1,
+    CUPTI_DEVICE_GRAPH_LAUNCH_MODE_TAIL = 2,
+    CUPTI_DEVICE_GRAPH_LAUNCH_MODE_FIRE_AND_FORGET_AS_SIBLING = 3,
+} CUpti_DeviceGraphLaunchMode;
+
+/**
+ * \brief The activity record for trace of device graph execution.
+ *
+ * This activity record represents execution for a device-launched graph without giving visibility
+ * about the execution of its nodes. This is intended to reduce overheads in tracing
+ * each node. The activity kind is CUPTI_ACTIVITY_KIND_DEVICE_GRAPH_TRACE.
+ */
+typedef struct {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE_GRAPH_TRACE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID of the device where the first node of the graph is executed.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The start timestamp for the graph execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the graph.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the graph execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the graph.
+   */
+  uint64_t end;
+
+  /**
+   * The unique ID of the graph that is launched.
+   */
+  uint32_t graphId;
+
+  /**
+   * The unique ID of the graph that has launched this graph.
+   */
+  uint32_t launcherGraphId;
+
+  /**
+   * The type of launch. See \ref CUpti_DeviceGraphLaunchMode.
+   */
+  uint32_t deviceLaunchMode;
+
+  /**
+   * The ID of the context where the first node of the graph is executed.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the graph is being launched.
+   */
+  uint64_t streamId;
+
+  /**
+   * This field is reserved for internal use.
+   */
+  void *reserved;
+
+} CUpti_ActivityDeviceGraphTrace;
+
+/**
+ * \brief The activity record for trace of decompression operations.
+ *
+ * This activity record represents execution for a batch of decompression operations.
+ * The activity kind is CUPTI_ACTIVITY_KIND_MEM_DECOMPRESS.
+ */
+typedef struct {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEM_DECOMPRESS.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID of the device.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the HW channel on which the operation is occurring.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel.
+   */
+  CUpti_ChannelType channelType;
+
+  /**
+   * The correlation ID of the decompression operations. Each operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The number of operations in the batch.
+   */
+  uint32_t numberOfOperations;
+
+  /**
+   * The number of bytes to be read and decompressed in the
+   * batch operation.
+   */
+  uint64_t sourceBytes;
+
+  /**
+   * This field is reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The start timestamp.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the start time is unknown.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the end time is unknown.
+   */
+  uint64_t end;
+} CUpti_ActivityMemDecompress;
+
+END_PACKED_ALIGNMENT
+
+/**
+ * \brief Activity attributes.
+ *
+ * These attributes are used to control the behavior of the activity
+ * API.
+ */
+typedef enum {
+    /**
+     * The device memory size (in bytes) reserved for storing profiling data for concurrent
+     * kernels (activity kind \ref CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL), memcopies and memsets
+     * for each buffer on a context. The value is a size_t.
+     *
+     * There is a limit on how many device buffers can be allocated per context. User
+     * can query and set this limit using the attribute
+     * \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT.
+     * CUPTI doesn't pre-allocate all the buffers, it pre-allocates only as many
+     * buffers as set by the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_PRE_ALLOCATE_VALUE.
+     * When all of the data in a buffer is consumed, it is added to the reuse pool, and
+     * CUPTI picks a buffer from this pool when a new buffer is needed. Thus memory
+     * footprint does not scale with the kernel count. Applications with a high density
+     * of kernels, memcopies and memsets might cause CUPTI to allocate more device buffers.
+     * CUPTI allocates another buffer only when it runs out of the buffers in the
+     * reuse pool.
+     *
+     * Since buffer allocation happens in the main application thread, this might result
+     * in stalls in the critical path. CUPTI pre-allocates 3 buffers of the same size to
+     * mitigate this issue. User can query and set the pre-allocation limit using the
+     * attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_PRE_ALLOCATE_VALUE.
+     *
+     * Having larger buffer size leaves less device memory for the application.
+     * Having smaller buffer size increases the risk of dropping timestamps for
+     * records if too many kernels or memcopies or memsets are launched at one time.
+     *
+     * This value only applies to new buffer allocations. Set this value before initializing
+     * CUDA or before creating a context to ensure it is considered for the following allocations.
+     *
+     * The default value is 3200000 (~3MB) which can accommodate profiling data
+     * up to 100,000 kernels, memcopies and memsets combined.
+     *
+     * Note: Starting with the CUDA 12.0 Update 1 release, CUPTI allocates profiling buffer in the
+     * device memory by default as this might help in improving the performance of the
+     * tracing run. Refer to the description of the attribute
+     * \ref CUPTI_ACTIVITY_ATTR_MEM_ALLOCATION_TYPE_HOST_PINNED for more details.
+     * Size of the memory and maximum number of pools are still controlled by the attributes
+     * \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE and \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT.
+     *
+     * Note: The actual amount of device memory per buffer reserved by CUPTI might be larger.
+     */
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE                      = 0,
+
+    /**
+     * The device memory size (in bytes) reserved for storing profiling
+     * data for CDP operations for each buffer on a context. The
+     * value is a size_t.
+     *
+     * Having larger buffer size means fewer flush operations but
+     * consumes more device memory. This value only applies to new
+     * allocations.
+     *
+     * Set this value before initializing CUDA or before creating a
+     * context to ensure it is considered for the following allocations.
+     *
+     * The default value is 8388608 (8MB).
+     *
+     * Note: The actual amount of device memory per context reserved by
+     * CUPTI might be larger.
+     */
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE_CDP          = 1,
+
+    /**
+     * The maximum number of device memory buffers per context. The value is a size_t.
+     *
+     * For an application with high rate of kernel launches, memcopies and memsets having a bigger pool
+     * limit helps in timestamp collection for all these activities at the expense of a larger memory footprint.
+     * Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE
+     * for more details.
+     *
+     * Setting this value will not modify the number of memory buffers
+     * currently stored.
+     *
+     * Set this value before initializing CUDA to ensure the limit is
+     * not exceeded.
+     *
+     * The default value is 250.
+     */
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT                = 2,
+
+    /**
+     * This attribute is not supported starting with CUDA 12.3.
+     * CUPTI no longer uses profiling semaphore pool to store profiling data.
+     *
+     * There is a limit on how many semaphore pools can be allocated per context. User
+     * can query and set this limit using the attribute
+     * \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_LIMIT.
+     * CUPTI doesn't pre-allocate all the semaphore pools, it pre-allocates only as many
+     * semaphore pools as set by the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_PRE_ALLOCATE_VALUE.
+     * When all of the data in a semaphore pool is consumed, it is added to the reuse pool, and
+     * CUPTI picks a semaphore pool from the reuse pool when a new semaphore pool is needed. Thus memory
+     * footprint does not scale with the kernel count. Applications with a high density
+     * of kernels might cause CUPTI to allocate more semaphore pools.
+     * CUPTI allocates another semaphore pool only when it runs out of the semaphore pools in the
+     * reuse pool.
+     *
+     * Since semaphore pool allocation happens in the main application thread, this might result
+     * in stalls in the critical path. CUPTI pre-allocates 3 semaphore pools of the same size to
+     * mitigate this issue. User can query and set the pre-allocation limit using the
+     * attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_PRE_ALLOCATE_VALUE.
+     *
+     * Having larger semaphore pool size leaves less device memory for the application.
+     * Having smaller semaphore pool size increases the risk of dropping timestamps for
+     * kernel records if too many kernels are issued/launched at one time.
+     *
+     * This value only applies to new semaphore pool allocations. Set this value before initializing
+     * CUDA or before creating a context to ensure it is considered for the following allocations.
+     *
+     * The default value is 25000 which can accommodate profiling data for up to 25,000 kernels.
+     *
+     */
+    CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE           = 3,
+
+    /**
+     * This attribute is not supported starting with CUDA 12.3.
+     * CUPTI no longer uses profiling semaphore pool to store profiling data.
+     *
+     * The maximum number of profiling semaphore pools per context. The value is a size_t.
+     *
+     * Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE
+     * for more details.
+     *
+     * Set this value before initializing CUDA to ensure the limit is not exceeded.
+     *
+     * The default value is 250.
+     */
+    CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_LIMIT          = 4,
+
+    /**
+     * The flag to indicate whether user should provide activity buffer of zero value.
+     * The value is a uint8_t.
+     *
+     * If the value of this attribute is non-zero, user should provide
+     * a zero value buffer in the \ref CUpti_BuffersCallbackRequestFunc.
+     * If the user does not provide a zero value buffer after setting this to non-zero,
+     * the activity buffer may contain some uninitialized values when CUPTI returns it in
+     * \ref CUpti_BuffersCallbackCompleteFunc.
+     *
+     * If the value of this attribute is zero, CUPTI will initialize the user buffer
+     * received in the \ref CUpti_BuffersCallbackRequestFunc to zero before filling it.
+     * If the user sets this to zero, a few stalls may appear in critical path because CUPTI
+     * will zero out the buffer in the main thread.
+     * Set this value before returning from \ref CUpti_BuffersCallbackRequestFunc to
+     * ensure it is considered for all the subsequent user buffers.
+     *
+     * The default value is 0.
+     */
+    CUPTI_ACTIVITY_ATTR_ZEROED_OUT_ACTIVITY_BUFFER              = 5,
+
+    /**
+     * Number of device buffers to pre-allocate for a context during the initialization phase.
+     * The value is a size_t.
+     *
+     * Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE
+     * for details.
+     *
+     * This value must be less than the maximum number of device buffers set using
+     * the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT.
+     *
+     * Set this value before initializing CUDA or before creating a context to ensure it
+     * is considered by CUPTI.
+     *
+     * The default value is set to 3 to ping pong between these buffers (if possible).
+     */
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_PRE_ALLOCATE_VALUE        = 6,
+
+    /**
+     * This attribute is not supported starting with CUDA 12.3.
+     * CUPTI no longer uses profiling semaphore pool to store profiling data.
+     *
+     * Number of profiling semaphore pools to pre-allocate for a context during the
+     * initialization phase. The value is a size_t.
+     *
+     * Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE
+     * for details.
+     *
+     * This value must be less than the maximum number of profiling semaphore pools set
+     * using the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_LIMIT.
+     *
+     * Set this value before initializing CUDA or before creating a context to ensure it
+     * is considered by CUPTI.
+     *
+     * The default value is set to 3 to ping pong between these pools (if possible).
+     */
+    CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_PRE_ALLOCATE_VALUE  = 7,
+
+    /**
+     * Allocate page-locked (pinned) host memory for storing profiling data for concurrent
+     * kernels, memcopies and memsets for each buffer on a context. The value is a uint8_t.
+     *
+     * Starting with the CUDA 11.2 release, CUPTI allocates profiling buffer in the pinned host
+     * memory by default as this might help in improving the performance of the tracing run.
+     * Allocating excessive amounts of pinned memory may degrade system performance, since it
+     * reduces the amount of memory available to the system for paging. For this reason user
+     * might want to change the location from pinned host memory to device memory by setting
+     * value of this attribute to 0.
+     *
+     * Using page-locked (pinned) host memory buffers is not supported on confidential computing
+     * devices. On setting this attribute to 1, CUPTI will return CUPTI_ERROR_NOT_SUPPORTED.
+     *
+     * The default value is 1.
+     */
+    CUPTI_ACTIVITY_ATTR_MEM_ALLOCATION_TYPE_HOST_PINNED         = 8,
+
+    /**
+     * Request activity buffers per-thread to store CUPTI activity records
+     * in the activity buffer on a per-thread basis. The value is a uint8_t.
+     *
+     * The attribute should be set before registering the buffer callbacks using
+     * cuptiActivityRegisterCallbacks API and before any of the CUPTI activity kinds are enabled.
+     * This makes sure that all the records are stored in activity buffers allocated per-thread.
+     * Changing this attribute in the middle of the profiling session will result in undefined behavior.
+     *
+     * The default value is 0.
+     */
+    CUPTI_ACTIVITY_ATTR_PER_THREAD_ACTIVITY_BUFFER,
+
+
+
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_FORCE_INT                 = 0x7fffffff
+} CUpti_ActivityAttribute;
+
+/**
+ * \brief Thread-Id types.
+ *
+ * CUPTI uses different methods to obtain the thread-id depending on the
+ * support and the underlying platform. This enum documents these methods
+ * for each type. APIs \ref cuptiSetThreadIdType and \ref cuptiGetThreadIdType
+ * can be used to set and get the thread-id type.
+ */
+typedef enum {
+    /**
+     * Default type.
+     * Windows uses API GetCurrentThreadId()
+     * Linux/Mac/Android/QNX use POSIX pthread API pthread_self()
+     */
+    CUPTI_ACTIVITY_THREAD_ID_TYPE_DEFAULT       = 0,
+
+    /**
+     * This type is based on the system API available on the underlying platform
+     * and the thread-id obtained is supposed to be unique for the process lifetime.
+     * Windows uses API GetCurrentThreadId()
+     * Linux uses syscall SYS_gettid
+     * Mac uses syscall SYS_thread_selfid
+     * Android/QNX use gettid()
+     */
+    CUPTI_ACTIVITY_THREAD_ID_TYPE_SYSTEM        = 1,
+
+    /**
+     * Number of thread-id types defined above (currently 2).
+     * Add new enums before this field so that it stays last.
+     */
+    CUPTI_ACTIVITY_THREAD_ID_TYPE_SIZE          = 2,
+
+    /**
+     * NOTE(review): presumably a sentinel that forces the enum to be at
+     * least int-sized rather than a selectable thread-id type - confirm.
+     */
+    CUPTI_ACTIVITY_THREAD_ID_TYPE_FORCE_INT     = 0x7fffffff
+} CUpti_ActivityThreadIdType;
+
+/**
+ * \brief Get the CUPTI timestamp.
+ *
+ * Returns a timestamp normalized to correspond with the start and end
+ * timestamps reported in the CUPTI activity records. The timestamp is
+ * reported in nanoseconds.
+ *
+ * \param timestamp Returns the CUPTI timestamp
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p timestamp is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetTimestamp(uint64_t *timestamp);
+
+/**
+ * \brief Get the ID of a context.
+ *
+ * Get the ID of a context.
+ *
+ * \param context The context
+ * \param contextId Returns a process-unique ID for the context
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_CONTEXT The context is NULL or not valid.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p contextId is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetContextId(CUcontext context, uint32_t *contextId);
+
+/**
+ * \brief Get the ID of a stream.
+ *
+ * Get the ID of a stream. The stream ID is unique within a context
+ * (i.e. all streams within a context will have unique stream
+ * IDs).
+ *
+ * \param context If non-NULL then the stream is checked to ensure
+ * that it belongs to this context. Typically this parameter should be
+ * null.
+ * \param stream The stream
+ * \param streamId Returns a context-unique ID for the stream
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_STREAM if unable to get stream ID, or
+ * if \p context is non-NULL and \p stream does not belong to the
+ * context
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p streamId is NULL
+ *
+ * **DEPRECATED** This method is deprecated as of CUDA 8.0.
+ * Use method cuptiGetStreamIdEx instead.
+ */
+CUptiResult CUPTIAPI cuptiGetStreamId(CUcontext context, CUstream stream, uint32_t *streamId);
+
+/**
+* \brief Get the ID of a stream.
+*
+* Get the ID of a stream. The stream ID is unique within a context
+* (i.e. all streams within a context will have unique stream
+* IDs).
+*
+* \param context If non-NULL then the stream is checked to ensure
+* that it belongs to this context. Typically this parameter should be
+* null.
+* \param stream The stream
+* \param perThreadStream Flag to indicate if program is compiled for per-thread streams
+* \param streamId Returns a context-unique ID for the stream
+*
+* \retval CUPTI_SUCCESS
+* \retval CUPTI_ERROR_NOT_INITIALIZED
+* \retval CUPTI_ERROR_INVALID_STREAM if unable to get stream ID, or
+* if \p context is non-NULL and \p stream does not belong to the
+* context
+* \retval CUPTI_ERROR_INVALID_PARAMETER if \p streamId is NULL
+*/
+CUptiResult CUPTIAPI cuptiGetStreamIdEx(CUcontext context, CUstream stream, uint8_t perThreadStream, uint32_t *streamId);
+
+/**
+ * \brief Get the ID of a device
+ *
+ * If \p context is NULL, returns the ID of the device that contains
+ * the currently active context. If \p context is non-NULL, returns
+ * the ID of the device which contains that context. Operates in a
+ * similar manner to cudaGetDevice() or cuCtxGetDevice() but may be
+ * called from within callback functions.
+ *
+ * \param context The context, or NULL to indicate the current context.
+ * \param deviceId Returns the ID of the device that is current for
+ * the calling thread.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE if unable to get device ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p deviceId is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetDeviceId(CUcontext context, uint32_t *deviceId);
+
+/**
+ * \brief Get the unique ID of a graph node
+ *
+ * Returns the unique ID of the CUDA graph node.
+ *
+ * \param node The graph node.
+ * \param nodeId Returns the unique ID of the node
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p node is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetGraphNodeId(CUgraphNode node, uint64_t *nodeId);
+
+/**
+ * \brief Get the unique ID of graph
+ *
+ * Returns the unique ID of CUDA graph.
+ *
+ * \param graph The graph.
+ * \param pId Returns the unique ID of the graph
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p graph is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetGraphId(CUgraph graph, uint32_t *pId);
+
+/**
+ * \brief Get the unique ID of executable graph
+ *
+ * Returns the unique ID of executable CUDA graph.
+ *
+ * \param graphExec The executable graph.
+ * \param pId Returns the unique ID of the executable graph
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p graphExec is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetGraphExecId(CUgraphExec graphExec, uint32_t *pId);
+
+/**
+ * \brief Enable collection of a specific kind of activity record.
+ *
+ * Enable collection of a specific kind of activity record. Multiple
+ * kinds can be enabled by calling this function multiple times. By
+ * default all activity kinds are disabled for collection.
+ *
+ * \param kind The kind of activity record to collect
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if the activity kind cannot be enabled
+ * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
+ */
+CUptiResult CUPTIAPI cuptiActivityEnable(CUpti_ActivityKind kind);
+
+/**
+ * \brief Enable collection of a specific kind of activity record. For certain activity kinds
+ * it dumps existing records.
+ *
+ * In general, the behavior of this API is similar to the API \ref cuptiActivityEnable i.e. it
+ * enables the collection of a specific kind of activity record.
+ * Additionally, this API can help in dumping the records for activities which happened in
+ * the past before enabling the corresponding activity kind.
+ * The API allows getting records for the current resource allocations done in CUDA:
+ * For CUPTI_ACTIVITY_KIND_DEVICE, existing device records are dumped
+ * For CUPTI_ACTIVITY_KIND_CONTEXT, existing context records are dumped
+ * For CUPTI_ACTIVITY_KIND_STREAM, existing stream records are dumped
+ * For CUPTI_ACTIVITY_KIND_NVLINK, existing NVLINK records are dumped
+ * For CUPTI_ACTIVITY_KIND_PCIE, existing PCIE records are dumped
+ * For other activities, the behavior is similar to the API \ref cuptiActivityEnable
+ *
+ * Device records are emitted in CUPTI on CUDA driver initialization. Those records
+ * can only be retrieved by the user if CUPTI is attached before CUDA initialization.
+ * Context and stream records are emitted on context and stream creation.
+ * The use case of the API is to provide the records for CUDA resources
+ * (contexts/streams/devices) that are currently active if user late attaches CUPTI.
+ *
+ * Before calling this function, the user must register buffer callbacks
+ * to get the activity records by calling \ref cuptiActivityRegisterCallbacks.
+ * If the user does not register the buffers and calls API \ref cuptiActivityEnableAndDump,
+ * then CUPTI will enable the activity kind but not provide any records for that
+ * activity kind.
+ *
+ * \param kind The kind of activity record to collect
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_UNKNOWN if buffer is not initialized.
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if the activity kind cannot be enabled
+ * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableAndDump(CUpti_ActivityKind kind);
+
+/**
+ * \brief Disable collection of a specific kind of activity record.
+ *
+ * Disable collection of a specific kind of activity record. Multiple
+ * kinds can be disabled by calling this function multiple times. By
+ * default all activity kinds are disabled for collection.
+ *
+ * \param kind The kind of activity record to stop collecting
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
+ */
+CUptiResult CUPTIAPI cuptiActivityDisable(CUpti_ActivityKind kind);
+
+/**
+ * \brief Enable collection of a specific kind of activity record for
+ * a context.
+ *
+ * Enable collection of a specific kind of activity record for a
+ * context.  This setting done by this API will supersede the global
+ * settings for activity records enabled by \ref cuptiActivityEnable.
+ * Multiple kinds can be enabled by calling this function multiple
+ * times.
+ *
+ * \param context The context for which activity is to be enabled
+ * \param kind The kind of activity record to collect
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if the activity kind cannot be enabled
+ * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableContext(CUcontext context, CUpti_ActivityKind kind);
+
+/**
+ * \brief Disable collection of a specific kind of activity record for
+ * a context.
+ *
+ * Disable collection of a specific kind of activity record for a context.
+ * This setting done by this API will supersede the global settings
+ * for activity records.
+ * Multiple kinds can be disabled by calling this function multiple times.
+ *
+ * \param context The context for which activity is to be disabled
+ * \param kind The kind of activity record to stop collecting
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
+ */
+CUptiResult CUPTIAPI cuptiActivityDisableContext(CUcontext context, CUpti_ActivityKind kind);
+
+/**
+ * \brief Get the number of activity records that were dropped because of
+ * insufficient buffer space.
+ *
+ * Get the number of records that were dropped because of insufficient
+ * buffer space.  The dropped count includes records that could not be
+ * recorded because CUPTI did not have activity buffer space available
+ * for the record (because the CUpti_BuffersCallbackRequestFunc
+ * callback did not return an empty buffer of sufficient size) and
+ * also CDP records that could not be recorded because the device-size
+ * buffer was full (size is controlled by the
+ * CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE_CDP attribute). The dropped
+ * count maintained for the queue is reset to zero when this function
+ * is called.
+ *
+ * \param context The context, or NULL to get dropped count from global queue
+ * \param streamId The stream ID
+ * \param dropped The number of records that were dropped since the last call
+ * to this function.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p dropped is NULL
+ */
+CUptiResult CUPTIAPI cuptiActivityGetNumDroppedRecords(CUcontext context, uint32_t streamId,
+                                                       size_t *dropped);
+
+/**
+ * \brief Iterate over the activity records in a buffer.
+ *
+ * This is a helper function to iterate over the activity records in a
+ * buffer. A buffer of activity records is typically obtained by
+ * receiving a CUpti_BuffersCallbackCompleteFunc callback. Stop iterating
+ * the buffer when an error occurs.
+ *
+ * An example of typical usage:
+ * \code
+ * CUpti_Activity *record = NULL;
+ * CUptiResult status = CUPTI_SUCCESS;
+ *   do {
+ *      status = cuptiActivityGetNextRecord(buffer, validSize, &record);
+ *      if(status == CUPTI_SUCCESS) {
+ *           // Use record here...
+ *      }
+ *      else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED)
+ *          break;
+ *      else if (status == CUPTI_ERROR_INVALID_KIND)
+ *          break;
+ *      else {
+ *          goto Error;
+ *      }
+ *    } while (1);
+ * \endcode
+ *
+ * \param buffer The buffer containing activity records
+ * \param record Inputs the previous record returned by
+ * cuptiActivityGetNextRecord and returns the next activity record
+ * from the buffer. If input value is NULL, returns the first activity
+ * record in the buffer. Records of certain kinds like CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL
+ * may contain invalid (0) timestamps, indicating that no timing information could
+ * be collected for lack of device memory.
+ * \param validBufferSizeBytes The number of valid bytes in the buffer.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_MAX_LIMIT_REACHED if no more records in the buffer
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p buffer is NULL.
+ * \retval CUPTI_ERROR_INVALID_KIND if activity record is either incomplete or invalid
+ */
+CUptiResult CUPTIAPI cuptiActivityGetNextRecord(uint8_t* buffer, size_t validBufferSizeBytes,
+                                                CUpti_Activity **record);
+
+/**
+ * \brief Function type for callback used by CUPTI to request an empty
+ * buffer for storing activity records.
+ *
+ * This callback function signals the CUPTI client that an activity
+ * buffer is needed by CUPTI. The activity buffer is used by CUPTI to
+ * store activity records. The callback function can decline the
+ * request by setting \p *buffer to NULL. In this case CUPTI may drop
+ * activity records.
+ *
+ * \param buffer Returns the new buffer. If set to NULL then no buffer
+ * is returned.
+ * \param size Returns the size, in bytes, of the returned buffer.
+ * \param maxNumRecords Returns the maximum number of records that
+ * should be placed in the buffer. If 0 then the buffer is filled with
+ * as many records as possible. If > 0 the buffer is filled with at
+ * most that many records before it is returned.
+ */
+typedef void (CUPTIAPI *CUpti_BuffersCallbackRequestFunc)(
+    uint8_t **buffer,
+    size_t *size,
+    size_t *maxNumRecords);
+
+/**
+ * \brief Function type for callback used by CUPTI to return a buffer
+ * of activity records.
+ *
+ * This callback function returns to the CUPTI client a buffer
+ * containing activity records.  The buffer contains \p validSize
+ * bytes of activity records which should be read using
+ * cuptiActivityGetNextRecord. The number of dropped records can be
+ * read using cuptiActivityGetNumDroppedRecords. After this call CUPTI
+ * relinquishes ownership of the buffer and will not use it
+ * anymore. The client may return the buffer to CUPTI using the
+ * CUpti_BuffersCallbackRequestFunc callback.
+ * Note: CUDA 6.0 onwards, all buffers returned by this callback are
+ * global buffers i.e. there is no context/stream specific buffer.
+ * User needs to parse the global buffer to extract the context/stream
+ * specific activity records.
+ *
+ * \param context The context this buffer is associated with. If NULL, the
+ * buffer is associated with the global activities. This field is deprecated
+ * as of CUDA 6.0 and will always be NULL.
+ * \param streamId The stream id this buffer is associated with.
+ * This field is deprecated as of CUDA 6.0 and will always be 0.
+ * \param buffer The activity record buffer.
+ * \param size The total size of the buffer in bytes as set in
+ * CUpti_BuffersCallbackRequestFunc.
+ * \param validSize The number of valid bytes in the buffer.
+ */
+typedef void (CUPTIAPI *CUpti_BuffersCallbackCompleteFunc)(
+    CUcontext context,
+    uint32_t streamId,
+    uint8_t *buffer,
+    size_t size,
+    size_t validSize);
+
+/**
+ * \brief Registers callback functions with CUPTI for activity buffer
+ * handling.
+ *
+ * This function registers two callback functions to be used in asynchronous
+ * buffer handling. If registered, activity record buffers are handled using
+ * asynchronous requested/completed callbacks from CUPTI.
+ *
+ * Registering these callbacks prevents the client from using CUPTI's
+ * blocking enqueue/dequeue functions.
+ *
+ * \param funcBufferRequested callback which is invoked when an empty
+ * buffer is requested by CUPTI
+ * \param funcBufferCompleted callback which is invoked when a buffer
+ * containing activity records is available from CUPTI
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if either \p
+ * funcBufferRequested or \p funcBufferCompleted is NULL
+ */
+CUptiResult CUPTIAPI cuptiActivityRegisterCallbacks(CUpti_BuffersCallbackRequestFunc funcBufferRequested,
+        CUpti_BuffersCallbackCompleteFunc funcBufferCompleted);
+
+/**
+ * \brief Wait for all activity records to be delivered via the
+ * completion callback.
+ *
+ * This function does not return until all activity records associated
+ * with the specified context/stream are returned to the CUPTI client
+ * using the callback registered in cuptiActivityRegisterCallbacks. To
+ * ensure that all activity records are complete, the requested
+ * stream(s), if any, are synchronized.
+ *
+ * If \p context is NULL, the global activity records (i.e. those not
+ * associated with a particular stream) are flushed (in this case no
+ * streams are synchronized).  If \p context is a valid CUcontext and
+ * \p streamId is 0, the buffers of all streams of this context are
+ * flushed.  Otherwise, the buffers of the specified stream in this
+ * context are flushed.
+ *
+ * Before calling this function, the buffer handling callback api
+ * must be activated by calling cuptiActivityRegisterCallbacks.
+ *
+ * \param context A valid CUcontext or NULL.
+ * \param streamId The stream ID.
+ * \param flag The flag can be set to indicate a forced flush. See CUpti_ActivityFlag
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_OPERATION if not preceded
+ * by a successful call to cuptiActivityRegisterCallbacks
+ * \retval CUPTI_ERROR_UNKNOWN an internal error occurred
+ *
+ * **DEPRECATED** This method is deprecated
+ * CONTEXT and STREAMID will be ignored. Use cuptiActivityFlushAll
+ * to flush all data.
+ */
+CUptiResult CUPTIAPI cuptiActivityFlush(CUcontext context, uint32_t streamId, uint32_t flag);
+
+/**
+ * \brief Request to deliver activity records via the buffer completion callback.
+ *
+ * This function returns the activity records associated with all contexts/streams
+ * (and the global buffers not associated with any stream) to the CUPTI client
+ * using the callback registered in cuptiActivityRegisterCallbacks.
+ *
+ * This is a blocking call but it doesn't issue any CUDA synchronization calls
+ * implicitly thus it's not guaranteed that all activities are completed on the
+ * underlying devices. Activity record is considered as completed if it has all
+ * the information filled up including the timestamps if any. It is the client's
+ * responsibility to issue necessary CUDA synchronization calls before calling
+ * this function if all activity records with complete information are expected
+ * to be delivered.
+ *
+ * Behavior of the function based on the input flag:
+ * (-) ::For default flush i.e. when flag is set as 0, it returns all the
+ * activity buffers which have all the activity records completed, buffers need not
+ * to be full though. It doesn't return buffers which have one or more incomplete
+ * records. Default flush can be done at a regular interval in a separate thread.
+ * (-) ::For forced flush i.e. when flag CUPTI_ACTIVITY_FLAG_FLUSH_FORCED is passed
+ * to the function, it returns all the activity buffers including the ones which have
+ * one or more incomplete activity records. It's suggested for clients to do the
+ * force flush before the termination of the profiling session to allow remaining
+ * buffers to be delivered. In general, it can be done in the at-exit handler.
+ *
+ * Before calling this function, the buffer handling callback api must be activated
+ * by calling cuptiActivityRegisterCallbacks.
+ *
+ * \param flag The flag can be set to indicate a forced flush. See CUpti_ActivityFlag
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_OPERATION if not preceded by a
+ * successful call to cuptiActivityRegisterCallbacks
+ * \retval CUPTI_ERROR_UNKNOWN an internal error occurred
+ *
+ * \see cuptiActivityFlushPeriod
+ */
+CUptiResult CUPTIAPI cuptiActivityFlushAll(uint32_t flag);
+
+/**
+ * \brief Read an activity API attribute.
+ *
+ * Read an activity API attribute and return it in \p *value.
+ *
+ * \param attr The attribute to read
+ * \param valueSize Size of buffer pointed by the value, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the value of the attribute
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value is NULL, or
+ * if \p attr is not an activity attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT Indicates that
+ * the \p value buffer is too small to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiActivityGetAttribute(CUpti_ActivityAttribute attr,
+        size_t *valueSize, void* value);
+
+/**
+ * \brief Write an activity API attribute.
+ *
+ * Write an activity API attribute.
+ *
+ * \param attr The attribute to write
+ * \param valueSize The size, in bytes, of the value
+ * \param value The attribute value to write
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value is NULL, or
+ * if \p attr is not an activity attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT Indicates that
+ * the \p value buffer is too small to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiActivitySetAttribute(CUpti_ActivityAttribute attr,
+        size_t *valueSize, void* value);
+
+
+/**
+ * \brief Set Unified Memory Counter configuration.
+ *
+ * Set the configuration before enabling the corresponding activity kind
+ * CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER.
+ * The API should be called after CUDA driver initialization.
+ *
+ * \param config A pointer to \ref CUpti_ActivityUnifiedMemoryCounterConfig structures
+ * containing Unified Memory counter configuration.
+ * \param count Number of Unified Memory counter configuration structures
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p config is NULL or
+ * any parameter in the \p config structures is not a valid value
+ * \retval CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED One potential reason is that
+ * platform (OS/arch) does not support the unified memory counters
+ * \retval CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_DEVICE Indicates that the device
+ * does not support the unified memory counters
+ * \retval CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_NON_P2P_DEVICES Indicates that
+ * multi-GPU configuration without P2P support between any pair of devices
+ * does not support the unified memory counters
+ */
+CUptiResult CUPTIAPI cuptiActivityConfigureUnifiedMemoryCounter(CUpti_ActivityUnifiedMemoryCounterConfig *config, uint32_t count);
+
+/**
+ * \brief Get auto boost state
+ *
+ * The profiling results can be inconsistent in case auto boost is enabled.
+ * CUPTI tries to disable auto boost while profiling. It can fail to disable in
+ * cases where user does not have the permissions or CUDA_AUTO_BOOST env
+ * variable is set. The function can be used to query whether auto boost is
+ * enabled.
+ *
+ * \param context A valid CUcontext.
+ * \param state A pointer to \ref CUpti_ActivityAutoBoostState structure which
+ * contains the current state and the id of the process that has requested the
+ * current state
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p context or \p state is NULL
+ * \retval CUPTI_ERROR_NOT_SUPPORTED Indicates that the device does not support auto boost
+ * \retval CUPTI_ERROR_UNKNOWN an internal error occurred
+ */
+CUptiResult CUPTIAPI cuptiGetAutoBoostState(CUcontext context, CUpti_ActivityAutoBoostState *state);
+
+/**
+ * \brief Set PC sampling configuration.
+ *
+ * For Pascal and older GPU architectures this API must be called before enabling
+ * activity kind CUPTI_ACTIVITY_KIND_PC_SAMPLING. There is no such requirement
+ * for Volta and newer GPU architectures.
+ *
+ * For Volta and newer GPU architectures if this API is called in the middle of
+ * execution, PC sampling configuration will be updated for subsequent kernel launches.
+ *
+ * \param ctx The context
+ * \param config A pointer to \ref CUpti_ActivityPCSamplingConfig structure
+ * containing PC sampling configuration.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this api is called while
+ * some valid event collection method is set.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p config is NULL or
+ * any parameter in the \p config structures is not a valid value
+ * \retval CUPTI_ERROR_NOT_SUPPORTED Indicates that the system/device
+ * does not support PC sampling
+ */
+CUptiResult CUPTIAPI cuptiActivityConfigurePCSampling(CUcontext ctx, CUpti_ActivityPCSamplingConfig *config);
+
+/**
+ * \brief Returns the last error from a cupti call or callback
+ *
+ * Returns the last error that has been produced by any of the cupti api calls
+ * or the callback in the same host thread and resets it to CUPTI_SUCCESS.
+ */
+CUptiResult CUPTIAPI cuptiGetLastError(void);
+
+/**
+ * \brief Set the thread-id type
+ *
+ * CUPTI uses the method corresponding to the type that is set to generate
+ * the thread-id.
+ * See enum \ref CUpti_ActivityThreadIdType for the list of methods.
+ * Activity records having thread-id field contain the same value.
+ * Thread id type must not be changed during the profiling session to
+ * avoid thread-id value mismatch across activity records.
+ *
+ * \param type The thread-id type to use
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_SUPPORTED if \p type is not supported on the platform
+ */
+CUptiResult CUPTIAPI cuptiSetThreadIdType(CUpti_ActivityThreadIdType type);
+
+/**
+ * \brief Get the thread-id type
+ *
+ * Returns the thread-id type used in CUPTI.
+ * See enum \ref CUpti_ActivityThreadIdType for the list of types.
+ *
+ * \param type Returns the thread-id type used in CUPTI
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p type is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetThreadIdType(CUpti_ActivityThreadIdType *type);
+
+/**
+* \brief Check support for a compute capability
+*
+* This function is used to check the support for a device based on
+* its compute capability. It sets the \p support when the compute
+* capability is supported by the current version of CUPTI, and clears
+* it otherwise. This version of CUPTI might not support all GPUs sharing
+* the same compute capability. It is suggested to use API \ref
+* cuptiDeviceSupported which provides correct information.
+*
+* \param major The major revision number of the compute capability
+* \param minor The minor revision number of the compute capability
+* \param support Pointer to an integer to return the support status
+*
+* \retval CUPTI_SUCCESS
+* \retval CUPTI_ERROR_INVALID_PARAMETER if \p support is NULL
+*
+* \sa ::cuptiDeviceSupported
+*/
+CUptiResult CUPTIAPI cuptiComputeCapabilitySupported(int major, int minor, int *support);
+
+/**
+* \brief Check support for a compute device
+*
+* This function is used to check the support for a compute device.
+* It sets the \p support when the device is supported by the current
+* version of CUPTI, and clears it otherwise.
+*
+* \param dev The device handle returned by CUDA Driver API cuDeviceGet
+* \param support Pointer to an integer to return the support status
+*
+* \retval CUPTI_SUCCESS
+* \retval CUPTI_ERROR_INVALID_PARAMETER if \p support is NULL
+* \retval CUPTI_ERROR_INVALID_DEVICE if \p dev is not a valid device
+*
+* \sa ::cuptiComputeCapabilitySupported
+*/
+CUptiResult CUPTIAPI cuptiDeviceSupported(CUdevice dev, int *support);
+
+/**
+ * \brief Device virtualization modes.
+ *
+ * This indicates the virtualization mode in which the CUDA device is running.
+ */
+typedef enum {
+  /**
+   * No virtualization mode is associated with the device
+   * i.e. it's a baremetal GPU
+   */
+  CUPTI_DEVICE_VIRTUALIZATION_MODE_NONE = 0,
+  /**
+   * The device is associated with the pass-through GPU.
+   * In this mode, an entire physical GPU is directly assigned
+   * to one virtual machine (VM).
+   */
+  CUPTI_DEVICE_VIRTUALIZATION_MODE_PASS_THROUGH = 1,
+  /**
+   * The device is associated with the virtual GPU (vGPU).
+   * In this mode multiple virtual machines (VMs) have simultaneous,
+   * direct access to a single physical GPU.
+   */
+  CUPTI_DEVICE_VIRTUALIZATION_MODE_VIRTUAL_GPU = 2,
+
+  /**
+   * NOTE(review): presumably a sentinel that forces the enum to be at
+   * least int-sized rather than a reportable virtualization mode - confirm.
+   */
+  CUPTI_DEVICE_VIRTUALIZATION_MODE_FORCE_INT = 0x7fffffff
+} CUpti_DeviceVirtualizationMode;
+
+/**
+ * \brief Query the virtualization mode of the device
+ *
+ * This function is used to query the virtualization mode of the CUDA device.
+ *
+ * \param dev The device handle returned by CUDA Driver API cuDeviceGet
+ * \param mode Pointer to a CUpti_DeviceVirtualizationMode to return the virtualization mode
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_DEVICE if \p dev is not a valid device
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p mode is NULL
+ *
+ */
+CUptiResult CUPTIAPI cuptiDeviceVirtualizationMode(CUdevice dev, CUpti_DeviceVirtualizationMode *mode);
+
+/**
+ * \brief Detach CUPTI from the running process
+ *
+ * This API detaches CUPTI from the running process. It destroys and cleans up all the
+ * resources associated with CUPTI in the current process. After CUPTI detaches from the process,
+ * the process will keep on running with no CUPTI attached to it.
+ * For safe operation of the API, it is recommended that this API is invoked from the exit callsite
+ * of any CUDA Driver or Runtime API. Otherwise, the CUPTI client needs to make sure that the
+ * required CUDA synchronization and CUPTI activity buffer flush are done before calling the API.
+ * Sample code showing the usage of the API in the CUPTI callback handler code:
+ * \code
+  void CUPTIAPI
+  cuptiCallbackHandler(void *userdata, CUpti_CallbackDomain domain,
+      CUpti_CallbackId cbid, void *cbdata)
+  {
+    const CUpti_CallbackData *cbInfo = (CUpti_CallbackData *)cbdata;
+
+    // Take this code path when CUPTI detach is requested
+    if (detachCupti) {
+      switch(domain)
+      {
+        case CUPTI_CB_DOMAIN_RUNTIME_API:
+        case CUPTI_CB_DOMAIN_DRIVER_API:
+          if (cbInfo->callbackSite == CUPTI_API_EXIT) {
+              // call the CUPTI detach API
+              cuptiFinalize();
+          }
+          break;
+        default:
+          break;
+      }
+    }
+  }
+ \endcode
+ */
+CUptiResult CUPTIAPI cuptiFinalize(void);
+
+/**
+ * \brief Push an external correlation id for the calling thread
+ *
+ * This function notifies CUPTI that the calling thread is entering an external API region.
+ * When a CUPTI activity API record is created while within an external API region and
+ * CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION is enabled, the activity API record will
+ * be preceded by a CUpti_ActivityExternalCorrelation record for each \ref CUpti_ExternalCorrelationKind.
+ *
+ * \param kind The kind of external API that activities should be correlated with.
+ * \param id External correlation id.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER The external API kind is invalid
+ */
+CUptiResult CUPTIAPI cuptiActivityPushExternalCorrelationId(CUpti_ExternalCorrelationKind kind, uint64_t id);
+
+/**
+ * \brief Pop an external correlation id for the calling thread
+ *
+ * This function notifies CUPTI that the calling thread is leaving an external API region.
+ *
+ * \param kind The kind of external API that activities should be correlated with.
+ * \param lastId If the function returns successfully, contains the last external correlation id for this \p kind. Can be NULL.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER The external API kind is invalid.
+ * \retval CUPTI_ERROR_QUEUE_EMPTY No external id is currently associated with \p kind.
+ */
+CUptiResult CUPTIAPI cuptiActivityPopExternalCorrelationId(CUpti_ExternalCorrelationKind kind, uint64_t *lastId);
+
+/**
+ * \brief Controls the collection of queued and submitted timestamps for kernels.
+ *
+ * This API is used to control the collection of queued and submitted timestamps
+ * for kernels whose records are provided through the struct \ref CUpti_ActivityKernel9.
+ * The default value is 0, i.e. these timestamps are not collected. This API needs
+ * to be called before initialization of CUDA, and this setting should not be
+ * changed during the profiling session.
+ *
+ * This API is not supported if the HW trace is enabled through the API \ref cuptiActivityEnableHWTrace.
+ * \param enable is a boolean, denoting whether these timestamps should be
+ * collected
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableLatencyTimestamps(uint8_t enable);
+
+/**
+ * \brief Sets the flush period for the worker thread
+ *
+ * CUPTI creates a worker thread to minimize the perturbation of the threads created by
+ * the application. CUPTI offloads certain operations from the application threads to the worker
+ * thread; this includes synchronization of profiling resources between host and device, and
+ * delivery of the activity buffers to the client using the callback registered in
+ * cuptiActivityRegisterCallbacks. For performance reasons, CUPTI wakes up the worker
+ * thread based on certain heuristics.
+ *
+ * This API is used to control the flush period of the worker thread. This setting will
+ * override the CUPTI heuristics. Setting \p time to zero disables the periodic flush and
+ * restores the default behavior.
+ *
+ * Periodic flush can return only those activity buffers which are full and have all the
+ * activity records completed.
+ *
+ * The API \ref cuptiActivityFlushAll may still be used to flush the data on demand, even
+ * when the client has set a periodic flush.
+ *
+ * \param time flush period in milliseconds (ms)
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ *
+ * \see cuptiActivityFlushAll
+ */
+CUptiResult CUPTIAPI cuptiActivityFlushPeriod(uint32_t time);
+
+/**
+ * \brief Controls the collection of launch attributes for kernels.
+ *
+ * This API is used to control the collection of launch attributes for kernels whose
+ * records are provided through the struct \ref CUpti_ActivityKernel9.
+ * The default value is 0, i.e. these attributes are not collected.
+ *
+ * \param enable is a boolean, denoting whether these launch attributes should be collected
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableLaunchAttributes(uint8_t enable);
+
+/**
+ * \brief Function type for callback used by CUPTI to request a timestamp
+ * to be used in activity records.
+ *
+ * This callback function signals the CUPTI client that a timestamp needs
+ * to be returned. This timestamp is treated as a normalized timestamp
+ * to be used for various purposes in CUPTI, for example to store the start and
+ * end timestamps reported in the CUPTI activity records.
+ * The returned timestamp must be in nanoseconds.
+ *
+ * \sa ::cuptiActivityRegisterTimestampCallback
+ */
+typedef uint64_t (CUPTIAPI *CUpti_TimestampCallbackFunc)(void);
+
+/**
+ * \brief Registers callback function with CUPTI for providing timestamp.
+ *
+ * This function registers a callback function to obtain a timestamp of the user's
+ * choice instead of using the CUPTI-provided timestamp.
+ * By default, CUPTI uses different methods, based on the underlying platform,
+ * to retrieve the timestamp:
+ *  - Linux and Android use clock_gettime(CLOCK_REALTIME, ..)
+ *  - Windows uses QueryPerformanceCounter()
+ *  - QNX uses ClockCycles()
+ * Timestamps retrieved using these methods are converted to nanoseconds if needed
+ * before usage.
+ *
+ * Timestamps for GPU activities such as kernels, memory copies and memset operations are
+ * recorded directly on the GPU. To provide a unified and normalized view of these timestamps
+ * in relation to CPU time, CUPTI performs a linear interpolation to convert GPU timestamps
+ * into CPU timestamps during post-processing.
+ * For activities where timestamps are captured on the GPU, the timestamp callback is invoked
+ * during the post-processing phase, while converting GPU timestamps into CPU timestamps.
+ * For activities for which timestamps are captured directly on the CPU, the timestamp callback
+ * is invoked immediately at the time of the activity.
+ *
+ * The registration of the timestamp callback should be done before any of the CUPTI
+ * activity kinds are enabled to make sure that all the records report the timestamp using
+ * the callback function registered through the cuptiActivityRegisterTimestampCallback API.
+ *
+ * Changing the timestamp callback function in CUPTI through the
+ * cuptiActivityRegisterTimestampCallback API in the middle of the profiling
+ * session can cause records generated prior to the change to report
+ * timestamps through the previous timestamp method.
+ *
+ * \param funcTimestamp callback which is invoked when a timestamp is
+ * needed by CUPTI
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p funcTimestamp is NULL
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ */
+CUptiResult CUPTIAPI cuptiActivityRegisterTimestampCallback(CUpti_TimestampCallbackFunc funcTimestamp);
+
+/**
+ * \brief Controls the collection of records for device-launched graphs.
+ *
+ * This API is used to control the collection of records for device-launched graphs.
+ * The default value is 0, i.e. these records are not collected. This API needs
+ * to be called before initialization of CUDA, and this setting should not be
+ * changed during the profiling session.
+ *
+ * \param enable is a boolean, denoting whether these records should be
+ * collected
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableDeviceGraph(uint8_t enable);
+
+/**
+ * \brief Controls the collection of activity records for specific CUDA Driver APIs.
+ *
+ * Activity kind CUPTI_ACTIVITY_KIND_DRIVER controls the collection of either all
+ * CUDA Driver APIs or none. API cuptiActivityEnableDriverApi can be used for fine-grained
+ * control; it allows enabling/disabling tracing of a specific set of CUDA Driver APIs.
+ * To disable collection of a small set of CUDA Driver APIs, the user can
+ * first enable the collection of all Driver APIs using the activity kind
+ * CUPTI_ACTIVITY_KIND_DRIVER and then call this API to disable specific Driver APIs.
+ * To enable the collection of a small set of CUDA Driver APIs, the user can
+ * call this API without using the activity kind CUPTI_ACTIVITY_KIND_DRIVER.
+ *
+ * Note: Activity kind CUPTI_ACTIVITY_KIND_DRIVER overrides the settings done by this API
+ * if the activity kind is enabled after this API is called.
+ *
+ * \param cbid callback id of the CUDA Driver API. This can be found in the header cupti_driver_cbid.h.
+ * \param enable is a boolean, denoting whether to enable or disable the collection
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableDriverApi(CUpti_CallbackId cbid, uint8_t enable);
+
+/**
+ * \brief Controls the collection of activity records for specific CUDA Runtime APIs.
+ *
+ * Activity kind CUPTI_ACTIVITY_KIND_RUNTIME controls the collection of either all
+ * CUDA Runtime APIs or none. API cuptiActivityEnableRuntimeApi can be used for fine-grained
+ * control; it allows enabling/disabling tracing of a specific set of CUDA Runtime APIs.
+ * To disable collection of a small set of CUDA Runtime APIs, the user can
+ * first enable the collection of all Runtime APIs using the activity kind
+ * CUPTI_ACTIVITY_KIND_RUNTIME and then call this API to disable specific Runtime APIs.
+ * To enable the collection of a small set of CUDA Runtime APIs, the user can
+ * call this API without using the activity kind CUPTI_ACTIVITY_KIND_RUNTIME.
+ *
+ * Note: Activity kind CUPTI_ACTIVITY_KIND_RUNTIME overrides the settings done by this API
+ * if the activity kind is enabled after this API is called.
+ *
+ * \param cbid callback id of the CUDA Runtime API. This can be found in the header cupti_runtime_cbid.h.
+ * \param enable is a boolean, denoting whether to enable or disable the collection
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableRuntimeApi(CUpti_CallbackId cbid, uint8_t enable);
+
+/**
+ * \brief Enables the collection of CUDA kernel timestamps through HW events.
+ *
+ * This API enables the collection of CUDA kernel timestamps through HW events instead
+ * of the traditional SW instrumentation and semaphore-based approach.
+ * This option is only available on the Blackwell architecture.
+ * This API should be called after the CUDA driver is initialized.
+ *
+ * \param enable is a boolean, denoting whether to enable or disable the collection through HW events
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED if CUPTI is not initialized or the CUDA driver is not initialized
+ * \retval CUPTI_ERROR_NOT_SUPPORTED if HW trace cannot be enabled on the current platform
+ * \retval CUPTI_ERROR_VIRTUALIZED_DEVICE_NOT_SUPPORTED
+ * \retval CUPTI_ERROR_CONFIDENTIAL_COMPUTING_NOT_SUPPORTED
+ * \retval CUPTI_ERROR_CMP_DEVICE_NOT_SUPPORTED
+ * \retval CUPTI_ERROR_MIG_DEVICE_NOT_SUPPORTED
+ * \retval CUPTI_ERROR_SLI_DEVICE_NOT_SUPPORTED
+ * \retval CUPTI_ERROR_WSL_DEVICE_NOT_SUPPORTED
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableHWTrace(uint8_t enable);
+
+
+/**
+ * \brief Enables tracking the source library for memory allocation requests.
+ *
+ * This API is used to control whether or not the source library of memory
+ * allocation requests is tracked. The default value is 0, i.e. it is not tracked. The
+ * activity kind CUPTI_ACTIVITY_KIND_MEMORY2 needs to be enabled; if this flag is then
+ * set, the full path of the shared object responsible for the GPU memory allocation
+ * request is reported in the member `source` of the CUpti_ActivityMemory4 records. Note that
+ * this feature adds runtime overhead.
+ *
+ * \param enable is a boolean, denoting whether the source library of the memory allocation
+ * request needs to be tracked
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableAllocationSource (uint8_t enable);
+
+/**
+ * \brief Enables collecting records for all synchronization operations.
+ *
+ * CUPTI provides CUDA event query and stream query records via CUPTI_ACTIVITY_KIND_SYNCHRONIZATION.
+ * Using this API, the CUPTI client can enable recording of all CUDA event query and stream query records,
+ * even if the event has not yet been completed or the operations on the stream have not yet all been
+ * completed, respectively.
+ *
+ * By default, the record is only generated if all captured work has been completed for the CUDA event.
+ * By default, the record is only generated if all operations have been completed on the stream.
+ *
+ * \param enable is a boolean, denoting whether to enable or disable the collection of all CUDA event query
+ * and stream query records
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableAllSyncRecords(uint8_t enable);
+
+/** @} */ /* END CUPTI_ACTIVITY_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+// Including deprecated structures of CUPTI_ACTIVITY_API
+#include "cupti_activity_deprecated.h"
+
+#endif /*_CUPTI_ACTIVITY_H_*/
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_activity_deprecated.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_activity_deprecated.h
new file mode 100644
index 0000000000000000000000000000000000000000..f9d725499ffa13ac7de864719abee2baa88d6c13
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_activity_deprecated.h
@@ -0,0 +1,5335 @@
+/*
+ * Copyright 2011-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_ACTIVITY_DEPRECATED_H_)
+#define _CUPTI_ACTIVITY_DEPRECATED_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \brief The kinds of activity records.
+ *
+ * Each activity record kind represents information about a GPU or an
+ * activity occurring on a CPU or GPU. Each kind is associated with an
+ * activity record structure that holds the information associated
+ * with the kind.
+ * \see CUpti_ActivityOverhead
+ * \see CUpti_ActivityOverhead2
+ * \see CUpti_ActivityDevice
+ * \see CUpti_ActivityDevice2
+ * \see CUpti_ActivityDevice3
+ * \see CUpti_ActivityDevice4
+ * \see CUpti_ActivityKernel
+ * \see CUpti_ActivityKernel2
+ * \see CUpti_ActivityKernel3
+ * \see CUpti_ActivityKernel4
+ * \see CUpti_ActivityKernel5
+ * \see CUpti_ActivityKernel6
+ * \see CUpti_ActivityKernel7
+ * \see CUpti_ActivityKernel8
+ * \see CUpti_ActivityMemcpy
+ * \see CUpti_ActivityMemcpy3
+ * \see CUpti_ActivityMemcpy4
+ * \see CUpti_ActivityMemcpyPtoP
+ * \see CUpti_ActivityMemcpyPtoP2
+ * \see CUpti_ActivityMemcpyPtoP3
+ * \see CUpti_ActivityMemset
+ * \see CUpti_ActivityMemset2
+ * \see CUpti_ActivityMemset3
+ * \see CUpti_ActivityMemory2
+ * \see CUpti_ActivityMemory3
+ * \see CUpti_ActivityMemoryPool
+ * \see CUpti_ActivityMarker
+ * \see CUpti_ActivityGlobalAccess
+ * \see CUpti_ActivityGlobalAccess2
+ * \see CUpti_ActivityBranch
+ * \see CUpti_ActivityPCSampling
+ * \see CUpti_ActivityPCSampling2
+ * \see CUpti_ActivityUnifiedMemoryCounter
+ * \see CUpti_ActivityUnifiedMemoryCounter2
+ * \see CUpti_ActivityNvLink
+ * \see CUpti_ActivityNvLink2
+ * \see CUpti_ActivityNvLink3
+ */
+
+/**
+ * \brief The activity record for CUPTI and driver overheads.
+ * (Deprecated in CUDA 12.2)
+ *
+ * This activity record provides CUPTI and driver overhead information
+ * (CUPTI_ACTIVITY_KIND_OVERHEAD). These records are now reported using
+ * CUpti_ActivityOverhead3.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OVERHEAD.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of overhead: CUPTI, DRIVER, COMPILER, etc.
+   */
+  CUpti_ActivityOverheadKind overheadKind;
+
+  /**
+   * The kind of activity object that the overhead is associated with.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object. 'objectKind' indicates
+   * which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+  /**
+   * The start timestamp for the overhead, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the overhead.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the overhead, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the overhead.
+   */
+  uint64_t end;
+} CUpti_ActivityOverhead;
+
+/**
+ * \brief The activity record for CUPTI and driver overheads. (deprecated)
+ *
+ * This activity record provides CUPTI and driver overhead information
+ * (CUPTI_ACTIVITY_KIND_OVERHEAD).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OVERHEAD.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of overhead: CUPTI, DRIVER, COMPILER, etc.
+   */
+  CUpti_ActivityOverheadKind overheadKind;
+
+  /**
+   * The kind of activity object that the overhead is associated with.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object. 'objectKind' indicates
+   * which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+  /**
+   * The start timestamp for the overhead, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the overhead.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the overhead, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the overhead.
+   */
+  uint64_t end;
+
+  /**
+   * The correlation ID of the overhead operation to which
+   * this record belongs. This ID is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the overhead operation.
+   * In some cases, it can be zero, such as for CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH records.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Reserved for internal use.
+   */
+  uint32_t reserved0;
+} CUpti_ActivityOverhead2;
+
+/**
+ * \brief The activity record for a device. (deprecated)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice5 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * The number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * The number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+} CUpti_ActivityDevice;
+
+/**
+ * \brief The activity record for a device. (deprecated)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice5 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * The number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * The number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum amount of shared memory available per multiprocessor, in bytes.
+   */
+  uint32_t maxSharedMemoryPerMultiprocessor;
+
+  /**
+   * Maximum number of 32-bit registers available per multiprocessor.
+   */
+  uint32_t maxRegistersPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+  /**
+   * ECC enabled flag for the device.
+   */
+  uint32_t eccEnabled;
+
+  /**
+   * The device UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid uuid;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+} CUpti_ActivityDevice2;
+
+/**
+ * \brief The activity record for a device. (CUDA 7.0 onwards)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice5 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * Number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * Number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum amount of shared memory available per multiprocessor, in bytes.
+   */
+  uint32_t maxSharedMemoryPerMultiprocessor;
+
+  /**
+   * Maximum number of 32-bit registers available per multiprocessor.
+   */
+  uint32_t maxRegistersPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+  /**
+   * ECC enabled flag for device
+   */
+  uint32_t eccEnabled;
+
+  /**
+   * The device UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid uuid;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Flag to indicate whether the device is visible to CUDA. Users can set
+   * device visibility using the CUDA_VISIBLE_DEVICES environment variable.
+   */
+  uint8_t isCudaVisible;
+
+  uint8_t reserved[7];
+} CUpti_ActivityDevice3;
+
+/**
+ * \brief The activity record for a device. (CUDA 11.6 onwards)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice5 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * Number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * Number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum amount of shared memory available per multiprocessor, in bytes.
+   */
+  uint32_t maxSharedMemoryPerMultiprocessor;
+
+  /**
+   * Maximum number of 32-bit registers available per multiprocessor.
+   */
+  uint32_t maxRegistersPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+  /**
+   * ECC enabled flag for the device.
+   */
+  uint32_t eccEnabled;
+
+  /**
+   * The device UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid uuid;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Flag to indicate whether the device is visible to CUDA. Users can set
+   * device visibility using the CUDA_VISIBLE_DEVICES environment variable.
+   */
+  uint8_t isCudaVisible;
+
+  /**
+   * MIG enabled flag for the device.
+   */
+  uint8_t isMigEnabled;
+
+  uint8_t reserved[6];
+
+  /**
+   * GPU instance ID for MIG-enabled devices.
+   * If MIG mode is disabled, the value is set to UINT32_MAX.
+   */
+  uint32_t gpuInstanceId;
+
+  /**
+   * Compute instance ID for MIG-enabled devices.
+   * If MIG mode is disabled, the value is set to UINT32_MAX.
+   */
+  uint32_t computeInstanceId;
+
+  /**
+   * The MIG UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid migUuid;
+
+} CUpti_ActivityDevice4;
+
+/**
+ * \brief The activity record for kernel. (deprecated)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL
+   * or CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The cache configuration requested by the kernel. The value is one
+   * of the CUfunc_cache enumeration values from cuda.h.
+   */
+  uint8_t cacheConfigRequested;
+
+  /**
+   * The cache configuration used for the kernel. The value is one of
+   * the CUfunc_cache enumeration values from cuda.h.
+   */
+  uint8_t cacheConfigExecuted;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the kernel. Each kernel execution
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t runtimeCorrelationId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityKernel;
+
+/**
+ * \brief The activity record for kernel. (deprecated)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns.  It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityKernel2;
+
+/**
+ * \brief The activity record for a kernel (CUDA 6.5 (with sm_52 support) onwards).
+ * (deprecated in CUDA 9.0)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL).
+ * Kernel activities are now reported using the CUpti_ActivityKernel9 activity
+ * record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns.  It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityKernel3;
+
+/**
+ * \brief The activity record for a kernel (CUDA 9.0 (with sm_70 support) onwards).
+ * (deprecated in CUDA 11.0)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL).
+ * Kernel activities are now reported using the CUpti_ActivityKernel9 activity
+ * record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns.  It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host, the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+  /**
+   * Shared memory size set by the driver.
+   */
+  uint32_t sharedMemoryExecuted;
+} CUpti_ActivityKernel4;
+
+/**
+ * \brief The activity record for a kernel (CUDA 11.0 (with sm_80 support) onwards).
+ * (deprecated in CUDA 11.2)
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns.  It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host, the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+  /**
+   * Shared memory size set by the driver.
+   */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether user has opted for a
+   * higher per block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+} CUpti_ActivityKernel5;
+
+/**
+ * \brief The activity record for kernel. (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ *
+ * NOTE(review): the struct is declared with PACKED_ALIGNMENT, so the
+ * field order and widths presumably form a fixed binary layout; confirm
+ * before reordering or resizing any field.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The cache configuration of the kernel. Member both exposes the
+   * requested and executed bit-fields of config together as one byte.
+   *
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc. to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host; the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether the user has
+   * opted for a higher per block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+
+  /**
+   * The pointer to the access policy window. The structure CUaccessPolicyWindow is
+   * defined in cuda.h.
+   */
+  CUaccessPolicyWindow *pAccessPolicyWindow;
+} CUpti_ActivityKernel6;
+
+/**
+ * \brief The activity record for kernel. (deprecated in CUDA 11.8)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ *
+ * Compared with CUpti_ActivityKernel6, this record appends the
+ * channelID and channelType fields.
+ *
+ * NOTE(review): the struct is declared with PACKED_ALIGNMENT, so the
+ * field order and widths presumably form a fixed binary layout; confirm
+ * before reordering or resizing any field.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The cache configuration of the kernel. Member both exposes the
+   * requested and executed bit-fields of config together as one byte.
+   *
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc. to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host; the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether the user has
+   * opted for a higher per block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+
+  /**
+   * The pointer to the access policy window. The structure CUaccessPolicyWindow is
+   * defined in cuda.h.
+   */
+  CUaccessPolicyWindow *pAccessPolicyWindow;
+
+  /**
+   * The ID of the HW channel on which the kernel is launched.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel. \see CUpti_ChannelType
+   */
+  CUpti_ChannelType channelType;
+} CUpti_ActivityKernel7;
+
+/**
+ * \brief The activity record for kernel.
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL).
+ *
+ * Compared with CUpti_ActivityKernel7, this record appends the
+ * clusterX, clusterY, clusterZ, clusterSchedulingPolicy and
+ * localMemoryTotal_v2 fields.
+ *
+ * NOTE(review): the CUpti_ActivityKernel6 and CUpti_ActivityKernel7
+ * records say kernel activities are now reported using
+ * CUpti_ActivityKernel9, so this record is presumably superseded as
+ * well; confirm and mark it deprecated if so.
+ *
+ * NOTE(review): the struct is declared with PACKED_ALIGNMENT, so the
+ * field order and widths presumably form a fixed binary layout; confirm
+ * before reordering or resizing any field.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The cache configuration of the kernel. Member both exposes the
+   * requested and executed bit-fields of config together as one byte.
+   *
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes (deprecated in CUDA 11.8).
+   * Refer to field localMemoryTotal_v2 instead.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc. to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host; the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether the user has
+   * opted for a higher per block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+
+  /**
+   * The pointer to the access policy window. The structure CUaccessPolicyWindow is
+   * defined in cuda.h.
+   */
+  CUaccessPolicyWindow *pAccessPolicyWindow;
+
+  /**
+   * The ID of the HW channel on which the kernel is launched.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel. \see CUpti_ChannelType
+   */
+  CUpti_ChannelType channelType;
+
+  /**
+   * The X-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterX;
+
+  /**
+   * The Y-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterY;
+
+  /**
+   * The Z-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterZ;
+
+  /**
+   * The cluster scheduling policy for the kernel. Refer to CUclusterSchedulingPolicy.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterSchedulingPolicy;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes. This 64-bit field replaces the deprecated 32-bit
+   * localMemoryTotal field.
+   */
+  uint64_t localMemoryTotal_v2;
+} CUpti_ActivityKernel8;
+
+/**
+ * \brief The activity record for memory copies. (deprecated)
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ *
+ * NOTE(review): the struct is declared with PACKED_ALIGNMENT, so the
+ * field order and widths presumably form a fixed binary layout; confirm
+ * before reordering or resizing any field.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field only exists when CUPTILP64 is defined.
+   * NOTE(review): presumably keeps the following pointer aligned on
+   * 64-bit builds; confirm.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityMemcpy;
+
+/**
+ * \brief The activity record for memory copies. (deprecated in CUDA 11.1)
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ *
+ * Compared with CUpti_ActivityMemcpy, this record appends the
+ * graphNodeId field.
+ *
+ * NOTE(review): the struct is declared with PACKED_ALIGNMENT, so the
+ * field order and widths presumably form a fixed binary layout; confirm
+ * before reordering or resizing any field.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field only exists when CUPTILP64 is defined.
+   * NOTE(review): presumably keeps the following pointer aligned on
+   * 64-bit builds; confirm.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint64_t graphNodeId;
+} CUpti_ActivityMemcpy3;
+
+/**
+ * \brief The activity record for memory copies. (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ *
+ * Compared with CUpti_ActivityMemcpy3, this record appends the
+ * graphId field and a trailing reserved padding field.
+ *
+ * NOTE(review): the struct is declared with PACKED_ALIGNMENT, so the
+ * field order and widths presumably form a fixed binary layout; confirm
+ * before reordering or resizing any field.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field only exists when CUPTILP64 is defined.
+   * NOTE(review): presumably keeps the following pointer aligned on
+   * 64-bit builds; confirm.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t padding;
+} CUpti_ActivityMemcpy4;
+
+/**
+ * \brief The activity record for peer-to-peer memory copies.
+ *
+ * This activity record represents a peer-to-peer memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY2) but is no longer generated
+ * by CUPTI. Peer-to-peer memory copy activities are now reported using the
+ * CUpti_ActivityMemcpyPtoP2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size.  \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind read by the memory copy, stored as a
+   * byte to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see
+   * CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+  * The ID of the device where the memory copy is occurring.
+  */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the device where memory is being copied from.
+   */
+  uint32_t srcDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied from.
+   */
+  uint32_t srcContextId;
+
+  /**
+   * The ID of the device where memory is being copied to.
+   */
+  uint32_t dstDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied to.
+   */
+  uint32_t dstContextId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t correlationId;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityMemcpyPtoP;
+
+typedef CUpti_ActivityMemcpyPtoP CUpti_ActivityMemcpy2;
+
+/**
+ * \brief The activity record for peer-to-peer memory copies.
+ * (deprecated in CUDA 11.1)
+ *
+ * This activity record represents a peer-to-peer memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY2).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size.  \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see
+   * CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the device where memory is being copied from.
+   */
+  uint32_t srcDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied from.
+   */
+  uint32_t srcContextId;
+
+  /**
+   * The ID of the device where memory is being copied to.
+   */
+  uint32_t dstDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied to.
+   */
+  uint32_t dstContextId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t correlationId;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only if CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed the memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done using graph launch.
+   */
+  uint64_t graphNodeId;
+} CUpti_ActivityMemcpyPtoP2;
+
+/**
+ * \brief The activity record for peer-to-peer memory copies.
+ * (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a peer-to-peer memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY2).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size.  \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see
+   * CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the device where memory is being copied from.
+   */
+  uint32_t srcDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied from.
+   */
+  uint32_t srcContextId;
+
+  /**
+   * The ID of the device where memory is being copied to.
+   */
+  uint32_t dstDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied to.
+   */
+  uint32_t dstContextId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t correlationId;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only if CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed the memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done using graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t padding;
+} CUpti_ActivityMemcpyPtoP3;
+
+/**
+ * \brief The activity record for memset. (deprecated)
+ *
+ * This activity record represents a memory set operation
+ * (CUPTI_ACTIVITY_KIND_MEMSET).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The value being assigned to memory by the memory set.
+   */
+  uint32_t value;
+
+  /**
+   * The number of bytes being set by the memory set.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory set is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory set is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory set is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory set. Each memory set is assigned
+   * a unique correlation ID that is identical to the correlation ID
+   * in the driver API activity record that launched the memory set.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The flags associated with the memset. \see CUpti_ActivityFlag
+   */
+  uint16_t flags;
+
+  /**
+   * The memory kind of the memory set. \see CUpti_ActivityMemoryKind
+   */
+  uint16_t memoryKind;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only if CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityMemset;
+
+/**
+ * \brief The activity record for memset. (deprecated in CUDA 11.1)
+ *
+ * This activity record represents a memory set operation
+ * (CUPTI_ACTIVITY_KIND_MEMSET).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The value being assigned to memory by the memory set.
+   */
+  uint32_t value;
+
+  /**
+   * The number of bytes being set by the memory set.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory set is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory set is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory set is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory set. Each memory set is assigned
+   * a unique correlation ID that is identical to the correlation ID
+   * in the driver API activity record that launched the memory set.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The flags associated with the memset. \see CUpti_ActivityFlag
+   */
+  uint16_t flags;
+
+  /**
+   * The memory kind of the memory set. \see CUpti_ActivityMemoryKind
+   */
+  uint16_t memoryKind;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only if CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint64_t graphNodeId;
+} CUpti_ActivityMemset2;
+
+/**
+ * \brief The activity record for memset. (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a memory set operation
+ * (CUPTI_ACTIVITY_KIND_MEMSET).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The value being assigned to memory by the memory set.
+   */
+  uint32_t value;
+
+  /**
+   * The number of bytes being set by the memory set.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory set is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory set is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory set is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory set. Each memory set is assigned
+   * a unique correlation ID that is identical to the correlation ID
+   * in the driver API activity record that launched the memory set.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The flags associated with the memset. \see CUpti_ActivityFlag
+   */
+  uint16_t flags;
+
+  /**
+   * The memory kind of the memory set. \see CUpti_ActivityMemoryKind
+   */
+  uint16_t memoryKind;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only if CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t padding;
+} CUpti_ActivityMemset3;
+
+/**
+ * \brief The activity record for memory.
+ *
+ * This activity record represents a memory allocation and free operation
+ * (CUPTI_ACTIVITY_KIND_MEMORY2).
+ * This activity record provides separate records for memory allocation and
+ * memory release operations.
+ * This allows correlating the corresponding driver and runtime API
+ * activity record with the memory operation.
+ *
+ * Note: This activity record is an upgrade over \ref CUpti_ActivityMemory
+ * enabled using the kind \ref CUPTI_ACTIVITY_KIND_MEMORY.
+ * \ref CUpti_ActivityMemory provides a single record for the memory
+ * allocation and memory release operations.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory operation requested by the user, \ref CUpti_ActivityMemoryOperationType.
+   */
+  CUpti_ActivityMemoryOperationType memoryOperationType;
+
+  /**
+   * The memory kind requested by the user, \ref CUpti_ActivityMemoryKind.
+   */
+  CUpti_ActivityMemoryKind memoryKind;
+
+  /**
+   * The correlation ID of the memory operation. Each memory operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The number of bytes of memory allocated.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory operation, in ns.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The program counter of the memory operation.
+   */
+  uint64_t PC;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory operation is taking place.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context. If context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream. If memory operation is not async, \p streamId is set to CUPTI_INVALID_STREAM_ID.
+   */
+  uint32_t streamId;
+
+  /**
+   * Variable name. This name is shared across all activity
+   * records representing the same symbol, and so should not be
+   * modified.
+   */
+  const char* name;
+
+  /**
+   * \p isAsync is set if memory operation happens through async memory APIs.
+   */
+  uint32_t isAsync;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only if CUPTILP64 is defined.
+   */
+  uint32_t pad1;
+#endif
+
+  /**
+   * The memory pool configuration used for the memory operations.
+   */
+  struct {
+    /**
+     * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType
+     */
+    CUpti_ActivityMemoryPoolType memoryPoolType;
+
+#ifdef CUPTILP64
+    /**
+     * Undefined. Reserved for internal use. Present only if CUPTILP64 is defined.
+     */
+    uint32_t pad2;
+#endif
+
+    /**
+     * The base address of the memory pool.
+     */
+    uint64_t address;
+
+    /**
+     * The release threshold of the memory pool in bytes. \p releaseThreshold is
+     * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     */
+    uint64_t releaseThreshold;
+
+   /**
+    * The size of the memory pool in bytes, or the process ID of the memory pool.
+    * \p size is valid if \p memoryPoolType is
+    * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+    * \p processId is valid if \p memoryPoolType is
+    * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED, \ref CUpti_ActivityMemoryPoolType.
+    */
+   union {
+      uint64_t size;
+      uint64_t processId;
+    } pool;
+  } memoryPoolConfig;
+
+} CUpti_ActivityMemory2;
+
+/**
+ * \brief The activity record for memory.
+ *
+ * This activity record represents a memory allocation and free operation
+ * (CUPTI_ACTIVITY_KIND_MEMORY2).
+ * This activity record provides separate records for memory allocation and
+ * memory release operations.
+ * This allows correlating the corresponding driver and runtime API
+ * activity record with the memory operation.
+ *
+ * Note: This activity record extends \ref CUpti_ActivityMemory2 with the
+ * \p utilizedSize field of the memory pool configuration. In contrast,
+ * \ref CUpti_ActivityMemory (kind \ref CUPTI_ACTIVITY_KIND_MEMORY) provides
+ * a single record for the memory allocation and memory release operations.
+ */
+
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory operation requested by the user, \ref CUpti_ActivityMemoryOperationType.
+   */
+  CUpti_ActivityMemoryOperationType memoryOperationType;
+
+  /**
+   * The memory kind requested by the user, \ref CUpti_ActivityMemoryKind.
+   */
+  CUpti_ActivityMemoryKind memoryKind;
+
+  /**
+   * The correlation ID of the memory operation. Each memory operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The number of bytes of memory allocated.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory operation, in ns.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The program counter of the memory operation.
+   */
+  uint64_t PC;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory operation is taking place.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context. If context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream. If memory operation is not async, \p streamId is set to CUPTI_INVALID_STREAM_ID.
+   */
+  uint32_t streamId;
+
+  /**
+   * Variable name. This name is shared across all activity
+   * records representing the same symbol, and so should not be
+   * modified.
+   */
+  const char* name;
+
+  /**
+   * \p isAsync is set if memory operation happens through async memory APIs.
+   */
+  uint32_t isAsync;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only if CUPTILP64 is defined.
+   */
+  uint32_t pad1;
+#endif
+
+  /**
+   * The memory pool configuration used for the memory operations.
+   */
+  struct PACKED_ALIGNMENT {
+    /**
+     * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType
+     */
+    CUpti_ActivityMemoryPoolType memoryPoolType;
+
+#ifdef CUPTILP64
+    /**
+     * Undefined. Reserved for internal use. Present only if CUPTILP64 is defined.
+     */
+    uint32_t pad2;
+#endif
+
+    /**
+     * The base address of the memory pool.
+     */
+    uint64_t address;
+
+    /**
+     * The release threshold of the memory pool in bytes. \p releaseThreshold is
+     * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     */
+    uint64_t releaseThreshold;
+
+    /**
+     * The size of the memory pool in bytes, or the process ID of the memory pool.
+     * \p size is valid if \p memoryPoolType is
+     * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     * \p processId is valid if \p memoryPoolType is
+     * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED, \ref CUpti_ActivityMemoryPoolType.
+     */
+    union {
+      uint64_t size;
+      uint64_t processId;
+    } pool;
+
+    /**
+     * The utilized size of the memory pool. \p utilizedSize is
+     * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     */
+    uint64_t utilizedSize;
+  } memoryPoolConfig;
+
+} CUpti_ActivityMemory3;
+
+/**
+ * \brief The activity record for memory pool.
+ *
+ * This activity record represents a memory pool creation, destruction and
+ * trimming (CUPTI_ACTIVITY_KIND_MEMORY_POOL).
+ * This activity record provides separate records for memory pool creation,
+ * destruction and trimming operations.
+ * This allows correlating the corresponding driver and runtime API
+ * activity record with the memory pool operation.
+ *
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY_POOL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory pool operation requested by the user, \ref CUpti_ActivityMemoryPoolOperationType.
+   */
+  CUpti_ActivityMemoryPoolOperationType memoryPoolOperationType;
+
+  /**
+   * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType
+   */
+  CUpti_ActivityMemoryPoolType memoryPoolType;
+
+  /**
+   * The correlation ID of the memory pool operation. Each memory pool
+   * operation is assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory pool is created.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The minimum bytes to keep of the memory pool. \p minBytesToKeep is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_TRIMMED,
+   * \ref CUpti_ActivityMemoryPoolOperationType.
+   */
+  size_t minBytesToKeep;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only if CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The size of the memory pool operation in bytes. \p size is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t size;
+
+  /**
+   * The release threshold of the memory pool. \p releaseThreshold is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t releaseThreshold;
+
+  /**
+   * The start timestamp for the memory operation, in ns.
+   */
+  uint64_t timestamp;
+} CUpti_ActivityMemoryPool;
+
+/**
+ * \brief The activity record providing a marker which is an
+ * instantaneous point in time. (deprecated in CUDA 8.0)
+ *
+ * The marker is specified with a descriptive name and unique ID
+ * (CUPTI_ACTIVITY_KIND_MARKER).
+ * Marker activity is now reported using the
+ * CUpti_ActivityMarker2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the marker. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The timestamp for the marker, in ns. A value of 0 indicates that
+   * timestamp information could not be collected for the marker.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The marker ID.
+   */
+  uint32_t id;
+
+  /**
+   * The kind of activity object associated with this marker.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object associated with this
+   * marker. \p objectKind indicates which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only if CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The marker name for an instantaneous or start marker. This will
+   * be NULL for an end marker.
+   */
+  const char *name;
+
+} CUpti_ActivityMarker;
+
+/**
+ * \brief The activity record for source-level global
+ * access. (deprecated)
+ *
+ * This activity records the locations of the global
+ * accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS).
+ * Global access activities are now reported using the
+ * CUpti_ActivityGlobalAccess3 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this global access.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The pc offset for the access.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * The number of times this instruction was executed per warp. It is incremented
+   * when at least one thread in the warp is active with predicate and condition code
+   * evaluating to true.
+   */
+  uint32_t executed;
+
+  /**
+   * Each time this instruction is executed, this is incremented by the number
+   * of threads that executed it with predicate and condition code evaluating to true.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The total number of 32-byte transactions to L2 cache generated by this access.
+   */
+  uint64_t l2_transactions;
+} CUpti_ActivityGlobalAccess;
+
+/**
+ * \brief The activity record for source-level global
+ * access. (deprecated in CUDA 9.0)
+ *
+ * This activity records the locations of the global
+ * accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS).
+ * Global access activities are now reported using the
+ * CUpti_ActivityGlobalAccess3 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this global access.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the access.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Each time this instruction is executed, this is incremented by the number
+   * of threads that executed it with predicate and condition code evaluating to true.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The total number of 32-byte transactions to L2 cache generated by this access.
+   */
+  uint64_t l2_transactions;
+
+  /**
+   * The minimum number of L2 transactions possible based on the access pattern.
+   */
+  uint64_t theoreticalL2Transactions;
+
+  /**
+   * The number of times this instruction was executed per warp. It is incremented
+   * when at least one thread in the warp is active with predicate and condition code
+   * evaluating to true.
+   */
+  uint32_t executed;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityGlobalAccess2;
+
+/**
+ * \brief The activity record for source level result
+ * branch. (deprecated)
+ *
+ * This activity records the locations of the branches in the
+ * source (CUPTI_ACTIVITY_KIND_BRANCH).
+ * Branch activities are now reported using the
+ * CUpti_ActivityBranch2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_BRANCH.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The pc offset for the branch.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * The number of times this instruction was executed per warp. It is incremented
+   * regardless of predicate or condition code.
+   */
+  uint32_t executed;
+
+  /**
+   * The number of times this branch diverged.
+   */
+  uint32_t diverged;
+
+  /**
+   * Each time this instruction is executed, this is incremented by the number
+   * of threads that executed it.
+   */
+  uint64_t threadsExecuted;
+} CUpti_ActivityBranch;
+
+/**
+ * \brief The activity record for PC sampling. (deprecated in CUDA 8.0)
+ *
+ * This activity records information obtained by sampling PC
+ * (CUPTI_ACTIVITY_KIND_PC_SAMPLING).
+ * PC sampling activities are now reported using the
+ * CUpti_ActivityPCSampling2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * The same PC can be sampled with different stall reasons.
+   */
+  uint32_t samples;
+
+  /**
+   * Current stall reason. Includes one of the reasons from
+   * \ref CUpti_ActivityPCSamplingStallReason.
+   */
+  CUpti_ActivityPCSamplingStallReason stallReason;
+} CUpti_ActivityPCSampling;
+
+/**
+ * \brief The activity record for PC sampling. (deprecated in CUDA 9.0)
+ *
+ * This activity record holds information obtained by sampling the PC
+ * (CUPTI_ACTIVITY_KIND_PC_SAMPLING).
+ * PC sampling activities are now reported using the
+ * CUpti_ActivityPCSampling3 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID of the source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * ID used to correlate with the global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * These samples indicate that no instruction was issued in that cycle from
+   * the warp scheduler from where the warp was sampled.
+   * Field is valid for devices with compute capability 6.0 and higher.
+   */
+  uint32_t latencySamples;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * The same PC can be sampled with different stall reasons. The count includes
+   * latencySamples.
+   */
+  uint32_t samples;
+
+  /**
+   * Current stall reason. One of the values from
+   * \ref CUpti_ActivityPCSamplingStallReason.
+   */
+  CUpti_ActivityPCSamplingStallReason stallReason;
+
+  uint32_t pad; /* NOTE(review): undocumented here; presumably reserved padding -- confirm. */
+} CUpti_ActivityPCSampling2;
+
+/**
+ * \brief The activity record for Unified Memory counters (deprecated in CUDA 7.0)
+ *
+ * This activity record represents a Unified Memory counter
+ * (CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The Unified Memory counter kind. See \ref CUpti_ActivityUnifiedMemoryCounterKind
+   */
+  CUpti_ActivityUnifiedMemoryCounterKind counterKind;
+
+  /**
+   * Scope of the Unified Memory counter. See \ref CUpti_ActivityUnifiedMemoryCounterScope
+   */
+  CUpti_ActivityUnifiedMemoryCounterScope scope;
+
+  /**
+   * The ID of the device involved in the memory transfer operation.
+   * It is not relevant if the scope of the counter is global (all devices).
+   */
+  uint32_t deviceId;
+
+  /**
+   * Value of the counter.
+   *
+   */
+  uint64_t value;
+
+  /**
+   * The timestamp when this sample was retrieved, in ns. A value of 0
+   * indicates that timestamp information could not be collected.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The ID of the process to which this record belongs. In case of
+   * global scope, processId is undefined.
+   */
+  uint32_t processId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityUnifiedMemoryCounter;
+
+/**
+ * \brief The activity record for Unified Memory counters (deprecated in CUDA 12.8)
+ *
+ * This activity record represents a Unified Memory counter
+ * (CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The Unified Memory counter kind. See \ref CUpti_ActivityUnifiedMemoryCounterKind
+   */
+  CUpti_ActivityUnifiedMemoryCounterKind counterKind;
+
+  /**
+   * Value of the counter.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD,
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH,
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP, it is the size of the
+   * memory region in bytes.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT, it
+   * is the number of page fault groups for the same page.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT,
+   * it is the program counter for the instruction that caused the fault.
+   */
+  uint64_t value;
+
+  /**
+   * The start timestamp of the counter, in ns.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH, timestamp is
+   * captured when activity starts on GPU.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT, timestamp is
+   * captured when CUDA driver started processing the fault.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING, timestamp
+   * is captured when CUDA driver detected thrashing of memory region.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING,
+   * timestamp is captured when throttling operation was started by CUDA driver.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP,
+   * timestamp is captured when CUDA driver has pushed all required operations
+   * to the processor specified by dstId.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp of the counter, in ns.
+   * Ignore this field if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH, timestamp is
+   * captured when activity finishes on GPU.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT, timestamp is
+   * captured when CUDA driver queues the replay of faulting memory accesses on the GPU.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING, timestamp
+   * is captured when throttling operation was finished by CUDA driver.
+   */
+  uint64_t end;
+
+  /**
+   * This is the virtual base address of the page/s being transferred. For cpu and
+   * gpu faults, the virtual address for the page that faulted.
+   */
+  uint64_t address;
+
+  /**
+   * The ID of the source CPU/device involved in the memory transfer, page fault, thrashing,
+   * throttling or remote map operation. For counterKind
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING, it is a bitwise ORing of the
+   * device IDs fighting for the memory region. Ignore this field if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT.
+   */
+  uint32_t srcId;
+
+  /**
+   * The ID of the destination CPU/device involved in the memory transfer or remote map
+   * operation. Ignore this field if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING.
+   */
+  uint32_t dstId;
+
+  /**
+   * The ID of the stream causing the transfer.
+   * The value of this field is invalid.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The flags associated with this record. See enums \ref CUpti_ActivityUnifiedMemoryAccessType
+   * if counterKind is CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT
+   * and \ref CUpti_ActivityUnifiedMemoryMigrationCause if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH
+   * and \ref CUpti_ActivityUnifiedMemoryRemoteMapCause if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP and \ref CUpti_ActivityFlag
+   * if counterKind is CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING.
+   */
+  uint32_t flags;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityUnifiedMemoryCounter2;
+
+/**
+ * \brief NVLink information. (deprecated in CUDA 9.0)
+ *
+ * This structure gives capabilities of each logical NVLink connection between two devices,
+ * gpu<->gpu or gpu<->CPU which can be used to understand the topology.
+ * NVLink information is now reported using the
+ * CUpti_ActivityNvLink2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * NVLink version.
+   */
+  uint32_t nvlinkVersion;
+
+  /**
+   * Type of device 0 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev0;
+
+  /**
+   * Type of device 1 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev1;
+
+  /**
+   * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice5.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev0;
+
+  /**
+   * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice5.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev1;
+
+  /**
+   * Flag giving the capabilities of the link. \see CUpti_LinkFlag
+   */
+  uint32_t flag;
+
+  /**
+   * Number of physical NVLinks present between two devices.
+   */
+  uint32_t physicalNvLinkCount;
+
+  /**
+   * Port numbers for a maximum of 4 NVLinks connected to device 0.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev0[4];
+
+  /**
+   * Port numbers for a maximum of 4 NVLinks connected to device 1.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev1[4];
+
+  /**
+   * Bandwidth of NVLink in kbytes/sec.
+   */
+  uint64_t bandwidth;
+} CUpti_ActivityNvLink;
+
+/**
+ * \brief NVLink information. (deprecated in CUDA 10.0)
+ *
+ * This structure gives capabilities of each logical NVLink connection between two devices,
+ * gpu<->gpu or gpu<->CPU which can be used to understand the topology.
+ * NVLink information is now reported using the
+ * CUpti_ActivityNvLink4 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * NVLink version.
+   */
+  uint32_t nvlinkVersion;
+
+  /**
+   * Type of device 0 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev0;
+
+  /**
+   * Type of device 1 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev1;
+
+  /**
+   * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice5.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev0;
+
+  /**
+   * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice5.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev1;
+
+  /**
+   * Flag giving the capabilities of the link. \see CUpti_LinkFlag
+   */
+  uint32_t flag;
+
+  /**
+   * Number of physical NVLinks present between two devices.
+   */
+  uint32_t physicalNvLinkCount;
+
+  /**
+   * Port numbers for a maximum of 16 NVLinks connected to device 0.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev0[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Port numbers for a maximum of 16 NVLinks connected to device 1.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev1[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Bandwidth of NVLink in kbytes/sec.
+   */
+  uint64_t  bandwidth;
+} CUpti_ActivityNvLink2;
+
+/**
+ * \brief NVLink information.
+ *
+ * This structure gives capabilities of each logical NVLink connection between two devices,
+ * gpu<->gpu or gpu<->CPU which can be used to understand the topology.
+ * NVLink information is now reported using the
+ * CUpti_ActivityNvLink4 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
+   */
+  CUpti_ActivityKind kind;
+  /**
+   * NVLink version.
+   */
+  uint32_t nvlinkVersion;
+
+  /**
+   * Type of device 0 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev0;
+
+  /**
+   * Type of device 1 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev1;
+
+  /**
+   * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice5.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev0;
+
+  /**
+   * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice5.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev1;
+
+  /**
+   * Flag giving the capabilities of the link. \see CUpti_LinkFlag
+   */
+  uint32_t flag;
+
+  /**
+   * Number of physical NVLinks present between two devices.
+   */
+  uint32_t physicalNvLinkCount;
+
+  /**
+   * Port numbers for a maximum of 16 NVLinks connected to device 0.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev0[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Port numbers for a maximum of 16 NVLinks connected to device 1.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev1[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Bandwidth of NVLink in kbytes/sec.
+   */
+  uint64_t bandwidth;
+
+  /**
+   * NVSwitch is connected as an intermediate node.
+   */
+  uint8_t nvswitchConnected;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[7];
+} CUpti_ActivityNvLink3;
+
+/**
+ * \brief The activity record for trace of graph execution.
+ *
+ * This activity record represents execution for a graph without giving visibility
+ * about the execution of its nodes. This is intended to reduce overheads in tracing
+ * each node. The activity kind is CUPTI_ACTIVITY_KIND_GRAPH_TRACE.
+ * Graph trace activity is now reported using the CUpti_ActivityGraphTrace2 record.
+ */
+typedef struct {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GRAPH_TRACE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The correlation ID of the graph launch. Each graph launch is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the graph.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The start timestamp for the graph execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the graph.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the graph execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the graph.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the graph execution is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The unique ID of the graph that is launched.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the context where the graph is being launched.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the graph is being launched.
+   */
+  uint32_t streamId;
+
+  /**
+   * This field is reserved for internal use.
+   */
+  void *reserved;
+} CUpti_ActivityGraphTrace;
+
+/**
+ * \brief The activity record for a context.
+ *
+ * This activity record represents information about a context
+ * (CUPTI_ACTIVITY_KIND_CONTEXT).
+ * Context activity is now reported using the CUpti_ActivityContext3 record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_CONTEXT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The context ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The compute API kind, stored as a 16-bit value. \see CUpti_ActivityComputeApiKind
+   */
+  uint16_t computeApiKind;
+
+  /**
+   * The ID for the NULL stream in this context.
+   */
+  uint16_t nullStreamId;
+} CUpti_ActivityContext;
+
+/**
+ * \brief The activity record for a context.
+ *
+ * This activity record represents information about a context
+ * (CUPTI_ACTIVITY_KIND_CONTEXT).
+ * Context activity is now reported using the CUpti_ActivityContext3 record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_CONTEXT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The context ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The compute API kind, stored as a 16-bit value. \see CUpti_ActivityComputeApiKind
+   */
+  uint16_t computeApiKind;
+
+  /**
+   * The ID for the NULL stream in this context.
+   */
+  uint16_t nullStreamId;
+
+  /**
+   * The ID of the parent context. It is 0 if the
+   * context does not have a parent.
+   */
+  uint32_t parentContextId;
+
+  /**
+   * This field indicates whether the context is a green context.
+   */
+  uint8_t isGreenContext;
+
+  uint8_t padding; /* NOTE(review): undocumented here; presumably alignment padding -- confirm. */
+
+  /**
+   * Number of multiprocessors assigned to the green context.
+   * Invalid if the field 'isGreenContext' is 0.
+   */
+  uint16_t numMultiprocessors;
+} CUpti_ActivityContext2;
+
+/**
+ * \brief The activity record for JIT operations.
+ * This activity represents the JIT operations (compile, load, store) of a CUmodule
+ * from the Compute Cache.
+ * Gives the exact hashed path of where the cached module is loaded from,
+ * or where the module will be stored after Just-In-Time (JIT) compilation.
+ *
+ * JIT activity is now reported using the CUpti_ActivityJit2 record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_JIT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The JIT entry type.
+   */
+  CUpti_ActivityJitEntryType jitEntryType;
+
+  /**
+   * The JIT operation type.
+   */
+  CUpti_ActivityJitOperationType jitOperationType;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The start timestamp for the JIT operation, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the JIT operation.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the JIT operation, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the JIT operation.
+   */
+  uint64_t end;
+
+  /**
+   * The correlation ID of the JIT operation to which
+   * the records belong. Each JIT operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the JIT operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Internal use.
+   */
+  uint32_t padding;
+
+  /**
+   * The correlation ID to correlate JIT compilation, load and store operations.
+   * Each JIT compilation unit is assigned a unique correlation ID
+   * at the time of the JIT compilation. This correlation ID can be used
+   * to find the matching JIT cache load/store records.
+   */
+  uint64_t jitOperationCorrelationId;
+
+  /**
+   * The size of compute cache.
+   */
+  uint64_t cacheSize;
+
+  /**
+   * The path where the fat binary is cached.
+   */
+  const char* cachePath;
+} CUpti_ActivityJit;
+
+/**
+ * \brief The activity record for CUDA event.
+ *
+ * This activity is used to track recorded events
+ * (CUPTI_ACTIVITY_KIND_CUDA_EVENT).
+ *
+ * Structure deprecated in CUDA 12.8: Refer to CUpti_ActivityCudaEvent2
+ * for the latest structure.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_CUDA_EVENT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The correlation ID of the API to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID of the context where the event was recorded.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the compute stream where the event was recorded.
+   */
+  uint32_t streamId;
+
+  /**
+   * A unique event ID to identify the event record.
+   */
+  uint32_t eventId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityCudaEvent;
+
+/**
+ * \brief The activity record for synchronization management.
+ *
+ * This activity is used to track various CUDA synchronization APIs
+ * (CUPTI_ACTIVITY_KIND_SYNCHRONIZATION).
+ *
+ * Structure deprecated in CUDA 12.8: Refer to CUpti_ActivitySynchronization2
+ * for the latest structure.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_SYNCHRONIZATION.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The type of record. \see CUpti_ActivitySynchronizationType
+   */
+  CUpti_ActivitySynchronizationType type;
+
+  /**
+   * The start timestamp for the function, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the function.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the function, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the function.
+   */
+  uint64_t end;
+
+  /**
+   * The correlation ID of the API to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID of the context for which the synchronization API is called.
+   * In case of context synchronization API it is the context ID for which the API is called.
+   * In case of stream/event synchronization it is the ID of the context where the stream/event was created.
+   */
+  uint32_t contextId;
+
+  /**
+   * The compute stream for which the synchronization API is called.
+   * A CUPTI_SYNCHRONIZATION_INVALID_VALUE value indicates that the field is not applicable for this record.
+   * Not valid for cuCtxSynchronize, cuEventSynchronize.
+   */
+  uint32_t streamId;
+
+  /**
+   * The event ID for which the synchronization API is called.
+   * A CUPTI_SYNCHRONIZATION_INVALID_VALUE value indicates that the field is not applicable for this record.
+   * Not valid for cuCtxSynchronize, cuStreamSynchronize.
+   */
+  uint32_t cudaEventId;
+} CUpti_ActivitySynchronization;
+
+/**
+ * \brief The activity record for memory copies.
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ *
+ * Structure deprecated in CUDA 12.8: Refer to CUpti_ActivityMemcpy6
+ * for the latest structure.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy, stored as a byte. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the HW channel on which the memory copy is occurring.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel. \see CUpti_ChannelType
+   */
+  CUpti_ChannelType channelType;
+
+  /**
+   * Reserved for internal use.
+   */
+  uint32_t pad2;
+} CUpti_ActivityMemcpy5;
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_ACTIVITY_DEPRECATED_H_*/
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_callbacks.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_callbacks.h
new file mode 100644
index 0000000000000000000000000000000000000000..7dc1c94b2a6dc2cbab63af058ccec71f822cf63b
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_callbacks.h
@@ -0,0 +1,863 @@
+/*
+ * Copyright 2010-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUPTI_CALLBACKS_H__)
+#define __CUPTI_CALLBACKS_H__
+
+#include <cuda.h>
+#include <builtin_types.h>
+#include <string.h>
+#include <cuda_stdint.h>
+#include <cupti_result.h>
+
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_CALLBACK_API CUPTI Callback API
+ * Functions, types, and enums that implement the CUPTI Callback API.
+ * @{
+ */
+
+/**
+ * \brief Specifies the point in an API call that a callback is issued.
+ *
+ * Specifies the point in an API call that a callback is issued. This
+ * value is communicated to the callback function via \ref
+ * CUpti_CallbackData::callbackSite.
+ */
+typedef enum {
+  /**
+   * The callback is at the entry of the API call.
+   */
+  CUPTI_API_ENTER                 = 0,
+  /**
+   * The callback is at the exit of the API call.
+   */
+  CUPTI_API_EXIT                  = 1,
+  CUPTI_API_CBSITE_FORCE_INT     = 0x7fffffff
+} CUpti_ApiCallbackSite;
+
+/**
+ * \brief Callback domains.
+ *
+ * Callback domains. Each domain represents callback points for a
+ * group of related API functions or CUDA driver activity.
+ */
+typedef enum {
+  /**
+   * Invalid domain.
+   */
+  CUPTI_CB_DOMAIN_INVALID           = 0,
+  /**
+   * Domain containing callback points for all driver API functions.
+   */
+  CUPTI_CB_DOMAIN_DRIVER_API        = 1,
+  /**
+   * Domain containing callback points for all runtime API
+   * functions.
+   */
+  CUPTI_CB_DOMAIN_RUNTIME_API       = 2,
+  /**
+   * Domain containing callback points for CUDA resource tracking.
+   */
+  CUPTI_CB_DOMAIN_RESOURCE          = 3,
+  /**
+   * Domain containing callback points for CUDA synchronization.
+   */
+  CUPTI_CB_DOMAIN_SYNCHRONIZE       = 4,
+  /**
+   * Domain containing callback points for NVTX API functions.
+   */
+  CUPTI_CB_DOMAIN_NVTX              = 5,
+  /**
+   * Domain containing callback points for various states.
+   */
+  CUPTI_CB_DOMAIN_STATE             = 6,
+
+  CUPTI_CB_DOMAIN_SIZE,
+
+  CUPTI_CB_DOMAIN_FORCE_INT         = 0x7fffffff
+} CUpti_CallbackDomain;
+
+/**
+ * \brief Callback IDs for resource domain.
+ *
+ * Callback IDs for resource domain, CUPTI_CB_DOMAIN_RESOURCE.  This
+ * value is communicated to the callback function via the \p cbid
+ * parameter.
+ */
+typedef enum {
+  /**
+   * Invalid resource callback ID.
+   */
+  CUPTI_CBID_RESOURCE_INVALID                               = 0,
+  /**
+   * A new context has been created.
+   */
+  CUPTI_CBID_RESOURCE_CONTEXT_CREATED                       = 1,
+  /**
+   * A context is about to be destroyed.
+   */
+  CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING              = 2,
+  /**
+   * A new stream has been created.
+   */
+  CUPTI_CBID_RESOURCE_STREAM_CREATED                        = 3,
+  /**
+   * A stream is about to be destroyed.
+   */
+  CUPTI_CBID_RESOURCE_STREAM_DESTROY_STARTING               = 4,
+  /**
+   * The driver has finished initializing.
+   */
+  CUPTI_CBID_RESOURCE_CU_INIT_FINISHED                      = 5,
+  /**
+   * A module has been loaded.
+   */
+  CUPTI_CBID_RESOURCE_MODULE_LOADED                         = 6,
+  /**
+   * A module is about to be unloaded.
+   */
+  CUPTI_CBID_RESOURCE_MODULE_UNLOAD_STARTING                = 7,
+  /**
+   * The current module which is being profiled.
+   */
+  CUPTI_CBID_RESOURCE_MODULE_PROFILED                       = 8,
+  /**
+   * CUDA graph has been created.
+   */
+  CUPTI_CBID_RESOURCE_GRAPH_CREATED                         = 9,
+  /**
+   * CUDA graph is about to be destroyed.
+   */
+  CUPTI_CBID_RESOURCE_GRAPH_DESTROY_STARTING                = 10,
+  /**
+   * CUDA graph is cloned.
+   */
+  CUPTI_CBID_RESOURCE_GRAPH_CLONED                          = 11,
+  /**
+   * CUDA graph node is about to be created
+   */
+  CUPTI_CBID_RESOURCE_GRAPHNODE_CREATE_STARTING             = 12,
+  /**
+   * CUDA graph node is created.
+   */
+  CUPTI_CBID_RESOURCE_GRAPHNODE_CREATED                     = 13,
+  /**
+   * CUDA graph node is about to be destroyed.
+   */
+  CUPTI_CBID_RESOURCE_GRAPHNODE_DESTROY_STARTING            = 14,
+  /**
+   * Dependency on a CUDA graph node is created.
+   */
+  CUPTI_CBID_RESOURCE_GRAPHNODE_DEPENDENCY_CREATED          = 15,
+  /**
+   * Dependency on a CUDA graph node is destroyed.
+   */
+  CUPTI_CBID_RESOURCE_GRAPHNODE_DEPENDENCY_DESTROY_STARTING = 16,
+  /**
+   * An executable CUDA graph is about to be created.
+   */
+  CUPTI_CBID_RESOURCE_GRAPHEXEC_CREATE_STARTING             = 17,
+  /**
+   * An executable CUDA graph is created.
+   */
+  CUPTI_CBID_RESOURCE_GRAPHEXEC_CREATED                     = 18,
+  /**
+   * An executable CUDA graph is about to be destroyed.
+   */
+  CUPTI_CBID_RESOURCE_GRAPHEXEC_DESTROY_STARTING            = 19,
+  /**
+   * CUDA graph node is cloned.
+   */
+  CUPTI_CBID_RESOURCE_GRAPHNODE_CLONED                      = 20,
+  /**
+   * CUDA stream attribute is changed.
+   */
+  CUPTI_CBID_RESOURCE_STREAM_ATTRIBUTE_CHANGED              = 21,
+
+  CUPTI_CBID_RESOURCE_SIZE,
+  CUPTI_CBID_RESOURCE_FORCE_INT                   = 0x7fffffff
+} CUpti_CallbackIdResource;
+
+/**
+ * \brief Callback IDs for synchronization domain.
+ *
+ * Callback IDs for synchronization domain,
+ * CUPTI_CB_DOMAIN_SYNCHRONIZE.  This value is communicated to the
+ * callback function via the \p cbid parameter.
+ */
+typedef enum {
+  /**
+   * Invalid synchronize callback ID.
+   */
+  CUPTI_CBID_SYNCHRONIZE_INVALID                  = 0,
+  /**
+   * Stream synchronization has completed for the stream.
+   */
+  CUPTI_CBID_SYNCHRONIZE_STREAM_SYNCHRONIZED      = 1,
+  /**
+   * Context synchronization has completed for the context.
+   */
+  CUPTI_CBID_SYNCHRONIZE_CONTEXT_SYNCHRONIZED     = 2,
+  CUPTI_CBID_SYNCHRONIZE_SIZE,
+  CUPTI_CBID_SYNCHRONIZE_FORCE_INT                = 0x7fffffff
+} CUpti_CallbackIdSync;
+
+
+/**
+ * \brief Callback IDs for state domain.
+ *
+ * Callback IDs for state domain,
+ * CUPTI_CB_DOMAIN_STATE. This value is communicated to the
+ * callback function via the \p cbid parameter.
+ */
+typedef enum {
+  /**
+   * Invalid state callback ID.
+   */
+  CUPTI_CBID_STATE_INVALID                        = 0,
+  /**
+   * Notification of fatal errors - high impact, non-recoverable.
+   * When encountered, CUPTI automatically invokes cuptiFinalize().
+   * On receiving this callback the user can control the subsequent
+   * behavior of the application - such as continuing without profiling, or
+   * terminating the whole application.
+   */
+  CUPTI_CBID_STATE_FATAL_ERROR                    = 1,
+  /**
+   * Notification of non-fatal errors - high impact, but recoverable.
+   * This notification is not issued in the current release.
+   */
+  CUPTI_CBID_STATE_ERROR                          = 2,
+  /**
+   * Notification of warnings - low impact, recoverable.
+   */
+  CUPTI_CBID_STATE_WARNING                        = 3,
+
+  CUPTI_CBID_STATE_SIZE,
+  CUPTI_CBID_STATE_FORCE_INT         = 0x7fffffff
+} CUpti_CallbackIdState;
+
+
+/**
+ * \brief Data passed into a runtime or driver API callback function.
+ *
+ * Data passed into a runtime or driver API callback function as the
+ * \p cbdata argument to \ref CUpti_CallbackFunc. The \p cbdata will
+ * be this type for \p domain equal to CUPTI_CB_DOMAIN_DRIVER_API or
+ * CUPTI_CB_DOMAIN_RUNTIME_API. The callback data is valid only within
+ * the invocation of the callback function that is passed the data. If
+ * you need to retain some data for use outside of the callback, you
+ * must make a copy of that data. For example, if you make a shallow
+ * copy of CUpti_CallbackData within a callback, you cannot
+ * dereference \p functionParams outside of that callback to access
+ * the function parameters. \p functionName is an exception: the
+ * string pointed to by \p functionName is a global constant and so
+ * may be accessed outside of the callback.
+ */
+typedef struct {
+  /**
+   * Point in the runtime or driver function from where the callback
+   * was issued.
+   */
+  CUpti_ApiCallbackSite callbackSite;
+
+  /**
+   * Name of the runtime or driver API function which issued the
+   * callback. This string is a global constant and so may be
+   * accessed outside of the callback.
+   */
+  const char *functionName;
+
+  /**
+   * Pointer to the arguments passed to the runtime or driver API
+   * call. See generated_cuda_runtime_api_meta.h and
+   * generated_cuda_meta.h for structure definitions for the
+   * parameters for each runtime and driver API function.
+   */
+  const void *functionParams;
+
+  /**
+   * Pointer to the return value of the runtime or driver API
+   * call. This field is only valid within the exit (CUPTI_API_EXIT)
+   * callback. For a runtime API \p functionReturnValue points to a
+   * \p cudaError_t. For a driver API \p functionReturnValue points
+   * to a \p CUresult.
+   */
+  void *functionReturnValue;
+
+  /**
+   * Name of the symbol operated on by the runtime or driver API
+   * function which issued the callback. This entry is valid only for
+   * driver and runtime launch callbacks, where it returns the name of
+   * the kernel.
+   */
+  const char *symbolName;
+
+  /**
+   * Driver context current to the thread, or null if no context is
+   * current. This value can change from the entry to exit callback
+   * of a runtime API function if the runtime initializes a context.
+   */
+  CUcontext context;
+
+  /**
+   * Unique ID for the CUDA context associated with the thread. The
+   * UIDs are assigned sequentially as contexts are created and are
+   * unique within a process.
+   */
+  uint32_t contextUid;
+
+  /**
+   * Pointer to data shared between the entry and exit callbacks of
+   * a given runtime or driver API function invocation. This field
+   * can be used to pass 64-bit values from the entry callback to
+   * the corresponding exit callback.
+   */
+  uint64_t *correlationData;
+
+  /**
+   * The activity record correlation ID for this callback. For a
+   * driver domain callback (i.e. \p domain
+   * CUPTI_CB_DOMAIN_DRIVER_API) this ID will equal the correlation ID
+   * in the CUpti_ActivityAPI record corresponding to the CUDA driver
+   * function call. For a runtime domain callback (i.e. \p domain
+   * CUPTI_CB_DOMAIN_RUNTIME_API) this ID will equal the correlation
+   * ID in the CUpti_ActivityAPI record corresponding to the CUDA
+   * runtime function call. Within the callback, this ID can be
+   * recorded to correlate user data with the activity record. This
+   * field is new in 4.1.
+   */
+  uint32_t correlationId;
+
+} CUpti_CallbackData;
+
+/**
+ * \brief Data passed into a resource callback function.
+ *
+ * Data passed into a resource callback function as the \p cbdata
+ * argument to \ref CUpti_CallbackFunc. The \p cbdata will be this
+ * type for \p domain equal to CUPTI_CB_DOMAIN_RESOURCE. The callback
+ * data is valid only within the invocation of the callback function
+ * that is passed the data. If you need to retain some data for use
+ * outside of the callback, you must make a copy of that data.
+ */
+typedef struct {
+  /**
+   * For CUPTI_CBID_RESOURCE_CONTEXT_CREATED and
+   * CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING, the context being
+   * created or destroyed. For CUPTI_CBID_RESOURCE_STREAM_CREATED and
+   * CUPTI_CBID_RESOURCE_STREAM_DESTROY_STARTING, the context
+   * containing the stream being created or destroyed.
+   */
+  CUcontext context;
+
+  union {
+    /**
+     * For CUPTI_CBID_RESOURCE_STREAM_CREATED and
+     * CUPTI_CBID_RESOURCE_STREAM_DESTROY_STARTING, the stream being
+     * created or destroyed.
+     */
+    CUstream stream;
+  } resourceHandle;
+
+  /**
+   * Reserved for future use.
+   */
+  void *resourceDescriptor;
+} CUpti_ResourceData;
+
+
+/**
+ * \brief Module data passed into a resource callback function.
+ *
+ * CUDA module data passed into a resource callback function as the \p cbdata
+ * argument to \ref CUpti_CallbackFunc. The \p cbdata will be this
+ * type for \p domain equal to CUPTI_CB_DOMAIN_RESOURCE. The module
+ * data is valid only within the invocation of the callback function
+ * that is passed the data. If you need to retain some data for use
+ * outside of the callback, you must make a copy of that data.
+ */
+
+typedef struct {
+  /**
+   * Identifier to associate with the CUDA module.
+   */
+    uint32_t moduleId;
+
+  /**
+   * The size of the cubin.
+   */
+    size_t cubinSize;
+
+  /**
+   * Pointer to the associated cubin.
+   */
+    const char *pCubin;
+} CUpti_ModuleResourceData;
+
+/**
+ * \brief CUDA graphs data passed into a resource callback function.
+ *
+ * CUDA graphs data passed into a resource callback function as the \p cbdata
+ * argument to \ref CUpti_CallbackFunc. The \p cbdata will be this
+ * type for \p domain equal to CUPTI_CB_DOMAIN_RESOURCE. The graph
+ * data is valid only within the invocation of the callback function
+ * that is passed the data. If you need to retain some data for use
+ * outside of the callback, you must make a copy of that data.
+ */
+
+typedef struct {
+  /**
+   * CUDA graph
+   */
+    CUgraph graph;
+  /**
+   * The original CUDA graph from which \p graph is cloned
+   */
+    CUgraph originalGraph;
+  /**
+   * CUDA graph node
+   */
+    CUgraphNode node;
+  /**
+   * The original CUDA graph node from which \p node is cloned
+   */
+    CUgraphNode originalNode;
+  /**
+   * Type of the \p node
+   */
+    CUgraphNodeType nodeType;
+  /**
+   * The dependent graph node.
+   * NOTE(review): was documented as an array of size numDependencies, but no such field exists here; confirm.
+   */
+    CUgraphNode dependency;
+  /**
+   * CUDA executable graph
+   */
+    CUgraphExec graphExec;
+} CUpti_GraphData;
+
+/**
+ * \brief Data passed into a synchronize callback function.
+ *
+ * Data passed into a synchronize callback function as the \p cbdata
+ * argument to \ref CUpti_CallbackFunc. The \p cbdata will be this
+ * type for \p domain equal to CUPTI_CB_DOMAIN_SYNCHRONIZE. The
+ * callback data is valid only within the invocation of the callback
+ * function that is passed the data. If you need to retain some data
+ * for use outside of the callback, you must make a copy of that data.
+ */
+typedef struct {
+  /**
+   * The context of the stream being synchronized.
+   */
+  CUcontext context;
+  /**
+   * The stream being synchronized.
+   */
+  CUstream  stream;
+} CUpti_SynchronizeData;
+
+/**
+ * \brief Data passed into a NVTX callback function.
+ *
+ * Data passed into a NVTX callback function as the \p cbdata argument
+ * to \ref CUpti_CallbackFunc. The \p cbdata will be this type for \p
+ * domain equal to CUPTI_CB_DOMAIN_NVTX. Unless otherwise noted, the
+ * callback data is valid only within the invocation of the callback
+ * function that is passed the data. If you need to retain some data
+ * for use outside of the callback, you must make a copy of that data.
+ */
+typedef struct {
+  /**
+   * Name of the NVTX API function which issued the callback. This
+   * string is a global constant and so may be accessed outside of the
+   * callback.
+   */
+  const char *functionName;
+
+  /**
+   * Pointer to the arguments passed to the NVTX API call. See
+   * generated_nvtx_meta.h for structure definitions for the
+   * parameters for each NVTX API function.
+   */
+  const void *functionParams;
+
+  /**
+   * Pointer to the return value of the NVTX API call. See
+   * nvToolsExt.h for each NVTX API function's return value.
+   */
+  const void *functionReturnValue;
+} CUpti_NvtxData;
+
+/**
+ * \brief Stream attribute data passed into a resource callback function
+ * for CUPTI_CBID_RESOURCE_STREAM_ATTRIBUTE_CHANGED callback.
+ *
+ * Data passed into a resource callback function as the \p cbdata
+ * argument to \ref CUpti_CallbackFunc. The \p cbdata will be this
+ * type for \p domain equal to CUPTI_CB_DOMAIN_RESOURCE. The
+ * stream attribute data is valid only within the invocation of the callback
+ * function that is passed the data. If you need to retain some data
+ * for use outside of the callback, you must make a copy of that data.
+ */
+typedef struct {
+  /**
+   * The CUDA stream handle for the attribute
+   */
+  CUstream stream;
+
+  /**
+   * The type of the CUDA stream attribute
+   */
+  CUstreamAttrID attr;
+
+  /**
+   * The value of the CUDA stream attribute
+   */
+  const CUstreamAttrValue *value;
+} CUpti_StreamAttrData;
+
+/**
+ * \brief Data passed into a State callback function.
+ *
+ * Data passed into a State callback function as the \p cbdata argument
+ * to \ref CUpti_CallbackFunc. The \p cbdata will be this type for \p
+ * domain equal to CUPTI_CB_DOMAIN_STATE and callback Ids belonging to CUpti_CallbackIdState. 
+ * Unless otherwise noted, the callback data is valid only within the invocation of the callback
+ * function that is passed the data. If you need to retain some data
+ * for use outside of the callback, you must make a copy of that data.
+ */
+typedef struct {
+  union {
+    /**
+     * Data passed along with the callback Ids 
+     * Enum CUpti_CallbackIdState used to denote callback ids
+     */
+    struct {
+      /**
+       * Error code
+       */
+      CUptiResult result;
+      /**
+       * String containing more details. It can be NULL.
+       */
+      const char *message;
+    } notification;
+  };
+} CUpti_StateData;
+
+/**
+ * \brief An ID for a driver API, runtime API, resource,
+ * synchronization or state callback.
+ *
+ * An ID for a driver API, runtime API, resource, synchronization or
+ * state callback. Within a driver API callback this should be
+ * interpreted as a CUpti_driver_api_trace_cbid value (these values are
+ * defined in cupti_driver_cbid.h). Within a runtime API callback this
+ * should be interpreted as a CUpti_runtime_api_trace_cbid value (these
+ * values are defined in cupti_runtime_cbid.h). Within a resource API
+ * callback this should be interpreted as a \ref CUpti_CallbackIdResource
+ * value, within a synchronize API callback as a \ref CUpti_CallbackIdSync
+ * value, and within a state callback as a \ref CUpti_CallbackIdState value.
+ */
+typedef uint32_t CUpti_CallbackId;
+
+/**
+ * \brief Function type for a callback.
+ *
+ * Function type for a callback. The type of the data passed to the
+ * callback in \p cbdata depends on the \p domain. If \p domain is
+ * CUPTI_CB_DOMAIN_DRIVER_API or CUPTI_CB_DOMAIN_RUNTIME_API the type
+ * of \p cbdata will be CUpti_CallbackData. If \p domain is
+ * CUPTI_CB_DOMAIN_RESOURCE the type of \p cbdata will be
+ * CUpti_ResourceData. If \p domain is CUPTI_CB_DOMAIN_SYNCHRONIZE the
+ * type of \p cbdata will be CUpti_SynchronizeData, for CUPTI_CB_DOMAIN_NVTX
+ * it will be CUpti_NvtxData, and for CUPTI_CB_DOMAIN_STATE CUpti_StateData.
+ *
+ * \param userdata User data supplied at subscription of the callback
+ * \param domain The domain of the callback
+ * \param cbid The ID of the callback
+ * \param cbdata Data passed to the callback.
+ */
+typedef void (CUPTIAPI *CUpti_CallbackFunc)(
+    void *userdata,
+    CUpti_CallbackDomain domain,
+    CUpti_CallbackId cbid,
+    const void *cbdata);
+
+/**
+ * \brief A callback subscriber.
+ */
+typedef struct CUpti_Subscriber_st *CUpti_SubscriberHandle;
+
+/**
+ * \brief Pointer to an array of callback domains.
+ */
+typedef CUpti_CallbackDomain *CUpti_DomainTable;
+
+/**
+ * \brief Get the available callback domains.
+ *
+ * Returns in \p *domainTable an array of size \p *domainCount of all
+ * the available callback domains.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param domainCount Returns number of callback domains
+ * \param domainTable Returns pointer to array of available callback domains
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_NOT_INITIALIZED if unable to initialize CUPTI
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p domainCount or \p domainTable are NULL
+ */
+CUptiResult CUPTIAPI cuptiSupportedDomains(size_t *domainCount,
+                                           CUpti_DomainTable *domainTable);
+
+/**
+ * \brief Initialize a callback subscriber with a callback function
+ * and user data.
+ *
+ * Initializes a callback subscriber with a callback function and
+ * (optionally) a pointer to user data. The returned subscriber handle
+ * can be used to enable and disable the callback for specific domains
+ * and callback IDs.
+ * \note Only a single subscriber can be registered at a time. To ensure
+ * that no other CUPTI client interrupts the profiling session, it's the
+ * responsibility of all the CUPTI clients to call this function before
+ * starting the profiling session. In case profiling session is already
+ * started by another CUPTI client, this function returns the error code
+ * CUPTI_ERROR_MULTIPLE_SUBSCRIBERS_NOT_SUPPORTED.
+ * Note that this function returns the same error when application is
+ * launched using NVIDIA tools like nvprof, Visual Profiler, Nsight Systems,
+ * Nsight Compute, cuda-gdb and cuda-memcheck.
+ * \note This function does not enable any callbacks.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param subscriber Returns handle to the initialized subscriber
+ * \param callback The callback function
+ * \param userdata A pointer to user data. This data will be passed to
+ * the callback function via the \p userdata parameter.
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_NOT_INITIALIZED if unable to initialize CUPTI
+ * \retval CUPTI_ERROR_MULTIPLE_SUBSCRIBERS_NOT_SUPPORTED if there is already a CUPTI subscriber
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p subscriber is NULL
+ */
+CUptiResult CUPTIAPI cuptiSubscribe(CUpti_SubscriberHandle *subscriber,
+                                    CUpti_CallbackFunc callback,
+                                    void *userdata);
+
+/**
+ * \brief Unregister a callback subscriber.
+ *
+ * Removes a callback subscriber so that no future callbacks will be
+ * issued to that subscriber.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param subscriber Handle to the initialized subscriber
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_NOT_INITIALIZED if unable to initialize CUPTI
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p subscriber is NULL or not initialized
+ */
+CUptiResult CUPTIAPI cuptiUnsubscribe(CUpti_SubscriberHandle subscriber);
+
+/**
+ * \brief Get the current enabled/disabled state of a callback for a specific
+ * domain and function ID.
+ *
+ * Returns non-zero in \p *enable if the callback for a domain and
+ * callback ID is enabled, and zero if not enabled.
+ *
+ * \note \b Thread-safety: a subscriber must serialize access to
+ * cuptiGetCallbackState, cuptiEnableCallback, cuptiEnableDomain, and
+ * cuptiEnableAllDomains. For example, if cuptiGetCallbackState(sub,
+ * d, c) and cuptiEnableCallback(sub, d, c) are called concurrently,
+ * the results are undefined.
+ *
+ * \param enable Returns non-zero if callback enabled, zero if not enabled
+ * \param subscriber Handle to the initialized subscriber
+ * \param domain The domain of the callback
+ * \param cbid The ID of the callback
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_NOT_INITIALIZED if unable to initialize CUPTI
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p enable is NULL, or if \p
+ * subscriber, \p domain or \p cbid is invalid.
+ */
+CUptiResult CUPTIAPI cuptiGetCallbackState(uint32_t *enable,
+                                           CUpti_SubscriberHandle subscriber,
+                                           CUpti_CallbackDomain domain,
+                                           CUpti_CallbackId cbid);
+
+/**
+ * \brief Enable or disable callbacks for a specific domain and
+ * callback ID.
+ *
+ * Enable or disable callbacks for a subscriber for a specific domain
+ * and callback ID.
+ *
+ * \note \b Thread-safety: a subscriber must serialize access to
+ * cuptiGetCallbackState, cuptiEnableCallback, cuptiEnableDomain, and
+ * cuptiEnableAllDomains. For example, if cuptiGetCallbackState(sub,
+ * d, c) and cuptiEnableCallback(sub, d, c) are called concurrently,
+ * the results are undefined.
+ *
+ * \param enable New enable state for the callback. Zero disables the
+ * callback, non-zero enables the callback.
+ * \param subscriber - Handle to callback subscription
+ * \param domain The domain of the callback
+ * \param cbid The ID of the callback
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_NOT_INITIALIZED if unable to initialize CUPTI
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p subscriber, \p domain or \p
+ * cbid is invalid.
+ */
+CUptiResult CUPTIAPI cuptiEnableCallback(uint32_t enable,
+                                         CUpti_SubscriberHandle subscriber,
+                                         CUpti_CallbackDomain domain,
+                                         CUpti_CallbackId cbid);
+
+/**
+ * \brief Enable or disable all callbacks for a specific domain.
+ *
+ * Enable or disable all callbacks for a specific domain.
+ *
+ * \note \b Thread-safety: a subscriber must serialize access to
+ * cuptiGetCallbackState, cuptiEnableCallback, cuptiEnableDomain, and
+ * cuptiEnableAllDomains. For example, if cuptiGetCallbackState(sub,
+ * d, *) and cuptiEnableDomain(sub, d) are called concurrently, the
+ * results are undefined.
+ *
+ * \param enable New enable state for all callbacks in the
+ * domain. Zero disables all callbacks, non-zero enables all
+ * callbacks.
+ * \param subscriber - Handle to callback subscription
+ * \param domain The domain of the callback
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_NOT_INITIALIZED if unable to initialize CUPTI
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p subscriber or \p domain is invalid
+ */
+CUptiResult CUPTIAPI cuptiEnableDomain(uint32_t enable,
+                                       CUpti_SubscriberHandle subscriber,
+                                       CUpti_CallbackDomain domain);
+
+/**
+ * \brief Enable or disable all callbacks in all domains.
+ *
+ * Enable or disable all callbacks in all domains.
+ *
+ * \note \b Thread-safety: a subscriber must serialize access to
+ * cuptiGetCallbackState, cuptiEnableCallback, cuptiEnableDomain, and
+ * cuptiEnableAllDomains. For example, if cuptiGetCallbackState(sub,
+ * d, *) and cuptiEnableAllDomains(sub) are called concurrently, the
+ * results are undefined.
+ *
+ * \param enable New enable state for all callbacks in all
+ * domains. Zero disables all callbacks, non-zero enables all
+ * callbacks.
+ * \param subscriber - Handle to callback subscription
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_NOT_INITIALIZED if unable to initialize CUPTI
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p subscriber is invalid
+ */
+CUptiResult CUPTIAPI cuptiEnableAllDomains(uint32_t enable,
+                                           CUpti_SubscriberHandle subscriber);
+
+/**
+ * \brief Get the name of a callback for a specific domain and callback ID.
+ *
+ * Returns a pointer to the name C string in \p *name.
+ *
+ * \note \b Names are available only for the DRIVER and RUNTIME domains.
+ *
+ * \param domain The domain of the callback
+ * \param cbid The ID of the callback
+ * \param name Returns pointer to the name string on success, NULL otherwise
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p name is NULL, or if
+ * \p domain or \p cbid is invalid.
+ */
+CUptiResult CUPTIAPI cuptiGetCallbackName(CUpti_CallbackDomain domain,
+                                          uint32_t cbid,
+                                          const char **name);
+
+/** @} */ /* END CUPTI_CALLBACK_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif  // file guard
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_checkpoint.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_checkpoint.h
new file mode 100644
index 0000000000000000000000000000000000000000..36eeddc4e2b7bfd1902ce313d71f173db70beaef
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_checkpoint.h
@@ -0,0 +1,127 @@
+#pragma once
+
+#include <cuda.h>
+#include <cupti_result.h>
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace NV { namespace Cupti { namespace Checkpoint {
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * \defgroup CUPTI_CHECKPOINT_API CUPTI Checkpoint API
+ * Functions, types, and enums that implement the CUPTI Checkpoint API.
+ * @{
+ */
+
+/**
+ * \brief Optimization options for a checkpoint; values may be OR'd together to specify multiple options.
+ */
+typedef enum
+{
+    CUPTI_CHECKPOINT_OPT_NONE     = 0, //!< Default behavior
+    CUPTI_CHECKPOINT_OPT_TRANSFER = 1, //!< Determine which memory blocks have changed and restore only those. This optimization is cached, so cuptiCheckpointRestore must always be called at the same point in the application when this option is enabled; otherwise the result may be incorrect.
+} CUpti_CheckpointOptimizations;
+
+/**
+ * \brief Configuration and handle for a CUPTI Checkpoint
+ *
+ * A CUpti_Checkpoint object should be initialized with desired options prior to passing into any
+ * CUPTI Checkpoint API function.  The first call into a Checkpoint API function will initialize internal
+ * state based on these options.  Subsequent changes to these options will not have any effect.
+ *
+ * Checkpoint data is saved in device, host, and filesystem space.  There are options to reserve memory
+ * at each level (device, host, filesystem) which are intended to allow a guarantee that a certain amount
+ * of memory will remain free for use after the checkpoint is saved.
+ * Note, however, that falling back to slower levels of memory (host, and then filesystem) to save the checkpoint
+ * will result in performance degradation.
+ * Currently, the filesystem limitation is not implemented.  Note that falling back to filesystem storage may
+ * significantly impact the performance for saving and restoring a checkpoint.
+ */
+typedef struct
+{
+   size_t structSize;      //!< [in] Must be set to CUpti_Checkpoint_STRUCT_SIZE
+
+   CUcontext ctx;          //!< [in] Set to context to save from, or will use current context if NULL
+
+   size_t reserveDeviceMB; //!< [in] Restrict checkpoint from using last N MB of device memory (-1 = use no device memory)
+   size_t reserveHostMB;   //!< [in] Restrict checkpoint from using last N MB of host memory (-1 = use no host memory)
+   uint8_t allowOverwrite; //!< [in] Boolean, Allow checkpoint to save over existing checkpoint
+   uint8_t optimizations;  //!< [in] Mask of CUpti_CheckpointOptimizations flags for this checkpoint
+
+   void * pPriv;           //!< [in] Assign to NULL
+} CUpti_Checkpoint;
+
+#define CUpti_Checkpoint_STRUCT_SIZE  \
+(offsetof(CUpti_Checkpoint, pPriv) +  \
+sizeof(((CUpti_Checkpoint*)(nullptr))->pPriv)) /* offset of the last member (pPriv) plus that member's size */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \brief Initialize and save a checkpoint of the device state associated with the handle context
+ *
+ * Uses the handle options to configure and save a checkpoint of the device state associated with the specified context.
+ *
+ * \param handle A pointer to a CUpti_Checkpoint object
+ *
+ * \retval CUPTI_SUCCESS if a checkpoint was successfully initialized and saved
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p handle does not appear to refer to a valid CUpti_Checkpoint
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
+ * \retval CUPTI_ERROR_INVALID_DEVICE if the device associated with the context is not compatible with the checkpoint API
+ * \retval CUPTI_ERROR_INVALID_OPERATION if Save is requested over an existing checkpoint, but \p allowOverwrite was not originally specified
+ * \retval CUPTI_ERROR_OUT_OF_MEMORY if, as configured, there is not enough backing storage space to save the checkpoint
+ */
+CUptiResult cuptiCheckpointSave(CUpti_Checkpoint * const handle);
+
+/**
+ * \brief Restore a checkpoint to the device associated with its context
+ *
+ * Restores device, pinned, and allocated memory to the state when the checkpoint was saved
+ *
+ * \param handle A pointer to a previously saved CUpti_Checkpoint object
+ *
+ * \retval CUPTI_SUCCESS if the checkpoint was successfully restored
+ * \retval CUPTI_ERROR_NOT_INITIALIZED if the checkpoint was not previously initialized
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if the handle appears invalid
+ * \retval CUPTI_ERROR_UNKNOWN if the restore or optimization operation fails
+ */
+CUptiResult cuptiCheckpointRestore(CUpti_Checkpoint * const handle);
+
+/**
+ * \brief Free the backing data for a checkpoint
+ *
+ * Frees all associated device memory, host memory, and filesystem storage used for this context.
+ * After freeing a handle, it may be re-used as if it were new - options may be re-configured and will
+ * take effect on the next call to \p cuptiCheckpointSave.
+ *
+ * \param handle A pointer to a previously saved CUpti_Checkpoint object
+ *
+ * \retval CUPTI_SUCCESS if the handle was successfully freed
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if the handle was already freed or appears invalid
+ * \retval CUPTI_ERROR_INVALID_CONTEXT if the context is no longer valid
+ */
+CUptiResult cuptiCheckpointFree(CUpti_Checkpoint * const handle);
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+/**
+ * @}
+ */
+
+#ifdef __cplusplus
+}
+#endif
+
+// Exit namespace NV::Cupti::Checkpoint
+}}}
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_common.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..96d228c4df3c1f090a4979bfe10132e080042fef
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_common.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+ 
+#if !defined(__CUPTI_COMMON_H__)
+#define __CUPTI_COMMON_H__
+
+#ifndef CUPTIAPI /* keep any definition supplied before this header */
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI /* expands to nothing when _WIN32 is not defined */
+#endif
+#endif /* CUPTIAPI */
+
+#ifndef CUPTIUTILAPI /* keep any definition supplied before this header */
+#ifdef _WIN32
+#define CUPTIUTILAPI __stdcall
+#else
+#define CUPTIUTILAPI /* expands to nothing when _WIN32 is not defined */
+#endif
+#endif /* CUPTIUTILAPI */
+
+#if defined(__LP64__) /* compiler reports an LP64 target */
+#define CUPTILP64 1
+#elif defined(_WIN64) /* 64-bit Windows target */
+#define CUPTILP64 1
+#else /* neither macro is defined: make sure CUPTILP64 is not set */
+#undef CUPTILP64
+#endif
+
+#define ACTIVITY_RECORD_ALIGNMENT 8 /* value passed to align()/aligned() by PACKED_ALIGNMENT below */
+#if defined(_WIN32) // Windows 32- and 64-bit
+#define START_PACKED_ALIGNMENT __pragma(pack(push,1)) // exact fit - no padding
+#define PACKED_ALIGNMENT __declspec(align(ACTIVITY_RECORD_ALIGNMENT))
+#define END_PACKED_ALIGNMENT __pragma(pack(pop))
+#elif defined(__GNUC__) // GCC and any other compiler that defines __GNUC__
+#define START_PACKED_ALIGNMENT
+#define PACKED_ALIGNMENT __attribute__ ((__packed__)) __attribute__ ((aligned (ACTIVITY_RECORD_ALIGNMENT)))
+#define END_PACKED_ALIGNMENT
+#else // all other compilers: the three macros expand to nothing
+#define START_PACKED_ALIGNMENT
+#define PACKED_ALIGNMENT
+#define END_PACKED_ALIGNMENT
+#endif
+
+#endif /*__CUPTI_COMMON_H__*/
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_driver_cbid.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_driver_cbid.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b23832372a3a69c7bfbf0aa188b0417d9270be6
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_driver_cbid.h
@@ -0,0 +1,799 @@
+
+// *************************************************************************
+//      Definitions of indices for API functions, unique across entire API
+// *************************************************************************
+
+// This file is generated.  Any changes you make will be lost during the next clean build.
+// CUDA public interface, for type definitions and cu* function prototypes
+
+#if !defined(_CUPTI_DRIVER_CBID_H_)
+#define _CUPTI_DRIVER_CBID_H_
+
+typedef enum CUpti_driver_api_trace_cbid_enum {
+    CUPTI_DRIVER_TRACE_CBID_INVALID                                                        = 0,
+    CUPTI_DRIVER_TRACE_CBID_cuInit                                                         = 1,
+    CUPTI_DRIVER_TRACE_CBID_cuDriverGetVersion                                             = 2,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGet                                                    = 3,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetCount                                               = 4,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetName                                                = 5,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceComputeCapability                                      = 6,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceTotalMem                                               = 7,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetProperties                                          = 8,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetAttribute                                           = 9,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxCreate                                                    = 10,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy                                                   = 11,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxAttach                                                    = 12,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxDetach                                                    = 13,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxPushCurrent                                               = 14,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxPopCurrent                                                = 15,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetDevice                                                 = 16,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSynchronize                                               = 17,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleLoad                                                   = 18,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleLoadData                                               = 19,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleLoadDataEx                                             = 20,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleLoadFatBinary                                          = 21,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleUnload                                                 = 22,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetFunction                                            = 23,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetGlobal                                              = 24,
+    CUPTI_DRIVER_TRACE_CBID_cu64ModuleGetGlobal                                            = 25,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetTexRef                                              = 26,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetInfo                                                   = 27,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemGetInfo                                                 = 28,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAlloc                                                     = 29,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemAlloc                                                   = 30,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocPitch                                                = 31,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemAllocPitch                                              = 32,
+    CUPTI_DRIVER_TRACE_CBID_cuMemFree                                                      = 33,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemFree                                                    = 34,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetAddressRange                                           = 35,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemGetAddressRange                                         = 36,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocHost                                                 = 37,
+    CUPTI_DRIVER_TRACE_CBID_cuMemFreeHost                                                  = 38,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostAlloc                                                 = 39,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostGetDevicePointer                                      = 40,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemHostGetDevicePointer                                    = 41,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostGetFlags                                              = 42,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD                                                   = 43,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyHtoD                                                 = 44,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH                                                   = 45,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoH                                                 = 46,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD                                                   = 47,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoD                                                 = 48,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA                                                   = 49,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoA                                                 = 50,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD                                                   = 51,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyAtoD                                                 = 52,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA                                                   = 53,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH                                                   = 54,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA                                                   = 55,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D                                                     = 56,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned                                            = 57,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D                                                     = 58,
+    CUPTI_DRIVER_TRACE_CBID_cu64Memcpy3D                                                   = 59,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync                                              = 60,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyHtoDAsync                                            = 61,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync                                              = 62,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoHAsync                                            = 63,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync                                              = 64,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoDAsync                                            = 65,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync                                              = 66,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync                                              = 67,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync                                                = 68,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync                                                = 69,
+    CUPTI_DRIVER_TRACE_CBID_cu64Memcpy3DAsync                                              = 70,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD8                                                     = 71,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD8                                                   = 72,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD16                                                    = 73,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD16                                                  = 74,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD32                                                    = 75,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD32                                                  = 76,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8                                                   = 77,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D8                                                 = 78,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16                                                  = 79,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D16                                                = 80,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32                                                  = 81,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D32                                                = 82,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncSetBlockShape                                            = 83,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncSetSharedSize                                            = 84,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncGetAttribute                                             = 85,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncSetCacheConfig                                           = 86,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayCreate                                                  = 87,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayGetDescriptor                                           = 88,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayDestroy                                                 = 89,
+    CUPTI_DRIVER_TRACE_CBID_cuArray3DCreate                                                = 90,
+    CUPTI_DRIVER_TRACE_CBID_cuArray3DGetDescriptor                                         = 91,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefCreate                                                 = 92,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefDestroy                                                = 93,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetArray                                               = 94,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress                                             = 95,
+    CUPTI_DRIVER_TRACE_CBID_cu64TexRefSetAddress                                           = 96,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress2D                                           = 97,
+    CUPTI_DRIVER_TRACE_CBID_cu64TexRefSetAddress2D                                         = 98,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetFormat                                              = 99,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddressMode                                         = 100,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetFilterMode                                          = 101,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetFlags                                               = 102,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetAddress                                             = 103,
+    CUPTI_DRIVER_TRACE_CBID_cu64TexRefGetAddress                                           = 104,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetArray                                               = 105,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetAddressMode                                         = 106,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetFilterMode                                          = 107,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetFormat                                              = 108,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetFlags                                               = 109,
+    CUPTI_DRIVER_TRACE_CBID_cuParamSetSize                                                 = 110,
+    CUPTI_DRIVER_TRACE_CBID_cuParamSeti                                                    = 111,
+    CUPTI_DRIVER_TRACE_CBID_cuParamSetf                                                    = 112,
+    CUPTI_DRIVER_TRACE_CBID_cuParamSetv                                                    = 113,
+    CUPTI_DRIVER_TRACE_CBID_cuParamSetTexRef                                               = 114,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunch                                                       = 115,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchGrid                                                   = 116,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchGridAsync                                              = 117,
+    CUPTI_DRIVER_TRACE_CBID_cuEventCreate                                                  = 118,
+    CUPTI_DRIVER_TRACE_CBID_cuEventRecord                                                  = 119,
+    CUPTI_DRIVER_TRACE_CBID_cuEventQuery                                                   = 120,
+    CUPTI_DRIVER_TRACE_CBID_cuEventSynchronize                                             = 121,
+    CUPTI_DRIVER_TRACE_CBID_cuEventDestroy                                                 = 122,
+    CUPTI_DRIVER_TRACE_CBID_cuEventElapsedTime                                             = 123,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamCreate                                                 = 124,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamQuery                                                  = 125,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize                                            = 126,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamDestroy                                                = 127,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsUnregisterResource                                   = 128,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsSubResourceGetMappedArray                            = 129,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedPointer                             = 130,
+    CUPTI_DRIVER_TRACE_CBID_cu64GraphicsResourceGetMappedPointer                           = 131,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceSetMapFlags                                  = 132,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsMapResources                                         = 133,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsUnmapResources                                       = 134,
+    CUPTI_DRIVER_TRACE_CBID_cuGetExportTable                                               = 135,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSetLimit                                                  = 136,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetLimit                                                  = 137,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10GetDevice                                               = 138,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10CtxCreate                                               = 139,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsD3D10RegisterResource                                = 140,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10RegisterResource                                        = 141,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10UnregisterResource                                      = 142,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10MapResources                                            = 143,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10UnmapResources                                          = 144,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceSetMapFlags                                     = 145,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedArray                                  = 146,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPointer                                = 147,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedSize                                   = 148,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPitch                                  = 149,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetSurfaceDimensions                            = 150,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11GetDevice                                               = 151,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11CtxCreate                                               = 152,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsD3D11RegisterResource                                = 153,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9GetDevice                                                = 154,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9CtxCreate                                                = 155,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsD3D9RegisterResource                                 = 156,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9GetDirect3DDevice                                        = 157,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9RegisterResource                                         = 158,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9UnregisterResource                                       = 159,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9MapResources                                             = 160,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9UnmapResources                                           = 161,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceSetMapFlags                                      = 162,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetSurfaceDimensions                             = 163,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedArray                                   = 164,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPointer                                 = 165,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedSize                                    = 166,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPitch                                   = 167,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9Begin                                                    = 168,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9End                                                      = 169,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9RegisterVertexBuffer                                     = 170,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9MapVertexBuffer                                          = 171,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9UnmapVertexBuffer                                        = 172,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9UnregisterVertexBuffer                                   = 173,
+    CUPTI_DRIVER_TRACE_CBID_cuGLCtxCreate                                                  = 174,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsGLRegisterBuffer                                     = 175,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsGLRegisterImage                                      = 176,
+    CUPTI_DRIVER_TRACE_CBID_cuWGLGetDevice                                                 = 177,
+    CUPTI_DRIVER_TRACE_CBID_cuGLInit                                                       = 178,
+    CUPTI_DRIVER_TRACE_CBID_cuGLRegisterBufferObject                                       = 179,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObject                                            = 180,
+    CUPTI_DRIVER_TRACE_CBID_cuGLUnmapBufferObject                                          = 181,
+    CUPTI_DRIVER_TRACE_CBID_cuGLUnregisterBufferObject                                     = 182,
+    CUPTI_DRIVER_TRACE_CBID_cuGLSetBufferObjectMapFlags                                    = 183,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObjectAsync                                       = 184,
+    CUPTI_DRIVER_TRACE_CBID_cuGLUnmapBufferObjectAsync                                     = 185,
+    CUPTI_DRIVER_TRACE_CBID_cuVDPAUGetDevice                                               = 186,
+    CUPTI_DRIVER_TRACE_CBID_cuVDPAUCtxCreate                                               = 187,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsVDPAURegisterVideoSurface                            = 188,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsVDPAURegisterOutputSurface                           = 189,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetSurfRef                                             = 190,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefCreate                                                = 191,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefDestroy                                               = 192,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefSetFormat                                             = 193,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefSetArray                                              = 194,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefGetFormat                                             = 195,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefGetArray                                              = 196,
+    CUPTI_DRIVER_TRACE_CBID_cu64DeviceTotalMem                                             = 197,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetMappedPointer                              = 198,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetMappedSize                                 = 199,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetMappedPitch                                = 200,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetSurfaceDimensions                          = 201,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetSurfaceDimensions                           = 202,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetMappedPointer                               = 203,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetMappedSize                                  = 204,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetMappedPitch                                 = 205,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D9MapVertexBuffer                                        = 206,
+    CUPTI_DRIVER_TRACE_CBID_cu64GLMapBufferObject                                          = 207,
+    CUPTI_DRIVER_TRACE_CBID_cu64GLMapBufferObjectAsync                                     = 208,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11GetDevices                                              = 209,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11CtxCreateOnDevice                                       = 210,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10GetDevices                                              = 211,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10CtxCreateOnDevice                                       = 212,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9GetDevices                                               = 213,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9CtxCreateOnDevice                                        = 214,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemHostAlloc                                               = 215,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async                                                = 216,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD8Async                                              = 217,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async                                               = 218,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD16Async                                             = 219,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async                                               = 220,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD32Async                                             = 221,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async                                              = 222,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D8Async                                            = 223,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async                                             = 224,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D16Async                                           = 225,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async                                             = 226,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D32Async                                           = 227,
+    CUPTI_DRIVER_TRACE_CBID_cu64ArrayCreate                                                = 228,
+    CUPTI_DRIVER_TRACE_CBID_cu64ArrayGetDescriptor                                         = 229,
+    CUPTI_DRIVER_TRACE_CBID_cu64Array3DCreate                                              = 230,
+    CUPTI_DRIVER_TRACE_CBID_cu64Array3DGetDescriptor                                       = 231,
+    CUPTI_DRIVER_TRACE_CBID_cu64Memcpy2D                                                   = 232,
+    CUPTI_DRIVER_TRACE_CBID_cu64Memcpy2DUnaligned                                          = 233,
+    CUPTI_DRIVER_TRACE_CBID_cu64Memcpy2DAsync                                              = 234,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxCreate_v2                                                 = 235,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10CtxCreate_v2                                            = 236,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11CtxCreate_v2                                            = 237,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9CtxCreate_v2                                             = 238,
+    CUPTI_DRIVER_TRACE_CBID_cuGLCtxCreate_v2                                               = 239,
+    CUPTI_DRIVER_TRACE_CBID_cuVDPAUCtxCreate_v2                                            = 240,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetGlobal_v2                                           = 241,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetInfo_v2                                                = 242,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2                                                  = 243,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocPitch_v2                                             = 244,
+    CUPTI_DRIVER_TRACE_CBID_cuMemFree_v2                                                   = 245,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetAddressRange_v2                                        = 246,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostGetDevicePointer_v2                                   = 247,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy_v2                                                    = 248,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2                                                  = 249,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2                                                 = 250,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2                                                 = 251,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2                                                = 252,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2                                               = 253,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2                                               = 254,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress_v2                                          = 255,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress2D_v2                                        = 256,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetAddress_v2                                          = 257,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedPointer_v2                          = 258,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceTotalMem_v2                                            = 259,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPointer_v2                             = 260,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedSize_v2                                = 261,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPitch_v2                               = 262,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetSurfaceDimensions_v2                         = 263,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetSurfaceDimensions_v2                          = 264,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPointer_v2                              = 265,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedSize_v2                                 = 266,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPitch_v2                                = 267,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9MapVertexBuffer_v2                                       = 268,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObject_v2                                         = 269,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObjectAsync_v2                                    = 270,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostAlloc_v2                                              = 271,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayCreate_v2                                               = 272,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayGetDescriptor_v2                                        = 273,
+    CUPTI_DRIVER_TRACE_CBID_cuArray3DCreate_v2                                             = 274,
+    CUPTI_DRIVER_TRACE_CBID_cuArray3DGetDescriptor_v2                                      = 275,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2                                                = 276,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2                                           = 277,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2                                                = 278,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2                                           = 279,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2                                                = 280,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2                                           = 281,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2                                                = 282,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2                                           = 283,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2                                                = 284,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2                                                = 285,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2                                                = 286,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2                                                  = 287,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2                                         = 288,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2                                             = 289,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2                                                  = 290,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2                                             = 291,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2                                                = 292,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2                                           = 293,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocHost_v2                                              = 294,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitEvent                                              = 295,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetApiVersion                                             = 296,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10GetDirect3DDevice                                       = 297,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11GetDirect3DDevice                                       = 298,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetCacheConfig                                            = 299,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSetCacheConfig                                            = 300,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostRegister                                              = 301,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostUnregister                                            = 302,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSetCurrent                                                = 303,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetCurrent                                                = 304,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy                                                       = 305,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync                                                  = 306,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel                                                 = 307,
+    CUPTI_DRIVER_TRACE_CBID_cuProfilerStart                                                = 308,
+    CUPTI_DRIVER_TRACE_CBID_cuProfilerStop                                                 = 309,
+    CUPTI_DRIVER_TRACE_CBID_cuPointerGetAttribute                                          = 310,
+    CUPTI_DRIVER_TRACE_CBID_cuProfilerInitialize                                           = 311,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceCanAccessPeer                                          = 312,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxEnablePeerAccess                                          = 313,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxDisablePeerAccess                                         = 314,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPeerRegister                                              = 315,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPeerUnregister                                            = 316,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPeerGetDevicePointer                                      = 317,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer                                                   = 318,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync                                              = 319,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeer                                                 = 320,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeerAsync                                            = 321,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy_v2                                                = 322,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxPushCurrent_v2                                            = 323,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxPopCurrent_v2                                             = 324,
+    CUPTI_DRIVER_TRACE_CBID_cuEventDestroy_v2                                              = 325,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamDestroy_v2                                             = 326,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress2D_v3                                        = 327,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcGetMemHandle                                              = 328,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcOpenMemHandle                                             = 329,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcCloseMemHandle                                            = 330,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetByPCIBusId                                          = 331,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetPCIBusId                                            = 332,
+    CUPTI_DRIVER_TRACE_CBID_cuGLGetDevices                                                 = 333,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcGetEventHandle                                            = 334,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcOpenEventHandle                                           = 335,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSetSharedMemConfig                                        = 336,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetSharedMemConfig                                        = 337,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncSetSharedMemConfig                                       = 338,
+    CUPTI_DRIVER_TRACE_CBID_cuTexObjectCreate                                              = 339,
+    CUPTI_DRIVER_TRACE_CBID_cuTexObjectDestroy                                             = 340,
+    CUPTI_DRIVER_TRACE_CBID_cuTexObjectGetResourceDesc                                     = 341,
+    CUPTI_DRIVER_TRACE_CBID_cuTexObjectGetTextureDesc                                      = 342,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfObjectCreate                                             = 343,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfObjectDestroy                                            = 344,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfObjectGetResourceDesc                                    = 345,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamAddCallback                                            = 346,
+    CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayCreate                                         = 347,
+    CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayGetLevel                                       = 348,
+    CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayDestroy                                        = 349,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmappedArray                                      = 350,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmapFilterMode                                    = 351,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmapLevelBias                                     = 352,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmapLevelClamp                                    = 353,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMaxAnisotropy                                       = 354,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmappedArray                                      = 355,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmapFilterMode                                    = 356,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmapLevelBias                                     = 357,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmapLevelClamp                                    = 358,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMaxAnisotropy                                       = 359,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedMipmappedArray                      = 360,
+    CUPTI_DRIVER_TRACE_CBID_cuTexObjectGetResourceViewDesc                                 = 361,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkCreate                                                   = 362,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkAddData                                                  = 363,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkAddFile                                                  = 364,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkComplete                                                 = 365,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkDestroy                                                  = 366,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamCreateWithPriority                                     = 367,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetPriority                                            = 368,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetFlags                                               = 369,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetStreamPriorityRange                                    = 370,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocManaged                                              = 371,
+    CUPTI_DRIVER_TRACE_CBID_cuGetErrorString                                               = 372,
+    CUPTI_DRIVER_TRACE_CBID_cuGetErrorName                                                 = 373,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxActiveBlocksPerMultiprocessor                    = 374,
+    CUPTI_DRIVER_TRACE_CBID_cuCompilePtx                                                   = 375,
+    CUPTI_DRIVER_TRACE_CBID_cuBinaryFree                                                   = 376,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamAttachMemAsync                                         = 377,
+    CUPTI_DRIVER_TRACE_CBID_cuPointerSetAttribute                                          = 378,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostRegister_v2                                           = 379,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceSetMapFlags_v2                               = 380,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkCreate_v2                                                = 381,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkAddData_v2                                               = 382,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkAddFile_v2                                               = 383,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxPotentialBlockSize                               = 384,
+    CUPTI_DRIVER_TRACE_CBID_cuGLGetDevices_v2                                              = 385,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxRetain                                       = 386,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxRelease                                      = 387,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxSetFlags                                     = 388,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxReset                                        = 389,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsEGLRegisterImage                                     = 390,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetFlags                                                  = 391,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxGetState                                     = 392,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerConnect                                     = 393,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerDisconnect                                  = 394,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerAcquireFrame                                = 395,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerReleaseFrame                                = 396,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2_ptds                                           = 397,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2_ptds                                           = 398,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2_ptds                                           = 399,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2_ptds                                           = 400,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2_ptds                                           = 401,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2_ptds                                           = 402,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2_ptds                                           = 403,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2_ptds                                           = 404,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2_ptds                                             = 405,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2_ptds                                    = 406,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2_ptds                                             = 407,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy_ptds                                                  = 408,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer_ptds                                              = 409,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeer_ptds                                            = 410,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2_ptds                                             = 411,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2_ptds                                            = 412,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2_ptds                                            = 413,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2_ptds                                           = 414,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2_ptds                                          = 415,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2_ptds                                          = 416,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObject_v2_ptds                                    = 417,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync_ptsz                                             = 418,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2_ptsz                                      = 419,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2_ptsz                                      = 420,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2_ptsz                                      = 421,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2_ptsz                                      = 422,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2_ptsz                                      = 423,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2_ptsz                                        = 424,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2_ptsz                                        = 425,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync_ptsz                                         = 426,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeerAsync_ptsz                                       = 427,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async_ptsz                                           = 428,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async_ptsz                                          = 429,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async_ptsz                                          = 430,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async_ptsz                                         = 431,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async_ptsz                                        = 432,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async_ptsz                                        = 433,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetPriority_ptsz                                       = 434,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetFlags_ptsz                                          = 435,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitEvent_ptsz                                         = 436,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamAddCallback_ptsz                                       = 437,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamAttachMemAsync_ptsz                                    = 438,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamQuery_ptsz                                             = 439,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize_ptsz                                       = 440,
+    CUPTI_DRIVER_TRACE_CBID_cuEventRecord_ptsz                                             = 441,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel_ptsz                                            = 442,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsMapResources_ptsz                                    = 443,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsUnmapResources_ptsz                                  = 444,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObjectAsync_v2_ptsz                               = 445,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerConnect                                     = 446,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerDisconnect                                  = 447,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerPresentFrame                                = 448,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedEglFrame                            = 449,
+    CUPTI_DRIVER_TRACE_CBID_cuPointerGetAttributes                                         = 450,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags           = 451,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxPotentialBlockSizeWithFlags                      = 452,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerReturnFrame                                 = 453,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetP2PAttribute                                        = 454,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetBorderColor                                         = 455,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetBorderColor                                         = 456,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAdvise                                                    = 457,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32                                            = 458,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32_ptsz                                       = 459,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32                                           = 460,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32_ptsz                                      = 461,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp                                             = 462,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp_ptsz                                        = 463,
+    CUPTI_DRIVER_TRACE_CBID_cuNVNbufferGetPointer                                          = 464,
+    CUPTI_DRIVER_TRACE_CBID_cuNVNtextureGetArray                                           = 465,
+    CUPTI_DRIVER_TRACE_CBID_cuNNSetAllocator                                               = 466,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPrefetchAsync                                             = 467,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPrefetchAsync_ptsz                                        = 468,
+    CUPTI_DRIVER_TRACE_CBID_cuEventCreateFromNVNSync                                       = 469,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerConnectWithFlags                            = 470,
+    CUPTI_DRIVER_TRACE_CBID_cuMemRangeGetAttribute                                         = 471,
+    CUPTI_DRIVER_TRACE_CBID_cuMemRangeGetAttributes                                        = 472,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64                                            = 473,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64_ptsz                                       = 474,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64                                           = 475,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64_ptsz                                      = 476,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel                                      = 477,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel_ptsz                                 = 478,
+    CUPTI_DRIVER_TRACE_CBID_cuEventCreateFromEGLSync                                       = 479,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice                           = 480,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncSetAttribute                                             = 481,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetUuid                                                = 482,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCtx                                                 = 483,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCtx_ptsz                                            = 484,
+    CUPTI_DRIVER_TRACE_CBID_cuImportExternalMemory                                         = 485,
+    CUPTI_DRIVER_TRACE_CBID_cuExternalMemoryGetMappedBuffer                                = 486,
+    CUPTI_DRIVER_TRACE_CBID_cuExternalMemoryGetMappedMipmappedArray                        = 487,
+    CUPTI_DRIVER_TRACE_CBID_cuDestroyExternalMemory                                        = 488,
+    CUPTI_DRIVER_TRACE_CBID_cuImportExternalSemaphore                                      = 489,
+    CUPTI_DRIVER_TRACE_CBID_cuSignalExternalSemaphoresAsync                                = 490,
+    CUPTI_DRIVER_TRACE_CBID_cuSignalExternalSemaphoresAsync_ptsz                           = 491,
+    CUPTI_DRIVER_TRACE_CBID_cuWaitExternalSemaphoresAsync                                  = 492,
+    CUPTI_DRIVER_TRACE_CBID_cuWaitExternalSemaphoresAsync_ptsz                             = 493,
+    CUPTI_DRIVER_TRACE_CBID_cuDestroyExternalSemaphore                                     = 494,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture                                           = 495,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture_ptsz                                      = 496,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamEndCapture                                             = 497,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamEndCapture_ptsz                                        = 498,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamIsCapturing                                            = 499,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamIsCapturing_ptsz                                       = 500,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphCreate                                                  = 501,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddKernelNode                                           = 502,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeGetParams                                     = 503,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemcpyNode                                           = 504,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemcpyNodeGetParams                                     = 505,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemsetNode                                           = 506,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemsetNodeGetParams                                     = 507,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemsetNodeSetParams                                     = 508,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetType                                             = 509,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphGetRootNodes                                            = 510,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetDependencies                                     = 511,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetDependentNodes                                   = 512,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiate                                             = 513,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch                                                  = 514,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz                                             = 515,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecDestroy                                             = 516,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphDestroy                                                 = 517,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddDependencies                                         = 518,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphRemoveDependencies                                      = 519,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemcpyNodeSetParams                                     = 520,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeSetParams                                     = 521,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphDestroyNode                                             = 522,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphClone                                                   = 523,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeFindInClone                                         = 524,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddChildGraphNode                                       = 525,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddEmptyNode                                            = 526,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchHostFunc                                               = 527,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchHostFunc_ptsz                                          = 528,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphChildGraphNodeGetGraph                                  = 529,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddHostNode                                             = 530,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphHostNodeGetParams                                       = 531,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetLuid                                                = 532,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphHostNodeSetParams                                       = 533,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphGetNodes                                                = 534,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphGetEdges                                                = 535,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo                                         = 536,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_ptsz                                    = 537,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecKernelNodeSetParams                                 = 538,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture_v2                                        = 539,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture_v2_ptsz                                   = 540,
+    CUPTI_DRIVER_TRACE_CBID_cuThreadExchangeStreamCaptureMode                              = 541,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetNvSciSyncAttributes                                 = 542,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyAvailableDynamicSMemPerBlock                        = 543,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxRelease_v2                                   = 544,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxReset_v2                                     = 545,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxSetFlags_v2                                  = 546,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAddressReserve                                            = 547,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAddressFree                                               = 548,
+    CUPTI_DRIVER_TRACE_CBID_cuMemCreate                                                    = 549,
+    CUPTI_DRIVER_TRACE_CBID_cuMemRelease                                                   = 550,
+    CUPTI_DRIVER_TRACE_CBID_cuMemMap                                                       = 551,
+    CUPTI_DRIVER_TRACE_CBID_cuMemUnmap                                                     = 552,
+    CUPTI_DRIVER_TRACE_CBID_cuMemSetAccess                                                 = 553,
+    CUPTI_DRIVER_TRACE_CBID_cuMemExportToShareableHandle                                   = 554,
+    CUPTI_DRIVER_TRACE_CBID_cuMemImportFromShareableHandle                                 = 555,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetAllocationGranularity                                  = 556,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetAllocationPropertiesFromHandle                         = 557,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetAccess                                                 = 558,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSetFlags                                               = 559,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSetFlags_ptsz                                          = 560,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecUpdate                                              = 561,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecMemcpyNodeSetParams                                 = 562,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecMemsetNodeSetParams                                 = 563,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecHostNodeSetParams                                   = 564,
+    CUPTI_DRIVER_TRACE_CBID_cuMemRetainAllocationHandle                                    = 565,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncGetModule                                                = 566,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcOpenMemHandle_v2                                          = 567,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxResetPersistingL2Cache                                    = 568,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeCopyAttributes                                = 569,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeGetAttribute                                  = 570,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeSetAttribute                                  = 571,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamCopyAttributes                                         = 572,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamCopyAttributes_ptsz                                    = 573,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetAttribute                                           = 574,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetAttribute_ptsz                                      = 575,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSetAttribute                                           = 576,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSetAttribute_ptsz                                      = 577,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiate_v2                                          = 578,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetTexture1DLinearMaxWidth                             = 579,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphUpload                                                  = 580,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphUpload_ptsz                                             = 581,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayGetSparseProperties                                     = 582,
+    CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayGetSparseProperties                            = 583,
+    CUPTI_DRIVER_TRACE_CBID_cuMemMapArrayAsync                                             = 584,
+    CUPTI_DRIVER_TRACE_CBID_cuMemMapArrayAsync_ptsz                                        = 585,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecChildGraphNodeSetParams                             = 586,
+    CUPTI_DRIVER_TRACE_CBID_cuEventRecordWithFlags                                         = 587,
+    CUPTI_DRIVER_TRACE_CBID_cuEventRecordWithFlags_ptsz                                    = 588,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddEventRecordNode                                      = 589,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddEventWaitNode                                        = 590,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphEventRecordNodeGetEvent                                 = 591,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphEventWaitNodeGetEvent                                   = 592,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphEventRecordNodeSetEvent                                 = 593,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphEventWaitNodeSetEvent                                   = 594,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecEventRecordNodeSetEvent                             = 595,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecEventWaitNodeSetEvent                               = 596,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayGetPlane                                                = 597,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocAsync                                                = 598,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocAsync_ptsz                                           = 599,
+    CUPTI_DRIVER_TRACE_CBID_cuMemFreeAsync                                                 = 600,
+    CUPTI_DRIVER_TRACE_CBID_cuMemFreeAsync_ptsz                                            = 601,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolTrimTo                                                = 602,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolSetAttribute                                          = 603,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolGetAttribute                                          = 604,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolSetAccess                                             = 605,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetDefaultMemPool                                      = 606,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolCreate                                                = 607,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolDestroy                                               = 608,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceSetMemPool                                             = 609,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetMemPool                                             = 610,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocFromPoolAsync                                        = 611,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocFromPoolAsync_ptsz                                   = 612,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolExportToShareableHandle                               = 613,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolImportFromShareableHandle                             = 614,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolExportPointer                                         = 615,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolImportPointer                                         = 616,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolGetAccess                                             = 617,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddExternalSemaphoresSignalNode                         = 618,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresSignalNodeGetParams                   = 619,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresSignalNodeSetParams                   = 620,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddExternalSemaphoresWaitNode                           = 621,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresWaitNodeGetParams                     = 622,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresWaitNodeSetParams                     = 623,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecExternalSemaphoresSignalNodeSetParams               = 624,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecExternalSemaphoresWaitNodeSetParams                 = 625,
+    CUPTI_DRIVER_TRACE_CBID_cuGetProcAddress                                               = 626,
+    CUPTI_DRIVER_TRACE_CBID_cuFlushGPUDirectRDMAWrites                                     = 627,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphDebugDotPrint                                           = 628,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_v2                                      = 629,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_v2_ptsz                                 = 630,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamUpdateCaptureDependencies                              = 631,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamUpdateCaptureDependencies_ptsz                         = 632,
+    CUPTI_DRIVER_TRACE_CBID_cuUserObjectCreate                                             = 633,
+    CUPTI_DRIVER_TRACE_CBID_cuUserObjectRetain                                             = 634,
+    CUPTI_DRIVER_TRACE_CBID_cuUserObjectRelease                                            = 635,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphRetainUserObject                                        = 636,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphReleaseUserObject                                       = 637,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemAllocNode                                         = 638,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemFreeNode                                          = 639,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGraphMemTrim                                           = 640,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetGraphMemAttribute                                   = 641,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceSetGraphMemAttribute                                   = 642,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiateWithFlags                                    = 643,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetExecAffinitySupport                                 = 644,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxCreate_v3                                                 = 645,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetExecAffinity                                           = 646,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetUuid_v2                                             = 647,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemAllocNodeGetParams                                   = 648,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemFreeNodeGetParams                                    = 649,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeSetEnabled                                          = 650,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetEnabled                                          = 651,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx                                               = 652,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx_ptsz                                          = 653,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayGetMemoryRequirements                                   = 654,
+    CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayGetMemoryRequirements                          = 655,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiateWithParams                                   = 656,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiateWithParams_ptsz                              = 657,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecGetFlags                                            = 658,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32_v2                                         = 659,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32_v2_ptsz                                    = 660,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64_v2                                         = 661,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64_v2_ptsz                                    = 662,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32_v2                                        = 663,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32_v2_ptsz                                   = 664,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64_v2                                        = 665,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64_v2_ptsz                                   = 666,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp_v2                                          = 667,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp_v2_ptsz                                     = 668,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddBatchMemOpNode                                       = 669,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphBatchMemOpNodeGetParams                                 = 670,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphBatchMemOpNodeSetParams                                 = 671,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecBatchMemOpNodeSetParams                             = 672,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetLoadingMode                                         = 673,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetHandleForAddressRange                                  = 674,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxPotentialClusterSize                             = 675,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxActiveClusters                                   = 676,
+    CUPTI_DRIVER_TRACE_CBID_cuGetProcAddress_v2                                            = 677,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryLoadData                                              = 678,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryLoadFromFile                                          = 679,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryUnload                                                = 680,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryGetKernel                                             = 681,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryGetModule                                             = 682,
+    CUPTI_DRIVER_TRACE_CBID_cuKernelGetFunction                                            = 683,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryGetGlobal                                             = 684,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryGetManaged                                            = 685,
+    CUPTI_DRIVER_TRACE_CBID_cuKernelGetAttribute                                           = 686,
+    CUPTI_DRIVER_TRACE_CBID_cuKernelSetAttribute                                           = 687,
+    CUPTI_DRIVER_TRACE_CBID_cuKernelSetCacheConfig                                         = 688,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddKernelNode_v2                                        = 689,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeGetParams_v2                                  = 690,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeSetParams_v2                                  = 691,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecKernelNodeSetParams_v2                              = 692,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetId                                                  = 693,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetId_ptsz                                             = 694,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetId                                                     = 695,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecUpdate_v2                                           = 696,
+    CUPTI_DRIVER_TRACE_CBID_cuTensorMapEncodeTiled                                         = 697,
+    CUPTI_DRIVER_TRACE_CBID_cuTensorMapEncodeIm2col                                        = 698,
+    CUPTI_DRIVER_TRACE_CBID_cuTensorMapReplaceAddress                                      = 699,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryGetUnifiedFunction                                    = 700,
+    CUPTI_DRIVER_TRACE_CBID_cuCoredumpGetAttribute                                         = 701,
+    CUPTI_DRIVER_TRACE_CBID_cuCoredumpGetAttributeGlobal                                   = 702,
+    CUPTI_DRIVER_TRACE_CBID_cuCoredumpSetAttribute                                         = 703,
+    CUPTI_DRIVER_TRACE_CBID_cuCoredumpSetAttributeGlobal                                   = 704,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSetFlags                                                  = 705,
+    CUPTI_DRIVER_TRACE_CBID_cuMulticastCreate                                              = 706,
+    CUPTI_DRIVER_TRACE_CBID_cuMulticastAddDevice                                           = 707,
+    CUPTI_DRIVER_TRACE_CBID_cuMulticastBindMem                                             = 708,
+    CUPTI_DRIVER_TRACE_CBID_cuMulticastBindAddr                                            = 709,
+    CUPTI_DRIVER_TRACE_CBID_cuMulticastUnbind                                              = 710,
+    CUPTI_DRIVER_TRACE_CBID_cuMulticastGetGranularity                                      = 711,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddNode                                                 = 712,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeSetParams                                           = 713,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecNodeSetParams                                       = 714,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAdvise_v2                                                 = 715,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPrefetchAsync_v2                                          = 716,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPrefetchAsync_v2_ptsz                                     = 717,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncGetName                                                  = 718,
+    CUPTI_DRIVER_TRACE_CBID_cuKernelGetName                                                = 719,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCaptureToGraph                                    = 720,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCaptureToGraph_ptsz                               = 721,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphConditionalHandleCreate                                 = 722,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddNode_v2                                              = 723,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphGetEdges_v2                                             = 724,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetDependencies_v2                                  = 725,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetDependentNodes_v2                                = 726,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddDependencies_v2                                      = 727,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphRemoveDependencies_v2                                   = 728,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_v3                                      = 729,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_v3_ptsz                                 = 730,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamUpdateCaptureDependencies_v2                           = 731,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamUpdateCaptureDependencies_v2_ptsz                      = 732,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncGetParamInfo                                             = 733,
+    CUPTI_DRIVER_TRACE_CBID_cuKernelGetParamInfo                                           = 734,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceRegisterAsyncNotification                              = 735,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceUnregisterAsyncNotification                            = 736,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetFunctionCount                                       = 737,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleEnumerateFunctions                                     = 738,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryGetKernelCount                                        = 739,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryEnumerateKernels                                      = 740,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncIsLoaded                                                 = 741,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncLoad                                                     = 742,
+    CUPTI_DRIVER_TRACE_CBID_cuGreenCtxCreate                                               = 743,
+    CUPTI_DRIVER_TRACE_CBID_cuGreenCtxDestroy                                              = 744,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetDevResource                                         = 745,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetDevResource                                            = 746,
+    CUPTI_DRIVER_TRACE_CBID_cuGreenCtxGetDevResource                                       = 747,
+    CUPTI_DRIVER_TRACE_CBID_cuDevResourceGenerateDesc                                      = 748,
+    CUPTI_DRIVER_TRACE_CBID_cuGreenCtxRecordEvent                                          = 749,
+    CUPTI_DRIVER_TRACE_CBID_cuGreenCtxWaitEvent                                            = 750,
+    CUPTI_DRIVER_TRACE_CBID_cuDevSmResourceSplitByCount                                    = 751,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetGreenCtx                                            = 752,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxFromGreenCtx                                              = 753,
+    CUPTI_DRIVER_TRACE_CBID_cuKernelGetLibrary                                             = 754,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxRecordEvent                                               = 755,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxWaitEvent                                                 = 756,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxCreate_v4                                                 = 757,
+    CUPTI_DRIVER_TRACE_CBID_cuGreenCtxStreamCreate                                         = 758,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCtx_v2                                              = 759,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCtx_v2_ptsz                                         = 760,
+    CUPTI_DRIVER_TRACE_CBID_cuMemBatchDecompressAsync                                      = 761,
+    CUPTI_DRIVER_TRACE_CBID_cuMemBatchDecompressAsync_ptsz                                 = 762,
+    CUPTI_DRIVER_TRACE_CBID_cuLogsRegisterCallback                                         = 763,
+    CUPTI_DRIVER_TRACE_CBID_cuLogsUnregisterCallback                                       = 764,
+    CUPTI_DRIVER_TRACE_CBID_cuLogsCurrent                                                  = 765,
+    CUPTI_DRIVER_TRACE_CBID_cuLogsDumpToFile                                               = 766,
+    CUPTI_DRIVER_TRACE_CBID_cuLogsDumpToMemory                                             = 767,
+    CUPTI_DRIVER_TRACE_CBID_cuCheckpointProcessGetRestoreThreadId                          = 768,
+    CUPTI_DRIVER_TRACE_CBID_cuCheckpointProcessGetState                                    = 769,
+    CUPTI_DRIVER_TRACE_CBID_cuCheckpointProcessLock                                        = 770,
+    CUPTI_DRIVER_TRACE_CBID_cuCheckpointProcessCheckpoint                                  = 771,
+    CUPTI_DRIVER_TRACE_CBID_cuCheckpointProcessRestore                                     = 772,
+    CUPTI_DRIVER_TRACE_CBID_cuCheckpointProcessUnlock                                      = 773,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetDevice                                              = 774,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetDevice_ptsz                                         = 775,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyBatchAsync                                             = 776,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyBatchAsync_ptsz                                        = 777,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DBatchAsync                                           = 778,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DBatchAsync_ptsz                                      = 779,
+    CUPTI_DRIVER_TRACE_CBID_cuEventElapsedTime_v2                                          = 780,
+    CUPTI_DRIVER_TRACE_CBID_cuTensorMapEncodeIm2colWide                                    = 781,
+    CUPTI_DRIVER_TRACE_CBID_SIZE                                                           = 782,
+    CUPTI_DRIVER_TRACE_CBID_FORCE_INT                                                      = 0x7fffffff
+} CUpti_driver_api_trace_cbid;
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_events.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_events.h
new file mode 100644
index 0000000000000000000000000000000000000000..2e4aebc2a1389e8693f02df9b6e3be1e90490870
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_events.h
@@ -0,0 +1,1349 @@
+/*
+ * Copyright 2010-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_EVENTS_H_)
+#define _CUPTI_EVENTS_H_
+
+#include <cuda.h>
+#include <string.h>
+#include <cuda_stdint.h>
+#include <cupti_result.h>
+
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_EVENT_API CUPTI Event API
+ * Functions, types, and enums that implement the CUPTI Event API.
+ *
+ * \note The CUPTI event API from the header cupti_events.h is not supported on devices
+ * with compute capability 7.5 and higher (i.e. Turing and later GPU architectures).
+ * This API is deprecated as of the CUDA 12.8 release and will be removed in a future CUDA release.
+ * It is replaced by the host profiling API in the header cupti_profiler_host.h and the
+ * target profiling API in the header cupti_range_profiler.h, which are supported on
+ * devices with compute capability 7.0 and higher (i.e. Volta and later GPU architectures).
+ *
+ * @{
+ */
+
+/**
+ * \brief ID for an event.
+ *
+ * An event represents a countable activity, action, or occurrence on
+ * the device. The ID is an unsigned 32-bit integer.
+ */
+typedef uint32_t CUpti_EventID;
+
+/**
+ * \brief ID for an event domain.
+ *
+ * An unsigned 32-bit ID. An event domain represents a group of
+ * related events. A device may have multiple instances of a domain,
+ * indicating that the device can simultaneously record multiple
+ * instances of each event within that domain.
+ */
+typedef uint32_t CUpti_EventDomainID;
+
+/**
+ * \brief A group of events.
+ *
+ * An event group is a collection of events that are managed
+ * together. All events in an event group must belong to the same
+ * domain. The group is referenced through a void * handle.
+ */
+typedef void *CUpti_EventGroup;
+
+/**
+ * \brief Device class.
+ *
+ * Device classes reported as the value of the device attribute
+ * CUPTI_DEVICE_ATTR_DEVICE_CLASS.
+ */
+typedef enum {
+  CUPTI_DEVICE_ATTR_DEVICE_CLASS_TESLA              = 0,
+  CUPTI_DEVICE_ATTR_DEVICE_CLASS_QUADRO             = 1,
+  CUPTI_DEVICE_ATTR_DEVICE_CLASS_GEFORCE            = 2,
+  CUPTI_DEVICE_ATTR_DEVICE_CLASS_TEGRA              = 3,
+} CUpti_DeviceAttributeDeviceClass;
+
+/**
+ * \brief Device attributes.
+ *
+ * CUPTI device attributes. These attributes can be read using \ref
+ * cuptiDeviceGetAttribute.
+ */
+typedef enum {
+  /**
+   * Number of event IDs for a device. Value is a uint32_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_EVENT_ID                            = 1,
+  /**
+   * Number of event domain IDs for a device. Value is a uint32_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_EVENT_DOMAIN_ID                     = 2,
+  /**
+   * Get global memory bandwidth in Kbytes/sec. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_GLOBAL_MEMORY_BANDWIDTH                 = 3,
+  /**
+   * Get theoretical maximum number of instructions per cycle. Value
+   * is a uint32_t.
+   */
+  CUPTI_DEVICE_ATTR_INSTRUCTION_PER_CYCLE                   = 4,
+  /**
+   * Get theoretical maximum number of single precision instructions
+   * that can be executed per second. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_INSTRUCTION_THROUGHPUT_SINGLE_PRECISION = 5,
+  /**
+   * Get number of frame buffers for device. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_FRAME_BUFFERS                       = 6,
+  /**
+   * Get PCIE link rate in megabits/sec for device. Returns 0 if bus-type
+   * is non-PCIE. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_PCIE_LINK_RATE                          = 7,
+  /**
+   * Get PCIE link width for device. Returns 0 if bus-type
+   * is non-PCIE. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_PCIE_LINK_WIDTH                         = 8,
+  /**
+   * Get PCIE generation for device. Returns 0 if bus-type
+   * is non-PCIE. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_PCIE_GEN                                = 9,
+  /**
+   * Get the class for the device. Value is a
+   * CUpti_DeviceAttributeDeviceClass.
+   */
+  CUPTI_DEVICE_ATTR_DEVICE_CLASS                            = 10,
+  /**
+   * Get the peak single precision flop per cycle. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_FLOP_SP_PER_CYCLE                       = 11,
+  /**
+   * Get the peak double precision flop per cycle. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_FLOP_DP_PER_CYCLE                       = 12,
+  /**
+   * Get number of L2 units. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_L2_UNITS                           = 13,
+  /**
+   * Get the maximum shared memory for the CU_FUNC_CACHE_PREFER_SHARED
+   * preference. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_SHARED_MEMORY_CACHE_CONFIG_PREFER_SHARED = 14,
+  /**
+   * Get the maximum shared memory for the CU_FUNC_CACHE_PREFER_L1
+   * preference. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_SHARED_MEMORY_CACHE_CONFIG_PREFER_L1 = 15,
+  /**
+   * Get the maximum shared memory for the CU_FUNC_CACHE_PREFER_EQUAL
+   * preference. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_SHARED_MEMORY_CACHE_CONFIG_PREFER_EQUAL = 16,
+  /**
+   * Get the peak half precision flop per cycle. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_FLOP_HP_PER_CYCLE                       = 17,
+  /**
+   * Check if NVLink is connected to device. Returns 1 if at least one
+   * NVLink is connected to the device, returns 0 otherwise.
+   * Value is a uint32_t.
+   */
+  CUPTI_DEVICE_ATTR_NVLINK_PRESENT                          = 18,
+    /**
+   * Check if NVLink is present between GPU and CPU. Returns the bandwidth,
+   * in bytes/sec, if NVLink is present, returns 0 otherwise.
+   * Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_GPU_CPU_NVLINK_BW                       = 19,
+  /**
+   * Check if NVSwitch is present in the underlying topology.
+   * Returns 1 if present, returns 0 otherwise.
+   * Value is a uint32_t.
+   */
+  CUPTI_DEVICE_ATTR_NVSWITCH_PRESENT                        = 20,
+  CUPTI_DEVICE_ATTR_FORCE_INT                               = 0x7fffffff,
+} CUpti_DeviceAttribute;
+
+/**
+ * \brief Event domain attributes.
+ *
+ * Event domain attributes. Except where noted, all the attributes can
+ * be read using either \ref cuptiDeviceGetEventDomainAttribute or
+ * \ref cuptiEventDomainGetAttribute.
+ */
+typedef enum {
+  /**
+   * Event domain name. Value is a null terminated const c-string.
+   */
+  CUPTI_EVENT_DOMAIN_ATTR_NAME                 = 0,
+  /**
+   * Number of instances of the domain for which event counts will be
+   * collected.  The domain may have additional instances that cannot
+   * be profiled (see CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT).
+   * Can be read only with \ref
+   * cuptiDeviceGetEventDomainAttribute. Value is a uint32_t.
+   */
+  CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT       = 1,
+  /**
+   * Total number of instances of the domain, including instances that
+   * cannot be profiled.  Use CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT
+   * to get the number of instances that can be profiled. Can be read
+   * only with \ref cuptiDeviceGetEventDomainAttribute. Value is a
+   * uint32_t.
+   */
+  CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT = 3,
+  /**
+   * Collection method used for events contained in the event domain.
+   * Value is a \ref CUpti_EventCollectionMethod.
+   */
+  CUPTI_EVENT_DOMAIN_ATTR_COLLECTION_METHOD    = 4,
+
+  CUPTI_EVENT_DOMAIN_ATTR_FORCE_INT      = 0x7fffffff,
+} CUpti_EventDomainAttribute;
+
+/**
+ * \brief The collection method used for an event.
+ *
+ * The collection method indicates how an event is collected.
+ */
+typedef enum {
+  /**
+   * Event is collected using a hardware global performance monitor.
+   */
+  CUPTI_EVENT_COLLECTION_METHOD_PM                  = 0,
+  /**
+   * Event is collected using a hardware SM performance monitor.
+   */
+  CUPTI_EVENT_COLLECTION_METHOD_SM                  = 1,
+  /**
+   * Event is collected using software instrumentation.
+   */
+  CUPTI_EVENT_COLLECTION_METHOD_INSTRUMENTED        = 2,
+  /**
+   * Event is collected using NvLink throughput counter method.
+   */
+  CUPTI_EVENT_COLLECTION_METHOD_NVLINK_TC           = 3,
+  CUPTI_EVENT_COLLECTION_METHOD_FORCE_INT           = 0x7fffffff
+} CUpti_EventCollectionMethod;
+
+/**
+ * \brief Event group attributes.
+ *
+ * Event group attributes. These attributes can be read using \ref
+ * cuptiEventGroupGetAttribute. Attributes marked [rw] can also be
+ * written using \ref cuptiEventGroupSetAttribute.
+ */
+typedef enum {
+  /**
+   * The domain to which the event group is bound. This attribute is
+   * set when the first event is added to the group.  Value is a
+   * CUpti_EventDomainID.
+   */
+  CUPTI_EVENT_GROUP_ATTR_EVENT_DOMAIN_ID              = 0,
+  /**
+   * [rw] Profile all the instances of the domain for this
+   * eventgroup. This feature can be used to get load balancing
+   * across all instances of a domain. Value is an integer.
+   */
+  CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES = 1,
+  /**
+   * [rw] Reserved for user data.
+   */
+  CUPTI_EVENT_GROUP_ATTR_USER_DATA                    = 2,
+  /**
+   * Number of events in the group. Value is a uint32_t.
+   */
+  CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS                   = 3,
+  /**
+   * Enumerates events in the group. Value is a pointer to buffer of
+   * size sizeof(CUpti_EventID) * num_of_events in the eventgroup.
+   * num_of_events can be queried using
+   * CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS.
+   */
+  CUPTI_EVENT_GROUP_ATTR_EVENTS                       = 4,
+  /**
+   * Number of instances of the domain bound to this event group that
+   * will be counted.  Value is a uint32_t.
+   */
+  CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT               = 5,
+  /**
+   * Event group scope can be set to CUPTI_EVENT_PROFILING_SCOPE_DEVICE or
+   * CUPTI_EVENT_PROFILING_SCOPE_CONTEXT for an eventGroup, before
+   * adding any event.
+   * Sets the scope of eventgroup as CUPTI_EVENT_PROFILING_SCOPE_DEVICE or
+   * CUPTI_EVENT_PROFILING_SCOPE_CONTEXT when the scope of the events
+   * that will be added is CUPTI_EVENT_PROFILING_SCOPE_BOTH.
+   * If profiling scope of event is either
+   * CUPTI_EVENT_PROFILING_SCOPE_DEVICE or CUPTI_EVENT_PROFILING_SCOPE_CONTEXT
+   * then setting this attribute will not affect the default scope.
+   * It is not allowed to add events of different scope to same eventgroup.
+   * Value is a uint32_t.
+   */
+  CUPTI_EVENT_GROUP_ATTR_PROFILING_SCOPE               = 6,
+  CUPTI_EVENT_GROUP_ATTR_FORCE_INT                     = 0x7fffffff,
+} CUpti_EventGroupAttribute;
+
+/**
+ * \brief Profiling scope for an event.
+ *
+ * The profiling scope of an event indicates whether the event can be
+ * collected at context scope, at device scope, or at either of the two.
+ */
+typedef enum {
+  /**
+   * Event is collected at context scope.
+   */
+  CUPTI_EVENT_PROFILING_SCOPE_CONTEXT                 = 0,
+  /**
+   * Event is collected at device scope.
+   */
+  CUPTI_EVENT_PROFILING_SCOPE_DEVICE                  = 1,
+  /**
+   * Event can be collected at device or context scope.
+   * The scope can be set using the \ref cuptiEventGroupSetAttribute API.
+   */
+  CUPTI_EVENT_PROFILING_SCOPE_BOTH                    = 2,
+  CUPTI_EVENT_PROFILING_SCOPE_FORCE_INT               = 0x7fffffff
+} CUpti_EventProfilingScope;
+
+/**
+ * \brief Event attributes.
+ *
+ * Event attributes. These attributes can be read using \ref
+ * cuptiEventGetAttribute.
+ */
+typedef enum {
+  /**
+   * Event name. Value is a null terminated const c-string.
+   */
+  CUPTI_EVENT_ATTR_NAME              = 0,
+  /**
+   * Short description of event. Value is a null terminated const
+   * c-string.
+   */
+  CUPTI_EVENT_ATTR_SHORT_DESCRIPTION = 1,
+  /**
+   * Long description of event. Value is a null terminated const
+   * c-string.
+   */
+  CUPTI_EVENT_ATTR_LONG_DESCRIPTION  = 2,
+  /**
+   * Category of event. Value is CUpti_EventCategory.
+   */
+  CUPTI_EVENT_ATTR_CATEGORY          = 3,
+  /**
+   * Profiling scope of the events. It can be either device or context or both.
+   * Value is a \ref CUpti_EventProfilingScope.
+   */
+  CUPTI_EVENT_ATTR_PROFILING_SCOPE   = 5,
+
+  CUPTI_EVENT_ATTR_FORCE_INT         = 0x7fffffff,
+} CUpti_EventAttribute;
+
+/**
+ * \brief Event collection modes.
+ *
+ * The event collection mode determines the period over which the
+ * events within the enabled event groups will be collected.
+ */
+typedef enum {
+  /**
+   * Events are collected for the entire duration between the
+   * cuptiEventGroupEnable and cuptiEventGroupDisable calls.
+   * Event values are reset when the events are read.
+   * For CUDA toolkit v6.0 and older this was the default mode.
+   */
+  CUPTI_EVENT_COLLECTION_MODE_CONTINUOUS          = 0,
+  /**
+   * Events are collected only for the durations of kernel executions
+   * that occur between the cuptiEventGroupEnable and
+   * cuptiEventGroupDisable calls. Event collection begins when a
+   * kernel execution begins, and stops when kernel execution
+   * completes. Event values are reset to zero when each kernel
+   * execution begins. If multiple kernel executions occur between the
+   * cuptiEventGroupEnable and cuptiEventGroupDisable calls then the
+   * event values must be read after each kernel launch if those
+   * events need to be associated with the specific kernel launch.
+   * Note that collection in this mode may significantly change the
+   * overall performance characteristics of the application because
+   * kernel executions that occur between the cuptiEventGroupEnable and
+   * cuptiEventGroupDisable calls are serialized on the GPU.
+   * This is the default mode starting with CUDA toolkit v6.5.
+   */
+  CUPTI_EVENT_COLLECTION_MODE_KERNEL              = 1,
+  CUPTI_EVENT_COLLECTION_MODE_FORCE_INT           = 0x7fffffff
+} CUpti_EventCollectionMode;
+
+/**
+ * \brief An event category.
+ *
+ * Each event is assigned to a category that represents the general
+ * type of the event. An event's category is accessed using \ref
+ * cuptiEventGetAttribute and the CUPTI_EVENT_ATTR_CATEGORY attribute.
+ */
+typedef enum {
+  /**
+   * An instruction related event.
+   */
+  CUPTI_EVENT_CATEGORY_INSTRUCTION     = 0,
+  /**
+   * A memory related event.
+   */
+  CUPTI_EVENT_CATEGORY_MEMORY          = 1,
+  /**
+   * A cache related event.
+   */
+  CUPTI_EVENT_CATEGORY_CACHE           = 2,
+  /**
+   * A profile-trigger event.
+   */
+  CUPTI_EVENT_CATEGORY_PROFILE_TRIGGER = 3,
+  /**
+   * A system event.
+   */
+  CUPTI_EVENT_CATEGORY_SYSTEM  = 4,
+  CUPTI_EVENT_CATEGORY_FORCE_INT       = 0x7fffffff
+} CUpti_EventCategory;
+
+/**
+ * \brief The overflow value for a CUPTI event.
+ *
+ * The CUPTI event value that indicates an overflow.
+ */
+#define CUPTI_EVENT_OVERFLOW ((uint64_t)0xFFFFFFFFFFFFFFFFULL)
+
+/**
+ * \brief The value that indicates the event value is invalid.
+ */
+#define CUPTI_EVENT_INVALID ((uint64_t)0xFFFFFFFFFFFFFFFEULL)
+
+/**
+ * \brief Flags for cuptiEventGroupReadEvent and
+ * cuptiEventGroupReadAllEvents.
+ *
+ * Flags for \ref cuptiEventGroupReadEvent and \ref
+ * cuptiEventGroupReadAllEvents.
+ */
+typedef enum {
+  /**
+   * No flags.
+   */
+  CUPTI_EVENT_READ_FLAG_NONE          = 0,
+  CUPTI_EVENT_READ_FLAG_FORCE_INT     = 0x7fffffff,
+} CUpti_ReadEventFlags;
+
+
+/**
+ * \brief A set of event groups.
+ *
+ * A set of event groups. When returned by \ref
+ * cuptiEventGroupSetsCreate and \ref cuptiMetricCreateEventGroupSets
+ * a set indicates the event groups that can be enabled at the same
+ * time (i.e. all the events in the set can be collected
+ * simultaneously).
+ */
+typedef struct {
+  /**
+   * The number of event groups in the set.
+   */
+  uint32_t numEventGroups;
+  /**
+   * An array of \p numEventGroups event groups.
+   */
+  CUpti_EventGroup *eventGroups;
+} CUpti_EventGroupSet;
+
+/**
+ * \brief A set of event group sets.
+ *
+ * A set of event group sets. When returned by \ref
+ * cuptiEventGroupSetsCreate and \ref cuptiMetricCreateEventGroupSets
+ * a CUpti_EventGroupSets indicates the number of passes required to
+ * collect all the events, and the event groups that should be
+ * collected during each pass.
+ */
+typedef struct {
+  /**
+   * Number of event group sets.
+   */
+  uint32_t numSets;
+  /**
+   * An array of \p numSets event group sets.
+   */
+  CUpti_EventGroupSet *sets;
+} CUpti_EventGroupSets;
+
+/**
+ * \brief Set the event collection mode.
+ *
+ * Set the event collection mode for a \p context.  The \p mode
+ * controls the event collection behavior of all events in event
+ * groups created in the \p context. This API is invalid in kernel
+ * replay mode.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param context The context
+ * \param mode The event collection mode
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
+ * \retval CUPTI_ERROR_INVALID_OPERATION if called when replay mode is enabled
+ * \retval CUPTI_ERROR_NOT_SUPPORTED if mode is not supported on the device
+ */
+
+CUptiResult CUPTIAPI cuptiSetEventCollectionMode(CUcontext context,
+                                                 CUpti_EventCollectionMode mode);
+
+/**
+ * \brief Read a device attribute.
+ *
+ * Read a device attribute and return it in \p *value.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param device The CUDA device
+ * \param attrib The attribute to read
+ * \param valueSize Size of buffer pointed by the value, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the value of the attribute
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not a device attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiDeviceGetAttribute(CUdevice device,
+                                             CUpti_DeviceAttribute attrib,
+                                             size_t *valueSize,
+                                             void *value);
+
+/**
+ * \brief Get the number of domains for a device.
+ *
+ * Returns the number of domains in \p numDomains for a device.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param device The CUDA device
+ * \param numDomains Returns the number of domains
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numDomains is NULL
+ */
+CUptiResult CUPTIAPI cuptiDeviceGetNumEventDomains(CUdevice device,
+                                                   uint32_t *numDomains);
+
+/**
+ * \brief Get the event domains for a device.
+ *
+ * Returns the event domain IDs in \p domainArray for a device.  The
+ * size of the \p domainArray buffer is given by \p
+ * *arraySizeBytes. The size of the \p domainArray buffer must be at
+ * least numDomains * sizeof(CUpti_EventDomainID), where numDomains is
+ * the number of domains for the device (see \ref
+ * cuptiDeviceGetNumEventDomains), or else not all domains will be
+ * returned. The value returned in \p *arraySizeBytes contains the
+ * number of bytes returned in \p domainArray.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param device The CUDA device
+ * \param arraySizeBytes The size of \p domainArray in bytes, and
+ * returns the number of bytes written to \p domainArray
+ * \param domainArray Returns the IDs of the event domains for the device
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
+ * \p domainArray are NULL
+ */
+CUptiResult CUPTIAPI cuptiDeviceEnumEventDomains(CUdevice device,
+                                                 size_t *arraySizeBytes,
+                                                 CUpti_EventDomainID *domainArray);
+
+/**
+ * \brief Read an event domain attribute.
+ *
+ * Returns an event domain attribute in \p *value. The size of the \p
+ * value buffer is given by \p *valueSize. The value returned in \p
+ * *valueSize contains the number of bytes returned in \p value.
+ *
+ * If the attribute value is a c-string that is longer than \p
+ * *valueSize, then only the first \p *valueSize characters will be
+ * returned and there will be no terminating null byte.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param device The CUDA device
+ * \param eventDomain ID of the event domain
+ * \param attrib The event domain attribute to read
+ * \param valueSize The size of the \p value buffer in bytes, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the attribute's value
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not an event domain attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiDeviceGetEventDomainAttribute(CUdevice device,
+                                                        CUpti_EventDomainID eventDomain,
+                                                        CUpti_EventDomainAttribute attrib,
+                                                        size_t *valueSize,
+                                                        void *value);
+
+/**
+ * \brief Get the number of event domains available on any device.
+ *
+ * Returns the total number of event domains available on any
+ * CUDA-capable device.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param numDomains Returns the number of domains
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numDomains is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetNumEventDomains(uint32_t *numDomains);
+
+/**
+ * \brief Get the event domains available on any device.
+ *
+ * Returns all the event domains available on any CUDA-capable device.
+ * Event domain IDs are returned in \p domainArray. The size of the \p
+ * domainArray buffer is given by \p *arraySizeBytes. The size of the
+ * \p domainArray buffer must be at least numDomains *
+ * sizeof(CUpti_EventDomainID), where numDomains is the total number
+ * of event domains (see \ref cuptiGetNumEventDomains), or else not
+ * all domains will be returned. The value returned in \p
+ * *arraySizeBytes contains the number of bytes returned in \p
+ * domainArray.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param arraySizeBytes The size of \p domainArray in bytes, and
+ * returns the number of bytes written to \p domainArray
+ * \param domainArray Returns all the event domains
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
+ * \p domainArray are NULL
+ */
+CUptiResult CUPTIAPI cuptiEnumEventDomains(size_t *arraySizeBytes,
+                                           CUpti_EventDomainID *domainArray);
+
+/**
+ * \brief Read an event domain attribute.
+ *
+ * Returns an event domain attribute in \p *value. The size of the \p
+ * value buffer is given by \p *valueSize. The value returned in \p
+ * *valueSize contains the number of bytes returned in \p value.
+ *
+ * If the attribute value is a c-string that is longer than \p
+ * *valueSize, then only the first \p *valueSize characters will be
+ * returned and there will be no terminating null byte.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventDomain ID of the event domain
+ * \param attrib The event domain attribute to read
+ * \param valueSize The size of the \p value buffer in bytes, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the attribute's value
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not an event domain attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiEventDomainGetAttribute(CUpti_EventDomainID eventDomain,
+                                                  CUpti_EventDomainAttribute attrib,
+                                                  size_t *valueSize,
+                                                  void *value);
+
+/**
+ * \brief Get number of events in a domain.
+ *
+ * Returns the number of events in \p numEvents for a domain.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventDomain ID of the event domain
+ * \param numEvents Returns the number of events in the domain
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numEvents is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventDomainGetNumEvents(CUpti_EventDomainID eventDomain,
+                                                  uint32_t *numEvents);
+
+/**
+ * \brief Get the events in a domain.
+ *
+ * Returns the event IDs in \p eventArray for a domain.  The size of
+ * the \p eventArray buffer is given by \p *arraySizeBytes. The size
+ * of the \p eventArray buffer must be at least numEvents *
+ * sizeof(CUpti_EventID), where numEvents is the number of events in
+ * the domain (see \ref cuptiEventDomainGetNumEvents), or else not all
+ * events will be returned. The value returned in \p *arraySizeBytes
+ * contains the number of bytes returned in \p eventArray.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventDomain ID of the event domain
+ * \param arraySizeBytes The size of \p eventArray in bytes, and
+ * returns the number of bytes written to \p eventArray
+ * \param eventArray Returns the IDs of the events in the domain
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or \p
+ * eventArray are NULL
+ */
+CUptiResult CUPTIAPI cuptiEventDomainEnumEvents(CUpti_EventDomainID eventDomain,
+                                                size_t *arraySizeBytes,
+                                                CUpti_EventID *eventArray);
+
+/**
+ * \brief Get an event attribute.
+ *
+ * Returns an event attribute in \p *value. The size of the \p
+ * value buffer is given by \p *valueSize. The value returned in \p
+ * *valueSize contains the number of bytes returned in \p value.
+ *
+ * If the attribute value is a c-string that is longer than \p
+ * *valueSize, then only the first \p *valueSize characters will be
+ * returned and there will be no terminating null byte.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param event ID of the event
+ * \param attrib The event attribute to read
+ * \param valueSize The size of the \p value buffer in bytes, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the attribute's value
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not an event attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiEventGetAttribute(CUpti_EventID event,
+                                            CUpti_EventAttribute attrib,
+                                            size_t *valueSize,
+                                            void *value);
+
+/**
+ * \brief Find an event by name.
+ *
+ * Find an event by name and return the event ID in \p *event.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param device The CUDA device
+ * \param eventName The name of the event to find
+ * \param event Returns the ID of the found event or undefined if
+ * unable to find the event
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_EVENT_NAME if unable to find an event
+ * with name \p eventName. In this case \p *event is undefined
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventName or \p event are NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGetIdFromName(CUdevice device,
+                                             const char *eventName,
+                                             CUpti_EventID *event);
+
+/**
+ * \brief Create a new event group for a context.
+ *
+ * Creates a new event group for \p context and returns the new group
+ * in \p *eventGroup.
+ * \note \p flags are reserved for future use and should be set to zero.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param context The context for the event group
+ * \param eventGroup Returns the new event group
+ * \param flags Reserved - must be zero
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
+ * \retval CUPTI_ERROR_OUT_OF_MEMORY
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupCreate(CUcontext context,
+                                           CUpti_EventGroup *eventGroup,
+                                           uint32_t flags);
+
+/**
+ * \brief Destroy an event group.
+ *
+ * Destroy an \p eventGroup and free its resources. An event group
+ * cannot be destroyed if it is enabled.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group to destroy
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_OPERATION if the event group is enabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupDestroy(CUpti_EventGroup eventGroup);
+
+/**
+ * \brief Read an event group attribute.
+ *
+ * Read an event group attribute and return it in \p *value.
+ * \note \b Thread-safety: this function is thread safe but client
+ * must guard against simultaneous destruction or modification of \p
+ * eventGroup (for example, client must guard against simultaneous
+ * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent,
+ * etc.), and must guard against simultaneous destruction of the
+ * context in which \p eventGroup was created (for example, client
+ * must guard against simultaneous calls to cudaDeviceReset,
+ * cuCtxDestroy, etc.).
+ *
+ * \param eventGroup The event group
+ * \param attrib The attribute to read
+ * \param valueSize Size of buffer pointed by the value, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the value of the attribute
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not an eventgroup attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiEventGroupGetAttribute(CUpti_EventGroup eventGroup,
+                                                 CUpti_EventGroupAttribute attrib,
+                                                 size_t *valueSize,
+                                                 void *value);
+
+/**
+ * \brief Write an event group attribute.
+ *
+ * Write an event group attribute.
+ * \note \b Thread-safety: this function is thread safe.
+ * \note (review) Unlike the attribute getters, \p valueSize is passed
+ * by value (size_t) here, so only \p value can actually be a NULL
+ * pointer; the "\p valueSize ... is NULL" condition below presumably
+ * refers to a size of zero -- confirm against the implementation.
+ *
+ * \param eventGroup The event group
+ * \param attrib The attribute to write
+ * \param valueSize The size, in bytes, of the value
+ * \param value The attribute value to write
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not an event group attribute, or if
+ * \p attrib is not a writable attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT Indicates that
+ * the \p value buffer is too small to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiEventGroupSetAttribute(CUpti_EventGroup eventGroup,
+                                                 CUpti_EventGroupAttribute attrib,
+                                                 size_t valueSize,
+                                                 void *value);
+
+/**
+ * \brief Add an event to an event group.
+ *
+ * Add an event to an event group. The event add can fail for a number of reasons:
+ * \li The event group is enabled
+ * \li The event does not belong to the same event domain as the
+ * events that are already in the event group
+ * \li Device limitations on the events that can belong to the same group
+ * \li The event group is full
+ *
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ * \param event The event to add to the group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
+ * \retval CUPTI_ERROR_OUT_OF_MEMORY
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is enabled
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if \p event belongs to a
+ * different event domain than the events already in \p eventGroup, or
+ * if a device limitation prevents \p event from being collected at
+ * the same time as the events already in \p eventGroup
+ * \retval CUPTI_ERROR_MAX_LIMIT_REACHED if \p eventGroup is full
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupAddEvent(CUpti_EventGroup eventGroup,
+                                             CUpti_EventID event);
+
+/**
+ * \brief Remove an event from an event group.
+ *
+ * Remove \p event from an event group. The event cannot be
+ * removed if the event group is enabled.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ * \param event The event to remove from the group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is enabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupRemoveEvent(CUpti_EventGroup eventGroup,
+                                                CUpti_EventID event);
+
+/**
+ * \brief Remove all events from an event group.
+ *
+ * Remove all events from an event group. Events cannot be removed if
+ * the event group is enabled.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is enabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupRemoveAllEvents(CUpti_EventGroup eventGroup);
+
+/**
+ * \brief Zero all the event counts in an event group.
+ *
+ * Zero all the event counts in an event group.
+ * \note \b Thread-safety: this function is thread safe but client
+ * must guard against simultaneous destruction or modification of \p
+ * eventGroup (for example, client must guard against simultaneous
+ * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent,
+ * etc.), and must guard against simultaneous destruction of the
+ * context in which \p eventGroup was created (for example, client
+ * must guard against simultaneous calls to cudaDeviceReset,
+ * cuCtxDestroy, etc.).
+ *
+ * \param eventGroup The event group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupResetAllEvents(CUpti_EventGroup eventGroup);
+
+/**
+ * \brief Enable an event group.
+ *
+ * Enable an event group. Enabling an event group zeros the value of
+ * all the events in the group and then starts collection of those
+ * events.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_NOT_READY if \p eventGroup does not contain any events
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if \p eventGroup cannot be
+ * enabled due to other already enabled event groups
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ * \retval CUPTI_ERROR_HARDWARE_BUSY if another client is profiling
+ * and hardware is busy
+ */
+CUptiResult CUPTIAPI cuptiEventGroupEnable(CUpti_EventGroup eventGroup);
+
+/**
+ * \brief Disable an event group.
+ *
+ * Disable an event group. Disabling an event group stops collection
+ * of events contained in the group.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupDisable(CUpti_EventGroup eventGroup);
+
+/**
+ * \brief Read the value for an event in an event group.
+ *
+ * Read the value for an event in an event group. The event value is
+ * returned in the \p eventValueBuffer buffer. \p
+ * eventValueBufferSizeBytes indicates the size of the \p
+ * eventValueBuffer buffer. The buffer must be at least sizeof(uint64)
+ * if ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is not set
+ * on the group containing the event.  The buffer must be at least
+ * (sizeof(uint64) * number of domain instances) if
+ * ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is set on the
+ * group.
+ *
+ * If any instance of an event counter overflows, the value returned
+ * for that event instance will be ::CUPTI_EVENT_OVERFLOW.
+ *
+ * The only allowed value for \p flags is ::CUPTI_EVENT_READ_FLAG_NONE.
+ *
+ * Reading an event from a disabled event group is not allowed. After
+ * being read, an event's value is reset to zero.
+ * \note \b Thread-safety: this function is thread safe but client
+ * must guard against simultaneous destruction or modification of \p
+ * eventGroup (for example, client must guard against simultaneous
+ * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent,
+ * etc.), and must guard against simultaneous destruction of the
+ * context in which \p eventGroup was created (for example, client
+ * must guard against simultaneous calls to cudaDeviceReset,
+ * cuCtxDestroy, etc.). If \ref cuptiEventGroupResetAllEvents is
+ * called simultaneously with this function, then returned event
+ * values are undefined.
+ *
+ * \param eventGroup The event group
+ * \param flags Flags controlling the reading mode
+ * \param event The event to read
+ * \param eventValueBufferSizeBytes The size of \p eventValueBuffer
+ * in bytes, and returns the number of bytes written to \p
+ * eventValueBuffer
+ * \param eventValueBuffer Returns the event value(s)
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is disabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup, \p
+ * eventValueBufferSizeBytes or \p eventValueBuffer is NULL
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if size of \p eventValueBuffer
+ * is not sufficient
+ */
+CUptiResult CUPTIAPI cuptiEventGroupReadEvent(CUpti_EventGroup eventGroup,
+                                              CUpti_ReadEventFlags flags,
+                                              CUpti_EventID event,
+                                              size_t *eventValueBufferSizeBytes,
+                                              uint64_t *eventValueBuffer);
+
+/**
+ * \brief Read the values for all the events in an event group.
+ *
+ * Read the values for all the events in an event group. The event
+ * values are returned in the \p eventValueBuffer buffer. \p
+ * eventValueBufferSizeBytes indicates the size of \p
+ * eventValueBuffer.  The buffer must be at least (sizeof(uint64) *
+ * number of events in group) if
+ * ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is not set on
+ * the group containing the events.  The buffer must be at least
+ * (sizeof(uint64) * number of domain instances * number of events in
+ * group) if ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is
+ * set on the group.
+ *
+ * The data format returned in \p eventValueBuffer is:
+ *    - domain instance 0: event0 event1 ... eventN
+ *    - domain instance 1: event0 event1 ... eventN
+ *    - ...
+ *    - domain instance M: event0 event1 ... eventN
+ *
+ * The event order in \p eventValueBuffer is returned in \p
+ * eventIdArray. The size of \p eventIdArray is specified in \p
+ * eventIdArraySizeBytes. The size should be at least
+ * (sizeof(CUpti_EventID) * number of events in group).
+ *
+ * If any instance of any event counter overflows, the value returned
+ * for that event instance will be ::CUPTI_EVENT_OVERFLOW.
+ *
+ * The only allowed value for \p flags is ::CUPTI_EVENT_READ_FLAG_NONE.
+ *
+ * Reading events from a disabled event group is not allowed. After
+ * being read, an event's value is reset to zero.
+ * \note \b Thread-safety: this function is thread safe but client
+ * must guard against simultaneous destruction or modification of \p
+ * eventGroup (for example, client must guard against simultaneous
+ * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent,
+ * etc.), and must guard against simultaneous destruction of the
+ * context in which \p eventGroup was created (for example, client
+ * must guard against simultaneous calls to cudaDeviceReset,
+ * cuCtxDestroy, etc.). If \ref cuptiEventGroupResetAllEvents is
+ * called simultaneously with this function, then returned event
+ * values are undefined.
+ *
+ * \param eventGroup The event group
+ * \param flags Flags controlling the reading mode
+ * \param eventValueBufferSizeBytes The size of \p eventValueBuffer in
+ * bytes, and returns the number of bytes written to \p
+ * eventValueBuffer
+ * \param eventValueBuffer Returns the event values
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes,
+ * and returns the number of bytes written to \p eventIdArray
+ * \param eventIdArray Returns the IDs of the events in the same order
+ * as the values returned in \p eventValueBuffer.
+ * \param numEventIdsRead Returns the number of event IDs returned
+ * in \p eventIdArray
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is disabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup, \p
+ * eventValueBufferSizeBytes, \p eventValueBuffer, \p
+ * eventIdArraySizeBytes, \p eventIdArray or \p numEventIdsRead is
+ * NULL
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if size of \p eventValueBuffer
+ * or \p eventIdArray is not sufficient
+ */
+CUptiResult CUPTIAPI cuptiEventGroupReadAllEvents(CUpti_EventGroup       eventGroup,
+                                                  CUpti_ReadEventFlags   flags,
+                                                  size_t                 *eventValueBufferSizeBytes,
+                                                  uint64_t               *eventValueBuffer,
+                                                  size_t                 *eventIdArraySizeBytes,
+                                                  CUpti_EventID          *eventIdArray,
+                                                  size_t                 *numEventIdsRead);
+
+/**
+ * \brief For a set of events, get the grouping that indicates the
+ * number of passes and the event groups necessary to collect the
+ * events.
+ *
+ * The number of events that can be collected simultaneously varies by
+ * device and by the type of the events. When events can be collected
+ * simultaneously, they may need to be grouped into multiple event
+ * groups because they are from different event domains. This function
+ * takes a set of events and determines how many passes are required
+ * to collect all those events, and which events can be collected
+ * simultaneously in each pass.
+ *
+ * The CUpti_EventGroupSets returned in \p eventGroupPasses indicates
+ * how many passes are required to collect the events with the \p
+ * numSets field. Within each event group set, the \p sets array
+ * indicates the event groups that should be collected on each pass.
+ * \note \b Thread-safety: this function is thread safe, but client
+ * must guard against another thread simultaneously destroying \p
+ * context.
+ *
+ * \param context The context for event collection
+ * \param eventIdArraySizeBytes Size of \p eventIdArray in bytes
+ * \param eventIdArray Array of event IDs that need to be grouped
+ * \param eventGroupPasses Returns a CUpti_EventGroupSets object that
+ * indicates the number of passes required to collect the events and
+ * the events to collect on each pass
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventIdArray or
+ * \p eventGroupPasses is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupSetsCreate(CUcontext context,
+                                               size_t eventIdArraySizeBytes,
+                                               CUpti_EventID *eventIdArray,
+                                               CUpti_EventGroupSets **eventGroupPasses);
+
+/**
+ * \brief Destroy an event group sets object.
+ *
+ * Destroy a CUpti_EventGroupSets object.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroupSets The object to destroy
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_OPERATION if any of the event groups
+ * contained in the sets is enabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroupSets is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupSetsDestroy(CUpti_EventGroupSets *eventGroupSets);
+
+
+/**
+ * \brief Enable an event group set.
+ *
+ * Enable a set of event groups. Enabling a set of event groups zeros the value of
+ * all the events in all the groups and then starts collection of those events.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroupSet The pointer to the event group set
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_NOT_READY if an event group in \p eventGroupSet has no events
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if \p eventGroupSet cannot be
+ * enabled due to other already enabled event groups
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroupSet is NULL
+ * \retval CUPTI_ERROR_HARDWARE_BUSY if another client is profiling and hardware is
+ * busy
+ */
+CUptiResult CUPTIAPI cuptiEventGroupSetEnable(CUpti_EventGroupSet *eventGroupSet);
+
+/**
+ * \brief Disable an event group set.
+ *
+ * Disable a set of event groups. Disabling a set of event groups
+ * stops collection of events contained in the groups.
+ * \note \b Thread-safety: this function is thread safe.
+ * \note If this call fails, some of the event groups in the set may be disabled
+ * and other event groups may remain enabled.
+ *
+ * \param eventGroupSet The pointer to the event group set
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroupSet is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupSetDisable(CUpti_EventGroupSet *eventGroupSet);
+
+/**
+ * \brief Enable kernel replay mode.
+ *
+ * Set profiling mode for the context to replay mode. In this mode,
+ * any number of events can be collected in one run of the kernel. The
+ * event collection mode will automatically switch to
+ * CUPTI_EVENT_COLLECTION_MODE_KERNEL.  In this mode, \ref
+ * cuptiSetEventCollectionMode will return
+ * CUPTI_ERROR_INVALID_OPERATION.
+ * \note Kernels might take longer to run if many events are enabled.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param context The context
+ * \retval CUPTI_SUCCESS
+ */
+CUptiResult CUPTIAPI cuptiEnableKernelReplayMode(CUcontext context);
+
+/**
+ * \brief Disable kernel replay mode.
+ *
+ * Set profiling mode for the context to non-replay (default)
+ * mode. Event collection mode will be set to
+ * CUPTI_EVENT_COLLECTION_MODE_KERNEL.  All previously enabled
+ * event groups and event group sets will be disabled.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param context The context
+ * \retval CUPTI_SUCCESS
+ */
+CUptiResult CUPTIAPI cuptiDisableKernelReplayMode(CUcontext context);
+
+/**
+ * \brief Function type for getting updates on kernel replay.
+ *
+ * \param kernelName The mangled kernel name
+ * \param numReplaysDone Number of replays done so far
+ * \param customData Pointer to any custom data passed in when subscribing
+ */
+typedef void (CUPTIAPI *CUpti_KernelReplayUpdateFunc)(
+    const char *kernelName,
+    int numReplaysDone,
+    void *customData);
+
+/**
+ * \brief Subscribe to kernel replay updates.
+ *
+ * When subscribed, the function pointer passed in will be called each time a
+ * kernel run is finished during kernel replay. Any previously subscribed
+ * function pointer will be replaced. Passing NULL as the function pointer
+ * unsubscribes from the updates.
+ *
+ * \param updateFunc The update function pointer
+ * \param customData Pointer to any custom data
+ * \retval CUPTI_SUCCESS
+ */
+CUptiResult CUPTIAPI cuptiKernelReplaySubscribeUpdate(CUpti_KernelReplayUpdateFunc updateFunc, void *customData);
+
+/** @} */ /* END CUPTI_EVENT_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_EVENTS_H_*/
+
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_metrics.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_metrics.h
new file mode 100644
index 0000000000000000000000000000000000000000..64b7f2d14580320f1ec938da5ea356add191ec3c
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_metrics.h
@@ -0,0 +1,824 @@
+/*
+ * Copyright 2011-2024   NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_METRIC_H_)
+#define _CUPTI_METRIC_H_
+
+#include <cuda.h>
+#include <string.h>
+#include <cuda_stdint.h>
+#include <cupti_result.h>
+
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_METRIC_API CUPTI Metric API
+ * Functions, types, and enums that implement the CUPTI Metric API.
+ *
+ * \note The CUPTI metric API from the header cupti_metrics.h is not supported on devices
+ * with compute capability 7.5 and higher (i.e. Turing and later GPU architectures).
+ * This API is deprecated in CUDA 12.8 release and will be removed in a future CUDA release.
+ * This is replaced by the host profiling API in the header cupti_profiler_host.h and
+ * target profiling API in the header cupti_range_profiler.h which are supported on
+ * devices with compute capability 7.0 and higher (i.e. Volta and later GPU architectures).
+ *
+ * @{
+ */
+
+/**
+ * \brief ID for a metric.
+ *
+ * A metric provides a measure of some aspect of the device.
+ */
+typedef uint32_t CUpti_MetricID;
+
+/**
+ * \brief A metric category.
+ *
+ * Each metric is assigned to a category that represents the general
+ * type of the metric. A metric's category is accessed using \ref
+ * cuptiMetricGetAttribute and the CUPTI_METRIC_ATTR_CATEGORY
+ * attribute.
+ */
+typedef enum {
+  /**
+   * A memory related metric.
+   */
+  CUPTI_METRIC_CATEGORY_MEMORY          = 0,
+  /**
+   * An instruction related metric.
+   */
+  CUPTI_METRIC_CATEGORY_INSTRUCTION     = 1,
+  /**
+   * A multiprocessor related metric.
+   */
+  CUPTI_METRIC_CATEGORY_MULTIPROCESSOR  = 2,
+  /**
+   * A cache related metric.
+   */
+  CUPTI_METRIC_CATEGORY_CACHE           = 3,
+  /**
+   * A texture related metric.
+   */
+  CUPTI_METRIC_CATEGORY_TEXTURE         = 4,
+  /**
+   * An NVLink related metric.
+   */
+  CUPTI_METRIC_CATEGORY_NVLINK          = 5,
+  /**
+   * A PCIe related metric.
+   */
+  CUPTI_METRIC_CATEGORY_PCIE           = 6,
+  CUPTI_METRIC_CATEGORY_FORCE_INT                         = 0x7fffffff,
+} CUpti_MetricCategory;
+
+/**
+ * \brief A metric evaluation mode.
+ *
+ * A metric can be evaluated per hardware instance to know the load balancing
+ * across instances of a domain or the metric can be evaluated in aggregate mode
+ * when the events involved in metric evaluation are from different event
+ * domains. It might be possible to evaluate some metrics in both
+ * modes for convenience. A metric's evaluation mode is accessed using \ref
+ * cuptiMetricGetAttribute and the CUPTI_METRIC_ATTR_EVALUATION_MODE
+ * attribute.
+ */
+typedef enum {
+  /**
+   * If this bit is set, the metric can be profiled for each instance of the
+   * domain. The event values passed to \ref cuptiMetricGetValue can contain
+   * values for one instance of the domain. And \ref cuptiMetricGetValue can
+   * be called for each instance.
+   */
+  CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE         = 1,
+  /**
+   * If this bit is set, the metric can be profiled over all instances. The
+   * event values passed to \ref cuptiMetricGetValue can be aggregated values
+   * of events for all instances of the domain.
+   */
+  CUPTI_METRIC_EVALUATION_MODE_AGGREGATE            = 1 << 1,
+  CUPTI_METRIC_EVALUATION_MODE_FORCE_INT            = 0x7fffffff,
+} CUpti_MetricEvaluationMode;
+
+/**
+ * \brief Kinds of metric values.
+ *
+ * Metric values can be one of several different kinds. Corresponding
+ * to each kind is a member of the CUpti_MetricValue union. The metric
+ * value returned by \ref cuptiMetricGetValue should be accessed using
+ * the appropriate member of that union based on its value kind.
+ */
+typedef enum {
+  /**
+   * The metric value is a 64-bit double.
+   */
+  CUPTI_METRIC_VALUE_KIND_DOUBLE            = 0,
+  /**
+   * The metric value is a 64-bit unsigned integer.
+   */
+  CUPTI_METRIC_VALUE_KIND_UINT64            = 1,
+  /**
+   * The metric value is a percentage represented by a 64-bit
+   * double. For example, 57.5% is represented by the value 57.5.
+   */
+  CUPTI_METRIC_VALUE_KIND_PERCENT           = 2,
+  /**
+   * The metric value is a throughput represented by a 64-bit
+   * unsigned integer. The unit for throughput values is bytes/second.
+   */
+  CUPTI_METRIC_VALUE_KIND_THROUGHPUT        = 3,
+  /**
+   * The metric value is a 64-bit signed integer.
+   */
+  CUPTI_METRIC_VALUE_KIND_INT64             = 4,
+  /**
+   * The metric value is a utilization level, as represented by
+   * CUpti_MetricValueUtilizationLevel.
+   */
+  CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL = 5,
+
+  CUPTI_METRIC_VALUE_KIND_FORCE_INT  = 0x7fffffff
+} CUpti_MetricValueKind;
+
+/**
+ * \brief Enumeration of utilization levels for metrics values of kind
+ * CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL. Utilization values can
+ * vary from IDLE (0) to MAX (10) but the enumeration only provides
+ * specific names for a few values.
+ */
+typedef enum {
+  CUPTI_METRIC_VALUE_UTILIZATION_IDLE      = 0,
+  CUPTI_METRIC_VALUE_UTILIZATION_LOW       = 2,
+  CUPTI_METRIC_VALUE_UTILIZATION_MID       = 5,
+  CUPTI_METRIC_VALUE_UTILIZATION_HIGH      = 8,
+  CUPTI_METRIC_VALUE_UTILIZATION_MAX       = 10,
+  CUPTI_METRIC_VALUE_UTILIZATION_FORCE_INT = 0x7fffffff
+} CUpti_MetricValueUtilizationLevel;
+
+/**
+ * \brief Metric attributes.
+ *
+ * Metric attributes describe properties of a metric. These attributes
+ * can be read using \ref cuptiMetricGetAttribute.
+ */
+typedef enum {
+  /**
+   * Metric name. Value is a null terminated const c-string.
+   */
+  CUPTI_METRIC_ATTR_NAME              = 0,
+  /**
+   * Short description of metric. Value is a null terminated const c-string.
+   */
+  CUPTI_METRIC_ATTR_SHORT_DESCRIPTION = 1,
+  /**
+   * Long description of metric. Value is a null terminated const c-string.
+   */
+  CUPTI_METRIC_ATTR_LONG_DESCRIPTION  = 2,
+  /**
+   * Category of the metric. Value is of type CUpti_MetricCategory.
+   */
+  CUPTI_METRIC_ATTR_CATEGORY          = 3,
+  /**
+   * Value type of the metric. Value is of type CUpti_MetricValueKind.
+   */
+  CUPTI_METRIC_ATTR_VALUE_KIND          = 4,
+  /**
+   * Metric evaluation mode. Value is of type CUpti_MetricEvaluationMode.
+   */
+  CUPTI_METRIC_ATTR_EVALUATION_MODE     = 5,
+  CUPTI_METRIC_ATTR_FORCE_INT         = 0x7fffffff,
+} CUpti_MetricAttribute;
+
+/**
+ * \brief A metric value.
+ *
+ * Metric values can be one of several different kinds. Corresponding
+ * to each kind is a member of the CUpti_MetricValue union. The metric
+ * value returned by \ref cuptiMetricGetValue should be accessed using
+ * the appropriate member of that union based on its value kind.
+ */
+typedef union {
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_DOUBLE.
+   */
+  double metricValueDouble;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_UINT64.
+   */
+  uint64_t metricValueUint64;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_INT64.
+   */
+  int64_t metricValueInt64;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_PERCENT. For example, 57.5% is
+   * represented by the value 57.5.
+   */
+  double metricValuePercent;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_THROUGHPUT.  The unit for
+   * throughput values is bytes/second.
+   */
+  uint64_t metricValueThroughput;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL.
+   */
+  CUpti_MetricValueUtilizationLevel metricValueUtilizationLevel;
+} CUpti_MetricValue;
+
+/**
+ * \brief Device class.
+ *
+ * Enumeration of device classes for metric property
+ * CUPTI_METRIC_PROPERTY_DEVICE_CLASS.
+ */
+typedef enum {
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS_TESLA          = 0,
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS_QUADRO         = 1,
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS_GEFORCE        = 2,
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS_TEGRA          = 3,
+} CUpti_MetricPropertyDeviceClass;
+
+/**
+ * \brief Metric device properties.
+ *
+ * Metric device properties describe device properties which are needed for a metric.
+ * Some of these properties can be collected using cuDeviceGetAttribute.
+ */
+typedef enum {
+  /*
+   * Number of multiprocessors on a device.  This can be collected
+   * using value of \param CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT of
+   * cuDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_MULTIPROCESSOR_COUNT,
+  /*
+   * Maximum number of warps on a multiprocessor. This can be
+   * collected using ratio of value of \param
+   * CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR and \param
+   * CU_DEVICE_ATTRIBUTE_WARP_SIZE of cuDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_WARPS_PER_MULTIPROCESSOR,
+  /*
+   * GPU Time for kernel in ns. This should be profiled using CUPTI
+   * Activity API.
+   */
+  CUPTI_METRIC_PROPERTY_KERNEL_GPU_TIME,
+  /*
+   * Clock rate for device in KHz.  This should be collected using
+   * value of \param CU_DEVICE_ATTRIBUTE_CLOCK_RATE of
+   * cuDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_CLOCK_RATE,
+  /*
+   * Number of Frame buffer units for device. This should be collected
+   * using value of \param CUPTI_DEVICE_ATTR_MAX_FRAME_BUFFERS of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_FRAME_BUFFER_COUNT,
+  /*
+   * Global memory bandwidth in KBytes/sec. This should be collected
+   * using value of \param CUPTI_DEVICE_ATTR_GLOBAL_MEMORY_BANDWIDTH
+   * of cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_GLOBAL_MEMORY_BANDWIDTH,
+  /*
+   * PCIE link rate in Mega bits/sec. This should be collected using
+   * value of \param CUPTI_DEVICE_ATTR_PCIE_LINK_RATE of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_PCIE_LINK_RATE,
+  /*
+   * PCIE link width for device. This should be collected using
+   * value of \param CUPTI_DEVICE_ATTR_PCIE_LINK_WIDTH of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_PCIE_LINK_WIDTH,
+  /*
+   * PCIE generation for device. This should be collected using
+   * value of \param CUPTI_DEVICE_ATTR_PCIE_GEN of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_PCIE_GEN,
+  /*
+   * The device class. This should be collected using
+   * value of \param CUPTI_DEVICE_ATTR_DEVICE_CLASS of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS,
+  /*
+   * Peak single precision floating point operations that
+   * can be performed in one cycle by the device.
+   * This should be collected using value of
+   * \param CUPTI_DEVICE_ATTR_FLOP_SP_PER_CYCLE of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_FLOP_SP_PER_CYCLE,
+  /*
+   * Peak double precision floating point operations that
+   * can be performed in one cycle by the device.
+   * This should be collected using value of
+   * \param CUPTI_DEVICE_ATTR_FLOP_DP_PER_CYCLE of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_FLOP_DP_PER_CYCLE,
+  /*
+   * Number of L2 units on a device. This can be collected
+   * using value of \param CUPTI_DEVICE_ATTR_MAX_L2_UNITS of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_L2_UNITS,
+  /*
+   * Whether ECC support is enabled on the device. This can be
+   * collected using value of \param CU_DEVICE_ATTRIBUTE_ECC_ENABLED of
+   * cuDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_ECC_ENABLED,
+  /*
+   * Peak half precision floating point operations that
+   * can be performed in one cycle by the device.
+   * This should be collected using value of
+   * \param CUPTI_DEVICE_ATTR_FLOP_HP_PER_CYCLE of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_FLOP_HP_PER_CYCLE,
+  /*
+   * NVLINK Bandwidth for device. This should be collected
+   * using value of \param CUPTI_DEVICE_ATTR_GPU_CPU_NVLINK_BW of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_GPU_CPU_NVLINK_BANDWIDTH,
+} CUpti_MetricPropertyID;
+
+/**
+ * \brief Get the total number of metrics available on any device.
+ *
+ * Returns the total number of metrics available on any CUDA-capable
+ * devices.
+ *
+ * \param numMetrics Returns the number of metrics
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numMetrics is NULL
+*/
+CUptiResult CUPTIAPI cuptiGetNumMetrics(uint32_t *numMetrics);
+
+/**
+ * \brief Get all the metrics available on any device.
+ *
+ * Returns the metric IDs in \p metricArray for all CUDA-capable
+ * devices.  The size of the \p metricArray buffer is given by \p
+ * *arraySizeBytes. The size of the \p metricArray buffer must be at
+ * least \p numMetrics * sizeof(CUpti_MetricID) or not all metric IDs
+ * will be returned. The value returned in \p *arraySizeBytes contains
+ * the number of bytes returned in \p metricArray.
+ *
+ * \param arraySizeBytes The size of \p metricArray in bytes, and
+ * returns the number of bytes written to \p metricArray
+ * \param metricArray Returns the IDs of the metrics
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
+ * \p metricArray are NULL
+*/
+CUptiResult CUPTIAPI cuptiEnumMetrics(size_t *arraySizeBytes,
+                                      CUpti_MetricID *metricArray);
+
+/**
+ * \brief Get the number of metrics for a device.
+ *
+ * Returns the number of metrics available for a device.
+ *
+ * \param device The CUDA device
+ * \param numMetrics Returns the number of metrics available for the
+ * device
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numMetrics is NULL
+ */
+CUptiResult CUPTIAPI cuptiDeviceGetNumMetrics(CUdevice device,
+                                              uint32_t *numMetrics);
+
+/**
+ * \brief Get the metrics for a device.
+ *
+ * Returns the metric IDs in \p metricArray for a device.  The size of
+ * the \p metricArray buffer is given by \p *arraySizeBytes. The size
+ * of the \p metricArray buffer must be at least \p numMetrics *
+ * sizeof(CUpti_MetricID) or else not all metric IDs will be
+ * returned. The value returned in \p *arraySizeBytes contains the
+ * number of bytes returned in \p metricArray.
+ *
+ * \param device The CUDA device
+ * \param arraySizeBytes The size of \p metricArray in bytes, and
+ * returns the number of bytes written to \p metricArray
+ * \param metricArray Returns the IDs of the metrics for the device
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
+ * \p metricArray are NULL
+ */
+CUptiResult CUPTIAPI cuptiDeviceEnumMetrics(CUdevice device,
+                                            size_t *arraySizeBytes,
+                                            CUpti_MetricID *metricArray);
+
+/**
+ * \brief Get a metric attribute.
+ *
+ * Returns a metric attribute in \p *value. The size of the \p
+ * value buffer is given by \p *valueSize. The value returned in \p
+ * *valueSize contains the number of bytes returned in \p value.
+ *
+ * If the attribute value is a C string that is longer than \p
+ * *valueSize, then only the first \p *valueSize characters will be
+ * returned and there will be no terminating null byte.
+ *
+ * \param metric ID of the metric
+ * \param attrib The metric attribute to read
+ * \param valueSize The size of the \p value buffer in bytes, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the attribute's value
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not a metric attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT for non-C-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiMetricGetAttribute(CUpti_MetricID metric,
+                                             CUpti_MetricAttribute attrib,
+                                             size_t *valueSize,
+                                             void *value);
+
+/**
+ * \brief Find a metric by name.
+ *
+ * Find a metric by name and return the metric ID in \p *metric.
+ *
+ * \param device The CUDA device
+ * \param metricName The name of the metric to find
+ * \param metric Returns the ID of the found metric or undefined if
+ * unable to find the metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_METRIC_NAME if unable to find a metric
+ * with name \p metricName. In this case \p *metric is undefined
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricName or \p
+ * metric is NULL.
+ */
+CUptiResult CUPTIAPI cuptiMetricGetIdFromName(CUdevice device,
+                                              const char *metricName,
+                                              CUpti_MetricID *metric);
+
+/**
+ * \brief Get the number of events required to calculate a metric.
+ *
+ * Returns the number of events in \p *numEvents that are required to
+ * calculate a metric.
+ *
+ * \param metric ID of the metric
+ * \param numEvents Returns the number of events required for the metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numEvents is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricGetNumEvents(CUpti_MetricID metric,
+                                             uint32_t *numEvents);
+
+/**
+ * \brief Get the events required to calculate a metric.
+ *
+ * Gets the event IDs in \p eventIdArray required to calculate a \p
+ * metric. The size of the \p eventIdArray buffer is given by \p
+ * *eventIdArraySizeBytes and must be at least \p numEvents *
+ * sizeof(CUpti_EventID) or not all events will be returned. The value
+ * returned in \p *eventIdArraySizeBytes contains the number of bytes
+ * returned in \p eventIdArray.
+ *
+ * \param metric ID of the metric
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes,
+ * and returns the number of bytes written to \p eventIdArray
+ * \param eventIdArray Returns the IDs of the events required to
+ * calculate \p metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventIdArraySizeBytes or \p
+ * eventIdArray is NULL.
+ */
+CUptiResult CUPTIAPI cuptiMetricEnumEvents(CUpti_MetricID metric,
+                                           size_t *eventIdArraySizeBytes,
+                                           CUpti_EventID *eventIdArray);
+
+/**
+ * \brief Get the number of properties required to calculate a metric.
+ *
+ * Returns the number of properties in \p *numProp that are required to
+ * calculate a metric.
+ *
+ * \param metric ID of the metric
+ * \param numProp Returns the number of properties required for the
+ * metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numProp is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricGetNumProperties(CUpti_MetricID metric,
+                                                 uint32_t *numProp);
+
+/**
+ * \brief Get the properties required to calculate a metric.
+ *
+ * Gets the property IDs in \p propIdArray required to calculate a \p
+ * metric. The size of the \p propIdArray buffer is given by \p
+ * *propIdArraySizeBytes and must be at least \p numProp *
+ * sizeof(CUpti_MetricPropertyID) or not all properties will be
+ * returned. The value returned in \p *propIdArraySizeBytes contains
+ * the number of bytes returned in \p propIdArray.
+ *
+ * \param metric ID of the metric
+ * \param propIdArraySizeBytes The size of \p propIdArray in bytes,
+ * and returns the number of bytes written to \p propIdArray
+ * \param propIdArray Returns the IDs of the properties required to
+ * calculate \p metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p propIdArraySizeBytes or \p
+ * propIdArray is NULL.
+ */
+CUptiResult CUPTIAPI cuptiMetricEnumProperties(CUpti_MetricID metric,
+                                               size_t *propIdArraySizeBytes,
+                                               CUpti_MetricPropertyID *propIdArray);
+
+
+/**
+ * \brief For a metric get the groups of events that must be collected
+ * in the same pass.
+ *
+ * For a metric get the groups of events that must be collected in the
+ * same pass to ensure that the metric is calculated correctly. If the
+ * events are not collected as specified then the metric value may be
+ * inaccurate.
+ *
+ * The function returns NULL in \p *eventGroupSets if a metric does
+ * not have any required event group. In this case the events needed
+ * for the metric can be grouped in any manner for collection.
+ *
+ * \param context The context for event collection
+ * \param metric The metric ID
+ * \param eventGroupSets Returns a CUpti_EventGroupSets object that
+ * indicates the events that must be collected in the same pass to
+ * ensure the metric is calculated correctly.  Returns NULL if no
+ * grouping is required for the metric
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ */
+CUptiResult CUPTIAPI cuptiMetricGetRequiredEventGroupSets(CUcontext context,
+                                                          CUpti_MetricID metric,
+                                                          CUpti_EventGroupSets **eventGroupSets);
+
+/**
+ * \brief For a set of metrics, get the grouping that indicates the
+ * number of passes and the event groups necessary to collect the
+ * events required for those metrics.
+ *
+ * For a set of metrics, get the grouping that indicates the number of
+ * passes and the event groups necessary to collect the events
+ * required for those metrics.
+ *
+ * \see cuptiEventGroupSetsCreate for details on event group set
+ * creation.
+ *
+ * \param context The context for event collection
+ * \param metricIdArraySizeBytes Size of \p metricIdArray in bytes
+ * \param metricIdArray Array of metric IDs
+ * \param eventGroupPasses Returns a CUpti_EventGroupSets object that
+ * indicates the number of passes required to collect the events and
+ * the events to collect on each pass
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricIdArray or
+ * \p eventGroupPasses is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricCreateEventGroupSets(CUcontext context,
+                                                     size_t metricIdArraySizeBytes,
+                                                     CUpti_MetricID *metricIdArray,
+                                                     CUpti_EventGroupSets **eventGroupPasses);
+
+/**
+ * \brief Calculate the value for a metric.
+ *
+ * Use the events collected for a metric to calculate the metric
+ * value. Metric value evaluation depends on the evaluation mode
+ * \ref CUpti_MetricEvaluationMode that the metric supports.
+ * If a metric has evaluation mode as CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE,
+ * then it assumes that the input event value is for one domain instance.
+ * If a metric has evaluation mode as CUPTI_METRIC_EVALUATION_MODE_AGGREGATE,
+ * it assumes that input event values are
+ * normalized to represent all domain instances on a device. For the
+ * most accurate metric collection, the events required for the metric
+ * should be collected for all profiled domain instances. For example,
+ * to collect all instances of an event, set the
+ * CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES attribute on
+ * the group containing the event to 1. The normalized value for the
+ * event is then: (\p sum_event_values * \p totalInstanceCount) / \p
+ * instanceCount, where \p sum_event_values is the summation of the
+ * event values across all profiled domain instances, \p
+ * totalInstanceCount is obtained from querying
+ * CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT and \p instanceCount
+ * is obtained from querying CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT (or
+ * CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT).
+ *
+ * \param device The CUDA device that the metric is being calculated for
+ * \param metric The metric ID
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes
+ * \param eventIdArray The event IDs required to calculate \p metric
+ * \param eventValueArraySizeBytes The size of \p eventValueArray in bytes
+ * \param eventValueArray The normalized event values required to
+ * calculate \p metric. The values must be ordered to match the order of
+ * events in \p eventIdArray
+ * \param timeDuration The duration over which the events were
+ * collected, in ns
+ * \param metricValue Returns the value for the metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_OPERATION
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if the
+ * \p eventIdArray does not contain all the events needed for metric
+ * \retval CUPTI_ERROR_INVALID_EVENT_VALUE if any of the
+ * event values required for the metric is CUPTI_EVENT_OVERFLOW
+ * \retval CUPTI_ERROR_INVALID_METRIC_VALUE if the computed metric value
+ * cannot be represented in the metric's value type. For example,
+ * if the metric value type is unsigned and the computed metric value is negative
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricValue,
+ * \p eventIdArray or \p eventValueArray is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricGetValue(CUdevice device,
+                                         CUpti_MetricID metric,
+                                         size_t eventIdArraySizeBytes,
+                                         CUpti_EventID *eventIdArray,
+                                         size_t eventValueArraySizeBytes,
+                                         uint64_t *eventValueArray,
+                                         uint64_t timeDuration,
+                                         CUpti_MetricValue *metricValue);
+
+/**
+ * \brief Calculate the value for a metric using events and properties.
+ *
+ * Use the events and properties collected for a metric to calculate
+ * the metric value. Metric value evaluation depends on the evaluation
+ * mode \ref CUpti_MetricEvaluationMode that the metric supports.  If
+ * a metric has evaluation mode as
+ * CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE, then it assumes that the
+ * input event value is for one domain instance.  If a metric has
+ * evaluation mode as CUPTI_METRIC_EVALUATION_MODE_AGGREGATE, it
+ * assumes that input event values are normalized to represent all
+ * domain instances on a device. For the most accurate metric
+ * collection, the events required for the metric should be collected
+ * for all profiled domain instances. For example, to collect all
+ * instances of an event, set the
+ * CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES attribute on
+ * the group containing the event to 1. The normalized value for the
+ * event is then: (\p sum_event_values * \p totalInstanceCount) / \p
+ * instanceCount, where \p sum_event_values is the summation of the
+ * event values across all profiled domain instances, \p
+ * totalInstanceCount is obtained from querying
+ * CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT and \p instanceCount
+ * is obtained from querying CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT (or
+ * CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT).
+ *
+ * \param metric The metric ID
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes
+ * \param eventIdArray The event IDs required to calculate \p metric
+ * \param eventValueArraySizeBytes The size of \p eventValueArray in bytes
+ * \param eventValueArray The normalized event values required to
+ * calculate \p metric. The values must be ordered to match the order of
+ * events in \p eventIdArray
+ * \param propIdArraySizeBytes The size of \p propIdArray in bytes
+ * \param propIdArray The metric property IDs required to calculate \p metric
+ * \param propValueArraySizeBytes The size of \p propValueArray in bytes
+ * \param propValueArray The metric property values required to
+ * calculate \p metric. The values must be ordered to match the order of
+ * metric properties in \p propIdArray
+ * \param metricValue Returns the value for the metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_OPERATION
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if the
+ * \p eventIdArray does not contain all the events needed for metric
+ * \retval CUPTI_ERROR_INVALID_EVENT_VALUE if any of the
+ * event values required for the metric is CUPTI_EVENT_OVERFLOW
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if the computed metric value
+ * cannot be represented in the metric's value type. For example,
+ * if the metric value type is unsigned and the computed metric value is negative
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricValue,
+ * \p eventIdArray or \p eventValueArray is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricGetValue2(CUpti_MetricID metric,
+                                          size_t eventIdArraySizeBytes,
+                                          CUpti_EventID *eventIdArray,
+                                          size_t eventValueArraySizeBytes,
+                                          uint64_t *eventValueArray,
+                                          size_t propIdArraySizeBytes,
+                                          CUpti_MetricPropertyID *propIdArray,
+                                          size_t propValueArraySizeBytes,
+                                          uint64_t *propValueArray,
+                                          CUpti_MetricValue *metricValue);
+
+/** @} */ /* END CUPTI_METRIC_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_METRIC_H_*/
+
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_nvtx_cbid.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_nvtx_cbid.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ad8c85e6e674b9a016580be88d3c5a2d2619990
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_nvtx_cbid.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2013-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+typedef enum {  /* One ID per NVTX API function, per the enumerator names */
+  CUPTI_CBID_NVTX_INVALID                               = 0,
+  CUPTI_CBID_NVTX_nvtxMarkA                             = 1,
+  CUPTI_CBID_NVTX_nvtxMarkW                             = 2,
+  CUPTI_CBID_NVTX_nvtxMarkEx                            = 3,
+  CUPTI_CBID_NVTX_nvtxRangeStartA                       = 4,
+  CUPTI_CBID_NVTX_nvtxRangeStartW                       = 5,
+  CUPTI_CBID_NVTX_nvtxRangeStartEx                      = 6,
+  CUPTI_CBID_NVTX_nvtxRangeEnd                          = 7,
+  CUPTI_CBID_NVTX_nvtxRangePushA                        = 8,
+  CUPTI_CBID_NVTX_nvtxRangePushW                        = 9,
+  CUPTI_CBID_NVTX_nvtxRangePushEx                       = 10,
+  CUPTI_CBID_NVTX_nvtxRangePop                          = 11,
+  CUPTI_CBID_NVTX_nvtxNameCategoryA                     = 12,
+  CUPTI_CBID_NVTX_nvtxNameCategoryW                     = 13,
+  CUPTI_CBID_NVTX_nvtxNameOsThreadA                     = 14,
+  CUPTI_CBID_NVTX_nvtxNameOsThreadW                     = 15,
+  CUPTI_CBID_NVTX_nvtxNameCuDeviceA                     = 16,
+  CUPTI_CBID_NVTX_nvtxNameCuDeviceW                     = 17,
+  CUPTI_CBID_NVTX_nvtxNameCuContextA                    = 18,
+  CUPTI_CBID_NVTX_nvtxNameCuContextW                    = 19,
+  CUPTI_CBID_NVTX_nvtxNameCuStreamA                     = 20,
+  CUPTI_CBID_NVTX_nvtxNameCuStreamW                     = 21,
+  CUPTI_CBID_NVTX_nvtxNameCuEventA                      = 22,
+  CUPTI_CBID_NVTX_nvtxNameCuEventW                      = 23,
+  CUPTI_CBID_NVTX_nvtxNameCudaDeviceA                   = 24,
+  CUPTI_CBID_NVTX_nvtxNameCudaDeviceW                   = 25,
+  CUPTI_CBID_NVTX_nvtxNameCudaStreamA                   = 26,
+  CUPTI_CBID_NVTX_nvtxNameCudaStreamW                   = 27,
+  CUPTI_CBID_NVTX_nvtxNameCudaEventA                    = 28,
+  CUPTI_CBID_NVTX_nvtxNameCudaEventW                    = 29,
+  CUPTI_CBID_NVTX_nvtxDomainMarkEx                      = 30,
+  CUPTI_CBID_NVTX_nvtxDomainRangeStartEx                = 31,
+  CUPTI_CBID_NVTX_nvtxDomainRangeEnd                    = 32,
+  CUPTI_CBID_NVTX_nvtxDomainRangePushEx                 = 33,
+  CUPTI_CBID_NVTX_nvtxDomainRangePop                    = 34,
+  CUPTI_CBID_NVTX_nvtxDomainResourceCreate              = 35,
+  CUPTI_CBID_NVTX_nvtxDomainResourceDestroy             = 36,
+  CUPTI_CBID_NVTX_nvtxDomainNameCategoryA               = 37,
+  CUPTI_CBID_NVTX_nvtxDomainNameCategoryW               = 38,
+  CUPTI_CBID_NVTX_nvtxDomainRegisterStringA             = 39,
+  CUPTI_CBID_NVTX_nvtxDomainRegisterStringW             = 40,
+  CUPTI_CBID_NVTX_nvtxDomainCreateA                     = 41,
+  CUPTI_CBID_NVTX_nvtxDomainCreateW                     = 42,
+  CUPTI_CBID_NVTX_nvtxDomainDestroy                     = 43,
+  CUPTI_CBID_NVTX_nvtxDomainSyncUserCreate              = 44,
+  CUPTI_CBID_NVTX_nvtxDomainSyncUserDestroy             = 45,
+  CUPTI_CBID_NVTX_nvtxDomainSyncUserAcquireStart        = 46,
+  CUPTI_CBID_NVTX_nvtxDomainSyncUserAcquireFailed       = 47,
+  CUPTI_CBID_NVTX_nvtxDomainSyncUserAcquireSuccess      = 48,
+  CUPTI_CBID_NVTX_nvtxDomainSyncUserReleasing           = 49,
+  CUPTI_CBID_NVTX_SIZE,  /* implicit value 50: one past the last ID above */
+  CUPTI_CBID_NVTX_FORCE_INT                             = 0x7fffffff  /* presumably pads the enum to int width -- confirm */
+} CUpti_nvtx_api_trace_cbid;
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif    
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_pcsampling.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_pcsampling.h
new file mode 100644
index 0000000000000000000000000000000000000000..97f42d14b938204b3b79c4ca1356b88896bcae35
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_pcsampling.h
@@ -0,0 +1,936 @@
+/*
+ * Copyright 2020-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_PCSAMPLING_H_)
+#define _CUPTI_PCSAMPLING_H_
+
+#include <cuda.h>
+#include <stdint.h>
+#include <stddef.h>
+#include "cupti_result.h"
+#include "cupti_common.h"
+
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_PCSAMPLING_API CUPTI PC Sampling API
+ * Functions, types, and enums that implement the CUPTI PC Sampling API.
+ * @{
+ */
+
+#ifndef CUPTI_PCSAMPLING_STRUCT_SIZE
+#define CUPTI_PCSAMPLING_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif /* CUPTI_PCSAMPLING_STRUCT_SIZE: size in bytes of type_ up to and including lastfield_ */
+
+#ifndef CUPTI_STALL_REASON_STRING_SIZE
+#define CUPTI_STALL_REASON_STRING_SIZE                                            128
+#endif /* CUPTI_STALL_REASON_STRING_SIZE */
+
+/**
+ * \brief PC Sampling collection mode
+ */
+typedef enum
+{
+  /**
+   * Invalid value
+   */
+  CUPTI_PC_SAMPLING_COLLECTION_MODE_INVALID                   = 0,
+  /**
+   * Continuous mode. Kernels are not serialized in this mode.
+   */
+  CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS                = 1,
+  /**
+   * Serialized mode. Kernels are serialized in this mode.
+   */
+  CUPTI_PC_SAMPLING_COLLECTION_MODE_KERNEL_SERIALIZED         = 2,
+} CUpti_PCSamplingCollectionMode;
+
+/**
+ * \brief PC Sampling stall reasons
+ */
+typedef struct PACKED_ALIGNMENT
+{
+  /**
+   * [r] Collected stall reason index
+   */
+  uint32_t pcSamplingStallReasonIndex;
+  /**
+   * [r] Number of times the PC was sampled with this stall reason.
+   */
+  uint32_t samples;
+} CUpti_PCSamplingStallReason;
+
+/**
+ * \brief PC Sampling data
+ */
+typedef struct PACKED_ALIGNMENT
+{
+  /**
+   * [w] Size of the data structure.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [r] Unique cubin id
+   */
+  uint64_t cubinCrc;
+  /**
+   * [r] PC offset
+   */
+  uint64_t pcOffset;
+  /**
+   * The function's unique symbol index in the module.
+   */
+  uint32_t functionIndex;
+  /**
+   * Padding
+   */
+  uint32_t pad;
+  /**
+   * [r] The function name. This name string might be shared across all the records
+   * including records from activity APIs representing the same function, and so it should not be
+   * modified or freed until post processing of all the records is done. Once done, it is the user's responsibility to
+   * free the memory using the free() function.
+   */
+  char* functionName;
+  /**
+   * [r] Collected stall reason count
+   */
+  size_t stallReasonCount;
+  /**
+   * [r] Collected stall reasons; each CUpti_PCSamplingStallReason holds a
+   * stall reason index and the number of samples for that stall reason
+   */
+  CUpti_PCSamplingStallReason *stallReason;
+  /**
+   * The correlation ID of the kernel to which this result is associated. Only valid for serialized mode of pc sampling collection.
+   * For continuous mode of collection the correlationId will be set to 0.
+   */
+  uint32_t correlationId;
+} CUpti_PCSamplingPCData;
+
+/**
+ * \brief PC Sampling output data format
+ */
+typedef enum
+{
+    CUPTI_PC_SAMPLING_OUTPUT_DATA_FORMAT_INVALID          = 0,
+  /**
+   * HW buffer data is parsed while the data is being collected
+   */
+    CUPTI_PC_SAMPLING_OUTPUT_DATA_FORMAT_PARSED           = 1,
+} CUpti_PCSamplingOutputDataFormat;
+
+/**
+ * \brief Collected PC Sampling data
+ *
+ */
+typedef struct PACKED_ALIGNMENT
+{
+  /**
+   * [w] Size of the data structure.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Number of PCs to be collected
+   */
+  size_t collectNumPcs;
+  /**
+   * [r] Number of samples collected across all PCs.
+   * It includes samples for user modules, samples for non-user kernels and dropped samples.
+   * It includes counts for all non-selected stall reasons.
+   * CUPTI does not provide PC records for non-user kernels.
+   * CUPTI does not provide PC records for instructions for which all selected stall reason metrics counts are zero.
+   */
+  uint64_t totalSamples;
+  /**
+   * [r] Number of samples that were dropped by hardware due to backpressure/overflow.
+   */
+  uint64_t droppedSamples;
+  /**
+   * [r] Number of PCs collected
+   */
+  size_t totalNumPcs;
+  /**
+   * [r] Number of PCs available for collection
+   */
+  size_t remainingNumPcs;
+  /**
+   * [r] Unique identifier for each range.
+   * Data collected across multiple ranges in multiple buffers can be identified using range id.
+   */
+  uint64_t rangeId;
+  /**
+   * [r] Profiled PC data
+   * This data struct should have enough memory to collect number of PCs mentioned in \p collectNumPcs
+   */
+  CUpti_PCSamplingPCData *pPcData;
+  /**
+   * [r] Number of samples collected across all non-user kernel PCs.
+   * It includes samples for non-user kernels.
+   * It includes counts for all non-selected stall reasons as well.
+   * CUPTI does not provide PC records for non-user kernels.
+   */
+  uint64_t nonUsrKernelsTotalSamples;
+
+  /**
+   * [r] Status of the hardware buffer.
+   * CUPTI returns the error code CUPTI_ERROR_OUT_OF_MEMORY when hardware buffer is full.
+   * When hardware buffer is full, user will get pc data as 0. To mitigate this issue, one or more of the below options can be tried:
+   * 1. Increase the hardware buffer size using the attribute CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE
+   * 2. Decrease the thread sleep span using the attribute CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_WORKER_THREAD_PERIODIC_SLEEP_SPAN
+   * 3. Decrease the sampling frequency using the attribute CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD
+   */
+  uint8_t hardwareBufferFull;
+} CUpti_PCSamplingData;
+
+/**
+ * \brief PC Sampling configuration attributes
+ *
+ * PC Sampling configuration attribute types. These attributes can be read
+ * using \ref cuptiPCSamplingGetConfigurationAttribute and can be written
+ * using \ref cuptiPCSamplingSetConfigurationAttribute. Attributes marked
+ * [r] can only be read using \ref cuptiPCSamplingGetConfigurationAttribute
+ * [w] can only be written using \ref cuptiPCSamplingSetConfigurationAttribute
+ * [rw] can be read using \ref cuptiPCSamplingGetConfigurationAttribute and
+ * written using \ref cuptiPCSamplingSetConfigurationAttribute
+ */
+typedef enum
+{
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_INVALID                            = 0,
+  /**
+   * [rw] Sampling period for PC Sampling.
+   * DEFAULT - CUPTI defined value based on number of SMs
+   * Valid values for the sampling
+   * periods are between 5 to 31 both inclusive. This will set the
+   * sampling period to (2^samplingPeriod) cycles.
+   * E.g. for sampling period = 5 to 31, cycles = 32, 64, 128, ..., 2^31
+   * Value is a uint32_t
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD                    = 1,
+  /**
+   * [w] Number of stall reasons to collect.
+   * DEFAULT - All stall reasons will be collected
+   * Value is a size_t
+   * [w] Stall reasons to collect
+   * DEFAULT - All stall reasons will be collected
+   * Input value should be a pointer pointing to array of stall reason indexes
+   * containing all the stall reason indexes to collect.
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON                       = 2,
+  /**
+   * [rw] Size of SW buffer for raw PC counter data downloaded from HW buffer
+   * DEFAULT - 1 MB, which can accommodate approximately 5500 PCs
+   * with all stall reasons
+   * Approximately it takes 16 Bytes (and some fixed size memory)
+   * to accommodate one PC with one stall reason
+   * For e.g. 1 PC with 1 stall reason = 32 Bytes
+   *          1 PC with 2 stall reason = 48 Bytes
+   *          1 PC with 4 stall reason = 96 Bytes
+   * Value is a size_t
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SCRATCH_BUFFER_SIZE                = 3,
+  /**
+   * [rw] Size of HW buffer in bytes
+   * DEFAULT - 512 MB
+   * If sampling period is too less, HW buffer can overflow
+   * and drop PC data
+   * Value is a size_t
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE               = 4,
+  /**
+   * [rw] PC Sampling collection mode
+   * DEFAULT - CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS
+   * Input value should be of type \ref CUpti_PCSamplingCollectionMode.
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_COLLECTION_MODE                    = 5,
+  /**
+   * [rw] Control over PC Sampling data collection range
+   * Default - 0
+   * 1 - Allows user to start and stop PC Sampling using APIs -
+   * \ref cuptiPCSamplingStart() - Start PC Sampling
+   * \ref cuptiPCSamplingStop() - Stop PC Sampling
+   * Value is a uint32_t
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL          = 6,
+  /**
+   * [w] Value for output data format
+   * Default - CUPTI_PC_SAMPLING_OUTPUT_DATA_FORMAT_PARSED
+   * Input value should be of type \ref CUpti_PCSamplingOutputDataFormat.
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_OUTPUT_DATA_FORMAT                 = 7,
+  /**
+   * [w] Data buffer to hold collected PC Sampling data PARSED_DATA
+   * Default - none.
+   * Buffer type is void * which can point to PARSED_DATA
+   * Refer \ref CUpti_PCSamplingData for buffer format for PARSED_DATA
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_DATA_BUFFER               = 8,
+  /**
+   * [rw] Control sleep time of the worker threads created by CUPTI for various PC sampling operations.
+   * CUPTI creates multiple worker threads to offload certain operations to these threads. This includes decoding of HW data to
+   * the CUPTI PC sampling data and correlating PC data to SASS instructions. CUPTI wakes up these threads periodically.
+   * Default - 100 milliseconds.
+   * Value is a uint32_t
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_WORKER_THREAD_PERIODIC_SLEEP_SPAN  = 9,
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_FORCE_INT                          = 0x7fffffff,
+} CUpti_PCSamplingConfigurationAttributeType;
+
+/**
+ * \brief PC sampling configuration information structure
+ *
+ * This structure provides \ref CUpti_PCSamplingConfigurationAttributeType which can be configured
+ * or queried for PC sampling configuration
+ */
+typedef struct
+{
+  /**
+   * Refer \ref CUpti_PCSamplingConfigurationAttributeType for all supported attribute types
+   */
+  CUpti_PCSamplingConfigurationAttributeType attributeType;
+  /**
+   * Configure or query status for \p attributeType
+   * CUPTI_SUCCESS for valid \p attributeType and \p attributeData
+   * CUPTI_ERROR_INVALID_OPERATION if \p attributeData is not valid
+   * CUPTI_ERROR_INVALID_PARAMETER if \p attributeType is not valid
+   */
+  CUptiResult attributeStatus;
+  union
+  {
+    /**
+     * Invalid Value
+     */
+    struct
+    {
+      uint64_t data[3];
+    } invalidData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD
+     */
+    struct
+    {
+      uint32_t samplingPeriod;
+    } samplingPeriodData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON
+     */
+    struct
+    {
+      size_t stallReasonCount;
+      uint32_t *pStallReasonIndex;
+    } stallReasonData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SCRATCH_BUFFER_SIZE
+     */
+    struct
+    {
+      size_t scratchBufferSize;
+    } scratchBufferSizeData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE
+     */
+    struct
+    {
+      size_t hardwareBufferSize;
+    } hardwareBufferSizeData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_COLLECTION_MODE
+     */
+    struct
+    {
+      CUpti_PCSamplingCollectionMode collectionMode;
+    } collectionModeData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL
+     */
+    struct
+    {
+      uint32_t enableStartStopControl;
+    } enableStartStopControlData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_OUTPUT_DATA_FORMAT
+     */
+    struct
+    {
+      CUpti_PCSamplingOutputDataFormat outputDataFormat;
+    } outputDataFormatData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_DATA_BUFFER
+     */
+    struct
+    {
+      void *samplingDataBuffer;
+    } samplingDataBufferData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_WORKER_THREAD_PERIODIC_SLEEP_SPAN
+     */
+    struct
+    {
+      uint32_t workerThreadPeriodicSleepSpan;
+    } workerThreadPeriodicSleepSpanData;
+    
+  } attributeData;
+} CUpti_PCSamplingConfigurationInfo;
+
+/**
+ * \brief PC sampling configuration structure
+ *
+ * This structure configures PC sampling using \ref cuptiPCSamplingSetConfigurationAttribute
+ * and queries PC sampling default configuration using \ref cuptiPCSamplingGetConfigurationAttribute
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure i.e. CUpti_PCSamplingConfigurationInfoParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+  /**
+   * [w] Number of attributes to configure using \ref cuptiPCSamplingSetConfigurationAttribute or query
+   * using \ref cuptiPCSamplingGetConfigurationAttribute
+   */
+  size_t numAttributes;
+  /**
+   * Refer \ref CUpti_PCSamplingConfigurationInfo
+   */
+  CUpti_PCSamplingConfigurationInfo *pPCSamplingConfigurationInfo;
+} CUpti_PCSamplingConfigurationInfoParams;
+#define CUpti_PCSamplingConfigurationInfoParamsSize                 CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingConfigurationInfoParams,pPCSamplingConfigurationInfo)
+
+/**
+ * \brief Write PC Sampling configuration attribute.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingConfigurationInfoParams
+ * containing PC sampling configuration.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with
+ * some invalid attribute.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if an attribute value is not valid
+ * or any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingSetConfigurationAttribute(CUpti_PCSamplingConfigurationInfoParams *pParams);
+
+/**
+ * \brief Read PC Sampling configuration attribute.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingConfigurationInfoParams
+ * containing PC sampling configuration.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with
+ * some invalid attribute.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if an attribute is not valid
+ * or any \p pParams is not valid
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT indicates that
+ * the attribute data buffer is too small to hold the attribute value
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingGetConfigurationAttribute(CUpti_PCSamplingConfigurationInfoParams *pParams);
+
+/**
+ * \brief Params for cuptiPCSamplingGetData
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure i.e. CUpti_PCSamplingGetDataParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+  /**
+   * Data buffer to hold collected PC Sampling data PARSED_DATA
+   * Buffer type is void * which can point to PARSED_DATA
+   * Refer \ref CUpti_PCSamplingData for buffer format for PARSED_DATA
+   */
+  void *pcSamplingData;
+} CUpti_PCSamplingGetDataParams;
+#define CUpti_PCSamplingGetDataParamsSize                           CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingGetDataParams, pcSamplingData)
+/**
+ * \brief Flush GPU PC sampling data periodically.
+ *
+ * Flushing of GPU PC Sampling data is required at the following points to maintain uniqueness of PCs:
+ * For \ref CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS, after every module load-unload-load
+ * For \ref CUPTI_PC_SAMPLING_COLLECTION_MODE_KERNEL_SERIALIZED, after every kernel ends
+ * If configuration option \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL
+ * is enabled, then after every range end i.e. \ref cuptiPCSamplingStop()
+ *
+ * If application is profiled in \ref CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS, with disabled
+ * \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL, and there is no module unload,
+ * user can collect data in two ways:
+ * Use \ref cuptiPCSamplingGetData() API periodically
+ * Use \ref cuptiPCSamplingDisable() on application exit and read GPU PC sampling data from sampling
+ * data buffer passed during configuration.
+ * Note: In case, \ref cuptiPCSamplingGetData() API is not called periodically, then sampling data buffer
+ * passed during configuration should be large enough to hold all PCs data.
+ *       \ref cuptiPCSamplingGetData() API never does device synchronization.
+ *       It is possible that when the API is called there is some unconsumed data from the HW buffer. In this case
+ * CUPTI provides only the data available with it at that moment.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingGetDataParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called without
+ * enabling PC sampling.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_OUT_OF_MEMORY indicates that the HW buffer is full
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingGetData(CUpti_PCSamplingGetDataParams *pParams);
+
+/**
+ * \brief Params for cuptiPCSamplingEnable
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure i.e. CUpti_PCSamplingEnableParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+} CUpti_PCSamplingEnableParams;
+#define CUpti_PCSamplingEnableParamsSize                           CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingEnableParams, ctx)
+
+/**
+ * \brief Enable PC sampling.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingEnableParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingEnable(CUpti_PCSamplingEnableParams *pParams);
+
+/**
+ * \brief Params for cuptiPCSamplingDisable
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure i.e. CUpti_PCSamplingDisableParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+} CUpti_PCSamplingDisableParams;
+#define CUpti_PCSamplingDisableParamsSize                           CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingDisableParams, ctx)
+
+/**
+ * \brief Disable PC sampling.
+ *
+ * For application which doesn't destroy the CUDA context explicitly,
+ * this API does the PC Sampling tear-down, joins threads and copies PC records in the buffer provided
+ * during the PC sampling configuration. PC records which can't be accommodated in the buffer are discarded.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingDisableParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingDisable(CUpti_PCSamplingDisableParams *pParams);
+
+/**
+ * \brief Params for cuptiPCSamplingStart
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure i.e. CUpti_PCSamplingStartParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+} CUpti_PCSamplingStartParams;
+#define CUpti_PCSamplingStartParamsSize                             CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingStartParams, ctx)
+
+/**
+ * \brief Start PC sampling.
+ *
+ * User can collect PC Sampling data for user-defined range specified by Start/Stop APIs.
+ * This API can be used to mark starting of range. Set configuration option
+ * \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL to use this API.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingStartParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with
+ * incorrect PC Sampling configuration.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingStart(CUpti_PCSamplingStartParams *pParams);
+
+/**
+ * \brief Params for cuptiPCSamplingStop
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure i.e. CUpti_PCSamplingStopParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+} CUpti_PCSamplingStopParams;
+#define CUpti_PCSamplingStopParamsSize                              CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingStopParams, ctx)
+
+/**
+ * \brief Stop PC sampling.
+ *
+ * User can collect PC Sampling data for user-defined range specified by Start/Stop APIs.
+ * This API can be used to mark end of range. Set configuration option
+ * \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL to use this API.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingStopParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with
+ * incorrect PC Sampling configuration.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingStop(CUpti_PCSamplingStopParams *pParams);
+
+/**
+ * \brief Params for cuptiPCSamplingGetNumStallReasons
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure i.e. CUpti_PCSamplingGetNumStallReasonsParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+  /**
+   * [r] Number of stall reasons
+   */
+  size_t *numStallReasons;
+} CUpti_PCSamplingGetNumStallReasonsParams;
+#define CUpti_PCSamplingGetNumStallReasonsParamsSize                CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingGetNumStallReasonsParams, numStallReasons)
+
+/**
+ * \brief Get PC sampling stall reason count.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingGetNumStallReasonsParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingGetNumStallReasons(CUpti_PCSamplingGetNumStallReasonsParams *pParams);
+
+/**
+ * \brief Params for cuptiPCSamplingGetStallReasons
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure i.e. CUpti_PCSamplingGetStallReasonsParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+  /**
+   * [w] Number of stall reasons
+   */
+  size_t numStallReasons;
+  /**
+   * [r] Stall reason index
+   */
+  uint32_t *stallReasonIndex;
+  /**
+   * [r] Stall reasons name
+   */
+  char **stallReasons;
+} CUpti_PCSamplingGetStallReasonsParams;
+#define CUpti_PCSamplingGetStallReasonsParamsSize                   CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingGetStallReasonsParams, stallReasons)
+
+/**
+ * \brief Get PC sampling stall reasons.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingGetStallReasonsParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingGetStallReasons(CUpti_PCSamplingGetStallReasonsParams *pParams);
+
+
+/**
+ * \brief Params for cuptiGetSassToSourceCorrelation
+ */
+typedef struct CUpti_GetSassToSourceCorrelationParams {
+  /**
+   * [w] Size of the data structure i.e. CUpti_GetSassToSourceCorrelationParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Pointer to cubin binary where function belongs.
+   */
+  const void* cubin;
+  /**
+   * [w] Function name to which PC belongs.
+   */
+  const char *functionName;
+  /**
+   * [w] Size of cubin binary.
+   */
+  size_t cubinSize;
+  /**
+   * [r] Line number in the source code.
+   */
+  uint32_t lineNumber;
+  /**
+   * [w] PC offset
+   */
+  uint64_t pcOffset;
+  /**
+   * [r] Path for the source file.
+   */
+  char *fileName;
+  /**
+   * [r] Path for the directory of source file.
+   */
+  char *dirName;
+} CUpti_GetSassToSourceCorrelationParams;
+
+#define CUpti_GetSassToSourceCorrelationParamsSize     CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_GetSassToSourceCorrelationParams, dirName)
+
+/**
+ * \brief SASS to Source correlation.
+ *
+ * \param pParams A pointer to \ref CUpti_GetSassToSourceCorrelationParams
+ *
+ * It is expected from user to free allocated memory for fileName and dirName after use.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if either of the parameters cubin or functionName
+ * is NULL or cubinSize is zero or size field is not set correctly.
+ * \retval CUPTI_ERROR_INVALID_MODULE provided cubin is invalid.
+ * \retval CUPTI_ERROR_UNKNOWN an internal error occurred.
+ * This error code is also used for cases when the function is not present in the module.
+ * A better error code will be returned in the future release.
+ */
+CUptiResult CUPTIAPI cuptiGetSassToSourceCorrelation(CUpti_GetSassToSourceCorrelationParams *pParams);
+
+/**
+ * \brief Params for cuptiGetCubinCrc
+ */
+typedef struct {
+  /**
+   * [w] Size of configuration structure.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Size of cubin binary.
+   */
+  size_t cubinSize;
+  /**
+   * [w] Pointer to cubin binary
+   */
+  const void* cubin;
+  /**
+   * [r] Computed CRC will be stored in it.
+   */
+  uint64_t cubinCrc;
+} CUpti_GetCubinCrcParams;
+#define CUpti_GetCubinCrcParamsSize     CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_GetCubinCrcParams, cubinCrc)
+
+/**
+ * \brief Get the CRC of cubin.
+ *
+ * This function returns the CRC of provided cubin binary.
+ *
+ * \param pParams A pointer to \ref CUpti_GetCubinCrcParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if parameter cubin is NULL or
+ * provided cubinSize is zero or size field is not set.
+ */
+CUptiResult CUPTIAPI cuptiGetCubinCrc(CUpti_GetCubinCrcParams *pParams);
+
+/**
+ * \brief Function type for callback used by CUPTI to request crc of
+ * loaded module.
+ *
+ * This callback function asks for the crc of the provided module.
+ * The provided crc will be stored in PC sampling records i.e. in the field 'cubinCrc' of the PC sampling
+ * struct CUpti_PCSamplingPCData. The CRC is used during the offline source correlation to uniquely identify the module.
+ *
+ * \param cubin The pointer to cubin binary
+ * \param cubinSize The size of cubin binary.
+ * \param cubinCrc Returns the computed crc of cubin.
+ */
+typedef void (CUPTIAPI *CUpti_ComputeCrcCallbackFunc)(
+    const void* cubin,
+    size_t cubinSize,
+    uint64_t *cubinCrc);
+
+/**
+ * \brief Register callback function with CUPTI to use
+ * your own algorithm to compute cubin crc.
+ *
+ * This function registers a callback function and it gets called
+ * from CUPTI when a CUDA module is loaded.
+ *
+ * \param funcComputeCubinCrc callback is invoked when a CUDA module
+ * is loaded.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p funcComputeCubinCrc is NULL.
+ */
+CUptiResult CUPTIAPI cuptiRegisterComputeCrcCallback(CUpti_ComputeCrcCallbackFunc funcComputeCubinCrc);
+
+/** @} */ /* END CUPTI_PCSAMPLING_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_PCSAMPLING_H_*/
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_pcsampling_util.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_pcsampling_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..595d6028fbf2ff9a3bbffaafe90ec80f7d512533
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_pcsampling_util.h
@@ -0,0 +1,402 @@
+#if !defined(_CUPTI_PCSAMPLING_UTIL_H_)
+#define _CUPTI_PCSAMPLING_UTIL_H_
+
+#include <cupti_pcsampling.h>
+#include <fstream>
+
+#include <cupti_common.h>
+
+#ifndef CUPTI_UTIL_STRUCT_SIZE
+#define CUPTI_UTIL_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif
+
+#ifndef CHECK_PC_SAMPLING_STRUCT_FIELD_EXISTS
+#define CHECK_PC_SAMPLING_STRUCT_FIELD_EXISTS(type, member, structSize)    \
+    (offsetof(type, member) < structSize)
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__)
+    #pragma GCC visibility push(default)
+#endif
+
+namespace CUPTI { namespace PcSamplingUtil {
+
+/**
+ * \defgroup CUPTI_PCSAMPLING_UTILITY CUPTI PC Sampling Utility API
+ * Functions, types, and enums that implement the CUPTI PC Sampling Utility API.
+ * @{
+ */
+
+/**
+ * \brief Header info will be stored in file.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Version of file format.
+   */
+  uint32_t version;
+  /**
+   * Total number of buffers present in the file.
+   */
+  uint32_t totalBuffers;
+} Header;
+
+/**
+ * \brief BufferInfo will be stored in the file for every buffer
+ *  i.e. for every call of the \ref CuptiUtilPutPcSampData API.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Total number of PC records.
+   */
+  uint64_t recordCount;
+  /**
+   * Count of all stall reasons supported on the GPU
+   */
+  size_t numStallReasons;
+  /**
+   * Total number of stall reasons in single record.
+   */
+  uint64_t numSelectedStallReasons;
+  /**
+   * Buffer size in Bytes.
+   */
+  uint64_t bufferByteSize;
+} BufferInfo;
+
+/**
+ * \brief All available stall reasons name and respective indexes
+ * will be stored in it.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Number of all available stall reasons
+   */
+  size_t numStallReasons;
+  /**
+   * Stall reasons names of all available stall reasons
+   */
+  char **stallReasons;
+  /**
+   * Stall reason index of all available stall reasons
+   */
+  uint32_t *stallReasonIndex;
+} PcSamplingStallReasons;
+
+/**
+ * \brief CUPTI PC sampling buffer types.
+ *
+ */
+typedef enum {
+  /**
+   * Invalid buffer type.
+   */
+  PC_SAMPLING_BUFFER_INVALID             = 0,
+  /**
+   * Refers to CUpti_PCSamplingData buffer.
+   */
+  PC_SAMPLING_BUFFER_PC_TO_COUNTER_DATA  = 1
+} PcSamplingBufferType;
+
+/**
+ * \brief CUPTI PC sampling utility API result codes.
+ *
+ * Error and result codes returned by CUPTI PC sampling utility API.
+ */
+typedef enum {
+  /**
+   * No error
+   */
+  CUPTI_UTIL_SUCCESS                                       = 0,
+  /**
+   * One or more of the parameters are invalid.
+   */
+  CUPTI_UTIL_ERROR_INVALID_PARAMETER                       = 1,
+  /**
+   * Unable to create a new file
+   */
+  CUPTI_UTIL_ERROR_UNABLE_TO_CREATE_FILE                   = 2,
+  /**
+   * Unable to open a file
+   */
+  CUPTI_UTIL_ERROR_UNABLE_TO_OPEN_FILE                     = 3,
+  /**
+   * Read or write operation failed
+   */
+  CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED             = 4,
+  /**
+   * Provided file handle is corrupted.
+   */
+  CUPTI_UTIL_ERROR_FILE_HANDLE_CORRUPTED                   = 5,
+  /**
+   * seek operation failed.
+   */
+  CUPTI_UTIL_ERROR_SEEK_OPERATION_FAILED                   = 6,
+  /**
+   * Unable to allocate enough memory to perform the requested
+   * operation.
+   */
+  CUPTI_UTIL_ERROR_OUT_OF_MEMORY                           = 7,
+  /**
+   * An unknown internal error has occurred.
+   */
+  CUPTI_UTIL_ERROR_UNKNOWN                                 = 999,
+  CUPTI_UTIL_ERROR_FORCE_INT                               = 0x7fffffff
+} CUptiUtilResult;
+
+/**
+ * \brief Params for \ref CuptiUtilPutPcSampData
+ */
+typedef struct {
+  /**
+   * Size of the data structure i.e. CUptiUtil_PutPcSampDataParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * Type of buffer to store in file
+   */
+  PcSamplingBufferType bufferType;
+  /**
+   * PC sampling buffer.
+   */
+  void *pSamplingData;
+  /**
+   * Number of configured attributes
+   */
+  size_t numAttributes;
+  /**
+   * Refer \ref CUpti_PCSamplingConfigurationInfo
+   * It is expected to provide configuration details of at least
+   * CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON attribute.
+   */
+  CUpti_PCSamplingConfigurationInfo *pPCSamplingConfigurationInfo;
+  /**
+   * Refer \ref PcSamplingStallReasons.
+   */
+  PcSamplingStallReasons *pPcSamplingStallReasons;
+  /**
+   * Name of the file to store the buffer into.
+   */
+  const char* fileName;
+} CUptiUtil_PutPcSampDataParams;
+#define CUptiUtil_PutPcSampDataParamsSize                   CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_PutPcSampDataParams, fileName)
+
+/**
+ * \brief Dump PC sampling data into the file.
+ *
+ * This API can be called multiple times.
+ * It will append the buffer to the file.
+ * For every buffer it will store BufferInfo
+ * so that before retrieving data it will help to allocate buffer
+ * to store retrieved data.
+ * This API creates the file if it does not exist.
+ * If stallReasonIndex or stallReasons pointer of \ref CUptiUtil_PutPcSampDataParams is NULL
+ * then stall reasons data will not be stored in file.
+ * It is expected to store all available stall reason data at least once to refer it during
+ * offline correlation.
+ *
+ * \retval CUPTI_UTIL_SUCCESS
+ * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if buffer type is invalid
+ * or if either of pSamplingData, pParams pointer is NULL or stall reason configuration details not provided
+ * or filename is empty.
+ * \retval CUPTI_UTIL_ERROR_UNABLE_TO_CREATE_FILE
+ * \retval CUPTI_UTIL_ERROR_UNABLE_TO_OPEN_FILE
+ * \retval CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED
+ */
+CUptiUtilResult CUPTIUTILAPI CuptiUtilPutPcSampData(CUptiUtil_PutPcSampDataParams *pParams);
+
+/**
+ * \brief Params for \ref CuptiUtilGetHeaderData
+ */
+typedef struct {
+  /**
+   * Size of the data structure i.e. CUptiUtil_GetHeaderDataParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * File handle.
+   */
+  std::ifstream *fileHandler;
+  /**
+   * Header Info.
+   */
+  Header headerInfo;
+
+} CUptiUtil_GetHeaderDataParams;
+#define CUptiUtil_GetHeaderDataParamsSize                   CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_GetHeaderDataParams, headerInfo)
+
+/**
+ * \brief Get header data of file.
+ *
+ * This API must be called once initially while retrieving data from file.
+ * \ref Header structure, it gives info about total number
+ * of buffers present in the file.
+ *
+ * \retval CUPTI_UTIL_SUCCESS
+ * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if either of pParam or fileHandle is NULL or param struct size is incorrect.
+ * \retval CUPTI_UTIL_ERROR_FILE_HANDLE_CORRUPTED file handle is not in good state to read data from file
+ * \retval CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED  failed to read data from file.
+ */
+CUptiUtilResult CUPTIUTILAPI CuptiUtilGetHeaderData(CUptiUtil_GetHeaderDataParams *pParams);
+
+/**
+ * \brief Params for \ref CuptiUtilGetBufferInfo
+ */
+typedef struct {
+  /**
+   * Size of the data structure i.e. CUptiUtil_GetBufferInfoParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * File handle.
+   */
+  std::ifstream *fileHandler;
+  /**
+   * Buffer Info.
+   */
+  BufferInfo bufferInfoData;
+} CUptiUtil_GetBufferInfoParams;
+#define CUptiUtil_GetBufferInfoParamsSize                   CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_GetBufferInfoParams, bufferInfoData)
+
+/**
+ * \brief Get buffer info data of file.
+ *
+ * This API must be called every time before calling CuptiUtilGetPcSampData API.
+ * The \ref BufferInfo structure gives info about recordCount and stallReasonCount
+ * of every record in the buffer. This will help to allocate exact buffer to retrieve data into it.
+ *
+ * \retval CUPTI_UTIL_SUCCESS
+ * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if either of pParams or fileHandler is NULL or param struct size is incorrect.
+ * \retval CUPTI_UTIL_ERROR_FILE_HANDLE_CORRUPTED file handle is not in good state to read data from file.
+ * \retval CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED failed to read data from file.
+ */
+CUptiUtilResult CUPTIUTILAPI CuptiUtilGetBufferInfo(CUptiUtil_GetBufferInfoParams *pParams);
+
+/**
+ * \brief Params for \ref CuptiUtilGetPcSampData
+ */
+typedef struct {
+  /**
+   * Size of the data structure i.e. CUptiUtil_GetPcSampDataParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * File handle: pointer to the input file stream the data is read from.
+   */
+  std::ifstream *fileHandler;
+  /**
+   * Type of buffer to retrieve from file
+   */
+  PcSamplingBufferType bufferType;
+  /**
+   * Pointer to collected buffer info using \ref CuptiUtilGetBufferInfo
+   */
+  BufferInfo *pBufferInfoData;
+  /**
+   * Pointer to allocated memory to store retrieved data from file.
+   */
+  void *pSamplingData;
+  /**
+   * Number of configuration attributes
+   */
+  size_t numAttributes;
+  /**
+   * Refer \ref CUpti_PCSamplingConfigurationInfo
+   */
+  CUpti_PCSamplingConfigurationInfo *pPCSamplingConfigurationInfo;
+  /**
+   * Refer \ref PcSamplingStallReasons.
+   * For stallReasons field of \ref PcSamplingStallReasons it is expected to
+   * allocate memory for each string element of array.
+   */
+  PcSamplingStallReasons *pPcSamplingStallReasons;
+} CUptiUtil_GetPcSampDataParams;
+#define CUptiUtil_GetPcSampDataParamsSize                   CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_GetPcSampDataParams, pPcSamplingStallReasons)
+
+/**
+ * \brief Retrieve PC sampling data from file into allocated buffer.
+ *
+ * This API must be called after CuptiUtilGetBufferInfo API.
+ * It will retrieve data from file into allocated buffer.
+ *
+ * \retval CUPTI_UTIL_SUCCESS
+ * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if buffer type is invalid
+ * or if either of pParams, pSamplingData is NULL. If pPcSamplingStallReasons is not NULL then
+ * error out if either of stallReasonIndex, stallReasons or stallReasons array element pointer is NULL.
+ * NOTE(review): the original text also listed "filename is empty", but these params carry a file handle and no filename - confirm.
+ * \retval CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED failed to read data from file.
+ * \retval CUPTI_UTIL_ERROR_FILE_HANDLE_CORRUPTED file handle is not in good state to read data from file.
+ */
+CUptiUtilResult CUPTIUTILAPI CuptiUtilGetPcSampData(CUptiUtil_GetPcSampDataParams *pParams);
+
+/**
+ * \brief Params for \ref CuptiUtilMergePcSampData
+ */
+typedef struct
+{
+  /**
+   * Size of the data structure i.e. CUptiUtil_MergePcSampDataParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * Number of buffers to merge, i.e. the number of elements in PcSampDataBuffer.
+   */
+  size_t numberOfBuffers;
+  /**
+   * Pointer to array of buffers to merge
+   */
+  CUpti_PCSamplingData *PcSampDataBuffer;
+  /**
+   * Pointer to array of merged buffers as per the range id.
+   */
+  CUpti_PCSamplingData **MergedPcSampDataBuffers;
+  /**
+   * Pointer to the number of merged buffers.
+   */
+  size_t *numMergedBuffer;
+} CUptiUtil_MergePcSampDataParams;
+#define CUptiUtil_MergePcSampDataParamsSize                   CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_MergePcSampDataParams, numMergedBuffer)
+
+/**
+ * \brief Merge PC sampling data range id wise.
+ *
+ * This API merges PC sampling data range id wise.
+ * It allocates memory for the merged data, fills the data in it
+ * and provides the buffer pointer in the MergedPcSampDataBuffers field.
+ * The user is expected to free the merged data buffers after use.
+ *
+ * \retval CUPTI_UTIL_SUCCESS
+ * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if param struct size is invalid
+ * or count of buffers to merge is invalid i.e less than 1
+ * or either of PcSampDataBuffer, MergedPcSampDataBuffers, numMergedBuffer is NULL
+ * \retval CUPTI_UTIL_ERROR_OUT_OF_MEMORY Unable to allocate memory for merged buffer.
+ */
+CUptiUtilResult CUPTIUTILAPI CuptiUtilMergePcSampData(CUptiUtil_MergePcSampDataParams *pParams);
+
+/** @} */ /* END CUPTI_PCSAMPLING_UTILITY */
+
+} }
+
+#if defined(__GNUC__)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_pmsampling.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_pmsampling.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba4171b6710564b56bc7e8e64e46c3674fe6c58c
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_pmsampling.h
@@ -0,0 +1,490 @@
+/*
+ * Copyright 2024 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_PMSAMPLING_H_)
+#define _CUPTI_PMSAMPLING_H_
+
+#include <cuda.h>
+#include <cupti_result.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+#ifndef CUPTI_PROFILER_STRUCT_SIZE /* defined here only if not already provided; yields the size in bytes of type_ up to and including lastfield_ */
+#define CUPTI_PROFILER_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif /* CUPTI_PROFILER_STRUCT_SIZE */
+
+/* CUPTI PM sampling APIs */
+/**
+ * \defgroup CUPTI_PM_SAMPLING_API CUPTI PM Sampling API
+ * Functions to enable, disable, start, stop, and decode PM sampling.
+ * @{
+ */
+typedef struct CUpti_PmSampling_Object CUpti_PmSampling_Object; /* opaque handle: the struct is only forward-declared in this header */
+
+typedef enum CUpti_PmSampling_TriggerMode
+{
+    /// The trigger is based on the SYSCLK frequency. Note that the SYS frequency is variable by default.
+    /// The sampling interval (samplingInterval in \ref CUpti_PmSampling_SetConfig_Params) is in terms of clocks.
+    CUPTI_PM_SAMPLING_TRIGGER_MODE_GPU_SYSCLK_INTERVAL = 0,
+    /// The trigger is based on a fixed frequency source.
+    /// The sampling interval (samplingInterval in \ref CUpti_PmSampling_SetConfig_Params) is in terms of nanoseconds.
+    /// Note: This trigger mode is not supported on Turing GPU architecture and GA100 GPU.
+    /// It is supported on Ampere GA10x and later GPU architectures.
+    CUPTI_PM_SAMPLING_TRIGGER_MODE_GPU_TIME_INTERVAL = 1,
+    CUPTI_PM_SAMPLING_TRIGGER_MODE_COUNT ///< Evaluates to 2; presumably a count sentinel rather than a selectable mode.
+} CUpti_PmSampling_TriggerMode;
+
+typedef enum CUpti_PmSampling_DecodeStopReason
+{
+    CUPTI_PM_SAMPLING_DECODE_STOP_REASON_OTHER = 0, ///< Presumably any stop reason not covered by the enumerators below (undocumented upstream).
+    /// Counter data image is full.
+    CUPTI_PM_SAMPLING_DECODE_STOP_REASON_COUNTER_DATA_FULL,
+    /// All the records in the hardware buffer are decoded.
+    CUPTI_PM_SAMPLING_DECODE_STOP_REASON_END_OF_RECORDS,
+    CUPTI_PM_SAMPLING_DECODE_STOP_REASON_COUNT ///< Evaluates to 3; presumably a count sentinel rather than a stop reason.
+} CUpti_PmSampling_DecodeStopReason;
+
+typedef enum CUpti_PmSampling_HardwareBuffer_AppendMode
+{
+    /// Keep the oldest records in the hardware buffer.
+    /// CUPTI will report an overflow error when the hardware buffer fills up.
+    CUPTI_PM_SAMPLING_HARDWARE_BUFFER_APPEND_MODE_KEEP_OLDEST = 0,
+    /// Keep the latest records in the hardware buffer.
+    /// Note: This mode is not supported on Turing GPU architecture.
+    /// It is supported on Ampere and later GPU architectures.
+    CUPTI_PM_SAMPLING_HARDWARE_BUFFER_APPEND_MODE_KEEP_LATEST = 1
+} CUpti_PmSampling_HardwareBuffer_AppendMode;
+
+/**
+ * \brief Params for cuptiPmSamplingSetConfig
+ */
+typedef struct CUpti_PmSampling_SetConfig_Params
+{
+    /// [in] Size of the data structure; see CUpti_PmSampling_SetConfig_Params_STRUCT_SIZE.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] PM sampling object.
+    CUpti_PmSampling_Object* pPmSamplingObject;
+    /// [in] Size of the config image.
+    size_t configSize;
+    /// [in] Config image.
+    const uint8_t* pConfig;
+    /// [in] The hardware buffer size in which raw PM sampling data
+    /// will be stored. These samples will be decoded to counter data
+    /// image with \ref cuptiPmSamplingDecodeData call.
+    size_t hardwareBufferSize;
+    /// [in] For the trigger mode `CUPTI_PM_SAMPLING_TRIGGER_MODE_GPU_SYSCLK_INTERVAL`, sampling interval
+    /// is the number of sys clock cycles. For the trigger mode `CUPTI_PM_SAMPLING_TRIGGER_MODE_GPU_TIME_INTERVAL`,
+    /// sampling interval is in nanoseconds.
+    uint64_t samplingInterval;
+    /// [in] Trigger mode.
+    /// Note: CUPTI_PM_SAMPLING_TRIGGER_MODE_GPU_TIME_INTERVAL is not supported in Turing and GA100.
+    /// Supported from GA10x onwards.
+    CUpti_PmSampling_TriggerMode triggerMode;
+    /// [in] Append mode for the records in hardware buffer.
+    /// For KEEP_OLDEST mode, all the records will be kept in the buffer and, in case the hardware buffer is getting filled up,
+    /// overflow will be set to 1 in \ref CUpti_PmSampling_DecodeData_Params. For KEEP_LATEST mode, the new records will
+    /// overwrite the oldest records in the buffer in case of filled buffer.
+    CUpti_PmSampling_HardwareBuffer_AppendMode hwBufferAppendMode;
+} CUpti_PmSampling_SetConfig_Params;
+
+#define CUpti_PmSampling_SetConfig_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_PmSampling_SetConfig_Params, hwBufferAppendMode)
+
+/**
+ * \brief Set the configuration for PM sampling like sampling interval, hardware buffer size,
+ * hardware buffer append mode, trigger mode and the config image which has scheduling info for metric collection.
+ *
+ * \param pParams A pointer to \ref CUpti_PmSampling_SetConfig_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its members is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED for a config image which requires multiple passes for data collection
+ */
+CUptiResult CUPTIAPI cuptiPmSamplingSetConfig(CUpti_PmSampling_SetConfig_Params* pParams);
+
+/**
+ * \brief Params for cuptiPmSamplingEnable
+ */
+typedef struct CUpti_PmSampling_Enable_Params
+{
+    /// [in] Size of the data structure; see CUpti_PmSampling_Enable_Params_STRUCT_SIZE.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Device index.
+    size_t deviceIndex;
+    /// [out] PM sampling object; the other PM sampling params structs take it as input.
+    CUpti_PmSampling_Object* pPmSamplingObject;
+} CUpti_PmSampling_Enable_Params;
+
+#define CUpti_PmSampling_Enable_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_PmSampling_Enable_Params, pPmSamplingObject)
+
+/**
+ * \brief Create a PM sampling object and enable PM sampling on the CUDA device.
+ *
+ * \param pParams A pointer to \ref CUpti_PmSampling_Enable_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its members is not valid
+ * \retval CUPTI_ERROR_OUT_OF_MEMORY if memory allocation fails while creating the PM sampling object
+ * \retval CUPTI_ERROR_INVALID_OPERATION if PM sampling is already enabled on the device
+ * \retval CUPTI_ERROR_INSUFFICIENT_PRIVILEGES if the user does not have sufficient privileges to perform the operation
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiPmSamplingEnable(CUpti_PmSampling_Enable_Params* pParams);
+
+/**
+ * \brief Params for cuptiPmSamplingDisable
+ */
+typedef struct CUpti_PmSampling_Disable_Params
+{
+    /// [in] Size of the data structure; see CUpti_PmSampling_Disable_Params_STRUCT_SIZE.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] PM sampling object to disable and destroy.
+    CUpti_PmSampling_Object* pPmSamplingObject;
+} CUpti_PmSampling_Disable_Params;
+
+#define CUpti_PmSampling_Disable_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_PmSampling_Disable_Params, pPmSamplingObject)
+
+/**
+ * \brief Disable PM sampling on the CUDA device and destroy the PM sampling object.
+ *
+ * \param pParams A pointer to \ref CUpti_PmSampling_Disable_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its members is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiPmSamplingDisable(CUpti_PmSampling_Disable_Params* pParams);
+
+/**
+ * \brief Params for cuptiPmSamplingStart
+ */
+typedef struct CUpti_PmSampling_Start_Params
+{
+    /// [in] Size of the data structure; see CUpti_PmSampling_Start_Params_STRUCT_SIZE.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] PM sampling object on which sampling is started.
+    CUpti_PmSampling_Object* pPmSamplingObject;
+} CUpti_PmSampling_Start_Params;
+
+#define CUpti_PmSampling_Start_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_PmSampling_Start_Params, pPmSamplingObject)
+
+/**
+ * \brief Start the PM sampling. The GPU will start collecting the metrics data
+ * periodically based on the trigger mode and sampling interval passed in \ref CUpti_PmSampling_SetConfig_Params.
+ * The collected data will be stored in the hardware buffer.
+ *
+ * \param pParams A pointer to \ref CUpti_PmSampling_Start_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its members is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if PM sampling Start is called without enabling PM sampling,
+ * or if PM sampling is already started
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiPmSamplingStart(CUpti_PmSampling_Start_Params* pParams);
+
+/**
+ * \brief Params for cuptiPmSamplingStop
+ */
+typedef struct CUpti_PmSampling_Stop_Params
+{
+    /// [in] Size of the data structure; see CUpti_PmSampling_Stop_Params_STRUCT_SIZE.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] PM sampling object on which sampling is stopped.
+    CUpti_PmSampling_Object* pPmSamplingObject;
+} CUpti_PmSampling_Stop_Params;
+
+#define CUpti_PmSampling_Stop_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_PmSampling_Stop_Params, pPmSamplingObject)
+
+/**
+ * \brief Stop the PM sampling. The GPU will stop collecting the metrics data.
+ *
+ * \param pParams A pointer to \ref CUpti_PmSampling_Stop_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its members is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if PM sampling Stop is called without enabling PM sampling,
+ * or if PM sampling is already stopped
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiPmSamplingStop(CUpti_PmSampling_Stop_Params* pParams);
+
+/**
+ * \brief Params for cuptiPmSamplingDecodeData
+ */
+typedef struct CUpti_PmSampling_DecodeData_Params
+{
+    /// [in] Size of the data structure; see CUpti_PmSampling_DecodeData_Params_STRUCT_SIZE.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] PM sampling object.
+    CUpti_PmSampling_Object* pPmSamplingObject;
+    /// [in] Counter data image.
+    uint8_t* pCounterDataImage;
+    /// [in] Size of the counter data image.
+    size_t counterDataImageSize;
+    /// [out] Decode stop reason, see \ref CUpti_PmSampling_DecodeStopReason.
+    CUpti_PmSampling_DecodeStopReason decodeStopReason;
+    /// [out] overflow status for hardware buffer.
+    /// To avoid overflow, either increase hardwareBufferSize in \ref CUpti_PmSampling_SetConfig_Params (that struct has no maxSamples field)
+    /// or change the sampling interval. NOTE(review): original says "reduce the sampling interval", yet a longer interval yields fewer samples - confirm.
+    uint8_t overflow;
+} CUpti_PmSampling_DecodeData_Params;
+
+#define CUpti_PmSampling_DecodeData_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_PmSampling_DecodeData_Params, overflow)
+
+/**
+ * \brief Decode the metrics data stored in the hardware buffer to the counter data image.
+ *
+ * The decodeStopReason and overflow members of \p pParams are outputs of this call.
+ * \param pParams A pointer to \ref CUpti_PmSampling_DecodeData_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its members is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if PM sampling DecodeData is called without enabling PM sampling
+ * \retval CUPTI_ERROR_OUT_OF_MEMORY if there is record overflow in the hardware buffer
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiPmSamplingDecodeData(CUpti_PmSampling_DecodeData_Params* pParams);
+
+/**
+ * \brief Params for cuptiPmSamplingGetCounterAvailability
+ */
+typedef struct CUpti_PmSampling_GetCounterAvailability_Params
+{
+    /// [in] Size of the data structure; see CUpti_PmSampling_GetCounterAvailability_Params_STRUCT_SIZE.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Device index.
+    size_t deviceIndex;
+    /// [inout] Size of the counter availability image. When pCounterAvailabilityImage is NULL,
+    /// this field is used to return the size of the counter availability image.
+    size_t counterAvailabilityImageSize;
+    /// [out] Counter availability image.
+    uint8_t* pCounterAvailabilityImage;
+} CUpti_PmSampling_GetCounterAvailability_Params;
+#define CUpti_PmSampling_GetCounterAvailability_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_PmSampling_GetCounterAvailability_Params, pCounterAvailabilityImage)
+
+/**
+ * \brief Query counter availability information in a buffer which can be used to filter unavailable raw metrics on host.
+ * Note: This API may fail if any profiling or sampling session is active on the specified device.
+ *
+ * \param pParams A pointer to \ref CUpti_PmSampling_GetCounterAvailability_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its members is not valid
+ * \retval CUPTI_ERROR_INSUFFICIENT_PRIVILEGES if the user does not have sufficient privileges to perform the operation
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiPmSamplingGetCounterAvailability(CUpti_PmSampling_GetCounterAvailability_Params* pParams);
+
+/**
+ * \brief Params for cuptiPmSamplingGetCounterDataSize
+ */
+typedef struct CUpti_PmSampling_GetCounterDataSize_Params
+{
+    /// [in] Size of the data structure; see CUpti_PmSampling_GetCounterDataSize_Params_STRUCT_SIZE.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] PM sampling object.
+    CUpti_PmSampling_Object* pPmSamplingObject;
+    /// [in] Names of the metrics to be collected (numMetrics entries).
+    const char** pMetricNames;
+    /// [in] Number of metrics to be collected.
+    size_t numMetrics;
+    /// [in] Maximum number of samples to be stored in the counter data image.
+    uint32_t maxSamples;
+    /// [out] Size of the counter data image.
+    size_t counterDataSize;
+} CUpti_PmSampling_GetCounterDataSize_Params;
+#define CUpti_PmSampling_GetCounterDataSize_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_PmSampling_GetCounterDataSize_Params, counterDataSize)
+
+/**
+ * \brief Query the size of the counter data image which will be used to store the metrics data.
+ * The user needs to allocate the memory for the counter data image based on the size returned by this API.
+ *
+ * \param pParams A pointer to \ref CUpti_PmSampling_GetCounterDataSize_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its members is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if PM sampling GetCounterDataSize is called without enabling PM sampling
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiPmSamplingGetCounterDataSize(CUpti_PmSampling_GetCounterDataSize_Params* pParams);
+
+/**
+ * \brief Params for cuptiPmSamplingCounterDataImageInitialize
+ */
+typedef struct CUpti_PmSampling_CounterDataImage_Initialize_Params
+{
+    /// [in] Size of the data structure; see CUpti_PmSampling_CounterDataImage_Initialize_Params_STRUCT_SIZE.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] PM sampling object.
+    CUpti_PmSampling_Object* pPmSamplingObject;
+    /// [in] Size of the counter data image (presumably the size reported by cuptiPmSamplingGetCounterDataSize).
+    size_t counterDataSize;
+    /// [in] Counter data image.
+    uint8_t* pCounterData;
+} CUpti_PmSampling_CounterDataImage_Initialize_Params;
+#define CUpti_PmSampling_CounterDataImage_Initialize_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_PmSampling_CounterDataImage_Initialize_Params, pCounterData)
+
+/**
+ * \brief Initialize the counter data image to the CUPTI record format for storing the metric data.
+ *
+ * \param pParams A pointer to \ref CUpti_PmSampling_CounterDataImage_Initialize_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its members is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if PM sampling CounterDataInitialize is called without enabling PM sampling
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiPmSamplingCounterDataImageInitialize(CUpti_PmSampling_CounterDataImage_Initialize_Params* pParams);
+
+/**
+ * \brief Params for cuptiPmSamplingGetCounterDataInfo
+ */
+typedef struct CUpti_PmSampling_GetCounterDataInfo_Params
+{
+    /// [in] Size of the data structure; see CUpti_PmSampling_GetCounterDataInfo_Params_STRUCT_SIZE.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Counter data image.
+    const uint8_t* pCounterDataImage;
+    /// [in] Size of the counter data image.
+    size_t counterDataImageSize;
+    /// [out] Total number of samples in the counter data image.
+    size_t numTotalSamples;
+    /// [out] Number of populated samples.
+    size_t numPopulatedSamples;
+    /// [out] Number of samples that have been completed.
+    size_t numCompletedSamples;
+} CUpti_PmSampling_GetCounterDataInfo_Params;
+#define CUpti_PmSampling_GetCounterDataInfo_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_PmSampling_GetCounterDataInfo_Params, numCompletedSamples)
+
+/**
+ * \brief Get the counter data info, i.e. the total number of samples, the number of populated
+ * samples and the number of completed samples in a counter data image.
+ *
+ * \param pParams A pointer to \ref CUpti_PmSampling_GetCounterDataInfo_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its members is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiPmSamplingGetCounterDataInfo(CUpti_PmSampling_GetCounterDataInfo_Params* pParams);
+
+/**
+ * \brief Params for cuptiPmSamplingCounterDataGetSampleInfo
+ */
+typedef struct CUpti_PmSampling_CounterData_GetSampleInfo_Params
+{
+    /// [in] Size of the data structure; see CUpti_PmSampling_CounterData_GetSampleInfo_Params_STRUCT_SIZE.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] PM sampling object.
+    CUpti_PmSampling_Object* pPmSamplingObject;
+    /// [in] Counter data image.
+    const uint8_t* pCounterDataImage;
+    /// [in] Size of the counter data image.
+    size_t counterDataImageSize;
+    /// [in] Index of the sample whose info is queried.
+    size_t sampleIndex;
+    /// [out] Start timestamp of the sample.
+    uint64_t startTimestamp;
+    /// [out] End timestamp of the sample.
+    uint64_t endTimestamp;
+} CUpti_PmSampling_CounterData_GetSampleInfo_Params;
+#define CUpti_PmSampling_CounterData_GetSampleInfo_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_PmSampling_CounterData_GetSampleInfo_Params, endTimestamp)
+
+/**
+ * \brief Get the sample info (start and end timestamps) for the given sample index.
+ * Each sample is distinguished by its start and end timestamps.
+ *
+ * \param pParams A pointer to \ref CUpti_PmSampling_CounterData_GetSampleInfo_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its members is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiPmSamplingCounterDataGetSampleInfo(CUpti_PmSampling_CounterData_GetSampleInfo_Params* pParams);
+
+/** @} */ /* END CUPTI_PM_SAMPLING_API */
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // _CUPTI_PMSAMPLING_H_
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_profiler_host.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_profiler_host.h
new file mode 100644
index 0000000000000000000000000000000000000000..4e38ceb160791ae51fd681623d45dba1c688dda1
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_profiler_host.h
@@ -0,0 +1,541 @@
+/*
+ * Copyright 2024 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_PROFILER_HOST_H_)
+#define _CUPTI_PROFILER_HOST_H_
+
+/*
+CUPTI profiler host APIs
+This file contains the CUPTI profiling host APIs.
+*/
+#include <cupti_result.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <string>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_PROFILER_HOST_API CUPTI Profiler Host API
+ * Functions, types, and enums that implement the CUPTI Profiler Host API.
+ * @{
+ */
+#ifndef CUPTI_PROFILER_STRUCT_SIZE /* Size of type_ measured through the end of lastfield_; guarded so an earlier definition is reused. */
+#define CUPTI_PROFILER_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif /* CUPTI_PROFILER_STRUCT_SIZE */
+
+typedef enum CUpti_MetricType
+{
+    CUPTI_METRIC_TYPE_COUNTER = 0,    ///< counter metric
+    CUPTI_METRIC_TYPE_RATIO,          ///< ratio metric (implicit value 1)
+    CUPTI_METRIC_TYPE_THROUGHPUT,     ///< throughput metric (implicit value 2)
+    CUPTI_METRIC_TYPE__COUNT          ///< implicit value 3, i.e. the number of enumerators above
+} CUpti_MetricType;
+
+typedef enum CUpti_ProfilerType
+{
+    CUPTI_PROFILER_TYPE_RANGE_PROFILER,     ///< range profiler (implicit value 0)
+    CUPTI_PROFILER_TYPE_PM_SAMPLING,        ///< PM sampling (implicit value 1)
+    CUPTI_PROFILER_TYPE_PROFILER_INVALID    ///< invalid profiler type (implicit value 2)
+} CUpti_ProfilerType;
+
+typedef struct CUpti_Profiler_Host_Object CUpti_Profiler_Host_Object; ///< Declared but not defined in this header; the params structs below hold pointers to it.
+
+/**
+ * \brief Params for cuptiProfilerHostInitialize
+ */
+typedef struct CUpti_Profiler_Host_Initialize_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_Initialize_Params_STRUCT_SIZE).
+    /// The CUPTI client should set the size of the structure. CUPTI uses it to check which fields are
+    /// available in the structure, which preserves backward compatibility.
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] the profiler kind, one of CUpti_ProfilerType
+    CUpti_ProfilerType profilerType;
+    /// [in] chip name; accepted for chips supported at the time-of-release.
+    const char* pChipName;
+    /// [in] buffer with counter availability image - required for future chip support
+    const uint8_t* pCounterAvailabilityImage;
+    /// [out] profiler host object: a binary blob allocated by CUPTI, with which the host operations are associated.
+    CUpti_Profiler_Host_Object* pHostObject;
+} CUpti_Profiler_Host_Initialize_Params;
+
+#define CUpti_Profiler_Host_Initialize_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_Initialize_Params, pHostObject)
+
+/**
+ * \brief Create and initialize the profiler host object (CUpti_Profiler_Host_Object).
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_Initialize_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any parameter in \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostInitialize(CUpti_Profiler_Host_Initialize_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostDeinitialize
+ */
+typedef struct CUpti_Profiler_Host_Deinitialize_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_Deinitialize_Params_STRUCT_SIZE).
+    /// The CUPTI client should set the size of the structure. CUPTI uses it to check which fields are
+    /// available in the structure, which preserves backward compatibility.
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] the profiler host object allocated by CUPTI in cuptiProfilerHostInitialize
+    struct CUpti_Profiler_Host_Object* pHostObject;
+} CUpti_Profiler_Host_Deinitialize_Params;
+
+#define CUpti_Profiler_Host_Deinitialize_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_Deinitialize_Params, pHostObject)
+
+/**
+ * \brief Deinitialize and destroy the profiler host object (CUpti_Profiler_Host_Object).
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_Deinitialize_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any parameter in \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostDeinitialize(CUpti_Profiler_Host_Deinitialize_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostGetSupportedChips
+ */
+typedef struct CUpti_Profiler_Host_GetSupportedChips_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_GetSupportedChips_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [out] number of supported chips
+    size_t numChips;
+    /// [out] list of supported chip names
+    const char* const* ppChipNames;
+} CUpti_Profiler_Host_GetSupportedChips_Params;
+
+#define CUpti_Profiler_Host_GetSupportedChips_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_GetSupportedChips_Params, ppChipNames)
+
+/**
+ * \brief Get the list of supported chips.
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_GetSupportedChips_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any parameter in \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostGetSupportedChips(CUpti_Profiler_Host_GetSupportedChips_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostGetBaseMetrics
+ */
+typedef struct CUpti_Profiler_Host_GetBaseMetrics_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_GetBaseMetrics_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] the profiler host object allocated by CUPTI in cuptiProfilerHostInitialize
+    struct CUpti_Profiler_Host_Object* pHostObject;
+    /// [in] metric type (counter, ratio, throughput)
+    CUpti_MetricType metricType;
+    /// [out] list of base metrics of the queried metric type that are supported for the chip
+    const char** ppMetricNames;
+    /// [out] number of metrics
+    size_t numMetrics;
+} CUpti_Profiler_Host_GetBaseMetrics_Params;
+
+#define CUpti_Profiler_Host_GetBaseMetrics_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_GetBaseMetrics_Params, numMetrics)
+
+/**
+ * \brief Get the list of supported base metrics for the chip.
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_GetBaseMetrics_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any parameter in \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostGetBaseMetrics(CUpti_Profiler_Host_GetBaseMetrics_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostGetSubMetrics
+ */
+typedef struct CUpti_Profiler_Host_GetSubMetrics_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_GetSubMetrics_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] the profiler host object allocated by CUPTI in cuptiProfilerHostInitialize
+    CUpti_Profiler_Host_Object* pHostObject;
+    /// [in] the metric type of the queried metric
+    CUpti_MetricType metricType;
+    /// [in] name of the metric whose sub-metrics will be listed
+    const char* pMetricName;
+    /// [out] number of sub-metrics supported
+    size_t numOfSubmetrics;
+    /// [out] list of sub-metrics supported for the metric.
+    const char** ppSubMetrics;
+} CUpti_Profiler_Host_GetSubMetrics_Params;
+
+#define CUpti_Profiler_Host_GetSubMetrics_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_GetSubMetrics_Params, ppSubMetrics)
+
+/**
+ * \brief Get the list of supported sub-metrics for the metric.
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_GetSubMetrics_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any parameter in \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_METRIC_NAME if the metric name is not valid or not supported for the chip
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostGetSubMetrics(CUpti_Profiler_Host_GetSubMetrics_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostGetMetricProperties
+ */
+typedef struct CUpti_Profiler_Host_GetMetricProperties_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_GetMetricProperties_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] the profiler host object allocated by CUPTI in cuptiProfilerHostInitialize
+    CUpti_Profiler_Host_Object* pHostObject;
+    /// [in] name of the metric whose properties will be listed
+    const char* pMetricName;
+    /// [out] a short description of the metric
+    const char* pDescription;
+    /// [out] the hw unit associated with the metric
+    const char* pHwUnit;
+    /// [out] the dimension of the metric values
+    const char* pDimUnit;
+    /// [out] the metric type (counter, ratio or throughput)
+    CUpti_MetricType metricType;
+} CUpti_Profiler_Host_GetMetricProperties_Params;
+
+#define CUpti_Profiler_Host_GetMetricProperties_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_GetMetricProperties_Params, metricType)
+
+/**
+ * \brief Get the properties of the metric.
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_GetMetricProperties_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any parameter in \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_METRIC_NAME if the metric name is not valid or not supported for the chip
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostGetMetricProperties(CUpti_Profiler_Host_GetMetricProperties_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostGetRangeName
+ */
+typedef struct CUpti_Profiler_Host_GetRangeName_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_GetRangeName_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] the counter data image where profiling data has been decoded
+    const uint8_t* pCounterDataImage;
+    /// [in] size of counter data image
+    size_t counterDataImageSize;
+    /// [in] range index for which the range name will be queried
+    size_t rangeIndex;
+    /// [in] delimiter used in case of nested ranges, e.g. Range1<delimiter>Range2; default="/".
+    const char* delimiter;
+    /// [out] the range name.
+    /// Note: CUPTI allocates this memory internally, and
+    /// it is the user's responsibility to free the allocated memory.
+    const char* pRangeName;
+} CUpti_Profiler_Host_GetRangeName_Params;
+
+#define CUpti_Profiler_Host_GetRangeName_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_GetRangeName_Params, pRangeName)
+
+/**
+ * \brief Get the range name for the range index stored in the counter data.
+ * In the Range profiler, for Auto range mode the range name will be a numeric value
+ * assigned to the kernel based on execution order. For User range mode, the
+ * name of the range will be based on the range name provided by the user using
+ * the Push range API.
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_GetRangeName_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any parameter in \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostGetRangeName(CUpti_Profiler_Host_GetRangeName_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostEvaluateToGpuValues
+ */
+typedef struct CUpti_Profiler_Host_EvaluateToGpuValues_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_EvaluateToGpuValues_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] the profiler host object allocated by CUPTI in cuptiProfilerHostInitialize
+    CUpti_Profiler_Host_Object* pHostObject;
+    /// [in] the counter data image where profiling data has been decoded
+    const uint8_t* pCounterDataImage;
+    /// [in] size of counter data image
+    size_t counterDataImageSize;
+    /// [in] range index for which the metric values will be evaluated
+    size_t rangeIndex;
+    /// [in] the metrics for which GPU values will be evaluated for the range
+    const char** ppMetricNames;
+    /// [in] number of metrics
+    size_t numMetrics;
+    /// [out] output value(s) for the given metric(s) and range index
+    double* pMetricValues;
+} CUpti_Profiler_Host_EvaluateToGpuValues_Params;
+
+#define CUpti_Profiler_Host_EvaluateToGpuValues_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_EvaluateToGpuValues_Params, pMetricValues)
+
+/**
+ * \brief Evaluate the metric values for the range index stored in the counter data.
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_EvaluateToGpuValues_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any parameter in \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_METRIC_NAME if the metric name is not valid or not supported for the chip
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostEvaluateToGpuValues(CUpti_Profiler_Host_EvaluateToGpuValues_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostConfigAddMetrics
+ */
+typedef struct CUpti_Profiler_Host_ConfigAddMetrics_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_ConfigAddMetrics_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] the profiler host object allocated by CUPTI in cuptiProfilerHostInitialize
+    struct CUpti_Profiler_Host_Object* pHostObject;
+    /// [in] names of the metrics for which the config image will be generated
+    const char** ppMetricNames;
+    /// [in] number of metrics
+    size_t numMetrics;
+} CUpti_Profiler_Host_ConfigAddMetrics_Params;
+
+#define CUpti_Profiler_Host_ConfigAddMetrics_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_ConfigAddMetrics_Params, numMetrics)
+
+/**
+ * \brief Add the metrics to the profiler host object for generating the config image.
+ * The config image will have the required information to schedule the metrics for
+ * collecting the profiling data.
+ * Note: PM sampling only supports a single-pass config image.
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_ConfigAddMetrics_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any parameter in \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_METRIC_NAME if the metric name is not valid or not supported for the chip
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostConfigAddMetrics(CUpti_Profiler_Host_ConfigAddMetrics_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostGetConfigImageSize
+ */
+typedef struct CUpti_Profiler_Host_GetConfigImageSize_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_GetConfigImageSize_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] the profiler host object allocated by CUPTI in cuptiProfilerHostInitialize
+    CUpti_Profiler_Host_Object* pHostObject;
+    /// [out] the size of the config image; users need to allocate a buffer of this size for storing it
+    size_t configImageSize;
+} CUpti_Profiler_Host_GetConfigImageSize_Params;
+
+#define CUpti_Profiler_Host_GetConfigImageSize_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_GetConfigImageSize_Params, configImageSize)
+
+/**
+ * \brief Get the size of the config image for the metrics added to the profiler host object.
+ * Users need to allocate the buffer for storing the config image.
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_GetConfigImageSize_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any parameter in \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostGetConfigImageSize(CUpti_Profiler_Host_GetConfigImageSize_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostGetConfigImage
+ */
+typedef struct CUpti_Profiler_Host_GetConfigImage_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_GetConfigImage_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] the profiler host object allocated by CUPTI in cuptiProfilerHostInitialize
+    CUpti_Profiler_Host_Object* pHostObject;
+    /// [in] Number of bytes allocated for pConfigImage
+    size_t configImageSize;
+    /// [out] User-allocated buffer receiving the config image
+    uint8_t* pConfigImage;
+} CUpti_Profiler_Host_GetConfigImage_Params;
+
+#define CUpti_Profiler_Host_GetConfigImage_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_GetConfigImage_Params, pConfigImage)
+
+/**
+ * \brief Get the config image for the metrics added to the profiler host object.
+ * The user passes an allocated buffer in which the config image is stored.
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_GetConfigImage_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any parameter in \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostGetConfigImage(CUpti_Profiler_Host_GetConfigImage_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostGetNumOfPasses
+ */
+typedef struct CUpti_Profiler_Host_GetNumOfPasses_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_GetNumOfPasses_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] Number of bytes allocated for pConfigImage
+    size_t configImageSize;
+    /// [in] the config image buffer
+    uint8_t* pConfigImage;
+    /// [out] number of passes required for profiling the metrics scheduled in the config image
+    size_t numOfPasses;
+} CUpti_Profiler_Host_GetNumOfPasses_Params;
+
+#define CUpti_Profiler_Host_GetNumOfPasses_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_GetNumOfPasses_Params, numOfPasses)
+
+/**
+ * \brief Get the number of passes required for profiling the scheduled metrics in the config image.
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_GetNumOfPasses_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any parameter in \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostGetNumOfPasses(CUpti_Profiler_Host_GetNumOfPasses_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostGetMaxNumHardwareMetricsPerPass
+ */
+typedef struct CUpti_Profiler_Host_GetMaxNumHardwareMetricsPerPass_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_GetMaxNumHardwareMetricsPerPass_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] the profiler kind, one of CUpti_ProfilerType
+    CUpti_ProfilerType profilerType;
+    /// [in] chip name; accepted for chips supported at the time-of-release.
+    const char* pChipName;
+    /// [in] buffer with counter availability image - required for future chip support
+    uint8_t* pCounterAvailabilityImage;
+    /// [out] maximum number of metrics that can be scheduled in a pass
+    size_t maxMetricsPerPass;
+} CUpti_Profiler_Host_GetMaxNumHardwareMetricsPerPass_Params;
+
+#define CUpti_Profiler_Host_GetMaxNumHardwareMetricsPerPass_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_GetMaxNumHardwareMetricsPerPass_Params, maxMetricsPerPass)
+
+/**
+ * \brief Get the maximum number of hardware metrics (metric names which don't include the _sass_ keyword)
+ * that can be scheduled in a single pass for a chip. While this represents a theoretical upper limit,
+ * practical constraints may prevent reaching this threshold for a specific set of metrics. Furthermore,
+ * the maximum achievable value is contingent upon the characteristics and architecture of the chip in question.
+ *
+ * Use the cuptiProfilerHostGetNumOfPasses API for getting the actual number of passes required for
+ * collecting the profiling data for the scheduled metrics in a config image.
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_GetMaxNumHardwareMetricsPerPass_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any parameter in \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostGetMaxNumHardwareMetricsPerPass(CUpti_Profiler_Host_GetMaxNumHardwareMetricsPerPass_Params* pParams);
+
+/** @} */ /* END CUPTI_PROFILER_HOST_API */
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif
\ No newline at end of file
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_profiler_target.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_profiler_target.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8fc197073dcb3bdec1a7349d136ac03434dc932
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_profiler_target.h
@@ -0,0 +1,602 @@
+/*
+ * Copyright 2011-2023   NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_PROFILER_TARGET_H_)
+#define _CUPTI_PROFILER_TARGET_H_
+
+#include <cuda.h>
+#include <cupti_result.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_PROFILER_API CUPTI Profiling API
+ * Functions, types, and enums that implement the CUPTI Profiling API.
+ * @{
+ */
+#ifndef CUPTI_PROFILER_STRUCT_SIZE /* Size of type_ measured through the end of lastfield_; guarded so an earlier definition is reused. */
+#define CUPTI_PROFILER_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif /* CUPTI_PROFILER_STRUCT_SIZE */
+
+/**
+ * \brief Profiler range attribute
+ *
+ * A metric enabled in the session's configuration is collected separately per unique range-stack in the pass.
+ * This is an attribute to collect metrics around each kernel in a profiling session or in a user-defined range.
+ */
+typedef enum
+{
+    /**
+     * Invalid value (implicit value 0)
+     */
+    CUPTI_Range_INVALID,
+    /**
+     * Ranges are automatically defined around each kernel in a profiling session
+     */
+    CUPTI_AutoRange,
+    /**
+     * The range in which metric data is to be collected is defined by the user
+     */
+    CUPTI_UserRange,
+    /**
+     * Range count (implicit value 3, the number of enumerators above)
+     */
+    CUPTI_Range_COUNT,
+} CUpti_ProfilerRange;
+
+/**
+ * \brief Profiler replay attribute
+ *
+ * For metrics which require multipass collection, a replay of the GPU kernel(s) is required.
+ * This is an attribute which specifies how the replay of the kernel(s) to be measured is done.
+ */
+typedef enum
+{
+    /**
+     * Invalid value (implicit value 0)
+     */
+    CUPTI_Replay_INVALID,
+    /**
+     * Replay is done by the CUPTI user around the process
+     */
+    CUPTI_ApplicationReplay,
+    /**
+     * Replay is done around the kernel implicitly by CUPTI
+     */
+    CUPTI_KernelReplay,
+    /**
+     * Replay is done by the CUPTI user within a process
+     */
+    CUPTI_UserReplay,
+    /**
+     * Replay count (implicit value 4, the number of enumerators above)
+     */
+    CUPTI_Replay_COUNT,
+} CUpti_ProfilerReplayMode;
+
+/**
+ * \brief Params for cuptiProfilerInitialize; carries only the structSize and pPriv fields
+ */
+typedef struct CUpti_Profiler_Initialize_Params
+{
+    size_t structSize;                                      //!< [in] set to CUpti_Profiler_Initialize_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+} CUpti_Profiler_Initialize_Params;
+#define CUpti_Profiler_Initialize_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Initialize_Params, pPriv)
+
+/**
+ * \brief Params for cuptiProfilerDeInitialize; carries only the structSize and pPriv fields
+ */
+typedef struct CUpti_Profiler_DeInitialize_Params
+{
+    size_t structSize;                                      //!< [in] set to CUpti_Profiler_DeInitialize_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+} CUpti_Profiler_DeInitialize_Params;
+#define CUpti_Profiler_DeInitialize_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_DeInitialize_Params, pPriv)
+
+/**
+ * \brief Initializes the profiler interface
+ *
+ * Loads the required libraries in the process address space and sets up the hooks with the CUDA driver.
+ * \param pParams A pointer to \ref CUpti_Profiler_Initialize_Params
+ */
+CUptiResult CUPTIAPI cuptiProfilerInitialize(CUpti_Profiler_Initialize_Params *pParams);
+
+/**
+ * \brief Deinitializes the profiler interface; \p pParams points to a \ref CUpti_Profiler_DeInitialize_Params
+ */
+CUptiResult CUPTIAPI cuptiProfilerDeInitialize(CUpti_Profiler_DeInitialize_Params *pParams);
+
+/**
+ * \brief Input parameters that define the counterDataImage
+ */
+typedef struct CUpti_Profiler_CounterDataImageOptions
+{
+    size_t structSize;                                          //!< [in] CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE
+    void* pPriv;                                                //!< [in] assign to NULL
+
+    const uint8_t* pCounterDataPrefix;                          /**< [in] Address of CounterDataPrefix generated from NVPW_CounterDataBuilder_GetCounterDataPrefix().
+                                                                    Must be align(8).*/
+    size_t counterDataPrefixSize;                               //!< [in] Size of CounterDataPrefix generated from NVPW_CounterDataBuilder_GetCounterDataPrefix().
+    uint32_t maxNumRanges;                                      //!< [in] Maximum number of ranges that can be profiled
+    uint32_t maxNumRangeTreeNodes;                              //!< [in] Maximum number of RangeTree nodes; must be >= maxNumRanges
+    uint32_t maxRangeNameLength;                                //!< [in] Maximum string length of each RangeName, including the trailing NULL character
+} CUpti_Profiler_CounterDataImageOptions;
+#define CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE                       CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_CounterDataImageOptions, maxRangeNameLength)
+
+/**
+ * \brief Params for cuptiProfilerCounterDataImageCalculateSize
+ */
+typedef struct CUpti_Profiler_CounterDataImage_CalculateSize_Params
+{
+    size_t structSize;                                          //!< [in] CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE
+    void* pPriv;                                                //!< [in] assign to NULL
+
+    size_t sizeofCounterDataImageOptions;                       //!< [in] CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE
+    const CUpti_Profiler_CounterDataImageOptions* pOptions;     //!< [in] Pointer to Counter Data Image Options
+    size_t counterDataImageSize;                                //!< [out] Calculated size of the counter data image
+} CUpti_Profiler_CounterDataImage_CalculateSize_Params;
+#define CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE         CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_CounterDataImage_CalculateSize_Params, counterDataImageSize)
+
+/**
+ * \brief Params for cuptiProfilerCounterDataImageInitialize
+ */
+typedef struct CUpti_Profiler_CounterDataImage_Initialize_Params
+{
+    size_t structSize;                                          //!< [in] CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE
+    void* pPriv;                                                //!< [in] assign to NULL
+
+    size_t sizeofCounterDataImageOptions;                       //!< [in] CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE
+    const CUpti_Profiler_CounterDataImageOptions* pOptions;     //!< [in] Pointer to Counter Data Image Options
+    size_t counterDataImageSize;                                //!< [in] Size calculated from cuptiProfilerCounterDataImageCalculateSize
+    uint8_t* pCounterDataImage;                                 //!< [in] The counter data image buffer to be initialized.
+} CUpti_Profiler_CounterDataImage_Initialize_Params;
+#define CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE            CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_CounterDataImage_Initialize_Params, pCounterDataImage)
+
+/**
+ * \brief A CounterData image allocates space for values for each counter for each range.
+ *
+ * The user bears the responsibility of managing the counterDataImage allocations.
+ * CounterDataPrefix contains metadata about the metrics that will be stored in counterDataImage.
+ * Use these APIs to calculate the allocation size and initialize the counterData image.
+ */
+CUptiResult CUPTIAPI cuptiProfilerCounterDataImageCalculateSize(CUpti_Profiler_CounterDataImage_CalculateSize_Params* pParams);
+CUptiResult CUPTIAPI cuptiProfilerCounterDataImageInitialize(CUpti_Profiler_CounterDataImage_Initialize_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerCounterDataImageCalculateScratchBufferSize
+ */
+typedef struct CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    size_t counterDataImageSize;                            //!< [in] size calculated from cuptiProfilerCounterDataImageCalculateSize
+    uint8_t* pCounterDataImage;                             //!< [in]
+    size_t counterDataScratchBufferSize;                    //!< [out]
+} CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params;
+#define CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params_STRUCT_SIZE    CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params, counterDataScratchBufferSize)
+
+/**
+ * \brief Params for cuptiProfilerCounterDataImageInitializeScratchBuffer
+ */
+typedef struct CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    size_t counterDataImageSize;                            //!< [in] size calculated from cuptiProfilerCounterDataImageCalculateSize
+    uint8_t* pCounterDataImage;                             //!< [in]
+    size_t counterDataScratchBufferSize;                    //!< [in] size calculated using cuptiProfilerCounterDataImageCalculateScratchBufferSize
+    uint8_t* pCounterDataScratchBuffer;                     //!< [in] the scratch buffer to be initialized.
+} CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params;
+#define CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE       CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params, pCounterDataScratchBuffer)
+
+/**
+ * \brief A temporary storage for CounterData image needed for internal operations
+ *
+ * Use these APIs to calculate the allocation size and initialize counterData image scratch buffer.
+ */
+CUptiResult CUPTIAPI cuptiProfilerCounterDataImageCalculateScratchBufferSize(CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params* pParams);
+CUptiResult CUPTIAPI cuptiProfilerCounterDataImageInitializeScratchBuffer(CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerBeginSession
+ */
+typedef struct CUpti_Profiler_BeginSession_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_BeginSession_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    size_t counterDataImageSize;                            //!< [in] size calculated from cuptiProfilerCounterDataImageCalculateSize
+    uint8_t* pCounterDataImage;                             //!< [in] address of CounterDataImage
+    size_t counterDataScratchBufferSize;                    //!< [in] size calculated from cuptiProfilerCounterDataImageCalculateScratchBufferSize
+    uint8_t* pCounterDataScratchBuffer;                     //!< [in] address of CounterDataImage scratch buffer
+    uint8_t bDumpCounterDataInFile;                          //!< [in] [optional]
+    const char* pCounterDataFilePath;                        //!< [in] [optional]
+    CUpti_ProfilerRange range;                               //!< [in] CUpti_ProfilerRange
+    CUpti_ProfilerReplayMode replayMode;                     //!< [in] CUpti_ProfilerReplayMode
+    /* Replay options, required when replay is done by the CUPTI user */
+    size_t maxRangesPerPass;                                //!< [in] Maximum number of ranges that can be recorded in a single pass.
+    size_t maxLaunchesPerPass;                              //!< [in] Maximum number of kernel launches that can be recorded in a single pass; must be >= maxRangesPerPass.
+
+} CUpti_Profiler_BeginSession_Params;
+#define CUpti_Profiler_BeginSession_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_BeginSession_Params, maxLaunchesPerPass)
+/**
+ * \brief Params for cuptiProfilerEndSession
+ */
+typedef struct CUpti_Profiler_EndSession_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_EndSession_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_EndSession_Params;
+#define CUpti_Profiler_EndSession_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_EndSession_Params, ctx)
+
+/**
+ * \brief Begin profiling session sets up the profiling on the device
+ *
+ * This does not start the profiling, but the GPU resources needed for profiling are allocated.
+ * Outside of a session, the GPU will return to its normal operating state.
+ */
+CUptiResult CUPTIAPI cuptiProfilerBeginSession(CUpti_Profiler_BeginSession_Params* pParams);
+/**
+ * \brief Ends profiling session
+ *
+ * Frees up the GPU resources acquired for profiling.
+ * Outside of a session, the GPU will return to its normal operating state.
+ */
+CUptiResult CUPTIAPI cuptiProfilerEndSession(CUpti_Profiler_EndSession_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerSetConfig
+ */
+typedef struct CUpti_Profiler_SetConfig_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_SetConfig_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    const uint8_t* pConfig;                                 //!< [in] Config created by NVPW_RawMetricsConfig_GetConfigImage(). Must be align(8).
+    size_t configSize;                                      //!< [in] size of config
+    uint16_t minNestingLevel;                               //!< [in] the lowest nesting level to be profiled; must be >= 1
+    uint16_t numNestingLevels;                              //!< [in] the number of nesting levels to profile; must be >= 1
+    size_t passIndex;                                       //!< [in] Set this to zero for in-app replay; set this to the output of EndPass() for application replay
+    uint16_t targetNestingLevel;                            //!< [in] Set this to minNestingLevel for in-app replay; set this to the output of EndPass() for application replay
+} CUpti_Profiler_SetConfig_Params;
+
+#define CUpti_Profiler_SetConfig_Params_STRUCT_SIZE                    CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_SetConfig_Params, targetNestingLevel)
+
+/**
+ * \brief Params for cuptiProfilerUnsetConfig
+ */
+typedef struct CUpti_Profiler_UnsetConfig_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_UnsetConfig_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_UnsetConfig_Params;
+#define CUpti_Profiler_UnsetConfig_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_UnsetConfig_Params, ctx)
+
+/**
+ * \brief Set metrics configuration to be profiled
+ *
+ * Use these APIs to set the config to profile in a session. It can be used for advanced cases such as where multiple
+ * configurations are collected into a single CounterData Image on the need basis, without restarting the session.
+ */
+CUptiResult CUPTIAPI cuptiProfilerSetConfig(CUpti_Profiler_SetConfig_Params* pParams);
+/**
+ * \brief Unset metrics configuration profiled
+ *
+ */
+CUptiResult CUPTIAPI cuptiProfilerUnsetConfig(CUpti_Profiler_UnsetConfig_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerBeginPass
+ */
+typedef struct CUpti_Profiler_BeginPass_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_BeginPass_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_BeginPass_Params;
+#define CUpti_Profiler_BeginPass_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_BeginPass_Params, ctx)
+
+/**
+ * \brief Params for cuptiProfilerEndPass
+ */
+typedef struct CUpti_Profiler_EndPass_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_EndPass_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    uint16_t targetNestingLevel;                            //!< [out] The targetNestingLevel that will be collected by the *next* BeginPass.
+    size_t passIndex;                                       //!< [out] The passIndex that will be collected by the *next* BeginPass
+    uint8_t allPassesSubmitted;                             //!< [out] becomes true when the last pass has been queued to the GPU
+} CUpti_Profiler_EndPass_Params;
+#define CUpti_Profiler_EndPass_Params_STRUCT_SIZE                    CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_EndPass_Params, allPassesSubmitted)
+
+/**
+ * \brief Replay API: used for multipass collection.
+ *
+ * These APIs are used if the user chooses to perform the replay themselves (\ref CUPTI_UserReplay or \ref CUPTI_ApplicationReplay)
+ * for multipass collection of the metrics configurations.
+ * It's a no-op in case of \ref CUPTI_KernelReplay.
+ */
+CUptiResult CUPTIAPI cuptiProfilerBeginPass(CUpti_Profiler_BeginPass_Params* pParams);
+
+/**
+ * \brief Replay API: used for multipass collection.
+ *
+ * These APIs are used if the user chooses to perform the replay themselves (\ref CUPTI_UserReplay or \ref CUPTI_ApplicationReplay)
+ * for multipass collection of the metrics configurations.
+ * It's a no-op in case of \ref CUPTI_KernelReplay.
+ * Returns information for the next pass.
+ */
+CUptiResult CUPTIAPI cuptiProfilerEndPass(CUpti_Profiler_EndPass_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerEnableProfiling
+ */
+typedef struct CUpti_Profiler_EnableProfiling_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_EnableProfiling_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_EnableProfiling_Params;
+#define CUpti_Profiler_EnableProfiling_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_EnableProfiling_Params, ctx)
+
+/**
+ * \brief Params for cuptiProfilerDisableProfiling
+ */
+typedef struct CUpti_Profiler_DisableProfiling_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_DisableProfiling_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_DisableProfiling_Params;
+#define CUpti_Profiler_DisableProfiling_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_DisableProfiling_Params, ctx)
+
+/**
+ * \brief Enables Profiling
+ *
+ * In \ref CUPTI_AutoRange, these APIs are used to enable/disable profiling for the kernels to be executed in
+ * a profiling session.
+ */
+CUptiResult CUPTIAPI cuptiProfilerEnableProfiling(CUpti_Profiler_EnableProfiling_Params* pParams);
+
+/**
+ * \brief Disable Profiling
+ *
+ * In \ref CUPTI_AutoRange, these APIs are used to enable/disable profiling for the kernels to be executed in
+ * a profiling session.
+ */
+CUptiResult CUPTIAPI cuptiProfilerDisableProfiling(CUpti_Profiler_DisableProfiling_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerIsPassCollected
+ */
+typedef struct CUpti_Profiler_IsPassCollected_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_IsPassCollected_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    size_t numRangesDropped;                                //!< [out] number of ranges whose data was dropped in the processed pass
+    size_t numTraceBytesDropped;                            //!< [out] number of bytes not written to TraceBuffer due to buffer full
+    uint8_t onePassCollected;                               //!< [out] true if a pass was successfully decoded
+    uint8_t allPassesCollected;                             //!< [out] becomes true when the last pass has been decoded
+} CUpti_Profiler_IsPassCollected_Params;
+#define CUpti_Profiler_IsPassCollected_Params_STRUCT_SIZE            CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_IsPassCollected_Params, allPassesCollected)
+
+/**
+ * \brief Asynchronous call to query if the submitted pass to GPU is collected
+ *
+ */
+CUptiResult CUPTIAPI cuptiProfilerIsPassCollected(CUpti_Profiler_IsPassCollected_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerFlushCounterData
+ */
+typedef struct CUpti_Profiler_FlushCounterData_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_FlushCounterData_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    size_t numRangesDropped;                                //!< [out] number of ranges whose data was dropped in the processed passes
+    size_t numTraceBytesDropped;                            //!< [out] number of bytes not written to TraceBuffer due to buffer full
+} CUpti_Profiler_FlushCounterData_Params;
+#define CUpti_Profiler_FlushCounterData_Params_STRUCT_SIZE           CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_FlushCounterData_Params, numTraceBytesDropped)
+
+/**
+ * \brief Decode all the submitted passes
+ *
+ * Flush Counter data API to ensure every pass is decoded into the counterDataImage passed at beginSession.
+ * This will cause a CPU/GPU sync to collect all the undecoded passes.
+ */
+CUptiResult CUPTIAPI cuptiProfilerFlushCounterData(CUpti_Profiler_FlushCounterData_Params* pParams);
+
+typedef struct CUpti_Profiler_PushRange_Params /* Params for cuptiProfilerPushRange */
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_PushRange_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    const char* pRangeName;                                 //!< [in] specifies the range for subsequent launches; must not be NULL
+    size_t rangeNameLength;                                 //!< [in] assign to strlen(pRangeName) if known; if set to zero, the library will call strlen()
+} CUpti_Profiler_PushRange_Params;
+#define CUpti_Profiler_PushRange_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_PushRange_Params, rangeNameLength)
+
+typedef struct CUpti_Profiler_PopRange_Params /* Params for cuptiProfilerPopRange */
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_PopRange_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_PopRange_Params;
+#define CUpti_Profiler_PopRange_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_PopRange_Params, ctx)
+
+
+/**
+ * \brief Range APIs: Push user range
+ *
+ * Counter data is collected per unique range-stack, identified by a string label passed by the user.
+ * It's an invalid operation in case of \ref CUPTI_AutoRange.
+ */
+CUptiResult CUPTIAPI cuptiProfilerPushRange(CUpti_Profiler_PushRange_Params *pParams);
+
+/**
+ * \brief Range APIs: Pop user range
+ *
+ * Counter data is collected per unique range-stack, identified by a string label passed by the user.
+ * It's an invalid operation in case of \ref CUPTI_AutoRange.
+ */
+CUptiResult CUPTIAPI cuptiProfilerPopRange(CUpti_Profiler_PopRange_Params *pParams);
+
+/**
+ * \brief Params for cuptiProfilerGetCounterAvailability
+ */
+typedef struct CUpti_Profiler_GetCounterAvailability_Params
+{
+    size_t structSize;                                  //!< [in] CUpti_Profiler_GetCounterAvailability_Params_STRUCT_SIZE
+    void* pPriv;                                        //!< [in] assign to NULL
+    CUcontext ctx;                                      //!< [in] if NULL, the current CUcontext is used
+    size_t counterAvailabilityImageSize;                //!< [in/out] If `pCounterAvailabilityImage` is NULL, then the required size is returned in
+                                                        //!< `counterAvailabilityImageSize`, otherwise `counterAvailabilityImageSize` should be set to the size of
+                                                        //!< `pCounterAvailabilityImage`, and on return it would be overwritten with number of actual bytes copied
+    uint8_t* pCounterAvailabilityImage;                 //!< [in] buffer receiving counter availability image, may be NULL
+} CUpti_Profiler_GetCounterAvailability_Params;
+#define CUpti_Profiler_GetCounterAvailability_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_GetCounterAvailability_Params, pCounterAvailabilityImage)
+
+/**
+ * \brief Query counter availability
+ *
+ * Use this API to query counter availability information in a buffer which can be used to filter unavailable raw metrics on host.
+ * Note: This API may fail if any profiling or sampling session is active on the specified context or its device.
+ */
+CUptiResult CUPTIAPI cuptiProfilerGetCounterAvailability(CUpti_Profiler_GetCounterAvailability_Params *pParams);
+
+/// Generic support level enum for CUPTI
+typedef enum
+{
+    CUPTI_PROFILER_CONFIGURATION_UNKNOWN = 0, //!< Configuration support level unknown - either detection code errored out before setting this value, or unable to determine it
+    CUPTI_PROFILER_CONFIGURATION_UNSUPPORTED, //!< Profiling is unavailable.  For specific feature fields, this means that the current configuration of this feature does not work with profiling.  For instance, SLI-enabled devices do not support profiling, and this value would be returned for SLI on an SLI-enabled device.
+    CUPTI_PROFILER_CONFIGURATION_DISABLED,    //!< Profiling would be available for this configuration, but was disabled by the system
+    CUPTI_PROFILER_CONFIGURATION_SUPPORTED    //!< Profiling is supported.  For specific feature fields, this means that the current configuration of this feature works with profiling.  For instance, SLI-enabled devices do not support profiling, and this value would only be returned for devices which are not SLI-enabled.
+} CUpti_Profiler_Support_Level;
+
+/**
+ * \brief Profiler API types
+ */
+typedef enum
+{
+    CUPTI_PROFILER_RANGE_PROFILING = 0,       //!< CUPTI APIs for range based profiling (cuptiProfiler*)
+    CUPTI_PROFILER_PC_SAMPLING,               //!< CUPTI APIs collecting pc sampling data (cuptiPcSampling*)
+    CUPTI_PROFILER_SASS_METRICS,              //!< CUPTI APIs collecting SASS metrics data (cuptiSassMetrics*)
+    CUPTI_PROFILER_PM_SAMPLING,               //!< CUPTI APIs collecting PM Sampling data (cuptiPmSampling*)
+    CUPTI_PROFILER_UNKNOWN
+} CUpti_Profiler_API;
+
+/**
+ * \brief Params for cuptiProfilerDeviceSupported
+ */
+typedef struct
+{
+    size_t structSize;                                //!< [in] Must be CUpti_Profiler_DeviceSupported_Params_STRUCT_SIZE
+    void *pPriv;                                      //!< [in] assign to NULL
+    CUdevice cuDevice;                                //!< [in] if NULL, the current CUcontext is used
+
+    CUpti_Profiler_Support_Level isSupported;         //!< [out] overall SUPPORTED / UNSUPPORTED flag representing whether Profiling and PC Sampling APIs work on the given device and configuration. SUPPORTED if all following flags are SUPPORTED, UNSUPPORTED otherwise.
+
+    CUpti_Profiler_Support_Level architecture;        //!< [out] SUPPORTED if the device architecture level supports the Profiling API (Compute Capability >= 7.0), UNSUPPORTED otherwise
+    CUpti_Profiler_Support_Level sli;                 //!< [out] SUPPORTED if SLI is not enabled, UNSUPPORTED otherwise
+    CUpti_Profiler_Support_Level vGpu;                //!< [out] SUPPORTED if vGPU is supported and profiling is enabled, DISABLED if profiling is supported but not enabled, UNSUPPORTED otherwise
+    CUpti_Profiler_Support_Level confidentialCompute; //!< [out] SUPPORTED if confidential compute is not enabled, UNSUPPORTED otherwise
+    CUpti_Profiler_Support_Level cmp;                 //!< [out] SUPPORTED if not NVIDIA Crypto Mining Processors (CMP), UNSUPPORTED otherwise
+    CUpti_Profiler_Support_Level wsl;                 //!< [out] SUPPORTED if WSL supported, UNSUPPORTED otherwise
+    CUpti_Profiler_API     api;                       //!< [in] the CUPTI API type for which device support will be checked
+} CUpti_Profiler_DeviceSupported_Params;
+#define CUpti_Profiler_DeviceSupported_Params_STRUCT_SIZE CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_DeviceSupported_Params, api)
+
+/**
+ * \brief Query device compatibility with Profiling API
+ *
+ * Use this call to determine whether a compute device and configuration are compatible with the Profiling API.
+ * If the configuration does not support profiling, one of several flags will indicate why.
+ */
+CUptiResult CUPTIAPI cuptiProfilerDeviceSupported(CUpti_Profiler_DeviceSupported_Params *pParams);
+
+/** @} */ /* END CUPTI_METRIC_API */
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /*_CUPTI_PROFILER_TARGET_H_*/
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_range_profiler.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_range_profiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..ebcb25c0921bf473df943d63f476b877fdec2d66
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_range_profiler.h
@@ -0,0 +1,465 @@
+/*
+ * Copyright 2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_RANGE_PROFILER_H_)
+#define _CUPTI_RANGE_PROFILER_H_
+
+#include <cuda.h>
+#include <cupti_result.h>
+#include <cupti_profiler_target.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_RANGE_PROFILER_API CUPTI Range Profiling API
+ * Functions, types, and enums that implement the CUPTI Range Profiling API.
+ * @{
+ */
+#ifndef CUPTI_PROFILER_STRUCT_SIZE
+#define CUPTI_PROFILER_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif
+
+
+typedef struct CUpti_RangeProfiler_Object CUpti_RangeProfiler_Object;
+
+/**
+ * \brief Params for cuptiRangeProfilerSetConfig
+ */
+typedef struct CUpti_RangeProfiler_SetConfig_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Range Profiler Object.
+    CUpti_RangeProfiler_Object* pRangeProfilerObject;
+    /// [in] Size of the config image.
+    size_t configSize;
+    /// [in] Config image.
+    const uint8_t* pConfig;
+    /// [in] Size of the counter data image.
+    size_t counterDataImageSize;
+    /// [in] Counter data image.
+    uint8_t* pCounterDataImage;
+    /// [in] Profiling Range mode.
+    CUpti_ProfilerRange range;
+    /// [in] Replay mode.
+    CUpti_ProfilerReplayMode replayMode;
+    /// [in] Maximum number of ranges that can be profiled in a pass.
+    size_t maxRangesPerPass;
+    /// [in] number of nesting level to be profiled. For Auto range mode, this should be set to 1.
+    uint16_t numNestingLevels;
+    /// [in] minimum nesting level to be profiled.
+    uint16_t minNestingLevel;
+    /// [in] Pass index for the replay session.
+    size_t passIndex;
+    /// [in] Target nesting level for the replay session.
+    uint16_t targetNestingLevel;
+} CUpti_RangeProfiler_SetConfig_Params;
+
+#define CUpti_RangeProfiler_SetConfig_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_SetConfig_Params, targetNestingLevel)
+
+/**
+ * \brief Set the configuration for range profiler like maximum number of ranges per pass, number of nesting levels,
+ * range and replay mode and the config image which has scheduling info for metric collection.
+ * 
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_SetConfig_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ */
+CUptiResult CUPTIAPI cuptiRangeProfilerSetConfig(CUpti_RangeProfiler_SetConfig_Params* pParams);
+
+/**
+ * \brief Params for cuptiRangeProfilerEnable
+ */
+typedef struct CUpti_RangeProfiler_Enable_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Context to be used for profiling.
+    CUcontext ctx;
+    /// [out] Range Profiler Object.
+    CUpti_RangeProfiler_Object* pRangeProfilerObject;
+} CUpti_RangeProfiler_Enable_Params;
+#define CUpti_RangeProfiler_Enable_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_Enable_Params, pRangeProfilerObject)
+
+/**
+ * \brief Create a range profiler object and enable range profiling on the CUDA context.
+ *
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_Enable_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_OUT_OF_MEMORY if memory allocation fails while creating the range profiler object
+ * \retval CUPTI_ERROR_INSUFFICIENT_PRIVILEGES if the user does not have sufficient privileges to perform the operation
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiRangeProfilerEnable(CUpti_RangeProfiler_Enable_Params* pParams);
+
+/**
+ * \brief Params for cuptiRangeProfilerDisable
+ */
+typedef struct CUpti_RangeProfiler_Disable_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Range Profiler Object.
+    CUpti_RangeProfiler_Object* pRangeProfilerObject;
+} CUpti_RangeProfiler_Disable_Params;
+#define CUpti_RangeProfiler_Disable_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_Disable_Params, pRangeProfilerObject)
+
+/**
+ * \brief Disable the range profiler on the CUDA context and destroy the range profiler object.
+ * 
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_Disable_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ */
+CUptiResult CUPTIAPI cuptiRangeProfilerDisable(CUpti_RangeProfiler_Disable_Params* pParams);
+
+/**
+ * \brief Params for cuptiRangeProfilerStart
+ */
+typedef struct CUpti_RangeProfiler_Start_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Range Profiler Object.
+    CUpti_RangeProfiler_Object* pRangeProfilerObject;
+} CUpti_RangeProfiler_Start_Params;
+#define CUpti_RangeProfiler_Start_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_Start_Params, pRangeProfilerObject)
+
+/**
+ * \brief Start the range profiler.
+ * 
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_Start_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if range profiler Start is called without enabling range profiler
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiRangeProfilerStart(CUpti_RangeProfiler_Start_Params* pParams);
+
+/**
+ * \brief Params for cuptiRangeProfilerStop
+ */
+typedef struct CUpti_RangeProfiler_Stop_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Range Profiler Object.
+    CUpti_RangeProfiler_Object* pRangeProfilerObject;
+    /// [out] Pass index for the replay session.
+    size_t passIndex;
+    /// [out] Target nesting level for the replay session.
+    size_t targetNestingLevel;
+    /// [out] 1 if all passes are submitted to the GPU for collection, 0 otherwise.
+    uint8_t isAllPassSubmitted;
+} CUpti_RangeProfiler_Stop_Params;
+#define CUpti_RangeProfiler_Stop_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_Stop_Params, isAllPassSubmitted)
+
+/**
+ * \brief Stop the range profiler.
+ * 
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_Stop_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if range profiler Stop is called without enabling range profiler
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiRangeProfilerStop(CUpti_RangeProfiler_Stop_Params* pParams);
+
+/**
+ * \brief Params for cuptiRangeProfilerPushRange
+ */
+typedef struct CUpti_RangeProfiler_PushRange_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Range Profiler Object.
+    CUpti_RangeProfiler_Object* pRangeProfilerObject;
+    /// [in] Name of the range to be profiled (only valid for User range mode).
+    const char* pRangeName;
+} CUpti_RangeProfiler_PushRange_Params;
+#define CUpti_RangeProfiler_PushRange_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_PushRange_Params, pRangeName)
+
+/**
+ * \brief Add a new range to the Range Profiler with a given range name.
+ * For nested ranges, this API should be called again for the innermost range. For profiling the nested
+ * range, users need to set the values for minNestingLevel and numNestingLevels in the SetConfig API.
+ * 
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_PushRange_Params
+ * 
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if range profiler PushRange is called without enabling range profiler
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+*/
+CUptiResult CUPTIAPI cuptiRangeProfilerPushRange(CUpti_RangeProfiler_PushRange_Params* pParams);
+
+/**
+ * \brief Params for cuptiRangeProfilerPopRange
+ */
+typedef struct CUpti_RangeProfiler_PopRange_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Range Profiler Object.
+    CUpti_RangeProfiler_Object* pRangeProfilerObject;
+} CUpti_RangeProfiler_PopRange_Params;
+#define CUpti_RangeProfiler_PopRange_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_PopRange_Params, pRangeProfilerObject)
+
+/**
+ * \brief Pop the current range from the Range Profiler.
+ * The number of pop range API calls should be the same as the number of push range calls, in the same order.
+ * 
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_PopRange_Params
+ * 
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if range profiler PopRange is called without enabling range profiler
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+*/
+CUptiResult CUPTIAPI cuptiRangeProfilerPopRange(CUpti_RangeProfiler_PopRange_Params* pParams);
+
+/**
+ * \brief Params for cuptiRangeProfilerDecodeData
+ */
+typedef struct CUpti_RangeProfiler_DecodeData_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Range Profiler Object.
+    CUpti_RangeProfiler_Object* pRangeProfilerObject;
+    /// [out] Number of ranges dropped in the processed passes.
+    size_t numOfRangeDropped;
+} CUpti_RangeProfiler_DecodeData_Params;
+#define CUpti_RangeProfiler_DecodeData_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_DecodeData_Params, numOfRangeDropped)
+
+/**
+ * \brief Decode the profiling data stored in the hardware to the counter data image passed in the
+ * SetConfig API. This API should be called after cuptiRangeProfilerStop. The counter data image
+ * will be updated with the profiling data for the ranges profiled.
+ * 
+ * If the counter data image can store fewer ranges than the number of ranges profiled
+ * (= maxRangesPerPass in the SetConfig API), the counter data image will report the dropped ranges.
+ * 
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_DecodeData_Params
+ * 
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if range profiler DecodeData is called without enabling range profiler
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+*/
+CUptiResult CUPTIAPI cuptiRangeProfilerDecodeData(CUpti_RangeProfiler_DecodeData_Params* pParams);
+
+/**
+ * \brief Params for cuptiRangeProfilerGetCounterDataSize
+ */
+typedef struct CUpti_RangeProfiler_GetCounterDataSize_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Range Profiler Object.
+    CUpti_RangeProfiler_Object* pRangeProfilerObject;
+    /// [in] Names of the metrics to be collected.
+    const char** pMetricNames;
+    /// [in] Number of metrics to be collected.
+    size_t numMetrics;
+    /// [in] Maximum number of ranges to be stored in the counter data image.
+    size_t maxNumOfRanges;
+    /// [in] Maximum number of RangeTree nodes; must be >= maxNumOfRanges.
+    uint32_t maxNumRangeTreeNodes;
+    /// [out] Size of the counter data image.
+    size_t counterDataSize;
+} CUpti_RangeProfiler_GetCounterDataSize_Params;
+#define CUpti_RangeProfiler_GetCounterDataSize_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_GetCounterDataSize_Params, counterDataSize)
+
+/**
+ * \brief Get the size of the counter data image required to store the profiling data for the ranges profiled.
+ * 
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_GetCounterDataSize_Params
+ * 
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if range profiler GetCounterDataSize is called without enabling range profiler
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+*/
+CUptiResult CUPTIAPI cuptiRangeProfilerGetCounterDataSize(CUpti_RangeProfiler_GetCounterDataSize_Params* pParams);
+
+/**
+ * \brief Params for cuptiRangeProfilerCounterDataImageInitialize
+ */
+typedef struct CUpti_RangeProfiler_CounterDataImage_Initialize_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Range Profiler Object.
+    CUpti_RangeProfiler_Object* pRangeProfilerObject;
+    /// [in] Size of the counter data image.
+    size_t counterDataSize;
+    /// [in] Counter data image.
+    uint8_t* pCounterData;
+} CUpti_RangeProfiler_CounterDataImage_Initialize_Params;
+#define CUpti_RangeProfiler_CounterDataImage_Initialize_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_CounterDataImage_Initialize_Params, pCounterData)
+
+/**
+ * \brief Initialize the counter data image with the profiling data for the ranges profiled.
+ * 
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_CounterDataImage_Initialize_Params
+ * 
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if range profiler CounterDataImageInitialize is called without enabling range profiler
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+*/
+CUptiResult CUPTIAPI cuptiRangeProfilerCounterDataImageInitialize(CUpti_RangeProfiler_CounterDataImage_Initialize_Params* pParams);
+
+/**
+ * \brief Params for cuptiRangeProfilerGetCounterDataInfo
+ */
+typedef struct CUpti_RangeProfiler_GetCounterDataInfo_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Counter data image.
+    const uint8_t* pCounterDataImage;
+    /// [in] Size of the counter data image.
+    size_t counterDataImageSize;
+    /// [out] Number of ranges in the counter data image.
+    size_t numTotalRanges;
+} CUpti_RangeProfiler_GetCounterDataInfo_Params;
+#define CUpti_RangeProfiler_GetCounterDataInfo_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_GetCounterDataInfo_Params, numTotalRanges)
+
+/**
+ * \brief Get the number of ranges stored in the counter data image.
+ * 
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_GetCounterDataInfo_Params
+ * 
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+*/
+CUptiResult CUPTIAPI cuptiRangeProfilerGetCounterDataInfo(CUpti_RangeProfiler_GetCounterDataInfo_Params* pParams);
+
+/**
+ * \brief Params for cuptiRangeProfilerCounterDataGetRangeInfo
+ */
+typedef struct CUpti_RangeProfiler_CounterData_GetRangeInfo_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Counter data image.
+    const uint8_t* pCounterDataImage;
+    /// [in] Size of the counter data image.
+    size_t counterDataImageSize;
+    /// [in] Index of the range.
+    size_t rangeIndex;
+    /// [in] Range delimiter.
+    const char* rangeDelimiter;
+    /// [out] Name of the range at rangeIndex.
+    const char* rangeName;
+} CUpti_RangeProfiler_CounterData_GetRangeInfo_Params;
+#define CUpti_RangeProfiler_CounterData_GetRangeInfo_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_CounterData_GetRangeInfo_Params, rangeName)
+
+/**
+ * \brief Get the range name for the given range index.
+ * 
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_CounterData_GetRangeInfo_Params
+ * 
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+*/
+CUptiResult CUPTIAPI cuptiRangeProfilerCounterDataGetRangeInfo(CUpti_RangeProfiler_CounterData_GetRangeInfo_Params* pParams);
+
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /*_CUPTI_RANGE_PROFILER_H_*/
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_result.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_result.h
new file mode 100644
index 0000000000000000000000000000000000000000..10371ac621b2472086a4d68af4dc9bdc91f8e417
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_result.h
@@ -0,0 +1,360 @@
+/*
+ * Copyright 2010-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_RESULT_H_)
+#define _CUPTI_RESULT_H_
+
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_RESULT_API CUPTI Result Codes
+ * Error and result codes returned by CUPTI functions.
+ * @{
+ */
+
+/**
+ * \brief CUPTI result codes.
+ *
+ * Error and result codes returned by CUPTI functions.
+ */
+typedef enum {
+    /**
+     * No error.
+     */
+    CUPTI_SUCCESS                                       = 0,
+    /**
+     * One or more of the parameters is invalid.
+     */
+    CUPTI_ERROR_INVALID_PARAMETER                       = 1,
+    /**
+     * The device does not correspond to a valid CUDA device.
+     */
+    CUPTI_ERROR_INVALID_DEVICE                          = 2,
+    /**
+     * The context is NULL or not valid.
+     */
+    CUPTI_ERROR_INVALID_CONTEXT                         = 3,
+    /**
+     * The event domain id is invalid.
+     */
+    CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID                 = 4,
+    /**
+     * The event id is invalid.
+     */
+    CUPTI_ERROR_INVALID_EVENT_ID                        = 5,
+    /**
+     * The event name is invalid.
+     */
+    CUPTI_ERROR_INVALID_EVENT_NAME                      = 6,
+    /**
+     * The current operation cannot be performed due to dependency on
+     * other factors.
+     */
+    CUPTI_ERROR_INVALID_OPERATION                       = 7,
+    /**
+     * Unable to allocate enough memory to perform the requested
+     * operation.
+     */
+    CUPTI_ERROR_OUT_OF_MEMORY                           = 8,
+    /**
+     * An error occurred on the performance monitoring hardware.
+     */
+    CUPTI_ERROR_HARDWARE                                = 9,
+    /**
+     * The output buffer size is not sufficient to return all
+     * requested data.
+     */
+    CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT           = 10,
+    /**
+     * API is not implemented.
+     */
+    CUPTI_ERROR_API_NOT_IMPLEMENTED                     = 11,
+    /**
+     * The maximum limit is reached.
+     */
+    CUPTI_ERROR_MAX_LIMIT_REACHED                       = 12,
+    /**
+     * The object is not yet ready to perform the requested operation.
+     */
+    CUPTI_ERROR_NOT_READY                               = 13,
+    /**
+     * The current operation is not compatible with the current state
+     * of the object
+     */
+    CUPTI_ERROR_NOT_COMPATIBLE                          = 14,
+    /**
+     * CUPTI is unable to initialize its connection to the CUDA
+     * driver.
+     */
+    CUPTI_ERROR_NOT_INITIALIZED                         = 15,
+    /**
+     * The metric id is invalid.
+     */
+    CUPTI_ERROR_INVALID_METRIC_ID                        = 16,
+    /**
+     * The metric name is invalid.
+     */
+    CUPTI_ERROR_INVALID_METRIC_NAME                      = 17,
+    /**
+     * The queue is empty.
+     */
+    CUPTI_ERROR_QUEUE_EMPTY                              = 18,
+    /**
+     * Invalid handle (internal?).
+     */
+    CUPTI_ERROR_INVALID_HANDLE                           = 19,
+    /**
+     * Invalid stream.
+     */
+    CUPTI_ERROR_INVALID_STREAM                           = 20,
+    /**
+     * Invalid kind.
+     */
+    CUPTI_ERROR_INVALID_KIND                             = 21,
+    /**
+     * Invalid event value.
+     */
+    CUPTI_ERROR_INVALID_EVENT_VALUE                      = 22,
+    /**
+     * CUPTI is disabled due to conflicts with other enabled profilers
+     */
+    CUPTI_ERROR_DISABLED                                 = 23,
+    /**
+     * Invalid module.
+     */
+    CUPTI_ERROR_INVALID_MODULE                           = 24,
+    /**
+     * Invalid metric value.
+     */
+    CUPTI_ERROR_INVALID_METRIC_VALUE                     = 25,
+    /**
+     * The performance monitoring hardware is in use by another client.
+     */
+    CUPTI_ERROR_HARDWARE_BUSY                            = 26,
+    /**
+     * The attempted operation is not supported on the current
+     * system or device.
+     */
+    CUPTI_ERROR_NOT_SUPPORTED                            = 27,
+    /**
+     * Unified memory profiling is not supported on the system.
+     * Potential reason could be unsupported OS or architecture.
+     */
+    CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED               = 28,
+    /**
+     * Unified memory profiling is not supported on the device
+     */
+    CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_DEVICE     = 29,
+    /**
+     * Unified memory profiling is not supported on a multi-GPU
+     * configuration without P2P support between any pair of devices
+     */
+    CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_NON_P2P_DEVICES = 30,
+    /**
+     * Unified memory profiling is not supported under the
+     * Multi-Process Service (MPS) environment. CUDA 7.5 removes this
+     * restriction.
+     */
+    CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_WITH_MPS      = 31,
+    /**
+     * In CUDA 9.0, devices with compute capability 7.0 don't
+     * support CDP tracing
+     */
+    CUPTI_ERROR_CDP_TRACING_NOT_SUPPORTED                = 32,
+    /**
+     * Profiling on virtualized GPU is not supported.
+     */
+    CUPTI_ERROR_VIRTUALIZED_DEVICE_NOT_SUPPORTED         = 33,
+    /**
+     * Profiling results might be incorrect for CUDA applications
+     * compiled with nvcc version older than 9.0 for devices with
+     * compute capability 6.0 and 6.1.
+     * Profiling session will continue and CUPTI will notify it using this error code.
+     * User is advised to recompile the application code with nvcc version 9.0 or later.
+     * Ignore this warning if code is already compiled with the recommended nvcc version.
+     */
+    CUPTI_ERROR_CUDA_COMPILER_NOT_COMPATIBLE             = 34,
+    /**
+     * User doesn't have sufficient privileges which are required to
+     * start the profiling session.
+     * One possible reason for this may be that the NVIDIA driver or your system
+     * administrator may have restricted access to the NVIDIA GPU performance counters.
+     * To learn how to resolve this issue and find more information, please visit
+     * https://developer.nvidia.com/CUPTI_ERROR_INSUFFICIENT_PRIVILEGES
+     */
+    CUPTI_ERROR_INSUFFICIENT_PRIVILEGES                  = 35,
+    /**
+     * Legacy CUPTI Profiling API i.e. event API from the header cupti_events.h and
+     * metric API from the header cupti_metrics.h are not compatible with the
+     * Profiling API in the header cupti_profiler_target.h and Perfworks metrics API
+     * in the headers nvperf_host.h and nvperf_target.h.
+     */
+    CUPTI_ERROR_OLD_PROFILER_API_INITIALIZED             = 36,
+    /**
+     * Missing definition of the OpenACC API routine in the linked OpenACC library.
+     *
+     * One possible reason is that OpenACC library is linked statically in the
+     * user application, which might not have the definition of all the OpenACC
+     * API routines needed for the OpenACC profiling, as compiler might ignore
+     * definitions for the functions not used in the application. This issue
+     * can be mitigated by linking the OpenACC library dynamically.
+     */
+    CUPTI_ERROR_OPENACC_UNDEFINED_ROUTINE                = 37,
+    /**
+     * Legacy CUPTI Profiling API i.e. event API from the header cupti_events.h and
+     * metric API from the header cupti_metrics.h are not supported on devices with
+     * compute capability 7.5 and higher (i.e. Turing and later GPU architectures).
+     * These APIs are deprecated in the CUDA 12.8 release and will be removed in a future CUDA release.
+     * These are replaced by the host profiling API in the header cupti_profiler_host.h and
+     * target profiling API in the header cupti_range_profiler.h which are supported on
+     * devices with compute capability 7.0 and higher (i.e. Volta and later GPU
+     * architectures).
+     */
+    CUPTI_ERROR_LEGACY_PROFILER_NOT_SUPPORTED            = 38,
+    /**
+     * CUPTI doesn't allow multiple callback subscribers. Only a single subscriber
+     * can be registered at a time.
+     * Same error code is used when application is launched using NVIDIA tools
+     * like nvprof, Visual Profiler, Nsight Systems, Nsight Compute, cuda-gdb and
+     * cuda-memcheck.
+     */
+    CUPTI_ERROR_MULTIPLE_SUBSCRIBERS_NOT_SUPPORTED       = 39,
+    /**
+     * Profiling on virtualized GPU is not allowed by hypervisor.
+     */
+    CUPTI_ERROR_VIRTUALIZED_DEVICE_INSUFFICIENT_PRIVILEGES = 40,
+    /**
+     * Profiling and tracing are not allowed when confidential computing mode
+     * is enabled.
+     */
+    CUPTI_ERROR_CONFIDENTIAL_COMPUTING_NOT_SUPPORTED = 41,
+    /**
+     * CUPTI does not support NVIDIA Crypto Mining Processors (CMP).
+     * For more information, please visit https://developer.nvidia.com/ERR_NVCMPGPU
+    */
+    CUPTI_ERROR_CMP_DEVICE_NOT_SUPPORTED = 42,
+    /**
+     * Profiling on Multi-instance GPU (MIG) is not supported.
+     */
+    CUPTI_ERROR_MIG_DEVICE_NOT_SUPPORTED = 43,
+    /**
+     * Profiling on SLI device is not supported.
+     */
+    CUPTI_ERROR_SLI_DEVICE_NOT_SUPPORTED = 44,
+    /**
+     * Profiling on WSL device is not supported.
+     */
+    CUPTI_ERROR_WSL_DEVICE_NOT_SUPPORTED = 45,
+    /**
+     * An unknown internal error has occurred.
+     */
+    CUPTI_ERROR_UNKNOWN                                  = 999,
+    CUPTI_ERROR_FORCE_INT                                = 0x7fffffff
+} CUptiResult;
+
+/**
+ * \brief Get the descriptive string for a CUptiResult.
+ *
+ * Return the descriptive string for a CUptiResult in \p *str.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param result The result to get the string for
+ * \param str Returns the string
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p str is NULL or \p
+ * result is not a valid CUptiResult
+ */
+CUptiResult CUPTIAPI cuptiGetResultString(CUptiResult result, const char **str);
+
+/**
+ * @brief Get the descriptive message corresponding to error codes returned
+ * by CUPTI.
+ * 
+ * Return the descriptive error message for a CUptiResult in \p *str.
+ * \note \b Thread-safety: this function is thread safe.
+ * 
+ * \param result The result to get the descriptive error message for
+ * \param str Returns the error message string
+ * 
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p str is NULL or \p
+ * result is not a valid CUptiResult
+ * 
+ */
+
+CUptiResult CUPTIAPI cuptiGetErrorMessage(CUptiResult result, const char **str);
+
+/** @} */ /* END CUPTI_RESULT_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_RESULT_H_*/
+
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_runtime_cbid.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_runtime_cbid.h
new file mode 100644
index 0000000000000000000000000000000000000000..16b41e475fcfcf76e6507949699cd04c594becc9
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_runtime_cbid.h
@@ -0,0 +1,504 @@
+
+// *************************************************************************
+//      Definitions of indices for API functions, unique across entire API
+// *************************************************************************
+
+// This file is generated.  Any changes you make will be lost during the next clean build.
+// CUDA public interface, for type definitions and cu* function prototypes
+
+#if !defined(_CUPTI_RUNTIME_CBID_H)
+#define _CUPTI_RUNTIME_CBID_H
+
+typedef enum CUpti_runtime_api_trace_cbid_enum {
+    CUPTI_RUNTIME_TRACE_CBID_INVALID                                                       = 0,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDriverGetVersion_v3020                                    = 1,
+    CUPTI_RUNTIME_TRACE_CBID_cudaRuntimeGetVersion_v3020                                   = 2,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceCount_v3020                                      = 3,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceProperties_v3020                                 = 4,
+    CUPTI_RUNTIME_TRACE_CBID_cudaChooseDevice_v3020                                        = 5,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetChannelDesc_v3020                                      = 6,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCreateChannelDesc_v3020                                   = 7,
+    CUPTI_RUNTIME_TRACE_CBID_cudaConfigureCall_v3020                                       = 8,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020                                       = 9,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetLastError_v3020                                        = 10,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPeekAtLastError_v3020                                     = 11,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetErrorString_v3020                                      = 12,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020                                              = 13,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncSetCacheConfig_v3020                                  = 14,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncGetAttributes_v3020                                   = 15,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetDevice_v3020                                           = 16,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020                                           = 17,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetValidDevices_v3020                                     = 18,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetDeviceFlags_v3020                                      = 19,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMalloc_v3020                                              = 20,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocPitch_v3020                                         = 21,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFree_v3020                                                = 22,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocArray_v3020                                         = 23,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeArray_v3020                                           = 24,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocHost_v3020                                          = 25,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeHost_v3020                                            = 26,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostAlloc_v3020                                           = 27,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostGetDevicePointer_v3020                                = 28,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostGetFlags_v3020                                        = 29,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemGetInfo_v3020                                          = 30,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020                                              = 31,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2D_v3020                                            = 32,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArray_v3020                                       = 33,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArray_v3020                                     = 34,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArray_v3020                                     = 35,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArray_v3020                                   = 36,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyArrayToArray_v3020                                  = 37,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DArrayToArray_v3020                                = 38,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbol_v3020                                      = 39,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbol_v3020                                    = 40,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020                                         = 41,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArrayAsync_v3020                                  = 42,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArrayAsync_v3020                                = 43,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DAsync_v3020                                       = 44,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArrayAsync_v3020                                = 45,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArrayAsync_v3020                              = 46,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbolAsync_v3020                                 = 47,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbolAsync_v3020                               = 48,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020                                              = 49,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset2D_v3020                                            = 50,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020                                         = 51,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset2DAsync_v3020                                       = 52,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetSymbolAddress_v3020                                    = 53,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetSymbolSize_v3020                                       = 54,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindTexture_v3020                                         = 55,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindTexture2D_v3020                                       = 56,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindTextureToArray_v3020                                  = 57,
+    CUPTI_RUNTIME_TRACE_CBID_cudaUnbindTexture_v3020                                       = 58,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureAlignmentOffset_v3020                           = 59,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureReference_v3020                                 = 60,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindSurfaceToArray_v3020                                  = 61,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetSurfaceReference_v3020                                 = 62,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLSetGLDevice_v3020                                       = 63,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLRegisterBufferObject_v3020                              = 64,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLMapBufferObject_v3020                                   = 65,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLUnmapBufferObject_v3020                                 = 66,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLUnregisterBufferObject_v3020                            = 67,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLSetBufferObjectMapFlags_v3020                           = 68,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLMapBufferObjectAsync_v3020                              = 69,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLUnmapBufferObjectAsync_v3020                            = 70,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWGLGetDevice_v3020                                        = 71,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsGLRegisterImage_v3020                             = 72,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsGLRegisterBuffer_v3020                            = 73,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsUnregisterResource_v3020                          = 74,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceSetMapFlags_v3020                         = 75,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsMapResources_v3020                                = 76,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsUnmapResources_v3020                              = 77,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceGetMappedPointer_v3020                    = 78,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsSubResourceGetMappedArray_v3020                   = 79,
+    CUPTI_RUNTIME_TRACE_CBID_cudaVDPAUGetDevice_v3020                                      = 80,
+    CUPTI_RUNTIME_TRACE_CBID_cudaVDPAUSetVDPAUDevice_v3020                                 = 81,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsVDPAURegisterVideoSurface_v3020                   = 82,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsVDPAURegisterOutputSurface_v3020                  = 83,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D11GetDevice_v3020                                      = 84,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D11GetDevices_v3020                                     = 85,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D11SetDirect3DDevice_v3020                              = 86,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsD3D11RegisterResource_v3020                       = 87,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10GetDevice_v3020                                      = 88,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10GetDevices_v3020                                     = 89,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10SetDirect3DDevice_v3020                              = 90,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsD3D10RegisterResource_v3020                       = 91,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10RegisterResource_v3020                               = 92,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10UnregisterResource_v3020                             = 93,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10MapResources_v3020                                   = 94,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10UnmapResources_v3020                                 = 95,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceSetMapFlags_v3020                            = 96,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetSurfaceDimensions_v3020                   = 97,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedArray_v3020                         = 98,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedPointer_v3020                       = 99,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedSize_v3020                          = 100,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedPitch_v3020                         = 101,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9GetDevice_v3020                                       = 102,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9GetDevices_v3020                                      = 103,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9SetDirect3DDevice_v3020                               = 104,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9GetDirect3DDevice_v3020                               = 105,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsD3D9RegisterResource_v3020                        = 106,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9RegisterResource_v3020                                = 107,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnregisterResource_v3020                              = 108,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9MapResources_v3020                                    = 109,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnmapResources_v3020                                  = 110,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceSetMapFlags_v3020                             = 111,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetSurfaceDimensions_v3020                    = 112,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedArray_v3020                          = 113,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedPointer_v3020                        = 114,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedSize_v3020                           = 115,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedPitch_v3020                          = 116,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9Begin_v3020                                           = 117,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9End_v3020                                             = 118,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9RegisterVertexBuffer_v3020                            = 119,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnregisterVertexBuffer_v3020                          = 120,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9MapVertexBuffer_v3020                                 = 121,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnmapVertexBuffer_v3020                               = 122,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadExit_v3020                                          = 123,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetDoubleForDevice_v3020                                  = 124,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetDoubleForHost_v3020                                    = 125,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadSynchronize_v3020                                   = 126,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadGetLimit_v3020                                      = 127,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadSetLimit_v3020                                      = 128,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCreate_v3020                                        = 129,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamDestroy_v3020                                       = 130,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSynchronize_v3020                                   = 131,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamQuery_v3020                                         = 132,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventCreate_v3020                                         = 133,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateWithFlags_v3020                                = 134,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventRecord_v3020                                         = 135,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventDestroy_v3020                                        = 136,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventSynchronize_v3020                                    = 137,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventQuery_v3020                                          = 138,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventElapsedTime_v3020                                    = 139,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMalloc3D_v3020                                            = 140,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMalloc3DArray_v3020                                       = 141,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset3D_v3020                                            = 142,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset3DAsync_v3020                                       = 143,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3D_v3020                                            = 144,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DAsync_v3020                                       = 145,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadSetCacheConfig_v3020                                = 146,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamWaitEvent_v3020                                     = 147,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D11GetDirect3DDevice_v3020                              = 148,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10GetDirect3DDevice_v3020                              = 149,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadGetCacheConfig_v3020                                = 150,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPointerGetAttributes_v4000                                = 151,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostRegister_v4000                                        = 152,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostUnregister_v4000                                      = 153,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceCanAccessPeer_v4000                                 = 154,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceEnablePeerAccess_v4000                              = 155,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceDisablePeerAccess_v4000                             = 156,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPeerRegister_v4000                                        = 157,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPeerUnregister_v4000                                      = 158,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPeerGetDevicePointer_v4000                                = 159,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyPeer_v4000                                          = 160,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyPeerAsync_v4000                                     = 161,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeer_v4000                                        = 162,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeerAsync_v4000                                   = 163,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceReset_v3020                                         = 164,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020                                   = 165,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetLimit_v3020                                      = 166,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetLimit_v3020                                      = 167,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetCacheConfig_v3020                                = 168,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetCacheConfig_v3020                                = 169,
+    CUPTI_RUNTIME_TRACE_CBID_cudaProfilerInitialize_v4000                                  = 170,
+    CUPTI_RUNTIME_TRACE_CBID_cudaProfilerStart_v4000                                       = 171,
+    CUPTI_RUNTIME_TRACE_CBID_cudaProfilerStop_v4000                                        = 172,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetByPCIBusId_v4010                                 = 173,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetPCIBusId_v4010                                   = 174,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLGetDevices_v4010                                        = 175,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcGetEventHandle_v4010                                   = 176,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcOpenEventHandle_v4010                                  = 177,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcGetMemHandle_v4010                                     = 178,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcOpenMemHandle_v4010                                    = 179,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcCloseMemHandle_v4010                                   = 180,
+    CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetInfo_v4010                                        = 181,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncSetSharedMemConfig_v4020                              = 182,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetSharedMemConfig_v4020                            = 183,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetSharedMemConfig_v4020                            = 184,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCreateTextureObject_v5000                                 = 185,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDestroyTextureObject_v5000                                = 186,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectResourceDesc_v5000                        = 187,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectTextureDesc_v5000                         = 188,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCreateSurfaceObject_v5000                                 = 189,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDestroySurfaceObject_v5000                                = 190,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetSurfaceObjectResourceDesc_v5000                        = 191,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocMipmappedArray_v5000                                = 192,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetMipmappedArrayLevel_v5000                              = 193,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeMipmappedArray_v5000                                  = 194,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindTextureToMipmappedArray_v5000                         = 195,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceGetMappedMipmappedArray_v5000             = 196,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamAddCallback_v5000                                   = 197,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCreateWithFlags_v5000                               = 198,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectResourceViewDesc_v5000                    = 199,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetAttribute_v5000                                  = 200,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamDestroy_v5050                                       = 201,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCreateWithPriority_v5050                            = 202,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetPriority_v5050                                   = 203,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetFlags_v5050                                      = 204,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetStreamPriorityRange_v5050                        = 205,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocManaged_v6000                                       = 206,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000           = 207,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamAttachMemAsync_v6000                                = 208,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetErrorName_v6050                                        = 209,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6050           = 210,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000                                        = 211,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceFlags_v7000                                      = 212,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_ptsz_v7000                                         = 213,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_ptsz_v7000                                   = 214,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_ptds_v7000                                         = 215,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2D_ptds_v7000                                       = 216,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArray_ptds_v7000                                  = 217,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArray_ptds_v7000                                = 218,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArray_ptds_v7000                                = 219,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArray_ptds_v7000                              = 220,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyArrayToArray_ptds_v7000                             = 221,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DArrayToArray_ptds_v7000                           = 222,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbol_ptds_v7000                                 = 223,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbol_ptds_v7000                               = 224,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_ptsz_v7000                                    = 225,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArrayAsync_ptsz_v7000                             = 226,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArrayAsync_ptsz_v7000                           = 227,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DAsync_ptsz_v7000                                  = 228,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArrayAsync_ptsz_v7000                           = 229,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArrayAsync_ptsz_v7000                         = 230,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbolAsync_ptsz_v7000                            = 231,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbolAsync_ptsz_v7000                          = 232,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset_ptds_v7000                                         = 233,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset2D_ptds_v7000                                       = 234,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_ptsz_v7000                                    = 235,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset2DAsync_ptsz_v7000                                  = 236,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetPriority_ptsz_v7000                              = 237,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetFlags_ptsz_v7000                                 = 238,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSynchronize_ptsz_v7000                              = 239,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamQuery_ptsz_v7000                                    = 240,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamAttachMemAsync_ptsz_v7000                           = 241,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventRecord_ptsz_v7000                                    = 242,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset3D_ptds_v7000                                       = 243,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset3DAsync_ptsz_v7000                                  = 244,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3D_ptds_v7000                                       = 245,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DAsync_ptsz_v7000                                  = 246,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamWaitEvent_ptsz_v7000                                = 247,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamAddCallback_ptsz_v7000                              = 248,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeer_ptds_v7000                                   = 249,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeerAsync_ptsz_v7000                              = 250,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000  = 251,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_v8000                                    = 252,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_ptsz_v8000                               = 253,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemAdvise_v8000                                           = 254,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetP2PAttribute_v8000                               = 255,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsEGLRegisterImage_v7000                            = 256,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerConnect_v7000                            = 257,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerDisconnect_v7000                         = 258,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerAcquireFrame_v7000                       = 259,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerReleaseFrame_v7000                       = 260,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerConnect_v7000                            = 261,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerDisconnect_v7000                         = 262,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerPresentFrame_v7000                       = 263,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerReturnFrame_v7000                        = 264,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceGetMappedEglFrame_v7000                   = 265,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemRangeGetAttribute_v8000                                = 266,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemRangeGetAttributes_v8000                               = 267,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerConnectWithFlags_v7000                   = 268,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000                             = 269,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_ptsz_v9000                        = 270,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateFromEGLSync_v9000                              = 271,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000                  = 272,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncSetAttribute_v9000                                    = 273,
+    CUPTI_RUNTIME_TRACE_CBID_cudaImportExternalMemory_v10000                               = 274,
+    CUPTI_RUNTIME_TRACE_CBID_cudaExternalMemoryGetMappedBuffer_v10000                      = 275,
+    CUPTI_RUNTIME_TRACE_CBID_cudaExternalMemoryGetMappedMipmappedArray_v10000              = 276,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDestroyExternalMemory_v10000                              = 277,
+    CUPTI_RUNTIME_TRACE_CBID_cudaImportExternalSemaphore_v10000                            = 278,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_v10000                      = 279,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_ptsz_v10000                 = 280,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_v10000                        = 281,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_ptsz_v10000                   = 282,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDestroyExternalSemaphore_v10000                           = 283,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchHostFunc_v10000                                     = 284,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchHostFunc_ptsz_v10000                                = 285,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphCreate_v10000                                        = 286,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeGetParams_v10000                           = 287,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeSetParams_v10000                           = 288,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddKernelNode_v10000                                 = 289,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNode_v10000                                 = 290,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeGetParams_v10000                           = 291,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParams_v10000                           = 292,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemsetNode_v10000                                 = 293,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemsetNodeGetParams_v10000                           = 294,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemsetNodeSetParams_v10000                           = 295,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddHostNode_v10000                                   = 296,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphHostNodeGetParams_v10000                             = 297,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddChildGraphNode_v10000                             = 298,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphChildGraphNodeGetGraph_v10000                        = 299,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddEmptyNode_v10000                                  = 300,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphClone_v10000                                         = 301,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeFindInClone_v10000                               = 302,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetType_v10000                                   = 303,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetRootNodes_v10000                                  = 304,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependencies_v10000                           = 305,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependentNodes_v10000                         = 306,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddDependencies_v10000                               = 307,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphRemoveDependencies_v10000                            = 308,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphDestroyNode_v10000                                   = 309,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiate_v10000                                   = 310,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000                                        = 311,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000                                   = 312,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecDestroy_v10000                                   = 313,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphDestroy_v10000                                       = 314,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCapture_v10000                                 = 315,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCapture_ptsz_v10000                            = 316,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamIsCapturing_v10000                                  = 317,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamIsCapturing_ptsz_v10000                             = 318,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamEndCapture_v10000                                   = 319,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamEndCapture_ptsz_v10000                              = 320,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphHostNodeSetParams_v10000                             = 321,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetNodes_v10000                                      = 322,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetEdges_v10000                                      = 323,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v10010                               = 324,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_ptsz_v10010                          = 325,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecKernelNodeSetParams_v10010                       = 326,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadExchangeStreamCaptureMode_v10010                    = 327,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetNvSciSyncAttributes_v10020                       = 328,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyAvailableDynamicSMemPerBlock_v10200              = 329,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetFlags_v10200                                     = 330,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetFlags_ptsz_v10200                                = 331,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParams_v10020                       = 332,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemsetNodeSetParams_v10020                       = 333,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecHostNodeSetParams_v10020                         = 334,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecUpdate_v10020                                    = 335,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetFuncBySymbol_v11000                                    = 336,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCtxResetPersistingL2Cache_v11000                          = 337,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeCopyAttributes_v11000                      = 338,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeGetAttribute_v11000                        = 339,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeSetAttribute_v11000                        = 340,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCopyAttributes_v11000                               = 341,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCopyAttributes_ptsz_v11000                          = 342,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetAttribute_v11000                                 = 343,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetAttribute_ptsz_v11000                            = 344,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetAttribute_v11000                                 = 345,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetAttribute_ptsz_v11000                            = 346,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetTexture1DLinearMaxWidth_v11010                   = 347,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphUpload_v10000                                        = 348,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphUpload_ptsz_v10000                                   = 349,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNodeToSymbol_v11010                         = 350,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNodeFromSymbol_v11010                       = 351,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNode1D_v11010                               = 352,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParamsToSymbol_v11010                   = 353,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParamsFromSymbol_v11010                 = 354,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParams1D_v11010                         = 355,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParamsToSymbol_v11010               = 356,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParamsFromSymbol_v11010             = 357,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParams1D_v11010                     = 358,
+    CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetSparseProperties_v11010                           = 359,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMipmappedArrayGetSparseProperties_v11010                  = 360,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecChildGraphNodeSetParams_v11010                   = 361,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddEventRecordNode_v11010                            = 362,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventRecordNodeGetEvent_v11010                       = 363,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventRecordNodeSetEvent_v11010                       = 364,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddEventWaitNode_v11010                              = 365,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventWaitNodeGetEvent_v11010                         = 366,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventWaitNodeSetEvent_v11010                         = 367,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecEventRecordNodeSetEvent_v11010                   = 368,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecEventWaitNodeSetEvent_v11010                     = 369,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventRecordWithFlags_v11010                               = 370,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventRecordWithFlags_ptsz_v11010                          = 371,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetDefaultMemPool_v11020                            = 372,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocAsync_v11020                                        = 373,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocAsync_ptsz_v11020                                   = 374,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeAsync_v11020                                          = 375,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeAsync_ptsz_v11020                                     = 376,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolTrimTo_v11020                                      = 377,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolSetAttribute_v11020                                = 378,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolGetAttribute_v11020                                = 379,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolSetAccess_v11020                                   = 380,
+    CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetPlane_v11020                                      = 381,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolGetAccess_v11020                                   = 382,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolCreate_v11020                                      = 383,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolDestroy_v11020                                     = 384,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetMemPool_v11020                                   = 385,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetMemPool_v11020                                   = 386,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolExportToShareableHandle_v11020                     = 387,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolImportFromShareableHandle_v11020                   = 388,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolExportPointer_v11020                               = 389,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolImportPointer_v11020                               = 390,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocFromPoolAsync_v11020                                = 391,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocFromPoolAsync_ptsz_v11020                           = 392,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_v2_v11020                   = 393,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_v2_ptsz_v11020              = 394,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_v2_v11020                     = 395,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_v2_ptsz_v11020                = 396,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddExternalSemaphoresSignalNode_v11020               = 397,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresSignalNodeGetParams_v11020         = 398,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresSignalNodeSetParams_v11020         = 399,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddExternalSemaphoresWaitNode_v11020                 = 400,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresWaitNodeGetParams_v11020           = 401,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresWaitNodeSetParams_v11020           = 402,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecExternalSemaphoresSignalNodeSetParams_v11020     = 403,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecExternalSemaphoresWaitNodeSetParams_v11020       = 404,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceFlushGPUDirectRDMAWrites_v11030                     = 405,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDriverEntryPoint_v11030                                = 406,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDriverEntryPoint_ptsz_v11030                           = 407,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphDebugDotPrint_v11030                                 = 408,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v2_v11030                            = 409,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v2_ptsz_v11030                       = 410,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_v11030                    = 411,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_ptsz_v11030               = 412,
+    CUPTI_RUNTIME_TRACE_CBID_cudaUserObjectCreate_v11030                                   = 413,
+    CUPTI_RUNTIME_TRACE_CBID_cudaUserObjectRetain_v11030                                   = 414,
+    CUPTI_RUNTIME_TRACE_CBID_cudaUserObjectRelease_v11030                                  = 415,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphRetainUserObject_v11030                              = 416,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphReleaseUserObject_v11030                             = 417,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiateWithFlags_v11040                          = 418,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemAllocNode_v11040                               = 419,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemAllocNodeGetParams_v11040                         = 420,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemFreeNode_v11040                                = 421,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemFreeNodeGetParams_v11040                          = 422,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGraphMemTrim_v11040                                 = 423,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetGraphMemAttribute_v11040                         = 424,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetGraphMemAttribute_v11040                         = 425,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeSetEnabled_v11060                                = 426,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetEnabled_v11060                                = 427,
+    CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetMemoryRequirements_v11060                         = 428,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMipmappedArrayGetMemoryRequirements_v11060                = 429,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060                                    = 430,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_ptsz_v11060                               = 431,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxPotentialClusterSize_v11070                   = 432,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveClusters_v11070                         = 433,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCreateTextureObject_v2_v11080                             = 434,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectTextureDesc_v2_v11080                     = 435,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiateWithParams_v12000                         = 436,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiateWithParams_ptsz_v12000                    = 437,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecGetFlags_v12000                                  = 438,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetKernel_v12000                                          = 439,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceProperties_v2_v12000                             = 440,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetId_v12000                                        = 441,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetId_ptsz_v12000                                   = 442,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiate_v12000                                   = 443,
+    CUPTI_RUNTIME_TRACE_CBID_cudaInitDevice_v12000                                         = 444,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddNode_v12020                                       = 445,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeSetParams_v12020                                 = 446,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecNodeSetParams_v12020                             = 447,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemAdvise_v2_v12020                                       = 448,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_v2_v12020                                = 449,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_v2_ptsz_v12020                           = 450,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncGetName_v12030                                        = 451,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCaptureToGraph_v12030                          = 452,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCaptureToGraph_ptsz_v12030                     = 453,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphConditionalHandleCreate_v12030                       = 454,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetEdges_v2_v12030                                   = 455,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependencies_v2_v12030                        = 456,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependentNodes_v2_v12030                      = 457,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddDependencies_v2_v12030                            = 458,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphRemoveDependencies_v2_v12030                         = 459,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddNode_v2_v12030                                    = 460,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v3_v12030                            = 461,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v3_ptsz_v12030                       = 462,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_v2_v12030                 = 463,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_v2_ptsz_v12030            = 464,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceRegisterAsyncNotification_v12040                    = 465,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceUnregisterAsyncNotification_v12040                  = 466,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncGetParamInfo_v12040                                   = 467,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDriverEntryPointByVersion_v12050                       = 468,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDriverEntryPointByVersion_ptsz_v12050                  = 469,
+    CUPTI_RUNTIME_TRACE_CBID_cuda470_v12060                                                = 470,
+    CUPTI_RUNTIME_TRACE_CBID_cuda471_v12060                                                = 471,
+    CUPTI_RUNTIME_TRACE_CBID_cuda472_v12060                                                = 472,
+    CUPTI_RUNTIME_TRACE_CBID_cuda473_v12060                                                = 473,
+    CUPTI_RUNTIME_TRACE_CBID_cuda474_v12060                                                = 474,
+    CUPTI_RUNTIME_TRACE_CBID_cuda475_v12060                                                = 475,
+    CUPTI_RUNTIME_TRACE_CBID_cuda476_v12060                                                = 476,
+    CUPTI_RUNTIME_TRACE_CBID_cuda477_v12060                                                = 477,
+    CUPTI_RUNTIME_TRACE_CBID_cuda478_v12060                                                = 478,
+    CUPTI_RUNTIME_TRACE_CBID_cuda479_v12060                                                = 479,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetDevice_v12080                                    = 480,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetDevice_ptsz_v12080                               = 481,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyBatchAsync_v12080                                   = 482,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyBatchAsync_ptsz_v12080                              = 483,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DBatchAsync_v12080                                 = 484,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DBatchAsync_ptsz_v12080                            = 485,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventElapsedTime_v2_v12080                                = 486,
+    CUPTI_RUNTIME_TRACE_CBID_SIZE                                                          = 487,
+    CUPTI_RUNTIME_TRACE_CBID_FORCE_INT                                                     = 0x7fffffff
+} CUpti_runtime_api_trace_cbid;
+
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_sass_metrics.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_sass_metrics.h
new file mode 100644
index 0000000000000000000000000000000000000000..acb59cf8e5882a5ff13b4a1b0fdc6bc7b0ec47f7
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_sass_metrics.h
@@ -0,0 +1,436 @@
+/*
+ * Copyright 2023 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_SASS_METRICS_H_)
+#define _CUPTI_SASS_METRICS_H_
+
+#include <cuda.h>
+#include <cupti_result.h>
+#include <cupti_profiler_target.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_SASS_METRICS_API CUPTI SASS Metrics API
+ * Functions, types, and enums that implement the CUPTI SASS Metrics API.
+ * @{
+ */
+
+typedef enum
+{
+    /// SASS metric data will be collected at GPU level.
+    /// In the CUpti_SassMetricsGetDataProperties_Params struct, numOfInstances will be equal to 1.
+    CUPTI_SASS_METRICS_OUTPUT_GRANULARITY_GPU = 0,
+
+    /// SASS metric data will be collected at SM level.
+    /// In the CUpti_SassMetricsGetDataProperties_Params struct, numOfInstances will be equal to the number of SMs in the GPU.
+    CUPTI_SASS_METRICS_OUTPUT_GRANULARITY_SM = 1,
+
+    /// SASS metric data will be collected at SM sub-partition level.
+    /// In the CUpti_SassMetricsGetDataProperties_Params struct, numOfInstances will be equal to the number of SM sub-partitions in the GPU.
+    CUPTI_SASS_METRICS_OUTPUT_GRANULARITY_SMSP = 2,
+
+    CUPTI_SASS_METRICS_OUTPUT_GRANULARITY_INVALID ///< one past the last granularity level listed above (implicit value 3)
+} CUpti_SassMetrics_OutputGranularity;
+
+typedef struct CUpti_SassMetrics_MetricDetails
+{
+    /// unique ID for the SASS metric; this is the ID supplied in CUpti_SassMetrics_Config::metricId
+    uint64_t metricId;
+    /// name of the metric
+    const char* pMetricName;
+    /// description of the metric
+    const char* pMetricDescription;
+} CUpti_SassMetrics_MetricDetails;
+
+/**
+ * \brief Params for cuptiSassMetricsGetNumOfMetrics()
+ */
+typedef struct CUpti_SassMetrics_GetNumOfMetrics_Params
+{
+    /// [in] should be equal to CUpti_SassMetrics_GetNumOfMetrics_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] name of the chip for which the metrics will be queried
+    const char* pChipName;
+    /// [out] number of metrics supported for the queried chip; can be passed as numOfMetrics to cuptiSassMetricsGetMetrics()
+    size_t numOfMetrics;
+} CUpti_SassMetrics_GetNumOfMetrics_Params;
+
+#define CUpti_SassMetrics_GetNumOfMetrics_Params_STRUCT_SIZE                 CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetrics_GetNumOfMetrics_Params, numOfMetrics)
+
+/**
+ * \brief Get the number of supported SASS metrics for the chip named in \p pParams (returned in numOfMetrics).
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetrics_GetNumOfMetrics_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric collection
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsGetNumOfMetrics(CUpti_SassMetrics_GetNumOfMetrics_Params* pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsGetMetrics()
+ */
+typedef struct CUpti_SassMetrics_GetMetrics_Params
+{
+    /// [in] should be equal to CUpti_SassMetrics_GetMetrics_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] name of the chip for which the metrics will be queried
+    const char* pChipName;
+    /// [in] number of metrics supported for the queried chip (can be queried using cuptiSassMetricsGetNumOfMetrics())
+    size_t numOfMetrics;
+    /// [out] list of metrics supported for the queried chip (presumably a caller-allocated array of numOfMetrics entries - TODO confirm)
+    CUpti_SassMetrics_MetricDetails* pMetricsList;
+} CUpti_SassMetrics_GetMetrics_Params;
+#define CUpti_SassMetrics_GetMetrics_Params_STRUCT_SIZE                 CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetrics_GetMetrics_Params, pMetricsList)
+
+/**
+ * \brief Get the list of all supported SASS metrics for the chip named in \p pParams (returned in pMetricsList).
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetrics_GetMetrics_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric collection
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsGetMetrics(CUpti_SassMetrics_GetMetrics_Params* pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsGetProperties()
+ */
+typedef struct CUpti_SassMetrics_GetProperties_Params
+{
+    /// [in] should be equal to CUpti_SassMetrics_GetProperties_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] name of the chip for which the metric will be queried
+    const char* pChipName;
+    /// [in] name of the metric to query
+    const char* pMetricName;
+    /// [out] returns the metric ID and the metric description
+    CUpti_SassMetrics_MetricDetails metric;
+} CUpti_SassMetrics_GetProperties_Params;
+#define CUpti_SassMetrics_GetProperties_Params_STRUCT_SIZE        CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetrics_GetProperties_Params, metric)
+
+/**
+ * \brief Get metric properties for the queried metric.
+ * For a given metric the results will be put in a CUpti_SassMetrics_MetricDetails, which
+ * stores the metric ID and the description of the metric.
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetrics_GetProperties_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsGetProperties(CUpti_SassMetrics_GetProperties_Params *pParams);
+
+typedef struct CUpti_SassMetrics_Config
+{
+    /// [in] unique ID for the SASS metric; can be queried using cuptiSassMetricsGetProperties()
+    uint64_t metricId;
+    /// [in] a CUpti_SassMetrics_OutputGranularity value, stored as uint8_t
+    uint8_t outputGranularity;
+} CUpti_SassMetrics_Config;
+
+/**
+ * \brief Params for cuptiSassMetricsSetConfig()
+ */
+typedef struct CUpti_SassMetricsSetConfig_Params
+{
+    /// [in] equal to CUpti_SassMetricsSetConfig_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] number of metric configs; will be equal to the number of metrics queried
+    size_t numOfMetricConfig;
+    /// [in] list of metric configs generated for the given SASS metrics
+    CUpti_SassMetrics_Config* pConfigs;
+    /// [in] device index for which the config will be set; the user can call this once for
+    /// the device on which the SASS metric data will be collected
+    uint32_t deviceIndex;
+} CUpti_SassMetricsSetConfig_Params;
+#define CUpti_SassMetricsSetConfig_Params_STRUCT_SIZE                    CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsSetConfig_Params, deviceIndex)
+
+/**
+ * \brief Set config for the SASS metric data collection for a device.
+ * The user needs to call this API before calling any of the SASS metric data collection APIs.
+ * Each set config API call needs to be followed by a cuptiSassMetricsUnsetConfig() API call
+ * before calling the cuptiSassMetricsSetConfig() API again for the same device.
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetricsSetConfig_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_CONTEXT if any CUDA context has not been created prior to this API call
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this is called multiple times for the device without calling the unset config API
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsSetConfig(CUpti_SassMetricsSetConfig_Params *pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsUnsetConfig()
+ */
+typedef struct CUpti_SassMetricsUnsetConfig_Params
+{
+    /// [in] equal to CUpti_SassMetricsUnsetConfig_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] device index for which the SASS metric data collection config will get reset; the user needs to call this API for
+    /// all the devices on which SASS metric data collection has been configured.
+    uint32_t deviceIndex;
+} CUpti_SassMetricsUnsetConfig_Params;
+#define CUpti_SassMetricsUnsetConfig_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsUnsetConfig_Params, deviceIndex)
+
+/**
+ * \brief Unset config API will reset the SASS metric data collection configuration for the device.
+ * Once this API is called, CUPTI will deallocate all the memory allocated and remove all
+ * the configuration for SASS metric data collection. The user can only call this API for a device where
+ * the cuptiSassMetricsSetConfig() API has been called earlier for the device.
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetricsUnsetConfig_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_CONTEXT if any CUDA context has not been created prior to this API call
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this is called multiple times for the device without calling the set config API
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsUnsetConfig(CUpti_SassMetricsUnsetConfig_Params *pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsEnable()
+ */
+typedef struct CUpti_SassMetricsEnable_Params
+{
+    /// [in] equal to CUpti_SassMetricsEnable_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] CUDA context on which SASS metric data collection will be enabled.
+    /// If set to NULL, the default context will be considered for SASS metric data collection.
+    CUcontext ctx;
+    /// [in] if false, all the functions will be patched regardless of their execution with the cuptiSassMetricsEnable() API call.
+    /// When this parameter is set to true, metric data collection for the function will be done at the very first execution in the enable/disable
+    /// range.
+    uint8_t enableLazyPatching;
+} CUpti_SassMetricsEnable_Params;
+#define CUpti_SassMetricsEnable_Params_STRUCT_SIZE                       CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsEnable_Params, enableLazyPatching)
+
+/**
+ * \brief SASS metric data collection enable API will mark the start of a range, between which kernels
+ * will be profiled for SASS metrics.
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetricsEnable_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ * \retval CUPTI_ERROR_INVALID_CONTEXT if any CUDA context has not been created prior to this API call
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called multiple times for a CUDA context without calling the
+ * cuptiSassMetricsDisable() API, or is called before the cuptiSassMetricsSetConfig() API call.
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsEnable(CUpti_SassMetricsEnable_Params* pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsDisable()
+ */
+typedef struct CUpti_SassMetricsDisable_Params
+{
+    /// [in] equal to CUpti_SassMetricsDisable_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] CUDA context on which SASS metric data collection will be disabled.
+    /// If set to NULL, the default context will be considered for SASS metric data collection.
+    CUcontext ctx;
+    /// [out] Number of dropped SASS records; will be equal to numOfPatchedInstructions * numOfInstances.
+    /// The number of dropped records will be zero when data is flushed prior to calling the disable API.
+    size_t numOfDroppedRecords;
+} CUpti_SassMetricsDisable_Params;
+#define CUpti_SassMetricsDisable_Params_STRUCT_SIZE                      CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsDisable_Params, numOfDroppedRecords)
+
+/**
+ * \brief SASS metric data collection disable API will mark the end of a range; any kernel launched after this
+ * API call will not be profiled for the SASS metrics.
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetricsDisable_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ * \retval CUPTI_ERROR_INVALID_CONTEXT if any CUDA context has not been created prior to this API call
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called multiple times for a CUDA context without calling the
+ * cuptiSassMetricsEnable() API, or is called before the cuptiSassMetricsSetConfig() API call.
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsDisable(CUpti_SassMetricsDisable_Params* pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsGetDataProperties
+ */
+typedef struct CUpti_SassMetricsGetDataProperties_Params
+{
+    /// [in] equal to CUpti_SassMetricsGetDataProperties_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] CUDA context on which SASS metric data collection was enabled.
+    /// If set to NULL, the default context will be considered for SASS metric data collection.
+    CUcontext ctx;
+    /// [out] total number of SASS records that have been collected
+    size_t numOfPatchedInstructionRecords;
+    /// [out] number of instances for each metric value per instruction.
+    /// This will depend on the CUpti_SassMetrics_OutputGranularity level set for the metric config.
+    size_t numOfInstances;
+} CUpti_SassMetricsGetDataProperties_Params;
+
+#define CUpti_SassMetricsGetDataProperties_Params_STRUCT_SIZE           CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsGetDataProperties_Params, numOfInstances)
+/**
+ * \brief The SASS metric data properties API reports the number of instances of a metric value and the
+ * number of SASS instruction records that have been collected. The number of instances of a metric
+ * varies with the output granularity level the user set via a CUpti_SassMetrics_OutputGranularity value.
+ * The user needs to allocate memory for retrieving the SASS data with the cuptiSassMetricsFlushData() API.
+ * 
+ * \param pParams A pointer to \ref CUpti_SassMetricsGetDataProperties_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called outside the enable/disable range.
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsGetDataProperties(CUpti_SassMetricsGetDataProperties_Params* pParams);
+
+typedef struct CUpti_SassMetrics_InstanceValue
+{
+    /// [out] unique id of the metric
+    uint64_t metricId;
+    /// [out] metric value
+    uint64_t value;
+} CUpti_SassMetrics_InstanceValue;
+#define CUpti_SassMetrics_InstanceValue_STRUCT_SIZE                      CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetrics_InstanceValue, value)
+
+typedef struct CUpti_SassMetrics_Data
+{
+    /// [in] equal to CUpti_SassMetricsFlushData_Params_STRUCT_SIZE (NOTE(review): that macro names the flush params struct, not this one - confirm)
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [out] Unique cubin id
+    uint32_t cubinCrc;
+    /// [out] function's unique symbol index in the module.
+    uint32_t functionIndex;
+    /// [out] The function name
+    const char* functionName;
+    /// [out] pc offset for the function in a module
+    uint32_t pcOffset;
+    /// [out] array of size equal to number of instances per metric, which contains the metric ID and metric value.
+    CUpti_SassMetrics_InstanceValue* pInstanceValues;
+} CUpti_SassMetrics_Data;
+
+/**
+ * \brief Params for cuptiSassMetricsFlushData
+ */
+typedef struct CUpti_SassMetricsFlushData_Params
+{
+    /// [in] equal to CUpti_SassMetricsFlushData_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] CUDA context on which SASS metric data collection was enabled.
+    /// If set to NULL, the default context will be considered for SASS metric data collection.
+    CUcontext ctx;
+    /// [in] number of patched instruction records to be retrieved; the user can call cuptiSassMetricsGetDataProperties()
+    /// to get the total number of records available.
+    size_t numOfPatchedInstructionRecords;
+    /// [in] number of patched instruction record instances for a metric; the user can call cuptiSassMetricsGetDataProperties()
+    /// to get the total number of instances for each record per metric available.
+    size_t numOfInstances;
+    /// [out] user-allocated buffer that receives the flushed records. NOTE(review): CUpti_SassMetricsFlushData_Params_STRUCT_SIZE ends at numOfInstances and so excludes this field - confirm this is intended.
+    CUpti_SassMetrics_Data* pMetricsData;
+} CUpti_SassMetricsFlushData_Params;
+#define CUpti_SassMetricsFlushData_Params_STRUCT_SIZE                      CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsFlushData_Params, numOfInstances)
+
+/**
+ * \brief Flush SASS metrics data from CUPTI internal buffer to the user buffer.
+ * User needs to allocate the buffer for retrieving the data. The number of records collected
+ * can be queried using the API cuptiSassMetricsGetDataProperties().
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetricsFlushData_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection.
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called outside the enable/disable range.
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsFlushData(CUpti_SassMetricsFlushData_Params* pParams);
+
+/** @} */ /* END CUPTI_SASS_METRICS_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // _CUPTI_SASS_METRICS_H_
\ No newline at end of file
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_target.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_target.h
new file mode 100644
index 0000000000000000000000000000000000000000..e4b625d45c65288fa2ea7dc05819ee4dfc4cbdd3
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_target.h
@@ -0,0 +1,43 @@
+#if !defined(_CUPTI_TARGET_H_)
+#define _CUPTI_TARGET_H_
+
+/*
+CUPTI profiler target APIs
+This file contains the CUPTI profiling APIs.
+*/
+#include <cupti_result.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+#ifndef CUPTI_PROFILER_STRUCT_SIZE
+#define CUPTI_PROFILER_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif
+
+typedef struct CUpti_Device_GetChipName_Params
+{
+    size_t structSize;                                      //!< [in] presumably CUpti_Device_GetChipName_Params_STRUCT_SIZE - confirm
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    size_t deviceIndex;                                     //!< [in] index of the device to query (TODO confirm: CUDA device ordinal?)
+    const char* pChipName;                                  //!< [out] chip name string for that device (ownership/lifetime not shown here)
+} CUpti_Device_GetChipName_Params;
+
+#define CUpti_Device_GetChipName_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Device_GetChipName_Params, pChipName)
+CUptiResult CUPTIAPI cuptiDeviceGetChipName(CUpti_Device_GetChipName_Params *pParams);
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_version.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_version.h
new file mode 100644
index 0000000000000000000000000000000000000000..9a8808ea022b4116a1177e6f78d34d0f39604344
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_version.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright 2010-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_VERSION_H_)
+#define _CUPTI_VERSION_H_
+
+#include <cuda_stdint.h>
+#include <cupti_result.h>
+
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_VERSION_API CUPTI Version
+ * Function and macro to determine the CUPTI version.
+ * @{
+ */
+
+/**
+ * \brief The API version for this implementation of CUPTI.
+ *
+ * The API version for this implementation of CUPTI. This define along
+ * with \ref cuptiGetVersion can be used to dynamically detect if the
+ * version of CUPTI compiled against matches the version of the loaded
+ * CUPTI library.
+ *
+ * v1 : CUDAToolsSDK 4.0
+ * v2 : CUDAToolsSDK 4.1
+ * v3 : CUDA Toolkit 5.0
+ * v4 : CUDA Toolkit 5.5
+ * v5 : CUDA Toolkit 6.0
+ * v6 : CUDA Toolkit 6.5
+ * v7 : CUDA Toolkit 6.5(with sm_52 support)
+ * v8 : CUDA Toolkit 7.0
+ * v9 : CUDA Toolkit 8.0
+ * v10 : CUDA Toolkit 9.0
+ * v11 : CUDA Toolkit 9.1
+ * v12 : CUDA Toolkit 10.0, 10.1 and 10.2
+ * v13 : CUDA Toolkit 11.0
+ * v14 : CUDA Toolkit 11.1
+ * v15 : CUDA Toolkit 11.2, 11.3 and 11.4
+ * v16 : CUDA Toolkit 11.5
+ * v17 : CUDA Toolkit 11.6
+ * v18 : CUDA Toolkit 11.8
+ * v19 : CUDA Toolkit 12.0
+ * v20 : CUDA Toolkit 12.2
+ * v21 : CUDA Toolkit 12.3
+ * v22 : CUDA Toolkit 12.4
+ * v23 : CUDA Toolkit 12.5
+ * v24 : CUDA Toolkit 12.6
+ * v26 : CUDA Toolkit 12.8
+ */
+#define CUPTI_API_VERSION 26
+
+/**
+ * \brief Get the CUPTI API version.
+ *
+ * Return the API version in \p *version.
+ *
+ * \param version Returns the version
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p version is NULL
+ * \sa CUPTI_API_VERSION
+ */
+CUptiResult CUPTIAPI cuptiGetVersion(uint32_t *version);
+
+/** @} */ /* END CUPTI_VERSION_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_VERSION_H_*/
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/device_atomic_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/device_atomic_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..3fa21ad8c1caef27fe00c315759f9379c247302c
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/device_atomic_functions.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DEVICE_ATOMIC_FUNCTIONS_H__)
+#define __DEVICE_ATOMIC_FUNCTIONS_H__
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+#if defined(__CUDACC_RTC__)
+#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ __device__
+#elif defined(_NVHPC_CUDA)
+# define __DEVICE_ATOMIC_FUNCTIONS_DECL__ extern __device__ __cudart_builtin__
+#else /* __CUDACC_RTC__ */
+#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/* Add !defined(_NVHPC_CUDA) to avoid empty function definition in PGI CUDA
+ * C++ compiler where the macro __CUDA_ARCH__ is not defined. */
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* !__CUDA_ARCH__ */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ */
+
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAdd(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAdd(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicSub(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicExch(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicExch(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ float atomicExch(float *address, float val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMin(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMin(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMax(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMax(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicInc(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicDec(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAnd(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAnd(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicOr(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicOr(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicXor(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicXor(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicCAS(int *address, int compare, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val) __DEF_IF_HOST
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#if defined(_WIN32)
+# define __DEPRECATED__(msg) __declspec(deprecated(msg))
+#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
+# define __DEPRECATED__(msg) __attribute__((deprecated))
+#else
+# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
+#endif
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+#define __WSB_DEPRECATION_MESSAGE(x) #x"() is not valid on compute_70 and above, and should be replaced with "#x"_sync()."\
+    "To continue using "#x"(), specify virtual architecture compute_60 when targeting sm_70 and above, for example, using the pair of compiler options: -arch=compute_60 -code=sm_70."
+#elif defined(_NVHPC_CUDA)
+#define __WSB_DEPRECATION_MESSAGE(x) #x"() is not valid on cc70 and above, and should be replaced with "#x"_sync()."
+#else
+#define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
+#endif
+
+extern "C"
+{
+extern __device__ __device_builtin__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__any)) int __any(int cond);
+extern __device__ __device_builtin__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__all)) int __all(int cond);
+}
+
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__any)) bool any(bool cond) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__all)) bool all(bool cond) __DEF_IF_HOST
+
+#undef __DEPRECATED__
+#undef __WSB_DEPRECATION_MESSAGE
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __DEVICE_ATOMIC_FUNCTIONS_DECL__
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "device_atomic_functions.hpp"
+#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
+
+#undef EXCLUDE_FROM_RTC
+
+#endif /* !__DEVICE_ATOMIC_FUNCTIONS_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/device_atomic_functions.hpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/device_atomic_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..153ac712aab4288e4c16dd229460b677e7b61152
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/device_atomic_functions.hpp
@@ -0,0 +1,254 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DEVICE_ATOMIC_FUNCTIONS_HPP__)
+#define __DEVICE_ATOMIC_FUNCTIONS_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ __device__
+#else /* __CUDACC_RTC__ */
+#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+
+extern "C"
+{
+extern __device__ __device_builtin__ int          __iAtomicAdd(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicAdd(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicExch(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicExch(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ float        __fAtomicExch(float *address, float val);
+extern __device__ __device_builtin__ int          __iAtomicMin(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicMin(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicMax(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicMax(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicInc(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicDec(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicAnd(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicAnd(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicOr(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicOr(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicXor(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicXor(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicCAS(int *address, int compare, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicCAS(unsigned int *address, unsigned int compare, unsigned int val);
+
+
+extern __device__ __device_builtin__ unsigned long long int __ullAtomicAdd(unsigned long long int *address, unsigned long long int val);
+extern __device__ __device_builtin__ unsigned long long int __ullAtomicExch(unsigned long long int *address, unsigned long long int val);
+extern __device__ __device_builtin__ unsigned long long int __ullAtomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val);
+}
+
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAdd(int *address, int val)
+{
+  return __iAtomicAdd(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAdd(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicSub(int *address, int val)
+{
+  return __iAtomicAdd(address, (unsigned int)-(int)val); // subtract by atomically adding the negated value; the unsigned result converts back to the builtin's int parameter
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd(address, (unsigned int)-(int)val); // subtract by atomically adding the negated value, converted to unsigned
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicExch(int *address, int val)
+{
+  return __iAtomicExch(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicExch(unsigned int *address, unsigned int val)
+{
+  return __uAtomicExch(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ float atomicExch(float *address, float val)
+{
+  return __fAtomicExch(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMin(int *address, int val)
+{
+  return __iAtomicMin(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMin(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMin(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMax(int *address, int val)
+{
+  return __iAtomicMax(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMax(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMax(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicInc(unsigned int *address, unsigned int val)
+{
+  return __uAtomicInc(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicDec(unsigned int *address, unsigned int val)
+{
+  return __uAtomicDec(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAnd(int *address, int val)
+{
+  return __iAtomicAnd(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAnd(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAnd(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicOr(int *address, int val)
+{
+  return __iAtomicOr(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicOr(unsigned int *address, unsigned int val)
+{
+  return __uAtomicOr(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicXor(int *address, int val)
+{
+  return __iAtomicXor(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicXor(unsigned int *address, unsigned int val)
+{
+  return __uAtomicXor(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicCAS(int *address, int compare, int val)
+{
+  return __iAtomicCAS(address, compare, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val)
+{
+  return __uAtomicCAS(address, compare, val);
+}
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val)
+{
+  return __ullAtomicAdd(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val)
+{
+  return __ullAtomicExch(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val)
+{
+  return __ullAtomicCAS(address, compare, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool any(bool cond)
+{
+  return (bool)__any((int)cond); // bool wrapper: passes cond to __any as int and converts its int result back to bool
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool all(bool cond)
+{
+  return (bool)__all((int)cond); // bool wrapper: passes cond to __all as int and converts its int result back to bool
+}
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEVICE_ATOMIC_FUNCTIONS_DECL__
+
+#endif /* !__DEVICE_ATOMIC_FUNCTIONS_HPP__ */
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/device_double_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/device_double_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..82b25e59b40aeaf1e475ff3179e49640a44918b8
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/device_double_functions.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("device_double_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "device_double_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__
+#endif
+
+#include "crt/device_double_functions.h"
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/device_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/device_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..0094cc9a0a57f53f47421a8ecc400fb84c26babe
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/device_functions.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("device_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "device_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__
+#endif
+
+#include "crt/device_functions.h"
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/device_launch_parameters.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/device_launch_parameters.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f552db8faab7d21e90e06a1ea2184a5563d3bf2
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/device_launch_parameters.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DEVICE_LAUNCH_PARAMETERS_H__)
+#define __DEVICE_LAUNCH_PARAMETERS_H__
+
+#include "vector_types.h"
+
+#if !defined(__STORAGE__)
+
+#if defined(__CUDACC_RTC__)
+#define __STORAGE__ \
+        extern const __device__
+#else /* !__CUDACC_RTC__ */
+#define __STORAGE__ \
+        extern const
+#endif /* __CUDACC_RTC__ */
+
+#endif /* __STORAGE__ */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+uint3 __device_builtin__ __STORAGE__ threadIdx; /* unless the includer pre-defined it, __STORAGE__ expands to `extern const` (plus `__device__` under __CUDACC_RTC__), so these five lines are declarations only */
+uint3 __device_builtin__ __STORAGE__ blockIdx;  /* uint3, same as threadIdx */
+dim3 __device_builtin__ __STORAGE__ blockDim;   /* dim3 rather than uint3 */
+dim3 __device_builtin__ __STORAGE__ gridDim;    /* dim3, same as blockDim */
+int __device_builtin__ __STORAGE__ warpSize;    /* plain int */
+
+#undef __STORAGE__
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#if !defined(__cudaGet_threadIdx)
+
+#define __cudaGet_threadIdx() \
+        threadIdx
+
+#endif /* __cudaGet_threadIdx */
+
+#if !defined(__cudaGet_blockIdx)
+
+#define __cudaGet_blockIdx() \
+        blockIdx
+
+#endif /* __cudaGet_blockIdx */
+
+#if !defined(__cudaGet_blockDim)
+
+#define __cudaGet_blockDim() \
+        blockDim
+
+#endif /* __cudaGet_blockDim */
+
+#if !defined(__cudaGet_gridDim)
+
+#define __cudaGet_gridDim() \
+        gridDim
+
+#endif /* __cudaGet_gridDim */
+
+#if !defined(__cudaGet_warpSize)
+
+#define __cudaGet_warpSize() \
+        warpSize
+
+#endif /* __cudaGet_warpSize */
+
+#endif /* !__DEVICE_LAUNCH_PARAMETERS_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/device_types.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/device_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b575a1014c6cdb9bf2f722c2a67e329186079e6
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/device_types.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DEVICE_TYPES_H__)
+#define __DEVICE_TYPES_H__
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__
+#endif
+
+#ifndef __DOXYGEN_ONLY__
+#include "crt/host_defines.h"
+#endif
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+enum __device_builtin__ cudaRoundMode /* no explicit initializers, so the enumerators take the implicit values 0..3 in declaration order */
+{
+    cudaRoundNearest, /* = 0; presumably round-to-nearest - confirm against CUDA docs */
+    cudaRoundZero,    /* = 1; presumably round-toward-zero - confirm */
+    cudaRoundPosInf,  /* = 2; presumably round-toward-positive-infinity - confirm */
+    cudaRoundMinInf   /* = 3; presumably round-toward-negative-infinity - confirm */
+};
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__
+#endif
+
+#endif /* !__DEVICE_TYPES_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/driver_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/driver_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..94767974220594550d496cad4d14c45349b27737
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/driver_functions.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DRIVER_FUNCTIONS_H__)
+#define __DRIVER_FUNCTIONS_H__
+
+#include "builtin_types.h"
+#include "crt/host_defines.h"
+#include "driver_types.h"
+
+/**
+ * \addtogroup CUDART_MEMORY
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns a cudaPitchedPtr based on input parameters
+ *
+ * Returns a ::cudaPitchedPtr based on the specified input parameters \p d,
+ * \p p, \p xsz, and \p ysz.
+ *
+ * \param d   - Pointer to allocated memory
+ * \param p   - Pitch of allocated memory in bytes
+ * \param xsz - Logical width of allocation in elements
+ * \param ysz - Logical height of allocation in elements
+ *
+ * \return
+ * ::cudaPitchedPtr specified by \p d, \p p, \p xsz, and \p ysz
+ *
+ * \sa make_cudaExtent, make_cudaPos
+ */
+static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz) 
+{
+  struct cudaPitchedPtr s; /* built on the stack and returned by value; only the four fields below are assigned */
+
+  s.ptr   = d;   /* stored as-is; the pointer is neither dereferenced nor validated here */
+  s.pitch = p;   /* pitch of the allocation, in bytes */
+  s.xsize = xsz; /* logical width, in elements */
+  s.ysize = ysz; /* logical height, in elements */
+
+  return s;
+}
+
+/**
+ * \brief Returns a cudaPos based on input parameters
+ *
+ * Returns a ::cudaPos based on the specified input parameters \p x,
+ * \p y, and \p z.
+ *
+ * \param x - X position
+ * \param y - Y position
+ * \param z - Z position
+ *
+ * \return
+ * ::cudaPos specified by \p x, \p y, and \p z
+ *
+ * \sa make_cudaExtent, make_cudaPitchedPtr
+ */
+static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z) 
+{
+  struct cudaPos p; /* built on the stack and returned by value; only x, y and z are assigned */
+
+  p.x = x; /* X position, copied without range checking */
+  p.y = y; /* Y position, copied without range checking */
+  p.z = z; /* Z position, copied without range checking */
+
+  return p;
+}
+
+/**
+ * \brief Returns a cudaExtent based on input parameters
+ *
+ * Returns a ::cudaExtent based on the specified input parameters \p w,
+ * \p h, and \p d.
+ *
+ * \param w - Width in elements when referring to array memory, in bytes when referring to linear memory
+ * \param h - Height in elements
+ * \param d - Depth in elements
+ *
+ * \return
+ * ::cudaExtent specified by \p w, \p h, and \p d
+ *
+ * \sa make_cudaPitchedPtr, make_cudaPos
+ */
+static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d) 
+{
+  struct cudaExtent e; /* built on the stack and returned by value; only width, height and depth are assigned */
+
+  e.width  = w; /* elements for array memory, bytes for linear memory; the unit is not checked here */
+  e.height = h; /* height, in elements */
+  e.depth  = d; /* depth, in elements */
+
+  return e;
+}
+
+/** @} */ /* END CUDART_MEMORY */
+
+#endif /* !__DRIVER_FUNCTIONS_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/driver_types.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/driver_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..541cbc6eb76dfdf42c407ca7c9537b603714d64c
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/driver_types.h
@@ -0,0 +1,4110 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DRIVER_TYPES_H__)
+#define __DRIVER_TYPES_H__
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DRIVER_TYPES_H__
+#endif
+
+#ifndef __DOXYGEN_ONLY__
+#include "crt/host_defines.h"
+#endif
+#include "vector_types.h"
+
+
+
+#ifndef __CUDACC_RTC_MINIMAL__
+/**
+ * \defgroup CUDART_TYPES Data types used by CUDA Runtime
+ * \ingroup CUDART
+ *
+ * @{
+ */
+
+/*******************************************************************************
+*                                                                              *
+*  TYPE DEFINITIONS USED BY RUNTIME API                                        *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined(__CUDA_INTERNAL_COMPILATION__)
+
+
+#if !defined(__CUDACC_RTC__)
+#include <limits.h>
+#include <stddef.h>
+#endif /* !defined(__CUDACC_RTC__) */
+
+#define cudaHostAllocDefault                0x00  /**< Default page-locked allocation flag */
+#define cudaHostAllocPortable               0x01  /**< Pinned memory accessible by all CUDA contexts */
+#define cudaHostAllocMapped                 0x02  /**< Map allocation into device space */
+#define cudaHostAllocWriteCombined          0x04  /**< Write-combined memory */
+
+#define cudaHostRegisterDefault             0x00  /**< Default host memory registration flag */
+#define cudaHostRegisterPortable            0x01  /**< Pinned memory accessible by all CUDA contexts */
+#define cudaHostRegisterMapped              0x02  /**< Map registered memory into device space */
+#define cudaHostRegisterIoMemory            0x04  /**< Memory-mapped I/O space */
+#define cudaHostRegisterReadOnly            0x08  /**< Memory-mapped read-only */
+
+#define cudaPeerAccessDefault               0x00  /**< Default peer addressing enable flag */
+
+#define cudaStreamDefault                   0x00  /**< Default stream flag */
+#define cudaStreamNonBlocking               0x01  /**< Stream does not synchronize with stream 0 (the NULL stream) */
+
+ /**
+ * Legacy stream handle
+ *
+ * Stream handle that can be passed as a cudaStream_t to use an implicit stream
+ * with legacy synchronization behavior.
+ *
+ * See details of the \link_sync_behavior
+ */
+#define cudaStreamLegacy                    ((cudaStream_t)0x1)
+
+/**
+ * Per-thread stream handle
+ *
+ * Stream handle that can be passed as a cudaStream_t to use an implicit stream
+ * with per-thread synchronization behavior.
+ *
+ * See details of the \link_sync_behavior
+ */
+#define cudaStreamPerThread                 ((cudaStream_t)0x2)
+
+#define cudaEventDefault                    0x00  /**< Default event flag */
+#define cudaEventBlockingSync               0x01  /**< Event uses blocking synchronization */
+#define cudaEventDisableTiming              0x02  /**< Event will not record timing data */
+#define cudaEventInterprocess               0x04  /**< Event is suitable for interprocess use. cudaEventDisableTiming must be set */
+
+#define cudaEventRecordDefault              0x00  /**< Default event record flag */
+#define cudaEventRecordExternal             0x01  /**< Event is captured in the graph as an external event node when performing stream capture */
+
+#define cudaEventWaitDefault                0x00  /**< Default event wait flag */
+#define cudaEventWaitExternal               0x01  /**< Event is captured in the graph as an external event node when performing stream capture */
+
+#define cudaDeviceScheduleAuto              0x00  /**< Device flag - Automatic scheduling */
+#define cudaDeviceScheduleSpin              0x01  /**< Device flag - Spin default scheduling */
+#define cudaDeviceScheduleYield             0x02  /**< Device flag - Yield default scheduling */
+#define cudaDeviceScheduleBlockingSync      0x04  /**< Device flag - Use blocking synchronization */
+#define cudaDeviceBlockingSync              0x04  /**< Device flag - Use blocking synchronization 
+                                                    *  \deprecated This flag was deprecated as of CUDA 4.0 and
+                                                    *  replaced with ::cudaDeviceScheduleBlockingSync. */
+#define cudaDeviceScheduleMask              0x07  /**< Device schedule flags mask */
+#define cudaDeviceMapHost                   0x08  /**< Device flag - Support mapped pinned allocations */
+#define cudaDeviceLmemResizeToMax           0x10  /**< Device flag - Keep local memory allocation after launch */
+#define cudaDeviceSyncMemops                0x80  /**< Device flag - Ensure synchronous memory operations on this context will synchronize */
+#define cudaDeviceMask                      0xff  /**< Device flags mask */
+
+#define cudaArrayDefault                    0x00  /**< Default CUDA array allocation flag */
+#define cudaArrayLayered                    0x01  /**< Must be set in cudaMalloc3DArray to create a layered CUDA array */
+#define cudaArraySurfaceLoadStore           0x02  /**< Must be set in cudaMallocArray or cudaMalloc3DArray in order to bind surfaces to the CUDA array */
+#define cudaArrayCubemap                    0x04  /**< Must be set in cudaMalloc3DArray to create a cubemap CUDA array */
+#define cudaArrayTextureGather              0x08  /**< Must be set in cudaMallocArray or cudaMalloc3DArray in order to perform texture gather operations on the CUDA array */
+#define cudaArrayColorAttachment            0x20  /**< Must be set in cudaExternalMemoryGetMappedMipmappedArray if the mipmapped array is used as a color target in a graphics API */
+#define cudaArraySparse                     0x40  /**< Must be set in cudaMallocArray, cudaMalloc3DArray or cudaMallocMipmappedArray in order to create a sparse CUDA array or CUDA mipmapped array */
+#define cudaArrayDeferredMapping            0x80  /**< Must be set in cudaMallocArray, cudaMalloc3DArray or cudaMallocMipmappedArray in order to create a deferred mapping CUDA array or CUDA mipmapped array */
+
+#define cudaIpcMemLazyEnablePeerAccess      0x01  /**< Automatically enable peer access between remote devices as needed */
+
+#define cudaMemAttachGlobal                 0x01  /**< Memory can be accessed by any stream on any device*/
+#define cudaMemAttachHost                   0x02  /**< Memory cannot be accessed by any stream on any device */
+#define cudaMemAttachSingle                 0x04  /**< Memory can only be accessed by a single stream on the associated device */
+
+#define cudaOccupancyDefault                0x00  /**< Default behavior */
+#define cudaOccupancyDisableCachingOverride 0x01  /**< Assume global caching is enabled and cannot be automatically turned off */
+
+#define cudaCpuDeviceId                     ((int)-1) /**< Device id that represents the CPU */
+#define cudaInvalidDeviceId                 ((int)-2) /**< Device id that represents an invalid device */
+#define cudaInitDeviceFlagsAreValid         0x01  /**< Tell the CUDA runtime that DeviceFlags is being set in cudaInitDevice call */
+/**
+ * If set, each kernel launched as part of ::cudaLaunchCooperativeKernelMultiDevice only
+ * waits for prior work in the stream corresponding to that GPU to complete before the
+ * kernel begins execution.
+ */
+#define cudaCooperativeLaunchMultiDeviceNoPreSync  0x01
+
+/**
+ * If set, any subsequent work pushed in a stream that participated in a call to
+ * ::cudaLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on
+ * the GPU corresponding to that stream to complete before it begins execution.
+ */
+#define cudaCooperativeLaunchMultiDeviceNoPostSync 0x02
+
+#endif /* !__CUDA_INTERNAL_COMPILATION__ */
+
+/** \cond impl_private */
+#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED /* expands to nothing: no deprecation attribute under Doxygen or when CUDA_ENABLE_DEPRECATED is defined */
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated) /* MSVC spelling of the deprecation attribute */
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated)) /* GNU-style spelling of the deprecation attribute */
+#else
+#define __CUDA_DEPRECATED /* compiler not recognized above: expands to nothing */
+#endif
+/** \endcond impl_private */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+/**
+ * CUDA error types
+ */
+enum __device_builtin__ cudaError
+{
+    /**
+     * The API call returned with no errors. In the case of query calls, this
+     * also means that the operation being queried is complete (see
+     * ::cudaEventQuery() and ::cudaStreamQuery()).
+     */
+    cudaSuccess                           =      0,
+  
+    /**
+     * This indicates that one or more of the parameters passed to the API call
+     * is not within an acceptable range of values.
+     */
+    cudaErrorInvalidValue                 =     1,
+  
+    /**
+     * The API call failed because it was unable to allocate enough memory or
+     * other resources to perform the requested operation.
+     */
+    cudaErrorMemoryAllocation             =      2,
+  
+    /**
+     * The API call failed because the CUDA driver and runtime could not be
+     * initialized.
+     */
+    cudaErrorInitializationError          =      3,
+  
+    /**
+     * This indicates that a CUDA Runtime API call cannot be executed because
+     * it is being called during process shut down, at a point in time after
+     * CUDA driver has been unloaded.
+     */
+    cudaErrorCudartUnloading              =     4,
+
+    /**
+     * This indicates profiler is not initialized for this run. This can
+     * happen when the application is running with external profiling tools
+     * like visual profiler.
+     */
+    cudaErrorProfilerDisabled             =     5,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to attempt to enable/disable the profiling via ::cudaProfilerStart or
+     * ::cudaProfilerStop without initialization.
+     */
+    cudaErrorProfilerNotInitialized       =     6,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to call cudaProfilerStart() when profiling is already enabled.
+     */
+    cudaErrorProfilerAlreadyStarted       =     7,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to call cudaProfilerStop() when profiling is already disabled.
+     */
+     cudaErrorProfilerAlreadyStopped       =    8,
+    /**
+     * This indicates that a kernel launch is requesting resources that can
+     * never be satisfied by the current device. Requesting more shared memory
+     * per block than the device supports will trigger this error, as will
+     * requesting too many threads or blocks. See ::cudaDeviceProp for more
+     * device limitations.
+     */
+    cudaErrorInvalidConfiguration         =      9,
+  
+    /**
+     * This indicates that one or more of the pitch-related parameters passed
+     * to the API call is not within the acceptable range for pitch.
+     */
+    cudaErrorInvalidPitchValue            =     12,
+  
+    /**
+     * This indicates that the symbol name/identifier passed to the API call
+     * is not a valid name or identifier.
+     */
+    cudaErrorInvalidSymbol                =     13,
+
+    /**
+     * This indicates that at least one host pointer passed to the API call is
+     * not a valid host pointer.
+     * \deprecated
+     * This error return is deprecated as of CUDA 10.1.
+     */
+    cudaErrorInvalidHostPointer           =     16,
+  
+    /**
+     * This indicates that at least one device pointer passed to the API call is
+     * not a valid device pointer.
+     * \deprecated
+     * This error return is deprecated as of CUDA 10.1.
+     */
+    cudaErrorInvalidDevicePointer         =     17,
+    /**
+     * This indicates that the texture passed to the API call is not a valid
+     * texture.
+     */
+    cudaErrorInvalidTexture               =     18,
+  
+    /**
+     * This indicates that the texture binding is not valid. This occurs if you
+     * call ::cudaGetTextureAlignmentOffset() with an unbound texture.
+     */
+    cudaErrorInvalidTextureBinding        =     19,
+  
+    /**
+     * This indicates that the channel descriptor passed to the API call is not
+     * valid. This occurs if the format is not one of the formats specified by
+     * ::cudaChannelFormatKind, or if one of the dimensions is invalid.
+     */
+    cudaErrorInvalidChannelDescriptor     =     20,
+  
+    /**
+     * This indicates that the direction of the memcpy passed to the API call is
+     * not one of the types specified by ::cudaMemcpyKind.
+     */
+    cudaErrorInvalidMemcpyDirection       =     21,
+
+    /**
+     * This indicated that the user has taken the address of a constant variable,
+     * which was forbidden up until the CUDA 3.1 release.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.1. Variables in constant
+     * memory may now have their address taken by the runtime via
+     * ::cudaGetSymbolAddress().
+     */
+    cudaErrorAddressOfConstant            =     22,
+  
+    /**
+     * This indicated that a texture fetch was not able to be performed.
+     * This was previously used for device emulation of texture operations.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.1. Device emulation mode was
+     * removed with the CUDA 3.1 release.
+     */
+    cudaErrorTextureFetchFailed           =     23,
+  
+    /**
+     * This indicated that a texture was not bound for access.
+     * This was previously used for device emulation of texture operations.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.1. Device emulation mode was
+     * removed with the CUDA 3.1 release.
+     */
+    cudaErrorTextureNotBound              =     24,
+  
+    /**
+     * This indicated that a synchronization operation had failed.
+     * This was previously used for some device emulation functions.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.1. Device emulation mode was
+     * removed with the CUDA 3.1 release.
+     */
+    cudaErrorSynchronizationError         =     25,
+    /**
+     * This indicates that a non-float texture was being accessed with linear
+     * filtering. This is not supported by CUDA.
+     */
+    cudaErrorInvalidFilterSetting         =     26,
+  
+    /**
+     * This indicates that an attempt was made to read an unsupported data type as a
+     * normalized float. This is not supported by CUDA.
+     */
+    cudaErrorInvalidNormSetting           =     27,
+
+    /**
+     * Mixing of device and device emulation code was not allowed.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.1. Device emulation mode was
+     * removed with the CUDA 3.1 release.
+     */
+    cudaErrorMixedDeviceExecution         =     28,
+
+    /**
+     * This indicates that the API call is not yet implemented. Production
+     * releases of CUDA will never return this error.
+     * \deprecated
+     * This error return is deprecated as of CUDA 4.1.
+     */
+    cudaErrorNotYetImplemented            =     31,
+  
+    /**
+     * This indicated that an emulated device pointer exceeded the 32-bit address
+     * range.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.1. Device emulation mode was
+     * removed with the CUDA 3.1 release.
+     */
+    cudaErrorMemoryValueTooLarge          =     32,
+    /**
+     * This indicates that the CUDA driver that the application has loaded is a
+     * stub library. Applications that run with the stub rather than a real
+     * driver loaded will result in CUDA API returning this error.
+     */
+    cudaErrorStubLibrary                  =     34,
+
+    /**
+     * This indicates that the installed NVIDIA CUDA driver is older than the
+     * CUDA runtime library. This is not a supported configuration. Users should
+     * install an updated NVIDIA display driver to allow the application to run.
+     */
+    cudaErrorInsufficientDriver           =     35,
+
+    /**
+     * This indicates that the API call requires a newer CUDA driver than the one
+     * currently installed. Users should install an updated NVIDIA CUDA driver
+     * to allow the API call to succeed.
+     */
+    cudaErrorCallRequiresNewerDriver      =     36,
+  
+    /**
+     * This indicates that the surface passed to the API call is not a valid
+     * surface.
+     */
+    cudaErrorInvalidSurface               =     37,
+  
+    /**
+     * This indicates that multiple global or constant variables (across separate
+     * CUDA source files in the application) share the same string name.
+     */
+    cudaErrorDuplicateVariableName        =     43,
+  
+    /**
+     * This indicates that multiple textures (across separate CUDA source
+     * files in the application) share the same string name.
+     */
+    cudaErrorDuplicateTextureName         =     44,
+  
+    /**
+     * This indicates that multiple surfaces (across separate CUDA source
+     * files in the application) share the same string name.
+     */
+    cudaErrorDuplicateSurfaceName         =     45,
+  
+    /**
+     * This indicates that all CUDA devices are busy or unavailable at the current
+     * time. Devices are often busy/unavailable due to use of
+     * ::cudaComputeModeProhibited, ::cudaComputeModeExclusiveProcess, or when long
+     * running CUDA kernels have filled up the GPU and are blocking new work
+     * from starting. They can also be unavailable due to memory constraints
+     * on a device that already has active CUDA work being performed.
+     */
+    cudaErrorDevicesUnavailable           =     46,
+  
+    /**
+     * This indicates that the current context is not compatible with
+     * the CUDA Runtime. This can only occur if you are using CUDA
+     * Runtime/Driver interoperability and have created an existing Driver
+     * context using the driver API. The Driver context may be incompatible
+     * either because the Driver context was created using an older version 
+     * of the API, because the Runtime API call expects a primary driver 
+     * context and the Driver context is not primary, or because the Driver 
+     * context has been destroyed. Please see \ref CUDART_DRIVER "Interactions 
+     * with the CUDA Driver API" for more information.
+     */
+    cudaErrorIncompatibleDriverContext    =     49,
+    
+    /**
+     * The device function being invoked (usually via ::cudaLaunchKernel()) was not
+     * previously configured via the ::cudaConfigureCall() function.
+     */
+    cudaErrorMissingConfiguration         =      52,
+
+    /**
+     * This indicated that a previous kernel launch failed. This was previously
+     * used for device emulation of kernel launches.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.1. Device emulation mode was
+     * removed with the CUDA 3.1 release.
+     */
+    cudaErrorPriorLaunchFailure           =      53,
+    /**
+     * This error indicates that a device runtime grid launch did not occur 
+     * because the depth of the child grid would exceed the maximum supported
+     * number of nested grid launches. 
+     */
+    cudaErrorLaunchMaxDepthExceeded       =     65,
+
+    /**
+     * This error indicates that a grid launch did not occur because the kernel 
+     * uses file-scoped textures which are unsupported by the device runtime. 
+     * Kernels launched via the device runtime only support textures created with 
+     * the Texture Object API's.
+     */
+    cudaErrorLaunchFileScopedTex          =     66,
+
+    /**
+     * This error indicates that a grid launch did not occur because the kernel 
+     * uses file-scoped surfaces which are unsupported by the device runtime.
+     * Kernels launched via the device runtime only support surfaces created with
+     * the Surface Object API's.
+     */
+    cudaErrorLaunchFileScopedSurf         =     67,
+
+    /**
+     * This error indicates that a call to ::cudaDeviceSynchronize made from
+     * the device runtime failed because the call was made at grid depth greater
+     * than either the default (2 levels of grids) or user specified device
+     * limit ::cudaLimitDevRuntimeSyncDepth. To be able to synchronize on
+     * launched grids at a greater depth successfully, the maximum nested
+     * depth at which ::cudaDeviceSynchronize will be called must be specified
+     * with the ::cudaLimitDevRuntimeSyncDepth limit to the ::cudaDeviceSetLimit
+     * api before the host-side launch of a kernel using the device runtime.
+     * Keep in mind that additional levels of sync depth require the runtime
+     * to reserve large amounts of device memory that cannot be used for
+     * user allocations. Note that ::cudaDeviceSynchronize made from device
+     * runtime is only supported on devices of compute capability < 9.0.
+     */
+    cudaErrorSyncDepthExceeded            =     68,
+
+    /**
+     * This error indicates that a device runtime grid launch failed because
+     * the launch would exceed the limit ::cudaLimitDevRuntimePendingLaunchCount.
+     * For this launch to proceed successfully, ::cudaDeviceSetLimit must be
+     * called to set the ::cudaLimitDevRuntimePendingLaunchCount to be higher 
+     * than the upper bound of outstanding launches that can be issued to the
+     * device runtime. Keep in mind that raising the limit of pending device
+     * runtime launches will require the runtime to reserve device memory that
+     * cannot be used for user allocations.
+     */
+    cudaErrorLaunchPendingCountExceeded   =     69,
+  
+    /**
+     * The requested device function does not exist or is not compiled for the
+     * proper device architecture.
+     */
+    cudaErrorInvalidDeviceFunction        =      98,
+  
+    /**
+     * This indicates that no CUDA-capable devices were detected by the installed
+     * CUDA driver.
+     */
+    cudaErrorNoDevice                     =     100,
+  
+    /**
+     * This indicates that the device ordinal supplied by the user does not
+     * correspond to a valid CUDA device or that the action requested is
+     * invalid for the specified device.
+     */
+    cudaErrorInvalidDevice                =     101,
+
+    /**
+     * This indicates that the device doesn't have a valid Grid License.
+     */
+    cudaErrorDeviceNotLicensed            =     102,
+
+   /**
+    * By default, the CUDA runtime may perform a minimal set of self-tests,
+    * as well as CUDA driver tests, to establish the validity of both.
+    * Introduced in CUDA 11.2, this error return indicates that at least one
+    * of these tests has failed and the validity of either the runtime
+    * or the driver could not be established.
+    */
+   cudaErrorSoftwareValidityNotEstablished  =     103,
+
+    /**
+     * This indicates an internal startup failure in the CUDA runtime.
+     */
+    cudaErrorStartupFailure               =    127,
+  
+    /**
+     * This indicates that the device kernel image is invalid.
+     */
+    cudaErrorInvalidKernelImage           =     200,
+
+    /**
+     * This most frequently indicates that there is no context bound to the
+     * current thread. This can also be returned if the context passed to an
+     * API call is not a valid handle (such as a context that has had
+     * ::cuCtxDestroy() invoked on it). This can also be returned if a user
+     * mixes different API versions (i.e. 3010 context with 3020 API calls).
+     * See ::cuCtxGetApiVersion() for more details.
+     */
+    cudaErrorDeviceUninitialized          =     201,
+
+    /**
+     * This indicates that the buffer object could not be mapped.
+     */
+    cudaErrorMapBufferObjectFailed        =     205,
+  
+    /**
+     * This indicates that the buffer object could not be unmapped.
+     */
+    cudaErrorUnmapBufferObjectFailed      =     206,
+
+    /**
+     * This indicates that the specified array is currently mapped and thus
+     * cannot be destroyed.
+     */
+    cudaErrorArrayIsMapped                =     207,
+
+    /**
+     * This indicates that the resource is already mapped.
+     */
+    cudaErrorAlreadyMapped                =     208,
+  
+    /**
+     * This indicates that there is no kernel image available that is suitable
+     * for the device. This can occur when a user specifies code generation
+     * options for a particular CUDA source file that do not include the
+     * corresponding device configuration.
+     */
+    cudaErrorNoKernelImageForDevice       =     209,
+
+    /**
+     * This indicates that a resource has already been acquired.
+     */
+    cudaErrorAlreadyAcquired              =     210,
+
+    /**
+     * This indicates that a resource is not mapped.
+     */
+    cudaErrorNotMapped                    =     211,
+
+    /**
+     * This indicates that a mapped resource is not available for access as an
+     * array.
+     */
+    cudaErrorNotMappedAsArray             =     212,
+
+    /**
+     * This indicates that a mapped resource is not available for access as a
+     * pointer.
+     */
+    cudaErrorNotMappedAsPointer           =     213,
+  
+    /**
+     * This indicates that an uncorrectable ECC error was detected during
+     * execution.
+     */
+    cudaErrorECCUncorrectable             =     214,
+  
+    /**
+     * This indicates that the ::cudaLimit passed to the API call is not
+     * supported by the active device.
+     */
+    cudaErrorUnsupportedLimit             =     215,
+    
+    /**
+     * This indicates that a call tried to access an exclusive-thread device that 
+     * is already in use by a different thread.
+     */
+    cudaErrorDeviceAlreadyInUse           =     216,
+
+    /**
+     * This error indicates that P2P access is not supported across the given
+     * devices.
+     */
+    cudaErrorPeerAccessUnsupported        =     217,
+
+    /**
+     * A PTX compilation failed. The runtime may fall back to compiling PTX if
+     * an application does not contain a suitable binary for the current device.
+     */
+    cudaErrorInvalidPtx                   =     218,
+
+    /**
+     * This indicates an error with the OpenGL or DirectX context.
+     */
+    cudaErrorInvalidGraphicsContext       =     219,
+
+    /**
+     * This indicates that an uncorrectable NVLink error was detected during the
+     * execution.
+     */
+    cudaErrorNvlinkUncorrectable          =     220,
+
+    /**
+     * This indicates that the PTX JIT compiler library was not found. The JIT Compiler
+     * library is used for PTX compilation. The runtime may fall back to compiling PTX
+     * if an application does not contain a suitable binary for the current device.
+     */
+    cudaErrorJitCompilerNotFound          =     221,
+
+    /**
+     * This indicates that the provided PTX was compiled with an unsupported toolchain.
+     * The most common reason for this is that the PTX was generated by a compiler newer
+     * than what is supported by the CUDA driver and PTX JIT compiler.
+     */
+    cudaErrorUnsupportedPtxVersion        =     222,
+
+    /**
+     * This indicates that the JIT compilation was disabled. The JIT compilation compiles
+     * PTX. The runtime may fall back to compiling PTX if an application does not contain
+     * a suitable binary for the current device.
+     */
+    cudaErrorJitCompilationDisabled       =     223,
+
+    /**
+     * This indicates that the provided execution affinity is not supported by the device.
+     */
+    cudaErrorUnsupportedExecAffinity      =     224,
+
+    /**
+     * This indicates that the code to be compiled by the PTX JIT contains
+     * unsupported call to cudaDeviceSynchronize.
+     */
+    cudaErrorUnsupportedDevSideSync       =     225,
+
+    /**
+     * This indicates that an exception occurred on the device that is now
+     * contained by the GPU's error containment capability. Common causes are -
+     * a. Certain types of invalid accesses of peer GPU memory over nvlink
+     * b. Certain classes of hardware errors
+     * This leaves the process in an inconsistent state and any further CUDA
+     * work will return the same error. To continue using CUDA, the process must
+     * be terminated and relaunched.
+     */
+    cudaErrorContained                    =     226,
+
+    /**
+     * This indicates that the device kernel source is invalid.
+     */
+    cudaErrorInvalidSource                =     300,
+
+    /**
+     * This indicates that the file specified was not found.
+     */
+    cudaErrorFileNotFound                 =     301,
+  
+    /**
+     * This indicates that a link to a shared object failed to resolve.
+     */
+    cudaErrorSharedObjectSymbolNotFound   =     302,
+  
+    /**
+     * This indicates that initialization of a shared object failed.
+     */
+    cudaErrorSharedObjectInitFailed       =     303,
+
+    /**
+     * This error indicates that an OS call failed.
+     */
+    cudaErrorOperatingSystem              =     304,
+  
+    /**
+     * This indicates that a resource handle passed to the API call was not
+     * valid. Resource handles are opaque types like ::cudaStream_t and
+     * ::cudaEvent_t.
+     */
+    cudaErrorInvalidResourceHandle        =     400,
+
+    /**
+     * This indicates that a resource required by the API call is not in a
+     * valid state to perform the requested operation.
+     */
+    cudaErrorIllegalState                 =     401,
+
+    /**
+     * This indicates an attempt was made to introspect an object in a way that
+     * would discard semantically important information. This is either due to
+     * the object using functionality newer than the API version used to
+     * introspect it or omission of optional return arguments.
+     */
+    cudaErrorLossyQuery                   =     402,
+
+    /**
+     * This indicates that a named symbol was not found. Examples of symbols
+     * are global/constant variable names, driver function names, texture names,
+     * and surface names.
+     */
+    cudaErrorSymbolNotFound               =     500,
+  
+    /**
+     * This indicates that asynchronous operations issued previously have not
+     * completed yet. This result is not actually an error, but must be indicated
+     * differently than ::cudaSuccess (which indicates completion). Calls that
+     * may return this value include ::cudaEventQuery() and ::cudaStreamQuery().
+     */
+    cudaErrorNotReady                     =     600,
+
+    /**
+     * The device encountered a load or store instruction on an invalid memory address.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorIllegalAddress               =     700,
+  
+    /**
+     * This indicates that a launch did not occur because it did not have
+     * appropriate resources. Although this error is similar to
+     * ::cudaErrorInvalidConfiguration, this error usually indicates that the
+     * user has attempted to pass too many arguments to the device kernel, or the
+     * kernel launch specifies too many threads for the kernel's register count.
+     */
+    cudaErrorLaunchOutOfResources         =      701,
+  
+    /**
+     * This indicates that the device kernel took too long to execute. This can
+     * only occur if timeouts are enabled - see the device property
+     * \ref ::cudaDeviceProp::kernelExecTimeoutEnabled "kernelExecTimeoutEnabled"
+     * for more information.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorLaunchTimeout                =      702,
+
+    /**
+     * This error indicates a kernel launch that uses an incompatible texturing
+     * mode.
+     */
+    cudaErrorLaunchIncompatibleTexturing  =     703,
+      
+    /**
+     * This error indicates that a call to ::cudaDeviceEnablePeerAccess() is
+     * trying to re-enable peer addressing from a context which has already
+     * had peer addressing enabled.
+     */
+    cudaErrorPeerAccessAlreadyEnabled     =     704,
+    
+    /**
+     * This error indicates that ::cudaDeviceDisablePeerAccess() is trying to 
+     * disable peer addressing which has not been enabled yet via 
+     * ::cudaDeviceEnablePeerAccess().
+     */
+    cudaErrorPeerAccessNotEnabled         =     705,
+  
+    /**
+     * This indicates that the user has called ::cudaSetValidDevices(),
+     * ::cudaSetDeviceFlags(), ::cudaD3D9SetDirect3DDevice(),
+     * ::cudaD3D10SetDirect3DDevice, ::cudaD3D11SetDirect3DDevice(), or
+     * ::cudaVDPAUSetVDPAUDevice() after initializing the CUDA runtime by
+     * calling non-device management operations (allocating memory and
+     * launching kernels are examples of non-device management operations).
+     * This error can also be returned if using runtime/driver
+     * interoperability and there is an existing ::CUcontext active on the
+     * host thread.
+     */
+    cudaErrorSetOnActiveProcess           =     708,
+
+    /**
+     * This error indicates that the context current to the calling thread
+     * has been destroyed using ::cuCtxDestroy, or is a primary context which
+     * has not yet been initialized.
+     */
+    cudaErrorContextIsDestroyed           =     709,
+
+    /**
+     * An assert triggered in device code during kernel execution. The device
+     * cannot be used again. All existing allocations are invalid. To continue
+     * using CUDA, the process must be terminated and relaunched.
+     */
+    cudaErrorAssert                        =    710,
+  
+    /**
+     * This error indicates that the hardware resources required to enable
+     * peer access have been exhausted for one or more of the devices 
+     * passed to ::cudaDeviceEnablePeerAccess().
+     */
+    cudaErrorTooManyPeers                 =     711,
+  
+    /**
+     * This error indicates that the memory range passed to ::cudaHostRegister()
+     * has already been registered.
+     */
+    cudaErrorHostMemoryAlreadyRegistered  =     712,
+        
+    /**
+     * This error indicates that the pointer passed to ::cudaHostUnregister()
+     * does not correspond to any currently registered memory region.
+     */
+    cudaErrorHostMemoryNotRegistered      =     713,
+
+    /**
+     * Device encountered an error in the call stack during kernel execution,
+     * possibly due to stack corruption or exceeding the stack size limit.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorHardwareStackError           =     714,
+
+    /**
+     * The device encountered an illegal instruction during kernel execution.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorIllegalInstruction           =     715,
+
+    /**
+     * The device encountered a load or store instruction
+     * on a memory address which is not aligned.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorMisalignedAddress            =     716,
+
+    /**
+     * While executing a kernel, the device encountered an instruction
+     * which can only operate on memory locations in certain address spaces
+     * (global, shared, or local), but was supplied a memory address not
+     * belonging to an allowed address space.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorInvalidAddressSpace          =     717,
+
+    /**
+     * The device encountered an invalid program counter.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorInvalidPc                    =     718,
+  
+    /**
+     * An exception occurred on the device while executing a kernel. Common
+     * causes include dereferencing an invalid device pointer and accessing
+     * out of bounds shared memory. Less common cases can be system specific - more
+     * information about these cases can be found in the system specific user guide.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorLaunchFailure                =      719,
+
+    /**
+     * This error indicates that the number of blocks launched per grid for a kernel that was
+     * launched via either ::cudaLaunchCooperativeKernel or ::cudaLaunchCooperativeKernelMultiDevice
+     * exceeds the maximum number of blocks as allowed by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+     * or ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors
+     * as specified by the device attribute ::cudaDevAttrMultiProcessorCount.
+     */
+    cudaErrorCooperativeLaunchTooLarge    =     720,
+
+    /**
+     * An exception occurred on the device while exiting a kernel using tensor memory: the
+     * tensor memory was not completely deallocated. This leaves the process in an inconsistent
+     * state and any further CUDA work will return the same error. To continue using CUDA, the
+     * process must be terminated and relaunched.
+     */
+    cudaErrorTensorMemoryLeak             =     721,
+    
+    /**
+     * This error indicates the attempted operation is not permitted.
+     */
+    cudaErrorNotPermitted                 =     800,
+
+    /**
+     * This error indicates the attempted operation is not supported
+     * on the current system or device.
+     */
+    cudaErrorNotSupported                 =     801,
+
+    /**
+     * This error indicates that the system is not yet ready to start any CUDA
+     * work.  To continue using CUDA, verify the system configuration is in a
+     * valid state and all required driver daemons are actively running.
+     * More information about this error can be found in the system specific
+     * user guide.
+     */
+    cudaErrorSystemNotReady               =     802,
+
+    /**
+     * This error indicates that there is a mismatch between the versions of
+     * the display driver and the CUDA driver. Refer to the compatibility documentation
+     * for supported versions.
+     */
+    cudaErrorSystemDriverMismatch         =     803,
+
+    /**
+     * This error indicates that the system was upgraded to run with forward compatibility
+     * but the visible hardware detected by CUDA does not support this configuration.
+     * Refer to the compatibility documentation for the supported hardware matrix or ensure
+     * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES
+     * environment variable.
+     */
+    cudaErrorCompatNotSupportedOnDevice   =     804,
+
+    /**
+     * This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server.
+     */
+    cudaErrorMpsConnectionFailed          =     805,
+
+    /**
+     * This error indicates that the remote procedural call between the MPS server and the MPS client failed.
+     */
+    cudaErrorMpsRpcFailure                =     806,
+
+    /**
+     * This error indicates that the MPS server is not ready to accept new MPS client requests.
+     * This error can be returned when the MPS server is in the process of recovering from a fatal failure.
+     */
+    cudaErrorMpsServerNotReady            =     807,
+
+    /**
+     * This error indicates that the hardware resources required to create MPS client have been exhausted.
+     */
+    cudaErrorMpsMaxClientsReached         =     808,
+
+    /**
+     * This error indicates that the hardware resources required to support device connections have been exhausted.
+     */
+    cudaErrorMpsMaxConnectionsReached     =     809,
+
+    /**
+     * This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched.
+     */
+    cudaErrorMpsClientTerminated          =     810,
+
+    /**
+     * This error indicates that the program is using CUDA Dynamic Parallelism, but the current configuration, like MPS, does not support it.
+     */
+    cudaErrorCdpNotSupported              =     811,
+
+    /**
+     * This error indicates that the program contains an unsupported interaction between different versions of CUDA Dynamic Parallelism.
+     */
+    cudaErrorCdpVersionMismatch           =     812,
+
+    /**
+     * The operation is not permitted when the stream is capturing.
+     */
+    cudaErrorStreamCaptureUnsupported     =    900,
+
+    /**
+     * The current capture sequence on the stream has been invalidated due to
+     * a previous error.
+     */
+    cudaErrorStreamCaptureInvalidated     =    901,
+
+    /**
+     * The operation would have resulted in a merge of two independent capture
+     * sequences.
+     */
+    cudaErrorStreamCaptureMerge           =    902,
+
+    /**
+     * The capture was not initiated in this stream.
+     */
+    cudaErrorStreamCaptureUnmatched       =    903,
+
+    /**
+     * The capture sequence contains a fork that was not joined to the primary
+     * stream.
+     */
+    cudaErrorStreamCaptureUnjoined        =    904,
+
+    /**
+     * A dependency would have been created which crosses the capture sequence
+     * boundary. Only implicit in-stream ordering dependencies are allowed to
+     * cross the boundary.
+     */
+    cudaErrorStreamCaptureIsolation       =    905,
+
+    /**
+     * The operation would have resulted in a disallowed implicit dependency on
+     * a current capture sequence from cudaStreamLegacy.
+     */
+    cudaErrorStreamCaptureImplicit        =    906,
+
+    /**
+     * The operation is not permitted on an event which was last recorded in a
+     * capturing stream.
+     */
+    cudaErrorCapturedEvent                =    907,
+  
+    /**
+     * A stream capture sequence not initiated with the ::cudaStreamCaptureModeRelaxed
+     * argument to ::cudaStreamBeginCapture was passed to ::cudaStreamEndCapture in a
+     * different thread.
+     */
+    cudaErrorStreamCaptureWrongThread     =    908,
+
+    /**
+     * This indicates that the wait operation has timed out.
+     */
+    cudaErrorTimeout                      =    909,
+
+    /**
+     * This error indicates that the graph update was not performed because it included 
+     * changes which violated constraints specific to instantiated graph update.
+     */
+    cudaErrorGraphExecUpdateFailure       =    910,
+
+    /**
+     * This indicates that an async error has occurred in a device outside of CUDA.
+     * If CUDA was waiting for an external device's signal before consuming shared data,
+     * the external device signaled an error indicating that the data is not valid for
+     * consumption. This leaves the process in an inconsistent state and any further CUDA
+     * work will return the same error. To continue using CUDA, the process must be
+     * terminated and relaunched.
+     */
+    cudaErrorExternalDevice               =    911,
+
+    /**
+     * This indicates that a kernel launch error has occurred due to cluster
+     * misconfiguration.
+     */
+    cudaErrorInvalidClusterSize           =    912,
+
+    /**
+     * Indicates a function handle is not loaded when calling an API that requires
+     * a loaded function.
+     */
+    cudaErrorFunctionNotLoaded            =    913,
+
+    /**
+     * This error indicates one or more resources passed in are not valid resource
+     * types for the operation.
+     */
+    cudaErrorInvalidResourceType          =    914,
+
+    /**
+     * This error indicates one or more resources are insufficient or non-applicable for
+     * the operation.
+     */
+    cudaErrorInvalidResourceConfiguration =    915,
+
+    /**
+     * This indicates that an unknown internal error has occurred.
+     */
+    cudaErrorUnknown                      =    999
+
+    /**
+     * Any unhandled CUDA driver error is added to this value and returned via
+     * the runtime. Production releases of CUDA should not return such errors.
+     * \deprecated
+     * This error return is deprecated as of CUDA 4.1.
+     */
+    , cudaErrorApiFailureBase               =  10000
+};
+
+/**
+ * Channel format kind: plain signed/unsigned/float, NV12, normalized-integer, block-compressed (BC1-BC7) and packed 10-10-10-2 encodings
+ */
+enum __device_builtin__ cudaChannelFormatKind
+{
+    cudaChannelFormatKindSigned                         =   0,      /**< Signed channel format */
+    cudaChannelFormatKindUnsigned                       =   1,      /**< Unsigned channel format */
+    cudaChannelFormatKindFloat                          =   2,      /**< Float channel format */
+    cudaChannelFormatKindNone                           =   3,      /**< No channel format */
+    cudaChannelFormatKindNV12                           =   4,      /**< Unsigned 8-bit integers, planar 4:2:0 YUV format */
+    cudaChannelFormatKindUnsignedNormalized8X1          =   5,      /**< 1 channel unsigned 8-bit normalized integer */
+    cudaChannelFormatKindUnsignedNormalized8X2          =   6,      /**< 2 channel unsigned 8-bit normalized integer */
+    cudaChannelFormatKindUnsignedNormalized8X4          =   7,      /**< 4 channel unsigned 8-bit normalized integer */
+    cudaChannelFormatKindUnsignedNormalized16X1         =   8,      /**< 1 channel unsigned 16-bit normalized integer */
+    cudaChannelFormatKindUnsignedNormalized16X2         =   9,      /**< 2 channel unsigned 16-bit normalized integer */
+    cudaChannelFormatKindUnsignedNormalized16X4         =   10,     /**< 4 channel unsigned 16-bit normalized integer */
+    cudaChannelFormatKindSignedNormalized8X1            =   11,     /**< 1 channel signed 8-bit normalized integer */
+    cudaChannelFormatKindSignedNormalized8X2            =   12,     /**< 2 channel signed 8-bit normalized integer */
+    cudaChannelFormatKindSignedNormalized8X4            =   13,     /**< 4 channel signed 8-bit normalized integer */
+    cudaChannelFormatKindSignedNormalized16X1           =   14,     /**< 1 channel signed 16-bit normalized integer */
+    cudaChannelFormatKindSignedNormalized16X2           =   15,     /**< 2 channel signed 16-bit normalized integer */
+    cudaChannelFormatKindSignedNormalized16X4           =   16,     /**< 4 channel signed 16-bit normalized integer */
+    cudaChannelFormatKindUnsignedBlockCompressed1       =   17,     /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */
+    cudaChannelFormatKindUnsignedBlockCompressed1SRGB   =   18,     /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding */
+    cudaChannelFormatKindUnsignedBlockCompressed2       =   19,     /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */
+    cudaChannelFormatKindUnsignedBlockCompressed2SRGB   =   20,     /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding */
+    cudaChannelFormatKindUnsignedBlockCompressed3       =   21,     /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */
+    cudaChannelFormatKindUnsignedBlockCompressed3SRGB   =   22,     /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding */
+    cudaChannelFormatKindUnsignedBlockCompressed4       =   23,     /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */
+    cudaChannelFormatKindSignedBlockCompressed4         =   24,     /**< 1 channel signed normalized block-compressed (BC4 compression) format */
+    cudaChannelFormatKindUnsignedBlockCompressed5       =   25,     /**< 2 channel unsigned normalized block-compressed (BC5 compression) format */
+    cudaChannelFormatKindSignedBlockCompressed5         =   26,     /**< 2 channel signed normalized block-compressed (BC5 compression) format */
+    cudaChannelFormatKindUnsignedBlockCompressed6H      =   27,     /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */
+    cudaChannelFormatKindSignedBlockCompressed6H        =   28,     /**< 3 channel signed half-float block-compressed (BC6H compression) format */
+    cudaChannelFormatKindUnsignedBlockCompressed7       =   29,     /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */
+    cudaChannelFormatKindUnsignedBlockCompressed7SRGB   =   30,     /**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */
+    cudaChannelFormatKindUnsignedNormalized1010102      =   31      /**< 4 channel unsigned normalized (10-bit, 10-bit, 10-bit, 2-bit) format */
+
+};
+
+/**
+ * CUDA Channel format descriptor: four per-component integers (x, y, z, w) plus the channel format kind
+ */
+struct __device_builtin__ cudaChannelFormatDesc
+{
+    int                        x; /**< x (presumably the bit width of the x component - confirm against ::cudaCreateChannelDesc) */
+    int                        y; /**< y (presumably the bit width of the y component - confirm) */
+    int                        z; /**< z (presumably the bit width of the z component - confirm) */
+    int                        w; /**< w (presumably the bit width of the w component - confirm) */
+    enum cudaChannelFormatKind f; /**< Channel format kind */
+};
+
+/**
+ * CUDA array handle (pointer to struct cudaArray, which is only forward-declared below)
+ */
+typedef struct cudaArray *cudaArray_t;
+
+/**
+ * CUDA array (as source copy argument); pointer-to-const variant of ::cudaArray_t
+ */
+typedef const struct cudaArray *cudaArray_const_t;
+
+struct cudaArray; /* forward declaration; no members are declared at this point */
+
+/**
+ * CUDA mipmapped array handle (pointer to struct cudaMipmappedArray, which is only forward-declared below)
+ */
+typedef struct cudaMipmappedArray *cudaMipmappedArray_t;
+
+/**
+ * CUDA mipmapped array (as source argument); pointer-to-const variant of ::cudaMipmappedArray_t
+ */
+typedef const struct cudaMipmappedArray *cudaMipmappedArray_const_t;
+
+struct cudaMipmappedArray; /* forward declaration; no members are declared at this point */
+
+/**
+ * Flag for ::cudaArraySparseProperties::flags: indicates that the layered sparse CUDA array or CUDA mipmapped array has a single mip tail region for all layers
+ */
+#define cudaArraySparsePropertiesSingleMipTail   0x1
+
+/**
+ * Sparse CUDA array and CUDA mipmapped array properties: tile extent, mip tail location/size, and flags
+ */
+struct __device_builtin__ cudaArraySparseProperties {
+    struct {
+        unsigned int width;             /**< Tile width in elements */
+        unsigned int height;            /**< Tile height in elements */
+        unsigned int depth;             /**< Tile depth in elements */
+    } tileExtent;                       /**< Tile extent (width, height, depth), each in elements */
+    unsigned int miptailFirstLevel;     /**< First mip level at which the mip tail begins */   
+    unsigned long long miptailSize;     /**< Total size of the mip tail (presumably in bytes - confirm). */
+    unsigned int flags;                 /**< Flags will either be zero or ::cudaArraySparsePropertiesSingleMipTail */
+    unsigned int reserved[4];           /**< Reserved; no meaning is documented here */
+};
+
+/**
+ * CUDA array and CUDA mipmapped array memory requirements: total size and mapping alignment
+ */
+struct __device_builtin__ cudaArrayMemoryRequirements {
+    size_t size;                    /**< Total size of the array (presumably in bytes - confirm). */
+    size_t alignment;               /**< Alignment necessary for mapping the array. */
+    unsigned int reserved[4];       /**< Reserved; no meaning is documented here */
+};
+
+/**
+ * CUDA memory types (the type of the ::cudaPointerAttributes::type field)
+ */
+enum __device_builtin__ cudaMemoryType
+{
+    cudaMemoryTypeUnregistered = 0, /**< Unregistered memory */
+    cudaMemoryTypeHost         = 1, /**< Host memory */
+    cudaMemoryTypeDevice       = 2, /**< Device memory */
+    cudaMemoryTypeManaged      = 3  /**< Managed memory */
+};
+
+/**
+ * CUDA memory copy types: the direction of a transfer (the type of the ::cudaMemcpy3DParms::kind field)
+ */
+enum __device_builtin__ cudaMemcpyKind
+{
+    cudaMemcpyHostToHost          =   0,      /**< Host   -> Host */
+    cudaMemcpyHostToDevice        =   1,      /**< Host   -> Device */
+    cudaMemcpyDeviceToHost        =   2,      /**< Device -> Host */
+    cudaMemcpyDeviceToDevice      =   3,      /**< Device -> Device */
+    cudaMemcpyDefault             =   4       /**< Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing */
+};
+
+/**
+ * CUDA Pitched memory pointer (type of the srcPtr/dstPtr members of ::cudaMemcpy3DParms and ::cudaMemcpy3DPeerParms)
+ *
+ * \sa ::make_cudaPitchedPtr
+ */
+struct __device_builtin__ cudaPitchedPtr
+{
+    void   *ptr;      /**< Pointer to allocated memory */
+    size_t  pitch;    /**< Pitch of allocated memory in bytes */
+    size_t  xsize;    /**< Logical width of allocation in elements */
+    size_t  ysize;    /**< Logical height of allocation in elements */
+};
+
+/**
+ * CUDA extent: a width/height/depth triple (type of the extent member of ::cudaMemcpy3DParms and ::cudaMemcpy3DPeerParms)
+ *
+ * \sa ::make_cudaExtent
+ */
+struct __device_builtin__ cudaExtent
+{
+    size_t width;     /**< Width in elements when referring to array memory, in bytes when referring to linear memory */
+    size_t height;    /**< Height in elements */
+    size_t depth;     /**< Depth in elements */
+};
+
+/**
+ * CUDA 3D position (type of the srcPos/dstPos offset members of ::cudaMemcpy3DParms and ::cudaMemcpy3DPeerParms)
+ *
+ * \sa ::make_cudaPos
+ */
+struct __device_builtin__ cudaPos
+{
+    size_t x;     /**< x coordinate */
+    size_t y;     /**< y coordinate */
+    size_t z;     /**< z coordinate */
+};
+
+/**
+ * CUDA 3D memory copying parameters: source and destination (each as array handle, position and pitched pointer), extent, and transfer kind
+ */
+struct __device_builtin__ cudaMemcpy3DParms
+{
+    cudaArray_t            srcArray;  /**< Source memory address (as a CUDA array handle) */
+    struct cudaPos         srcPos;    /**< Source position offset */
+    struct cudaPitchedPtr  srcPtr;    /**< Pitched source memory address */
+  
+    cudaArray_t            dstArray;  /**< Destination memory address (as a CUDA array handle) */
+    struct cudaPos         dstPos;    /**< Destination position offset */
+    struct cudaPitchedPtr  dstPtr;    /**< Pitched destination memory address */
+  
+    struct cudaExtent      extent;    /**< Requested memory copy size */
+    enum cudaMemcpyKind    kind;      /**< Type of transfer */
+};
+
+/**
+ * Memcpy node parameters: a ::cudaMemcpy3DParms preceded by flags/reserved fields that must be zero
+ */
+struct __device_builtin__ cudaMemcpyNodeParams {
+    int flags;                            /**< Must be zero */
+    int reserved[3];                      /**< Must be zero */
+    struct cudaMemcpy3DParms copyParams;  /**< Parameters for the memory copy */
+};
+
+/**
+ * CUDA 3D cross-device memory copying parameters (like ::cudaMemcpy3DParms, but with srcDevice/dstDevice members and no kind member)
+ */
+struct __device_builtin__ cudaMemcpy3DPeerParms
+{
+    cudaArray_t            srcArray;  /**< Source memory address (as a CUDA array handle) */
+    struct cudaPos         srcPos;    /**< Source position offset */
+    struct cudaPitchedPtr  srcPtr;    /**< Pitched source memory address */
+    int                    srcDevice; /**< Source device */
+  
+    cudaArray_t            dstArray;  /**< Destination memory address (as a CUDA array handle) */
+    struct cudaPos         dstPos;    /**< Destination position offset */
+    struct cudaPitchedPtr  dstPtr;    /**< Pitched destination memory address */
+    int                    dstDevice; /**< Destination device */
+  
+    struct cudaExtent      extent;    /**< Requested memory copy size */
+};
+
+/**
+ * CUDA Memset node parameters: destination, fill value, element size, and the width/height of the region in elements/rows
+ */
+struct __device_builtin__  cudaMemsetParams {
+    void *dst;                              /**< Destination device pointer */
+    size_t pitch;                           /**< Pitch of destination device pointer. Unused if height is 1 */
+    unsigned int value;                     /**< Value to be set */
+    unsigned int elementSize;               /**< Size of each element in bytes. Must be 1, 2, or 4. */
+    size_t width;                           /**< Width of the row in elements */
+    size_t height;                          /**< Number of rows */
+};
+
+/**
+ * CUDA Memset node parameters, V2 (as declared here, the members are the same as those of ::cudaMemsetParams)
+ */
+struct __device_builtin__  cudaMemsetParamsV2 {
+    void *dst;                              /**< Destination device pointer */
+    size_t pitch;                           /**< Pitch of destination device pointer. Unused if height is 1 */
+    unsigned int value;                     /**< Value to be set */
+    unsigned int elementSize;               /**< Size of each element in bytes. Must be 1, 2, or 4. */
+    size_t width;                           /**< Width of the row in elements */
+    size_t height;                          /**< Number of rows */
+};
+
+/**
+ * Specifies performance hint with ::cudaAccessPolicyWindow for hitProp and missProp members.
+ */
+enum __device_builtin__  cudaAccessProperty {
+    cudaAccessPropertyNormal = 0,       /**< Normal cache persistence. */
+    cudaAccessPropertyStreaming = 1,    /**< Streaming access is less likely to persist in cache. */
+    cudaAccessPropertyPersisting = 2    /**< Persisting access is more likely to persist in cache. */
+};
+
+/**
+ * Specifies an access policy for a window, a contiguous extent of memory
+ * beginning at base_ptr and ending at base_ptr + num_bytes.
+ * The window is partitioned into many segments, which are assigned such that:
+ * sum of "hit segments" / window == approx. ratio.
+ * sum of "miss segments" / window == approx. 1-ratio.
+ * Segments and ratio specifications are fitted to the capabilities of
+ * the architecture.
+ * Accesses in a hit segment apply the hitProp access policy.
+ * Accesses in a miss segment apply the missProp access policy.
+ */
+struct __device_builtin__ cudaAccessPolicyWindow {
+    void *base_ptr;                     /**< Starting address of the access policy window. CUDA driver may align it. */
+    size_t num_bytes;                   /**< Size in bytes of the window policy. CUDA driver may restrict the maximum size and alignment. */
+    float hitRatio;                     /**< hitRatio specifies percentage of lines assigned hitProp, rest are assigned missProp. */
+    enum cudaAccessProperty hitProp;    /**< ::cudaAccessProperty set for hit. */
+    enum cudaAccessProperty missProp;   /**< ::cudaAccessProperty set for miss. Must be either NORMAL or STREAMING. */
+};
+
+#ifdef _WIN32
+#define CUDART_CB __stdcall
+#else  /* !_WIN32: CUDART_CB expands to nothing */
+#define CUDART_CB
+#endif /* _WIN32 */
+
+/**
+ * CUDA host function: callback pointer type declared with the CUDART_CB calling convention (__stdcall when _WIN32 is defined, empty otherwise)
+ * \param userData Argument value passed to the function
+ */
+typedef void (CUDART_CB *cudaHostFn_t)(void *userData);
+
+/**
+ * CUDA host node parameters: the ::cudaHostFn_t callback and the userData argument passed to it
+ */
+struct __device_builtin__ cudaHostNodeParams {
+    cudaHostFn_t fn;    /**< The function to call when the node executes */
+    void* userData; /**< Argument to pass to the function */
+};
+
+/**
+ * CUDA host node parameters, V2 (as declared here, the members are the same as those of ::cudaHostNodeParams)
+ */
+struct __device_builtin__ cudaHostNodeParamsV2 {
+    cudaHostFn_t fn;    /**< The function to call when the node executes */
+    void* userData; /**< Argument to pass to the function */
+};
+
+/**
+ * Possible stream capture statuses, as returned by ::cudaStreamIsCapturing
+ */
+enum __device_builtin__ cudaStreamCaptureStatus {
+    cudaStreamCaptureStatusNone        = 0, /**< Stream is not capturing */
+    cudaStreamCaptureStatusActive      = 1, /**< Stream is actively capturing */
+    cudaStreamCaptureStatusInvalidated = 2  /**< Stream is part of a capture sequence that
+                                                   has been invalidated, but not yet terminated */
+};
+
+/**
+ * Possible modes for stream capture thread interactions. For more details see
+ * ::cudaStreamBeginCapture and ::cudaThreadExchangeStreamCaptureMode
+ */
+enum __device_builtin__ cudaStreamCaptureMode {
+    cudaStreamCaptureModeGlobal      = 0, /**< Not documented here; see ::cudaStreamBeginCapture */
+    cudaStreamCaptureModeThreadLocal = 1, /**< Not documented here; see ::cudaStreamBeginCapture */
+    cudaStreamCaptureModeRelaxed     = 2  /**< Per ::cudaErrorStreamCaptureWrongThread, a capture not begun with this mode must not be ended from a different thread */
+};
+
+enum __device_builtin__ cudaSynchronizationPolicy { /* NOTE(review): no documentation accompanies this enum here; values start at 1, not 0 */
+    cudaSyncPolicyAuto = 1,
+    cudaSyncPolicySpin = 2,
+    cudaSyncPolicyYield = 3,
+    cudaSyncPolicyBlockingSync = 4
+};
+
+/**
+ * Cluster scheduling policies (how the blocks of a cluster are placed on SMs). These may be passed to ::cudaFuncSetAttribute
+ */
+enum __device_builtin__ cudaClusterSchedulingPolicy {
+    cudaClusterSchedulingPolicyDefault       = 0, /**< the default policy */
+    cudaClusterSchedulingPolicySpread        = 1, /**< spread the blocks within a cluster to the SMs */
+    cudaClusterSchedulingPolicyLoadBalancing = 2  /**< allow the hardware to load-balance the blocks in a cluster to the SMs */
+};
+
+/**
+ * Flags for ::cudaStreamUpdateCaptureDependencies: either add to or replace the dependency set
+ */
+enum __device_builtin__ cudaStreamUpdateCaptureDependenciesFlags {
+    cudaStreamAddCaptureDependencies = 0x0, /**< Add new nodes to the dependency set */
+    cudaStreamSetCaptureDependencies = 0x1  /**< Replace the dependency set with the new nodes */
+};
+
+/**
+ * Flags for user objects for graphs (a single flag is declared here)
+ */
+enum __device_builtin__ cudaUserObjectFlags {
+    cudaUserObjectNoDestructorSync = 0x1  /**< Indicates the destructor execution is not synchronized by any CUDA handle. */
+};
+
+/**
+ * Flags for retaining user object references for graphs (a single flag is declared here)
+ */
+enum __device_builtin__ cudaUserObjectRetainFlags {
+    cudaGraphUserObjectMove = 0x1  /**< Transfer references from the caller rather than creating new references. */
+};
+
+/**
+ * CUDA graphics interop resource (forward declaration only; no members are declared at this point)
+ */
+struct cudaGraphicsResource;
+
+/**
+ * CUDA graphics interop register flags (values 0, 1, 2, 4, 8; presumably combinable as a bitmask - confirm)
+ */
+enum __device_builtin__ cudaGraphicsRegisterFlags
+{
+    cudaGraphicsRegisterFlagsNone             = 0,  /**< Default */
+    cudaGraphicsRegisterFlagsReadOnly         = 1,  /**< CUDA will not write to this resource */ 
+    cudaGraphicsRegisterFlagsWriteDiscard     = 2,  /**< CUDA will only write to and will not read from this resource */
+    cudaGraphicsRegisterFlagsSurfaceLoadStore = 4,  /**< CUDA will bind this resource to a surface reference */
+    cudaGraphicsRegisterFlagsTextureGather    = 8   /**< CUDA will perform texture gather operations on this resource */
+};
+
+/**
+ * CUDA graphics interop map flags: whether CUDA may read and write, only read, or only write the resource
+ */
+enum __device_builtin__ cudaGraphicsMapFlags
+{
+    cudaGraphicsMapFlagsNone         = 0,  /**< Default; Assume resource can be read/written */
+    cudaGraphicsMapFlagsReadOnly     = 1,  /**< CUDA will not write to this resource */
+    cudaGraphicsMapFlagsWriteDiscard = 2   /**< CUDA will only write to and will not read from this resource */
+};
+
+/**
+ * CUDA graphics interop array indices for cube maps (faces numbered 0-5 in the order +X, -X, +Y, -Y, +Z, -Z)
+ */
+enum __device_builtin__ cudaGraphicsCubeFace 
+{
+    cudaGraphicsCubeFacePositiveX = 0x00, /**< Positive X face of cubemap */
+    cudaGraphicsCubeFaceNegativeX = 0x01, /**< Negative X face of cubemap */
+    cudaGraphicsCubeFacePositiveY = 0x02, /**< Positive Y face of cubemap */
+    cudaGraphicsCubeFaceNegativeY = 0x03, /**< Negative Y face of cubemap */
+    cudaGraphicsCubeFacePositiveZ = 0x04, /**< Positive Z face of cubemap */
+    cudaGraphicsCubeFaceNegativeZ = 0x05  /**< Negative Z face of cubemap */
+};
+
+/**
+ * CUDA resource types (type of ::cudaResourceDesc::resType; the four kinds mirror the array/mipmap/linear/pitch2D members of ::cudaResourceDesc::res)
+ */
+enum __device_builtin__ cudaResourceType
+{
+    cudaResourceTypeArray          = 0x00, /**< Array resource */
+    cudaResourceTypeMipmappedArray = 0x01, /**< Mipmapped array resource */
+    cudaResourceTypeLinear         = 0x02, /**< Linear resource */
+    cudaResourceTypePitch2D        = 0x03  /**< Pitch 2D resource */
+};
+
+/**
+ * CUDA texture resource view formats (type of the ::cudaResourceViewDesc::format field)
+ */
+enum __device_builtin__ cudaResourceViewFormat
+{
+    cudaResViewFormatNone                      = 0x00, /**< No resource view format (use underlying resource format) */
+    cudaResViewFormatUnsignedChar1             = 0x01, /**< 1 channel unsigned 8-bit integers */
+    cudaResViewFormatUnsignedChar2             = 0x02, /**< 2 channel unsigned 8-bit integers */
+    cudaResViewFormatUnsignedChar4             = 0x03, /**< 4 channel unsigned 8-bit integers */
+    cudaResViewFormatSignedChar1               = 0x04, /**< 1 channel signed 8-bit integers */
+    cudaResViewFormatSignedChar2               = 0x05, /**< 2 channel signed 8-bit integers */
+    cudaResViewFormatSignedChar4               = 0x06, /**< 4 channel signed 8-bit integers */
+    cudaResViewFormatUnsignedShort1            = 0x07, /**< 1 channel unsigned 16-bit integers */
+    cudaResViewFormatUnsignedShort2            = 0x08, /**< 2 channel unsigned 16-bit integers */
+    cudaResViewFormatUnsignedShort4            = 0x09, /**< 4 channel unsigned 16-bit integers */
+    cudaResViewFormatSignedShort1              = 0x0a, /**< 1 channel signed 16-bit integers */
+    cudaResViewFormatSignedShort2              = 0x0b, /**< 2 channel signed 16-bit integers */
+    cudaResViewFormatSignedShort4              = 0x0c, /**< 4 channel signed 16-bit integers */
+    cudaResViewFormatUnsignedInt1              = 0x0d, /**< 1 channel unsigned 32-bit integers */
+    cudaResViewFormatUnsignedInt2              = 0x0e, /**< 2 channel unsigned 32-bit integers */
+    cudaResViewFormatUnsignedInt4              = 0x0f, /**< 4 channel unsigned 32-bit integers */
+    cudaResViewFormatSignedInt1                = 0x10, /**< 1 channel signed 32-bit integers */
+    cudaResViewFormatSignedInt2                = 0x11, /**< 2 channel signed 32-bit integers */
+    cudaResViewFormatSignedInt4                = 0x12, /**< 4 channel signed 32-bit integers */
+    cudaResViewFormatHalf1                     = 0x13, /**< 1 channel 16-bit floating point */
+    cudaResViewFormatHalf2                     = 0x14, /**< 2 channel 16-bit floating point */
+    cudaResViewFormatHalf4                     = 0x15, /**< 4 channel 16-bit floating point */
+    cudaResViewFormatFloat1                    = 0x16, /**< 1 channel 32-bit floating point */
+    cudaResViewFormatFloat2                    = 0x17, /**< 2 channel 32-bit floating point */
+    cudaResViewFormatFloat4                    = 0x18, /**< 4 channel 32-bit floating point */
+    cudaResViewFormatUnsignedBlockCompressed1  = 0x19, /**< Block compressed 1 */
+    cudaResViewFormatUnsignedBlockCompressed2  = 0x1a, /**< Block compressed 2 */
+    cudaResViewFormatUnsignedBlockCompressed3  = 0x1b, /**< Block compressed 3 */
+    cudaResViewFormatUnsignedBlockCompressed4  = 0x1c, /**< Block compressed 4 unsigned */
+    cudaResViewFormatSignedBlockCompressed4    = 0x1d, /**< Block compressed 4 signed */
+    cudaResViewFormatUnsignedBlockCompressed5  = 0x1e, /**< Block compressed 5 unsigned */
+    cudaResViewFormatSignedBlockCompressed5    = 0x1f, /**< Block compressed 5 signed */
+    cudaResViewFormatUnsignedBlockCompressed6H = 0x20, /**< Block compressed 6 unsigned half-float */
+    cudaResViewFormatSignedBlockCompressed6H   = 0x21, /**< Block compressed 6 signed half-float */
+    cudaResViewFormatUnsignedBlockCompressed7  = 0x22  /**< Block compressed 7 */
+};
+
+/**
+ * CUDA resource descriptor: a resource type tag plus a union holding one of four per-type descriptions
+ */
+struct __device_builtin__ cudaResourceDesc {
+    enum cudaResourceType resType;             /**< Resource type */
+    
+    union {
+        struct {
+            cudaArray_t array;                 /**< CUDA array */
+        } array;                               /**< Array resource description */
+        struct {
+            cudaMipmappedArray_t mipmap;       /**< CUDA mipmapped array */
+        } mipmap;                              /**< Mipmapped array resource description */
+        struct {
+            void *devPtr;                      /**< Device pointer */
+            struct cudaChannelFormatDesc desc; /**< Channel descriptor */
+            size_t sizeInBytes;                /**< Size in bytes */
+        } linear;                              /**< Linear memory resource description */
+        struct {
+            void *devPtr;                      /**< Device pointer */
+            struct cudaChannelFormatDesc desc; /**< Channel descriptor */
+            size_t width;                      /**< Width of the array in elements */
+            size_t height;                     /**< Height of the array in elements */
+            size_t pitchInBytes;               /**< Pitch between two rows in bytes */
+        } pitch2D;                             /**< Pitched 2D memory resource description */
+    } res;                                     /**< Resource payload; presumably the member matching resType is the valid one - confirm */
+};
+
+/**
+ * CUDA resource view descriptor: the view's format, its width/height/depth, and its mipmap-level and layer ranges
+ */
+struct __device_builtin__ cudaResourceViewDesc
+{
+    enum cudaResourceViewFormat format;           /**< Resource view format */
+    size_t                      width;            /**< Width of the resource view */
+    size_t                      height;           /**< Height of the resource view */
+    size_t                      depth;            /**< Depth of the resource view */
+    unsigned int                firstMipmapLevel; /**< First defined mipmap level */
+    unsigned int                lastMipmapLevel;  /**< Last defined mipmap level */
+    unsigned int                firstLayer;       /**< First layer index */
+    unsigned int                lastLayer;        /**< Last layer index */
+};
+
+/**
+ * CUDA pointer attributes: memory type, associated device, and the device-side and host-side addresses of an allocation
+ */
+struct __device_builtin__ cudaPointerAttributes
+{
+    /**
+     * The type of memory - ::cudaMemoryTypeUnregistered, ::cudaMemoryTypeHost,
+     * ::cudaMemoryTypeDevice or ::cudaMemoryTypeManaged.
+     */
+    enum cudaMemoryType type;
+
+    /** 
+     * The device against which the memory was allocated or registered.
+     * If the memory type is ::cudaMemoryTypeDevice then this identifies 
+     * the device on which the memory referred physically resides.  If
+     * the memory type is ::cudaMemoryTypeHost or ::cudaMemoryTypeManaged then
+     * this identifies the device which was current when the memory was allocated
+     * or registered (and if that device is deinitialized then this allocation
+     * will vanish with that device's state).
+     */
+    int device;
+
+    /**
+     * The address which may be dereferenced on the current device to access 
+     * the memory or NULL if no such address exists.
+     */
+    void *devicePointer;
+
+    /**
+     * The address which may be dereferenced on the host to access the
+     * memory or NULL if no such address exists.
+     *
+     * \note CUDA doesn't check if unregistered memory is allocated so this field
+     * may contain an invalid pointer if an invalid pointer has been passed to CUDA.
+     */
+    void *hostPointer;
+};
+
+/**
+ * CUDA function attributes
+ */
+struct __device_builtin__ cudaFuncAttributes
+{
+   /**
+    * The size in bytes of statically-allocated shared memory per block
+    * required by this function. This does not include dynamically-allocated
+    * shared memory requested by the user at runtime.
+    */
+   size_t sharedSizeBytes;
+
+   /**
+    * The size in bytes of user-allocated constant memory required by this
+    * function.
+    */
+   size_t constSizeBytes;
+
+   /**
+    * The size in bytes of local memory used by each thread of this function.
+    */
+   size_t localSizeBytes;
+
+   /**
+    * The maximum number of threads per block, beyond which a launch of the
+    * function would fail. This number depends on both the function and the
+    * device on which the function is currently loaded.
+    */
+   int maxThreadsPerBlock;
+
+   /**
+    * The number of registers used by each thread of this function.
+    */
+   int numRegs;
+
+   /**
+    * The PTX virtual architecture version for which the function was
+    * compiled. This value is the major PTX version * 10 + the minor PTX
+    * version, so a PTX version 1.3 function would return the value 13.
+    */
+   int ptxVersion;
+
+   /**
+    * The binary architecture version for which the function was compiled.
+    * This value is the major binary version * 10 + the minor binary version,
+    * so a binary version 1.3 function would return the value 13.
+    */
+   int binaryVersion;
+
+   /**
+    * The attribute to indicate whether the function has been compiled with
+    * the user-specified option "-Xptxas --dlcm=ca" set.
+    */
+   int cacheModeCA;
+
+   /**
+    * The maximum size in bytes of dynamic shared memory per block for
+    * this function. Any launch must have a dynamic shared memory size
+    * smaller than this value.
+    */
+   int maxDynamicSharedSizeBytes;
+
+   /**
+    * On devices where the L1 cache and shared memory use the same hardware resources,
+    * this sets the shared memory carveout preference, in percent of the maximum shared memory.
+    * Refer to ::cudaDevAttrMaxSharedMemoryPerMultiprocessor.
+    * This is only a hint, and the driver can choose a different ratio if required to execute the function.
+    * See ::cudaFuncSetAttribute
+    */
+   int preferredShmemCarveout;
+
+   /**
+    * If this attribute is set, the kernel must launch with a valid cluster dimension
+    * specified.
+    */
+   int clusterDimMustBeSet;
+
+   /**
+    * The required cluster width/height/depth in blocks. The values must either
+    * all be 0 or all be positive. The validity of the cluster dimensions is
+    * otherwise checked at launch time.
+    *
+    * If the value is set during compile time, it cannot be set at runtime.
+    * Setting it at runtime should return cudaErrorNotPermitted.
+    * See ::cudaFuncSetAttribute
+    */
+   int requiredClusterWidth;
+   int requiredClusterHeight;
+   int requiredClusterDepth;
+
+   /**
+    * The block scheduling policy of a function.
+    * See ::cudaFuncSetAttribute
+    */
+   int clusterSchedulingPolicyPreference;
+
+   /**
+    * Whether the function can be launched with non-portable cluster size. 1 is
+    * allowed, 0 is disallowed. A non-portable cluster size may only function
+    * on the specific SKUs the program is tested on. The launch might fail if
+    * the program is run on a different hardware platform.
+    *
+    * CUDA API provides ::cudaOccupancyMaxActiveClusters to assist with checking
+    * whether the desired size can be launched on the current device.
+    *
+    * Portable Cluster Size
+    *
+    * A portable cluster size is guaranteed to be functional on all compute
+    * capabilities higher than the target compute capability. The portable
+    * cluster size for sm_90 is 8 blocks per cluster. This value may increase
+    * for future compute capabilities.
+    *
+    * The specific hardware unit may support higher cluster sizes that are not
+    * guaranteed to be portable.
+    * See ::cudaFuncSetAttribute
+    */
+   int nonPortableClusterSizeAllowed;
+
+   /**
+    * Reserved for future use.
+    */
+   int reserved[16];
+};
+
+/**
+ * CUDA function attributes that can be set using ::cudaFuncSetAttribute
+ */
+enum __device_builtin__ cudaFuncAttribute
+{
+    cudaFuncAttributeMaxDynamicSharedMemorySize = 8, /**< Maximum dynamic shared memory size */
+    cudaFuncAttributePreferredSharedMemoryCarveout = 9, /**< Preferred shared memory-L1 cache split */
+    cudaFuncAttributeClusterDimMustBeSet = 10, /**< Indicator to enforce valid cluster dimension specification on kernel launch */
+    cudaFuncAttributeRequiredClusterWidth = 11, /**< Required cluster width */
+    cudaFuncAttributeRequiredClusterHeight = 12, /**< Required cluster height */
+    cudaFuncAttributeRequiredClusterDepth = 13, /**< Required cluster depth */
+    cudaFuncAttributeNonPortableClusterSizeAllowed = 14, /**< Whether the function can be launched with a non-portable cluster size */
+    cudaFuncAttributeClusterSchedulingPolicyPreference = 15, /**< Cluster (block) scheduling policy preference of the function */
+    cudaFuncAttributeMax /**< Implicitly 16, i.e. one past the last attribute listed above */
+};
+
+/**
+ * CUDA function cache configurations
+ */
+enum __device_builtin__ cudaFuncCache
+{
+    cudaFuncCachePreferNone   = 0,    /**< Default function cache configuration, no preference */
+    cudaFuncCachePreferShared = 1,    /**< Prefer larger shared memory and smaller L1 cache */
+    cudaFuncCachePreferL1     = 2,    /**< Prefer larger L1 cache and smaller shared memory */
+    cudaFuncCachePreferEqual  = 3     /**< Prefer equal size L1 cache and shared memory */
+};
+
+/**
+ * CUDA shared memory configuration
+ * \deprecated
+ */
+enum __device_builtin__ cudaSharedMemConfig
+{
+    cudaSharedMemBankSizeDefault   = 0, /**< Default shared memory bank size */
+    cudaSharedMemBankSizeFourByte  = 1, /**< Four-byte shared memory bank size */
+    cudaSharedMemBankSizeEightByte = 2  /**< Eight-byte shared memory bank size */
+};
+
+/**
+ * Shared memory carveout configurations, in percent of the maximum shared memory. These may be passed to ::cudaFuncSetAttribute.
+ */
+enum __device_builtin__ cudaSharedCarveout {
+    cudaSharedmemCarveoutDefault      = -1,  /**< No preference for shared memory or L1 (default) */
+    cudaSharedmemCarveoutMaxShared    = 100, /**< Prefer maximum available shared memory, minimum L1 cache */
+    cudaSharedmemCarveoutMaxL1        = 0    /**< Prefer maximum available L1 cache, minimum shared memory */
+};
+
+/**
+ * CUDA device compute modes (the value reported for ::cudaDevAttrComputeMode)
+ */
+enum __device_builtin__ cudaComputeMode
+{
+    cudaComputeModeDefault          = 0,  /**< Default compute mode (Multiple threads can use ::cudaSetDevice() with this device) */
+    cudaComputeModeExclusive        = 1,  /**< Compute-exclusive-thread mode (Only one thread in one process will be able to use ::cudaSetDevice() with this device) */
+    cudaComputeModeProhibited       = 2,  /**< Compute-prohibited mode (No threads can use ::cudaSetDevice() with this device) */
+    cudaComputeModeExclusiveProcess = 3   /**< Compute-exclusive-process mode (Many threads in one process will be able to use ::cudaSetDevice() with this device) */
+};
+
+/**
+ * CUDA Limits
+ */
+enum __device_builtin__ cudaLimit
+{
+    cudaLimitStackSize                    = 0x00, /**< GPU thread stack size */
+    cudaLimitPrintfFifoSize               = 0x01, /**< GPU printf FIFO size */
+    cudaLimitMallocHeapSize               = 0x02, /**< GPU malloc heap size */
+    cudaLimitDevRuntimeSyncDepth          = 0x03, /**< GPU device runtime synchronize depth */
+    cudaLimitDevRuntimePendingLaunchCount = 0x04, /**< GPU device runtime pending launch count */
+    cudaLimitMaxL2FetchGranularity        = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in bytes). This is a hint */
+    cudaLimitPersistingL2CacheSize        = 0x06  /**< A size in bytes for L2 persisting lines cache size */
+};
+
+/**
+ * CUDA Memory Advise values
+ */
+enum __device_builtin__ cudaMemoryAdvise
+{
+    cudaMemAdviseSetReadMostly          = 1, /**< Data will mostly be read and only occasionally be written to */
+    cudaMemAdviseUnsetReadMostly        = 2, /**< Undo the effect of ::cudaMemAdviseSetReadMostly */
+    cudaMemAdviseSetPreferredLocation   = 3, /**< Set the preferred location for the data as the specified device */
+    cudaMemAdviseUnsetPreferredLocation = 4, /**< Clear the preferred location for the data */
+    cudaMemAdviseSetAccessedBy          = 5, /**< Data will be accessed by the specified device, so prevent page faults as much as possible */
+    cudaMemAdviseUnsetAccessedBy        = 6  /**< Let the Unified Memory subsystem decide on the page faulting policy for the specified device */
+};
+
+/**
+ * CUDA range attributes
+ */
+enum __device_builtin__ cudaMemRangeAttribute
+{
+    cudaMemRangeAttributeReadMostly                 = 1, /**< Whether the range will mostly be read and only occasionally be written to */
+    cudaMemRangeAttributePreferredLocation          = 2, /**< The preferred location of the range */
+    cudaMemRangeAttributeAccessedBy                 = 3, /**< Memory range has ::cudaMemAdviseSetAccessedBy set for specified device */
+    cudaMemRangeAttributeLastPrefetchLocation       = 4  /**< The last location to which the range was prefetched */
+    , cudaMemRangeAttributePreferredLocationType    = 5  /**< The preferred location type of the range */
+    , cudaMemRangeAttributePreferredLocationId      = 6  /**< The preferred location id of the range */
+    , cudaMemRangeAttributeLastPrefetchLocationType = 7  /**< The last location type to which the range was prefetched */
+    , cudaMemRangeAttributeLastPrefetchLocationId   = 8  /**< The last location id to which the range was prefetched */
+};
+
+/**
+ * CUDA GPUDirect RDMA flush writes APIs supported on the device (bit flags; see ::cudaDevAttrGPUDirectRDMAFlushWritesOptions)
+ */
+enum __device_builtin__ cudaFlushGPUDirectRDMAWritesOptions {
+    cudaFlushGPUDirectRDMAWritesOptionHost   = 1<<0, /**< ::cudaDeviceFlushGPUDirectRDMAWrites() and its CUDA Driver API counterpart are supported on the device. */
+    cudaFlushGPUDirectRDMAWritesOptionMemOps = 1<<1  /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the CUDA device. */
+};
+
+/**
+ * CUDA GPUDirect RDMA flush writes ordering features of the device
+ */
+enum __device_builtin__ cudaGPUDirectRDMAWritesOrdering {
+    cudaGPUDirectRDMAWritesOrderingNone       = 0,   /**< The device does not natively support ordering of GPUDirect RDMA writes. ::cudaDeviceFlushGPUDirectRDMAWrites() can be leveraged if supported. */
+    cudaGPUDirectRDMAWritesOrderingOwner      = 100, /**< Natively, the device can consistently consume GPUDirect RDMA writes, although other CUDA devices may not. */
+    cudaGPUDirectRDMAWritesOrderingAllDevices = 200  /**< Any CUDA device in the system can consistently consume GPUDirect RDMA writes to this device. */
+};
+
+/**
+ * CUDA GPUDirect RDMA flush writes scopes (numeric values match the Owner/AllDevices entries of ::cudaGPUDirectRDMAWritesOrdering)
+ */
+enum __device_builtin__ cudaFlushGPUDirectRDMAWritesScope {
+    cudaFlushGPUDirectRDMAWritesToOwner      = 100, /**< Blocks until remote writes are visible to the CUDA device context owning the data. */
+    cudaFlushGPUDirectRDMAWritesToAllDevices = 200  /**< Blocks until remote writes are visible to all CUDA device contexts. */
+};
+
+/**
+ * CUDA GPUDirect RDMA flush writes targets
+ */
+enum __device_builtin__ cudaFlushGPUDirectRDMAWritesTarget {
+    cudaFlushGPUDirectRDMAWritesTargetCurrentDevice /**< Sets the target for ::cudaDeviceFlushGPUDirectRDMAWrites() to the currently active CUDA device context. Implicitly 0 (no explicit initializer). */
+};
+
+
+/**
+ * CUDA device attributes
+ */
+enum __device_builtin__ cudaDeviceAttr
+{
+    cudaDevAttrMaxThreadsPerBlock             = 1,  /**< Maximum number of threads per block */
+    cudaDevAttrMaxBlockDimX                   = 2,  /**< Maximum block dimension X */
+    cudaDevAttrMaxBlockDimY                   = 3,  /**< Maximum block dimension Y */
+    cudaDevAttrMaxBlockDimZ                   = 4,  /**< Maximum block dimension Z */
+    cudaDevAttrMaxGridDimX                    = 5,  /**< Maximum grid dimension X */
+    cudaDevAttrMaxGridDimY                    = 6,  /**< Maximum grid dimension Y */
+    cudaDevAttrMaxGridDimZ                    = 7,  /**< Maximum grid dimension Z */
+    cudaDevAttrMaxSharedMemoryPerBlock        = 8,  /**< Maximum shared memory available per block in bytes */
+    cudaDevAttrTotalConstantMemory            = 9,  /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
+    cudaDevAttrWarpSize                       = 10, /**< Warp size in threads */
+    cudaDevAttrMaxPitch                       = 11, /**< Maximum pitch in bytes allowed by memory copies */
+    cudaDevAttrMaxRegistersPerBlock           = 12, /**< Maximum number of 32-bit registers available per block */
+    cudaDevAttrClockRate                      = 13, /**< Peak clock frequency in kilohertz */
+    cudaDevAttrTextureAlignment               = 14, /**< Alignment requirement for textures */
+    cudaDevAttrGpuOverlap                     = 15, /**< Device can possibly copy memory and execute a kernel concurrently */
+    cudaDevAttrMultiProcessorCount            = 16, /**< Number of multiprocessors on device */
+    cudaDevAttrKernelExecTimeout              = 17, /**< Specifies whether there is a run time limit on kernels */
+    cudaDevAttrIntegrated                     = 18, /**< Device is integrated with host memory */
+    cudaDevAttrCanMapHostMemory               = 19, /**< Device can map host memory into CUDA address space */
+    cudaDevAttrComputeMode                    = 20, /**< Compute mode (See ::cudaComputeMode for details) */
+    cudaDevAttrMaxTexture1DWidth              = 21, /**< Maximum 1D texture width */
+    cudaDevAttrMaxTexture2DWidth              = 22, /**< Maximum 2D texture width */
+    cudaDevAttrMaxTexture2DHeight             = 23, /**< Maximum 2D texture height */
+    cudaDevAttrMaxTexture3DWidth              = 24, /**< Maximum 3D texture width */
+    cudaDevAttrMaxTexture3DHeight             = 25, /**< Maximum 3D texture height */
+    cudaDevAttrMaxTexture3DDepth              = 26, /**< Maximum 3D texture depth */
+    cudaDevAttrMaxTexture2DLayeredWidth       = 27, /**< Maximum 2D layered texture width */
+    cudaDevAttrMaxTexture2DLayeredHeight      = 28, /**< Maximum 2D layered texture height */
+    cudaDevAttrMaxTexture2DLayeredLayers      = 29, /**< Maximum layers in a 2D layered texture */
+    cudaDevAttrSurfaceAlignment               = 30, /**< Alignment requirement for surfaces */
+    cudaDevAttrConcurrentKernels              = 31, /**< Device can possibly execute multiple kernels concurrently */
+    cudaDevAttrEccEnabled                     = 32, /**< Device has ECC support enabled */
+    cudaDevAttrPciBusId                       = 33, /**< PCI bus ID of the device */
+    cudaDevAttrPciDeviceId                    = 34, /**< PCI device ID of the device */
+    cudaDevAttrTccDriver                      = 35, /**< Device is using TCC driver model */
+    cudaDevAttrMemoryClockRate                = 36, /**< Peak memory clock frequency in kilohertz */
+    cudaDevAttrGlobalMemoryBusWidth           = 37, /**< Global memory bus width in bits */
+    cudaDevAttrL2CacheSize                    = 38, /**< Size of L2 cache in bytes */
+    cudaDevAttrMaxThreadsPerMultiProcessor    = 39, /**< Maximum resident threads per multiprocessor */
+    cudaDevAttrAsyncEngineCount               = 40, /**< Number of asynchronous engines */
+    cudaDevAttrUnifiedAddressing              = 41, /**< Device shares a unified address space with the host */    
+    cudaDevAttrMaxTexture1DLayeredWidth       = 42, /**< Maximum 1D layered texture width */
+    cudaDevAttrMaxTexture1DLayeredLayers      = 43, /**< Maximum layers in a 1D layered texture */
+    cudaDevAttrMaxTexture2DGatherWidth        = 45, /**< Maximum 2D texture width if cudaArrayTextureGather is set */
+    cudaDevAttrMaxTexture2DGatherHeight       = 46, /**< Maximum 2D texture height if cudaArrayTextureGather is set */
+    cudaDevAttrMaxTexture3DWidthAlt           = 47, /**< Alternate maximum 3D texture width */
+    cudaDevAttrMaxTexture3DHeightAlt          = 48, /**< Alternate maximum 3D texture height */
+    cudaDevAttrMaxTexture3DDepthAlt           = 49, /**< Alternate maximum 3D texture depth */
+    cudaDevAttrPciDomainId                    = 50, /**< PCI domain ID of the device */
+    cudaDevAttrTexturePitchAlignment          = 51, /**< Pitch alignment requirement for textures */
+    cudaDevAttrMaxTextureCubemapWidth         = 52, /**< Maximum cubemap texture width/height */
+    cudaDevAttrMaxTextureCubemapLayeredWidth  = 53, /**< Maximum cubemap layered texture width/height */
+    cudaDevAttrMaxTextureCubemapLayeredLayers = 54, /**< Maximum layers in a cubemap layered texture */
+    cudaDevAttrMaxSurface1DWidth              = 55, /**< Maximum 1D surface width */
+    cudaDevAttrMaxSurface2DWidth              = 56, /**< Maximum 2D surface width */
+    cudaDevAttrMaxSurface2DHeight             = 57, /**< Maximum 2D surface height */
+    cudaDevAttrMaxSurface3DWidth              = 58, /**< Maximum 3D surface width */
+    cudaDevAttrMaxSurface3DHeight             = 59, /**< Maximum 3D surface height */
+    cudaDevAttrMaxSurface3DDepth              = 60, /**< Maximum 3D surface depth */
+    cudaDevAttrMaxSurface1DLayeredWidth       = 61, /**< Maximum 1D layered surface width */
+    cudaDevAttrMaxSurface1DLayeredLayers      = 62, /**< Maximum layers in a 1D layered surface */
+    cudaDevAttrMaxSurface2DLayeredWidth       = 63, /**< Maximum 2D layered surface width */
+    cudaDevAttrMaxSurface2DLayeredHeight      = 64, /**< Maximum 2D layered surface height */
+    cudaDevAttrMaxSurface2DLayeredLayers      = 65, /**< Maximum layers in a 2D layered surface */
+    cudaDevAttrMaxSurfaceCubemapWidth         = 66, /**< Maximum cubemap surface width */
+    cudaDevAttrMaxSurfaceCubemapLayeredWidth  = 67, /**< Maximum cubemap layered surface width */
+    cudaDevAttrMaxSurfaceCubemapLayeredLayers = 68, /**< Maximum layers in a cubemap layered surface */
+    cudaDevAttrMaxTexture1DLinearWidth        = 69, /**< Maximum 1D linear texture width */
+    cudaDevAttrMaxTexture2DLinearWidth        = 70, /**< Maximum 2D linear texture width */
+    cudaDevAttrMaxTexture2DLinearHeight       = 71, /**< Maximum 2D linear texture height */
+    cudaDevAttrMaxTexture2DLinearPitch        = 72, /**< Maximum 2D linear texture pitch in bytes */
+    cudaDevAttrMaxTexture2DMipmappedWidth     = 73, /**< Maximum mipmapped 2D texture width */
+    cudaDevAttrMaxTexture2DMipmappedHeight    = 74, /**< Maximum mipmapped 2D texture height */
+    cudaDevAttrComputeCapabilityMajor         = 75, /**< Major compute capability version number */ 
+    cudaDevAttrComputeCapabilityMinor         = 76, /**< Minor compute capability version number */
+    cudaDevAttrMaxTexture1DMipmappedWidth     = 77, /**< Maximum mipmapped 1D texture width */
+    cudaDevAttrStreamPrioritiesSupported      = 78, /**< Device supports stream priorities */
+    cudaDevAttrGlobalL1CacheSupported         = 79, /**< Device supports caching globals in L1 */
+    cudaDevAttrLocalL1CacheSupported          = 80, /**< Device supports caching locals in L1 */
+    cudaDevAttrMaxSharedMemoryPerMultiprocessor = 81, /**< Maximum shared memory available per multiprocessor in bytes */
+    cudaDevAttrMaxRegistersPerMultiprocessor  = 82, /**< Maximum number of 32-bit registers available per multiprocessor */
+    cudaDevAttrManagedMemory                  = 83, /**< Device can allocate managed memory on this system */
+    cudaDevAttrIsMultiGpuBoard                = 84, /**< Device is on a multi-GPU board */
+    cudaDevAttrMultiGpuBoardGroupID           = 85, /**< Unique identifier for a group of devices on the same multi-GPU board */
+    cudaDevAttrHostNativeAtomicSupported      = 86, /**< Link between the device and the host supports native atomic operations */
+    cudaDevAttrSingleToDoublePrecisionPerfRatio = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
+    cudaDevAttrPageableMemoryAccess           = 88, /**< Device supports coherently accessing pageable memory without calling ::cudaHostRegister on it */
+    cudaDevAttrConcurrentManagedAccess        = 89, /**< Device can coherently access managed memory concurrently with the CPU */
+    cudaDevAttrComputePreemptionSupported     = 90, /**< Device supports Compute Preemption */
+    cudaDevAttrCanUseHostPointerForRegisteredMem = 91, /**< Device can access host registered memory at the same virtual address as the CPU */
+    cudaDevAttrReserved92                     = 92,
+    cudaDevAttrReserved93                     = 93,
+    cudaDevAttrReserved94                     = 94,
+    cudaDevAttrCooperativeLaunch              = 95, /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel */
+    cudaDevAttrCooperativeMultiDeviceLaunch   = 96, /**< Deprecated: ::cudaLaunchCooperativeKernelMultiDevice is deprecated. */
+    cudaDevAttrMaxSharedMemoryPerBlockOptin   = 97, /**< The maximum optin shared memory per block. This value may vary by chip. See ::cudaFuncSetAttribute */
+    cudaDevAttrCanFlushRemoteWrites           = 98, /**< Device supports flushing of outstanding remote writes. */
+    cudaDevAttrHostRegisterSupported          = 99, /**< Device supports host memory registration via ::cudaHostRegister. */
+    cudaDevAttrPageableMemoryAccessUsesHostPageTables = 100, /**< Device accesses pageable memory via the host's page tables. */
+    cudaDevAttrDirectManagedMemAccessFromHost = 101, /**< Host can directly access managed memory on the device without migration. */
+    cudaDevAttrMaxBlocksPerMultiprocessor     = 106, /**< Maximum number of blocks per multiprocessor */
+    cudaDevAttrMaxPersistingL2CacheSize       = 108, /**< Maximum L2 persisting lines capacity setting in bytes. */
+    cudaDevAttrMaxAccessPolicyWindowSize      = 109, /**< Maximum value of cudaAccessPolicyWindow::num_bytes. */
+    cudaDevAttrReservedSharedMemoryPerBlock   = 111, /**< Shared memory reserved by CUDA driver per block in bytes */
+    cudaDevAttrSparseCudaArraySupported       = 112, /**< Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */
+    cudaDevAttrHostRegisterReadOnlySupported  = 113,  /**< Device supports using the ::cudaHostRegister flag cudaHostRegisterReadOnly to register memory that must be mapped as read-only to the GPU */
+    cudaDevAttrTimelineSemaphoreInteropSupported = 114,  /**< External timeline semaphore interop is supported on the device */
+    cudaDevAttrMaxTimelineSemaphoreInteropSupported = 114,  /**< Deprecated alias of ::cudaDevAttrTimelineSemaphoreInteropSupported (same value, 114): external timeline semaphore interop is supported on the device */
+    cudaDevAttrMemoryPoolsSupported           = 115, /**< Device supports using the ::cudaMallocAsync and ::cudaMemPool family of APIs */
+    cudaDevAttrGPUDirectRDMASupported         = 116, /**< Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */
+    cudaDevAttrGPUDirectRDMAFlushWritesOptions = 117, /**< The returned attribute shall be interpreted as a bitmask, where the individual bits are listed in the ::cudaFlushGPUDirectRDMAWritesOptions enum */
+    cudaDevAttrGPUDirectRDMAWritesOrdering    = 118, /**< GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::cudaGPUDirectRDMAWritesOrdering for the numerical values returned here. */
+    cudaDevAttrMemoryPoolSupportedHandleTypes = 119, /**< Handle types supported with mempool based IPC */
+    cudaDevAttrClusterLaunch                  = 120, /**< Indicates device supports cluster launch */
+    cudaDevAttrDeferredMappingCudaArraySupported = 121, /**< Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays */
+    cudaDevAttrReserved122                    = 122,
+    cudaDevAttrReserved123                    = 123,
+    cudaDevAttrReserved124                    = 124,
+    cudaDevAttrIpcEventSupport                = 125, /**< Device supports IPC Events. */ 
+    cudaDevAttrMemSyncDomainCount             = 126, /**< Number of memory synchronization domains the device supports. */
+    cudaDevAttrReserved127                    = 127,
+    cudaDevAttrReserved128                    = 128,
+    cudaDevAttrReserved129                    = 129,
+    cudaDevAttrNumaConfig                     = 130, /**< NUMA configuration of a device: value is of type ::cudaDeviceNumaConfig enum */
+    cudaDevAttrNumaId                         = 131, /**< NUMA node ID of the GPU memory */
+    cudaDevAttrReserved132                    = 132,
+    cudaDevAttrMpsEnabled                     = 133, /**< Contexts created on this device will be shared via MPS */
+    cudaDevAttrHostNumaId                     = 134, /**< NUMA ID of the host node closest to the device or -1 when system does not support NUMA */
+    cudaDevAttrD3D12CigSupported              = 135, /**< Device supports CIG with D3D12. */
+    cudaDevAttrGpuPciDeviceId                 = 139, /**< The combined 16-bit PCI device ID and 16-bit PCI vendor ID. */
+    cudaDevAttrGpuPciSubsystemId              = 140, /**< The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem vendor ID. */
+    cudaDevAttrHostNumaMultinodeIpcSupported  = 143, /**< Device supports HostNuma location IPC between nodes in a multi-node system. */
+    cudaDevAttrMax /**< Implicitly 144, i.e. one past ::cudaDevAttrHostNumaMultinodeIpcSupported */
+};
+
+/**
+ * CUDA memory pool attributes
+ */
+enum __device_builtin__ cudaMemPoolAttr
+{
+    /**
+     * (value type = int)
+     * Allow cuMemAllocAsync to use memory asynchronously freed
+     * in another stream as long as a stream ordering dependency
+     * of the allocating stream on the free action exists.
+     * CUDA events and null stream interactions can create the required
+     * stream-ordered dependencies. (default enabled)
+     */
+    cudaMemPoolReuseFollowEventDependencies   = 0x1,
+
+    /**
+     * (value type = int)
+     * Allow reuse of already completed frees when there is no dependency
+     * between the free and allocation. (default enabled)
+     */
+    cudaMemPoolReuseAllowOpportunistic        = 0x2,
+
+    /**
+     * (value type = int)
+     * Allow cuMemAllocAsync to insert new stream dependencies
+     * in order to establish the stream ordering required to reuse
+     * a piece of memory released by cuMemFreeAsync (default enabled).
+     */
+    cudaMemPoolReuseAllowInternalDependencies = 0x3,
+
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of reserved memory in bytes to hold onto before trying
+     * to release memory back to the OS. When more than the release
+     * threshold bytes of memory are held by the memory pool, the
+     * allocator will try to release memory back to the OS on the
+     * next call to stream, event or context synchronize. (default 0)
+     */
+    cudaMemPoolAttrReleaseThreshold           = 0x4,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of backing memory currently allocated for the mempool.
+     */
+    cudaMemPoolAttrReservedMemCurrent         = 0x5,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of backing memory allocated for the mempool since the
+     * last time it was reset. High watermark can only be reset to zero.
+     */
+    cudaMemPoolAttrReservedMemHigh            = 0x6,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of memory from the pool that is currently in use by the application.
+     */
+    cudaMemPoolAttrUsedMemCurrent             = 0x7,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of the amount of memory from the pool that was in use by the application since
+     * the last time it was reset. High watermark can only be reset to zero.
+     */
+    cudaMemPoolAttrUsedMemHigh                = 0x8
+};
+
+/**
+ * Specifies the type of location
+ */
+enum __device_builtin__ cudaMemLocationType {
+    cudaMemLocationTypeInvalid = 0, /**< Invalid (zero-valued) location type */
+    cudaMemLocationTypeDevice = 1  /**< Location is a device location, thus id is a device ordinal */
+    , cudaMemLocationTypeHost = 2 /**< Location is host, id is ignored */
+    , cudaMemLocationTypeHostNuma = 3 /**< Location is a host NUMA node, thus id is a host NUMA node id */
+    , cudaMemLocationTypeHostNumaCurrent = 4 /**< Location is the host NUMA node closest to the current thread's CPU, id is ignored */
+};
+
+/**
+ * Specifies a memory location.
+ *
+ * To specify a GPU, set type = ::cudaMemLocationTypeDevice and set id = the GPU's device ordinal.
+ * To specify a CPU NUMA node, set type = ::cudaMemLocationTypeHostNuma and set id = host NUMA node id.
+ */
+struct __device_builtin__ cudaMemLocation {
+    enum cudaMemLocationType type;  /**< Specifies the location type, which modifies the meaning of id. */
+    int id;                         /**< Identifier whose meaning depends on this location's ::cudaMemLocationType (see type). */
+};
+
+/**
+ * Specifies the memory protection flags for mapping.
+ */
+enum __device_builtin__ cudaMemAccessFlags {
+    cudaMemAccessFlagsProtNone      = 0,  /**< Default, make the address range not accessible */
+    cudaMemAccessFlagsProtRead      = 1,  /**< Make the address range read accessible */
+    cudaMemAccessFlagsProtReadWrite = 3   /**< Make the address range read-write accessible (value 3 includes the ProtRead bit) */
+};
+
+/**
+ * Memory access descriptor
+ */
+struct __device_builtin__ cudaMemAccessDesc {
+    struct cudaMemLocation  location; /**< Location on which the request is to change its accessibility */
+    enum cudaMemAccessFlags flags;    /**< ::cudaMemAccessFlags accessibility flags to set on the request */
+};
+
+/**
+ * Defines the allocation types available
+ */
+enum __device_builtin__ cudaMemAllocationType {
+    cudaMemAllocationTypeInvalid = 0x0, /**< Invalid (zero-valued) allocation type */
+    /** This allocation type is 'pinned', i.e. cannot migrate from its current
+      * location while the application is actively using it
+      */
+    cudaMemAllocationTypePinned  = 0x1,
+    cudaMemAllocationTypeMax     = 0x7FFFFFFF /**< Equals the largest positive 32-bit signed integer value */
+};
+
+/**
+ * Flags for specifying particular handle types
+ */
+enum __device_builtin__ cudaMemAllocationHandleType {
+    cudaMemHandleTypeNone                    = 0x0,  /**< Does not allow any export mechanism. */
+    cudaMemHandleTypePosixFileDescriptor     = 0x1,  /**< Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) */
+    cudaMemHandleTypeWin32                   = 0x2,  /**< Allows a Win32 NT handle to be used for exporting. (HANDLE) */
+    cudaMemHandleTypeWin32Kmt                = 0x4,   /**< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) */
+    cudaMemHandleTypeFabric                  = 0x8  /**< Allows a fabric handle to be used for exporting. (cudaMemFabricHandle_t) */
+};
+
+/**
+ * This flag, if set, indicates that the memory will be used as a buffer for
+ * hardware accelerated decompression.
+ */
+#define cudaMemPoolCreateUsageHwDecompress 0x2
+
+/**
+ * Specifies the properties of allocations made from the pool.
+ */
+struct __device_builtin__ cudaMemPoolProps {
+    enum cudaMemAllocationType         allocType;   /**< Allocation type. Currently must be specified as cudaMemAllocationTypePinned */
+    enum cudaMemAllocationHandleType   handleTypes; /**< Handle types that will be supported by allocations from the pool. */
+    struct cudaMemLocation             location;    /**< Location allocations should reside. */
+    /**
+     * Windows-specific LPSECURITYATTRIBUTES required when
+     * ::cudaMemHandleTypeWin32 is specified.  This security attribute defines
+     * the scope of which exported allocations may be transferred to other
+     * processes.  In all other cases, this field is required to be zero.
+     */
+    void                              *win32SecurityAttributes;
+    size_t                             maxSize;     /**< Maximum pool size. When set to 0, defaults to a system dependent value.*/
+    unsigned short                     usage;        /**< Bitmask indicating intended usage for the pool. */
+    unsigned char                      reserved[54]; /**< reserved for future use, must be 0 */
+};
+
+/**
+ * Opaque data for exporting a pool allocation
+ */
+struct __device_builtin__ cudaMemPoolPtrExportData {
+    unsigned char reserved[64]; /**< Opaque 64-byte payload */
+};
+
+/**
+ * Memory allocation node parameters
+ */
+struct __device_builtin__ cudaMemAllocNodeParams {
+    /**
+    * in: location where the allocation should reside (specified in ::location).
+    * ::handleTypes must be ::cudaMemHandleTypeNone. IPC is not supported.
+    */
+    struct cudaMemPoolProps         poolProps;       /**< in: pool properties; see the constraints on ::location and ::handleTypes above */
+    const struct cudaMemAccessDesc *accessDescs;     /**< in: array of memory access descriptors. Used to describe peer GPU access */
+    size_t                          accessDescCount; /**< in: number of memory access descriptors in `accessDescs`. Must not exceed the number of GPUs. */
+    size_t                          bytesize;        /**< in: size in bytes of the requested allocation */
+    void                           *dptr;            /**< out: address of the allocation returned by CUDA */
+};
+
+/**
+ * Memory allocation node parameters
+ */
+struct __device_builtin__ cudaMemAllocNodeParamsV2 {
+    /**
+    * in: location where the allocation should reside (specified in ::location).
+    * ::handleTypes must be ::cudaMemHandleTypeNone. IPC is not supported.
+    */
+    struct cudaMemPoolProps         poolProps;       /**< in: pool properties; see the constraints on ::location and ::handleTypes above */
+    const struct cudaMemAccessDesc *accessDescs;     /**< in: array of memory access descriptors. Used to describe peer GPU access */
+    size_t                          accessDescCount; /**< in: number of memory access descriptors in `accessDescs`. Must not exceed the number of GPUs. */
+    size_t                          bytesize;        /**< in: size in bytes of the requested allocation */
+    void                           *dptr;            /**< out: address of the allocation returned by CUDA */
+};
+
+/**
+ * Memory free node parameters.
+ * Carries the single pointer that the node is asked to free.
+ */
+struct __device_builtin__ cudaMemFreeNodeParams {
+    void *dptr; /**< in: the pointer to free */
+};
+
+/**
+ * Graph memory attributes.
+ * Every attribute listed here has value type cuuint64_t and is expressed
+ * in bytes.
+ */
+enum __device_builtin__ cudaGraphMemAttributeType {
+    /**
+     * (value type = cuuint64_t)
+     * Amount of memory, in bytes, currently associated with graphs.
+     */
+    cudaGraphMemAttrUsedMemCurrent      = 0x0,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of memory, in bytes, associated with graphs since the
+     * last time it was reset.  The high watermark can only be reset to zero.
+     */
+    cudaGraphMemAttrUsedMemHigh         = 0x1,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of memory, in bytes, currently allocated for use by
+     * the CUDA graphs asynchronous allocator.
+     */
+    cudaGraphMemAttrReservedMemCurrent  = 0x2,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of memory, in bytes, currently allocated for use by
+     * the CUDA graphs asynchronous allocator.
+     */
+    cudaGraphMemAttrReservedMemHigh     = 0x3
+};
+
+/**
+ * Flags to specify for copies within a batch. For more details see ::cudaMemcpyBatchAsync.
+ * ::cudaMemcpyFlagDefault (0x0) sets no hint bits.
+ */
+enum __device_builtin__ cudaMemcpyFlags {
+    cudaMemcpyFlagDefault                  = 0x0,
+
+    /**
+     * Hint to the driver to try to overlap the copy with compute work on the SMs.
+     */
+    cudaMemcpyFlagPreferOverlapWithCompute = 0x1
+};
+
+enum __device_builtin__ cudaMemcpySrcAccessOrder {
+    /**
+     * Default value (0x0). This is not a valid access order.
+     */
+    cudaMemcpySrcAccessOrderInvalid       = 0x0,
+
+    /**
+     * Indicates that access to the source pointer must be in stream order.
+     */
+    cudaMemcpySrcAccessOrderStream        = 0x1,
+
+    /**
+     * Indicates that access to the source pointer can be out of stream order and all
+     * accesses must be complete before the API call returns. This flag is suited for
+     * ephemeral sources (e.g., stack variables) when it's known that no prior operations
+     * in the stream can be accessing the memory and also that the lifetime of the memory
+     * is limited to the scope that the source variable was declared in. Specifying
+     * this flag allows the driver to optimize the copy and removes the need for the user
+     * to synchronize the stream after the API call.
+     */
+    cudaMemcpySrcAccessOrderDuringApiCall = 0x2,
+
+    /**
+     * Indicates that access to the source pointer can be out of stream order and the accesses
+     * can happen even after the API call returns. This flag is suited for host pointers
+     * allocated outside CUDA (e.g., via malloc) when it's known that no prior operations
+     * in the stream can be accessing the memory. Specifying this flag allows the driver
+     * to optimize the copy on certain platforms.
+     */
+    cudaMemcpySrcAccessOrderAny           = 0x3,
+
+    cudaMemcpySrcAccessOrderMax           = 0x7FFFFFFF
+};
+
+/**
+ * Attributes specific to copies within a batch: the source access ordering,
+ * location hints for the source and destination operands, and additional
+ * ::cudaMemcpyFlags bits. For more details on usage see ::cudaMemcpyBatchAsync.
+ */
+struct __device_builtin__ cudaMemcpyAttributes {
+    enum cudaMemcpySrcAccessOrder srcAccessOrder;  /**< Source access ordering to be observed for copies with this attribute. */
+    struct cudaMemLocation srcLocHint;             /**< Hint location for the source operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. */
+    struct cudaMemLocation dstLocHint;             /**< Hint location for the destination operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. */
+    unsigned int flags;                            /**< Additional flags for copies with this attribute. See ::cudaMemcpyFlags. */
+};
+
+/**
+ * These flags allow applications to convey the operand type for individual copies specified in ::cudaMemcpy3DBatchAsync.
+ * The value selects which member of ::cudaMemcpy3DOperand::op is used.
+ */
+enum __device_builtin__ cudaMemcpy3DOperandType {
+    cudaMemcpyOperandTypePointer = 0x1,            /**< Memcpy operand is a valid pointer. */
+    cudaMemcpyOperandTypeArray = 0x2,              /**< Memcpy operand is a ::cudaArray_t. */
+    cudaMemcpyOperandTypeMax = 0x7FFFFFFF
+};
+
+/**
+ * Struct representing an offset into a ::cudaArray_t, in elements,
+ * with one component per dimension (x, y, z).
+ */
+struct __device_builtin__ cudaOffset3D {
+    size_t x;
+    size_t y;
+    size_t z;
+};
+
+/**
+ * Struct representing an operand for copy with ::cudaMemcpy3DBatchAsync.
+ * The ::type member is the discriminator for the ::op union: it tells which
+ * of op.ptr or op.array describes the operand.
+ */
+struct __device_builtin__ cudaMemcpy3DOperand {
+    enum cudaMemcpy3DOperandType type;
+    union {
+        /**
+         * Struct representing an operand when ::cudaMemcpy3DOperand::type is ::cudaMemcpyOperandTypePointer
+         */
+        struct {
+            void *ptr;
+            size_t rowLength;                /**< Length of each row in elements. */ 
+            size_t layerHeight;              /**< Height of each layer in elements. */ 
+            struct cudaMemLocation locHint;  /**< Hint location for the operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. */
+        } ptr;
+
+        /**
+         * Struct representing an operand when ::cudaMemcpy3DOperand::type is ::cudaMemcpyOperandTypeArray.
+         * The offset into the array is expressed in elements (see ::cudaOffset3D).
+         */
+        struct {
+            cudaArray_t array;
+            struct cudaOffset3D offset;
+        } array;
+    } op;  
+};
+
+struct __device_builtin__ cudaMemcpy3DBatchOp {
+    struct cudaMemcpy3DOperand src;                /**< Source memcpy operand. */
+    struct cudaMemcpy3DOperand dst;                /**< Destination memcpy operand. */
+    struct cudaExtent extent;                      /**< Extents of the memcpy between src and dst. The width, height and depth components must not be 0. */
+    enum cudaMemcpySrcAccessOrder srcAccessOrder;  /**< Source access ordering to be observed for the copy from src to dst. */
+    unsigned int flags;                            /**< Additional flags for the copy from src to dst. See ::cudaMemcpyFlags. */
+};
+
+/**
+ * CUDA device P2P attributes.
+ * Each attribute describes the link between a pair of devices.
+ */
+
+enum __device_builtin__ cudaDeviceP2PAttr {
+    cudaDevP2PAttrPerformanceRank              = 1, /**< A relative value indicating the performance of the link between two devices */
+    cudaDevP2PAttrAccessSupported              = 2, /**< Peer access is enabled */
+    cudaDevP2PAttrNativeAtomicSupported        = 3, /**< Native atomic operations over the link are supported */
+    cudaDevP2PAttrCudaArrayAccessSupported     = 4  /**< Accessing CUDA arrays over the link is supported */
+};
+
+/**
+ * CUDA UUID types.
+ * The CUuuid_st struct and the CUuuid typedef are only emitted when
+ * CU_UUID_HAS_BEEN_DEFINED is not already defined, so that they are defined
+ * at most once; cudaUUID_t is declared unconditionally as an alias of
+ * struct CUuuid_st.
+ */
+#ifndef CU_UUID_HAS_BEEN_DEFINED
+#define CU_UUID_HAS_BEEN_DEFINED
+struct __device_builtin__ CUuuid_st {     /**< CUDA definition of UUID (16 raw bytes) */
+    char bytes[16];
+};
+typedef __device_builtin__ struct CUuuid_st CUuuid;
+#endif
+typedef __device_builtin__ struct CUuuid_st cudaUUID_t;
+
+/**
+ * CUDA device properties.
+ * Members marked "Deprecated" are kept in the struct layout but should not
+ * be relied upon; the trailing ::reserved array is set aside for future use.
+ */
+struct __device_builtin__ cudaDeviceProp
+{
+    char         name[256];                  /**< ASCII string identifying device */
+    cudaUUID_t   uuid;                       /**< 16-byte unique identifier */
+    char         luid[8];                    /**< 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms */
+    unsigned int luidDeviceNodeMask;         /**< LUID device node mask. Value is undefined on TCC and non-Windows platforms */
+    size_t       totalGlobalMem;             /**< Global memory available on device in bytes */
+    size_t       sharedMemPerBlock;          /**< Shared memory available per block in bytes */
+    int          regsPerBlock;               /**< 32-bit registers available per block */
+    int          warpSize;                   /**< Warp size in threads */
+    size_t       memPitch;                   /**< Maximum pitch in bytes allowed by memory copies */
+    int          maxThreadsPerBlock;         /**< Maximum number of threads per block */
+    int          maxThreadsDim[3];           /**< Maximum size of each dimension of a block */
+    int          maxGridSize[3];             /**< Maximum size of each dimension of a grid */
+    int          clockRate;                  /**< Deprecated, Clock frequency in kilohertz */
+    size_t       totalConstMem;              /**< Constant memory available on device in bytes */
+    int          major;                      /**< Major compute capability */
+    int          minor;                      /**< Minor compute capability */
+    size_t       textureAlignment;           /**< Alignment requirement for textures */
+    size_t       texturePitchAlignment;      /**< Pitch alignment requirement for texture references bound to pitched memory */
+    int          deviceOverlap;              /**< Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. */
+    int          multiProcessorCount;        /**< Number of multiprocessors on device */
+    int          kernelExecTimeoutEnabled;   /**< Deprecated, Specifies whether there is a run time limit on kernels */
+    int          integrated;                 /**< Device is integrated as opposed to discrete */
+    int          canMapHostMemory;           /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */
+    int          computeMode;                /**< Deprecated, Compute mode (See ::cudaComputeMode) */
+    int          maxTexture1D;               /**< Maximum 1D texture size */
+    int          maxTexture1DMipmap;         /**< Maximum 1D mipmapped texture size */
+    int          maxTexture1DLinear;         /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */
+    int          maxTexture2D[2];            /**< Maximum 2D texture dimensions */
+    int          maxTexture2DMipmap[2];      /**< Maximum 2D mipmapped texture dimensions */
+    int          maxTexture2DLinear[3];      /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */
+    int          maxTexture2DGather[2];      /**< Maximum 2D texture dimensions if texture gather operations have to be performed */
+    int          maxTexture3D[3];            /**< Maximum 3D texture dimensions */
+    int          maxTexture3DAlt[3];         /**< Maximum alternate 3D texture dimensions */
+    int          maxTextureCubemap;          /**< Maximum Cubemap texture dimensions */
+    int          maxTexture1DLayered[2];     /**< Maximum 1D layered texture dimensions */
+    int          maxTexture2DLayered[3];     /**< Maximum 2D layered texture dimensions */
+    int          maxTextureCubemapLayered[2];/**< Maximum Cubemap layered texture dimensions */
+    int          maxSurface1D;               /**< Maximum 1D surface size */
+    int          maxSurface2D[2];            /**< Maximum 2D surface dimensions */
+    int          maxSurface3D[3];            /**< Maximum 3D surface dimensions */
+    int          maxSurface1DLayered[2];     /**< Maximum 1D layered surface dimensions */
+    int          maxSurface2DLayered[3];     /**< Maximum 2D layered surface dimensions */
+    int          maxSurfaceCubemap;          /**< Maximum Cubemap surface dimensions */
+    int          maxSurfaceCubemapLayered[2];/**< Maximum Cubemap layered surface dimensions */
+    size_t       surfaceAlignment;           /**< Alignment requirements for surfaces */
+    int          concurrentKernels;          /**< Device can possibly execute multiple kernels concurrently */
+    int          ECCEnabled;                 /**< Device has ECC support enabled */
+    int          pciBusID;                   /**< PCI bus ID of the device */
+    int          pciDeviceID;                /**< PCI device ID of the device */
+    int          pciDomainID;                /**< PCI domain ID of the device */
+    int          tccDriver;                  /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */
+    int          asyncEngineCount;           /**< Number of asynchronous engines */
+    int          unifiedAddressing;          /**< Device shares a unified address space with the host */
+    int          memoryClockRate;            /**< Deprecated, Peak memory clock frequency in kilohertz */
+    int          memoryBusWidth;             /**< Global memory bus width in bits */
+    int          l2CacheSize;                /**< Size of L2 cache in bytes */
+    int          persistingL2CacheMaxSize;   /**< Device's maximum L2 persisting lines capacity setting in bytes */
+    int          maxThreadsPerMultiProcessor;/**< Maximum resident threads per multiprocessor */
+    int          streamPrioritiesSupported;  /**< Device supports stream priorities */
+    int          globalL1CacheSupported;     /**< Device supports caching globals in L1 */
+    int          localL1CacheSupported;      /**< Device supports caching locals in L1 */
+    size_t       sharedMemPerMultiprocessor; /**< Shared memory available per multiprocessor in bytes */
+    int          regsPerMultiprocessor;      /**< 32-bit registers available per multiprocessor */
+    int          managedMemory;              /**< Device supports allocating managed memory on this system */
+    int          isMultiGpuBoard;            /**< Device is on a multi-GPU board */
+    int          multiGpuBoardGroupID;       /**< Unique identifier for a group of devices on the same multi-GPU board */
+    int          hostNativeAtomicSupported;  /**< Link between the device and the host supports native atomic operations */
+    int          singleToDoublePrecisionPerfRatio; /**< Deprecated, Ratio of single precision performance (in floating-point operations per second) to double precision performance */
+    int          pageableMemoryAccess;       /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
+    int          concurrentManagedAccess;    /**< Device can coherently access managed memory concurrently with the CPU */
+    int          computePreemptionSupported; /**< Device supports Compute Preemption */
+    int          canUseHostPointerForRegisteredMem; /**< Device can access host registered memory at the same virtual address as the CPU */
+    int          cooperativeLaunch;          /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel */
+    int          cooperativeMultiDeviceLaunch; /**< Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. */
+    size_t       sharedMemPerBlockOptin;     /**< Per device maximum shared memory per block usable by special opt in */
+    int          pageableMemoryAccessUsesHostPageTables; /**< Device accesses pageable memory via the host's page tables */
+    int          directManagedMemAccessFromHost; /**< Host can directly access managed memory on the device without migration. */
+    int          maxBlocksPerMultiProcessor; /**< Maximum number of resident blocks per multiprocessor */
+    int          accessPolicyMaxWindowSize;  /**< The maximum value of ::cudaAccessPolicyWindow::num_bytes. */
+    size_t       reservedSharedMemPerBlock;  /**< Shared memory reserved by CUDA driver per block in bytes */
+    int          hostRegisterSupported;      /**< Device supports host memory registration via ::cudaHostRegister. */
+    int          sparseCudaArraySupported;   /**< 1 if the device supports sparse CUDA arrays and sparse CUDA mipmapped arrays, 0 otherwise */
+    int          hostRegisterReadOnlySupported; /**< Device supports using the ::cudaHostRegister flag cudaHostRegisterReadOnly to register memory that must be mapped as read-only to the GPU */
+    int          timelineSemaphoreInteropSupported; /**< External timeline semaphore interop is supported on the device */
+    int          memoryPoolsSupported;       /**< 1 if the device supports using the cudaMallocAsync and cudaMemPool family of APIs, 0 otherwise */
+    int          gpuDirectRDMASupported;     /**< 1 if the device supports GPUDirect RDMA APIs, 0 otherwise */
+    unsigned int gpuDirectRDMAFlushWritesOptions; /**< Bitmask to be interpreted according to the ::cudaFlushGPUDirectRDMAWritesOptions enum */
+    int          gpuDirectRDMAWritesOrdering;/**< See the ::cudaGPUDirectRDMAWritesOrdering enum for numerical values */
+    unsigned int memoryPoolSupportedHandleTypes; /**< Bitmask of handle types supported with mempool-based IPC */
+    int          deferredMappingCudaArraySupported; /**< 1 if the device supports deferred mapping CUDA arrays and CUDA mipmapped arrays */
+    int          ipcEventSupported;          /**< Device supports IPC Events. */
+    int          clusterLaunch;              /**< Indicates device supports cluster launch */
+    int          unifiedFunctionPointers;    /**< Indicates device supports unified pointers */
+    int          reserved[63];               /**< Reserved for future use */
+};
+
+/**
+ * CUDA IPC handle size.
+ * Used as the length of the opaque char arrays inside the IPC event,
+ * IPC memory and memory fabric handle structs declared after this macro.
+ */
+#define CUDA_IPC_HANDLE_SIZE 64
+
+/**
+ * CUDA IPC event handle.
+ * Opaque: the only member is a reserved array of CUDA_IPC_HANDLE_SIZE chars.
+ */
+typedef __device_builtin__ struct __device_builtin__ cudaIpcEventHandle_st
+{
+    char reserved[CUDA_IPC_HANDLE_SIZE];
+}cudaIpcEventHandle_t;
+
+/**
+ * CUDA IPC memory handle.
+ * Opaque: the only member is a reserved array of CUDA_IPC_HANDLE_SIZE chars.
+ */
+typedef __device_builtin__ struct __device_builtin__ cudaIpcMemHandle_st 
+{
+    char reserved[CUDA_IPC_HANDLE_SIZE];
+}cudaIpcMemHandle_t;
+
+/**
+ * CUDA memory fabric handle.
+ * Opaque: the only member is a reserved array of CUDA_IPC_HANDLE_SIZE chars.
+ */
+typedef __device_builtin__ struct __device_builtin__ cudaMemFabricHandle_st 
+{
+    char reserved[CUDA_IPC_HANDLE_SIZE];
+}cudaMemFabricHandle_t;
+
+/**
+ * External memory handle types.
+ * The value selects which member of ::cudaExternalMemoryHandleDesc::handle
+ * carries the handle.
+ */
+enum __device_builtin__ cudaExternalMemoryHandleType {
+    /**
+     * Handle is an opaque file descriptor
+     */
+    cudaExternalMemoryHandleTypeOpaqueFd         = 1,
+    /**
+     * Handle is an opaque shared NT handle
+     */
+    cudaExternalMemoryHandleTypeOpaqueWin32      = 2,
+    /**
+     * Handle is an opaque, globally shared handle
+     */
+    cudaExternalMemoryHandleTypeOpaqueWin32Kmt   = 3,
+    /**
+     * Handle is a D3D12 heap object
+     */
+    cudaExternalMemoryHandleTypeD3D12Heap        = 4,
+    /**
+     * Handle is a D3D12 committed resource
+     */
+    cudaExternalMemoryHandleTypeD3D12Resource    = 5,
+    /**
+     * Handle is a shared NT handle to a D3D11 resource
+     */
+    cudaExternalMemoryHandleTypeD3D11Resource    = 6,
+    /**
+     * Handle is a globally shared handle to a D3D11 resource
+     */
+    cudaExternalMemoryHandleTypeD3D11ResourceKmt = 7,
+    /**
+     * Handle is an NvSciBuf object
+     */
+    cudaExternalMemoryHandleTypeNvSciBuf         = 8
+};
+
+/**
+ * Indicates that the external memory object is a dedicated resource.
+ * Valid value for ::cudaExternalMemoryHandleDesc::flags.
+ */
+#define cudaExternalMemoryDedicated   0x1
+
+/** When the \p flags parameter of ::cudaExternalSemaphoreSignalParams
+ * contains this flag, it indicates that signaling an external semaphore object
+ * should skip performing appropriate memory synchronization operations over all
+ * the external memory objects that are imported as ::cudaExternalMemoryHandleTypeNvSciBuf,
+ * which otherwise are performed by default to ensure data coherency with other
+ * importers of the same NvSciBuf memory objects.
+ */
+#define cudaExternalSemaphoreSignalSkipNvSciBufMemSync     0x01
+
+/** When the \p flags parameter of ::cudaExternalSemaphoreWaitParams
+ * contains this flag, it indicates that waiting on an external semaphore object
+ * should skip performing appropriate memory synchronization operations over all
+ * the external memory objects that are imported as ::cudaExternalMemoryHandleTypeNvSciBuf,
+ * which otherwise are performed by default to ensure data coherency with other
+ * importers of the same NvSciBuf memory objects.
+ */
+#define cudaExternalSemaphoreWaitSkipNvSciBufMemSync       0x02
+
+/**
+ * When \p flags of ::cudaDeviceGetNvSciSyncAttributes is set to this,
+ * it indicates that the application needs signaler-specific NvSciSyncAttr
+ * to be filled by ::cudaDeviceGetNvSciSyncAttributes.
+ */
+#define cudaNvSciSyncAttrSignal       0x1
+
+/**
+ * When \p flags of ::cudaDeviceGetNvSciSyncAttributes is set to this,
+ * it indicates that the application needs waiter-specific NvSciSyncAttr
+ * to be filled by ::cudaDeviceGetNvSciSyncAttributes.
+ */
+#define cudaNvSciSyncAttrWait         0x2
+
+/**
+ * External memory handle descriptor.
+ * ::type selects which member of the ::handle union is valid.
+ */
+struct __device_builtin__ cudaExternalMemoryHandleDesc {
+    /**
+     * Type of the handle
+     */
+    enum  cudaExternalMemoryHandleType type;
+    union {
+        /**
+         * File descriptor referencing the memory object. Valid
+         * when type is
+         * ::cudaExternalMemoryHandleTypeOpaqueFd
+         */
+        int fd;
+        /**
+         * Win32 handle referencing the memory object. Valid when
+         * type is one of the following:
+         * - ::cudaExternalMemoryHandleTypeOpaqueWin32
+         * - ::cudaExternalMemoryHandleTypeOpaqueWin32Kmt
+         * - ::cudaExternalMemoryHandleTypeD3D12Heap
+         * - ::cudaExternalMemoryHandleTypeD3D12Resource
+         * - ::cudaExternalMemoryHandleTypeD3D11Resource
+         * - ::cudaExternalMemoryHandleTypeD3D11ResourceKmt
+         *
+         * Exactly one of 'handle' and 'name' must be non-NULL. If
+         * type is one of the following:
+         * - ::cudaExternalMemoryHandleTypeOpaqueWin32Kmt
+         * - ::cudaExternalMemoryHandleTypeD3D11ResourceKmt
+         *
+         * then 'name' must be NULL.
+         */
+        struct {
+            /**
+             * Valid NT handle. Must be NULL if 'name' is non-NULL
+             */
+            void *handle;
+            /**
+             * Name of a valid memory object.
+             * Must be NULL if 'handle' is non-NULL.
+             */
+            const void *name;
+        } win32;
+        /**
+         * A handle representing NvSciBuf Object. Valid when type
+         * is ::cudaExternalMemoryHandleTypeNvSciBuf
+         */
+        const void *nvSciBufObject;
+    } handle;
+    /**
+     * Size of the memory allocation
+     */
+    unsigned long long size;
+    /**
+     * Flags must either be zero or ::cudaExternalMemoryDedicated
+     */
+    unsigned int flags;
+};
+
+/**
+ * External memory buffer descriptor.
+ * Describes a buffer by its offset and size within an external memory object.
+ */
+struct __device_builtin__ cudaExternalMemoryBufferDesc {
+    /**
+     * Offset into the memory object where the buffer's base is
+     */
+    unsigned long long offset;
+    /**
+     * Size of the buffer
+     */
+    unsigned long long size;
+    /**
+     * Flags reserved for future use. Must be zero.
+     */
+    unsigned int flags;
+};
+ 
+/**
+ * External memory mipmap descriptor.
+ * Describes a mipmap chain located inside an external memory object: where
+ * its base level starts, the base level's format and dimensions, and how
+ * many levels the chain has.
+ */
+struct __device_builtin__ cudaExternalMemoryMipmappedArrayDesc {
+    /**
+     * Offset into the memory object where the base level of the
+     * mipmap chain is.
+     */
+    unsigned long long offset;
+    /**
+     * Format of base level of the mipmap chain
+     */
+    struct cudaChannelFormatDesc formatDesc;
+    /**
+     * Dimensions of base level of the mipmap chain
+     */
+    struct cudaExtent extent;
+    /**
+     * Flags associated with CUDA mipmapped arrays.
+     * See ::cudaMallocMipmappedArray
+     */
+    unsigned int flags;
+    /**
+     * Total number of levels in the mipmap chain
+     */
+    unsigned int numLevels;
+};
+ 
+/**
+ * External semaphore handle types.
+ * The value selects which member of ::cudaExternalSemaphoreHandleDesc::handle
+ * carries the handle.
+ */
+enum __device_builtin__ cudaExternalSemaphoreHandleType {
+    /**
+     * Handle is an opaque file descriptor
+     */
+    cudaExternalSemaphoreHandleTypeOpaqueFd       = 1,
+    /**
+     * Handle is an opaque shared NT handle
+     */
+    cudaExternalSemaphoreHandleTypeOpaqueWin32    = 2,
+    /**
+     * Handle is an opaque, globally shared handle
+     */
+    cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt = 3,
+    /**
+     * Handle is a shared NT handle referencing a D3D12 fence object
+     */
+    cudaExternalSemaphoreHandleTypeD3D12Fence     = 4,
+    /**
+     * Handle is a shared NT handle referencing a D3D11 fence object
+     */
+    cudaExternalSemaphoreHandleTypeD3D11Fence     = 5,
+    /**
+     * Opaque handle to NvSciSync Object
+     */
+     cudaExternalSemaphoreHandleTypeNvSciSync     = 6,
+    /**
+     * Handle is a shared NT handle referencing a D3D11 keyed mutex object
+     */
+    cudaExternalSemaphoreHandleTypeKeyedMutex     = 7,
+    /**
+     * Handle is a shared KMT handle referencing a D3D11 keyed mutex object
+     */
+    cudaExternalSemaphoreHandleTypeKeyedMutexKmt  = 8,
+    /**
+     * Handle is an opaque file descriptor referencing a timeline semaphore
+     */
+    cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd  = 9,
+    /**
+     * Handle is an opaque shared NT handle referencing a timeline semaphore
+     */
+    cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32  = 10
+};
+
+/**
+ * External semaphore handle descriptor.
+ * ::type selects which member of the ::handle union is valid.
+ */
+struct __device_builtin__ cudaExternalSemaphoreHandleDesc {
+    /**
+     * Type of the handle
+     */
+    enum cudaExternalSemaphoreHandleType type;
+    union {
+        /**
+         * File descriptor referencing the semaphore object. Valid when
+         * type is one of the following:
+         * - ::cudaExternalSemaphoreHandleTypeOpaqueFd
+         * - ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd
+         */
+        int fd;
+        /**
+         * Win32 handle referencing the semaphore object. Valid when
+         * type is one of the following:
+         * - ::cudaExternalSemaphoreHandleTypeOpaqueWin32
+         * - ::cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt
+         * - ::cudaExternalSemaphoreHandleTypeD3D12Fence
+         * - ::cudaExternalSemaphoreHandleTypeD3D11Fence
+         * - ::cudaExternalSemaphoreHandleTypeKeyedMutex
+         * - ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32
+         *
+         * Exactly one of 'handle' and 'name' must be non-NULL. If
+         * type is one of the following:
+         * - ::cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt
+         * - ::cudaExternalSemaphoreHandleTypeKeyedMutexKmt
+         *
+         * then 'name' must be NULL.
+         *
+         * NOTE(review): ::cudaExternalSemaphoreHandleTypeKeyedMutexKmt is
+         * named in the second list but not in the first; presumably it is
+         * also a valid type for this member - confirm against the CUDA
+         * documentation.
+         */
+        struct {
+            /**
+             * Valid NT handle. Must be NULL if 'name' is non-NULL
+             */
+            void *handle;
+            /**
+             * Name of a valid synchronization primitive.
+             * Must be NULL if 'handle' is non-NULL.
+             */
+            const void *name;
+        } win32;
+        /**
+         * Valid NvSciSyncObj. Must be non-NULL
+         */
+        const void* nvSciSyncObj;
+    } handle;
+    /**
+     * Flags reserved for the future. Must be zero.
+     */
+    unsigned int flags;
+};
+
+/**
+ * External semaphore signal parameters (deprecated)
+ */
+struct __device_builtin__ cudaExternalSemaphoreSignalParams_v1 {
+    struct {
+        /**
+         * Parameters for fence objects
+         */
+        struct {
+            /**
+             * Value of fence to be signaled
+             */
+            unsigned long long value;
+        } fence;
+        union {
+            /**
+             * Pointer to NvSciSyncFence. Valid if ::cudaExternalSemaphoreHandleType
+             * is of type ::cudaExternalSemaphoreHandleTypeNvSciSync.
+             */
+            void *fence;
+            unsigned long long reserved;
+        } nvSciSync;
+        /**
+         * Parameters for keyed mutex objects
+         */
+        struct {
+            /**
+             * Value of key to release the mutex with
+             */
+            unsigned long long key;
+        } keyedMutex;
+    } params;
+    /**
+     * Only when these parameters are used to
+     * signal a ::cudaExternalSemaphore_t of type
+     * ::cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is
+     * ::cudaExternalSemaphoreSignalSkipNvSciBufMemSync: which indicates
+     * that while signaling the ::cudaExternalSemaphore_t, no memory
+     * synchronization operations should be performed for any external memory
+     * object imported as ::cudaExternalMemoryHandleTypeNvSciBuf.
+     * For all other types of ::cudaExternalSemaphore_t, flags must be zero.
+     */
+    unsigned int flags;
+};
+
+/**
+* External semaphore wait parameters (deprecated)
+*/
+struct __device_builtin__ cudaExternalSemaphoreWaitParams_v1 {
+    struct {
+        /**
+        * Parameters for fence objects
+        */
+        struct {
+            /**
+            * Value of fence to be waited on
+            */
+            unsigned long long value;
+        } fence;
+        union {
+            /**
+             * Pointer to NvSciSyncFence. Valid if ::cudaExternalSemaphoreHandleType
+             * is of type ::cudaExternalSemaphoreHandleTypeNvSciSync.
+             */
+            void *fence;
+            unsigned long long reserved;
+        } nvSciSync;
+        /**
+         * Parameters for keyed mutex objects
+         */
+        struct {
+            /**
+             * Value of key to acquire the mutex with
+             */
+            unsigned long long key;
+            /**
+             * Timeout in milliseconds to wait to acquire the mutex
+             */
+            unsigned int timeoutMs;
+        } keyedMutex;
+    } params;
+    /**
+     * Only when these parameters are used to
+     * wait on a ::cudaExternalSemaphore_t of type
+     * ::cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is
+     * ::cudaExternalSemaphoreWaitSkipNvSciBufMemSync: which indicates
+     * that while waiting for the ::cudaExternalSemaphore_t, no memory
+     * synchronization operations should be performed for any external memory
+     * object imported as ::cudaExternalMemoryHandleTypeNvSciBuf.
+     * For all other types of ::cudaExternalSemaphore_t, flags must be zero.
+     */
+    unsigned int flags;
+};
+
+/**
+ * External semaphore signal parameters, compatible with driver type.
+ * Compared with ::cudaExternalSemaphoreSignalParams_v1 this layout adds
+ * reserved padding inside ::params and after ::flags.
+ */
+struct __device_builtin__ cudaExternalSemaphoreSignalParams{
+    struct {
+        /**
+         * Parameters for fence objects
+         */
+        struct {
+            /**
+             * Value of fence to be signaled
+             */
+            unsigned long long value;
+        } fence;
+        union {
+            /**
+             * Pointer to NvSciSyncFence. Valid if ::cudaExternalSemaphoreHandleType
+             * is of type ::cudaExternalSemaphoreHandleTypeNvSciSync.
+             */
+            void *fence;
+            unsigned long long reserved;
+        } nvSciSync;
+        /**
+         * Parameters for keyed mutex objects
+         */
+        struct {
+            /**
+             * Value of key to release the mutex with
+             */
+            unsigned long long key;
+        } keyedMutex;
+        unsigned int reserved[12];
+    } params;
+    /**
+     * Only when ::cudaExternalSemaphoreSignalParams is used to
+     * signal a ::cudaExternalSemaphore_t of type
+     * ::cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is
+     * ::cudaExternalSemaphoreSignalSkipNvSciBufMemSync: which indicates
+     * that while signaling the ::cudaExternalSemaphore_t, no memory
+     * synchronization operations should be performed for any external memory
+     * object imported as ::cudaExternalMemoryHandleTypeNvSciBuf.
+     * For all other types of ::cudaExternalSemaphore_t, flags must be zero.
+     */
+    unsigned int flags;
+    unsigned int reserved[16];
+};
+
+/**
+ * External semaphore wait parameters, compatible with driver type.
+ * Compared with ::cudaExternalSemaphoreWaitParams_v1 this layout adds
+ * reserved padding inside ::params and after ::flags.
+ */
+struct __device_builtin__ cudaExternalSemaphoreWaitParams {
+    struct {
+        /**
+        * Parameters for fence objects
+        */
+        struct {
+            /**
+            * Value of fence to be waited on
+            */
+            unsigned long long value;
+        } fence;
+        union {
+            /**
+             * Pointer to NvSciSyncFence. Valid if ::cudaExternalSemaphoreHandleType
+             * is of type ::cudaExternalSemaphoreHandleTypeNvSciSync.
+             */
+            void *fence;
+            unsigned long long reserved;
+        } nvSciSync;
+        /**
+         * Parameters for keyed mutex objects
+         */
+        struct {
+            /**
+             * Value of key to acquire the mutex with
+             */
+            unsigned long long key;
+            /**
+             * Timeout in milliseconds to wait to acquire the mutex
+             */
+            unsigned int timeoutMs;
+        } keyedMutex;
+        unsigned int reserved[10];
+    } params;
+    /**
+     * Only when ::cudaExternalSemaphoreWaitParams is used to
+     * wait on a ::cudaExternalSemaphore_t of type
+     * ::cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is
+     * ::cudaExternalSemaphoreWaitSkipNvSciBufMemSync: which indicates
+     * that while waiting for the ::cudaExternalSemaphore_t, no memory
+     * synchronization operations should be performed for any external memory
+     * object imported as ::cudaExternalMemoryHandleTypeNvSciBuf.
+     * For all other types of ::cudaExternalSemaphore_t, flags must be zero.
+     */
+    unsigned int flags;
+    unsigned int reserved[16];
+};
+
+/*******************************************************************************
+*                                                                              *
+*  SHORTHAND TYPE DEFINITION USED BY RUNTIME API                               *
+*                                                                              *
+*******************************************************************************/
+
+/**
+ * CUDA Error types
+ */
+typedef __device_builtin__ enum cudaError cudaError_t;
+
+/**
+ * CUDA stream
+ */
+typedef __device_builtin__ struct CUstream_st *cudaStream_t;
+
+/**
+ * CUDA event types
+ */
+typedef __device_builtin__ struct CUevent_st *cudaEvent_t;
+
+/**
+ * CUDA graphics resource types
+ */
+typedef __device_builtin__ struct cudaGraphicsResource *cudaGraphicsResource_t;
+
+/**
+ * CUDA external memory
+ */
+typedef __device_builtin__ struct CUexternalMemory_st *cudaExternalMemory_t;
+
+/**
+ * CUDA external semaphore
+ */
+typedef __device_builtin__ struct CUexternalSemaphore_st *cudaExternalSemaphore_t;
+
+/**
+ * CUDA graph
+ */
+typedef __device_builtin__ struct CUgraph_st *cudaGraph_t;
+
+/**
+ * CUDA graph node.
+ */
+typedef __device_builtin__ struct CUgraphNode_st *cudaGraphNode_t;
+
+/**
+ * CUDA user object for graphs
+ */
+typedef __device_builtin__ struct CUuserObject_st *cudaUserObject_t;
+
+/**
+ * CUDA handle for conditional graph nodes
+ */
+typedef __device_builtin__ unsigned long long cudaGraphConditionalHandle;
+
+/**
+ * CUDA function
+ */
+typedef __device_builtin__ struct CUfunc_st *cudaFunction_t;
+
+/**
+ * CUDA kernel
+ */
+typedef __device_builtin__ struct CUkern_st *cudaKernel_t;
+
+/**
+ * Online compiler and linker options
+ *
+ * Enumerator values are assigned explicitly and are not contiguous:
+ * 8-9 and 15-29 are not assigned by this enum.
+ */
+enum __device_builtin__ cudaJitOption
+{
+    /**
+     * Max number of registers that a thread may use.\n
+     * Option type: unsigned int\n
+     * Applies to: compiler only
+     */
+    cudaJitMaxRegisters = 0,
+
+    /**
+     * IN: Specifies minimum number of threads per block to target compilation
+     * for\n
+     * OUT: Returns the number of threads the compiler actually targeted.
+     * This restricts the resource utilization of the compiler (e.g. max
+     * registers) such that a block with the given number of threads should be
+     * able to launch based on register limitations. Note, this option does not
+     * currently take into account any other resource limitations, such as
+     * shared memory utilization.\n
+     * Option type: unsigned int\n
+     * Applies to: compiler only
+     */
+    cudaJitThreadsPerBlock = 1,
+
+    /**
+     * Overwrites the option value with the total wall clock time, in
+     * milliseconds, spent in the compiler and linker\n
+     * Option type: float\n
+     * Applies to: compiler and linker
+     */
+    cudaJitWallTime = 2,
+
+    /**
+     * Pointer to a buffer in which to print any log messages
+     * that are informational in nature (the buffer size is specified via
+     * option ::cudaJitInfoLogBufferSizeBytes)\n
+     * Option type: char *\n
+     * Applies to: compiler and linker
+     */
+    cudaJitInfoLogBuffer = 3,
+
+    /**
+     * IN: Log buffer size in bytes.  Log messages will be capped at this size
+     * (including null terminator)\n
+     * OUT: Amount of log buffer filled with messages\n
+     * Option type: unsigned int\n
+     * Applies to: compiler and linker
+     */
+    cudaJitInfoLogBufferSizeBytes = 4,
+
+    /**
+     * Pointer to a buffer in which to print any log messages that
+     * reflect errors (the buffer size is specified via option
+     * ::cudaJitErrorLogBufferSizeBytes)\n
+     * Option type: char *\n
+     * Applies to: compiler and linker
+     */
+    cudaJitErrorLogBuffer = 5,
+
+    /**
+     * IN: Log buffer size in bytes.  Log messages will be capped at this size
+     * (including null terminator)\n
+     * OUT: Amount of log buffer filled with messages\n
+     * Option type: unsigned int\n
+     * Applies to: compiler and linker
+     */
+    cudaJitErrorLogBufferSizeBytes = 6,
+
+    /**
+     * Level of optimizations to apply to generated code (0 - 4), with 4
+     * being the default and highest level of optimizations.\n
+     * Option type: unsigned int\n
+     * Applies to: compiler only
+     */
+    cudaJitOptimizationLevel = 7,
+
+    /**
+     * Specifies choice of fallback strategy if matching cubin is not found.
+     * Choice is based on supplied ::cudaJit_Fallback.\n
+     * Option type: unsigned int for enumerated type ::cudaJit_Fallback\n
+     * Applies to: compiler only
+     */
+    cudaJitFallbackStrategy = 10,
+
+    /**
+     * Specifies whether to create debug information in output (-g)
+     * (0: false, default)\n
+     * Option type: int\n
+     * Applies to: compiler and linker
+     */
+    cudaJitGenerateDebugInfo = 11,
+
+    /**
+     * Generate verbose log messages (0: false, default)\n
+     * Option type: int\n
+     * Applies to: compiler and linker
+     */
+    cudaJitLogVerbose = 12,
+
+    /**
+     * Generate line number information (-lineinfo) (0: false, default)\n
+     * Option type: int\n
+     * Applies to: compiler only
+     */
+    cudaJitGenerateLineInfo = 13,
+
+    /**
+     * Specifies whether to enable caching explicitly (-dlcm) \n
+     * Choice is based on supplied ::cudaJit_CacheMode.\n
+     * Option type: unsigned int for enumerated type ::cudaJit_CacheMode\n
+     * Applies to: compiler only
+     */
+    cudaJitCacheMode = 14,
+
+    /**
+     * Generate position independent code (0: false)\n
+     * Option type: int\n
+     * Applies to: compiler only
+     */
+    cudaJitPositionIndependentCode = 30,
+
+    /**
+     * This option hints to the JIT compiler the minimum number of CTAs from the
+     * kernel's grid to be mapped to a SM. This option is ignored when used together
+     * with ::cudaJitMaxRegisters or ::cudaJitThreadsPerBlock.
+     * Optimizations based on this option need ::cudaJitMaxThreadsPerBlock to
+     * be specified as well. For kernels already using PTX directive .minnctapersm,
+     * this option will be ignored by default. Use ::cudaJitOverrideDirectiveValues
+     * to let this option take precedence over the PTX directive.\n
+     * Option type: unsigned int\n
+     * Applies to: compiler only
+     */
+    cudaJitMinCtaPerSm = 31,
+
+     /**
+      * Maximum number of threads in a thread block, computed as the product of
+      * the maximum extent specified for each dimension of the block. This limit
+      * is guaranteed not to be exceeded in any invocation of the kernel. Exceeding
+      * the maximum number of threads results in runtime error or kernel launch
+      * failure. For kernels already using PTX directive .maxntid, this option will
+      * be ignored by default. Use ::cudaJitOverrideDirectiveValues to let this
+      * option take precedence over the PTX directive.\n
+      * Option type: int\n
+      * Applies to: compiler only
+      */
+    cudaJitMaxThreadsPerBlock = 32,
+
+    /**
+     * This option lets the values specified using ::cudaJitMaxRegisters,
+     * ::cudaJitThreadsPerBlock, ::cudaJitMaxThreadsPerBlock and
+     * ::cudaJitMinCtaPerSm take precedence over any PTX directives.
+     * (0: Disable, default; 1: Enable)\n
+     * Option type: int\n
+     * Applies to: compiler only
+     */
+    cudaJitOverrideDirectiveValues = 33,
+};
+
+
+/**
+ * Library options to be specified with ::cudaLibraryLoadData() or ::cudaLibraryLoadFromFile()
+ */
+enum __device_builtin__ cudaLibraryOption
+{
+    /**
+     * NOTE(review): this option carries no description in this header.
+     * Presumably its option value is a
+     * ::cudalibraryHostUniversalFunctionAndDataTable -- confirm against the
+     * CUDA documentation before relying on it.
+     */
+    cudaLibraryHostUniversalFunctionAndDataTable = 0,
+
+    /**
+     * Specifies that the argument \p code passed to ::cudaLibraryLoadData() will be preserved.
+     * Specifying this option will let the driver know that \p code can be accessed at any point
+     * until ::cudaLibraryUnload(). The default behavior is for the driver to allocate and
+     * maintain its own copy of \p code. Note that this is only a memory usage optimization
+     * hint and the driver can choose to ignore it if required.
+     * Specifying this option with ::cudaLibraryLoadFromFile() is invalid and
+     * will return ::cudaErrorInvalidValue.
+     */
+    cudaLibraryBinaryIsPreserved = 1,
+};
+
+/**
+ * Pair of (table pointer, window size) entries: one for functions and one
+ * for data.
+ *
+ * NOTE(review): this struct is undocumented in the header. Its name matches
+ * the ::cudaLibraryHostUniversalFunctionAndDataTable library option, so it is
+ * presumably the value supplied with that option -- confirm. The units of the
+ * window sizes are not stated here (presumably bytes -- confirm).
+ * The lowercase "cudalibrary" prefix is part of the declared type name.
+ */
+struct __device_builtin__ cudalibraryHostUniversalFunctionAndDataTable
+{
+    void *functionTable;        /**< Function table pointer */
+    size_t functionWindowSize;  /**< Size of the function window */
+    void *dataTable;            /**< Data table pointer */
+    size_t dataWindowSize;      /**< Size of the data window */
+};
+
+/**
+ * Caching modes for dlcm
+ */
+enum __device_builtin__ cudaJit_CacheMode
+{
+    cudaJitCacheOptionNone = 0,   /**< Compile with no -dlcm flag specified */
+    cudaJitCacheOptionCG,         /**< Compile with L1 cache disabled */
+    cudaJitCacheOptionCA          /**< Compile with L1 cache enabled */
+};
+
+/**
+ * Cubin matching fallback strategies
+ */
+enum __device_builtin__ cudaJit_Fallback
+{
+    cudaPreferPtx = 0,  /**< Prefer to compile ptx if exact binary match not found */
+
+    cudaPreferBinary    /**< Prefer to fall back to compatible binary code if exact match not found */
+};
+
+/**
+ * CUDA library
+ */
+typedef __device_builtin__ struct CUlib_st *cudaLibrary_t;
+
+/**
+ * CUDA memory pool
+ */
+typedef __device_builtin__ struct CUmemPoolHandle_st *cudaMemPool_t;
+
+/**
+ * CUDA cooperative group scope
+ */
+enum __device_builtin__ cudaCGScope {
+    cudaCGScopeInvalid   = 0, /**< Invalid cooperative group scope */
+    cudaCGScopeGrid      = 1, /**< Scope represented by a grid_group */
+    cudaCGScopeMultiGrid = 2  /**< Scope represented by a multi_grid_group */
+};
+
+/**
+ * CUDA launch parameters
+ */
+struct __device_builtin__ cudaLaunchParams
+{
+    void *func;          /**< Device function symbol */
+    dim3 gridDim;        /**< Grid dimensions */
+    dim3 blockDim;       /**< Block dimensions */
+    void **args;         /**< Arguments */
+    size_t sharedMem;    /**< Shared memory */
+    cudaStream_t stream; /**< Stream identifier */
+};
+
+/**
+ * CUDA GPU kernel node parameters
+ */
+struct __device_builtin__ cudaKernelNodeParams {
+    void* func;                     /**< Kernel to launch */
+    dim3 gridDim;                   /**< Grid dimensions */
+    dim3 blockDim;                  /**< Block dimensions */
+    unsigned int sharedMemBytes;    /**< Dynamic shared-memory size per thread block in bytes */
+    void **kernelParams;            /**< Array of pointers to individual kernel arguments*/
+    void **extra;                   /**< Pointer to kernel arguments in the "extra" format */
+};
+
+/**
+ * CUDA GPU kernel node parameters (V2)
+ *
+ * This is the kernel parameter type embedded as the \p kernel member of the
+ * union in ::cudaGraphNodeParams. Because it is a union member, \p gridDim
+ * and \p blockDim are declared as ::uint3 instead of ::dim3 when compiled as
+ * C++ older than C++11 (see the preprocessor guard below).
+ */
+struct __device_builtin__ cudaKernelNodeParamsV2 {
+    void* func;                     /**< Kernel to launch */
+    #if !defined(__cplusplus) || __cplusplus >= 201103L
+        dim3 gridDim;                   /**< Grid dimensions */
+        dim3 blockDim;                  /**< Block dimensions */
+    #else
+        /* Union members cannot have nontrivial constructors until C++11. */
+        uint3 gridDim;                  /**< Grid dimensions */
+        uint3 blockDim;                 /**< Block dimensions */
+    #endif
+    unsigned int sharedMemBytes;    /**< Dynamic shared-memory size per thread block in bytes */
+    void **kernelParams;            /**< Array of pointers to individual kernel arguments */
+    void **extra;                   /**< Pointer to kernel arguments in the "extra" format */
+};
+
+/**
+ * External semaphore signal node parameters
+ */
+struct __device_builtin__ cudaExternalSemaphoreSignalNodeParams {
+    cudaExternalSemaphore_t* extSemArray;                        /**< Array of external semaphore handles. */
+    const struct cudaExternalSemaphoreSignalParams* paramsArray; /**< Array of external semaphore signal parameters. */
+    unsigned int numExtSems;                                     /**< Number of handles and parameters supplied in extSemArray and paramsArray. */
+};
+
+/**
+ * External semaphore signal node parameters (V2)
+ *
+ * As declared here, the members are identical to those of
+ * ::cudaExternalSemaphoreSignalNodeParams. This is the type used for the
+ * \p extSemSignal member of the union in ::cudaGraphNodeParams.
+ */
+struct __device_builtin__ cudaExternalSemaphoreSignalNodeParamsV2 {
+    cudaExternalSemaphore_t* extSemArray;                        /**< Array of external semaphore handles. */
+    const struct cudaExternalSemaphoreSignalParams* paramsArray; /**< Array of external semaphore signal parameters. */
+    unsigned int numExtSems;                                     /**< Number of handles and parameters supplied in extSemArray and paramsArray. */
+};
+
+/**
+ * External semaphore wait node parameters
+ */
+struct __device_builtin__ cudaExternalSemaphoreWaitNodeParams {
+    cudaExternalSemaphore_t* extSemArray;                      /**< Array of external semaphore handles. */
+    const struct cudaExternalSemaphoreWaitParams* paramsArray; /**< Array of external semaphore wait parameters. */
+    unsigned int numExtSems;                                   /**< Number of handles and parameters supplied in extSemArray and paramsArray. */
+};
+
+/**
+ * External semaphore wait node parameters (V2)
+ *
+ * As declared here, the members are identical to those of
+ * ::cudaExternalSemaphoreWaitNodeParams. This is the type used for the
+ * \p extSemWait member of the union in ::cudaGraphNodeParams.
+ */
+struct __device_builtin__ cudaExternalSemaphoreWaitNodeParamsV2 {
+    cudaExternalSemaphore_t* extSemArray;                      /**< Array of external semaphore handles. */
+    const struct cudaExternalSemaphoreWaitParams* paramsArray; /**< Array of external semaphore wait parameters. */
+    unsigned int numExtSems;                                   /**< Number of handles and parameters supplied in extSemArray and paramsArray. */
+};
+
+/**
+ * Flags for ::cudaGraphConditionalHandle.
+ *
+ * NOTE(review): presumably passed when creating the handle with
+ * ::cudaGraphConditionalHandleCreate -- confirm against the CUDA Runtime API
+ * documentation.
+ */
+enum __device_builtin__ cudaGraphConditionalHandleFlags {
+    cudaGraphCondAssignDefault = 1 /**< Apply default handle value when graph is launched. */
+};
+
+/**
+ * CUDA conditional node types
+ */
+enum __device_builtin__ cudaGraphConditionalNodeType {
+    cudaGraphCondTypeIf  = 0,    /**< Conditional 'if/else' Node. Body[0] executed if condition is non-zero.  If \p size == 2, an optional ELSE graph is created and this is executed if the condition is zero. */
+    cudaGraphCondTypeWhile = 1,  /**< Conditional 'while' Node. Body executed repeatedly while condition value is non-zero. */
+    cudaGraphCondTypeSwitch = 2, /**< Conditional 'switch' Node. Body[n] is executed once, where 'n' is the value of the condition. If the condition does not match a body index, no body is launched. */
+};
+
+/**
+ * CUDA conditional node parameters
+ */
+struct __device_builtin__ cudaConditionalNodeParams {
+    cudaGraphConditionalHandle handle;       /**< Conditional node handle.
+                                                  Handles must be created in advance of creating the node
+                                                  using ::cudaGraphConditionalHandleCreate. */
+    enum cudaGraphConditionalNodeType type;  /**< Type of conditional node. */
+    unsigned int size;                       /**< Size of graph output array.  Allowed values are 1 for cudaGraphCondTypeWhile, 1 or 2
+                                                  for cudaGraphCondTypeIf, or any value greater than zero for cudaGraphCondTypeSwitch. */
+    cudaGraph_t *phGraph_out;                /**< CUDA-owned array populated with conditional node child graphs during creation of the node.
+                                                  Valid for the lifetime of the conditional node.
+                                                  The contents of the graph(s) are subject to the following constraints:
+
+                                                  - Allowed node types are kernel nodes, empty nodes, child graphs, memsets,
+                                                    memcopies, and conditionals. This applies recursively to child graphs and conditional bodies.
+                                                  - All kernels, including kernels in nested conditionals or child graphs at any level,
+                                                    must belong to the same CUDA context.
+
+                                                  These graphs may be populated using graph node creation APIs or ::cudaStreamBeginCaptureToGraph.
+                                                  cudaGraphCondTypeIf:
+                                                  phGraph_out[0] is executed when the condition is non-zero.  If \p size == 2, phGraph_out[1] will
+                                                  be executed when the condition is zero.
+                                                  cudaGraphCondTypeWhile:
+                                                  phGraph_out[0] is executed as long as the condition is non-zero.
+                                                  cudaGraphCondTypeSwitch:
+                                                  phGraph_out[n] is executed when the condition is equal to n.  If the condition >= \p size,
+                                                  no body graph is executed.
+                                         */
+};
+
+/**
+ * CUDA Graph node types
+ *
+ * Values are assigned explicitly; 0x0c is not assigned by this enum.
+ * ::cudaGraphNodeTypeCount has no explicit initializer, so it takes the value
+ * following ::cudaGraphNodeTypeConditional (0x0e).
+ */
+enum __device_builtin__ cudaGraphNodeType {
+    cudaGraphNodeTypeKernel      = 0x00, /**< GPU kernel node */
+    cudaGraphNodeTypeMemcpy      = 0x01, /**< Memcpy node */
+    cudaGraphNodeTypeMemset      = 0x02, /**< Memset node */
+    cudaGraphNodeTypeHost        = 0x03, /**< Host (executable) node */
+    cudaGraphNodeTypeGraph       = 0x04, /**< Node which executes an embedded graph */
+    cudaGraphNodeTypeEmpty       = 0x05, /**< Empty (no-op) node */
+    cudaGraphNodeTypeWaitEvent   = 0x06, /**< External event wait node */
+    cudaGraphNodeTypeEventRecord = 0x07, /**< External event record node */
+    cudaGraphNodeTypeExtSemaphoreSignal = 0x08, /**< External semaphore signal node */
+    cudaGraphNodeTypeExtSemaphoreWait = 0x09, /**< External semaphore wait node */
+    cudaGraphNodeTypeMemAlloc    = 0x0a, /**< Memory allocation node */
+    cudaGraphNodeTypeMemFree     = 0x0b, /**< Memory free node */
+    cudaGraphNodeTypeConditional = 0x0d, /**< Conditional node
+
+                                              May be used to implement a conditional execution path or loop
+                                              inside of a graph. The graph(s) contained within the body of the conditional node
+                                              can be selectively executed or iterated upon based on the value of a conditional
+                                              variable.
+
+                                              Handles must be created in advance of creating the node
+                                              using ::cudaGraphConditionalHandleCreate.
+
+                                              The following restrictions apply to graphs which contain conditional nodes:
+                                                The graph cannot be used in a child node.
+                                                Only one instantiation of the graph may exist at any point in time.
+                                                The graph cannot be cloned.
+
+                                              To set the control value, supply a default value when creating the handle and/or
+                                              call ::cudaGraphSetConditional from device code.*/
+    cudaGraphNodeTypeCount
+};
+
+/**
+ * Child graph node parameters
+ */
+struct __device_builtin__ cudaChildGraphNodeParams {
+    cudaGraph_t graph; /**< The child graph to clone into the node for node creation, or
+                            a handle to the graph owned by the node for node query */
+};
+
+/**
+ * Event record node parameters
+ */
+struct __device_builtin__ cudaEventRecordNodeParams {
+    cudaEvent_t event; /**< The event to record when the node executes */
+};
+
+/**
+ * Event wait node parameters
+ */
+struct __device_builtin__ cudaEventWaitNodeParams {
+    cudaEvent_t event; /**< The event to wait on from the node */
+};
+
+/**
+ * Graph node parameters.  See ::cudaGraphAddNode.
+ */
+struct __device_builtin__ cudaGraphNodeParams {
+    enum cudaGraphNodeType type; /**< Type of the node */
+    int reserved0[3];            /**< Reserved.  Must be zero. */
+
+    union {
+        long long                                      reserved1[29]; /**< Padding. Unused bytes must be zero. */
+        struct cudaKernelNodeParamsV2                  kernel;        /**< Kernel node parameters. */
+        struct cudaMemcpyNodeParams                    memcpy;        /**< Memcpy node parameters. */
+        struct cudaMemsetParamsV2                      memset;        /**< Memset node parameters. */
+        struct cudaHostNodeParamsV2                    host;          /**< Host node parameters. */
+        struct cudaChildGraphNodeParams                graph;         /**< Child graph node parameters. */
+        struct cudaEventWaitNodeParams                 eventWait;     /**< Event wait node parameters. */
+        struct cudaEventRecordNodeParams               eventRecord;   /**< Event record node parameters. */
+        struct cudaExternalSemaphoreSignalNodeParamsV2 extSemSignal;  /**< External semaphore signal node parameters. */
+        struct cudaExternalSemaphoreWaitNodeParamsV2   extSemWait;    /**< External semaphore wait node parameters. */
+        struct cudaMemAllocNodeParamsV2                alloc;         /**< Memory allocation node parameters. */
+        struct cudaMemFreeNodeParams                   free;          /**< Memory free node parameters. */
+        struct cudaConditionalNodeParams               conditional;   /**< Conditional node parameters. */
+    };
+
+    long long reserved2; /**< Reserved bytes. Must be zero. */
+};
+
+/**
+ * Type annotations that can be applied to graph edges as part of ::cudaGraphEdgeData.
+ */
+typedef __device_builtin__ enum cudaGraphDependencyType_enum {
+    cudaGraphDependencyTypeDefault = 0, /**< This is an ordinary dependency. */
+    cudaGraphDependencyTypeProgrammatic = 1  /**< This dependency type allows the downstream node to
+                                                  use \c cudaGridDependencySynchronize(). It may only be used
+                                                  between kernel nodes, and must be used with either the
+                                                  ::cudaGraphKernelNodePortProgrammatic or
+                                                  ::cudaGraphKernelNodePortLaunchCompletion outgoing port. */
+} cudaGraphDependencyType;
+
+/**
+ * Optional annotation for edges in a CUDA graph. Note, all edges implicitly have annotations and
+ * default to a zero-initialized value if not specified. A zero-initialized struct indicates a
+ * standard full serialization of two nodes with memory visibility.
+ */
+typedef __device_builtin__ struct cudaGraphEdgeData_st {
+    unsigned char from_port; /**< This indicates when the dependency is triggered from the upstream
+                                  node on the edge. The meaning is specific to the node type. A value
+                                  of 0 in all cases means full completion of the upstream node, with
+                                  memory visibility to the downstream node or portion thereof
+                                  (indicated by \c to_port).
+                                  <br>
+                                  Only kernel nodes define non-zero ports. A kernel node
+                                  can use the following output port types:
+                                  ::cudaGraphKernelNodePortDefault, ::cudaGraphKernelNodePortProgrammatic,
+                                  or ::cudaGraphKernelNodePortLaunchCompletion. */
+    unsigned char to_port; /**< This indicates what portion of the downstream node is dependent on
+                                the upstream node or portion thereof (indicated by \c from_port). The
+                                meaning is specific to the node type. A value of 0 in all cases means
+                                the entirety of the downstream node is dependent on the upstream work.
+                                <br>
+                                Currently no node types define non-zero ports. Accordingly, this field
+                                must be set to zero. */
+    unsigned char type; /**< This should be populated with a value from ::cudaGraphDependencyType. (It
+                             is typed as char due to compiler-specific layout of bitfields.) See
+                             ::cudaGraphDependencyType. */
+    unsigned char reserved[5]; /**< These bytes are unused and must be zeroed. This ensures
+                                    compatibility if additional fields are added in the future. */
+} cudaGraphEdgeData;
+
+/**
+ * This port activates when the kernel has finished executing.
+ */
+#define cudaGraphKernelNodePortDefault 0
+/**
+ * This port activates when all blocks of the kernel have performed cudaTriggerProgrammaticLaunchCompletion()
+ * or have terminated. It must be used with edge type ::cudaGraphDependencyTypeProgrammatic. See also
+ * ::cudaLaunchAttributeProgrammaticEvent.
+ */
+#define cudaGraphKernelNodePortProgrammatic 1
+/**
+ * This port activates when all blocks of the kernel have begun execution. See also
+ * ::cudaLaunchAttributeLaunchCompletionEvent.
+ */
+#define cudaGraphKernelNodePortLaunchCompletion 2
+
+/**
+ * CUDA executable (launchable) graph
+ */
+typedef struct CUgraphExec_st* cudaGraphExec_t;
+
+/**
+* CUDA Graph Update error types
+*/
+enum __device_builtin__ cudaGraphExecUpdateResult {
+    cudaGraphExecUpdateSuccess                = 0x0, /**< The update succeeded */
+    cudaGraphExecUpdateError                  = 0x1, /**< The update failed for an unexpected reason which is described in the return value of the function */
+    cudaGraphExecUpdateErrorTopologyChanged   = 0x2, /**< The update failed because the topology changed */
+    cudaGraphExecUpdateErrorNodeTypeChanged   = 0x3, /**< The update failed because a node type changed */
+    cudaGraphExecUpdateErrorFunctionChanged   = 0x4, /**< The update failed because the function of a kernel node changed (CUDA driver < 11.2) */
+    cudaGraphExecUpdateErrorParametersChanged = 0x5, /**< The update failed because the parameters changed in a way that is not supported */
+    cudaGraphExecUpdateErrorNotSupported      = 0x6, /**< The update failed because something about the node is not supported */
+    cudaGraphExecUpdateErrorUnsupportedFunctionChange = 0x7, /**< The update failed because the function of a kernel node changed in an unsupported way */
+    cudaGraphExecUpdateErrorAttributesChanged = 0x8 /**< The update failed because the node attributes changed in a way that is not supported */
+};
+
+/**
+ * Graph instantiation results
+*/
+typedef __device_builtin__ enum cudaGraphInstantiateResult {
+    cudaGraphInstantiateSuccess = 0,                       /**< Instantiation succeeded */
+    cudaGraphInstantiateError = 1,                         /**< Instantiation failed for an unexpected reason which is described in the return value of the function */
+    cudaGraphInstantiateInvalidStructure = 2,              /**< Instantiation failed due to invalid structure, such as cycles */
+    cudaGraphInstantiateNodeOperationNotSupported = 3,     /**< Instantiation for device launch failed because the graph contained an unsupported operation */
+    cudaGraphInstantiateMultipleDevicesNotSupported = 4,   /**< Instantiation for device launch failed due to the nodes belonging to different contexts */
+    cudaGraphInstantiateConditionalHandleUnused = 5        /**< One or more conditional handles are not associated with conditional nodes */
+} cudaGraphInstantiateResult;
+
+/**
+ * Graph instantiation parameters
+ */
+typedef __device_builtin__ struct cudaGraphInstantiateParams_st
+{
+    unsigned long long flags;              /**< Instantiation flags */
+    cudaStream_t uploadStream;             /**< Upload stream */
+    cudaGraphNode_t errNode_out;           /**< The node which caused instantiation to fail, if any */
+    cudaGraphInstantiateResult result_out; /**< Whether instantiation was successful.  If it failed, the reason why */
+} cudaGraphInstantiateParams;
+
+/**
+ * Result information returned by cudaGraphExecUpdate
+ */
+typedef __device_builtin__ struct cudaGraphExecUpdateResultInfo_st {
+    /**
+     * Gives more specific detail when a cuda graph update fails. 
+     */
+    enum cudaGraphExecUpdateResult result;
+
+    /**
+     * The "to node" of the error edge when the topologies do not match.
+     * The error node when the error is associated with a specific node.
+     * NULL when the error is generic.
+     */
+    cudaGraphNode_t errorNode;
+
+    /**
+     * The from node of error edge when the topologies do not match. Otherwise NULL.
+     */
+    cudaGraphNode_t errorFromNode;
+} cudaGraphExecUpdateResultInfo;
+
+/**
+ * CUDA device node handle for device-side node update
+ */
+typedef struct CUgraphDeviceUpdatableNode_st* cudaGraphDeviceNode_t;
+
+/**
+ * Specifies the field to update when performing multiple node updates from the device
+ */
+enum __device_builtin__ cudaGraphKernelNodeField
+{
+    cudaGraphKernelNodeFieldInvalid = 0, /**< Invalid field */
+    cudaGraphKernelNodeFieldGridDim,     /**< Grid dimension update */
+    cudaGraphKernelNodeFieldParam,       /**< Kernel parameter update */
+    cudaGraphKernelNodeFieldEnabled      /**< Node enable/disable */
+};
+
+/**
+ * Struct to specify a single node update to pass as part of a larger array to ::cudaGraphKernelNodeUpdatesApply
+ *
+ * \p field selects which member of \p updateData is read. Going by the
+ * per-member descriptions below: ::cudaGraphKernelNodeFieldGridDim pairs with
+ * \p gridDim, ::cudaGraphKernelNodeFieldParam with \p param, and
+ * ::cudaGraphKernelNodeFieldEnabled with \p isEnabled.
+ */
+struct __device_builtin__ cudaGraphKernelNodeUpdate {
+    cudaGraphDeviceNode_t node;     /**< Node to update */
+    enum cudaGraphKernelNodeField field; /**< Which type of update to apply. Determines how updateData is interpreted */
+    union {
+#if !defined(__cplusplus) || __cplusplus >= 201103L
+        dim3 gridDim;               /**< Grid dimensions */
+#else
+        /* Union members cannot have nontrivial constructors until C++11. */
+        uint3 gridDim;              /**< Grid dimensions */
+#endif
+        struct {
+            const void *pValue;     /**< Kernel parameter data to write in */
+            size_t offset;          /**< Offset into the parameter buffer at which to apply the update */
+            size_t size;            /**< Number of bytes to update */
+        } param;                    /**< Kernel parameter data */
+        unsigned int isEnabled;     /**< Node enable/disable data. Nonzero if the node should be enabled, 0 if it should be disabled */
+    } updateData;                   /**< Update data to apply. Which member is used depends on the value of \p field */
+};
+
+/**
+ * Flags to specify search options to be used with ::cudaGetDriverEntryPoint
+ * For more details see ::cuGetProcAddress
+ */ 
+enum __device_builtin__ cudaGetDriverEntryPointFlags {
+    cudaEnableDefault                = 0x0, /**< Default search mode for driver symbols. */
+    cudaEnableLegacyStream           = 0x1, /**< Search for legacy versions of driver symbols. */
+    cudaEnablePerThreadDefaultStream = 0x2  /**< Search for per-thread versions of driver symbols. */
+};
+
+/**
+ * Status codes reported when looking up a driver entry point, used with ::cudaGetDriverEntryPoint
+ */
+enum __device_builtin__ cudaDriverEntryPointQueryResult {
+    cudaDriverEntryPointSuccess             = 0,  /**< The symbol search found a match */
+    cudaDriverEntryPointSymbolNotFound      = 1,  /**< The symbol was not found */
+    cudaDriverEntryPointVersionNotSufficent = 2   /**< The symbol was found, but the version was not sufficient */
+};
+
+/**
+ * CUDA Graph debug write options
+ */
+enum __device_builtin__ cudaGraphDebugDotFlags {
+    cudaGraphDebugDotFlagsVerbose                  = 1<<0,  /**< Output all debug data as if every debug flag is enabled */
+    cudaGraphDebugDotFlagsKernelNodeParams         = 1<<2,  /**< Adds cudaKernelNodeParams to output */
+    cudaGraphDebugDotFlagsMemcpyNodeParams         = 1<<3,  /**< Adds cudaMemcpy3DParms to output */
+    cudaGraphDebugDotFlagsMemsetNodeParams         = 1<<4,  /**< Adds cudaMemsetParams to output */
+    cudaGraphDebugDotFlagsHostNodeParams           = 1<<5,  /**< Adds cudaHostNodeParams to output */
+    cudaGraphDebugDotFlagsEventNodeParams          = 1<<6,  /**< Adds cudaEvent_t handle from record and wait nodes to output */
+    cudaGraphDebugDotFlagsExtSemasSignalNodeParams = 1<<7,  /**< Adds cudaExternalSemaphoreSignalNodeParams values to output */
+    cudaGraphDebugDotFlagsExtSemasWaitNodeParams   = 1<<8,  /**< Adds cudaExternalSemaphoreWaitNodeParams to output */
+    cudaGraphDebugDotFlagsKernelNodeAttributes     = 1<<9,  /**< Adds cudaKernelNodeAttrID values to output */
+    cudaGraphDebugDotFlagsHandles                  = 1<<10, /**< Adds node handles and every kernel function handle to output */
+    cudaGraphDebugDotFlagsConditionalNodeParams    = 1<<15, /**< Adds cudaConditionalNodeParams to output */
+};
+
+/**
+ * Flags for instantiating a graph
+ */
+enum __device_builtin__ cudaGraphInstantiateFlags {
+    cudaGraphInstantiateFlagAutoFreeOnLaunch = 1 /**< Automatically free memory allocated in a graph before relaunching. */
+  , cudaGraphInstantiateFlagUpload           = 2 /**< Automatically upload the graph after instantiation. Only supported by
+                                                      ::cudaGraphInstantiateWithParams.  The upload will be performed using the
+                                                      stream provided in \p instantiateParams. */
+  , cudaGraphInstantiateFlagDeviceLaunch     = 4 /**< Instantiate the graph to be launchable from the device. This flag can only
+                                                      be used on platforms which support unified addressing. This flag cannot be
+                                                      used in conjunction with ::cudaGraphInstantiateFlagAutoFreeOnLaunch. */
+  , cudaGraphInstantiateFlagUseNodePriority  = 8 /**< Run the graph using the per-node priority attributes rather than the
+                                                      priority of the stream it is launched into. */
+};
+
+/**
+ * Memory Synchronization Domain
+ *
+ * A kernel can be launched in a specified memory synchronization domain that affects all memory operations issued by
+ * that kernel. A memory barrier issued in one domain will only order memory operations in that domain, thus avoiding
+ * the latency increase caused by memory barriers ordering unrelated traffic.
+ *
+ * By default, kernels are launched in domain 0. Kernels launched with ::cudaLaunchMemSyncDomainRemote will have a
+ * different domain ID. Users may also alter the domain ID with ::cudaLaunchMemSyncDomainMap for a specific stream /
+ * graph node / kernel launch. See ::cudaLaunchAttributeMemSyncDomain, ::cudaStreamSetAttribute, ::cudaLaunchKernelEx,
+ * ::cudaGraphKernelNodeSetAttribute.
+ *
+ * Memory operations done in kernels launched in different domains are considered system-scope distanced. In other
+ * words, a GPU-scoped memory synchronization is not sufficient for memory order to be observed by kernels in another
+ * memory synchronization domain, even if they are on the same GPU.
+ */
+typedef __device_builtin__ enum cudaLaunchMemSyncDomain {
+    cudaLaunchMemSyncDomainDefault = 0,    /**< Launch kernels in the default domain */
+    cudaLaunchMemSyncDomainRemote  = 1     /**< Launch kernels in the remote domain */
+} cudaLaunchMemSyncDomain;
+
+/**
+ * Memory Synchronization Domain map
+ *
+ * See ::cudaLaunchMemSyncDomain.
+ *
+ * By default, kernels are launched in domain 0. Kernels launched with ::cudaLaunchMemSyncDomainRemote will have a
+ * different domain ID. Users may also alter the domain ID with ::cudaLaunchMemSyncDomainMap for a specific stream /
+ * graph node / kernel launch. See ::cudaLaunchAttributeMemSyncDomainMap.
+ *
+ * The domain ID range is available through ::cudaDevAttrMemSyncDomainCount.
+ */
+typedef __device_builtin__ struct cudaLaunchMemSyncDomainMap_st {
+    unsigned char default_;                /**< The default domain ID to use for designated kernels */
+    unsigned char remote;                  /**< The remote domain ID to use for designated kernels */
+} cudaLaunchMemSyncDomainMap;
+
+/**
+ * Launch attributes enum; used as id field of ::cudaLaunchAttribute
+ */
+typedef __device_builtin__ enum cudaLaunchAttributeID {
+    cudaLaunchAttributeIgnore                = 0 /**< Ignored entry, for convenient composition */
+  , cudaLaunchAttributeAccessPolicyWindow    = 1 /**< Valid for streams, graph nodes, launches. See
+                                                    ::cudaLaunchAttributeValue::accessPolicyWindow. */
+  , cudaLaunchAttributeCooperative           = 2 /**< Valid for graph nodes, launches. See
+                                                    ::cudaLaunchAttributeValue::cooperative. */
+  , cudaLaunchAttributeSynchronizationPolicy = 3 /**< Valid for streams. See ::cudaLaunchAttributeValue::syncPolicy. */
+  , cudaLaunchAttributeClusterDimension                  = 4 /**< Valid for graph nodes, launches. See
+                                                                ::cudaLaunchAttributeValue::clusterDim. */
+  , cudaLaunchAttributeClusterSchedulingPolicyPreference = 5 /**< Valid for graph nodes, launches. See
+                                                                ::cudaLaunchAttributeValue::clusterSchedulingPolicyPreference. */
+  , cudaLaunchAttributeProgrammaticStreamSerialization   = 6 /**< Valid for launches. Setting
+                                                                  ::cudaLaunchAttributeValue::programmaticStreamSerializationAllowed
+                                                                  to non-0 signals that the kernel will use programmatic
+                                                                  means to resolve its stream dependency, so that the
+                                                                  CUDA runtime should opportunistically allow the grid's
+                                                                  execution to overlap with the previous kernel in the
+                                                                  stream, if that kernel requests the overlap. The
+                                                                  dependent launches can choose to wait on the
+                                                                  dependency using the programmatic sync
+                                                                  (cudaGridDependencySynchronize() or equivalent PTX
+                                                                  instructions). */
+  , cudaLaunchAttributeProgrammaticEvent                 = 7 /**< Valid for launches. Set
+                                                                  ::cudaLaunchAttributeValue::programmaticEvent to
+                                                                  record the event. Event recorded through this launch
+                                                                  attribute is guaranteed to only trigger after all
+                                                                  blocks in the associated kernel trigger the event.  A
+                                                                  block can trigger the event programmatically in a
+                                                                  future CUDA release. A trigger can also be inserted at
+                                                                  the beginning of each block's execution if
+                                                                  triggerAtBlockStart is set to non-0. The dependent
+                                                                  launches can choose to wait on the dependency using
+                                                                  the programmatic sync (cudaGridDependencySynchronize()
+                                                                  or equivalent PTX instructions). Note that dependents
+                                                                  (including the CPU thread calling
+                                                                  cudaEventSynchronize()) are not guaranteed to observe
+                                                                  the release precisely when it is released. For
+                                                                  example, cudaEventSynchronize() may only observe the
+                                                                  event trigger long after the associated kernel has
+                                                                  completed. This recording type is primarily meant for
+                                                                  establishing programmatic dependency between device
+                                                                  tasks. Note also this type of dependency allows, but
+                                                                  does not guarantee, concurrent execution of tasks.
+                                                                  <br>
+                                                                  The event supplied must not be an interprocess or
+                                                                  interop event. The event must disable timing (i.e.
+                                                                  must be created with the ::cudaEventDisableTiming flag
+                                                                  set). */
+  , cudaLaunchAttributePriority              = 8 /**< Valid for streams, graph nodes, launches. See
+                                                    ::cudaLaunchAttributeValue::priority. */
+  , cudaLaunchAttributeMemSyncDomainMap                  = 9 /**< Valid for streams, graph nodes, launches. See
+                                                                ::cudaLaunchAttributeValue::memSyncDomainMap. */
+  , cudaLaunchAttributeMemSyncDomain                    = 10 /**< Valid for streams, graph nodes, launches. See
+                                                                ::cudaLaunchAttributeValue::memSyncDomain. */
+  , cudaLaunchAttributePreferredClusterDimension = 11 /**< Valid for graph nodes and launches. Set
+                                                           ::cudaLaunchAttributeValue::preferredClusterDim
+                                                           to allow the kernel launch to specify a preferred substitute
+                                                           cluster dimension. Blocks may be grouped according to either
+                                                           the dimensions specified with this attribute (grouped into a
+                                                           "preferred substitute cluster"), or the one specified with
+                                                           ::cudaLaunchAttributeClusterDimension attribute (grouped
+                                                           into a "regular cluster"). The cluster dimensions of a
+                                                           "preferred substitute cluster" shall be an integer multiple
+                                                           greater than zero of the regular cluster dimensions. The
+                                                           device will attempt - on a best-effort basis - to group
+                                                           thread blocks into preferred clusters over grouping them
+                                                           into regular clusters. When it deems necessary (primarily
+                                                           when the device temporarily runs out of physical resources
+                                                           to launch the larger preferred clusters), the device may
+                                                           switch to launch the regular clusters instead to attempt to
+                                                           utilize as much of the physical device resources as possible.
+                                                           <br>
+                                                           Each type of cluster will have its enumeration / coordinate
+                                                           setup as if the grid consists solely of its type of cluster.
+                                                           For example, if the preferred substitute cluster dimensions
+                                                           double the regular cluster dimensions, there might be
+                                                           simultaneously a regular cluster indexed at (1,0,0), and a
+                                                           preferred cluster indexed at (1,0,0). In this example, the
+                                                           preferred substitute cluster (1,0,0) replaces regular
+                                                           clusters (2,0,0) and (3,0,0) and groups their blocks.
+                                                           <br>
+                                                           This attribute will only take effect when a regular cluster
+                                                           dimension has been specified. The preferred substitute cluster
+                                                           dimension must be an integer multiple greater than zero of the
+                                                           regular cluster dimension and must divide the grid. It must
+                                                           also be no more than `maxBlocksPerCluster`, if it is set in
+                                                           the kernel's `__launch_bounds__`. Otherwise it must be less
+                                                           than the maximum value the driver can support. Otherwise,
+                                                           setting this attribute to a value physically unable to fit on
+                                                           any particular device is permitted. */
+  , cudaLaunchAttributeLaunchCompletionEvent = 12 /**< Valid for launches. Set
+                                                       ::cudaLaunchAttributeValue::launchCompletionEvent to record the
+                                                       event.
+                                                       <br>
+                                                       Nominally, the event is triggered once all blocks of the kernel
+                                                       have begun execution. Currently this is a best effort. If a kernel
+                                                       B has a launch completion dependency on a kernel A, B may wait
+                                                       until A is complete. Alternatively, blocks of B may begin before
+                                                       all blocks of A have begun, for example if B can claim execution
+                                                       resources unavailable to A (e.g. they run on different GPUs) or
+                                                       if B is a higher priority than A.
+                                                       Exercise caution if such an ordering inversion could lead
+                                                       to deadlock.
+                                                       <br>
+                                                       A launch completion event is nominally similar to a programmatic
+                                                       event with \c triggerAtBlockStart set except that it is not
+                                                       visible to \c cudaGridDependencySynchronize() and can be used with
+                                                       compute capability less than 9.0.
+                                                       <br>
+                                                       The event supplied must not be an interprocess or interop event.
+                                                       The event must disable timing (i.e. must be created with the
+                                                       ::cudaEventDisableTiming flag set). */
+  , cudaLaunchAttributeDeviceUpdatableKernelNode = 13 /**< Valid for graph nodes, launches. This attribute is graphs-only,
+                                                           and passing it to a launch in a non-capturing stream will result
+                                                           in an error.
+                                                           <br>
+                                                           ::cudaLaunchAttributeValue::deviceUpdatableKernelNode::deviceUpdatable can
+                                                           only be set to 0 or 1. Setting the field to 1 indicates that the
+                                                           corresponding kernel node should be device-updatable. On success, a handle
+                                                           will be returned via
+                                                           ::cudaLaunchAttributeValue::deviceUpdatableKernelNode::devNode which can be
+                                                           passed to the various device-side update functions to update the node's
+                                                           kernel parameters from within another kernel. For more information on the
+                                                           types of device updates that can be made, as well as the relevant limitations
+                                                           thereof, see ::cudaGraphKernelNodeUpdatesApply.
+                                                           <br>
+                                                           Nodes which are device-updatable have additional restrictions compared to
+                                                           regular kernel nodes. Firstly, device-updatable nodes cannot be removed
+                                                           from their graph via ::cudaGraphDestroyNode. Additionally, once opted-in
+                                                           to this functionality, a node cannot opt out, and any attempt to set the
+                                                           deviceUpdatable attribute to 0 will result in an error. Device-updatable
+                                                           kernel nodes also cannot have their attributes copied to/from another kernel
+                                                           node via ::cudaGraphKernelNodeCopyAttributes. Graphs containing one or more
+                                                           device-updatable nodes also do not allow multiple instantiation, and neither
+                                                           the graph nor its instantiated version can be passed to ::cudaGraphExecUpdate.
+                                                           <br>
+                                                           If a graph contains device-updatable nodes and updates those nodes from the device
+                                                           from within the graph, the graph must be uploaded with ::cudaGraphUpload before it
+                                                           is launched. For such a graph, if host-side executable graph updates are made to the
+                                                           device-updatable nodes, the graph must be uploaded before it is launched again. */
+  , cudaLaunchAttributePreferredSharedMemoryCarveout = 14 /**< Valid for launches. On devices where the L1 cache and shared memory use the
+                                                               same hardware resources, setting ::cudaLaunchAttributeValue::sharedMemCarveout
+                                                               to a percentage between 0 and 100 sets the shared memory carveout
+                                                               preference, in percent of the total shared memory, for that kernel launch.
+                                                               This attribute takes precedence over ::cudaFuncAttributePreferredSharedMemoryCarveout.
+                                                               This is only a hint, and the driver can choose a different configuration if
+                                                               required for the launch. */
+} cudaLaunchAttributeID;
+
+/**
+ * Launch attributes union; used as value field of ::cudaLaunchAttribute
+ */
+typedef __device_builtin__ union cudaLaunchAttributeValue {
+    char pad[64]; /* Ensures the union occupies at least 64 bytes */
+    struct cudaAccessPolicyWindow accessPolicyWindow; /**< Value of launch attribute ::cudaLaunchAttributeAccessPolicyWindow. */
+    int cooperative; /**< Value of launch attribute ::cudaLaunchAttributeCooperative. Nonzero indicates a cooperative
+                        kernel (see ::cudaLaunchCooperativeKernel). */
+    enum cudaSynchronizationPolicy syncPolicy; /**< Value of launch attribute
+                                                  ::cudaLaunchAttributeSynchronizationPolicy. ::cudaSynchronizationPolicy
+                                                  for work queued up in this stream. */
+    /**
+     * Value of launch attribute ::cudaLaunchAttributeClusterDimension that
+     * represents the desired cluster dimensions for the kernel. Opaque type
+     * with the following fields:
+     *     - \p x - The X dimension of the cluster, in blocks. Must be a divisor
+     *              of the grid X dimension.
+     *     - \p y - The Y dimension of the cluster, in blocks. Must be a divisor
+     *              of the grid Y dimension.
+     *     - \p z - The Z dimension of the cluster, in blocks. Must be a divisor
+     *              of the grid Z dimension.
+     */
+    struct {
+        unsigned int x;
+        unsigned int y;
+        unsigned int z;
+    } clusterDim;
+    enum cudaClusterSchedulingPolicy clusterSchedulingPolicyPreference; /**< Value of launch attribute
+                                                                           ::cudaLaunchAttributeClusterSchedulingPolicyPreference. Cluster
+                                                                           scheduling policy preference for the kernel. */
+    int programmaticStreamSerializationAllowed; /**< Value of launch attribute
+                                                   ::cudaLaunchAttributeProgrammaticStreamSerialization. */
+
+    /**
+     * Value of launch attribute ::cudaLaunchAttributeProgrammaticEvent
+     * with the following fields:
+     *     - \p cudaEvent_t event - Event to fire when all blocks trigger it.
+     *     - \p int flags - Event record flags, see ::cudaEventRecordWithFlags. Does not accept
+     *                      ::cudaEventRecordExternal.
+     *     - \p int triggerAtBlockStart - If this is set to non-0, each block launch will automatically trigger the event.
+     */
+    struct {
+        cudaEvent_t event;
+        int flags;
+        int triggerAtBlockStart;
+    } programmaticEvent;
+    int priority; /**< Value of launch attribute ::cudaLaunchAttributePriority. Execution priority of the kernel. */
+    cudaLaunchMemSyncDomainMap memSyncDomainMap; /**< Value of launch attribute
+                                                    ::cudaLaunchAttributeMemSyncDomainMap. See
+                                                    ::cudaLaunchMemSyncDomainMap. */
+    cudaLaunchMemSyncDomain memSyncDomain;       /**< Value of launch attribute ::cudaLaunchAttributeMemSyncDomain. See
+                                                    ::cudaLaunchMemSyncDomain. */
+    /**
+     * Value of launch attribute ::cudaLaunchAttributePreferredClusterDimension
+     * that represents the desired preferred cluster dimensions for the kernel.
+     * Opaque type with the following fields:
+     *     - \p x - The X dimension of the preferred cluster, in blocks. Must be
+     *              a divisor of the grid X dimension, and must be a multiple of
+     *              the \p x field of ::cudaLaunchAttributeValue::clusterDim.
+     *     - \p y - The Y dimension of the preferred cluster, in blocks. Must be
+     *              a divisor of the grid Y dimension, and must be a multiple of
+     *              the \p y field of ::cudaLaunchAttributeValue::clusterDim.
+     *     - \p z - The Z dimension of the preferred cluster, in blocks. Must be
+     *              equal to the \p z field of ::cudaLaunchAttributeValue::clusterDim.
+     */
+    struct {
+        unsigned int x;
+        unsigned int y;
+        unsigned int z;
+    } preferredClusterDim;
+
+    /**
+     * Value of launch attribute ::cudaLaunchAttributeLaunchCompletionEvent
+     * with the following fields:
+     *     - \p cudaEvent_t event - Event to fire when the last block launches.
+     *     - \p int flags - Event record flags, see ::cudaEventRecordWithFlags. Does not accept
+     *                      ::cudaEventRecordExternal.
+     */
+    struct {
+        cudaEvent_t event;
+        int flags;
+    } launchCompletionEvent;
+
+    /**
+     * Value of launch attribute ::cudaLaunchAttributeDeviceUpdatableKernelNode
+     * with the following fields:
+     *    - \p int deviceUpdatable - Whether or not the resulting kernel node should be device-updatable.
+     *    - \p cudaGraphDeviceNode_t devNode - Returns a handle to pass to the various device-side update functions.
+     */
+    struct {
+        int deviceUpdatable;
+        cudaGraphDeviceNode_t devNode;
+    } deviceUpdatableKernelNode;
+    unsigned int sharedMemCarveout; /**< Value of launch attribute ::cudaLaunchAttributePreferredSharedMemoryCarveout. */
+} cudaLaunchAttributeValue;
+
+/**
+ * Launch attribute: an attribute ID (::cudaLaunchAttributeID) paired with its value (::cudaLaunchAttributeValue)
+ */
+typedef __device_builtin__ struct cudaLaunchAttribute_st {
+    cudaLaunchAttributeID id; /**< Attribute to set */
+    char pad[8 - sizeof(cudaLaunchAttributeID)]; /**< Padding sized so that id and pad together occupy 8 bytes */
+    cudaLaunchAttributeValue val; /**< Value of the attribute */
+} cudaLaunchAttribute;
+
+/**
+ * CUDA extensible launch configuration
+ */
+typedef __device_builtin__ struct cudaLaunchConfig_st {
+    dim3 gridDim;               /**< Grid dimensions */
+    dim3 blockDim;              /**< Block dimensions */
+    size_t dynamicSmemBytes;    /**< Dynamic shared-memory size per thread block in bytes */
+    cudaStream_t stream;        /**< Stream identifier */
+    cudaLaunchAttribute *attrs; /**< List of attributes; nullable if ::cudaLaunchConfig_t::numAttrs == 0 */
+    unsigned int numAttrs;      /**< Number of attributes populated in ::cudaLaunchConfig_t::attrs */
+} cudaLaunchConfig_t;
+
+#define cudaStreamAttrID cudaLaunchAttributeID
+#define cudaStreamAttributeAccessPolicyWindow    cudaLaunchAttributeAccessPolicyWindow
+#define cudaStreamAttributeSynchronizationPolicy cudaLaunchAttributeSynchronizationPolicy
+#define cudaStreamAttributeMemSyncDomainMap      cudaLaunchAttributeMemSyncDomainMap
+#define cudaStreamAttributeMemSyncDomain         cudaLaunchAttributeMemSyncDomain
+#define cudaStreamAttributePriority cudaLaunchAttributePriority
+
+#define cudaStreamAttrValue cudaLaunchAttributeValue
+
+#define cudaKernelNodeAttrID cudaLaunchAttributeID
+#define cudaKernelNodeAttributeAccessPolicyWindow cudaLaunchAttributeAccessPolicyWindow
+#define cudaKernelNodeAttributeCooperative        cudaLaunchAttributeCooperative
+#define cudaKernelNodeAttributePriority           cudaLaunchAttributePriority
+#define cudaKernelNodeAttributeClusterDimension                     cudaLaunchAttributeClusterDimension
+#define cudaKernelNodeAttributeClusterSchedulingPolicyPreference    cudaLaunchAttributeClusterSchedulingPolicyPreference
+#define cudaKernelNodeAttributeMemSyncDomainMap   cudaLaunchAttributeMemSyncDomainMap
+#define cudaKernelNodeAttributeMemSyncDomain      cudaLaunchAttributeMemSyncDomain
+#define cudaKernelNodeAttributePreferredSharedMemoryCarveout cudaLaunchAttributePreferredSharedMemoryCarveout
+#define cudaKernelNodeAttributeDeviceUpdatableKernelNode cudaLaunchAttributeDeviceUpdatableKernelNode
+
+#define cudaKernelNodeAttrValue cudaLaunchAttributeValue
+
+/**
+ * CUDA device NUMA config
+ */
+enum __device_builtin__  cudaDeviceNumaConfig {
+    cudaDeviceNumaConfigNone  = 0, /**< The GPU is not a NUMA node */
+    cudaDeviceNumaConfigNumaNode, /**< The GPU is a NUMA node, cudaDevAttrNumaId contains its NUMA ID */
+};
+
+/**
+ * CUDA async callback handle
+ */
+typedef struct cudaAsyncCallbackEntry* cudaAsyncCallbackHandle_t;
+
+struct cudaAsyncCallbackEntry;
+
+/**
+ * Types of async notification that can occur
+ */
+typedef __device_builtin__ enum cudaAsyncNotificationType_enum {
+    cudaAsyncNotificationTypeOverBudget = 0x1 /**< Over-budget notification; presumably paired with the overBudget payload of ::cudaAsyncNotificationInfo - confirm */
+} cudaAsyncNotificationType;
+
+/**
+ * Information describing an async notification event
+ */
+typedef __device_builtin__ struct cudaAsyncNotificationInfo
+{
+    cudaAsyncNotificationType type; /**< Type of the notification */
+    union {
+        struct {
+            unsigned long long bytesOverBudget; /**< Over-budget amount; presumably in bytes, per the name - confirm */
+        } overBudget; /**< Payload for the over-budget case; presumably valid when type is ::cudaAsyncNotificationTypeOverBudget */
+    } info; /**< Notification payload; presumably interpreted according to type - confirm */
+} cudaAsyncNotificationInfo_t;
+
+typedef void (*cudaAsyncCallback)(cudaAsyncNotificationInfo_t*, void*, cudaAsyncCallbackHandle_t);
+
+
+/** @} */
+/** @} */ /* END CUDART_TYPES */
+
+#endif  /* !__CUDACC_RTC_MINIMAL__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DRIVER_TYPES_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DRIVER_TYPES_H__
+#endif
+
+#undef __CUDA_DEPRECATED
+
+
+
+#endif /* !__DRIVER_TYPES_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/fatbinary_section.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/fatbinary_section.h
new file mode 100644
index 0000000000000000000000000000000000000000..c017f98f9d668003c0e73fa513c095bb6e717800
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/fatbinary_section.h
@@ -0,0 +1,61 @@
+/*
+ * NVIDIA_COPYRIGHT_BEGIN
+ *
+ * Copyright (c) 2010-2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ * NVIDIA_COPYRIGHT_END
+ */
+
+#ifndef fatbinary_section_INCLUDED
+#define fatbinary_section_INCLUDED
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * These defines are for the fatbin.c runtime wrapper
+ */
+#define FATBINC_MAGIC   0x466243B1
+#define FATBINC_VERSION 1
+#define FATBINC_LINK_VERSION 2
+
+typedef struct {
+  int magic;
+  int version;
+  const unsigned long long* data;
+  void *filename_or_fatbins;  /* version 1: offline filename,
+                               * version 2: array of prelinked fatbins */
+} __fatBinC_Wrapper_t;
+
+/*
+ * The section that contains the fatbin control structure
+ */
+#ifdef STD_OS_Darwin
+/* mach-o sections limited to 15 chars, and want __ prefix else strip complains, * so use a different name */
+#define FATBIN_CONTROL_SECTION_NAME     "__fatbin"
+#define FATBIN_DATA_SECTION_NAME        "__nv_fatbin"
+/* only need segment name for mach-o */
+#define FATBIN_SEGMENT_NAME             "__NV_CUDA"
+#else
+#define FATBIN_CONTROL_SECTION_NAME     ".nvFatBinSegment"
+/*
+ * The section that contains the fatbin data itself
+ * (put in separate section so easy to find)
+ */
+#define FATBIN_DATA_SECTION_NAME        ".nv_fatbin"
+#endif
+/* section for pre-linked relocatable fatbin data */
+#define FATBIN_PRELINK_DATA_SECTION_NAME "__nv_relfatbin"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* fatbinary_section_INCLUDED */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cudaGL_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cudaGL_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a52e194b265d32f61d47bd3081f4958755bff46
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cudaGL_meta.h
@@ -0,0 +1,116 @@
+// This file is generated.  Any changes you make will be lost during the next clean build.
+
+// Dependent includes
+#ifdef __APPLE__
+#include <OpenGL/gl.h>
+#else
+#include <GL/gl.h>
+#endif
+
+// CUDA public interface, for type definitions and cu* function prototypes
+#include "cudaGL.h"
+
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+typedef struct cuGraphicsGLRegisterBuffer_params_st {
+    CUgraphicsResource *pCudaResource;
+    GLuint buffer;
+    unsigned int Flags;
+} cuGraphicsGLRegisterBuffer_params;
+
+typedef struct cuGraphicsGLRegisterImage_params_st {
+    CUgraphicsResource *pCudaResource;
+    GLuint image;
+    GLenum target;
+    unsigned int Flags;
+} cuGraphicsGLRegisterImage_params;
+
+typedef struct cuGLGetDevices_v2_params_st {
+    unsigned int *pCudaDeviceCount;
+    CUdevice *pCudaDevices;
+    unsigned int cudaDeviceCount;
+    CUGLDeviceList deviceList;
+} cuGLGetDevices_v2_params;
+
+typedef struct cuGLCtxCreate_v2_params_st {
+    CUcontext *pCtx;
+    unsigned int Flags;
+    CUdevice device;
+} cuGLCtxCreate_v2_params;
+
+typedef struct cuGLRegisterBufferObject_params_st {
+    GLuint buffer;
+} cuGLRegisterBufferObject_params;
+
+typedef struct cuGLMapBufferObject_v2_ptds_params_st {
+    CUdeviceptr *dptr;
+    size_t *size;
+    GLuint buffer;
+} cuGLMapBufferObject_v2_ptds_params;
+
+typedef struct cuGLUnmapBufferObject_params_st {
+    GLuint buffer;
+} cuGLUnmapBufferObject_params;
+
+typedef struct cuGLUnregisterBufferObject_params_st {
+    GLuint buffer;
+} cuGLUnregisterBufferObject_params;
+
+typedef struct cuGLSetBufferObjectMapFlags_params_st {
+    GLuint buffer;
+    unsigned int Flags;
+} cuGLSetBufferObjectMapFlags_params;
+
+typedef struct cuGLMapBufferObjectAsync_v2_ptsz_params_st {
+    CUdeviceptr *dptr;
+    size_t *size;
+    GLuint buffer;
+    CUstream hStream;
+} cuGLMapBufferObjectAsync_v2_ptsz_params;
+
+typedef struct cuGLUnmapBufferObjectAsync_params_st {
+    GLuint buffer;
+    CUstream hStream;
+} cuGLUnmapBufferObjectAsync_params;
+
+typedef struct cuGLGetDevices_params_st {
+    unsigned int *pCudaDeviceCount;
+    CUdevice *pCudaDevices;
+    unsigned int cudaDeviceCount;
+    CUGLDeviceList deviceList;
+} cuGLGetDevices_params;
+
+typedef struct cuGLMapBufferObject_v2_params_st {
+    CUdeviceptr *dptr;
+    size_t *size;
+    GLuint buffer;
+} cuGLMapBufferObject_v2_params;
+
+typedef struct cuGLMapBufferObjectAsync_v2_params_st {
+    CUdeviceptr *dptr;
+    size_t *size;
+    GLuint buffer;
+    CUstream hStream;
+} cuGLMapBufferObjectAsync_v2_params;
+
+typedef struct cuGLCtxCreate_params_st {
+    CUcontext *pCtx;
+    unsigned int Flags;
+    CUdevice device;
+} cuGLCtxCreate_params;
+
+typedef struct cuGLMapBufferObject_params_st {
+    CUdeviceptr_v1 *dptr;
+    unsigned int *size;
+    GLuint buffer;
+} cuGLMapBufferObject_params;
+
+typedef struct cuGLMapBufferObjectAsync_params_st {
+    CUdeviceptr_v1 *dptr;
+    unsigned int *size;
+    GLuint buffer;
+    CUstream hStream;
+} cuGLMapBufferObjectAsync_params;
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cudaVDPAU_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cudaVDPAU_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..abc603c8d9be21e012a9b1641330c2e203d623b2
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cudaVDPAU_meta.h
@@ -0,0 +1,46 @@
+// This file is generated.  Any changes you make will be lost during the next clean build.
+
+// Dependent includes
+#include <vdpau/vdpau.h>
+
+// CUDA public interface, for type definitions and cu* function prototypes
+#include "cudaVDPAU.h"
+
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+typedef struct cuVDPAUGetDevice_params_st {
+    CUdevice *pDevice;
+    VdpDevice vdpDevice;
+    VdpGetProcAddress *vdpGetProcAddress;
+} cuVDPAUGetDevice_params;
+
+typedef struct cuVDPAUCtxCreate_v2_params_st {
+    CUcontext *pCtx;
+    unsigned int flags;
+    CUdevice device;
+    VdpDevice vdpDevice;
+    VdpGetProcAddress *vdpGetProcAddress;
+} cuVDPAUCtxCreate_v2_params;
+
+typedef struct cuGraphicsVDPAURegisterVideoSurface_params_st {
+    CUgraphicsResource *pCudaResource;
+    VdpVideoSurface vdpSurface;
+    unsigned int flags;
+} cuGraphicsVDPAURegisterVideoSurface_params;
+
+typedef struct cuGraphicsVDPAURegisterOutputSurface_params_st {
+    CUgraphicsResource *pCudaResource;
+    VdpOutputSurface vdpSurface;
+    unsigned int flags;
+} cuGraphicsVDPAURegisterOutputSurface_params;
+
+typedef struct cuVDPAUCtxCreate_params_st {
+    CUcontext *pCtx;
+    unsigned int flags;
+    CUdevice device;
+    VdpDevice vdpDevice;
+    VdpGetProcAddress *vdpGetProcAddress;
+} cuVDPAUCtxCreate_params;
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cuda_gl_interop_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cuda_gl_interop_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..eaba3ac5a760e338f1edc191609f6fa2a32adee7
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cuda_gl_interop_meta.h
@@ -0,0 +1,71 @@
+// This file is generated.  Any changes you make will be lost during the next clean build.
+
+// CUDA public interface, for type definitions and api function prototypes
+#include "cuda_gl_interop.h"
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+// Currently used parameter trace structures
+typedef struct cudaGLGetDevices_v4010_params_st {
+    unsigned int *pCudaDeviceCount;
+    int *pCudaDevices;
+    unsigned int cudaDeviceCount;
+    enum cudaGLDeviceList deviceList;
+} cudaGLGetDevices_v4010_params;
+
+typedef struct cudaGraphicsGLRegisterImage_v3020_params_st {
+    struct cudaGraphicsResource **resource;
+    GLuint image;
+    GLenum target;
+    unsigned int flags;
+} cudaGraphicsGLRegisterImage_v3020_params;
+
+typedef struct cudaGraphicsGLRegisterBuffer_v3020_params_st {
+    struct cudaGraphicsResource **resource;
+    GLuint buffer;
+    unsigned int flags;
+} cudaGraphicsGLRegisterBuffer_v3020_params;
+
+typedef struct cudaGLSetGLDevice_v3020_params_st {
+    int device;
+} cudaGLSetGLDevice_v3020_params;
+
+typedef struct cudaGLRegisterBufferObject_v3020_params_st {
+    GLuint bufObj;
+} cudaGLRegisterBufferObject_v3020_params;
+
+typedef struct cudaGLMapBufferObject_v3020_params_st {
+    void **devPtr;
+    GLuint bufObj;
+} cudaGLMapBufferObject_v3020_params;
+
+typedef struct cudaGLUnmapBufferObject_v3020_params_st {
+    GLuint bufObj;
+} cudaGLUnmapBufferObject_v3020_params;
+
+typedef struct cudaGLUnregisterBufferObject_v3020_params_st {
+    GLuint bufObj;
+} cudaGLUnregisterBufferObject_v3020_params;
+
+typedef struct cudaGLSetBufferObjectMapFlags_v3020_params_st {
+    GLuint bufObj;
+    unsigned int flags;
+} cudaGLSetBufferObjectMapFlags_v3020_params;
+
+typedef struct cudaGLMapBufferObjectAsync_v3020_params_st {
+    void **devPtr;
+    GLuint bufObj;
+    cudaStream_t stream;
+} cudaGLMapBufferObjectAsync_v3020_params;
+
+typedef struct cudaGLUnmapBufferObjectAsync_v3020_params_st {
+    GLuint bufObj;
+    cudaStream_t stream;
+} cudaGLUnmapBufferObjectAsync_v3020_params;
+
+// Parameter trace structures for removed functions
+
+
+// End of parameter trace structures
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cuda_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cuda_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..954db0ad73e2eb029918f595ddee452aa9afd0e3
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cuda_meta.h
@@ -0,0 +1,3718 @@
+// This file is generated.  Any changes you make will be lost during the next clean build.
+
+// No dependent includes
+
+// CUDA public interface, for type definitions and cu* function prototypes
+#include "cuda.h"
+
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+typedef struct cuGetErrorString_params_st {
+    CUresult error;
+    const char **pStr;
+} cuGetErrorString_params;
+
+typedef struct cuGetErrorName_params_st {
+    CUresult error;
+    const char **pStr;
+} cuGetErrorName_params;
+
+typedef struct cuInit_params_st {
+    unsigned int Flags;
+} cuInit_params;
+
+typedef struct cuDriverGetVersion_params_st {
+    int *driverVersion;
+} cuDriverGetVersion_params;
+
+typedef struct cuDeviceGet_params_st {
+    CUdevice *device;
+    int ordinal;
+} cuDeviceGet_params;
+
+typedef struct cuDeviceGetCount_params_st {
+    int *count;
+} cuDeviceGetCount_params;
+
+typedef struct cuDeviceGetName_params_st {
+    char *name;
+    int len;
+    CUdevice dev;
+} cuDeviceGetName_params;
+
+typedef struct cuDeviceGetUuid_params_st {
+    CUuuid *uuid;
+    CUdevice dev;
+} cuDeviceGetUuid_params;
+
+typedef struct cuDeviceGetUuid_v2_params_st {
+    CUuuid *uuid;
+    CUdevice dev;
+} cuDeviceGetUuid_v2_params;
+
+typedef struct cuDeviceGetLuid_params_st {
+    char *luid;
+    unsigned int *deviceNodeMask;
+    CUdevice dev;
+} cuDeviceGetLuid_params;
+
+typedef struct cuDeviceTotalMem_v2_params_st {
+    size_t *bytes;
+    CUdevice dev;
+} cuDeviceTotalMem_v2_params;
+
+typedef struct cuDeviceGetTexture1DLinearMaxWidth_params_st {
+    size_t *maxWidthInElements;
+    CUarray_format format;
+    unsigned numChannels;
+    CUdevice dev;
+} cuDeviceGetTexture1DLinearMaxWidth_params;
+
+typedef struct cuDeviceGetAttribute_params_st {
+    int *pi;
+    CUdevice_attribute attrib;
+    CUdevice dev;
+} cuDeviceGetAttribute_params;
+
+typedef struct cuDeviceGetNvSciSyncAttributes_params_st {
+    void *nvSciSyncAttrList;
+    CUdevice dev;
+    int flags;
+} cuDeviceGetNvSciSyncAttributes_params;
+
+typedef struct cuDeviceSetMemPool_params_st {
+    CUdevice dev;
+    CUmemoryPool pool;
+} cuDeviceSetMemPool_params;
+
+typedef struct cuDeviceGetMemPool_params_st {
+    CUmemoryPool *pool;
+    CUdevice dev;
+} cuDeviceGetMemPool_params;
+
+typedef struct cuDeviceGetDefaultMemPool_params_st {
+    CUmemoryPool *pool_out;
+    CUdevice dev;
+} cuDeviceGetDefaultMemPool_params;
+
+typedef struct cuDeviceGetExecAffinitySupport_params_st {
+    int *pi;
+    CUexecAffinityType type;
+    CUdevice dev;
+} cuDeviceGetExecAffinitySupport_params;
+
+typedef struct cuFlushGPUDirectRDMAWrites_params_st {
+    CUflushGPUDirectRDMAWritesTarget target;
+    CUflushGPUDirectRDMAWritesScope scope;
+} cuFlushGPUDirectRDMAWrites_params;
+
+typedef struct cuDeviceGetProperties_params_st {
+    CUdevprop *prop;
+    CUdevice dev;
+} cuDeviceGetProperties_params;
+
+typedef struct cuDeviceComputeCapability_params_st {
+    int *major;
+    int *minor;
+    CUdevice dev;
+} cuDeviceComputeCapability_params;
+
+typedef struct cuDevicePrimaryCtxRetain_params_st {
+    CUcontext *pctx;
+    CUdevice dev;
+} cuDevicePrimaryCtxRetain_params;
+
+typedef struct cuDevicePrimaryCtxRelease_v2_params_st {
+    CUdevice dev;
+} cuDevicePrimaryCtxRelease_v2_params;
+
+typedef struct cuDevicePrimaryCtxSetFlags_v2_params_st {
+    CUdevice dev;
+    unsigned int flags;
+} cuDevicePrimaryCtxSetFlags_v2_params;
+
+typedef struct cuDevicePrimaryCtxGetState_params_st {
+    CUdevice dev;
+    unsigned int *flags;
+    int *active;
+} cuDevicePrimaryCtxGetState_params;
+
+typedef struct cuDevicePrimaryCtxReset_v2_params_st {
+    CUdevice dev;
+} cuDevicePrimaryCtxReset_v2_params;
+
+typedef struct cuCtxCreate_v2_params_st {
+    CUcontext *pctx;
+    unsigned int flags;
+    CUdevice dev;
+} cuCtxCreate_v2_params;
+
+typedef struct cuCtxCreate_v3_params_st {
+    CUcontext *pctx;
+    CUexecAffinityParam *paramsArray;
+    int numParams;
+    unsigned int flags;
+    CUdevice dev;
+} cuCtxCreate_v3_params;
+
+typedef struct cuCtxCreate_v4_params_st {
+    CUcontext *pctx;
+    CUctxCreateParams *ctxCreateParams;
+    unsigned int flags;
+    CUdevice dev;
+} cuCtxCreate_v4_params;
+
+typedef struct cuCtxDestroy_v2_params_st {
+    CUcontext ctx;
+} cuCtxDestroy_v2_params;
+
+typedef struct cuCtxPushCurrent_v2_params_st {
+    CUcontext ctx;
+} cuCtxPushCurrent_v2_params;
+
+typedef struct cuCtxPopCurrent_v2_params_st {
+    CUcontext *pctx;
+} cuCtxPopCurrent_v2_params;
+
+typedef struct cuCtxSetCurrent_params_st {
+    CUcontext ctx;
+} cuCtxSetCurrent_params;
+
+typedef struct cuCtxGetCurrent_params_st {
+    CUcontext *pctx;
+} cuCtxGetCurrent_params;
+
+typedef struct cuCtxGetDevice_params_st {
+    CUdevice *device;
+} cuCtxGetDevice_params;
+
+typedef struct cuCtxGetFlags_params_st {
+    unsigned int *flags;
+} cuCtxGetFlags_params;
+
+typedef struct cuCtxSetFlags_params_st {
+    unsigned int flags;
+} cuCtxSetFlags_params;
+
+typedef struct cuCtxGetId_params_st {
+    CUcontext ctx;
+    unsigned long long *ctxId;
+} cuCtxGetId_params;
+
+typedef struct cuCtxSetLimit_params_st {
+    CUlimit limit;
+    size_t value;
+} cuCtxSetLimit_params;
+
+typedef struct cuCtxGetLimit_params_st {
+    size_t *pvalue;
+    CUlimit limit;
+} cuCtxGetLimit_params;
+
+typedef struct cuCtxGetCacheConfig_params_st {
+    CUfunc_cache *pconfig;
+} cuCtxGetCacheConfig_params;
+
+typedef struct cuCtxSetCacheConfig_params_st {
+    CUfunc_cache config;
+} cuCtxSetCacheConfig_params;
+
+typedef struct cuCtxGetApiVersion_params_st {
+    CUcontext ctx;
+    unsigned int *version;
+} cuCtxGetApiVersion_params;
+
+typedef struct cuCtxGetStreamPriorityRange_params_st {
+    int *leastPriority;
+    int *greatestPriority;
+} cuCtxGetStreamPriorityRange_params;
+
+typedef struct cuCtxGetExecAffinity_params_st {
+    CUexecAffinityParam *pExecAffinity;
+    CUexecAffinityType type;
+} cuCtxGetExecAffinity_params;
+
+typedef struct cuCtxRecordEvent_params_st {
+    CUcontext hCtx;
+    CUevent hEvent;
+} cuCtxRecordEvent_params;
+
+typedef struct cuCtxWaitEvent_params_st {
+    CUcontext hCtx;
+    CUevent hEvent;
+} cuCtxWaitEvent_params;
+
+typedef struct cuCtxAttach_params_st {
+    CUcontext *pctx;
+    unsigned int flags;
+} cuCtxAttach_params;
+
+typedef struct cuCtxDetach_params_st {
+    CUcontext ctx;
+} cuCtxDetach_params;
+
+typedef struct cuCtxGetSharedMemConfig_params_st {
+    CUsharedconfig *pConfig;
+} cuCtxGetSharedMemConfig_params;
+
+typedef struct cuCtxSetSharedMemConfig_params_st {
+    CUsharedconfig config;
+} cuCtxSetSharedMemConfig_params;
+
+typedef struct cuModuleLoad_params_st {
+    CUmodule *module;
+    const char *fname;
+} cuModuleLoad_params;
+
+typedef struct cuModuleLoadData_params_st {
+    CUmodule *module;
+    const void *image;
+} cuModuleLoadData_params;
+
+typedef struct cuModuleLoadDataEx_params_st {
+    CUmodule *module;
+    const void *image;
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+} cuModuleLoadDataEx_params;
+
+typedef struct cuModuleLoadFatBinary_params_st {
+    CUmodule *module;
+    const void *fatCubin;
+} cuModuleLoadFatBinary_params;
+
+typedef struct cuModuleUnload_params_st {
+    CUmodule hmod;
+} cuModuleUnload_params;
+
+typedef struct cuModuleGetLoadingMode_params_st {
+    CUmoduleLoadingMode *mode;
+} cuModuleGetLoadingMode_params;
+
+typedef struct cuModuleGetFunction_params_st {
+    CUfunction *hfunc;
+    CUmodule hmod;
+    const char *name;
+} cuModuleGetFunction_params;
+
+typedef struct cuModuleGetFunctionCount_params_st {
+    unsigned int *count;
+    CUmodule mod;
+} cuModuleGetFunctionCount_params;
+
+typedef struct cuModuleEnumerateFunctions_params_st {
+    CUfunction *functions;
+    unsigned int numFunctions;
+    CUmodule mod;
+} cuModuleEnumerateFunctions_params;
+
+typedef struct cuModuleGetGlobal_v2_params_st {
+    CUdeviceptr *dptr;
+    size_t *bytes;
+    CUmodule hmod;
+    const char *name;
+} cuModuleGetGlobal_v2_params;
+
+typedef struct cuLinkCreate_v2_params_st {
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+    CUlinkState *stateOut;
+} cuLinkCreate_v2_params;
+
+typedef struct cuLinkAddData_v2_params_st {
+    CUlinkState state;
+    CUjitInputType type;
+    void *data;
+    size_t size;
+    const char *name;
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+} cuLinkAddData_v2_params;
+
+typedef struct cuLinkAddFile_v2_params_st {
+    CUlinkState state;
+    CUjitInputType type;
+    const char *path;
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+} cuLinkAddFile_v2_params;
+
+typedef struct cuLinkComplete_params_st {
+    CUlinkState state;
+    void **cubinOut;
+    size_t *sizeOut;
+} cuLinkComplete_params;
+
+typedef struct cuLinkDestroy_params_st {
+    CUlinkState state;
+} cuLinkDestroy_params;
+
+typedef struct cuModuleGetTexRef_params_st {
+    CUtexref *pTexRef;
+    CUmodule hmod;
+    const char *name;
+} cuModuleGetTexRef_params;
+
+typedef struct cuModuleGetSurfRef_params_st {
+    CUsurfref *pSurfRef;
+    CUmodule hmod;
+    const char *name;
+} cuModuleGetSurfRef_params;
+
+typedef struct cuLibraryLoadData_params_st {
+    CUlibrary *library;
+    const void *code;
+    CUjit_option *jitOptions;
+    void **jitOptionsValues;
+    unsigned int numJitOptions;
+    CUlibraryOption *libraryOptions;
+    void **libraryOptionValues;
+    unsigned int numLibraryOptions;
+} cuLibraryLoadData_params;
+
+typedef struct cuLibraryLoadFromFile_params_st {
+    CUlibrary *library;
+    const char *fileName;
+    CUjit_option *jitOptions;
+    void **jitOptionsValues;
+    unsigned int numJitOptions;
+    CUlibraryOption *libraryOptions;
+    void **libraryOptionValues;
+    unsigned int numLibraryOptions;
+} cuLibraryLoadFromFile_params;
+
+typedef struct cuLibraryUnload_params_st {
+    CUlibrary library;
+} cuLibraryUnload_params;
+
+typedef struct cuLibraryGetKernel_params_st {
+    CUkernel *pKernel;
+    CUlibrary library;
+    const char *name;
+} cuLibraryGetKernel_params;
+
+typedef struct cuLibraryGetKernelCount_params_st {
+    unsigned int *count;
+    CUlibrary lib;
+} cuLibraryGetKernelCount_params;
+
+typedef struct cuLibraryEnumerateKernels_params_st {
+    CUkernel *kernels;
+    unsigned int numKernels;
+    CUlibrary lib;
+} cuLibraryEnumerateKernels_params;
+
+typedef struct cuLibraryGetModule_params_st {
+    CUmodule *pMod;
+    CUlibrary library;
+} cuLibraryGetModule_params;
+
+typedef struct cuKernelGetFunction_params_st {
+    CUfunction *pFunc;
+    CUkernel kernel;
+} cuKernelGetFunction_params;
+
+typedef struct cuKernelGetLibrary_params_st {
+    CUlibrary *pLib;
+    CUkernel kernel;
+} cuKernelGetLibrary_params;
+
+typedef struct cuLibraryGetGlobal_params_st {
+    CUdeviceptr *dptr;
+    size_t *bytes;
+    CUlibrary library;
+    const char *name;
+} cuLibraryGetGlobal_params;
+
+typedef struct cuLibraryGetManaged_params_st {
+    CUdeviceptr *dptr;
+    size_t *bytes;
+    CUlibrary library;
+    const char *name;
+} cuLibraryGetManaged_params;
+
+typedef struct cuLibraryGetUnifiedFunction_params_st {
+    void **fptr;
+    CUlibrary library;
+    const char *symbol;
+} cuLibraryGetUnifiedFunction_params;
+
+typedef struct cuKernelGetAttribute_params_st {
+    int *pi;
+    CUfunction_attribute attrib;
+    CUkernel kernel;
+    CUdevice dev;
+} cuKernelGetAttribute_params;
+
+typedef struct cuKernelSetAttribute_params_st {
+    CUfunction_attribute attrib;
+    int val;
+    CUkernel kernel;
+    CUdevice dev;
+} cuKernelSetAttribute_params;
+
+typedef struct cuKernelSetCacheConfig_params_st {
+    CUkernel kernel;
+    CUfunc_cache config;
+    CUdevice dev;
+} cuKernelSetCacheConfig_params;
+
+typedef struct cuKernelGetName_params_st {
+    const char **name;
+    CUkernel hfunc;
+} cuKernelGetName_params;
+
+typedef struct cuKernelGetParamInfo_params_st {
+    CUkernel kernel;
+    size_t paramIndex;
+    size_t *paramOffset;
+    size_t *paramSize;
+} cuKernelGetParamInfo_params;
+
+typedef struct cuMemGetInfo_v2_params_st {
+    size_t *free;
+    size_t *total;
+} cuMemGetInfo_v2_params;
+
+typedef struct cuMemAlloc_v2_params_st {
+    CUdeviceptr *dptr;
+    size_t bytesize;
+} cuMemAlloc_v2_params;
+
+typedef struct cuMemAllocPitch_v2_params_st {
+    CUdeviceptr *dptr;
+    size_t *pPitch;
+    size_t WidthInBytes;
+    size_t Height;
+    unsigned int ElementSizeBytes;
+} cuMemAllocPitch_v2_params;
+
+typedef struct cuMemFree_v2_params_st {
+    CUdeviceptr dptr;
+} cuMemFree_v2_params;
+
+typedef struct cuMemGetAddressRange_v2_params_st {
+    CUdeviceptr *pbase;
+    size_t *psize;
+    CUdeviceptr dptr;
+} cuMemGetAddressRange_v2_params;
+
+typedef struct cuMemAllocHost_v2_params_st {
+    void **pp;
+    size_t bytesize;
+} cuMemAllocHost_v2_params;
+
+typedef struct cuMemFreeHost_params_st {
+    void *p;
+} cuMemFreeHost_params;
+
+typedef struct cuMemHostAlloc_params_st {
+    void **pp;
+    size_t bytesize;
+    unsigned int Flags;
+} cuMemHostAlloc_params;
+
+typedef struct cuMemHostGetDevicePointer_v2_params_st {
+    CUdeviceptr *pdptr;
+    void *p;
+    unsigned int Flags;
+} cuMemHostGetDevicePointer_v2_params;
+
+typedef struct cuMemHostGetFlags_params_st {
+    unsigned int *pFlags;
+    void *p;
+} cuMemHostGetFlags_params;
+
+typedef struct cuMemAllocManaged_params_st {
+    CUdeviceptr *dptr;
+    size_t bytesize;
+    unsigned int flags;
+} cuMemAllocManaged_params;
+
+typedef struct cuDeviceRegisterAsyncNotification_params_st {
+    CUdevice device;
+    CUasyncCallback callbackFunc;
+    void *userData;
+    CUasyncCallbackHandle *callback;
+} cuDeviceRegisterAsyncNotification_params;
+
+typedef struct cuDeviceUnregisterAsyncNotification_params_st {
+    CUdevice device;
+    CUasyncCallbackHandle callback;
+} cuDeviceUnregisterAsyncNotification_params;
+
+typedef struct cuDeviceGetByPCIBusId_params_st {
+    CUdevice *dev;
+    const char *pciBusId;
+} cuDeviceGetByPCIBusId_params;
+
+typedef struct cuDeviceGetPCIBusId_params_st {
+    char *pciBusId;
+    int len;
+    CUdevice dev;
+} cuDeviceGetPCIBusId_params;
+
+typedef struct cuIpcGetEventHandle_params_st {
+    CUipcEventHandle *pHandle;
+    CUevent event;
+} cuIpcGetEventHandle_params;
+
+typedef struct cuIpcOpenEventHandle_params_st {
+    CUevent *phEvent;
+    CUipcEventHandle handle;
+} cuIpcOpenEventHandle_params;
+
+typedef struct cuIpcGetMemHandle_params_st {
+    CUipcMemHandle *pHandle;
+    CUdeviceptr dptr;
+} cuIpcGetMemHandle_params;
+
+typedef struct cuIpcOpenMemHandle_v2_params_st {
+    CUdeviceptr *pdptr;
+    CUipcMemHandle handle;
+    unsigned int Flags;
+} cuIpcOpenMemHandle_v2_params;
+
+typedef struct cuIpcCloseMemHandle_params_st {
+    CUdeviceptr dptr;
+} cuIpcCloseMemHandle_params;
+
+typedef struct cuMemHostRegister_v2_params_st {
+    void *p;
+    size_t bytesize;
+    unsigned int Flags;
+} cuMemHostRegister_v2_params;
+
+typedef struct cuMemHostUnregister_params_st {
+    void *p;
+} cuMemHostUnregister_params;
+
+typedef struct cuMemcpy_ptds_params_st {
+    CUdeviceptr dst;
+    CUdeviceptr src;
+    size_t ByteCount;
+} cuMemcpy_ptds_params;
+
+typedef struct cuMemcpyPeer_ptds_params_st {
+    CUdeviceptr dstDevice;
+    CUcontext dstContext;
+    CUdeviceptr srcDevice;
+    CUcontext srcContext;
+    size_t ByteCount;
+} cuMemcpyPeer_ptds_params;
+
+typedef struct cuMemcpyHtoD_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    const void *srcHost;
+    size_t ByteCount;
+} cuMemcpyHtoD_v2_ptds_params;
+
+typedef struct cuMemcpyDtoH_v2_ptds_params_st {
+    void *dstHost;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoH_v2_ptds_params;
+
+typedef struct cuMemcpyDtoD_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoD_v2_ptds_params;
+
+typedef struct cuMemcpyDtoA_v2_ptds_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoA_v2_ptds_params;
+
+typedef struct cuMemcpyAtoD_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoD_v2_ptds_params;
+
+typedef struct cuMemcpyHtoA_v2_ptds_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    const void *srcHost;
+    size_t ByteCount;
+} cuMemcpyHtoA_v2_ptds_params;
+
+typedef struct cuMemcpyAtoH_v2_ptds_params_st {
+    void *dstHost;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoH_v2_ptds_params;
+
+typedef struct cuMemcpyAtoA_v2_ptds_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoA_v2_ptds_params;
+
+typedef struct cuMemcpy2D_v2_ptds_params_st {
+    const CUDA_MEMCPY2D *pCopy;
+} cuMemcpy2D_v2_ptds_params;
+
+typedef struct cuMemcpy2DUnaligned_v2_ptds_params_st {
+    const CUDA_MEMCPY2D *pCopy;
+} cuMemcpy2DUnaligned_v2_ptds_params;
+
+typedef struct cuMemcpy3D_v2_ptds_params_st {
+    const CUDA_MEMCPY3D *pCopy;
+} cuMemcpy3D_v2_ptds_params;
+
+typedef struct cuMemcpy3DPeer_ptds_params_st {
+    const CUDA_MEMCPY3D_PEER *pCopy;
+} cuMemcpy3DPeer_ptds_params;
+
+typedef struct cuMemcpyAsync_ptsz_params_st {
+    CUdeviceptr dst;
+    CUdeviceptr src;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyAsync_ptsz_params;
+
+typedef struct cuMemcpyPeerAsync_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    CUcontext dstContext;
+    CUdeviceptr srcDevice;
+    CUcontext srcContext;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyPeerAsync_ptsz_params;
+
+typedef struct cuMemcpyHtoDAsync_v2_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    const void *srcHost;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoDAsync_v2_ptsz_params;
+
+typedef struct cuMemcpyDtoHAsync_v2_ptsz_params_st {
+    void *dstHost;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoHAsync_v2_ptsz_params;
+
+typedef struct cuMemcpyDtoDAsync_v2_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoDAsync_v2_ptsz_params;
+
+typedef struct cuMemcpyHtoAAsync_v2_ptsz_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    const void *srcHost;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoAAsync_v2_ptsz_params;
+
+typedef struct cuMemcpyAtoHAsync_v2_ptsz_params_st {
+    void *dstHost;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyAtoHAsync_v2_ptsz_params;
+
+typedef struct cuMemcpy2DAsync_v2_ptsz_params_st {
+    const CUDA_MEMCPY2D *pCopy;
+    CUstream hStream;
+} cuMemcpy2DAsync_v2_ptsz_params;
+
+typedef struct cuMemcpy3DAsync_v2_ptsz_params_st {
+    const CUDA_MEMCPY3D *pCopy;
+    CUstream hStream;
+} cuMemcpy3DAsync_v2_ptsz_params;
+
+typedef struct cuMemcpy3DPeerAsync_ptsz_params_st {
+    const CUDA_MEMCPY3D_PEER *pCopy;
+    CUstream hStream;
+} cuMemcpy3DPeerAsync_ptsz_params;
+
+typedef struct cuMemcpyBatchAsync_ptsz_params_st {
+    CUdeviceptr *dsts;
+    CUdeviceptr *srcs;
+    size_t *sizes;
+    size_t count;
+    CUmemcpyAttributes *attrs;
+    size_t *attrsIdxs;
+    size_t numAttrs;
+    size_t *failIdx;
+    CUstream hStream;
+} cuMemcpyBatchAsync_ptsz_params;
+
+typedef struct cuMemcpy3DBatchAsync_ptsz_params_st {
+    size_t numOps;
+    CUDA_MEMCPY3D_BATCH_OP *opList;
+    size_t *failIdx;
+    unsigned long long flags;
+    CUstream hStream;
+} cuMemcpy3DBatchAsync_ptsz_params;
+
+typedef struct cuMemsetD8_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    unsigned char uc;
+    size_t N;
+} cuMemsetD8_v2_ptds_params;
+
+typedef struct cuMemsetD16_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    unsigned short us;
+    size_t N;
+} cuMemsetD16_v2_ptds_params;
+
+typedef struct cuMemsetD32_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    unsigned int ui;
+    size_t N;
+} cuMemsetD32_v2_ptds_params;
+
+typedef struct cuMemsetD2D8_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned char uc;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D8_v2_ptds_params;
+
+typedef struct cuMemsetD2D16_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned short us;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D16_v2_ptds_params;
+
+typedef struct cuMemsetD2D32_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned int ui;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D32_v2_ptds_params;
+
+typedef struct cuMemsetD8Async_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    unsigned char uc;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD8Async_ptsz_params;
+
+typedef struct cuMemsetD16Async_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    unsigned short us;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD16Async_ptsz_params;
+
+typedef struct cuMemsetD32Async_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    unsigned int ui;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD32Async_ptsz_params;
+
+typedef struct cuMemsetD2D8Async_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned char uc;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D8Async_ptsz_params;
+
+typedef struct cuMemsetD2D16Async_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned short us;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D16Async_ptsz_params;
+
+typedef struct cuMemsetD2D32Async_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned int ui;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D32Async_ptsz_params;
+
+typedef struct cuArrayCreate_v2_params_st {
+    CUarray *pHandle;
+    const CUDA_ARRAY_DESCRIPTOR *pAllocateArray;
+} cuArrayCreate_v2_params;
+
+typedef struct cuArrayGetDescriptor_v2_params_st {
+    CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor;
+    CUarray hArray;
+} cuArrayGetDescriptor_v2_params;
+
+typedef struct cuArrayGetSparseProperties_params_st {
+    CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties;
+    CUarray array;
+} cuArrayGetSparseProperties_params;
+
+typedef struct cuMipmappedArrayGetSparseProperties_params_st {
+    CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties;
+    CUmipmappedArray mipmap;
+} cuMipmappedArrayGetSparseProperties_params;
+
+typedef struct cuArrayGetMemoryRequirements_params_st {
+    CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements;
+    CUarray array;
+    CUdevice device;
+} cuArrayGetMemoryRequirements_params;
+
+typedef struct cuMipmappedArrayGetMemoryRequirements_params_st {
+    CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements;
+    CUmipmappedArray mipmap;
+    CUdevice device;
+} cuMipmappedArrayGetMemoryRequirements_params;
+
+typedef struct cuArrayGetPlane_params_st {
+    CUarray *pPlaneArray;
+    CUarray hArray;
+    unsigned int planeIdx;
+} cuArrayGetPlane_params;
+
+typedef struct cuArrayDestroy_params_st {
+    CUarray hArray;
+} cuArrayDestroy_params;
+
+typedef struct cuArray3DCreate_v2_params_st {
+    CUarray *pHandle;
+    const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray;
+} cuArray3DCreate_v2_params;
+
+typedef struct cuArray3DGetDescriptor_v2_params_st {
+    CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor;
+    CUarray hArray;
+} cuArray3DGetDescriptor_v2_params;
+
+typedef struct cuMipmappedArrayCreate_params_st {
+    CUmipmappedArray *pHandle;
+    const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc;
+    unsigned int numMipmapLevels;
+} cuMipmappedArrayCreate_params;
+
+typedef struct cuMipmappedArrayGetLevel_params_st {
+    CUarray *pLevelArray;
+    CUmipmappedArray hMipmappedArray;
+    unsigned int level;
+} cuMipmappedArrayGetLevel_params;
+
+typedef struct cuMipmappedArrayDestroy_params_st {
+    CUmipmappedArray hMipmappedArray;
+} cuMipmappedArrayDestroy_params;
+
+typedef struct cuMemGetHandleForAddressRange_params_st {
+    void *handle;
+    CUdeviceptr dptr;
+    size_t size;
+    CUmemRangeHandleType handleType;
+    unsigned long long flags;
+} cuMemGetHandleForAddressRange_params;
+
+typedef struct cuMemBatchDecompressAsync_ptsz_params_st {
+    CUmemDecompressParams *paramsArray;
+    size_t count;
+    unsigned int flags;
+    size_t *errorIndex;
+    CUstream stream;
+} cuMemBatchDecompressAsync_ptsz_params;
+
+typedef struct cuMemAddressReserve_params_st {
+    CUdeviceptr *ptr;
+    size_t size;
+    size_t alignment;
+    CUdeviceptr addr;
+    unsigned long long flags;
+} cuMemAddressReserve_params;
+
+typedef struct cuMemAddressFree_params_st {
+    CUdeviceptr ptr;
+    size_t size;
+} cuMemAddressFree_params;
+
+typedef struct cuMemCreate_params_st {
+    CUmemGenericAllocationHandle *handle;
+    size_t size;
+    const CUmemAllocationProp *prop;
+    unsigned long long flags;
+} cuMemCreate_params;
+
+typedef struct cuMemRelease_params_st {
+    CUmemGenericAllocationHandle handle;
+} cuMemRelease_params;
+
+typedef struct cuMemMap_params_st {
+    CUdeviceptr ptr;
+    size_t size;
+    size_t offset;
+    CUmemGenericAllocationHandle handle;
+    unsigned long long flags;
+} cuMemMap_params;
+
+typedef struct cuMemMapArrayAsync_ptsz_params_st {
+    CUarrayMapInfo *mapInfoList;
+    unsigned int count;
+    CUstream hStream;
+} cuMemMapArrayAsync_ptsz_params;
+
+typedef struct cuMemUnmap_params_st {
+    CUdeviceptr ptr;
+    size_t size;
+} cuMemUnmap_params;
+
+typedef struct cuMemSetAccess_params_st {
+    CUdeviceptr ptr;
+    size_t size;
+    const CUmemAccessDesc *desc;
+    size_t count;
+} cuMemSetAccess_params;
+
+typedef struct cuMemGetAccess_params_st {
+    unsigned long long *flags;
+    const CUmemLocation *location;
+    CUdeviceptr ptr;
+} cuMemGetAccess_params;
+
+typedef struct cuMemExportToShareableHandle_params_st {
+    void *shareableHandle;
+    CUmemGenericAllocationHandle handle;
+    CUmemAllocationHandleType handleType;
+    unsigned long long flags;
+} cuMemExportToShareableHandle_params;
+
+typedef struct cuMemImportFromShareableHandle_params_st {
+    CUmemGenericAllocationHandle *handle;
+    void *osHandle;
+    CUmemAllocationHandleType shHandleType;
+} cuMemImportFromShareableHandle_params;
+
+typedef struct cuMemGetAllocationGranularity_params_st {
+    size_t *granularity;
+    const CUmemAllocationProp *prop;
+    CUmemAllocationGranularity_flags option;
+} cuMemGetAllocationGranularity_params;
+
+typedef struct cuMemGetAllocationPropertiesFromHandle_params_st {
+    CUmemAllocationProp *prop;
+    CUmemGenericAllocationHandle handle;
+} cuMemGetAllocationPropertiesFromHandle_params;
+
+typedef struct cuMemRetainAllocationHandle_params_st {
+    CUmemGenericAllocationHandle *handle;
+    void *addr;
+} cuMemRetainAllocationHandle_params;
+
+typedef struct cuMemFreeAsync_ptsz_params_st {
+    CUdeviceptr dptr;
+    CUstream hStream;
+} cuMemFreeAsync_ptsz_params;
+
+typedef struct cuMemAllocAsync_ptsz_params_st {
+    CUdeviceptr *dptr;
+    size_t bytesize;
+    CUstream hStream;
+} cuMemAllocAsync_ptsz_params;
+
+typedef struct cuMemPoolTrimTo_params_st {
+    CUmemoryPool pool;
+    size_t minBytesToKeep;
+} cuMemPoolTrimTo_params;
+
+typedef struct cuMemPoolSetAttribute_params_st {
+    CUmemoryPool pool;
+    CUmemPool_attribute attr;
+    void *value;
+} cuMemPoolSetAttribute_params;
+
+typedef struct cuMemPoolGetAttribute_params_st {
+    CUmemoryPool pool;
+    CUmemPool_attribute attr;
+    void *value;
+} cuMemPoolGetAttribute_params;
+
+typedef struct cuMemPoolSetAccess_params_st {
+    CUmemoryPool pool;
+    const CUmemAccessDesc *map;
+    size_t count;
+} cuMemPoolSetAccess_params;
+
+typedef struct cuMemPoolGetAccess_params_st {
+    CUmemAccess_flags *flags;
+    CUmemoryPool memPool;
+    CUmemLocation *location;
+} cuMemPoolGetAccess_params;
+
+typedef struct cuMemPoolCreate_params_st {
+    CUmemoryPool *pool;
+    const CUmemPoolProps *poolProps;
+} cuMemPoolCreate_params;
+
+typedef struct cuMemPoolDestroy_params_st {
+    CUmemoryPool pool;
+} cuMemPoolDestroy_params;
+
+typedef struct cuMemAllocFromPoolAsync_ptsz_params_st {
+    CUdeviceptr *dptr;
+    size_t bytesize;
+    CUmemoryPool pool;
+    CUstream hStream;
+} cuMemAllocFromPoolAsync_ptsz_params;
+
+typedef struct cuMemPoolExportToShareableHandle_params_st {
+    void *handle_out;
+    CUmemoryPool pool;
+    CUmemAllocationHandleType handleType;
+    unsigned long long flags;
+} cuMemPoolExportToShareableHandle_params;
+
+typedef struct cuMemPoolImportFromShareableHandle_params_st {
+    CUmemoryPool *pool_out;
+    void *handle;
+    CUmemAllocationHandleType handleType;
+    unsigned long long flags;
+} cuMemPoolImportFromShareableHandle_params;
+
+typedef struct cuMemPoolExportPointer_params_st {
+    CUmemPoolPtrExportData *shareData_out;
+    CUdeviceptr ptr;
+} cuMemPoolExportPointer_params;
+
+typedef struct cuMemPoolImportPointer_params_st {
+    CUdeviceptr *ptr_out;
+    CUmemoryPool pool;
+    CUmemPoolPtrExportData *shareData;
+} cuMemPoolImportPointer_params;
+
+typedef struct cuMulticastCreate_params_st {
+    CUmemGenericAllocationHandle *mcHandle;
+    const CUmulticastObjectProp *prop;
+} cuMulticastCreate_params;
+
+typedef struct cuMulticastAddDevice_params_st {
+    CUmemGenericAllocationHandle mcHandle;
+    CUdevice dev;
+} cuMulticastAddDevice_params;
+
+typedef struct cuMulticastBindMem_params_st {
+    CUmemGenericAllocationHandle mcHandle;
+    size_t mcOffset;
+    CUmemGenericAllocationHandle memHandle;
+    size_t memOffset;
+    size_t size;
+    unsigned long long flags;
+} cuMulticastBindMem_params;
+
+typedef struct cuMulticastBindAddr_params_st {
+    CUmemGenericAllocationHandle mcHandle;
+    size_t mcOffset;
+    CUdeviceptr memptr;
+    size_t size;
+    unsigned long long flags;
+} cuMulticastBindAddr_params;
+
+typedef struct cuMulticastUnbind_params_st {
+    CUmemGenericAllocationHandle mcHandle;
+    CUdevice dev;
+    size_t mcOffset;
+    size_t size;
+} cuMulticastUnbind_params;
+
+typedef struct cuMulticastGetGranularity_params_st {
+    size_t *granularity;
+    const CUmulticastObjectProp *prop;
+    CUmulticastGranularity_flags option;
+} cuMulticastGetGranularity_params;
+
+typedef struct cuPointerGetAttribute_params_st {
+    void *data;
+    CUpointer_attribute attribute;
+    CUdeviceptr ptr;
+} cuPointerGetAttribute_params;
+
+typedef struct cuMemPrefetchAsync_ptsz_params_st {
+    CUdeviceptr devPtr;
+    size_t count;
+    CUdevice dstDevice;
+    CUstream hStream;
+} cuMemPrefetchAsync_ptsz_params;
+
+typedef struct cuMemPrefetchAsync_v2_ptsz_params_st {
+    CUdeviceptr devPtr;
+    size_t count;
+    CUmemLocation location;
+    unsigned int flags;
+    CUstream hStream;
+} cuMemPrefetchAsync_v2_ptsz_params;
+
+typedef struct cuMemAdvise_params_st {
+    CUdeviceptr devPtr;
+    size_t count;
+    CUmem_advise advice;
+    CUdevice device;
+} cuMemAdvise_params;
+
+typedef struct cuMemAdvise_v2_params_st {
+    CUdeviceptr devPtr;
+    size_t count;
+    CUmem_advise advice;
+    CUmemLocation location;
+} cuMemAdvise_v2_params;
+
+typedef struct cuMemRangeGetAttribute_params_st {
+    void *data;
+    size_t dataSize;
+    CUmem_range_attribute attribute;
+    CUdeviceptr devPtr;
+    size_t count;
+} cuMemRangeGetAttribute_params;
+
+typedef struct cuMemRangeGetAttributes_params_st {
+    void **data;
+    size_t *dataSizes;
+    CUmem_range_attribute *attributes;
+    size_t numAttributes;
+    CUdeviceptr devPtr;
+    size_t count;
+} cuMemRangeGetAttributes_params;
+
+typedef struct cuPointerSetAttribute_params_st {
+    const void *value;
+    CUpointer_attribute attribute;
+    CUdeviceptr ptr;
+} cuPointerSetAttribute_params;
+
+typedef struct cuPointerGetAttributes_params_st {
+    unsigned int numAttributes;
+    CUpointer_attribute *attributes;
+    void **data;
+    CUdeviceptr ptr;
+} cuPointerGetAttributes_params;
+
+typedef struct cuStreamCreate_params_st {
+    CUstream *phStream;
+    unsigned int Flags;
+} cuStreamCreate_params;
+
+typedef struct cuStreamCreateWithPriority_params_st {
+    CUstream *phStream;
+    unsigned int flags;
+    int priority;
+} cuStreamCreateWithPriority_params;
+
+typedef struct cuStreamGetPriority_ptsz_params_st {
+    CUstream hStream;
+    int *priority;
+} cuStreamGetPriority_ptsz_params;
+
+typedef struct cuStreamGetDevice_ptsz_params_st {
+    CUstream hStream;
+    CUdevice *device;
+} cuStreamGetDevice_ptsz_params;
+
+typedef struct cuStreamGetFlags_ptsz_params_st {
+    CUstream hStream;
+    unsigned int *flags;
+} cuStreamGetFlags_ptsz_params;
+
+typedef struct cuStreamGetId_ptsz_params_st {
+    CUstream hStream;
+    unsigned long long *streamId;
+} cuStreamGetId_ptsz_params;
+
+typedef struct cuStreamGetCtx_ptsz_params_st {
+    CUstream hStream;
+    CUcontext *pctx;
+} cuStreamGetCtx_ptsz_params;
+
+typedef struct cuStreamGetCtx_v2_ptsz_params_st {
+    CUstream hStream;
+    CUcontext *pCtx;
+    CUgreenCtx *pGreenCtx;
+} cuStreamGetCtx_v2_ptsz_params;
+
+typedef struct cuStreamWaitEvent_ptsz_params_st {
+    CUstream hStream;
+    CUevent hEvent;
+    unsigned int Flags;
+} cuStreamWaitEvent_ptsz_params;
+
+typedef struct cuStreamAddCallback_ptsz_params_st {
+    CUstream hStream;
+    CUstreamCallback callback;
+    void *userData;
+    unsigned int flags;
+} cuStreamAddCallback_ptsz_params;
+
+typedef struct cuStreamBeginCapture_v2_ptsz_params_st {
+    CUstream hStream;
+    CUstreamCaptureMode mode;
+} cuStreamBeginCapture_v2_ptsz_params;
+
+typedef struct cuStreamBeginCaptureToGraph_ptsz_params_st {
+    CUstream hStream;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    const CUgraphEdgeData *dependencyData;
+    size_t numDependencies;
+    CUstreamCaptureMode mode;
+} cuStreamBeginCaptureToGraph_ptsz_params;
+
+typedef struct cuThreadExchangeStreamCaptureMode_params_st {
+    CUstreamCaptureMode *mode;
+} cuThreadExchangeStreamCaptureMode_params;
+
+typedef struct cuStreamEndCapture_ptsz_params_st {
+    CUstream hStream;
+    CUgraph *phGraph;
+} cuStreamEndCapture_ptsz_params;
+
+typedef struct cuStreamIsCapturing_ptsz_params_st {
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus;
+} cuStreamIsCapturing_ptsz_params;
+
+typedef struct cuStreamGetCaptureInfo_v2_ptsz_params_st {
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus_out;
+    cuuint64_t *id_out;
+    CUgraph *graph_out;
+    const CUgraphNode **dependencies_out;
+    size_t *numDependencies_out;
+} cuStreamGetCaptureInfo_v2_ptsz_params;
+
+typedef struct cuStreamGetCaptureInfo_v3_ptsz_params_st {
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus_out;
+    cuuint64_t *id_out;
+    CUgraph *graph_out;
+    const CUgraphNode **dependencies_out;
+    const CUgraphEdgeData **edgeData_out;
+    size_t *numDependencies_out;
+} cuStreamGetCaptureInfo_v3_ptsz_params;
+
+typedef struct cuStreamUpdateCaptureDependencies_ptsz_params_st {
+    CUstream hStream;
+    CUgraphNode *dependencies;
+    size_t numDependencies;
+    unsigned int flags;
+} cuStreamUpdateCaptureDependencies_ptsz_params;
+
+typedef struct cuStreamUpdateCaptureDependencies_v2_ptsz_params_st {
+    CUstream hStream;
+    CUgraphNode *dependencies;
+    const CUgraphEdgeData *dependencyData;
+    size_t numDependencies;
+    unsigned int flags;
+} cuStreamUpdateCaptureDependencies_v2_ptsz_params;
+
+typedef struct cuStreamAttachMemAsync_ptsz_params_st {
+    CUstream hStream;
+    CUdeviceptr dptr;
+    size_t length;
+    unsigned int flags;
+} cuStreamAttachMemAsync_ptsz_params;
+
+typedef struct cuStreamQuery_ptsz_params_st {
+    CUstream hStream;
+} cuStreamQuery_ptsz_params;
+
+typedef struct cuStreamSynchronize_ptsz_params_st {
+    CUstream hStream;
+} cuStreamSynchronize_ptsz_params;
+
+typedef struct cuStreamDestroy_v2_params_st {
+    CUstream hStream;
+} cuStreamDestroy_v2_params;
+
+typedef struct cuStreamCopyAttributes_ptsz_params_st {
+    CUstream dst;
+    CUstream src;
+} cuStreamCopyAttributes_ptsz_params;
+
+typedef struct cuStreamGetAttribute_ptsz_params_st {
+    CUstream hStream;
+    CUstreamAttrID attr;
+    CUstreamAttrValue *value_out;
+} cuStreamGetAttribute_ptsz_params;
+
+typedef struct cuStreamSetAttribute_ptsz_params_st {
+    CUstream hStream;
+    CUstreamAttrID attr;
+    const CUstreamAttrValue *value;
+} cuStreamSetAttribute_ptsz_params;
+
+typedef struct cuEventCreate_params_st {
+    CUevent *phEvent;
+    unsigned int Flags;
+} cuEventCreate_params;
+
+typedef struct cuEventRecord_ptsz_params_st {
+    CUevent hEvent;
+    CUstream hStream;
+} cuEventRecord_ptsz_params;
+
+typedef struct cuEventRecordWithFlags_ptsz_params_st {
+    CUevent hEvent;
+    CUstream hStream;
+    unsigned int flags;
+} cuEventRecordWithFlags_ptsz_params;
+
+typedef struct cuEventQuery_params_st {
+    CUevent hEvent;
+} cuEventQuery_params;
+
+typedef struct cuEventSynchronize_params_st {
+    CUevent hEvent;
+} cuEventSynchronize_params;
+
+typedef struct cuEventDestroy_v2_params_st {
+    CUevent hEvent;
+} cuEventDestroy_v2_params;
+
+typedef struct cuEventElapsedTime_params_st {
+    float *pMilliseconds;
+    CUevent hStart;
+    CUevent hEnd;
+} cuEventElapsedTime_params;
+
+typedef struct cuEventElapsedTime_v2_params_st {
+    float *pMilliseconds;
+    CUevent hStart;
+    CUevent hEnd;
+} cuEventElapsedTime_v2_params;
+
+typedef struct cuImportExternalMemory_params_st {
+    CUexternalMemory *extMem_out;
+    const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc;
+} cuImportExternalMemory_params;
+
+typedef struct cuExternalMemoryGetMappedBuffer_params_st {
+    CUdeviceptr *devPtr;
+    CUexternalMemory extMem;
+    const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc;
+} cuExternalMemoryGetMappedBuffer_params;
+
+typedef struct cuExternalMemoryGetMappedMipmappedArray_params_st {
+    CUmipmappedArray *mipmap;
+    CUexternalMemory extMem;
+    const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc;
+} cuExternalMemoryGetMappedMipmappedArray_params;
+
+typedef struct cuDestroyExternalMemory_params_st {
+    CUexternalMemory extMem;
+} cuDestroyExternalMemory_params;
+
+typedef struct cuImportExternalSemaphore_params_st {
+    CUexternalSemaphore *extSem_out;
+    const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc;
+} cuImportExternalSemaphore_params;
+
+typedef struct cuSignalExternalSemaphoresAsync_ptsz_params_st {
+    const CUexternalSemaphore *extSemArray;
+    const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray;
+    unsigned int numExtSems;
+    CUstream stream;
+} cuSignalExternalSemaphoresAsync_ptsz_params;
+
+typedef struct cuWaitExternalSemaphoresAsync_ptsz_params_st {
+    const CUexternalSemaphore *extSemArray;
+    const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray;
+    unsigned int numExtSems;
+    CUstream stream;
+} cuWaitExternalSemaphoresAsync_ptsz_params;
+
+typedef struct cuDestroyExternalSemaphore_params_st {
+    CUexternalSemaphore extSem;
+} cuDestroyExternalSemaphore_params;
+
+typedef struct cuStreamWaitValue32_v2_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWaitValue32_v2_ptsz_params;
+
+typedef struct cuStreamWaitValue64_v2_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWaitValue64_v2_ptsz_params;
+
+typedef struct cuStreamWriteValue32_v2_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWriteValue32_v2_ptsz_params;
+
+typedef struct cuStreamWriteValue64_v2_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWriteValue64_v2_ptsz_params;
+
+typedef struct cuStreamBatchMemOp_v2_ptsz_params_st {
+    CUstream stream;
+    unsigned int count;
+    CUstreamBatchMemOpParams *paramArray;
+    unsigned int flags;
+} cuStreamBatchMemOp_v2_ptsz_params;
+
+typedef struct cuFuncGetAttribute_params_st {
+    int *pi;
+    CUfunction_attribute attrib;
+    CUfunction hfunc;
+} cuFuncGetAttribute_params;
+
+typedef struct cuFuncSetAttribute_params_st {
+    CUfunction hfunc;
+    CUfunction_attribute attrib;
+    int value;
+} cuFuncSetAttribute_params;
+
+typedef struct cuFuncSetCacheConfig_params_st {
+    CUfunction hfunc;
+    CUfunc_cache config;
+} cuFuncSetCacheConfig_params;
+
+typedef struct cuFuncGetModule_params_st {
+    CUmodule *hmod;
+    CUfunction hfunc;
+} cuFuncGetModule_params;
+
+typedef struct cuFuncGetName_params_st {
+    const char **name;
+    CUfunction hfunc;
+} cuFuncGetName_params;
+
+typedef struct cuFuncGetParamInfo_params_st {
+    CUfunction func;
+    size_t paramIndex;
+    size_t *paramOffset;
+    size_t *paramSize;
+} cuFuncGetParamInfo_params;
+
+typedef struct cuFuncIsLoaded_params_st {
+    CUfunctionLoadingState *state;
+    CUfunction function;
+} cuFuncIsLoaded_params;
+
+typedef struct cuFuncLoad_params_st {
+    CUfunction function;
+} cuFuncLoad_params;
+
+typedef struct cuLaunchKernel_ptsz_params_st {
+    CUfunction f;
+    unsigned int gridDimX;
+    unsigned int gridDimY;
+    unsigned int gridDimZ;
+    unsigned int blockDimX;
+    unsigned int blockDimY;
+    unsigned int blockDimZ;
+    unsigned int sharedMemBytes;
+    CUstream hStream;
+    void **kernelParams;
+    void **extra;
+} cuLaunchKernel_ptsz_params;
+
+typedef struct cuLaunchKernelEx_ptsz_params_st {
+    const CUlaunchConfig *config;
+    CUfunction f;
+    void **kernelParams;
+    void **extra;
+} cuLaunchKernelEx_ptsz_params;
+
+typedef struct cuLaunchCooperativeKernel_ptsz_params_st {
+    CUfunction f;
+    unsigned int gridDimX;
+    unsigned int gridDimY;
+    unsigned int gridDimZ;
+    unsigned int blockDimX;
+    unsigned int blockDimY;
+    unsigned int blockDimZ;
+    unsigned int sharedMemBytes;
+    CUstream hStream;
+    void **kernelParams;
+} cuLaunchCooperativeKernel_ptsz_params;
+
+typedef struct cuLaunchCooperativeKernelMultiDevice_params_st {
+    CUDA_LAUNCH_PARAMS *launchParamsList;
+    unsigned int numDevices;
+    unsigned int flags;
+} cuLaunchCooperativeKernelMultiDevice_params;
+
+typedef struct cuLaunchHostFunc_ptsz_params_st {
+    CUstream hStream;
+    CUhostFn fn;
+    void *userData;
+} cuLaunchHostFunc_ptsz_params;
+
+typedef struct cuFuncSetBlockShape_params_st {
+    CUfunction hfunc;
+    int x;
+    int y;
+    int z;
+} cuFuncSetBlockShape_params;
+
+typedef struct cuFuncSetSharedSize_params_st {
+    CUfunction hfunc;
+    unsigned int bytes;
+} cuFuncSetSharedSize_params;
+
+typedef struct cuParamSetSize_params_st {
+    CUfunction hfunc;
+    unsigned int numbytes;
+} cuParamSetSize_params;
+
+typedef struct cuParamSeti_params_st {
+    CUfunction hfunc;
+    int offset;
+    unsigned int value;
+} cuParamSeti_params;
+
+typedef struct cuParamSetf_params_st {
+    CUfunction hfunc;
+    int offset;
+    float value;
+} cuParamSetf_params;
+
+typedef struct cuParamSetv_params_st {
+    CUfunction hfunc;
+    int offset;
+    void *ptr;
+    unsigned int numbytes;
+} cuParamSetv_params;
+
+typedef struct cuLaunch_params_st {
+    CUfunction f;
+} cuLaunch_params;
+
+typedef struct cuLaunchGrid_params_st {
+    CUfunction f;
+    int grid_width;
+    int grid_height;
+} cuLaunchGrid_params;
+
+typedef struct cuLaunchGridAsync_params_st {
+    CUfunction f;
+    int grid_width;
+    int grid_height;
+    CUstream hStream;
+} cuLaunchGridAsync_params;
+
+typedef struct cuParamSetTexRef_params_st {
+    CUfunction hfunc;
+    int texunit;
+    CUtexref hTexRef;
+} cuParamSetTexRef_params;
+
+typedef struct cuFuncSetSharedMemConfig_params_st {
+    CUfunction hfunc;
+    CUsharedconfig config;
+} cuFuncSetSharedMemConfig_params;
+
+typedef struct cuGraphCreate_params_st {
+    CUgraph *phGraph;
+    unsigned int flags;
+} cuGraphCreate_params;
+
+typedef struct cuGraphAddKernelNode_v2_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_KERNEL_NODE_PARAMS *nodeParams;
+} cuGraphAddKernelNode_v2_params;
+
+typedef struct cuGraphKernelNodeGetParams_v2_params_st {
+    CUgraphNode hNode;
+    CUDA_KERNEL_NODE_PARAMS *nodeParams;
+} cuGraphKernelNodeGetParams_v2_params;
+
+typedef struct cuGraphKernelNodeSetParams_v2_params_st {
+    CUgraphNode hNode;
+    const CUDA_KERNEL_NODE_PARAMS *nodeParams;
+} cuGraphKernelNodeSetParams_v2_params;
+
+typedef struct cuGraphAddMemcpyNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_MEMCPY3D *copyParams;
+    CUcontext ctx;
+} cuGraphAddMemcpyNode_params;
+
+typedef struct cuGraphMemcpyNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_MEMCPY3D *nodeParams;
+} cuGraphMemcpyNodeGetParams_params;
+
+typedef struct cuGraphMemcpyNodeSetParams_params_st {
+    CUgraphNode hNode;
+    const CUDA_MEMCPY3D *nodeParams;
+} cuGraphMemcpyNodeSetParams_params;
+
+typedef struct cuGraphAddMemsetNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_MEMSET_NODE_PARAMS *memsetParams;
+    CUcontext ctx;
+} cuGraphAddMemsetNode_params;
+
+typedef struct cuGraphMemsetNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_MEMSET_NODE_PARAMS *nodeParams;
+} cuGraphMemsetNodeGetParams_params;
+
+typedef struct cuGraphMemsetNodeSetParams_params_st {
+    CUgraphNode hNode;
+    const CUDA_MEMSET_NODE_PARAMS *nodeParams;
+} cuGraphMemsetNodeSetParams_params;
+
+typedef struct cuGraphAddHostNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_HOST_NODE_PARAMS *nodeParams;
+} cuGraphAddHostNode_params;
+
+typedef struct cuGraphHostNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_HOST_NODE_PARAMS *nodeParams;
+} cuGraphHostNodeGetParams_params;
+
+typedef struct cuGraphHostNodeSetParams_params_st {
+    CUgraphNode hNode;
+    const CUDA_HOST_NODE_PARAMS *nodeParams;
+} cuGraphHostNodeSetParams_params;
+
+typedef struct cuGraphAddChildGraphNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    CUgraph childGraph;
+} cuGraphAddChildGraphNode_params;
+
+typedef struct cuGraphChildGraphNodeGetGraph_params_st {
+    CUgraphNode hNode;
+    CUgraph *phGraph;
+} cuGraphChildGraphNodeGetGraph_params;
+
+typedef struct cuGraphAddEmptyNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+} cuGraphAddEmptyNode_params;
+
+typedef struct cuGraphAddEventRecordNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    CUevent event;
+} cuGraphAddEventRecordNode_params;
+
+typedef struct cuGraphEventRecordNodeGetEvent_params_st {
+    CUgraphNode hNode;
+    CUevent *event_out;
+} cuGraphEventRecordNodeGetEvent_params;
+
+typedef struct cuGraphEventRecordNodeSetEvent_params_st {
+    CUgraphNode hNode;
+    CUevent event;
+} cuGraphEventRecordNodeSetEvent_params;
+
+typedef struct cuGraphAddEventWaitNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    CUevent event;
+} cuGraphAddEventWaitNode_params;
+
+typedef struct cuGraphEventWaitNodeGetEvent_params_st {
+    CUgraphNode hNode;
+    CUevent *event_out;
+} cuGraphEventWaitNodeGetEvent_params;
+
+typedef struct cuGraphEventWaitNodeSetEvent_params_st {
+    CUgraphNode hNode;
+    CUevent event;
+} cuGraphEventWaitNodeSetEvent_params;
+
+typedef struct cuGraphAddExternalSemaphoresSignalNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams;
+} cuGraphAddExternalSemaphoresSignalNode_params;
+
+typedef struct cuGraphExternalSemaphoresSignalNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *params_out;
+} cuGraphExternalSemaphoresSignalNodeGetParams_params;
+
+typedef struct cuGraphExternalSemaphoresSignalNodeSetParams_params_st {
+    CUgraphNode hNode;
+    const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams;
+} cuGraphExternalSemaphoresSignalNodeSetParams_params;
+
+typedef struct cuGraphAddExternalSemaphoresWaitNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams;
+} cuGraphAddExternalSemaphoresWaitNode_params;
+
+typedef struct cuGraphExternalSemaphoresWaitNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_EXT_SEM_WAIT_NODE_PARAMS *params_out;
+} cuGraphExternalSemaphoresWaitNodeGetParams_params;
+
+typedef struct cuGraphExternalSemaphoresWaitNodeSetParams_params_st {
+    CUgraphNode hNode;
+    const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams;
+} cuGraphExternalSemaphoresWaitNodeSetParams_params;
+
+typedef struct cuGraphAddBatchMemOpNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams;
+} cuGraphAddBatchMemOpNode_params;
+
+typedef struct cuGraphBatchMemOpNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams_out;
+} cuGraphBatchMemOpNodeGetParams_params;
+
+typedef struct cuGraphBatchMemOpNodeSetParams_params_st {
+    CUgraphNode hNode;
+    const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams;
+} cuGraphBatchMemOpNodeSetParams_params;
+
+typedef struct cuGraphExecBatchMemOpNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams;
+} cuGraphExecBatchMemOpNodeSetParams_params;
+
+typedef struct cuGraphAddMemAllocNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams;
+} cuGraphAddMemAllocNode_params;
+
+typedef struct cuGraphMemAllocNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_MEM_ALLOC_NODE_PARAMS *params_out;
+} cuGraphMemAllocNodeGetParams_params;
+
+typedef struct cuGraphAddMemFreeNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    CUdeviceptr dptr;
+} cuGraphAddMemFreeNode_params;
+
+typedef struct cuGraphMemFreeNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUdeviceptr *dptr_out;
+} cuGraphMemFreeNodeGetParams_params;
+
+typedef struct cuDeviceGraphMemTrim_params_st {
+    CUdevice device;
+} cuDeviceGraphMemTrim_params;
+
+typedef struct cuDeviceGetGraphMemAttribute_params_st {
+    CUdevice device;
+    CUgraphMem_attribute attr;
+    void *value;
+} cuDeviceGetGraphMemAttribute_params;
+
+typedef struct cuDeviceSetGraphMemAttribute_params_st {
+    CUdevice device;
+    CUgraphMem_attribute attr;
+    void *value;
+} cuDeviceSetGraphMemAttribute_params;
+
+typedef struct cuGraphClone_params_st {
+    CUgraph *phGraphClone;
+    CUgraph originalGraph;
+} cuGraphClone_params;
+
+typedef struct cuGraphNodeFindInClone_params_st {
+    CUgraphNode *phNode;
+    CUgraphNode hOriginalNode;
+    CUgraph hClonedGraph;
+} cuGraphNodeFindInClone_params;
+
+typedef struct cuGraphNodeGetType_params_st {
+    CUgraphNode hNode;
+    CUgraphNodeType *type;
+} cuGraphNodeGetType_params;
+
+typedef struct cuGraphGetNodes_params_st {
+    CUgraph hGraph;
+    CUgraphNode *nodes;
+    size_t *numNodes;
+} cuGraphGetNodes_params;
+
+typedef struct cuGraphGetRootNodes_params_st {
+    CUgraph hGraph;
+    CUgraphNode *rootNodes;
+    size_t *numRootNodes;
+} cuGraphGetRootNodes_params;
+
+typedef struct cuGraphGetEdges_params_st {
+    CUgraph hGraph;
+    CUgraphNode *from;
+    CUgraphNode *to;
+    size_t *numEdges;
+} cuGraphGetEdges_params;
+
+typedef struct cuGraphGetEdges_v2_params_st {
+    CUgraph hGraph;
+    CUgraphNode *from;
+    CUgraphNode *to;
+    CUgraphEdgeData *edgeData;
+    size_t *numEdges;
+} cuGraphGetEdges_v2_params;
+
+typedef struct cuGraphNodeGetDependencies_params_st {
+    CUgraphNode hNode;
+    CUgraphNode *dependencies;
+    size_t *numDependencies;
+} cuGraphNodeGetDependencies_params;
+
+typedef struct cuGraphNodeGetDependencies_v2_params_st {
+    CUgraphNode hNode;
+    CUgraphNode *dependencies;
+    CUgraphEdgeData *edgeData;
+    size_t *numDependencies;
+} cuGraphNodeGetDependencies_v2_params;
+
+typedef struct cuGraphNodeGetDependentNodes_params_st {
+    CUgraphNode hNode;
+    CUgraphNode *dependentNodes;
+    size_t *numDependentNodes;
+} cuGraphNodeGetDependentNodes_params;
+
+typedef struct cuGraphNodeGetDependentNodes_v2_params_st {
+    CUgraphNode hNode;
+    CUgraphNode *dependentNodes;
+    CUgraphEdgeData *edgeData;
+    size_t *numDependentNodes;
+} cuGraphNodeGetDependentNodes_v2_params;
+
+typedef struct cuGraphAddDependencies_params_st {
+    CUgraph hGraph;
+    const CUgraphNode *from;
+    const CUgraphNode *to;
+    size_t numDependencies;
+} cuGraphAddDependencies_params;
+
+typedef struct cuGraphAddDependencies_v2_params_st {
+    CUgraph hGraph;
+    const CUgraphNode *from;
+    const CUgraphNode *to;
+    const CUgraphEdgeData *edgeData;
+    size_t numDependencies;
+} cuGraphAddDependencies_v2_params;
+
+typedef struct cuGraphRemoveDependencies_params_st {
+    CUgraph hGraph;
+    const CUgraphNode *from;
+    const CUgraphNode *to;
+    size_t numDependencies;
+} cuGraphRemoveDependencies_params;
+
+typedef struct cuGraphRemoveDependencies_v2_params_st {
+    CUgraph hGraph;
+    const CUgraphNode *from;
+    const CUgraphNode *to;
+    const CUgraphEdgeData *edgeData;
+    size_t numDependencies;
+} cuGraphRemoveDependencies_v2_params;
+
+typedef struct cuGraphDestroyNode_params_st {
+    CUgraphNode hNode;
+} cuGraphDestroyNode_params;
+
+typedef struct cuGraphInstantiateWithFlags_params_st {
+    CUgraphExec *phGraphExec;
+    CUgraph hGraph;
+    unsigned long long flags;
+} cuGraphInstantiateWithFlags_params;
+
+typedef struct cuGraphInstantiateWithParams_ptsz_params_st {
+    CUgraphExec *phGraphExec;
+    CUgraph hGraph;
+    CUDA_GRAPH_INSTANTIATE_PARAMS *instantiateParams;
+} cuGraphInstantiateWithParams_ptsz_params;
+
+typedef struct cuGraphExecGetFlags_params_st {
+    CUgraphExec hGraphExec;
+    cuuint64_t *flags;
+} cuGraphExecGetFlags_params;
+
+typedef struct cuGraphExecKernelNodeSetParams_v2_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_KERNEL_NODE_PARAMS *nodeParams;
+} cuGraphExecKernelNodeSetParams_v2_params;
+
+typedef struct cuGraphExecMemcpyNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_MEMCPY3D *copyParams;
+    CUcontext ctx;
+} cuGraphExecMemcpyNodeSetParams_params;
+
+typedef struct cuGraphExecMemsetNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_MEMSET_NODE_PARAMS *memsetParams;
+    CUcontext ctx;
+} cuGraphExecMemsetNodeSetParams_params;
+
+typedef struct cuGraphExecHostNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_HOST_NODE_PARAMS *nodeParams;
+} cuGraphExecHostNodeSetParams_params;
+
+typedef struct cuGraphExecChildGraphNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    CUgraph childGraph;
+} cuGraphExecChildGraphNodeSetParams_params;
+
+typedef struct cuGraphExecEventRecordNodeSetEvent_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    CUevent event;
+} cuGraphExecEventRecordNodeSetEvent_params;
+
+typedef struct cuGraphExecEventWaitNodeSetEvent_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    CUevent event;
+} cuGraphExecEventWaitNodeSetEvent_params;
+
+typedef struct cuGraphExecExternalSemaphoresSignalNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams;
+} cuGraphExecExternalSemaphoresSignalNodeSetParams_params;
+
+typedef struct cuGraphExecExternalSemaphoresWaitNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams;
+} cuGraphExecExternalSemaphoresWaitNodeSetParams_params;
+
+typedef struct cuGraphNodeSetEnabled_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    unsigned int isEnabled;
+} cuGraphNodeSetEnabled_params;
+
+typedef struct cuGraphNodeGetEnabled_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    unsigned int *isEnabled;
+} cuGraphNodeGetEnabled_params;
+
+typedef struct cuGraphUpload_ptsz_params_st {
+    CUgraphExec hGraphExec;
+    CUstream hStream;
+} cuGraphUpload_ptsz_params;
+
+typedef struct cuGraphLaunch_ptsz_params_st {
+    CUgraphExec hGraphExec;
+    CUstream hStream;
+} cuGraphLaunch_ptsz_params;
+
+typedef struct cuGraphExecDestroy_params_st {
+    CUgraphExec hGraphExec;
+} cuGraphExecDestroy_params;
+
+typedef struct cuGraphDestroy_params_st {
+    CUgraph hGraph;
+} cuGraphDestroy_params;
+
+typedef struct cuGraphExecUpdate_v2_params_st {
+    CUgraphExec hGraphExec;
+    CUgraph hGraph;
+    CUgraphExecUpdateResultInfo *resultInfo;
+} cuGraphExecUpdate_v2_params;
+
+typedef struct cuGraphKernelNodeCopyAttributes_params_st {
+    CUgraphNode dst;
+    CUgraphNode src;
+} cuGraphKernelNodeCopyAttributes_params;
+
+typedef struct cuGraphKernelNodeGetAttribute_params_st {
+    CUgraphNode hNode;
+    CUkernelNodeAttrID attr;
+    CUkernelNodeAttrValue *value_out;
+} cuGraphKernelNodeGetAttribute_params;
+
+typedef struct cuGraphKernelNodeSetAttribute_params_st {
+    CUgraphNode hNode;
+    CUkernelNodeAttrID attr;
+    const CUkernelNodeAttrValue *value;
+} cuGraphKernelNodeSetAttribute_params;
+
+typedef struct cuGraphDebugDotPrint_params_st {
+    CUgraph hGraph;
+    const char *path;
+    unsigned int flags;
+} cuGraphDebugDotPrint_params;
+
+typedef struct cuUserObjectCreate_params_st {
+    CUuserObject *object_out;
+    void *ptr;
+    CUhostFn destroy;
+    unsigned int initialRefcount;
+    unsigned int flags;
+} cuUserObjectCreate_params;
+
+typedef struct cuUserObjectRetain_params_st {
+    CUuserObject object;
+    unsigned int count;
+} cuUserObjectRetain_params;
+
+typedef struct cuUserObjectRelease_params_st {
+    CUuserObject object;
+    unsigned int count;
+} cuUserObjectRelease_params;
+
+typedef struct cuGraphRetainUserObject_params_st {
+    CUgraph graph;
+    CUuserObject object;
+    unsigned int count;
+    unsigned int flags;
+} cuGraphRetainUserObject_params;
+
+typedef struct cuGraphReleaseUserObject_params_st {
+    CUgraph graph;
+    CUuserObject object;
+    unsigned int count;
+} cuGraphReleaseUserObject_params;
+
+typedef struct cuGraphAddNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    CUgraphNodeParams *nodeParams;
+} cuGraphAddNode_params;
+
+typedef struct cuGraphAddNode_v2_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    const CUgraphEdgeData *dependencyData;
+    size_t numDependencies;
+    CUgraphNodeParams *nodeParams;
+} cuGraphAddNode_v2_params;
+
+typedef struct cuGraphNodeSetParams_params_st {
+    CUgraphNode hNode;
+    CUgraphNodeParams *nodeParams;
+} cuGraphNodeSetParams_params;
+
+typedef struct cuGraphExecNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    CUgraphNodeParams *nodeParams;
+} cuGraphExecNodeSetParams_params;
+
+typedef struct cuGraphConditionalHandleCreate_params_st {
+    CUgraphConditionalHandle *pHandle_out;
+    CUgraph hGraph;
+    CUcontext ctx;
+    unsigned int defaultLaunchValue;
+    unsigned int flags;
+} cuGraphConditionalHandleCreate_params;
+
+typedef struct cuOccupancyMaxActiveBlocksPerMultiprocessor_params_st {
+    int *numBlocks;
+    CUfunction func;
+    int blockSize;
+    size_t dynamicSMemSize;
+} cuOccupancyMaxActiveBlocksPerMultiprocessor_params;
+
+typedef struct cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_params_st {
+    int *numBlocks;
+    CUfunction func;
+    int blockSize;
+    size_t dynamicSMemSize;
+    unsigned int flags;
+} cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_params;
+
+typedef struct cuOccupancyMaxPotentialBlockSize_params_st {
+    int *minGridSize;
+    int *blockSize;
+    CUfunction func;
+    CUoccupancyB2DSize blockSizeToDynamicSMemSize;
+    size_t dynamicSMemSize;
+    int blockSizeLimit;
+} cuOccupancyMaxPotentialBlockSize_params;
+
+typedef struct cuOccupancyMaxPotentialBlockSizeWithFlags_params_st {
+    int *minGridSize;
+    int *blockSize;
+    CUfunction func;
+    CUoccupancyB2DSize blockSizeToDynamicSMemSize;
+    size_t dynamicSMemSize;
+    int blockSizeLimit;
+    unsigned int flags;
+} cuOccupancyMaxPotentialBlockSizeWithFlags_params;
+
+typedef struct cuOccupancyAvailableDynamicSMemPerBlock_params_st {
+    size_t *dynamicSmemSize;
+    CUfunction func;
+    int numBlocks;
+    int blockSize;
+} cuOccupancyAvailableDynamicSMemPerBlock_params;
+
+typedef struct cuOccupancyMaxPotentialClusterSize_params_st {
+    int *clusterSize;
+    CUfunction func;
+    const CUlaunchConfig *config;
+} cuOccupancyMaxPotentialClusterSize_params;
+
+typedef struct cuOccupancyMaxActiveClusters_params_st {
+    int *numClusters;
+    CUfunction func;
+    const CUlaunchConfig *config;
+} cuOccupancyMaxActiveClusters_params;
+
+typedef struct cuTexRefSetArray_params_st {
+    CUtexref hTexRef;
+    CUarray hArray;
+    unsigned int Flags;
+} cuTexRefSetArray_params;
+
+typedef struct cuTexRefSetMipmappedArray_params_st {
+    CUtexref hTexRef;
+    CUmipmappedArray hMipmappedArray;
+    unsigned int Flags;
+} cuTexRefSetMipmappedArray_params;
+
+typedef struct cuTexRefSetAddress_v2_params_st {
+    size_t *ByteOffset;
+    CUtexref hTexRef;
+    CUdeviceptr dptr;
+    size_t bytes;
+} cuTexRefSetAddress_v2_params;
+
+typedef struct cuTexRefSetAddress2D_v3_params_st {
+    CUtexref hTexRef;
+    const CUDA_ARRAY_DESCRIPTOR *desc;
+    CUdeviceptr dptr;
+    size_t Pitch;
+} cuTexRefSetAddress2D_v3_params;
+
+typedef struct cuTexRefSetFormat_params_st {
+    CUtexref hTexRef;
+    CUarray_format fmt;
+    int NumPackedComponents;
+} cuTexRefSetFormat_params;
+
+typedef struct cuTexRefSetAddressMode_params_st {
+    CUtexref hTexRef;
+    int dim;
+    CUaddress_mode am;
+} cuTexRefSetAddressMode_params;
+
+typedef struct cuTexRefSetFilterMode_params_st {
+    CUtexref hTexRef;
+    CUfilter_mode fm;
+} cuTexRefSetFilterMode_params;
+
+typedef struct cuTexRefSetMipmapFilterMode_params_st {
+    CUtexref hTexRef;
+    CUfilter_mode fm;
+} cuTexRefSetMipmapFilterMode_params;
+
+typedef struct cuTexRefSetMipmapLevelBias_params_st {
+    CUtexref hTexRef;
+    float bias;
+} cuTexRefSetMipmapLevelBias_params;
+
+typedef struct cuTexRefSetMipmapLevelClamp_params_st {
+    CUtexref hTexRef;
+    float minMipmapLevelClamp;
+    float maxMipmapLevelClamp;
+} cuTexRefSetMipmapLevelClamp_params;
+
+typedef struct cuTexRefSetMaxAnisotropy_params_st {
+    CUtexref hTexRef;
+    unsigned int maxAniso;
+} cuTexRefSetMaxAnisotropy_params;
+
+typedef struct cuTexRefSetBorderColor_params_st {
+    CUtexref hTexRef;
+    float *pBorderColor;
+} cuTexRefSetBorderColor_params;
+
+typedef struct cuTexRefSetFlags_params_st {
+    CUtexref hTexRef;
+    unsigned int Flags;
+} cuTexRefSetFlags_params;
+
+typedef struct cuTexRefGetAddress_v2_params_st {
+    CUdeviceptr *pdptr;
+    CUtexref hTexRef;
+} cuTexRefGetAddress_v2_params;
+
+typedef struct cuTexRefGetArray_params_st {
+    CUarray *phArray;
+    CUtexref hTexRef;
+} cuTexRefGetArray_params;
+
+typedef struct cuTexRefGetMipmappedArray_params_st {
+    CUmipmappedArray *phMipmappedArray;
+    CUtexref hTexRef;
+} cuTexRefGetMipmappedArray_params;
+
+typedef struct cuTexRefGetAddressMode_params_st {
+    CUaddress_mode *pam;
+    CUtexref hTexRef;
+    int dim;
+} cuTexRefGetAddressMode_params;
+
+typedef struct cuTexRefGetFilterMode_params_st {
+    CUfilter_mode *pfm;
+    CUtexref hTexRef;
+} cuTexRefGetFilterMode_params;
+
+typedef struct cuTexRefGetFormat_params_st {
+    CUarray_format *pFormat;
+    int *pNumChannels;
+    CUtexref hTexRef;
+} cuTexRefGetFormat_params;
+
+typedef struct cuTexRefGetMipmapFilterMode_params_st {
+    CUfilter_mode *pfm;
+    CUtexref hTexRef;
+} cuTexRefGetMipmapFilterMode_params;
+
+typedef struct cuTexRefGetMipmapLevelBias_params_st {
+    float *pbias;
+    CUtexref hTexRef;
+} cuTexRefGetMipmapLevelBias_params;
+
+typedef struct cuTexRefGetMipmapLevelClamp_params_st {
+    float *pminMipmapLevelClamp;
+    float *pmaxMipmapLevelClamp;
+    CUtexref hTexRef;
+} cuTexRefGetMipmapLevelClamp_params;
+
+typedef struct cuTexRefGetMaxAnisotropy_params_st {
+    int *pmaxAniso;
+    CUtexref hTexRef;
+} cuTexRefGetMaxAnisotropy_params;
+
+typedef struct cuTexRefGetBorderColor_params_st {
+    float *pBorderColor;
+    CUtexref hTexRef;
+} cuTexRefGetBorderColor_params;
+
+typedef struct cuTexRefGetFlags_params_st {
+    unsigned int *pFlags;
+    CUtexref hTexRef;
+} cuTexRefGetFlags_params;
+
+typedef struct cuTexRefCreate_params_st {
+    CUtexref *pTexRef;
+} cuTexRefCreate_params;
+
+typedef struct cuTexRefDestroy_params_st {
+    CUtexref hTexRef;
+} cuTexRefDestroy_params;
+
+typedef struct cuSurfRefSetArray_params_st {
+    CUsurfref hSurfRef;
+    CUarray hArray;
+    unsigned int Flags;
+} cuSurfRefSetArray_params;
+
+typedef struct cuSurfRefGetArray_params_st {
+    CUarray *phArray;
+    CUsurfref hSurfRef;
+} cuSurfRefGetArray_params;
+
+typedef struct cuTexObjectCreate_params_st {
+    CUtexObject *pTexObject;
+    const CUDA_RESOURCE_DESC *pResDesc;
+    const CUDA_TEXTURE_DESC *pTexDesc;
+    const CUDA_RESOURCE_VIEW_DESC *pResViewDesc;
+} cuTexObjectCreate_params;
+
+typedef struct cuTexObjectDestroy_params_st {
+    CUtexObject texObject;
+} cuTexObjectDestroy_params;
+
+typedef struct cuTexObjectGetResourceDesc_params_st {
+    CUDA_RESOURCE_DESC *pResDesc;
+    CUtexObject texObject;
+} cuTexObjectGetResourceDesc_params;
+
+typedef struct cuTexObjectGetTextureDesc_params_st {
+    CUDA_TEXTURE_DESC *pTexDesc;
+    CUtexObject texObject;
+} cuTexObjectGetTextureDesc_params;
+
+typedef struct cuTexObjectGetResourceViewDesc_params_st {
+    CUDA_RESOURCE_VIEW_DESC *pResViewDesc;
+    CUtexObject texObject;
+} cuTexObjectGetResourceViewDesc_params;
+
+typedef struct cuSurfObjectCreate_params_st {
+    CUsurfObject *pSurfObject;
+    const CUDA_RESOURCE_DESC *pResDesc;
+} cuSurfObjectCreate_params;
+
+typedef struct cuSurfObjectDestroy_params_st {
+    CUsurfObject surfObject;
+} cuSurfObjectDestroy_params;
+
+typedef struct cuSurfObjectGetResourceDesc_params_st {
+    CUDA_RESOURCE_DESC *pResDesc;
+    CUsurfObject surfObject;
+} cuSurfObjectGetResourceDesc_params;
+
+typedef struct cuTensorMapEncodeTiled_params_st {
+    CUtensorMap *tensorMap;
+    CUtensorMapDataType tensorDataType;
+    cuuint32_t tensorRank;
+    void *globalAddress;
+    const cuuint64_t *globalDim;
+    const cuuint64_t *globalStrides;
+    const cuuint32_t *boxDim;
+    const cuuint32_t *elementStrides;
+    CUtensorMapInterleave interleave;
+    CUtensorMapSwizzle swizzle;
+    CUtensorMapL2promotion l2Promotion;
+    CUtensorMapFloatOOBfill oobFill;
+} cuTensorMapEncodeTiled_params;
+
+typedef struct cuTensorMapEncodeIm2col_params_st {
+    CUtensorMap *tensorMap;
+    CUtensorMapDataType tensorDataType;
+    cuuint32_t tensorRank;
+    void *globalAddress;
+    const cuuint64_t *globalDim;
+    const cuuint64_t *globalStrides;
+    const int *pixelBoxLowerCorner;
+    const int *pixelBoxUpperCorner;
+    cuuint32_t channelsPerPixel;
+    cuuint32_t pixelsPerColumn;
+    const cuuint32_t *elementStrides;
+    CUtensorMapInterleave interleave;
+    CUtensorMapSwizzle swizzle;
+    CUtensorMapL2promotion l2Promotion;
+    CUtensorMapFloatOOBfill oobFill;
+} cuTensorMapEncodeIm2col_params;
+
+typedef struct cuTensorMapReplaceAddress_params_st {
+    CUtensorMap *tensorMap;
+    void *globalAddress;
+} cuTensorMapReplaceAddress_params;
+
+typedef struct cuDeviceCanAccessPeer_params_st {
+    int *canAccessPeer;
+    CUdevice dev;
+    CUdevice peerDev;
+} cuDeviceCanAccessPeer_params;
+
+typedef struct cuCtxEnablePeerAccess_params_st {
+    CUcontext peerContext;
+    unsigned int Flags;
+} cuCtxEnablePeerAccess_params;
+
+typedef struct cuCtxDisablePeerAccess_params_st {
+    CUcontext peerContext;
+} cuCtxDisablePeerAccess_params;
+
+typedef struct cuDeviceGetP2PAttribute_params_st {
+    int *value;
+    CUdevice_P2PAttribute attrib;
+    CUdevice srcDevice;
+    CUdevice dstDevice;
+} cuDeviceGetP2PAttribute_params;
+
+typedef struct cuGraphicsUnregisterResource_params_st {
+    CUgraphicsResource resource;
+} cuGraphicsUnregisterResource_params;
+
+typedef struct cuGraphicsSubResourceGetMappedArray_params_st {
+    CUarray *pArray;
+    CUgraphicsResource resource;
+    unsigned int arrayIndex;
+    unsigned int mipLevel;
+} cuGraphicsSubResourceGetMappedArray_params;
+
+typedef struct cuGraphicsResourceGetMappedMipmappedArray_params_st {
+    CUmipmappedArray *pMipmappedArray;
+    CUgraphicsResource resource;
+} cuGraphicsResourceGetMappedMipmappedArray_params;
+
+typedef struct cuGraphicsResourceGetMappedPointer_v2_params_st {
+    CUdeviceptr *pDevPtr;
+    size_t *pSize;
+    CUgraphicsResource resource;
+} cuGraphicsResourceGetMappedPointer_v2_params;
+
+typedef struct cuGraphicsResourceSetMapFlags_v2_params_st {
+    CUgraphicsResource resource;
+    unsigned int flags;
+} cuGraphicsResourceSetMapFlags_v2_params;
+
+typedef struct cuGraphicsMapResources_ptsz_params_st {
+    unsigned int count;
+    CUgraphicsResource *resources;
+    CUstream hStream;
+} cuGraphicsMapResources_ptsz_params;
+
+typedef struct cuGraphicsUnmapResources_ptsz_params_st {
+    unsigned int count;
+    CUgraphicsResource *resources;
+    CUstream hStream;
+} cuGraphicsUnmapResources_ptsz_params;
+
+typedef struct cuGetProcAddress_v2_params_st {
+    const char *symbol;
+    void **pfn;
+    int cudaVersion;
+    cuuint64_t flags;
+    CUdriverProcAddressQueryResult *symbolStatus;
+} cuGetProcAddress_v2_params;
+
+typedef struct cuCoredumpGetAttribute_params_st {
+    CUcoredumpSettings attrib;
+    void *value;
+    size_t *size;
+} cuCoredumpGetAttribute_params;
+
+typedef struct cuCoredumpGetAttributeGlobal_params_st {
+    CUcoredumpSettings attrib;
+    void *value;
+    size_t *size;
+} cuCoredumpGetAttributeGlobal_params;
+
+typedef struct cuCoredumpSetAttribute_params_st {
+    CUcoredumpSettings attrib;
+    void *value;
+    size_t *size;
+} cuCoredumpSetAttribute_params;
+
+typedef struct cuCoredumpSetAttributeGlobal_params_st {
+    CUcoredumpSettings attrib;
+    void *value;
+    size_t *size;
+} cuCoredumpSetAttributeGlobal_params;
+
+typedef struct cuGetExportTable_params_st {
+    const void **ppExportTable;
+    const CUuuid *pExportTableId;
+} cuGetExportTable_params;
+
+typedef struct cuGreenCtxCreate_params_st {
+    CUgreenCtx *phCtx;
+    CUdevResourceDesc desc;
+    CUdevice dev;
+    unsigned int flags;
+} cuGreenCtxCreate_params;
+
+typedef struct cuGreenCtxDestroy_params_st {
+    CUgreenCtx hCtx;
+} cuGreenCtxDestroy_params;
+
+typedef struct cuCtxFromGreenCtx_params_st {
+    CUcontext *pContext;
+    CUgreenCtx hCtx;
+} cuCtxFromGreenCtx_params;
+
+typedef struct cuDeviceGetDevResource_params_st {
+    CUdevice device;
+    CUdevResource *resource;
+    CUdevResourceType type;
+} cuDeviceGetDevResource_params;
+
+typedef struct cuCtxGetDevResource_params_st {
+    CUcontext hCtx;
+    CUdevResource *resource;
+    CUdevResourceType type;
+} cuCtxGetDevResource_params;
+
+typedef struct cuGreenCtxGetDevResource_params_st {
+    CUgreenCtx hCtx;
+    CUdevResource *resource;
+    CUdevResourceType type;
+} cuGreenCtxGetDevResource_params;
+
+typedef struct cuDevSmResourceSplitByCount_params_st {
+    CUdevResource *result;
+    unsigned int *nbGroups;
+    const CUdevResource *input;
+    CUdevResource *remaining;
+    unsigned int useFlags;
+    unsigned int minCount;
+} cuDevSmResourceSplitByCount_params;
+
+typedef struct cuDevResourceGenerateDesc_params_st {
+    CUdevResourceDesc *phDesc;
+    CUdevResource *resources;
+    unsigned int nbResources;
+} cuDevResourceGenerateDesc_params;
+
+typedef struct cuGreenCtxRecordEvent_params_st {
+    CUgreenCtx hCtx;
+    CUevent hEvent;
+} cuGreenCtxRecordEvent_params;
+
+typedef struct cuGreenCtxWaitEvent_params_st {
+    CUgreenCtx hCtx;
+    CUevent hEvent;
+} cuGreenCtxWaitEvent_params;
+
+typedef struct cuStreamGetGreenCtx_params_st {
+    CUstream hStream;
+    CUgreenCtx *phCtx;
+} cuStreamGetGreenCtx_params;
+
+typedef struct cuGreenCtxStreamCreate_params_st {
+    CUstream *phStream;
+    CUgreenCtx greenCtx;
+    unsigned int flags;
+    int priority;
+} cuGreenCtxStreamCreate_params;
+
+typedef struct cuMemHostRegister_params_st {
+    void *p;
+    size_t bytesize;
+    unsigned int Flags;
+} cuMemHostRegister_params;
+
+typedef struct cuGraphicsResourceSetMapFlags_params_st {
+    CUgraphicsResource resource;
+    unsigned int flags;
+} cuGraphicsResourceSetMapFlags_params;
+
+typedef struct cuLinkCreate_params_st {
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+    CUlinkState *stateOut;
+} cuLinkCreate_params;
+
+typedef struct cuLinkAddData_params_st {
+    CUlinkState state;
+    CUjitInputType type;
+    void *data;
+    size_t size;
+    const char *name;
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+} cuLinkAddData_params;
+
+typedef struct cuLinkAddFile_params_st {
+    CUlinkState state;
+    CUjitInputType type;
+    const char *path;
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+} cuLinkAddFile_params;
+
+typedef struct cuTexRefSetAddress2D_v2_params_st {
+    CUtexref hTexRef;
+    const CUDA_ARRAY_DESCRIPTOR *desc;
+    CUdeviceptr dptr;
+    size_t Pitch;
+} cuTexRefSetAddress2D_v2_params;
+
+typedef struct cuDeviceTotalMem_params_st {
+    unsigned int *bytes;
+    CUdevice dev;
+} cuDeviceTotalMem_params;
+
+typedef struct cuCtxCreate_params_st {
+    CUcontext *pctx;
+    unsigned int flags;
+    CUdevice dev;
+} cuCtxCreate_params;
+
+typedef struct cuModuleGetGlobal_params_st {
+    CUdeviceptr_v1 *dptr;
+    unsigned int *bytes;
+    CUmodule hmod;
+    const char *name;
+} cuModuleGetGlobal_params;
+
+typedef struct cuMemGetInfo_params_st {
+    unsigned int *free;
+    unsigned int *total;
+} cuMemGetInfo_params;
+
+typedef struct cuMemAlloc_params_st {
+    CUdeviceptr_v1 *dptr;
+    unsigned int bytesize;
+} cuMemAlloc_params;
+
+typedef struct cuMemAllocPitch_params_st {
+    CUdeviceptr_v1 *dptr;
+    unsigned int *pPitch;
+    unsigned int WidthInBytes;
+    unsigned int Height;
+    unsigned int ElementSizeBytes;
+} cuMemAllocPitch_params;
+
+typedef struct cuMemFree_params_st {
+    CUdeviceptr_v1 dptr;
+} cuMemFree_params;
+
+typedef struct cuMemGetAddressRange_params_st {
+    CUdeviceptr_v1 *pbase;
+    unsigned int *psize;
+    CUdeviceptr_v1 dptr;
+} cuMemGetAddressRange_params;
+
+typedef struct cuMemAllocHost_params_st {
+    void **pp;
+    unsigned int bytesize;
+} cuMemAllocHost_params;
+
+typedef struct cuMemHostGetDevicePointer_params_st {
+    CUdeviceptr_v1 *pdptr;
+    void *p;
+    unsigned int Flags;
+} cuMemHostGetDevicePointer_params;
+
+typedef struct cuMemcpyHtoD_params_st {
+    CUdeviceptr_v1 dstDevice;
+    const void *srcHost;
+    unsigned int ByteCount;
+} cuMemcpyHtoD_params;
+
+typedef struct cuMemcpyDtoH_params_st {
+    void *dstHost;
+    CUdeviceptr_v1 srcDevice;
+    unsigned int ByteCount;
+} cuMemcpyDtoH_params;
+
+typedef struct cuMemcpyDtoD_params_st {
+    CUdeviceptr_v1 dstDevice;
+    CUdeviceptr_v1 srcDevice;
+    unsigned int ByteCount;
+} cuMemcpyDtoD_params;
+
+typedef struct cuMemcpyDtoA_params_st {
+    CUarray dstArray;
+    unsigned int dstOffset;
+    CUdeviceptr_v1 srcDevice;
+    unsigned int ByteCount;
+} cuMemcpyDtoA_params;
+
+typedef struct cuMemcpyAtoD_params_st {
+    CUdeviceptr_v1 dstDevice;
+    CUarray srcArray;
+    unsigned int srcOffset;
+    unsigned int ByteCount;
+} cuMemcpyAtoD_params;
+
+typedef struct cuMemcpyHtoA_params_st {
+    CUarray dstArray;
+    unsigned int dstOffset;
+    const void *srcHost;
+    unsigned int ByteCount;
+} cuMemcpyHtoA_params;
+
+typedef struct cuMemcpyAtoH_params_st {
+    void *dstHost;
+    CUarray srcArray;
+    unsigned int srcOffset;
+    unsigned int ByteCount;
+} cuMemcpyAtoH_params;
+
+typedef struct cuMemcpyAtoA_params_st {
+    CUarray dstArray;
+    unsigned int dstOffset;
+    CUarray srcArray;
+    unsigned int srcOffset;
+    unsigned int ByteCount;
+} cuMemcpyAtoA_params;
+
+typedef struct cuMemcpyHtoAAsync_params_st {
+    CUarray dstArray;
+    unsigned int dstOffset;
+    const void *srcHost;
+    unsigned int ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoAAsync_params;
+
+typedef struct cuMemcpyAtoHAsync_params_st {
+    void *dstHost;
+    CUarray srcArray;
+    unsigned int srcOffset;
+    unsigned int ByteCount;
+    CUstream hStream;
+} cuMemcpyAtoHAsync_params;
+
+typedef struct cuMemcpy2D_params_st {
+    const CUDA_MEMCPY2D_v1 *pCopy;
+} cuMemcpy2D_params;
+
+typedef struct cuMemcpy2DUnaligned_params_st {
+    const CUDA_MEMCPY2D_v1 *pCopy;
+} cuMemcpy2DUnaligned_params;
+
+typedef struct cuMemcpy3D_params_st {
+    const CUDA_MEMCPY3D_v1 *pCopy;
+} cuMemcpy3D_params;
+
+typedef struct cuMemcpyHtoDAsync_params_st {
+    CUdeviceptr_v1 dstDevice;
+    const void *srcHost;
+    unsigned int ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoDAsync_params;
+
+typedef struct cuMemcpyDtoHAsync_params_st {
+    void *dstHost;
+    CUdeviceptr_v1 srcDevice;
+    unsigned int ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoHAsync_params;
+
+typedef struct cuMemcpyDtoDAsync_params_st {
+    CUdeviceptr_v1 dstDevice;
+    CUdeviceptr_v1 srcDevice;
+    unsigned int ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoDAsync_params;
+
+typedef struct cuMemcpy2DAsync_params_st {
+    const CUDA_MEMCPY2D_v1 *pCopy;
+    CUstream hStream;
+} cuMemcpy2DAsync_params;
+
+typedef struct cuMemcpy3DAsync_params_st {
+    const CUDA_MEMCPY3D_v1 *pCopy;
+    CUstream hStream;
+} cuMemcpy3DAsync_params;
+
+typedef struct cuMemsetD8_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned char uc;
+    unsigned int N;
+} cuMemsetD8_params;
+
+typedef struct cuMemsetD16_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned short us;
+    unsigned int N;
+} cuMemsetD16_params;
+
+typedef struct cuMemsetD32_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned int ui;
+    unsigned int N;
+} cuMemsetD32_params;
+
+typedef struct cuMemsetD2D8_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned int dstPitch;
+    unsigned char uc;
+    unsigned int Width;
+    unsigned int Height;
+} cuMemsetD2D8_params;
+
+typedef struct cuMemsetD2D16_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned int dstPitch;
+    unsigned short us;
+    unsigned int Width;
+    unsigned int Height;
+} cuMemsetD2D16_params;
+
+typedef struct cuMemsetD2D32_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned int dstPitch;
+    unsigned int ui;
+    unsigned int Width;
+    unsigned int Height;
+} cuMemsetD2D32_params;
+
+typedef struct cuArrayCreate_params_st {
+    CUarray *pHandle;
+    const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray;
+} cuArrayCreate_params;
+
+typedef struct cuArrayGetDescriptor_params_st {
+    CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor;
+    CUarray hArray;
+} cuArrayGetDescriptor_params;
+
+typedef struct cuArray3DCreate_params_st {
+    CUarray *pHandle;
+    const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray;
+} cuArray3DCreate_params;
+
+typedef struct cuArray3DGetDescriptor_params_st {
+    CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor;
+    CUarray hArray;
+} cuArray3DGetDescriptor_params;
+
+typedef struct cuTexRefSetAddress_params_st {
+    unsigned int *ByteOffset;
+    CUtexref hTexRef;
+    CUdeviceptr_v1 dptr;
+    unsigned int bytes;
+} cuTexRefSetAddress_params;
+
+typedef struct cuTexRefSetAddress2D_params_st {
+    CUtexref hTexRef;
+    const CUDA_ARRAY_DESCRIPTOR_v1 *desc;
+    CUdeviceptr_v1 dptr;
+    unsigned int Pitch;
+} cuTexRefSetAddress2D_params;
+
+typedef struct cuTexRefGetAddress_params_st {
+    CUdeviceptr_v1 *pdptr;
+    CUtexref hTexRef;
+} cuTexRefGetAddress_params;
+
+typedef struct cuGraphicsResourceGetMappedPointer_params_st {
+    CUdeviceptr_v1 *pDevPtr;
+    unsigned int *pSize;
+    CUgraphicsResource resource;
+} cuGraphicsResourceGetMappedPointer_params;
+
+typedef struct cuCtxDestroy_params_st {
+    CUcontext ctx;
+} cuCtxDestroy_params;
+
+typedef struct cuCtxPopCurrent_params_st {
+    CUcontext *pctx;
+} cuCtxPopCurrent_params;
+
+typedef struct cuCtxPushCurrent_params_st {
+    CUcontext ctx;
+} cuCtxPushCurrent_params;
+
+typedef struct cuStreamDestroy_params_st {
+    CUstream hStream;
+} cuStreamDestroy_params;
+
+typedef struct cuEventDestroy_params_st {
+    CUevent hEvent;
+} cuEventDestroy_params;
+
+typedef struct cuDevicePrimaryCtxRelease_params_st {
+    CUdevice dev;
+} cuDevicePrimaryCtxRelease_params;
+
+typedef struct cuDevicePrimaryCtxReset_params_st {
+    CUdevice dev;
+} cuDevicePrimaryCtxReset_params;
+
+typedef struct cuDevicePrimaryCtxSetFlags_params_st {
+    CUdevice dev;
+    unsigned int flags;
+} cuDevicePrimaryCtxSetFlags_params;
+
+typedef struct cuMemcpyHtoD_v2_params_st {
+    CUdeviceptr dstDevice;
+    const void *srcHost;
+    size_t ByteCount;
+} cuMemcpyHtoD_v2_params;
+
+typedef struct cuMemcpyDtoH_v2_params_st {
+    void *dstHost;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoH_v2_params;
+
+typedef struct cuMemcpyDtoD_v2_params_st {
+    CUdeviceptr dstDevice;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoD_v2_params;
+
+typedef struct cuMemcpyDtoA_v2_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoA_v2_params;
+
+typedef struct cuMemcpyAtoD_v2_params_st {
+    CUdeviceptr dstDevice;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoD_v2_params;
+
+typedef struct cuMemcpyHtoA_v2_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    const void *srcHost;
+    size_t ByteCount;
+} cuMemcpyHtoA_v2_params;
+
+typedef struct cuMemcpyAtoH_v2_params_st {
+    void *dstHost;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoH_v2_params;
+
+typedef struct cuMemcpyAtoA_v2_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoA_v2_params;
+
+typedef struct cuMemcpyHtoAAsync_v2_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    const void *srcHost;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoAAsync_v2_params;
+
+typedef struct cuMemcpyAtoHAsync_v2_params_st {
+    void *dstHost;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyAtoHAsync_v2_params;
+
+typedef struct cuMemcpy2D_v2_params_st {
+    const CUDA_MEMCPY2D *pCopy;
+} cuMemcpy2D_v2_params;
+
+typedef struct cuMemcpy2DUnaligned_v2_params_st {
+    const CUDA_MEMCPY2D *pCopy;
+} cuMemcpy2DUnaligned_v2_params;
+
+typedef struct cuMemcpy3D_v2_params_st {
+    const CUDA_MEMCPY3D *pCopy;
+} cuMemcpy3D_v2_params;
+
+typedef struct cuMemcpyHtoDAsync_v2_params_st {
+    CUdeviceptr dstDevice;
+    const void *srcHost;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoDAsync_v2_params;
+
+typedef struct cuMemcpyDtoHAsync_v2_params_st {
+    void *dstHost;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoHAsync_v2_params;
+
+typedef struct cuMemcpyDtoDAsync_v2_params_st {
+    CUdeviceptr dstDevice;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoDAsync_v2_params;
+
+typedef struct cuMemcpy2DAsync_v2_params_st {
+    const CUDA_MEMCPY2D *pCopy;
+    CUstream hStream;
+} cuMemcpy2DAsync_v2_params;
+
+typedef struct cuMemcpy3DAsync_v2_params_st {
+    const CUDA_MEMCPY3D *pCopy;
+    CUstream hStream;
+} cuMemcpy3DAsync_v2_params;
+
+typedef struct cuMemsetD8_v2_params_st {
+    CUdeviceptr dstDevice;
+    unsigned char uc;
+    size_t N;
+} cuMemsetD8_v2_params;
+
+typedef struct cuMemsetD16_v2_params_st {
+    CUdeviceptr dstDevice;
+    unsigned short us;
+    size_t N;
+} cuMemsetD16_v2_params;
+
+typedef struct cuMemsetD32_v2_params_st {
+    CUdeviceptr dstDevice;
+    unsigned int ui;
+    size_t N;
+} cuMemsetD32_v2_params;
+
+typedef struct cuMemsetD2D8_v2_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned char uc;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D8_v2_params;
+
+typedef struct cuMemsetD2D16_v2_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned short us;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D16_v2_params;
+
+typedef struct cuMemsetD2D32_v2_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned int ui;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D32_v2_params;
+
+typedef struct cuMemcpy_params_st {
+    CUdeviceptr dst;
+    CUdeviceptr src;
+    size_t ByteCount;
+} cuMemcpy_params;
+
+typedef struct cuMemcpyAsync_params_st {
+    CUdeviceptr dst;
+    CUdeviceptr src;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyAsync_params;
+
+typedef struct cuMemcpyPeer_params_st {
+    CUdeviceptr dstDevice;
+    CUcontext dstContext;
+    CUdeviceptr srcDevice;
+    CUcontext srcContext;
+    size_t ByteCount;
+} cuMemcpyPeer_params;
+
+typedef struct cuMemcpyPeerAsync_params_st {
+    CUdeviceptr dstDevice;
+    CUcontext dstContext;
+    CUdeviceptr srcDevice;
+    CUcontext srcContext;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyPeerAsync_params;
+
+typedef struct cuMemcpy3DPeer_params_st {
+    const CUDA_MEMCPY3D_PEER *pCopy;
+} cuMemcpy3DPeer_params;
+
+typedef struct cuMemcpy3DPeerAsync_params_st {
+    const CUDA_MEMCPY3D_PEER *pCopy;
+    CUstream hStream;
+} cuMemcpy3DPeerAsync_params;
+
+typedef struct cuMemcpyBatchAsync_params_st {
+    CUdeviceptr *dsts;
+    CUdeviceptr *srcs;
+    size_t *sizes;
+    size_t count;
+    CUmemcpyAttributes *attrs;
+    size_t *attrsIdxs;
+    size_t numAttrs;
+    size_t *failIdx;
+    CUstream hStream;
+} cuMemcpyBatchAsync_params;
+
+typedef struct cuMemcpy3DBatchAsync_params_st {
+    size_t numOps;
+    CUDA_MEMCPY3D_BATCH_OP *opList;
+    size_t *failIdx;
+    unsigned long long flags;
+    CUstream hStream;
+} cuMemcpy3DBatchAsync_params;
+
+typedef struct cuMemsetD8Async_params_st {
+    CUdeviceptr dstDevice;
+    unsigned char uc;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD8Async_params;
+
+typedef struct cuMemsetD16Async_params_st {
+    CUdeviceptr dstDevice;
+    unsigned short us;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD16Async_params;
+
+typedef struct cuMemsetD32Async_params_st {
+    CUdeviceptr dstDevice;
+    unsigned int ui;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD32Async_params;
+
+typedef struct cuMemsetD2D8Async_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned char uc;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D8Async_params;
+
+typedef struct cuMemsetD2D16Async_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned short us;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D16Async_params;
+
+typedef struct cuMemsetD2D32Async_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned int ui;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D32Async_params;
+
+typedef struct cuStreamGetPriority_params_st {
+    CUstream hStream;
+    int *priority;
+} cuStreamGetPriority_params;
+
+typedef struct cuStreamGetId_params_st {
+    CUstream hStream;
+    unsigned long long *streamId;
+} cuStreamGetId_params;
+
+typedef struct cuStreamGetFlags_params_st {
+    CUstream hStream;
+    unsigned int *flags;
+} cuStreamGetFlags_params;
+
+typedef struct cuStreamGetDevice_params_st {
+    CUstream hStream;
+    CUdevice *device;
+} cuStreamGetDevice_params;
+
+typedef struct cuStreamGetCtx_params_st {
+    CUstream hStream;
+    CUcontext *pctx;
+} cuStreamGetCtx_params;
+
+typedef struct cuStreamGetCtx_v2_params_st {
+    CUstream hStream;
+    CUcontext *pCtx;
+    CUgreenCtx *pGreenCtx;
+} cuStreamGetCtx_v2_params;
+
+typedef struct cuStreamWaitEvent_params_st {
+    CUstream hStream;
+    CUevent hEvent;
+    unsigned int Flags;
+} cuStreamWaitEvent_params;
+
+typedef struct cuStreamAddCallback_params_st {
+    CUstream hStream;
+    CUstreamCallback callback;
+    void *userData;
+    unsigned int flags;
+} cuStreamAddCallback_params;
+
+typedef struct cuStreamAttachMemAsync_params_st {
+    CUstream hStream;
+    CUdeviceptr dptr;
+    size_t length;
+    unsigned int flags;
+} cuStreamAttachMemAsync_params;
+
+typedef struct cuStreamQuery_params_st {
+    CUstream hStream;
+} cuStreamQuery_params;
+
+typedef struct cuStreamSynchronize_params_st {
+    CUstream hStream;
+} cuStreamSynchronize_params;
+
+typedef struct cuEventRecord_params_st {
+    CUevent hEvent;
+    CUstream hStream;
+} cuEventRecord_params;
+
+typedef struct cuEventRecordWithFlags_params_st {
+    CUevent hEvent;
+    CUstream hStream;
+    unsigned int flags;
+} cuEventRecordWithFlags_params;
+
+typedef struct cuLaunchKernel_params_st {
+    CUfunction f;
+    unsigned int gridDimX;
+    unsigned int gridDimY;
+    unsigned int gridDimZ;
+    unsigned int blockDimX;
+    unsigned int blockDimY;
+    unsigned int blockDimZ;
+    unsigned int sharedMemBytes;
+    CUstream hStream;
+    void **kernelParams;
+    void **extra;
+} cuLaunchKernel_params;
+
+typedef struct cuLaunchKernelEx_params_st {
+    const CUlaunchConfig *config;
+    CUfunction f;
+    void **kernelParams;
+    void **extra;
+} cuLaunchKernelEx_params;
+
+typedef struct cuLaunchHostFunc_params_st {
+    CUstream hStream;
+    CUhostFn fn;
+    void *userData;
+} cuLaunchHostFunc_params;
+
+typedef struct cuGraphicsMapResources_params_st {
+    unsigned int count;
+    CUgraphicsResource *resources;
+    CUstream hStream;
+} cuGraphicsMapResources_params;
+
+typedef struct cuGraphicsUnmapResources_params_st {
+    unsigned int count;
+    CUgraphicsResource *resources;
+    CUstream hStream;
+} cuGraphicsUnmapResources_params;
+
+typedef struct cuStreamWriteValue32_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWriteValue32_params;
+
+typedef struct cuStreamWaitValue32_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWaitValue32_params;
+
+typedef struct cuStreamWriteValue64_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWriteValue64_params;
+
+typedef struct cuStreamWaitValue64_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWaitValue64_params;
+
+typedef struct cuStreamBatchMemOp_params_st {
+    CUstream stream;
+    unsigned int count;
+    CUstreamBatchMemOpParams *paramArray;
+    unsigned int flags;
+} cuStreamBatchMemOp_params;
+
+typedef struct cuStreamWriteValue32_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWriteValue32_ptsz_params;
+
+typedef struct cuStreamWaitValue32_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWaitValue32_ptsz_params;
+
+typedef struct cuStreamWriteValue64_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWriteValue64_ptsz_params;
+
+typedef struct cuStreamWaitValue64_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWaitValue64_ptsz_params;
+
+typedef struct cuStreamBatchMemOp_ptsz_params_st {
+    CUstream stream;
+    unsigned int count;
+    CUstreamBatchMemOpParams *paramArray;
+    unsigned int flags;
+} cuStreamBatchMemOp_ptsz_params;
+
+typedef struct cuStreamWriteValue32_v2_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWriteValue32_v2_params;
+
+typedef struct cuStreamWaitValue32_v2_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWaitValue32_v2_params;
+
+typedef struct cuStreamWriteValue64_v2_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWriteValue64_v2_params;
+
+typedef struct cuStreamWaitValue64_v2_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWaitValue64_v2_params;
+
+typedef struct cuStreamBatchMemOp_v2_params_st {
+    CUstream stream;
+    unsigned int count;
+    CUstreamBatchMemOpParams *paramArray;
+    unsigned int flags;
+} cuStreamBatchMemOp_v2_params;
+
+typedef struct cuMemPrefetchAsync_params_st {
+    CUdeviceptr devPtr;
+    size_t count;
+    CUdevice dstDevice;
+    CUstream hStream;
+} cuMemPrefetchAsync_params;
+
+typedef struct cuMemPrefetchAsync_v2_params_st {
+    CUdeviceptr devPtr;
+    size_t count;
+    CUmemLocation location;
+    unsigned int flags;
+    CUstream hStream;
+} cuMemPrefetchAsync_v2_params;
+
+typedef struct cuLaunchCooperativeKernel_params_st {
+    CUfunction f;
+    unsigned int gridDimX;
+    unsigned int gridDimY;
+    unsigned int gridDimZ;
+    unsigned int blockDimX;
+    unsigned int blockDimY;
+    unsigned int blockDimZ;
+    unsigned int sharedMemBytes;
+    CUstream hStream;
+    void **kernelParams;
+} cuLaunchCooperativeKernel_params;
+
+typedef struct cuSignalExternalSemaphoresAsync_params_st {
+    const CUexternalSemaphore *extSemArray;
+    const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray;
+    unsigned int numExtSems;
+    CUstream stream;
+} cuSignalExternalSemaphoresAsync_params;
+
+typedef struct cuWaitExternalSemaphoresAsync_params_st {
+    const CUexternalSemaphore *extSemArray;
+    const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray;
+    unsigned int numExtSems;
+    CUstream stream;
+} cuWaitExternalSemaphoresAsync_params;
+
+typedef struct cuStreamBeginCapture_params_st {
+    CUstream hStream;
+} cuStreamBeginCapture_params;
+
+typedef struct cuStreamBeginCapture_ptsz_params_st {
+    CUstream hStream;
+} cuStreamBeginCapture_ptsz_params;
+
+typedef struct cuStreamBeginCapture_v2_params_st {
+    CUstream hStream;
+    CUstreamCaptureMode mode;
+} cuStreamBeginCapture_v2_params;
+
+typedef struct cuStreamBeginCaptureToGraph_params_st {
+    CUstream hStream;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    const CUgraphEdgeData *dependencyData;
+    size_t numDependencies;
+    CUstreamCaptureMode mode;
+} cuStreamBeginCaptureToGraph_params;
+
+typedef struct cuStreamEndCapture_params_st {
+    CUstream hStream;
+    CUgraph *phGraph;
+} cuStreamEndCapture_params;
+
+typedef struct cuStreamIsCapturing_params_st {
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus;
+} cuStreamIsCapturing_params;
+
+typedef struct cuStreamGetCaptureInfo_params_st {
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus_out;
+    cuuint64_t *id_out;
+} cuStreamGetCaptureInfo_params;
+
+typedef struct cuStreamGetCaptureInfo_ptsz_params_st {
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus_out;
+    cuuint64_t *id_out;
+} cuStreamGetCaptureInfo_ptsz_params;
+
+typedef struct cuStreamGetCaptureInfo_v2_params_st {
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus_out;
+    cuuint64_t *id_out;
+    CUgraph *graph_out;
+    const CUgraphNode **dependencies_out;
+    size_t *numDependencies_out;
+} cuStreamGetCaptureInfo_v2_params;
+
+typedef struct cuStreamGetCaptureInfo_v3_params_st {
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus_out;
+    cuuint64_t *id_out;
+    CUgraph *graph_out;
+    const CUgraphNode **dependencies_out;
+    const CUgraphEdgeData **edgeData_out;
+    size_t *numDependencies_out;
+} cuStreamGetCaptureInfo_v3_params;
+
+typedef struct cuGraphAddKernelNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams;
+} cuGraphAddKernelNode_params;
+
+typedef struct cuGraphKernelNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams;
+} cuGraphKernelNodeGetParams_params;
+
+typedef struct cuGraphKernelNodeSetParams_params_st {
+    CUgraphNode hNode;
+    const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams;
+} cuGraphKernelNodeSetParams_params;
+
+typedef struct cuGraphExecKernelNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams;
+} cuGraphExecKernelNodeSetParams_params;
+
+typedef struct cuGraphInstantiateWithParams_params_st {
+    CUgraphExec *phGraphExec;
+    CUgraph hGraph;
+    CUDA_GRAPH_INSTANTIATE_PARAMS *instantiateParams;
+} cuGraphInstantiateWithParams_params;
+
+typedef struct cuGraphExecUpdate_params_st {
+    CUgraphExec hGraphExec;
+    CUgraph hGraph;
+    CUgraphNode *hErrorNode_out;
+    CUgraphExecUpdateResult *updateResult_out;
+} cuGraphExecUpdate_params;
+
+typedef struct cuGraphUpload_params_st {
+    CUgraphExec hGraph;
+    CUstream hStream;
+} cuGraphUpload_params;
+
+typedef struct cuGraphLaunch_params_st {
+    CUgraphExec hGraph;
+    CUstream hStream;
+} cuGraphLaunch_params;
+
+typedef struct cuStreamCopyAttributes_params_st {
+    CUstream dstStream;
+    CUstream srcStream;
+} cuStreamCopyAttributes_params;
+
+typedef struct cuStreamGetAttribute_params_st {
+    CUstream hStream;
+    CUstreamAttrID attr;
+    CUstreamAttrValue *value;
+} cuStreamGetAttribute_params;
+
+typedef struct cuStreamSetAttribute_params_st {
+    CUstream hStream;
+    CUstreamAttrID attr;
+    const CUstreamAttrValue *param;
+} cuStreamSetAttribute_params;
+
+typedef struct cuIpcOpenMemHandle_params_st {
+    CUdeviceptr *pdptr;
+    CUipcMemHandle handle;
+    unsigned int Flags;
+} cuIpcOpenMemHandle_params;
+
+typedef struct cuGraphInstantiate_params_st {
+    CUgraphExec *phGraphExec;
+    CUgraph hGraph;
+    CUgraphNode *phErrorNode;
+    char *logBuffer;
+    size_t bufferSize;
+} cuGraphInstantiate_params;
+
+typedef struct cuGraphInstantiate_v2_params_st {
+    CUgraphExec *phGraphExec;
+    CUgraph hGraph;
+    CUgraphNode *phErrorNode;
+    char *logBuffer;
+    size_t bufferSize;
+} cuGraphInstantiate_v2_params;
+
+typedef struct cuMemMapArrayAsync_params_st {
+    CUarrayMapInfo *mapInfoList;
+    unsigned int count;
+    CUstream hStream;
+} cuMemMapArrayAsync_params;
+
+typedef struct cuMemFreeAsync_params_st {
+    CUdeviceptr dptr;
+    CUstream hStream;
+} cuMemFreeAsync_params;
+
+typedef struct cuMemAllocAsync_params_st {
+    CUdeviceptr *dptr;
+    size_t bytesize;
+    CUstream hStream;
+} cuMemAllocAsync_params;
+
+typedef struct cuMemAllocFromPoolAsync_params_st {
+    CUdeviceptr *dptr;
+    size_t bytesize;
+    CUmemoryPool pool;
+    CUstream hStream;
+} cuMemAllocFromPoolAsync_params;
+
+typedef struct cuStreamUpdateCaptureDependencies_params_st {
+    CUstream hStream;
+    CUgraphNode *dependencies;
+    size_t numDependencies;
+    unsigned int flags;
+} cuStreamUpdateCaptureDependencies_params;
+
+typedef struct cuStreamUpdateCaptureDependencies_v2_params_st {
+    CUstream hStream;
+    CUgraphNode *dependencies;
+    const CUgraphEdgeData *dependencyData;
+    size_t numDependencies;
+    unsigned int flags;
+} cuStreamUpdateCaptureDependencies_v2_params;
+
+typedef struct cuMemBatchDecompressAsync_params_st {
+    CUmemDecompressParams *paramsArray;
+    size_t count;
+    unsigned int flags;
+    size_t *errorIndex;
+    CUstream stream;
+} cuMemBatchDecompressAsync_params;
+
+typedef struct cuGetProcAddress_params_st {
+    const char *symbol;
+    void **pfn;
+    int cudaVersion;
+    cuuint64_t flags;
+} cuGetProcAddress_params;
+
+typedef struct cuCheckpointProcessGetRestoreThreadId_params_st {
+    int pid;
+    int *tid;
+} cuCheckpointProcessGetRestoreThreadId_params;
+
+typedef struct cuCheckpointProcessGetState_params_st {
+    int pid;
+    CUprocessState *state;
+} cuCheckpointProcessGetState_params;
+
+typedef struct cuCheckpointProcessLock_params_st {
+    int pid;
+    CUcheckpointLockArgs *args;
+} cuCheckpointProcessLock_params;
+
+typedef struct cuCheckpointProcessCheckpoint_params_st {
+    int pid;
+    CUcheckpointCheckpointArgs *args;
+} cuCheckpointProcessCheckpoint_params;
+
+typedef struct cuCheckpointProcessRestore_params_st {
+    int pid;
+    CUcheckpointRestoreArgs *args;
+} cuCheckpointProcessRestore_params;
+
+typedef struct cuCheckpointProcessUnlock_params_st {
+    int pid;
+    CUcheckpointUnlockArgs *args;
+} cuCheckpointProcessUnlock_params;
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cuda_runtime_api_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cuda_runtime_api_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..52321905dd0a82e550332f5d67b03fd4612860e7
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cuda_runtime_api_meta.h
@@ -0,0 +1,2372 @@
+// This file is generated.  Any changes you make will be lost during the next clean build.
+
+// CUDA public interface, for type definitions and api function prototypes
+#include "cuda_runtime_api.h"
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+// Currently used parameter trace structures
+typedef struct cudaDeviceSetLimit_v3020_params_st {
+    enum cudaLimit limit;
+    size_t value;
+} cudaDeviceSetLimit_v3020_params;
+
+typedef struct cudaDeviceGetLimit_v3020_params_st {
+    size_t *pValue;
+    enum cudaLimit limit;
+} cudaDeviceGetLimit_v3020_params;
+
+typedef struct cudaDeviceGetTexture1DLinearMaxWidth_v11010_params_st {
+    size_t *maxWidthInElements;
+    const struct cudaChannelFormatDesc *fmtDesc;
+    int device;
+} cudaDeviceGetTexture1DLinearMaxWidth_v11010_params;
+
+typedef struct cudaDeviceGetCacheConfig_v3020_params_st {
+    enum cudaFuncCache *pCacheConfig;
+} cudaDeviceGetCacheConfig_v3020_params;
+
+typedef struct cudaDeviceGetStreamPriorityRange_v5050_params_st {
+    int *leastPriority;
+    int *greatestPriority;
+} cudaDeviceGetStreamPriorityRange_v5050_params;
+
+typedef struct cudaDeviceSetCacheConfig_v3020_params_st {
+    enum cudaFuncCache cacheConfig;
+} cudaDeviceSetCacheConfig_v3020_params;
+
+typedef struct cudaDeviceGetByPCIBusId_v4010_params_st {
+    int *device;
+    const char *pciBusId;
+} cudaDeviceGetByPCIBusId_v4010_params;
+
+typedef struct cudaDeviceGetPCIBusId_v4010_params_st {
+    char *pciBusId;
+    int len;
+    int device;
+} cudaDeviceGetPCIBusId_v4010_params;
+
+typedef struct cudaIpcGetEventHandle_v4010_params_st {
+    cudaIpcEventHandle_t *handle;
+    cudaEvent_t event;
+} cudaIpcGetEventHandle_v4010_params;
+
+typedef struct cudaIpcOpenEventHandle_v4010_params_st {
+    cudaEvent_t *event;
+    cudaIpcEventHandle_t handle;
+} cudaIpcOpenEventHandle_v4010_params;
+
+typedef struct cudaIpcGetMemHandle_v4010_params_st {
+    cudaIpcMemHandle_t *handle;
+    void *devPtr;
+} cudaIpcGetMemHandle_v4010_params;
+
+typedef struct cudaIpcOpenMemHandle_v4010_params_st {
+    void **devPtr;
+    cudaIpcMemHandle_t handle;
+    unsigned int flags;
+} cudaIpcOpenMemHandle_v4010_params;
+
+typedef struct cudaIpcCloseMemHandle_v4010_params_st {
+    void *devPtr;
+} cudaIpcCloseMemHandle_v4010_params;
+
+typedef struct cudaDeviceFlushGPUDirectRDMAWrites_v11030_params_st {
+    enum cudaFlushGPUDirectRDMAWritesTarget target;
+    enum cudaFlushGPUDirectRDMAWritesScope scope;
+} cudaDeviceFlushGPUDirectRDMAWrites_v11030_params;
+
+typedef struct cudaDeviceRegisterAsyncNotification_v12040_params_st {
+    int device;
+    cudaAsyncCallback callbackFunc;
+    void *userData;
+    cudaAsyncCallbackHandle_t *callback;
+} cudaDeviceRegisterAsyncNotification_v12040_params;
+
+typedef struct cudaDeviceUnregisterAsyncNotification_v12040_params_st {
+    int device;
+    cudaAsyncCallbackHandle_t callback;
+} cudaDeviceUnregisterAsyncNotification_v12040_params;
+
+typedef struct cudaDeviceGetSharedMemConfig_v4020_params_st {
+    enum cudaSharedMemConfig *pConfig;
+} cudaDeviceGetSharedMemConfig_v4020_params;
+
+typedef struct cudaDeviceSetSharedMemConfig_v4020_params_st {
+    enum cudaSharedMemConfig config;
+} cudaDeviceSetSharedMemConfig_v4020_params;
+
+typedef struct cudaGetErrorName_v6050_params_st {
+    cudaError_t error;
+} cudaGetErrorName_v6050_params;
+
+typedef struct cudaGetErrorString_v3020_params_st {
+    cudaError_t error;
+} cudaGetErrorString_v3020_params;
+
+typedef struct cudaGetDeviceCount_v3020_params_st {
+    int *count;
+} cudaGetDeviceCount_v3020_params;
+
+typedef struct cudaGetDeviceProperties_v2_v12000_params_st {
+    struct cudaDeviceProp *prop;
+    int device;
+} cudaGetDeviceProperties_v2_v12000_params;
+
+typedef struct cudaDeviceGetAttribute_v5000_params_st {
+    int *value;
+    enum cudaDeviceAttr attr;
+    int device;
+} cudaDeviceGetAttribute_v5000_params;
+
+typedef struct cudaDeviceGetDefaultMemPool_v11020_params_st {
+    cudaMemPool_t *memPool;
+    int device;
+} cudaDeviceGetDefaultMemPool_v11020_params;
+
+typedef struct cudaDeviceSetMemPool_v11020_params_st {
+    int device;
+    cudaMemPool_t memPool;
+} cudaDeviceSetMemPool_v11020_params;
+
+typedef struct cudaDeviceGetMemPool_v11020_params_st {
+    cudaMemPool_t *memPool;
+    int device;
+} cudaDeviceGetMemPool_v11020_params;
+
+typedef struct cudaDeviceGetNvSciSyncAttributes_v10020_params_st {
+    void *nvSciSyncAttrList;
+    int device;
+    int flags;
+} cudaDeviceGetNvSciSyncAttributes_v10020_params;
+
+typedef struct cudaDeviceGetP2PAttribute_v8000_params_st {
+    int *value;
+    enum cudaDeviceP2PAttr attr;
+    int srcDevice;
+    int dstDevice;
+} cudaDeviceGetP2PAttribute_v8000_params;
+
+typedef struct cudaChooseDevice_v3020_params_st {
+    int *device;
+    const struct cudaDeviceProp *prop;
+} cudaChooseDevice_v3020_params;
+
+typedef struct cudaInitDevice_v12000_params_st {
+    int device;
+    unsigned int deviceFlags;
+    unsigned int flags;
+} cudaInitDevice_v12000_params;
+
+typedef struct cudaSetDevice_v3020_params_st {
+    int device;
+} cudaSetDevice_v3020_params;
+
+typedef struct cudaGetDevice_v3020_params_st {
+    int *device;
+} cudaGetDevice_v3020_params;
+
+typedef struct cudaSetValidDevices_v3020_params_st {
+    int *device_arr;
+    int len;
+} cudaSetValidDevices_v3020_params;
+
+typedef struct cudaSetDeviceFlags_v3020_params_st {
+    unsigned int flags;
+} cudaSetDeviceFlags_v3020_params;
+
+typedef struct cudaGetDeviceFlags_v7000_params_st {
+    unsigned int *flags;
+} cudaGetDeviceFlags_v7000_params;
+
+typedef struct cudaStreamCreate_v3020_params_st {
+    cudaStream_t *pStream;
+} cudaStreamCreate_v3020_params;
+
+typedef struct cudaStreamCreateWithFlags_v5000_params_st {
+    cudaStream_t *pStream;
+    unsigned int flags;
+} cudaStreamCreateWithFlags_v5000_params;
+
+typedef struct cudaStreamCreateWithPriority_v5050_params_st {
+    cudaStream_t *pStream;
+    unsigned int flags;
+    int priority;
+} cudaStreamCreateWithPriority_v5050_params;
+
+typedef struct cudaStreamGetPriority_ptsz_v7000_params_st {
+    cudaStream_t hStream;
+    int *priority;
+} cudaStreamGetPriority_ptsz_v7000_params;
+
+typedef struct cudaStreamGetFlags_ptsz_v7000_params_st {
+    cudaStream_t hStream;
+    unsigned int *flags;
+} cudaStreamGetFlags_ptsz_v7000_params;
+
+typedef struct cudaStreamGetId_ptsz_v12000_params_st {
+    cudaStream_t hStream;
+    unsigned long long *streamId;
+} cudaStreamGetId_ptsz_v12000_params;
+
+typedef struct cudaStreamGetDevice_ptsz_v12080_params_st {
+    cudaStream_t hStream;
+    int *device;
+} cudaStreamGetDevice_ptsz_v12080_params;
+
+typedef struct cudaStreamCopyAttributes_ptsz_v11000_params_st {
+    cudaStream_t dst;
+    cudaStream_t src;
+} cudaStreamCopyAttributes_ptsz_v11000_params;
+
+typedef struct cudaStreamGetAttribute_ptsz_v11000_params_st {
+    cudaStream_t hStream;
+    cudaStreamAttrID attr;
+    cudaStreamAttrValue *value_out;
+} cudaStreamGetAttribute_ptsz_v11000_params;
+
+typedef struct cudaStreamSetAttribute_ptsz_v11000_params_st {
+    cudaStream_t hStream;
+    cudaStreamAttrID attr;
+    const cudaStreamAttrValue *value;
+} cudaStreamSetAttribute_ptsz_v11000_params;
+
+typedef struct cudaStreamDestroy_v5050_params_st {
+    cudaStream_t stream;
+} cudaStreamDestroy_v5050_params;
+
+typedef struct cudaStreamWaitEvent_ptsz_v7000_params_st {
+    cudaStream_t stream;
+    cudaEvent_t event;
+    unsigned int flags;
+} cudaStreamWaitEvent_ptsz_v7000_params;
+
+typedef struct cudaStreamAddCallback_ptsz_v7000_params_st {
+    cudaStream_t stream;
+    cudaStreamCallback_t callback;
+    void *userData;
+    unsigned int flags;
+} cudaStreamAddCallback_ptsz_v7000_params;
+
+typedef struct cudaStreamSynchronize_ptsz_v7000_params_st {
+    cudaStream_t stream;
+} cudaStreamSynchronize_ptsz_v7000_params;
+
+typedef struct cudaStreamQuery_ptsz_v7000_params_st {
+    cudaStream_t stream;
+} cudaStreamQuery_ptsz_v7000_params;
+
+typedef struct cudaStreamAttachMemAsync_ptsz_v7000_params_st {
+    cudaStream_t stream;
+    void *devPtr;
+    size_t length;
+    unsigned int flags;
+} cudaStreamAttachMemAsync_ptsz_v7000_params;
+
+typedef struct cudaStreamBeginCapture_ptsz_v10000_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureMode mode;
+} cudaStreamBeginCapture_ptsz_v10000_params;
+
+typedef struct cudaStreamBeginCaptureToGraph_ptsz_v12030_params_st {
+    cudaStream_t stream;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *dependencies;
+    const cudaGraphEdgeData *dependencyData;
+    size_t numDependencies;
+    enum cudaStreamCaptureMode mode;
+} cudaStreamBeginCaptureToGraph_ptsz_v12030_params;
+
+typedef struct cudaThreadExchangeStreamCaptureMode_v10010_params_st {
+    enum cudaStreamCaptureMode *mode;
+} cudaThreadExchangeStreamCaptureMode_v10010_params;
+
+typedef struct cudaStreamEndCapture_ptsz_v10000_params_st {
+    cudaStream_t stream;
+    cudaGraph_t *pGraph;
+} cudaStreamEndCapture_ptsz_v10000_params;
+
+typedef struct cudaStreamIsCapturing_ptsz_v10000_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *pCaptureStatus;
+} cudaStreamIsCapturing_ptsz_v10000_params;
+
+typedef struct cudaStreamGetCaptureInfo_v2_ptsz_v11030_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *captureStatus_out;
+    unsigned long long *id_out;
+    cudaGraph_t *graph_out;
+    const cudaGraphNode_t **dependencies_out;
+    size_t *numDependencies_out;
+} cudaStreamGetCaptureInfo_v2_ptsz_v11030_params;
+
+typedef struct cudaStreamGetCaptureInfo_v3_ptsz_v12030_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *captureStatus_out;
+    unsigned long long *id_out;
+    cudaGraph_t *graph_out;
+    const cudaGraphNode_t **dependencies_out;
+    const cudaGraphEdgeData **edgeData_out;
+    size_t *numDependencies_out;
+} cudaStreamGetCaptureInfo_v3_ptsz_v12030_params;
+
+typedef struct cudaStreamUpdateCaptureDependencies_ptsz_v11030_params_st {
+    cudaStream_t stream;
+    cudaGraphNode_t *dependencies;
+    size_t numDependencies;
+    unsigned int flags;
+} cudaStreamUpdateCaptureDependencies_ptsz_v11030_params;
+
+typedef struct cudaStreamUpdateCaptureDependencies_v2_ptsz_v12030_params_st {
+    cudaStream_t stream;
+    cudaGraphNode_t *dependencies;
+    const cudaGraphEdgeData *dependencyData;
+    size_t numDependencies;
+    unsigned int flags;
+} cudaStreamUpdateCaptureDependencies_v2_ptsz_v12030_params;
+
+typedef struct cudaEventCreate_v3020_params_st {
+    cudaEvent_t *event;
+} cudaEventCreate_v3020_params;
+
+typedef struct cudaEventCreateWithFlags_v3020_params_st {
+    cudaEvent_t *event;
+    unsigned int flags;
+} cudaEventCreateWithFlags_v3020_params;
+
+typedef struct cudaEventRecord_ptsz_v7000_params_st {
+    cudaEvent_t event;
+    cudaStream_t stream;
+} cudaEventRecord_ptsz_v7000_params;
+
+typedef struct cudaEventRecordWithFlags_ptsz_v11010_params_st {
+    cudaEvent_t event;
+    cudaStream_t stream;
+    unsigned int flags;
+} cudaEventRecordWithFlags_ptsz_v11010_params;
+
+typedef struct cudaEventQuery_v3020_params_st {
+    cudaEvent_t event;
+} cudaEventQuery_v3020_params;
+
+typedef struct cudaEventSynchronize_v3020_params_st {
+    cudaEvent_t event;
+} cudaEventSynchronize_v3020_params;
+
+typedef struct cudaEventDestroy_v3020_params_st {
+    cudaEvent_t event;
+} cudaEventDestroy_v3020_params;
+
+typedef struct cudaEventElapsedTime_v3020_params_st {
+    float *ms;
+    cudaEvent_t start;
+    cudaEvent_t end;
+} cudaEventElapsedTime_v3020_params;
+
+typedef struct cudaEventElapsedTime_v2_v12080_params_st {
+    float *ms;
+    cudaEvent_t start;
+    cudaEvent_t end;
+} cudaEventElapsedTime_v2_v12080_params;
+
+typedef struct cudaImportExternalMemory_v10000_params_st {
+    cudaExternalMemory_t *extMem_out;
+    const struct cudaExternalMemoryHandleDesc *memHandleDesc;
+} cudaImportExternalMemory_v10000_params;
+
+typedef struct cudaExternalMemoryGetMappedBuffer_v10000_params_st {
+    void **devPtr;
+    cudaExternalMemory_t extMem;
+    const struct cudaExternalMemoryBufferDesc *bufferDesc;
+} cudaExternalMemoryGetMappedBuffer_v10000_params;
+
+typedef struct cudaExternalMemoryGetMappedMipmappedArray_v10000_params_st {
+    cudaMipmappedArray_t *mipmap;
+    cudaExternalMemory_t extMem;
+    const struct cudaExternalMemoryMipmappedArrayDesc *mipmapDesc;
+} cudaExternalMemoryGetMappedMipmappedArray_v10000_params;
+
+typedef struct cudaDestroyExternalMemory_v10000_params_st {
+    cudaExternalMemory_t extMem;
+} cudaDestroyExternalMemory_v10000_params;
+
+typedef struct cudaImportExternalSemaphore_v10000_params_st {
+    cudaExternalSemaphore_t *extSem_out;
+    const struct cudaExternalSemaphoreHandleDesc *semHandleDesc;
+} cudaImportExternalSemaphore_v10000_params;
+
+typedef struct cudaSignalExternalSemaphoresAsync_v2_ptsz_v11020_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreSignalParams *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaSignalExternalSemaphoresAsync_v2_ptsz_v11020_params;
+
+typedef struct cudaWaitExternalSemaphoresAsync_v2_ptsz_v11020_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreWaitParams *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaWaitExternalSemaphoresAsync_v2_ptsz_v11020_params;
+
+typedef struct cudaDestroyExternalSemaphore_v10000_params_st {
+    cudaExternalSemaphore_t extSem;
+} cudaDestroyExternalSemaphore_v10000_params;
+
+typedef struct cudaLaunchKernel_ptsz_v7000_params_st {
+    const void *func;
+    dim3 gridDim;
+    dim3 blockDim;
+    void **args;
+    size_t sharedMem;
+    cudaStream_t stream;
+} cudaLaunchKernel_ptsz_v7000_params;
+
+typedef struct cudaLaunchKernelExC_ptsz_v11060_params_st {
+    const cudaLaunchConfig_t *config;
+    const void *func;
+    void **args;
+} cudaLaunchKernelExC_ptsz_v11060_params;
+
+typedef struct cudaLaunchCooperativeKernel_ptsz_v9000_params_st {
+    const void *func;
+    dim3 gridDim;
+    dim3 blockDim;
+    void **args;
+    size_t sharedMem;
+    cudaStream_t stream;
+} cudaLaunchCooperativeKernel_ptsz_v9000_params;
+
+typedef struct cudaLaunchCooperativeKernelMultiDevice_v9000_params_st {
+    struct cudaLaunchParams *launchParamsList;
+    unsigned int numDevices;
+    unsigned int flags;
+} cudaLaunchCooperativeKernelMultiDevice_v9000_params;
+
+typedef struct cudaFuncSetCacheConfig_v3020_params_st {
+    const void *func;
+    enum cudaFuncCache cacheConfig;
+} cudaFuncSetCacheConfig_v3020_params;
+
+typedef struct cudaFuncGetAttributes_v3020_params_st {
+    struct cudaFuncAttributes *attr;
+    const void *func;
+} cudaFuncGetAttributes_v3020_params;
+
+typedef struct cudaFuncSetAttribute_v9000_params_st {
+    const void *func;
+    enum cudaFuncAttribute attr;
+    int value;
+} cudaFuncSetAttribute_v9000_params;
+
+typedef struct cudaFuncGetName_v12030_params_st {
+    const char **name;
+    const void *func;
+} cudaFuncGetName_v12030_params;
+
+typedef struct cudaFuncGetParamInfo_v12040_params_st {
+    const void *func;
+    size_t paramIndex;
+    size_t *paramOffset;
+    size_t *paramSize;
+} cudaFuncGetParamInfo_v12040_params;
+
+typedef struct cudaLaunchHostFunc_ptsz_v10000_params_st {
+    cudaStream_t stream;
+    cudaHostFn_t fn;
+    void *userData;
+} cudaLaunchHostFunc_ptsz_v10000_params;
+
+typedef struct cudaFuncSetSharedMemConfig_v4020_params_st {
+    const void *func;
+    enum cudaSharedMemConfig config;
+} cudaFuncSetSharedMemConfig_v4020_params;
+
+typedef struct cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6050_params_st {
+    int *numBlocks;
+    const void *func;
+    int blockSize;
+    size_t dynamicSMemSize;
+} cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6050_params;
+
+typedef struct cudaOccupancyAvailableDynamicSMemPerBlock_v10200_params_st {
+    size_t *dynamicSmemSize;
+    const void *func;
+    int numBlocks;
+    int blockSize;
+} cudaOccupancyAvailableDynamicSMemPerBlock_v10200_params;
+
+typedef struct cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000_params_st {
+    int *numBlocks;
+    const void *func;
+    int blockSize;
+    size_t dynamicSMemSize;
+    unsigned int flags;
+} cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000_params;
+
+typedef struct cudaOccupancyMaxPotentialClusterSize_v11070_params_st {
+    int *clusterSize;
+    const void *func;
+    const cudaLaunchConfig_t *launchConfig;
+} cudaOccupancyMaxPotentialClusterSize_v11070_params;
+
+typedef struct cudaOccupancyMaxActiveClusters_v11070_params_st {
+    int *numClusters;
+    const void *func;
+    const cudaLaunchConfig_t *launchConfig;
+} cudaOccupancyMaxActiveClusters_v11070_params;
+
+typedef struct cudaMallocManaged_v6000_params_st {
+    void **devPtr;
+    size_t size;
+    unsigned int flags;
+} cudaMallocManaged_v6000_params;
+
+typedef struct cudaMalloc_v3020_params_st {
+    void **devPtr;
+    size_t size;
+} cudaMalloc_v3020_params;
+
+typedef struct cudaMallocHost_v3020_params_st {
+    void **ptr;
+    size_t size;
+} cudaMallocHost_v3020_params;
+
+typedef struct cudaMallocPitch_v3020_params_st {
+    void **devPtr;
+    size_t *pitch;
+    size_t width;
+    size_t height;
+} cudaMallocPitch_v3020_params;
+
+typedef struct cudaMallocArray_v3020_params_st {
+    cudaArray_t *array;
+    const struct cudaChannelFormatDesc *desc;
+    size_t width;
+    size_t height;
+    unsigned int flags;
+} cudaMallocArray_v3020_params;
+
+typedef struct cudaFree_v3020_params_st {
+    void *devPtr;
+} cudaFree_v3020_params;
+
+typedef struct cudaFreeHost_v3020_params_st {
+    void *ptr;
+} cudaFreeHost_v3020_params;
+
+typedef struct cudaFreeArray_v3020_params_st {
+    cudaArray_t array;
+} cudaFreeArray_v3020_params;
+
+typedef struct cudaFreeMipmappedArray_v5000_params_st {
+    cudaMipmappedArray_t mipmappedArray;
+} cudaFreeMipmappedArray_v5000_params;
+
+typedef struct cudaHostAlloc_v3020_params_st {
+    void **pHost;
+    size_t size;
+    unsigned int flags;
+} cudaHostAlloc_v3020_params;
+
+typedef struct cudaHostRegister_v4000_params_st {
+    void *ptr;
+    size_t size;
+    unsigned int flags;
+} cudaHostRegister_v4000_params;
+
+typedef struct cudaHostUnregister_v4000_params_st {
+    void *ptr;
+} cudaHostUnregister_v4000_params;
+
+typedef struct cudaHostGetDevicePointer_v3020_params_st {
+    void **pDevice;
+    void *pHost;
+    unsigned int flags;
+} cudaHostGetDevicePointer_v3020_params;
+
+typedef struct cudaHostGetFlags_v3020_params_st {
+    unsigned int *pFlags;
+    void *pHost;
+} cudaHostGetFlags_v3020_params;
+
+typedef struct cudaMalloc3D_v3020_params_st {
+    struct cudaPitchedPtr *pitchedDevPtr;
+    struct cudaExtent extent;
+} cudaMalloc3D_v3020_params;
+
+typedef struct cudaMalloc3DArray_v3020_params_st {
+    cudaArray_t *array;
+    const struct cudaChannelFormatDesc *desc;
+    struct cudaExtent extent;
+    unsigned int flags;
+} cudaMalloc3DArray_v3020_params;
+
+typedef struct cudaMallocMipmappedArray_v5000_params_st {
+    cudaMipmappedArray_t *mipmappedArray;
+    const struct cudaChannelFormatDesc *desc;
+    struct cudaExtent extent;
+    unsigned int numLevels;
+    unsigned int flags;
+} cudaMallocMipmappedArray_v5000_params;
+
+typedef struct cudaGetMipmappedArrayLevel_v5000_params_st {
+    cudaArray_t *levelArray;
+    cudaMipmappedArray_const_t mipmappedArray;
+    unsigned int level;
+} cudaGetMipmappedArrayLevel_v5000_params;
+
+typedef struct cudaMemcpy3D_ptds_v7000_params_st {
+    const struct cudaMemcpy3DParms *p;
+} cudaMemcpy3D_ptds_v7000_params;
+
+typedef struct cudaMemcpy3DPeer_ptds_v7000_params_st {
+    const struct cudaMemcpy3DPeerParms *p;
+} cudaMemcpy3DPeer_ptds_v7000_params;
+
+typedef struct cudaMemcpy3DAsync_ptsz_v7000_params_st {
+    const struct cudaMemcpy3DParms *p;
+    cudaStream_t stream;
+} cudaMemcpy3DAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpy3DPeerAsync_ptsz_v7000_params_st {
+    const struct cudaMemcpy3DPeerParms *p;
+    cudaStream_t stream;
+} cudaMemcpy3DPeerAsync_ptsz_v7000_params;
+
+typedef struct cudaMemGetInfo_v3020_params_st {
+    size_t *free;
+    size_t *total;
+} cudaMemGetInfo_v3020_params;
+
+typedef struct cudaArrayGetInfo_v4010_params_st {
+    struct cudaChannelFormatDesc *desc;
+    struct cudaExtent *extent;
+    unsigned int *flags;
+    cudaArray_t array;
+} cudaArrayGetInfo_v4010_params;
+
+typedef struct cudaArrayGetPlane_v11020_params_st {
+    cudaArray_t *pPlaneArray;
+    cudaArray_t hArray;
+    unsigned int planeIdx;
+} cudaArrayGetPlane_v11020_params;
+
+typedef struct cudaArrayGetMemoryRequirements_v11060_params_st {
+    struct cudaArrayMemoryRequirements *memoryRequirements;
+    cudaArray_t array;
+    int device;
+} cudaArrayGetMemoryRequirements_v11060_params;
+
+typedef struct cudaMipmappedArrayGetMemoryRequirements_v11060_params_st {
+    struct cudaArrayMemoryRequirements *memoryRequirements;
+    cudaMipmappedArray_t mipmap;
+    int device;
+} cudaMipmappedArrayGetMemoryRequirements_v11060_params;
+
+typedef struct cudaArrayGetSparseProperties_v11010_params_st {
+    struct cudaArraySparseProperties *sparseProperties;
+    cudaArray_t array;
+} cudaArrayGetSparseProperties_v11010_params;
+
+typedef struct cudaMipmappedArrayGetSparseProperties_v11010_params_st {
+    struct cudaArraySparseProperties *sparseProperties;
+    cudaMipmappedArray_t mipmap;
+} cudaMipmappedArrayGetSparseProperties_v11010_params;
+
+typedef struct cudaMemcpy_ptds_v7000_params_st {
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy_ptds_v7000_params;
+
+typedef struct cudaMemcpyPeer_v4000_params_st {
+    void *dst;
+    int dstDevice;
+    const void *src;
+    int srcDevice;
+    size_t count;
+} cudaMemcpyPeer_v4000_params;
+
+typedef struct cudaMemcpy2D_ptds_v7000_params_st {
+    void *dst;
+    size_t dpitch;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2D_ptds_v7000_params;
+
+typedef struct cudaMemcpy2DToArray_ptds_v7000_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DToArray_ptds_v7000_params;
+
+typedef struct cudaMemcpy2DFromArray_ptds_v7000_params_st {
+    void *dst;
+    size_t dpitch;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DFromArray_ptds_v7000_params;
+
+typedef struct cudaMemcpy2DArrayToArray_ptds_v7000_params_st {
+    cudaArray_t dst;
+    size_t wOffsetDst;
+    size_t hOffsetDst;
+    cudaArray_const_t src;
+    size_t wOffsetSrc;
+    size_t hOffsetSrc;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DArrayToArray_ptds_v7000_params;
+
+typedef struct cudaMemcpyToSymbol_ptds_v7000_params_st {
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyToSymbol_ptds_v7000_params;
+
+typedef struct cudaMemcpyFromSymbol_ptds_v7000_params_st {
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyFromSymbol_ptds_v7000_params;
+
+typedef struct cudaMemcpyAsync_ptsz_v7000_params_st {
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpyPeerAsync_v4000_params_st {
+    void *dst;
+    int dstDevice;
+    const void *src;
+    int srcDevice;
+    size_t count;
+    cudaStream_t stream;
+} cudaMemcpyPeerAsync_v4000_params;
+
+typedef struct cudaMemcpyBatchAsync_ptsz_v12080_params_st {
+    void **dsts;
+    void **srcs;
+    size_t *sizes;
+    size_t count;
+    struct cudaMemcpyAttributes *attrs;
+    size_t *attrsIdxs;
+    size_t numAttrs;
+    size_t *failIdx;
+    cudaStream_t stream;
+} cudaMemcpyBatchAsync_ptsz_v12080_params;
+
+typedef struct cudaMemcpy3DBatchAsync_ptsz_v12080_params_st {
+    size_t numOps;
+    struct cudaMemcpy3DBatchOp *opList;
+    size_t *failIdx;
+    unsigned long long flags;
+    cudaStream_t stream;
+} cudaMemcpy3DBatchAsync_ptsz_v12080_params;
+
+typedef struct cudaMemcpy2DAsync_ptsz_v7000_params_st {
+    void *dst;
+    size_t dpitch;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpy2DAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpy2DToArrayAsync_ptsz_v7000_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpy2DToArrayAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpy2DFromArrayAsync_ptsz_v7000_params_st {
+    void *dst;
+    size_t dpitch;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpy2DFromArrayAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpyToSymbolAsync_ptsz_v7000_params_st {
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyToSymbolAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpyFromSymbolAsync_ptsz_v7000_params_st {
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyFromSymbolAsync_ptsz_v7000_params;
+
+typedef struct cudaMemset_ptds_v7000_params_st {
+    void *devPtr;
+    int value;
+    size_t count;
+} cudaMemset_ptds_v7000_params;
+
+typedef struct cudaMemset2D_ptds_v7000_params_st {
+    void *devPtr;
+    size_t pitch;
+    int value;
+    size_t width;
+    size_t height;
+} cudaMemset2D_ptds_v7000_params;
+
+typedef struct cudaMemset3D_ptds_v7000_params_st {
+    struct cudaPitchedPtr pitchedDevPtr;
+    int value;
+    struct cudaExtent extent;
+} cudaMemset3D_ptds_v7000_params;
+
+typedef struct cudaMemsetAsync_ptsz_v7000_params_st {
+    void *devPtr;
+    int value;
+    size_t count;
+    cudaStream_t stream;
+} cudaMemsetAsync_ptsz_v7000_params;
+
+typedef struct cudaMemset2DAsync_ptsz_v7000_params_st {
+    void *devPtr;
+    size_t pitch;
+    int value;
+    size_t width;
+    size_t height;
+    cudaStream_t stream;
+} cudaMemset2DAsync_ptsz_v7000_params;
+
+typedef struct cudaMemset3DAsync_ptsz_v7000_params_st {
+    struct cudaPitchedPtr pitchedDevPtr;
+    int value;
+    struct cudaExtent extent;
+    cudaStream_t stream;
+} cudaMemset3DAsync_ptsz_v7000_params;
+
+typedef struct cudaGetSymbolAddress_v3020_params_st {
+    void **devPtr;
+    const void *symbol;
+} cudaGetSymbolAddress_v3020_params;
+
+typedef struct cudaGetSymbolSize_v3020_params_st {
+    size_t *size;
+    const void *symbol;
+} cudaGetSymbolSize_v3020_params;
+
+typedef struct cudaMemPrefetchAsync_ptsz_v8000_params_st {
+    const void *devPtr;
+    size_t count;
+    int dstDevice;
+    cudaStream_t stream;
+} cudaMemPrefetchAsync_ptsz_v8000_params;
+
+typedef struct cudaMemPrefetchAsync_v2_ptsz_v12020_params_st {
+    const void *devPtr;
+    size_t count;
+    struct cudaMemLocation location;
+    unsigned int flags;
+    cudaStream_t stream;
+} cudaMemPrefetchAsync_v2_ptsz_v12020_params;
+
+typedef struct cudaMemAdvise_v8000_params_st {
+    const void *devPtr;
+    size_t count;
+    enum cudaMemoryAdvise advice;
+    int device;
+} cudaMemAdvise_v8000_params;
+
+typedef struct cudaMemAdvise_v2_v12020_params_st {
+    const void *devPtr;
+    size_t count;
+    enum cudaMemoryAdvise advice;
+    struct cudaMemLocation location;
+} cudaMemAdvise_v2_v12020_params;
+
+typedef struct cudaMemRangeGetAttribute_v8000_params_st {
+    void *data;
+    size_t dataSize;
+    enum cudaMemRangeAttribute attribute;
+    const void *devPtr;
+    size_t count;
+} cudaMemRangeGetAttribute_v8000_params;
+
+typedef struct cudaMemRangeGetAttributes_v8000_params_st {
+    void **data;
+    size_t *dataSizes;
+    enum cudaMemRangeAttribute *attributes;
+    size_t numAttributes;
+    const void *devPtr;
+    size_t count;
+} cudaMemRangeGetAttributes_v8000_params;
+
+typedef struct cudaMemcpyToArray_ptds_v7000_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyToArray_ptds_v7000_params;
+
+typedef struct cudaMemcpyFromArray_ptds_v7000_params_st {
+    void *dst;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyFromArray_ptds_v7000_params;
+
+typedef struct cudaMemcpyArrayToArray_ptds_v7000_params_st {
+    cudaArray_t dst;
+    size_t wOffsetDst;
+    size_t hOffsetDst;
+    cudaArray_const_t src;
+    size_t wOffsetSrc;
+    size_t hOffsetSrc;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyArrayToArray_ptds_v7000_params;
+
+typedef struct cudaMemcpyToArrayAsync_ptsz_v7000_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyToArrayAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpyFromArrayAsync_ptsz_v7000_params_st {
+    void *dst;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyFromArrayAsync_ptsz_v7000_params;
+
+typedef struct cudaMallocAsync_ptsz_v11020_params_st {
+    void **devPtr;
+    size_t size;
+    cudaStream_t hStream;
+} cudaMallocAsync_ptsz_v11020_params;
+
+typedef struct cudaFreeAsync_ptsz_v11020_params_st {
+    void *devPtr;
+    cudaStream_t hStream;
+} cudaFreeAsync_ptsz_v11020_params;
+
+typedef struct cudaMemPoolTrimTo_v11020_params_st {
+    cudaMemPool_t memPool;
+    size_t minBytesToKeep;
+} cudaMemPoolTrimTo_v11020_params;
+
+typedef struct cudaMemPoolSetAttribute_v11020_params_st {
+    cudaMemPool_t memPool;
+    enum cudaMemPoolAttr attr;
+    void *value;
+} cudaMemPoolSetAttribute_v11020_params;
+
+typedef struct cudaMemPoolGetAttribute_v11020_params_st {
+    cudaMemPool_t memPool;
+    enum cudaMemPoolAttr attr;
+    void *value;
+} cudaMemPoolGetAttribute_v11020_params;
+
+typedef struct cudaMemPoolSetAccess_v11020_params_st {
+    cudaMemPool_t memPool;
+    const struct cudaMemAccessDesc *descList;
+    size_t count;
+} cudaMemPoolSetAccess_v11020_params;
+
+typedef struct cudaMemPoolGetAccess_v11020_params_st {
+    enum cudaMemAccessFlags *flags;
+    cudaMemPool_t memPool;
+    struct cudaMemLocation *location;
+} cudaMemPoolGetAccess_v11020_params;
+
+typedef struct cudaMemPoolCreate_v11020_params_st {
+    cudaMemPool_t *memPool;
+    const struct cudaMemPoolProps *poolProps;
+} cudaMemPoolCreate_v11020_params;
+
+typedef struct cudaMemPoolDestroy_v11020_params_st {
+    cudaMemPool_t memPool;
+} cudaMemPoolDestroy_v11020_params;
+
+typedef struct cudaMallocFromPoolAsync_ptsz_v11020_params_st {
+    void **ptr;
+    size_t size;
+    cudaMemPool_t memPool;
+    cudaStream_t stream;
+} cudaMallocFromPoolAsync_ptsz_v11020_params;
+
+typedef struct cudaMemPoolExportToShareableHandle_v11020_params_st {
+    void *shareableHandle;
+    cudaMemPool_t memPool;
+    enum cudaMemAllocationHandleType handleType;
+    unsigned int flags;
+} cudaMemPoolExportToShareableHandle_v11020_params;
+
+typedef struct cudaMemPoolImportFromShareableHandle_v11020_params_st {
+    cudaMemPool_t *memPool;
+    void *shareableHandle;
+    enum cudaMemAllocationHandleType handleType;
+    unsigned int flags;
+} cudaMemPoolImportFromShareableHandle_v11020_params;
+
+typedef struct cudaMemPoolExportPointer_v11020_params_st {
+    struct cudaMemPoolPtrExportData *exportData;
+    void *ptr;
+} cudaMemPoolExportPointer_v11020_params;
+
+typedef struct cudaMemPoolImportPointer_v11020_params_st {
+    void **ptr;
+    cudaMemPool_t memPool;
+    struct cudaMemPoolPtrExportData *exportData;
+} cudaMemPoolImportPointer_v11020_params;
+
+typedef struct cudaPointerGetAttributes_v4000_params_st {
+    struct cudaPointerAttributes *attributes;
+    const void *ptr;
+} cudaPointerGetAttributes_v4000_params;
+
+typedef struct cudaDeviceCanAccessPeer_v4000_params_st {
+    int *canAccessPeer;
+    int device;
+    int peerDevice;
+} cudaDeviceCanAccessPeer_v4000_params;
+
+typedef struct cudaDeviceEnablePeerAccess_v4000_params_st {
+    int peerDevice;
+    unsigned int flags;
+} cudaDeviceEnablePeerAccess_v4000_params;
+
+typedef struct cudaDeviceDisablePeerAccess_v4000_params_st {
+    int peerDevice;
+} cudaDeviceDisablePeerAccess_v4000_params;
+
+typedef struct cudaGraphicsUnregisterResource_v3020_params_st {
+    cudaGraphicsResource_t resource;
+} cudaGraphicsUnregisterResource_v3020_params;
+
+typedef struct cudaGraphicsResourceSetMapFlags_v3020_params_st {
+    cudaGraphicsResource_t resource;
+    unsigned int flags;
+} cudaGraphicsResourceSetMapFlags_v3020_params;
+
+typedef struct cudaGraphicsMapResources_v3020_params_st {
+    int count;
+    cudaGraphicsResource_t *resources;
+    cudaStream_t stream;
+} cudaGraphicsMapResources_v3020_params;
+
+typedef struct cudaGraphicsUnmapResources_v3020_params_st {
+    int count;
+    cudaGraphicsResource_t *resources;
+    cudaStream_t stream;
+} cudaGraphicsUnmapResources_v3020_params;
+
+typedef struct cudaGraphicsResourceGetMappedPointer_v3020_params_st {
+    void **devPtr;
+    size_t *size;
+    cudaGraphicsResource_t resource;
+} cudaGraphicsResourceGetMappedPointer_v3020_params;
+
+typedef struct cudaGraphicsSubResourceGetMappedArray_v3020_params_st {
+    cudaArray_t *array;
+    cudaGraphicsResource_t resource;
+    unsigned int arrayIndex;
+    unsigned int mipLevel;
+} cudaGraphicsSubResourceGetMappedArray_v3020_params;
+
+typedef struct cudaGraphicsResourceGetMappedMipmappedArray_v5000_params_st {
+    cudaMipmappedArray_t *mipmappedArray;
+    cudaGraphicsResource_t resource;
+} cudaGraphicsResourceGetMappedMipmappedArray_v5000_params;
+
+typedef struct cudaGetChannelDesc_v3020_params_st {
+    struct cudaChannelFormatDesc *desc;
+    cudaArray_const_t array;
+} cudaGetChannelDesc_v3020_params;
+
+typedef struct cudaCreateChannelDesc_v3020_params_st {
+    int x;
+    int y;
+    int z;
+    int w;
+    enum cudaChannelFormatKind f;
+} cudaCreateChannelDesc_v3020_params;
+
+typedef struct cudaCreateTextureObject_v5000_params_st {
+    cudaTextureObject_t *pTexObject;
+    const struct cudaResourceDesc *pResDesc;
+    const struct cudaTextureDesc *pTexDesc;
+    const struct cudaResourceViewDesc *pResViewDesc;
+} cudaCreateTextureObject_v5000_params;
+
+typedef struct cudaDestroyTextureObject_v5000_params_st {
+    cudaTextureObject_t texObject;
+} cudaDestroyTextureObject_v5000_params;
+
+typedef struct cudaGetTextureObjectResourceDesc_v5000_params_st {
+    struct cudaResourceDesc *pResDesc;
+    cudaTextureObject_t texObject;
+} cudaGetTextureObjectResourceDesc_v5000_params;
+
+typedef struct cudaGetTextureObjectTextureDesc_v5000_params_st {
+    struct cudaTextureDesc *pTexDesc;
+    cudaTextureObject_t texObject;
+} cudaGetTextureObjectTextureDesc_v5000_params;
+
+typedef struct cudaGetTextureObjectResourceViewDesc_v5000_params_st {
+    struct cudaResourceViewDesc *pResViewDesc;
+    cudaTextureObject_t texObject;
+} cudaGetTextureObjectResourceViewDesc_v5000_params;
+
+typedef struct cudaCreateSurfaceObject_v5000_params_st {
+    cudaSurfaceObject_t *pSurfObject;
+    const struct cudaResourceDesc *pResDesc;
+} cudaCreateSurfaceObject_v5000_params;
+
+typedef struct cudaDestroySurfaceObject_v5000_params_st {
+    cudaSurfaceObject_t surfObject;
+} cudaDestroySurfaceObject_v5000_params;
+
+typedef struct cudaGetSurfaceObjectResourceDesc_v5000_params_st {
+    struct cudaResourceDesc *pResDesc;
+    cudaSurfaceObject_t surfObject;
+} cudaGetSurfaceObjectResourceDesc_v5000_params;
+
+typedef struct cudaDriverGetVersion_v3020_params_st {
+    int *driverVersion;
+} cudaDriverGetVersion_v3020_params;
+
+typedef struct cudaRuntimeGetVersion_v3020_params_st {
+    int *runtimeVersion;
+} cudaRuntimeGetVersion_v3020_params;
+
+typedef struct cudaGraphCreate_v10000_params_st {
+    cudaGraph_t *pGraph;
+    unsigned int flags;
+} cudaGraphCreate_v10000_params;
+
+typedef struct cudaGraphAddKernelNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaKernelNodeParams *pNodeParams;
+} cudaGraphAddKernelNode_v10000_params;
+
+typedef struct cudaGraphKernelNodeGetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    struct cudaKernelNodeParams *pNodeParams;
+} cudaGraphKernelNodeGetParams_v10000_params;
+
+typedef struct cudaGraphKernelNodeSetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    const struct cudaKernelNodeParams *pNodeParams;
+} cudaGraphKernelNodeSetParams_v10000_params;
+
+typedef struct cudaGraphKernelNodeCopyAttributes_v11000_params_st {
+    cudaGraphNode_t hSrc;
+    cudaGraphNode_t hDst;
+} cudaGraphKernelNodeCopyAttributes_v11000_params;
+
+typedef struct cudaGraphKernelNodeGetAttribute_v11000_params_st {
+    cudaGraphNode_t hNode;
+    cudaKernelNodeAttrID attr;
+    cudaKernelNodeAttrValue *value_out;
+} cudaGraphKernelNodeGetAttribute_v11000_params;
+
+typedef struct cudaGraphKernelNodeSetAttribute_v11000_params_st {
+    cudaGraphNode_t hNode;
+    cudaKernelNodeAttrID attr;
+    const cudaKernelNodeAttrValue *value;
+} cudaGraphKernelNodeSetAttribute_v11000_params;
+
+typedef struct cudaGraphAddMemcpyNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaMemcpy3DParms *pCopyParams;
+} cudaGraphAddMemcpyNode_v10000_params;
+
+typedef struct cudaGraphAddMemcpyNodeToSymbol_v11010_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphAddMemcpyNodeToSymbol_v11010_params;
+
+typedef struct cudaGraphAddMemcpyNodeFromSymbol_v11010_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphAddMemcpyNodeFromSymbol_v11010_params;
+
+typedef struct cudaGraphAddMemcpyNode1D_v11010_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaGraphAddMemcpyNode1D_v11010_params;
+
+typedef struct cudaGraphMemcpyNodeGetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    struct cudaMemcpy3DParms *pNodeParams;
+} cudaGraphMemcpyNodeGetParams_v10000_params;
+
+typedef struct cudaGraphMemcpyNodeSetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    const struct cudaMemcpy3DParms *pNodeParams;
+} cudaGraphMemcpyNodeSetParams_v10000_params;
+
+typedef struct cudaGraphMemcpyNodeSetParamsToSymbol_v11010_params_st {
+    cudaGraphNode_t node;
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphMemcpyNodeSetParamsToSymbol_v11010_params;
+
+typedef struct cudaGraphMemcpyNodeSetParamsFromSymbol_v11010_params_st {
+    cudaGraphNode_t node;
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphMemcpyNodeSetParamsFromSymbol_v11010_params;
+
+typedef struct cudaGraphMemcpyNodeSetParams1D_v11010_params_st {
+    cudaGraphNode_t node;
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaGraphMemcpyNodeSetParams1D_v11010_params;
+
+typedef struct cudaGraphAddMemsetNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaMemsetParams *pMemsetParams;
+} cudaGraphAddMemsetNode_v10000_params;
+
+typedef struct cudaGraphMemsetNodeGetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    struct cudaMemsetParams *pNodeParams;
+} cudaGraphMemsetNodeGetParams_v10000_params;
+
+typedef struct cudaGraphMemsetNodeSetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    const struct cudaMemsetParams *pNodeParams;
+} cudaGraphMemsetNodeSetParams_v10000_params;
+
+typedef struct cudaGraphAddHostNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaHostNodeParams *pNodeParams;
+} cudaGraphAddHostNode_v10000_params;
+
+typedef struct cudaGraphHostNodeGetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    struct cudaHostNodeParams *pNodeParams;
+} cudaGraphHostNodeGetParams_v10000_params;
+
+typedef struct cudaGraphHostNodeSetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    const struct cudaHostNodeParams *pNodeParams;
+} cudaGraphHostNodeSetParams_v10000_params;
+
+typedef struct cudaGraphAddChildGraphNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    cudaGraph_t childGraph;
+} cudaGraphAddChildGraphNode_v10000_params;
+
+typedef struct cudaGraphChildGraphNodeGetGraph_v10000_params_st {
+    cudaGraphNode_t node;
+    cudaGraph_t *pGraph;
+} cudaGraphChildGraphNodeGetGraph_v10000_params;
+
+typedef struct cudaGraphAddEmptyNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+} cudaGraphAddEmptyNode_v10000_params;
+
+typedef struct cudaGraphAddEventRecordNode_v11010_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    cudaEvent_t event;
+} cudaGraphAddEventRecordNode_v11010_params;
+
+typedef struct cudaGraphEventRecordNodeGetEvent_v11010_params_st {
+    cudaGraphNode_t node;
+    cudaEvent_t *event_out;
+} cudaGraphEventRecordNodeGetEvent_v11010_params;
+
+typedef struct cudaGraphEventRecordNodeSetEvent_v11010_params_st {
+    cudaGraphNode_t node;
+    cudaEvent_t event;
+} cudaGraphEventRecordNodeSetEvent_v11010_params;
+
+typedef struct cudaGraphAddEventWaitNode_v11010_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    cudaEvent_t event;
+} cudaGraphAddEventWaitNode_v11010_params;
+
+typedef struct cudaGraphEventWaitNodeGetEvent_v11010_params_st {
+    cudaGraphNode_t node;
+    cudaEvent_t *event_out;
+} cudaGraphEventWaitNodeGetEvent_v11010_params;
+
+typedef struct cudaGraphEventWaitNodeSetEvent_v11010_params_st {
+    cudaGraphNode_t node;
+    cudaEvent_t event;
+} cudaGraphEventWaitNodeSetEvent_v11010_params;
+
+typedef struct cudaGraphAddExternalSemaphoresSignalNode_v11020_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaExternalSemaphoreSignalNodeParams *nodeParams;
+} cudaGraphAddExternalSemaphoresSignalNode_v11020_params;
+
+typedef struct cudaGraphExternalSemaphoresSignalNodeGetParams_v11020_params_st {
+    cudaGraphNode_t hNode;
+    struct cudaExternalSemaphoreSignalNodeParams *params_out;
+} cudaGraphExternalSemaphoresSignalNodeGetParams_v11020_params;
+
+typedef struct cudaGraphExternalSemaphoresSignalNodeSetParams_v11020_params_st {
+    cudaGraphNode_t hNode;
+    const struct cudaExternalSemaphoreSignalNodeParams *nodeParams;
+} cudaGraphExternalSemaphoresSignalNodeSetParams_v11020_params;
+
+typedef struct cudaGraphAddExternalSemaphoresWaitNode_v11020_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaExternalSemaphoreWaitNodeParams *nodeParams;
+} cudaGraphAddExternalSemaphoresWaitNode_v11020_params;
+
+typedef struct cudaGraphExternalSemaphoresWaitNodeGetParams_v11020_params_st {
+    cudaGraphNode_t hNode;
+    struct cudaExternalSemaphoreWaitNodeParams *params_out;
+} cudaGraphExternalSemaphoresWaitNodeGetParams_v11020_params;
+
+typedef struct cudaGraphExternalSemaphoresWaitNodeSetParams_v11020_params_st {
+    cudaGraphNode_t hNode;
+    const struct cudaExternalSemaphoreWaitNodeParams *nodeParams;
+} cudaGraphExternalSemaphoresWaitNodeSetParams_v11020_params;
+
+typedef struct cudaGraphAddMemAllocNode_v11040_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    struct cudaMemAllocNodeParams *nodeParams;
+} cudaGraphAddMemAllocNode_v11040_params;
+
+typedef struct cudaGraphMemAllocNodeGetParams_v11040_params_st {
+    cudaGraphNode_t node;
+    struct cudaMemAllocNodeParams *params_out;
+} cudaGraphMemAllocNodeGetParams_v11040_params;
+
+typedef struct cudaGraphAddMemFreeNode_v11040_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    void *dptr;
+} cudaGraphAddMemFreeNode_v11040_params;
+
+typedef struct cudaGraphMemFreeNodeGetParams_v11040_params_st {
+    cudaGraphNode_t node;
+    void *dptr_out;
+} cudaGraphMemFreeNodeGetParams_v11040_params;
+
+typedef struct cudaDeviceGraphMemTrim_v11040_params_st {
+    int device;
+} cudaDeviceGraphMemTrim_v11040_params;
+
+typedef struct cudaDeviceGetGraphMemAttribute_v11040_params_st {
+    int device;
+    enum cudaGraphMemAttributeType attr;
+    void *value;
+} cudaDeviceGetGraphMemAttribute_v11040_params;
+
+typedef struct cudaDeviceSetGraphMemAttribute_v11040_params_st {
+    int device;
+    enum cudaGraphMemAttributeType attr;
+    void *value;
+} cudaDeviceSetGraphMemAttribute_v11040_params;
+
+typedef struct cudaGraphClone_v10000_params_st {
+    cudaGraph_t *pGraphClone;
+    cudaGraph_t originalGraph;
+} cudaGraphClone_v10000_params;
+
+typedef struct cudaGraphNodeFindInClone_v10000_params_st {
+    cudaGraphNode_t *pNode;
+    cudaGraphNode_t originalNode;
+    cudaGraph_t clonedGraph;
+} cudaGraphNodeFindInClone_v10000_params;
+
+typedef struct cudaGraphNodeGetType_v10000_params_st {
+    cudaGraphNode_t node;
+    enum cudaGraphNodeType *pType;
+} cudaGraphNodeGetType_v10000_params;
+
+typedef struct cudaGraphGetNodes_v10000_params_st {
+    cudaGraph_t graph;
+    cudaGraphNode_t *nodes;
+    size_t *numNodes;
+} cudaGraphGetNodes_v10000_params;
+
+typedef struct cudaGraphGetRootNodes_v10000_params_st {
+    cudaGraph_t graph;
+    cudaGraphNode_t *pRootNodes;
+    size_t *pNumRootNodes;
+} cudaGraphGetRootNodes_v10000_params;
+
+typedef struct cudaGraphGetEdges_v10000_params_st {
+    cudaGraph_t graph;
+    cudaGraphNode_t *from;
+    cudaGraphNode_t *to;
+    size_t *numEdges;
+} cudaGraphGetEdges_v10000_params;
+
+typedef struct cudaGraphGetEdges_v2_v12030_params_st {
+    cudaGraph_t graph;
+    cudaGraphNode_t *from;
+    cudaGraphNode_t *to;
+    cudaGraphEdgeData *edgeData;
+    size_t *numEdges;
+} cudaGraphGetEdges_v2_v12030_params;
+
+typedef struct cudaGraphNodeGetDependencies_v10000_params_st {
+    cudaGraphNode_t node;
+    cudaGraphNode_t *pDependencies;
+    size_t *pNumDependencies;
+} cudaGraphNodeGetDependencies_v10000_params;
+
+typedef struct cudaGraphNodeGetDependencies_v2_v12030_params_st {
+    cudaGraphNode_t node;
+    cudaGraphNode_t *pDependencies;
+    cudaGraphEdgeData *edgeData;
+    size_t *pNumDependencies;
+} cudaGraphNodeGetDependencies_v2_v12030_params;
+
+typedef struct cudaGraphNodeGetDependentNodes_v10000_params_st {
+    cudaGraphNode_t node;
+    cudaGraphNode_t *pDependentNodes;
+    size_t *pNumDependentNodes;
+} cudaGraphNodeGetDependentNodes_v10000_params;
+
+typedef struct cudaGraphNodeGetDependentNodes_v2_v12030_params_st {
+    cudaGraphNode_t node;
+    cudaGraphNode_t *pDependentNodes;
+    cudaGraphEdgeData *edgeData;
+    size_t *pNumDependentNodes;
+} cudaGraphNodeGetDependentNodes_v2_v12030_params;
+
+typedef struct cudaGraphAddDependencies_v10000_params_st {
+    cudaGraph_t graph;
+    const cudaGraphNode_t *from;
+    const cudaGraphNode_t *to;
+    size_t numDependencies;
+} cudaGraphAddDependencies_v10000_params;
+
+typedef struct cudaGraphAddDependencies_v2_v12030_params_st {
+    cudaGraph_t graph;
+    const cudaGraphNode_t *from;
+    const cudaGraphNode_t *to;
+    const cudaGraphEdgeData *edgeData;
+    size_t numDependencies;
+} cudaGraphAddDependencies_v2_v12030_params;
+
+typedef struct cudaGraphRemoveDependencies_v10000_params_st {
+    cudaGraph_t graph;
+    const cudaGraphNode_t *from;
+    const cudaGraphNode_t *to;
+    size_t numDependencies;
+} cudaGraphRemoveDependencies_v10000_params;
+
+typedef struct cudaGraphRemoveDependencies_v2_v12030_params_st {
+    cudaGraph_t graph;
+    const cudaGraphNode_t *from;
+    const cudaGraphNode_t *to;
+    const cudaGraphEdgeData *edgeData;
+    size_t numDependencies;
+} cudaGraphRemoveDependencies_v2_v12030_params;
+
+typedef struct cudaGraphDestroyNode_v10000_params_st {
+    cudaGraphNode_t node;
+} cudaGraphDestroyNode_v10000_params;
+
+typedef struct cudaGraphInstantiate_v12000_params_st {
+    cudaGraphExec_t *pGraphExec;
+    cudaGraph_t graph;
+    unsigned long long flags;
+} cudaGraphInstantiate_v12000_params;
+
+typedef struct cudaGraphInstantiateWithFlags_v11040_params_st {
+    cudaGraphExec_t *pGraphExec;
+    cudaGraph_t graph;
+    unsigned long long flags;
+} cudaGraphInstantiateWithFlags_v11040_params;
+
+typedef struct cudaGraphInstantiateWithParams_ptsz_v12000_params_st {
+    cudaGraphExec_t *pGraphExec;
+    cudaGraph_t graph;
+    cudaGraphInstantiateParams *instantiateParams;
+} cudaGraphInstantiateWithParams_ptsz_v12000_params;
+
+typedef struct cudaGraphExecGetFlags_v12000_params_st {
+    cudaGraphExec_t graphExec;
+    unsigned long long *flags;
+} cudaGraphExecGetFlags_v12000_params;
+
+typedef struct cudaGraphExecKernelNodeSetParams_v10010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    const struct cudaKernelNodeParams *pNodeParams;
+} cudaGraphExecKernelNodeSetParams_v10010_params;
+
+typedef struct cudaGraphExecMemcpyNodeSetParams_v10020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    const struct cudaMemcpy3DParms *pNodeParams;
+} cudaGraphExecMemcpyNodeSetParams_v10020_params;
+
+typedef struct cudaGraphExecMemcpyNodeSetParamsToSymbol_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphExecMemcpyNodeSetParamsToSymbol_v11010_params;
+
+typedef struct cudaGraphExecMemcpyNodeSetParamsFromSymbol_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphExecMemcpyNodeSetParamsFromSymbol_v11010_params;
+
+typedef struct cudaGraphExecMemcpyNodeSetParams1D_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaGraphExecMemcpyNodeSetParams1D_v11010_params;
+
+typedef struct cudaGraphExecMemsetNodeSetParams_v10020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    const struct cudaMemsetParams *pNodeParams;
+} cudaGraphExecMemsetNodeSetParams_v10020_params;
+
+typedef struct cudaGraphExecHostNodeSetParams_v10020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    const struct cudaHostNodeParams *pNodeParams;
+} cudaGraphExecHostNodeSetParams_v10020_params;
+
+typedef struct cudaGraphExecChildGraphNodeSetParams_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    cudaGraph_t childGraph;
+} cudaGraphExecChildGraphNodeSetParams_v11010_params;
+
+typedef struct cudaGraphExecEventRecordNodeSetEvent_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    cudaEvent_t event;
+} cudaGraphExecEventRecordNodeSetEvent_v11010_params;
+
+typedef struct cudaGraphExecEventWaitNodeSetEvent_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    cudaEvent_t event;
+} cudaGraphExecEventWaitNodeSetEvent_v11010_params;
+
+typedef struct cudaGraphExecExternalSemaphoresSignalNodeSetParams_v11020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    const struct cudaExternalSemaphoreSignalNodeParams *nodeParams;
+} cudaGraphExecExternalSemaphoresSignalNodeSetParams_v11020_params;
+
+typedef struct cudaGraphExecExternalSemaphoresWaitNodeSetParams_v11020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    const struct cudaExternalSemaphoreWaitNodeParams *nodeParams;
+} cudaGraphExecExternalSemaphoresWaitNodeSetParams_v11020_params;
+
+typedef struct cudaGraphNodeSetEnabled_v11060_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    unsigned int isEnabled;
+} cudaGraphNodeSetEnabled_v11060_params;
+
+typedef struct cudaGraphNodeGetEnabled_v11060_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    unsigned int *isEnabled;
+} cudaGraphNodeGetEnabled_v11060_params;
+
+typedef struct cudaGraphExecUpdate_v10020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraph_t hGraph;
+    cudaGraphExecUpdateResultInfo *resultInfo;
+} cudaGraphExecUpdate_v10020_params;
+
+typedef struct cudaGraphUpload_ptsz_v10000_params_st {
+    cudaGraphExec_t graphExec;
+    cudaStream_t stream;
+} cudaGraphUpload_ptsz_v10000_params;
+
+typedef struct cudaGraphLaunch_ptsz_v10000_params_st {
+    cudaGraphExec_t graphExec;
+    cudaStream_t stream;
+} cudaGraphLaunch_ptsz_v10000_params;
+
+typedef struct cudaGraphExecDestroy_v10000_params_st {
+    cudaGraphExec_t graphExec;
+} cudaGraphExecDestroy_v10000_params;
+
+typedef struct cudaGraphDestroy_v10000_params_st {
+    cudaGraph_t graph;
+} cudaGraphDestroy_v10000_params;
+
+typedef struct cudaGraphDebugDotPrint_v11030_params_st {
+    cudaGraph_t graph;
+    const char *path;
+    unsigned int flags;
+} cudaGraphDebugDotPrint_v11030_params;
+
+typedef struct cudaUserObjectCreate_v11030_params_st {
+    cudaUserObject_t *object_out;
+    void *ptr;
+    cudaHostFn_t destroy;
+    unsigned int initialRefcount;
+    unsigned int flags;
+} cudaUserObjectCreate_v11030_params;
+
+typedef struct cudaUserObjectRetain_v11030_params_st {
+    cudaUserObject_t object;
+    unsigned int count;
+} cudaUserObjectRetain_v11030_params;
+
+typedef struct cudaUserObjectRelease_v11030_params_st {
+    cudaUserObject_t object;
+    unsigned int count;
+} cudaUserObjectRelease_v11030_params;
+
+typedef struct cudaGraphRetainUserObject_v11030_params_st {
+    cudaGraph_t graph;
+    cudaUserObject_t object;
+    unsigned int count;
+    unsigned int flags;
+} cudaGraphRetainUserObject_v11030_params;
+
+typedef struct cudaGraphReleaseUserObject_v11030_params_st {
+    cudaGraph_t graph;
+    cudaUserObject_t object;
+    unsigned int count;
+} cudaGraphReleaseUserObject_v11030_params;
+
+typedef struct cudaGraphAddNode_v12020_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    struct cudaGraphNodeParams *nodeParams;
+} cudaGraphAddNode_v12020_params;
+
+typedef struct cudaGraphAddNode_v2_v12030_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    const cudaGraphEdgeData *dependencyData;
+    size_t numDependencies;
+    struct cudaGraphNodeParams *nodeParams;
+} cudaGraphAddNode_v2_v12030_params;
+
+typedef struct cudaGraphNodeSetParams_v12020_params_st {
+    cudaGraphNode_t node;
+    struct cudaGraphNodeParams *nodeParams;
+} cudaGraphNodeSetParams_v12020_params;
+
+typedef struct cudaGraphExecNodeSetParams_v12020_params_st {
+    cudaGraphExec_t graphExec;
+    cudaGraphNode_t node;
+    struct cudaGraphNodeParams *nodeParams;
+} cudaGraphExecNodeSetParams_v12020_params;
+
+typedef struct cudaGraphConditionalHandleCreate_v12030_params_st {
+    cudaGraphConditionalHandle *pHandle_out;
+    cudaGraph_t graph;
+    unsigned int defaultLaunchValue;
+    unsigned int flags;
+} cudaGraphConditionalHandleCreate_v12030_params;
+
+typedef struct cudaGetDriverEntryPoint_ptsz_v11030_params_st {
+    const char *symbol;
+    void **funcPtr;
+    unsigned long long flags;
+    enum cudaDriverEntryPointQueryResult *driverStatus;
+} cudaGetDriverEntryPoint_ptsz_v11030_params;
+
+typedef struct cudaGetDriverEntryPointByVersion_ptsz_v12050_params_st {
+    const char *symbol;
+    void **funcPtr;
+    unsigned int cudaVersion;
+    unsigned long long flags;
+    enum cudaDriverEntryPointQueryResult *driverStatus;
+} cudaGetDriverEntryPointByVersion_ptsz_v12050_params;
+
+typedef struct cudaGetFuncBySymbol_v11000_params_st {
+    cudaFunction_t *functionPtr;
+    const void *symbolPtr;
+} cudaGetFuncBySymbol_v11000_params;
+
+typedef struct cudaGetKernel_v12000_params_st {
+    cudaKernel_t *kernelPtr;
+    const void *entryFuncAddr;
+} cudaGetKernel_v12000_params;
+
+typedef struct cudaMemcpy_v3020_params_st {
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy_v3020_params;
+
+typedef struct cudaMemcpyToSymbol_v3020_params_st {
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyToSymbol_v3020_params;
+
+typedef struct cudaMemcpyFromSymbol_v3020_params_st {
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyFromSymbol_v3020_params;
+
+typedef struct cudaMemcpy2D_v3020_params_st {
+    void *dst;
+    size_t dpitch;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2D_v3020_params;
+
+typedef struct cudaMemcpyToArray_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyToArray_v3020_params;
+
+typedef struct cudaMemcpy2DToArray_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DToArray_v3020_params;
+
+typedef struct cudaMemcpyFromArray_v3020_params_st {
+    void *dst;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyFromArray_v3020_params;
+
+typedef struct cudaMemcpy2DFromArray_v3020_params_st {
+    void *dst;
+    size_t dpitch;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DFromArray_v3020_params;
+
+typedef struct cudaMemcpyArrayToArray_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffsetDst;
+    size_t hOffsetDst;
+    cudaArray_const_t src;
+    size_t wOffsetSrc;
+    size_t hOffsetSrc;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyArrayToArray_v3020_params;
+
+typedef struct cudaMemcpy2DArrayToArray_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffsetDst;
+    size_t hOffsetDst;
+    cudaArray_const_t src;
+    size_t wOffsetSrc;
+    size_t hOffsetSrc;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DArrayToArray_v3020_params;
+
+typedef struct cudaMemcpy3D_v3020_params_st {
+    const struct cudaMemcpy3DParms *p;
+} cudaMemcpy3D_v3020_params;
+
+typedef struct cudaMemcpy3DPeer_v4000_params_st {
+    const struct cudaMemcpy3DPeerParms *p;
+} cudaMemcpy3DPeer_v4000_params;
+
+typedef struct cudaMemcpyBatchAsync_v12080_params_st {
+    void **dsts;
+    void **srcs;
+    size_t *sizes;
+    size_t count;
+    struct cudaMemcpyAttributes *attrs;
+    size_t *attrsIdxs;
+    size_t numAttrs;
+    size_t *failIdx;
+    cudaStream_t stream;
+} cudaMemcpyBatchAsync_v12080_params;
+
+typedef struct cudaMemcpy3DBatchAsync_v12080_params_st {
+    size_t numOps;
+    struct cudaMemcpy3DBatchOp *opList;
+    size_t *failIdx;
+    unsigned long long flags;
+    cudaStream_t stream;
+} cudaMemcpy3DBatchAsync_v12080_params;
+
+typedef struct cudaMemset_v3020_params_st {
+    void *devPtr;
+    int value;
+    size_t count;
+} cudaMemset_v3020_params;
+
+typedef struct cudaMemset2D_v3020_params_st {
+    void *devPtr;
+    size_t pitch;
+    int value;
+    size_t width;
+    size_t height;
+} cudaMemset2D_v3020_params;
+
+typedef struct cudaMemset3D_v3020_params_st {
+    struct cudaPitchedPtr pitchedDevPtr;
+    int value;
+    struct cudaExtent extent;
+} cudaMemset3D_v3020_params;
+
+typedef struct cudaMemcpyAsync_v3020_params_st {
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyAsync_v3020_params;
+
+typedef struct cudaMemcpyToSymbolAsync_v3020_params_st {
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyToSymbolAsync_v3020_params;
+
+typedef struct cudaMemcpyFromSymbolAsync_v3020_params_st {
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyFromSymbolAsync_v3020_params;
+
+typedef struct cudaMemcpy2DAsync_v3020_params_st {
+    void *dst;
+    size_t dpitch;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpy2DAsync_v3020_params;
+
+typedef struct cudaMemcpyToArrayAsync_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyToArrayAsync_v3020_params;
+
+typedef struct cudaMemcpy2DToArrayAsync_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpy2DToArrayAsync_v3020_params;
+
+typedef struct cudaMemcpyFromArrayAsync_v3020_params_st {
+    void *dst;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyFromArrayAsync_v3020_params;
+
+typedef struct cudaMemcpy2DFromArrayAsync_v3020_params_st {
+    void *dst;
+    size_t dpitch;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpy2DFromArrayAsync_v3020_params;
+
+typedef struct cudaMemcpy3DAsync_v3020_params_st {
+    const struct cudaMemcpy3DParms *p;
+    cudaStream_t stream;
+} cudaMemcpy3DAsync_v3020_params;
+
+typedef struct cudaMemcpy3DPeerAsync_v4000_params_st {
+    const struct cudaMemcpy3DPeerParms *p;
+    cudaStream_t stream;
+} cudaMemcpy3DPeerAsync_v4000_params;
+
+typedef struct cudaMemsetAsync_v3020_params_st {
+    void *devPtr;
+    int value;
+    size_t count;
+    cudaStream_t stream;
+} cudaMemsetAsync_v3020_params;
+
+typedef struct cudaMemset2DAsync_v3020_params_st {
+    void *devPtr;
+    size_t pitch;
+    int value;
+    size_t width;
+    size_t height;
+    cudaStream_t stream;
+} cudaMemset2DAsync_v3020_params;
+
+typedef struct cudaMemset3DAsync_v3020_params_st {
+    struct cudaPitchedPtr pitchedDevPtr;
+    int value;
+    struct cudaExtent extent;
+    cudaStream_t stream;
+} cudaMemset3DAsync_v3020_params;
+
+typedef struct cudaStreamQuery_v3020_params_st {
+    cudaStream_t stream;
+} cudaStreamQuery_v3020_params;
+
+typedef struct cudaStreamGetDevice_v12080_params_st {
+    cudaStream_t hStream;
+    int *device;
+} cudaStreamGetDevice_v12080_params;
+
+typedef struct cudaStreamGetFlags_v5050_params_st {
+    cudaStream_t hStream;
+    unsigned int *flags;
+} cudaStreamGetFlags_v5050_params;
+
+typedef struct cudaStreamGetId_v12000_params_st {
+    cudaStream_t hStream;
+    unsigned long long *streamId;
+} cudaStreamGetId_v12000_params;
+
+typedef struct cudaStreamGetPriority_v5050_params_st {
+    cudaStream_t hStream;
+    int *priority;
+} cudaStreamGetPriority_v5050_params;
+
+typedef struct cudaEventRecord_v3020_params_st {
+    cudaEvent_t event;
+    cudaStream_t stream;
+} cudaEventRecord_v3020_params;
+
+typedef struct cudaEventRecordWithFlags_v11010_params_st {
+    cudaEvent_t event;
+    cudaStream_t stream;
+    unsigned int flags;
+} cudaEventRecordWithFlags_v11010_params;
+
+typedef struct cudaStreamWaitEvent_v3020_params_st {
+    cudaStream_t stream;
+    cudaEvent_t event;
+    unsigned int flags;
+} cudaStreamWaitEvent_v3020_params;
+
+typedef struct cudaStreamAddCallback_v5000_params_st {
+    cudaStream_t stream;
+    cudaStreamCallback_t callback;
+    void *userData;
+    unsigned int flags;
+} cudaStreamAddCallback_v5000_params;
+
+typedef struct cudaStreamAttachMemAsync_v6000_params_st {
+    cudaStream_t stream;
+    void *devPtr;
+    size_t length;
+    unsigned int flags;
+} cudaStreamAttachMemAsync_v6000_params;
+
+typedef struct cudaStreamSynchronize_v3020_params_st {
+    cudaStream_t stream;
+} cudaStreamSynchronize_v3020_params;
+
+typedef struct cudaLaunchKernel_v7000_params_st {
+    const void *func;
+    dim3 gridDim;
+    dim3 blockDim;
+    void **args;
+    size_t sharedMem;
+    cudaStream_t stream;
+} cudaLaunchKernel_v7000_params;
+
+typedef struct cudaLaunchKernelExC_v11060_params_st {
+    const cudaLaunchConfig_t *config;
+    const void *func;
+    void **args;
+} cudaLaunchKernelExC_v11060_params;
+
+typedef struct cudaLaunchCooperativeKernel_v9000_params_st {
+    const void *func;
+    dim3 gridDim;
+    dim3 blockDim;
+    void **args;
+    size_t sharedMem;
+    cudaStream_t stream;
+} cudaLaunchCooperativeKernel_v9000_params;
+
+typedef struct cudaLaunchHostFunc_v10000_params_st {
+    cudaStream_t stream;
+    cudaHostFn_t fn;
+    void *userData;
+} cudaLaunchHostFunc_v10000_params;
+
+typedef struct cudaMemPrefetchAsync_v8000_params_st {
+    const void *devPtr;
+    size_t count;
+    int dstDevice;
+    cudaStream_t stream;
+} cudaMemPrefetchAsync_v8000_params;
+
+typedef struct cudaMemPrefetchAsync_v2_v12020_params_st {
+    const void *devPtr;
+    size_t count;
+    struct cudaMemLocation location;
+    unsigned int flags;
+    cudaStream_t stream;
+} cudaMemPrefetchAsync_v2_v12020_params;
+
+typedef struct cudaSignalExternalSemaphoresAsync_v10000_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreSignalParams_v1 *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaSignalExternalSemaphoresAsync_v10000_params;
+
+typedef struct cudaSignalExternalSemaphoresAsync_ptsz_v10000_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreSignalParams_v1 *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaSignalExternalSemaphoresAsync_ptsz_v10000_params;
+
+typedef struct cudaSignalExternalSemaphoresAsync_v2_v11020_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreSignalParams *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaSignalExternalSemaphoresAsync_v2_v11020_params;
+
+typedef struct cudaWaitExternalSemaphoresAsync_v10000_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreWaitParams_v1 *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaWaitExternalSemaphoresAsync_v10000_params;
+
+typedef struct cudaWaitExternalSemaphoresAsync_ptsz_v10000_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreWaitParams_v1 *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaWaitExternalSemaphoresAsync_ptsz_v10000_params;
+
+typedef struct cudaWaitExternalSemaphoresAsync_v2_v11020_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreWaitParams *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaWaitExternalSemaphoresAsync_v2_v11020_params;
+
+typedef struct cudaGraphInstantiateWithParams_v12000_params_st {
+    cudaGraphExec_t *pGraphExec;
+    cudaGraph_t graph;
+    cudaGraphInstantiateParams *instantiateParams;
+} cudaGraphInstantiateWithParams_v12000_params;
+
+typedef struct cudaGraphUpload_v10000_params_st {
+    cudaGraphExec_t graphExec;
+    cudaStream_t stream;
+} cudaGraphUpload_v10000_params;
+
+typedef struct cudaGraphLaunch_v10000_params_st {
+    cudaGraphExec_t graphExec;
+    cudaStream_t stream;
+} cudaGraphLaunch_v10000_params;
+
+typedef struct cudaStreamBeginCapture_v10000_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureMode mode;
+} cudaStreamBeginCapture_v10000_params;
+
+typedef struct cudaStreamBeginCaptureToGraph_v12030_params_st {
+    cudaStream_t stream;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *dependencies;
+    const cudaGraphEdgeData *dependencyData;
+    size_t numDependencies;
+    enum cudaStreamCaptureMode mode;
+} cudaStreamBeginCaptureToGraph_v12030_params;
+
+typedef struct cudaStreamEndCapture_v10000_params_st {
+    cudaStream_t stream;
+    cudaGraph_t *pGraph;
+} cudaStreamEndCapture_v10000_params;
+
+typedef struct cudaStreamIsCapturing_v10000_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *pCaptureStatus;
+} cudaStreamIsCapturing_v10000_params;
+
+typedef struct cudaStreamGetCaptureInfo_v10010_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *captureStatus_out;
+    unsigned long long *id_out;
+} cudaStreamGetCaptureInfo_v10010_params;
+
+typedef struct cudaStreamGetCaptureInfo_ptsz_v10010_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *captureStatus_out;
+    unsigned long long *id_out;
+} cudaStreamGetCaptureInfo_ptsz_v10010_params;
+
+typedef struct cudaStreamGetCaptureInfo_v2_v11030_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *captureStatus_out;
+    unsigned long long *id_out;
+    cudaGraph_t *graph_out;
+    const cudaGraphNode_t **dependencies_out;
+    size_t *numDependencies_out;
+} cudaStreamGetCaptureInfo_v2_v11030_params;
+
+typedef struct cudaStreamGetCaptureInfo_v3_v12030_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *captureStatus_out;
+    unsigned long long *id_out;
+    cudaGraph_t *graph_out;
+    const cudaGraphNode_t **dependencies_out;
+    const cudaGraphEdgeData **edgeData_out;
+    size_t *numDependencies_out;
+} cudaStreamGetCaptureInfo_v3_v12030_params;
+
+typedef struct cudaStreamUpdateCaptureDependencies_v11030_params_st {
+    cudaStream_t stream;
+    cudaGraphNode_t *dependencies;
+    size_t numDependencies;
+    unsigned int flags;
+} cudaStreamUpdateCaptureDependencies_v11030_params;
+
+typedef struct cudaStreamUpdateCaptureDependencies_v2_v12030_params_st {
+    cudaStream_t stream;
+    cudaGraphNode_t *dependencies;
+    const cudaGraphEdgeData *dependencyData;
+    size_t numDependencies;
+    unsigned int flags;
+} cudaStreamUpdateCaptureDependencies_v2_v12030_params;
+
+typedef struct cudaStreamCopyAttributes_v11000_params_st {
+    cudaStream_t dstStream;
+    cudaStream_t srcStream;
+} cudaStreamCopyAttributes_v11000_params;
+
+typedef struct cudaStreamGetAttribute_v11000_params_st {
+    cudaStream_t stream;
+    cudaStreamAttrID attr;
+    cudaStreamAttrValue *value;
+} cudaStreamGetAttribute_v11000_params;
+
+typedef struct cudaStreamSetAttribute_v11000_params_st {
+    cudaStream_t stream;
+    cudaStreamAttrID attr;
+    const cudaStreamAttrValue *param;
+} cudaStreamSetAttribute_v11000_params;
+
+typedef struct cudaMallocAsync_v11020_params_st {
+    void **devPtr;
+    size_t size;
+    cudaStream_t hStream;
+} cudaMallocAsync_v11020_params;
+
+typedef struct cudaFreeAsync_v11020_params_st {
+    void *devPtr;
+    cudaStream_t hStream;
+} cudaFreeAsync_v11020_params;
+
+typedef struct cudaMallocFromPoolAsync_v11020_params_st {
+    void **ptr;
+    size_t size;
+    cudaMemPool_t memPool;
+    cudaStream_t stream;
+} cudaMallocFromPoolAsync_v11020_params;
+
+typedef struct cudaGetDriverEntryPoint_v11030_params_st {
+    const char *symbol;
+    void **funcPtr;
+    unsigned long long flags;
+    enum cudaDriverEntryPointQueryResult *driverStatus;
+} cudaGetDriverEntryPoint_v11030_params;
+
+typedef struct cudaGetDriverEntryPointByVersion_v12050_params_st {
+    const char *symbol;
+    void **funcPtr;
+    unsigned int cudaVersion;
+    unsigned long long flags;
+    enum cudaDriverEntryPointQueryResult *driverStatus;
+} cudaGetDriverEntryPointByVersion_v12050_params;
+
+typedef struct cudaGetDeviceProperties_v3020_params_st {
+    struct cudaDeviceProp *prop;
+    int device;
+} cudaGetDeviceProperties_v3020_params;
+
+// Parameter trace structures for removed functions
+
+
+// End of parameter trace structures
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cuda_vdpau_interop_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cuda_vdpau_interop_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..88e79d1957925c4bbacd381e9461d5072de88f24
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cuda_vdpau_interop_meta.h
@@ -0,0 +1,38 @@
+// This file is generated.  Any changes you make will be lost during the next clean build.
+
+// CUDA public interface, for type definitions and api function prototypes
+#include "cuda_vdpau_interop.h"
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+// Currently used parameter trace structures
+typedef struct cudaVDPAUGetDevice_v3020_params_st {
+    int *device;
+    VdpDevice vdpDevice;
+    VdpGetProcAddress *vdpGetProcAddress;
+} cudaVDPAUGetDevice_v3020_params;
+
+typedef struct cudaVDPAUSetVDPAUDevice_v3020_params_st {
+    int device;
+    VdpDevice vdpDevice;
+    VdpGetProcAddress *vdpGetProcAddress;
+} cudaVDPAUSetVDPAUDevice_v3020_params;
+
+typedef struct cudaGraphicsVDPAURegisterVideoSurface_v3020_params_st {
+    struct cudaGraphicsResource **resource;
+    VdpVideoSurface vdpSurface;
+    unsigned int flags;
+} cudaGraphicsVDPAURegisterVideoSurface_v3020_params;
+
+typedef struct cudaGraphicsVDPAURegisterOutputSurface_v3020_params_st {
+    struct cudaGraphicsResource **resource;
+    VdpOutputSurface vdpSurface;
+    unsigned int flags;
+} cudaGraphicsVDPAURegisterOutputSurface_v3020_params;
+
+// Parameter trace structures for removed functions
+
+
+// End of parameter trace structures
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cudart_removed_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cudart_removed_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..a0fc27a71bb3fc883db9fe7562eea3f28145430d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cudart_removed_meta.h
@@ -0,0 +1,162 @@
+// This file is generated.  Any changes you make will be lost during the next clean build.
+
+// CUDA public interface, for type definitions and api function prototypes
+#include "cudart_removed.h"
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+// Currently used parameter trace structures
+typedef struct cudaStreamDestroy_v3020_params_st {
+    cudaStream_t stream;
+} cudaStreamDestroy_v3020_params;
+
+typedef struct cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000_params_st {
+    int *numBlocks;
+    const void *func;
+    size_t numDynamicSmemBytes;
+} cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000_params;
+
+typedef struct cudaConfigureCall_v3020_params_st {  // parameter record for cudaConfigureCall
+    dim3 gridDim;
+    dim3 blockDim;
+    size_t sharedMem  __dv;  // NOTE(review): `__dv` is not defined in this header; presumably a default-argument marker carried over from the prototype — confirm it expands to nothing here
+    cudaStream_t stream  __dv;  // NOTE(review): same trailing `__dv` token as on sharedMem — confirm
+} cudaConfigureCall_v3020_params;
+
+typedef struct cudaSetupArgument_v3020_params_st {
+    const void *arg;
+    size_t size;
+    size_t offset;
+} cudaSetupArgument_v3020_params;
+
+typedef struct cudaLaunch_v3020_params_st {
+    const void *func;
+} cudaLaunch_v3020_params;
+
+typedef struct cudaLaunch_ptsz_v7000_params_st {
+    const void *func;
+} cudaLaunch_ptsz_v7000_params;
+
+typedef struct cudaStreamSetFlags_v10200_params_st {
+    cudaStream_t hStream;
+    unsigned int flags;
+} cudaStreamSetFlags_v10200_params;
+
+typedef struct cudaStreamSetFlags_ptsz_v10200_params_st {
+    cudaStream_t hStream;
+    unsigned int flags;
+} cudaStreamSetFlags_ptsz_v10200_params;
+
+typedef struct cudaProfilerInitialize_v4000_params_st {
+    const char *configFile;
+    const char *outputFile;
+    cudaOutputMode_t outputMode;
+} cudaProfilerInitialize_v4000_params;
+
+typedef struct cudaThreadSetLimit_v3020_params_st {
+    enum cudaLimit limit;
+    size_t value;
+} cudaThreadSetLimit_v3020_params;
+
+typedef struct cudaThreadGetLimit_v3020_params_st {
+    size_t *pValue;
+    enum cudaLimit limit;
+} cudaThreadGetLimit_v3020_params;
+
+typedef struct cudaThreadGetCacheConfig_v3020_params_st {
+    enum cudaFuncCache *pCacheConfig;
+} cudaThreadGetCacheConfig_v3020_params;
+
+typedef struct cudaThreadSetCacheConfig_v3020_params_st {
+    enum cudaFuncCache cacheConfig;
+} cudaThreadSetCacheConfig_v3020_params;
+
+typedef struct cudaSetDoubleForDevice_v3020_params_st {
+    double *d;
+} cudaSetDoubleForDevice_v3020_params;
+
+typedef struct cudaSetDoubleForHost_v3020_params_st {
+    double *d;
+} cudaSetDoubleForHost_v3020_params;
+
+typedef struct cudaCreateTextureObject_v2_v11080_params_st {
+    cudaTextureObject_t *pTexObject;
+    const struct cudaResourceDesc *pResDesc;
+    const struct cudaTextureDesc *pTexDesc;
+    const struct cudaResourceViewDesc *pResViewDesc;
+} cudaCreateTextureObject_v2_v11080_params;
+
+typedef struct cudaGetTextureObjectTextureDesc_v2_v11080_params_st {
+    struct cudaTextureDesc *pTexDesc;
+    cudaTextureObject_t texObject;
+} cudaGetTextureObjectTextureDesc_v2_v11080_params;
+
+typedef struct cudaBindTexture_v3020_params_st {  // parameter record for cudaBindTexture
+    size_t *offset;
+    const struct textureReference *texref;
+    const void *devPtr;
+    const struct cudaChannelFormatDesc *desc;
+    size_t size  __dv;  // NOTE(review): `__dv` is not defined in this header; presumably a default-argument marker carried over from the prototype — confirm it expands to nothing here
+} cudaBindTexture_v3020_params;
+
+typedef struct cudaBindTexture2D_v3020_params_st {
+    size_t *offset;
+    const struct textureReference *texref;
+    const void *devPtr;
+    const struct cudaChannelFormatDesc *desc;
+    size_t width;
+    size_t height;
+    size_t pitch;
+} cudaBindTexture2D_v3020_params;
+
+typedef struct cudaBindTextureToArray_v3020_params_st {
+    const struct textureReference *texref;
+    cudaArray_const_t array;
+    const struct cudaChannelFormatDesc *desc;
+} cudaBindTextureToArray_v3020_params;
+
+typedef struct cudaBindTextureToMipmappedArray_v5000_params_st {
+    const struct textureReference *texref;
+    cudaMipmappedArray_const_t mipmappedArray;
+    const struct cudaChannelFormatDesc *desc;
+} cudaBindTextureToMipmappedArray_v5000_params;
+
+typedef struct cudaUnbindTexture_v3020_params_st {
+    const struct textureReference *texref;
+} cudaUnbindTexture_v3020_params;
+
+typedef struct cudaGetTextureAlignmentOffset_v3020_params_st {
+    size_t *offset;
+    const struct textureReference *texref;
+} cudaGetTextureAlignmentOffset_v3020_params;
+
+typedef struct cudaGetTextureReference_v3020_params_st {
+    const struct textureReference **texref;
+    const void *symbol;
+} cudaGetTextureReference_v3020_params;
+
+typedef struct cudaBindSurfaceToArray_v3020_params_st {
+    const struct surfaceReference *surfref;
+    cudaArray_const_t array;
+    const struct cudaChannelFormatDesc *desc;
+} cudaBindSurfaceToArray_v3020_params;
+
+typedef struct cudaGetSurfaceReference_v3020_params_st {
+    const struct surfaceReference **surfref;
+    const void *symbol;
+} cudaGetSurfaceReference_v3020_params;
+
+typedef struct cudaGraphInstantiate_v10000_params_st {
+    cudaGraphExec_t *pGraphExec;
+    cudaGraph_t graph;
+    cudaGraphNode_t *pErrorNode;
+    char *pLogBuffer;
+    size_t bufferSize;
+} cudaGraphInstantiate_v10000_params;
+
+// Parameter trace structures for removed functions
+
+
+// End of parameter trace structures
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_nvtx_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_nvtx_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed8877e21f0651fe1564151090850694eb495cfb
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_nvtx_meta.h
@@ -0,0 +1,247 @@
+/*
+ * Copyright 2013-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+typedef struct nvtxMarkEx_params_st {
+  const nvtxEventAttributes_t* eventAttrib;
+} nvtxMarkEx_params;
+
+typedef struct nvtxMarkA_params_st {
+  const char* message;
+} nvtxMarkA_params;
+
+typedef struct nvtxMarkW_params_st {
+  const wchar_t* message;
+} nvtxMarkW_params;
+
+typedef struct nvtxRangeStartEx_params_st {
+  const nvtxEventAttributes_t* eventAttrib;
+} nvtxRangeStartEx_params;
+
+typedef struct nvtxRangeStartA_params_st {
+  const char* message;
+} nvtxRangeStartA_params;
+
+typedef struct nvtxRangeStartW_params_st {
+  const wchar_t* message;
+} nvtxRangeStartW_params;
+
+typedef struct nvtxRangeEnd_params_st {
+  nvtxRangeId_t id;
+} nvtxRangeEnd_params;
+
+typedef struct nvtxRangePushEx_params_st {
+  const nvtxEventAttributes_t* eventAttrib;
+} nvtxRangePushEx_params;
+
+typedef struct nvtxRangePushA_params_st {
+  const char* message;
+} nvtxRangePushA_params;
+
+typedef struct nvtxRangePushW_params_st {
+  const wchar_t* message;
+} nvtxRangePushW_params;
+
+typedef struct nvtxRangePop_params_st {
+  /* WAR (workaround): the Windows compiler doesn't allow empty structs, so a placeholder member is declared */
+  /* This field carries no data and shouldn't be used */
+  void *dummy;
+} nvtxRangePop_params;
+
+typedef struct nvtxNameCategoryA_params_st {
+  uint32_t category;
+  const char* name;
+} nvtxNameCategoryA_params;
+
+typedef struct nvtxNameCategoryW_params_st {
+  uint32_t category;
+  const wchar_t* name;
+} nvtxNameCategoryW_params;
+
+typedef struct nvtxNameOsThreadA_params_st {
+  uint32_t threadId;
+  const char* name;
+} nvtxNameOsThreadA_params;
+
+typedef struct nvtxNameOsThreadW_params_st {
+  uint32_t threadId;
+  const wchar_t* name;
+} nvtxNameOsThreadW_params;
+
+typedef struct nvtxNameCuDeviceA_params_st {
+  CUdevice device;
+  const char* name;
+} nvtxNameCuDeviceA_params;
+
+typedef struct nvtxNameCuDeviceW_params_st {
+  CUdevice device;
+  const wchar_t* name;
+} nvtxNameCuDeviceW_params;
+
+typedef struct nvtxNameCuContextA_params_st {
+  CUcontext context;
+  const char* name;
+} nvtxNameCuContextA_params;
+
+typedef struct nvtxNameCuContextW_params_st {
+  CUcontext context;
+  const wchar_t* name;
+} nvtxNameCuContextW_params;
+
+typedef struct nvtxNameCuStreamA_params_st {
+  CUstream stream;
+  const char* name;
+} nvtxNameCuStreamA_params;
+
+typedef struct nvtxNameCuStreamW_params_st {
+  CUstream stream;
+  const wchar_t* name;
+} nvtxNameCuStreamW_params;
+
+typedef struct nvtxNameCuEventA_params_st {
+  CUevent event;
+  const char* name;
+} nvtxNameCuEventA_params;
+
+typedef struct nvtxNameCuEventW_params_st {
+  CUevent event;
+  const wchar_t* name;
+} nvtxNameCuEventW_params;
+
+typedef struct nvtxNameCudaDeviceA_params_st {
+  int device;
+  const char* name;
+} nvtxNameCudaDeviceA_params;
+
+typedef struct nvtxNameCudaDeviceW_params_st {
+  int device;
+  const wchar_t* name;
+} nvtxNameCudaDeviceW_params;
+
+typedef struct nvtxNameCudaStreamA_params_st {
+  cudaStream_t stream;
+  const char* name;
+} nvtxNameCudaStreamA_params;
+
+typedef struct nvtxNameCudaStreamW_params_st {
+  cudaStream_t stream;
+  const wchar_t* name;
+} nvtxNameCudaStreamW_params;
+
+typedef struct nvtxNameCudaEventA_params_st {
+  cudaEvent_t event;
+  const char* name;
+} nvtxNameCudaEventA_params;
+
+typedef struct nvtxNameCudaEventW_params_st {
+  cudaEvent_t event;
+  const wchar_t* name;
+} nvtxNameCudaEventW_params;
+
+typedef struct nvtxDomainCreateA_params_st {
+  const char* name;
+} nvtxDomainCreateA_params;
+
+typedef struct nvtxDomainDestroy_params_st {
+  nvtxDomainHandle_t domain;
+} nvtxDomainDestroy_params;
+
+typedef struct nvtxDomainMarkEx_params_st {  /* domain handle plus the non-domain nvtxMarkEx record */
+  nvtxDomainHandle_t domain;
+  nvtxMarkEx_params core;  /* embedded by value; its only member is eventAttrib */
+} nvtxDomainMarkEx_params;
+
+typedef struct nvtxDomainRangeStartEx_params_st {  /* domain handle plus the non-domain nvtxRangeStartEx record */
+  nvtxDomainHandle_t domain;
+  nvtxRangeStartEx_params core;  /* embedded by value; its only member is eventAttrib */
+} nvtxDomainRangeStartEx_params;
+
+typedef struct nvtxDomainRangeEnd_params_st {  /* domain handle plus the non-domain nvtxRangeEnd record */
+  nvtxDomainHandle_t domain;
+  nvtxRangeEnd_params core;  /* embedded by value; its only member is the nvtxRangeId_t id */
+} nvtxDomainRangeEnd_params;
+
+typedef struct nvtxDomainRangePushEx_params_st {  /* domain handle plus the non-domain nvtxRangePushEx record */
+  nvtxDomainHandle_t domain;
+  nvtxRangePushEx_params core;  /* embedded by value; its only member is eventAttrib */
+} nvtxDomainRangePushEx_params;
+
+typedef struct nvtxDomainRangePop_params_st {
+  nvtxDomainHandle_t domain;
+} nvtxDomainRangePop_params;
+
+typedef struct nvtxSyncUserCreate_params_st {
+  nvtxDomainHandle_t domain;
+  const nvtxSyncUserAttributes_t* attribs;
+} nvtxSyncUserCreate_params;
+
+typedef struct nvtxSyncUserCommon_params_st {
+  nvtxSyncUser_t handle;
+} nvtxSyncUserCommon_params;
+
+typedef struct nvtxDomainRegisterStringA_params_st {
+    nvtxDomainHandle_t domain;
+    const char* string;
+} nvtxDomainRegisterStringA_params;
+
+typedef struct nvtxDomainRegisterStringW_params_st {  /* same member types as the nvtxDomainRegisterStringA record */
+    nvtxDomainHandle_t domain;
+    const char* string;  /* NOTE(review): every other *W record in this file uses const wchar_t*; this one declares const char* — confirm against upstream before relying on it */
+} nvtxDomainRegisterStringW_params;
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/host_config.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/host_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..785bec4e5c0652f9605ccf9341b7f761a85471ab
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/host_config.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)  /* guard not set by the includer: emit the "must not be used directly" diagnostic */
+#if defined(_MSC_VER)  /* MSVC branch uses #pragma message; the other branch uses #warning */
+#pragma message("host_config.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "host_config.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif  /* _MSC_VER */
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__  /* set the guard before crt/host_config.h is included below */
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__  /* records that this file set the guard, so it is undone after the include */
+#endif  /* !__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ */
+
+#include "crt/host_config.h"  /* the real header; this file only forwards to it */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__)  /* true only when the guard was defined above */
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__  /* restore the includer's original (undefined) state */
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__
+#endif  /* __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/host_defines.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/host_defines.h
new file mode 100644
index 0000000000000000000000000000000000000000..98a9c98a957e8f60e872b94fde762516c5523367
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/host_defines.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)  /* guard not set by the includer: emit the "must not be used directly" diagnostic */
+#if defined(_MSC_VER)  /* MSVC branch uses #pragma message; the other branch uses #warning */
+#pragma message("host_defines.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "host_defines.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif  /* _MSC_VER */
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__  /* set the guard before crt/host_defines.h is included below */
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__  /* records that this file set the guard, so it is undone after the include */
+#endif  /* !__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ */
+
+#include "crt/host_defines.h"  /* the real header; this file only forwards to it */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__)  /* true only when the guard was defined above */
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__  /* restore the includer's original (undefined) state */
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__
+#endif  /* __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/library_types.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/library_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..35872bc5583ebcaea31251b2a0c704f3d11a7191
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/library_types.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__LIBRARY_TYPES_H__)
+#define __LIBRARY_TYPES_H__
+
+
+
+#ifndef __CUDACC_RTC_MINIMAL__
+
+typedef enum cudaDataType_t  /* data-type tags: CUDA_R_* = real, CUDA_C_* = complex; values are explicit and not listed in ascending order */
+{
+    CUDA_R_16F  =  2, /* real as a half */
+    CUDA_C_16F  =  6, /* complex as a pair of half numbers */
+    CUDA_R_16BF = 14, /* real as a nv_bfloat16 */
+    CUDA_C_16BF = 15, /* complex as a pair of nv_bfloat16 numbers */
+    CUDA_R_32F  =  0, /* real as a float */
+    CUDA_C_32F  =  4, /* complex as a pair of float numbers */
+    CUDA_R_64F  =  1, /* real as a double */
+    CUDA_C_64F  =  5, /* complex as a pair of double numbers */
+    CUDA_R_4I   = 16, /* real as a signed 4-bit int */
+    CUDA_C_4I   = 17, /* complex as a pair of signed 4-bit int numbers */
+    CUDA_R_4U   = 18, /* real as an unsigned 4-bit int */
+    CUDA_C_4U   = 19, /* complex as a pair of unsigned 4-bit int numbers */
+    CUDA_R_8I   =  3, /* real as a signed 8-bit int */
+    CUDA_C_8I   =  7, /* complex as a pair of signed 8-bit int numbers */
+    CUDA_R_8U   =  8, /* real as an unsigned 8-bit int */
+    CUDA_C_8U   =  9, /* complex as a pair of unsigned 8-bit int numbers */
+    CUDA_R_16I  = 20, /* real as a signed 16-bit int */
+    CUDA_C_16I  = 21, /* complex as a pair of signed 16-bit int numbers */
+    CUDA_R_16U  = 22, /* real as an unsigned 16-bit int */
+    CUDA_C_16U  = 23, /* complex as a pair of unsigned 16-bit int numbers */
+    CUDA_R_32I  = 10, /* real as a signed 32-bit int */
+    CUDA_C_32I  = 11, /* complex as a pair of signed 32-bit int numbers */
+    CUDA_R_32U  = 12, /* real as an unsigned 32-bit int */
+    CUDA_C_32U  = 13, /* complex as a pair of unsigned 32-bit int numbers */
+    CUDA_R_64I  = 24, /* real as a signed 64-bit int */
+    CUDA_C_64I  = 25, /* complex as a pair of signed 64-bit int numbers */
+    CUDA_R_64U  = 26, /* real as an unsigned 64-bit int */
+    CUDA_C_64U  = 27, /* complex as a pair of unsigned 64-bit int numbers */
+    CUDA_R_8F_E4M3 = 28, /* real as a nv_fp8_e4m3 */
+    CUDA_R_8F_UE4M3 = CUDA_R_8F_E4M3, /* real as an unsigned nv_fp8_e4m3 (alias: same value as CUDA_R_8F_E4M3) */
+    CUDA_R_8F_E5M2 = 29, /* real as a nv_fp8_e5m2 */
+    CUDA_R_8F_UE8M0 = 30,  /* real as an exponent-only unsigned nv_fp8_e8m0 */
+    CUDA_R_6F_E2M3  = 31,  /* real as a nv_fp6_e2m3 */
+    CUDA_R_6F_E3M2  = 32,  /* real as a nv_fp6_e3m2 */
+    CUDA_R_4F_E2M1  = 33,  /* real as a nv_fp4_e2m1 */
+} cudaDataType;
+
+
+typedef enum libraryPropertyType_t  /* selects a version component; enumerators take the implicit values 0, 1, 2 */
+{
+    MAJOR_VERSION,  /* 0 */
+    MINOR_VERSION,  /* 1 */
+    PATCH_LEVEL  /* 2 */
+} libraryPropertyType;
+
+
+#ifndef __cplusplus  /* C only: in C an enum tag is not a type name on its own */
+typedef enum cudaDataType_t cudaDataType_t;  /* lets C code write cudaDataType_t without the enum keyword */
+typedef enum libraryPropertyType_t libraryPropertyType_t;  /* same for libraryPropertyType_t */
+#endif  /* !__cplusplus */
+
+#endif  /* !__CUDACC_RTC_MINIMAL__ */
+#endif /* !__LIBRARY_TYPES_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/math_constants.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/math_constants.h
new file mode 100644
index 0000000000000000000000000000000000000000..39937e980f88a614d847154f9e4364bd9ba95cbd
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/math_constants.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__MATH_CONSTANTS_H__)
+#define __MATH_CONSTANTS_H__
+
+/* single precision constants (float literals; the first five are given as 32-bit patterns passed to __int_as_float, which is not defined in this header) */
+#define CUDART_INF_F            __int_as_float(0x7f800000U)  /* pattern: exponent bits all ones, mantissa zero */
+#define CUDART_NAN_F            __int_as_float(0x7fffffffU)  /* pattern: exponent and mantissa bits all ones */
+#define CUDART_MIN_DENORM_F     __int_as_float(0x00000001U)  /* pattern: only the lowest mantissa bit set */
+#define CUDART_MAX_NORMAL_F     __int_as_float(0x7f7fffffU)  /* pattern: largest exponent below all-ones, mantissa all ones */
+#define CUDART_NEG_ZERO_F       __int_as_float(0x80000000U)  /* pattern: sign bit only */
+#define CUDART_ZERO_F           0.0F
+#define CUDART_ONE_F            1.0F
+#define CUDART_SQRT_HALF_F      0.707106781F  /* sqrt(1/2) */
+#define CUDART_SQRT_HALF_HI_F   0.707106781F  /* same literal as CUDART_SQRT_HALF_F */
+#define CUDART_SQRT_HALF_LO_F   1.210161749e-08F  /* presumably the low-order companion of SQRT_HALF_HI_F — confirm */
+#define CUDART_SQRT_TWO_F       1.414213562F  /* sqrt(2) */
+#define CUDART_THIRD_F          0.333333333F  /* 1/3 */
+#define CUDART_PIO4_F           0.785398163F  /* pi/4 */
+#define CUDART_PIO2_F           1.570796327F  /* pi/2 */
+#define CUDART_3PIO4_F          2.356194490F  /* 3*pi/4 */
+#define CUDART_2_OVER_PI_F      0.636619772F  /* 2/pi */
+#define CUDART_SQRT_2_OVER_PI_F 0.797884561F  /* sqrt(2/pi) */
+#define CUDART_PI_F             3.141592654F  /* pi */
+#define CUDART_L2E_F            1.442695041F  /* log2(e) */
+#define CUDART_L2T_F            3.321928094F  /* log2(10) */
+#define CUDART_LG2_F            0.301029996F  /* log10(2) */
+#define CUDART_LGE_F            0.434294482F  /* log10(e) */
+#define CUDART_LN2_F            0.693147181F  /* ln(2) */
+#define CUDART_LNT_F            2.302585093F  /* ln(10) */
+#define CUDART_LNPI_F           1.144729886F  /* ln(pi) */
+#define CUDART_TWO_TO_M126_F    1.175494351e-38F  /* 2^-126 */
+#define CUDART_TWO_TO_126_F     8.507059173e37F  /* 2^126 */
+#define CUDART_NORM_HUGE_F      3.402823466e38F  /* numerically the largest finite float */
+#define CUDART_TWO_TO_23_F      8388608.0F  /* 2^23 */
+#define CUDART_TWO_TO_24_F      16777216.0F  /* 2^24 */
+#define CUDART_TWO_TO_31_F      2147483648.0F  /* 2^31 */
+#define CUDART_TWO_TO_32_F      4294967296.0F  /* 2^32 */
+#define CUDART_REMQUO_BITS_F    3U
+#define CUDART_REMQUO_MASK_F    (~((~0U)<<CUDART_REMQUO_BITS_F))  /* mask with the low CUDART_REMQUO_BITS_F bits set (0x7 for 3 bits) */
+#define CUDART_TRIG_PLOSS_F     105615.0F  /* NOTE(review): meaning not stated here; presumably an argument-magnitude threshold for trig routines — confirm */
+
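+/* double precision constants; bit-pattern values go through __longlong_as_double, each *_HI repeats its base constant, and each *_LO is a much smaller companion term */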
+#define CUDART_INF              __longlong_as_double(0x7ff0000000000000ULL)
+#define CUDART_NAN              __longlong_as_double(0xfff8000000000000ULL)
+#define CUDART_NEG_ZERO         __longlong_as_double(0x8000000000000000ULL)
+#define CUDART_MIN_DENORM       __longlong_as_double(0x0000000000000001ULL)
+#define CUDART_ZERO             0.0
+#define CUDART_ONE              1.0
+#define CUDART_SQRT_TWO         1.4142135623730951e+0
+#define CUDART_SQRT_HALF        7.0710678118654757e-1
+#define CUDART_SQRT_HALF_HI     7.0710678118654757e-1
+#define CUDART_SQRT_HALF_LO   (-4.8336466567264567e-17)
+#define CUDART_THIRD            3.3333333333333333e-1
+#define CUDART_TWOTHIRD         6.6666666666666667e-1
+#define CUDART_PIO4             7.8539816339744828e-1
+#define CUDART_PIO4_HI          7.8539816339744828e-1
+#define CUDART_PIO4_LO          3.0616169978683830e-17
+#define CUDART_PIO2             1.5707963267948966e+0
+#define CUDART_PIO2_HI          1.5707963267948966e+0
+#define CUDART_PIO2_LO          6.1232339957367660e-17
+#define CUDART_3PIO4            2.3561944901923448e+0
+#define CUDART_2_OVER_PI        6.3661977236758138e-1
+#define CUDART_PI               3.1415926535897931e+0
+#define CUDART_PI_HI            3.1415926535897931e+0
+#define CUDART_PI_LO            1.2246467991473532e-16
+#define CUDART_SQRT_2PI         2.5066282746310007e+0
+#define CUDART_SQRT_2PI_HI      2.5066282746310007e+0
+#define CUDART_SQRT_2PI_LO    (-1.8328579980459167e-16)
+#define CUDART_SQRT_PIO2        1.2533141373155003e+0
+#define CUDART_SQRT_PIO2_HI     1.2533141373155003e+0
+#define CUDART_SQRT_PIO2_LO   (-9.1642899902295834e-17)
+#define CUDART_SQRT_2OPI        7.9788456080286536e-1
+#define CUDART_L2E              1.4426950408889634e+0
+#define CUDART_L2E_HI           1.4426950408889634e+0
+#define CUDART_L2E_LO           2.0355273740931033e-17
+#define CUDART_L2T              3.3219280948873622e+0
+#define CUDART_LG2              3.0102999566398120e-1
+#define CUDART_LG2_HI           3.0102999566398120e-1
+#define CUDART_LG2_LO         (-2.8037281277851704e-18)
+#define CUDART_LGE              4.3429448190325182e-1
+#define CUDART_LGE_HI           4.3429448190325182e-1
+#define CUDART_LGE_LO           1.09831965021676510e-17
+#define CUDART_LN2              6.9314718055994529e-1
+#define CUDART_LN2_HI           6.9314718055994529e-1
+#define CUDART_LN2_LO           2.3190468138462996e-17
+#define CUDART_LNT              2.3025850929940459e+0
+#define CUDART_LNT_HI           2.3025850929940459e+0
+#define CUDART_LNT_LO         (-2.1707562233822494e-16)
+#define CUDART_LNPI             1.1447298858494002e+0
+#define CUDART_LN2_X_1024       7.0978271289338397e+2
+#define CUDART_LN2_X_1025       7.1047586007394398e+2
+#define CUDART_LN2_X_1075       7.4513321910194122e+2
+#define CUDART_LG2_X_1024       3.0825471555991675e+2
+#define CUDART_LG2_X_1075       3.2360724533877976e+2
+#define CUDART_TWO_TO_23        8388608.0
+#define CUDART_TWO_TO_52        4503599627370496.0
+#define CUDART_TWO_TO_53        9007199254740992.0
+#define CUDART_TWO_TO_54        18014398509481984.0
+#define CUDART_TWO_TO_M54       5.5511151231257827e-17
+#define CUDART_TWO_TO_M1022     2.22507385850720140e-308
+#define CUDART_TRIG_PLOSS       2147483648.0
+#define CUDART_DBL2INT_CVT      6755399441055744.0
+
+#endif /* !__MATH_CONSTANTS_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/math_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/math_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc806976784e494edc905d8b8bd9ad138054bbea
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/math_functions.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("math_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "math_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__
+#endif
+
+#include "crt/math_functions.h"
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/mma.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/mma.h
new file mode 100644
index 0000000000000000000000000000000000000000..9f36f671c0b3a4e95cbb7bddbe41e75ac783b722
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/mma.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__
+#endif
+
+#include "crt/mma.h"
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/nvPTXCompiler.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/nvPTXCompiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..5eeac41b766cdecf8b38552578a21cb6dbfa4fd0
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/nvPTXCompiler.h
@@ -0,0 +1,328 @@
+/*
+ * NVIDIA_COPYRIGHT_BEGIN
+ *
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ * NVIDIA_COPYRIGHT_END
+ */
+
+#ifndef nvPTXCompiler_INCLUDED
+#define nvPTXCompiler_INCLUDED
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* --- Dependency --- */
+#include <stddef.h> /* For size_t */
+
+/*************************************************************************/ /**
+ *
+ * \defgroup handle PTX-Compiler Handle
+ *
+ ****************************************************************************/
+
+
+/**
+ * \ingroup handle
+ * \brief   nvPTXCompilerHandle represents a handle to the PTX Compiler.
+ *
+ * To compile a PTX program string, an instance of nvPTXCompiler
+ * must be created and the handle to it must be obtained using the
+ * API nvPTXCompilerCreate(). Then the compilation can be done
+ * using the API nvPTXCompilerCompile().
+ *
+ */
+typedef struct nvPTXCompiler *nvPTXCompilerHandle;
+
+/**
+ *
+ * \defgroup error Error codes
+ *
+ */
+
+/** \ingroup error
+ *
+ * \brief     The nvPTXCompiler APIs return the nvPTXCompileResult codes to indicate the call result
+ */
+
+typedef enum {
+
+    /* Indicates the API completed successfully */
+    NVPTXCOMPILE_SUCCESS                              = 0,
+
+    /* Indicates an invalid nvPTXCompilerHandle was passed to the API */
+    NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE        = 1,
+
+    /* Indicates invalid inputs were given to the API  */
+    NVPTXCOMPILE_ERROR_INVALID_INPUT                  = 2,
+
+    /* Indicates that the compilation of the PTX program failed */
+    NVPTXCOMPILE_ERROR_COMPILATION_FAILURE            = 3,
+
+    /* Indicates that something went wrong internally */
+    NVPTXCOMPILE_ERROR_INTERNAL                       = 4,
+
+    /* Indicates that the API was unable to allocate memory */
+    NVPTXCOMPILE_ERROR_OUT_OF_MEMORY                  = 5,
+
+    /* Indicates that the handle was passed to an API which expected */
+    /* the nvPTXCompilerCompile() to have been called previously */
+    NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE = 6,
+
+    /* Indicates that the PTX version encountered in the PTX is not */
+    /* supported by the current compiler */
+    NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION        = 7,
+
+    /* Indicates that device side sync is not supported by the SM version */
+    NVPTXCOMPILE_ERROR_UNSUPPORTED_DEVSIDE_SYNC       = 8,
+
+    /* Indicates that compilation has been cancelled by the user */
+    NVPTXCOMPILE_ERROR_CANCELLED                      = 9,
+} nvPTXCompileResult;
+
+/* ----------------------------- PTX Compiler APIs ---------------------------- */
+
+/**
+ *
+ * \defgroup versioning API Versioning
+ *
+ * The PTX compiler APIs are versioned so that any new features or API
+ * changes can be done by bumping up the API version.
+ */
+
+/** \ingroup versioning
+ *
+ * \brief            Queries the current \p major and \p minor version of
+ *                   PTX Compiler APIs being used
+ *
+ * \param            [out] major   Major version of the PTX Compiler APIs
+ * \param            [out] minor   Minor version of the PTX Compiler APIs
+ * \note                           The version of PTX Compiler APIs follows the CUDA Toolkit versioning.
+ *                                 The PTX ISA version supported by a PTX Compiler API version is listed
+ *                                 <a href="https://docs.nvidia.com/cuda/parallel-thread-execution/#release-notes">here</a>.
+ *
+ * \return
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \endlink
+ */
+nvPTXCompileResult nvPTXCompilerGetVersion(unsigned int *major, unsigned int *minor);
+
+/**
+ *
+ * \defgroup compilation Compilation APIs
+ *
+ */
+
+/** \ingroup compilation
+ *
+ * \brief            Obtains the handle to an instance of the PTX compiler
+ *                   initialized with the given PTX program \p ptxCode
+ *
+ * \param            [out] compiler  Returns a handle to PTX compiler initialized
+ *                                   with the PTX program \p ptxCode
+ * \param            [in] ptxCodeLen Size of the PTX program \p ptxCode passed as string
+ * \param            [in] ptxCode    The PTX program which is to be compiled passed as string.
+ *
+ *
+ * \return
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_OUT_OF_MEMORY \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \endlink
+ */
+nvPTXCompileResult nvPTXCompilerCreate(nvPTXCompilerHandle *compiler, size_t ptxCodeLen, const char *ptxCode);
+
+/** \ingroup compilation
+ *
+ * \brief            Destroys and cleans the already created PTX compiler
+ *
+ * \param            [in] compiler  A handle to the PTX compiler which is to be destroyed
+ *
+ * \return
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_OUT_OF_MEMORY \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE \endlink
+ *
+ */
+nvPTXCompileResult nvPTXCompilerDestroy(nvPTXCompilerHandle *compiler);
+
+/** \ingroup compilation
+ *
+ * \brief          Compile a PTX program with the given compiler options
+ *
+ * \param            [in,out] compiler      A handle to PTX compiler initialized with the
+ *                                          PTX program which is to be compiled.
+ *                                          The compiled program can be accessed using the handle
+ * \param            [in] numCompileOptions Length of the array \p compileOptions
+ * \param            [in] compileOptions   Compiler options with which compilation should be done.
+ *                                         The compiler options string is a null terminated character array.
+ *                                         A valid list of compiler options is at
+ *                                         <a href="http://docs.nvidia.com/cuda/ptx-compiler-api/index.html#compile-options">link</a>.
+ * \note                                   --gpu-name (-arch) is a mandatory option.
+ *
+ * \return
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_OUT_OF_MEMORY \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_COMPILATION_FAILURE  \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION  \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_UNSUPPORTED_DEVSIDE_SYNC \endlink
+ *
+ */
+nvPTXCompileResult nvPTXCompilerCompile(nvPTXCompilerHandle compiler, int numCompileOptions,
+                                        const char *const *compileOptions);
+
+/** \ingroup compilation
+ *
+ * \brief            Obtains the size of the image of the compiled program
+ *
+ * \param            [in] compiler          A handle to PTX compiler on which nvPTXCompilerCompile() has been performed.
+ * \param            [out] binaryImageSize  The size of the image of the compiled program
+ *
+ * \return
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE \endlink
+ *
+ * \note             nvPTXCompilerCompile() API should be invoked for the handle before calling this API.
+ *                   Otherwise, NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE is returned.
+ */
+nvPTXCompileResult nvPTXCompilerGetCompiledProgramSize(nvPTXCompilerHandle compiler, size_t *binaryImageSize);
+
+/** \ingroup compilation
+ *
+ * \brief            Obtains the image of the compiled program
+ *
+ * \param            [in] compiler          A handle to PTX compiler on which nvPTXCompilerCompile() has been performed.
+ * \param            [out] binaryImage      The image of the compiled program.
+ *                                         Client should allocate memory for \p binaryImage
+ *
+ * \return
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE \endlink
+ *
+ * \note             nvPTXCompilerCompile() API should be invoked for the handle before calling this API.
+ *                   Otherwise, NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE is returned.
+ *
+ */
+
+nvPTXCompileResult nvPTXCompilerGetCompiledProgram(nvPTXCompilerHandle compiler, void *binaryImage);
+
+/** \ingroup compilation
+ *
+ * \brief            Query the size of the error message that was seen previously for the handle
+ *
+ * \param            [in] compiler          A handle to PTX compiler on which nvPTXCompilerCompile() has been performed.
+ * \param            [out] errorLogSize     The size of the error log in bytes which was produced
+ *                                          in previous call to nvPTXCompilerCompile().
+ *
+ * \return
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE \endlink
+ *
+ */
+nvPTXCompileResult nvPTXCompilerGetErrorLogSize(nvPTXCompilerHandle compiler, size_t *errorLogSize);
+
+/** \ingroup compilation
+ *
+ * \brief            Query the error message that was seen previously for the handle
+ *
+ * \param            [in] compiler         A handle to PTX compiler on which nvPTXCompilerCompile() has been performed.
+ * \param            [out] errorLog        The error log which was produced in previous call to nvPTXCompilerCompile().
+ *                                         Clients should allocate memory for \p errorLog
+ *
+ * \return
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE \endlink
+ *
+ */
+nvPTXCompileResult nvPTXCompilerGetErrorLog(nvPTXCompilerHandle compiler, char *errorLog);
+
+/** \ingroup compilation
+ *
+ * \brief            Query the size of the information message that was seen previously for the handle
+ *
+ * \param            [in] compiler        A handle to PTX compiler on which nvPTXCompilerCompile() has been performed.
+ * \param            [out] infoLogSize    The size of the information log in bytes which was produced
+ *                                         in previous call to nvPTXCompilerCompile().
+ *
+ * \return
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE \endlink
+ *
+ */
+nvPTXCompileResult nvPTXCompilerGetInfoLogSize(nvPTXCompilerHandle compiler, size_t *infoLogSize);
+
+/** \ingroup compilation
+ *
+ * \brief           Query the information message that was seen previously for the handle
+ *
+ * \param            [in] compiler        A handle to PTX compiler on which nvPTXCompilerCompile() has been performed.
+ * \param            [out] infoLog        The information log which was produced in previous call to nvPTXCompilerCompile().
+ *                                        Clients should allocate memory for \p infoLog
+ *
+ * \return
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE \endlink
+ *
+ */
+nvPTXCompileResult nvPTXCompilerGetInfoLog(nvPTXCompilerHandle compiler, char *infoLog);
+
+/** \ingroup compilation
+ *
+ * \brief           Register a callback function that the compiler will invoke at different phases of
+ *                  PTX Compilation during a call to nvPTXCompilerCompile().
+ *                  The callback function decides whether to cancel the compilation through its return value.
+ *
+ *                  Callback function must satisfy the following constraints
+ *                  (1) Its signature should be
+ *                      @code
+ *                      int callback(void* param1, void* param2);
+ *                      @endcode
+ *                      When invoking the callback, the compiler will always pass \p payload to
+ *                      param1 so that the callback may make decisions based on \p payload . It'll
+ *                      always pass NULL to param2 for now which is reserved for future extensions.
+ *
+ *                  (2) It must return 1 to cancel compilation or 0 to continue.
+ *                      Other return values are reserved for future use.
+ *
+ *                  (3) It must return consistent values. Once it returns 1 at one point, it must
+ *                      return 1 in all following invocations during the current nvPTXCompilerCompile
+ *                      call in progress.
+ *
+ *                  (4) It must be thread-safe.
+ *
+ *                  (5) It must not invoke any nvrtc/libnvvm/ptx APIs.
+ *
+ * \param            [in] compiler        A handle to an initialized PTX compiler in which to introduce the callback.
+ * \param            [in] callback        Function pointer to the callback function.
+ * \param            [in] payload         payload to be passed as a parameter when invoking the callback.
+ * \return
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE \endlink
+ *
+ */
+nvPTXCompileResult nvPTXCompilerSetFlowCallback(nvPTXCompilerHandle compiler, int (*callback)(void *, void *),
+                                                void *payload);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // nvPTXCompiler_INCLUDED
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/nvfunctional b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/nvfunctional
new file mode 100644
index 0000000000000000000000000000000000000000..4fdeeecf6b63f92c5d684a03bb461cd935c0fd35
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/nvfunctional
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2014-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H_WRAPPER__
+#endif
+
+#include "crt/nvfunctional"
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H_WRAPPER__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H_WRAPPER__
+#endif
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_common.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ed01f7bc2851f43678e58efe34fc5579cca3a35
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_common.h
@@ -0,0 +1,393 @@
+#ifndef NVPERF_COMMON_H
+#define NVPERF_COMMON_H
+
+/*
+ * Copyright 2014-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO USER:
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and
+ * international Copyright laws.
+ *
+ * This software and the information contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
+ * of a form of NVIDIA software license agreement.
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+ * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+ * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOURCE CODE.
+ *
+ * U.S. Government End Users.   This source code is a "commercial item" as
+ * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
+ * "commercial computer  software"  and "commercial computer software
+ * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
+ * and is provided to the U.S. Government only as a commercial end item.
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+ * source code with only those rights set forth herein.
+ *
+ * Any use of this source code in individual and commercial software must
+ * include, in the user documentation and internal comments to the code,
+ * the above Disclaimer and U.S. Government End Users Notice.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB) // default visibility for what follows; popped at the end of this header
+    #pragma GCC visibility push(default)
+    #if !defined(NVPW_LOCAL)
+        #define NVPW_LOCAL __attribute__ ((visibility ("hidden")))
+    #endif
+#else // not (__GNUC__ && NVPA_SHARED_LIB): NVPW_LOCAL expands to nothing
+    #if !defined(NVPW_LOCAL)
+        #define NVPW_LOCAL
+    #endif
+#endif // __GNUC__ && NVPA_SHARED_LIB
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ *  @file   nvperf_common.h
+ */
+
+#ifndef NVPERF_NVPA_STATUS_DEFINED
+#define NVPERF_NVPA_STATUS_DEFINED
+
+    /// Error codes returned by NvPerf API functions.
+    typedef enum NVPA_Status
+    {
+        /// Success
+        NVPA_STATUS_SUCCESS = 0,
+        /// Generic error.
+        NVPA_STATUS_ERROR = 1,
+        /// Internal error.  Please file a bug!
+        NVPA_STATUS_INTERNAL_ERROR = 2,
+        /// NVPW_InitializeTarget() or NVPW_InitializeHost() has not been called yet.
+        NVPA_STATUS_NOT_INITIALIZED = 3,
+        /// The NvPerf DLL/DSO could not be loaded during NVPW_Initialize*(). Please ensure they are placed in the
+        /// appropriate location that can be found by a dynamic linker. And on Linux systems, confirm that the
+        /// LD_LIBRARY_PATH environment variable is set correctly. Alternatively, you may utilize
+        /// NVPW_SetLibraryLoadPaths() to define additional library search paths.
+        NVPA_STATUS_NOT_LOADED = 4,
+        /// The function was not found in this version of the NvPerf DLL/DSO. Or if you are directly calling
+        /// NVPA_GetProcAddress(), please ensure the function name is spelled correctly.
+        NVPA_STATUS_FUNCTION_NOT_FOUND = 5,
+        /// The request was intentionally not supported.
+        NVPA_STATUS_NOT_SUPPORTED = 6,
+        /// The request was not implemented by this version.
+        NVPA_STATUS_NOT_IMPLEMENTED = 7,
+        /// Invalid argument.
+        NVPA_STATUS_INVALID_ARGUMENT = 8,
+        /// UNUSED
+        NVPA_STATUS_INVALID_METRIC_ID = 9,
+        /// No driver has been loaded via NVPW_*_LoadDriver().
+        NVPA_STATUS_DRIVER_NOT_LOADED = 10,
+        /// Failed memory allocation.
+        NVPA_STATUS_OUT_OF_MEMORY = 11,
+        /// UNUSED
+        NVPA_STATUS_INVALID_THREAD_STATE = 12,
+        /// UNUSED
+        NVPA_STATUS_FAILED_CONTEXT_ALLOC = 13,
+        /// The specified GPU is not supported. It is recommended to call IsGpuSupported() for more information.
+        NVPA_STATUS_UNSUPPORTED_GPU = 14,
+        /// The installed NVIDIA driver is too old.
+        NVPA_STATUS_INSUFFICIENT_DRIVER_VERSION = 15,
+        /// UNUSED
+        NVPA_STATUS_OBJECT_NOT_REGISTERED = 16,
+        /// Profiling permission not granted; see
+        /// https://developer.nvidia.com/nvidia-development-tools-solutions-ERR_NVGPUCTRPERM-permission-issue-performance-counters
+        NVPA_STATUS_INSUFFICIENT_PRIVILEGE = 17,
+        /// UNUSED
+        NVPA_STATUS_INVALID_CONTEXT_STATE = 18,
+        /// UNUSED
+        NVPA_STATUS_INVALID_OBJECT_STATE = 19,
+        /// The request could not be fulfilled because a system resource is already in use.
+        NVPA_STATUS_RESOURCE_UNAVAILABLE = 20,
+        /// UNUSED
+        NVPA_STATUS_DRIVER_LOADED_TOO_LATE = 21,
+        /// The provided buffer is not large enough.
+        NVPA_STATUS_INSUFFICIENT_SPACE = 22,
+        /// UNUSED
+        NVPA_STATUS_OBJECT_MISMATCH = 23,
+        /// Virtualized GPU (vGPU) is not supported.
+        NVPA_STATUS_VIRTUALIZED_DEVICE_NOT_SUPPORTED = 24,
+        /// Profiling permission was not granted or the device was disabled.
+        NVPA_STATUS_PROFILING_NOT_ALLOWED = 25,
+        NVPA_STATUS__COUNT // sentinel: number of status codes defined above; keep last
+    } NVPA_Status;
+
+    /// Stores the enumerator name of 'status' in *ppStatusStr and its description in *ppCommentStr (string literals);
+    /// both pointers must be non-NULL.  'static inline' keeps the definition local to each including translation unit.
+    static inline void NVPW_NVPAStatusToString(NVPA_Status status, const char** ppStatusStr, const char** ppCommentStr)
+    {
+        switch (status)
+        {
+            case NVPA_STATUS_SUCCESS:
+                *ppStatusStr = "NVPA_STATUS_SUCCESS";
+                *ppCommentStr = "Success";
+                return;
+            case NVPA_STATUS_ERROR:
+                *ppStatusStr = "NVPA_STATUS_ERROR";
+                *ppCommentStr = "Generic error.";
+                return;
+            case NVPA_STATUS_INTERNAL_ERROR:
+                *ppStatusStr = "NVPA_STATUS_INTERNAL_ERROR";
+                *ppCommentStr = "Internal error.  Please file a bug!";
+                return;
+            case NVPA_STATUS_NOT_INITIALIZED:
+                *ppStatusStr = "NVPA_STATUS_NOT_INITIALIZED";
+                *ppCommentStr = "NVPW_InitializeTarget() or NVPW_InitializeHost() has not been called yet.";
+                return;
+            case NVPA_STATUS_NOT_LOADED:
+                *ppStatusStr = "NVPA_STATUS_NOT_LOADED";
+                *ppCommentStr = "The NvPerf DLL/DSO could not be loaded during NVPW_Initialize*(). Please ensure they are placed in the appropriate location that can be found by a dynamic linker. And on Linux systems, confirm that the LD_LIBRARY_PATH environment variable is set correctly. Alternatively, you may utilize NVPW_SetLibraryLoadPaths() to define additional library search paths.";
+                return;
+            case NVPA_STATUS_FUNCTION_NOT_FOUND:
+                *ppStatusStr = "NVPA_STATUS_FUNCTION_NOT_FOUND";
+                *ppCommentStr = "The function was not found in this version of the NvPerf DLL/DSO. Or if you are directly calling NVPA_GetProcAddress(), please ensure the function name is spelled correctly.";
+                return;
+            case NVPA_STATUS_NOT_SUPPORTED:
+                *ppStatusStr = "NVPA_STATUS_NOT_SUPPORTED";
+                *ppCommentStr = "The request was intentionally not supported.";
+                return;
+            case NVPA_STATUS_NOT_IMPLEMENTED:
+                *ppStatusStr = "NVPA_STATUS_NOT_IMPLEMENTED";
+                *ppCommentStr = "The request was not implemented by this version.";
+                return;
+            case NVPA_STATUS_INVALID_ARGUMENT:
+                *ppStatusStr = "NVPA_STATUS_INVALID_ARGUMENT";
+                *ppCommentStr = "Invalid argument.";
+                return;
+            case NVPA_STATUS_INVALID_METRIC_ID:
+                *ppStatusStr = "NVPA_STATUS_INVALID_METRIC_ID";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_DRIVER_NOT_LOADED:
+                *ppStatusStr = "NVPA_STATUS_DRIVER_NOT_LOADED";
+                *ppCommentStr = "No driver has been loaded via NVPW_*_LoadDriver().";
+                return;
+            case NVPA_STATUS_OUT_OF_MEMORY:
+                *ppStatusStr = "NVPA_STATUS_OUT_OF_MEMORY";
+                *ppCommentStr = "Failed memory allocation.";
+                return;
+            case NVPA_STATUS_INVALID_THREAD_STATE:
+                *ppStatusStr = "NVPA_STATUS_INVALID_THREAD_STATE";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_FAILED_CONTEXT_ALLOC:
+                *ppStatusStr = "NVPA_STATUS_FAILED_CONTEXT_ALLOC";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_UNSUPPORTED_GPU:
+                *ppStatusStr = "NVPA_STATUS_UNSUPPORTED_GPU";
+                *ppCommentStr = "The specified GPU is not supported. It is recommended to call IsGpuSupported() for more information";
+                return;
+            case NVPA_STATUS_INSUFFICIENT_DRIVER_VERSION:
+                *ppStatusStr = "NVPA_STATUS_INSUFFICIENT_DRIVER_VERSION";
+                *ppCommentStr = "The installed NVIDIA driver is too old.";
+                return;
+            case NVPA_STATUS_OBJECT_NOT_REGISTERED:
+                *ppStatusStr = "NVPA_STATUS_OBJECT_NOT_REGISTERED";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_INSUFFICIENT_PRIVILEGE:
+                *ppStatusStr = "NVPA_STATUS_INSUFFICIENT_PRIVILEGE";
+                *ppCommentStr = "Profiling permission not granted; see https://developer.nvidia.com/nvidia-development-tools-solutions-ERR_NVGPUCTRPERM-permission-issue-performance-counters";
+                return;
+            case NVPA_STATUS_INVALID_CONTEXT_STATE:
+                *ppStatusStr = "NVPA_STATUS_INVALID_CONTEXT_STATE";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_INVALID_OBJECT_STATE:
+                *ppStatusStr = "NVPA_STATUS_INVALID_OBJECT_STATE";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_RESOURCE_UNAVAILABLE:
+                *ppStatusStr = "NVPA_STATUS_RESOURCE_UNAVAILABLE";
+                *ppCommentStr = "The request could not be fulfilled because a system resource is already in use.";
+                return;
+            case NVPA_STATUS_DRIVER_LOADED_TOO_LATE:
+                *ppStatusStr = "NVPA_STATUS_DRIVER_LOADED_TOO_LATE";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_INSUFFICIENT_SPACE:
+                *ppStatusStr = "NVPA_STATUS_INSUFFICIENT_SPACE";
+                *ppCommentStr = "The provided buffer is not large enough.";
+                return;
+            case NVPA_STATUS_OBJECT_MISMATCH:
+                *ppStatusStr = "NVPA_STATUS_OBJECT_MISMATCH";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_VIRTUALIZED_DEVICE_NOT_SUPPORTED:
+                *ppStatusStr = "NVPA_STATUS_VIRTUALIZED_DEVICE_NOT_SUPPORTED";
+                *ppCommentStr = "Virtualized GPU (vGPU) is not supported.";
+                return;
+            case NVPA_STATUS_PROFILING_NOT_ALLOWED:
+                *ppStatusStr = "NVPA_STATUS_PROFILING_NOT_ALLOWED";
+                *ppCommentStr = "Profiling permission was not granted or the device was disabled.";
+                return;
+            default:
+                /* Covers NVPA_STATUS__COUNT and any value this header does not know about. */
+                *ppStatusStr = "Unrecognized status";
+                *ppCommentStr = "This status is unrecognized. Is it coming from a newer version of NvPerf library?";
+                return;
+        }
+    }
+
+#endif // NVPERF_NVPA_STATUS_DEFINED
+
+
+#ifndef NVPERF_NVPA_ACTIVITY_KIND_DEFINED
+#define NVPERF_NVPA_ACTIVITY_KIND_DEFINED
+
+    /// The configuration's activity-kind dictates which types of data may be collected.
+    typedef enum NVPA_ActivityKind
+    {
+        /// Invalid value.
+        NVPA_ACTIVITY_KIND_INVALID = 0,
+        /// A workload-centric activity for serialized and pipelined collection.
+        ///
+        /// Profiler is capable of collecting both serialized and pipelined metrics.  The library introduces any
+        /// synchronization required to collect serialized metrics.
+        NVPA_ACTIVITY_KIND_PROFILER,
+        /// A realtime activity for sampling counters from the CPU or GPU.
+        NVPA_ACTIVITY_KIND_REALTIME_SAMPLED,
+        /// A realtime activity for profiling counters from the CPU or GPU without CPU/GPU synchronizations.
+        NVPA_ACTIVITY_KIND_REALTIME_PROFILER,
+        NVPA_ACTIVITY_KIND__COUNT // sentinel: number of enumerators above (including INVALID); keep last
+    } NVPA_ActivityKind;
+
+
+#endif // NVPERF_NVPA_ACTIVITY_KIND_DEFINED
+
+
+#ifndef NVPERF_NVPA_BOOL_DEFINED
+#define NVPERF_NVPA_BOOL_DEFINED
+    /// The type used for boolean values.
+    typedef uint8_t NVPA_Bool;
+#endif // NVPERF_NVPA_BOOL_DEFINED
+/// Size in bytes of type_ from its start through the end of lastfield_: offsetof(lastfield_) + sizeof(lastfield_).
+#ifndef NVPA_STRUCT_SIZE
+#define NVPA_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif // NVPA_STRUCT_SIZE
+/// Nonzero when (pParams_)->structSize reaches the end of field name_ (byte offset of name_ plus its size).
+#ifndef NVPW_FIELD_EXISTS
+#define NVPW_FIELD_EXISTS(pParams_, name_) \
+    ((pParams_)->structSize >= (size_t)((const uint8_t*)(&(pParams_)->name_) + sizeof(pParams_)->name_ - (const uint8_t*)(pParams_)))
+#endif // NVPW_FIELD_EXISTS
+
+
+#ifndef NVPERF_NVPA_GETPROCADDRESS_DEFINED
+#define NVPERF_NVPA_GETPROCADDRESS_DEFINED
+/// Generic function-pointer type returned by NVPA_GetProcAddress().
+typedef NVPA_Status (*NVPA_GenericFn)(void);
+
+
+    ///
+    /// Gets the address of an NvPerf API function.
+    ///
+    /// \return A function pointer to the function, or NULL if the function is not available.
+    ///
+    /// \param pFunctionName [in] Name of the function to retrieve.
+    NVPA_GenericFn NVPA_GetProcAddress(const char* pFunctionName);
+
+#endif // NVPERF_NVPA_GETPROCADDRESS_DEFINED
+
+#ifndef NVPERF_NVPW_SETLIBRARYLOADPATHS_DEFINED
+#define NVPERF_NVPW_SETLIBRARYLOADPATHS_DEFINED
+
+
+    typedef struct NVPW_SetLibraryLoadPaths_Params
+    {
+        /// [in] set to NVPW_SetLibraryLoadPaths_Params_STRUCT_SIZE (see the example below)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] number of paths in ppPaths
+        size_t numPaths;
+        /// [in] array of null-terminated paths
+        const char** ppPaths;
+    } NVPW_SetLibraryLoadPaths_Params;
+#define NVPW_SetLibraryLoadPaths_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_SetLibraryLoadPaths_Params, ppPaths)
+
+    /// Sets the library search paths for \ref NVPW_InitializeHost() and \ref NVPW_InitializeTarget().
+    /// \ref NVPW_InitializeHost() and \ref NVPW_InitializeTarget() load the NvPerf DLL/DSO.  This function sets
+    /// ordered paths that will be searched with the LoadLibrary() or dlopen() call.
+    /// If load paths are set by this function, the default set of load paths
+    /// will not be attempted.
+    /// Each path must point at a directory (not a file name).
+    /// This function is not thread-safe.
+    /// Example Usage:
+    /// \code
+    ///     const char* paths[] = {
+    ///         "path1", "path2", etc
+    ///     };
+    ///     NVPW_SetLibraryLoadPaths_Params params{NVPW_SetLibraryLoadPaths_Params_STRUCT_SIZE};
+    ///     params.numPaths = sizeof(paths)/sizeof(paths[0]);
+    ///     params.ppPaths = paths;
+    ///     NVPW_SetLibraryLoadPaths(&params);
+    ///     NVPW_InitializeHost();
+    ///     params.numPaths = 0;
+    ///     params.ppPaths = NULL;
+    ///     NVPW_SetLibraryLoadPaths(&params);
+    /// \endcode
+    NVPA_Status NVPW_SetLibraryLoadPaths(NVPW_SetLibraryLoadPaths_Params* pParams);
+
+    typedef struct NVPW_SetLibraryLoadPathsW_Params
+    {
+        /// [in] set to NVPW_SetLibraryLoadPathsW_Params_STRUCT_SIZE (see the example below)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] number of paths in ppwPaths
+        size_t numPaths;
+        /// [in] array of null-terminated wide-character (wchar_t) paths
+        const wchar_t** ppwPaths;
+    } NVPW_SetLibraryLoadPathsW_Params;
+#define NVPW_SetLibraryLoadPathsW_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_SetLibraryLoadPathsW_Params, ppwPaths)
+
+    /// Wide-character (wchar_t) variant of \ref NVPW_SetLibraryLoadPaths().
+    /// Sets the library search paths for \ref NVPW_InitializeHost() and \ref NVPW_InitializeTarget(), which load the
+    /// NvPerf DLL/DSO.  This function sets ordered paths that will be searched with the LoadLibrary() or dlopen() call.
+    /// If load paths are set by this function, the default set of load paths
+    /// will not be attempted.
+    /// Each path must point at a directory (not a file name).
+    /// This function is not thread-safe.
+    /// Example Usage:
+    /// \code
+    ///     const wchar_t* wpaths[] = {
+    ///         L"path1", L"path2", etc
+    ///     };
+    ///     NVPW_SetLibraryLoadPathsW_Params params{NVPW_SetLibraryLoadPathsW_Params_STRUCT_SIZE};
+    ///     params.numPaths = sizeof(wpaths)/sizeof(wpaths[0]);
+    ///     params.ppwPaths = wpaths;
+    ///     NVPW_SetLibraryLoadPathsW(&params);
+    ///     NVPW_InitializeHost();
+    ///     params.numPaths = 0;
+    ///     params.ppwPaths = NULL;
+    ///     NVPW_SetLibraryLoadPathsW(&params);
+    /// \endcode
+    NVPA_Status NVPW_SetLibraryLoadPathsW(NVPW_SetLibraryLoadPathsW_Params* pParams);
+
+#endif // NVPERF_NVPW_SETLIBRARYLOADPATHS_DEFINED
+
+
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#endif // NVPERF_COMMON_H
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_cuda_host.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_cuda_host.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b4533b25148b7cd28e0ed30be022893514415a5
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_cuda_host.h
@@ -0,0 +1,179 @@
+#ifndef NVPERF_CUDA_HOST_H
+#define NVPERF_CUDA_HOST_H
+
+/*
+ * Copyright 2014-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO USER:
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and
+ * international Copyright laws.
+ *
+ * This software and the information contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
+ * of a form of NVIDIA software license agreement.
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+ * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+ * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOURCE CODE.
+ *
+ * U.S. Government End Users.   This source code is a "commercial item" as
+ * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
+ * "commercial computer  software"  and "commercial computer software
+ * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
+ * and is provided to the U.S. Government only as a commercial end item.
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+ * source code with only those rights set forth herein.
+ *
+ * Any use of this source code in individual and commercial software must
+ * include, in the user documentation and internal comments to the code,
+ * the above Disclaimer and U.S. Government End Users Notice.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include "nvperf_common.h"
+#include "nvperf_host.h"
+
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB) // default visibility for what follows; popped at the end of this header
+    #pragma GCC visibility push(default)
+    #if !defined(NVPW_LOCAL)
+        #define NVPW_LOCAL __attribute__ ((visibility ("hidden")))
+    #endif
+#else // not (__GNUC__ && NVPA_SHARED_LIB): NVPW_LOCAL expands to nothing
+    #if !defined(NVPW_LOCAL)
+        #define NVPW_LOCAL
+    #endif
+#endif // __GNUC__ && NVPA_SHARED_LIB
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ *  @file   nvperf_cuda_host.h
+ */
+
+    typedef struct NVPW_CUDA_RawMetricsConfig_Create_Params
+    {
+        /// [in] size of this struct; see NVPW_CUDA_RawMetricsConfig_Create_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        NVPA_ActivityKind activityKind;
+        /// [in]
+        const char* pChipName;
+        /// [out] new NVPA_RawMetricsConfig object
+        struct NVPA_RawMetricsConfig* pRawMetricsConfig;
+    } NVPW_CUDA_RawMetricsConfig_Create_Params;
+#define NVPW_CUDA_RawMetricsConfig_Create_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CUDA_RawMetricsConfig_Create_Params, pRawMetricsConfig)
+    /// Creates a new NVPA_RawMetricsConfig object and returns it in pParams->pRawMetricsConfig.
+    NVPA_Status NVPW_CUDA_RawMetricsConfig_Create(NVPW_CUDA_RawMetricsConfig_Create_Params* pParams);
+
+    typedef struct NVPW_CUDA_RawMetricsConfig_Create_V2_Params
+    {
+        /// [in] size of this struct; see NVPW_CUDA_RawMetricsConfig_Create_V2_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        NVPA_ActivityKind activityKind;
+        /// [in] accepted for chips supported at the time-of-release.
+        const char* pChipName;
+        /// [in] buffer with counter availability image - required for future chip support
+        const uint8_t* pCounterAvailabilityImage;
+        /// [out] new NVPA_RawMetricsConfig object
+        struct NVPA_RawMetricsConfig* pRawMetricsConfig;
+    } NVPW_CUDA_RawMetricsConfig_Create_V2_Params;
+#define NVPW_CUDA_RawMetricsConfig_Create_V2_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CUDA_RawMetricsConfig_Create_V2_Params, pRawMetricsConfig)
+
+    /// Returns a new NVPA_RawMetricsConfig in pParams->pRawMetricsConfig. Use either 'pChipName' or 'pCounterAvailabilityImage'.
+    NVPA_Status NVPW_CUDA_RawMetricsConfig_Create_V2(NVPW_CUDA_RawMetricsConfig_Create_V2_Params* pParams);
+
+    typedef struct NVPW_CUDA_CounterDataBuilder_Create_Params
+    {
+        /// [in] size of this struct; see NVPW_CUDA_CounterDataBuilder_Create_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] accepted for chips supported at the time-of-release.
+        const char* pChipName;
+        /// [in] buffer with counter availability image - required for future chip support
+        const uint8_t* pCounterAvailabilityImage;
+        /// [out] new NVPA_CounterDataBuilder object
+        struct NVPA_CounterDataBuilder* pCounterDataBuilder;
+    } NVPW_CUDA_CounterDataBuilder_Create_Params;
+#define NVPW_CUDA_CounterDataBuilder_Create_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CUDA_CounterDataBuilder_Create_Params, pCounterDataBuilder)
+
+    /// Returns a new NVPA_CounterDataBuilder in pParams->pCounterDataBuilder. Use either 'pChipName' or 'pCounterAvailabilityImage'.
+    NVPA_Status NVPW_CUDA_CounterDataBuilder_Create(NVPW_CUDA_CounterDataBuilder_Create_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator NVPW_MetricsEvaluator; // forward declaration; no definition appears in this header
+
+    typedef struct NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params
+    {
+        /// [in] size of this struct; see NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] accepted for chips supported at the time-of-release.
+        const char* pChipName;
+        /// [in] buffer with counter availability image - required for future chip support
+        const uint8_t* pCounterAvailabilityImage;
+        /// [out] minimum size for the 'pScratchBuffer' passed to 'NVPW_CUDA_MetricsEvaluator_Initialize'
+        size_t scratchBufferSize;
+    } NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params;
+#define NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params, scratchBufferSize)
+
+    /// Use either 'pChipName' or 'pCounterAvailabilityImage'.
+    NVPA_Status NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize(NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params* pParams);
+
+    typedef struct NVPW_CUDA_MetricsEvaluator_Initialize_Params
+    {
+        /// [in] size of this struct; see NVPW_CUDA_MetricsEvaluator_Initialize_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] caller-provided scratch buffer of 'scratchBufferSize' bytes
+        uint8_t* pScratchBuffer;
+        /// [in] the size of the 'pScratchBuffer' array, should be at least the size of the 'scratchBufferSize' returned
+        /// by 'NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize'
+        size_t scratchBufferSize;
+        /// [in] accepted for chips supported at the time-of-release.
+        const char* pChipName;
+        /// [in] buffer with counter availability image - required for future chip support
+        const uint8_t* pCounterAvailabilityImage;
+        /// [in]
+        const uint8_t* pCounterDataImage;
+        /// [in] must be provided if 'pCounterDataImage' is not NULL
+        size_t counterDataImageSize;
+        /// [out] the created metrics evaluator
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+    } NVPW_CUDA_MetricsEvaluator_Initialize_Params;
+#define NVPW_CUDA_MetricsEvaluator_Initialize_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CUDA_MetricsEvaluator_Initialize_Params, pMetricsEvaluator)
+
+    /// Use one of 'pChipName', 'pCounterAvailabilityImage', or 'pCounterDataImage'. 'pChipName' or
+    /// 'pCounterAvailabilityImage' will create a metrics evaluator based on a virtual device while 'pCounterDataImage'
+    /// will create a metrics evaluator based on the actual device.
+    NVPA_Status NVPW_CUDA_MetricsEvaluator_Initialize(NVPW_CUDA_MetricsEvaluator_Initialize_Params* pParams);
+
+
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#endif // NVPERF_CUDA_HOST_H
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_host.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_host.h
new file mode 100644
index 0000000000000000000000000000000000000000..62a53528b64d6b3da8daf7058cec21781ae0e8cb
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_host.h
@@ -0,0 +1,1178 @@
+#ifndef NVPERF_HOST_H
+#define NVPERF_HOST_H
+
+/*
+ * Copyright 2014-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO USER:
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and
+ * international Copyright laws.
+ *
+ * This software and the information contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
+ * of a form of NVIDIA software license agreement.
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+ * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+ * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOURCE CODE.
+ *
+ * U.S. Government End Users.   This source code is a "commercial item" as
+ * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
+ * "commercial computer  software"  and "commercial computer software
+ * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
+ * and is provided to the U.S. Government only as a commercial end item.
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+ * source code with only those rights set forth herein.
+ *
+ * Any use of this source code in individual and commercial software must
+ * include, in the user documentation and internal comments to the code,
+ * the above Disclaimer and U.S. Government End Users Notice.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include "nvperf_common.h"
+
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
+    #pragma GCC visibility push(default)
+    #if !defined(NVPW_LOCAL)
+        #define NVPW_LOCAL __attribute__ ((visibility ("hidden")))
+    #endif
+#else
+    #if !defined(NVPW_LOCAL)
+        #define NVPW_LOCAL
+    #endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ *  @file   nvperf_host.h
+ */
+
+
+// Guard against multiple definition of NvPerf host types
+#ifndef NVPERF_HOST_API_DEFINED
+#define NVPERF_HOST_API_DEFINED
+
+
+/***************************************************************************//**
+ *  @name   Host Configuration
+ *  @{
+ */
+
+    typedef struct NVPW_InitializeHost_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+    } NVPW_InitializeHost_Params;
+#define NVPW_InitializeHost_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_InitializeHost_Params, pPriv)
+
+    /// Load the host library.
+    NVPA_Status NVPW_InitializeHost(NVPW_InitializeHost_Params* pParams);
+
+    typedef struct NVPW_CounterData_CalculateCounterDataImageCopySize_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] The CounterDataPrefix generated from e.g. 'nvperf2 initdata' or
+        /// NVPW_CounterDataBuilder_GetCounterDataPrefix().  Must be align(8).
+        const uint8_t* pCounterDataPrefix;
+        size_t counterDataPrefixSize;
+        /// max number of ranges that can be profiled
+        uint32_t maxNumRanges;
+        /// max number of RangeTree nodes; must be >= maxNumRanges
+        uint32_t maxNumRangeTreeNodes;
+        /// max string length of each RangeName, including the trailing NUL character
+        uint32_t maxRangeNameLength;
+        const uint8_t* pCounterDataSrc;
+        /// [out] required size of the copy buffer
+        size_t copyDataImageCounterSize;
+    } NVPW_CounterData_CalculateCounterDataImageCopySize_Params;
+#define NVPW_CounterData_CalculateCounterDataImageCopySize_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_CalculateCounterDataImageCopySize_Params, copyDataImageCounterSize)
+
+    NVPA_Status NVPW_CounterData_CalculateCounterDataImageCopySize(NVPW_CounterData_CalculateCounterDataImageCopySize_Params* pParams);
+
+    typedef struct NVPW_CounterData_InitializeCounterDataImageCopy_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] The CounterDataPrefix generated from e.g. 'nvperf2 initdata' or
+        /// NVPW_CounterDataBuilder_GetCounterDataPrefix().  Must be align(8).
+        const uint8_t* pCounterDataPrefix;
+        size_t counterDataPrefixSize;
+        /// max number of ranges that can be profiled
+        uint32_t maxNumRanges;
+        /// max number of RangeTree nodes; must be >= maxNumRanges
+        uint32_t maxNumRangeTreeNodes;
+        /// max string length of each RangeName, including the trailing NUL character
+        uint32_t maxRangeNameLength;
+        const uint8_t* pCounterDataSrc;
+        uint8_t* pCounterDataDst;
+    } NVPW_CounterData_InitializeCounterDataImageCopy_Params;
+#define NVPW_CounterData_InitializeCounterDataImageCopy_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_InitializeCounterDataImageCopy_Params, pCounterDataDst)
+
+    NVPA_Status NVPW_CounterData_InitializeCounterDataImageCopy(NVPW_CounterData_InitializeCounterDataImageCopy_Params* pParams);
+
+    typedef struct NVPW_CounterData_ExtractCounterDataPrefix_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] The source buffer to extract the prefix from.
+        const uint8_t* pCounterDataSrc;
+        size_t counterDataSrcSize;
+        /// [inout] If not NULL, the prefix will be copied into this buffer. 'pCounterDataPrefix' is in, '*pCounterDataPrefix' is out.
+        uint8_t* pCounterDataPrefix;
+        /// [inout] if 'pCounterDataPrefix' is NULL, size of counter data prefix will be returned; otherwise it should
+        /// be set to the size of buffer allocated for 'pCounterDataPrefix'.
+        size_t counterDataPrefixSize;
+    } NVPW_CounterData_ExtractCounterDataPrefix_Params;
+#define NVPW_CounterData_ExtractCounterDataPrefix_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_ExtractCounterDataPrefix_Params, counterDataPrefixSize)
+
+    NVPA_Status NVPW_CounterData_ExtractCounterDataPrefix(NVPW_CounterData_ExtractCounterDataPrefix_Params* pParams);
+
+    typedef struct NVPA_CounterDataCombiner NVPA_CounterDataCombiner;
+
+    typedef struct NVPW_CounterDataCombiner_Create_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// The destination counter data into which the source counter data will be combined
+        uint8_t* pCounterDataDst;
+        /// [out] The created counter data combiner
+        NVPA_CounterDataCombiner* pCounterDataCombiner;
+    } NVPW_CounterDataCombiner_Create_Params;
+#define NVPW_CounterDataCombiner_Create_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataCombiner_Create_Params, pCounterDataCombiner)
+
+    NVPA_Status NVPW_CounterDataCombiner_Create(NVPW_CounterDataCombiner_Create_Params* pParams);
+
+    typedef struct NVPW_CounterDataCombiner_Destroy_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataCombiner* pCounterDataCombiner;
+    } NVPW_CounterDataCombiner_Destroy_Params;
+#define NVPW_CounterDataCombiner_Destroy_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataCombiner_Destroy_Params, pCounterDataCombiner)
+
+    NVPA_Status NVPW_CounterDataCombiner_Destroy(NVPW_CounterDataCombiner_Destroy_Params* pParams);
+
+    typedef struct NVPW_CounterDataCombiner_CreateRange_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataCombiner* pCounterDataCombiner;
+        size_t numDescriptions;
+        const char* const* ppDescriptions;
+        /// [out]
+        size_t rangeIndexDst;
+    } NVPW_CounterDataCombiner_CreateRange_Params;
+#define NVPW_CounterDataCombiner_CreateRange_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataCombiner_CreateRange_Params, rangeIndexDst)
+
+    NVPA_Status NVPW_CounterDataCombiner_CreateRange(NVPW_CounterDataCombiner_CreateRange_Params* pParams);
+
+    typedef struct NVPW_CounterDataCombiner_CopyIntoRange_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        NVPA_CounterDataCombiner* pCounterDataCombiner;
+        /// [in]
+        size_t rangeIndexDst;
+        /// [in]
+        const uint8_t* pCounterDataSrc;
+        /// [in]
+        size_t rangeIndexSrc;
+    } NVPW_CounterDataCombiner_CopyIntoRange_Params;
+#define NVPW_CounterDataCombiner_CopyIntoRange_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataCombiner_CopyIntoRange_Params, rangeIndexSrc)
+
+    /// In order to use this API, the source counter data and the destination counter data must have identical counters
+    NVPA_Status NVPW_CounterDataCombiner_CopyIntoRange(NVPW_CounterDataCombiner_CopyIntoRange_Params* pParams);
+
+    typedef struct NVPW_CounterDataCombiner_AccumulateIntoRange_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataCombiner* pCounterDataCombiner;
+        size_t rangeIndexDst;
+        uint32_t dstMultiplier;
+        const uint8_t* pCounterDataSrc;
+        size_t rangeIndexSrc;
+        uint32_t srcMultiplier;
+    } NVPW_CounterDataCombiner_AccumulateIntoRange_Params;
+#define NVPW_CounterDataCombiner_AccumulateIntoRange_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataCombiner_AccumulateIntoRange_Params, srcMultiplier)
+
+    NVPA_Status NVPW_CounterDataCombiner_AccumulateIntoRange(NVPW_CounterDataCombiner_AccumulateIntoRange_Params* pParams);
+
+    typedef struct NVPW_CounterDataCombiner_SumIntoRange_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataCombiner* pCounterDataCombiner;
+        size_t rangeIndexDst;
+        const uint8_t* pCounterDataSrc;
+        size_t rangeIndexSrc;
+    } NVPW_CounterDataCombiner_SumIntoRange_Params;
+#define NVPW_CounterDataCombiner_SumIntoRange_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataCombiner_SumIntoRange_Params, rangeIndexSrc)
+
+    NVPA_Status NVPW_CounterDataCombiner_SumIntoRange(NVPW_CounterDataCombiner_SumIntoRange_Params* pParams);
+
+    typedef struct NVPW_CounterDataCombiner_WeightedSumIntoRange_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataCombiner* pCounterDataCombiner;
+        size_t rangeIndexDst;
+        double dstMultiplier;
+        const uint8_t* pCounterDataSrc;
+        size_t rangeIndexSrc;
+        double srcMultiplier;
+    } NVPW_CounterDataCombiner_WeightedSumIntoRange_Params;
+#define NVPW_CounterDataCombiner_WeightedSumIntoRange_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataCombiner_WeightedSumIntoRange_Params, srcMultiplier)
+
+    NVPA_Status NVPW_CounterDataCombiner_WeightedSumIntoRange(NVPW_CounterDataCombiner_WeightedSumIntoRange_Params* pParams);
+
+/**
+ *  @}
+ ******************************************************************************/
+ 
+/***************************************************************************//**
+ *  @name   Metrics Configuration
+ *  @{
+ */
+
+    typedef struct NVPA_RawMetricsConfig NVPA_RawMetricsConfig;
+
+    typedef struct NVPA_RawMetricRequest
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const char* pMetricName;
+        /// [in]
+        NVPA_Bool isolated;
+        /// [in] ignored by AddMetric but observed by CounterData initialization
+        NVPA_Bool keepInstances;
+    } NVPA_RawMetricRequest;
+#define NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPA_RawMetricRequest, keepInstances)
+
+    typedef struct NVPW_GetSupportedChipNames_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [out]
+        const char* const* ppChipNames;
+        /// [out]
+        size_t numChipNames;
+    } NVPW_GetSupportedChipNames_Params;
+#define NVPW_GetSupportedChipNames_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_GetSupportedChipNames_Params, numChipNames)
+
+    NVPA_Status NVPW_GetSupportedChipNames(NVPW_GetSupportedChipNames_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_Destroy_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_RawMetricsConfig* pRawMetricsConfig;
+    } NVPW_RawMetricsConfig_Destroy_Params;
+#define NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_Destroy_Params, pRawMetricsConfig)
+
+    NVPA_Status NVPW_RawMetricsConfig_Destroy(NVPW_RawMetricsConfig_Destroy_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_SetCounterAvailability_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_RawMetricsConfig* pRawMetricsConfig;
+        /// [in] buffer with counter availability image
+        const uint8_t* pCounterAvailabilityImage;
+    } NVPW_RawMetricsConfig_SetCounterAvailability_Params;
+#define NVPW_RawMetricsConfig_SetCounterAvailability_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_SetCounterAvailability_Params, pCounterAvailabilityImage)
+
+    NVPA_Status NVPW_RawMetricsConfig_SetCounterAvailability(NVPW_RawMetricsConfig_SetCounterAvailability_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_BeginPassGroup_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_RawMetricsConfig* pRawMetricsConfig;
+        size_t maxPassCount;
+    } NVPW_RawMetricsConfig_BeginPassGroup_Params;
+#define NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_BeginPassGroup_Params, maxPassCount)
+
+    NVPA_Status NVPW_RawMetricsConfig_BeginPassGroup(NVPW_RawMetricsConfig_BeginPassGroup_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_EndPassGroup_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_RawMetricsConfig* pRawMetricsConfig;
+    } NVPW_RawMetricsConfig_EndPassGroup_Params;
+#define NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_EndPassGroup_Params, pRawMetricsConfig)
+
+    NVPA_Status NVPW_RawMetricsConfig_EndPassGroup(NVPW_RawMetricsConfig_EndPassGroup_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_GetNumMetrics_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const NVPA_RawMetricsConfig* pRawMetricsConfig;
+        /// [out]
+        size_t numMetrics;
+    } NVPW_RawMetricsConfig_GetNumMetrics_Params;
+#define NVPW_RawMetricsConfig_GetNumMetrics_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_GetNumMetrics_Params, numMetrics)
+
+    NVPA_Status NVPW_RawMetricsConfig_GetNumMetrics(NVPW_RawMetricsConfig_GetNumMetrics_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_GetMetricProperties_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const NVPA_RawMetricsConfig* pRawMetricsConfig;
+        size_t metricIndex;
+        /// [out]
+        const char* pMetricName;
+        /// [out]
+        NVPA_Bool supportsPipelined;
+        /// [out]
+        NVPA_Bool supportsIsolated;
+    } NVPW_RawMetricsConfig_GetMetricProperties_Params;
+#define NVPW_RawMetricsConfig_GetMetricProperties_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_GetMetricProperties_Params, supportsIsolated)
+
+    NVPA_Status NVPW_RawMetricsConfig_GetMetricProperties(NVPW_RawMetricsConfig_GetMetricProperties_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_GetMetricProperties_V2_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const NVPA_RawMetricsConfig* pRawMetricsConfig;
+        size_t metricIndex;
+        /// [out]
+        const char* pMetricName;
+    } NVPW_RawMetricsConfig_GetMetricProperties_V2_Params;
+#define NVPW_RawMetricsConfig_GetMetricProperties_V2_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_GetMetricProperties_V2_Params, pMetricName)
+
+    NVPA_Status NVPW_RawMetricsConfig_GetMetricProperties_V2(NVPW_RawMetricsConfig_GetMetricProperties_V2_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_AddMetrics_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_RawMetricsConfig* pRawMetricsConfig;
+        const NVPA_RawMetricRequest* pRawMetricRequests;
+        size_t numMetricRequests;
+    } NVPW_RawMetricsConfig_AddMetrics_Params;
+#define NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_AddMetrics_Params, numMetricRequests)
+
+    NVPA_Status NVPW_RawMetricsConfig_AddMetrics(NVPW_RawMetricsConfig_AddMetrics_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_IsAddMetricsPossible_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const NVPA_RawMetricsConfig* pRawMetricsConfig;
+        const NVPA_RawMetricRequest* pRawMetricRequests;
+        size_t numMetricRequests;
+        /// [out]
+        NVPA_Bool isPossible;
+    } NVPW_RawMetricsConfig_IsAddMetricsPossible_Params;
+#define NVPW_RawMetricsConfig_IsAddMetricsPossible_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_IsAddMetricsPossible_Params, isPossible)
+
+    NVPA_Status NVPW_RawMetricsConfig_IsAddMetricsPossible(NVPW_RawMetricsConfig_IsAddMetricsPossible_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_GenerateConfigImage_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_RawMetricsConfig* pRawMetricsConfig;
+        /// [in] If true, all existing pass groups may be merged to reduce number of passes.
+        /// If merge was successful, distribution of counters in passes may be updated as a side-effect. The effects
+        /// will be persistent in pRawMetricsConfig.
+        NVPA_Bool mergeAllPassGroups;
+    } NVPW_RawMetricsConfig_GenerateConfigImage_Params;
+#define NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_GenerateConfigImage_Params, mergeAllPassGroups)
+
+    /// This API may fail if called inside a pass group with `mergeAllPassGroups` = true.
+    NVPA_Status NVPW_RawMetricsConfig_GenerateConfigImage(NVPW_RawMetricsConfig_GenerateConfigImage_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_GetConfigImage_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const NVPA_RawMetricsConfig* pRawMetricsConfig;
+        /// [in] Number of bytes allocated for pBuffer
+        size_t bytesAllocated;
+        /// [out] [optional] Buffer receiving the config image
+        uint8_t* pBuffer;
+        /// [out] Count of bytes that would be copied into pBuffer
+        size_t bytesCopied;
+    } NVPW_RawMetricsConfig_GetConfigImage_Params;
+#define NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_GetConfigImage_Params, bytesCopied)
+
+    NVPA_Status NVPW_RawMetricsConfig_GetConfigImage(NVPW_RawMetricsConfig_GetConfigImage_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_GetNumPasses_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const NVPA_RawMetricsConfig* pRawMetricsConfig;
+        /// [out]
+        size_t numPipelinedPasses;
+        /// [out]
+        size_t numIsolatedPasses;
+    } NVPW_RawMetricsConfig_GetNumPasses_Params;
+#define NVPW_RawMetricsConfig_GetNumPasses_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_GetNumPasses_Params, numIsolatedPasses)
+
+    /// Total num passes = numPipelinedPasses + numIsolatedPasses * numNestingLevels
+    NVPA_Status NVPW_RawMetricsConfig_GetNumPasses(NVPW_RawMetricsConfig_GetNumPasses_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_GetNumPasses_V2_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const NVPA_RawMetricsConfig* pRawMetricsConfig;
+        /// [out]
+        size_t numPasses;
+    } NVPW_RawMetricsConfig_GetNumPasses_V2_Params;
+#define NVPW_RawMetricsConfig_GetNumPasses_V2_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_GetNumPasses_V2_Params, numPasses)
+
+    /// Total num passes = numPasses * numNestingLevels
+    NVPA_Status NVPW_RawMetricsConfig_GetNumPasses_V2(NVPW_RawMetricsConfig_GetNumPasses_V2_Params* pParams);
+
+    typedef struct NVPW_PeriodicSampler_Config_GetSocEstimatedSampleSize_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] Typically created by e.g. NVPW_RawMetricsConfig_GetConfigImage(), must be align(8).
+        const uint8_t* pConfig;
+        /// [in]
+        size_t configSize;
+        /// [out]
+        size_t sampleSize;
+    } NVPW_PeriodicSampler_Config_GetSocEstimatedSampleSize_Params;
+#define NVPW_PeriodicSampler_Config_GetSocEstimatedSampleSize_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_Config_GetSocEstimatedSampleSize_Params, sampleSize)
+
+    /// Estimate the per-sample record size, based on a virtual device.
+    NVPA_Status NVPW_PeriodicSampler_Config_GetSocEstimatedSampleSize(NVPW_PeriodicSampler_Config_GetSocEstimatedSampleSize_Params* pParams);
+
+    typedef struct NVPW_PeriodicSampler_Config_GetGpuEstimatedSampleSize_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] Typically created by e.g. NVPW_RawMetricsConfig_GetConfigImage(), must be align(8).
+        const uint8_t* pConfig;
+        /// [in]
+        size_t configSize;
+        /// [out]
+        size_t sampleSize;
+    } NVPW_PeriodicSampler_Config_GetGpuEstimatedSampleSize_Params;
+#define NVPW_PeriodicSampler_Config_GetGpuEstimatedSampleSize_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_Config_GetGpuEstimatedSampleSize_Params, sampleSize)
+
+    /// Estimate the per-sample record size, based on a virtual device.
+    NVPA_Status NVPW_PeriodicSampler_Config_GetGpuEstimatedSampleSize(NVPW_PeriodicSampler_Config_GetGpuEstimatedSampleSize_Params* pParams);
+
+/**
+ *  @}
+ ******************************************************************************/
+ 
+    typedef struct NVPW_Config_GetRawCounterInfo_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const uint8_t* pConfig;
+        /// [in]
+        size_t configSize;
+        /// [in]
+        const char* pRawCounterName;
+        /// [inout] array containing indices of passes the counter resides in. 'pPassIndices' is in, '*pPassIndices' is
+        /// out.
+        size_t* pPassIndices;
+        /// [inout] if 'pPassIndices' is NULL, the count of passes this counter resides in will be returned; otherwise
+        /// it should be set to the capacity of 'pPassIndices' array, and on return, it will be overwritten to reflect
+        /// the actual count filled into 'pPassIndices'
+        size_t numPassIndices;
+    } NVPW_Config_GetRawCounterInfo_Params;
+#define NVPW_Config_GetRawCounterInfo_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Config_GetRawCounterInfo_Params, numPassIndices)
+
+    NVPA_Status NVPW_Config_GetRawCounterInfo(NVPW_Config_GetRawCounterInfo_Params* pParams);
+
+    typedef struct NVPW_Config_GetRawCounters_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const uint8_t* pConfig;
+        /// [in]
+        size_t configSize;
+        /// [in]
+        size_t passIndex;
+        /// [inout] array containing raw counter names. 'ppRawCounterNames' is in, '*ppRawCounterNames' is out.
+        const char** ppRawCounterNames;
+        /// [inout] if 'ppRawCounterNames' is NULL, the count of raw counters will be returned; otherwise it should be
+        /// set to the capacity of 'ppRawCounterNames' array, and on return, it will be overwritten to reflect the
+        /// actual count filled into 'ppRawCounterNames'
+        size_t numRawCounters;
+    } NVPW_Config_GetRawCounters_Params;
+#define NVPW_Config_GetRawCounters_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Config_GetRawCounters_Params, numRawCounters)
+
+    NVPA_Status NVPW_Config_GetRawCounters(NVPW_Config_GetRawCounters_Params* pParams);
+
+/***************************************************************************//**
+ *  @name   CounterData Creation
+ *  @{
+ */
+
+    typedef struct NVPA_CounterDataBuilder NVPA_CounterDataBuilder;
+
+    typedef struct NVPW_CounterDataBuilder_Create_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [out]
+        NVPA_CounterDataBuilder* pCounterDataBuilder;
+        const char* pChipName;
+    } NVPW_CounterDataBuilder_Create_Params;
+#define NVPW_CounterDataBuilder_Create_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataBuilder_Create_Params, pChipName)
+
+    NVPA_Status NVPW_CounterDataBuilder_Create(NVPW_CounterDataBuilder_Create_Params* pParams);
+
+    typedef struct NVPW_CounterDataBuilder_Destroy_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataBuilder* pCounterDataBuilder;
+    } NVPW_CounterDataBuilder_Destroy_Params;
+#define NVPW_CounterDataBuilder_Destroy_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataBuilder_Destroy_Params, pCounterDataBuilder)
+
+    NVPA_Status NVPW_CounterDataBuilder_Destroy(NVPW_CounterDataBuilder_Destroy_Params* pParams);
+
+    typedef struct NVPW_CounterDataBuilder_AddMetrics_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataBuilder* pCounterDataBuilder;
+        const NVPA_RawMetricRequest* pRawMetricRequests;
+        size_t numMetricRequests;
+    } NVPW_CounterDataBuilder_AddMetrics_Params;
+#define NVPW_CounterDataBuilder_AddMetrics_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataBuilder_AddMetrics_Params, numMetricRequests)
+
+    NVPA_Status NVPW_CounterDataBuilder_AddMetrics(NVPW_CounterDataBuilder_AddMetrics_Params* pParams);
+
+    typedef struct NVPW_CounterDataBuilder_GetCounterDataPrefix_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataBuilder* pCounterDataBuilder;
+        /// [in] Number of bytes allocated for pBuffer
+        size_t bytesAllocated;
+        /// [out] [optional] Buffer receiving the counter data prefix
+        uint8_t* pBuffer;
+        /// [out] Count of bytes that would be copied to pBuffer
+        size_t bytesCopied;
+    } NVPW_CounterDataBuilder_GetCounterDataPrefix_Params;
+#define NVPW_CounterDataBuilder_GetCounterDataPrefix_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataBuilder_GetCounterDataPrefix_Params, bytesCopied)
+
+    NVPA_Status NVPW_CounterDataBuilder_GetCounterDataPrefix(NVPW_CounterDataBuilder_GetCounterDataPrefix_Params* pParams);
+
+/**
+ *  @}
+ ******************************************************************************/
+ 
+/***************************************************************************//**
+ *  @name   Metrics Evaluator
+ *  @{
+ */
+
+    typedef struct NVPW_MetricsEvaluator NVPW_MetricsEvaluator;
+
+#ifndef NVPW_DIM_UNIT_DEFINED
+#define NVPW_DIM_UNIT_DEFINED
+    typedef enum NVPW_DimUnitName
+    {
+        NVPW_DIM_UNIT_INVALID = 3518299157,
+        NVPW_DIM_UNIT_UNITLESS = 2126137902,
+        NVPW_DIM_UNIT_ATTRIBUTES = 3776338729,
+        NVPW_DIM_UNIT_BYTES = 3797850191,
+        NVPW_DIM_UNIT_CTAS = 1960564139,
+        NVPW_DIM_UNIT_CTC_CYCLES = 2224883873,
+        NVPW_DIM_UNIT_DRAM_CYCLES = 2650981327,
+        NVPW_DIM_UNIT_FBP_CYCLES = 1785238957,
+        NVPW_DIM_UNIT_FE_OPS = 2919159083,
+        NVPW_DIM_UNIT_GPC_CYCLES = 1222631184,
+        NVPW_DIM_UNIT_IDC_REQUESTS = 2012649669,
+        NVPW_DIM_UNIT_INSTRUCTIONS = 1418625543,
+        NVPW_DIM_UNIT_KILOBYTES = 1335980302,
+        NVPW_DIM_UNIT_L1DATA_BANK_ACCESSES = 1479493682,
+        NVPW_DIM_UNIT_L1DATA_BANK_CONFLICTS = 3433170787,
+        NVPW_DIM_UNIT_L1TEX_REQUESTS = 1306473767,
+        NVPW_DIM_UNIT_L1TEX_TAGS = 26573010,
+        NVPW_DIM_UNIT_L1TEX_WAVEFRONTS = 129373765,
+        NVPW_DIM_UNIT_L2_REQUESTS = 1143695106,
+        NVPW_DIM_UNIT_L2_SECTORS = 3424101564,
+        NVPW_DIM_UNIT_L2_TAGS = 3755612781,
+        NVPW_DIM_UNIT_LRC_REQUESTS = 2280914327,
+        NVPW_DIM_UNIT_LRC_SECTORS = 7212034,
+        NVPW_DIM_UNIT_MCC_CYCLES = 1826685787,
+        NVPW_DIM_UNIT_NANOSECONDS = 3047500672,
+        NVPW_DIM_UNIT_NVDLA_CYCLES = 3374059789,
+        NVPW_DIM_UNIT_NVENC_CYCLES = 2267185244,
+        NVPW_DIM_UNIT_NVLRX_CYCLES = 4059934930,
+        NVPW_DIM_UNIT_NVLTX_CYCLES = 1814350488,
+        NVPW_DIM_UNIT_OFA_CYCLES = 4290210307,
+        NVPW_DIM_UNIT_PCIE_CYCLES = 1230450943,
+        NVPW_DIM_UNIT_PERCENT = 1284354694,
+        NVPW_DIM_UNIT_PIXELS = 4227616663,
+        NVPW_DIM_UNIT_PIXEL_SHADER_BARRIERS = 3705502518,
+        NVPW_DIM_UNIT_PRIMITIVES = 2373084002,
+        NVPW_DIM_UNIT_PVAVPU_CYCLES = 2238259366,
+        NVPW_DIM_UNIT_PVA_CYCLES = 202044173,
+        NVPW_DIM_UNIT_QUADS = 1539753497,
+        NVPW_DIM_UNIT_REGISTERS = 2837260947,
+        NVPW_DIM_UNIT_SAMPLES = 746046551,
+        NVPW_DIM_UNIT_SECONDS = 1164825258,
+        NVPW_DIM_UNIT_SYSL2_REQUESTS = 2165109286,
+        NVPW_DIM_UNIT_SYSL2_SECTORS = 2268734175,
+        NVPW_DIM_UNIT_SYSL2_TAGS = 3308651352,
+        NVPW_DIM_UNIT_SYSLRC_REQUESTS = 3328245480,
+        NVPW_DIM_UNIT_SYSLRC_SECTORS = 1190477493,
+        NVPW_DIM_UNIT_SYS_CYCLES = 3310821688,
+        NVPW_DIM_UNIT_TEXELS = 1293214069,
+        NVPW_DIM_UNIT_THREADS = 164261907,
+        NVPW_DIM_UNIT_TMEM_ACCESSES = 3742902067,
+        NVPW_DIM_UNIT_VERTICES = 1873662209,
+        NVPW_DIM_UNIT_VIC_CYCLES = 103143588,
+        NVPW_DIM_UNIT_WARPS = 97951949,
+        NVPW_DIM_UNIT_WORKIDS = 1971113483,
+        NVPW_DIM_UNIT_WORKLOADS = 1728142656
+    } NVPW_DimUnitName;
+#endif //NVPW_DIM_UNIT_DEFINED
+
+#ifndef NVPW_HW_UNIT_DEFINED
+#define NVPW_HW_UNIT_DEFINED
+    typedef enum NVPW_HwUnit  ///< hardware-unit identifiers; param structs in this header carry one in a 'hwUnit' field documented as "one of 'NVPW_HwUnit'"
+    {
+        NVPW_HW_UNIT_INVALID = 3498035701,  ///< non-zero: a zero-initialized 'hwUnit' does NOT compare equal to INVALID; also one of several values above INT32_MAX
+        NVPW_HW_UNIT_CROP = 2872137846,  ///< values are explicit, non-sequential constants: do not use them as array indices
+        NVPW_HW_UNIT_CTC = 4123164475,
+        NVPW_HW_UNIT_DRAM = 1662616918,
+        NVPW_HW_UNIT_DRAMC = 1401232876,
+        NVPW_HW_UNIT_FBP = 2947194306,
+        NVPW_HW_UNIT_FBPA = 690045803,
+        NVPW_HW_UNIT_FE = 2204924321,
+        NVPW_HW_UNIT_GPC = 1911735839,
+        NVPW_HW_UNIT_GPU = 1014363534,
+        NVPW_HW_UNIT_GR = 2933618517,
+        NVPW_HW_UNIT_IDC = 842765289,
+        NVPW_HW_UNIT_L1TEX = 893940957,
+        NVPW_HW_UNIT_LRC = 4004756136,
+        NVPW_HW_UNIT_LTS = 2333266697,
+        NVPW_HW_UNIT_MCC = 3980130194,
+        NVPW_HW_UNIT_NVDLA = 4201167892,
+        NVPW_HW_UNIT_NVENC = 207708260,
+        NVPW_HW_UNIT_NVLRX = 3091684901,
+        NVPW_HW_UNIT_NVLTX = 869679659,
+        NVPW_HW_UNIT_OFA = 70307371,
+        NVPW_HW_UNIT_PCIE = 3433264174,
+        NVPW_HW_UNIT_PDA = 345193251,
+        NVPW_HW_UNIT_PES = 804128425,
+        NVPW_HW_UNIT_PROP = 3339255507,
+        NVPW_HW_UNIT_PVA = 2565499490,
+        NVPW_HW_UNIT_PVAVPU = 1656645655,
+        NVPW_HW_UNIT_RASTER = 187932504,
+        NVPW_HW_UNIT_SM = 724224710,
+        NVPW_HW_UNIT_SMSP = 2837616917,
+        NVPW_HW_UNIT_SYS = 768990063,
+        NVPW_HW_UNIT_SYSLRC = 3247626950,
+        NVPW_HW_UNIT_SYSLTS = 4137740217,
+        NVPW_HW_UNIT_TPC = 1889024613,
+        NVPW_HW_UNIT_VAF = 753670509,
+        NVPW_HW_UNIT_VIC = 322439594,
+        NVPW_HW_UNIT_VPC = 275561583,
+        NVPW_HW_UNIT_ZCULL = 2401248356,
+        NVPW_HW_UNIT_ZROP = 979500456
+    } NVPW_HwUnit;
+#endif //NVPW_HW_UNIT_DEFINED
+
+    typedef enum NVPW_RollupOp  ///< selector stored in NVPW_MetricEvalRequest::rollupOp (a uint8_t field)
+    {
+        NVPW_ROLLUP_OP_AVG = 0,
+        NVPW_ROLLUP_OP_MAX,  ///< = 1 (implicit)
+        NVPW_ROLLUP_OP_MIN,  ///< = 2 (implicit)
+        NVPW_ROLLUP_OP_SUM,  ///< = 3 (implicit)
+        NVPW_ROLLUP_OP__COUNT  ///< = 4, i.e. the number of enumerators declared above it
+    } NVPW_RollupOp;
+
+    typedef enum NVPW_MetricType  ///< selector stored in the uint8_t 'metricType' fields of the request/param structs in this header
+    {
+        NVPW_METRIC_TYPE_COUNTER = 0,
+        NVPW_METRIC_TYPE_RATIO,  ///< = 1 (implicit)
+        NVPW_METRIC_TYPE_THROUGHPUT,  ///< = 2 (implicit)
+        NVPW_METRIC_TYPE__COUNT  ///< = 3, i.e. the number of enumerators declared above it
+    } NVPW_MetricType;
+
+    typedef enum NVPW_Submetric  ///< selector stored in NVPW_MetricEvalRequest::submetric (a uint16_t field)
+    {
+        NVPW_SUBMETRIC_NONE = 0,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED = 1,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_ACTIVE = 2,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_ACTIVE_PER_SECOND = 3,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_ELAPSED = 4,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_ELAPSED_PER_SECOND = 5,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_FRAME = 6,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_FRAME_PER_SECOND = 7,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_REGION = 8,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_REGION_PER_SECOND = 9,
+        NVPW_SUBMETRIC_PER_CYCLE_ACTIVE = 10,
+        NVPW_SUBMETRIC_PER_CYCLE_ELAPSED = 11,
+        NVPW_SUBMETRIC_PER_CYCLE_IN_FRAME = 12,
+        NVPW_SUBMETRIC_PER_CYCLE_IN_REGION = 13,
+        NVPW_SUBMETRIC_PER_SECOND = 14,
+        NVPW_SUBMETRIC_PCT_OF_PEAK_SUSTAINED_ACTIVE = 15,
+        NVPW_SUBMETRIC_PCT_OF_PEAK_SUSTAINED_ELAPSED = 16,
+        NVPW_SUBMETRIC_PCT_OF_PEAK_SUSTAINED_FRAME = 17,
+        NVPW_SUBMETRIC_PCT_OF_PEAK_SUSTAINED_REGION = 18,
+        NVPW_SUBMETRIC_MAX_RATE = 19,
+        NVPW_SUBMETRIC_PCT = 20,
+        NVPW_SUBMETRIC_RATIO = 21,
+        NVPW_SUBMETRIC__COUNT  ///< = 22 (implicit), i.e. the number of enumerators declared above it
+    } NVPW_Submetric;
+
+    typedef struct NVPW_MetricEvalRequest  ///< one evaluation request: a metric index plus type / rollup / submetric selectors
+    {
+        /// the metric index as in 'NVPW_MetricsEvaluator_GetMetricNames'
+        size_t metricIndex;
+        /// one of 'NVPW_MetricType'
+        uint8_t metricType;  ///< stored as a fixed-width integer, not as the enum type
+        /// one of 'NVPW_RollupOp', required for Counter and Throughput, doesn't apply to Ratio
+        uint8_t rollupOp;
+        /// one of 'NVPW_Submetric', required for Ratio and Throughput, optional for Counter
+        uint16_t submetric;  ///< last field: the _STRUCT_SIZE macro below is expressed in terms of it
+    } NVPW_MetricEvalRequest;
+#define NVPW_MetricEvalRequest_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricEvalRequest, submetric)
+
+    typedef struct NVPW_DimUnitFactor  ///< element type of NVPW_MetricsEvaluator_GetMetricDimUnits_Params::pDimUnits
+    {
+        /// one of 'NVPW_DimUnitName'
+        uint32_t dimUnit;
+        int8_t exponent;  ///< signed; presumably the power applied to 'dimUnit' (negative = denominator) — TODO confirm against NVIDIA docs
+    } NVPW_DimUnitFactor;
+#define NVPW_DimUnitFactor_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_DimUnitFactor, exponent)
+
+    typedef struct NVPW_MetricsEvaluator_Destroy_Params  ///< parameter block taken by NVPW_MetricsEvaluator_Destroy()
+    {
+        /// [in]
+        size_t structSize;  ///< presumably NVPW_MetricsEvaluator_Destroy_Params_STRUCT_SIZE — confirm against nvperf_common.h
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;  ///< used here only through a pointer to the struct tag
+    } NVPW_MetricsEvaluator_Destroy_Params;
+#define NVPW_MetricsEvaluator_Destroy_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_Destroy_Params, pMetricsEvaluator)
+
+    NVPA_Status NVPW_MetricsEvaluator_Destroy(NVPW_MetricsEvaluator_Destroy_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetMetricNames_Params  ///< parameter block taken by NVPW_MetricsEvaluator_GetMetricNames()
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] one of 'NVPW_MetricType'
+        uint8_t metricType;
+        /// [out]
+        const char* pMetricNames;  ///< a single const char*, not an array of strings; presumably a packed buffer indexed via 'pMetricNameBeginIndices' — TODO confirm
+        /// [out]
+        const size_t* pMetricNameBeginIndices;  ///< NOTE(review): element count is not stated here; presumably related to 'numMetrics' — confirm
+        /// [out]
+        size_t numMetrics;
+    } NVPW_MetricsEvaluator_GetMetricNames_Params;
+#define NVPW_MetricsEvaluator_GetMetricNames_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetMetricNames_Params, numMetrics)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetMetricNames(NVPW_MetricsEvaluator_GetMetricNames_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetMetricTypeAndIndex_Params  ///< parameter block taken by NVPW_MetricsEvaluator_GetMetricTypeAndIndex()
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] can be either a base metric or a metric
+        const char* pMetricName;
+        /// [out] one of 'NVPW_MetricType'
+        uint8_t metricType;  ///< same width as NVPW_MetricEvalRequest::metricType
+        /// [out] the metric index as in 'NVPW_MetricsEvaluator_GetMetricNames'
+        size_t metricIndex;  ///< same type as NVPW_MetricEvalRequest::metricIndex
+    } NVPW_MetricsEvaluator_GetMetricTypeAndIndex_Params;
+#define NVPW_MetricsEvaluator_GetMetricTypeAndIndex_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetMetricTypeAndIndex_Params, metricIndex)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetMetricTypeAndIndex(NVPW_MetricsEvaluator_GetMetricTypeAndIndex_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params  ///< parameter block taken by NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest()
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in]
+        const char* pMetricName;
+        /// [inout] 'pMetricEvalRequest' is in, '*pMetricEvalRequest' is out
+        struct NVPW_MetricEvalRequest* pMetricEvalRequest;  ///< i.e. the caller supplies the storage that receives the request
+        /// [in] set to 'NVPW_MetricEvalRequest_STRUCT_SIZE'
+        size_t metricEvalRequestStructSize;  ///< describes the pointee above; distinct from this struct's own 'structSize'
+    } NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params;
+#define NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params, metricEvalRequestStructSize)
+
+    NVPA_Status NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest(NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_HwUnitToString_Params  ///< parameter block taken by NVPW_MetricsEvaluator_HwUnitToString()
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] one of 'NVPW_HwUnit'
+        uint32_t hwUnit;
+        /// [out]
+        const char* pHwUnitName;  ///< NOTE(review): ownership/lifetime of the returned string is not stated in this header — confirm before caching
+    } NVPW_MetricsEvaluator_HwUnitToString_Params;
+#define NVPW_MetricsEvaluator_HwUnitToString_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_HwUnitToString_Params, pHwUnitName)
+
+    NVPA_Status NVPW_MetricsEvaluator_HwUnitToString(NVPW_MetricsEvaluator_HwUnitToString_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetCounterProperties_Params  ///< parameter block taken by NVPW_MetricsEvaluator_GetCounterProperties()
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] the metric index as in 'NVPW_MetricsEvaluator_GetMetricNames'
+        size_t counterIndex;
+        /// [out]
+        const char* pDescription;
+        /// [out] one of 'NVPW_HwUnit'
+        uint32_t hwUnit;  ///< 32-bit here; note the Ratio variant of this struct declares its 'hwUnit' as uint64_t
+    } NVPW_MetricsEvaluator_GetCounterProperties_Params;
+#define NVPW_MetricsEvaluator_GetCounterProperties_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetCounterProperties_Params, hwUnit)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetCounterProperties(NVPW_MetricsEvaluator_GetCounterProperties_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetRatioMetricProperties_Params  ///< parameter block taken by NVPW_MetricsEvaluator_GetRatioMetricProperties()
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] the metric index as in 'NVPW_MetricsEvaluator_GetMetricNames'
+        size_t ratioMetricIndex;
+        /// [out]
+        const char* pDescription;
+        /// [out]
+        uint64_t hwUnit;  ///< NOTE(review): 64-bit, whereas the Counter/Throughput property structs use uint32_t; the width is part of the struct layout, so do not "normalize" it without confirming against the library
+    } NVPW_MetricsEvaluator_GetRatioMetricProperties_Params;
+#define NVPW_MetricsEvaluator_GetRatioMetricProperties_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetRatioMetricProperties_Params, hwUnit)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetRatioMetricProperties(NVPW_MetricsEvaluator_GetRatioMetricProperties_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetThroughputMetricProperties_Params  ///< parameter block taken by NVPW_MetricsEvaluator_GetThroughputMetricProperties()
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] the metric index as in 'NVPW_MetricsEvaluator_GetMetricNames'
+        size_t throughputMetricIndex;
+        /// [out]
+        const char* pDescription;
+        /// [out]
+        uint32_t hwUnit;  ///< presumably one of 'NVPW_HwUnit', as documented on the Counter variant — TODO confirm
+        /// [out] number of constituent counters for the throughput metric
+        size_t numCounters;
+        /// [out] metric indices as in 'NVPW_MetricsEvaluator_GetMetricNames', valid if 'numCounters' > 0, otherwise
+        /// returned as nullptr
+        const size_t* pCounterIndices;  ///< check 'numCounters' before dereferencing
+        /// [out] number of constituent sub-throughputs for the throughput metric
+        size_t numSubThroughputs;
+        /// [out] metric indices as in 'NVPW_MetricsEvaluator_GetMetricNames', valid if 'numSubThroughputs' > 0,
+        /// otherwise returned as nullptr
+        const size_t* pSubThroughputIndices;  ///< check 'numSubThroughputs' before dereferencing
+    } NVPW_MetricsEvaluator_GetThroughputMetricProperties_Params;
+#define NVPW_MetricsEvaluator_GetThroughputMetricProperties_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetThroughputMetricProperties_Params, pSubThroughputIndices)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetThroughputMetricProperties(NVPW_MetricsEvaluator_GetThroughputMetricProperties_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetSupportedSubmetrics_Params  ///< parameter block taken by NVPW_MetricsEvaluator_GetSupportedSubmetrics()
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] one of 'NVPW_MetricType'
+        uint8_t metricType;
+        /// [out] an array of 'NVPW_Submetric'
+        const uint16_t* pSupportedSubmetrics;  ///< elements are uint16_t, matching the width of NVPW_MetricEvalRequest::submetric
+        /// [out]
+        size_t numSupportedSubmetrics;  ///< presumably the element count of 'pSupportedSubmetrics' — TODO confirm
+    } NVPW_MetricsEvaluator_GetSupportedSubmetrics_Params;
+#define NVPW_MetricsEvaluator_GetSupportedSubmetrics_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetSupportedSubmetrics_Params, numSupportedSubmetrics)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetSupportedSubmetrics(NVPW_MetricsEvaluator_GetSupportedSubmetrics_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetMetricRawDependencies_Params  ///< parameter block taken by NVPW_MetricsEvaluator_GetMetricRawDependencies(); per the field comments it supports a size-query call (NULL output arrays) followed by a fill call
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in]
+        const struct NVPW_MetricEvalRequest* pMetricEvalRequests;
+        /// [in]
+        size_t numMetricEvalRequests;
+        /// [in] set to 'NVPW_MetricEvalRequest_STRUCT_SIZE'
+        size_t metricEvalRequestStructSize;  ///< the _STRUCT_SIZE macro value ...
+        /// [in] set to sizeof('NVPW_MetricEvalRequest')
+        size_t metricEvalRequestStrideSize;  ///< ... versus sizeof(): two separate fields, documented with different values
+        /// [inout] 'ppRawDependencies' is in, '*ppRawDependencies' is out
+        const char** ppRawDependencies;
+        /// [inout] if 'ppRawDependencies' is NULL, number of raw dependencies available will be returned; otherwise it
+        /// should be set to the number of elements allocated for 'ppRawDependencies', and on return, it will be
+        /// overwritten by number of elements copied to 'ppRawDependencies'
+        size_t numRawDependencies;
+        /// [inout] 'ppOptionalRawDependencies' is in, '*ppOptionalRawDependencies' is out
+        const char** ppOptionalRawDependencies;
+        /// [inout] if 'ppOptionalRawDependencies' is NULL, number of optional raw dependencies available will be
+        /// returned; otherwise it should be set to the number of elements allocated for 'ppOptionalRawDependencies',
+        /// and on return, it will be overwritten by number of elements copied to 'ppOptionalRawDependencies'
+        size_t numOptionalRawDependencies;
+    } NVPW_MetricsEvaluator_GetMetricRawDependencies_Params;
+#define NVPW_MetricsEvaluator_GetMetricRawDependencies_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetMetricRawDependencies_Params, numOptionalRawDependencies)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetMetricRawDependencies(NVPW_MetricsEvaluator_GetMetricRawDependencies_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_DimUnitToString_Params  ///< parameter block taken by NVPW_MetricsEvaluator_DimUnitToString()
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] one of 'NVPW_DimUnitName'
+        uint32_t dimUnit;  ///< same type as NVPW_DimUnitFactor::dimUnit
+        /// [out]
+        const char* pSingularName;
+        /// [out]
+        const char* pPluralName;  ///< NOTE(review): string ownership/lifetime for both names is not stated in this header — confirm
+    } NVPW_MetricsEvaluator_DimUnitToString_Params;
+#define NVPW_MetricsEvaluator_DimUnitToString_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_DimUnitToString_Params, pPluralName)
+
+    NVPA_Status NVPW_MetricsEvaluator_DimUnitToString(NVPW_MetricsEvaluator_DimUnitToString_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetMetricDimUnits_Params  ///< parameter block taken by NVPW_MetricsEvaluator_GetMetricDimUnits(); per the field comments it supports a size-query call (NULL 'pDimUnits') followed by a fill call
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in]
+        const struct NVPW_MetricEvalRequest* pMetricEvalRequest;  ///< a single request (no count/stride fields accompany it in this struct)
+        /// [in] set to 'NVPW_MetricEvalRequest_STRUCT_SIZE'
+        size_t metricEvalRequestStructSize;
+        /// [inout] 'pDimUnits' is in, '*pDimUnits' is out
+        NVPW_DimUnitFactor* pDimUnits;
+        /// [inout] if 'pDimUnits' is NULL, number of dim-units available will be returned; otherwise it should be set
+        /// to the number of elements allocated for 'pDimUnits', and on return, it will be overwritten by number of
+        /// elements copied to 'pDimUnits'
+        size_t numDimUnits;
+        /// [in] set to 'NVPW_DimUnitFactor_STRUCT_SIZE'
+        size_t dimUnitFactorStructSize;  ///< describes the elements of 'pDimUnits'; distinct from this struct's own 'structSize'
+    } NVPW_MetricsEvaluator_GetMetricDimUnits_Params;
+#define NVPW_MetricsEvaluator_GetMetricDimUnits_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetMetricDimUnits_Params, dimUnitFactorStructSize)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetMetricDimUnits(NVPW_MetricsEvaluator_GetMetricDimUnits_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_SetUserData_Params  ///< parameter block taken by NVPW_MetricsEvaluator_SetUserData()
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] duration in ns of user defined frame
+        double frameDuration;  ///< nanoseconds expressed as a double, not an integer tick count
+        /// [in] duration in ns of user defined region
+        double regionDuration;  ///< nanoseconds expressed as a double, not an integer tick count
+        /// [in]
+        NVPA_Bool isolated;  ///< NOTE(review): meaning is not described in this header; NVPW_MetricsEvaluator_EvaluateToGpuValues_Params has a field of the same name — confirm semantics
+    } NVPW_MetricsEvaluator_SetUserData_Params;
+#define NVPW_MetricsEvaluator_SetUserData_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_SetUserData_Params, isolated)
+
+    NVPA_Status NVPW_MetricsEvaluator_SetUserData(NVPW_MetricsEvaluator_SetUserData_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_EvaluateToGpuValues_Params  ///< parameter block taken by NVPW_MetricsEvaluator_EvaluateToGpuValues()
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in]
+        const struct NVPW_MetricEvalRequest* pMetricEvalRequests;
+        /// [in]
+        size_t numMetricEvalRequests;
+        /// [in] set to 'NVPW_MetricEvalRequest_STRUCT_SIZE'
+        size_t metricEvalRequestStructSize;
+        /// [in] set to sizeof('NVPW_MetricEvalRequest')
+        size_t metricEvalRequestStrideSize;
+        /// [in]
+        const uint8_t* pCounterDataImage;
+        /// [in]
+        size_t counterDataImageSize;  ///< presumably the byte size of 'pCounterDataImage' — TODO confirm
+        /// [in]
+        size_t rangeIndex;  ///< presumably bounded by the count from NVPW_CounterData_GetNumRanges (nvperf_target.h) — TODO confirm
+        /// [in]
+        NVPA_Bool isolated;
+        /// [inout] 'pMetricValues' is in, '*pMetricValues' is out
+        double* pMetricValues;  ///< caller-supplied storage; presumably needs room for 'numMetricEvalRequests' doubles — TODO confirm
+    } NVPW_MetricsEvaluator_EvaluateToGpuValues_Params;
+#define NVPW_MetricsEvaluator_EvaluateToGpuValues_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_EvaluateToGpuValues_Params, pMetricValues)
+
+    NVPA_Status NVPW_MetricsEvaluator_EvaluateToGpuValues(NVPW_MetricsEvaluator_EvaluateToGpuValues_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_SetDeviceAttributes_Params  ///< parameter block taken by NVPW_MetricsEvaluator_SetDeviceAttributes()
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in]
+        const uint8_t* pCounterDataImage;  ///< read-only input (const)
+        /// [in]
+        size_t counterDataImageSize;  ///< presumably the byte size of 'pCounterDataImage' — TODO confirm
+    } NVPW_MetricsEvaluator_SetDeviceAttributes_Params;
+#define NVPW_MetricsEvaluator_SetDeviceAttributes_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_SetDeviceAttributes_Params, counterDataImageSize)
+
+    NVPA_Status NVPW_MetricsEvaluator_SetDeviceAttributes(NVPW_MetricsEvaluator_SetDeviceAttributes_Params* pParams);
+
+/**
+ *  @}
+ ******************************************************************************/
+ 
+
+#endif // NVPERF_HOST_API_DEFINED
+
+
+
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#endif // NVPERF_HOST_H
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_target.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_target.h
new file mode 100644
index 0000000000000000000000000000000000000000..b1c5c85b403c5ebb16d66882aa26c1f1db1d5089
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_target.h
@@ -0,0 +1,626 @@
+#ifndef NVPERF_TARGET_H
+#define NVPERF_TARGET_H
+
+/*
+ * Copyright 2014-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO USER:
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and
+ * international Copyright laws.
+ *
+ * This software and the information contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
+ * of a form of NVIDIA software license agreement.
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+ * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+ * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOURCE CODE.
+ *
+ * U.S. Government End Users.   This source code is a "commercial item" as
+ * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
+ * "commercial computer  software"  and "commercial computer software
+ * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
+ * and is provided to the U.S. Government only as a commercial end item.
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+ * source code with only those rights set forth herein.
+ *
+ * Any use of this source code in individual and commercial software must
+ * include, in the user documentation and internal comments to the code,
+ * the above Disclaimer and U.S. Government End Users Notice.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include "nvperf_common.h"
+
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
+    #pragma GCC visibility push(default)
+    #if !defined(NVPW_LOCAL)
+        #define NVPW_LOCAL __attribute__ ((visibility ("hidden")))
+    #endif
+#else
+    #if !defined(NVPW_LOCAL)
+        #define NVPW_LOCAL
+    #endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ *  @file   nvperf_target.h
+ */
+
+#ifndef NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_DEFINED
+#define NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_DEFINED
+    /// GPU architecture support level
+    typedef enum NVPW_GpuArchitectureSupportLevel
+    {
+        NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_UNKNOWN = 0,  ///< zero, so zero-initialized storage reads as UNKNOWN
+        NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_UNSUPPORTED,  ///< = 1 (implicit)
+        NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_SUPPORTED  ///< = 2 (implicit)
+    } NVPW_GpuArchitectureSupportLevel;
+#endif //NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_DEFINED
+
+#ifndef NVPW_SLI_SUPPORT_LEVEL_DEFINED
+#define NVPW_SLI_SUPPORT_LEVEL_DEFINED
+    /// SLI configuration support level
+    typedef enum NVPW_SliSupportLevel
+    {
+        NVPW_SLI_SUPPORT_LEVEL_UNKNOWN = 0,  ///< zero, so zero-initialized storage reads as UNKNOWN
+        NVPW_SLI_SUPPORT_LEVEL_UNSUPPORTED,  ///< = 1 (implicit)
+        /// Only Non-SLI configurations are supported.
+        NVPW_SLI_SUPPORT_LEVEL_SUPPORTED_NON_SLI_CONFIGURATION  ///< = 2 (implicit); this enum declares no plain "SUPPORTED" value
+    } NVPW_SliSupportLevel;
+#endif //NVPW_SLI_SUPPORT_LEVEL_DEFINED
+
+#ifndef NVPW_VGPU_SUPPORT_LEVEL_DEFINED
+#define NVPW_VGPU_SUPPORT_LEVEL_DEFINED
+    /// Virtualized GPU configuration support level
+    typedef enum NVPW_VGpuSupportLevel
+    {
+        NVPW_VGPU_SUPPORT_LEVEL_UNKNOWN = 0,  ///< zero, so zero-initialized storage reads as UNKNOWN
+        NVPW_VGPU_SUPPORT_LEVEL_UNSUPPORTED,  ///< = 1 (implicit)
+        /// Supported but not allowed by system admin.
+        NVPW_VGPU_SUPPORT_LEVEL_SUPPORTED_DISALLOWED,  ///< = 2 (implicit)
+        NVPW_VGPU_SUPPORT_LEVEL_SUPPORTED_ALLOWED,  ///< = 3 (implicit)
+        NVPW_VGPU_SUPPORT_LEVEL_SUPPORTED_NON_VGPU_CONFIGURATION  ///< = 4 (implicit)
+    } NVPW_VGpuSupportLevel;
+#endif //NVPW_VGPU_SUPPORT_LEVEL_DEFINED
+
+#ifndef NVPW_CONF_COMPUTE_SUPPORT_LEVEL_DEFINED
+#define NVPW_CONF_COMPUTE_SUPPORT_LEVEL_DEFINED
+    /// Confidential Compute mode support level
+    typedef enum NVPW_ConfidentialComputeSupportLevel
+    {
+        NVPW_CONF_COMPUTE_SUPPORT_LEVEL_UNKNOWN = 0,  ///< zero, so zero-initialized storage reads as UNKNOWN
+        NVPW_CONF_COMPUTE_SUPPORT_LEVEL_UNSUPPORTED,  ///< = 1 (implicit)
+        NVPW_CONF_COMPUTE_SUPPORT_LEVEL_SUPPORTED_NON_CONF_COMPUTE_CONFIGURATION,  ///< = 2 (implicit)
+        NVPW_CONF_COMPUTE_SUPPORT_LEVEL_SUPPORTED_CONF_COMPUTE_DEVTOOLS_MODE  ///< = 3 (implicit)
+    } NVPW_ConfidentialComputeSupportLevel;
+#endif //NVPW_CONF_COMPUTE_SUPPORT_LEVEL_DEFINED
+
+#ifndef NVPW_CMP_SUPPORT_LEVEL_DEFINED
+#define NVPW_CMP_SUPPORT_LEVEL_DEFINED
+    /// CMP support level
+    typedef enum NVPW_CmpSupportLevel
+    {
+        NVPW_CMP_SUPPORT_LEVEL_UNKNOWN = 0,  ///< zero, so zero-initialized storage reads as UNKNOWN
+        NVPW_CMP_SUPPORT_LEVEL_UNSUPPORTED,  ///< = 1 (implicit)
+        NVPW_CMP_SUPPORT_LEVEL_SUPPORTED_NON_CMP_CONFIGURATON  ///< = 2 (implicit). NOTE: spelled "CONFIGURATON" (sic); the name is declared in this public header, so renaming it would break code that uses it
+    } NVPW_CmpSupportLevel;
+#endif //NVPW_CMP_SUPPORT_LEVEL_DEFINED
+
+#ifndef NVPW_WSL_SUPPORT_LEVEL_DEFINED
+#define NVPW_WSL_SUPPORT_LEVEL_DEFINED
+    /// WSL support level
+    typedef enum NVPW_WslSupportLevel
+    {
+        NVPW_WSL_SUPPORT_LEVEL_UNKNOWN = 0,  ///< zero, so zero-initialized storage reads as UNKNOWN
+        NVPW_WSL_SUPPORT_LEVEL_UNSUPPORTED_INSUFFICIENT_DRIVER_VERSION,  ///< = 1 (implicit); the only "unsupported" value this enum declares
+        NVPW_WSL_SUPPORT_LEVEL_SUPPORTED,  ///< = 2 (implicit)
+        NVPW_WSL_SUPPORT_LEVEL_SUPPORTED_NON_WSL_CONFIGURATION  ///< = 3 (implicit)
+    } NVPW_WslSupportLevel;
+#endif //NVPW_WSL_SUPPORT_LEVEL_DEFINED
+
+#ifndef NVPW_MIG_SUPPORT_LEVEL_DEFINED
+#define NVPW_MIG_SUPPORT_LEVEL_DEFINED
+    /// MIG support level
+    typedef enum NVPW_MigSupportLevel
+    {
+        NVPW_MIG_SUPPORT_LEVEL_UNKNOWN = 0,  ///< zero, so zero-initialized storage reads as UNKNOWN
+        NVPW_MIG_SUPPORT_LEVEL_UNSUPPORTED,  ///< = 1 (implicit)
+        NVPW_MIG_SUPPORT_LEVEL_SUPPORTED,  ///< = 2 (implicit)
+        NVPW_MIG_SUPPORT_LEVEL_SUPPORTED_NON_MIG_CONFIGURATION  ///< = 3 (implicit)
+    } NVPW_MigSupportLevel;
+#endif //NVPW_MIG_SUPPORT_LEVEL_DEFINED
+
+    typedef struct NVPW_InitializeTarget_Params  ///< parameter block taken by NVPW_InitializeTarget(); carries only the common header fields
+    {
+        /// [in]
+        size_t structSize;  ///< presumably NVPW_InitializeTarget_Params_STRUCT_SIZE — confirm against nvperf_common.h
+        /// [in] assign to NULL
+        void* pPriv;
+    } NVPW_InitializeTarget_Params;
+#define NVPW_InitializeTarget_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_InitializeTarget_Params, pPriv)
+
+    /// Load the target library.
+    NVPA_Status NVPW_InitializeTarget(NVPW_InitializeTarget_Params* pParams);
+
+    typedef struct NVPW_GetDeviceCount_Params  ///< parameter block taken by NVPW_GetDeviceCount()
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        size_t numDevices;  ///< direction not annotated; presumably [out], since NVPW_Device_GetPciBusIds_Params refers to the "result from NVPW_GetDeviceCount" — confirm
+    } NVPW_GetDeviceCount_Params;
+#define NVPW_GetDeviceCount_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_GetDeviceCount_Params, numDevices)
+
+    NVPA_Status NVPW_GetDeviceCount(NVPW_GetDeviceCount_Params* pParams);
+
+    typedef struct NVPW_Device_GetNames_Params  ///< parameter block taken by NVPW_Device_GetNames()
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        size_t deviceIndex;  ///< direction not annotated; presumably [in] — TODO confirm
+        const char* pDeviceName;  ///< direction not annotated; presumably [out] — TODO confirm
+        const char* pChipName;  ///< direction not annotated; presumably [out] — TODO confirm
+    } NVPW_Device_GetNames_Params;
+#define NVPW_Device_GetNames_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetNames_Params, pChipName)
+
+    NVPA_Status NVPW_Device_GetNames(NVPW_Device_GetNames_Params* pParams);
+
+    typedef struct NVPW_PciBusId  ///< element type of NVPW_Device_GetPciBusIds_Params::pBusIds
+    {
+        /// The PCI domain on which the device bus resides.
+        uint32_t domain;
+        /// The bus on which the device resides.
+        uint16_t bus;
+        /// device ID.
+        uint16_t device;
+    } NVPW_PciBusId;
+#define NVPW_PciBusId_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PciBusId, device)
+
+    typedef struct NVPW_Device_GetPciBusIds_Params  ///< parameter block taken by NVPW_Device_GetPciBusIds()
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] caller-allocated array of NVPW_PciBusId, indexed by NVPW deviceIndex
+        NVPW_PciBusId* pBusIds;  ///< non-const pointee; presumably the call fills the array — TODO confirm
+        /// [in] size of the pBusIds array; use result from NVPW_GetDeviceCount
+        size_t numDevices;
+    } NVPW_Device_GetPciBusIds_Params;
+#define NVPW_Device_GetPciBusIds_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetPciBusIds_Params, numDevices)
+
+    NVPA_Status NVPW_Device_GetPciBusIds(NVPW_Device_GetPciBusIds_Params* pParams);
+
+
+#define NVPW_DEVICE_MIG_GPU_INSTANCE_ID_INVALID     0xFFFFFFFFu  ///< largest 32-bit unsigned value
+#define NVPW_DEVICE_MIG_GPU_INSTANCE_ID_FULLCHIP    0xFFFFFFFEu  ///< one below INVALID
+
+
+    typedef struct NVPW_Device_GetMigAttributes_Params  ///< parameter block taken by NVPW_Device_GetMigAttributes()
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        size_t deviceIndex;
+        /// [out]
+        NVPA_Bool isMigPartition;
+        /// [out]
+        uint32_t gpuInstanceId;  ///< presumably may hold one of the NVPW_DEVICE_MIG_GPU_INSTANCE_ID_* values defined above — TODO confirm
+        /// [out]
+        uint32_t computeInstanceId;
+    } NVPW_Device_GetMigAttributes_Params;
+#define NVPW_Device_GetMigAttributes_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetMigAttributes_Params, computeInstanceId)
+
+    NVPA_Status NVPW_Device_GetMigAttributes(NVPW_Device_GetMigAttributes_Params* pParams);
+
+    typedef struct NVPW_Adapter_GetDeviceIndex_Params  ///< parameter block taken by NVPW_Adapter_GetDeviceIndex()
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct IDXGIAdapter* pAdapter;  ///< referenced only through a pointer to the struct tag, so this declaration does not need the type's full definition
+        /// [in]
+        size_t sliIndex;
+        /// [out]
+        size_t deviceIndex;  ///< presumably the same index space as the 'deviceIndex' inputs of the NVPW_Device_* calls — TODO confirm
+    } NVPW_Adapter_GetDeviceIndex_Params;
+#define NVPW_Adapter_GetDeviceIndex_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Adapter_GetDeviceIndex_Params, deviceIndex)
+
+    NVPA_Status NVPW_Adapter_GetDeviceIndex(NVPW_Adapter_GetDeviceIndex_Params* pParams);
+
+    typedef struct NVPW_CounterData_GetNumRanges_Params  ///< parameter block taken by NVPW_CounterData_GetNumRanges()
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const uint8_t* pCounterDataImage;  ///< direction not annotated; presumably [in]. NOTE: unlike NVPW_CounterData_GetChipName_Params, no image-size field accompanies it
+        size_t numRanges;  ///< direction not annotated; presumably [out] — TODO confirm
+    } NVPW_CounterData_GetNumRanges_Params;
+#define NVPW_CounterData_GetNumRanges_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_GetNumRanges_Params, numRanges)
+
+    NVPA_Status NVPW_CounterData_GetNumRanges(NVPW_CounterData_GetNumRanges_Params* pParams);
+
+    typedef struct NVPW_CounterData_GetChipName_Params  ///< parameter block taken by NVPW_CounterData_GetChipName()
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const uint8_t* pCounterDataImage;
+        /// [in]
+        size_t counterDataImageSize;  ///< presumably the byte size of 'pCounterDataImage' — TODO confirm
+        /// [out]
+        const char* pChipName;  ///< NOTE(review): whether this points into the image or into library storage is not stated here — confirm before freeing the image
+    } NVPW_CounterData_GetChipName_Params;
+#define NVPW_CounterData_GetChipName_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_GetChipName_Params, pChipName)
+
+    NVPA_Status NVPW_CounterData_GetChipName(NVPW_CounterData_GetChipName_Params* pParams);
+
+    typedef struct NVPW_Config_GetNumPasses_Params  ///< parameter block taken by NVPW_Config_GetNumPasses(); a _V2 variant reporting a single 'numPasses' is declared next in this header
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const uint8_t* pConfig;  ///< NOTE: no size field accompanies this buffer in the struct
+        /// [out]
+        size_t numPipelinedPasses;
+        /// [out]
+        size_t numIsolatedPasses;
+    } NVPW_Config_GetNumPasses_Params;
+#define NVPW_Config_GetNumPasses_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Config_GetNumPasses_Params, numIsolatedPasses)
+
+    /// Total num passes = numPipelinedPasses + numIsolatedPasses * numNestingLevels
+    NVPA_Status NVPW_Config_GetNumPasses(NVPW_Config_GetNumPasses_Params* pParams);  ///< 'numNestingLevels' in the formula above is not a field of the params struct; the caller must supply it
+
+    typedef struct NVPW_Config_GetNumPasses_V2_Params  ///< parameter block taken by NVPW_Config_GetNumPasses_V2()
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const uint8_t* pConfig;  ///< NOTE: no size field accompanies this buffer in the struct
+        /// [out]
+        size_t numPasses;  ///< single count, where the non-V2 struct reports pipelined and isolated passes separately
+    } NVPW_Config_GetNumPasses_V2_Params;
+#define NVPW_Config_GetNumPasses_V2_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Config_GetNumPasses_V2_Params, numPasses)
+
+    /// Total num passes = numPasses * numNestingLevels
+    NVPA_Status NVPW_Config_GetNumPasses_V2(NVPW_Config_GetNumPasses_V2_Params* pParams);  ///< 'numNestingLevels' in the formula above is not a field of the params struct; the caller must supply it
+
+#define NVPW_API_SET_CUDA_PROFILER             0x18209d0775b2f89dULL  ///< NVPW_API_SET_*: 64-bit opaque constants; presumably the values for NVPW_QueryVersionNumber_Params::apiSet (a uint64_t) — TODO confirm
+
+#define NVPW_API_SET_D3D11_PROFILER            0xca55c6738445db2bULL
+
+#define NVPW_API_SET_D3D12_PROFILER            0xc0c2d46dd7c7ad78ULL
+
+#define NVPW_API_SET_EGL_PROFILER              0x3c3747dae1f9565cULL
+
+#define NVPW_API_SET_GPU_PERIODICSAMPLER       0x9f4c2571fc0b2e8aULL
+
+#define NVPW_API_SET_METRICSEVALUATOR          0x0368a8768d811af9ULL
+
+#define NVPW_API_SET_METRICS_AD10X_COMP        0xbe57278e12cb5288ULL
+
+#define NVPW_API_SET_METRICS_AD10X_GRFX        0x5cbf0774f81bf491ULL
+
+#define NVPW_API_SET_METRICS_GA100_COMP        0x16b7d8c20d8b4915ULL
+
+#define NVPW_API_SET_METRICS_GA100_GRFX        0xc94eaabec04a94faULL
+
+#define NVPW_API_SET_METRICS_GA10X_COMP        0xb5d6391c2e299ab5ULL
+
+#define NVPW_API_SET_METRICS_GA10X_GRFX        0x6ebc121178b5ce0bULL
+
+#define NVPW_API_SET_METRICS_GV100_COMP        0x863705cc57919f72ULL
+
+#define NVPW_API_SET_METRICS_GV100_GRFX        0x9900da75d164fecfULL
+
+#define NVPW_API_SET_METRICS_GV11B_COMP        0xd3f79a859235848fULL
+
+#define NVPW_API_SET_METRICS_GV11B_GRFX        0xeb8e26220106e227ULL
+
+#define NVPW_API_SET_METRICS_TU10X_COMP        0x70f40be0afd35da8ULL
+
+#define NVPW_API_SET_METRICS_TU10X_GRFX        0xdf219cb838db6968ULL
+
+#define NVPW_API_SET_METRICS_TU11X_COMP        0xeb0069d7d0956678ULL
+
+#define NVPW_API_SET_METRICS_TU11X_GRFX        0x0977d9342bd62743ULL
+
+#define NVPW_API_SET_OPENGL_PROFILER           0xe4cd9ea40f2ee777ULL
+
+#define NVPW_API_SET_VULKAN_PROFILER           0x8c56b6a03d779689ULL
+
+#define NVPW_SDK_VERSION               0x1e128b6f001423fcULL  ///< NOTE(review): same 64-bit form as the API_SET constants but a different name prefix; its intended use is not described in this header — confirm
+
+    typedef struct NVPW_QueryVersionNumber_Params
+    {
+        /// [in] NOTE(review): presumably set to NVPW_QueryVersionNumber_Params_STRUCT_SIZE - confirm
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] API set to query; NOTE(review): presumably one of the NVPW_API_SET_* values - confirm
+        uint64_t apiSet;
+        /// [out] major component of the API set's version number
+        uint32_t major;
+        /// [out] minor component of the API set's version number
+        uint32_t minor;
+        /// [out] patch component of the API set's version number
+        uint32_t patch;
+        /// [out] "rel" major component; NOTE(review): presumably a release version - confirm
+        uint32_t relMajor;
+        /// [out] "rel" minor component; NOTE(review): presumably a release version - confirm
+        uint32_t relMinor;
+        /// [out] "rel" patch component; NOTE(review): presumably a release version - confirm
+        uint32_t relPatch;
+    } NVPW_QueryVersionNumber_Params;
+#define NVPW_QueryVersionNumber_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_QueryVersionNumber_Params, relPatch)
+
+    /// Query version number of an API set (the set is given by pParams->apiSet)
+    NVPA_Status NVPW_QueryVersionNumber(NVPW_QueryVersionNumber_Params* pParams);
+
+    typedef enum NVPW_Device_ClockStatus
+    {
+        /// clock status is unknown
+        NVPW_DEVICE_CLOCK_STATUS_UNKNOWN,
+        /// clocks are locked to rated tdp values - Deprecated, use NVPW_DEVICE_CLOCK_STATUS_LOCKED instead
+        NVPW_DEVICE_CLOCK_STATUS_LOCKED_TO_RATED_TDP,
+        /// clocks are not locked and can boost above rated tdp
+        NVPW_DEVICE_CLOCK_STATUS_BOOST_ENABLED,
+        /// clocks are not locked and will not go above rated tdp
+        NVPW_DEVICE_CLOCK_STATUS_BOOST_DISABLED,
+        /// clocks are locked
+        NVPW_DEVICE_CLOCK_STATUS_LOCKED,
+        /// clocks are not locked
+        NVPW_DEVICE_CLOCK_STATUS_UNLOCKED,
+        NVPW_DEVICE_CLOCK_STATUS__COUNT ///< sentinel, not a status: equals the number of enumerators above (implicit numbering from 0)
+    } NVPW_Device_ClockStatus;
+
+    typedef enum NVPW_Device_ClockLevel
+    {
+        /// clock level is invalid
+        NVPW_DEVICE_CLOCK_LEVEL_INVALID,
+        /// clock level is at rated tdp
+        NVPW_DEVICE_CLOCK_LEVEL_RATED_TDP,
+        /// clock level is at turbo boost
+        NVPW_DEVICE_CLOCK_LEVEL_TURBO_BOOST,
+        NVPW_DEVICE_CLOCK_LEVEL__COUNT ///< sentinel, not a level: equals the number of enumerators above (implicit numbering from 0)
+    } NVPW_Device_ClockLevel;
+
+    typedef struct NVPW_Device_GetClockStatus_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        size_t deviceIndex; ///< NOTE(review): undocumented upstream; presumably [in] index of the device to query - confirm
+        /// [in] as tagged upstream; NOTE(review): for a "Get" call this looks like an [out] value - confirm
+        NVPW_Device_ClockStatus clockStatus;
+        /// [in] as tagged upstream; NOTE(review): for a "Get" call this looks like an [out] value - confirm
+        NVPW_Device_ClockLevel clockLevel;
+    } NVPW_Device_GetClockStatus_Params;
+#define NVPW_Device_GetClockStatus_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetClockStatus_Params, clockLevel)
+    /// NOTE(review): undocumented upstream; presumably reports clockStatus/clockLevel for the device at pParams->deviceIndex - confirm
+    NVPA_Status NVPW_Device_GetClockStatus(NVPW_Device_GetClockStatus_Params* pParams);
+
+    typedef enum NVPW_Device_ClockSetting
+    {
+        /// invalid op, specify valid clocks operation during profiling
+        NVPW_DEVICE_CLOCK_SETTING_INVALID,
+        /// default to driver/application config (normally unlocked and not boosted, but could be unlocked boosted, or
+        /// locked to rated TDP)
+        NVPW_DEVICE_CLOCK_SETTING_DEFAULT,
+        /// lock clocks at rated tdp base values
+        NVPW_DEVICE_CLOCK_SETTING_LOCK_TO_RATED_TDP,
+        /// lock clocks at turbo boost values
+        NVPW_DEVICE_CLOCK_SETTING_LOCK_TO_TURBO_BOOST,
+        NVPW_DEVICE_CLOCK_SETTING__COUNT ///< sentinel, not a setting: equals the number of enumerators above (implicit numbering from 0)
+    } NVPW_Device_ClockSetting;
+
+    typedef struct NVPW_Device_SetClockSetting_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        size_t deviceIndex; ///< NOTE(review): undocumented upstream; presumably [in] index of the device to configure - confirm
+        /// [in] requested clock setting (one of the NVPW_Device_ClockSetting enumerators)
+        NVPW_Device_ClockSetting clockSetting;
+    } NVPW_Device_SetClockSetting_Params;
+#define NVPW_Device_SetClockSetting_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_SetClockSetting_Params, clockSetting)
+    /// NOTE(review): undocumented upstream; presumably applies pParams->clockSetting to the device at pParams->deviceIndex - confirm
+    NVPA_Status NVPW_Device_SetClockSetting(NVPW_Device_SetClockSetting_Params* pParams);
+
+    typedef struct NVPW_CounterData_GetRangeDescriptions_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const uint8_t* pCounterDataImage; ///< NOTE(review): undocumented upstream; presumably [in] counter data image to read - confirm
+        size_t rangeIndex; ///< NOTE(review): undocumented upstream; presumably [in] index of the range within the image - confirm
+        /// [inout] Number of descriptions allocated in ppDescriptions
+        size_t numDescriptions;
+        const char** ppDescriptions; ///< NOTE(review): undocumented upstream; presumably a caller-provided array sized by numDescriptions - confirm
+    } NVPW_CounterData_GetRangeDescriptions_Params;
+#define NVPW_CounterData_GetRangeDescriptions_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_GetRangeDescriptions_Params, ppDescriptions)
+    /// NOTE(review): undocumented upstream; presumably fills ppDescriptions for range rangeIndex of pCounterDataImage - confirm
+    NVPA_Status NVPW_CounterData_GetRangeDescriptions(NVPW_CounterData_GetRangeDescriptions_Params* pParams);
+
+    typedef struct NVPW_Profiler_CounterData_GetRangeDescriptions_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const uint8_t* pCounterDataImage; ///< NOTE(review): undocumented upstream; presumably [in] counter data image to read - confirm
+        size_t rangeIndex; ///< NOTE(review): undocumented upstream; presumably [in] index of the range within the image - confirm
+        /// [inout] Number of descriptions allocated in ppDescriptions
+        size_t numDescriptions;
+        const char** ppDescriptions; ///< NOTE(review): undocumented upstream; presumably a caller-provided array sized by numDescriptions - confirm
+    } NVPW_Profiler_CounterData_GetRangeDescriptions_Params;
+#define NVPW_Profiler_CounterData_GetRangeDescriptions_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Profiler_CounterData_GetRangeDescriptions_Params, ppDescriptions)
+    /// Same parameter layout as NVPW_CounterData_GetRangeDescriptions_Params. NOTE(review): how the two calls differ is not stated here - confirm
+    NVPA_Status NVPW_Profiler_CounterData_GetRangeDescriptions(NVPW_Profiler_CounterData_GetRangeDescriptions_Params* pParams);
+
+#ifndef NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_DEFINED // skip the enum if this guard macro is already defined
+#define NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_DEFINED
+    typedef enum NVPW_PeriodicSampler_CounterData_AppendMode
+    {
+        NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_LINEAR = 0, ///< NOTE(review): presumably the "linear" mode named in the GetInfo_Params field comments - confirm
+        NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_CIRCULAR = 1, ///< NOTE(review): presumably the "circular" mode named in the GetInfo_Params field comments - confirm
+        NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE__COUNT ///< sentinel, not a mode: evaluates to 2, the number of modes above
+    } NVPW_PeriodicSampler_CounterData_AppendMode;
+#endif //NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_DEFINED
+
+    typedef struct NVPW_PeriodicSampler_CounterData_GetSampleTime_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] counter data image to read
+        const uint8_t* pCounterDataImage;
+        /// [in] index of the range (one sample per range in the periodic sampler) whose time is requested
+        size_t rangeIndex;
+        /// [out] start timestamp of that sample; NOTE(review): units/clock domain are not stated here - confirm
+        uint64_t timestampStart;
+        /// [out] end timestamp of that sample; NOTE(review): units/clock domain are not stated here - confirm
+        uint64_t timestampEnd;
+    } NVPW_PeriodicSampler_CounterData_GetSampleTime_Params;
+#define NVPW_PeriodicSampler_CounterData_GetSampleTime_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_GetSampleTime_Params, timestampEnd)
+    /// NOTE(review): undocumented upstream; presumably reports timestampStart/timestampEnd for the sample at rangeIndex - confirm
+    NVPA_Status NVPW_PeriodicSampler_CounterData_GetSampleTime(NVPW_PeriodicSampler_CounterData_GetSampleTime_Params* pParams);
+
+    typedef struct NVPW_PeriodicSampler_CounterData_TrimInPlace_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] counter data image; non-const pointer, NOTE(review): presumably rewritten in place - confirm
+        uint8_t* pCounterDataImage;
+        /// [in] size of pCounterDataImage; NOTE(review): presumably in bytes - confirm
+        size_t counterDataImageSize;
+        /// [out] size of the image after trimming
+        size_t counterDataImageTrimmedSize;
+    } NVPW_PeriodicSampler_CounterData_TrimInPlace_Params;
+#define NVPW_PeriodicSampler_CounterData_TrimInPlace_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_TrimInPlace_Params, counterDataImageTrimmedSize)
+    /// NOTE(review): undocumented upstream; presumably trims pCounterDataImage in place and reports the new size - confirm
+    NVPA_Status NVPW_PeriodicSampler_CounterData_TrimInPlace(NVPW_PeriodicSampler_CounterData_TrimInPlace_Params* pParams);
+
+    typedef struct NVPW_PeriodicSampler_CounterData_GetInfo_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] counter data image to inspect
+        const uint8_t* pCounterDataImage;
+        /// [in] size of pCounterDataImage; NOTE(review): presumably in bytes - confirm
+        size_t counterDataImageSize;
+        /// [out] total number of ranges in the counter data
+        size_t numTotalRanges;
+        /// [out] if in "linear" mode, this API returns the number of "populated" ranges; if it's in "circular" mode,
+        /// then it returns the last "populated" range index + 1, when there is no such range, it returns 0.
+        size_t numPopulatedRanges;
+        /// [out] if in "linear" mode, this API returns the number of "completed" ranges; if it's in "circular" mode,
+        /// then it returns the last "completed" range index + 1, when there is no such range, it returns 0.
+        size_t numCompletedRanges;
+    } NVPW_PeriodicSampler_CounterData_GetInfo_Params;
+#define NVPW_PeriodicSampler_CounterData_GetInfo_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_GetInfo_Params, numCompletedRanges)
+
+    /// In periodic sampler, a range in counter data stores exactly one sample's data. For better performance, periodic
+    /// sampler may operate in an out-of-order fashion when populating sample data, i.e. it may not fully populate all
+    /// counters of a sample/range before starting to populate the next sample/range. As a result, we have two concepts
+    /// here, "populated" & "completed": a range is considered "populated" even if only some of its counters have been
+    /// written; on the other hand, a range is only considered "completed" if all the collecting counters have been
+    /// written.
+    NVPA_Status NVPW_PeriodicSampler_CounterData_GetInfo(NVPW_PeriodicSampler_CounterData_GetInfo_Params* pParams);
+
+    typedef struct NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] counter data image to read
+        const uint8_t* pCounterDataImage;
+        /// [in] size of pCounterDataImage; NOTE(review): presumably in bytes - confirm
+        size_t counterDataImageSize;
+        /// [in] index of the range (sample) to query
+        size_t rangeIndex;
+        /// [out] trigger count of that range; NOTE(review): exact meaning of "trigger" is not stated here - confirm
+        uint32_t triggerCount;
+    } NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params;
+#define NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params, triggerCount)
+    /// NOTE(review): undocumented upstream; presumably reports triggerCount for the range at rangeIndex - confirm
+    NVPA_Status NVPW_PeriodicSampler_CounterData_GetTriggerCount(NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params* pParams);
+
+    typedef struct NVPW_PeriodicSampler_CounterData_IsDataComplete_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] counter data image to read
+        const uint8_t* pCounterDataImage;
+        /// [in] size of pCounterDataImage; NOTE(review): presumably in bytes - confirm
+        size_t counterDataImageSize;
+        /// [in] index of the range (sample) to check
+        size_t rangeIndex;
+        /// [out] whether the data of that sample is complete
+        NVPA_Bool isComplete;
+    } NVPW_PeriodicSampler_CounterData_IsDataComplete_Params;
+#define NVPW_PeriodicSampler_CounterData_IsDataComplete_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_IsDataComplete_Params, isComplete)
+
+    /// Checks whether a given sample's data is complete. See also 'NVPW_PeriodicSampler_CounterData_GetInfo'
+    NVPA_Status NVPW_PeriodicSampler_CounterData_IsDataComplete(NVPW_PeriodicSampler_CounterData_IsDataComplete_Params* pParams);
+
+
+    typedef struct NVPW_TimestampReport
+    {
+        uint32_t payload; ///< NOTE(review): meaning is not documented in this header - confirm against the producer
+        uint8_t reserved0004[4]; ///< 4 reserved bytes after the 4-byte payload, so timestamp begins 8 bytes into the struct
+        uint64_t timestamp; ///< NOTE(review): units/clock domain are not stated here - confirm
+    } NVPW_TimestampReport;
+
+
+
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#endif // NVPERF_TARGET_H
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_20_atomic_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_20_atomic_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..7fa9d6f2f96f48ec46d2da816256be238cf70343
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_20_atomic_functions.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+#if !defined(__SM_20_ATOMIC_FUNCTIONS_H__)
+#define __SM_20_ATOMIC_FUNCTIONS_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_20_ATOMIC_FUNCTIONS_DECL__ __device__
+#elif defined(_NVHPC_CUDA)
+#define __SM_20_ATOMIC_FUNCTIONS_DECL__ extern __device__ __cudart_builtin__
+#else /* __CUDACC_RTC__ */
+#define __SM_20_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ || _NVHPC_CUDA */
+
+
+
+/*******************************************************************************
+* atomicAdd(float *, float) is only declared in this header; __DEF_IF_HOST     *
+* closes it with an empty body "{ }" when neither __CUDA_ARCH__ nor            *
+* _NVHPC_CUDA is defined, and with a bare ";" otherwise.                       *
+*******************************************************************************/
+
+__SM_20_ATOMIC_FUNCTIONS_DECL__ float atomicAdd(float *address, float val) __DEF_IF_HOST
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_20_ATOMIC_FUNCTIONS_DECL__
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "sm_20_atomic_functions.hpp"
+#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
+
+#endif /* !__SM_20_ATOMIC_FUNCTIONS_H__ */
+
+#undef EXCLUDE_FROM_RTC
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_20_atomic_functions.hpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_20_atomic_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d13b04ae4fb58970eb17ffefc08ae112e3cd24f4
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_20_atomic_functions.hpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_20_ATOMIC_FUNCTIONS_HPP__)
+#define __SM_20_ATOMIC_FUNCTIONS_HPP__
+
+#if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA)
+extern "C"
+{
+extern __device__ __device_builtin__ float __fAtomicAdd(float *address, float val);
+}
+#endif /* __CUDA_ARCH__ || _NVHPC_CUDA */
+
+#if defined(__CUDACC_RTC__)
+#define __SM_20_ATOMIC_FUNCTIONS_DECL__ __device__
+#else /* __CUDACC_RTC__ */
+#define __SM_20_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+* atomicAdd(float *, float): inline wrapper that forwards both arguments to    *
+* the __fAtomicAdd builtin declared earlier in this file and returns its       *
+* result unchanged.                                                            *
+*******************************************************************************/
+
+__SM_20_ATOMIC_FUNCTIONS_DECL__ float atomicAdd(float *address, float val)
+{
+  return __fAtomicAdd(address, val);
+}
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_20_ATOMIC_FUNCTIONS_DECL__
+
+#endif /* !__SM_20_ATOMIC_FUNCTIONS_HPP__ */
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_20_intrinsics.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_20_intrinsics.h
new file mode 100644
index 0000000000000000000000000000000000000000..f0405fd837a83ad6b5a65ddcba6d5be707fa5a3d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_20_intrinsics.h
@@ -0,0 +1,1156 @@
+/*
+ * Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+#if !defined(__SM_20_INTRINSICS_H__)
+#define __SM_20_INTRINSICS_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_20_INTRINSICS_DECL__ __device__
+#define __COMMON_INTRINSICS_DECL__ __device__
+#else /* __CUDACC_RTC__ */
+#define __SM_20_INTRINSICS_DECL__ static __inline__ __device__
+#define __COMMON_INTRINSICS_DECL__ static __inline__ __host__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ || _NVHPC_CUDA */
+
+#if defined(_WIN32) /* Windows: __declspec form, message is passed through */
+# define __DEPRECATED__(msg) __declspec(deprecated(msg))
+#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__)))) /* __GNUC__ < 4, or 4.0-4.4 without clang: attribute takes no message, msg is dropped */
+# define __DEPRECATED__(msg) __attribute__((deprecated))
+#else /* all remaining compilers: GNU-style attribute carrying the message */
+# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
+#endif /* __DEPRECATED__ selection */
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 /* device pass for compute_70 and above: long message; the two literals are concatenated, so the first must end with a space */
+#define __WSB_DEPRECATION_MESSAGE(x) #x"() is not valid on compute_70 and above, and should be replaced with "#x"_sync(). "\
+    "To continue using "#x"(), specify virtual architecture compute_60 when targeting sm_70 and above, for example, using the pair of compiler options: -arch=compute_60 -code=sm_70."
+#elif defined(_NVHPC_CUDA) /* _NVHPC_CUDA defined: short "not valid" message */
+#define __WSB_DEPRECATION_MESSAGE(x) #x"() is not valid on cc70 and above, and should be replaced with "#x"_sync()."
+#else /* neither of the above: plain deprecation notice; x is stringized into the text */
+#define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
+#endif /* __WSB_DEPRECATION_MESSAGE selection */
+
+extern "C"
+{
+extern __device__ __device_builtin__ void                   __threadfence_system(void);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Divide two floating-point values in round-to-nearest-even mode.
+ *
+ * Divides two floating-point values \p x by \p y in round-to-nearest-even mode.
+ *
+ * \return Returns \p x / \p y.
+ * - sign of the quotient \p x / \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __ddiv_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns NaN.
+ * - __ddiv_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __ddiv_rn(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p x.
+ * - __ddiv_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - __ddiv_rn(\p x, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __ddiv_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for \p y \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __ddiv_rn(double x, double y);
+/**      
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Divide two floating-point values in round-towards-zero mode.
+ *
+ * Divides two floating-point values \p x by \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x / \p y.
+ * - sign of the quotient \p x / \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __ddiv_rz(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns NaN.
+ * - __ddiv_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __ddiv_rz(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p x.
+ * - __ddiv_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - __ddiv_rz(\p x, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __ddiv_rz(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for \p y \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __ddiv_rz(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Divide two floating-point values in round-up mode.
+ * 
+ * Divides two floating-point values \p x by \p y in round-up (to positive infinity) mode.
+ *    
+ * \return Returns \p x / \p y.
+ * - sign of the quotient \p x / \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __ddiv_ru(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns NaN.
+ * - __ddiv_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __ddiv_ru(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p x.
+ * - __ddiv_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - __ddiv_ru(\p x, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __ddiv_ru(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for \p y \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __ddiv_ru(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Divide two floating-point values in round-down mode.
+ *
+ * Divides two floating-point values \p x by \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x / \p y.
+ * - sign of the quotient \p x / \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __ddiv_rd(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns NaN.
+ * - __ddiv_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __ddiv_rd(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p x.
+ * - __ddiv_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - __ddiv_rd(\p x, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __ddiv_rd(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for \p y \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __ddiv_rd(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula
+ *  in round-to-nearest-even mode.
+ * 
+ * Compute the reciprocal of \p x in round-to-nearest-even mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __drcp_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula
+ *  in round-towards-zero mode.
+ *
+ * Compute the reciprocal of \p x in round-towards-zero mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __drcp_rz(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula
+ *  in round-up mode.
+ * 
+ * Compute the reciprocal of \p x in round-up (to positive infinity) mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __drcp_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula
+ *  in round-down mode.
+ * 
+ * Compute the reciprocal of \p x in round-down (to negative infinity) mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __drcp_rd(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula
+ *  in round-to-nearest-even mode.
+ * 
+ * Compute the square root of \p x in round-to-nearest-even mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __dsqrt_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula
+ *  in round-towards-zero mode.
+ * 
+ * Compute the square root of \p x in round-towards-zero mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __dsqrt_rz(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula
+ *  in round-up mode.
+ * 
+ * Compute the square root of \p x in round-up (to positive infinity) mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __dsqrt_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula
+ *  in round-down mode.
+ * 
+ * Compute the square root of \p x in round-down (to negative infinity) mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __dsqrt_rd(double x);
+extern __device__ __device_builtin__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__ballot)) unsigned int __ballot(int);
+extern __device__ __device_builtin__ int                   __syncthreads_count(int);
+extern __device__ __device_builtin__ int                   __syncthreads_and(int);
+extern __device__ __device_builtin__ int                   __syncthreads_or(int);
+extern __device__ __device_builtin__ long long int         clock64(void);
+
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute fused multiply-add operation in round-to-nearest-even mode, ignoring the \p -ftz=true compiler flag.
+ *
+ * Behavior is the same as ::__fmaf_rn(\p x, \p y, \p z), except in the
+ * handling of denormalized inputs and outputs: the \p -ftz compiler flag has no effect.
+ */
+extern __device__ __device_builtin__ float                  __fmaf_ieee_rn(float x, float y, float z);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute fused multiply-add operation in round-down mode, ignoring the \p -ftz=true compiler flag.
+ *
+ * Behavior is the same as ::__fmaf_rd(\p x, \p y, \p z), except in the
+ * handling of denormalized inputs and outputs: the \p -ftz compiler flag has no effect.
+ */
+extern __device__ __device_builtin__ float                  __fmaf_ieee_rd(float x, float y, float z);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute fused multiply-add operation in round-up mode, ignoring the \p -ftz=true compiler flag.
+ *
+ * Behavior is the same as ::__fmaf_ru(\p x, \p y, \p z), except in the
+ * handling of denormalized inputs and outputs: the \p -ftz compiler flag has no effect.
+ */
+extern __device__ __device_builtin__ float                  __fmaf_ieee_ru(float x, float y, float z);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute fused multiply-add operation in round-towards-zero mode, ignoring the \p -ftz=true compiler flag.
+ *
+ * Behavior is the same as ::__fmaf_rz(\p x, \p y, \p z), except in the
+ * handling of denormalized inputs and outputs: the \p -ftz compiler flag has no effect.
+ */
+extern __device__ __device_builtin__ float                  __fmaf_ieee_rz(float x, float y, float z);
+
+
+// SM_13 intrinsics
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret bits in a double as a 64-bit signed integer.
+ *
+ * Reinterpret the bits in the double-precision floating-point value \p x
+ * as a signed 64-bit integer.
+ * \return Returns reinterpreted value.
+ */
+extern __device__ __device_builtin__ long long int         __double_as_longlong(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret bits in a 64-bit signed integer as a double.
+ *
+ * Reinterpret the bits in the 64-bit signed integer value \p x as
+ * a double-precision floating-point value.
+ * \return Returns reinterpreted value.
+ */
+extern __device__ __device_builtin__ double                __longlong_as_double(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation in round-to-nearest-even mode.
+ *
+ * Computes the value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single ternary operation, rounding the
+ * result once in round-to-nearest-even mode.
+ *
+ * \return Returns the rounded value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation.
+ * - __fma_rn(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fma_rn(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fma_rn(\p x, \p y, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns NaN if
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __fma_rn(\p x, \p y, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns NaN if
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - __fma_rn(\p x, \p y, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fma_rn(\p x, \p y, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fma_rn(\p x, \p y, \p z) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y + z \end_cuda_math_formula is exactly zero and \cuda_math_formula z \neq 0 \end_cuda_math_formula.
+ * - If any argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double_intrinsic
+ */
+extern __device__ __device_builtin__ double                __fma_rn(double x, double y, double z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation in round-towards-zero mode.
+ *
+ * Computes the value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single ternary operation, rounding the
+ * result once in round-towards-zero mode.
+ *
+ * \return Returns the rounded value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation.
+ * - __fma_rz(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fma_rz(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fma_rz(\p x, \p y, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns NaN if
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __fma_rz(\p x, \p y, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns NaN if
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - __fma_rz(\p x, \p y, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fma_rz(\p x, \p y, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fma_rz(\p x, \p y, \p z) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y + z \end_cuda_math_formula is exactly zero and \cuda_math_formula z \neq 0 \end_cuda_math_formula.
+ * - If any argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double_intrinsic
+ */
+extern __device__ __device_builtin__ double                __fma_rz(double x, double y, double z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation in round-up mode.
+ *
+ * Computes the value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single ternary operation, rounding the
+ * result once in round-up (to positive infinity) mode.
+ *
+ * \return Returns the rounded value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation.
+ * - __fma_ru(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fma_ru(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fma_ru(\p x, \p y, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns NaN if
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __fma_ru(\p x, \p y, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns NaN if
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - __fma_ru(\p x, \p y, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fma_ru(\p x, \p y, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fma_ru(\p x, \p y, \p z) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y + z \end_cuda_math_formula is exactly zero and \cuda_math_formula z \neq 0 \end_cuda_math_formula.
+ * - If any argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double_intrinsic
+ */
+extern __device__ __device_builtin__ double                __fma_ru(double x, double y, double z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation in round-down mode.
+ *
+ * Computes the value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single ternary operation, rounding the
+ * result once in round-down (to negative infinity) mode.
+ *
+ * \return Returns the rounded value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation.
+ * - __fma_rd(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fma_rd(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fma_rd(\p x, \p y, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns NaN if
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __fma_rd(\p x, \p y, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns NaN if
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - __fma_rd(\p x, \p y, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fma_rd(\p x, \p y, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula -0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fma_rd(\p x, \p y, \p z) returns \cuda_math_formula -0 \end_cuda_math_formula if \cuda_math_formula x \times y + z \end_cuda_math_formula is exactly zero and \cuda_math_formula z \neq 0 \end_cuda_math_formula.
+ * - If any argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double_intrinsic
+ */
+extern __device__ __device_builtin__ double                __fma_rd(double x, double y, double z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Add two floating-point values in round-to-nearest-even mode.
+ *
+ * Adds two floating-point values \p x and \p y in round-to-nearest-even mode.
+ *
+ * \return Returns \p x + \p y.
+ * - __dadd_rn(\p x, \p y) is equivalent to __dadd_rn(\p y, \p x).
+ * - __dadd_rn(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p x.
+ * - __dadd_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __dadd_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns NaN.
+ * - __dadd_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __dadd_rn(\p x, \p -x) returns \cuda_math_formula +0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dadd_rn(double x, double y);
+/**      
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Add two floating-point values in round-towards-zero mode.
+ *
+ * Adds two floating-point values \p x and \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x + \p y.
+ * - __dadd_rz(\p x, \p y) is equivalent to __dadd_rz(\p y, \p x).
+ * - __dadd_rz(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p x.
+ * - __dadd_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __dadd_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns NaN.
+ * - __dadd_rz(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __dadd_rz(\p x, \p -x) returns \cuda_math_formula +0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dadd_rz(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Add two floating-point values in round-up mode.
+ * 
+ * Adds two floating-point values \p x and \p y in round-up (to positive infinity) mode.
+ *    
+ * \return Returns \p x + \p y.
+ * - __dadd_ru(\p x, \p y) is equivalent to __dadd_ru(\p y, \p x).
+ * - __dadd_ru(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p x.
+ * - __dadd_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __dadd_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns NaN.
+ * - __dadd_ru(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __dadd_ru(\p x, \p -x) returns \cuda_math_formula +0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_nofma
+ */ 
+extern __device__ __device_builtin__ double                __dadd_ru(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Add two floating-point values in round-down mode.
+ *
+ * Adds two floating-point values \p x and \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x + \p y.
+ * - __dadd_rd(\p x, \p y) is equivalent to __dadd_rd(\p y, \p x).
+ * - __dadd_rd(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p x.
+ * - __dadd_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __dadd_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns NaN.
+ * - __dadd_rd(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __dadd_rd(\p x, \p -x) returns \cuda_math_formula -0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dadd_rd(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Subtract two floating-point values in round-to-nearest-even mode.
+ *
+ * Subtracts two floating-point values \p x and \p y in round-to-nearest-even mode.
+ *
+ * \return Returns \p x - \p y.
+ * - __dsub_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p y.
+ * - __dsub_rn(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \mp\infty \end_cuda_math_formula for finite \p x.
+ * - __dsub_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __dsub_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __dsub_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __dsub_rn(\p x, \p x) returns \cuda_math_formula +0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dsub_rn(double x, double y);
+/**      
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Subtract two floating-point values in round-towards-zero mode.
+ *
+ * Subtracts two floating-point values \p x and \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x - \p y.
+ * - __dsub_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p y.
+ * - __dsub_rz(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \mp\infty \end_cuda_math_formula for finite \p x.
+ * - __dsub_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __dsub_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __dsub_rz(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __dsub_rz(\p x, \p x) returns \cuda_math_formula +0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dsub_rz(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Subtract two floating-point values in round-up mode.
+ * 
+ * Subtracts two floating-point values \p x and \p y in round-up (to positive infinity) mode.
+ *    
+ * \return Returns \p x - \p y.
+ * - __dsub_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p y.
+ * - __dsub_ru(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \mp\infty \end_cuda_math_formula for finite \p x.
+ * - __dsub_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __dsub_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __dsub_ru(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __dsub_ru(\p x, \p x) returns \cuda_math_formula +0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_nofma
+ */ 
+extern __device__ __device_builtin__ double                __dsub_ru(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Subtract two floating-point values in round-down mode.
+ *
+ * Subtracts two floating-point values \p x and \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x - \p y.
+ * - __dsub_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p y.
+ * - __dsub_rd(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \mp\infty \end_cuda_math_formula for finite \p x.
+ * - __dsub_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __dsub_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __dsub_rd(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __dsub_rd(\p x, \p x) returns \cuda_math_formula -0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dsub_rd(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Multiply two floating-point values in round-to-nearest-even mode.
+ *
+ * Multiplies two floating-point values \p x and \p y in round-to-nearest-even mode.
+ *
+ * \return Returns \p x * \p y.
+ * - The sign of the product \p x * \p y is the XOR of the signs of \p x and \p y when neither the inputs nor the result are NaN.
+ * - __dmul_rn(\p x, \p y) is equivalent to __dmul_rn(\p y, \p x).
+ * - __dmul_rn(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __dmul_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __dmul_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dmul_rn(double x, double y);
+/**      
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Multiply two floating-point values in round-towards-zero mode.
+ *
+ * Multiplies two floating-point values \p x and \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x * \p y.
+ * - The sign of the product \p x * \p y is the XOR of the signs of \p x and \p y when neither the inputs nor the result are NaN.
+ * - __dmul_rz(\p x, \p y) is equivalent to __dmul_rz(\p y, \p x).
+ * - __dmul_rz(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __dmul_rz(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __dmul_rz(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dmul_rz(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Multiply two floating-point values in round-up mode.
+ * 
+ * Multiplies two floating-point values \p x and \p y in round-up (to positive infinity) mode.
+ *    
+ * \return Returns \p x * \p y.
+ * - The sign of the product \p x * \p y is the XOR of the signs of \p x and \p y when neither the inputs nor the result are NaN.
+ * - __dmul_ru(\p x, \p y) is equivalent to __dmul_ru(\p y, \p x).
+ * - __dmul_ru(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __dmul_ru(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __dmul_ru(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dmul_ru(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Multiply two floating-point values in round-down mode.
+ *
+ * Multiplies two floating-point values \p x and \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x * \p y.
+ * - The sign of the product \p x * \p y is the XOR of the signs of \p x and \p y when neither the inputs nor the result are NaN.
+ * - __dmul_rd(\p x, \p y) is equivalent to __dmul_rd(\p y, \p x).
+ * - __dmul_rd(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __dmul_rd(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __dmul_rd(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double_intrinsic
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dmul_rd(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a float in round-to-nearest-even mode.
+ *
+ * Convert the double-precision floating-point value \p x to a single-precision
+ * floating-point value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ float                 __double2float_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a float in round-towards-zero mode.
+ *
+ * Convert the double-precision floating-point value \p x to a single-precision
+ * floating-point value in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ float                 __double2float_rz(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a float in round-up mode.
+ *
+ * Convert the double-precision floating-point value \p x to a single-precision
+ * floating-point value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ float                 __double2float_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a float in round-down mode.
+ *
+ * Convert the double-precision floating-point value \p x to a single-precision
+ * floating-point value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ float                 __double2float_rd(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed int in round-to-nearest-even mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed integer value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+extern __device__ __device_builtin__ int                   __double2int_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed int in round-up mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed integer value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+extern __device__ __device_builtin__ int                   __double2int_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed int in round-down mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed integer value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+extern __device__ __device_builtin__ int                   __double2int_rd(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned int in round-to-nearest-even mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned integer value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+extern __device__ __device_builtin__ unsigned int          __double2uint_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned int in round-up mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned integer value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+extern __device__ __device_builtin__ unsigned int          __double2uint_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned int in round-down mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned integer value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+extern __device__ __device_builtin__ unsigned int          __double2uint_rd(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed 64-bit int in round-to-nearest-even mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed 64-bit integer value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+extern __device__ __device_builtin__ long long int          __double2ll_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed 64-bit int in round-up mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed 64-bit integer value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+extern __device__ __device_builtin__ long long int          __double2ll_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed 64-bit int in round-down mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed 64-bit integer value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+extern __device__ __device_builtin__ long long int          __double2ll_rd(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned 64-bit int in round-to-nearest-even mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned 64-bit integer value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+extern __device__ __device_builtin__ unsigned long long int __double2ull_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned 64-bit int in round-up mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned 64-bit integer value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+extern __device__ __device_builtin__ unsigned long long int __double2ull_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned 64-bit int in round-down mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned 64-bit integer value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+extern __device__ __device_builtin__ unsigned long long int __double2ull_rd(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed int to a double.
+ *
+ * Convert the signed integer value \p x to a double-precision floating-point value.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __int2double_rn(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned int to a double.
+ *
+ * Convert the unsigned integer value \p x to a double-precision floating-point value.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __uint2double_rn(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit int to a double in round-to-nearest-even mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
+ * value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ll2double_rn(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit int to a double in round-towards-zero mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
+ * value in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ll2double_rz(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit int to a double in round-up mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
+ * value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ll2double_ru(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit int to a double in round-down mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
+ * value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ll2double_rd(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit int to a double in round-to-nearest-even mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
+ * value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ull2double_rn(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit int to a double in round-towards-zero mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
+ * value in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ull2double_rz(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit int to a double in round-up mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
+ * value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ull2double_ru(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit int to a double in round-down mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
+ * value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ull2double_rd(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret high 32 bits in a double as a signed integer.
+ *
+ * Reinterpret the high 32 bits in the double-precision floating-point value \p x
+ * as a signed integer.
+ * \return Returns reinterpreted value.
+ */
+extern __device__ __device_builtin__ int                    __double2hiint(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret low 32 bits in a double as a signed integer.
+ *
+ * Reinterpret the low 32 bits in the double-precision floating-point value \p x
+ * as a signed integer.
+ * \return Returns reinterpreted value.
+ */
+extern __device__ __device_builtin__ int                    __double2loint(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret high and low 32-bit integer values as a double.
+ *
+ * Reinterpret the integer value of \p hi as the high 32 bits of a 
+ * double-precision floating-point value and the integer value of \p lo
+ * as the low 32 bits of the same double-precision floating-point value.
+ * \return Returns reinterpreted value.
+ */
+extern __device__ __device_builtin__ double                 __hiloint2double(int hi, int lo);
+
+
+}
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+__SM_20_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__ballot)) unsigned int ballot(bool pred) __DEF_IF_HOST
+
+__SM_20_INTRINSICS_DECL__ int syncthreads_count(bool pred) __DEF_IF_HOST
+
+__SM_20_INTRINSICS_DECL__ bool syncthreads_and(bool pred) __DEF_IF_HOST
+
+__SM_20_INTRINSICS_DECL__ bool syncthreads_or(bool pred) __DEF_IF_HOST
+
+#undef __DEPRECATED__
+#undef __WSB_DEPRECATION_MESSAGE
+
+__SM_20_INTRINSICS_DECL__ unsigned int __isGlobal(const void *ptr) __DEF_IF_HOST
+__SM_20_INTRINSICS_DECL__ unsigned int __isShared(const void *ptr) __DEF_IF_HOST
+__SM_20_INTRINSICS_DECL__ unsigned int __isConstant(const void *ptr) __DEF_IF_HOST
+__SM_20_INTRINSICS_DECL__ unsigned int __isLocal(const void *ptr) __DEF_IF_HOST
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
+__SM_20_INTRINSICS_DECL__ unsigned int __isGridConstant(const void *ptr) __DEF_IF_HOST
+#endif  /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_global(const void *ptr) __DEF_IF_HOST
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_shared(const void *ptr) __DEF_IF_HOST
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_constant(const void *ptr) __DEF_IF_HOST
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_local(const void *ptr) __DEF_IF_HOST
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_grid_constant(const void *ptr) __DEF_IF_HOST
+#endif  /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
+
+__SM_20_INTRINSICS_DECL__ void * __cvta_global_to_generic(size_t rawbits) __DEF_IF_HOST
+__SM_20_INTRINSICS_DECL__ void * __cvta_shared_to_generic(size_t rawbits) __DEF_IF_HOST
+__SM_20_INTRINSICS_DECL__ void * __cvta_constant_to_generic(size_t rawbits) __DEF_IF_HOST
+__SM_20_INTRINSICS_DECL__ void * __cvta_local_to_generic(size_t rawbits) __DEF_IF_HOST
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
+__SM_20_INTRINSICS_DECL__ void * __cvta_grid_constant_to_generic(size_t rawbits) __DEF_IF_HOST
+#endif  /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
+
+// notice: update documentation for __nv_bswap*() when more host compilers are supported
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Reverse the order of bytes of the 16-bit unsigned integer.
+ *
+ * Host-side version: uses `__builtin_bswap16` when the compiler defines `__GNUC__`
+ * (GCC, Clang) and `_byteswap_ushort` on `_WIN32` (MSVC); other hosts fail to compile.
+ *
+ * \return Returns \p x with the order of bytes reversed.
+ */
+__COMMON_INTRINSICS_DECL__ unsigned short __nv_bswap16(unsigned short x) {
+#ifdef __GNUC__
+    return __builtin_bswap16(x);
+#elif defined(_WIN32)
+    return _byteswap_ushort(x);
+#else
+#error "unsupported platform"
+#endif /* __GNUC__ */
+}
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Reverse the order of bytes of the 32-bit unsigned integer.
+ *
+ * Host-side version: uses `__builtin_bswap32` when the compiler defines `__GNUC__`
+ * (GCC, Clang) and `_byteswap_ulong` on `_WIN32` (MSVC); other hosts fail to compile.
+ *
+ * \return Returns \p x with the order of bytes reversed.
+ */
+__COMMON_INTRINSICS_DECL__ unsigned int __nv_bswap32(unsigned int x) {
+#ifdef __GNUC__
+    return __builtin_bswap32(x);
+#elif defined(_WIN32)
+    /* _byteswap_ulong is called on unsigned long: cast in, then cast the result back. */
+    return static_cast<unsigned int>(_byteswap_ulong(static_cast<unsigned long>(x)));
+#else
+#error "unsupported platform"
+#endif /* __GNUC__ */
+}
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Reverse the order of bytes of the 64-bit unsigned integer.
+ *
+ * Host-side version: uses `__builtin_bswap64` when the compiler defines `__GNUC__`
+ * (GCC, Clang) and `_byteswap_uint64` on `_WIN32` (MSVC); other hosts fail to compile.
+ *
+ * \return Returns \p x with the order of bytes reversed.
+ */
+__COMMON_INTRINSICS_DECL__ unsigned long long __nv_bswap64(unsigned long long x) {
+#ifdef __GNUC__
+    return __builtin_bswap64(x);
+#elif defined(_WIN32)
+    /* _byteswap_uint64 is called on unsigned __int64: cast in, then cast the result back. */
+    return static_cast<unsigned long long>(_byteswap_uint64(static_cast<unsigned __int64>(x)));
+#else
+#error "unsupported platform"
+#endif /* __GNUC__ */
+}
+#else
+__COMMON_INTRINSICS_DECL__ unsigned short __nv_bswap16(unsigned short in);
+__COMMON_INTRINSICS_DECL__ unsigned int __nv_bswap32(unsigned int in);
+__COMMON_INTRINSICS_DECL__ unsigned long long __nv_bswap64(unsigned long long in);
+#endif /* !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA) */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_20_INTRINSICS_DECL__
+#undef __COMMON_INTRINSICS_DECL__
+
+#if (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA)
+#include "sm_20_intrinsics.hpp"
+#endif /* (!__CUDACC_RTC__ && __CUDA_ARCH__) || _NVHPC_CUDA */
+#endif /* !__SM_20_INTRINSICS_H__ */
+
+#undef EXCLUDE_FROM_RTC
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_20_intrinsics.hpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_20_intrinsics.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..53cac60018360d7ab814092118ba9db4c03fcba9
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_20_intrinsics.hpp
@@ -0,0 +1,241 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_20_INTRINSICS_HPP__)
+#define __SM_20_INTRINSICS_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_20_INTRINSICS_DECL__ __device__
+#define __COMMON_INTRINSICS_DECL__ __device__
+#else /* __CUDACC_RTC__ */
+#define __SM_20_INTRINSICS_DECL__ static __inline__ __device__
+#define __COMMON_INTRINSICS_DECL__ static __inline__ __host__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+// bool overload (marked deprecated where it is declared): widens pred and defers to __ballot().
+__SM_20_INTRINSICS_DECL__ unsigned int ballot(bool pred) {
+  return __ballot(static_cast<int>(pred));
+}
+
+// bool-typed convenience overload: passes pred to __syncthreads_count() as an int.
+__SM_20_INTRINSICS_DECL__ int syncthreads_count(bool pred) {
+  return __syncthreads_count(static_cast<int>(pred));
+}
+
+// bool in, bool out: __syncthreads_and() does the work; a nonzero result maps to true.
+__SM_20_INTRINSICS_DECL__ bool syncthreads_and(bool pred) {
+  return __syncthreads_and(static_cast<int>(pred)) != 0;
+}
+
+// bool in, bool out: __syncthreads_or() does the work; a nonzero result maps to true.
+__SM_20_INTRINSICS_DECL__ bool syncthreads_or(bool pred) {
+  return __syncthreads_or(static_cast<int>(pred)) != 0;
+}
+
+
+extern "C" {  /* forward declarations of the hooks wrapped by the __is*() functions below */
+  __device__ unsigned __nv_isGlobal_impl(const void *);
+  __device__ unsigned __nv_isShared_impl(const void *);
+  __device__ unsigned __nv_isConstant_impl(const void *);
+  __device__ unsigned __nv_isLocal_impl(const void *);
+  __device__ unsigned __nv_isGridConstant_impl(const void *);  /* only called when the arch guard below holds */
+}
+
+// Thin wrapper: hands ptr to the extern "C" __nv_isGlobal_impl hook; result returned as-is.
+__SM_20_INTRINSICS_DECL__ unsigned int __isGlobal(const void *ptr) {
+  return __nv_isGlobal_impl(ptr);
+}
+
+// Thin wrapper: hands ptr to the extern "C" __nv_isShared_impl hook; result returned as-is.
+__SM_20_INTRINSICS_DECL__ unsigned int __isShared(const void *ptr) {
+  return __nv_isShared_impl(ptr);
+}
+
+// Thin wrapper: hands ptr to the extern "C" __nv_isConstant_impl hook; result returned as-is.
+__SM_20_INTRINSICS_DECL__ unsigned int __isConstant(const void *ptr) {
+  return __nv_isConstant_impl(ptr);
+}
+
+// Thin wrapper: hands ptr to the extern "C" __nv_isLocal_impl hook; result returned as-is.
+__SM_20_INTRINSICS_DECL__ unsigned int __isLocal(const void *ptr) {
+  return __nv_isLocal_impl(ptr);
+}
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
+// Thin wrapper: hands ptr to the extern "C" __nv_isGridConstant_impl hook; result returned as-is.
+__SM_20_INTRINSICS_DECL__ unsigned int __isGridConstant(const void *ptr) {
+  return __nv_isGridConstant_impl(ptr);
+}
+#endif  /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
+
+extern "C" {  /* forward declarations of the hooks wrapped by the __cvta_*() functions below */
+  __device__ size_t __nv_cvta_generic_to_global_impl(const void *);
+  __device__ size_t __nv_cvta_generic_to_shared_impl(const void *);
+  __device__ size_t __nv_cvta_generic_to_constant_impl(const void *);
+  __device__ size_t __nv_cvta_generic_to_local_impl(const void *);
+  __device__ void * __nv_cvta_global_to_generic_impl(size_t);
+  __device__ void * __nv_cvta_shared_to_generic_impl(size_t);
+  __device__ void * __nv_cvta_constant_to_generic_impl(size_t);
+  __device__ void * __nv_cvta_local_to_generic_impl(size_t);
+}
+
+// Thin wrapper over __nv_cvta_generic_to_global_impl(); its size_t result is returned as-is.
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_global(const void *p) {
+  return __nv_cvta_generic_to_global_impl(p);
+}
+
+// Thin wrapper over __nv_cvta_generic_to_shared_impl(); its size_t result is returned as-is.
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_shared(const void *p) {
+  return __nv_cvta_generic_to_shared_impl(p);
+}
+
+// Thin wrapper over __nv_cvta_generic_to_constant_impl(); its size_t result is returned as-is.
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_constant(const void *p) {
+  return __nv_cvta_generic_to_constant_impl(p);
+}
+
+// Thin wrapper over __nv_cvta_generic_to_local_impl(); its size_t result is returned as-is.
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_local(const void *p) {
+  return __nv_cvta_generic_to_local_impl(p);
+}
+
+// Thin wrapper over __nv_cvta_global_to_generic_impl(); the pointer it yields is returned as-is.
+__SM_20_INTRINSICS_DECL__ void * __cvta_global_to_generic(size_t rawbits) {
+  return __nv_cvta_global_to_generic_impl(rawbits);
+}
+
+// Thin wrapper over __nv_cvta_shared_to_generic_impl(); the pointer it yields is returned as-is.
+__SM_20_INTRINSICS_DECL__ void * __cvta_shared_to_generic(size_t rawbits) {
+  return __nv_cvta_shared_to_generic_impl(rawbits);
+}
+
+// Thin wrapper over __nv_cvta_constant_to_generic_impl(); the pointer it yields is returned as-is.
+__SM_20_INTRINSICS_DECL__ void * __cvta_constant_to_generic(size_t rawbits) {
+  return __nv_cvta_constant_to_generic_impl(rawbits);
+}
+
+// Thin wrapper over __nv_cvta_local_to_generic_impl(); the pointer it yields is returned as-is.
+__SM_20_INTRINSICS_DECL__ void * __cvta_local_to_generic(size_t rawbits) {
+  return __nv_cvta_local_to_generic_impl(rawbits);
+}
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __CVTA_PTR_64 1
+#endif
+
+// Runs the PTX cvta.to.param instruction on ptr and returns the result as a size_t.
+// __CVTA_PTR_64 (defined just above on _WIN64/__LP64__/NVRTC builds) picks the u64 or u32 form.
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_grid_constant(const void *ptr) {
+#if __CVTA_PTR_64
+  unsigned long long param_addr;
+  asm("cvta.to.param.u64 %0, %1;"  : "=l"(param_addr) : "l"(ptr));
+#else  /* !__CVTA_PTR_64 */
+  unsigned param_addr;
+  asm("cvta.to.param.u32 %0, %1;"  : "=r"(param_addr) : "r"(ptr));
+#endif  /* __CVTA_PTR_64 */
+  return static_cast<size_t>(param_addr);
+}
+
+// Feeds rawbits to the PTX cvta.param instruction (u64 or u32 form) and returns the pointer.
+__SM_20_INTRINSICS_DECL__ void * __cvta_grid_constant_to_generic(size_t rawbits) {
+  void *generic_ptr;
+#if __CVTA_PTR_64
+  unsigned long long raw = rawbits;
+  asm("cvta.param.u64 %0, %1;" : "=l"(generic_ptr) : "l"(raw));
+#else  /* !__CVTA_PTR_64 */
+  unsigned raw = rawbits;
+  asm("cvta.param.u32 %0, %1;" : "=r"(generic_ptr) : "r"(raw));
+#endif  /* __CVTA_PTR_64 */
+  return generic_ptr;
+}
+#undef __CVTA_PTR_64
+#endif  /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
+
+extern "C" {  /* forward declarations of the hooks wrapped by the __nv_bswap*() functions below */
+  __device__ unsigned short __nv_bswap16_impl(unsigned short);
+  __device__ unsigned int __nv_bswap32_impl(unsigned int);
+  __device__ unsigned long long __nv_bswap64_impl(unsigned long long);
+}
+
+__COMMON_INTRINSICS_DECL__ unsigned short __nv_bswap16(unsigned short in) {
+  return __nv_bswap16_impl(in);  // all work is done by the extern "C" hook declared above
+}
+
+__COMMON_INTRINSICS_DECL__ unsigned int __nv_bswap32(unsigned int in) {
+  return __nv_bswap32_impl(in);  // all work is done by the extern "C" hook declared above
+}
+
+__COMMON_INTRINSICS_DECL__ unsigned long long __nv_bswap64(unsigned long long in) {
+  return __nv_bswap64_impl(in);  // all work is done by the extern "C" hook declared above
+}
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_20_INTRINSICS_DECL__
+#undef __COMMON_INTRINSICS_DECL__
+
+#endif /* !__SM_20_INTRINSICS_HPP__ */
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_30_intrinsics.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_30_intrinsics.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad3bfc500fefaaefdc801e2fe42c27def1ddaa58
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_30_intrinsics.h
@@ -0,0 +1,236 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_30_INTRINSICS_H__)
+#define __SM_30_INTRINSICS_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_30_INTRINSICS_DECL__ __device__
+#elif defined(_NVHPC_CUDA)
+#define __SM_30_INTRINSICS_DECL__ extern __device__ __cudart_builtin__
+#else /* !__CUDACC_RTC__ */
+#define __SM_30_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/* Add !defined(_NVHPC_CUDA) to avoid empty function definition in CUDA
+ * C++ compiler where the macro __CUDA_ARCH__ is not defined. */
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) */
+#define __DEF_IF_HOST ;
+#endif /* !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA) */
+
+
+/*******************************************************************************
+*                                                                              *
+*  Below are declarations of SM-3.0 intrinsics which are included as           *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined warpSize && !defined __local_warpSize
+#define warpSize    32
+#define __local_warpSize
+#endif
+
+#if defined(_WIN32)
+# define __DEPRECATED__(msg) __declspec(deprecated(msg))
+#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
+# define __DEPRECATED__(msg) __attribute__((deprecated))
+#else
+# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
+#endif
+
+#if defined(_NVHPC_CUDA)
+#define __WSB_DEPRECATION_MESSAGE(x) #x"() is not valid on cc70 and above, and should be replaced with "#x"_sync()."
+#elif !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
+#define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
+#endif
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Find the position of the n-th bit set to 1 in a 32-bit integer.
+ *
+ * Given a 32-bit value \p mask and an integer value \p base (between 0 and 31),
+ * find the n-th (given by \p offset) set bit in \p mask from the \p base bit.
+ * If not found, return 0xFFFFFFFF.
+ *
+ * See also https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-fns
+ * for more information.
+ *
+ * \return Returns a value between 0 and 32 inclusive representing the position
+ * of the n-th set bit.
+ * - parameter \p base must be <=31, otherwise behavior is undefined.
+ */
+__SM_30_INTRINSICS_DECL__ unsigned  __fns(unsigned mask, unsigned base, int offset) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ void  __barrier_sync(unsigned id) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ void  __barrier_sync_count(unsigned id, unsigned cnt) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ void  __syncwarp(unsigned mask=0xFFFFFFFF) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ int __all_sync(unsigned mask, int pred) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ int __any_sync(unsigned mask, int pred) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ int __uni_sync(unsigned mask, int pred) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned __ballot_sync(unsigned mask, int pred) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned __activemask() __DEF_IF_HOST
+
+// Warp register exchange (shuffle) intrinsics.
+// Notes:
+// a) Warp size is hardcoded to 32 here, because the compiler does not know
+//    the "warpSize" constant at this time
+// b) we cannot map the float __shfl to the int __shfl because it'll mess with
+//    the register number (especially if you're doing two shfls to move a double).
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 /* non-_sync 32-bit overloads, each tagged with __DEPRECATED__ */
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) int __shfl(int var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) unsigned int __shfl(unsigned int var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) int __shfl_up(int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) unsigned int __shfl_up(unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) int __shfl_down(int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) unsigned int __shfl_down(unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) int __shfl_xor(int var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) unsigned int __shfl_xor(unsigned int var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) float __shfl(float var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) float __shfl_up(float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) float __shfl_down(float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) float __shfl_xor(float var, int laneMask, int width=warpSize) __DEF_IF_HOST
+#endif /* defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
+
+__SM_30_INTRINSICS_DECL__ int __shfl_sync(unsigned mask, int var, int srcLane, int width=warpSize) __DEF_IF_HOST // _sync overloads: same parameters plus a leading 'mask'; declared unconditionally
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_sync(unsigned mask, unsigned int var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ int __shfl_up_sync(unsigned mask, int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_up_sync(unsigned mask, unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ int __shfl_down_sync(unsigned mask, int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_down_sync(unsigned mask, unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ int __shfl_xor_sync(unsigned mask, int var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor_sync(unsigned mask, unsigned int var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ float __shfl_sync(unsigned mask, float var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ float __shfl_up_sync(unsigned mask, float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ float __shfl_down_sync(unsigned mask, float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ float __shfl_xor_sync(unsigned mask, float var, int laneMask, int width=warpSize) __DEF_IF_HOST
+
+// 64-bit SHFL: long long, unsigned long long and double overloads
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 /* non-_sync 64-bit overloads, each tagged with __DEPRECATED__ */
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) unsigned long long __shfl(unsigned long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) long long __shfl(long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) long long __shfl_up(long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) unsigned long long __shfl_up(unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) long long __shfl_down(long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) unsigned long long __shfl_down(unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) long long __shfl_xor(long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) unsigned long long __shfl_xor(unsigned long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) double __shfl(double var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) double __shfl_up(double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) double __shfl_down(double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) double __shfl_xor(double var, int laneMask, int width=warpSize) __DEF_IF_HOST
+#endif /* defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
+
+__SM_30_INTRINSICS_DECL__ long long __shfl_sync(unsigned mask, long long var, int srcLane, int width=warpSize) __DEF_IF_HOST // _sync overloads: same parameters plus a leading 'mask'; declared unconditionally
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_sync(unsigned mask, unsigned long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ long long __shfl_up_sync(unsigned mask, long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up_sync(unsigned mask, unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ long long __shfl_down_sync(unsigned mask, long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down_sync(unsigned mask, unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ long long __shfl_xor_sync(unsigned mask, long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor_sync(unsigned mask, unsigned long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ double __shfl_sync(unsigned mask, double var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ double __shfl_up_sync(unsigned mask, double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ double __shfl_down_sync(unsigned mask, double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ double __shfl_xor_sync(unsigned mask, double var, int laneMask, int width=warpSize) __DEF_IF_HOST
+
+// long needs some help to choose between 32-bits and 64-bits (the .hpp bodies compare sizeof(long) with sizeof(long long))
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 /* non-_sync long overloads, each tagged with __DEPRECATED__ */
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) long __shfl(long var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) unsigned long __shfl(unsigned long var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) long __shfl_up(long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) unsigned long __shfl_up(unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) long __shfl_down(long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) unsigned long __shfl_down(unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) long __shfl_xor(long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) unsigned long __shfl_xor(unsigned long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+#endif /* defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
+
+__SM_30_INTRINSICS_DECL__ long __shfl_sync(unsigned mask, long var, int srcLane, int width=warpSize) __DEF_IF_HOST // _sync overloads: same parameters plus a leading 'mask'; declared unconditionally
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_sync(unsigned mask, unsigned long var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ long __shfl_up_sync(unsigned mask, long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_up_sync(unsigned mask, unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ long __shfl_down_sync(unsigned mask, long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_down_sync(unsigned mask, unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ long __shfl_xor_sync(unsigned mask, long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor_sync(unsigned mask, unsigned long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+
+#undef __DEPRECATED__
+#undef __WSB_DEPRECATION_MESSAGE
+
+#if defined(__local_warpSize)
+#undef warpSize
+#undef __local_warpSize
+#endif
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 300 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_30_INTRINSICS_DECL__
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "sm_30_intrinsics.hpp"
+#endif /* !__CUDACC_RTC__ && __CUDA_ARCH__ */
+
+#endif /* !__SM_30_INTRINSICS_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_30_intrinsics.hpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_30_intrinsics.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a5bcac5ee68c0cf547e4de7c08badf37106639dc
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_30_intrinsics.hpp
@@ -0,0 +1,604 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_30_INTRINSICS_HPP__)
+#define __SM_30_INTRINSICS_HPP__
+
+#if defined(__CUDACC_RTC__) /* selects the specifier prefix placed in front of every definition in this file */
+#define __SM_30_INTRINSICS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_30_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__: without it the definitions additionally become static (internal linkage) and inline */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+// In here are intrinsics which are built in to the compiler. These may be
+// referenced by intrinsic implementations from this file.
+extern "C"
+{
+}
+
+/*******************************************************************************
+*                                                                              *
+*  Below are implementations of SM-3.0 intrinsics which are included as        *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined warpSize && !defined __local_warpSize /* only supply a fallback when no warpSize macro exists yet */
+#define warpSize    32
+#define __local_warpSize /* marker recording that warpSize was defined here rather than by the including code */
+#endif
+
+__SM_30_INTRINSICS_DECL__ // Thin wrapper: passes mask, base and offset unchanged to the __nvvm_fns builtin and returns its result.
+unsigned __fns(unsigned mask, unsigned base, int offset) {
+  extern __device__ __device_builtin__ unsigned int __nvvm_fns(unsigned int mask, unsigned int base, int offset); // block-scope declaration of the builtin
+  return __nvvm_fns(mask, base, offset);
+}
+
+__SM_30_INTRINSICS_DECL__ // Thin wrapper: passes id unchanged to the __nvvm_barrier_sync builtin.
+void  __barrier_sync(unsigned id) {
+  extern __device__ __device_builtin__ void __nvvm_barrier_sync(unsigned id); // block-scope declaration of the builtin
+  return __nvvm_barrier_sync(id); // 'return' of a void expression; no value is produced
+}
+
+__SM_30_INTRINSICS_DECL__ // Thin wrapper: passes id and cnt unchanged to the __nvvm_barrier_sync_cnt builtin (note "cnt", not "count").
+void  __barrier_sync_count(unsigned id, unsigned cnt) {
+  extern __device__ __device_builtin__ void __nvvm_barrier_sync_cnt(unsigned id, unsigned cnt); // block-scope declaration of the builtin
+  return __nvvm_barrier_sync_cnt(id, cnt); // 'return' of a void expression; no value is produced
+}
+
+__SM_30_INTRINSICS_DECL__ // Thin wrapper: passes mask unchanged to the __nvvm_bar_warp_sync builtin; no default argument is given on this definition.
+void  __syncwarp(unsigned mask) {
+  extern __device__ __device_builtin__ void __nvvm_bar_warp_sync(unsigned mask); // block-scope declaration of the builtin
+  return __nvvm_bar_warp_sync(mask); // 'return' of a void expression; no value is produced
+}
+
+__SM_30_INTRINSICS_DECL__ // Thin wrapper: returns __nvvm_vote_all_sync(mask, pred).
+int __all_sync(unsigned mask, int pred) {
+  extern __device__ __device_builtin__ int __nvvm_vote_all_sync(unsigned int mask, int pred); // block-scope declaration of the builtin
+  return __nvvm_vote_all_sync(mask, pred);
+}
+
+__SM_30_INTRINSICS_DECL__ // Thin wrapper: returns __nvvm_vote_any_sync(mask, pred).
+int __any_sync(unsigned mask, int pred) {
+  extern __device__ __device_builtin__ int __nvvm_vote_any_sync(unsigned int mask, int pred); // block-scope declaration of the builtin
+  return __nvvm_vote_any_sync(mask, pred);
+}
+
+__SM_30_INTRINSICS_DECL__ // Thin wrapper: returns __nvvm_vote_uni_sync(mask, pred).
+int __uni_sync(unsigned mask, int pred) {
+  extern __device__ __device_builtin__ int __nvvm_vote_uni_sync(unsigned int mask, int pred); // block-scope declaration of the builtin
+  return __nvvm_vote_uni_sync(mask, pred);
+}
+
+__SM_30_INTRINSICS_DECL__ // Thin wrapper: returns the unsigned result of __nvvm_vote_ballot_sync(mask, pred).
+unsigned __ballot_sync(unsigned mask, int pred) {
+  extern __device__ __device_builtin__ unsigned int __nvvm_vote_ballot_sync(unsigned int mask, int pred); // block-scope declaration of the builtin
+  return __nvvm_vote_ballot_sync(mask, pred);
+}
+
+__SM_30_INTRINSICS_DECL__ // Returns the value produced by the PTX activemask.b32 instruction.
+unsigned __activemask() {
+    unsigned ret;
+    asm volatile ("activemask.b32 %0;" : "=r"(ret)); // single output operand, no inputs
+    return ret;
+}
+
+// These legacy (non-_sync) shuffles are removed starting with compute_70.
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
+
+__SM_30_INTRINSICS_DECL__ int __shfl(int var, int srcLane, int width) { // non-_sync shuffle: issues PTX shfl.idx.b32 on var with srcLane
+	int ret;
+	int c = ((warpSize-width) << 8) | 0x1f; // last shfl operand: (warpSize-width) in bits 8 and up, low five bits all set
+	asm volatile ("shfl.idx.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(srcLane), "r"(c));
+	return ret;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl(unsigned int var, int srcLane, int width) {
+	return static_cast<unsigned int>(__shfl(static_cast<int>(var), srcLane, width)); // reuse the int overload, cast the result back
+}
+
+__SM_30_INTRINSICS_DECL__ int __shfl_up(int var, unsigned int delta, int width) { // non-_sync shuffle: issues PTX shfl.up.b32 on var with delta
+	int ret;
+	int c = (warpSize-width) << 8; // last shfl operand: only (warpSize-width) in bits 8 and up; the low bits stay 0 for the "up" form
+	asm volatile ("shfl.up.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(delta), "r"(c));
+	return ret;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_up(unsigned int var, unsigned int delta, int width) {
+	return static_cast<unsigned int>(__shfl_up(static_cast<int>(var), delta, width)); // reuse the int overload, cast the result back
+}
+
+__SM_30_INTRINSICS_DECL__ int __shfl_down(int var, unsigned int delta, int width) { // non-_sync shuffle: issues PTX shfl.down.b32 on var with delta
+	int ret;
+	int c = ((warpSize-width) << 8) | 0x1f; // last shfl operand: (warpSize-width) in bits 8 and up, low five bits all set
+	asm volatile ("shfl.down.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(delta), "r"(c));
+	return ret;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_down(unsigned int var, unsigned int delta, int width) {
+	return static_cast<unsigned int>(__shfl_down(static_cast<int>(var), delta, width)); // reuse the int overload, cast the result back
+}
+
+__SM_30_INTRINSICS_DECL__ int __shfl_xor(int var, int laneMask, int width) { // non-_sync shuffle: issues PTX shfl.bfly.b32 on var with laneMask
+	int ret;
+	int c = ((warpSize-width) << 8) | 0x1f; // last shfl operand: (warpSize-width) in bits 8 and up, low five bits all set
+	asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(laneMask), "r"(c));
+	return ret;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor(unsigned int var, int laneMask, int width) {
+	return static_cast<unsigned int>(__shfl_xor(static_cast<int>(var), laneMask, width)); // reuse the int overload, cast the result back
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl(float var, int srcLane, int width) { // non-_sync float shuffle: its own shfl.idx.b32 with "f" operands, not routed through the int overload
+	float ret;
+        int c;
+	c = ((warpSize-width) << 8) | 0x1f; // last shfl operand: (warpSize-width) in bits 8 and up, low five bits all set
+	asm volatile ("shfl.idx.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(srcLane), "r"(c));
+	return ret;
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl_up(float var, unsigned int delta, int width) { // non-_sync float shuffle: its own shfl.up.b32 with "f" operands
+	float ret;
+        int c;
+	c = (warpSize-width) << 8; // last shfl operand: only (warpSize-width) in bits 8 and up; the low bits stay 0 for the "up" form
+	asm volatile ("shfl.up.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(delta), "r"(c));
+	return ret;
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl_down(float var, unsigned int delta, int width) { // non-_sync float shuffle: its own shfl.down.b32 with "f" operands
+	float ret;
+        int c;
+	c = ((warpSize-width) << 8) | 0x1f; // last shfl operand: (warpSize-width) in bits 8 and up, low five bits all set
+	asm volatile ("shfl.down.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(delta), "r"(c));
+	return ret;
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl_xor(float var, int laneMask, int width) { // non-_sync float shuffle: its own shfl.bfly.b32 with "f" operands
+	float ret;
+        int c;
+	c = ((warpSize-width) << 8) | 0x1f; // last shfl operand: (warpSize-width) in bits 8 and up, low five bits all set
+	asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(laneMask), "r"(c));
+	return ret;
+}
+
+// 64-bits SHFL
+
+__SM_30_INTRINSICS_DECL__ long long __shfl(long long var, int srcLane, int width) { // 64-bit non-_sync shuffle built from two calls to the int __shfl
+	int lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var)); // unpack var into the 32-bit pair {lo, hi}
+	hi = __shfl(hi, srcLane, width);
+	lo = __shfl(lo, srcLane, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi)); // repack the shuffled pair into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl(unsigned long long var, int srcLane, int width) {
+	return static_cast<unsigned long long>(__shfl(static_cast<long long>(var), srcLane, width)); // reuse the long long overload, cast the result back
+}
+
+__SM_30_INTRINSICS_DECL__ long long __shfl_up(long long var, unsigned int delta, int width) { // 64-bit non-_sync shuffle built from two calls to the int __shfl_up
+	int lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var)); // unpack var into the 32-bit pair {lo, hi}
+	hi = __shfl_up(hi, delta, width);
+	lo = __shfl_up(lo, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi)); // repack the shuffled pair into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up(unsigned long long var, unsigned int delta, int width) {
+	return static_cast<unsigned long long>(__shfl_up(static_cast<long long>(var), delta, width)); // reuse the long long overload, cast the result back
+}
+
+__SM_30_INTRINSICS_DECL__ long long __shfl_down(long long var, unsigned int delta, int width) { // 64-bit non-_sync shuffle built from two calls to the int __shfl_down
+	int lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var)); // unpack var into the 32-bit pair {lo, hi}
+	hi = __shfl_down(hi, delta, width);
+	lo = __shfl_down(lo, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi)); // repack the shuffled pair into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down(unsigned long long var, unsigned int delta, int width) {
+	return static_cast<unsigned long long>(__shfl_down(static_cast<long long>(var), delta, width)); // reuse the long long overload, cast the result back
+}
+
+__SM_30_INTRINSICS_DECL__ long long __shfl_xor(long long var, int laneMask, int width) { // 64-bit non-_sync shuffle built from two calls to the int __shfl_xor
+	int lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var)); // unpack var into the 32-bit pair {lo, hi}
+	hi = __shfl_xor(hi, laneMask, width);
+	lo = __shfl_xor(lo, laneMask, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi)); // repack the shuffled pair into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor(unsigned long long var, int laneMask, int width) {
+	return static_cast<unsigned long long>(__shfl_xor(static_cast<long long>(var), laneMask, width)); // reuse the long long overload, cast the result back
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl(double var, int srcLane, int width) { // double non-_sync shuffle built from two 32-bit __shfl calls
+	unsigned lo, hi; // unsigned, so the calls below resolve to the unsigned int overload
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var)); // unpack var into the 32-bit pair {lo, hi}
+	hi = __shfl(hi, srcLane, width);
+	lo = __shfl(lo, srcLane, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi)); // repack the shuffled pair into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl_up(double var, unsigned int delta, int width) { // double non-_sync shuffle built from two 32-bit __shfl_up calls
+	unsigned lo, hi; // unsigned, so the calls below resolve to the unsigned int overload
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var)); // unpack var into the 32-bit pair {lo, hi}
+	hi = __shfl_up(hi, delta, width);
+	lo = __shfl_up(lo, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi)); // repack the shuffled pair into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl_down(double var, unsigned int delta, int width) { // double non-_sync shuffle built from two 32-bit __shfl_down calls
+	unsigned lo, hi; // unsigned, so the calls below resolve to the unsigned int overload
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var)); // unpack var into the 32-bit pair {lo, hi}
+	hi = __shfl_down(hi, delta, width);
+	lo = __shfl_down(lo, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi)); // repack the shuffled pair into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl_xor(double var, int laneMask, int width) { // double non-_sync shuffle built from two 32-bit __shfl_xor calls
+	unsigned lo, hi; // unsigned, so the calls below resolve to the unsigned int overload
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var)); // unpack var into the 32-bit pair {lo, hi}
+	hi = __shfl_xor(hi, laneMask, width);
+	lo = __shfl_xor(lo, laneMask, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi)); // repack the shuffled pair into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ long __shfl(long var, int srcLane, int width) {
+	if (sizeof(long) != sizeof(long long)) // sizes differ: handle long through the int overload
+		return __shfl(static_cast<int>(var), srcLane, width);
+	return static_cast<long>(__shfl(static_cast<long long>(var), srcLane, width)); // same size: use the long long overload
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl(unsigned long var, int srcLane, int width) {
+	if (sizeof(long) != sizeof(long long)) // sizes differ: handle unsigned long through the unsigned int overload
+		return __shfl(static_cast<unsigned int>(var), srcLane, width);
+	return static_cast<unsigned long>(__shfl(static_cast<unsigned long long>(var), srcLane, width)); // same size: use the unsigned long long overload
+}
+
+__SM_30_INTRINSICS_DECL__ long __shfl_up(long var, unsigned int delta, int width) {
+	if (sizeof(long) != sizeof(long long)) // sizes differ: handle long through the int overload
+		return __shfl_up(static_cast<int>(var), delta, width);
+	return static_cast<long>(__shfl_up(static_cast<long long>(var), delta, width)); // same size: use the long long overload
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_up(unsigned long var, unsigned int delta, int width) {
+	if (sizeof(long) != sizeof(long long)) // sizes differ: handle unsigned long through the unsigned int overload
+		return __shfl_up(static_cast<unsigned int>(var), delta, width);
+	return static_cast<unsigned long>(__shfl_up(static_cast<unsigned long long>(var), delta, width)); // same size: use the unsigned long long overload
+}
+
+__SM_30_INTRINSICS_DECL__ long __shfl_down(long var, unsigned int delta, int width) {
+	if (sizeof(long) != sizeof(long long)) // sizes differ: handle long through the int overload
+		return __shfl_down(static_cast<int>(var), delta, width);
+	return static_cast<long>(__shfl_down(static_cast<long long>(var), delta, width)); // same size: use the long long overload
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_down(unsigned long var, unsigned int delta, int width) {
+	if (sizeof(long) != sizeof(long long)) // sizes differ: handle unsigned long through the unsigned int overload
+		return __shfl_down(static_cast<unsigned int>(var), delta, width);
+	return static_cast<unsigned long>(__shfl_down(static_cast<unsigned long long>(var), delta, width)); // same size: use the unsigned long long overload
+}
+
+__SM_30_INTRINSICS_DECL__ long __shfl_xor(long var, int laneMask, int width) {
+	if (sizeof(long) != sizeof(long long)) // sizes differ: handle long through the int overload
+		return __shfl_xor(static_cast<int>(var), laneMask, width);
+	return static_cast<long>(__shfl_xor(static_cast<long long>(var), laneMask, width)); // same size: use the long long overload
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor(unsigned long var, int laneMask, int width) {
+	if (sizeof(long) != sizeof(long long)) // sizes differ: handle unsigned long through the unsigned int overload
+		return __shfl_xor(static_cast<unsigned int>(var), laneMask, width);
+	return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned long long>(var), laneMask, width)); // same size: use the unsigned long long overload
+}
+
+#endif /* defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
+
+// Warp register exchange (shuffle) intrinsics.
+// Notes:
+// a) Warp size is hardcoded to 32 here, because the compiler does not know
+//    the "warpSize" constant at this time
+// b) we cannot map the float __shfl to the int __shfl because it'll mess with
+//    the register number (especially if you're doing two shfls to move a double).
+__SM_30_INTRINSICS_DECL__ int __shfl_sync(unsigned mask, int var, int srcLane, int width) { // forwards to the __nvvm_shfl_idx_sync builtin
+        extern __device__ __device_builtin__ unsigned __nvvm_shfl_idx_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+	int ret;
+	int c = ((warpSize-width) << 8) | 0x1f; // (warpSize-width) in bits 8 and up, low five bits all set
+        ret = __nvvm_shfl_idx_sync(mask, var, srcLane, c); // int arguments convert implicitly to the builtin's unsigned parameters
+	return ret;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_sync(unsigned mask, unsigned int var, int srcLane, int width) {
+	return static_cast<unsigned int>(__shfl_sync(mask, static_cast<int>(var), srcLane, width)); // reuse the int overload, cast the result back
+}
+
+__SM_30_INTRINSICS_DECL__ int __shfl_up_sync(unsigned mask, int var, unsigned int delta, int width) { // forwards to the __nvvm_shfl_up_sync builtin
+        extern __device__ __device_builtin__ unsigned __nvvm_shfl_up_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+	int ret;
+	int c = (warpSize-width) << 8; // only (warpSize-width) in bits 8 and up; the low bits stay 0 for the "up" form
+        ret = __nvvm_shfl_up_sync(mask, var, delta, c); // int arguments convert implicitly to the builtin's unsigned parameters
+	return ret;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_up_sync(unsigned mask, unsigned int var, unsigned int delta, int width) {
+	return static_cast<unsigned int>(__shfl_up_sync(mask, static_cast<int>(var), delta, width)); // reuse the int overload, cast the result back
+}
+
+__SM_30_INTRINSICS_DECL__ int __shfl_down_sync(unsigned mask, int var, unsigned int delta, int width) { // forwards to the __nvvm_shfl_down_sync builtin
+        extern __device__ __device_builtin__ unsigned __nvvm_shfl_down_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+	int ret;
+	int c = ((warpSize-width) << 8) | 0x1f; // (warpSize-width) in bits 8 and up, low five bits all set
+        ret = __nvvm_shfl_down_sync(mask, var, delta, c); // int arguments convert implicitly to the builtin's unsigned parameters
+	return ret;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_down_sync(unsigned mask, unsigned int var, unsigned int delta, int width) {
+	return static_cast<unsigned int>(__shfl_down_sync(mask, static_cast<int>(var), delta, width)); // reuse the int overload, cast the result back
+}
+
+__SM_30_INTRINSICS_DECL__ int __shfl_xor_sync(unsigned mask, int var, int laneMask, int width) { // forwards to the __nvvm_shfl_bfly_sync builtin
+        extern __device__ __device_builtin__ unsigned __nvvm_shfl_bfly_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+	int ret;
+	int c = ((warpSize-width) << 8) | 0x1f; // (warpSize-width) in bits 8 and up, low five bits all set
+        ret = __nvvm_shfl_bfly_sync(mask, var, laneMask, c); // int arguments convert implicitly to the builtin's unsigned parameters
+	return ret;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor_sync(unsigned mask, unsigned int var, int laneMask, int width) {
+	return static_cast<unsigned int>(__shfl_xor_sync(mask, static_cast<int>(var), laneMask, width)); // reuse the int overload, cast the result back
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl_sync(unsigned mask, float var, int srcLane, int width) { // float overload: calls __nvvm_shfl_idx_sync directly rather than the int overload
+        extern __device__ __device_builtin__ unsigned __nvvm_shfl_idx_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+        int ret;
+        int c;
+	c = ((warpSize-width) << 8) | 0x1f; // (warpSize-width) in bits 8 and up, low five bits all set
+        ret = __nvvm_shfl_idx_sync(mask, __float_as_int(var), srcLane, c); // var is passed through __float_as_int
+	return __int_as_float(ret); // result goes back through __int_as_float
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl_up_sync(unsigned mask, float var, unsigned int delta, int width) { // float overload: calls __nvvm_shfl_up_sync directly rather than the int overload
+        extern __device__ __device_builtin__ unsigned __nvvm_shfl_up_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+	int ret;
+        int c;
+	c = (warpSize-width) << 8; // only (warpSize-width) in bits 8 and up; the low bits stay 0 for the "up" form
+        ret = __nvvm_shfl_up_sync(mask, __float_as_int(var), delta, c); // var is passed through __float_as_int
+	return __int_as_float(ret); // result goes back through __int_as_float
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl_down_sync(unsigned mask, float var, unsigned int delta, int width) { // float overload: calls __nvvm_shfl_down_sync directly rather than the int overload
+        extern __device__ __device_builtin__ unsigned __nvvm_shfl_down_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+	int ret;
+        int c;
+	c = ((warpSize-width) << 8) | 0x1f; // (warpSize-width) in bits 8 and up, low five bits all set
+        ret = __nvvm_shfl_down_sync(mask, __float_as_int(var), delta, c); // var is passed through __float_as_int
+	return __int_as_float(ret); // result goes back through __int_as_float
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl_xor_sync(unsigned mask, float var, int laneMask, int width) { // float overload: calls __nvvm_shfl_bfly_sync directly rather than the int overload
+        extern __device__ __device_builtin__ unsigned __nvvm_shfl_bfly_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+	int ret;
+        int c;
+	c = ((warpSize-width) << 8) | 0x1f; // (warpSize-width) in bits 8 and up, low five bits all set
+        ret = __nvvm_shfl_bfly_sync(mask, __float_as_int(var), laneMask, c); // var is passed through __float_as_int
+	return __int_as_float(ret); // result goes back through __int_as_float
+}
+
+// 64-bits SHFL
+__SM_30_INTRINSICS_DECL__ long long __shfl_sync(unsigned mask, long long var, int srcLane, int width) { // 64-bit shuffle built from two calls to the int __shfl_sync with the same mask
+	int lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var)); // unpack var into the 32-bit pair {lo, hi}
+	hi = __shfl_sync(mask, hi, srcLane, width);
+	lo = __shfl_sync(mask, lo, srcLane, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi)); // repack the shuffled pair into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_sync(unsigned mask, unsigned long long var, int srcLane, int width) {
+	return static_cast<unsigned long long>(__shfl_sync(mask, static_cast<long long>(var), srcLane, width)); // reuse the long long overload, cast the result back
+}
+
+__SM_30_INTRINSICS_DECL__ long long __shfl_up_sync(unsigned mask, long long var, unsigned int delta, int width) { // 64-bit shuffle built from two calls to the int __shfl_up_sync with the same mask
+	int lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var)); // unpack var into the 32-bit pair {lo, hi}
+	hi = __shfl_up_sync(mask, hi, delta, width);
+	lo = __shfl_up_sync(mask, lo, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi)); // repack the shuffled pair into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up_sync(unsigned mask, unsigned long long var, unsigned int delta, int width) {
+	return static_cast<unsigned long long>(__shfl_up_sync(mask, static_cast<long long>(var), delta, width)); // reuse the long long overload, cast the result back
+}
+
+__SM_30_INTRINSICS_DECL__ long long __shfl_down_sync(unsigned mask, long long var, unsigned int delta, int width) { // 64-bit shuffle built from two calls to the int __shfl_down_sync with the same mask
+	int lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var)); // unpack var into the 32-bit pair {lo, hi}
+	hi = __shfl_down_sync(mask, hi, delta, width);
+	lo = __shfl_down_sync(mask, lo, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi)); // repack the shuffled pair into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down_sync(unsigned mask, unsigned long long var, unsigned int delta, int width) {
+	return static_cast<unsigned long long>(__shfl_down_sync(mask, static_cast<long long>(var), delta, width)); // reuse the long long overload, cast the result back
+}
+
+__SM_30_INTRINSICS_DECL__ long long __shfl_xor_sync(unsigned mask, long long var, int laneMask, int width) { // 64-bit shuffle built from two calls to the int __shfl_xor_sync with the same mask
+	int lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var)); // unpack var into the 32-bit pair {lo, hi}
+	hi = __shfl_xor_sync(mask, hi, laneMask, width);
+	lo = __shfl_xor_sync(mask, lo, laneMask, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi)); // repack the shuffled pair into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor_sync(unsigned mask, unsigned long long var, int laneMask, int width) {
+	return static_cast<unsigned long long>(__shfl_xor_sync(mask, static_cast<long long>(var), laneMask, width)); // reuse the long long overload, cast the result back
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl_sync(unsigned mask, double var, int srcLane, int width) { // double shuffle built from two 32-bit shuffles of its bit pattern
+	unsigned lo, hi; // the two 32-bit halves of var's bits
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var)); // unpack the double's 64 bits into lo and hi
+	hi = __shfl_sync(mask, hi, srcLane, width); // each half is shuffled with the same mask/srcLane/width
+	lo = __shfl_sync(mask, lo, srcLane, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi)); // repack the shuffled halves into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl_up_sync(unsigned mask, double var, unsigned int delta, int width) { // double shuffle-up built from two 32-bit shuffles of its bit pattern
+	unsigned lo, hi; // the two 32-bit halves of var's bits
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var)); // unpack the double's 64 bits into lo and hi
+	hi = __shfl_up_sync(mask, hi, delta, width); // each half is shuffled with the same mask/delta/width
+	lo = __shfl_up_sync(mask, lo, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi)); // repack the shuffled halves into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl_down_sync(unsigned mask, double var, unsigned int delta, int width) { // double shuffle-down built from two 32-bit shuffles of its bit pattern
+	unsigned lo, hi; // the two 32-bit halves of var's bits
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var)); // unpack the double's 64 bits into lo and hi
+	hi = __shfl_down_sync(mask, hi, delta, width); // each half is shuffled with the same mask/delta/width
+	lo = __shfl_down_sync(mask, lo, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi)); // repack the shuffled halves into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl_xor_sync(unsigned mask, double var, int laneMask, int width) { // double xor-shuffle built from two 32-bit shuffles of its bit pattern
+	unsigned lo, hi; // the two 32-bit halves of var's bits
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var)); // unpack the double's 64 bits into lo and hi
+	hi = __shfl_xor_sync(mask, hi, laneMask, width); // each half is shuffled with the same mask/laneMask/width
+	lo = __shfl_xor_sync(mask, lo, laneMask, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi)); // repack the shuffled halves into var
+	return var;
+}
+
+// long needs some help to choose between 32-bits and 64-bits
+
+__SM_30_INTRINSICS_DECL__ long __shfl_sync(unsigned mask, long var, int srcLane, int width) {
+	return (sizeof(long) != sizeof(long long)) // forward to the overload whose width matches long
+		? __shfl_sync(mask, static_cast<int>(var), srcLane, width)
+		: __shfl_sync(mask, static_cast<long long>(var), srcLane, width);
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_sync(unsigned mask, unsigned long var, int srcLane, int width) {
+	return (sizeof(long) != sizeof(long long)) // forward to the overload whose width matches long
+		? __shfl_sync(mask, static_cast<unsigned int>(var), srcLane, width)
+		: __shfl_sync(mask, static_cast<unsigned long long>(var), srcLane, width);
+}
+
+__SM_30_INTRINSICS_DECL__ long __shfl_up_sync(unsigned mask, long var, unsigned int delta, int width) {
+	return (sizeof(long) != sizeof(long long)) // forward to the overload whose width matches long
+		? __shfl_up_sync(mask, static_cast<int>(var), delta, width)
+		: __shfl_up_sync(mask, static_cast<long long>(var), delta, width);
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_up_sync(unsigned mask, unsigned long var, unsigned int delta, int width) {
+	return (sizeof(long) != sizeof(long long)) // forward to the overload whose width matches long
+		? __shfl_up_sync(mask, static_cast<unsigned int>(var), delta, width)
+		: __shfl_up_sync(mask, static_cast<unsigned long long>(var), delta, width);
+}
+
+__SM_30_INTRINSICS_DECL__ long __shfl_down_sync(unsigned mask, long var, unsigned int delta, int width) {
+	return (sizeof(long) != sizeof(long long)) // forward to the overload whose width matches long
+		? __shfl_down_sync(mask, static_cast<int>(var), delta, width)
+		: __shfl_down_sync(mask, static_cast<long long>(var), delta, width);
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_down_sync(unsigned mask, unsigned long var, unsigned int delta, int width) {
+	return (sizeof(long) != sizeof(long long)) // forward to the overload whose width matches long
+		? __shfl_down_sync(mask, static_cast<unsigned int>(var), delta, width)
+		: __shfl_down_sync(mask, static_cast<unsigned long long>(var), delta, width);
+}
+
+__SM_30_INTRINSICS_DECL__ long __shfl_xor_sync(unsigned mask, long var, int laneMask, int width) {
+	return (sizeof(long) != sizeof(long long)) // forward to the overload whose width matches long
+		? __shfl_xor_sync(mask, static_cast<int>(var), laneMask, width)
+		: __shfl_xor_sync(mask, static_cast<long long>(var), laneMask, width);
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor_sync(unsigned mask, unsigned long var, int laneMask, int width) {
+	return (sizeof(long) != sizeof(long long)) // forward to the overload whose width matches long
+		? __shfl_xor_sync(mask, static_cast<unsigned int>(var), laneMask, width)
+		: __shfl_xor_sync(mask, static_cast<unsigned long long>(var), laneMask, width);
+}
+
+#if defined(__local_warpSize)
+#undef warpSize
+#undef __local_warpSize
+#endif
+
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 300 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_30_INTRINSICS_DECL__
+
+#endif /* !__SM_30_INTRINSICS_HPP__ */
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_atomic_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_atomic_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..d2070bc8bbfc0c5aa58c45ef1d28623d91f4e938
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_atomic_functions.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 35.235 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.35.235 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+#if !defined(__SM_32_ATOMIC_FUNCTIONS_H__)
+#define __SM_32_ATOMIC_FUNCTIONS_H__
+
+#if defined(__CUDACC_RTC__) /* choose the declaration specifiers used for every prototype in this header */
+#define __SM_32_ATOMIC_FUNCTIONS_DECL__ __device__
+#elif defined(_NVHPC_CUDA)
+#define __SM_32_ATOMIC_FUNCTIONS_DECL__ extern __device__ __cudart_builtin__
+#else /* !__CUDACC_RTC__ && !_NVHPC_CUDA */
+#define __SM_32_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA) /* each prototype below gets an empty body */
+#define __DEF_IF_HOST { }
+#else  /* __CUDA_ARCH__ || _NVHPC_CUDA: each prototype below is a plain declaration */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ || _NVHPC_CUDA */
+
+
+/*******************************************************************************
+*                                                                              *
+*  64-bit atomicMin/Max/And/Or/Xor overloads (signed and unsigned long long)   *
+*                                                                              *
+*******************************************************************************/
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMin(long long *address, long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMax(long long *address, long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicAnd(long long *address, long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicOr(long long *address, long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicXor(long long *address, long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMin(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMax(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicAnd(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicOr(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicXor(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_32_ATOMIC_FUNCTIONS_DECL__
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "sm_32_atomic_functions.hpp"
+#endif /* !__CUDACC_RTC__  && defined(__CUDA_ARCH__) */
+
+#endif /* !__SM_32_ATOMIC_FUNCTIONS_H__ */
+
+#undef EXCLUDE_FROM_RTC
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_atomic_functions.hpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_atomic_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7cfea072a044ceac7fb96f3bf3006520a108020e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_atomic_functions.hpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 35.235 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.35.235 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_32_ATOMIC_FUNCTIONS_HPP__)
+#define __SM_32_ATOMIC_FUNCTIONS_HPP__
+
+#ifdef __CUDA_ARCH__
+extern "C"
+{ /* builtin entry points; the atomic* overloads defined later in this header forward to these */
+extern __device__ __device_builtin__ long long __illAtomicMin(long long *address, long long val);
+extern __device__ __device_builtin__ long long __illAtomicMax(long long *address, long long val);
+extern __device__ __device_builtin__ long long __llAtomicAnd(long long *address, long long val);
+extern __device__ __device_builtin__ long long __llAtomicOr(long long *address, long long val);
+extern __device__ __device_builtin__ long long __llAtomicXor(long long *address, long long val);
+extern __device__ __device_builtin__ unsigned long long __ullAtomicMin(unsigned long long *address, unsigned long long val);
+extern __device__ __device_builtin__ unsigned long long __ullAtomicMax(unsigned long long *address, unsigned long long val);
+extern __device__ __device_builtin__ unsigned long long __ullAtomicAnd(unsigned long long *address, unsigned long long val);
+extern __device__ __device_builtin__ unsigned long long __ullAtomicOr (unsigned long long *address, unsigned long long val);
+extern __device__ __device_builtin__ unsigned long long __ullAtomicXor(unsigned long long *address, unsigned long long val);
+}
+#endif /* __CUDA_ARCH__ */
+
+
+#if defined(__CUDACC_RTC__)
+#define __SM_32_ATOMIC_FUNCTIONS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_32_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMin(long long *address, long long val) {
+    const long long result = __illAtomicMin(address, val);  // signed 64-bit builtin does the work
+    return result;
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMax(long long *address, long long val) {
+    const long long result = __illAtomicMax(address, val);  // signed 64-bit builtin does the work
+    return result;
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicAnd(long long *address, long long val) {
+    const long long result = __llAtomicAnd(address, val);  // signed 64-bit builtin does the work
+    return result;
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicOr(long long *address, long long val) {
+    const long long result = __llAtomicOr(address, val);  // signed 64-bit builtin does the work
+    return result;
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicXor(long long *address, long long val) {
+    const long long result = __llAtomicXor(address, val);  // signed 64-bit builtin does the work
+    return result;
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMin(unsigned long long *address, unsigned long long val) {
+    const unsigned long long result = __ullAtomicMin(address, val);  // unsigned 64-bit builtin does the work
+    return result;
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMax(unsigned long long *address, unsigned long long val) {
+    const unsigned long long result = __ullAtomicMax(address, val);  // unsigned 64-bit builtin does the work
+    return result;
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicAnd(unsigned long long *address, unsigned long long val) {
+    const unsigned long long result = __ullAtomicAnd(address, val);  // unsigned 64-bit builtin does the work
+    return result;
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicOr(unsigned long long *address, unsigned long long val) {
+    const unsigned long long result = __ullAtomicOr(address, val);  // unsigned 64-bit builtin does the work
+    return result;
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicXor(unsigned long long *address, unsigned long long val) {
+    const unsigned long long result = __ullAtomicXor(address, val);  // unsigned 64-bit builtin does the work
+    return result;
+}
+
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_32_ATOMIC_FUNCTIONS_DECL__
+
+#endif /* !__SM_32_ATOMIC_FUNCTIONS_HPP__ */
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_intrinsics.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_intrinsics.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f493c1c54c7715703d09c70eb78b70b60d208d9
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_intrinsics.h
@@ -0,0 +1,516 @@
+/*
+ * Copyright 1993-2020 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_32_INTRINSICS_H__)
+#define __SM_32_INTRINSICS_H__
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+#if defined(__CUDACC_RTC__)
+#define __SM_32_INTRINSICS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_32_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) /* each prototype below is a plain declaration */
+#define __DEF_IF_HOST ;
+#else  /* !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA): each prototype below gets an empty body */
+#define __DEF_IF_HOST { }
+#endif /* defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) */
+
+
+/*******************************************************************************
+*                                                                              *
+*  Below are declarations of SM-3.5 intrinsics which are included as           *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+/******************************************************************************
+ *                                   __ldg                                    *
+ ******************************************************************************/
+
+__SM_32_INTRINSICS_DECL__ long __ldg(const long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long __ldg(const unsigned long *ptr) __DEF_IF_HOST
+// char and signed integer overloads (scalar and vector types)
+__SM_32_INTRINSICS_DECL__ char __ldg(const char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ signed char __ldg(const signed char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short __ldg(const short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int __ldg(const int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ long long __ldg(const long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char2 __ldg(const char2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char4 __ldg(const char4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short2 __ldg(const short2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short4 __ldg(const short4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int2 __ldg(const int2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int4 __ldg(const int4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ longlong2 __ldg(const longlong2 *ptr) __DEF_IF_HOST
+// unsigned integer overloads (scalar and vector types)
+__SM_32_INTRINSICS_DECL__ unsigned char __ldg(const unsigned char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned short __ldg(const unsigned short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned int __ldg(const unsigned int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldg(const unsigned long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar2 __ldg(const uchar2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar4 __ldg(const uchar4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort2 __ldg(const ushort2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort4 __ldg(const ushort4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint2 __ldg(const uint2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint4 __ldg(const uint4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldg(const ulonglong2 *ptr) __DEF_IF_HOST
+// floating-point overloads (scalar and vector types)
+__SM_32_INTRINSICS_DECL__ float __ldg(const float *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double __ldg(const double *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float2 __ldg(const float2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float4 __ldg(const float4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double2 __ldg(const double2 *ptr) __DEF_IF_HOST
+
+/******************************************************************************
+ *                                   __ldcg                                   *
+ ******************************************************************************/
+__SM_32_INTRINSICS_DECL__ long __ldcg(const long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcg(const unsigned long *ptr) __DEF_IF_HOST
+// char and signed integer overloads (scalar and vector types)
+__SM_32_INTRINSICS_DECL__ char __ldcg(const char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ signed char __ldcg(const signed char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short __ldcg(const short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int __ldcg(const int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ long long __ldcg(const long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char2 __ldcg(const char2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char4 __ldcg(const char4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short2 __ldcg(const short2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short4 __ldcg(const short4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int2 __ldcg(const int2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int4 __ldcg(const int4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ longlong2 __ldcg(const longlong2 *ptr) __DEF_IF_HOST
+// unsigned integer overloads (scalar and vector types)
+__SM_32_INTRINSICS_DECL__ unsigned char __ldcg(const unsigned char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned short __ldcg(const unsigned short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned int __ldcg(const unsigned int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldcg(const unsigned long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar2 __ldcg(const uchar2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar4 __ldcg(const uchar4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort2 __ldcg(const ushort2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort4 __ldcg(const ushort4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint2 __ldcg(const uint2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint4 __ldcg(const uint4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcg(const ulonglong2 *ptr) __DEF_IF_HOST
+// floating-point overloads (scalar and vector types)
+__SM_32_INTRINSICS_DECL__ float __ldcg(const float *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double __ldcg(const double *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float2 __ldcg(const float2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float4 __ldcg(const float4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double2 __ldcg(const double2 *ptr) __DEF_IF_HOST
+/******************************************************************************
+ *                                   __ldca                                    *
+ ******************************************************************************/
+__SM_32_INTRINSICS_DECL__ long __ldca(const long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long __ldca(const unsigned long *ptr) __DEF_IF_HOST
+// char and signed integer overloads (scalar and vector types)
+__SM_32_INTRINSICS_DECL__ char __ldca(const char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ signed char __ldca(const signed char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short __ldca(const short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int __ldca(const int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ long long __ldca(const long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char2 __ldca(const char2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char4 __ldca(const char4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short2 __ldca(const short2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short4 __ldca(const short4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int2 __ldca(const int2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int4 __ldca(const int4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ longlong2 __ldca(const longlong2 *ptr) __DEF_IF_HOST
+// unsigned integer overloads (scalar and vector types)
+__SM_32_INTRINSICS_DECL__ unsigned char __ldca(const unsigned char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned short __ldca(const unsigned short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned int __ldca(const unsigned int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldca(const unsigned long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar2 __ldca(const uchar2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar4 __ldca(const uchar4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort2 __ldca(const ushort2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort4 __ldca(const ushort4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint2 __ldca(const uint2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint4 __ldca(const uint4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldca(const ulonglong2 *ptr) __DEF_IF_HOST
+// floating-point overloads (scalar and vector types)
+__SM_32_INTRINSICS_DECL__ float __ldca(const float *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double __ldca(const double *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float2 __ldca(const float2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float4 __ldca(const float4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double2 __ldca(const double2 *ptr) __DEF_IF_HOST
+/******************************************************************************
+ *                                   __ldcs                                    *
+ ******************************************************************************/
+__SM_32_INTRINSICS_DECL__ long __ldcs(const long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcs(const unsigned long *ptr) __DEF_IF_HOST
+// char and signed integer overloads (scalar and vector types)
+__SM_32_INTRINSICS_DECL__ char __ldcs(const char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ signed char __ldcs(const signed char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short __ldcs(const short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int __ldcs(const int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ long long __ldcs(const long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char2 __ldcs(const char2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char4 __ldcs(const char4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short2 __ldcs(const short2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short4 __ldcs(const short4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int2 __ldcs(const int2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int4 __ldcs(const int4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ longlong2 __ldcs(const longlong2 *ptr) __DEF_IF_HOST
+// unsigned integer overloads (scalar and vector types)
+__SM_32_INTRINSICS_DECL__ unsigned char __ldcs(const unsigned char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned short __ldcs(const unsigned short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned int __ldcs(const unsigned int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldcs(const unsigned long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar2 __ldcs(const uchar2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar4 __ldcs(const uchar4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort2 __ldcs(const ushort2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort4 __ldcs(const ushort4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint2 __ldcs(const uint2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint4 __ldcs(const uint4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcs(const ulonglong2 *ptr) __DEF_IF_HOST
+// floating-point overloads (scalar and vector types)
+__SM_32_INTRINSICS_DECL__ float __ldcs(const float *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double __ldcs(const double *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float2 __ldcs(const float2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float4 __ldcs(const float4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double2 __ldcs(const double2 *ptr) __DEF_IF_HOST
+/* __ldlu overloads: each takes `const T *ptr` and returns a T by value, for the scalar and vector
+ * types listed below (long / unsigned long come first). NOTE(review): the suffix presumably maps to
+ * the PTX `.lu` (last-use) load operator; confirm against the CUDA Programming Guide. */
+__SM_32_INTRINSICS_DECL__ long __ldlu(const long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long __ldlu(const unsigned long *ptr) __DEF_IF_HOST
+// char, signed integer scalars, and the vector types char2 .. longlong2.
+__SM_32_INTRINSICS_DECL__ char __ldlu(const char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ signed char __ldlu(const signed char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short __ldlu(const short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int __ldlu(const int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ long long __ldlu(const long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char2 __ldlu(const char2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char4 __ldlu(const char4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short2 __ldlu(const short2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short4 __ldlu(const short4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int2 __ldlu(const int2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int4 __ldlu(const int4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ longlong2 __ldlu(const longlong2 *ptr) __DEF_IF_HOST
+// Unsigned integer scalars and the vector types uchar2 .. ulonglong2.
+__SM_32_INTRINSICS_DECL__ unsigned char __ldlu(const unsigned char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned short __ldlu(const unsigned short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned int __ldlu(const unsigned int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldlu(const unsigned long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar2 __ldlu(const uchar2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar4 __ldlu(const uchar4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort2 __ldlu(const ushort2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort4 __ldlu(const ushort4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint2 __ldlu(const uint2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint4 __ldlu(const uint4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldlu(const ulonglong2 *ptr) __DEF_IF_HOST
+// Floating-point scalars and the vector types float2, float4, double2.
+__SM_32_INTRINSICS_DECL__ float __ldlu(const float *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double __ldlu(const double *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float2 __ldlu(const float2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float4 __ldlu(const float4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double2 __ldlu(const double2 *ptr) __DEF_IF_HOST
+/* __ldcv overloads: each takes `const T *ptr` and returns a T by value, for the scalar and vector
+ * types listed below (long / unsigned long come first). NOTE(review): the suffix presumably maps to
+ * the PTX `.cv` (do not cache / volatile) load operator; confirm against the CUDA Programming Guide. */
+__SM_32_INTRINSICS_DECL__ long __ldcv(const long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcv(const unsigned long *ptr) __DEF_IF_HOST
+// char, signed integer scalars, and the vector types char2 .. longlong2.
+__SM_32_INTRINSICS_DECL__ char __ldcv(const char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ signed char __ldcv(const signed char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short __ldcv(const short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int __ldcv(const int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ long long __ldcv(const long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char2 __ldcv(const char2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char4 __ldcv(const char4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short2 __ldcv(const short2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short4 __ldcv(const short4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int2 __ldcv(const int2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int4 __ldcv(const int4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ longlong2 __ldcv(const longlong2 *ptr) __DEF_IF_HOST
+// Unsigned integer scalars and the vector types uchar2 .. ulonglong2.
+__SM_32_INTRINSICS_DECL__ unsigned char __ldcv(const unsigned char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned short __ldcv(const unsigned short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned int __ldcv(const unsigned int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldcv(const unsigned long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar2 __ldcv(const uchar2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar4 __ldcv(const uchar4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort2 __ldcv(const ushort2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort4 __ldcv(const ushort4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint2 __ldcv(const uint2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint4 __ldcv(const uint4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcv(const ulonglong2 *ptr) __DEF_IF_HOST
+// Floating-point scalars and the vector types float2, float4, double2.
+__SM_32_INTRINSICS_DECL__ float __ldcv(const float *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double __ldcv(const double *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float2 __ldcv(const float2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float4 __ldcv(const float4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double2 __ldcv(const double2 *ptr) __DEF_IF_HOST
+/* __stwb overloads: each takes a destination `T *ptr` plus a `T value` and returns void, for the
+ * scalar and vector types listed below (long / unsigned long come first). NOTE(review): the suffix
+ * presumably maps to the PTX `.wb` (write-back) store operator; confirm against the CUDA guide. */
+__SM_32_INTRINSICS_DECL__ void __stwb(long *ptr, long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long *ptr, unsigned long value) __DEF_IF_HOST
+// char, signed integer scalars, and the vector types char2 .. longlong2.
+__SM_32_INTRINSICS_DECL__ void __stwb(char *ptr, char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(signed char *ptr, signed char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(short *ptr, short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(int *ptr, int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(long long *ptr, long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(char2 *ptr, char2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(char4 *ptr, char4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(short2 *ptr, short2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(short4 *ptr, short4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(int2 *ptr, int2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(int4 *ptr, int4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(longlong2 *ptr, longlong2 value) __DEF_IF_HOST
+// Unsigned integer scalars and the vector types uchar2 .. ulonglong2.
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned char *ptr, unsigned char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned short *ptr, unsigned short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned int *ptr, unsigned int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long long *ptr, unsigned long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(uchar2 *ptr, uchar2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(uchar4 *ptr, uchar4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(ushort2 *ptr, ushort2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(ushort4 *ptr, ushort4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(uint2 *ptr, uint2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(uint4 *ptr, uint4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(ulonglong2 *ptr, ulonglong2 value) __DEF_IF_HOST
+// Floating-point scalars and the vector types float2, float4, double2.
+__SM_32_INTRINSICS_DECL__ void __stwb(float *ptr, float value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(double *ptr, double value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(float2 *ptr, float2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(float4 *ptr, float4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(double2 *ptr, double2 value) __DEF_IF_HOST
+/* __stcg overloads: each takes a destination `T *ptr` plus a `T value` and returns void, for the
+ * scalar and vector types listed below (long / unsigned long come first). NOTE(review): the suffix
+ * presumably maps to the PTX `.cg` (cache-global) store operator; confirm against the CUDA guide. */
+__SM_32_INTRINSICS_DECL__ void __stcg(long *ptr, long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long *ptr, unsigned long value) __DEF_IF_HOST
+// char, signed integer scalars, and the vector types char2 .. longlong2.
+__SM_32_INTRINSICS_DECL__ void __stcg(char *ptr, char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(signed char *ptr, signed char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(short *ptr, short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(int *ptr, int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(long long *ptr, long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(char2 *ptr, char2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(char4 *ptr, char4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(short2 *ptr, short2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(short4 *ptr, short4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(int2 *ptr, int2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(int4 *ptr, int4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(longlong2 *ptr, longlong2 value) __DEF_IF_HOST
+// Unsigned integer scalars and the vector types uchar2 .. ulonglong2.
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned char *ptr, unsigned char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned short *ptr, unsigned short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned int *ptr, unsigned int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long long *ptr, unsigned long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(uchar2 *ptr, uchar2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(uchar4 *ptr, uchar4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(ushort2 *ptr, ushort2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(ushort4 *ptr, ushort4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(uint2 *ptr, uint2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(uint4 *ptr, uint4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(ulonglong2 *ptr, ulonglong2 value) __DEF_IF_HOST
+// Floating-point scalars and the vector types float2, float4, double2.
+__SM_32_INTRINSICS_DECL__ void __stcg(float *ptr, float value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(double *ptr, double value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(float2 *ptr, float2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(float4 *ptr, float4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(double2 *ptr, double2 value) __DEF_IF_HOST
+/* __stcs overloads: each takes a destination `T *ptr` plus a `T value` and returns void, for the
+ * scalar and vector types listed below (long / unsigned long come first). NOTE(review): the suffix
+ * presumably maps to the PTX `.cs` (cache-streaming) store operator; confirm against the CUDA guide. */
+__SM_32_INTRINSICS_DECL__ void __stcs(long *ptr, long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long *ptr, unsigned long value) __DEF_IF_HOST
+// char, signed integer scalars, and the vector types char2 .. longlong2.
+__SM_32_INTRINSICS_DECL__ void __stcs(char *ptr, char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(signed char *ptr, signed char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(short *ptr, short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(int *ptr, int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(long long *ptr, long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(char2 *ptr, char2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(char4 *ptr, char4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(short2 *ptr, short2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(short4 *ptr, short4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(int2 *ptr, int2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(int4 *ptr, int4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(longlong2 *ptr, longlong2 value) __DEF_IF_HOST
+// Unsigned integer scalars and the vector types uchar2 .. ulonglong2.
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned char *ptr, unsigned char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned short *ptr, unsigned short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned int *ptr, unsigned int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long long *ptr, unsigned long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(uchar2 *ptr, uchar2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(uchar4 *ptr, uchar4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(ushort2 *ptr, ushort2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(ushort4 *ptr, ushort4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(uint2 *ptr, uint2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(uint4 *ptr, uint4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(ulonglong2 *ptr, ulonglong2 value) __DEF_IF_HOST
+// Floating-point scalars and the vector types float2, float4, double2.
+__SM_32_INTRINSICS_DECL__ void __stcs(float *ptr, float value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(double *ptr, double value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(float2 *ptr, float2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(float4 *ptr, float4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(double2 *ptr, double2 value) __DEF_IF_HOST
+/* __stwt overloads: each takes a destination `T *ptr` plus a `T value` and returns void, for the
+ * scalar and vector types listed below (long / unsigned long come first). NOTE(review): the suffix
+ * presumably maps to the PTX `.wt` (write-through) store operator; confirm against the CUDA guide. */
+__SM_32_INTRINSICS_DECL__ void __stwt(long *ptr, long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long *ptr, unsigned long value) __DEF_IF_HOST
+// char, signed integer scalars, and the vector types char2 .. longlong2.
+__SM_32_INTRINSICS_DECL__ void __stwt(char *ptr, char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(signed char *ptr, signed char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(short *ptr, short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(int *ptr, int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(long long *ptr, long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(char2 *ptr, char2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(char4 *ptr, char4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(short2 *ptr, short2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(short4 *ptr, short4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(int2 *ptr, int2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(int4 *ptr, int4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(longlong2 *ptr, longlong2 value) __DEF_IF_HOST
+// Unsigned integer scalars and the vector types uchar2 .. ulonglong2.
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned char *ptr, unsigned char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned short *ptr, unsigned short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned int *ptr, unsigned int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long long *ptr, unsigned long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(uchar2 *ptr, uchar2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(uchar4 *ptr, uchar4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(ushort2 *ptr, ushort2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(ushort4 *ptr, ushort4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(uint2 *ptr, uint2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(uint4 *ptr, uint4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(ulonglong2 *ptr, ulonglong2 value) __DEF_IF_HOST
+// Floating-point scalars and the vector types float2, float4, double2.
+__SM_32_INTRINSICS_DECL__ void __stwt(float *ptr, float value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(double *ptr, double value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(float2 *ptr, float2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(float4 *ptr, float4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(double2 *ptr, double2 value) __DEF_IF_HOST
+
+
+// SHF is the "funnel shift" operation - an accelerated left/right shift with carry
+// operating on 64-bit quantities, which are concatenations of two 32-bit registers.
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Concatenate \p hi : \p lo, shift left by (\p shift & 31) bits, return the most significant 32 bits.
+ *
+ * Forms a 64-bit source value in which \p lo supplies bits 31:0 and \p hi supplies bits 63:32.
+ * That value is shifted left by the wrapped shift amount, \p shift & 31, so only the low five bits of \p shift are used.
+ * Wrap mode: a \p shift of 32 behaves like a shift of 0; see __funnelshift_lc for the variant that clamps instead.
+ * The most significant 32 bits of the shifted value are returned.
+ *
+ * \return The most significant 32 bits of the shifted 64-bit value.
+ */
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Concatenate \p hi : \p lo, shift left by min(\p shift, 32) bits, return the most significant 32 bits.
+ *
+ * Forms a 64-bit source value in which \p lo supplies bits 31:0 and \p hi supplies bits 63:32.
+ * That value is shifted left by the clamped shift amount, min(\p shift, 32).
+ * Clamp mode: any \p shift of 32 or more shifts by exactly 32; see __funnelshift_l for the variant that wraps instead.
+ * The most significant 32 bits of the shifted value are returned.
+ *
+ * \return The most significant 32 bits of the shifted 64-bit value.
+ */
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift) __DEF_IF_HOST
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Concatenate \p hi : \p lo, shift right by (\p shift & 31) bits, return the least significant 32 bits.
+ *
+ * Forms a 64-bit source value in which \p lo supplies bits 31:0 and \p hi supplies bits 63:32.
+ * That value is shifted right by the wrapped shift amount, \p shift & 31, so only the low five bits of \p shift are used.
+ * Wrap mode: a \p shift of 32 behaves like a shift of 0; see __funnelshift_rc for the variant that clamps instead.
+ * The least significant 32 bits of the shifted value are returned.
+ *
+ * \return The least significant 32 bits of the shifted 64-bit value.
+ */
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Concatenate \p hi : \p lo, shift right by min(\p shift, 32) bits, return the least significant 32 bits.
+ *
+ * Forms a 64-bit source value in which \p lo supplies bits 31:0 and \p hi supplies bits 63:32.
+ * That value is shifted right by the clamped shift amount, min(\p shift, 32).
+ * Clamp mode: any \p shift of 32 or more shifts by exactly 32; see __funnelshift_r for the variant that wraps instead.
+ * The least significant 32 bits of the shifted value are returned.
+ *
+ * \return The least significant 32 bits of the shifted 64-bit value.
+ */
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift) __DEF_IF_HOST
+
+
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_32_INTRINSICS_DECL__
+
+#if !defined(__CUDACC_RTC__) && (defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA))
+#include "sm_32_intrinsics.hpp"
+#endif /* !defined(__CUDACC_RTC__) && (defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA))  */
+
+#undef EXCLUDE_FROM_RTC
+#endif /* !__SM_32_INTRINSICS_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_intrinsics.hpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_intrinsics.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d50f9cea5c4d89bc555855a8ca73d617bcfa461a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_intrinsics.hpp
@@ -0,0 +1,588 @@
+/*
+ * Copyright 1993-2020 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_32_INTRINSICS_HPP__)
+#define __SM_32_INTRINSICS_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_32_INTRINSICS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_32_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+// Compiler built-in intrinsics would be declared here so that the intrinsic
+// implementations in this file can reference them.
+extern "C"
+{
+    // No SM-3.5 intrinsics are built in to the compiler;
+    // all of them are implemented as inline PTX below.
+}
+
+/*******************************************************************************
+*                                                                              *
+*  Below are implementations of SM-3.5 intrinsics which are included as        *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+
+// LDG is a "load from global via texture path" instruction which can exhibit
+// higher bandwidth on GK110 than a regular LD.
+// Select the inline-asm constraint used for pointer operands: "l" (64-bit) or "r" (32-bit).
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __LDG_PTR   "l"
+#else
+#define __LDG_PTR   "r"
+#endif
+
+/******************************************************************************
+ *                                   __ldg                                    *
+ ******************************************************************************/
+
+// Size of long is architecture and OS specific.
+#if defined(__LP64__) // long is 64 bits wide
+__SM_32_INTRINSICS_DECL__ long __ldg(const long *ptr) { unsigned long val; asm volatile("ld.global.nc.s64 %0, [%1];" : "=l"(val) : __LDG_PTR(ptr)); return static_cast<long>(val); }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldg(const unsigned long *ptr) { unsigned long val; asm volatile("ld.global.nc.u64 %0, [%1];" : "=l"(val) : __LDG_PTR(ptr)); return val; }
+#else // long is 32 bits wide
+__SM_32_INTRINSICS_DECL__ long __ldg(const long *ptr) { unsigned long val; asm volatile("ld.global.nc.s32 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return static_cast<long>(val); }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldg(const unsigned long *ptr) { unsigned long val; asm volatile("ld.global.nc.u32 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return val; }
+#endif
+
+
+__SM_32_INTRINSICS_DECL__ char __ldg(const char *ptr) { unsigned int val; asm volatile("ld.global.nc.s8 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return static_cast<char>(val); }
+__SM_32_INTRINSICS_DECL__ signed char __ldg(const signed char *ptr) { unsigned int val; asm volatile("ld.global.nc.s8 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return static_cast<signed char>(val); }
+__SM_32_INTRINSICS_DECL__ short __ldg(const short *ptr) { unsigned short val; asm volatile("ld.global.nc.s16 %0, [%1];" : "=h"(val) : __LDG_PTR(ptr)); return static_cast<short>(val); }
+__SM_32_INTRINSICS_DECL__ int __ldg(const int *ptr) { unsigned int val; asm volatile("ld.global.nc.s32 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return static_cast<int>(val); }
+__SM_32_INTRINSICS_DECL__ long long __ldg(const long long *ptr) { unsigned long long val; asm volatile("ld.global.nc.s64 %0, [%1];" : "=l"(val) : __LDG_PTR(ptr)); return static_cast<long long>(val); }
+__SM_32_INTRINSICS_DECL__ char2 __ldg(const char2 *ptr) { char2 val; int2 wide; asm volatile("ld.global.nc.v2.s8 {%0,%1}, [%2];" : "=r"(wide.x), "=r"(wide.y) : __LDG_PTR(ptr)); val.x = static_cast<char>(wide.x); val.y = static_cast<char>(wide.y); return val; }
+__SM_32_INTRINSICS_DECL__ char4 __ldg(const char4 *ptr) { char4 val; int4 wide; asm volatile("ld.global.nc.v4.s8 {%0,%1,%2,%3}, [%4];" : "=r"(wide.x), "=r"(wide.y), "=r"(wide.z), "=r"(wide.w) : __LDG_PTR(ptr)); val.x = static_cast<char>(wide.x); val.y = static_cast<char>(wide.y); val.z = static_cast<char>(wide.z); val.w = static_cast<char>(wide.w); return val; }
+__SM_32_INTRINSICS_DECL__ short2 __ldg(const short2 *ptr) { short2 val; asm volatile("ld.global.nc.v2.s16 {%0,%1}, [%2];" : "=h"(val.x), "=h"(val.y) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ short4 __ldg(const short4 *ptr) { short4 val; asm volatile("ld.global.nc.v4.s16 {%0,%1,%2,%3}, [%4];" : "=h"(val.x), "=h"(val.y), "=h"(val.z), "=h"(val.w) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ int2 __ldg(const int2 *ptr) { int2 val; asm volatile("ld.global.nc.v2.s32 {%0,%1}, [%2];" : "=r"(val.x), "=r"(val.y) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ int4 __ldg(const int4 *ptr) { int4 val; asm volatile("ld.global.nc.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldg(const longlong2 *ptr) { longlong2 val; asm volatile("ld.global.nc.v2.s64 {%0,%1}, [%2];" : "=l"(val.x), "=l"(val.y) : __LDG_PTR(ptr)); return val; }
+
+__SM_32_INTRINSICS_DECL__ unsigned char __ldg(const unsigned char *ptr) { unsigned int val; asm volatile("ld.global.nc.u8 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return static_cast<unsigned char>(val); }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldg(const unsigned short *ptr) { unsigned short val; asm volatile("ld.global.nc.u16 %0, [%1];" : "=h"(val) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldg(const unsigned int *ptr) { unsigned int val; asm volatile("ld.global.nc.u32 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldg(const unsigned long long *ptr) { unsigned long long val; asm volatile("ld.global.nc.u64 %0, [%1];" : "=l"(val) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldg(const uchar2 *ptr) { uchar2 val; uint2 wide; asm volatile("ld.global.nc.v2.u8 {%0,%1}, [%2];" : "=r"(wide.x), "=r"(wide.y) : __LDG_PTR(ptr)); val.x = static_cast<unsigned char>(wide.x); val.y = static_cast<unsigned char>(wide.y); return val; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldg(const uchar4 *ptr) { uchar4 val; uint4 wide; asm volatile("ld.global.nc.v4.u8 {%0,%1,%2,%3}, [%4];" : "=r"(wide.x), "=r"(wide.y), "=r"(wide.z), "=r"(wide.w) : __LDG_PTR(ptr)); val.x = static_cast<unsigned char>(wide.x); val.y = static_cast<unsigned char>(wide.y); val.z = static_cast<unsigned char>(wide.z); val.w = static_cast<unsigned char>(wide.w); return val; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldg(const ushort2 *ptr) { ushort2 val; asm volatile("ld.global.nc.v2.u16 {%0,%1}, [%2];" : "=h"(val.x), "=h"(val.y) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldg(const ushort4 *ptr) { ushort4 val; asm volatile("ld.global.nc.v4.u16 {%0,%1,%2,%3}, [%4];" : "=h"(val.x), "=h"(val.y), "=h"(val.z), "=h"(val.w) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldg(const uint2 *ptr) { uint2 val; asm volatile("ld.global.nc.v2.u32 {%0,%1}, [%2];" : "=r"(val.x), "=r"(val.y) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldg(const uint4 *ptr) { uint4 val; asm volatile("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldg(const ulonglong2 *ptr) { ulonglong2 val; asm volatile("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(val.x), "=l"(val.y) : __LDG_PTR(ptr)); return val; }
+
+__SM_32_INTRINSICS_DECL__ float __ldg(const float *ptr) { float val; asm volatile("ld.global.nc.f32 %0, [%1];" : "=f"(val) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ double __ldg(const double *ptr) { double val; asm volatile("ld.global.nc.f64 %0, [%1];" : "=d"(val) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ float2 __ldg(const float2 *ptr) { float2 val; asm volatile("ld.global.nc.v2.f32 {%0,%1}, [%2];" : "=f"(val.x), "=f"(val.y) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ float4 __ldg(const float4 *ptr) { float4 val; asm volatile("ld.global.nc.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(val.x), "=f"(val.y), "=f"(val.z), "=f"(val.w) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ double2 __ldg(const double2 *ptr) { double2 val; asm volatile("ld.global.nc.v2.f64 {%0,%1}, [%2];" : "=d"(val.x), "=d"(val.y) : __LDG_PTR(ptr)); return val; }
+
+
+/******************************************************************************
+ *                                   __ldcg                                   *
+ ******************************************************************************/
+
+// Size of long is architecture and OS specific.
+#if defined(__LP64__) // long is 64 bits wide
+__SM_32_INTRINSICS_DECL__ long __ldcg(const long *ptr) { unsigned long val; asm volatile("ld.global.cg.s64 %0, [%1];" : "=l"(val) : __LDG_PTR(ptr)); return static_cast<long>(val); }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcg(const unsigned long *ptr) { unsigned long val; asm volatile("ld.global.cg.u64 %0, [%1];" : "=l"(val) : __LDG_PTR(ptr)); return val; }
+#else // long is 32 bits wide
+__SM_32_INTRINSICS_DECL__ long __ldcg(const long *ptr) { unsigned long val; asm volatile("ld.global.cg.s32 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return static_cast<long>(val); }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcg(const unsigned long *ptr) { unsigned long val; asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return val; }
+#endif
+
+
+__SM_32_INTRINSICS_DECL__ char __ldcg(const char *ptr) { unsigned int val; asm volatile("ld.global.cg.s8 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return static_cast<char>(val); }
+__SM_32_INTRINSICS_DECL__ signed char __ldcg(const signed char *ptr) { unsigned int val; asm volatile("ld.global.cg.s8 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return static_cast<signed char>(val); }
+__SM_32_INTRINSICS_DECL__ short __ldcg(const short *ptr) { unsigned short val; asm volatile("ld.global.cg.s16 %0, [%1];" : "=h"(val) : __LDG_PTR(ptr)); return static_cast<short>(val); }
+__SM_32_INTRINSICS_DECL__ int __ldcg(const int *ptr) { unsigned int val; asm volatile("ld.global.cg.s32 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return static_cast<int>(val); }
+__SM_32_INTRINSICS_DECL__ long long __ldcg(const long long *ptr) { unsigned long long val; asm volatile("ld.global.cg.s64 %0, [%1];" : "=l"(val) : __LDG_PTR(ptr)); return static_cast<long long>(val); }
+__SM_32_INTRINSICS_DECL__ char2 __ldcg(const char2 *ptr) { char2 val; int2 wide; asm volatile("ld.global.cg.v2.s8 {%0,%1}, [%2];" : "=r"(wide.x), "=r"(wide.y) : __LDG_PTR(ptr)); val.x = static_cast<char>(wide.x); val.y = static_cast<char>(wide.y); return val; }
+__SM_32_INTRINSICS_DECL__ char4 __ldcg(const char4 *ptr) { char4 val; int4 wide; asm volatile("ld.global.cg.v4.s8 {%0,%1,%2,%3}, [%4];" : "=r"(wide.x), "=r"(wide.y), "=r"(wide.z), "=r"(wide.w) : __LDG_PTR(ptr)); val.x = static_cast<char>(wide.x); val.y = static_cast<char>(wide.y); val.z = static_cast<char>(wide.z); val.w = static_cast<char>(wide.w); return val; }
+__SM_32_INTRINSICS_DECL__ short2 __ldcg(const short2 *ptr) { short2 val; asm volatile("ld.global.cg.v2.s16 {%0,%1}, [%2];" : "=h"(val.x), "=h"(val.y) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ short4 __ldcg(const short4 *ptr) { short4 val; asm volatile("ld.global.cg.v4.s16 {%0,%1,%2,%3}, [%4];" : "=h"(val.x), "=h"(val.y), "=h"(val.z), "=h"(val.w) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ int2 __ldcg(const int2 *ptr) { int2 val; asm volatile("ld.global.cg.v2.s32 {%0,%1}, [%2];" : "=r"(val.x), "=r"(val.y) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ int4 __ldcg(const int4 *ptr) { int4 val; asm volatile("ld.global.cg.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldcg(const longlong2 *ptr) { longlong2 val; asm volatile("ld.global.cg.v2.s64 {%0,%1}, [%2];" : "=l"(val.x), "=l"(val.y) : __LDG_PTR(ptr)); return val; }
+
+__SM_32_INTRINSICS_DECL__ unsigned char __ldcg(const unsigned char *ptr) { unsigned int val; asm volatile("ld.global.cg.u8 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return static_cast<unsigned char>(val); }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldcg(const unsigned short *ptr) { unsigned short val; asm volatile("ld.global.cg.u16 %0, [%1];" : "=h"(val) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldcg(const unsigned int *ptr) { unsigned int val; asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldcg(const unsigned long long *ptr) { unsigned long long val; asm volatile("ld.global.cg.u64 %0, [%1];" : "=l"(val) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldcg(const uchar2 *ptr) { uchar2 val; uint2 wide; asm volatile("ld.global.cg.v2.u8 {%0,%1}, [%2];" : "=r"(wide.x), "=r"(wide.y) : __LDG_PTR(ptr)); val.x = static_cast<unsigned char>(wide.x); val.y = static_cast<unsigned char>(wide.y); return val; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldcg(const uchar4 *ptr) { uchar4 val; uint4 wide; asm volatile("ld.global.cg.v4.u8 {%0,%1,%2,%3}, [%4];" : "=r"(wide.x), "=r"(wide.y), "=r"(wide.z), "=r"(wide.w) : __LDG_PTR(ptr)); val.x = static_cast<unsigned char>(wide.x); val.y = static_cast<unsigned char>(wide.y); val.z = static_cast<unsigned char>(wide.z); val.w = static_cast<unsigned char>(wide.w); return val; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldcg(const ushort2 *ptr) { ushort2 val; asm volatile("ld.global.cg.v2.u16 {%0,%1}, [%2];" : "=h"(val.x), "=h"(val.y) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldcg(const ushort4 *ptr) { ushort4 val; asm volatile("ld.global.cg.v4.u16 {%0,%1,%2,%3}, [%4];" : "=h"(val.x), "=h"(val.y), "=h"(val.z), "=h"(val.w) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldcg(const uint2 *ptr) { uint2 val; asm volatile("ld.global.cg.v2.u32 {%0,%1}, [%2];" : "=r"(val.x), "=r"(val.y) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldcg(const uint4 *ptr) { uint4 val; asm volatile("ld.global.cg.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcg(const ulonglong2 *ptr) { ulonglong2 val; asm volatile("ld.global.cg.v2.u64 {%0,%1}, [%2];" : "=l"(val.x), "=l"(val.y) : __LDG_PTR(ptr)); return val; }
+
+__SM_32_INTRINSICS_DECL__ float __ldcg(const float *ptr) { float val; asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(val) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ double __ldcg(const double *ptr) { double val; asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(val) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ float2 __ldcg(const float2 *ptr) { float2 val; asm volatile("ld.global.cg.v2.f32 {%0,%1}, [%2];" : "=f"(val.x), "=f"(val.y) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ float4 __ldcg(const float4 *ptr) { float4 val; asm volatile("ld.global.cg.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(val.x), "=f"(val.y), "=f"(val.z), "=f"(val.w) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ double2 __ldcg(const double2 *ptr) { double2 val; asm volatile("ld.global.cg.v2.f64 {%0,%1}, [%2];" : "=d"(val.x), "=d"(val.y) : __LDG_PTR(ptr)); return val; }
+
+/******************************************************************************
+ *                                   __ldca                                   *
+ ******************************************************************************/
+
+// Size of long is architecture and OS specific.
+#if defined(__LP64__) // long is 64 bits wide
+__SM_32_INTRINSICS_DECL__ long __ldca(const long *ptr) { unsigned long val; asm volatile("ld.global.ca.s64 %0, [%1];" : "=l"(val) : __LDG_PTR(ptr)); return static_cast<long>(val); }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldca(const unsigned long *ptr) { unsigned long val; asm volatile("ld.global.ca.u64 %0, [%1];" : "=l"(val) : __LDG_PTR(ptr)); return val; }
+#else // long is 32 bits wide
+__SM_32_INTRINSICS_DECL__ long __ldca(const long *ptr) { unsigned long val; asm volatile("ld.global.ca.s32 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return static_cast<long>(val); }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldca(const unsigned long *ptr) { unsigned long val; asm volatile("ld.global.ca.u32 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return val; }
+#endif
+
+
+__SM_32_INTRINSICS_DECL__ char __ldca(const char *ptr) { unsigned int val; asm volatile("ld.global.ca.s8 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return static_cast<char>(val); }
+__SM_32_INTRINSICS_DECL__ signed char __ldca(const signed char *ptr) { unsigned int val; asm volatile("ld.global.ca.s8 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return static_cast<signed char>(val); }
+__SM_32_INTRINSICS_DECL__ short __ldca(const short *ptr) { unsigned short val; asm volatile("ld.global.ca.s16 %0, [%1];" : "=h"(val) : __LDG_PTR(ptr)); return static_cast<short>(val); }
+__SM_32_INTRINSICS_DECL__ int __ldca(const int *ptr) { unsigned int val; asm volatile("ld.global.ca.s32 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return static_cast<int>(val); }
+__SM_32_INTRINSICS_DECL__ long long __ldca(const long long *ptr) { unsigned long long val; asm volatile("ld.global.ca.s64 %0, [%1];" : "=l"(val) : __LDG_PTR(ptr)); return static_cast<long long>(val); }
+__SM_32_INTRINSICS_DECL__ char2 __ldca(const char2 *ptr) { char2 val; int2 wide; asm volatile("ld.global.ca.v2.s8 {%0,%1}, [%2];" : "=r"(wide.x), "=r"(wide.y) : __LDG_PTR(ptr)); val.x = static_cast<char>(wide.x); val.y = static_cast<char>(wide.y); return val; }
+__SM_32_INTRINSICS_DECL__ char4 __ldca(const char4 *ptr) { char4 val; int4 wide; asm volatile("ld.global.ca.v4.s8 {%0,%1,%2,%3}, [%4];" : "=r"(wide.x), "=r"(wide.y), "=r"(wide.z), "=r"(wide.w) : __LDG_PTR(ptr)); val.x = static_cast<char>(wide.x); val.y = static_cast<char>(wide.y); val.z = static_cast<char>(wide.z); val.w = static_cast<char>(wide.w); return val; }
+__SM_32_INTRINSICS_DECL__ short2 __ldca(const short2 *ptr) { short2 val; asm volatile("ld.global.ca.v2.s16 {%0,%1}, [%2];" : "=h"(val.x), "=h"(val.y) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ short4 __ldca(const short4 *ptr) { short4 val; asm volatile("ld.global.ca.v4.s16 {%0,%1,%2,%3}, [%4];" : "=h"(val.x), "=h"(val.y), "=h"(val.z), "=h"(val.w) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ int2 __ldca(const int2 *ptr) { int2 val; asm volatile("ld.global.ca.v2.s32 {%0,%1}, [%2];" : "=r"(val.x), "=r"(val.y) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ int4 __ldca(const int4 *ptr) { int4 val; asm volatile("ld.global.ca.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldca(const longlong2 *ptr) { longlong2 val; asm volatile("ld.global.ca.v2.s64 {%0,%1}, [%2];" : "=l"(val.x), "=l"(val.y) : __LDG_PTR(ptr)); return val; }
+
+__SM_32_INTRINSICS_DECL__ unsigned char __ldca(const unsigned char *ptr) { unsigned int val; asm volatile("ld.global.ca.u8 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return static_cast<unsigned char>(val); }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldca(const unsigned short *ptr) { unsigned short val; asm volatile("ld.global.ca.u16 %0, [%1];" : "=h"(val) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldca(const unsigned int *ptr) { unsigned int val; asm volatile("ld.global.ca.u32 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldca(const unsigned long long *ptr) { unsigned long long val; asm volatile("ld.global.ca.u64 %0, [%1];" : "=l"(val) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldca(const uchar2 *ptr) { uchar2 val; uint2 wide; asm volatile("ld.global.ca.v2.u8 {%0,%1}, [%2];" : "=r"(wide.x), "=r"(wide.y) : __LDG_PTR(ptr)); val.x = static_cast<unsigned char>(wide.x); val.y = static_cast<unsigned char>(wide.y); return val; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldca(const uchar4 *ptr) { uchar4 val; uint4 wide; asm volatile("ld.global.ca.v4.u8 {%0,%1,%2,%3}, [%4];" : "=r"(wide.x), "=r"(wide.y), "=r"(wide.z), "=r"(wide.w) : __LDG_PTR(ptr)); val.x = static_cast<unsigned char>(wide.x); val.y = static_cast<unsigned char>(wide.y); val.z = static_cast<unsigned char>(wide.z); val.w = static_cast<unsigned char>(wide.w); return val; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldca(const ushort2 *ptr) { ushort2 val; asm volatile("ld.global.ca.v2.u16 {%0,%1}, [%2];" : "=h"(val.x), "=h"(val.y) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldca(const ushort4 *ptr) { ushort4 val; asm volatile("ld.global.ca.v4.u16 {%0,%1,%2,%3}, [%4];" : "=h"(val.x), "=h"(val.y), "=h"(val.z), "=h"(val.w) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldca(const uint2 *ptr) { uint2 val; asm volatile("ld.global.ca.v2.u32 {%0,%1}, [%2];" : "=r"(val.x), "=r"(val.y) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldca(const uint4 *ptr) { uint4 val; asm volatile("ld.global.ca.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldca(const ulonglong2 *ptr) { ulonglong2 val; asm volatile("ld.global.ca.v2.u64 {%0,%1}, [%2];" : "=l"(val.x), "=l"(val.y) : __LDG_PTR(ptr)); return val; }
+
+__SM_32_INTRINSICS_DECL__ float __ldca(const float *ptr) { float val; asm volatile("ld.global.ca.f32 %0, [%1];" : "=f"(val) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ double __ldca(const double *ptr) { double val; asm volatile("ld.global.ca.f64 %0, [%1];" : "=d"(val) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ float2 __ldca(const float2 *ptr) { float2 val; asm volatile("ld.global.ca.v2.f32 {%0,%1}, [%2];" : "=f"(val.x), "=f"(val.y) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ float4 __ldca(const float4 *ptr) { float4 val; asm volatile("ld.global.ca.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(val.x), "=f"(val.y), "=f"(val.z), "=f"(val.w) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ double2 __ldca(const double2 *ptr) { double2 val; asm volatile("ld.global.ca.v2.f64 {%0,%1}, [%2];" : "=d"(val.x), "=d"(val.y) : __LDG_PTR(ptr)); return val; }
+
+/******************************************************************************
+ *                                   __ldcs                                   *
+ ******************************************************************************/
+
+// Size of long is architecture and OS specific.
+#if defined(__LP64__) // long is 64 bits wide
+__SM_32_INTRINSICS_DECL__ long __ldcs(const long *ptr) { unsigned long val; asm volatile("ld.global.cs.s64 %0, [%1];" : "=l"(val) : __LDG_PTR(ptr)); return static_cast<long>(val); }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcs(const unsigned long *ptr) { unsigned long val; asm volatile("ld.global.cs.u64 %0, [%1];" : "=l"(val) : __LDG_PTR(ptr)); return val; }
+#else // long is 32 bits wide
+__SM_32_INTRINSICS_DECL__ long __ldcs(const long *ptr) { unsigned long val; asm volatile("ld.global.cs.s32 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return static_cast<long>(val); }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcs(const unsigned long *ptr) { unsigned long val; asm volatile("ld.global.cs.u32 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return val; }
+#endif
+
+
+__SM_32_INTRINSICS_DECL__ char __ldcs(const char *ptr) { unsigned int val; asm volatile("ld.global.cs.s8 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return static_cast<char>(val); }
+__SM_32_INTRINSICS_DECL__ signed char __ldcs(const signed char *ptr) { unsigned int val; asm volatile("ld.global.cs.s8 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return static_cast<signed char>(val); }
+__SM_32_INTRINSICS_DECL__ short __ldcs(const short *ptr) { unsigned short val; asm volatile("ld.global.cs.s16 %0, [%1];" : "=h"(val) : __LDG_PTR(ptr)); return static_cast<short>(val); }
+__SM_32_INTRINSICS_DECL__ int __ldcs(const int *ptr) { unsigned int val; asm volatile("ld.global.cs.s32 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return static_cast<int>(val); }
+__SM_32_INTRINSICS_DECL__ long long __ldcs(const long long *ptr) { unsigned long long val; asm volatile("ld.global.cs.s64 %0, [%1];" : "=l"(val) : __LDG_PTR(ptr)); return static_cast<long long>(val); }
+__SM_32_INTRINSICS_DECL__ char2 __ldcs(const char2 *ptr) { char2 val; int2 wide; asm volatile("ld.global.cs.v2.s8 {%0,%1}, [%2];" : "=r"(wide.x), "=r"(wide.y) : __LDG_PTR(ptr)); val.x = static_cast<char>(wide.x); val.y = static_cast<char>(wide.y); return val; }
+__SM_32_INTRINSICS_DECL__ char4 __ldcs(const char4 *ptr) { char4 val; int4 wide; asm volatile("ld.global.cs.v4.s8 {%0,%1,%2,%3}, [%4];" : "=r"(wide.x), "=r"(wide.y), "=r"(wide.z), "=r"(wide.w) : __LDG_PTR(ptr)); val.x = static_cast<char>(wide.x); val.y = static_cast<char>(wide.y); val.z = static_cast<char>(wide.z); val.w = static_cast<char>(wide.w); return val; }
+__SM_32_INTRINSICS_DECL__ short2 __ldcs(const short2 *ptr) { short2 val; asm volatile("ld.global.cs.v2.s16 {%0,%1}, [%2];" : "=h"(val.x), "=h"(val.y) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ short4 __ldcs(const short4 *ptr) { short4 val; asm volatile("ld.global.cs.v4.s16 {%0,%1,%2,%3}, [%4];" : "=h"(val.x), "=h"(val.y), "=h"(val.z), "=h"(val.w) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ int2 __ldcs(const int2 *ptr) { int2 val; asm volatile("ld.global.cs.v2.s32 {%0,%1}, [%2];" : "=r"(val.x), "=r"(val.y) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ int4 __ldcs(const int4 *ptr) { int4 val; asm volatile("ld.global.cs.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldcs(const longlong2 *ptr) { longlong2 val; asm volatile("ld.global.cs.v2.s64 {%0,%1}, [%2];" : "=l"(val.x), "=l"(val.y) : __LDG_PTR(ptr)); return val; }
+
+__SM_32_INTRINSICS_DECL__ unsigned char __ldcs(const unsigned char *ptr) { unsigned int val; asm volatile("ld.global.cs.u8 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return static_cast<unsigned char>(val); }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldcs(const unsigned short *ptr) { unsigned short val; asm volatile("ld.global.cs.u16 %0, [%1];" : "=h"(val) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldcs(const unsigned int *ptr) { unsigned int val; asm volatile("ld.global.cs.u32 %0, [%1];" : "=r"(val) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldcs(const unsigned long long *ptr) { unsigned long long val; asm volatile("ld.global.cs.u64 %0, [%1];" : "=l"(val) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldcs(const uchar2 *ptr) { uchar2 val; uint2 wide; asm volatile("ld.global.cs.v2.u8 {%0,%1}, [%2];" : "=r"(wide.x), "=r"(wide.y) : __LDG_PTR(ptr)); val.x = static_cast<unsigned char>(wide.x); val.y = static_cast<unsigned char>(wide.y); return val; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldcs(const uchar4 *ptr) { uchar4 val; uint4 wide; asm volatile("ld.global.cs.v4.u8 {%0,%1,%2,%3}, [%4];" : "=r"(wide.x), "=r"(wide.y), "=r"(wide.z), "=r"(wide.w) : __LDG_PTR(ptr)); val.x = static_cast<unsigned char>(wide.x); val.y = static_cast<unsigned char>(wide.y); val.z = static_cast<unsigned char>(wide.z); val.w = static_cast<unsigned char>(wide.w); return val; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldcs(const ushort2 *ptr) { ushort2 val; asm volatile("ld.global.cs.v2.u16 {%0,%1}, [%2];" : "=h"(val.x), "=h"(val.y) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldcs(const ushort4 *ptr) { ushort4 val; asm volatile("ld.global.cs.v4.u16 {%0,%1,%2,%3}, [%4];" : "=h"(val.x), "=h"(val.y), "=h"(val.z), "=h"(val.w) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldcs(const uint2 *ptr) { uint2 val; asm volatile("ld.global.cs.v2.u32 {%0,%1}, [%2];" : "=r"(val.x), "=r"(val.y) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldcs(const uint4 *ptr) { uint4 val; asm volatile("ld.global.cs.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcs(const ulonglong2 *ptr) { ulonglong2 val; asm volatile("ld.global.cs.v2.u64 {%0,%1}, [%2];" : "=l"(val.x), "=l"(val.y) : __LDG_PTR(ptr)); return val; }
+
+__SM_32_INTRINSICS_DECL__ float __ldcs(const float *ptr) { float val; asm volatile("ld.global.cs.f32 %0, [%1];" : "=f"(val) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ double __ldcs(const double *ptr) { double val; asm volatile("ld.global.cs.f64 %0, [%1];" : "=d"(val) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ float2 __ldcs(const float2 *ptr) { float2 val; asm volatile("ld.global.cs.v2.f32 {%0,%1}, [%2];" : "=f"(val.x), "=f"(val.y) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ float4 __ldcs(const float4 *ptr) { float4 val; asm volatile("ld.global.cs.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(val.x), "=f"(val.y), "=f"(val.z), "=f"(val.w) : __LDG_PTR(ptr)); return val; }
+__SM_32_INTRINSICS_DECL__ double2 __ldcs(const double2 *ptr) { double2 val; asm volatile("ld.global.cs.v2.f64 {%0,%1}, [%2];" : "=d"(val.x), "=d"(val.y) : __LDG_PTR(ptr)); return val; }
+
+/******************************************************************************
+ *                                   __ldlu                                   *
+ ******************************************************************************/
+
+// __ldlu for long / unsigned long: sizeof(long) depends on the target ABI, so __LP64__ selects the 64-bit or 32-bit PTX access. The value is read into an unsigned long temporary and cast back for the signed overload.
+#if defined(__LP64__) // LP64: long is 64 bits -> .s64/.u64 with a 64-bit ("=l") output
+__SM_32_INTRINSICS_DECL__ long __ldlu(const long *ptr) { unsigned long ret; asm ("ld.global.lu.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldlu(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.lu.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+#else // otherwise long is handled as 32 bits -> .s32/.u32 with a 32-bit ("=r") output
+__SM_32_INTRINSICS_DECL__ long __ldlu(const long *ptr) { unsigned long ret; asm ("ld.global.lu.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldlu(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.lu.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+#endif
+
+// __ldlu, char/short/int/long long scalars and their vector types: one ld.global.lu each, with a "memory" clobber. Scalars are read into an unsigned temporary and cast back; 8-bit elements travel through 32-bit ("=r") temporaries and are narrowed with a cast.
+__SM_32_INTRINSICS_DECL__ char __ldlu(const char *ptr) { unsigned int ret; asm ("ld.global.lu.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (char)ret; }
+__SM_32_INTRINSICS_DECL__ signed char __ldlu(const signed char *ptr) { unsigned int ret; asm ("ld.global.lu.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (signed char)ret; }
+__SM_32_INTRINSICS_DECL__ short __ldlu(const short *ptr) { unsigned short ret; asm ("ld.global.lu.s16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return (short)ret; }
+__SM_32_INTRINSICS_DECL__ int __ldlu(const int *ptr) { unsigned int ret; asm ("ld.global.lu.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (int)ret; }
+__SM_32_INTRINSICS_DECL__ long long __ldlu(const long long *ptr) { unsigned long long ret; asm ("ld.global.lu.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long long)ret; }
+__SM_32_INTRINSICS_DECL__ char2 __ldlu(const char2 *ptr) { char2 ret; int2 tmp; asm ("ld.global.lu.v2.s8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ char4 __ldlu(const char4 *ptr) { char4 ret; int4 tmp; asm ("ld.global.lu.v4.s8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ short2 __ldlu(const short2 *ptr) { short2 ret; asm ("ld.global.lu.v2.s16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ short4 __ldlu(const short4 *ptr) { short4 ret; asm ("ld.global.lu.v4.s16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ int2 __ldlu(const int2 *ptr) { int2 ret; asm ("ld.global.lu.v2.s32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ int4 __ldlu(const int4 *ptr) { int4 ret; asm ("ld.global.lu.v4.s32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldlu(const longlong2 *ptr) { longlong2 ret; asm ("ld.global.lu.v2.s64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+// __ldlu, unsigned scalars and vectors: one ld.global.lu each with .uN PTX types and a "memory" clobber; 8-bit elements are loaded into 32-bit ("=r") temporaries and narrowed with a cast, wider elements are written straight into ret.
+__SM_32_INTRINSICS_DECL__ unsigned char __ldlu(const unsigned char *ptr) { unsigned int ret; asm ("ld.global.lu.u8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory");  return (unsigned char)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldlu(const unsigned short *ptr) { unsigned short ret; asm ("ld.global.lu.u16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldlu(const unsigned int *ptr) { unsigned int ret; asm ("ld.global.lu.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldlu(const unsigned long long *ptr) { unsigned long long ret; asm ("ld.global.lu.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldlu(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm ("ld.global.lu.v2.u8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldlu(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm ("ld.global.lu.v4.u8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldlu(const ushort2 *ptr) { ushort2 ret; asm ("ld.global.lu.v2.u16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldlu(const ushort4 *ptr) { ushort4 ret; asm ("ld.global.lu.v4.u16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldlu(const uint2 *ptr) { uint2 ret; asm ("ld.global.lu.v2.u32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldlu(const uint4 *ptr) { uint4 ret; asm ("ld.global.lu.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldlu(const ulonglong2 *ptr) { ulonglong2 ret; asm ("ld.global.lu.v2.u64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+// __ldlu, float/double scalars and vectors: one ld.global.lu each with a "memory" clobber; results are written straight into ret through "=f" (float) / "=d" (double) outputs.
+__SM_32_INTRINSICS_DECL__ float __ldlu(const float *ptr) { float ret; asm ("ld.global.lu.f32 %0, [%1];"  : "=f"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ double __ldlu(const double *ptr) { double ret; asm ("ld.global.lu.f64 %0, [%1];"  : "=d"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ float2 __ldlu(const float2 *ptr) { float2 ret; asm ("ld.global.lu.v2.f32 {%0,%1}, [%2];"  : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ float4 __ldlu(const float4 *ptr) { float4 ret; asm ("ld.global.lu.v4.f32 {%0,%1,%2,%3}, [%4];"  : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ double2 __ldlu(const double2 *ptr) { double2 ret; asm ("ld.global.lu.v2.f64 {%0,%1}, [%2];"  : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+
+/******************************************************************************
+ *                                   __ldcv                                    *
+ ******************************************************************************/
+
+// __ldcv for long / unsigned long: sizeof(long) depends on the target ABI, so __LP64__ selects the 64-bit or 32-bit PTX access. The value is read into an unsigned long temporary and cast back for the signed overload.
+#if defined(__LP64__) // LP64: long is 64 bits -> .s64/.u64 with a 64-bit ("=l") output
+__SM_32_INTRINSICS_DECL__ long __ldcv(const long *ptr) { unsigned long ret; asm ("ld.global.cv.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcv(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.cv.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+#else // otherwise long is handled as 32 bits -> .s32/.u32 with a 32-bit ("=r") output
+__SM_32_INTRINSICS_DECL__ long __ldcv(const long *ptr) { unsigned long ret; asm ("ld.global.cv.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcv(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.cv.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+#endif
+
+// __ldcv, char/short/int/long long scalars and their vector types: one ld.global.cv each, with a "memory" clobber. Scalars are read into an unsigned temporary and cast back; 8-bit elements travel through 32-bit ("=r") temporaries and are narrowed with a cast.
+__SM_32_INTRINSICS_DECL__ char __ldcv(const char *ptr) { unsigned int ret; asm ("ld.global.cv.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (char)ret; }
+__SM_32_INTRINSICS_DECL__ signed char __ldcv(const signed char *ptr) { unsigned int ret; asm ("ld.global.cv.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (signed char)ret; }
+__SM_32_INTRINSICS_DECL__ short __ldcv(const short *ptr) { unsigned short ret; asm ("ld.global.cv.s16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return (short)ret; }
+__SM_32_INTRINSICS_DECL__ int __ldcv(const int *ptr) { unsigned int ret; asm ("ld.global.cv.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (int)ret; }
+__SM_32_INTRINSICS_DECL__ long long __ldcv(const long long *ptr) { unsigned long long ret; asm ("ld.global.cv.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long long)ret; }
+__SM_32_INTRINSICS_DECL__ char2 __ldcv(const char2 *ptr) { char2 ret; int2 tmp; asm ("ld.global.cv.v2.s8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ char4 __ldcv(const char4 *ptr) { char4 ret; int4 tmp; asm ("ld.global.cv.v4.s8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ short2 __ldcv(const short2 *ptr) { short2 ret; asm ("ld.global.cv.v2.s16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ short4 __ldcv(const short4 *ptr) { short4 ret; asm ("ld.global.cv.v4.s16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ int2 __ldcv(const int2 *ptr) { int2 ret; asm ("ld.global.cv.v2.s32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ int4 __ldcv(const int4 *ptr) { int4 ret; asm ("ld.global.cv.v4.s32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldcv(const longlong2 *ptr) { longlong2 ret; asm ("ld.global.cv.v2.s64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+// __ldcv, unsigned scalars and vectors: one ld.global.cv each with .uN PTX types and a "memory" clobber; 8-bit elements are loaded into 32-bit ("=r") temporaries and narrowed with a cast, wider elements are written straight into ret.
+__SM_32_INTRINSICS_DECL__ unsigned char __ldcv(const unsigned char *ptr) { unsigned int ret; asm ("ld.global.cv.u8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory");  return (unsigned char)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldcv(const unsigned short *ptr) { unsigned short ret; asm ("ld.global.cv.u16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldcv(const unsigned int *ptr) { unsigned int ret; asm ("ld.global.cv.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldcv(const unsigned long long *ptr) { unsigned long long ret; asm ("ld.global.cv.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldcv(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm ("ld.global.cv.v2.u8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldcv(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm ("ld.global.cv.v4.u8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldcv(const ushort2 *ptr) { ushort2 ret; asm ("ld.global.cv.v2.u16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldcv(const ushort4 *ptr) { ushort4 ret; asm ("ld.global.cv.v4.u16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldcv(const uint2 *ptr) { uint2 ret; asm ("ld.global.cv.v2.u32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldcv(const uint4 *ptr) { uint4 ret; asm ("ld.global.cv.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcv(const ulonglong2 *ptr) { ulonglong2 ret; asm ("ld.global.cv.v2.u64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+// __ldcv, float/double scalars and vectors: one ld.global.cv each with a "memory" clobber; results are written straight into ret through "=f" (float) / "=d" (double) outputs.
+__SM_32_INTRINSICS_DECL__ float __ldcv(const float *ptr) { float ret; asm ("ld.global.cv.f32 %0, [%1];"  : "=f"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ double __ldcv(const double *ptr) { double ret; asm ("ld.global.cv.f64 %0, [%1];"  : "=d"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ float2 __ldcv(const float2 *ptr) { float2 ret; asm ("ld.global.cv.v2.f32 {%0,%1}, [%2];"  : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ float4 __ldcv(const float4 *ptr) { float4 ret; asm ("ld.global.cv.v4.f32 {%0,%1,%2,%3}, [%4];"  : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ double2 __ldcv(const double2 *ptr) { double2 ret; asm ("ld.global.cv.v2.f64 {%0,%1}, [%2];"  : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+
+/******************************************************************************
+ *                                   __stwb                                    *
+ ******************************************************************************/
+
+// __stwb for long / unsigned long: sizeof(long) depends on the target ABI, so __LP64__ selects the 64-bit or 32-bit PTX store. Each overload issues one st.global.wb with a "memory" clobber.
+#if defined(__LP64__) // LP64: long is 64 bits -> .s64/.u64 with a 64-bit ("l") value operand
+__SM_32_INTRINSICS_DECL__ void __stwb(long *ptr, long value) { asm ("st.global.wb.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long *ptr, unsigned long value) { asm ("st.global.wb.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+#else // otherwise long is handled as 32 bits -> .s32/.u32 with a 32-bit ("r") value operand
+__SM_32_INTRINSICS_DECL__ void __stwb(long *ptr, long value) { asm ("st.global.wb.s32 [%0], %1;"  :: __LDG_PTR (ptr),  "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long *ptr, unsigned long value) { asm ("st.global.wb.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+#endif
+
+// __stwb, char/short/int/long long scalars and their vector types: one st.global.wb each, with a "memory" clobber. 8-bit values are widened to int first because they are passed through 32-bit ("r") operands.
+__SM_32_INTRINSICS_DECL__ void __stwb(char *ptr, char value) { asm ("st.global.wb.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(signed char *ptr, signed char value) { asm ("st.global.wb.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(short *ptr, short value) { asm ("st.global.wb.s16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(int *ptr, int value) { asm ("st.global.wb.s32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(long long *ptr, long long value) { asm ("st.global.wb.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.wb.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wb.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(short2 *ptr, short2 value) { asm ("st.global.wb.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(short4 *ptr, short4 value) { asm ("st.global.wb.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(int2 *ptr, int2 value) { asm ("st.global.wb.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(int4 *ptr, int4 value) { asm ("st.global.wb.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(longlong2 *ptr, longlong2 value) { asm ("st.global.wb.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+// __stwb, unsigned scalars and vectors: one st.global.wb each with .uN PTX types and a "memory" clobber; 8-bit values are widened to int for the 32-bit ("r") operands.
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned char *ptr, unsigned char value) { asm ("st.global.wb.u8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory");  }
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned short *ptr, unsigned short value) { asm ("st.global.wb.u16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned int *ptr, unsigned int value) { asm ("st.global.wb.u32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long long *ptr, unsigned long long value) { asm ("st.global.wb.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(uchar2 *ptr, uchar2 value) { const int x = value.x, y = value.y; asm ("st.global.wb.v2.u8 [%0], {%1,%2};"  :: __LDG_PTR (ptr),  "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wb.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(ushort2 *ptr, ushort2 value) { asm ("st.global.wb.v2.u16 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(ushort4 *ptr, ushort4 value) { asm ("st.global.wb.v4.u16 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(uint2 *ptr, uint2 value) { asm ("st.global.wb.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(uint4 *ptr, uint4 value) { asm ("st.global.wb.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.wb.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+// __stwb, float/double scalars and vectors: one st.global.wb each with a "memory" clobber; components are passed as "f" (float) / "d" (double) operands.
+__SM_32_INTRINSICS_DECL__ void __stwb(float *ptr, float value) { asm ("st.global.wb.f32 [%0], %1;"  :: __LDG_PTR (ptr), "f"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(double *ptr, double value) { asm ("st.global.wb.f64 [%0], %1;"  :: __LDG_PTR (ptr), "d"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(float2 *ptr, float2 value) { asm ("st.global.wb.v2.f32 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(float4 *ptr, float4 value) { asm ("st.global.wb.v4.f32 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(double2 *ptr, double2 value) { asm ("st.global.wb.v2.f64 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); }
+
+/******************************************************************************
+ *                                   __stcg                                    *
+ ******************************************************************************/
+
+// __stcg for long / unsigned long: sizeof(long) depends on the target ABI, so __LP64__ selects the 64-bit or 32-bit PTX store. Each overload issues one st.global.cg with a "memory" clobber.
+#if defined(__LP64__) // LP64: long is 64 bits -> .s64/.u64 with a 64-bit ("l") value operand
+__SM_32_INTRINSICS_DECL__ void __stcg(long *ptr, long value) { asm ("st.global.cg.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long *ptr, unsigned long value) { asm ("st.global.cg.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+#else // otherwise long is handled as 32 bits -> .s32/.u32 with a 32-bit ("r") value operand
+__SM_32_INTRINSICS_DECL__ void __stcg(long *ptr, long value) { asm ("st.global.cg.s32 [%0], %1;"  :: __LDG_PTR (ptr),  "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long *ptr, unsigned long value) { asm ("st.global.cg.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+#endif
+
+// __stcg, char/short/int/long long scalars and their vector types: one st.global.cg each, with a "memory" clobber. 8-bit values are widened to int first because they are passed through 32-bit ("r") operands.
+__SM_32_INTRINSICS_DECL__ void __stcg(char *ptr, char value) { asm ("st.global.cg.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(signed char *ptr, signed char value) { asm ("st.global.cg.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(short *ptr, short value) { asm ("st.global.cg.s16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(int *ptr, int value) { asm ("st.global.cg.s32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(long long *ptr, long long value) { asm ("st.global.cg.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.cg.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cg.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(short2 *ptr, short2 value) { asm ("st.global.cg.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(short4 *ptr, short4 value) { asm ("st.global.cg.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(int2 *ptr, int2 value) { asm ("st.global.cg.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(int4 *ptr, int4 value) { asm ("st.global.cg.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(longlong2 *ptr, longlong2 value) { asm ("st.global.cg.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+// __stcg, unsigned scalars and vectors: one st.global.cg each with .uN PTX types and a "memory" clobber; 8-bit values are widened to int for the 32-bit ("r") operands.
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned char *ptr, unsigned char value) { asm ("st.global.cg.u8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory");  }
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned short *ptr, unsigned short value) { asm ("st.global.cg.u16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned int *ptr, unsigned int value) { asm ("st.global.cg.u32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long long *ptr, unsigned long long value) { asm ("st.global.cg.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(uchar2 *ptr, uchar2 value) { const int x = value.x, y = value.y; asm ("st.global.cg.v2.u8 [%0], {%1,%2};"  :: __LDG_PTR (ptr),  "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cg.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(ushort2 *ptr, ushort2 value) { asm ("st.global.cg.v2.u16 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(ushort4 *ptr, ushort4 value) { asm ("st.global.cg.v4.u16 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(uint2 *ptr, uint2 value) { asm ("st.global.cg.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(uint4 *ptr, uint4 value) { asm ("st.global.cg.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.cg.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+// __stcg, float/double scalars and vectors: one st.global.cg each with a "memory" clobber; components are passed as "f" (float) / "d" (double) operands.
+__SM_32_INTRINSICS_DECL__ void __stcg(float *ptr, float value) { asm ("st.global.cg.f32 [%0], %1;"  :: __LDG_PTR (ptr), "f"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(double *ptr, double value) { asm ("st.global.cg.f64 [%0], %1;"  :: __LDG_PTR (ptr), "d"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(float2 *ptr, float2 value) { asm ("st.global.cg.v2.f32 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(float4 *ptr, float4 value) { asm ("st.global.cg.v4.f32 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(double2 *ptr, double2 value) { asm ("st.global.cg.v2.f64 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); }
+
+/******************************************************************************
+ *                                   __stcs                                    *
+ ******************************************************************************/
+
+// __stcs for long / unsigned long: sizeof(long) depends on the target ABI, so __LP64__ selects the 64-bit or 32-bit PTX store. Each overload issues one st.global.cs with a "memory" clobber.
+#if defined(__LP64__) // LP64: long is 64 bits -> .s64/.u64 with a 64-bit ("l") value operand
+__SM_32_INTRINSICS_DECL__ void __stcs(long *ptr, long value) { asm ("st.global.cs.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long *ptr, unsigned long value) { asm ("st.global.cs.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+#else // otherwise long is handled as 32 bits -> .s32/.u32 with a 32-bit ("r") value operand
+__SM_32_INTRINSICS_DECL__ void __stcs(long *ptr, long value) { asm ("st.global.cs.s32 [%0], %1;"  :: __LDG_PTR (ptr),  "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long *ptr, unsigned long value) { asm ("st.global.cs.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+#endif
+
+// __stcs, char/short/int/long long scalars and their vector types: one st.global.cs each, with a "memory" clobber. 8-bit values are widened to int first because they are passed through 32-bit ("r") operands.
+__SM_32_INTRINSICS_DECL__ void __stcs(char *ptr, char value) { asm ("st.global.cs.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(signed char *ptr, signed char value) { asm ("st.global.cs.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(short *ptr, short value) { asm ("st.global.cs.s16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(int *ptr, int value) { asm ("st.global.cs.s32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(long long *ptr, long long value) { asm ("st.global.cs.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.cs.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cs.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(short2 *ptr, short2 value) { asm ("st.global.cs.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(short4 *ptr, short4 value) { asm ("st.global.cs.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(int2 *ptr, int2 value) { asm ("st.global.cs.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(int4 *ptr, int4 value) { asm ("st.global.cs.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(longlong2 *ptr, longlong2 value) { asm ("st.global.cs.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+// __stcs, unsigned scalars and vectors: one st.global.cs each with .uN PTX types and a "memory" clobber; 8-bit values are widened to int for the 32-bit ("r") operands.
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned char *ptr, unsigned char value) { asm ("st.global.cs.u8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory");  }
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned short *ptr, unsigned short value) { asm ("st.global.cs.u16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned int *ptr, unsigned int value) { asm ("st.global.cs.u32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long long *ptr, unsigned long long value) { asm ("st.global.cs.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(uchar2 *ptr, uchar2 value) { const int x = value.x, y = value.y; asm ("st.global.cs.v2.u8 [%0], {%1,%2};"  :: __LDG_PTR (ptr),  "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cs.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(ushort2 *ptr, ushort2 value) { asm ("st.global.cs.v2.u16 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(ushort4 *ptr, ushort4 value) { asm ("st.global.cs.v4.u16 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(uint2 *ptr, uint2 value) { asm ("st.global.cs.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(uint4 *ptr, uint4 value) { asm ("st.global.cs.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.cs.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+// __stcs, float/double scalars and vectors: one st.global.cs each with a "memory" clobber; components are passed as "f" (float) / "d" (double) operands.
+__SM_32_INTRINSICS_DECL__ void __stcs(float *ptr, float value) { asm ("st.global.cs.f32 [%0], %1;"  :: __LDG_PTR (ptr), "f"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(double *ptr, double value) { asm ("st.global.cs.f64 [%0], %1;"  :: __LDG_PTR (ptr), "d"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(float2 *ptr, float2 value) { asm ("st.global.cs.v2.f32 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(float4 *ptr, float4 value) { asm ("st.global.cs.v4.f32 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(double2 *ptr, double2 value) { asm ("st.global.cs.v2.f64 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); }
+
+/******************************************************************************
+ *                                   __stwt                                    *
+ ******************************************************************************/
+
+// __stwt for long / unsigned long: sizeof(long) depends on the target ABI, so __LP64__ selects the 64-bit or 32-bit PTX store. Each overload issues one st.global.wt with a "memory" clobber.
+#if defined(__LP64__) // LP64: long is 64 bits -> .s64/.u64 with a 64-bit ("l") value operand
+__SM_32_INTRINSICS_DECL__ void __stwt(long *ptr, long value) { asm ("st.global.wt.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long *ptr, unsigned long value) { asm ("st.global.wt.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+#else // otherwise long is handled as 32 bits -> .s32/.u32 with a 32-bit ("r") value operand
+__SM_32_INTRINSICS_DECL__ void __stwt(long *ptr, long value) { asm ("st.global.wt.s32 [%0], %1;"  :: __LDG_PTR (ptr),  "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long *ptr, unsigned long value) { asm ("st.global.wt.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+#endif
+
+
+__SM_32_INTRINSICS_DECL__ void __stwt(char *ptr, char value) { asm ("st.global.wt.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(signed char *ptr, signed char value) { asm ("st.global.wt.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(short *ptr, short value) { asm ("st.global.wt.s16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(int *ptr, int value) { asm ("st.global.wt.s32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(long long *ptr, long long value) { asm ("st.global.wt.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.wt.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wt.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(short2 *ptr, short2 value) { asm ("st.global.wt.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(short4 *ptr, short4 value) { asm ("st.global.wt.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(int2 *ptr, int2 value) { asm ("st.global.wt.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(int4 *ptr, int4 value) { asm ("st.global.wt.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(longlong2 *ptr, longlong2 value) { asm ("st.global.wt.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned char *ptr, unsigned char value) { asm ("st.global.wt.u8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory");  }
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned short *ptr, unsigned short value) { asm ("st.global.wt.u16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned int *ptr, unsigned int value) { asm ("st.global.wt.u32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long long *ptr, unsigned long long value) { asm ("st.global.wt.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(uchar2 *ptr, uchar2 value) { const int x = value.x, y = value.y; asm ("st.global.wt.v2.u8 [%0], {%1,%2};"  :: __LDG_PTR (ptr),  "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wt.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(ushort2 *ptr, ushort2 value) { asm ("st.global.wt.v2.u16 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(ushort4 *ptr, ushort4 value) { asm ("st.global.wt.v4.u16 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(uint2 *ptr, uint2 value) { asm ("st.global.wt.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(uint4 *ptr, uint4 value) { asm ("st.global.wt.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.wt.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+
+__SM_32_INTRINSICS_DECL__ void __stwt(float *ptr, float value) { asm ("st.global.wt.f32 [%0], %1;"  :: __LDG_PTR (ptr), "f"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(double *ptr, double value) { asm ("st.global.wt.f64 [%0], %1;"  :: __LDG_PTR (ptr), "d"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(float2 *ptr, float2 value) { asm ("st.global.wt.v2.f32 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(float4 *ptr, float4 value) { asm ("st.global.wt.v4.f32 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(double2 *ptr, double2 value) { asm ("st.global.wt.v2.f64 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); }
+
+#undef __LDG_PTR
+
+
+// SHF is the "funnel shift" operation - an accelerated left/right shift with carry
+// operating on 64-bit quantities, which are concatenations of two 32-bit registers.
+
+// This shifts [hi:lo] left by "shift" bits, returning the most significant bits of the result.
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
+{
+    unsigned int ret;
+    asm volatile ("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift)); // .wrap variant of the PTX left funnel shift
+    return ret;
+}
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)
+{
+    unsigned int ret;
+    asm volatile ("shf.l.clamp.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift)); // left funnel shift of [hi:lo], PTX .clamp variant
+    return ret;
+}
+
+// This shifts [hi:lo] right by "shift" bits, returning the least significant bits of the result.
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)
+{
+    unsigned int ret;
+    asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift)); // .wrap variant of the PTX right funnel shift
+    return ret;
+}
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)
+{
+    unsigned int ret;
+    asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift)); // right funnel shift of [hi:lo], PTX .clamp variant
+    return ret;
+}
+
+
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_32_INTRINSICS_DECL__
+
+#endif /* !__SM_32_INTRINSICS_HPP__ */
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_35_atomic_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_35_atomic_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8961079aeac4c9e73a7c2825cf9ea10b171af09
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_35_atomic_functions.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_35_ATOMIC_FUNCTIONS_H__)
+#define __SM_35_ATOMIC_FUNCTIONS_H__
+
+/*******************************************************************************
+* All sm_35 atomics are supported by sm_32 so simply include its header file   *
+*******************************************************************************/
+#include "sm_32_atomic_functions.h"
+
+#endif /* !__SM_35_ATOMIC_FUNCTIONS_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_35_intrinsics.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_35_intrinsics.h
new file mode 100644
index 0000000000000000000000000000000000000000..a13a4fbb0133ea5ed9f2fcc317292ae3fe5397af
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_35_intrinsics.h
@@ -0,0 +1,106 @@
+/*
+
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+
+ *
+
+ * NOTICE TO LICENSEE:
+
+ *
+
+ * This source code and/or documentation ("Licensed Deliverables") are
+
+ * subject to NVIDIA intellectual property rights under U.S. and
+
+ * international Copyright laws.
+
+ *
+
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+
+ * conditions of a form of NVIDIA software license agreement by and
+
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+
+ * the contrary in the License Agreement, reproduction or disclosure
+
+ * of the Licensed Deliverables to any third party without the express
+
+ * written consent of NVIDIA is prohibited.
+
+ *
+
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+
+ * OF THESE LICENSED DELIVERABLES.
+
+ *
+
+ * U.S. Government End Users.  These Licensed Deliverables are a
+
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+
+ * 1995), consisting of "commercial computer software" and "commercial
+
+ * computer software documentation" as such terms are used in 48
+
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+
+ * U.S. Government End Users acquire the Licensed Deliverables with
+
+ * only those rights set forth herein.
+
+ *
+
+ * Any use of the Licensed Deliverables in individual and commercial
+
+ * software must include, in the user documentation and internal
+
+ * comments to the code, the above Disclaimer and U.S. Government End
+
+ * Users Notice.
+
+ */
+
+
+
+#if !defined(__SM_35_INTRINSICS_H__)
+#define __SM_35_INTRINSICS_H__
+
+
+
+
+#endif /* !__SM_35_INTRINSICS_H__ */
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_60_atomic_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_60_atomic_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..53d607cae6467244b2f99bd891632b8679828b54
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_60_atomic_functions.h
@@ -0,0 +1,330 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+#if !defined(__SM_60_ATOMIC_FUNCTIONS_H__)
+#define __SM_60_ATOMIC_FUNCTIONS_H__
+
+
+#if defined(__CUDACC_RTC__) /* plain __device__ declarations */
+#define __SM_60_ATOMIC_FUNCTIONS_DECL__ __device__
+#elif defined(_NVHPC_CUDA) /* extern __device__ builtins */
+#define __SM_60_ATOMIC_FUNCTIONS_DECL__ extern __device__ __cudart_builtin__
+#else /* !__CUDACC_RTC__ && !_NVHPC_CUDA: static inline __device__ functions */
+#define __SM_60_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ / _NVHPC_CUDA */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/* __DEF_IF_HOST terminates each declaration below: an empty body "{ }" when __CUDA_ARCH__ is
+ * undefined, otherwise ";". _NVHPC_CUDA also selects ";" since there __CUDA_ARCH__ is not defined. */
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* __CUDA_ARCH__ || _NVHPC_CUDA */
+#define __DEF_IF_HOST ;
+#endif /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
+
+
+
+/*******************************************************************************
+*                                                                              *
+*   Declarations: atomicAdd(double) and the _block / _system atomic variants   *
+*                                                                              *
+*******************************************************************************/
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ double atomicAdd(double *address, double val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAdd_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAdd_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAdd_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAdd_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAdd_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAdd_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicAdd_block(float *address, float val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicAdd_system(float *address, float val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+double atomicAdd_block(double *address, double val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+double atomicAdd_system(double *address, double val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicSub_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicSub_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicSub_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicSub_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicExch_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicExch_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicExch_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicExch_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicExch_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicExch_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicExch_block(float *address, float val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicExch_system(float *address, float val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMin_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMin_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMin_block(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMin_system(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMin_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMin_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMin_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMin_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMax_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMax_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMax_block(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMax_system(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMax_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMax_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMax_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMax_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicInc_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicInc_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicDec_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicDec_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicCAS_block(int *address, int compare, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicCAS_system(int *address, int compare, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicCAS_block(unsigned int *address, unsigned int compare,
+                             unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicCAS_system(unsigned int *address, unsigned int compare,
+                              unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long int atomicCAS_block(unsigned long long int *address,
+                                       unsigned long long int compare,
+                                       unsigned long long int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long int atomicCAS_system(unsigned long long int *address,
+                                        unsigned long long int compare,
+                                        unsigned long long int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAnd_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAnd_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicAnd_block(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicAnd_system(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAnd_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAnd_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAnd_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAnd_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicOr_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicOr_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicOr_block(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicOr_system(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicOr_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicOr_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicOr_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicOr_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicXor_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicXor_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicXor_block(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicXor_system(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicXor_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicXor_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicXor_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicXor_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 600 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_60_ATOMIC_FUNCTIONS_DECL__
+#undef __DEF_IF_HOST
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "sm_60_atomic_functions.hpp"
+#endif /* !__CUDACC_RTC__  && defined(__CUDA_ARCH__)  */
+
+#endif /* !__SM_60_ATOMIC_FUNCTIONS_H__ */
+
+#undef EXCLUDE_FROM_RTC
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_60_atomic_functions.hpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_60_atomic_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1d6ac004cd92d3af9281143123289bc2353dd494
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_60_atomic_functions.hpp
@@ -0,0 +1,742 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_60_ATOMIC_FUNCTIONS_HPP__)
+#define __SM_60_ATOMIC_FUNCTIONS_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_60_ATOMIC_FUNCTIONS_DECL__ __device__
+#else /* __CUDACC_RTC__ */
+#define __SM_60_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+extern "C"
+{ /* Declarations only: no bodies are provided in this header. The atomic* wrappers defined further down forward to these symbols. */
+extern __device__ __device_builtin__ double __dAtomicAdd(double *address, double val);
+
+extern __device__ __device_builtin__
+int __iAtomicAdd_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicAdd_system(int *address, int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicAdd_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicAdd_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicAdd_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicAdd_system(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+float __fAtomicAdd_block(float *address, float val);
+
+extern __device__ __device_builtin__
+float __fAtomicAdd_system(float *address, float val);
+
+extern __device__ __device_builtin__
+double __dAtomicAdd_block(double *address, double val);
+
+extern __device__ __device_builtin__
+double __dAtomicAdd_system(double *address, double val);
+
+extern __device__ __device_builtin__
+int __iAtomicExch_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicExch_system(int *address, int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicExch_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicExch_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicExch_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicExch_system(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+float __fAtomicExch_block(float *address, float val);
+
+extern __device__ __device_builtin__
+float __fAtomicExch_system(float *address, float val);
+
+extern __device__ __device_builtin__
+int __iAtomicMin_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicMin_system(int *address, int val);
+
+extern __device__ __device_builtin__
+long long __illAtomicMin_block(long long *address, long long val);
+
+extern __device__ __device_builtin__
+long long __illAtomicMin_system(long long *address, long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicMin_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicMin_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicMin_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicMin_system(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+int __iAtomicMax_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicMax_system(int *address, int val);
+
+extern __device__ __device_builtin__
+long long __illAtomicMax_block(long long *address, long long val);
+
+extern __device__ __device_builtin__
+long long __illAtomicMax_system(long long *address, long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicMax_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicMax_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicMax_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicMax_system(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicInc_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicInc_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicDec_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicDec_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+int __iAtomicCAS_block(int *address, int compare, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicCAS_system(int *address, int compare, int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicCAS_block(unsigned int *address, unsigned int compare,
+                                unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicCAS_system(unsigned int *address, unsigned int compare,
+                                 unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicCAS_block(unsigned long long int *address,
+                                        unsigned long long int compare,
+                                        unsigned long long int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicCAS_system(unsigned long long int *address,
+                                         unsigned long long int compare,
+                                         unsigned long long int val);
+
+extern __device__ __device_builtin__
+int __iAtomicAnd_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicAnd_system(int *address, int val);
+
+extern __device__ __device_builtin__
+long long __llAtomicAnd_block(long long *address, long long val);
+
+extern __device__ __device_builtin__
+long long __llAtomicAnd_system(long long *address, long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicAnd_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicAnd_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicAnd_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicAnd_system(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+int __iAtomicOr_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicOr_system(int *address, int val);
+
+extern __device__ __device_builtin__
+long long __llAtomicOr_block(long long *address, long long val);
+
+extern __device__ __device_builtin__
+long long __llAtomicOr_system(long long *address, long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicOr_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicOr_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicOr_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicOr_system(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+int __iAtomicXor_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicXor_system(int *address, int val);
+
+extern __device__ __device_builtin__
+long long __llAtomicXor_block(long long *address, long long val);
+
+extern __device__ __device_builtin__
+long long __llAtomicXor_system(long long *address, long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicXor_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicXor_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicXor_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicXor_system(unsigned long long *address, unsigned long long val);
+} /* extern "C" */
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ double atomicAdd(double *address, double val)
+{ /* Forwards to the __dAtomicAdd symbol declared in the extern "C" block above. */
+  return __dAtomicAdd(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* atomicAdd_block / atomicAdd_system: each overload passes its arguments unchanged to the type-matched __*AtomicAdd_* symbol and returns its result. */
+int atomicAdd_block(int *address, int val)
+{
+  return __iAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAdd_system(int *address, int val)
+{
+  return __iAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAdd_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAdd_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAdd_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAdd_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicAdd_block(float *address, float val)
+{
+  return __fAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicAdd_system(float *address, float val)
+{
+  return __fAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+double atomicAdd_block(double *address, double val)
+{
+  return __dAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+double atomicAdd_system(double *address, double val)
+{
+  return __dAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* atomicSub_block / atomicSub_system: no __*AtomicSub_* symbol is declared in this header, so subtraction is expressed as an atomic add of the negated operand. */
+int atomicSub_block(int *address, int val)
+{
+  return __iAtomicAdd_block(address, (unsigned int)-(int)val); /* the unsigned cast result converts back to int at the call, since the callee takes int */
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicSub_system(int *address, int val)
+{
+  return __iAtomicAdd_system(address, (unsigned int)-(int)val); /* same negate-then-add form as atomicSub_block(int*, int) */
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicSub_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd_block(address, (unsigned int)-(int)val); /* val is negated as an int, then cast back to unsigned before the add */
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicSub_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd_system(address, (unsigned int)-(int)val); /* same negate-then-add form as atomicSub_block(unsigned int*, unsigned int) */
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* atomicExch_block / atomicExch_system: each overload passes its arguments unchanged to the type-matched __*AtomicExch_* symbol and returns its result. */
+int atomicExch_block(int *address, int val)
+{
+  return __iAtomicExch_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicExch_system(int *address, int val)
+{
+  return __iAtomicExch_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicExch_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicExch_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicExch_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicExch_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicExch_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicExch_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicExch_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicExch_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicExch_block(float *address, float val)
+{
+  return __fAtomicExch_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicExch_system(float *address, float val)
+{
+  return __fAtomicExch_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* atomicMin_block / atomicMin_system: each overload passes its arguments unchanged to the type-matched __*AtomicMin_* symbol and returns its result. */
+int atomicMin_block(int *address, int val)
+{
+  return __iAtomicMin_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMin_system(int *address, int val)
+{
+  return __iAtomicMin_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMin_block(long long *address, long long val)
+{
+  return __illAtomicMin_block(address, val); /* signed 64-bit Min uses the __ill prefix; And/Or/Xor in this header use __ll */
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMin_system(long long *address, long long val)
+{
+  return __illAtomicMin_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMin_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMin_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMin_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMin_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMin_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicMin_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMin_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicMin_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* atomicMax_block / atomicMax_system: each overload passes its arguments unchanged to the type-matched __*AtomicMax_* symbol and returns its result. */
+int atomicMax_block(int *address, int val)
+{
+  return __iAtomicMax_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMax_system(int *address, int val)
+{
+  return __iAtomicMax_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMax_block(long long *address, long long val)
+{
+  return __illAtomicMax_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMax_system(long long *address, long long val)
+{
+  return __illAtomicMax_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMax_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMax_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMax_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMax_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMax_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicMax_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMax_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicMax_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* atomicInc/atomicDec _block and _system: only unsigned int overloads are defined here; each forwards unchanged to the matching __uAtomicInc_* / __uAtomicDec_* symbol. */
+unsigned int atomicInc_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicInc_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicInc_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicInc_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicDec_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicDec_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicDec_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicDec_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* atomicCAS_block / atomicCAS_system: each overload passes (address, compare, val) unchanged to the type-matched __*AtomicCAS_* symbol and returns its result. */
+int atomicCAS_block(int *address, int compare, int val)
+{
+  return __iAtomicCAS_block(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicCAS_system(int *address, int compare, int val)
+{
+  return __iAtomicCAS_system(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicCAS_block(unsigned int *address, unsigned int compare,
+                             unsigned int val)
+{
+  return __uAtomicCAS_block(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicCAS_system(unsigned int *address, unsigned int compare,
+                              unsigned int val)
+{
+  return __uAtomicCAS_system(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long int atomicCAS_block(unsigned long long int *address,
+                                       unsigned long long int compare,
+                                       unsigned long long int val)
+{
+  return __ullAtomicCAS_block(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long int atomicCAS_system(unsigned long long int *address,
+                                        unsigned long long int compare,
+                                        unsigned long long int val)
+{
+  return __ullAtomicCAS_system(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* atomicAnd_block / atomicAnd_system: each overload passes its arguments unchanged to the type-matched __*AtomicAnd_* symbol and returns its result. */
+int atomicAnd_block(int *address, int val)
+{
+  return __iAtomicAnd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAnd_system(int *address, int val)
+{
+  return __iAtomicAnd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicAnd_block(long long *address, long long val)
+{
+  return __llAtomicAnd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicAnd_system(long long *address, long long val)
+{
+  return __llAtomicAnd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAnd_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAnd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAnd_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAnd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAnd_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicAnd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAnd_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicAnd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* atomicOr_block / atomicOr_system: each overload passes its arguments unchanged to the type-matched __*AtomicOr_* symbol and returns its result. */
+int atomicOr_block(int *address, int val)
+{
+  return __iAtomicOr_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicOr_system(int *address, int val)
+{
+  return __iAtomicOr_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicOr_block(long long *address, long long val)
+{
+  return __llAtomicOr_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicOr_system(long long *address, long long val)
+{
+  return __llAtomicOr_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicOr_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicOr_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicOr_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicOr_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicOr_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicOr_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicOr_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicOr_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* atomicXor_block / atomicXor_system: each overload passes its arguments unchanged to the type-matched __*AtomicXor_* symbol and returns its result. */
+int atomicXor_block(int *address, int val)
+{
+  return __iAtomicXor_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicXor_system(int *address, int val)
+{
+  return __iAtomicXor_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicXor_block(long long *address, long long val)
+{
+  return __llAtomicXor_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicXor_system(long long *address, long long val)
+{
+  return __llAtomicXor_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicXor_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicXor_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicXor_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicXor_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicXor_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicXor_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicXor_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicXor_system(address, val);
+}
+
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 600 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_60_ATOMIC_FUNCTIONS_DECL__
+
+#endif /* !__SM_60_ATOMIC_FUNCTIONS_HPP__ */
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_61_intrinsics.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_61_intrinsics.h
new file mode 100644
index 0000000000000000000000000000000000000000..40dbe98ac42483b53f96d27280e621608ca24094
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_61_intrinsics.h
@@ -0,0 +1,239 @@
+/*
+ * Copyright 2016-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+#if !defined(__SM_61_INTRINSICS_H__)
+#define __SM_61_INTRINSICS_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_61_INTRINSICS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 610
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ || _NVHPC_CUDA */
+
+/*******************************************************************************
+*                                                                              *
+*  Below are declarations of SM-6.1 intrinsics which are included as           *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+
+
+/******************************************************************************
+ *                                   __dp2a                                   *
+ ******************************************************************************/
+// Generic [_lo]
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Two-way \p signed \p int16 by \p int8 dot product with \p int32 accumulate,
+ * taking the lower half of the second input.
+ *
+ * \details Extracts two packed 16-bit integers from \p srcA
+ * and two packed 8-bit integers from the lower 16 bits of \p srcB,
+ * then creates two pairwise 8x16 products and adds them together
+ * to a signed 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ int __dp2a_lo(int srcA, int srcB, int c) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Two-way \p unsigned \p int16 by \p int8 dot product with
+ * \p unsigned \p int32 accumulate, taking the lower half of the second input.
+ *
+ * \details Extracts two packed 16-bit integers from \p srcA
+ * and two packed 8-bit integers from the lower 16 bits of \p srcB,
+ * then creates two pairwise 8x16 products and adds them together
+ * to an unsigned 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(unsigned int srcA, unsigned int srcB, unsigned int c) __DEF_IF_HOST
+// Vector-style [_lo]
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Two-way \p signed \p int16 by \p int8 dot product with \p int32 accumulate,
+ * taking the lower half of the second input.
+ *
+ * \details Takes two packed 16-bit integers from \p srcA vector
+ * and two packed 8-bit integers from the lower 16 bits of \p srcB vector,
+ * then creates two pairwise 8x16 products and adds them together
+ * to a signed 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ int __dp2a_lo(short2 srcA, char4 srcB, int c) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Two-way \p unsigned \p int16 by \p int8 dot product with
+ * \p unsigned \p int32 accumulate, taking the lower half of the second input.
+ *
+ * \details Takes two packed 16-bit integers from \p srcA vector
+ * and two packed 8-bit integers from the lower 16 bits of \p srcB vector,
+ * then creates two pairwise 8x16 products and adds them together
+ * to an unsigned 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(ushort2 srcA, uchar4 srcB, unsigned int c) __DEF_IF_HOST
+// Generic [_hi]
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Two-way \p signed \p int16 by \p int8 dot product with \p int32 accumulate,
+ * taking the upper half of the second input.
+ *
+ * \details Extracts two packed 16-bit integers from \p srcA
+ * and two packed 8-bit integers from the upper 16 bits of \p srcB,
+ * then creates two pairwise 8x16 products and adds them together
+ * to a signed 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ int __dp2a_hi(int srcA, int srcB, int c) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Two-way \p unsigned \p int16 by \p int8 dot product with
+ * \p unsigned \p int32 accumulate, taking the upper half of the second input.
+ *
+ * \details Extracts two packed 16-bit integers from \p srcA
+ * and two packed 8-bit integers from the upper 16 bits of \p srcB,
+ * then creates two pairwise 8x16 products and adds them together
+ * to an unsigned 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(unsigned int srcA, unsigned int srcB, unsigned int c) __DEF_IF_HOST
+// Vector-style [_hi]
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Two-way \p signed \p int16 by \p int8 dot product with \p int32 accumulate,
+ * taking the upper half of the second input.
+ *
+ * \details Takes two packed 16-bit integers from \p srcA vector
+ * and two packed 8-bit integers from the upper 16 bits of \p srcB vector,
+ * then creates two pairwise 8x16 products and adds them together
+ * to a signed 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ int __dp2a_hi(short2 srcA, char4 srcB, int c) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Two-way \p unsigned \p int16 by \p int8 dot product with
+ * \p unsigned \p int32 accumulate, taking the upper half of the second input.
+ *
+ * \details Takes two packed 16-bit integers from \p srcA vector
+ * and two packed 8-bit integers from the upper 16 bits of \p srcB vector,
+ * then creates two pairwise 8x16 products and adds them together
+ * to an unsigned 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(ushort2 srcA, uchar4 srcB, unsigned int c) __DEF_IF_HOST
+
+
+/******************************************************************************
+ *                                   __dp4a                                   *
+ ******************************************************************************/
+// Generic
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Four-way \p signed \p int8 dot product with \p int32 accumulate.
+ *
+ * \details Extracts four pairs of packed byte-sized integers from \p srcA
+ * and \p srcB, then creates four pairwise products and adds them together
+ * to a signed 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, int srcB, int c) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Four-way \p unsigned \p int8 dot product with \p unsigned \p int32 accumulate.
+ *
+ * \details Extracts four pairs of packed byte-sized integers from \p srcA
+ * and \p srcB, then creates four pairwise products and adds them together
+ * to an unsigned 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(unsigned int srcA, unsigned int srcB, unsigned int c) __DEF_IF_HOST
+// Vector-style
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Four-way \p signed \p int8 dot product with \p int32 accumulate.
+ *
+ * \details Takes four pairs of packed byte-sized integers from \p srcA
+ * and \p srcB vectors, then creates four pairwise products and adds them
+ * together to a signed 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ int __dp4a(char4 srcA, char4 srcB, int c) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Four-way \p unsigned \p int8 dot product with \p unsigned \p int32 accumulate.
+ *
+ * \details Takes four pairs of packed byte-sized integers from \p srcA
+ * and \p srcB vectors, then creates four pairwise products and adds them
+ * together to an unsigned 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(uchar4 srcA, uchar4 srcB, unsigned int c) __DEF_IF_HOST
+
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 610 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_61_INTRINSICS_DECL__
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "sm_61_intrinsics.hpp"
+#endif /* !__CUDACC_RTC__ && __CUDA_ARCH__ */
+
+#endif /* !__SM_61_INTRINSICS_H__ */
+#undef EXCLUDE_FROM_RTC
\ No newline at end of file
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_61_intrinsics.hpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_61_intrinsics.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5a561384b08a65445eed86bfc96a0694e5b9190c
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_61_intrinsics.hpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright 2016 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_61_INTRINSICS_HPP__)
+#define __SM_61_INTRINSICS_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_61_INTRINSICS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 610
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*  Below are implementations of SM-6.1 intrinsics which are included as        *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+
+// 4a
+__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, int srcB, int c) {
+    int acc;  // output of PTX dp4a.s32.s32: four-way signed int8 dot product of srcA and srcB, accumulated onto c
+    asm volatile ("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(acc) : "r"(srcA), "r"(srcB), "r"(c));
+    return acc;
+}
+
+__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(unsigned int srcA, unsigned int srcB, unsigned int c) {
+    unsigned int acc;  // output of PTX dp4a.u32.u32: four-way unsigned int8 dot product of srcA and srcB, accumulated onto c
+    asm volatile ("dp4a.u32.u32 %0, %1, %2, %3;" : "=r"(acc) : "r"(srcA), "r"(srcB), "r"(c));
+    return acc;
+}
+
+__SM_61_INTRINSICS_DECL__ int __dp4a(char4 srcA, char4 srcB, int c) {
+    int acc;  // output of PTX dp4a.s32.s32; each char4 operand is reinterpreted in place as one packed int for the instruction
+    asm volatile ("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(acc) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c));
+    return acc;
+}
+
+__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(uchar4 srcA, uchar4 srcB, unsigned int c) {
+    unsigned int acc;  // output of PTX dp4a.u32.u32; each uchar4 operand is reinterpreted in place as one packed unsigned int
+    asm volatile ("dp4a.u32.u32 %0, %1, %2, %3;" : "=r"(acc) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c));
+    return acc;
+}
+
+// 2a.lo
+__SM_61_INTRINSICS_DECL__ int __dp2a_lo(int srcA, int srcB, int c) {
+    int acc;  // output of the PTX dp2a.lo.s32.s32 instruction applied to (srcA, srcB, c)
+    asm volatile ("dp2a.lo.s32.s32 %0, %1, %2, %3;" : "=r"(acc) : "r"(srcA), "r"(srcB), "r"(c));
+    return acc;
+}
+
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(unsigned int srcA, unsigned int srcB, unsigned int c) {
+    unsigned int acc;  // output of the PTX dp2a.lo.u32.u32 instruction applied to (srcA, srcB, c)
+    asm volatile ("dp2a.lo.u32.u32 %0, %1, %2, %3;" : "=r"(acc) : "r"(srcA), "r"(srcB), "r"(c));
+    return acc;
+}
+
+__SM_61_INTRINSICS_DECL__ int __dp2a_lo(short2 srcA, char4 srcB, int c) {
+    int acc;  // output of PTX dp2a.lo.s32.s32; the short2 and char4 operands are each reinterpreted in place as one packed int
+    asm volatile ("dp2a.lo.s32.s32 %0, %1, %2, %3;" : "=r"(acc) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c));
+    return acc;
+}
+
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(ushort2 srcA, uchar4 srcB, unsigned int c) {
+    unsigned int acc;  // output of PTX dp2a.lo.u32.u32; the ushort2 and uchar4 operands are each reinterpreted as one packed unsigned int
+    asm volatile ("dp2a.lo.u32.u32 %0, %1, %2, %3;" : "=r"(acc) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c));
+    return acc;
+}
+
+// 2a.hi
+__SM_61_INTRINSICS_DECL__ int __dp2a_hi(int srcA, int srcB, int c) {
+    int acc;  // output of the PTX dp2a.hi.s32.s32 instruction applied to (srcA, srcB, c)
+    asm volatile ("dp2a.hi.s32.s32 %0, %1, %2, %3;" : "=r"(acc) : "r"(srcA), "r"(srcB), "r"(c));
+    return acc;
+}
+
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(unsigned int srcA, unsigned int srcB, unsigned int c) {
+    unsigned int acc;  // output of the PTX dp2a.hi.u32.u32 instruction applied to (srcA, srcB, c)
+    asm volatile ("dp2a.hi.u32.u32 %0, %1, %2, %3;" : "=r"(acc) : "r"(srcA), "r"(srcB), "r"(c));
+    return acc;
+}
+
+__SM_61_INTRINSICS_DECL__ int __dp2a_hi(short2 srcA, char4 srcB, int c) {
+    int acc;  // output of PTX dp2a.hi.s32.s32; the short2 and char4 operands are each reinterpreted in place as one packed int
+    asm volatile ("dp2a.hi.s32.s32 %0, %1, %2, %3;" : "=r"(acc) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c));
+    return acc;
+}
+
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(ushort2 srcA, uchar4 srcB, unsigned int c) {
+    unsigned int acc;  // output of PTX dp2a.hi.u32.u32; the ushort2 and uchar4 operands are each reinterpreted as one packed unsigned int
+    asm volatile ("dp2a.hi.u32.u32 %0, %1, %2, %3;" : "=r"(acc) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c));
+    return acc;
+}
+
+
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 610 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_61_INTRINSICS_DECL__
+
+#endif /* !__SM_61_INTRINSICS_HPP__ */
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/surface_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/surface_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..2fb940c1d2bd5ee7b4a5020e12297bc2927e0386
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/surface_functions.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright 1993-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SURFACE_FUNCTIONS_H__)
+#define __SURFACE_FUNCTIONS_H__
+
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+#include "cuda_surface_types.h"
+
+#if defined(_WIN32)
+# define __DEPRECATED__ __declspec(deprecated)
+#else
+# define __DEPRECATED__  __attribute__((deprecated))
+#endif
+
+template <typename T> struct __nv_surf_trait {  typedef void * cast_type; };  // primary template: element types without a specialization get void *
+// Specializations: each element type T listed here exposes its own pointer type T * as cast_type.
+template<> struct __nv_surf_trait<char> {  typedef char * cast_type; };
+template<> struct __nv_surf_trait<signed char> {  typedef signed char * cast_type; };
+template<> struct __nv_surf_trait<unsigned char> {  typedef unsigned char * cast_type; };
+template<> struct __nv_surf_trait<char1> {  typedef char1 * cast_type; };
+template<> struct __nv_surf_trait<uchar1> {  typedef uchar1 * cast_type; };
+template<> struct __nv_surf_trait<char2> {  typedef char2 * cast_type; };
+template<> struct __nv_surf_trait<uchar2> {  typedef uchar2 * cast_type; };
+template<> struct __nv_surf_trait<char4> {  typedef char4 * cast_type; };
+template<> struct __nv_surf_trait<uchar4> {  typedef uchar4 * cast_type; };
+template<> struct __nv_surf_trait<short> {  typedef short * cast_type; };
+template<> struct __nv_surf_trait<unsigned short> {  typedef unsigned short * cast_type; };
+template<> struct __nv_surf_trait<short1> {  typedef short1 * cast_type; };
+template<> struct __nv_surf_trait<ushort1> {  typedef ushort1 * cast_type; };
+template<> struct __nv_surf_trait<short2> {  typedef short2 * cast_type; };
+template<> struct __nv_surf_trait<ushort2> {  typedef ushort2 * cast_type; };
+template<> struct __nv_surf_trait<short4> {  typedef short4 * cast_type; };
+template<> struct __nv_surf_trait<ushort4> {  typedef ushort4 * cast_type; };
+template<> struct __nv_surf_trait<int> {  typedef int * cast_type; };
+template<> struct __nv_surf_trait<unsigned int> {  typedef unsigned int * cast_type; };
+template<> struct __nv_surf_trait<int1> {  typedef int1 * cast_type; };
+template<> struct __nv_surf_trait<uint1> {  typedef uint1 * cast_type; };
+template<> struct __nv_surf_trait<int2> {  typedef int2 * cast_type; };
+template<> struct __nv_surf_trait<uint2> {  typedef uint2 * cast_type; };
+template<> struct __nv_surf_trait<int4> {  typedef int4 * cast_type; };
+template<> struct __nv_surf_trait<uint4> {  typedef uint4 * cast_type; };
+template<> struct __nv_surf_trait<long long> {  typedef long long * cast_type; };
+template<> struct __nv_surf_trait<unsigned long long> {  typedef unsigned long long * cast_type; };
+template<> struct __nv_surf_trait<longlong1> {  typedef longlong1 * cast_type; };
+template<> struct __nv_surf_trait<ulonglong1> {  typedef ulonglong1 * cast_type; };
+template<> struct __nv_surf_trait<longlong2> {  typedef longlong2 * cast_type; };
+template<> struct __nv_surf_trait<ulonglong2> {  typedef ulonglong2 * cast_type; };
+#if !defined(__LP64__) /* non-LP64 builds only: long-based element types reuse the same-signedness int-based pointer types */
+template<> struct __nv_surf_trait<long> {  typedef int * cast_type; };
+template<> struct __nv_surf_trait<unsigned long> {  typedef unsigned int * cast_type; };
+template<> struct __nv_surf_trait<long1> {  typedef int1 * cast_type; };
+template<> struct __nv_surf_trait<ulong1> {  typedef uint1 * cast_type; };
+template<> struct __nv_surf_trait<long2> {  typedef int2 * cast_type; };
+template<> struct __nv_surf_trait<ulong2> {  typedef uint2 * cast_type; };
+template<> struct __nv_surf_trait<long4> {  typedef int4 * cast_type; }; /* signed maps to signed, matching long/long1/long2 (was uint4 *) */
+template<> struct __nv_surf_trait<ulong4> {  typedef uint4 * cast_type; }; /* unsigned maps to unsigned, matching ulong1/ulong2 (was int4 *) */
+#endif
+template<> struct __nv_surf_trait<float> {  typedef float * cast_type; };  // float scalar and its 1/2/4-component vectors each map to their own pointer type
+template<> struct __nv_surf_trait<float1> {  typedef float1 * cast_type; };
+template<> struct __nv_surf_trait<float2> {  typedef float2 * cast_type; };
+template<> struct __nv_surf_trait<float4> {  typedef float4 * cast_type; };
+
+
+#undef __DEPRECATED__
+
+
+#endif /* __cplusplus && __CUDACC__ */
+#endif /* !__SURFACE_FUNCTIONS_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/surface_indirect_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/surface_indirect_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..a93faf052f98d81f8cb65bd9591d08ec90c994d9
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/surface_indirect_functions.h
@@ -0,0 +1,243 @@
+/*
+ * Copyright 1993-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+
+#ifndef __SURFACE_INDIRECT_FUNCTIONS_H__
+#define __SURFACE_INDIRECT_FUNCTIONS_H__
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#include "cuda_runtime_api.h"
+
+template<typename T> struct __nv_isurf_trait { };  // primary template has no nested `type`; only the element types specialized here give `__nv_isurf_trait<T>::type` (always void) a meaning
+template<> struct __nv_isurf_trait<char> { typedef void type; };
+template<> struct __nv_isurf_trait<signed char> { typedef void type; };
+template<> struct __nv_isurf_trait<char1> { typedef void type; };
+template<> struct __nv_isurf_trait<unsigned char> { typedef void type; };
+template<> struct __nv_isurf_trait<uchar1> { typedef void type; };
+template<> struct __nv_isurf_trait<short> { typedef void type; };
+template<> struct __nv_isurf_trait<short1> { typedef void type; };
+template<> struct __nv_isurf_trait<unsigned short> { typedef void type; };
+template<> struct __nv_isurf_trait<ushort1> { typedef void type; };
+template<> struct __nv_isurf_trait<int> { typedef void type; };
+template<> struct __nv_isurf_trait<int1> { typedef void type; };
+template<> struct __nv_isurf_trait<unsigned int> { typedef void type; };
+template<> struct __nv_isurf_trait<uint1> { typedef void type; };
+template<> struct __nv_isurf_trait<long long> { typedef void type; };
+template<> struct __nv_isurf_trait<longlong1> { typedef void type; };
+template<> struct __nv_isurf_trait<unsigned long long> { typedef void type; };
+template<> struct __nv_isurf_trait<ulonglong1> { typedef void type; };
+template<> struct __nv_isurf_trait<float> { typedef void type; };
+template<> struct __nv_isurf_trait<float1> { typedef void type; };
+// Two-component vector element types.
+template<> struct __nv_isurf_trait<char2> { typedef void type; };
+template<> struct __nv_isurf_trait<uchar2> { typedef void type; };
+template<> struct __nv_isurf_trait<short2> { typedef void type; };
+template<> struct __nv_isurf_trait<ushort2> { typedef void type; };
+template<> struct __nv_isurf_trait<int2> { typedef void type; };
+template<> struct __nv_isurf_trait<uint2> { typedef void type; };
+template<> struct __nv_isurf_trait<longlong2> { typedef void type; };
+template<> struct __nv_isurf_trait<ulonglong2> { typedef void type; };
+template<> struct __nv_isurf_trait<float2> { typedef void type; };
+// Four-component vector element types.
+template<> struct __nv_isurf_trait<char4> { typedef void type; };
+template<> struct __nv_isurf_trait<uchar4> { typedef void type; };
+template<> struct __nv_isurf_trait<short4> { typedef void type; };
+template<> struct __nv_isurf_trait<ushort4> { typedef void type; };
+template<> struct __nv_isurf_trait<int4> { typedef void type; };
+template<> struct __nv_isurf_trait<uint4> { typedef void type; };
+template<> struct __nv_isurf_trait<float4> { typedef void type; };
+
+
+// Pointer form: hands (ptr, obj, x, mode) to the compiler's __nv_tex_surf_handler under the "__isurf1Dread" tag.
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type surf1Dread(T *ptr, cudaSurfaceObject_t obj, int x, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) {
+  __nv_tex_surf_handler("__isurf1Dread", ptr, obj, x, mode);
+}
+
+// Value form: runs the pointer overload on a local T and returns that local by value.
+template <typename T>
+static __device__ T surf1Dread(cudaSurfaceObject_t surfObject, int x, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap) {
+  T value;
+  surf1Dread(&value, surfObject, x, boundaryMode);
+  return value;
+}
+
+// Pointer form: hands (ptr, obj, x, y, mode) to the compiler's __nv_tex_surf_handler under the "__isurf2Dread" tag.
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type surf2Dread(T *ptr, cudaSurfaceObject_t obj, int x, int y, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) {
+  __nv_tex_surf_handler("__isurf2Dread", ptr, obj, x, y, mode);
+}
+
+// Value form: runs the pointer overload on a local T and returns that local by value.
+template <typename T>
+static __device__ T surf2Dread(cudaSurfaceObject_t surfObject, int x, int y, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap) {
+  T value;
+  surf2Dread(&value, surfObject, x, y, boundaryMode);
+  return value;
+}
+
+
+// Pointer form: hands (ptr, obj, x, y, z, mode) to the compiler's __nv_tex_surf_handler under the "__isurf3Dread" tag.
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type surf3Dread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int z, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) {
+  __nv_tex_surf_handler("__isurf3Dread", ptr, obj, x, y, z, mode);
+}
+
+// Value form: runs the pointer overload on a local T and returns that local by value.
+template <typename T>
+static __device__ T surf3Dread(cudaSurfaceObject_t surfObject, int x, int y, int z, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap) {
+  T value;
+  surf3Dread(&value, surfObject, x, y, z, boundaryMode);
+  return value;
+}
+
+// Pointer form: hands (ptr, obj, x, layer, mode) to the compiler's __nv_tex_surf_handler under the "__isurf1DLayeredread" tag.
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type surf1DLayeredread(T *ptr, cudaSurfaceObject_t obj, int x, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) {
+  __nv_tex_surf_handler("__isurf1DLayeredread", ptr, obj, x, layer, mode);
+}
+
+// Value form: runs the pointer overload on a local T and returns that local by value.
+template <typename T>
+static __device__ T surf1DLayeredread(cudaSurfaceObject_t surfObject, int x, int layer, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap) {
+  T value;
+  surf1DLayeredread(&value, surfObject, x, layer, boundaryMode);
+  return value;
+}
+
+// Pointer form: hands (ptr, obj, x, y, layer, mode) to the compiler's __nv_tex_surf_handler under the "__isurf2DLayeredread" tag.
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type surf2DLayeredread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) {
+  __nv_tex_surf_handler("__isurf2DLayeredread", ptr, obj, x, y, layer, mode);
+}
+
+// Value form: runs the pointer overload on a local T and returns that local by value.
+template <typename T>
+static __device__ T surf2DLayeredread(cudaSurfaceObject_t surfObject, int x, int y, int layer, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap) {
+  T value;
+  surf2DLayeredread(&value, surfObject, x, y, layer, boundaryMode);
+  return value;
+}
+
+// Pointer form: hands (ptr, obj, x, y, face, mode) to the compiler's __nv_tex_surf_handler under the "__isurfCubemapread" tag.
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type surfCubemapread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int face, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) {
+  __nv_tex_surf_handler("__isurfCubemapread", ptr, obj, x, y, face, mode);
+}
+
+// Value form: runs the pointer overload on a local T and returns that local by value.
+template <typename T>
+static __device__ T surfCubemapread(cudaSurfaceObject_t surfObject, int x, int y, int face, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap) {
+  T value;
+  surfCubemapread(&value, surfObject, x, y, face, boundaryMode);
+  return value;
+}
+
+// Pointer form: hands (ptr, obj, x, y, layerface, mode) to the compiler's __nv_tex_surf_handler under the "__isurfCubemapLayeredread" tag.
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type surfCubemapLayeredread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int layerface, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) {
+  __nv_tex_surf_handler("__isurfCubemapLayeredread", ptr, obj, x, y, layerface, mode);
+}
+
+// Value form: runs the pointer overload on a local T and returns that local by value.
+template <typename T>
+static __device__ T surfCubemapLayeredread(cudaSurfaceObject_t surfObject, int x, int y, int layerface, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap) {
+  T value;
+  surfCubemapLayeredread(&value, surfObject, x, y, layerface, boundaryMode);
+  return value;
+}
+
+// Passes the address of the by-value copy `val` plus (obj, x, mode) to __nv_tex_surf_handler under the "__isurf1Dwrite_v2" tag.
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type surf1Dwrite(T val, cudaSurfaceObject_t obj, int x, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) {
+  __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, obj, x, mode);
+}
+
+// Passes the address of the by-value copy `val` plus (obj, x, y, mode) to __nv_tex_surf_handler under the "__isurf2Dwrite_v2" tag.
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type surf2Dwrite(T val, cudaSurfaceObject_t obj, int x, int y, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) {
+  __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, obj, x, y, mode);
+}
+
+// Passes the address of the by-value copy `val` plus (obj, x, y, z, mode) to __nv_tex_surf_handler under the "__isurf3Dwrite_v2" tag.
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type surf3Dwrite(T val, cudaSurfaceObject_t obj, int x, int y, int z, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) {
+  __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, obj, x, y, z, mode);
+}
+
+// Passes the address of the by-value copy `val` plus (obj, x, layer, mode) to __nv_tex_surf_handler under the "__isurf1DLayeredwrite_v2" tag.
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type surf1DLayeredwrite(T val, cudaSurfaceObject_t obj, int x, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) {
+  __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, obj, x, layer, mode);
+}
+
+// Passes the address of the by-value copy `val` plus (obj, x, y, layer, mode) to __nv_tex_surf_handler under the "__isurf2DLayeredwrite_v2" tag.
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type surf2DLayeredwrite(T val, cudaSurfaceObject_t obj, int x, int y, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) {
+  __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, obj, x, y, layer, mode);
+}
+
+// Passes the address of the by-value copy `val` plus (obj, x, y, face, mode) to __nv_tex_surf_handler under the "__isurfCubemapwrite_v2" tag.
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type surfCubemapwrite(T val, cudaSurfaceObject_t obj, int x, int y, int face, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) {
+  __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, obj, x, y, face, mode);
+}
+
+// Passes the address of the by-value copy `val` plus (obj, x, y, layerface, mode) to __nv_tex_surf_handler under the "__isurfCubemapLayeredwrite_v2" tag.
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type surfCubemapLayeredwrite(T val, cudaSurfaceObject_t obj, int x, int y, int layerface, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) {
+  __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, obj, x, y, layerface, mode);
+}
+
+#endif // __cplusplus && __CUDACC__
+
+#endif // __SURFACE_INDIRECT_FUNCTIONS_H__
+
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/surface_types.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/surface_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..e1d0eccdee2a80132c5dadfcd80643c1b41eb8ec
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/surface_types.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SURFACE_TYPES_H__)
+#define __SURFACE_TYPES_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "driver_types.h"
+
+#ifndef __CUDACC_RTC_MINIMAL__
+
+/**
+ * \addtogroup CUDART_TYPES
+ *
+ * @{
+ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#define cudaSurfaceType1D              0x01  /* surface kind tags; each *Layered value is its base value with the high nibble set to 0xF */
+#define cudaSurfaceType2D              0x02
+#define cudaSurfaceType3D              0x03
+#define cudaSurfaceTypeCubemap         0x0C
+#define cudaSurfaceType1DLayered       0xF1
+#define cudaSurfaceType2DLayered       0xF2
+#define cudaSurfaceTypeCubemapLayered  0xFC
+
+/**
+ * CUDA Surface boundary modes (the type of the boundary-mode argument taken by surface read/write functions)
+ */
+enum __device_builtin__ cudaSurfaceBoundaryMode
+{
+    cudaBoundaryModeZero  = 0,    /**< Zero boundary mode (per the CUDA Programming Guide: out-of-range reads return zero, out-of-range writes are ignored) */
+    cudaBoundaryModeClamp = 1,    /**< Clamp boundary mode (per the CUDA Programming Guide: out-of-range coordinates are clamped to the valid range) */
+    cudaBoundaryModeTrap  = 2     /**< Trap boundary mode (per the CUDA Programming Guide: an out-of-range access makes kernel execution fail) */
+};
+
+/**
+ * CUDA Surface format modes (NOTE(review): nothing in this header consumes this enum; confirm its intended use against the CUDA runtime docs)
+ */
+enum __device_builtin__  cudaSurfaceFormatMode
+{
+    cudaFormatModeForced = 0,     /**< Forced format mode */
+    cudaFormatModeAuto = 1        /**< Auto format mode */
+};
+
+/**
+ * An opaque value that represents a CUDA Surface object (carried as an unsigned long long handle, not as a pointer type)
+ */
+typedef __device_builtin__ unsigned long long cudaSurfaceObject_t;
+
+
+/** @} */ /* END CUDART_TYPES */
+
+#endif  /* !__CUDACC_RTC_MINIMAL__ */
+#endif /* !__SURFACE_TYPES_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/texture_fetch_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/texture_fetch_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..704e8518da6b3cf7b77e7b9d34638bc06dd3937f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/texture_fetch_functions.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright 1993-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__TEXTURE_FETCH_FUNCTIONS_H__)
+#define __TEXTURE_FETCH_FUNCTIONS_H__
+
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+#include "cuda_texture_types.h"
+
+#if defined(_WIN32)  /* _WIN32 builds use the __declspec spelling, everything else the __attribute__ spelling */
+# define __DEPRECATED__ __declspec(deprecated)
+#else
+# define __DEPRECATED__  __attribute__((deprecated))
+#endif  /* NOTE(review): nothing in this header references __DEPRECATED__ before it is #undef'd near the end */
+
+
+template <typename T>
+struct __nv_tex_rmet_ret { };  /* primary template is empty: ::type exists only for the specializations below */
+
+template<> struct __nv_tex_rmet_ret<char> { typedef char type; };  /* every specialization maps T to T itself */
+template<> struct __nv_tex_rmet_ret<signed char> { typedef signed char type; };
+template<> struct __nv_tex_rmet_ret<unsigned char> { typedef unsigned char type; };
+template<> struct __nv_tex_rmet_ret<char1> { typedef char1 type; };
+template<> struct __nv_tex_rmet_ret<uchar1> { typedef uchar1 type; };
+template<> struct __nv_tex_rmet_ret<char2> { typedef char2 type; };
+template<> struct __nv_tex_rmet_ret<uchar2> { typedef uchar2 type; };
+template<> struct __nv_tex_rmet_ret<char4> { typedef char4 type; };
+template<> struct __nv_tex_rmet_ret<uchar4> { typedef uchar4 type; };
+
+template<> struct __nv_tex_rmet_ret<short> { typedef short type; };
+template<> struct __nv_tex_rmet_ret<unsigned short> { typedef unsigned short type; };
+template<> struct __nv_tex_rmet_ret<short1> { typedef short1 type; };
+template<> struct __nv_tex_rmet_ret<ushort1> { typedef ushort1 type; };
+template<> struct __nv_tex_rmet_ret<short2> { typedef short2 type; };
+template<> struct __nv_tex_rmet_ret<ushort2> { typedef ushort2 type; };
+template<> struct __nv_tex_rmet_ret<short4> { typedef short4 type; };
+template<> struct __nv_tex_rmet_ret<ushort4> { typedef ushort4 type; };
+
+template<> struct __nv_tex_rmet_ret<int> { typedef int type; };
+template<> struct __nv_tex_rmet_ret<unsigned int> { typedef unsigned int type; };
+template<> struct __nv_tex_rmet_ret<int1> { typedef int1 type; };
+template<> struct __nv_tex_rmet_ret<uint1> { typedef uint1 type; };
+template<> struct __nv_tex_rmet_ret<int2> { typedef int2 type; };
+template<> struct __nv_tex_rmet_ret<uint2> { typedef uint2 type; };
+template<> struct __nv_tex_rmet_ret<int4> { typedef int4 type; };
+template<> struct __nv_tex_rmet_ret<uint4> { typedef uint4 type; };
+
+#if !defined(__LP64__)  /* long-based types are specialized only when __LP64__ is not defined */
+template<> struct __nv_tex_rmet_ret<long> { typedef long type; };
+template<> struct __nv_tex_rmet_ret<unsigned long> { typedef unsigned long type; };
+template<> struct __nv_tex_rmet_ret<long1> { typedef long1 type; };
+template<> struct __nv_tex_rmet_ret<ulong1> { typedef ulong1 type; };
+template<> struct __nv_tex_rmet_ret<long2> { typedef long2 type; };
+template<> struct __nv_tex_rmet_ret<ulong2> { typedef ulong2 type; };
+template<> struct __nv_tex_rmet_ret<long4> { typedef long4 type; };
+template<> struct __nv_tex_rmet_ret<ulong4> { typedef ulong4 type; };
+#endif /* !__LP64__ */
+template<> struct __nv_tex_rmet_ret<float> { typedef float type; };
+template<> struct __nv_tex_rmet_ret<float1> { typedef float1 type; };
+template<> struct __nv_tex_rmet_ret<float2> { typedef float2 type; };
+template<> struct __nv_tex_rmet_ret<float4> { typedef float4 type; };
+
+
+template <typename T> struct __nv_tex_rmet_cast { typedef T* type;  };  /* default: pointer to T itself */
+#if !defined(__LP64__)  /* without __LP64__, long-based types map to a pointer to the int-based type of the same shape */
+template<> struct __nv_tex_rmet_cast<long> { typedef int *type; };
+template<> struct __nv_tex_rmet_cast<unsigned long> { typedef unsigned int *type; };
+template<> struct __nv_tex_rmet_cast<long1> { typedef int1 *type; };
+template<> struct __nv_tex_rmet_cast<ulong1> { typedef uint1 *type; };
+template<> struct __nv_tex_rmet_cast<long2> { typedef int2 *type; };
+template<> struct __nv_tex_rmet_cast<ulong2> { typedef uint2 *type; };
+template<> struct __nv_tex_rmet_cast<long4> { typedef int4 *type; };
+template<> struct __nv_tex_rmet_cast<ulong4> { typedef uint4 *type; };
+#endif /* !__LP64__ */
+
+template <typename T>
+struct __nv_tex_rmnf_ret { };  /* primary template is empty: only char/short based types below provide ::type */
+
+template <> struct __nv_tex_rmnf_ret<char> { typedef float type; };  /* scalars map to float */
+template <> struct __nv_tex_rmnf_ret<signed char> { typedef float type; };
+template <> struct __nv_tex_rmnf_ret<unsigned char> { typedef float type; };
+template <> struct __nv_tex_rmnf_ret<short> { typedef float type; };
+template <> struct __nv_tex_rmnf_ret<unsigned short> { typedef float type; };
+template <> struct __nv_tex_rmnf_ret<char1> { typedef float1 type; };  /* N-component vectors map to floatN (N = 1, 2, 4) */
+template <> struct __nv_tex_rmnf_ret<uchar1> { typedef float1 type; };
+template <> struct __nv_tex_rmnf_ret<short1> { typedef float1 type; };
+template <> struct __nv_tex_rmnf_ret<ushort1> { typedef float1 type; };
+template <> struct __nv_tex_rmnf_ret<char2> { typedef float2 type; };
+template <> struct __nv_tex_rmnf_ret<uchar2> { typedef float2 type; };
+template <> struct __nv_tex_rmnf_ret<short2> { typedef float2 type; };
+template <> struct __nv_tex_rmnf_ret<ushort2> { typedef float2 type; };
+template <> struct __nv_tex_rmnf_ret<char4> { typedef float4 type; };
+template <> struct __nv_tex_rmnf_ret<uchar4> { typedef float4 type; };
+template <> struct __nv_tex_rmnf_ret<short4> { typedef float4 type; };
+template <> struct __nv_tex_rmnf_ret<ushort4> { typedef float4 type; };
+
+
+template <typename T>
+struct __nv_tex2dgather_ret { };  /* primary template is empty: ::type exists only for the specializations below */
+template <> struct __nv_tex2dgather_ret<char> { typedef char4 type; };  /* scalar and 1-4 component types all map to the 4-component vector of the same base type */
+template <> struct __nv_tex2dgather_ret<signed char> { typedef char4 type; };
+template <> struct __nv_tex2dgather_ret<char1> { typedef char4 type; };
+template <> struct __nv_tex2dgather_ret<char2> { typedef char4 type; };
+template <> struct __nv_tex2dgather_ret<char3> { typedef char4 type; };
+template <> struct __nv_tex2dgather_ret<char4> { typedef char4 type; };
+template <> struct __nv_tex2dgather_ret<unsigned char> { typedef uchar4 type; };
+template <> struct __nv_tex2dgather_ret<uchar1> { typedef uchar4 type; };
+template <> struct __nv_tex2dgather_ret<uchar2> { typedef uchar4 type; };
+template <> struct __nv_tex2dgather_ret<uchar3> { typedef uchar4 type; };
+template <> struct __nv_tex2dgather_ret<uchar4> { typedef uchar4 type; };
+
+template <> struct __nv_tex2dgather_ret<short> { typedef short4 type; };
+template <> struct __nv_tex2dgather_ret<short1> { typedef short4 type; };
+template <> struct __nv_tex2dgather_ret<short2> { typedef short4 type; };
+template <> struct __nv_tex2dgather_ret<short3> { typedef short4 type; };
+template <> struct __nv_tex2dgather_ret<short4> { typedef short4 type; };
+template <> struct __nv_tex2dgather_ret<unsigned short> { typedef ushort4 type; };
+template <> struct __nv_tex2dgather_ret<ushort1> { typedef ushort4 type; };
+template <> struct __nv_tex2dgather_ret<ushort2> { typedef ushort4 type; };
+template <> struct __nv_tex2dgather_ret<ushort3> { typedef ushort4 type; };
+template <> struct __nv_tex2dgather_ret<ushort4> { typedef ushort4 type; };
+
+template <> struct __nv_tex2dgather_ret<int> { typedef int4 type; };
+template <> struct __nv_tex2dgather_ret<int1> { typedef int4 type; };
+template <> struct __nv_tex2dgather_ret<int2> { typedef int4 type; };
+template <> struct __nv_tex2dgather_ret<int3> { typedef int4 type; };
+template <> struct __nv_tex2dgather_ret<int4> { typedef int4 type; };
+template <> struct __nv_tex2dgather_ret<unsigned int> { typedef uint4 type; };
+template <> struct __nv_tex2dgather_ret<uint1> { typedef uint4 type; };
+template <> struct __nv_tex2dgather_ret<uint2> { typedef uint4 type; };
+template <> struct __nv_tex2dgather_ret<uint3> { typedef uint4 type; };
+template <> struct __nv_tex2dgather_ret<uint4> { typedef uint4 type; };
+
+template <> struct __nv_tex2dgather_ret<float> { typedef float4 type; };
+template <> struct __nv_tex2dgather_ret<float1> { typedef float4 type; };
+template <> struct __nv_tex2dgather_ret<float2> { typedef float4 type; };
+template <> struct __nv_tex2dgather_ret<float3> { typedef float4 type; };
+template <> struct __nv_tex2dgather_ret<float4> { typedef float4 type; };
+
+
+template<typename T> struct __nv_tex2dgather_rmnf_ret { };  /* primary template is empty: only char/short based types below provide ::type */
+template<> struct __nv_tex2dgather_rmnf_ret<char> { typedef float4 type; };  /* every specialization maps to float4 */
+template<> struct __nv_tex2dgather_rmnf_ret<signed char> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<unsigned char> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<char1> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<uchar1> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<char2> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<uchar2> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<char3> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<uchar3> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<char4> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<uchar4> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<signed short> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<unsigned short> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<short1> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<ushort1> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<short2> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<ushort2> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<short3> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<ushort3> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<short4> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<ushort4> { typedef float4 type; };
+
+#undef __DEPRECATED__  /* the macro defined near the top of this header is not left defined for includers */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#endif /* !__TEXTURE_FETCH_FUNCTIONS_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/texture_indirect_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/texture_indirect_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e5537d87294ee78ecec567893a6aaec333db317
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/texture_indirect_functions.h
@@ -0,0 +1,638 @@
+/*
+ * Copyright 1993-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+
+#ifndef __TEXTURE_INDIRECT_FUNCTIONS_H__
+#define __TEXTURE_INDIRECT_FUNCTIONS_H__
+
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+
+#include "cuda_runtime_api.h"
+
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
+#define __NV_TEX_SPARSE 1  /* gates the isResident overloads below; when left undefined, "#if __NV_TEX_SPARSE" evaluates to 0 */
+#endif  /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 600 */
+
+template <typename T> struct __nv_itex_trait {   };  /* primary template is empty: ::type exists only for the specializations below */
+template<> struct __nv_itex_trait<char> { typedef void type; };  /* every specialization yields void; no 3-component vector type is listed */
+template<> struct __nv_itex_trait<signed char> { typedef void type; };
+template<> struct __nv_itex_trait<char1> { typedef void type; };
+template<> struct __nv_itex_trait<char2> { typedef void type; };
+template<> struct __nv_itex_trait<char4> { typedef void type; };
+template<> struct __nv_itex_trait<unsigned char> { typedef void type; };
+template<> struct __nv_itex_trait<uchar1> { typedef void type; };
+template<> struct __nv_itex_trait<uchar2> { typedef void type; };
+template<> struct __nv_itex_trait<uchar4> { typedef void type; };
+template<> struct __nv_itex_trait<short> { typedef void type; };
+template<> struct __nv_itex_trait<short1> { typedef void type; };
+template<> struct __nv_itex_trait<short2> { typedef void type; };
+template<> struct __nv_itex_trait<short4> { typedef void type; };
+template<> struct __nv_itex_trait<unsigned short> { typedef void type; };
+template<> struct __nv_itex_trait<ushort1> { typedef void type; };
+template<> struct __nv_itex_trait<ushort2> { typedef void type; };
+template<> struct __nv_itex_trait<ushort4> { typedef void type; };
+template<> struct __nv_itex_trait<int> { typedef void type; };
+template<> struct __nv_itex_trait<int1> { typedef void type; };
+template<> struct __nv_itex_trait<int2> { typedef void type; };
+template<> struct __nv_itex_trait<int4> { typedef void type; };
+template<> struct __nv_itex_trait<unsigned int> { typedef void type; };
+template<> struct __nv_itex_trait<uint1> { typedef void type; };
+template<> struct __nv_itex_trait<uint2> { typedef void type; };
+template<> struct __nv_itex_trait<uint4> { typedef void type; };
+#if !defined(__LP64__)  /* long-based types are accepted only when __LP64__ is not defined */
+template<> struct __nv_itex_trait<long> { typedef void type; };
+template<> struct __nv_itex_trait<long1> { typedef void type; };
+template<> struct __nv_itex_trait<long2> { typedef void type; };
+template<> struct __nv_itex_trait<long4> { typedef void type; };
+template<> struct __nv_itex_trait<unsigned long> { typedef void type; };
+template<> struct __nv_itex_trait<ulong1> { typedef void type; };
+template<> struct __nv_itex_trait<ulong2> { typedef void type; };
+template<> struct __nv_itex_trait<ulong4> { typedef void type; };
+#endif /* !__LP64__ */
+template<> struct __nv_itex_trait<float> { typedef void type; };
+template<> struct __nv_itex_trait<float1> { typedef void type; };
+template<> struct __nv_itex_trait<float2> { typedef void type; };
+template<> struct __nv_itex_trait<float4> { typedef void type; };
+
+
+
+template <typename T>  /* Pointer-output form: forwards (ptr, obj, x) to __nv_tex_surf_handler under the "__itex1Dfetch" name. */
+static __device__ typename __nv_itex_trait<T>::type tex1Dfetch(T *ptr, cudaTextureObject_t obj, int x)
+{  /* __nv_itex_trait<T>::type is void for every specialized T; other T have no ::type, so this overload drops out */
+   __nv_tex_surf_handler("__itex1Dfetch", ptr, obj, x);
+}
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value. */
+static __device__ T tex1Dfetch(cudaTextureObject_t texObject, int x)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex1Dfetch(&ret, texObject, x);
+  return ret;
+}
+
+template <typename T>  /* Pointer-output form: forwards (ptr, obj, x) to __nv_tex_surf_handler under the "__itex1D" name. */
+static __device__ typename __nv_itex_trait<T>::type tex1D(T *ptr, cudaTextureObject_t obj, float x)
+{  /* return type is void for every T that __nv_itex_trait specializes */
+   __nv_tex_surf_handler("__itex1D", ptr, obj, x);
+}
+
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value. */
+static __device__  T tex1D(cudaTextureObject_t texObject, float x)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex1D(&ret, texObject, x);
+  return ret;
+}
+
+
+template <typename T>  /* Pointer-output form: forwards (ptr, obj, x, y) to __nv_tex_surf_handler under the "__itex2D" name. */
+static __device__ typename __nv_itex_trait<T>::type tex2D(T *ptr, cudaTextureObject_t obj, float x, float y)
+{  /* return type is void for every T that __nv_itex_trait specializes */
+   __nv_tex_surf_handler("__itex2D", ptr, obj, x, y);
+}
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value. */
+static __device__  T tex2D(cudaTextureObject_t texObject, float x, float y)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex2D(&ret, texObject, x, y);
+  return ret;
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>  /* Sparse pointer-output form: same arguments as tex2D plus &res, under the "__itex2D_sparse" name. */
+static __device__ typename __nv_itex_trait<T>::type tex2D(T *ptr, cudaTextureObject_t obj, float x, float y, 
+                                                          bool* isResident)
+{
+  unsigned char res;  /* residency byte handed to the handler by address */
+   __nv_tex_surf_handler("__itex2D_sparse", ptr, obj, x, y, &res);
+   *isResident = (res != 0);  /* isResident is dereferenced unconditionally, so it must not be null */
+}
+
+template <class T>  /* Sparse value-returning form: delegates to the pointer overload, forwarding isResident unchanged. */
+static __device__  T tex2D(cudaTextureObject_t texObject, float x, float y, bool* isResident)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex2D(&ret, texObject, x, y, isResident);
+  return ret;
+}
+
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>  /* Pointer-output form: forwards (ptr, obj, x, y, z) to __nv_tex_surf_handler under the "__itex3D" name. */
+static __device__ typename __nv_itex_trait<T>::type tex3D(T *ptr, cudaTextureObject_t obj, float x, float y, float z)
+{  /* return type is void for every T that __nv_itex_trait specializes */
+   __nv_tex_surf_handler("__itex3D", ptr, obj, x, y, z);
+}
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value. */
+static __device__  T tex3D(cudaTextureObject_t texObject, float x, float y, float z)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex3D(&ret, texObject, x, y, z);
+  return ret;
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>  /* Sparse pointer-output form: same arguments as tex3D plus &res, under the "__itex3D_sparse" name. */
+static __device__ typename __nv_itex_trait<T>::type tex3D(T *ptr, cudaTextureObject_t obj, float x, float y, float z, 
+                                                          bool* isResident)
+{
+  unsigned char res;  /* residency byte handed to the handler by address */
+   __nv_tex_surf_handler("__itex3D_sparse", ptr, obj, x, y, z, &res);
+   *isResident = (res != 0);  /* isResident is dereferenced unconditionally, so it must not be null */
+}
+
+template <class T>  /* Sparse value-returning form: delegates to the pointer overload, forwarding isResident unchanged. */
+static __device__  T tex3D(cudaTextureObject_t texObject, float x, float y, float z, bool* isResident)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex3D(&ret, texObject, x, y, z, isResident);
+  return ret;
+}
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>  /* Pointer-output form: forwards (ptr, obj, x, layer) to __nv_tex_surf_handler under the "__itex1DLayered" name. */
+static __device__ typename __nv_itex_trait<T>::type tex1DLayered(T *ptr, cudaTextureObject_t obj, float x, int layer)
+{  /* return type is void for every T that __nv_itex_trait specializes */
+   __nv_tex_surf_handler("__itex1DLayered", ptr, obj, x, layer);
+}
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value. */
+static __device__  T tex1DLayered(cudaTextureObject_t texObject, float x, int layer)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex1DLayered(&ret, texObject, x, layer);
+  return ret;
+}
+
+template <typename T>  /* Pointer-output form: forwards (ptr, obj, x, y, layer) to __nv_tex_surf_handler under the "__itex2DLayered" name. */
+static __device__ typename __nv_itex_trait<T>::type tex2DLayered(T *ptr, cudaTextureObject_t obj, float x, float y, int layer)
+{  /* return type is void for every T that __nv_itex_trait specializes */
+  __nv_tex_surf_handler("__itex2DLayered", ptr, obj, x, y, layer);
+}
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value. */
+static __device__  T tex2DLayered(cudaTextureObject_t texObject, float x, float y, int layer)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex2DLayered(&ret, texObject, x, y, layer);
+  return ret;
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>  /* Sparse pointer-output form: same arguments as tex2DLayered plus &res, under the "__itex2DLayered_sparse" name. */
+static __device__ typename __nv_itex_trait<T>::type tex2DLayered(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, bool* isResident)
+{
+  unsigned char res;  /* residency byte handed to the handler by address */
+  __nv_tex_surf_handler("__itex2DLayered_sparse", ptr, obj, x, y, layer, &res);
+  *isResident = (res != 0);  /* isResident is dereferenced unconditionally, so it must not be null */
+}
+
+template <class T>  /* Sparse value-returning form: delegates to the pointer overload, forwarding isResident unchanged. */
+static __device__  T tex2DLayered(cudaTextureObject_t texObject, float x, float y, int layer, bool* isResident)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex2DLayered(&ret, texObject, x, y, layer, isResident);
+  return ret;
+}
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>  /* Pointer-output form: forwards (ptr, obj, x, y, z) to __nv_tex_surf_handler under the "__itexCubemap" name. */
+static __device__ typename __nv_itex_trait<T>::type texCubemap(T *ptr, cudaTextureObject_t obj, float x, float y, float z)
+{  /* return type is void for every T that __nv_itex_trait specializes */
+  __nv_tex_surf_handler("__itexCubemap", ptr, obj, x, y, z);
+}
+
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value. */
+static __device__  T texCubemap(cudaTextureObject_t texObject, float x, float y, float z)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  texCubemap(&ret, texObject, x, y, z);
+  return ret;
+}
+
+
+template <typename T>  /* Pointer-output form: forwards (ptr, obj, x, y, z, layer) to __nv_tex_surf_handler under the "__itexCubemapLayered" name. */
+static __device__ typename __nv_itex_trait<T>::type texCubemapLayered(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer)
+{  /* return type is void for every T that __nv_itex_trait specializes */
+  __nv_tex_surf_handler("__itexCubemapLayered", ptr, obj, x, y, z, layer);
+}
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value. */
+static __device__  T texCubemapLayered(cudaTextureObject_t texObject, float x, float y, float z, int layer)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  texCubemapLayered(&ret, texObject, x, y, z, layer);
+  return ret;
+}
+
+template <typename T>  /* Pointer-output form: forwards (ptr, obj, x, y, comp) to __nv_tex_surf_handler under the "__itex2Dgather" name; comp defaults to 0. */
+static __device__ typename __nv_itex_trait<T>::type tex2Dgather(T *ptr, cudaTextureObject_t obj, float x, float y, int comp = 0)
+{  /* return type is void for every T that __nv_itex_trait specializes */
+  __nv_tex_surf_handler("__itex2Dgather", ptr, obj, x, y, comp);
+}
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value; comp defaults to 0. */
+static __device__  T tex2Dgather(cudaTextureObject_t to, float x, float y, int comp = 0)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex2Dgather(&ret, to, x, y, comp);
+  return ret;
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>  /* Sparse pointer-output form under the "__itex2Dgather_sparse" name; isResident precedes the defaulted comp in the signature. */
+static __device__ typename __nv_itex_trait<T>::type tex2Dgather(T *ptr, cudaTextureObject_t obj, float x, float y, bool* isResident, int comp = 0)
+{
+  unsigned char res;  /* residency byte handed to the handler by address */
+  __nv_tex_surf_handler("__itex2Dgather_sparse", ptr, obj, x, y, comp,  &res);  /* handler order is comp then &res, unlike the parameter order */
+  *isResident = (res != 0);  /* isResident is dereferenced unconditionally, so it must not be null */
+}
+
+template <class T>  /* Sparse value-returning form: delegates to the pointer overload, forwarding isResident and comp unchanged. */
+static __device__  T tex2Dgather(cudaTextureObject_t to, float x, float y, bool* isResident, int comp = 0)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex2Dgather(&ret, to, x, y,  isResident, comp);
+  return ret;
+}
+
+#endif  /* __NV_TEX_SPARSE */
+
+template <typename T>  /* Pointer-output form: forwards (ptr, obj, x, level) to __nv_tex_surf_handler under the "__itex1DLod" name. */
+static __device__ typename __nv_itex_trait<T>::type tex1DLod(T *ptr, cudaTextureObject_t obj, float x, float level)
+{  /* return type is void for every T that __nv_itex_trait specializes */
+  __nv_tex_surf_handler("__itex1DLod", ptr, obj, x, level);
+}
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value. */
+static __device__  T tex1DLod(cudaTextureObject_t texObject, float x, float level)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex1DLod(&ret, texObject, x, level);
+  return ret;
+}
+
+
+template <typename T>  /* Pointer-output form: forwards (ptr, obj, x, y, level) to __nv_tex_surf_handler under the "__itex2DLod" name. */
+static __device__ typename __nv_itex_trait<T>::type tex2DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float level)
+{  /* return type is void for every T that __nv_itex_trait specializes */
+  __nv_tex_surf_handler("__itex2DLod", ptr, obj, x, y, level);
+}
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value. */
+static __device__  T tex2DLod(cudaTextureObject_t texObject, float x, float y, float level)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex2DLod(&ret, texObject, x, y, level);
+  return ret;
+}
+
+#if __NV_TEX_SPARSE
+
+template <typename T>  /* Sparse pointer-output form: same arguments as tex2DLod plus &res, under the "__itex2DLod_sparse" name. */
+static __device__ typename __nv_itex_trait<T>::type tex2DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float level, bool* isResident)
+{
+  unsigned char res;  /* residency byte handed to the handler by address */
+  __nv_tex_surf_handler("__itex2DLod_sparse", ptr, obj, x, y, level, &res);
+  *isResident = (res != 0);  /* isResident is dereferenced unconditionally, so it must not be null */
+}
+
+template <class T>  /* Sparse value-returning form: delegates to the pointer overload, forwarding isResident unchanged. */
+static __device__  T tex2DLod(cudaTextureObject_t texObject, float x, float y, float level, bool* isResident)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex2DLod(&ret, texObject, x, y, level, isResident);
+  return ret;
+}
+
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>  /* Pointer-output form: forwards (ptr, obj, x, y, z, level) to __nv_tex_surf_handler under the "__itex3DLod" name. */
+static __device__ typename __nv_itex_trait<T>::type tex3DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level)
+{  /* return type is void for every T that __nv_itex_trait specializes */
+  __nv_tex_surf_handler("__itex3DLod", ptr, obj, x, y, z, level);
+}
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value. */
+static __device__  T tex3DLod(cudaTextureObject_t texObject, float x, float y, float z, float level)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex3DLod(&ret, texObject, x, y, z, level);
+  return ret;
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>  /* Sparse pointer-output form: same arguments as tex3DLod plus &res, under the "__itex3DLod_sparse" name. */
+static __device__ typename __nv_itex_trait<T>::type tex3DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level, bool* isResident)
+{ 
+  unsigned char res;  /* residency byte handed to the handler by address */
+  __nv_tex_surf_handler("__itex3DLod_sparse", ptr, obj, x, y, z, level, &res);
+  *isResident = (res != 0);  /* isResident is dereferenced unconditionally, so it must not be null */
+}
+
+template <class T>  /* Sparse value-returning form: delegates to the pointer overload, forwarding isResident unchanged. */
+static __device__  T tex3DLod(cudaTextureObject_t texObject, float x, float y, float z, float level, bool* isResident)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex3DLod(&ret, texObject, x, y, z, level, isResident);
+  return ret;
+}
+
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>  /* Pointer-output form: forwards (ptr, obj, x, layer, level) to __nv_tex_surf_handler under the "__itex1DLayeredLod" name. */
+static __device__ typename __nv_itex_trait<T>::type tex1DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, int layer, float level)
+{  /* return type is void for every T that __nv_itex_trait specializes */
+  __nv_tex_surf_handler("__itex1DLayeredLod", ptr, obj, x, layer, level);
+}
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value. */
+static __device__  T tex1DLayeredLod(cudaTextureObject_t texObject, float x, int layer, float level)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex1DLayeredLod(&ret, texObject, x, layer, level);
+  return ret;
+}
+
+
+template <typename T>  /* Pointer-output form: forwards (ptr, obj, x, y, layer, level) to __nv_tex_surf_handler under the "__itex2DLayeredLod" name. */
+static __device__ typename __nv_itex_trait<T>::type tex2DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, float level)
+{  /* return type is void for every T that __nv_itex_trait specializes */
+  __nv_tex_surf_handler("__itex2DLayeredLod", ptr, obj, x, y, layer, level);
+}
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value. */
+static __device__  T tex2DLayeredLod(cudaTextureObject_t texObject, float x, float y, int layer, float level)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex2DLayeredLod(&ret, texObject, x, y, layer, level);
+  return ret;
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>  /* Sparse pointer-output form: same arguments as tex2DLayeredLod plus &res, under the "__itex2DLayeredLod_sparse" name. */
+static __device__ typename __nv_itex_trait<T>::type tex2DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, float level, bool* isResident)
+{ 
+  unsigned char res;  /* residency byte handed to the handler by address */
+  __nv_tex_surf_handler("__itex2DLayeredLod_sparse", ptr, obj, x, y, layer, level, &res);
+  *isResident = (res != 0);  /* isResident is dereferenced unconditionally, so it must not be null */
+}
+
+template <class T>  /* Sparse value-returning form: delegates to the pointer overload, forwarding isResident unchanged. */
+static __device__  T tex2DLayeredLod(cudaTextureObject_t texObject, float x, float y, int layer, float level, bool* isResident)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex2DLayeredLod(&ret, texObject, x, y, layer, level, isResident);
+  return ret;
+}
+#endif  /* __NV_TEX_SPARSE */
+
+template <typename T>  /* Pointer-output form: forwards (ptr, obj, x, y, z, level) to __nv_tex_surf_handler under the "__itexCubemapLod" name. */
+static __device__ typename __nv_itex_trait<T>::type texCubemapLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level)
+{  /* return type is void for every T that __nv_itex_trait specializes */
+  __nv_tex_surf_handler("__itexCubemapLod", ptr, obj, x, y, z, level);
+}
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value. */
+static __device__  T texCubemapLod(cudaTextureObject_t texObject, float x, float y, float z, float level)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  texCubemapLod(&ret, texObject, x, y, z, level);
+  return ret;
+}
+
+
+template <typename T>  /* Pointer-output form under the "__itexCubemapGrad_v2" name; the float4 gradients reach the handler by address. */
+static __device__ typename __nv_itex_trait<T>::type texCubemapGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy)
+{  /* return type is void for every T that __nv_itex_trait specializes */
+  __nv_tex_surf_handler("__itexCubemapGrad_v2", ptr, obj, x, y, z, &dPdx, &dPdy);  /* &dPdx/&dPdy point at this function's by-value parameter copies */
+}
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value. */
+static __device__  T texCubemapGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  texCubemapGrad(&ret, texObject, x, y, z, dPdx, dPdy);
+  return ret;
+}
+
+template <typename T>  /* Pointer-output form: forwards (ptr, obj, x, y, z, layer, level) to __nv_tex_surf_handler under the "__itexCubemapLayeredLod" name. */
+static __device__ typename __nv_itex_trait<T>::type texCubemapLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer, float level)
+{  /* return type is void for every T that __nv_itex_trait specializes */
+  __nv_tex_surf_handler("__itexCubemapLayeredLod", ptr, obj, x, y, z, layer, level);
+}
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value. */
+static __device__  T texCubemapLayeredLod(cudaTextureObject_t texObject, float x, float y, float z, int layer, float level)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  texCubemapLayeredLod(&ret, texObject, x, y, z, layer, level);
+  return ret;
+}
+
+template <typename T>  /* Pointer-output form under the "__itex1DGrad" name; the scalar gradients dPdx/dPdy are passed by value. */
+static __device__ typename __nv_itex_trait<T>::type tex1DGrad(T *ptr, cudaTextureObject_t obj, float x, float dPdx, float dPdy)
+{  /* return type is void for every T that __nv_itex_trait specializes */
+  __nv_tex_surf_handler("__itex1DGrad", ptr, obj, x, dPdx, dPdy);
+}
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value. */
+static __device__  T tex1DGrad(cudaTextureObject_t texObject, float x, float dPdx, float dPdy)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex1DGrad(&ret, texObject, x, dPdx, dPdy);
+  return ret;
+}
+
+
+template <typename T>  /* Pointer-output form under the "__itex2DGrad_v2" name; the float2 gradients reach the handler by address. */
+static __device__ typename __nv_itex_trait<T>::type tex2DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float2 dPdx, float2 dPdy)
+{  /* return type is void for every T that __nv_itex_trait specializes */
+  __nv_tex_surf_handler("__itex2DGrad_v2", ptr, obj, x, y, &dPdx, &dPdy);  /* &dPdx/&dPdy point at this function's by-value parameter copies */
+}
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value. */
+static __device__  T tex2DGrad(cudaTextureObject_t texObject, float x, float y, float2 dPdx, float2 dPdy)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex2DGrad(&ret, texObject, x, y, dPdx, dPdy);
+  return ret;
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>  /* Sparse pointer-output form under the "__itex2DGrad_sparse" name: gradients by address, plus &res. */
+static __device__ typename __nv_itex_trait<T>::type tex2DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float2 dPdx, float2 dPdy, bool* isResident)
+{ 
+  unsigned char res;  /* residency byte handed to the handler by address */
+  __nv_tex_surf_handler("__itex2DGrad_sparse", ptr, obj, x, y, &dPdx, &dPdy, &res);
+  *isResident = (res != 0);  /* isResident is dereferenced unconditionally, so it must not be null */
+}
+
+template <class T>  /* Sparse value-returning form: delegates to the pointer overload, forwarding isResident unchanged. */
+static __device__  T tex2DGrad(cudaTextureObject_t texObject, float x, float y, float2 dPdx, float2 dPdy, bool* isResident)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex2DGrad(&ret, texObject, x, y, dPdx, dPdy, isResident);
+  return ret;
+}
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>  /* Pointer-output form under the "__itex3DGrad_v2" name; the float4 gradients reach the handler by address. */
+static __device__ typename __nv_itex_trait<T>::type tex3DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy)
+{  /* return type is void for every T that __nv_itex_trait specializes */
+  __nv_tex_surf_handler("__itex3DGrad_v2", ptr, obj, x, y, z, &dPdx, &dPdy);  /* &dPdx/&dPdy point at this function's by-value parameter copies */
+}
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value. */
+static __device__  T tex3DGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex3DGrad(&ret, texObject, x, y, z, dPdx, dPdy);
+  return ret;
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>  /* Sparse pointer-output form under the "__itex3DGrad_sparse" name: gradients by address, plus &res. */
+static __device__ typename __nv_itex_trait<T>::type tex3DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy, bool* isResident)
+{ 
+  unsigned char res;  /* residency byte handed to the handler by address */
+  __nv_tex_surf_handler("__itex3DGrad_sparse", ptr, obj, x, y, z, &dPdx, &dPdy, &res);
+  *isResident = (res != 0);  /* isResident is dereferenced unconditionally, so it must not be null */
+}
+
+template <class T>  /* Sparse value-returning form: delegates to the pointer overload, forwarding isResident unchanged. */
+static __device__  T tex3DGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy, bool* isResident)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex3DGrad(&ret, texObject, x, y, z, dPdx, dPdy, isResident);
+  return ret;
+}
+
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>  /* Pointer-output form under the "__itex1DLayeredGrad" name; the scalar gradients dPdx/dPdy are passed by value. */
+static __device__ typename __nv_itex_trait<T>::type tex1DLayeredGrad(T *ptr, cudaTextureObject_t obj, float x, int layer, float dPdx, float dPdy)
+{  /* return type is void for every T that __nv_itex_trait specializes */
+  __nv_tex_surf_handler("__itex1DLayeredGrad", ptr, obj, x, layer, dPdx, dPdy);
+}
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value. */
+static __device__  T tex1DLayeredGrad(cudaTextureObject_t texObject, float x, int layer, float dPdx, float dPdy)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex1DLayeredGrad(&ret, texObject, x, layer, dPdx, dPdy);
+  return ret;
+}
+
+
+template <typename T>  /* Pointer-output form under the "__itex2DLayeredGrad_v2" name; the float2 gradients reach the handler by address. */
+static __device__ typename __nv_itex_trait<T>::type tex2DLayeredGrad(T * ptr, cudaTextureObject_t obj, float x, float y, int layer, float2 dPdx, float2 dPdy)
+{  /* return type is void for every T that __nv_itex_trait specializes */
+  __nv_tex_surf_handler("__itex2DLayeredGrad_v2", ptr, obj, x, y, layer, &dPdx, &dPdy);  /* &dPdx/&dPdy point at this function's by-value parameter copies */
+}
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value. */
+static __device__  T tex2DLayeredGrad(cudaTextureObject_t texObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex2DLayeredGrad(&ret, texObject, x, y, layer, dPdx, dPdy);
+  return ret;
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>  /* Sparse pointer-output form under the "__itex2DLayeredGrad_sparse" name: gradients by address, plus &res. */
+static __device__ typename __nv_itex_trait<T>::type tex2DLayeredGrad(T * ptr, cudaTextureObject_t obj, float x, float y, int layer, float2 dPdx, float2 dPdy, bool* isResident)
+{ 
+  unsigned char res;  /* residency byte handed to the handler by address */
+  __nv_tex_surf_handler("__itex2DLayeredGrad_sparse", ptr, obj, x, y, layer, &dPdx, &dPdy, &res);
+  *isResident = (res != 0);  /* isResident is dereferenced unconditionally, so it must not be null */
+}
+
+template <class T>  /* Sparse value-returning form: delegates to the pointer overload, forwarding isResident unchanged. */
+static __device__  T tex2DLayeredGrad(cudaTextureObject_t texObject, float x, float y, int layer, float2 dPdx, float2 dPdy, bool* isResident)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  tex2DLayeredGrad(&ret, texObject, x, y, layer, dPdx, dPdy, isResident);
+  return ret;
+}
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>  /* Pointer-output form under the "__itexCubemapLayeredGrad_v2" name; the float4 gradients reach the handler by address. */
+static __device__ typename __nv_itex_trait<T>::type texCubemapLayeredGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
+{  /* return type is void for every T that __nv_itex_trait specializes */
+  __nv_tex_surf_handler("__itexCubemapLayeredGrad_v2", ptr, obj, x, y, z, layer, &dPdx, &dPdy);  /* &dPdx/&dPdy point at this function's by-value parameter copies */
+}
+
+template <class T>  /* Value-returning form: passes a local T to the pointer overload and returns it by value. */
+static __device__  T texCubemapLayeredGrad(cudaTextureObject_t texObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
+{
+  T ret;  /* not initialized here; expected to be written through &ret by the call below */
+  texCubemapLayeredGrad(&ret, texObject, x, y, z, layer, dPdx, dPdy);
+  return ret;
+}
+
+#undef __NV_TEX_SPARSE  /* the gating macro is not left defined for code that includes this header */
+
+#endif // __cplusplus && __CUDACC__
+#endif // __TEXTURE_INDIRECT_FUNCTIONS_H__
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/texture_types.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/texture_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..a3e3e90ef5c02d7f3e62178792fd10f93ba4d85f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/texture_types.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__TEXTURE_TYPES_H__)
+#define __TEXTURE_TYPES_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "driver_types.h"
+
+#ifndef __CUDACC_RTC_MINIMAL__
+
+/**
+ * \addtogroup CUDART_TYPES
+ *
+ * @{
+ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#define cudaTextureType1D              0x01  /* non-layered values: 0x01, 0x02, 0x03, 0x0C */
+#define cudaTextureType2D              0x02
+#define cudaTextureType3D              0x03
+#define cudaTextureTypeCubemap         0x0C
+#define cudaTextureType1DLayered       0xF1  /* layered values: the non-layered value with the high nibble set to 0xF */
+#define cudaTextureType2DLayered       0xF2
+#define cudaTextureTypeCubemapLayered  0xFC
+
+/**
+ * CUDA texture address modes (element type of cudaTextureDesc::addressMode)
+ */
+enum __device_builtin__ cudaTextureAddressMode
+{
+    cudaAddressModeWrap   = 0,    /**< Wrapping address mode */
+    cudaAddressModeClamp  = 1,    /**< Clamp to edge address mode */
+    cudaAddressModeMirror = 2,    /**< Mirror address mode */
+    cudaAddressModeBorder = 3     /**< Border address mode */
+};
+
+/**
+ * CUDA texture filter modes (type of cudaTextureDesc::filterMode and cudaTextureDesc::mipmapFilterMode)
+ */
+enum __device_builtin__ cudaTextureFilterMode
+{
+    cudaFilterModePoint  = 0,     /**< Point filter mode */
+    cudaFilterModeLinear = 1      /**< Linear filter mode */
+};
+
+/**
+ * CUDA texture read modes (type of cudaTextureDesc::readMode)
+ */
+enum __device_builtin__ cudaTextureReadMode
+{
+    cudaReadModeElementType     = 0,  /**< Read texture as specified element type */
+    cudaReadModeNormalizedFloat = 1   /**< Read texture as normalized float */
+};
+
+/**
+ * CUDA texture descriptor: addressing, filtering, read-mode and mipmap settings
+ */
+struct __device_builtin__ cudaTextureDesc
+{
+    /**
+     * Texture address mode, one entry per dimension (up to 3)
+     */
+    enum cudaTextureAddressMode addressMode[3];
+    /**
+     * Texture filter mode
+     */
+    enum cudaTextureFilterMode  filterMode;
+    /**
+     * Texture read mode
+     */
+    enum cudaTextureReadMode    readMode;
+    /**
+     * Perform sRGB->linear conversion during texture read (int used as a flag)
+     */
+    int                         sRGB;
+    /**
+     * Texture border color (4 float components)
+     */
+    float                       borderColor[4];
+    /**
+     * Indicates whether texture coordinates are normalized or not (int used as a flag)
+     */
+    int                         normalizedCoords;
+    /**
+     * Limit to the anisotropy ratio
+     */
+    unsigned int                maxAnisotropy;
+    /**
+     * Mipmap filter mode (same enum type as filterMode)
+     */
+    enum cudaTextureFilterMode  mipmapFilterMode;
+    /**
+     * Offset applied to the supplied mipmap level
+     */
+    float                       mipmapLevelBias;
+    /**
+     * Lower end of the mipmap level range to clamp access to
+     */
+    float                       minMipmapLevelClamp;
+    /**
+     * Upper end of the mipmap level range to clamp access to
+     */
+    float                       maxMipmapLevelClamp;
+    /**
+     * Disable any trilinear filtering optimizations (int used as a flag)
+     */
+    int                         disableTrilinearOptimization;
+    /**
+     * Enable seamless cube map filtering (int used as a flag)
+     */
+    int                         seamlessCubemap;
+};
+
+/**
+ * An opaque value that represents a CUDA texture object (stored as an unsigned long long)
+ */
+typedef __device_builtin__ unsigned long long cudaTextureObject_t;
+
+
+/** @} */ /* END CUDART_TYPES */
+
+#endif  /* !__CUDACC_RTC_MINIMAL__ */
+#endif /* !__TEXTURE_TYPES_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/vector_functions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/vector_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..38a52e3936ca88f6f5437fe8731b6b1cfdc7ca02
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/vector_functions.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__VECTOR_FUNCTIONS_H__)
+#define __VECTOR_FUNCTIONS_H__
+
+/* NOTE: For NVRTC, these declarations have been moved into the compiler 
+   (to reduce compile time) */
+#define EXCLUDE_FROM_RTC
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#if defined(__CUDACC_RTC__)
+#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__  /* NVRTC build: no static/inline qualifiers */
+#else /* !__CUDACC_RTC__ */
+#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__  /* static inline: each translation unit gets its own copy */
+#endif /* __CUDACC_RTC__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x);  /* result has .x set */
+
+__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x);  /* result has .x set */
+
+__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y);  /* result has .x, .y set */
+
+__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y);  /* result has .x, .y set */
+
+__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z);  /* result has .x, .y, .z set */
+
+__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z);  /* result has .x, .y, .z set */
+
+__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w);  /* result has .x, .y, .z, .w set */
+
+__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w);  /* result has .x, .y, .z, .w set */
+
+__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x);  /* result has .x set */
+
+__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x);  /* result has .x set */
+
+__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y);  /* result has .x, .y set */
+
+__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y);  /* result has .x, .y set */
+
+__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z);  /* result has .x, .y, .z set */
+
+__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z);  /* result has .x, .y, .z set */
+
+__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w);  /* result has .x, .y, .z, .w set */
+
+__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w);  /* result has .x, .y, .z, .w set */
+
+__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x);  /* result has .x set */
+
+__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x);  /* result has .x set */
+
+__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y);  /* result has .x, .y set */
+
+__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y);  /* result has .x, .y set */
+
+__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z);  /* result has .x, .y, .z set */
+
+__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z);  /* result has .x, .y, .z set */
+
+__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w);  /* result has .x, .y, .z, .w set */
+
+__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w);  /* result has .x, .y, .z, .w set */
+
+__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x);  /* result has .x set */
+
+__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x);  /* result has .x set */
+
+__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y);  /* result has .x, .y set */
+
+__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y);  /* result has .x, .y set */
+
+__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z);  /* result has .x, .y, .z set */
+
+__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z);  /* result has .x, .y, .z set */
+
+__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w);  /* result has .x, .y, .z, .w set */
+
+__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w);  /* result has .x, .y, .z, .w set */
+
+__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x);  /* result has .x set */
+
+__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y);  /* result has .x, .y set */
+
+__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z);  /* result has .x, .y, .z set */
+
+__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w);  /* result has .x, .y, .z, .w set */
+
+__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x);  /* result has .x set */
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x);  /* result has .x set */
+
+__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y);  /* result has .x, .y set */
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y);  /* result has .x, .y set */
+
+__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z);  /* result has .x, .y, .z set */
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z);  /* result has .x, .y, .z set */
+
+__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w);  /* result has .x, .y, .z, .w set */
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w);  /* result has .x, .y, .z, .w set */
+
+__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x);  /* result has .x set */
+
+__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y);  /* result has .x, .y set */
+
+__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z);  /* result has .x, .y, .z set */
+
+__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w);  /* result has .x, .y, .z, .w set */
+
+#undef __VECTOR_FUNCTIONS_DECL__
+
+#if !defined(__CUDACC_RTC__)
+#include "vector_functions.hpp"
+#endif /* !__CUDACC_RTC__ */
+
+#undef EXCLUDE_FROM_RTC
+
+#endif /* !__VECTOR_FUNCTIONS_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/vector_functions.hpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/vector_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab69cf38045a7c44dae67e7149d49ac4c6148747
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/vector_functions.hpp
@@ -0,0 +1,316 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__VECTOR_FUNCTIONS_HPP__)
+#define __VECTOR_FUNCTIONS_HPP__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#if defined(__CUDACC_RTC__)
+#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__  /* NVRTC build: no static/inline qualifiers */
+#else /* !__CUDACC_RTC__ */
+#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__  /* static inline: each translation unit gets its own copy */
+#endif /* __CUDACC_RTC__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x)
+{
+  char1 v; v.x = x; return v;  /* single component: x */
+}
+
+__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x)
+{
+  uchar1 v; v.x = x; return v;  /* single component: x */
+}
+
+__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y)
+{
+  char2 v; v.x = x; v.y = y; return v;  /* two components: x, y */
+}
+
+__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y)
+{
+  uchar2 v; v.x = x; v.y = y; return v;  /* two components: x, y */
+}
+
+__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z)
+{
+  char3 v; v.x = x; v.y = y; v.z = z; return v;  /* three components: x, y, z */
+}
+
+__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z)
+{
+  uchar3 v; v.x = x; v.y = y; v.z = z; return v;  /* three components: x, y, z */
+}
+
+__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w)
+{
+  char4 v; v.x = x; v.y = y; v.z = z; v.w = w; return v;  /* four components: x, y, z, w */
+}
+
+__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w)
+{
+  uchar4 v; v.x = x; v.y = y; v.z = z; v.w = w; return v;  /* four components: x, y, z, w */
+}
+
+__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x)
+{
+  short1 v; v.x = x; return v;  /* single component: x */
+}
+
+__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x)
+{
+  ushort1 v; v.x = x; return v;  /* single component: x */
+}
+
+__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y)
+{
+  short2 v; v.x = x; v.y = y; return v;  /* two components: x, y */
+}
+
+__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y)
+{
+  ushort2 v; v.x = x; v.y = y; return v;  /* two components: x, y */
+}
+
+__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z)
+{
+  short3 v; v.x = x; v.y = y; v.z = z; return v;  /* three components: x, y, z */
+}
+
+__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z)
+{
+  ushort3 v; v.x = x; v.y = y; v.z = z; return v;  /* three components: x, y, z */
+}
+
+__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w)
+{
+  short4 v; v.x = x; v.y = y; v.z = z; v.w = w; return v;  /* four components: x, y, z, w */
+}
+
+__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
+{
+  ushort4 v; v.x = x; v.y = y; v.z = z; v.w = w; return v;  /* four components: x, y, z, w */
+}
+
+__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x)
+{
+  int1 v; v.x = x; return v;  /* single component: x */
+}
+
+__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x)
+{
+  uint1 v; v.x = x; return v;  /* single component: x */
+}
+
+__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y)
+{
+  int2 v; v.x = x; v.y = y; return v;  /* two components: x, y */
+}
+
+__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y)
+{
+  uint2 v; v.x = x; v.y = y; return v;  /* two components: x, y */
+}
+
+__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z)
+{
+  int3 v; v.x = x; v.y = y; v.z = z; return v;  /* three components: x, y, z */
+}
+
+__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z)
+{
+  uint3 v; v.x = x; v.y = y; v.z = z; return v;  /* three components: x, y, z */
+}
+
+__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w)
+{
+  int4 v; v.x = x; v.y = y; v.z = z; v.w = w; return v;  /* four components: x, y, z, w */
+}
+
+__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w)
+{
+  uint4 v; v.x = x; v.y = y; v.z = z; v.w = w; return v;  /* four components: x, y, z, w */
+}
+
+__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x)
+{
+  long1 v; v.x = x; return v;  /* single component: x */
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x)
+{
+  ulong1 v; v.x = x; return v;  /* single component: x */
+}
+
+__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y)
+{
+  long2 v; v.x = x; v.y = y; return v;  /* two components: x, y */
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y)
+{
+  ulong2 v; v.x = x; v.y = y; return v;  /* two components: x, y */
+}
+
+__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z)
+{
+  long3 v; v.x = x; v.y = y; v.z = z; return v;  /* three components: x, y, z */
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z)
+{
+  ulong3 v; v.x = x; v.y = y; v.z = z; return v;  /* three components: x, y, z */
+}
+
+__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w)
+{
+  long4 v; v.x = x; v.y = y; v.z = z; v.w = w; return v;  /* four components: x, y, z, w */
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w)
+{
+  ulong4 v; v.x = x; v.y = y; v.z = z; v.w = w; return v;  /* four components: x, y, z, w */
+}
+
+__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x)
+{
+  float1 v; v.x = x; return v;  /* single component: x */
+}
+
+__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y)
+{
+  float2 v; v.x = x; v.y = y; return v;  /* two components: x, y */
+}
+
+__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z)
+{
+  float3 v; v.x = x; v.y = y; v.z = z; return v;  /* three components: x, y, z */
+}
+
+__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w)
+{
+  float4 v; v.x = x; v.y = y; v.z = z; v.w = w; return v;  /* four components: x, y, z, w */
+}
+
+__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x)
+{
+  longlong1 v; v.x = x; return v;  /* single component: x */
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x)
+{
+  ulonglong1 v; v.x = x; return v;  /* single component: x */
+}
+
+__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y)
+{
+  longlong2 v; v.x = x; v.y = y; return v;  /* two components: x, y */
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y)
+{
+  ulonglong2 v; v.x = x; v.y = y; return v;  /* two components: x, y */
+}
+
+__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z)
+{
+  longlong3 v; v.x = x; v.y = y; v.z = z; return v;  /* three components: x, y, z */
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z)
+{
+  ulonglong3 v; v.x = x; v.y = y; v.z = z; return v;  /* three components: x, y, z */
+}
+
+__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w)
+{
+  longlong4 v; v.x = x; v.y = y; v.z = z; v.w = w; return v;  /* four components: x, y, z, w */
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w)
+{
+  ulonglong4 v; v.x = x; v.y = y; v.z = z; v.w = w; return v;  /* four components: x, y, z, w */
+}
+
+__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x)
+{
+  double1 v; v.x = x; return v;  /* single component: x */
+}
+
+__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y)
+{
+  double2 v; v.x = x; v.y = y; return v;  /* two components: x, y */
+}
+
+__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z)
+{
+  double3 v; v.x = x; v.y = y; v.z = z; return v;  /* three components: x, y, z */
+}
+
+__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w)
+{
+  double4 v; v.x = x; v.y = y; v.z = z; v.w = w; return v;  /* four components: x, y, z, w */
+}
+
+#undef __VECTOR_FUNCTIONS_DECL__
+
+#endif /* !__VECTOR_FUNCTIONS_HPP__ */
+
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/vector_types.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/vector_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..6a755e65f36b56644cd25d08603e10a6efc3fb8b
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/backends/nvidia/include/vector_types.h
@@ -0,0 +1,449 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__VECTOR_TYPES_H__)
+#define __VECTOR_TYPES_H__
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_VECTOR_TYPES_H__
+#endif
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#ifndef __DOXYGEN_ONLY__
+#include "crt/host_defines.h"
+#endif
+
+/* The NVRTC compiler defines these types itself rather than taking them from
+ * this header (to reduce compile time).
+*/
+#ifndef __CUDACC_RTC_BUILTIN_VECTOR_TYPES__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined(__CUDACC__) && !defined(__CUDACC_RTC__) && \
+    defined(_WIN32) && !defined(_WIN64)
+
+#pragma warning(push)
+#pragma warning(disable: 4201 4408)
+
+/* Variant selected for 32-bit Windows builds that are not compiled by
+ * nvcc/NVRTC.  It declares `struct tag` without an alignment attribute:
+ * the members live in an anonymous struct that shares an anonymous union
+ * with a struct holding only unnamed long long int bit-fields.
+ * NOTE(review): presumably the long long bit-fields are what give the type
+ * its 8-byte alignment on this toolchain -- confirm against MSVC layout rules. */
+#define __cuda_builtin_vector_align8(tag, members) \
+struct __device_builtin__ tag                      \
+{                                                  \
+    union                                          \
+    {                                              \
+        struct { members };                        \
+        struct { long long int :1,:0; };           \
+    };                                             \
+}
+
+#else /* !__CUDACC__ && !__CUDACC_RTC__ && _WIN32 && !_WIN64 */
+
+/* Default variant: declares `struct tag` with the given members and an
+ * explicit __align__(8) attribute. */
+#define __cuda_builtin_vector_align8(tag, members) \
+struct __device_builtin__ __align__(8) tag         \
+{                                                  \
+    members                                        \
+}
+
+#endif /* !__CUDACC__ && !__CUDACC_RTC__ && _WIN32 && !_WIN64 */
+
+/* 1- to 4-component vectors of signed char / unsigned char.
+ * The 2- and 4-component forms are declared with __align__(2) and
+ * __align__(4); the 1- and 3-component forms carry no alignment attribute. */
+struct __device_builtin__ char1
+{
+    signed char x;
+};
+
+struct __device_builtin__ uchar1
+{
+    unsigned char x;
+};
+
+
+struct __device_builtin__ __align__(2) char2
+{
+    signed char x, y;
+};
+
+struct __device_builtin__ __align__(2) uchar2
+{
+    unsigned char x, y;
+};
+
+struct __device_builtin__ char3
+{
+    signed char x, y, z;
+};
+
+struct __device_builtin__ uchar3
+{
+    unsigned char x, y, z;
+};
+
+struct __device_builtin__ __align__(4) char4
+{
+    signed char x, y, z, w;
+};
+
+struct __device_builtin__ __align__(4) uchar4
+{
+    unsigned char x, y, z, w;
+};
+
+/* 1- to 4-component vectors of short / unsigned short.
+ * short2/ushort2 are declared with __align__(4); short4/ushort4 are produced
+ * by the __cuda_builtin_vector_align8 macro; the 1- and 3-component forms
+ * carry no alignment attribute. */
+struct __device_builtin__ short1
+{
+    short x;
+};
+
+struct __device_builtin__ ushort1
+{
+    unsigned short x;
+};
+
+struct __device_builtin__ __align__(4) short2
+{
+    short x, y;
+};
+
+struct __device_builtin__ __align__(4) ushort2
+{
+    unsigned short x, y;
+};
+
+struct __device_builtin__ short3
+{
+    short x, y, z;
+};
+
+struct __device_builtin__ ushort3
+{
+    unsigned short x, y, z;
+};
+
+__cuda_builtin_vector_align8(short4, short x; short y; short z; short w;);
+__cuda_builtin_vector_align8(ushort4, unsigned short x; unsigned short y; unsigned short z; unsigned short w;);
+
+/* 1- to 4-component vectors of int / unsigned int.
+ * int2/uint2 are produced by the __cuda_builtin_vector_align8 macro;
+ * int4/uint4 are declared with __builtin_align__(16); the 1- and 3-component
+ * forms carry no alignment attribute. */
+struct __device_builtin__ int1
+{
+    int x;
+};
+
+struct __device_builtin__ uint1
+{
+    unsigned int x;
+};
+
+__cuda_builtin_vector_align8(int2, int x; int y;);
+__cuda_builtin_vector_align8(uint2, unsigned int x; unsigned int y;);
+
+struct __device_builtin__ int3
+{
+    int x, y, z;
+};
+
+struct __device_builtin__ uint3
+{
+    unsigned int x, y, z;
+};
+
+struct __device_builtin__ __builtin_align__(16) int4
+{
+    int x, y, z, w;
+};
+
+struct __device_builtin__ __builtin_align__(16) uint4
+{
+    unsigned int x, y, z, w;
+};
+
+/* 1- to 4-component vectors of long int / unsigned long int.
+ * long2/ulong2 depend on the platform: with _WIN32 defined they come from
+ * the __cuda_builtin_vector_align8 macro, otherwise they are aligned to
+ * twice the size of the element type.  long4/ulong4 are declared with
+ * __builtin_align__(16). */
+struct __device_builtin__ long1
+{
+    long int x;
+};
+
+struct __device_builtin__ ulong1
+{
+    unsigned long x;
+};
+
+#if defined(_WIN32)
+__cuda_builtin_vector_align8(long2, long int x; long int y;);
+__cuda_builtin_vector_align8(ulong2, unsigned long int x; unsigned long int y;);
+#else /* !_WIN32 */
+
+struct __device_builtin__ __align__(2*sizeof(long int)) long2
+{
+    long int x, y;
+};
+
+struct __device_builtin__ __align__(2*sizeof(unsigned long int)) ulong2
+{
+    unsigned long int x, y;
+};
+
+#endif /* _WIN32 */
+
+struct __device_builtin__ long3
+{
+    long int x, y, z;
+};
+
+struct __device_builtin__ ulong3
+{
+    unsigned long int x, y, z;
+};
+
+struct __device_builtin__ __builtin_align__(16) long4
+{
+    long int x, y, z, w;
+};
+
+struct __device_builtin__ __builtin_align__(16) ulong4
+{
+    unsigned long int x, y, z, w;
+};
+
+/* 1- to 4-component vectors of float.
+ * float2 normally comes from the __cuda_builtin_vector_align8 macro.  For
+ * non-nvcc builds with GCC 4.6 on ARM with __ARM_PCS_VFP defined it is
+ * declared by hand instead, with an extra zero-length array member whose
+ * name is then poisoned so no later code can refer to it.
+ * NOTE(review): the member name suggests this works around an internal
+ * compiler error in that GCC release -- confirm.
+ * float4 is declared with __builtin_align__(16). */
+struct __device_builtin__ float1
+{
+    float x;
+};
+
+#if !defined(__CUDACC__) && defined(__arm__) && \
+    defined(__ARM_PCS_VFP) && __GNUC__ == 4 && __GNUC_MINOR__ == 6
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-pedantic"
+
+struct __device_builtin__ __attribute__((aligned(8))) float2
+{
+    float x; float y; float __cuda_gnu_arm_ice_workaround[0];
+};
+
+#pragma GCC poison __cuda_gnu_arm_ice_workaround
+#pragma GCC diagnostic pop
+
+#else /* !__CUDACC__ && __arm__ && __ARM_PCS_VFP &&
+         __GNUC__ == 4&& __GNUC_MINOR__ == 6 */
+
+__cuda_builtin_vector_align8(float2, float x; float y;);
+
+#endif /* !__CUDACC__ && __arm__ && __ARM_PCS_VFP &&
+          __GNUC__ == 4&& __GNUC_MINOR__ == 6 */
+
+struct __device_builtin__ float3
+{
+    float x, y, z;
+};
+
+struct __device_builtin__ __builtin_align__(16) float4
+{
+    float x, y, z, w;
+};
+
+/* 1- to 4-component vectors of long long int / unsigned long long int.
+ * The 2- and 4-component forms are declared with __builtin_align__(16);
+ * the 1- and 3-component forms carry no alignment attribute. */
+struct __device_builtin__ longlong1
+{
+    long long int x;
+};
+
+struct __device_builtin__ ulonglong1
+{
+    unsigned long long int x;
+};
+
+struct __device_builtin__ __builtin_align__(16) longlong2
+{
+    long long int x, y;
+};
+
+struct __device_builtin__ __builtin_align__(16) ulonglong2
+{
+    unsigned long long int x, y;
+};
+
+struct __device_builtin__ longlong3
+{
+    long long int x, y, z;
+};
+
+struct __device_builtin__ ulonglong3
+{
+    unsigned long long int x, y, z;
+};
+
+struct __device_builtin__ __builtin_align__(16) longlong4
+{
+    long long int x, y, z ,w;
+};
+
+struct __device_builtin__ __builtin_align__(16) ulonglong4
+{
+    unsigned long long int x, y, z, w;
+};
+
+/* 1- to 4-component vectors of double.
+ * double2 and double4 are declared with __builtin_align__(16); double1 and
+ * double3 carry no alignment attribute. */
+struct __device_builtin__ double1
+{
+    double x;
+};
+
+struct __device_builtin__ __builtin_align__(16) double2
+{
+    double x, y;
+};
+
+struct __device_builtin__ double3
+{
+    double x, y, z;
+};
+
+struct __device_builtin__ __builtin_align__(16) double4
+{
+    double x, y, z, w;
+};
+
+#if !defined(__CUDACC__) && defined(_WIN32) && !defined(_WIN64)
+
+#pragma warning(pop)
+
+#endif /* !__CUDACC__ && _WIN32 && !_WIN64 */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+/* Give every vector struct declared above a typedef of the same name, so the
+ * types can be written without the `struct` keyword. */
+typedef __device_builtin__ struct char1 char1;
+typedef __device_builtin__ struct uchar1 uchar1;
+typedef __device_builtin__ struct char2 char2;
+typedef __device_builtin__ struct uchar2 uchar2;
+typedef __device_builtin__ struct char3 char3;
+typedef __device_builtin__ struct uchar3 uchar3;
+typedef __device_builtin__ struct char4 char4;
+typedef __device_builtin__ struct uchar4 uchar4;
+typedef __device_builtin__ struct short1 short1;
+typedef __device_builtin__ struct ushort1 ushort1;
+typedef __device_builtin__ struct short2 short2;
+typedef __device_builtin__ struct ushort2 ushort2;
+typedef __device_builtin__ struct short3 short3;
+typedef __device_builtin__ struct ushort3 ushort3;
+typedef __device_builtin__ struct short4 short4;
+typedef __device_builtin__ struct ushort4 ushort4;
+typedef __device_builtin__ struct int1 int1;
+typedef __device_builtin__ struct uint1 uint1;
+typedef __device_builtin__ struct int2 int2;
+typedef __device_builtin__ struct uint2 uint2;
+typedef __device_builtin__ struct int3 int3;
+typedef __device_builtin__ struct uint3 uint3;
+typedef __device_builtin__ struct int4 int4;
+typedef __device_builtin__ struct uint4 uint4;
+typedef __device_builtin__ struct long1 long1;
+typedef __device_builtin__ struct ulong1 ulong1;
+typedef __device_builtin__ struct long2 long2;
+typedef __device_builtin__ struct ulong2 ulong2;
+typedef __device_builtin__ struct long3 long3;
+typedef __device_builtin__ struct ulong3 ulong3;
+typedef __device_builtin__ struct long4 long4;
+typedef __device_builtin__ struct ulong4 ulong4;
+typedef __device_builtin__ struct float1 float1;
+typedef __device_builtin__ struct float2 float2;
+typedef __device_builtin__ struct float3 float3;
+typedef __device_builtin__ struct float4 float4;
+typedef __device_builtin__ struct longlong1 longlong1;
+typedef __device_builtin__ struct ulonglong1 ulonglong1;
+typedef __device_builtin__ struct longlong2 longlong2;
+typedef __device_builtin__ struct ulonglong2 ulonglong2;
+typedef __device_builtin__ struct longlong3 longlong3;
+typedef __device_builtin__ struct ulonglong3 ulonglong3;
+typedef __device_builtin__ struct longlong4 longlong4;
+typedef __device_builtin__ struct ulonglong4 ulonglong4;
+typedef __device_builtin__ struct double1 double1;
+typedef __device_builtin__ struct double2 double2;
+typedef __device_builtin__ struct double3 double3;
+typedef __device_builtin__ struct double4 double4;
+
+#undef  __cuda_builtin_vector_align8
+
+#endif /* !defined(__CUDACC_RTC_BUILTIN_VECTOR_TYPES__) */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+/* Triple of unsigned int components x, y, z.
+ * When compiled as C++ the struct also gets a constructor in which every
+ * component defaults to 1, a converting constructor from uint3, and a
+ * conversion operator back to uint3.  For C++11 and later these are
+ * constexpr; older dialects get the same members without constexpr. */
+struct __device_builtin__ dim3
+{
+    unsigned int x, y, z;
+#if defined(__cplusplus)
+#if __cplusplus >= 201103L
+    __host__ __device__ constexpr dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {}
+    __host__ __device__ constexpr dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
+    __host__ __device__ constexpr operator uint3(void) const { return uint3{x, y, z}; }
+#else
+    /* Pre-C++11: no brace initialisation, so the uint3 is filled member by member. */
+    __host__ __device__ dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {}
+    __host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
+    __host__ __device__ operator uint3(void) const { uint3 t; t.x = x; t.y = y; t.z = z; return t; }
+#endif
+#endif /* __cplusplus */
+};
+
+typedef __device_builtin__ struct dim3 dim3;
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_VECTOR_TYPES_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_VECTOR_TYPES_H__
+#endif
+
+#endif /* !__VECTOR_TYPES_H__ */
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..127ccf90fbef0c9b89a1d899cedeecc6ed52b3e3
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/__init__.py
@@ -0,0 +1,7 @@
+from .compiler import CompiledKernel, ASTSource, IRSource, compile, make_backend, LazyDict, get_cache_key
+from .errors import CompilationError
+
+__all__ = [
+    "compile", "make_backend", "ASTSource", "IRSource", "CompiledKernel", "CompilationError", "LazyDict",
+    "get_cache_key"
+]
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0ac83c4f34fd09cdb5bacb97e79f7edd1fb63edd
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/__pycache__/compiler.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/__pycache__/compiler.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b40c3fe55b1d744952a41905558087ccd4c81f04
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/__pycache__/compiler.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/__pycache__/errors.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/__pycache__/errors.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f71ed16d1ee38b967858f092f04759935dc2e559
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/__pycache__/errors.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/__pycache__/make_launcher.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/__pycache__/make_launcher.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8bfdc8bd585096df88765bc999e61aaacda73689
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/__pycache__/make_launcher.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/code_generator.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/code_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..c572f81e3e13fe7f3f6f2dd72542c3ef737e581d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/code_generator.py
@@ -0,0 +1,1639 @@
+import ast
+import builtins
+import contextlib
+import copy
+import inspect
+import re
+import warnings
+import textwrap
+from dataclasses import dataclass
+from types import ModuleType
+from typing import Any, Callable, Dict, Optional, Tuple, Type, Union, Iterable, List
+
+from .. import knobs, language
+from .._C.libtriton import ir, gluon_ir
+from ..language import constexpr, str_to_ty, tensor, tuple as tl_tuple
+from ..language.core import _unwrap_if_constexpr, base_value, base_type
+# ideally we wouldn't need any runtime component
+from ..runtime.jit import get_jit_fn_file_line, get_full_name, JITCallable, BoundConstexprFunction, ConstexprFunction, JITFunction
+from .._utils import find_paths_if, get_iterable_path, set_iterable_path, is_namedtuple
+
+from .errors import (CompilationError, CompileTimeAssertionFailure, UnsupportedLanguageConstruct)
+
+
+def check_identifier_legality(name, type):
+    pattern = r'^[a-zA-Z_][a-zA-Z0-9_]*$'
+    if not re.match(pattern, name):
+        raise CompilationError(f"invalid {type} identifier: {name}", name)
+    return name
+
+
+def mangle_fn(name, arg_tys, constants, caller_context):
+    # doesn't mangle ret type, which must be a function of arg tys
+    mangled_arg_names = '_'.join([ty.mangle() for ty in arg_tys])
+    mangled_constants = '_'.join([f'{i}c{repr(constants[i])}' for i in sorted(constants)])
+    mangled_constants = mangled_constants.replace('.', '_d_')
+    mangled_constants = mangled_constants.replace("'", '_sq_')
+    # [ and ] are not allowed in LLVM identifiers
+    mangled_constants = mangled_constants.replace('[', '_').replace(']', '_')
+    ret = f'{name}__{mangled_arg_names}__{mangled_constants}'
+    if caller_context is not None:
+        ret += caller_context.mangle()
+    return ret
+
+
+def _is_triton_value(o: Any) -> bool:
+    """Return True when ``o`` is an instance of ``base_value``."""
+    return isinstance(o, base_value)
+
+
+def _is_triton_tensor(o: Any) -> bool:
+    """Return True when ``o`` is an instance of ``tensor``."""
+    return isinstance(o, tensor)
+
+
+def _is_constexpr(o: Any) -> bool:
+    return o is None or isinstance(o, (constexpr, language.core.dtype, JITCallable))
+
+
+def _is_non_scalar_tensor(o: Any) -> bool:
+    return _is_triton_tensor(o) and (o.type.is_block() and o.type.numel != 1)
+
+
+def _is_list_like(o: Any) -> bool:
+    """Return True when ``o`` is a Python list or tuple (including subclasses)."""
+    return isinstance(o, (list, tuple))
+
+
+def _check_fn_args(node, fn, args):
+    if fn.noinline:
+        for idx, arg in enumerate(args):
+            if not _is_constexpr(arg) and _is_non_scalar_tensor(arg):
+                raise UnsupportedLanguageConstruct(
+                    fn.src, node,
+                    f'Function {fn.__name__} is marked noinline, but was called with non-scalar argument {fn.arg_names[idx]}:{arg}'
+                )
+
+
+def _apply_to_tuple_values(value, fn):
+    if is_namedtuple(type(value)):
+        fields = value._fields
+    elif isinstance(value, language.tuple):
+        fields = value.type.fields
+    else:
+        assert False, f"Unsupported type {type(value)}"
+
+    vals = [fn(v) for v in value]
+    vals = [constexpr(v) if v is None else v for v in vals]
+    types = [v.type for v in vals]
+    return language.tuple(vals, language.tuple_type(types, fields))
+
+
+def flatten_values_to_ir(values: Iterable[base_value]):
+    handles = []
+    for v in values:
+        v._flatten_ir(handles)
+    return handles
+
+
+def unflatten_ir_values(handles: List[ir.value], types: List[base_type]):
+    cursor = 0
+    for ty in types:
+        value, cursor = ty._unflatten_ir(handles, cursor)
+        yield value
+    assert cursor == len(handles)
+
+
+_condition_types = {bool, int, type(None)}  # Python types accepted for conditionals inside kernels
+
+
+def _clone_triton_value(val):
+    handles = []
+    val._flatten_ir(handles)
+    clone, _ = val.type._unflatten_ir(handles, 0)
+    return clone
+
+
+def _clone_scope(scope):
+    return {name: _clone_triton_value(val) if _is_triton_value(val) else val for name, val in scope.items()}
+
+
+class enter_sub_region:
+
+    def __init__(self, generator):
+        self.generator = generator
+
+    def __enter__(self):
+        # record lscope & local_defs in the parent scope
+        self.liveins = _clone_scope(self.generator.lscope)
+        self.prev_defs = _clone_scope(self.generator.local_defs)
+        self.generator.local_defs = {}
+        self.insert_block = self.generator.builder.get_insertion_block()
+        self.insert_point = self.generator.builder.get_insertion_point()
+        return self.liveins, self.insert_block
+
+    def __exit__(self, *args, **kwargs):
+        self.generator.builder.restore_insertion_point(self.insert_point)
+        self.generator.lscope = self.liveins
+        self.generator.local_defs = self.prev_defs
+
+
+# Check if the given syntax node has an "early" return
+class ContainsReturnChecker(ast.NodeVisitor):
+
+    def __init__(self, gscope):
+        self.gscope = gscope
+
+    def _visit_stmts(self, body) -> bool:
+        return any(self.visit(s) for s in body)
+
+    def _visit_function(self, fn) -> bool:
+        # No need to check within the function as it won't cause an early return.
+        # If the function itself has unstructured control flow we may not be able to inline it causing poor performance,
+        # we should check for this and emit a warning.
+        return False
+
+    def generic_visit(self, node) -> bool:
+        ret = False
+        for _, value in ast.iter_fields(node):
+            if isinstance(value, list):
+                for item in value:
+                    if isinstance(item, ast.AST):
+                        ret = ret or self.visit(item)
+            elif isinstance(value, ast.AST):
+                ret = ret or self.visit(value)
+        return ret
+
+    def visit_Attribute(self, node: ast.Attribute) -> bool:
+        # If the left part is a name, it's possible that
+        # we call triton native function or a jit function from another module.
+        # If the left part is not a name, it must return a tensor or a constexpr
+        # whose methods do not contain return statements
+        # e.g., (tl.load(x)).to(y)
+        # So we only check if the expressions within value have return or not
+        if isinstance(node.value, ast.Name):
+            if node.value.id in self.gscope:
+                value = self.gscope[node.value.id]
+                fn = getattr(value, node.attr)
+                return self._visit_function(fn)
+            return False
+        return self.visit(node.value)
+
+    def visit_Name(self, node: ast.Name) -> bool:
+        if type(node.ctx) is ast.Store:
+            return False
+        if node.id in self.gscope:
+            fn = self.gscope[node.id]
+            return self._visit_function(fn)
+        return False
+
+    def visit_Return(self, node: ast.Return) -> bool:
+        return True
+
+    def visit_Assign(self, node: ast.Assign) -> bool:
+        # There couldn't be an early return
+        # x = ...
+        return False
+
+    def visit_AugAssign(self, node: ast.AugAssign) -> bool:
+        # There couldn't be an early return
+        # x += ...
+        return False
+
+    def visit_Module(self, node: ast.Module) -> bool:
+        return self._visit_stmts(node.body)
+
+    def visit_FunctionDef(self, node: ast.FunctionDef) -> bool:
+        return self._visit_stmts(node.body)
+
+    def visit_If(self, node: ast.If) -> bool:
+        # TODO: optimize the following case in which we actually don't have
+        # a return when static_cond is false:
+        # if dynamic_cond
+        #   if static_cond
+        #     func_with_return
+        #   else
+        #     func_without_return
+        ret = self._visit_stmts(node.body)
+        if node.orelse:
+            ret = ret or self._visit_stmts(node.orelse)
+        return ret
+
+    def visit_IfExp(self, node: ast.IfExp) -> bool:
+        return self.visit(node.body) or self.visit(node.orelse)
+
+    def visit_Call(self, node: ast.Call) -> bool:
+        return self.visit(node.func)
+
+
+class ASTFunction:
+
+    def __init__(self, ret_types, arg_types, constants, attrs):
+        self.ret_types = ret_types
+        self.arg_types = arg_types
+        self.constants = constants
+        self.attrs = attrs
+
+    def flatten_ir_types(self, builder: ir.builder, types: List[base_type]) -> List[ir.type]:
+        ir_types = []
+        for ty in types:
+            if ty is None:
+                continue
+            ty._flatten_ir_types(builder, ir_types)
+        return ir_types
+
+    def return_types_ir(self, builder: ir.builder) -> List[ir.type]:
+        return self.flatten_ir_types(builder, self.ret_types)
+
+    def serialize(self, builder: ir.builder):
+        # fill up IR values in template
+        # > build function
+        is_val = lambda path, _: path not in self.constants and _ is not None
+        val_paths = list(find_paths_if(self.arg_types, is_val))
+        arg_types = [get_iterable_path(self.arg_types, path) for path in val_paths]
+        arg_types_ir = self.flatten_ir_types(builder, arg_types)
+        ret_types_ir = self.return_types_ir(builder)
+        return builder.get_function_ty(arg_types_ir, ret_types_ir)
+
+    def deserialize(self, fn):
+        # create "template"
+        def make_template(ty):
+            if isinstance(ty, (list, tuple, language.tuple_type)):
+                return language.tuple([make_template(x) for x in ty], ty)
+            return language.constexpr(None)
+
+        vals = make_template(self.arg_types)
+        is_val = lambda path, _: path not in self.constants and _ is not None
+        val_paths = list(find_paths_if(self.arg_types, is_val))
+        # > add IR values to the template
+        cursor = 0
+        handles = [fn.args(i) for i in range(fn.get_num_args())]
+        for path in val_paths:
+            ty = get_iterable_path(self.arg_types, path)
+            # > set attributes
+            attr_specs = self.attrs.get(path, [])
+            for attr_name, attr_val in attr_specs:
+                fn.set_arg_attr(cursor, attr_name, attr_val)
+            # > build frontend value
+            val, cursor = ty._unflatten_ir(handles, cursor)
+            set_iterable_path(vals, path, val)
+        # > add constexpr values to the template
+        constants = self.constants
+        for path, val in constants.items():
+            set_iterable_path(vals, path, language.constexpr(val))
+        return vals
+
+
+# Immutable pairing of a value with a JITFunction, using the same attribute
+# names (__self__ / __func__) that Python bound methods expose.
+@dataclass(frozen=True)
+class BoundJITMethod:
+    # the value the function is paired with
+    __self__: base_value
+    # the JIT function itself
+    __func__: JITFunction
+
+
+class CodeGenerator(ast.NodeVisitor):
+
+    def __init__(self, context, prototype, gscope, function_name, jit_fn: JITFunction, *, options, codegen_fns,
+                 module_map, is_gluon, module=None, is_kernel=False, function_types: Optional[Dict] = None,
+                 noinline=False, caller_context=None, file_name: Optional[str] = None, begin_line=0):
+        self.context = context
+        self.is_gluon = is_gluon
+        if is_gluon:
+            from triton.experimental.gluon.language._semantic import GluonSemantic
+            self.builder = gluon_ir.GluonOpBuilder(context)
+            self.semantic = GluonSemantic(self.builder)
+        else:
+            from triton.language.semantic import TritonSemantic
+            self.builder = ir.builder(context)
+            self.semantic = TritonSemantic(self.builder)
+
+        self.name_loc_as_prefix = None
+        self.file_name = file_name
+        # node.lineno starts from 1, so we need to subtract 1
+        self.begin_line = begin_line - 1
+        self.builder.set_loc(file_name, begin_line, 0)
+        self.builder.options = options
+        # dict of functions provided by the backend. Below are the list of possible functions:
+        # Convert custom types not natively supported on HW.
+        # convert_custom_types(input_tensor, dtype, fp_downcast_rounding=None, _builder=None)
+        self.builder.codegen_fns = codegen_fns
+        self.builder.module_map = {} if module_map is None else module_map
+        self.module = self.builder.create_module() if module is None else module
+        self.function_ret_types = {} if function_types is None else function_types
+        self.prototype = prototype
+
+        self.gscope = {}
+        for k, v in gscope.items():
+            if isinstance(v, ModuleType):
+                self.gscope[k] = module_map.get(v.__name__, v)
+                continue
+
+            module_name = getattr(v, "__module__", "")
+            if module_name in module_map:
+                self.gscope[k] = getattr(module_map[module_name], v.__name__)
+            else:
+                self.gscope[k] = v
+
+        self.lscope = {}
+        self.jit_fn = jit_fn
+        # TODO: we currently generate illegal names for non-kernel functions involving constexprs!
+        if is_kernel:
+            function_name = function_name[function_name.rfind('.') + 1:]
+            function_name = check_identifier_legality(function_name, "function")
+        self.function_name = function_name
+        self.is_kernel = is_kernel
+        self.cur_node = None
+        self.noinline = noinline
+        self.caller_context = caller_context
+        self.scf_stack = []
+        self.ret_type = None
+        # SSA-construction
+        # name => language.tensor
+        self.local_defs: Dict[str, tensor] = {}
+        self.dereference_name: Callable[[str], Any] = self._define_name_lookup()
+        self.fn = None
+        # Are we currently visiting an ast.arg's default value?  These have some
+        # special handling.
+        self.visiting_arg_default_value = False
+
+    builtin_namespace: Dict[str, Any] = {
+        _.__name__: _
+        for _ in (len, list, range, float, int, isinstance, getattr, hasattr)
+    }
+    builtin_namespace.update((
+        ('print', language.core.device_print),
+        ('min', language.core.builtin_min),
+        ('max', language.core.builtin_max),
+    ))
+
+    def _unsupported(self, node, message):
+        return UnsupportedLanguageConstruct(self.jit_fn.src, node, message)
+
+    def _is_constexpr_global(self, name):
+        absent_marker = object()
+        val = self.gscope.get(name, absent_marker)
+        if val is absent_marker:
+            return False
+
+        if _is_constexpr(val):
+            return True
+
+        return False
+
+    def _define_name_lookup(self):
+        """Return a ``name -> value`` resolver for this generator.
+
+        The resolver tries the local scope, then the global scope, then
+        ``builtin_namespace``, and raises NameError when none of them knows
+        the name.  The global lookup additionally raises NameError for a
+        global that exists but is not of an allowed kind (see the list below).
+        """
+
+        def local_lookup(name: str, absent):
+            # this needs to be re-fetched from `self` every time, because it gets switched occasionally
+            return self.lscope.get(name, absent)
+
+        def global_lookup(name: str, absent):
+            val = self.gscope.get(name, absent)
+            # The high-level rule is that only constexpr globals are allowed.
+            # But actually a bunch of other things, such as module imports, are
+            # technically Python globals. We have to allow these too!
+            # Note: every entry of the list is evaluated before any() runs;
+            # a missing name (val is absent) is handed back so the caller can
+            # fall through to the next lookup.
+            if any([
+                    val is absent,
+                    name in self.builtin_namespace,  #
+                    type(val) is ModuleType,  #
+                    isinstance(val, JITCallable),  #
+                    getattr(val, "__triton_builtin__", False),  #
+                    getattr(val, "__triton_aggregate__", False),  #
+                    getattr(val, "__module__", "").startswith("triton.language"),  #
+                    getattr(val, "__module__", "").startswith("triton.experimental.gluon.language"),  #
+                    isinstance(val, language.dtype),  #
+                    is_namedtuple(val),
+                    self._is_constexpr_global(name),  #
+                    # Allow accesses to globals while visiting an ast.arg
+                    # because you should be able to do
+                    #   @triton.jit def fn(x: tl.constexpr = GLOBAL): ...
+                    self.visiting_arg_default_value,  #
+                    knobs.compilation.allow_non_constexpr_globals,
+            ]):
+                return val
+            raise NameError(
+                textwrap.dedent(f"""\
+                Cannot access global variable {name} from within @jit'ed
+                function. Triton kernels can only access global variables that
+                are instanstiated as constexpr (`x = triton.language.constexpr(42)`). Note that this is different from
+                annotating a variable as constexpr (`x: triton.language.constexpr = 42`), which is not supported.  Alternatively, set the
+                envvar TRITON_ALLOW_NON_CONSTEXPR_GLOBALS=1, but we do not
+                promise to support this forever.""").replace("\n", " "))
+
+        # Unique sentinel: distinguishes "name not found" from a name bound to None.
+        absent_marker = object()
+
+        def name_lookup(name: str) -> Any:
+            absent = absent_marker
+            # Lookup order: locals, then globals, then builtin_namespace.
+            for lookup_function in local_lookup, global_lookup, self.builtin_namespace.get:
+                value = lookup_function(name, absent)
+                if value is not absent:
+                    return value
+            raise NameError(f'{name} is not defined')
+
+        return name_lookup
+
+    @contextlib.contextmanager
+    def _name_loc_prefix(self, prefix):
+        self.name_loc_as_prefix = prefix
+        yield
+        self.name_loc_as_prefix = None
+
+    def _maybe_set_loc_to_name(self, val, name):
+        if isinstance(val, (ir.value, ir.block_argument)):
+            val.set_loc(self.builder.create_name_loc(name, val.get_loc()))
+        elif _is_triton_value(val):
+            handles = []
+            val._flatten_ir(handles)
+            for handle in handles:
+                handle.set_loc(self.builder.create_name_loc(name, handle.get_loc()))
+
+    def set_value(self, name: str, value: Union[base_value, constexpr]) -> None:
+        ''' This function:
+            called by visit_Assign() & visit_FunctionDef() to store left value (lvalue)
+        1. record local defined name (FIXME: should consider control flow)
+        2. store tensor in self.lvalue
+        '''
+        self.lscope[name] = value
+        self.local_defs[name] = value
+
+    def _get_insertion_point_and_loc(self):
+        # XXX: this is a hack to get the location of the insertion point.
+        # The insertion point's location could be invalid sometimes,
+        # so we need to explicitly set the location
+        loc = self.builder.get_loc()
+        ip = self.builder.get_insertion_point()
+        return ip, loc
+
+    def _set_insertion_point_and_loc(self, ip, loc):
+        self.builder.restore_insertion_point(ip)
+        self.builder.set_loc(loc)
+
+    def _find_carries(self, node, liveins, ignore: set[str] = set()):
+        # create loop body block
+        block = self.builder.create_block()
+        self.builder.set_insertion_point_to_start(block)
+        # dry visit loop body
+        self.scf_stack.append(node)
+        self.visit_compound_statement(node.body)
+        self.scf_stack.pop()
+        block.erase()
+
+        # If a variable (name) has changed value within the loop, then it's
+        # a loop-carried variable. (The new and old value must be of the
+        # same type)
+        init_tys = []
+        init_handles = []
+        names = []
+
+        for name, live_val in liveins.items():
+            if name in ignore:
+                continue
+
+            if _is_triton_value(live_val):
+                loop_val = self.lscope[name]
+                self._verify_loop_carried_variable(name, loop_val, live_val)
+
+                live_handles = flatten_values_to_ir([live_val])
+                loop_handles = flatten_values_to_ir([loop_val])
+                if live_handles != loop_handles:
+                    names.append(name)
+                    init_tys.append(live_val.type)
+                    init_handles.extend(live_handles)
+            else:
+                assert name not in self.local_defs, f'Loop carried variable {name} is not a triton value'
+
+        # reset local scope to not pick up local defs from the dry run.
+        self.lscope = liveins.copy()
+        self.local_defs = {}
+
+        return names, init_handles, init_tys
+
+    #
+    # AST visitor
+    #
+    def visit_compound_statement(self, stmts):
+        # Ensure that stmts is iterable
+        if not _is_list_like(stmts):
+            stmts = [stmts]
+        for stmt in stmts:
+            self.visit(stmt)
+            # Stop parsing as soon as we hit a `return` statement; everything
+            # after this is dead code.
+            if isinstance(stmt, ast.Return):
+                break
+
+    def visit_Module(self, node):
+        """Visit a module node by generically visiting its child nodes."""
+        ast.NodeVisitor.generic_visit(self, node)
+
+    def visit_List(self, node):
+        ctx = self.visit(node.ctx)
+        assert ctx is None
+        elts = language.tuple([self.visit(elt) for elt in node.elts])
+        return elts
+
+    def visit_ListComp(self, node: ast.ListComp):
+        if len(node.generators) != 1:
+            raise ValueError("nested comprehensions are not supported")
+
+        comp = node.generators[0]
+        iter = self.visit(comp.iter)
+        if not isinstance(iter, tl_tuple):
+            raise NotImplementedError("only tuple comprehensions are supported")
+
+        results = []
+        for item in iter:
+            self.set_value(comp.target.id, item)
+            results.append(self.visit(node.elt))
+        return tl_tuple(results)
+
+    # By design, only non-kernel functions can return
+    def visit_Return(self, node):
+        ret_value = self.visit(node.value)
+        handles = []
+
+        def decay(value):
+            if isinstance(value, language.tuple):
+                return _apply_to_tuple_values(value, decay)
+            elif isinstance(value, (language.constexpr, int, float)):
+                return self.semantic.to_tensor(value)
+            return value
+
+        ret_value = decay(ret_value)
+
+        if ret_value is None:
+            ret_ty = language.void
+        else:
+            assert isinstance(ret_value, language.core.base_value)
+            ret_value._flatten_ir(handles)
+            ret_ty = ret_value.type
+        self.builder.ret(handles)
+        if self.ret_type is None:
+            self.ret_type = ret_ty
+        elif self.ret_type != ret_ty:
+            raise TypeError(f'Inconsistent return types: {self.ret_type} and {ret_ty}')
+
+        # A return op must always terminate the basic block, so we create a dead
+        # basic block in case there are any ops after the return.
+        post_ret_block = self.builder.create_block()
+        self.builder.set_insertion_point_to_end(post_ret_block)
+
+    def visit_FunctionDef(self, node):
+        """Generate the IR function for the function definition being compiled.
+
+        Steps, in order: evaluate argument default values into the local
+        scope, create the IR function and its entry block, bind the
+        deserialized prototype arguments to their names, visit the body, and
+        finally emit the closing `ret` and finalize the function.
+
+        Raises:
+            UnsupportedLanguageConstruct: if a function has already been
+                generated by this visitor (`self.fn` is set), i.e. the
+                definition is nested.
+        """
+        # NOTE: `kwarg_names` is not used in this method.
+        arg_names, kwarg_names = self.visit(node.args)
+        if self.fn:
+            raise self._unsupported(node, "nested function definition is not supported.")
+        # initialize defaults
+        # `defaults` aligns with the *last* positional arguments, so both
+        # lists are walked from the end.
+        for i, default_value in enumerate(node.args.defaults[::-1]):
+            arg_node = node.args.args[-i - 1]
+            annotation = arg_node.annotation
+            name = arg_node.arg
+            st_target = ast.Name(id=name, ctx=ast.Store())
+            # Synthesize `name = default` (or `name: annotation = default`)
+            # and visit it like a regular assignment.
+            if annotation is None:
+                init_node = ast.Assign(targets=[st_target], value=default_value)
+            else:
+                init_node = ast.AnnAssign(target=st_target, value=default_value, annotation=annotation)
+            try:
+                assert not self.visiting_arg_default_value
+                # The flag relaxes the global-access rules applied during
+                # name lookup while the default value is evaluated.
+                self.visiting_arg_default_value = True
+                self.visit(init_node)
+            finally:
+                self.visiting_arg_default_value = False
+
+        # initialize function
+        visibility = "public" if self.is_kernel else "private"
+        fn_ty = self.prototype.serialize(self.builder)
+        self.fn = self.builder.get_or_insert_function(self.module, self.function_name, fn_ty, visibility, self.noinline)
+        self.module.push_back(self.fn)
+        entry = self.fn.add_entry_block()
+        arg_values = self.prototype.deserialize(self.fn)
+        if self.caller_context is not None:
+            self.caller_context.initialize_callee(self.fn, self.builder)
+        # bind arguments to symbols
+        for arg_name, arg_value in zip(arg_names, arg_values):
+            self._maybe_set_loc_to_name(arg_value, arg_name)
+            self.set_value(arg_name, arg_value)
+        # Remember where the builder was inserting so it can be restored once
+        # this function is complete.
+        insert_pt = self.builder.get_insertion_block()
+        self.builder.set_insertion_point_to_start(entry)
+        # visit function body
+        self.visit_compound_statement(node.body)
+
+        # finalize function
+        assert not self.builder.get_insertion_block().has_terminator()
+        if self.ret_type is None or self.ret_type == language.void:
+            self.ret_type = language.void
+            self.builder.ret([])
+        else:
+            # The return type only becomes known while visiting the body, so
+            # the prototype and the function type are patched afterwards.
+            if isinstance(self.ret_type, language.tuple_type):
+                self.prototype.ret_types = self.ret_type.types
+            else:
+                self.prototype.ret_types = [self.ret_type]
+            self.fn.reset_type(self.prototype.serialize(self.builder))
+            # The block still being inserted into needs a terminator; it
+            # returns poison values of the return types.
+            self.builder.ret([self.builder.create_poison(ty) for ty in self.prototype.return_types_ir(self.builder)])
+        self.fn.finalize()
+
+        if insert_pt:
+            self.builder.set_insertion_point_to_end(insert_pt)
+
+    def visit_arguments(self, node):
+        arg_names = []
+        for arg in node.args:
+            arg_names += [self.visit(arg)]
+        kwarg_names = self.visit(node.kwarg)
+        return arg_names, kwarg_names
+
+    def visit_arg(self, node):
+        ast.NodeVisitor.generic_visit(self, node)
+        param = next(p for p in self.jit_fn.params if p.name == node.arg)
+        if param.is_constexpr and (param.do_not_specialize or param.do_not_specialize_on_alignment):
+            raise CompilationError(
+                self.jit_fn.src, node,
+                f"{node.arg} marked as constexpr and listed in do_not_specialize/do_not_specialize_on_alignment. "
+                "Remove constexpr designation to skip specialization.")
+        return node.arg
+
+    def visit_AnnAssign(self, node):
+        # extract attributes
+        annotation = self.visit(node.annotation)
+        target = self.visit(node.target)
+        value = self.visit(node.value)
+        # constexpr
+        if annotation == constexpr:
+            if target in self.lscope:
+                raise ValueError(f'{target} is already defined.'
+                                 f' constexpr cannot be reassigned.')
+            value = constexpr(value)
+            self.lscope[target] = value
+            return self.lscope[target]
+        # default: call visit_Assign
+        return self.visit_Assign(node)
+
+    def assignTarget(self, target, value):
+        assert isinstance(target.ctx, ast.Store)
+        if isinstance(target, ast.Subscript):
+            return self.visit_Subscript_Store(target, value)
+        if isinstance(target, ast.Tuple):
+            for i, target in enumerate(target.elts):
+                self.assignTarget(target, value.values[i])
+            return
+        if isinstance(target, ast.Attribute):
+            raise NotImplementedError("Attribute assignment is not supported in triton")
+        assert isinstance(target, ast.Name)
+        self.set_value(self.visit(target), value)
+
+    def visit_Assign(self, node):
+        # construct values to assign
+        def _sanitize_value(value):
+            if isinstance(value, language.tuple):
+                return _apply_to_tuple_values(value, _sanitize_value)
+            native_nontensor_types = (language.dtype, language.tuple)
+            value = _unwrap_if_constexpr(value)
+            if value is not None and \
+                not _is_triton_value(value) and \
+                not isinstance(value, native_nontensor_types):
+                value = self.semantic.to_tensor(value)
+            return value
+
+        targets = [node.target] if isinstance(node, ast.AnnAssign) else node.targets
+        assert len(targets) == 1
+        target = targets[0]
+        if isinstance(target, ast.Name):
+            with self._name_loc_prefix(target.id):
+                values = _sanitize_value(self.visit(node.value))
+        else:
+            values = _sanitize_value(self.visit(node.value))
+        self.assignTarget(target, values)
+
+    def visit_AugAssign(self, node):
+        lhs = copy.deepcopy(node.target)
+        lhs.ctx = ast.Load()
+        rhs = ast.BinOp(lhs, node.op, node.value)
+        assign = ast.Assign(targets=[node.target], value=rhs)
+        for x in ['lineno', 'col_offset', 'end_lineno', 'end_col_offset']:
+            if hasattr(node, x):
+                y = getattr(node, x)
+                setattr(rhs, x, y)
+                setattr(assign, x, y)
+        self.visit(assign)
+        return self.visit(lhs)
+
+    def visit_Name(self, node):
+        if type(node.ctx) is ast.Store:
+            return node.id
+        return self.dereference_name(node.id)
+
+    def visit_Store(self, node):
+        """Visit a store-context node by generically visiting its children."""
+        ast.NodeVisitor.generic_visit(self, node)
+
+    def visit_Load(self, node):
+        """Visit a load-context node by generically visiting its children."""
+        ast.NodeVisitor.generic_visit(self, node)
+
+    def visit_Tuple(self, node):
+        args = [self.visit(x) for x in node.elts]
+        return language.tuple(args)
+
+    def _apply_binary_method(self, node, method_name, lhs, rhs):
+        # TODO: raise something meaningful if getattr fails below, esp for reverse method
+        if _is_triton_tensor(lhs):
+            return getattr(lhs, method_name)(rhs, _semantic=self.semantic)
+        if _is_triton_tensor(rhs):
+            reverse_method_name = re.sub(r"__(.*)__", r"__r\1__", method_name)
+            return getattr(rhs, reverse_method_name)(lhs, _semantic=self.semantic)
+        if not isinstance(lhs, (constexpr, language.tuple)) and isinstance(rhs, constexpr):
+            lhs = constexpr(lhs)
+        if isinstance(lhs, constexpr):
+            fn = getattr(lhs, method_name)
+        else:
+            fn = self.get_Attribute(lhs, method_name)
+        return self.call_Function(node, fn, [rhs], {})
+
+    def visit_BinOp(self, node):
+        lhs = self.visit(node.left)
+        rhs = self.visit(node.right)
+        method_name = self._method_name_for_bin_op.get(type(node.op))
+        if method_name is None:
+            raise self._unsupported(node,
+                                    "AST binary operator '{}' is not (currently) implemented.".format(node.op.__name__))
+        return self._apply_binary_method(node, method_name, lhs, rhs)
+
+    # Maps each supported `ast` binary-operator node type to the name of the
+    # dunder method that implements it. `visit_BinOp` looks operators up here
+    # and reports any operator without an entry as unsupported.
+    _method_name_for_bin_op: Dict[Type[ast.operator], str] = {
+        ast.Add: '__add__',
+        ast.Sub: '__sub__',
+        ast.Mult: '__mul__',
+        ast.Div: '__truediv__',
+        ast.FloorDiv: '__floordiv__',
+        ast.Mod: '__mod__',
+        ast.Pow: '__pow__',
+        ast.LShift: '__lshift__',
+        ast.RShift: '__rshift__',
+        ast.BitAnd: '__and__',
+        ast.BitOr: '__or__',
+        ast.BitXor: '__xor__',
+    }
+
+    def visit_then_else_blocks(self, node, liveins, then_block, else_block):
+        """Visit both branches of an `if` and work out which names need merging.
+
+        Args:
+            node: the `if` AST node; `node.body` and `node.orelse` are visited.
+            liveins: name -> value mapping captured before the conditional.
+            then_block: block that receives the `then` branch.
+            else_block: block that receives the `else` branch; it is only
+                used as an insertion target when `node.orelse` is non-empty.
+
+        Returns:
+            `(then_defs, else_defs, then_block, else_block, names)`.
+            `then_block`/`else_block` are the builder's insertion blocks at
+            the end of each visited branch (`else_block` is returned as
+            passed in when there is no else branch). `names` lists the
+            variables whose values differ between the branches, or that both
+            branches define, and `then_defs`/`else_defs` hold each branch's
+            value for those names.
+
+        Raises:
+            AssertionError: if a merged name has mismatching types.
+        """
+        # then block
+        # NOTE(review): the then branch is visited with whatever `lscope` /
+        # `local_defs` are current on entry; presumably the caller's
+        # `enter_sub_region` prepared them - confirm.
+        self.builder.set_insertion_point_to_start(then_block)
+        self.visit_compound_statement(node.body)
+        then_block = self.builder.get_insertion_block()
+        then_defs = self.local_defs.copy()
+        then_vals = self.lscope.copy()
+        # else block
+        # Without an else branch, the "else" values are simply the live-ins.
+        else_defs = {}
+        else_vals = liveins.copy()
+        if node.orelse:
+            self.builder.set_insertion_point_to_start(else_block)
+            # Start the else branch from the pre-`if` state so that nothing
+            # defined in the then branch is visible in it.
+            self.lscope = liveins.copy()
+            self.local_defs = {}
+            self.visit_compound_statement(node.orelse)
+            else_defs = self.local_defs.copy()
+            else_block = self.builder.get_insertion_block()
+            else_vals = self.lscope.copy()
+
+        # update block arguments
+        names = []
+        # variables in livein whose value is updated in `if`
+        for name, value in liveins.items():
+            # livein variable changed value in either then or else
+            if not _is_triton_value(value):
+                continue
+            then_handles = flatten_values_to_ir([then_vals[name]])
+            else_handles = flatten_values_to_ir([else_vals[name]])
+            # Identical IR handles on both paths: nothing to merge.
+            if then_handles == else_handles:
+                continue
+            names.append(name)
+            then_defs[name] = then_vals[name]
+            else_defs[name] = else_vals[name]
+            # check type
+            for defs, block_name in [(then_defs, 'then'), (else_defs, 'else')]:
+                type_equal = type(defs[name]) == type(value)  # noqa: E721
+                assert type_equal and defs[name].type == value.type, \
+                    f'initial value for `{name}` is of type {value}, '\
+                    f'but the {block_name} block redefines it as {defs[name]}'
+
+        # variables that are both in then and else but not in liveins
+        # TODO: could probably be cleaned up
+        # Sorting keeps the order of the appended names deterministic.
+        for name in sorted(then_defs.keys() & else_defs.keys()):
+            if name in names:
+                continue
+            then_val = then_defs[name]
+            then_ty = then_val.type
+            else_val = else_defs[name]
+            else_ty = else_val.type
+            type_equal = type(then_val) == type(else_val)  # noqa: E721
+            assert type_equal and then_ty == else_ty, \
+                f'Mismatched type for {name} between then block ({then_ty}) '\
+                f'and else block ({else_ty})'
+            names.append(name)
+
+        return then_defs, else_defs, then_block, else_block, names
+
+    def visit_if_top_level(self, cond, node):
+        """Lower an `if` using explicit blocks and branches.
+
+        `visit_If` selects this path when the statement contains a `return`.
+        A conditional branch on `cond` targets a then and an else block,
+        both of which branch to a common end block. The values that differ
+        between the two paths are passed as arguments of that end block and
+        rebound in the local scope.
+
+        Args:
+            cond: condition value; its `handle` feeds the conditional branch.
+            node: the `if` AST node.
+        """
+        # NOTE(review): `enter_sub_region` is defined outside this view; it
+        # yields the live-in values and the current insertion block and
+        # presumably restores the scope on exit - confirm.
+        with enter_sub_region(self) as sr:
+            liveins, ip_block = sr
+            then_block = self.builder.create_block()
+            else_block = self.builder.create_block()
+            # create branch
+            self.builder.set_insertion_point_to_end(ip_block)
+            self.builder.create_cond_branch(cond.handle, then_block, else_block)
+            # visit then and else blocks
+            then_defs, else_defs, then_block, else_block, names = \
+                self.visit_then_else_blocks(node, liveins, then_block, else_block)
+            # create basic-block after conditional
+            endif_block = self.builder.create_block()
+            # then terminator
+            self.builder.set_insertion_point_to_end(then_block)
+            assert not then_block.has_terminator(), f"{then_block}"
+            then_handles = flatten_values_to_ir(then_defs[name] for name in names)
+            self.builder.create_branch(endif_block, then_handles)
+            # else terminator
+            self.builder.set_insertion_point_to_end(else_block)
+            assert not else_block.has_terminator(), f"{else_block}"
+            else_handles = flatten_values_to_ir(else_defs[name] for name in names)
+            self.builder.create_branch(endif_block, else_handles)
+            assert len(then_handles) == len(else_handles)
+            # One end-block argument per merged IR value; both paths must
+            # agree on its type.
+            for then_h, else_h in zip(then_handles, else_handles):
+                ty = then_h.get_type()
+                assert ty == else_h.get_type()
+                endif_block.add_argument(ty)
+
+        # change block
+        self.builder.set_insertion_point_to_start(endif_block)
+        # update value
+        # Rebuild front-end values from the end block's arguments and rebind
+        # the merged names to them.
+        res_handles = [endif_block.arg(i) for i in range(len(then_handles))]
+        types = [then_defs[name].type for name in names]
+        new_values = unflatten_ir_values(res_handles, types)
+        for name, new_value in zip(names, new_values):
+            self.set_value(name, new_value)
+
+    # TODO: refactor
+    def visit_if_scf(self, cond, node):
+        """Lower an `if` to a single structured if-op.
+
+        `visit_If` selects this path when the statement contains no
+        `return`. Both branches are first generated into standalone blocks,
+        then merged into the regions of an if-op created at the original
+        insertion point. Values that differ between the branches are yielded
+        from both regions and rebound to the results of the if-op.
+
+        Args:
+            cond: condition value; its `handle` is the if-op's condition.
+            node: the `if` AST node.
+        """
+        with enter_sub_region(self) as sr:
+            liveins, _ = sr
+            # Remember where the if-op has to be created once both branches
+            # have been generated.
+            ip, last_loc = self._get_insertion_point_and_loc()
+            then_block = self.builder.create_block()
+            else_block = self.builder.create_block() if node.orelse else None
+            then_defs, else_defs, then_block, else_block, names = \
+                self.visit_then_else_blocks(node, liveins, then_block, else_block)
+            # create if op
+            then_handles = flatten_values_to_ir(then_defs[name] for name in names)
+            # NOTE(review): `names` and `then_handles` are zipped one-to-one
+            # here; they only line up if every merged value flattens to a
+            # single handle - confirm.
+            for name, val in zip(names, then_handles):
+                self._maybe_set_loc_to_name(val, name)
+            self._set_insertion_point_and_loc(ip, last_loc)
+            # NOTE(review): the meaning of the trailing `True` argument is
+            # not visible here - confirm against the builder API.
+            if_op = self.builder.create_if_op([h.get_type() for h in then_handles], cond.handle, True)
+            then_block.merge_block_before(if_op.get_then_block())
+            self.builder.set_insertion_point_to_end(if_op.get_then_block())
+            if len(names) > 0:
+                self.builder.create_yield_op(then_handles)
+            # Without an else branch the op's own else block is used as-is.
+            if not node.orelse:
+                else_block = if_op.get_else_block()
+            else:
+                else_block.merge_block_before(if_op.get_else_block())
+            self.builder.set_insertion_point_to_end(if_op.get_else_block())
+            if len(names) > 0:
+                else_handles = flatten_values_to_ir(else_defs[name] for name in names)
+                for name, val in zip(names, else_handles):
+                    self._maybe_set_loc_to_name(val, name)
+                self.builder.create_yield_op(else_handles)
+        # update values
+        res_handles = [if_op.get_result(i) for i in range(len(then_handles))]
+        types = [then_defs[name].type for name in names]
+        new_values = unflatten_ir_values(res_handles, types)
+        for name, new_value in zip(names, new_values):
+            self.set_value(name, new_value)
+
+    def visit_If(self, node):
+        """Lower an `if` statement.
+
+        A triton-tensor condition produces control flow in the generated
+        code: via `visit_if_top_level` when the statement contains a
+        `return`, otherwise via `visit_if_scf`. Any other condition is
+        evaluated right here and only the selected branch is visited.
+
+        Raises:
+            UnsupportedLanguageConstruct: for a non-scalar tensor condition,
+                for a `return` inside an enclosing loop, or for a
+                non-tensor condition whose exact type is not in
+                `_condition_types`.
+        """
+        cond = self.visit(node.test)
+
+        if _is_triton_tensor(cond):
+            if _is_non_scalar_tensor(cond):
+                raise self._unsupported(node, "Boolean value of Tensor with more than one value is ambiguous")
+            # A block-typed condition that passed the check above is
+            # unsplatted after warning the user.
+            if cond.type.is_block():
+                warnings.warn(
+                    "If conditional called with multidimensional Tensor instead of scalar; please use \"if (%s).item()\" instead"
+                    % ast.unparse(node.test))
+                cond = language.core._unsplat(cond, _semantic=self.semantic, _generator=self)
+            cond = cond.to(language.int1, _semantic=self.semantic)
+            if ContainsReturnChecker(self.gscope).visit(node):
+                # A non-empty `scf_stack` means we are inside a loop body.
+                if self.scf_stack:
+                    raise self._unsupported(
+                        node, "Cannot have `return` statements inside `while` or `for` statements in triton.")
+                self.visit_if_top_level(cond, node)
+            else:
+                self.visit_if_scf(cond, node)
+        else:
+            cond = _unwrap_if_constexpr(cond)
+            # not isinstance - we insist the real thing, no subclasses and no ducks
+            if type(cond) not in _condition_types:
+                raise self._unsupported(
+                    node, "`if` conditionals can only accept values of type {{{}}}, not objects of type {}".format(
+                        ', '.join(_.__name__ for _ in _condition_types),
+                        type(cond).__name__))
+
+            # The condition is known now, so only the taken branch is
+            # visited; the other one generates no code.
+            active_block = node.body if cond else node.orelse
+            self.visit_compound_statement(active_block)
+
+    def visit_IfExp(self, node):
+        """Lower a conditional expression `body if test else orelse`.
+
+        With a triton-tensor condition both operands are generated into
+        separate blocks, converted to tensors and merged into an if-op whose
+        single result is returned (`None` when the result type is void).
+        Any other condition is evaluated right here and only the selected
+        operand is visited and returned.
+
+        Raises:
+            AssertionError: if the operand types differ under a tensor
+                condition.
+            UnsupportedLanguageConstruct: if a non-tensor condition has an
+                exact type that is not in `_condition_types`.
+        """
+        cond = self.visit(node.test)
+        if _is_triton_tensor(cond):
+            cond = cond.to(language.int1, _semantic=self.semantic)
+            # TODO: Deal w/ more complicated return types (e.g tuple)
+            with enter_sub_region(self):
+                # Remember where the if-op has to be created once both
+                # operands have been generated.
+                ip, last_loc = self._get_insertion_point_and_loc()
+
+                then_block = self.builder.create_block()
+                self.builder.set_insertion_point_to_start(then_block)
+                then_val = self.semantic.to_tensor(self.visit(node.body))
+                # Re-read the insertion block, since visiting the operand
+                # may have moved it.
+                then_block = self.builder.get_insertion_block()
+
+                else_block = self.builder.create_block()
+                self.builder.set_insertion_point_to_start(else_block)
+                # do not need to reset lscope since
+                # ternary expressions cannot define new variables
+                else_val = self.semantic.to_tensor(self.visit(node.orelse))
+                else_block = self.builder.get_insertion_block()
+
+                self._set_insertion_point_and_loc(ip, last_loc)
+
+                assert then_val.type == else_val.type, \
+                    f'Ternary expression with dynamic condition has inconsistent types {then_val.type} and {else_val.type}'
+                ret_type = then_val.type
+
+                # A void result yields nothing and the if-op has no results.
+                ret_type_ir = [ret_type.to_ir(self.builder)] if ret_type != language.void else []
+                if_op = self.builder.create_if_op(ret_type_ir, cond.handle, True)
+                then_block.merge_block_before(if_op.get_then_block())
+                if ret_type_ir:
+                    self.builder.set_insertion_point_to_end(if_op.get_then_block())
+                    self.builder.create_yield_op([then_val.handle])
+
+                self.builder.set_insertion_point_to_end(if_op.get_then_block())
+                else_block.merge_block_before(if_op.get_else_block())
+                if ret_type_ir:
+                    self.builder.set_insertion_point_to_end(if_op.get_else_block())
+                    self.builder.create_yield_op([else_val.handle])
+                return language.core.tensor(if_op.get_result(0), ret_type) if ret_type_ir else None
+        else:
+            cond = _unwrap_if_constexpr(cond)
+
+            # not isinstance - we insist the real thing, no subclasses and no ducks
+            if type(cond) not in _condition_types:
+                raise self._unsupported(
+                    node, "`if` conditionals can only accept values of type {{{}}}, not objects of type {}".format(
+                        ', '.join(_.__name__ for _ in _condition_types),
+                        type(cond).__name__))
+            if cond:
+                return self.visit(node.body)
+            else:
+                return self.visit(node.orelse)
+
+    def visit_With(self, node):
+        # Lower `with` statements by constructing context managers and calling their enter/exit hooks
+        # Instantiate each context manager with builder injection
+        cm_list = []
+        for item in node.items:
+            call = item.context_expr
+            fn = self.visit(call.func)
+            args = [self.visit(arg) for arg in call.args]
+            kws = dict(self.visit(kw) for kw in call.keywords)
+            cm = fn(*args, _semantic=self.semantic, **kws)
+            cm_list.append(cm)
+        for cm, item in zip(cm_list, node.items):
+            res = cm.__enter__()
+            if item.optional_vars is not None:
+                var_name = self.visit(item.optional_vars)
+                self.set_value(var_name, res)
+        if ContainsReturnChecker(self.gscope).visit(node):
+            raise self._unsupported(node, "Cannot have `return` statements inside `with` statements in triton ")
+        self.visit_compound_statement(node.body)
+        for cm in reversed(cm_list):
+            cm.__exit__(None, None, None)
+
+    def visit_Pass(self, node):
+        """A `pass` statement generates nothing."""
+        pass
+
+    def visit_Compare(self, node):
+        if not (len(node.comparators) == 1 and len(node.ops) == 1):
+            raise self._unsupported(node, "simultaneous multiple comparison is not supported")
+        lhs = self.visit(node.left)
+        rhs = self.visit(node.comparators[0])
+        lhs_value = _unwrap_if_constexpr(lhs)
+        rhs_value = _unwrap_if_constexpr(rhs)
+        if type(node.ops[0]) is ast.Is:
+            return constexpr(lhs_value is rhs_value)
+        if type(node.ops[0]) is ast.IsNot:
+            return constexpr(lhs_value is not rhs_value)
+        method_name = self._method_name_for_comp_op.get(type(node.ops[0]))
+        if method_name is None:
+            raise self._unsupported(
+                node, "AST comparison operator '{}' is not (currently) implemented.".format(node.ops[0].__name__))
+        return self._apply_binary_method(node, method_name, lhs, rhs)
+
+    _method_name_for_comp_op: Dict[Type[ast.cmpop], str] = {
+        ast.Eq: '__eq__', ast.NotEq: '__ne__', ast.Lt: '__lt__', ast.LtE: '__le__', ast.Gt: '__gt__', ast.GtE: '__ge__'
+    }
+
+    def visit_UnaryOp(self, node):
+        operand = self.visit(node.operand)
+        fn = self._method_name_for_unary_op.get(type(node.op))
+        if fn is None:
+            raise self._unsupported(node, f"AST unary operator '{node.op.__name__}' is not (currently) implemented.")
+        if _is_triton_tensor(operand):
+            return getattr(operand, fn)(_semantic=self.semantic)
+        try:
+            return getattr(operand, fn)()
+        except AttributeError:
+            if fn == "__not__":
+                return constexpr(not operand)
+            raise self._unsupported(
+                node, f"AST unary operator '{fn}' is not (currently) implemented on type {type(operand).__name__}")
+
+    _method_name_for_unary_op: Dict[Type[ast.unaryop], str] = {
+        ast.USub: '__neg__', ast.UAdd: '__pos__', ast.Not: '__not__', ast.Invert: '__invert__'
+    }
+
+    def _verify_loop_carried_variable(self, name, loop_val, live_val):
+        assert _is_triton_value(loop_val), f'cannot reassign constexpr {name} in the loop'
+        assert _is_triton_value(live_val), f'cannot reassign constexpr {name} in the loop'
+        assert type(loop_val) is type(live_val), (
+            f'Loop carried variable {name} changed type, was {type(loop_val)} but is now {type(live_val)}')
+        assert not _is_triton_tensor(loop_val) or loop_val.type == live_val.type, \
+            f'Loop-carried variable {name} has initial type {live_val.type} '\
+            f'but is re-assigned to {loop_val.type} in loop! '\
+            f'Please make sure that the type stays consistent.'
+
+    def visit_While(self, node):
+        with enter_sub_region(self) as sr:
+            liveins, insert_block = sr
+            ip, last_loc = self._get_insertion_point_and_loc()
+
+            names, init_handles, init_fe_tys = self._find_carries(node, liveins)
+
+            init_tys = [h.get_type() for h in init_handles]
+            self._set_insertion_point_and_loc(ip, last_loc)
+            while_op = self.builder.create_while_op(init_tys, init_handles)
+            # merge the condition region
+            before_block = self.builder.create_block_with_parent(while_op.get_before(), init_tys)
+            self.builder.set_insertion_point_to_start(before_block)
+            block_args = [before_block.arg(i) for i in range(len(init_handles))]
+            condition_args = unflatten_ir_values(block_args, init_fe_tys)
+            for name, val in zip(names, condition_args):
+                self.lscope[name] = val
+                self.local_defs[name] = val
+                self._maybe_set_loc_to_name(val, name)
+            cond = self.visit(node.test)
+            if isinstance(cond, language.condition):
+                if cond.disable_licm:
+                    while_op.set_attr("llvm.loop_annotation", self.builder.get_disable_loop_licm_attr())
+                cond = cond.condition
+            self.builder.set_insertion_point_to_end(before_block)
+            # create ConditionOp: e.g., scf.condition(%cond) %arg0, %arg1, ...
+            self.builder.create_condition_op(cond.handle, block_args)
+            # merge the loop body
+            after_block = self.builder.create_block_with_parent(while_op.get_after(), init_tys)
+
+            # generate loop body
+            self.builder.set_insertion_point_to_start(after_block)
+            body_handles = [after_block.arg(i) for i in range(len(init_handles))]
+            body_args = unflatten_ir_values(body_handles, init_fe_tys)
+            for name, val in zip(names, body_args):
+                self.lscope[name] = val
+                self.local_defs[name] = val
+                self._maybe_set_loc_to_name(val, name)
+            self.scf_stack.append(node)
+            self.visit_compound_statement(node.body)
+            self.scf_stack.pop()
+
+            yield_handles = flatten_values_to_ir(self.lscope[name] for name in names)
+            self.builder.create_yield_op(yield_handles)
+
+        # WhileOp defines new values, update the symbol table (lscope, local_defs)
+        result_handles = [while_op.get_result(i) for i in range(len(init_handles))]
+        result_vals = unflatten_ir_values(result_handles, init_fe_tys)
+        for name, new_def in zip(names, result_vals):
+            self.lscope[name] = new_def
+            self.local_defs[name] = new_def
+            self._maybe_set_loc_to_name(new_def, name)
+
+        for stmt in node.orelse:
+            assert False, "Not implemented"
+            ast.NodeVisitor.generic_visit(self, stmt)
+
+    def visit_Subscript_Load(self, node):
+        assert isinstance(node.ctx, ast.Load)
+        lhs = self.visit(node.value)
+        slices = self.visit(node.slice)
+        if _is_triton_value(lhs):
+            return self.call_Method(node, lhs.__getitem__, lhs, [slices], {})
+        return lhs[slices]
+
+    def visit_Subscript_Store(self, node, value):  # subscript assignment is always rejected; both arguments are unused
+        raise NotImplementedError("__setitem__ is not supported in triton")
+
+    def visit_Subscript(self, node):  # every subscript expression is handled as a load
+        return self.visit_Subscript_Load(node)
+
+    def visit_ExtSlice(self, node):
+        return [self.visit(dim) for dim in node.dims]
+
+    def visit_For(self, node):
+        IteratorClass = self.visit(node.iter.func)
+        iter_args = [self.visit(arg) for arg in node.iter.args]
+        iter_kwargs = dict(self.visit(keyword) for keyword in node.iter.keywords)
+        if IteratorClass == language.static_range:
+            iterator = IteratorClass(*iter_args, **iter_kwargs)
+            static_range = range(iterator.start.value, iterator.end.value, iterator.step.value)
+            for i in static_range:
+                self.lscope[node.target.id] = constexpr(i)
+                self.visit_compound_statement(node.body)
+                for stmt in node.orelse:
+                    ast.NodeVisitor.generic_visit(self, stmt)
+            return
+        num_stages = None
+        loop_unroll_factor = None
+        disallow_acc_multi_buffer = False
+        flatten = False
+        warp_specialize = False
+        disable_licm = False
+        if IteratorClass is language.range:
+            iterator = IteratorClass(*iter_args, **iter_kwargs)
+            # visit iterator arguments
+            # note: only `range` iterator is supported now
+            # collect lower bound (lb), upper bound (ub), and step
+            lb = iterator.start
+            ub = iterator.end
+            step = iterator.step
+            num_stages = iterator.num_stages
+            loop_unroll_factor = iterator.loop_unroll_factor
+            disallow_acc_multi_buffer = iterator.disallow_acc_multi_buffer
+            flatten = iterator.flatten
+            warp_specialize = iterator.warp_specialize
+            disable_licm = iterator.disable_licm
+        elif IteratorClass is range:
+            # visit iterator arguments
+            # note: only `range` iterator is supported now
+            # collect lower bound (lb), upper bound (ub), and step
+            lb = iter_args[0] if len(iter_args) > 1 else self.visit(ast.Constant(0))
+            ub = iter_args[1] if len(iter_args) > 1 else self.visit(node.iter.args[0])
+            step = iter_args[2] if len(iter_args) > 2 else self.visit(ast.Constant(1))
+        else:
+            raise RuntimeError('Only `range` and `static_range` iterators are currently supported')
+        # handle negative constant step (not supported by scf.for in MLIR)
+        negative_step = False
+        if _is_constexpr(step) and step.value < 0:
+            step = constexpr(-step.value)
+            negative_step = True
+            lb, ub = ub, lb
+        lb = self.semantic.to_tensor(lb)
+        ub = self.semantic.to_tensor(ub)
+        step = self.semantic.to_tensor(step)
+        # induction variable type
+        if not lb.dtype.is_int() or not ub.dtype.is_int() or not step.dtype.is_int():
+            raise TypeError(f"For loop bounds and step must all be ints, are ({lb.dtype}, {ub.dtype}, {step.dtype})")
+        if _is_non_scalar_tensor(lb):
+            raise TypeError(f"For lower bound must be a scalar, got {lb.type}")
+        if _is_non_scalar_tensor(ub):
+            raise TypeError(f"For upper bound must be a scalar, got {ub.type}")
+        if _is_non_scalar_tensor(step):
+            raise TypeError(f"For step must be a scalar, got {step.type}")
+        iv_type = self.semantic.integer_promote_impl(lb.dtype, ub.dtype)
+        iv_type = self.semantic.integer_promote_impl(iv_type, step.dtype)
+        iv_ir_type = iv_type.to_ir(self.builder)
+        iv_is_signed = iv_type.int_signedness == language.core.dtype.SIGNEDNESS.SIGNED
+        # lb/ub/step might be constexpr, we need to cast them to tensor
+        lb = lb.handle
+        ub = ub.handle
+        step = step.handle
+        # ForOp can only accept IndexType as lb/ub/step. Cast integer to Index
+        lb = self.builder.create_int_cast(lb, iv_ir_type, iv_is_signed)
+        ub = self.builder.create_int_cast(ub, iv_ir_type, iv_is_signed)
+        step = self.builder.create_int_cast(step, iv_ir_type, iv_is_signed)
+        # Create placeholder for the loop induction variable
+        iv_placeholder = self.builder.create_poison(iv_ir_type)
+        self.set_value(node.target.id, language.core.tensor(iv_placeholder, iv_type))
+
+        with enter_sub_region(self) as sr:
+            liveins, insert_block = sr
+            ip, last_loc = self._get_insertion_point_and_loc()
+
+            names, init_handles, init_tys = self._find_carries(node, liveins, ignore={node.target.id})
+
+            # create ForOp
+            self._set_insertion_point_and_loc(ip, last_loc)
+            for_op = self.builder.create_for_op(lb, ub, step, init_handles)
+            if _unwrap_if_constexpr(num_stages) is not None:
+                for_op.set_attr("tt.num_stages", self.builder.get_int32_attr(num_stages))
+            if _unwrap_if_constexpr(loop_unroll_factor) is not None:
+                for_op.set_attr("tt.loop_unroll_factor", self.builder.get_int32_attr(loop_unroll_factor))
+            if disallow_acc_multi_buffer:
+                for_op.set_attr("tt.disallow_acc_multi_buffer", self.builder.get_unit_attr())
+            if flatten:
+                for_op.set_attr("tt.flatten", self.builder.get_unit_attr())
+            if warp_specialize:
+                for_op.set_attr("tt.warp_specialize", self.builder.get_unit_attr())
+            if disable_licm:
+                for_op.set_attr("llvm.loop_annotation", self.builder.get_disable_loop_licm_attr())
+
+            self.scf_stack.append(node)
+            for_op_body = for_op.get_body(0)
+            self.builder.set_insertion_point_to_start(for_op_body)
+            block_handles = [for_op_body.arg(i + 1) for i in range(len(init_handles))]
+            block_args = unflatten_ir_values(block_handles, init_tys)
+            for name, val in zip(names, block_args):
+                self._maybe_set_loc_to_name(val, name)
+                self.set_value(name, val)
+            self.visit_compound_statement(node.body)
+            self.scf_stack.pop()
+            yield_handles = flatten_values_to_ir(self.lscope[name] for name in names)
+
+            # create YieldOp
+            if len(yield_handles) > 0:
+                self.builder.create_yield_op(yield_handles)
+            for_op_region = for_op_body.get_parent()
+            assert for_op_region.size() == 1, "We use SCF, so the loop body should only have one block"
+
+            # update induction variable with actual value, and replace all uses
+            self.builder.set_insertion_point_to_start(for_op_body)
+            iv = for_op.get_induction_var()
+            if negative_step:
+                iv = self.builder.create_sub(ub, iv)
+                iv = self.builder.create_add(iv, lb)
+            iv_placeholder.replace_all_uses_with(iv)
+            self.set_value(node.target.id, language.core.tensor(iv, iv_type))
+            self._maybe_set_loc_to_name(iv, node.target.id)
+
+        # update lscope & local_defs (ForOp defines new values)
+        result_handles = [for_op.get_result(i) for i in range(len(init_handles))]
+        result_values = unflatten_ir_values(result_handles, init_tys)
+        for name, val in zip(names, result_values):
+            self.set_value(name, val)
+            self._maybe_set_loc_to_name(val, name)
+
+        for stmt in node.orelse:
+            assert False, "Don't know what to do with else after for"
+            ast.NodeVisitor.generic_visit(self, stmt)
+
+    def visit_Slice(self, node):
+        lower = self.visit(node.lower)
+        upper = self.visit(node.upper)
+        step = self.visit(node.step)
+        return language.slice(lower, upper, step)
+
+    def visit_Index(self, node):  # unwrap the node: the result is whatever its inner `value` evaluates to
+        return self.visit(node.value)
+
+    def visit_keyword(self, node) -> Tuple[str, Any]:  # keyword argument -> (name, evaluated value) pair
+        return node.arg, self.visit(node.value)  # visit_For and visit_Call feed these pairs straight into dict(...)
+
+    def visit_Assert(self, node) -> Any:
+        test = self.visit(node.test)
+        msg = self.visit(node.msg) if node.msg is not None else ""
+        return language.core.device_assert(test, msg, _semantic=self.semantic)
+
+    def call_JitFunction(self, fn: JITFunction, args, kwargs, caller_context=None):  # emit a call to JIT function `fn`, generating its definition in the module on first use
+        args = inspect.getcallargs(fn.fn, *args, **kwargs)  # bind positional/keyword arguments to parameter names
+        args = [args[name] for name in fn.arg_names]  # back to a list, in fn.arg_names order
+        for i, arg in enumerate(args):
+            if isinstance(arg, (language.dtype, float, int, bool, JITFunction)):
+                args[i] = language.core.constexpr(arg)  # these plain values are passed as compile-time constants
+        args_cst = find_paths_if(args, lambda _, x: _is_constexpr(x))
+        args_cst = {path: get_iterable_path(args, path) for path in args_cst}  # path -> constexpr argument
+        args_path = find_paths_if(args, lambda _, x: not _is_constexpr(x))
+        args_val = [get_iterable_path(args, path) for path in args_path]  # the remaining, non-constexpr arguments
+        # mangle
+        caller_context = caller_context or self.caller_context
+        fn_name = mangle_fn(get_full_name(fn), [arg.type for arg in args_val], args_cst, caller_context)  # name derived from value types, constants and caller context
+        # generate function def if necessary
+        if not self.module.has_function(fn_name):
+            # If the callee is not set, we use the same debug setting as the caller
+            file_name, begin_line = get_jit_fn_file_line(fn)
+            arg_types = [
+                language.core.constexpr if arg is None or isinstance(arg,
+                                                                     (bool, int, language.core.dtype)) else arg.type
+                for arg in args
+            ]
+            prototype = ASTFunction([], arg_types, args_cst, dict())
+            generator = CodeGenerator(self.context, prototype, fn.get_capture_scope(), module=self.module, jit_fn=fn,  # nested generator emitting into the same module
+                                      function_name=fn_name, function_types=self.function_ret_types,
+                                      noinline=fn.noinline, file_name=file_name, begin_line=begin_line,
+                                      options=self.builder.options, codegen_fns=self.builder.codegen_fns,
+                                      module_map=self.builder.module_map, caller_context=caller_context,
+                                      is_gluon=self.is_gluon)
+            try:
+                generator.visit(fn.parse())
+            except Exception as e:
+                # Wrap the error in the callee with the location of the call.
+                if knobs.compilation.front_end_debugging:
+                    raise
+                raise CompilationError(self.jit_fn.src, self.cur_node, None) from e  # chained so the callee's own error stays visible
+
+            callee_ret_type = generator.ret_type
+            self.function_ret_types[fn_name] = callee_ret_type  # cached for later calls that hit the else branch
+        else:
+            callee_ret_type = self.function_ret_types[fn_name]
+        symbol = self.module.get_function(fn_name)
+        args_val = flatten_values_to_ir(args_val)
+        call_op = self.builder.call(symbol, args_val)
+        if callee_ret_type == language.void:
+            return None
+        handles = [call_op.get_result(i) for i in range(call_op.get_num_results())]
+        return next(unflatten_ir_values(handles, [callee_ret_type]))  # one return type was passed, so take the first rebuilt value
+
+    def call_Function(self, node, fn, args, kws):  # dispatch a call: JIT function, triton builtin / method on a triton value, or any other Python callable
+        if isinstance(fn, (BoundJITMethod, BoundConstexprFunction)):
+            args.insert(0, fn.__self__)  # unbind: pass the receiver explicitly (mutates the caller's list in place)
+            fn = fn.__func__
+        if isinstance(fn, JITFunction):
+            _check_fn_args(node, fn, args)
+            return self.call_JitFunction(fn, args, kws)
+        if (hasattr(fn, '__self__') and _is_triton_value(fn.__self__)) or language.core.is_builtin(fn) or isinstance(
+                fn, ConstexprFunction):
+            extra_kwargs = dict()
+
+            if isinstance(fn, ConstexprFunction):
+                sig = inspect.signature(fn.__call__)
+            else:
+                sig = inspect.signature(fn)
+            if '_semantic' in sig.parameters:  # only inject the private keywords the callee actually declares
+                extra_kwargs["_semantic"] = self.semantic
+            if '_generator' in sig.parameters:
+                extra_kwargs['_generator'] = self
+            try:
+                ret = fn(*args, **extra_kwargs, **kws)
+                # builtin functions return plain tuples for readability
+                if isinstance(ret, tuple):
+                    ret = language.tuple(ret)
+                return ret
+            except Exception as e:
+                if knobs.compilation.front_end_debugging:
+                    raise
+                # Normally when we raise a CompilationError, we raise it as
+                # `from None`, because the original fileline from the exception
+                # is not relevant (and often points into code_generator.py
+                # itself).  But when calling a function, we raise as `from e` to
+                # preserve the traceback of the original error, which may e.g.
+                # be in core.py.
+                raise CompilationError(self.jit_fn.src, node, str(e)) from e
+
+        if fn in self.builtin_namespace.values() or (hasattr(fn, '__self__') and not _is_triton_value(fn.__self__)):
+            args = map(_unwrap_if_constexpr, args)  # positional args are unwrapped to raw values here; kws are passed through unchanged
+        ret = fn(*args, **kws)
+
+        def wrap_constexpr(x):  # triton values pass through; anything else is wrapped as a constexpr
+            if _is_triton_value(x):
+                return x
+            return constexpr(x)
+
+        if isinstance(ret, (builtins.tuple, language.tuple)):
+            return _apply_to_tuple_values(ret, wrap_constexpr)
+        return wrap_constexpr(ret)
+
+    def call_Method(self, node, fn, fn_self, args, kws):  # call `fn` as a method of `fn_self`
+        if isinstance(fn, JITFunction):
+            args.insert(0, fn_self)  # only JIT functions get the receiver prepended here (mutates `args` in place)
+        return self.call_Function(node, fn, args, kws)
+
+    def visit_Call(self, node):  # evaluate a call expression: resolve the callee, enforce must-use results, evaluate arguments, dispatch
+        fn = _unwrap_if_constexpr(self.visit(node.func))
+        if not isinstance(fn, BoundJITMethod):
+            static_implementation = self.statically_implemented_functions.get(fn)  # compile-time handlers take precedence
+            if static_implementation is not None:
+                return static_implementation(self, node)  # handler receives the unevaluated call node
+
+        mur = getattr(fn, '_must_use_result', False)  # either falsy, True, or a str with an extra explanation
+        if mur and getattr(node, '_is_unused', False):  # `_is_unused` is set by visit_Expr on expression statements
+            error_message = ["The result of %s is not being used." % ast.unparse(node.func)]
+            if isinstance(mur, str):
+                error_message.append(mur)
+            raise CompilationError(self.jit_fn.src, node, " ".join(error_message))
+
+        kws = dict(self.visit(keyword) for keyword in node.keywords)  # keywords are evaluated before positional arguments
+        args = []
+        for arg in node.args:
+            if isinstance(arg, ast.Starred):  # `*expr`: expr must evaluate to a language.core.tuple, which is spliced in
+                arg = self.visit(arg.value)
+                assert isinstance(arg, language.core.tuple)
+                args.extend(arg.values)
+            else:
+                args.append(self.visit(arg))
+
+        return self.call_Function(node, fn, args, kws)
+
+    def visit_Constant(self, node):  # literal values are wrapped as constexpr
+        return constexpr(node.value)
+
+    def visit_BoolOp(self, node: ast.BoolOp):
+        method_name = self._method_name_for_bool_op.get(type(node.op))
+        if method_name is None:
+            raise self._unsupported(
+                node, "AST boolean operator '{}' is not (currently) implemented.".format(node.op.__name__))
+
+        nontrivial_values = []
+
+        for subnode in node.values:
+            # we visit the values in order, executing their side-effects
+            # and possibly early-exiting:
+            value = self.visit(subnode)
+            if not _is_triton_tensor(value):
+                # this is a constexpr, so we might be able to short-circuit:
+                bv = bool(value)
+                if (bv is False) and (method_name == "logical_and"):
+                    # value is falsey so return that:
+                    return value
+                if (bv is True) and (method_name == "logical_or"):
+                    # value is truthy so return that:
+                    return value
+                # otherwise, our constexpr has no effect on the output of the
+                # expression so we do not append it to nontrivial_values.
+            else:
+                if value.type.is_block():
+                    lineno = getattr(node, "lineno", None)
+                    if lineno is not None:
+                        lineno += self.begin_line
+                    warnings.warn_explicit(
+                        "Logical operators 'and' and 'or' are deprecated for non-scalar tensors; please use '&' or '|' instead",
+                        category=UserWarning,
+                        filename=self.file_name,
+                        lineno=lineno,
+                        source=ast.unparse(node),
+                    )
+                # not a constexpr so we must append it:
+                nontrivial_values.append(value)
+
+        if len(nontrivial_values) == 0:
+            # the semantics of a disjunction of falsey values or conjunction
+            # of truthy values is to return the final value:
+            nontrivial_values.append(value)
+
+        while len(nontrivial_values) >= 2:
+            rhs = nontrivial_values.pop()
+            lhs = nontrivial_values.pop()
+            res = self._apply_binary_method(node, method_name, lhs, rhs)
+            nontrivial_values.append(res)
+
+        assert len(nontrivial_values) == 1
+        return nontrivial_values[0]
+
+    _method_name_for_bool_op: Dict[Type[ast.boolop], str] = {ast.And: 'logical_and', ast.Or: 'logical_or'}  # AST boolean-operator class -> method name looked up by visit_BoolOp
+
+    def get_Attribute(self, lhs, attr):  # resolve attribute `attr` on the already-evaluated object `lhs`
+        if _is_triton_tensor(lhs) and attr == "T":
+            return self.semantic.permute(lhs, (1, 0))  # `.T` on a tensor: permute with axis order (1, 0)
+        # NOTE: special case ".value" for BC
+        if isinstance(lhs, constexpr) and attr not in ("value", "type"):
+            lhs = lhs.value  # any other attribute is looked up on the wrapped Python value
+        attr = getattr(lhs, attr)
+        if _is_triton_value(lhs) and isinstance(attr, JITFunction):
+            return BoundJITMethod(lhs, attr)  # keep the receiver so call_Function can pass it as first argument
+        return attr
+
+    def visit_Attribute(self, node):  # evaluate `value.attr`, redirecting modules through builder.module_map first
+        lhs = self.visit(node.value)
+        if isinstance(lhs, ModuleType):
+            # follow module_map until reaching fixed-point:
+            while (name := lhs.__name__) in self.builder.module_map:
+                lhs = self.builder.module_map[name]  # substitute the mapped module
+                if lhs.__name__ == name:  # mapped to a module with the same name: stop, otherwise this would loop forever
+                    break
+        return self.get_Attribute(lhs, node.attr)
+
+    def visit_Expr(self, node):  # expression statement: its value is discarded
+        node.value._is_unused = True  # flag read by visit_Call to enforce `_must_use_result`
+        ast.NodeVisitor.generic_visit(self, node)  # base-class traversal (this class's own generic_visit raises instead)
+
+    def visit_NoneType(self, node):  # always evaluates to None; `node` is unused
+        return None
+
+    def visit_JoinedStr(self, node):  # evaluate an f-string at compile time into a plain Python str
+        values = list(node.values)  # copy, so the AST node's own list is not overwritten below
+        for i, value in enumerate(values):
+            if isinstance(value, ast.Constant):
+                values[i] = str(value.value)  # literal text segment
+            elif isinstance(value, ast.FormattedValue):  # a `{...}` replacement field
+                conversion_code = value.conversion  # negative means no conversion; otherwise the character code used after `!`
+                evaluated = self.visit(value.value)
+                if not _is_constexpr(evaluated):
+                    raise self._unsupported(
+                        node,
+                        "Cannot evaluate f-string containing non-constexpr conversion values, found conversion of type "
+                        + str(type(evaluated)))
+                values[i] = ("{}" if conversion_code < 0 else "{!" + chr(conversion_code) + "}").format(evaluated.value)  # NOTE(review): value.format_spec is never read, so a format spec in the field has no effect - confirm intended
+            else:
+                raise AssertionError("encountered unexpected node of type {} in a JoinedStr node".format(type(value)))
+        return ''.join(values)
+
+    def visit(self, node):  # visit one AST node while tracking the current node and builder location, wrapping failures in CompilationError
+        if node is None:  # absent optional children evaluate to None
+            return
+        with warnings.catch_warnings():
+            # The ast library added visit_Constant and deprecated some other
+            # methods but we can't move to that without breaking Python 3.6 and 3.7.
+            warnings.simplefilter("ignore", DeprecationWarning)  # python 3.9
+            warnings.simplefilter("ignore", PendingDeprecationWarning)  # python 3.8
+            last_node = self.cur_node
+            last_loc = self.builder.get_loc()
+            self.cur_node = node
+            if hasattr(node, 'lineno') and hasattr(node, 'col_offset'):
+                here_loc = self.builder.create_loc(self.file_name, self.begin_line + node.lineno, node.col_offset)  # node line numbers are offset by the function's starting line
+                if self.name_loc_as_prefix is not None:
+                    self.builder.set_loc(self.builder.create_name_loc(self.name_loc_as_prefix, here_loc))
+                else:
+                    self.builder.set_loc(here_loc)
+                last_loc = self.builder.get_loc()  # NOTE(review): overwrites the pre-visit location saved above, so the reset below restores this node's own location - confirm intended
+            try:
+                ret = super().visit(node)
+            except CompilationError:  # already carries source context: propagate unchanged
+                raise
+            except Exception as e:
+                if knobs.compilation.front_end_debugging:
+                    raise
+                # Wrap the error in a CompilationError which contains the source
+                # of the @jit function.
+                raise CompilationError(self.jit_fn.src, self.cur_node, repr(e)) from None
+
+            # Reset the location to the last one before the visit
+            if last_loc:  # note: cur_node is only restored when a location is available
+                self.cur_node = last_node
+                self.builder.set_loc(last_loc)
+            return ret
+
+    def generic_visit(self, node):  # always rejects the node; presumably reached for node types without a dedicated visit_* method
+        raise self._unsupported(node, "unsupported AST node type: {}".format(type(node).__name__))
+
+    def execute_static_assert(self, node: ast.Call) -> None:  # evaluate `static_assert(cond[, msg])` at compile time from the unevaluated call node
+        arg_count = len(node.args)
+        if not (0 < arg_count <= 2) or len(node.keywords):  # exactly one or two positionals, no keywords
+            raise TypeError("`static_assert` requires one or two positional arguments only")
+
+        passed = _unwrap_if_constexpr(self.visit(node.args[0]))
+        if not isinstance(passed, bool):  # anything but a plain bool means the condition is not known at compile time
+            raise NotImplementedError(
+                "Assertion condition could not be determined at compile-time. Make sure that it depends only on `constexpr` values"
+            )
+        if not passed:
+            if arg_count == 1:
+                message = ""
+            else:
+                try:
+                    message = self.visit(node.args[1])  # the message is only evaluated when the assertion fails
+                except Exception as e:
+                    message = "<failed to evaluate assertion message: " + repr(e) + ">"  # deliberate best-effort: still report the failed assertion
+
+            raise CompileTimeAssertionFailure(self.jit_fn.src, node, _unwrap_if_constexpr(message))
+        return None
+
+    def static_executor(python_fn):  # class-body helper (no `self`): wraps a Python callable as a compile-time call handler
+
+        def ret(self, node: ast.Call):  # handler signature matches the `static_implementation(self, node)` call in visit_Call
+            kws = {
+                name: _unwrap_if_constexpr(value)
+                for name, value in (self.visit(keyword) for keyword in node.keywords)
+            }
+            args = [_unwrap_if_constexpr(self.visit(arg)) for arg in node.args]  # keywords are evaluated before positionals
+            return constexpr(python_fn(*args, **kws))  # call with raw values and wrap the result as a constexpr
+
+        return ret
+
+    from ..experimental.gluon import language as ttgl  # imported at this indentation level rather than at the top of the file
+    statically_implemented_functions: Dict[object, Callable[[ast.Call], Any]] = {  # callee object -> compile-time handler; visit_Call invokes handlers as handler(self, node)
+        language.core.static_assert: execute_static_assert,
+        language.core.static_print: static_executor(print),
+        ttgl.static_assert: execute_static_assert,
+        ttgl.static_print: static_executor(print),
+        int: static_executor(int),  # builtin int(...) and len(...) calls are evaluated at compile time
+        len: static_executor(len),
+    }
+
+
+def ast_to_ttir(fn, src, context, options, codegen_fns, module_map, module=None):
+    arg_types = [None] * len(fn.arg_names)
+
+    for k, v in src.signature.items():
+        idx = fn.arg_names.index(k)
+        arg_types[idx] = str_to_ty(v, None)
+
+    def apply_constexpr_types(argument, indices, value):
+        index = indices.pop()
+        if len(indices) == 0:
+            if isinstance(argument, list):
+                argument[index] = constexpr(value).type
+            else:
+                argument.types[index] = constexpr(value).type
+        else:
+            apply_constexpr_types(argument[index], indices, value)
+
+    for path, value in src.constants.items():
+        apply_constexpr_types(arg_types, list(path)[::-1], value)
+
+    prototype = ASTFunction([], arg_types, src.constants, src.attrs)
+    file_name, begin_line = get_jit_fn_file_line(fn)
+    # query function representation
+    from collections import namedtuple
+    leaves = filter(lambda v: len(v) == 1, src.constants)
+    constants = {fn.arg_names[i[0]]: src.constants[i] for i in leaves}
+    signature = src.signature
+    proxy = namedtuple("SpecializationProxy", ["constants", "signature"])(constants, signature)
+    generator = CodeGenerator(context, prototype, gscope=fn.get_capture_scope(), function_name=fn.repr(proxy),
+                              jit_fn=fn, is_kernel=True, file_name=file_name, begin_line=begin_line, options=options,
+                              codegen_fns=codegen_fns, module_map=module_map, module=module, is_gluon=fn.is_gluon())
+    generator.visit(fn.parse())
+    module = generator.module
+    # module takes ownership of the context
+    module.context = context
+    if not module.verify():
+        if not fn.is_gluon():
+            print(module)
+        raise RuntimeError("error encountered during parsing")
+    return module
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/compiler.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/compiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4872a463589d2cfdf302a546b8c2e021fb1a7c3
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/compiler.py
@@ -0,0 +1,501 @@
+from __future__ import annotations
+import hashlib
+import json
+from .._C.libtriton import get_cache_invalidating_env_vars, ir
+from ..backends import backends
+from ..backends.compiler import Language
+from ..backends.compiler import BaseBackend, GPUTarget
+from .. import __version__, knobs
+from ..runtime.autotuner import OutOfResources
+from ..runtime.cache import get_cache_manager, get_dump_manager, get_override_manager, get_cache_key
+from ..runtime.driver import driver
+from ..tools.disasm import get_sass
+from pathlib import Path
+import re
+import functools
+import os
+import time
+import copy
+
+# Regex matching a PTX kernel/function prototype:
+# - \.(?:visible|extern)\s+ : match the `.visible` or `.extern` directive and any following whitespace
+# - \.(?:entry|func)\s+ : match the `.entry` or `.func` directive and any following whitespace
+# - (\w+) : match one or more word characters (letters, digits, or underscores),
+#   and capture it as group 1 (the function name)
+# - \s*\(([^)]*)\) : match a pair of parentheses and capture everything between them
+#   (up to the first closing parenthesis) as group 2 (the parameter list)
+ptx_prototype_pattern = r"\.(?:visible|extern)\s+\.(?:entry|func)\s+(\w+)\s*\(([^)]*)\)"
+# Prototype regexes keyed by IR file extension; only "ptx" is defined here.
+prototype_pattern = {
+    "ptx": ptx_prototype_pattern,
+}
+
+# Regex capturing, as group 1, the word that follows `.param .` in a PTX parameter declaration.
+ptx_arg_type_pattern = r"\.param\s+\.(\w+)"
+# Argument-type regexes keyed by IR file extension; only "ptx" is defined here.
+arg_type_pattern = {
+    "ptx": ptx_arg_type_pattern,
+}
+
+
+def convert_type_repr(x):
+    # Currently we only capture the pointer type and assume the pointer is on global memory.
+    # TODO: Capture and support shared memory space
+    match = re.search(r'!tt\.ptr<([^,]+)', x)
+    tma = re.search(r'tt.nv_tma_desc = 1', x)
+    if tma is not None:
+        return 'nvTmaDesc'
+    x = re.sub(r' {[^}]+}', '', x)
+    if match is not None:
+        return '*' + convert_type_repr(match.group(1))
+    return x
+
+
+class ASTSource:
+
+    def __init__(self, fn, signature, constexprs=None, attrs=None) -> None:
+        self.fn = fn
+        self.language = Language.TRITON
+        self.ext = "ttir"
+        self.name = fn.__name__
+        self.signature = signature
+        self.constants = dict()
+        if constexprs is not None:
+            for k, v in constexprs.items():
+                k = (fn.arg_names.index(k), ) if isinstance(k, str) else k
+                assert isinstance(k, tuple)
+                self.constants[k] = v
+        self.attrs = attrs or dict()
+        for k in self.signature.keys():
+            if not isinstance(k, str):
+                raise TypeError("Signature keys must be string")
+
+    def hash(self):
+        sorted_sig = [v for k, v in sorted(self.signature.items())]
+        get_key = lambda x: x.cache_key if hasattr(x, 'cache_key') else str(x)
+        constants_key = '-'.join([get_key(v) for k, v in sorted(self.constants.items())])
+        key = f"{self.fn.cache_key}-{str(self.attrs)}-{sorted_sig}-{constants_key}"
+        return hashlib.sha256(key.encode("utf-8")).hexdigest()
+
+    def make_ir(self, target: GPUTarget, options, codegen_fns, module_map, context):
+        from .code_generator import ast_to_ttir
+        return ast_to_ttir(self.fn, self, context=context, options=options, codegen_fns=codegen_fns,
+                           module_map=module_map)
+
+    def parse_options(self):
+        return dict()
+
+
+class IRSource:
+    """Compilation source backed by an IR file on disk.
+
+    The constructor reads the file, loads the core and backend dialects into the given
+    context, and derives ``name`` and ``signature`` from the file contents.
+    """
+
+    def __init__(self, path, context, backend):
+        """Read the IR file at ``path`` and extract the entry function name and signature.
+
+        :param path: path to the IR file; its suffix (without the dot) becomes ``self.ext``
+        :param context: IR context the dialects are loaded into and the module is parsed with
+        :param backend: backend object whose ``load_dialects`` is called on ``context``
+        """
+        self.path = path
+        path = Path(path)
+        self.ext = path.suffix[1:]
+        self.language = Language.TRITON
+        self.src = path.read_text()
+        # Dialects are loaded before any parsing happens below.
+        ir.load_dialects(context)
+        backend.load_dialects(context)
+
+        # We don't have a easy-to-use PTX parser that we can use, so keep that regex for now.
+        # TODO - replace with a proper parser
+        if self.ext == "ptx":
+            # NOTE(review): `match` is used without a None check, so a file with no
+            # recognizable prototype fails with AttributeError on the next line.
+            match = re.search(prototype_pattern[self.ext], self.src, re.MULTILINE)
+            self.name = match.group(1)
+            signature = match.group(2)
+            types = re.findall(arg_type_pattern[self.ext], signature)
+            # Signature keys are the positional indices of the parameters.
+            self.signature = {k: convert_type_repr(ty) for k, ty in enumerate(types)}
+            # NOTE(review): this branch never assigns `self.module`, which `make_ir` reads.
+        else:
+            self.module = ir.parse_mlir_module(self.path, context)
+            fn_name = self.module.get_entry_func_name()
+            self.name = "@" + fn_name
+            funcOp = self.module.get_function(fn_name)
+            func_ty = self.module.get_function_signature(funcOp)
+            # Signature keys are the positional indices of the function arguments.
+            self.signature = {k: ty for k, ty in enumerate(func_ty)}
+
+    def hash(self):
+        """Return the SHA-256 hex digest of the file text."""
+        return hashlib.sha256(self.src.encode("utf-8")).hexdigest()
+
+    def make_ir(self, target: GPUTarget, options, codegen_fns, module_map, context):
+        """Attach ``context`` to the already-parsed module and return it.
+
+        The remaining parameters are accepted for signature parity with ``ASTSource.make_ir``
+        and are not used.
+        """
+        self.module.context = context
+        return self.module
+
+    def parse_options(self):
+        """Return compile options read from the module attributes.
+
+        For ``ttgir`` sources this yields ``num_warps`` (required) and ``num_ctas`` (when
+        present); for every other extension it returns an empty dict.
+        """
+        if self.ext == "ttgir":
+            num_warps = self.module.get_int_attr("ttg.num-warps")
+            assert num_warps is not None, "Unable to parse ttg.num-warps attribute"
+            options = {'num_warps': num_warps}
+            num_ctas = self.module.get_int_attr("ttg.num-ctas")
+            if num_ctas is not None:
+                options['num_ctas'] = num_ctas
+            return options
+        return dict()
+
+
+@functools.lru_cache()
+def max_shared_mem(device):
+    """Return the ``"max_shared_mem"`` device property for ``device``.
+
+    The result is memoized per ``device`` argument by ``functools.lru_cache``.
+    """
+    return driver.active.utils.get_device_properties(device)["max_shared_mem"]
+
+
+def parse(full_name, ext, context):
+    if ext == "ttir" or ext == "ttgir":
+        module = ir.parse_mlir_module(full_name, context)
+        module.context = context
+        return module
+    if ext == "llir" or ext == "ptx" or ext == "amdgcn":
+        return Path(full_name).read_text()
+    if ext == "cubin" or ext == "hsaco":
+        return Path(full_name).read_bytes()
+
+
+def filter_traceback(e: BaseException):
+    """
+    Removes code_generator.py and related files from tracebacks.
+
+    These are uninteresting to the user -- "just show me *my* code!"
+
+    The exception is modified in place (its ``__traceback__`` chain is relinked), and the
+    same filtering is applied recursively to ``__cause__`` and ``__context__``. Nothing is
+    changed when ``knobs.compilation.front_end_debugging`` is set.
+    """
+    if knobs.compilation.front_end_debugging:
+        return
+
+    # Filter chained exceptions first.
+    if e.__cause__ is not None:
+        filter_traceback(e.__cause__)
+    if e.__context__ is not None:
+        filter_traceback(e.__context__)
+
+    # If a user has a file that matches one of these, they're out of luck.
+    BAD_FILES = [
+        "/triton/compiler/code_generator.py",
+        "/ast.py",
+    ]
+    # Use the platform's path separator so the suffix test also works where it is not "/".
+    BAD_FILES = [bad_file.replace("/", os.sep) for bad_file in BAD_FILES]
+
+    # Collect the traceback entries whose source file is not one of the filtered files.
+    tb = e.__traceback__
+    frames = []
+    while tb is not None:
+        if not any(f for f in BAD_FILES if tb.tb_frame.f_code.co_filename.endswith(f)):
+            frames.append(tb)
+        tb = tb.tb_next
+
+    # Relink the kept entries so each one points directly at the next kept entry.
+    for (cur_frame, next_frame) in zip(frames, frames[1:]):
+        cur_frame.tb_next = next_frame
+
+    if not frames:
+        e.__traceback__ = None
+    else:
+        # Terminate the chain at the last kept entry, dropping any filtered tail.
+        frames[-1].tb_next = None
+        e.__traceback__ = frames[0]
+
+
+class CompileTimer:
+
+    def __init__(self) -> None:
+        self.start: float = time.time()
+        self.ir_initialization_end: float | None = None
+        self.lowering_stage_ends: list[tuple[str, float]] = []
+        self.store_results_end: float | None = None
+
+    def finished_ir_initialization(self) -> None:
+        self.ir_initialization_end = time.time()
+
+    def stage_finished(self, stage_name: str) -> None:
+        self.lowering_stage_ends.append((stage_name, time.time()))
+
+    def end(self) -> knobs.CompileTimes:
+        timestamp = time.time()
+        if self.ir_initialization_end is None:
+            self.ir_initialization_end = timestamp
+        else:
+            self.store_results_end = timestamp
+
+        def delta(start: float, end: float | None) -> int:
+            if end is None:
+                return 0
+            return int((end - start) * 1000000)
+
+        lowering_stage_durations = []
+        stage_start = self.ir_initialization_end
+        for stage_name, stage_end in self.lowering_stage_ends:
+            lowering_stage_durations.append((stage_name, delta(stage_start, stage_end)))
+            stage_start = stage_end
+
+        return knobs.CompileTimes(
+            ir_initialization=delta(self.start, self.ir_initialization_end),
+            lowering_stages=lowering_stage_durations,
+            store_results=delta(stage_start, self.store_results_end),
+        )
+
+
+def compile(src, target=None, options=None, _env_vars=None):
+    """Compile ``src`` for ``target`` and return a ``CompiledKernel``.
+
+    Artifacts are stored through the cache manager; when the metadata file for the computed
+    key is already cached (and ``knobs.compilation.always_compile`` is off) the cached kernel
+    is returned without recompiling.
+
+    :param src: an ``ASTSource`` instance, or a string path to an IR file (wrapped in ``IRSource``)
+    :param target: a ``GPUTarget``; defaults to the active driver's current target
+    :param options: optional dict of options, merged with ``src.parse_options()`` and handed
+        to ``backend.parse_options``
+    :param _env_vars: optional replacement for ``get_cache_invalidating_env_vars()``
+    :return: the ``CompiledKernel`` for the compiled or cached artifacts
+    :raises AssertionError: if ``target`` is not a ``GPUTarget`` or a non-AST ``src`` is not a string
+    """
+    compilation_listener = knobs.compilation.listener
+    # The timer only exists when a listener is installed; every later use is guarded the same way.
+    if compilation_listener:
+        timer = CompileTimer()
+
+    if target is None:
+        target = driver.active.get_current_target()
+    assert isinstance(target, GPUTarget), "target must be of GPUTarget type"
+    backend = make_backend(target)
+    ir_source = not isinstance(src, ASTSource)
+    # create backend
+    if ir_source:
+        assert isinstance(src, str), "source must be either AST or a filepath"
+        context = ir.context()
+        src = IRSource(src, context, backend)
+
+    extra_options = src.parse_options()
+    options = backend.parse_options(dict(options or dict(), **extra_options))
+    # create cache manager
+    env_vars = get_cache_invalidating_env_vars() if _env_vars is None else _env_vars
+    key = get_cache_key(src, backend, options, env_vars=env_vars)
+    hash = hashlib.sha256(key.encode("utf-8")).hexdigest()
+    fn_cache_manager = get_cache_manager(hash)
+    # For dumping/overriding only hash the source as we want it to be independent of triton
+    # core changes to make it easier to track kernels by hash.
+    enable_override = knobs.compilation.override
+    enable_ir_dump = knobs.compilation.dump_ir
+    store_only_binary = knobs.compilation.store_binary_only
+    fn_override_manager = get_override_manager(src.hash()) if enable_override else None
+    fn_dump_manager = get_dump_manager(src.hash()) if enable_ir_dump else None
+    # Pre-truncate the file name here to avoid hitting the 255 character limit on common platforms.
+    # The final file name in the cache will have a format of f"{filename}.{ext}.tmp.pid_{pid}_{uuid}".
+    # A PID string can be 5-character long. A UUID string has typically 36 characters. Let's truncate
+    # the file name to 150 characters to be safe.
+    file_name = src.name[:150]
+    metadata_filename = f"{file_name}.json"
+    metadata_group = fn_cache_manager.get_group(metadata_filename) or {}
+    metadata_path = metadata_group.get(metadata_filename)
+    always_compile = knobs.compilation.always_compile
+    if not always_compile and metadata_path is not None:
+        # cache hit!
+        res = CompiledKernel(src, metadata_group, hash)
+        if compilation_listener:
+            compilation_listener(
+                src=src,
+                metadata=res.metadata._asdict(),
+                metadata_group=metadata_group,
+                times=timer.end(),
+                cache_hit=True,
+            )
+        return res
+
+    # initialize metadata
+    metadata = {
+        "hash": hash,
+        "target": target,
+        **options.__dict__,
+        **env_vars,
+    }
+    metadata["triton_version"] = __version__
+    # run compilation pipeline  and populate metadata
+    stages = dict()
+    backend.add_stages(stages, options, src.language)
+    # Index of the stage whose key equals the source's own extension.
+    first_stage = list(stages.keys()).index(src.ext)
+    # when the source is an IR file, don't apply the passes related to this stage. This makes it easier to write IR level tests.
+    if ir_source:
+        first_stage += 1
+
+    # For IRSource, we have already grabbed the context + called both
+    # ir.load_dialects and backend.load_dialects.
+    if not isinstance(src, IRSource):
+        context = ir.context()
+        ir.load_dialects(context)
+        backend.load_dialects(context)
+
+    codegen_fns = backend.get_codegen_implementation(options)
+    module_map = backend.get_module_map()
+    try:
+        module = src.make_ir(target, options, codegen_fns, module_map, context)
+    except Exception as e:
+        # Strip filtered frames from the traceback, then re-raise the same exception.
+        filter_traceback(e)
+        raise
+
+    # Store the initial module: under its own extension for IR sources, as ".source" otherwise.
+    if ir_source:
+        ir_filename = f"{file_name}.{src.ext}"
+        metadata_group[ir_filename] = fn_cache_manager.put(module, ir_filename)
+    else:
+        ir_filename = f"{file_name}.source"
+        metadata_group[ir_filename] = fn_cache_manager.put(module, ir_filename)
+
+    use_ir_loc = knobs.compilation.use_ir_loc
+    if ir_source and use_ir_loc:
+        module.create_location_snapshot(src.path)
+        print(f"Creating new locations for {src.path}")
+
+    if compilation_listener:
+        timer.finished_ir_initialization()
+    # Run the remaining stages in order; each consumes the previous stage's output.
+    for ext, compile_ir in list(stages.items())[first_stage:]:
+        next_module = compile_ir(module, metadata)
+        ir_filename = f"{file_name}.{ext}"
+        if fn_override_manager is None:
+            # Users can override kernels at scale by setting `ir_override` in autotune config
+            # without TRITON_KERNEL_OVERRIDE
+            if (ir_override := metadata.get("ir_override", None)) and ir_override.endswith(f".{ext}"):
+                next_module = parse(ir_override, ext, context)
+        elif full_name := fn_override_manager.get_file(ir_filename):
+            print(f"\nOverriding kernel with file {full_name}")
+            next_module = parse(full_name, ext, context)
+        # If TRITON_STORE_BINARY_ONLY is 1, only store cubin/hsaco/json
+        if (not store_only_binary) or (ext in ("cubin", "hsaco", "json")):
+            metadata_group[ir_filename] = fn_cache_manager.put(next_module, ir_filename)
+        if fn_dump_manager is not None:
+            fn_dump_manager.put(next_module, ir_filename)
+            if ext == "cubin":
+                # Also dump the SASS obtained from the cubin.
+                sass = get_sass(next_module)
+                fn_dump_manager.put(sass, file_name + ".sass")
+        # use an env variable to parse ir from file
+        if use_ir_loc == ext:
+            ir_full_name = fn_cache_manager.get_file(ir_filename)
+            next_module.create_location_snapshot(ir_full_name)
+            print(f"Creating new locations for {ir_full_name}")
+        module = next_module
+        if compilation_listener:
+            timer.stage_finished(ext)
+    # write-back metadata
+    # `default=vars` serializes non-JSON objects in the metadata through their `__dict__`.
+    metadata_group[metadata_filename] = fn_cache_manager.put(json.dumps(metadata, default=vars), metadata_filename,
+                                                             binary=False)
+    fn_cache_manager.put_group(metadata_filename, metadata_group)
+
+    # notify any listener
+    if compilation_listener:
+        compilation_listener(src=src, metadata=metadata, metadata_group=metadata_group, times=timer.end(),
+                             cache_hit=False)
+    # return handle to compiled kernel
+    return CompiledKernel(src, metadata_group, hash)
+
+
+def make_backend(target: GPUTarget) -> BaseBackend:
+    actives = [x.compiler for x in backends.values() if x.compiler.supports_target(target)]
+    if len(actives) != 1:
+        raise RuntimeError(
+            f"{len(actives)} compatible backends for target ({target.backend}) ({actives}). There should only be one.")
+    return actives[0](target)
+
+
+class LazyDict:
+
+    def __init__(self, data):
+        self.data = data
+        self.extras = []
+
+    def get(self):
+        for func, args in self.extras:
+            self.data = self.data | func(*args)
+        self.extras.clear()
+        return self.data
+
+    def add(self, func, args):
+        self.extras.append((func, args))
+
+
+class AsmDict(dict):
+
+    def __missing__(self, key):
+
+        if key == "sass":
+            value = get_sass(self["cubin"])
+        else:
+            raise KeyError("Unknown key: '%s'" % key)
+
+        self[key] = value
+        return value
+
+
+def _raise_error(err, *args, **kwargs):
+    """Raise a deep copy of ``err``; any further arguments are accepted and ignored."""
+    raise copy.deepcopy(err)
+
+
+class CompiledKernel:
+    """Handle to a compiled kernel and the artifacts produced while compiling it.
+
+    Metadata and artifact contents are read from disk in the constructor; the driver-level
+    module, function and launcher are created lazily by ``_init_handles``.
+    """
+
+    def __init__(self, src, metadata_group, hash):
+        """Load metadata and artifacts listed in ``metadata_group``.
+
+        :param src: the source object the kernel was compiled from
+        :param metadata_group: mapping of artifact file name to path on disk; must contain
+            an entry whose name ends in ``.json``
+        :param hash: cache hash associated with this kernel
+        """
+        from collections import namedtuple
+        # The first entry ending in ".json" is taken as the metadata file.
+        metadata_path = next((Path(p) for c, p in metadata_group.items() if c.endswith(".json")))
+        metadata = json.loads(metadata_path.read_text())
+        # JSON serialization dumps the target as a dict. Restore it to a GPUTarget.
+        target = metadata['target']
+        metadata['target'] = GPUTarget(target['backend'], target['arch'], target['warp_size'])
+        # Expose the metadata keys as attributes of an immutable namedtuple.
+        KernelMetadata = namedtuple('KernelMetadata', sorted(list(metadata.keys())))
+        self.metadata = KernelMetadata(**metadata)
+        backend = make_backend(self.metadata.target)
+        self.packed_metadata = backend.pack_metadata(self.metadata)
+        self.src = src
+        self.hash = hash
+        self.name = self.metadata.name
+        # stores the text of each level of IR that was generated during compilation
+        asm_files = [Path(p) for c, p in metadata_group.items() if not c.endswith(".json")]
+        binary_ext = backend.binary_ext
+        # Keyed by file suffix; the backend's binary artifact is read as bytes, the rest as text.
+        self.asm = AsmDict({
+            file.suffix[1:]: file.read_bytes() if file.suffix[1:] == binary_ext else file.read_text()
+            for file in asm_files
+        })
+        self.metadata_group = metadata_group
+        self.kernel = self.asm[binary_ext]
+        # binaries are lazily initialized
+        # because it involves doing runtime things
+        # (e.g., checking amount of shared memory on current device)
+        self.module = None
+        self.function = None
+        self._run = None
+
+    def _init_handles(self):
+        """Create the launcher and load the binary on the current device (once).
+
+        Returns immediately when ``self.module`` is already set.
+
+        :raises OutOfResources: if the kernel needs more shared memory, tensor memory or
+            threads than the limits checked below
+        """
+        if self.module is not None:
+            return
+
+        def raise_(err):
+            # clone the exception object so that the one saved in the closure
+            # of the partial function below doesn't get assigned a stack trace
+            # after the subsequent raise. otherwise, the CompiledKernel instance
+            # saved in the (global) kernel cache will keep references to all the
+            # locals in the traceback via the exception instance in the closure.
+            cloned_err = copy.deepcopy(err)
+            # Later calls to `run` re-raise a copy of the error instead of launching.
+            self._run = functools.partial(_raise_error, cloned_err)
+            raise err
+
+        device = driver.active.get_current_device()
+        # create launcher
+        self._run = driver.active.launcher_cls(self.src, self.metadata)
+        # not enough shared memory to run the kernel
+        max_shared = max_shared_mem(device)
+        if self.metadata.shared > max_shared:
+            raise_(OutOfResources(self.metadata.shared, max_shared, "shared memory"))
+        if hasattr(self.metadata, "tmem_size") and self.metadata.tmem_size is not None:
+            # Use blackwell max tmem size for now, this should be moved in device properties
+            max_tmem_size = 512  # tmem size in number of columns
+            if self.metadata.tmem_size > max_tmem_size:
+                raise_(OutOfResources(self.metadata.tmem_size, max_tmem_size, "tensor memory"))
+        # NOTE(review): the start hook runs before load_binary, so `self.module` and
+        # `self.function` are still None when passed here.
+        if knobs.runtime.kernel_load_start_hook is not None:
+            knobs.runtime.kernel_load_start_hook(self.module, self.function, self.name, self.metadata_group, self.hash)
+        # TODO: n_regs, n_spills should be metadata generated when calling `ptxas`
+        self.module, self.function, self.n_regs, self.n_spills, self.n_max_threads = driver.active.utils.load_binary(
+            self.name, self.kernel, self.metadata.shared, device)
+        warp_size = driver.active.get_current_target().warp_size
+        if self.metadata.num_warps * warp_size > self.n_max_threads:
+            raise_(OutOfResources(self.metadata.num_warps * warp_size, self.n_max_threads, "threads"))
+        if knobs.runtime.kernel_load_end_hook is not None:
+            knobs.runtime.kernel_load_end_hook(self.module, self.function, self.name, self.metadata_group, self.hash)
+
+    @property
+    def run(self):
+        """Return the launcher callable, initializing the handles on first use."""
+        if self._run is None:
+            self._init_handles()
+        return self._run
+
+    def launch_metadata(self, grid, stream, *args):
+        """Build the lazily-evaluated metadata passed to the launch hooks.
+
+        :return: ``None`` when no launch-enter hook is installed; otherwise a ``LazyDict``
+            holding the kernel name, function and stream, extended by the source function's
+            ``launch_metadata`` callback when the source is an ``ASTSource`` that defines one
+        """
+        if knobs.runtime.launch_enter_hook is None:
+            return None
+        self._init_handles()
+        ret = LazyDict({"name": self.name, "function": self.function, "stream": stream})
+        if not isinstance(self.src, ASTSource) or self.src.fn.launch_metadata is None:
+            return ret
+        # Pair positional arguments with the function's argument names.
+        arg_dict = {name: arg for name, arg in zip(self.src.fn.arg_names, args)}
+        ret.add(self.src.fn.launch_metadata, (grid, self.metadata, arg_dict))
+        return ret
+
+    def __getitem__(self, grid):
+        """Return a callable that launches the kernel over ``grid``.
+
+        ``grid`` is indexed at positions 0, 1 and 2. The returned callable takes the kernel
+        arguments positionally plus an optional ``stream`` keyword; when ``stream`` is None
+        the current stream of the current device is used.
+        """
+        self._init_handles()
+
+        def runner(*args, stream=None):
+            if stream is None:
+                device = driver.active.get_current_device()
+                stream = driver.active.get_current_stream(device)
+            launch_metadata = self.launch_metadata(grid, stream, *args)
+            self.run(grid[0], grid[1], grid[2], stream, self.function, self.packed_metadata, launch_metadata,
+                     knobs.runtime.launch_enter_hook, knobs.runtime.launch_exit_hook, *args)
+
+        return runner
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/errors.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/errors.py
new file mode 100644
index 0000000000000000000000000000000000000000..39e6c4dfb04dd2067d50ce7c79f762c5e7e2d5b8
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/errors.py
@@ -0,0 +1,51 @@
+import ast
+from typing import Optional
+from ..errors import TritonError
+
+
+class CompilationError(TritonError):
+    """Base class for all errors raised during compilation"""
+    source_line_count_max_in_message = 12
+
+    def _format_message(self) -> str:
+        node = self.node
+        if self.src is None:
+            source_excerpt = " <source unavailable>"
+        else:
+            if hasattr(node, 'lineno'):
+                source_excerpt = self.src.split('\n')[:node.lineno][-self.source_line_count_max_in_message:]
+                if source_excerpt:
+                    source_excerpt.append(' ' * node.col_offset + '^')
+                    source_excerpt = '\n'.join(source_excerpt)
+                else:
+                    source_excerpt = " <source empty>"
+            else:
+                source_excerpt = self.src
+
+        message = "at {}:{}:\n{}".format(node.lineno, node.col_offset, source_excerpt) if hasattr(
+            node, 'lineno') else source_excerpt
+        if self.error_message:
+            message += '\n' + self.error_message
+        return message
+
+    def __init__(self, src: Optional[str], node: ast.AST, error_message: Optional[str] = None):
+        self.src = src
+        self.node = node
+        self.error_message = error_message
+        self.message = self._format_message()
+
+    def __str__(self):
+        return self.message
+
+    def __reduce__(self):
+        # this is necessary to make CompilationError picklable
+        return type(self), (self.src, self.node, self.error_message)
+
+
+class CompileTimeAssertionFailure(CompilationError):
+    """Specific exception for failed tests in `static_assert` invocations.
+
+    Adds no behavior beyond ``CompilationError``.
+    """
+    pass
+
+
+class UnsupportedLanguageConstruct(CompilationError):
+    """``CompilationError`` subclass that adds no behavior of its own.
+
+    NOTE(review): presumably raised when kernel source uses a construct the compiler does
+    not support -- confirm against the raise sites.
+    """
+    pass
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/make_launcher.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/compiler/make_launcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da62b098f4ea54dd9e72c52ef0693b9f6ed2b852
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e286a20f2d132003bc6f9415e18b2c3c7cd401b
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/__init__.py
@@ -0,0 +1,6 @@
+from . import nvidia
+from . import amd
+from ._runtime import constexpr_function, jit
+from triton.language.core import must_use_result
+
+__all__ = ["constexpr_function", "jit", "must_use_result", "nvidia", "amd"]
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e293990d14725577e90449ec9ca1b8a50800f2a
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/__pycache__/_compiler.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/__pycache__/_compiler.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..788fff582d7e1f2257c9316dc17235809b934156
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/__pycache__/_compiler.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/__pycache__/_runtime.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/__pycache__/_runtime.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be07b7a091711e9206ddb0a9f7dcf5c59af23eb2
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/__pycache__/_runtime.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/_compiler.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/_compiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/_runtime.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/_runtime.py
new file mode 100644
index 0000000000000000000000000000000000000000..d98bb2098b1385e0d338b5dffec62884d3d9a5d9
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/_runtime.py
@@ -0,0 +1,102 @@
+from __future__ import annotations
+from triton.compiler.compiler import ASTSource
+from triton.backends.compiler import Language
+from triton.runtime.jit import JITFunction, constexpr_function
+from typing import TypeVar, Optional, Callable, Iterable, Union
+from triton._C.libtriton import ir
+
+# Type variable used to parameterize JITFunction / GluonJITFunction in the annotations below.
+T = TypeVar("T")
+
+# Public names of this module.
+__all__ = ["constexpr_function", "jit"]
+
+
+class GluonASTSource(ASTSource):
+    """``ASTSource`` variant for Gluon: language ``Language.GLUON`` and extension ``"ttgir"``."""
+
+    def __init__(self, fn, signature, constexprs=None, attrs=None) -> None:
+        """Initialize as an ``ASTSource``, then override ``language`` and ``ext``."""
+        super().__init__(fn, signature, constexprs, attrs)
+        self.language = Language.GLUON
+        self.ext = "ttgir"
+
+    def make_ir(self, target, options, codegen_fns, module_map, context):
+        """Create a module carrying the ``ttg.*`` attributes and generate the function into it.
+
+        :param target: target used to select the backend
+        :param options: options object providing ``num_warps``, ``num_ctas``, ``warp_size``,
+            ``backend_name`` and ``maxnreg``
+        :param codegen_fns: forwarded to ``ast_to_ttir``
+        :param module_map: forwarded to ``ast_to_ttir``
+        :param context: IR context used to create the builder
+        :return: the module returned by ``ast_to_ttir``
+        """
+        # Imported here rather than at module level.
+        from triton.compiler.compiler import make_backend
+        from triton.compiler.code_generator import ast_to_ttir
+
+        builder = ir.builder(context)
+        module = builder.create_module()
+
+        # Assign module attributes eagerly, as they are needed to verify layouts
+        backend = make_backend(target)
+        # From here on `target` is the backend's target name, not the object passed in.
+        target = backend.get_target_name(options)
+
+        module.set_attr("ttg.target", builder.get_string_attr(target))
+        module.set_attr("ttg.num-warps", builder.get_int32_attr(options.num_warps))
+        module.set_attr("ttg.num-ctas", builder.get_int32_attr(options.num_ctas))
+        module.set_attr("ttg.threads-per-warp", builder.get_int32_attr(options.warp_size))
+
+        # The register cap attribute is only set for the CUDA backend, and only when given.
+        is_cuda = options.backend_name == "cuda"
+        if is_cuda and options.maxnreg is not None:
+            module.set_attr("ttg.maxnreg", builder.get_int32_attr(options.maxnreg))
+
+        # Code generation fills the pre-populated module passed via `module=`.
+        module = ast_to_ttir(self.fn, self, context=context, options=options, codegen_fns=codegen_fns,
+                             module_map=module_map, module=module)
+        return module
+
+
+class GluonJITFunction(JITFunction[T]):
+    """``JITFunction`` subclass that compiles through ``GluonASTSource``."""
+
+    def create_binder(self):
+        """Run the base ``create_binder``, then set ``self.ASTSource`` to ``GluonASTSource``."""
+        result = super().create_binder()
+        # Assigned after the base call, so this value replaces whatever the base call left there.
+        self.ASTSource = GluonASTSource
+        return result
+
+    def is_gluon(self):
+        """Return True: this function is a Gluon function."""
+        return True
+
+
+def jit(
+    fn: Optional[T] = None,
+    *,
+    version=None,
+    repr: Optional[Callable] = None,
+    launch_metadata: Optional[Callable] = None,
+    do_not_specialize: Optional[Iterable[int | str]] = None,
+    do_not_specialize_on_alignment: Optional[Iterable[int | str]] = None,
+    debug: Optional[bool] = None,
+    noinline: Optional[bool] = None,
+) -> Union[GluonJITFunction[T], Callable[[T], JITFunction[T]]]:
+    """
+    Decorator for JIT-compiling a function using the Triton compiler.
+
+    :note: When a jit'd function is called, arguments are
+        implicitly converted to pointers if they have a :code:`.data_ptr()` method
+        and a `.dtype` attribute.
+
+    :note: This function will be compiled and run on the GPU. It will only have access to:
+
+           * python primitives,
+           * builtins within the triton package,
+           * arguments to this function,
+           * other jit'd functions
+
+    :param fn: the function to be jit-compiled
+    :type fn: Callable
+    """
+
+    def decorator(fn: T) -> JITFunction[T]:
+        assert callable(fn)
+        return GluonJITFunction(
+            fn,
+            version=version,
+            do_not_specialize=do_not_specialize,
+            do_not_specialize_on_alignment=do_not_specialize_on_alignment,
+            debug=debug,
+            noinline=noinline,
+            repr=repr,
+            launch_metadata=launch_metadata,
+        )
+
+    if fn is not None:
+        return decorator(fn)
+
+    else:
+        return decorator
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/amd/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/amd/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3271153da6f7f0d01ebac221ac934e7f99c0e9ff
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/amd/__init__.py
@@ -0,0 +1,3 @@
+from . import gfx1250
+
+__all__ = ["gfx1250"]
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/amd/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/amd/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f6b1c57ea36bf3bf89eb728259b8f882d61a2ad3
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/amd/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/amd/__pycache__/gfx1250.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/amd/__pycache__/gfx1250.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e2d661c3d28a4809f634c5b1cf8f0ea26bb0203
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/amd/__pycache__/gfx1250.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/amd/gfx1250.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/amd/gfx1250.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cab725920b0f47ad05739e4e1be08bbb20c91b0
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/amd/gfx1250.py
@@ -0,0 +1,46 @@
+from dataclasses import dataclass
+from typing import List, Any
+from triton._utils import validate_block_shape
+from triton.experimental.gluon.language._layouts import PaddedSharedLayout, SwizzledSharedLayout
+
+__all__ = ["TensorDescriptor"]
+
+
+@dataclass
+class TensorDescriptor:
+    """Descriptor of a 2D tensor plus its block shape and shared-memory layout; validated on construction."""
+    base: Any
+    shape: List[int]
+    strides: List[int]
+    block_shape: List[int]
+    layout: PaddedSharedLayout | SwizzledSharedLayout
+    padding: str = "zero"
+
+    def __post_init__(self):
+        ndim = len(self.shape)
+        # TODO: support 1D-5D tensor descriptors
+        assert ndim == 2, f"Expected 2 dimensions but got {ndim} dimensions"
+        assert len(self.strides) == ndim, f"Expected {ndim} strides but got {len(self.strides)}"
+        assert len(self.block_shape) == ndim, \
+            f"Expected block_shape to have {ndim} dimensions but got {len(self.block_shape)}"
+        validate_block_shape(self.block_shape)
+        assert self.strides[-1] == 1, "Last dimension must be contiguous"
+        assert isinstance(self.layout, (PaddedSharedLayout, SwizzledSharedLayout)), \
+            "Expected layout to be a PaddedSharedLayout or SwizzledSharedLayout"
+        if isinstance(self.layout, SwizzledSharedLayout):
+            assert self.layout.max_phase == 1, "Expected max_phase to be 1 for SwizzledSharedLayout"
+        assert self.padding == "zero", "Only 'zero' padding is supported"
+
+    @staticmethod
+    def from_tensor(tensor: Any, block_shape: List[int], layout: PaddedSharedLayout | SwizzledSharedLayout):
+        """ Create a TensorDescriptor object from a tensor.
+
+        Args:
+            tensor (torch.Tensor): The input tensor; its `shape` and `stride()` populate the descriptor.
+            block_shape (List[int]): The block shape of the tensor.
+            layout (PaddedSharedLayout | SwizzledSharedLayout): The layout of the tensor in shared memory.
+
+        Returns:
+            tensor_descriptor: the created TensorDescriptor object
+        """
+        return TensorDescriptor(tensor, tensor.shape, tensor.stride(), block_shape, layout)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2842cc0f35f4ba7c5d4bff3a084ae7cd6c1f1fd
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/__init__.py
@@ -0,0 +1,137 @@
+from ._core import (
+    base_value,
+    base_type,
+    block_type,
+    broadcast,
+    cast,
+    constexpr,
+    dtype,
+    void,
+    int1,
+    int8,
+    int16,
+    int32,
+    int64,
+    uint8,
+    uint16,
+    uint32,
+    uint64,
+    float8e5,
+    float8e5b16,
+    float8e4nv,
+    float8e4b8,
+    float8e4b15,
+    float16,
+    bfloat16,
+    float32,
+    float64,
+    pointer_type,
+    shared_memory_descriptor,
+    tensor,
+    tuple,
+    tuple_type,
+    _unwrap_if_constexpr,
+    # API Functions
+    add,
+    allocate_shared_memory,
+    arange,
+    associative_scan,
+    assume,
+    atomic_add,
+    atomic_and,
+    atomic_cas,
+    atomic_max,
+    atomic_min,
+    atomic_or,
+    atomic_xchg,
+    atomic_xor,
+    bank_conflicts,
+    convert_layout,
+    device_assert,
+    device_print,
+    dot_fma,
+    expand_dims,
+    full,
+    fp4_to_fp,
+    gather,
+    num_warps,
+    num_ctas,
+    histogram,
+    inline_asm_elementwise,
+    join,
+    load,
+    map_elementwise,
+    max_constancy,
+    max_contiguous,
+    maximum,
+    minimum,
+    mul,
+    multiple_of,
+    num_programs,
+    permute,
+    program_id,
+    reduce,
+    reshape,
+    distributed_type,
+    shared_memory_descriptor_type,
+    set_auto_layout,
+    split,
+    static_assert,
+    static_print,
+    static_range,
+    store,
+    sub,
+    thread_barrier,
+    to_linear_layout,
+    to_tensor,
+    warp_specialize,
+    where,
+)
+from ._layouts import (
+    AutoLayout,
+    BlockedLayout,
+    SliceLayout,
+    DistributedLinearLayout,
+    DotOperandLayout,
+    NVMMADistributedLayout,
+    NVMMASharedLayout,
+    SwizzledSharedLayout,
+    PaddedSharedLayout,
+    SharedLinearLayout,
+    CoalescedLayout,
+)
+from ._math import (
+    umulhi,
+    exp,
+    exp2,
+    fma,
+    log,
+    log2,
+    cos,
+    rsqrt,
+    sin,
+    sqrt,
+    sqrt_rn,
+    abs,
+    fdiv,
+    div_rn,
+    erf,
+    floor,
+    ceil,
+)
+from ._standard import (
+    cdiv,
+    full_like,
+    max,
+    min,
+    ravel,
+    reduce_or,
+    sum,
+    xor_sum,
+    zeros,
+    zeros_like,
+)
+
+from . import nvidia
+from . import amd
+from . import extra
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d27d409a471401572a3041e6561071f4b2c96aaa
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/__pycache__/_core.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/__pycache__/_core.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9c35c62d10d3f080461f50b1989560642eb48584
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/__pycache__/_core.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/__pycache__/_layouts.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/__pycache__/_layouts.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..44e5c0529faf29c15f5a6ba75cf0fb764195c933
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/__pycache__/_layouts.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/__pycache__/_math.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/__pycache__/_math.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66639786fa5d23d61b94f64086d59b70c9bf0e0c
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/__pycache__/_math.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/__pycache__/_semantic.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/__pycache__/_semantic.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..17c495a0217d5f885d2a13a4cd14762b7bbc0cd1
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/__pycache__/_semantic.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/__pycache__/_standard.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/__pycache__/_standard.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..72041eb3735d5cdc78bb14b7caf6d3cc95d8fee6
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/__pycache__/_standard.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/_core.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/_core.py
new file mode 100644
index 0000000000000000000000000000000000000000..a00f87fe58491227a1bb657e9dbf9743ef6b76e2
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/_core.py
@@ -0,0 +1,592 @@
+from __future__ import annotations
+import math
+from typing import TypeVar, List, TYPE_CHECKING, Tuple
+from functools import wraps
+import warnings
+
+if TYPE_CHECKING:
+    from triton._C.libtriton.gluon_ir import GluonOpBuilder
+    from ._semantic import GluonSemantic
+
+from ._layouts import SharedLayout, DistributedLayout, BlockedLayout, DotOperandLayout, AutoLayout, CoalescedLayout
+from triton._C.libtriton import ir
+import triton.language.core as tl_core
+from triton.language.core import (
+    constexpr,
+    base_value,
+    base_type,
+    dtype,
+    block_type,  # TODO: block type with layout info
+    pointer_type,
+    void,
+    int1,
+    int8,
+    int16,
+    int32,
+    int64,
+    uint8,
+    uint16,
+    uint32,
+    uint64,
+    float8e5,
+    float8e5b16,
+    float8e4nv,
+    float8e4b8,
+    float8e4b15,
+    float16,
+    bfloat16,
+    float32,
+    float64,
+    _unwrap_if_constexpr,
+    _unwrap_shape,
+    static_range,
+    tensor,
+    tuple,
+    tuple_type,
+)
+
+# __all__ is defined only to appease the Python linter: these names are not used in this file,
+# but they are imported here so that they can be re-exported from this module.
+__all__ = [
+    "constexpr",
+    "pointer_type",
+    "void",
+    "int1",
+    "int8",
+    "int16",
+    "int32",
+    "int64",
+    "uint8",
+    "uint16",
+    "uint32",
+    "uint64",
+    "float8e5",
+    "float8e5b16",
+    "float8e4nv",
+    "float8e4b8",
+    "float8e4b15",
+    "float16",
+    "bfloat16",
+    "float32",
+    "float64",
+    "distributed_type",
+    "shared_memory_descriptor_type",
+    "static_range",
+    "tuple",
+    "tuple_type",
+    "num_ctas",
+]
+
+T = TypeVar("T")
+
+# TODO: split these
+GLUON_BUILTIN = "__triton_builtin__"
+
+
+def builtin(fn: T) -> T:
+    """Wrap `fn` so it must be called with a non-None `_semantic` keyword, and tag it as a Gluon builtin."""
+    assert callable(fn)
+
+    @wraps(fn)
+    def checked(*args, **kwargs):
+        # A missing `_semantic` key and an explicit None are rejected the same way.
+        if kwargs.get("_semantic") is None:
+            raise ValueError("Did you forget to add @triton.gluon.jit ? "
+                             "(`_semantic` argument must be provided outside of JIT functions.)")
+        return fn(*args, **kwargs)
+
+    setattr(checked, GLUON_BUILTIN, True)
+    return checked
+
+
+# Forward core Triton language ops through `builtin`, one explicit assignment per name so mypy sees them.
+add = builtin(tl_core.add)
+associative_scan = builtin(tl_core.associative_scan)
+assume = builtin(tl_core.assume)
+atomic_add = builtin(tl_core.atomic_add)
+atomic_and = builtin(tl_core.atomic_and)
+atomic_cas = builtin(tl_core.atomic_cas)
+atomic_max = builtin(tl_core.atomic_max)
+atomic_min = builtin(tl_core.atomic_min)
+atomic_or = builtin(tl_core.atomic_or)
+atomic_xchg = builtin(tl_core.atomic_xchg)
+atomic_xor = builtin(tl_core.atomic_xor)
+broadcast = builtin(tl_core.broadcast)
+cast = builtin(tl_core.cast)
+device_assert = builtin(tl_core.device_assert)
+device_print = builtin(tl_core.device_print)
+expand_dims = builtin(tl_core.expand_dims)
+gather = builtin(tl_core.gather)
+inline_asm_elementwise = builtin(tl_core.inline_asm_elementwise)
+join = builtin(tl_core.join)
+load = builtin(tl_core.load)
+map_elementwise = builtin(tl_core.map_elementwise)
+max_constancy = builtin(tl_core.max_constancy)
+max_contiguous = builtin(tl_core.max_contiguous)
+maximum = builtin(tl_core.maximum)
+minimum = builtin(tl_core.minimum)
+mul = builtin(tl_core.mul)
+multiple_of = builtin(tl_core.multiple_of)
+num_programs = builtin(tl_core.num_programs)
+permute = builtin(tl_core.permute)
+program_id = builtin(tl_core.program_id)
+reduce = builtin(tl_core.reduce)
+reshape = builtin(tl_core.reshape)
+split = builtin(tl_core.split)
+static_assert = builtin(tl_core.static_assert)
+static_print = builtin(tl_core.static_print)
+store = builtin(tl_core.store)
+sub = builtin(tl_core.sub)
+to_tensor = builtin(tl_core.to_tensor)
+where = builtin(tl_core.where)
+
+
+class distributed_type(block_type):
+    """Block type that also records the distributed layout of the tensor."""
+
+    def __init__(self, element_ty: dtype, shape: List[int], layout):
+        layout, shape = _unwrap_if_constexpr(layout), _unwrap_if_constexpr(shape)
+        super().__init__(element_ty, shape)
+        self.layout = layout
+        self.name = f"<{self.shape}, {self.element_ty}, {self.layout}>"
+        assert isinstance(layout, DistributedLayout), "tensor layout must be a DistributedLayout"
+        # AutoLayout and CoalescedLayout are exempt from the rank check below.
+        if not isinstance(layout, (AutoLayout, CoalescedLayout)):
+            assert len(shape) == layout.rank, \
+                f"tensor shape and layout rank mismatch: shape={shape}, layout={layout}, shape rank={len(shape)}, layout rank={layout.rank}"
+
+    def to_ir(self, builder: ir.builder) -> ir.type:
+        # Lower the element type and the layout first, then ask the builder for the combined type.
+        elem_ir = self.element_ty.to_ir(builder)
+        layout_ir = self.layout._to_ir(builder)
+        return builder.get_distributed_ty(elem_ir, self.shape, layout_ir)
+
+    def mangle(self) -> str:
+        """Return `<elt>S<dims joined by '_'>SL<layout>L`."""
+        parts = (self.scalar.mangle(), "_".join(str(dim) for dim in self.shape), self.layout.mangle())
+        return "{}S{}SL{}L".format(*parts)
+
+    def with_element_ty(self, scalar_ty: dtype) -> block_type:
+        return distributed_type(scalar_ty, self.shape, self.layout)
+
+    def __eq__(self, other) -> bool:
+        # Anything that is not a distributed_type is unequal; otherwise defer to block_type and compare layouts.
+        same_kind = isinstance(other, distributed_type)
+        return same_kind and super().__eq__(other) and self.layout == other.layout
+
+
+class shared_memory_descriptor_type(base_type):
+    """Type of a shared memory descriptor: element type, shape, shared layout and allocation shape."""
+
+    def __init__(self, element_ty, shape, layout, alloc_shape):
+        shape = _unwrap_if_constexpr(shape)
+        alloc_shape = _unwrap_if_constexpr(alloc_shape)
+        layout = _unwrap_if_constexpr(layout)
+        self.element_ty = element_ty
+        self.shape = shape
+        self.layout = layout
+        self.alloc_shape = alloc_shape
+        assert isinstance(layout, SharedLayout)
+
+    def to_ir(self, builder: GluonOpBuilder) -> ir.type:  # returns the IR type it builds (was annotated `-> None`)
+        elem_ir, layout_ir = self.element_ty.to_ir(builder), self.layout._to_ir(builder)
+        return builder.get_shared_mem_desc_ty(elem_ir, self.shape, layout_ir, self.alloc_shape)
+
+    def _unflatten_ir(self, handles: List[ir.Value], cursor: int) -> Tuple[shared_memory_descriptor, int]:
+        value = shared_memory_descriptor(handles[cursor], self.element_ty, self.shape, self.layout, self.alloc_shape)
+        return value, cursor + 1
+
+    def _flatten_ir_types(self, builder: GluonOpBuilder, out: List[ir.type]) -> None:
+        out.append(self.to_ir(builder))
+
+    def __str__(self) -> str:
+        return f"shared_memory_descriptor<{self.element_ty}, {self.shape}, {self.layout}, {self.alloc_shape}>"
+
+    def __eq__(self, other) -> bool:
+        # NOTE(review): element_ty is not part of the comparison — confirm this is intentional.
+        return (type(self) is type(other) and self.shape == other.shape and self.layout == other.layout
+                and self.alloc_shape == other.alloc_shape)
+
+    def __ne__(self, other) -> bool:
+        return not (self == other)
+
+    __neq__ = __ne__  # alias kept for callers of the old, misspelled method name
+
+    def mangle(self) -> str:
+        shape_str = "_".join([str(s) for s in self.shape])
+        return f"MD{self.element_ty.mangle()}S{shape_str}SL{self.layout.mangle()}LAS{self.alloc_shape}ASMD"
+
+
+class shared_memory_descriptor(base_value):
+    """
+    Represents a handle to a shared memory allocation in Gluon IR; dtype, shape and layout live in `self.type`.
+    """
+
+    def __init__(self, handle, element_ty, shape, layout, alloc_shape):
+        self.handle = handle
+        self.type = shared_memory_descriptor_type(element_ty, shape, layout, alloc_shape)  # asserts a SharedLayout
+
+    def _flatten_ir(self, handles: List[ir.value]) -> None:
+        handles.append(self.handle)  # a descriptor contributes exactly one IR handle
+
+    @property
+    def dtype(self):
+        return self.type.element_ty
+
+    @property
+    def shape(self):
+        return self.type.shape
+
+    @property
+    def rank(self) -> int:
+        return len(self.shape)  # number of dimensions
+
+    @property
+    def numel(self) -> int:
+        return math.prod(self.shape)  # total element count
+
+    @property
+    def layout(self):
+        return self.type.layout
+
+    def __str__(self) -> str:
+        return str(self.type)
+
+    @builtin
+    def load(self, layout, _semantic: GluonSemantic = None) -> tensor:
+        """
+        Load a tensor from shared memory.
+
+        Args:
+            layout (DistributedLayout): The destination layout of the tensor; may be constexpr-wrapped.
+
+        Returns:
+            tensor: A Gluon tensor containing the loaded data.
+        """
+        layout = _unwrap_if_constexpr(layout)
+        return _semantic.shared_load(self, layout)
+
+    @builtin
+    def store(self, value, _semantic: GluonSemantic = None) -> None:
+        """
+        Store a tensor into shared memory.
+
+        Args:
+            value (tensor): The tensor whose contents to store.
+        """
+        return _semantic.shared_store(self, value)
+
+    @builtin
+    def slice(self, start, length, dim=0, _semantic: GluonSemantic = None) -> shared_memory_descriptor:
+        """
+        Create a subview of shared memory by slicing along a given dimension.
+
+        Args:
+            start (int): The starting index of the slice.
+            length (int): The length of the slice.
+            dim (int): The dimension to slice (default: 0).
+
+        Returns:
+            shared_memory_descriptor: Descriptor for the sliced subview.
+        """
+        start = _unwrap_if_constexpr(start)
+        length = _unwrap_if_constexpr(length)
+        dim = _unwrap_if_constexpr(dim)
+        return _semantic.memdesc_slice(self, start, length, dim)
+
+    @builtin
+    def index(self, index, _semantic: GluonSemantic = None) -> shared_memory_descriptor:
+        """
+        Create a subview of shared memory by indexing along the first dimension.
+
+        Args:
+            index (int): The index at which to take the subview.
+
+        Returns:
+            shared_memory_descriptor: Descriptor for the indexed subview.
+        """
+        index = _unwrap_if_constexpr(index)
+        return _semantic.memdesc_index(self, index)
+
+    @builtin
+    def permute(self, order, _semantic: GluonSemantic = None) -> shared_memory_descriptor:
+        """
+        Permute the dimensions of the shared memory descriptor.
+
+        Args:
+            order (List[int]): The new ordering of dimensions; each entry may be constexpr-wrapped.
+
+        Returns:
+            shared_memory_descriptor: Descriptor with permuted dimensions.
+        """
+        order = [_unwrap_if_constexpr(o) for o in order]
+        return _semantic.memdesc_trans(self, order)
+
+    @builtin
+    def reshape(self, shape, _semantic: GluonSemantic = None) -> shared_memory_descriptor:
+        """
+        Reshape the shared memory descriptor to a new shape and layout.
+
+        Args:
+            shape (List[int]): The target shape; each entry may be constexpr-wrapped.
+
+        Returns:
+            shared_memory_descriptor: Descriptor with the new shape and layout.
+        """
+        shape = [_unwrap_if_constexpr(s) for s in shape]
+
+        return _semantic.memdesc_reshape(self, shape)
+
+    @builtin
+    def _reinterpret(self, dtype, shape, layout, _semantic: GluonSemantic = None) -> shared_memory_descriptor:
+        """
+        Reinterpret the shared memory descriptor as a different dtype, shape, or layout.
+
+        Args:
+            dtype (dtype): The new data type.
+            shape (List[int]): The new shape.
+            layout (SharedLayout): The new layout.
+
+        Returns:
+            shared_memory_descriptor: Descriptor with updated type and layout.
+        """
+        dtype = _unwrap_if_constexpr(dtype)
+        shape = [_unwrap_if_constexpr(s) for s in shape]
+        layout = _unwrap_if_constexpr(layout)
+
+        return _semantic.memdesc_reinterpret(self, dtype, shape, layout)
+
+    @builtin
+    def _keep_alive(self, _semantic: GluonSemantic = None) -> None:
+        """
+        Dummy use to keep the shared memory descriptor alive (forwards to `_semantic.shared_dealloc`).
+        """
+        return _semantic.shared_dealloc(self)
+
+
+@builtin
+def arange(start, end, layout=None, _semantic=None):
+    """
+    Build a 1D tensor holding the consecutive values in [start, end) with the requested layout.
+
+    Constexpr-wrapped arguments are unwrapped before being handed to `_semantic.arange`.
+
+    Args:
+        start (int): First value of the sequence (inclusive).
+        end (int): End of the sequence (exclusive).
+        layout (DistributedLayout): The layout of the output tensor. Defaults to AutoLayout.
+
+    Returns:
+        tensor: A 1D tensor containing sequential values.
+    """
+    start, end, layout = map(_unwrap_if_constexpr, (start, end, layout))
+    return _semantic.arange(start, end, layout)
+
+
+@builtin
+def convert_layout(value, layout, assert_trivial=False, _semantic=None):
+    """
+    Return `value` converted to a different distributed layout.
+
+    Args:
+        value (tensor): The input tensor.
+        layout (DistributedLayout): The target layout; unwrapped from constexpr first.
+        assert_trivial (bool): If True, asserts that the conversion is trivial (no data movement).
+
+    Returns:
+        tensor: The tensor with the new layout.
+    """
+    target_layout = _unwrap_if_constexpr(layout)
+    return _semantic.convert_layout(value, target_layout, assert_trivial)
+
+
+@builtin
+def full(shape, value, dtype, layout=None, _semantic=None):
+    """
+    Build a tensor of the given shape, dtype and layout in which every element is `value`.
+
+    `shape` goes through `_unwrap_shape`; the other arguments are unwrapped from constexpr.
+
+    Args:
+        shape (Sequence[int]): The shape of the tensor.
+        value (int or float): The fill value.
+        dtype (dtype): The data type for the tensor.
+        layout (Optional[DistributedLayout]): The layout of the output tensor, defaults to AutoLayout().
+
+    Returns:
+        tensor: A tensor where every element equals value.
+    """
+    shape = _unwrap_shape(shape)
+    value, dtype, layout = map(_unwrap_if_constexpr, (value, dtype, layout))
+    return _semantic.full(shape, value, dtype, layout)
+
+
+@builtin
+def histogram(input, num_bins, mask=None, layout=None, _semantic=None, _generator=None):
+    """
+    Compute a histogram of a 1D integer tensor.
+
+    A non-None `mask` is converted with `_semantic.to_tensor` first; `_generator` is accepted but unused here.
+
+    Args:
+        input (tensor): 1D tensor of integer values.
+        num_bins (int): Number of bins. Bins have width 1 and start at 0.
+        mask (Optional[tensor]): Boolean mask to exclude elements when False.
+        layout (DistributedLayout): Destination layout of the output histogram.
+
+    Returns:
+        tensor: 1D int32 tensor of length `num_bins` with the requested layout.
+    """
+    num_bins, layout = map(_unwrap_if_constexpr, (num_bins, layout))
+    mask_tensor = None if mask is None else _semantic.to_tensor(mask)
+    return _semantic.histogram(input, num_bins, mask_tensor, layout)
+
+
+@builtin
+def allocate_shared_memory(element_ty, shape, layout, value=None, _semantic=None) -> shared_memory_descriptor:
+    """
+    Allocate shared memory for a tensor with the given element type, shape, and layout.
+    Both `shape` itself and each of its entries may be constexpr-wrapped.
+
+    Args:
+        element_ty (dtype): The element data type.
+        shape (Sequence[int]): The dimensions of the shared memory.
+        layout (SharedLayout): The shared memory layout.
+        value (tensor, optional): Initial value to copy into shared memory.
+
+    Returns:
+        shared_memory_descriptor: Descriptor for the allocated memory.
+    """
+    element_ty = _unwrap_if_constexpr(element_ty)
+    dims = [_unwrap_if_constexpr(dim) for dim in _unwrap_if_constexpr(shape)]
+    layout = _unwrap_if_constexpr(layout)
+    return _semantic.allocate_shared(element_ty, dims, layout, value)
+
+
+@builtin
+def set_auto_layout(value, layout, _semantic=None):
+    """
+    Give a tensor that currently has AutoLayout a concrete layout.
+
+    Args:
+        value (tensor): The input tensor.
+        layout (DistributedLayout): The target layout; unwrapped from constexpr first.
+
+    Returns:
+        tensor: The tensor with the new layout.
+    """
+    concrete_layout = _unwrap_if_constexpr(layout)
+    return _semantic.set_auto_layout(value, concrete_layout)
+
+
+@builtin
+def fp4_to_fp(src, elem_type, axis, _semantic=None):
+    """
+    Upcast a tensor from fp4 (e2m1) to another floating point type.
+    `axis` and `elem_type` are unwrapped from constexpr and forwarded to `_semantic.fp4_to_fp`.
+    """
+    axis, elem_type = map(_unwrap_if_constexpr, (axis, elem_type))
+    return _semantic.fp4_to_fp(src, elem_type, axis)
+
+
+@builtin
+def warp_specialize(functions_and_args, worker_num_warps, worker_num_regs, _semantic=None, _generator=None):
+    """
+    Create a warp-specialized execution region, partitioning work across warps.
+
+    This forks the current execution into a "default partition" and an arbitrary number of
+    "worker partitions". The default partition is executed in the same :code:`num_warps` warps as
+    the parent region, and may accept tensor arguments and return tensors. Worker partitions are
+    executed in additional warps, which sit idle while executing the parent region.
+
+    Note that calling warp_specialize recursively is not supported.
+
+    Args:
+        functions_and_args (List[Tuple[Callable, Any]]): Functions and arguments for each partition; the first entry is the default partition.
+        worker_num_warps (List[int]): Number of warps used for each worker partition (entries may be constexpr).
+        worker_num_regs (List[int]): Number of registers for each worker partition (entries may be constexpr).
+
+    Returns:
+        Tuple[Any, ...]: Results from the default partition.
+    """
+    warps = list(map(_unwrap_if_constexpr, worker_num_warps))
+    regs = list(map(_unwrap_if_constexpr, worker_num_regs))
+    return _semantic.warp_specialize(functions_and_args, warps, regs, _generator)
+
+
+@builtin
+def num_warps(_semantic=None, _generator=None):
+    """Return the number of warps that execute the current context, including in warp-specialized regions."""
+    # The code generator is passed through to `_semantic.num_warps`.
+    warp_count = _semantic.num_warps(_generator)
+    return warp_count
+
+
+@builtin
+def num_ctas(_semantic=None):
+    """Return the number of CTAs in the current kernel."""
+    # Thin wrapper: the value comes straight from `_semantic.num_ctas`.
+    cta_count = _semantic.num_ctas()
+    return cta_count
+
+
+@builtin
+def thread_barrier(_semantic=None):
+    """Insert a barrier to synchronize threads within a CTA."""
+    # Implemented by the semantic layer's `debug_barrier`.
+    barrier = _semantic.debug_barrier()
+    return barrier
+
+
+@builtin
+def bank_conflicts(distr_ty, shared_ty, _semantic=None) -> int:
+    """
+    Count the bank conflicts per wavefront of each instruction generated when
+    reading/writing the distributed tensor from/to the shared memory descriptor
+    using ld.shared/st.shared instructions.
+
+    We define a bank conflict of N to be the excess number of memory accesses that each
+    wavefront needs to access the shared memory descriptor. When one uses no ld/st
+    vectorization, this is equal to the number of excess memory accesses per instruction.
+    Both type arguments are unwrapped from constexpr before the query.
+
+    Args:
+        distr_ty (distributed_type): The distributed tensor.
+        shared_ty (shared_memory_descriptor_type): The shared memory descriptor.
+
+    Returns:
+        int: The number of bank conflicts.
+    """
+    distr_ty, shared_ty = map(_unwrap_if_constexpr, (distr_ty, shared_ty))
+    return _semantic.bank_conflicts(distr_ty, shared_ty)
+
+
+@builtin
+def to_linear_layout(layout, shape, _semantic=None):
+    """Unwrap constexpr `layout` and `shape`, then forward them to `_semantic.to_linear_layout`."""
+    unwrapped_layout, dims = _unwrap_if_constexpr(layout), _unwrap_shape(shape)
+    return _semantic.to_linear_layout(unwrapped_layout, dims)
+
+
+@builtin
+def dot_fma(a, b, acc, _semantic=None):
+    """
+    Emit `_semantic.dot(a, b, acc)` and return the result as a tensor of `acc`'s type. `acc` needs a
+    BlockedLayout; `a` and `b` must be DotOperandLayout operands 0 and 1 with that layout as parent.
+    """
+    for label, operand in (("a", a), ("b", b), ("acc", acc)):
+        assert isinstance(operand, tensor), f"{label} must be a tensor"
+    mma_layout = acc.type.layout
+    assert isinstance(mma_layout, BlockedLayout), "acc must have a BlockedLayout"
+    assert isinstance(a.type.layout, DotOperandLayout), "a must have a DotOperandLayout"
+    assert isinstance(b.type.layout, DotOperandLayout), "b must have a DotOperandLayout"
+    assert a.type.layout.parent == mma_layout, "a's parent layout must be the same as acc's layout"
+    assert b.type.layout.parent == mma_layout, "b's parent layout must be the same as acc's layout"
+    assert a.type.layout.operand_index == 0, "a's operand index must be 0"
+    assert b.type.layout.operand_index == 1, "b's operand index must be 1"
+    M, N = acc.shape
+    K = a.shape[1]
+    if M * N * K > 2**19:  # only warn: large problem sizes still go through
+        warnings.warn(f"Large dot FMA instruction size {M}x{N}x{K} may have slow compile times")
+    result = _semantic.dot(a, b, acc, input_precision=None, max_num_imprecise_acc=None, out_dtype=acc.dtype)
+    return tensor(result.handle, acc.type)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/_layouts.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/_layouts.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f5a2c4002986b672bb1ecb49414eebf805ad165
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/_layouts.py
@@ -0,0 +1,676 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from triton.language.core import _unwrap_if_constexpr, _unwrap_shape, constexpr_type
+from triton.runtime.jit import constexpr_function
+import math
+
+
+class DistributedLayout:
+    """Base class for distributed memory layouts in Gluon IR."""
+
+    @property
+    def type(self):
+        """The `constexpr_type` wrapping this layout instance."""
+        return constexpr_type(self)
+
+    @property
+    def rank(self):
+        """Number of dimensions; the base class always raises `NotImplementedError`."""
+        raise NotImplementedError("DistributedLayout subclasses must define rank")
+
+
+@dataclass(frozen=True)
+class AutoLayout(DistributedLayout):  # no fields; lowered with builder.get_auto_layout()
+
+    def _to_ir(self, builder):
+        return builder.get_auto_layout()
+
+    def mangle(self):
+        return "AL"  # fixed mangling tag for this layout
+
+    @property
+    def rank(self):
+        raise ValueError("AutoLayout has no rank")  # always raises: this layout carries no dimension info
+
+
+@dataclass(frozen=True)
+class CoalescedLayout(DistributedLayout):  # no fields; lowered with builder.get_coalesced_layout()
+
+    def _to_ir(self, builder):
+        return builder.get_coalesced_layout()
+
+    def mangle(self):
+        return "CL"  # fixed mangling tag for this layout
+
+    @property
+    def rank(self):
+        raise ValueError("CoalescedLayout has no rank")  # always raises: this layout carries no dimension info
+
+
+@dataclass(frozen=True)
+class BlockedLayout(DistributedLayout):
+    """
+    Represents a blocked layout, partitioning a tensor across threads, warps, and CTAs.
+
+    Args:
+        size_per_thread (List[int]): Number of elements per thread per dimension.
+        threads_per_warp (List[int]): Number of threads per warp per dimension.
+        warps_per_cta (List[int]): Number of warps per CTA per dimension.
+        order (List[int]): The ordering of dimensions for partitioning.
+        cga_layout (Optional[List[List[int]]]): Bases describing how CTAs tile each dimension; None is stored as [].
+    """
+    size_per_thread: List[int]
+    threads_per_warp: List[int]
+    warps_per_cta: List[int]
+    order: List[int]
+    cga_layout: List[List[int]] = field(default_factory=list)
+
+    def __post_init__(self):
+        super().__setattr__("size_per_thread", _unwrap_if_constexpr(self.size_per_thread))
+        super().__setattr__("threads_per_warp", _unwrap_if_constexpr(self.threads_per_warp))
+        super().__setattr__("warps_per_cta", _unwrap_if_constexpr(self.warps_per_cta))
+        super().__setattr__("order", _unwrap_if_constexpr(self.order))
+        # cga_layout gets the same unwrapping as the other fields; None (documented as Optional) becomes [].
+        super().__setattr__("cga_layout", _unwrap_if_constexpr(self.cga_layout) or [])
+        rank = len(self.size_per_thread)
+        assert len(self.threads_per_warp) == rank
+        assert len(self.warps_per_cta) == rank
+        assert len(self.order) == rank
+
+    def _to_ir(self, builder):
+        return builder.get_blocked_layout(
+            self.size_per_thread,
+            self.threads_per_warp,
+            self.warps_per_cta,
+            self.order,
+            self.cga_layout,
+        )
+
+    def mangle(self) -> str:
+
+        def stringify(x):
+            if x is None:
+                return ""
+            return "_".join(map(str, x))
+
+        size_per_thread = stringify(self.size_per_thread)
+        threads_per_warp = stringify(self.threads_per_warp)
+        warps_per_cta = stringify(self.warps_per_cta)
+        order = stringify(self.order)
+        cga_layout = "_".join("~".join(map(str, vec)) for vec in self.cga_layout) if self.cga_layout else ""
+        return f"B{size_per_thread}_{threads_per_warp}_{warps_per_cta}_{order}_{cga_layout}B"
+
+    def __hash__(self):
+        return hash((tuple(self.size_per_thread), tuple(self.threads_per_warp), tuple(self.warps_per_cta),
+                     tuple(self.order), tuple(tuple(vec) for vec in self.cga_layout)))
+
+    @property
+    def rank(self):
+        return len(self.order)
+
+
+@dataclass(frozen=True)
+class SliceLayout(DistributedLayout):
+    """
+    Layout of a distributed tensor after one dimension of its parent layout has been sliced away.
+
+    Args:
+        dim (int): Index of the parent dimension that is removed.
+        parent (DistributedLayout): Layout of the tensor before slicing.
+    """
+    dim: int
+    parent: DistributedLayout
+
+    def __post_init__(self):
+        # Frozen dataclass: fields are rewritten through the base-class setter.
+        for name in ("dim", "parent"):
+            super().__setattr__(name, _unwrap_if_constexpr(getattr(self, name)))
+
+    def _to_ir(self, builder):
+        parent_ir = self.parent._to_ir(builder)
+        return builder.get_slice_layout(self.dim, parent_ir)
+
+    def mangle(self) -> str:
+        return "SL{}_{}SL".format(self.dim, self.parent.mangle())
+
+    def __hash__(self):
+        key = (self.dim, self.parent)
+        return hash(key)
+
+    @property
+    def rank(self):
+        """One less than the parent's rank."""
+        return self.parent.rank - 1
+
+    @property
+    def cga_layout(self):
+        """Parent CGA bases with the `dim` entry dropped from each basis; [] if the parent has none."""
+        parent_bases = self.parent.cga_layout
+        if parent_bases:
+            assert 0 <= self.dim < self.parent.rank
+            return [basis[:self.dim] + basis[self.dim + 1:] for basis in parent_bases]
+        return []
+
+
+@dataclass(frozen=True)
+class DistributedLinearLayout(DistributedLayout):
+    """
+    Linear distributed layout described by explicit bases at the register, lane, warp, and block levels.
+    See: https://arxiv.org/abs/2505.23819 for reference.
+
+    Args:
+        reg_bases (List[List[int]]): Bases for register-level distribution.
+        lane_bases (List[List[int]]): Bases for lane-level distribution.
+        warp_bases (List[List[int]]): Bases for warp-level distribution.
+        block_bases (List[List[int]]): Bases for block-level distribution.
+        shape (List[int]): The tensor global shape.
+
+    Every basis must have exactly `len(shape)` entries; this is asserted at construction.
+    """
+    reg_bases: List[List[int]]
+    lane_bases: List[List[int]]
+    warp_bases: List[List[int]]
+    block_bases: List[List[int]]
+    shape: List[int]
+
+    def __post_init__(self):
+        # Frozen dataclass: normalise every field through the base-class setter.
+        for name in ("reg_bases", "lane_bases", "warp_bases", "block_bases", "shape"):
+            super().__setattr__(name, _unwrap_shape(getattr(self, name)))
+
+        # Each basis vector needs one entry per tensor dimension.
+        rank = len(self.shape)
+        for bases in (self.reg_bases, self.lane_bases, self.warp_bases, self.block_bases):
+            for basis in bases:
+                assert len(basis) == rank
+
+    def _to_ir(self, builder):
+        """Build the IR attribute via `builder.get_distributed_linear_layout`."""
+        return builder.get_distributed_linear_layout(
+            self.reg_bases,
+            self.lane_bases,
+            self.warp_bases,
+            self.block_bases,
+            self.shape,
+        )
+
+    def mangle(self):
+        parts = (self.reg_bases, self.lane_bases, self.warp_bases, self.block_bases, self.shape)
+        return "DLL" + "_".join(format(part) for part in parts) + "DLL"
+
+    def __hash__(self):
+        """Hash the bases and shape after converting them to nested tuples."""
+        levels = (self.reg_bases, self.lane_bases, self.warp_bases, self.block_bases)
+        frozen_levels = tuple(tuple(tuple(basis) for basis in bases) for bases in levels)
+        return hash(frozen_levels + (tuple(self.shape), ))
+
+    @property
+    def rank(self):
+        """Number of tensor dimensions, i.e. `len(shape)`."""
+        return len(self.shape)
+
+
+@dataclass(frozen=True)
+class DotOperandLayout(DistributedLayout):
+    """
+    Layout of one operand of a dot operation, expressed relative to a parent layout.
+
+    Args:
+        operand_index (int): 0 for LHS and 1 for RHS of the dot operation.
+        parent (DistributedLayout): The parent layout, representing the MMA.
+        k_width (int): Number of elements per 32-bits.
+    """
+    operand_index: int
+    parent: DistributedLayout
+    k_width: int
+
+    def __post_init__(self):
+        # Frozen dataclass: fields are rewritten through the base-class setter.
+        for name in ("operand_index", "parent", "k_width"):
+            super().__setattr__(name, _unwrap_if_constexpr(getattr(self, name)))
+
+    def _to_ir(self, builder):
+        parent_ir = self.parent._to_ir(builder)
+        return builder.get_dot_operand_layout(self.operand_index, parent_ir, self.k_width)
+
+    def mangle(self) -> str:
+        return "DO{}_{}_{}DO".format(self.operand_index, self.parent.mangle(), self.k_width)
+
+    def __hash__(self):
+        key = (self.operand_index, self.parent, self.k_width)
+        return hash(key)
+
+    @property
+    def rank(self):
+        """Same rank as the parent layout."""
+        return self.parent.rank
+
+    @property
+    def cga_layout(self):
+        """Parent CGA bases with the `k_dim` entry of every basis set to 0; [] if the parent has none."""
+        parent_bases = _unwrap_if_constexpr(getattr(self.parent, "cga_layout", [])) or []
+        if not parent_bases:
+            return []
+
+        rank = self.parent.rank
+        assert all(len(basis) == rank for basis in parent_bases)
+
+        # k_dim is the last dimension for operand 0 and the second-to-last otherwise.
+        k_dim = rank - 2 if self.operand_index != 0 else rank - 1
+        assert 0 <= k_dim < rank
+
+        return [[0 if i == k_dim else entry for i, entry in enumerate(basis)] for basis in parent_bases]
+
+
+@dataclass(frozen=True, eq=True)
+class NVMMADistributedLayout(DistributedLayout):
+    """
+    Represents a layout for NVIDIA MMA (tensor core) operations.
+
+    Args:
+        version (List[int]): Version identifier for the MMA instruction.
+        warps_per_cta (List[int]): Number of warps per CTA.
+        instr_shape (List[int]): Instruction shape for MMA.
+        cga_layout (Optional[List[List[int]]]): Bases describing CTA tiling; None is stored as [].
+    """
+    version: List[int]
+    warps_per_cta: List[int]
+    instr_shape: List[int]
+    cga_layout: List[List[int]] = field(default_factory=list)
+
+    def __post_init__(self):
+        super().__setattr__("version", _unwrap_if_constexpr(self.version))
+        super().__setattr__("warps_per_cta", _unwrap_if_constexpr(self.warps_per_cta))
+        super().__setattr__("instr_shape", _unwrap_if_constexpr(self.instr_shape))
+        # cga_layout gets the same unwrapping as the other fields; None (documented as Optional) becomes [].
+        super().__setattr__("cga_layout", _unwrap_if_constexpr(self.cga_layout) or [])
+
+    def _to_ir(self, builder):
+        return builder.get_mma_layout(
+            self.version,
+            self.warps_per_cta,
+            self.cga_layout,
+            self.instr_shape,
+        )
+
+    def mangle(self) -> str:
+        cga_layout = "_".join("~".join(map(str, vec)) for vec in self.cga_layout) if self.cga_layout else ""
+        return f"MMA_{self.version}_{self.warps_per_cta}_{self.instr_shape}_{cga_layout}_MMA"
+
+    def __hash__(self):
+        return hash((tuple(self.version), tuple(self.warps_per_cta), tuple(self.instr_shape),
+                     tuple(tuple(vec) for vec in self.cga_layout)))
+
+    @property
+    def rank(self):
+        return len(self.warps_per_cta)
+
+
+class SharedLayout:
+    """
+    Base class for shared memory layouts in Gluon IR; `type` wraps the instance in `constexpr_type`.
+    """
+
+    @property
+    def type(self):
+        return constexpr_type(self)
+
+
+@constexpr_function
+def _get_shape_per_cta(shape, cga_layout):
+    """Return `shape` divided, per dimension, by the CGA extent that the `cga_layout` bases imply.
+
+    The extent of a dimension is twice its largest basis entry, or 1 when every basis is zero there.
+    """
+    if not cga_layout:
+        return shape
+    rank = len(cga_layout[0])
+    assert all(len(basis) == rank for basis in cga_layout)
+    shape_per_cta = list(shape)
+    for dim in range(rank):
+        largest_stride = max(basis[dim] for basis in cga_layout)
+        # A dimension that no basis touches keeps its full size instead of being halved.
+        cga_extent = 2 * largest_stride if largest_stride > 0 else 1
+        assert shape_per_cta[dim] % cga_extent == 0, f"Shape {shape} is not divisible by CGA layout {cga_layout}"
+        shape_per_cta[dim] //= cga_extent
+    return shape_per_cta
+
+
+@dataclass(frozen=True)
+class NVMMASharedLayout(SharedLayout):
+    """
+    Represents a layout for shared memory suitable for NVIDIA MMA operations.
+
+    Args:
+        swizzle_byte_width (int): Width in bytes for swizzling.
+        element_bitwidth (int): Bitwidth of element type.
+        rank (int): Rank of the tensor.
+        transposed (bool): Whether the layout is transposed.
+        fp4_padded (bool): Whether FP4 padding is used.
+        cga_layout (Optional[List[List[int]]]): Bases describing CTA tiling.
+    """
+    swizzle_byte_width: int
+    element_bitwidth: int
+    rank: int = 2
+    transposed: bool = False
+    fp4_padded: bool = False
+    cga_layout: List[List[int]] = field(default_factory=list)
+
+    def __post_init__(self):
+        super().__setattr__("swizzle_byte_width", _unwrap_if_constexpr(self.swizzle_byte_width))
+        super().__setattr__("element_bitwidth", _unwrap_if_constexpr(self.element_bitwidth))
+        super().__setattr__("transposed", _unwrap_if_constexpr(self.transposed))
+        super().__setattr__("fp4_padded", _unwrap_if_constexpr(self.fp4_padded))
+
+        # TODO: Make rank optional and check that (rank or cga_layout)
+        # Unwrap before validating so the check compares plain Python values; None becomes [].
+        super().__setattr__("rank", _unwrap_if_constexpr(self.rank))
+        super().__setattr__("cga_layout", _unwrap_if_constexpr(self.cga_layout) or [])
+        if self.cga_layout:
+            assert len(self.cga_layout[0]) == self.rank, \
+                "NVMMASharedLayout cga_layout bases must have `rank` entries"
+
+        assert self.element_bitwidth in [8, 16, 32, 64]
+        assert self.swizzle_byte_width in [0, 32, 64, 128]
+
+    def _to_ir(self, builder):
+        return builder.get_nvmma_shared_layout(
+            self.swizzle_byte_width,
+            self.element_bitwidth,
+            self.transposed,
+            self.fp4_padded,
+            self.cga_layout,
+            self.rank,
+        )
+
+    @staticmethod
+    @constexpr_function
+    def get_default_for(block_shape, dtype, transposed=False, fp4_padded=False, cga_layout=None):
+        """Returns an NVMMASharedLayout with default swizzling for a given shape.
+
+        This picks the largest swizzle pattern compatible with the shape, which
+        allows emitting the fewest TMA or MMA messages.
+        """
+        packing_factor = 2 if fp4_padded else 1
+        shape_per_cta = block_shape if cga_layout is None else _get_shape_per_cta(block_shape, cga_layout)
+        rank = len(block_shape)
+        if transposed:
+            shape_per_cta = shape_per_cta[1:] + shape_per_cta[:1]
+        contig_dim_size = shape_per_cta[-1] * packing_factor
+        contig_dim_bytes = contig_dim_size * dtype.primitive_bitwidth // 8
+        if contig_dim_bytes >= 128 and contig_dim_bytes % 128 == 0:
+            swizzle_byte_width = 128
+        elif contig_dim_bytes >= 64 and contig_dim_bytes % 64 == 0:
+            swizzle_byte_width = 64
+        elif contig_dim_bytes >= 32 and contig_dim_bytes % 32 == 0:
+            swizzle_byte_width = 32
+        else:
+            swizzle_byte_width = 0
+
+        flatten_outer_dim = 1
+        for size in shape_per_cta[:-1]:
+            flatten_outer_dim *= size
+        if len(block_shape) < 2 or flatten_outer_dim < 8:
+            swizzle_byte_width = 0
+
+        return NVMMASharedLayout(
+            swizzle_byte_width=swizzle_byte_width,
+            element_bitwidth=dtype.primitive_bitwidth,
+            rank=rank,
+            transposed=transposed,
+            fp4_padded=fp4_padded,
+            cga_layout=cga_layout,
+        )
+
+    def mangle(self) -> str:
+        cga_layout = "_".join("~".join(map(str, vec)) for vec in self.cga_layout) if self.cga_layout else ""
+        return f"NVMMA_{self.swizzle_byte_width}_{self.element_bitwidth}_{self.transposed}_{self.fp4_padded}_{cga_layout}_NVMMA"
+
+    def __hash__(self):
+        return hash((self.swizzle_byte_width, self.element_bitwidth, self.rank, self.transposed, self.fp4_padded,
+                     tuple(tuple(vec) for vec in self.cga_layout) if self.cga_layout else None))
+
+
+@dataclass(frozen=True, eq=True)
+class SwizzledSharedLayout(SharedLayout):
+    """
+    Represents a generic swizzled shared memory layout.
+
+    Args:
+        vec (int): Vector width for swizzling.
+        per_phase (int): Elements per swizzle phase.
+        max_phase (int): Maximum number of swizzle phases.
+        order (List[int]): Dimension ordering for swizzling.
+        cga_layout (Optional[List[List[int]]]): Bases describing CTA tiling; None is stored as [].
+    """
+    vec: int
+    per_phase: int
+    max_phase: int
+    order: List[int]
+    cga_layout: List[List[int]] = field(default_factory=list)
+
+    def __post_init__(self):
+        super().__setattr__("vec", _unwrap_if_constexpr(self.vec))
+        super().__setattr__("per_phase", _unwrap_if_constexpr(self.per_phase))
+        super().__setattr__("max_phase", _unwrap_if_constexpr(self.max_phase))
+        super().__setattr__("order", _unwrap_if_constexpr(self.order))
+        # cga_layout gets the same unwrapping as the other fields; None (documented as Optional) becomes [].
+        super().__setattr__("cga_layout", _unwrap_if_constexpr(self.cga_layout) or [])
+
+    def _to_ir(self, builder):
+        return builder.get_swizzled_shared_layout(
+            self.vec,
+            self.per_phase,
+            self.max_phase,
+            self.order,
+            self.cga_layout,
+        )
+
+    def mangle(self) -> str:
+
+        def stringify(x):
+            if x is None:
+                return ""
+            return "_".join(map(str, x))
+
+        cga_layout = "_".join("~".join(map(str, vec)) for vec in self.cga_layout) if self.cga_layout else ""
+        return f"SSS_{self.vec}_{self.per_phase}_{self.max_phase}_{stringify(self.order)}_{cga_layout}_SSS"
+
+    def __hash__(self):
+        return hash(
+            (self.vec, self.per_phase, self.max_phase, tuple(self.order), tuple(tuple(vec) for vec in self.cga_layout)))
+
+
+@dataclass(frozen=True, eq=True)
+class PaddedSharedLayout(SharedLayout):
+    """
+    Represents a layout for the access to shared memory. Compared to SwizzledSharedLayout,
+    it combines padding and element reordering via linear transformation (e.g. row permutation)
+    to avoid shared memory bank conflicts. After every interval tensor elements, the
+    corresponding number of padding elements are inserted. If a position corresponds to
+    multiple intervals, the padding amounts are summed.
+
+    In the following example of a tensor,
+    `eM` represents original elements in the tensor and `pN` represents padding elements.
+
+    Before padding, the shared memory looks like:
+    [e0, e1,
+     e2, e3,
+     e4, e5,
+     e6, e7,
+     ...]
+
+    After padding with interval-padding list [[2, 1], [4, 2]] with an identity remapping,
+    the shared memory will be
+    [e0, e1, p0,
+     e2, e3, p1, p2, p3,
+     e4, e5, p4,
+     e6, e7, p5, p6, p7,
+     ...]
+
+    Furthermore this encoding allows for a linear remapping from the 1-D shared
+    memory offset to logical n-D tensor elements. The remapping is given in the form
+    of linear bases mapping from offset to [dim0, dim1...dimN-1].
+    See LinearLayout.h for more details how linear layouts are applied to remap
+    elements.
+    Some concrete examples using `xN` and `yN` to mean the logical n-D tensor elements
+    and `pN` to mean padding:
+
+    After padding for shape = [8] with interval-padding list [[2, 2]], offset_bases = [[2], [1]] and block_bases = []:
+    [x0, x2, p0, p1, x1, x3]
+
+    After padding for shape = [8, 4] with interval_padding_pairs = [[8, 1]], offset_bases = [[0, 1], [0, 2], /*gap, stride by 2 rows*/[2, 0], [4, 0], [1, 0]] and block_bases = []:
+    [
+        x0y0, x0y1, x0y2, x0y3,
+        x2y0, x2y1, x2y2, x2y3,
+        p0,
+        x4y0, x4y1, x4y2, x4y3,
+        x6y0, x6y1, x6y2, x6y3,
+        p1,
+        x1y0, x1y1, x1y2, x1y3,
+        x3y0, x3y1, x3y2, x3y3,
+        p2,
+        x5y0, x5y1, x5y2, x5y3,
+        x7y0, x7y1, x7y2, x7y3,
+    ]
+
+    Args:
+        interval_padding_pairs (List[List[int]]): List of [interval, padding] pairs; both interval and padding must be powers of 2.
+        offset_bases (List[List[int]]): Bases for shared memory offsets.
+        block_bases (List[List[int]]): Bases for block-level shared memory offsets.
+        shape (List[int]): n-D logical shared memory shape
+    """
+    interval_padding_pairs: List[List[int]]
+    offset_bases: List[List[int]]
+    block_bases: List[List[int]]
+    shape: List[int]
+
+    def __post_init__(self):
+        super().__setattr__("interval_padding_pairs", _unwrap_shape(self.interval_padding_pairs))
+        super().__setattr__("offset_bases", _unwrap_shape(self.offset_bases))
+        super().__setattr__("block_bases", _unwrap_shape(self.block_bases))
+        super().__setattr__("shape", _unwrap_shape(self.shape))
+
+        rank = len(self.shape)
+
+        for basis in self.offset_bases:
+            assert len(basis) == rank
+        for basis in self.block_bases:
+            assert len(basis) == rank
+
+        self.verify()
+
+    def _to_ir(self, builder):
+        intervals, paddings = zip(*self.interval_padding_pairs)
+        return builder.get_padded_shared_layout(intervals, paddings, self.offset_bases, self.block_bases, self.shape)
+
+    def mangle(self) -> str:
+        return f"PaddedShared_{self.interval_padding_pairs}_{self.offset_bases}_{self.block_bases}_{self.shape}_PaddedShared"
+
+    def verify(self):
+        pairs = self.interval_padding_pairs
+        assert len(pairs) > 0, "PaddedSharedLayout interval_padding_pairs must have at least one interval-padding pair"
+        assert all(len(pair) == 2 for pair in pairs)
+        intervals, paddings = zip(*pairs)
+
+        unique_intervals = list(set(intervals))
+        assert len(unique_intervals) == len(intervals), "PaddedSharedLayout interval values must be unique"
+
+        is_power_of_2 = lambda n: n > 0 and n & (n - 1) == 0
+        assert all(is_power_of_2(n) for n in intervals), "PaddedSharedLayout interval values must all be power of two"
+        assert all(is_power_of_2(n) for n in paddings), "PaddedSharedLayout padding values must all be power of two"
+
+        rank = len(self.shape)
+        assert rank > 0, "PaddedSharedLayout shape must not be empty"
+
+    @staticmethod
+    @constexpr_function
+    def with_identity_for(interval_padding_pairs, shape, order):
+        """Returns a PaddedSharedLayout with the given interval and padding pairs and an identity mapping as the linear component for the given shape and order.
+        """
+        assert len(shape) == len(order)
+        is_power_of_2 = lambda n: n > 0 and n & (n - 1) == 0
+        assert all(is_power_of_2(n) for n in shape)
+
+        rank = len(shape)
+        # Create an identity mapping based on shape + order
+        offset_bases = []
+        for dim in order:
+            for basis in range(int(math.log2(shape[dim]))):
+                offset_bases.append([1 << basis if i == dim else 0 for i in range(rank)])
+
+        return PaddedSharedLayout(interval_padding_pairs, offset_bases, [], shape)
+
+    def __hash__(self):
+        return hash((tuple(map(tuple, self.interval_padding_pairs)), tuple(map(tuple, self.offset_bases)),
+                     tuple(map(tuple, self.block_bases)), tuple(self.shape)))
+
+
+@dataclass(frozen=True)
+class SharedLinearLayout(SharedLayout):
+    """Shared memory layout given by explicit offset/block bases and a power-of-two alignment."""
+
+    offset_bases: List[List[int]]
+    block_bases: List[List[int]] = field(default_factory=list)
+    alignment: int = 16
+
+    def __post_init__(self):
+        for name in ("offset_bases", "block_bases"):
+            super().__setattr__(name, _unwrap_shape(getattr(self, name)))
+        super().__setattr__("alignment", _unwrap_if_constexpr(self.alignment))
+
+        assert len(self.offset_bases) != 0, "SharedLinearLayout offset_bases must not be empty"
+        rank = len(self.offset_bases[0])
+        assert rank > 0, "SharedLinearLayout offset_bases must not be empty"
+        # Offset bases, then block bases, must all match the rank of the first offset basis.
+        for basis in (*self.offset_bases, *self.block_bases):
+            assert len(basis) == rank
+        is_power_of_two = self.alignment > 0 and (self.alignment & (self.alignment - 1)) == 0
+        assert is_power_of_two, "SharedLinearLayout alignment must be a positive power of two"
+
+    def _to_ir(self, builder):
+        """Build the IR attribute via `builder.get_shared_linear_layout`."""
+        return builder.get_shared_linear_layout(self.offset_bases, self.block_bases, self.alignment)
+
+    def mangle(self) -> str:
+        parts = (self.offset_bases, self.block_bases, self.alignment)
+        return "SharedLinear_" + "_".join(format(part) for part in parts) + "_SharedLinear"
+
+    def __hash__(self):
+        """Hash the bases (as nested tuples) together with the alignment."""
+        offsets = tuple(tuple(basis) for basis in self.offset_bases)
+        blocks = tuple(tuple(basis) for basis in self.block_bases)
+        return hash((offsets, blocks, self.alignment))
+
+
+# Python impl of LinearEncodingAttr::basesPerDim
+def bases_per_dim(bases, rank, skip_broadcast=True):
+    """Return, per dimension, 2**k where k counts the bases attributed to that dimension.
+
+    A basis is attributed to the dimension of its first non-zero entry. An all-zero basis is
+    ignored when `skip_broadcast` is true; otherwise it is attributed to the dimension of the
+    most recent non-zero basis (asserting that one has been seen).
+    """
+    counts = [1] * rank
+    last_dim = None
+    for basis in bases or ():
+        dim = next((i for i, value in enumerate(basis) if value != 0), None)
+        if dim is None:
+            if skip_broadcast:
+                continue
+            # Broadcast basis: charge it to the dimension of the latest non-zero basis.
+            assert last_dim is not None
+            dim = last_dim
+        last_dim = dim
+        counts[dim] *= 2
+    return counts
+
+
+def warps_per_cta(layout, shape):
+    """Warps per dimension of `layout`, resolved through the parents of slice/dot-operand layouts."""
+    while isinstance(layout, (SliceLayout, DotOperandLayout)):
+        layout = layout.parent
+    if isinstance(layout, DistributedLinearLayout):
+        return bases_per_dim(layout.warp_bases, len(shape))
+    return layout.warps_per_cta
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/_math.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/_math.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9c8d7605e0c25ef5b063027e320486c7c697d66
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/_math.py
@@ -0,0 +1,20 @@
+import triton.language.math as tl_math
+from ._core import builtin
+# Each name below is the `triton.language.math` function of the same name passed through `builtin`.
+umulhi = builtin(tl_math.umulhi)
+exp = builtin(tl_math.exp)
+exp2 = builtin(tl_math.exp2)
+fma = builtin(tl_math.fma)
+log = builtin(tl_math.log)
+log2 = builtin(tl_math.log2)
+cos = builtin(tl_math.cos)
+rsqrt = builtin(tl_math.rsqrt)
+sin = builtin(tl_math.sin)
+sqrt = builtin(tl_math.sqrt)
+sqrt_rn = builtin(tl_math.sqrt_rn)
+abs = builtin(tl_math.abs)  # NOTE: shadows the Python builtin `abs` within this module
+fdiv = builtin(tl_math.fdiv)
+div_rn = builtin(tl_math.div_rn)
+erf = builtin(tl_math.erf)
+floor = builtin(tl_math.floor)
+ceil = builtin(tl_math.ceil)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/_semantic.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/_semantic.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec019cbe4a4e0e22fc66f54230bc07dc03cf9e84
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/_semantic.py
@@ -0,0 +1,573 @@
+from typing import Sequence, List, TypeVar, Tuple, Callable
+import math
+from triton.language.semantic import TritonSemantic
+from . import _core as ttgl
+from ._layouts import AutoLayout, DistributedLayout, DistributedLinearLayout, SliceLayout, SharedLayout, CoalescedLayout
+from triton._C.libtriton.gluon_ir import GluonOpBuilder, compute_tmem_reg_layout
+from triton.compiler.code_generator import flatten_values_to_ir, unflatten_ir_values
+
+TensorTy = TypeVar("TensorTy")
+
+
+def _check(cond: bool, msg_fn: Callable[[], str], category=ValueError) -> None:
+    if not cond:  # msg_fn is only called on failure, so the message is built lazily
+        raise category(msg_fn())
+
+
+def _is_int_list(value) -> bool:
+    return isinstance(value, Sequence) and all(isinstance(i, int) for i in value)  # NOTE: bools count as ints here
+
+
+def _compute_tmem_reg_layout(element_ty, shape, layout, num_warps, instr_variant, cga_layout=None):
+    """Validate the arguments and build the TMEM register layout for `instr_variant` (2D shapes only)."""
+    _check(isinstance(instr_variant, str), lambda: "instr_variant must be a string")
+    _check(instr_variant in ("32x32b", "16x64b", "16x128b", "16x256b", "16x32bx2", "32x32b_splitn"),
+           lambda: f"unknown instr_variant: {instr_variant}")
+    _check(isinstance(num_warps, int), lambda: f"num_warps must be an int but got {type(num_warps)!r}")
+    _check(num_warps >= 4 and (num_warps & (num_warps - 1)) == 0, lambda: "num_warps must be a power of two and >= 4")
+
+    shape = list(shape)
+    _check(all(isinstance(dim, int) for dim in shape), lambda: f"shape entries must be ints but got {shape}")
+    rank = len(shape)
+    _check(rank == 2, lambda: "expected a 2D tensor")
+
+    cga_layout = [] if cga_layout is None else cga_layout
+    splitn = instr_variant == "32x32b_splitn"
+    atom_variant = "32x32b" if splitn else instr_variant
+
+    if cga_layout:
+        for basis in cga_layout:
+            _check(len(basis) == rank, lambda: "cga_layout basis rank mismatch")
+
+    layout_obj = compute_tmem_reg_layout(
+        element_ty,
+        shape,
+        layout,
+        num_warps,
+        atom_variant,
+        cga_layout,
+    )
+    _check(layout_obj is not None,
+           lambda: f"TMEM layout '{atom_variant}' unsupported for shape {shape} and num_warps {num_warps}")
+
+    if splitn:
+        N = shape[1]
+        if not layout_obj.reg_bases:
+            # We cannot use this layout in a load or a store ATM due to a PTX bug!
+            # You can work around this by loading to 32x32b and follow by a convert_layout to this layout.
+            _check(layout_obj.lane_bases[-1] == [0, N // 2],
+                   lambda: f"splitn with 1 register requires the last lane basis to be [0, N / 2]. Got {layout_obj}")
+            layout_obj.reg_bases.append([0, N // 2])
+            layout_obj.lane_bases[-1] = [0, 0]
+        elif layout_obj.reg_bases[-1] != [0, N // 2]:
+            bitwidth = element_ty.primitive_bitwidth
+            num_reg = 2**len(layout_obj.reg_bases)
+            _check(
+                num_reg > 32 // bitwidth, lambda: "To be able to `tmem.load` into `tl.split` you need to have more "
+                f"than {32 // bitwidth} {bitwidth}-bit registers, as you need to use "
+                "the instruction 32x32b.x1 twice. You can always load into "
+                "instr_variant=\"32x32b\" and then convert_layout to this layout otherwise.")
+
+            reg_bases = layout_obj.reg_bases
+            for bases_str in ("lane_bases", "warp_bases"):
+                bases = getattr(layout_obj, bases_str)
+                for i, basis in enumerate(bases):
+                    if basis == [0, N // 2]:
+                        reg_bases[-1], bases[i] = bases[i], reg_bases[-1]
+                        return layout_obj
+            raise AssertionError(f"splitn requires at least one basis of [0, N / 2]. Got {layout_obj}")
+    return layout_obj
+
+
+_compute_tmem_reg_layout.__triton_builtin__ = True
+
+
+class GluonCallerContext:
+    """Holds a warp count: `mangle` renders it as `_NW<n>`, `initialize_callee` sets `ttg.num-warps`."""
+    def __init__(self, num_warps: int):
+        self.num_warps = num_warps
+
+    def mangle(self) -> str:
+        return f"_NW{self.num_warps}"
+
+    def initialize_callee(self, fn, builder):
+        fn.set_attr("ttg.num-warps", builder.get_int32_attr(self.num_warps))
+
+
+class GluonSemantic(TritonSemantic[TensorTy]):
+    tensor = ttgl.tensor
+    lang = ttgl
+
+    builder: GluonOpBuilder
+
+    def __init__(self, builder: GluonOpBuilder):
+        self.builder = builder  # NOTE(review): super().__init__ is never called — confirm that is intended
+
+    def _wrap_handle_infer_layout(self, handle, scalar_ty, shape):
+        """Wrap `handle` as a tensor; non-scalar results take the layout the builder reports for it."""
+        if shape == []:
+            return self.tensor(handle, scalar_ty)
+        inferred = self.builder.get_gluon_layout_from_tensor(handle)
+        return self.tensor(handle, ttgl.distributed_type(scalar_ty, shape, inferred))
+
+    def _wrap_tensor_infer_layout(self, tensor):  # re-wrap `tensor` so its layout is re-inferred from its handle
+        return self._wrap_handle_infer_layout(tensor.handle, tensor.type.scalar, tensor.shape)
+
+    def _broadcast_shapes(self, lhs_shape: List[int], rhs_shape: List[int]):
+        """Return the broadcast of two equal-rank shapes; a size-1 dim takes the other side's extent."""
+        if len(lhs_shape) != len(rhs_shape):
+            raise ValueError(f"Cannot broadcast, rank mismatch: {lhs_shape}, {rhs_shape}")
+
+        merged = []
+        for idx, (left, right) in enumerate(zip(lhs_shape, rhs_shape)):
+            if left == 1:
+                merged.append(right)
+                continue
+            if right != 1 and right != left:
+                raise ValueError("Cannot make_shape_compatible: incompatible dimensions "
+                                 "at index " + str(idx) + ": " + str(left) + " and " + str(right))
+            merged.append(left)
+        return merged
+
+    def expand_dims(self, input: TensorTy, axis: int) -> TensorTy:
+        dst_shape = [ttgl._unwrap_if_constexpr(x) for x in input.shape]
+        dst_shape.insert(axis, 1)
+        # NOTE(review): insert() above sees the raw (possibly negative) axis; it is only normalised below.
+        if axis < 0:
+            axis += len(input.shape)
+        # AutoLayout/CoalescedLayout inputs are accepted as-is; a SliceLayout must be sliced along `axis`.
+        _check(isinstance(input.type, ttgl.distributed_type),
+               lambda: f"expected expand_dims input to be a distributed_type but got: {input.type!r}")
+        layout = input.type.layout
+        _check(isinstance(layout, (SliceLayout, AutoLayout, CoalescedLayout)),
+               lambda: f"expected expand_dims input to have a SliceLayout, but got: {layout}")
+        _check(
+            isinstance(layout, (AutoLayout, CoalescedLayout)) or layout.dim == axis,
+            lambda: f"expected expand_dims input layout to be sliced in axis {axis} but got {layout.dim}")
+
+        handle = self.builder.create_expand_dims(input.handle, axis)
+        return self._wrap_handle_infer_layout(handle, input.type.scalar, dst_shape)
+
+    def join(self, a: TensorTy, b: TensorTy) -> TensorTy:
+        """Broadcast `a` and `b` together, join them via the base class and re-wrap with the inferred layout."""
+        a, b = self.broadcast_impl_value(a, b)
+        _check(a.shape != [], lambda: "Cannot join scalars in gluon")
+        return self._wrap_tensor_infer_layout(super().join(a, b))
+
+    def split(self, a: TensorTy) -> Tuple[TensorTy, TensorTy]:
+        first, second = super().split(a)  # both halves are re-wrapped so each gets its builder-inferred layout
+        return tuple(map(self._wrap_tensor_infer_layout, (first, second)))
+
+    def permute(self, input: TensorTy, dims: Tuple[int]) -> TensorTy:
+        # Delegate to the base implementation, then re-wrap so the result carries the builder-inferred layout.
+        return self._wrap_tensor_infer_layout(super().permute(input, dims))
+
+    def broadcast_impl_shape(self, input: TensorTy, shape: Tuple[int]) -> TensorTy:
+        """Broadcast `input` to `shape` (same rank; only size-1 dims may grow), keeping its layout."""
+        _check(isinstance(input.type, ttgl.distributed_type),
+               lambda: f"expected broadcast input to be a distributed_type but got: {input.type!r}")
+        src_shape = input.type.get_block_shapes()
+        _check(len(src_shape) == len(shape), lambda: f"Cannot broadcast, rank mismatch: {src_shape}, {shape}")
+        if list(shape) == list(src_shape):  # compare by value so a tuple `shape` still hits the no-op path
+            return input
+        for i, item in enumerate(src_shape):
+            if shape[i] != item and item != 1:
+                raise ValueError(f"Cannot broadcast, the expanded size of the tensor ({shape[i]})"
+                                 f" must match the existing size ({item}) at non-singleton dimension"
+                                 f" {i}: {src_shape}, {shape}")
+        ret_ty = ttgl.distributed_type(input.type.scalar, shape, input.type.layout)
+        return self.tensor(self.builder.create_broadcast(input.handle, ret_ty.to_ir(self.builder)), ret_ty)
+
+    def broadcast_impl_value(self, lhs: TensorTy, rhs: TensorTy) -> Tuple[TensorTy, TensorTy]:
+        lhs_ty = lhs.type
+        rhs_ty = rhs.type
+        # Anything that is not block-vs-block is delegated to the base implementation.
+        if not lhs_ty.is_block() or not rhs_ty.is_block():
+            return super().broadcast_impl_value(lhs, rhs)
+
+        _check(isinstance(lhs_ty, ttgl.distributed_type),
+               lambda: f"expected broadcast left input to be a distributed_type but got: {lhs_ty!r}")
+        _check(isinstance(rhs_ty, ttgl.distributed_type),
+               lambda: f"expected broadcast right input to be a distributed_type but got: {rhs_ty!r}")
+
+        lhs_shape = lhs_ty.get_block_shapes()
+        rhs_shape = rhs_ty.get_block_shapes()
+        ret_shape = self._broadcast_shapes(lhs_shape, rhs_shape)
+        # An AutoLayout operand adopts the other side's layout; two concrete layouts must already be equal.
+        is_lhs_auto = isinstance(lhs_ty.layout, AutoLayout)
+        is_rhs_auto = isinstance(rhs_ty.layout, AutoLayout)
+        if is_lhs_auto and not is_rhs_auto:
+            lhs = self.set_auto_layout(lhs, rhs_ty.layout)
+        elif is_rhs_auto and not is_lhs_auto:
+            rhs = self.set_auto_layout(rhs, lhs_ty.layout)
+        elif lhs_ty.layout != rhs_ty.layout:
+            raise ValueError(f"Layout mismatch in broadcast: {lhs_ty.layout} vs {rhs_ty.layout}")
+
+        lhs = self.broadcast_impl_shape(lhs, ret_shape)
+        rhs = self.broadcast_impl_shape(rhs, ret_shape)
+        return lhs, rhs
+
+    def arange(self, start, end, layout):
+        """Call the base `arange` with an int32 distributed type of length `end - start` (AutoLayout if None)."""
+        if layout is None:
+            layout = AutoLayout()
+        range_ty = ttgl.distributed_type(ttgl.int32, [end - start], layout)
+        return super().arange(start, end, ret_ty=range_ty)
+
+    def reshape(self, input: TensorTy, dst_shape: List[int], can_reorder: bool):
+        # Gluon rejects can_reorder; the reshaped value is re-wrapped with its builder-inferred layout.
+        _check(not can_reorder, lambda: "can_reorder is not supported in gluon")
+        return self._wrap_tensor_infer_layout(super().reshape(input, dst_shape, can_reorder))
+
+    def splat(self, value, shape, layout):
+        """Splat scalar `value` across `shape` with `layout`; an empty shape returns `value` unchanged."""
+        if len(shape) == 0:
+            return value
+        splat_ty = ttgl.distributed_type(value.dtype, shape, layout)
+        return ttgl.tensor(self.builder.create_splat(splat_ty.to_ir(self.builder), value.handle), splat_ty)
+
+    def full(self, shape, value, dtype, layout):
+        """Splat `value` (made into a `dtype` scalar) across `shape`; a None layout becomes AutoLayout."""
+        fill = self.make_scalar(value, dtype)
+        target_layout = AutoLayout() if layout is None else layout
+        return self.splat(fill, shape, target_layout)
+
+    def convert_layout(self, value, layout, assert_trivial=False):
+        """Convert `value` to `layout`; with `assert_trivial`, raise TypeError for a non-trivial conversion."""
+        src_ty = value.type
+        _check(isinstance(src_ty, ttgl.distributed_type),
+               lambda: f"expected convert_layout input to be a distributed_type but got: {src_ty!r}")
+        _check(isinstance(layout, ttgl.DistributedLayout),
+               lambda: f"expected 'layout' to be a DistributedLayout but got {layout}")
+        dst_ty = ttgl.distributed_type(src_ty.element_ty, src_ty.shape, layout)
+        dst_ir = dst_ty.to_ir(self.builder)
+        if assert_trivial and not self.builder.is_convert_layout_trivial(dst_ir, value.handle):
+            raise TypeError(f"layout conversion from {src_ty.layout} to {layout} is not trivial.\n"
+                            f"The linear layouts are:\n{self.to_linear_layout(src_ty.layout, src_ty.shape)}\n"
+                            f"{self.to_linear_layout(layout, src_ty.shape)}")
+        return ttgl.tensor(self.builder.create_convert_layout(dst_ir, value.handle), dst_ty)
+
+    def allocate_shared(self, element_ty, shape, layout, value):
+        """Allocate a shared-memory descriptor; when `value` is given its handle is passed to the alloc op."""
+        _check(isinstance(element_ty, ttgl.dtype), lambda: f"expected 'element_ty' to be a dtype but got {element_ty}")
+        _check(_is_int_list(shape), lambda: f"all elements of 'shape' must be integers but got {shape}")
+        _check(isinstance(layout, ttgl.SharedLayout),
+               lambda: f"expected 'layout' to be a SharedLayout but got {layout}")
+        desc_ty = ttgl.shared_memory_descriptor_type(element_ty, shape, layout, shape)
+        # The source handle is appended as a trailing operand only when an initial value was supplied.
+        init_operands = () if value is None else (value.handle, )
+        handle = self.builder.create_local_alloc(desc_ty.to_ir(self.builder), *init_operands)
+        return ttgl.shared_memory_descriptor(handle, element_ty, shape, layout, shape)
+
+    def shared_load(self, mem_desc, layout):
+        """Load `mem_desc` into a distributed tensor of the same dtype and shape with the given `layout`."""
+        _check(isinstance(layout, ttgl.DistributedLayout),
+               lambda: f"expected 'layout' to be a DistributedLayout but got {layout}")
+        out_ty = ttgl.distributed_type(mem_desc.dtype, mem_desc.shape, layout)
+        return ttgl.tensor(self.builder.create_local_load(out_ty.to_ir(self.builder), mem_desc.handle), out_ty)
+
+    def shared_store(self, mem_desc, value) -> None:  # store `value` into `mem_desc`; shape and dtype must match
+        _check(isinstance(value, ttgl.tensor), lambda: f"expected 'value' to be a tensor, but got a {type(value)}")
+        _check(value.shape == mem_desc.shape,
+               lambda: f"source shape {value.shape} and destination shape {mem_desc.shape} must match")
+        _check(value.dtype == mem_desc.dtype,
+               lambda: f"source dtype {value.dtype} and destination dtype {mem_desc.dtype} must match")
+        self.builder.create_local_store(mem_desc.handle, value.handle)
+
+    def bank_conflicts(self, distr_ty, shared_ty):
+        """Query the builder for bank conflicts between a register (distributed) type and a shared type."""
+        _check(
+            isinstance(distr_ty, ttgl.distributed_type),
+            lambda: f"bank_conflicts expects the register layout to be a distributed_type, got {type(distr_ty)}",
+            category=TypeError)
+        _check(
+            isinstance(shared_ty, ttgl.shared_memory_descriptor_type), lambda:
+            f"bank_conflicts expects the shared layout to be a shared_memory_descriptor_type, got {type(shared_ty)}",
+            category=TypeError)
+
+        _check(distr_ty.shape == shared_ty.shape,
+               lambda: f"register shape {distr_ty.shape} and shared shape {shared_ty.shape} must match")
+        _check(
+            shared_ty.element_ty == distr_ty.element_ty, lambda:
+            f"mismatched dtypes between register ({distr_ty.element_ty}) and shared ({shared_ty.element_ty}) layouts")
+        # Subslices are not supported: the view must equal the trailing dims of its allocation shape.
+        _check(
+            shared_ty.shape == shared_ty.alloc_shape[-len(shared_ty.shape):], lambda:
+            f"bank_conflicts NYI for subslices. Got shape {shared_ty.shape} and alloc_shape {shared_ty.alloc_shape}")
+
+        reg_attr = distr_ty.layout._to_ir(self.builder)
+        shared_attr = shared_ty.layout._to_ir(self.builder)
+        bitwidth = distr_ty.element_ty.primitive_bitwidth
+        return self.builder.get_shared_bank_conflicts(reg_attr, shared_attr, list(distr_ty.shape), bitwidth)
+
+    def to_linear_layout(self, layout, shape):
+        """Return `layout` as a constexpr linear layout for `shape`; auto/linear layouts pass through as-is."""
+        # Unwrap first: a constexpr-wrapped layout would otherwise fail the isinstance check below.
+        layout = ttgl._unwrap_if_constexpr(layout)
+        _check(isinstance(layout, (DistributedLayout, SharedLayout)),
+               lambda: f"Expected a DistributedLayout or SharedLayout, got {type(layout)}")
+
+        if not isinstance(shape, list):
+            shape = list(shape)
+
+        if isinstance(layout, (AutoLayout, DistributedLinearLayout)):
+            return ttgl.constexpr(layout)
+        return ttgl.constexpr(self.builder.to_linear_layout(layout._to_ir(self.builder), shape))
+
+    def shared_dealloc(self, mem_desc) -> None:
+        self.builder.create_local_dealloc(mem_desc.handle)  # emits the dealloc op; nothing is returned
+
+    def set_auto_layout(self, value, layout):
+        """Give an AutoLayout tensor the concrete distributed `layout` and return the retyped tensor."""
+        src_ty = value.type
+        _check(isinstance(layout, DistributedLayout),
+               lambda: f"set_auto_layout must set to a distributed layout but got {layout}")
+        _check(isinstance(src_ty.layout, AutoLayout),
+               lambda: f"set_auto_layout input must have auto layout but got {value.type.layout}")
+        handle = self.builder.create_set_auto_layout(layout._to_ir(self.builder), value.handle)
+        return self.tensor(handle, ttgl.distributed_type(src_ty.element_ty, src_ty.shape, layout))
+
+    def memdesc_slice(self, mem_desc, start, length, dim):
+        """Return a view of `mem_desc` covering `length` elements from `start` along `dim`."""
+        _check(isinstance(start, int), lambda: f"expected 'start' to be an int but got {start}")
+        _check(isinstance(length, int), lambda: f"expected 'length' to be an int but got {length}")
+        _check(isinstance(dim, int), lambda: f"expected 'dim' to be an int but got {dim}")
+        # Every other dimension keeps offset 0 and its full extent; the alloc shape is taken from mem_desc.
+        offsets = [0] * mem_desc.rank
+        offsets[dim] = start
+        new_shape = list(mem_desc.shape)
+        new_shape[dim] = length
+        ty = ttgl.shared_memory_descriptor_type(mem_desc.dtype, new_shape, mem_desc.layout, mem_desc.type.alloc_shape)
+        handle = self.builder.create_memdesc_subslice(ty.to_ir(self.builder), mem_desc.handle, offsets)
+        return ttgl.shared_memory_descriptor(handle, **ty.__dict__)
+
+    def memdesc_index(self, mem_desc, index):
+        """Index the leading dimension of `mem_desc` with an int32 `index` and return the resulting view."""
+        index = self.to_tensor(index)
+        _check(index.type == ttgl.int32, lambda: f"expected 'index' to be int32 but got {index.type}")
+        shape = mem_desc.shape[1:]
+        # `index` was converted above, so its handle is used directly instead of calling to_tensor again.
+        # NOTE(review): the last argument (the alloc shape in memdesc_slice) is the indexed shape here.
+        ty = ttgl.shared_memory_descriptor_type(mem_desc.dtype, shape, mem_desc.layout, shape)
+        handle = self.builder.create_memdesc_index(ty.to_ir(self.builder), mem_desc.handle, index.handle)
+        return ttgl.shared_memory_descriptor(handle, **ty.__dict__)
+
+    def memdesc_trans(self, mem_desc, order):
+        """Permute the dimensions of `mem_desc` by `order`; leading alloc-only dims keep their position."""
+        _check(_is_int_list(order), lambda: f"all elements of 'order' must be integers but got {order}")
+        _check(
+            len(order) == len(mem_desc.shape),
+            lambda: f"source rank ({mem_desc.rank}) and order length ({len(order)}) must match")
+        alloc_shape = mem_desc.type.alloc_shape
+        lead = len(alloc_shape) - mem_desc.rank  # number of alloc dims that precede the view's own dims
+        view_alloc = alloc_shape[lead:]
+        new_alloc_shape = alloc_shape[:lead] + [view_alloc[i] for i in order]
+        shape = [mem_desc.shape[i] for i in order]
+        handle = self.builder.create_memdesc_trans(mem_desc.handle, order)
+        layout = self.builder.get_gluon_layout_from_memdesc(handle)
+        return ttgl.shared_memory_descriptor(handle, element_ty=mem_desc.dtype, shape=shape, layout=layout,
+                                             alloc_shape=new_alloc_shape)
+
+    def memdesc_reshape(self, mem_desc, shape):
+        """Reshape `mem_desc` to `shape` (same element count); the layout is read back from the new handle."""
+        _check(_is_int_list(shape), lambda: f"all elements of 'shape' must be integers but got {shape}")
+        _check(
+            math.prod(shape) == math.prod(mem_desc.shape),
+            lambda: (f"memdesc_reshape total elements mismatch: "
+                     f"{mem_desc.shape} -> {shape}"),
+        )
+
+        handle = self.builder.create_memdesc_reshape(mem_desc.handle, shape)
+        # Keep any leading alloc-only dims and replace the trailing `rank` dims with the new shape.
+        alloc_shape = mem_desc.type.alloc_shape
+        lead = len(alloc_shape) - mem_desc.rank
+
+        return ttgl.shared_memory_descriptor(
+            handle,
+            element_ty=mem_desc.dtype,
+            shape=shape,
+            alloc_shape=alloc_shape[:lead] + list(shape),
+            layout=self.builder.get_gluon_layout_from_memdesc(handle),
+        )
+
+    def memdesc_reinterpret(self, mem_desc, dtype, shape, layout):
+        _check(isinstance(dtype, ttgl.dtype), lambda: f"expected 'dtype' to be a dtype but got {dtype}")
+        _check(_is_int_list(shape), lambda: f"all elements of 'shape' must be integers but got {shape}")
+        _check(isinstance(layout, ttgl.SharedLayout),
+               lambda: f"expected 'layout' to be a SharedLayout but got {layout}")
+        ty = ttgl.shared_memory_descriptor_type(dtype, shape, layout, shape)  # all taken from the caller, not mem_desc
+        handle = self.builder.create_memdesc_reinterpret(ty.to_ir(self.builder), mem_desc.handle)
+        return ttgl.shared_memory_descriptor(handle, **ty.__dict__)
+
+    def wrap_tensor(self, x, scalar_ty, ret_shape, layout):
+        """Wrap handle `x` as a scalar (empty `ret_shape`) or as a distributed tensor with `layout`."""
+        if not ret_shape:
+            # No dimensions left: the result type is just the scalar type and `layout` is unused.
+            return self.tensor(x, scalar_ty)
+        return self.tensor(x, ttgl.distributed_type(scalar_ty, ret_shape, layout))
+
+    @staticmethod
+    def _check_same_layout(xs) -> None:  # every x must be distributed and share the first input's layout
+        for x in xs:
+            _check(isinstance(x.type, ttgl.distributed_type), lambda: f"expected distributed_type but got: {x.type!r}")
+        layouts = [x.type.layout for x in xs]
+        l0 = layouts[0]
+        _check(all(l == l0 for l in layouts[1:]),
+               lambda: f"Expected inputs to have matching layouts, but got: {layouts}")
+
+    def associative_scan(self, inputs: Sequence[TensorTy], axis: int, region_builder_fn,
+                         reverse: bool) -> Tuple[TensorTy, ...]:
+        """Emit a scan along `axis` over same-shaped `inputs`; returns one same-shaped result per input."""
+        shape = inputs[0].type.shape
+        rank = len(shape)
+
+        assert -rank <= axis < rank, f"scan axis {axis} must be < inputs rank ({rank})"
+
+        axis = axis + rank if axis < 0 else axis  # normalise a negative axis
+
+        for t in inputs:
+            assert t.type.shape == shape, "all scan inputs must have the same shape"
+
+        scan_op = self.builder.create_scan([t.handle for t in inputs], axis, reverse)
+        region_builder_fn(scan_op)  # the callback receives the op before it is verified
+        assert scan_op.verify()
+
+        return tuple(
+            self._wrap_handle_infer_layout(scan_op.get_result(pos), t.type.scalar, shape)
+            for pos, t in enumerate(inputs))
+
+    def reduction(self, inputs: Sequence[TensorTy], axis: int, region_builder_fn) -> Tuple[TensorTy, ...]:
+        """Emit a reduce over `axis` (over all elements when `axis` is None); returns one result per input."""
+        if axis is None:
+            inputs = tuple(self.reshape(t, [t.numel.value], can_reorder=False) for t in inputs)
+            axis = 0
+        shape = inputs[0].type.shape
+        rank = len(shape)
+        _check(0 <= axis < rank, lambda: f"expected reduction axis to be in the range [0, {rank}) but got {axis}")
+        self._check_same_layout(inputs)
+        assert all(t.type.shape == shape for t in inputs), "all reduction inputs must have the same shape"
+        ret_shape = [extent for dim, extent in enumerate(shape) if dim != axis]
+
+        reduce_op = self.builder.create_reduce([t.handle for t in inputs], axis)
+        region_builder_fn(reduce_op)
+        assert reduce_op.verify()
+
+        return tuple(
+            self._wrap_handle_infer_layout(reduce_op.get_result(pos), t.type.scalar, ret_shape)
+            for pos, t in enumerate(inputs))
+
+    def histogram(self, input: TensorTy, num_bins: int, mask: TensorTy, layout) -> TensorTy:
+        """Build an int32 histogram with `num_bins` bins from a 1D integer `input`, optionally masked."""
+        _check(len(input.shape) == 1, lambda: "histogram only supports 1D input")
+        _check(input.dtype.is_int(), lambda: "histogram only supports integer input")
+        _check(layout is not None, lambda: "histogram requires a destination layout")
+        if mask is not None:
+            mask, input = self.broadcast_impl_value(mask, input)
+            _check(mask.type.scalar.is_bool(), lambda: "Mask must have boolean scalar type")
+            mask = mask.handle
+        hist = self.builder.create_histogram(input.handle, num_bins, mask, layout._to_ir(self.builder))
+        return self.wrap_tensor(hist, ttgl.int32, [num_bins], layout)
+
+    def cat(self, lhs: TensorTy, rhs: TensorTy, can_reorder: bool, layout) -> TensorTy:
+        _check(layout is not None, lambda: "cat requires a destination layout")
+        _check(can_reorder, lambda: "current implementation of `cat` always may reorder elements")
+        _check(len(lhs.shape) == 1 and len(rhs.shape) == 1, lambda: "cat requires a rank-1 input")  # rhs too
+        ret_type = ttgl.distributed_type(lhs.type.scalar, [lhs.shape[0] + rhs.shape[0]], layout)
+        return self.tensor(self.builder.create_cat(lhs.handle, rhs.handle, ret_type.to_ir(self.builder)), ret_type)
+
+    def gather(self, src: TensorTy, index: TensorTy, axis: int) -> TensorTy:
+        """Gather from `src` along `axis` using integer `index`; the result has `index`'s shape and layout."""
+        _check(isinstance(src.type, ttgl.distributed_type), lambda: f"expected distributed_type but got: {src.type!r}")
+        _check(isinstance(index.type, ttgl.distributed_type),
+               lambda: f"expected distributed_type but got: {index.type!r}")
+        _check(index.type.scalar.is_int(), lambda: f"expected integer scalar type but got: {index.type.scalar!r}")
+
+        rank = len(src.type.shape)
+        _check(len(index.type.shape) == rank, lambda: "source and index tensors must have the same rank")
+        _check(-rank <= axis < rank, lambda: f"gather axis {axis} must be < source rank ({rank})")
+        if axis < 0:
+            axis += rank
+
+        # Every dimension other than the gather axis must agree between `index` and `src`.
+        for d in range(rank):
+            _check(
+                d == axis or index.type.shape[d] == src.type.shape[d],
+                lambda: f"index dim {d} must match the corresponding source dim",
+            )
+        gather = self.builder.create_gather(src.handle, index.handle, axis)
+        return self.wrap_tensor(gather, src.type.scalar, index.type.shape, index.type.layout)
+
+    def fp4_to_fp(self, src: TensorTy, elem_type, axis) -> TensorTy:
+        result = self.builder.create_fp4_to_fp(src.handle, elem_type.to_ir(self.builder), axis)
+        shape = list(src.type.shape)
+        shape[axis] *= 2  # the result type has twice the source extent along `axis`
+        return self._wrap_handle_infer_layout(result, elem_type, shape)
+
+    def warp_specialize(self, functions_and_args, worker_num_warps: Sequence[int], worker_num_regs: Sequence[int],
+                        generator):
+        for _, args in functions_and_args:
+            _check(isinstance(args, (tuple, ttgl.tuple)),
+                   lambda: f"function arguments must be a tuple of arguments, but got {type(args)}")
+        # functions_and_args[0] is the default partition; the remaining entries are the worker partitions.
+        assert len(functions_and_args) >= 1, "expected at least one function for the default partition"
+        default_partition, default_args = functions_and_args[0]
+        num_partitions = len(functions_and_args) - 1
+        workers = functions_and_args[1:]
+
+        assert num_partitions == len(
+            worker_num_warps
+        ), f"warp specialize got {num_partitions} partitions but {len(worker_num_warps)} warp counts"
+        assert num_partitions == len(
+            worker_num_regs
+        ), f"warp specialize got {num_partitions} partitions but {len(worker_num_regs)} register counts"
+
+        builder = self.builder
+        insert_pt = builder.get_insertion_point()
+
+        # Emit the default partition to get the result types.
+        default_block = builder.new_block()
+        builder.set_insertion_point_to_start(default_block)
+        default_results = generator.call_JitFunction(default_partition, default_args, kwargs={})
+        mlir_results = []
+        if default_results is not None:
+            mlir_results = flatten_values_to_ir(default_results)
+        builder.create_warp_yield(mlir_results)
+        result_types = [r.get_type() for r in mlir_results]
+
+        # Create the warp specialize op.
+        worker_args = [flatten_values_to_ir(args) for _, args in workers]
+        mlir_args = sum(worker_args, [])
+        builder.restore_insertion_point(insert_pt)
+        ws_op = builder.create_warp_specialize(result_types, mlir_args, worker_num_warps)
+        ws_op.get_default_region().push_back(default_block)
+        ws_op.set_requested_registers(worker_num_regs)
+
+        # Emit the partition regions.
+        builder.create_block_with_parent(ws_op.get_partition_op_holder(), [])
+        partitions_op = builder.create_warp_specialize_partitions(num_partitions)
+        arg_types = [arg.get_type() for arg in mlir_args]
+        arg_it = 0  # offset of the current worker's arguments within the concatenated argument list
+        for i, (func, args) in enumerate(workers):
+            caller_context = GluonCallerContext(num_warps=worker_num_warps[i])
+            block = builder.create_block_with_parent(partitions_op.get_region(i), arg_types)
+            mlir_args = worker_args[i]
+            block_args = [block.get_argument(arg_it + j) for j in range(len(mlir_args))]
+            block_args = unflatten_ir_values(block_args, [arg.type for arg in args])
+            generator.call_JitFunction(func, block_args, kwargs={}, caller_context=caller_context)
+            builder.create_warp_return()
+            arg_it += len(mlir_args)
+        # Continue emitting after the op; results are re-wrapped using the default partition's result types.
+        builder.set_insertion_point_after(ws_op.get_operation())
+        mlir_results = [ws_op.get_result(i) for i in range(len(result_types))]
+        if default_results is None:
+            return
+        return tuple(unflatten_ir_values(mlir_results, [r.type for r in default_results]))
+
+    def num_ctas(self):
+        return ttgl.constexpr(self.builder.options.num_ctas)  # constexpr read from the builder options
+
+    def num_warps(self, generator):
+        if generator.caller_context is not None:  # a caller context takes precedence over the builder options
+            assert isinstance(generator.caller_context, GluonCallerContext)
+            return ttgl.constexpr(generator.caller_context.num_warps)
+        return ttgl.constexpr(self.builder.options.num_warps)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/_standard.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/_standard.py
new file mode 100644
index 0000000000000000000000000000000000000000..caa0e6fb0f5d210048f51cb60ffcdba2bd74715f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/_standard.py
@@ -0,0 +1,81 @@
+from typing import TypeVar
+from triton.runtime.jit import JITFunction
+import triton.language.standard as tl_standard
+from .._runtime import GluonJITFunction, jit
+from triton import knobs
+from . import _core as ttgl
+
+T = TypeVar("T")
+
+
+def _import_from_triton(fn: JITFunction[T]) -> GluonJITFunction[T]:
+    assert knobs.runtime.interpret or isinstance(fn, JITFunction)
+    # Re-jit the underlying Python function (`fn.fn`) with Gluon's `jit`, keeping the original docstring.
+    gluon_fn = jit(fn.fn)
+    gluon_fn.__doc__ = fn.__doc__
+    return gluon_fn
+
+
+cdiv = _import_from_triton(tl_standard.cdiv)
+sum = _import_from_triton(tl_standard.sum)  # NOTE: sum/max/min shadow the Python builtins in this module
+max = _import_from_triton(tl_standard.max)
+min = _import_from_triton(tl_standard.min)
+ravel = _import_from_triton(tl_standard.ravel)
+reduce_or = _import_from_triton(tl_standard.reduce_or)
+xor_sum = _import_from_triton(tl_standard.xor_sum)
+
+
+@jit
+def zeros(shape, dtype, layout=None):
+    """
+    Create a tensor filled with zeros.
+
+    Args:
+        shape (Sequence[int]): The shape of the tensor.
+        dtype (dtype): The data type for the tensor.
+        layout (Optional[DistributedLayout]): The distributed layout of the tensor, defaults to AutoLayout().
+
+    Returns:
+        tensor: A tensor of the given shape and dtype where every element is zero.
+    """
+    return ttgl.full(shape, 0, dtype, layout)
+
+
+@jit
+def full_like(input, value, shape=None, dtype=None, layout=None):
+    """
+    Create a tensor with the same properties as a given tensor, filled with a specified value.
+
+    Args:
+        input (tensor): Reference tensor that supplies the default shape, dtype, and layout.
+        value (int or float): The fill value.
+        shape (Sequence[int], optional): Target shape. Defaults to input.shape.
+        dtype (dtype, optional): Target data type. Defaults to input.dtype.
+        layout (DistributedLayout, optional): Target layout. Defaults to input.type.layout.
+
+    Returns:
+        tensor: A tensor where every element equals value.
+    """
+    return ttgl.full(
+        input.shape if shape is None else shape,
+        value,
+        input.dtype if dtype is None else dtype,
+        input.type.layout if layout is None else layout,
+    )
+
+
+@jit
+def zeros_like(input, shape=None, dtype=None, layout=None):
+    """
+    Create a tensor with the same properties as a given tensor, filled with zeros.
+
+    Args:
+        input (tensor): Reference tensor that supplies the default shape, dtype, and layout.
+        shape (Sequence[int], optional): Target shape. Defaults to input.shape.
+        dtype (dtype, optional): Target data type. Defaults to input.dtype.
+        layout (DistributedLayout, optional): Target layout. Defaults to input.type.layout.
+
+    Returns:
+        tensor: A tensor where every element is zero.
+    """
+    return full_like(input, 0, shape=shape, dtype=dtype, layout=layout)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..89f534c60446d26e3cc27b70061648e29adfba43
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/__init__.py
@@ -0,0 +1,6 @@
+from ._layouts import AMDMFMALayout, AMDWMMALayout
+from . import cdna3, cdna4
+from . import rdna3, rdna4
+from . import gfx1250
+
+__all__ = ["AMDMFMALayout", "AMDWMMALayout", "cdna3", "cdna4", "rdna3", "rdna4", "gfx1250"]
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..75b8d4ae9e523f4c27841dabfba57c8f4f428a2d
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/__pycache__/_layouts.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/__pycache__/_layouts.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd2225b433e7c0be239d035b8216d42b64f190ab
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/__pycache__/_layouts.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/__pycache__/_ops.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/__pycache__/_ops.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c18519b96faea54d22ec5a8307e98721f0a42bb8
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/__pycache__/_ops.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/_layouts.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/_layouts.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3d616fea94255b36312cc0713776f897a1cd04e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/_layouts.py
@@ -0,0 +1,187 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import List, Optional
+from triton.language.core import _unwrap_if_constexpr
+
+from triton.experimental.gluon.language._layouts import DistributedLayout
+
+__all__ = [
+    "AMDMFMALayout",
+    "AMDWMMALayout",
+]
+
+
+@dataclass(frozen=True)
+class AMDMFMALayout(DistributedLayout):
+    """
+    Represents a layout for AMD MFMA (matrix core) operations.
+
+    Args:
+        version (int): The GPU architecture.
+        instr_shape (List[int]): The shape in the form of (M, N, K) of the matrix.
+        transposed (bool): Indicates the result tensor is transposed so that each thread holds consecutive elements in the same row instead of column, which is good for chained dot and global write.
+        warps_per_cta (List[int]): The warp layout in the block.
+        element_bitwidth (Optional[int]): Bit width of the output element type. Supported values are 32 and 64. Defaults to 32.
+        tiles_per_warp (Optional[List[int]]): The tile layout within a warp. Defaults to unit tile layout, i.e., single tile on all dimensions.
+        cga_layout (List[List[int]], optional): Bases describing CTA tiling. Defaults to an empty list.
+
+    Current supported versions:
+
+    - 1: gfx908
+    - 2: gfx90a
+    - 3: gfx942
+    - 4: gfx950
+    """
+    version: int
+    instr_shape: List[int]
+    transposed: bool
+    warps_per_cta: List[int]
+    element_bitwidth: Optional[int] = None  # None is replaced by 32 in __post_init__
+    tiles_per_warp: Optional[List[int]] = None  # None is replaced by [1] * len(warps_per_cta) in __post_init__
+    cga_layout: List[List[int]] = field(default_factory=list)
+
+    def __post_init__(self):  # the dataclass is frozen, so fields are rewritten through the base __setattr__
+        super().__setattr__("version", _unwrap_if_constexpr(self.version))
+        super().__setattr__("instr_shape", _unwrap_if_constexpr(self.instr_shape))
+        super().__setattr__("transposed", _unwrap_if_constexpr(self.transposed))
+        super().__setattr__("warps_per_cta", _unwrap_if_constexpr(self.warps_per_cta))
+        super().__setattr__("element_bitwidth", _unwrap_if_constexpr(self.element_bitwidth))
+        super().__setattr__("tiles_per_warp", _unwrap_if_constexpr(self.tiles_per_warp))
+        # Fill in the documented defaults for the optional fields.
+        if self.element_bitwidth is None:
+            object.__setattr__(self, "element_bitwidth", 32)
+        if self.tiles_per_warp is None:
+            object.__setattr__(self, "tiles_per_warp", [1] * len(self.warps_per_cta))
+        # NOTE(review): the next line assigns cga_layout to its current value, so it changes nothing.
+        object.__setattr__(self, "cga_layout", self.cga_layout)
+        self.verify()
+
+    def _to_ir(self, builder):  # forwards the layout fields to builder.get_amd_mfma_layout
+        return builder.get_amd_mfma_layout(
+            self.version,
+            self.warps_per_cta,
+            self.instr_shape,
+            self.transposed,
+            self.cga_layout,
+            self.tiles_per_warp,
+            self.element_bitwidth,
+        )
+
+    def mangle(self) -> str:
+        """Return a string that encodes every layout field, starting with "MFMA_" and ending with "_MFMA"."""
+        def stringify(x):
+            if x is None:
+                return ""
+            return "_".join(map(str, x))
+        # Entries of one cga_layout basis are joined with "~"; an empty cga_layout becomes an empty string.
+        cga_layout = stringify(["~".join(map(str, vec)) for vec in self.cga_layout] if self.cga_layout else None)
+        return f"MFMA_{self.version}_{stringify(self.instr_shape)}_{self.transposed}_{stringify(self.warps_per_cta)}_{self.element_bitwidth}_{stringify(self.tiles_per_warp)}_{cga_layout}_MFMA"
+
+    def verify(self):  # called at the end of __post_init__; checks the fields with assertions
+        assert self.version >= 1 and self.version <= 4, "version must be in the [1, 4] range"
+        assert len(self.instr_shape) == 3, "instr_shape must follow the (M, N, K) format"
+        valid_shapes = [[32, 32], [16, 16], [64, 4], [4, 64]]  # accepted (M, N) pairs; K is not checked here
+        assert self.instr_shape[0:2] in valid_shapes, f"invalid intrinsic shape {self.instr_shape}"
+        assert self.element_bitwidth in [32, 64], "element bitwidth must be 32 or 64"
+        # Every cga_layout basis must have as many entries as warps_per_cta.
+        rank = len(self.warps_per_cta)
+        assert all(len(vec) == rank for vec in self.cga_layout), "cga_layout basis rank mismatch"
+
+    def __hash__(self):
+        return hash((  # list-valued fields are converted to tuples because lists are not hashable
+            self.version,
+            tuple(self.instr_shape),
+            self.transposed,
+            tuple(self.warps_per_cta),
+            self.element_bitwidth if self.element_bitwidth else None,
+            tuple(self.tiles_per_warp) if self.tiles_per_warp else None,
+            tuple(tuple(vec) for vec in self.cga_layout),
+        ))
+
+    @property
+    def rank(self):  # number of entries in warps_per_cta
+        return len(self.warps_per_cta)
+
+
+@dataclass(frozen=True)
+class AMDWMMALayout(DistributedLayout):
+    """
+    Represents a layout for AMD WMMA (matrix core) operations.
+
+    Args:
+        version (int): Indicates the GPU architecture.
+        transposed (bool): Indicates the result tensor is transposed.
+        warps_per_cta (List[int]): Number of warps per CTA.
+        instr_shape (Optional[List[int]]): Instruction shape (M, N, K). Defaults to [16, 16, 16].
+        cga_layout (List[List[int]], optional): Bases describing CTA tiling. Defaults to an empty list.
+
+    Current supported versions:
+
+    - 1: RDNA3; e.g., gfx1100, gfx1101
+    - 2: RDNA4; e.g., gfx1200, gfx1201
+    - 3: gfx1250
+    """
+    version: int
+    transposed: bool
+    warps_per_cta: List[int]
+    instr_shape: Optional[List[int]] = None  # None is replaced by [16, 16, 16] in __post_init__
+    tiles_per_warp: Optional[List[int]] = None  # tile layout; None is replaced by [1] * len(warps_per_cta)
+    cga_layout: List[List[int]] = field(default_factory=list)
+
+    def __post_init__(self):  # the dataclass is frozen, so fields are rewritten through the base __setattr__
+        super().__setattr__("version", _unwrap_if_constexpr(self.version))
+        super().__setattr__("transposed", _unwrap_if_constexpr(self.transposed))
+        super().__setattr__("warps_per_cta", _unwrap_if_constexpr(self.warps_per_cta))
+        # tiles_per_warp defaults to one tile per entry of warps_per_cta.
+        if self.tiles_per_warp is None:
+            tiles_per_warp = [1] * len(self.warps_per_cta)
+        else:
+            tiles_per_warp = _unwrap_if_constexpr(self.tiles_per_warp)
+
+        super().__setattr__("tiles_per_warp", tiles_per_warp)
+        # instr_shape falls back to [16, 16, 16] when it was not provided.
+        instr_shape = _unwrap_if_constexpr(self.instr_shape) if self.instr_shape is not None else [16, 16, 16]
+        super().__setattr__("instr_shape", _unwrap_if_constexpr(instr_shape))  # NOTE(review): second unwrap looks redundant
+        object.__setattr__(self, "cga_layout", self.cga_layout)  # NOTE(review): assigns the current value back; no effect
+        self.verify()
+
+    def _to_ir(self, builder):  # forwards the layout fields to builder.get_amd_wmma_layout
+        return builder.get_amd_wmma_layout(
+            self.version,
+            self.transposed,
+            self.warps_per_cta,
+            self.tiles_per_warp,
+            self.cga_layout,
+            self.instr_shape,
+        )
+
+    def mangle(self) -> str:
+        """Return a string that encodes every layout field, starting with "WMMA_" and ending with "_WMMA"."""
+        def stringify(x):
+            if x is None:
+                return ""
+            return "_".join(map(str, x))
+        # Entries of one cga_layout basis are joined with "~"; an empty cga_layout becomes an empty string.
+        cga_layout = stringify(["~".join(map(str, vec)) for vec in self.cga_layout] if self.cga_layout else None)
+        return f"WMMA_{self.version}_{self.transposed}_{stringify(self.warps_per_cta)}_{stringify(self.tiles_per_warp)}_{stringify(self.instr_shape)}_{cga_layout}_WMMA"
+
+    def verify(self):  # called at the end of __post_init__; instr_shape and tiles_per_warp are not checked here
+        assert self.version >= 1 and self.version <= 3, "version must be in the [1, 3] range"
+        # Every cga_layout basis must have as many entries as warps_per_cta.
+        rank = len(self.warps_per_cta)
+        assert all(len(vec) == rank for vec in self.cga_layout), "cga_layout basis rank mismatch"
+
+    def __hash__(self):
+        return hash((  # list-valued fields are converted to tuples because lists are not hashable
+            self.version,
+            self.transposed,
+            tuple(self.warps_per_cta),
+            tuple(self.tiles_per_warp) if self.tiles_per_warp else None,
+            tuple(self.instr_shape) if self.instr_shape else None,
+            tuple(tuple(vec) for vec in self.cga_layout),
+        ))
+
+    @property
+    def rank(self):  # number of entries in warps_per_cta
+        return len(self.warps_per_cta)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/_ops.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..547761307dc4bce878afb33991758961bc064eb4
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/_ops.py
@@ -0,0 +1,77 @@
+import math
+
+from triton import knobs
+from triton.experimental.gluon.language import _core as ttgl
+from triton.experimental.gluon.language._semantic import _check
+
+from .._core import _unwrap_if_constexpr
+from .._layouts import DotOperandLayout
+from ._layouts import AMDWMMALayout
+
+
+def _verify_wmma(version, a, b, acc):
+    """Check that acc, a and b carry AMDWMMALayout-based layouts of the requested version."""
+    _check(acc is not None, lambda: "acc is required")
+
+    def _is_operand_of_version(operand_layout):
+        # A valid operand layout is a DotOperandLayout whose parent is an AMDWMMALayout of `version`.
+        if not isinstance(operand_layout, DotOperandLayout):
+            return False
+        parent = operand_layout.parent
+        return isinstance(parent, AMDWMMALayout) and parent.version == version
+
+    acc_layout = acc.type.layout
+    _check(
+        isinstance(acc_layout, AMDWMMALayout) and acc_layout.version == version,
+        lambda: f"Expected layout to be an instance of AMDWMMALayout with version {version}")
+    _check(_is_operand_of_version(a.type.layout),
+           lambda: "Expected a's layout to be a DotOperandLayout with parent matching AMDWMMALayout")
+    _check(_is_operand_of_version(b.type.layout),
+           lambda: "Expected b's layout to be a DotOperandLayout with parent matching AMDWMMALayout")
+
+
+def _wmma(version, a, b, acc, semantic):
+    """ Common code path behind the Gluon builtins that wrap AMD WMMA operations. """
+    _verify_wmma(version, a, b, acc)
+    # Delegate to the generic dot; its handle is re-wrapped with the accumulator's own type.
+    dot_result = semantic.dot(a, b, acc, input_precision=knobs.language.fp32_default,
+                              max_num_imprecise_acc=None, out_dtype=acc.dtype)
+    return ttgl.tensor(dot_result.handle, acc.type)
+
+
+def _mma_scaled(a, a_scale, a_format, b, b_scale, b_format, acc, scale_fn, semantic):
+    """ Shared implementation for AMD WMMA scaled and MFMA scaled operation; scale_fn(layout, shape) gives the default scale layout. """
+
+    def _get_scale_shape(op_idx, operand, format):
+        operand_shape = [s for s in operand.type.shape]  # fresh list, so the in-place edits below stay local
+        scale_shape = operand_shape
+        unpack_factor = 2 if format.value == "e2m1" else 1  # presumably e2m1 packs two values per element — confirm
+        if op_idx == 0:
+            k = scale_shape[-1] * unpack_factor
+            scale_shape[-1] = k // 32  # one scale entry per 32 values along the last dimension
+        else:
+            k = scale_shape[-2] * unpack_factor
+            scale_shape[-2] = k // 32  # one scale entry per 32 values along the second-to-last dimension
+            scale_shape[-2], scale_shape[-1] = scale_shape[-1], scale_shape[-2]  # then swap the last two dimensions
+        return scale_shape
+
+    def _create_and_broadcast_default_scale(op_idx, scale, format):
+        operand = a if op_idx == 0 else b  # op_idx 0 selects the lhs operand, anything else the rhs
+
+        scale_shape = _get_scale_shape(op_idx, operand, format)
+        if isinstance(scale, ttgl.tensor) and scale.numel.value != 1:
+            # In the case of scale pre-shuffling, the input shape is different from the default shape. We only check
+            # the number of elements here.
+            assert math.prod(scale_shape) == scale.numel.value, "Incompatible scale shape"
+            return scale
+        # Otherwise (None, a scalar, or a one-element tensor) build a uint8 tensor filled with the scale value.
+        scale_layout = scale_fn(operand.type.layout, scale_shape)
+        scale_value = _unwrap_if_constexpr(scale)
+        scale_value = 0x7F if scale_value is None else scale_value  # NOTE(review): 0x7F presumably encodes a scale of 1.0 — confirm
+        return semantic.full(scale_shape, scale_value, ttgl.uint8, scale_layout)
+
+    a_scale = _create_and_broadcast_default_scale(0, a_scale, a_format)
+    b_scale = _create_and_broadcast_default_scale(1, b_scale, b_format)
+    output = semantic.dot_scaled(a, a_scale, a_format, b, b_scale, b_format, acc, fast_math=False, lhs_k_pack=True,
+                                 rhs_k_pack=True, out_dtype=ttgl.float32)
+    return ttgl.tensor(output.handle, acc.type)  # the result handle is re-wrapped with the accumulator's type
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/cdna3/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/cdna3/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d88a62b84a99de5f3f706cb76d078ef6a06c2a9
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/cdna3/__init__.py
@@ -0,0 +1,238 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+
+from triton import knobs
+from triton.experimental.gluon.language import _core as ttgl
+from triton._C.libtriton import ir
+from ..._core import builtin, _unwrap_if_constexpr
+
+if TYPE_CHECKING:
+    from ..._semantic import GluonSemantic
+
+__all__ = [  # names exported by `from ..cdna3 import *`; the cdna4 module extends this list
+    "buffer_atomic_add", "buffer_atomic_and", "buffer_atomic_min", "buffer_atomic_max", "buffer_atomic_or",
+    "buffer_atomic_xor", "buffer_atomic_xchg", "buffer_load", "buffer_store", "mfma"
+]
+
+_atomic_op_str_to_op = {  # maps the (prefixed) op name chosen by _verify_element_type_and_dispatch_op to ir.ATOMIC_OP
+    "smax": ir.ATOMIC_OP.MAX, "smin": ir.ATOMIC_OP.MIN, "umax": ir.ATOMIC_OP.UMAX, "umin": ir.ATOMIC_OP.UMIN, "fadd":
+    ir.ATOMIC_OP.FADD, "iadd": ir.ATOMIC_OP.ADD, "and": ir.ATOMIC_OP.AND, "or": ir.ATOMIC_OP.OR, "xor":
+    ir.ATOMIC_OP.XOR, "xchg": ir.ATOMIC_OP.XCHG
+}
+
+
+def _verify_buffer_ops(ptr, offsets, mask=None, other=None):
+    """Assert that the pointer, offsets, mask and other arguments of a buffer operation are consistent."""
+    assert ptr.type.is_ptr(), "ptr must be a scalar pointer type"
+    # Offsets must be a distributed tensor with 32-bit integer elements.
+    assert isinstance(offsets.type, ttgl.distributed_type), "expected offsets type to be a distributed_type"
+    assert offsets.dtype.is_int32() or offsets.dtype.is_uint32(), "offsets element type must be int32 or uint32"
+    # A fallback value is only accepted together with a mask.
+    assert other is None or mask is not None, "when other is not None, mask should not be None"
+
+
+def _verify_element_type_and_dispatch_op(op, elem_type, arch):
+    """Validate an op name / element type / arch combination and return its `ir.ATOMIC_OP` member."""
+    supported_types = [
+        ttgl.float16, ttgl.float32, ttgl.bfloat16, ttgl.float64, ttgl.int32, ttgl.int64, ttgl.uint32, ttgl.uint64
+    ]
+    assert elem_type in supported_types, f"{elem_type} is not supported in buffer atomic on {arch}."
+
+    if op in ['and', 'or', 'xor', 'xchg']:
+        # Bitwise ops and exchange keep their name and only accept int32/int64 elements.
+        assert elem_type in [ttgl.int32, ttgl.int64], f"{op} with {elem_type} is not supported on CDNA3 or CDNA4"
+        key = op
+    elif op in ['max', 'min']:
+        # int32/int64/float64 select the "s" variant, uint32/uint64 the "u" variant.
+        if elem_type in [ttgl.int32, ttgl.int64, ttgl.float64]:
+            key = 's' + op
+        elif elem_type in [ttgl.uint32, ttgl.uint64]:
+            key = 'u' + op
+        else:
+            raise ValueError(f"{op} with {elem_type} is not supported on CDNA3 and CDNA4")
+    elif op == 'add':
+        # uint32/uint64 select "iadd"; float16/32/64 and (on cdna4 only) bfloat16 select "fadd".
+        if elem_type in [ttgl.uint32, ttgl.uint64]:
+            key = 'i' + op
+        elif elem_type in [ttgl.float16, ttgl.float32, ttgl.float64]:
+            key = 'f' + op
+        elif elem_type is ttgl.bfloat16:
+            assert arch == "cdna4", "Buffer atomic fadd with bf16 is only supported on CDNA4 for now."
+            key = 'f' + op
+        else:
+            raise ValueError(f"{op} with {elem_type} is not supported on CDNA3 and CDNA4")
+    else:
+        raise ValueError(f"Unknown {op} on CDNA3 or CDNA4")
+
+    # Resolve the (possibly prefixed) name through the module-level dispatch table.
+    return _atomic_op_str_to_op[_unwrap_if_constexpr(key)]
+
+
+def _buffer_atomic_rmw_impl(op, ptr, offsets, value, arch, mask, sem, scope, _semantic):  # shared body of the buffer_atomic_* builtins
+    _verify_buffer_ops(ptr, offsets, mask)
+    # Translate the op name into an ir.ATOMIC_OP member based on the pointer's element type and the arch.
+    op = _verify_element_type_and_dispatch_op(op, ptr.type.scalar.element_ty, arch)
+    # An optional mask is converted to an int1 tensor and broadcast against offsets.
+    mask = _unwrap_if_constexpr(mask)
+    if mask is not None:
+        mask = _semantic.to_tensor(mask)
+        mask = _semantic.cast(mask, ttgl.int1)
+        _, mask = _semantic.broadcast_impl_value(offsets, mask)
+    mask = mask.handle if mask is not None else ir.value()  # a default-constructed ir.value() stands in for "no mask"
+    # The value is made a tensor and broadcast against offsets; it is not cast to the element type here.
+    value = _unwrap_if_constexpr(value)
+    value = _semantic.to_tensor(value)
+    _, value = _semantic.broadcast_impl_value(offsets, value)
+
+    sem = _semantic._str_to_sem(sem)
+    scope = _semantic._str_to_scope(scope)
+    return _semantic.tensor(  # the result tensor reuses the type of the broadcast value
+        _semantic.builder.create_buffer_atomic_rmw(op, ptr.handle, offsets.handle, value.handle, sem, scope, mask),
+        value.type)
+
+
+@builtin
+def buffer_load(ptr, offsets, mask=None, other=None, cache=None, _semantic=None):
+    """
+    AMD buffer load from global memory via a scalar base pointer and a tensor of
+    offsets instead of a tensor of pointers. This operation will load data
+    directly into registers.
+
+    Args:
+        ptr (pointer to scalar): Global memory scalar base pointer to load from.
+        offsets (tensor): Offsets tensor for the load operation.
+        mask (tensor, optional): Mask tensor for predicated loads. Defaults to None.
+        other (tensor or scalar, optional): Tensor or scalar providing default values for masked elements. Defaults to None.
+        cache (str, optional): Cache modifier specifier. Defaults to None, which selects ir.CACHE_MODIFIER.NONE.
+    """
+    _verify_buffer_ops(ptr, offsets, mask, other)
+
+    mask = _unwrap_if_constexpr(mask)
+    if mask is not None:
+        offsets, mask = _semantic.broadcast_impl_value(offsets, mask)
+
+    other = _unwrap_if_constexpr(other)
+    if other is not None:
+        other = _semantic.to_tensor(other)
+        other = _semantic.cast(other, ptr.dtype.element_ty)  # fallback values are cast to the pointer's element type
+        offsets, other = _semantic.broadcast_impl_value(offsets, other)
+
+    other = other.handle if other is not None else ir.value()  # a default-constructed ir.value() stands in for "absent"
+    mask = mask.handle if mask is not None else ir.value()
+    cache_modifier = _semantic._str_to_load_cache_modifier(cache) if cache is not None else ir.CACHE_MODIFIER.NONE
+
+    ret_ty = offsets.type.with_element_ty(ptr.type.scalar.element_ty)  # (broadcast) offsets type with the pointee element type
+    builder = _semantic.builder
+    handle = builder.create_buffer_load(ret_ty.to_ir(builder), ptr.handle, offsets.handle, mask, other, cache_modifier)
+    return ttgl.tensor(handle, ret_ty)
+
+
+@builtin
+def buffer_store(stored_value, ptr, offsets, mask=None, cache=None, _semantic: GluonSemantic = None):
+    """
+    AMD buffer store a tensor directly to global memory via a scalar base pointer and a tensor of
+    offsets instead of a tensor of pointers.
+    Args:
+        stored_value (tensor to be stored): The tensor to be stored to global memory.
+        ptr (pointer to scalar): Global memory scalar base pointer to store to.
+        offsets (tensor): Offsets tensor for the store operation.
+        mask (tensor, optional): Mask tensor for predicated store. Defaults to None.
+        cache (str, optional): Cache modifier specifier. Defaults to None, which selects ir.CACHE_MODIFIER.NONE.
+    """
+    _verify_buffer_ops(ptr, offsets, mask)
+    mask = _unwrap_if_constexpr(mask)  # a constexpr-wrapped None must count as "no mask", as in buffer_load
+    if mask is not None:
+        offsets, mask = _semantic.broadcast_impl_value(offsets, mask)
+
+    mask = mask.handle if mask is not None else ir.value()  # a default-constructed ir.value() stands in for "no mask"
+    cache_modifier = _semantic._str_to_store_cache_modifier(cache) if cache is not None else ir.CACHE_MODIFIER.NONE
+
+    _semantic.builder.create_buffer_store(stored_value.handle, ptr.handle, offsets.handle, mask, cache_modifier)
+
+
+@builtin
+def mfma(a, b, acc, _semantic: GluonSemantic = None):
+    """
+    Matrix multiply-accumulate `a * b + acc` on AMD native matrix core units, emitted through `_semantic.dot`.
+    Args:
+        a (tensor): Left-hand operand of the product.
+        b (tensor): Right-hand operand of the product.
+        acc (tensor): Accumulator added to the product; must not be None.
+    """
+    assert acc is not None, "acc is required"
+    result_type = acc.type
+    accumulator = ttgl._unwrap_if_constexpr(acc)
+    # The accumulator's dtype is requested as the output dtype; the result is typed like the original acc.
+    product = _semantic.dot(a, b, accumulator, input_precision=knobs.language.fp32_default,
+                            max_num_imprecise_acc=None, out_dtype=accumulator.dtype)
+    return ttgl.tensor(product.handle, result_type)
+
+
+"""
+AMD Buffer Atomic RMW operations.
+The supported operations are max, min, add, and, or, xor, xchg.
+Similar to normal atomic ops: it loads data at `ptr` plus `offsets`, does `op` with `value`, and stores the result to `ptr` plus `offsets` with
+the specified memory semantics and scope.
+
+Buffer atomics access global memory via a scalar base pointer and a tensor of offsets instead of a tensor of pointers.
+Similar to other buffer ops, the `mask` is a boolean vector that determines if a given element should be processed with
+the atomic RMW op. Elements with `mask[i] == 0` are dropped (i.e., the atomic is not executed).
+
+Buffer Atomic RMW ops return the pre-op value in the global memory.
+
+Args:
+    ptr (pointer to scalar): Global memory scalar base pointer to load from.
+    offsets (tensor): Offsets tensor for the load operation.
+    value (tensor): Another operand of `op`.
+    mask (tensor, optional): Mask tensor for predicated loads. Defaults to None.
+    sem (str, optional): Memory Semantic Descriptor. Default is None which means acq_rel memory semantic.
+    scope (str, optional): Memory Sync Scope for atomic accesses. Default is None and it will be mapped to `gpu`, which is called `agent` for AMDGPU. Please ref https://llvm.org/docs/AMDGPUUsage.html#memory-model-gfx942 for details.
+"""
+
+
+@builtin
+def buffer_atomic_max(ptr, offsets, value, mask=None, sem=None, scope=None, _semantic=None):
+    """Buffer atomic max; forwards to `_buffer_atomic_rmw_impl` with arch "cdna3"."""
+    return _buffer_atomic_rmw_impl('max', ptr, offsets, value, "cdna3", mask, sem, scope, _semantic)
+
+
+@builtin
+def buffer_atomic_min(ptr, offsets, value, mask=None, sem=None, scope=None, _semantic=None):
+    """Buffer atomic min; forwards to `_buffer_atomic_rmw_impl` with arch "cdna3"."""
+    # Arguments are passed positionally, in the parameter order of `_buffer_atomic_rmw_impl`.
+    return _buffer_atomic_rmw_impl('min', ptr, offsets, value, "cdna3", mask, sem, scope, _semantic)
+
+
+@builtin
+def buffer_atomic_add(ptr, offsets, value, mask=None, sem=None, scope=None, _semantic=None):
+    """Buffer atomic add; forwards to `_buffer_atomic_rmw_impl` with arch "cdna3"."""
+    # Arguments are passed positionally, in the parameter order of `_buffer_atomic_rmw_impl`.
+    return _buffer_atomic_rmw_impl('add', ptr, offsets, value, "cdna3", mask, sem, scope, _semantic)
+
+
+@builtin
+def buffer_atomic_and(ptr, offsets, value, mask=None, sem=None, scope=None, _semantic=None):
+    """Buffer atomic bitwise and; forwards to `_buffer_atomic_rmw_impl` with arch "cdna3"."""
+    # Arguments are passed positionally, in the parameter order of `_buffer_atomic_rmw_impl`.
+    return _buffer_atomic_rmw_impl('and', ptr, offsets, value, "cdna3", mask, sem, scope, _semantic)
+
+
+@builtin
+def buffer_atomic_or(ptr, offsets, value, mask=None, sem=None, scope=None, _semantic=None):
+    """Buffer atomic bitwise or; forwards to `_buffer_atomic_rmw_impl` with arch "cdna3"."""
+    # Arguments are passed positionally, in the parameter order of `_buffer_atomic_rmw_impl`.
+    return _buffer_atomic_rmw_impl('or', ptr, offsets, value, "cdna3", mask, sem, scope, _semantic)
+
+
+@builtin
+def buffer_atomic_xor(ptr, offsets, value, mask=None, sem=None, scope=None, _semantic=None):
+    """Buffer atomic bitwise xor; forwards to `_buffer_atomic_rmw_impl` with arch "cdna3"."""
+    # Arguments are passed positionally, in the parameter order of `_buffer_atomic_rmw_impl`.
+    return _buffer_atomic_rmw_impl('xor', ptr, offsets, value, "cdna3", mask, sem, scope, _semantic)
+
+
+@builtin
+def buffer_atomic_xchg(ptr, offsets, value, mask=None, sem=None, scope=None, _semantic=None):
+    """Buffer atomic exchange; forwards to `_buffer_atomic_rmw_impl` with arch "cdna3"."""
+    # Arguments are passed positionally, in the parameter order of `_buffer_atomic_rmw_impl`.
+    return _buffer_atomic_rmw_impl('xchg', ptr, offsets, value, "cdna3", mask, sem, scope, _semantic)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/cdna3/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/cdna3/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0170086ccd33c8dd9833df624178542b76788e43
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/cdna3/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/cdna4/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/cdna4/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ba53d2ed0b1f34f3a464c261fd09d9ecdbb057d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/cdna4/__init__.py
@@ -0,0 +1,130 @@
+from triton.runtime.jit import constexpr_function
+from triton._C.libtriton.gluon_ir import get_amd_mfma_scale_layout as _get_mfma_scale_layout
+
+from ..._core import builtin
+from ..._layouts import DotOperandLayout
+from .._layouts import AMDMFMALayout
+from .._ops import _mma_scaled
+from ..cdna3 import _buffer_atomic_rmw_impl
+from ..cdna3 import *  # NOQA: F403
+from ..cdna3 import __all__ as __cdna3_all
+from . import async_copy
+
+__all__ = [*__cdna3_all, "async_copy", "mfma_scaled", "get_mfma_scale_layout"]
+
+
+@builtin
+def mfma_scaled(a, a_scale, a_format, b, b_scale, b_format, acc, _semantic=None):
+    """
+    AMD Scaled MFMA operation.
+
+    ```
+    c = a * a_scale @ b * b_scale + acc
+    ```
+
+    `a` and `b` use microscaling formats described in
+    "OCP Microscaling Formats (MX) Specification":
+    https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf.
+    Currently supported only on CDNA4 hardware.
+
+    Args:
+        a (tensor): The operand A to be multiplied. Its layout must be a DotOperandLayout whose parent equals acc's layout.
+        a_scale (Optional[tensor]): Scale factor for operand A.
+        a_format (str): Format of the operand A. Available formats: `e2m1`, `e4m3`, `e5m2`.
+        b (tensor): The operand B to be multiplied. Its layout must be a DotOperandLayout whose parent equals acc's layout.
+        b_scale (Optional[tensor]): Scale factor for operand B.
+        b_format (str): Format of the operand B. Available formats: `e2m1`, `e4m3`, `e5m2`.
+        acc (tensor): Accumulator tensor. Its layout must be an AMDMFMALayout.
+    """
+    layout = acc.type.layout
+    assert isinstance(layout, AMDMFMALayout), "Expected layout to be an instance of AMDMFMALayout"
+    assert (isinstance(a.type.layout, DotOperandLayout) and a.type.layout.parent== layout), \
+            "Expected lhs layout to be a DotOperandLayout with parent matching MFMA layout"
+    assert (isinstance(b.type.layout, DotOperandLayout) and b.type.layout.parent == layout), \
+            "Expected rhs layout to be a DotOperandLayout with parent matching MFMA layout"
+    # The formats are read through `.value`, so they are expected to be wrapped values rather than bare strings.
+    assert a_format.value in {"e2m1", "e4m3", "e5m2"}, f"Unsupported lhs_format: {a_format.value}"
+    assert b_format.value in {"e2m1", "e4m3", "e5m2"}, f"Unsupported rhs_format: {b_format.value}"
+    # The shared implementation uses get_mfma_scale_layout to build the layout of default scales.
+    return _mma_scaled(a, a_scale, a_format, b, b_scale, b_format, acc, get_mfma_scale_layout, _semantic)
+
+
+def _get_mfma_scale_layout_impl(*args, **kwargs):  # pass-through wrapper around the libtriton binding
+    return _get_mfma_scale_layout(*args, **kwargs)
+
+
+_get_mfma_scale_layout_impl.__triton_builtin__ = True  # NOTE(review): presumably marks the wrapper as a builtin — confirm
+
+
+@constexpr_function
+def get_mfma_scale_layout(dot_operand_layout, shape):
+    """ Compute the layout of a scale tensor that accompanies an MFMA scaled operand.
+
+    Args:
+        dot_operand_layout (DotOperandLayout): Layout of the operand; its parent must be an AMDMFMALayout.
+        shape (List[int]): The shape of the scale tensor.
+
+    Return:
+        layout (DistributedLinearLayout): The scale layout.
+    """
+    op_index = dot_operand_layout.operand_index
+    mfma = dot_operand_layout.parent
+    assert isinstance(mfma, AMDMFMALayout), "Expected parent to be an instance of AMDMFMALayout"
+    # Only M of the instruction shape and the two tiling lists are forwarded from the parent layout.
+    return _get_mfma_scale_layout_impl(
+        op_index, shape, mfma.instr_shape[0], mfma.tiles_per_warp, mfma.warps_per_cta,
+    )
+
+
+"""
+buffer_atomic_rmw of cdna4 shares the same signature and functionalities as cdna3.buffer_atomic_rmw.
+The cdna4 version additionally supports `fadd` with `bf16`.
+"""
+
+
+@builtin
+def buffer_atomic_max(ptr, offsets, value, mask=None, sem=None, scope=None, _semantic=None):
+    """Buffer atomic max; forwards to `_buffer_atomic_rmw_impl` with arch "cdna4"."""
+    return _buffer_atomic_rmw_impl('max', ptr, offsets, value, "cdna4", mask, sem, scope, _semantic)
+
+
+@builtin
+def buffer_atomic_min(ptr, offsets, value, mask=None, sem=None, scope=None, _semantic=None):
+    """Buffer atomic min; forwards to `_buffer_atomic_rmw_impl` with arch "cdna4"."""
+    # Arguments are passed positionally, in the parameter order of `_buffer_atomic_rmw_impl`.
+    return _buffer_atomic_rmw_impl('min', ptr, offsets, value, "cdna4", mask, sem, scope, _semantic)
+
+
+@builtin
+def buffer_atomic_add(ptr, offsets, value, mask=None, sem=None, scope=None, _semantic=None):
+    """Buffer atomic add; forwards to `_buffer_atomic_rmw_impl` with arch "cdna4"."""
+    # Arguments are passed positionally, in the parameter order of `_buffer_atomic_rmw_impl`.
+    return _buffer_atomic_rmw_impl('add', ptr, offsets, value, "cdna4", mask, sem, scope, _semantic)
+
+
+@builtin
+def buffer_atomic_and(ptr, offsets, value, mask=None, sem=None, scope=None, _semantic=None):
+    """Buffer atomic bitwise and; forwards to `_buffer_atomic_rmw_impl` with arch "cdna4"."""
+    # Arguments are passed positionally, in the parameter order of `_buffer_atomic_rmw_impl`.
+    return _buffer_atomic_rmw_impl('and', ptr, offsets, value, "cdna4", mask, sem, scope, _semantic)
+
+
+@builtin
+def buffer_atomic_or(ptr, offsets, value, mask=None, sem=None, scope=None, _semantic=None):
+    """Buffer atomic bitwise or; forwards to `_buffer_atomic_rmw_impl` with arch "cdna4"."""
+    # Arguments are passed positionally, in the parameter order of `_buffer_atomic_rmw_impl`.
+    return _buffer_atomic_rmw_impl('or', ptr, offsets, value, "cdna4", mask, sem, scope, _semantic)
+
+
+@builtin
+def buffer_atomic_xor(ptr, offsets, value, mask=None, sem=None, scope=None, _semantic=None):
+    """Buffer atomic bitwise xor; forwards to `_buffer_atomic_rmw_impl` with arch "cdna4"."""
+    # Arguments are passed positionally, in the parameter order of `_buffer_atomic_rmw_impl`.
+    return _buffer_atomic_rmw_impl('xor', ptr, offsets, value, "cdna4", mask, sem, scope, _semantic)
+
+
+@builtin
+def buffer_atomic_xchg(ptr, offsets, value, mask=None, sem=None, scope=None, _semantic=None):
+    """Buffer atomic exchange; forwards to `_buffer_atomic_rmw_impl` with arch "cdna4"."""
+    # Arguments are passed positionally, in the parameter order of `_buffer_atomic_rmw_impl`.
+    return _buffer_atomic_rmw_impl('xchg', ptr, offsets, value, "cdna4", mask, sem, scope, _semantic)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/cdna4/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/cdna4/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..71ca39375d8b378d9d9db562293f2a2912f2cf97
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/cdna4/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/cdna4/__pycache__/async_copy.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/cdna4/__pycache__/async_copy.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d26d70a0ac113d04a6d7eba6812cd099d2a9371e
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/cdna4/__pycache__/async_copy.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/cdna4/async_copy.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/cdna4/async_copy.py
new file mode 100644
index 0000000000000000000000000000000000000000..009707c77924dba04bc2418cc536e67b5486bd21
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/cdna4/async_copy.py
@@ -0,0 +1,170 @@
+from ..._core import ir, builtin, _unwrap_if_constexpr
+from ..._semantic import _check
+from ..._layouts import BlockedLayout, SliceLayout
+from ..cdna3 import _verify_buffer_ops
+
+__all__ = [
+    "global_load_to_shared",
+    "buffer_load_to_shared",
+    "commit_group",
+    "wait_group",
+    "load_shared_relaxed",
+]
+
+
+@builtin
+def global_load_to_shared(dest, ptr, mask=None, other=None, cache_modifier="", _semantic=None):
+    """
+    AMD global load to shared operation. This operation loads data directly
+    from global memory to shared memory without going through registers. It
+    happens asynchronously and requires a subsequent `wait_group` to ensure the
+    data is available in shared memory. Note that this operation does still
+    complete in order with ttgl.loads/stores or buffer_loads/stores on CDNA4,
+    so interleaving with them will hurt performance.
+
+    Compared to `buffer_load_to_shared`, it requires a tensor pointer which
+    supports 64-bit indexing range for each thread in a block, which gives more
+    flexibility, but at the cost of higher register pressure and no hardware
+    out-of-bound masking support. Prefer to use `buffer_load_to_shared` when
+    possible for better performance.
+
+    The underlying hardware instruction uses separate registers for global
+    memory address for each thread but the same register for local memory
+    address for the whole warp. Therefore, while using this operation
+    the following conditions must be met or lowering to LLVM will fail:
+
+    - For the `ptr` layout, size per thread * bits per element must be 128 or 32.
+      To get ideal performance, it is recommended to use 128 bits per element.
+    - Writes to `dest` must be coalesced.
+    - If `dest` is swizzled, it only can be swizzled within warp boundary.
+
+    Args:
+        dest (shared_memory_descriptor): Destination shared memory descriptor.
+        ptr (pointer tensor): Tensor of pointers to global memory to load from.
+        mask (tensor, optional): Mask tensor for predicated loads. Defaults to None.
+        other (tensor or scalar, optional): Tensor or scalar providing default values for masked elements. Defaults to None.
+        cache_modifier (str): Cache modifier specifier. Defaults to "".
+    """
+    _check(ptr.type.is_block(), lambda: "expected ptr to be a tensor")
+    _check(isinstance(ptr.type.layout, (BlockedLayout, SliceLayout)),
+           lambda: "expected ptr type layout to be BlockedLayout or SliceLayout")
+    _check(
+        dest.shape == ptr.shape, lambda:
+        f"expected dest shape to match pointer shape but got dest.shape = {dest.shape}, pointer.shape = {ptr.shape}")
+
+    mask = _unwrap_if_constexpr(mask)
+    if mask is not None:
+        ptr, mask = _semantic.broadcast_impl_value(ptr, mask)
+    other = _unwrap_if_constexpr(other)
+    if other is not None:
+        other = _semantic.to_tensor(other)
+        other = _semantic.cast(other, ptr.dtype.element_ty)
+        ptr, other = _semantic.broadcast_impl_value(ptr, other)
+
+    cache_modifier = _semantic._str_to_load_cache_modifier(cache_modifier)
+    mask_handle = mask.handle if mask is not None else ir.value()
+    other_handle = other.handle if other is not None else ir.value()
+    _semantic.builder.create_async_copy_global_to_local(dest.handle, ptr.handle, mask_handle, other_handle,
+                                                        cache_modifier, ir.EVICTION_POLICY.NORMAL, False)
+
+
+@builtin
+def buffer_load_to_shared(dest, ptr, offsets, mask=None, other=None, cache_modifier="", _semantic=None):
+    """
+    AMD buffer load to shared operation. Buffer load is similar to global load
+    but it accesses global memory via a scalar base pointer and a tensor of
+    32-bit offsets instead of a tensor of pointers. This operation loads data
+    directly from global memory to shared memory without going through
+    registers. It happens asynchronously and requires a subsequent `wait_group`
+    to ensure the data is available in shared memory. Note that this operation
+    does still complete in order with ttgl.loads/stores or buffer_loads/stores
+    on CDNA4, so interleaving with them will hurt performance.
+
+    Compared to `global_load_to_shared`, it has better performance and also
+    supports hardware out-of-bound masking. But it strictly requires a
+    32-bit offset instead of a 64-bit tensor pointer.
+
+    The underlying hardware instruction uses separate registers for global
+    memory address for each thread but the same register for local memory
+    address for the whole warp. Therefore, while using this operation
+    the following conditions must be met or lowering to LLVM will fail:
+
+    - For the `offsets` layout, size per thread * bits per element must be 128 or 32.
+      To get ideal performance, it is recommended to use 128 bits per element.
+    - Writes to `dest` must be coalesced.
+    - If `dest` is swizzled, it only can be swizzled within warp boundary.
+
+    Args:
+        dest (shared_memory_descriptor): Destination shared memory descriptor.
+        ptr (pointer to scalar): Global memory scalar base pointer to load from.
+        offsets (tensor): Offsets tensor for the load operation.
+        mask (tensor, optional): Mask tensor for predicated loads. Defaults to None.
+        other (tensor or scalar, optional): Tensor or scalar providing default values for masked elements. Defaults to None.
+        cache_modifier (str): Cache modifier specifier. Defaults to "".
+    """
+    _check(isinstance(offsets.type.layout, (BlockedLayout, SliceLayout)),
+           lambda: "expected offsets type layout to be BlockedLayout or SliceLayout")
+    _verify_buffer_ops(ptr, offsets, mask, other)
+
+    mask = _unwrap_if_constexpr(mask)
+    if mask is not None:
+        offsets, mask = _semantic.broadcast_impl_value(offsets, mask)
+    other = _unwrap_if_constexpr(other)
+    if other is not None:
+        other = _semantic.to_tensor(other)
+        other = _semantic.cast(other, ptr.type.scalar.element_ty)
+        offsets, other = _semantic.broadcast_impl_value(offsets, other)
+
+    mask = mask.handle if mask is not None else ir.value()
+    other = other.handle if other is not None else ir.value()
+    stride = ir.value()
+    cache_modifier = _semantic._str_to_load_cache_modifier(cache_modifier)
+
+    _semantic.builder.create_buffer_load_to_local(dest.handle, ptr.handle, offsets.handle, mask, other, stride,
+                                                  cache_modifier)
+
+
+@builtin
+def commit_group(_semantic=None):
+    """
+    Commit outstanding async operations.
+
+    This finalizes a set of async copy operations which can be waited upon via `wait_group`.
+    """
+    _semantic.builder.create_async_commit_group()
+
+
+@builtin
+def wait_group(num_outstanding=0, _semantic=None):
+    """
+    Wait for outstanding commit groups. It will block until the number of
+    outstanding commit groups is less than or equal to `num_outstanding`. Note that uncommitted
+    async operations will be waited upon even if `num_outstanding` is 0.
+
+    Args:
+        num_outstanding (int): The number of outstanding commit groups to wait for. Defaults to 0.
+    """
+    num_outstanding = _unwrap_if_constexpr(num_outstanding)
+    _semantic.builder.create_async_wait_group(num_outstanding)
+
+
+@builtin
+def load_shared_relaxed(smem, layout, _semantic=None):
+    """
+    Read a tensor out of shared memory and tag the load with the
+    `ttg.amdg.syncedViaAsyncWait` attribute, a hint meant to let the
+    compiler avoid emitting unnecessary waits before that load.
+
+    Args:
+        smem (shared_memory_descriptor): Shared memory descriptor to load from.
+        layout (DistributedLayout): The destination layout of the tensor.
+
+    Returns:
+        tensor: A Gluon tensor containing the loaded data.
+    """
+    dst_layout = _unwrap_if_constexpr(layout)
+    loaded = _semantic.shared_load(smem, dst_layout)
+    # Attach the hint as a boolean `True` attribute on the load's IR handle.
+    synced_attr = _semantic.builder.get_bool_attr(True)
+    loaded.handle.set_attr("ttg.amdg.syncedViaAsyncWait", synced_attr)
+    return loaded
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c32f877f590e6821a5fec381c8c55ece8ccbc54
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/__init__.py
@@ -0,0 +1,96 @@
+from triton.runtime.jit import constexpr_function
+from triton._C.libtriton.gluon_ir import get_amd_wmma_scale_layout as _get_wmma_scale_layout
+
+from ..._core import builtin
+from .._ops import _wmma, _verify_wmma, _mma_scaled
+from .._layouts import AMDWMMALayout
+from ..cdna3 import buffer_load, buffer_store
+from . import tdm
+from . import async_copy
+from . import mbarrier
+
+__all__ = [
+    "async_copy", "tdm", "mbarrier", "wmma", "wmma_scaled", "buffer_load", "buffer_store", "get_wmma_scale_layout"
+]
+
+
+@builtin
+def wmma(a, b, acc, _semantic=None):
+    """Computes a * b + acc with the AMD WMMA instruction by forwarding to
+    `_wmma` with 3 as its first argument; that call's result is returned.
+
+    Args:
+        a (tensor): The operand a to be multiplied.
+        b (tensor): The operand b to be multiplied.
+        acc (tensor): The accumulator tensor.
+    """
+    return _wmma(3, a, b, acc, _semantic)
+
+
+@builtin
+def wmma_scaled(a, a_scale, a_format, b, b_scale, b_format, acc, _semantic=None):
+    """
+    AMD Scaled WMMA operation.
+
+    ```
+    c = a * a_scale @ b * b_scale + acc
+    ```
+
+    `a` and `b` use microscaling formats described in
+    "OCP Microscaling Formats (MX) Specification":
+    https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf.
+
+    Args:
+        a (tensor): The operand A to be multiplied.
+        a_scale (Optional[tensor]): Scale factor for operand A.
+        a_format (str): Format of the operand A. Available formats: `e2m1`, `e4m3`, `e5m2`.
+        b (tensor): The operand B to be multiplied.
+        b_scale (Optional[tensor]): Scale factor for operand B.
+        b_format (str): Format of the operand B. Available formats: `e2m1`, `e4m3`, `e5m2`.
+        acc (tensor): Accumulator tensor.
+    """
+    _verify_wmma(3, a, b, acc)
+    if a_format.value == "e2m1":
+        wmma_layout = a.type.layout.parent
+        assert isinstance(wmma_layout, AMDWMMALayout) and wmma_layout.instr_shape == [16, 16, 64], \
+            "e2m1 format expects instr_shape to be [16, 16, 64]"
+    if b_format.value == "e2m1":
+        wmma_layout = b.type.layout.parent
+        assert isinstance(wmma_layout, AMDWMMALayout) and wmma_layout.instr_shape == [16, 16, 64], \
+            "e2m1 format expects instr_shape to be [16, 16, 64]"
+
+    acc_layout = acc.type.layout
+    assert isinstance(acc_layout, AMDWMMALayout) and acc_layout.instr_shape == [16, 16, 128], \
+        "accumulator tensor's layout must be an AMDWMMALayout with instr_shape [16, 16, 128]"
+
+    assert a_format.value in {"e2m1", "e4m3", "e5m2"}, f"Unsupported lhs_format: {a_format.value}"
+    assert b_format.value in {"e2m1", "e4m3", "e5m2"}, f"Unsupported rhs_format: {b_format.value}"
+
+    return _mma_scaled(a, a_scale, a_format, b, b_scale, b_format, acc, get_wmma_scale_layout, _semantic)
+
+
+def _get_wmma_scale_layout_impl(*args, **kwargs):
+    return _get_wmma_scale_layout(*args, **kwargs)  # plain pass-through of all arguments
+
+
+_get_wmma_scale_layout_impl.__triton_builtin__ = True  # NOTE(review): presumably marks the wrapper as a Triton builtin — confirm
+
+
+@constexpr_function
+def get_wmma_scale_layout(dot_operand_layout, shape):
+    """ Get the scale layout for WMMA scaled operands.
+
+    Args:
+        dot_operand_layout (DotOperandLayout): The dot operand layout; its parent must be an AMDWMMALayout.
+        shape (List[int]): The shape of the scale tensor.
+
+    Return:
+        layout (DistributedLinearLayout): The scale layout.
+    """
+    op_idx = dot_operand_layout.operand_index
+    parent = dot_operand_layout.parent
+    assert isinstance(parent, AMDWMMALayout), "Expected parent to be an instance of AMDWMMALayout"
+    mdim = parent.instr_shape[0]
+    tiles_per_warp = parent.tiles_per_warp
+    warps_per_cta = parent.warps_per_cta
+    return _get_wmma_scale_layout_impl(op_idx, shape, mdim, tiles_per_warp, warps_per_cta)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2bb359f66eeda09aa7cdcbcc0392af1664016844
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/__pycache__/async_copy.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/__pycache__/async_copy.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fca0713dd32debdb53b9c53b5f8c74e8f6fdfbc3
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/__pycache__/async_copy.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/__pycache__/mbarrier.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/__pycache__/mbarrier.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..08134cd52ecc5a9098937ba5b73d2d894fa74778
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/__pycache__/mbarrier.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/__pycache__/tdm.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/__pycache__/tdm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dbd861c9e9ca77193a6e6a8c8b2174507a034e21
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/__pycache__/tdm.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/async_copy.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/async_copy.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfba91356bb1fefc7760608716954f3b0a2d83a3
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/async_copy.py
@@ -0,0 +1,51 @@
+from ..._core import ir, builtin, _unwrap_if_constexpr
+from ..._semantic import _check
+from triton.experimental.gluon.language._layouts import DistributedLayout
+from ..cdna4.async_copy import commit_group, wait_group
+
+__all__ = ["global_to_shared", "commit_group", "wait_group", "mbarrier_arrive"]
+
+
+@builtin
+def global_to_shared(smem, pointer, mask=None, other=None, cache_modifier="", _semantic=None):
+    """
+    Asynchronously copy elements from global memory to shared memory.
+    Requires manual synchronization via `wait_group` before accessing the loaded data.
+
+    Args:
+        smem (shared_memory_descriptor): Destination shared memory descriptor.
+        pointer (tensor): Source pointer tensor.
+        mask (tensor, optional): Mask tensor for predicated loads. Defaults to None.
+        other (tensor or scalar, optional): Tensor or scalar providing default values for masked elements. Defaults to None(0).
+        cache_modifier (str): Cache modifier specifier. Defaults to "".
+    """
+    _check(pointer.type.is_block(), lambda: "expected ptr to be a tensor")
+    _check(isinstance(pointer.type.layout, DistributedLayout),
+           lambda: "expected ptr type layout to be a DistributedLayout")
+    _check(
+        smem.shape == pointer.shape, lambda:
+        f"expected smem shape to match pointer shape but got smem.shape = {smem.shape}, pointer.shape = {pointer.shape}"
+    )
+    mask = _unwrap_if_constexpr(mask)
+    if mask is not None:
+        pointer, mask = _semantic.broadcast_impl_value(pointer, mask)
+    other = _unwrap_if_constexpr(other)
+    if other is not None:
+        other = _semantic.to_tensor(other)
+        other = _semantic.cast(other, pointer.dtype.element_ty)
+        pointer, other = _semantic.broadcast_impl_value(pointer, other)
+    cache_modifier = _semantic._str_to_load_cache_modifier(cache_modifier)
+    mask_handle = mask.handle if mask is not None else ir.value()
+    other_handle = other.handle if other is not None else ir.value()
+    _semantic.builder.create_async_copy_global_to_local(smem.handle, pointer.handle, mask_handle, other_handle,
+                                                        cache_modifier, ir.EVICTION_POLICY.NORMAL, False)  # eviction policy is fixed to NORMAL
+
+
+@builtin
+def mbarrier_arrive(mbarrier, _semantic=None):
+    """Arrive on the mbarrier once all outstanding async copies are complete.
+
+    Args:
+        mbarrier (shared_memory_descriptor): Barrier object to arrive on.
+    """
+    _semantic.builder.create_async_copy_lds_barrier_arrive(mbarrier.handle)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/mbarrier.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/mbarrier.py
new file mode 100644
index 0000000000000000000000000000000000000000..f69d3005fbbd727c79ce1321c433e8fc1fabcff7
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/mbarrier.py
@@ -0,0 +1,67 @@
+import triton.experimental.gluon.language._core as ttgl
+from triton.experimental.gluon.language._layouts import SwizzledSharedLayout
+from triton.experimental.gluon.language._core import builtin, _unwrap_if_constexpr
+
+__all__ = ["MBarrierLayout", "init", "wait", "arrive"]
+
+
+class MBarrierLayout(SwizzledSharedLayout):
+    """
+    Layout for mbarrier synchronization.
+
+    Args:
+        cga_layout (List[List[int]]): CTA layout bases. Defaults to None, which (like any falsy value) becomes [].
+    """
+
+    def __init__(self, cga_layout=None):
+        super().__init__(vec=1, per_phase=1, max_phase=1, order=[0], cga_layout=cga_layout or [])
+
+
+@builtin
+def init(mbarrier, count, _semantic=None):
+    """
+    Set up an mbarrier with the given count. An mbarrier tracks an init count, a pending count and a phase;
+    initialization stores 'count' into both the init count and the pending count and sets the phase to 0.
+
+    Args:
+        mbarrier (shared_memory_descriptor): The barrier object to initialize.
+        count (int): The initial count for the barrier. Must be a positive integer.
+    """
+    init_count = _unwrap_if_constexpr(count)
+    _semantic.builder.create_lds_barrier_init(mbarrier.handle, init_count)
+
+
+@builtin
+def wait(mbarrier, phase, _semantic=None):
+    """
+    Block until the mbarrier's phase is no longer equal to `phase`,
+    i.e. until the given 'phase' has completed.
+
+    Args:
+        mbarrier (shared_memory_descriptor): The barrier object to wait on.
+        phase (int): The phase value to compare against. The wait completes when
+        the barrier's phase becomes different from this value.
+    """
+    phase_value = _semantic.to_tensor(phase)
+    barrier_handle = mbarrier.handle
+    _semantic.builder.create_lds_barrier_wait(barrier_handle, phase_value.handle)
+
+
+@builtin
+def arrive(mbarrier, *, count=1, _semantic=None):
+    """
+    Signal arrival at an mbarrier. `count` must be at least 1; the barrier's pending
+    arrival count is decreased by that amount. When the pending count reaches zero, the
+    phase changes (is decremented in a wraparound manner) and the pending count is reloaded
+    from the init count. The phase the mbarrier had before this "arrive" is returned.
+
+    Args:
+        mbarrier (shared_memory_descriptor): Barrier to be signalled.
+        count (int): Count to arrive with. Defaults to 1.
+
+    Returns:
+        prior phase (int): phase of mbarrier, prior to "arrive" operation.
+    """
+    arrive_count = _unwrap_if_constexpr(count)
+    prior_phase = _semantic.builder.create_lds_barrier_arrive(mbarrier.handle, arrive_count)
+    return ttgl.tensor(prior_phase, ttgl.int32)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/tdm.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/tdm.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7ec8b04a22bbd4df8f9aee392dbec01ddd4bc60
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/gfx1250/tdm.py
@@ -0,0 +1,171 @@
+from __future__ import annotations
+from typing import List, Tuple, TYPE_CHECKING
+from dataclasses import dataclass
+
+import triton.experimental.gluon.language._core as ttgl
+from triton.experimental.gluon.language._layouts import PaddedSharedLayout, SwizzledSharedLayout
+from triton.experimental.gluon.language._core import builtin, _unwrap_if_constexpr
+
+if TYPE_CHECKING:
+    from triton._C import ir
+    from triton.experimental.gluon.language._core import shared_memory_descriptor
+
+__all__ = ["async_load", "async_store", "async_wait", "make_tensor_descriptor", "tensor_descriptor", "tensor_descriptor_type"]
+
+
+@dataclass(eq=True)
+class tensor_descriptor_type(ttgl.base_type):
+    """Type of a tensor descriptor: block type, shape/strides tuple types and a shared-memory layout."""
+
+    block_type: ttgl.block_type
+    shape_type: ttgl.tuple_type
+    strides_type: ttgl.tuple_type
+    layout: PaddedSharedLayout | SwizzledSharedLayout
+
+    def __str__(self) -> str:
+        return "tensor_descriptor<{}, {}>".format(self.block_type, self.layout)
+
+    def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[tensor_descriptor, int]:
+        desc_handle = handles[cursor]
+        # The shape handles, then the strides handles, follow the descriptor handle.
+        shape, cursor = self.shape_type._unflatten_ir(handles, cursor + 1)
+        strides, cursor = self.strides_type._unflatten_ir(handles, cursor)
+        descriptor = tensor_descriptor(desc_handle, shape, strides, self)
+        return descriptor, cursor
+
+    def _to_ir(self, builder: ir.builder) -> ir.type:
+        block_ty = self.block_type
+        signed = block_ty.element_ty.is_int_signed()
+        block_ir = block_ty.to_ir(builder)
+        layout_ir = self.layout._to_ir(builder)
+        # Argument order: block IR type, signedness of the element type, layout IR.
+        return builder.get_tensor_descriptor_layout_type(block_ir, signed, layout_ir)
+
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        out.append(self._to_ir(builder))
+        for component_type in (self.shape_type, self.strides_type):
+            component_type._flatten_ir_types(builder, out)
+
+    def mangle(self) -> str:
+        return "TD" + "_".join(part.mangle() for part in (self.block_type, self.shape_type, self.strides_type, self.layout)) + "TD"
+
+
+@dataclass
+class tensor_descriptor(ttgl.base_value):
+    """Descriptor value for a tensor in global memory: IR handle, shape, strides and descriptor type."""
+
+    handle: ir.value
+    shape: ttgl.tuple
+    strides: ttgl.tuple
+    type: tensor_descriptor_type
+
+    def _flatten_ir(self, handles: List[ir.value]) -> None:
+        handles.append(self.handle)
+        for component in (self.shape, self.strides):
+            component._flatten_ir(handles)
+
+    @property
+    def block_type(self):
+        return self.type.block_type
+
+    @property
+    def block_shape(self):
+        return self.block_type.shape
+
+    @property
+    def dtype(self):
+        return self.block_type.element_ty
+
+    @property
+    def layout(self):
+        return self.type.layout
+
+
+@builtin
+def make_tensor_descriptor(base: ttgl.tensor, shape: List[ttgl.constexpr | ttgl.tensor],
+                           strides: List[ttgl.constexpr | ttgl.tensor], block_shape: List[ttgl.constexpr],
+                           layout: PaddedSharedLayout | SwizzledSharedLayout, _semantic=None) -> tensor_descriptor:
+    """Make a tensor descriptor object.
+
+    Args:
+        base (tensor): base pointer of the tensor in global memory.
+        shape (List[int]): shape of the tensor.
+        strides (List[int]): strides of the tensor.
+        block_shape (List[int]): block shape of the tensor.
+        layout (PaddedSharedLayout | SwizzledSharedLayout): the layout of the tensor in shared memory.
+
+    Returns:
+        tensor_descriptor: the created tensor descriptor object
+    """
+    ndim = len(shape)
+    assert 1 <= ndim <= 5, f"Expected 1 <= ndim <= 5 but got {ndim} dimensions"
+    assert len(strides) == ndim, f"Expected {ndim} strides but got {len(strides)}"
+    assert len(block_shape) == ndim, f"Expected block_shape to have {ndim} dimensions but got {len(block_shape)}"
+    assert isinstance(base.dtype, ttgl.pointer_type), "Expected base to be a pointer"
+
+    layout = _unwrap_if_constexpr(layout)
+    assert isinstance(layout, (PaddedSharedLayout, SwizzledSharedLayout)), \
+        "Expected layout to be a PaddedSharedLayout or SwizzledSharedLayout"
+    if isinstance(layout, SwizzledSharedLayout):
+        assert layout.max_phase == 1, "Expected max_phase to be 1 for SwizzledSharedLayout"
+
+    base_handle = base.handle
+    shape_handles = _semantic._convert_to_ir_values(shape, require_i64=False)  # i32 shape
+    stride_handles = _semantic._convert_to_ir_values(strides, require_i64=True)  # i64 stride
+
+    shape = ttgl.tuple(shape)
+    strides = ttgl.tuple(strides)
+    block_type = ttgl.block_type(base.type.element_ty, block_shape)
+    desc_type = tensor_descriptor_type(block_type, shape.type, strides.type, layout)
+
+    padding = _semantic._str_to_padding_option("zero")
+    handle = _semantic.builder.create_make_tensor_descriptor(desc_type._to_ir(_semantic.builder), base_handle,
+                                                             shape_handles, stride_handles, padding)
+
+    return tensor_descriptor(handle, shape, strides, desc_type)
+
+
+@builtin
+def async_load(src: tensor_descriptor, offsets: List[ttgl.constexpr | ttgl.tensor], dest: shared_memory_descriptor,
+               pred: bool = True, mbarrier: shared_memory_descriptor = None, _semantic=None) -> None:
+    """Asynchronously copy the block described by a tensor descriptor from global memory into shared memory.
+
+    Args:
+        src (tensor_descriptor): the source tensor descriptor.
+        offsets (List[int]): the offsets from the base pointer in the tensor descriptor.
+        dest (shared_memory_descriptor): the shared memory destination to store the loaded data.
+        pred (bool, optional): Predicate to enable or disable the load. Defaults to True.
+        mbarrier (shared_memory_descriptor, optional): The barrier object to signal "arrive" on.
+    """
+    offsets_ir = _semantic._convert_to_ir_values(offsets, require_i64=False)
+    pred_handle = _semantic.to_tensor(pred).handle
+    barrier = _unwrap_if_constexpr(mbarrier)
+    # A default-constructed `ir.value()` is passed as the barrier operand when no mbarrier is supplied.
+    barrier_handle = ttgl.ir.value() if barrier is None else barrier.handle
+    _semantic.builder.create_async_tdm_copy_global_to_local(src.handle, offsets_ir, dest.handle, pred_handle,
+                                                            barrier_handle)
+
+
+@builtin
+def async_store(dest: tensor_descriptor, offsets: List[ttgl.constexpr | ttgl.tensor], src: shared_memory_descriptor,
+                _semantic=None) -> None:
+    """Asynchronously copy a block from shared memory to the global-memory tensor described by a tensor descriptor.
+
+    Args:
+        dest (tensor_descriptor): the destination tensor descriptor.
+        offsets (List[int]): the offsets from the base pointer in the tensor descriptor.
+        src (shared_memory_descriptor): the shared memory source holding the data to store.
+    """
+    offsets_ir = _semantic._convert_to_ir_values(offsets, require_i64=False)
+    _semantic.builder.create_async_tdm_copy_local_to_global(dest.handle, offsets_ir, src.handle)
+
+
+@builtin
+def async_wait(num_outstanding=0, _semantic=None) -> None:
+    """Block on outstanding asynchronous tensor operations.
+
+    Args:
+        num_outstanding (int): number of outstanding async tensor operations to wait for.
+    """
+    pending_limit = _unwrap_if_constexpr(num_outstanding)
+    _semantic.builder.create_async_tdm_wait(pending_limit)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/rdna3/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/rdna3/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4359442167982bd9c1d49324c3756a39b7f3920
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/rdna3/__init__.py
@@ -0,0 +1,17 @@
+from ..._core import builtin
+from .._ops import _wmma
+
+__all__ = ["wmma"]
+
+
+@builtin
+def wmma(a, b, acc, _semantic=None):
+    """Computes a * b + acc with the AMD WMMA instruction by forwarding to
+    `_wmma` with 1 as its first argument; that call's result is returned.
+
+    Args:
+        a (tensor): The operand a to be multiplied.
+        b (tensor): The operand b to be multiplied.
+        acc (tensor): The accumulator tensor.
+    """
+    return _wmma(1, a, b, acc, _semantic)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/rdna3/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/rdna3/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e06c3fc480ede9daf9dbc65164be3fdb6b8e2b40
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/rdna3/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/rdna4/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/rdna4/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..59e3e169bd727ad67ced467fd778bba2b9947093
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/rdna4/__init__.py
@@ -0,0 +1,17 @@
+from ..._core import builtin
+from .._ops import _wmma
+
+__all__ = ["wmma"]
+
+
+@builtin
+def wmma(a, b, acc, _semantic=None):
+    """Computes a * b + acc with the AMD WMMA instruction by forwarding to
+    `_wmma` with 2 as its first argument; that call's result is returned.
+
+    Args:
+        a (tensor): The operand a to be multiplied.
+        b (tensor): The operand b to be multiplied.
+        acc (tensor): The accumulator tensor.
+    """
+    return _wmma(2, a, b, acc, _semantic)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/rdna4/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/rdna4/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ea9c724fee86b773db640b4a4c7d7c9bcf6fb065
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/amd/rdna4/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/extra/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/extra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2091e0b7e2afcd9fb914745dc64c35351d487f92
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/extra/__init__.py
@@ -0,0 +1,3 @@
+from triton.language.extra import libdevice
+
+__all__ = ["libdevice"]
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/extra/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/extra/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f6972f7b86b3c3f5d395d6bc68e405a6834fb58c
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/extra/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ecf36d3b950635111e792c62a48497ee621ae02
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/__init__.py
@@ -0,0 +1,4 @@
+from . import blackwell
+from . import hopper
+
+__all__ = ["blackwell", "hopper"]
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e866bdff24acce6ccc47251ec5eb917884c9875
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/ampere/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/ampere/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38b012f017bfef9be182d70b4e6fe768565bfc1e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/ampere/__init__.py
@@ -0,0 +1,32 @@
+from __future__ import annotations
+
+from triton import knobs
+from triton.experimental.gluon.language import _core as ttgl
+from triton.experimental.gluon.language._layouts import DotOperandLayout, NVMMADistributedLayout
+from ..._core import builtin, _unwrap_if_constexpr
+from . import async_copy, mbarrier
+
+__all__ = ["async_copy", "mbarrier", "mma_v2"]
+
+
+@builtin
+def mma_v2(a, b, acc, input_precision=None, _semantic=None):
+    """Run ``_semantic.dot`` on (a, b, acc) once the operands are verified to carry version-2 MMA layouts."""
+    input_precision = _unwrap_if_constexpr(input_precision)
+    assert isinstance(a, ttgl.tensor), "a must be a tensor"
+    assert isinstance(b, ttgl.tensor), "b must be a tensor"
+    assert isinstance(acc, ttgl.tensor), "acc must be a tensor"
+    # The accumulator's layout is the reference that both operand layouts are validated against.
+    acc_layout = acc.type.layout
+    assert isinstance(acc_layout, NVMMADistributedLayout), "acc must have an NVMMADistributedLayout"
+    assert acc_layout.version == [2, 0], "MMA layout must have version 2.0"
+    lhs_layout, rhs_layout = a.type.layout, b.type.layout
+    assert isinstance(lhs_layout, DotOperandLayout), "a must have a DotOperandLayout"
+    assert isinstance(rhs_layout, DotOperandLayout), "b must have a DotOperandLayout"
+    assert lhs_layout.parent == acc_layout, "a's parent layout must be the same as acc's layout"
+    assert rhs_layout.parent == acc_layout, "b's parent layout must be the same as acc's layout"
+    assert lhs_layout.operand_index == 0, "a's operand index must be 0"
+    assert rhs_layout.operand_index == 1, "b's operand index must be 1"
+    result = _semantic.dot(a, b, acc, input_precision=input_precision, max_num_imprecise_acc=None,
+                           out_dtype=acc.dtype)
+    return ttgl.tensor(result.handle, acc.type)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/ampere/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/ampere/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49e1659571ab626a3c5548476441a89cbb6b9ee5
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/ampere/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/ampere/__pycache__/async_copy.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/ampere/__pycache__/async_copy.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..644e51cd0dc4f55590cf0619bfe29ccfd28bc1de
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/ampere/__pycache__/async_copy.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/ampere/__pycache__/mbarrier.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/ampere/__pycache__/mbarrier.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f1f7dee0bf04169e6a16f7d7a3f7c05b4c483b08
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/ampere/__pycache__/mbarrier.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/ampere/async_copy.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/ampere/async_copy.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6752402bfda1f308724f9fc5a11d2ce2d010fa7
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/ampere/async_copy.py
@@ -0,0 +1,74 @@
+from ..._semantic import _check
+from ..._core import _unwrap_if_constexpr, builtin
+from triton._C.libtriton import ir
+
+__all__ = [
+    "async_copy_global_to_shared",
+    "mbarrier_arrive",
+    "commit_group",
+    "wait_group",
+]
+
+
+@builtin
+def async_copy_global_to_shared(smem, pointer, mask=None, cache_modifier="", eviction_policy="", volatile=False,
+                                _semantic=None):
+    """
+    Issue an asynchronous copy of elements from global memory into shared memory.
+
+    Args:
+        smem (shared_memory_descriptor): Shared memory descriptor receiving the data.
+        pointer (tensor): Source pointer tensor; its shape (after mask broadcasting) must equal ``smem.shape``.
+        mask (tensor, optional): Mask tensor for predicated loads, broadcast against `pointer`. Defaults to None.
+        cache_modifier (str): Cache modifier specifier. Defaults to "".
+        eviction_policy (str): Eviction policy specifier. Defaults to "".
+        volatile (bool): Whether the load is volatile. Defaults to False.
+    """
+    mask = _unwrap_if_constexpr(mask)
+    cache_mod = _semantic._str_to_load_cache_modifier(cache_modifier)
+    evict = _semantic._str_to_eviction_policy(eviction_policy)
+    is_volatile = _unwrap_if_constexpr(volatile)
+    if mask is None:
+        mask_handle = ir.value()
+    else:
+        pointer, mask = _semantic.broadcast_impl_value(pointer, mask)
+        mask_handle = mask.handle
+    _check(smem.shape == pointer.shape, lambda:
+           f"expected smem shape to match pointer shape but got smem.shape = {smem.shape}, pointer.shape = {pointer.shape}")
+    _semantic.builder.create_async_copy_global_to_local(smem.handle, pointer.handle, mask_handle, ir.value(),
+                                                        cache_mod, evict, is_volatile)
+
+
+@builtin
+def mbarrier_arrive(mbarrier, increment_count=True, _semantic=None):
+    """
+    Make the mbarrier arrive once every outstanding async copy has completed.
+
+    Args:
+        mbarrier (shared_memory_descriptor): Barrier object that receives the arrival.
+        increment_count (bool): Whether the arrival count is incremented. Defaults to True.
+    """
+    builder = _semantic.builder
+    builder.create_async_copy_mbarrier_arrive(mbarrier.handle, _unwrap_if_constexpr(increment_count))
+
+
+@builtin
+def commit_group(_semantic=None):
+    """
+    Commit the current asynchronous copy group.
+    """
+    builder = _semantic.builder
+    # This finalizes a set of asynchronous copy operations.
+    builder.create_async_commit_group()
+
+
+@builtin
+def wait_group(num_outstanding=0, _semantic=None):
+    """
+    Wait until at most `num_outstanding` asynchronous copy groups remain in flight.
+
+    Args:
+        num_outstanding (int): Number of async copy groups allowed to stay pending. Defaults to 0.
+    """
+    pending_limit = _unwrap_if_constexpr(num_outstanding)
+    _semantic.builder.create_async_wait_group(pending_limit)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/ampere/mbarrier.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/ampere/mbarrier.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f7ac3457075255646f37a78fa22e656cf0ed769
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/ampere/mbarrier.py
@@ -0,0 +1,71 @@
+from triton.experimental.gluon.language._layouts import SwizzledSharedLayout
+from triton.experimental.gluon.language._core import builtin, _unwrap_if_constexpr
+
+__all__ = ["arrive", "init", "invalidate", "MBarrierLayout", "wait"]
+
+
+class MBarrierLayout(SwizzledSharedLayout):
+    """
+    Swizzled shared layout used for mbarrier synchronization on Ampere and later architectures.
+
+    Args:
+        cga_layout (List[List[int]]): CTA layout bases. ``None`` or an empty value is replaced by ``[]``.
+    """
+
+    def __init__(self, cga_layout=None):
+        super().__init__(vec=1, per_phase=1, max_phase=1, order=[0], cga_layout=cga_layout if cga_layout else [])
+
+
+@builtin
+def init(mbarrier, count, _semantic=None):
+    """
+    Set up an mbarrier with the given count.
+
+    Args:
+        mbarrier (shared_memory_descriptor): Barrier object being initialized.
+        count (int): Initial count for the barrier; a constexpr is unwrapped first.
+    """
+    initial_count = _unwrap_if_constexpr(count)
+    _semantic.builder.create_mbarrier_init(mbarrier.handle, initial_count)
+
+
+@builtin
+def invalidate(mbarrier, _semantic=None):
+    """
+    Invalidate an mbarrier, resetting its state.
+
+    Args:
+        mbarrier (shared_memory_descriptor): The barrier object to invalidate; only its handle is used.
+    """
+    _semantic.builder.create_mbarrier_inval(mbarrier.handle)  # emits the invalidate op; nothing is returned
+
+
+@builtin
+def wait(mbarrier, phase, pred=True, deps=(), _semantic=None):
+    """
+    Block until the mbarrier object completes its current phase.
+
+    Args:
+        mbarrier (shared_memory_descriptor): Barrier object being waited on.
+        phase (int): Phase index to wait for.
+        pred (bool): Predicate; the operation is skipped when it is False. Defaults to True.
+        deps (Sequence[shared_memory_descriptor]): Dependent allocations the barrier is waiting on. Used to track liveness of dependent allocations. Defaults to ().
+    """
+    phase_handle = _semantic.to_tensor(phase).handle
+    pred_handle = _semantic.to_tensor(pred).handle
+    dep_handles = [dep.handle for dep in deps]
+    _semantic.builder.create_mbarrier_wait(mbarrier.handle, phase_handle, pred_handle, dep_handles)
+
+
+@builtin
+def arrive(mbarrier, *, pred=True, _semantic=None):
+    """
+    Arrive on an mbarrier with an arrival count of 1.
+
+    Args:
+        mbarrier (shared_memory_descriptor): Barrier object receiving the arrival.
+        pred (bool): Predicate; the operation is skipped when it is False. Defaults to True.
+    """
+    pred_tensor = _semantic.to_tensor(pred)
+    arrival_count = 1
+    _semantic.builder.create_mbarrier_arrive(mbarrier.handle, arrival_count, pred_tensor.handle)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/blackwell/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/blackwell/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d1b21c011f15d374eac1a1a5bd894e15c1b0d1a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/blackwell/__init__.py
@@ -0,0 +1,449 @@
+from __future__ import annotations
+from typing import Optional, Tuple, List, TYPE_CHECKING
+
+from dataclasses import dataclass
+from triton.runtime.jit import constexpr_function
+from triton.experimental.gluon.language import _core as ttgl
+from triton.experimental.gluon.language._core import builtin, base_type, base_value, _unwrap_if_constexpr
+from triton.experimental.gluon.language._layouts import SharedLinearLayout
+from triton.experimental.gluon.language._semantic import _check, _compute_tmem_reg_layout
+
+from . import tma
+from ..hopper import fence_async_shared, mbarrier
+from ..ampere import async_copy, mma_v2
+
+from triton._C.libtriton import ir
+if TYPE_CHECKING:
+    from triton._C.libtriton.gluon_ir import GluonOpBuilder
+    from ..._semantic import GluonSemantic
+
+__all__ = [
+    "allocate_tensor_memory",
+    "async_copy",
+    "fence_async_shared",
+    "get_tmem_reg_layout",
+    "mbarrier",
+    "mma_v2",
+    "tensor_memory_descriptor",
+    "TensorMemoryLayout",
+    "tma",
+]
+
+
+@dataclass(frozen=True, eq=True)
+class TensorMemoryLayout:
+    """
+    Layout descriptor for tensor memory on the Blackwell architecture.
+
+    Args:
+        block (Tuple[int, int]): Number of contiguous elements per row / column in a CTA.
+        col_stride (int): Number of 32-bit columns to advance between logically
+            adjacent columns. Packed layouts use a stride of 1. Unpacked
+            layouts use ``32 / bitwidth``. Must be a power of two.
+        cta_split_num (Optional[Tuple[int, int]]): CTA split factors. Defaults to None.
+        two_ctas (bool): Whether the layout is for two-CTA mode. Defaults to False.
+    """
+    block: Tuple[int, int]
+    col_stride: int
+    cta_split_num: Optional[Tuple[int, int]] = None
+    two_ctas: bool = False
+
+    def __post_init__(self):
+        # The dataclass is frozen, so the constexpr-unwrapped values are written via object.__setattr__.
+        for field_name in ("block", "col_stride", "cta_split_num", "two_ctas"):
+            object.__setattr__(self, field_name, _unwrap_if_constexpr(getattr(self, field_name)))
+        assert len(self.block) == 2
+        assert self.cta_split_num is None or len(self.cta_split_num) == 2
+        stride = self.col_stride
+        assert stride >= 1 and (stride & (stride - 1)) == 0, "tensor memory col_stride must be a power of two"
+
+    def _to_ir(self, builder):
+        # A missing (or empty) split falls back to [1, 1].
+        split = [1, 1] if not self.cta_split_num else list(self.cta_split_num)
+        return builder.get_tensor_memory_layout(self.block, self.col_stride, split, self.two_ctas)
+
+    def mangle(self) -> str:
+        """Return the mangled-name fragment that encodes this layout."""
+        rows, cols = self.block[0], self.block[1]
+        pieces = ["TL", f"{rows}x{cols}", f"C{self.col_stride}"]
+        if self.cta_split_num:
+            pieces.append(f"CS{self.cta_split_num[0]}x{self.cta_split_num[1]}")
+        if self.two_ctas:
+            pieces.append("2CT")
+        pieces.append("TL")
+        return "".join(pieces)
+
+    def __hash__(self):
+        key = (self.block, self.col_stride, self.cta_split_num, self.two_ctas)
+        return hash(key)
+
+
+@dataclass(frozen=True, eq=True)
+class TensorMemoryScalesLayout:
+    """
+    Layout descriptor for tensor memory scales on the Blackwell architecture.
+
+    Args:
+        cta_split_num (Optional[Tuple[int, int]]): CTA split factors. Defaults to None.
+    """
+    cta_split_num: Optional[Tuple[int, int]] = None
+
+    def __post_init__(self):
+        object.__setattr__(self, "cta_split_num", _unwrap_if_constexpr(self.cta_split_num))
+        assert self.cta_split_num is None or len(self.cta_split_num) == 2
+
+    def _to_ir(self, builder):
+        split = [1, 1] if not self.cta_split_num else list(self.cta_split_num)
+        return builder.get_tensor_memory_scales_layout(split)
+
+    def mangle(self) -> str:
+        split = self.cta_split_num
+        return "TLS" + (f"CS{split[0]}x{split[1]}" if split else "") + "TLS"
+
+    def __hash__(self):
+        return hash(self.cta_split_num)
+
+
+@constexpr_function
+def get_tmem_reg_layout(
+        element_ty,
+        shape,
+        layout,
+        num_warps,
+        instr_variant="32x32b",
+        cga_layout=(),
+):
+    """
+    Build a DistributedLinearLayout compatible with TMEM load/store instructions.
+
+    Every argument is stripped of constexpr wrappers (recursively through lists
+    and tuples) and then forwarded to ``_compute_tmem_reg_layout``.
+
+    Args:
+        element_ty (dtype): Element type stored in tensor memory.
+        shape (Sequence[int]): Global tensor shape addressed by the TMEM descriptor.
+        layout (TensorMemoryLayout): Tensor memory layout descriptor.
+        num_warps (int): Number of warps participating in the operation.
+        instr_variant (str): TMEM instruction variant (e.g. ``"32x32b"``).
+        cga_layout (Sequence[Sequence[int]]): CTA layout bases describing CTA distribution.
+    """
+
+    def _strip(obj):
+        # Peel nested constexpr wrappers before looking at the container type.
+        while isinstance(obj, ttgl.constexpr):
+            obj = obj.value
+        if isinstance(obj, list):
+            return [_strip(item) for item in obj]
+        if isinstance(obj, tuple):
+            return tuple(_strip(item) for item in obj)
+        return obj
+
+    raw_args = (element_ty, shape, layout, num_warps, instr_variant, cga_layout)
+    # Arguments are forwarded positionally, in the order they were received.
+    plain_args = [_strip(arg) for arg in raw_args]
+    return _compute_tmem_reg_layout(*plain_args)
+
+
+class tensor_memory_descriptor_type(base_type):
+    """Type of a tensor memory descriptor: element type, shape, layout and allocation shape."""
+
+    def __init__(self, element_ty, shape, layout, alloc_shape):
+        self.element_ty = element_ty
+        self.shape = shape
+        self.layout = layout
+        self.alloc_shape = alloc_shape
+        assert isinstance(layout, (TensorMemoryLayout, TensorMemoryScalesLayout))
+
+    def to_ir(self, builder: GluonOpBuilder) -> ir.type:
+        return builder.get_tensor_mem_desc_ty(self.element_ty.to_ir(builder), self.shape,
+                                              self.layout._to_ir(builder), self.alloc_shape)
+
+    def _unflatten_ir(self, handles: List[ir.Value], cursor: int) -> Tuple[tensor_memory_descriptor, int]:
+        value = tensor_memory_descriptor(handles[cursor], self.element_ty, self.shape, self.layout, self.alloc_shape)
+        return value, cursor + 1
+
+    def _flatten_ir_types(self, builder: GluonOpBuilder, out: List[ir.type]) -> None:
+        out.append(self.to_ir(builder))
+
+    def __str__(self) -> str:
+        return f"tensor_memory_descriptor<{self.element_ty}, {self.shape}, {self.layout}>"
+
+    def __eq__(self, other) -> bool:
+        # NOTE(review): element_ty is not compared, although __str__ and mangle() include it -- confirm intent.
+        return (type(self) is type(other) and self.shape == other.shape and self.layout == other.layout
+                and self.alloc_shape == other.alloc_shape)
+
+    def __ne__(self, other) -> bool:
+        return not (self == other)
+
+    __neq__ = __ne__  # old, misspelled name (never used by ``!=``); kept as an alias for direct callers
+
+    def mangle(self) -> str:
+        shape_str = "_".join([str(s) for s in self.shape])
+        return f"MD{self.element_ty.mangle()}S{shape_str}SL{self.layout.mangle()}LAS{self.alloc_shape}ASMD"
+
+
+class tensor_memory_descriptor(base_value):
+    """
+    Represents a tensor memory descriptor handle for Tensor Core Gen5 operations.
+    """
+
+    def __init__(self, handle, element_ty, shape, layout, alloc_shape):
+        self.handle = handle
+        self.type = tensor_memory_descriptor_type(element_ty, shape, layout, alloc_shape)
+
+    def _flatten_ir(self, handles: List[ir.value]) -> None:
+        handles.append(self.handle)
+
+    @property
+    def dtype(self):
+        return self.type.element_ty
+
+    @property
+    def shape(self):
+        return self.type.shape
+
+    @property
+    def rank(self):
+        return len(self.shape)
+
+    @property
+    def layout(self):
+        return self.type.layout
+
+    def __str__(self) -> str:
+        return str(self.type)
+
+    @builtin
+    def load(self, layout, _semantic: GluonSemantic = None) -> ttgl.tensor:
+        """
+        Load a tensor from tensor memory.
+
+        Args:
+            layout (DistributedLayout): Destination layout of the tensor.
+
+        Returns:
+            tensor: A distributed tensor containing the loaded data.
+        """
+        layout = _unwrap_if_constexpr(layout)
+        ret_ty = ttgl.distributed_type(self.dtype, self.shape, layout)
+        builder = _semantic.builder
+        handle = builder.create_tmem_load(ret_ty.to_ir(builder), self.handle)
+        return ttgl.tensor(handle, ret_ty)
+
+    @builtin
+    def store(self, value, pred=True, _semantic: GluonSemantic = None) -> None:
+        """
+        Store a tensor into tensor memory.
+
+        Args:
+            value (tensor): The tensor to store; its shape and dtype must match this descriptor.
+            pred (bool): Scalar predicate. Operation is skipped if predicate is False. Defaults to True.
+        """
+        pred = _unwrap_if_constexpr(pred)
+        pred = _semantic.to_tensor(pred)
+        assert value.shape == self.shape, f"source shape {value.shape} does not match destination shape {self.shape}"
+        assert value.dtype == self.dtype, f"source dtype {value.dtype} does not match destination dtype {self.dtype}"
+        _semantic.builder.create_tmem_store(self.handle, value.handle, pred.handle)
+
+    @builtin
+    def slice(self, start, length, _semantic: GluonSemantic = None) -> tensor_memory_descriptor:
+        """
+        Create a slice of the tensor memory descriptor along the last dimension.
+
+        Args:
+            start (int): The starting index for subslice. Must be a constant int.
+            length (int): The length of the subslice. Must be a constant int.
+
+        Returns:
+            tensor_memory_descriptor: Descriptor for the subslice.
+        """
+        start = _unwrap_if_constexpr(start)
+        length = _unwrap_if_constexpr(length)
+        _check(isinstance(start, int), lambda: "start must be a constant int")
+        _check(isinstance(length, int), lambda: "length must be a constant int")
+        shape = self.shape[:-1] + [length]  # NOTE(review): list concatenation; a built-in tuple shape would raise
+        layout = self.type.layout  # NOTE(review): .block/.col_stride below assume a TensorMemoryLayout -- confirm
+        layout = TensorMemoryLayout(
+            (layout.block[0], min(layout.block[1], length)),
+            layout.col_stride,
+            layout.cta_split_num,
+            layout.two_ctas,
+        )
+        ret = tensor_memory_descriptor(None, self.dtype, shape, layout, self.type.alloc_shape)
+        builder = _semantic.builder
+        ret.handle = builder.create_tmem_subslice(ret.type.to_ir(builder), self.handle, start)
+        return ret
+
+    @builtin
+    def index(self, index, _semantic: GluonSemantic = None) -> tensor_memory_descriptor:
+        """
+        Create a subview of tensor memory by indexing the first dimension.
+
+        Args:
+            index (tensor): The index tensor for the subview.
+
+        Returns:
+            tensor_memory_descriptor: Descriptor for the indexed subview.
+        """
+        index = _semantic.to_tensor(index)
+        builder = _semantic.builder
+        shape = self.shape[1:]
+        layout = self.layout
+        ret = tensor_memory_descriptor(None, self.dtype, shape, layout, shape)
+        ret.handle = builder.create_memdesc_index(ret.type.to_ir(builder), self.handle, index.handle)
+        return ret
+
+    @builtin
+    def _reinterpret(self, dtype, shape, layout, _semantic: GluonSemantic = None) -> tensor_memory_descriptor:
+        """
+        Reinterpret tensor memory descriptor with a new dtype, shape, and layout.
+
+        Args:
+            dtype (dtype): The new data type.
+            shape (Sequence[int]): The new shape.
+            layout (TensorMemoryLayout): The new layout.
+
+        Returns:
+            tensor_memory_descriptor: Descriptor with updated type and layout.
+        """
+        dtype = _unwrap_if_constexpr(dtype)
+        shape = [_unwrap_if_constexpr(s) for s in shape]
+        layout = _unwrap_if_constexpr(layout)
+
+        ty = tensor_memory_descriptor_type(dtype, shape, layout, shape)
+        handle = _semantic.builder.create_memdesc_reinterpret(ty.to_ir(_semantic.builder), self.handle)
+        return tensor_memory_descriptor(handle, dtype, shape, layout, shape)  # explicit, not **ty.__dict__
+
+
+@builtin
+def allocate_tensor_memory(element_ty, shape, layout, value=None, _semantic=None):
+    """
+    Allocate tensor memory and return a descriptor for it.
+
+    Args:
+        element_ty (dtype): Data type of the elements.
+        shape (Sequence[int]): Shape of the descriptor; also used as its allocation shape.
+        layout (TensorMemoryLayout): Layout of the tensor memory.
+        value (tensor, optional): Initial tensor to copy. Defaults to None.
+
+    Returns:
+        tensor_memory_descriptor: Descriptor for the allocated memory.
+    """
+    element_ty, shape, layout = (_unwrap_if_constexpr(arg) for arg in (element_ty, shape, layout))
+    init_handle = None if value is None else value.handle
+
+    desc_ty = tensor_memory_descriptor_type(element_ty, shape, layout, shape)
+    builder = _semantic.builder
+    alloc_handle = builder.create_tmem_alloc(desc_ty.to_ir(builder), init_handle)
+
+    # The returned descriptor is built from the same arguments as ``desc_ty``.
+    return tensor_memory_descriptor(alloc_handle, element_ty, shape, layout, shape)
+
+
+@builtin
+def tcgen05_copy(src, dst, _semantic=None):
+    """
+    Start an asynchronous copy from shared memory to tensor memory.
+
+    WARNING: The current semantics of the instruction are not well defined and
+    the API will change in the future. Use at your own risk.
+
+    Args:
+        src (shared_memory_descriptor): Shared memory to copy from. Any other type fails an assertion.
+        dst (tensor_memory_descriptor): Tensor memory to copy to. Any other type fails an assertion.
+    """
+    assert isinstance(src, ttgl.shared_memory_descriptor), "source must be a shared memory descriptor"
+    assert isinstance(dst, tensor_memory_descriptor), "destination must be a tensor memory descriptor"
+    _semantic.builder.create_tmem_copy(src.handle, dst.handle)  # only the raw handles are handed to the builder
+
+
+@builtin
+def tcgen05_mma(a, b, acc, *, use_acc=True, pred=True, mbarriers=None, mbarrier_preds=None, _semantic=None):
+    """
+    Issue a 5th generation TensorCore MMA instruction.
+    acc = a * b + (acc if use_acc else 0)
+
+    Args:
+        a (shared_memory_descriptor): Left hand side operand in shared memory.
+        b (shared_memory_descriptor or tensor_memory_descriptor): Right hand side operand in shared or tensor memory.
+        acc (tensor_memory_descriptor): Accumulator value in tensor memory (mutated).
+        use_acc (bool): Whether to use the initial value of the accumulator. Defaults to True.
+        pred (bool): Scalar predicate. Operation is skipped if predicate is False. Defaults to True.
+        mbarriers (Sequence[shared_memory_descriptor], optional): Barriers to signal when the operation is complete. If None, mma is synchronous. Defaults to None.
+        mbarrier_preds (Sequence[bool], optional): Predicates for barriers; must be None when `mbarriers` is None. Defaults to None.
+    """
+    use_acc = _semantic.to_tensor(use_acc)
+    pred = _semantic.to_tensor(pred)
+
+    if mbarriers is None:
+        assert mbarrier_preds is None
+        barrier_handles, barrier_preds = [], []
+    else:
+        barrier_handles = [bar.handle for bar in mbarriers]
+        if mbarrier_preds is not None:
+            barrier_preds = _semantic._convert_to_ir_values(mbarrier_preds, require_i64=False)
+        else:
+            # No explicit predicates: every barrier shares the handle of one constant-True tensor.
+            always = _semantic.to_tensor(True)
+            barrier_preds = [always.handle] * len(barrier_handles)
+
+    _semantic.builder.create_tcgen05_mma(a.handle, b.handle, acc.handle, use_acc.handle, pred.handle,
+                                         barrier_handles, barrier_preds, acc.layout.two_ctas)
+
+
+@builtin
+def tcgen05_mma_scaled(a, b, acc, a_scale, b_scale, a_type, b_type, *, use_acc=True, pred=True, mbarriers=None,
+                       mbarrier_preds=None, _semantic=None):
+    """
+    Emit a 5th generation TensorCore MMA scaled instruction.
+    acc = (a * a_scale) * (b * b_scale) + (acc if use_acc else 0)
+
+    Args:
+        a (shared_memory_descriptor): Left hand side operand in shared memory.
+        b (shared_memory_descriptor or tensor_memory_descriptor): Right hand side operand in shared or tensor memory.
+        acc (tensor_memory_descriptor): Accumulator value in tensor memory (mutated).
+        a_scale (tensor): Scale factor for operand A.
+        b_scale (tensor): Scale factor for operand B.
+        a_type (str): Type of operand A, as a constexpr or plain str. One of {"e2m1", "e4m3", "e5m2"}.
+        b_type (str): Type of operand B, as a constexpr or plain str. One of {"e2m1", "e4m3", "e5m2"}.
+        use_acc (bool): Whether to use the initial value of the accumulator. Defaults to True.
+        pred (bool): Scalar predicate. Operation is skipped if predicate is False. Defaults to True.
+        mbarriers (Sequence[mbarrier], optional): Barriers to signal when the operation is complete. If None, mma is synchronous. Defaults to None.
+        mbarrier_preds (Sequence[bool], optional): Predicates for barriers. Defaults to None.
+    """
+    use_acc = _semantic.to_tensor(use_acc)
+    pred = _semantic.to_tensor(pred)
+
+    if mbarriers is None:
+        assert mbarrier_preds is None
+        mbarriers, mbarrier_preds = [], []
+    else:
+        mbarriers = [bar.handle for bar in mbarriers]
+        if mbarrier_preds is None:
+            true = _semantic.to_tensor(True)
+            mbarrier_preds = [true.handle] * len(mbarriers)
+        else:
+            mbarrier_preds = _semantic._convert_to_ir_values(mbarrier_preds, require_i64=False)
+    # Accept the formats as constexpr or plain str; reading ``.value`` directly failed for plain strings.
+    a_type, b_type = _unwrap_if_constexpr(a_type), _unwrap_if_constexpr(b_type)
+    allowed_formats = {"e2m1", "e4m3", "e5m2"}
+    assert a_type in allowed_formats, f"Unsupported lhs_format: {a_type}"
+    assert b_type in allowed_formats, f"Unsupported rhs_format: {b_type}"
+    a_fmt = _semantic._str_to_fp_type(a_type)
+    b_fmt = _semantic._str_to_fp_type(b_type)
+    _semantic.builder.create_tcgen05_mma_scaled(a.handle, b.handle, acc.handle, a_scale.handle, b_scale.handle, a_fmt,
+                                                b_fmt, use_acc.handle, pred.handle, mbarriers, mbarrier_preds)
+
+
+@builtin
+def tcgen05_commit(barrier, _semantic=None):
+    """
+    This instruction causes the provided mbarrier to be arrived-on with a count
+    of 1 when all async tcgen05 MMA and copy instructions previously issued by
+    the thread are complete.
+
+    Args:
+        barrier (shared_memory_descriptor): The barrier to track completion of tcgen05 MMA and copy instructions.
+    """
+    _semantic.builder.create_tcgen05_commit(barrier.handle)  # only the barrier's handle is handed to the builder
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/blackwell/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/blackwell/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dcde24935ec9506b4c17e5d28ea9cae59d42c8a4
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/blackwell/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/blackwell/__pycache__/float2.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/blackwell/__pycache__/float2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1aa507c4edacce7eae207bd693bf26735d8cea30
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/blackwell/__pycache__/float2.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/blackwell/__pycache__/tma.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/blackwell/__pycache__/tma.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..32d80f327e73738a923f54e5042cd4ab62eedb6a
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/blackwell/__pycache__/tma.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/blackwell/float2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/blackwell/float2.py
new file mode 100644
index 0000000000000000000000000000000000000000..c06b103f3675ec38107b292d770b56cecdea450f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/blackwell/float2.py
@@ -0,0 +1,172 @@
+from triton.language.core import _aggregate as aggregate
+from triton.experimental.gluon.language import _core as ttgl, _standard as stdlib
+from triton.experimental.gluon._runtime import constexpr_function, jit
+
+__all__ = [
+    "pack2",
+    "unpack2",
+    "pack",
+    "unpack",
+    "fma",
+    "Float2Tensor",
+]
+
+
+@jit
+def _add_f32x2(a, b):  # elementwise `add.f32x2` of `a` and `b`, emitted as inline assembly
+    return ttgl.inline_asm_elementwise(
+        """
+        add.f32x2 $0, $1, $2;
+        """,
+        "=l,l,l",  # constraint string: one output ($0) followed by two inputs ($1, $2)
+        [a, b],
+        dtype=ttgl.int64,  # result is typed int64; presumably two packed float32 lanes (cf. pack2) -- confirm
+        is_pure=True,
+        pack=1,
+    )
+
+
+@jit
+def _sub_f32x2(a, b):  # elementwise `sub.f32x2` of `a` and `b`, emitted as inline assembly
+    return ttgl.inline_asm_elementwise(
+        """
+        sub.f32x2 $0, $1, $2;
+        """,
+        "=l,l,l",  # constraint string: one output ($0) followed by two inputs ($1, $2)
+        [a, b],
+        dtype=ttgl.int64,  # result is typed int64; presumably two packed float32 lanes (cf. pack2) -- confirm
+        is_pure=True,
+        pack=1,
+    )
+
+
+@jit
+def _mul_f32x2(a, b):  # elementwise `mul.f32x2` of `a` and `b`, emitted as inline assembly
+    return ttgl.inline_asm_elementwise(
+        """
+        mul.f32x2 $0, $1, $2;
+        """,
+        "=l,l,l",  # constraint string: one output ($0) followed by two inputs ($1, $2)
+        [a, b],
+        dtype=ttgl.int64,  # result is typed int64; presumably two packed float32 lanes (cf. pack2) -- confirm
+        is_pure=True,
+        pack=1,
+    )
+
+
+@jit
+def _fma_f32x2(a, b, c):  # elementwise `fma.rn.f32x2` over `a`, `b`, `c`, emitted as inline assembly
+    return ttgl.inline_asm_elementwise(
+        """
+        fma.rn.f32x2 $0, $1, $2, $3;
+        """,
+        "=l,l,l,l",  # constraint string: one output ($0) followed by three inputs ($1, $2, $3)
+        [a, b, c],
+        dtype=ttgl.int64,  # result is typed int64; presumably two packed float32 lanes (cf. pack2) -- confirm
+        is_pure=True,
+        pack=1,
+    )
+
+
+@aggregate
+class Float2Tensor:  # aggregate wrapping one tensor; arithmetic is routed to the *_f32x2 inline-asm helpers
+    value: ttgl.tensor  # the wrapped tensor (pack2 builds it with dtype int64)
+
+    @constexpr_function
+    def __init__(self, value: ttgl.tensor):
+        self.value = value
+
+    @jit
+    def __add__(self, rhs):  # Float2Tensor + Float2Tensor -> Float2Tensor, via _add_f32x2
+        ttgl.static_assert(isinstance(rhs, Float2Tensor), "rhs must be a Float2Tensor")
+        return Float2Tensor(_add_f32x2(self.value, rhs.value))
+
+    @jit
+    def __sub__(self, rhs):  # Float2Tensor - Float2Tensor -> Float2Tensor, via _sub_f32x2
+        ttgl.static_assert(isinstance(rhs, Float2Tensor), "rhs must be a Float2Tensor")
+        return Float2Tensor(_sub_f32x2(self.value, rhs.value))
+
+    @jit
+    def __mul__(self, rhs):  # Float2Tensor * Float2Tensor -> Float2Tensor, via _mul_f32x2
+        ttgl.static_assert(isinstance(rhs, Float2Tensor), "rhs must be a Float2Tensor")
+        return Float2Tensor(_mul_f32x2(self.value, rhs.value))
+
+    @jit
+    def sum(self, axis: ttgl.constexpr):  # reduce `value` along `axis`, combining elements with _add_f32x2
+        return Float2Tensor(ttgl.reduce(self.value, axis=axis, combine_fn=_add_f32x2))
+
+
+@jit
+def pack2(x0, x1):  # combine x0 and x1 elementwise into one 64-bit value (`mov.b64`), wrapped in a Float2Tensor
+    value = ttgl.inline_asm_elementwise(
+        """
+        mov.b64 $0, { $1, $2 };
+        """,
+        "=l,r,r",  # constraint string: one output ($0) followed by two inputs ($1, $2)
+        [x0, x1],
+        dtype=ttgl.int64,
+        is_pure=True,
+        pack=1,
+    )
+    return Float2Tensor(value)
+
+
+@jit
+def unpack2(x):  # counterpart of pack2: split `x.value` elementwise into two float32 results
+    return ttgl.inline_asm_elementwise(
+        """
+        mov.b64 { $0, $1 }, $2;
+        """,
+        "=r,=r,l",  # constraint string: two outputs ($0, $1) followed by one input ($2)
+        [x.value],
+        dtype=[ttgl.float32, ttgl.float32],  # one dtype per output
+        is_pure=True,
+        pack=1,
+    )
+
+
+@constexpr_function
+def _get_split_shape(shape, axis):  # halve `axis`, add a size-2 dim after it, and swap that dim to the end
+    dims = [extent for extent in shape]
+    assert dims[axis] >= 2, f"not enough elements to pack along axis {axis}"
+    dims[axis] = dims[axis] // 2
+    dims.insert(axis + 1, 2)
+    order = list(range(len(dims)))
+    order[axis + 1], order[-1] = order[-1], order[axis + 1]
+    return ttgl.tuple(dims), ttgl.tuple(order)
+
+
+@constexpr_function
+def _get_join_shape(shape, axis):  # double `axis`; the permutation puts index len(shape) right after `axis`
+    dims = [extent for extent in shape]
+    dims[axis] = dims[axis] * 2
+    order = list(range(len(dims)))
+    order.insert(axis + 1, len(order))
+    return ttgl.tuple(dims), ttgl.tuple(order)
+
+
+@jit
+def pack(x, axis):  # split `axis` of `x` into pairs and pack each pair into a Float2Tensor element
+    sp: ttgl.constexpr = _get_split_shape(x.shape, axis)  # sp[0]: reshape target, sp[1]: permutation
+    x0, x1 = x.reshape(*sp[0]).permute(*sp[1]).split()  # the size-2 dim is moved last, then split in two
+    return pack2(x0, x1)
+
+
+@jit
+def unpack(x, axis):  # counterpart of pack: expand a Float2Tensor into a tensor with `axis` doubled
+    shape: ttgl.constexpr = x.value.shape
+    sp: ttgl.constexpr = _get_join_shape(shape, axis)  # sp[0]: final shape, sp[1]: permutation
+    x0, x1 = unpack2(x)
+    return ttgl.join(x0, x1).permute(*sp[1]).reshape(*sp[0])  # join, permute the joined dim, then reshape
+
+
+@jit
+def full_like(x, fill_value):  # Float2Tensor shaped like `x.value` with both packed halves set to `fill_value`
+    ttgl.static_assert(fill_value.dtype == ttgl.float32, "fill_value must be a float32")
+    fill = stdlib.full_like(x.value, fill_value, dtype=ttgl.float32)
+    return pack2(fill, fill)  # the same float32 tensor supplies both halves
+
+
+@jit
+def fma(a, b, c):  # fused multiply-add of three Float2Tensor operands via `fma.rn.f32x2`
+    return Float2Tensor(_fma_f32x2(a.value, b.value, c.value))  # NOTE(review): no static_assert on operand types here
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/blackwell/tma.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/blackwell/tma.py
new file mode 100644
index 0000000000000000000000000000000000000000..717331e53c04d7d27e2f8369ceae402c1f95d87c
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/blackwell/tma.py
@@ -0,0 +1,54 @@
+from triton.experimental.gluon.language._core import builtin
+from triton.experimental.gluon.language.nvidia.hopper.tma import (
+    async_copy_global_to_shared,
+    async_copy_shared_to_global,
+    store_wait,
+    tensor_descriptor,
+    tensor_descriptor_type,
+    make_tensor_descriptor,
+)
+
+__all__ = [
+    "async_gather",
+    "async_scatter",
+    "async_copy_global_to_shared",
+    "async_copy_shared_to_global",
+    "store_wait",
+    "tensor_descriptor",
+    "tensor_descriptor_type",
+    "make_tensor_descriptor",
+]
+
+
+@builtin
+def async_gather(tensor_desc, x_offsets, y_offset, barrier, result, pred=True, _semantic=None):
+    """
+    Issue an asynchronous TMA gather from global memory into shared memory.
+
+    Args:
+        tensor_desc (tensor_descriptor): The tensor descriptor to gather through.
+        x_offsets (tensor): 1D tensor holding the X offsets.
+        y_offset (int): Scalar Y offset.
+        barrier (shared_memory_descriptor): Barrier signaled once the operation is complete.
+        result (tensor_memory_descriptor): Destination of the gather, must have NVMMASharedLayout.
+        pred (bool): Scalar predicate; when False the operation is skipped. Defaults to True.
+    """
+    pred_value = _semantic.to_tensor(pred)
+    y_value = _semantic.to_tensor(y_offset)
+    handles = (tensor_desc.handle, x_offsets.handle, y_value.handle, barrier.handle, result.handle, pred_value.handle)
+    _semantic.builder.create_async_tma_gather(*handles)
+
+
+@builtin
+def async_scatter(tensor_desc, x_offsets, y_offset, src, _semantic=None):
+    """
+    Issue an asynchronous TMA scatter from shared memory into global memory.
+
+    Args:
+        tensor_desc (tensor_descriptor): The tensor descriptor to scatter through.
+        x_offsets (tensor): 1D tensor holding the X offsets.
+        y_offset (int): Scalar Y offset.
+        src (tensor_memory_descriptor): Source data, must be in NVMMASharedLayout.
+    """
+    y_value = _semantic.to_tensor(y_offset)
+    _semantic.builder.create_async_tma_scatter(tensor_desc.handle, x_offsets.handle, y_value.handle, src.handle)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/hopper/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/hopper/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..28557303685b3d6aca734d9ff7a926f4d71b94b0
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/hopper/__init__.py
@@ -0,0 +1,132 @@
+from __future__ import annotations
+from triton.compiler.code_generator import unflatten_ir_values
+from ..ampere import async_copy, mma_v2
+from . import mbarrier, tma
+from ... import _core
+
+from typing import List, Tuple, TYPE_CHECKING
+if TYPE_CHECKING:
+    from triton._C.libtriton import ir
+
+__all__ = ["async_copy", "fence_async_shared", "mbarrier", "mma_v2", "tma", "warpgroup_mma", "warpgroup_mma_wait"]
+
+
+@_core.builtin
+def fence_async_shared(cluster=False, _semantic=None):
+    """
+    Emit a fence that completes asynchronous shared memory operations.
+
+    Args:
+        cluster (bool): If True, the fence applies across the cluster. Defaults to False.
+    """
+    across_cluster = _core._unwrap_if_constexpr(cluster)
+    _semantic.builder.create_fence_async_shared(across_cluster)
+
+
+class warpgroup_mma_accumulator_type(_core.base_type):  # type object attached to warpgroup_mma_accumulator values
+    tensor_type: _core.dtype  # type of the tensor this accumulator stands for
+
+    def __init__(self, tensor_type: _core.dtype):
+        self.tensor_type = tensor_type
+
+    def __str__(self) -> str:
+        return f"warpgroup_mma_accumulator<{self.tensor_type}>"
+
+    def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[warpgroup_mma_accumulator, int]:
+        return warpgroup_mma_accumulator(handles[cursor], self.tensor_type), cursor + 1  # consumes exactly one handle
+
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        self.tensor_type._flatten_ir_types(builder, out)  # IR types are those of the wrapped tensor type
+
+    def __eq__(self, other) -> bool:  # equal only to an instance of the exact same class with an equal tensor_type
+        return type(self) is type(other) and self.tensor_type == other.tensor_type
+
+    def mangle(self) -> str:  # wraps the tensor type's mangled name between "FT" markers
+        return f"FT{self.tensor_type.mangle()}FT"
+
+
+class warpgroup_mma_accumulator(_core.base_value):  # value returned by warpgroup_mma(..., is_async=True)
+    handle: ir.value  # IR value this accumulator wraps
+    type: warpgroup_mma_accumulator_type
+
+    def __init__(self, handle, tensor_type: _core.dtype):
+        self.handle = handle
+        self.type = warpgroup_mma_accumulator_type(tensor_type)  # the tensor type is kept inside the accumulator type
+
+    def _flatten_ir(self, handles: List[ir.value]) -> None:
+        handles.append(self.handle)  # contributes a single IR handle
+
+
+@_core.builtin
+def warpgroup_mma_init(value, _semantic):  # re-wrap a tensor as a warpgroup_mma_accumulator (same handle, same type)
+    assert isinstance(value, _core.tensor)
+    return warpgroup_mma_accumulator(value.handle, value.type)
+
+
+@_core.builtin
+def warpgroup_mma(a, b, acc, *, use_acc=True, precision=None, max_num_imprecise_acc=None, is_async=False,
+                  _semantic=None):
+    """
+    Issue a warpgroup MMA (Tensor Core) operation computing
+    acc = a * b + (acc if use_acc else 0)
+
+    Args:
+        a (tensor or shared_memory_descriptor): Left hand side operand.
+        b (shared_memory_descriptor): Right hand side operand.
+        acc (tensor): Accumulator tensor.
+        use_acc (bool): Whether the initial accumulator value takes part in the result. Defaults to True.
+        precision (str, optional): Dot input precision. Defaults to the builder's default_dot_input_precision.
+        max_num_imprecise_acc (int): Max imprecise accumulations. Used for fp8 -> fp32 dot. Determines how many accumulations are done in limited precision. When None, the builder default is used for fp8 operands and 0 otherwise.
+        is_async (bool): Whether the operation is asynchronous. Defaults to False.
+
+    Returns:
+        tensor or warpgroup_mma_accumulator: The result tensor if synchronous, or an accumulator token to wait on if asynchronous.
+    """
+    use_acc_value = _semantic.to_tensor(use_acc)
+    precision_name = _semantic.builder.options.default_dot_input_precision if precision is None else precision
+    input_precision = _semantic._str_to_dot_input_precision(precision_name)
+
+    # The last dimension of `a` bounds how many imprecise accumulations may be requested for fp8 inputs.
+    K = a.type.shape[-1]
+    both_fp8 = a.dtype.is_fp8() and b.dtype.is_fp8()
+    if max_num_imprecise_acc is not None:
+        if both_fp8 and max_num_imprecise_acc > K:
+            raise ValueError(f"max_num_imprecise_acc ({max_num_imprecise_acc}) must be <= K ({K})")
+    elif both_fp8:
+        max_num_imprecise_acc = _semantic.builder.options.max_num_imprecise_acc_default
+    else:
+        max_num_imprecise_acc = 0
+
+    imprecise_limit = _core._unwrap_if_constexpr(max_num_imprecise_acc)
+    run_async = _core._unwrap_if_constexpr(is_async)
+
+    handle = _semantic.builder.create_warpgroup_mma(a.handle, b.handle, acc.handle, use_acc_value.handle,
+                                                    input_precision, imprecise_limit, run_async)
+    if isinstance(acc, warpgroup_mma_accumulator):
+        tensor_ty = acc.type.tensor_type
+    else:
+        tensor_ty = acc.type
+    if run_async:
+        return warpgroup_mma_accumulator(handle, tensor_ty)
+    return _core.tensor(handle, tensor_ty)
+
+
+@_core.builtin
+def warpgroup_mma_wait(num_outstanding=0, deps=None, _semantic=None):
+    """
+    Wait until `num_outstanding` or less warpgroup MMA operations are in-flight.
+
+    Args:
+        num_outstanding (int): Number of outstanding warpgroup MMA operations to wait for. Defaults to 0.
+        deps (Sequence[tensor]): Dependencies kept alive while the mma is unfinished. Required: None raises ValueError.
+
+    Returns:
+        The waited-on value when exactly one dependency is given, otherwise a tuple with one value per dependency.
+    """
+    if deps is None:
+        raise ValueError("warpgroup_mma_wait deps must be given")
+    num_outstanding = _core._unwrap_if_constexpr(num_outstanding)
+    waited = _semantic.builder.create_warpgroup_mma_wait([dep.handle for dep in deps], num_outstanding)
+    result_types = [dep.type.tensor_type if isinstance(dep, warpgroup_mma_accumulator) else dep.type for dep in deps]
+    values = unflatten_ir_values(waited, result_types)
+    return next(values) if len(deps) == 1 else tuple(values)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/hopper/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/hopper/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..06248a4359a4667988c3895a4452cd5049e0b4ca
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/hopper/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/hopper/__pycache__/mbarrier.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/hopper/__pycache__/mbarrier.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..863f1257bc582b5c1a27126125f6124d1eed3aa1
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/hopper/__pycache__/mbarrier.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/hopper/__pycache__/tma.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/hopper/__pycache__/tma.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f8b7703c642a9b35d9fea9f578e245d344ff69d2
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/hopper/__pycache__/tma.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/hopper/mbarrier.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/hopper/mbarrier.py
new file mode 100644
index 0000000000000000000000000000000000000000..93bf51ebadac0dfeaeb8a4bfec975b61ab35e90c
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/hopper/mbarrier.py
@@ -0,0 +1,34 @@
+from ..ampere.mbarrier import MBarrierLayout, init, invalidate, wait
+from ..._core import _unwrap_if_constexpr, builtin
+
+__all__ = ["arrive", "expect", "init", "invalidate", "MBarrierLayout", "wait"]
+
+
+@builtin
+def expect(mbarrier, bytes, pred=True, _semantic=None):
+    """
+    Announce how many bytes will be copied. Once they have been copied, the barrier is signaled.
+
+    Args:
+        mbarrier (shared_memory_descriptor): Barrier signaled when the operation is complete.
+        bytes (int): Number of bytes expected.
+        pred (bool): Scalar predicate; when False the operation is skipped. Defaults to True.
+    """
+    byte_count = _unwrap_if_constexpr(bytes)
+    pred_value = _semantic.to_tensor(pred)
+    _semantic.builder.create_mbarrier_expect(mbarrier.handle, byte_count, pred_value.handle)
+
+
+@builtin
+def arrive(mbarrier, *, count=1, pred=True, _semantic=None):
+    """
+    Perform an arrive on an mbarrier with the given count.
+
+    Args:
+        mbarrier (shared_memory_descriptor): Barrier to signal.
+        count (int): Count to arrive with. Defaults to 1.
+        pred (bool): Scalar predicate; when False the operation is skipped. Defaults to True.
+    """
+    arrive_count = _unwrap_if_constexpr(count)
+    pred_value = _semantic.to_tensor(pred)
+    _semantic.builder.create_mbarrier_arrive(mbarrier.handle, arrive_count, pred_value.handle)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/hopper/tma.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/hopper/tma.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc4ef3ace2958336ce0c0a5c6c5ce0240b5f3ccd
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/language/nvidia/hopper/tma.py
@@ -0,0 +1,169 @@
+from __future__ import annotations
+from typing import List, Tuple, TYPE_CHECKING
+from dataclasses import dataclass
+from triton.language.core import base_type, base_value
+import triton.experimental.gluon.language._core as ttgl
+from triton.experimental.gluon.language._layouts import NVMMASharedLayout
+from triton.experimental.gluon.language._core import builtin, _unwrap_if_constexpr
+
+if TYPE_CHECKING:
+    from triton._C import ir
+
+__all__ = ["async_copy_global_to_shared", "async_copy_shared_to_global", "store_wait"]
+
+
+@dataclass(eq=True)
+class tensor_descriptor_type(base_type):
+    """Type of a `tensor_descriptor`: block type, shape/strides tuple types and an NVMMASharedLayout."""
+
+    block_type: ttgl.block_type
+    shape_type: ttgl.tuple_type
+    strides_type: ttgl.tuple_type
+    layout: NVMMASharedLayout
+
+    def __str__(self) -> str:
+        """Render as ``tensor_descriptor<block_type, layout>``."""
+        return f"tensor_descriptor<{self.block_type}, {self.layout}>"
+
+    def _to_ir(self, builder: ir.builder) -> ir.type:
+        """Build the IR type of the descriptor handle alone (shape and strides are not included)."""
+        signed = self.block_type.element_ty.is_int_signed()
+        return builder.get_tensor_descriptor_layout_type(
+            self.block_type.to_ir(builder),
+            signed,
+            self.layout._to_ir(builder),
+        )
+
+    def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[tensor_descriptor, int]:
+        """Rebuild a descriptor from `handles` starting at `cursor`; also return the advanced cursor."""
+        desc_handle = handles[cursor]
+        shape, after_shape = self.shape_type._unflatten_ir(handles, cursor + 1)
+        strides, after_strides = self.strides_type._unflatten_ir(handles, after_shape)
+        rebuilt = tensor_descriptor(desc_handle, shape, strides, self.block_type, layout=self.layout)
+        return rebuilt, after_strides
+
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        """Append the descriptor IR type, then the shape and strides IR types, to `out`."""
+        out.append(self._to_ir(builder))
+        self.shape_type._flatten_ir_types(builder, out)
+        self.strides_type._flatten_ir_types(builder, out)
+
+    def mangle(self) -> str:
+        """Return the mangled name fragment for this type."""
+        return f"TD{self.block_type.mangle()}_{self.layout.mangle()}TD"
+
+
+class tensor_descriptor(base_value):
+    # Value pairing a descriptor IR handle with its shape and strides tuples and a tensor_descriptor_type.
+    def __init__(self, handle, shape: List[ttgl.tensor], strides: List[ttgl.tensor], block_type: ttgl.block_type,
+                 layout: NVMMASharedLayout):
+        self.handle = handle
+        self.shape = ttgl.tuple(shape)
+        self.strides = ttgl.tuple(strides)
+        self.type = tensor_descriptor_type(block_type, shape_type=self.shape.type, strides_type=self.strides.type,
+                                           layout=layout)
+
+    def _flatten_ir(self, handles: List[ir.value]) -> None:
+        handles.append(self.handle)  # handle first, then shape, then strides: the order _unflatten_ir reads back
+        self.shape._flatten_ir(handles)
+        self.strides._flatten_ir(handles)
+
+    @property
+    def block_type(self):  # block type stored on the descriptor's type
+        return self.type.block_type
+
+    @property
+    def block_shape(self):  # shape of the block type
+        return self.type.block_type.shape
+
+    @property
+    def dtype(self):  # element type of the block type
+        return self.type.block_type.element_ty
+
+    @property
+    def layout(self):  # layout stored on the descriptor's type
+        return self.type.layout
+
+
+@builtin
+def async_copy_global_to_shared(tensor_desc, coord, barrier, result, pred=True, _semantic=None):
+    coord = _semantic._convert_to_ir_values(coord, require_i64=False)  # coordinates become IR values (i64 not required)
+    pred = _semantic.to_tensor(pred)  # predicate is passed to the builder as a tensor handle
+    _semantic.builder.create_async_tma_copy_global_to_local(tensor_desc.handle, coord, barrier.handle, result.handle,
+                                                            pred.handle)
+
+
+@builtin
+def async_copy_shared_to_global(tensor_desc, coord, src, _semantic=None):  # emits the async TMA local-to-global copy
+    coord = _semantic._convert_to_ir_values(coord, require_i64=False)  # coordinates become IR values (i64 not required)
+    _semantic.builder.create_async_tma_copy_local_to_global(tensor_desc.handle, coord, src.handle)
+
+
+@builtin
+def store_wait(pendings, _semantic=None):  # emits an async TMA store wait with `pendings` as its argument
+    pendings = _unwrap_if_constexpr(pendings)  # the builder receives the plain value, not a constexpr wrapper
+    _semantic.builder.create_async_tma_store_wait(pendings)
+
+
+@builtin
+def make_tensor_descriptor(
+    base: ttgl.tensor,
+    shape: List[ttgl.tensor],
+    strides: List[ttgl.tensor],
+    block_shape: List[ttgl.constexpr],
+    layout: NVMMASharedLayout,
+    padding_option="zero",
+    _semantic=None,
+) -> tensor_descriptor:
+    """
+    Build a tensor descriptor over the memory pointed to by `base`.
+
+    Raises:
+        ValueError: If the rank is not in 1..5, `strides` or `block_shape` do not match the rank, the last block
+            dimension spans fewer than 16 bytes, the last stride is not 1, or `nan` padding is used with integers.
+    """
+    padding_option = _unwrap_if_constexpr(padding_option)
+    block_shape = _unwrap_if_constexpr(block_shape)
+
+    ndim = len(shape)
+    if not (1 <= ndim <= 5):
+        raise ValueError(f"Expected 1 <= ndim <= 5 but got {ndim} dimensions")
+    if len(strides) != ndim:
+        raise ValueError(f"Expected {ndim} strides but got {len(strides)}")
+    if len(block_shape) != ndim:
+        raise ValueError(f"Expected block_shape to have {ndim} dimensions but got {len(block_shape)}")
+    assert isinstance(base.dtype, ttgl.pointer_type)
+    elem_size = base.dtype.element_ty.primitive_bitwidth // 8
+    contig_dim_size = ttgl._unwrap_if_constexpr(block_shape[-1])
+    if contig_dim_size * elem_size < 16:
+        raise ValueError(
+            f"Descriptor block shape must have at least 16 bytes in the last dimension, but got {contig_dim_size} * {elem_size} = {contig_dim_size * elem_size} bytes"
+        )
+
+    last_stride = ttgl._unwrap_if_constexpr(strides[-1])
+    if last_stride != 1:
+        raise ValueError(f"Tensor descriptor last dim must be 1 but got {last_stride}")
+
+    shape = [_semantic.make_scalar(x, ttgl.int32) for x in shape]
+    strides = [_semantic.make_scalar(ttgl._unwrap_if_constexpr(x), ttgl.int64) for x in strides]
+    # Check whether `block_shape` is static
+    block_shape = ttgl._unwrap_shape(block_shape)
+    assert isinstance(base.type, ttgl.pointer_type)
+    block_type = ttgl.block_type(base.type.element_ty, block_shape)
+    padding = _semantic._str_to_padding_option(padding_option)
+    layout = _unwrap_if_constexpr(layout)
+    assert isinstance(layout, NVMMASharedLayout), "Expected layout to be a NVMMASharedLayout"
+
+    shape_type = ttgl.tuple(shape).type
+    strides_type = ttgl.tuple(strides).type
+    ty = tensor_descriptor_type(block_type, shape_type, strides_type, layout)
+    if base.type.element_ty.is_int() and padding == ttgl.ir.PADDING_OPTION.PAD_NAN:
+        raise ValueError("Padding option `nan` is not supported for integer blocks")
+    handle = _semantic.builder.create_make_tensor_descriptor(
+        ty._to_ir(_semantic.builder),
+        base.handle,
+        [s.handle for s in shape],
+        [s.handle for s in strides],
+        padding,
+    )
+    return tensor_descriptor(handle, shape, strides, block_type, layout)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/nvidia/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/nvidia/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8184c7388eaa11e018905df24982af333a9df6d5
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/nvidia/__init__.py
@@ -0,0 +1,4 @@
+from . import hopper
+from . import blackwell
+
+__all__ = ["hopper", "blackwell"]
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/nvidia/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/nvidia/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d53200b1d6a287d66ab3097d1872ab973544dab9
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/nvidia/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/nvidia/__pycache__/blackwell.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/nvidia/__pycache__/blackwell.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..524e6342c130ce4279699ea553e124d07b8a6a4e
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/nvidia/__pycache__/blackwell.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/nvidia/__pycache__/hopper.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/nvidia/__pycache__/hopper.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6fa257e6856d456c189aa6f3bc85ab840abc103d
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/nvidia/__pycache__/hopper.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/nvidia/blackwell.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/nvidia/blackwell.py
new file mode 100644
index 0000000000000000000000000000000000000000..abf919805191d9ebddbf416b3be95187fdf893cb
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/nvidia/blackwell.py
@@ -0,0 +1,3 @@
+from .hopper import TensorDescriptor
+
+__all__ = ["TensorDescriptor"]
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/nvidia/hopper.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/nvidia/hopper.py
new file mode 100644
index 0000000000000000000000000000000000000000..83bcfc55ce7f659ff38a3a360e4a846315d24835
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/experimental/gluon/nvidia/hopper.py
@@ -0,0 +1,47 @@
+from dataclasses import dataclass
+from typing import List, Any
+from triton._utils import validate_block_shape, canonicalize_dtype, get_primitive_bitwidth
+from triton.experimental.gluon.language._layouts import NVMMASharedLayout
+
+__all__ = ["TensorDescriptor"]
+
+
+@dataclass
+class TensorDescriptor:
+    """Describes a tensor (base, shape, strides) plus a block shape and layout; validated on construction."""
+
+    base: Any
+    shape: List[int]
+    strides: List[int]
+    block_shape: List[int]
+    layout: NVMMASharedLayout
+    padding: str = "zero"
+
+    def __post_init__(self):
+        ndim = len(self.shape)
+        assert len(self.strides) == ndim, f"rank mismatch: {self}"
+        assert len(self.block_shape) == ndim, f"rank mismatch: {self}"
+        assert ndim > 0, "rank must not be zero"
+        assert ndim <= 5, "rank cannot be more than 5"
+        assert self.base.data_ptr() % 16 == 0, "base must be 16-byte aligned"
+        validate_block_shape(self.block_shape)
+        itemsize = get_primitive_bitwidth(canonicalize_dtype(self.base.dtype)) // 8
+        assert all((s * itemsize) % 16 == 0 for s in self.strides[:-1]), "strides must be 16-byte aligned"
+        assert all(extent > 0 for extent in self.shape), "shape must be positive"
+        assert self.strides[-1] == 1, "Last dimension must be contiguous"
+        assert isinstance(self.layout, NVMMASharedLayout), "Layout must be NVMMASharedLayout"
+        assert self.padding in ("zero", "nan"), "Illegal value for padding"
+        if self.padding == "nan":
+            assert self.base.dtype.is_floating_point, "Padding option `nan` is only supported for floating point tensors"
+
+    @staticmethod
+    def from_tensor(tensor: Any, block_shape: List[int], layout: NVMMASharedLayout, padding="zero"):
+        """Build a descriptor from `tensor`, taking its `shape` and `stride()` as-is."""
+        return TensorDescriptor(
+            base=tensor,
+            shape=tensor.shape,
+            strides=tensor.stride(),
+            block_shape=block_shape,
+            layout=layout,
+            padding=padding,
+        )
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..04d548c9a5d2d8d29af0b6e7f47ceeb4af40ba86
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/__init__.py
@@ -0,0 +1,350 @@
+"""isort:skip_file"""
+# Import order is significant here.
+
+from . import math
+from . import extra
+from .standard import (
+    argmax,
+    argmin,
+    bitonic_merge,
+    cdiv,
+    cumprod,
+    cumsum,
+    flip,
+    interleave,
+    max,
+    min,
+    ravel,
+    reduce_or,
+    sigmoid,
+    softmax,
+    sort,
+    sum,
+    swizzle2d,
+    topk,
+    xor_sum,
+    zeros,
+    zeros_like,
+)
+from .core import (
+    PropagateNan,
+    TRITON_MAX_TENSOR_NUMEL,
+    load_tensor_descriptor,
+    store_tensor_descriptor,
+    make_tensor_descriptor,
+    tensor_descriptor,
+    tensor_descriptor_type,
+    add,
+    advance,
+    arange,
+    associative_scan,
+    assume,
+    atomic_add,
+    atomic_and,
+    atomic_cas,
+    atomic_max,
+    atomic_min,
+    atomic_or,
+    atomic_xchg,
+    atomic_xor,
+    bfloat16,
+    block_type,
+    broadcast,
+    broadcast_to,
+    cat,
+    cast,
+    clamp,
+    condition,
+    const,
+    constexpr,
+    constexpr_type,
+    debug_barrier,
+    device_assert,
+    device_print,
+    dot,
+    dot_scaled,
+    dtype,
+    expand_dims,
+    float16,
+    float32,
+    float64,
+    float8e4b15,
+    float8e4nv,
+    float8e4b8,
+    float8e5,
+    float8e5b16,
+    full,
+    gather,
+    histogram,
+    inline_asm_elementwise,
+    int1,
+    int16,
+    int32,
+    int64,
+    int8,
+    join,
+    load,
+    make_block_ptr,
+    map_elementwise,
+    max_constancy,
+    max_contiguous,
+    maximum,
+    minimum,
+    mul,
+    multiple_of,
+    num_programs,
+    permute,
+    pi32_t,
+    pointer_type,
+    program_id,
+    range,
+    reduce,
+    reshape,
+    slice,
+    split,
+    static_assert,
+    static_print,
+    static_range,
+    store,
+    sub,
+    tensor,
+    trans,
+    tuple,
+    tuple_type,
+    uint16,
+    uint32,
+    uint64,
+    uint8,
+    view,
+    void,
+    where,
+)
+from .math import (umulhi, exp, exp2, fma, log, log2, cos, rsqrt, sin, sqrt, sqrt_rn, abs, fdiv, div_rn, erf, floor,
+                   ceil)
+from .random import (
+    pair_uniform_to_normal,
+    philox,
+    philox_impl,
+    rand,
+    rand4x,
+    randint,
+    randint4x,
+    randn,
+    randn4x,
+    uint_to_uniform_float,
+)
+from . import target_info
+
+# Names exported by a star-import of this module (`from <module> import *`).
+# Each entry must name something bound at module level.
+__all__ = [
+    "PropagateNan",
+    "TRITON_MAX_TENSOR_NUMEL",
+    "load_tensor_descriptor",
+    "store_tensor_descriptor",
+    "make_tensor_descriptor",
+    "tensor_descriptor",
+    "abs",
+    "add",
+    "advance",
+    "arange",
+    "argmax",
+    "argmin",
+    "associative_scan",
+    "assume",
+    "atomic_add",
+    "atomic_and",
+    "atomic_cas",
+    "atomic_max",
+    "atomic_min",
+    "atomic_or",
+    "atomic_xchg",
+    "atomic_xor",
+    "bfloat16",
+    "bitonic_merge",
+    "block_type",
+    "broadcast",
+    "broadcast_to",
+    "cat",
+    "cast",
+    "cdiv",
+    "ceil",
+    "clamp",
+    "condition",
+    "const",
+    "constexpr",
+    "constexpr_type",
+    "cos",
+    "cumprod",
+    "cumsum",
+    "debug_barrier",
+    "device_assert",
+    "device_print",
+    "div_rn",
+    "dot",
+    "dot_scaled",
+    "dtype",
+    "erf",
+    "exp",
+    "exp2",
+    "expand_dims",
+    "extra",
+    "fdiv",
+    "flip",
+    "float16",
+    "float32",
+    "float64",
+    "float8e4b15",
+    "float8e4nv",
+    "float8e4b8",
+    "float8e5",
+    "float8e5b16",
+    "floor",
+    "fma",
+    "full",
+    "gather",
+    "histogram",
+    "inline_asm_elementwise",
+    "interleave",
+    "int1",
+    "int16",
+    "int32",
+    "int64",
+    "int8",
+    "join",
+    "load",
+    "log",
+    "log2",
+    "make_block_ptr",
+    "map_elementwise",
+    "math",
+    "max",
+    "max_constancy",
+    "max_contiguous",
+    "maximum",
+    "min",
+    "minimum",
+    "mul",
+    "multiple_of",
+    "num_programs",
+    "pair_uniform_to_normal",
+    "permute",
+    "philox",
+    "philox_impl",
+    "pi32_t",
+    "pointer_type",
+    "program_id",
+    "rand",
+    "rand4x",
+    "randint",
+    "randint4x",
+    "randn",
+    "randn4x",
+    "range",
+    "ravel",
+    "reduce",
+    "reduce_or",
+    "reshape",
+    "rsqrt",
+    "slice",
+    "sigmoid",
+    "sin",
+    "softmax",
+    "sort",
+    "split",
+    "sqrt",
+    "sqrt_rn",
+    "static_assert",
+    "static_print",
+    "static_range",
+    "store",
+    "sub",
+    "sum",
+    "swizzle2d",
+    "target_info",
+    "tensor",
+    "topk",
+    "trans",
+    "tuple",
+    "uint16",
+    "uint32",
+    "uint64",
+    "uint8",
+    "uint_to_uniform_float",
+    "umulhi",
+    "view",
+    "void",
+    "where",
+    "xor_sum",
+    "zeros",
+    "zeros_like",
+]
+
+
+def str_to_ty(name, c):
+    """Convert a signature type description into a frontend type object.
+
+    :param name: either a builtin ``tuple`` (converted element-wise into a
+        ``tuple_type``; a ``_fields`` attribute on the tuple's class, as
+        namedtuples have, is forwarded as the field names) or a string:
+
+        * ``"*<ty>"`` / ``"*k<ty>"`` -- pointer / const pointer to ``<ty>``
+        * ``"tensordesc<dtype[d0,d1,...]>"`` optionally followed by
+          ``,<layout expression>`` before the closing ``>``
+        * anything starting with ``"constexpr"``
+        * one of the scalar abbreviations in the table at the end
+    :param c: value wrapped into the ``constexpr_type`` for constexpr names;
+        it is also passed down unchanged to tuple elements and pointees.
+    :return: the matching type object.
+    :raises KeyError: for a string that matches none of the forms above.
+    """
+    import builtins
+
+    # The module-level name `tuple` is the Triton tuple, so the builtin one is
+    # referenced explicitly.
+    if isinstance(name, builtins.tuple):
+        field_names = type(name).__dict__.get("_fields", None)
+        element_types = [str_to_ty(item, c) for item in name]
+        return tuple_type(element_types, field_names)
+
+    if name[0] == "*":
+        pointee = name[1:]
+        # A "k" right after the star marks a pointer to const.
+        is_const = pointee[0] == "k"
+        if is_const:
+            pointee = pointee[1:]
+        return pointer_type(element_ty=str_to_ty(pointee, c), const=is_const)
+
+    if name.startswith("tensordesc"):
+        payload = name.split("<")[1].rstrip(">")
+        elem_name, tail = payload.split("[", maxsplit=1)
+        dims_text, tail = tail.split("]", maxsplit=1)
+        dims = [int(dim.strip()) for dim in dims_text.rstrip("]").split(",")]
+        # Whatever follows "],": empty for a plain descriptor, a layout
+        # expression otherwise.
+        layout_src = tail.lstrip(",")
+        elem_ty = str_to_ty(elem_name, None)
+        rank = len(dims)
+        shape_ty = tuple_type([int32] * rank)
+        # FIXME: Last dim stride should be constexpr(1)
+        strides_ty = tuple_type(([int64] * rank))
+        block_ty = block_type(elem_ty, dims)
+        if not layout_src:
+            return tensor_descriptor_type(block_ty, shape_ty, strides_ty)
+
+        from triton.experimental.gluon.language._layouts import NVMMASharedLayout, PaddedSharedLayout, SwizzledSharedLayout
+        from triton.experimental.gluon.language.nvidia.hopper.tma import tensor_descriptor_type as nvidia_tensor_descriptor_type
+        from triton.experimental.gluon.language.amd.gfx1250.tdm import tensor_descriptor_type as amd_tensor_descriptor_type
+        # NOTE(review): `eval` executes the layout text taken from the type
+        # string with the three layout classes as globals -- confirm these
+        # strings can never originate from untrusted input.
+        layout = eval(
+            layout_src,
+            dict(NVMMASharedLayout=NVMMASharedLayout, PaddedSharedLayout=PaddedSharedLayout,
+                 SwizzledSharedLayout=SwizzledSharedLayout))
+        if isinstance(layout, NVMMASharedLayout):
+            return nvidia_tensor_descriptor_type(block_ty, shape_ty, strides_ty, layout)
+        return amd_tensor_descriptor_type(block_ty, shape_ty, strides_ty, layout)
+
+    if name.startswith("constexpr"):
+        return constexpr_type(c)
+
+    # Scalar abbreviations; "i1", "u1" and "B" all map to int1.
+    scalar_types = {
+        "fp8e4nv": float8e4nv,
+        "fp8e4b8": float8e4b8,
+        "fp8e5": float8e5,
+        "fp8e5b16": float8e5b16,
+        "fp8e4b15": float8e4b15,
+        "fp16": float16,
+        "bf16": bfloat16,
+        "fp32": float32,
+        "fp64": float64,
+        "i1": int1,
+        "i8": int8,
+        "i16": int16,
+        "i32": int32,
+        "i64": int64,
+        "u1": int1,
+        "u8": uint8,
+        "u16": uint16,
+        "u32": uint32,
+        "u64": uint64,
+        "B": int1,
+    }
+    return scalar_types[name]
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ebe9f2a80bec6b7d1ab2cef0fe7d5693d52e2e81
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/__pycache__/math.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/__pycache__/math.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e557d5297a011323743e9c42d32e2f7b0b37612b
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/__pycache__/math.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/__pycache__/random.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/__pycache__/random.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c2ab7acc1a910599938e8f9b140576a8c0e4ece9
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/__pycache__/random.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/__pycache__/standard.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/__pycache__/standard.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a4d21766023402fa05860d801976003462800003
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/__pycache__/standard.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/__pycache__/target_info.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/__pycache__/target_info.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a7a6a3fe1fae638affb29a592122c39b4385936d
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/__pycache__/target_info.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/core.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/core.py
new file mode 100644
index 0000000000000000000000000000000000000000..29f6a367f3680162dbba9dc6e8bededdf03d12fe
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/core.py
@@ -0,0 +1,3490 @@
+from __future__ import annotations
+
+import math
+from warnings import warn
+from contextlib import contextmanager
+from enum import Enum
+from functools import partial, wraps
+import typing
+from typing import Union, Callable, List, Sequence, TypeVar, Optional, Tuple
+from dataclasses import dataclass
+import builtins
+from .. import knobs
+from ..runtime.jit import JITCallable
+import inspect
+
+from .._C.libtriton import ir
+from .._utils import TRITON_MAX_TENSOR_NUMEL, validate_block_shape, get_primitive_bitwidth
+
+# Generic type variable used to annotate decorators that hand back their argument.
+T = TypeVar('T')
+
+# Attribute name set to True on wrappers created by `builtin` and read back by `is_builtin`.
+TRITON_BUILTIN = "__triton_builtin__"
+
+# Module-level alias for `PROPAGATE_NAN` from the `ir` extension module.
+PropagateNan = ir.PROPAGATE_NAN
+
+
+def must_use_result(x, s=True):
+    """Tag ``x`` with a ``_must_use_result`` attribute and return it.
+
+    Two call forms are supported:
+
+    * ``must_use_result(fn)`` stores ``s`` (``True`` by default) on ``fn``.
+    * ``must_use_result("text")`` returns a decorator that stores the string
+      on whatever it is applied to.
+
+    This function only records the marker; presumably the compiler reads it
+    to report an error when the call result is discarded (not visible here).
+    """
+    if not isinstance(x, str):
+        x._must_use_result = s
+        return x
+
+    def decorate(fn):
+        return must_use_result(fn, x)
+
+    return decorate
+
+
+def builtin(fn: T) -> T:
+    """Mark a function as a builtin.
+
+    Returns a wrapper around ``fn`` that carries the ``TRITON_BUILTIN``
+    attribute and refuses to run unless a non-``None`` ``_semantic`` keyword
+    argument is supplied.
+
+    :raises ValueError: when the wrapper is called without ``_semantic``.
+    """
+    assert callable(fn)
+
+    @wraps(fn)
+    def wrapper(*args, **kwargs):
+        # A missing key and an explicit None are treated the same way.
+        if kwargs.get("_semantic") is None:
+            raise ValueError("Did you forget to add @triton.jit ? "
+                             "(`_semantic` argument must be provided outside of JIT functions.)")
+        return fn(*args, **kwargs)
+
+    setattr(wrapper, TRITON_BUILTIN, True)
+    return wrapper
+
+
+def _tensor_member_fn(fn: T) -> T:
+    """Decorator that also installs the free function ``fn`` as a method of class tensor.
+
+    When invoked as a method, the tensor object arrives as the first
+    argument of ``fn`` (i.e. ``self``).
+
+    With several decorators stacked on a function, this one should normally
+    be the outermost (furthest from the ``def``) so that it is applied last.
+
+    A type stub in the body of class tensor is still needed for pytype to
+    know about the method.
+
+    Side effects: appends a note to ``fn.__doc__`` and sets an attribute named
+    ``fn.__name__`` on ``tensor``.  ``fn`` itself is returned.
+    """
+    assert callable(fn)
+    orig_sig = inspect.signature(fn)
+    # True when fn accepts something besides the tensor itself, _semantic and _generator.
+    user_params = orig_sig.parameters.keys() - {"_semantic", "_generator"}
+    has_args = len(user_params) > 1
+
+    existing_doc = fn.__doc__ if fn.__doc__ else ""
+    fn.__doc__ = existing_doc + f"""
+    This function can also be called as a member function on :py:class:`tensor`,
+    as :code:`x.{fn.__name__}({"..." if has_args else ""})` instead of
+    :code:`{fn.__name__}(x{", ..." if has_args else ""})`.
+    """
+
+    def wrapper(*args, **kwargs):
+        return fn(*args, **kwargs)
+
+    # Present the same signature as `fn`, except that the first parameter is
+    # shown as `self` so the generated docs read naturally.
+    member_params = list(orig_sig.parameters.values())
+    member_params[0] = member_params[0].replace(name='self')
+    wrapper.__signature__ = orig_sig.replace(parameters=member_params)
+    wrapper.__doc__ = f"Forwards to :py:func:`{fn.__name__}` free function"
+    # Carry the builtin marker over to the wrapper.
+    if is_builtin(fn):
+        setattr(wrapper, TRITON_BUILTIN, True)
+
+    # JITCallable objects are attached directly; everything else goes through the wrapper.
+    if isinstance(fn, JITCallable):
+        member = fn
+    else:
+        member = wrapper
+    setattr(tensor, fn.__name__, member)
+    return fn
+
+
+def _unwrap_iterable(x):
+    """Return ``x[0]`` when ``x`` holds exactly one element and that element is iterable; otherwise ``x``."""
+    if len(x) != 1:
+        return x
+
+    # Iterability is probed with iter() rather than collections.abc.Iterable.
+    # abc.Iterable inspects the *class* for __iter__, which cannot work for
+    # constexpr: constexpr should count as iterable only when the *object* it
+    # wraps (self.value) is iterable, so no class-level answer is correct
+    # (short of metaclass tricks).
+    try:
+        only = x[0]
+        iter(only)
+    except TypeError:
+        return x
+    return only
+
+
+def is_builtin(fn) -> bool:
+    """Report whether ``fn`` carries the ``TRITON_BUILTIN`` marker attribute.
+
+    Returns the attribute's value when present and ``False`` otherwise.
+    """
+    marker = getattr(fn, TRITON_BUILTIN, False)
+    return marker
+
+
+@builtin
+def to_tensor(x, _semantic=None):
+    """Convert ``x`` by delegating to ``_semantic.to_tensor`` and return its result."""
+    converted = _semantic.to_tensor(x)
+    return converted
+
+
+# -----------------------
+# constexpr
+# -----------------------
+
+
+class const:
+    """
+    Type-annotation marker for pointers to constant data.
+
+    A pointer to const cannot be passed to `store`. Constness belongs to the
+    pointer type, so the usual Triton type-consistency rules apply: for
+    example, a function may not return a constant pointer from one return
+    statement and a non-constant pointer from another.
+    """
+
+
+class base_value:
+    """Common base for values that live in the triton IR (i.e. not constexprs)."""
+    # Frontend type of the value; assigned by subclasses.
+    type: base_type
+
+    def _flatten_ir(self, handles: List[ir.value]) -> None:
+        """Append the mlir handles that make up this frontend value to ``handles``.
+
+        Subclasses must override this; the base implementation always raises.
+        """
+        raise NotImplementedError()
+
+
+class base_type:
+    """Abstract root of the frontend type hierarchy.
+
+    Every method except ``__ne__`` raises ``NotImplementedError`` here and is
+    meant to be supplied by subclasses.
+    """
+
+    def __eq__(self, other) -> bool:
+        raise NotImplementedError("Types must implement __eq__")
+
+    def __ne__(self, other) -> bool:
+        # Defined through == so subclasses only have to provide __eq__.
+        same = self == other
+        return not same
+
+    def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[base_value, int]:
+        """Wrap existing handles into a frontend value of this type.
+
+        ``cursor`` is the index of the first handle that belongs to the value.
+        Implementations return the value together with the cursor position
+        after the handles they consumed.
+        """
+        raise NotImplementedError()
+
+    def mangle(self) -> str:
+        """Return the mangled name of the type; not implemented on the base class."""
+        raise NotImplementedError(f"NYI: Type mangling for type {self.__class__}")
+
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        """Append the IR types for this frontend type to ``out``; not implemented here."""
+        raise NotImplementedError()
+
+
+class constexpr_type(base_type):
+    """Type of a compile-time constant; the constant itself is kept in ``value``.
+
+    Equality and hashing are based on ``value``; the mangled name is the repr.
+    The type occupies no IR types or handles.
+    """
+
+    def __init__(self, value):
+        self.value = value
+
+    def __repr__(self) -> str:
+        return f"constexpr_type[{self.value}]"
+
+    def __eq__(self, other):
+        if not isinstance(other, constexpr_type):
+            return False
+        return self.value == other.value
+
+    def __hash__(self):
+        return hash(self.value)
+
+    def mangle(self) -> str:
+        return repr(self)
+
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        # Nothing to append: a constexpr has no IR type.
+        return None
+
+    def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[base_value, int]:
+        # No handle is consumed, so the cursor is returned unchanged.
+        rebuilt = constexpr(self.value)
+        return rebuilt, cursor
+
+
+class constexpr(base_value):
+    """
+    This class is used to store a value that is known at compile-time.
+
+    The wrapped Python object is kept in ``self.value`` and ``self.type`` is
+    the matching :class:`constexpr_type`.  The arithmetic, comparison, bitwise
+    and logical operators below unwrap a constexpr operand, apply the plain
+    Python operator to the underlying values and wrap the result in a new
+    ``constexpr``.  ``__bool__``, ``__index__``, ``__iter__``, ``__call__`` and
+    ``__getitem__`` forward to the wrapped value and return its raw result.
+    """
+
+    def __init__(self, value):
+        # Collapse nested wrappers so that self.value is never itself a constexpr.
+        while isinstance(value, constexpr):
+            value = value.value
+        self.value = value
+        self.type = constexpr_type(value)
+
+    def __repr__(self) -> str:
+        return f"constexpr[{self.value}]"
+
+    def __hash__(self):
+        return hash((self.value, self.type))
+
+    def _flatten_ir(self, handles: List[ir.value]) -> None:
+        # A constexpr contributes no IR handles.
+        return
+
+    def __index__(self):
+        return self.value
+
+    # In interpreter mode, constant values are not wrapped in constexpr,
+    # and therefore do not have a .value attribute.
+    # As a result, from here and below, we need to call the _unwrap_if_constexpr
+    # function to obtain either constexpr.value or the value itself.
+    def __add__(self, other):
+        return constexpr(self.value + _unwrap_if_constexpr(other))
+
+    def __radd__(self, other):
+        return constexpr(_unwrap_if_constexpr(other) + self.value)
+
+    def __sub__(self, other):
+        return constexpr(self.value - _unwrap_if_constexpr(other))
+
+    def __rsub__(self, other):
+        return constexpr(_unwrap_if_constexpr(other) - self.value)
+
+    def __mul__(self, other):
+        return constexpr(self.value * _unwrap_if_constexpr(other))
+
+    def __mod__(self, other):
+        return constexpr(self.value % _unwrap_if_constexpr(other))
+
+    def __rmul__(self, other):
+        return constexpr(_unwrap_if_constexpr(other) * self.value)
+
+    def __truediv__(self, other):
+        return constexpr(self.value / _unwrap_if_constexpr(other))
+
+    def __rtruediv__(self, other):
+        return constexpr(_unwrap_if_constexpr(other) / self.value)
+
+    def __floordiv__(self, other):
+        return constexpr(self.value // _unwrap_if_constexpr(other))
+
+    def __rfloordiv__(self, other):
+        return constexpr(_unwrap_if_constexpr(other) // self.value)
+
+    # Comparisons return a constexpr wrapping the result, not a plain bool.
+    # NOTE(review): Python never calls __rgt__/__rge__/__rlt__/__rle__ itself
+    # (a reflected comparison uses the mirrored operator instead); presumably
+    # these are looked up by name elsewhere -- verify against callers.
+    def __gt__(self, other):
+        return constexpr(self.value > _unwrap_if_constexpr(other))
+
+    def __rgt__(self, other):
+        return constexpr(_unwrap_if_constexpr(other) > self.value)
+
+    def __ge__(self, other):
+        return constexpr(self.value >= _unwrap_if_constexpr(other))
+
+    def __rge__(self, other):
+        return constexpr(_unwrap_if_constexpr(other) >= self.value)
+
+    def __lt__(self, other):
+        return constexpr(self.value < _unwrap_if_constexpr(other))
+
+    def __rlt__(self, other):
+        return constexpr(_unwrap_if_constexpr(other) < self.value)
+
+    def __le__(self, other):
+        return constexpr(self.value <= _unwrap_if_constexpr(other))
+
+    def __rle__(self, other):
+        return constexpr(_unwrap_if_constexpr(other) <= self.value)
+
+    def __eq__(self, other):
+        return constexpr(self.value == _unwrap_if_constexpr(other))
+
+    def __ne__(self, other):
+        return constexpr(self.value != _unwrap_if_constexpr(other))
+
+    def __bool__(self):
+        return bool(self.value)
+
+    def __neg__(self):
+        return constexpr(-self.value)
+
+    def __and__(self, other):
+        return constexpr(self.value & _unwrap_if_constexpr(other))
+
+    # logical_and / logical_or use Python's short-circuiting `and` / `or`, so
+    # the wrapped result is one of the two operands, not necessarily a bool.
+    def logical_and(self, other):
+        return constexpr(self.value and _unwrap_if_constexpr(other))
+
+    def __or__(self, other):
+        return constexpr(self.value | _unwrap_if_constexpr(other))
+
+    def __xor__(self, other):
+        return constexpr(self.value ^ _unwrap_if_constexpr(other))
+
+    def logical_or(self, other):
+        return constexpr(self.value or _unwrap_if_constexpr(other))
+
+    def __pos__(self):
+        return constexpr(+self.value)
+
+    def __invert__(self):
+        return constexpr(~self.value)
+
+    def __pow__(self, other):
+        return constexpr(self.value**_unwrap_if_constexpr(other))
+
+    def __rpow__(self, other):
+        return constexpr(_unwrap_if_constexpr(other)**self.value)
+
+    def __rshift__(self, other):
+        return constexpr(self.value >> _unwrap_if_constexpr(other))
+
+    def __lshift__(self, other):
+        return constexpr(self.value << _unwrap_if_constexpr(other))
+
+    # NOTE(review): `__not__` is not a hook Python's `not` operator uses (that
+    # goes through __bool__); presumably it is called by name -- verify.
+    def __not__(self):
+        return constexpr(not self.value)
+
+    def __iter__(self):
+        return iter(self.value)
+
+    def __call__(self, *args, **kwds):
+        return self.value(*args, **kwds)
+
+    def __getitem__(self, *args):
+        # Unwrap any constexpr inside the index before forwarding it to the wrapped value.
+        args = (_unwrap_if_constexpr(x) for x in _normalize_tuple(args))
+        return self.value.__getitem__(*args)
+
+
+# Module-level constexpr instance wrapping the integer 0.
+CONSTEXPR_0 = constexpr(0)
+
+
+def _unwrap_if_constexpr(o):
+    """Return ``o`` with constexpr wrappers removed.
+
+    A constexpr yields its ``.value``.  A list, a ``builtins.tuple`` or an
+    instance of this module's own ``tuple`` is rebuilt as a new container of
+    the same kind with every element unwrapped recursively.  Any other object
+    is returned unchanged.
+    """
+    # `tuple` is this module's name, checked separately from builtins.tuple.
+    for container in (list, builtins.tuple, tuple):
+        if isinstance(o, container):
+            return container(_unwrap_if_constexpr(item) for item in o)
+    if isinstance(o, constexpr):
+        return o.value
+    return o
+
+
+def _normalize_tuple(t):
+    """Unwrap constexprs in ``t`` and pass a list / ``builtins.tuple`` result through ``tuple(...)``.
+
+    Anything else is returned as produced by ``_unwrap_if_constexpr``.
+    """
+    unwrapped = _unwrap_if_constexpr(t)
+    if not isinstance(unwrapped, (list, builtins.tuple)):
+        return unwrapped
+    return tuple(unwrapped)
+
+
+def check_bit_width(value, shift_value):
+    """Warn when a constexpr shift amount reaches the bit width of a tensor's scalar type.
+
+    The check only applies when ``value`` is a ``tensor`` and ``shift_value``
+    is a ``constexpr``; every other combination is ignored.
+    """
+    if not (isinstance(value, tensor) and isinstance(shift_value, constexpr)):
+        return
+    bitwidth = value.type.scalar.primitive_bitwidth
+    if shift_value.value < bitwidth:
+        return
+    warn(
+        f"Value {shift_value.value} exceeds the maximum bitwidth ({bitwidth}) for type '{value.dtype}'. This may result in undefined behavior."
+    )
+
+
+# -----------------------
+# dtype
+# -----------------------
+
+
+class dtype(base_type):
+    """Scalar element type identified by a short name such as ``'int32'`` or ``'fp16'``.
+
+    The accepted names are the union of ``SINT_TYPES``, ``UINT_TYPES``,
+    ``FP_TYPES`` and ``OTHER_TYPES``.  Note that ``'int1'`` is listed among the
+    unsigned integer types.  Equality and hashing are based on ``name`` only.
+
+    Attributes set by ``__init__``:
+
+    * ``name``, ``primitive_bitwidth``, ``itemsize`` (``primitive_bitwidth // 8``)
+    * integers only: ``int_signedness``, ``int_bitwidth``
+    * floating point only: ``fp_mantissa_width``, ``exponent_bias``
+    """
+    SINT_TYPES = ['int8', 'int16', 'int32', 'int64']
+    UINT_TYPES = ['int1', 'uint8', 'uint16', 'uint32', 'uint64']
+    FP_TYPES = ['fp8e4b15', 'fp8e4nv', 'fp8e4b8', 'fp8e5', 'fp8e5b16', 'fp16', 'bf16', 'fp32', 'fp64']
+    STANDARD_FP_TYPES = ['fp16', 'bf16', 'fp32', 'fp64']
+    OTHER_TYPES = ['void']
+
+    class SIGNEDNESS(Enum):
+        SIGNED = 0
+        UNSIGNED = 1
+
+    class KIND(Enum):
+        BOOLEAN = 0
+        INTEGRAL = 1
+        FLOATING = 2
+
+    def __init__(self, name):
+        """Create the dtype called ``name`` (a string or a constexpr wrapping one)."""
+        name = _unwrap_if_constexpr(name)
+        self.name = name
+        assert name in dtype.SINT_TYPES + dtype.UINT_TYPES + dtype.FP_TYPES + dtype.OTHER_TYPES, name
+        self.primitive_bitwidth = get_primitive_bitwidth(name)
+        self.itemsize = self.primitive_bitwidth // 8
+        if name in dtype.SINT_TYPES:
+            self.int_signedness = dtype.SIGNEDNESS.SIGNED
+            self.int_bitwidth = self.primitive_bitwidth
+        elif name in dtype.UINT_TYPES:
+            self.int_signedness = dtype.SIGNEDNESS.UNSIGNED
+            self.int_bitwidth = self.primitive_bitwidth
+        elif name in dtype.FP_TYPES:
+            # Mantissa width and exponent bias for each floating-point format.
+            if name == 'fp8e4b15':
+                self.fp_mantissa_width = 3
+                self.exponent_bias = 15
+            elif name == 'fp8e4nv':
+                self.fp_mantissa_width = 3
+                self.exponent_bias = 7
+            elif name == 'fp8e4b8':
+                self.fp_mantissa_width = 3
+                self.exponent_bias = 8
+            elif name == 'fp8e5':
+                self.fp_mantissa_width = 2
+                self.exponent_bias = 15
+            elif name == 'fp8e5b16':
+                self.fp_mantissa_width = 2
+                self.exponent_bias = 16
+            elif name == 'fp16':
+                self.fp_mantissa_width = 10
+                self.exponent_bias = 15
+            elif name == 'bf16':
+                self.fp_mantissa_width = 7
+                self.exponent_bias = 127
+            elif name == 'fp32':
+                self.fp_mantissa_width = 23
+                self.exponent_bias = 127
+            elif name == 'fp64':
+                self.fp_mantissa_width = 52
+                self.exponent_bias = 1023
+            else:
+                raise RuntimeError(f'Unsupported floating-point type {name}')
+
+    # ---- name predicates -------------------------------------------------
+
+    def is_fp8(self):
+        return 'fp8' in self.name
+
+    def is_fp8e4nv(self):
+        return self.name == 'fp8e4nv'
+
+    def is_fp8e4b8(self):
+        return self.name == 'fp8e4b8'
+
+    def is_fp8e4b15(self):
+        return self.name == 'fp8e4b15'
+
+    def is_fp8e5(self):
+        return self.name == 'fp8e5'
+
+    def is_fp8e5b16(self):
+        return self.name == 'fp8e5b16'
+
+    def is_fp16(self):
+        return self.name == 'fp16'
+
+    def is_bf16(self):
+        return self.name == 'bf16'
+
+    def is_fp32(self):
+        return self.name == 'fp32'
+
+    def is_fp64(self):
+        return self.name == 'fp64'
+
+    def is_int1(self):
+        return self.name == 'int1'
+
+    def is_int8(self):
+        return self.name == 'int8'
+
+    def is_int16(self):
+        return self.name == 'int16'
+
+    def is_int32(self):
+        return self.name == 'int32'
+
+    def is_int64(self):
+        return self.name == 'int64'
+
+    def is_uint8(self):
+        return self.name == 'uint8'
+
+    def is_uint16(self):
+        return self.name == 'uint16'
+
+    def is_uint32(self):
+        return self.name == 'uint32'
+
+    def is_uint64(self):
+        return self.name == 'uint64'
+
+    def is_floating(self):
+        return self.name in dtype.FP_TYPES
+
+    def is_standard_floating(self):
+        return self.name in dtype.STANDARD_FP_TYPES
+
+    def is_int_signed(self):
+        return self.name in dtype.SINT_TYPES
+
+    def is_int_unsigned(self):
+        return self.name in dtype.UINT_TYPES
+
+    def is_int(self):
+        return self.name in dtype.SINT_TYPES + dtype.UINT_TYPES
+
+    def is_bool(self):
+        return self.is_int1()
+
+    def kind(self):
+        """Return the ``dtype.KIND`` member for this type (ordering: bool < integer < fp).
+
+        Asserts that the type is floating point when it is neither bool nor integer.
+        """
+        if self.is_bool():
+            return dtype.KIND.BOOLEAN
+        elif self.is_int():
+            return dtype.KIND.INTEGRAL
+        else:
+            assert self.is_floating()
+            return dtype.KIND.FLOATING
+
+    def get_int_max_value(self):
+        """Largest value representable by this integer type; asserts for non-integers."""
+        if self.is_int_signed():
+            return 2**(self.int_bitwidth - 1) - 1
+        if self.is_int_unsigned():
+            return 2**self.int_bitwidth - 1
+        assert False
+
+    def get_int_min_value(self):
+        """Smallest value representable by this integer type; asserts for non-integers."""
+        if self.is_int_signed():
+            return -2**(self.int_bitwidth - 1)
+        if self.is_int_unsigned():
+            return 0
+        assert False
+
+    @staticmethod
+    def is_dtype(type_str):
+        """Return True when ``type_str`` is one of the accepted dtype names."""
+        return type_str in dtype.SINT_TYPES + dtype.UINT_TYPES + dtype.FP_TYPES + dtype.OTHER_TYPES
+
+    @staticmethod
+    def is_void():
+        # Unimplemented stub: calling it always raises.
+        raise RuntimeError("Not implemented")
+
+    @staticmethod
+    def is_block():
+        return False
+
+    @staticmethod
+    def is_ptr():
+        return False
+
+    @staticmethod
+    def is_const():
+        return False
+
+    def __eq__(self, other) -> bool:
+        other = _unwrap_if_constexpr(other)
+        if not isinstance(other, dtype):
+            return False
+        return self.name == other.name
+
+    def __hash__(self):
+        return hash((self.name, ))
+
+    @property
+    def scalar(self):
+        """The scalar type of this type; a plain dtype is its own scalar."""
+        return self
+
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        out.append(self.to_ir(builder))
+
+    def to_ir(self, builder: ir.builder) -> ir.type:
+        """Return the IR type for this dtype, obtained from ``builder``.
+
+        Signed and unsigned integers of the same width map to the same IR type.
+
+        :raises ValueError: for an fp8 type missing from
+            ``builder.options.supported_fp8_dtypes`` (checked only when the
+            builder has an ``options`` attribute), or for a name with no
+            IR mapping.
+        """
+        if self.name.startswith("fp8"):
+            if hasattr(builder, "options") and self.name not in builder.options.supported_fp8_dtypes:
+                raise ValueError(f'type {self} not supported in this architecture. '
+                                 f'The supported fp8 dtypes are {builder.options.supported_fp8_dtypes}')
+
+        if self.name == 'void':
+            return builder.get_void_ty()
+        elif self.name == 'int1':
+            return builder.get_int1_ty()
+        elif self.name in ('int8', 'uint8'):
+            return builder.get_int8_ty()
+        elif self.name in ('int16', 'uint16'):
+            return builder.get_int16_ty()
+        elif self.name in ('int32', 'uint32'):
+            return builder.get_int32_ty()
+        elif self.name in ('int64', 'uint64'):
+            return builder.get_int64_ty()
+        elif self.name == 'fp8e5':
+            return builder.get_fp8e5_ty()
+        elif self.name == 'fp8e5b16':
+            return builder.get_fp8e5b16_ty()
+        elif self.name == 'fp8e4nv':
+            return builder.get_fp8e4nv_ty()
+        elif self.name == 'fp8e4b8':
+            return builder.get_fp8e4b8_ty()
+        elif self.name == 'fp8e4b15':
+            return builder.get_fp8e4b15_ty()
+        elif self.name == 'fp16':
+            return builder.get_half_ty()
+        elif self.name == 'bf16':
+            return builder.get_bf16_ty()
+        elif self.name == 'fp32':
+            return builder.get_float_ty()
+        elif self.name == 'fp64':
+            return builder.get_double_ty()
+        raise ValueError(f'fail to convert {self} to ir type')
+
+    def __str__(self):
+        return self.name
+
+    def codegen_name(self):
+        """Return the long spelling of the name: ``fpN`` -> ``floatN``, ``bfN`` -> ``bfloatN``."""
+        if self.name.startswith("fp"):
+            return "float" + self.name[2:]
+        elif self.name.startswith("bf"):
+            return "bfloat" + self.name[2:]
+        else:
+            return self.name
+
+    @property
+    def cache_key_part(self) -> str:
+        """See cache_key_part() in triton.cc."""
+        return self.name
+
+    def __repr__(self):
+        """Output of repr needs to be an evaluatable expression"""
+        return f'triton.language.{self.codegen_name()}'
+
+    def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[base_value, int]:
+        # A scalar consumes exactly one handle.
+        return tensor(handles[cursor], self), cursor + 1
+
+    def mangle(self) -> str:
+        """Return the mangled name of the type.
+
+        Integers give ``i<bits>`` / ``u<bits>``, floating-point types give
+        their name and ``void`` gives ``'V'``.  Any other name falls through
+        to ``base_type.mangle``, which raises ``NotImplementedError``.
+        """
+        if self.is_int():
+            SIGNED = dtype.SIGNEDNESS.SIGNED
+            prefix = 'i' if self.int_signedness == SIGNED else 'u'
+            return prefix + str(self.int_bitwidth)
+        if self.is_floating():
+            return str(self)
+        # `is_void()` is a stub that always raises, so calling it here made
+        # the 'V' result and the fallback below unreachable.  Compare the
+        # name directly instead.
+        if self.name == 'void':
+            return 'V'
+        return super().mangle()
+
+    def with_element_ty(self, element_ty: dtype):
+        """Return ``element_ty``; for a non-block type the element type is the whole type."""
+        assert not self.is_block()
+        return element_ty
+
+
+# Some functions have a param named `dtype`, which shadows the `dtype` class.
+# We can't change the param name because it is part of function's public API.
+# Declare an alias so those functions can still reference the dtype class.
+_DtypeClass = dtype
+
+
+class pointer_type(dtype):
+    """Pointer to values of ``element_ty`` in a given address space, optionally const.
+
+    ``dtype.__init__`` is not invoked; only ``element_ty``, ``address_space``,
+    ``const`` and ``name`` are set.
+
+    NOTE(review): ``__eq__`` is overridden without ``__hash__``, which in
+    Python makes instances unhashable even though the base ``dtype`` is
+    hashable -- confirm this is intended.
+    """
+
+    def __init__(self, element_ty: dtype, address_space: int = 1, const: bool = False):
+        element_ty = _unwrap_if_constexpr(element_ty)
+        if not isinstance(element_ty, dtype):
+            raise TypeError(f'element_ty has type `{type(element_ty).__name__}`; expected `dtype`.')
+        self.element_ty = element_ty
+        self.address_space = address_space
+        self.const = const
+        if const:
+            self.name = f'const_pointer<{element_ty}>'
+        else:
+            self.name = f'pointer<{element_ty}>'
+
+    def to_ir(self, builder: ir.builder) -> ir.pointer_type:
+        pointee_ir = self.element_ty.to_ir(builder)
+        return builder.get_ptr_ty(pointee_ir, self.address_space)
+
+    def __str__(self):
+        return self.name
+
+    def __repr__(self):
+        return self.__str__()
+
+    def is_ptr(self):
+        return True
+
+    def is_const(self):
+        return self.const
+
+    def __eq__(self, other) -> bool:
+        other = _unwrap_if_constexpr(other)
+        if isinstance(other, pointer_type):
+            # Pointee, address space and constness must all agree.
+            return (self.element_ty == other.element_ty and self.address_space == other.address_space
+                    and self.const == other.const)
+        return False
+
+    @property
+    def scalar(self):
+        # A pointer is its own scalar type.
+        return self
+
+    def mangle(self) -> str:
+        # "P" followed by the mangled pointee; constness and address space are not encoded.
+        pointee = self.element_ty.mangle()
+        return f"P{pointee}"
+
+
+class block_type(dtype):
+    """Type of a block of ``element_ty`` values with a fixed shape.
+
+    ``dtype.__init__`` is not invoked; the constructor sets ``element_ty``,
+    ``shape``, ``numel`` and ``name``.
+    """
+
+    def __init__(self, element_ty: dtype, shape: List):
+        self.element_ty = element_ty
+
+        # Note that block_type's shape is a list of int
+        # while tensor's shape is a list of constexpr.
+        assert (isinstance(shape, (list, tuple)))
+
+        # An empty shape (a 0D tensor) is rejected below.
+        unwrapped_shape = tuple(_unwrap_shape(shape))
+        self.shape = unwrapped_shape
+        if not unwrapped_shape:
+            raise TypeError('0d block_type is forbidden')
+
+        self.numel = validate_block_shape(self.shape)
+        self.name = f'<{self.shape}, {self.element_ty}>'
+
+    def to_ir(self, builder: ir.builder) -> ir.block_type:
+        element_ir = self.element_ty.to_ir(builder)
+        return builder.get_block_ty(element_ir, self.shape)
+
+    def __str__(self):
+        return self.name
+
+    def __repr__(self):
+        return self.__str__()
+
+    def is_block(self):
+        return True
+
+    def get_block_shapes(self) -> Tuple[int]:
+        return self.shape
+
+    def with_element_ty(self, scalar_ty: dtype) -> block_type:
+        """Return a new block type of the same shape holding ``scalar_ty`` elements."""
+        return block_type(scalar_ty, self.shape)
+
+    def __eq__(self, other) -> bool:
+        if isinstance(other, block_type):
+            return self.element_ty == other.element_ty and self.shape == other.shape
+        return False
+
+    @property
+    def scalar(self):
+        # The scalar type of a block is its element type.
+        return self.element_ty
+
+    @property
+    def nbytes(self):
+        """``numel`` times the whole number of bytes per element (``primitive_bitwidth // 8``)."""
+        bytes_per_element = self.element_ty.primitive_bitwidth // 8
+        return self.numel * bytes_per_element
+
+    def mangle(self) -> str:
+        # <mangled element>S<d0>_<d1>_...S
+        elt = self.scalar.mangle()
+        shape = '_'.join(str(dim) for dim in self.shape)
+        return f'{elt}S{shape}S'
+
+
+class tuple_type(base_type):
+    """Type of a tuple: a sequence of member types plus one field name per member.
+
+    When ``fields`` is omitted (or empty) every member gets the empty string
+    as its field name.
+    """
+
+    def __init__(self, types, fields=None):
+        self.types = types
+        if not fields:
+            fields = [''] * len(types)
+        self.fields = fields
+        labelled = [f"{k}:{v}" for k, v in zip(self.fields, self.types)]
+        self.name = '[' + ','.join(labelled) + ']'
+
+    def __str__(self):
+        return self.name
+
+    def __iter__(self):
+        return iter(self.types)
+
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]):
+        for member in self.types:
+            # constexpr members contribute no IR types.
+            if isinstance(member, constexpr):
+                continue
+            member._flatten_ir_types(builder, out)
+
+    def __getitem__(self, index: int) -> dtype:
+        return self.types[index]
+
+    def __eq__(self, other):
+        return type(self) is type(other) and self.types == other.types and self.fields == other.fields
+
+    def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[tuple, int]:
+        # Each member consumes handles starting at the cursor left by the previous one.
+        members = []
+        for member_ty in self.types:
+            member, cursor = member_ty._unflatten_ir(handles, cursor)
+            members.append(member)
+        return tuple(members, self), cursor
+
+    def mangle(self):
+        # T<member>_<member>_...T
+        inner = '_'.join(ty.mangle() for ty in self.types)
+        return 'T' + inner + 'T'
+
+
+class slice_type(dtype):
+    """dtype subclass whose only state is the fixed name ``'slice_type'``.
+
+    ``dtype.__init__`` is not invoked, so none of the bit-width attributes exist.
+    """
+
+    def __init__(self):
+        fixed_name = 'slice_type'
+        self.name = fixed_name
+
+
+# scalar types
+# One module-level dtype instance per accepted scalar type name.  The Python
+# identifiers use the long spelling (float16, bfloat16, ...) while the dtype
+# names use the short one (fp16, bf16, ...).
+void = dtype('void')
+int1 = dtype('int1')
+int8 = dtype('int8')
+int16 = dtype('int16')
+int32 = dtype('int32')
+int64 = dtype('int64')
+uint8 = dtype('uint8')
+uint16 = dtype('uint16')
+uint32 = dtype('uint32')
+uint64 = dtype('uint64')
+float8e5 = dtype('fp8e5')
+float8e5b16 = dtype('fp8e5b16')
+float8e4nv = dtype('fp8e4nv')
+float8e4b8 = dtype('fp8e4b8')
+float8e4b15 = dtype('fp8e4b15')
+float16 = dtype('fp16')
+bfloat16 = dtype('bf16')
+float32 = dtype('fp32')
+float64 = dtype('fp64')
+# pointer types
+# Pointer to int32 using pointer_type's defaults (address_space=1, const=False).
+pi32_t = pointer_type(int32)
+
+
+def get_int_dtype(bitwidth: int, signed: bool) -> dtype:
+    """Return the integer dtype with the given bit width and signedness.
+
+    A width of 1 always yields ``int1``; ``signed`` is ignored in that case.
+
+    :raises ValueError: for any width other than 1, 8, 16, 32 or 64.
+    """
+    if bitwidth == 1:
+        return int1
+    # (signed, unsigned) dtype pair for every multi-bit width.
+    by_width = {
+        8: (int8, uint8),
+        16: (int16, uint16),
+        32: (int32, uint32),
+        64: (int64, uint64),
+    }
+    if bitwidth not in by_width:
+        raise ValueError(f'Unsupported bitwidth {bitwidth} and signedness {signed}')
+    signed_ty, unsigned_ty = by_width[bitwidth]
+    # Only the truthiness of ``signed`` matters.
+    return signed_ty if signed else unsigned_ty
+
+
+# -----------------------
+# tensor
+# -----------------------
+
+
+class tensor(base_value):
+    """Represents an N-dimensional array of values or pointers.
+
+    :code:`tensor` is the fundamental data structure in Triton programs.  Most
+    functions in :py:mod:`triton.language` operate on and return tensors.
+
+    Most of the named member functions here are duplicates of the free functions
+    in :code:`triton.language`.  For example, :code:`triton.language.sqrt(x)` is
+    equivalent to :code:`x.sqrt()`.
+
+    :code:`tensor` also defines most of the magic/dunder methods, so you can
+    write :code:`x+y`, :code:`x << 2`, etc.
+
+    .. rubric:: Constructors
+    ..
+       For some reason Sphinx includes __init__ before printing the full table
+       of methods.  Not what I want, but I can't figure out how to fix it.  Give
+       it its own section so it looks intentional. :)
+    """
+
+    def __init__(self, handle, type: dtype):
+        """Not called by user code."""
+        super().__init__()
+        # IR handle
+        self.handle = handle
+        # Block shape; a non-block type yields the empty shape ()
+        self.shape = type.shape if type.is_block() else ()
+        self.numel = constexpr(math.prod(self.shape))  # element count, computed before the dims are wrapped below
+        self.type = type  # Tensor type (can be block_type)
+        # Following the practice in pytorch, dtype is scalar type
+        self.dtype = type.scalar
+        self.shape = tuple([constexpr(s) for s in self.shape])  # every dim re-wrapped as a constexpr
+
+    def _flatten_ir(self, handles: List[ir.value]) -> None:
+        handles.append(self.handle)
+
+    def __str__(self) -> str:
+        # ex. "float32[16, 32]"
+        return str(self.dtype) + '[' + ', '.join(str(s) for s in self.shape) + ']'
+
+    @builtin
+    def __add__(self, other, _semantic=None):
+        return add(self, other, sanitize_overflow=True, _semantic=_semantic)
+
+    @builtin
+    def __radd__(self, other, _semantic=None):
+        return add(other, self, sanitize_overflow=True, _semantic=_semantic)
+
+    @builtin
+    def __sub__(self, other, _semantic=None):
+        return sub(self, other, sanitize_overflow=True, _semantic=_semantic)
+
+    @builtin
+    def __rsub__(self, other, _semantic=None):
+        return sub(other, self, sanitize_overflow=True, _semantic=_semantic)
+
+    @builtin
+    def __mul__(self, other, _semantic=None):
+        return mul(self, other, sanitize_overflow=True, _semantic=_semantic)
+
+    @builtin
+    def __rmul__(self, other, _semantic=None):
+        return mul(other, self, sanitize_overflow=True, _semantic=_semantic)
+
+    @builtin
+    def __truediv__(self, other, _semantic=None):
+        other = _unwrap_if_constexpr(other)
+        return _semantic.truediv(self, other)
+
+    @builtin
+    def __rtruediv__(self, other, _semantic=None):
+        other = _unwrap_if_constexpr(other)
+        return _semantic.truediv(other, self)
+
+    @builtin
+    def __floordiv__(self, other, _semantic=None):
+        other = _unwrap_if_constexpr(other)
+        return _semantic.floordiv(self, other)
+
+    @builtin
+    def __rfloordiv__(self, other, _semantic=None):
+        other = _unwrap_if_constexpr(other)
+        return _semantic.floordiv(other, self)
+
+    @builtin
+    def __mod__(self, other, _semantic=None):
+        other = _unwrap_if_constexpr(other)
+        return _semantic.mod(self, other)
+
+    @builtin
+    def __rmod__(self, other, _semantic=None):
+        other = _unwrap_if_constexpr(other)
+        return _semantic.mod(other, self)
+
+    # unary operators
+    @builtin
+    def __neg__(self, _semantic=None):
+        return _semantic.minus(self)
+
+    @builtin
+    def __invert__(self, _semantic=None):
+        return _semantic.invert(self)
+
+    # bitwise operators
+
+    @builtin
+    def __and__(self, other, _semantic=None):
+        other = _unwrap_if_constexpr(other)
+        return _semantic.and_(self, other)
+
+    @builtin
+    def __rand__(self, other, _semantic=None):
+        other = _unwrap_if_constexpr(other)
+        return _semantic.and_(other, self)
+
+    @builtin
+    def __or__(self, other, _semantic=None):
+        other = _unwrap_if_constexpr(other)
+        return _semantic.or_(self, other)
+
+    @builtin
+    def __ror__(self, other, _semantic=None):
+        other = _unwrap_if_constexpr(other)
+        return _semantic.or_(other, self)
+
+    @builtin
+    def __xor__(self, other, _semantic=None):
+        other = _unwrap_if_constexpr(other)
+        return _semantic.xor_(self, other)
+
+    @builtin
+    def __rxor__(self, other, _semantic=None):
+        other = _unwrap_if_constexpr(other)
+        return _semantic.xor_(other, self)
+
+    @builtin
+    def __lshift__(self, other, _semantic=None):
+        check_bit_width(self, other)
+        other = _unwrap_if_constexpr(other)
+        return _semantic.shl(self, other)
+
+    @builtin
+    def __rlshift__(self, other, _semantic=None):
+        check_bit_width(other, self)
+        other = _unwrap_if_constexpr(other)
+        return _semantic.shl(other, self)
+
+    @builtin
+    def __rshift__(self, other, _semantic=None):
+        check_bit_width(self, other)
+        other = _unwrap_if_constexpr(other)
+        if self.dtype.is_int_signed():
+            return _semantic.ashr(self, other)
+        else:
+            return _semantic.lshr(self, other)
+
+    @builtin
+    def __rrshift__(self, other, _semantic=None):
+        check_bit_width(other, self)
+        other = _unwrap_if_constexpr(other)
+        if self.dtype.is_int_signed():  # NOTE(review): self is the shift amount here and other the value shifted -- confirm whose signedness should pick ashr/lshr
+            return _semantic.ashr(other, self)
+        else:
+            return _semantic.lshr(other, self)
+
+    # >
+    @builtin
+    def __gt__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.greater_than(self, other)
+
+    @builtin
+    def __rgt__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.greater_than(other, self)
+
+    # >=
+    @builtin
+    def __ge__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.greater_equal(self, other)
+
+    @builtin
+    def __rge__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.greater_equal(other, self)
+
+    # <
+    @builtin
+    def __lt__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.less_than(self, other)
+
+    @builtin
+    def __rlt__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.less_than(other, self)
+
+    # <=
+    @builtin
+    def __le__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.less_equal(self, other)
+
+    @builtin
+    def __rle__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.less_equal(other, self)
+
+    # ==
+    @builtin
+    def __eq__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.equal(self, other)
+
+    @builtin
+    def __req__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.equal(other, self)
+
+    @builtin
+    def __ne__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.not_equal(self, other)
+
+    @builtin
+    def __rne__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.not_equal(other, self)
+
+    @builtin
+    def logical_and(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.logical_and(self, other)
+
+    @builtin
+    def logical_or(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.logical_or(self, other)
+
+    # note: __not__ isn't actually a magic method in python
+    # but it's ok because our ASTVisitor handles it
+    @builtin
+    def __not__(self, _semantic=None):
+        return _semantic.not_(self)
+
+    @builtin
+    def __getitem__(self, slices, _semantic=None):
+        if isinstance(slices, (builtins.slice, slice, constexpr)) or slices is None:
+            slices = [slices]
+        if isinstance(slices, tuple):
+            slices = slices.values
+        ret = self
+        for dim, sl in enumerate(slices):
+            if _unwrap_if_constexpr(sl) is None:
+                ret = _semantic.expand_dims(ret, dim)  # a None index calls expand_dims at position dim
+            elif isinstance(sl, (builtins.slice, slice)) and all(
+                    _unwrap_if_constexpr(arg) is None for arg in (sl.start, sl.stop, sl.step)):
+                pass  # a full slice (`:`) leaves this dimension as it is
+            else:
+                raise ValueError(f"unsupported tensor index: {sl}")
+        return ret
+
+    @property
+    def T(self):
+        """Transposes a 2D tensor."""
+        assert False, "Transposition must be created by the AST Visitor"
+
+    @builtin
+    def to(self, dtype: dtype, fp_downcast_rounding: Optional[str] = None, bitcast: bool = False, _semantic=None):
+        """
+        Alias for :py:func:`tensor.cast`.
+        """
+        return cast(self, dtype, fp_downcast_rounding, bitcast, _semantic=_semantic)
+
+    # Type stubs for functions added by the _tensor_member_fn decorator.
+    # (Unfortunately these can't be created automatically.)
+    #
+    # We couldn't write these definitions out even if we wanted to, because some
+    # of these functions are defined in standard.py.
+    def broadcast_to(self, *shape) -> tensor:
+        ...
+
+    def trans(self, *dims) -> tensor:
+        ...
+
+    def permute(self, *dims) -> tensor:
+        ...
+
+    def split(self) -> tuple[tensor, tensor]:
+        ...
+
+    def view(self, *shape) -> tensor:
+        ...
+
+    def reshape(self, *shape) -> tensor:
+        ...
+
+    def expand_dims(self, axis) -> tensor:
+        ...
+
+    def cast(self, dtype, fp_downcast_rounding=None, bitcast=False) -> tensor:
+        ...
+
+    def store(self, value, mask=None, boundary_check=(), cache_modifier="", eviction_policy="") -> tensor:
+        ...
+
+    def advance(self, offsets) -> tensor:
+        ...
+
+    def atomic_cas(self, cmp, val, sem=None, scope=None) -> tensor:
+        ...
+
+    def atomic_xchg(self, val, mask=None, sem=None, scope=None) -> tensor:
+        ...
+
+    def atomic_add(self, val, mask=None, sem=None, scope=None) -> tensor:
+        ...
+
+    def atomic_max(self, val, mask=None, sem=None, scope=None) -> tensor:
+        ...
+
+    def atomic_min(self, val, mask=None, sem=None, scope=None) -> tensor:
+        ...
+
+    def atomic_and(self, val, mask=None, sem=None, scope=None) -> tensor:
+        ...
+
+    def atomic_or(self, val, mask=None, sem=None, scope=None) -> tensor:
+        ...
+
+    def atomic_xor(self, val, mask=None, sem=None, scope=None) -> tensor:
+        ...
+
+    def exp(self) -> tensor:
+        ...
+
+    def log(self) -> tensor:
+        ...
+
+    def cos(self) -> tensor:
+        ...
+
+    def sin(self) -> tensor:
+        ...
+
+    def sqrt(self) -> tensor:
+        ...
+
+    def rsqrt(self) -> tensor:
+        ...
+
+    def abs(self) -> tensor:
+        ...
+
+    def reduce(self, axis, combine_fn, keep_dims=False) -> tensor:
+        ...
+
+    def associative_scan(self, axis, combine_fn, reverse=False) -> tensor:
+        ...
+
+    def gather(self, indices, axis) -> tensor:
+        ...
+
+    def histogram(self, num_bins) -> tensor:
+        ...
+
+    def cdiv(self, div) -> tensor:
+        ...
+
+    def sigmoid(self) -> tensor:
+        ...
+
+    def softmax(self, dim=None, keep_dims=False, ieee_rounding=False) -> tensor:
+        ...
+
+    def ravel(self) -> tensor:
+        ...
+
+    def max(self, axis=None, return_indices=False, return_indices_tie_break_left=True, keep_dims=False) -> tensor:
+        ...
+
+    def argmax(self, axis, tie_break_left=True, keep_dims=False) -> tensor:
+        ...
+
+    def min(self, axis=None, return_indices=False, return_indices_tie_break_left=True, keep_dims=False) -> tensor:
+        ...
+
+    def argmin(self, axis, tie_break_left=True, keep_dims=False) -> tensor:
+        ...
+
+    def sum(self, axis=None, keep_dims=False, dtype=None) -> tensor:
+        ...
+
+    def xor_sum(self, axis=None, keep_dims=False) -> tensor:
+        ...
+
+    def reduce_or(self, axis=None, keep_dims=False) -> tensor:
+        ...
+
+    def cumsum(self, axis=0, reverse=False) -> tensor:
+        ...
+
+    def cumprod(self, axis=0, reverse=False) -> tensor:
+        ...
+
+    def sort(self, dim: constexpr = None, descending: constexpr = CONSTEXPR_0) -> tensor:
+        ...
+
+    def flip(self, dim=None) -> tensor:
+        ...
+
+def _type_for_tuple_values(values, fields=None):  # int/float/dtype members map to constexpr_type; others contribute their .type
+    return tuple_type([constexpr_type(x) if isinstance(x, (int, float, dtype)) else x.type for x in values], fields)
+
+
+class tuple(base_value):
+    """Triton tuple value: a list of member values plus a matching `tuple_type`."""
+    def __init__(self, args: Sequence, type: Optional[tuple_type] = None):
+        self.values = list(args)
+        if type is None:
+            type = _type_for_tuple_values(self.values)
+        elif not isinstance(type, tuple_type):  # make_template in ASTFunction.deserialize may pass us a list/tuple
+            type = tuple_type(type)
+        self.type = type
+
+    def __getitem__(self, idx: constexpr):
+        if isinstance(idx, int):
+            idx = constexpr(idx)
+        if isinstance(idx, constexpr):
+            return self.values[idx]
+        assert isinstance(idx, (slice, builtins.slice))
+        return tuple(self.values[idx.start:idx.stop:idx.step])
+
+    def __getattr__(self, name):
+        # Named-field fallback; `type` is read from __dict__ so a half-built instance cannot recurse into here.
+        fields = getattr(self.__dict__.get("type"), "fields", ())
+        if name not in fields:
+            raise AttributeError(f"'tuple' object has no attribute '{name}'")
+        return self.values[fields.index(name)]
+
+    def _setitem(self, idx, value):  # TODO: remove
+        idx = _unwrap_if_constexpr(idx)
+        assert isinstance(idx, int)
+        self.values[idx] = value
+        self.type = _type_for_tuple_values(self.values, self.type.fields)
+
+    def __add__(self, other):
+        other = _normalize_tuple(other)
+        return tuple(self.values + other.values)  # concatenation, not element-wise addition
+
+    def __mul__(self, other):
+        assert isinstance(other, constexpr)
+        return tuple(self.values * other.value)
+
+    def __eq__(self, other):
+        other = _normalize_tuple(other)
+        return constexpr(self.values == other.values)
+
+    def __hash__(self):
+        return hash(builtins.tuple(self.values))
+
+    def __str__(self):
+        return str([str(x) for x in self.values])
+
+    def __iter__(self):
+        return iter(self.values)
+
+    def __len__(self):
+        return len(self.values)
+
+    def _flatten_ir(self, handles: List[ir.value]):
+        for v in self.values:
+            v._flatten_ir(handles)
+
+    def __repr__(self):
+        return f"({', '.join(repr(x) for x in self.values)})"
+
+
+class slice:
+    """Triton-side slice value: holds start/stop/step and a ``slice_type`` type tag."""
+
+    def __init__(self, start, stop, step):
+        # The bounds are stored exactly as given; nothing is validated or normalized here.
+        self.start, self.stop, self.step = start, stop, step
+        self.type = slice_type()
+
+
+class tensor_descriptor_base_type(base_type):
+    """Type of a tensor descriptor; equality and mangling depend only on its block type."""
+
+    def __init__(self, block_type: block_type):
+        self.block_type = block_type
+
+    def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[tensor_descriptor_base, int]:
+        return tensor_descriptor_base(handles[cursor], self.block_type), cursor + 1
+
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        is_signed = self.block_type.element_ty.is_int_signed()
+        out.append(builder.create_tensor_descriptor_type(self.block_type.to_ir(builder), is_signed))
+
+    def __str__(self) -> str:
+        # ex. "tensor_descriptor<float32[16, 32]>"
+        return f"tensor_descriptor<{self.block_type}>"
+
+    def __eq__(self, other) -> bool:
+        return type(other) is type(self) and self.block_type == other.block_type
+
+    def __ne__(self, other) -> bool:
+        return not (self == other)
+
+    __neq__ = __ne__  # former (non-special) method name, kept for explicit callers
+
+    def mangle(self) -> str:
+        return f"TD{self.block_type.mangle()}"
+
+
+class tensor_descriptor_base(base_value):
+    """
+    A tensor descriptor with unknown shape and strides
+    """
+
+    def __init__(self, handle, block_type: block_type):
+        """Not called by user code."""
+        super().__init__()
+
+        self.handle = handle  # IR handle
+        self.type = tensor_descriptor_base_type(block_type)  # Tensor type (block_type)
+
+    def _flatten_ir(self, handles: List[ir.value]) -> None:
+        handles.append(self.handle)
+
+    @property
+    def block_type(self):
+        return self.type.block_type
+
+    @property
+    def block_shape(self):
+        return self.type.block_type.shape
+
+    @property
+    def dtype(self):
+        return self.type.block_type.element_ty
+
+    def __str__(self) -> str:
+        return str(self.type)
+
+    @builtin
+    def load(self, offsets: Sequence[constexpr | tensor], _semantic=None) -> tensor:
+        """Load a block from the descriptor starting at the given element offsets.
+
+        Values outside of the tensor bounds will be filled with zeros.
+
+        :note: Offset must be a multiple of 16-bytes
+        """
+        return _semantic.descriptor_load(self, offsets, "", "")
+
+    @builtin
+    def store(self, offsets: Sequence[constexpr | tensor], value: tensor, _semantic=None) -> tensor:
+        """Store ``value`` as a block through the descriptor, starting at the given element offsets.
+
+        Values outside of the tensor bounds will be ignored.
+
+        :note: Offset must be a multiple of 16-bytes
+        """
+        return _semantic.descriptor_store(self, value, offsets)
+
+    @builtin
+    def atomic_add(self, offsets: Sequence[constexpr | tensor], value: tensor, _semantic=None) -> tensor:
+        return _semantic.descriptor_atomic_add(self, value, offsets)
+
+    @builtin
+    def atomic_min(self, offsets: Sequence[constexpr | tensor], value: tensor, _semantic=None) -> tensor:
+        return _semantic.descriptor_atomic_min(self, value, offsets)
+
+    @builtin
+    def atomic_max(self, offsets: Sequence[constexpr | tensor], value: tensor, _semantic=None) -> tensor:
+        return _semantic.descriptor_atomic_max(self, value, offsets)
+
+    @builtin
+    def atomic_and(self, offsets: Sequence[constexpr | tensor], value: tensor, _semantic=None) -> tensor:
+        return _semantic.descriptor_atomic_and(self, value, offsets)
+
+    @builtin
+    def atomic_or(self, offsets: Sequence[constexpr | tensor], value: tensor, _semantic=None) -> tensor:
+        return _semantic.descriptor_atomic_or(self, value, offsets)
+
+    @builtin
+    def atomic_xor(self, offsets: Sequence[constexpr | tensor], value: tensor, _semantic=None) -> tensor:
+        return _semantic.descriptor_atomic_xor(self, value, offsets)
+
+    @builtin
+    def gather(self, *args, _semantic=None) -> tensor:
+        """Gather multiple descriptors worth of data"""
+        assert len(args) == 2, f"descriptor gather only supports 2D indexing, but got {len(args)}"
+        x_offsets = args[0]
+        y_offset = args[1]
+        return _semantic.descriptor_gather(self, x_offsets, y_offset, "", "")
+
+    @builtin
+    def scatter(self, value, *args, _semantic=None) -> tensor:
+        """Scatter multiple descriptors worth of data"""
+        assert len(args) == 2, f"descriptor scatter only supports 2D indexing, but got {len(args)}"
+        x_offsets = args[0]
+        y_offset = args[1]
+        return _semantic.descriptor_scatter(self, value, x_offsets, y_offset)
+
+
+class tensor_descriptor_type(tensor_descriptor_base_type):
+    """Descriptor type that additionally records the tuple types of the global shape and strides."""
+
+    def __init__(self, block_type: block_type, shape_type: tuple_type, strides_type: tuple_type):
+        self.block_type = block_type
+        self.shape_type = shape_type
+        self.strides_type = strides_type
+
+    def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[tensor_descriptor_base, int]:
+        # Handle layout: descriptor handle first, then the shape members, then the stride members.
+        handle = handles[cursor]
+        shape_tuple, cursor = self.shape_type._unflatten_ir(handles, cursor + 1)
+        stride_tuple, cursor = self.strides_type._unflatten_ir(handles, cursor)
+        descriptor = tensor_descriptor(handle, shape_tuple.values, stride_tuple.values, self.block_type)
+        return descriptor, cursor
+
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        # Same order as _unflatten_ir consumes: descriptor first, then shape, then strides.
+        super()._flatten_ir_types(builder, out)
+        for member_types in (self.shape_type, self.strides_type):
+            member_types._flatten_ir_types(builder, out)
+
+    def __eq__(self, other):
+        base_equal = super().__eq__(other)
+        return base_equal and self.shape_type == other.shape_type and self.strides_type == other.strides_type
+
+
+class tensor_descriptor(tensor_descriptor_base):
+    """A descriptor representing a tensor in global memory.
+
+    Unlike its base class, it also carries the tensor's shape and strides.
+    """
+
+    def __init__(self, handle, shape: List[tensor], strides: List[tensor], block_type: block_type):
+        """Not called by user code."""
+        # The base constructor stores the IR handle (and a base type that is replaced below).
+        super().__init__(handle, block_type)
+        # Global shape and strides, wrapped as Triton tuples so each exposes a `.type`.
+        self.shape = tuple(shape)
+        self.strides = tuple(strides)
+        shape_ty, strides_ty = self.shape.type, self.strides.type
+        self.type = tensor_descriptor_type(block_type, shape_type=shape_ty, strides_type=strides_ty)
+
+    def _flatten_ir(self, handles: List[ir.value]) -> None:
+        # Emitted in the order descriptor handle, shape members, stride members.
+        handles.append(self.handle)
+        for component in (self.shape, self.strides):
+            component._flatten_ir(handles)
+
+
+# -----------------------
+# aggregate
+# -----------------------
+
+
+@dataclass(frozen=True)
+class _aggregate_type(base_type):
+    """Triton type shared by all aggregate values.
+
+    ``base_cls`` is the class whose ``_get_instance()`` builds an empty value when
+    unflattening; ``fields`` pairs each attribute name with its Triton type.
+    """
+
+    base_cls: type
+    fields: List[Tuple[str, base_type]]
+
+    def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[ir.value, int]:
+        rebuilt = self.base_cls._get_instance()
+        for attr, attr_ty in self.fields:
+            # Every field consumes its own handles and hands back the advanced cursor.
+            attr_value, cursor = attr_ty._unflatten_ir(handles, cursor)
+            setattr(rebuilt, attr, attr_value)
+        return rebuilt, cursor
+
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        for _, attr_ty in self.fields:
+            attr_ty._flatten_ir_types(builder, out)
+
+    def mangle(self) -> str:
+        mangled = ', '.join(attr_ty.mangle() for _, attr_ty in self.fields)
+        return f"{self.base_cls.__module__}.{self.base_cls.__qualname__}<{mangled}>"
+
+
+def _aggregate(cls):
+    """Wrap the user class ``cls`` in a ``base_value`` subclass and return that wrapper class."""
+    # Define the wrapped Triton value type.
+    class aggregate_value(base_value):
+        __triton_builtin__ = True
+        __triton_aggregate__ = True
+
+        @classmethod
+        def _get_instance(this_cls):
+            return super().__new__(this_cls)
+
+        def __new__(this_cls, *args, _semantic=None, _generator=None, **kwargs):
+            # Call into the user-defined constructor.
+            instance = this_cls._get_instance()
+            extra_kwargs = {}
+            if isinstance(cls.__init__, JITCallable):
+                # raise ValueError(f"{cls.__name__}.__init__ cannot be a @triton.jit function")
+                pass  # a JITCallable constructor is called without the extra _semantic/_generator kwargs
+            else:
+                if "_semantic" in inspect.signature(cls.__init__).parameters:
+                    extra_kwargs["_semantic"] = _semantic
+                if "_generator" in inspect.signature(cls.__init__).parameters:
+                    extra_kwargs["_generator"] = _generator
+            cls.__init__(instance, *args, **extra_kwargs, **kwargs)
+
+            # Require that the user-defined constructor initialized all fields.
+            for name in cls.__annotations__.keys():
+                if not hasattr(instance, name):
+                    raise AttributeError(f"constructor for {cls.__name__} did not initialize attribute '{name}'")
+
+            return instance
+
+        # Only allow setting attributes defined in the class annotations.
+        def __setattr__(self, name, value):
+            if name not in cls.__annotations__:
+                raise AttributeError(f"{cls.__name__} has no attribute '{name}'")
+            if not isinstance(value, cls.__annotations__[name]):
+                raise TypeError(f"Expected {cls.__annotations__[name]} for attribute '{name}', got {type(value)}")
+            super().__setattr__(name, value)
+
+        def _flatten_ir(self, handles: List[ir.value]) -> None:
+            for name in cls.__annotations__.keys():
+                getattr(self, name)._flatten_ir(handles)
+
+        @property
+        def type(self):
+            return _aggregate_type(aggregate_value,
+                                   [(name, getattr(self, name).type) for name in cls.__annotations__.keys()])
+
+    hash_attrs = [cls.__init__]  # stored on the wrapper as `hash_attrs` below; __init__ comes first
+
+    for (name, member) in inspect.getmembers(cls):
+        if inspect.isfunction(member) or inspect.ismethod(member) or isinstance(member, JITCallable):
+            if name != "__init__":  # __init__ is not copied over; __new__ above calls it explicitly
+                setattr(aggregate_value, name, member)
+                hash_attrs.append(member)
+
+    aggregate_value.hash_attrs = hash_attrs
+    aggregate_value.__name__ = cls.__name__
+    aggregate_value.__module__ = cls.__module__
+    aggregate_value.__qualname__ = cls.__qualname__
+    aggregate_value.__doc__ = cls.__doc__
+
+    return aggregate_value
+
+
+# -----------------------
+# SPMD Programming Model
+# -----------------------
+
+
+@builtin
+def program_id(axis, _semantic=None):
+    """
+    Returns the id of the current program instance along the given :code:`axis`.
+
+    :param axis: The axis of the 3D launch grid. Must be 0, 1 or 2.
+    :type axis: int
+    """
+    # Not enabled; sketch of a linearized id that `axis == -1` could return:
+    #     pid0 = _semantic.program_id(0)
+    #     pid1 = _semantic.program_id(1)
+    #     pid2 = _semantic.program_id(2)
+    #     npg0 = _semantic.num_programs(0)
+    #     npg1 = _semantic.num_programs(1)
+    #     return pid0 + pid1*npg0 + pid2*npg0*npg1
+    axis = _unwrap_if_constexpr(axis)  # the semantic layer receives the unwrapped axis
+    return _semantic.program_id(axis)
+
+
+@builtin
+def num_programs(axis, _semantic=None):
+    """
+    Returns the number of program instances launched along the given :code:`axis`.
+
+    :param axis: The axis of the 3D launch grid. Must be 0, 1 or 2.
+    :type axis: int
+    """
+    # The axis may arrive wrapped in a constexpr; the semantic layer gets the unwrapped value.
+    return _semantic.num_programs(_unwrap_if_constexpr(axis))
+
+
+# -----------------------
+# Block Initialization
+# -----------------------
+
+
+@builtin
+def arange(start, end, _semantic=None):
+    # The docstring is attached below so that it can interpolate TRITON_MAX_TENSOR_NUMEL.
+    first, last = _unwrap_if_constexpr(start), _unwrap_if_constexpr(end)
+    return _semantic.arange(first, last)
+
+
+arange.__doc__ = f"""
+    Returns contiguous values within the half-open interval :code:`[start,
+    end)`.  :code:`end - start` must be less than or equal to
+    :code:`TRITON_MAX_TENSOR_NUMEL = {TRITON_MAX_TENSOR_NUMEL}`
+
+    :param start: Start of the interval. Must be a power of two.
+    :type start: int32
+    :param end: End of the interval. Must be a power of two greater than
+        :code:`start`.
+    :type end: int32
+"""
+
+
+def _unwrap_shape(shape):
+    dims = _unwrap_if_constexpr(shape)  # the container itself may be a constexpr
+    return [_unwrap_if_constexpr(dim) for dim in dims]
+
+
+def _shape_check_impl(shape):
+    dims = _unwrap_shape(shape)
+    validate_block_shape(dims)  # called for its checks only; its return value is not used
+    return dims
+
+
+@builtin
+def full(shape, value, dtype, _semantic=None):
+    """
+    Returns a tensor filled with the scalar value for the given :code:`shape` and :code:`dtype`.
+
+    :param shape: Shape of the new array, e.g., (8, 16) or (8, )
+    :type shape: tuple of ints
+    :param value: A scalar value to fill the array with
+    :type value: scalar
+    :param dtype: Data type of the new array, e.g., :code:`tl.float16`
+    :type dtype: tl.dtype
+    """
+    checked_shape = _shape_check_impl(shape)
+    # value and dtype may each arrive wrapped in a constexpr
+    fill, elem_ty = _unwrap_if_constexpr(value), _unwrap_if_constexpr(dtype)
+    return _semantic.full(checked_shape, fill, elem_ty)
+
+
+# -----------------------
+# Shape Manipulation
+# -----------------------
+
+
+@builtin
+def broadcast(input, other, _semantic=None):
+    """
+    Tries to broadcast the two given blocks to a common compatible shape.
+
+    :param input: The first input tensor.
+    :type input: Block
+    :param other: The second input tensor.
+    :type other: Block
+    """
+    return _semantic.broadcast_impl_value(input, other)  # thin wrapper: the work is done by the semantic layer
+
+
+@_tensor_member_fn
+@builtin
+def broadcast_to(input, *shape, _semantic=None):
+    """
+    Tries to broadcast the given tensor to a new :code:`shape`.
+
+    :param input: The input tensor.
+    :type input: Block
+    :param shape: The desired shape.
+    :type shape: tuple of ints, or the ints as individual arguments
+
+    :code:`shape` can be passed as a tuple or as individual parameters: ::
+
+        # These are equivalent
+        broadcast_to(x, (32, 32))
+        broadcast_to(x, 32, 32)
+    """
+    shape = _shape_check_impl(_unwrap_iterable(shape))  # unwrap the varargs, then validate as a block shape
+    return _semantic.broadcast_impl_shape(input, shape)
+
+
+@_tensor_member_fn
+@builtin
+def trans(input: tensor, *dims, _semantic=None):
+    """
+    Permutes the dimensions of a tensor.
+
+    If the parameter :code:`dims` is not specified, the function defaults to
+    swapping the last two axes, thereby performing an (optionally batched)
+    2D transpose.
+
+    :param input: The input tensor.
+    :param dims: The desired ordering of dimensions.  For example,
+        :code:`(2, 1, 0)` reverses the order dims in a 3D tensor.
+
+    :code:`dims` can be passed as a tuple or as individual parameters: ::
+
+        # These are equivalent
+        trans(x, (2, 1, 0))
+        trans(x, 2, 1, 0)
+
+    :py:func:`permute` is equivalent to this function, except it doesn't
+    have the special case when no permutation is specified.
+    """
+    dims = _unwrap_iterable(dims)
+    if not dims:
+        n = len(input.shape)
+        if n < 2:
+            raise ValueError("tl.trans invoked with a 0- or 1-dimensional tensor")
+        dims = list(builtins.range(n - 2)) + [n - 1, n - 2]  # leading axes keep their order; the last two swap
+    return _semantic.permute(input, dims)
+
+
+@_tensor_member_fn
+@builtin
+def permute(input, *dims, _semantic=None):
+    """
+    Permutes the dimensions of a tensor.
+
+    :param input: The input tensor.
+    :type input: Block
+    :param dims: The desired ordering of dimensions.  For example,
+        :code:`(2, 1, 0)` reverses the order dims in a 3D tensor.
+
+    :code:`dims` can be passed as a tuple or as individual parameters: ::
+
+        # These are equivalent
+        permute(x, (2, 1, 0))
+        permute(x, 2, 1, 0)
+
+    :py:func:`trans` is equivalent to this function, except when
+    :code:`dims` is empty, it tries to swap the last two axes.
+    """
+    dims = _unwrap_iterable(dims)  # passed on as-is: no default ordering is filled in here
+    return _semantic.permute(input, dims)
+
+
+@builtin
+def cat(input, other, can_reorder=False, _semantic=None):
+    """
+    Concatenate the given blocks
+
+    :param input: The first input tensor.
+    :type input: Tensor
+    :param other: The second input tensor.
+    :type other: Tensor
+    :param reorder: Compiler hint. If true, the compiler is
+        allowed to reorder elements while concatenating inputs.  Only use if the
+        order does not matter (e.g., result is only used in reduction ops).
+        Current implementation of `cat` supports only can_reorder=True.
+    """
+    return _semantic.cat(input, other, can_reorder)
+
+
+@builtin
+def join(a, b, _semantic=None):
+    """
+    Join the given tensors in a new, minor dimension.
+
+    For example, given two tensors of shape (4,8), produces a new tensor of
+    shape (4,8,2).  Given two scalars, returns a tensor of shape (2).
+
+    The two inputs are broadcasted to be the same shape.
+
+    If you want to join more than two elements, you can use multiple calls to
+    this function.  This reflects the constraint in Triton that tensors must
+    have power-of-two sizes.
+
+    join is the inverse of split.
+
+    :param a: The first input tensor.
+    :type a: Tensor
+    :param b: The second input tensor.
+    :type b: Tensor
+    """
+    return _semantic.join(a, b)
+
+
+def _unsplat(x, _semantic=None, _generator=None):
+    """
+    Convert a single-element tensor to a scalar.
+    """
+    if len(x.shape) == 0:
+        return x
+    numel = 1
+    for d in x.shape:
+        numel *= d
+    assert numel == 1, "can only unsplat single-element tensors"
+    return _semantic.unsplat(x)
+
+
+@_tensor_member_fn
+@builtin
+def split(a, _semantic=None, _generator=None) -> tuple[tensor, tensor]:
+    """
+    Split a tensor in two along its last dim, which must have size 2.
+
+    For example, given a tensor of shape (4,8,2), produces two tensors of shape
+    (4,8).  Given a tensor of shape (2), returns two scalars.
+
+    If you want to split into more than two pieces, you can use multiple calls
+    to this function (probably plus calling reshape).  This reflects the
+    constraint in Triton that tensors must have power-of-two sizes.
+
+    split is the inverse of join.
+
+    :param a: The tensor to split.
+    :type a: Tensor
+    """
+    # If len(a.shape) == 1, i.e. a.shape == [2], we should return two scalars.
+    # But _semantic.split can only handle returning tensors.  Work around this by
+    # expanding the input to shape [1,2] and then reducing the result.
+    was_rank_1 = len(a.shape) == 1
+    if was_rank_1:
+        a = _semantic.expand_dims(a, 0)
+
+    out_lhs, out_rhs = _semantic.split(a)
+
+    if was_rank_1:
+        # Currently `reduce` is the best way to convert a tensor of shape [1] to a scalar.
+        out_lhs = _unsplat(out_lhs, _semantic=_semantic, _generator=_generator)
+        out_rhs = _unsplat(out_rhs, _semantic=_semantic, _generator=_generator)
+
+    return out_lhs, out_rhs
+
+
+@_tensor_member_fn
+@builtin
+def view(input, *shape, _semantic=None):
+    """
+    Returns a tensor with the same elements as `input` but a different shape.
+    The order of the elements may not be preserved.
+
+    :param input: The input tensor.
+    :type input: Block
+    :param shape: The desired shape.
+
+    :code:`shape` can be passed as a tuple or as individual parameters: ::
+
+        # These are equivalent
+        view(x, (32, 32))
+        view(x, 32, 32)
+    """
+    warn("view is deprecated, please use reshape with can_reorder being true.")
+    shape = _shape_check_impl(_unwrap_iterable(shape))
+    return _semantic.reshape(input, shape, can_reorder=True)
+
+
+@_tensor_member_fn
+@builtin
+def item(input, _semantic=None, _generator=None):
+    """
+    Converts a single-element tensor into a scalar.
+    """
+    return _unsplat(input, _semantic=_semantic, _generator=_generator)
+
+
+@_tensor_member_fn
+@builtin
+def reshape(input, *shape, can_reorder=False, _semantic=None, _generator=None):
+    """
+    Returns a tensor with the same number of elements as input but with the
+    provided shape.
+
+    :param input: The input tensor.
+    :type input: Block
+    :param shape: The new shape.
+
+    :code:`shape` can be passed as a tuple or as individual parameters: ::
+
+        # These are equivalent
+        reshape(x, (32, 32))
+        reshape(x, 32, 32)
+    """
+    shape = _shape_check_impl(_unwrap_iterable(shape))
+    if len(shape) == 0:
+        return _unsplat(input, _semantic=_semantic, _generator=_generator)
+    return _semantic.reshape(input, shape, can_reorder)
+
+
+def _wrap_axis(axis, ndim):
+    if not (-ndim <= axis < ndim):
+        raise ValueError(f"invalid axis {axis}. Expected {-ndim} <= axis < {ndim}")
+
+    return axis if axis >= 0 else axis + ndim
+
+
+@_tensor_member_fn
+@builtin
+def expand_dims(input, axis, _semantic=None):
+    """
+    Expand the shape of a tensor, by inserting new length-1 dimensions.
+
+    Axis indices are with respect to the resulting tensor, so
+    ``result.shape[axis]`` will be 1 for each axis.
+
+    :param input: The input tensor.
+    :type input: tl.tensor
+    :param axis: The indices to add new axes
+    :type axis: int | Sequence[int]
+
+    """
+    input = _semantic.to_tensor(input)
+    axis = _unwrap_if_constexpr(axis)
+    axes = list(axis) if isinstance(axis, (Sequence, tuple)) else [axis]
+    new_ndim = len(input.shape) + len(axes)
+    axes = [_wrap_axis(_unwrap_if_constexpr(d), new_ndim) for d in axes]
+
+    if len(set(axes)) != len(axes):
+        raise ValueError(f"expand_dims received duplicate axes, normalized axes = {axes}")
+
+    ret = input
+    for a in sorted(axes):
+        ret = _semantic.expand_dims(ret, a)
+    return ret
+
+
+@_tensor_member_fn
+@builtin
+def cast(input, dtype: dtype, fp_downcast_rounding: Optional[str] = None, bitcast: bool = False, _semantic=None):
+    """
+    Casts a tensor to the given :code:`dtype`.
+
+    :param dtype: The target data type.
+    :type dtype: tl.dtype
+    :param fp_downcast_rounding: The rounding mode for downcasting
+        floating-point values. This parameter is only used when self is a
+        floating-point tensor and dtype is a floating-point type with a
+        smaller bitwidth. Supported values are :code:`"rtne"` (round to
+        nearest, ties to even) and :code:`"rtz"` (round towards zero).
+    :type fp_downcast_rounding: str, optional
+    :param bitcast: If true, the tensor is bitcasted to the given
+        :code:`dtype`, instead of being numerically casted.
+    :type bitcast: bool, optional
+    """
+    input = _semantic.to_tensor(input)
+    dtype = _unwrap_if_constexpr(dtype)
+    fp_downcast_rounding = _unwrap_if_constexpr(fp_downcast_rounding)
+    bitcast = _unwrap_if_constexpr(bitcast)
+    if bitcast:
+        return _semantic.bitcast(input, dtype)
+    return _semantic.cast(input, dtype, fp_downcast_rounding)
+
+
+# -----------------------
+# Linear Algebra
+# -----------------------
+
+
+@builtin
+def dot(input, other, acc=None, input_precision=None, allow_tf32=None, max_num_imprecise_acc=None, out_dtype=float32,
+        _semantic=None):
+    """
+    Returns the matrix product of two blocks.
+
+    The two blocks must both be two-dimensional or three-dimensional and have compatible inner dimensions.
+    For three-dimensional blocks, `tl.dot` performs the batched matrix product,
+    where the first dimension of each block represents the batch dimension.
+
+    Inputs with more than one leading batch dimension (result rank >= 4) are
+    reshaped so that all leading dimensions form a single batch dimension
+    before the product is computed, and the result is reshaped back.
+
+    :param input: The first tensor to be multiplied.
+    :type input: 2D or 3D tensor of scalar-type in {:code:`int8`, :code:`float8_e5m2`, :code:`float16`, :code:`bfloat16`, :code:`float32`}
+    :param other: The second tensor to be multiplied.
+    :type other: 2D or 3D tensor of scalar-type in {:code:`int8`, :code:`float8_e5m2`, :code:`float16`, :code:`bfloat16`, :code:`float32`}
+    :param acc: The accumulator tensor. If not None, the result is added to this tensor.
+    :type acc: 2D or 3D tensor of scalar-type in {:code:`float16`, :code:`float32`, :code:`int32`}
+    :param input_precision: How to exercise the Tensor Cores for f32 x f32. If
+      the device does not have Tensor Cores or the inputs are not of dtype f32,
+      this option is ignored. For devices that do have tensor cores, the
+      default precision is tf32.
+    :type input_precision: string. Available options for nvidia: :code:`"tf32"`, :code:`"tf32x3"`, :code:`"ieee"`. Default: :code:`"tf32"`. Available options for amd: :code:`"ieee"`, (CDNA3 only) :code:`"tf32"`.
+    :param allow_tf32: *Deprecated.* If true, input_precision is set to "tf32".
+      Only one of :code:`input_precision` and :code:`allow_tf32` can be
+      specified (i.e. at least one must be :code:`None`).
+    :param max_num_imprecise_acc: Unwrapped from constexpr and forwarded
+      unchanged to the semantic layer.
+    :param out_dtype: Unwrapped from constexpr and forwarded to the semantic
+      layer; defaults to :code:`float32`.
+    """
+    assert input_precision is None or allow_tf32 is None, "Only one of input_precision and allow_tf32 can be specified"
+    if input_precision is None:
+        # No explicit precision: the fp32_default knob wins when set; otherwise
+        # use "tf32" if the backend allows it and allow_tf32 was not set to a
+        # false value, else fall back to "ieee".
+        supports_tf32 = "tf32" in _semantic.builder.options.allowed_dot_input_precisions
+        input_precision = knobs.language.fp32_default or ("tf32" if (supports_tf32 and
+                                                                     (allow_tf32 or allow_tf32 is None)) else "ieee")
+
+    input_precision = _unwrap_if_constexpr(input_precision)
+    out_dtype = _unwrap_if_constexpr(out_dtype)
+    max_num_imprecise_acc = _unwrap_if_constexpr(max_num_imprecise_acc)
+    acc = _unwrap_if_constexpr(acc)
+
+    # check shapes make sense:
+    a_shape = list(input.shape)
+    b_shape = list(other.shape)
+    assert len(a_shape) == len(b_shape) >= 2, "input and other must have equal ranks >= 2"
+    assert a_shape[:-2] == b_shape[:-2], "input and other must have equal batch shapes"
+    assert a_shape[-1] == b_shape[-2], "input and other must have equal reduction dimensions"
+
+    # compute shape of accumulator:
+    c_shape = a_shape[:-1] + [b_shape[-1]]
+    if acc is not None:
+        assert list(acc.shape) == c_shape, "accumulator shape is incompatible"
+    rank = len(c_shape)
+
+    if rank >= 4:
+        # Collapse every leading (batch) dimension into a single one so the
+        # operands passed to _semantic.dot are three-dimensional.
+        batch_size = 1
+        for i in builtins.range(rank - 2):
+            batch_size *= c_shape[i]
+        input = _semantic.reshape(input, [batch_size] + a_shape[-2:], can_reorder=False)
+        other = _semantic.reshape(other, [batch_size] + b_shape[-2:], can_reorder=False)
+        if acc is not None:
+            acc = _semantic.reshape(acc, [batch_size] + c_shape[-2:], can_reorder=False)
+
+    res = _semantic.dot(input, other, acc, input_precision, max_num_imprecise_acc, out_dtype)
+
+    if rank >= 4:
+        # Restore the original batch dimensions on the result.
+        res = _semantic.reshape(res, c_shape, can_reorder=False)
+
+    assert list(res.shape) == c_shape, "output shape is unexpected"
+    return res
+
+
+@builtin
+def dot_scaled(lhs, lhs_scale, lhs_format, rhs, rhs_scale, rhs_format, acc=None, fast_math=False, lhs_k_pack=True,
+               rhs_k_pack=True, out_dtype=float32, _semantic=None):
+    """
+    Returns the matrix product of two blocks in microscaling format.
+
+    lhs and rhs use microscaling formats described here:
+    https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+
+    Software emulation enables targeting hardware architectures without native microscaling
+    operation support. Right now for such case, microscaled lhs/rhs are upcasted to
+    :code:`bf16` element type beforehand for dot computation, with one exception:
+    for AMD CDNA3 specifically, if one of the inputs is of :code:`fp16` element type,
+    the other input is also upcasted to :code:`fp16` element type instead.
+    This behavior is experimental and may be subject to change in the future.
+
+    :param lhs: The first tensor to be multiplied.
+    :type lhs: 2D tensor representing fp4, fp8 or bf16 elements. Fp4 elements are packed into uint8 inputs with the first element in lower bits. Fp8 are stored as uint8 or the corresponding fp8 type.
+    :param lhs_scale: Scale factor for lhs tensor. Shape should be [M, K//group_size] when lhs is [M, K], where group_size is 32 if scales type are `e8m0`.
+    :type lhs_scale: e8m0 type represented as an uint8 tensor, or None.
+    :param lhs_format: format of the lhs tensor. Available formats: {:code:`e2m1`, :code:`e4m3`, :code:`e5m2`, :code:`bf16`, :code:`fp16`}.
+    :type lhs_format: str
+    :param rhs: The second tensor to be multiplied.
+    :type rhs: 2D tensor representing fp4, fp8 or bf16 elements. Fp4 elements are packed into uint8 inputs with the first element in lower bits. Fp8 are stored as uint8 or the corresponding fp8 type.
+    :param rhs_scale: Scale factor for rhs tensor. Shape should be [N, K//group_size] where rhs is [K, N].
+                      Important: Do NOT transpose rhs_scale
+    :type rhs_scale: e8m0 type represented as an uint8 tensor, or None.
+    :param rhs_format: format of the rhs tensor. Available formats: {:code:`e2m1`, :code:`e4m3`, :code:`e5m2`, :code:`bf16`, :code:`fp16`}.
+    :type rhs_format: str
+    :param acc: The accumulator tensor. If not None, the result is added to this tensor.
+    :param lhs_k_pack: If false, the lhs tensor is packed into uint8 along M dimension.
+    :type lhs_k_pack: bool, optional
+    :param rhs_k_pack: If false, the rhs tensor is packed into uint8 along N dimension.
+    :type rhs_k_pack: bool, optional
+    """
+    out_dtype = _unwrap_if_constexpr(out_dtype)
+    acc = _unwrap_if_constexpr(acc)
+    assert out_dtype == float32, "Only float32 is supported for out_dtype at the moment"
+    return _semantic.dot_scaled(lhs, lhs_scale, lhs_format, rhs, rhs_scale, rhs_format, acc, fast_math, lhs_k_pack,
+                                rhs_k_pack, out_dtype)
+
+
+# -----------------------
+# Non-Atomic Memory Operations
+# -----------------------
+
+
+@builtin
+def load(pointer, mask=None, other=None, boundary_check=(), padding_option="", cache_modifier="", eviction_policy="",
+         volatile=False, _semantic=None):
+    """
+    Return a tensor of data whose values are loaded from memory at location defined by `pointer`:
+
+        (1) If `pointer` is a single element pointer, a scalar is be loaded.  In
+            this case:
+
+            - `mask` and `other` must also be scalars,
+            - `other` is implicitly typecast to `pointer.dtype.element_ty`, and
+            - `boundary_check` and `padding_option` must be empty.
+
+        (2) If `pointer` is an N-dimensional tensor of pointers, an
+            N-dimensional tensor is loaded.  In this case:
+
+            - `mask` and `other` are implicitly broadcast to `pointer.shape`,
+            - `other` is implicitly typecast to `pointer.dtype.element_ty`, and
+            - `boundary_check` and `padding_option` must be empty.
+
+        (3) If `pointer` is a block pointer defined by `make_block_ptr`, a
+            tensor is loaded.  In this case:
+
+            - `mask` and `other` must be `None`, and
+            - `boundary_check` and `padding_option` can be specified to control the behavior of out-of-bound access.
+
+    :param pointer: Pointer to the data to be loaded
+    :type pointer: `triton.PointerType`, or block of `dtype=triton.PointerType`
+    :param mask: if `mask[idx]` is false, do not load the data at address `pointer[idx]`
+        (must be `None` with block pointers)
+    :type mask: Block of `triton.int1`, optional
+    :param other: if `mask[idx]` is false, return `other[idx]`
+    :type other: Block, optional
+    :param boundary_check: tuple of integers, indicating the dimensions which should do the boundary check
+    :type boundary_check: tuple of ints, optional
+    :param padding_option: should be one of {"", "zero", "nan"}, the padding value to use while out of bounds. "" means an undefined value.
+    :param cache_modifier: changes cache option in NVIDIA PTX
+    :type cache_modifier: str, optional, should be one of {"", ".ca", ".cg", ".cv"}, where ".ca" stands for
+        cache at all levels, ".cg" stands for cache at global level (cache in L2 and below, not L1),
+        and ".cv" means don’t cache and fetch again. see
+        `cache operator <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cache-operators>`_ for more details.
+    :param eviction_policy: changes eviction policy in NVIDIA PTX
+    :type eviction_policy: str, optional
+    :param volatile: changes volatile option in NVIDIA PTX
+    :type volatile: bool, optional
+    """
+    # `mask` and `other` can be constexpr
+    mask = _unwrap_if_constexpr(mask)
+    other = _unwrap_if_constexpr(other)
+    if mask is not None:
+        mask = _semantic.to_tensor(mask)
+    if other is not None:
+        other = _semantic.to_tensor(other)
+    padding_option = _unwrap_if_constexpr(padding_option)
+    cache_modifier = _unwrap_if_constexpr(cache_modifier)
+    eviction_policy = _unwrap_if_constexpr(eviction_policy)
+    volatile = _unwrap_if_constexpr(volatile)
+    return _semantic.load(pointer, mask, other, boundary_check, padding_option, cache_modifier, eviction_policy,
+                          volatile)
+
+
+@builtin
+def load_tensor_descriptor(desc: tensor_descriptor_base, offsets: Sequence[constexpr | tensor],
+                           _semantic=None) -> tensor:
+    """Load a block of data from a tensor descriptor."""
+    return desc.load(offsets, _semantic=_semantic)
+
+
+@builtin
+def store_tensor_descriptor(desc: tensor_descriptor_base, offsets: Sequence[constexpr | tensor], value: tensor,
+                            _semantic=None) -> tensor:
+    """Store a block of data to a tensor descriptor."""
+    return desc.store(offsets, value, _semantic=_semantic)
+
+
+@_tensor_member_fn
+@builtin
+def store(pointer, value, mask=None, boundary_check=(), cache_modifier="", eviction_policy="", _semantic=None):
+    """
+    Store a tensor of data into memory locations defined by `pointer`.
+
+        (1) If `pointer` is a single element pointer, a scalar is stored.  In
+            this case:
+
+            - `mask` must also be scalar, and
+            - `boundary_check` and `padding_option` must be empty.
+
+        (2) If `pointer` is an N-dimensional tensor of pointers, an
+            N-dimensional block is stored.  In this case:
+
+            - `mask` is implicitly broadcast to `pointer.shape`, and
+            - `boundary_check` must be empty.
+
+        (3) If `pointer` is a block pointer defined by `make_block_ptr`, a block
+            of data is stored.  In this case:
+
+            - `mask` must be None, and
+            - `boundary_check` can be specified to control the behavior of out-of-bound access.
+
+    `value` is implicitly broadcast to `pointer.shape` and typecast to `pointer.dtype.element_ty`.
+
+    :param pointer: The memory location where the elements of `value` are stored
+    :type pointer: `triton.PointerType`, or block of `dtype=triton.PointerType`
+    :param value: The tensor of elements to be stored
+    :type value: Block
+    :param mask: If `mask[idx]` is false, do not store `value[idx]` at `pointer[idx]`
+    :type mask: Block of triton.int1, optional
+    :param boundary_check: tuple of integers, indicating the dimensions which should do the boundary check
+    :type boundary_check: tuple of ints, optional
+    :param cache_modifier: changes cache option in NVIDIA PTX
+    :type cache_modifier: str, optional, should be one of {"", ".wb", ".cg", ".cs", ".wt"}, where ".wb" stands for
+        cache write-back all coherent levels, ".cg" stands for cache global, ".cs" stands for cache streaming, ".wt"
+        stands for cache write-through, see `cache operator <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cache-operators>`_ for more details.
+    :param eviction_policy: changes eviction policy in NVIDIA PTX
+    :type eviction_policy: str, optional, should be one of {"", "evict_first", "evict_last"}
+    """
+    # `value` can be constexpr
+    value = _semantic.to_tensor(value)
+    mask = _unwrap_if_constexpr(mask)
+    if mask is not None:
+        mask = _semantic.to_tensor(mask)
+    cache_modifier = _unwrap_if_constexpr(cache_modifier)
+    eviction_policy = _unwrap_if_constexpr(eviction_policy)
+    return _semantic.store(pointer, value, mask, boundary_check, cache_modifier, eviction_policy)
+
+
+@builtin
+def make_block_ptr(base: tensor, shape, strides, offsets, block_shape, order, _semantic=None):
+    """
+    Returns a pointer to a block in a parent tensor
+
+    :param base: The base pointer to the parent tensor
+    :param shape: The shape of the parent tensor
+    :param strides: The strides of the parent tensor
+    :param offsets: The offsets to the block
+    :param block_shape: The shape of the block
+    :param order: The order of the original data format
+    """
+    return _semantic.make_block_ptr(base, shape, strides, offsets, block_shape, order)
+
+
+@must_use_result(
+    "Note that tl.advance does not have any side effects. To move the block pointer, you need to assign the result of tl.advance to a variable."
+)
+@_tensor_member_fn
+@builtin
+def advance(base, offsets, _semantic=None):
+    """
+    Advance a block pointer
+
+    :param base: the block pointer to advance
+    :param offsets: the offsets to advance, a tuple by dimension
+    """
+    return _semantic.advance(base, offsets)
+
+
+@builtin
+def make_tensor_descriptor(
+    base: tensor,
+    shape: List[tensor],
+    strides: List[tensor],
+    block_shape: List[constexpr],
+    padding_option="zero",
+    _semantic=None,
+) -> tensor_descriptor:
+    """Make a tensor descriptor object
+
+    :param base: the base pointer of the tensor, must be 16-byte aligned
+    :param shape: A list of non-negative integers representing the tensor shape
+    :param strides: A list of tensor strides. Leading dimensions must be multiples
+        of 16-byte strides and the last dimension must be contiguous.
+    :param block_shape: The shape of block to be loaded/stored from global memory
+
+    Notes
+    *****
+    On NVIDIA GPUs with TMA support, this will result in a TMA descriptor object
+    and loads and stores from the descriptor will be backed by the TMA hardware.
+
+    Currently only 2-5 dimensional tensors are supported.
+
+    Example
+    *******
+    .. code-block:: python
+
+        @triton.jit
+        def inplace_abs(in_out_ptr, M, N, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr):
+            desc = tl.make_tensor_descriptor(
+                in_out_ptr,
+                shape=[M, N],
+                strides=[N, 1],
+                block_shape=[M_BLOCK, N_BLOCK],
+            )
+
+            moffset = tl.program_id(0) * M_BLOCK
+            noffset = tl.program_id(1) * N_BLOCK
+
+            value = desc.load([moffset, noffset])
+            desc.store([moffset, noffset], tl.abs(value))
+
+        # TMA descriptors require a global memory allocation
+        def alloc_fn(size: int, alignment: int, stream: Optional[int]):
+            return torch.empty(size, device="cuda", dtype=torch.int8)
+
+        triton.set_allocator(alloc_fn)
+
+        M, N = 256, 256
+        x = torch.randn(M, N, device="cuda")
+        M_BLOCK, N_BLOCK = 32, 32
+        grid = (M / M_BLOCK, N / N_BLOCK)
+        inplace_abs[grid](x, M, N, M_BLOCK, N_BLOCK)
+
+    """
+
+    padding_option = _unwrap_if_constexpr(padding_option)
+    return _semantic.make_tensor_descriptor(base, shape, strides, block_shape, padding_option)
+
+
+# -----------------------
+# Atomic Memory Operations
+# -----------------------
+
+
+def _add_atomic_docstr(name: str, has_cmp: bool = False) -> Callable[[T], T]:
+    """
+    Build a decorator that installs a generated docstring on an atomic builtin.
+
+    :param name: Name of the atomic operation, interpolated into the first
+        sentence of the generated docstring.
+    :param has_cmp: When true, the generated docstring also documents a
+        :code:`cmp` parameter.
+    :return: A decorator that overwrites :code:`func.__doc__` and returns
+        :code:`func` itself.
+    """
+
+    def _decorator(func: T) -> T:
+        # The docstring is assembled in three pieces: the common header, the
+        # optional `cmp` section, and the common trailer.
+        docstr = f"""
+    Performs an atomic {name} at the memory location specified by :code:`pointer`.
+
+    Return the data stored at :code:`pointer` before the atomic operation.
+
+    :param pointer: The memory locations to operate on
+    :type pointer: Block of dtype=triton.PointerDType"""
+        if has_cmp:
+            docstr += """
+    :param cmp: The values expected to be found in the atomic object
+    :type cmp: Block of dtype=pointer.dtype.element_ty"""
+        docstr += """
+    :param val: The values with which to perform the atomic operation
+    :type val: Block of dtype=pointer.dtype.element_ty
+    :param sem: Specifies the memory semantics for the operation. Acceptable values are "acquire",
+        "release", "acq_rel" (stands for "ACQUIRE_RELEASE"), and "relaxed". If not provided,
+        the function defaults to using "acq_rel" semantics.
+    :type sem: str, optional
+    :param scope: Defines the scope of threads that observe the synchronizing effect of the atomic operation.
+        Acceptable values are "gpu" (default), "cta" (cooperative thread array, thread block), or "sys" (stands for "SYSTEM"). The default value is "gpu".
+    :type scope: str, optional
+    """
+        # Replaces any docstring the decorated function already had.
+        func.__doc__ = docstr
+        return func
+
+    return _decorator
+
+
+@_tensor_member_fn
+@builtin
+@_add_atomic_docstr("compare-and-swap", has_cmp=True)
+def atomic_cas(pointer, cmp, val, sem=None, scope=None, _semantic=None):
+    cmp = _semantic.to_tensor(cmp)
+    val = _semantic.to_tensor(val)
+    sem = _unwrap_if_constexpr(sem)
+    scope = _unwrap_if_constexpr(scope)
+    return _semantic.atomic_cas(pointer, cmp, val, sem, scope)
+
+
+@_tensor_member_fn
+@builtin
+@_add_atomic_docstr("exchange")
+def atomic_xchg(pointer, val, mask=None, sem=None, scope=None, _semantic=None):
+    val = _semantic.to_tensor(val)
+    sem = _unwrap_if_constexpr(sem)
+    scope = _unwrap_if_constexpr(scope)
+    mask = _unwrap_if_constexpr(mask)
+    return _semantic.atomic_xchg(pointer, val, mask, sem, scope)
+
+
+@_tensor_member_fn
+@builtin
+@_add_atomic_docstr("add")
+def atomic_add(pointer, val, mask=None, sem=None, scope=None, _semantic=None):
+    val = _semantic.to_tensor(val)
+    sem = _unwrap_if_constexpr(sem)
+    scope = _unwrap_if_constexpr(scope)
+    mask = _unwrap_if_constexpr(mask)
+    return _semantic.atomic_add(pointer, val, mask, sem, scope)
+
+
+@_tensor_member_fn
+@builtin
+@_add_atomic_docstr("max")
+def atomic_max(pointer, val, mask=None, sem=None, scope=None, _semantic=None):
+    val = _semantic.to_tensor(val)
+    sem = _unwrap_if_constexpr(sem)
+    scope = _unwrap_if_constexpr(scope)
+    mask = _unwrap_if_constexpr(mask)
+    return _semantic.atomic_max(pointer, val, mask, sem, scope)
+
+
+@_tensor_member_fn
+@builtin
+@_add_atomic_docstr("min")
+def atomic_min(pointer, val, mask=None, sem=None, scope=None, _semantic=None):
+    val = _semantic.to_tensor(val)
+    sem = _unwrap_if_constexpr(sem)
+    scope = _unwrap_if_constexpr(scope)
+    mask = _unwrap_if_constexpr(mask)
+    return _semantic.atomic_min(pointer, val, mask, sem, scope)
+
+
+@_tensor_member_fn
+@builtin
+@_add_atomic_docstr("logical and")
+def atomic_and(pointer, val, mask=None, sem=None, scope=None, _semantic=None):
+    val = _semantic.to_tensor(val)
+    sem = _unwrap_if_constexpr(sem)
+    scope = _unwrap_if_constexpr(scope)
+    mask = _unwrap_if_constexpr(mask)
+    return _semantic.atomic_and(pointer, val, mask, sem, scope)
+
+
+@_tensor_member_fn
+@builtin
+@_add_atomic_docstr("logical or")
+def atomic_or(pointer, val, mask=None, sem=None, scope=None, _semantic=None):
+    val = _semantic.to_tensor(val)
+    sem = _unwrap_if_constexpr(sem)
+    scope = _unwrap_if_constexpr(scope)
+    mask = _unwrap_if_constexpr(mask)
+    return _semantic.atomic_or(pointer, val, mask, sem, scope)
+
+
+@_tensor_member_fn
+@builtin
+@_add_atomic_docstr("logical xor")
+def atomic_xor(pointer, val, mask=None, sem=None, scope=None, _semantic=None):
+    val = _semantic.to_tensor(val)
+    sem = _unwrap_if_constexpr(sem)
+    scope = _unwrap_if_constexpr(scope)
+    mask = _unwrap_if_constexpr(mask)
+    return _semantic.atomic_xor(pointer, val, mask, sem, scope)
+
+
+# -----------------------
+# Conditioning
+# -----------------------
+
+
+@builtin
+def where(condition, x, y, _semantic=None):
+    """
+    Returns a tensor of elements from either :code:`x` or :code:`y`, depending on :code:`condition`.
+
+    Note that :code:`x` and :code:`y` are always evaluated regardless of the value of :code:`condition`.
+
+    If you want to avoid unintended memory operations, use the :code:`mask` arguments in `triton.load` and `triton.store` instead.
+
+    The shape of :code:`x` and :code:`y` are both broadcast to the shape of :code:`condition`.
+    :code:`x` and :code:`y` must have the same data type.
+
+    :param condition: When True (nonzero), yield x, otherwise yield y.
+    :type condition: Block of triton.bool
+    :param x: values selected at indices where condition is True.
+    :param y: values selected at indices where condition is False.
+    """
+    condition = _semantic.to_tensor(condition)
+    x = _unwrap_if_constexpr(x)
+    y = _unwrap_if_constexpr(y)
+    return _semantic.where(condition, x, y)
+
+
+# -----------------------
+# Math
+# -----------------------
+
+
+@builtin
+def add(x, y, sanitize_overflow: constexpr = True, _semantic=None):
+    x = _unwrap_if_constexpr(x)
+    y = _unwrap_if_constexpr(y)
+    return _semantic.add(x, y, sanitize_overflow)
+
+
+@builtin
+def sub(x, y, sanitize_overflow: constexpr = True, _semantic=None):
+    x = _unwrap_if_constexpr(x)
+    y = _unwrap_if_constexpr(y)
+    return _semantic.sub(x, y, sanitize_overflow)
+
+
+@builtin
+def mul(x, y, sanitize_overflow: constexpr = True, _semantic=None):
+    x = _unwrap_if_constexpr(x)
+    y = _unwrap_if_constexpr(y)
+    return _semantic.mul(x, y, sanitize_overflow)
+
+
+@builtin
+def minimum(x, y, propagate_nan: constexpr = PropagateNan.NONE, _semantic=None):
+    """
+    Computes the element-wise minimum of :code:`x` and :code:`y`.
+
+    :param x: the first input tensor
+    :type x: Block
+    :param y: the second input tensor
+    :type y: Block
+    :param propagate_nan: whether to propagate NaN values.
+    :type propagate_nan: tl.PropagateNan
+
+    .. seealso:: :class:`tl.PropagateNan`
+    """
+    x = _semantic.to_tensor(x)
+    y = _semantic.to_tensor(y)
+    x = _promote_bfloat16_to_float32(x, _semantic=_semantic)
+    y = _promote_bfloat16_to_float32(y, _semantic=_semantic)
+    propagate_nan = _unwrap_if_constexpr(propagate_nan)
+    return _semantic.minimum(x, y, propagate_nan)
+
+
+@builtin
+def maximum(x, y, propagate_nan: constexpr = PropagateNan.NONE, _semantic=None):
+    """
+    Computes the element-wise maximum of :code:`x` and :code:`y`.
+
+    :param x: the first input tensor
+    :type x: Block
+    :param y: the second input tensor
+    :type y: Block
+    :param propagate_nan: whether to propagate NaN values.
+    :type propagate_nan: tl.PropagateNan
+
+    .. seealso:: :class:`tl.PropagateNan`
+    """
+    x = _semantic.to_tensor(x)
+    y = _semantic.to_tensor(y)
+    x = _promote_bfloat16_to_float32(x, _semantic=_semantic)
+    y = _promote_bfloat16_to_float32(y, _semantic=_semantic)
+    propagate_nan = _unwrap_if_constexpr(propagate_nan)
+    return _semantic.maximum(x, y, propagate_nan)
+
+
+@builtin
+def clamp(x, min, max, propagate_nan: constexpr = PropagateNan.NONE, _semantic=None):
+    """
+    Clamps the input tensor :code:`x` to the closed interval [min, max].
+    The result is undefined when :code:`min` > :code:`max`.
+
+    :param x: the input tensor
+    :type x: Block
+    :param min: the lower bound of the interval
+    :type min: Block
+    :param max: the upper bound of the interval
+    :type max: Block
+    :param propagate_nan: whether to propagate NaN values. Applies only to the :code:`x` tensor.
+        If either :code:`min` or :code:`max` is NaN, the result is undefined.
+    :type propagate_nan: tl.PropagateNan
+
+    .. seealso:: :class:`tl.PropagateNan`
+    """
+    value = _semantic.to_tensor(x)
+    lower = _semantic.to_tensor(min)
+    upper = _semantic.to_tensor(max)
+    # bfloat16 operands are widened to float32 before the clamp is emitted.
+    value = _promote_bfloat16_to_float32(value, _semantic=_semantic)
+    lower = _promote_bfloat16_to_float32(lower, _semantic=_semantic)
+    upper = _promote_bfloat16_to_float32(upper, _semantic=_semantic)
+    nan_mode = _unwrap_if_constexpr(propagate_nan)
+
+    return _semantic.clamp(value, lower, upper, nan_mode)
+
+
+# -----------------------
+# Reductions
+# -----------------------
+
+
+def _add_reduction_docstr(name: str, return_indices_arg: str = None, tie_break_arg: str = None,
+                          dtype_arg: str = None) -> Callable[[T], T]:
+    # Decorator factory that installs a generated docstring on a reduction function.
+    # Each optional ``*_arg`` is the parameter name to document; None omits that entry.
+
+    def _decorator(func: T) -> T:
+        # The first part keeps a literal {name} placeholder; it is filled in by the
+        # final .format() call applied to the joined text.
+        parts = ["""
+    Returns the {name} of all elements in the :code:`input` tensor along the provided :code:`axis`
+
+    :param input: the input values
+    :type input: Tensor
+    :param axis: the dimension along which the reduction should be done. If None, reduce all dimensions
+    :type axis: int
+    :param keep_dims: if true, keep the reduced dimensions with length 1
+    :type keep_dims: bool"""]
+        if return_indices_arg is not None:
+            parts.append(f"""
+    :param {return_indices_arg}: if true, return index corresponding to the {name} value
+    :type {return_indices_arg}: bool""")
+        if tie_break_arg is not None:
+            parts.append(f"""
+    :param {tie_break_arg}: if true, in case of a tie (i.e., multiple elements have the same {name} value), return the left-most index for values that aren't NaN
+    :type {tie_break_arg}: bool""")
+        if dtype_arg is not None:
+            parts.append(f"""
+    :param {dtype_arg}: the desired data type of the returned tensor. If specified, the input tensor is casted to :code:`{dtype_arg}` before the operation is performed. This is useful for preventing data overflows. If not specified, integer and bool dtypes are upcasted to :code:`tl.int32` and float dtypes are upcasted to at least :code:`tl.float32`.
+    :type {dtype_arg}: tl.dtype""")
+
+        func.__doc__ = "".join(parts).format(name=name)
+        return func
+
+    return _decorator
+
+
+@contextmanager
+def _insertion_guard(builder):
+    """Save the builder's insertion point on entry and restore it on exit.
+
+    The restore runs in a ``finally`` clause, so the saved insertion point is
+    put back even when the ``with`` body raises.
+
+    :param builder: object providing :code:`get_insertion_point()` and
+        :code:`restore_insertion_point(ip)`
+    """
+    ip = builder.get_insertion_point()
+    try:
+        yield
+    finally:
+        builder.restore_insertion_point(ip)
+
+
+@_tensor_member_fn
+@builtin
+def reduce(input, axis, combine_fn, keep_dims=False, _semantic=None, _generator=None):
+    """Applies the combine_fn to all elements in :code:`input` tensors along the provided :code:`axis`
+
+    :param input: the input tensor, or tuple of tensors
+    :type input: Tensor
+    :param axis: the dimension along which the reduction should be done. If None, reduce all dimensions
+    :type axis: int | None
+    :param combine_fn: a function to combine two groups of scalar tensors (must be marked with @triton.jit)
+    :type combine_fn: Callable
+    :param keep_dims: if true, keep the reduced dimensions with length 1
+    :type keep_dims: bool
+    :return: a single tensor when :code:`input` is a single tensor, otherwise the collection of
+        results produced by the reduction
+
+    """
+    # A bare tensor is wrapped in a 1-tuple, reduced through the tuple path, and unwrapped again.
+    if isinstance(input, tensor):
+        return reduce((input, ), axis, combine_fn, keep_dims=keep_dims, _semantic=_semantic, _generator=_generator)[0]
+
+    def make_combine_region(reduce_op):
+        # Fills region 0 of `reduce_op` with a block that calls `combine_fn`.
+        # The list of scalar types is repeated twice: the block takes two scalar
+        # arguments per input tensor (the two groups that combine_fn combines).
+        param_types = [t.type.scalar for t in input] * 2
+        region = reduce_op.get_region(0)
+        builder = _semantic.builder
+        # The guard restores the builder's insertion point after the block has been built.
+        with _insertion_guard(builder):
+            to_ir = lambda T: T.to_ir(builder)
+            block = builder.create_block_with_parent(region, list(map(to_ir, param_types)))
+            args = [tensor(block.arg(i), ty) for i, ty in enumerate(param_types)]
+            results = _generator.call_JitFunction(combine_fn, args, kwargs={})
+            # combine_fn may return one tensor or several; normalize to a list of IR handles.
+            if isinstance(results, tensor):
+                handles = [results.handle]
+            else:
+                handles = [r.handle for r in results]
+            builder.create_reduce_ret(*handles)
+
+    def expand_ndims(t, ndims):
+        # Calls expand_dims at position 0 `ndims` times.
+        for _ in builtins.range(ndims):
+            t = expand_dims(t, 0, _semantic=_semantic)
+        return t
+
+    # axis and keep_dims may arrive wrapped in constexpr.
+    axis = _unwrap_if_constexpr(axis)
+    keep_dims = _unwrap_if_constexpr(keep_dims)
+    if axis is not None:
+        # NOTE(review): _wrap_axis presumably normalizes negative axes against the input rank -- confirm.
+        axis = _wrap_axis(axis, len(input[0].shape))
+    ret = _semantic.reduction(input, axis, make_combine_region)
+    if keep_dims:
+        if axis is not None:
+            # Re-insert the reduced axis in every result.
+            ret = tuple(expand_dims(t, axis, _semantic=_semantic) for t in ret)
+        else:
+            # All axes were reduced: add back as many dimensions as the input had.
+            ret = tuple(expand_ndims(t, len(input[0].shape)) for t in ret)
+    return ret
+
+
+@builtin
+def _promote_bfloat16_to_float32(t, _semantic=None):
+    """Return :code:`t` cast to float32 when its scalar type is bfloat16, else :code:`t` unchanged."""
+    # hardware doesn't support FMAX, FMIN, CMP for bfloat16
+    if t.type.scalar is not bfloat16:
+        return t
+    return t.to(float32, _semantic=_semantic)
+
+
+@builtin
+def _reduce_with_indices(input, axis, combine_fn, keep_dims=False, _semantic=None, _generator=None):
+    """Reduce :code:`input` together with an index tensor counting positions along :code:`axis`.
+
+    Returns the pair of results (value, index) produced by :code:`reduce`.
+    """
+    axis = _unwrap_if_constexpr(axis)
+    shape = input.shape
+    index = arange(0, shape[axis], _semantic=_semantic)
+
+    rank = len(shape)
+    if rank > 1:
+        # Broadcast index across the non-reduced axes: expand along every axis
+        # except the reduced one, then broadcast to the full input shape.
+        other_axes = [constexpr(d) for d in builtins.range(rank)]
+        other_axes.pop(axis)
+        index = expand_dims(index, other_axes, _semantic=_semantic)
+        index = broadcast_to(index, shape, _semantic=_semantic)
+
+    rvalue, rindices = reduce((input, index), axis, combine_fn, keep_dims=keep_dims, _semantic=_semantic,
+                              _generator=_generator)
+    return rvalue, rindices
+
+
+# -----------------------
+# Scans
+# -----------------------
+
+
+def _add_scan_docstr(name: str, dtype_arg: str = None) -> Callable[[T], T]:
+    # Decorator factory that installs a generated docstring on a scan function.
+    # `dtype_arg` is the parameter name to document; None omits that entry.
+
+    def _decorator(func: T) -> T:
+        # The first part keeps a literal {name} placeholder; it is filled in by the
+        # final .format() call applied to the joined text.
+        parts = ["""
+    Returns the {name} of all elements in the :code:`input` tensor along the provided :code:`axis`
+
+    :param input: the input values
+    :type input: Tensor
+    :param axis: the dimension along which the scan should be done
+    :type axis: int
+    :param reverse: if true, the scan is performed in the reverse direction
+    :type reverse: bool"""]
+
+        if dtype_arg is not None:
+            parts.append(f"""
+    :param {dtype_arg}: the desired data type of the returned tensor. If specified, the input tensor is casted to :code:`{dtype_arg}` before the operation is performed. If not specified, small integer types (< 32 bits) are upcasted to prevent overflow. Note that :code:`tl.bfloat16` inputs are automatically promoted to :code:`tl.float32`.
+    :type {dtype_arg}: tl.dtype""")
+
+        func.__doc__ = "".join(parts).format(name=name)
+        return func
+
+    return _decorator
+
+
+@_tensor_member_fn
+@builtin
+def associative_scan(input, axis, combine_fn, reverse=False, _semantic=None, _generator=None):
+    """Applies the combine_fn to each element with a carry in :code:`input` tensors along the provided :code:`axis` and updates the carry
+
+    :param input: the input tensor, or tuple of tensors
+    :type input: Tensor
+    :param axis: the dimension along which the scan should be done
+    :type axis: int
+    :param combine_fn: a function to combine two groups of scalar tensors (must be marked with @triton.jit)
+    :type combine_fn: Callable
+    :param reverse: whether to apply the associative scan in the reverse direction along axis
+    :type reverse: bool
+    :return: a single tensor when :code:`input` is a single tensor, otherwise the collection of
+        results produced by the scan
+
+    """
+    # A bare tensor is wrapped in a 1-tuple, scanned through the tuple path, and unwrapped again.
+    if isinstance(input, tensor):
+        return associative_scan((input, ), axis, combine_fn, reverse, _semantic=_semantic, _generator=_generator)[0]
+
+    def make_combine_region(scan_op):
+        # Fills region 0 of `scan_op` with a block that calls `combine_fn`.
+        # The list of scalar types is repeated twice: the block takes two scalar
+        # arguments per input tensor (the two groups that combine_fn combines).
+        param_types = [t.type.scalar for t in input] * 2
+        region = scan_op.get_region(0)
+        builder = _semantic.builder
+        # The guard restores the builder's insertion point after the block has been built.
+        with _insertion_guard(builder):
+            to_ir = lambda T: T.to_ir(builder)
+            block = builder.create_block_with_parent(region, list(map(to_ir, param_types)))
+            args = [tensor(block.arg(i), ty) for i, ty in enumerate(param_types)]
+            results = _generator.call_JitFunction(combine_fn, args, kwargs={})
+            # combine_fn may return one tensor or several; normalize to a list of IR handles.
+            if isinstance(results, tensor):
+                handles = [results.handle]
+            else:
+                handles = [r.handle for r in results]
+            builder.create_scan_ret(*handles)
+
+    # axis may arrive wrapped in constexpr.
+    axis = _unwrap_if_constexpr(axis)
+    if axis is not None:
+        # NOTE(review): _wrap_axis presumably normalizes negative axes against the input rank -- confirm.
+        axis = _wrap_axis(axis, len(input[0].shape))
+    return _semantic.associative_scan(input, axis, make_combine_region, reverse)
+
+
+@_tensor_member_fn
+@builtin
+def histogram(input, num_bins, mask=None, _semantic=None, _generator=None):
+    """Computes a histogram of the input tensor with num_bins bins; the bins have a width of 1 and start at 0.
+
+    :param input: the input tensor
+    :type input: Tensor
+    :param num_bins: number of histogram bins
+    :type num_bins: int
+    :param mask: if `mask[idx]` is false, exclude `input[idx]` from histogram
+    :type mask: Block of `triton.int1`, optional
+
+    """
+    bin_count = _unwrap_if_constexpr(num_bins)
+    selector = _unwrap_if_constexpr(mask)
+    # A missing mask is forwarded as None; anything else is converted to a tensor first.
+    selector = None if selector is None else _semantic.to_tensor(selector)
+    return _semantic.histogram(input, bin_count, selector)
+
+
+@_tensor_member_fn
+@builtin
+def gather(src, index, axis, _semantic=None):
+    """Gathers values from a tensor along the given dimension.
+
+    :param src: the source tensor
+    :type src: Tensor
+    :param index: the index tensor
+    :type index: Tensor
+    :param axis: the dimension to gather along
+    :type axis: int
+
+    """
+    # Each argument may arrive wrapped in constexpr; unwrap in place while forwarding.
+    return _semantic.gather(
+        _unwrap_if_constexpr(src),
+        _unwrap_if_constexpr(index),
+        _unwrap_if_constexpr(axis),
+    )
+
+
+@builtin
+def map_elementwise(
+    scalar_fn: Callable[..., Tuple[tensor, ...]],
+    *args: tensor,
+    pack=1,
+    _semantic=None,
+    _generator=None,
+):
+    '''
+        Map a scalar function over a tensor.
+
+        The input tensors :code:`args` are implicitly broadcasted to the same shape.
+
+        This may be useful in allowing control flow over single elements in a tensor,
+        for example a multi-branch function where one branch is more expensive. With
+        :code:`tl.where` you are forced to calculate both sides of the branch, but
+        with an if we only execute one side.
+
+        .. highlight:: python
+        .. code-block:: python
+
+            @triton.jit
+            def selu_scalar(x, alpha):
+                if x > 0:
+                    return x
+                else:
+                    return alpha * (tl.exp(x) - 1)
+
+            @triton.jit
+            def selu(x, alpha):
+                return tl.map_elementwise(selu_scalar, x, alpha)
+
+        :param scalar_fn: the function to map over.
+        :param args: the input tensors; the function receives :code:`pack` scalars per input.
+        :param pack: the number of elements to be processed by one function call.
+        :return: one tensor or a tuple of tensors, depending on the mapped function.
+    '''
+    # Build the block for the nested region first to discover the return types
+    assert pack >= 1
+    in_scalar_tys = [t.type.scalar for t in args]
+    builder = _semantic.builder
+    block = builder.new_block()
+    scalar_args = []
+    original_loc = builder.get_loc()
+    # Block arguments are laid out input-major: input i occupies positions
+    # i * pack .. i * pack + pack - 1, all of that input's scalar type.
+    for i, ty in enumerate(in_scalar_tys):
+        for j in builtins.range(pack):
+            block.add_argument_at(ty.to_ir(builder), original_loc)
+            scalar_args.append(tensor(block.arg(i * pack + j), ty))
+
+    # Emit the body of scalar_fn into the new block; the guard restores the
+    # builder's insertion point after the block has been built.
+    with _insertion_guard(builder):
+        builder.set_insertion_point_to_start(block)
+        scalar_results = _generator.call_JitFunction(scalar_fn, scalar_args, kwargs={})
+
+        # Remember whether the function returned a bare tensor so the final
+        # result can be unwrapped the same way; a lone tensor becomes a 1-tuple.
+        is_single = isinstance(scalar_results, tensor)
+        if is_single:
+            scalar_results = scalar_results,
+
+        handles = [r.handle for r in scalar_results]
+        builder.set_loc(original_loc)
+        builder.create_map_elementwise_ret(handles)
+
+    fn_result_types = [x.type for x in scalar_results]
+    scalar_result_types = fn_result_types
+    if pack > 1:
+        # With packing, every `pack` consecutive results form one group; take the
+        # first type of each group and require the rest of the group to match it.
+        scalar_result_types = fn_result_types[::pack]
+        for offset in builtins.range(1, pack):
+            assert scalar_result_types == fn_result_types[offset::pack], "type mismatch in unpacked results"
+
+    def make_elementwise_region(elementwise_op):
+        # Attaches the block built above to region 0 of the elementwise op.
+        region = elementwise_op.get_region(0)
+        region.push_back(block)
+
+    builder.set_loc(original_loc)
+    result = _semantic.map_elementwise(args, scalar_result_types, pack, make_elementwise_region)
+    return result[0] if is_single else result
+
+
+# -----------------------
+# Compiler Hint Ops
+# -----------------------
+
+
+@builtin
+def debug_barrier(_semantic=None):
+    """
+    Inserts a barrier that synchronizes all threads in a block.
+    """
+    barrier = _semantic.debug_barrier()
+    return barrier
+
+
+@builtin
+def multiple_of(input, values, _semantic=None):
+    """
+    Let the compiler know that the values in :code:`input` are all multiples of :code:`values`.
+
+    :param input: the tensor the hint is attached to
+    :param values: a single :code:`constexpr[int]`, or a sequence of them
+    :raises TypeError: if an element is not a :code:`constexpr` wrapping an :code:`int`
+    """
+    # A lone constexpr is treated as a one-element list.
+    if isinstance(values, constexpr):
+        values = [values]
+    for i, d in enumerate(values):
+        if not isinstance(d, constexpr):
+            raise TypeError(f"values element {i} must have type `constexpr`")
+        if not isinstance(d.value, int):
+            raise TypeError(f"values element {i} must have type `constexpr[int]`, got `constexpr[{type(d.value)}]`")
+    # The semantic layer receives the plain Python ints.
+    values = [x.value for x in values]
+    return _semantic.multiple_of(input, values)
+
+
+@builtin
+def max_contiguous(input, values, _semantic=None):
+    """
+    Let the compiler know that the :code:`values` first values in :code:`input` are contiguous.
+
+    :param input: the tensor the hint is attached to
+    :param values: a single :code:`constexpr[int]`, or a sequence of them
+    :raises TypeError: if an element is not a :code:`constexpr` wrapping an :code:`int`
+    """
+    # A lone constexpr is treated as a one-element list.
+    if isinstance(values, constexpr):
+        values = [values]
+    for i, d in enumerate(values):
+        if not isinstance(d, constexpr):
+            raise TypeError(f"values element {i} must have type `constexpr`")
+        if not isinstance(d.value, int):
+            raise TypeError(f"values element {i} must have type `constexpr[int]`, got `constexpr[{type(d.value)}]`")
+    # The semantic layer receives the plain Python ints.
+    values = [x.value for x in values]
+    return _semantic.max_contiguous(input, values)
+
+
+@builtin
+def max_constancy(input, values, _semantic=None):
+    """
+    Let the compiler know that the :code:`values` first values in :code:`input` are constant.
+
+    e.g. if :code:`values` is [4], then each group of 4 values in :code:`input` should all be equal,
+    for example [0, 0, 0, 0, 1, 1, 1, 1].
+
+    :param input: the tensor the hint is attached to
+    :param values: a single :code:`constexpr[int]`, or a sequence of them
+    :raises TypeError: if an element is not a :code:`constexpr` wrapping an :code:`int`
+    """
+    # A lone constexpr is treated as a one-element list.
+    if isinstance(values, constexpr):
+        values = [values]
+    for i, d in enumerate(values):
+        if not isinstance(d, constexpr):
+            raise TypeError(f"values element {i} must have type `constexpr`")
+        if not isinstance(d.value, int):
+            raise TypeError(f"values element {i} must have type `constexpr[int]`, got `constexpr[{type(d.value)}]`")
+    # The semantic layer receives the plain Python ints.
+    values = [x.value for x in values]
+    return _semantic.max_constancy(input, values)
+
+
+@builtin
+def assume(cond, _semantic=None):
+    """
+    Allows the compiler to assume that :code:`cond` is True.
+    """
+    # The condition is converted to a tensor before being handed to the semantic layer.
+    cond_tensor = _semantic.to_tensor(cond)
+    return _semantic.assume(cond_tensor)
+
+
+# -----------------------
+# Debugging functions
+# -----------------------
+
+
+@builtin
+def static_print(*values, sep: str = " ", end: str = "\n", file=None, flush=False, _semantic=None):
+    '''
+    Print the values at compile time.  The parameters are the same as the builtin :code:`print`.
+
+    NOTE: Calling the Python builtin :code:`print` is not the same as calling this, it instead maps to :code:`device_print`,
+    which has special requirements for the arguments.
+
+    .. highlight:: python
+    .. code-block:: python
+
+        tl.static_print(f"BLOCK_SIZE={BLOCK_SIZE}")
+    '''
+    # The body is intentionally empty: this function itself performs no printing.
+    # NOTE(review): presumably the JIT code generator intercepts calls to it at compile time -- confirm.
+    pass
+
+
+@builtin
+def static_assert(cond, msg="", _semantic=None):
+    '''
+    Assert the condition at compile time.  Does not require that the :code:`TRITON_DEBUG` environment variable
+    is set.
+
+    .. highlight:: python
+    .. code-block:: python
+
+        tl.static_assert(BLOCK_SIZE == 1024)
+    '''
+    # The body is intentionally empty: this function itself performs no check.
+    # NOTE(review): presumably the JIT code generator intercepts calls to it at compile time -- confirm.
+    pass
+
+
+@builtin
+def device_print(prefix, *args, hex=False, _semantic=None):
+    '''
+    Print the values at runtime from the device.  String formatting does not work for runtime values, so you should
+    provide the values you want to print as arguments.  The first value must be a string, all following values must
+    be scalars or tensors.
+
+    Calling the Python builtin :code:`print` is the same as calling this function, and the requirements for the arguments will match
+    this function (not the normal requirements for :code:`print`).
+
+    .. highlight:: python
+    .. code-block:: python
+
+        tl.device_print("pid", pid)
+        print("pid", pid)
+
+    On CUDA, printfs are streamed through a buffer of limited size (on one host,
+    we measured the default as 6912 KiB, but this may not be consistent across
+    GPUs and CUDA versions).  If you notice some printfs are being dropped, you
+    can increase the buffer size by calling
+
+    .. highlight:: python
+    .. code-block:: python
+
+        triton.runtime.driver.active.utils.set_printf_fifo_size(size_bytes)
+
+    CUDA may raise an error if you try to change this value after running a
+    kernel that uses printfs.  The value set here may only affect the current
+    device (so if you have multiple GPUs, you'd need to call it multiple times).
+
+    :param prefix: a prefix to print before the values. This is required to be a string literal
+        made only of characters in :code:`string.printable`.
+    :param args: the values to print. They can be any tensor or scalar.
+    :param hex: print all values as hex instead of decimal
+    '''
+    import string
+    prefix = _unwrap_if_constexpr(prefix)
+    assert isinstance(prefix, str), f"{prefix} is not string"
+    # for/else: the else branch runs only when no unprintable character was found.
+    for ch in prefix:
+        if ch not in string.printable:
+            b_ascii = False
+            break
+    else:
+        b_ascii = True
+    assert b_ascii, f"{prefix} is not an ascii string"
+    # Every value to print is converted to a tensor before being forwarded.
+    tensors = [_semantic.to_tensor(arg) for arg in args]
+    return _semantic.device_print(prefix, tensors, hex)
+
+
+@builtin
+def device_assert(cond, msg="", mask=None, _semantic=None):
+    '''
+    Assert the condition at runtime from the device.  Requires that the environment variable :code:`TRITON_DEBUG`
+    is set to a value besides :code:`0` in order for this to have any effect.
+
+    Using the Python :code:`assert` statement is the same as calling this function, except that the second argument
+    must be provided and must be a string, e.g. :code:`assert pid == 0, "pid != 0"`.  The environment variable must
+    be set for this :code:`assert` statement to have any effect.
+
+    .. highlight:: python
+    .. code-block:: python
+
+        tl.device_assert(pid == 0)
+        assert pid == 0, f"pid != 0"
+
+    :param cond: the condition to assert. This is required to be a boolean tensor.
+    :param msg: the message to print if the assertion fails. This is required to be a string literal.
+    :param mask: optional; when given it is converted to a tensor and forwarded with the assertion.
+    '''
+    message = _unwrap_if_constexpr(msg)
+    guard = _unwrap_if_constexpr(mask)
+    # A missing mask is forwarded as None; anything else is converted to a tensor first.
+    guard = None if guard is None else _semantic.to_tensor(guard)
+    return _semantic.device_assert(_semantic.to_tensor(cond), message, guard)
+
+
+@builtin
+def inline_asm_elementwise(asm: str, constraints: str, args: Sequence, dtype: Union[dtype, Sequence[dtype]],
+                           is_pure: bool, pack: int, _semantic=None):
+    '''
+        Execute inline assembly over a tensor.  Essentially, this is :code:`map`
+        where the function is inline assembly.
+
+        The input tensors :code:`args` are implicitly broadcasted to the same shape.
+
+        :code:`dtype` can be a tuple of types, in which case the output is a
+        tuple of tensors.
+
+        Each invocation of the inline asm processes :code:`pack` elements at a
+        time.  Exactly which set of inputs a block receives is unspecified.
+        Input elements of size less than 4 bytes are packed into 4-byte
+        registers.
+
+        This op does not support empty :code:`dtype` -- the inline asm must
+        return at least one tensor, even if you don't need it.  You can work
+        around this by returning a dummy tensor of arbitrary type; it shouldn't
+        cost you anything if you don't use it.
+
+        Example using
+        `PTX <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html>`_
+        assembly:
+
+        .. highlight:: python
+        .. code-block:: python
+
+            @triton.jit
+            def kernel(A, B, C, D, BLOCK: tl.constexpr):
+                a = tl.load(A + tl.arange(0, BLOCK)) # uint8 tensor
+                b = tl.load(B + tl.arange(0, BLOCK)) # float32 tensor
+
+                # For each (a,b) in zip(a,b), perform the following:
+                # - Let ai be `a` converted to int32.
+                # - Let af be `ai` converted to float.
+                # - Let m be the max of af and b.
+                # - Return ai and m.
+                # Do the above 4 elements at a time.
+                (c, d) = tl.inline_asm_elementwise(
+                    asm="""
+                    {
+                        // Unpack `a` into `ai`.
+                        .reg .b8 tmp<4>;
+                        mov.b32 {tmp0, tmp1, tmp2, tmp3}, $8;
+                        cvt.u32.u8 $0, tmp0;
+                        cvt.u32.u8 $1, tmp1;
+                        cvt.u32.u8 $2, tmp2;
+                        cvt.u32.u8 $3, tmp3;
+                    }
+                    // Convert `ai` to float.
+                    cvt.rn.f32.s32 $4, $0;
+                    cvt.rn.f32.s32 $5, $1;
+                    cvt.rn.f32.s32 $6, $2;
+                    cvt.rn.f32.s32 $7, $3;
+                    // Take max of `af` and `b`.
+                    max.f32 $4, $4, $9;
+                    max.f32 $5, $5, $10;
+                    max.f32 $6, $6, $11;
+                    max.f32 $7, $7, $12;
+                    """,
+                    constraints=(
+                        # 8 output registers, namely
+                        #   $0=ai0, $1=ai1, $2=ai2, $3=ai3,
+                        #   $4=m0,  $5=m1,  $6=m2,  $7=m3.
+                        "=r,=r,=r,=r,=r,=r,=r,=r,"
+                        # 5 input registers, namely
+                        #   $8=a (four packed uint8 elements),
+                        #   $9=b0, $10=b1, $11=b2, $12=b3.
+                        # The four elements from `a` are all packed into one register.
+                        "r,r,r,r,r"),
+                    args=[a, b],
+                    dtype=(tl.int32, tl.float32),
+                    is_pure=True,
+                    pack=4,
+                )
+                tl.store(C + tl.arange(0, BLOCK), c)
+                tl.store(D + tl.arange(0, BLOCK), d)
+
+        :param asm: assembly to run.  Must match target's assembly format.
+        :param constraints: asm constraints in
+            `LLVM format <https://llvm.org/docs/LangRef.html#inline-asm-constraint-string>`_
+        :param args: the input tensors, whose values are passed to the asm block
+        :param dtype: the element type(s) of the returned tensor(s)
+        :param is_pure: if true, the compiler assumes the asm block has no side-effects
+        :param pack: the number of elements to be processed by one instance of inline assembly
+        :return: one tensor or a tuple of tensors of the given dtypes
+    '''
+    # These arguments may arrive wrapped in constexpr.
+    asm = _unwrap_if_constexpr(asm)
+    constraints = _unwrap_if_constexpr(constraints)
+    pack = _unwrap_if_constexpr(pack)
+    is_pure = _unwrap_if_constexpr(is_pure)
+
+    # Wrap `dtype` in a tuple if it's not already.
+    # Iterability decides the return shape: an iterable `dtype` yields a tuple
+    # of tensors, a single dtype yields a bare tensor.
+    try:
+        iter(dtype)  # type: ignore
+        has_multiple_outputs = True
+    except TypeError:
+        has_multiple_outputs = False
+        dtype = (dtype, )  # type: ignore
+
+    # typing.cast only informs type checkers; it performs no runtime conversion.
+    dtype = typing.cast(Sequence[_DtypeClass], dtype)
+
+    # Result types default to the bare element dtypes; they are replaced below
+    # when the broadcast arguments have a non-empty shape.
+    res_tys = dtype
+    # All arguments are converted to tensors; the branch is skipped when there are none.
+    if dispatch_args := [_semantic.to_tensor(arg) for arg in args]:
+        bin_op_type_checking = partial(
+            _semantic.binary_op_type_checking_impl,
+            arithmetic_check=False,
+            allow_lhs_ptr=True,
+            allow_rhs_ptr=True,
+        )
+        broadcast_arg = dispatch_args[0]
+        # Get the broadcast shape over all the arguments
+        for item in dispatch_args:
+            _, broadcast_arg = bin_op_type_checking(item, broadcast_arg)
+        if broadcast_arg.shape:
+            # Change the shape of each argument based on the broadcast shape
+            for i, item in enumerate(dispatch_args):
+                dispatch_args[i], _ = bin_op_type_checking(item, broadcast_arg)
+            res_tys = [broadcast_arg.type.with_element_ty(dt) for dt in dtype]
+    handles = [t.handle for t in dispatch_args]
+    builder = _semantic.builder
+    call = builder.create_inline_asm(asm, constraints, handles, [ty.to_ir(builder) for ty in res_tys], is_pure, pack)
+
+    # Mirror the shape of `dtype` in the return value.
+    if not has_multiple_outputs:
+        return tensor(call.get_result(0), res_tys[0])
+    return tuple(tensor(call.get_result(i), ty) for i, ty in enumerate(res_tys))
+
+
+# -----------------------
+# Iterators
+# -----------------------
+
+
+class static_range(base_value):
+    """
+    Range-like loop iterator whose bounds and step must be compile-time constants.
+
+    .. highlight:: python
+    .. code-block:: python
+
+        @triton.jit
+        def kernel(...):
+            for i in tl.static_range(10):
+                ...
+    :note: This is a special iterator used to implement similar semantics to Python's :code:`range` in the context of
+        :code:`triton.jit` functions. In addition, it also guides the compiler to unroll the loop aggressively.
+    :param arg1: the start value, or the end value when :code:`arg2` is omitted (the start is then 0).
+    :param arg2: the end value.
+    :param step: the step value (defaults to 1).
+    """
+
+    def __init__(self, arg1, arg2=None, step=None):
+        # Every bound that is supplied must be a constexpr.
+        assert isinstance(arg1, constexpr), f"{arg1} used as tl.static_range start value is not a constexpr"
+        if step is None:
+            self.step = constexpr(1)
+        else:
+            assert isinstance(step, constexpr), f"{step} used as tl.static_range step value is not a constexpr"
+            self.step = step
+        if arg2 is None:
+            # Single-argument form: arg1 is the end and the range starts at 0.
+            self.start = constexpr(0)
+            self.end = arg1
+        else:
+            assert isinstance(arg2, constexpr), f"{arg2} used as tl.static_range end value is not a constexpr"
+            self.start = arg1
+            self.end = arg2
+
+    # Iterating the object from plain Python is rejected; it only carries
+    # start/end/step for use inside @triton.jit'd functions.
+    def __iter__(self):
+        raise RuntimeError("static_range can only be used in @triton.jit'd functions")
+
+    def __next__(self):
+        raise RuntimeError("static_range can only be used in @triton.jit'd functions")
+
+
+class range(base_value):
+    """
+    Range-like loop iterator that also carries loop attributes for the compiler.
+
+    .. highlight:: python
+    .. code-block:: python
+
+        @triton.jit
+        def kernel(...):
+            for i in tl.range(10, num_stages=3):
+                ...
+    :note: This is a special iterator used to implement similar semantics to Python's :code:`range` in the context of
+        :code:`triton.jit` functions. In addition, it allows user to pass extra attributes to the compiler.
+    :param arg1: the start value, or the end value when :code:`arg2` is omitted (the start is then 0).
+    :param arg2: the end value.
+    :param step: the step value (defaults to 1).
+    :param num_stages: pipeline the loop into this many stages (so there are
+        :code:`num_stages` iterations of the loop in flight at once).
+
+        Note this is subtly different than passing :code:`num_stages` as a
+        kernel argument.  The kernel argument only pipelines loads that feed
+        into :code:`dot` operations, while this attribute tries to pipeline most
+        (though not all) loads in this loop.
+    :param loop_unroll_factor: Tells the Triton IR level loop unroller how many
+        times to unroll a for loop that this range is used with. Less than 2 for
+        this value implies no unrolling.
+    :param disallow_acc_multi_buffer: If true, prevent the accumulator of the dot
+        operation in the loop to be multi-buffered, if applicable.
+    :param flatten: automatically flatten the loop nest starting at this loop to
+        create a single flattened loop. The compiler will try to pipeline the
+        flattened loop which can avoid stage stalling.
+    :param warp_specialize: Enable automatic warp specialization on the loop.
+        The compiler will attempt to partition memory, MMA, and vector
+        operations in the loop into separate async partitions. This will
+        increase the total number of warps required by the kernel.
+
+        Note that warp specialization is only supported on Blackwell GPUs and
+        only works on simple matmul loops. Support for arbitrary loops will be
+        expanded over time.
+    :param disable_licm: Tells the compiler it shouldn't hoist loop invariant
+        code outside the loop. This is often useful to avoid creating long liveranges
+        within a loop.
+    """
+
+    def __init__(self, arg1, arg2=None, step=None, num_stages=None, loop_unroll_factor=None,
+                 disallow_acc_multi_buffer=False, flatten=False, warp_specialize=False, disable_licm=False):
+        if step is None:
+            self.step = constexpr(1)
+        else:
+            self.step = step
+        if arg2 is None:
+            # Single-argument form: arg1 is the end and the range starts at 0.
+            self.start = constexpr(0)
+            self.end = arg1
+        else:
+            self.start = arg1
+            self.end = arg2
+        # The loop attributes are stored as given, without validation here.
+        self.num_stages = num_stages
+        self.loop_unroll_factor = loop_unroll_factor
+        self.disallow_acc_multi_buffer = disallow_acc_multi_buffer
+        self.flatten = flatten
+        self.warp_specialize = warp_specialize
+        self.disable_licm = disable_licm
+
+    # Iterating the object from plain Python is rejected; it only carries
+    # bounds and attributes for use inside @triton.jit'd functions.
+    def __iter__(self):
+        raise RuntimeError("tl.range can only be used in @triton.jit'd functions")
+
+    def __next__(self):
+        raise RuntimeError("tl.range can only be used in @triton.jit'd functions")
+
+
+class condition(base_value):
+    """
+    While loop condition wrapper.
+
+    .. highlight:: python
+    .. code-block:: python
+
+        @triton.jit
+        def kernel(...):
+            while tl.condition(c, disable_licm):
+                ...
+    :note: This is a special wrapper used to annotate while loops in the context of
+        :code:`triton.jit` functions. It allows user to pass extra attributes to the compiler.
+    :param arg1: the loop condition being wrapped.
+    :param disable_licm: Tells the compiler it shouldn't hoist loop invariant
+        code outside the loop. This is often useful to avoid creating long liveranges
+        within a loop.
+    """
+
+    def __init__(self, arg1, disable_licm=False):
+        # Both values are stored as given, without validation here.
+        self.condition = arg1
+        self.disable_licm = disable_licm
+
+
+# -----------------------
+# Extern functions
+# -----------------------
+
+
+def dispatch(func, lib_name: str, lib_path: str, args: list, arg_type_symbol_dict: dict, ret_type: dtype, is_pure: bool,
+             _semantic):
+    '''
+        Dispatch a function to a library
+        :param func: the function to dispatch
+        :param lib_name: the name of the library
+        :param lib_path: the path of the library
+        :param args: the arguments of the function
+        :param arg_type_symbol_dict: maps a tuple of argument types to an entry whose
+            first element is the library symbol to call
+        :param ret_type: the type of the return value
+        :param is_pure: forwarded unchanged to :code:`func`
+        :param _semantic: object whose :code:`builder` is used to lower :code:`ret_type`
+        :return: the return value of the function
+        :raises ValueError: if :code:`arg_type_symbol_dict` is empty, if the number of
+            :code:`args` differs from the arity of its keys, or if the argument types
+            match none of its keys
+    '''
+    if len(arg_type_symbol_dict) == 0:
+        raise ValueError("arg_type_symbol_dict is empty")
+
+    # Every key is a tuple of argument types; the first key's length is the expected arity.
+    num_args = len(list(arg_type_symbol_dict.keys())[0])
+    if len(args) != num_args:
+        # Expected = arity of the table's keys, got = number of arguments actually passed.
+        raise ValueError(f"length of input args does not match. "
+                         f"Expect {num_args}, got {len(args)}")
+
+    # Tensors contribute their dtype and IR handle; any other value contributes
+    # its Python type and is passed through as-is.
+    arg_types = []
+    arg_list = []
+    for arg in args:
+        if isinstance(arg, tensor):
+            arg_types.append(arg.dtype)
+            arg_list.append(arg.handle)
+        else:
+            arg_types.append(type(arg))
+            arg_list.append(arg)
+    arg_types = tuple(arg_types)
+
+    if arg_types not in arg_type_symbol_dict:
+        raise ValueError(f"input arg type does not match. "
+                         f"Expect one of {arg_type_symbol_dict.keys()}, got {arg_types}")
+
+    symbol = arg_type_symbol_dict[arg_types][0]
+    builder = _semantic.builder
+    return tensor(func(lib_name, lib_path, symbol, arg_list, ret_type.to_ir(builder), is_pure), ret_type)
+
+
+@builtin
+def extern_elementwise(lib_name: str, lib_path: str, args: list, arg_type_symbol_dict: dict, is_pure: bool,
+                       _semantic=None):
+    '''
+        Dispatch an elementwise function to a library
+        :param lib_name: the name of the library
+        :param lib_path: the path of the library
+        :param args: the arguments of the function; each one is converted with ``_semantic.to_tensor``
+        :param arg_type_symbol_dict: maps a tuple of argument dtypes to ``(symbol, return dtype)``
+        :param is_pure: whether the function is pure
+        :param _semantic: supplies ``to_tensor``, ``binary_op_type_checking_impl`` and ``builder``
+        :return: the return value of the function
+    '''
+
+    # Convert a copy of the argument list so the caller's list is left untouched.
+    operands = [_semantic.to_tensor(arg) for arg in args.copy()]
+    arg_types = tuple(operand.dtype for operand in operands)
+    all_scalar = not any(operand.type.is_block() for operand in operands)
+
+    # NOTE(review): this lookup raises KeyError for a dtype tuple missing from the table, i.e. before
+    # the membership test below could switch arithmetic checking on -- confirm that this is intended.
+    ret_type = arg_type_symbol_dict[arg_types][1]
+    if operands:
+        # Arithmetic checking is requested only when the table has no entry for these dtypes.
+        arithmetic_check = arg_types not in arg_type_symbol_dict
+        # Fold all operands into one value that carries the broadcast shape.
+        broadcast_arg = operands[0]
+        for operand in operands:
+            _, broadcast_arg = _semantic.binary_op_type_checking_impl(operand, broadcast_arg,
+                                                                      arithmetic_check=arithmetic_check)
+        # Re-check every operand against that value so each one takes the broadcast shape.
+        for idx, operand in enumerate(operands):
+            operands[idx], _ = _semantic.binary_op_type_checking_impl(operand, broadcast_arg,
+                                                                      arithmetic_check=arithmetic_check)
+        # With a block operand the result type is broadcast_arg's type with ``ret_type`` as element type.
+        if not all_scalar:
+            ret_type = broadcast_arg.type.with_element_ty(ret_type)
+
+    create_call = _semantic.builder.create_extern_elementwise
+    return dispatch(create_call, lib_name, lib_path, operands, arg_type_symbol_dict, ret_type, is_pure, _semantic)
+
+
+
+def binary_op_type_legalization(lhs, rhs, semantic):
+    '''
+        Convert both operands to a single common type
+        :param lhs: the left operand
+        :param rhs: the right operand
+        :param semantic: object whose ``binary_op_type_checking_impl`` performs the conversion
+    '''
+    return semantic.binary_op_type_checking_impl(lhs, rhs)  # result is passed back unchanged
+
+
+def extern(fn):
+    """A decorator for external functions; it returns ``builtin(fn)``."""
+    return builtin(fn)
+
+
+_NOTHING = object()
+
+
+def is_negative_zero(x):
+    return x == 0.0 and math.copysign(1.0, x) == -1.0  # copysign yields exactly +1.0 or -1.0
+
+
+@builtin
+def builtin_max(*args, propagate_nan=_NOTHING, _semantic=None):
+    """Maximum of ``args``; returned as a ``constexpr`` when no argument is a ``base_value``."""
+    args = _unwrap_if_constexpr(args)
+    if all(not isinstance(x, base_value) for x in args):
+        # Plain Python values only: NaN and -0.0 are rejected by the asserts below.
+        assert propagate_nan is _NOTHING, "propagate_nan is not supported on builtin max"
+        assert not any(math.isnan(x) for x in args)
+        assert not any(is_negative_zero(x) for x in args)
+        return constexpr(builtins.max(_unwrap_if_constexpr(args)))
+
+    if propagate_nan is _NOTHING:
+        propagate_nan = PropagateNan.NONE
+    else:
+        warn("passing propagate_nan to builtin max is deprecated, use tl.maximum instead", DeprecationWarning)
+    assert len(args) >= 2, "max requires at least 2 values"
+    max_val = args[0]
+    for arg in args[1:]:
+        max_val = maximum(max_val, arg, propagate_nan=propagate_nan, _semantic=_semantic)
+    if max_val.type.is_block():
+        warn("builtin max on non-scalar tensor values is deprecated, use tl.maximum instead", DeprecationWarning)
+    return max_val
+
+
+@builtin
+def builtin_min(*args, propagate_nan=_NOTHING, _semantic=None):
+    """Minimum of ``args``; returned as a ``constexpr`` when no argument is a ``base_value``."""
+    args = _unwrap_if_constexpr(args)
+    if not any(isinstance(x, base_value) for x in args):
+        # Plain Python values only: NaN and -0.0 are rejected by the asserts below.
+        assert propagate_nan is _NOTHING, "propagate_nan is not supported on builtin min"
+        assert not any(math.isnan(x) for x in args)
+        assert not any(is_negative_zero(x) for x in args)
+        return constexpr(builtins.min(_unwrap_if_constexpr(args)))
+
+    if propagate_nan is not _NOTHING:
+        warn("passing propagate_nan to builtin min is deprecated, use tl.minimum instead", DeprecationWarning)
+    else:
+        propagate_nan = PropagateNan.NONE
+    assert len(args) >= 2, "min requires at least 2 values"
+    min_val = args[0]
+    for candidate in args[1:]:
+        min_val = minimum(min_val, candidate, propagate_nan=propagate_nan, _semantic=_semantic)
+    if min_val.type.is_block():
+        warn("builtin min on non-scalar tensor values is deprecated, use tl.minimum instead", DeprecationWarning)
+    return min_val
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f8c70a716a3da3473a4906b44aec7d35fcc35a5
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/__init__.py
@@ -0,0 +1,26 @@
+import pkgutil
+from importlib.util import module_from_spec
+from sys import modules
+
+_backends = []  # names (including the package prefix) of every backend package loaded below
+for module_finder, module_name, is_pkg in pkgutil.iter_modules(
+        __path__,
+        prefix=__name__ + ".",
+):
+    # skip .py files (like libdevice.py); only sub-packages are loaded as backends
+    if not is_pkg:
+        continue
+
+    # import backends (like cuda and hip) that are included during setup.py
+    spec = module_finder.find_spec(module_name)
+    if spec is None or spec.loader is None:
+        continue  # no spec or no loader: nothing to execute for this entry
+    module = module_from_spec(spec)
+    spec.loader.exec_module(module)
+    # the backend is recorded and published in sys.modules only after its code has run
+    _backends.append(module_name)
+    modules[module_name] = module
+
+__all__ = _backends  # entries are prefixed names because iter_modules() is given ``prefix``
+
+del _backends
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6f1848d5c9d912e039ea6034507cddacda9d99c0
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/__pycache__/libdevice.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/__pycache__/libdevice.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3af4b45a8a54ff915df8f384e7b64c1b7230a7bb
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/__pycache__/libdevice.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbececf1defce4a9493a9e75cc7cb39571465175
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/__init__.py
@@ -0,0 +1,16 @@
+from . import libdevice
+
+from .utils import (globaltimer, num_threads, num_warps, smid, convert_custom_float8_sm70, convert_custom_float8_sm80)
+from .gdc import (gdc_launch_dependents, gdc_wait)
+
+__all__ = [
+    "libdevice",
+    "globaltimer",
+    "num_threads",
+    "num_warps",
+    "smid",
+    "convert_custom_float8_sm70",
+    "convert_custom_float8_sm80",
+    "gdc_launch_dependents",
+    "gdc_wait",
+]
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..959a69169af49b5cd2efdbcd2fcd75f32b8eea2d
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/__pycache__/gdc.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/__pycache__/gdc.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e8a7289a62ae3745196efa4951c87a3665c7938
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/__pycache__/gdc.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/__pycache__/libdevice.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/__pycache__/libdevice.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9eb36b085610ffae82899599fa55b13fc290f3f6
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/__pycache__/libdevice.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/__pycache__/utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0801ead3ca0bc5e2f87f20bf93125339752583de
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/__pycache__/utils.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/gdc.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/gdc.py
new file mode 100644
index 0000000000000000000000000000000000000000..4376719e3dbe63ac2dfe65bfc6bf936116056676
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/gdc.py
@@ -0,0 +1,42 @@
+"""
+Grid Dependency Control (GDC) is a mechanism used when enabling programmatic dependent launch to launch and
+synchronize grids. These APIs expose GDC to the programmer.
+
+Programmatic dependent launch is supported on SM90 (Hopper) and beyond.
+For PTX reference on grid dependency control see https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol.
+"""
+
+from triton.language import core
+
+
+@core.extern
+def gdc_wait(_semantic=None):
+    """
+    Block until all instructions of a prior kernel have completed (emits ``griddepcontrol.wait``).
+
+    Memory operations that happened before the wait are visible to the instructions after it, e.g. values
+    the prior kernel wrote to address "x" can be observed by this kernel once the wait has finished.
+    This instruction is also safe to execute when programmatic dependent launch is disabled.
+
+    See https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol for more details.
+    """
+    wait_asm = "griddepcontrol.wait; // dummy $0"
+    core.inline_asm_elementwise(wait_asm, "=r", [], dtype=core.int32, is_pure=False, pack=1, _semantic=_semantic)
+
+
+@core.extern
+def gdc_launch_dependents(_semantic=None):
+    """
+    Signal, when launched with programmatic dependent launch, that the next program may launch once
+    all programs in the current kernel have called this function or completed.
+
+    Only the first call has an effect; the programmer should treat it as a hint to the runtime system
+    to launch the next kernel.
+
+    This instruction is also safe to execute when programmatic dependent launch is disabled.
+
+    See https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol for more details.
+    """
+    launch_asm = "griddepcontrol.launch_dependents; // dummy $0"
+    # The int32 result only backs the "dummy $0" placeholder in the asm text and is discarded.
+    core.inline_asm_elementwise(launch_asm, "=r", [], dtype=core.int32, is_pure=False, pack=1, _semantic=_semantic)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/libdevice.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/libdevice.py
new file mode 100644
index 0000000000000000000000000000000000000000..08661f5414a68f43b1fe35a2de945ed30322d73f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/libdevice.py
@@ -0,0 +1,1629 @@
+from triton.language import core
+
+
+@core.extern
+def clz(arg0, _semantic=None):
+    """Call libdevice ``__nv_clz`` (int32) or ``__nv_clzll`` (int64); the result dtype is int32 in both cases."""
+    int32 = core.dtype("int32")
+    int64 = core.dtype("int64")
+    overloads = {(int32, ): ("__nv_clz", int32), (int64, ): ("__nv_clzll", int32)}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def popc(arg0, _semantic=None):
+    """Call libdevice ``__nv_popc`` (int32) or ``__nv_popcll`` (int64); the result dtype is int32 in both cases."""
+    int32 = core.dtype("int32")
+    int64 = core.dtype("int64")
+    overloads = {(int32, ): ("__nv_popc", int32), (int64, ): ("__nv_popcll", int32)}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def byte_perm(arg0, arg1, arg2, _semantic=None):
+    """Call libdevice ``__nv_byte_perm`` on three int32 arguments; the result dtype is int32."""
+    overloads = {(core.dtype("int32"), ) * 3: ("__nv_byte_perm", core.dtype("int32"))}
+    return core.extern_elementwise("", "", [arg0, arg1, arg2], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def mulhi(arg0, arg1, _semantic=None):
+    """Call libdevice ``mulhi`` for a same-typed int32/uint32/int64/uint64 pair; result dtype matches the inputs."""
+    i32, u32, i64, u64 = (core.dtype(name) for name in ("int32", "uint32", "int64", "uint64"))
+    overloads = {
+        (i32, i32): ("__nv_mulhi", i32), (u32, u32): ("__nv_umulhi", u32),
+        (i64, i64): ("__nv_mul64hi", i64), (u64, u64): ("__nv_umul64hi", u64),
+    }
+    return core.extern_elementwise("", "", [arg0, arg1], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def mul24(arg0, arg1, _semantic=None):
+    """Call libdevice ``__nv_mul24`` (int32 pair) or ``__nv_umul24`` (uint32 pair); result dtype matches the inputs."""
+    int32 = core.dtype("int32")
+    uint32 = core.dtype("uint32")
+    overloads = {(int32, int32): ("__nv_mul24", int32), (uint32, uint32): ("__nv_umul24", uint32)}
+    return core.extern_elementwise("", "", [arg0, arg1], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def brev(arg0, _semantic=None):
+    """Call libdevice ``__nv_brev`` (int32) or ``__nv_brevll`` (int64); result dtype matches the input."""
+    int32 = core.dtype("int32")
+    int64 = core.dtype("int64")
+    overloads = {(int32, ): ("__nv_brev", int32), (int64, ): ("__nv_brevll", int64)}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def sad(arg0, arg1, arg2, _semantic=None):
+    """Call libdevice ``__nv_sad`` (int32, int32, uint32 -> int32) or ``__nv_usad`` (three uint32 -> uint32)."""
+    int32 = core.dtype("int32")
+    uint32 = core.dtype("uint32")
+    overloads = {(int32, int32, uint32): ("__nv_sad", int32), (uint32, uint32, uint32): ("__nv_usad", uint32)}
+    return core.extern_elementwise("", "", [arg0, arg1, arg2], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def abs(arg0, _semantic=None):
+    """Call libdevice abs for int32/int64/fp32/fp64 (``__nv_abs``, ``__nv_llabs``, ``__nv_fabsf``, ``__nv_fabs``)."""
+    i32, i64, f32, f64 = (core.dtype(name) for name in ("int32", "int64", "fp32", "fp64"))
+    overloads = {
+        (i32, ): ("__nv_abs", i32), (i64, ): ("__nv_llabs", i64),
+        (f32, ): ("__nv_fabsf", f32), (f64, ): ("__nv_fabs", f64),
+    }
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def floor(arg0, _semantic=None):
+    """Call libdevice ``__nv_floorf`` (fp32) or ``__nv_floor`` (fp64); result dtype matches the input."""
+    fp32 = core.dtype("fp32")
+    fp64 = core.dtype("fp64")
+    overloads = {(fp32, ): ("__nv_floorf", fp32), (fp64, ): ("__nv_floor", fp64)}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def rcp64h(arg0, _semantic=None):
+    """Call libdevice ``__nv_rcp64h`` on an fp64 ``arg0``; the result dtype is fp64."""
+    overloads = {(core.dtype("fp64"), ): ("__nv_rcp64h", core.dtype("fp64"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def rsqrt(arg0, _semantic=None):
+    """Call libdevice ``__nv_rsqrtf`` (fp32) or ``__nv_rsqrt`` (fp64); result dtype matches the input."""
+    fp32 = core.dtype("fp32")
+    fp64 = core.dtype("fp64")
+    overloads = {(fp32, ): ("__nv_rsqrtf", fp32), (fp64, ): ("__nv_rsqrt", fp64)}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ceil(arg0, _semantic=None):
+    """Call libdevice ``__nv_ceil`` (fp64) or ``__nv_ceilf`` (fp32); result dtype matches the input."""
+    fp64 = core.dtype("fp64")
+    fp32 = core.dtype("fp32")
+    overloads = {(fp64, ): ("__nv_ceil", fp64), (fp32, ): ("__nv_ceilf", fp32)}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def trunc(arg0, _semantic=None):
+    """Call libdevice ``__nv_trunc`` (fp64) or ``__nv_truncf`` (fp32); result dtype matches the input."""
+    fp64 = core.dtype("fp64")
+    fp32 = core.dtype("fp32")
+    overloads = {(fp64, ): ("__nv_trunc", fp64), (fp32, ): ("__nv_truncf", fp32)}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def exp2(arg0, _semantic=None):
+    """Call libdevice ``__nv_exp2f`` (fp32) or ``__nv_exp2`` (fp64); result dtype matches the input."""
+    fp32 = core.dtype("fp32")
+    fp64 = core.dtype("fp64")
+    overloads = {(fp32, ): ("__nv_exp2f", fp32), (fp64, ): ("__nv_exp2", fp64)}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def saturatef(arg0, _semantic=None):
+    """Call libdevice ``__nv_saturatef`` on an fp32 ``arg0``; the result dtype is fp32."""
+    overloads = {(core.dtype("fp32"), ): ("__nv_saturatef", core.dtype("fp32"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def fma_rn(arg0, arg1, arg2, _semantic=None):
+    """Call libdevice ``__nv_fmaf_rn`` (three fp32) or ``__nv_fma_rn`` (three fp64); result dtype matches the inputs."""
+    fp32 = core.dtype("fp32")
+    fp64 = core.dtype("fp64")
+    overloads = {(fp32, fp32, fp32): ("__nv_fmaf_rn", fp32), (fp64, fp64, fp64): ("__nv_fma_rn", fp64)}
+    return core.extern_elementwise("", "", [arg0, arg1, arg2], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def fma_rz(arg0, arg1, arg2, _semantic=None):
+    """Call libdevice ``__nv_fmaf_rz`` (three fp32) or ``__nv_fma_rz`` (three fp64); result dtype matches the inputs."""
+    fp32 = core.dtype("fp32")
+    fp64 = core.dtype("fp64")
+    overloads = {(fp32, fp32, fp32): ("__nv_fmaf_rz", fp32), (fp64, fp64, fp64): ("__nv_fma_rz", fp64)}
+    return core.extern_elementwise("", "", [arg0, arg1, arg2], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def fma_rd(arg0, arg1, arg2, _semantic=None):
+    """Call libdevice ``__nv_fmaf_rd`` (three fp32) or ``__nv_fma_rd`` (three fp64); result dtype matches the inputs."""
+    fp32 = core.dtype("fp32")
+    fp64 = core.dtype("fp64")
+    overloads = {(fp32, fp32, fp32): ("__nv_fmaf_rd", fp32), (fp64, fp64, fp64): ("__nv_fma_rd", fp64)}
+    return core.extern_elementwise("", "", [arg0, arg1, arg2], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def fma_ru(arg0, arg1, arg2, _semantic=None):
+    """Call libdevice ``__nv_fmaf_ru`` (three fp32) or ``__nv_fma_ru`` (three fp64); result dtype matches the inputs."""
+    fp32 = core.dtype("fp32")
+    fp64 = core.dtype("fp64")
+    overloads = {(fp32, fp32, fp32): ("__nv_fmaf_ru", fp32), (fp64, fp64, fp64): ("__nv_fma_ru", fp64)}
+    return core.extern_elementwise("", "", [arg0, arg1, arg2], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def fast_dividef(arg0, arg1, _semantic=None):
+    """Call libdevice ``__nv_fast_fdividef`` on an fp32 pair; the result dtype is fp32."""
+    overloads = {(core.dtype("fp32"), ) * 2: ("__nv_fast_fdividef", core.dtype("fp32"))}
+    return core.extern_elementwise("", "", [arg0, arg1], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def div_rn(arg0, arg1, _semantic=None):
+    """Call libdevice ``__nv_fdiv_rn`` (fp32 pair) or ``__nv_ddiv_rn`` (fp64 pair); result dtype matches the inputs."""
+    fp32 = core.dtype("fp32")
+    fp64 = core.dtype("fp64")
+    overloads = {(fp32, fp32): ("__nv_fdiv_rn", fp32), (fp64, fp64): ("__nv_ddiv_rn", fp64)}
+    return core.extern_elementwise("", "", [arg0, arg1], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def div_rz(arg0, arg1, _semantic=None):
+    """Call libdevice ``__nv_fdiv_rz`` (fp32 pair) or ``__nv_ddiv_rz`` (fp64 pair); result dtype matches the inputs."""
+    fp32 = core.dtype("fp32")
+    fp64 = core.dtype("fp64")
+    overloads = {(fp32, fp32): ("__nv_fdiv_rz", fp32), (fp64, fp64): ("__nv_ddiv_rz", fp64)}
+    return core.extern_elementwise("", "", [arg0, arg1], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def div_rd(arg0, arg1, _semantic=None):
+    """Call libdevice ``__nv_fdiv_rd`` (fp32 pair) or ``__nv_ddiv_rd`` (fp64 pair); result dtype matches the inputs."""
+    fp32 = core.dtype("fp32")
+    fp64 = core.dtype("fp64")
+    overloads = {(fp32, fp32): ("__nv_fdiv_rd", fp32), (fp64, fp64): ("__nv_ddiv_rd", fp64)}
+    return core.extern_elementwise("", "", [arg0, arg1], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def div_ru(arg0, arg1, _semantic=None):
+    """Call libdevice ``__nv_fdiv_ru`` (fp32 pair) or ``__nv_ddiv_ru`` (fp64 pair); result dtype matches the inputs."""
+    fp32 = core.dtype("fp32")
+    fp64 = core.dtype("fp64")
+    overloads = {(fp32, fp32): ("__nv_fdiv_ru", fp32), (fp64, fp64): ("__nv_ddiv_ru", fp64)}
+    return core.extern_elementwise("", "", [arg0, arg1], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def rcp_rn(arg0, _semantic=None):
+    """Call libdevice ``__nv_frcp_rn`` (fp32) or ``__nv_drcp_rn`` (fp64); result dtype matches the input."""
+    fp32 = core.dtype("fp32")
+    fp64 = core.dtype("fp64")
+    overloads = {(fp32, ): ("__nv_frcp_rn", fp32), (fp64, ): ("__nv_drcp_rn", fp64)}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def rcp_rz(arg0, _semantic=None):
+    """Call libdevice ``__nv_frcp_rz`` (fp32) or ``__nv_drcp_rz`` (fp64); result dtype matches the input."""
+    fp32 = core.dtype("fp32")
+    fp64 = core.dtype("fp64")
+    overloads = {(fp32, ): ("__nv_frcp_rz", fp32), (fp64, ): ("__nv_drcp_rz", fp64)}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def rcp_rd(arg0, _semantic=None):
+    """Call libdevice ``__nv_frcp_rd`` (fp32) or ``__nv_drcp_rd`` (fp64); result dtype matches the input."""
+    fp32 = core.dtype("fp32")
+    fp64 = core.dtype("fp64")
+    overloads = {(fp32, ): ("__nv_frcp_rd", fp32), (fp64, ): ("__nv_drcp_rd", fp64)}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def rcp_ru(arg0, _semantic=None):
+    """Call libdevice ``__nv_frcp_ru`` (fp32) or ``__nv_drcp_ru`` (fp64); result dtype matches the input."""
+    fp32 = core.dtype("fp32")
+    fp64 = core.dtype("fp64")
+    overloads = {(fp32, ): ("__nv_frcp_ru", fp32), (fp64, ): ("__nv_drcp_ru", fp64)}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def sqrt_rn(arg0, _semantic=None):
+    """Call libdevice ``__nv_fsqrt_rn`` (fp32) or ``__nv_dsqrt_rn`` (fp64); result dtype matches the input."""
+    fp32 = core.dtype("fp32")
+    fp64 = core.dtype("fp64")
+    overloads = {(fp32, ): ("__nv_fsqrt_rn", fp32), (fp64, ): ("__nv_dsqrt_rn", fp64)}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def sqrt_rz(arg0, _semantic=None):
+    """Call libdevice ``__nv_fsqrt_rz`` (fp32) or ``__nv_dsqrt_rz`` (fp64); result dtype matches the input."""
+    fp32 = core.dtype("fp32")
+    fp64 = core.dtype("fp64")
+    overloads = {(fp32, ): ("__nv_fsqrt_rz", fp32), (fp64, ): ("__nv_dsqrt_rz", fp64)}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def sqrt_rd(arg0, _semantic=None):
+    """Call libdevice ``__nv_fsqrt_rd`` (fp32) or ``__nv_dsqrt_rd`` (fp64); result dtype matches the input."""
+    fp32 = core.dtype("fp32")
+    fp64 = core.dtype("fp64")
+    overloads = {(fp32, ): ("__nv_fsqrt_rd", fp32), (fp64, ): ("__nv_dsqrt_rd", fp64)}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def sqrt_ru(arg0, _semantic=None):
+    """Call libdevice ``__nv_fsqrt_ru`` (fp32) or ``__nv_dsqrt_ru`` (fp64); result dtype matches the input."""
+    fp32 = core.dtype("fp32")
+    fp64 = core.dtype("fp64")
+    overloads = {(fp32, ): ("__nv_fsqrt_ru", fp32), (fp64, ): ("__nv_dsqrt_ru", fp64)}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def sqrt(arg0, _semantic=None):
+    """Call libdevice ``__nv_sqrtf`` (fp32) or ``__nv_sqrt`` (fp64); result dtype matches the input."""
+    fp32 = core.dtype("fp32")
+    fp64 = core.dtype("fp64")
+    overloads = {(fp32, ): ("__nv_sqrtf", fp32), (fp64, ): ("__nv_sqrt", fp64)}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def add_rn(arg0, arg1, _semantic=None):
+    """Call libdevice ``__nv_dadd_rn`` (fp64 pair) or ``__nv_fadd_rn`` (fp32 pair); result dtype matches the inputs."""
+    fp64 = core.dtype("fp64")
+    fp32 = core.dtype("fp32")
+    overloads = {(fp64, fp64): ("__nv_dadd_rn", fp64), (fp32, fp32): ("__nv_fadd_rn", fp32)}
+    return core.extern_elementwise("", "", [arg0, arg1], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def add_rz(arg0, arg1, _semantic=None):
+    """Call libdevice ``__nv_dadd_rz`` (fp64 pair) or ``__nv_fadd_rz`` (fp32 pair); result dtype matches the inputs."""
+    fp64 = core.dtype("fp64")
+    fp32 = core.dtype("fp32")
+    overloads = {(fp64, fp64): ("__nv_dadd_rz", fp64), (fp32, fp32): ("__nv_fadd_rz", fp32)}
+    return core.extern_elementwise("", "", [arg0, arg1], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def add_rd(arg0, arg1, _semantic=None):
+    """Call libdevice ``__nv_dadd_rd`` (fp64 pair) or ``__nv_fadd_rd`` (fp32 pair); result dtype matches the inputs."""
+    fp64 = core.dtype("fp64")
+    fp32 = core.dtype("fp32")
+    overloads = {(fp64, fp64): ("__nv_dadd_rd", fp64), (fp32, fp32): ("__nv_fadd_rd", fp32)}
+    return core.extern_elementwise("", "", [arg0, arg1], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def add_ru(arg0, arg1, _semantic=None):
+    """Call libdevice ``__nv_dadd_ru`` (fp64 pair) or ``__nv_fadd_ru`` (fp32 pair); result dtype matches the inputs."""
+    fp64 = core.dtype("fp64")
+    fp32 = core.dtype("fp32")
+    overloads = {(fp64, fp64): ("__nv_dadd_ru", fp64), (fp32, fp32): ("__nv_fadd_ru", fp32)}
+    return core.extern_elementwise("", "", [arg0, arg1], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def mul_rn(arg0, arg1, _semantic=None):
+    """Call libdevice ``__nv_dmul_rn`` (fp64 pair) or ``__nv_fmul_rn`` (fp32 pair); result dtype matches the inputs."""
+    fp64 = core.dtype("fp64")
+    fp32 = core.dtype("fp32")
+    overloads = {(fp64, fp64): ("__nv_dmul_rn", fp64), (fp32, fp32): ("__nv_fmul_rn", fp32)}
+    return core.extern_elementwise("", "", [arg0, arg1], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def mul_rz(arg0, arg1, _semantic=None):
+    """Call libdevice ``__nv_dmul_rz`` (fp64 pair) or ``__nv_fmul_rz`` (fp32 pair); result dtype matches the inputs."""
+    fp64 = core.dtype("fp64")
+    fp32 = core.dtype("fp32")
+    overloads = {(fp64, fp64): ("__nv_dmul_rz", fp64), (fp32, fp32): ("__nv_fmul_rz", fp32)}
+    return core.extern_elementwise("", "", [arg0, arg1], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def mul_rd(arg0, arg1, _semantic=None):
+    """Call libdevice ``__nv_dmul_rd`` (fp64 pair) or ``__nv_fmul_rd`` (fp32 pair); result dtype matches the inputs."""
+    fp64 = core.dtype("fp64")
+    fp32 = core.dtype("fp32")
+    overloads = {(fp64, fp64): ("__nv_dmul_rd", fp64), (fp32, fp32): ("__nv_fmul_rd", fp32)}
+    return core.extern_elementwise("", "", [arg0, arg1], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def mul_ru(arg0, arg1, _semantic=None):
+    """
+    Call the libdevice ``mul_ru`` symbol that matches the dtypes of ``arg0`` and ``arg1``.
+
+    Supported dtype pairs:
+      * (fp64, fp64) -> ``__nv_dmul_ru``, result dtype fp64
+      * (fp32, fp32) -> ``__nv_fmul_ru``, result dtype fp32
+    """
+    fp64 = core.dtype("fp64")
+    fp32 = core.dtype("fp32")
+    overloads = {
+        (fp64, fp64): ("__nv_dmul_ru", fp64),
+        (fp32, fp32): ("__nv_fmul_ru", fp32),
+    }
+    return core.extern_elementwise("", "", [arg0, arg1], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double2float_rn(arg0, _semantic=None):
+    """Call libdevice ``__nv_double2float_rn`` on an fp64 ``arg0``; the result dtype is fp32."""
+    overloads = {(core.dtype("fp64"), ): ("__nv_double2float_rn", core.dtype("fp32"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double2float_rz(arg0, _semantic=None):
+    """Call libdevice ``__nv_double2float_rz`` on an fp64 ``arg0``; the result dtype is fp32."""
+    overloads = {(core.dtype("fp64"), ): ("__nv_double2float_rz", core.dtype("fp32"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double2float_rd(arg0, _semantic=None):
+    """Call libdevice ``__nv_double2float_rd`` on an fp64 ``arg0``; the result dtype is fp32."""
+    overloads = {(core.dtype("fp64"), ): ("__nv_double2float_rd", core.dtype("fp32"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double2float_ru(arg0, _semantic=None):
+    """Call libdevice ``__nv_double2float_ru`` on an fp64 ``arg0``; the result dtype is fp32."""
+    overloads = {(core.dtype("fp64"), ): ("__nv_double2float_ru", core.dtype("fp32"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double2int_rn(arg0, _semantic=None):
+    """Call libdevice ``__nv_double2int_rn`` on an fp64 ``arg0``; the result dtype is int32."""
+    overloads = {(core.dtype("fp64"), ): ("__nv_double2int_rn", core.dtype("int32"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double2int_rz(arg0, _semantic=None):
+    """Call libdevice ``__nv_double2int_rz`` on an fp64 ``arg0``; the result dtype is int32."""
+    overloads = {(core.dtype("fp64"), ): ("__nv_double2int_rz", core.dtype("int32"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double2int_rd(arg0, _semantic=None):
+    """Call libdevice ``__nv_double2int_rd`` on an fp64 ``arg0``; the result dtype is int32."""
+    overloads = {(core.dtype("fp64"), ): ("__nv_double2int_rd", core.dtype("int32"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double2int_ru(arg0, _semantic=None):
+    """Call libdevice ``__nv_double2int_ru`` on an fp64 ``arg0``; the result dtype is int32."""
+    overloads = {(core.dtype("fp64"), ): ("__nv_double2int_ru", core.dtype("int32"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double2uint_rn(arg0, _semantic=None):
+    """Call libdevice ``__nv_double2uint_rn`` on fp64 ``arg0``; result declared int32 (not uint32 -- TODO confirm)."""
+    overloads = {(core.dtype("fp64"), ): ("__nv_double2uint_rn", core.dtype("int32"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double2uint_rz(arg0, _semantic=None):
+    """Call libdevice ``__nv_double2uint_rz`` on fp64 ``arg0``; result declared int32 (not uint32 -- TODO confirm)."""
+    overloads = {(core.dtype("fp64"), ): ("__nv_double2uint_rz", core.dtype("int32"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double2uint_rd(arg0, _semantic=None):
+    """Call libdevice ``__nv_double2uint_rd`` on fp64 ``arg0``; result declared int32 (not uint32 -- TODO confirm)."""
+    overloads = {(core.dtype("fp64"), ): ("__nv_double2uint_rd", core.dtype("int32"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double2uint_ru(arg0, _semantic=None):
+    """Call libdevice ``__nv_double2uint_ru`` on fp64 ``arg0``; result declared int32 (not uint32 -- TODO confirm)."""
+    overloads = {(core.dtype("fp64"), ): ("__nv_double2uint_ru", core.dtype("int32"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def int2double_rn(arg0, _semantic=None):
+    """Call libdevice ``__nv_int2double_rn`` on an int32 ``arg0``; the result dtype is fp64."""
+    overloads = {(core.dtype("int32"), ): ("__nv_int2double_rn", core.dtype("fp64"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def uint2double_rn(arg0, _semantic=None):
+    """Call libdevice ``__nv_uint2double_rn`` on a uint32 ``arg0``; the result dtype is fp64."""
+    overloads = {(core.dtype("uint32"), ): ("__nv_uint2double_rn", core.dtype("fp64"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def float2int_rn(arg0, _semantic=None):
+    """Call libdevice ``__nv_float2int_rn`` on an fp32 ``arg0``; the result dtype is int32."""
+    overloads = {(core.dtype("fp32"), ): ("__nv_float2int_rn", core.dtype("int32"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def float2int_rz(arg0, _semantic=None):
+    """Call libdevice ``__nv_float2int_rz`` on an fp32 ``arg0``; the result dtype is int32."""
+    overloads = {(core.dtype("fp32"), ): ("__nv_float2int_rz", core.dtype("int32"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def float2int_rd(arg0, _semantic=None):
+    """Call libdevice ``__nv_float2int_rd`` on an fp32 ``arg0``; the result dtype is int32."""
+    overloads = {(core.dtype("fp32"), ): ("__nv_float2int_rd", core.dtype("int32"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def float2int_ru(arg0, _semantic=None):
+    """Call libdevice ``__nv_float2int_ru`` on an fp32 ``arg0``; the result dtype is int32."""
+    overloads = {(core.dtype("fp32"), ): ("__nv_float2int_ru", core.dtype("int32"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def float2uint_rn(arg0, _semantic=None):
+    """Call libdevice ``__nv_float2uint_rn`` on fp32 ``arg0``; result declared int32 (not uint32 -- TODO confirm)."""
+    overloads = {(core.dtype("fp32"), ): ("__nv_float2uint_rn", core.dtype("int32"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def float2uint_rz(arg0, _semantic=None):
+    """Call libdevice ``__nv_float2uint_rz`` on fp32 ``arg0``; result declared int32 (not uint32 -- TODO confirm)."""
+    overloads = {(core.dtype("fp32"), ): ("__nv_float2uint_rz", core.dtype("int32"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def float2uint_rd(arg0, _semantic=None):
+    """Call libdevice ``__nv_float2uint_rd`` on fp32 ``arg0``; result declared int32 (not uint32 -- TODO confirm)."""
+    overloads = {(core.dtype("fp32"), ): ("__nv_float2uint_rd", core.dtype("int32"))}
+    return core.extern_elementwise("", "", [arg0], overloads, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def float2uint_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2uint_ru", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def int2float_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int32"), ): ("__nv_int2float_rn", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def int2float_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int32"), ): ("__nv_int2float_rz", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def int2float_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int32"), ): ("__nv_int2float_rd", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def int2float_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int32"), ): ("__nv_int2float_ru", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def uint2float_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint32"), ): ("__nv_uint2float_rn", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def uint2float_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint32"), ): ("__nv_uint2float_rz", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def uint2float_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint32"), ): ("__nv_uint2float_rd", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def uint2float_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint32"), ): ("__nv_uint2float_ru", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def hiloint2double(arg0, arg1, _semantic=None):
+    return core.extern_elementwise("", "", [arg0, arg1], {
+        (core.dtype("int32"), core.dtype("int32")): ("__nv_hiloint2double", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double2loint(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2loint", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double2hiint(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2hiint", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def float2ll_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2ll_rn", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def float2ll_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2ll_rz", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def float2ll_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2ll_rd", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def float2ll_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2ll_ru", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def float2ull_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2ull_rn", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def float2ull_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2ull_rz", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def float2ull_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2ull_rd", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def float2ull_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2ull_ru", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double2ll_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2ll_rn", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double2ll_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2ll_rz", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double2ll_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2ll_rd", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double2ll_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2ll_ru", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double2ull_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2ull_rn", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double2ull_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2ull_rz", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double2ull_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2ull_rd", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double2ull_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2ull_ru", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ll2float_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int64"), ): ("__nv_ll2float_rn", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ll2float_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int64"), ): ("__nv_ll2float_rz", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ll2float_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int64"), ): ("__nv_ll2float_rd", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ll2float_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int64"), ): ("__nv_ll2float_ru", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ull2float_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint64"), ): ("__nv_ull2float_rn", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ull2float_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint64"), ): ("__nv_ull2float_rz", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ull2float_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint64"), ): ("__nv_ull2float_rd", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ull2float_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint64"), ): ("__nv_ull2float_ru", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ll2double_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int64"), ): ("__nv_ll2double_rn", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ll2double_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int64"), ): ("__nv_ll2double_rz", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ll2double_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int64"), ): ("__nv_ll2double_rd", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ll2double_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int64"), ): ("__nv_ll2double_ru", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ull2double_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint64"), ): ("__nv_ull2double_rn", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ull2double_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint64"), ): ("__nv_ull2double_rz", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ull2double_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint64"), ): ("__nv_ull2double_rd", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ull2double_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint64"), ): ("__nv_ull2double_ru", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def int_as_float(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int32"), ): ("__nv_int_as_float", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def float_as_int(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float_as_int", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def uint_as_float(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint32"), ): ("__nv_uint_as_float", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def float_as_uint(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float_as_uint", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def longlong_as_double(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int64"), ): ("__nv_longlong_as_double", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def double_as_longlong(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double_as_longlong", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def fast_sinf(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_fast_sinf", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def fast_cosf(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_fast_cosf", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def fast_log2f(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_fast_log2f", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def fast_logf(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_fast_logf", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def fast_expf(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_fast_expf", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def fast_tanf(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_fast_tanf", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def fast_exp10f(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_fast_exp10f", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def fast_log10f(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_fast_log10f", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def fast_powf(arg0, arg1, _semantic=None):
+    return core.extern_elementwise("", "", [arg0, arg1], {
+        (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fast_powf", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def hadd(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("int32"), core.dtype("int32")): ("__nv_hadd", core.dtype("int32")),
+            (core.dtype("uint32"), core.dtype("uint32")): ("__nv_uhadd", core.dtype("uint32")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def rhadd(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("int32"), core.dtype("int32")): ("__nv_rhadd", core.dtype("int32")),
+            (core.dtype("uint32"), core.dtype("uint32")): ("__nv_urhadd", core.dtype("uint32")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def sub_rn(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fsub_rn", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_dsub_rn", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def sub_rz(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fsub_rz", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_dsub_rz", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def sub_rd(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fsub_rd", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_dsub_rd", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def sub_ru(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fsub_ru", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_dsub_ru", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def rsqrt_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [
+        arg0,
+    ], {
+        (core.dtype("fp32"), ): ("__nv_frsqrt_rn", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ffs(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [
+            arg0,
+        ], {
+            (core.dtype("int32"), ): ("__nv_ffs", core.dtype("int32")),
+            (core.dtype("int64"), ): ("__nv_ffsll", core.dtype("int32")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def rint(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [
+            arg0,
+        ], {
+            (core.dtype("fp32"), ): ("__nv_rintf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_rint", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def llrint(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [
+            arg0,
+        ], {
+            (core.dtype("fp32"), ): ("__nv_llrintf", core.dtype("int64")),
+            (core.dtype("fp64"), ): ("__nv_llrint", core.dtype("int64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def nearbyint(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [
+            arg0,
+        ], {
+            (core.dtype("fp32"), ): ("__nv_nearbyintf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_nearbyint", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def isnan(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [
+            arg0,
+        ], {
+            (core.dtype("fp32"), ): ("__nv_isnanf", core.dtype("int32")),
+            (core.dtype("fp64"), ): ("__nv_isnand", core.dtype("int32")),
+        }, is_pure=True, _semantic=_semantic).to(core.int1, _semantic=_semantic)
+
+
+@core.extern
+def signbit(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [
+            arg0,
+        ], {
+            (core.dtype("fp32"), ): ("__nv_signbitf", core.dtype("int32")),
+            (core.dtype("fp64"), ): ("__nv_signbitd", core.dtype("int32")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def copysign(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_copysignf", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_copysign", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def finitef(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_finitef", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic).to(core.int1, _semantic=_semantic)
+
+
+@core.extern
+def isinf(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_isinff", core.dtype("int32")),
+            (core.dtype("fp64"), ): ("__nv_isinfd", core.dtype("int32")),
+        }, is_pure=True, _semantic=_semantic).to(core.int1, _semantic=_semantic)
+
+
+@core.extern
+def nextafter(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_nextafterf", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_nextafter", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def sin(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_sinf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_sin", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def cos(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_cosf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_cos", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def sinpi(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_sinpif", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_sinpi", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def cospi(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_cospif", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_cospi", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def tan(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_tanf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_tan", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def log2(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_log2f", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_log2", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def exp(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_expf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_exp", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def exp10(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_exp10f", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_exp10", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def cosh(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_coshf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_cosh", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def sinh(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_sinhf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_sinh", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def tanh(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_tanhf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_tanh", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def atan2(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_atan2f", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_atan2", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def atan(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_atanf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_atan", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def asin(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_asinf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_asin", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def acos(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_acosf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_acos", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def log(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_logf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_log", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def log10(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_log10f", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_log10", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def log1p(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_log1pf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_log1p", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def acosh(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_acoshf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_acosh", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def asinh(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_asinhf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_asinh", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def atanh(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_atanhf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_atanh", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def expm1(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_expm1f", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_expm1", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def hypot(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_hypotf", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_hypot", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def rhypot(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_rhypotf", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_rhypot", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def norm3d(arg0, arg1, arg2, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1, arg2], {
+            (core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32")): ("__nv_norm3df", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64")): ("__nv_norm3d", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def rnorm3d(arg0, arg1, arg2, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1, arg2], {
+            (core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32")): ("__nv_rnorm3df", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64")): ("__nv_rnorm3d", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def norm4d(arg0, arg1, arg2, arg3, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1, arg2, arg3], {
+            (core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32")):
+            ("__nv_norm4df", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64")):
+            ("__nv_norm4d", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def rnorm4d(arg0, arg1, arg2, arg3, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1, arg2, arg3], {
+            (core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32")):
+            ("__nv_rnorm4df", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64")):
+            ("__nv_rnorm4d", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def cbrt(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_cbrtf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_cbrt", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def rcbrt(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_rcbrtf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_rcbrt", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def j0(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_j0f", core.dtype("fp32")),
+        (core.dtype("fp64"), ): ("__nv_j0", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def j1(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_j1f", core.dtype("fp32")),
+        (core.dtype("fp64"), ): ("__nv_j1", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def y0(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_y0f", core.dtype("fp32")),
+        (core.dtype("fp64"), ): ("__nv_y0", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def y1(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_y1f", core.dtype("fp32")),
+        (core.dtype("fp64"), ): ("__nv_y1", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def yn(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("int32"), core.dtype("fp32")): ("__nv_ynf", core.dtype("fp32")),
+            (core.dtype("int32"), core.dtype("fp64")): ("__nv_yn", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def jn(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("int32"), core.dtype("fp32")): ("__nv_jnf", core.dtype("fp32")),
+            (core.dtype("int32"), core.dtype("fp64")): ("__nv_jn", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def cyl_bessel_i0(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_cyl_bessel_i0f", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_cyl_bessel_i0", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def cyl_bessel_i1(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__nv_cyl_bessel_i1f`, fp64 -> `__nv_cyl_bessel_i1` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__nv_cyl_bessel_i1f", f32), (f64, ): ("__nv_cyl_bessel_i1", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def erf(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__nv_erff`, fp64 -> `__nv_erf` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__nv_erff", f32), (f64, ): ("__nv_erf", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def erfinv(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__nv_erfinvf`, fp64 -> `__nv_erfinv` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__nv_erfinvf", f32), (f64, ): ("__nv_erfinv", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def erfc(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__nv_erfcf`, fp64 -> `__nv_erfc` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__nv_erfcf", f32), (f64, ): ("__nv_erfc", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def erfcx(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__nv_erfcxf`, fp64 -> `__nv_erfcx` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__nv_erfcxf", f32), (f64, ): ("__nv_erfcx", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def erfcinv(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__nv_erfcinvf`, fp64 -> `__nv_erfcinv` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__nv_erfcinvf", f32), (f64, ): ("__nv_erfcinv", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def normcdfinv(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__nv_normcdfinvf`, fp64 -> `__nv_normcdfinv` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__nv_normcdfinvf", f32), (f64, ): ("__nv_normcdfinv", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def normcdf(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__nv_normcdff`, fp64 -> `__nv_normcdf` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__nv_normcdff", f32), (f64, ): ("__nv_normcdf", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def lgamma(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__nv_lgammaf`, fp64 -> `__nv_lgamma` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__nv_lgammaf", f32), (f64, ): ("__nv_lgamma", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ldexp(arg0, arg1, _semantic=None):
+    """Elementwise extern: (fp32, int32) -> `__nv_ldexpf`, (fp64, int32) -> `__nv_ldexp`; result has arg0's dtype."""
+    i32 = core.dtype("int32")
+    f32, f64 = core.dtype("fp32"), core.dtype("fp64")
+    table = {(f32, i32): ("__nv_ldexpf", f32), (f64, i32): ("__nv_ldexp", f64)}
+    return core.extern_elementwise("", "", [arg0, arg1], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def scalbn(arg0, arg1, _semantic=None):
+    """Elementwise extern: (fp32, int32) -> `__nv_scalbnf`, (fp64, int32) -> `__nv_scalbn`; result has arg0's dtype."""
+    i32 = core.dtype("int32")
+    f32, f64 = core.dtype("fp32"), core.dtype("fp64")
+    table = {(f32, i32): ("__nv_scalbnf", f32), (f64, i32): ("__nv_scalbn", f64)}
+    return core.extern_elementwise("", "", [arg0, arg1], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def fmod(arg0, arg1, _semantic=None):
+    """Elementwise extern: two fp32 args -> `__nv_fmodf`, two fp64 args -> `__nv_fmod` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, f32): ("__nv_fmodf", f32), (f64, f64): ("__nv_fmod", f64)}
+    return core.extern_elementwise("", "", [arg0, arg1], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def remainder(arg0, arg1, _semantic=None):
+    """Elementwise extern: two fp32 args -> `__nv_remainderf`, two fp64 args -> `__nv_remainder` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, f32): ("__nv_remainderf", f32), (f64, f64): ("__nv_remainder", f64)}
+    return core.extern_elementwise("", "", [arg0, arg1], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def fma(arg0, arg1, arg2, _semantic=None):
+    """Elementwise extern: three fp32 args -> `__nv_fmaf`, three fp64 args -> `__nv_fma` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, f32, f32): ("__nv_fmaf", f32), (f64, f64, f64): ("__nv_fma", f64)}
+    return core.extern_elementwise("", "", [arg0, arg1, arg2], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def pow(arg0, arg1, _semantic=None):
+    """Elementwise extern taking an int32 or floating-point arg1; the result has arg0's dtype (fp32 or fp64)."""
+    i32, f32, f64 = core.dtype("int32"), core.dtype("fp32"), core.dtype("fp64")
+    table = {
+        (f32, i32): ("__nv_powif", f32), (f64, i32): ("__nv_powi", f64),  # int32 second argument
+        (f32, f32): ("__nv_powf", f32), (f64, f64): ("__nv_pow", f64),  # second argument matches arg0
+    }
+    return core.extern_elementwise("", "", [arg0, arg1], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def tgamma(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__nv_tgammaf`, fp64 -> `__nv_tgamma` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__nv_tgammaf", f32), (f64, ): ("__nv_tgamma", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def round(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__nv_roundf`, fp64 -> `__nv_round` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__nv_roundf", f32), (f64, ): ("__nv_round", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def llround(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__nv_llroundf`, fp64 -> `__nv_llround`; both yield int64."""
+    f32, f64 = core.dtype("fp32"), core.dtype("fp64")
+    i64 = core.dtype("int64")
+    table = {(f32, ): ("__nv_llroundf", i64), (f64, ): ("__nv_llround", i64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def fdim(arg0, arg1, _semantic=None):
+    """Elementwise extern: two fp32 args -> `__nv_fdimf`, two fp64 args -> `__nv_fdim` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, f32): ("__nv_fdimf", f32), (f64, f64): ("__nv_fdim", f64)}
+    return core.extern_elementwise("", "", [arg0, arg1], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ilogb(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__nv_ilogbf`, fp64 -> `__nv_ilogb`; both yield int32."""
+    f32, f64 = core.dtype("fp32"), core.dtype("fp64")
+    i32 = core.dtype("int32")
+    table = {(f32, ): ("__nv_ilogbf", i32), (f64, ): ("__nv_ilogb", i32)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def logb(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__nv_logbf`, fp64 -> `__nv_logb` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__nv_logbf", f32), (f64, ): ("__nv_logb", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def isfinited(arg0, _semantic=None):  # fp64 -> int32 via `__nv_isfinited`, then cast to int1
+    table = {(core.dtype("fp64"), ): ("__nv_isfinited", core.dtype("int32"))}
+    as_int32 = core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+    return as_int32.to(core.int1, _semantic=_semantic)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/utils.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb67b573a381156e7713a3359db859409701d7d7
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/cuda/utils.py
@@ -0,0 +1,109 @@
+from triton.language import core
+
+
+@core.extern
+def globaltimer(_semantic=None):  # moves %globaltimer into an int64 output; declared impure
+    asm, constraints = "mov.u64 $0, %globaltimer;", "=l"
+    return core.inline_asm_elementwise(asm, constraints, [], dtype=core.int64, is_pure=False, pack=1, _semantic=_semantic)
+
+
+@core.extern
+def smid(_semantic=None):  # moves %smid into an int32 output; declared pure
+    asm, constraints = "mov.u32 $0, %smid;", "=r"
+    return core.inline_asm_elementwise(asm, constraints, [], dtype=core.int32, is_pure=True, pack=1, _semantic=_semantic)
+
+
+@core.builtin
+def num_threads(_semantic=None):  # constexpr holding the builder option num_warps multiplied by 32
+    return core.constexpr(_semantic.builder.options.num_warps * 32)
+
+
+@core.builtin
+def num_warps(_semantic=None):  # constexpr holding the builder option num_warps
+    return core.constexpr(_semantic.builder.options.num_warps)
+
+
+# ----- FP8E4M3B15 ------
+# This data-type is a variant of the standard FP8E4M3 format.
+# It was designed for fast software conversion to FP16 on
+# nvidia GPUs that do not support it natively.
+# This is the same format as FP8E4M3Nv, but:
+#   - the exponent bias is 15 instead of 7
+#   - 0xff and 0x7f are mapped to +-1.750 instead of +-nan
+@core.builtin
+def convert_fp8e4b15_to_float16(arg, _semantic=None):  # fp8e4b15 -> float16 via inline PTX (pack=4)
+    asm = ("{                                      \n"
+           ".reg .b32 a<2>, b<2>;                  \n"
+           "prmt.b32 a0, 0, $2, 0x5746;            \n"
+           "and.b32 b0, a0, 0x7f007f00;            \n"
+           "and.b32 b1, a0, 0x00ff00ff;            \n"
+           "and.b32 a1, a0, 0x00800080;            \n"
+           "shr.b32  b0, b0, 1;                    \n"
+           "add.u32 b1, b1, a1;                    \n"
+           "lop3.b32 $0, b0, 0x80008000, a0, 0xf8; \n"
+           "shl.b32 $1, b1, 7;                     \n"
+           "}                                      \n")
+    return core.inline_asm_elementwise(asm, "=r,=r,r", [arg], dtype=core.float16, is_pure=True, pack=4,
+                                       _semantic=_semantic)
+
+
+@core.builtin
+def convert_float16_to_fp8e4b15(arg, has_minx2, _semantic=None):  # float16 -> fp8e4b15 via PTX assembled below
+    asm = """{
+            .reg .pred p<4>;
+            .reg .b32 a<2>, b<2>;
+            .reg .b16 c<4>;
+            .reg .b16 max_val_f16;
+            .reg .b32 max_val_f16x2;
+            mov.b16 max_val_f16,   0x3F00;
+            mov.b32 max_val_f16x2, 0x3F003F00;
+            and.b32 a0, $1, 0x7fff7fff;
+            and.b32 a1, $2, 0x7fff7fff;"""
+    if has_minx2:  # clamp the sign-masked halves to max_val with packed min.f16x2
+        asm += """min.f16x2 a0, a0, max_val_f16x2;
+                  min.f16x2 a1, a1, max_val_f16x2;"""
+    else:  # same clamp spelled with setp.lt.f16x2 predicates and per-half selp.b16
+        asm += """setp.lt.f16x2  p0|p1, a0, max_val_f16x2;
+                  setp.lt.f16x2  p2|p3, a1, max_val_f16x2;
+                  mov.b32 {c0, c1}, a0;
+                  mov.b32 {c2, c3}, a1;
+                  selp.b16  c0, c0, max_val_f16, p0;
+                  selp.b16  c1, c1, max_val_f16, p1;
+                  selp.b16  c2, c2, max_val_f16, p2;
+                  selp.b16  c3, c3, max_val_f16, p3;
+                  mov.b32 a0, {c0, c1};
+                  mov.b32 a1, {c2, c3};"""
+    asm += """mad.lo.u32 a0, a0, 2, 0x00800080;
+              mad.lo.u32 a1, a1, 2, 0x00800080;
+              lop3.b32 b0, $1, 0x80008000, a0, 0xea;
+              lop3.b32 b1, $2, 0x80008000, a1, 0xea;
+              prmt.b32 $0, b0, b1, 0x7531;
+              }"""
+    return core.inline_asm_elementwise(asm, "=r,r,r", [arg], dtype=core.float8e4b15, is_pure=True, pack=4,
+                                       _semantic=_semantic)  # one 32-bit output, two 32-bit inputs
+
+
+@core.builtin
+def convert_custom_float8(arg, dst_ty, fp_downcast_rounding, has_minx2, _semantic=None):
+    """Convert fp8e4b15 -> fp16/fp32, or fp16/fp32 -> fp8e4b15 (`fp_downcast_rounding` is not read here)."""
+    src = arg.type.scalar
+    if src.is_fp8e4b15():
+        # Upcast always goes through fp16; it is widened further only for an fp32 destination.
+        half = convert_fp8e4b15_to_float16(arg, _semantic=_semantic)
+        return half.to(core.float32, _semantic=_semantic) if dst_ty.scalar.is_fp32() else half
+
+    assert src.is_fp16() or src.is_fp32()
+    # Downcast: fp32 input is first narrowed to fp16 with fp_downcast_rounding="rtz",
+    # then the fp16 value is handed to the inline-asm fp8e4b15 converter.
+    half = arg.to(core.float16, fp_downcast_rounding="rtz", _semantic=_semantic) if src.is_fp32() else arg
+    return convert_float16_to_fp8e4b15(half, has_minx2=has_minx2, _semantic=_semantic)
+
+
+@core.builtin
+def convert_custom_float8_sm80(arg, dst_ty, fp_downcast_rounding=None, _semantic=None):  # has_minx2=True variant
+    return convert_custom_float8(arg, dst_ty, fp_downcast_rounding, has_minx2=True, _semantic=_semantic)
+
+
+@core.builtin
+def convert_custom_float8_sm70(arg, dst_ty, fp_downcast_rounding=None, _semantic=None):  # has_minx2=False variant
+    return convert_custom_float8(arg, dst_ty, fp_downcast_rounding, has_minx2=False, _semantic=_semantic)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/hip/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/hip/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc9b571ddfacbd15b1e8258cce592313f7d45a3e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/hip/__init__.py
@@ -0,0 +1,5 @@
+from . import libdevice
+
+from .utils import memrealtime
+
+__all__ = ["libdevice", "memrealtime"]
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/hip/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/hip/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c40d7a78e719b657fd3581009ef54fd316ae0aef
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/hip/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/hip/__pycache__/libdevice.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/hip/__pycache__/libdevice.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e43b5126000543e62e53a7f09de91d2878a14501
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/hip/__pycache__/libdevice.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/hip/__pycache__/utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/hip/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0777b2c1093798a0ee89da40c2277e87c768f809
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/hip/__pycache__/utils.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/hip/libdevice.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/hip/libdevice.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc8d1b11a80299ae9f203bc48f039020faa80353
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/hip/libdevice.py
@@ -0,0 +1,491 @@
+from triton.language import core
+
+
+@core.extern
+def abs(arg0, _semantic=None):
+    """Elementwise extern: int32/int64 -> `__triton_hip_iabs`, fp32/fp64 -> `__triton_hip_fabs` (same dtype out)."""
+    i32, i64, f32, f64 = (core.dtype(name) for name in ("int32", "int64", "fp32", "fp64"))
+    table = {
+        (i32, ): ("__triton_hip_iabs", i32), (i64, ): ("__triton_hip_iabs", i64),
+        (f32, ): ("__triton_hip_fabs", f32), (f64, ): ("__triton_hip_fabs", f64),
+    }
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def floor(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_floor_f32`, fp64 -> `__ocml_floor_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_floor_f32", f32), (f64, ): ("__ocml_floor_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def rsqrt(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_rsqrt_f32`, fp64 -> `__ocml_rsqrt_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_rsqrt_f32", f32), (f64, ): ("__ocml_rsqrt_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ceil(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_ceil_f32`, fp64 -> `__ocml_ceil_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_ceil_f32", f32), (f64, ): ("__ocml_ceil_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def trunc(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_trunc_f32`, fp64 -> `__ocml_trunc_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_trunc_f32", f32), (f64, ): ("__ocml_trunc_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def exp2(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_exp2_f32`, fp64 -> `__ocml_exp2_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_exp2_f32", f32), (f64, ): ("__ocml_exp2_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def exp(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_exp_f32`, fp64 -> `__ocml_exp_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_exp_f32", f32), (f64, ): ("__ocml_exp_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def fast_expf(arg0, _semantic=None):
+    """Elementwise extern for fp32 input only: calls `__triton_hip_fast_expf`, yielding fp32."""
+    table = {(core.dtype("fp32"), ): ("__triton_hip_fast_expf", core.dtype("fp32"))}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def fast_tanhf(arg0, _semantic=None):
+    """Elementwise extern for fp32 input only: calls `__triton_hip_fast_tanhf`, yielding fp32."""
+    table = {(core.dtype("fp32"), ): ("__triton_hip_fast_tanhf", core.dtype("fp32"))}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def fast_dividef(arg0, arg1, _semantic=None):
+    """Elementwise extern for two fp32 inputs only: calls `__triton_hip_fast_fdividef`, yielding fp32."""
+    table = {(core.dtype("fp32"), core.dtype("fp32")): ("__triton_hip_fast_fdividef", core.dtype("fp32"))}
+    return core.extern_elementwise("", "", [arg0, arg1], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def sqrt(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_sqrt_f32`, fp64 -> `__ocml_sqrt_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_sqrt_f32", f32), (f64, ): ("__ocml_sqrt_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def llrint(arg0, _semantic=None):
+    """Elementwise extern: fp32 and fp64 both map to `__triton_hip_llrint`, yielding int64."""
+    f32, f64 = core.dtype("fp32"), core.dtype("fp64")
+    i64 = core.dtype("int64")
+    table = {(f32, ): ("__triton_hip_llrint", i64), (f64, ): ("__triton_hip_llrint", i64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def nearbyint(arg0, _semantic=None):
+    """Elementwise extern call that keeps the input dtype.
+    fp32 input calls `__ocml_nearbyint_f32`; fp64 input calls `__ocml_nearbyint_f64`.
+    """
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_nearbyint_f32", f32), (f64, ): ("__ocml_nearbyint_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def isnan(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_isnan_f32`, fp64 -> `__ocml_isnan_f64`.
+    Both symbols are declared with an int32 result, which is then cast to int1.
+    """
+    f32, f64, i32 = core.dtype("fp32"), core.dtype("fp64"), core.dtype("int32")
+    table = {(f32, ): ("__ocml_isnan_f32", i32), (f64, ): ("__ocml_isnan_f64", i32)}
+    as_int32 = core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+    return as_int32.to(core.int1, _semantic=_semantic)
+
+
+@core.extern
+def signbit(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_signbit_f32`, fp64 -> `__ocml_signbit_f64`.
+    Both symbols are declared with an int32 result, which is returned without a cast.
+    """
+    f32, f64 = core.dtype("fp32"), core.dtype("fp64")
+    i32 = core.dtype("int32")
+    table = {(f32, ): ("__ocml_signbit_f32", i32), (f64, ): ("__ocml_signbit_f64", i32)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def copysign(arg0, arg1, _semantic=None):
+    """Elementwise extern: two fp32 args -> `__ocml_copysign_f32`, two fp64 args -> `__ocml_copysign_f64`."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, f32): ("__ocml_copysign_f32", f32), (f64, f64): ("__ocml_copysign_f64", f64)}
+    return core.extern_elementwise("", "", [arg0, arg1], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def isinf(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_isinf_f32`, fp64 -> `__ocml_isinf_f64`; int32 result cast to int1."""
+    f32, f64, i32 = core.dtype("fp32"), core.dtype("fp64"), core.dtype("int32")
+    table = {(f32, ): ("__ocml_isinf_f32", i32), (f64, ): ("__ocml_isinf_f64", i32)}
+    as_int32 = core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+    return as_int32.to(core.int1, _semantic=_semantic)
+
+
+@core.extern
+def nextafter(arg0, arg1, _semantic=None):
+    """Elementwise extern: two fp32 args -> `__ocml_nextafter_f32`, two fp64 args -> `__ocml_nextafter_f64`."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, f32): ("__ocml_nextafter_f32", f32), (f64, f64): ("__ocml_nextafter_f64", f64)}
+    return core.extern_elementwise("", "", [arg0, arg1], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def sin(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_sin_f32`, fp64 -> `__ocml_sin_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_sin_f32", f32), (f64, ): ("__ocml_sin_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def cos(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_cos_f32`, fp64 -> `__ocml_cos_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_cos_f32", f32), (f64, ): ("__ocml_cos_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def tan(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_tan_f32`, fp64 -> `__ocml_tan_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_tan_f32", f32), (f64, ): ("__ocml_tan_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def log2(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_log2_f32`, fp64 -> `__ocml_log2_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_log2_f32", f32), (f64, ): ("__ocml_log2_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def cosh(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_cosh_f32`, fp64 -> `__ocml_cosh_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_cosh_f32", f32), (f64, ): ("__ocml_cosh_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def sinh(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_sinh_f32`, fp64 -> `__ocml_sinh_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_sinh_f32", f32), (f64, ): ("__ocml_sinh_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def tanh(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_tanh_f32`, fp64 -> `__ocml_tanh_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_tanh_f32", f32), (f64, ): ("__ocml_tanh_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def atan2(arg0, arg1, _semantic=None):
+    """Elementwise extern: two fp32 args -> `__ocml_atan2_f32`, two fp64 args -> `__ocml_atan2_f64`."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, f32): ("__ocml_atan2_f32", f32), (f64, f64): ("__ocml_atan2_f64", f64)}
+    return core.extern_elementwise("", "", [arg0, arg1], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def atan(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_atan_f32`, fp64 -> `__ocml_atan_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_atan_f32", f32), (f64, ): ("__ocml_atan_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def asin(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_asin_f32`, fp64 -> `__ocml_asin_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_asin_f32", f32), (f64, ): ("__ocml_asin_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def acos(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_acos_f32`, fp64 -> `__ocml_acos_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_acos_f32", f32), (f64, ): ("__ocml_acos_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def log(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_log_f32`, fp64 -> `__ocml_log_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_log_f32", f32), (f64, ): ("__ocml_log_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def log10(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_log10_f32`, fp64 -> `__ocml_log10_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_log10_f32", f32), (f64, ): ("__ocml_log10_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def log1p(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_log1p_f32`, fp64 -> `__ocml_log1p_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_log1p_f32", f32), (f64, ): ("__ocml_log1p_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def acosh(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_acosh_f32`, fp64 -> `__ocml_acosh_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_acosh_f32", f32), (f64, ): ("__ocml_acosh_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def asinh(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_asinh_f32`, fp64 -> `__ocml_asinh_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_asinh_f32", f32), (f64, ): ("__ocml_asinh_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def atanh(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_atanh_f32`, fp64 -> `__ocml_atanh_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_atanh_f32", f32), (f64, ): ("__ocml_atanh_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def expm1(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_expm1_f32`, fp64 -> `__ocml_expm1_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_expm1_f32", f32), (f64, ): ("__ocml_expm1_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def hypot(arg0, arg1, _semantic=None):
+    """Elementwise extern: two fp32 args -> `__ocml_hypot_f32`, two fp64 args -> `__ocml_hypot_f64`."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, f32): ("__ocml_hypot_f32", f32), (f64, f64): ("__ocml_hypot_f64", f64)}
+    return core.extern_elementwise("", "", [arg0, arg1], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def j0(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_j0_f32`, fp64 -> `__ocml_j0_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_j0_f32", f32), (f64, ): ("__ocml_j0_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def j1(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_j1_f32`, fp64 -> `__ocml_j1_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_j1_f32", f32), (f64, ): ("__ocml_j1_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def y0(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_y0_f32`, fp64 -> `__ocml_y0_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_y0_f32", f32), (f64, ): ("__ocml_y0_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def y1(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_y1_f32`, fp64 -> `__ocml_y1_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_y1_f32", f32), (f64, ): ("__ocml_y1_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def cyl_bessel_i0(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_i0_f32`, fp64 -> `__ocml_i0_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_i0_f32", f32), (f64, ): ("__ocml_i0_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def cyl_bessel_i1(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_i1_f32`, fp64 -> `__ocml_i1_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_i1_f32", f32), (f64, ): ("__ocml_i1_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def erf(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_erf_f32`, fp64 -> `__ocml_erf_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_erf_f32", f32), (f64, ): ("__ocml_erf_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def erfinv(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_erfinv_f32`, fp64 -> `__ocml_erfinv_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_erfinv_f32", f32), (f64, ): ("__ocml_erfinv_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def erfc(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_erfc_f32`, fp64 -> `__ocml_erfc_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_erfc_f32", f32), (f64, ): ("__ocml_erfc_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def erfcx(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_erfcx_f32`, fp64 -> `__ocml_erfcx_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_erfcx_f32", f32), (f64, ): ("__ocml_erfcx_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def lgamma(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_lgamma_f32`, fp64 -> `__ocml_lgamma_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_lgamma_f32", f32), (f64, ): ("__ocml_lgamma_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ldexp(arg0, arg1, _semantic=None):
+    """Elementwise extern: (fp32, int32) -> `__ocml_ldexp_f32`, (fp64, int32) -> `__ocml_ldexp_f64`."""
+    i32 = core.dtype("int32")
+    f32, f64 = core.dtype("fp32"), core.dtype("fp64")
+    table = {(f32, i32): ("__ocml_ldexp_f32", f32), (f64, i32): ("__ocml_ldexp_f64", f64)}
+    return core.extern_elementwise("", "", [arg0, arg1], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def fmod(arg0, arg1, _semantic=None):
+    """Elementwise extern: two fp32 args -> `__ocml_fmod_f32`, two fp64 args -> `__ocml_fmod_f64`."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, f32): ("__ocml_fmod_f32", f32), (f64, f64): ("__ocml_fmod_f64", f64)}
+    return core.extern_elementwise("", "", [arg0, arg1], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def fma(arg0, arg1, arg2, _semantic=None):
+    """Elementwise extern: three fp32 args -> `__ocml_fma_f32`, three fp64 args -> `__ocml_fma_f64`."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, f32, f32): ("__ocml_fma_f32", f32), (f64, f64, f64): ("__ocml_fma_f64", f64)}
+    return core.extern_elementwise("", "", [arg0, arg1, arg2], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def pow(arg0, arg1, _semantic=None):
+    """Elementwise extern taking an int32 or floating-point arg1; the result has arg0's dtype (fp32 or fp64)."""
+    i32, f32, f64 = core.dtype("int32"), core.dtype("fp32"), core.dtype("fp64")
+    table = {
+        (f32, i32): ("__ocml_pown_f32", f32), (f64, i32): ("__ocml_pown_f64", f64),  # int32 second argument
+        (f32, f32): ("__ocml_pow_f32", f32), (f64, f64): ("__ocml_pow_f64", f64),  # second argument matches arg0
+    }
+    return core.extern_elementwise("", "", [arg0, arg1], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def ilogb(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_ilogb_f32`, fp64 -> `__ocml_ilogb_f64`; both yield int32."""
+    f32, f64 = core.dtype("fp32"), core.dtype("fp64")
+    i32 = core.dtype("int32")
+    table = {(f32, ): ("__ocml_ilogb_f32", i32), (f64, ): ("__ocml_ilogb_f64", i32)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
+
+
+@core.extern
+def round(arg0, _semantic=None):
+    """Elementwise extern: fp32 -> `__ocml_round_f32`, fp64 -> `__ocml_round_f64` (same dtype out)."""
+    f32 = core.dtype("fp32")
+    f64 = core.dtype("fp64")
+    table = {(f32, ): ("__ocml_round_f32", f32), (f64, ): ("__ocml_round_f64", f64)}
+    return core.extern_elementwise("", "", [arg0], table, is_pure=True, _semantic=_semantic)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/hip/utils.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/hip/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9dbabc4d3cfdbd5ee91b38ef3be969b9f187046
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/hip/utils.py
@@ -0,0 +1,35 @@
+from triton.language import core
+
+
+@core.extern
+def memrealtime(_semantic=None):
+    """
+    Returns a 64-bit real time-counter value
+
+    The inline assembly is chosen from the target architecture string read
+    from ``_semantic.builder.options.arch``: one sequence when that string
+    contains ``gfx11`` or ``gfx12``, another for everything else. Both are
+    emitted through the same ``core.inline_asm_elementwise`` call with an
+    ``int64`` result and ``is_pure=False``.
+    """
+    target_arch = _semantic.builder.options.arch
+    if 'gfx11' in target_arch or 'gfx12' in target_arch:
+        asm = """
+            s_sendmsg_rtn_b64 $0, sendmsg(MSG_RTN_GET_REALTIME)
+            s_waitcnt lgkmcnt(0)
+            """
+    else:
+        asm = """
+            s_memrealtime $0
+            s_waitcnt vmcnt(0)
+            """
+    # Only the assembly text differs between architectures; every other
+    # argument is shared.
+    return core.inline_asm_elementwise(
+        asm,
+        "=r",
+        [],
+        dtype=core.int64,
+        is_pure=False,
+        pack=1,
+        _semantic=_semantic,
+    )
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/libdevice.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/libdevice.py
new file mode 100644
index 0000000000000000000000000000000000000000..e29810bfbabdcc09d6a28f062c18ee6af3fe7575
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/extra/libdevice.py
@@ -0,0 +1,790 @@
+# Signature-only declarations: every function in this module has a bare
+# Ellipsis body, so calling one directly does nothing and returns None.
+# NOTE(review): presumably the real implementations are supplied per backend
+# (the HIP `extra/hip/libdevice.py` defines e.g. `fma`, `pow`, `ilogb` and
+# `round` with the same leading parameters) -- confirm how they are bound.
+def clz(arg0):
+    ...
+
+
+def popc(arg0):
+    ...
+
+
+def byte_perm(arg0, arg1, arg2):
+    ...
+
+
+def mulhi(arg0, arg1):
+    ...
+
+
+def mul24(arg0, arg1):
+    ...
+
+
+def brev(arg0):
+    ...
+
+
+def sad(arg0, arg1, arg2):
+    ...
+
+
+def abs(arg0):
+    ...
+
+
+def floor(arg0):
+    ...
+
+
+def rcp64h(arg0):
+    ...
+
+
+def rsqrt(arg0):
+    ...
+
+
+def ceil(arg0):
+    ...
+
+
+def trunc(arg0):
+    ...
+
+
+def exp2(arg0):
+    ...
+
+
+def saturatef(arg0):
+    ...
+
+
+def fma_rn(arg0, arg1, arg2):
+    ...
+
+
+def fma_rz(arg0, arg1, arg2):
+    ...
+
+
+def fma_rd(arg0, arg1, arg2):
+    ...
+
+
+def fma_ru(arg0, arg1, arg2):
+    ...
+
+
+def fast_dividef(arg0, arg1):
+    ...
+
+
+def div_rn(arg0, arg1):
+    ...
+
+
+def div_rz(arg0, arg1):
+    ...
+
+
+def div_rd(arg0, arg1):
+    ...
+
+
+def div_ru(arg0, arg1):
+    ...
+
+
+def rcp_rn(arg0):
+    ...
+
+
+def rcp_rz(arg0):
+    ...
+
+
+def rcp_rd(arg0):
+    ...
+
+
+def rcp_ru(arg0):
+    ...
+
+
+def sqrt_rn(arg0):
+    ...
+
+
+def sqrt_rz(arg0):
+    ...
+
+
+def sqrt_rd(arg0):
+    ...
+
+
+def sqrt_ru(arg0):
+    ...
+
+
+def sqrt(arg0):
+    ...
+
+
+def add_rn(arg0, arg1):
+    ...
+
+
+def add_rz(arg0, arg1):
+    ...
+
+
+def add_rd(arg0, arg1):
+    ...
+
+
+def add_ru(arg0, arg1):
+    ...
+
+
+def mul_rn(arg0, arg1):
+    ...
+
+
+def mul_rz(arg0, arg1):
+    ...
+
+
+def mul_rd(arg0, arg1):
+    ...
+
+
+def mul_ru(arg0, arg1):
+    ...
+
+
+def double2float_rn(arg0):
+    ...
+
+
+def double2float_rz(arg0):
+    ...
+
+
+def double2float_rd(arg0):
+    ...
+
+
+def double2float_ru(arg0):
+    ...
+
+
+def double2int_rn(arg0):
+    ...
+
+
+def double2int_rz(arg0):
+    ...
+
+
+def double2int_rd(arg0):
+    ...
+
+
+def double2int_ru(arg0):
+    ...
+
+
+def double2uint_rn(arg0):
+    ...
+
+
+def double2uint_rz(arg0):
+    ...
+
+
+def double2uint_rd(arg0):
+    ...
+
+
+def double2uint_ru(arg0):
+    ...
+
+
+def int2double_rn(arg0):
+    ...
+
+
+def uint2double_rn(arg0):
+    ...
+
+
+def float2int_rn(arg0):
+    ...
+
+
+def float2int_rz(arg0):
+    ...
+
+
+def float2int_rd(arg0):
+    ...
+
+
+def float2int_ru(arg0):
+    ...
+
+
+def float2uint_rn(arg0):
+    ...
+
+
+def float2uint_rz(arg0):
+    ...
+
+
+def float2uint_rd(arg0):
+    ...
+
+
+def float2uint_ru(arg0):
+    ...
+
+
+def int2float_rn(arg0):
+    ...
+
+
+def int2float_rz(arg0):
+    ...
+
+
+def int2float_rd(arg0):
+    ...
+
+
+def int2float_ru(arg0):
+    ...
+
+
+def uint2float_rn(arg0):
+    ...
+
+
+def uint2float_rz(arg0):
+    ...
+
+
+def uint2float_rd(arg0):
+    ...
+
+
+def uint2float_ru(arg0):
+    ...
+
+
+def hiloint2double(arg0, arg1):
+    ...
+
+
+def double2loint(arg0):
+    ...
+
+
+def double2hiint(arg0):
+    ...
+
+
+def float2ll_rn(arg0):
+    ...
+
+
+def float2ll_rz(arg0):
+    ...
+
+
+def float2ll_rd(arg0):
+    ...
+
+
+def float2ll_ru(arg0):
+    ...
+
+
+def float2ull_rn(arg0):
+    ...
+
+
+def float2ull_rz(arg0):
+    ...
+
+
+def float2ull_rd(arg0):
+    ...
+
+
+def float2ull_ru(arg0):
+    ...
+
+
+def double2ll_rn(arg0):
+    ...
+
+
+def double2ll_rz(arg0):
+    ...
+
+
+def double2ll_rd(arg0):
+    ...
+
+
+def double2ll_ru(arg0):
+    ...
+
+
+def double2ull_rn(arg0):
+    ...
+
+
+def double2ull_rz(arg0):
+    ...
+
+
+def double2ull_rd(arg0):
+    ...
+
+
+def double2ull_ru(arg0):
+    ...
+
+
+def ll2float_rn(arg0):
+    ...
+
+
+def ll2float_rz(arg0):
+    ...
+
+
+def ll2float_rd(arg0):
+    ...
+
+
+def ll2float_ru(arg0):
+    ...
+
+
+def ull2float_rn(arg0):
+    ...
+
+
+def ull2float_rz(arg0):
+    ...
+
+
+def ull2float_rd(arg0):
+    ...
+
+
+def ull2float_ru(arg0):
+    ...
+
+
+def ll2double_rn(arg0):
+    ...
+
+
+def ll2double_rz(arg0):
+    ...
+
+
+def ll2double_rd(arg0):
+    ...
+
+
+def ll2double_ru(arg0):
+    ...
+
+
+def ull2double_rn(arg0):
+    ...
+
+
+def ull2double_rz(arg0):
+    ...
+
+
+def ull2double_rd(arg0):
+    ...
+
+
+def ull2double_ru(arg0):
+    ...
+
+
+def int_as_float(arg0):
+    ...
+
+
+def float_as_int(arg0):
+    ...
+
+
+def uint_as_float(arg0):
+    ...
+
+
+def float_as_uint(arg0):
+    ...
+
+
+def longlong_as_double(arg0):
+    ...
+
+
+def double_as_longlong(arg0):
+    ...
+
+
+def fast_sinf(arg0):
+    ...
+
+
+def fast_cosf(arg0):
+    ...
+
+
+def fast_log2f(arg0):
+    ...
+
+
+def fast_logf(arg0):
+    ...
+
+
+def fast_expf(arg0):
+    ...
+
+
+def fast_tanhf(arg0):
+    ...
+
+
+def fast_tanf(arg0):
+    ...
+
+
+def fast_exp10f(arg0):
+    ...
+
+
+def fast_log10f(arg0):
+    ...
+
+
+def fast_powf(arg0, arg1):
+    ...
+
+
+def hadd(arg0, arg1):
+    ...
+
+
+def rhadd(arg0, arg1):
+    ...
+
+
+def sub_rn(arg0, arg1):
+    ...
+
+
+def sub_rz(arg0, arg1):
+    ...
+
+
+def sub_rd(arg0, arg1):
+    ...
+
+
+def sub_ru(arg0, arg1):
+    ...
+
+
+def rsqrt_rn(arg0):
+    ...
+
+
+def ffs(arg0):
+    ...
+
+
+def rint(arg0):
+    ...
+
+
+def llrint(arg0):
+    ...
+
+
+def nearbyint(arg0):
+    ...
+
+
+def isnan(arg0):
+    ...
+
+
+def signbit(arg0):
+    ...
+
+
+def copysign(arg0, arg1):
+    ...
+
+
+def finitef(arg0):
+    ...
+
+
+def isinf(arg0):
+    ...
+
+
+def nextafter(arg0, arg1):
+    ...
+
+
+def sin(arg0):
+    ...
+
+
+def cos(arg0):
+    ...
+
+
+def sinpi(arg0):
+    ...
+
+
+def cospi(arg0):
+    ...
+
+
+def tan(arg0):
+    ...
+
+
+def log2(arg0):
+    ...
+
+
+def exp(arg0):
+    ...
+
+
+def exp10(arg0):
+    ...
+
+
+def cosh(arg0):
+    ...
+
+
+def sinh(arg0):
+    ...
+
+
+def tanh(arg0):
+    ...
+
+
+def atan2(arg0, arg1):
+    ...
+
+
+def atan(arg0):
+    ...
+
+
+def asin(arg0):
+    ...
+
+
+def acos(arg0):
+    ...
+
+
+def log(arg0):
+    ...
+
+
+def log10(arg0):
+    ...
+
+
+def log1p(arg0):
+    ...
+
+
+def acosh(arg0):
+    ...
+
+
+def asinh(arg0):
+    ...
+
+
+def atanh(arg0):
+    ...
+
+
+def expm1(arg0):
+    ...
+
+
+def hypot(arg0, arg1):
+    ...
+
+
+def rhypot(arg0, arg1):
+    ...
+
+
+def norm3d(arg0, arg1, arg2):
+    ...
+
+
+def rnorm3d(arg0, arg1, arg2):
+    ...
+
+
+def norm4d(arg0, arg1, arg2, arg3):
+    ...
+
+
+def rnorm4d(arg0, arg1, arg2, arg3):
+    ...
+
+
+def cbrt(arg0):
+    ...
+
+
+def rcbrt(arg0):
+    ...
+
+
+def j0(arg0):
+    ...
+
+
+def j1(arg0):
+    ...
+
+
+def y0(arg0):
+    ...
+
+
+def y1(arg0):
+    ...
+
+
+def yn(arg0, arg1):
+    ...
+
+
+def jn(arg0, arg1):
+    ...
+
+
+def cyl_bessel_i0(arg0):
+    ...
+
+
+def cyl_bessel_i1(arg0):
+    ...
+
+
+def erf(arg0):
+    ...
+
+
+def erfinv(arg0):
+    ...
+
+
+def erfc(arg0):
+    ...
+
+
+def erfcx(arg0):
+    ...
+
+
+def erfcinv(arg0):
+    ...
+
+
+def normcdfinv(arg0):
+    ...
+
+
+def normcdf(arg0):
+    ...
+
+
+def lgamma(arg0):
+    ...
+
+
+def ldexp(arg0, arg1):
+    ...
+
+
+def scalbn(arg0, arg1):
+    ...
+
+
+def fmod(arg0, arg1):
+    ...
+
+
+def remainder(arg0, arg1):
+    ...
+
+
+def fma(arg0, arg1, arg2):
+    ...
+
+
+def pow(arg0, arg1):
+    ...
+
+
+def tgamma(arg0):
+    ...
+
+
+def round(arg0):
+    ...
+
+
+def llround(arg0):
+    ...
+
+
+def fdim(arg0, arg1):
+    ...
+
+
+def ilogb(arg0):
+    ...
+
+
+def logb(arg0):
+    ...
+
+
+def isfinited(arg0):
+    ...
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/math.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/math.py
new file mode 100644
index 0000000000000000000000000000000000000000..582cd876cb13374a0d31e8c783e4fea1a1003c4a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/math.py
@@ -0,0 +1,249 @@
+from . import core
+from functools import wraps
+from typing import List
+
+T = core.TypeVar('T')
+
+
+def _check_dtype(dtypes: List[str]) -> T:
+    """
+    Build a decorator that rejects tensor arguments whose scalar dtype name is not in ``dtypes``.
+
+    This follows libdevice's convention of accepting only a fixed set of data
+    types per math function: accelerators/GPUs lack many float16 and bfloat16
+    math operations, so callers are expected to cast explicitly to a
+    supported type instead of relying on implicit support.
+
+    Only arguments that are ``core.tensor`` instances are inspected; anything
+    else (positional or keyword) is passed through untouched.
+
+    :param dtypes: accepted scalar dtype names, e.g. ``["fp32", "fp64"]``
+    :raises ValueError: from the wrapped call when a tensor argument has another dtype
+    """
+
+    def wrapper(fn):
+
+        @wraps(fn)
+        def check(*args, **kwargs):
+            # positional and keyword values are validated alike
+            for candidate in (*args, *kwargs.values()):
+                if not isinstance(candidate, core.tensor):
+                    continue
+                scalar_name = candidate.type.scalar.name
+                if scalar_name not in dtypes:
+                    raise ValueError(f"Expected dtype {dtypes} but got {scalar_name}")
+            return fn(*args, **kwargs)
+
+        return check
+
+    return wrapper
+
+
+def _add_math_1arg_docstr(name: str) -> core.Callable[[T], T]:
+    # Returns a decorator that replaces the decorated function's __doc__ with
+    # the one-argument template below, `{name}` filled in from `name`.
+    template = """
+    Computes the element-wise {name} of :code:`x`.
+
+    :param x: the input values
+    :type x: Block
+    """
+
+    def _attach(func: T) -> T:
+        func.__doc__ = template.format(name=name)
+        return func
+
+    return _attach
+
+
+def _add_math_2arg_docstr(name: str) -> core.Callable[[T], T]:
+    # Returns a decorator that replaces the decorated function's __doc__ with
+    # the two-argument template below, `{name}` filled in from `name`.
+    template = """
+    Computes the element-wise {name} of :code:`x` and :code:`y`.
+
+    :param x: the input values
+    :type x: Block
+    :param y: the input values
+    :type y: Block
+    """
+
+    def _attach(func: T) -> T:
+        func.__doc__ = template.format(name=name)
+        return func
+
+    return _attach
+
+
+def _add_math_3arg_docstr(name: str) -> core.Callable[[T], T]:
+    # Returns a decorator that replaces the decorated function's __doc__ with
+    # the three-argument template below, `{name}` filled in from `name`.
+    template = """
+    Computes the element-wise {name} of :code:`x`, :code:`y`, and :code:`z`.
+
+    :param x: the input values
+    :type x: Block
+    :param y: the input values
+    :type y: Block
+    :param z: the input values
+    :type z: Block
+    """
+
+    def _attach(func: T) -> T:
+        func.__doc__ = template.format(name=name)
+        return func
+
+    return _attach
+
+
+@core.builtin
+@_check_dtype(dtypes=["int32", "int64", "uint32", "uint64"])
+@_add_math_2arg_docstr("most significant N bits of the 2N-bit product")
+def umulhi(x, y, _semantic=None):
+    # The public docstring is installed by the decorator above, so only
+    # comments are used here. Both operands are converted to tensors, run
+    # through `binary_op_type_legalization`, and the result takes the type
+    # of the legalized first operand.
+    lhs = _semantic.to_tensor(x)
+    rhs = _semantic.to_tensor(y)
+    lhs, rhs = core.binary_op_type_legalization(lhs, rhs, _semantic)
+    product_hi = _semantic.builder.create_umulhi(lhs.handle, rhs.handle)
+    return core.tensor(product_hi, lhs.type)
+
+
+@core.builtin
+@_check_dtype(dtypes=["fp32", "fp64"])
+@_add_math_1arg_docstr("exponential")
+@core._tensor_member_fn
+def exp(x, _semantic=None):
+    # Public docstring comes from the decorator above; the result keeps the
+    # type of the converted operand.
+    operand = _semantic.to_tensor(x)
+    handle = _semantic.builder.create_exp(operand.handle)
+    return core.tensor(handle, operand.type)
+
+
+@core.builtin
+@_check_dtype(dtypes=["fp32", "fp64"])
+@_add_math_1arg_docstr("exponential (base 2)")
+@core._tensor_member_fn
+def exp2(x, _semantic=None):
+    # Public docstring comes from the decorator above; the result keeps the
+    # type of the converted operand.
+    operand = _semantic.to_tensor(x)
+    handle = _semantic.builder.create_exp2(operand.handle)
+    return core.tensor(handle, operand.type)
+
+
+@core.builtin
+@_check_dtype(dtypes=["fp32", "fp64"])
+@_add_math_1arg_docstr("natural logarithm")
+@core._tensor_member_fn
+def log(x, _semantic=None):
+    # Public docstring comes from the decorator above; the result keeps the
+    # type of the converted operand.
+    operand = _semantic.to_tensor(x)
+    handle = _semantic.builder.create_log(operand.handle)
+    return core.tensor(handle, operand.type)
+
+
+@core.builtin
+@_check_dtype(dtypes=["fp32", "fp64"])
+@_add_math_1arg_docstr("logarithm (base 2)")
+@core._tensor_member_fn
+def log2(x, _semantic=None):
+    # Public docstring comes from the decorator above; the result keeps the
+    # type of the converted operand.
+    operand = _semantic.to_tensor(x)
+    handle = _semantic.builder.create_log2(operand.handle)
+    return core.tensor(handle, operand.type)
+
+
+@core.builtin
+@_check_dtype(dtypes=["fp32", "fp64"])
+@_add_math_1arg_docstr("cosine")
+@core._tensor_member_fn
+def cos(x, _semantic=None):
+    # Public docstring comes from the decorator above; the result keeps the
+    # type of the converted operand.
+    operand = _semantic.to_tensor(x)
+    handle = _semantic.builder.create_cos(operand.handle)
+    return core.tensor(handle, operand.type)
+
+
+@core.builtin
+@_check_dtype(dtypes=["fp32", "fp64"])
+@_add_math_1arg_docstr("sine")
+@core._tensor_member_fn
+def sin(x, _semantic=None):
+    # Public docstring comes from the decorator above; the result keeps the
+    # type of the converted operand.
+    operand = _semantic.to_tensor(x)
+    handle = _semantic.builder.create_sin(operand.handle)
+    return core.tensor(handle, operand.type)
+
+
+@core.builtin
+@_check_dtype(dtypes=["fp32", "fp64"])
+@_add_math_1arg_docstr("fast square root")
+@core._tensor_member_fn
+def sqrt(x, _semantic=None):
+    # Public docstring comes from the decorator above. This variant emits
+    # `create_sqrt`; `sqrt_rn` emits `create_precise_sqrt` instead.
+    operand = _semantic.to_tensor(x)
+    handle = _semantic.builder.create_sqrt(operand.handle)
+    return core.tensor(handle, operand.type)
+
+
+@core.builtin
+@_check_dtype(dtypes=["fp32"])
+@_add_math_1arg_docstr("precise square root (rounding to nearest wrt the IEEE standard)")
+@core._tensor_member_fn
+def sqrt_rn(x, _semantic=None):
+    # Public docstring comes from the decorator above. Only fp32 tensors pass
+    # the dtype check; the op emitted is `create_precise_sqrt`.
+    operand = _semantic.to_tensor(x)
+    handle = _semantic.builder.create_precise_sqrt(operand.handle)
+    return core.tensor(handle, operand.type)
+
+
+@core.builtin
+@_check_dtype(dtypes=["fp32", "fp64"])
+@_add_math_1arg_docstr("inverse square root")
+@core._tensor_member_fn
+def rsqrt(x, _semantic=None):
+    # Public docstring comes from the decorator above; the result keeps the
+    # type of the converted operand.
+    operand = _semantic.to_tensor(x)
+    handle = _semantic.builder.create_rsqrt(operand.handle)
+    return core.tensor(handle, operand.type)
+
+
+@core._tensor_member_fn
+@core.builtin
+@_add_math_1arg_docstr("absolute value")
+def abs(x, _semantic=None):
+    # Dispatch on the operand dtype. The checks run in a fixed order:
+    # fp8e4b15 first, then any other floating type, then signed and unsigned
+    # integers. No dtype allow-list decorator is applied to this function.
+    value = _semantic.to_tensor(x)
+    dtype = value.dtype
+    if dtype.is_fp8e4b15():
+        # This dtype is handled by AND-ing every element with 0x7F rather
+        # than by emitting `create_fabs`.
+        mask = core.full(value.shape, 0x7F, core.int8, _semantic=_semantic)
+        return core.tensor(_semantic.builder.create_and(value.handle, mask.handle), value.type)
+    if dtype.is_floating():
+        return core.tensor(_semantic.builder.create_fabs(value.handle), value.type)
+    if dtype.is_int_signed():
+        return core.tensor(_semantic.builder.create_iabs(value.handle), value.type)
+    if dtype.is_int_unsigned():
+        return value  # no-op
+    assert False, f"Unexpected dtype {dtype}"
+
+
+@core.builtin
+@_add_math_2arg_docstr("fast division")
+def fdiv(x, y, ieee_rounding=False, _semantic=None):
+    # Public docstring comes from the decorator above. `ieee_rounding` may
+    # arrive wrapped as a constexpr, so it is unwrapped before being handed
+    # to `_semantic.fdiv` together with the two converted operands.
+    rounding = core._unwrap_if_constexpr(ieee_rounding)
+    numerator = _semantic.to_tensor(x)
+    denominator = _semantic.to_tensor(y)
+    return _semantic.fdiv(numerator, denominator, rounding)
+
+
+@core.builtin
+@_check_dtype(dtypes=["fp32"])
+@_add_math_2arg_docstr("precise division (rounding to nearest wrt the IEEE standard)")
+def div_rn(x, y, _semantic=None):
+    # Public docstring comes from the decorator above. Only fp32 tensors pass
+    # the dtype check; the result takes the type of the legalized first
+    # operand.
+    lhs = _semantic.to_tensor(x)
+    rhs = _semantic.to_tensor(y)
+    lhs, rhs = core.binary_op_type_legalization(lhs, rhs, _semantic)
+    quotient = _semantic.builder.create_precise_divf(lhs.handle, rhs.handle)
+    return core.tensor(quotient, lhs.type)
+
+
+@core.builtin
+@_check_dtype(dtypes=["fp32", "fp64"])
+@_add_math_1arg_docstr("error function")
+@core._tensor_member_fn
+def erf(x, _semantic=None):
+    # Public docstring comes from the decorator above; the result keeps the
+    # type of the converted operand.
+    operand = _semantic.to_tensor(x)
+    handle = _semantic.builder.create_erf(operand.handle)
+    return core.tensor(handle, operand.type)
+
+
+@core.builtin
+@_check_dtype(dtypes=["fp32", "fp64"])
+@_add_math_1arg_docstr("floor")
+@core._tensor_member_fn
+def floor(x, _semantic=None):
+    # Public docstring comes from the decorator above; the result keeps the
+    # type of the converted operand.
+    operand = _semantic.to_tensor(x)
+    handle = _semantic.builder.create_floor(operand.handle)
+    return core.tensor(handle, operand.type)
+
+
+@core.builtin
+@_check_dtype(dtypes=["fp32", "fp64"])
+@_add_math_1arg_docstr("ceil")
+@core._tensor_member_fn
+def ceil(x, _semantic=None):
+    # Public docstring comes from the decorator above; the result keeps the
+    # type of the converted operand.
+    operand = _semantic.to_tensor(x)
+    handle = _semantic.builder.create_ceil(operand.handle)
+    return core.tensor(handle, operand.type)
+
+
+@core.builtin
+@_add_math_3arg_docstr("fused multiply-add")
+def fma(x, y, z, _semantic=None):
+    # Public docstring comes from the decorator above. No dtype allow-list is
+    # applied; instead the three operands are legalized pairwise, and the
+    # order of the three passes (x/y, then z/x, then z/y) is kept exactly
+    # because each pass rebinds the operands the next one reads.
+    a = _semantic.to_tensor(x)
+    b = _semantic.to_tensor(y)
+    c = _semantic.to_tensor(z)
+    a, b = core.binary_op_type_legalization(a, b, _semantic)
+    c, a = core.binary_op_type_legalization(c, a, _semantic)
+    c, b = core.binary_op_type_legalization(c, b, _semantic)
+    fused = _semantic.builder.create_fma(a.handle, b.handle, c.handle)
+    return core.tensor(fused, a.type)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/random.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/random.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4790def8767599c9786ce3d03c4f28d8aea2683
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/random.py
@@ -0,0 +1,218 @@
+from ..runtime.jit import jit
+from . import core as tl
+from . import math
+
+N_ROUNDS_DEFAULT = tl.constexpr(10)  # Default number of rounds for philox
+
+# -------------------
+# randint
+# -------------------
+
+
+@jit
+def philox_impl(c0, c1, c2, c3, k0, k1, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT):
+    """
+    Run `n_rounds` rounds of Philox for state (c0, c1, c2, c3) and key (k0, k1).
+
+    The round and key constants are selected from the dtype of :code:`c0`:
+    one set for :code:`uint32`, another for :code:`uint64`; any other dtype
+    fails the static assertion.
+
+    :param c0: first counter word; its dtype picks the constant set.
+    :param c1: second counter word.
+    :param c2: third counter word.
+    :param c3: fourth counter word.
+    :param k0: first key word.
+    :param k1: second key word.
+    :param n_rounds: compile-time number of rounds to unroll.
+    :return: the four updated counter words :code:`(c0, c1, c2, c3)`.
+    """
+    if c0.dtype == tl.uint32:
+        PHILOX_KEY_A: tl.constexpr = 0x9E3779B9
+        PHILOX_KEY_B: tl.constexpr = 0xBB67AE85
+        PHILOX_ROUND_A: tl.constexpr = 0xD2511F53
+        PHILOX_ROUND_B: tl.constexpr = 0xCD9E8D57
+    else:
+        tl.static_assert(c0.dtype == tl.uint64, "dtype not supported in philox_impl")
+        PHILOX_KEY_A: tl.constexpr = 0x9E3779B97F4A7C15
+        PHILOX_KEY_B: tl.constexpr = 0xBB67AE8584CAA73B
+        PHILOX_ROUND_A: tl.constexpr = 0xD2E7470EE14C6C93
+        PHILOX_ROUND_B: tl.constexpr = 0xCA5A826395121157
+
+    for _ in tl.static_range(n_rounds):
+        # for _ in range(n_rounds):
+        # update random state
+        A = PHILOX_ROUND_A
+        B = PHILOX_ROUND_B
+        # Snapshot c0/c2: both are overwritten below before their old values
+        # are needed for c1/c3.
+        _c0, _c2 = c0, c2
+        # New c0/c2 use umulhi of the round constant and the old word, XOR-ed
+        # with the neighbouring counter word and a key word.
+        c0 = math.umulhi(B, _c2) ^ c1 ^ k0
+        c2 = math.umulhi(A, _c0) ^ c3 ^ k1
+        # New c1/c3 are the matching products from tl.mul, called with
+        # sanitize_overflow=False.
+        c1 = tl.mul(B, _c2, sanitize_overflow=False)
+        c3 = tl.mul(A, _c0, sanitize_overflow=False)
+        # raise key
+        k0 = tl.add(k0, PHILOX_KEY_A, sanitize_overflow=False)
+        k1 = tl.add(k1, PHILOX_KEY_B, sanitize_overflow=False)
+    return c0, c1, c2, c3
+
+
+@jit
+def philox(seed, c0, c1, c2, c3, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT):
+    """
+    Derive the key from :code:`seed`, normalise the counters and run :code:`philox_impl`.
+
+    The seed must be an integer and is converted to :code:`uint64`. The
+    bit width of :code:`c0` chooses the working integer type: for 32-bit
+    counters the seed is split into a low and a high 32-bit word; for 64-bit
+    counters the seed itself is the low key word and the high key word is
+    zero. Any other bit width fails the static assertion.
+
+    :param seed: integer seed.
+    :param c0: first counter word; its bit width selects the working type.
+    :param c1: second counter word.
+    :param c2: third counter word.
+    :param c3: fourth counter word.
+    :param n_rounds: compile-time number of rounds, forwarded unchanged.
+    :return: the four words returned by :code:`philox_impl`.
+    """
+    seed = tl.to_tensor(seed)
+    tl.static_assert(seed.dtype.is_int())
+    seed = seed.to(tl.uint64)
+    c0 = tl.to_tensor(c0)
+    c1 = tl.to_tensor(c1)
+    c2 = tl.to_tensor(c2)
+    c3 = tl.to_tensor(c3)
+
+    if tl.constexpr(c0.dtype.primitive_bitwidth) == 32:
+        int_dtype = tl.uint32
+        seed_hi = ((seed >> 32) & 0xffffffff).to(tl.uint32)
+        seed_lo = (seed & 0xffffffff).to(tl.uint32)
+    else:
+        tl.static_assert(tl.constexpr(c0.dtype.primitive_bitwidth) == 64, "bitwidth not supported in philox")
+        int_dtype = tl.uint64
+        seed_hi = tl.full((1, ), 0, dtype=int_dtype)
+        seed_lo = seed
+
+    # Reinterpret (bitcast) the counters as the unsigned working type.
+    c0 = c0.to(int_dtype, bitcast=True)
+    c1 = c1.to(int_dtype, bitcast=True)
+    c2 = c2.to(int_dtype, bitcast=True)
+    c3 = c3.to(int_dtype, bitcast=True)
+    # Key order: low seed word is k0, high seed word is k1.
+    return philox_impl(c0, c1, c2, c3, seed_lo, seed_hi, n_rounds)
+
+
+@jit
+def randint(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT):
+    """
+    Given a :code:`seed` scalar and an :code:`offset` block, returns a single
+    block of random :code:`int32`.
+
+    If you need multiple streams of random numbers,
+    using `randint4x` is likely to be faster than calling `randint` 4 times.
+
+    :param seed: The seed for generating random numbers.
+    :param offset: The offsets to generate random numbers for.
+    :param n_rounds: Compile-time number of Philox rounds, forwarded to `randint4x`.
+    """
+    # Only the first of the four blocks produced by randint4x is kept.
+    ret, _, _, _ = randint4x(seed, offset, n_rounds)
+    return ret
+
+
+@jit
+def randint4x(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT):
+    """
+    Given a :code:`seed` scalar and an :code:`offset` block, returns four
+    blocks of random :code:`int32`.
+
+    This is the maximally efficient entry point
+    to Triton's Philox pseudo-random number generator.
+
+    :param seed: The seed for generating random numbers.
+    :param offset: The offsets to generate random numbers for.
+    :param n_rounds: Compile-time number of Philox rounds, forwarded to `philox`.
+    """
+    # NOTE(review): the counters handed to `philox` are uint32, so the blocks
+    # returned look unsigned rather than int32 as stated above -- confirm.
+    # _0 = tl.zeros(offset.shape, offset.dtype)
+
+    # Low 32 bits of the offset form the first counter word; multiplying by 0
+    # yields a zero block derived from it.
+    offset_lo = offset.to(tl.uint32)
+    _0 = offset_lo * 0
+
+    # Offsets wider than 32 bits contribute their upper half as the second
+    # counter word; otherwise that word is zero.
+    if tl.constexpr(offset.dtype.primitive_bitwidth) > 32:
+        offset_hi = (offset >> 32).to(tl.uint32)
+    else:
+        offset_hi = _0
+
+    return philox(seed, offset_lo, offset_hi, _0, _0, n_rounds)
+
+
+# -------------------
+# rand
+# -------------------
+
+# @jit
+# def uint32_to_uniform_float(x):
+#     """
+#     Numerically stable function to convert a random uint32 into a random float uniformly sampled in [0, 1).
+#     """
+#     two_to_the_minus_32: tl.constexpr = 2.328306e-10
+#     return x * two_to_the_minus_32
+
+
+@jit
+def uint_to_uniform_float(x):
+    """
+    Numerically stable function to convert a random uint into a random float uniformly sampled in [0, 1).
+
+    Accepts 32-bit (:code:`uint32`/:code:`int32`) or 64-bit
+    (:code:`uint64`/:code:`int64`) integers; any other dtype fails the static
+    assertion. The value is reinterpreted as a signed integer of the same
+    width, negative values are mapped to non-negative ones, and the result is
+    multiplied by a width-specific scale.
+
+    :param x: block of random integers.
+    :return: :code:`x` mapped and scaled as described above.
+    """
+    # TODO: fix frontend issues and cleanup
+    # conditions can be simplified
+    # scale is ((2**23 - 1) / 2**23) * 2**(N_BITS - 1)
+    # NOTE(review): the literals below are close to 2**-31 and 2**-63, so the
+    # formula above presumably means division by 2**(N_BITS - 1) -- confirm.
+    if tl.constexpr(x.dtype == tl.uint32) or tl.constexpr(x.dtype == tl.int32):
+        # maximum value such that `MAX_INT * scale < 1.0` (with float rounding)
+        x = x.to(tl.int32, bitcast=True)
+        scale = 4.6566127342e-10
+    else:
+        tl.static_assert(tl.constexpr(x.dtype == tl.uint64) or tl.constexpr(x.dtype == tl.int64))
+        x = x.to(tl.int64, bitcast=True)
+        scale = 1.0842020432385337e-19
+    # Negative values map to -x - 1 (so -1 becomes 0), keeping x non-negative
+    # before scaling.
+    x = tl.where(x < 0, -x - 1, x)
+    return x * scale
+
+
+@jit
+def rand(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT):
+    """
+    Given a :code:`seed` scalar and an :code:`offset` block,
+    returns a block of random :code:`float32` in :math:`U(0, 1)`.
+
+    :param seed: The seed for generating random numbers.
+    :param offset: The offsets to generate random numbers for.
+    :param n_rounds: Compile-time number of Philox rounds, forwarded to `randint`.
+    """
+    # Draw one integer block, then scale it with uint_to_uniform_float.
+    source = randint(seed, offset, n_rounds)
+    return uint_to_uniform_float(source)
+
+
+@jit
+def rand4x(seed, offsets, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT):
+    """
+    Given a :code:`seed` scalar and an :code:`offsets` block,
+    returns 4 blocks of random :code:`float32` in :math:`U(0, 1)`.
+
+    :param seed: The seed for generating random numbers.
+    :param offsets: The offsets to generate random numbers for.
+    :param n_rounds: Compile-time number of Philox rounds, forwarded to `randint4x`.
+    """
+    # One randint4x call yields all four integer blocks; each is converted
+    # independently.
+    i1, i2, i3, i4 = randint4x(seed, offsets, n_rounds)
+    u1 = uint_to_uniform_float(i1)
+    u2 = uint_to_uniform_float(i2)
+    u3 = uint_to_uniform_float(i3)
+    u4 = uint_to_uniform_float(i4)
+    return u1, u2, u3, u4
+
+
+# -------------------
+# randn
+# -------------------
+
+
+@jit
+def pair_uniform_to_normal(u1, u2):
+    """
+    Box-Muller transform
+
+    Turns two blocks of uniform samples into two blocks computed as
+    :code:`r * cos(th)` and :code:`r * sin(th)`.
+
+    :param u1: uniform samples used for the radius.
+    :param u2: uniform samples used for the angle.
+    :return: the pair :code:`(r * cos(th), r * sin(th))`.
+    """
+    # Clamp u1 from below so that log(u1) is never evaluated at 0.
+    u1 = tl.maximum(1.0e-7, u1)
+    # 6.283185307179586 is 2*pi: the angle spans a full turn.
+    th = 6.283185307179586 * u2
+    r = math.sqrt(-2.0 * math.log(u1))
+    return r * math.cos(th), r * math.sin(th)
+
+
+@jit
+def randn(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT):
+    """
+    Given a :code:`seed` scalar and an :code:`offset` block,
+    returns a block of random :code:`float32` in :math:`\\mathcal{N}(0, 1)`.
+
+    :param seed: The seed for generating random numbers.
+    :param offset: The offsets to generate random numbers for.
+    :param n_rounds: Compile-time number of Philox rounds, forwarded to `randint4x`.
+    """
+    # Two of the four integer blocks feed the transform; the other two, and
+    # the second output of the transform, are discarded.
+    i1, i2, _, _ = randint4x(seed, offset, n_rounds)
+    u1 = uint_to_uniform_float(i1)
+    u2 = uint_to_uniform_float(i2)
+    n1, _ = pair_uniform_to_normal(u1, u2)
+    return n1
+
+
+@jit
+def randn4x(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT):
+    """
+    Given a :code:`seed` scalar and an :code:`offset` block,
+    returns 4 blocks of random :code:`float32` in :math:`\\mathcal{N}(0, 1)`.
+
+    :param seed: The seed for generating random numbers.
+    :param offset: The offsets to generate random numbers for.
+    :param n_rounds: Compile-time number of Philox rounds, forwarded to `rand4x`.
+    """
+    # Four uniform blocks are paired up; each pair yields two outputs of
+    # pair_uniform_to_normal.
+    u1, u2, u3, u4 = rand4x(seed, offset, n_rounds)
+    n1, n2 = pair_uniform_to_normal(u1, u2)
+    n3, n4 = pair_uniform_to_normal(u3, u4)
+    return n1, n2, n3, n4
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/semantic.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/semantic.py
new file mode 100644
index 0000000000000000000000000000000000000000..42bf7024f41090e0680f9f262c15074c956c250d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/semantic.py
@@ -0,0 +1,1966 @@
+from __future__ import annotations  # remove after python 3.11
+import warnings
+
+from typing import List, Optional, Sequence, Tuple, TypeVar, Generic, Type
+import numbers
+
+from triton.runtime import driver
+
+from .._C.libtriton import ir
+from . import core as tl
+
+T = TypeVar('T')
+TensorTy = TypeVar('TensorTy')
+
+
+class IncompatibleTypeErrorImpl(Exception):
+    """
+    Error for a pair of operand types that cannot be combined.
+
+    Keeps both types on the instance (``type_a``, ``type_b``) together with
+    the formatted ``message``, which is also the exception's string form.
+    """
+
+    def __init__(self, type_a, type_b):
+        # `__repr__()` is called directly on each operand, as before.
+        message = f"invalid operands of type {type_a.__repr__()} and {type_b.__repr__()}"
+        self.type_a, self.type_b = type_a, type_b
+        self.message = message
+        super().__init__(message)
+
+
+class TritonSemantic(Generic[TensorTy]):
+    tensor: Type[TensorTy] = tl.tensor
+    lang = tl
+
+    builder: ir.builder
+
+    def __init__(self, builder):
+        """
+        Store the IR builder on the instance.
+
+        :param builder: object kept as ``self.builder``; methods such as
+            ``program_id`` and ``to_tensor`` call it to create values.
+        """
+        self.builder = builder
+
+# ===----------------------------------------------------------------------===##
+# Programming Model
+# ===----------------------------------------------------------------------===##
+
+    def program_id(self, axis: int) -> TensorTy:
+        """
+        Wrap the builder's ``create_get_program_id(axis)`` value as an ``int32`` tensor.
+
+        :param axis: must be 0, 1 or 2.
+        :raises ValueError: for any other ``axis``.
+        """
+        if axis in (0, 1, 2):
+            return self.tensor(self.builder.create_get_program_id(axis), tl.int32)
+        raise ValueError(f"program_id axis must be 0, 1, or 2 but got {axis}")
+
+    def num_programs(self, axis: int) -> TensorTy:
+        """
+        Wrap the builder's ``create_get_num_programs(axis)`` value as an ``int32`` tensor.
+
+        :param axis: must be 0, 1 or 2.
+        :raises ValueError: for any other ``axis``.
+        """
+        if axis in (0, 1, 2):
+            return self.tensor(self.builder.create_get_num_programs(axis), tl.int32)
+        raise ValueError(f"num_programs axis must be 0, 1, or 2 but got {axis}")
+
+# ===----------------------------------------------------------------------===//
+#                               Implicit Casting Utilities
+# ===----------------------------------------------------------------------===//
+
+    def integer_promote_impl(self, a_ty: tl.dtype, b_ty: tl.dtype) -> tl.dtype:
+        """
+        Pick the integer type two integer operands are promoted to.
+
+        Signedness rules follow the "Usual arithmetic conversions" described at
+        https://en.cppreference.com/w/c/language/conversion:
+
+        - same signedness: the wider type wins (``b_ty`` on a tie);
+        - mixed signedness: the unsigned type wins unless the signed type is
+          strictly wider.
+
+        :raises TypeError: if neither case applies to the two signedness values.
+        """
+        a_bits, b_bits = a_ty.int_bitwidth, b_ty.int_bitwidth
+        a_sign, b_sign = a_ty.int_signedness, b_ty.int_signedness
+        if a_sign == b_sign:
+            return a_ty if a_bits > b_bits else b_ty
+        if a_sign == tl.dtype.SIGNEDNESS.UNSIGNED:
+            return a_ty if a_bits >= b_bits else b_ty
+        if b_sign == tl.dtype.SIGNEDNESS.UNSIGNED:
+            return b_ty if b_bits >= a_bits else a_ty
+        raise TypeError(f"unexpected signedness {a_sign} and {b_sign}")
+
+    def computation_type_impl(self, a_ty: tl.dtype, a_is_scalar: bool, b_ty: tl.dtype, b_is_scalar: bool,
+                              div_or_mod: bool) -> tl.dtype:
+        """
+        Choose the dtype in which a binary operation on ``a_ty`` and ``b_ty`` is computed.
+
+        The rules below are tried in order and the first match wins, so their
+        order is significant.
+
+        :param a_ty: scalar dtype of the first operand.
+        :param a_is_scalar: whether the first operand is a scalar rather than a tensor.
+        :param b_ty: scalar dtype of the second operand.
+        :param b_is_scalar: whether the second operand is a scalar rather than a tensor.
+        :param div_or_mod: whether the operation is a division or modulo; this
+            promotes fp16/bf16 results to float32 and forbids mixed-signedness integers.
+        :return: the computation dtype.
+        :raises TypeError: if no floating-point rule matched and either operand is
+            not an integer, or for division/modulo on integers of different signedness.
+        """
+        # 0) For scalars we follow semantics similar to PyTorch, namely:
+        # - If the scalar is of a lower or equal kind (bool < uint < int < fp),
+        #   it doesn't participate in the promotion
+        if a_is_scalar != b_is_scalar:
+            scalar_ty, tensor_ty = (a_ty, b_ty) if a_is_scalar else (b_ty, a_ty)
+            if scalar_ty.kind().value <= tensor_ty.kind().value:
+                # Upcast because of 3) and 4) below!
+                if div_or_mod and (tensor_ty in (tl.float16, tl.bfloat16)):
+                    return tl.float32
+                return tensor_ty
+
+        # 1) if one operand is double, the other is implicitly
+        #    converted to double
+        if a_ty.is_fp64() or b_ty.is_fp64():
+            return tl.float64
+        # 2) if one operand is float, the other is implicitly
+        #    converted to float
+        if a_ty.is_fp32() or b_ty.is_fp32():
+            return tl.float32
+        # 3 ) if one operand is half, the other is implicitly converted to half
+        #     unless we're doing / or %, which do not exist natively in PTX for fp16.
+        #     Supported PTX op: add, sub, mul, fma, neg, abs, min, max, tanh, ex2, setp
+        if a_ty.is_fp16() or b_ty.is_fp16():
+            if div_or_mod:
+                return tl.float32
+            else:
+                return tl.float16
+        # 4) return bf16 only if both operands are of bf16
+        if a_ty.is_bf16() and b_ty.is_bf16():
+            if div_or_mod:
+                return tl.float32
+            else:
+                return tl.bfloat16
+        # bf16 mixed with any type that reached this point is computed in float32
+        if a_ty.is_bf16() or b_ty.is_bf16():
+            return tl.float32
+        # 5) return fp16 if operands are different fp8
+        if a_ty.is_fp8() and b_ty.is_fp8():
+            return a_ty if a_ty == b_ty else tl.float16
+        # Every floating-point rule has been tried; only integer pairs remain valid.
+        if not a_ty.is_int() or not b_ty.is_int():
+            raise TypeError(f"unexpected type {a_ty} and {b_ty}")
+        # 6 ) both operands are integer and undergo
+        #    integer promotion
+        if div_or_mod and a_ty.int_signedness != b_ty.int_signedness:
+            raise TypeError("Cannot use /, #, or % with " + a_ty.__repr__() + " and " + b_ty.__repr__() +
+                            " because they have different signedness;"
+                            "this is unlikely to result in a useful answer. Cast them to the same signedness.")
+        return self.integer_promote_impl(a_ty, b_ty)
+
+    def to_tensor(self, x, check_type: bool = True):
+        """
+        Convert a Python scalar, a ``tl.constexpr`` or a tensor into a tensor.
+
+        - ``bool`` becomes an ``int1`` value (checked before ``int`` because
+          ``bool`` is a subclass of ``int``).
+        - ``int`` gets the first of ``int32``, ``uint32``, ``int64``, ``uint64``
+          whose range contains it.
+        - ``float`` is ``float32`` when it is zero, infinite, NaN, or its
+          magnitude lies in ``[2**-126, (2 - 2**-23) * 2**127]``; otherwise
+          ``float64``.
+        - ``tl.constexpr`` is unwrapped and converted recursively (with the
+          default ``check_type``).
+        - an instance of ``self.tensor`` is returned unchanged.
+
+        :param x: value to convert.
+        :param check_type: when true, an unsupported ``x`` raises; when false it
+            is returned as-is.
+        :raises ValueError: for an ``int`` outside ``[-2**63, 2**64)``.
+        :raises TypeError: for an unsupported type when ``check_type`` is true.
+        """
+        # `__builtins__` is a CPython implementation detail: it is a dict in an
+        # imported module but a module object in `__main__`, so subscripting it
+        # is fragile. The `builtins` module always exposes the real `abs`,
+        # regardless of any `abs` defined elsewhere in this file.
+        import builtins
+
+        if isinstance(x, bool):
+            return self.tensor(self.builder.get_int1(x), tl.int1)
+        # Note: compile-time const integers are represented by unsigned values
+        elif isinstance(x, int):
+            if -2**31 <= x < 2**31:
+                dtype = tl.int32
+            elif 2**31 <= x < 2**32:
+                dtype = tl.uint32
+            elif -2**63 <= x < 2**63:
+                dtype = tl.int64
+            elif 2**63 <= x < 2**64:
+                dtype = tl.uint64
+            else:
+                raise ValueError(f'Nonrepresentable integer {x}.')
+            return self.scalar_constant(x, dtype=dtype)
+        elif isinstance(x, float):
+            min_float32 = 2**-126
+            max_float32 = (2 - 2**-23) * 2**127
+            abs_x = builtins.abs(x)
+            # `x != x` is true only for NaN.
+            if abs_x == float("inf") or\
+               abs_x == 0.0 or \
+               x != x or \
+               min_float32 <= abs_x <= max_float32:
+                dtype = tl.float32
+            else:
+                dtype = tl.float64
+            return self.scalar_constant(x, dtype=dtype)
+
+        elif isinstance(x, tl.constexpr):
+            return self.to_tensor(x.value)
+        elif isinstance(x, self.tensor):
+            return x
+        if check_type:
+            raise TypeError(f"cannot convert {x} of type {type(x)} to tensor")
+        return x
+
+# ===----------------------------------------------------------------------===//
+#                               Binary Operators
+# ===----------------------------------------------------------------------===//
+
+    def check_ptr_type_impl(self, type_a: tl.dtype, type_b: tl.dtype, allow_ptr_a: bool) -> None:
+        """
+        Validate ``type_a`` against ``type_b`` when ``type_a`` is a pointer type.
+
+        Nothing is checked when ``type_a`` is not a pointer.
+
+        :param allow_ptr_a: whether a pointer is acceptable for ``type_a`` at all.
+        :raises IncompatibleTypeErrorImpl: if the pointer is not allowed, if
+            ``type_b`` is a different pointer type, or if ``type_b`` is floating point.
+        """
+        if not type_a.is_ptr():
+            return
+        if not allow_ptr_a:
+            raise IncompatibleTypeErrorImpl(type_a, type_b)
+        # T* + U* with T != U
+        mismatched_ptrs = type_b.is_ptr() and (type_a != type_b)
+        # T* + float
+        if mismatched_ptrs or type_b.is_floating():
+            raise IncompatibleTypeErrorImpl(type_a, type_b)
+
+    def binary_op_type_checking_impl(self, lhs: TensorTy | numbers.Number, rhs: TensorTy | numbers.Number,
+                                     allow_lhs_ptr=False, allow_rhs_ptr=False, arithmetic_check=True,
+                                     div_or_mod=False) -> Tuple[TensorTy, TensorTy]:
+        # Normalize the two operands of a binary operator and return them as a
+        # (lhs, rhs) pair.
+        #
+        # Steps, in order:
+        #   1. Python numbers are converted to tensors via `to_tensor`.
+        #   2. Pointer combinations are validated in both directions with
+        #      `check_ptr_type_impl`, honouring `allow_lhs_ptr` / `allow_rhs_ptr`.
+        #   3. If `arithmetic_check` is set and neither side is a pointer, both
+        #      sides are brought to the scalar type picked by
+        #      `computation_type_impl` (which also receives `div_or_mod`).
+        #   4. The operands are broadcast against each other.
+        #
+        # Raises ValueError when a Python scalar is negative while the
+        # computation type is unsigned, or when a Python scalar does not fit in
+        # an integer computation type.
+        lhs_is_scalar = isinstance(lhs, numbers.Number)
+        rhs_is_scalar = isinstance(rhs, numbers.Number)
+        # Keep the original Python values: they are needed below for the range
+        # checks and to rebuild the constants directly in the computation type.
+        if lhs_is_scalar:
+            lhs_scalar = lhs
+            lhs = self.to_tensor(lhs)
+        if rhs_is_scalar:
+            rhs_scalar = rhs
+            rhs = self.to_tensor(rhs)
+
+        # implicit typecasting
+        lhs_sca_ty = lhs.type.scalar
+        rhs_sca_ty = rhs.type.scalar
+        self.check_ptr_type_impl(lhs_sca_ty, rhs_sca_ty, allow_lhs_ptr)
+        self.check_ptr_type_impl(rhs_sca_ty, lhs_sca_ty, allow_rhs_ptr)
+        if arithmetic_check and not lhs_sca_ty.is_ptr() and not rhs_sca_ty.is_ptr():
+            ret_sca_ty = self.computation_type_impl(lhs_sca_ty, lhs_is_scalar, rhs_sca_ty, rhs_is_scalar, div_or_mod)
+            # A negative Python scalar cannot be represented in an unsigned result type.
+            if (lhs_is_scalar and lhs_scalar < 0 and ret_sca_ty.is_int_unsigned()
+                    or rhs_is_scalar and rhs_scalar < 0 and ret_sca_ty.is_int_unsigned()):
+                raise ValueError("Cannot perform a binary operation between an unsigned tensor and a negative scalar. "
+                                 "Perform a explicit cast on one of them.")
+            # For integer result types, reject Python scalars outside [min, max] of that type.
+            if ret_sca_ty.is_int():
+                if lhs_is_scalar and not (ret_sca_ty.get_int_min_value() <= lhs_scalar <=
+                                          ret_sca_ty.get_int_max_value()):
+                    raise ValueError(f"Scalar {lhs_scalar} is out of range for type {ret_sca_ty}")
+                if rhs_is_scalar and not (ret_sca_ty.get_int_min_value() <= rhs_scalar <=
+                                          ret_sca_ty.get_int_max_value()):
+                    raise ValueError(f"Scalar {rhs_scalar} is out of range for type {ret_sca_ty}")
+            # Scalars are re-created as constants of the result type; tensors are cast to it.
+            lhs = self.scalar_constant(lhs_scalar, dtype=ret_sca_ty) if lhs_is_scalar else self.cast(lhs, ret_sca_ty)
+            rhs = self.scalar_constant(rhs_scalar, dtype=ret_sca_ty) if rhs_is_scalar else self.cast(rhs, ret_sca_ty)
+
+        # implicit broadcasting
+        lhs, rhs = self.broadcast_impl_value(lhs, rhs)
+        return lhs, rhs
+
+    def binary_op_sanitize_overflow_impl(self, lhs: TensorTy, rhs: TensorTy, binary_op: callable):
+        # Emit a device-side assertion that `binary_op(lhs, rhs)` does not
+        # overflow the operands' integer type.
+        #
+        # Nothing is emitted when the operand type is 64 bits or wider, or when
+        # the builder option `sanitize_overflow` is off. Otherwise the operation
+        # is repeated on int64 copies of the operands and the result is checked
+        # against the min/max of the original type.
+        #
+        # `binary_op` is called as `binary_op(lhs, rhs, False)`; the callers in
+        # this file pass `self.add`, `self.sub` or `self.mul`, whose third
+        # parameter is `sanitize_overflow`, so the widened operation is not
+        # itself sanitized.
+        if lhs.type.scalar.int_bitwidth >= 64 or not self.builder.options.sanitize_overflow:
+            return
+        lhs_sca_ty = lhs.type.scalar
+        rhs_sca_ty = rhs.type.scalar
+        assert lhs_sca_ty == rhs_sca_ty
+        assert lhs_sca_ty.is_int()
+        lhs = self.cast(lhs, tl.int64)
+        rhs = self.cast(rhs, tl.int64)
+        ret = binary_op(lhs, rhs, False)
+        # Bounds of the *original* type, materialized as int64 constants.
+        max_value = lhs_sca_ty.get_int_max_value()
+        max_value = self.scalar_constant(max_value, tl.int64)
+        min_value = lhs_sca_ty.get_int_min_value()
+        min_value = self.scalar_constant(min_value, tl.int64)
+        cond = self.and_(self.less_equal(ret, max_value), self.greater_equal(ret, min_value))
+        msg = f"int{lhs_sca_ty.int_bitwidth} overflow detected for operation {binary_op.__name__}"
+        self.device_assert(cond, msg, None)
+
+    def add(self, input: TensorTy | numbers.Number, other: TensorTy | numbers.Number,
+            sanitize_overflow: bool) -> TensorTy:
+        # Emit an addition for pointer + offset, float + float or int + int.
+        #
+        # A pointer is accepted on either side (both pointer flags passed to
+        # `binary_op_type_checking_impl` are True), but adding two pointers
+        # raises TypeError. When `sanitize_overflow` is true, integer additions
+        # first go through `binary_op_sanitize_overflow_impl`.
+        # Raises TypeError for any other scalar type.
+        input, other = self.binary_op_type_checking_impl(input, other, True, True)
+        input_scalar_ty = input.type.scalar
+        other_scalar_ty = other.type.scalar
+        if input_scalar_ty.is_ptr() and other_scalar_ty.is_ptr():
+            raise TypeError("cannot add pointers together")
+
+        # offset + ptr
+        # ptr + offset
+        # Swap so that the pointer, if any, is always `input`.
+        if other_scalar_ty.is_ptr() and not input_scalar_ty.is_ptr():
+            input, other = other, input
+            input_scalar_ty = input.type.scalar
+            other_scalar_ty = other.type.scalar
+        if input_scalar_ty.is_ptr():
+            other_handle = other.handle
+            if other.dtype.is_int_unsigned() and other.dtype.int_bitwidth < 64:
+                # addptr treats offset as signed. Zero-extend unsigned offsets to ensure they're positive
+                i64_ty = other.type.with_element_ty(tl.int64).to_ir(self.builder)
+                other_handle = self.builder.create_int_cast(other.handle, i64_ty, False)
+            return self.tensor(self.builder.create_addptr(input.handle, other_handle), input.type)
+        # float + float
+        elif input_scalar_ty.is_floating():
+            return self.tensor(self.builder.create_fadd(input.handle, other.handle), input.type)
+        # int + int
+        elif input_scalar_ty.is_int():
+            if sanitize_overflow:
+                self.binary_op_sanitize_overflow_impl(input, other, self.add)
+            return self.tensor(self.builder.create_add(input.handle, other.handle), input.type)
+        raise TypeError(f"unexpected type {input_scalar_ty}")
+
+    def sub(self, input: TensorTy | numbers.Number, other: TensorTy | numbers.Number,
+            sanitize_overflow: bool) -> TensorTy:
+        # Emit a subtraction for ptr - offset, float - float or int - int.
+        #
+        # Only the left operand may be a pointer (the right-hand pointer flag is
+        # False). When `sanitize_overflow` is true, integer subtractions first
+        # go through `binary_op_sanitize_overflow_impl`.
+        # Raises TypeError for any other scalar type.
+        input, other = self.binary_op_type_checking_impl(input, other, True, False)
+        scalar_ty = input.type.scalar
+        # ptr - offset
+        # Lowered as ptr + (-offset); overflow sanitizing is disabled for that add.
+        if scalar_ty.is_ptr():
+            return self.add(input, self.minus(other), sanitize_overflow=False)
+        # float - float
+        if scalar_ty.is_floating():
+            return self.tensor(self.builder.create_fsub(input.handle, other.handle), input.type)
+        # int - int
+        elif scalar_ty.is_int():
+            if sanitize_overflow:
+                self.binary_op_sanitize_overflow_impl(input, other, self.sub)
+            return self.tensor(self.builder.create_sub(input.handle, other.handle), input.type)
+        raise TypeError(f"unexpected type {scalar_ty}")
+
+    def mul(self, input: TensorTy | numbers.Number, other: TensorTy | numbers.Number,
+            sanitize_overflow: bool) -> TensorTy:
+        # Emit a multiplication for float * float or int * int operands.
+        # With `sanitize_overflow` set, integer products are first passed to
+        # `binary_op_sanitize_overflow_impl`. Raises TypeError for other types.
+        input, other = self.binary_op_type_checking_impl(input, other)
+        elem_ty = input.type.scalar
+        if elem_ty.is_floating():
+            product = self.builder.create_fmul(input.handle, other.handle)
+        elif elem_ty.is_int():
+            if sanitize_overflow:
+                self.binary_op_sanitize_overflow_impl(input, other, self.mul)
+            product = self.builder.create_mul(input.handle, other.handle)
+        else:
+            raise TypeError(f"unexpected type {elem_ty}")
+        return self.tensor(product, input.type)
+
+    def truediv(self, input: TensorTy | numbers.Number, other: TensorTy | numbers.Number) -> TensorTy:
+        # Emit a floating-point division (`create_fdiv`) after bringing both
+        # operands to a floating type. Pointers are not allowed on either side,
+        # and the type check is run with `div_or_mod=True`.
+        # Raises TypeError when an operand is neither int nor float.
+        input, other = self.binary_op_type_checking_impl(input, other, False, False, True, True)
+        input_scalar_ty = input.type.scalar
+        other_scalar_ty = other.type.scalar
+        # float / int
+        if input_scalar_ty.is_floating() and other_scalar_ty.is_int():
+            other = self.cast(other, input_scalar_ty)
+        # int / float
+        elif input_scalar_ty.is_int() and other_scalar_ty.is_floating():
+            input = self.cast(input, other_scalar_ty)
+        # int / int (cast to tl.float32)
+        elif input_scalar_ty.is_int() and other_scalar_ty.is_int():
+            input = self.cast(input, tl.float32)
+            other = self.cast(other, tl.float32)
+        # float / float (the operand with the narrower or equal mantissa is cast
+        # to the other operand's type; note the test is on mantissa width)
+        elif input_scalar_ty.is_floating() and other_scalar_ty.is_floating():
+            if input_scalar_ty.fp_mantissa_width > other_scalar_ty.fp_mantissa_width:
+                other = self.cast(other, input_scalar_ty)
+            else:
+                input = self.cast(input, other_scalar_ty)
+        # unreachable
+        else:
+            raise TypeError(f"unexpected type {input_scalar_ty}")
+        return self.tensor(self.builder.create_fdiv(input.handle, other.handle), input.type)
+
+    def floordiv(self, input: TensorTy | numbers.Number, other: TensorTy | numbers.Number) -> TensorTy:
+        # Emit an integer division. Both operands must be integers; they are
+        # cast to the type returned by `integer_promote_impl`, and the signed or
+        # unsigned divide is chosen from that promoted type.
+        # Raises TypeError when either operand is not an integer.
+        input, other = self.binary_op_type_checking_impl(input, other, False, False, True, True)
+        lhs_elem_ty = input.type.scalar
+        rhs_elem_ty = other.type.scalar
+        if not (lhs_elem_ty.is_int() and rhs_elem_ty.is_int()):
+            raise TypeError(f"unexpected type {lhs_elem_ty}")
+        promoted_ty = self.integer_promote_impl(lhs_elem_ty, rhs_elem_ty)
+        input = self.cast(input, promoted_ty)
+        other = self.cast(other, promoted_ty)
+        if promoted_ty.is_int_signed():
+            quotient = self.builder.create_sdiv(input.handle, other.handle)
+        else:
+            quotient = self.builder.create_udiv(input.handle, other.handle)
+        return self.tensor(quotient, input.type)
+
+    def fdiv(self, input: TensorTy | numbers.Number, other: TensorTy | numbers.Number, ieee_rounding: bool) -> TensorTy:
+        # Emit `create_fdiv` for two operands that are already floating point.
+        # Raises TypeError if either operand's scalar type is not floating.
+        #
+        # NOTE(review): `.type` is read before any scalar-to-tensor conversion,
+        # so despite the annotation a plain Python number would fail here with
+        # AttributeError - confirm that callers always pass tensors.
+        # NOTE(review): `ieee_rounding` is not referenced in this body.
+        input_scalar_ty = input.type.scalar
+        other_scalar_ty = other.type.scalar
+        if not input_scalar_ty.is_floating() or not other_scalar_ty.is_floating():
+            raise TypeError("both operands of fdiv must have floating scalar type")
+        # arithmetic_check is False here: no implicit cast to a common type,
+        # only the pointer checks and broadcasting are applied.
+        input, other = self.binary_op_type_checking_impl(input, other, False, False, False, True)
+        ret = self.builder.create_fdiv(input.handle, other.handle)
+        return self.tensor(ret, input.type)
+
+    def mod(self, input: TensorTy | numbers.Number, other: TensorTy | numbers.Number) -> TensorTy:
+        # Emit a remainder operation: `create_frem` for floats, and
+        # `create_srem` / `create_urem` for signed / unsigned integers.
+        # Pointers are not allowed, and the type check runs with div_or_mod=True.
+        # Raises TypeError when integer operands differ in signedness, or when
+        # the operand type is neither float nor int.
+        input, other = self.binary_op_type_checking_impl(input, other, False, False, True, True)
+        scalar_ty = input.type.scalar
+        other_scalar_ty = other.type.scalar
+        # float % float
+        if scalar_ty.is_floating():
+            return self.tensor(self.builder.create_frem(input.handle, other.handle), input.type)
+        # % int
+        elif scalar_ty.is_int():
+            if scalar_ty.int_signedness != other_scalar_ty.int_signedness:
+                # Adjacent string literals are joined with no separator, so each
+                # fragment must carry its own trailing space.
+                raise TypeError("Cannot mod " + scalar_ty.__repr__() + " by " + other_scalar_ty.__repr__() + " "
+                                "because they have different signedness; "
+                                "this is unlikely to result in a useful answer. Cast them to the same signedness.")
+            if scalar_ty.is_int_signed():
+                return self.tensor(self.builder.create_srem(input.handle, other.handle), input.type)
+            else:
+                return self.tensor(self.builder.create_urem(input.handle, other.handle), input.type)
+        raise TypeError(f"unexpected type {scalar_ty}")
+
+##############
+# other arithmetic ops
+##############
+
+    def minimum(self, x: TensorTy, y: TensorTy, propagate_nan: tl.PropagateNan):
+        # Element-wise minimum of x and y after the usual binary-op type checks.
+        # For floats the builder call depends on `propagate_nan`
+        # (ALL -> create_minimumf, NONE -> create_minnumf); integers use the
+        # signed or unsigned variant.
+        # Raises ValueError for an unknown `propagate_nan`, TypeError for an
+        # unsupported dtype.
+        x, y = self.binary_op_type_checking_impl(x, y)
+        dtype = x.dtype
+        if dtype.is_floating():
+            if propagate_nan == tl.PropagateNan.ALL:
+                emit = self.builder.create_minimumf
+            elif propagate_nan == tl.PropagateNan.NONE:
+                emit = self.builder.create_minnumf
+            else:
+                raise ValueError(f"Unexpected propagate_nan {propagate_nan}")
+        elif dtype.is_int_signed():
+            emit = self.builder.create_minsi
+        elif dtype.is_int_unsigned():
+            emit = self.builder.create_minui
+        else:
+            raise TypeError(f"Unexpected dtype {dtype}")
+        return self.tensor(emit(x.handle, y.handle), x.type)
+
+    def maximum(self, x: TensorTy, y: TensorTy, propagate_nan: tl.PropagateNan):
+        # Element-wise maximum of x and y after the usual binary-op type checks.
+        # For floats the builder call depends on `propagate_nan`
+        # (ALL -> create_maximumf, NONE -> create_maxnumf); integers use the
+        # signed or unsigned variant.
+        # Raises ValueError for an unknown `propagate_nan`, TypeError for an
+        # unsupported dtype.
+        x, y = self.binary_op_type_checking_impl(x, y)
+        dtype = x.dtype
+        if dtype.is_floating():
+            if propagate_nan == tl.PropagateNan.ALL:
+                emit = self.builder.create_maximumf
+            elif propagate_nan == tl.PropagateNan.NONE:
+                emit = self.builder.create_maxnumf
+            else:
+                raise ValueError(f"Unexpected propagate_nan {propagate_nan}")
+        elif dtype.is_int_signed():
+            emit = self.builder.create_maxsi
+        elif dtype.is_int_unsigned():
+            emit = self.builder.create_maxui
+        else:
+            raise TypeError(f"Unexpected dtype {dtype}")
+        return self.tensor(emit(x.handle, y.handle), x.type)
+
+    def clamp(self, x: TensorTy, min: TensorTy, max: TensorTy, propagate_nan: tl.PropagateNan):
+        # Clamp x to [min, max] via `create_clampf`; only floating-point x is
+        # supported and anything else raises TypeError.
+        # `propagate_nan` is forwarded to the builder unchanged.
+        #
+        # The three pairwise checks below make the bounds agree with each other
+        # first, then x with each bound in turn.
+        min, max = self.binary_op_type_checking_impl(min, max)
+        x, min = self.binary_op_type_checking_impl(x, min)
+        x, max = self.binary_op_type_checking_impl(x, max)
+
+        dtype = x.dtype
+        if dtype.is_floating():
+            return self.tensor(self.builder.create_clampf(x.handle, min.handle, max.handle, propagate_nan), x.type)
+        else:
+            raise TypeError(f"Unexpected dtype {dtype}. Only floating point clamp is supported")
+
+##############
+# bitwise ops
+##############
+
+    def bitwise_op_type_checking_impl(self, input: TensorTy, other: TensorTy) -> Tuple[TensorTy, TensorTy]:
+        # Prepare the operands of a bitwise or shift operation.
+        # Runs the generic binary-op checks, requires both scalar types to be
+        # integers (IncompatibleTypeErrorImpl otherwise), and casts each operand
+        # to the type returned by `integer_promote_impl` when it differs.
+        # Returns the converted (input, other) pair.
+        input, other = self.binary_op_type_checking_impl(input, other)
+        input_sca_ty = input.type.scalar
+        other_sca_ty = other.type.scalar
+        if not input_sca_ty.is_int() or not other_sca_ty.is_int():
+            raise IncompatibleTypeErrorImpl(input_sca_ty, other_sca_ty)
+        ret_sca_ty = self.integer_promote_impl(input_sca_ty, other_sca_ty)
+        # Only cast the side(s) whose type is not already the promoted type.
+        if ret_sca_ty != input_sca_ty:
+            input = self.cast(input, ret_sca_ty)
+        if ret_sca_ty != other_sca_ty:
+            other = self.cast(other, ret_sca_ty)
+        return input, other
+
+    def and_(self, input: TensorTy, other: TensorTy) -> TensorTy:
+        # Bitwise AND of two integer operands (after bitwise type checking).
+        lhs, rhs = self.bitwise_op_type_checking_impl(input, other)
+        result_handle = self.builder.create_and(lhs.handle, rhs.handle)
+        return self.tensor(result_handle, lhs.type)
+
+    def or_(self, input: TensorTy, other: TensorTy) -> TensorTy:
+        # Bitwise OR of two integer operands (after bitwise type checking).
+        lhs, rhs = self.bitwise_op_type_checking_impl(input, other)
+        result_handle = self.builder.create_or(lhs.handle, rhs.handle)
+        return self.tensor(result_handle, lhs.type)
+
+    def xor_(self, input: TensorTy, other: TensorTy) -> TensorTy:
+        # Bitwise XOR of two integer operands (after bitwise type checking).
+        lhs, rhs = self.bitwise_op_type_checking_impl(input, other)
+        result_handle = self.builder.create_xor(lhs.handle, rhs.handle)
+        return self.tensor(result_handle, lhs.type)
+
+    def logical_and(self, input: TensorTy, other: TensorTy) -> TensorTy:
+        # AND of two operands, each bitcast to int1 first unless it already is int1.
+        input = input if input.type.is_int1() else self.bitcast(input, tl.int1)
+        other = other if other.type.is_int1() else self.bitcast(other, tl.int1)
+        return self.and_(input, other)
+
+    def logical_or(self, input: TensorTy, other: TensorTy) -> TensorTy:
+        # OR of two operands, each bitcast to int1 first unless it already is int1.
+        input = input if input.type.is_int1() else self.bitcast(input, tl.int1)
+        other = other if other.type.is_int1() else self.bitcast(other, tl.int1)
+        return self.or_(input, other)
+
+    def not_(self, input: TensorTy):
+        # Invert the operand, bitcasting it to int1 first unless it already is int1.
+        as_bool = input if input.type.is_int1() else self.bitcast(input, tl.int1)
+        return self.invert(as_bool)
+
+    def lshr(self, input: TensorTy, other: TensorTy) -> TensorTy:
+        # Right shift emitted through `create_lshr`, after bitwise type checking.
+        value, amount = self.bitwise_op_type_checking_impl(input, other)
+        result_handle = self.builder.create_lshr(value.handle, amount.handle)
+        return self.tensor(result_handle, value.type)
+
+    def ashr(self, input: TensorTy, other: TensorTy) -> TensorTy:
+        # Right shift emitted through `create_ashr`, after bitwise type checking.
+        value, amount = self.bitwise_op_type_checking_impl(input, other)
+        result_handle = self.builder.create_ashr(value.handle, amount.handle)
+        return self.tensor(result_handle, value.type)
+
+    def shl(self, input: TensorTy, other: TensorTy) -> TensorTy:
+        # Left shift emitted through `create_shl`, after bitwise type checking.
+        value, amount = self.bitwise_op_type_checking_impl(input, other)
+        result_handle = self.builder.create_shl(value.handle, amount.handle)
+        return self.tensor(result_handle, value.type)
+
+# ===----------------------------------------------------------------------===//
+#                               Unary Operators
+# ===----------------------------------------------------------------------===//
+
+    def plus(self, input: TensorTy) -> TensorTy:
+        # Unary plus: returns the operand unchanged, without emitting anything.
+        return input
+
+    def minus(self, input: TensorTy) -> TensorTy:
+        # Unary minus, computed as (0 - input) with overflow sanitizing requested.
+        # Raises ValueError for pointer operands.
+        elem_ty = input.type.scalar
+        if elem_ty.is_ptr():
+            raise ValueError("wrong type argument to unary minus (" + elem_ty.__repr__() + ")")
+        zero_handle = self.builder.get_null_value(elem_ty.to_ir(self.builder))
+        zero = self.tensor(zero_handle, elem_ty)
+        return self.sub(zero, input, True)
+
+    def invert(self, input: TensorTy) -> TensorTy:
+        # Bitwise inversion, computed as input XOR an all-ones value of the same
+        # scalar type. Raises ValueError for pointer or floating-point operands.
+        elem_ty = input.type.scalar
+        if elem_ty.is_ptr() or elem_ty.is_floating():
+            raise ValueError("wrong type argument to unary invert (" + elem_ty.__repr__() + ")")
+        ones_handle = self.builder.get_all_ones_value(elem_ty.to_ir(self.builder))
+        all_ones = self.tensor(ones_handle, elem_ty)
+        return self.xor_(input, all_ones)
+
+# ===----------------------------------------------------------------------===//
+#                               Comparison Operators
+# ===----------------------------------------------------------------------===//
+
+    def _bool_like(self, v: TensorTy) -> tl.block_type:
+        # Type of a comparison result for `v`: v's own type with its element
+        # type replaced by tl.int1.
+        return v.type.with_element_ty(tl.int1)
+
+    def greater_than(self, input: TensorTy, other: TensorTy) -> TensorTy:
+        # Element-wise `input > other`; the result type is `_bool_like(input)`.
+        # Floats use create_fcmpOGT; integers use create_icmpSGT when signed and
+        # create_icmpUGT otherwise. Raises TypeError for other scalar types.
+        input, other = self.binary_op_type_checking_impl(input, other)
+        elem_ty = input.type.scalar
+        if elem_ty.is_floating():
+            compare = self.builder.create_fcmpOGT
+        elif elem_ty.is_int():
+            if elem_ty.is_int_signed():
+                compare = self.builder.create_icmpSGT
+            else:
+                compare = self.builder.create_icmpUGT
+        else:
+            raise TypeError(f"unexpected type {elem_ty}")
+        return self.tensor(compare(input.handle, other.handle), self._bool_like(input))
+
+    def greater_equal(self, input: TensorTy, other: TensorTy) -> TensorTy:
+        # Element-wise `input >= other`; the result type is `_bool_like(input)`.
+        # Floats use create_fcmpOGE; integers use create_icmpSGE when signed and
+        # create_icmpUGE otherwise. Raises TypeError for other scalar types.
+        input, other = self.binary_op_type_checking_impl(input, other)
+        elem_ty = input.type.scalar
+        if elem_ty.is_floating():
+            compare = self.builder.create_fcmpOGE
+        elif elem_ty.is_int():
+            if elem_ty.is_int_signed():
+                compare = self.builder.create_icmpSGE
+            else:
+                compare = self.builder.create_icmpUGE
+        else:
+            raise TypeError(f"unexpected type {elem_ty}")
+        return self.tensor(compare(input.handle, other.handle), self._bool_like(input))
+
+    def less_than(self, input: TensorTy, other: TensorTy) -> TensorTy:
+        # Element-wise `input < other`; the result type is `_bool_like(input)`.
+        # Floats use create_fcmpOLT; integers use create_icmpSLT when signed and
+        # create_icmpULT otherwise. Raises TypeError for other scalar types.
+        input, other = self.binary_op_type_checking_impl(input, other)
+        elem_ty = input.type.scalar
+        if elem_ty.is_floating():
+            compare = self.builder.create_fcmpOLT
+        elif elem_ty.is_int():
+            if elem_ty.is_int_signed():
+                compare = self.builder.create_icmpSLT
+            else:
+                compare = self.builder.create_icmpULT
+        else:
+            raise TypeError(f"unexpected type {elem_ty}")
+        return self.tensor(compare(input.handle, other.handle), self._bool_like(input))
+
+    def less_equal(self, input: TensorTy, other: TensorTy) -> TensorTy:
+        # Element-wise `input <= other`; the result type is `_bool_like(input)`.
+        # Floats use create_fcmpOLE; integers use create_icmpSLE when signed and
+        # create_icmpULE otherwise. Raises TypeError for other scalar types.
+        input, other = self.binary_op_type_checking_impl(input, other)
+        elem_ty = input.type.scalar
+        if elem_ty.is_floating():
+            compare = self.builder.create_fcmpOLE
+        elif elem_ty.is_int():
+            if elem_ty.is_int_signed():
+                compare = self.builder.create_icmpSLE
+            else:
+                compare = self.builder.create_icmpULE
+        else:
+            raise TypeError(f"unexpected type {elem_ty}")
+        return self.tensor(compare(input.handle, other.handle), self._bool_like(input))
+
+    def equal(self, input: TensorTy, other: TensorTy) -> TensorTy:
+        # Element-wise `input == other`; the result type is `_bool_like(input)`.
+        # Floats use create_fcmpOEQ, integers create_icmpEQ.
+        # Raises TypeError for other scalar types.
+        input, other = self.binary_op_type_checking_impl(input, other)
+        elem_ty = input.type.scalar
+        if elem_ty.is_floating():
+            compare = self.builder.create_fcmpOEQ
+        elif elem_ty.is_int():
+            compare = self.builder.create_icmpEQ
+        else:
+            raise TypeError(f"unexpected type {elem_ty}")
+        return self.tensor(compare(input.handle, other.handle), self._bool_like(input))
+
+    def not_equal(self, input: TensorTy, other: TensorTy) -> TensorTy:
+        # Element-wise `input != other`; the result type is `_bool_like(input)`.
+        # Floats use create_fcmpUNE, integers create_icmpNE.
+        # Raises TypeError for other scalar types.
+        input, other = self.binary_op_type_checking_impl(input, other)
+        elem_ty = input.type.scalar
+        if elem_ty.is_floating():
+            compare = self.builder.create_fcmpUNE
+        elif elem_ty.is_int():
+            compare = self.builder.create_icmpNE
+        else:
+            raise TypeError(f"unexpected type {elem_ty}")
+        return self.tensor(compare(input.handle, other.handle), self._bool_like(input))
+
+# ===----------------------------------------------------------------------===//
+#                               Block Creation
+# ===----------------------------------------------------------------------===//
+
+    def arange(self, start: int, end: int, *, ret_ty: tl.block_type = None) -> TensorTy:
+        # Build a 1-D range tensor covering [start, end) via `create_make_range`.
+        #
+        # Both bounds must be Python ints, `end` must exceed `start`, and the
+        # length (end - start) must be a power of two. A bound is rejected as
+        # not fitting in int32 when its right shift by 32 is non-zero; note that
+        # this is the case for every negative integer.
+        # When `ret_ty` is omitted the result is an int32 block of that length.
+        # Raises ValueError when any of these conditions is violated.
+        if not (isinstance(start, int) and isinstance(end, int)):
+            raise ValueError("arange's arguments must be of type tl.constexpr")
+        if (start >> 32) or (end >> 32):
+            raise ValueError("arange must fit in int32")
+        if end <= start:
+            raise ValueError("arange's end argument must be greater than the start argument")
+        length = end - start
+        # A power of two has a single set bit, so clearing the lowest set bit leaves zero.
+        if length & (length - 1):
+            raise ValueError("arange's range must be a power of 2")
+        if ret_ty is None:
+            ret_ty = tl.block_type(tl.int32, [length])
+        range_handle = self.builder.create_make_range(ret_ty.to_ir(self.builder), start, end)
+        return self.tensor(range_handle, ret_ty)
+
+    def scalar_constant(self, value, dtype: tl.dtype) -> TensorTy:
+        # Create a scalar constant tensor of type `dtype` holding `value`.
+        # Raises ValueError when `dtype` is None.
+        # scalar
+        if dtype is None:
+            raise ValueError("dtype must be specified when value is not a tensor")
+        # Anything comparing equal to 0 (0, 0.0, -0.0, False) takes the
+        # null-value path.
+        # NOTE(review): whether the null value keeps the sign of -0.0 is not
+        # visible here - confirm if negative zero matters to callers.
+        if value == 0:
+            value = self.builder.get_null_value(dtype.to_ir(self.builder))
+        else:
+            # The builder getter is looked up by dtype name, i.e. `get_<dtype.name>`.
+            get_value_fn = getattr(self.builder, f"get_{dtype.name}")
+            value = get_value_fn(value)
+        return self.tensor(value, dtype)
+
+    def make_scalar(self, value, dtype: tl.dtype) -> TensorTy:
+        # Produce a scalar of type `dtype` from either a Python value or a
+        # tensor; a tensor must hold exactly one element and is cast to `dtype`.
+        if not isinstance(value, tl.tensor):
+            # plain Python scalar
+            return self.scalar_constant(value, dtype)
+        assert value.numel.value == 1, "only accepts size-1 tensor"
+        return self.cast(value, dtype)
+
+    def full(self, shape: List[int], value, dtype: tl.dtype) -> TensorTy:
+        # Tensor of the given shape with every element set to `value` in `dtype`.
+        fill_scalar = self.make_scalar(value, dtype)
+        return self.splat(fill_scalar, shape)
+
+# ===----------------------------------------------------------------------===//
+#                               Shape Manipulation
+# ===----------------------------------------------------------------------===//
+
+    def splat(self, value: TensorTy, shape: List[int]) -> TensorTy:
+        # Replicate a non-block value into a block of the given shape.
+        # An empty shape returns `value` itself.
+        assert not value.type.is_block(), "Cannot splat a block tensor"
+        if len(shape) == 0:
+            return value
+        block_ty = tl.block_type(value.dtype, shape)
+        splat_handle = self.builder.create_splat(block_ty.to_ir(self.builder), value.handle)
+        return self.tensor(splat_handle, block_ty)
+
+    def unsplat(self, value: TensorTy) -> TensorTy:
+        # Wrap the result of `create_unsplat` as a tensor typed with value's dtype.
+        scalar_handle = self.builder.create_unsplat(value.handle)
+        return self.tensor(scalar_handle, value.dtype)
+
+    def reshape(self, input: TensorTy, dst_shape: List[int], can_reorder: bool) -> TensorTy:
+        # Reshape `input` to `dst_shape`; `can_reorder` is forwarded to the builder.
+        # Raises ValueError when the element count would change.
+        target_numel = 1
+        for dim in dst_shape:
+            target_numel *= dim
+        if input.type.numel != target_numel:
+            raise ValueError("reshape() cannot change total number of elements in tensor")
+        out_ty = tl.block_type(input.type.scalar, dst_shape)
+        reshaped = self.builder.create_reshape(input.handle, dst_shape, can_reorder)
+        return self.tensor(reshaped, out_ty)
+
+    def expand_dims(self, input: TensorTy, axis: int) -> TensorTy:
+        # Insert a size-1 dimension at position `axis`.
+        # Non-block inputs are splatted to the new shape instead of going
+        # through `create_expand_dims`.
+        dst_shape = [tl._unwrap_if_constexpr(x) for x in input.shape]
+        # NOTE(review): list.insert with a negative index inserts *before* that
+        # element (insert(-1, 1) does not append) - presumably callers pass a
+        # normalized, non-negative axis; confirm.
+        dst_shape.insert(axis, 1)
+
+        if not input.type.is_block():
+            return self.splat(input, shape=dst_shape)
+
+        ret_ty = tl.block_type(input.type.scalar, dst_shape)
+        return self.tensor(self.builder.create_expand_dims(input.handle, axis), ret_ty)
+
+    def cat(self, lhs: TensorTy, rhs: TensorTy, can_reorder: bool) -> TensorTy:
+        # Concatenate two tensors into one 1-D block whose length is the sum of
+        # their first dimensions. `can_reorder` must be true and `lhs` must be
+        # one-dimensional (both enforced with assertions).
+        assert can_reorder, "current implementation of `cat` always may reorder elements"
+        assert len(lhs.shape) == 1
+        combined_len = lhs.shape[0] + rhs.shape[0]
+        out_ty = tl.block_type(lhs.type.scalar, [combined_len])
+        cat_handle = self.builder.create_cat(lhs.handle, rhs.handle)
+        return self.tensor(cat_handle, out_ty)
+
+    def join(self, a: TensorTy, b: TensorTy) -> TensorTy:
+        # Join two tensors along a new trailing dimension of size 2, after
+        # broadcasting them against each other.
+        a, b = self.broadcast_impl_value(a, b)
+
+        # The IR cannot join two scalars: promote them to 1-D tensors here and
+        # reshape the joined result back to shape [2] at the end.
+        inputs_were_scalar = a.shape == []
+        if inputs_were_scalar:
+            a = self.expand_dims(a, 0)
+            b = self.expand_dims(b, 0)
+
+        # Keep the new dimension in the same representation as the existing ones.
+        two = tl.constexpr(2) if isinstance(a.shape[-1], tl.constexpr) else 2
+        joined_ty = tl.block_type(a.type.scalar, a.shape + [two])
+        joined = self.tensor(self.builder.create_join(a.handle, b.handle), joined_ty)
+
+        if inputs_were_scalar:
+            return self.reshape(joined, [2], can_reorder=False)
+        return joined
+
+    def split(self, a: TensorTy) -> Tuple[TensorTy, TensorTy]:
+        # Split a tensor whose last dimension has size 2 into two tensors that
+        # drop that dimension. Both preconditions are enforced with assertions.
+        assert len(a.shape) > 0
+        assert tl._unwrap_if_constexpr(a.shape[-1]) == 2
+
+        half_ty = tl.block_type(a.type.scalar, a.shape[:-1])
+        first_handle, second_handle = self.builder.create_split(a.handle)
+        first = self.tensor(first_handle, half_ty)
+        second = self.tensor(second_handle, half_ty)
+        return (first, second)
+
+    def permute(self, input: TensorTy, dims: Tuple[int]) -> TensorTy:
+        # Reorder the dimensions of `input` according to `dims`.
+        # Raises ValueError when `dims` has the wrong length or is not a
+        # permutation of 0..n-1.
+        rank = len(dims)
+        if len(input.shape) != rank:
+            raise ValueError("permute dims must have the same length as input shape")
+        plain_dims = sorted(tl._unwrap_if_constexpr(d) for d in dims)
+        if plain_dims != list(range(rank)):
+            raise ValueError(f"permute dims must be a permutation of 0, 1, ..., n-1, but were {dims}")
+
+        permuted_shape = [input.shape[d] for d in dims]
+        out_ty = tl.block_type(input.type.scalar, permuted_shape)
+        return self.tensor(self.builder.create_trans(input.handle, dims), out_ty)
+
+    def broadcast_impl_shape(self, input: TensorTy, shape: Tuple[int]) -> TensorTy:
+        # Broadcast `input` to an explicit target `shape`.
+        # Non-block inputs are splatted. Block inputs must already have the
+        # same rank as `shape`, and every dimension must either match the
+        # target or be 1; otherwise ValueError is raised.
+        if not input.type.is_block():
+            return self.splat(input, shape)
+        src_shape = input.type.get_block_shapes()
+        if len(src_shape) != len(shape):
+            raise ValueError(f"Cannot broadcast, rank mismatch: {src_shape}, {shape}")
+        # Already the requested shape: nothing to emit.
+        if shape == src_shape:
+            return input
+        for i, item in enumerate(src_shape):
+            if shape[i] != item and item != 1:
+                raise ValueError(f"Cannot broadcast, the expanded size of the tensor ({shape[i]})"
+                                 f" must match the existing size ({item}) at non-singleton dimension"
+                                 f" {i}: {src_shape}, {shape}")
+        ret_ty = tl.block_type(input.type.scalar, shape)
+        return self.tensor(self.builder.create_broadcast(input.handle, shape), ret_ty)
+
+    def broadcast_impl_value(self, lhs: TensorTy, rhs: TensorTy) -> TensorTy:
+        # Make two operands shape-compatible and return them as a (lhs, rhs) pair.
+        # NOTE(review): the return annotation names a single tensor, but every
+        # path returns the two-element tuple.
+        #
+        #   block  / scalar : the scalar is splatted to the block's shape
+        #   block  / block  : the lower-rank side gets leading size-1 axes, then
+        #                     both are broadcast to the combined shape
+        #   scalar / scalar : returned unchanged
+        #
+        # Raises ValueError when two block dimensions differ and neither is 1.
+        lhs_ty = lhs.type
+        rhs_ty = rhs.type
+
+        # make_shape_compatible(block, scalar)
+        if lhs_ty.is_block() and not rhs_ty.is_block():
+            rhs_ty = lhs_ty.with_element_ty(rhs_ty.scalar)
+            rhs = self.tensor(self.builder.create_splat(rhs_ty.to_ir(self.builder), rhs.handle), rhs_ty)
+        # make_shape_compatible(scalar, block)
+        elif not lhs_ty.is_block() and rhs_ty.is_block():
+            lhs_ty = rhs_ty.with_element_ty(lhs_ty.scalar)
+            lhs = self.tensor(self.builder.create_splat(lhs_ty.to_ir(self.builder), lhs.handle), lhs_ty)
+        # make_shape_compatible(block, block)
+        elif lhs_ty.is_block() and rhs_ty.is_block():
+            lhs_shape = lhs_ty.get_block_shapes()
+            rhs_shape = rhs_ty.get_block_shapes()
+
+            if len(lhs_shape) < len(rhs_shape):
+                # Add new axes to lhs
+                # One leading axis per iteration until the ranks match.
+                for _ in range(len(lhs_shape), len(rhs_shape)):
+                    lhs = self.tensor(self.builder.create_expand_dims(lhs.handle, 0),
+                                      tl.block_type(lhs_ty.scalar, [1] + lhs_shape.values))
+                    lhs_ty = lhs.type
+                    lhs_shape = lhs_ty.get_block_shapes()
+            elif len(rhs_shape) < len(lhs_shape):
+                # Add new axes to rhs
+                # One leading axis per iteration until the ranks match.
+                for _ in range(len(rhs_shape), len(lhs_shape)):
+                    rhs = self.tensor(self.builder.create_expand_dims(rhs.handle, 0),
+                                      tl.block_type(rhs_ty.scalar, [1] + rhs_shape.values))
+                    rhs_ty = rhs.type
+                    rhs_shape = rhs_ty.get_block_shapes()
+            assert len(rhs_shape) == len(lhs_shape)
+
+            # Per dimension: a size of 1 yields to the other side; otherwise
+            # the sizes must be equal.
+            ret_shape = []
+            for i, left in enumerate(lhs_shape):
+                right = rhs_shape[i]
+                if left == 1:
+                    ret_shape.append(right)
+                elif (right == 1) or (right == left):
+                    ret_shape.append(left)
+                else:
+                    raise ValueError("Cannot make_shape_compatible: incompatible dimensions "
+                                     "at index " + str(i) + ": " + str(left) + " and " + str(right))
+            # Only emit a broadcast for the side(s) whose shape actually changes.
+            if lhs_shape != ret_shape:
+                ret_ty = tl.block_type(lhs_ty.scalar, ret_shape)
+                lhs = self.tensor(self.builder.create_broadcast(lhs.handle, ret_shape), ret_ty)
+            if rhs_shape != ret_shape:
+                ret_ty = tl.block_type(rhs_ty.scalar, ret_shape)
+                rhs = self.tensor(self.builder.create_broadcast(rhs.handle, ret_shape), ret_ty)
+        # (scalar, scalar) => returns original blocks
+        return lhs, rhs
+
+#######
+# cast
+#######
+
+    def _str_to_rounding_mode(self, rounding_mode: Optional[str]):
+        # Translate a rounding-mode name into the matching ir.ROUNDING_MODE
+        # member. None is passed through; 'rtne' and 'rtz' are the only
+        # accepted names and anything else raises ValueError.
+        if rounding_mode is None:
+            return None
+        if rounding_mode == 'rtne':
+            mode = ir.ROUNDING_MODE.RTNE
+        elif rounding_mode == 'rtz':
+            mode = ir.ROUNDING_MODE.RTZ
+        else:
+            raise ValueError(f"Invalid rounding mode: {rounding_mode}. Supported rounding modes are 'rtne' and 'rtz'.")
+        return mode
+
+    def bitcast(self, input: TensorTy, dst_ty: tl.dtype) -> TensorTy:
+        # Reinterpret the bits of `input` as `dst_ty`.
+        # Returns `input` unchanged when the types already match, and defers
+        # to `cast` when either side is a pointer.
+        # Raises ValueError when source and destination bit widths differ.
+        src_ty = input.type
+        # For block inputs, only the element type is replaced; the block shape is kept.
+        if src_ty.is_block():
+            dst_ty = src_ty.with_element_ty(dst_ty.scalar)
+        if src_ty == dst_ty:
+            return input
+        src_sca_ty = src_ty.scalar
+        dst_sca_ty = dst_ty.scalar
+        if src_sca_ty.is_ptr() or dst_sca_ty.is_ptr():
+            return self.cast(input, dst_ty)
+        # Bitcast
+        src_bits = src_sca_ty.primitive_bitwidth
+        dst_bits = dst_sca_ty.primitive_bitwidth
+        if src_bits != dst_bits:
+            raise ValueError("Cannot bitcast data-type of size " + str(src_bits) + " to "
+                             "data-type of size " + str(dst_bits))
+        return self.tensor(self.builder.create_bitcast(input.handle, dst_ty.to_ir(self.builder)), dst_ty)
+
+    def cast(self, input: TensorTy, dst_ty: tl.dtype, fp_downcast_rounding: Optional[str] = None) -> TensorTy:
+        """Convert `input` to the scalar type of `dst_ty`, emitting the matching IR conversion.
+
+        When `input` is a block, the result keeps the source block type with the
+        destination scalar as element type. If source and destination scalar types
+        are already equal, `input` is returned unchanged.
+
+        `fp_downcast_rounding` may be 'rtne', 'rtz' or None. It is only accepted for
+        float -> narrower float conversions (where None defaults to RTNE); passing it
+        for any other conversion raises `ValueError`.
+
+        The branches below are tried in order; the first one whose condition matches
+        decides which builder op is emitted. An unsupported combination ends in the
+        final `assert False`.
+        """
+        src_ty = input.type
+        src_sca_ty = src_ty.scalar
+        dst_sca_ty = dst_ty.scalar
+        if src_sca_ty == dst_sca_ty:
+            return input
+        if src_ty.is_block():
+            # Keep the source block type, only swap the element type
+            dst_ty = src_ty.with_element_ty(dst_sca_ty)
+
+        # For fp downcasting default rounding mode should be RTNE, for all other conversions it should
+        # not be set
+        fp_downcast_rounding = self._str_to_rounding_mode(fp_downcast_rounding)
+        use_custom_rounding = False
+        if dst_sca_ty.is_floating() and src_sca_ty.is_floating(
+        ) and dst_sca_ty.primitive_bitwidth < src_sca_ty.primitive_bitwidth:
+            if fp_downcast_rounding is None: fp_downcast_rounding = ir.ROUNDING_MODE.RTNE
+            elif fp_downcast_rounding != ir.ROUNDING_MODE.RTNE: use_custom_rounding = True
+        else:
+            if fp_downcast_rounding is not None:
+                raise ValueError("fp_downcast_rounding should be set only for truncating fp conversions. "
+                                 "Source scalar type is " + str(src_sca_ty) + " and destination type is " +
+                                 str(dst_sca_ty))
+
+        # fp8e4b15 on either side is delegated to the target-provided `convert_custom_types` hook
+        if (src_sca_ty.is_fp8e4b15() or dst_sca_ty.is_fp8e4b15()):
+            assert self.builder.codegen_fns.get(
+                "convert_custom_types") is not None, "target doesn't provide conversion for this type."
+            return self.builder.codegen_fns["convert_custom_types"](input, dst_ty, fp_downcast_rounding, _semantic=self)
+        # Casting with customized floating types involved: fp8 <=> bf16, fp16, fp32, fp64
+        # and non-default rounding modes for downcasting
+        if (src_sca_ty.is_fp8() and dst_sca_ty.is_floating()) or \
+           (src_sca_ty.is_floating() and dst_sca_ty.is_fp8()) or \
+           use_custom_rounding:
+            return self.tensor(
+                self.builder.create_fp_to_fp(input.handle, dst_ty.to_ir(self.builder), fp_downcast_rounding), dst_ty)
+
+        # fp16 / bf16 => anything other than fp32: convert to fp32 first, then recurse
+        # from fp32 to the requested scalar type
+        if (src_sca_ty.is_fp16() and not dst_sca_ty.is_fp32()) or \
+           (src_sca_ty.is_bf16() and not dst_sca_ty.is_fp32()):
+            return self.cast(self.cast(input, tl.float32), dst_sca_ty)
+
+        # Standard floating types' casting: truncation
+        #   fp64 => fp32, fp16, bf16
+        #   fp32 => fp16, bf16
+        truncate_fp = src_sca_ty.is_floating() and \
+            dst_sca_ty.is_floating() and \
+            src_sca_ty.primitive_bitwidth > dst_sca_ty.primitive_bitwidth
+        if truncate_fp:
+            return self.tensor(self.builder.create_fp_trunc(input.handle, dst_ty.to_ir(self.builder)), dst_ty)
+
+        # Standard floating types' casting: extension
+        #   fp32 => fp64
+        #   fp16 => fp32, fp64
+        #   bf16 => fp32, fp64
+        ext_fp = src_sca_ty.is_floating() and \
+            dst_sca_ty.is_floating() and \
+            src_sca_ty.primitive_bitwidth < dst_sca_ty.primitive_bitwidth
+        if ext_fp:
+            return self.tensor(self.builder.create_fp_ext(input.handle, dst_ty.to_ir(self.builder)), dst_ty)
+
+        # Casting between integer types (different width and/or signedness)
+        if src_sca_ty.is_int() and dst_sca_ty.is_int() and \
+           (src_sca_ty.int_bitwidth != dst_sca_ty.int_bitwidth or src_sca_ty.int_signedness != dst_sca_ty.int_signedness):
+            # Sign-extend only for signed, non-bool sources
+            sign_extend = src_sca_ty.is_int_signed() and not src_sca_ty.is_bool()
+            if dst_sca_ty.is_bool():
+                # int => bool is expressed as `input != 0`
+                ty = input.dtype.to_ir(self.builder)
+                _0 = self.tensor(self.builder.get_null_value(ty), input.dtype)
+                return self.not_equal(input, _0)
+            else:
+                return self.tensor(self.builder.create_int_cast(input.handle, dst_ty.to_ir(self.builder), sign_extend),
+                                   dst_ty)
+
+        # Casting standard floating types to integer types
+        if src_sca_ty.is_standard_floating() and dst_sca_ty.is_int():
+            if dst_sca_ty.is_bool():
+                # float => bool is expressed as `input != 0`
+                ty = input.dtype.to_ir(self.builder)
+                _0 = self.tensor(self.builder.get_null_value(ty), input.dtype)
+                return self.not_equal(input, _0)
+            elif dst_sca_ty.is_int_signed():
+                return self.tensor(self.builder.create_fp_to_si(input.handle, dst_ty.to_ir(self.builder)), dst_ty)
+            else:
+                return self.tensor(self.builder.create_fp_to_ui(input.handle, dst_ty.to_ir(self.builder)), dst_ty)
+
+        # Casting integer types to standard floating types
+        if src_sca_ty.is_int() and dst_sca_ty.is_standard_floating():
+            # bool and unsigned sources use the unsigned conversion
+            if src_sca_ty.is_bool() or not src_sca_ty.is_int_signed():
+                return self.tensor(self.builder.create_ui_to_fp(input.handle, dst_ty.to_ir(self.builder)), dst_ty)
+            else:
+                return self.tensor(self.builder.create_si_to_fp(input.handle, dst_ty.to_ir(self.builder)), dst_ty)
+
+        # Casting pointer types to integer types
+        # Only 64-bit and 1-bit destinations are handled; other widths fall through to the final assert
+        if src_sca_ty.is_ptr() and dst_sca_ty.is_int():
+            bitwidth = dst_sca_ty.int_bitwidth
+            if bitwidth == 64:
+                return self.tensor(self.builder.create_ptr_to_int(input.handle, dst_ty.to_ir(self.builder)), dst_ty)
+            if bitwidth == 1:
+                # ptr => bool is expressed as `int64(ptr) != 0`
+                return self.not_equal(self.cast(input, tl.int64), self.tensor(self.builder.get_int64(0), tl.int64))
+
+        # Casting integer types to pointer types
+        if src_sca_ty.is_int() and dst_sca_ty.is_ptr():
+            return self.tensor(self.builder.create_int_to_ptr(input.handle, dst_ty.to_ir(self.builder)), dst_ty)
+
+        # Casting pointer types to pointer types
+        if src_sca_ty.is_ptr() and dst_sca_ty.is_ptr():
+            return self.tensor(self.builder.create_bitcast(input.handle, dst_ty.to_ir(self.builder)), dst_ty)
+
+        # No rule matched
+        assert False, f'cannot cast {input} to {dst_ty}'
+
+# ===----------------------------------------------------------------------===//
+#                               Memory Operators
+# ===----------------------------------------------------------------------===//
+
+    def _str_to_load_cache_modifier(self, cache_modifier):
+        cache = ir.CACHE_MODIFIER.NONE  # default
+        if cache_modifier:
+            if cache_modifier == ".ca":
+                cache = ir.CACHE_MODIFIER.CA
+            elif cache_modifier == ".cg":
+                cache = ir.CACHE_MODIFIER.CG
+            elif cache_modifier == ".cv":
+                cache = ir.CACHE_MODIFIER.CV
+            else:
+                raise ValueError(f"Cache modifier {cache_modifier} not supported")
+        return cache
+
+    def _str_to_store_cache_modifier(self, cache_modifier):
+        cache = ir.CACHE_MODIFIER.NONE  # default
+        if cache_modifier:
+            if cache_modifier == ".wb":
+                cache = ir.CACHE_MODIFIER.WB
+            elif cache_modifier == ".cg":
+                cache = ir.CACHE_MODIFIER.CG
+            elif cache_modifier == ".cs":
+                cache = ir.CACHE_MODIFIER.CS
+            elif cache_modifier == ".wt":
+                cache = ir.CACHE_MODIFIER.WT
+            else:
+                raise ValueError(f"Cache modifier {cache_modifier} not supported")
+        return cache
+
+    def _str_to_eviction_policy(self, eviction_policy):
+        eviction = ir.EVICTION_POLICY.NORMAL  # default
+        if eviction_policy:
+            if eviction_policy == "evict_last":
+                eviction = ir.EVICTION_POLICY.EVICT_LAST
+            elif eviction_policy == "evict_first":
+                eviction = ir.EVICTION_POLICY.EVICT_FIRST
+            else:
+                raise ValueError(f"Eviction policy {eviction_policy} not supported")
+        return eviction
+
+    def _str_to_padding_option(self, padding_option):
+        padding = None  # default
+        if padding_option:
+            if padding_option == "zero":
+                padding = ir.PADDING_OPTION.PAD_ZERO
+            elif padding_option == "nan":
+                padding = ir.PADDING_OPTION.PAD_NAN
+            else:
+                raise ValueError(f"Padding option {padding_option} not supported")
+        return padding
+
+    def _str_to_sem(self, sem_option):
+        sem = ir.MEM_SEMANTIC.ACQUIRE_RELEASE
+        if sem_option:
+            if sem_option == "acquire":
+                sem = ir.MEM_SEMANTIC.ACQUIRE
+            elif sem_option == "release":
+                sem = ir.MEM_SEMANTIC.RELEASE
+            elif sem_option == "acq_rel":
+                sem = ir.MEM_SEMANTIC.ACQUIRE_RELEASE
+            elif sem_option == "relaxed":
+                sem = ir.MEM_SEMANTIC.RELAXED
+            else:
+                raise ValueError(f"Memory semantic {sem_option} not supported")
+        return sem
+
+    def _str_to_scope(self, scope_option):
+        scope = ir.MEM_SYNC_SCOPE.GPU
+        if scope_option:
+            if scope_option == "gpu":
+                scope = ir.MEM_SYNC_SCOPE.GPU
+            elif scope_option == "cta":
+                scope = ir.MEM_SYNC_SCOPE.CTA
+            elif scope_option == "sys":
+                scope = ir.MEM_SYNC_SCOPE.SYSTEM
+            else:
+                raise ValueError(f"Memory semantic {scope_option} not supported")
+        return scope
+
+    def _canonicalize_boundary_check(self, boundary_check, block_shape):
+        if boundary_check:
+            if not hasattr(boundary_check, "__iter__"):
+                boundary_check = [boundary_check]
+            boundary_check = [elem.value if isinstance(elem, tl.constexpr) else elem for elem in boundary_check]
+            for dim in boundary_check:
+                assert isinstance(dim, int) and 0 <= dim < len(block_shape)
+            assert len(boundary_check) > 0
+            assert len(boundary_check) == len(set(boundary_check)), "Duplicate dimension in `boundary_check`"
+            return sorted(boundary_check)
+        return ()
+
+    def _load_block_pointer(self, ptr, mask, other, boundary_check, padding, cache, eviction, is_volatile):
+        # Load by a block pointer: `pointer_type<block_type<>>`
+        # Block pointer can not have `mask` and `other` arguments
+        if mask is not None or other is not None:
+            raise ValueError("`mask` and `other` arguments cannot be specified for loading block pointers")
+
+        elt_ty = ptr.type.element_ty.element_ty
+        assert elt_ty != tl.int1, "`tl.int1` should be rewritten in `tl.make_block_ptr`"
+        if elt_ty.is_int() and padding == ir.PADDING_OPTION.PAD_NAN:
+            raise ValueError("Padding option `nan` is not supported for integer block pointers")
+
+        # `dst_ty` is de-referenced type of the pointer type
+        dst_ty = ptr.type.element_ty
+
+        # Check `boundary_check` argument
+        boundary_check = self._canonicalize_boundary_check(boundary_check, dst_ty.get_block_shapes())
+
+        # Build IR
+        return self.tensor(
+            self.builder.create_tensor_pointer_load(ptr.handle, boundary_check, padding, cache, eviction, is_volatile),
+            dst_ty)
+
+    def _load_legacy(self, ptr, mask, other, boundary_check, padding, cache, eviction, is_volatile):
+        # Load by a tensor of pointers or a pointer of scalar: `block_type<pointer_type<>>` or `pointer_type<>`
+        if not ptr.type.scalar.is_ptr():
+            raise ValueError(f"Unsupported ptr type {ptr.type.__repr__()} in `tl.load`")
+
+        # Check `mask`, `other`, `boundary_check`, and `padding` arguments
+        if mask is None and other is not None:
+            raise ValueError("`other` cannot be provided without `mask`")
+        if padding or boundary_check:
+            raise ValueError("`padding_option` or `boundary_check` argument is not supported for loading a tensor of"
+                             "pointers or loading a scalar. Because the compiler does not know the boundary; please "
+                             "use block pointers (defined by `make_block_ptr`) instead")
+
+        # For a pointer of scalar, check the type of `mask` and `other`
+        if not ptr.type.is_block():
+            if mask and mask.type.is_block():
+                raise ValueError("Mask argument cannot be block type if pointer argument is not a block")
+            if other and other.type.is_block():
+                raise ValueError("Other argument cannot be block type if pointer argument is not a block")
+
+        # Make `mask` and `other` into the same shape as `ptr`
+        if ptr.type.is_block():
+            if mask is not None:
+                ptr, mask = self.broadcast_impl_value(ptr, mask)
+            if other is not None:
+                ptr, other = self.broadcast_impl_value(ptr, other)
+
+        # Get `pointer_type<elt_ty>` and `elt_ty`
+        ptr_ty = ptr.type.scalar
+        elt_ty = ptr_ty.element_ty
+
+        # Treat `pointer_type<tl.int1>` as `pointer_type<tl.int8>`
+        is_bool = elt_ty == tl.int1
+        if is_bool:
+            elt_ty = tl.int8
+            ptr_ty = tl.pointer_type(elt_ty, ptr_ty.address_space)
+            ptr = self.cast(ptr, ptr_ty)
+
+        # Cast `other` into `elt_ty` type
+        if other is not None:
+            other = self.cast(other, elt_ty)
+
+        # Create loaded result type `dst_ty`
+        if ptr.type.is_block():
+            dst_ty = ptr.type.with_element_ty(elt_ty)
+        else:
+            # Load by de-referencing the pointer of scalar
+            dst_ty = elt_ty
+
+        # Build IR
+        if mask is None:
+            ret = self.tensor(self.builder.create_load(ptr.handle, cache, eviction, is_volatile), dst_ty)
+        else:
+            ret = self.tensor(
+                self.builder.create_masked_load(ptr.handle, mask.handle, other.handle if other else None, cache,
+                                                eviction, is_volatile), dst_ty)
+        if is_bool:
+            ret = self.cast(ret, tl.int1)
+        return ret
+
+    def load(self, ptr: TensorTy, mask: Optional[TensorTy], other: Optional[TensorTy], boundary_check: Tuple,
+             padding_option: str, cache_modifier: str, eviction_policy: str, is_volatile: bool) -> TensorTy:
+        # Cache, eviction and padding options
+        cache = self._str_to_load_cache_modifier(cache_modifier)
+        eviction = self._str_to_eviction_policy(eviction_policy)
+        padding = self._str_to_padding_option(padding_option)
+
+        if ptr.type.is_ptr() and ptr.type.element_ty.is_block():
+            # Load by a block pointer: `pointer_type<block_type<>>`
+            return self._load_block_pointer(ptr, mask, other, boundary_check, padding, cache, eviction, is_volatile)
+        else:
+            # Load by a tensor of pointers or a pointer of scalar: `block_type<pointer_type<>>` or `pointer_type<>`
+            return self._load_legacy(ptr, mask, other, boundary_check, padding, cache, eviction, is_volatile)
+
+    def descriptor_load(self, desc: tl.tensor_descriptor_base, offsets, cache_modifier: str,
+                        eviction_policy: str) -> TensorTy:
+        assert isinstance(desc, tl.tensor_descriptor_base)
+        ndim = len(desc.block_shape)
+        assert len(offsets) == ndim, f"expected {ndim} offsets, but got {len(offsets)}"
+
+        offsets = self._convert_to_ir_values(offsets, require_i64=False)
+        x = self.builder.create_descriptor_load(desc.handle, offsets, self._str_to_load_cache_modifier(cache_modifier),
+                                                self._str_to_eviction_policy(eviction_policy))
+        return self.tensor(x, desc.block_type)
+
+    def validate_store_like(self, desc: tl.tensor_descriptor_base, value: TensorTy, offsets) -> None:
+        assert isinstance(desc, tl.tensor_descriptor_base)
+        ndim = len(desc.block_shape)
+        assert len(offsets) == ndim, f"expected {ndim} offsets, but got {len(offsets)}"
+        assert value.shape == desc.block_shape
+
+    def descriptor_store(self, desc: tl.tensor_descriptor_base, value: TensorTy, offsets) -> TensorTy:
+        self.validate_store_like(desc, value, offsets)
+        # implicitly cast to the descriptor's type
+        value = self.cast(value, desc.dtype)
+        offsets = self._convert_to_ir_values(offsets, require_i64=False)
+        return self.tensor(self.builder.create_descriptor_store(desc.handle, value.handle, offsets), tl.void)
+
+    def descriptor_atomic_add(self, desc: tl.tensor_descriptor_base, value: TensorTy, offsets) -> TensorTy:
+        self.validate_store_like(desc, value, offsets)
+        assert desc.dtype in {tl.uint32, tl.int32, tl.uint64, tl.float32, tl.float16, tl.bfloat16}, "Unsupported dtype"
+        offsets = self._convert_to_ir_values(offsets, require_i64=False)
+        kind = ir.DESCRIPTOR_REDUCE_KIND.ADD
+        return self.tensor(self.builder.create_descriptor_reduce(kind, desc.handle, value.handle, offsets), tl.void)
+
+    def _has_native_tma(self, ):
+        target = driver.active.get_current_target()
+        return (target.backend == "cuda" and target.arch >= 90)
+
+    def _descriptor_atomic_min_max_supported(self, dtype):
+        assert dtype in {tl.uint32, tl.int32, tl.uint64, tl.int64, tl.float16, tl.bfloat16}, "Unsupported dtype"
+        if dtype in {tl.float16, tl.bfloat16}:
+            assert self._has_native_tma(), "16-bit float types require native tma support"
+
+    def descriptor_atomic_min(self, desc: tl.tensor_descriptor_base, value: TensorTy, offsets) -> TensorTy:
+        self.validate_store_like(desc, value, offsets)
+        self._descriptor_atomic_min_max_supported(desc.dtype)
+        offsets = self._convert_to_ir_values(offsets, require_i64=False)
+        kind = ir.DESCRIPTOR_REDUCE_KIND.MIN
+        return self.tensor(self.builder.create_descriptor_reduce(kind, desc.handle, value.handle, offsets), tl.void)
+
+    def descriptor_atomic_max(self, desc: tl.tensor_descriptor_base, value: TensorTy, offsets) -> TensorTy:
+        self.validate_store_like(desc, value, offsets)
+        self._descriptor_atomic_min_max_supported(desc.dtype)
+        offsets = self._convert_to_ir_values(offsets, require_i64=False)
+        kind = ir.DESCRIPTOR_REDUCE_KIND.MAX
+        return self.tensor(self.builder.create_descriptor_reduce(kind, desc.handle, value.handle, offsets), tl.void)
+
+    def descriptor_atomic_and(self, desc: tl.tensor_descriptor_base, value: TensorTy, offsets) -> TensorTy:
+        self.validate_store_like(desc, value, offsets)
+        assert desc.dtype in {tl.uint32, tl.int32, tl.uint64, tl.int64}, "Unsupported dtype"
+        offsets = self._convert_to_ir_values(offsets, require_i64=False)
+        kind = ir.DESCRIPTOR_REDUCE_KIND.AND
+        return self.tensor(self.builder.create_descriptor_reduce(kind, desc.handle, value.handle, offsets), tl.void)
+
+    def descriptor_atomic_or(self, desc: tl.tensor_descriptor_base, value: TensorTy, offsets) -> TensorTy:
+        self.validate_store_like(desc, value, offsets)
+        assert desc.dtype in {tl.uint32, tl.int32, tl.uint64, tl.int64}, "Unsupported dtype"
+        offsets = self._convert_to_ir_values(offsets, require_i64=False)
+        kind = ir.DESCRIPTOR_REDUCE_KIND.OR
+        return self.tensor(self.builder.create_descriptor_reduce(kind, desc.handle, value.handle, offsets), tl.void)
+
+    def descriptor_atomic_xor(self, desc: tl.tensor_descriptor_base, value: TensorTy, offsets) -> TensorTy:
+        self.validate_store_like(desc, value, offsets)
+        assert desc.dtype in {tl.uint32, tl.int32, tl.uint64, tl.int64}, "Unsupported dtype"
+        offsets = self._convert_to_ir_values(offsets, require_i64=False)
+        kind = ir.DESCRIPTOR_REDUCE_KIND.XOR
+        return self.tensor(self.builder.create_descriptor_reduce(kind, desc.handle, value.handle, offsets), tl.void)
+
+    def descriptor_gather(self, desc, x_offsets, y_offset, cache_modifier: str, eviction_policy: str) -> TensorTy:
+        assert isinstance(desc, tl.tensor_descriptor_base)
+        assert cache_modifier == "", "cache modifier is not supported yet"
+        assert eviction_policy == "", "eviction policy is not supported yet"
+
+        # Validate descriptor.
+        assert len(desc.block_shape) == 2, f"descriptor must be 2D, but got {desc.block_shape}"
+        assert desc.block_shape[0] == 1, f"descriptor block must have 1 row, but got {desc.block_shape}"
+
+        # Validate offsets.
+        assert len(x_offsets.shape) == 1, f"x offsets must be 1D, but got {x_offsets.shape}"
+
+        # Validate minimum block size.
+        assert x_offsets.shape[0] >= 8, f"descriptor gather must have at least 8 rows, but got {x_offsets.shape}"
+        dtype = desc.dtype
+        min_cols = 32 // dtype.primitive_bitwidth * 8
+        assert desc.block_shape[
+            1] >= min_cols, f"descriptor gather of {dtype} must have at least {min_cols} columns, but got {desc.block_shape[1]}"
+
+        type = tl.block_type(desc.dtype, [x_offsets.shape[0], desc.block_shape[1]])
+        y_offset = self._convert_to_ir_values((y_offset, ), require_i64=False)[0]
+        x = self.builder.create_descriptor_gather(desc.handle, x_offsets.handle, y_offset, type.to_ir(self.builder))
+        return self.tensor(x, type)
+
+    def descriptor_scatter(self, desc, value: TensorTy, x_offsets, y_offset) -> TensorTy:
+        assert isinstance(desc, tl.tensor_descriptor_base)
+
+        # Validate descriptor.
+        assert len(desc.block_shape) == 2, f"descriptor must be 2D, but got {desc.block_shape}"
+        assert desc.block_shape[0] == 1, f"descriptor block must have 1 row, but got {desc.block_shape}"
+
+        # Validate offsets.
+        assert len(x_offsets.shape) == 1, f"x offsets must be 1D, but got {x_offsets.shapae}"
+
+        # Validate minimum block size.
+        assert x_offsets.shape[0] >= 8, f"descriptor scatter must have at least 8 rows, but got {x_offsets.shape}"
+        dtype = desc.dtype
+        min_cols = 32 // dtype.primitive_bitwidth * 8
+        assert desc.block_shape[
+            1] >= min_cols, f"descriptor scatter of {dtype} must have at least {min_cols} columns, but got {desc.block_shape[1]}"
+
+        y_offset = self._convert_to_ir_values((y_offset, ), require_i64=False)[0]
+        self.builder.create_descriptor_scatter(desc.handle, value.handle, x_offsets.handle, y_offset)
+        return self.tensor(None, tl.void)
+
+    def _store_block_pointer(self, ptr, val, mask, boundary_check, cache, eviction):
+        # Store by a block pointer: `pointer_type<block_type<>>`
+        # Block pointers can not have the `mask` argument
+        if mask is not None:
+            raise ValueError("`mask` and `other` arguments cannot be specified for loading block pointers")
+
+        # Check same shape and element type
+        block_shape = ptr.type.element_ty.get_block_shapes()
+        if not val.type.is_block():
+            val = self.broadcast_impl_shape(val, block_shape)
+        assert val.type.is_block(), "Value argument must be block type or a scalar"
+        assert block_shape == val.type.get_block_shapes(
+        ), f"Block shape({block_shape}) and value shape({val.type.get_block_shapes()}) mismatch"
+        assert ptr.type.element_ty.element_ty == val.type.element_ty, f"Block element type({ptr.type.element_ty.element_ty}) and value element type({val.type.element_ty}) mismatch"
+
+        elt_ty = ptr.type.element_ty.element_ty
+        assert elt_ty != tl.int1, "`tl.int1` should be rewritten in `tl.make_block_ptr`"
+
+        # Check `boundary_check` argument
+        boundary_check = self._canonicalize_boundary_check(boundary_check, block_shape)
+
+        # Cast to target data type
+        val = self.cast(val, elt_ty)
+
+        # Build IR
+        return self.tensor(
+            self.builder.create_tensor_pointer_store(ptr.handle, val.handle, boundary_check, cache, eviction), tl.void)
+
+    def _store_legacy(self, ptr, val, mask, boundary_check, cache, eviction):
+        # Store by a tensor of pointers or a pointer of scalar: `block_type<pointer_type<>>` or `pointer_type<>`
+        if not ptr.type.scalar.is_ptr():
+            raise ValueError(f"Unsupported ptr type {ptr.type.__repr__()} in `tl.store`")
+
+        # Check `boundary_check` argument
+        if boundary_check:
+            raise ValueError("`boundary_check` argument is not supported for storing a tensor of pointers or storing a "
+                             "scalar. Because the compiler does not know the boundary; please use block pointers "
+                             "(defined by `make_block_ptr`) instead")
+
+        # For a pointer of scalar, check the type of `val` and `mask`
+        if not ptr.type.is_block():
+            if val.type.is_block():
+                raise ValueError("Value argument cannot be block type if pointer argument is not a block")
+            if mask and mask.type.is_block():
+                raise ValueError("Mask argument cannot be block type if pointer argument is not a block")
+
+        # Make `mask` and `val` into the same shape as `ptr`
+        if ptr.type.is_block():
+            ptr_shape = ptr.shape
+            if mask is None:
+                ptr, val = self.broadcast_tensors(ptr, val)
+            else:
+                ptr, val, mask = self.broadcast_tensors(ptr, val, mask)
+            if ptr_shape != ptr.shape:
+                raise ValueError(f"Expected pointer argument to have shape {ptr.shape} but got {ptr_shape}")
+
+        ptr_ty = ptr.type.scalar
+        elt_ty = ptr_ty.element_ty
+
+        # Treat `pointer_type<tl.int1>` as `pointer_type<tl.int8>`
+        if elt_ty == tl.int1:
+            elt_ty = tl.int8
+            ptr_ty = tl.pointer_type(elt_ty, ptr_ty.address_space)
+            ptr = self.cast(ptr, ptr_ty)
+
+        # Cast to target data type
+        val = self.cast(val, elt_ty)
+
+        # Build IR
+        if mask is None:
+            return self.tensor(self.builder.create_store(ptr.handle, val.handle, cache, eviction), tl.void)
+        if not mask.type.scalar.is_bool():
+            raise ValueError("Mask must have boolean scalar type")
+        return self.tensor(self.builder.create_masked_store(ptr.handle, val.handle, mask.handle, cache, eviction),
+                           tl.void)
+
+    def store(self, ptr: TensorTy, val: TensorTy, mask: Optional[TensorTy], boundary_check, cache_modifier: str,
+              eviction_policy: str) -> TensorTy:
+        # Cache and eviction options
+        cache = self._str_to_store_cache_modifier(cache_modifier)
+        eviction = self._str_to_eviction_policy(eviction_policy)
+
+        if ptr.type.is_const() or ptr.type.scalar.is_const():
+            raise ValueError("Cannot store to a constant pointer")
+
+        if ptr.type.is_ptr() and ptr.type.element_ty.is_block():
+            # Store by a block pointer: `pointer_type<block_type<>>`
+            return self._store_block_pointer(ptr, val, mask, boundary_check, cache, eviction)
+        else:
+            # Store by a tensor of pointers or a pointer of scalar: `block_type<pointer_type<>>` or `pointer_type<>`
+            return self._store_legacy(ptr, val, mask, boundary_check, cache, eviction)
+
+#########
+# atomic
+#########
+
+    def atomic_cas(self, ptr: TensorTy, cmp: TensorTy, val: TensorTy, sem: str, scope: str) -> TensorTy:
+        sem = self._str_to_sem(sem)
+        scope = self._str_to_scope(scope)
+        element_ty = ptr.type.scalar.element_ty
+        if element_ty.primitive_bitwidth not in [16, 32, 64]:
+            raise ValueError("atomic_cas only supports elements with width {16, 32, 64}")
+        return self.tensor(self.builder.create_atomic_cas(ptr.handle, cmp.handle, val.handle, sem, scope), val.type)
+
+    def atom_red_typechecking_impl(self, ptr: TensorTy, val: TensorTy, mask: TensorTy,
+                                   op: str) -> Tuple[TensorTy, TensorTy, TensorTy]:
+        if not ptr.type.scalar.is_ptr():
+            raise ValueError("Pointer argument of store instruction is " + ptr.type.__repr__())
+        if ptr.type.is_const() or ptr.type.element_ty.is_const():
+            raise ValueError("Cannot store to a constant pointer")
+        element_ty = ptr.type.scalar.element_ty
+        if element_ty is tl.float16 and op != 'add':
+            raise ValueError("atomic_" + op + " does not support fp16")
+        if element_ty is tl.bfloat16 and op != 'add':
+            raise ValueError("atomic_" + op + " does not support bf16")
+        if element_ty in [tl.int16, tl.uint16] or element_ty.primitive_bitwidth < 16:
+            raise ValueError("atomic_" + op + " does not support " + str(element_ty))
+        if ptr.type.is_block():
+            if mask is not None:
+                mask = self.broadcast_impl_shape(mask, ptr.type.get_block_shapes())
+            if val is not None:
+                val = self.broadcast_impl_shape(val, ptr.type.get_block_shapes())
+        val = self.cast(val, ptr.type.scalar.element_ty)
+        if mask is None:
+            mask_ir = self.builder.get_int1(True)
+            mask_ty = tl.int1
+            if ptr.type.is_block():
+                mask_ty = ptr.type.with_element_ty(tl.int1)
+                mask_ir = self.builder.create_splat(mask_ty.to_ir(self.builder), mask_ir)
+            mask = self.tensor(mask_ir, mask_ty)
+        return ptr, val, mask
+
+    def _signbit(self, x: TensorTy) -> TensorTy:
+        bitwidth = x.dtype.primitive_bitwidth
+        idtype = tl.get_int_dtype(bitwidth=bitwidth, signed=False)
+        ix = self.bitcast(x, idtype)
+        signbit = self.lshr(ix, bitwidth - 1)
+        return self.cast(signbit, tl.int1)
+
+    def atomic_max(self, ptr: TensorTy, val: TensorTy, mask: TensorTy, sem: str, scope: str) -> TensorTy:
+        """Emit an atomic max of `val` into `ptr`.
+
+        Operands are first validated and normalised by `atom_red_typechecking_impl`.
+        Integer values map directly to a signed (`MAX`) or unsigned (`UMAX`) atomic.
+        float32/float64 values are handled by bitcasting to integers and issuing two
+        masked integer atomics selected by the sign bit of `val`; any other dtype
+        raises `TypeError`.
+        """
+        ptr, val, mask = self.atom_red_typechecking_impl(ptr, val, mask, 'max')
+        sem = self._str_to_sem(sem)
+        scope = self._str_to_scope(scope)
+        sca_ty = val.type.scalar
+        # direct call to atomic_max for integers
+        if sca_ty.is_int():
+            if sca_ty.is_int_signed():
+                return self.tensor(
+                    self.builder.create_atomic_rmw(ir.ATOMIC_OP.MAX, ptr.handle, val.handle, mask.handle, sem, scope),
+                    val.type)
+            else:
+                return self.tensor(
+                    self.builder.create_atomic_rmw(ir.ATOMIC_OP.UMAX, ptr.handle, val.handle, mask.handle, sem, scope),
+                    val.type)
+        # for float
+        # return atomic_smax(i_ptr, i_val) if val >= 0
+        # return atomic_umin(i_ptr, i_val) if val < 0
+        if sca_ty not in {tl.float32, tl.float64}:
+            raise TypeError(f"atomic_max not supported for dtype {sca_ty}")
+
+        # Signed and unsigned integer views (same bit width) of both the value and the pointer.
+        # The second argument of `tl.pointer_type` is the address space (1 here).
+        i_type = tl.int32 if sca_ty == tl.float32 else tl.int64
+        i_val = self.bitcast(val, i_type)
+        i_ptr = self.bitcast(ptr, tl.pointer_type(i_type, 1))
+        ui_type = tl.uint32 if sca_ty == tl.float32 else tl.uint64
+        ui_val = self.bitcast(val, ui_type)
+        ui_ptr = self.bitcast(ptr, tl.pointer_type(ui_type, 1))
+        # `neg` is true where the sign bit of `val` is set, `pos` where it is clear
+        neg = self._signbit(val)
+        pos = self.not_(neg)
+        # Two atomics are emitted; each one is active only on the lanes selected by its mask
+        pos_ret = self.tensor(
+            self.builder.create_atomic_rmw(ir.ATOMIC_OP.MAX, i_ptr.handle, i_val.handle,
+                                           self.and_(mask, pos).handle, sem, scope), i_val.type)
+        neg_ret = self.tensor(
+            self.builder.create_atomic_rmw(ir.ATOMIC_OP.UMIN, ui_ptr.handle, ui_val.handle,
+                                           self.and_(mask, neg).handle, sem, scope), ui_val.type)
+        # Pick the result of whichever atomic ran for each lane and reinterpret it as the float type
+        ret = self.where(pos, pos_ret, neg_ret)
+        return self.bitcast(ret, sca_ty)
+
+    def atomic_min(self, ptr: TensorTy, val: TensorTy, mask: TensorTy, sem: str, scope: str) -> TensorTy:
+        ptr, val, mask = self.atom_red_typechecking_impl(ptr, val, mask, 'min')
+        sem = self._str_to_sem(sem)
+        scope = self._str_to_scope(scope)
+        sca_ty = val.type.scalar
+        # direct call to atomic_min for integers
+        if sca_ty.is_int():
+            if sca_ty.is_int_signed():
+                return self.tensor(
+                    self.builder.create_atomic_rmw(ir.ATOMIC_OP.MIN, ptr.handle, val.handle, mask.handle, sem, scope),
+                    val.type)
+            else:
+                return self.tensor(
+                    self.builder.create_atomic_rmw(ir.ATOMIC_OP.UMIN, ptr.handle, val.handle, mask.handle, sem, scope),
+                    val.type)
+        # for float
+        # return atomic_smin(i_ptr, i_val) if val >= 0
+        # return atomic_umax(i_ptr, i_val) if val < 0
+        if sca_ty not in {tl.float32, tl.float64}:
+            raise TypeError(f"atomic_min not supported for dtype {sca_ty}")
+
+        i_type = tl.int32 if sca_ty == tl.float32 else tl.int64
+        i_val = self.bitcast(val, i_type)
+        i_ptr = self.bitcast(ptr, tl.pointer_type(i_type, 1))
+        ui_type = tl.uint32 if sca_ty == tl.float32 else tl.uint64
+        ui_val = self.bitcast(val, ui_type)
+        ui_ptr = self.bitcast(ptr, tl.pointer_type(ui_type, 1))
+        neg = self._signbit(val)
+        pos = self.not_(neg)
+        pos_ret = self.tensor(
+            self.builder.create_atomic_rmw(ir.ATOMIC_OP.MIN, i_ptr.handle, i_val.handle,
+                                           self.and_(mask, pos).handle, sem, scope), i_val.type)
+        neg_ret = self.tensor(
+            self.builder.create_atomic_rmw(ir.ATOMIC_OP.UMAX, ui_ptr.handle, ui_val.handle,
+                                           self.and_(mask, neg).handle, sem, scope), ui_ptr.type)
+        ret = self.where(pos, pos_ret, neg_ret)
+        return self.bitcast(ret, sca_ty)
+
+    def atomic_add(self, ptr: TensorTy, val: TensorTy, mask: TensorTy, sem: str, scope: str) -> TensorTy:
+        ptr, val, mask = self.atom_red_typechecking_impl(ptr, val, mask, 'add')
+        sem = self._str_to_sem(sem)
+        scope = self._str_to_scope(scope)
+        sca_ty = val.type.scalar
+        op = ir.ATOMIC_OP.FADD if sca_ty.is_floating() else ir.ATOMIC_OP.ADD
+        return self.tensor(self.builder.create_atomic_rmw(op, ptr.handle, val.handle, mask.handle, sem, scope),
+                           val.type)
+
+    def atomic_and(self, ptr: TensorTy, val: TensorTy, mask: TensorTy, sem: str, scope: str) -> TensorTy:
+        ptr, val, mask = self.atom_red_typechecking_impl(ptr, val, mask, 'and')
+        sem = self._str_to_sem(sem)
+        scope = self._str_to_scope(scope)
+        return self.tensor(
+            self.builder.create_atomic_rmw(ir.ATOMIC_OP.AND, ptr.handle, val.handle, mask.handle, sem, scope), val.type)
+
+    def atomic_or(self, ptr: TensorTy, val: TensorTy, mask: TensorTy, sem: str, scope: str) -> TensorTy:
+        ptr, val, mask = self.atom_red_typechecking_impl(ptr, val, mask, 'or')
+        sem = self._str_to_sem(sem)
+        scope = self._str_to_scope(scope)
+        return self.tensor(
+            self.builder.create_atomic_rmw(ir.ATOMIC_OP.OR, ptr.handle, val.handle, mask.handle, sem, scope), val.type)
+
+    def atomic_xor(self, ptr: TensorTy, val: TensorTy, mask: TensorTy, sem: str, scope: str) -> TensorTy:
+        ptr, val, mask = self.atom_red_typechecking_impl(ptr, val, mask, 'xor')
+        sem = self._str_to_sem(sem)
+        scope = self._str_to_scope(scope)
+        return self.tensor(
+            self.builder.create_atomic_rmw(ir.ATOMIC_OP.XOR, ptr.handle, val.handle, mask.handle, sem, scope), val.type)
+
+    def atomic_xchg(self, ptr: TensorTy, val: TensorTy, mask: TensorTy, sem: str, scope: str) -> TensorTy:
+        ptr, val, mask = self.atom_red_typechecking_impl(ptr, val, mask, 'xchg')
+        sem = self._str_to_sem(sem)
+        scope = self._str_to_scope(scope)
+        return self.tensor(
+            self.builder.create_atomic_rmw(ir.ATOMIC_OP.XCHG, ptr.handle, val.handle, mask.handle, sem, scope),
+            val.type)
+
+# ===----------------------------------------------------------------------===//
+#                               Linear Algebra
+# ===----------------------------------------------------------------------===//
+
+    def _str_to_dot_input_precision(self, input_precision):
+        assert input_precision.lower() in self.builder.options.allowed_dot_input_precisions, \
+            f"input_precision must be one of {self.builder.options.allowed_dot_input_precisions}. Got {input_precision}"
+        input_precision = input_precision.upper()
+        if input_precision == "TF32X3":
+            input_precision = "TF32x3"
+        if input_precision == "BF16X3":
+            input_precision = "BF16x3"
+        if input_precision == "BF16X6":
+            input_precision = "BF16x6"
+        return getattr(ir.INPUT_PRECISION, input_precision)
+
+    def dot(self, lhs: TensorTy, rhs: TensorTy, acc: TensorTy, input_precision: Optional[str],
+            max_num_imprecise_acc: int, out_dtype: tl.dtype) -> TensorTy:  # build a dot (matmul) op with an optional accumulator
+        assert lhs.type.is_block() and rhs.type.is_block()
+
+        if lhs.dtype.is_fp8() and rhs.dtype.is_fp8():
+            # All combinations of supported fp8 x fp8 are permitted
+            pass
+        else:  # otherwise both operands must share one of the dtypes listed below
+            assert lhs.dtype in (tl.int8, tl.uint8, tl.float16, tl.bfloat16, tl.float32,
+                                 tl.float64), f"Unsupported lhs dtype {lhs.dtype}"
+            assert rhs.dtype in (tl.int8, tl.uint8, tl.float16, tl.bfloat16, tl.float32,
+                                 tl.float64), f"Unsupported rhs dtype {rhs.dtype}"
+            assert lhs.dtype == rhs.dtype, f"Both operands must be same dtype. Got {lhs.dtype} and {rhs.dtype}"
+
+        if lhs.dtype.is_fp8e4b15() or rhs.dtype.is_fp8e4b15():
+            if "fp8e4b15" in self.builder.options.deprecated_fp8_dot_operand_dtypes:
+                warnings.warn(
+                    "the use of fp8e4b15 is deprecated on Hopper and later architectures and can cause significant slow down. It will be removed in a future triton release"
+                )
+            # We upcast because there's no fp8e4b15 type in MLIR
+            lhs = self.cast(lhs, tl.float16)
+            rhs = self.cast(rhs, tl.float16)
+
+        uses_fp8e4b8 = lhs.dtype.is_fp8e4b8() or rhs.dtype.is_fp8e4b8()
+        uses_fp8e5b16 = lhs.dtype.is_fp8e5b16() or rhs.dtype.is_fp8e5b16()
+        if uses_fp8e4b8 or uses_fp8e5b16:
+            type_name = "fp8e4b8" if uses_fp8e4b8 else "fp8e5b16"
+            if type_name in self.builder.options.deprecated_fp8_dot_operand_dtypes:  # upcast only when the target flags the type
+                arch = self.builder.options.arch
+                warnings.warn(
+                    f"{type_name} is AMD gfx942 specific and not supported on {arch} so it's upcasted to fp16 and can cause significant slow down. "
+                    f"Please use OCP fp8 variants on {arch} for performance")
+                lhs = self.cast(lhs, tl.float16)
+                rhs = self.cast(rhs, tl.float16)
+
+        if input_precision is None:  # fall back to the default from the builder options
+            input_precision = self.builder.options.default_dot_input_precision
+
+        input_precision = self._str_to_dot_input_precision(input_precision)
+
+        lhs_rank = len(lhs.shape)
+        rhs_rank = len(rhs.shape)
+        assert lhs_rank == rhs_rank == 2 or lhs_rank == rhs_rank == 3, f"Both inputs must be either 2D or 3D; (lhs: {lhs.shape} vs rhs: {rhs.shape})"
+        assert lhs.shape[-1].value == rhs.shape[
+            -2].value, f"First input shape ({lhs.shape}) and second input shape {rhs.shape} are not compatible for matmul (second index of first shape ({lhs.shape[-1].value}) must be equal to first index of second shape ({rhs.shape[-2].value})"
+        assert self.builder.codegen_fns.get(
+            "min_dot_size") is not None, "target doesn't provide lower shape bounds for dot."
+        min_dot_size = self.builder.codegen_fns["min_dot_size"](lhs.type, rhs.type)  # indexed below as (M, N, K) minimums
+        assert lhs.shape[-2].value >= min_dot_size[0] and lhs.shape[-1].value >= min_dot_size[2] \
+            and rhs.shape[-1].value >= min_dot_size[1], \
+                f"Input shapes should have M >= {min_dot_size[0]}, N >= {min_dot_size[1]} and K >= {min_dot_size[2]}"
+        if lhs.type.scalar.is_int():  # pick the zero constant and the result element type from the operand type
+            assert lhs.type.scalar == tl.int8, "only int8 supported!"  # NOTE(review): uint8 passed the dtype check above
+            _0 = self.builder.get_int32(0)
+            ret_scalar_ty = tl.int32
+        elif out_dtype.is_bf16():
+            raise ValueError(
+                "out_dtype=bfloat16 is unsupported. Please use out_dtype=float32/float16 and cast with `.to(tl.bfloat16)`"
+            )
+        elif lhs.type.scalar.is_fp32() or lhs.type.scalar.is_bf16():
+            _0 = self.builder.get_fp32(0)
+            ret_scalar_ty = tl.float32
+        elif lhs.type.scalar.is_fp64():
+            _0 = self.builder.get_fp64(0)
+            ret_scalar_ty = tl.float64
+        else:  # remaining operand types: the result element type follows out_dtype
+            _0 = self.builder.get_fp16(0) if out_dtype.is_fp16() else self.builder.get_fp32(0)
+            ret_scalar_ty = out_dtype
+
+        M = lhs.type.shape[-2]
+        N = rhs.type.shape[-1]
+        K = lhs.type.shape[-1]
+        B = lhs.type.shape[0] if lhs_rank == 3 else None  # leading dim of a 3D lhs
+        ret_ty = tl.block_type(ret_scalar_ty, [B, M, N] if B else [M, N])
+        if acc is None:  # no accumulator given: start from a splat of the zero constant
+            acc_handle = self.builder.create_splat(ret_ty.to_ir(self.builder), _0)
+        else:
+            acc_handle = acc.handle
+            assert acc.type.shape == ret_ty.shape and acc.type.element_ty == out_dtype
+
+        # max_num_imprecise_acc only applies to fp8 -> fp32 dot on sm_90
+        if max_num_imprecise_acc is None:
+            if lhs.dtype.is_fp8() and rhs.dtype.is_fp8():
+                max_num_imprecise_acc = self.builder.options.max_num_imprecise_acc_default
+            else:
+                max_num_imprecise_acc = 0
+        else:  # an explicit value is only range-checked for fp8 x fp8
+            if lhs.dtype.is_fp8() and rhs.dtype.is_fp8() and max_num_imprecise_acc > K:
+                raise ValueError(f"max_num_imprecise_acc ({max_num_imprecise_acc}) must be <= K ({K})")
+
+        return self.tensor(
+            self.builder.create_dot(lhs.handle, rhs.handle, acc_handle, input_precision, max_num_imprecise_acc), ret_ty)
+
+    def _str_to_fp_type(self, float_format: str):
+        ty_enum = getattr(ir.ScaleDotElemTypeTY, float_format.upper(), None)
+        if ty_enum is None:
+            raise ValueError(f"Invalid float format: {float_format}.")
+        return ty_enum
+
+    def _bitcast_to_fp_type(self, val: TensorTy, float_format: str):
+        """
+        If float_format is subbyte, make sure it's packed as uint8 and return it.
+        Otherwise, return a tensor (perhaps bitcasting) of the specified float format.
+        """
+        triton_ty = {"e5m2": tl.float8e5, "e4m3": tl.float8e4nv, "bf16": tl.bfloat16, "fp16":
+                     tl.float16}.get(float_format)
+        if triton_ty is None:
+            assert float_format == "e2m1", f"Internal Error: Unexpected float format: {float_format}"
+            assert val.dtype == tl.uint8, f"e2m1 format must be packed as uint8. Got {val.dtype}"
+            return val
+        if val.dtype == triton_ty:
+            return val
+        else:
+            unsigned_ty = {"e5m2": tl.uint8, "e4m3": tl.uint8, "bf16": tl.uint16, "fp16": tl.uint16}[float_format]
+            assert val.dtype == unsigned_ty, f"Unexpected dtype for {float_format}. Got {val.dtype}"
+            return self.bitcast(val, triton_ty)
+
+    def verify_scaled_shape(self, M, N, K, lhs_scale, rhs_scale):
+        if lhs_scale is not None:
+            scale_factor = 16 if lhs_scale.dtype.is_fp8e4nv() else 32
+            lhs_scale_shape = lhs_scale.type.shape
+            assert lhs_scale_shape == [
+                M, K // scale_factor
+            ], f"lhs_scale must be a tensor of shape [{M}, {K // scale_factor}]. Got {lhs_scale_shape}"
+        if rhs_scale is not None:
+            scale_factor = 16 if rhs_scale.dtype.is_fp8e4nv() else 32
+            rhs_scale_shape = rhs_scale.type.shape
+            assert rhs_scale_shape == [
+                N, K // scale_factor
+            ], f"rhs_scale must be a tensor of shape [{N}, {K // scale_factor}]. Got {rhs_scale_shape}"
+
+    def dot_scaled(self, lhs: TensorTy, lhs_scale: TensorTy, lhs_format: str, rhs: TensorTy,
+                   rhs_scale: Optional[TensorTy], rhs_format: str, acc: TensorTy | None, fast_math: bool,
+                   lhs_k_pack: bool, rhs_k_pack: bool, out_dtype: tl.dtype) -> TensorTy:  # build a scaled dot op
+        assert lhs.type.is_block() and rhs.type.is_block()
+        # TODO: validate types.
+        lhs_rank = len(lhs.shape)
+        rhs_rank = len(rhs.shape)
+        assert lhs_rank == rhs_rank == 2 or lhs_rank == rhs_rank == 3, f"Both inputs must be either 2D or 3D; (lhs: {lhs.shape} vs rhs: {rhs.shape})"
+        lhs_format: str = lhs_format.value  # the formats arrive wrapped (they expose `.value`); unwrap to plain str
+        rhs_format: str = rhs_format.value
+        lhs_format_enum = self._str_to_fp_type(lhs_format)
+        rhs_format_enum = self._str_to_fp_type(rhs_format)
+        allowed_formats = {"e2m1", "e4m3", "e5m2", "bf16", "fp16"}
+        assert lhs_format in allowed_formats, f"NYI: lhs_format {lhs_format}"
+        assert rhs_format in allowed_formats, f"NYI: rhs_format {rhs_format}"
+        rhs_scale_is_none = rhs_scale is None or (isinstance(rhs_scale, tl.constexpr) and rhs_scale.value is None)
+        lhs_scale_is_none = lhs_scale is None or (isinstance(lhs_scale, tl.constexpr) and lhs_scale.value is None)
+        lhs = self._bitcast_to_fp_type(lhs, lhs_format)  # view the operands as their declared float formats
+        rhs = self._bitcast_to_fp_type(rhs, rhs_format)
+
+        assert lhs_k_pack or lhs_format == "e2m1", "only mxfp4 inputs can be packed along a dimension different than K"
+        assert rhs_k_pack or rhs_format == "e2m1", "only mxfp4 inputs can be packed along a dimension different than K"
+        M, K_LHS = lhs.type.shape[-2:]
+        K_RHS, N = rhs.type.shape[-2:]
+        PACKED_A = 2 if lhs_format == "e2m1" else 1  # e2m1 stores two elements per uint8
+        PACKED_B = 2 if rhs_format == "e2m1" else 1
+        PACKED_A_DIM = PACKED_A * K_LHS if lhs_k_pack else K_LHS  # logical K extent seen from each operand
+        PACKED_B_DIM = PACKED_B * K_RHS if rhs_k_pack else K_RHS
+        assert PACKED_B_DIM == PACKED_A_DIM, f"Reduction dimension should pack the same number of elements; (lhs: {lhs.shape} vs rhs: {rhs.shape})"
+        # NOTE: disabled check, kept for reference: assert K * PACKED_B >= 64, f"scaled_dot NYI for K < 64. Got {K=}"
+        B = lhs.type.shape[0] if lhs_rank == 3 else None  # leading dim of a 3D lhs
+        K = K_LHS
+        if not lhs_k_pack:  # lhs packed along M: unpacking doubles M instead of K
+            M = M * PACKED_A
+        else:
+            K = K * PACKED_A
+        if not rhs_k_pack:  # rhs packed along N: unpacking doubles N
+            N = N * PACKED_B
+        ret_ty = tl.block_type(out_dtype, [B, M, N] if B else [M, N])
+        _0 = self.builder.get_fp32(0)  # NOTE(review): the zero is always fp32, whatever out_dtype is
+        if acc is None:  # no accumulator given: start from a splat of zero
+            acc_handle = self.builder.create_splat(ret_ty.to_ir(self.builder), _0)
+        else:
+            acc_handle = acc.handle
+            assert acc.type.shape == ret_ty.shape and acc.type.element_ty == out_dtype
+        rhs_scale_handle = None if rhs_scale_is_none else rhs_scale.handle
+        lhs_scale_handle = None if lhs_scale_is_none else lhs_scale.handle
+        self.verify_scaled_shape(M, N, K, None if lhs_scale_is_none else lhs_scale,
+                                 None if rhs_scale_is_none else rhs_scale)  # constexpr(None) scales count as absent
+        return self.tensor(
+            self.builder.create_dot_scaled(lhs.handle, lhs_scale_handle, lhs_format_enum, rhs.handle, rhs_scale_handle,
+                                           rhs_format_enum, fast_math, lhs_k_pack, rhs_k_pack, acc_handle), ret_ty)
+
+# ===----------------------------------------------------------------------===//
+#                               Indexing
+# ===----------------------------------------------------------------------===//
+
+    def where(self, condition: TensorTy, x: TensorTy, y: TensorTy) -> TensorTy:
+        if condition.dtype != tl.int1:
+            warnings.warn(
+                f"tl.where with a non-boolean condition is deprecated and will error out in a future triton release. Got {condition.dtype}"
+            )
+        condition = self.cast(condition, tl.int1)
+        x, y = self.binary_op_type_checking_impl(x, y, True, True)
+        # x, y are broadcasted
+        if condition.type.is_block():
+            condition, x = self.broadcast_impl_value(condition, x)
+            x, y = self.broadcast_impl_value(x, y)
+        else:
+            condition, _ = self.broadcast_impl_value(condition, x)
+        ret_ty = x.type
+        return self.tensor(self.builder.create_select(condition.handle, x.handle, y.handle), ret_ty)
+
+# ===----------------------------------------------------------------------===//
+#                               Reduction
+# ===----------------------------------------------------------------------===
+
+    def wrap_tensor(self, x, scalar_ty, ret_shape):
+        if ret_shape:
+            res_ty = tl.block_type(scalar_ty, ret_shape)
+        else:
+            # 0d-tensor -> scalar
+            res_ty = scalar_ty
+        return self.tensor(x, res_ty)
+
+    def reduction(self, inputs: Sequence[TensorTy], axis: int, region_builder_fn) -> Tuple[TensorTy, ...]:
+        if axis is None:
+            inputs = tuple(self.reshape(t, [t.numel.value], can_reorder=True) for t in inputs)
+            axis = 0
+        # get result shape
+        shape = inputs[0].type.shape
+        rank = len(shape)
+        assert axis < rank, f"reduction axis must be < inputs rank ({rank})"
+        ret_shape = [s for i, s in enumerate(shape) if i != axis]
+        assert all(t.type.shape == shape for t in inputs), "all reduction inputs must have the same shape"
+
+        reduce_op = self.builder.create_reduce([t.handle for t in inputs], axis)
+        region_builder_fn(reduce_op)
+        assert reduce_op.verify()
+
+        return tuple(
+            self.wrap_tensor(reduce_op.get_result(i), inputs[i].type.scalar, ret_shape) for i in range(len(inputs)))
+
+# ===----------------------------------------------------------------------===
+#                               Associative Scan
+# ===----------------------------------------------------------------------===
+
+    def associative_scan(self, inputs: Sequence[TensorTy], axis: int, region_builder_fn,
+                         reverse: bool) -> Tuple[TensorTy, ...]:
+        shape = inputs[0].type.shape
+        rank = len(shape)
+
+        assert -rank <= axis < rank, f"scan axis {axis} must be < inputs rank ({rank})"
+
+        if axis < 0:
+            axis += rank
+
+        for t in inputs:
+            assert t.type.shape == shape, "all scan inputs must have the same shape"
+
+        scan_op = self.builder.create_scan([t.handle for t in inputs], axis, reverse)
+        region_builder_fn(scan_op)
+        assert scan_op.verify()
+
+        return tuple(self.wrap_tensor(scan_op.get_result(i), inputs[i].type.scalar, shape) for i in range(len(inputs)))
+
+# ===----------------------------------------------------------------------===
+#                               Gather
+# ===----------------------------------------------------------------------===
+
+    def gather(self, src: TensorTy, index: TensorTy, axis: int) -> TensorTy:
+        assert index.dtype.is_int(), "index must be an integer tensor"
+
+        rank = len(src.type.shape)
+        assert len(index.type.shape) == rank, "source and index tensors must have the same rank"
+
+        assert -rank <= axis < rank, f"gather axis {axis} must be < source rank ({rank})"
+        if axis < 0:
+            axis += rank
+
+        for d in range(rank):
+            if d == axis:
+                continue
+            assert index.type.shape[d] == src.type.shape[d], f"index dim {axis} must match the corresponding source dim"
+
+        gather = self.builder.create_gather(src.handle, index.handle, axis)
+        return self.wrap_tensor(gather, src.type.scalar, index.type.shape)
+
+# ===----------------------------------------------------------------------===
+#                               Map Elementwise
+# ===----------------------------------------------------------------------===
+
+    def broadcast_tensors(self, *inputs):
+        if not inputs:
+            return ()
+        head, *tail = inputs
+        for i in range(len(tail)):
+            head, tail[i] = self.broadcast_impl_value(head, tail[i])
+        for i in range(len(tail) - 1):
+            head, tail[i] = self.broadcast_impl_value(head, tail[i])
+        return (head, *tail)
+
+    def map_elementwise(self, inputs: Sequence[tl.tensor], result_types: Sequence[tl.dtype], pack: int,
+                        region_builder_fn) -> Tuple[tl.tensor, ...]:
+        inputs = self.broadcast_tensors(*inputs)
+
+        assert len(inputs) > 0, "map_elementwise must have at least 1 input tensor"
+        result_types = [inputs[0].type.with_element_ty(ty.scalar) for ty in result_types]
+        elementwise_op = self.builder.create_map_elementwise(
+            [t.handle for t in inputs],
+            [ty.to_ir(self.builder) for ty in result_types],
+            pack,
+        )
+        region_builder_fn(elementwise_op)
+        assert elementwise_op.verify()
+
+        return tuple(self.tensor(elementwise_op.get_result(i), ty) for i, ty in enumerate(result_types))
+
+
+# ===----------------------------------------------------------------------===
+#                               Histogram
+# ===----------------------------------------------------------------------===
+
+    def histogram(self, input: TensorTy, num_bins: int, mask: Optional[TensorTy]) -> TensorTy:
+        assert len(input.shape) == 1, "histogram only supports 1D input"
+        assert input.dtype.is_int(), "histogram only supports integer input"
+        if mask is not None:
+            mask = self.broadcast_impl_shape(mask, input.shape)
+            if not mask.type.scalar.is_bool():
+                raise ValueError("Mask must have boolean scalar type")
+            mask = mask.handle
+        return self.tensor(self.builder.create_histogram(input.handle, num_bins, mask),
+                           tl.block_type(tl.int32, [num_bins]))
+
+    def multiple_of(self, x: TensorTy, values: List[int]) -> TensorTy:  # tag x with a "tt.divisibility" attribute
+        if max(1, len(x.shape)) != len(values):  # one value per dim; an x with empty shape still takes one value
+            raise ValueError("Shape of input to multiple_of does not match the length of values")
+        x.handle.set_attr("tt.divisibility", ir.make_attr(values, x.handle.get_context()))
+        return x  # the same tensor object, its handle now annotated
+
+    def max_contiguous(self, x: TensorTy, values: List[int]) -> TensorTy:  # tag x with a "tt.contiguity" attribute
+        if len(x.shape) != len(values):  # exactly one value per dimension of x
+            raise ValueError("Shape of input to max_contiguous does not match the length of values")
+        x.handle.set_attr("tt.contiguity", ir.make_attr(values, x.handle.get_context()))
+        return x  # the same tensor object, its handle now annotated
+
+    def max_constancy(self, x: TensorTy, values: List[int]) -> TensorTy:  # tag x with a "tt.constancy" attribute
+        if len(x.shape) != len(values):  # exactly one value per dimension of x
+            raise ValueError("Shape of input to max_constancy does not match the length of values")
+        x.handle.set_attr("tt.constancy", ir.make_attr(values, x.handle.get_context()))
+        return x  # the same tensor object, its handle now annotated
+
+    def debug_barrier(self) -> TensorTy:  # emit a barrier op through the builder
+        return self.tensor(self.builder.create_barrier(), tl.void)  # wrapped as a void-typed tensor
+
+    def device_print(self, prefix: str, args: List[TensorTy], hex: bool) -> TensorTy:
+        # It makes sense visually for prefix to end in ": "; make it so.  Also,
+        # non-empty prefixes should start with " ".
+        if not prefix.endswith(" ") and args:
+            prefix += " "
+        if not prefix.endswith(": ") and args:
+            prefix = prefix[:-1] + ": "
+        if len(prefix) > 2 and not prefix.startswith(" "):
+            prefix = " " + prefix
+
+        new_args = [arg.handle for arg in args]
+        is_signed = [arg.dtype.is_int_signed() for arg in args]
+        return self.tensor(self.builder.create_print(prefix, hex, new_args, is_signed), tl.void)
+
+    def device_assert(self, cond: TensorTy, msg: str, mask: Optional[TensorTy]) -> TensorTy:
+        if not self.builder.options.debug:
+            return
+        if mask is not None:
+            cond = self.or_(cond, self.not_(mask))
+        return self.tensor(self.builder.create_assert(cond.handle, msg), tl.void)
+
+    def assume(self, cond) -> TensorTy:  # emit an assume op on the handle of `cond`
+        return self.tensor(self.builder.create_assume(cond.handle), tl.void)  # wrapped as a void-typed tensor
+
+    def _convert_elem_to_ir_value(self, elem, require_i64):  # one int / constexpr / scalar tensor -> one IR value
+        if isinstance(elem, int):  # plain Python ints take the constexpr path below
+            elem = tl.constexpr(elem)
+        if isinstance(elem, tl.constexpr):
+            if isinstance(elem.value, bool):  # tested before the range checks: bools become int1 constants
+                return self.builder.get_int1(elem.value)
+            if require_i64:
+                assert -2**63 <= elem.value < 2**63, f"Block pointers only support 64 bit `shape/strides`, " \
+                    f"got a value {elem.value} which is out of the range"
+                return self.builder.get_int64(elem.value)
+            else:
+                assert -2**31 <= elem.value < 2**31, f"Block pointers only support 32 bit `offsets/block_shape`, " \
+                    f"got a value {elem.value} which is out of the range"
+                return self.builder.get_int32(elem.value)
+        elif isinstance(elem, tl.tensor):
+            assert elem.numel.value == 1, "Expected a scalar in shape/strides/offsets"
+            assert elem.dtype.is_int(), "Expected an integer scalar type in shape/strides/offsets"
+            if elem.dtype != tl.int64 and require_i64:  # cast to i64, keeping the source signedness
+                return self.builder.create_int_cast(elem.handle, self.builder.get_int64_ty(),
+                                                    elem.dtype.is_int_signed())
+            elif elem.dtype == tl.int64 and not require_i64:  # int64 is rejected rather than narrowed
+                assert False, "Block pointers only support 32 bit `offsets/block_shape`, " \
+                    "add a `.to(tl.int32)` or use regular indexing for 64 bit support"
+            return elem.handle  # all other integer tensors pass through unchanged
+        assert False, f"Unsupported element type in shape/strides/offsets: {type(elem)}"
+
+    def _convert_to_ir_values(self, list_like, require_i64=True):
+        if hasattr(list_like, "__iter__"):
+            return [self._convert_elem_to_ir_value(elem, require_i64) for elem in list_like]
+        return [self._convert_elem_to_ir_value(list_like, require_i64)]
+
+    def make_block_ptr(self, base: TensorTy, shape, strides, offsets, block_shape, order) -> TensorTy:  # build a block pointer from `base`
+        # Convert dynamic arguments to IR values
+        # NOTES(Chenggang): current `shape/strides` are `int64_t`, while `offsets/block_shape` are `int32_t`
+        shape = self._convert_to_ir_values(shape)
+        strides = self._convert_to_ir_values(strides)
+        offsets = self._convert_to_ir_values(offsets, require_i64=False)
+
+        # Check `base` type
+        if not base.type.is_ptr() or base.type.element_ty.is_block():
+            raise ValueError("Expected `base` to be a pointer type (but not a block pointer type or others)")
+
+        # Treat `pointer_type<tl.int1>` as `pointer_type<tl.int8>`
+        if base.type.element_ty == tl.int1:
+            base = self.cast(base, tl.pointer_type(tl.int8, base.type.address_space))
+
+        # Check whether `block_shape` is static
+        if not hasattr(block_shape, "__iter__"):  # a single value is treated as a one-element list
+            block_shape = [block_shape]
+        block_shape = [elem.value if isinstance(elem, tl.constexpr) else elem for elem in block_shape]
+        assert all(isinstance(elem, int) and -2**31 <= elem < 2**31 for elem in block_shape), \
+            "Expected a list of constant integers (`int32_t` range) in `block_shape`"
+
+        # Check `order`
+        if not hasattr(order, "__iter__"):  # a single value is treated as a one-element list
+            order = [order]
+        order = [elem.value if isinstance(elem, tl.constexpr) else elem for elem in order]
+        assert sorted(order) == list(range(len(order))), "Expected a permutation of (0, 1, ..., len(order)-1) in order"
+
+        # Must have same length
+        assert all(len(block_shape) == len(list_like) for list_like in [shape, strides, offsets, order]), \
+            "Expected shape/strides/offsets/block_shape to have the same length"
+
+        # Build value, the type is:
+        #   `pointer_type<blocked<shape, element_type>>` in Python
+        #   `tt.ptr<tensor<shape, element_type>>` in MLIR
+        handle = self.builder.create_make_block_ptr(base.handle, shape, strides, offsets, block_shape, order)
+        return self.tensor(handle, tl.pointer_type(tl.block_type(base.type.element_ty, block_shape)))
+
+    def advance(self, base: TensorTy, offsets) -> TensorTy:
+        # Convert dynamic offsets to IR values
+        offsets = self._convert_to_ir_values(offsets, require_i64=False)
+
+        # Advanced block pointer type is the same as before
+        return self.tensor(self.builder.create_advance(base.handle, offsets), base.type)
+
+    def make_tensor_descriptor(self, base: TensorTy, shape: List[TensorTy], strides: List[TensorTy],
+                               block_shape: List[tl.constexpr], padding_option: str = "zero") -> tl.tensor_descriptor:
+        ndim = len(shape)
+        if not (1 <= ndim <= 5):
+            raise ValueError(f"Expected 1 <= ndim <= 5 but got {ndim} dimensions")
+        if len(strides) != ndim:
+            raise ValueError(f"Expected {ndim} strides but got {len(strides)}")
+        if len(block_shape) != ndim:
+            raise ValueError(f"Expected block_shape to have {ndim} dimensions but got {len(strides)}")
+        assert isinstance(base.dtype, tl.pointer_type)
+        elem_size = base.dtype.element_ty.primitive_bitwidth // 8
+        contig_dim_size = tl._unwrap_if_constexpr(block_shape[-1])
+        if contig_dim_size * elem_size < 16:
+            raise ValueError(
+                f"Descriptor block shape must have at least 16 bytes in the last dimension, but got {contig_dim_size} * {elem_size} = {contig_dim_size * elem_size} bytes"
+            )
+
+        last_stride = tl._unwrap_if_constexpr(strides[-1])
+        if last_stride != 1:
+            raise ValueError(f"Tensor descriptor last dim must be 1 but got {last_stride}")
+
+        shape = [self.make_scalar(x, tl.int32) for x in shape]
+        strides = [self.make_scalar(tl._unwrap_if_constexpr(x), tl.int64) for x in strides]
+
+        # Check whether `block_shape` is static
+        block_shape = tl._unwrap_shape(block_shape)
+
+        assert isinstance(base.type, tl.pointer_type)
+        type = tl.block_type(base.type.element_ty, block_shape)
+        base_handle = base.handle
+        is_signed_int = base.type.element_ty.is_int_signed()
+
+        padding = self._str_to_padding_option(padding_option)
+
+        if base.type.element_ty.is_int() and padding == ir.PADDING_OPTION.PAD_NAN:
+            raise ValueError("Padding option `nan` is not supported for integer blocks")
+
+        handle = self.builder.create_make_tensor_descriptor(base_handle, [s.handle for s in shape],
+                                                            [s.handle for s in strides], block_shape, is_signed_int,
+                                                            padding)
+        return tl.tensor_descriptor(handle, shape, strides, type)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/standard.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/standard.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1dd327bb9947c0c70786912c70863db6c8a905f
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/standard.py
@@ -0,0 +1,536 @@
+from __future__ import annotations
+
+from ..runtime.jit import jit, constexpr_function
+from . import core
+from . import math
+
+# constexpr utilities
+
+
+@constexpr_function
+def _log2(i):
+    log2 = 0
+    n = i
+    while n > 1:
+        n >>= 1
+        log2 += 1
+    return log2
+
+
+@constexpr_function
+def _is_power_of_two(i):
+    return (i & (i - 1)) == 0 and i != 0
+
+
+_get_int_dtype = constexpr_function(core.get_int_dtype)
+
+# -----------------------
+# Standard library
+# -----------------------
+
+
+@core._tensor_member_fn
+@jit
+def cdiv(x, div):
+    """
+    Computes the ceiling division of :code:`x` by :code:`div`
+
+    :param x: the input number
+    :type x: Block
+    :param div: the divisor
+    :type div: Block
+    """
+    return (x + (div - 1)) // div
+
+
+@core._tensor_member_fn
+@jit
+@math._add_math_1arg_docstr("sigmoid")
+def sigmoid(x):
+    return 1 / (1 + math.exp(-x))
+
+
+@core._tensor_member_fn
+@jit
+@math._add_math_1arg_docstr("softmax")
+def softmax(x, dim=None, keep_dims=False, ieee_rounding=False):
+    if dim is None:
+        _dim: core.constexpr = 0
+    else:
+        _dim: core.constexpr = dim
+    z = x - max(x, _dim, keep_dims=keep_dims)
+    num = math.exp(z)
+    den = sum(num, _dim, keep_dims=keep_dims)
+    return math.fdiv(num, den, ieee_rounding)
+
+
+@core._tensor_member_fn
+@jit
+def ravel(x, can_reorder=False):
+    """
+    Returns a contiguous flattened view of :code:`x`.
+
+    :param x: the input tensor
+    :type x: Block
+    """
+    return core.reshape(x, [x.numel], can_reorder=can_reorder)
+
+
+@jit
+def swizzle2d(i, j, size_i, size_j, size_g):
+    """
+    Transforms the indices of a row-major `size_i * size_j` matrix into
+    the indices of a column-major matrix for each group of `size_g` rows.
+
+    For example, for :code:`size_i = size_j = 4` and :code:`size_g = 2`, it will
+    transform ::
+
+        [[0 , 1 , 2 , 3 ],
+         [4 , 5 , 6 , 7 ],
+         [8 , 9 , 10, 11],
+         [12, 13, 14, 15]]
+
+    into ::
+
+        [[0, 2,  4 , 6 ],
+         [1, 3,  5 , 7 ],
+         [8, 10, 12, 14],
+         [9, 11, 13, 15]]
+    """
+    # "unrolled index in array"
+    ij = i * size_j + j
+    # number of elements in `size_g` groups
+    # of `size_j` columns
+    size_gj = size_g * size_j
+    # index of the group in which (i,j) is
+    group_id = ij // size_gj
+    # row-index of the first element of this group
+    off_i = group_id * size_g
+    # last group may have fewer rows
+    size_g = core.minimum(size_i - off_i, size_g)
+    # linear index with respect to the first element in this group
+    ij = ij % size_gj
+    # new row and column indices
+    new_i = off_i + ij % size_g
+    new_j = ij // size_g
+    return new_i, new_j
+
+
+@jit
+def zeros(shape, dtype):
+    """
+    Returns a tensor filled with the scalar value 0 for the given :code:`shape` and :code:`dtype`.
+
+    :param shape: Shape of the new array, e.g., (8, 16) or (8, )
+    :type shape: tuple of ints
+    :param dtype: Data-type of the new array, e.g., :code:`tl.float16`
+    :type dtype: DType
+    """
+    return core.full(shape, 0, dtype)
+
+
+@jit
+def zeros_like(input):
+    """
+    Returns a tensor of zeros with the same shape and type as a given tensor.
+
+    :param input: input tensor
+    :type input: Tensor
+    """
+    return zeros(input.shape, input.dtype)
+
+
+# max and argmax
+
+
+@jit
+def _argmax_combine(value1, index1, value2, index2, tie_break_left):
+    if tie_break_left:
+        tie = value1 == value2 and index1 < index2
+    else:
+        tie = False
+    gt = value1 > value2 or tie
+    v_ret = core.where(gt, value1, value2)
+    i_ret = core.where(gt, index1, index2)
+    return v_ret, i_ret
+
+
+@jit
+def _argmax_combine_tie_break_left(value1, index1, value2, index2):
+    return _argmax_combine(value1, index1, value2, index2, True)
+
+
+@jit
+def _argmax_combine_tie_break_fast(value1, index1, value2, index2):
+    return _argmax_combine(value1, index1, value2, index2, False)
+
+
+@jit
+def _elementwise_max(a, b):
+    return core.maximum(a, b)
+
+
+@core._tensor_member_fn
+@jit
+@core._add_reduction_docstr("maximum", return_indices_arg="return_indices",
+                            tie_break_arg="return_indices_tie_break_left")
+def max(input, axis=None, return_indices=False, return_indices_tie_break_left=True, keep_dims=False):
+    input = core._promote_bfloat16_to_float32(input)
+    if return_indices:
+        if return_indices_tie_break_left:
+            return core._reduce_with_indices(input, axis, _argmax_combine_tie_break_left, keep_dims=keep_dims)
+        else:
+            return core._reduce_with_indices(input, axis, _argmax_combine_tie_break_fast, keep_dims=keep_dims)
+    else:
+        if core.constexpr(input.dtype.primitive_bitwidth) < core.constexpr(32):
+            if core.constexpr(input.dtype.is_floating()):
+                input = input.to(core.float32)
+            else:
+                assert input.dtype.is_int(), "Expecting input to be integer type"
+                input = input.to(core.int32)
+        return core.reduce(input, axis, _elementwise_max, keep_dims=keep_dims)
+
+
+@core._tensor_member_fn
+@jit
+@core._add_reduction_docstr("maximum index", tie_break_arg="tie_break_left")
+def argmax(input, axis, tie_break_left=True, keep_dims=False):
+    (_, ret) = max(input, axis, return_indices=True, return_indices_tie_break_left=tie_break_left, keep_dims=keep_dims)
+    return ret
+
+
+# min and argmin
+
+
+@jit
+def _argmin_combine(value1, index1, value2, index2, tie_break_left):
+    if tie_break_left:
+        tie = value1 == value2 and index1 < index2
+    else:
+        tie = False
+    lt = value1 < value2 or tie
+    value_ret = core.where(lt, value1, value2)
+    index_ret = core.where(lt, index1, index2)
+    return value_ret, index_ret
+
+
+@jit
+def _argmin_combine_tie_break_left(value1, index1, value2, index2):
+    return _argmin_combine(value1, index1, value2, index2, True)
+
+
+@jit
+def _argmin_combine_tie_break_fast(value1, index1, value2, index2):
+    return _argmin_combine(value1, index1, value2, index2, False)
+
+
+@jit
+def _elementwise_min(a, b):
+    return core.minimum(a, b)
+
+
+@core._tensor_member_fn
+@jit
+@core._add_reduction_docstr("minimum", return_indices_arg="return_indices",
+                            tie_break_arg="return_indices_tie_break_left")
+def min(input, axis=None, return_indices=False, return_indices_tie_break_left=True, keep_dims=False):
+    input = core._promote_bfloat16_to_float32(input)
+    if return_indices:
+        if return_indices_tie_break_left:
+            return core._reduce_with_indices(input, axis, _argmin_combine_tie_break_left, keep_dims=keep_dims)
+        else:
+            return core._reduce_with_indices(input, axis, _argmin_combine_tie_break_fast, keep_dims=keep_dims)
+    else:
+        if core.constexpr(input.dtype.primitive_bitwidth) < 32:
+            if core.constexpr(input.dtype.is_floating()):
+                input = input.to(core.float32)
+            else:
+                assert input.dtype.is_int(), "Expecting input to be integer type"
+                input = input.to(core.int32)
+        return core.reduce(input, axis, _elementwise_min, keep_dims=keep_dims)
+
+
+@core._tensor_member_fn
+@jit
+@core._add_reduction_docstr("minimum index", tie_break_arg="tie_break_left")
+def argmin(input, axis, tie_break_left=True, keep_dims=False):
+    _, ret = min(input, axis, return_indices=True, return_indices_tie_break_left=tie_break_left, keep_dims=keep_dims)
+    return ret
+
+
+@jit
+def _sum_combine(a, b):
+    return a + b
+
+
+# sum
+
+
+@constexpr_function
+def _pick_sum_dtype(in_dtype, dtype):
+    if dtype is not None:
+        return dtype
+
+    # For integer bitwidths less than 32, pick int32 with the same sign to
+    # avoid overflow.
+    out_dtype = None
+    if in_dtype.is_int_signed():
+        out_dtype = core.int32 if in_dtype.int_bitwidth < 32 else None
+    elif in_dtype.is_int_unsigned():
+        out_dtype = core.uint32 if in_dtype.int_bitwidth < 32 else None
+    return out_dtype
+
+
+@core._tensor_member_fn
+@jit
+@core._add_reduction_docstr("sum", dtype_arg="dtype")
+def sum(input, axis=None, keep_dims=False, dtype: core.constexpr = None):
+    # Pick a default dtype for the reduction if one was not specified.
+    out_dtype: core.constexpr = _pick_sum_dtype(input.dtype, dtype)
+
+    if out_dtype is not None:
+        input = input.to(out_dtype)
+    return core.reduce(input, axis, _sum_combine, keep_dims=keep_dims)
+
+
+@jit
+def _xor_combine(a, b):
+    return a ^ b
+
+
+# xor sum
+
+
+@core._tensor_member_fn
+@jit
+@core._add_reduction_docstr("xor sum")
+def xor_sum(input, axis=None, keep_dims=False):
+    core.static_assert(input.type.scalar.is_int(), "xor_sum only supported for integers")
+    return core.reduce(input, axis, _xor_combine, keep_dims=keep_dims)
+
+
+# or reduction
+
+
+@jit
+def _or_combine(x, y):
+    return x | y
+
+
+@core._tensor_member_fn
+@jit
+@core._add_reduction_docstr("reduce_or")
+def reduce_or(input, axis, keep_dims=False):
+    core.static_assert(input.type.scalar.is_int(), "reduce_or only supported for integers")
+    return core.reduce(input, axis, _or_combine, keep_dims=keep_dims)
+
+
+# cumsum
+
+
+@core._tensor_member_fn
+@jit
+@core._add_scan_docstr("cumsum", dtype_arg="dtype")
+def cumsum(input, axis=0, reverse=False, dtype: core.constexpr = None):
+    # todo rename this to a generic function name
+
+    input = core._promote_bfloat16_to_float32(input)
+    out_dtype: core.constexpr = _pick_sum_dtype(input.dtype, dtype)
+
+    if out_dtype is not None:
+        input = input.to(out_dtype)
+
+    return core.associative_scan(input, axis, _sum_combine, reverse)
+
+
+# cumprod
+
+
+@jit
+def _prod_combine(a, b):
+    return a * b
+
+
+@core._tensor_member_fn
+@jit
+@core._add_scan_docstr("cumprod")
+def cumprod(input, axis=0, reverse=False):
+    # todo rename this to a generic function name
+    input = core._promote_bfloat16_to_float32(input)
+    return core.associative_scan(input, axis, _prod_combine, reverse)
+
+
+# sort
+
+
+@jit
+def _indicator(n_dims: core.constexpr, j: core.constexpr):
+    ar = core.arange(0, 2)
+    ar = core.reshape(ar, [1] * (n_dims - j - 1) + [2] + [1] * j)
+    return ar
+
+
+@jit
+def _compare_and_swap(x, flip, i: core.constexpr):
+    # compare-and-swap on the ith *innermost* dimension
+    n_dims: core.constexpr = _log2(x.numel)
+
+    # flip along middle dimension (the bitwise XORs will be optimised away):
+    idtype = _get_int_dtype(bitwidth=x.dtype.primitive_bitwidth, signed=True)
+    ix = x.to(idtype, bitcast=True)
+    iy = ix ^ xor_sum(ix, n_dims - 1 - i, True)
+    y = iy.to(x.dtype, bitcast=True)
+
+    # determines whether we are in the right (rather than left) position along the axis:
+    is_right = _indicator(n_dims, i)
+
+    # conditional swap:
+    ret = core.where((x > y) != (flip ^ is_right), y, x)
+    return ret
+
+
+@jit
+def _bitonic_merge_hypercube(x, stage: core.constexpr, order: core.constexpr):
+    '''
+    order_type 0 == ascending
+    order_type 1 == descending
+    order_type 2 == alternating
+    '''
+    # flip denotes whether to re-arrange sub-sequences of elements in ascending or
+    # descending order.
+    # if flip = 00000000... then all elements will be re-arranged ascendingly at this stage
+    # if flip = 00110011... then all the elements will be re-arranged alternatingly (with
+    # a stride of 2) at this stage
+    if order == 2:
+        flip = _indicator(_log2(x.numel), stage)
+    else:
+        flip = order
+    # perform `stage` rounds of `compare-and-swap`
+    for i in core.static_range(stage):
+        x = _compare_and_swap(x, flip, stage - 1 - i)
+    return x
+
+
+@jit
+def _bitonic_merge(x, stage: core.constexpr, order: core.constexpr, n_dims: core.constexpr):
+    h = core.reshape(x, [2] * _log2(x.numel))
+    h = _bitonic_merge_hypercube(h, stage, order)
+    x = core.reshape(h, x.shape)
+    return x
+
+
+@jit
+def sort_impl(x, k: core.constexpr = None, dim: core.constexpr = None, descending: core.constexpr = core.CONSTEXPR_0):
+    """
+    Sorts a tensor along a specified dimension.
+
+    :param x: The input tensor to be sorted.
+    :type x: Tensor
+    :param dim: The dimension along which to sort the tensor. If None, the tensor is sorted along the last dimension. Currently, only sorting along the last dimension is supported.
+    :type dim: int, optional
+    :param k: the number of top elements to select. If none, assume k = x.shape[dim]
+    :type k: int, optional
+    :param descending: If set to True, the tensor is sorted in descending order. If set to False, the tensor is sorted in ascending order.
+    :type descending: bool, optional
+    """
+    # handle default dimension or check that it is the most minor dim
+    _dim: core.constexpr = len(x.shape) - 1 if dim is None else dim
+    core.static_assert(_dim == len(x.shape) - 1, "only minor dimension is currently supported")
+
+    log_n: core.constexpr = _log2(x.shape[_dim])
+    log_k: core.constexpr = log_n if k is None else _log2(k)
+
+    n_dims: core.constexpr = _log2(x.numel)
+
+    # reshape to hypercube:
+    h = core.reshape(x, [2] * n_dims if n_dims else [1])
+
+    # run first log_k bitonic sort iterations:
+    for i in core.static_range(1, log_k + 1):
+        h = _bitonic_merge_hypercube(h, i, 2 if i < log_n else descending)
+
+    # select top k elements using bitonic top-k
+    # https://www.doc.ic.ac.uk/~hlgr/pdfs/MassivelyParallelTopK.pdf
+    for i in core.static_range(log_k + 1, log_n + 1):
+        h = max(h, axis=(_log2(h.numel) - 1 - log_k)) if descending else min(h, axis=(_log2(h.numel) - 1 - log_k))
+        h = _bitonic_merge_hypercube(h, log_k, 2 if i < log_n else descending)
+
+    # reshape back:
+    x = core.reshape(h, x.shape[:-1] + [2**log_k])
+    return x
+
+
+@jit
+def sort(x, dim: core.constexpr = None, descending: core.constexpr = core.CONSTEXPR_0):
+    return sort_impl(x, dim=dim, descending=descending)
+
+
+@jit
+def topk(x, k: core.constexpr, dim: core.constexpr = None):
+    return sort_impl(x, k=k, dim=dim, descending=True)
+
+
+@jit
+def bitonic_merge(x, dim: core.constexpr = None, descending: core.constexpr = core.CONSTEXPR_0):
+    # handle default dimension or check that it is the most minor dim
+    _dim: core.constexpr = len(x.shape) - 1 if dim is None else dim
+    core.static_assert(_dim == len(x.shape) - 1, "only minor dimension is currently supported")
+    n_dims: core.constexpr = _log2(x.shape[-1])
+    return _bitonic_merge(x, n_dims, descending, n_dims)
+
+
+@constexpr_function
+def _get_flip_dim(dim, shape):
+    if dim is None:
+        dim = len(shape) - 1
+    if dim < 0:  # flip doesn't work if dim < 0 because the xor-swap for loop will start/end at the wrong index
+        dim += len(shape)
+    return dim
+
+
+@core._tensor_member_fn
+@jit
+def flip(x, dim=None):
+    """
+    Flips a tensor `x` along the dimension `dim`.
+
+    :param x: the first input tensor
+    :type x: Block
+    :param dim: the dimension to flip along
+    :type dim: int
+    """
+    core.static_assert(-len(x.shape) <= dim and dim < len(x.shape))
+    _dim: core.constexpr = _get_flip_dim(dim, x.shape)
+    core.static_assert(_is_power_of_two(x.shape[_dim]))
+    steps: core.constexpr = _log2(x.shape[_dim])
+
+    # reshape the swap dimension to (2, 2, ..., 2)
+    idtype = _get_int_dtype(bitwidth=x.dtype.primitive_bitwidth, signed=True)
+    y = core.reshape(x.to(idtype, bitcast=True), x.shape[:_dim] + [2] * steps + x.shape[_dim + 1:])
+    for i in core.static_range(steps):
+        y = y ^ xor_sum(y, _dim + i, True)
+    x = core.reshape(y, x.shape).to(x.dtype, bitcast=True)
+    return x
+
+
+@jit
+def interleave(a, b):
+    """
+    Interleaves the values of two tensors along their last dimension. The two tensors must have the same shape.
+    Equivalent to `tl.join(a, b).reshape(a.shape[:-1] + [2 * a.shape[-1]])`
+
+    :param a: The first input tensor.
+    :type a: Tensor
+    :param b: The second input tensor.
+    :type b: Tensor
+    """
+    c = core.join(a, b)
+
+    if len(c.shape) == 1:
+        # We must have interleaved two scalars.
+        return c
+    else:
+        # This `else` is necessary because Triton's AST parser doesn't
+        # understand that if we take the `if` above we definitely don't run this
+        # `else`.
+        return core.reshape(c, c.shape[:-2] + [2 * c.shape[-2]])
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/target_info.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/target_info.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c1a277f04d5dd79bf506a49f8aa8d2f0e6d8e90
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/language/target_info.py
@@ -0,0 +1,54 @@
+from triton.runtime import driver
+from triton.runtime.jit import constexpr_function
+
+__all__ = ["current_target"]
+
+
+def current_target():
+    try:
+        active_driver = driver.active
+    except RuntimeError:
+        # If there is no active driver, return None
+        return None
+    return active_driver.get_current_target()
+
+
+current_target.__triton_builtin__ = True
+
+
+@constexpr_function
+def is_cuda():
+    target = current_target()
+    return target is not None and target.backend == "cuda"
+
+
+@constexpr_function
+def cuda_capability_geq(major, minor=0):
+    """
+    Determines whether we have compute capability >= (major, minor) and
+    returns this as a constexpr boolean. This can be used for guarding
+    inline asm implementations that require a certain compute capability.
+    """
+    target = current_target()
+    if target is None or target.backend != "cuda":
+        return False
+    assert isinstance(target.arch, int)
+    return target.arch >= major * 10 + minor
+
+
+@constexpr_function
+def is_hip():
+    target = current_target()
+    return target is not None and target.backend == "hip"
+
+
+@constexpr_function
+def is_hip_cdna3():
+    target = current_target()
+    return target is not None and target.arch == "gfx942"
+
+
+@constexpr_function
+def is_hip_cdna4():
+    target = current_target()
+    return target is not None and target.arch == "gfx950"
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..efbd85819526b7556809c6927a02512835c830d7
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__init__.py
@@ -0,0 +1,12 @@
+# ruff: noqa
+from .scope import scope, cpu_timed_scope, enter_scope, exit_scope
+from .state import state, enter_state, exit_state
+from .profile import (
+    start,
+    activate,
+    deactivate,
+    finalize,
+    profile,
+    DEFAULT_PROFILE_NAME,
+)
+from . import context, specs, mode
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1a025af183752927528a932764c726bdf0110374
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/context.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/context.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..54cdd73ee24f7e26c3f924afedda775572eec151
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/context.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/flags.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/flags.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fe3fab6d600655b08589824f5df3545118a80719
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/flags.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/language.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/language.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e608ae1d98001abcbff4e595573b123178bef4ea
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/language.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/mode.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/mode.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b889aa83599e46656dda283e8f002f6afaf3c579
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/mode.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/profile.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/profile.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..138c2a2043e6d402d7c74da0844ed2e32409aba1
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/profile.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/proton.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/proton.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..312ff1959cbec7139b791794bae3d99ee7ec640f
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/proton.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/scope.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/scope.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7bfa3b44da95cb60f22a59ee199571325f320fb4
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/scope.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/specs.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/specs.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cbea0b19ee8e1372f30b572a40b838841ab4b960
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/specs.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/state.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/state.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f425ac4725b0037398acb0d3e6aa419aa0c894b7
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/state.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/viewer.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/viewer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2c6a49f68616f006744d29cbd3e32ed5d328a831
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/__pycache__/viewer.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/context.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/context.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7dff1f071edb28ef11f08d2ea44c34915d103da
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/context.py
@@ -0,0 +1,18 @@
+from typing import Optional
+from triton._C.libproton import proton as libproton
+from .flags import flags
+
+
+def depth(session: Optional[int] = 0) -> Optional[int]:
+    """
+    Get the depth of the context.
+
+    Args:
+        session (int): The session ID of the profiling session. Defaults to 0.
+
+    Returns:
+        depth (int or None): The depth of the context. If profiling is off, returns None.
+    """
+    if not flags.profiling_on:
+        return None
+    return libproton.get_context_depth(session)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/flags.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/flags.py
new file mode 100644
index 0000000000000000000000000000000000000000..bef762101454284ae5e183665aaf950260cb0b25
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/flags.py
@@ -0,0 +1,28 @@
+"""
+Centralized, process-local flags with a minimal interface (no environment variables).
+
+Usage:
+    from triton.profiler.flags import flags
+
+    # Toggle
+    flags.profiling_on = True
+    flags.instrumentation_on = False
+
+    # Check
+    if flags.command_line:
+            ...
+"""
+from dataclasses import dataclass
+
+
+@dataclass
+class ProfilerFlags:
+    # Whether profiling is enabled. Default is False.
+    profiling_on: bool = False
+    # Whether instrumentation is enabled. Default is False.
+    instrumentation_on: bool = False
+    # Whether the script is run from the command line. Default is False.
+    command_line: bool = False
+
+
+flags = ProfilerFlags()
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ba3ff539527cd54e488828e92db7c4f7aaed882
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/__init__.py
@@ -0,0 +1,4 @@
+# ruff: noqa
+from .hook import HookManager
+from .instrumentation import InstrumentationHook
+from .launch import LaunchHook
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a7426de0dc2ea58197a29185985f4ed9dbba95f4
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/__pycache__/hook.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/__pycache__/hook.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..88da12f727217304f5919783592ef928726d0259
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/__pycache__/hook.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/__pycache__/instrumentation.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/__pycache__/instrumentation.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..80a5ae5b25f2b2733f71bb3fdd1f66a5a8eb6275
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/__pycache__/instrumentation.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/__pycache__/launch.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/__pycache__/launch.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..120587c7fda958c3fdc5e07fe94a657720d39bbc
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/__pycache__/launch.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/hook.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..a672722a1279f68cf80d5fde3d0b654100611d7a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/hook.py
@@ -0,0 +1,128 @@
+from triton.compiler import LazyDict
+from abc import abstractmethod
+from typing import Dict, Any, Optional
+from collections import defaultdict
+import triton.knobs as knobs
+
+
+class Hook:
+    priority: int = 0
+
+    @abstractmethod
+    def init_handle(self, module: Any, function: Any, name: str, metadata_group: Dict[str, str],
+                    hash: str) -> None:  # noqa: D401
+        raise NotImplementedError
+
+    @abstractmethod
+    def enter(self, metadata: LazyDict) -> None:
+        raise NotImplementedError
+
+    @abstractmethod
+    def exit(self, metadata: LazyDict) -> None:
+        raise NotImplementedError
+
+    @abstractmethod
+    def activate(self) -> None:
+        raise NotImplementedError
+
+    @abstractmethod
+    def deactivate(self) -> None:
+        raise NotImplementedError
+
+
+class HookManager:
+    # active hooks
+    active_hooks: list[Hook] = []
+    # session_id -> (hook_type -> active)
+    session_hooks: Dict[int, Dict[Hook, bool]] = defaultdict(lambda: defaultdict(bool))
+
+    @staticmethod
+    def init_handle(module: Any, function: Any, name: str, metadata_group: Dict[str, str], hash: str) -> None:
+        for hook in HookManager.active_hooks:
+            hook.init_handle(module, function, name, metadata_group, hash)
+
+    @staticmethod
+    def enter(metadata: LazyDict) -> None:
+        for hook in HookManager.active_hooks:
+            hook.enter(metadata)
+
+    @staticmethod
+    def exit(metadata: LazyDict) -> None:
+        # It's important to reverse the order of hooks so that we keep the first in last out order
+        for hook in reversed(HookManager.active_hooks):
+            hook.exit(metadata)
+
+    @staticmethod
+    def activate(session: Optional[int] = None) -> None:
+        if session is None:
+            sessions = HookManager.session_hooks.keys()
+        else:
+            sessions = [session]
+
+        for session in sessions:
+            for hook in HookManager.session_hooks[session]:
+                if hook not in HookManager.active_hooks:
+                    hook.activate()
+                    HookManager.active_hooks.append(hook)
+                HookManager.session_hooks[session][hook] = True
+        # Sort active_hooks by priority
+        HookManager.active_hooks.sort(key=lambda x: x.priority, reverse=True)
+
+    @staticmethod
+    def deactivate(session: Optional[int] = None) -> None:
+        if session is None:
+            sessions = HookManager.session_hooks.keys()
+        else:
+            sessions = [session]
+
+        deactivated_hooks = set()
+        for session in sessions:
+            for hook in HookManager.session_hooks[session]:
+                if hook in HookManager.active_hooks:
+                    deactivated_hooks.add(hook)
+                HookManager.session_hooks[session][hook] = False
+
+        # Check if any other sessions rely on this hook
+        for hook in deactivated_hooks:
+            if not any(session_hooks[hook] for session_hooks in HookManager.session_hooks.values()):
+                hook.deactivate()
+                HookManager.active_hooks.remove(hook)
+
+    @staticmethod
+    def register(hook: Hook, session: int) -> None:
+        HookManager.session_hooks[session][hook] = True
+        if hook not in HookManager.active_hooks:
+            hook.activate()
+            HookManager.active_hooks.append(hook)
+        # Sort active_hooks by priority
+        HookManager.active_hooks.sort(key=lambda x: x.priority, reverse=True)
+
+        # Register the heads
+        knobs.runtime.kernel_load_end_hook.add(HookManager.init_handle)
+        knobs.runtime.launch_enter_hook.add(HookManager.enter)
+        knobs.runtime.launch_exit_hook.add(HookManager.exit)
+
+    @staticmethod
+    def unregister(session: Optional[int] = None) -> None:
+        if session is not None and session not in HookManager.session_hooks:
+            return
+
+        if session is None:
+            for hook in HookManager.active_hooks:
+                hook.deactivate()
+            HookManager.active_hooks.clear()
+            HookManager.session_hooks.clear()
+        else:
+            popped_hooks = HookManager.session_hooks.pop(session)
+            # Deactivate hooks that are not used by any other session
+            for hook, active in popped_hooks.items():
+                if not active:
+                    continue
+                if not any(session_hooks[hook] for session_hooks in HookManager.session_hooks.values()):
+                    hook.deactivate()
+                    HookManager.active_hooks.remove(hook)
+        # Unregister the heads
+        if not HookManager.active_hooks:
+            knobs.runtime.kernel_load_end_hook.remove(HookManager.init_handle)
+            knobs.runtime.launch_enter_hook.remove(HookManager.enter)
+            knobs.runtime.launch_exit_hook.remove(HookManager.exit)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/instrumentation.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/instrumentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..aac27ad1887339d44c3d16e47ce48c456b474d4c
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/instrumentation.py
@@ -0,0 +1,348 @@
+from typing import Dict, Optional, Union, Any
+
+import triton
+from triton._C.libtriton import ir as triton_ir
+from triton._C.libtriton import proton as triton_proton
+from triton._C.libtriton import amd as triton_amd
+from triton._C.libtriton import nvidia as triton_nvidia
+from triton._C.libtriton import passes as triton_passes
+from triton._C.libproton import proton as libproton
+from triton.compiler import LazyDict
+from triton.runtime._allocation import set_profile_allocator, NullAllocator
+from triton.backends import backends
+
+from .hook import Hook
+from ..flags import flags
+from .. import mode
+
+# Format version written as the first field of the host trace header built by
+# InstrumentationHook._populate_host_buffer.
+# TODO(fywkevin): add support for major.minor
+VERSION = 1
+
+
+class CudaAllocator:
+
+    def __init__(self, instrumentation_hook):
+        self.instrumentation_hook = instrumentation_hook
+
+    def __call__(self, size: int, alignment: int, stream: Optional[int]):
+        if alignment != self.instrumentation_hook.profile_buffer_alignment:
+            raise RuntimeError(
+                f"Alignment mismatch: {alignment} != {self.instrumentation_hook.profile_buffer_alignment}")
+        aligned_size = (size + alignment - 1) // alignment * alignment
+        # Note: profile_buffer_size may be smaller than the aligned size if the kernel launches many blocks
+        # and the host CPU cannot store all profiling data in memory. This streaming mode is not yet implemented.
+        # In the future, we should support copying data incrementally from device to host to enable
+        # more efficient profiling data processing, rather than relying solely on post-processing.
+        aligned_size = max(aligned_size, self.instrumentation_hook.profile_buffer_size)
+
+        # Create the buffer
+        import torch
+        buffer = torch.empty((aligned_size, ), dtype=torch.uint8, device="cuda")
+        self.instrumentation_hook.buffer = buffer
+        return buffer
+
+
+class Instrumentation:
+
+    def __init__(self, ir_map: Dict[str, Any]):
+        self.manager = ir_map
+
+    def register(self, ir: str, func):
+        if ir in self.manager:
+            raise RuntimeError(f"IR already registered: {ir}")
+        self.manager[ir] = func
+
+    def patch(self, ir: str, pm, context):
+        self.load_dialects(context)
+        if ir in self.manager:
+            self.manager[ir](pm)
+
+    def load_dialects(self, ctx):
+        triton_proton.load_dialects(ctx)
+
+
+def _interpret_mode(mode_obj: Union[str, mode.InstrumentationMode]) -> mode.InstrumentationMode:
+    if isinstance(mode_obj, mode.InstrumentationMode):
+        return mode_obj
+    elif not mode_obj:
+        mode_obj = "default"
+
+    parts = mode_obj.split(":")
+    mode_name = parts[0]
+    opts: Dict[str, str] = {}
+    for opt in parts[1:]:
+        if "=" in opt:
+            key, val = opt.split("=", 1)
+            opts[key] = val
+        else:
+            raise ValueError(f"Malformed instrumentation option: '{opt}'")
+
+    # Get option values or empty strings
+    options = {
+        "metric_type": opts.get("metric_type", "cycle"), "buffer_type": opts.get("buffer_type", "shared"),
+        "buffer_strategy": opts.get("buffer_strategy", "circular"), "buffer_size": int(opts.get("buffer_size", "0")),
+        "granularity": opts.get("granularity", "warp"), "sampling_strategy": opts.get("sampling_strategy", "none"),
+        "sampling_options": opts.get("sampling_options", ""), "optimizations": opts.get("optimizations", "")
+    }
+
+    # Helper function to validate and map options to their enum values
+    def get_option_value(opt_name, mapping):
+        value = options[opt_name]
+        if value and value not in mapping:
+            raise ValueError(f"Unknown {opt_name}: {value}")
+        return mapping[value] if value else value
+
+    # Look up enum values for each option
+    options["metric_type"] = get_option_value("metric_type", mode.metric_types)
+    options["buffer_type"] = get_option_value("buffer_type", mode.buffer_types)
+    options["buffer_strategy"] = get_option_value("buffer_strategy", mode.buffer_strategies)
+    options["granularity"] = get_option_value("granularity", mode.granularities)
+    options["sampling_strategy"] = get_option_value("sampling_strategy", mode.sampling_strategies)
+
+    values = ([value.strip()
+               for value in options["optimizations"].split(",")] if len(options["optimizations"]) > 0 else [])
+    for value in values:
+        if value not in mode.optimizations:
+            raise ValueError(f"Unknown optimization: {value}")
+    options["optimizations"] = [mode.optimizations[value] for value in values]
+
+    # Create the appropriate mode instance
+    if mode_name == "default":
+        return mode.Default(**options)
+    elif mode_name == "mma":
+        return mode.MMA(**options)
+    else:
+        raise ValueError(f"Unknown mode: {mode_obj}")
+
+
+def _get_backend_name() -> str:
+    backend = triton.runtime.driver.active.get_current_target().backend
+    if backend == "cuda":
+        return "nvidia"
+    elif backend == "hip":
+        return "amd"
+    else:
+        raise RuntimeError(f"Unsupported backend: {backend}")
+
+
+class InstrumentationHook(Hook):
+    priority: int = 0
+    # It's important to note that only one instance of the instrumentation hook can be active at a time.
+    active_count: int = 0
+    enable_host_buffer: bool = False
+    host_buffer: Optional[Any] = None
+    # FIXME(fywkevin): change to a more reasonable value after we have support for periodic buffer dumping.
+    profile_buffer_size: int = 1
+    profile_buffer_alignment: int = 128
+
+    def __init__(self, mode_obj: Union[None, str, mode.InstrumentationMode]):
+        # Mapping of function objects to their scope ID pairs
+        self.mode: mode.InstrumentationMode = _interpret_mode(mode_obj)
+
+        self.allocator = CudaAllocator(self)
+        self.buffer = None
+        self.metadata_path: Dict[Any, Optional[str]] = {}
+
+    def activate(self):
+        if InstrumentationHook.active_count > 0:
+            raise RuntimeError("Only one instance of the instrumentation hook can be active at a time.")
+
+        InstrumentationHook.active_count += 1
+
+        flags.instrumentation_on = True
+
+        device = triton.runtime.driver.active.get_current_device()
+        max_shared_mem = triton.runtime.driver.active.utils.get_device_properties(device)["max_shared_mem"]
+        backend_name = _get_backend_name()
+
+        def to_llvmir_passes(pm):
+            is_long_clk = False if mode.Optimize.CLOCK32 in self.mode.optimizations else True
+            triton_proton.add_convert_proton_to_protongpu(pm, self.mode.metric_type, self.mode.sampling_strategy,
+                                                          self.mode.sampling_options, self.mode.granularity,
+                                                          self.mode.buffer_strategy, self.mode.buffer_type,
+                                                          self.mode.buffer_size, max_shared_mem,
+                                                          self.profile_buffer_size, self.profile_buffer_alignment,
+                                                          is_long_clk)
+            triton_passes.common.add_cse(pm)
+
+            if mode.Optimize.SCHED_STORES in self.mode.optimizations:
+                triton_proton.add_schedule_buffer_store(pm)
+
+            triton_proton.add_allocate_proton_shared_memory(pm)
+
+            if mode.Optimize.SCHED_BARRIERS in self.mode.optimizations and backend_name == "amd":
+                triton_proton.add_sched_barriers(pm)
+
+        def to_llvm_passes(pm):
+            triton_proton.add_allocate_proton_global_scratch_buffer(pm)
+            if backend_name == "nvidia":
+                triton_proton.add_convert_proton_nvidia_gpu_to_llvm(pm)
+            elif backend_name == "amd":
+                arch = triton.runtime.driver.active.utils.get_device_properties(device)["arch"].split(":")[0]
+                triton_proton.add_convert_proton_amd_gpu_to_llvm(pm, arch)
+
+        backends[backend_name].compiler.instrumentation = Instrumentation({
+            "ttgpuir_to_llvmir":
+            lambda pm: to_llvmir_passes(pm),
+            "llvmir_to_llvm":
+            lambda pm: to_llvm_passes(pm),
+        })
+
+        # Set up the profiling allocator
+        set_profile_allocator(self.allocator)
+
+        # Set the instrumentation mode
+        triton.knobs.compilation.instrumentation_mode = str(self.mode)
+
+    def deactivate(self):
+        if InstrumentationHook.active_count == 0:
+            return
+
+        InstrumentationHook.active_count -= 1
+
+        backend_name = _get_backend_name()
+
+        # No instrumentation passes are registered anymore
+        backends[backend_name].compiler.instrumentation = {}
+
+        # No runtime instrumentation hook is active anymore
+        flags.instrumentation_on = False
+
+        # Restore the instrumentation mode
+        triton.knobs.compilation.instrumentation_mode = ""
+
+        # Reset profile allocator
+        set_profile_allocator(NullAllocator())
+
+        # Reset host memory for external processing
+        InstrumentationHook.host_buffer = None
+
+        # Reset the buffer reference
+        self.buffer = None
+
+    def init_handle(self, module: Any, function: Any, name: str, metadata_group: Dict[str, str], hash: str) -> None:
+        if not function:
+            return
+
+        # Find the IR path in metadata
+        ir_path = next((path for key, path in metadata_group.items() if key.endswith(("ttgir"))), None)
+        metadata_path = next((path for key, path in metadata_group.items() if key.endswith(("json"))), None)
+        self.metadata_path[function] = metadata_path
+
+        if ir_path:
+            context = triton_ir.context()
+            triton_ir.load_dialects(context)
+            backend_name = _get_backend_name()
+            if backend_name == "nvidia":
+                triton_nvidia.load_dialects(context)
+            elif backend_name == "amd":
+                triton_amd.load_dialects(context)
+            triton_proton.load_dialects(context)
+            module = triton_ir.parse_mlir_module(ir_path, context)
+            module.context = context
+
+            scope_id_names = triton_proton.get_scope_id_names(module)
+            scope_id_parents = triton_proton.get_scope_id_parents(module)
+            libproton.init_function_metadata(function, name, scope_id_names, scope_id_parents, metadata_path)
+        else:
+            raise RuntimeError(f"IR path not found in metadata for function {function}")
+
+    def _data_ptr(self) -> int:
+        return 0 if self.buffer is None else self.buffer.data_ptr()
+
+    def enter(self, metadata: LazyDict) -> None:
+        func = metadata.data.get("function")
+        stream = metadata.data.get("stream")
+        alloc_size = 0 if self.buffer is None else self.buffer.element_size() * self.buffer.numel()
+        libproton.enter_instrumented_op(stream, func, self._data_ptr(), alloc_size)
+        if InstrumentationHook.enable_host_buffer:
+            InstrumentationHook.host_buffer = None
+
+    def exit(self, metadata: LazyDict) -> None:
+        func = metadata.data.get("function")
+        stream = metadata.data.get("stream")
+        alloc_size = 0 if self.buffer is None else self.buffer.element_size() * self.buffer.numel()
+        libproton.exit_instrumented_op(stream, func, self._data_ptr(), alloc_size)
+
+        if InstrumentationHook.enable_host_buffer:
+            self._populate_host_buffer(func)
+
+    def _populate_host_buffer(self, function: Any) -> None:
+        if function and self.metadata_path[function]:
+            import torch
+            import struct
+            import json
+
+            def encode_target(target: Dict[str, Any]) -> int:
+                #TODO(fywkevin): also account for `arch`
+                if target["backend"] == "cuda":
+                    return 1
+                elif target["backend"] == "hip":
+                    return 2
+                return 0
+
+            alloc_size = 0 if self.buffer is None else self.buffer.element_size() * self.buffer.numel()
+            sampled_warps = self.mode.sampling_options.strip().split(",")
+            data = {}
+            with open(self.metadata_path[function], 'r') as file:
+                data = json.load(file)
+
+            device_type = encode_target(data["target"])
+            scratch_mem_size = data["profile_scratch_size"]
+            total_unit = data["num_warps"]
+            uid_num = total_unit if self.mode.sampling_strategy == triton_proton.SAMPLING_STRATEGY.NONE else len(
+                sampled_warps)
+            block_num = int(alloc_size / scratch_mem_size)
+
+            # Binary trace layout:
+            # +------------------+
+            # |     version      |  4 bytes
+            # +------------------+
+            # |  header_offset   |  4 bytes
+            # +------------------+
+            # |   header_size    |  4 bytes
+            # +------------------+
+            # |  payload_offset  |  4 bytes
+            # +------------------+
+            # |   payload_size   |  4 bytes
+            # +------------------+
+            # |   device_type    |  4 bytes
+            # +------------------+
+            # |    block_num     |  4 bytes
+            # +------------------+
+            # |   total_unit     |  4 bytes
+            # +------------------+
+            # | scratch_mem_size |  4 bytes
+            # +------------------+
+            # |     uid_num      |  4 bytes
+            # +------------------+
+            # |                  |
+            # |     uid_vec      |  uid_num * 4 bytes
+            # |                  |
+            # +------------------+
+            # |                  |
+            # |     payload      |  size_payload bytes
+            # |                  |
+            # +------------------+
+
+            is_all_warps = self.mode.sampling_options == "" and self.mode.granularity == triton_proton.GRANULARITY.WARP
+            if is_all_warps:
+                uid_vec = [i for i in range(total_unit)]
+            else:
+                uid_vec = [int(i) for i in sampled_warps]
+
+            header_size = 40 + uid_num * 4
+            header_offset = 4
+            payload_offset = header_size
+            payload_size = alloc_size
+            header_values = [
+                VERSION, header_offset, header_size, payload_offset, payload_size, device_type, block_num, total_unit,
+                scratch_mem_size, uid_num, *uid_vec
+            ]
+            header_bytes = struct.pack("I" * len(header_values), *header_values)
+
+            InstrumentationHook.host_buffer = torch.empty(header_size + alloc_size, dtype=torch.uint8, device="cpu")
+            config_portion = InstrumentationHook.host_buffer[:header_size]
+            config_portion.copy_(torch.tensor(list(header_bytes), dtype=torch.uint8))
+            data_portion = InstrumentationHook.host_buffer[header_size:].view_as(self.buffer)
+            data_portion.copy_(self.buffer.cpu())
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/launch.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/launch.py
new file mode 100644
index 0000000000000000000000000000000000000000..0243c8b67f1adf07b409e11e18a66775f601f1b6
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/hooks/launch.py
@@ -0,0 +1,49 @@
+from ..state import enter_state, exit_state
+from triton.compiler import LazyDict
+from .hook import Hook
+from triton._C.libproton import proton as libproton
+from contextvars import ContextVar
+
+# State name that LaunchHook.enter sets (via enter_state) while the lazy launch metadata is evaluated.
+COMPUTE_METADATA_SCOPE_NAME = "__proton_launch_metadata"
+
+# Per-context values written by LaunchHook.enter and read back by LaunchHook.exit.
+op_name = ContextVar("op_name", default=None)
+# NOTE(review): this module-level name shadows the builtin `id` inside this module.
+id = ContextVar("id", default=None)
+
+
+class LaunchHook(Hook):
+    # Highest priority
+    priority = 100
+    # This is a singleton class
+    _instance = None
+    flops_width = [8, 16, 32, 64]
+    metrics = [f"flops{width}" for width in flops_width] + ["bytes"] + ["flops"]
+
+    def __init__(self):
+        pass
+
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super(LaunchHook, cls).__new__(cls)
+        return cls._instance
+
+    def init_handle(self, module, function, name: str, metadata_group: dict, hash: str) -> None:
+        pass
+
+    def activate(self):
+        pass
+
+    def deactivate(self):
+        pass
+
+    def enter(self, metadata: LazyDict) -> None:
+        enter_state(COMPUTE_METADATA_SCOPE_NAME)
+        lazy_metadata = metadata.get()
+        exit_state()
+        fn_metrics = {k: lazy_metadata[k] for k in LaunchHook.metrics if k in lazy_metadata}
+        op_name.set(lazy_metadata["name"])
+        id.set(libproton.record_scope())
+        libproton.enter_op(id.get(), lazy_metadata["name"])
+        libproton.add_metrics(id.get(), fn_metrics)
+
+    def exit(self, metadata: LazyDict) -> None:
+        libproton.exit_op(id.get(), op_name.get())
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/language.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/language.py
new file mode 100644
index 0000000000000000000000000000000000000000..2785938194ed0abd4de258a90df21280f64fc342
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/language.py
@@ -0,0 +1,65 @@
+from triton.language import core as tl
+from triton.language.core import builtin
+from triton._C.libtriton import proton as triton_proton
+from triton.language.semantic import TritonSemantic
+from triton.experimental.gluon.language._semantic import GluonSemantic
+
+from .flags import flags
+
+# Every semantic class that can be enabled for instrumentation, keyed by the name accepted by
+# enable_semantic() / disable_semantic().
+_ALL_SEMANTICS = {
+    "triton": TritonSemantic,
+    "gluon": GluonSemantic,
+}
+"""
+By default **only Gluon** semantic is enabled.
+Instrumenting kernels written in Triton DSL is disabled because Triton's higher-level IR undergoes
+aggressive compiler rewrites (loop pipelining, instruction re-ordering, IR duplication, etc.).
+These transformations can invalidate naïve instrumentation and lead to misleading results.
+"""
+# Currently enabled semantic classes; mutated by enable_semantic() / disable_semantic().
+_SEMANTICS = {_ALL_SEMANTICS["gluon"]}
+
+
+def _check_supported_semantic(semantic):
+    if not isinstance(semantic, tuple(_SEMANTICS)):
+        raise TypeError(f"Unsupported semantic type: {type(semantic)}. "
+                        f"Supported semantics are: {_SEMANTICS}")
+
+
+def enable_semantic(semantic_name: str):
+    _SEMANTICS.add(_ALL_SEMANTICS[semantic_name])
+
+
+def disable_semantic(semantic_name: str):
+    _SEMANTICS.remove(_ALL_SEMANTICS[semantic_name])
+
+
+def record(is_start: tl.constexpr, scope_name: tl.constexpr, semantic):
+    if not flags.instrumentation_on:
+        return
+    _check_supported_semantic(semantic)
+    is_start = tl._unwrap_if_constexpr(is_start)
+    scope_name = tl._unwrap_if_constexpr(scope_name)
+    return tl.tensor(triton_proton.create_proton_record(semantic.builder, is_start, scope_name), tl.void)
+
+
+@builtin
+def enter_scope(name: tl.constexpr, _semantic=None):
+    record(is_start=True, scope_name=name, semantic=_semantic)
+
+
+@builtin
+def exit_scope(name: tl.constexpr, _semantic=None):
+    record(is_start=False, scope_name=name, semantic=_semantic)
+
+
+class scope:
+
+    def __init__(self, name: str, _semantic=None):
+        self.name = name
+        self.semantic = _semantic
+
+    def __enter__(self):
+        enter_scope(self.name, _semantic=self.semantic)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        exit_scope(self.name, _semantic=self.semantic)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/mode.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/mode.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff41d58872f906ed2a9895c47bbe4514fad32ee6
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/mode.py
@@ -0,0 +1,123 @@
+from dataclasses import dataclass, field
+from triton._C.libtriton import proton as triton_proton
+from typing import List
+from enum import Enum
+
+# Lookup tables from the option strings accepted in a mode specification to the corresponding
+# triton_proton enum values. InstrumentationMode.__post_init__ uses them to convert string fields.
+
+# Supported metric kinds.
+metric_types = {"cycle": triton_proton.METRIC_TYPE.CYCLE}
+
+# Supported profile-buffer strategies.
+buffer_strategies = {
+    "circular": triton_proton.BUFFER_STRATEGY.CIRCULAR,
+    "flush": triton_proton.BUFFER_STRATEGY.FLUSH,
+}
+
+# Supported profile-buffer types.
+buffer_types = {
+    "shared": triton_proton.BUFFER_TYPE.SHARED,
+    "global": triton_proton.BUFFER_TYPE.GLOBAL,
+}
+
+# Supported sampling strategies.
+sampling_strategies = {
+    "none": triton_proton.SAMPLING_STRATEGY.NONE,
+    "selective": triton_proton.SAMPLING_STRATEGY.SELECTIVE,
+}
+
+# Supported profiling granularities.
+granularities = {
+    "cta": triton_proton.GRANULARITY.CTA,
+    "warp": triton_proton.GRANULARITY.WARP,
+    "warp_2": triton_proton.GRANULARITY.WARP_2,
+    "warp_4": triton_proton.GRANULARITY.WARP_4,
+    "warp_8": triton_proton.GRANULARITY.WARP_8,
+    "warp_group": triton_proton.GRANULARITY.WARP_GROUP,
+    "warp_group_2": triton_proton.GRANULARITY.WARP_GROUP_2,
+    "warp_group_4": triton_proton.GRANULARITY.WARP_GROUP_4,
+    "warp_group_8": triton_proton.GRANULARITY.WARP_GROUP_8,
+}
+
+
+class Optimize(Enum):
+    TIMESHIFT = "time_shift"
+    SCHED_STORES = "sched_stores"
+    SCHED_BARRIERS = "sched_barriers"
+    CLOCK32 = "clock32"
+
+    def __str__(self):
+        return self.value
+
+
+optimizations = {
+    "time_shift": Optimize.TIMESHIFT,
+    "sched_stores": Optimize.SCHED_STORES,
+    "sched_barriers": Optimize.SCHED_BARRIERS,
+    "clock32": Optimize.CLOCK32,
+}
+
+
+@dataclass(frozen=True)
+class BaseMode:
+    """Immutable base for profiling modes; carries only the mode's name."""
+    name: str
+
+
+@dataclass(frozen=True)
+class PCSampling(BaseMode):
+    name: str = field(default="pcsampling", init=False)
+    interval: int = 1000
+
+    def __post_init__(self):
+        if self.interval <= 0:
+            raise ValueError("Interval must be a positive integer.")
+
+    def __str__(self):
+        return f"{self.name}:interval={self.interval}"
+
+
+@dataclass(frozen=True)
+class InstrumentationMode(BaseMode):
+    """Common base class for instrumentation modes with shared configuration."""
+    metric_type: triton_proton.METRIC_TYPE = triton_proton.METRIC_TYPE.CYCLE
+    sampling_strategy: triton_proton.SAMPLING_STRATEGY = triton_proton.SAMPLING_STRATEGY.NONE
+    sampling_options: str = ""
+    granularity: triton_proton.GRANULARITY = triton_proton.GRANULARITY.WARP
+    buffer_strategy: triton_proton.BUFFER_STRATEGY = triton_proton.BUFFER_STRATEGY.CIRCULAR
+    buffer_type: triton_proton.BUFFER_TYPE = triton_proton.BUFFER_TYPE.SHARED
+    buffer_size: int = 0
+    optimizations: List[Optimize] = field(default_factory=list)
+
+    def __post_init__(self):
+        # automatically map string inputs to enums using the global lookup dicts
+        mappings = [
+            ("metric_type", metric_types),
+            ("sampling_strategy", sampling_strategies),
+            ("granularity", granularities),
+            ("buffer_strategy", buffer_strategies),
+            ("buffer_type", buffer_types),
+        ]
+        for field_name, lookup in mappings:
+            value = getattr(self, field_name)
+            if isinstance(value, str):
+                if value not in lookup:
+                    raise ValueError(f"Unknown {field_name}: {value}")
+                object.__setattr__(self, field_name, lookup[value])
+
+        values_str = getattr(self, "optimizations")
+        if isinstance(values_str, str):
+            values = [value.strip() for value in values_str.split(",")] if len(values_str) > 0 else []
+            for value in values:
+                if value not in optimizations:
+                    raise ValueError(f"Unknown optimization: {value}")
+            object.__setattr__(self, "optimizations", [optimizations[value] for value in values])
+
+    def __str__(self):
+        optimizations_str = ",".join([str(opt) for opt in self.optimizations])
+        return (f"{self.name}:metric_type={self.metric_type}:sampling_strategy={self.sampling_strategy}"
+                f":sampling_options={self.sampling_options}:granularity={self.granularity}"
+                f":buffer_strategy={self.buffer_strategy}:buffer_type={self.buffer_type}"
+                f":buffer_size={self.buffer_size}:optimizations={optimizations_str}")
+
+
+@dataclass(frozen=True)
+class Default(InstrumentationMode):
+    """Instrumentation mode whose fixed name is "default"."""
+    name: str = field(default="default", init=False)
+
+
+@dataclass(frozen=True)
+class MMA(InstrumentationMode):
+    """Instrumentation mode whose fixed name is "mma"."""
+    name: str = field(default="mma", init=False)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/profile.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/profile.py
new file mode 100644
index 0000000000000000000000000000000000000000..126b3abbeaf71fed1781f283fd53704cca41b42e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/profile.py
@@ -0,0 +1,252 @@
+import functools
+import triton
+
+from triton._C.libproton import proton as libproton  # type: ignore
+from triton._C.libtriton import getenv  # type: ignore
+from .flags import flags
+from .hooks import HookManager, LaunchHook, InstrumentationHook
+from .mode import BaseMode
+from typing import Optional, Union
+
+# Session name that start() uses when the caller does not supply one.
+DEFAULT_PROFILE_NAME = "proton"
+
+
+def _select_backend() -> str:
+    backend = triton.runtime.driver.active.get_current_target().backend
+    if backend == "cuda":
+        return "cupti"
+    elif backend == "hip":
+        return "roctracer"
+    else:
+        raise ValueError("No backend is available for the current target.")
+
+
+def _get_mode_str(backend: str, mode: Optional[Union[str, BaseMode]]) -> str:
+    if backend == "instrumentation":
+        prefix = triton.runtime.driver.active.get_current_target().backend
+        return f"{prefix}:{mode}" if mode else prefix
+    return str(mode) if mode else ""
+
+
+def _check_env(backend: str) -> None:
+    if backend == "roctracer":
+        hip_device_envs = ["HIP_VISIBLE_DEVICES", "CUDA_VISIBLE_DEVICES"]
+        for env in hip_device_envs:
+            if getenv(env, None) is not None:
+                raise ValueError(
+                    f"Proton does not work when the environment variable {env} is set on AMD GPUs. Please unset it and use `ROCR_VISIBLE_DEVICES` instead"
+                )
+
+    # Ensure default envs are set for Proton knobs if not already set by the user.
+    for attr, desc in triton.knobs.proton.knob_descriptors.items():
+        key = desc.key
+        if getenv(key, None) is None:
+            val = getattr(triton.knobs.proton, attr)
+            if val is not None:
+                if env_val := triton.knobs.toenv(val):
+                    triton.knobs.setenv(key, env_val[0])
+
+
+def start(
+    name: Optional[str] = None,
+    *,
+    context: Optional[str] = "shadow",
+    data: Optional[str] = "tree",
+    backend: Optional[str] = None,
+    mode: Optional[Union[str, BaseMode]] = None,
+    hook: Optional[str] = None,
+):
+    """
+    Start profiling with the given name and backend.
+
+    Usage:
+
+        ```python
+        proton.start("my_profile")
+        # do something
+        proton.finalize()
+        ```
+
+    Args:
+        name (str, optional): The name (with path) of the profiling session.
+                              If not provided, the default name is "~/proton.<suffix>", where suffix is the default
+                              format according to the data type. For example, if data is "tree", the default name is "~/proton.hatchet".
+        context (str, optional): The context to use for profiling.
+                                 Available options are ["shadow", "python"].
+                                 Defaults to "shadow".
+        data (str, optional): The data structure to use for profiling.
+                              Available options are ["tree", "trace"].
+                              Defaults to "tree".
+        backend (str, optional): The backend to use for profiling.
+                                 Available options are [None, "cupti", "roctracer", "instrumentation"].
+                                 Defaults to None, which automatically selects the backend matching the current active runtime.
+        mode (Union[str, BaseMode], optional): The "mode" to use for profiling, which is specific to the backend.
+                                               Can be a string or an instance of BaseMode (or any subclass thereof).
+                                               Defaults to None.
+                                               For "cupti", available options are [None, "pcsampling"].
+                                               For "roctracer", available options are [None].
+                                               For "instrumentation", available options are [None].
+                                               Each mode has a set of control knobs following with the mode name.
+                                               For example, "pcsampling" has an "interval" control knob, expressed as "pcsampling:interval=1000".
+        hook (str, optional): The hook to use for profiling.
+                              Available options are [None, "launch"].
+                              Defaults to None.
+    Returns:
+        session (int): The session ID of the profiling session.
+    """
+    if flags.command_line or triton.knobs.proton.disable:
+        # Ignore the start() call if the script is run from the command line or profiling is disabled.
+        return
+
+    flags.profiling_on = True
+
+    name = DEFAULT_PROFILE_NAME if name is None else name
+    backend = _select_backend() if backend is None else backend
+    # Convert mode to its string representation for libproton's runtime
+    mode_str = _get_mode_str(backend, mode)
+
+    _check_env(backend)
+
+    session = libproton.start(name, context, data, backend, mode_str)
+
+    if hook == "triton":
+        HookManager.register(LaunchHook(), session)
+    if backend == "instrumentation":
+        HookManager.register(InstrumentationHook(mode), session)
+
+    return session
+
+
+def activate(session: Optional[int] = None) -> None:
+    """
+    Activate the specified session.
+    The profiling session will be active and data will be recorded.
+
+    Args:
+        session (int): The session ID of the profiling session. Defaults to None (all sessions)
+
+    Returns:
+        None
+    """
+    if flags.command_line and session != 0:
+        raise ValueError("Only one session can be activated when running from the command line.")
+
+    HookManager.activate(session)
+
+    if session is None:
+        libproton.activate_all()
+    else:
+        libproton.activate(session)
+
+
+def deactivate(session: Optional[int] = None) -> None:
+    """
+    Stop the specified session.
+    The profiling session's data will still be in the memory, but no more data will be recorded.
+
+    Args:
+        session (int): The session ID of the profiling session. Defaults to None (all sessions)
+
+    Returns:
+        None
+    """
+    if flags.command_line and session != 0:
+        raise ValueError("Only one session can be deactivated when running from the command line.")
+
+    HookManager.deactivate(session)
+
+    if session is None:
+        libproton.deactivate_all()
+    else:
+        libproton.deactivate(session)
+
+
+def finalize(session: Optional[int] = None, output_format: Optional[str] = "") -> None:
+    """
+    Finalizes a profiling session.
+    Flush and write the profiling data to the file specified by the session name.
+
+    Args:
+        session (int, optional): The session ID to finalize. If None, all sessions are finalized. Defaults to None.
+        output_format (str, optional): The output format for the profiling results.
+                                       Available options are ["hatchet", "chrome_trace"].
+
+    Returns:
+        None
+    """
+    HookManager.unregister(session)
+
+    if session is None:
+        flags.profiling_on = False
+        libproton.finalize_all(output_format)
+    else:
+        if flags.command_line and session != 0:
+            raise ValueError("Only one session can be finalized when running from the command line.")
+        libproton.finalize(session, output_format)
+
+
+def _profiling(
+    func,
+    name: Optional[str] = None,
+    context: Optional[str] = "shadow",
+    data: Optional[str] = "tree",
+    backend: Optional[str] = None,
+    mode: Optional[str] = None,
+    hook: Optional[str] = None,
+):
+    """
+    Context manager for profiling. Internally use only.
+
+    Args:
+        See start() for the arguments.
+
+    Returns:
+        wrapper (function): The wrapped function.
+    """
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        session = start(name, context=context, data=data, backend=backend, mode=mode, hook=hook)
+        ret = func(*args, **kwargs)
+        deactivate(session)
+        return ret
+
+    return wrapper
+
+
+def profile(
+    func=None,
+    *,
+    name: Optional[str] = None,
+    context: Optional[str] = "shadow",
+    data: Optional[str] = "tree",
+    backend: Optional[str] = None,
+    mode: Optional[str] = None,
+    hook: Optional[str] = None,
+):
+    """
+    Decorator for profiling.
+
+    Usage:
+
+    ```python
+    @proton.profile
+    def foo():
+        pass
+    ```
+
+    Args:
+        See start() for the arguments.
+
+    Returns:
+        decorator (function): The decorator function.
+    """
+    if func is None:
+        # It's being used with parentheses, so return a decorator
+        def decorator(f):
+            return _profiling(f, name=name, context=context, data=data, backend=backend, mode=mode, hook=hook)
+
+        return decorator
+    else:
+        # It's being used without parentheses, so apply the decorator directly
+        return _profiling(func, name=name, context=context, data=data, backend=backend, mode=mode, hook=hook)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/proton.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/proton.py
new file mode 100644
index 0000000000000000000000000000000000000000..80d3a971e92cf9766dd8a82d6a695e8abd8074b2
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/proton.py
@@ -0,0 +1,88 @@
+import argparse
+import sys
+import os
+from .profile import start, finalize, _select_backend
+from .flags import flags
+
+
+def parse_arguments():
+    """
+    Parse the proton command line from ``sys.argv``.
+
+    Everything after the proton options is collected verbatim as the target
+    command (a script path or ``pytest``) and its own arguments.
+
+    Returns:
+        tuple: ``(args, target_args)`` where ``args`` is the parsed namespace and
+        ``target_args`` is the non-empty list ``[target, *target_arguments]``.
+
+    Raises:
+        SystemExit: On invalid options, or when no target is given (argparse
+        prints the usage and exits with status 2).
+    """
+    parser = argparse.ArgumentParser(
+        description="The proton command utility for profiling scripts and pytest tests.", usage="""
+    proton [options] script.py [script_args] [script_options]
+    proton [options] pytest [pytest_args] [script_options]
+    python -m triton.profiler.proton [options] script.py [script_args] [script_options]
+""", formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument("-n", "--name", type=str, help="Name of the profiling session")
+    parser.add_argument("-b", "--backend", type=str, help="Profiling backend", default=None,
+                        choices=["cupti", "roctracer", "instrumentation"])
+    parser.add_argument("-c", "--context", type=str, help="Profiling context", default="shadow",
+                        choices=["shadow", "python"])
+    parser.add_argument("-m", "--mode", type=str, help="Profiling mode", default=None)
+    parser.add_argument("-d", "--data", type=str, help="Profiling data", default="tree", choices=["tree", "trace"])
+    parser.add_argument("-k", "--hook", type=str, help="Profiling hook", default=None, choices=[None, "triton"])
+    parser.add_argument('target_args', nargs=argparse.REMAINDER, help='Subcommand and its arguments')
+    args = parser.parse_args()
+    if not args.target_args:
+        # REMAINDER accepts an empty list; without a target the caller would
+        # index target_args[0] and crash after profiling has already started.
+        parser.error("a script or `pytest` target is required")
+    return args, args.target_args
+
+
+def is_pytest(script):
+    """Return True when the last path component of ``script`` is exactly ``pytest``."""
+    _, executable = os.path.split(script)
+    return executable == 'pytest'
+
+
+def execute_as_main(script, args):
+    """
+    Run ``script`` as if it had been started with ``python script args...``.
+
+    The script is compiled and executed in a fresh globals dict whose
+    ``__name__`` is ``"__main__"``. ``sys.argv`` is replaced for the duration of
+    the run and restored afterwards. The script's directory is appended to
+    ``sys.path`` and left there.
+
+    Args:
+        script (str): Path to the Python script.
+        args (list[str]): Arguments exposed to the script as ``sys.argv[1:]``.
+
+    Raises:
+        SystemExit: With status 1 when the script raises an ``Exception``.
+    """
+    import traceback  # only needed on the failure path
+
+    script_path = os.path.abspath(script)
+    # Prepare a clean global environment
+    clean_globals = {
+        "__name__": "__main__",
+        "__file__": script_path,
+        "__builtins__": __builtins__,
+        sys.__name__: sys,
+    }
+
+    original_argv = sys.argv
+    sys.argv = [script] + args
+    # Append the script's directory in case the script uses relative imports
+    sys.path.append(os.path.dirname(script_path))
+
+    # Execute in the isolated environment
+    try:
+        with open(script_path, 'rb') as file:
+            code = compile(file.read(), script_path, 'exec')
+        exec(code, clean_globals)
+    except Exception as e:
+        # Show where the script failed; the one-line summary alone hides the
+        # failing frame from the user.
+        traceback.print_exc(file=sys.stderr)
+        print(f"An error occurred while executing the script: {e}")
+        sys.exit(1)
+    finally:
+        sys.argv = original_argv
+
+
+def do_setup_and_execute(target_args):
+    """
+    Mark the run as command-line driven and execute the target.
+
+    Args:
+        target_args (list[str]): The target (a script path or ``pytest``)
+            followed by the arguments to hand to it.
+    """
+    # Set the command line mode to avoid any `start` calls in the script.
+    flags.command_line = True
+
+    script = target_args[0]
+    script_args = target_args[1:] or []
+
+    if not is_pytest(script):
+        execute_as_main(script, script_args)
+        return
+
+    import pytest
+    pytest.main(script_args)
+
+
+def run_profiling(args, target_args):
+    """
+    Start a profiling session, run the target, and finalize the session.
+
+    Args:
+        args: Parsed proton options; reads ``name``, ``context``, ``data``,
+            ``backend``, ``hook`` and (when present) ``mode``.
+        target_args (list[str]): The target command followed by its arguments.
+    """
+    backend = args.backend if args.backend else _select_backend()
+
+    # `--mode` is accepted on the command line, so it has to reach `start`;
+    # getattr keeps namespaces built without a `mode` attribute working.
+    start(args.name, context=args.context, data=args.data, backend=backend, mode=getattr(args, "mode", None),
+          hook=args.hook)
+
+    try:
+        do_setup_and_execute(target_args)
+    finally:
+        # The target may leave via SystemExit (e.g. a failing script); still
+        # finalize so the session started above is closed.
+        finalize()
+
+
+def main():
+    """Command-line entry point: parse the proton options, then profile the target."""
+    run_profiling(*parse_arguments())
+
+
+if __name__ == "__main__":
+    main()
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/scope.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/scope.py
new file mode 100644
index 0000000000000000000000000000000000000000..911ebc46ecb58fa1deb44d45f255cdd9934d8b04
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/scope.py
@@ -0,0 +1,129 @@
+import threading
+import time
+from functools import wraps
+from typing import Optional, Union
+
+from .flags import flags
+from triton._C.libproton import proton as libproton
+
+# Per-thread storage. `enter_scope` / `exit_scope` keep a `scopes` list of
+# (id, name) pairs on it, used as a stack of the scopes opened on that thread.
+thread_local_scopes = threading.local()
+
+# Numeric types accepted as metric values.
+MetricValueType = Union[float, int]
+
+
+class scope:
+    """
+    Enter a named scope on entry and exit it on exit; usable as a context
+    manager or as a decorator. Nothing is recorded while `flags.profiling_on`
+    is false.
+
+    Usage:
+        context manager:
+        ```python
+        with proton.scope("test0", {metric_name: metric_value}):
+            foo[1,](x, y)
+        ```
+
+        decorator:
+        ```python
+        @proton.scope("test0", {metric_name: metric_value})
+        def foo(x, y):
+            ...
+        ```
+
+    Args:
+        name (str): The name of the scope.
+        metrics (dict[str, float | int], optional): Metrics attached to the scope
+            when it is entered. Default is None.
+    """
+
+    def __init__(self, name: str, metrics: Optional[dict[str, MetricValueType]] = None) -> None:
+        self.name = name
+        self.metrics = metrics
+        # Set by `_enter_scope` once a scope id has been recorded.
+        self.id = None
+
+    def _enter_scope(self):
+        if flags.profiling_on:
+            self.id = libproton.record_scope()
+            libproton.enter_scope(self.id, self.name)
+            if self.metrics:
+                libproton.add_metrics(self.id, self.metrics)
+
+    def _exit_scope(self):
+        # Only exit when profiling is on and a scope id has been recorded.
+        if flags.profiling_on and self.id is not None:
+            libproton.exit_scope(self.id, self.name)
+
+    def __enter__(self):
+        self._enter_scope()
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self._exit_scope()
+
+    def __call__(self, func):
+
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            self._enter_scope()
+            try:
+                result = func(*args, **kwargs)
+            finally:
+                self._exit_scope()
+            return result
+
+        return wrapper
+
+
+class cpu_timed_scope(scope):
+    """
+    A scope that also records the time elapsed between entering and exiting it.
+
+    The elapsed time, in nanoseconds, is added to the scope as the metric
+    ``"cpu_time (ns)(exc)"`` when the scope exits.
+
+    Args:
+        name (str): The name of the scope.
+        metrics (dict[str, float], optional): Additional metrics to add. Default is None.
+
+    Raises:
+        ValueError: If ``metrics`` contains the reserved key ``"cpu_time"``.
+    """
+
+    def __init__(self, name: str, metrics: Optional[dict[str, float]] = None) -> None:
+        super().__init__(name, metrics)
+        self.start_time = None
+        if metrics and "cpu_time" in metrics:
+            raise ValueError("The metric name 'cpu_time' is reserved.")
+
+    def _enter_scope(self):
+        if not flags.profiling_on:
+            return
+        # Monotonic clock: an interval taken from the wall clock (time_ns) can
+        # be wrong, even negative, when the system time is adjusted mid-scope.
+        self.start_time = time.perf_counter_ns()
+        super()._enter_scope()
+
+    def _exit_scope(self):
+        if not flags.profiling_on:
+            return
+        super()._exit_scope()
+        if self.start_time is not None:
+            cpu_time = time.perf_counter_ns() - self.start_time
+            # Clear the start stamp so a later exit cannot reuse a stale value.
+            self.start_time = None
+            libproton.add_metrics(self.id, {"cpu_time (ns)(exc)": cpu_time})
+
+
+def enter_scope(name: str, *, metrics: Optional[dict[str, MetricValueType]] = None) -> Optional[int]:
+    """
+    Open a scope and push it onto the calling thread's scope stack.
+
+    Args:
+        name (str): The name of the scope.
+        metrics (dict, optional): Metrics added to the scope right after it is entered.
+
+    Returns:
+        The recorded scope id, or None when profiling is off.
+    """
+    if not flags.profiling_on:
+        return None
+    scope_id = libproton.record_scope()
+    # Create this thread's stack on first use.
+    if not hasattr(thread_local_scopes, "scopes"):
+        thread_local_scopes.scopes = []
+    thread_local_scopes.scopes.append((scope_id, name))
+    libproton.enter_scope(scope_id, name)
+    if metrics:
+        libproton.add_metrics(scope_id, metrics)
+    return scope_id
+
+
+def exit_scope(name: Optional[str] = None, *, metrics: Optional[dict[str, MetricValueType]] = None) -> Optional[int]:
+    """
+    Close the scope most recently opened by `enter_scope` on the calling thread.
+
+    Args:
+        name (str, optional): When given, must equal the name of the scope being
+            closed. It is optional only to match the counterpart in enter_scope and
+            keep the API consistent with `proton.language.exit_scope`.
+        metrics (dict, optional): Metrics added to the scope after it is exited.
+
+    Returns:
+        The id of the closed scope, or None when profiling is off.
+
+    Raises:
+        IndexError: If no scope is open on this thread.
+        ValueError: If ``name`` does not match the innermost open scope; that
+            scope is left open so it can still be closed correctly.
+    """
+    if not flags.profiling_on:
+        return None
+    open_scopes = getattr(thread_local_scopes, "scopes", None)
+    if not open_scopes:
+        raise IndexError("exit_scope called without a matching enter_scope on this thread")
+    # Peek before popping so a name mismatch does not drop the open scope.
+    scope_id, open_name = open_scopes[-1]
+    if name and name != open_name:
+        raise ValueError(f"Scope name mismatch: {name} != {open_name}")
+    open_scopes.pop()
+    libproton.exit_scope(scope_id, open_name)
+    if metrics:
+        libproton.add_metrics(scope_id, metrics)
+    return scope_id
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/specs.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/specs.py
new file mode 100644
index 0000000000000000000000000000000000000000..b30c3416d8ebc60351d6a4ce07b711512f3b22ef
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/specs.py
@@ -0,0 +1,69 @@
+# Peak-FLOPS formulas, keyed by device type and then by architecture string
+# (presumably the CUDA compute capability without the dot, e.g. "80" -- TODO confirm).
+# Each callable takes `width` (bits) plus keyword arguments and divides its
+# base rate by `width / 8`; some entries also use `num_sms` and `clock_rate`.
+flops_by_device = {
+    "CUDA": {
+        "80":
+        lambda width, **kwargs: 624e12 / (width / 8),
+        "89":
+        lambda width, **kwargs: (330.3 * 1e12) / (width / 8),  # TODO(Keren): Implement fp16 acc-> 660.6 fp8
+        "90":
+        lambda width, num_sms, clock_rate, **kwargs: ((num_sms / 114 * clock_rate / (1755 * 1e3) * 1513) * 1e12) /
+        (width / 8),
+        "100":
+        lambda width, num_sms, clock_rate, **kwargs: (num_sms * 16384 * (clock_rate / 1e3) * 1e6) / (width / 8),
+    }
+}
+
+# Peak memory bandwidth per AMD architecture; `max_bps` returns these values
+# unchanged for HIP devices (presumably bytes per second -- TODO confirm).
+amd_bps_by_arch = {
+    'gfx90a': 3.2 * 1e12,
+    'gfx942': 5.3 * 1e12,
+    'gfx950': 8.0 * 1e12,
+}
+
+# FP8 Matrix Performance(FLOPS/clock/CU)
+# For gfx90a we use the performance of INT8 since it doesn't support FP8 matrix operations.
+# `max_flops` multiplies these by the CU count and the clock rate for HIP devices.
+amd_fp8_flops_by_arch = {'gfx90a': 1024, 'gfx942': 4096, 'gfx950': 8192}
+
+
+def max_flops(device_type, arch, width, num_sms, clock_rate):
+    """
+    Calculate the maximum FLOPS for a given device type and width.
+
+    Args:
+        device_type (str): The type of device (e.g., "CUDA", "HIP").
+        arch (str): The architecture of the device (e.g., "80", "90", "gfx942").
+        width (int): The width in bits.
+        num_sms (int): The number of streaming multiprocessors (compute units on HIP).
+        clock_rate (float): The clock rate in kHz; the formulas scale it by 1e3
+            to obtain cycles per second.
+
+    Returns:
+        float: The maximum FLOPS for the given device type and width.
+
+    Raises:
+        ValueError: If the device type or the architecture is not supported.
+    """
+    if device_type == "HIP":
+        # Report unknown architectures the same way as the CUDA path does,
+        # instead of leaking a bare KeyError from the table lookup.
+        if arch not in amd_fp8_flops_by_arch:
+            raise ValueError(f"Unsupported architecture: {arch}")
+        return amd_fp8_flops_by_arch[arch] * num_sms * clock_rate * 1e3 / (width / 8)
+
+    if device_type not in flops_by_device:
+        raise ValueError(f"Unsupported device type: {device_type}")
+
+    if arch not in flops_by_device[device_type]:
+        raise ValueError(f"Unsupported architecture: {arch}")
+
+    flops_func = flops_by_device[device_type][arch]
+
+    return flops_func(width, num_sms=num_sms, clock_rate=clock_rate)
+
+
+def max_bps(device_type, arch, bus_width, memory_clock_rate):
+    """
+    Calculate the maximum bytes per second for a device.
+
+    For CUDA the value is computed from the bus width and memory clock rate;
+    for HIP it is looked up per architecture and the other two arguments are
+    ignored.
+
+    Args:
+        device_type (str): The type of device ("CUDA" or "HIP").
+        arch (str): The architecture of the device; only used for HIP.
+        bus_width (int): The bus width in bits.
+        memory_clock_rate (float): The memory clock rate in kHz; it is scaled by
+            1e3 to obtain cycles per second.
+
+    Returns:
+        float: The maximum bytes per second.
+
+    Raises:
+        ValueError: If the device type is not supported.
+        KeyError: If the HIP architecture has no bandwidth entry.
+    """
+    if device_type == "CUDA":
+        # The factor 2 presumably accounts for double data rate memory -- TODO confirm.
+        return 2 * bus_width * memory_clock_rate * 1e3 / 8
+    if device_type == "HIP":
+        return amd_bps_by_arch[arch]
+    # Explicit error instead of `assert`, which is stripped under `python -O`.
+    raise ValueError(f"Unsupported device type: {device_type}")
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/state.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/state.py
new file mode 100644
index 0000000000000000000000000000000000000000..decb8e1d0112f312ce1366f7a56df4d60c57d09b
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/state.py
@@ -0,0 +1,61 @@
+from triton._C.libproton import proton as libproton
+from .flags import flags
+from functools import wraps
+
+
+class state:
+    """
+    A context manager and decorator for entering and exiting a state.
+
+    Nothing is recorded while `flags.profiling_on` is false.
+
+    Usage:
+        context manager:
+        ```python
+        with proton.state("test0"):
+            foo[1,](x, y)
+        ```
+
+        decorator:
+        ```python
+        @proton.state("test0")
+        def foo(x, y):
+            ...
+        ```
+
+    Args:
+        name (str): The name of the state.
+    """
+
+    def __init__(self, name: str) -> None:
+        self.name = name
+
+    def __enter__(self):
+        if not flags.profiling_on:
+            return self
+        libproton.enter_state(self.name)
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        if not flags.profiling_on:
+            return
+        libproton.exit_state()
+
+    def __call__(self, func):
+
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            if flags.profiling_on:
+                libproton.enter_state(self.name)
+            try:
+                return func(*args, **kwargs)
+            finally:
+                # Exit the state even when `func` raises, matching what the
+                # context-manager form does through `__exit__`.
+                if flags.profiling_on:
+                    libproton.exit_state()
+
+        return wrapper
+
+
+def enter_state(name: str) -> None:
+    """Enter the state `name`. Unlike `state.__enter__`, this does not check `flags.profiling_on`."""
+    libproton.enter_state(name)
+
+
+def exit_state() -> None:
+    """Exit the current state. Unlike `state.__exit__`, this does not check `flags.profiling_on`."""
+    libproton.exit_state()
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/viewer.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/viewer.py
new file mode 100644
index 0000000000000000000000000000000000000000..86d25e59ba093a3cf0fc76422236879975815175
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/profiler/viewer.py
@@ -0,0 +1,426 @@
+import argparse
+from collections import namedtuple
+import json
+import pandas as pd
+
+try:
+    import hatchet as ht
+    from hatchet.query import NegationQuery
+except ImportError:
+    raise ImportError("Failed to import hatchet. `pip install llnl-hatchet` to get the correct version.")
+import numpy as np
+from triton.profiler.hooks.launch import COMPUTE_METADATA_SCOPE_NAME, LaunchHook
+from triton.profiler import specs
+
+
+def match_available_metrics(metrics, inclusive_metrics, exclusive_metrics):
+    """
+    Resolve requested metric names to the column names available in a profile.
+
+    A request matches a raw metric case-insensitively, either on the full raw
+    name or on the raw name with its parenthesised unit removed. Inclusive
+    metrics are returned with the suffix " (inc)". Requests with no match are
+    skipped as long as at least one request matches.
+
+    Args:
+        metrics (str | list[str]): The requested metric name(s).
+        inclusive_metrics (list[str]): Raw names of the inclusive metrics.
+        exclusive_metrics (list[str]): Raw names of the exclusive metrics.
+
+    Returns:
+        list[str]: One column name per matched request, in request order.
+
+    Raises:
+        RuntimeError: If none of the requests matches, including the case of an
+            empty request list.
+    """
+    if not isinstance(metrics, list):
+        metrics = [metrics]
+    candidates = inclusive_metrics + exclusive_metrics
+    ret = []
+    for metric in metrics:
+        wanted = metric.lower()
+        for raw_metric in candidates:
+            raw_metric_no_unit = raw_metric.split("(")[0].strip().lower()
+            if wanted in (raw_metric, raw_metric.lower(), raw_metric_no_unit):
+                suffix = " (inc)" if raw_metric in inclusive_metrics else ""
+                ret.append(raw_metric + suffix)
+                break
+    if not ret:
+        # Build the message from the request list itself; the loop variable is
+        # unbound when the list is empty.
+        requested = ", ".join(metric.lower() for metric in metrics)
+        raise RuntimeError(f"Metric {requested} is not found. Use the --list flag to list available metrics")
+    return ret
+
+
+def remove_frames(database: list):
+    """
+    Prune metadata-only and empty frames from a loaded profile.
+
+    Args:
+        database (list): The top-level nodes of the profile. Nodes without a
+            "frame" key are kept untouched. Kept frame nodes get their
+            "children" list replaced in place.
+
+    Returns:
+        list: The nodes that survive pruning, in their original order.
+    """
+    # We first find frames that match either one of the two conditions:
+    # 1. The frame name is COMPUTE_METADATA_SCOPE_NAME
+    # 2. The frame has no metrics and no children
+    # Then we go up from the located nodes and remove the parents if all children were
+    # metadata nodes
+    def remove_frame_helper(node):
+        if "frame" not in node:
+            return node
+        if node["frame"]["name"] == COMPUTE_METADATA_SCOPE_NAME:
+            return None
+        # Treat a missing "metrics" or "children" entry as empty rather than
+        # failing with KeyError.
+        children = node.get("children", [])
+        if not node.get("metrics") and not children:
+            return None
+        kept_children = [kept for kept in map(remove_frame_helper, children) if kept is not None]
+        if children and not kept_children:
+            # Every child was pruned, so this parent goes as well.
+            return None
+        node["children"] = kept_children
+        return node
+
+    return [kept for kept in map(remove_frame_helper, database) if kept is not None]
+
+
+def get_raw_metrics(file):
+    """
+    Load a profile from an open JSON file and build a hatchet graph frame.
+
+    Args:
+        file: A readable file object containing the JSON profile.
+
+    Returns:
+        tuple: ``(gf, inclusive_metrics, exclusive_metrics, device_info)``.
+    """
+    nodes = remove_frames(json.load(file))
+    # The entry at index 1 is taken out as the device description; the rest
+    # is handed to hatchet.
+    device_info = nodes.pop(1)
+    graph_frame = ht.GraphFrame.from_literal(nodes)
+    inclusive = graph_frame.show_metric_columns()
+    exclusive = []
+    for column in graph_frame.dataframe.columns:
+        if column not in inclusive:
+            exclusive.append(column)
+    return graph_frame, inclusive, exclusive, device_info
+
+
+def get_min_time_flops(df, device_info):
+    """
+    Compute, per frame, the minimum time implied by its FLOP counts.
+
+    For every device and every width in `LaunchHook.flops_width`, the frame's
+    `flops<width>` count (missing values count as 0) is divided by the device's
+    peak FLOPS at that width, and the results are summed.
+
+    Args:
+        df (pandas.DataFrame): Frame table with a "device_id" column and
+            optional "flops<width>" columns.
+        device_info (dict): ``device_type -> device_index -> properties``; reads
+            "arch", "num_sms" and "clock_rate".
+
+    Returns:
+        pandas.DataFrame: One "min_time" column indexed like ``df``.
+    """
+    min_time_flops = pd.DataFrame(0.0, index=df.index, columns=["min_time"])
+    for device_type in device_info:
+        for device_index in device_info[device_type]:
+            device = device_info[device_type][device_index]
+            arch = device["arch"]
+            num_sms = device["num_sms"]
+            clock_rate = device["clock_rate"]
+            # The row selection depends only on the device, so compute it once
+            # rather than once per width.
+            idx = df["device_id"] == device_index
+            device_frames = df[idx]
+            for width in LaunchHook.flops_width:
+                column = f"flops{width}"
+                if column not in device_frames.columns:
+                    continue
+                max_flops = specs.max_flops(device_type, arch, width, num_sms, clock_rate)
+                min_time_flops.loc[idx, "min_time"] += device_frames[column].fillna(0) / max_flops
+    return min_time_flops
+
+
+def get_min_time_bytes(df, device_info):
+    """
+    Compute, per frame, the minimum time implied by its byte count.
+
+    Each frame's "bytes" value (missing values count as 0) is divided by the
+    peak bandwidth of the device it ran on.
+
+    Args:
+        df (pandas.DataFrame): Frame table with a "device_id" column and an
+            optional "bytes" column.
+        device_info (dict): ``device_type -> device_index -> properties``; reads
+            "arch", "memory_clock_rate" and "bus_width".
+
+    Returns:
+        pandas.DataFrame: One "min_time" column indexed like ``df``; all zeros
+        when ``df`` has no "bytes" column.
+    """
+    min_time_bytes = pd.DataFrame(0.0, index=df.index, columns=["min_time"])
+    if "bytes" not in df.columns:
+        # Mirror the flops path, which skips metrics the profile does not have.
+        return min_time_bytes
+    for device_type in device_info:
+        for device_index in device_info[device_type]:
+            idx = df["device_id"] == device_index
+            device_frames = df[idx]
+            device = device_info[device_type][device_index]
+            memory_clock_rate = device["memory_clock_rate"]  # in khz
+            bus_width = device["bus_width"]  # in bits
+            peak_bandwidth = specs.max_bps(device_type, device['arch'], bus_width, memory_clock_rate)
+            min_time_bytes.loc[idx, "min_time"] += device_frames["bytes"].fillna(0) / peak_bandwidth
+    return min_time_bytes
+
+
+# `name` is the source metric a family of derived metrics is computed from;
+# `factor` maps each derived metric name to its unit factor.
+FactorDict = namedtuple("FactorDict", ["name", "factor"])
+# For the time families the factor is the number of seconds in one unit.
+time_factor_dict = FactorDict("time", {"time/s": 1, "time/ms": 1e-3, "time/us": 1e-6, "time/ns": 1e-9})
+avg_time_factor_dict = FactorDict("avg_time", {f"avg_{key}": value for key, value in time_factor_dict.factor.items()})
+cpu_time_factor_dict = FactorDict("cpu_time",
+                                  {"cpu_time/s": 1, "cpu_time/ms": 1e-3, "cpu_time/us": 1e-6, "cpu_time/ns": 1e-9})
+avg_cpu_time_factor_dict = FactorDict("avg_cpu_time",
+                                      {f"avg_{key}": value
+                                       for key, value in cpu_time_factor_dict.factor.items()})
+# For the rate families the factor is the divisor applied to the per-second value.
+bytes_factor_dict = FactorDict("bytes", {"byte/s": 1, "gbyte/s": 1e9, "tbyte/s": 1e12})
+
+# Maps every derivable rate metric name to the FactorDict describing it.
+derivable_metrics = {
+    **{key: bytes_factor_dict
+       for key in bytes_factor_dict.factor.keys()},
+}
+
+# FLOPS have a specific width to their metric
+default_flop_factor_dict = {"flop/s": 1, "gflop/s": 1e9, "tflop/s": 1e12}
+derivable_metrics.update(
+    {key: FactorDict("flops", default_flop_factor_dict)
+     for key in default_flop_factor_dict.keys()})
+# NOTE: this loop runs at import time and leaves `width`, `factor_name` and
+# `factor_dict` behind as module-level names.
+for width in LaunchHook.flops_width:
+    factor_name = f"flops{width}"
+    factor_dict = {f"flop{width}/s": 1, f"gflop{width}/s": 1e9, f"tflop{width}/s": 1e12}
+    derivable_metrics.update({key: FactorDict(factor_name, factor_dict) for key in factor_dict.keys()})
+
+
+def derive_metrics(gf, metrics, inclusive_metrics, exclusive_metrics, device_info):
+    """
+    Resolve the requested metrics to dataframe columns, creating derived columns as needed.
+
+    Derived columns are written into `gf.dataframe`, and every resulting column
+    name is appended to `gf.inc_metrics` (names ending in "(inc)") or
+    `gf.exc_metrics` (all others).
+
+    Args:
+        gf: The graph frame whose `dataframe` holds the raw metric columns.
+        metrics (list[str]): Requested metric names, e.g. "util", "tflop16/s",
+            "avg_time/ms", "time/%" or a raw metric name.
+        inclusive_metrics (list[str]): Raw names of the inclusive metrics.
+        exclusive_metrics (list[str]): Raw names of the exclusive metrics.
+        device_info (dict): Device properties, used only for "util".
+
+    Returns:
+        list[str]: The column name for each requested metric, in request order.
+
+    Raises:
+        ValueError: If a metric of the form "<name>/<unit>" that is not a known
+            derived metric has a unit other than "%".
+        RuntimeError: Propagated from `match_available_metrics` when a source
+            metric is not available.
+    """
+    derived_metrics = []
+
+    def get_time_seconds(df, metric, factor_dict):
+        # The unit is the text inside the first pair of parentheses of the
+        # matched column name; the matching factor converts the column to seconds.
+        time_metric_name = match_available_metrics(metric, inclusive_metrics, exclusive_metrics)[0]
+        time_unit = factor_dict.name + "/" + time_metric_name.split("(")[1].split(")")[0]
+        return df[time_metric_name] * factor_dict.factor[time_unit]
+
+    for metric in metrics:
+        if metric == "util":  # exclusive
+            min_time_bytes = get_min_time_bytes(gf.dataframe, device_info)
+            min_time_flops = get_min_time_flops(gf.dataframe, device_info)
+            time_sec = get_time_seconds(gf.dataframe, "time", time_factor_dict)
+            # Frames without a device id get no utilization value.
+            internal_frame_indices = gf.dataframe["device_id"].isna()
+            gf.dataframe["util"] = min_time_flops["min_time"].combine(min_time_bytes["min_time"], max) / time_sec
+            gf.dataframe.loc[internal_frame_indices, "util"] = np.nan
+            derived_metrics.append("util")
+        elif metric in derivable_metrics:  # flop<width>/s, <t/g>byte/s, inclusive
+            derivable_metric = derivable_metrics[metric]
+            metric_name = derivable_metric.name
+            metric_factor_dict = derivable_metric.factor
+            matched_metric_name = match_available_metrics(metric_name, inclusive_metrics, exclusive_metrics)[0]
+            gf.dataframe[f"{metric} (inc)"] = (gf.dataframe[matched_metric_name] /
+                                               (get_time_seconds(gf.dataframe, "time", time_factor_dict)) /
+                                               metric_factor_dict[metric])
+            derived_metrics.append(f"{metric} (inc)")
+        elif (metric in time_factor_dict.factor or metric in cpu_time_factor_dict.factor
+              or metric in avg_time_factor_dict.factor or metric in avg_cpu_time_factor_dict.factor):  # inclusive
+            is_cpu = metric in cpu_time_factor_dict.factor or metric in avg_cpu_time_factor_dict.factor
+            is_avg = metric in avg_time_factor_dict.factor or metric in avg_cpu_time_factor_dict.factor
+
+            factor_dict = ((avg_cpu_time_factor_dict if is_avg else cpu_time_factor_dict) if is_cpu else
+                           (avg_time_factor_dict if is_avg else time_factor_dict))
+            metric_name = "cpu_time" if is_cpu else "time"
+            metric_time_unit = factor_dict.name + "/" + metric.split("/")[1]
+
+            time_value = get_time_seconds(gf.dataframe, metric_name, factor_dict)
+            if is_avg:
+                time_value = time_value / gf.dataframe["count (inc)"]
+
+            # Convert from seconds to the requested unit.
+            gf.dataframe[f"{metric} (inc)"] = time_value / factor_dict.factor[metric_time_unit]
+            derived_metrics.append(f"{metric} (inc)")
+        else:
+            metric_name_and_unit = metric.split("/")
+            metric_name = metric_name_and_unit[0]
+            if len(metric_name_and_unit) > 1:  # percentage, exclusive or inclusive
+                metric_unit = metric_name_and_unit[1]
+                if metric_unit != "%":
+                    raise ValueError(f"Unsupported unit {metric_unit}")
+                matched_metric_name = match_available_metrics(metric_name, inclusive_metrics, exclusive_metrics)[0]
+                single_frame = gf.dataframe[matched_metric_name]
+                suffix = ""
+                if "(inc)" in matched_metric_name:
+                    suffix = " (inc)"
+                    # Inclusive total is read from the first row
+                    # (presumably the root frame -- TODO confirm).
+                    total = gf.dataframe[matched_metric_name].iloc[0]
+                else:
+                    total = gf.dataframe[matched_metric_name].sum()
+                gf.dataframe[metric + suffix] = (single_frame / total) * 100.0
+                derived_metrics.append(metric + suffix)
+            else:
+                # A raw metric: no new column, just report the matched name.
+                matched_metric_name = match_available_metrics(metric_name, inclusive_metrics, exclusive_metrics)[0]
+                derived_metrics.append(matched_metric_name)
+
+    # Update derived metrics to the graph frame
+    for derived_metric in derived_metrics:
+        if derived_metric.endswith("(inc)"):
+            gf.inc_metrics.append(derived_metric)
+        else:
+            gf.exc_metrics.append(derived_metric)
+
+    return derived_metrics
+
+
+def format_frames(gf, format):
+    """
+    Shorten the frame names in `gf.dataframe["name"]` according to `format`.
+
+    Args:
+        gf: Object whose `dataframe` has a "name" column of strings.
+        format (str): "file_function_line", "function_line" or "file_function";
+            any other value leaves the names unchanged.
+
+    Returns:
+        The same `gf`, with its "name" column rewritten in place.
+    """
+
+    def file_and_function(frame_name):
+        file_part = frame_name.split('/')[-1].split(':')[0]
+        function_part = frame_name.split('@')[-1].split(':')[0]
+        return f"{file_part}@{function_part}"
+
+    shorteners = {
+        "file_function_line": lambda frame_name: frame_name.split("/")[-1],
+        "function_line": lambda frame_name: frame_name.split(":")[-1],
+        "file_function": file_and_function,
+    }
+    shorten = shorteners.get(format)
+    if shorten is not None:
+        gf.dataframe["name"] = gf.dataframe["name"].apply(shorten)
+    return gf
+
+
+def filter_frames(gf, include=None, exclude=None, threshold=None, metric=None):
+    """
+    Apply the optional include / exclude / threshold filters to a graph frame.
+
+    Each filter that is set is applied in turn with `gf.filter(..., squash=True)`.
+
+    Args:
+        gf: The graph frame to filter.
+        include (str, optional): Regular expression matched against frame
+            names; paths through a matching frame are kept.
+        exclude (str, optional): Regular expression matched against frame
+            names; the include-style query built from it is negated.
+        threshold (float, optional): Keep frames whose `metric` is >= this
+            value. A falsy value (None or 0) disables the filter.
+        metric (str, optional): Column the threshold is compared against.
+
+    Returns:
+        The filtered graph frame.
+    """
+    # NOTE: the patterns are interpolated into the query text verbatim.
+    if include:
+        query = f"""
+MATCH ("*")->(".", p)->("*")
+WHERE p."name" =~ "{include}"
+"""
+        gf = gf.filter(query, squash=True)
+    if exclude:
+        inclusion_query = f"""
+MATCH (".", p)->("*")
+WHERE p."name" =~ "{exclude}"
+"""
+        query = NegationQuery(inclusion_query)
+        gf = gf.filter(query, squash=True)
+    if threshold:
+        query = ["*", {metric: f">= {threshold}"}]
+        gf = gf.filter(query, squash=True)
+    return gf
+
+
+def emit_warnings(gf, metrics):
+    """Print a warning when "bytes (inc)" is displayed and holds a negative value (NaN entries are ignored)."""
+    if "bytes (inc)" not in metrics:
+        return
+    smallest = np.nanmin(gf.dataframe["bytes (inc)"].values)
+    if smallest < 0:
+        print("Warning: Negative byte values detected, this is usually the result of a datatype overflow\n")
+
+
+def print_tree(gf, metrics, depth=100, format=None, print_sorted=False):
+    """
+    Print the profile tree and, optionally, a flat list sorted by the first metric.
+
+    Args:
+        gf: The graph frame to display.
+        metrics (list[str]): Metric columns to show; the first one drives sorting.
+        depth (int): Maximum tree depth to render.
+        format (str, optional): Frame-name format passed to `format_frames`.
+        print_sorted (bool): Also print the frames in descending order of
+            the first metric, skipping the first row of that ordering.
+    """
+    gf = format_frames(gf, format)
+    rendered = gf.tree(metric_column=metrics, expand_name=True, depth=depth, render_header=False)
+    print(rendered)
+
+    if print_sorted:
+        primary = metrics[0]
+        print("Sorted kernels by metric " + primary)
+        sorted_df = gf.dataframe.sort_values(by=[primary], ascending=False)
+        for row in range(1, len(sorted_df)):
+            full_name = sorted_df.iloc[row]["name"]
+            # Names longer than 100 characters are truncated and marked.
+            if len(full_name) > 100:
+                kernel_name = full_name[:100] + "..."
+            else:
+                kernel_name = full_name
+            print("{:105} {:.4}".format(kernel_name, sorted_df.iloc[row][primary]))
+    emit_warnings(gf, metrics)
+
+
+def read(filename):
+    """
+    Load a profile file and refresh its inclusive metric columns.
+
+    Returns:
+        tuple: ``(gf, inclusive_metrics, exclusive_metrics, device_info)``.
+
+    Raises:
+        AssertionError: If the profile contains no metrics at all.
+    """
+    with open(filename, "r") as f:
+        loaded = get_raw_metrics(f)
+        gf, inclusive_metrics, exclusive_metrics, device_info = loaded
+        assert len(inclusive_metrics + exclusive_metrics) > 0, "No metrics found in the input file"
+        gf.update_inclusive_columns()
+        return gf, inclusive_metrics, exclusive_metrics, device_info
+
+
+def parse(metrics, filename, include=None, exclude=None, threshold=None):
+    """
+    Read a profile, derive the requested metrics and apply the frame filters.
+
+    Returns:
+        tuple: ``(gf, derived_metrics)`` -- the filtered graph frame and the
+        column names of the requested metrics.
+    """
+    gf, inclusive_metrics, exclusive_metrics, device_info = read(filename)
+    derived = derive_metrics(gf, metrics, inclusive_metrics, exclusive_metrics, device_info)
+    # TODO: generalize to support multiple metrics, not just the first one
+    filtered = filter_frames(gf, include, exclude, threshold, derived[0])
+    return filtered, derived
+
+
+def apply_diff_profile(gf, derived_metrics, diff_file, metrics, include, exclude, threshold):
+    """
+    Subtract a second profile from `gf`.
+
+    The second profile is parsed with the same metrics and filters. Both graph
+    frames then get their inclusive / exclusive metric lists set to the derived
+    metrics only, and the result of ``gf.sub(second)`` is returned.
+    """
+    # Compute the diff against a secondary profile while keeping derived metrics consistent.
+    gf2, _ = parse(metrics, diff_file, include, exclude, threshold)
+
+    derived_inc_metrics = []
+    derived_exc_metrics = []
+    for metric in derived_metrics:
+        bucket = derived_inc_metrics if metric.endswith("(inc)") else derived_exc_metrics
+        bucket.append(metric)
+
+    for frame in (gf, gf2):
+        frame.inc_metrics = derived_inc_metrics
+        frame.exc_metrics = derived_exc_metrics
+    return gf.sub(gf2)
+
+
+def show_metrics(file_name):
+    """Print the inclusive and exclusive metric names of a profile, lower-cased and without units."""
+    with open(file_name, "r") as f:
+        _, inclusive_metrics, exclusive_metrics, _ = get_raw_metrics(f)
+        sections = (
+            ("Available inclusive metrics:", inclusive_metrics),
+            ("Available exclusive metrics:", exclusive_metrics),
+        )
+        for heading, raw_metrics in sections:
+            print(heading)
+            if not raw_metrics:
+                continue
+            for raw_metric in raw_metrics:
+                raw_metric_no_unit = raw_metric.split("(")[0].strip().lower()
+                print(f"- {raw_metric_no_unit}")
+
+
+def main():
+    """
+    Command-line entry point of the proton profile viewer.
+
+    Parses the options, then either lists the metrics available in the given
+    profile (`--list`) or prints the tree for the requested metrics
+    (`--metrics`), optionally as a difference against a second profile
+    (`--diff-profile`). When neither `--list` nor `--metrics` is given, nothing
+    is printed.
+
+    Raises:
+        AssertionError: If the number of leftover arguments is not exactly one.
+        ValueError: If both `--include` and `--exclude` are given.
+    """
+    argparser = argparse.ArgumentParser(
+        description="Performance data viewer for proton profiles.",
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+    argparser.add_argument(
+        "-l",
+        "--list",
+        action="store_true",
+        help="""List available metrics. Metric names are case insensitive and ignore units.
+Derived metrics can be created when source metrics are available.
+- time/s, time/ms, time/us, time/ns: time
+- avg_time/s, avg_time/ms, avg_time/us, avg_time/ns: time / count
+- flop[<8/16/32/64>]/s, gflop[<8/16/32/64>]/s, tflop[<8/16/32/64>]/s: flops / time
+- byte/s, gbyte/s, tbyte/s: bytes / time
+- util: max(sum(flops<width>) / peak_flops<width>_time, sum(bytes) / peak_bandwidth_time)
+- <metric>/%%: frame(metric) / sum(metric). Only available for inclusive metrics (e.g. time)
+""",
+    )
+    argparser.add_argument(
+        "-m",
+        "--metrics",
+        type=str,
+        default=None,
+        help="""At maximum two metrics can be specified, separated by comma.
+There are two modes:
+1) Choose the output metric to display. It's case insensitive and ignore units.
+2) Derive a new metric from existing metrics.
+""",
+    )
+    argparser.add_argument(
+        "-i",
+        "--include",
+        type=str,
+        default=None,
+        help=
+        """Find frames that match the given regular expression and return all nodes in the paths that pass through the matching frames.
+For example, the following command will display all paths that contain frames that contains "test":
+```
+proton-viewer -i ".*test.*" path/to/file.json
+```
+""",
+    )
+    argparser.add_argument(
+        "-e",
+        "--exclude",
+        type=str,
+        default=None,
+        help="""Exclude frames that match the given regular expression and their children.
+For example, the following command will exclude all paths starting from frames that contains "test":
+```
+proton-viewer -e ".*test.*" path/to/file.json
+```
+""",
+    )
+    argparser.add_argument(
+        "-t",
+        "--threshold",
+        type=float,
+        default=None,
+        help=
+        "Exclude frames(kernels) whose metrics are below the given threshold. This filter only applies on the first metric.",
+    )
+    argparser.add_argument(
+        "-d",
+        "--depth",
+        type=int,
+        default=100,
+        help="The depth of the tree to display",
+    )
+    argparser.add_argument(
+        "-f",
+        "--format",
+        type=str,
+        choices=["full", "file_function_line", "function_line", "file_function"],
+        default="full",
+        help="""Formatting the frame name.
+- full: include the path, file name, function name and line number.
+- file_function_line: include the file name, function name and line number.
+- function_line: include the function name and line number.
+- file_function: include the file name and function name.
+""",
+    )
+    argparser.add_argument(
+        "--print-sorted",
+        action="store_true",
+        default=False,
+        help="Sort output by metric value instead of chronologically",
+    )
+    argparser.add_argument(
+        "--diff-profile",
+        "-diff",
+        type=str,
+        default=None,
+        help="Compare two profiles. When used as 'proton-viewer -m time -diff file1.log file2.log', "
+        "computes the difference: file2['time'] - file1['time']",
+    )
+
+    # The profile path is not a declared argument; it is whatever single
+    # argument the parser does not recognise.
+    args, target_args = argparser.parse_known_args()
+    assert len(target_args) == 1, "Must specify a file to read"
+
+    file_name = target_args[0]
+    metrics = args.metrics.split(",") if args.metrics else None
+    include = args.include
+    exclude = args.exclude
+    threshold = args.threshold
+    depth = args.depth
+    format = args.format
+    diff = args.diff_profile
+    print_sorted = args.print_sorted
+    if include and exclude:
+        raise ValueError("Cannot specify both include and exclude")
+    # --list takes priority over --metrics.
+    if args.list:
+        show_metrics(file_name)
+    elif metrics:
+        gf, derived_metrics = parse(metrics, file_name, include, exclude, threshold)
+        if diff:
+            gf = apply_diff_profile(gf, derived_metrics, diff, metrics, include, exclude, threshold)
+        print_tree(gf, derived_metrics, depth, format, print_sorted)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b3979d28d9a4359be5ceb3d2dea08f4d56899e6
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__init__.py
@@ -0,0 +1,23 @@
+from .autotuner import (Autotuner, Config, Heuristics, autotune, heuristics)
+from .cache import RedisRemoteCacheBackend, RemoteCacheBackend
+from .driver import driver
+from .jit import JITFunction, KernelInterface, MockTensor, TensorWrapper, reinterpret
+from .errors import OutOfResources, InterpreterError
+
+__all__ = [  # public API of this package: exactly the names imported above, sorted case-insensitively
+    "autotune",
+    "Autotuner",
+    "Config",
+    "driver",
+    "Heuristics",
+    "heuristics",
+    "InterpreterError",
+    "JITFunction",
+    "KernelInterface",
+    "MockTensor",
+    "OutOfResources",
+    "RedisRemoteCacheBackend",
+    "reinterpret",
+    "RemoteCacheBackend",
+    "TensorWrapper",
+]
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..74daacc3c68b8cf3150b07b0f1b9df6c0f2c1198
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/_allocation.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/_allocation.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c23dccab2c67e1f3393408d39d85d40bc77dce7f
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/_allocation.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/_async_compile.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/_async_compile.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..803bb27d3ab0f94e765520b40a26b82d5a9b4117
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/_async_compile.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/autotuner.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/autotuner.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..67c2eb5c432421467f63476f35ce5a21c8d72aa4
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/autotuner.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/build.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/build.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b50c094b360215555bd065622531f03aa558664
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/build.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/cache.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/cache.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..168a92eda1b522e9689a4f80fe832050a6446b63
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/cache.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/driver.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/driver.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bf7b2d93b6af5450eef703977f157d7fb6bc4008
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/driver.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/errors.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/errors.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f44a11c225d63332c59d377588d357fa612d302c
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/errors.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/jit.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/jit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..01c4b0b5ff703e1d06ab525c33d385b8eb7c62c0
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/__pycache__/jit.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/_allocation.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/_allocation.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3ef7d56c4c7a595a5bce9be72be7fb4e0e3f4ed
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/_allocation.py
@@ -0,0 +1,64 @@
+from typing import Optional, Protocol
+from contextvars import ContextVar
+
+
+class Buffer(Protocol):
+    """Structural type satisfied by any object whose ``data_ptr()`` returns an ``int``."""
+
+    def data_ptr(self) -> int: ...
+
+
+class Allocator(Protocol):
+    """Structural type for callables taking ``(size, alignment, stream)`` and returning a ``Buffer``."""
+
+    def __call__(self, size: int, alignment: int, stream: Optional[int]) -> Buffer: ...
+
+
+class NullAllocator:
+    """Default allocator used when none was configured; calling it always raises ``RuntimeError``."""
+    def __call__(self, size: int, alignment: int, stream: Optional[int]) -> Buffer:
+        hint = "Use triton.set_allocator to specify an allocator."
+        raise RuntimeError("Kernel requires a runtime memory allocation, but no allocator was set. " + hint)
+
+
+_NULL_ALLOCATOR = NullAllocator()  # shared instance used as the "nothing configured" default
+# Context-local current allocator; until set_allocator() is called it is the raising NullAllocator.
+_allocator: ContextVar[Allocator] = ContextVar("_allocator", default=_NULL_ALLOCATOR)
+
+
+def set_allocator(allocator: Allocator) -> None:
+    """
+    Store ``allocator`` in the ``_allocator`` context variable. The allocator function is called
+    during kernel launch for kernels that require additional global memory workspace.
+    """
+    _allocator.set(allocator)
+
+
+class _AllocatorWrapper:
+    """
+    Mutable holder for one allocator that mimics the ContextVar ``get``/``set`` API, so the
+    profile allocator can be handled the same way as the regular ``_allocator``.
+    """
+
+    def __init__(self, allocator: Allocator) -> None:
+        self.set(allocator)
+
+    def get(self) -> Allocator:
+        return self._allocator
+
+    def set(self, allocator: Allocator) -> None:
+        self._allocator = allocator
+
+    def __call__(self, size: int, alignment: int, stream: Optional[int]) -> Buffer:
+        return self.get()(size, alignment, stream)
+
+
+_profile_allocator = _AllocatorWrapper(_NULL_ALLOCATOR)  # module-level holder; starts with the raising default
+
+
+def set_profile_allocator(allocator: Optional[Allocator]) -> None:
+    """Install the profile allocator (called before launch of kernels needing extra global memory workspace)."""
+    # ``None`` means "unset": fall back to the raising default allocator.
+    if allocator is None:
+        allocator = _NULL_ALLOCATOR
+    _profile_allocator.set(allocator)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/_async_compile.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/_async_compile.py
new file mode 100644
index 0000000000000000000000000000000000000000..518743bde7afae92ee2100f16cdb804ef4338c90
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/_async_compile.py
@@ -0,0 +1,62 @@
+from __future__ import annotations
+from typing import Callable, Optional
+from concurrent.futures import Executor, as_completed, Future
+from contextvars import ContextVar
+# The AsyncCompileMode entered in the current context, or None while no mode is active.
+active_mode: ContextVar[Optional[AsyncCompileMode]] = ContextVar("async_compile_active_mode", default=None)
+
+
+class FutureKernel:
+    """Pairs a compile ``Future`` with a finalize callback and remembers the finalized kernel."""
+
+    def __init__(self, finalize_compile: Callable, future: Future):
+        self.future = future
+        self.finalize_compile = finalize_compile
+        self.kernel = None
+
+    def result(self, ignore_errors: bool = False):
+        """Return the kernel, finalizing it on first success; ``None`` when a failure was ignored."""
+        if self.kernel is not None:
+            return self.kernel
+        try:
+            compiled = self.future.result()
+        except Exception:
+            if not ignore_errors:
+                raise
+            return None
+        self.finalize_compile(compiled)
+        self.kernel = compiled
+        return compiled
+
+
+class AsyncCompileMode:
+    """Context manager that compiles kernels on ``executor`` and finalizes all of them on exit."""
+    def __init__(self, executor: Executor, *, ignore_errors=False):
+        self.executor = executor
+        self.ignore_errors = ignore_errors  # forwarded to FutureKernel.result() on exit
+        self.raw_futures = []  # executor futures, in submission order
+        self.future_kernels = {}  # key -> FutureKernel; a key is only submitted once
+
+    def submit(self, key, compile_fn, finalize_fn):
+        """Schedule ``compile_fn`` unless ``key`` was already submitted; return its ``FutureKernel``."""
+        existing = self.future_kernels.get(key)
+        if existing is not None:
+            return existing
+        future = self.executor.submit(compile_fn)
+        future._key = key  # lets __exit__ map a completed future back to its FutureKernel
+        self.raw_futures.append(future)
+        future_kernel = self.future_kernels[key] = FutureKernel(finalize_fn, future)
+        return future_kernel
+
+    def __enter__(self):
+        if active_mode.get() is not None:
+            raise RuntimeError("Another AsyncCompileMode is already active")
+        active_mode.set(self)
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        try:  # finalize outstanding compiles; the mode is cleared even if one of them raises
+            for future in as_completed(self.raw_futures):
+                self.future_kernels[future._key].result(self.ignore_errors)
+        finally:
+            active_mode.set(None)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/autotuner.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/autotuner.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c4d710496fa4143597da86d45bd3b9942ca65fe
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/autotuner.py
@@ -0,0 +1,483 @@
+from __future__ import annotations
+
+import builtins
+import time
+import inspect
+import hashlib
+import json
+from functools import cached_property
+from typing import Dict, Tuple, List, Optional
+
+from .. import knobs
+from .jit import KernelInterface, JITFunction
+from .errors import OutOfResources, PTXASError, AutotunerError
+from .driver import driver
+from .cache import get_cache_manager, triton_key
+from triton._C.libtriton import get_cache_invalidating_env_vars
+
+
+class Autotuner(KernelInterface):
+    """Runs ``fn`` with the fastest config; with several configs they are benchmarked once per cache key."""
+    def __init__(self, fn, arg_names, configs, key, reset_to_zero, restore_value, pre_hook=None, post_hook=None,
+                 prune_configs_by: Optional[Dict] = None, warmup=None, rep=None, use_cuda_graph=False, do_bench=None,
+                 cache_results=False):
+        """
+        :param prune_configs_by: a dict of functions that are used to prune configs, fields:
+            'perf_model': performance model used to predict running time with different configs, returns running time
+            'top_k': number of configs to bench
+            'early_config_prune': a function used to prune configs. It should have the signature
+                `prune_configs_by( configs: List[triton.Config], named_args: Dict[str, Any], **kwargs: Dict[str, Any]) -> List[triton.Config]:`
+                and return pruned configs. It should return at least one config.
+        """
+        if not configs:
+            self.configs = [Config({}, num_warps=4, num_stages=3, num_ctas=1)]
+        else:
+            self.configs = configs
+        self.keys = key
+        self.cache: Dict[Tuple, Config] = {}
+        self.arg_names = arg_names
+        self.cache_results = (cache_results or knobs.autotuning.cache) and not knobs.runtime.interpret
+
+        # Reset to zero or restore values
+        self.reset_to_zero = []
+        if reset_to_zero is not None:
+            self.reset_to_zero = list(reset_to_zero)
+        self.restore_value = []
+        if restore_value is not None:
+            self.restore_value = list(restore_value)
+
+        # Hook to reset or restore for required tensors
+        self.pre_hook = lambda kwargs, reset_only=False: 0
+        self.post_hook = lambda kwargs, exception: 0
+        self.user_defined_pre_hook = False
+        self.user_defined_post_hook = False
+        if pre_hook:
+            self.pre_hook = pre_hook
+            self.user_defined_pre_hook = True
+        elif (len(self.reset_to_zero) > 0 or len(self.restore_value) > 0):
+
+            def _pre_hook(kwargs, reset_only=False):
+                for name in self.reset_to_zero:
+                    kwargs[name].zero_()
+                if not reset_only:
+                    self.restore_copies = {name: kwargs[name].clone() for name in self.restore_value}
+
+            self.pre_hook = _pre_hook
+
+        if post_hook:
+            self.post_hook = post_hook
+            self.user_defined_post_hook = True
+        elif len(self.restore_value) > 0:
+
+            def _post_hook(kwargs, exception):
+                for name in self.restore_value:
+                    kwargs[name].copy_(self.restore_copies[name])
+                self.restore_copies = {}
+
+            self.post_hook = _post_hook
+
+        self.perf_model = None
+        self.configs_top_k = 1.0  # float <= 1.0: fraction of the configs; int: absolute count (see prune_configs)
+        self.early_config_prune = None
+        if prune_configs_by:
+            self.perf_model = prune_configs_by.get("perf_model", self.perf_model)
+            self.configs_top_k = prune_configs_by.get("top_k", self.configs_top_k)
+            self.early_config_prune = prune_configs_by.get("early_config_prune", self.early_config_prune)
+
+        self.fn = fn
+        self.base_fn = fn
+        while not inspect.isfunction(self.base_fn):  # follow the .fn chain down to the plain Python function
+            self.base_fn = self.base_fn.fn
+
+        self._do_bench = do_bench
+        self.num_warmups = warmup
+        self.num_reps = rep
+        self.use_cuda_graph = use_cuda_graph
+
+        # If we got explicitly called via the old interface, raise a warning
+        # and proceed with the old behavior.
+        if warmup is not None or rep is not None or use_cuda_graph:
+            import warnings
+            warnings.warn(("warmup, rep, and use_cuda_graph parameters are deprecated. See "
+                           "https://github.com/triton-lang/triton/pull/4496 for details."), DeprecationWarning,
+                          stacklevel=1)
+            if use_cuda_graph:
+                from ..testing import do_bench_cudagraph
+                self._do_bench = lambda kernel_call, quantiles: do_bench_cudagraph(
+                    kernel_call,
+                    rep=rep if rep is not None else 100,
+                    quantiles=quantiles,
+                )
+                return
+
+            import triton.testing
+            self._do_bench = lambda kernel_call, quantiles: triton.testing.do_bench(
+                kernel_call,
+                warmup=warmup if warmup is not None else 25,
+                rep=rep if rep is not None else 100,
+                quantiles=quantiles,
+            )
+            return
+
+    @cached_property
+    def do_bench(self):  # resolved on first use: the supplied benchmark function, else the active driver's
+        if self._do_bench is None:
+            return driver.active.get_benchmarker()
+        return self._do_bench
+
+    def _bench(self, *args, config, **meta):  # time one config; inf timings on the failures caught below
+        from ..compiler.errors import CompileTimeAssertionFailure
+
+        verbose = knobs.autotuning.print
+        if verbose:
+            print(f"Autotuning kernel {self.base_fn.__name__} with config {config}")
+
+        # check for conflicts, i.e. meta-parameters both provided
+        # as kwargs and by the autotuner
+        conflicts = meta.keys() & config.kwargs.keys()
+        if conflicts:
+            raise ValueError(f"Conflicting meta-parameters: {', '.join(conflicts)}."
+                             " Make sure that you don't re-define auto-tuned symbols.")
+        # augment meta-parameters with tunable ones
+        current = dict(meta, **config.all_kwargs())
+        full_nargs = {**self.nargs, **current}
+
+        def kernel_call():
+            if config.pre_hook:
+                config.pre_hook(full_nargs)
+            self.pre_hook(full_nargs)
+            try:
+                self.fn.run(
+                    *args,
+                    **current,
+                )
+            except Exception as e:
+                try:
+                    self.post_hook(full_nargs, exception=e)
+                finally:
+                    # Throw exception raised by `self.fn.run`
+                    raise
+
+            self.post_hook(full_nargs, exception=None)
+
+        try:
+            return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))
+        except (OutOfResources, CompileTimeAssertionFailure, PTXASError) as e:
+            if verbose:
+                print(f"Autotuning failed with {e}")
+            return [float("inf"), float("inf"), float("inf")]  # one entry per requested quantile
+
+    def check_disk_cache(self, tuning_key, configs, bench_fn):  # True iff timings were loaded from disk
+        # We can't serialize prehooks, so just give up and run the benchmarks.
+        if not tuning_key or any(cfg.pre_hook for cfg in configs):
+            bench_fn()
+            return False
+
+        from triton.compiler.compiler import make_backend
+
+        fn = self.fn
+        while not isinstance(fn, JITFunction):
+            fn = fn.fn
+
+        env_vars = get_cache_invalidating_env_vars()
+        cache_key = [
+            triton_key(),
+            make_backend(driver.active.get_current_target()).hash(),
+            fn.cache_key,
+            str(sorted(env_vars.items())),
+            str(tuning_key),
+        ] + [str(c) for c in configs]
+        cache_key = hashlib.sha256("-".join(cache_key).encode("utf-8")).hexdigest()
+        cache = get_cache_manager(cache_key)
+        file_name = f"{fn.__name__[:150]}.autotune.json"
+        path = cache.get_file(file_name)
+        if path:
+            with open(path, "r") as cached_configs:
+                timings = json.load(cached_configs)["configs_timings"]
+                timings = {Config(**config): timing for config, timing in timings}
+                self.cache[tuning_key] = builtins.min(timings, key=timings.get)
+                self.configs_timings = timings
+            return True
+
+        bench_fn()
+        cache.put(
+            json.dumps({
+                "key":
+                tuning_key,
+                "configs_timings":
+                [(config.__dict__, timings) for config, timings in self.configs_timings.items() if not config.pre_hook],
+            }), file_name, binary=False)
+        return False
+
+    def run(self, *args, **kwargs):  # pick (tuning first if needed) the best config, then launch fn with it
+        self.nargs = dict(zip(self.arg_names, args))
+        used_cached_result = True  # ends up False only when this call actually ran the benchmarks
+        if len(self.configs) > 1:
+            all_args = {**self.nargs, **kwargs}
+            _args = {k: v for (k, v) in all_args.items() if k in self.arg_names}
+            key = [_args[key] for key in self.keys if key in _args]
+            for _, arg in _args.items():
+                if hasattr(arg, "dtype"):
+                    key.append(str(arg.dtype))
+            key = tuple(key)
+            if key not in self.cache:
+                used_cached_result = False
+                pruned_configs = self.prune_configs(kwargs)
+
+                def benchmark():
+                    bench_start = time.time()
+                    timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
+                    bench_end = time.time()
+                    self.bench_time = bench_end - bench_start
+                    self.cache[key] = builtins.min(timings, key=timings.get)
+                    full_nargs = {**self.nargs, **kwargs, **self.cache[key].all_kwargs()}
+                    self.pre_hook(full_nargs, reset_only=True)
+                    self.configs_timings = timings
+
+                if self.cache_results:
+                    used_cached_result = self.check_disk_cache(key, pruned_configs, benchmark)
+                else:
+                    benchmark()
+
+            config = self.cache[key]
+        else:
+            config = self.configs[0]
+        self.best_config = config
+        if knobs.autotuning.print and not used_cached_result:
+            print(f"Triton autotuning for function {self.base_fn.__name__},\nwith key as {key},\n"
+                  f"finished after {self.bench_time:.2f}s,\nbest config selected: {self.best_config};")
+        if config.pre_hook is not None:
+            full_nargs = {**self.nargs, **kwargs, **config.all_kwargs()}
+            config.pre_hook(full_nargs)
+        ret = self.fn.run(
+            *args,
+            **kwargs,
+            **config.all_kwargs(),
+        )
+        self.nargs = None
+        return ret
+
+    def prune_configs(self, kwargs: Dict) -> List[Config]:  # early_config_prune first, then perf_model top_k
+        pruned_configs = self.configs
+        if self.early_config_prune:
+            pruned_configs = self.early_config_prune(self.configs, self.nargs, **kwargs)
+            if not pruned_configs:
+                raise AutotunerError(
+                    "No valid autotuner configs after pruning. `early_config_prune` should return at least one config.")
+        if self.perf_model:
+            top_k = self.configs_top_k
+            if isinstance(top_k, float) and top_k <= 1.0:
+                top_k = int(len(self.configs) * top_k)  # fraction of the full config list, not of pruned_configs
+            elif not isinstance(top_k, int):
+                # Slice index must be an integer
+                raise TypeError("Error while pruning configs, top_k must be either 1) a float <= 1.0 or 2) an int")
+
+            if len(pruned_configs) > top_k:
+                est_timing = {
+                    config: self.perf_model(
+                        **self.nargs,
+                        **kwargs,
+                        **config.all_kwargs(),
+                    )
+                    for config in pruned_configs
+                }
+                pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[:top_k]
+        return pruned_configs
+
+    def warmup(self, *args, **kwargs):  # call fn.warmup once per config that survives pruning
+        self.nargs = dict(zip(self.arg_names, args))
+        ret = []
+        for autotune_config in self.prune_configs(kwargs):
+            ret.append(self.fn.warmup(
+                *args,
+                **kwargs,
+                **autotune_config.all_kwargs(),
+            ))
+        self.nargs = None
+        return ret
+
+
+class Config:
+    """
+    An object that represents a possible kernel configuration for the auto-tuner to try.
+
+    :ivar kwargs: a dictionary of meta-parameters to pass to the kernel as keyword arguments.
+    :type kwargs: dict[str, Any]
+    :ivar num_warps: the number of warps to use for the kernel when compiled for GPUs. For example, if
+                      `num_warps=8`, then each kernel instance will be automatically parallelized to
+                      cooperatively execute using `8 * 32 = 256` threads.
+    :type num_warps: int
+    :ivar num_stages: the number of stages that the compiler should use when software-pipelining loops.
+                       Mostly useful for matrix multiplication workloads on SM80+ GPUs.
+    :type num_stages: int
+    :ivar num_ctas: number of blocks in a block cluster. SM90+ only.
+    :type num_ctas: int
+    :type maxnreg: Optional[int]
+    :ivar maxnreg: maximum number of registers one thread can use.  Corresponds
+                       to ptx .maxnreg directive.  Not supported on all platforms.
+    :ivar pre_hook: a function that will be called before the kernel is called. Parameters of this
+                    function are args.
+    :ivar ir_override: filename of a user-defined IR (*.{ttgir|llir|ptx|amdgcn}).
+    """
+
+    def __init__(self, kwargs, num_warps=4, num_stages=3, num_ctas=1, maxnreg=None, pre_hook=None, ir_override=None):
+        self.kwargs = kwargs
+        self.num_warps = num_warps
+        self.num_ctas = num_ctas
+        self.num_stages = num_stages
+        self.maxnreg = maxnreg
+        self.pre_hook = pre_hook
+        self.ir_override = ir_override
+
+    def __setstate__(self, state):
+        """Restore the attributes from the ``state`` dict, falling back to defaults for missing keys."""
+        self.kwargs = state.get("kwargs", {})
+        self.num_warps = state.get("num_warps", 4)
+        self.num_stages = state.get("num_stages", 3)
+        self.num_ctas = state.get("num_ctas", 1)
+        self.maxnreg = state.get("maxnreg", None)
+        self.pre_hook = state.get("pre_hook", None)
+        self.ir_override = state.get("ir_override", None)
+
+    def all_kwargs(self):
+        """Return ``kwargs`` merged with every launch option whose value is not ``None``."""
+        return {
+            **self.kwargs, **{
+                k: v
+                for (k, v) in (
+                    ("num_warps", self.num_warps),
+                    ("num_ctas", self.num_ctas),
+                    ("num_stages", self.num_stages),
+                    ("maxnreg", self.maxnreg),
+                    ("ir_override", self.ir_override),
+                ) if v is not None
+            }
+        }
+
+    def __str__(self):
+        res = []
+        for k, v in self.kwargs.items():
+            res.append(f"{k}: {v}")
+        res.append(f"num_warps: {self.num_warps}")
+        res.append(f"num_ctas: {self.num_ctas}")
+        res.append(f"num_stages: {self.num_stages}")
+        res.append(f"maxnreg: {self.maxnreg}")
+        return ", ".join(res)
+
+    def __hash__(self):
+        # Built from the same tuple that ``__eq__`` compares, so equal configs hash equally.
+        return hash((*self.all_kwargs().items(), self.pre_hook))
+
+    def __eq__(self, other):
+        # A non-Config operand is "not comparable": defer to Python instead of raising AttributeError.
+        if not isinstance(other, Config):
+            return NotImplemented
+        self_tuple = (*self.all_kwargs().items(), self.pre_hook)
+        other_tuple = (*other.all_kwargs().items(), other.pre_hook)
+        return self_tuple == other_tuple
+
+
+def autotune(configs, key, prune_configs_by=None, reset_to_zero=None, restore_value=None, pre_hook=None, post_hook=None,
+             warmup=None, rep=None, use_cuda_graph=False, do_bench=None, cache_results=False):
+    """
+    Decorator that wraps a :code:`triton.jit`'d function in an auto-tuner.
+
+    .. highlight:: python
+    .. code-block:: python
+
+        @triton.autotune(configs=[
+            triton.Config(kwargs={'BLOCK_SIZE': 128}, num_warps=4),
+            triton.Config(kwargs={'BLOCK_SIZE': 1024}, num_warps=8),
+          ],
+          key=['x_size'] # the two above configs will be evaluated anytime
+                         # the value of x_size changes
+        )
+        @triton.jit
+        def kernel(x_ptr, x_size, BLOCK_SIZE: tl.constexpr):
+            ...
+    :note: Evaluating all the configurations runs the kernel multiple times, so
+           whatever value the kernel updates will be updated multiple times.
+           To avoid this undesired behavior, you can use the `reset_to_zero` argument, which
+           resets the value of the provided tensor to `zero` before running any configuration.
+
+    If the environment variable :code:`TRITON_PRINT_AUTOTUNING` is set to
+    :code:`"1"`, Triton will print a message to stdout after autotuning each
+    kernel, including the time spent autotuning and the best configuration.
+
+    :param configs: the candidate :code:`triton.Config` objects
+    :type configs: list[triton.Config]
+    :param key: a list of argument names whose change in value will trigger the evaluation of all provided configs.
+    :type key: list[str]
+    :param prune_configs_by: a dict of functions that are used to prune configs, fields:
+        'perf_model': performance model used to predict running time with different configs, returns running time
+        'top_k': number of configs to bench
+        'early_config_prune': a function used to prune configs. It should have the signature
+                `prune_configs_by( configs: List[triton.Config], named_args: Dict[str, Any], **kwargs: Dict[str, Any]) -> List[triton.Config]:`
+                and return pruned configs. It should return at least one config.
+    :param reset_to_zero: a list of argument names whose value will be reset to zero before evaluating any configs.
+    :type reset_to_zero: list[str]
+    :param restore_value: a list of argument names whose value will be restored after evaluating any configs.
+    :type restore_value: list[str]
+    :param pre_hook: a function that will be called before the kernel is called.
+        This overrides the default pre_hook used for 'reset_to_zero' and 'restore_value'.
+        'kwargs': a dict of all arguments passed to the kernel.
+        'reset_only': a boolean indicating whether the pre_hook is called to reset the values only, without a corresponding post_hook.
+    :type pre_hook: lambda args, reset_only
+    :param post_hook: a function that will be called after the kernel is called.
+        This overrides the default post_hook used for 'restore_value'.
+        'kwargs': a dict of all arguments passed to the kernel.
+        'exception': the exception raised by the kernel in case of a compilation or runtime error.
+    :type post_hook: lambda args, exception
+    :param warmup: warmup time (in ms) to pass to benchmarking (deprecated).
+    :type warmup: int
+    :param rep: repetition time (in ms) to pass to benchmarking (deprecated).
+    :type rep: int
+    :param do_bench: a benchmark function to measure the time of each run.
+    :type do_bench: lambda fn, quantiles
+    :param cache_results: whether to cache autotune timings to disk.  Defaults to False.
+    :type cache_results: bool
+    """
+    tuner_options = dict(pre_hook=pre_hook, post_hook=post_hook, prune_configs_by=prune_configs_by, warmup=warmup,
+                         rep=rep, use_cuda_graph=use_cuda_graph, do_bench=do_bench, cache_results=cache_results)
+
+    def decorator(fn):
+        return Autotuner(fn, fn.arg_names, configs, key, reset_to_zero, restore_value, **tuner_options)
+
+    return decorator
+
+
+class Heuristics(KernelInterface):
+    """Kernel wrapper that computes selected meta-parameters from the call arguments before running ``fn``."""
+    def __init__(self, fn, arg_names, values) -> None:
+        self.fn, self.values, self.arg_names = fn, values, arg_names
+
+    def run(self, *args, **kwargs):
+        positional = dict(zip(self.arg_names, args))
+        # Each function sees the named positional args plus all kwargs, including earlier computed values.
+        for name, compute in self.values.items():
+            kwargs[name] = compute({**positional, **kwargs})
+        return self.fn.run(*args, **kwargs)
+
+
+def heuristics(values):
+    """
+    Decorator that computes the values of certain meta-parameters from the kernel's call arguments.
+    This is useful for cases where auto-tuning is prohibitively expensive, or just not applicable.
+
+    .. highlight:: python
+    .. code-block:: python
+
+        # smallest power-of-two >= x_size
+        @triton.heuristics(values={'BLOCK_SIZE': lambda args: triton.next_power_of_2(args['x_size'])})
+        @triton.jit
+        def kernel(x_ptr, x_size, BLOCK_SIZE: tl.constexpr):
+            ...
+    :param values: a dictionary of meta-parameter names and functions that compute the value of the meta-parameter.
+                   each such function receives a dict mapping argument names to the values passed to the kernel.
+    :type values: dict[str, Callable[[dict[str, Any]], Any]]
+    """
+
+    def decorator(fn):
+        return Heuristics(fn=fn, arg_names=fn.arg_names, values=values)
+
+    return decorator
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/build.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/build.py
new file mode 100644
index 0000000000000000000000000000000000000000..786f51e54db77a5e0c04a42a72ce01060f436631
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/build.py
@@ -0,0 +1,97 @@
+from __future__ import annotations
+
+import functools
+import hashlib
+import importlib.util
+import logging
+import os
+import shutil
+import subprocess
+import sysconfig
+import tempfile
+import re
+
+from types import ModuleType
+
+from .cache import get_cache_manager
+from .. import knobs
+
+
+def _build(name: str, src: str, srcdir: str, library_dirs: list[str], include_dirs: list[str], libraries: list[str],
+           ccflags: list[str]) -> str:
+    if impl := knobs.build.impl:
+        return impl(name, src, srcdir, library_dirs, include_dirs, libraries)
+    suffix = sysconfig.get_config_var('EXT_SUFFIX')
+    so = os.path.join(srcdir, '{name}{suffix}'.format(name=name, suffix=suffix))
+    cc = os.environ.get("CC")
+    if cc is None:
+        clang = shutil.which("clang")
+        gcc = shutil.which("gcc")
+        cc = gcc if gcc is not None else clang
+        if cc is None:
+            raise RuntimeError(
+                "Failed to find C compiler. Please specify via CC environment variable or set triton.knobs.build.impl.")
+    scheme = sysconfig.get_default_scheme()
+    # 'posix_local' is a custom scheme on Debian. However, starting Python 3.10, the default install
+    # path changes to include 'local'. This change is required to use triton with system-wide python.
+    if scheme == 'posix_local':
+        scheme = 'posix_prefix'
+    py_include_dir = sysconfig.get_paths(scheme=scheme)["include"]
+    custom_backend_dirs = knobs.build.backend_dirs
+    include_dirs = include_dirs + [srcdir, py_include_dir, *custom_backend_dirs]
+    # for -Wno-psabi, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111047
+    cc_cmd = [cc, src, "-O3", "-shared", "-fPIC", "-Wno-psabi", "-o", so]
+    cc_cmd += [_library_flag(lib) for lib in libraries]
+    cc_cmd += [f"-L{dir}" for dir in library_dirs]
+    cc_cmd += [f"-I{dir}" for dir in include_dirs if dir is not None]
+    cc_cmd.extend(ccflags)
+    subprocess.check_call(cc_cmd, stdout=subprocess.DEVNULL)
+    return so
+
+
+def _library_flag(lib: str) -> str:
+    # Match .so files with optional version numbers (e.g., .so, .so.1, .so.513.50.1)
+    if re.search(r'\.so(\.\d+)*$', lib) or lib.endswith(".a"):
+        return f"-l:{lib}"
+    return f"-l{lib}"
+
+
+@functools.lru_cache
+def platform_key() -> str:
+    from platform import machine, system, architecture
+    return ",".join([machine(), system(), *architecture()])
+
+
+def _load_module_from_path(name: str, path: str) -> ModuleType:
+    spec = importlib.util.spec_from_file_location(name, path)
+    if not spec or not spec.loader:
+        raise RuntimeError(f"Failed to load newly compiled {name} from {path}")
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+    return mod
+
+
+def compile_module_from_src(src: str, name: str, library_dirs: list[str] | None = None,
+                            include_dirs: list[str] | None = None, libraries: list[str] | None = None,
+                            ccflags: list[str] | None = None) -> ModuleType:
+    key = hashlib.sha256((src + platform_key()).encode("utf-8")).hexdigest()
+    cache = get_cache_manager(key)
+    suffix = sysconfig.get_config_var("EXT_SUFFIX")
+    cache_path = cache.get_file(f"{name}{suffix}")
+
+    if cache_path is not None:
+        try:
+            return _load_module_from_path(name, cache_path)
+        except (RuntimeError, ImportError):
+            log = logging.getLogger(__name__)
+            log.warning(f"Triton cache error: compiled module {name}.so could not be loaded")
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        src_path = os.path.join(tmpdir, name + ".c")
+        with open(src_path, "w") as f:
+            f.write(src)
+        so = _build(name, src_path, tmpdir, library_dirs or [], include_dirs or [], libraries or [], ccflags or [])
+        with open(so, "rb") as f:
+            cache_path = cache.put(f.read(), f"{name}{suffix}", binary=True)
+
+    return _load_module_from_path(name, cache_path)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/cache.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..0442f00e68f5f907a4da56ebd7689e8c469219ef
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/cache.py
@@ -0,0 +1,309 @@
+import json
+import os
+import uuid
+from abc import ABC, abstractmethod
+from typing import Dict, List, Optional
+import base64
+import hashlib
+import functools
+import sysconfig
+
+from triton import __version__, knobs
+
+
+class CacheManager(ABC):
+
+    def __init__(self, key, override=False, dump=False):
+        pass
+
+    @abstractmethod
+    def get_file(self, filename) -> Optional[str]:
+        pass
+
+    @abstractmethod
+    def put(self, data, filename, binary=True) -> str:
+        pass
+
+    @abstractmethod
+    def get_group(self, filename: str) -> Optional[Dict[str, str]]:
+        pass
+
+    @abstractmethod
+    def put_group(self, filename: str, group: Dict[str, str]):
+        pass
+
+
+class FileCacheManager(CacheManager):
+
+    def __init__(self, key, override=False, dump=False):
+        self.key = key
+        self.lock_path = None
+        if dump:
+            self.cache_dir = knobs.cache.dump_dir
+            self.cache_dir = os.path.join(self.cache_dir, self.key)
+            self.lock_path = os.path.join(self.cache_dir, "lock")
+            os.makedirs(self.cache_dir, exist_ok=True)
+        elif override:
+            self.cache_dir = knobs.cache.override_dir
+            self.cache_dir = os.path.join(self.cache_dir, self.key)
+        else:
+            # create cache directory if it doesn't exist
+            self.cache_dir = knobs.cache.dir
+            if self.cache_dir:
+                self.cache_dir = os.path.join(self.cache_dir, self.key)
+                self.lock_path = os.path.join(self.cache_dir, "lock")
+                os.makedirs(self.cache_dir, exist_ok=True)
+            else:
+                raise RuntimeError("Could not create or locate cache dir")
+
+    def _make_path(self, filename) -> str:
+        return os.path.join(self.cache_dir, filename)
+
+    def has_file(self, filename) -> bool:
+        if not self.cache_dir:
+            raise RuntimeError("Could not create or locate cache dir")
+        return os.path.exists(self._make_path(filename))
+
+    def get_file(self, filename) -> Optional[str]:
+        if self.has_file(filename):
+            return self._make_path(filename)
+        else:
+            return None
+
+    def get_group(self, filename: str) -> Optional[Dict[str, str]]:
+        grp_filename = f"__grp__{filename}"
+        if not self.has_file(grp_filename):
+            return None
+        grp_filepath = self._make_path(grp_filename)
+        with open(grp_filepath) as f:
+            grp_data = json.load(f)
+        child_paths = grp_data.get("child_paths", None)
+        # Invalid group data.
+        if child_paths is None:
+            return None
+        result = {}
+        for c, p in child_paths.items():
+            if os.path.exists(p):
+                result[c] = p
+        return result
+
+    # Record a set of pushed files as belonging to one named group
+    def put_group(self, filename: str, group: Dict[str, str]) -> str:
+        if not self.cache_dir:
+            raise RuntimeError("Could not create or locate cache dir")
+        grp_contents = json.dumps({"child_paths": group})
+        grp_filename = f"__grp__{filename}"
+        return self.put(grp_contents, grp_filename, binary=False)
+
+    def put(self, data, filename, binary=True) -> str:
+        if not self.cache_dir:
+            raise RuntimeError("Could not create or locate cache dir")
+        binary = isinstance(data, bytes)
+        if not binary:
+            data = str(data)
+        assert self.lock_path is not None
+        filepath = self._make_path(filename)
+        # Random ID to avoid any collisions
+        rnd_id = str(uuid.uuid4())
+        # we include the PID in case a bunch of these are around, so we can see which process made it
+        pid = os.getpid()
+        # use temp dir to be robust against program interruptions
+        temp_dir = os.path.join(self.cache_dir, f"tmp.pid_{pid}_{rnd_id}")
+        os.makedirs(temp_dir, exist_ok=True)
+        temp_path = os.path.join(temp_dir, filename)
+
+        mode = "wb" if binary else "w"
+        with open(temp_path, mode) as f:
+            f.write(data)
+        # Replace is guaranteed to be atomic on POSIX systems if it succeeds
+        # so filepath cannot see a partial write
+        os.replace(temp_path, filepath)
+        os.removedirs(temp_dir)
+        return filepath
+
+
+class RemoteCacheBackend:
+    """
+    A backend implementation for accessing a remote/distributed cache.
+    """
+
+    def __init__(self, key: str):
+        pass
+
+    @abstractmethod
+    def get(self, filenames: List[str]) -> Dict[str, bytes]:
+        pass
+
+    @abstractmethod
+    def put(self, filename: str, data: bytes):
+        pass
+
+
+class RedisRemoteCacheBackend(RemoteCacheBackend):
+
+    def __init__(self, key):
+        import redis
+        self._key = key
+        self._key_fmt = knobs.cache.redis.key_format
+        self._redis = redis.Redis(
+            host=knobs.cache.redis.host,
+            port=knobs.cache.redis.port,
+        )
+
+    def _get_key(self, filename: str) -> str:
+        return self._key_fmt.format(key=self._key, filename=filename)
+
+    def get(self, filenames: List[str]) -> Dict[str, str]:
+        results = self._redis.mget([self._get_key(f) for f in filenames])
+        return {filename: result for filename, result in zip(filenames, results) if result is not None}
+
+    def put(self, filename: str, data: bytes) -> Dict[str, bytes]:
+        self._redis.set(self._get_key(filename), data)
+
+
+class RemoteCacheManager(CacheManager):
+
+    def __init__(self, key, override=False, dump=False):
+        # Set up the backend pointed to by `TRITON_REMOTE_CACHE_BACKEND`.
+        remote_cache_cls = knobs.cache.remote_manager_class
+        if not remote_cache_cls:
+            raise RuntimeError(
+                "Unable to instantiate RemoteCacheManager, TRITON_REMOTE_CACHE_BACKEND doesn't point to a valid class")
+        self._backend = remote_cache_cls(key)
+
+        self._override = override
+        self._dump = dump
+
+        # Use a `FileCacheManager` to materialize remote cache paths locally.
+        self._file_cache_manager = FileCacheManager(key, override=override, dump=dump)
+
+    def _materialize(self, filename: str, data: bytes):
+        # We use a backing `FileCacheManager` to provide the materialized data.
+        return self._file_cache_manager.put(data, filename, binary=True)
+
+    def get_file(self, filename: str) -> Optional[str]:
+        # We don't handle the dump/override cases.
+        if self._dump or self._override:
+            return self._file_cache_manager.get_file(filename)
+
+        # We always check the remote cache backend -- even if our internal file-
+        # based cache has the item -- to make sure LRU accounting works as
+        # expected.
+        results = self._backend.get([filename])
+        if len(results) == 0:
+            return None
+        (_, data), = results.items()
+        return self._materialize(filename, data)
+
+    def put(self, data, filename: str, binary=True) -> str:
+        # We don't handle the dump/override cases.
+        if self._dump or self._override:
+            return self._file_cache_manager.put(data, filename, binary=binary)
+
+        if not isinstance(data, bytes):
+            data = str(data).encode("utf-8")
+        self._backend.put(filename, data)
+        return self._materialize(filename, data)
+
+    def get_group(self, filename: str) -> Optional[Dict[str, str]]:
+        # We don't handle the dump/override cases.
+        if self._dump or self._override:
+            return self._file_cache_manager.get_group(filename)
+
+        grp_filename = f"__grp__{filename}"
+        grp_filepath = self.get_file(grp_filename)
+        if grp_filepath is None:
+            return None
+        with open(grp_filepath) as f:
+            grp_data = json.load(f)
+        child_paths = grp_data.get("child_paths", None)
+
+        result = None
+
+        # Found group data.
+        if child_paths is not None:
+            result = {}
+            for child_path, data in self._backend.get(child_paths).items():
+                result[child_path] = self._materialize(child_path, data)
+
+        return result
+
+    def put_group(self, filename: str, group: Dict[str, str]):
+        # We don't handle the dump/override cases.
+        if self._dump or self._override:
+            return self._file_cache_manager.put_group(filename, group)
+
+        grp_contents = json.dumps({"child_paths": sorted(list(group.keys()))})
+        grp_filename = f"__grp__{filename}"
+        return self.put(grp_contents, grp_filename)
+
+
+def _base32(key):
+    # Assume key is a hex string.
+    return base64.b32encode(bytes.fromhex(key)).decode("utf-8").rstrip("=")
+
+
+def get_cache_manager(key) -> CacheManager:
+    cls = knobs.cache.manager_class or FileCacheManager
+    return cls(_base32(key))
+
+
+def get_override_manager(key) -> CacheManager:
+    cls = knobs.cache.manager_class or FileCacheManager
+    return cls(_base32(key), override=True)
+
+
+def get_dump_manager(key) -> CacheManager:
+    cls = knobs.cache.manager_class or FileCacheManager
+    return cls(_base32(key), dump=True)
+
+
+def make_so_cache_key(version_hash, signature, constants, ids, **kwargs):
+    # Get unique key for the compiled code
+    signature = {k: 'ptr' if v[0] == '*' else v for k, v in signature.items()}
+    key = f"{version_hash}-{''.join(signature.values())}-{constants}-{ids}"
+    for kw in kwargs:
+        key = f"{key}-{kwargs.get(kw)}"
+    key = hashlib.sha256(key.encode("utf-8")).hexdigest()
+    return _base32(key)
+
+
+@functools.lru_cache()
+def triton_key():
+    import pkgutil
+    TRITON_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    contents = []
+    # frontend
+    with open(__file__, "rb") as f:
+        contents += [hashlib.sha256(f.read()).hexdigest()]
+    # compiler
+    path_prefixes = [
+        (os.path.join(TRITON_PATH, "compiler"), "triton.compiler."),
+        (os.path.join(TRITON_PATH, "backends"), "triton.backends."),
+    ]
+    for path, prefix in path_prefixes:
+        for lib in pkgutil.walk_packages([path], prefix=prefix):
+            with open(lib.module_finder.find_spec(lib.name).origin, "rb") as f:
+                contents += [hashlib.sha256(f.read()).hexdigest()]
+
+    # backend
+    libtriton_hash = hashlib.sha256()
+    ext = sysconfig.get_config_var("EXT_SUFFIX").split(".")[-1]
+    with open(os.path.join(TRITON_PATH, "_C", f"libtriton.{ext}"), "rb") as f:
+        while True:
+            chunk = f.read(1024**2)
+            if not chunk:
+                break
+            libtriton_hash.update(chunk)
+    contents.append(libtriton_hash.hexdigest())
+    # language
+    language_path = os.path.join(TRITON_PATH, 'language')
+    for lib in pkgutil.walk_packages([language_path], prefix="triton.language."):
+        with open(lib.module_finder.find_spec(lib.name).origin, "rb") as f:
+            contents += [hashlib.sha256(f.read()).hexdigest()]
+    return f'{__version__}' + '-'.join(contents)
+
+
+def get_cache_key(src, backend, backend_options, env_vars):
+    key = f"{triton_key()}-{src.hash()}-{backend.hash()}-{backend_options.hash()}-{str(sorted(env_vars.items()))}"
+    return key
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/driver.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/driver.py
new file mode 100644
index 0000000000000000000000000000000000000000..0092156792991984045e6158678ec85f74b1776a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/driver.py
@@ -0,0 +1,38 @@
+from __future__ import annotations
+
+from ..backends import backends, DriverBase
+
+
+def _create_driver() -> DriverBase:
+    active_drivers = [x.driver for x in backends.values() if x.driver.is_active()]
+    if len(active_drivers) != 1:
+        raise RuntimeError(f"{len(active_drivers)} active drivers ({active_drivers}). There should only be one.")
+    return active_drivers[0]()
+
+
+class DriverConfig:
+
+    def __init__(self) -> None:
+        self._default: DriverBase | None = None
+        self._active: DriverBase | None = None
+
+    @property
+    def default(self) -> DriverBase:
+        if self._default is None:
+            self._default = _create_driver()
+        return self._default
+
+    @property
+    def active(self) -> DriverBase:
+        if self._active is None:
+            self._active = self.default
+        return self._active
+
+    def set_active(self, driver: DriverBase) -> None:
+        self._active = driver
+
+    def reset_active(self) -> None:
+        self._active = self.default
+
+
+driver = DriverConfig()
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/errors.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/errors.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9a1b60bd6d9debf4d3df57235b87b08bd9330f7
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/errors.py
@@ -0,0 +1,46 @@
+from ..errors import TritonError
+from typing import Optional
+
+
+class InterpreterError(TritonError):
+
+    def __init__(self, error_message: Optional[str] = None):
+        self.error_message = error_message
+
+    def __str__(self) -> str:
+        return self.error_message or ""
+
+
+class OutOfResources(TritonError):
+
+    def __init__(self, required, limit, name):
+        self.required = required
+        self.limit = limit
+        self.name = name
+
+    def __str__(self) -> str:
+        return f"out of resource: {self.name}, Required: {self.required}, Hardware limit: {self.limit}. Reducing block sizes or `num_stages` may help."
+
+    def __reduce__(self):
+        # this is necessary to make OutOfResources (and subclasses) picklable
+        return (type(self), (self.required, self.limit, self.name))
+
+
+class PTXASError(TritonError):
+
+    def __init__(self, error_message: Optional[str] = None):
+        self.error_message = error_message
+
+    def __str__(self) -> str:
+        error_message = self.error_message or ""
+        return f"PTXAS error: {error_message}"
+
+
+class AutotunerError(TritonError):
+
+    def __init__(self, error_message: Optional[str] = None):
+        self.error_message = error_message
+
+    def __str__(self) -> str:
+        error_message = self.error_message or ""
+        return f"Autotuner error: {error_message}"
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/interpreter.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/interpreter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd871cb2e1f1205bf9368235cd6193435aaff12d
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/interpreter.py
@@ -0,0 +1,1492 @@
+from __future__ import annotations
+import ast
+import textwrap
+import inspect
+from typing import Tuple, List, Dict, Callable, TypeVar
+
+import math
+import numpy as np
+
+import triton
+import triton.language as tl
+import dataclasses
+from dataclasses import dataclass
+
+from triton.language.semantic import TritonSemantic
+from triton.runtime.jit import KernelInterface
+from triton.tools.tensor_descriptor import TensorDescriptor
+from .errors import InterpreterError
+from functools import partial
+from .._C.libtriton import interpreter as _interpreter
+from .._C.libtriton import ir as _ir
+
+T = TypeVar("T")
+
+
+@dataclass
+class TensorHandle:
+    '''
+        data: numpy array
+        dtype: triton type, either pointer_type or scalar_type.
+        we don't store block_type here because the shape information is already available in the data field
+        attr: a dictionary of attributes
+    '''
+    data: np.array
+    dtype: tl.dtype
+    attr: Dict = dataclasses.field(default_factory=dict)
+
+    def __post_init__(self):
+        if not _validate_np_data_size(self.data, self.dtype):
+            raise ValueError(f"numpy data itemsize ({self.data.itemsize * 8} bits) exceeds dtype primitive_bitwidth "
+                             f"({self.dtype.primitive_bitwidth} bits) for triton type {self.dtype}")
+
+    def __bool__(self):
+        return bool(self.data.all())
+
+    def get_element_ty(self):
+        dtype = self.dtype
+        while hasattr(dtype, "element_ty"):
+            dtype = dtype.element_ty
+        return dtype
+
+    def clone(self):
+        return TensorHandle(self.data.copy(), self.dtype)
+
+    def set_attr(self, key, value):
+        self.attr[key] = value
+
+
+class BlockPointerHandle:
+
+    def __init__(self, base, shape, strides, offsets, block_shape, order):
+        self.base = base
+        self.shape = shape
+        self.strides = strides
+        self.offsets = offsets
+        self.block_shape = block_shape
+        self.order = order
+
+    def materialize_pointers(self, boundary_check):
+        dtype_tt = self.base.get_element_ty()
+        n_bytes = dtype_tt.primitive_bitwidth // 8
+        ptrs = np.broadcast_to(self.base.data, self.block_shape)
+        masks = np.ones(self.block_shape, dtype=bool)
+        for dim in range(len(self.block_shape)):
+            bcast_dims = [1] * len(self.block_shape)
+            bcast_dims[dim] = self.block_shape[dim]
+            off = (self.offsets[dim].data + np.arange(self.block_shape[dim])).reshape(bcast_dims)
+            ptrs = ptrs + (n_bytes * off * self.strides[dim].data).astype(np.uint64)
+            if dim in boundary_check:
+                masks = masks & (off < self.shape[dim].data) & (off >= 0)
+        ptrs = TensorHandle(ptrs, self.base.dtype.scalar)
+        return ptrs, masks
+
+
+class TensorDescHandle:
+
+    def __init__(self, base: TensorHandle, shape: List[TensorHandle], strides: List[TensorHandle],
+                 block_shape: List[int], padding):
+        self.base = base
+        self.ndim = len(shape)
+        self.shape = shape
+        self.strides = strides
+        self.block_shape = block_shape
+        self.padding = padding
+
+    def validate(self):
+        assert self.base.data.item() % 16 == 0, "base must be 16-byte aligned"
+        assert len(self.strides) == self.ndim
+        assert len(self.block_shape) == self.ndim
+        assert self.ndim >= 1, "descriptor cannot be 0 dimensional"
+
+        scalar_ty = self.base.dtype.element_ty
+        itemsize = scalar_ty.primitive_bitwidth // 8
+        for stride in self.strides[:-1]:
+            byte_stride = stride.data.item() * itemsize
+            assert byte_stride % 16 == 0, "stride must be 16-byte aligned"
+        assert self.strides[-1].data.item() == 1, "last dim must be contiguous"
+
+    def materialize_pointers(self, offsets: List[TensorHandle]):
+        assert len(offsets) == self.ndim
+        scalar_ty = self.base.dtype.element_ty
+        itemsize = scalar_ty.primitive_bitwidth // 8
+        assert (offsets[-1].data * itemsize) % 16 == 0, "block offset start must be 16-byte aligned"
+
+        ptrs = np.broadcast_to(self.base.data, self.block_shape)
+        masks = np.ones(self.block_shape, dtype=bool)
+        for dim in range(len(self.block_shape)):
+            bcast_dims = [1] * len(self.block_shape)
+            bcast_dims[dim] = self.block_shape[dim]
+            off = (offsets[dim].data + np.arange(self.block_shape[dim])).reshape(bcast_dims)
+            ptrs = ptrs + (itemsize * off * self.strides[dim].data).astype(np.uint64)
+            masks = masks & (0 <= off) & (off < self.shape[dim].data)
+        assert ptrs.dtype == np.uint64
+        ptrs = TensorHandle(ptrs, self.base.dtype.scalar)
+        return ptrs, masks
+
+
+@dataclass(frozen=True)
+class InterpreterOptions:
+    extern_libs: dict = None
+    debug: bool = False
+    sanitize_overflow: bool = True
+    arch: str = None
+    supported_fp8_dtypes: Tuple[str] = ("fp8e5", "fp8e5b16", "fp8e4nv", "fp8e4b8", "fp8e4b15")
+    deprecated_fp8_dot_operand_dtypes: Tuple[str] = ()
+    default_dot_input_precision: str = "tf32"
+    allowed_dot_input_precisions: Tuple[str] = ("tf32", "tf32x3", "ieee")
+    max_num_imprecise_acc_default: int = 0
+    backend_name: str = "interpreter"
+
+
+def _validate_np_data_size(np_array, tl_dtype):
+    if isinstance(tl_dtype, tl.pointer_type):
+        return True
+
+    np_dtype_bitwidth = np_array.itemsize * 8
+    tl_dtype_bitwidth = tl_dtype.primitive_bitwidth
+
+    # numpy's smallest itemsize is 8 bits, so treat sub-byte triton types as 8 bits wide
+    if tl_dtype_bitwidth < 8:
+        tl_dtype_bitwidth = 8
+
+    if np_dtype_bitwidth > tl_dtype_bitwidth:
+        return False
+    return True
+
+
+def _get_signed_np_dtype(dtype):
+    if dtype == np.uint8:
+        return np.int8
+    if dtype == np.uint16:
+        return np.int16
+    if dtype == np.uint32:
+        return np.int32
+    if dtype == np.uint64:
+        return np.int64
+    return dtype
+
+
+def _get_np_dtype(tt_dtype):
+    if isinstance(tt_dtype, tl.pointer_type):
+        return np.dtype(np.uint64)
+    np_types = {
+        tl.int1: np.dtype(bool),
+        tl.float16: np.dtype(np.float16),
+        tl.float32: np.dtype(np.float32),
+        tl.float64: np.dtype(np.float64),
+        tl.int8: np.dtype(np.int8),
+        tl.uint8: np.dtype(np.uint8),
+        tl.int16: np.dtype(np.int16),
+        tl.uint16: np.dtype(np.uint16),
+        tl.int32: np.dtype(np.int32),
+        tl.uint32: np.dtype(np.uint32),
+        tl.int64: np.dtype(np.int64),
+        tl.uint64: np.dtype(np.uint64),
+        # bfloat16 types are stored as uint16
+        tl.bfloat16: np.dtype(np.uint16),
+        # float8 types are stored as uint8
+        tl.float8e5: np.dtype(np.uint8),
+        tl.float8e5b16: np.dtype(np.uint8),
+        tl.float8e4nv: np.dtype(np.uint8),
+        tl.float8e4b8: np.dtype(np.uint8),
+        tl.float8e4b15: np.dtype(np.uint8),
+    }
+    if isinstance(tt_dtype, tl.block_type):
+        if isinstance(tt_dtype.element_ty, tl.pointer_type):
+            return np.dtype(np.uint64)
+        return np_types[tt_dtype.element_ty]
+    return np_types[tt_dtype]
+
+
+def _convert_float(input, input_dtype, output_dtype, rounding_mode):
+    input_uint_dtype = getattr(np, f"uint{input_dtype.primitive_bitwidth}")
+    output_unint_dtype = getattr(np, f"uint{output_dtype.primitive_bitwidth}")
+    input_bin = np.frombuffer(input.tobytes(), dtype=input_uint_dtype)
+    sign = (input_bin >> (input_dtype.primitive_bitwidth - 1)) & 0x01
+    input_exponent_width = input_dtype.primitive_bitwidth - input_dtype.fp_mantissa_width - 1
+    output_exponent_width = output_dtype.primitive_bitwidth - output_dtype.fp_mantissa_width - 1
+    significand = input_bin & ((1 << input_dtype.fp_mantissa_width) - 1)
+    bias_input = input_dtype.exponent_bias
+    bias_output = output_dtype.exponent_bias
+    exponent = ((input_bin >> input_dtype.fp_mantissa_width) & ((1 << input_exponent_width) - 1)).astype(np.int32)
+    subnormal_index = exponent == 0
+    if np.any(subnormal_index):
+        # Credit to Phil: phil@openai.com
+        # subnormal repr: ((-1.0)**sign) * (2.0**(1 - exp_bias)) * (2^(m0) + 2^(m1) + ... + 2^(mn))
+        # where m0, m1, ..., mn are the 1-bit of the mantissa
+        # convert it to normal repr: ((-1.0)**sign) * (2.0**(1 + m0 - exp_bias)) * (1 + 2^(m1 - m0) + ... + 2^(mn - m0))
+        bit_pos = np.zeros_like(input_bin, dtype=np.int32)
+        # Find the most significant bit of the mantissa in the significand
+        for i in range(input_dtype.fp_mantissa_width):
+            bit_index = ((significand >> i) & 0x01)
+            # pos should be >= 1
+            bit_pos[bit_index == 1] = input_dtype.fp_mantissa_width - i
+        zero_significand_index = significand == 0
+        exponent[subnormal_index] = 1 - bit_pos[subnormal_index]
+        # 0 significand and subnormal should be treated as 0
+        exponent[zero_significand_index & subnormal_index] = bias_input - bias_output
+        significand[subnormal_index] = (significand[subnormal_index] << bit_pos[subnormal_index]) & (
+            (1 << input_dtype.fp_mantissa_width) - 1)
+    # Prevent overflow and underflow
+    exponent_output = np.maximum(0, np.minimum((exponent - bias_input + bias_output), (1 << output_exponent_width) - 1))
+    exponent_output = exponent_output.astype(output_unint_dtype)
+    sign_output = sign.astype(output_unint_dtype)
+    if input_dtype.primitive_bitwidth > output_dtype.primitive_bitwidth:  # Downcast
+        significand_output = (significand >> (input_dtype.fp_mantissa_width - output_dtype.fp_mantissa_width)) & (
+            (1 << output_dtype.fp_mantissa_width) - 1)
+        if rounding_mode == _ir.ROUNDING_MODE.RTNE:  # Round to nearest even
+            # find the cut-off bit
+            cut_off = significand & (1 << (input_dtype.fp_mantissa_width - output_dtype.fp_mantissa_width - 1))
+            significand_output = significand_output + (cut_off > 0)
+        significand_output = significand_output.astype(output_unint_dtype)
+    else:  # Upcast
+        significand_output = (significand.astype(output_unint_dtype) <<
+                              (output_dtype.fp_mantissa_width - input_dtype.fp_mantissa_width)) & (
+                                  (1 << output_dtype.fp_mantissa_width) - 1)
+    subnormal_index = exponent_output == 0
+    if np.any(subnormal_index):  # underflow
+        # normal repr: ((-1.0)**sign) * (2.0**(exp - exp_bias_input)) * (1 + 2^(m0) + 2^(m1) + ... + 2^(mn))
+        # where m0, m1, ..., mn are the 1-bit of the mantissa
+        # shift = (1 - exp_bias_output) - (exp - exp_bias_input)
+        # convert it to subnormal repr: ((-1.0)**sign) * (2.0**(1 - exp_bias_output)) * (2^(-shift) + 2^(m0 - shift) + 2^(m1 - shift) + ... + 2^(mn - shift))
+        exponent = ((input_bin >> input_dtype.fp_mantissa_width) & ((1 << input_exponent_width) - 1)).astype(np.int32)
+        non_zero_exponent_index = exponent != 0
+        # If the original exponent is not zero, we still need to shift the significand and consider the 1.0 part in mantissa
+        subnormal_index = subnormal_index & non_zero_exponent_index
+        shift = np.zeros_like(input_bin, dtype=np.int32)
+        shift[subnormal_index] = (1 - bias_output) - (exponent[subnormal_index] - bias_input)
+        significand_output[subnormal_index] = (significand_output[subnormal_index] >> shift[subnormal_index]) | (
+            1 << (output_dtype.fp_mantissa_width - shift[subnormal_index]))
+    output = (sign_output << (output_dtype.primitive_bitwidth - 1)) | (
+        exponent_output << output_dtype.fp_mantissa_width) | significand_output
+    return output.reshape(input.shape)
+
+
+def _erf(x):
+    # Numpy does not support erf
+    return math.erf(x)
+
+
+def _umulhi_64(a, b):
+    # Numpy does not support 128-bit multiplication
+    # So we have to implement it manually
+    return (int(a) * int(b)) >> 64
+
+
+np_erf_fp32 = np.vectorize(_erf, otypes=[np.float32])
+np_erf_fp64 = np.vectorize(_erf, otypes=[np.float64])
+np_umulhi_u64 = np.vectorize(_umulhi_64, otypes=[np.uint64])
+
+
+class ExtraFunctions:
+
+    @staticmethod
+    def _convert_custom_types(input, dst_ty, fp_downcast_rounding, _semantic):
+        return tl.tensor(_semantic.builder.create_fp_to_fp(input.handle, dst_ty, fp_downcast_rounding), dst_ty)
+
+
+class InterpreterBuilder:
+    ir_sem_to_interpreter_sem = {
+        _ir.MEM_SEMANTIC.ACQUIRE: _interpreter.MEM_SEMANTIC.ACQUIRE,
+        _ir.MEM_SEMANTIC.RELEASE: _interpreter.MEM_SEMANTIC.RELEASE,
+        _ir.MEM_SEMANTIC.RELAXED: _interpreter.MEM_SEMANTIC.RELAXED,
+        _ir.MEM_SEMANTIC.ACQUIRE_RELEASE: _interpreter.MEM_SEMANTIC.ACQUIRE_RELEASE,
+    }
+
+    ir_rmw_op_to_interpreter_rmw_op = {
+        _ir.ATOMIC_OP.ADD: _interpreter.RMW_OP.ADD,
+        _ir.ATOMIC_OP.FADD: _interpreter.RMW_OP.FADD,
+        _ir.ATOMIC_OP.MIN: _interpreter.RMW_OP.MIN,
+        _ir.ATOMIC_OP.UMIN: _interpreter.RMW_OP.UMIN,
+        _ir.ATOMIC_OP.MAX: _interpreter.RMW_OP.MAX,
+        _ir.ATOMIC_OP.UMAX: _interpreter.RMW_OP.UMAX,
+        _ir.ATOMIC_OP.AND: _interpreter.RMW_OP.AND,
+        _ir.ATOMIC_OP.OR: _interpreter.RMW_OP.OR,
+        _ir.ATOMIC_OP.XOR: _interpreter.RMW_OP.XOR,
+        _ir.ATOMIC_OP.XCHG: _interpreter.RMW_OP.XCHG,
+    }
+
+    def __init__(self) -> None:
+        self.arch = None
+        self.options = InterpreterOptions()
+        self.codegen_fns = {}
+        self.codegen_fns["convert_custom_types"] = ExtraFunctions._convert_custom_types
+        self.codegen_fns["min_dot_size"] = lambda lhsType, rhsType: (1, 1, 1)
+
+    def set_grid_idx(self, x, y, z):
+        if not x < self.grid_dim[0]:
+            raise ValueError("x >= grid_dim[0]")
+        if not y < self.grid_dim[1]:
+            raise ValueError("y >= grid_dim[1]")
+        if not z < self.grid_dim[2]:
+            raise ValueError("z >= grid_dim[2]")
+        self.grid_idx = (x, y, z)
+
+    def set_grid_dim(self, nx, ny, nz):
+        self.grid_dim = (nx, ny, nz)
+
+    # constants
+
+    def get_half_ty(self):
+        return tl.float16
+
+    def get_bf16_ty(self):
+        return tl.bfloat16
+
+    def get_float_ty(self):
+        return tl.float32
+
+    def get_double_ty(self):
+        return tl.float64
+
+    def get_int1_ty(self):
+        return tl.int1
+
+    def get_int8_ty(self):
+        return tl.int8
+
+    def get_uint8_ty(self):
+        return tl.uint8
+
+    def get_int16_ty(self):
+        return tl.int16
+
+    def get_uint16_ty(self):
+        return tl.uint16
+
+    def get_int32_ty(self):
+        return tl.int32
+
+    def get_uint32_ty(self):
+        return tl.uint32
+
+    def get_int64_ty(self):
+        return tl.int64
+
+    def get_uint64_ty(self):
+        return tl.uint64
+
+    def get_fp8e4nv_ty(self):
+        return tl.float8e4nv
+
+    def get_fp8e4b15_ty(self):
+        return tl.float8e4b15
+
+    def get_fp8e4b8_ty(self):
+        return tl.float8e4b8
+
+    def get_fp8e5_ty(self):
+        return tl.float8e5
+
+    def get_fp8e5b16_ty(self):
+        return tl.float8e5b16
+
+    def get_ptr_ty(self, elt_ty, addr_space):
+        return tl.pointer_type(elt_ty, addr_space)
+
+    def get_block_ty(self, dtype, shape):
+        return tl.block_type(dtype, shape)
+
+    def get_int1(self, value):
+        return TensorHandle(np.array([value], dtype=np.bool_), tl.int1)
+
+    def get_uint8(self, value):
+        return TensorHandle(np.array([value], dtype=np.uint8), tl.uint8)
+
+    def get_int8(self, value):
+        return TensorHandle(np.array([value], dtype=np.int8), tl.int8)
+
+    def get_uint16(self, value):
+        return TensorHandle(np.array([value], dtype=np.uint16), tl.uint16)
+
+    def get_int16(self, value):
+        return TensorHandle(np.array([value], dtype=np.int16), tl.int16)
+
+    def get_uint32(self, value):
+        return TensorHandle(np.array([value], dtype=np.uint32), tl.uint32)
+
+    def get_int32(self, value):
+        return TensorHandle(np.array([value], dtype=np.int32), tl.int32)
+
+    def get_uint64(self, value):
+        return TensorHandle(np.array([value], dtype=np.uint64), tl.uint64)
+
+    def get_int64(self, value):
+        return TensorHandle(np.array([value], dtype=np.int64), tl.int64)
+
+    def get_fp16(self, value):
+        return TensorHandle(np.array([value], dtype=np.float16), tl.float16)
+
+    def get_fp32(self, value):
+        return TensorHandle(np.array([value], dtype=np.float32), tl.float32)
+
+    def get_fp64(self, value):
+        return TensorHandle(np.array([value], dtype=np.float64), tl.float64)
+
+    def get_null_value(self, type):
+        return TensorHandle(np.array([0], dtype=_get_np_dtype(type)), type)
+
+    # programming model
+    def create_get_program_id(self, axis):
+        if self.grid_idx is None:
+            raise ValueError("grid_idx is None")
+        return TensorHandle(np.array([self.grid_idx[axis]], dtype=np.int32), tl.int32)
+
+    def create_get_num_programs(self, axis):
+        return TensorHandle(np.array([self.grid_dim[axis]], dtype=np.int32), tl.int32)
+
+    # memory ops
+    def create_load(self, ptr, _0, _1, is_volatile):
+        mask = TensorHandle(np.ones_like(ptr.data, dtype=bool), tl.int1)
+        other = None
+        return self.create_masked_load(ptr, mask, other, _0, _1, is_volatile)
+
+    def create_store(self, ptr, val, _0, _1):
+        mask = TensorHandle(np.ones_like(ptr.data, dtype=bool), tl.int1)
+        return self.create_masked_store(ptr, val, mask, None, None)
+
+    def create_masked_load(self, ptrs, mask, other, cache_modifier, eviction_policy, is_volatile):
+        dtype_tt = ptrs.get_element_ty()
+        dtype_np = _get_np_dtype(dtype_tt)
+        if other is None:
+            other = TensorHandle(np.zeros_like(ptrs.data, dtype=dtype_np), dtype_tt)
+        ret = _interpreter.load(ptrs.data, mask.data, other.data, dtype_np)
+        return TensorHandle(ret, dtype_tt)
+
+    def create_masked_store(self, ptrs, value, mask, cache_modifier, eviction_policy):
+        return _interpreter.store(ptrs.data, value.data, mask.data)
+
+    # casting ops
+    def cast_impl(self, src, dst_type):
+        src_element_type = src.dtype.scalar
+        dst_element_type = dst_type.scalar
+        if (src_element_type == tl.bfloat16 and dst_element_type == tl.float32) or \
+           (src_element_type == tl.float32 and dst_element_type == tl.bfloat16):
+            data = _convert_float(src.data, src_element_type, dst_element_type, None).view(_get_np_dtype(dst_type))
+            return TensorHandle(data, dst_type.scalar)
+        else:
+            return TensorHandle(src.data.astype(_get_np_dtype(dst_type)), dst_type.scalar)
+
+    create_si_to_fp = lambda self, src, dst_type: self.cast_impl(src, dst_type)
+    create_ui_to_fp = lambda self, src, dst_type: self.cast_impl(src, dst_type)
+    create_fp_to_si = lambda self, src, dst_type: self.cast_impl(src, dst_type)
+    create_fp_to_ui = lambda self, src, dst_type: self.cast_impl(src, dst_type)
+    create_fp_ext = lambda self, src, dst_type: self.cast_impl(src, dst_type)
+    create_fp_trunc = lambda self, src, dst_type: self.cast_impl(src, dst_type)
+    create_int_cast = lambda self, src, dst_type, is_signed: self.cast_impl(src, dst_type)
+
+    def create_fp_to_fp(self, src, dst_type, rounding_mode):
+        src_element_type = src.dtype.scalar
+        dst_element_type = dst_type.scalar
+        data = _convert_float(src.data, src_element_type, dst_element_type, rounding_mode).view(_get_np_dtype(dst_type))
+        return TensorHandle(data, dst_type.scalar)
+
+    def create_bitcast(self, src, dst_type):
+        return TensorHandle(src.data.view(_get_np_dtype(dst_type)), dst_type.scalar)
+
+    # binary operators
+    def binary_op(self, lhs, rhs, op):
+        output = op(lhs.data, rhs.data)
+        tl_dtype = lhs.dtype.scalar
+
+        if not _validate_np_data_size(output, tl_dtype):
+            output = output.astype(_get_np_dtype(tl_dtype))
+
+        return TensorHandle(output, tl_dtype)
+
+    create_fadd = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.add)
+    create_fmul = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.multiply)
+    create_fdiv = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.divide)
+    create_frem = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.fmod)
+    create_fsub = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.subtract)
+    create_mul = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.multiply)
+    create_precise_divf = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.divide)
+    create_sdiv = lambda self, lhs, rhs: self.create_idiv(lhs, rhs)
+    create_udiv = lambda self, lhs, rhs: self.create_idiv(lhs, rhs)
+    # LLVM has 'numpy.fmod', not 'numpy.remainder', semantics on integer remainders.
+    create_srem = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.fmod)
+    create_urem = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.fmod)
+    create_add = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.add)
+    create_sub = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.subtract)
+    create_shl = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.left_shift)
+    create_lshr = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.right_shift)
+    create_minsi = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.minimum)
+    create_minui = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.minimum)
+    create_minimumf = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.minimum)
+    create_minnumf = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.minimum)
+    create_maxsi = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.maximum)
+    create_maxui = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.maximum)
+    create_maximumf = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.maximum)
+    create_maxnumf = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.maximum)
+    create_icmpSLE = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.less_equal)
+    create_icmpSLT = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.less)
+    create_icmpSGE = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.greater_equal)
+    create_icmpSGT = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.greater)
+    create_icmpULE = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.less_equal)
+    create_icmpULT = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.less)
+    create_icmpUGE = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.greater_equal)
+    create_icmpUGT = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.greater)
+    create_icmpEQ = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.equal)
+    create_icmpNE = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.not_equal)
+    create_fcmpOLT = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.less)
+    create_fcmpOGT = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.greater)
+    create_fcmpOLE = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.less_equal)
+    create_fcmpOGE = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.greater_equal)
+    create_fcmpOEQ = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.equal)
+    create_fcmpONE = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.not_equal)
+    create_fcmpULT = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.less)
+    create_fcmpUGT = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.greater)
+    create_fcmpULE = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.less_equal)
+    create_fcmpUGE = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.greater_equal)
+    create_fcmpUEQ = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.equal)
+    create_fcmpUNE = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.not_equal)
+    create_and = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.bitwise_and)
+    create_xor = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.bitwise_xor)
+    create_or = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.bitwise_or)
+    create_int_to_ptr = create_bitcast
+    create_ptr_to_int = create_bitcast
+
+    def create_idiv(self, lhs, rhs):
+        # Triton has IEEE, not numpy/torch, semantics for %, and those carry
+        # through to //, so we have to use a nonstandard expression to get a
+        # reference result for //.
+        return TensorHandle((lhs.data - np.fmod(lhs.data, rhs.data)) // rhs.data, lhs.dtype.scalar)
+
+    def create_ashr(self, lhs, rhs):
+        # Triton's rshift operator depends on the signedness of the left operand
+        lhs_dtype = _get_signed_np_dtype(lhs.data.dtype)
+        rhs_dtype = _get_signed_np_dtype(rhs.data.dtype)
+        lhs.data = lhs.data.astype(lhs_dtype)
+        rhs.data = rhs.data.astype(rhs_dtype)
+        return self.binary_op(lhs, rhs, np.right_shift)
+
+    def create_umulhi(self, lhs, rhs):
+        dtype = lhs.data.dtype
+        if dtype == np.int64 or dtype == np.uint64:
+            return TensorHandle(np_umulhi_u64(lhs.data, rhs.data), lhs.dtype.scalar)
+        else:
+            compute_dtype = getattr(np, f"uint{dtype.itemsize * 8 * 2}")
+            lhs_data = lhs.data.astype(compute_dtype)
+            rhs_data = rhs.data.astype(compute_dtype)
+            ret_data = np.multiply(lhs_data, rhs_data) >> (dtype.itemsize * 8)
+            return TensorHandle(ret_data.astype(dtype), lhs.dtype.scalar)
+
+    # ternary functions
+    def ternary_op(self, lhs, rhs, other, op):
+        output = op(lhs.data, rhs.data, other.data)
+        tl_dtype = other.dtype.scalar
+
+        if not _validate_np_data_size(output, tl_dtype):
+            output = output.astype(_get_np_dtype(tl_dtype))
+
+        return TensorHandle(output, tl_dtype)
+
+    create_clampf = lambda self, arg, lo, hi, propagate_nans: self.ternary_op(arg, lo, hi, np.clip)
+    create_select = lambda self, cond, lhs, rhs: self.ternary_op(cond, lhs, rhs, np.where)
+
+    def create_fma(self, x, y, z):
+        return TensorHandle(x.data * y.data + z.data, z.dtype.scalar)
+
+    # unary functions
+    def unary_op(self, arg, op):
+        return TensorHandle(op(arg.data), arg.dtype.scalar)
+
+    def create_fabs(self, arg):
+        # Mask out the sign bit based on the primitive length
+        dtype_tt = arg.dtype
+        mask_bitwidth = dtype_tt.primitive_bitwidth - 1
+        np_uint_dtype = getattr(np, f"uint{dtype_tt.primitive_bitwidth}")
+        data = arg.data.view(np_uint_dtype)
+        mask = (1 << mask_bitwidth) - 1
+        ret = (data & mask).view(_get_np_dtype(dtype_tt))
+        return TensorHandle(ret, arg.dtype.scalar)
+
+    create_cos = lambda self, arg: self.unary_op(arg, np.cos)
+    create_exp = lambda self, arg: self.unary_op(arg, np.exp)
+    create_exp2 = lambda self, arg: self.unary_op(arg, np.exp2)
+    create_iabs = lambda self, arg: self.unary_op(arg, np.abs)
+    create_floor = lambda self, arg: self.unary_op(arg, np.floor)
+    create_ceil = lambda self, arg: self.unary_op(arg, np.ceil)
+    create_log = lambda self, arg: self.unary_op(arg, np.log)
+    create_log2 = lambda self, arg: self.unary_op(arg, np.log2)
+    create_precise_sqrt = lambda self, arg: self.unary_op(arg, np.sqrt)
+    create_sqrt = lambda self, arg: self.unary_op(arg, np.sqrt)
+    create_sin = lambda self, arg: self.unary_op(arg, np.sin)
+
+    def create_erf(self, arg):
+        ret = np_erf_fp32(arg.data) if arg.data.dtype == np.float32 else np_erf_fp64(arg.data)
+        return TensorHandle(ret, arg.dtype.scalar)
+
+    def create_rsqrt(self, arg):
+        return TensorHandle(1 / np.sqrt(arg.data), arg.dtype.scalar)
+
+    # tensor operators
+    create_reshape = lambda self, arg, shape, allow_reorder: TensorHandle(arg.data.reshape(shape), arg.dtype.scalar)
+
+    def create_trans(self, arg, perm):
+        return TensorHandle(np.transpose(arg.data, perm), arg.dtype.scalar)
+
+    def create_dot(self, a, b, d, input_precision, max_num_imprecise_acc):
+        a_data = a.data
+        b_data = b.data
+        if (a.dtype.primitive_bitwidth == 8 and a.dtype.is_floating()) or \
+           (b.dtype.primitive_bitwidth == 8 and b.dtype.is_floating()):
+            a_data = _convert_float(a_data, a.dtype, tl.float16, None).view(np.float16)
+            b_data = _convert_float(b_data, b.dtype, tl.float16, None).view(np.float16)
+        return TensorHandle(np.matmul(a_data, b_data, dtype=d.data.dtype) + d.data, d.dtype.scalar)
+
+    def create_make_range(self, ret_ty, start, stop):
+        return TensorHandle(np.arange(start, stop, dtype=np.int32), tl.int32)
+
+    def create_histogram(self, data, bins, mask):
+        if mask is None:
+            mask = TensorHandle(np.ones_like(data.data, dtype=bool), tl.int1)
+
+        # By default np.histogram returns int64 dtype values
+        # Docs specify that returned dtype is taken based on optional weights.dtype
+        # This is fix for interpreter cases where for example int32 tensor is being passed
+        # But unexpectedly int64 values are being returned causing
+        # tl.store to write 8 bytes instead of 4 bytes which lead to silent data corruption
+        dummy_weights = np.ones_like(data.data, dtype=data.data.dtype)
+
+        # force all masked elements to zero
+        data = np.where(mask.data, data.data, np.zeros_like(data.data))
+        histogram = np.histogram(data, bins=bins, range=(0, bins), weights=dummy_weights)[0]
+        # remove overcounted elements
+        histogram[0] -= np.logical_not(mask.data).sum()
+        return TensorHandle(histogram, tl.int32)
+
+    def create_gather(self, src, indices, axis):
+        return TensorHandle(np.take_along_axis(src.data, indices.data, axis=axis), src.dtype.scalar)
+
+    # pointer arithmetic
+
+    def create_addptr(self, ptr, offset):
+        dtype_tt = ptr.get_element_ty()
+        element_bitwidth = dtype_tt.primitive_bitwidth
+        # int1's bitwidth is 1, but we need to use 8 for pointer arithmetic
+        element_bytewidth = max(1, element_bitwidth // 8)
+        return TensorHandle(ptr.data + element_bytewidth * offset.data.astype(np.uint64), ptr.dtype)
+
+    def create_tensor_pointer_load(self, ptr, boundary_check, padding_option, cache_modifier, eviction_policy,
+                                   is_volatile):
+        ptrs, masks = ptr.materialize_pointers(boundary_check)
+        dtype_tt = ptrs.get_element_ty()
+        dtype_np = _get_np_dtype(dtype_tt)
+        if padding_option is None:
+            other = None
+        elif padding_option == _ir.PADDING_OPTION.PAD_ZERO:
+            other = TensorHandle(np.zeros_like(ptrs.data, dtype=dtype_np), dtype_tt)
+        elif padding_option == _ir.PADDING_OPTION.PAD_NAN:
+            other = TensorHandle(np.full_like(ptrs.data, float('nan'), dtype=dtype_np), dtype_tt)
+        else:
+            raise ValueError(f"unsupported padding option {padding_option}")
+        return self.create_masked_load(ptrs, masks, other, cache_modifier, eviction_policy, is_volatile)
+
+    def create_tensor_pointer_store(self, ptr, value, boundary_check, cache_modifier, eviction_policy):
+        ptrs, masks = ptr.materialize_pointers(boundary_check)
+        return self.create_masked_store(ptrs, value, masks, cache_modifier, eviction_policy)
+
+    def create_expand_dims(self, arg, axis):
+        return TensorHandle(np.expand_dims(arg.data, axis), arg.dtype.scalar)
+
+    def create_broadcast(self, arg, shape):
+        return TensorHandle(np.broadcast_to(arg.data, shape), arg.dtype.scalar)
+
+    def create_cat(self, lhs, rhs):
+        return TensorHandle(np.concatenate([lhs.data, rhs.data]), lhs.dtype.scalar)
+
+    def create_join(self, lhs, rhs):
+        # Triton only supports joining two original tensors into a new one along the last axis
+        return TensorHandle(np.stack([lhs.data, rhs.data], axis=-1), lhs.dtype.scalar)
+
+    def create_split(self, val):
+        # Triton only supports splitting the original tensor into two along the last axis
+        return (TensorHandle(val.data[..., 0], val.dtype.scalar), TensorHandle(val.data[..., 1], val.dtype.scalar))
+
+    def create_splat(self, ret_ty, arg):
+        shape = ret_ty.shape
+        if isinstance(arg.dtype, tl.block_type):
+            return TensorHandle(np.full(shape, arg.data[0], dtype=_get_np_dtype(arg.dtype)), arg.dtype.scalar)
+        else:  # scalar
+            return TensorHandle(np.full(shape, arg.data, dtype=_get_np_dtype(arg.dtype)), arg.dtype.scalar)
+
+    def create_unsplat(self, arg):
+        return TensorHandle(np.full((1, ), arg.data[0], dtype=_get_np_dtype(arg.dtype)), arg.dtype.scalar)
+
+    def create_atomic_cas(self, ptr, cmp, val, sem, scope):
+        if sem not in self.ir_sem_to_interpreter_sem:
+            raise ValueError(f"unsupported semantic {sem}")
+        sem = self.ir_sem_to_interpreter_sem[sem]
+        return TensorHandle(_interpreter.atomic_cas(ptr.data, cmp.data, val.data, sem), cmp.dtype.scalar)
+
+    def create_atomic_rmw(self, rmwOp, ptr, val, mask, sem, scope):
+        if rmwOp not in self.ir_rmw_op_to_interpreter_rmw_op:
+            raise ValueError(f"unsupported rmwOp {rmwOp}")
+        if sem not in self.ir_sem_to_interpreter_sem:
+            raise ValueError(f"unsupported semantic {sem}")
+        rmwOp = self.ir_rmw_op_to_interpreter_rmw_op[rmwOp]
+        sem = self.ir_sem_to_interpreter_sem[sem]
+        return TensorHandle(_interpreter.atomic_rmw(rmwOp, ptr.data, val.data, mask.data, sem), val.dtype.scalar)
+
+    def create_extern_elementwise(self, libName, libPath, symbol, argList, retType, isPure):
+        raise NotImplementedError("extern_elementwise not supported in interpreter mode")
+
+    def create_inline_asm(self, inlineAsm, constraints, values, type, isPure, pack):
+        raise NotImplementedError("inline_asm not supported in interpreter mode")
+
+    def create_print(self, prefix, hex, values, isSigned):
+        # NOTE: the `isSigned` variable is not really used here; because Signness is already known
+        # by `values` themselves in python interpreter, thus not really needed here;
+        # it is only used for triton PrintOpToLLVM to correctly construct the format specifier.
+        # Interpreter's device_print function has a different format than Triton's device_print
+        msg = f"({self.grid_idx[0]}, {self.grid_idx[1]}, {self.grid_idx[2]})"
+        if prefix:
+            msg += f" {prefix}"
+        if hex:
+            np.set_printoptions(formatter={'all': lambda x: f"0x{x:02x}"})
+        for value in values:
+            print(msg + f" {value.data}")
+        if hex:
+            np.set_printoptions(formatter=None)
+
+    def create_assert(self, condition, message):
+        # Interpreter's device_assert function has a different format than Triton's device_assert
+        assert condition, f"{message}"
+
+    def create_assume(self, condition):
+        assert condition, "Assume failed"
+
+    def create_barrier(self):
+        # Triton's barrier applies to each program in a grid, so it's a no-op in the interpreter
+        pass
+
+    def create_make_block_ptr(self, base, shape, strides, offsets, block_shape, order):
+        # Create new offsets to avoid modifying the original
+        new_offsets = [offset.clone() for offset in offsets]
+        return BlockPointerHandle(base, shape, strides, new_offsets, block_shape, order)
+
+    def create_advance(self, ptr, offsets):
+        if len(ptr.offsets) != len(offsets):
+            raise ValueError("len(ptr.offsets) != len(offsets)")
+        # Create new offsets to avoid modifying the original
+        new_offsets = [offset.clone() for offset in ptr.offsets]
+        ret = BlockPointerHandle(ptr.base, ptr.shape, ptr.strides, new_offsets, ptr.block_shape, ptr.order)
+        for i in range(len(offsets)):
+            ret.offsets[i].data += offsets[i].data
+        return ret
+
+    def create_make_tensor_descriptor(self, base: TensorHandle, shape: List[TensorHandle], strides: List[TensorHandle],
+                                      tensor_shape: List[int], is_signed: bool, padding: str = "zero"):
+        desc = TensorDescHandle(base, shape, strides, tensor_shape, padding)
+        desc.validate()
+        return desc
+
+    def create_descriptor_load(self, desc: TensorDescHandle, indices: List[TensorHandle], cache_modifier,
+                               eviction_policy):
+        assert isinstance(desc, TensorDescHandle)
+        ptrs, mask = desc.materialize_pointers(indices)
+        dtype_tt = ptrs.get_element_ty()
+        dtype_np = _get_np_dtype(dtype_tt)
+        padding = desc.padding
+        if padding == _ir.PADDING_OPTION.PAD_ZERO:
+            other = TensorHandle(np.zeros_like(ptrs.data, dtype=dtype_np), dtype_tt)
+        elif padding == _ir.PADDING_OPTION.PAD_NAN:
+            other = TensorHandle(np.full_like(ptrs.data, float('nan'), dtype=dtype_np), dtype_tt)
+        else:
+            raise ValueError(f"unsupported padding {padding}")
+        return self.create_masked_load(ptrs, mask, other, cache_modifier=cache_modifier,
+                                       eviction_policy=eviction_policy, is_volatile=False)
+
+    def create_descriptor_store(self, desc: TensorDescHandle, value: TensorHandle, indices: List[TensorHandle]):
+        ptrs, mask = desc.materialize_pointers(indices)
+        return self.create_masked_store(ptrs, value, mask, None, None)
+
+    def create_descriptor_gather(self, desc: TensorDescHandle, x_offsets: TensorHandle, y_offset: TensorHandle, type):
+        dtype = desc.base.dtype.element_ty
+        np_dtype = _get_np_dtype(dtype)
+        result = np.zeros([x_offsets.data.shape[0], desc.block_shape[-1]], dtype=np_dtype)
+        cache_modifier = None
+        eviction_policy = None
+        for i, x_offset in enumerate(x_offsets.data):
+            indices = [TensorHandle(x_offset, tl.int32), y_offset]
+            result[i, :] = self.create_descriptor_load(desc, indices, cache_modifier, eviction_policy).data
+        return TensorHandle(result, dtype)
+
+    def create_descriptor_scatter(self, desc: TensorDescHandle, value: TensorHandle, x_offsets: TensorHandle,
+                                  y_offset: TensorHandle):
+        for i, x_offset in enumerate(x_offsets.data):
+            slice = TensorHandle(value.data[i], value.dtype)
+            indices = [TensorHandle(x_offset, tl.int32), y_offset]
+            self.create_descriptor_store(desc, slice, indices)
+
+    def get_all_ones_value(self, type):
+        np_type = _get_np_dtype(type)
+        if "int" in np_type.name:
+            return TensorHandle(np.full(1, -1, dtype=np_type), type.scalar)
+        elif np_type == np.bool_:
+            return TensorHandle(np.full(1, True, dtype=np_type), type.scalar)
+        else:
+            raise TypeError(f"unsupported type {type}")
+
+
+# Unique sentinel meaning "the attribute did not exist before it was patched":
+# _LangPatchScope.set_attr records it and restore() then deletes the attribute
+# instead of assigning a value back.
+_MISSING = object()
+
+
+class _LangPatchScope:
+    """Tracks patched attributes so they can be restored."""
+
+    def __init__(self) -> None:
+        self._changes: list[tuple[object, str, object]] = []
+
+    def set_attr(self, obj: object, name: str, value: object) -> None:
+        original = getattr(obj, name, _MISSING)
+        self._changes.append((obj, name, original))
+        setattr(obj, name, value)
+
+    def restore(self) -> None:
+        while self._changes:
+            obj, name, original = self._changes.pop()
+            if original is _MISSING:
+                delattr(obj, name)
+            else:
+                setattr(obj, name, original)
+
+
+def _patch_attr(obj, name, member, builder, scope: _LangPatchScope):
+    semantic = TritonSemantic(builder)
+    new_member = lambda *args, member=member, **kwargs: (member(*args, **
+                                                                {k: v
+                                                                 for k, v in kwargs.items()
+                                                                 if k != "_semantic"}, _semantic=semantic))
+    scope.set_attr(obj, name, new_member)
+
+
+def _patch_builtin(pkg, builder, scope: _LangPatchScope):
+    for name, member in inspect.getmembers(pkg):
+        if tl.core.is_builtin(member):
+            _patch_attr(pkg, name, member, builder, scope)
+
+
+def _patch_lang_tensor(tensor, scope: _LangPatchScope):
+
+    def _get_bool(self):
+        data = self.handle.data
+        # in triton, only scalars can be converted to booleans
+        # here we need this hack because all scalars are tensors
+        return bool(data) if data.size == 1 else True
+
+    def _get_transpose(self):
+        handle = TensorHandle(np.transpose(self.handle.data), self.handle.dtype)
+        assert self.type.is_block()
+        block_shape = list(self.type.shape)
+        block_shape[-1], block_shape[-2] = block_shape[-2], block_shape[-1]
+        res_ty = tl.core.block_type(self.dtype, block_shape)
+        return tl.core.tensor(handle, res_ty)
+
+    scope.set_attr(tensor, "__index__", lambda self: int(self.handle.data))
+    scope.set_attr(tensor, "__bool__", lambda self: _get_bool(self))
+    scope.set_attr(tensor, "__repr__", lambda self: repr(self.handle.data))
+    scope.set_attr(tensor, "__str__", lambda self: str(self.handle.data))
+    scope.set_attr(tensor, "T", property(_get_transpose))
+
+
+class ReduceScanOpInterface:
+
+    def __init__(self, axis, combine_fn):
+        self.axis = axis
+        self.combine_fn = combine_fn
+
+    def check_axis(self, shape, axis):
+        if axis is not None and axis >= len(shape):
+            raise ValueError(f"axis {axis} out of bounds for shape {shape}")
+
+    def check_tensor(self, input):
+        for arg in input:
+            if not isinstance(arg, tl.core.tensor):
+                raise ValueError(f"input must be a tensor, got {type(arg)}")
+            self.check_axis(arg.shape, self.axis)
+
+    def to_tensor(self, ret, dtype):
+        np_dtype = _get_np_dtype(dtype)
+        if hasattr(ret, "shape") and ret.shape:
+            ret = ret.astype(np_dtype)
+            ret_type = tl.block_type(dtype, list(ret.shape))
+        else:
+            ret = np.array([ret], dtype=np_dtype)
+            ret_type = dtype
+        return tl.core.tensor(TensorHandle(ret, dtype.scalar), ret_type)
+
+    def apply(self, input):
+        if not isinstance(input, tuple):
+            return self.apply((input, ))[0]
+        self.check_tensor(input)
+        ret = self.apply_impl(input)
+        return tuple(ret) if isinstance(ret, (list, tuple)) else (ret, )
+
+
+class ReduceOps(ReduceScanOpInterface):
+    """Interpreter-side reduction: folds tensors along ``axis`` with ``combine_fn``.
+
+    Combine functions recognised from ``tl.standard`` are mapped onto NumPy
+    reductions; every other combine function goes through ``generic_reduce``,
+    which invokes ``combine_fn.fn`` once per element.
+    """
+
+    def __init__(self, axis, combine_fn, keep_dims):
+        super().__init__(axis, combine_fn)
+        self.keep_dims = keep_dims
+
+    def unravel(self, input, axis):
+        """Return ``(operands, axis)`` ready for an axis-wise reduction.
+
+        When ``axis`` is None every operand is flattened to 1-D and the
+        returned axis is 0; otherwise the operands are passed through as is.
+        """
+        if axis is not None:
+            return tuple(input), axis
+        # Flatten *all* operands. Deciding per operand while overwriting
+        # ``axis`` would flatten only the first one and leave the remaining
+        # operands with a shape that no longer matches it.
+        flattened = tuple(self.to_tensor(data.handle.data.flatten(), data.dtype) for data in input)
+        return flattened, 0
+
+    def generic_reduce(self, input):
+        """Reduce with an arbitrary ``combine_fn`` by visiting every element.
+
+        Returns a list holding one reduced tensor per operand.
+        """
+        original_axis = self.axis
+        input, axis = self.unravel(input, self.axis)
+        input_shape = input[0].handle.data.shape
+        if axis < 0:
+            # The tuple slicing below only works with a non-negative axis.
+            axis += len(input_shape)
+        output_shape = input_shape[0:axis] + input_shape[axis + 1:]
+        input_data = [arg.handle.data for arg in input]
+        output_data = [np.zeros(output_shape, dtype=data.dtype) for data in input_data]
+        # Reduce on axis
+        for i in range(input_data[0].size):
+            # Recover input_index from i using input_shape
+            input_index = np.unravel_index(i, input_shape)
+            # The output has the reduced dimension removed
+            output_index = input_index[0:axis] + input_index[axis + 1:]
+            input_tuple = tuple(self.to_tensor(d[input_index], input[ii].dtype) for ii, d in enumerate(input_data))
+            if input_index[axis] == 0:
+                # First element along the axis seeds the accumulator
+                for j, out in enumerate(output_data):
+                    out[output_index] = input_tuple[j].handle.data.item()
+            else:
+                acc_tuple = tuple(self.to_tensor(o[output_index], input[oi].dtype) for oi, o in enumerate(output_data))
+                combine_fn_ret = self.combine_fn.fn(*acc_tuple, *input_tuple)
+                # A single-output combine function returns a bare value
+                acc_tuple = (combine_fn_ret, ) if not isinstance(combine_fn_ret, tuple) else combine_fn_ret
+                for j, out in enumerate(output_data):
+                    out[output_index] = acc_tuple[j].handle.data.item() if isinstance(
+                        acc_tuple[j], tl.core.tensor) else acc_tuple[j]
+        # Pack output
+        ret = []
+        for i, data in enumerate(output_data):
+            if self.keep_dims:
+                if original_axis is not None:
+                    data = np.expand_dims(data, axis)
+                else:
+                    # Full reduction: restore one unit dimension per input dimension
+                    for _ in range(len(input_shape)):
+                        data = np.expand_dims(data, 0)
+
+            elif original_axis is None:
+                # Take a scalar
+                data = data.item()
+            ret.append(self.to_tensor(data, input[i].dtype))
+        return ret
+
+    def min_max(self, input, val_reduce_op, idx_reduce_op=None):
+        """Apply a NumPy value reduction and/or index reduction along ``self.axis``.
+
+        Returns ``(val, idx)`` when both ops are given, otherwise the single
+        result. Raises ValueError when both ops are None.
+        """
+        # If input is a tuple, it must be (val, index), and we only take val
+        input = input[0] if isinstance(input, tuple) else input
+        val = None
+        idx = None
+        if val_reduce_op:
+            val = self.to_tensor(val_reduce_op(input.handle.data, axis=self.axis, keepdims=self.keep_dims), input.dtype)
+        if idx_reduce_op:
+            # Indices are always reported as int32
+            idx = self.to_tensor(idx_reduce_op(input.handle.data, axis=self.axis, keepdims=self.keep_dims), tl.int32)
+        if val is not None and idx is not None:
+            return val, idx
+        elif val is not None:
+            return val
+        elif idx is not None:
+            return idx
+        else:
+            raise ValueError("val_reduce_op and idx_reduce_op are both None")
+
+    def sum(self, input):
+        """Sum ``input`` along ``self.axis`` with ``np.sum``."""
+        return self.to_tensor(np.sum(input.handle.data, axis=self.axis, keepdims=self.keep_dims), input.dtype)
+
+    def apply_impl(self, input):
+        """Dispatch on ``combine_fn``: NumPy fast paths first, generic loop otherwise."""
+        if self.combine_fn == tl.standard._argmin_combine_tie_break_left:
+            return self.min_max(input[0], val_reduce_op=np.min, idx_reduce_op=np.argmin)
+        elif self.combine_fn == tl.standard._argmax_combine_tie_break_left:
+            return self.min_max(input[0], val_reduce_op=np.max, idx_reduce_op=np.argmax)
+        elif self.combine_fn == tl.standard._elementwise_max:
+            return self.min_max(input[0], val_reduce_op=np.nanmax, idx_reduce_op=None)
+        elif self.combine_fn == tl.standard._elementwise_min:
+            return self.min_max(input[0], val_reduce_op=np.nanmin, idx_reduce_op=None)
+        elif self.combine_fn == tl.standard._sum_combine:
+            return self.sum(input[0])
+        else:
+            # Fall back to the slow mode
+            return self.generic_reduce(input)
+
+
+class ScanOps(ReduceScanOpInterface):
+    """Interpreter-side associative scan along ``axis``, optionally reversed.
+
+    Sum and product scans use ``np.cumsum`` / ``np.cumprod``; any other
+    combine function goes through the element-by-element ``generic_scan``.
+    """
+
+    def __init__(self, axis, combine_fn, reverse):
+        super().__init__(axis, combine_fn)
+        self.reverse = reverse
+
+    def cumsum(self, input):
+        """Cumulative sum along ``self.axis``; returns a one-element list."""
+        return [self.to_tensor(np.cumsum(input.handle.data, axis=self.axis), dtype=input.dtype)]
+
+    def cumprod(self, input):
+        """Cumulative product along ``self.axis``; returns a one-element list."""
+        return [self.to_tensor(np.cumprod(input.handle.data, axis=self.axis), dtype=input.dtype)]
+
+    def generic_scan(self, input):
+        """Scan with an arbitrary ``combine_fn`` by visiting every element.
+
+        Returns a list holding one scanned tensor per operand.
+        """
+        shape = input[0].handle.data.shape
+        # ``prev_index`` below matches dimensions by position, so a negative
+        # axis must be converted to its non-negative equivalent first;
+        # otherwise no dimension matches and the zero-initialised output slot
+        # is read instead of the previous element.
+        axis = self.axis + len(shape) if self.axis < 0 else self.axis
+        input_data = [arg.handle.data for arg in input]
+        output_data = [np.zeros(shape, dtype=data.dtype) for data in input_data]
+        # Scan on axis
+        for flat in range(input_data[0].size):
+            # Recover index from the flat position using shape
+            index = np.unravel_index(flat, shape)
+            data = tuple(self.to_tensor(d[index], input[ii].dtype) for ii, d in enumerate(input_data))
+            if index[axis] == 0:
+                # First element along the axis is copied through
+                for j, out in enumerate(output_data):
+                    out[index] = data[j].handle.data.item()
+            else:
+                # Same position, one step back along the scan axis
+                prev_index = tuple(pos - 1 if dim == axis else pos for dim, pos in enumerate(index))
+                acc_tuple = tuple(self.to_tensor(o[prev_index], input[oi].dtype) for oi, o in enumerate(output_data))
+                combine_fn_ret = self.combine_fn.fn(*acc_tuple, *data)
+                # A single-output combine function returns a bare value
+                acc_tuple = (combine_fn_ret, ) if not isinstance(combine_fn_ret, tuple) else combine_fn_ret
+                for j, out in enumerate(output_data):
+                    out[index] = acc_tuple[j].handle.data.item() if isinstance(
+                        acc_tuple[j], tl.core.tensor) else acc_tuple[j]
+        # Pack output
+        return [self.to_tensor(data, input[i].dtype) for i, data in enumerate(output_data)]
+
+    def apply_impl(self, input):
+        """Run the scan; a reverse scan flips the data before and after."""
+        new_input = []
+        if self.reverse:
+            for arg in input:
+                new_input.append(self.to_tensor(np.flip(arg.handle.data, axis=self.axis), arg.dtype))
+        else:
+            new_input = input
+        if self.combine_fn == tl.standard._sum_combine:
+            ret = self.cumsum(new_input[0])
+        elif self.combine_fn == tl.standard._prod_combine:
+            ret = self.cumprod(new_input[0])
+        else:
+            # Fall back to the slow mode
+            ret = self.generic_scan(new_input)
+        if self.reverse:
+            # Undo the flip so results line up with the original element order
+            for arg in ret:
+                arg.handle.data = np.flip(arg.handle.data, axis=self.axis)
+        return ret
+
+
+def _patch_reduce_scan(scope: _LangPatchScope):
+    """Route ``reduce`` and ``associative_scan`` to the NumPy-based ops."""
+
+    # Because interpreter doesn't support region_builder_fn, we cannot patch the builder
+    # to use the new reduce and scan functions.
+    # Instead, we need to patch the reduce and scan functions in tl and tl.core
+    def _new_reduce(input, axis, combine_fn, keep_dims=False, **kwargs):
+        reducer = ReduceOps(axis, combine_fn, keep_dims)
+        return reducer.apply(input)
+
+    def _new_scan(input, axis, combine_fn, reverse=False, **kwargs):
+        scanner = ScanOps(axis, combine_fn, reverse)
+        return scanner.apply(input)
+
+    for module in (tl, tl.core):
+        scope.set_attr(module, "reduce", _new_reduce)
+        scope.set_attr(module, "associative_scan", _new_scan)
+
+
+def _patch_lang_core(lang, scope: _LangPatchScope):
+    """Replace core language helpers on ``lang`` with interpreter versions.
+
+    Every replacement is registered through ``scope.set_attr``.
+    """
+
+    # dtype name -> name of the builder method producing the matching IR type.
+    # We need to specify signedness for integer types in the numpy mode
+    ir_type_getters = {
+        'void': 'get_void_ty',
+        'int1': 'get_int1_ty',
+        'int8': 'get_int8_ty',
+        'uint8': 'get_uint8_ty',
+        'int16': 'get_int16_ty',
+        'uint16': 'get_uint16_ty',
+        'int32': 'get_int32_ty',
+        'uint32': 'get_uint32_ty',
+        'int64': 'get_int64_ty',
+        'uint64': 'get_uint64_ty',
+        'fp8e5': 'get_fp8e5_ty',
+        'fp8e4nv': 'get_fp8e4nv_ty',
+        'fp8e4b15': 'get_fp8e4b15_ty',
+        'fp16': 'get_half_ty',
+        'bf16': 'get_bf16_ty',
+        'fp32': 'get_float_ty',
+        'fp64': 'get_double_ty',
+    }
+
+    def _new_to_ir(self, builder):
+        getter_name = ir_type_getters.get(self.name)
+        if getter_name is None:
+            raise ValueError(f'fail to convert {self} to ir type')
+        return getattr(builder, getter_name)()
+
+    # can't just map lang.static_range to `range`, because `tl.static_range`
+    # can get `step` passed by keyword
+    def _new_range(arg1, arg2=None, step=None, **kwargs):
+        # One positional bound means "stop"; two mean "start, stop"
+        start, end = (0, arg1) if arg2 is None else (arg1, arg2)
+        return range(start, end, 1 if step is None else step)
+
+    def _new_static_assert(cond, msg=""):
+        assert cond, msg
+
+    def _set_attr(input, values, name):
+        # skip non tensor types. This may happen for induction variables.
+        if not isinstance(input, tl.tensor):
+            return input
+        if not isinstance(values, (list, tuple)):
+            values = [values]
+        # Unwrap constexpr
+        unwrapped = [v.value if isinstance(v, tl.constexpr) else v for v in values]
+        # One value per dimension; a 0-d tensor still takes exactly one value
+        expected = max(1, len(input.shape))
+        if len(unwrapped) != expected:
+            raise ValueError(f"len(values) != len(input.shape) for {name}")
+        input.handle.set_attr(name, unwrapped)
+        return input
+
+    builtin_replacements = (
+        ("range", _new_range),
+        ("static_range", _new_range),
+        ("static_assert", _new_static_assert),
+        ("static_print", print),
+    )
+    for attr, replacement in builtin_replacements:
+        scope.set_attr(lang, attr, replacement)
+
+    scope.set_attr(lang.dtype, "to_ir", _new_to_ir)
+
+    hint_attributes = (
+        ("multiple_of", "tt.divisibility"),
+        ("max_contiguous", "tt.contiguity"),
+        ("max_constancy", "tt.constancy"),
+    )
+    for attr, ir_name in hint_attributes:
+        scope.set_attr(lang, attr, partial(_set_attr, name=ir_name))
+
+    _patch_reduce_scan(scope)
+
+
+def _patch_lang(fn):
+    """Patch every ``tl`` / ``tl.core`` module visible in ``fn``'s globals.
+
+    Returns the ``_LangPatchScope`` that recorded the patches.
+    """
+    scope = _LangPatchScope()
+    langs = [obj for obj in fn.__globals__.values() if inspect.ismodule(obj) and obj in [tl, tl.core]]
+    assert len(langs) >= 1, "triton.language must be visible from within jit'd function"
+    for lang in langs:
+        for target in (lang, lang.tensor):
+            _patch_builtin(target, interpreter_builder, scope)
+        if lang == tl:
+            _patch_builtin(lang.math, interpreter_builder, scope)
+        _patch_lang_tensor(lang.tensor, scope)
+        _patch_lang_core(lang, scope)
+    _patch_builtin(tl.core.tensor_descriptor_base, interpreter_builder, scope)
+    return scope
+
+
+def _tuple_create(arg, contents):
+    """Build a new tuple of the same type as ``arg`` from ``contents``."""
+    cls = type(arg)
+    # A NamedTuple constructor takes its fields as separate arguments, while
+    # a plain tuple takes one iterable. Both report type "tuple", so the
+    # presence of "_fields" is what tells them apart.
+    if hasattr(arg, "_fields"):
+        return cls(*contents)
+    return cls(contents)
+
+
+# TODO: wrap everything in triton tensors
+def _implicit_cvt(arg):
+    """Convert a host-side kernel argument into the form the interpreted kernel uses.
+
+    Integers and objects exposing ``data_ptr`` become single-element
+    ``tl.tensor`` values, tuples are converted element-wise, tensor
+    descriptors are rebuilt through ``TritonSemantic``; anything else is
+    returned unchanged.
+    """
+    if isinstance(arg, int):
+        ty = tl.str_to_ty(triton.runtime.jit.mangle_type(arg), None)
+        # The first half-open range [lo, hi) containing the value decides the
+        # NumPy storage type.
+        storage_ranges = (
+            (-2**31, 2**31, np.int32),
+            (2**31, 2**32, np.uint32),
+            (-2**63, 2**63, np.int64),
+            (2**63, 2**64, np.uint64),
+        )
+        for lo, hi, candidate in storage_ranges:
+            if lo <= arg < hi:
+                dtype = candidate
+                break
+        else:
+            raise ValueError(f"Unsupported integer value {arg}")
+        return tl.tensor(TensorHandle(np.array([arg], dtype=dtype), ty), ty)
+
+    if hasattr(arg, "data_ptr"):
+        # Only the address is wrapped, stored as a uint64
+        ty = tl.str_to_ty(triton.runtime.jit.mangle_type(arg), None)
+        address = np.array([arg.data_ptr()], dtype=np.uint64)
+        return tl.tensor(TensorHandle(address, ty), ty)
+
+    if isinstance(arg, tuple):
+        return _tuple_create(arg, map(_implicit_cvt, arg))
+
+    if not isinstance(arg, TensorDescriptor):
+        return arg
+
+    strides = [_implicit_cvt(s) for s in arg.strides]
+    assert arg.strides[-1] == 1
+    # The innermost stride is passed as a compile-time constant
+    strides[-1] = tl.constexpr(1)
+    semantic = TritonSemantic(InterpreterBuilder())
+    base = _implicit_cvt(arg.base)
+    shape = [_implicit_cvt(s) for s in arg.shape]
+    block_shape = [tl.constexpr(b) for b in arg.block_shape]
+    return semantic.make_tensor_descriptor(base=base, shape=shape, strides=strides, block_shape=block_shape,
+                                           padding_option=arg.padding)
+
+
+# Module-level builder shared by _patch_lang and GridExecutor, plus the
+# semantic object that rewritten kernels reference by the name
+# "interpreter_semantic" (see ASTTransformer.visit_Assign).
+interpreter_builder = InterpreterBuilder()
+interpreter_semantic = TritonSemantic(interpreter_builder)
+
+
+def _unwrap_tensor(t):
+    """Return ``t.base`` for a ``TensorWrapper``; any other value unchanged."""
+    return t.base if isinstance(t, triton.runtime.jit.TensorWrapper) else t
+
+
+def _rewrap_tensor(t, original_tensor):
+    """Wrap ``t`` in a ``TensorWrapper`` when ``original_tensor`` was one."""
+    wrapper_cls = triton.runtime.jit.TensorWrapper
+    if not isinstance(original_tensor, wrapper_cls):
+        return t
+    # Carry over the dtype recorded on the original wrapper
+    return wrapper_cls(t, original_tensor.dtype)
+
+
+class GridExecutor:
+    """Runs an interpreted kernel once for every index of a launch grid.
+
+    Tensor arguments are mirrored on the CPU before the run and their storages
+    are copied back to the originals afterwards.
+    """
+
+    def __init__(self, fn, arg_names, grid, pre_run_hooks=None):
+        from .jit import _normalize_ty  # TODO: modularize
+
+        self.fn = fn
+        self.arg_names = arg_names
+        self.grid = grid
+        # None stands for "no hooks"; avoids sharing one default list between instances
+        self.pre_run_hooks = [] if pre_run_hooks is None else pre_run_hooks
+        annotations = {name: _normalize_ty(ty) for name, ty in fn.__annotations__.items()}
+        self.constexprs = [name for name in arg_names if annotations.get(name) == "constexpr"]
+
+    def _init_args_hst(self, args_dev, kwargs):
+        """Return CPU counterparts ``(args_hst, kwargs_hst)`` of the given arguments."""
+        # data_ptr of a source storage -> its CPU copy, so that arguments
+        # sharing one storage keep sharing it on the host
+        storages = {}
+
+        def _to_cpu(arg):
+            if isinstance(arg, tuple):
+                return _tuple_create(arg, map(_to_cpu, arg))
+            elif isinstance(arg, TensorDescriptor):
+                return TensorDescriptor(
+                    _to_cpu(arg.base),
+                    arg.shape,
+                    arg.strides,
+                    arg.block_shape,
+                    arg.padding,
+                )
+            elif not hasattr(arg, "data_ptr"):
+                return arg
+
+            unwrapped_arg = _unwrap_tensor(arg)
+            if unwrapped_arg.untyped_storage().data_ptr() not in storages:
+                storage = unwrapped_arg.untyped_storage()
+                storages[storage.data_ptr()] = storage.cpu()
+
+            storage = storages[unwrapped_arg.untyped_storage().data_ptr()]
+            # Build a CPU view over the copied storage with the original
+            # offset, size and strides
+            cpu_arg = unwrapped_arg.new_empty(0, device='cpu')
+            cpu_arg.set_(storage, unwrapped_arg.storage_offset(), unwrapped_arg.size(), unwrapped_arg.stride())
+            cpu_arg = _rewrap_tensor(cpu_arg, original_tensor=arg)
+            return cpu_arg
+
+        args_hst = [_to_cpu(arg) for arg in args_dev]
+
+        # Process keyword arguments
+        kwargs_hst = {key: _to_cpu(value) for key, value in kwargs.items()}
+        return args_hst, kwargs_hst
+
+    def _restore_args_dev(self, args_dev, args_hst, kwargs, kwargs_hst):
+        """Copy every host storage back into the storage it was created from."""
+        # data_ptr of the original storage -> (original storage, host storage);
+        # keyed so that a shared storage is copied back only once
+        storages = {}
+
+        def _from_cpu(arg_dev, arg_hst):
+            if hasattr(arg_dev, "data_ptr"):
+                # No need to rewrap because this just modifies internal
+                arg_dev, arg_hst = _unwrap_tensor(arg_dev), _unwrap_tensor(arg_hst)
+                storages[arg_dev.untyped_storage().data_ptr()] = (arg_dev.untyped_storage(), arg_hst.untyped_storage())
+            elif isinstance(arg_dev, tuple):
+                for item_dev, item_hst in zip(arg_dev, arg_hst):
+                    _from_cpu(item_dev, item_hst)
+            elif isinstance(arg_dev, TensorDescriptor):
+                _from_cpu(arg_dev.base, arg_hst.base)
+
+        for arg_dev, arg_hst in zip(args_dev, args_hst):
+            _from_cpu(arg_dev, arg_hst)
+
+        # Restore keyword arguments
+        for key, kwarg_dev in kwargs.items():
+            _from_cpu(kwarg_dev, kwargs_hst[key])
+
+        for storage_dev, storage_hst in storages.values():
+            storage_dev.copy_(storage_hst)
+
+    def __call__(self, *args_dev, **kwargs):
+        """Run ``self.fn`` for every grid index, then propagate side effects.
+
+        Raises InterpreterError wrapping any exception raised by the kernel,
+        unless ``triton.knobs.compilation.front_end_debugging`` is set, in
+        which case the original exception propagates.
+        """
+        # Removes not used reserved keywords from kwargs
+        # Triton doesn't support keyword-only, variable positional or variable keyword arguments
+        # It's safe to inspect only positional or keyword arguments (i.e., argspec.args)
+        argspec = inspect.getfullargspec(self.fn)
+        kwargs = {k: v for k, v in kwargs.items() if k in argspec.args}
+        # copy arguments to the host
+        args_hst, kwargs_hst = self._init_args_hst(args_dev, kwargs)
+        # run pre-run hooks
+        for hook in self.pre_run_hooks:
+            hook(*args_hst, **kwargs_hst)
+        # remaps core language functions to interpreted ones
+        patch_scope = _patch_lang(self.fn)
+        try:
+            # we need to copy arguments to the host for the interpreter
+            # implicitly convert tensor arguments to their base pointers
+            args = inspect.getcallargs(self.fn, *args_hst, **kwargs_hst)
+            args = {name: arg if name in self.constexprs else _implicit_cvt(arg) for name, arg in args.items()}
+            # iterate through grid
+            grid = self.grid(args) if callable(self.grid) else self.grid
+            # Accept any sequence (e.g. a list); the padding below needs a tuple
+            grid = tuple(grid)
+            assert len(grid) <= 3, "grid must have at most 3 dimensions"
+            grid = grid + (1, ) * (3 - len(grid))
+            interpreter_builder.set_grid_dim(*grid)
+            try:
+                for x in range(grid[0]):
+                    for y in range(grid[1]):
+                        for z in range(grid[2]):
+                            interpreter_builder.set_grid_idx(x, y, z)
+                            self.fn(**args)
+            except Exception as e:
+                if triton.knobs.compilation.front_end_debugging:
+                    raise
+                raise InterpreterError(repr(e)) from e
+        finally:
+            # Always undo the language patches, even when the kernel failed
+            patch_scope.restore()
+        # copy arguments back to propagate side-effects
+        self._restore_args_dev(args_dev, args_hst, kwargs, kwargs_hst)
+
+
+class ASTTransformer(ast.NodeTransformer):
+    """Wraps the right-hand side of every assignment in ``to_tensor``."""
+
+    def visit_Assign(self, node):
+        visited_targets = [self.visit(target) for target in node.targets]
+        if len(visited_targets) > 1:
+            raise ValueError("Multiple assignments are not supported")
+        # Rewrite ``x = value`` into
+        # ``x = interpreter_semantic.to_tensor(value, False)``
+        to_tensor = ast.Attribute(
+            value=ast.Name(id="interpreter_semantic", ctx=ast.Load()),
+            attr="to_tensor",
+            ctx=ast.Load(),
+        )
+        node.value = ast.Call(func=to_tensor, args=[node.value, ast.Constant(value=False)], keywords=[])
+        return node
+
+
+class FunctionRewriter:
+    """Recompiles a function from its source after applying ``ASTTransformer``."""
+    ast_transformer = ASTTransformer()
+
+    def __init__(self, fn, **kwargs):
+        self.fn = fn
+        # Used as the local namespace when the rewritten source is executed
+        self.kwargs = kwargs
+        self.filename: str = ""
+        # Absolute line number in the file
+        self.def_file_lineno: int = 0
+
+    def rewrite_ast(self):
+        """Return the rewritten function, or ``self.fn`` when no source is available."""
+        # If an exception is raised, it means the function does not have source code available,
+        # e.g., dynamically generated functions, we cannot rewrite it so just return the original function
+        try:
+            lines, _ = inspect.getsourcelines(self.fn)
+        except Exception:
+            return self.fn
+
+        # truncate lines before def
+        # @triton.autotune(...)
+        # ...
+        # @triton.jit
+        # ...
+        # def foo(...): <- this line is the function definition
+        self.filename, self.def_file_lineno = self._get_jit_fn_file_line()
+        self.def_lineno = self._find_def(lines)
+        src = self._prepare_source(lines)
+        transformed_ast = self._transform_ast(src)
+        return self._compile_and_exec(transformed_ast)
+
+    def _get_jit_fn_file_line(self):
+        """Return ``(filename, line number)`` as reported by ``get_jit_fn_file_line``."""
+        from .jit import get_jit_fn_file_line, JITFunction
+        return get_jit_fn_file_line(JITFunction(self.fn))
+
+    def _find_def(self, lines):
+        """Return the 1-based position of the function's ``def`` line, or 0 if none."""
+        # The definition is the *first* ``def`` after the decorators. A later
+        # match is a nested function inside the body; starting there would
+        # drop the header of the function being rewritten.
+        for lineno, line in enumerate(lines, start=1):
+            if line.strip().startswith("def "):
+                return lineno
+        return 0
+
+    def _prepare_source(self, lines):
+        """Drop the lines before the ``def`` line and dedent the remainder."""
+        lines = lines[self.def_lineno - 1:]
+        src = ''.join(lines)
+        return textwrap.dedent(src)
+
+    def _transform_ast(self, src):
+        """Parse ``src``, apply the transformer and shift line numbers to file positions."""
+        # src is like:
+        # 1: def foo(...):
+        # 2:  ...
+        parsed_ast = ast.parse(src)
+        transformed_ast = self.ast_transformer.visit(parsed_ast)
+        ast.fix_missing_locations(transformed_ast)
+        # Line 1 of src corresponds to def_file_lineno in the original file
+        inc_lineno = self.def_file_lineno - 1
+        ast.increment_lineno(transformed_ast, inc_lineno)
+        return transformed_ast
+
+    def _compile_and_exec(self, transformed_ast):
+        """Compile and execute the tree; return the function it defines."""
+        compiled_code = compile(transformed_ast, filename=self.filename, mode='exec')
+        local_namespace = {**self.kwargs}
+        fn_globals = self.fn.__globals__
+        # Expose this module's globals (e.g. interpreter_semantic) to the
+        # rewritten code without overriding names the function already has.
+        # Note that this mutates the function's own globals dict.
+        for key, value in globals().items():
+            if key not in fn_globals:
+                fn_globals[key] = value
+        exec(compiled_code, fn_globals, local_namespace)
+        return local_namespace[self.fn.__name__]
+
+
+class InterpretedFunction(KernelInterface[T]):
+    """Kernel wrapper that executes ``fn`` through the interpreter."""
+    # Rewritten functions are cached on the class, keyed by the original function
+    rewritten_fn: Dict[Callable, Callable] = {}
+
+    def __init__(self, fn, **kwargs) -> None:
+        self.fn = fn
+        self.kwargs = kwargs
+        self.rewriter = FunctionRewriter(fn, **kwargs)
+        self.pre_run_hooks = []
+        self.arg_names = [param.name for param in inspect.signature(fn).parameters.values()]
+
+    def run(self, *args, grid, warmup, **kwargs):
+        # Nothing is executed for a warmup request
+        if warmup:
+            return
+        executor = GridExecutor(self.rewrite(), self.arg_names, grid, self.pre_run_hooks)
+        return executor(*args, **kwargs)
+
+    def add_pre_run_hook(self, hook):
+        assert callable(hook)
+        self.pre_run_hooks.append(hook)
+
+    def rewrite(self):
+        """Return the rewritten version of ``self.fn``, creating it on first use."""
+        cache = self.rewritten_fn
+        if self.fn not in cache:
+            cache[self.fn] = self.rewriter.rewrite_ast()
+        return cache[self.fn]
+
+    @property
+    def __name__(self):
+        return self.fn.__name__
+
+    def __call__(self, *args, **kwargs):
+        # This is a device function call
+        _patch_lang(self.fn)
+        rewritten = self.rewrite()
+        try:
+            return rewritten(*args, **kwargs)
+        except Exception as e:
+            raise InterpreterError(repr(e)) from e
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/jit.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/jit.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f506818f9d9790961fbd519f7a6067ca1e9c303
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/runtime/jit.py
@@ -0,0 +1,1099 @@
+from __future__ import annotations, division
+import ast
+import copy
+import hashlib
+import inspect
+import itertools
+import threading
+import re
+import textwrap
+from collections import defaultdict
+from dataclasses import dataclass
+from functools import cached_property
+from typing import Callable, Generic, Iterable, Optional, TypeVar, overload, Dict, Any, Tuple
+
+from triton.backends import BaseBackend
+from types import ModuleType
+from .. import knobs
+from .driver import driver
+from . import _async_compile
+from .._utils import find_paths_if, get_iterable_path, type_canonicalisation_dict, is_namedtuple
+from .cache import get_cache_key
+from triton._C.libtriton import get_cache_invalidating_env_vars, native_specialize_impl
+
+TRITON_MODULE = "triton.language"
+GLUON_MODULE = "triton.experimental.gluon.language"
+
+T = TypeVar("T")
+
+# -----------------------------------------------------------------------------
+# Dependencies Finder
+# -----------------------------------------------------------------------------
+
+
+class DependenciesFinder(ast.NodeVisitor):
+    """
+    This AST visitor is used to find dependencies of a JITFunction. This can
+    be used to invalidate a JITFunction's hash when its source code -- or
+    that of its dependencies -- changes.
+
+    This visitor also keeps track of the global variables touched by the
+    JITFunction.  When we launch the kernel, we check that these have the same
+    values as they did when we ran this visitor.  If not, we raise an error (or
+    otherwise we could recompile).
+    """
+
+    def __init__(self, name, globals, nonlocals, src) -> None:
+        super().__init__()
+        self.name = name
+        self.hasher = hashlib.sha256(src.encode("utf-8"))
+
+        # This function's __globals__ dict.
+        self.globals = globals
+        self.nonlocals = nonlocals
+
+        # Python builtins that can be accessed from Triton kernels.
+        self.supported_python_builtins = {
+            'float',
+            'getattr',
+            'int',
+            'isinstance',
+            'len',
+            'list',
+            'max',
+            'min',
+            'print',
+            'range',
+        }
+        self.supported_modules = {
+            GLUON_MODULE,
+            TRITON_MODULE,
+            "copy",
+            "math",
+        }
+
+        # used_global_vals tells us which global variables are used by this
+        # function and all those it transitively calls, plus the values of those
+        # variables when each function was initially run.  (That is, if A calls
+        # C, and B calls C, then the values for C in used_global_vals will be
+        # from the first time C was run, either by A or B.)
+        #
+        # Each function may have a different __globals__ dict, so the global
+        # variable `foo` may actually have a different value in the different
+        # functions.  Thus this map is actually
+        #  (var_name, id(__globals__)) -> (var_value, __globals__).
+        self.used_global_vals: Dict[Tuple[str, int], Tuple[Any, Dict[str, Any]]] = {}
+
+        self.visiting_arg_default_value = False
+
+    @property
+    def ret(self):
+        return self.hasher.hexdigest()
+
+    def _is_triton_builtin(self, node, func):
+        if inspect.isbuiltin(node.func):
+            return True
+        module = getattr(func, "__module__", "")
+        return module.startswith(TRITON_MODULE)
+
+    def _update_hash(self, func):
+        assert isinstance(func, JITCallable)
+        # Merge our used_global_vals with those of the called function,
+        # after checking that all overlapping values are consistent.
+        for k in self.used_global_vals.keys() & func.used_global_vals.keys():
+            var_name, _ = k
+            v1, _ = self.used_global_vals[k]
+            v2, _ = func.used_global_vals[k]
+            if v1 != v2:
+                raise RuntimeError(
+                    f"Global variable {var_name} has value {v1} when compiling {self.name}, but inner kernel {func.__name__} has conflicting value {v2} from when it was first compiled.  This is not allowed."
+                )
+        self.used_global_vals.update(func.used_global_vals)
+        # update hash
+        func_key = func.cache_key
+        func_key += str(getattr(func, "noinline", False))
+        self.hasher.update(func_key.encode("utf-8"))
+
+    def record_reference(self, val, var_dict=None, name=None):
+        from ..language.core import constexpr
+        # Only keep track of "interesting" global variables, that non-evil users
+        # might change.  Don't consider functions, modules, builtins, etc.  This
+        # helps keep the list of vars we have to check small.
+        if val is None or type(val) is ModuleType:
+            return
+
+        if getattr(val, "__triton_aggregate__", False):
+            for attr in val.hash_attrs:
+                self.record_reference(attr)
+            return
+
+        if getattr(val, "__triton_builtin__", False):
+            return
+
+        # Stubs that aren't real functions
+        if getattr(val, "__module__", "") == "triton.language.extra.libdevice":
+            return
+
+        if isinstance(val, JITCallable):
+            self._update_hash(val)
+            return
+
+        if callable(val) and not isinstance(val, type) and not isinstance(val, constexpr):
+            raise RuntimeError(f"Unsupported function referenced: {val}")
+
+        # Python default arguments are resolved only once, when the
+        # function is defined.  So if you do `foo(a=A)` and the value of
+        # A changes, foo will still use the old value of A.
+        # It would be pretty evil if someone did `import x` and then
+        # `x = blah`.
+        if self.visiting_arg_default_value:
+            return
+
+        if var_dict is not None:
+            self.used_global_vals[(name, id(var_dict))] = (copy.deepcopy(val), var_dict)
+        return
+
+    def visit_Name(self, node):
+        if type(node.ctx) is ast.Store:
+            return node.id
+
+        if node.id in self.local_names:
+            # The global name is hidden by the local name.
+            return None
+
+        def name_lookup(name):
+            val = self.globals.get(name, None)
+            if val is not None:
+                return val, self.globals
+            val = self.nonlocals.get(name, None)
+            if val is not None:
+                return val, self.nonlocals
+            return None, None
+
+        val, var_dict = name_lookup(node.id)
+        if node.id in self.supported_python_builtins:
+            return val
+
+        self.record_reference(val, var_dict, node.id)
+        return val
+
+    def visit_Tuple(self, node):
+        # We need to explicitly return the tuple values so that visit_Assign can
+        # access them in the case of `a, b = ...`.
+        return [self.visit(elt) for elt in node.elts]
+
+    def visit_Attribute(self, node):
+        lhs = self.visit(node.value)
+        while isinstance(lhs, ast.Attribute):
+            lhs = self.visit(lhs.value)
+        lhs_name = getattr(lhs, "__name__", "")
+        if lhs is None or lhs_name in self.supported_modules:
+            return None
+        ret = getattr(lhs, node.attr)
+        self.record_reference(ret)
+        return ret
+
+    def visit_FunctionDef(self, node):
+        # Save the local name, which may hide the global name.
+        self.local_names = {arg.arg for arg in node.args.args}
+        self.generic_visit(node)
+
+    def visit_arguments(self, node):
+        # The purpose of this function is to visit everything in `arguments`
+        # just like `generic_visit`, except when we're visiting default values
+        # (i.e. the `foo` part of `def fn(x = foo)`), we set
+        # self.visiting_arg_default_value = True.  This allows visit_Name to be
+        # aware that we're inside function default values, which have special
+        # semantics.
+
+        # According to the AST docs, the arguments node has the following structure.
+        #
+        # arguments = (arg* posonlyargs, arg* args, arg? vararg, arg* kwonlyargs,
+        #              expr* kw_defaults, arg? kwarg, expr* defaults)
+        def visit_defaults(defaults):
+            try:
+                assert not self.visiting_arg_default_value
+                self.visiting_arg_default_value = True
+                for expr in defaults:
+                    if expr is not None:
+                        self.visit(expr)
+            finally:
+                self.visiting_arg_default_value = False
+
+        for arg in itertools.chain(node.posonlyargs, node.args, [node.vararg] if node.vararg else [], node.kwonlyargs):
+            self.visit(arg)
+
+        visit_defaults(node.kw_defaults)
+
+        if node.kwarg is not None:
+            self.visit(node.kwarg)
+
+        visit_defaults(node.defaults)
+
+    def visitAssnTarget(self, node):
+        # Target is either a single string, or a list of strings (if the assn
+        # target is a tuple).
+        target = self.visit(node)
+        if isinstance(target, list):
+            self.local_names |= set(target)
+        else:
+            self.local_names.add(target)
+
+    def visit_Assign(self, node):
+        targets = node.targets
+        if len(targets) != 1:
+            # Chained assignment (`a = b = ...`) yields several targets and is
+            # rejected here.  Tuple unpacking (`a, b = ...`) is different: there
+            # node.targets holds a single Tuple node, which is handled below.
+            raise TypeError("Simultaneous multiple assignment is not supported.")
+
+        self.visitAssnTarget(targets[0])
+
+        # generic_visit walks the target a second time; that is harmless.
+        self.generic_visit(node)
+
+    def visit_AnnAssign(self, node):
+        self.visitAssnTarget(node.target)  # record the annotated name as a local
+
+        # generic_visit walks every child, so the target is seen twice; that's OK.
+        self.generic_visit(node)
+
+    def visit_For(self, node):
+        self.visitAssnTarget(node.target)  # the loop variable(s) become locals
+
+        # generic_visit walks every child, so the target is seen twice; that's fine.
+        self.generic_visit(node)
+
+
+# -----------------------------------------------------------------------------
+# JITFunction
+# -----------------------------------------------------------------------------
+
+
+def _normalize_ty(ty) -> str:
+    import triton.language.core as core  # local import; NOTE(review): presumably avoids an import cycle
+    if isinstance(ty, str):
+        ty = ty.strip()
+        if ty.startswith("const "):
+            ty = ty.removeprefix("const")  # the recursive call below strips the leftover space
+            ty = _normalize_ty(ty)
+            assert ty.startswith("*")  # `const` is only accepted on pointer types
+            return "*k" + ty[1:]  # "*k<elem>" is the spelling of a const pointer
+        if ty.endswith("*"):
+            return "*" + _normalize_ty(ty[:-1])  # suffix form "T*" is rewritten to prefix form "*T"
+        if ty.startswith("*"):
+            return "*" + _normalize_ty(ty[1:])  # prefix pointer: normalize the element type only
+        if ty.startswith("tl."):
+            return _normalize_ty(ty.removeprefix("tl."))
+    elif isinstance(ty, core.pointer_type):
+        return f"*{_normalize_ty(ty.element_ty)}"
+    elif isinstance(ty, core.dtype):
+        ty = ty.name
+    elif isinstance(ty, type):
+        ty = ty.__name__
+    else:
+        ty = str(ty)
+    return type_canonicalisation_dict.get(ty.replace("_t", ""), ty)  # unknown names pass through unchanged
+
+
+class KernelParam:
+    """Represents a parameter (name plus metadata) to a @jit'ed function."""
+
+    def __init__(self, num: int, param: inspect.Parameter, do_not_specialize: bool,
+                 do_not_specialize_on_alignment: bool):
+        self.num = num  # positional index of the parameter in the signature
+        self._param = param
+        self.do_not_specialize = do_not_specialize
+        self.do_not_specialize_on_alignment = do_not_specialize_on_alignment
+
+    @cached_property
+    def name(self):
+        return self._param.name
+
+    @cached_property
+    def annotation(self) -> str:
+        if not self._param.annotation or self._param.annotation is inspect.Parameter.empty:
+            return ""  # missing (or falsy) annotation
+        return _normalize_ty(self._param.annotation)
+
+    @cached_property
+    def annotation_type(self) -> str:
+        a = self.annotation
+        if a.startswith("*k"):  # const pointer: look at the element type
+            a = a[2:]
+        elif a.startswith("*"):  # plain pointer: look at the element type
+            a = a[1:]
+        if a in type_canonicalisation_dict.values():
+            return self.annotation  # the full annotation, pointer prefix included
+        return ""
+
+    @cached_property
+    def is_constexpr(self):
+        return "constexpr" in self.annotation
+
+    @cached_property
+    def is_const(self):
+        if self.is_constexpr:  # "constexpr" contains "const" but is not a const qualifier
+            return False
+        return "const" in self.annotation or self.annotation.startswith("*k")
+
+    @property
+    def default(self):
+        return self._param.default
+
+    @property
+    def has_default(self):  # identity check: defaults may overload `!=` (e.g. arrays)
+        return self._param.default is not inspect.Parameter.empty
+
+
+def mangle_type(arg, specialize=False):
+    # Type-only mangling: the argument is passed as non-const (is_const=False)
+    # with align=True, and only element 0 of the result is returned.
+    return native_specialize_impl(BaseBackend, arg, False, specialize, True)[0]
+
+
+class KernelInterface(Generic[T]):
+    run: T
+
+    def warmup(self, *args, grid, **kwargs):
+        mocked = [MockTensor.wrap_dtype(arg) for arg in args]  # torch dtypes become MockTensors
+        return self.run(*mocked, grid=grid, warmup=True, **kwargs)
+
+    def run(self, *args, grid, warmup, **kwargs):
+        raise NotImplementedError("run not implemented")
+
+    def __getitem__(self, grid) -> T:
+        """Return a launcher that remembers `grid`, so that fn[grid](*args, **kwargs) runs the kernel."""
+
+        def launcher(*args, **kwargs):
+            return self.run(*args, grid=grid, warmup=False, **kwargs)
+
+        return launcher
+
+
+def serialize_specialization_data(name, signature, constants, attrs, options, key):
+    import json
+    def encode(value):
+        # dtypes are stored as their string form and constexprs as a tagged dict; both are detected by class name.
+        kind = value.__class__.__name__
+        if kind == "dtype":
+            return str(value)
+        return {"constexpr": value.value} if kind == "constexpr" else value
+
+    encoded = {path: encode(value) for path, value in constants.items()}
+    payload = {
+        'name': name, 'signature': signature, 'constant_keys': [list(path) for path in encoded], 'constant_vals':
+        list(encoded.values()), 'attrs_keys': [list(path) for path in attrs], 'attrs_vals': list(attrs.values()),
+        'options': options.__dict__, 'key': key}
+    return json.dumps(payload)
+
+
+def create_function_from_signature(sig, kparams, backend):
+    """
+    Equivalent to sig.bind followed by apply_defaults. This generates a
+    native Python function (using exec) which can be memoized on a per-kernel
+    basis to avoid having to run these expensive functions -- which constitute
+    much of the kernel launch overhead -- every time we run the kernel.
+    """
+    assert len(sig.parameters) == len(kparams)
+    # Create the function argument list and the dict entries for the return statement
+    specialization = []
+    # Build one source-code fragment per parameter; they are spliced into dynamic_func below.
+    for name, kp in zip(sig.parameters.keys(), kparams):
+        if kp.is_constexpr:
+            specialization.append(f'("constexpr", {name})')
+        else:
+            is_const = 'True' if kp.is_const else 'False'  # source text, not a bool
+            specialize = 'False' if kp.do_not_specialize else 'True'  # source text, not a bool
+            align = 'False' if kp.do_not_specialize_on_alignment else 'True'  # source text, not a bool
+            ret = f"specialize_impl(backend, {name}, {is_const}, {specialize}, {align})"
+            if kp.annotation_type:
+                if isinstance(kp.annotation_type, str):
+                    if kp.annotation_type == "u1" or kp.annotation_type[:2] in ["fp", "bf"]:
+                        # we do not specialize non-constexpr floats and bools:
+                        specialize = False  # `ret` is already built; this only steers the branch below
+                if specialize:  # NOTE(review): the strings 'True'/'False' are both truthy here; only the bool False is falsy
+                    specialization.append(f'("{kp.annotation_type}",) + {ret}[1:]')
+                else:
+                    # skip runtime specialization:
+                    specialization.append(f'("{kp.annotation_type}", None)')
+            else:
+                specialization.append(f"{ret}")
+
+    # compute argument string for a given parameter
+    arg = lambda x: x[0] if x[1].default is inspect.Parameter.empty else f"{x[0]}=default_{x[0]}"
+    func_body = f"""
+def dynamic_func({", ".join(list(map(arg, sig.parameters.items())) + ["**options"])}):
+    params = {{{', '.join([f"'{name}': {name}" for name in sig.parameters.keys()])}}}
+    specialization = [{','.join(specialization)}]
+    return params, specialization, options
+"""
+
+    # Prepare defaults to be inserted into function namespace
+    func_namespace = {
+        f"default_{name}": param.default
+        for name, param in sig.parameters.items()
+        if param.default is not inspect.Parameter.empty
+    }
+
+    specialize_impl = native_specialize_impl
+    func_namespace["specialize_impl"] = specialize_impl  # name referenced by the generated fragments
+    func_namespace["backend"] = backend  # name referenced by the generated fragments
+    func_namespace["JITCallable"] = JITCallable
+
+    # Execute the function string in func_namespace to create the function
+    exec(func_body, func_namespace)
+
+    # Extract the newly created function from the namespace
+    return func_namespace['dynamic_func']
+
+
+def get_full_name(fn):
+    return "{}.{}".format(fn.__module__, fn.__qualname__)  # "<module>.<qualified name>"
+
+
+class JITCallable:
+    """Wraps a Python function together with its source text and a lazily computed source hash."""
+    def __init__(self, fn):
+        self.fn = fn
+        self.signature = inspect.signature(fn)
+        try:
+            self.raw_src, self.starting_line_number = inspect.getsourcelines(fn)
+        except OSError as e:
+            raise ValueError("@jit functions should be defined in a Python file") from e
+        self._fn_name = get_full_name(fn)
+        self._hash_lock = threading.RLock()  # re-entrant: cache_key may be re-entered while hashing
+
+        # function source code (without decorators)
+        src = textwrap.dedent("".join(self.raw_src))
+        src = src[re.search(r"^def\s+\w+\s*\(", src, re.MULTILINE).start():]
+        self._src = src
+        self.hash = None  # computed on first access to cache_key
+
+        # Map of global variables used by the function and any functions it
+        # transitively calls, plus their values.  The values are collected when
+        # the function is first compiled.  Then every time we run the function,
+        # we check that the values of the globals match what's expected,
+        # otherwise we raise an error.
+        #
+        # Different functions can have different __globals__ maps, so the map
+        # key is actually (var name, id(__globals__)), and the map value is
+        # (value, __globals__).
+        self.used_global_vals: Dict[Tuple[str, int], Tuple[Any, Dict[str, Any]]] = {}
+
+        # reuse docs of wrapped function
+        self.__doc__ = fn.__doc__
+        self.__name__ = fn.__name__
+        self.__qualname__ = fn.__qualname__
+        self.__globals__ = fn.__globals__
+        self.__module__ = fn.__module__
+
+    def get_capture_scope(self):
+        return self.__globals__ | inspect.getclosurevars(self.fn).nonlocals  # nonlocals win on name clashes
+
+    @property
+    def cache_key(self) -> str:
+        # TODO : hash should be attribute of `self`
+        with self._hash_lock:
+            if self.hash is not None:
+                return self.hash
+            # Set a placeholder hash to break recursion in case the function
+            # transitively calls itself. The full hash is set after.
+            self.hash = f"recursion:{self._fn_name}"
+            nonlocals = inspect.getclosurevars(self.fn).nonlocals
+            dependencies_finder = DependenciesFinder(name=self._fn_name, globals=self.__globals__, nonlocals=nonlocals,
+                                                     src=self.src)
+            dependencies_finder.visit(self.parse())
+            self.hash = dependencies_finder.ret + str(self.starting_line_number)
+            self.used_global_vals = dict(sorted(dependencies_finder.used_global_vals.items()))
+
+            from triton.language.core import constexpr
+            self.hash += str([(name, val)
+                              for (name, _), (val, _) in self.used_global_vals.items()
+                              if isinstance(val, constexpr)])
+            self.hash = hashlib.sha256(self.hash.encode("utf-8")).hexdigest()
+        return self.hash
+
+    def __hash__(self):
+        return hash(self.cache_key)
+
+    # we do not parse `src` in the constructor because
+    # the user might want to monkey-patch self.src dynamically.
+    # Our unit tests do this, for example.
+    def parse(self):
+        tree = ast.parse(self._src)
+        assert isinstance(tree, ast.Module)
+        assert len(tree.body) == 1
+        assert isinstance(tree.body[0], ast.FunctionDef)
+        return tree
+
+    @property
+    def type(self):
+        from triton.language.core import constexpr_type
+        return constexpr_type(self)
+
+    def _unsafe_update_src(self, new_src):
+        """
+        The only method allowed to modify src.
+        Writes the private `_src` field directly, bypassing the read-only `src` property.
+
+        Note that it is the caller's responsibility to make sure any triton functions that call this function have the `.hash` value reset to None.
+        """
+        self.hash = None
+        self._src = new_src
+
+    def _set_src(self, value):  # a property setter is always called as fset(self, value)
+        raise AttributeError("Cannot set attribute 'src' directly. "
+                             "Use '_unsafe_update_src()' and manually clear `.hash` of all callers "
+                             "instead.")
+
+    def _get_src(self):
+        return self._src
+
+    src = property(fget=_get_src, fset=_set_src)
+
+
+@dataclass
+class JitFunctionInfo:
+    module: ModuleType  # NOTE(review): JITFunction._call_hook passes fn.__module__ (a module *name* string) here
+    name: str  # _call_hook passes fn.__qualname__
+    jit_function: JITFunction  # the JITFunction the hook is being called for
+
+
+def compute_cache_key(kernel_key_cache, specialization, options):
+    options_repr = str(options)
+    memo_key = (tuple(specialization), options_repr)
+    cached = kernel_key_cache.get(memo_key, None)
+    if cached is not None:
+        return cached
+
+    # Swap each JITCallable for its cache_key so that the result changes whenever the callable's src is updated.
+    def substitute(obj):
+        if isinstance(obj, list):
+            return [substitute(item) for item in obj]
+        if is_namedtuple(obj):
+            return obj.__class__(*[substitute(item) for item in obj])
+        if isinstance(obj, tuple):
+            return tuple(substitute(item) for item in obj)
+        if isinstance(obj, JITCallable):
+            return obj.cache_key
+        return obj
+
+    cache_key = str(substitute(specialization)) + options_repr
+    kernel_key_cache[memo_key] = cache_key
+    return cache_key
+
+
+def convert_to_tuple_if_list(item):
+    """Return `item` with every list, at any nesting depth, replaced by a tuple.
+
+    Non-list values (including tuples and their contents) are returned as-is.
+    The input is left untouched: the previous implementation overwrote the
+    elements of each list in place, silently mutating the caller's data.
+    """
+    if not isinstance(item, list):
+        return item
+    return tuple(convert_to_tuple_if_list(nested_value) for nested_value in item)
+
+
+class JITFunction(JITCallable, KernelInterface[T]):
+
+    def is_gluon(self):
+        return False
+
+    def _call_hook(
+        self,
+        hook,
+        key,
+        signature,
+        device,
+        constants,
+        options,
+        configs,
+        is_warmup,
+    ) -> bool | None:
+        if not hook:  # no hook registered: nothing to call
+            return None
+
+        name = self.fn.__qualname__
+        module = self.fn.__module__  # the module's name (a string), not the module object
+        arg_reprs = ", ".join([f"{param.name}: {ty}" for param, ty in zip(self.params, key[1])])
+        repr = f"{name}[num_warps={options.num_warps}, num_ctas={options.num_ctas}, num_stages={options.num_stages}, enable_fp_fusion={options.enable_fp_fusion}, launch_cooperative_grid={options.launch_cooperative_grid}]({arg_reprs})"
+        full_name = get_full_name(self.fn)
+
+        specialization_data = serialize_specialization_data(full_name, signature, constants, configs[0], options, key)
+
+        kwargs = {
+            'signature': signature,
+            'device': device,
+            'constants': constants,
+            'num_warps': options.num_warps,
+            'num_ctas': options.num_ctas,
+            'num_stages': options.num_stages,
+            'enable_fp_fusion': options.enable_fp_fusion,
+            'launch_cooperative_grid': options.launch_cooperative_grid,
+            'extern_libs': options.extern_libs,
+            'configs': configs,
+            'specialization_data': specialization_data,
+            'is_warmup': is_warmup,
+        }
+
+        return hook(
+            key=key,
+            repr=repr,
+            fn=JitFunctionInfo(module, name, self),
+            compile={"key": key, **kwargs},
+            is_manual_warmup=is_warmup,
+            already_compiled=False,
+        )
+
+    def add_pre_run_hook(self, hook):
+        '''
+        Add a hook that will be executed prior to the execution of run
+        function with args and kwargs passed into the kernel
+        '''
+        assert callable(hook)
+        self.pre_run_hooks.append(hook)
+
+    def create_binder(self):
+        """
+        Precompute as much as possible.
+        """
+        from ..compiler import CompiledKernel, compile, ASTSource, make_backend
+        target = driver.active.get_current_target()
+        backend = make_backend(target)
+        self.CompiledKernel = CompiledKernel
+        self.compile = compile
+        self.ASTSource = ASTSource
+        binder = create_function_from_signature(self.signature, self.params, backend)
+        return {}, {}, target, backend, binder  # (kernel_cache, kernel_key_cache, target, backend, binder)
+
+    def _pack_args(self, backend, kwargs, bound_args, specialization, options):
+        # options (the incoming `options` argument is discarded and rebuilt from kwargs)
+        options = backend.parse_options(kwargs)
+        # signature
+        sigkeys = [x.name for x in self.params]
+        sigvals = [x[0] for x in specialization]
+        signature = {k: v for (k, v) in zip(sigkeys, sigvals)}
+        # check arguments
+        assert "device_type" not in kwargs, "device_type option is deprecated; current target will be used"
+        assert "device" not in kwargs, "device option is deprecated; current device will be used"
+        assert "stream" not in kwargs, "stream option is deprecated; current stream will be used"
+        for k in kwargs:
+            if k not in options.__dict__ and k not in sigkeys:
+                raise KeyError("Keyword argument %s was specified but unrecognised" % k)
+        # constexprs
+        constexprs = find_paths_if(sigvals, lambda _, val: val == "constexpr")
+        constexprs = {path: get_iterable_path(list(bound_args.values()), path) for path in constexprs}
+        # attributes
+        attrvals = [x[1] for x in specialization]
+        attrs = find_paths_if(attrvals, lambda _, x: isinstance(x, str))
+        attrs = {k: backend.parse_attr(get_iterable_path(attrvals, k)) for k in attrs}
+
+        return options, signature, constexprs, attrs
+
+    def run(self, *args, grid, warmup, **kwargs):
+        kwargs["debug"] = kwargs.get("debug", self.debug) or knobs.runtime.debug
+        kwargs["instrumentation_mode"] = knobs.compilation.instrumentation_mode
+
+        # parse options
+        device = driver.active.get_current_device()
+        stream = driver.active.get_current_stream(device)
+
+        # Execute pre run hooks with args and kwargs
+        for hook in self.pre_run_hooks:
+            hook(*args, **kwargs)
+
+        kernel_cache, kernel_key_cache, target, backend, binder = self.device_caches[device]
+        # specialization is list[tuple[str, Any]], where first element of tuple is
+        # the type and the second parameter is the 'specialization' value.
+        bound_args, specialization, options = binder(*args, **kwargs)
+
+        key = compute_cache_key(kernel_key_cache, specialization, options)
+        kernel = kernel_cache.get(key, None)
+
+        # Kernel is not cached; we have to compile.
+        if kernel is None:
+            options, signature, constexprs, attrs = self._pack_args(backend, kwargs, bound_args, specialization,
+                                                                    options)
+
+            kernel = self._do_compile(key, signature, device, constexprs, options, attrs, warmup)
+            if kernel is None:  # _do_compile returns None when the jit_cache_hook result is truthy
+                return None
+
+        # Check that used global values have not changed.
+        not_present = object()  # sentinel for a global that has since been deleted
+        for (name, _), (val, globals_dict) in self.used_global_vals.items():
+            if (newVal := globals_dict.get(name, not_present)) != val:
+                raise RuntimeError(
+                    f"Global variable {name} has changed since we compiled this kernel, from {val} to {newVal}")
+
+        if not warmup:
+            # canonicalize grid
+            assert grid is not None
+            if callable(grid):
+                grid = grid(bound_args)
+            grid_size = len(grid)
+            grid_0 = grid[0]
+            grid_1 = grid[1] if grid_size > 1 else 1
+            grid_2 = grid[2] if grid_size > 2 else 1
+            if hasattr(kernel, "result"):  # NOTE(review): presumably a future from async compilation -- confirm
+                kernel = kernel.result()
+            # launch kernel
+            launch_metadata = kernel.launch_metadata(grid, stream, *bound_args.values())
+            kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,
+                       knobs.runtime.launch_enter_hook, knobs.runtime.launch_exit_hook, *bound_args.values())
+        return kernel
+
+    def repr(self, _):
+        return self._fn_name if self._repr is None else self._repr(_)
+
+    def __init__(self, fn, version=None, do_not_specialize=None, do_not_specialize_on_alignment=None, debug=None,
+                 noinline=None, repr=None, launch_metadata=None):
+        do_not_specialize = do_not_specialize if do_not_specialize else []
+        do_not_specialize_on_alignment = do_not_specialize_on_alignment if do_not_specialize_on_alignment else []
+
+        super().__init__(fn)
+        self.module = fn.__module__
+        self.version = version
+        self.do_not_specialize = do_not_specialize
+        self.do_not_specialize_on_alignment = do_not_specialize_on_alignment
+        self._repr = repr
+        self.launch_metadata = launch_metadata
+
+        self.params = []
+        for i, param in enumerate(self.signature.parameters.values()):
+            dns = i in do_not_specialize or param.name in do_not_specialize  # matched by index or by name
+            dns_oa = i in do_not_specialize_on_alignment or param.name in do_not_specialize_on_alignment
+            self.params.append(KernelParam(i, param, dns, dns_oa))
+
+        # cache of just-in-time compiled kernels
+        self.device_caches = defaultdict(self.create_binder)  # create_binder runs on first use of each device key
+
+        # JITFunction can be instantiated as kernel
+        # when called with a grid using __getitem__
+        self.kernel = None
+        self.debug = debug
+        self.noinline = noinline
+
+        # TODO(jlebar): Remove uses of these fields outside this file, then
+        # remove the fields here.
+        self.arg_names = [p.name for p in self.params]
+        self.constexprs = [p.num for p in self.params if p.is_constexpr]
+
+        # Hooks that will be called prior to executing "run"
+        self.pre_run_hooks = []
+
+    def preload(self, specialization_data):
+        import json
+        import triton.language as tl
+        device = driver.active.get_current_device()
+        deserialized_obj = json.loads(specialization_data)
+        if deserialized_obj['name'] != self._fn_name:
+            raise RuntimeError(
+                f"Specialization data is for {deserialized_obj['name']} but trying to preload for {self._fn_name}")
+        constant_keys = map(tuple, deserialized_obj['constant_keys'])
+        constant_vals = deserialized_obj['constant_vals']
+        constexprs = {
+            key:
+            tl.dtype(value) if tl.dtype.is_dtype(value) else
+            tl.constexpr(value['constexpr']) if isinstance(value, dict) and 'constexpr' in value else value
+            for key, value in zip(constant_keys, constant_vals)
+        }
+        attrs_keys = map(tuple, deserialized_obj['attrs_keys'])
+        attrs_vals = deserialized_obj['attrs_vals']
+        attrs = dict(zip(attrs_keys, attrs_vals))
+        # JSON serializes tuples as lists, so they need to be converted back;
+        # This can be done unconditionally, since lists are not accepted in Triton kernel signatures.
+        signature = {key: convert_to_tuple_if_list(value) for key, value in deserialized_obj['signature'].items()}
+        options = {
+            key: tuple(value) if isinstance(value, list) else value
+            for key, value in deserialized_obj['options'].items()
+        }
+        key = deserialized_obj['key']
+        _, _, _, backend, _ = self.device_caches[device]
+        options = backend.parse_options(options)
+        return self._do_compile(
+            key,
+            signature,
+            device,
+            constexprs,
+            options,
+            attrs,
+            warmup=True,
+        )
+
+    def _do_compile(self, key, signature, device, constexprs, options, attrs, warmup):
+        kernel_cache, _, target, backend, _ = self.device_caches[device]
+
+        if self._call_hook(knobs.runtime.jit_cache_hook, key, signature, device, constexprs, options, [attrs], warmup):
+            return None  # a truthy result from jit_cache_hook skips compilation
+        src = self.ASTSource(self, signature, constexprs, attrs)
+
+        async_mode = _async_compile.active_mode.get()
+        if async_mode is not None:
+
+            env_vars = get_cache_invalidating_env_vars()
+            cache_key = get_cache_key(src, backend, options, env_vars)
+
+            def async_compile():
+                return self.compile(src, target=target, options=options.__dict__, _env_vars=env_vars)
+
+            def finalize_compile(kernel):
+                kernel_cache[key] = kernel
+                self._call_hook(knobs.runtime.jit_post_compile_hook, key, signature, device, constexprs, options,
+                                [attrs], warmup)
+
+            kernel = async_mode.submit(cache_key, async_compile, finalize_compile)
+        else:
+            kernel = self.compile(src, target=target, options=options.__dict__)
+            kernel_cache[key] = kernel
+            self._call_hook(knobs.runtime.jit_post_compile_hook, key, signature, device, constexprs, options, [attrs],
+                            warmup)
+        return kernel
+
+    def __call__(self, *args, **kwargs):
+        raise RuntimeError("Cannot call @triton.jit'd outside of the scope of a kernel")
+
+    def __repr__(self):
+        return f"JITFunction({self.module}:{self.fn.__qualname__})"
+
+
+# -----------------------------------------------------------------------------
+# `jit` decorator
+# -----------------------------------------------------------------------------
+
+
+@overload
+def jit(fn: T) -> JITFunction[T]:
+    ...
+
+
+@overload
+def jit(
+    *,
+    version=None,
+    repr: Optional[Callable] = None,
+    launch_metadata: Optional[Callable] = None,
+    do_not_specialize: Optional[Iterable[int | str]] = None,
+    do_not_specialize_on_alignment: Optional[Iterable[int | str]] = None,
+    debug: Optional[bool] = None,
+    noinline: Optional[bool] = None,
+) -> Callable[[T], JITFunction[T]]:
+    ...
+
+
+def jit(
+    fn: Optional[T] = None,
+    *,
+    version=None,
+    repr: Optional[Callable] = None,
+    launch_metadata: Optional[Callable] = None,
+    do_not_specialize: Optional[Iterable[int | str]] = None,
+    do_not_specialize_on_alignment: Optional[Iterable[int | str]] = None,
+    debug: Optional[bool] = None,
+    noinline: Optional[bool] = None,
+) -> KernelInterface[T]:
+    """
+    Decorator for JIT-compiling a function using the Triton compiler.
+
+    :note: When a jit'd function is called, arguments are
+        implicitly converted to pointers if they have a :code:`.data_ptr()` method
+        and a `.dtype` attribute.
+
+    :note: This function will be compiled and run on the GPU. It will only have access to:
+
+           * python primitives,
+           * builtins within the triton package,
+           * arguments to this function,
+           * other jit'd functions
+
+    :param fn: the function to be jit-compiled
+    :type fn: Callable
+    """
+
+    def decorator(fn: T) -> JITFunction[T]:
+        assert callable(fn)
+        # Both implementations take the same options, so collect them once.
+        jit_options = dict(
+            version=version,
+            do_not_specialize=do_not_specialize,
+            do_not_specialize_on_alignment=do_not_specialize_on_alignment,
+            debug=debug,
+            noinline=noinline,
+            repr=repr,
+            launch_metadata=launch_metadata,
+        )
+        if not knobs.runtime.interpret:
+            return JITFunction(fn, **jit_options)
+
+        # Interpreter mode: imported lazily, only when it is actually requested.
+        from .interpreter import InterpretedFunction
+        return InterpretedFunction(fn, **jit_options)
+
+    # Bare `@jit` receives the function directly; `@jit(...)` needs the decorator back.
+    if fn is None:
+        return decorator
+
+    return decorator(fn)
+
+
+# -----------------------------------------------------------------------------
+# Utilities for mocking tensors
+# -----------------------------------------------------------------------------
+
+
+class MockTensor:
+    """
+    Can be used in place of real tensors when calling:
+        kernel.warmup(MockTensor(torch.float32), ...)
+    """
+
+    @staticmethod
+    def wrap_dtype(arg):
+        if arg.__class__.__name__ == "dtype" and arg.__module__ == "torch":  # duck-typed: no torch import needed
+            return MockTensor(arg)
+        return arg  # anything that is not a torch dtype passes through unchanged
+
+    def __init__(self, dtype, shape=None):
+        if shape is None:
+            shape = [1]  # default: a single-element, one-dimensional tensor
+        self.dtype = dtype
+        self.shape = shape
+
+    def stride(self):
+        strides = [1]  # contiguous row-major layout: the innermost stride is 1
+        for size in reversed(self.shape[1:]):  # innermost size first, so each stride is the product of all inner sizes
+            strides.append(strides[-1] * size)
+        return tuple(reversed(strides))
+
+    @staticmethod
+    def data_ptr():
+        return 0  # optimistically assumes multiple of 16
+
+    @staticmethod
+    def ptr_range():
+        return 0  # optimistically assumes 32 bit pointer range
+
+
+class TensorWrapper:
+    """Pairs a tensor-like `base` with a different `dtype`; all other queries are delegated to `base`."""
+    def __init__(self, base, dtype):
+        self.dtype = dtype  # the dtype this wrapper reports instead of base.dtype
+        self.base = base
+        self.data = base.data
+        self.device = base.device
+        self.shape = self.base.shape
+
+    def data_ptr(self):
+        return self.base.data_ptr()
+
+    def stride(self, *args):
+        return self.base.stride(*args)
+
+    def __str__(self) -> str:
+        return f"TensorWrapper[{self.dtype}]({self.base})"
+
+    def element_size(self):
+        return self.base.element_size()  # NOTE(review): size of base's dtype, not of self.dtype -- confirm intended
+
+    def cpu(self):
+        return TensorWrapper(self.base.cpu(), self.dtype)
+
+    def copy_(self, other):
+        self.base.copy_(other.base)  # `other` must expose `.base` (e.g. another TensorWrapper); returns None
+
+    def clone(self):
+        return TensorWrapper(self.base.clone(), self.dtype)
+
+    def to(self, device):
+        return TensorWrapper(self.base.to(device), self.dtype)
+
+    def new_empty(self, sizes):
+        return TensorWrapper(self.base.new_empty(sizes), self.dtype)
+
+
+def reinterpret(tensor, dtype):
+    if not isinstance(tensor, TensorWrapper):
+        if not hasattr(tensor, "data_ptr"):
+            raise TypeError(f"Cannot reinterpret a {type(tensor)}.")
+        # An unwrapped tensor gets a fresh wrapper.
+        return TensorWrapper(tensor, dtype)
+
+    wrapped = tensor.base
+    if dtype == wrapped.dtype:
+        # Asking for the base's own dtype undoes the wrapping: hand back the base.
+        return wrapped
+    # Otherwise wrap the same base again under the new dtype.
+    return TensorWrapper(wrapped, dtype)
+
+
+def get_jit_fn_file_line(fn):
+    # Unwrap decorator objects (each keeps the wrapped object in `.fn`) until
+    # the underlying JITCallable is reached.
+    jit_fn = fn
+    while not isinstance(jit_fn, JITCallable):
+        jit_fn = jit_fn.fn
+    source_file = jit_fn.fn.__code__.co_filename
+
+    # Skip ahead from the first source line to the `def` line:
+    # @triton.autotune(...) <- foo.__code__.co_firstlineno
+    # @triton.heuristics(...)
+    # @triton.jit
+    # def foo(...): <- this line is the first line
+    def_offset = next(
+        (idx for idx, line in enumerate(jit_fn.raw_src) if line.strip().startswith("def ")), 0)
+    return source_file, jit_fn.starting_line_number + def_offset
+
+
+class BoundConstexprFunction(JITCallable):
+    """A ConstexprFunction bound to an instance: calling it passes that instance as the first argument."""
+    def __init__(self, instance, fn):
+        self.__self__ = instance  # JITCallable.__init__ is deliberately not called; only these two fields are set
+        self.__func__ = fn
+
+    @property
+    def cache_key(self):
+        return self.__func__.cache_key  # shares the cache key of the underlying function
+
+    def __call__(self, *args, **kwargs):
+        return self.__func__(self.__self__, *args, **kwargs)
+
+
+class ConstexprFunction(JITCallable):
+    """A plain Python function made callable on constexpr arguments; see __call__ for the wrapping rules."""
+    def __init__(self, fn):
+        super().__init__(fn)
+
+    def __get__(self, obj, objclass):
+        # Create a bound function to support constexpr_function methods
+        if obj is not None:
+            return BoundConstexprFunction(obj, self)
+        return self  # accessed on the class itself: stay unbound
+
+    def __call__(self, *args, _semantic=None, **kwargs):
+        from triton.language.core import _unwrap_if_constexpr, constexpr
+        # de-constexpr arguments and discard the _semantic keyword argument:
+        args = [_unwrap_if_constexpr(x) for x in args]
+        kwargs = {k: _unwrap_if_constexpr(v) for (k, v) in kwargs.items()}
+
+        # call the raw Python function f:
+        res = self.fn(*args, **kwargs)
+
+        if _semantic is None:
+            # Not called by triton code generator, e.g. in host code, another constexpr function, or even an aggregate's __init__ function
+            return res
+
+        # convert result back to a Triton constexpr:
+        if knobs.runtime.interpret:
+            return res  # No constexpr in interpreter
+        return constexpr(res)
+
+
+def constexpr_function(fn):
+    """
+    Wraps an arbitrary Python function in a ConstexprFunction so that it can be
+    called at compile-time on constexpr arguments in a Triton function, where it
+    returns a constexpr result; without `_semantic` it returns the plain result.
+    """
+    return ConstexprFunction(fn)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb4e3a7a82c5aa7d3c6144ac7f6e793f8c4e9d5a
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__init__.py
@@ -0,0 +1 @@
+from triton._C.libtriton.linear_layout import LinearLayout
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b86e0277c2f4f196f6952822df8d82a0eb7f5d41
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/build_extern.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/build_extern.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2e8fbf13348056ba01b8121c5a2248ccbdcbcd35
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/build_extern.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/compile.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/compile.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b8e047be949288e548b25c8838026303aa03dd2
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/compile.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/disasm.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/disasm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f1871e424513c004aa438bfa16b8321e69715b5a
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/disasm.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/link.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/link.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f549899705e63442ba4e09d18a8138c2dfc2de66
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/link.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/mxfp.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/mxfp.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d8a7bccf687412dbad6cab256e444faeff3abbb
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/mxfp.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/ragged_tma.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/ragged_tma.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e051a5954973b78a775efdac815fa923255e758f
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/ragged_tma.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/tensor_descriptor.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/tensor_descriptor.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f544a11dd486bfc46678a96bf0f710382f66e4a0
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/__pycache__/tensor_descriptor.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/build_extern.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/build_extern.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f0168d59d7af045bde68a508a000654f4893bb1
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/build_extern.py
@@ -0,0 +1,365 @@
+import argparse
+import subprocess
+from abc import ABC, abstractmethod
+from typing import Dict, List, Optional
+
+
+class Symbol:
+    '''Read-only record describing one function declaration.'''
+    _name: str
+    _op_name: str
+    _ret_type: str
+    _arg_names: List[str]
+    _arg_types: List[str]
+
+    def __init__(self, name: str, op_name: str, ret_type: str, arg_names: List[str],
+                 arg_types: List[str]) -> None:
+        '''
+        Store the parts of a function declaration.
+        :param name: name of the symbol
+        :param op_name: name of the operation
+        :param ret_type: return type of the operation
+        :param arg_names: names of the arguments
+        :param arg_types: types of the arguments
+        '''
+        # The two sequences are copied, so the caller's lists are not shared.
+        self._arg_types = [*arg_types]
+        self._arg_names = [*arg_names]
+        self._ret_type = ret_type
+        self._op_name = op_name
+        self._name = name
+
+    @property
+    def name(self) -> str:
+        '''Name of the symbol.'''
+        return self._name
+
+    @property
+    def op_name(self) -> str:
+        '''Name of the operation.'''
+        return self._op_name
+
+    @property
+    def ret_type(self) -> str:
+        '''Return type of the operation.'''
+        return self._ret_type
+
+    @property
+    def arg_names(self) -> List[str]:
+        '''Argument names (the internal list itself, not a copy).'''
+        return self._arg_names
+
+    @property
+    def arg_types(self) -> List[str]:
+        '''Argument types (the internal list itself, not a copy).'''
+        return self._arg_types
+
+
+def convert_type(type_str) -> Optional[str]:
+    '''
+    Map a type spelling to the dtype name used in the generated stubs.
+    :param type_str: one of "i32", "u32", "i64", "u64", "float", "double"
+    :return: the matching dtype name, or None for any other spelling
+    '''
+    dtype_names = {
+        "i32": "int32",
+        "u32": "uint32",
+        "i64": "int64",
+        "u64": "uint64",
+        "float": "fp32",
+        "double": "fp64",
+    }
+    # other types, such as pointer types, are ignored (None)
+    return dtype_names.get(type_str)
+
+
+def to_unsigned(type_str) -> str:
+    '''
+    Return the unsigned counterpart of "int32" / "int64".
+    Every other value is returned unchanged.
+    '''
+    unsigned_names = {"int32": "uint32", "int64": "uint64"}
+    return unsigned_names.get(type_str, type_str)
+
+
+class ExternLibrary(ABC):
+    '''
+    Base class that turns the symbols of an extern library into a Python stub file.
+    Subclasses implement parse_symbols and _output_stubs.
+    '''
+    _name: str
+    _path: str
+    _symbols: Dict[str, Symbol]
+    _format: bool
+    _grouping: bool
+
+    def __init__(
+        self,
+        name: str,
+        path: str,
+        format: bool = True,
+        grouping: bool = True,
+    ) -> None:
+        '''
+        Abstract class for extern library.
+        :param name: name of the library
+        :param path: path of the library
+        :param format: whether to format the generated stub file
+        :param grouping: flag stored as-is and exposed through the ``grouping`` property
+        '''
+        self._name = name
+        self._path = path
+        self._symbols = {}
+        self._format = format
+        self._grouping = grouping
+
+    @property
+    def name(self) -> str:
+        return self._name
+
+    @property
+    def path(self) -> str:
+        return self._path
+
+    @property
+    def symbols(self) -> Dict[str, Symbol]:
+        return self._symbols
+
+    @property
+    def grouping(self) -> bool:
+        return self._grouping
+
+    @abstractmethod
+    def parse_symbols(self, input_file) -> None:
+        '''Fill ``symbols`` from ``input_file``.'''
+        pass
+
+    @abstractmethod
+    def _output_stubs(self) -> str:
+        '''Return the full text of the stub file.'''
+        pass
+
+    def generate_stub_file(self, output_dir) -> None:
+        '''
+        Write the stubs to ``<output_dir>/<name>.py`` and optionally format the file.
+        :param output_dir: directory that receives the stub file
+        :raises Exception: if ``_output_stubs`` returns None or an empty string
+        '''
+        file_str = self._output_stubs()
+        if not file_str:
+            raise Exception("file_str is empty")
+
+        output_file = f"{output_dir}/{self._name}.py"
+        with open(output_file, "w") as f:
+            f.write(file_str)
+
+        # The file is closed by the with-block before the formatters rewrite it in place.
+        # Their exit status is deliberately not checked, as before.
+        if self._format:
+            subprocess.run(["autopep8", "-a", "-r", "-i", output_file], stdout=subprocess.PIPE)
+            subprocess.run(["isort", output_file], stdout=subprocess.PIPE)
+
+
+class Libdevice(ExternLibrary):
+    '''
+    Extern library description for libdevice: parses "define" lines of the
+    disassembled library and emits one stub function per group of symbols.
+    '''
+    _symbol_groups: Dict[str, List[Symbol]]
+
+    def __init__(self, path) -> None:
+        '''
+        Constructor for Libdevice.
+        :param path: path of the libdevice library
+        '''
+        super().__init__("libdevice", path)
+        self._symbol_groups = {}
+        self.is_pure = True
+
+    @staticmethod
+    def _extract_symbol(line) -> Optional[Symbol]:
+        # Extract symbols from line in the following format:
+        # "define [internal] <ret_type> @<name>(<arg_types>,)"
+        # Lines that do not have this shape are skipped (None) instead of
+        # raising IndexError: `grep define` matches any line containing the word.
+        entries = line.split("@")
+        if len(entries) < 2:
+            return None
+        ret_str = entries[0]
+        func_str = entries[1]
+        # Get ret_type, skip internal symbols
+        ret_strs = ret_str.split()
+        if len(ret_strs) < 2 or ret_strs[1] == "internal":
+            return None
+        ret_type = convert_type(ret_strs[1])
+        if ret_type is None:
+            return None
+        # Get function name
+        func_strs = func_str.split("(")
+        if len(func_strs) < 2:
+            return None
+        func_name = func_strs[0].replace("@", "")
+        op_name = func_name.replace("__nv_", "")
+        if 'ieee' in op_name:
+            return None
+        # Get arg_types; any argument whose type is not convertible drops the symbol
+        arg_types = []
+        arg_names = []
+        for i, arg_str in enumerate(func_strs[1].split(",")):
+            arg_tokens = arg_str.split()
+            arg_type = convert_type(arg_tokens[0]) if arg_tokens else None
+            if arg_type is None:
+                return None
+            arg_types.append(arg_type)
+            arg_names.append('arg' + str(i))
+        if op_name == "sad":
+            # Special case for sad, where the last argument is an unsigned int
+            arg_types[-1] = to_unsigned(arg_types[-1])
+        elif op_name.startswith("u"):
+            # LLVM does not differentiate between signed and unsigned integer type.
+            # We have to convert the types to unsigned
+            ret_type = to_unsigned(ret_type)
+            arg_types = [to_unsigned(arg_type) for arg_type in arg_types]
+        return Symbol(func_name, op_name, ret_type, arg_names, arg_types)
+
+    def _group_symbols(self) -> None:
+        # Group functions together by renaming: symbols whose op_name maps to the
+        # same target name end up in the same list of self._symbol_groups.
+        renaming = {
+            'llabs': 'abs', 'acosf': 'acos', 'acoshf': 'acosh', 'dadd_rd': 'add_rd', 'fadd_rd': 'add_rd', 'dadd_rn':
+            'add_rn', 'fadd_rn': 'add_rn', 'dadd_ru': 'add_ru', 'fadd_ru': 'add_ru', 'dadd_rz': 'add_rz', 'fadd_rz':
+            'add_rz', 'asinf': 'asin', 'asinhf': 'asinh', 'atanf': 'atan', 'atan2f': 'atan2', 'atanhf': 'atanh',
+            'brevll': 'brev', 'cbrtf': 'cbrt', 'ceilf': 'ceil', 'clzll': 'clz', 'copysignf': 'copysign', 'cosf': 'cos',
+            'coshf': 'cosh', 'cospif': 'cospi', 'cyl_bessel_i0f': 'cyl_bessel_i0', 'cyl_bessel_i1f': 'cyl_bessel_i1',
+            'fdiv_rd': 'div_rd', 'ddiv_rd': 'div_rd', 'fdiv_rn': 'div_rn', 'ddiv_rn': 'div_rn', 'fdiv_ru': 'div_ru',
+            'ddiv_ru': 'div_ru', 'fdiv_rz': 'div_rz', 'ddiv_rz': 'div_rz', 'erff': 'erf', 'erfcf': 'erfc', 'erfcinvf':
+            'erfcinv', 'erfcxf': 'erfcx', 'erfinvf': 'erfinv', 'expf': 'exp', 'exp10f': 'exp10', 'exp2f': 'exp2',
+            'expm1f': 'expm1', 'fabsf': 'abs', 'fabs': 'abs', 'fast_fdividef': 'fast_dividef', 'fdimf': 'fdim', 'ffsll':
+            'ffs', 'floorf': 'floor', 'fmaf': 'fma', 'fmaf_rd': 'fma_rd', 'fmaf_rn': 'fma_rn', 'fmaf_ru': 'fma_ru',
+            'fmaf_rz': 'fma_rz', 'fmodf': 'fmod', 'uhadd': 'hadd', 'hypotf': 'hypot', 'ilogbf': 'ilogb', 'isinff':
+            'isinf', 'isinfd': 'isinf', 'isnanf': 'isnan', 'isnand': 'isnan', 'j0f': 'j0', 'j1f': 'j1', 'jnf': 'jn',
+            'ldexpf': 'ldexp', 'lgammaf': 'lgamma', 'llrintf': 'llrint', 'llroundf': 'llround', 'logf': 'log', 'log10f':
+            'log10', 'log1pf': 'log1p', 'log2f': 'log2', 'logbf': 'logb', 'umax': 'max', 'llmax': 'max', 'ullmax':
+            'max', 'fmaxf': 'max', 'fmax': 'max', 'umin': 'min', 'llmin': 'min', 'ullmin': 'min', 'fminf': 'min',
+            'fmin': 'min', 'dmul_rd': 'mul_rd', 'fmul_rd': 'mul_rd', 'dmul_rn': 'mul_rn', 'fmul_rn': 'mul_rn',
+            'dmul_ru': 'mul_ru', 'fmul_ru': 'mul_ru', 'dmul_rz': 'mul_rz', 'fmul_rz': 'mul_rz', 'umul24': 'mul24',
+            'umulhi': 'mulhi', 'mul64hi': 'mulhi', 'umul64hi': 'mulhi', 'nearbyintf': 'nearbyint', 'nextafterf':
+            'nextafter', 'norm3df': 'norm3d', 'norm4df': 'norm4d', 'normcdff': 'normcdf', 'normcdfinvf': 'normcdfinv',
+            'popcll': 'popc', 'powif': 'pow', 'powi': 'pow', 'powf': 'pow', 'rcbrtf': 'rcbrt', 'frcp_rd': 'rcp_rd',
+            'drcp_rd': 'rcp_rd', 'frcp_rn': 'rcp_rn', 'drcp_rn': 'rcp_rn', 'frcp_ru': 'rcp_ru', 'drcp_ru': 'rcp_ru',
+            'frcp_rz': 'rcp_rz', 'drcp_rz': 'rcp_rz', 'remainderf': 'remainder', 'urhadd': 'rhadd', 'rhypotf': 'rhypot',
+            'rintf': 'rint', 'rnorm3df': 'rnorm3d', 'rnorm4df': 'rnorm4d', 'roundf': 'round', 'rsqrtf': 'rsqrt',
+            'frsqrt_rn': 'rsqrt_rn', 'usad': 'sad', 'scalbnf': 'scalbn', 'signbitf': 'signbit', 'signbitd': 'signbit',
+            'sinf': 'sin', 'sinhf': 'sinh', 'sinpif': 'sinpi', 'sqrtf': 'sqrt', 'fsqrt_rd': 'sqrt_rd', 'dsqrt_rd':
+            'sqrt_rd', 'fsqrt_rn': 'sqrt_rn', 'dsqrt_rn': 'sqrt_rn', 'fsqrt_ru': 'sqrt_ru', 'dsqrt_ru': 'sqrt_ru',
+            'fsqrt_rz': 'sqrt_rz', 'dsqrt_rz': 'sqrt_rz', 'fsub_rd': 'sub_rd', 'dsub_rd': 'sub_rd', 'fsub_rn': 'sub_rn',
+            'dsub_rn': 'sub_rn', 'fsub_ru': 'sub_ru', 'dsub_ru': 'sub_ru', 'fsub_rz': 'sub_rz', 'dsub_rz': 'sub_rz',
+            'tanf': 'tan', 'tanhf': 'tanh', 'tgammaf': 'tgamma', 'truncf': 'trunc', 'y0f': 'y0', 'y1f': 'y1', 'ynf':
+            'yn'
+        }
+
+        for symbol in self._symbols.values():
+            op_name = symbol.op_name
+            if op_name in renaming:
+                op_name = renaming[op_name]
+                # The symbol itself is updated so the stub uses the grouped name.
+                symbol._op_name = op_name
+            self._symbol_groups.setdefault(op_name, []).append(symbol)
+
+    def parse_symbols(self, input_file) -> None:
+        '''
+        Collect the symbols defined in ``input_file`` and group them.
+        Does nothing when symbols have already been parsed.
+        :param input_file: text file that is searched for "define" lines with grep
+        '''
+        if len(self.symbols) > 0:
+            return
+        output = subprocess.check_output(["grep", "define", input_file]).decode().splitlines()
+        for line in output:
+            symbol = self._extract_symbol(line)
+            if symbol is None:
+                continue
+            self._symbols[symbol.name] = symbol
+
+        self._group_symbols()
+
+    def _output_stubs(self) -> str:
+        # Generate python functions in the following format:
+        # @core.extern
+        # def <op_name>(<args>, _builder=None):
+        #   return core.extern_elementwise("libdevice", libdevice_path(), [<args>],
+        #       {(<arg_types>): (<symbol>, <ret_type>)}, is_pure=<is_pure>, _builder=_builder)
+        import_str = "from . import core\n"
+
+        header_str = ""
+        func_str = ""
+        for symbols in self._symbol_groups.values():
+            # The signature of the stub is taken from the first symbol of the group.
+            arg_list_str = "".join(f"{arg_name}, " for arg_name in symbols[0].arg_names)
+
+            func_str += "@core.extern\n"
+            func_name_str = f"def {symbols[0].op_name}({arg_list_str}_builder=None):\n"
+
+            return_str = f"\treturn core.extern_elementwise(\"{self._name}\", libdevice_path(), [{arg_list_str}], \n"
+
+            # One dictionary entry per symbol: argument dtypes -> (symbol name, return dtype)
+            arg_type_symbol_dict_str = "{"
+            for symbol in symbols:
+                arg_types_str = "".join(f'core.dtype("{arg_type}"),' for arg_type in symbol.arg_types)
+                ret_type = f'core.dtype("{symbol.ret_type}")'
+                arg_type_symbol_dict_str += "(" + arg_types_str + "): (\"" + symbol.name + "\", " + ret_type + "),\n"
+            arg_type_symbol_dict_str += "}"
+
+            return_str += arg_type_symbol_dict_str
+            return_str += f", is_pure={self.is_pure}"
+            return_str += ", _builder=_builder)\n"
+
+            func_str += func_name_str + return_str + "\n"
+        file_str = import_str + header_str + func_str
+
+        return file_str
+
+
+class LLVMDisassembler:
+    '''Runs llvm-dis on a library and records where the output is written.'''
+    _path: str
+    _ll_file: str
+
+    def __init__(self, path) -> None:
+        '''
+        Remember which llvm-dis executable to invoke.
+        :param path: path to llvm-dis
+        '''
+        # The output location is fixed; every instance writes to the same file.
+        self._ll_file = "/tmp/extern_lib.ll"
+        self._path = path
+
+    @property
+    def path(self) -> str:
+        return self._path
+
+    @property
+    def ll_file(self) -> str:
+        return self._ll_file
+
+    def disasm(self, lib_path: str) -> None:
+        '''
+        Disassemble ``lib_path`` into ``ll_file``.
+        The exit status of llvm-dis is not checked.
+        '''
+        command = [self._path, lib_path, "-o", self.ll_file]
+        process = subprocess.Popen(command, stdout=subprocess.PIPE)
+        process.communicate()
+
+
+# Known extern library names; build() below only recognises "libdevice".
+extern_libs = ["libdevice"]
+
+
+def build(
+    llvm_dis_path: str,
+    lib_path: str,
+    lib_name: str,
+    output_dir: str,
+) -> None:
+    '''
+      Disassemble an extern library and generate its Python stub file.
+      :param llvm_dis_path: path to the llvm-dis binary
+      :param lib_path: path to the external library file
+      :param lib_name: name of the library
+      :param output_dir: path to the output directory
+      :raises Exception: if lib_name is not "libdevice"
+    '''
+    # Reject unknown libraries before anything is disassembled.
+    if lib_name != "libdevice":
+        raise Exception(f"Unknown extern library: {lib_name}")
+    extern_lib = Libdevice(lib_path)
+
+    disassembler = LLVMDisassembler(llvm_dis_path)
+    disassembler.disasm(lib_path)
+    ll_file = disassembler.ll_file
+
+    extern_lib.parse_symbols(ll_file)
+    extern_lib.generate_stub_file(output_dir)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: disassemble the library and emit its stub file.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--llvm-dis", dest="llvm_dis_path", help="Path to llvm-dis", default="llvm-dis")
+    # Both options are mandatory: without them build() receives None and fails
+    # later with an error that does not point at the missing option.
+    parser.add_argument("--lib-path", dest="lib_path", help="Path to the extern library", required=True)
+    parser.add_argument("--lib-name", dest="lib_name", help="Name of the extern library", required=True)
+    # The value is a directory: the stub is written to <output>/<lib-name>.py.
+    parser.add_argument("--output", dest="output_dir", help="Output directory", default="/tmp/")
+    args = parser.parse_args()
+
+    build(args.llvm_dis_path, args.lib_path, args.lib_name, args.output_dir)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/compile.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/compile.py
new file mode 100644
index 0000000000000000000000000000000000000000..73085d3d316094cbd8b3ce3141f47968412cbe6c
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/compile.py
@@ -0,0 +1,211 @@
+import binascii
+import hashlib
+import importlib.util
+import sys
+from argparse import ArgumentParser
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List
+
+import triton
+import triton.backends
+
+
+@dataclass
+class CompileArgs:
+    """
+    Holds the options produced by the command-line parser in ``main``.
+
+    Field names mirror the parser's destinations, because ``main`` builds an
+    instance with ``CompileArgs(**vars(cli_args))``.
+    """
+    path: str = ""
+    kernel_name: str = ""
+    signature: str = ""
+    grid: str = ""
+    target: str | None = None
+    num_warps: int = 1
+    num_stages: int = 3
+    out_name: str | None = None
+    out_path: Path | None = None
+
+
+# Help text shown by the argument parser in main() (passed as its description).
+# NOTE(review): the text refers to a `linker.py` script; the tools package in this
+# tree ships a `link` module instead - confirm which name is meant.
+desc = """
+Triton ahead-of-time compiler:
+
+This program compiles the kernel with name `kernel-name` in the file at the
+provided `path` into self-contained C source-code that embeds the `cubin`
+data along with utilities to load, unload and launch the kernel.
+
+signature is provided as a list of (optionally divisibility-hinted) types
+or constexpr values, e.g.
+
+`compile.py --kernel-name kernel --signature "*fp32:16, i32:16, 1024, i32" --out-name kernel /path/to/kernel.py`
+
+will compile triton.JITFunction of name `kernel` inside the file `/path/to/kernel.py`.
+Said kernel will be specialized such that argument 0, 1 are assumed to be multiple of 16,
+and argument 2 is assumed to be a compile-time constant of value 1024, i.e. it won't be part of the generated prototype.
+
+The resulting entry point will have signature
+
+CUresult kernel_{specialization_suffix}(CUstream stream, unsigned gX, unsigned gY, unsigned gZ, float* arg0, int32_t arg1, int32_t arg2)
+
+Different such specialized entry points can be combined using the `linker.py` script.
+
+NOTE: when resolving the scope of /path/to/kernel.py, the file will be executed from within its parent directory with the python interpreter
+used to run this `compile.py` script
+"""
+
+
+def main():
+    """Parse the command line into a CompileArgs instance and compile the kernel."""
+    cli = ArgumentParser(description=desc)
+    add = cli.add_argument
+
+    # positional: the kernel source file
+    add("path", help="Path to Python source containing desired kernel in its scope. File will be executed.")
+
+    # options, registered in the same order as they appear in --help
+    add("--kernel-name", "-n", type=str, default="", help="Name of the kernel to compile", required=True)
+    add(
+        "--target", "-t", type=str, default=None,
+        help="The target to compile towards, in format of '<backend>:<arch>:<warp-size>'; "
+        "e.g., 'cuda:80:32', 'hip:gfx942:64'. Default to None, which means using current machine's GPU target")
+    add("--num-warps", "-w", type=int, default=1, help="Number of warps to launch the kernel")
+    add("--num-stages", "-ns", type=int, default=3, help="Number of stages (meta-parameter of the kernel)")
+    add("--out-name", "-on", type=str, default=None, help="Out name for the compiled kernel")
+    add("--out-path", "-o", type=Path, default=None, help="Out filename")
+    add("--signature", "-s", type=str, help="Signature of the kernel", required=True)
+    add("--grid", "-g", type=str, help="Launch grid of the kernel", required=True)
+
+    namespace = cli.parse_args()
+    # Building the dataclass from the namespace doubles as a sanity check that
+    # class CompileArgs has a field for every parser destination.
+    compile_kernel(CompileArgs(**vars(namespace)))
+
+
+def compile_kernel(args: CompileArgs):
+    """
+    Compile one kernel ahead of time and render the backend's ``compile.*`` templates.
+
+    :param args: options as collected by ``main`` (kernel file, kernel name,
+        signature string, grid string, optional target, warps/stages, output names)
+    :return: tuple ``(func_name, output_files)``: the generated entry-point name
+        and the list of files written from the templates
+    :raises RuntimeError: if the compiled kernel reports a non-zero global or
+        profile scratch size
+    """
+    # Output names fall back to the kernel name when not given explicitly.
+    out_name = args.out_name if args.out_name else args.kernel_name
+    out_path = args.out_path if args.out_path else Path(out_name)
+
+    # execute python sources and extract functions wrapped in JITFunction
+    arg_path = Path(args.path)
+    # The kernel file's directory is put first on sys.path (and is not removed afterwards).
+    sys.path.insert(0, str(arg_path.parent))
+    spec = importlib.util.spec_from_file_location(arg_path.stem, arg_path)
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+    kernel = getattr(mod, args.kernel_name)
+    # The grid string must have exactly three comma-separated entries (gridX, gridY, gridZ below).
+    grid = args.grid.split(",")
+    assert len(grid) == 3
+
+    # validate and parse signature
+    signature = list(map(lambda s: s.strip(" "), args.signature.split(",")))
+
+    def hash_signature(signature: List[str]):
+        # First 8 hex digits of the SHA-256 of the space-joined entries.
+        m = hashlib.sha256()
+        m.update(" ".join(signature).encode())
+        return m.hexdigest()[:8]
+
+    meta_sig = f"warps{args.num_warps}xstages{args.num_stages}"
+    sig_hash = hash_signature(signature + [meta_sig])
+
+    def constexpr(s):
+        # Parse s as an int, then as a float; None when it is neither.
+        try:
+            ret = int(s)
+            return ret
+        except ValueError:
+            pass
+        try:
+            ret = float(s)
+            return ret
+        except ValueError:
+            pass
+        return None
+
+    # hints: (argument index,) -> numeric value written after ':' in the signature entry
+    hints = {(i, ): constexpr(s.split(":")[1]) for i, s in enumerate(signature) if ":" in s}
+    hints = {k: v for k, v in hints.items() if v is not None}
+    # constants: argument name -> value, for signature entries that are plain numbers
+    constants = {kernel.arg_names[i]: constexpr(s) for i, s in enumerate(signature)}
+    constants = {k: v for k, v in constants.items() if v is not None}
+    # An argument hinted with 1 is additionally treated as a constant of value 1.
+    for key, value in hints.items():
+        if value == 1:
+            constants[kernel.arg_names[key[0]]] = value
+    # From here on, signature maps argument name -> type string (hint stripped).
+    signature = {kernel.arg_names[i]: s.split(":")[0] for i, s in enumerate(signature)}
+    for key in constants:
+        signature[key] = 'constexpr'
+    const_sig = 'x'.join([str(v) for v in constants.values()])
+    doc_string = [f"{k}={v}" for k, v in constants.items()]
+    doc_string += [f"num_warps={args.num_warps}", f"num_stages={args.num_stages}"]
+    # compile ast into cubin
+    for h in hints.values():
+        assert h in [1, 16], f"Only 1 and 16 are valid hints, got {h}"
+    attrs = {k: [["tt.divisibility", 16]] for k, v in hints.items() if v == 16}
+    kernel.create_binder()
+    src = kernel.ASTSource(fn=kernel, constexprs=constants, signature=signature, attrs=attrs)
+    # An explicit target is given as colon-separated fields; otherwise the active driver's target is used.
+    target = triton.backends.compiler.GPUTarget(*args.target.split(":")) \
+        if args.target else triton.runtime.driver.active.get_current_target()
+    backend = triton.compiler.make_backend(target)
+    kwargs = {"num_warps": args.num_warps, "num_stages": args.num_stages}
+    options = backend.parse_options(kwargs)
+    ccinfo = triton.compile(src, target=target, options=options.__dict__)
+
+    # NOTE(review): global_scratch_size is read with a default of 0, while
+    # profile_scratch_size is accessed directly - confirm the asymmetry is intended.
+    if getattr(ccinfo.metadata, "global_scratch_size", 0) > 0:
+        raise RuntimeError("AOT compiling kernels with global scratch requirements is not yet implemented")
+    if ccinfo.metadata.profile_scratch_size > 0:
+        raise RuntimeError("AOT compiling kernels with profile scratch requirements is not yet implemented")
+
+    # arg_names/arg_types: non-constant arguments plus those hinted with 1 (typed as i32).
+    # arg_names_not_1/arg_types_not_1: non-constant arguments only.
+    arg_names = []
+    arg_types = []
+    arg_names_not_1 = []
+    arg_types_not_1 = []
+    for i, arg_name in enumerate(kernel.arg_names):
+        if arg_name not in constants:
+            arg_names.append(arg_name)
+            arg_types.append(signature[arg_name])
+            arg_names_not_1.append(arg_name)
+            arg_types_not_1.append(signature[arg_name])
+        elif hints.get((i, ), None) == 1:
+            arg_names.append(arg_name)
+            arg_types.append("i32")
+
+    # dump C stub code
+    # suffix: each argument index, followed by 'c' for a hint of 1 or 'd' for a hint of 16
+    suffix = ''
+    for i, ty in enumerate(signature.values()):
+        suffix += str(i)
+        if hints.get((i, ), None) == 1:
+            suffix += 'c'
+        if hints.get((i, ), None) == 16:
+            suffix += 'd'
+    func_name = '_'.join([out_name, sig_hash, suffix])
+    asm = ccinfo.asm[backend.binary_ext]  # store binary data once
+
+    # str() of the hexlified bytes looks like b'...'; [2:-1] strips that wrapper.
+    hex_ = str(binascii.hexlify(asm))[2:-1]
+
+    ty_to_cpp = triton.runtime.driver.active.map_python_to_cpp_type
+
+    # Values substituted into the templates through str.format below.
+    params = {
+        "kernel_name": func_name,
+        "triton_kernel_name": args.kernel_name,
+        "bin_size": len(asm),
+        "bin_data": ", ".join([f"0x{x}{y}" for x, y in zip(hex_[::2], hex_[1::2])]),
+        "signature": ", ".join([f"{ty_to_cpp(ty)} {name}" for name, ty in zip(arg_names_not_1, arg_types_not_1)]),
+        "full_signature": ", ".join([f"{ty_to_cpp(ty)} {name}" for name, ty in zip(arg_names, arg_types)]),
+        "arg_pointers": ", ".join([f"&{arg}" for arg in arg_names_not_1] + ["&global_scratch"] + ["&profile_scratch"]),
+        "num_args": len(arg_names_not_1) + 2,  # +2 for global and profile scratch
+        "kernel_docstring": doc_string,
+        "shared": ccinfo.metadata.shared,
+        "num_warps": args.num_warps,
+        "algo_info": "_".join([const_sig, meta_sig]),
+        "gridX": grid[0],
+        "gridY": grid[1],
+        "gridZ": grid[2],
+        "_placeholder": "",
+        "warp_size": target.warp_size,
+    }
+    output_files = []
+    backend_name = target.backend
+    # Every compile.* template in extra/<backend> yields one output file with the same extension.
+    template_dir = Path(__file__).parent / "extra" / backend_name
+    for template_path in template_dir.glob('compile.*'):
+        ext = template_path.suffix
+        output_file = out_path.with_suffix(f".{sig_hash}_{suffix}{ext}")
+        with output_file.open("w") as fp:
+            fp.write(template_path.read_text().format(**params))
+        output_files.append(output_file)
+
+    return func_name, output_files
+
+
+if __name__ == "__main__":
+    main()
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/disasm.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/disasm.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2301fd2eaab5b7e1e7b6b1f2f18e2962b26cabd
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/disasm.py
@@ -0,0 +1,143 @@
+# MIT License
+
+# Copyright (c) 2020 Da Yan @ HKUST
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import functools
+import os
+import re
+import subprocess
+import tempfile
+
+# First line of an instruction pair: /*<4-char offset>*/ <asm text ending in ';'> /* 0x<16-char encoding> */
+FLINE_RE = re.compile(r'\s*/\*\w{4}\*/\s*([^;]*;)\s*/\* 0x(\w{16}) \*/\s*')
+# Second line of the pair: only a /* 0x<16-char encoding> */ comment
+SLINE_RE = re.compile(r'\s*/\* 0x(\w{16}) \*/\s*')
+# Function header line of the form "Function : <name>"
+FNAME_RE = re.compile(r'\s*Function : (\w+)\s*')
+# Branch instruction (BRA or BRA.U) followed by a 0x-prefixed target address
+BRA_RE = re.compile(r'(.*BRA(?:\.U)? )(0x\w+);')
+
+
+def parseCtrl(sline):
+    """
+    Format the control fields of an encoding-only line.
+
+    The 16-digit hex value on the line is split into bit fields and rendered
+    as ``watdb:readb:wrtdb:yld:stall``.
+    """
+    enc = int(SLINE_RE.match(sline).group(1), 16)
+
+    def field(shift, mask):
+        # Bits of enc starting at `shift`, limited to `mask`.
+        return (enc >> shift) & mask
+
+    stall = field(41, 0xf)
+    yld = field(45, 0x1)
+    wrtdb = field(46, 0x7)
+    readb = field(49, 0x7)
+    watdb = field(52, 0x3f)
+
+    # 7 in either barrier field and 0 in the wait field are shown as dashes;
+    # the yield flag is shown as 'Y' when the bit is clear.
+    parts = [
+        f'{watdb:02d}' if watdb != 0 else '--',
+        str(readb) if readb != 7 else '-',
+        str(wrtdb) if wrtdb != 7 else '-',
+        '-' if yld != 0 else 'Y',
+        f'{stall:x}',
+    ]
+    return ':'.join(parts)
+
+
+def processSassLines(fline, sline, labels):
+    """
+    Turn one instruction pair into a ``(ctrl, asm)`` tuple.
+
+    :param fline: first line of the pair (offset, asm text, encoding)
+    :param sline: second line of the pair (encoding only)
+    :param labels: dict of branch target address -> label index; a new target
+        gets the next free index, existing entries are left untouched
+    """
+    asm = FLINE_RE.match(fline).group(1)
+    # Remove trailing space
+    if asm.endswith(" ;"):
+        asm = asm[:-2] + ";"
+    ctrl = parseCtrl(sline)
+    # BRA target address
+    branch = BRA_RE.match(asm)
+    if branch is not None:
+        labels.setdefault(int(branch.group(2), 16), len(labels))
+    return (ctrl, asm)
+
+
+@functools.lru_cache()
+def get_sass(cubin_asm, fun=None):
+    """
+    Return the SASS listing for a cubin given as bytes.
+
+    The bytes are written to a temporary file, handed to ``extract`` and the
+    file is removed again, also when extraction fails. Results are cached per
+    ``(cubin_asm, fun)``.
+    """
+    handle, tmp_path = tempfile.mkstemp()
+    try:
+        with open(handle, 'wb') as cubin_file:
+            cubin_file.write(cubin_asm)
+        return extract(tmp_path, fun)
+    finally:
+        os.remove(tmp_path)
+
+
+def path_to_cuobjdump():
+    """Return ``knobs.nvidia.cuobjdump.path`` from the triton knobs."""
+    # Imported here rather than at module level, as in the original.
+    from triton import knobs
+    cuobjdump = knobs.nvidia.cuobjdump
+    return cuobjdump.path
+
+
+def extract(file_path, fun):
+    """
+    Run cuobjdump on a cubin file and return a formatted SASS listing.
+
+    Only the first function found in the cuobjdump output is formatted and
+    returned. Branch targets are replaced by ``LBB<i>`` labels.
+
+    :param file_path: path of the file passed to ``cuobjdump -sass``
+    :param fun: function name passed through ``-fun``, or None for no filter
+    :return: the listing as a string, or None when no function header is found
+    """
+    cuobjdump = path_to_cuobjdump()
+    if fun is None:
+        sass_str = subprocess.check_output([cuobjdump, "-sass", file_path])
+    else:
+        sass_str = subprocess.check_output([cuobjdump, "-fun", fun, "-sass", file_path])
+    sass_lines = sass_str.splitlines()
+    num_lines = len(sass_lines)
+
+    def line_at(idx):
+        # Decoded line at idx, or None once idx runs past the end of the output.
+        return sass_lines[idx].decode() if idx < num_lines else None
+
+    # format:
+    # function : <function_name>
+    # .headerflags: ...
+    # /*0000*/ asmstr /*0x...*/
+    #                 /*0x...*/
+
+    # Looking for the first function header (function: <name>)
+    line_idx = 0
+    header = None
+    while line_idx < num_lines:
+        header = FNAME_RE.match(line_at(line_idx))
+        if header is not None:
+            break
+        line_idx += 1
+    if header is None:
+        return None
+
+    fname = header.group(1)
+    ret = f'Function:{fname}\n'
+    line_idx += 2  # bypass .headerflags
+    # Remapping address to label
+    labels = {}  # address -> label_idx
+    # store sass asm in buffer and then print them (for labels)
+    # (ctrl, asm)
+    asm_buffer = []
+    while True:
+        # First line (Offset ASM Encoding); stop at the first non-instruction
+        # line or at the end of the output instead of indexing past it.
+        fline = line_at(line_idx)
+        if fline is None or FLINE_RE.match(fline) is None:
+            break
+        # Second line (Encoding); a pair cut off by the end of the output is dropped.
+        sline = line_at(line_idx + 1)
+        if sline is None:
+            break
+        asm_buffer.append(processSassLines(fline, sline, labels))
+        line_idx += 2
+    # Print sass
+    # label naming convention: LBB#i
+    for idx, (ctrl, asm) in enumerate(asm_buffer):
+        # Print label if this is BRA target; offsets advance by 16 per buffered instruction
+        offset = idx * 16
+        if offset in labels:
+            label_name = f'LBB{labels[offset]}'
+            ret += f'{label_name}:\n'
+        ret += ctrl + '\t'
+        # if this is BRA, remap offset to label
+        branch = BRA_RE.match(asm)
+        if branch:
+            target = int(branch.group(2), 16)
+            target_name = f'LBB{labels[target]}'
+            asm = BRA_RE.sub(rf'\1{target_name};', asm)
+        ret += asm + '\n'
+    ret += '\n'
+    return ret
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/extra/cuda/compile.c b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/extra/cuda/compile.c
new file mode 100644
index 0000000000000000000000000000000000000000..94e4d15086c7c62ef562a81d99e600950941b1f7
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/extra/cuda/compile.c
@@ -0,0 +1,69 @@
+/* clang-format off */
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <cuda.h>
+
+
+// CUDA_CHECK(call): passes the call's result and the call site's file and line to gpuAssert
+#define CUDA_CHECK(ans) {{\
+    gpuAssert((ans), __FILE__, __LINE__);\
+  }}\
+
+static inline void gpuAssert(CUresult code, const char *file, int line) {{
+  // Print a diagnostic and terminate the process when a driver call failed.
+  if (code != CUDA_SUCCESS) {{
+    const char *str = NULL;
+    // cuGetErrorString leaves str NULL for codes it does not recognize.
+    if (cuGetErrorString(code, &str) != CUDA_SUCCESS || str == NULL)
+      str = "unknown error";
+    // Format directly: no strcat into a fixed-size stack buffer.
+    printf("Triton Error [CUDA]: %s (%s:%d)\n", str, file, line);
+    exit(code);
+  }}
+}}
+
+// globals: cached module/function handles (NULL until loaded) and the embedded binary passed to cuModuleLoadData
+#define CUBIN_NAME {kernel_name}_cubin
+CUmodule {kernel_name}_mod = NULL;
+CUfunction {kernel_name}_func = NULL;
+unsigned char CUBIN_NAME[{bin_size}] = {{ {bin_data} }};
+
+
+void unload_{kernel_name}(void) {{  // no-op when nothing is loaded
+    if ({kernel_name}_mod != NULL) CUDA_CHECK(cuModuleUnload({kernel_name}_mod));
+    {kernel_name}_mod = NULL; {kernel_name}_func = NULL;  // the next launch reloads the module
+}}
+// TODO: some code duplication with `runtime/backend/cuda.c`
+void load_{kernel_name}() {{
+    int dev = 0;  // passed as the device to cuDeviceGetAttribute below
+    void *bin = (void *)&CUBIN_NAME;
+    int shared = {shared};  // shared-memory size substituted by the template
+    CUDA_CHECK(cuModuleLoadData(&{kernel_name}_mod, bin));
+    CUDA_CHECK(cuModuleGetFunction(&{kernel_name}_func, {kernel_name}_mod, "{triton_kernel_name}"));
+    // set dynamic shared memory if necessary
+    int shared_optin;
+    CUDA_CHECK(cuDeviceGetAttribute(&shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev));
+    if (shared > 49152 && shared_optin > 49152) {{  // 49152 bytes = 48 KiB
+      CUDA_CHECK(cuFuncSetCacheConfig({kernel_name}_func, CU_FUNC_CACHE_PREFER_SHARED));
+      CUDA_CHECK(cuFuncSetAttribute({kernel_name}_func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_optin))  // no ';' needed: CUDA_CHECK expands to a braced block
+    }}
+}}
+
+/*
+{kernel_docstring}
+*/
+CUresult {kernel_name}(CUstream stream, {signature}) {{
+    if ({kernel_name}_func == NULL) load_{kernel_name}();  // lazy load on first launch
+    unsigned int gX = {gridX};
+    unsigned int gY = {gridY};
+    unsigned int gZ = {gridZ};
+    CUdeviceptr global_scratch = 0;
+    CUdeviceptr profile_scratch = 0;
+    void *args[{num_args}] = {{ {arg_pointers} }};
+    // TODO: shared memory
+    // Empty grid: nothing to launch, so fail explicitly; tested per dimension so the product cannot wrap to 0.
+    if (gX == 0 || gY == 0 || gZ == 0) return CUDA_ERROR_INVALID_VALUE;
+    return cuLaunchKernel({kernel_name}_func, gX, gY, gZ, {num_warps} * {warp_size}, 1, 1, {shared}, stream, args, NULL);
+}}
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/extra/cuda/compile.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/extra/cuda/compile.h
new file mode 100644
index 0000000000000000000000000000000000000000..d98b7063b6ae6292b65b61abf5a30c58b7d28e95
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/extra/cuda/compile.h
@@ -0,0 +1,14 @@
+#ifndef TT_KERNEL_INCLUDES
+#define TT_KERNEL_INCLUDES
+
+#include <cuda.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#endif
+
+void unload_{kernel_name}(void);
+void load_{kernel_name}(void);
+// tt-linker: {kernel_name}:{full_signature}:{algo_info}
+CUresult{_placeholder} {kernel_name}(CUstream stream, {signature});
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/extra/hip/compile.cpp b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/extra/hip/compile.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dd554e96cc5e45f69e69d2e0096785615dc769b3
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/extra/hip/compile.cpp
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+/* clang-format off */
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <hip/hip_runtime.h>
+
+// HIP_CHECK(call): passes the call's result and the call site's file and line to gpuAssert
+#define HIP_CHECK(ans) {{\
+    gpuAssert((ans), __FILE__, __LINE__);\
+  }}\
+
+static inline void gpuAssert(hipError_t code, const char *file, int line) {{
+  // Print a diagnostic and terminate the process when a HIP call failed.
+  if (code != hipSuccess) {{
+    const char *str = nullptr;
+    // Fall back to a fixed text when no error string is available.
+    if (hipDrvGetErrorString(code, &str) != hipSuccess || str == nullptr)
+      str = "unknown error";
+    // Format directly: no strcat into a fixed-size stack buffer.
+    printf("Triton Error [HIP]: %s (%s:%d)\n", str, file, line);
+    exit(code);
+  }}
+}}
+
+// globals: cached module/function handles (nullptr until loaded) and the embedded binary passed to hipModuleLoadData
+#define HSACO_NAME {kernel_name}_hsaco
+hipModule_t {kernel_name}_mod = nullptr;
+hipFunction_t {kernel_name}_func = nullptr;
+unsigned char HSACO_NAME[{bin_size}] = {{ {bin_data} }};
+
+
+void unload_{kernel_name}(void) {{  // no-op when nothing is loaded
+    if ({kernel_name}_mod != nullptr) HIP_CHECK(hipModuleUnload({kernel_name}_mod));
+    {kernel_name}_mod = nullptr; {kernel_name}_func = nullptr;  // the next launch reloads the module
+}}
+
+void load_{kernel_name}() {{
+    // Load the embedded code object and look up the kernel entry point;
+    // both handles are stored in the file-level globals for later launches.
+    void *bin = (void *)&HSACO_NAME;
+    HIP_CHECK(hipModuleLoadData(&{kernel_name}_mod, bin));
+    HIP_CHECK(hipModuleGetFunction(&{kernel_name}_func, {kernel_name}_mod, "{triton_kernel_name}"));
+}}
+
+/*
+{kernel_docstring}
+*/
+hipError_t {kernel_name}(hipStream_t stream, {signature}) {{
+    // Lazily load the module the first time the kernel is launched.
+    if ({kernel_name}_func == nullptr)
+       load_{kernel_name}();
+    unsigned int gX = {gridX};
+    unsigned int gY = {gridY};
+    unsigned int gZ = {gridZ};
+    hipDeviceptr_t global_scratch = 0;
+    hipDeviceptr_t profile_scratch = 0;
+    void *args[{num_args}] = {{ {arg_pointers} }};
+
+    // TODO: shared memory
+    // Reject an empty grid; tested per dimension so the product cannot wrap to 0.
+    if (gX == 0 || gY == 0 || gZ == 0) return hipErrorInvalidValue;
+    return hipModuleLaunchKernel({kernel_name}_func, gX, gY, gZ, {num_warps} * {warp_size}, 1, 1, {shared}, stream, args, nullptr);
+}}
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/extra/hip/compile.h b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/extra/hip/compile.h
new file mode 100644
index 0000000000000000000000000000000000000000..cc5007ad939277df890306a84a91c0b87f1c8825
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/extra/hip/compile.h
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <hip/hip_runtime.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+
+void unload_{kernel_name}(void);
+void load_{kernel_name}(void);
+hipError_t{_placeholder} {kernel_name}(hipStream_t stream, {signature});
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/link.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/link.py
new file mode 100644
index 0000000000000000000000000000000000000000..75a1157a52f92bbd5d2eae640af97ea360da2ef3
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/link.py
@@ -0,0 +1,322 @@
+from collections import defaultdict
+from pathlib import Path
+from typing import Sequence, Union
+
+from dataclasses import dataclass
+
+
+def _exists(x):  # True for every value except None (falsy values such as 0 or "" included)
+    return not (x is None)
+
+
+class LinkerError(Exception):
+    """Raised when a kernel name, signature or suffix from a header cannot be linked."""
+
+
+@dataclass
+class KernelLinkerMeta:  # metadata parsed from one `tt-linker` header directive
+    orig_kernel_name: str  # kernel name with the hash and suffix components removed
+    arg_names: Sequence[str]  # C argument names, in signature order
+    arg_ctypes: Sequence[str]  # C argument types, parallel to arg_names
+    sizes: Sequence[Union[int, None]]  # per-argument hint: 1, 16, or None when unspecialized
+    sig_hash: str  # middle component of the compiled kernel name
+    triton_suffix: str  # specialization suffix (HeaderParser stores the same value in `suffix`)
+    suffix: str
+    num_specs: int
+    """ number of specialized arguments """
+
+
+class HeaderParser:
+    """Parses `tt-linker` directives out of generated headers, grouped per dispatcher name."""
+
+    def __init__(self) -> None:
+        import re
+        # [kernel_name, c signature, algo info]
+        self.linker_directives = re.compile(r"//[\s]*tt-linker:[\s]*([\w]+):(.+):(.+)")
+        # [name, hash, suffix]
+        self.kernel_name = re.compile(r"^([\w]+)_([\w]+)_([\w]+)$")
+        # [(type, name)]
+        self.c_sig = re.compile(r"[\s]*(\w+)\s(\w+)[,]?")
+        # [d|c]; a comma is not a specialization marker, so it is not in the class
+        self.arg_suffix = re.compile("[cd]")
+
+        self.kernels = defaultdict(list)
+
+    def extract_linker_meta(self, header: str):
+        """Register every `// tt-linker: name:signature:algo` line found in `header`."""
+        for ln in header.splitlines():
+            m = self.linker_directives.match(ln) if ln.startswith("//") else None
+            if not _exists(m):
+                continue
+            ker_name, c_sig, algo_info = m.group(1), m.group(2), m.group(3)
+            name, sig_hash, suffix = self._match_name(ker_name)
+            c_types, arg_names = self._match_c_sig(c_sig)
+            num_specs, sizes = self._match_suffix(suffix, c_sig)
+            self._add_kernel(
+                "_".join([name, algo_info]),
+                KernelLinkerMeta(
+                    orig_kernel_name=name,
+                    arg_names=arg_names,
+                    arg_ctypes=c_types,
+                    sizes=sizes,
+                    sig_hash=sig_hash,
+                    triton_suffix=suffix,
+                    suffix=suffix,
+                    num_specs=num_specs,
+                ),
+            )
+
+    def _match_name(self, ker_name: str):
+        """Split `<name>_<hash>_<suffix>`; raise LinkerError when the name does not fit."""
+        m = self.kernel_name.match(ker_name)
+        if _exists(m):
+            name, sig_hash, suffix = m.group(1), m.group(2), m.group(3)
+            return name, sig_hash, suffix
+        raise LinkerError(f"{ker_name} is not a valid kernel name")
+
+    def _match_c_sig(self, c_sig: str):
+        """Return parallel lists (types, names) for a C parameter list."""
+        m = self.c_sig.findall(c_sig)
+        if not m:
+            raise LinkerError(f"{c_sig} is not a valid argument signature")
+        tys, args = zip(*m)
+        return list(tys), list(args)
+
+    def _match_suffix(self, suffix: str, c_sig: str):
+        """Return (num_specs, sizes) with one hint (1, 16 or None) per argument of `c_sig`."""
+        args = c_sig.split(",")
+        s2i = {"c": 1, "d": 16}
+        num_specs = 0
+        sizes = []
+        # scan through suffix, first find the index,
+        # then see if it is followed by d or c
+        for i in range(len(args)):
+            pos = suffix.find(str(i))
+            if pos == -1:
+                raise LinkerError(f"{suffix} is not a valid kernel suffix")
+            pos += len(str(i))
+            if self.arg_suffix.match(suffix, pos):
+                num_specs += 1
+                sizes.extend([None] * (i - len(sizes)))
+                sizes.append(s2i[suffix[pos]])
+                pos += 1
+            if i < len(args) - 1:
+                suffix = suffix[pos:]
+            else:
+                sizes.extend([None] * (len(args) - len(sizes)))
+        return num_specs, sizes
+
+    def _add_kernel(self, name: str, ker: KernelLinkerMeta):
+        """Append `ker` under `name`; its C types must match the previously added entry."""
+        if name in self.kernels:
+            last: KernelLinkerMeta = self.kernels[name][-1]
+            # Compare whole lists: zip() would silently ignore a difference in length.
+            if list(last.arg_ctypes) != list(ker.arg_ctypes):
+                raise LinkerError(
+                    f"Mismatched signature for kernel {name}: \n\texisting sig is: {','.join(last.arg_ctypes)}\n\tcurrent is: {','.join(ker.arg_ctypes)}"
+                )
+
+        self.kernels[name].append(ker)
+
+
+def gen_signature_with_full_args(m):  # "type name" for every argument of `m`, comma separated
+    return ", ".join(f"{ctype} {ident}" for ctype, ident in zip(m.arg_ctypes, m.arg_names))
+
+
+def gen_signature(m):
+    """Return "type name" pairs, comma separated, skipping arguments
+    whose entry in `m.sizes` equals 1."""
+    keep = lambda seq: [item for item, hint in zip(seq, m.sizes) if hint != 1]
+    return ", ".join(f"{ty} {arg}" for ty, arg in zip(keep(m.arg_ctypes), keep(m.arg_names)))
+
+
+def make_algo_decls(name: str, metas: Sequence[KernelLinkerMeta]) -> str:
+    """Declare the dispatcher `name` together with its load/unload functions.
+
+    The argument list is taken from the last entry of `metas`.
+    """
+    full_args = gen_signature_with_full_args(metas[-1])
+    return f"\nCUresult {name}(CUstream stream, {full_args});\nvoid load_{name}();\nvoid unload_{name}();\n    "
+
+
+def make_global_decl(meta: KernelLinkerMeta) -> str:
+    """Declare the algo-indexed entry point, its `_default` variant and load/unload."""
+    kernel = meta.orig_kernel_name
+    full_args = gen_signature_with_full_args(meta)
+    return (f"\nCUresult {kernel}_default(CUstream stream, {full_args});\n"
+            f"CUresult {kernel}(CUstream stream, {full_args}, int algo_id);\n"
+            f"void load_{kernel}();\n"
+            f"void unload_{kernel}();\n    ")
+
+
+def make_default_algo_kernel(meta: KernelLinkerMeta) -> str:
+    """Define `<kernel>_default`, which calls the algo-indexed entry point with algo_id 0."""
+    kernel = meta.orig_kernel_name
+    header = f"CUresult {kernel}_default(CUstream stream, {gen_signature_with_full_args(meta)}){{\n"
+    call = f"  return {kernel}(stream, {', '.join(meta.arg_names)}, 0);\n"
+    return header + call + "}\n"
+
+
+def make_kernel_hints_dispatcher(name: str, metas: Sequence[KernelLinkerMeta]) -> str:
+    """Generate the C dispatcher `name` that selects a compiled specialization.
+
+    Specializations are tried from most to fewest specialized arguments. A
+    hint of 16 becomes a divisibility test and a hint of 1 an equality test;
+    arguments with hint 1 are not forwarded. Load/unload wrappers that call
+    every specialization are appended.
+    """
+
+    def hint_cond(val, hint):
+        if hint == 16:
+            return f"({val} % {hint} == 0)"
+        return f"({val} == {hint})" if hint == 1 else None
+
+    ordered = sorted(metas, key=lambda m: -m.num_specs)
+    symbols = [f"{m.orig_kernel_name}_{m.sig_hash}_{m.suffix}" for m in ordered]
+
+    # Forward declarations of every specialization, then the dispatcher body.
+    parts = [f"// launcher for: {name}\n"]
+    parts += [f"CUresult {sym}(CUstream stream, {gen_signature(m)});\n" for sym, m in zip(symbols, ordered)]
+    parts.append("\n")
+    parts.append(f"CUresult {name}(CUstream stream, {gen_signature_with_full_args(metas[-1])}){{\n")
+    for sym, m in zip(symbols, ordered):
+        checks = [hint_cond(val, hint) for val, hint in zip(m.arg_names, m.sizes) if hint is not None]
+        conds = " && ".join(checks)
+        # Edge case where no specializations hence no dispatching required
+        parts.append(f"  if ({conds})\n" if any(m.sizes) else "if (1)\n")
+        forwarded = [arg for arg, hint in zip(m.arg_names, m.sizes) if hint != 1]
+        parts.append(f"    return {sym}(stream, {', '.join(forwarded)});\n")
+    parts.append("\n")
+    parts.append("  return CUDA_ERROR_INVALID_VALUE;\n}\n")
+
+    for mode in ["load", "unload"]:
+        parts.append(f"\n// {mode} for: {name}\n")
+        parts += [f"void {mode}_{sym}();\n" for sym in symbols]
+        parts.append(f"void {mode}_{name}() {{\n")
+        parts += [f"  {mode}_{sym}();\n" for sym in symbols]
+        parts.append("}\n")
+    return "".join(parts)
+
+
+def make_kernel_meta_const_dispatcher(meta: KernelLinkerMeta) -> str:
+    """Define `<kernel>(stream, args..., algo_id)`, which calls entry `algo_id` of `<kernel>_kernels`."""
+    n = meta.orig_kernel_name
+    src = f"CUresult {n}(CUstream stream, {gen_signature_with_full_args(meta)}, int algo_id){{\n"
+    # Bound by the element count: sizeof(table) alone is the size in bytes.
+    src += f"  assert (algo_id >= 0 && algo_id < (int)(sizeof({n}_kernels) / sizeof({n}_kernels[0])));\n"
+    return src + f"  return {n}_kernels[algo_id](stream, {', '.join(meta.arg_names)});\n" + "}\n"
+
+
+def make_func_pointers(names: Sequence[str], meta: KernelLinkerMeta) -> str:
+    """Define the `kernel_func_t` typedef and the `<kernel>_kernels` table.
+
+    `names` are the hint dispatchers; their order is the table order.
+    """
+    src = f"typedef CUresult (*kernel_func_t)(CUstream stream, {gen_signature_with_full_args(meta)});\n"
+    src += f"kernel_func_t {meta.orig_kernel_name}_kernels[] = {{\n"
+    src += "".join(f"  {name},\n" for name in names)
+    return src + "};\n"
+
+
+def make_kernel_load_def(names: Sequence[str], meta: KernelLinkerMeta) -> str:
+    """Define `load_<kernel>` and `unload_<kernel>`.
+
+    Each generated function calls the matching load/unload of every name in `names`.
+    """
+    src = ""
+    for mode in ["load", "unload"]:
+        calls = "".join(f"  {mode}_{name}();\n" for name in names)
+        src += f"void {mode}_{meta.orig_kernel_name}(void){{\n{calls}}}\n\n"
+    return src
+
+
+def make_get_num_algos_decl(meta: KernelLinkerMeta) -> str:
+    # Prototype only; make_get_num_algos_def emits the matching definition.
+    return f"int {meta.orig_kernel_name}_get_num_algos(void);"
+
+
+def make_get_num_algos_def(meta: KernelLinkerMeta) -> str:
+    """Define `<kernel>_get_num_algos`, returning the element count of `<kernel>_kernels`."""
+    table = f"{meta.orig_kernel_name}_kernels"
+    body = f"  return (int)(sizeof({table}) / sizeof({table}[0]));\n"
+    return f"int {meta.orig_kernel_name}_get_num_algos(void){{\n" + body + "}\n"
+
+
+desc = """
+Triton ahead-of-time linker:
+
+This program takes in header files generated by compile.py, and generates a
+single entry-point responsible for dispatching the user's input to the right
+kernel given the specializations that were compiled.
+
+Example usage:
+python link.py /path/to/headers/*.h -o kernel_name
+"""
+
+if __name__ == "__main__":
+    from argparse import ArgumentParser
+
+    arg_parser = ArgumentParser(description=desc)
+    arg_parser.add_argument(
+        "headers",
+        nargs="+",
+        help="Paths to header files to link. Must include linker directive annotations (autogenerated by ttc)",
+    )
+    # Required: both output files are derived from this path.
+    arg_parser.add_argument("--out", "-o", type=Path, required=True, help="Out filename")
+    arg_parser.add_argument(
+        "--prefix",
+        type=str,
+        default="",
+        help="String to prefix kernel dispatcher names",
+    )
+    args = arg_parser.parse_args()
+
+    # metadata
+    parser = HeaderParser()
+    for header in args.headers:
+        parser.extract_linker_meta(Path(header).read_text())
+    if not parser.kernels:
+        # Fail with a clear message instead of an IndexError further down.
+        raise LinkerError("no tt-linker directives found in the given headers")
+
+    # generate headers
+    algo_decls = [make_algo_decls(name, meta) for name, meta in parser.kernels.items()]
+    meta_lists = [meta for name, meta in parser.kernels.items()]
+    meta = meta_lists[0][0]
+    get_num_algos_decl = make_get_num_algos_decl(meta)
+    global_decl = make_global_decl(meta)
+    with args.out.with_suffix(".h").open("w") as fp:
+        out = "#include <cuda.h>\n"
+        out += "\n".join(algo_decls)
+        out += "\n"
+        out += get_num_algos_decl
+        out += "\n"
+        out += global_decl
+        fp.write(out)
+
+    # generate source
+    defs = [make_kernel_hints_dispatcher(name, meta) for name, meta in parser.kernels.items()]
+    names = [name for name in parser.kernels.keys()]
+    func_pointers_def = make_func_pointers(names, meta)
+    meta_const_def = make_kernel_meta_const_dispatcher(meta)
+    load_unload_def = make_kernel_load_def(names, meta)
+    get_num_algos_def = make_get_num_algos_def(meta)
+    default_algo_kernel = make_default_algo_kernel(meta)
+    with args.out.with_suffix(".c").open("w") as fp:
+        out = ""
+        out += "#include <cuda.h>\n"
+        out += "#include <stdint.h>\n"
+        out += "#include <assert.h>\n"
+        out += "\n"
+        out += "\n".join(defs)
+        out += "\n"
+        out += func_pointers_def
+        out += "\n"
+        out += get_num_algos_def
+        out += "\n"
+        out += meta_const_def
+        out += "\n"
+        out += load_unload_def
+        out += "\n"
+        out += default_algo_kernel
+        fp.write(out)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/mxfp.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/mxfp.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b129c1aef2ddc8165a2f81718b1be980573c458
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/mxfp.py
@@ -0,0 +1,301 @@
+"""
+Helper classes for working with low precision floating point types that
+align with the opencompute (OCP) microscaling (MX) specification.
+  * MXFP4Tensor: 4-bit E2M1 floating point data
+  * MXScaleTensor: 8-bit E8M0 floating point data
+Reference: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+"""
+
+import torch
+
+
+class MXFP4Tensor:
+
+    def __init__(self, data=None, size=None, device=None):
+        """
+        Tensor class for working with four bit E2M1 floating point data as defined by the
+        opencompute microscaling specification.
+
+
+        Parameters:
+        - data: A torch tensor of float32 numbers to convert to fp4e2m1 microscaling format.
+        - size: The size of the tensor to create.
+        - device: The device on which to create the tensor.
+        """
+        self.device = device
+        if data is not None:
+            assert isinstance(data, torch.Tensor), "Parameter data must be a torch tensor"
+            self.device = data.device
+            self.data = self._from_float(data)
+        elif size is not None:
+            self.size = size if isinstance(size, tuple) else (size, )
+        else:
+            raise ValueError("Either parameter data or size must be provided")
+
+    def random(self):  # fills self.data with random 4-bit encodings of shape self.size and returns self
+        S = torch.randint(0, 2, size=self.size, dtype=torch.uint8, device=self.device)
+        E = torch.randint(0, 4, size=self.size, dtype=torch.uint8, device=self.device)
+        M = torch.randint(0, 2, size=self.size, dtype=torch.uint8, device=self.device)
+
+        self.data = ((S << 3) | (E << 1) | M).type(torch.uint8)
+        return self
+
+    def to(self, dtype):
+        """
+        Convert fp4e2m1 data to float32.
+
+        Returns:
+        - A torch tensor of type dtype representing the fp4e2m1 data.
+        """
+        assert dtype == torch.float32, "Currently only float32 is supported for fp4e2m1 to float conversion"
+
+        data = self.data
+        S = ((data >> 3) & 0x1).type(dtype)
+        E = ((data >> 1) & 0x3).type(dtype)
+        M = (data & 0x1).type(dtype)
+
+        # The MXF4 E2M1 spec defines 0bS000 as zero
+        value = torch.zeros_like(S)
+        is_zero = (E == 0) & (M == 0)
+        non_zero_mask = ~is_zero
+        if non_zero_mask.any():
+            S_nz = S[non_zero_mask]
+            E_nz = E[non_zero_mask]
+            M_nz = M[non_zero_mask]
+
+            sign = torch.pow(-1, S_nz)
+            # Normal and subnormal handling for the exponent and mantissa
+            exponent = torch.where(E_nz == 0, E_nz, E_nz - 1)
+            mantissa = torch.where(E_nz == 0, M_nz * 0.5, 1.0 + M_nz * 0.5)
+            value_nz = sign * torch.pow(2, exponent) * mantissa
+
+            value[non_zero_mask] = value_nz
+
+        # For zeros, the values must remain zero with the correct sign
+        value[is_zero & (S == 1)] *= -1
+        return value.type(torch.float32)
+
+    def _from_float(self, values):
+        """
+        Convert float32 numbers to mxf4 e2m1 format.
+        * No encodings are reserved for Inf or NaN in mxf4.
+        * Conversion from float supports roundTiesToEven rounding mode.
+        * If a value exceeds the mxf4 representable range after rounding,
+          clamps to the maximum mxf4 magnitude, preserving the sign.
+        * If a value has magnitude less than the minimum subnormal magnitude
+          in mxf4 after rounding, converts to zero.
+
+        Parameters:
+        - values: A torch tensor of float32 numbers to convert to fp4 format.
+        """
+        S = torch.signbit(values).type(torch.uint8)
+        abs_values = torch.abs(values)
+
+        is_zero = (abs_values == 0)
+        is_invalid = torch.isnan(values) | torch.isinf(values)
+
+        # Enumerate all possible E2M1 exponent and mantissa values. We will
+        # use these to compare the distance between float32 and all possible
+        # E2M1 floats to find the nearest E2M1 representable value
+        E_bits = torch.tensor([0, 1, 2, 3], dtype=torch.uint8, device=self.device)
+        M_bits = torch.tensor([0, 1], dtype=torch.uint8, device=self.device)
+
+        candidate_values = []
+        candidate_E = []
+        candidate_M = []
+
+        for E in E_bits:
+            if E == 0:
+                # Subnormals
+                exponent = 0
+                for M in M_bits:
+                    significand = M * 0.5
+                    value = significand * (2**exponent)
+                    candidate_values.append(value)
+                    candidate_E.append(E)
+                    candidate_M.append(M)
+            else:
+                # Normals
+                exponent = E.item() - 1
+                for M in M_bits:
+                    significand = 1.0 + M * 0.5
+                    value = significand * (2**exponent)
+                    candidate_values.append(value)
+                    candidate_E.append(E)
+                    candidate_M.append(M)
+
+        candidates = torch.tensor(candidate_values, dtype=torch.float32, device=self.device)
+        candidate_E = torch.tensor(candidate_E, dtype=torch.uint8, device=self.device)
+        candidate_M = torch.tensor(candidate_M, dtype=torch.uint8, device=self.device)
+
+        # reshape (not view) so that non-contiguous inputs are accepted as well
+        abs_values_flat = abs_values.reshape(-1)
+        abs_values_expanded = abs_values_flat.unsqueeze(1)
+
+        # Clamp invalid values to the max e2m1 representable value
+        max_candidate_value = candidates.max().item()
+        abs_values_flat[is_invalid.reshape(-1)] = max_candidate_value
+
+        # Compute distance between all abs_values and candidate e2m1 values
+        errors = torch.abs(abs_values_expanded - candidates.unsqueeze(0))
+
+        # To implement roundTiesToEven, we need to break ties by preferring
+        # even mantissas (M == 0). We do so by subtracting an epsilon bias from
+        # the minimum-error candidates that have an even mantissa.
+        min_errors, _ = torch.min(errors, dim=1, keepdim=True)
+        is_tie = (errors == min_errors)
+        # Only bias candidates that already achieve the row minimum: biasing
+        # every even candidate would misround inputs lying within epsilon of
+        # (but not exactly on) a midpoint between two candidates.
+        even_mantissa = (candidate_M == 0).unsqueeze(0)
+        tie_breaker = (is_tie & even_mantissa).type(torch.float32)
+        errors = errors - (tie_breaker * 1e-6)
+
+        best_indices = torch.argmin(errors, dim=1)
+
+        E_selected = candidate_E[best_indices]
+        M_selected = candidate_M[best_indices]
+        E = E_selected.view(abs_values.shape)
+        M = M_selected.view(abs_values.shape)
+
+        E[is_zero] = 0
+        M[is_zero] = 0
+
+        return ((S << 3) | (E << 1) | M).type(torch.uint8)
+
+    def to_packed_tensor(self, dim):
+        """
+        Packs two e2m1 elements into a single uint8 along the specified dimension.
+
+        Parameters:
+        - dim: The dimension along which to pack the elements.
+
+        Returns:
+        - A torch tensor of dtype uint8 with two e2m1 elements packed into one uint8.
+        """
+        data = self.data
+        assert 0 <= dim < data.ndim, \
+            "The dimension to pack along is not within the range of tensor dimensions"
+
+        size_along_dim = data.size(dim)
+        new_size_along_dim = (size_along_dim + 1) // 2
+
+        # If the size is odd, we pad the data along dim with zeros at the end
+        if size_along_dim % 2 != 0:
+            pad_sizes = [0] * (2 * data.ndim)
+            pad_index = (data.ndim - dim - 1) * 2 + 1
+            pad_sizes[pad_index] = 1
+            data = torch.nn.functional.pad(data, pad_sizes, mode='constant', value=0)
+
+        new_shape = list(data.shape)
+        new_shape[dim] = new_size_along_dim
+        new_shape.insert(dim + 1, 2)  # packed dimension of length 2
+        data = data.reshape(*new_shape)
+
+        low = data.select(dim + 1, 0)
+        high = data.select(dim + 1, 1)
+        packed = (high << 4) | low
+
+        return packed
+
+    def unpack_packed_tensor(self, packed_tensor, dim, original_shape):
+        """
+        Unpacks a tensor where two fp4 elements are packed into a single uint8.
+
+        Parameters:
+        - packed_tensor: The packed tensor
+        - dim: The dimension along which the tensor was packed.
+        - original_shape: The shape of the original tensor before packing.
+
+        Returns:
+        - A tensor with the original data unpacked into uint8 elements containing one
+          fp4e2m1 element in the least significant bits.
+        """
+        high = (packed_tensor >> 4) & 0xF
+        low = packed_tensor & 0xF
+
+        stacked = torch.stack((low, high), dim=dim + 1)
+
+        # Flatten along dim and dim+1 and then merge
+        shape = list(stacked.shape)
+        new_shape = shape[:dim] + [shape[dim] * 2] + shape[dim + 2:]
+        data = stacked.reshape(*new_shape)
+
+        # Remove any padding
+        if original_shape[dim] % 2 != 0:
+            indices = [slice(None)] * data.ndim
+            indices[dim] = slice(0, original_shape[dim])
+            data = data[tuple(indices)]
+
+        return data.type(torch.uint8)
+
+
+class MXScaleTensor:
+
+    def __init__(self, data=None, size=None, device=None):
+        """
+        Holder for microscaling E8M0 block scale factors stored as uint8.
+
+        Parameters:
+        - data: float32 torch tensor to encode as fp8e8m0; its device is adopted.
+        - size: shape (int or tuple) read by random() when no data is given.
+        - device: device used for tensors created by this object.
+        """
+        self.device = device
+        if data is None and size is None:
+            raise ValueError("Either parameter data or size must be provided")
+        if data is not None:
+            assert isinstance(data, torch.Tensor), "Parameter data must be a torch tensor"
+            self.device = data.device
+            self.data = self._from_float(data)
+        else:
+            self.size = size if isinstance(size, tuple) else (size, )
+
+    def random(self, low=None, high=None):
+        """
+        Fill self.data with random biased exponents derived from low/high and return self.
+        * The upper bound is capped at 254, so the NaN encoding (255) is never produced.
+        """
+        bias = 127
+        min_exponent, max_exponent = 0, 254
+        if low is not None:
+            min_exponent = max(0, int(torch.log2(torch.tensor(low))) + bias)
+        if high is not None:
+            max_exponent = min(254, max(0, int(torch.log2(torch.tensor(high))) + bias))
+        assert min_exponent <= max_exponent, "Low must be less than or equal to high"
+        self.data = torch.randint(min_exponent, max_exponent + 1, size=self.size, dtype=torch.uint8, device=self.device)
+        return self
+
+    def to(self, dtype):
+        """Decode to float32 as 2 ** (stored - 127); the encoding 255 decodes to NaN."""
+        assert dtype == torch.float32, "Currently only float32 is supported for f8e8m0 to float conversion"
+        raw = self.data.type(dtype)
+        nan_mask = raw == 255
+        # NaN slots are decoded with a placeholder exponent and overwritten below
+        exponent = torch.where(nan_mask, torch.zeros_like(raw), raw) - 127
+        value = torch.pow(2.0, exponent)
+        value[nan_mask] = torch.nan
+        return value.type(dtype)
+
+    def _from_float(self, values):
+        """
+        Encode float32 numbers as E8M0 bytes.
+        * NaNs, Infs and values <= 0 become the NaN encoding (255).
+        * Any other value v becomes floor(log2(v)) + 127, clamped to [0, 254].
+
+        Parameters:
+        - values: A torch tensor of float32 numbers to convert to E8M0 format.
+        """
+        bad = torch.isnan(values) | torch.isinf(values) | (values <= 0)
+        good = ~bad
+
+        encoded = torch.empty_like(values, dtype=torch.uint8, device=self.device)
+        encoded[bad] = 255
+
+        # floor(log2(v)) is the unbiased exponent of v;
+        # adding 127 gives the stored (biased) value
+        exponent = torch.floor(torch.log2(values[good]))
+        biased = (exponent + 127).type(torch.int32)
+        encoded[good] = torch.clamp(biased, 0, 254).type(torch.uint8)
+
+        return encoded
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/ragged_tma.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/ragged_tma.py
new file mode 100644
index 0000000000000000000000000000000000000000..728dfcd42b3fab01c2b504a37f1e716d00bcef85
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/ragged_tma.py
@@ -0,0 +1,108 @@
+import triton
+import triton.language as tl
+from triton.tools.tensor_descriptor import TensorDescriptor
+
+# fmt: off
+
+
+def create_ragged_descriptor(T, block_shape, ragged_dim=0):
+    """
+    Given a 2- or 3-dimensional tensor T, this creates a 'ragged descriptor'
+    which behaves like a concatenation (along the first axis) of subarrays
+    of potentially unequal size.
+
+    The load_ragged and store_ragged device functions can be used to read
+    and write from subarrays T[batch_offset : batch_offset + batch_size]
+    with hardware bounds-checking preventing any sort of leakage outside
+    the subarray.
+    """
+
+    block_shape = list(block_shape)
+    tensor_shape = list(T.shape)
+    rank = len(tensor_shape)
+
+    if ragged_dim < 0:
+        ragged_dim += rank
+
+    assert 0 <= ragged_dim < rank - 1, "last dimension cannot be ragged"
+    assert rank <= 3, "read-write ragged descriptors must have at most 3 dimensions"
+
+    assert len(block_shape) == rank, "block shape must have same length as tensor shape"
+
+    max_int = 0x7fff0000
+    billion = 0x40000000  # == 2**30
+
+    assert tensor_shape[ragged_dim] <= billion, "number of rows may not exceed 2**30"
+    tensor_shape[ragged_dim] = billion
+    ragged_stride = T.stride(ragged_dim)
+
+    # we prepend an extra two dimensions and rely on the fact that pointers
+    # have 64-bit wraparound semantics:
+    tma_stride = [2**34 - ragged_stride, ragged_stride] + [T.stride(i) for i in range(rank)]
+    tma_shape  = [max_int, max_int] + tensor_shape
+    box_shape  = [1, 1] + block_shape
+
+    return TensorDescriptor(T, tma_shape, tma_stride, box_shape)
+
+
+@triton.jit
+def to_ragged_indices(batch_offset, batch_size, row):
+    """
+    Helper function for load_ragged and store_ragged.
+    """
+
+    billion = 0x40000000  # == 2**30
+    x = billion - batch_size + row
+    y = batch_offset + batch_size
+
+    return billion, y, x
+
+
+@triton.jit
+def load_ragged(TMA, batch_offset, batch_size, coords, ragged_dim: tl.constexpr = 0):
+    """
+    Read from a subarray T[batch_offset : batch_offset + batch_size] with
+    hardware bounds-checking, where reading outside the subarray gives zeros.
+
+    Coords should be an appropriately-sized list of integers, just like in
+    TMA.load(); coords[ragged_dim] is remapped through to_ragged_indices.
+    """
+
+    tl.static_assert(len(TMA.shape) == len(coords) + 2, "TMA must be a read-write ragged descriptor")  # two extra leading dims
+
+    c0, c1, c2 = to_ragged_indices(batch_offset, batch_size, coords[ragged_dim])  # c2 replaces coords[ragged_dim]
+    data = TMA.load([c0, c1] + coords[:ragged_dim] + [c2] + coords[ragged_dim + 1:])
+    data = tl.reshape(data, data.shape[2:])  # drop the two leading axes of the loaded block
+    return data
+
+
+@triton.jit
+def store_ragged(TMA, batch_offset, batch_size, coords, data, ragged_dim: tl.constexpr = 0):
+    """
+    Write to a subarray T[batch_offset : batch_offset + batch_size] with
+    hardware bounds-checking, where writes outside the subarray are masked
+    correctly.
+
+    Coords should be an appropriately-sized list of integers, just like in
+    TMA.store(); coords[ragged_dim] is remapped through to_ragged_indices.
+    """
+
+    c0, c1, c2 = to_ragged_indices(batch_offset, batch_size, coords[ragged_dim])  # c2 replaces coords[ragged_dim]
+    data = tl.reshape(data, [1, 1] + data.shape)  # two leading unit axes, like the [1, 1] + block_shape box
+    TMA.store([c0, c1] + coords[:ragged_dim] + [c2] + coords[ragged_dim + 1:], data)
+
+
+@triton.jit
+def atomic_add_ragged(TMA, batch_offset, batch_size, coords, data, ragged_dim: tl.constexpr = 0):
+    """
+    Atomic add into a subarray T[batch_offset : batch_offset + batch_size] with
+    hardware bounds-checking, where adds outside the subarray are masked
+    correctly.
+
+    Coords should be an appropriately-sized list of integers, just like in
+    TMA.atomic_add(); coords[ragged_dim] is remapped through to_ragged_indices.
+    """
+
+    c0, c1, c2 = to_ragged_indices(batch_offset, batch_size, coords[ragged_dim])  # c2 replaces coords[ragged_dim]
+    data = tl.reshape(data, [1, 1] + data.shape)  # two leading unit axes, like the [1, 1] + block_shape box
+    TMA.atomic_add([c0, c1] + coords[:ragged_dim] + [c2] + coords[ragged_dim + 1:], data)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/tensor_descriptor.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/tensor_descriptor.py
new file mode 100644
index 0000000000000000000000000000000000000000..21c359aa308a0aa14a7d3ca873e21fbba978e347
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/tensor_descriptor.py
@@ -0,0 +1,36 @@
+from dataclasses import dataclass
+from typing import List, Any
+from triton._utils import validate_block_shape
+
+
+@dataclass
+class TensorDescriptor:
+    base: Any
+    shape: List[int]
+    strides: List[int]
+    block_shape: List[int]
+    padding: str = "zero"
+
+    def __post_init__(self):
+        rank = len(self.shape)
+        assert len(self.strides) == rank, f"rank mismatch: {self}"
+        assert len(self.block_shape) == rank, f"rank mismatch: {self}"
+        assert rank > 0, "rank must not be zero"
+        assert rank <= 5, "rank cannot be more than 5"
+        ty = type(self.base)
+        if ty.__name__ not in ("FakeTensor", "FunctionalTensor"):
+            assert self.base.data_ptr() % 16 == 0, "base must be 16-byte aligned"
+        validate_block_shape(self.block_shape)
+        elem_bytes = self.base.dtype.itemsize
+        for stride in self.strides[:-1]:
+            assert (stride * elem_bytes) % 16 == 0, "strides must be 16-byte aligned"
+        for shape_dim in self.shape:
+            assert shape_dim > 0, "shape must be positive"
+        assert self.strides[-1] == 1, "Last dimension must be contiguous"
+        assert self.padding == "zero" or self.padding == "nan", "Illegal value for padding"
+        if self.padding == "nan":
+            assert self.base.dtype.is_floating_point, "Padding option `nan` is only supported for floating point tensors"
+
+    @staticmethod
+    def from_tensor(tensor: Any, block_shape: List[int], padding="zero"):
+        return TensorDescriptor(tensor, tensor.shape, tensor.stride(), block_shape, padding)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/triton_to_gluon_translater/__pycache__/translator.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/triton_to_gluon_translater/__pycache__/translator.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..742847bcc18e9fa970e5dce6724015d906b3abab
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/triton_to_gluon_translater/__pycache__/translator.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/triton_to_gluon_translater/__pycache__/translator_helpers.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/triton_to_gluon_translater/__pycache__/translator_helpers.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa289c5a87d33606eeb4f08ddb410205ea6dd0ea
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/triton_to_gluon_translater/__pycache__/translator_helpers.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/triton_to_gluon_translater/translator.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/triton_to_gluon_translater/translator.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fe9106fb35e333e68c163070621c2fb12f81e57
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/triton_to_gluon_translater/translator.py
@@ -0,0 +1,383 @@
+# Experimental Triton to Gluon AST translator.
+# This file takes a Triton JIT entry point and generates a Gluon equivalent including all
+# its dependencies. This generates highly inefficient Gluon code and is only used for
+# functional testing.
+#
+import ast
+from typing import Optional
+import triton
+import triton.language.core as tlc
+import triton.experimental.gluon.language as ttgl
+import sys
+import importlib
+import importlib.util
+import copy
+
+GLUON_IMPORT_LINES = ("from triton.experimental import gluon\n"  # import header prepended to the generated Gluon source
+                      "from triton.experimental.gluon import language as ttgl\n"
+                      "from triton.tools.triton_to_gluon_translater.translator_helpers import *\n")
+
+
+class TritonToGluonTransformer(ast.NodeTransformer):
+    """Transforms Triton kernel source into a functionally equivalent Gluon source.
+
+    This transformer rewrites builtins, dtype/tensor attributes, constexpr annotations,
+    and records nested JIT callables to be converted and appended to the output.
+    """
+
+    def __init__(self, globals_map: dict, shared_jit_set: set, shared_queue: list, is_jit, constexpr_globals: dict):
+        super().__init__()
+        # Resolution scope (globals ∪ nonlocals)
+        self.scope: dict = globals_map or {}
+        # Track discovered JIT functions to inline/append later
+        self.jit_functions: set = shared_jit_set
+        self.queue: list = shared_queue
+        self.is_jit = is_jit
+        # Maps module_file -> {name: value} to pull constexpr globals from the original source code
+        self.constexpr_globals: dict = constexpr_globals
+
+    def is_triton_constexpr_annotation(self, ann: ast.expr) -> bool:
+        # Resolve the annotation to a Python object and compare by identity
+        obj = self.resolve_value(ann)
+        return obj is tlc.constexpr
+
+    def as_ttgl_constexpr(self) -> ast.expr:
+        # Build ttgl.constexpr
+        return self.ttgl_attr("constexpr")
+
+    def maybe_rewrite_constexpr_annotation(self, ann: Optional[ast.expr]) -> Optional[ast.expr]:
+        if ann is None:
+            return None
+        if self.is_triton_constexpr_annotation(ann):
+            return self.as_ttgl_constexpr()
+        return ann
+
+    def ttgl_attr(self, name: str) -> ast.AST:
+        return ast.Attribute(value=ast.Name(id="ttgl", ctx=ast.Load()), attr=name, ctx=ast.Load())
+
+    def resolve_value(self, expr: ast.expr):
+        if isinstance(expr, ast.Name):
+            value = self.scope.get(expr.id) or sys.modules.get(expr.id)
+            return value
+        if isinstance(expr, ast.Attribute):
+            base = self.resolve_value(expr.value)
+            if base is None:
+                return None
+            return getattr(base, expr.attr, None)
+        return None
+
+    def forward_call(self, node: ast.Call, target_func: ast.expr, filter_keywords: list[str] = []) -> ast.Call:
+        new_keywords = [kw for kw in node.keywords if kw.arg not in filter_keywords]
+        return ast.Call(func=target_func, args=list(node.args), keywords=list(new_keywords))
+
+    def visit_Call(self, node: ast.Call) -> ast.AST:
+        node = self.generic_visit(node)
+        resolved_callable = self.resolve_value(node.func)
+        if resolved_callable is not None:
+            resolved_callable = triton.language.core._unwrap_if_constexpr(resolved_callable)
+            base_function = getattr(resolved_callable, "fn", resolved_callable)
+            function_name = getattr(base_function, "__qualname__", getattr(base_function, "__name__",
+                                                                           str(base_function)))
+            if triton.language.core.is_builtin(resolved_callable):
+                builtin_name = function_name.split(".")[-1]
+                builtin_mapping: dict[str, ast.expr] = {
+                    "arange": ast.Name(id="tl_arange", ctx=ast.Load()),
+                    "full": ast.Name(id="tl_full", ctx=ast.Load()),
+                    "trans": ast.Name(id="tl_trans", ctx=ast.Load()),
+                    "dot": ast.Name(id="tl_dot", ctx=ast.Load()),
+                    "dot_scaled": ast.Name(id="tl_dot_scaled", ctx=ast.Load()),
+                    "make_tensor_descriptor": ast.Name(id="tl_make_tensor_descriptor", ctx=ast.Load()),
+                    "load_tensor_descriptor": ast.Name(id="tl_load_tensor_descriptor", ctx=ast.Load()),
+                    "store_tensor_descriptor": ast.Name(id="tl_store_tensor_descriptor", ctx=ast.Load()),
+                    "num_threads": ast.Name(id="get_num_threads_per_program", ctx=ast.Load()),
+                }
+                mapped_target = builtin_mapping.get(builtin_name)
+                if mapped_target is None and hasattr(ttgl, builtin_name):
+                    mapped_target = self.ttgl_attr(builtin_name)
+
+                filter_keywords = []
+                # for reshape drop the can_reorder keyword, it is just an optimization and doesn't help much in Gluon.
+                if builtin_name == "reshape":
+                    filter_keywords = ["can_reorder"]
+                if mapped_target is not None:
+                    node = self.forward_call(node, mapped_target, filter_keywords)
+                    # For split, apply on the source argument rather than wrapping destination
+                    if builtin_name == "split":
+                        source_arg = node.args[0]
+                        wrapped_src = ast.Call(func=ast.Name(id="set_split_src_layout", ctx=ast.Load()),
+                                               args=[source_arg], keywords=[])
+                        node.args[0] = ast.copy_location(wrapped_src, source_arg)
+                    # For shape/layout changing ops, wrap to reset layout
+                    if builtin_name in {"reshape", "trans", "permute", "join", "reduce", "split"}:
+                        reset_layout_wrapped = ast.Call(func=ast.Name(id="reset_to_default_layout", ctx=ast.Load()),
+                                                        args=[node], keywords=[])
+                        node = ast.copy_location(reset_layout_wrapped, node)
+                    return node
+            # Track JITFunction callees
+            if isinstance(resolved_callable, triton.runtime.jit.JITCallable):
+                if resolved_callable not in self.jit_functions:
+                    self.jit_functions.add(resolved_callable)
+                    self.queue.append(resolved_callable)
+                # Strip namespace: rewrite to local function name
+                return self.forward_call(node, ast.Name(id=getattr(base_function, "__name__", ""), ctx=ast.Load()))
+            if resolved_callable is triton.language.core.range:
+                # skip all keywords except arg1, arg2, and step and replace with range.
+                allowed = {"arg1", "arg2", "step"}
+                new_keywords = [kw for kw in node.keywords if kw.arg in allowed]
+                new_args = list(node.args[:3])
+                return ast.copy_location(
+                    ast.Call(func=ast.Name(id="range", ctx=ast.Load()), args=new_args, keywords=new_keywords),
+                    node,
+                )
+            if resolved_callable is triton.language.core.static_range:
+                return self.forward_call(node, self.ttgl_attr("static_range"))
+        else:
+            if isinstance(node.func, ast.Attribute) and node.func.attr in ["store", "load", "gather", "scatter"]:
+                helper_name = "tl_obj_" + node.func.attr
+                return ast.Call(
+                    func=ast.Name(id=helper_name, ctx=ast.Load()),
+                    args=[node.func.value] + list(node.args),
+                    keywords=list(node.keywords),
+                )
+            if isinstance(node.func,
+                          ast.Attribute) and node.func.attr in ["reshape", "trans", "split", "join", "reduce"]:
+                if node.func.attr == "split":
+                    receiver_expr = node.func.value
+                    wrapped_receiver = ast.Call(func=ast.Name(id="set_split_src_layout", ctx=ast.Load()),
+                                                args=[receiver_expr], keywords=[])
+                    new_func = ast.Attribute(value=ast.copy_location(wrapped_receiver, receiver_expr),
+                                             attr=node.func.attr, ctx=ast.Load())
+                    node = ast.copy_location(
+                        ast.Call(func=new_func, args=list(node.args), keywords=list(node.keywords)), node)
+                wrapped = ast.Call(
+                    func=ast.Name(id="reset_to_default_layout", ctx=ast.Load()),
+                    args=[node],
+                    keywords=[],
+                )
+                return ast.copy_location(wrapped, node)
+        return node
+
+    def visit_Attribute(self, node: ast.Attribute) -> ast.AST:
+        node = self.generic_visit(node)
+        last_part = node.attr
+        # Only rewrite dtypes when the resolved object is a tl.dtype instance
+        # or the tl.dtype class itself (e.g., tl.float16 or tl.dtype.float16 / tl.dtype)
+        resolved_obj = self.resolve_value(node)
+        if resolved_obj is not None:
+            if isinstance(resolved_obj, tlc.dtype):
+                return self.ttgl_attr(last_part)
+            if resolved_obj is tlc.dtype and last_part == "dtype":
+                return self.ttgl_attr("dtype")
+            if resolved_obj is tlc.tensor and last_part == "tensor":
+                return self.ttgl_attr("tensor")
+            if resolved_obj is tlc.constexpr and last_part == "constexpr":
+                return self.ttgl_attr("constexpr")
+        if last_part == "tensor_descriptor":
+            return self.ttgl_attr("nvidia.hopper.tma.tensor_descriptor")
+        return node
+
+    def visit_Name(self, node):
+        node = self.generic_visit(node)
+        resolved_obj = self.resolve_value(node)
+        if resolved_obj is not None:
+            # Track standalone references to JITCallable and normalize name
+            if isinstance(resolved_obj, triton.runtime.jit.JITCallable):
+                if resolved_obj not in self.jit_functions:
+                    self.jit_functions.add(resolved_obj)
+                    self.queue.append(resolved_obj)
+                base_function = getattr(resolved_obj, "fn", resolved_obj)
+                normalized_name = getattr(base_function, "__name__",
+                                          getattr(base_function, "__qualname__", getattr(node, "id", "")))
+                return ast.copy_location(ast.Name(id=normalized_name, ctx=node.ctx), node)
+            if isinstance(resolved_obj, triton.language.core.constexpr):
+                identifier = getattr(node, "id", None)
+                if identifier is not None:
+                    # Use the current capture scope's file for the defining module
+                    module_file = self.scope.get("__file__")
+                    if isinstance(module_file, str):
+                        bucket = self.constexpr_globals.setdefault(module_file, {})
+                        bucket[identifier] = resolved_obj
+        return node
+
+    def visit_Subscript(self, node: ast.Subscript) -> ast.AST:
+        node = self.generic_visit(node)
+        # TODO: generalize to
+        # For patterns like x[None, :] or x[:, None], ensure x has a SliceLayout along the expanded dim
+        expanded_dim = None
+        if isinstance(node.slice, ast.Tuple) and len(node.slice.elts) == 2:
+            first, second = node.slice.elts
+            if isinstance(first, ast.Constant) and first.value is None:
+                expanded_dim = 0
+            elif isinstance(second, ast.Constant) and second.value is None:
+                expanded_dim = 1
+        if expanded_dim is not None:
+            value_expr = node.value
+            # Construct a 2D parent shape with a dummy dimension of size 1 at the expanded dim
+            # Use value.type.shape[0] as the vector length
+            type_attr = ast.Attribute(value=value_expr, attr="type", ctx=ast.Load())
+            shape_attr = ast.Attribute(value=type_attr, attr="shape", ctx=ast.Load())
+            len_expr = ast.Subscript(value=shape_attr, slice=ast.Constant(value=0), ctx=ast.Load())
+            if expanded_dim == 0:
+                parent_shape = ast.List(elts=[len_expr, ast.Constant(value=1)], ctx=ast.Load())
+            else:
+                parent_shape = ast.List(elts=[ast.Constant(value=1), len_expr], ctx=ast.Load())
+            # Build SliceLayout(dim, default_blocked_layout(parent_shape, ttgl.num_warps()))
+            slice_layout = ast.Call(
+                func=self.ttgl_attr("SliceLayout"),
+                args=[
+                    ast.Constant(value=expanded_dim),
+                    ast.Call(
+                        func=ast.Name(id="default_blocked_layout", ctx=ast.Load()),
+                        args=[parent_shape,
+                              ast.Call(func=self.ttgl_attr("num_warps"), args=[], keywords=[])],
+                        keywords=[],
+                    ),
+                ],
+                keywords=[],
+            )
+            converted_value = ast.Call(
+                func=self.ttgl_attr("convert_layout"),
+                args=[value_expr, slice_layout],
+                keywords=[],
+            )
+            return ast.Subscript(value=converted_value, slice=node.slice, ctx=node.ctx)
+        return node
+
+    def visit_FunctionDef(self, node: ast.FunctionDef) -> ast.AST:
+        # Rewrite parameter annotations: triton.language.constexpr -> ttgl.constexpr
+        # Positional-only and regular args
+        for arg in list(getattr(node.args, "posonlyargs", [])) + list(node.args.args):
+            arg.annotation = self.maybe_rewrite_constexpr_annotation(arg.annotation)
+        # Vararg / kwarg
+        if node.args.vararg is not None:
+            node.args.vararg.annotation = self.maybe_rewrite_constexpr_annotation(node.args.vararg.annotation)
+        if node.args.kwarg is not None:
+            node.args.kwarg.annotation = self.maybe_rewrite_constexpr_annotation(node.args.kwarg.annotation)
+        # Keyword-only args
+        for arg in node.args.kwonlyargs:
+            arg.annotation = self.maybe_rewrite_constexpr_annotation(arg.annotation)
+        if self.is_jit:
+            node.decorator_list.insert(
+                0, ast.Attribute(value=ast.Name(id="gluon", ctx=ast.Load()), attr="jit", ctx=ast.Load()))
+        else:
+            node.decorator_list.insert(
+                0, ast.Attribute(value=ast.Name(id="gluon", ctx=ast.Load()), attr="constexpr_function", ctx=ast.Load()))
+        # Process body
+        return self.generic_visit(node)
+
+
+def unparse_original_assignments(constexpr_globals: dict) -> list[str]:
+    """Reconstruct original assignments for captured constexpr globals.
+
+    We parse each defining module once to extract assignments, and rewrite tl.constexpr
+    calls to ttgl.constexpr so the generated code remains consistent.
+    """
+
+    # Build assignment strings for captured globals by parsing each module once.
+    def collect_names(target_node, names_out):
+        if isinstance(target_node, ast.Name):
+            names_out.append(target_node.id)
+        elif isinstance(target_node, (ast.Tuple, ast.List)):
+            for element in target_node.elts:
+                collect_names(element, names_out)
+
+    def parse_assigns_and_imports(path: str) -> tuple[dict[str, ast.AST], dict[str, str]]:
+        try:
+            with open(path, "r") as f:
+                module_ast = ast.parse(f.read())
+        except Exception:
+            return {}, {}
+        assigns: dict[str, ast.AST] = {}
+        imports: dict[str, str] = {}
+        for stmt in getattr(module_ast, "body", []):
+            if isinstance(stmt, ast.Assign):
+                names: list[str] = []
+                for target in stmt.targets:
+                    collect_names(target, names)
+                for identifier in names:
+                    assigns[identifier] = stmt
+            elif isinstance(stmt, ast.AnnAssign):
+                names: list[str] = []
+                collect_names(stmt.target, names)
+                if stmt.value is not None:
+                    for identifier in names:
+                        assigns[identifier] = stmt
+            elif isinstance(stmt, ast.ImportFrom) and stmt.level == 0 and isinstance(stmt.module, str):
+                for alias in stmt.names:
+                    alias_name = alias.asname or alias.name.split(".")[-1]
+                    imports[alias_name] = stmt.module
+        return assigns, imports
+
+    def rewrite_constexpr_to_ttgl(node: ast.AST) -> ast.AST:
+
+        class ConstexprToTtglRewriter(ast.NodeTransformer):
+
+            def visit_Call(self, call_node: ast.Call) -> ast.AST:
+                call_node = self.generic_visit(call_node)
+                if isinstance(call_node.func, ast.Attribute) and call_node.func.attr == "constexpr":
+                    call_node.func = ast.copy_location(
+                        ast.Attribute(value=ast.Name(id="ttgl", ctx=ast.Load()), attr="constexpr", ctx=ast.Load()),
+                        call_node.func)
+                return call_node
+
+        return ConstexprToTtglRewriter().visit(node)
+
+    results: list[str] = []
+    imported_cache: dict[str, dict[str, ast.AST]] = {}
+    for mod_file, name_to_obj in constexpr_globals.items():
+        assigns, imports = parse_assigns_and_imports(mod_file)
+        for identifier in sorted(name_to_obj.keys()):
+            node = assigns.get(identifier)
+            if node is None:
+                imported_module_name = imports.get(identifier)
+                if imported_module_name:
+                    try:
+                        module_spec = importlib.util.find_spec(imported_module_name)
+                        origin = getattr(module_spec, "origin", None) if module_spec is not None else None
+                    except Exception:
+                        origin = None
+                    if origin:
+                        assignment_map = imported_cache.get(origin)
+                        if assignment_map is None:
+                            assignment_map, _ = parse_assigns_and_imports(origin)
+                            imported_cache[origin] = assignment_map
+                        node = assignment_map.get(identifier)
+            if node is not None:
+                edited_node = rewrite_constexpr_to_ttgl(copy.deepcopy(node))
+                ast.fix_missing_locations(edited_node)
+                results.append(ast.unparse(edited_node))
+            else:
+                results.append(f"{identifier} = {repr(name_to_obj[identifier])}")
+    return results
+
+
+def convert_triton_to_gluon(src: list[triton.runtime.jit.JITCallable]) -> str:
+    """Convert a Triton JIT entry point into a Gluon source string."""
+    shared_jit_set: set = set()
+    function_queue: list = list(src)
+    constexpr_globals: dict = {}
+    out = ""
+    # Process discovered callee JITFunctions, converting and appending them
+    while function_queue:
+        callee = function_queue.pop(0)
+        callee_src = callee._src
+        callee_tree = ast.parse(callee_src)
+        callee_scope = getattr(callee, "__globals__", {}) or {}
+        jit = isinstance(callee, triton.runtime.JITFunction)
+        callee_transformer = TritonToGluonTransformer(globals_map=callee_scope, shared_jit_set=shared_jit_set,
+                                                      shared_queue=function_queue, is_jit=jit,
+                                                      constexpr_globals=constexpr_globals)
+        callee_new = callee_transformer.visit(callee_tree)
+        ast.fix_missing_locations(callee_new)
+        out += "\n\n" + ast.unparse(callee_new)
+
+    out = "\n\n" + out
+
+    # Pull constexpr globals from the original source code
+    for line in unparse_original_assignments(constexpr_globals):
+        out = line + "\n" + out
+
+    # Prepend required Gluon imports
+    out = GLUON_IMPORT_LINES + "\n\n" + out
+
+    return out
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/triton_to_gluon_translater/translator_helpers.py b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/triton_to_gluon_translater/translator_helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b946ee3bf9f304f80cd3cc13a5a3502f517098e
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/triton/tools/triton_to_gluon_translater/translator_helpers.py
@@ -0,0 +1,618 @@
+from triton.experimental import gluon
+from triton.experimental.gluon import language as ttgl
+from triton.experimental.gluon.language.nvidia.hopper import mbarrier
+from triton.experimental.gluon.language.nvidia.blackwell import (
+    TensorMemoryLayout,
+    TensorMemoryScalesLayout,
+    allocate_tensor_memory,
+    get_tmem_reg_layout,
+    tcgen05_mma,
+    tcgen05_mma_scaled,
+    tcgen05_commit,
+)
+from triton.experimental.gluon.language.nvidia.ampere import mma_v2
+from triton.experimental.gluon.language.nvidia.hopper import tma, fence_async_shared
+from triton.experimental.gluon.language.nvidia.blackwell import tma as tma_blackwell
+
+
+@gluon.constexpr_function
+def tl_dot_mma_sync_layout(shape, num_warps):
+    """Build the version [2, 0] NVMMA distributed layout used by the mma_v2 fallback.
+
+    All warps are placed on the leading dimension. Rank-3 shapes get an extra unit entry
+    in both ``warps_per_cta`` and ``instr_shape``.
+    """
+    rank = len(shape)
+    assert rank in [2, 3], "MMA sync only supports 2D shapes or 3D shapes with a batch outer dimension"
+    is_2d = rank == 2
+    warps_per_cta = [num_warps, 1] if is_2d else [num_warps, 1, 1]
+    instr_shape = [16, 8] if is_2d else [1, 16, 8]
+    return ttgl.NVMMADistributedLayout(version=[2, 0], warps_per_cta=warps_per_cta, instr_shape=instr_shape)
+
+
+@gluon.constexpr_function
+def tl_dot_mma_sync_k_width(a_ty, b_ty):
+    """Return how many elements of the narrower operand type fit in 32 bits, never less than 1."""
+    narrowest_bits = min(a_ty.element_ty.primitive_bitwidth, b_ty.element_ty.primitive_bitwidth)
+    elems_per_word = 32 // narrowest_bits
+    return max(elems_per_word, 1)
+
+
+@gluon.jit
+def tl_dot_mma_sync(a, b, acc_init=None, input_precision=None, out_dtype=ttgl.float32):
+    """Compute ``a @ b`` (plus ``acc_init`` when given) with the mma_v2 instruction.
+
+    Args:
+        a: Left operand, rank 2 ``[M, K]`` or rank 3 ``[B, M, K]``.
+        b: Right operand with the same rank as ``a``.
+        acc_init: Optional accumulator. When given, the result is converted back to its layout.
+        input_precision: Forwarded unchanged to ``mma_v2``.
+        out_dtype: Element type of the zero accumulator created when ``acc_init`` is None.
+
+    Returns:
+        The product, in ``acc_init``'s layout when an accumulator was supplied and in the
+        MMA layout otherwise.
+    """
+    rank: ttgl.constexpr = len(a.type.shape)
+    mma_layout: ttgl.constexpr = tl_dot_mma_sync_layout(a.type.shape, ttgl.num_warps())
+    k_width: ttgl.constexpr = tl_dot_mma_sync_k_width(a.type, b.type)
+    a_layout: ttgl.constexpr = ttgl.DotOperandLayout(parent=mma_layout, operand_index=0, k_width=k_width)
+    b_layout: ttgl.constexpr = ttgl.DotOperandLayout(parent=mma_layout, operand_index=1, k_width=k_width)
+    a = ttgl.convert_layout(a, a_layout)
+    b = ttgl.convert_layout(b, b_layout)
+    if acc_init is not None:
+        acc = ttgl.convert_layout(acc_init, mma_layout)
+    elif rank == 2:
+        # 2D case: the accumulator is [M, N]. Indexing b.shape[2] here would be out of range
+        # and a 3-entry shape would not match the rank-2 MMA layout.
+        acc = ttgl.full([a.shape[0], b.shape[1]], 0.0, out_dtype, layout=mma_layout)
+    else:
+        # Batched case: the accumulator is [B, M, N].
+        acc = ttgl.full([a.shape[0], a.shape[1], b.shape[2]], 0.0, out_dtype, layout=mma_layout)
+    result = mma_v2(a, b, acc, input_precision)
+    if acc_init is not None:
+        result = ttgl.convert_layout(result, acc_init.type.layout)
+    return result
+
+
+@gluon.constexpr_function
+def tl_dot_mmav5_supported(a_ty, b_ty, num_warps, input_precision, allow_tf32, max_num_imprecise_acc):
+    """Decide whether a dot of the given operand types is routed to the MMAv5 (tcgen05) path.
+
+    The result is truthy only for non-integer, rank-2 operands with 4 or 8 warps, M >= 64,
+    N >= 16 and K covering at least 256 bits of ``a``'s element type. When the narrower element
+    type is 32 bits or wider, the effective input precision must also be "tf32".
+    """
+    assert max_num_imprecise_acc is None, "max_num_imprecise_acc only applies to Hopper warp_group_dot"
+    assert input_precision is None or allow_tf32 is None, "Only one of input_precision and allow_tf32 can be specified"
+    # Without an explicit precision, tf32 is assumed unless allow_tf32 was given and is falsy.
+    if input_precision is None:
+        if allow_tf32 is None or allow_tf32:
+            input_precision = "tf32"
+
+    rows = a_ty.shape[0]
+    cols = b_ty.shape[1]
+    depth = a_ty.shape[1]
+    a_elem = a_ty.element_ty
+    b_elem = b_ty.element_ty
+    min_depth = 256 // a_elem.primitive_bitwidth
+    if a_elem.is_int() or b_elem.is_int():
+        return False
+    narrowest_bits = min(a_elem.primitive_bitwidth, b_elem.primitive_bitwidth)
+    if narrowest_bits >= 32 and input_precision != "tf32":
+        return False
+    return (num_warps in [4, 8] and len(a_ty.shape) == 2 and len(b_ty.shape) == 2 and depth >= min_depth
+            and rows >= 64 and cols >= 16)
+
+
+@gluon.constexpr_function
+def get_shared_memory_mma_layout(type, operand_index, allow_transpose, is_fp4_padded=False, force_transpose=False):
+    """Pick the NVMMA shared-memory layout for an MMA operand of the given tensor type.
+
+    The swizzle width is the largest of 128, 64 or 32 bytes that divides the byte size of the
+    contiguous dimension. It is 0 when none does, when the shape has fewer than two
+    dimensions, or when the shape holds fewer than 8 elements in total.
+    """
+    # Operand 1 is transposed by default; force_transpose flips that only when
+    # allow_transpose is falsy.
+    transposed = operand_index == 1
+    if force_transpose and not allow_transpose:
+        transposed = not transposed
+
+    shape = type.shape
+    elem_bits = type.element_ty.primitive_bitwidth
+    packing = 2 if is_fp4_padded else 1
+    contig_extent = shape[0] if transposed else shape[1]
+    contig_bytes = contig_extent * packing * elem_bits // 8
+
+    swizzle = next(
+        (width for width in (128, 64, 32) if contig_bytes >= width and contig_bytes % width == 0),
+        0,
+    )
+
+    num_elems = 1
+    for extent in shape:
+        num_elems *= extent
+    if len(shape) < 2 or num_elems < 8:
+        swizzle = 0
+    return ttgl.NVMMASharedLayout(swizzle_byte_width=swizzle, transposed=transposed, element_bitwidth=elem_bits,
+                                  rank=len(shape), fp4_padded=is_fp4_padded)
+
+
+@gluon.jit
+def get_shared_memory_mma_operand(value, operand_index, allow_transpose, is_fp4_padded=False, force_transpose=False):
+    """Allocate shared memory for ``value`` using the layout chosen by get_shared_memory_mma_layout.
+
+    ``value`` is also passed as the last argument of the allocation call.
+    """
+    layout: ttgl.constexpr = get_shared_memory_mma_layout(value.type, operand_index, allow_transpose, is_fp4_padded,
+                                                          force_transpose)
+    return ttgl.allocate_shared_memory(value.dtype, value.shape, layout, value)
+
+
+@gluon.jit
+def tl_dot_blackwell(a, b, acc=None, input_precision=None, allow_tf32=None, max_num_imprecise_acc=None,
+                     out_dtype=ttgl.float32):
+    """Compute ``a @ b`` (plus ``acc`` when given) with a single tcgen05_mma.
+
+    Both operands are staged in shared memory and the accumulator in tensor memory. The
+    result is returned in the default blocked layout for ``[M, N]``.
+    ``input_precision``, ``allow_tf32`` and ``max_num_imprecise_acc`` are accepted but not
+    read by this body.
+    """
+    M: ttgl.constexpr = a.type.shape[0]
+    N: ttgl.constexpr = b.type.shape[1]
+
+    # NOTE(review): only a's element type is consulted here, and the flag is used for both operands.
+    allow_transpose = not a.type.element_ty.is_fp32()
+    a_smem = get_shared_memory_mma_operand(a, 0, allow_transpose)
+    b_smem = get_shared_memory_mma_operand(b, 1, allow_transpose)
+
+    # MMA instruction shape
+    m: ttgl.constexpr = 128 if M >= 128 else 64
+    n: ttgl.constexpr = 256 if N >= 256 else N
+
+    acc_dtype: ttgl.constexpr = acc.dtype if acc is not None else out_dtype
+    col_stride: ttgl.constexpr = 32 // acc_dtype.primitive_bitwidth
+    acc_tmem_layout: ttgl.constexpr = TensorMemoryLayout([m, n], col_stride=col_stride)
+
+    tmem_reg_layout: ttgl.constexpr = get_tmem_reg_layout(acc_dtype, (M, N), acc_tmem_layout, ttgl.num_warps())
+    # Seed the accumulator: the caller's acc in the TMEM register layout, or zeros.
+    if acc is not None:
+        acc_temp = ttgl.convert_layout(acc, tmem_reg_layout)
+    else:
+        acc_temp = ttgl.zeros([M, N], out_dtype, layout=tmem_reg_layout)
+    acc_tmem = allocate_tensor_memory(acc_temp.dtype, [M, N], acc_tmem_layout, acc_temp)
+    fence_async_shared()
+    # One-shot barrier: initialised, committed to after the MMA is issued, waited on at
+    # phase 0, then invalidated.
+    bar = ttgl.allocate_shared_memory(ttgl.int64, [1], mbarrier.MBarrierLayout())
+    mbarrier.init(bar, count=1)
+    # use_acc=True because the accumulator was always seeded above (acc or zeros).
+    tcgen05_mma(a_smem, b_smem, acc_tmem, use_acc=True)
+    tcgen05_commit(bar)
+    mbarrier.wait(bar, phase=0)
+    mbarrier.invalidate(bar)
+
+    # Load back from TMEM using a register layout and convert to acc layout
+    out = acc_tmem.load(tmem_reg_layout)
+    ret_layout: ttgl.constexpr = default_blocked_layout([M, N], ttgl.num_warps())
+    out = ttgl.convert_layout(out, ret_layout)
+    return out
+
+
+@gluon.jit
+def tl_dot(a, b, acc=None, input_precision=None, allow_tf32=None, max_num_imprecise_acc=None, out_dtype=ttgl.float32):
+    """Dispatch a dot to tl_dot_blackwell when tl_dot_mmav5_supported accepts it, else to tl_dot_mma_sync.
+
+    ``allow_tf32`` and ``max_num_imprecise_acc`` are only forwarded on the Blackwell branch.
+    """
+    num_warps: ttgl.constexpr = ttgl.num_warps()
+    if tl_dot_mmav5_supported(a.type, b.type, num_warps, input_precision, allow_tf32, max_num_imprecise_acc):
+        return tl_dot_blackwell(a, b, acc, input_precision, allow_tf32, max_num_imprecise_acc, out_dtype)
+    else:
+        return tl_dot_mma_sync(a, b, acc, input_precision, out_dtype)
+
+
+@gluon.constexpr_function
+def tl_dot_scaled_mmav5_supported(a_ty, b_ty, num_warps):
+    """Decide whether a scaled dot of the given operand types can use the MMAv5 (tcgen05) path.
+
+    Requires 4 or 8 warps, rank-2 operands, M >= 128, N >= 16 and K covering at least 256 bits
+    of ``a``'s element type.
+    """
+    rows = a_ty.shape[0]
+    cols = b_ty.shape[1]
+    depth = a_ty.shape[1]
+    min_depth = 256 // a_ty.element_ty.primitive_bitwidth
+    return (num_warps in [4, 8] and len(a_ty.shape) == 2 and len(b_ty.shape) == 2 and depth >= min_depth
+            and rows >= 128 and cols >= 16)
+
+
+@gluon.constexpr_function
+def get_swizzle_byte_width(bitwidth):
+    """Clamp ``bitwidth`` to at most 128 and return it, or 0 when the clamped value is below 32."""
+    capped = min(bitwidth, 128)
+    if capped < 32:
+        return 0
+    return capped
+
+
+@gluon.constexpr_function
+def get_int_type(bitwidth):
+    """Return the Gluon signed integer type that is ``bitwidth`` bits wide (8, 16, 32 or 64)."""
+    candidates = (
+        (64, ttgl.int64),
+        (32, ttgl.int32),
+        (16, ttgl.int16),
+        (8, ttgl.int8),
+    )
+    for width, int_type in candidates:
+        if bitwidth == width:
+            return int_type
+    assert False, f"Unsupported bitwidth: {bitwidth}"
+
+
+@gluon.jit
+def tl_dot_decomposed_scale_to_16(scale, compute_type):
+    """Turn an integer scale tensor into a floating-point tensor of ``compute_type`` via a bitcast.
+
+    The scale is widened to an integer as wide as the working float type and shifted left by
+    that type's mantissa width, so after the bitcast its bits sit directly above the mantissa field.
+    """
+    # float16 goes through float32; any other compute type is used directly.
+    large_fp_type: ttgl.constexpr = ttgl.float32 if compute_type == ttgl.float16 else compute_type
+    int_width: ttgl.constexpr = large_fp_type.primitive_bitwidth
+    int_type: ttgl.constexpr = get_int_type(int_width)
+
+    zexted = ttgl.cast(scale, int_type)
+    shift_value: ttgl.constexpr = large_fp_type.fp_mantissa_width
+    shl_res = zexted << shift_value
+    scale_fp = ttgl.cast(shl_res, large_fp_type, bitcast=True)
+    # Only the float16 case needs a final value cast down to the compute type.
+    if large_fp_type != compute_type:
+        scale_fp = ttgl.cast(scale_fp, compute_type)
+    return scale_fp
+
+
+@gluon.constexpr_function
+def tl_dot_get_expand_dims_layout(scale_ty, num_warps, rank):
+    """Return the slice (along ``rank``) of the default blocked layout for the scale shape plus a unit dim."""
+    padded_shape = scale_ty.shape.values + [1]
+    parent = default_blocked_layout(padded_shape, num_warps)
+    return ttgl.SliceLayout(rank, parent)
+
+
+@gluon.constexpr_function
+def tl_dot_get_permute_order(rank, dim):
+    """Return ``[0, ..., rank - 1]`` with the extra axis index ``rank`` inserted at position ``dim + 1``."""
+    axes = list(range(rank))
+    split = dim + 1
+    # Slicing clamps out-of-range positions exactly as list.insert does.
+    return axes[:split] + [rank] + axes[split:]
+
+
+@gluon.constexpr_function
+def tl_dot_get_reshape_shape(scale_ty, dim):
+    """Return the scale shape without its last dimension and with dimension ``dim`` multiplied by 32."""
+    dims = list(scale_ty.shape.values)
+    del dims[-1]
+    dims[dim] = dims[dim] * 32
+    return dims
+
+
+@gluon.jit
+def tl_dot_decomposed_broadcast_scale(scale, dim):
+    """Expand ``scale`` so that dimension ``dim`` becomes 32 times larger.
+
+    A trailing axis of size 32 is added by broadcasting, moved to sit right after ``dim``,
+    and then merged into ``dim`` by a reshape.
+    """
+    scale_ty: ttgl.constexpr = scale.type
+    rank: ttgl.constexpr = len(scale_ty.shape)
+
+    num_warps: ttgl.constexpr = ttgl.num_warps()
+    # expand_dims needs the input in a slice layout of the expanded shape.
+    slice_enc: ttgl.constexpr = tl_dot_get_expand_dims_layout(scale_ty, num_warps, rank)
+    scale = ttgl.convert_layout(scale, slice_enc)
+    expand_scale = scale.expand_dims(rank)
+    broadcast_scale = expand_scale.broadcast_to(scale.type.shape + (32, ))
+    permute_order: ttgl.constexpr = tl_dot_get_permute_order(rank, dim)
+    transposed_scale = broadcast_scale.permute(permute_order.value)
+    # The target shape is derived from the pre-permute shape: drop the trailing 32, scale up dim.
+    reshape_shape: ttgl.constexpr = tl_dot_get_reshape_shape(broadcast_scale.type, dim)
+    return transposed_scale.reshape(reshape_shape)
+
+
+@gluon.constexpr_function
+def tl_dot_decomposed_get_transposed_order(rank):
+    """Return the axis order that swaps the last two of ``rank`` axes and keeps the rest in place."""
+    assert rank >= 2
+    leading_axes = list(range(rank - 2))
+    return leading_axes + [rank - 1, rank - 2]
+
+
+@gluon.jit
+def tl_dot_decomposed_extend_and_broadcast_scale(v, scale, compute_type, operand_index):
+    """Convert ``scale`` to ``compute_type`` and expand it along the K dimension to ``v``'s layout.
+
+    Returns:
+        A pair: the expanded scale converted to ``v``'s layout, and the scale itself
+        (with its last two axes swapped when ``operand_index`` is 1).
+    """
+    rank: ttgl.constexpr = len(v.type.shape)
+    # K is the last axis for operand 0 and the second-to-last for operand 1.
+    k_dim: ttgl.constexpr = rank - 1 if operand_index == 0 else rank - 2
+
+    if operand_index == 1:
+        order: ttgl.constexpr = tl_dot_decomposed_get_transposed_order(rank)
+        scale = ttgl.permute(scale, order.value)
+
+    scale16 = tl_dot_decomposed_scale_to_16(scale, compute_type)
+    reshape_scale = tl_dot_decomposed_broadcast_scale(scale16, k_dim)
+    return ttgl.convert_layout(reshape_scale, v.type.layout), scale
+
+
+@gluon.jit
+def tl_dot_decomposed_mask_nan(mxfp, scale, fast_math):
+    """Return ``mxfp`` unchanged; compilation fails unless ``fast_math`` is set.
+
+    ``scale`` is not read: the non-fast-math handling is still a TODO.
+    """
+    ttgl.static_assert(fast_math, "TODO: support non-fast-math")
+    return mxfp
+
+
+@gluon.jit
+def tl_dot_decomposed_scale_arg(v, scale, arg_format, operand_index, compute_type, fast_math):
+    """Convert one dot operand to ``compute_type`` and multiply it by its expanded scale, if any.
+
+    "e2m1" operands are unpacked with ``fp4_to_fp`` along the K dimension; every other
+    format is cast. With ``scale`` None the converted operand is returned as is.
+    """
+    is_fp4: ttgl.constexpr = arg_format == "e2m1"
+    rank: ttgl.constexpr = len(v.type.shape)
+    # K is the last axis for operand 0 and the second-to-last for operand 1.
+    k_dim: ttgl.constexpr = rank - 1 if operand_index == 0 else rank - 2
+
+    if is_fp4:
+        v = ttgl.fp4_to_fp(v, compute_type, k_dim)
+    else:
+        v = ttgl.cast(v, compute_type)
+    if scale is None:
+        return v
+    else:
+        reshape_scale, scale = tl_dot_decomposed_extend_and_broadcast_scale(v, scale, compute_type, operand_index)
+        mxfp = ttgl.mul(v, reshape_scale)
+        return tl_dot_decomposed_mask_nan(mxfp, scale, fast_math)
+
+
+@gluon.jit
+def tl_dot_scaled(lhs, lhs_scale, lhs_format, rhs, rhs_scale, rhs_format, acc=None, fast_math=False, lhs_k_pack=True,
+                  rhs_k_pack=True, out_dtype=ttgl.float32):
+    """Dispatch a block-scaled dot to the Blackwell MMA or to the decomposed fallback.
+
+    The Blackwell path is taken only when tl_dot_scaled_mmav5_supported accepts the operand
+    types for the current warp count and both scales are present. Every other case is
+    handled by tl_dot_decomposed_block_scales. All arguments are forwarded unchanged.
+    """
+    # The closing parenthesis belongs right after num_warps(): the scale checks are separate
+    # conditions, not part of the num_warps argument. Folding them into the argument made it a
+    # boolean, which never matches the supported warp counts.
+    if tl_dot_scaled_mmav5_supported(lhs.type, rhs.type,
+                                     ttgl.num_warps()) and lhs_scale is not None and rhs_scale is not None:
+        return tl_dot_scaled_blackwell(lhs, lhs_scale, lhs_format, rhs, rhs_scale, rhs_format, acc, fast_math,
+                                       lhs_k_pack, rhs_k_pack, out_dtype)
+    else:
+        return tl_dot_decomposed_block_scales(lhs, lhs_scale, lhs_format, rhs, rhs_scale, rhs_format, acc, fast_math,
+                                              lhs_k_pack, rhs_k_pack, out_dtype)
+
+
+@gluon.jit
+def tl_dot_decomposed_block_scales(lhs, lhs_scale, lhs_format, rhs, rhs_scale, rhs_format, acc=None, fast_math=False,
+                                   lhs_k_pack=True, rhs_k_pack=True, out_dtype=ttgl.float32):
+    """Emulate a block-scaled dot by scaling both operands up front and calling tl_dot.
+
+    When only the right operand has a scale, the problem is transposed and the operands
+    swapped, so the scaled operand is on the left. The result is transposed back and, if an
+    accumulator was given, converted to its original layout.
+    """
+    if lhs_scale is None and rhs_scale is not None:
+        lhs_trans = tl_trans(lhs)
+        rhs_trans = tl_trans(rhs)
+        if acc is not None:
+            orig_layout: ttgl.constexpr = acc.type.layout
+            acc = tl_trans(acc)
+        # NOTE(review): the operands, scales and formats are swapped here, but lhs_k_pack and
+        # rhs_k_pack are forwarded in their original order. Confirm whether they should swap too.
+        result = tl_dot_scaled(rhs_trans, rhs_scale, rhs_format, lhs_trans, lhs_scale, lhs_format, acc, fast_math,
+                               lhs_k_pack, rhs_k_pack, out_dtype)
+        result = tl_trans(result)
+        if acc is not None:
+            result = ttgl.convert_layout(result, orig_layout)
+        return result
+    else:
+        # Both k_pack flags must be truthy; the M/N-packed formats are not handled.
+        ttgl.static_assert(not (not lhs_k_pack or not rhs_k_pack), "TODO: support m/n packed formats")
+        # float16 is used if either side is "fp16", bfloat16 otherwise.
+        compute_type: ttgl.constexpr = ttgl.float16 if (lhs_format == "fp16" or rhs_format == "fp16") else ttgl.bfloat16
+
+        scale_a = tl_dot_decomposed_scale_arg(lhs, lhs_scale, lhs_format, 0, compute_type, fast_math)
+        scale_b = tl_dot_decomposed_scale_arg(rhs, rhs_scale, rhs_format, 1, compute_type, fast_math)
+
+        return tl_dot(scale_a, scale_b, acc, out_dtype=out_dtype)
+
+
+@gluon.jit
+def tl_dot_scaled_blackwell(lhs, lhs_scale, lhs_format, rhs, rhs_scale, rhs_format, acc=None, fast_math=False,
+                            lhs_k_pack=True, rhs_k_pack=True, out_dtype=ttgl.float32):
+    """Compute a block-scaled dot with a single tcgen05_mma_scaled.
+
+    Operands are staged in shared memory; the accumulator and both scales go to tensor
+    memory. The result is returned in the default blocked layout for ``[M, N]``.
+    Both scales are dereferenced unconditionally, so neither may be None.
+    ``fast_math`` is accepted but not read by this body.
+    """
+    is_a_fp4: ttgl.constexpr = lhs_format == "e2m1"
+    is_b_fp4: ttgl.constexpr = rhs_format == "e2m1"
+
+    # With differing formats, at most one side is flagged: a if it is fp4, otherwise b if it is fp4.
+    mixed_prec: ttgl.constexpr = lhs_format != rhs_format
+    is_a_mixed_prec_fp4: ttgl.constexpr = mixed_prec and is_a_fp4
+    is_b_mixed_prec_fp4: ttgl.constexpr = mixed_prec and not is_a_fp4 and is_b_fp4
+
+    is_mmav5_fp4_padded_a: ttgl.constexpr = is_a_mixed_prec_fp4 or not lhs_k_pack
+    is_mmav5_fp4_padded_b: ttgl.constexpr = is_b_mixed_prec_fp4 or not rhs_k_pack
+
+    a_smem = get_shared_memory_mma_operand(lhs, 0, allow_transpose=not is_a_fp4, is_fp4_padded=is_mmav5_fp4_padded_a,
+                                           force_transpose=not lhs_k_pack)
+    b_smem = get_shared_memory_mma_operand(rhs, 1, allow_transpose=not is_b_fp4, is_fp4_padded=is_mmav5_fp4_padded_b,
+                                           force_transpose=not rhs_k_pack)
+
+    M: ttgl.constexpr = lhs.type.shape[0]
+    N: ttgl.constexpr = rhs.type.shape[1]
+
+    # Instruction tile: m is fixed at 128, n is N capped at 256.
+    m: ttgl.constexpr = 128
+    n: ttgl.constexpr = 256 if N >= 256 else N
+
+    acc_dtype: ttgl.constexpr = acc.dtype if acc is not None else out_dtype
+    col_stride: ttgl.constexpr = 32 // acc_dtype.primitive_bitwidth
+    acc_tmem_layout: ttgl.constexpr = TensorMemoryLayout([m, n], col_stride=col_stride)
+    tmem_reg_layout: ttgl.constexpr = get_tmem_reg_layout(acc_dtype, (M, N), acc_tmem_layout, ttgl.num_warps())
+    # Seed the accumulator: the caller's acc in the TMEM register layout, or zeros.
+    if acc is not None:
+        acc_temp = ttgl.convert_layout(acc, tmem_reg_layout)
+    else:
+        acc_temp = ttgl.zeros([M, N], out_dtype, layout=tmem_reg_layout)
+    acc_tmem = allocate_tensor_memory(acc_temp.dtype, [M, N], acc_tmem_layout, acc_temp)
+    fence_async_shared()
+
+    bar = ttgl.allocate_shared_memory(ttgl.int64, [1], mbarrier.MBarrierLayout())
+    mbarrier.init(bar, count=1)
+    # Each scale is converted to the register layout matching the scales TMEM layout, then
+    # copied into tensor memory.
+    scale_layout: ttgl.constexpr = TensorMemoryScalesLayout()
+    scale_layout_reg_lhs: ttgl.constexpr = get_tmem_reg_layout(lhs_scale.dtype, lhs_scale.type.shape, scale_layout,
+                                                               ttgl.num_warps())
+    scale_layout_reg_rhs: ttgl.constexpr = get_tmem_reg_layout(rhs_scale.dtype, rhs_scale.type.shape, scale_layout,
+                                                               ttgl.num_warps())
+    lhs_scale = ttgl.convert_layout(lhs_scale, scale_layout_reg_lhs)
+    rhs_scale = ttgl.convert_layout(rhs_scale, scale_layout_reg_rhs)
+    a_scale_tmem = allocate_tensor_memory(lhs_scale.dtype, lhs_scale.shape, scale_layout, lhs_scale)
+    b_scale_tmem = allocate_tensor_memory(rhs_scale.dtype, rhs_scale.shape, scale_layout, rhs_scale)
+
+    # use_acc=True because the accumulator was always seeded above (acc or zeros).
+    tcgen05_mma_scaled(a_smem, b_smem, acc_tmem, a_scale_tmem, b_scale_tmem, lhs_format, rhs_format, use_acc=True)
+    tcgen05_commit(bar)
+    mbarrier.wait(bar, phase=0)
+    mbarrier.invalidate(bar)
+    # Load back from TMEM using a register layout and convert to acc layout
+    out = acc_tmem.load(tmem_reg_layout)
+    ret_layout: ttgl.constexpr = default_blocked_layout([M, N], ttgl.num_warps())
+    out = ttgl.convert_layout(out, ret_layout)
+    return out
+
+
+@gluon.constexpr_function
+def get_num_threads_per_warp() -> ttgl.constexpr:
+    """Return the warp width used by the helper layouts in this module: 32, wrapped as a constexpr."""
+    warp_width = 32
+    return ttgl.constexpr(warp_width)
+
+
+@ttgl._core.builtin
+def get_num_threads_per_program(_semantic=None, _generator=None):
+    """Return the number of threads in one program: warps per program times threads per warp."""
+    warps = ttgl.num_warps(_semantic=_semantic, _generator=_generator)
+    lanes = get_num_threads_per_warp(_semantic=_semantic)
+    return warps * lanes
+
+
+@gluon.constexpr_function
+def default_blocked_layout(shape: ttgl.constexpr, num_warps: ttgl.constexpr) -> ttgl.constexpr:
+    """Return the blocked layout the translator uses by default for a tensor of the given shape.
+
+    Only the rank of ``shape`` is used. Each thread holds one element per dimension, a whole
+    warp lies along the last dimension, all warps lie along the first, and the order runs
+    from the last dimension to the first.
+    """
+    rank = len(shape)
+    last = rank - 1
+    size_per_thread = [1] * rank
+    # TODO: pick a better layout based on shape. Keeping the whole warp on the last dimension avoids
+    # layout conversions when broadcasting but may blow up register pressure. The alternative would
+    # hand out threads from the last dimension backwards, capped by each dimension's extent.
+    threads_per_warp = [1] * rank
+    threads_per_warp[last] = get_num_threads_per_warp()
+    warps_per_cta = [1] * rank
+    warps_per_cta[0] = num_warps
+    # Natural order [rank-1, rank-2, ..., 0]
+    order = list(range(last, -1, -1))
+    return ttgl.BlockedLayout(size_per_thread=size_per_thread, threads_per_warp=threads_per_warp,
+                              warps_per_cta=warps_per_cta, order=order)
+
+
+@gluon.jit
+def tl_obj_store(obj, offsets, value):
+    """Store ``value`` at ``offsets`` through ``obj``.
+
+    TMA tensor descriptors go through tl_store_tensor_descriptor; any other object uses its
+    own ``store`` method.
+    """
+    if isinstance(obj, ttgl.nvidia.hopper.tma.tensor_descriptor):
+        return tl_store_tensor_descriptor(obj, offsets, value)
+    else:
+        return obj.store(offsets, value)
+
+
+@gluon.jit
+def tl_obj_load(obj, offsets):
+    """Load from ``obj`` at ``offsets``.
+
+    TMA tensor descriptors go through tl_load_tensor_descriptor; any other object uses its
+    own ``load`` method.
+    """
+    if isinstance(obj, ttgl.nvidia.hopper.tma.tensor_descriptor):
+        return tl_load_tensor_descriptor(obj, offsets)
+    else:
+        return obj.load(offsets)
+
+
+@gluon.jit
+def tl_obj_gather(obj, x_offsets, y_offset):
+    """Gather rows through ``obj``.
+
+    For a TMA tensor descriptor, the rows are gathered asynchronously into shared memory and
+    then loaded into registers in the default blocked layout. Any other object uses its own
+    ``gather`` method.
+    """
+    if isinstance(obj, ttgl.nvidia.hopper.tma.tensor_descriptor):
+        desc = obj
+        # Staging buffer: one row per offset, as wide as the descriptor's block.
+        desc_shape: ttgl.constexpr = [x_offsets.shape[0], desc.block_shape[1]]
+        alloc = ttgl.allocate_shared_memory(desc.dtype, desc_shape, desc.layout)
+        bar = ttgl.allocate_shared_memory(ttgl.int64, [1], mbarrier.MBarrierLayout())
+        mbarrier.init(bar, count=1)
+        x_offsets_layout: ttgl.constexpr = ttgl.SliceLayout(
+            0, ttgl.BlockedLayout([1, 4], [get_num_threads_per_warp(), 1], [1, ttgl.num_warps()], [1, 0]))
+        x_offsets = ttgl.convert_layout(x_offsets, x_offsets_layout)
+        # Byte count the barrier expects before the wait below can complete.
+        mbarrier.expect(bar, x_offsets.shape[0] * obj.block_type.nbytes)
+        tma_blackwell.async_gather(desc, x_offsets, y_offset, bar, alloc)
+        mbarrier.wait(bar, phase=0)
+        mbarrier.invalidate(bar)
+        # Load from shared memory into a register tensor using a reasonable default layout
+        ret_layout: ttgl.constexpr = default_blocked_layout(desc.block_shape, ttgl.num_warps())
+        out = alloc.load(ret_layout)
+        return out
+    else:
+        return obj.gather(x_offsets, y_offset)
+
+
+@gluon.jit
+def tl_obj_scatter(obj, value, x_offsets, y_offset):
+    """Scatter ``value`` through ``obj``.
+
+    For a TMA tensor descriptor, the value is staged in shared memory, scattered
+    asynchronously, and the function waits for outstanding stores. Any other object uses its
+    own ``scatter`` method. Nothing is returned.
+    """
+    if isinstance(obj, ttgl.nvidia.hopper.tma.tensor_descriptor):
+        desc = obj
+        # Staging buffer: one row per offset, as wide as the descriptor's block.
+        desc_shape: ttgl.constexpr = [x_offsets.shape[0], desc.block_shape[1]]
+        alloc = ttgl.allocate_shared_memory(desc.dtype, desc_shape, desc.layout, value)
+        fence_async_shared()
+        x_offsets_layout: ttgl.constexpr = ttgl.SliceLayout(
+            0, ttgl.BlockedLayout([1, 4], [get_num_threads_per_warp(), 1], [1, ttgl.num_warps()], [1, 0]))
+        x_offsets = ttgl.convert_layout(x_offsets, x_offsets_layout)
+        tma_blackwell.async_scatter(desc, x_offsets, y_offset, alloc)
+        # NOTE(review): tl_store_tensor_descriptor also calls alloc._keep_alive() after this
+        # wait; confirm whether the same is needed here.
+        tma.store_wait(0)
+    else:
+        obj.scatter(value, x_offsets, y_offset)
+
+
+@ttgl._core.builtin
+def tl_make_tensor_descriptor(base, shape, strides, block_shape, padding_option="zero", _semantic=None):
+    """Create a TMA tensor descriptor whose shared layout is the NVMMA default for the block shape and element type."""
+    element_ty = base.dtype.element_ty
+    smem_layout = ttgl.NVMMASharedLayout.get_default_for(block_shape, element_ty)
+    return tma.make_tensor_descriptor(base, shape, strides, block_shape, smem_layout, padding_option,
+                                      _semantic=_semantic)
+
+
+@gluon.jit
+def tl_store_tensor_descriptor(desc, offsets, value):
+    """Store ``value`` through a TMA tensor descriptor at ``offsets``.
+
+    The value is staged in shared memory, copied to global memory asynchronously, and the
+    function waits for outstanding stores before returning. Nothing is returned.
+    """
+    alloc = ttgl.allocate_shared_memory(desc.dtype, desc.block_shape, desc.layout, value)
+    fence_async_shared()
+    tma.async_copy_shared_to_global(desc, offsets, alloc)
+    tma.store_wait(0)
+    # presumably keeps the staging buffer from being released before the copy is done - TODO confirm
+    alloc._keep_alive()
+
+
+@gluon.jit
+def tl_load_tensor_descriptor(desc, offsets):
+    """Load one block through a TMA tensor descriptor at ``offsets``.
+
+    The block is copied asynchronously into shared memory, then loaded into registers in
+    the default blocked layout for the descriptor's block shape.
+    """
+    smem = ttgl.allocate_shared_memory(desc.dtype, desc.block_shape, desc.layout)
+    bar = ttgl.allocate_shared_memory(ttgl.int64, [1], mbarrier.MBarrierLayout())
+    mbarrier.init(bar, count=1)
+    # Issue async copy from global (descriptor) to shared memory and wait for completion
+    mbarrier.expect(bar, desc.block_type.nbytes)
+    tma.async_copy_global_to_shared(desc, offsets, bar, smem)
+    mbarrier.wait(bar, phase=0)
+    mbarrier.invalidate(bar)
+    # Load from shared memory into a register tensor using a reasonable default layout
+    ret_layout: ttgl.constexpr = default_blocked_layout(desc.block_shape, ttgl.num_warps())
+    out = smem.load(ret_layout)
+    return out
+
+
+@gluon.jit
+def tl_arange(start: ttgl.constexpr, stop: ttgl.constexpr = None):
+    """Return ``ttgl.arange(start, stop)`` in the default blocked layout for a 1D tensor of ``stop - start`` elements."""
+    # NOTE(review): stop defaults to None, but ``stop - start`` needs a number - confirm callers
+    # always pass both bounds.
+    layout: ttgl.constexpr = default_blocked_layout([stop - start], ttgl.num_warps())
+    return ttgl.arange(start, stop, layout=layout)
+
+
+@gluon.jit
+def tl_full(shape, value, dtype=None):
+    """Return ``ttgl.full(shape, value, dtype)`` in the default blocked layout for ``shape``."""
+    layout: ttgl.constexpr = default_blocked_layout(shape, ttgl.num_warps())
+    return ttgl.full(shape, value, dtype, layout=layout)
+
+
+@ttgl._core.builtin
+def tl_trans(value, *dims, _semantic=None):
+    """Forward to ``value.trans`` with the given dims, passing the semantic object through."""
+    transposed = value.trans(*dims, _semantic=_semantic)
+    return transposed
+
+
+@ttgl._core.builtin
+def cat(input, other, can_reorder=False, layout=None, _semantic=None):
+    """
+    Concatenate two tensors.
+
+    Args:
+        input (tensor): First tensor to concatenate.
+        other (tensor): Second tensor to concatenate.
+        can_reorder (bool): Compiler hint allowing the elements to be reordered while the inputs are concatenated.
+            Only set it when element order does not matter (e.g., the result only feeds reduction ops).
+            The current implementation of `cat` supports only can_reorder=True.
+        layout (DistributedLayout): Destination layout of the output tensor.
+
+    Returns:
+        tensor: The concatenation of the two inputs.
+    """
+    reorder_ok = ttgl._core._unwrap_if_constexpr(can_reorder)
+    dst_layout = ttgl._core._unwrap_if_constexpr(layout)
+    return _semantic.cat(input, other, reorder_ok, dst_layout)
+
+
+@gluon.jit
+def tl_cat(lhs, rhs, can_reorder=False):
+    """Concatenate ``lhs`` and ``rhs`` into the default blocked layout for a 1D tensor of their combined length."""
+    # Only shape[0] of each input contributes to the output length.
+    return cat(lhs, rhs, can_reorder, layout=default_blocked_layout([lhs.shape[0] + rhs.shape[0]], ttgl.num_warps()))
+
+
+@gluon.jit
+def reset_to_default_layout(value):
+    """Convert ``value`` to the default blocked layout for its shape.
+
+    Tuples are converted element by element into a new tuple. Distributed tensors are
+    converted directly. Anything else is returned unchanged.
+    """
+    ty: ttgl.constexpr = value.type
+    if isinstance(ty, ttgl.tuple_type):
+        out = ()
+        # static_range: the loop over the tuple elements is unrolled at compile time.
+        for i in ttgl.static_range(len(value)):
+            r = ttgl.convert_layout(value[i], layout=default_blocked_layout(value[i].type.shape, ttgl.num_warps()))
+            out = out + (r, )
+        return out
+    elif isinstance(value, ttgl.tensor) and isinstance(value.type, ttgl.distributed_type):
+        layout: ttgl.constexpr = default_blocked_layout(ty.shape, ttgl.num_warps())
+        return ttgl.convert_layout(value, layout=layout)
+    else:
+        return value
+
+
+@gluon.constexpr_function
+def get_split_src_layout(shape: ttgl.constexpr, num_warps: ttgl.constexpr) -> ttgl.constexpr:
+    """Return the blocked layout applied to a tensor before it is split.
+
+    Each thread holds two elements along the last dimension and one along every other. The
+    warp's threads are handed out from the second-to-last dimension backwards, capped by each
+    dimension's extent. All warps lie along the first dimension.
+    """
+    rank = len(shape)
+    last = rank - 1
+    size_per_thread = [2 if axis == last else 1 for axis in range(rank)]
+    threads_per_warp = [1] * rank
+    budget = get_num_threads_per_warp()
+    # The last dimension keeps a single thread; only the outer dimensions share the warp.
+    for axis in reversed(range(last)):
+        taken = min(shape[axis], budget)
+        threads_per_warp[axis] = taken
+        budget = budget // taken
+    warps_per_cta = [1] * rank
+    warps_per_cta[0] = num_warps
+    # Natural order [rank-1, rank-2, ..., 0]
+    order = list(range(last, -1, -1))
+    return ttgl.BlockedLayout(size_per_thread=size_per_thread, threads_per_warp=threads_per_warp,
+                              warps_per_cta=warps_per_cta, order=order)
+
+
+@gluon.jit
+def set_split_src_layout(value):
+    """Convert ``value`` to the layout returned by get_split_src_layout for its shape and the current warp count."""
+    layout: ttgl.constexpr = get_split_src_layout(value.type.shape, ttgl.num_warps())
+    return ttgl.convert_layout(value, layout=layout)
+
+
+def convert_host_descriptor(desc):
+    """Rebuild a host-side Triton ``TensorDescriptor`` as a Gluon Hopper ``TensorDescriptor``.
+
+    The Gluon descriptor reuses the base tensor, shape, strides and block shape, and gets the
+    default NVMMA shared layout for that block shape and element type.
+    """
+
+    def torch_dtype_to_triton(dtype):
+        """Map a torch dtype to the Gluon dtype of the same name; the two fp8 dtypes are special-cased."""
+        import torch
+        fp8_aliases = (
+            (torch.float8_e5m2, ttgl.float8e5),
+            (torch.float8_e4m3fn, ttgl.float8e4nv),
+        )
+        for torch_ty, gluon_ty in fp8_aliases:
+            if dtype == torch_ty:
+                return gluon_ty
+        # str(dtype) looks like "torch.<name>"; the Gluon dtype is looked up by that name.
+        return getattr(ttgl, str(dtype).split('.')[1])
+
+    from triton.tools.tensor_descriptor import TensorDescriptor
+    assert isinstance(desc, TensorDescriptor)
+    block_shape = desc.block_shape
+    base = desc.base
+    element_ty = torch_dtype_to_triton(base.dtype)
+    layout = ttgl.NVMMASharedLayout.get_default_for(block_shape, element_ty)
+    return gluon.nvidia.hopper.TensorDescriptor(base, desc.shape, desc.strides, block_shape, layout)
+
+
+# Workaround for limited dependency tracking.
+# TODO: fix this by pulling imports into the generated file.
+def current_target():
+    """Return the active driver's current target, or None when there is no active driver."""
+    from triton.runtime import driver
+    try:
+        active = driver.active
+    except RuntimeError:
+        # No active driver: there is no target to report.
+        return None
+    else:
+        return active.get_current_target()
+
+
+current_target.__triton_builtin__ = True